diff --git a/.bazelrc b/.bazelrc
index c70c57136102b483a4332ca22f775d7a2c5b849e..1a9c46362e530ab8345d40845b7c2a954133823e 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -10,6 +10,9 @@ build:android_arm64 --config=android
 build:android_arm64 --cpu=arm64-v8a
 build:android_arm64 --fat_apk_cpu=arm64-v8a
 
+# Sets the default Apple platform to macOS.
+build --apple_platform_type=macos
+
 # Config to use a mostly-static build and disable modular op registration
 # support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
 # By default, TensorFlow will build with a dependence on
@@ -67,6 +70,7 @@ build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
 build:gdr --define=with_gdr_support=true
 build:ngraph --define=with_ngraph_support=true
 build:verbs --define=with_verbs_support=true
+build:numa --define=with_numa_support=true
 
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
@@ -90,11 +94,20 @@ build --define=grpc_no_ares=true
 build:dynamic_kernels --define=dynamic_loaded_kernels=true
 build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 
+# Build TF with C++ 17 features.
+build:c++17 --cxxopt=-std=c++1z
+build:c++17 --cxxopt=-stdlib=libc++
+build:c++1z --cxxopt=-std=c++1z
+build:c++1z --cxxopt=-stdlib=libc++
+
 # Default paths for TF_SYSTEM_LIBS
 build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
 
+# Disable MKL-DNN contraction kernels by default.
+build --define=tensorflow_mkldnn_contraction_kernel=0
+
 # Default options should come above this line
 
 # Options from ./configure
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4a296f265f7b9521c46d350cec26ff199f43eb6c..73782143a3d4b1742f33bb96845ed300eedb6f50 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -55,24 +55,28 @@ TensorFlow coding style.
 
 #### General guidelines and philosophy for contribution
 
-* Include unit tests when you contribute new features, as they help to
-  a) prove that your code works correctly, and b) guard against future breaking
-  changes to lower the maintenance cost.
-* Bug fixes also generally require unit tests, because the presence of bugs
-  usually indicates insufficient test coverage.
-* Keep API compatibility in mind when you change code in core TensorFlow,
-  e.g., code in [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core) and  [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
-  TensorFlow has reached version 1 and hence cannot make
-  non-backward-compatible API changes without a major release. Reviewers of your
-  pull request will comment on any API compatibility issues.
-* When you contribute a new feature to TensorFlow, the maintenance burden is (by
-  default) transferred to the TensorFlow team. This means that benefit of the
-  contribution must be compared against the cost of maintaining the feature.
-* Full new features (e.g., a new op implementing a cutting-edge algorithm)
-  typically will live in
-  [tensorflow/contrib](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib)
-  to get some airtime before decision is made regarding whether they are to be
-  migrated to the core.
+*   Include unit tests when you contribute new features, as they help to a)
+    prove that your code works correctly, and b) guard against future breaking
+    changes to lower the maintenance cost.
+*   Bug fixes also generally require unit tests, because the presence of bugs
+    usually indicates insufficient test coverage.
+*   Keep API compatibility in mind when you change code in core TensorFlow,
+    e.g., code in
+    [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core)
+    and
+    [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
+    TensorFlow has reached version 1 and hence cannot make
+    non-backward-compatible API changes without a major release. Reviewers of
+    your pull request will comment on any API compatibility issues.
+*   When you contribute a new feature to TensorFlow, the maintenance burden is
+    (by default) transferred to the TensorFlow team. This means that benefit of
+    the contribution must be compared against the cost of maintaining the
+    feature.
+*   Full new features (e.g., a new op implementing a cutting-edge algorithm)
+    typically will live in
+    [tensorflow/addons](https://github.com/tensorflow/addons) to get some
+    airtime before decision is made regarding whether they are to be migrated to
+    the core.
 
 #### License
 
@@ -150,41 +154,45 @@ may exist in your changes.
 
 There are two ways to run TensorFlow unit tests.
 
-1. Using tools and libraries installed directly on your system.
+1.  Using tools and libraries installed directly on your system.
 
-   Refer to the
-   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
-   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
-   for the required packages. Alternatively, use the said
-   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
-   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`
-   for development to avoid installing the packages directly on your system.
+    Refer to the
+    [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel)
+    and
+    [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+    for the required packages. Alternatively, use the said
+    [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+    `tensorflow/tensorflow:nightly-devel` and
+    `tensorflow/tensorflow:nightly-devel-gpu` for development to avoid
+    installing the packages directly on your system (in which case remember to
+    change directory from `/root` to `/tensorflow` once you get into the running
+    container so `bazel` can find the `tensorflow` workspace).
 
-   Once you have the packages installed, you can run a specific unit test in
-   bazel by doing as follows:
+    Once you have the packages installed, you can run a specific unit test in
+    bazel by doing as follows:
 
-   If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
-   the `cuda` option flag
+    If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
+    the `cuda` option flag
 
-   ```bash
-   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+    ```bash
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
 
-   export flags="--config=opt --config=cuda -k"
-   ```
+    export flags="--config=opt --config=cuda -k"
+    ```
 
-   For example, to run all tests under tensorflow/python, do:
+    For example, to run all tests under tensorflow/python, do:
 
-   ```bash
-   bazel test ${flags} //tensorflow/python/...
-   ```
+    ```bash
+    bazel test ${flags} //tensorflow/python/...
+    ```
 
-2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
+2.  Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
 
-   ```bash
-   # Install Docker first, then this will build and run cpu tests
-   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
-   ```
-
-   See
-   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+    ```bash
+    # Install Docker first, then this will build and run cpu tests
+    tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+    ```
 
+    See
+    [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build)
+    for details.
diff --git a/METADATA b/METADATA
index d452effd8df8eda2b118bdb05cdba5a5676a83b4..7c290b77762322a54842e44fdd5d25089f7e82e2 100644
--- a/METADATA
+++ b/METADATA
@@ -23,7 +23,7 @@ third_party {
     type: GIT
     value: "https://github.com/tensorflow/tensorflow"
   }
-  version: "v1.4.0"
-  last_upgrade_date { year: 2017 month: 11 day: 13 }
+  version: "v1.13.0"
+  last_upgrade_date { year: 2019 month: 3 day: 15 }
   license_type: NOTICE
 }
diff --git a/README.md b/README.md
index 4e37b239b16e6eeefc587aeb242a03e1f88eddbd..96a8ecf4f693d5634da63f4ecc6f4e9c35751f5b 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,8 @@ organization for the purposes of conducting machine learning and deep neural
 networks research.  The system is general enough to be applicable in a wide
 variety of other domains, as well.
 
-TensorFlow provides stable Python API and C APIs as well as without API backwards compatibility guarantee like C++, Go, Java, JavaScript and Swift.
+TensorFlow provides stable Python and C APIs as well as non-guaranteed backwards
+compatible API's for C++, Go, Java, JavaScript and Swift.
 
 Keep up to date with release announcements and security updates by
 subscribing to
diff --git a/RELEASE.md b/RELEASE.md
index 0a56e6909870e398c9d6349576cd2f8e6734f072..48d7e3140aacf526412dfc7999e3e0c0931df89e 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,118 @@
+# Release 1.13.0
+
+## Major Features and Improvements
+
+* TensorFlow Lite has moved from contrib to core. This means that Python modules are under `tf.lite` and source code is now under `tensorflow/lite` rather than `tensorflow/contrib/lite`.
+* TensorFlow GPU binaries are now built against CUDA 10 and TensorRT 5.0.
+* Support for Python3.7 on all operating systems.
+* Moved NCCL to core.
+
+## Behavioral changes
+
+* Disallow conversion of python floating types to uint32/64 (matching behavior of other integer types) in `tf.constant`.
+* Make the `gain` argument of convolutional orthogonal initializers (`convolutional_delta_orthogonal`, `convolutional_orthogonal_1D`, `convolutional_orthogonal_2D`, `convolutional_orthogonal_3D`) have consistent behavior with the `tf.initializers.orthogonal` initializer, i.e. scale the output l2-norm by `gain` and NOT by `sqrt(gain)`. (Note that these functions are currently in `tf.contrib` which is not guaranteed backward compatible).
+
+## Bug Fixes and Other Changes
+
+* Documentation
+  * Update the doc with the details about the rounding mode used in quantize_and_dequantize_v2.
+  * Clarify that tensorflow::port::InitMain() _should_ be called before using the TensorFlow library.  Programs failing to do this are not portable to all platforms.
+* Deprecations and Symbol renames.
+   * Removing deprecations for the following endpoints: `tf.acos`, `tf.acosh`, `tf.add`, `tf.as_string`, `tf.asin`, `tf.asinh`, `tf.atan`, `tf.atan2`, `tf.atanh`, `tf.cos`, `tf.cosh`, `tf.equal`, `tf.exp`, `tf.floor`, `tf.greater`, `tf.greater_equal`, `tf.less`, `tf.less_equal`, `tf.log`, `tf.logp1`, `tf.logical_and`, `tf.logical_not`, `tf.logical_or`, `tf.maximum`, `tf.minimum`, `tf.not_equal`, `tf.sin`, `tf.sinh`, `tf.tan`
+  * Deprecate `tf.data.Dataset.shard`.
+  * Deprecate `saved_model.loader.load` which is replaced by `saved_model.load` and `saved_model.main_op`, which will be replaced by `saved_model.main_op` in V2.
+  * Deprecate tf.QUANTIZED_DTYPES. The official new symbol is tf.dtypes.QUANTIZED_DTYPES.
+  * Update sklearn imports for deprecated packages.
+  * Deprecate `Variable.count_up_to` and `tf.count_up_to` in favor of `Dataset.range`.
+  * Export `confusion_matrix` op as `tf.math.confusion_matrix` instead of `tf.train.confusion_matrix`.
+  * Add `tf.dtypes.` endpoint for every constant in dtypes.py; moving endpoints in versions.py to corresponding endpoints in `tf.sysconfig.` and `tf.version.`; moving all constants under `tf.saved_model` submodules to `tf.saved_model` module. New endpoints are added in V1 and V2 but existing endpoint removals are only applied in V2.
+  * Deprecates behavior where device assignment overrides collocation constraints inside a collocation context manager.
+* Keras & Python API
+  * Add to Keras functionality analogous to `tf.register_tensor_conversion_function`.
+  * Subclassed Keras models can now be saved through `tf.contrib.saved_model.save_keras_model`.
+  * `LinearOperator.matmul` now returns a new `LinearOperator`.
+* New ops and improved op functionality
+  * Add a Nearest Neighbor Resize op.
+  * Add an `ignore_unknown` argument to `parse_values` which suppresses ValueError for unknown hyperparameter types. Such * Add `tf.linalg.matvec` convenience function.
+  * `tf.einsum()`raises `ValueError` for unsupported equations like `"ii->"`.
+  * Add DCT-I and IDCT-I in `tf.signal.dct` and `tf.signal.idct`.
+  * Add LU decomposition op.
+  * Add quantile loss to gradient boosted trees in estimator.
+  * Add `round_mode` to `QuantizeAndDequantizeV2` op to select rounding algorithm.
+  * Add `unicode_encode`, `unicode_decode`, `unicode_decode_with_offsets`, `unicode_split`, `unicode_split_with_offset`, and `unicode_transcode` ops. Amongst other things, this Op adds the ability to encode, decode, and transcode a variety of input text encoding formats into the main Unicode encodings (UTF-8, UTF-16-BE, UTF-32-BE)
+  * Add "unit" attribute to the substr op, which allows obtaining the substring of a string containing unicode characters.
+  * Broadcasting support for Ragged Tensors.
+  * `SpaceToDepth` supports uint8 data type.
+  * Support multi-label quantile regression in estimator.
+  * We now use "div" as the default partition_strategy in `tf.nn.safe_embedding_lookup_sparse`, `tf.nn.sampled_softmax` and `tf.nn.nce_loss`.
+  hyperparameter are ignored.
+* Performance
+  * Improve performance of GPU cumsum/cumprod by up to 300x.
+  * Added support for weight decay in most TPU embedding optimizers, including AdamW and MomentumW.
+* TensorFlow 2.0 Development
+  * Add a command line tool to convert to TF2.0, tf_upgrade_v2
+  * Merge `tf.spectral` into `tf.signal` for TensorFlow 2.0.
+  * Change the default recurrent activation function for LSTM from 'hard_sigmoid' to 'sigmoid' in 2.0. Historically recurrent activation is 'hard_sigmoid' since it is fast than 'sigmoid'. With new unified backend between CPU and GPU mode, since the CuDNN kernel is using sigmoid, we change the default for CPU mode to sigmoid as well. With that, the default LSTM will be compatible with both CPU and GPU kernel. This will enable user with GPU to use CuDNN kernel by default and get a 10x performance boost in training. Note that this is checkpoint breaking change. If user want to use their 1.x pre-trained checkpoint, please construct the layer with LSTM(recurrent_activation='hard_sigmoid') to fallback to 1.x behavior.
+* TensorFlow Lite
+  * Move from `tensorflow/contrib/lite` to `tensorflow/lite`.
+  * Add experimental Java API for injecting TensorFlow Lite delegates
+  * Add support for strings in TensorFlow Lite Java API.
+* `tf.contrib`:
+  * Add Apache Ignite Filesystem plugin to support accessing Apache IGFS.
+  * Dropout now takes `rate` argument, `keep_prob` is deprecated.
+  * Estimator occurrences references `tf.contrib.estimator` were changed to `tf.estimator`:
+    * `tf.contrib.estimator.BaselineEstimator` with `tf.estimator.BaselineEstimator`
+    * `tf.contrib.estimator.DNNLinearCombinedEstimator` with `tf.estimator.DNNLinearCombinedEstimator`
+    * `tf.contrib.estimator.DNNEstimator` with `tf.estimator.DNNEstimator`
+    * `tf.contrib.estimator.LinearEstimator` with `tf.estimator.LinearEstimator`
+    * `tf.contrib.estimator.InMemoryEvaluatorHook` and tf.estimator.experimental.InMemoryEvaluatorHook`.
+    * `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`.
+  * Expose `tf.distribute.Strategy as the new name for tf.contrib.distribute.DistributionStrategy.
+  * Migrate linear optimizer from contrib to core.
+  * Move `tf.contrib.signal` to `tf.signal` (preserving aliases in tf.contrib.signal).
+  * Users of `tf.contrib.estimator.export_all_saved_models` and related should switch to `tf.estimator.Estimator.experimental_export_all_saved_models`.
+* tf.data:
+  * Add `tf.data.experimental.StatsOptions()`, to configure options to collect statistics from `tf.data.Dataset` pipeline using `StatsAggregator`. Add nested option, `experimental_stats` (which takes a `tf.data.experimen tal.StatsOptions` object), to `tf.data.Options`. Deprecates `tf.data.experimental.set_stats_agregator`.
+  * Performance optimizations:
+    * Add `tf.data.experimental.OptimizationOptions()`, to configure options to enable `tf.data` performance optimizations. Add nested option, `experimental_optimization` (which takes a `tf.data.experimental.OptimizationOptions` object), to `tf.data.Options`. Remove performance optimization options from `tf.data.Options`, and add them under `tf.data.experimental.OptimizationOptions` instead.
+    * Enable `map_and_batch_fusion` and `noop_elimination` optimizations by default. They can be disabled by configuring `tf.data.experimental.OptimizationOptions` to set `map_and_batch = False` or `noop_elimination = False` respectively. To disable all default optimizations, set `apply_default_optimizations = False`.
+    * Support parallel map in `map_and_filter_fusion`.
+    * Disable static optimizations for input pipelines that use non-resource `tf.Variable`s.
+  * Add NUMA-aware MapAndBatch dataset.
+  * Deprecate `tf.data.Dataset.make_one_shot_iterator()` in V1, removed it from V2, and added tf.compat.v1.data.make_one_shot_iterator()`.
+  * Deprecate `tf.data.Dataset.make_initializable_iterator()` in V1, removed it from V2, and added `tf.compat.v1.data.make_initializable_iterator()`.
+  * Enable nested dataset support in core `tf.data` transformations.
+  * For `tf.data.Dataset` implementers: Added `tf.data.Dataset._element_structured property` to replace `Dataset.output_{types,shapes,classes}`.
+  * Make `num_parallel_calls` of `tf.data.Dataset.interleave` and `tf.data.Dataset.map` work in Eager mode.
+* Toolchains
+  * Fixed OpenSSL compatibility by avoiding `EVP_MD_CTX_destroy`.
+  * Added bounds checking to printing deprecation warnings.
+  * Upgraded CUDA dependency to 10.0
+  * To build with Android NDK r14b, add "#include <linux/compiler.h>" to android-ndk-r14b/platforms/android-14/arch-*/usr/include/linux/futex.h
+  * Removed `:android_tensorflow_lib_selective_registration*` targets, use `:android_tensorflow_lib_lite*` targets instead.
+* XLA
+  * Move `RoundToEven` function to xla/client/lib/math.h.
+  * A new environment variable `TF_XLA_DEBUG_OPTIONS_PASSTHROUGH` set to "1" or "true" allows the debug options passed within an XRTCompile op to be passed directly to the XLA compilation backend. If such variable is not set (service side), only a restricted set will be passed through.
+  * Allow the XRTCompile op to return the ProgramShape resulted form the XLA compilation as a second return argument.
+  * XLA HLO graphs can now be rendered as SVG/HTML.
+* Estimator
+  * Replace all occurences of `tf.contrib.estimator.BaselineEstimator` with `tf.estimator.BaselineEstimator`
+  * Replace all occurences of `tf.contrib.estimator.DNNLinearCombinedEstimator` with `tf.estimator.DNNLinearCombinedEstimator`
+  * Replace all occurrences of `tf.contrib.estimator.DNNEstimator` with `tf.estimator.DNNEstimator`
+  * Replace all occurrences of `tf.contrib.estimator.LinearEstimator` with `tf.estimator.LinearEstimator`
+  * Users of `tf.contrib.estimator.export_all_saved_models` and related should switch to `tf.estimator.Estimator.experimental_export_all_saved_models`.
+  * Update `regression_head` to the new Head API for Canned Estimator V2.
+  * Switch `multi_class_head` to Head API for Canned Estimator V2.
+  * Replace all occurences of `tf.contrib.estimator.InMemoryEvaluatorHook` and `tf.contrib.estimator.make_stop_at_checkpoint_step_hook` with `tf.estimator.experimental.InMemoryEvaluatorHook` and `tf.estimator.experimental.make_stop_at_checkpoint_step_hook`
+  * Migrate linear optimizer from contrib to core.
+
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+Abhinav Upadhyay, Ag Ramesh, akikaaa, Alexis Louis, Anders Huss, Andreas Madsen, Andrew Banchich, Andy Craze, Anton Dmitriev, Artem Malykh, Avijit-Nervana, Balint Cristian, Benjamin Tan Wei Hao, Bhavani Subramanian, Brendan Finan, Brian Nemsick, Bryan Cutler, By Shen, Cao Zongyan, Castiel, Chris Antaki, Christian Goll, Cibifang, Clayne Robison, Codrut Grosu, Cong Xu, Dalmo Cirne, Daniel Hunter, Dougal J. Sutherland, Edvard Fagerholm, EFanZh, Erik Smistad, Evgeniy Polyakov, Feiyang Chen, franklin5, Fred Reiss, Gautam, gehring, Geoffrey Irving, George Sterpu, Gitea, Grzegorz George Pawelczak, Guozhong Zhuang, himkt, Hoeseong Kim, Huan Li (李卓桓), HuiyangFei, hyunyoung, Isaac Burbank, jackonan, Jacky Ko, Jason Furmanek, Jason Zaman, Javier Luraschi, Jiang,Zhoulong, joaak, John Lin, Jonathan Wyatt Hoech, josephyearsley, Josh Gordon, Julian Niedermeier, Karl Lessard, Keno Fischer, lanhin, Leon Graser, leondgarse, Li, Guizi, Li, Yiqiang, lxl910915, Mahmoud Abuzaina, manhyuk, Marcela Morales Quispe, margaretmz, Matt Conley, Max Pumperla, mbhuiyan, mdfaijul, Meng, Peng, Michael, Michael Gielda, mrTsjolder, Muhammad Wildan, neargye, Nehal J Wani, NEWPLAN, Niranjan Hasabnis, Nutti, olicht, Pan Daoxin, Pedro Monreal, Peng Yu, pillarpond, Pooya Davoodi, qiezi, Rholais Lii, Richard Yu, Rin Arakaki, Roger Iyengar, sahilbadyal, Sami Kama, Sandip Giri, Scott Leishman, Serge Panev, Seunghoon Park, Shafi Dayatar, shengfuintel, Shimin Guo, Siju, silent567, Stefan Dyulgerov, steven, Tao Wei, Thor Johnsen, Tingbo Lu, tomguluson92, Tongxuan Liu, Trevor Morris, Ubuntu, Vadim Borisov, vanderliang, wangsiyu, Wen Yun, Wen-Heng (Jack) Chung, wenxizhu, William D. Irons, Xiaoming (Jason) Cui, Yan Facai (颜发才), Yanbo Liang, Yaniv Blumenfeld, Yash Gaurkar, Yicheng Fan, Yong Tang, Yongjoon Lee, Yuan (Terry) Tang, Yuxin Wu, zldrobit
+
 # Release 1.12.0
 
 ## Major Features and Improvements
diff --git a/WORKSPACE b/WORKSPACE
index 957b8d8528dc9b5e2ea134921b28601aa6fed2d1..ef44c25553cc9ea00a6d73d89a7b9c39481efbdc 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -29,7 +29,7 @@ load(
 bazel_toolchains_repositories()
 
 load(
-    "@io_bazel_rules_docker//container:container.bzl",
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
     container_repositories = "repositories",
 )
 
@@ -43,29 +43,47 @@ remote_config_workspace()
 # Apple and Swift rules.
 http_archive(
     name = "build_bazel_rules_apple",
-    sha256 = "4fe4ee824200b48821730f89ff260984332dc3551db587c24691235d1d96a8a7",
-    strip_prefix = "rules_apple-0.10.0",
-    urls = ["https://github.com/bazelbuild/rules_apple/archive/0.10.0.tar.gz"],
+    sha256 = "4b90786009fa8df25230442244bad2832ba8d6bc4987f68150a7de59c8827e90",
+    strip_prefix = "rules_apple-0.14.0",
+    urls = ["https://github.com/bazelbuild/rules_apple/archive/0.14.0.tar.gz"],
 )
-http_archive(
-    name = "build_bazel_rules_swift",
-    sha256 = "6544ff5615febec0342de1127144d2f3e43ea80fb7f9b1ade65e6a184e39e618",
-    strip_prefix = "rules_swift-0.5.0",
-    urls = ["https://github.com/bazelbuild/rules_swift/archive/0.5.0.tar.gz"],
+http_file(
+    name = "xctestrunner",
+    executable = 1,
+    urls = ["https://github.com/google/xctestrunner/releases/download/0.2.6/ios_test_runner.par"],
 )
+
 http_archive(
     name = "bazel_skylib",
-    sha256 = "eb5c57e4c12e68c0c20bc774bfbc60a568e800d025557bc4ea022c6479acc867",
-    strip_prefix = "bazel-skylib-0.6.0",
-    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/0.6.0.tar.gz"],
+    sha256 = "2c62d8cd4ab1e65c08647eb4afe38f51591f43f7f0885e7769832fa137633dcb",
+    strip_prefix = "bazel-skylib-0.7.0",
+    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/0.7.0.tar.gz"],
 )
-http_file(
-    name = "xctestrunner",
-    executable = 1,
-    urls = ["https://github.com/google/xctestrunner/releases/download/0.2.5/ios_test_runner.par"],
+
+http_archive(
+    name = "build_bazel_apple_support",
+    sha256 = "835663c4bb02f4bf01dce8a2a176df7fa682dbb867d3698ae12258c1628bb8f0",
+    strip_prefix = "apple_support-0.5.0",
+    urls = ["https://github.com/bazelbuild/apple_support/archive/0.5.0.tar.gz"],
+)
+
+http_archive(
+    name = "build_bazel_rules_swift",
+    sha256 = "32d124878cd49775d84f59ba90440c8b23b7c775aec8fec1978f751c76ddee8a",
+    strip_prefix = "rules_swift-0.7.0",
+    urls = ["https://github.com/bazelbuild/rules_swift/archive/0.7.0.tar.gz"],
 )
-load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
-apple_rules_dependencies(ignore_version_differences = True)
+
+http_archive(
+    name = "com_github_apple_swift_swift_protobuf",
+    type = "zip",
+    strip_prefix = "swift-protobuf-1.2.0/",
+    urls = ["https://github.com/apple/swift-protobuf/archive/1.2.0.zip"],
+)
+
+# Use swift_rules_dependencies to fetch the tolchains.
+# Since we defined all the "git_repository" rules above, the following call will
+# skip redefining them.
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
 swift_rules_dependencies()
 
@@ -134,4 +152,3 @@ http_archive(
         "http://download.tensorflow.org/models/speech_commands_v0.01.zip",
     ],
 )
-
diff --git a/configure.py b/configure.py
index adc9ef9caca8c0128c63896fdebbbadf7f86da81..fe724c9e6acf4eccda275799b4c23a6b58bfed3b 100644
--- a/configure.py
+++ b/configure.py
@@ -50,11 +50,18 @@ _DEFAULT_PROMPT_ASK_ATTEMPTS = 10
 _TF_BAZELRC_FILENAME = '.tf_configure.bazelrc'
 _TF_WORKSPACE_ROOT = ''
 _TF_BAZELRC = ''
+_TF_CURRENT_BAZEL_VERSION = None
 
 NCCL_LIB_PATHS = [
     'lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''
 ]
 
+# List of files to be configured for using Bazel on Apple platforms.
+APPLE_BAZEL_FILES = [
+    'tensorflow/lite/experimental/objc/BUILD',
+    'tensorflow/lite/experimental/swift/BUILD'
+]
+
 if platform.machine() == 'ppc64le':
   _DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/powerpc64le-linux-gnu/'
 else:
@@ -256,6 +263,7 @@ def reset_tf_configure_bazelrc():
   """Reset file that contains customized config settings."""
   open(_TF_BAZELRC, 'w').close()
 
+
 def cleanup_makefile():
   """Delete any leftover BUILD files from the Makefile build.
 
@@ -330,8 +338,8 @@ def get_var(environ_cp,
           'Environment variable %s must be set as a boolean indicator.\n'
           'The following are accepted as TRUE : %s.\n'
           'The following are accepted as FALSE: %s.\n'
-          'Current value is %s.' % (var_name, ', '.join(true_strings),
-                                    ', '.join(false_strings), var))
+          'Current value is %s.' %
+          (var_name, ', '.join(true_strings), ', '.join(false_strings), var))
 
   while var is None:
     user_input_origin = get_input(question)
@@ -764,11 +772,12 @@ def check_ndk_level(android_ndk_home_path):
   else:
     raise Exception('Unable to parse NDK revision.')
   if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
-    print('WARNING: The API level of the NDK in %s is %s, which is not '
-          'supported by Bazel (officially supported versions: %s). Please use '
-          'another version. Compiling Android targets may result in confusing '
-          'errors.\n' % (android_ndk_home_path, ndk_api_level,
-                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+    print(
+        'WARNING: The API level of the NDK in %s is %s, which is not '
+        'supported by Bazel (officially supported versions: %s). Please use '
+        'another version. Compiling Android targets may result in confusing '
+        'errors.\n' %
+        (android_ndk_home_path, ndk_api_level, _SUPPORTED_ANDROID_NDK_VERSIONS))
   return ndk_api_level
 
 
@@ -785,8 +794,7 @@ def set_gcc_host_compiler_path(environ_cp):
       environ_cp,
       var_name='GCC_HOST_COMPILER_PATH',
       var_default=default_gcc_host_compiler_path,
-      ask_for_var=
-      'Please specify which gcc should be used by nvcc as the host compiler.',
+      ask_for_var='Please specify which gcc should be used by nvcc as the host compiler.',
       check_success=os.path.exists,
       error_msg='Invalid gcc path. %s cannot be found.',
   )
@@ -1224,8 +1232,8 @@ def set_tf_nccl_install_path(environ_cp):
       # Reset and Retry
       print(
           'Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
-          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
-                                              nccl_hdr_path))
+          'O/S agnostic package of NCCL 2' %
+          (tf_nccl_version, nccl_lib_path, nccl_hdr_path))
 
       environ_cp['TF_NCCL_VERSION'] = ''
   else:
@@ -1237,6 +1245,7 @@ def set_tf_nccl_install_path(environ_cp):
   environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
   write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
 
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1273,13 +1282,15 @@ def set_tf_cuda_compute_capabilities(environ_cp):
 
     ask_cuda_compute_capabilities = (
         'Please specify a list of comma-separated '
-        'Cuda compute capabilities you want to '
+        'CUDA compute capabilities you want to '
         'build with.\nYou can find the compute '
         'capability of your device at: '
         'https://developer.nvidia.com/cuda-gpus.\nPlease'
         ' note that each additional compute '
         'capability significantly increases your '
-        'build time and binary size. [Default is: %s]: ' %
+        'build time and binary size, and that '
+        'TensorFlow only supports compute '
+        'capabilities >= 3.5 [Default is: %s]: ' %
         default_cuda_compute_capabilities)
     tf_cuda_compute_capabilities = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES',
@@ -1292,13 +1303,18 @@ def set_tf_cuda_compute_capabilities(environ_cp):
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
       if not m:
-        print('Invalid compute capability: ' % compute_capability)
+        print('Invalid compute capability: %s' % compute_capability)
         all_valid = False
       else:
-        ver = int(m.group(0).split('.')[0])
-        if ver < 3:
-          print('Only compute capabilities 3.0 or higher are supported.')
+        ver = float(m.group(0))
+        if ver < 3.0:
+          print('ERROR: TensorFlow only supports CUDA compute capabilities 3.0 '
+                'and higher. Please re-specify the list of compute '
+                'capabilities excluding version %s.' % ver)
           all_valid = False
+        if ver < 3.5:
+          print('WARNING: XLA does not support CUDA compute capabilities '
+                'lower than 3.5. Disable XLA when running on older GPUs.')
 
     if all_valid:
       break
@@ -1482,7 +1498,36 @@ def set_other_mpi_vars(environ_cp):
   else:
     raise ValueError(
         'Cannot find the MPI library file in %s/lib or %s/lib64 or %s/lib32' %
-        mpi_home, mpi_home, mpi_home)
+        (mpi_home, mpi_home, mpi_home))
+
+
+def system_specific_test_config(env):
+  """Add default test flags required for TF tests to bazelrc."""
+  write_to_bazelrc('test --flaky_test_attempts=3')
+  write_to_bazelrc('test --test_size_filters=small,medium')
+  write_to_bazelrc(
+      'test --test_tag_filters=-benchmark-test,-no_oss,-oss_serial')
+  write_to_bazelrc('test --build_tag_filters=-benchmark-test,-no_oss')
+  if is_windows():
+    if env.get('TF_NEED_CUDA', None) == '1':
+      write_to_bazelrc(
+          'test --test_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+      write_to_bazelrc(
+          'test --build_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-no_windows,-gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_windows,-gpu')
+  elif is_macos():
+    write_to_bazelrc('test --test_tag_filters=-gpu,-nomac,-no_mac')
+    write_to_bazelrc('test --build_tag_filters=-gpu,-nomac,-no_mac')
+  elif is_linux():
+    if env.get('TF_NEED_CUDA', None) == '1':
+      write_to_bazelrc('test --test_tag_filters=-no_gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_gpu')
+      write_to_bazelrc('test --test_env=LD_LIBRARY_PATH')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-gpu')
+      write_to_bazelrc('test --build_tag_filters=-gpu')
 
 
 def set_system_libs_flag(environ_cp):
@@ -1508,15 +1553,14 @@ def set_windows_build_flags(environ_cp):
   write_to_bazelrc('build --config monolithic')
   # Suppress warning messages
   write_to_bazelrc('build --copt=-w --host_copt=-w')
+  # Fix winsock2.h conflicts
+  write_to_bazelrc(
+      'build --copt=-DWIN32_LEAN_AND_MEAN --host_copt=-DWIN32_LEAN_AND_MEAN')
   # Output more verbose information when something goes wrong
   write_to_bazelrc('build --verbose_failures')
   # The host and target platforms are the same in Windows build. So we don't
   # have to distinct them. This avoids building the same targets twice.
   write_to_bazelrc('build --distinct_host_configuration=false')
-  # Enable short object file path to avoid long path issue on Windows.
-  # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0
-  # Short object file path will be enabled by default.
-  write_to_bazelrc('build --experimental_shortened_obj_file_path=true')
 
   if get_var(
       environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline',
@@ -1537,9 +1581,30 @@ def config_info_line(name, help_text):
   print('\t--config=%-12s\t# %s' % (name, help_text))
 
 
+def configure_apple_bazel_rules():
+  """Configures Bazel rules for building on Apple platforms.
+
+  Enables analyzing and building Apple Bazel rules on Apple platforms. This
+  function will only be executed if `is_macos()` is true.
+  """
+  if not is_macos():
+    return
+  for filepath in APPLE_BAZEL_FILES:
+    print(
+        'Configuring %s file to analyze and build Bazel rules on Apple platforms.'
+        % filepath)
+    existing_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath + '.apple')
+    renamed_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath)
+    os.rename(existing_filepath, renamed_filepath)
+  if _TF_CURRENT_BAZEL_VERSION is None or _TF_CURRENT_BAZEL_VERSION < 23000:
+    print(
+        'Building Bazel rules on Apple platforms requires Bazel 0.23 or later.')
+
+
 def main():
   global _TF_WORKSPACE_ROOT
   global _TF_BAZELRC
+  global _TF_CURRENT_BAZEL_VERSION
 
   parser = argparse.ArgumentParser()
   parser.add_argument(
@@ -1556,7 +1621,8 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.19.0', '0.21.0')
+  current_bazel_version = check_bazel_version('0.19.0', '0.23.2')
+  _TF_CURRENT_BAZEL_VERSION = convert_version_to_int(current_bazel_version)
 
   reset_tf_configure_bazelrc()
 
@@ -1577,6 +1643,8 @@ def main():
 
   if is_macos():
     environ_cp['TF_NEED_TENSORRT'] = '0'
+  else:
+    environ_cp['TF_CONFIGURE_APPLE_BAZEL_RULES'] = '0'
 
   # The numpy package on ppc64le uses OpenBLAS which has multi-threading
   # issues that lead to incorrect answers.  Set OMP_NUM_THREADS=1 at
@@ -1679,6 +1747,16 @@ def main():
     create_android_ndk_rule(environ_cp)
     create_android_sdk_rule(environ_cp)
 
+  system_specific_test_config(os.environ)
+
+  if get_var(
+      environ_cp, 'TF_CONFIGURE_APPLE_BAZEL_RULES',
+      'Configure Bazel rules for Apple platforms', False,
+      ('Would you like to configure Bazel rules for building on Apple platforms?'
+      ), 'Configuring Bazel rules for Apple platforms.',
+      'Not configuring Bazel rules for Apple platforms.'):
+    configure_apple_bazel_rules()
+
   print('Preconfigured Bazel build configs. You can use any of the below by '
         'adding "--config=<>" to your build command. See .bazelrc for more '
         'details.')
@@ -1687,8 +1765,10 @@ def main():
   config_info_line('gdr', 'Build with GDR support.')
   config_info_line('verbs', 'Build with libverbs support.')
   config_info_line('ngraph', 'Build with Intel nGraph support.')
-  config_info_line('dynamic_kernels',
-                   '(Experimental) Build kernels into separate shared objects.')
+  config_info_line('numa', 'Build with NUMA support.')
+  config_info_line(
+      'dynamic_kernels',
+      '(Experimental) Build kernels into separate shared objects.')
 
   print('Preconfigured Bazel build configs to DISABLE default on features:')
   config_info_line('noaws', 'Disable AWS S3 filesystem support.')
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 29d71c323ab5ee860ebf48c332cfd7f607f3f0c3..24d34cf9c154c2eec8f840118e704018106c9848 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -16,6 +16,8 @@ exports_files([
 ])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_additional_binary_deps",
@@ -40,12 +42,16 @@ load(
 
 # @unused
 TENSORFLOW_API_INIT_FILES_V2 = (
-    TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+    TENSORFLOW_API_INIT_FILES +
+    get_compat_files(TENSORFLOW_API_INIT_FILES, 2) +
+    get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
 # @unused
-TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = (
-    TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+TENSORFLOW_API_INIT_FILES_V1 = (
+    TENSORFLOW_API_INIT_FILES_V1 +
+    get_compat_files(TENSORFLOW_API_INIT_FILES, 2) +
+    get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
 # Config setting used when building for products
@@ -90,6 +96,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "emscripten",
+    values = {"crosstool_top": "//external:android/emscripten"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "raspberry_pi_armeabi",
     values = {
@@ -135,12 +147,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-config_setting(
-    name = "darwin",
-    values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "windows",
     values = {"cpu": "x64_windows"},
@@ -153,9 +159,18 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "macos",
+    values = {
+        "apple_platform_type": "macos",
+        "cpu": "darwin",
+    },
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "ios",
-    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
+    values = {"apple_platform_type": "ios"},
     visibility = ["//visibility:public"],
 )
 
@@ -294,6 +309,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_numa_support",
+    define_values = {"with_numa_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between framework_shared_object and a bunch of other configurations
 # due to limitations in nested select() statements.
 config_setting(
@@ -343,6 +364,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "using_rocm_hipcc",
+    define_values = {
+        "using_rocm_hipcc": "true",
+    },
+)
+
 config_setting(
     name = "with_mpi_support",
     values = {"define": "with_mpi_support=true"},
@@ -381,16 +409,7 @@ config_setting(
 
 package_group(
     name = "internal",
-    packages = [
-        "-//third_party/tensorflow/python/estimator",
-        "//learning/deepmind/...",
-        "//learning/meta_rank/...",
-        "//tensorflow/...",
-        "//tensorflow_estimator/contrib/...",
-        "//tensorflow_fold/llgtm/...",
-        "//tensorflow_text/...",
-        "//third_party/py/tensor2tensor/...",
-    ],
+    packages = ["//tensorflow/..."],
 )
 
 load(
@@ -451,11 +470,10 @@ tf_cc_shared_object(
     name = "libtensorflow_framework.so",
     framework_so = [],
     linkopts = select({
-        "//tensorflow:darwin": [],
+        "//tensorflow:macos": [],
         "//tensorflow:windows": [],
         "//conditions:default": [
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_framework_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_framework_version_script.lds)",
         ],
     }),
     linkstatic = 1,
@@ -486,21 +504,27 @@ tf_cc_shared_object(
 # symbols in object files.
 
 tf_cc_shared_object(
-    name = "libtensorflow.so",
+    name = "tensorflow",
     linkopts = select({
-        "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/c:exported_symbols.lds)",
+        "//tensorflow:macos": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
-        "//tensorflow:windows": [],
+        "//tensorflow:windows": [
+        ],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/c:version_script.lds)",
         ],
     }),
+    per_os_targets = True,
     visibility = ["//visibility:public"],
+    # add win_def_file for tensorflow
+    win_def_file = select({
+        # We need this DEF file to properly export symbols on Windows
+        "//tensorflow:windows": ":tensorflow_filtered_def_file",
+        "//conditions:default": None,
+    }),
     deps = [
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_experimental",
@@ -512,20 +536,25 @@ tf_cc_shared_object(
 )
 
 tf_cc_shared_object(
-    name = "libtensorflow_cc.so",
+    name = "tensorflow_cc",
     linkopts = select({
-        "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow:tf_exported_symbols.lds)",
+        "//tensorflow:macos": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_version_script.lds)",
         ],
     }),
+    per_os_targets = True,
     visibility = ["//visibility:public"],
+    # add win_def_file for tensorflow_cc
+    win_def_file = select({
+        # We need this DEF file to properly export symbols on Windows
+        "//tensorflow:windows": ":tensorflow_filtered_def_file",
+        "//conditions:default": None,
+    }),
     deps = [
         "//tensorflow:tf_exported_symbols.lds",
         "//tensorflow:tf_version_script.lds",
@@ -539,6 +568,92 @@ tf_cc_shared_object(
     ] + if_ngraph(["@ngraph_tf//:ngraph_tf"]),
 )
 
+# ** Targets for Windows build (start) **
+
+# Build a shared library (DLL) by cc_binary from tf_custom_op_library_additional_deps_impl,
+# it contains all object code from its dependencies.
+# This target is only used for parsing the symbols to be exported in tensorflow.dll.
+# Do NOT depend on it.
+tf_native_cc_binary(
+    name = "tf_custom_op_library_additional_deps.dll",
+    linkshared = 1,
+    linkstatic = 1,
+    deps = tf_custom_op_library_additional_deps_impl(),
+)
+
+# Get a DEF file generated by parsing all object files
+# of tf_custom_op_library_additional_deps.so
+filegroup(
+    name = "tensorflow_def_file",
+    srcs = [":tf_custom_op_library_additional_deps.dll"],
+    output_group = "def_file",
+)
+
+# Filter the DEF file to reduce the number of symbols to 64K or less.
+# Note that we also write the name of the pyd file into DEF file so that
+# the dynamic libraries of custom ops can find it at runtime.
+genrule(
+    name = "tensorflow_filtered_def_file",
+    srcs = [":tensorflow_def_file"],
+    outs = ["tensorflow_filtered_def_file.def"],
+    cmd = select({
+        "//tensorflow:windows": """
+              $(location @local_config_def_file_filter//:def_file_filter) \\
+              --input $(location :tensorflow_def_file) \\
+              --output $@
+          """,
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    tools = ["@local_config_def_file_filter//:def_file_filter"],
+    visibility = ["//visibility:public"],
+)
+
+# The interface library (tensorflow.dll.if.lib) for linking tensorflow DLL library (tensorflow.dll) on Windows.
+# To learn more about import library (called interface library in Bazel):
+#     https://docs.microsoft.com/en-us/cpp/build/linking-an-executable-to-a-dll?view=vs-2017#linking-implicitly
+filegroup(
+    name = "get_tensorflow_dll_import_lib",
+    srcs = ["//tensorflow:tensorflow.dll"],
+    output_group = "interface_library",
+    visibility = ["//visibility:public"],
+)
+
+# Rename the import library for tensorflow.dll from tensorflow.dll.if.lib to tensorflow.lib
+genrule(
+    name = "tensorflow_dll_import_lib",
+    srcs = [":get_tensorflow_dll_import_lib"],
+    outs = ["tensorflow.lib"],
+    cmd = select({
+        "//tensorflow:windows": "cp -f $< $@",
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    visibility = ["//visibility:public"],
+)
+
+# The interface library (tensorflow_cc.dll.if.lib) for linking tensorflow DLL library (tensorflow_cc.dll) on Windows.
+# To learn more about import library (called interface library in Bazel):
+#     https://docs.microsoft.com/en-us/cpp/build/linking-an-executable-to-a-dll?view=vs-2017#linking-implicitly
+filegroup(
+    name = "get_tensorflow_cc_dll_import_lib",
+    srcs = ["//tensorflow:tensorflow_cc.dll"],
+    output_group = "interface_library",
+    visibility = ["//visibility:public"],
+)
+
+# Rename the import library for tensorflow.dll from tensorflow_cc.dll.if.lib to tensorflow.lib
+genrule(
+    name = "tensorflow_cc_dll_import_lib",
+    srcs = [":get_tensorflow_cc_dll_import_lib"],
+    outs = ["tensorflow_cc.lib"],
+    cmd = select({
+        "//tensorflow:windows": "cp -f $< $@",
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    visibility = ["//visibility:public"],
+)
+
+# ** Targets for Windows build (end) **
+
 exports_files(
     [
         "tf_version_script.lds",
@@ -599,13 +714,20 @@ gen_api_init_files(
     name = "tf_python_api_gen_v1",
     srcs = [
         "api_template_v1.__init__.py",
+        "compat_template.__init__.py",
         "compat_template_v1.__init__.py",
     ],
     api_version = 1,
-    compat_api_versions = [1],
-    compat_init_templates = ["compat_template_v1.__init__.py"],
+    compat_api_versions = [
+        1,
+        2,
+    ],
+    compat_init_templates = [
+        "compat_template_v1.__init__.py",
+        "compat_template.__init__.py",
+    ],
     output_dir = "_api/v1/",
-    output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT,
+    output_files = TENSORFLOW_API_INIT_FILES_V1,
     output_package = "tensorflow._api.v1",
     root_file_name = "v1.py",
     root_init_template = "api_template_v1.__init__.py",
@@ -615,11 +737,18 @@ gen_api_init_files(
     name = "tf_python_api_gen_v2",
     srcs = [
         "api_template.__init__.py",
+        "compat_template.__init__.py",
         "compat_template_v1.__init__.py",
     ],
     api_version = 2,
-    compat_api_versions = [1],
-    compat_init_templates = ["compat_template_v1.__init__.py"],
+    compat_api_versions = [
+        1,
+        2,
+    ],
+    compat_init_templates = [
+        "compat_template_v1.__init__.py",
+        "compat_template.__init__.py",
+    ],
     output_dir = "_api/v2/",
     output_files = TENSORFLOW_API_INIT_FILES_V2,
     output_package = "tensorflow._api.v2",
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index a93799bfe84b0f9c4743e1ad0effd6e69ad7f3f2..7bd6b7223989cddfea935f0ed2bcf7536015feea 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -26,14 +26,28 @@ import sys as _sys
 
 # API IMPORTS PLACEHOLDER
 
+# Make sure directory containing top level submodules is in
+# the __path__ so that "from tensorflow.foo import bar" works.
+# We're using bitwise, but there's nothing special about that.
+_API_MODULE = bitwise  # pylint: disable=undefined-variable
+_current_module = _sys.modules[__name__]
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
+if not hasattr(_current_module, '__path__'):
+  __path__ = [_tf_api_dir]
+elif _tf_api_dir not in __path__:
+  __path__.append(_tf_api_dir)
+
 # pylint: disable=g-bad-import-order
 from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg="Limited tf.summary API due to missing TensorBoard installation")
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=(
         'tensorflow_estimator.python.estimator.api._v2.estimator'))
 
-_current_module = _sys.modules[__name__]
 if not hasattr(_current_module, 'estimator'):
   _component_api_helper.package_hook(
       parent_package_str=__name__,
@@ -42,14 +56,6 @@ if not hasattr(_current_module, 'estimator'):
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow.python.keras.api._v2.keras'))
-# Make sure directory containing top level submodules is in
-# the __path__ so that "from tensorflow.foo import bar" works.
-# We're using bitwise, but there's nothing special about that.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
-if not hasattr(_current_module, '__path__'):
-  __path__ = [_tf_api_dir]
-elif _tf_api_dir not in __path__:
-  __path__.append(_tf_api_dir)
 
 # Enable TF2 behaviors
 from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
@@ -111,5 +117,11 @@ try:
 except NameError:
   pass
 
+# Add module aliases
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
+  initializers = keras.initializers
 
 # pylint: enable=undefined-variable
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index eeca8f0d566a6401cb64e4fe3f0ee3c5aeb4ece2..5eb25a81b7f765f551bc4f1b7ba99b35dbc6b7bb 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -70,7 +70,7 @@ _API_MODULE = app  # pylint: disable=undefined-variable
 
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))  # pylint: disable=undefined-variable
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
 if not hasattr(_current_module, '__path__'):
   __path__ = [_tf_api_dir]
 elif _tf_api_dir not in __path__:
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 4e2fe34d28c9363ebba690c5491b258a4dba11b3..00fea495fba80cef49e71f724985a14abb3932da 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -39,14 +39,19 @@ filegroup(
             "python_api.h",
             "*test*",
         ],
-    ),
+    ) + [
+        "//tensorflow/cc:srcs",
+        "//tensorflow/core/distributed_runtime:server_lib.h",
+    ],
     visibility = ["//visibility:public"],
 )
 
 tf_cuda_library(
     name = "c_api_internal",
-    srcs = ["c_api.h"],
-    hdrs = ["c_api_internal.h"],
+    hdrs = [
+        "c_api.h",
+        "c_api_internal.h",
+    ],
     visibility = [
         "//tensorflow:internal",
         "//tensorflow/c:__subpackages__",
@@ -59,6 +64,7 @@ tf_cuda_library(
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
+            "//tensorflow/core:lib_platform",
             "//tensorflow/core:op_gen_lib",
             "//tensorflow/core/distributed_runtime:server_lib",
         ],
@@ -67,22 +73,37 @@ tf_cuda_library(
 
 tf_cuda_library(
     name = "c_api",
-    srcs = [
-        "c_api.cc",
-        "c_api_function.cc",
-    ],
     hdrs = [
         "c_api.h",
     ],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
-    deps = select({
+    deps = [
+        ":c_api_no_xla",
+        ":c_api_internal",
+    ] + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+tf_cuda_library(
+    name = "c_api_no_xla",
+    srcs = [
+        "c_api.cc",
+        "c_api_function.cc",
+    ],
+    hdrs = ["c_api.h"],
+    copts = tf_copts(),
+    visibility = ["//tensorflow/c:__subpackages__"],
+    deps = [":c_api_internal"] + select({
         "//tensorflow:android": [
-            ":c_api_internal",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api_internal",
             "//tensorflow/cc/saved_model:loader_lite",
             "//tensorflow/cc:gradients",
             "//tensorflow/cc:ops",
@@ -97,13 +118,8 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core/distributed_runtime:server_lib",
+            "//tensorflow/core/kernels:logging_ops",
         ],
-    }) + select({
-        "//tensorflow:with_xla_support": [
-            "//tensorflow/compiler/tf2xla:xla_compiler",
-            "//tensorflow/compiler/jit",
-        ],
-        "//conditions:default": [],
     }),
 )
 
@@ -129,6 +145,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:attr_builder",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -155,8 +172,8 @@ tf_cuda_library(
     hdrs = ["tf_status_helper.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":c_api",
         ":c_api_internal",
+        ":c_api_no_xla",
         "//tensorflow/core:lib",
     ],
 )
@@ -212,13 +229,13 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:framework",
@@ -270,7 +287,7 @@ tf_cuda_cc_test(
     ],
     kernels = [":test_op_kernel"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     tags = [
@@ -288,13 +305,23 @@ tf_cuda_cc_test(
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/compiler/jit",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/kernels:array",
@@ -309,7 +336,7 @@ tf_cc_test(
     srcs = ["c_api_experimental_test.cc"],
     data = ["testdata/tf_record"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     # We must ensure that the dependencies can be dynamically linked since
@@ -318,6 +345,7 @@ tf_cc_test(
     deps = [
         ":c_api",
         ":c_api_experimental",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/c/eager:c_api",
         "//tensorflow/c/eager:c_api_test_util",
@@ -334,6 +362,7 @@ tf_cc_test(
     srcs = ["c_api_function_test.cc"],
     deps = [
         ":c_api",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -376,7 +405,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["env_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     tags = ["noasan"],
@@ -397,7 +426,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["kernels_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     tags = ["noasan"],
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 9f2f83920cc73028fd2372afaf303e8b1c1c64f9..bbc16b85429ebaa38e7992878330c04c0bdb7f99 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -20,14 +20,19 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#ifndef __ANDROID__
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"  // NOLINT
+
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/cc/framework/gradients.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/cc/ops/while_loop.h"
 #include "tensorflow/cc/saved_model/loader.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
-#endif
+#include "tensorflow/core/kernels/logging_ops.h"
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/eval_const_tensor.h"
@@ -257,6 +262,74 @@ int64_t TF_Dim(const TF_Tensor* t, int dim_index) {
 size_t TF_TensorByteSize(const TF_Tensor* t) { return t->buffer->size(); }
 void* TF_TensorData(const TF_Tensor* t) { return t->buffer->data(); }
 
+int64_t TF_TensorElementCount(const TF_Tensor* t) {
+  int64_t result = 1;
+  int rank = TF_NumDims(t);
+  for (int dim = 0; dim < rank; ++dim) {
+    result *= TF_Dim(t, dim);
+  }
+  return result;
+}
+
+// Returns the number of elements that would be present in a tensor with the
+// given shape.
+static int64_t ShapeNumElements(const int64_t* dims, int num_dims) {
+  int64_t result = 1;
+  for (int dim = 0; dim < num_dims; ++dim) {
+    result *= dims[dim];
+  }
+  return result;
+}
+
+static void UnrefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Unref();
+  }
+}
+
+static void RefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Ref();
+  }
+}
+
+void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type,
+                          TF_Tensor* to, const int64_t* new_dims,
+                          int num_new_dims, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  size_t in_size = TF_DataTypeSize(TF_TensorType(from));
+  if (in_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor has a zero-sized data type");
+    return;
+  }
+  size_t out_size = TF_DataTypeSize(type);
+  if (out_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "output tensor has a zero-sized data type");
+    return;
+  }
+
+  if (ShapeNumElements(new_dims, num_new_dims) * out_size !=
+      TF_TensorElementCount(from) * in_size) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor is not compatible with output shape");
+    return;
+  }
+
+  tensorflow::TensorShapeProto p;
+  for (int i = 0; i < num_new_dims; ++i) {
+    p.add_dim()->set_size(new_dims[i]);
+  }
+  to->shape = tensorflow::TensorShape(p);
+  to->dtype = type;
+  if (to->buffer != from->buffer) {
+    UnrefIfNonNull(to->buffer);
+    to->buffer = from->buffer;
+    RefIfNonNull(to->buffer);
+  }
+}
+
 // --------------------------------------------------------------------------
 size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
                        size_t dst_len, TF_Status* status) {
@@ -295,7 +368,7 @@ static Status TF_StringDecode_Impl(const char* src, size_t src_len,
 size_t TF_StringDecode(const char* src, size_t src_len, const char** dst,
                        size_t* dst_len, TF_Status* status) {
   status->status = TF_StringDecode_Impl(src, src_len, dst, dst_len);
-  if (!status->status.ok()) return 0;
+  if (TF_GetCode(status) != TF_OK) return 0;
   return static_cast<size_t>(*dst - src) + *dst_len;
 }
 
@@ -350,7 +423,7 @@ TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions* opt,
                                               TF_Status* status) {
   Session* session;
   status->status = NewSession(opt->options, &session);
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     return new TF_DeprecatedSession({session});
   } else {
     DCHECK_EQ(nullptr, session);
@@ -542,7 +615,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
     offsets++;
     const string& s = srcarray(i);
     size_t consumed = TF_StringEncode(s.data(), s.size(), dst, dst_len, status);
-    if (!status->status.ok()) {
+    if (TF_GetCode(status) != TF_OK) {
       status->status = InvalidArgument(
           "invalid string tensor encoding (string #", i, " of ",
           srcarray.size(), "): ", status->status.error_message());
@@ -572,7 +645,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
                       dimvec.size(), base, size, DeleteArray, base);
 }
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in,
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
                        TF_Buffer* out) {
   if (out->data != nullptr) {
     return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
@@ -702,7 +775,7 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
       // TODO(nolivia): check this on a subset of the graph instead of all of
       // it.
       status->status = graph::ValidateGraphHasNoCycle(session->graph->graph);
-      if (!status->status.ok()) {
+      if (TF_GetCode(status) != TF_OK) {
         session->graph->mu.unlock();
         return false;
       }
@@ -722,7 +795,7 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
       *graph_def.mutable_library() = graph.flib_def().ToProto();
       session->graph->mu.unlock();
       status->status = session->session->Extend(graph_def);
-      if (!status->status.ok()) {
+      if (TF_GetCode(status) != TF_OK) {
         // Contract is we always delete input_values[i].
         return false;
       }
@@ -752,7 +825,7 @@ static bool TF_Run_Inputs(TF_Tensor* const* c_inputs,
   const int ninputs = input_pairs->size();
   for (int i = 0; i < ninputs; ++i) {
     status->status = TF_TensorToTensor(c_inputs[i], &(*input_pairs)[i].second);
-    if (!status->status.ok()) return false;
+    if (TF_GetCode(status) != TF_OK) return false;
   }
   return true;
 }
@@ -790,7 +863,7 @@ static void TF_Run_Helper(
     // Serialize back to upstream client, who now owns the new buffer
     if (run_metadata != nullptr) {
       status->status = MessageToBuffer(run_metadata_proto, run_metadata);
-      if (!status->status.ok()) return;
+      if (TF_GetCode(status) != TF_OK) return;
     }
   } else {
     // NOTE(zongheng): PRun does not support RunOptions yet.
@@ -810,7 +883,7 @@ static void TF_Run_Helper(
       continue;
     }
     c_outputs[i] = TF_TensorFromTensor(src, status);
-    if (!status->status.ok()) return;
+    if (TF_GetCode(status) != TF_OK) return;
   }
 }
 
@@ -867,7 +940,7 @@ void TF_PRunSetup(TF_DeprecatedSession* s,
   string new_handle;
   status->status = s->session->PRunSetup(input_names, output_names,
                                          target_oper_names, &new_handle);
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     char* buf = new char[new_handle.size() + 1];
     memcpy(buf, new_handle.c_str(), new_handle.size() + 1);
     *handle = buf;
@@ -906,7 +979,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
   status->status = tensorflow::LoadLibrary(
       library_filename, &lib_handle->lib_handle, &lib_handle->op_list.data,
       &lib_handle->op_list.length);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     delete lib_handle;
     return nullptr;
   }
@@ -1010,7 +1083,7 @@ TensorId ToTensorId(const TF_Output& output) {
   return TensorId(output.oper->node.name(), output.index);
 }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 std::vector<tensorflow::Output> OutputsFromTFOutputs(TF_Output* tf_outputs,
                                                      int n) {
   std::vector<tensorflow::Output> outputs(n);
@@ -1028,7 +1101,7 @@ void TFOutputsFromOutputs(const std::vector<tensorflow::Output>& outputs,
     tf_outputs[i].index = outputs[i].index();
   }
 }
-#endif  // __ANDROID__
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 }  // namespace
 
@@ -1242,6 +1315,13 @@ void TF_SetAttrTypeList(TF_OperationDescription* desc, const char* attr_name,
                      reinterpret_cast<const DataType*>(values), num_values));
 }
 
+void TF_SetAttrPlaceholder(TF_OperationDescription* desc, const char* attr_name,
+                           const char* placeholder) {
+  tensorflow::AttrValue attr_value;
+  attr_value.set_placeholder(placeholder);
+  desc->node_builder.Attr(attr_name, attr_value);
+}
+
 void TF_SetAttrFuncName(TF_OperationDescription* desc, const char* attr_name,
                         const char* value, size_t length) {
   tensorflow::NameAttrList func_name;
@@ -1327,7 +1407,7 @@ void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name,
                       TF_Tensor* value, TF_Status* status) {
   Tensor t;
   status->status = TF_TensorToTensor(value, &t);
-  if (status->status.ok()) desc->node_builder.Attr(attr_name, t);
+  if (TF_GetCode(status) == TF_OK) desc->node_builder.Attr(attr_name, t);
 }
 
 void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name,
@@ -1337,13 +1417,13 @@ void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name,
   std::vector<Tensor> t;
   t.reserve(num_values);
 
-  for (int i = 0; i < num_values && status->status.ok(); ++i) {
+  for (int i = 0; i < num_values && TF_GetCode(status) == TF_OK; ++i) {
     Tensor v;
     status->status = TF_TensorToTensor(values[i], &v);
     t.emplace_back(v);
   }
 
-  if (status->status.ok()) desc->node_builder.Attr(attr_name, t);
+  if (TF_GetCode(status) == TF_OK) desc->node_builder.Attr(attr_name, t);
 }
 
 void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name,
@@ -1391,11 +1471,11 @@ static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc,
     }
     status->status = desc->node_builder.Finalize(&desc->graph->graph, &ret);
 
-    if (status->status.ok()) {
+    if (TF_GetCode(status) == TF_OK) {
       // Run shape inference function for newly added node.
       status->status = desc->graph->refiner.AddNode(ret);
     }
-    if (status->status.ok()) {
+    if (TF_GetCode(status) == TF_OK) {
       // Add the node to the name-to-node mapping.
       desc->graph->name_map[ret->name()] = ret;
     } else if (ret != nullptr) {
@@ -1444,7 +1524,7 @@ int TF_OperationOutputListLength(TF_Operation* oper, const char* arg_name,
   NameRangeMap name_ranges;
   status->status =
       NameRangesForNode(oper->node, oper->node.op_def(), nullptr, &name_ranges);
-  if (!status->status.ok()) return -1;
+  if (TF_GetCode(status) != TF_OK) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
     status->status = InvalidArgument("Input arg '", arg_name, "' not found");
@@ -1466,7 +1546,7 @@ int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name,
   NameRangeMap name_ranges;
   status->status =
       NameRangesForNode(oper->node, oper->node.op_def(), &name_ranges, nullptr);
-  if (!status->status.ok()) return -1;
+  if (TF_GetCode(status) != TF_OK) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
     status->status = InvalidArgument("Input arg '", arg_name, "' not found");
@@ -1564,7 +1644,7 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper,
                                             TF_Status* status) {
   TF_AttrMetadata metadata;
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return metadata;
+  if (TF_GetCode(status) != TF_OK) return metadata;
   switch (attr->value_case()) {
 #define SINGLE_CASE(kK, attr_type, size_expr) \
   case tensorflow::AttrValue::kK:             \
@@ -1671,7 +1751,7 @@ void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name,
                                void* value, size_t max_length,
                                TF_Status* status) {
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   if (attr->value_case() != tensorflow::AttrValue::kS) {
     status->status =
         InvalidArgument("Attribute '", attr_name, "' is not a string");
@@ -1689,7 +1769,7 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name,
                                    int max_values, void* storage,
                                    size_t storage_size, TF_Status* status) {
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   if (attr->value_case() != tensorflow::AttrValue::kList) {
     status->status =
         InvalidArgument("Value for '", attr_name, "' is not a list");
@@ -1722,7 +1802,7 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name,
   void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \
                   int max_values, TF_Status* status) {                       \
     const auto* attr = GetAttrValue(oper, attr_name, status);                \
-    if (!status->status.ok()) return;                                        \
+    if (TF_GetCode(status) != TF_OK) return;                                 \
     if (attr->value_case() != tensorflow::AttrValue::kList) {                \
       status->status =                                                       \
           InvalidArgument("Value for '", attr_name, "' is not a list.");     \
@@ -1744,7 +1824,7 @@ void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
   PartialTensorShape shape;
   status->status =
       tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shape);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   auto len = std::min(shape.dims(), num_dims);
   for (int i = 0; i < len; ++i) {
     value[i] = shape.dim_size(i);
@@ -1758,7 +1838,7 @@ void TF_OperationGetAttrShapeList(TF_Operation* oper, const char* attr_name,
   std::vector<PartialTensorShape> shapes;
   status->status =
       tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shapes);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   auto len = std::min(static_cast<int>(shapes.size()), max_values);
   int64_t* p = storage;
   int storage_left = storage_size;
@@ -1786,7 +1866,7 @@ void TF_OperationGetAttrTensorShapeProto(TF_Operation* oper,
                                          const char* attr_name,
                                          TF_Buffer* value, TF_Status* status) {
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   if (attr->value_case() != tensorflow::AttrValue::kShape) {
     status->status =
         InvalidArgument("Value for '", attr_name, "' is not a shape.");
@@ -1800,7 +1880,7 @@ void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper,
                                              TF_Buffer** values, int max_values,
                                              TF_Status* status) {
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   if (attr->value_case() != tensorflow::AttrValue::kList) {
     status->status =
         InvalidArgument("Value for '", attr_name, "' is not a list");
@@ -1810,7 +1890,7 @@ void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper,
   for (int i = 0; i < len; ++i) {
     values[i] = TF_NewBuffer();
     status->status = MessageToBuffer(attr->list().shape(i), values[i]);
-    if (!status->status.ok()) {
+    if (TF_GetCode(status) != TF_OK) {
       // Delete everything allocated to far, the operation has failed.
       for (int j = 0; j <= i; ++j) {
         TF_DeleteBuffer(values[j]);
@@ -1825,7 +1905,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
   *value = nullptr;
   Tensor t;
   status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   *value = TF_TensorFromTensor(t, status);
 }
 
@@ -1834,7 +1914,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
                                    TF_Status* status) {
   std::vector<Tensor> ts;
   status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &ts);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   const auto len = std::min(max_values, static_cast<int>(ts.size()));
   for (int i = 0; i < len; ++i) {
     values[i] = TF_TensorFromTensor(ts[i], status);
@@ -1845,7 +1925,7 @@ void TF_OperationGetAttrValueProto(TF_Operation* oper, const char* attr_name,
                                    TF_Buffer* output_attr_value,
                                    TF_Status* status) {
   const auto* attr = GetAttrValue(oper, attr_name, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   status->status = MessageToBuffer(*attr, output_attr_value);
 }
 
@@ -1923,7 +2003,7 @@ void TF_GraphGetOpDef(TF_Graph* graph, const char* op_name,
   {
     mutex_lock l(graph->mu);
     status->status = graph->graph.op_registry()->LookUpOpDef(op_name, &op_def);
-    if (!status->status.ok()) return;
+    if (TF_GetCode(status) != TF_OK) return;
   }
   status->status = MessageToBuffer(*op_def, output_op_def);
 }
@@ -2041,7 +2121,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def,
   tensorflow::ImportGraphDefResults results;
   status->status = tensorflow::ImportGraphDef(opts->opts, def, &graph->graph,
                                               &graph->refiner, &results);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
 
   // Add new nodes to name_map
   for (int i = last_node_id; i < graph->graph.num_node_ids(); ++i) {
@@ -2095,7 +2175,7 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults(
   auto results = new TF_ImportGraphDefResults();
   mutex_lock l(graph->mu);
   GraphImportGraphDefLocked(graph, def, options, results, status);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     delete results;
     return nullptr;
   }
@@ -2143,7 +2223,7 @@ void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def,
 
 namespace {
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 // Creates a placeholder representing an input to the cond or body graph.
 // TODO(skyewm): remove these from final graph
@@ -2153,7 +2233,7 @@ bool CreateInput(const TF_Output& parent_input, TF_Graph* g, const char* name,
   TF_SetAttrType(desc, "dtype", TF_OperationOutputType(parent_input));
   // TODO(skyewm): set placeholder shape
   TF_Operation* oper = TF_FinishOperation(desc, status);
-  if (!status->status.ok()) return false;
+  if (TF_GetCode(status) != TF_OK) return false;
   *input = {oper, 0};
   return true;
 }
@@ -2237,7 +2317,7 @@ bool ValidateInputWhileParams(const TF_WhileParams& params, TF_Status* s) {
   return true;
 }
 
-#endif  // __ANDROID__
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 void FreeWhileResources(const TF_WhileParams* params) {
   TF_DeleteGraph(params->cond_graph);
@@ -2256,9 +2336,9 @@ TF_WhileParams EmptyWhileParams() {
 
 TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, int ninputs,
                            TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Creating while loops is not supported in Android. File a bug at "
+      "Creating while loops is not supported on mobile. File a bug at "
       "https://github.com/tensorflow/tensorflow/issues if this feature is "
       "important to you");
   return EmptyWhileParams();
@@ -2298,15 +2378,15 @@ TF_WhileParams TF_NewWhile(TF_Graph* g, TF_Output* inputs, int ninputs,
   TF_WhileParams params = {ninputs,    cond_graph,  cond_inputs,  cond_output,
                            body_graph, body_inputs, body_outputs, name};
 
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     FreeWhileResources(&params);
     return EmptyWhileParams();
   }
   return params;
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 namespace {
 
 // TODO(skyewm): make nodes in while loop unfetchable like in Python version
@@ -2381,13 +2461,13 @@ void TF_FinishWhileHelper(const TF_WhileParams* params, TF_Status* status,
 }
 
 }  // namespace
-#endif  // __ANDROID__
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 void TF_FinishWhile(const TF_WhileParams* params, TF_Status* status,
                     TF_Output* outputs) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Creating while loops is not supported in Android. File a bug at "
+      "Creating while loops is not supported on mobile. File a bug at "
       "https://github.com/tensorflow/tensorflow/issues if this feature is "
       "important to you");
 #else
@@ -2395,7 +2475,7 @@ void TF_FinishWhile(const TF_WhileParams* params, TF_Status* status,
   if (!ValidateConstWhileParams(*params, status)) return;
   TF_FinishWhileHelper(params, status, outputs);
   FreeWhileResources(params);
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 void TF_AbortWhile(const TF_WhileParams* params) { FreeWhileResources(params); }
@@ -2408,9 +2488,9 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
 void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, TF_Output* y,
                                int ny, TF_Output* x, int nx, TF_Output* dx,
                                TF_Status* status, TF_Output* dy) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Adding gradients is not supported in Android. File a bug at "
+      "Adding gradients is not supported on mobile. File a bug at "
       "https://github.com/tensorflow/tensorflow/issues if this feature is "
       "important to you");
 #else
@@ -2490,7 +2570,7 @@ void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, TF_Output* y,
 
   // Unpack the results from grad_outputs_arg.
   TFOutputsFromOutputs(dy_arg, dy);
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 // TF_Session functions ----------------------------------------------
@@ -2502,7 +2582,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
                           TF_Status* status) {
   Session* session;
   status->status = NewSession(opt->options, &session);
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
@@ -2519,11 +2599,11 @@ TF_Session* TF_LoadSessionFromSavedModel(
     const TF_SessionOptions* session_options, const TF_Buffer* run_options,
     const char* export_dir, const char* const* tags, int tags_len,
     TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status) {
-// TODO(ashankar): Remove the __ANDROID__ guard. This will require ensuring that
-// the tensorflow/cc/saved_model:loader build target is Android friendly.
-#ifdef __ANDROID__
+// TODO(sjr): Remove the IS_MOBILE_PLATFORM guard. This will require ensuring
+// that the tensorflow/cc/saved_model:loader build target is mobile friendly.
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Loading a SavedModel is not supported in Android. File a bug at "
+      "Loading a SavedModel is not supported on mobile. File a bug at "
       "https://github.com/tensorflow/tensorflow/issues if this feature is "
       "important to you");
   return nullptr;
@@ -2550,7 +2630,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
   status->status =
       tensorflow::LoadSavedModel(session_options->options, run_options_proto,
                                  export_dir, tag_set, &bundle);
-  if (!status->status.ok()) return nullptr;
+  if (TF_GetCode(status) != TF_OK) return nullptr;
 
   // Create a TF_Graph from the MetaGraphDef. This is safe as long as Session
   // extends using GraphDefs. The Graph instance is different, but equivalent
@@ -2567,7 +2647,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   if (meta_graph_def != nullptr) {
     status->status = MessageToBuffer(bundle.meta_graph_def, meta_graph_def);
-    if (!status->status.ok()) return nullptr;
+    if (TF_GetCode(status) != TF_OK) return nullptr;
   }
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
@@ -2575,7 +2655,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
   graph->sessions[session] = "";
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 void TF_CloseSession(TF_Session* s, TF_Status* status) {
@@ -2667,7 +2747,7 @@ void TF_SessionPRunSetup(TF_Session* session, const TF_Output* inputs,
   string new_handle;
   status->status = session->session->PRunSetup(input_names, output_names,
                                                target_names, &new_handle);
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     char* buf = new char[new_handle.size() + 1];
     memcpy(buf, new_handle.c_str(), new_handle.size() + 1);
     *handle = buf;
@@ -2729,9 +2809,9 @@ unsigned char TF_TryEvaluateConstant(TF_Graph* graph, TF_Output output,
       tensor, graph->refiner, *graph->graph.op_registry(),
       graph->graph.versions().producer(), &evaluated, &result_tensor);
   if (evaluated) {
-    DCHECK(status->status.ok());
+    DCHECK(TF_GetCode(status) == TF_OK);
     *result = TF_TensorFromTensor(result_tensor, status);
-    if (!status->status.ok()) evaluated = false;
+    if (TF_GetCode(status) != TF_OK) evaluated = false;
   }
   return evaluated;
 }
@@ -2750,9 +2830,9 @@ void TF_DeleteApiDefMap(TF_ApiDefMap* apimap) { delete apimap; }
 
 void TF_ApiDefMapPut(TF_ApiDefMap* api_def_map, const char* text,
                      size_t text_len, TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "ApiDefMap is not supported in Android.");
+      "ApiDefMap is not supported on mobile.");
 #else
   mutex_lock l(api_def_map->lock);
   if (api_def_map->update_docs_called) {
@@ -2763,14 +2843,14 @@ void TF_ApiDefMapPut(TF_ApiDefMap* api_def_map, const char* text,
   }
   string api_def_text(text, text_len);
   status->status = api_def_map->api_def_map.LoadApiDef(api_def_text);
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
                            size_t name_len, TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "ApiDefMap is not supported in Android.");
+      "ApiDefMap is not supported on mobile.");
   return nullptr;
 #else
   mutex_lock l(api_def_map->lock);
@@ -2786,19 +2866,19 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
 
   TF_Buffer* ret = TF_NewBuffer();
   status->status = MessageToBuffer(*api_def, ret);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     TF_DeleteBuffer(ret);
     return nullptr;
   }
   return ret;
-#endif  // __ANDROID__
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status) {
   tensorflow::KernelList kernel_list = tensorflow::GetAllRegisteredKernels();
   TF_Buffer* ret = TF_NewBuffer();
   status->status = MessageToBuffer(kernel_list, ret);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     TF_DeleteBuffer(ret);
     return nullptr;
   }
@@ -2810,7 +2890,7 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
       tensorflow::GetRegisteredKernelsForOp(name);
   TF_Buffer* ret = TF_NewBuffer();
   status->status = MessageToBuffer(kernel_list, ret);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     TF_DeleteBuffer(ret);
     return nullptr;
   }
@@ -2819,16 +2899,16 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
 
 // TF_Server functions ----------------------------------------------
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 TF_Server::TF_Server(std::unique_ptr<tensorflow::ServerInterface> server)
     : target(server->target()), server(std::move(server)) {}
-#endif  // __ANDROID__
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 TF_Server* TF_NewServer(const void* proto, size_t proto_len,
                         TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Server functionality is not supported in Android");
+      "Server functionality is not supported on mobile");
   return nullptr;
 #else
   tensorflow::ServerDef server_def;
@@ -2840,41 +2920,41 @@ TF_Server* TF_NewServer(const void* proto, size_t proto_len,
 
   std::unique_ptr<tensorflow::ServerInterface> out_server;
   status->status = tensorflow::NewServer(server_def, &out_server);
-  if (!status->status.ok()) return nullptr;
+  if (TF_GetCode(status) != TF_OK) return nullptr;
 
   return new TF_Server(std::move(out_server));
-#endif
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 void TF_ServerStart(TF_Server* server, TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Server functionality is not supported in Android");
+      "Server functionality is not supported on mobile");
 #else
   status->status = server->server->Start();
-#endif
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 void TF_ServerStop(TF_Server* server, TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Server functionality is not supported in Android");
+      "Server functionality is not supported on mobile");
 #else
   status->status = server->server->Stop();
-#endif
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 void TF_ServerJoin(TF_Server* server, TF_Status* status) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   status->status = tensorflow::errors::Unimplemented(
-      "Server functionality is not supported in Android");
+      "Server functionality is not supported on mobile");
 #else
   status->status = server->server->Join();
-#endif
+#endif  // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
 }
 
 const char* TF_ServerTarget(TF_Server* server) {
-#ifdef __ANDROID__
+#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD)
   return nullptr;
 #else
   return server->target.c_str();
@@ -2882,8 +2962,15 @@ const char* TF_ServerTarget(TF_Server* server) {
 }
 
 void TF_DeleteServer(TF_Server* server) {
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
   delete server;
-#endif
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 }
+
+void TF_RegisterLogListener(void (*listener)(const char*)) {
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
+  tensorflow::logging::RegisterListener(listener);
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
+}
+
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index c7abba85521fccec07983cd5ab4f94a8368d6181..051de3a7dc0f8c630b6c81d2cfa960e5279c93c0 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -272,6 +272,39 @@ TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*);
 // Return a pointer to the underlying data buffer.
 TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*);
 
+// Returns the number of elements in the tensor.
+TF_CAPI_EXPORT extern int64_t TF_TensorElementCount(const TF_Tensor* tensor);
+
+// Copy the internal data representation of `from` to `to`. `new_dims` and
+// `num_new_dims` specify the new shape of the `to` tensor, `type` specifies its
+// data type. On success, *status is set to TF_OK and the two tensors share the
+// same data buffer.
+//
+// This call requires that the `from` tensor and the given type and shape (dims
+// and num_dims) are "compatible" (i.e. they occupy the same number of bytes).
+// Specifically, given from_type_size = TF_DataTypeSize(TF_TensorType(from)):
+//
+// ShapeElementCount(dims, num_dims) * TF_DataTypeSize(type)
+//
+// must equal
+//
+// TF_TensorElementCount(from) * from_type_size
+//
+// where TF_ShapeElementCount would be the number of elements in a tensor with
+// the given shape.
+//
+// In addition, this function requires:
+//   * TF_DataTypeSize(TF_TensorType(from)) != 0
+//   * TF_DataTypeSize(type) != 0
+//
+// If any of the requirements are not met, *status is set to
+// TF_INVALID_ARGUMENT.
+TF_CAPI_EXPORT extern void TF_TensorBitcastFrom(const TF_Tensor* from,
+                                                TF_DataType type, TF_Tensor* to,
+                                                const int64_t* new_dims,
+                                                int num_new_dims,
+                                                TF_Status* status);
+
 // --------------------------------------------------------------------------
 // Encode the string `src` (`src_len` bytes long) into `dst` in the format
 // required by TF_STRING tensors. Does not write to memory more than `dst_len`
@@ -516,6 +549,10 @@ TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
                                               const char* attr_name,
                                               const TF_DataType* values,
                                               int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrPlaceholder(TF_OperationDescription* desc,
+                                                 const char* attr_name,
+                                                 const char* placeholder);
+
 // Set a 'func' attribute to the specified name.
 // `value` must point to a string of length `length` bytes.
 TF_CAPI_EXPORT extern void TF_SetAttrFuncName(TF_OperationDescription* desc,
@@ -1277,6 +1314,28 @@ TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction(
     int noutputs, const TF_Output* outputs, const char* const* output_names,
     const TF_FunctionOptions* opts, const char* description, TF_Status* status);
 
+// Similar to TF_GraphToFunction but allows specifying control outputs of the
+// function.
+//
+//  The arguments of TF_GraphToFunction have the same meaning, but the new
+//  arguments are as follows:
+//
+//    ncontrol_outputs: Number of control outputs of the function.
+//    control_outputs: vector of TF_Operation objects to be marked as control
+//      outputs of the function. Operations marked as control outputs are
+//      guaranteed to execute.
+//    control_output_names: Optional. If not nullptr, vector of strings, one
+//      per control output, with their names to be added to the function's
+//      OpDef.
+TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status);
+
 // Returns the name of the graph function.
 // The return value points to memory that is only usable until the next
 // mutation to *func.
@@ -1710,6 +1769,14 @@ TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server);
 // it will be stopped and joined.
 TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
 
+// Register a listener method that processes printed messages.
+//
+// If any listeners are registered, the print operator will call all listeners
+// with the printed messages and immediately return without writing to the
+// logs.
+TF_CAPI_EXPORT extern void TF_RegisterLogListener(
+    void (*listener)(const char*));
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index f04b285037dff403428ed74fe90eac60339fe36b..7ff4084decc686b067226ecaecf2af29d51d42f2 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api.h"
@@ -128,6 +129,14 @@ const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   return ret;
 }
 
+char* TF_FunctionDebugString(TF_Function* func, size_t* len) {
+  const auto& debug_str = func->fdef.DebugString();
+  *len = debug_str.size();
+  char* ret = static_cast<char*>(malloc(*len + 1));
+  memcpy(ret, debug_str.c_str(), *len + 1);
+  return ret;
+}
+
 // On success, returns a set of TF_Function instances from `text_proto` of
 // GraphDef type. These functions must be deleted by calling TF_DeleteFunction.
 //
@@ -8737,6 +8746,12 @@ static void CheckOk(TF_Status* status) {
 
 void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   auto* status = TF_NewStatus();
+  if (!TFE_TensorHandleIsConcrete(handle)) {
+    VLOG(1) << "Symbolic tensor: " << handle;
+    TF_DeleteStatus(status);
+    return;
+  }
+
   TF_Tensor* t = TFE_TensorHandleResolve(handle, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
@@ -8748,6 +8763,11 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
+void TFE_OpPrintDebugString(TFE_Op* op) {
+  VLOG(1) << "TFE_OpPrintDebugString() over " << op;
+  LOG(INFO) << op->operation.DebugString();
+}
+
 struct TFE_ExecuteOpNotification {
   TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
   tensorflow::Notification n;
@@ -8941,3 +8961,189 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
   }
   status->status = EnableCollectiveOps(server_def, ctx);
 }
+
+std::string tensorflow::getTF_OutputDebugString(TF_Output node) {
+  return absl::Substitute("TF_Output($0, $1)", node.oper, node.index);
+}
+
+using tensorflow::getTF_OutputDebugString;
+
+TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(TF_Output t,
+                                                  TF_DataType dtype) {
+  auto ret = new TFE_TensorHandle(t, dtype);
+  VLOG(1) << "Storing TFOutput " << getTF_OutputDebugString(t)
+          << " into tensor handle " << ret << " with internal handle "
+          << ret->handle;
+  return ret;
+}
+
+unsigned char TFE_TensorHandleIsConcrete(TFE_TensorHandle* handle) {
+  assert(handle->handle != nullptr);
+  return handle->handle->getSymbolicTensor() == nullptr;
+}
+
+TF_Output TFE_GetTFOutputFromTensorHandle(TFE_TensorHandle* handle,
+                                          TF_Status* status) {
+  if (TFE_TensorHandleIsConcrete(handle)) {
+    status->status =
+        tensorflow::errors::Internal("Not a symbolic tensor: ", handle);
+    return TF_Output{nullptr, -1};
+  }
+
+  auto* sym_tensor = handle->handle->getSymbolicTensor();
+  CHECK(sym_tensor != nullptr);
+  auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+  VLOG(1) << "Retrieving " << getTF_OutputDebugString(ret)
+          << " from tensor handle " << handle;
+  CHECK_GE(sym_tensor->index, 0);
+  return ret;
+}
+
+TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph) {
+  return new TFE_TraceContext(graph);
+}
+
+void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx) { delete trace_ctx; }
+
+// If `handle` is already symbolic, return it. Otherwise map it to a new
+// symbolic tensor (a PlaceHolder op) and return that.
+static TF_Output getOrCreateSymbolicTensor(TFE_TraceContext* trace_ctx,
+                                           tensorflow::TensorHandle* handle,
+                                           TF_Status* status) {
+  VLOG(1) << "Getting symbolic tensor for input tensor handle " << handle
+          << ": " << handle->DebugString();
+
+  auto* sym_tensor = handle->getSymbolicTensor();
+  if (sym_tensor != nullptr) {
+    auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+    VLOG(1) << "This handle is a symbolic tensor " << sym_tensor << ": "
+            << getTF_OutputDebugString(ret);
+    return ret;
+  }
+
+  auto find_it = trace_ctx->input_tensor_map.find(handle);
+  if (find_it != trace_ctx->input_tensor_map.end()) {
+    VLOG(1) << "There exists a map entry from this concrete tensor to: "
+            << getTF_OutputDebugString(find_it->second);
+    return find_it->second;
+  }
+
+  auto node_name = tensorflow::strings::StrCat("additional_input_",
+                                               trace_ctx->node_counter++);
+  VLOG(1) << "Adding a place holder node named " << node_name;
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, "Placeholder", node_name.c_str());
+  TF_SetAttrType(desc, "dtype",
+                 static_cast<TF_DataType>(handle->dtype) /*TF_FLOAT*/);
+  auto* result = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) {
+    return TF_Output{nullptr, -1};
+  }
+
+  auto ret = TF_Output{result, 0};
+  VLOG(1) << "Creating a new map entry to map to: "
+          << getTF_OutputDebugString(ret);
+  trace_ctx->input_tensor_map[handle] = ret;
+  // `handle` could be destroyed before it's read from `input_tensor_map` (say
+  // during a subsequent TFE_FinalizeInputTensorsFromTraceContext() call), so we
+  // increment its ref count to extend its life span to that of `trace_ctx`.
+  handle->Ref();
+  VLOG(1) << "Ref count for handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  return ret;
+}
+
+TF_Operation* TFE_AddEagerOpToGraph(TFE_Op* op, TFE_TraceContext* trace_ctx,
+                                    TFE_TensorHandle** retvals,
+                                    int* num_retvals, TF_Status* status) {
+  VLOG(1) << "Calling TFE_AddEagerOpToGraph() with op " << op << ": "
+          << op->operation.DebugString();
+
+  const auto& op_type = op->operation.Name();
+  auto op_name =
+      tensorflow::strings::StrCat(op_type, "_", trace_ctx->node_counter++);
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, op_type.c_str(), op_name.c_str());
+
+  VLOG(1) << "Adding attrs.";
+  tensorflow::AttrValueMap attrs;
+  op->operation.Attrs().FillAttrValueMap(&attrs);
+  for (const auto& attr : attrs) {
+    desc->node_builder.Attr(attr.first, attr.second);
+  }
+
+  VLOG(1) << "Adding inputs.";
+  const auto& inputs = op->operation.Inputs();
+  size_t inputIndex = 0;
+  const tensorflow::OpDef& op_def = desc->node_builder.op_def();
+  for (const tensorflow::OpDef::ArgDef& input_arg : op_def.input_arg()) {
+    // TODO(bgogul): Add support for number attributes.
+    DCHECK(input_arg.number_attr().empty())
+        << "Number attributes is not implemented yet.";
+    if (input_arg.type_list_attr().empty()) {
+      auto symbolic_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+      TF_AddInput(desc, symbolic_input);
+      continue;
+    }
+    const std::string& type_list_attr = input_arg.type_list_attr();
+    const auto& attr_value = attrs[type_list_attr];
+    DCHECK(attr_value.value_case() == tensorflow::AttrValue::kList)
+        << "Type list attribute should be a list!";
+    std::vector<TF_Output> list_inputs(attr_value.list().type_size());
+    for (TF_Output& list_input : list_inputs) {
+      list_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+    }
+    TF_AddInputList(desc, list_inputs.data(), list_inputs.size());
+  }
+
+  auto* graph_op = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) return nullptr;
+
+  VLOG(1) << "Op finalized; setting return tensors.";
+  *num_retvals = TF_OperationNumOutputs(graph_op);
+  VLOG(1) << "This op has " << *num_retvals << " outputs.";
+  for (int i = 0; i < *num_retvals; ++i) {
+    auto output = TF_Output{graph_op, i};
+    auto dtype = TF_OperationOutputType(output);
+    retvals[i] = TFE_NewTensorHandleFromTFOutput(output, dtype);
+  }
+  return graph_op;
+}
+
+int TFE_FinalizeInputTensorsFromTraceContext(TFE_TraceContext* trace_ctx) {
+  if (trace_ctx->input_tensors == nullptr) {
+    trace_ctx->input_tensors =
+        new std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>();
+    trace_ctx->input_tensors->reserve(trace_ctx->input_tensor_map.size());
+
+    for (auto input : trace_ctx->input_tensor_map) {
+      trace_ctx->input_tensors->emplace_back(input.first, input.second);
+    }
+  }
+  return trace_ctx->input_tensor_map.size();
+}
+
+TF_Output TFE_GetInputGraphNodeFromTraceContext(TFE_TraceContext* trace_ctx,
+                                                unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  return trace_ctx->input_tensors->at(idx).second;
+}
+
+TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  auto* handle = trace_ctx->input_tensors->at(idx).first;
+  VLOG(1) << "Ref count for internal handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  handle->Ref();
+  auto* ret = new TFE_TensorHandle(handle);
+  VLOG(1) << "Returning a new tensor handle " << ret << ": "
+          << handle->DebugString();
+  return ret;
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index e6d04d0c2b25a3f7b1ebf50c58268f003595a520..8d1a8b82fbaf9901b6d9aecf6d092ae298c8dba3 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -84,6 +84,15 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_CreateRunOptions(
 TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
                                                       size_t* len);
 
+// Returns the function content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+//
+// Do not return const char*, because some foreign language binding
+// (e.g. swift) cannot then call free() on the returned pointer.
+TF_CAPI_EXPORT extern char* TF_FunctionDebugString(TF_Function* func,
+                                                   size_t* len);
+
 // Creates a stack of data set + iterator nodes, currently hard-coded to return
 // a sequence of 3 float values <42.0, 43.0, 44.0> over 3 calls. On success,
 // returns the IteratorGetNext node, which caller can run or feed into an node.
@@ -181,6 +190,8 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+TF_CAPI_EXPORT extern void TFE_OpPrintDebugString(TFE_Op* op);
+
 typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
 
 // Allows invoking a kernel asynchronously, and explicitly returns a
@@ -255,6 +266,54 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
                                                    const void* proto,
                                                    size_t proto_len,
                                                    TF_Status* status);
+
+// Create a symbolic tensor from the input graph node.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(
+    TF_Output t, TF_DataType data_type);
+
+// Returns 0 if the input tensor handle represents a symbolic tensor (i.e., a
+// graph node). Otherwise returns non-0.
+TF_CAPI_EXPORT extern unsigned char TFE_TensorHandleIsConcrete(
+    TFE_TensorHandle* handle);
+
+// If `handle` is a symbolic tensor, return the corresponding graph node
+// represented by TF_Output. Otherwise, return an error status.
+TF_CAPI_EXPORT extern TF_Output TFE_GetTFOutputFromTensorHandle(
+    TFE_TensorHandle* handle, TF_Status* status);
+
+typedef struct TFE_TraceContext TFE_TraceContext;
+
+// A trace context contains a trace graph, to which TFE_AddEagerOpToGraph()
+// calls add graph nodes as a way to symbolically execute the eager ops.
+//
+// It also contains a hash map from concrete input tensors to symbolic
+// tensors. That map will be used to create input tensors to the trace graph.
+TF_CAPI_EXPORT extern TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph);
+
+TF_CAPI_EXPORT extern void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx);
+
+// Symbolically executes `op`, by adding a corresponding node to the graph
+// associated with `trace_ctx`. This graph node outputs a set of symbolic
+// tensors in `retvals` and `num_retvals`. Returns the corresponding graph
+// operation on success, otherwise returns nullptr.
+TF_CAPI_EXPORT extern TF_Operation* TFE_AddEagerOpToGraph(
+    TFE_Op* op, TFE_TraceContext* trace_ctx, TFE_TensorHandle** retvals,
+    int* num_retvals, TF_Status* status);
+
+// Finalizes the trace graph and its inputs, and returns the number of inputs.
+// After this call, the next two APIs can be called to iterate over the input
+// tensors.
+TF_CAPI_EXPORT extern int TFE_FinalizeInputTensorsFromTraceContext(
+    TFE_TraceContext* trace_ctx);
+
+TF_CAPI_EXPORT extern TF_Output TFE_GetInputGraphNodeFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx);
+
+// Each input tensor should be consumed at most once.
+TF_CAPI_EXPORT extern TFE_TensorHandle*
+TFE_ConsumeInputConcreteTensorFromTraceContext(TFE_TraceContext* trace_ctx,
+                                               unsigned int idx);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index daa7701b7fe7e8ce757b6504329cf6434ad39778..2c92e38f03a9d01d285f475b1a8996c44475c5c2 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/c/eager/c_api.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
@@ -296,5 +297,178 @@ TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, SymbolicTensor) {
+  TF_Status* status = TF_NewStatus();
+  auto node = TF_Output{nullptr, 1};
+  auto* sym_handle = TFE_NewTensorHandleFromTFOutput(node, TF_FLOAT);
+  TFE_TensorHandlePrintDebugString(sym_handle);
+  CHECK_EQ(TFE_TensorHandleDataType(sym_handle), TF_FLOAT);
+  ASSERT_FALSE(TFE_TensorHandleIsConcrete(sym_handle));
+  auto same_node = TFE_GetTFOutputFromTensorHandle(sym_handle, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(same_node.oper, node.oper);
+  ASSERT_EQ(same_node.index, node.index);
+  TFE_DeleteTensorHandle(sym_handle);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  ASSERT_TRUE(TFE_TensorHandleIsConcrete(m));
+  (void)TFE_GetTFOutputFromTensorHandle(m, status);
+  CHECK_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(m);
+
+  TF_DeleteStatus(status);
+}
+
+class AddEagerOpToGraphTest : public ::testing::Test {
+ protected:
+  AddEagerOpToGraphTest()
+      : status_(TF_NewStatus()),
+        eager_ctx_(nullptr),
+        graph_(TF_NewGraph()),
+        trace_ctx_(TFE_NewTraceContext(graph_)) {
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    eager_ctx_ = TFE_NewContext(opts, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    TFE_DeleteContextOptions(opts);
+  }
+
+  ~AddEagerOpToGraphTest() override {
+    TFE_DeleteTraceContext(trace_ctx_);
+    TF_DeleteGraph(graph_);
+    TFE_DeleteContext(eager_ctx_);
+    TF_DeleteStatus(status_);
+  }
+
+  template <typename Callable>
+  void AddEagerOpToGraphAndCheck(TFE_Op* op, Callable checker) {
+    TFE_TensorHandle* retvals[5];
+    int num_retvals = 5;
+    // Symbolically execute this op, which adds a graph node to `trace_ctx_`.
+    TF_Operation* graph_op =
+        TFE_AddEagerOpToGraph(op, trace_ctx_, retvals, &num_retvals, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_NOTNULL(graph_op);
+    // Check the expectations.
+    checker(graph_op);
+    for (int i = 0; i < num_retvals; ++i) {
+      TFE_DeleteTensorHandle(retvals[i]);
+    }
+  }
+
+  TF_Status* status_;
+  TFE_Context* eager_ctx_;
+  TF_Graph* graph_;
+  TFE_TraceContext* trace_ctx_;
+};
+
+TEST_F(AddEagerOpToGraphTest, DebugPrintAndSymbolicExecution) {
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* op = MatMulOp(eager_ctx_, m, m);
+
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  TFE_OpPrintDebugString(op);
+
+  TFE_TensorHandle* retvals[5];
+  int num_retvals = 5;
+  // Symbolically execute this op, which adds a graph node to `trace_ctx`.
+  TFE_AddEagerOpToGraph(op, trace_ctx_, retvals, &num_retvals, status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+
+  int num_inputs = TFE_FinalizeInputTensorsFromTraceContext(trace_ctx_);
+  CHECK_EQ(num_inputs, 1);
+  auto input_sym_tensor = TFE_GetInputGraphNodeFromTraceContext(trace_ctx_,
+                                                                /*idx*/ 0);
+
+  LOG(INFO) << tensorflow::getTF_OutputDebugString(input_sym_tensor);
+  auto handle = TFE_ConsumeInputConcreteTensorFromTraceContext(trace_ctx_,
+                                                               /*idx*/ 0);
+  TFE_TensorHandlePrintDebugString(handle);
+  TFE_DeleteTensorHandle(handle);
+
+  CHECK_EQ(num_retvals, 1);
+  CHECK_EQ(TFE_TensorHandleDataType(retvals[0]), TF_FLOAT);
+
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteOp(op);
+}
+
+TEST_F(AddEagerOpToGraphTest, ValueAttributesArePreserved) {
+  // Create MinOp
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* op = MinOp(eager_ctx_, axis, axis);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+
+  // Check the attributes set by the call to MinOp above.
+  AddEagerOpToGraphAndCheck(op, [this, &axis](TF_Operation* graph_op) {
+    unsigned char value;
+    TF_OperationGetAttrBool(graph_op, "keep_dims", &value, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(value, 1);
+    TF_DataType dtype;
+    TF_OperationGetAttrType(graph_op, "Tidx", &dtype, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(dtype, TF_INT32);
+    TF_OperationGetAttrType(graph_op, "T", &dtype, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(dtype, TFE_TensorHandleDataType(axis));
+  });
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteOp(op);
+}
+
+TEST_F(AddEagerOpToGraphTest, ListAttributesArePreserved) {
+  // Create a "Squeeze" operator with list attributes.
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* squeeze = TFE_NewOp(eager_ctx_, "Squeeze", status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  TFE_OpAddInput(squeeze, axis, status_);
+  TFE_OpSetAttrType(squeeze, "T", TF_INT32);
+  std::vector<int64_t> boundaries = {1, 2, 3, 4};
+  TFE_OpSetAttrIntList(squeeze, "squeeze_dims", boundaries.data(),
+                       boundaries.size());
+  // Check attributes are preserved.
+  AddEagerOpToGraphAndCheck(
+      squeeze, [this, &boundaries](TF_Operation* squeeze_graph_op) {
+        TF_DataType dtype;
+        TF_OperationGetAttrType(squeeze_graph_op, "T", &dtype, status_);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        CHECK_EQ(dtype, TF_INT32);
+        std::unique_ptr<int64_t[]> list(new int64_t[boundaries.size()]);
+        TF_OperationGetAttrIntList(squeeze_graph_op, "squeeze_dims", list.get(),
+                                   boundaries.size(), status_);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        EXPECT_TRUE(std::equal(list.get(), list.get() + boundaries.size(),
+                               boundaries.begin()));
+      });
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteOp(squeeze);
+}
+
+TEST_F(AddEagerOpToGraphTest, ListInputsAreAddedCorrectly) {
+  TFE_TensorHandle* scalar = TestScalarTensorHandle(static_cast<float>(1));
+  TFE_Op* identityn = TFE_NewOp(eager_ctx_, "IdentityN", status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  constexpr size_t kNumInputs = 3;
+  for (size_t i = 0; i < kNumInputs; ++i) {
+    TFE_OpAddInput(identityn, scalar, status_);
+  }
+  TF_DataType types[kNumInputs] = {TF_FLOAT, TF_FLOAT, TF_FLOAT};
+  TFE_OpSetAttrTypeList(identityn, "T", types, kNumInputs);
+  AddEagerOpToGraphAndCheck(
+      identityn, [this, kNumInputs](TF_Operation* graph_op) {
+        EXPECT_EQ(TF_OperationNumInputs(graph_op), kNumInputs);
+        EXPECT_EQ(TF_OperationInputListLength(graph_op, "input", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        EXPECT_EQ(TF_OperationOutputListLength(graph_op, "output", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+      });
+  TFE_DeleteTensorHandle(scalar);
+  TFE_DeleteOp(identityn);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 28b9f8df9c873ee394eb6a241dd9ac06ba6c8796..68b530b718bd50adc21cbd50d0d4b92d075fd013 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -162,6 +162,11 @@ Status FillFunctionBody(
     const std::vector<const Node*>& body_nodes,
     const std::unordered_map<string, string>& tensor_renaming,
     FunctionDef* fdef) {
+  std::unordered_set<string> func_attr_names;
+  for (const auto& func_attr : fdef->signature().attr()) {
+    func_attr_names.insert(func_attr.name());
+  }
+
   std::vector<const Edge*> in_edges;
   std::vector<const Edge*> control_edges;
   for (const Node* node : body_nodes) {
@@ -243,6 +248,48 @@ Status FillFunctionBody(
     if (node->op_def().is_stateful()) {
       fdef->mutable_signature()->set_is_stateful(true);
     }
+
+    // If this node has any attributes with placeholder value, add the
+    // attribute to FunctionDef signature.
+    for (const auto& iter : node->attrs()) {
+      if (iter.second.placeholder().empty()) {
+        continue;
+      }
+
+      // If we already added the attribute, skip it.
+      string func_attr_name = iter.second.placeholder();
+      if (func_attr_names.find(func_attr_name) != func_attr_names.end()) {
+        continue;
+      }
+
+      // This node's attribute is a placeholder value, so it does not have type
+      // information. We check node's OpDef for attribute type.
+      string node_attr_name = iter.first;
+      const OpDef::AttrDef* node_attr_def = nullptr;
+      for (const auto& node_attr : node->op_def().attr()) {
+        if (node_attr.name() == node_attr_name) {
+          node_attr_def = &node_attr;
+        }
+      }
+      if (!node_attr_def) {
+#ifdef TENSORFLOW_LITE_PROTOS
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name);
+#else
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name, ", OpDef: ", node->op_def().DebugString());
+#endif
+      }
+      OpDef::AttrDef* attr_def = fdef->mutable_signature()->add_attr();
+      attr_def->set_name(func_attr_name);
+      attr_def->set_type(node_attr_def->type());
+
+      func_attr_names.insert(func_attr_name);
+    }
   }
   return Status::OK();
 }
@@ -255,6 +302,8 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           const std::vector<OutputTensor>& inputs,
                           const std::vector<OutputTensor>& outputs,
                           const std::vector<string>& output_names,
+                          const std::vector<const Node*>& control_outputs,
+                          const std::vector<string>& control_output_names,
                           const char* description, FunctionDef* fdef) {
   if (!output_names.empty()) {
     DCHECK_EQ(output_names.size(), outputs.size());
@@ -378,6 +427,29 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     fdef->mutable_signature()->set_name(fn_name);
   }
 
+  if (!control_output_names.empty() &&
+      (control_outputs.size() != control_output_names.size())) {
+    return InvalidArgument(
+        "Expected number of control outputs (", control_outputs.size(),
+        ") and the number of control output names (",
+        control_output_names.size(), ") to match but they do not.");
+  }
+  std::unordered_set<string> control_output_names_set;
+  for (int i = 0; i < control_outputs.size(); ++i) {
+    string signature_name;
+    if (!control_output_names.empty()) {
+      signature_name = control_output_names[i];
+    } else {
+      signature_name = control_outputs[i]->name();
+    }
+    if (!control_output_names_set.insert(signature_name).second) {
+      return errors::InvalidArgument("Repeated control output name: ",
+                                     signature_name);
+    }
+    fdef->mutable_signature()->add_control_output(signature_name);
+    (*fdef->mutable_control_ret())[signature_name] = control_outputs[i]->name();
+  }
+
   return Status::OK();
 }
 
@@ -485,14 +557,14 @@ Status ComputeBodyNodes(
 using tensorflow::Node;
 using tensorflow::string;
 
-TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
-                                unsigned char append_hash_to_fn_name,
-                                int num_opers, const TF_Operation* const* opers,
-                                int ninputs, const TF_Output* inputs,
-                                int noutputs, const TF_Output* outputs,
-                                const char* const* output_names,
-                                const TF_FunctionOptions* opts,
-                                const char* description, TF_Status* status) {
+TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status) {
   tensorflow::mutex_lock l(*const_cast<tensorflow::mutex*>(&fn_body->mu));
 
   // Process inputs.
@@ -500,13 +572,13 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
   std::unordered_map<const Node*, std::vector<int>> input_nodes;
   status->status = tensorflow::ProcessInputs(fn_body, fn_name, ninputs, inputs,
                                              &input_tensors, &input_nodes);
-  if (!status->status.ok()) return nullptr;
+  if (TF_GetCode(status) != TF_OK) return nullptr;
 
   // Process outputs.
   std::vector<tensorflow::OutputTensor> output_tensors;
   status->status = tensorflow::ProcessOutputs(fn_body, fn_name, noutputs,
                                               outputs, &output_tensors);
-  if (!status->status.ok()) return nullptr;
+  if (TF_GetCode(status) != TF_OK) return nullptr;
 
   // Process output names.
   std::vector<string> output_names_vec;
@@ -517,26 +589,55 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
     }
   }
 
+  // Process control output names.
+  std::vector<string> control_output_names_vec;
+  if (control_output_names) {
+    control_output_names_vec.reserve(ncontrol_outputs);
+    for (int i = 0; i < ncontrol_outputs; ++i) {
+      control_output_names_vec.push_back(string(output_names[i]));
+    }
+  }
+
   // Compute body nodes.
   std::vector<const Node*> body_nodes;
   status->status = tensorflow::ComputeBodyNodes(
       fn_body, fn_name, num_opers, opers, input_nodes, &body_nodes);
-  if (!status->status.ok()) return nullptr;
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+
+  // Compute body nodes.
+  std::vector<const Node*> control_output_nodes;
+  for (int i = 0; i < ncontrol_outputs; ++i) {
+    control_output_nodes.push_back(&control_outputs[i]->node);
+  }
 
   // Do the actual function creation.
   TF_Function* tf_function = new TF_Function();
   DCHECK(append_hash_to_fn_name <= 1);
   status->status = tensorflow::GraphToFunctionDef(
       fn_body->graph, fn_name, append_hash_to_fn_name != 0, body_nodes,
-      input_tensors, output_tensors, output_names_vec, description,
-      &tf_function->fdef);
-  if (!status->status.ok()) {
+      input_tensors, output_tensors, output_names_vec, control_output_nodes,
+      control_output_names_vec, description, &tf_function->fdef);
+  if (TF_GetCode(status) != TF_OK) {
     TF_DeleteFunction(tf_function);
     return nullptr;
   }
   return tf_function;
 }
 
+TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
+                                unsigned char append_hash_to_fn_name,
+                                int num_opers, const TF_Operation* const* opers,
+                                int ninputs, const TF_Output* inputs,
+                                int noutputs, const TF_Output* outputs,
+                                const char* const* output_names,
+                                const TF_FunctionOptions* opts,
+                                const char* description, TF_Status* status) {
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, num_opers, opers, ninputs,
+      inputs, noutputs, outputs, output_names, 0, nullptr, nullptr, opts,
+      description, status);
+}
+
 const char* TF_FunctionName(TF_Function* func) {
   return func->fdef.signature().name().c_str();
 }
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index 73fe73769bc1219ce865149d67d333c53371ccc5..946f8c4a2c3fb25f908d809e00bf579b40a8668b 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -1230,6 +1231,53 @@ void DefineFunction(const char* name, TF_Function** func,
   ASSERT_NE(*func, nullptr);
 }
 
+REGISTER_OP("CustomOp")
+    .Output("output: float32")
+    .Attr("index: int")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
+
+void NodeWithPlaceholderAttrHelper(TF_Graph* graph, TF_Status* s,
+                                   const char* name, const char* placeholder,
+                                   TF_Operation** op) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "CustomOp", name);
+  TF_SetAttrPlaceholder(desc, "index", placeholder);
+  *op = TF_FinishOperation(desc, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_NE(*op, nullptr);
+}
+
+TEST_F(CApiFunctionTest, GraphToFunctionDefWithPlaceholderAttr) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Operation *node1, *node2, *node3;
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node1", "v1",
+                                &node1);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node2", "v1",
+                                &node2);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node3", "v2",
+                                &node3);
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{node1, 0}, {node2, 0}, {node3, 0}};
+  func_ = TF_GraphToFunction(
+      func_graph.get(), "func", /*append_hash_to_fn_name=*/false, -1,
+      /*opers=*/nullptr, 0, inputs, 3, outputs,
+      /*output_names=*/nullptr,
+      /*opts=*/nullptr, /*description=*/nullptr, s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(func_, nullptr);
+
+  // Verify that FunctionDef has 2 attributes, "v1" and "v2".
+  ASSERT_EQ(func_->fdef.signature().attr().size(), 2);
+  EXPECT_EQ(func_->fdef.signature().attr(0).name(), "v1");
+  EXPECT_EQ(func_->fdef.signature().attr(0).type(), "int");
+  EXPECT_EQ(func_->fdef.signature().attr(1).name(), "v2");
+  EXPECT_EQ(func_->fdef.signature().attr(1).type(), "int");
+}
+
 TEST_F(CApiFunctionTest, SetGradientAndRun) {
   // Define the function and its grad
   DefineFunction(func_name_, &func_);
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 5ba26d3c585350aa510f9970cbfc246a9a108543..9a69c58718b3514287256124629f59443f38fd39 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -24,10 +24,12 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#ifndef __ANDROID__
-#include "tensorflow/core/distributed_runtime/server_lib.h"
+// Required for IS_MOBILE_PLATFORM
+#include "tensorflow/core/platform/platform.h"  // NO_LINT
+
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/core/framework/op_gen_lib.h"
-#endif
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -42,6 +44,7 @@ limitations under the License.
 namespace tensorflow {
 class Device;
 class DeviceMgr;
+class ServerInterface;
 }  // namespace tensorflow
 
 // Internal structures used by the C API. These are likely to change and should
@@ -167,27 +170,27 @@ struct TF_Function {
 struct TF_ApiDefMap {
   explicit TF_ApiDefMap(const tensorflow::OpList& op_list)
       :
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
         api_def_map(op_list),
-#endif
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
         update_docs_called(false) {
   }
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
   tensorflow::ApiDefMap api_def_map GUARDED_BY(lock);
-#endif
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
   bool update_docs_called GUARDED_BY(lock);
   tensorflow::mutex lock;
 };
 
-#ifndef __ANDROID__
+#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 struct TF_Server {
   TF_Server(std::unique_ptr<tensorflow::ServerInterface> server);
 
   const tensorflow::string target;
   std::unique_ptr<tensorflow::ServerInterface> server;
 };
-#endif
+#endif  // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD)
 
 namespace tensorflow {
 
@@ -204,7 +207,8 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 
 TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
+                       TF_Buffer* out);
 
 // Set the shapes and types of the output's handle.
 //
@@ -228,6 +232,8 @@ void RecordMutation(TF_Graph* graph, const TF_Operation& op,
 bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
     LOCKS_EXCLUDED(session->graph->mu, session->mu);
 
+std::string getTF_OutputDebugString(TF_Output node);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index d5934a10395ae094f65d3bc8b6cd7b94dbd32410..2be03bf0de6277fc63c353ad6dc63bec096a6993 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -163,6 +163,7 @@ TEST(CAPI, AllocateTensor) {
   EXPECT_EQ(dims[0], TF_Dim(t, 0));
   EXPECT_EQ(dims[1], TF_Dim(t, 1));
   EXPECT_EQ(num_bytes, TF_TensorByteSize(t));
+  EXPECT_EQ(6, TF_TensorElementCount(t));
   TF_DeleteTensor(t);
 }
 
@@ -1467,6 +1468,41 @@ TEST(CAPI, DeletingNullPointerIsSafe) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, TestBitcastFrom_Reshape) {
+  int64_t dims[] = {2, 3};
+  TF_Tensor* a =
+      TF_AllocateTensor(TF_UINT64, dims, 2, 6 * TF_DataTypeSize(TF_UINT64));
+  TF_Tensor* b =
+      TF_AllocateTensor(TF_UINT64, nullptr, 0, TF_DataTypeSize(TF_UINT64));
+  EXPECT_NE(a, nullptr);
+  EXPECT_NE(b, nullptr);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(1, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  int64_t new_dims[] = {3, 2};
+  TF_Status* status = TF_NewStatus();
+  TF_TensorBitcastFrom(a, TF_UINT64, b, new_dims, 2, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status));
+  TF_DeleteStatus(status);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(6, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  // Check that a write to one tensor shows up in the other.
+  *(static_cast<int64_t*>(TF_TensorData(a))) = 4;
+  EXPECT_EQ(4, *(static_cast<int64_t*>(TF_TensorData(b))));
+  *(static_cast<int64_t*>(TF_TensorData(b))) = 6;
+  EXPECT_EQ(6, *(static_cast<int64_t*>(TF_TensorData(a))));
+
+  TF_DeleteTensor(a);
+  TF_DeleteTensor(b);
+}
+
 REGISTER_OP("TestOpWithNoGradient")
     .Input("x: T")
     .Output("y: T")
diff --git a/tensorflow/c/c_test.c b/tensorflow/c/c_test.c
index b86d8eb8e300e02a3871ecd5f424a82c521b18fc..7468122cd567270c8454f886e478be34c2c15cbf 100644
--- a/tensorflow/c/c_test.c
+++ b/tensorflow/c/c_test.c
@@ -25,6 +25,16 @@ limitations under the License.
 #include "tensorflow/c/env.h"
 #include "tensorflow/c/kernels.h"
 
+// A create function. This will never actually get called in this test, it's
+// just nice to know that it compiles.
+void* create(TF_OpKernelConstruction* ctx) {
+  TF_DataType type;
+  TF_Status* s = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "foobar", &type, s);
+  TF_DeleteStatus(s);
+  return NULL;
+}
+
 // A compute function. This will never actually get called in this test, it's
 // just nice to know that it compiles.
 void compute(void* kernel, TF_OpKernelContext* ctx) {
@@ -32,12 +42,7 @@ void compute(void* kernel, TF_OpKernelContext* ctx) {
   TF_Status* s = TF_NewStatus();
   TF_GetInput(ctx, 0, &input, s);
   TF_DeleteTensor(input);
-
-  TF_DataType type;
-  TF_OpKernelContext_GetAttrType(ctx, "foobar", &type, s);
-
   TF_DeleteStatus(s);
-
 }
 
 // Exercises tensorflow's C API.
@@ -80,7 +85,7 @@ int main(int argc, char** argv) {
   TF_StringStreamDone(s);
 
   TF_KernelBuilder* b =
-      TF_NewKernelBuilder("SomeOp", "SomeDevice", NULL, &compute, NULL);
+      TF_NewKernelBuilder("SomeOp", "SomeDevice", &create, &compute, NULL);
   TF_RegisterKernelBuilder("someKernel", b, status);
 
   TF_DeleteStatus(status);
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 51665fb9db948e165129e5cbdf0dc3fb28f90f91..445b2cd25812e1d73fdd85b61f2a234150b880a6 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -70,7 +70,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
-        "//tensorflow/core/profiler/lib:eager_profiler",
+        "//tensorflow/core/profiler/lib:profiler_session",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -110,7 +110,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
-        "//tensorflow/core/profiler/lib:eager_profiler",
+        "//tensorflow/core/profiler/lib:profiler_session",
     ],
 )
 
@@ -147,6 +147,7 @@ tf_cuda_cc_test(
     ],
     deps = [
         ":c_api",
+        ":c_api_internal",
         ":c_api_test_util",
         "//tensorflow/c:c_test_util",
         "//tensorflow/core:lib",
@@ -210,6 +211,8 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/profiler/rpc:profiler_server",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
         "//tensorflow/core:gpu_runtime",
     ],
 )
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index f0408677d403affaee66ebe4c7592f6bd6c74d09..1002f474fed20759c67eb765e8d4c81a0f529ecc 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -226,14 +226,84 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
 
   auto* device_mgr = grpc_server->worker_env()->device_mgr;
 
-  ctx->context.InitializeRemote(std::move(server),
-                                std::move(remote_eager_workers),
-                                std::move(remote_device_mgr), remote_contexts,
-                                r, device_mgr, keep_alive_secs);
+  return ctx->context.InitializeRemote(
+      std::move(server), std::move(remote_eager_workers),
+      std::move(remote_device_mgr), remote_contexts, r, device_mgr,
+      keep_alive_secs);
+#undef LOG_AND_RETURN_IF_ERROR
+}
 
+tensorflow::Status OpInferSingleInputAttrs(TFE_Op* op,
+                                           TFE_TensorHandle* input) {
+  TFE_OpInferenceContext* ictx = op->inference_ctx.get();
+  const auto& input_def = ictx->op_def->input_arg(ictx->input_arg_idx++);
+  if (!input_def.number_attr().empty() || !input_def.type_list_attr().empty()) {
+    // Some clients that are still setting their input attributes manually are
+    // adding input list to their op by calling `TFE_OpAddInput` for each of
+    // its elements instead of calling `TFE_OpAddInputList`. When this happens,
+    // we cannot detect the end of such list, thus lose track of the input
+    // arguments in the op definition. To guarantee backward compatibility with
+    // those clients, disable automatic inference in this case.
+    op->inference_ctx.reset(nullptr);
+    return tensorflow::Status::OK();
+  }
+  const std::string& type_attr = input_def.type_attr();
+  if (!type_attr.empty() && ictx->attrs.find(type_attr) == ictx->attrs.end()) {
+    op->operation.MutableAttrs()->Set(type_attr, input->handle->dtype);
+    ictx->attrs.insert(type_attr);
+  }
+  return tensorflow::Status::OK();
+}
+
+void OpInferSingleTypeInputListAttrs(TFE_Op* op,
+                                     const tensorflow::OpDef::ArgDef& input_def,
+                                     TFE_TensorHandle** inputs,
+                                     int num_inputs) {
+  TFE_OpInferenceContext* ictx = op->inference_ctx.get();
+  if (ictx->attrs.find(input_def.number_attr()) == ictx->attrs.end()) {
+    op->operation.MutableAttrs()->Set(input_def.number_attr(), num_inputs);
+    ictx->attrs.insert(input_def.number_attr());
+  }
+  if (ictx->attrs.find(input_def.type_attr()) == ictx->attrs.end()) {
+    op->operation.MutableAttrs()->Set(input_def.type_attr(),
+                                      inputs[0]->handle->dtype);
+    ictx->attrs.insert(input_def.type_attr());
+  }
+}
+
+void OpInferMixedTypeInputListAttrs(TFE_Op* op,
+                                    const tensorflow::OpDef::ArgDef& input_def,
+                                    TFE_TensorHandle** inputs, int num_inputs) {
+  TFE_OpInferenceContext* ictx = op->inference_ctx.get();
+  if (ictx->attrs.find(input_def.type_list_attr()) == ictx->attrs.end()) {
+    std::unique_ptr<tensorflow::DataType[]> dtypes(
+        new tensorflow::DataType[num_inputs]);
+    for (int i = 0; i < num_inputs; ++i) {
+      dtypes[i] = inputs[i]->handle->dtype;
+    }
+    op->operation.MutableAttrs()->Set(
+        input_def.type_list_attr(),
+        tensorflow::gtl::ArraySlice<const tensorflow::DataType>(dtypes.get(),
+                                                                num_inputs));
+    ictx->attrs.insert(input_def.type_list_attr());
+  }
+}
+
+tensorflow::Status OpInferInputListAttrs(TFE_Op* op, TFE_TensorHandle** inputs,
+                                         int num_inputs) {
+  TFE_OpInferenceContext* ictx = op->inference_ctx.get();
+  const auto& input_def = ictx->op_def->input_arg(ictx->input_arg_idx++);
+  if (!input_def.type_list_attr().empty()) {
+    OpInferMixedTypeInputListAttrs(op, input_def, inputs, num_inputs);
+  } else if (!input_def.type_attr().empty() &&
+             !input_def.number_attr().empty()) {
+    OpInferSingleTypeInputListAttrs(op, input_def, inputs, num_inputs);
+  } else {
+    return tensorflow::errors::InvalidArgument("Invalid input list definition");
+  }
   return tensorflow::Status::OK();
-#undef LOG_AND_RETURN_IF_ERROR
 }
+
 }  // namespace
 
 extern "C" {
@@ -249,6 +319,7 @@ void TFE_ContextOptionsSetAsync(TFE_ContextOptions* options,
                                 unsigned char enable) {
   options->async = enable;
 }
+
 void TFE_ContextOptionsSetDevicePlacementPolicy(
     TFE_ContextOptions* options, TFE_ContextDevicePlacementPolicy policy) {
   options->policy = policy;
@@ -302,7 +373,9 @@ TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   return list;
 }
 
-void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context.ClearCaches(); }
+void TFE_ContextClearCaches(TFE_Context* ctx, TF_Status* status) {
+  status->status = ctx->context.ClearCaches();
+}
 
 // Set server_def on the context, possibly updating it.
 TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
@@ -356,6 +429,8 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
   if (h == nullptr) return;
+  VLOG(1) << "Deleting tensor handle " << h << " with internal handle "
+          << h->handle;
   if (h->handle) {
     h->handle->Unref();
   }
@@ -490,20 +565,29 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
   const tensorflow::AttrTypeMap* types;
   bool is_function = false;
   status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function);
-  if (status->status.ok()) {
-    if (is_function && !ctx->context.FindFunctionByName(name)) {
-      status->status = tensorflow::errors::NotFound(
-          "'", name,
-          "' is neither a type of a primitive operation nor a name "
-          "of a function registered in binary running on ",
-          tensorflow::port::Hostname(),
-          ". Make sure the operation or function is "
-          "registered in the binary running in this process.");
+  if (!status->status.ok()) {
+    return nullptr;
+  }
+  if (!is_function) {
+    const tensorflow::OpDef* op_def;
+    status->status = tensorflow::OpDefForOp(op_or_function_name, &op_def);
+    if (!status->status.ok()) {
       return nullptr;
     }
-    return new TFE_Op(ctx, name, is_function, types);
+    return new TFE_Op(ctx, name, false, types,
+                      new TFE_OpInferenceContext(op_def));
   }
-  return nullptr;
+  if (!ctx->context.FindFunctionByName(name)) {
+    status->status = tensorflow::errors::NotFound(
+        "'", name,
+        "' is neither a type of a primitive operation nor a name "
+        "of a function registered in binary running on ",
+        tensorflow::port::Hostname(),
+        ". Make sure the operation or function is "
+        "registered in the binary running in this process.");
+    return nullptr;
+  }
+  return new TFE_Op(ctx, name, true, types, nullptr);
 }
 
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
@@ -527,8 +611,21 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 #endif  // TENSORFLOW_EAGER_USE_XLA
 }
 
-void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  op->operation.AddInput(h->handle);
+void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) {
+  op->operation.AddInput(input->handle);
+  if (op->inference_ctx) {
+    status->status = OpInferSingleInputAttrs(op, input);
+  }
+}
+
+void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs,
+                        TF_Status* status) {
+  for (int i = 0; i < num_inputs; ++i) {
+    op->operation.AddInput(inputs[i]->handle);
+  }
+  if (op->inference_ctx) {
+    status->status = OpInferInputListAttrs(op, inputs, num_inputs);
+  }
 }
 
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
@@ -712,6 +809,7 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
 
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
+  VLOG(1) << "Calling TFE_Execute() on op " << op;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
       *num_retvals);
   status->status =
@@ -754,12 +852,18 @@ void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
   status->status = ctx->context.AddFunctionDef(function->fdef);
 }
 
+unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) {
+  return ctx->context.FindFunctionDef(name) != nullptr;
+}
+
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(true);
+  ctx->context.SetShouldStoreGraphs(true);
+  ctx->context.SetShouldStoreStepStats(true);
 }
 
 void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(false);
+  ctx->context.SetShouldStoreGraphs(false);
+  ctx->context.SetShouldStoreStepStats(false);
 }
 
 }  // extern "C"
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 120748ab763a3358b6e38e64bb3b6fd2ea32f7c3..442593e8475be8ecac427fc63ce131d648437305 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -98,7 +98,8 @@ TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
 
 // Clears the internal caches in the TFE context. Useful when reseeding random
 // ops.
-TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx);
+TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx,
+                                                  TF_Status* status);
 
 // Sets a thread-local device placement policy. After this call, other calls to
 // TFE_Execute in the same thread will use the device policy specified here
@@ -282,9 +283,14 @@ TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
 TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op,
                                                    unsigned char enable);
 
-TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h,
+TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input,
                                           TF_Status* status);
 
+TF_CAPI_EXPORT extern void TFE_OpAddInputList(TFE_Op* op,
+                                              TFE_TensorHandle** inputs,
+                                              int num_inputs,
+                                              TF_Status* status);
+
 TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op,
                                                     const char* attr_name,
                                                     unsigned char* is_list,
@@ -393,6 +399,10 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
                                                   TF_Function* function,
                                                   TF_Status* status);
 
+// Checks whether a function is registered under `name`.
+TF_CAPI_EXPORT unsigned char TFE_ContextHasFunction(TFE_Context* ctx,
+                                                    const char* name);
+
 // Enables tracing of RunMetadata on the ops executed from this context.
 TF_CAPI_EXPORT extern void TFE_ContextEnableRunMetadata(TFE_Context* ctx);
 
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index ffcd5ace0b98597363abe63201bf6c328a03212f..b4192716c4fee66da5133fc592e39e26a66a98e8 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -32,13 +32,13 @@ std::vector<int64> TensorShapeAsVector(TFE_TensorHandle* handle,
                                        TF_Status* status) {
   std::vector<int64> shape;
   int rank = TFE_TensorHandleNumDims(handle, status);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     return shape;
   }
   shape.reserve(rank);
   for (int i = 0; i < rank; ++i) {
     shape.push_back(TFE_TensorHandleDim(handle, i, status));
-    if (!status->status.ok()) {
+    if (TF_GetCode(status) != TF_OK) {
       return shape;
     }
   }
@@ -53,7 +53,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
     TFE_TensorHandle* handle, TF_Status* status) {
   const tensorflow::Tensor* tensor;
   status->status = handle->handle->Tensor(&tensor);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     return nullptr;
   }
 
@@ -139,7 +139,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
   // If the tensor is not an XLA tensor, the device shape is
   // the same as regular tensor shape.
   std::vector<int64> dev_dims = TensorShapeAsVector(handle, status);
-  if (!status->status.ok()) {
+  if (TF_GetCode(status) != TF_OK) {
     return nullptr;
   }
   return new TFE_TensorDebugInfo(dev_dims);
diff --git a/tensorflow/c/eager/c_api_debug_test.cc b/tensorflow/c/eager/c_api_debug_test.cc
index cddb9f6e00e9d639026f4bbe061d58f76771c0a9..4e987c745ecabd85c89d039468eb94ed51b4d00f 100644
--- a/tensorflow/c/eager/c_api_debug_test.cc
+++ b/tensorflow/c/eager/c_api_debug_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 TEST(CApiDebug, ScalarCPU) {
-  TFE_TensorHandle* h = TestScalarTensorHandle();
+  TFE_TensorHandle* h = TestScalarTensorHandle(1.0f);
   TF_Status* status = TF_NewStatus();
   TFE_TensorDebugInfo* debug_info = TFE_TensorHandleTensorDebugInfo(h, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index 1ce03fb22693960627c27cd4aec58106a9ff3218..2e825341dfeae7cb2276add9006cc58ecdcdfe2b 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
 
 using tensorflow::string;
 
@@ -24,16 +26,20 @@ void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
   op->operation.ConsumeInput(h->handle);
 }
 
-TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
+TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx) {
   return new TFE_Profiler(ctx);
 }
 
+bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
+  return profiler->profiler->Status().ok();
+}
+
 void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
 
 void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
                                    TF_Buffer* buf, TF_Status* status) {
   TFE_ContextAsyncWait(ctx, status);
-  if (!status->status.ok()) return;
+  if (TF_GetCode(status) != TF_OK) return;
   string content;
   status->status = profiler->profiler->SerializeToString(&content);
   void* data = tensorflow::port::Malloc(content.length());
@@ -44,3 +50,45 @@ void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
     tensorflow::port::Free(data);
   };
 }
+
+TFE_ProfilerContext* TFE_NewProfilerContext() {
+  return new TFE_ProfilerContext;
+}
+
+void TFE_ProfilerContextSetEagerContext(TFE_ProfilerContext* profiler_context,
+                                        TFE_Context* eager_context) {
+  profiler_context->profiler_context.eager_context = &eager_context->context;
+}
+
+void TFE_DeleteProfilerContext(TFE_ProfilerContext* profiler_context) {
+  delete profiler_context;
+}
+
+void TFE_StartProfilerServer(TFE_ProfilerContext* context, int port) {
+  // Release child thread intentionally. The child thread can be terminate by
+  // terminating the main thread.
+  tensorflow::StartProfilerServer(&context->profiler_context, port).release();
+}
+
+void TFE_ContextEnableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(true);
+}
+
+void TFE_ContextDisableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(false);
+}
+
+bool TFE_ProfilerClientStartTracing(const char* service_addr,
+                                    const char* logdir, const char* worker_list,
+                                    bool include_dataset_ops, int duration_ms,
+                                    int num_tracing_attempts) {
+  tensorflow::Status s =
+      tensorflow::profiler::client::ValidateHostPortPair(service_addr);
+  if (!s.ok()) {
+    return false;
+  }
+  s = tensorflow::profiler::client::StartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts);
+  return s.ok();
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 9eb80f521624e0116dd8ea5e4dbbf7e3d350a09c..219b9f40720a4fc212bd6e191b5bb441cf2abeb8 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -25,6 +25,8 @@ extern "C" {
 TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
                                               TF_Status* status);
 
+typedef struct TFE_ProfilerContext TFE_ProfilerContext;
+
 // A profiler which will start profiling when creating the object and will stop
 // when the object is destroyed. It will profile all operations run under the
 // given TFE_Context. Multiple instance of it can be created, but at most one
@@ -32,17 +34,59 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
 // Thread-safety: TFE_Profiler is thread-safe.
 typedef struct TFE_Profiler TFE_Profiler;
 
-TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
+TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx);
+TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
 TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
 
-// The output string is a binary string of tensorflow.tfprof.ProfileProto.
-// User can write the string to file for offline analysis by tfprof command-line
-// tools or graphical user interface.
+// The output string is a binary string of tensorflow.tpu.Trace. User can write
+// the string to file for offline analysis by tensorboard.
 TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Context* ctx,
                                                          TFE_Profiler* profiler,
                                                          TF_Buffer* buf,
                                                          TF_Status* status);
 
+// Return a new profiler context object.
+TF_CAPI_EXPORT extern TFE_ProfilerContext* TFE_NewProfilerContext(void);
+
+// Set the eager context in TFE_ProfilerServerOptions
+TF_CAPI_EXPORT extern void TFE_ProfilerContextSetEagerContext(
+    TFE_ProfilerContext* profiler_context, TFE_Context* eager_context);
+
+// Destroy a profiler context object.
+TF_CAPI_EXPORT extern void TFE_DeleteProfilerContext(
+    TFE_ProfilerContext* profiler_context);
+
+// Start a profiler grpc server which listens to specified port. It will start
+// the server on its own thread. It can be shutdown by terminating tensorflow.
+// It can be used in both Eager mode and graph mode. Creating multiple profiler
+// server is allowed. The service defined in
+// tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+// file following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_ProfilerContext* context,
+                                                   int port);
+
+// Enables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextEnableGraphCollection(TFE_Context* ctx);
+
+// Disables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextDisableGraphCollection(TFE_Context* ctx);
+
+// Send a grpc request to profiler server (service_addr) to perform on-demand
+// profiling and save the result into logdir which can be visualized by
+// TensorBoard. worker_list is the list of worker TPUs separated by ','. Set
+// include_dataset_opts to false to profile longer traces. It will block the
+// caller thread until receives tracing result.
+// This API is designed for TensorBoard, for end user, please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile instead following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern bool TFE_ProfilerClientStartTracing(
+    const char* service_addr, const char* logdir, const char* worker_list,
+    bool include_dataset_ops, int duration_ms, int num_tracing_attempts);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
index c3d29bd2e094c1dca50e3e132bc9002503cb056b..d85048caa7c7f727271352883cb834a2575bd251 100644
--- a/tensorflow/c/eager/c_api_experimental_test.cc
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
-#include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 using tensorflow::string;
 
@@ -41,9 +41,12 @@ void ExecuteWithProfiling(bool async) {
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
   TFE_Context* ctx = TFE_NewContext(opts, status);
-  TFE_Profiler* profiler = TFE_NewProfiler(ctx);
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+  TFE_Profiler* profiler = TFE_NewProfiler(profiler_context);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
+  TFE_DeleteProfilerContext(profiler_context);
 
   TFE_TensorHandle* m = TestMatrixTensorHandle();
   TFE_Op* matmul = MatMulOp(ctx, m, m);
@@ -70,17 +73,17 @@ void ExecuteWithProfiling(bool async) {
   TFE_ProfilerSerializeToString(ctx, profiler, profiler_result, status);
   TFE_DeleteProfiler(profiler);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  tensorflow::tfprof::ProfileProto profile_proto;
+  profiler::Trace profile_proto;
   EXPECT_TRUE(profile_proto.ParseFromString(
       {reinterpret_cast<const char*>(profiler_result->data),
        profiler_result->length}));
   string profile_proto_str = profile_proto.DebugString();
   if (!gpu_device_name.empty()) {
-    EXPECT_TRUE(HasSubstr(profile_proto_str, "gpu:0"));
+    EXPECT_TRUE(HasSubstr(profile_proto_str, "GPU:0"));
     // device name with "stream:all" is collected by Device Tracer.
     EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all"));
   }
-  EXPECT_TRUE(HasSubstr(profile_proto_str, "cpu:0"));
+  EXPECT_TRUE(HasSubstr(profile_proto_str, "CPU:0"));
   TF_DeleteBuffer(profiler_result);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
@@ -100,5 +103,27 @@ void ExecuteWithProfiling(bool async) {
 TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
 TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
 
+TEST(CAPI, MultipleProfilerSession) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+
+  TFE_Profiler* profiler1 = TFE_NewProfiler(profiler_context);
+  EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
+
+  TFE_Profiler* profiler2 = TFE_NewProfiler(profiler_context);
+  EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
+
+  TFE_DeleteProfiler(profiler1);
+  TFE_DeleteProfiler(profiler2);
+  TFE_DeleteProfilerContext(profiler_context);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index d330aa4290ea7ddf0461a2277bb3d6eb215df241..35dafb9a7f14bfe1fad21bda35685598164f3895 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -52,7 +52,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/profiler/lib/eager_profiler.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
 #include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
@@ -83,6 +83,12 @@ struct TFE_TensorHandle {
   TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
   tensorflow::TensorHandle* handle;
+
+  // Create a symbolic tensor.
+  TFE_TensorHandle(TF_Output t, TF_DataType dtype)
+      : handle(new tensorflow::TensorHandle(
+            tensorflow::OutputGraphNode{t.oper, t.index},
+            static_cast<tensorflow::DataType>(dtype))) {}
 };
 
 struct TFE_TensorDebugInfo {
@@ -93,19 +99,36 @@ struct TFE_TensorDebugInfo {
   std::vector<tensorflow::int64> dev_dims;
 };
 
+struct TFE_OpInferenceContext {
+  explicit TFE_OpInferenceContext(const tensorflow::OpDef* op_def)
+      : op_def(op_def) {}
+
+  const tensorflow::OpDef* op_def;  // op definition from protobuf
+  int input_arg_idx = 0;  // arg definition index for the next input to be added
+  tensorflow::gtl::FlatSet<std::string> attrs;  // attributes inferred so far
+};
+
 struct TFE_Op {
   TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
-         const tensorflow::AttrTypeMap* t)
-      : operation(&ctx->context, op, is_function, t) {}
+         const tensorflow::AttrTypeMap* t,
+         TFE_OpInferenceContext* inference_ctx)
+      : operation(&ctx->context, op, is_function, t),
+        inference_ctx(inference_ctx) {}
 
   tensorflow::EagerOperation operation;
+  std::unique_ptr<TFE_OpInferenceContext> inference_ctx;
+};
+
+struct TFE_ProfilerContext {
+  tensorflow::ProfilerContext profiler_context;
 };
 
 struct TFE_Profiler {
-  TFE_Profiler(TFE_Context* ctx)
-      : profiler(tensorflow::EagerProfiler::Create(&ctx->context)) {}
+  TFE_Profiler(TFE_ProfilerContext* ctx) {
+    profiler = tensorflow::ProfilerSession::Create(&ctx->profiler_context);
+  }
 
-  std::unique_ptr<tensorflow::EagerProfiler> profiler;
+  std::unique_ptr<tensorflow::ProfilerSession> profiler;
 };
 
 namespace tensorflow {
@@ -115,4 +138,24 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
                           const char* attr_name, TF_Status* status);
 }  // namespace tensorflow
 
+struct TFE_TraceContext {
+  TF_Graph* const graph;
+
+  unsigned int node_counter = 0;
+  // Each tensor handle will have its ref count incremented when it's added as a
+  // map key, and decremented when this object is destroyed.
+  std::map<tensorflow::TensorHandle*, TF_Output> input_tensor_map;
+  std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>* input_tensors =
+      nullptr;
+
+  TFE_TraceContext(TF_Graph* graph) : graph(graph) {}
+
+  ~TFE_TraceContext() {
+    delete input_tensors;
+    for (auto input : input_tensor_map) {
+      input.first->Unref();
+    }
+  }
+};
+
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3d1ca4fb4b561a03ea9d879b1876fb1fd08a3139..b5e55420016bc9015ab71a515299838be953f5f4 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 
 #include <string.h>
 #include "absl/strings/match.h"
@@ -1626,4 +1627,158 @@ TEST(CAPI, TestTFE_TensorHandleCopySharingUnderlyingTensorHandle) {
   TFE_DeleteTensorHandle(h);
   TFE_DeleteTensorHandle(h_shares_tensor);
 }
+
+TEST(CAPI, TestTFE_OpInferSingleInputAttrs) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input = TestMatrixTensorHandle();
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* minOp = TFE_NewOp(ctx, "Min", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(minOp, input, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(minOp, axis, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  tensorflow::AttrValueMap attr_values;
+  minOp->operation.Attrs().FillAttrValueMap(&attr_values);
+  tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T");
+  EXPECT_NE(attr_found, attr_values.cend());
+  EXPECT_EQ(attr_found->second.type(), tensorflow::DataType::DT_FLOAT);
+  attr_found = attr_values.find("Tidx");
+  EXPECT_NE(attr_found, attr_values.cend());
+  EXPECT_EQ(attr_found->second.type(), tensorflow::DataType::DT_INT32);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(minOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_DeleteStatus(status);
+  TFE_DeleteOp(minOp);
+  TFE_DeleteTensorHandle(input);
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+}
+
+TEST(CAPI, TestTFE_OpInferSingleTypeInputListAttrs) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input1 = TestMatrixTensorHandle();
+  TFE_TensorHandle* input2 = TestMatrixTensorHandle();
+  TFE_TensorHandle* dim = TestScalarTensorHandle(0);
+  TFE_Op* concatOp = TFE_NewOp(ctx, "Concat", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* inputs[] = {input1, input2};
+  TFE_OpAddInput(concatOp, dim, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInputList(concatOp, inputs, 2, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  tensorflow::AttrValueMap attr_values;
+  concatOp->operation.Attrs().FillAttrValueMap(&attr_values);
+  tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T");
+  EXPECT_NE(attr_found, attr_values.cend());
+  EXPECT_EQ(attr_found->second.type(), tensorflow::DataType::DT_FLOAT);
+  attr_found = attr_values.find("N");
+  EXPECT_NE(attr_found, attr_values.cend());
+  EXPECT_EQ(attr_found->second.i(), 2);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(concatOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_DeleteStatus(status);
+  TFE_DeleteOp(concatOp);
+  TFE_DeleteTensorHandle(input1);
+  TFE_DeleteTensorHandle(input2);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteTensorHandle(dim);
+  TFE_DeleteContext(ctx);
+}
+
+TEST(CAPI, TestTFE_OpInferMixedTypeInputListAttrs) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* condition = TestScalarTensorHandle(true);
+  TFE_TensorHandle* t1 = TestMatrixTensorHandle();
+  TFE_TensorHandle* t2 = TestAxisTensorHandle();
+  TFE_Op* assertOp = TFE_NewOp(ctx, "Assert", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(assertOp, condition, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* data[] = {condition, t1, t2};
+  TFE_OpAddInputList(assertOp, data, 3, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  tensorflow::AttrValueMap attr_values;
+  assertOp->operation.Attrs().FillAttrValueMap(&attr_values);
+  tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T");
+  EXPECT_NE(attr_found, attr_values.cend());
+  EXPECT_EQ(attr_found->second.list().type(0), tensorflow::DataType::DT_BOOL);
+  EXPECT_EQ(attr_found->second.list().type(1), tensorflow::DataType::DT_FLOAT);
+  EXPECT_EQ(attr_found->second.list().type(2), tensorflow::DataType::DT_INT32);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+  TFE_Execute(assertOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_DeleteStatus(status);
+  TFE_DeleteOp(assertOp);
+  TFE_DeleteTensorHandle(condition);
+  TFE_DeleteTensorHandle(t1);
+  TFE_DeleteTensorHandle(t2);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+}
+
+TEST(CAPI, TestTFE_OpAttrsInferenceDisabledWhenNotCallingOpAddInputList) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input1 = TestMatrixTensorHandle();
+  TFE_TensorHandle* input2 = TestMatrixTensorHandle();
+  TFE_TensorHandle* dim = TestScalarTensorHandle(0);
+  TFE_Op* concatOp = TFE_NewOp(ctx, "Concat", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* inputs[] = {input1, input2};
+  TFE_OpAddInput(concatOp, dim, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  CHECK(concatOp->inference_ctx);
+  TFE_OpAddInput(concatOp, inputs[0], status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  EXPECT_FALSE(concatOp->inference_ctx) << "Inference context is still present";
+  TFE_OpAddInput(concatOp, inputs[1], status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  tensorflow::AttrValueMap attr_values;
+  concatOp->operation.Attrs().FillAttrValueMap(&attr_values);
+  EXPECT_EQ(attr_values.find("T"), attr_values.end());
+  EXPECT_EQ(attr_values.find("N"), attr_values.end());
+
+  TF_DeleteStatus(status);
+  TFE_DeleteOp(concatOp);
+  TFE_DeleteTensorHandle(input1);
+  TFE_DeleteTensorHandle(input2);
+  TFE_DeleteTensorHandle(dim);
+  TFE_DeleteContext(ctx);
+}
 }  // namespace
diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc
index bd38127d50c171af801dd1b937acefdba491b4a6..17d17c0b7f7909e8dc1aaea61ade2cce1c466a3f 100644
--- a/tensorflow/c/eager/c_api_test_util.cc
+++ b/tensorflow/c/eager/c_api_test_util.cc
@@ -21,8 +21,8 @@ limitations under the License.
 
 using tensorflow::string;
 
-TFE_TensorHandle* TestScalarTensorHandle() {
-  float data[] = {1.0f};
+TFE_TensorHandle* TestScalarTensorHandle(float value) {
+  float data[] = {value};
   TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(float));
   memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
   TF_Status* status = TF_NewStatus();
@@ -33,6 +33,30 @@ TFE_TensorHandle* TestScalarTensorHandle() {
   return th;
 }
 
+TFE_TensorHandle* TestScalarTensorHandle(int value) {
+  int data[] = {value};
+  TF_Tensor* t = TF_AllocateTensor(TF_INT32, nullptr, 0, sizeof(int));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
+TFE_TensorHandle* TestScalarTensorHandle(bool value) {
+  bool data[] = {value};
+  TF_Tensor* t = TF_AllocateTensor(TF_BOOL, nullptr, 0, sizeof(bool));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
 TFE_TensorHandle* DoubleTestMatrixTensorHandle() {
   int64_t dims[] = {2, 2};
   double data[] = {1.0, 2.0, 3.0, 4.0};
diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h
index 75ef9459e93b4f2ed471c423a34565594efc1714..4ff3ff4301f63c001bec1eb23fb9e098b78c6a5e 100644
--- a/tensorflow/c/eager/c_api_test_util.h
+++ b/tensorflow/c/eager/c_api_test_util.h
@@ -20,7 +20,13 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 // Return a tensor handle containing a float scalar
-TFE_TensorHandle* TestScalarTensorHandle();
+TFE_TensorHandle* TestScalarTensorHandle(float value);
+
+// Return a tensor handle containing a int scalar
+TFE_TensorHandle* TestScalarTensorHandle(int value);
+
+// Return a tensor handle containing a bool scalar
+TFE_TensorHandle* TestScalarTensorHandle(bool value);
 
 // Return a tensor handle containing a 2x2 matrix of doubles
 TFE_TensorHandle* DoubleTestMatrixTensorHandle();
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 5c11f51e8749de84547ae873f5f55ebd42bc4b3d..29896b0d2bf6860775bed00284e7d6ff7992f474 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -639,7 +639,9 @@ Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
       }
     }
   }
-  CHECK(state.op_tape.empty());
+  if (!state.op_tape.empty()) {
+    return tensorflow::errors::Internal("Invalid tape state.");
+  }
   result->reserve(source_tensor_ids.size());
   gtl::FlatSet<int64> used_gradient_ids(source_tensor_ids.size());
   for (auto is : source_tensor_ids) {
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index c45efd7fae252dc7c6c98354faa66c4d690bd5e7..71181ae430ab64106e2a75937bd54fbf2efc61ac 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -48,9 +48,10 @@ TF_KernelBuilder* TF_NewKernelBuilder(
 }
 
 void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
-  DCHECK_NE(builder, nullptr);
-  delete builder->cc_builder;
-  delete builder;
+  if (builder != nullptr) {
+    delete builder->cc_builder;
+    delete builder;
+  }
 }
 
 namespace tensorflow {
@@ -172,9 +173,10 @@ void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
   cc_ctx->CtxFailure(s);
 }
 
-#define DEFINE_TF_GETATTR_(struct_name, func, c_type, cc_type)                 \
-  void struct_name##_GetAttr##func(struct_name* ctx, const char* attr_name,    \
-                                   c_type* val, TF_Status* status) {           \
+#define DEFINE_TF_GETATTR(func, c_type, cc_type)                               \
+  void TF_OpKernelConstruction_GetAttr##func(TF_OpKernelConstruction* ctx,     \
+                                             const char* attr_name,            \
+                                             c_type* val, TF_Status* status) { \
     TF_SetStatus(status, TF_OK, "");                                           \
     cc_type v;                                                                 \
     auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx); \
@@ -185,10 +187,6 @@ void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
     }                                                                          \
   }
 
-#define DEFINE_TF_GETATTR(func, c_type, cc_type)                     \
-  DEFINE_TF_GETATTR_(TF_OpKernelConstruction, func, c_type, cc_type) \
-  DEFINE_TF_GETATTR_(TF_OpKernelContext, func, c_type, cc_type)
-
 DEFINE_TF_GETATTR(Type, TF_DataType, tensorflow::DataType)
 
 TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) {
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index b015d0103969355e8566242bfcc007f697c6ae18..c47bfa8aa3a721d422a0a1536b924f3e53793193 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -137,15 +137,6 @@ TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrType(
     TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* val,
     TF_Status* status);
 
-// Interprets the named kernel context attribute as a TF_DataType and places it
-// into *val. *status is set to TF_OK.
-//
-// If the attribute could not be found or could not be interpreted as
-// TF_DataType, *status is populated with an error.
-TF_CAPI_EXPORT extern void TF_OpKernelContext_GetAttrType(
-    TF_OpKernelContext* ctx, const char* attr_name, TF_DataType* val,
-    TF_Status* status);
-
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..597182ab016a1b9564579ac0abf9667cf6d1dce9
--- /dev/null
+++ b/tensorflow/c/kernels/BUILD
@@ -0,0 +1,44 @@
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_kernel_library",
+)
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+tf_kernel_library(
+    name = "bitcast_op",
+    prefix = "bitcast_op",
+    deps = [
+        "//tensorflow/c:kernels",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
+    ],
+)
+
+tf_cc_test(
+    name = "bitcast_op_test",
+    srcs = ["bitcast_op_test.cc"],
+    deps = [
+        ":bitcast_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Changes to the Android srcs here should be replicated in
+# tensorflow/contrib/makefile/tf_op_files.txt
+# LINT.IfChange
+filegroup(
+    name = "android_all_ops",
+    srcs = [
+        "bitcast_op.cc",
+    ],
+)
+# LINT.ThenChange(//tensorflow/contrib/makefile/tf_op_files.txt)
diff --git a/tensorflow/c/kernels/bitcast_op.cc b/tensorflow/c/kernels/bitcast_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2f313af38677a3b93580fab1730363b43b32615
--- /dev/null
+++ b/tensorflow/c/kernels/bitcast_op.cc
@@ -0,0 +1,171 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <sstream>
+
+#include "tensorflow/c/kernels.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/selective_registration.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/types.h"
+
+// BitcastOp implements a bitcast kernel, creating an output tensor that shares
+// the same data buffer as the input but with a different shape and/or data
+// type. Its inputs are:
+//
+//   * the input tensor
+//   * an attribute named "T" containing the TF_DataType of the input tensor
+//   * an attribute named "type" containing the TF_DataType of the output tensor
+//
+// Given an input tensor of shape [...], if the input DataType "T" is larger
+// than the output DataType "type", then the shape changes from [...]
+// to [..., sizeof(T)/sizeof(type)].
+//
+// If "T" is smaller than "type", the operator requires that the rightmost
+// dimension be equal to sizeof(type)/sizeof(T). The shape then goes from
+// [..., sizeof(type)/sizeof(T)] to [...].
+//
+// Bitcast is implemented as a low-level cast, so machines with different endian
+// orderings will give different results.
+typedef struct BitcastOp {
+  TF_DataType input_data_type;
+  TF_DataType output_data_type;
+  size_t in_size;
+  size_t out_size;
+} BitcastOp;
+
+static void* BitcastOp_Create(TF_OpKernelConstruction* ctx) {
+  auto* kernel = new BitcastOp;
+
+  TF_Status* s = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "T", &kernel->input_data_type, s);
+
+  if (TF_GetCode(s) == TF_OK) {
+    TF_OpKernelConstruction_GetAttrType(ctx, "type", &kernel->output_data_type,
+                                        s);
+  }
+
+  if (TF_GetCode(s) == TF_OK) {
+    kernel->in_size = TF_DataTypeSize(kernel->input_data_type);
+    kernel->out_size = TF_DataTypeSize(kernel->output_data_type);
+
+    size_t check_size = std::max(kernel->in_size, kernel->out_size) %
+                        std::min(kernel->in_size, kernel->out_size);
+    if (check_size != 0) {
+      std::ostringstream err;
+      err << "cannot convert between datatype " << kernel->input_data_type
+          << " and " << kernel->output_data_type;
+      TF_SetStatus(s, TF_INVALID_ARGUMENT, err.str().c_str());
+    }
+  }
+
+  if (TF_GetCode(s) != TF_OK) {
+    TF_OpKernelConstruction_Failure(ctx, s);
+    delete kernel;
+    kernel = nullptr;
+  }
+
+  TF_DeleteStatus(s);
+  return kernel;
+}
+
+static void BitcastOp_Delete(void* kernel) {
+  delete static_cast<BitcastOp*>(kernel);
+}
+
+static void BitcastOp_Compute(void* kernel, TF_OpKernelContext* ctx) {
+  auto* k = static_cast<BitcastOp*>(kernel);
+  int dim_count = 0;
+
+  TF_Tensor* tensor;
+  TF_Status* status = TF_NewStatus();
+  TF_GetInput(ctx, 0, &tensor, status);
+  if (TF_GetCode(status) == TF_OK) {
+    dim_count = TF_NumDims(tensor);
+    if (!(k->in_size >= k->out_size ||
+          (dim_count > 0 &&
+           TF_Dim(tensor, dim_count - 1) == k->out_size / k->in_size))) {
+      std::ostringstream err;
+      err << "Cannot bitcast from " << k->input_data_type << " to "
+          << k->output_data_type;
+      TF_SetStatus(status, TF_INVALID_ARGUMENT, err.str().c_str());
+    }
+  }
+
+  if (TF_GetCode(status) == TF_OK) {
+    auto* dims = new int64_t[dim_count + 1];
+    int new_dim_count = dim_count;
+    for (int dim = 0; dim < dim_count; ++dim) {
+      dims[dim] = TF_Dim(tensor, dim);
+    }
+    if (k->out_size < k->in_size) {
+      dims[new_dim_count++] = static_cast<int64_t>(k->in_size / k->out_size);
+    } else if (k->out_size > k->in_size) {
+      --new_dim_count;
+    }
+
+    TF_Tensor* output = TF_AllocateTensor(k->output_data_type, dims, 0,
+                                          TF_DataTypeSize(k->output_data_type));
+    TF_TensorBitcastFrom(tensor, k->output_data_type, output, dims,
+                         new_dim_count, status);
+    if (TF_GetCode(status) == TF_OK) {
+      TF_SetOutput(ctx, 0, output, status);
+    }
+    delete[] dims;
+    TF_DeleteTensor(output);
+  }
+
+  if (TF_GetCode(status) != TF_OK) {
+    TF_OpKernelContext_Failure(ctx, status);
+  }
+  TF_DeleteStatus(status);
+  TF_DeleteTensor(tensor);
+}
+
+static void RegisterBitcastOp() {
+  TF_Status* status = TF_NewStatus();
+
+  {
+    auto* builder = TF_NewKernelBuilder("Bitcast", tensorflow::DEVICE_CPU,
+                                        &BitcastOp_Create, &BitcastOp_Compute,
+                                        &BitcastOp_Delete);
+    TF_RegisterKernelBuilder("BitcastOp", builder, status);
+    CHECK_EQ(TF_OK, TF_GetCode(status))
+        << "Error while registering bitcast kernel";
+  }
+
+#if GOOGLE_CUDA
+  {
+    auto* builder = TF_NewKernelBuilder("Bitcast", tensorflow::DEVICE_GPU,
+                                        &BitcastOp_Create, &BitcastOp_Compute,
+                                        &BitcastOp_Delete);
+    TF_RegisterKernelBuilder("BitcastOp", builder, status);
+    CHECK_EQ(TF_OK, TF_GetCode(status))
+        << "Error while registering CUDA bitcast kernel";
+  }
+#endif
+
+  TF_DeleteStatus(status);
+}
+
+// A dummy static variable initialized by a lambda whose side-effect is to
+// register the bitcast kernel.
+static bool BitcastOpIsRegistered = []() {
+  if (SHOULD_REGISTER_OP_KERNEL("BitcastOp")) {
+    RegisterBitcastOp();
+  }
+  return true;
+}();
diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06ffcca19dad13dc2fcae15125e25fc546562566
--- /dev/null
+++ b/tensorflow/c/kernels/bitcast_op_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class DummyDevice : public DeviceBase {
+ public:
+  DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {}
+  bool RequiresRecordingAccessedTensors() const override { return save_; }
+  Allocator* GetAllocator(AllocatorAttributes /*attr*/) override {
+    return cpu_allocator();
+  }
+
+ private:
+  bool save_;
+};
+
+void TestBitcastOp(Tensor* input_tensor, DataType out_type,
+                   TensorShape expected_shape, error::Code expected_code) {
+  Status status;
+  NodeDef def;
+  def.set_op("Bitcast");
+  def.set_device(DEVICE_CPU);
+
+  AttrValue typeAttr;
+  SetAttrValue(input_tensor->dtype(), &typeAttr);
+
+  AttrValue outTypeAttr;
+  SetAttrValue(out_type, &outTypeAttr);
+
+  (*def.mutable_attr())["T"] = typeAttr;
+  (*def.mutable_attr())["type"] = outTypeAttr;
+
+  def.add_input(
+      strings::StrCat("input1: ", DataTypeString(input_tensor->dtype())));
+
+  std::unique_ptr<OpKernel> kernel =
+      CreateOpKernel(DeviceType(DEVICE_CPU), nullptr, nullptr, def, 1, &status);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  OpKernelContext::Params params;
+  DummyDevice dummy_device(nullptr, false);
+  params.device = &dummy_device;
+  params.op_kernel = kernel.get();
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  inputs.emplace_back(input_tensor);
+  params.inputs = &inputs;
+
+  OpKernelContext ctx(&params);
+  kernel->Compute(&ctx);
+  ASSERT_EQ(expected_code, ctx.status().code());
+  if (expected_code == error::OK) {
+    ASSERT_EQ(expected_shape, ctx.mutable_output(0)->shape())
+        << ctx.mutable_output(0)->shape().DebugString();
+  }
+}
+
+TEST(BitcastOpTest, TestUpcast) {
+  Tensor int8_input(DT_UINT8, {8});
+  for (int i = 0; i < 8; i++) {
+    int8_input.vec<uint8>()(i) = static_cast<uint8>(1);
+  }
+  TestBitcastOp(&int8_input, DT_UINT64, TensorShape(), error::OK);
+}
+
+TEST(BitcastOpTest, TestDowncast) {
+  Tensor int64_input(static_cast<uint64>(1));
+  TestBitcastOp(&int64_input, DT_UINT8, TensorShape({8}), error::OK);
+}
+
+TEST(BitcastOpTest, TestCastToSameSize) {
+  Tensor int32_input(DT_UINT32, {4, 6});
+  TestBitcastOp(&int32_input, DT_UINT8, TensorShape({4, 6, 4}), error::OK);
+}
+
+TEST(BitcastOpTest, TestImpossibleCast) {
+  Tensor int8_input(DT_UINT8, {1});
+  TestBitcastOp(&int8_input, DT_UINT32, TensorShape(), error::INVALID_ARGUMENT);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index 531eb598fa8b6ff3f7c87641091f8d5a11752728..608887722f7bca44c884a3426d5e378e9387a530 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -36,6 +36,15 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
   struct MyCustomKernel* s = new struct MyCustomKernel;
   s->created = true;
   s->compute_called = false;
+
+  // Exercise attribute reads.
+  TF_DataType type;
+  TF_Status* status = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status));
+  EXPECT_EQ(TF_FLOAT, type);
+  TF_DeleteStatus(status);
+
   return s;
 }
 
@@ -43,17 +52,7 @@ static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
   struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
   s->compute_called = true;
   if (ctx != nullptr) {
-    TF_Status* status = TF_NewStatus();
-
     EXPECT_EQ(43, TF_StepId(ctx));
-
-    // Exercise attribute reads.
-    TF_DataType type;
-    TF_OpKernelContext_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
-    EXPECT_EQ(TF_OK, TF_GetCode(status));
-    EXPECT_EQ(TF_FLOAT, type);
-
-    TF_DeleteStatus(status);
   }
 }
 
@@ -224,4 +223,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
   }
 }
 
+TEST(TestKernel, DeleteKernelBuilderIsOkOnNull) {
+  TF_DeleteKernelBuilder(nullptr);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 98d8393332269ae349cf8aa5c0b612c6f17172e6..2c9d9f3a15b4dfec9d8fe511c8005cafc1d71ef7 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -41,6 +41,15 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
   RecordMutation(graph, *op, "setting attribute");
 }
 
+void ClearAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
+               TF_Status* status) {
+  AttrValue attr_val;
+
+  mutex_lock l(graph->mu);
+  op->node.ClearAttr(attr_name);
+  RecordMutation(graph, *op, "clearing attribute");
+}
+
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) {
   mutex_lock l(graph->mu);
   op->node.set_requested_device(device);
@@ -80,7 +89,7 @@ void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
   status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index,
                                            &dst.oper->node, dst.index);
 
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     // This modification only updates the destination node for
     // the purposes of running this graph in a session. Thus, we don't
     // record the source node as being modified.
@@ -154,7 +163,7 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
     tensorflow::shape_inference::ShapeHandle shape;
     status->status =
         ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
-    if (!status->status.ok()) return;
+    if (TF_GetCode(status) != TF_OK) return;
     shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
   }
   ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
@@ -165,7 +174,7 @@ void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
   mutex_lock l(graph->mu);
   status->status = graph->graph.AddWhileInputHack(&new_src.oper->node,
                                                   new_src.index, &dst->node);
-  if (status->status.ok()) {
+  if (TF_GetCode(status) == TF_OK) {
     // This modification only updates the destination node for
     // the purposes of running this graph in a session. Thus, we don't
     // record the source node as being modified.
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 44779ca656165dd65590cb5e9ea3ccf71165ed63..f26c0cb2ae2f6e00a247660a02525901e87920cd 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -32,6 +32,11 @@ void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input);
 void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
              TF_Buffer* attr_value_proto, TF_Status* status);
 
+// Clears the attr in the node_def Protocol Buffer and sets a status upon
+// completion.
+void ClearAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
+               TF_Status* status);
+
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 
 // Updates 'dst' to consume 'new_src'.
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index a09becc49b10d2c58f98fbcc11df5190f794c1d4..545e472115ad1a3f001754cbec37448696076c35 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -8,6 +8,19 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+filegroup(
+    name = "srcs",
+    srcs = [
+        "framework/gradients.h",
+        "framework/ops.h",
+        "framework/scope.h",
+        "framework/scope_internal.h",
+        "ops/array_ops.h",
+        "ops/while_loop.h",
+        "//tensorflow/cc/saved_model:loader.h",
+    ],
+)
+
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_library_with_android_deps",
@@ -150,6 +163,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -586,16 +600,32 @@ tf_gen_op_wrappers_cc(
     pkg = "//tensorflow/core",
 )
 
-cc_library_with_android_deps(
+tf_gen_op_wrappers_cc(
+    name = "tpu_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
+cc_library(
     name = "cc_op_gen_main",
     srcs = [
         "framework/cc_op_gen.cc",
         "framework/cc_op_gen.h",
         "framework/cc_op_gen_main.cc",
     ],
-    android_deps = [
-        "//tensorflow/core:android_tensorflow_lib",
-    ],
     copts = tf_copts(),
     data = [
         "//tensorflow/core/api_def:base_api_def",
@@ -666,7 +696,7 @@ tf_cc_binary(
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:windows": [],
-        "//tensorflow:darwin": [
+        "//tensorflow:macos": [
             "-lm",
             "-lpthread",
         ],
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 39593370d1c243e84dc5b6091724d1d404c102b0..43a33cbea6e1e4a50f61cc7d6d8d70cac6a603d2 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -321,6 +321,7 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
           {"tensor", {"TensorProto", true}},
           {"list(tensor)", {"gtl::ArraySlice<TensorProto>", true}},
           {"func", {"NameAttrList", true}},
+          {"list(func)", {"gtl::ArraySlice<NameAttrList>", true}},
       };
 
   auto entry = attr_type_map->find(attr_type);
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index affd90b1bcc7cb4a8b3ffed6aeeb4bd480f5e314..303fdf64ec723864848096009a57dabda2fc93e4 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -96,7 +96,7 @@ class SymbolicGradientBuilder {
   // Used to identify nodes at which to stop backprop.
   std::unordered_set<int> GetStopBackpropNodes(
       const std::vector<bool>& reachable_nodes,
-      std::unordered_set<int> output_nodes);
+      const std::unordered_set<int>& output_nodes);
 
   const Scope& scope_;
   const ops::GradOpRegistry* registry_;
@@ -167,7 +167,6 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad,
 std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
   std::vector<bool> reachable_nodes(scope_.graph()->num_node_ids(), false);
   std::deque<Node*> queue;
-  std::vector<bool> visited(scope_.graph()->num_node_ids(), false);
   for (const Output& out : outputs_) {
     if (!reachable_nodes[out.node()->id()]) {
       queue.push_back(out.node());
@@ -180,10 +179,10 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
     queue.pop_front();
     for (const Edge* e : n->in_edges()) {
       if (e->IsControlEdge()) continue;
-      if (visited[e->src()->id()]) continue;
-      queue.push_back(e->src());
-      reachable_nodes[e->src()->id()] = true;
-      visited[e->src()->id()] = true;
+      if (!reachable_nodes[e->src()->id()]) {
+        queue.push_back(e->src());
+        reachable_nodes[e->src()->id()] = true;
+      }
     }
   }
   return reachable_nodes;
@@ -191,7 +190,7 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
 
 std::unordered_set<int> SymbolicGradientBuilder::GetStopBackpropNodes(
     const std::vector<bool>& reachable_nodes,
-    std::unordered_set<int> output_nodes) {
+    const std::unordered_set<int>& output_nodes) {
   // Output nodes that get transitively consumed by other `outputs_` are stored
   // in `internal_outputs`.
   std::unordered_set<int> internal_outputs;
@@ -201,9 +200,9 @@ std::unordered_set<int> SymbolicGradientBuilder::GetStopBackpropNodes(
   // `output_` node was encountered, pair.second will be nullptr.
   std::deque<std::pair<Node*, Node*>> queue;
   for (const Output& nout : inputs_) {
-    if (visited.find(nout.node()) == visited.end()) {
+    auto const& pair = visited.insert(nout.node());
+    if (pair.second) {
       queue.push_back(std::make_pair(nout.node(), static_cast<Node*>(nullptr)));
-      visited.insert(nout.node());
     }
   }
   // BFS from nodes in 'inputs_' along out edges for the entire graph. Internal
@@ -217,22 +216,23 @@ std::unordered_set<int> SymbolicGradientBuilder::GetStopBackpropNodes(
     for (const Edge* e : n->out_edges()) {
       // If a node is not reachable from outputs_, we can stop.
       if (e->IsControlEdge() || !reachable_nodes[e->dst()->id()]) continue;
-      if (visited.find(e->dst()) != visited.end()) continue;
-
-      int node_id = e->dst()->id();
-      Node* last_output_node = p.second;
-      if (output_nodes.find(node_id) != output_nodes.end()) {
-        // We reached an output node.
-        if (last_output_node != nullptr) {
-          // If we had already found an output node on this path so we mark
-          // it as an internal output.
-          internal_outputs.insert(last_output_node->id());
+
+      auto const& pair = visited.insert(e->dst());
+      if (pair.second) {
+        int node_id = e->dst()->id();
+        Node* last_output_node = p.second;
+        if (output_nodes.find(node_id) != output_nodes.end()) {
+          // We reached an output node.
+          if (last_output_node != nullptr) {
+            // If we had already found an output node on this path so we mark
+            // it as an internal output.
+            internal_outputs.insert(last_output_node->id());
+          }
+          // Mark this newly found output node to insert in the queue.
+          last_output_node = e->dst();
         }
-        // Mark this newly found output node to insert in the queue.
-        last_output_node = e->dst();
+        queue.push_back(std::make_pair(e->dst(), last_output_node));
       }
-      queue.push_back(std::make_pair(e->dst(), last_output_node));
-      visited.insert(e->dst());
     }
   }
   // Finally, we set stop_backprop_nodes to all output_nodes that aren't also
@@ -286,9 +286,9 @@ Status SymbolicGradientBuilder::Initialize() {
     std::unordered_set<Node*> visited;
     std::deque<Node*> queue;
     for (const Output& nout : inputs_) {
-      if (visited.find(nout.node()) == visited.end()) {
+      auto const& pair = visited.insert(nout.node());
+      if (pair.second) {
         queue.push_back(nout.node());
-        visited.insert(nout.node());
       }
     }
 
@@ -309,9 +309,9 @@ Status SymbolicGradientBuilder::Initialize() {
           // we don't expect it to receive a backpropagated gradient.
           // It will not be counted in num_expected_backprops.
           if (e->IsControlEdge() || !reachable_nodes[e->dst()->id()]) continue;
-          if (visited.find(e->dst()) == visited.end()) {
+          auto const& pair = visited.insert(e->dst());
+          if (pair.second) {
             queue.push_back(e->dst());
-            visited.insert(e->dst());
           }
           ++num_expected_backprops;
         }
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 81785b2d89b3d36b46992b7ae376b5175a806027..134d64af140f394825bb75477e31639be1aa8d50 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
@@ -153,6 +152,8 @@ Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device)
       exit_on_error_(other.impl()->exit_on_error_),
       kernel_label_(other.impl()->kernel_label_),
       device_(device),
+      assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -313,11 +314,10 @@ Status Scope::ToGraphDef(GraphDef* gdef) const {
   return Status::OK();
 }
 
-Status Scope::ToGraph(Graph* g) const {
+Status Scope::ToGraph(Graph* g, GraphConstructorOptions opts) const {
   if (ok()) {
     GraphDef graph_def;
     graph()->ToGraphDef(&graph_def);
-    GraphConstructorOptions opts;
     UpdateStatus(ConvertGraphDefToGraph(opts, graph_def, g));
   }
   return *impl()->status_;
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index 0a75f23725c143e6b22ee6dffae1428ed8209fe8..1e17b74bc8f05869e50aa4ec645e57f7bcebc9f6 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
@@ -205,14 +206,15 @@ class Scope {
 
   // START_SKIP_DOXYGEN
 
-  /// If status() is Status::OK(), construct a Graph object using the default
+  /// If status() is Status::OK(), construct a Graph object using `opts` as the
   /// GraphConstructorOptions, and return Status::OK if graph construction was
   /// successful. Otherwise, return the error status.
   // TODO(josh11b, keveman): Make this faster; right now it converts
   // Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
   // edges from the source and to the sink node, resolves back edges
   // by name), and makes sure the resulting graph is valid.
-  Status ToGraph(Graph* g) const;
+  Status ToGraph(
+      Graph* g, GraphConstructorOptions opts = GraphConstructorOptions{}) const;
 
   // Calls AddNode() using this scope's ShapeRefiner. This exists in the public
   // API to prevent custom op wrappers from needing access to shape_refiner.h or
diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc
index 05c287bdc62cdb8be7208ce3975f280aaa816766..056eea7eb5a5c796da01cd7353662da35aecddf9 100644
--- a/tensorflow/cc/gradients/image_grad.cc
+++ b/tensorflow/cc/gradients/image_grad.cc
@@ -29,13 +29,17 @@ Status ResizeNearestNeighborGradHelper(const Scope& scope, const Operation& op,
   bool align_corners;
   TF_RETURN_IF_ERROR(
       GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners));
+  bool half_pixel_centers;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "half_pixel_centers",
+                                 &half_pixel_centers));
   // The internal gradient implementation needs the shape of the input image.
   // x_shape = shape(x)[1:3]
   //         = slice(shape(x), {1}, {3 - 1})
   auto x_shape = Slice(scope, Shape(scope, op.input(0)), {1}, {2});
   grad_outputs->push_back(internal::ResizeNearestNeighborGrad(
       scope, grad_inputs[0], x_shape,
-      internal::ResizeNearestNeighborGrad::AlignCorners(align_corners)));
+      internal::ResizeNearestNeighborGrad::AlignCorners(align_corners)
+          .HalfPixelCenters(half_pixel_centers)));
   grad_outputs->push_back(NoGradient());
   return scope.status();
 }
@@ -47,9 +51,13 @@ Status ResizeBilinearGradHelper(const Scope& scope, const Operation& op,
   bool align_corners;
   TF_RETURN_IF_ERROR(
       GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners));
+  bool half_pixel_centers;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "half_pixel_centers",
+                                 &half_pixel_centers));
   grad_outputs->push_back(internal::ResizeBilinearGrad(
       scope, grad_inputs[0], op.input(0),
-      internal::ResizeBilinearGrad::AlignCorners(align_corners)));
+      internal::ResizeBilinearGrad::AlignCorners(align_corners)
+          .HalfPixelCenters(half_pixel_centers)));
   grad_outputs->push_back(NoGradient());
   return scope.status();
 }
@@ -61,9 +69,14 @@ Status ResizeBicubicGradHelper(const Scope& scope, const Operation& op,
   bool align_corners;
   TF_RETURN_IF_ERROR(
       GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners));
+  bool half_pixel_centers;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "half_pixel_centers",
+                                 &half_pixel_centers));
+
   grad_outputs->push_back(internal::ResizeBicubicGrad(
       scope, grad_inputs[0], op.input(0),
-      internal::ResizeBicubicGrad::AlignCorners(align_corners)));
+      internal::ResizeBicubicGrad::AlignCorners(align_corners)
+          .HalfPixelCenters(half_pixel_centers)));
   grad_outputs->push_back(NoGradient());
   return scope.status();
 }
@@ -75,17 +88,40 @@ Status ScaleAndTranslateGradHelper(const Scope& scope, const Operation& op,
   string kernel_type;
   TF_RETURN_IF_ERROR(
       GetNodeAttr(op.node()->attrs(), "kernel_type", &kernel_type));
+  bool antialias;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "antialias", &antialias));
   grad_outputs->push_back(internal::ScaleAndTranslateGrad(
       scope, grad_inputs[0], op.input(0), op.input(2), op.input(3),
-      internal::ScaleAndTranslateGrad::KernelType(kernel_type)));
+      internal::ScaleAndTranslateGrad::KernelType(kernel_type)
+          .Antialias(antialias)));
 
   grad_outputs->push_back(NoGradient());
   grad_outputs->push_back(NoGradient());
   grad_outputs->push_back(NoGradient());
   return scope.status();
 }
+
 REGISTER_GRADIENT_OP("ScaleAndTranslate", ScaleAndTranslateGradHelper);
 
+Status CropAndResizeGradHelper(const Scope& scope, const Operation& op,
+                               const std::vector<Output>& grad_inputs,
+                               std::vector<Output>* grad_outputs) {
+  DataType input_type;
+  string method;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "method", &method));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "T", &input_type));
+  auto image_shape = Shape(scope, op.input(0));
+  grad_outputs->push_back(CropAndResizeGradImage(
+      scope, grad_inputs[0], op.input(1), op.input(2), image_shape, input_type,
+      CropAndResizeGradImage::Method(method)));
+  grad_outputs->push_back(CropAndResizeGradBoxes(
+      scope, grad_inputs[0], op.input(0), op.input(1), op.input(2)));
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+
+REGISTER_GRADIENT_OP("CropAndResize", CropAndResizeGradHelper);
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc
index 1d150226538093467e092e02f38090a327f9c9b6..d50f4f5750a680ed2ac20451b5522ede28cc474d 100644
--- a/tensorflow/cc/gradients/image_grad_test.cc
+++ b/tensorflow/cc/gradients/image_grad_test.cc
@@ -27,6 +27,7 @@ namespace tensorflow {
 namespace {
 
 using ops::Const;
+using ops::CropAndResize;
 using ops::ResizeBicubic;
 using ops::ResizeBilinear;
 using ops::ResizeNearestNeighbor;
@@ -51,7 +52,8 @@ class ImageGradTest : public ::testing::Test {
 
   template <typename T>
   void MakeOp(const OpType op_type, const Tensor& x_data, const Input& y_shape,
-              const bool align_corners, Output* x, Output* y) {
+              const bool align_corners, const bool half_pixel_centers,
+              Output* x, Output* y) {
     *x = Const<T>(scope_, x_data);
     switch (op_type) {
       case RESIZE_NEAREST:
@@ -61,22 +63,26 @@ class ImageGradTest : public ::testing::Test {
         return;
       case RESIZE_BILINEAR:
         *y = ResizeBilinear(scope_, *x, y_shape,
-                            ResizeBilinear::AlignCorners(align_corners));
+                            ResizeBilinear::AlignCorners(align_corners)
+                                .HalfPixelCenters(half_pixel_centers));
         return;
       case RESIZE_BICUBIC:
         *y = ResizeBicubic(scope_, *x, y_shape,
-                           ResizeBicubic::AlignCorners(align_corners));
+                           ResizeBicubic::AlignCorners(align_corners)
+                               .HalfPixelCenters(half_pixel_centers));
         return;
     }
     assert(false);
   }
 
   template <typename T>
-  void TestResizedShapeForType(const OpType op_type, const bool align_corners) {
+  void TestResizedShapeForType(const OpType op_type, const bool align_corners,
+                               const bool half_pixel_centers) {
     TensorShape x_shape({1, 2, 2, 1});
     Tensor x_data = MakeData<T>(x_shape);
     Output x, y;
-    MakeOp<T>(op_type, x_data, {4, 6}, align_corners, &x, &y);
+    MakeOp<T>(op_type, x_data, {4, 6}, align_corners, half_pixel_centers, &x,
+              &y);
 
     ClientSession session(scope_);
     std::vector<Tensor> outputs;
@@ -86,44 +92,64 @@ class ImageGradTest : public ::testing::Test {
   }
 
   void TestResizedShape(OpType op_type) {
-    for (const bool align_corners : {true, false}) {
-      TestResizedShapeForType<Eigen::half>(op_type, align_corners);
-      TestResizedShapeForType<float>(op_type, align_corners);
-      TestResizedShapeForType<double>(op_type, align_corners);
+    for (const bool half_pixel_centers : {true, false}) {
+      for (const bool align_corners : {true, false}) {
+        if (half_pixel_centers && align_corners) {
+          continue;
+        }
+        TestResizedShapeForType<Eigen::half>(op_type, align_corners,
+                                             half_pixel_centers);
+        TestResizedShapeForType<float>(op_type, align_corners,
+                                       half_pixel_centers);
+        TestResizedShapeForType<double>(op_type, align_corners,
+                                        half_pixel_centers);
+      }
     }
   }
 
   template <typename X_T, typename Y_T, typename JAC_T>
   void TestResizeToSmallerAndAlign(const OpType op_type,
-                                   const bool align_corners) {
+                                   const bool align_corners,
+                                   const bool half_pixel_centers) {
     TensorShape x_shape({1, 4, 6, 1});
     Tensor x_data = MakeData<X_T>(x_shape);
     Output x, y;
-    MakeOp<X_T>(op_type, x_data, {2, 3}, align_corners, &x, &y);
+    MakeOp<X_T>(op_type, x_data, {2, 3}, align_corners, half_pixel_centers, &x,
+                &y);
     JAC_T max_error;
     TF_ASSERT_OK((ComputeGradientError<X_T, Y_T, JAC_T>(
         scope_, x, x_data, y, {1, 2, 3, 1}, &max_error)));
-    EXPECT_LT(max_error, 1e-3);
+    EXPECT_LT(max_error, 1.5e-3);
   }
 
   template <typename X_T, typename Y_T, typename JAC_T>
   void TestResizeToLargerAndAlign(const OpType op_type,
-                                  const bool align_corners) {
+                                  const bool align_corners,
+                                  const bool half_pixel_centers) {
     TensorShape x_shape({1, 2, 3, 1});
     Tensor x_data = MakeData<X_T>(x_shape);
     Output x, y;
-    MakeOp<X_T>(op_type, x_data, {4, 6}, align_corners, &x, &y);
+    MakeOp<X_T>(op_type, x_data, {4, 6}, align_corners, half_pixel_centers, &x,
+                &y);
     JAC_T max_error;
     TF_ASSERT_OK((ComputeGradientError<X_T, Y_T, JAC_T>(
         scope_, x, x_data, y, {1, 4, 6, 1}, &max_error)));
-    EXPECT_LT(max_error, 1e-3);
+    EXPECT_LT(max_error, 1.5e-3);
   }
 
   template <typename X_T, typename Y_T, typename JAC_T>
   void TestResize(OpType op_type) {
-    for (const bool align_corners : {true, false}) {
-      TestResizeToSmallerAndAlign<X_T, Y_T, JAC_T>(op_type, align_corners);
-      TestResizeToLargerAndAlign<X_T, Y_T, JAC_T>(op_type, align_corners);
+    for (const bool half_pixel_centers : {true, false}) {
+      for (const bool align_corners : {true, false}) {
+        // if (!half_pixel_centers) continue;
+        if (half_pixel_centers && align_corners) {
+          continue;
+        }
+        TestResizeToSmallerAndAlign<X_T, Y_T, JAC_T>(op_type, align_corners,
+                                                     half_pixel_centers);
+        TestResizeToLargerAndAlign<X_T, Y_T, JAC_T>(op_type, align_corners,
+                                                    half_pixel_centers);
+      }
     }
   }
 
@@ -170,29 +196,151 @@ class ScaleAndTranslateGradTest : public ::testing::Test {
   }
 
   template <typename T>
-  void MakeOp(const Tensor& x_data, const Input& y_shape, Output* x,
-              Output* y) {
+  void MakeOp(const Tensor& x_data, const Input& y_shape, Input scale,
+              Input translation, const string& kernel_type, bool antialias,
+              Output* x, Output* y) {
     *x = Const<T>(scope_, x_data);
-    *y = ScaleAndTranslate(scope_, *x, y_shape, {1.8f, 2.1f}, {0.5f, 0.7f});
+    *y = ScaleAndTranslate(scope_, *x, y_shape, scale, translation,
+                           ScaleAndTranslate::KernelType(kernel_type)
+                               .Antialias(antialias)
+                               .Antialias(antialias));
     TF_ASSERT_OK(scope_.status());
   }
 
   template <typename X_T, typename Y_T, typename JAC_T>
-  void TestResize() {
-    TensorShape x_shape({1, 2, 3, 1});
+  void TestScaleAndTranslate(const TensorShape x_shape, const int out_height,
+                             const int out_width, Input scale,
+                             Input translation, const string& kernel_type,
+                             bool antialias) {
     Tensor x_data = MakeData<X_T>(x_shape);
     Output x, y;
-    MakeOp<X_T>(x_data, {4, 6}, &x, &y);
+    MakeOp<X_T>(x_data, {out_height, out_width}, scale, translation,
+                kernel_type, antialias, &x, &y);
     JAC_T max_error;
     TF_ASSERT_OK((ComputeGradientError<X_T, Y_T, JAC_T>(
-        scope_, x, x_data, y, {1, 4, 6, 1}, &max_error)));
+        scope_, x, x_data, y, {1, out_height, out_width, 1}, &max_error)));
+    EXPECT_LT(max_error, 2e-3);
+  }
+
+  const std::vector<Input> kScales = {Input{1.0f, 1.0f}, Input{0.37f, 0.47f},
+                                      Input{2.1f, 2.1f}};
+  const std::vector<Input> kTranslations = {
+      Input{0.0f, 0.0f}, Input{3.14f, 1.19f}, Input{2.1f, 3.1f},
+      Input{100.0f, 200.0f}};
+  Scope scope_;
+};
+
+TEST_F(ScaleAndTranslateGradTest, TestGrads) {
+  const std::vector<std::string> kKernelTypes = {"lanczos1", "lanczos3",
+                                                 "lanczos5", "gaussian"};
+  constexpr int kOutHeight = 4;
+  constexpr int kOutWidth = 6;
+
+  const TensorShape kXShape = TensorShape({1, 2, 3, 1});
+  for (const Input scale : kScales) {
+    for (const Input translation : kTranslations) {
+      for (const std::string& kernel_type : kKernelTypes) {
+        TestScaleAndTranslate<float, float, float>(
+            kXShape, kOutHeight, kOutWidth, scale, translation, kernel_type,
+            true);
+      }
+    }
+  }
+}
+
+TEST_F(ScaleAndTranslateGradTest, TestGradsWithoutAntialias) {
+  constexpr int kOutHeight = 4;
+  constexpr int kOutWidth = 6;
+
+  const TensorShape kXShape = TensorShape({1, 2, 3, 1});
+  for (const Input scale : kScales) {
+    for (const Input translation : kTranslations) {
+      TestScaleAndTranslate<float, float, float>(kXShape, kOutHeight, kOutWidth,
+                                                 scale, translation, "lanczos3",
+                                                 false);
+    }
+  }
+}
+
+TEST_F(ScaleAndTranslateGradTest, TestGradsWithSameShape) {
+  const std::vector<std::string> kKernelTypes = {"lanczos3", "gaussian"};
+
+  constexpr int kOutHeight = 2;
+  constexpr int kOutWidth = 3;
+
+  const TensorShape kXShape = TensorShape({1, 2, 3, 1});
+  for (const Input scale : kScales) {
+    for (const Input translation : kTranslations) {
+      for (const std::string& kernel_type : kKernelTypes) {
+        TestScaleAndTranslate<float, float, float>(
+            kXShape, kOutHeight, kOutWidth, scale, translation, kernel_type,
+            true);
+      }
+    }
+  }
+}
+
+TEST_F(ScaleAndTranslateGradTest, TestGradsWithSmallerShape) {
+  const std::vector<std::string> kKernelTypes = {"lanczos3", "gaussian"};
+  constexpr int kOutHeight = 2;
+  constexpr int kOutWidth = 3;
+
+  const TensorShape kXShape = TensorShape({1, 4, 6, 1});
+  for (const Input scale : kScales) {
+    for (const Input translation : kTranslations) {
+      for (const std::string& kernel_type : kKernelTypes) {
+        TestScaleAndTranslate<float, float, float>(
+            kXShape, kOutHeight, kOutWidth, scale, translation, kernel_type,
+            true);
+      }
+    }
+  }
+}
+
+class CropAndResizeGradTest : public ::testing::Test {
+ protected:
+  CropAndResizeGradTest() : scope_(Scope::NewRootScope()) {}
+
+  template <typename T>
+  Tensor MakeData(const TensorShape& data_shape) {
+    DataType data_type = DataTypeToEnum<T>::v();
+    Tensor data(data_type, data_shape);
+    auto data_flat = data.flat<T>();
+    for (int i = 0; i < data_flat.size(); ++i) {
+      data_flat(i) = T(i);
+    }
+    return data;
+  }
+
+  template <typename T>
+  void MakeOp(const Tensor& x_data, const Input& boxes, const Input& box_ind,
+              const Input& crop_size, Output* x, Output* y) {
+    *x = Const<T>(scope_, x_data);
+    *y = CropAndResize(scope_, *x, boxes, box_ind, crop_size,
+                       CropAndResize::Method("bilinear"));
+    TF_ASSERT_OK(scope_.status());
+  }
+
+  template <typename X_T, typename Y_T, typename JAC_T>
+  void TestCropAndResize() {
+    TensorShape x_shape({1, 4, 2, 1});
+    Tensor x_data = MakeData<X_T>(x_shape);
+    TensorShape box_shape({1, 4});
+    Tensor boxes = MakeData<X_T>(box_shape);
+    Output x, y;
+    MakeOp<X_T>(x_data, boxes, {0}, {1, 1}, &x, &y);
+    JAC_T max_error;
+    TF_ASSERT_OK((ComputeGradientError<X_T, Y_T, JAC_T>(
+        scope_, x, x_data, y, {1, 1, 1, 1}, &max_error)));
     EXPECT_LT(max_error, 1e-3);
   }
 
   Scope scope_;
 };
 
-TEST_F(ScaleAndTranslateGradTest, Works) { TestResize<float, float, float>(); }
+TEST_F(CropAndResizeGradTest, TestCrop) {
+  TestCropAndResize<float, float, float>();
+}
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index cf65fe1ab99b49207a64e86310178141b30d07d7..e9838d9aba6554b40082187057851e9c896f8352 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -10,7 +10,7 @@ tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
     tags = [
-        "noguitar",  # b/77649654
+        "nogpu",  # b/77649654
     ],
     deps = [
         ":profiler",
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 52345a376cc29ee47ccb9888c9bb26292468b5a9..dedd55f16afb879ea966dc89d14d88ee15d9e83e 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -81,6 +81,7 @@ cc_library(
     ] + if_not_mobile([
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
     ]) + if_android([
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 10f7abf09e925c0c31cfd595ecee4605f189476f..66260fcf4a9b24f78d45010c6e86d4ee398b6d3d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
-#include "tensorflow/core/protobuf/saved_model.pb.h"
 #include "tensorflow/core/protobuf/saver.pb.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc
index 23e9dc40d23899b9cef168c9128b6d8ed1be3ee9..eeb910178902ca883ed211379ba3f188c139f92e 100644
--- a/tensorflow/cc/tools/freeze_saved_model.cc
+++ b/tensorflow/cc/tools/freeze_saved_model.cc
@@ -124,7 +124,9 @@ Status GetVariableNameToTensorMap(
     return Status::OK();
   }
   std::vector<string> variable_names;
+  variable_names.reserve(variable_names_set.size());
   std::vector<string> tensor_names;
+  tensor_names.reserve(variable_names_set.size());
   for (const string& node_name : variable_names_set) {
     variable_names.push_back(node_name);
     NodeDef* node_def = name_to_node_map.at(node_name);
diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..49cb74f19ef325c5861b124e458dd7e3b7f436e9
--- /dev/null
+++ b/tensorflow/compat_template.__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in all of the public TensorFlow interface into this module."""
+
+from __future__ import absolute_import as _absolute_import
+from __future__ import division as _division
+from __future__ import print_function as _print_function
+
+import os as _os
+import sys as _sys
+
+# pylint: disable=g-bad-import-order
+
+# API IMPORTS PLACEHOLDER
+
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg=(
+        "Limited tf.compat.v2.summary API due to missing TensorBoard "
+        "installation"))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v2.estimator'))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow.python.keras.api._v2.keras'))
+
+# We would like the following to work for fully enabling 2.0 in a 1.0 install:
+#
+# import tensorflow.compat.v2 as tf
+# tf.enable_v2_behavior()
+#
+# This make this one symbol available directly.
+from tensorflow.python.compat.v2_compat import enable_v2_behavior  # pylint: disable=g-import-not-at-top
+
+# Add module aliases
+_current_module = _sys.modules[__name__]
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
+  initializers = keras.initializers
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index b966c22b2319aef3b87ef54a283911718d37cf84..9549a71c41a0ba2aac58abd8cfb182aa4eaf3b4f 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -28,7 +28,8 @@ from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
-    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v1.estimator'))
 _component_api_helper.package_hook(
     parent_package_str=__name__,
     child_package_str=('tensorflow.python.keras.api._v1.keras'))
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 16151e77737429f4fbf690fc34b12a70bacebdc4..af016bf80e7a10d8729a1eb385466af48b5810cd 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -30,6 +30,7 @@ cc_library(
         "flags.h",
     ],
     deps = [
+        ":aot_only_var_handle_op",
         ":embedded_protocol_buffers",
         "//tensorflow/compiler/tf2xla",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
@@ -71,6 +72,7 @@ tf_cc_test(
         ":tfcompile_lib",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "@com_google_absl//absl/strings",
@@ -205,6 +207,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aot_only_var_handle_op",
+    srcs = ["aot_only_var_handle_op.cc"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_test(
     name = "benchmark_test",
     srcs = ["benchmark_test.cc"],
diff --git a/tensorflow/compiler/aot/aot_only_var_handle_op.cc b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce36a979f424610a5aa952afa8db2245ed971a9
--- /dev/null
+++ b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Implementation of varhandle that binds a VarHandleOp to an XlaResource of the
+// same name. It is not safe to use this op in a JIT context.
+class XlaAotOnlyVarHandleOp : public XlaOpKernel {
+ public:
+  explicit XlaAotOnlyVarHandleOp(OpKernelConstruction* c);
+  void Compile(XlaOpKernelContext* context) override;
+
+ private:
+  string name_;
+};
+
+XlaAotOnlyVarHandleOp::XlaAotOnlyVarHandleOp(OpKernelConstruction* c)
+    : XlaOpKernel(c) {
+  OP_REQUIRES_OK(c, c->GetAttr("shared_name", &name_));
+}
+
+void XlaAotOnlyVarHandleOp::Compile(XlaOpKernelContext* context) {
+  // Look for a resource of the same name. TF also keys that on the container
+  // and type attributes, but that doesn't seem necessary.
+  for (const auto& resource : context->xla_context()->resources()) {
+    if (resource->kind() == XlaResource::kVariable &&
+        resource->name() == name_) {
+      context->SetResourceOutput(0, resource.get());
+      return;
+    }
+  }
+  context->SetStatus(
+      errors::InvalidArgument("Variable: ", name_, " not configured"));
+}
+}  // namespace
+
+REGISTER_XLA_OP(Name("VarHandleOp").CompilationOnly(), XlaAotOnlyVarHandleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index d016632da2a9d7c2c2f81c02dd573787a0502923..2355fad8802a490fafb702f53d88312611f9ebf4 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -168,12 +168,12 @@ Status GenArgMethods(const tf2xla::Config& config,
                      const xla::ProgramShapeProto& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (config.feed_size() != num_args) {
-    return errors::InvalidArgument("mismatch between feed_size(",
-                                   config.feed_size(), ") and num_args(",
-                                   num_args, ")");
+  if (config.feed_size() + config.variable_size() != num_args) {
+    return errors::InvalidArgument(
+        "mismatch between feed_size(", config.feed_size(), ")+variable_size(",
+        config.variable_size(), ") and num_args(", num_args, ")");
   }
-  for (int i = 0; i < num_args; ++i) {
+  for (int i = 0; i < config.feed_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(
         AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
@@ -212,12 +212,14 @@ Status GenResultMethods(const tf2xla::Config& config,
     // tuple result, and we rely on this to simplify code generation.
     return errors::Internal("codegen requires the XLA result to be a tuple");
   }
-  if (config.fetch_size() != ps.result().tuple_shapes_size()) {
+  size_t num_results = ps.result().tuple_shapes_size();
+  if (config.fetch_size() + config.variable_size() != num_results) {
     return errors::InvalidArgument("mismatch between fetch_size(",
-                                   config.feed_size(), ") and tuple_size(",
+                                   config.fetch_size(), ")+variable_size(",
+                                   config.variable_size(), ") and tuple_size(",
                                    ps.result().tuple_shapes_size(), ")");
   }
-  for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) {
+  for (int i = 0; i < config.fetch_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(AddRewritesForShape(
         i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
@@ -245,6 +247,51 @@ Status GenResultMethods(const tf2xla::Config& config,
   return Status::OK();
 }
 
+// Generate methods for variables.
+Status GenVariableMethods(const tf2xla::Config& config,
+                          const xla::ProgramShapeProto& ps, string* methods) {
+  size_t num_args = ps.parameters_size();
+  for (int i = config.feed_size(); i < num_args; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(
+        AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
+    const string code = R"(
+  void set_var_{{NAME}}_data({{TYPE}}* data) {
+    set_arg_data({{I}}, data);
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.feed_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  size_t num_results = ps.result().tuple_shapes_size();
+  for (int i = config.fetch_size(); i < num_results; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(AddRewritesForShape(
+        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
+    string code = R"(
+  {{TYPE}}* var_{{NAME}}_data() {
+    return static_cast<{{TYPE}}*>(result_data({{I}}));
+  }
+  {{TYPE}}& var_{{NAME}}({{DIM_VARS}}) {
+    return (*static_cast<{{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+  const {{TYPE}}* var_{{NAME}}_data() const {
+    return static_cast<const {{TYPE}}*>(result_data({{I}}));
+  }
+  const {{TYPE}}& var_{{NAME}}({{DIM_VARS}}) const {
+    return (*static_cast<const {{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.fetch_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  return Status::OK();
+}
+
 // Generates code implementing {Arg,Result}Names(), where T is one of
 // tf2xla::{Feed,Fetch}. Each feed or fetch name results in a C-style string
 // literal in the array, with nullptr terminating the array.
@@ -291,6 +338,14 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) {
       TF_RETURN_IF_ERROR(ValidateCppIdent(fetch.name(), "fetch name"));
     }
   }
+  for (const tf2xla::Variable& variable : config.variable()) {
+    if (!variable.name().empty()) {
+      TF_RETURN_IF_ERROR(ValidateCppIdent(variable.name(), "variable name"));
+    } else {
+      TF_RETURN_IF_ERROR(
+          ValidateCppIdent(variable.node_name(), "variable name"));
+    }
+  }
   return Status::OK();
 }
 
@@ -339,9 +394,10 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
   std::vector<BufferInfo> buffer_infos_for_temps =
       ExtractTempBufferInfos(buffer_infos);
   const xla::ProgramShapeProto& ps = compile_result.program_shape;
-  string methods_arg, methods_result;
+  string methods_arg, methods_result, methods_variable;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
+  TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable));
   const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes(
       buffer_infos_for_args.data(), buffer_infos_for_args.size(),
       /*allocate_entry_params=*/true);
@@ -523,6 +579,21 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
   // buffers are managed internally, and may change after each call to Run.
 {{METHODS_RESULT}}
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+{{METHODS_VARIABLE}}
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = {{NUM_BUFFERS}};
@@ -589,6 +660,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
        include_hlo_profile_printer_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
+      {"{{METHODS_VARIABLE}}\n", methods_variable},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
       {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))},
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index c1788ca32a1d099284eeb870f9513891051fd29e..5580e55b691bd10698b63d86bc0194b25da743b9 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "llvm/Support/TargetSelect.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -172,6 +174,15 @@ TEST(CodegenTest, Golden) {
   tf2xla::Fetch* fetch = config.add_fetch();
   fetch->mutable_id()->set_node_name("fetch0");
   fetch->set_name("myfetch");
+  tf2xla::Variable* variable = config.add_variable();
+  variable->set_node_name("myvar");
+  variable->mutable_shape()->add_dim()->set_size(1);
+  variable->set_type(DT_FLOAT);
+  tf2xla::Variable* variable2 = config.add_variable();
+  variable2->set_node_name("my/var");
+  variable2->set_name("myvar2");
+  variable2->mutable_shape()->add_dim()->set_size(5);
+  variable2->set_type(DT_INT32);
   CompileResult compile_result;
   compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult(
       {},
@@ -186,9 +197,14 @@ TEST(CodegenTest, Golden) {
           {
               xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
               xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
           },
-          xla::ShapeUtil::MakeTupleShape(
-              {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}))
+          xla::ShapeUtil::MakeTupleShape({
+              xla::ShapeUtil::MakeShape(xla::U32, {5, 6}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
+          }))
           .ToProto();
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 35994fc785d3e1d5e883c49bec96de315e189d2e..8591df538779e3bc0f6e55607180a6d49009735e 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -52,7 +52,7 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): f32[1], (unknown): s32[5]) -> (u32[5,6], f32[1], s32[5])
 //
 // Memory stats:
 //   arg bytes total:    104
@@ -214,6 +214,58 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction {
         result_data(0)))[dim0][dim1];
   }
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_var_myvar_data(float* data) {
+    set_arg_data(2, data);
+  }
+
+  void set_var_myvar2_data(tensorflow::int32* data) {
+    set_arg_data(3, data);
+  }
+
+  float* var_myvar_data() {
+    return static_cast<float*>(result_data(1));
+  }
+  float& var_myvar() {
+    return (*static_cast<float(*)[1]>(
+        result_data(1)))[0];
+  }
+  const float* var_myvar_data() const {
+    return static_cast<const float*>(result_data(1));
+  }
+  const float& var_myvar() const {
+    return (*static_cast<const float(*)[1]>(
+        result_data(1)))[0];
+  }
+
+  tensorflow::int32* var_myvar2_data() {
+    return static_cast<tensorflow::int32*>(result_data(2));
+  }
+  tensorflow::int32& var_myvar2(size_t dim0) {
+    return (*static_cast<tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+  const tensorflow::int32* var_myvar2_data() const {
+    return static_cast<const tensorflow::int32*>(result_data(2));
+  }
+  const tensorflow::int32& var_myvar2(size_t dim0) const {
+    return (*static_cast<const tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = 6;
@@ -257,7 +309,7 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction {
   static const xla::ProgramShapeProto* StaticProgramShape() {
     static const xla::ProgramShapeProto* kShape = []() {
     xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 64);
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 132);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index 7f7b96428572705f30144e6c95cd4cf9c44ce2a3..2884597abcf29583e6192296b0e4ce6825d7c01a 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 9fc223bdc7c0e207ce2005cb86250aa77e709df8..0e46a9f5e9d68fa2174f7bd9b9fa7c3a82dfb715 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -108,10 +108,13 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
                         computation.Snapshot());
     // Serialize the HloSnapshot deterministically so that all the outputs of a
     // tf_library genrule are deterministic.
-    string proto;
-    TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto));
+    const size_t size = module->ByteSizeLong();
+    auto serialized = absl::make_unique<char[]>(size);
+    TF_RET_CHECK(
+        SerializeToBufferDeterministic(*module, serialized.get(), size));
     TF_RETURN_IF_ERROR(
-        WriteStringToFile(Env::Default(), flags.out_session_module, proto));
+        WriteStringToFile(Env::Default(), flags.out_session_module,
+                          absl::string_view(serialized.get(), size)));
   }
   xla::cpu::CpuAotCompilationOptions aot_opts(
       flags.target_triple, flags.target_cpu, flags.target_features,
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 10fa33ab5e84dcbc1629bee6214e8969046f19c2..ce8dae4262913c975ca69dedd0420f1457e11ee9 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -26,6 +26,8 @@ test_suite(
         ":test_graph_tfmatmulandadd_test",
         ":test_graph_tfsplits_test",
         ":test_graph_tftop_k_test",
+        ":test_graph_tfvariable_sequential_updates_test",
+        ":test_graph_tfvariable_test",
         ":tfcompile_test",
     ],
 )
@@ -69,6 +71,8 @@ genrule(
         "test_graph_tfmatmulandadd.pb",
         "test_graph_tfsplits.pb",
         "test_graph_tftop_k.pb",
+        "test_graph_tfvariable.pb",
+        "test_graph_tfvariable_sequential_updates.pb",
     ],
     # Set CUDA_VISIBLE_DEVICES='' to prevent the code we launch from using any
     # GPUs which might be present.  This is important because builds may run
@@ -222,6 +226,28 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfvariable",
+    testonly = 1,
+    config = "test_graph_tfvariable.config.pbtxt",
+    cpp_class = "VariableComp",
+    graph = "test_graph_tfvariable.pb",
+    tags = [
+        "manual",
+    ],
+)
+
+tf_library(
+    name = "test_graph_tfvariable_sequential_updates",
+    testonly = 1,
+    config = "test_graph_tfvariable_sequential_updates.config.pbtxt",
+    cpp_class = "VariableSequentialUpdatesComp",
+    graph = "test_graph_tfvariable_sequential_updates.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
@@ -241,6 +267,8 @@ tf_cc_test(
         ":test_graph_tfmatmulandadd_with_profiling",
         ":test_graph_tfsplits",
         ":test_graph_tftop_k",
+        ":test_graph_tfvariable",
+        ":test_graph_tfvariable_sequential_updates",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 64b861a73091642b03573543a5c55618bf33915d..7f5e907e26365c0d9ec65e6f00d410a87f452241 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -50,7 +50,7 @@ def tfadd_with_ckpt(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
@@ -65,7 +65,7 @@ def tfadd_with_ckpt_saver(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(name='abcprefix', write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
@@ -149,6 +149,25 @@ def tftop_k(_):
   array_ops.identity(output[1], name='indices')
 
 
+def tfvariable(_):
+  x = variables.Variable(1000.0, name='x')
+  old_x = x.value()
+  with ops.control_dependencies([old_x]):
+    new_x = x.assign_add(42.0)
+  array_ops.stack([old_x, new_x], name='result')
+
+
+def tfvariable_sequential_updates(_):
+  x = variables.Variable(1.0, name='x')
+  updates = control_flow_ops.no_op()
+  for _ in range(3):
+    with ops.control_dependencies([updates]):
+      x_val = x.read_value() + 1.0
+      updates = x.assign_sub(0.1 * x_val)
+
+  array_ops.identity(updates, name='result')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -171,6 +190,8 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
   write_graph(tftop_k, FLAGS.out_dir)
+  write_graph(tfvariable, FLAGS.out_dir)
+  write_graph(tfvariable_sequential_updates, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b4c4215a330b014f595edde001aba73ad7d8263
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
@@ -0,0 +1,12 @@
+# Text form of tensorflow.tf2xla.Config proto.
+fetch {
+  id { node_name: "result" }
+}
+
+variable {
+  node_name: "x"
+  shape {
+    dim { size: 1 }
+  }
+  type: DT_FLOAT
+}
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfvariable_sequential_updates.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfvariable_sequential_updates.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7312c40baf6957c273fc389efa11d08ed9f7a0dd
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfvariable_sequential_updates.config.pbtxt
@@ -0,0 +1,9 @@
+# Text form of tensorflow.tf2xla.Config proto.
+fetch {
+  id { node_name: "result" }
+}
+
+variable {
+  node_name: "x"
+  type: DT_FLOAT
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 4dd79e5882d7da61be029735ef2b165908c599f9..5bee7f2540a4177a9c4e726bb739d7b92a4dacfc 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -30,6 +30,8 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tftop_k.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfvariable.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfvariable_sequential_updates.h"
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -473,6 +475,49 @@ TEST(TFCompileTest, TopK) {
   EXPECT_EQ(expected_indices[1], fn.result1(1));
 }
 
+TEST(TFCompileTest, Variable) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  VariableComp fn;
+  float x = 23;
+  fn.set_var_x_data(&x);
+
+  fn.set_thread_pool(&device);
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 23);
+  EXPECT_EQ(fn.result0(1, 0), 65);
+  EXPECT_EQ(fn.var_x(), 65);
+
+  EXPECT_EQ(fn.var_x_data(), &x);
+  EXPECT_EQ(x, 65);
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 65);
+  EXPECT_EQ(fn.result0(1, 0), 107);
+  EXPECT_EQ(fn.var_x(), 107);
+}
+
+TEST(TFCompileTest, VariableSequentialUpdates) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  // This implements the recursion:
+  // x[0] = 1.0
+  // x[n+1] = x[n] - 0.1*(x[n-1] + 1.0)
+  VariableSequentialUpdatesComp fn;
+  float x = 1;
+  fn.set_var_x_data(&x);
+
+  fn.set_thread_pool(&device);
+  // First calculate x[3]
+  fn.Run();
+  EXPECT_NEAR(x, 0.458f, 1e-6);
+
+  // Then calculate x[6]
+  fn.Run();
+  EXPECT_NEAR(x, 0.062882f, 1e-6);
+}
+
 TEST(TFCompileTest, AssertEqAndReturnDiff) {
   // Assert is converted into a no-op in XLA, so there is no failure even if the
   // two args are different.
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 4051664c24cacad4a2d151ad3ac9009015900609..fd701ab7166eb6520ad9050abb5285c9d0e0b6bd 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -207,7 +207,7 @@ def tf_library(
         #
         # Note that setting the local=1 attribute on a *test target* causes the
         # test infrastructure to skip that test.  However this is a genrule, not
-        # a test target, and runs with --genrule_strategy=forced_forge, meaning
+        # a test target, and runs with --strategy=Genrule=forced_forge, meaning
         # the local=1 attribute is ignored, and the genrule is still run.
         #
         # https://www.bazel.io/versions/master/docs/be/general.html#genrule
@@ -392,6 +392,6 @@ def target_llvm_triple():
         "//tensorflow:android_x86": "i686-none-android",
         "//tensorflow:ios": "arm64-none-ios",
         "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
-        "//tensorflow:darwin": "x86_64-none-darwin",
+        "//tensorflow:macos": "x86_64-none-darwin",
         "//conditions:default": "x86_64-pc-linux",
     })
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index d548de8c44285f6d21dd778db464a31e1b19645b..0b6ab7e723d6e3a55da2f1c30b75f44cbdaa75bb 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -136,6 +136,10 @@ int main(int argc, char** argv) {
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
+  if (argc > 1 && absl::string_view(argv[1]) == "--help") {
+    std::cerr << usage << "\n";
+    return 0;
+  }
   bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   QCHECK(parsed_flags_ok) << "\n" << usage;
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 1d72d8c2d88c1d256341c46b04e620f2b198e7ea..4424c29e395adf550943c3da99606ad20ba1ad49 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -20,6 +20,8 @@ package(
     ],
 )
 
+# NB! Removing the cc_header_only_library import breaks the OSS build since
+# copybara injects some build rules that use it.
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
@@ -166,7 +168,6 @@ cc_library(
         ":xla_tensor",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
@@ -175,18 +176,29 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:stream_pool",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:fifo_queue",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:host_constant_op",
         "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:no_op",
@@ -198,7 +210,9 @@ cc_library(
         "//tensorflow/core/kernels:variable_ops",
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
+        "//tensorflow/core/kernels/data:optional_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
+        "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
@@ -253,11 +267,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
@@ -270,9 +284,7 @@ cc_library(
     hdrs = ["xla_compilation_cache.h"],
     deps = [
         "//tensorflow/compiler/tf2xla:common",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -282,7 +294,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -406,7 +417,6 @@ cc_library(
     hdrs = ["shape_inference.h"],
     deps = [
         ":shape_inference_helpers",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -455,7 +465,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -481,6 +490,7 @@ cc_library(
     name = "compilation_passes",
     srcs = [
         "build_xla_ops_pass.cc",
+        "clone_constants_for_better_clustering.cc",
         "deadness_analysis.cc",
         "deadness_analysis_internal.h",
         "encapsulate_subgraphs_pass.cc",
@@ -493,6 +503,7 @@ cc_library(
     ],
     hdrs = [
         "build_xla_ops_pass.h",
+        "clone_constants_for_better_clustering.h",
         "deadness_analysis.h",
         "encapsulate_subgraphs_pass.h",
         "encapsulate_xla_computations_pass.h",
@@ -510,11 +521,11 @@ cc_library(
         ":union_find",
         ":xla_cluster_util",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope_internal",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/ops:xla_ops",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
         "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
@@ -526,11 +537,12 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_bounds_check",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:bounds_check",
+        "//tensorflow/stream_executor/lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -546,12 +558,20 @@ cc_library(
     srcs = ["xla_cluster_util.cc"],
     hdrs = ["xla_cluster_util.h"],
     deps = [
+        ":flags",
         ":resource_operation_safety_analysis",
         "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_bounds_check",
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:bounds_check",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
     ],
@@ -592,11 +612,25 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "compilation_passes_test_main",
+    testonly = True,
+    srcs = ["compilation_passes_test_main.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":flags",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_cc_test(
     name = "compilation_passes_test",
     size = "small",
     srcs = [
         "build_xla_ops_pass_test.cc",
+        "clone_constants_for_better_clustering_test.cc",
         "encapsulate_subgraphs_pass_test.cc",
         "encapsulate_xla_computations_pass_test.cc",
         "extract_outside_compilation_pass_test.cc",
@@ -607,7 +641,9 @@ tf_cc_test(
     deps = [
         ":common",
         ":compilation_passes",
+        ":compilation_passes_test_main",
         ":encapsulate_util",
+        ":flags",
         ":node_matchers",
         ":xla_cluster_util",
         ":xla_cpu_device",
@@ -636,7 +672,6 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
@@ -660,6 +695,7 @@ tf_cc_test(
         "//tensorflow/compiler/jit/kernels:xla_ops",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -667,6 +703,8 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -681,6 +719,7 @@ cc_library(
         ":union_find",
         ":xla_cluster_util",
         "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 9f4042630edaec1b9519b6434d859a48372e8b15..6058e4195425fadd8c101c6d41303ed0a6ea69e8 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -16,18 +16,20 @@ limitations under the License.
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
 #include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/control_flow_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/graph_def_util.h"
@@ -39,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace {
@@ -115,6 +118,13 @@ void MergeOutgoingControlEdges(const Scope& s, Node* old_node, Node* new_node) {
     return;
   }
 
+  if (ctrl_edges.size() == 1 && ctrl_edges.front()->dst()->IsSink()) {
+    // Avoid creating a Merge node if we can just add an edge to _SINK
+    // instead.
+    s.graph()->AddControlEdge(new_node, s.graph()->sink_node());
+    return;
+  }
+
   // We can't merge control edges directly so we instead first "convert" them to
   // normal values that can be merged, merge the values and then "convert" the
   // merged value back into control.
@@ -204,11 +214,10 @@ void RemoveAllIncomingControlEdges(Graph* g, Node* n) {
   }
 }
 
-// Returns true (into `result`) if `node` must be compiled.
-Status NodeRequiresCompilation(Node* n, bool* result) {
+// Returns true (into `result`) if a node placed on `device` must be compiled.
+Status DeviceRequiresCompilation(const string& device, bool* result) {
   DeviceType device_type("");
-  TF_RETURN_IF_ERROR(
-      DeviceToDeviceType(n->assigned_device_name(), &device_type));
+  TF_RETURN_IF_ERROR(DeviceToDeviceType(device, &device_type));
   const XlaOpRegistry::DeviceRegistration* registration = nullptr;
   if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
     return errors::Internal("Could not find compilation device ",
@@ -219,11 +228,97 @@ Status NodeRequiresCompilation(Node* n, bool* result) {
   return Status::OK();
 }
 
+// Replaces `n` with a `PartionedCall` op that calls the same function.
+Status ReplaceFunctionCallWithPartionedCall(
+    const GraphOptimizationPassOptions& options,
+    const FunctionLibraryDefinition& flib_def, Node* n, Graph* g,
+    const NameAttrList& func, const Scope& root) {
+  string config_string = options.session_options->config.SerializeAsString();
+
+  int input_count = absl::c_count_if(
+      n->in_edges(), [](const Edge* e) { return !e->IsControlEdge(); });
+
+  std::vector<Output> args(input_count);
+  for (const Edge* e : n->in_edges()) {
+    if (!e->IsControlEdge()) {
+      args[e->dst_input()] = Output(e->src(), e->src_output());
+    }
+  }
+
+  ops::PartitionedCall call(
+      root.WithOpName("partitioned_call"), args, n->output_types(), func,
+      ops::PartitionedCall::Attrs{}.ConfigProto(config_string));
+
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      g->AddControlEdge(e->src(), call.operation.node());
+    }
+  }
+
+  std::vector<const Edge*> edges_to_delete;
+
+  for (const Edge* e : n->out_edges()) {
+    edges_to_delete.push_back(e);
+    if (e->IsControlEdge()) {
+      g->AddControlEdge(call.operation.node(), e->dst());
+    } else {
+      g->AddEdge(call.operation.node(), e->src_output(), e->dst(),
+                 e->dst_input());
+    }
+  }
+
+  for (const Edge* e : edges_to_delete) {
+    g->RemoveEdge(e);
+  }
+
+  g->RemoveNode(n);
+  return Status::OK();
+}
+
+Status InferDeviceForCluster(Node* n, const string& function_name,
+                             const FunctionLibraryDefinition& flib_def,
+                             string* result) {
+  const FunctionDef* func_def = flib_def.Find(function_name);
+  TF_RET_CHECK(func_def) << "Could not find " << function_name;
+
+  std::set<string> device_names;
+  for (const NodeDef& ndef : func_def->node_def()) {
+    VLOG(3) << ndef.DebugString();
+    if (!ndef.device().empty()) {
+      device_names.insert(ndef.device());
+    }
+  }
+
+  if (!n->assigned_device_name().empty()) {
+    // TODO(sanjoy): We need this because EncapsulateSubgraphsPass drops device
+    // assignment when constant folding.  We should fix EncapsulateSubgraphsPass
+    // instead.
+    device_names.insert(n->assigned_device_name());
+  }
+
+  std::vector<string> device_names_vector;
+  absl::c_copy(device_names, std::back_inserter(device_names_vector));
+
+  Status s = PickDeviceForXla(device_names_vector, true, result);
+  if (s.ok()) {
+    VLOG(2) << "For " << function_name << " PickDeviceForXla("
+            << absl::StrJoin(device_names_vector, ", ") << ") -> " << *result;
+  }
+  return s;
+}
+
 Status ReplaceNodeWithXlaCompileAndXlaRun(
+    const GraphOptimizationPassOptions& options,
     const FunctionLibraryDefinition& flib_def, bool lazy_compilation_enabled,
     Graph* g, Node* n) {
+  XlaClusterInfo cluster_info;
+  TF_RETURN_IF_ERROR(GetXlaClusterInfo(n, &cluster_info));
+
+  string device;
+  TF_RETURN_IF_ERROR(InferDeviceForCluster(n, cluster_info.function.name(),
+                                           flib_def, &device));
   bool requires_compilation;
-  TF_RETURN_IF_ERROR(NodeRequiresCompilation(n, &requires_compilation));
+  TF_RETURN_IF_ERROR(DeviceRequiresCompilation(device, &requires_compilation));
   if (!lazy_compilation_enabled) {
     requires_compilation = true;
   }
@@ -232,10 +327,7 @@ Status ReplaceNodeWithXlaCompileAndXlaRun(
   Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr)
                    .NewSubScope(n->name())
                    .WithDevice(n->requested_device())
-                   .WithAssignedDevice(n->assigned_device_name());
-
-  XlaClusterInfo cluster_info;
-  TF_RETURN_IF_ERROR(GetXlaClusterInfo(n, &cluster_info));
+                   .WithAssignedDevice(device);
 
   ops::_XlaCompile xla_compile(root.WithOpName("xla_compile"),
                                /*constants=*/cluster_info.constant_inputs,
@@ -297,6 +389,9 @@ Status ReplaceNodeWithXlaCompileAndXlaRun(
     g->AddControlEdge(
         DataToControl(root, inverse_predicated_compilation_key).node(), n);
     n->ClearAttr(kXlaCompiledKernelAttr);
+
+    TF_RETURN_IF_ERROR(ReplaceFunctionCallWithPartionedCall(
+        options, flib_def, n, g, cluster_info.function, root));
   }
 
   return Status::OK();
@@ -327,11 +422,11 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
 
   for (Node* n : xla_compiled_kernels) {
     TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(
-        *options.flib_def, lazy_compilation_enabled, graph, n));
+        options, *options.flib_def, lazy_compilation_enabled, graph, n));
   }
 
   if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile("build_xla_ops", *graph, options.flib_def);
+    DumpGraphToFile("build_xla_ops", *graph, options.flib_def);
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 390ffa694b6f127544d92f3024a02d877556aacd..902ec7f182b8f70dbd7f5b7f3138710845f4e3e1 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -54,9 +54,11 @@ using ::tensorflow::testing::matchers::Op;
 using ::tensorflow::testing::matchers::Out;
 using ::testing::_;
 
-Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
+Status BuildXlaOps(const Scope& s, const FunctionDefLibrary& fdef_lib,
+                   std::unique_ptr<Graph>* result) {
   auto graph = absl::make_unique<Graph>(OpRegistry::Global());
   TF_RETURN_IF_ERROR(s.ToGraph(graph.get()));
+  FunctionLibraryDefinition flib_def(graph->op_registry(), fdef_lib);
 
   // Assign all nodes to the CPU device.
   static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
@@ -68,7 +70,12 @@ Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
     }
   }
 
+  FixupSourceAndSinkEdges(graph.get());
+
+  SessionOptions session_options;
   GraphOptimizationPassOptions opt_options;
+  opt_options.session_options = &session_options;
+  opt_options.flib_def = &flib_def;
   opt_options.graph = &graph;
   BuildXlaOpsPass pass(/*enable_lazy_compilation=*/true);
   TF_RETURN_IF_ERROR(pass.Run(opt_options));
@@ -112,23 +119,23 @@ Node* MakeWrite(const Scope& scope, const string& id) {
 }
 
 FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
-  FunctionDefLibrary flib_def;
+  FunctionDefLibrary fdef_lib;
   FunctionDef func = FunctionDefHelper::Create(
       /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"},
       /*attr_def*/
       {}, /*node_def=*/{FunctionDefHelper::Const("one", 1.0f)},
       /*ret_def=*/{{"out", "out:output:0"}});
-  *flib_def.add_function() = std::move(func);
-  return flib_def;
+  *fdef_lib.add_function() = std::move(func);
+  return fdef_lib;
 }
 
 TEST_F(BuildXlaOpsTest, ControlDepsPreserved) {
   const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
   Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError();
 
-  FunctionDefLibrary flib_def =
+  FunctionDefLibrary fdef_lib =
       CreateFunctionDefLibWithConstFunction("cluster_0");
-  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(fdef_lib));
   Node* call;
   TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
   call->set_requested_device(kXlaDeviceName);
@@ -136,7 +143,7 @@ TEST_F(BuildXlaOpsTest, ControlDepsPreserved) {
   root.graph()->AddControlEdge(call, write_op);
 
   std::unique_ptr<Graph> graph;
-  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+  TF_ASSERT_OK(BuildXlaOps(root, fdef_lib, &graph));
 
   Node* write_op_new = FindNodeByName(graph.get(), write_op->name());
   ASSERT_NE(write_op_new, nullptr);
@@ -146,9 +153,9 @@ TEST_F(BuildXlaOpsTest, ControlDepsPreserved) {
 TEST_F(BuildXlaOpsTest, CleanFailureOnBogusAttr) {
   Scope root = Scope::NewRootScope().ExitOnError();
 
-  FunctionDefLibrary flib_def =
+  FunctionDefLibrary fdef_lib =
       CreateFunctionDefLibWithConstFunction("cluster_0");
-  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(fdef_lib));
 
   Node* call;
   TF_ASSERT_OK(
@@ -158,7 +165,7 @@ TEST_F(BuildXlaOpsTest, CleanFailureOnBogusAttr) {
   root.graph()->AddControlEdge(call, write_op);
 
   std::unique_ptr<Graph> graph;
-  Status failure_status = BuildXlaOps(root, &graph);
+  Status failure_status = BuildXlaOps(root, fdef_lib, &graph);
   ASSERT_FALSE(failure_status.ok());
   EXPECT_EQ(failure_status.code(), error::INVALID_ARGUMENT);
 }
@@ -166,9 +173,9 @@ TEST_F(BuildXlaOpsTest, CleanFailureOnBogusAttr) {
 TEST_F(BuildXlaOpsTest, OnNonXlaDevice) {
   Scope root = Scope::NewRootScope().ExitOnError();
 
-  FunctionDefLibrary flib_def =
+  FunctionDefLibrary fdef_lib =
       CreateFunctionDefLibWithConstFunction("cluster_0");
-  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(fdef_lib));
 
   Node* call;
   TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
@@ -182,14 +189,14 @@ TEST_F(BuildXlaOpsTest, OnNonXlaDevice) {
   auto xla_run =
       NodeWith(Op("_XlaRun"), Inputs(Out(1, predicated_compilation_key)));
   auto tf_call =
-      NodeWith(Op("cluster_0"),
+      NodeWith(Op("PartitionedCall"),
                CtrlDeps(NodeWith(Op("Identity"),
                                  Inputs(Out(0, predicated_compilation_key)))));
   auto merge = NodeWith(Op("Merge"), Inputs(Out(tf_call), Out(xla_run)));
   auto assign_var = NodeWith(Op("AssignVariableOp"), Inputs(_, Out(merge)));
 
   std::unique_ptr<Graph> graph;
-  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+  TF_ASSERT_OK(BuildXlaOps(root, fdef_lib, &graph));
 
   Node* write_op_new = FindNodeByName(graph.get(), write_op->name());
   ASSERT_NE(write_op_new, nullptr);
@@ -200,9 +207,9 @@ TEST_F(BuildXlaOpsTest, OnXlaDevice) {
   const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
   Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError();
 
-  FunctionDefLibrary flib_def =
+  FunctionDefLibrary fdef_lib =
       CreateFunctionDefLibWithConstFunction("cluster_0");
-  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(fdef_lib));
 
   Node* call;
   TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
@@ -212,7 +219,7 @@ TEST_F(BuildXlaOpsTest, OnXlaDevice) {
   Node* write_op = MakeWrite(root, Output(call), "write_result");
 
   std::unique_ptr<Graph> graph;
-  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+  TF_ASSERT_OK(BuildXlaOps(root, fdef_lib, &graph));
 
   auto xla_op =
       NodeWith(Op("_XlaRun"), Inputs(Out(NodeWith(Op("_XlaCompile")))));
@@ -223,5 +230,23 @@ TEST_F(BuildXlaOpsTest, OnXlaDevice) {
   ASSERT_NE(write_op_new, nullptr);
   EXPECT_THAT(write_op_new, assign_var);
 }
+
+TEST_F(BuildXlaOpsTest, NoExtraMergeForEdgeToSink) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary fdef_lib =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(fdef_lib));
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, fdef_lib, &graph));
+
+  Node* sink_node = graph->sink_node();
+  EXPECT_THAT(sink_node, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")),
+                                           NodeWith(Op("PartitionedCall")),
+                                           NodeWith(Op("NoOp")))));
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc
new file mode 100644
index 0000000000000000000000000000000000000000..848a6362a4a8f506d233b126461911067f26d9f2
--- /dev/null
+++ b/tensorflow/compiler/jit/clone_constants_for_better_clustering.cc
@@ -0,0 +1,187 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/clone_constants_for_better_clustering.h"
+
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+
+namespace tensorflow {
+
+using se::port::StatusOr;
+
+string CloneConstantsForBetterClusteringPass::GenerateUniqueName(
+    const absl::flat_hash_set<string>& name_set, absl::string_view prefix) {
+  string candidate;
+  do {
+    candidate = absl::StrCat(prefix, "/clone_", unique_name_counter_++);
+  } while (name_set.contains(candidate));
+  return candidate;
+}
+
+StatusOr<Node*> CloneConstantsForBetterClusteringPass::CloneNode(
+    Graph* g, const absl::flat_hash_set<string>& name_set, Node* n) {
+  NodeDef new_in_def = n->def();
+  new_in_def.clear_input();
+  new_in_def.set_name(GenerateUniqueName(name_set, new_in_def.name()));
+  Status s;
+  Node* new_in = g->AddNode(new_in_def, &s);
+  TF_RETURN_IF_ERROR(s);
+
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      g->AddControlEdge(e->src(), new_in);
+    } else {
+      g->AddEdge(e->src(), e->src_output(), new_in, e->dst_input());
+    }
+  }
+
+  new_in->set_assigned_device_name(n->assigned_device_name());
+  return new_in;
+}
+
+namespace {
+// We only clone host constants for now since we want to avoid increasing memory
+// pressure on GPUs.
+StatusOr<bool> IsSmallHostConstant(Node* n) {
+  if (!n->IsConstant()) {
+    return false;
+  }
+
+  DeviceNameUtils::ParsedName parsed;
+  TF_RET_CHECK(
+      DeviceNameUtils::ParseFullName(n->assigned_device_name(), &parsed));
+  if (parsed.type != DEVICE_CPU) {
+    return false;
+  }
+
+  const TensorProto* proto = nullptr;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "value", &proto));
+
+  // TODO(sanjoy): It may make sense to combine this threshold with XLA's "large
+  // constant" threshold, if there is one.
+  const int kSmallTensorThreshold = 16;
+  int64 total_elements = 1;
+  for (const auto& dim : proto->tensor_shape().dim()) {
+    if (dim.size() < 0) {
+      return errors::Internal("Unknown dimension size in constant tensor ",
+                              n->name());
+    }
+    total_elements *= dim.size();
+  }
+  return total_elements < kSmallTensorThreshold;
+}
+
+bool IsInPlaceOp(absl::string_view op_name) {
+  return op_name == "InplaceUpdate" || op_name == "InplaceAdd" ||
+         op_name == "InplaceSub";
+}
+}  // namespace
+
+Status CloneConstantsForBetterClusteringPass::CloneSmallHostConstantInputs(
+    Graph* g, const absl::flat_hash_set<string>& name_set, Node* n) {
+  std::vector<const Edge*> in_edges;
+  absl::c_copy(n->in_edges(), std::back_inserter(in_edges));
+  for (const Edge* e : in_edges) {
+    Node* input = e->src();
+    TF_ASSIGN_OR_RETURN(bool is_small_host_constant,
+                        IsSmallHostConstant(input));
+    if (is_small_host_constant && input->out_edges().size() != 1) {
+      VLOG(2) << "Cloning small host constant " << input->name();
+      TF_ASSIGN_OR_RETURN(Node* const input_cloned,
+                          CloneNode(g, name_set, input));
+      if (e->IsControlEdge()) {
+        g->AddControlEdge(input_cloned, e->dst());
+      } else {
+        int dst_input = e->dst_input();
+        TF_RET_CHECK(e->src_output() == 0)
+            << "expected constant to have exactly one non-control output, but "
+               "found output index = "
+            << e->src_output();
+        g->RemoveEdge(e);
+        g->AddEdge(input_cloned, 0, n, dst_input);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status CloneConstantsForBetterClusteringPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  if (GetGlobalJitLevel(options) == OptimizerOptions::OFF) {
+    return Status::OK();
+  }
+
+  Graph* g = options.graph->get();
+  absl::flat_hash_set<string> name_set;
+  absl::c_transform(g->nodes(), std::inserter(name_set, name_set.begin()),
+                    [](Node* n) { return n->name(); });
+  std::vector<Node*> nodes;
+  for (Node* n : g->nodes()) {
+    // We rely on the immutability of Tensors to safely clone Const operations.
+    // However, "in place" ops do not respect the immutability of Tensors so we
+    // avoid this transformation when such ops are present in the graph.
+    //
+    // In-place operations are problematic because they break the semantic
+    // illusion that tensorflow::Tensor instances are immutable.  For instance
+    // if we have the following graph:
+    //
+    // digraph {
+    //   SRC -> Const
+    //   SRC -> I
+    //   SRC -> V
+    //   Const -> Identity
+    //   Const -> InplaceAdd [label="x"]
+    //   I -> InplaceAdd [label="i"]
+    //   V -> InplaceAdd [label="v"]
+    //   InplaceAdd -> Identity [style=dotted]
+    // }
+    //
+    // then the value produced by `Identity` is Const+I*V since InplaceAdd
+    // modifies the tensor in place.  However, if we clone `Const` and turn the
+    // graph into:
+    //
+    // digraph {
+    //   SRC -> "Const/clone_1"
+    //   SRC -> "Const/clone_2"
+    //   SRC -> I
+    //   SRC -> V
+    //   "Const/clone_1" -> Identity
+    //   "Const/clone_2" -> InplaceAdd [label="x"]
+    //   I -> InplaceAdd [label="i"]
+    //   V -> InplaceAdd [label="v"]
+    //   InplaceAdd -> Identity [style=dotted]
+    // }
+    //
+    // then `Identity` no longer produces Const+I*V because the InplaceAdd
+    // operation only modifies Const/clone_2 in place.
+
+    if (IsInPlaceOp(n->type_string())) {
+      return Status::OK();
+    }
+    nodes.push_back(n);
+  }
+
+  // Iterate over a copy of the nodes to avoid iterating over g->nodes() while
+  // creating more nodes.
+  for (Node* n : nodes) {
+    TF_RETURN_IF_ERROR(CloneSmallHostConstantInputs(g, name_set, n));
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/clone_constants_for_better_clustering.h b/tensorflow/compiler/jit/clone_constants_for_better_clustering.h
new file mode 100644
index 0000000000000000000000000000000000000000..f67da75b34fb2e3eb9af9c16837fcc9a69bb94a8
--- /dev/null
+++ b/tensorflow/compiler/jit/clone_constants_for_better_clustering.h
@@ -0,0 +1,74 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_
+#define TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace tensorflow {
+// Clones small host constants in the graph to make it easier to form larger
+// clusters.
+//
+// This helps us in two ways:
+//
+//  - It reduces dependencies between clusters.  Let's say a constant C is used
+//    by nodes X and Y.  If X and Y are put in different clusters (for whatever
+//    reason) Y's cluster now has to wait for all the operations in X's cluster
+//    to finish before it starts running.
+//
+//  - It lets us create bigger clusters in multi-GPU benchmarks.  Consider the
+//    following graph:
+//
+//    digraph {
+//      Const -> GPU_1
+//      Const -> GPU_0_Y
+//      GPU_0_X -> GPU_0_Y
+//    }
+//
+//    We'd cluster Const and GPU_1 together (and place it on GPU_1), and this
+//    will block us from clustering GPU_0_X and GPU_0_Y together since that
+//    would increase the amount of work on GPU 0 waiting on work on GPU 1.
+//    However, cloning Const into two copies, one for GPU_0_Y and one for GPU_1
+//    will let us create one cluster containing {Const/copy_0, GPU_1} and
+//    another containing {Const/copy_1, GPU_0_X, GPU_0_Y}.
+//
+// We only clone small host constants now to avoid increasing memory consumption
+// too much.  Moreover, in practice the constants we have to duplicate are
+// things like the `perm` input to `Transpose` and the `size` input to `Slice`
+// which tend to be small anyway.
+
+class CloneConstantsForBetterClusteringPass : public GraphOptimizationPass {
+ public:
+  CloneConstantsForBetterClusteringPass() = default;
+
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+ private:
+  Status CloneSmallHostConstantInputs(
+      Graph* g, const absl::flat_hash_set<string>& name_set, Node* n);
+  string GenerateUniqueName(const absl::flat_hash_set<string>& name_set,
+                            absl::string_view prefix);
+  se::port::StatusOr<Node*> CloneNode(
+      Graph* g, const absl::flat_hash_set<string>& name_set, Node* n);
+
+  int unique_name_counter_ = 0;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_CLONE_CONSTANTS_FOR_BETTER_CLUSTERING_H_
diff --git a/tensorflow/compiler/jit/clone_constants_for_better_clustering_test.cc b/tensorflow/compiler/jit/clone_constants_for_better_clustering_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31543d1c3f8571be946868aa53ebad3c95ba9a5a
--- /dev/null
+++ b/tensorflow/compiler/jit/clone_constants_for_better_clustering_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/clone_constants_for_better_clustering.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/compiler/jit/node_matchers.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+using ::tensorflow::testing::FindNodeByName;
+
+Status CloneConstantsForBetterClustering(const Scope& s,
+                                         std::unique_ptr<Graph>* result) {
+  auto graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(OptimizerOptions::ON_2);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  options.session_options = &session_options;
+
+  // Scope::ToGraph seems to drop assigned devices, probably because it goes
+  // through a GraphDef.  So explicitly maintain the device assignment.
+  // std::unordered_map<string, string> assigned_device_names;
+  // for (Node* n : s.graph()->nodes()) {
+  //   assigned_device_names[n->name()] = n->assigned_device_name();
+  // }
+  GraphConstructorOptions opts;
+  opts.expect_device_spec = true;
+  TF_RETURN_IF_ERROR(s.ToGraph(graph.get(), opts));
+  // for (Node* n : graph->nodes()) {
+  //   n->set_assigned_device_name(assigned_device_names[n->name()]);
+  // }
+
+  CloneConstantsForBetterClusteringPass rewriter;
+  TF_RETURN_IF_ERROR(rewriter.Run(options));
+  *result = std::move(graph);
+  return Status::OK();
+}
+
+const char* kCPU = "/job:localhost/replica:0/task:0/device:CPU:0";
+const char* kGPU = "/job:localhost/replica:0/task:0/device:GPU:0";
+
+TEST(CloneConstantsForBetterClusteringTest, Basic) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Scope on_gpu = root.WithAssignedDevice(kGPU).WithDevice(kGPU);
+  Scope on_cpu = root.WithAssignedDevice(kCPU).WithDevice(kCPU);
+
+  Output in0 = ops::Placeholder(on_gpu.WithOpName("in0"), DT_FLOAT);
+  Output in1 = ops::Placeholder(on_gpu.WithOpName("in1"), DT_FLOAT);
+
+  Output perm = ops::Const(on_cpu.WithOpName("perm"), {3, 1, 2, 0});
+
+  {
+    Output tr0 = ops::Transpose(on_gpu.WithOpName("tr0"), in0, perm);
+    Output tr1 = ops::Transpose(on_gpu.WithOpName("tr1"), in1, perm);
+  }
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(CloneConstantsForBetterClustering(root, &result));
+
+  OutputTensor tr0_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr0")->input_tensor(1, &tr0_perm));
+
+  OutputTensor tr1_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr1")->input_tensor(1, &tr1_perm));
+
+  EXPECT_NE(tr0_perm.node, tr1_perm.node);
+}
+
+TEST(CloneConstantsForBetterClusteringTest, DontCloneNonHostConstants) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Scope on_gpu = root.WithAssignedDevice(kGPU).WithDevice(kGPU);
+
+  Output in0 = ops::Placeholder(on_gpu.WithOpName("in0"), DT_FLOAT);
+  Output in1 = ops::Placeholder(on_gpu.WithOpName("in1"), DT_FLOAT);
+
+  Output perm = ops::Const(on_gpu.WithOpName("perm"), {3, 1, 2, 0});
+
+  {
+    Output tr0 = ops::Transpose(on_gpu.WithOpName("tr0"), in0, perm);
+    Output tr1 = ops::Transpose(on_gpu.WithOpName("tr1"), in1, perm);
+  }
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(CloneConstantsForBetterClustering(root, &result));
+
+  OutputTensor tr0_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr0")->input_tensor(1, &tr0_perm));
+
+  OutputTensor tr1_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr1")->input_tensor(1, &tr1_perm));
+
+  EXPECT_EQ(tr0_perm.node, tr1_perm.node);
+}
+
+TEST(CloneConstantsForBetterClusteringTest, DontCloneLargeConstants) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Scope on_gpu = root.WithAssignedDevice(kGPU).WithDevice(kGPU);
+  Scope on_cpu = root.WithAssignedDevice(kCPU).WithDevice(kCPU);
+
+  Output in0 = ops::Placeholder(on_gpu.WithOpName("in0"), DT_FLOAT);
+  Output in1 = ops::Placeholder(on_gpu.WithOpName("in1"), DT_FLOAT);
+
+  Output perm = ops::Const(
+      on_cpu.WithOpName("perm"),
+      {17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
+
+  {
+    Output tr0 = ops::Transpose(on_gpu.WithOpName("tr0"), in0, perm);
+    Output tr1 = ops::Transpose(on_gpu.WithOpName("tr1"), in1, perm);
+  }
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(CloneConstantsForBetterClustering(root, &result));
+
+  OutputTensor tr0_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr0")->input_tensor(1, &tr0_perm));
+
+  OutputTensor tr1_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr1")->input_tensor(1, &tr1_perm));
+
+  EXPECT_EQ(tr0_perm.node, tr1_perm.node);
+}
+
+TEST(CloneConstantsForBetterClusteringTest, InplaceOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Scope on_gpu = root.WithAssignedDevice(kGPU).WithDevice(kGPU);
+  Scope on_cpu = root.WithAssignedDevice(kCPU).WithDevice(kCPU);
+
+  Output in0 = ops::Placeholder(on_gpu.WithOpName("in0"), DT_FLOAT);
+  Output in1 = ops::Placeholder(on_gpu.WithOpName("in1"), DT_FLOAT);
+
+  Output perm = ops::Const(on_cpu.WithOpName("perm"), {3, 1, 2, 0});
+
+  {
+    Output tr0 = ops::Transpose(on_gpu.WithOpName("tr0"), in0, perm);
+    Output tr1 = ops::Transpose(on_gpu.WithOpName("tr1"), in1, perm);
+  }
+
+  Output in_place_add =
+      ops::InplaceAdd(on_cpu.WithOpName("tr0"), perm,
+                      ops::Placeholder(on_cpu.WithOpName("i"), DT_INT32), perm);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(CloneConstantsForBetterClustering(root, &result));
+
+  OutputTensor tr0_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr0")->input_tensor(1, &tr0_perm));
+
+  OutputTensor tr1_perm;
+  TF_ASSERT_OK(FindNodeByName(result.get(), "tr1")->input_tensor(1, &tr1_perm));
+
+  EXPECT_EQ(tr0_perm.node, tr1_perm.node);
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/compilation_passes_test_main.cc b/tensorflow/compiler/jit/compilation_passes_test_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c73702fa6428de59262455db51ed3a3192ee2dc1
--- /dev/null
+++ b/tensorflow/compiler/jit/compilation_passes_test_main.cc
@@ -0,0 +1,58 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+GTEST_API_ int main(int real_argc, char** real_argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  tensorflow::AppendMarkForCompilationPassFlags(&flag_list);
+  auto usage = tensorflow::Flags::Usage(real_argv[0], flag_list);
+
+  std::vector<char*> args;
+
+  args.reserve(real_argc + 1);
+  for (int i = 0; i < real_argc; i++) {
+    args.push_back(real_argv[i]);
+  }
+
+  struct FreeDeleter {
+    void operator()(char* ptr) { free(ptr); }
+  };
+
+  std::unique_ptr<char, FreeDeleter> enable_global_jit_arg(
+      strdup("--tf_xla_cpu_global_jit=true"));
+  args.push_back(enable_global_jit_arg.get());
+
+  std::unique_ptr<char, FreeDeleter> reduce_min_cluster_size_arg(
+      strdup("--tf_xla_min_cluster_size=2"));
+  args.push_back(reduce_min_cluster_size_arg.get());
+
+  int argc = args.size();
+
+  if (!tensorflow::Flags::Parse(&argc, &args.front(), flag_list)) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+
+  testing::InitGoogleTest(&argc, &args.front());
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc
index 6f1ff85f24a4c1fd3e6d54fcff9f8868aee6f750..7021985affa494ed40c64825c2bd1d221db4e3bb 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op.cc
@@ -126,8 +126,9 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr,
   const DataTypeVector& arg_types = (*fbody)->arg_types;
   std::vector<bool> const_args(arg_types.size());
   // If we can't analyze the const args. Bail out.
-  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
-      *((*fbody)->graph), &const_args, /*compile_time_const_nodes=*/nullptr));
+  TF_RETURN_IF_ERROR(
+      BackwardsConstAnalysis(*((*fbody)->graph), &const_args,
+                             /*compile_time_const_nodes=*/nullptr, flr));
 
   for (int i = 0; i < const_args.size(); ++i) {
     if (const_args[i]) {
@@ -153,11 +154,14 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def,
                          std::unique_ptr<OpKernel>* kernel) {
   TF_RETURN_IF_ERROR(CompilationRequested(*flr, node_def));
 
-  VLOG(3) << "Creating XlaLaunchOp for " << node_def.DebugString();
+  VLOG(3) << "Attemping to create XlaLaunchOp for " << node_def.DebugString();
 
   // Make sure that kernels have been registered on the JIT device.
   XlaOpRegistry::RegisterCompilationKernels();
   if (!IsCompilable(flr, node_def)) {
+    VLOG(1) << "Not creating XlaLaunchOp because function invoked by the "
+               "following node is not compilable: "
+            << node_def.DebugString();
     // node_def is calling a function that XLA can't compile.
     return errors::InvalidArgument("Not compilable: ",
                                    node_def.ShortDebugString());
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 0562838f628c66b1eb03af9d2a5139c01dca31c5..4856301cef4fd9426c04e1ff557e25ae37980575 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -110,7 +113,11 @@ class Predicate {
   enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol };
 
   virtual string ToString() const = 0;
-  int64 hash() const { return hash_; }
+
+  // An ID assigned to the Predicate at construction time.  Conceptually like a
+  // pointer, except that it is stable across runs.
+  int64 id() const { return id_; }
+
   virtual absl::Span<Predicate* const> GetOperands() const = 0;
 
   virtual Kind kind() const = 0;
@@ -123,29 +130,19 @@ class Predicate {
   static void Visit(Predicate* p, const FunctionTy& func);
 
  protected:
-  explicit Predicate(int64 hash) : hash_(hash) {}
+  explicit Predicate(int64 id) : id_(id) {}
 
  private:
-  const int64 hash_;
+  const int64 id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Predicate);
 };
 
-int64 HashPredicateSequence(Predicate::Kind kind,
-                            absl::Span<Predicate* const> preds) {
-  int64 hash = ::tensorflow::hash<Predicate::Kind>()(kind);
-  for (Predicate* pred : preds) {
-    hash = Hash64Combine(hash, pred->hash());
-  }
-  return hash;
-}
-
 // Represents a logical conjunction of a set of predicates.
 class AndPredicate : public Predicate {
  public:
-  explicit AndPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kAnd, operands)),
-        operands_(std::move(operands)) {}
+  explicit AndPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -174,9 +171,8 @@ class AndPredicate : public Predicate {
 // Represents a logical disjunction of a set of predicates.
 class OrPredicate : public Predicate {
  public:
-  explicit OrPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kOr, operands)),
-        operands_(std::move(operands)) {}
+  explicit OrPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -204,9 +200,8 @@ class OrPredicate : public Predicate {
 // Represents a logical negation of a set of predicates.
 class NotPredicate : public Predicate {
  public:
-  explicit NotPredicate(Predicate* operand)
-      : Predicate(HashPredicateSequence(Kind::kNot, {operand})),
-        operands_({operand}) {}
+  explicit NotPredicate(int64 id, Predicate* operand)
+      : Predicate(id), operands_({operand}) {}
 
   string ToString() const override {
     return absl::StrCat("~", operand()->ToString());
@@ -222,29 +217,38 @@ class NotPredicate : public Predicate {
   std::array<Predicate*, 1> operands_;
 };
 
-// Represents an infinite list of predicates.
+// Represents the liveness of an induction variable.  For users inside the loop
+// this represents the "current" liveness of the induction variable.  For users
+// outside the loop it represents the "last" liveness of the induction variable.
+//
+// More concretely, an and recurrence {S,&,X}<loop> represents the liveness of V
+// in the following graph:
 //
-// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands
-// for the list of predicates:
+//   V = Merge(S', V_NextIt)
+//   V = Op(V, X')
+//   V_NextIt = NextIteration(V)
 //
-//   S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ...
+// where Predicate(S') = S and Predicate(X') = X.
 //
-// where GenSym(<expression>, <id>) renames every SymbolPredicate in
-// <expression> by appending <id> to it, in effect creating a "fresh" symbol.
-// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on
-// subsequent iterations".
+// `X` may contain symbolic predicates and the operations corresponding to these
+// symbolic predicates are either in frame `loop` or outside it.  The symbols
+// that are inside frame `loop` are loop variant (i.e. can have different
+// liveness in each loop iteration) and the symbols that are outside frame
+// `loop` are loop invariant (i.e. have the same liveness across all
+// iterations).
 class AndRecurrencePredicate : public Predicate {
  public:
-  explicit AndRecurrencePredicate(Predicate* start, Predicate* step)
-      : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})),
-        operands_({start, step}) {}
+  explicit AndRecurrencePredicate(int64 id, Predicate* start, Predicate* step,
+                                  std::vector<string> frame)
+      : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {}
 
   Predicate* start() const { return operands_[0]; }
   Predicate* step() const { return operands_[1]; }
+  absl::Span<const string> frame() const { return frame_; }
 
   string ToString() const override {
     return absl::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
-                        "}");
+                        "}<", absl::StrJoin(frame(), ";"), ">");
   }
 
   Kind kind() const override { return Kind::kAndRecurrence; }
@@ -255,6 +259,7 @@ class AndRecurrencePredicate : public Predicate {
 
  private:
   std::array<Predicate*, 2> operands_;
+  std::vector<string> frame_;
 };
 
 // Represents an uninterpreted symbol in a logical predicate.
@@ -264,8 +269,8 @@ class AndRecurrencePredicate : public Predicate {
 // symbols.
 class SymbolPredicate : public Predicate {
  public:
-  explicit SymbolPredicate(TensorId tensor_id, bool must_be_true)
-      : Predicate(Hash(tensor_id, must_be_true)),
+  explicit SymbolPredicate(int64 id, TensorId tensor_id, bool must_be_true)
+      : Predicate(id),
         tensor_id_(std::move(tensor_id)),
         must_be_true_(must_be_true) {}
 
@@ -281,20 +286,13 @@ class SymbolPredicate : public Predicate {
   // "tensor_id() is live and evaluates to true".
   //
   // If `must_be_true()` is false then this SymbolPredicate represents the
-  // proposition "tensor_id() is live (and may evalutate to any value)"
+  // proposition "tensor_id() is live (and may evaluate to any value)"
   TensorId tensor_id() const { return tensor_id_; }
   bool must_be_true() const { return must_be_true_; }
 
  private:
   TensorId tensor_id_;
   bool must_be_true_;
-
-  static int64 Hash(const TensorId tensor_id, bool must_be_true) {
-    return Hash64Combine(
-        ::tensorflow::hash<bool>()(must_be_true),
-        Hash64Combine(::tensorflow::hash<Predicate::Kind>()(Kind::kSymbol),
-                      TensorId::Hasher{}(tensor_id)));
-  }
 };
 
 template <typename FunctionTy>
@@ -333,34 +331,58 @@ class PredicateFactory {
   }
 
   Predicate* MakeNotPredicate(Predicate* pred) {
-    SignatureForNot signature = pred;
-    auto it = interned_not_instances_.find(signature);
-    if (it == interned_not_instances_.end()) {
-      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
-      Predicate* new_pred_ptr = new_pred.get();
-      interned_not_instances_.emplace(signature, std::move(new_pred));
-      return new_pred_ptr;
-    } else {
-      return it->second.get();
+    auto it = make_not_predicate_cache_.find(pred);
+    if (it != make_not_predicate_cache_.end()) {
+      return it->second;
     }
+
+    Predicate* result = MakeNotPredicateImpl(pred);
+
+    bool insert_successful =
+        make_not_predicate_cache_.insert({pred, result}).second;
+    (void)insert_successful;
+    DCHECK(insert_successful);
+
+    return result;
   }
 
-  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) {
-    auto it = interned_and_rec_instances_.find({start, step});
+  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step,
+                                        std::vector<string> frame) {
+    SignatureForAndRec signature(start, step, std::move(frame));
+    auto it = interned_and_rec_instances_.find(signature);
     if (it != interned_and_rec_instances_.end()) {
       return it->second.get();
     }
 
-    std::unique_ptr<Predicate> new_pred =
-        Make<AndRecurrencePredicate>(start, step);
+    std::unique_ptr<Predicate> new_pred = Make<AndRecurrencePredicate>(
+        std::get<0>(signature), std::get<1>(signature), std::get<2>(signature));
     Predicate* new_pred_ptr = new_pred.get();
-    CHECK(interned_and_rec_instances_
-              .emplace(SignatureForAndRec(start, step), std::move(new_pred))
-              .second);
+    bool inserted =
+        interned_and_rec_instances_.emplace(signature, std::move(new_pred))
+            .second;
+    (void)inserted;
+    DCHECK(inserted);
     return new_pred_ptr;
   }
 
-  Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
+  Status MakeSymbolPredicate(Node* node, int output_idx, bool must_be_true,
+                             Predicate** predicate) {
+    TensorId tensor_id(node->name(), output_idx);
+
+    bool is_boolean_tensor = node->output_type(tensor_id.index()) == DT_BOOL;
+    TF_RET_CHECK(!must_be_true || is_boolean_tensor);
+
+    if (node->type_string() == "Const" && must_be_true) {
+      const TensorProto* proto = nullptr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "value", &proto));
+
+      Tensor tensor(proto->dtype());
+      TF_RET_CHECK(tensor.FromProto(*proto));
+
+      *predicate = tensor.scalar<bool>()() ? MakeTrue() : MakeFalse();
+      return Status::OK();
+    }
+
     SignatureForSymbol signature = {tensor_id, must_be_true};
     auto it = interned_symbol_instances_.find(signature);
     if (it == interned_symbol_instances_.end()) {
@@ -369,20 +391,70 @@ class PredicateFactory {
       Predicate* new_pred_ptr = new_pred.get();
       interned_symbol_instances_.emplace(std::move(signature),
                                          std::move(new_pred));
-      return new_pred_ptr;
+      *predicate = new_pred_ptr;
     } else {
-      return it->second.get();
+      *predicate = it->second.get();
     }
+
+    return Status::OK();
   }
 
   Predicate* MakeTrue() { return MakeAndPredicate({}); }
   Predicate* MakeFalse() { return MakeOrPredicate({}); }
 
+  ~PredicateFactory() {
+    DCHECK_EQ(stack_depth_, 0) << "Unnested IncrementStackDepth?";
+  }
+
  private:
+  Predicate* MakeNotPredicateImpl(Predicate* pred) {
+    IncrementStackDepth stack_frame(this);
+    if (!stack_frame.HasOverflowed()) {
+      if (Predicate* simplified = SimplifyUsingDeMorgan(pred)) {
+        return simplified;
+      }
+
+      // ~~A => A
+      if (auto* not_pred = dynamic_cast<NotPredicate*>(pred)) {
+        return not_pred->operand();
+      }
+    }
+
+    SignatureForNot signature = pred;
+    auto it = interned_not_instances_.find(signature);
+    if (it == interned_not_instances_.end()) {
+      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
+      Predicate* new_pred_ptr = new_pred.get();
+      interned_not_instances_.emplace(signature, std::move(new_pred));
+      return new_pred_ptr;
+    } else {
+      return it->second.get();
+    }
+  }
+
+  Predicate* SimplifyUsingDeMorgan(Predicate* pred) {
+    // ~(A & B & C & ...) => ~A | ~B | ~C | ~...
+    // ~(A | B | C | ...) -> ~A & ~B & ~C & ~...
+    Predicate::Kind kind = pred->kind();
+
+    if (kind == Predicate::Kind::kAnd || kind == Predicate::Kind::kOr) {
+      std::vector<Predicate*> new_operands;
+      absl::c_transform(pred->GetOperands(), std::back_inserter(new_operands),
+                        [&](Predicate* p) { return MakeNotPredicate(p); });
+      return kind == Predicate::Kind::kOr ? MakeAndPredicate(new_operands)
+                                          : MakeOrPredicate(new_operands);
+    }
+
+    return nullptr;
+  }
+
   template <typename PredicateT, typename... Args>
   std::unique_ptr<Predicate> Make(Args&&... args) {
+    // If we ever expose the Predicate class outside this .cc file then we may
+    // want to make this hard to misuse (by accidentally passing in an arbitrary
+    // integer to the Predicate constructor for instance).
     return std::unique_ptr<PredicateT>(
-        new PredicateT(std::forward<Args>(args)...));
+        new PredicateT(id_counter_++, std::forward<Args>(args)...));
   }
 
   Predicate* MakeAndOrImpl(absl::Span<Predicate* const> operands, bool is_and);
@@ -402,7 +474,8 @@ class PredicateFactory {
   using SignatureForAndOr =
       std::pair<Predicate::Kind, absl::Span<Predicate* const>>;
   using SignatureForNot = Predicate*;
-  using SignatureForAndRec = std::pair<Predicate*, Predicate*>;
+  using SignatureForAndRec =
+      std::tuple<Predicate*, Predicate*, std::vector<string>>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
 
   struct HashSignatureForAndOr {
@@ -422,6 +495,36 @@ class PredicateFactory {
     }
   };
 
+  // Used to limit recursion to avoid blowing up the stack and cap compile time.
+  class IncrementStackDepth {
+   public:
+    explicit IncrementStackDepth(PredicateFactory* parent) : parent_(parent) {
+      parent_->stack_depth_++;
+    }
+
+    bool HasOverflowed() const {
+      const int kMaxStackDepth = 8;
+      return parent_->stack_depth_ >= kMaxStackDepth;
+    }
+
+    ~IncrementStackDepth() { parent_->stack_depth_--; }
+
+   private:
+    PredicateFactory* parent_;
+  };
+
+  // A cache for the MakeNotPredicate function.
+  //
+  // NB! This is *not* the same as `interned_not_instances_`.
+  // `interned_not_instances_` maps ensures pointer identity for `NotPredicate`
+  // instances, i.e., it ensures there at most one instance of Not(predicate)
+  // for any given predicate whereas `make_not_predicate_cache_` simply caches
+  // the result of the `MakeNotPredicate` function.  The values in
+  // `interned_not_instances_` are always instance of `NotPredicate` whereas the
+  // values in `make_not_predicate_cache_` may not be (for instance it will map
+  // Not(Not(A)) to A).
+  absl::flat_hash_map<Predicate*, Predicate*> make_not_predicate_cache_;
+
   absl::flat_hash_map<SignatureForAndOr, std::unique_ptr<Predicate>,
                       HashSignatureForAndOr>
       interned_and_or_instances_;
@@ -432,13 +535,15 @@ class PredicateFactory {
   absl::flat_hash_map<SignatureForSymbol, std::unique_ptr<Predicate>,
                       HashSignatureForSymbol>
       interned_symbol_instances_;
+  int64 id_counter_ = 0;
+  int stack_depth_ = 0;
 };
 
 Predicate* PredicateFactory::MakeInternedAndOr(
     std::vector<Predicate*> simplified_ops, Predicate::Kind pred_kind) {
   std::stable_sort(
       simplified_ops.begin(), simplified_ops.end(),
-      [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
+      [](Predicate* a, Predicate* b) { return a->id() < b->id(); });
 
   auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
   if (it != interned_and_or_instances_.end()) {
@@ -466,6 +571,13 @@ Predicate* PredicateFactory::MakeAndOrImpl(
     absl::Span<Predicate* const> operands, bool is_and) {
   Predicate::Kind pred_kind =
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
+
+  IncrementStackDepth stack_frame(this);
+  if (stack_frame.HasOverflowed()) {
+    return MakeInternedAndOr(
+        std::vector<Predicate*>(operands.begin(), operands.end()), pred_kind);
+  }
+
   Predicate::Kind other_pred_kind =
       is_and ? Predicate::Kind::kOr : Predicate::Kind::kAnd;
   absl::flat_hash_set<Predicate*> simplified_ops_set;
@@ -494,16 +606,31 @@ Predicate* PredicateFactory::MakeAndOrImpl(
 
   // Simplify "A&~A=>False" and "A|~A=>True".
   absl::flat_hash_set<Predicate*> negated_ops;
-  for (Predicate* op : simplified_ops) {
-    if (op->kind() == Predicate::Kind::kNot) {
-      negated_ops.insert(dynamic_cast<NotPredicate&>(*op).operand());
-    }
-  }
-
   for (Predicate* op : simplified_ops) {
     if (negated_ops.count(op)) {
+      // Simple case:
+      //
+      //   A & ~A & ... == False
+      //   A | ~A | ... == True
       return is_and ? MakeFalse() : MakeTrue();
     }
+
+    Predicate* negated_op = MakeNotPredicate(op);
+    if (negated_op->kind() == pred_kind) {
+      // Slightly more complicated case:
+      //
+      //   (~A | ~B | ~C) & A & B & C & ... ==
+      //   ~(A & B & C) & (A & B & C) & ... == False
+      //
+      //   (~A & ~B & ~C) | A | B | C | ... ==
+      //   ~(A | B | C) | (A | B | C) | ... == True
+      if (absl::c_all_of(negated_op->GetOperands(), [&](Predicate* p) {
+            return simplified_ops_set.contains(p);
+          })) {
+        return is_and ? MakeFalse() : MakeTrue();
+      }
+    }
+    negated_ops.insert(negated_op);
   }
 
   // If all ops contain the same subop, then factor it out thanks to the
@@ -619,6 +746,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   const Graph& graph_;
   absl::flat_hash_map<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
   PredicateFactory predicate_factory_;
+  std::vector<ControlFlowInfo> control_flow_info_;
   bool vlog_;
 };
 
@@ -640,7 +768,8 @@ Status DeadnessAnalysisImpl::GetInputPreds(
       auto it = predicate_map_.find(InputEdgeToTensorId(in_edge));
       if (it == predicate_map_.end()) {
         GraphCycles graph_cycles;
-        TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph_, &graph_cycles));
+        TF_RETURN_IF_ERROR(
+            CreateCycleDetectionGraph(&graph_, &graph_cycles).status());
 
         // If we didn't return with an error above then the graph is probably
         // fine and we have a bug in deadness analysis.
@@ -661,9 +790,12 @@ Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
   const Edge* pred_edge;
   TF_RETURN_IF_ERROR(n->input_edge(1, &pred_edge));
-  Predicate* true_switch = predicate_factory_.MakeSymbolPredicate(
-      TensorId(pred_edge->src()->name(), pred_edge->src_output()),
-      /*must_be_true=*/true);
+
+  Predicate* true_switch;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      pred_edge->src(), pred_edge->src_output(),
+      /*must_be_true=*/true, &true_switch));
+
   Predicate* false_switch = predicate_factory_.MakeNotPredicate(true_switch);
 
   // Output 0 is alive iff all inputs are alive and the condition is false.
@@ -761,6 +893,23 @@ Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,
 
   return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr;
 }
+
+Status GetFullFrame(const Node* n, absl::Span<const ControlFlowInfo> cfi_infos,
+                    std::vector<string>* frame) {
+  int depth = 0;
+  for (const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; !n->IsSource();
+       n = cfi_iter->parent_frame, cfi_iter = &cfi_infos[n->id()]) {
+    frame->push_back(cfi_iter->frame_name);
+
+    if (depth++ > 5000) {
+      return errors::Internal(
+          "Frame of depth > 5000:  Probably malformed graph or a bug in "
+          "BuildControlFlowInfo");
+    }
+  }
+
+  return Status::OK();
+}
 }  // namespace
 
 Status DeadnessAnalysisImpl::HandleMerge(Node* n,
@@ -783,8 +932,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
     if (has_unvisited_backedge) {
       // We're visiting this merge for the first time and it has an unvisited
       // backedge.
-      Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate(
-          TensorId(n->name(), 0), /*must_be_true=*/false);
+      Predicate* input_data_pred;
+      TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+          n, /*output_idx=*/0, /*must_be_true=*/false, &input_data_pred));
+
       SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
                    should_revisit);
       return Status::OK();
@@ -825,8 +976,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
 
         Predicate* start =
             predicate_factory_.MakeOrPredicate(non_recurrent_inputs);
-        Predicate* and_rec =
-            predicate_factory_.MakeAndRecurrencePredicate(start, step);
+        std::vector<string> frame;
+        TF_RETURN_IF_ERROR(GetFullFrame(n, control_flow_info_, &frame));
+        Predicate* and_rec = predicate_factory_.MakeAndRecurrencePredicate(
+            start, step, std::move(frame));
         SetPredicate(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit);
         return Status::OK();
       }
@@ -841,8 +994,10 @@ Status DeadnessAnalysisImpl::HandleRecv(Node* n,
   // acquire a dead signal from a _Send.
   std::vector<Predicate*> input_preds;
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
-  input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
-      TensorId(n->name(), 0), /*must_be_true=*/false));
+  Predicate* signal_is_alive;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      n, /*output_idx=*/0, /*must_be_true=*/false, &signal_is_alive));
+  input_preds.push_back(signal_is_alive);
   SetPredicate(n, {0, Graph::kControlSlot},
                predicate_factory_.MakeAndPredicate(input_preds),
                should_revisit);
@@ -892,6 +1047,24 @@ Status DeadnessAnalysisImpl::Populate() {
 
 Status DeadnessAnalysisImpl::PopulateWithReversePostOrder(
     absl::Span<Node* const> rpo) {
+  std::vector<string> unreachable_nodes;
+  // Compute the loop structure of the graph.
+  TF_RETURN_IF_ERROR(
+      BuildControlFlowInfo(&graph_, &control_flow_info_, &unreachable_nodes));
+
+  // Do some opportunistic error checking:
+  if (!unreachable_nodes.empty()) {
+    if (unreachable_nodes.size() > 5) {
+      unreachable_nodes.erase(unreachable_nodes.begin() + 5,
+                              unreachable_nodes.end());
+    }
+
+    return errors::InvalidArgument(
+        "Found unreachable nodes, most likely source and sink nodes not "
+        "connected: ",
+        absl::StrJoin(unreachable_nodes, ", "));
+  }
+
   // This an abstract interpretation over the deadness propagation semantics of
   // the graph executor.
   //
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 8a73101c184e6190921fd7729742922bd96f4bcf..38a5118d9a721b814e1b52ce4202d4fb783e3ac3 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -123,10 +123,9 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1);
   Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10);
   Output loop_cond_expr =
-      ops::Less(root.WithOpName(prefix + "/less"), iv.output, final_value);
-  Output loop_cond =
-      ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
-  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
+      ops::Less(root.WithOpName(prefix + "/cond"), iv.output, final_value);
+  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output,
+                    loop_cond_expr);
   ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
                            latch.output_false);
   Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
@@ -140,7 +139,7 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   root.graph()->AddControlEdge(iv.output.node(), increment_by.node());
   root.graph()->AddControlEdge(iv.output.node(), final_value.node());
 
-  return {iv.output, loop_cond};
+  return {iv.output, loop_cond_expr};
 }
 
 InductionVarInfo CreateInductionVariable(const Scope& root,
@@ -515,24 +514,27 @@ TEST(DeadnessAnalysisTest, Loop) {
 
     // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0
     // produce the same deadness.  But we're not that smart today.
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)],
+              "{#true,&,*iv0/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)],
+              "{#true,&,*iv1/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)],
+              "{#true,&,*iv2/cond:0}<fr0>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})");
+              "({#true,&,*iv0/cond:0}<fr0> & {#true,&,*iv1/cond:0}<fr0>)");
     EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})");
+              "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv2/cond:0}<fr0>)");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "loop", 0);
   Output dependent_iv0 =
-      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div0", "loop", iv.loop_cond, 0)
           .induction_var;
   Output dependent_iv1 =
-      CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div1", "loop", iv.loop_cond, 0)
           .induction_var;
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1);
 
@@ -549,13 +551,13 @@ TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
-              "{#true,&,*iv0/cond:0}");
+              "{#true,&,*iv0/cond:0}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
   }
 }
 
@@ -595,32 +597,33 @@ TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) {
 TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
   InductionVarInfo iv_outer =
-      CreateInductionVariable(root, "iv_outer", "frame", 0);
+      CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+  Output enter_constant_outer_loop = ops::internal::Enter(
+      root.WithOpName("constant_enter_outer_loop"),
+      ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+      ops::internal::Enter::Attrs().IsConstant(true));
   ops::Switch inner_value(root.WithOpName("outer_is_live"),
-                          ops::Const(root.WithOpName("constant"), 5),
-                          iv_outer.loop_cond);
+                          enter_constant_outer_loop, iv_outer.loop_cond);
   InductionVarInfo iv_inner = CreateInductionVariable(
-      root, "iv_inner", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner/enter"),
-                           inner_value.output_true, "frame_inner"));
+      root, "iv_inner", "inner_loop", inner_value.output_true);
 
   Output dependent_outer_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
   Output dependent_outer_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
 
-  Output dependent_inner_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv0)
-          .induction_var;
-  Output dependent_inner_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv1)
-          .induction_var;
+  Output dependent_inner_iv0 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv0", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv0)
+                                   .induction_var;
+  Output dependent_inner_iv1 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv1", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv1)
+                                   .induction_var;
 
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0,
                          dependent_inner_iv1);
@@ -638,46 +641,51 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
-              "{#true,&,*iv_outer/cond:0}");
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
-              "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&,"
-              "*iv_inner/cond:0}");
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
 
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
+
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv_outer_0 =
-      CreateInductionVariable(root, "iv_outer_0", "frame", 0);
-  ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"),
-                            ops::Const(root.WithOpName("constant"), 5),
-                            iv_outer_0.loop_cond);
-  InductionVarInfo iv_inner_0 = CreateInductionVariable(
-      root, "iv_inner_0", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_0/enter"),
-                           inner_value_0.output_true, "frame_inner"));
-
-  InductionVarInfo iv_outer_1 =
-      CreateInductionVariable(root, "iv_outer_1", "frame", 1);
-  ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"),
-                                 ops::Const(root.WithOpName("constant"), 5),
-                                 iv_outer_1.loop_cond);
-  InductionVarInfo iv_inner_1 = CreateInductionVariable(
-      root, "iv_inner_1", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_1/enter"),
-                           inner_init_value_1.output_true, "frame_inner"));
-  Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var,
-                         iv_inner_1.induction_var);
+
+  std::array<Output, 2> outer_iv;
+  std::array<Output, 2> inner_iv;
+
+  for (int i : {0, 1}) {
+    InductionVarInfo iv_outer =
+        CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+    Output enter_constant_outer_loop = ops::internal::Enter(
+        root.WithOpName("constant_enter_outer_loop"),
+        ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+        ops::internal::Enter::Attrs().IsConstant(true));
+    ops::Switch inner_value(root.WithOpName("outer_is_live"),
+                            enter_constant_outer_loop, iv_outer.loop_cond);
+    InductionVarInfo iv_inner = CreateInductionVariable(
+        root, "iv_inner", "inner_loop", inner_value.output_true);
+
+    outer_iv[i] = iv_outer.induction_var;
+    inner_iv[i] = iv_inner.induction_var;
+  }
+
+  Output add0 = ops::Add(root.WithOpName("add0"), inner_iv[0], inner_iv[1]);
 
   VLogGraphIfAsked(*root.graph());
 
@@ -692,21 +700,77 @@ TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
     PredicateMapTy predicate_map;
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)],
-              "{#true,&,*iv_outer_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)],
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)],
-              "{#true,&,*iv_outer_1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)],
-              "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[0])],
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[0])],
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[1])],
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[1])],
+              "{(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0} & "
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0})");
+              "({(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop> & {(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>)");
+  }
+}
+
+TEST(DeadnessAnalysisTest, AndRecurrenceNeedsFrameName) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_0 = CreateInductionVariable(root, "iv_0", "frame_0", 10);
+  InductionVarInfo iv_1 = CreateInductionVariable(root, "iv_1", "frame_1", 9);
+
+  Output init = CreateSwitch(root, "init").output_true;
+  Output step = CreateSwitch(root, "step").output_true;
+
+  std::array<Output, 2> exits;
+  std::array<Output, 2> next_iterations;
+
+  for (int i : {0, 1}) {
+    Output init_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("init_enter_frame_", i)), init,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+    Output step_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("step_enter_frame_", i)), step,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+
+    ops::Merge iv(root.WithOpName(absl::StrCat("expr_", i)),
+                  {init_enter, init_enter});
+    Output add = ops::Add(root.WithOpName(absl::StrCat("add_", i)), iv.output,
+                          step_enter);
+    next_iterations[i] = ops::NextIteration(
+        root.WithOpName(absl::StrCat("expr_", i, "_next_iteration")), add);
+    EXPECT_TRUE(
+        root.graph()
+            ->UpdateEdge(next_iterations[i].node(), 0, iv.output.node(), 1)
+            .ok());
+    exits[i] = ops::internal::Exit(root.WithOpName(absl::StrCat("exit_", i)),
+                                   iv.output);
+  }
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])],
+              predicate_map[ControlOutputFor(exits[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[1])], "");
+
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])],
+              predicate_map[ControlOutputFor(next_iterations[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[1])], "");
   }
 }
 
@@ -818,5 +882,82 @@ TEST(DeadnessAnalysisTest, RecvVsSwitchText) {
   EXPECT_EQ(predicate_map[logical_and_output_0], "(recv:0 & *recv:0)");
 }
 
+TEST(DeadnessAnalysisTest, DeMorgan) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output cond_0 = ops::Placeholder(root.WithOpName("cond_0"), DT_BOOL);
+  Output cond_1 = ops::Placeholder(root.WithOpName("cond_1"), DT_BOOL);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+
+  ops::Switch sw_0(root.WithOpName("switch_0"), value, cond_0);
+  ops::Switch sw_1(root.WithOpName("switch_1"), value, cond_1);
+
+  Output and_0_1 =
+      ops::Add(root.WithOpName("and_0_1"), sw_0.output_true, sw_1.output_true);
+
+  Output or_not0_not1 = ops::Merge(root.WithOpName("or_not0_not1"),
+                                   {sw_0.output_false, sw_1.output_false})
+                            .output;
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) & (~A | ~B) = (A & B) & ~(A & B) = False
+  Output should_always_be_dead =
+      ops::Add(root.WithOpName("should_always_be_dead"), and_0_1, or_not0_not1);
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) | (~A | ~B) = (A & B) | ~(A & B) = True
+  Output should_always_be_alive =
+      ops::Merge(root.WithOpName("should_always_be_alive"),
+                 {and_0_1, or_not0_not1})
+          .output;
+
+  std::unique_ptr<DeadnessAnalysis> result;
+  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_dead)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_alive)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantTrueSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_true = ops::Const(root.WithOpName("const_true"), true);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_true);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantFalseSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_false = ops::Const(root.WithOpName("const_false"), false);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_false);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#true");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#false");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index a21e083131de9b1ba20872338311768569b982f3..c3a0b6521da2355f0b25eeac08e7fac999442438 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
@@ -50,6 +50,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -108,14 +109,14 @@ void MarkGuaranteedConstants(
   for (const auto& src_arg : src_arg_pairs) {
     srcs.push_back(src_arg.first);
   }
-  ReverseDFSFrom(graph, srcs, /*enter=*/nullptr,
-                 /*leave=*/[&guaranteed_const_nodes](const Node* n) {
-                   // TODO(vinuraja): Doesn't work in the presence of loops.
-                   if (AreAllParentsGuaranteedConst(*n,
-                                                    guaranteed_const_nodes)) {
-                     guaranteed_const_nodes.insert(n);
-                   }
-                 });
+  ReverseDFSFrom(
+      graph, srcs, /*enter=*/nullptr,
+      /*leave=*/[&guaranteed_const_nodes](const Node* n) {
+        // TODO(vinuraja): Doesn't work in the presence of loops.
+        if (AreAllParentsGuaranteedConst(*n, guaranteed_const_nodes)) {
+          guaranteed_const_nodes.insert(n);
+        }
+      });
 
   for (auto& src_arg : src_arg_pairs) {
     if (guaranteed_const_nodes.count(src_arg.first) != 0) {
@@ -1008,13 +1009,15 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, host_compute);
+        graph_->AddControlEdge(src_image, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the _HostCompute node to its ancestor host compute nodes.
       for (const auto& ancestor_name : host_compute_ancestors) {
         Node* ancestor = host_compute_node[ancestor_name];
-        graph_->AddControlEdge(ancestor, host_compute);
+        graph_->AddControlEdge(ancestor, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the consumers in the subgraph to the _HostCompute node.
@@ -1031,7 +1034,8 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(host_compute, dst_image);
+        graph_->AddControlEdge(host_compute, dst_image,
+                               /* allow_duplicates= */ true);
       }
     }
   }
@@ -1059,7 +1063,8 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
 void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) {
   if (sequencer_ != nullptr) {
     VLOG(2) << "ConnectSequencerToCallNode";
-    graph_out->AddControlEdge(sequencer_, call_node_);
+    graph_out->AddControlEdge(sequencer_, call_node_,
+                              /* allow_duplicates= */ true);
   }
 }
 
@@ -1120,10 +1125,9 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
 
   if (VLOG_IS_ON(1)) {
     VLOG(2) << "Build function def " << name;
-    dump_graph::DumpGraphToFile(absl::StrCat("encapsulate_fdef_graph_", name),
-                                *graph_, library);
-    dump_graph::DumpFunctionDefToFile(absl::StrCat("encapsulate_fdef_", name),
-                                      fdef);
+    DumpGraphToFile(absl::StrCat("encapsulate_fdef_graph_", name), *graph_,
+                    library);
+    DumpFunctionDefToFile(absl::StrCat("encapsulate_fdef_", name), fdef);
   }
 
   const FunctionDef* original_fdef = library->Find(name);
@@ -1186,11 +1190,10 @@ Status Encapsulator::Subgraph::ReplaceFunctionDef(
 
   if (VLOG_IS_ON(1)) {
     VLOG(2) << "Replace function def " << name;
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("replace_encapsulate_fdef_graph_", name), *graph_,
-        library);
-    dump_graph::DumpFunctionDefToFile(
-        absl::StrCat("replace_encapsulate_fdef_", name), fdef);
+    DumpGraphToFile(absl::StrCat("replace_encapsulate_fdef_graph_", name),
+                    *graph_, library);
+    DumpFunctionDefToFile(absl::StrCat("replace_encapsulate_fdef_", name),
+                          fdef);
   }
 
   TF_RETURN_IF_ERROR(library->ReplaceFunction(name, fdef));
@@ -1279,7 +1282,8 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   // completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_,
+                            true /* skip duplicates check */);
 
   return Status::OK();
 }
@@ -1336,7 +1340,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   // subgraph completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_,
+                            /* allow_duplicates= */ true);
 
   return Status::OK();
 }
@@ -1446,7 +1451,8 @@ Status Encapsulator::CopySubgraphEdges(
         src_func_id == dst_func_id) {
       Graph* g = subgraphs_[src_func_id].GetGraph();
       if (edge->IsControlEdge()) {
-        g->AddControlEdge(src_image, dst_image);
+        g->AddControlEdge(src_image, dst_image,
+                          /* allow_duplicates= */ true);
       } else {
         g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
       }
@@ -1549,7 +1555,7 @@ Status Encapsulator::SplitIntoSubgraphs(FunctionLibraryDefinition* library) {
   if (VLOG_IS_ON(1)) {
     // Dump subgraphs.
     for (auto& entry : subgraphs_) {
-      dump_graph::DumpGraphToFile(
+      DumpGraphToFile(
           absl::StrCat("encapsulate_subgraphs_subgraph_", entry.first),
           *entry.second.GetGraph(), library);
     }
@@ -1732,7 +1738,8 @@ Status Encapsulator::CopyEdgeToOutputGraph(
     if (edges_added
             ->emplace(OutputTensor(src_image, -1), InputTensor(dst_image, -1))
             .second) {
-      graph_out->AddControlEdge(src_image, dst_image);
+      graph_out->AddControlEdge(src_image, dst_image,
+                                /* allow_duplicates= */ true);
     }
 
     return Status::OK();
@@ -1761,7 +1768,8 @@ Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
     const string& subgraph = ancestors.first;
     for (const string& ancestor : ancestors.second) {
       graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNode(),
-                                subgraphs_[subgraph].GetCallNode());
+                                subgraphs_[subgraph].GetCallNode(),
+                                /* allow_duplicates= */ true);
     }
   }
   return Status::OK();
@@ -2129,7 +2137,8 @@ Status CheckClusterDependencyForCycles(
     const string& ancestor, const string& successor,
     const std::unordered_map<string, std::unordered_set<string>>& ancestors,
     const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
-    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+    GraphCycles* cycle_detector,
+    std::unordered_map<string, int>* cycle_detector_map) {
   if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
     (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
   }
@@ -2173,7 +2182,7 @@ Status Encapsulator::FindClusterDependencies() {
   // We check that clusters are acyclic using this cycle detector.
   GraphCycles cycle_detector;
   // Map from cluster name to cycle detector node id.
-  std::map<string, int> cycle_detector_map;
+  std::unordered_map<string, int> cycle_detector_map;
   // Process the nodes in topologically-sorted order.
   std::vector<Node*> nodes;
   GetReversePostOrder(*graph_in_, &nodes);
@@ -2311,13 +2320,18 @@ Status Encapsulator::MakePrunedGraphCopyAndInline(
                               " in function library.");
     }
     FunctionBody* fbody = nullptr;
-    TF_RETURN_IF_ERROR(
-        FunctionDefToBodyHelper(*fdef, node->attrs(), library,
-                                [library](const string& op, const OpDef** sig) {
-                                  return library->LookUpOpDef(op, sig);
-                                },
-                                &fbody));
-    InlineFunctionBody(*library, pruned_graph->get(), node, fbody);
+    TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+        *fdef, node->attrs(), library,
+        [library](const string& op, const OpDef** sig) {
+          return library->LookUpOpDef(op, sig);
+        },
+        &fbody));
+
+    InlineFunctionBodyOptions inline_opts;
+    inline_opts.override_device = false;
+
+    TF_RETURN_IF_ERROR(InlineFunctionBody(*library, pruned_graph->get(), node,
+                                          fbody, inline_opts));
     delete fbody;
   }
 
@@ -2383,8 +2397,7 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
       &node_images, library));
 
   if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile("pruned_graph_for_shape_inference",
-                                *pruned_graph, library);
+    DumpGraphToFile("pruned_graph_for_shape_inference", *pruned_graph, library);
   }
 
   for (auto& subgraph_entry : subgraphs_) {
@@ -2515,19 +2528,49 @@ Status EncapsulateSubgraphsPass::Run(
     const GraphOptimizationPassOptions& options) {
   VLOG(1) << "EncapsulateSubgraphsPass::Run";
   if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile("encapsulate_subgraphs_before", **options.graph,
-                                options.flib_def);
+    DumpGraphToFile("encapsulate_subgraphs_before", **options.graph,
+                    options.flib_def);
   }
 
   std::unique_ptr<Graph> graph_out;
   FunctionLibraryDefinition* const library = options.flib_def;
 
+  // Constant folding below might need to run part of the function to compute
+  // constants. Create an FunctionLibraryRuntime with a single CPU device
+  // that can run the part of the function.
+  // NOTE: If this turns out to be slow, we can cache the FLRs keyed by
+  // `options`.
+  SessionOptions session_options;
+  auto* device_count = session_options.config.mutable_device_count();
+  device_count->insert({"CPU", 1});
+  std::vector<std::unique_ptr<Device>> devices;
+
+  DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU");
+  if (!cpu_factory) {
+    return errors::NotFound(
+        "CPU Factory not registered. Can't run EncapsulateSubgraphsPass");
+  }
+  TF_RETURN_IF_ERROR(cpu_factory->CreateDevices(
+      session_options, "/job:localhost/replica:0/task:0", &devices));
+  if (devices.empty()) {
+    return errors::NotFound(
+        "Failed to create a CPU device for EncapsulateSubgraphsPass");
+  }
+
+  std::unique_ptr<DeviceMgr> device_mgr =
+      absl::make_unique<DeviceMgr>(std::move(devices));
   OptimizerOptions opts;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
-      new ProcessFunctionLibraryRuntime(nullptr, options.session_options->env,
+      new ProcessFunctionLibraryRuntime(device_mgr.get(),
+                                        options.session_options->env,
                                         TF_GRAPH_DEF_VERSION, library, opts));
   FunctionLibraryRuntime* flr =
-      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+      pflr->GetFLR("/job:localhost/replica:0/task:0/device:CPU:0");
+  if (flr == nullptr) {
+    return errors::Internal(
+        "Failed to create and retrieve function library runtime to run "
+        "constant folding");
+  }
 
   auto rewrite_subgraph =
       [flr](const std::vector<OutputTensor>& arg_source_tensors,
@@ -2565,8 +2608,9 @@ Status EncapsulateSubgraphsPass::Run(
 
         const int num_args = input_permutation->size();
         std::vector<bool> const_args(num_args);
-        TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
-            **subgraph, &const_args, /*compile_time_const_nodes=*/nullptr));
+        TF_RETURN_IF_ERROR(
+            BackwardsConstAnalysis(**subgraph, &const_args,
+                                   /*compile_time_const_nodes=*/nullptr, flr));
 
         DataTypeVector arg_types(num_args);
         TF_RETURN_IF_ERROR(GetArgTypes(**subgraph, &arg_types));
@@ -2625,8 +2669,8 @@ Status EncapsulateSubgraphsPass::Run(
       "EncapsulateSubgraphsPass failed");
 
   if (VLOG_IS_ON(1)) {
-    dump_graph::DumpGraphToFile("encapsulate_subgraphs_after", *graph_out,
-                                options.flib_def);
+    DumpGraphToFile("encapsulate_subgraphs_after", *graph_out,
+                    options.flib_def);
   }
 
   *options.graph = std::move(graph_out);
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 1f8ec09e19c01d0a8b2a3761135ed53dfb2ad3b0..261519de3478c8b3e30d206a15944b5a686598e2 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -307,22 +307,6 @@ REGISTER_OP("XlaHostCompute")
     .Attr("shapes: list(shape) >= 0")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
-REGISTER_OP("_XlaSendFromHost")
-    .Input("inputs: Tinputs")
-    .Input("dynamic_key: string")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
-REGISTER_OP("_XlaRecvAtHost")
-    .Input("dynamic_key: string")
-    .Output("outputs: Toutputs")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
 REGISTER_OP("InputTest")
     .Output("o: float")
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 3bb979e0698d2d6be42ed5bae66c25267928192c..6d1661222e3eaf9df4f9f91f2b426c80b55245b2 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index ec745cdbb7e237f8b4935dd41e9791fc75f5355d..4e65971191aab69b5f4df780dfed939613d38c0f 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -15,17 +15,21 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -36,6 +40,25 @@ namespace {
 
 const char* const kXlaClusterOutput = "XlaClusterOutput";
 
+bool IsCpuGpuCompile(const Graph* graph) {
+  for (Node* n : graph->nodes()) {
+    string name;
+    // Only consider nodes being compiled.
+    if (!GetNodeAttr(n->attrs(),
+                     EncapsulateXlaComputationsPass::kXlaClusterAttr, &name)
+             .ok())
+      continue;
+    // Early return for any node with a device that is not a CPU or GPU.
+    DeviceNameUtils::ParsedName parsed;
+    if (DeviceNameUtils::ParseFullName(n->requested_device(), &parsed)) {
+      if (parsed.type != DEVICE_CPU && parsed.type != DEVICE_GPU) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 // Checks if a graph node is marked to be a guaranteed constant.
 bool is_guaranteed_constant(const Node& n) {
   bool guaranteed_constant = false;
@@ -173,10 +196,11 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
   // Nondeterminism in serialization would not lead to incorrect results, but
   // may cause spurious cache misses. DeterministicSerialization is a
   // best-effort deterministic serialization.
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(gdef, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
-  LOG(INFO) << "Subgraph fingerprint:" << fingerprint;
+  const size_t size = gdef.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(SerializeToBufferDeterministic(gdef, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
+  VLOG(1) << "Subgraph fingerprint:" << fingerprint;
   call_def->set_op(absl::StrCat(call_def->op(), "_", fingerprint));
   return Status::OK();
 }
@@ -348,18 +372,25 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 Status EncapsulateXlaComputationsPass::Run(
     const GraphOptimizationPassOptions& options) {
   VLOG(1) << "EncapsulateXlaComputations(): "
-          << dump_graph::DumpGraphToFile("encapsulate_xla_computations_before",
-                                         **options.graph, options.flib_def);
+          << DumpGraphToFile("encapsulate_xla_computations_before",
+                             **options.graph, options.flib_def);
+
+  const char* additional_help =
+      IsCpuGpuCompile(options.graph->get())
+          ? xla::status_macros::kPossibleAutoJitAlternative
+          : "";
 
-  TF_RETURN_IF_ERROR(Encapsulate(options.graph, options.flib_def));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(Encapsulate(options.graph, options.flib_def),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() half-way: "
-          << dump_graph::DumpGraphToFile("encapsulate_xla_computations_halfway",
-                                         **options.graph, options.flib_def);
+          << DumpGraphToFile("encapsulate_xla_computations_halfway",
+                             **options.graph, options.flib_def);
 
-  TF_RETURN_IF_ERROR(BuildXlaLaunchOps(options.graph->get()));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(BuildXlaLaunchOps(options.graph->get()),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() finished: "
-          << dump_graph::DumpGraphToFile("encapsulate_xla_computations_after",
-                                         **options.graph, options.flib_def);
+          << DumpGraphToFile("encapsulate_xla_computations_after",
+                             **options.graph, options.flib_def);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index 2a770c527b2fae91352fd17dacb13495a3a73f34..4d383698d3a33e391c713a855b3f1521e9aab844 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -31,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -581,10 +581,9 @@ Status ConstructHostGraph(
       &host_graph, outside_compilation_attr_name));
 
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("extract_outside_compilation_host_graph_for_",
-                     xla_cluster_name),
-        host_graph, fld);
+    DumpGraphToFile(absl::StrCat("extract_outside_compilation_host_graph_for_",
+                                 xla_cluster_name),
+                    host_graph, fld);
   }
 
   FunctionDef host_graph_fdef;
@@ -789,7 +788,7 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
                               std::unordered_set<const Node*>{send_from_host});
 
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(shape_inference_graph_name, *g, fld);
+    DumpGraphToFile(shape_inference_graph_name, *g, fld);
   }
 
   // Replace original shape inference graph.
@@ -1620,7 +1619,7 @@ Status ExtractOutsideCompilationForFunction(
   TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
       fbody->graph, outside_compilation_attr_name));
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
+    DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_for_func_before_", func_name),
         *fbody->graph, fld);
   }
@@ -1705,7 +1704,7 @@ Status ExtractOutsideCompilationForFunction(
     TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef));
   }
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
+    DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
         *graph_out, fld);
   }
@@ -1719,7 +1718,7 @@ Status ExtractOutsideCompilation(
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
     FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld) {
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("extract_outside_compilation_before", *g, fld);
+    DumpGraphToFile("extract_outside_compilation_before", *g, fld);
   }
 
   std::vector<string> shape_inference_graphs;
@@ -1747,7 +1746,7 @@ Status ExtractOutsideCompilation(
   }
 
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("extract_outside_compilation_after", *g, fld);
+    DumpGraphToFile("extract_outside_compilation_after", *g, fld);
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 98e344b3a080aa8aab27cd41564a90427bac151e..7fcf2b42e4315de853a4116968ae1a24647f512a 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -23,7 +23,6 @@ namespace tensorflow {
 namespace {
 
 BuildXlaOpsPassFlags* build_ops_flags;
-DumpGraphFlags* dump_graph_flags;
 MarkForCompilationPassFlags* mark_for_compilation_flags;
 XlaDeviceFlags* device_flags;
 XlaOpsCommonFlags* ops_flags;
@@ -31,15 +30,6 @@ XlaOpsCommonFlags* ops_flags;
 std::vector<Flag>* flag_list;
 std::once_flag flags_init;
 
-void AppendDumpGraphFlagsInternal(std::vector<Flag>* flag_list) {
-  std::vector<Flag> new_flags = {
-      Flag("tf_dump_graph_prefix", &dump_graph_flags->tf_dump_graph_prefix,
-           "Path prefix to which graphs dumped during debugging should be "
-           "written."),
-  };
-  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
-}
-
 void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
   std::vector<Flag> new_flags = {
       Flag("tf_xla_auto_jit", &mark_for_compilation_flags->tf_xla_auto_jit,
@@ -68,7 +58,12 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_fusion_only",
            &mark_for_compilation_flags->tf_xla_fusion_only,
            "enable fusion of element-wise operations only using XLA when "
-           "global_jit_level is ON*.")};
+           "global_jit_level is ON*."),
+      Flag("tf_xla_disable_deadness_safety_checks_for_debugging",
+           &mark_for_compilation_flags
+                ->tf_xla_disable_deadness_safety_checks_for_debugging,
+           "Disable deadness related safety checks when clustering (this is "
+           "unsound).")};
   flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
 }
 
@@ -76,12 +71,9 @@ void AllocateAndParseFlags() {
   build_ops_flags = new BuildXlaOpsPassFlags;
   build_ops_flags->tf_xla_enable_lazy_compilation = true;
 
-  dump_graph_flags = new DumpGraphFlags;
-  dump_graph_flags->tf_dump_graph_prefix = "/tmp/";
-
   mark_for_compilation_flags = new MarkForCompilationPassFlags;
   mark_for_compilation_flags->tf_xla_auto_jit = 0;
-  mark_for_compilation_flags->tf_xla_min_cluster_size = 2;
+  mark_for_compilation_flags->tf_xla_min_cluster_size = 4;
   mark_for_compilation_flags->tf_xla_max_cluster_size =
       std::numeric_limits<int32>::max();
   mark_for_compilation_flags->tf_xla_clustering_debug = false;
@@ -89,6 +81,8 @@ void AllocateAndParseFlags() {
   mark_for_compilation_flags->tf_xla_clustering_fuel =
       std::numeric_limits<int64>::max();
   mark_for_compilation_flags->tf_xla_fusion_only = false;
+  mark_for_compilation_flags
+      ->tf_xla_disable_deadness_safety_checks_for_debugging = false;
 
   device_flags = new XlaDeviceFlags;
   device_flags->tf_xla_compile_on_demand = false;
@@ -107,7 +101,6 @@ void AllocateAndParseFlags() {
       Flag("tf_xla_always_defer_compilation",
            &ops_flags->tf_xla_always_defer_compilation, ""),
   });
-  AppendDumpGraphFlagsInternal(flag_list);
   AppendMarkForCompilationPassFlagsInternal(flag_list);
   xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list);
 }
@@ -119,11 +112,6 @@ const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
   return *build_ops_flags;
 }
 
-DumpGraphFlags* GetDumpGraphFlags() {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  return dump_graph_flags;
-}
-
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
   std::call_once(flags_init, &AllocateAndParseFlags);
   return mark_for_compilation_flags;
@@ -144,9 +132,4 @@ void AppendMarkForCompilationPassFlags(std::vector<Flag>* flag_list) {
   AppendMarkForCompilationPassFlagsInternal(flag_list);
 }
 
-void AppendDumpGraphFlags(std::vector<Flag>* flag_list) {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  AppendDumpGraphFlagsInternal(flag_list);
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index 5ddea588eef5270880d91623dc05893da265960a..f87edcc1a33af081e74e65ed551c84a4fbc1163a 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -25,27 +25,39 @@ namespace tensorflow {
 
 // Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
-  int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
-                          // computations on CPU and GPU devices.  0 = use
-                          // ConfigProto setting; -1 = off; 1 = on for things
-                          // very likely to be improved; 2 = on for everything.
-                          // Experimental.
-  int32 tf_xla_min_cluster_size;  // Minimum number of operators in an XLA
-                                  // compilation. Ignored for operators placed
-                                  // on an XLA device or operators explicitly
-                                  // marked for compilation.
-  int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
-                                  // compilation.
-  bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
-  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
-                                  // via SessionOptions.
-  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
-                                  // many ops will be marked as eligible for
-                                  // clustering.
-  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
-                            // is set to ON* and overrides its behavior. If
-                            // true, enable fusion of element-wise operations
-                            // only using XLA.
+  // Control compilation of operators into XLA computations on CPU and GPU
+  // devices.  0 = use ConfigProto setting; -1 = off; 1 = on for things very
+  // likely to be improved; 2 = on for everything.
+  //
+  // Experimental.
+  int32 tf_xla_auto_jit;
+
+  // Minimum number of operators in an XLA compilation. Ignored for operators
+  // placed on an XLA device or operators explicitly marked for compilation.
+  int32 tf_xla_min_cluster_size;
+
+  // Maximum number of operators in an XLA compilation.
+  int32 tf_xla_max_cluster_size;
+
+  // Dump graphs during XLA compilation.
+  bool tf_xla_clustering_debug;
+
+  // Enables global JIT compilation for CPU via SessionOptions.
+  bool tf_xla_cpu_global_jit;
+
+  // "Compiler fuel" for clustering.  Only this many ops will be marked as
+  // eligible for clustering.
+  int64 tf_xla_clustering_fuel;
+
+  // tf_xla_fusion_only is effective only when global_jit_level is set to ON*
+  // and overrides its behavior. If true, enable fusion of element-wise
+  // operations only using XLA.
+  bool tf_xla_fusion_only;
+
+  // If tf_xla_disable_deadness_safety_checks_for_debugging is set to true then
+  // we do not do deadness related safety checks.  This is unsound in general,
+  // but can be used as a debugging aid.
+  bool tf_xla_disable_deadness_safety_checks_for_debugging;
 };
 
 // Flags associated with the XLA bridge's xla_device module.
@@ -71,12 +83,6 @@ struct BuildXlaOpsPassFlags {
   bool tf_xla_enable_lazy_compilation;
 };
 
-// Flags for the XLA bridge's dump_graph module.
-struct DumpGraphFlags {
-  // Path prefix to which graphs dumped during debugging should be written.
-  string tf_dump_graph_prefix;
-};
-
 // Return a pointer to the DumpGraphFlags struct;
 // repeated calls return the same pointer.
 // This should be called only after Flags::Parse() has returned.
@@ -88,7 +94,6 @@ MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
 const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
 XlaDeviceFlags* GetXlaDeviceFlags();
 const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
-DumpGraphFlags* GetDumpGraphFlags();
 
 // Appends the flag definitions associated with
 // MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`.
@@ -96,8 +101,6 @@ DumpGraphFlags* GetDumpGraphFlags();
 // Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet.
 void AppendMarkForCompilationPassFlags(
     std::vector<tensorflow::Flag>* flag_list);
-void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_FLAGS_H_
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
index ce53f70b79d97ab087fefe542920b33f883632a2..23931a0d7cd4e6cb1f8ba99869db6a3b25b49b97 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
+#include <iterator>
 #include "absl/algorithm/container.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
@@ -26,12 +27,12 @@ limitations under the License.
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace {
@@ -144,7 +145,9 @@ SliceInputs MakeSliceIndexAndSizeInt64(const Scope& host_scope,
 // same constant value.  This helps make the generated GraphDef more readable.
 class ConstantCache {
  public:
-  explicit ConstantCache(const Scope& s) : scope_(s) {}
+  explicit ConstantCache(const Scope& s,
+                         const std::vector<const Edge*>& control_deps)
+      : scope_(s), control_deps_(control_deps) {}
 
   Output Get1DHostConstant(int64 constant) {
     auto it = cache_.find(constant);
@@ -152,6 +155,9 @@ class ConstantCache {
       Output new_const =
           ops::Const(scope_.WithOpName("const_", constant), {constant});
       it = cache_.insert({constant, new_const}).first;
+      for (const Edge* e : control_deps_) {
+        scope_.graph()->AddControlEdge(e->src(), new_const.node());
+      }
     }
     return it->second;
   }
@@ -159,11 +165,13 @@ class ConstantCache {
  private:
   Scope scope_;
   std::unordered_map<int, Output> cache_;
+  std::vector<const Edge*> control_deps_;
 };
 
 // Returns a node computing the size of the Slice op with inputs `slice_inputs`.
 Status ComputeSliceSize(const Scope& host_scope,
-                        const SliceInputs& slice_inputs, Output* size) {
+                        const SliceInputs& slice_inputs,
+                        std::vector<const Edge*> control_deps, Output* size) {
   // If slice_size[i] >= 0 then slice_size[i] = slice_size[i].
   //
   // If slice_size[i] == -1 then slice_size[i] = input_size[i] -
@@ -183,7 +191,7 @@ Status ComputeSliceSize(const Scope& host_scope,
       ops::Shape(host_scope.WithOpName("input_shape"), slice_inputs.input,
                  ops::Shape::OutType(DT_INT64));
 
-  ConstantCache constant_pool(host_scope);
+  ConstantCache constant_pool(host_scope, control_deps);
 
   std::vector<Output> slice_size;
   for (int i = 0; i < slice_inputs.size_as_vector.size(); i++) {
@@ -209,11 +217,16 @@ Status ComputeSliceSize(const Scope& host_scope,
   }
 
   // Trivial ConcatV2 nodes (with exactly one input) are disallowed.
-  *size =
-      slice_size.size() == 1
-          ? slice_size[0]
-          : ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
-                        ops::Const(host_scope.WithOpName("concat_axis"), 0));
+  if (slice_size.size() == 1) {
+    *size = slice_size[0];
+  } else {
+    auto concat_axis = ops::Const(host_scope.WithOpName("concat_axis"), 0);
+    for (const Edge* e : control_deps) {
+      host_scope.graph()->AddControlEdge(e->src(), concat_axis.node());
+    }
+    *size = ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
+                        concat_axis);
+  }
   return Status::OK();
 }
 
@@ -234,12 +247,21 @@ Status ConvertTensorFlowSliceToStaticShapedSlice(
           .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice"));
   Scope host_scope = main_scope.WithAssignedDevice(host_name);
 
+  // In the future we may want to be clever here and avoid the extra Cast ops.
   SliceInputs slice_inputs_int64 =
       MakeSliceIndexAndSizeInt64(host_scope, slice_inputs);
 
+  // Create a list of all control dependencies to be copied when possibly
+  // replacing nodes related to slice_size.
+  Node* old_size;
+  std::vector<const Edge*> old_size_ctrl_deps;
+  TF_RETURN_IF_ERROR(slice->input_node(2, &old_size));
+  absl::c_copy_if(old_size->in_edges(), std::back_inserter(old_size_ctrl_deps),
+                  [](const Edge* e) { return e->IsControlEdge(); });
+
   Output slice_size;
-  TF_RETURN_IF_ERROR(
-      ComputeSliceSize(host_scope, slice_inputs_int64, &slice_size));
+  TF_RETURN_IF_ERROR(ComputeSliceSize(host_scope, slice_inputs_int64,
+                                      old_size_ctrl_deps, &slice_size));
 
   *result =
       ops::Slice(main_scope.WithAssignedDevice(slice->assigned_device_name())
@@ -291,9 +313,9 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
   return Status::OK();
 }
 
-// Return true if `n` is a slice we can rewrite to have a static shape
+// Return true if `n` is a slice we should rewrite to have a static shape
 // (i.e. have the output shape only depend on the "size" input).
-xla::StatusOr<bool> IsRewritableSlice(Node* n) {
+xla::StatusOr<bool> ShouldRewriteSlice(Node* n) {
   if (n->type_string() != "Slice") {
     return false;
   }
@@ -311,14 +333,20 @@ xla::StatusOr<bool> IsRewritableSlice(Node* n) {
 
   // If slice_size[i] < -1 for any i then executing the slice will throw an
   // error, and we don't do anything here.
-  return absl::c_all_of(slice_inputs->size_as_vector,
-                        [](int64 size_i) { return size_i >= -1; });
+  bool slice_size_has_error = absl::c_all_of(
+      slice_inputs->size_as_vector, [](int64 size_i) { return size_i >= -1; });
+  if (!slice_size_has_error) {
+    return false;
+  }
+
+  // No point in rewriting slices that have both size and begin as constants.
+  return !slice_inputs->begin.node()->IsConstant();
 }
 
 Status FindAndRewriteSlices(Graph* g, bool* changed) {
   std::vector<Node*> slices_to_rewrite;
   for (Node* n : g->nodes()) {
-    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, ShouldRewriteSlice(n));
     if (is_rewritable) {
       slices_to_rewrite.push_back(n);
     }
@@ -347,15 +375,15 @@ Status IncreaseDynamismForAutoJitPass::Run(
     const GraphOptimizationPassOptions& options) {
   MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_clustering_debug) {
-    dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass",
-                                **options.graph, options.flib_def);
+    DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass",
+                    **options.graph, options.flib_def);
   }
 
   bool changed;
   TF_RETURN_IF_ERROR(FindAndRewriteSlices(options.graph->get(), &changed));
   if (changed && flags->tf_xla_clustering_debug) {
-    dump_graph::DumpGraphToFile("increase_dynamism_for_auto_jit_pass",
-                                **options.graph, options.flib_def);
+    DumpGraphToFile("increase_dynamism_for_auto_jit_pass", **options.graph,
+                    options.flib_def);
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
index a2f1b831ad7605237e23c15cc43b337e06265553..2add2c13f92f561904163012ee16cc17ce5badce 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -401,5 +401,57 @@ TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) {
                      Name("begin/static_shaped_slice/static_shaped_slice"))),
                  _)));
 }
+
+// New constants being created need to have control dependencies copied to
+// ensure correct control flow analysis in TF V2.
+TEST(SliceToDynamicSliceRewriteTest, WithControlDepsToConstant) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  // Add an additional dependency that should still exist in with the new size
+  // variables.
+  Output dependency = ops::Placeholder(root.WithOpName("dependency"), DT_BOOL);
+  root.graph()->AddControlEdge(dependency.node(), size.node());
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  // Check that the new constants have control dependencies.
+  Node* const_0 = testing::FindNodeByName(result.get(),
+                                          "slice/static_shaped_slice/const_0");
+  EXPECT_NE(const_0, nullptr);
+  EXPECT_THAT(const_0,
+              NodeWith(Op("Const"), CtrlDeps(NodeWith(Op("Placeholder"),
+                                                      Name("dependency")))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithConstBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Const(root.WithOpName("begin"), {10, 10});
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* slice_node = testing::FindNodeByName(result.get(), "slice");
+  EXPECT_THAT(slice_node,
+              NodeWith(Op("Slice"), Inputs(Out(NodeWith(Op("Placeholder"))),
+                                           Out(NodeWith(Op("Const"))),
+                                           Out(NodeWith(Op("Const"))))));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index f79bdc1e2e8d82c9144d1bb9923ad36d8541cbdb..7326b6c222b30fc929d87d6b56d2de624dc0b24d 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
+#include "tensorflow/compiler/jit/clone_constants_for_better_clustering.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 #include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
@@ -41,6 +42,9 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 26,
 
 // POST_REWRITE_FOR_EXEC passes that support auto-clustering to enable XLA:
 
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 5,
+                      CloneConstantsForBetterClusteringPass);
+
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10,
                       MarkForCompilationPass);
 
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 0583774714c6db7a2fa515fc8a0d304e1898db97..3524da23fb396d59b92aafa2892c8ca1d94d01ac 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -19,14 +19,16 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
     ],
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index ad71df5a694a5f8da94675049df1062a7edb6253..88d00f7f8e1b8bb5372dff3508dddcc216297e97 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -34,7 +36,8 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
@@ -207,6 +210,28 @@ static Status BuildCompilationCache(OpKernelContext* ctx,
   if (!platform.ok()) {
     return platform.status();
   }
+
+  xla::StatusOr<xla::Compiler*> compiler_for_platform =
+      xla::Compiler::GetForPlatform(platform.ValueOrDie());
+  if (!compiler_for_platform.ok()) {
+    // In some rare cases (usually in unit tests with very small clusters) we
+    // may end up transforming an XLA cluster with at least one GPU operation
+    // (which would normally force the cluster to be compiled using XLA:GPU)
+    // into an XLA cluster with no GPU operations (i.e. containing only CPU
+    // operations).  Such a cluster can fail compilation (in way that
+    // MarkForCompilation could not have detected) if the CPU JIT is not linked
+    // in.
+    //
+    // So bail out of _XlaCompile in this case, and let the executor handle the
+    // situation for us.
+    const Status& status = compiler_for_platform.status();
+    if (status.code() == error::NOT_FOUND) {
+      return errors::Unimplemented("Could not find compiler for platform ",
+                                   platform.ValueOrDie()->Name(), ": ",
+                                   status.ToString());
+    }
+  }
+
   xla::LocalClientOptions client_options;
   client_options.set_platform(platform.ValueOrDie());
   client_options.set_intra_op_parallelism_threads(
@@ -304,10 +329,19 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
   xla::LocalExecutable* executable;
   std::map<int, OptionalTensor> variables;
 
-  OP_REQUIRES_OK(
-      ctx, CompileToLocalExecutable(ctx, function_, platform_info_, resources_,
-                                    constants_, /*lazy=*/false, &client,
-                                    &variables, &kernel, &executable));
+  {
+    Status s = CompileToLocalExecutable(
+        ctx, function_, platform_info_, resources_, constants_, /*lazy=*/false,
+        &client, &variables, &kernel, &executable);
+    if (!s.ok() && (platform_info_.device_type().type_string() == DEVICE_CPU ||
+                    platform_info_.device_type().type_string() == DEVICE_GPU)) {
+      // Suggest auto jit if the failure was with GPU or CPU.
+      errors::AppendToMessage(&s,
+                              xla::status_macros::kPossibleAutoJitAlternative);
+    }
+
+    OP_REQUIRES_OK(ctx, s);
+  }
 
   se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index cb01690845a5205cf2f8a03db8b34fb279daecb7..6da181352337a0782e7fd8548680cf9c76ce7f33 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -21,7 +21,9 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/flags.h"
@@ -29,11 +31,11 @@ limitations under the License.
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/resource_operation_table.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -42,14 +44,30 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
 namespace {
+// The clusters we create here are eventually lowered into an
+// _XlaCompile/_XlaRun pair with a TF executor "fallback" that uses the
+// PartitionedCall op to execute the cluster in the regular graph executor if
+// need be.  PartitionedCall, however, reruns the entire TF graph optimization
+// pipeline over the cluster which includes this mark for compilation pass.  To
+// avoid endlessly recursing we tag nodes that we've already visited with this
+// attribute so that we can bail out if we see them a second time.
+//
+// TODO(sanjoy): This method is not robust since it is possible that the
+// optimizations run by PartitionedCall can mutate the cluster arbitrarily,
+// dropping the kXlaAlreadyClustered attributes from all nodes in the process.
+// The correct fix is to use the ConfigProto to pass in some sort of flag into
+// the PartitionedCall kernel that tells it to not rerun auto-clustering on the
+// cluster.
+const char* kXlaAlreadyClustered = "_XlaAlreadyClustered";
+
 // Aggregates information about what kinds of ops are allowed.
 struct OperationFilter {
   // Whether resource variable ops are allowed.  We do not allow resource
@@ -209,10 +227,9 @@ bool IsCompilableCall(const NodeDef& call_def,
   }
 
   FunctionLibraryRuntime::Handle handle;
-  Status status =
-      lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle);
+  Status status = InstantiateFunctionCall(call_def, *lib_runtime, &handle);
   if (!status.ok()) {
-    VLOG(2) << "Rejecting " << call_def.op()
+    VLOG(2) << "Rejecting " << call_def.DebugString()
             << ": could not instantiate: " << status;
     return false;
   }
@@ -441,7 +458,7 @@ Status FindCompilationCandidates(
   std::vector<bool> compile_time_const_nodes(graph.num_node_ids(), false);
   TF_RETURN_IF_ERROR(
       BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr,
-                             &compile_time_const_nodes));
+                             &compile_time_const_nodes, lib_runtime));
 
   int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
 
@@ -490,7 +507,7 @@ Status FindCompilationCandidates(
                                XlaOpRegistry::AutoclusteringPolicy::kAlways;
 
     OperationFilter op_filter;
-    op_filter.allow_resource_ops = registration->compile_resource_ops;
+    op_filter.allow_resource_ops = registration->compile_all_resource_ops;
     op_filter.allow_stateful_rng_ops = always_auto_cluster;
     op_filter.allow_control_trigger = always_auto_cluster;
     op_filter.allow_dummy_ops = always_auto_cluster;
@@ -525,7 +542,7 @@ Status FindCompilationCandidates(
       continue;
     }
 
-    if (!op_filter.allow_resource_ops &&
+    if (!registration->compile_all_resource_ops &&
         (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) {
       // We don't have a way of returning values of type DT_RESOURCE from XLA
       // computations so we avoid auto-clustering nodes producing DT_RESOURCE.
@@ -591,8 +608,8 @@ Status FindCompilationCandidates(
     }
     // We don't auto-cluster functional control flow nodes containing resource
     // operations because safety checks are trickier in this case.
-    // registration->compile_resource_ops is true for XLA_CPU/XLA_GPU but not
-    // for CPU/GPU.
+    // registration->compile_all_resource_ops is true for XLA_CPU/XLA_GPU but
+    // not for CPU/GPU.
     if (node->type_string() == "While" &&
         !IsCompilableWhile(*node, jit_device_type, op_filter, 0, lib_runtime)) {
       continue;
@@ -614,34 +631,21 @@ Status FindCompilationCandidates(
   return Status::OK();
 }
 
-// Determine the global jit level which is ON if either the
-// GraphOptimizationPassOptions has the jit ON, or if the --tf_xla_auto_jit flag
-// is true.
-OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
-    const GraphOptimizationPassOptions& options) {
-  OptimizerOptions::GlobalJitLevel global_jit_level =
-      options.session_options->config.graph_options()
-          .optimizer_options()
-          .global_jit_level();
-  if (global_jit_level == OptimizerOptions::DEFAULT) {
-    // To set compilation to be on by default, change the following line.
-    global_jit_level = OptimizerOptions::OFF;
-  }
-  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
-  if (flags->tf_xla_auto_jit == -1 ||
-      (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
-    // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
-    // the setting in ConfigProto.
-    global_jit_level =
-        static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
-  }
-  return global_jit_level;
-}
-
 struct Cluster {
   // Identifies the node that represents this cluster in the cycle detection
   // graph.
   int representative = -1;
+
+  // The set of devices the nodes in this cluster are placed on.
+  absl::flat_hash_set<string> devices;
+
+  // If there are resource operation in the cluster then this is the device that
+  // resource operations are placed on.  All resource operations in a cluster
+  // must be placed on the same device.
+  string resource_op_device;
+
+  // True if any node in the cluster has an _XlaCompile attribute set to true.
+  bool has_xla_compile_attr;
 };
 
 }  // anonymous namespace
@@ -683,12 +687,30 @@ Status MarkForCompilationPass::Run(
   // So fix up the source and sink edges before calling into deadness analysis.
   FixupSourceAndSinkEdges(options.graph->get());
 
+  // See explanation on `kXlaAlreadyClustered`.
+  for (Node* n : options.graph->get()->nodes()) {
+    if (n->attrs().Find(kXlaAlreadyClustered)) {
+      return Status::OK();
+    }
+  }
+
   std::unique_ptr<DeadnessAnalysis> deadness;
   {
     XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
     TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness));
   }
 
+  bool deadness_analysis_disabled =
+      GetMarkForCompilationPassFlags()
+          ->tf_xla_disable_deadness_safety_checks_for_debugging;
+
+  if (deadness_analysis_disabled) {
+    LOG(WARNING) << "Deadness analysis was manually disabled via "
+                    "--tf_xla_disable_deadness_safety_checks_for_debugging; "
+                    "auto-clustering "
+                    "is unsound!";
+  }
+
   auto is_compilable = [&](const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -721,9 +743,12 @@ Status MarkForCompilationPass::Run(
     // and some are dead) then don't compile it.  XLA cannot represent the
     // deadness semantics of these nodes correctly and auto-clustering these
     // nodes can cause deadness to propagate to nodes that should be live.
-    if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
-      VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
-      return false;
+    if (!deadness_analysis_disabled) {
+      if (node->IsMerge() ||
+          deadness->HasInputsWithMismatchingDeadness(*node)) {
+        VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
+        return false;
+      }
     }
 
     // Check for fusable ops only if requested.
@@ -733,25 +758,9 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // Otherwise use the value of global_jit_level and the device's
-    // autoclustering policy.
-    bool should_compile =
-        registration->autoclustering_policy ==
-            XlaOpRegistry::AutoclusteringPolicy::kAlways ||
-        (registration->autoclustering_policy ==
-             XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally &&
-         global_jit_level != OptimizerOptions::OFF);
-    if (!should_compile) {
-      if (global_jit_level == OptimizerOptions::OFF) {
-        VLOG(2) << "Rejecting " << node->name() << ": global jit disabled.";
-      } else {
-        VLOG(2)
-            << "Rejecting " << node->name()
-            << ": autoclustering for device only when requested explicitly.";
-      }
-    }
-    return should_compile;
+    return true;
   };
+
   return RunImpl(options, is_compilable);
 }
 
@@ -927,7 +936,7 @@ static Status IgnoreResourceOpForSafetyAnalysis(const Node& n, bool* ignore) {
   if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
     *ignore = true;
   } else {
-    *ignore = registration->compile_resource_ops;
+    *ignore = registration->compile_all_resource_ops;
   }
   return Status::OK();
 }
@@ -935,6 +944,134 @@ static Status IgnoreResourceOpForSafetyAnalysis(const Node& n, bool* ignore) {
 // Sequence number generator to ensure clusters have unique names.
 static std::atomic<int64> cluster_sequence_num;
 
+// Returns true if the devices in `cluster_a` and `cluster_b` are compatible and
+// therefore not a hindrance for combining the two clusters into a larger
+// cluster.
+static Status AreDevicesCompatible(
+    const Cluster& cluster_a, const Cluster& cluster_b,
+    OptimizerOptions::GlobalJitLevel global_jit_level, bool* result) {
+  std::vector<string> devices;
+  absl::c_remove_copy(cluster_a.devices, std::back_inserter(devices), "");
+  absl::c_remove_copy(cluster_b.devices, std::back_inserter(devices), "");
+  absl::c_sort(devices);
+
+  if (devices.empty()) {
+    *result = false;
+    return Status::OK();
+  }
+
+  // First check if we will even be able to pick a device for the larger
+  // combined cluster.
+  bool can_pick_device;
+  TF_RETURN_IF_ERROR(CanPickDeviceForXla(
+      devices, /*allow_mixing_unknown_and_cpu=*/false, &can_pick_device));
+  if (!can_pick_device) {
+    *result = false;
+    return Status::OK();
+  }
+
+  string chosen_device;
+  TF_RETURN_IF_ERROR(PickDeviceForXla(
+      devices, /*allow_mixing_unknown_and_cpu=*/false, &chosen_device));
+
+  // If we are able to pick a device `chosen_device` for the larger cluster, the
+  // resource operations in `cluster_a` and `cluster_b` must be placed on the
+  // same device as `chosen_device`.  This is because the _XlaCompile and
+  // _XlaRun kernels are going to run on and therefore try to access the
+  // resource variables from `chosen_device`, which will be an error if the
+  // resource variables are placed on some other device.
+  auto resource_op_device_ok = [&](const string& resource_op_device) {
+    return resource_op_device.empty() || resource_op_device == chosen_device;
+  };
+
+  *result = resource_op_device_ok(cluster_a.resource_op_device) &&
+            resource_op_device_ok(cluster_b.resource_op_device);
+  if (!*result) {
+    return Status::OK();
+  }
+
+  // We will check this again later, but here we prune out clusters that would
+  // never have been sent to XLA to save compile time.  Without this change we
+  // will e.g. create a CPU cluster only to later notice that the user did not
+  // enable the CPU JIT via --tf_xla_cpu_global_jit.  With this change we avoid
+  // creating the cluster to begin with.
+  //
+  // TODO(b/126629785): It is possible that this is just papering over O(n^2)
+  // behavior in our clustering algorithm.
+  const XlaOpRegistry::DeviceRegistration* registration;
+  DeviceType device_type("");
+  TF_RETURN_IF_ERROR(DeviceToDeviceType(chosen_device, &device_type));
+  TF_RET_CHECK(
+      XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration))
+      << "chosen device = " << chosen_device
+      << "; device type = " << device_type.type() << "; devices ("
+      << devices.size() << ") = " << absl::StrJoin(devices, ", ");
+
+  *result = cluster_a.has_xla_compile_attr || cluster_b.has_xla_compile_attr ||
+            registration->autoclustering_policy ==
+                XlaOpRegistry::AutoclusteringPolicy::kAlways ||
+            (registration->autoclustering_policy ==
+                 XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally &&
+             global_jit_level != OptimizerOptions::OFF);
+
+  return Status::OK();
+}
+
+// Returns `true` iff we should compile `cluster`.
+static Status ShouldCompileClusterImpl(
+    const Cluster& cluster, OptimizerOptions::GlobalJitLevel global_jit_level,
+    bool* should_compile, string* device) {
+  std::vector<string> devices;
+  absl::c_remove_copy(cluster.devices, std::back_inserter(devices), "");
+  absl::c_sort(devices);
+
+  string chosen_device;
+  TF_RETURN_IF_ERROR(PickDeviceForXla(
+      devices, /*allow_mixing_unknown_and_cpu=*/false, &chosen_device));
+
+  const XlaOpRegistry::DeviceRegistration* registration;
+  DeviceType device_type("");
+  TF_RETURN_IF_ERROR(DeviceToDeviceType(chosen_device, &device_type));
+  TF_RET_CHECK(
+      XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration))
+      << "chosen device = " << chosen_device
+      << "; device type = " << device_type.type() << "; devices ("
+      << devices.size() << ") = " << absl::StrJoin(devices, ", ");
+
+  *should_compile =
+      cluster.has_xla_compile_attr ||
+      registration->autoclustering_policy ==
+          XlaOpRegistry::AutoclusteringPolicy::kAlways ||
+      (registration->autoclustering_policy ==
+           XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally &&
+       global_jit_level != OptimizerOptions::OFF);
+
+  VLOG(3) << (*should_compile ? "Compiling" : "Not compiling")
+          << " cluster with device " << chosen_device;
+
+  *device = std::move(chosen_device);
+  return Status::OK();
+}
+
+static Status ShouldCompileCluster(
+    absl::flat_hash_map<int, std::pair<bool, string>>* cache,
+    OptimizerOptions::GlobalJitLevel global_jit_level, const Cluster& cluster,
+    bool* should_compile, string* device) {
+  auto it = cache->find(cluster.representative);
+  if (it != cache->end()) {
+    *should_compile = it->second.first;
+    *device = it->second.second;
+    return Status::OK();
+  }
+
+  string device_s;
+  TF_RETURN_IF_ERROR(ShouldCompileClusterImpl(cluster, global_jit_level,
+                                              should_compile, &device_s));
+  cache->insert({cluster.representative, {*should_compile, device_s}});
+  *device = std::move(device_s);
+  return Status::OK();
+}
+
 Status MarkForCompilationPass::RunImpl(
     const GraphOptimizationPassOptions& options,
     const std::function<bool(const Node*, const DeviceType&)>&
@@ -960,7 +1097,11 @@ Status MarkForCompilationPass::RunImpl(
   }
 
   GraphCycles cycles;
-  TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(graph, &cycles));
+  TF_ASSIGN_OR_RETURN(bool cycle_detection_graph_ok,
+                      CreateCycleDetectionGraph(graph, &cycles));
+  if (!cycle_detection_graph_ok) {
+    return Status::OK();
+  }
   TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps(
       graph, options.flib_def, IgnoreResourceOpForSafetyAnalysis, &cycles));
 
@@ -972,6 +1113,23 @@ Status MarkForCompilationPass::RunImpl(
   for (Node* node : compilation_candidates) {
     Cluster& cluster = clusters[node->id()].Get();
     cluster.representative = node->id();
+    const string& device = !node->assigned_device_name().empty()
+                               ? node->assigned_device_name()
+                               : node->requested_device();
+    if (HasResourceInput(*node) || HasResourceOutput(*node)) {
+      cluster.resource_op_device = device;
+    }
+    cluster.has_xla_compile_attr = false;
+    bool xla_compile_attr;
+    if (GetNodeAttr(node->attrs(), kXlaCompileAttr, &xla_compile_attr).ok()) {
+      cluster.has_xla_compile_attr |= xla_compile_attr;
+    }
+    if (options.flib_def->GetAttr(*node, kXlaCompileAttr, &xla_compile_attr)
+            .ok()) {
+      cluster.has_xla_compile_attr |= xla_compile_attr;
+    }
+
+    cluster.devices.insert(device);
     worklist.push_back(&clusters[node->id()]);
   }
 
@@ -985,7 +1143,8 @@ Status MarkForCompilationPass::RunImpl(
   // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
   // example, from the Grappler fusion pass).
   while (!worklist.empty()) {
-    int from = worklist.front()->Get().representative;
+    Cluster* cluster_from = &worklist.front()->Get();
+    int from = cluster_from->representative;
     worklist.pop_front();
 
     Node* node_from = graph->FindNodeId(from);
@@ -1009,13 +1168,17 @@ Status MarkForCompilationPass::RunImpl(
         // graph. No clustering is possible.
         continue;
       }
+
+      const Cluster& cluster_to = clusters[to].Get();
       Node* node_to = graph->FindNodeId(to);
       if (compilation_candidates.find(node_to) ==
           compilation_candidates.cend()) {
         continue;
       }
-      if (node_from->assigned_device_name() !=
-          node_to->assigned_device_name()) {
+      bool devices_compatible;
+      TF_RETURN_IF_ERROR(AreDevicesCompatible(
+          *cluster_from, cluster_to, global_jit_level, &devices_compatible));
+      if (!devices_compatible) {
         continue;
       }
       if (isolated_nodes.count(node_to)) {
@@ -1064,9 +1227,14 @@ Status MarkForCompilationPass::RunImpl(
         if (in_id >= graph->num_node_ids()) continue;
 
         Node* in = graph->FindNodeId(in_id);
-        if (compilation_candidates.find(in) != compilation_candidates.cend() &&
-            in->assigned_device_name() != node_to->assigned_device_name()) {
-          found_split = true;
+        const Cluster& cluster_in = clusters[in_id].Get();
+        if (compilation_candidates.find(in) != compilation_candidates.cend()) {
+          bool devices_compatible;
+          TF_RETURN_IF_ERROR(AreDevicesCompatible(
+              cluster_to, cluster_in, global_jit_level, &devices_compatible));
+          if (!devices_compatible) {
+            found_split = true;
+          }
         }
       }
       if (found_split) continue;
@@ -1080,6 +1248,12 @@ Status MarkForCompilationPass::RunImpl(
 
       // Merge the clusters. ContractEdge uses 'from' as the number of the
       // merged node, so make sure 'from' is the chosen representative.
+      cluster_from->devices.insert(cluster_to.devices.begin(),
+                                   cluster_to.devices.end());
+      if (!cluster_to.resource_op_device.empty()) {
+        cluster_from->resource_op_device = cluster_to.resource_op_device;
+      }
+      cluster_from->has_xla_compile_attr |= cluster_to.has_xla_compile_attr;
       clusters[from].Merge(&clusters[to]);
 
       worklist.push_back(&clusters[from]);
@@ -1089,23 +1263,37 @@ Status MarkForCompilationPass::RunImpl(
 
   // Count the number of non-trivial elements in each cluster.
   std::vector<int> effective_cluster_sizes(graph->num_node_ids());
+
+  // has_functional_control_flow remembers if a cluster contains a functional
+  // control flow node.
+  std::vector<bool> has_functional_control_flow(graph->num_node_ids());
+
   for (const Node* n : compilation_candidates) {
     int cluster = clusters[n->id()].Get().representative;
-    // Identity nodes will be removed if the node gets marked for compilation.
-    // Therefore we don't want to count them towards the effective cluster size.
-    if (n->def().op() != "Identity") {
+    // We want clusters to be big enough that the benefit from XLA's
+    // optimizations offsets XLA related overhead (for instance we add some
+    // Switch/Merge nodes into the graph to implement lazy compilation).  To
+    // this end, we don't count Identity and Constant nodes because they do not
+    // enable interesting optimizations by themselves.
+    if (!n->IsIdentity() && !n->IsConstant()) {
       effective_cluster_sizes[cluster]++;
     }
+    if (n->type_string() == "While" || n->type_string() == "If") {
+      has_functional_control_flow[cluster] = true;
+    }
   }
 
   // Names for each cluster.
   std::unordered_map<int, string> cluster_names;
 
   if (flags->tf_xla_clustering_debug) {
-    dump_graph::DumpGraphToFile("before_mark_for_compilation", **options.graph,
-                                options.flib_def);
+    DumpGraphToFile("before_mark_for_compilation", **options.graph,
+                    options.flib_def);
   }
 
+  absl::flat_hash_map<int, std::pair<bool, string>>
+      should_compile_cluster_cache;
+
   // Mark clusters for compilation that:
   // * are placed on a device that requires compilation (an XlaDevice),
   // * are explicitly marked for compilation (_XlaCompile=true), or
@@ -1113,7 +1301,17 @@ Status MarkForCompilationPass::RunImpl(
   //   if compilation is enabled, otherwise there will be no such candidates).
   const int min_cluster_size = flags->tf_xla_min_cluster_size;
   for (Node* n : compilation_candidates) {
-    int cluster = clusters[n->id()].Get().representative;
+    const Cluster& cluster = clusters[n->id()].Get();
+    bool should_compile;
+    string device;
+    TF_RETURN_IF_ERROR(ShouldCompileCluster(&should_compile_cluster_cache,
+                                            global_jit_level, cluster,
+                                            &should_compile, &device));
+    if (!should_compile) {
+      continue;
+    }
+
+    int cluster_repr = cluster.representative;
 
     // Compile if the user marked this node _XlaCompile=true
     bool compile_attr = false;
@@ -1125,32 +1323,26 @@ Status MarkForCompilationPass::RunImpl(
       marked_for_compilation = compile_attr;
     }
 
-    // Compile if this operator is placed on a device that requires
-    // compilation.
-    DeviceType device_type("");
-    TF_RETURN_IF_ERROR(
-        DeviceToDeviceType(n->assigned_device_name(), &device_type));
-    const XlaOpRegistry::DeviceRegistration* registration;
-    XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
+    // We assume that functional If and While nodes have at least
+    // min_cluster_size non-trivial nodes in them.  It would be more principled
+    // to (recursively) verify this fact, but that's probably not worth the
+    // trouble.
 
-    // Compile if this is a cluster of >= min_cluster_size compilable operators.
-    // Also, always compile if it contains at least one op that is marked for
-    // compilation that is not an Identity op.
-    if (effective_cluster_sizes[cluster] >= min_cluster_size ||
-        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation)) {
-      string& name = cluster_names[cluster];
+    if (effective_cluster_sizes[cluster_repr] >= min_cluster_size ||
+        has_functional_control_flow[cluster_repr] || marked_for_compilation) {
+      string& name = cluster_names[cluster_repr];
 
       if (name.empty()) {
         name = absl::StrCat("cluster_", cluster_sequence_num++);
       }
       n->AddAttr(kXlaClusterAttr, name);
+      n->AddAttr(kXlaAlreadyClustered, true);
       VLOG(3) << "Assigning node " << n->name() << " to cluster " << name;
     }
   }
 
   if (flags->tf_xla_clustering_debug) {
-    dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph,
-                                options.flib_def);
+    DumpGraphToFile("mark_for_compilation", **options.graph, options.flib_def);
 
     // We also dump out an annoated version of the TF graph where the nodes
     // names are prefixed with the cluster names.  This can help visualizing the
@@ -1162,6 +1354,8 @@ Status MarkForCompilationPass::RunImpl(
       if (absl::optional<absl::string_view> cluster_name =
               GetXlaClusterForNode(*n)) {
         n->set_name(absl::StrCat(*cluster_name, "/", n->name()));
+      } else if (n->type_string() == "VarHandleOp") {
+        n->set_name(absl::StrCat("varhandle/", n->name()));
       } else {
         // There is room for improvement here.  In particular, it may help to
         // split these unclustered nodes into classes where every node in a
@@ -1170,8 +1364,8 @@ Status MarkForCompilationPass::RunImpl(
       }
     }
 
-    dump_graph::DumpGraphToFile("mark_for_compilation_annotated", new_graph,
-                                options.flib_def);
+    DumpGraphToFile("mark_for_compilation_annotated", new_graph,
+                    options.flib_def);
   }
 
   VLogClusteringSummary(*graph);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index c2b6250f738fafa35b2c5f79e97cf1281b50a316..da0fbf35de5d6594d00b20fe6ee698050075fd8f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/node_matchers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -38,6 +39,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
+using ::tensorflow::testing::FindNodeByName;
+
 namespace tensorflow {
 namespace {
 
@@ -192,35 +195,6 @@ TEST(XlaCompilationTest, HalfSupported) {
   EXPECT_FALSE(clusters.empty());
 }
 
-TEST(XlaCompilationTest, ConcatWithConstArg) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  GraphDef graphdef;
-  {
-    Tensor t(DT_INT32, TensorShape());
-    t.scalar<int32>()() = 0;
-    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-    Node* dim = ops::SourceOp("Const", builder.opts()
-                                           .WithName("Dim")
-                                           .WithAttr("dtype", DT_INT32)
-                                           .WithAttr("value", t));
-    Node* a = ops::SourceOp("Const", builder.opts()
-                                         .WithName("A")
-                                         .WithAttr("dtype", DT_FLOAT)
-                                         .WithAttr("value", t));
-
-    NodeBuilder concat_builder("Concat", "Concat",
-                               builder.opts().op_registry());
-    concat_builder.Input(dim).Input({a, a}).Attr("N", 2);
-    builder.opts().FinalizeBuilder(&concat_builder);
-
-    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
-  }
-
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  auto clusters = GetClusters(*graph);
-  EXPECT_EQ(3, clusters.size());  // Everything should be compiled.
-}
-
 TEST(XlaCompilationTest, FunctionCalls) {
   FunctionDef compilable = FunctionDefHelper::Define(
       "CompilableFn", {"n_a:float", "n_b:float"}, {"n_c:float"}, {},
@@ -424,12 +398,8 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopesGlobalJitOverridden) {
 
   FunctionDefLibrary flib;
   FunctionLibraryDefinition flib_def(graph->op_registry(), flib);
-  SessionOptions session_options;
-  session_options.config.mutable_graph_options()
-      ->mutable_optimizer_options()
-      ->set_global_jit_level(OptimizerOptions::ON_2);
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(
-      &graph, &flib_def, &session_options));
+  TF_ASSERT_OK(
+      MarkForCompilationPassTestHelper::MarkForCompilation(&graph, &flib_def));
   auto clusters = GetClusters(*graph);
 
   // The computation is: C = A + relu(A)
@@ -460,7 +430,8 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) {
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+  TF_ASSERT_OK(
+      MarkForCompilationPassTestHelper::MarkForCompilation(&graph, false));
   auto clusters = GetClusters(*graph);
 
   // The computation is: C = A + relu(A)
@@ -478,20 +449,28 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) {
                                          .WithName("A")
                                          .WithAttr("dtype", DT_FLOAT)
                                          .WithAttr("value", Tensor())
+                                         .WithAttr(kXlaCompileAttr, true)
                                          .WithAttr(kXlaScopeAttr, "Scope1"));
-    Node* b = ops::UnaryOp(
-        "Relu", a,
-        builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "Scope1"));
-    Node* c = ops::BinaryOp(
-        "MatMul", a, b,
-        builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "Scope2"));
-    ops::BinaryOp(
-        "Add", b, c,
-        builder.opts().WithName("D").WithAttr(kXlaScopeAttr, "Scope2"));
+    Node* b = ops::UnaryOp("Relu", a,
+                           builder.opts()
+                               .WithName("B")
+                               .WithAttr(kXlaCompileAttr, true)
+                               .WithAttr(kXlaScopeAttr, "Scope1"));
+    Node* c = ops::BinaryOp("MatMul", a, b,
+                            builder.opts()
+                                .WithName("C")
+                                .WithAttr(kXlaCompileAttr, true)
+                                .WithAttr(kXlaScopeAttr, "Scope2"));
+    ops::BinaryOp("Add", b, c,
+                  builder.opts()
+                      .WithName("D")
+                      .WithAttr(kXlaCompileAttr, true)
+                      .WithAttr(kXlaScopeAttr, "Scope2"));
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+  TF_ASSERT_OK(
+      MarkForCompilationPassTestHelper::MarkForCompilation(&graph, false));
   auto clusters = GetClusters(*graph);
 
   // The computation is: D = relu(A) + (A @ relu(A))
@@ -513,31 +492,39 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
                                          .WithName("A")
                                          .WithAttr("dtype", DT_FLOAT)
                                          .WithAttr("value", Tensor())
+                                         .WithAttr(kXlaCompileAttr, true)
                                          .WithAttr(kXlaScopeAttr, "ScopeA"));
-    Node* b = ops::UnaryOp(
-        "Relu", a,
-        builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB"));
+    Node* b = ops::UnaryOp("Relu", a,
+                           builder.opts()
+                               .WithName("B")
+                               .WithAttr(kXlaCompileAttr, true)
+                               .WithAttr(kXlaScopeAttr, "ScopeB"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
     TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+  TF_ASSERT_OK(
+      MarkForCompilationPassTestHelper::MarkForCompilation(&graph, false));
   auto clusters = GetClusters(*graph);
 
   // The computation is: C = A @ relu(A)
   // where A sits in ScopeA, relu(A) sits in ScopeB, and C sits in ScopeC.
   // In this case, we cannot fuse anything.
-  EXPECT_EQ(2, clusters.size());
+  EXPECT_EQ(3, clusters.size());
   EXPECT_NE(clusters["A"], clusters["B"]);
   EXPECT_EQ(clusters["B"], clusters["C"]);
 }
 
 namespace {
-Node* MakeRead(const Scope& scope, const string& id) {
+Node* MakeRead(const Scope& scope, const string& id,
+               Node** var_handle_op = nullptr) {
   Output var_handle =
       ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
   Output read =
       ops::ReadVariableOp(scope.WithOpName("Read" + id), var_handle, DT_FLOAT);
+  if (var_handle_op) {
+    *var_handle_op = var_handle.node();
+  }
   return read.node();
 }
 
@@ -590,10 +577,7 @@ TEST(XlaCompilationTest, ResourcesClusteringDisallowed) {
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   absl::flat_hash_map<string, std::vector<string>> cluster_sets =
       GetClusterSets(*graph);
-  ASSERT_EQ(cluster_sets.size(), 1);
-  std::vector<string> expected_clustered_nodes = {"AssignmentW",
-                                                  "ValueToAssignW"};
-  ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes);
+  ASSERT_EQ(cluster_sets.size(), 0);
 }
 
 TEST(XlaCompilationTest, ChainOfOps) {
@@ -621,15 +605,11 @@ TEST(XlaCompilationTest, ChainOfOps) {
   absl::flat_hash_map<string, std::vector<string>> cluster_sets =
       GetClusterSets(*graph, &cluster_names);
 
-  ASSERT_EQ(cluster_sets.size(), 2);
-
-  std::vector<string> expected_clustered_nodes_a = {"AssignmentW0", "ConstN0",
-                                                    "ValueToAssignW0"};
-  ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a);
+  ASSERT_EQ(cluster_sets.size(), 1);
 
-  std::vector<string> expected_clustered_nodes_b = {
+  std::vector<string> expected_clustered_nodes_a = {
       "AssignmentW1", "ConstN1", "ReadR0", "ValueToAssignW1"};
-  ASSERT_EQ(cluster_sets[cluster_names[1]], expected_clustered_nodes_b);
+  ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a);
 }
 
 TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
@@ -688,9 +668,7 @@ TEST(XlaCompilationTest, Retval) {
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
 
-  EXPECT_EQ(2, clusters.size());
-  EXPECT_TRUE(clusters.find("R") == clusters.cend());
-  EXPECT_EQ(clusters["A"], clusters["B"]);
+  EXPECT_TRUE(clusters.empty());
 }
 
 TEST(XlaCompilationTest, DontCountIdentityOps) {
@@ -709,22 +687,6 @@ TEST(XlaCompilationTest, DontCountIdentityOps) {
   EXPECT_TRUE(clusters.empty());
 }
 
-TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  Scope root = Scope::NewRootScope().ExitOnError();
-  {
-    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
-    auto b = ops::Identity(root.WithOpName("B"), a);
-    b.node()->AddAttr(kXlaCompileAttr, true);
-    auto r = ops::_Retval(root.WithOpName("R"), b, 0);
-  }
-  TF_ASSERT_OK(root.ToGraph(graph.get()));
-  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  auto clusters = GetClusters(*graph);
-
-  EXPECT_TRUE(clusters.empty());
-}
-
 TEST(XlaCompilationTest, ConstOp) {
   // valid data type
   {
@@ -980,8 +942,10 @@ TEST(XlaCompilationTest, DontClusterMergingNodes) {
   absl::string_view xla_gpu_dev1 =
       "/job:worker/replica:0/task:0/device:XLA_GPU:1";
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  Output a = ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2});
-  Output b = ops::Const(root.WithOpName("B_dev1"), 1.0f, {2, 2});
+  Output a = ops::Tanh(root.WithOpName("tanh_A_dev0"),
+                       ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2}));
+  Output b = ops::Tanh(root.WithOpName("tanh_B_dev1"),
+                       ops::Const(root.WithOpName("B_dev1"), 1.0f, {2, 2}));
   Output matmul0 = ops::MatMul(root.WithOpName("MatMul0_dev0"), a, a);
   Output matmul1 = ops::MatMul(root.WithOpName("MatMul1_dev1"), b, b);
 
@@ -1223,5 +1187,132 @@ TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) {
   EXPECT_NE(clusters["test/tensor_list_reserve"], "");
 }
 
+const char* kCPU0 = "/job:worker/replica:0/task:0/device:CPU:0";
+const char* kGPU0 = "/job:worker/replica:0/task:0/device:GPU:0";
+const char* kXLA_GPU0 = "/job:worker/replica:0/task:0/device:XLA_GPU:0";
+const char* kGPU1 = "/job:worker/replica:0/task:0/device:GPU:1";
+
+TEST(XlaCompilationTest, CreateCombinedCpuGpuClusters) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+
+  Output x = ops::Add(root.WithOpName("test/x"), a, b);
+  Output y = ops::MatMul(root.WithOpName("test/y"), a, b);
+  Output z = ops::Add(root.WithOpName("test/z"), x, y);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
+  FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kCPU0);
+  FindNodeByName(graph.get(), "test/z")->set_assigned_device_name(kGPU0);
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  EXPECT_NE(clusters["test/x"], "");
+
+  EXPECT_EQ(clusters["test/x"], clusters["test/y"]);
+  EXPECT_EQ(clusters["test/y"], clusters["test/z"]);
+}
+
+TEST(XlaCompilationTest, DontCreateGpu0AndGpu1Clusters) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+
+  Output x = ops::Add(root.WithOpName("test/x"), a, b);
+  Output y = ops::Add(root.WithOpName("test/y"), x, x);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
+  FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kGPU1);
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  EXPECT_EQ(clusters["test/x"], "");
+  EXPECT_EQ(clusters["test/y"], "");
+}
+
+TEST(XlaCompilationTest, DontCreateCombinedCpuUnknownClusters) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+
+  Output x = ops::Add(root.WithOpName("test/x"), a, b);
+  Output y = ops::Add(root.WithOpName("test/y"), x, x);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kCPU0);
+  FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kXLA_GPU0);
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  EXPECT_EQ(clusters["test/x"], "");
+  EXPECT_EQ(clusters["test/y"], "");
+}
+
+TEST(XlaCompilationTest, ClusterResourceOpsWhenSafe) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Node* var_handle;
+  Node* resource_read = MakeRead(root, "read", &var_handle);
+  Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a);
+
+  string resource_read_name = resource_read->name();
+  string var_handle_name = var_handle->name();
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  FindNodeByName(graph.get(), "test/b")->set_assigned_device_name(kCPU0);
+  FindNodeByName(graph.get(), resource_read_name)
+      ->set_assigned_device_name(kGPU0);
+  FindNodeByName(graph.get(), var_handle_name)->set_assigned_device_name(kGPU0);
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  EXPECT_NE(clusters["test/b"], "");
+  EXPECT_EQ(clusters["test/b"], clusters[resource_read_name]);
+}
+
+TEST(XlaCompilationTest, DontClusterResourceOpsWhenUnsafe) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Node* var_handle;
+  Node* resource_read = MakeRead(root, "read", &var_handle);
+  Output b = ops::Add(root.WithOpName("test/b"), Output(resource_read, 0), a);
+
+  string resource_read_name = resource_read->name();
+  string var_handle_name = var_handle->name();
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  FindNodeByName(graph.get(), "test/b")->set_assigned_device_name(kGPU0);
+  FindNodeByName(graph.get(), resource_read_name)
+      ->set_assigned_device_name(kCPU0);
+  FindNodeByName(graph.get(), var_handle_name)->set_assigned_device_name(kCPU0);
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  EXPECT_EQ(clusters["test/b"], "");
+  EXPECT_EQ(clusters[resource_read_name], "");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
index 64a3301745790132fe3149bf8fb52d6c45ecc3c1..5f0ebe150fa0300940d52e036f7a60ca9fef22e5 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
@@ -21,7 +21,7 @@ limitations under the License.
 namespace tensorflow {
 /*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
     std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def,
-    SessionOptions* session_options) {
+    bool enable_global_jit) {
   // Assign all unassigned nodes to the CPU device.
   static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
   for (Node* n : (*graph)->nodes()) {
@@ -30,31 +30,32 @@ namespace tensorflow {
     }
   }
 
+  SessionOptions session_options;
+  if (enable_global_jit) {
+    session_options.config.mutable_graph_options()
+        ->mutable_optimizer_options()
+        ->set_global_jit_level(OptimizerOptions::ON_2);
+  }
+
   // Call AddDevices to register the XLA devices.
   //
   // It may be worth refactoring out XlaOpRegistry::RegisterCompilationDevice to
   // make this more direct, but probably not worth it solely for this test.
   std::vector<std::unique_ptr<Device>> devices;
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(*session_options, "", &devices));
+  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_options, "", &devices));
 
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
-  opt_options.session_options = session_options;
+  opt_options.session_options = &session_options;
   opt_options.flib_def = flib_def;
   MarkForCompilationPass pass;
   return pass.RunImpl(opt_options);
 }
 
 /*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
-    std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def) {
-  SessionOptions session_options;
-  return MarkForCompilation(graph, flib_def, &session_options);
-}
-
-/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
-    std::unique_ptr<Graph>* graph) {
+    std::unique_ptr<Graph>* graph, bool enable_global_jit) {
   FunctionDefLibrary flib;
   FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
-  return MarkForCompilation(graph, &flib_def);
+  return MarkForCompilation(graph, &flib_def, enable_global_jit);
 }
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
index 216baaf933dc1f7e694289eea5d23996b595f4d4..df751978562aab8b89aa3966a461c614b1adde5b 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
@@ -23,17 +23,14 @@ class MarkForCompilationPassTestHelper {
  public:
   // Runs the MarkForCompilation pass on `graph` after assigning all nodes in
   // `graph` to the CPU device.  To make testing easier, ignores device
-  // registration, _XlaCompile attributes, input deadness and global jit level.
+  // registration, _XlaCompile attributes and input deadness.
   static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
                                    FunctionLibraryDefinition* flib_def,
-                                   SessionOptions* session_options);
-
-  // Like `MarkForCompilation` but creates a default SessionOptions.
-  static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
-                                   FunctionLibraryDefinition* flib_def);
+                                   bool enable_global_jit = true);
 
   // Like `MarkForCompilation` but creates `flib_def` from the op registry.
-  static Status MarkForCompilation(std::unique_ptr<Graph>* graph);
+  static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
+                                   bool enable_global_jit = true);
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index e1fd2aaee2822daeffb415d053c9c4f56002a856..ffc5d0edbcc7668d5ee137c3c8bbe74167e37a1a 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -20,9 +20,13 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 namespace {
@@ -272,12 +276,20 @@ Status MustCompileNode(const Node* n, bool* must_compile) {
 // We assume here that the extra repeated (repeated compared to a clustered f
 // where it will always be constant folded) host-side computation of f does not
 // regress performance in any significant manner.  We will have to revisit this
-// algorith with a more complex cost model if this assumption turns out to be
+// algorithm with a more complex cost model if this assumption turns out to be
 // incorrect.
-Status PartiallyDeclusterGraph(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph,
+                               const FunctionLibraryDefinition* flib_def,
+                               Env* env) {
   std::vector<bool> compile_time_const_nodes(graph->num_node_ids());
-  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
-      *graph, nullptr, &compile_time_const_nodes, IsIntraClusterEdge));
+  OptimizerOptions opts;
+  auto pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      nullptr, env, TF_GRAPH_DEF_VERSION, flib_def, opts);
+  FunctionLibraryRuntime* lib_runtime =
+      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*graph, nullptr,
+                                            &compile_time_const_nodes,
+                                            lib_runtime, IsIntraClusterEdge));
 
   std::vector<Node*> rpo;
   GetReversePostOrder(*graph, &rpo, /*stable_comparator=*/NodeComparatorName(),
@@ -341,7 +353,19 @@ Status PartiallyDeclusterPass::Run(
 
   TF_RETURN_IF_ERROR(
       reduce_device_to_host_copies::PartiallyDeclusterGraph(graph));
-  TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(graph));
+  if (options.flib_def == nullptr) {
+    return errors::InvalidArgument(
+        "GraphOptimizationPassOptions::flib_def must be set for "
+        "PartiallyDeclusterPass.");
+  }
+  if (options.session_options == nullptr ||
+      options.session_options->env == nullptr) {
+    return errors::InvalidArgument(
+        "GraphOptimizationPassOptions::session_options::env must be set for "
+        "PartiallyDeclusterPass.");
+  }
+  TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(
+      graph, options.flib_def, options.session_options->env));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 1d81a8f4fcbf050663626b1f7660afd71f4027bc..3494d0ee7efb51a5620f68bc1772e111db493c8d 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -90,6 +92,12 @@ Status PartiallyDecluster(std::unique_ptr<Graph>* graph) {
 
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
+  FunctionDefLibrary fdef_lib;
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
+  opt_options.flib_def = &flib_def;
+  SessionOptions session_options;
+  session_options.env = Env::Default();
+  opt_options.session_options = &session_options;
   PartiallyDeclusterPass pass;
   return pass.Run(opt_options);
 }
diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc
index a27e0d9f2a6ecddfdbdb29be673084d77a178d8a..a9c53a943bee58355b634586806c5bedd6fe67b5 100644
--- a/tensorflow/compiler/jit/shape_inference.cc
+++ b/tensorflow/compiler/jit/shape_inference.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include "tensorflow/compiler/jit/shape_inference.h"
 
 #include "tensorflow/compiler/jit/shape_inference_helpers.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index fef28fc810cb4e544fe3f271f0b96cebd8a96779..cb8ac06207e00395ef35c23dc96f8461182aa45f 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -17,11 +17,19 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -43,7 +51,7 @@ string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src,
     return "";
   }
 
-  auto node_name = [cycles, &graph](int node_id) {
+  auto node_name = [&graph](int node_id) {
     if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
       return string("(null)");
     }
@@ -103,7 +111,8 @@ bool HasForwardedRefInput(const Node& node) {
   return false;
 }
 
-Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
+xla::StatusOr<bool> CreateCycleDetectionGraph(const Graph* graph,
+                                              GraphCycles* cycles) {
   for (int i = 0; i < graph->num_node_ids(); ++i) {
     // We rely on the node IDs in the cycle detection graph being consecutive
     // integers starting from 0.
@@ -166,9 +175,11 @@ Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
       }
 
       if (!cycles->InsertEdge(src, dst)) {
-        return errors::Internal(
-            "Cycle detected when adding ", src_type, "->", dst_type,
-            " edge: ", DescribeCycle(cycles, *graph, src, dst));
+        // TODO(b/127521408): We can probably handle this situation with a more
+        // sophisticated SCC based algorithm, but for now we bail out.
+        VLOG(1) << "Cycle detected when adding " << src_type << "->" << dst_type
+                << " edge: " << DescribeCycle(cycles, *graph, src, dst);
+        return false;
       }
       // Drop the original edge.
       continue;
@@ -186,7 +197,8 @@ Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
           DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id()));
     }
   }
-  return Status::OK();
+
+  return true;
 }
 
 absl::optional<absl::string_view> GetXlaClusterForNode(const Node& node) {
@@ -236,4 +248,126 @@ Status AdjustCycleDetectionGraphForResourceOps(
   return Status::OK();
 }
 
+Status PickDeviceForXlaImpl(absl::Span<const string> device_names,
+                            bool allow_mixing_unknown_and_cpu,
+                            bool* out_can_pick_device,
+                            string* out_device_picked) {
+  if (out_can_pick_device) {
+    *out_can_pick_device = true;
+  }
+
+#define FAILED_TO_PICK_DEVICE(failing_status) \
+  do {                                        \
+    if (out_can_pick_device) {                \
+      *out_can_pick_device = false;           \
+      return Status::OK();                    \
+    } else {                                  \
+      return failing_status;                  \
+    }                                         \
+  } while (false)
+
+  TF_RET_CHECK(!device_names.empty()) << "No devices to choose from";
+  DCHECK_NE(out_can_pick_device == nullptr, out_device_picked == nullptr);
+
+  absl::flat_hash_set<absl::string_view> device_names_set;
+  for (absl::string_view device_name : device_names) {
+    if (!device_name.empty()) {
+      device_names_set.insert(device_name);
+    }
+  }
+
+  absl::optional<absl::string_view> maybe_gpu_device;
+  absl::optional<absl::string_view> maybe_cpu_device;
+  absl::optional<absl::string_view> maybe_unknown_device;
+
+  for (absl::string_view device_name : device_names_set) {
+    DeviceNameUtils::ParsedName parsed_name;
+    TF_RET_CHECK(DeviceNameUtils::ParseFullName(device_name, &parsed_name))
+        << device_name;
+    if (parsed_name.type == "GPU") {
+      if (maybe_gpu_device) {
+        FAILED_TO_PICK_DEVICE(errors::Internal(
+            "Multiple GPU devices ", absl::StrJoin(device_names, ", ")));
+      }
+      maybe_gpu_device = device_name;
+    } else if (parsed_name.type == "CPU") {
+      if (maybe_cpu_device) {
+        FAILED_TO_PICK_DEVICE(errors::Internal(
+            "Multiple CPU devices ", absl::StrJoin(device_names, ", ")));
+      }
+      maybe_cpu_device = device_name;
+    } else {
+      if (maybe_unknown_device) {
+        FAILED_TO_PICK_DEVICE(errors::Internal(
+            "Multiple unknown devices ", absl::StrJoin(device_names, ", ")));
+      }
+      maybe_unknown_device = device_name;
+    }
+  }
+
+  if (maybe_unknown_device && maybe_gpu_device) {
+    FAILED_TO_PICK_DEVICE(errors::Internal(
+        "Found both unknown and GPU devices: ", *maybe_unknown_device, ", ",
+        *maybe_gpu_device));
+  }
+
+  if (!allow_mixing_unknown_and_cpu) {
+    if (maybe_unknown_device && maybe_cpu_device) {
+      FAILED_TO_PICK_DEVICE(errors::Internal(
+          "Found both unknown and CPU devices: ", *maybe_unknown_device, ", ",
+          *maybe_cpu_device));
+    }
+  }
+
+  if (out_device_picked) {
+    if (maybe_gpu_device) {
+      *out_device_picked = string(*maybe_gpu_device);
+    } else if (maybe_unknown_device) {
+      *out_device_picked = string(*maybe_unknown_device);
+    } else {
+      *out_device_picked = string(*maybe_cpu_device);
+    }
+  }
+
+  return Status::OK();
+
+#undef FAILED_TO_PICK_DEVICE
+}
+
+Status PickDeviceForXla(absl::Span<const string> device_names,
+                        bool allow_mixing_unknown_and_cpu,
+                        string* out_device_picked) {
+  return PickDeviceForXlaImpl(device_names, allow_mixing_unknown_and_cpu,
+                              /*out_can_pick_device=*/nullptr,
+                              out_device_picked);
+}
+
+Status CanPickDeviceForXla(absl::Span<const string> device_names,
+                           bool allow_mixing_unknown_and_cpu,
+                           bool* out_can_pick_device) {
+  return PickDeviceForXlaImpl(device_names, allow_mixing_unknown_and_cpu,
+                              out_can_pick_device,
+                              /*out_device_picked=*/nullptr);
+}
+
+OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
+    const GraphOptimizationPassOptions& options) {
+  OptimizerOptions::GlobalJitLevel global_jit_level =
+      options.session_options->config.graph_options()
+          .optimizer_options()
+          .global_jit_level();
+  if (global_jit_level == OptimizerOptions::DEFAULT) {
+    // To set compilation to be on by default, change the following line.
+    global_jit_level = OptimizerOptions::OFF;
+  }
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+  if (flags->tf_xla_auto_jit != OptimizerOptions::DEFAULT) {
+    // If the flag tf_xla_auto_jit is a valid, non-DEFAULT setting, it overrides
+    // the setting in ConfigProto.
+    global_jit_level =
+        static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
+  }
+  return global_jit_level;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index fa6eaab3900b37baf7271c8c431c8384ceeda59f..af01e1d30231b0bb2c4aac5183e8b02c2e595135 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -20,7 +20,10 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
 
 namespace tensorflow {
 
@@ -52,7 +55,11 @@ bool HasForwardedRefInput(const Node& node);
 // Creates a graph representation to enable cycle detection when clustering.
 // This representation handles loops in graph by disconnecting each loop from
 // the enclosing graph.
-Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles);
+//
+// Returns true for success and false for valid graphs that we can't handle yet
+// (b/127521408).
+xla::StatusOr<bool> CreateCycleDetectionGraph(const Graph* graph,
+                                              GraphCycles* cycles);
 
 // Returns the XLA cluster in which `node` is placed if it is in an XLA cluster,
 // otherwise returns nullopt.
@@ -74,6 +81,57 @@ Status AdjustCycleDetectionGraphForResourceOps(
     const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
     GraphCycles* cycles);
 
+// Picks the device for which XLA should compile a cluster that contains
+// operations placed in devices in `device_names`.  For instance a cluster that
+// contains operations solely placed on the CPU will be compiled into a CPU
+// executable by XLA, whereas a cluster that contains operations placed on the
+// CPU and also operations placed on the GPU will be compiled into a GPU
+// executable.
+//
+// Returns a non-OK Status if no unambiguous choice of device exists.
+//
+// We choose the device using the following rules:
+//
+//  - It is an error for `device_names` to contain more than one device of the
+//    same type.
+//  - GPU is preferred over CPU.
+//  - If `allow_mixing_unknown_and_cpu` is true then unknown devices are
+//    preferred over CPU.
+//  - XLA devices count as "unrecognized devices".
+//
+// This set of rules above implicitly assume that XLA:GPU can compile all
+// operations in the cluster that XLA:CPU can compile, and if
+// `allow_mixing_unknown_and_cpu` then the unrecognized device can also compile
+// all operations in the cluster that XLA:CPU can compile.
+//
+// We provide the `allow_mixing_unknown_and_cpu` knob so that we can do both of
+// the following things:
+//
+// - Let MarkForCompilationPass not inject CPU-placed operations into clusters
+//   that will run on unknown devices (because the unknown XLA backend may not
+//   support every operation supported by CPU).
+// - Let BuildXlaOpsPass successfully infer a compilation device for a cluster
+//   that contains nodes placed on both the CPU and on unknown devices.  In this
+//   case it is the responsibility of the optimization pass that injected the
+//   CPU nodes into the cluster to ensure that these nodes can be compiled by
+//   the unknown XLA backend.
+Status PickDeviceForXla(absl::Span<const string> device_names,
+                        bool allow_mixing_unknown_and_cpu,
+                        string* out_device_picked);
+
+// This is like `PickDeviceForXla` except that it returns false (instead of a
+// non-OK Status) in `out_can_pick_device` if no unambiguous choice of device
+// exists.
+Status CanPickDeviceForXla(absl::Span<const string> device_names,
+                           bool allow_mixing_unknown_and_cpu,
+                           bool* out_can_pick_device);
+
+// Determine the global jit level which is ON if either the
+// GraphOptimizationPassOptions has the jit ON, or if the --tf_xla_auto_jit flag
+// is true.
+OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
+    const GraphOptimizationPassOptions& options);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc
index 65bbf3efe85ba30f44531ff6d54b041786dca0a5..cbaac719f2e62cb85e8ac5893d0f4d50be0dae5c 100644
--- a/tensorflow/compiler/jit/xla_cluster_util_test.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc
@@ -15,9 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -42,7 +45,7 @@ TEST(CreateCycleDetectionGraph, ConnectivityThroughEnterExitRegion) {
   FixupSourceAndSinkEdges(root.graph());
 
   GraphCycles cycles;
-  TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles));
+  TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles).status());
   EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id()));
 }
 
@@ -61,8 +64,93 @@ TEST(CreateCycleDetectionGraph, ConnectivityThroughMultipleEnterExitRegions) {
   FixupSourceAndSinkEdges(root.graph());
 
   GraphCycles cycles;
-  TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles));
+  TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles).status());
   EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id()));
 }
+
+TEST(CreateCycleDetectionGraph, ReachingEnterExit) {
+  // TODO(b/127521408): We can lift this limitation with some work.
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output a = ops::Const(root.WithOpName("a"), Input::Initializer(0.0));
+  Output enter_0 =
+      ops::internal::Enter(root.WithOpName("enter_0"), a, "frame_0");
+  Output exit_0 = ops::internal::Exit(root.WithOpName("exit_0"), enter_0);
+
+  Output add = ops::Add(root.WithOpName("add"), exit_0, exit_0);
+
+  Output enter_1 =
+      ops::internal::Enter(root.WithOpName("enter_1"), add, "frame_0");
+  Output exit_1 = ops::internal::Exit(root.WithOpName("exit_1"), enter_1);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  GraphCycles cycles;
+  TF_ASSERT_OK_AND_ASSIGN(bool ok,
+                          CreateCycleDetectionGraph(root.graph(), &cycles));
+  EXPECT_FALSE(ok);
+}
+
+void CheckPickDeviceResult(absl::string_view expected_result,
+                           bool allow_mixing_unknown_and_cpu,
+                           absl::Span<const absl::string_view> inputs) {
+  std::vector<string> inputs_string;
+  absl::c_transform(inputs, std::back_inserter(inputs_string),
+                    [](absl::string_view sv) { return string(sv); });
+  string result;
+  TF_ASSERT_OK(
+      PickDeviceForXla(inputs_string, allow_mixing_unknown_and_cpu, &result))
+      << "inputs = [" << absl::StrJoin(inputs, ", ")
+      << "], allow_mixing_unknown_and_cpu=" << allow_mixing_unknown_and_cpu
+      << ", expected_result=" << expected_result;
+  EXPECT_EQ(result, expected_result);
+}
+
+void CheckPickDeviceHasError(bool allow_mixing_unknown_and_cpu,
+                             absl::Span<const absl::string_view> inputs) {
+  std::vector<string> inputs_string;
+  absl::c_transform(inputs, std::back_inserter(inputs_string),
+                    [](absl::string_view sv) { return string(sv); });
+  string result;
+  EXPECT_FALSE(
+      PickDeviceForXla(inputs_string, allow_mixing_unknown_and_cpu, &result)
+          .ok());
+}
+
+const char* kCPU0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+const char* kGPU0 = "/job:localhost/replica:0/task:0/device:GPU:0";
+const char* kXPU0 = "/job:localhost/replica:0/task:0/device:XPU:0";
+
+const char* kCPU1 = "/job:localhost/replica:0/task:0/device:CPU:1";
+const char* kGPU1 = "/job:localhost/replica:0/task:0/device:GPU:1";
+const char* kXPU1 = "/job:localhost/replica:0/task:0/device:XPU:1";
+
+TEST(PickDeviceForXla, UniqueDevice) {
+  CheckPickDeviceResult(kGPU0, false, {kGPU0, kGPU0});
+}
+
+TEST(PickDeviceForXla, DeviceOrder) {
+  CheckPickDeviceResult(kGPU0, false, {kGPU0, kCPU0});
+  CheckPickDeviceResult(kXPU0, true, {kXPU0, kCPU0});
+}
+
+TEST(PickDeviceForXla, MultipleUnknownDevices) {
+  CheckPickDeviceHasError(false, {kXPU0, kXPU1});
+}
+
+TEST(PickDeviceForXla, GpuAndUnknown) {
+  CheckPickDeviceHasError(false, {kGPU0, kXPU1});
+}
+
+TEST(PickDeviceForXla, UnknownAndCpu) {
+  CheckPickDeviceHasError(false, {kXPU0, kCPU1});
+}
+
+TEST(PickDeviceForXla, MultipleDevicesOfSameType) {
+  CheckPickDeviceHasError(false, {kCPU0, kCPU1});
+  CheckPickDeviceHasError(false, {kGPU0, kGPU1});
+  CheckPickDeviceHasError(false, {kXPU0, kXPU1});
+  CheckPickDeviceHasError(false, {kCPU0, kCPU1, kGPU0});
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bff4cc57ee1f3ac0fc12aaa93b1588553aec8c45..f53a1e5d403156c5a81925878927a45b8ff51716 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <numeric>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -30,11 +30,11 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -70,9 +70,9 @@ string XlaCompilationCache::DebugString() const {
 // arguments in the supplied list.
 string XlaCompilationCache::Signature::HumanString() const {
   string result = name;
-  for (const auto& a : arg_types) {
-    absl::StrAppend(&result, ",", DataTypeString(a.first),
-                    a.second.DebugString());
+  for (const auto& a : arg_shapes) {
+    absl::StrAppend(&result, ",", DataTypeString(a.first));
+    absl::StrAppend(&result, " [", absl::StrJoin(a.second, ","), "]");
   }
 
   for (const auto& v : arg_values) {
@@ -83,7 +83,7 @@ string XlaCompilationCache::Signature::HumanString() const {
 
 bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
   if (name != other.name) return false;
-  if (arg_types != other.arg_types) return false;
+  if (arg_shapes != other.arg_shapes) return false;
 
   if (arg_values.size() != other.arg_values.size()) return false;
   for (int i = 0; i < arg_values.size(); ++i) {
@@ -99,10 +99,10 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
 uint64 XlaCompilationCache::Signature::Hash::operator()(
     const XlaCompilationCache::Signature& signature) const {
   uint64 h = std::hash<string>()(signature.name);
-  for (const auto& arg : signature.arg_types) {
+  for (const auto& arg : signature.arg_shapes) {
     h = Hash64Combine(h, std::hash<int>()(static_cast<int>(arg.first)));
-    h = Hash64Combine(h, std::hash<int>()(arg.second.dims()));
-    for (int dim : arg.second.dim_sizes()) {
+    h = Hash64Combine(h, std::hash<int>()(arg.second.size()));
+    for (int dim : arg.second) {
       h = Hash64Combine(h, std::hash<int>()(dim));
     }
   }
@@ -126,7 +126,7 @@ XlaCompilationCache::BuildSignature(
         break;
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kResource:
-        signature.arg_types.emplace_back(arg.type, arg.shape);
+        signature.arg_shapes.emplace_back(arg.type, arg.DimensionSizes());
         break;
       default:
         return errors::InvalidArgument(
@@ -205,6 +205,10 @@ Status XlaCompilationCache::CompileSingleOp(
   NameAttrList name;
   name.set_name(def.op());
   *name.mutable_attr() = def.attr();
+  // Remove the "_class" attribute from the attribute set used to create the
+  // compilation cache key. This attribute is information for the colocator
+  // and causes false uniqueness between nodes.
+  name.mutable_attr()->erase("_class");
   auto compile_op = [&](XlaCompiler* compiler,
                         XlaCompiler::CompilationResult* result) {
     std::vector<DataType> result_dtypes(ctx->num_outputs());
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 02aa8f8839e2c033e06d043b0f17d89a08d5d9e6..7748b4700f39da4f952278ca6c6d2cadff4d3fb8 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -95,7 +95,9 @@ class XlaCompilationCache : public ResourceBase {
   struct Signature {
     string name;
 
-    std::vector<std::pair<DataType, TensorShape>> arg_types;
+    // List of Tensor types & shapes for compile-time constant arguments to the
+    // compilation, ordered by argument number.
+    std::vector<std::pair<DataType, std::vector<int64>>> arg_shapes;
 
     // List of Tensor values for compile-time constant arguments to the
     // compilation, ordered by argument number. Tensors must be in host memory.
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index 94dc61d55fb047c0ea81d98fde24cb55387c27d7..f6e73ab7fecebee80cfbea5f26cc12dd7fc32094 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -46,7 +46,7 @@ Status XlaCpuDeviceFactory::CreateDevices(
       compile_on_demand
           ? XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested
           : XlaOpRegistry::AutoclusteringPolicy::kAlways;
-  registration.compile_resource_ops = true;
+  registration.compile_all_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
@@ -83,9 +83,10 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 13> kAllXlaCpuTypes = {
+constexpr std::array<DataType, 14> kAllXlaCpuTypes = {
     {DT_UINT8, DT_QUINT8, DT_INT8, DT_QINT8, DT_INT32, DT_QINT32, DT_INT64,
-     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL}};
+     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL,
+     DT_BFLOAT16}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_CPU, XlaCompileOp, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index e2397f6fcb8677f4bd5151646f9ebacd3e23af5b..0c4a1ce80b9b7e731a92bf80f22a1705a7064c68 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -51,6 +50,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
@@ -102,7 +102,8 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      absl::make_unique<XlaDeviceAllocator>();
+      absl::make_unique<XlaDeviceAllocator>(
+          backend->stream_executors()[device_ordinal]);
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -289,17 +290,17 @@ xla::StatusOr<XlaDeviceContext*> XlaDevice::GetDeviceContextLocked() {
     TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "host_to_device_stream",
                                             &host_to_device_stream_,
                                             &need_new_device_context));
-    TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "device_to_host_stream",
-                                            &device_to_host_stream_,
-                                            &need_new_device_context));
     for (std::shared_ptr<se::Stream>& stream : device_to_device_streams_) {
       TF_RETURN_IF_ERROR(
           EnsureStreamOkLocked(backend, "device_to_device_stream", &stream,
                                &need_new_device_context));
     }
     host_to_device_stream = host_to_device_stream_;
-    device_to_host_stream = device_to_host_stream_;
     device_to_device_streams = device_to_device_streams_;
+    // The data transfer requests from device to host could arrive out of order,
+    // so a single stream would cause deadlock. For this case,
+    // xla_device_context would borrow a stream for each transfer request.
+    device_to_host_stream = nullptr;
   } else {
     host_to_device_stream = stream_;
     device_to_host_stream = stream_;
@@ -428,7 +429,7 @@ void XlaDevice::Sync(const DoneCallback& done) {
   // moment--when ThenEnqueueOnBackgroundThread is called--will have finished.
   // This achieves a device-wide sync.
   stream->ThenEnqueueOnBackgroundThread(
-      [this, stream, done](se::StreamExecutor*) {
+      [stream, done](se::StreamExecutor*) {
         tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
                                          /*is_expensive=*/true);
         done(stream->ok() ? Status::OK()
@@ -479,7 +480,24 @@ bool XlaDevice::AllowsSyncOnCompletion() const {
   return sync_on_completion_;
 }
 
-Status XlaDevice::CurrentStatus() {
+void XlaDevice::SetHandleDeviceErrorCallback(std::function<Status()> callback) {
+  mutex_lock lock(mu_);
+  device_error_callback_ = callback;
+}
+
+Status XlaDevice::HandleDeviceError() {
+  std::function<Status()> local_device_error_callback;
+  {
+    mutex_lock lock(mu_);
+    local_device_error_callback = device_error_callback_;
+  }
+  if (local_device_error_callback != nullptr) {
+    return local_device_error_callback();
+  }
+  return Status::OK();
+}
+
+Status XlaDevice::RefreshStatus() {
   std::shared_ptr<se::Stream> stream;
   {
     mutex_lock lock(mu_);
@@ -488,7 +506,14 @@ Status XlaDevice::CurrentStatus() {
   if (!stream) {
     return Status::OK();
   }
-  return stream->ok() ? Status::OK() : errors::Internal("XlaDevice is not OK.");
+  Status status = stream->RefreshStatus();
+  if (!status.ok()) {
+    // Ignore errors from HandleDeviceError, since by definition the status is
+    // already non-ok, so there's nothing extra to report if HandleDeviceError
+    // itself returns an error.
+    HandleDeviceError().IgnoreError();
+  }
+  return status;
 }
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index e35a1c7d29514dc5777bdbd3858c56401d7b9044..51910c6fabc7e3565ef89b8eb2852f3257162055 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -169,10 +169,12 @@ class XlaDevice : public LocalDevice {
   // Instructs this XlaDevice to return 'sync_on_completion' for
   // AllowsSyncOnCompletion().
   void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
-
   bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
-  Status CurrentStatus() override LOCKS_EXCLUDED(mu_);
+  // Installs an error handling callback when RefreshStatus sees !status.ok().
+  void SetHandleDeviceErrorCallback(std::function<Status()> callback);
+
+  Status RefreshStatus() override LOCKS_EXCLUDED(mu_);
 
  private:
   xla::LocalClient* client() const;
@@ -188,6 +190,9 @@ class XlaDevice : public LocalDevice {
   static Status GetMetadataFromDevice(DeviceBase* device,
                                       const XlaDevice::Metadata** metadata);
 
+  // Handles error when RefreshStatus sees !status.ok().
+  Status HandleDeviceError();
+
   mutable mutex mu_;
   // The metadata of this XlaDevice.
   const Metadata xla_metadata_;
@@ -207,14 +212,12 @@ class XlaDevice : public LocalDevice {
   std::shared_ptr<se::Stream> stream_ GUARDED_BY(mu_);
   // If false, only stream_ is valid and all computation and transfers use
   // stream_. If true, computation is performed by stream_ and transfers are
-  // performed by host_to_device/device_to_host_stream.
+  // performed by host_to_device/device_to_device stream or borrowing a stream
+  // for each device to host transfer.
   const bool use_multiple_streams_;
   // If use_multiple_streams_, host to device transfers are performed using this
   // stream.
   std::shared_ptr<se::Stream> host_to_device_stream_ GUARDED_BY(mu_);
-  // If use_multiple_streams_, device to host transfers are performed using this
-  // stream.
-  std::shared_ptr<se::Stream> device_to_host_stream_ GUARDED_BY(mu_);
   // If use_multiple_streams_, transfers between different devices are performed
   // using these streams.
   std::vector<std::shared_ptr<se::Stream>> device_to_device_streams_
@@ -238,6 +241,9 @@ class XlaDevice : public LocalDevice {
   // regardless of status.
   bool sync_on_completion_ GUARDED_BY(mu_) = true;
 
+  // A callback that will be invoked when RefreshStatus sees a status error.
+  std::function<Status()> device_error_callback_ GUARDED_BY(mu_);
+
   // Set of devices to use. This controls which of the devices on the given
   // platform will have resources allocated. For GPUs this will be
   // filled from visible_gpu_devices list from session configuration.
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 1f3afe8822d441a5ce37617fe18d7767e9bc72e4..b273cbb36e12b9d3d3750c5f21ff4c2ff6e7a6a5 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -25,11 +25,15 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
+#include "tensorflow/stream_executor/platform/port.h"
 
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator() {}
+XlaDeviceAllocator::XlaDeviceAllocator(
+    stream_executor::StreamExecutor* stream_executor)
+    : stream_executor_(stream_executor) {}
+
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
@@ -48,7 +52,21 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   delete XlaTensor::FromOpaquePointer(ptr);
 }
 
-void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
+absl::optional<AllocatorStats> XlaDeviceAllocator::GetStats() {
+  absl::optional<stream_executor::AllocatorStats> se_stats =
+      stream_executor_->GetAllocatorStats();
+  if (!se_stats) {
+    return absl::nullopt;
+  }
+
+  tensorflow::AllocatorStats tf_stats;
+  tf_stats.num_allocs = se_stats->num_allocs;
+  tf_stats.bytes_in_use = se_stats->bytes_in_use;
+  tf_stats.peak_bytes_in_use = se_stats->peak_bytes_in_use;
+  tf_stats.largest_alloc_size = se_stats->largest_alloc_size;
+  tf_stats.bytes_limit = se_stats->bytes_limit;
+  return tf_stats;
+}
 
 XlaDeviceContext::XlaDeviceContext(
     std::shared_ptr<se::Stream> compute_stream,
@@ -67,7 +85,6 @@ XlaDeviceContext::XlaDeviceContext(
       shape_representation_fn_(std::move(shape_representation_fn)),
       thread_pool_(thread_pool) {
   CHECK(host_to_device_stream_ != nullptr);
-  CHECK(device_to_host_stream_ != nullptr);
   CHECK(stream_ != nullptr);
   if (!shape_representation_fn_) {
     shape_representation_fn_ = [](const TensorShape& shape,
@@ -131,7 +148,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         xla::ShapeUtil::MakeShape(shape.element_type(),
                                   xla::AsInt64Slice(shape.dimensions())));
 
-    VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+    VLOG(2) << "Transfer to device as literal: " << literal.ToString() << " "
             << xla_tensor->shaped_buffer().ToString();
     if (UseMultipleStreams() &&
         !transfer_manager_->CanShapedBufferBeAccessedNow(
@@ -196,8 +213,23 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
           << cpu_tensor->shape().DebugString() << " "
           << device_tensor->shape().DebugString();
 
+  std::shared_ptr<se::Stream> device_to_host_stream;
+  if (device_to_host_stream_) {
+    device_to_host_stream = device_to_host_stream_;
+  } else {
+    stream_executor::port::StatusOr<xla::StreamPool::Ptr> ptr_or_status =
+        client_->mutable_backend()->BorrowStream(
+            stream_->parent()->device_ordinal());
+    if (!ptr_or_status.status().ok()) {
+      done(ptr_or_status.status());
+      return;
+    }
+    device_to_host_stream =
+        std::shared_ptr<se::Stream>(std::move(ptr_or_status.ValueOrDie()));
+  }
+
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
-  xla_tensor->WaitForDefinitionEventOnStream(device_to_host_stream_.get());
+  xla_tensor->WaitForDefinitionEventOnStream(device_to_host_stream.get());
 
   // Transfer manager requires the shape of the shaped buffer to be the same as
   // literal shape except for the layout.  Set the literal to use xla_tensor's
@@ -210,11 +242,13 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
       cpu_tensor, &literal));
 
   TensorReference ref(*device_tensor);
+  // Explicitly capture device_to_host_stream to make sure the stream is alive
+  // before the transfer finishes.
   transfer_manager_->TransferLiteralFromDevice(
-      device_to_host_stream_.get(), xla_tensor->shaped_buffer(), literal,
-      [ref, xla_tensor, done](xla::Status status) {
+      device_to_host_stream.get(), xla_tensor->shaped_buffer(), literal,
+      [ref, xla_tensor, done, device_to_host_stream](xla::Status status) {
         done([&]() -> Status {
-          VLOG(1) << "Transfer from device as literal: "
+          VLOG(2) << "Transfer from device as literal: "
                   << xla_tensor->shaped_buffer().ToString();
           return status;
         }());
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index e45db989fac720df6c3458c93a6b8dbb0919f930..efbc4bc148acaf4ed0bb0617084e946b6a8dbc00 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -34,14 +34,18 @@ namespace tensorflow {
 // empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator();
+  XlaDeviceAllocator(se::StreamExecutor* stream_executor);
   ~XlaDeviceAllocator() override;
 
   string Name() override;
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
+
+ private:
+  // The stream executor of the device.
+  se::StreamExecutor* stream_executor_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
@@ -71,9 +75,6 @@ class XlaDeviceContext : public DeviceContext {
   se::Stream* host_to_device_stream() const {
     return host_to_device_stream_.get();
   }
-  se::Stream* device_to_host_stream() const {
-    return device_to_host_stream_.get();
-  }
   se::Stream* device_to_device_stream(int index) const {
     return device_to_device_streams_.at(index).get();
   }
@@ -95,7 +96,8 @@ class XlaDeviceContext : public DeviceContext {
   // idential to stream_, but must not be nullptr.
   std::shared_ptr<se::Stream> host_to_device_stream_;
   // The stream to use for transferring data from device to host. Can be
-  // idential to stream_, but must not be nullptr.
+  // idential to stream_. If nullptr, borrow a stream from backend for each
+  // transfer request to support out-of-order requests.
   std::shared_ptr<se::Stream> device_to_host_stream_;
   // Streams to use for transferring data directly between different devices,
   // e.g., over NVLINK.
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 927f983ba9ef23c8509523f42366c0c89c29db9f..eac6586203eec777d2bccc4b64bc44fa1a3813a1 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -25,9 +25,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/control_flow_ops.h"
 #include "tensorflow/core/kernels/data/generator_dataset_op.h"
 #include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/function_ops.h"
+#include "tensorflow/core/kernels/host_constant_op.h"
 #include "tensorflow/core/kernels/identity_n_op.h"
 #include "tensorflow/core/kernels/identity_op.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -92,11 +94,22 @@ class XlaAssignVariableOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Const").Device(DEVICE).TypeConstraint("dtype", TYPES),             \
       ConstantOp);                                                             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("HostConst").Device(DEVICE).HostMemory("output"), _HostConstantOp); \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", DT_STRING),          \
       IdentityOp);                                                             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("Identity").Device(DEVICE).TypeConstraint<Variant>("T"),            \
+      IdentityOp);                                                             \
+  REGISTER_KERNEL_BUILDER(Name("Identity")                                     \
+                              .Device(DEVICE)                                  \
+                              .TypeConstraint<ResourceHandle>("T")             \
+                              .HostMemory("input")                             \
+                              .HostMemory("output"),                           \
+                          IdentityOp);                                         \
   REGISTER_KERNEL_BUILDER(Name("IdentityN").Device(DEVICE), IdentityNOp);      \
   REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp);  \
   REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE),                \
@@ -195,9 +208,7 @@ class XlaAssignVariableOp : public OpKernel {
       Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp);   \
                                                                                \
   REGISTER_KERNEL_BUILDER(                                                     \
-      Name(kArgOp).Device(DEVICE).HostMemory("output").TypeConstraint("T",     \
-                                                                      TYPES),  \
-      ArgOp);                                                                  \
+      Name(kArgOp).Device(DEVICE).TypeConstraint("T", TYPES), ArgOp);          \
   REGISTER_KERNEL_BUILDER(Name(kArgOp)                                         \
                               .Device(DEVICE)                                  \
                               .HostMemory("output")                            \
@@ -206,11 +217,8 @@ class XlaAssignVariableOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                     \
       Name(kArgOp).Device(DEVICE).TypeConstraint<Variant>("T"), ArgOp);        \
                                                                                \
-  REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
-                              .Device(DEVICE)                                  \
-                              .TypeConstraint("T", TYPES)                      \
-                              .HostMemory("input"),                            \
-                          RetvalOp);                                           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kRetOp).Device(DEVICE).TypeConstraint("T", TYPES), RetvalOp);       \
   REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
                               .Device(DEVICE)                                  \
                               .TypeConstraint<ResourceHandle>("T")             \
@@ -241,6 +249,8 @@ class XlaAssignVariableOp : public OpKernel {
                           data::AnonymousIteratorHandleOp);                    \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE),              \
                           data::IteratorGetNextOp);                            \
+  REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE),    \
+                          data::IteratorGetNextAsOptionalOp);                  \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE),          \
                           data::IteratorGetNextSyncOp);                        \
   REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")                       \
@@ -251,6 +261,15 @@ class XlaAssignVariableOp : public OpKernel {
                               .Device(DEVICE)                                  \
                               .HostMemory("string_handle"),                    \
                           data::IteratorFromStringHandleOp);                   \
+  REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE),                 \
+                          data::OptionalNoneOp);                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE),            \
+                          data::OptionalFromValueOp);                          \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("OptionalHasValue").Device(DEVICE).HostMemory("has_value"),         \
+      data::OptionalHasValueOp);                                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE),             \
+                          data::OptionalGetValueOp);                           \
   REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp)              \
                               .Device(DEVICE)                                  \
                               .HostMemory("output")                            \
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
index bc0db558d8d0b7c666efcfac5c4926144b830380..a2a06f57698538b03ff08c99f570661b7312b0ec 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -208,7 +209,12 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
   }
 
   GraphCycles cycles;
-  TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph, &cycles));
+  TF_ASSIGN_OR_RETURN(bool cycle_detection_graph_ok,
+                      CreateCycleDetectionGraph(&graph, &cycles));
+  if (!cycle_detection_graph_ok) {
+    return Status::OK();
+  }
+
   TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps(
       &graph, &graph.flib_def(), /*resource_ops_to_ignore=*/{}, &cycles));
 
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index b29f6a009b9e9fdba76ac55386a4bec2f339cc0e..b37926073ac9da835dbad8911f998081d5f2c143 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -66,7 +66,7 @@ Status XlaGpuDeviceFactory::CreateDevices(
   registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
   registration.autoclustering_policy =
       XlaOpRegistry::AutoclusteringPolicy::kAlways;
-  registration.compile_resource_ops = true;
+  registration.compile_all_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index e1a582406153d2af447fa9d4ebcaf0bf0842b132..15f5ddbd7ba845af8fe8796f69bb0db93a0ef6be 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -47,7 +47,7 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
   registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
   registration.autoclustering_policy =
       XlaOpRegistry::AutoclusteringPolicy::kAlways;
-  registration.compile_resource_ops = true;
+  registration.compile_all_resource_ops = true;
   XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER,
                                            registration);
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 554227f09de0ab4d9e07f199b957657f3121ff06..c915b7118d09abe467ebf0b1d74a1efab94fd724 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -26,9 +26,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index f80cb1812f00d36ddb7c28ae0e77c58498058ef3..0c3adb0bcf97a30d3d195546f5958e5b0527fcf2 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -72,7 +72,7 @@ py_test(
 
 tf_xla_py_test(
     name = "adadelta_test",
-    size = "large",
+    size = "medium",
     srcs = ["adadelta_test.py"],
     deps = [
         ":xla_test",
@@ -138,6 +138,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "add_n_test",
+    size = "small",
+    srcs = ["add_n_test.py"],
+    # TensorList ops are not implemented in the on-demand compilation model yet.
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:list_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "addsign_test",
     size = "small",
@@ -170,13 +186,6 @@ tf_xla_py_test(
     name = "argminmax_test",
     size = "small",
     srcs = ["argminmax_test.py"],
-    # ArgMax needs CustomCall on CPU, which is not available in normal
-    # (not precompiled) TensorFlow. The flag below excludes the CPU
-    # backend.
-    disabled_backends = [
-        "cpu",
-        "cpu_ondemand",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -243,9 +252,44 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_xla_py_test(
+    name = "self_adjoint_eig_op_test",
+    size = "medium",
+    srcs = ["self_adjoint_eig_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+tf_xla_py_test(
+    name = "svd_op_test",
+    size = "medium",
+    srcs = ["svd_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -278,10 +322,9 @@ tf_xla_py_test(
     ],
 )
 
-# This test is large because occasionally the cpu test is long for testConcatLargeNumberOfTensors
 tf_xla_py_test(
     name = "concat_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["concat_ops_test.py"],
     deps = [
         ":xla_test",
@@ -407,7 +450,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "eager_test",
-    size = "large",
+    size = "medium",
     srcs = ["eager_test.py"],
     deps = [
         ":xla_test",
@@ -439,7 +482,7 @@ tf_xla_py_test(
     name = "fft_test",
     size = "medium",
     srcs = ["fft_test.py"],
-    shard_count = 3,
+    shard_count = 6,
     tags = ["optonly"],
     deps = [
         ":xla_test",
@@ -819,6 +862,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "stateful_random_ops_test",
+    size = "small",
+    srcs = ["stateful_random_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:stateful_random_ops",
+    ],
+)
+
 tf_xla_py_test(
     name = "stateless_random_ops_test",
     size = "small",
@@ -835,7 +892,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "tensor_array_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["tensor_array_ops_test.py"],
     # TensorArray ops are not implemented in the on-demand compilation model yet.
     disabled_backends = ["cpu_ondemand"],
@@ -1060,6 +1117,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -1094,7 +1152,6 @@ cc_library(
         "//tensorflow/core:tensorflow_opensource",
         "//tensorflow/core:test",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -1193,10 +1250,6 @@ tf_xla_py_test(
     name = "quantized_ops_test",
     size = "medium",
     srcs = ["quantized_ops_test.py"],
-    disabled_backends = [
-        "cpu",
-        "cpu_ondemand",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/compiler/tf2xla/python:xla",
diff --git a/tensorflow/compiler/tests/adadelta_test.py b/tensorflow/compiler/tests/adadelta_test.py
index b7b7fda293b69d6f0cec61d0d234277636a3670d..6cf16cc07ff503c4f3e008cfb720224abe5e9166 100644
--- a/tensorflow/compiler/tests/adadelta_test.py
+++ b/tensorflow/compiler/tests/adadelta_test.py
@@ -32,10 +32,18 @@ class AdadeltaOptimizerTest(xla_test.XLATestCase):
 
   def testBasic(self):
     num_updates = 4  # number of ADADELTA steps to perform
+    if "CPU" in self.device:
+      # To avoid timeout on CPU.
+      all_grad = [0.2, 0.01]
+      all_lr = [1.0, 0.1]
+    else:
+      all_grad = [0.2, 0.1, 0.01]
+      all_lr = [1.0, 0.5, 0.1]
+
     for dtype in self.float_types:
       with self.cached_session(), self.test_scope():
-        for grad in [0.2, 0.1, 0.01]:
-          for lr in [1.0, 0.5, 0.1]:
+        for grad in all_grad:
+          for lr in all_lr:
             var0_init = [1.0, 2.0]
             var1_init = [3.0, 4.0]
             var0 = resource_variable_ops.ResourceVariable(
diff --git a/tensorflow/compiler/tests/add_n_test.py b/tensorflow/compiler/tests/add_n_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba11c6d322f126130410a82ef760ce40e2c4e430
--- /dev/null
+++ b/tensorflow/compiler/tests/add_n_test.py
@@ -0,0 +1,84 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AddN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class XlaAddNTest(xla_test.XLATestCase):
+
+  def testAddTensorLists(self):
+    with self.cached_session(), self.test_scope():
+      l1 = list_ops.tensor_list_reserve(
+          element_shape=[], element_dtype=dtypes.float32, num_elements=3)
+      l2 = list_ops.tensor_list_reserve(
+          element_shape=[], element_dtype=dtypes.float32, num_elements=3)
+      l1 = list_ops.tensor_list_set_item(l1, 0, 5.)
+      l2 = list_ops.tensor_list_set_item(l2, 2, 10.)
+
+      l = math_ops.add_n([l1, l2])
+      self.assertAllEqual(
+          list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+          [5.0, 0.0, 10.0])
+
+  def testAddTensorListsFailsIfLeadingDimsMismatch(self):
+    with self.cached_session(), self.test_scope():
+      l1 = list_ops.tensor_list_reserve(
+          element_shape=[], element_dtype=dtypes.float32, num_elements=2)
+      l2 = list_ops.tensor_list_reserve(
+          element_shape=[], element_dtype=dtypes.float32, num_elements=3)
+      l = math_ops.add_n([l1, l2])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "TensorList arguments to AddN must all have the same shape"):
+        list_ops.tensor_list_stack(l, element_dtype=dtypes.float32).eval()
+
+  def testAddTensorListsFailsIfElementShapesMismatch(self):
+    with self.cached_session() as session, self.test_scope():
+      # Use placeholders instead of constant values for shapes to prevent TF's
+      # shape inference from catching this early.
+      l1_element_shape = array_ops.placeholder(dtype=dtypes.int32)
+      l2_element_shape = array_ops.placeholder(dtype=dtypes.int32)
+      l1 = list_ops.tensor_list_reserve(
+          element_shape=l1_element_shape,
+          element_dtype=dtypes.float32,
+          num_elements=3)
+      l2 = list_ops.tensor_list_reserve(
+          element_shape=l2_element_shape,
+          element_dtype=dtypes.float32,
+          num_elements=3)
+      l = math_ops.add_n([l1, l2])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "TensorList arguments to AddN must all have the same shape"):
+        session.run(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32), {
+                l1_element_shape: [],
+                l2_element_shape: [2]
+            })
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index a3651b4b0de2bd34f57464c9552a8fd577866510..c829c50b5518b29c96c0b0117a6cd143911bd1fc 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -311,6 +311,30 @@ class BinaryOpsTest(xla_test.XLATestCase):
           dtype(7),
           expected=np.array([[-6], [-5]], dtype=dtype))
 
+      if dtype in [np.float32, np.float64]:
+        x = np.array([
+            -0.0, 0.0, -0.0, +0.0, np.inf, np.inf, -np.inf, -np.inf, 2.0, 2.0,
+            1.0
+        ],
+                     dtype=dtype)
+        y = np.array(
+            [-0.0, 0.0, +0.0, -0.0, 1.0, -1.0, 1.0, -1.0, 2.0, 1.0, 2.0],
+            dtype=dtype)
+        expected = np.nextafter(x, y)
+
+        # We use assertAllEqual to expose any bugs hidden by relative or
+        # absolute error tolerances.
+        def NextAfterEqualityTest(result, expected, rtol):
+          del rtol
+          return self.assertAllEqual(result, expected)
+
+        self._testBinary(
+            math_ops.nextafter,
+            x,
+            y,
+            expected=expected,
+            equality_test=NextAfterEqualityTest)
+
       # min/max not supported for complex
       if dtype not in self.complex_types | {np.uint8, np.int8}:
         self._testBinary(
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index 5d5e486f616937601214aa169a4c329ab78932c8..eec69ea7d2d9af9ff570f927fb25b668ccce2b97 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -119,7 +119,7 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def testSamplingCorrectness(self):
     np.random.seed(1618)  # Make it reproducible.
-    num_samples = 21000
+    num_samples = 40000
 
     rand_probs = np.random.dirichlet([1., 1., 2., 3.])
     rand_probs2 = np.random.dirichlet([1., 4., 5.], size=3)  # batched
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 2187f57960f80300d631bdc7eb8fe5e9c8dddeea..76750decd2963ea12680a46d7340f48e8b011fa9 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -294,6 +294,9 @@ class ConcatTest(xla_test.XLATestCase):
   # The purpose of this is to ensure that XLA on GPU will not run out of memory
   # with too many arguments.
   def testConcatLargeNumberOfTensors(self):
+    if "CPU" in self.device:
+      self.skipTest("This test can time out on CPU, so we will just allow "
+                    "other backends to catch this specific error.")
     with self.cached_session():
       with self.test_scope():
         for concat_dim in range(2):
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index bf5ea7b1fb6fb3c774c4db20d059f131990d20d3..b7d08df9f7d144b71fd0b09535e10b8f596ea6ca 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -72,7 +72,7 @@ class DenseLayerTest(test.TestCase):
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -97,7 +97,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -126,7 +126,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index c9fce39f6c5111f93a54708b59b4c42c3ba844b6..632eccbb097b4e84f10f926e89d7fa439c8a38cd 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.layers import pooling
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -607,6 +608,21 @@ class EagerFunctionTest(xla_test.XLATestCase):
     self.assertEqual(11.0, plus_one.numpy())
     self.assertEqual(9.0, minus_one.numpy())
 
+  def testScanInDefun(self):
+    with self.test_scope():
+      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='data')
+      v = constant_op.constant(2.0, name='v')
+
+      @def_function.function
+      def f(y):
+        # pylint: disable=unnecessary-lambda
+        return functional_ops.scan(
+            lambda a, x: math_ops.multiply(a, x), y, initializer=v)
+        # pylint: enable=unnecessary-lambda
+
+      r = f(elems)
+      self.assertAllEqual([2., 4., 12., 48., 240., 1440.], self.evaluate(r))
+
 
 class ExcessivePaddingTest(xla_test.XLATestCase):
   """Test that eager execution works with TPU flattened tensors.
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 374942a0b339b816944ea5529e4f84134b60017b..56a8e1b1667f154f6cec475ee0f4f8b308121c09 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -191,6 +191,20 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     mean_val = np.random.random_sample(scale_shape).astype(np.float32)
     var_val = np.random.random_sample(scale_shape).astype(np.float32)
     epsilon = 0.001
+
+    # The TensorFlow FusedBatchNormGrad training operation takes two inputs with
+    # implementation defined values.  In theory the only correct value these
+    # inputs are the corresponding reserve_space_{1|2} outputs from the
+    # FusedBatchNorm training operation.  However, in practice, we rely on the
+    # first one being mean on {C|G}PU, and the second one being variance on CPU
+    # and inverse(sqrt(variance + epsilon)) on GPU (we test this assumption
+    # separately).
+    reserve_space_1_val = mean_val
+    if self.device == "XLA_GPU":
+      reserve_space_2_val = np.reciprocal(np.sqrt(var_val + epsilon))
+    else:
+      reserve_space_2_val = var_val
+
     data_format_src = "NHWC"
     grad_x_ref, grad_scale_ref, grad_offset_ref = self._reference_grad(
         x_val, grad_val, scale_val, mean_val, var_val, epsilon, data_format_src)
@@ -207,18 +221,26 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
           np.float32, shape=x_val_converted.shape, name="grad")
       x = array_ops.placeholder(
           np.float32, shape=x_val_converted.shape, name="x")
-      mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean")
-      var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
+      reserve_space_1 = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="reserve_space_1")
+      reserve_space_2 = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="reserve_space_2")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
       grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
-          grad, x, scale, mean, var, data_format=data_format, is_training=True)
+          grad,
+          x,
+          scale,
+          reserve_space_1,
+          reserve_space_2,
+          data_format=data_format,
+          is_training=True)
 
       grad_x_val, grad_scale_val, grad_offset_val = sess.run(
           [grad_x, grad_scale, grad_offset], {
               grad: grad_val_converted,
               x: x_val_converted,
-              mean: mean_val,
-              var: var_val,
+              reserve_space_1: reserve_space_1_val,
+              reserve_space_2: reserve_space_2_val,
               scale: scale_val
           })
 
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 12741c4d4af86a8c669afc01e92ca5a761c692ab..c8b71f802c41ef8143f3e7b951b3e0134dcda97b 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -68,8 +68,8 @@ class RGBToHSVTest(xla_test.XLATestCase):
                                                 {batch0: inp})
 
       # Verify that processing batch elements together is the same as separate
-      self.assertAllClose(batch1, join1)
-      self.assertAllClose(batch2, join2)
+      self.assertAllCloseAccordingToType(batch1, join1, half_rtol=0.000002)
+      self.assertAllCloseAccordingToType(batch2, join2, half_rtol=0.000002)
       self.assertAllCloseAccordingToType(
           batch2, inp, bfloat16_atol=0.03, half_rtol=0.02)
 
@@ -423,7 +423,7 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase):
       out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
       if large_tolerance:
         self.assertAllClose(
-            expected[np.newaxis, :, :, np.newaxis], out, rtol=0.03, atol=0.1)
+            expected[np.newaxis, :, :, np.newaxis], out, rtol=2e-4, atol=2e-4)
       else:
         self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
 
@@ -452,7 +452,7 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase):
         np.array([[1, 2], [3, 4]], dtype=np.float32), [4, 4],
         expected=np.array(
             [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]],
-            dtype=np.float32))
+            dtype=np.float32), large_tolerance=True)
 
   def testAlignCorners3x3To2x2(self):
     self._assertForwardOpMatchesExpected(
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index dbea9849e217519874352b789588a2af62f1c826..777a15629804207a8873e3e16b370c8b65056e7b 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -513,9 +513,10 @@ class ElementWiseFusionTest(test.TestCase):
   def testElementWiseClustering(self):
     arg0 = np.random.rand(2, 2).astype(np.float32)
     arg1 = np.random.rand(2, 2).astype(np.float32)
-    os.environ["TF_XLA_FLAGS"] = (
-        "--tf_xla_fusion_only=true "
-        "--tf_xla_cpu_global_jit " + os.environ.get("TF_XLA_FLAGS", ""))
+    old_tf_xla_flags = os.environ.get("TF_XLA_FLAGS", "")
+    os.environ["TF_XLA_FLAGS"] = ("--tf_xla_fusion_only=true "
+                                  "--tf_xla_min_cluster_size=2 "
+                                  "--tf_xla_cpu_global_jit " + old_tf_xla_flags)
     tf_op, tf_count = self.simpleTest(arg0, arg1,
                                       config_pb2.OptimizerOptions.OFF)
     self.assertEqual(0, tf_count)
@@ -525,6 +526,7 @@ class ElementWiseFusionTest(test.TestCase):
     self.assertEqual(2, tfef_count)
 
     self.assertAllClose(tf_op, tfef_op, rtol=1e-1)
+    os.environ["TF_XLA_FLAGS"] = old_tf_xla_flags
 
 
 class LazyCompilationTest(test.TestCase):
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
index c61965b97fc142ce452cf28def8c937f692d2f84..0eec070a906670ff36c772edda22f8291b5b734a 100644
--- a/tensorflow/compiler/tests/matrix_band_part_test.py
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -167,6 +167,11 @@ class MatrixBandPartTest(xla_test.XLATestCase, parameterized.TestCase):
       },
   )
   def testMatrixBandPart(self, batch_shape, rows, cols):
+    # TODO(b/125505881): Disabled due to LLVM backend crash.
+    if self.device == 'XLA_CPU' and cols == 7 and rows == 1 and batch_shape == [
+        1, 3, 2
+    ]:
+      pass
     for dtype in self.float_types:
       with self.cached_session():
         mat = np.ones(batch_shape + [rows, cols]).astype(dtype)
diff --git a/tensorflow/compiler/tests/plugin.bzl b/tensorflow/compiler/tests/plugin.bzl
index fbc8781a3e59faecf985cde5114bf56a041c4be0..46a854d1459b7ea9d9fe3cf7689faee557c2cf84 100644
--- a/tensorflow/compiler/tests/plugin.bzl
+++ b/tensorflow/compiler/tests/plugin.bzl
@@ -18,13 +18,12 @@
 #   git update-index --assume-unchanged tensorflow/compiler/tests/plugin.bzl
 
 plugins = {
-  #"example": {
-  #  "device":"XLA_MY_DEVICE",
-  #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
-  #   "tags":[],
-  #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
-  #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
-  #   "deps":[],
-  #},
+    #"example": {
+    #  "device":"XLA_MY_DEVICE",
+    #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
+    #   "tags":[],
+    #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
+    #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
+    #   "deps":[],
+    #},
 }
-
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 97ffad34c00b8ec16eb1ec109ba5d980e0ce673d..0611d6749fad053657c62368bc230cfe7c929e66 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -36,7 +36,7 @@ class RandomOpsTest(xla_test.XLATestCase):
 
   def _random_types(self):
     return set(self.numeric_types) - set(
-        self.complex_types) - {np.uint8, np.int8}
+        self.complex_types) - {np.uint64, np.int64, np.uint8, np.int8}
 
   def _testRngIsNotConstant(self, rng, dtype):
     # Tests that 'rng' does not always return the same value.
@@ -122,8 +122,8 @@ class RandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == count)
-        self.assertTrue((y <= b).sum() == count)
+        self.assertEqual((y >= a).sum(), count)
+        self.assertEqual((y <= b).sum(), count)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 1521cc760b85b176acb27c1489640e92ef90e247..7623ab79371a9cd6b3512cda392e0e401242a7a1 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -62,7 +62,6 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py
index 693f8513bc54e30060a2e963abd504768535a50a..a9a87b8fb3104f8b9870c41e2aa28b0c48c12921 100644
--- a/tensorflow/compiler/tests/scatter_nd_op_test.py
+++ b/tensorflow/compiler/tests/scatter_nd_op_test.py
@@ -134,6 +134,12 @@ class ScatterNdTest(xla_test.XLATestCase):
     expected = np.array([0, 11, 0, 10, 9, 0, 0, 12], dtype=np.int32)
     self.assertAllEqual(expected, self._runScatterNd(indices, updates, [8]))
 
+  def testRepeatedIndices(self):
+    indices = np.array([[0], [1], [0], [1]], dtype=np.int32)
+    updates = np.array([9, 10, 11, 12], dtype=np.float32)
+    expected = np.array([20, 22], dtype=np.int32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [2]))
+
   def testSimple2(self):
     indices = np.array([[1, 0], [1, 1]], dtype=np.int32)
     updates = np.array([11., 12.], dtype=np.float32)
diff --git a/tensorflow/compiler/tests/self_adjoint_eig_op_test.py b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfb5c82b22ea1d7400b54045edee0ca0782ce979
--- /dev/null
+++ b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
@@ -0,0 +1,62 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.self_adjoint_eig."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.platform import test
+
+
+class SelfAdjointEigOpTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _test(self, dtype, shape):
+    np.random.seed(1)
+    x_np = np.random.uniform(
+        low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype)
+    x_np = x_np + np.swapaxes(x_np, -1, -2)
+    n = shape[-1]
+
+    e_np, _ = np.linalg.eigh(x_np)
+    with self.cached_session() as sess:
+      x_tf = array_ops.placeholder(dtype)
+      with self.test_scope():
+        e, v = linalg_ops.self_adjoint_eig(x_tf)
+      e_val, v_val = sess.run([e, v], feed_dict={x_tf: x_np})
+
+      v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
+      self.assertAlmostEqual(np.mean(v_diff**2), 0.0, delta=1e-6)
+      self.assertAlmostEqual(np.mean((e_val - e_np)**2), 0.0, delta=1e-6)
+
+  SIZES = [1, 2, 5, 10, 32]
+  DTYPES = [np.float32]
+  PARAMS = itertools.product(SIZES, DTYPES)
+
+  @parameterized.parameters(*PARAMS)
+  def testSelfAdjointEig(self, n, dtype):
+    for batch_dims in [(), (3,)] + [(3, 2)] * (n < 10):
+      self._test(dtype, batch_dims + (n, n))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateful_random_ops_test.py b/tensorflow/compiler/tests/stateful_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0535579bf29641d34f818c7fe079a9c2d59073c
--- /dev/null
+++ b/tensorflow/compiler/tests/stateful_random_ops_test.py
@@ -0,0 +1,282 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful random-number generation ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.client import device_lib
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_stateful_random_ops
+from tensorflow.python.ops import stateful_random_ops as \
+random
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def xla_device_name():
+  devices = device_lib.list_local_devices()
+  def find_type(device_type):
+    for d in devices:
+      if d.device_type == device_type:
+        return d.name
+    return None
+  name = find_type("TPU") or find_type("XLA_GPU") or find_type("XLA_CPU")
+  if name is None:
+    raise ValueError(
+        "Can't find any XLA device. Available devices:\n%s" % devices)
+  return str(name)
+
+
+class StatefulRandomOpsTest(xla_test.XLATestCase):
+  """Test cases for stateful random-number generator operators."""
+
+  @test_util.run_v2_only
+  def testSimple(self):
+    """A simple test.
+    """
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
+      gen.normal(shape=(3,))
+      gen.uniform(shape=(3,), minval=0, maxval=10, dtype=dtypes.uint32)
+      gen.uniform_full_int(shape=(3,))
+
+  @test_util.run_v2_only
+  def testDefun(self):
+    """Test for defun.
+    """
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
+      @def_function.function
+      def f():
+        x = gen.normal(shape=(3,))
+        y = gen.uniform(shape=(3,), minval=0, maxval=10, dtype=dtypes.uint32)
+        z = gen.uniform_full_int(shape=(3,))
+        return (x, y, z)
+      f()
+
+  @test_util.run_v2_only
+  def testThreefry2x32(self):
+    """Tests ThreeFry2x32 conforms to known results.
+    """
+    # Based on
+    # https://github.com/google/jax/blob/8565a3486adf16beb388b2364c9cd930d7a0d92d/tests/random_test.py#L65-L85
+    # which is in turn based on
+    # https://github.com/DEShawResearch/Random123-Boost/blob/65e3d874b67aa7b3e02d5ad8306462f52d2079c0/libs/random/test/test_threefry.cpp#L30-L32
+
+    def uint32s_to_uint64(a, b):
+      return b << 32 | a
+
+    def verify(counter1, counter2, key1, key2, expect1, expect2):
+      counter = uint32s_to_uint64(counter1, counter2)
+      key = uint32s_to_uint64(key1, key2)
+      random.get_global_generator().reset([counter, key])
+      got = random.get_global_generator().uniform_full_int(
+          shape=(2,), dtype=dtypes.uint32)
+      expect = [expect1, expect2]
+      self.assertAllEqual(expect, got)
+      random.get_global_generator().reset([counter, key])
+      got = random.get_global_generator().uniform_full_int(
+          shape=(), dtype=dtypes.uint64)
+      self.assertAllEqual(uint32s_to_uint64(*expect), got)
+
+    with ops.device(xla_device_name()):
+      random.reset_global_generator(seed=0, algorithm=random.RNG_ALG_THREEFRY)
+      verify(0x00000000, 0x00000000, 0x00000000, 0x00000000,
+             0x6b200159, 0x99ba4efe)
+      verify(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+             0x1cb996fc, 0xbb002be7)
+      verify(0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
+             0xc4923a9c, 0x483df7a0)
+
+  @test_util.run_v2_only
+  def testNewState(self):
+    """Tests that the new state is correct.
+    """
+    with ops.device(xla_device_name()):
+      counter = 57
+      key = 0x1234
+      size = 46
+      seed = [counter, key]
+      gen = random.Generator(
+          seed=seed, algorithm=random.RNG_ALG_THREEFRY)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint32)
+      self.assertAllEqual([counter+(size+1)//2, key], gen.state.read_value())
+      gen.reset(seed=seed)
+      gen.uniform_full_int(shape=(size,), dtype=dtypes.uint64)
+      self.assertAllEqual([counter+size, key], gen.state.read_value())
+
+  def _testRngIsNotConstant(self, rng, dtype):
+    # Tests that 'rng' does not always return the same value.
+    # The random-number generator, if working correctly, should produce the
+    # same output multiple times with low probability.
+    x = rng(dtype).numpy()
+    y = rng(dtype).numpy()
+    self.assertFalse(np.array_equal(x, y))
+
+  @test_util.run_v2_only
+  def testUniformIsNotConstant(self):
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      def rng(dtype):
+        maxval = dtype.max
+        # Workaround for b/125364959
+        if dtype == dtypes.uint64:
+          maxval = 10000000
+        return gen.uniform(shape=[2], dtype=dtype, maxval=maxval)
+
+      for dtype in {dtypes.int32, dtypes.uint32, dtypes.int64, dtypes.uint64}:
+        self._testRngIsNotConstant(rng, dtype)
+
+  @test_util.run_v2_only
+  def testNormalIsNotConstant(self):
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      def rng(dtype):
+        return gen.normal(shape=[2], dtype=dtype)
+
+      for dtype in {dtypes.float32}:
+        self._testRngIsNotConstant(rng, dtype)
+
+  @test_util.run_v2_only
+  def testUniformIntIsInRange(self):
+    minval = 2
+    maxval = 33
+    size = 1000
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      for dtype in {dtypes.int32, dtypes.uint32, dtypes.int64, dtypes.uint64}:
+        x = gen.uniform(
+            shape=[size], dtype=dtype, minval=minval, maxval=maxval).numpy()
+        self.assertTrue(np.all(x >= minval))
+        self.assertTrue(np.all(x < maxval))
+
+  @test_util.run_v2_only
+  def testNormalIsFinite(self):
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      for dtype in {dtypes.float32}:
+        x = gen.normal(shape=[10000], dtype=dtype).numpy()
+        self.assertTrue(np.all(np.isfinite(x)))
+
+  def _chi_squared(self, x, bins):
+    """Pearson's Chi-squared test."""
+    x = np.ravel(x)
+    n = len(x)
+    histogram, _ = np.histogram(x, bins=bins, range=(0, 1))
+    expected = n / float(bins)
+    return np.sum(np.square(histogram - expected) / expected)
+
+  @test_util.run_v2_only
+  def testDistributionOfUniform(self):
+    """Use Pearson's Chi-squared test to test for uniformity."""
+    with ops.device(xla_device_name()):
+      n = 1000
+      seed = 12
+      for dtype in {dtypes.int32, dtypes.uint32, dtypes.int64, dtypes.uint64}:
+        gen = random.Generator(seed=seed, algorithm=random.RNG_ALG_THREEFRY)
+        maxval = 1
+        if dtype.is_integer:
+          maxval = 100
+        x = gen.uniform(shape=[n], maxval=maxval, dtype=dtype).numpy()
+        if maxval > 1:
+          # Normalize y to range [0, 1).
+          x = x.astype(float) / maxval
+        # Tests that the values are distributed amongst 10 bins with equal
+        # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
+        # p=0.05. This test is probabilistic and would be flaky if the random
+        # seed were not fixed.
+        val = self._chi_squared(x, 10)
+        self.assertLess(val, 16.92)
+
+  def _normal_cdf(self, x):
+    """Cumulative distribution function for a standard normal distribution."""
+    return 0.5 + 0.5 * np.vectorize(math.erf)(x / math.sqrt(2))
+
+  def _anderson_darling(self, x):
+    """Anderson-Darling test for a standard normal distribution."""
+    x = np.sort(np.ravel(x))
+    n = len(x)
+    i = np.linspace(1, n, n)
+    z = np.sum((2 * i - 1) * np.log(self._normal_cdf(x)) +
+               (2 * (n - i) + 1) * np.log(1 - self._normal_cdf(x)))
+    return -n - z / n
+
+  @test_util.run_v2_only
+  def testDistributionOfNormal(self):
+    """Use Anderson-Darling test to test distribution appears normal."""
+    with ops.device(xla_device_name()):
+      n = 1000
+      for dtype in {dtypes.float32}:
+        gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+        x = gen.normal(shape=[n], dtype=dtype).numpy()
+        # The constant 2.492 is the 5% critical value for the Anderson-Darling
+        # test where the mean and variance are known. This test is probabilistic
+        # so to avoid flakiness the seed is fixed.
+        self.assertLess(self._anderson_darling(x.astype(float)), 2.492)
+
+  @test_util.run_v2_only
+  def testErrors(self):
+    """Tests that proper errors are raised.
+    """
+    shape = [2, 3]
+    with ops.device(xla_device_name()):
+      gen = random.Generator(seed=1234, algorithm=random.RNG_ALG_THREEFRY)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          r"algorithm must be of shape \[\], not"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            gen.state.handle, [0, 0], shape)
+      with self.assertRaisesWithPredicateMatch(
+          TypeError, "Requested dtype: int64"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            gen.state.handle, 1.1, shape)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          "Unsupported algorithm id"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            gen.state.handle, 123, shape)
+      var = variables.Variable([0, 0], dtype=dtypes.uint32)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          "Type mismatch for read of variable .* Expected int64; got"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            var.handle, random.RNG_ALG_THREEFRY, shape)
+      var = variables.Variable([[0]], dtype=dtypes.int64)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          "RNG state must have one and only one dimension, not"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            var.handle, random.RNG_ALG_THREEFRY, shape)
+      var = variables.Variable([0], dtype=dtypes.int64)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          "For the ThreeFry algorithm, the size of state must be at least"):
+        gen_stateful_random_ops.stateful_standard_normal_v2(
+            var.handle, random.RNG_ALG_THREEFRY, shape)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index ee7ca7e6f196e114ff18e2597145e5c198980b08..df5914a518e06e4190c623a14287de8daefebd40 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -167,8 +167,8 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == n)
-        self.assertTrue((y <= b).sum() == n)
+        self.assertEqual((y >= a).sum(), n)
+        self.assertEqual((y <= b).sum(), n)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/svd_op_test.py b/tensorflow/compiler/tests/svd_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..73bec949f3d4a08076853b537d610c156897757c
--- /dev/null
+++ b/tensorflow/compiler/tests/svd_op_test.py
@@ -0,0 +1,81 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.svd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.platform import test
+
+
+class SvdOpTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _compute_usvt(self, s, u, v):
+    m = u.shape[-1]
+    n = v.shape[-1]
+    if m <= n:
+      v = v[..., :m]
+    else:
+      u = u[..., :n]
+
+    return np.matmul(u * s[..., None, :], np.swapaxes(v, -1, -2))
+
+  def _testSvdCorrectness(self, dtype, shape):
+    np.random.seed(1)
+    x_np = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(dtype)
+    m, n = shape[-2], shape[-1]
+    _, s_np, _ = np.linalg.svd(x_np)
+    with self.cached_session() as sess:
+      x_tf = array_ops.placeholder(dtype)
+      with self.test_scope():
+        s, u, v = linalg_ops.svd(x_tf, full_matrices=True)
+      s_val, u_val, v_val = sess.run([s, u, v], feed_dict={x_tf: x_np})
+      u_diff = np.matmul(u_val, np.swapaxes(u_val, -1, -2)) - np.eye(m)
+      v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
+      # Check u_val and v_val are orthogonal matrices.
+      self.assertLess(np.linalg.norm(u_diff), 1e-2)
+      self.assertLess(np.linalg.norm(v_diff), 1e-2)
+      # Check that the singular values are correct, i.e., close to the ones from
+      # numpy.lingal.svd.
+      self.assertLess(np.linalg.norm(s_val - s_np), 1e-2)
+      # The tolerance is set based on our tests on numpy's svd. As our tests
+      # have batch dimensions and all our operations are on float32, we set the
+      # tolerance a bit larger. Numpy's svd calls LAPACK's svd, which operates
+      # on double precision.
+      self.assertLess(
+          np.linalg.norm(self._compute_usvt(s_val, u_val, v_val) - x_np), 2e-2)
+
+  SIZES = [1, 2, 5, 10, 32, 64]
+  DTYPES = [np.float32]
+  PARAMS = itertools.product(SIZES, DTYPES)
+
+  @parameterized.parameters(*PARAMS)
+  def testSvd(self, n, dtype):
+    for batch_dims in [(), (3,)] + [(3, 2)] * (n < 10):
+      self._testSvdCorrectness(dtype, batch_dims + (n, n))
+      self._testSvdCorrectness(dtype, batch_dims + (2 * n, n))
+      self._testSvdCorrectness(dtype, batch_dims + (n, 2 * n))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index d7e26d79c4c054860ade5c8960a3bca984e020b0..e64aa26cd4bb5f9130def4e0f3a1799db9f2428e 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -25,7 +26,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
@@ -41,8 +44,10 @@ def _make_converter(dtype):
   return _converter
 
 
+@test_util.with_control_flow_v2
 class TensorArrayTest(xla_test.XLATestCase):
 
+  @test_util.disable_control_flow_v2("Tries to evaluate flow")
   def testTensorArrayWriteRead(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -117,6 +122,7 @@ class TensorArrayTest(xla_test.XLATestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0],
                    [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0))
 
+  @test_util.disable_control_flow_v2("b/122315751 (concat)")
   def testTensorArrayWriteConcat(self):
     for dtype in self.numeric_tf_types:
       self._testTensorArrayWriteConcat(dtype)
@@ -224,10 +230,12 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(convert([[2.0, 201.0]]), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
+  @test_util.disable_control_flow_v2("b/122315872 (split)")
   def testTensorArraySplitRead(self):
     for dtype in self.numeric_tf_types:
       self._testTensorArraySplitRead(dtype)
 
+  @test_util.disable_control_flow_v2("TensorArray.grad is not supported in v2")
   def testTensorGradArrayWriteRead(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -261,6 +269,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([[2.0]], g_d1)
       self.assertAllEqual([[-2.0]], g_d2)
 
+  @test_util.disable_control_flow_v2("TensorArray.grad is not supported in v2")
   def testTensorGradArrayDynamicWriteRead(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -300,6 +309,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(3, vs)
       self.assertAllEqual(3, g_vs)
 
+  @test_util.disable_control_flow_v2("TensorArray.grad is not supported in v2")
   def testTensorGradAccessTwiceReceiveSameObject(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -317,6 +327,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
+  @test_util.disable_control_flow_v2("b/124334470")
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -327,6 +338,7 @@ class TensorArrayTest(xla_test.XLATestCase):
           "TensorArray dtype is float but op has dtype int32"):
         ta.write(-1, np.int32(7)).flow.eval()
 
+  @test_util.disable_control_flow_v2("b/124334096 verify dtype")
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
     # Find two different floating point types, create an array of
     # the first type, but try to read the other type.
@@ -347,6 +359,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         # Test reading from a different index than the one we wrote to
         w0.read(1)
 
+  @test_util.disable_control_flow_v2("b/122315872 (split)")
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -406,6 +419,7 @@ class TensorArrayTest(xla_test.XLATestCase):
           r"Mismatched TensorArray sizes"):
         wb1_grad.flow.eval()
 
+  @test_util.disable_control_flow_v2("TensorArray.grad is not supported in v2")
   def testTensorArrayWriteGradientAddMultipleAdds(self):
     for dtype in self.numeric_tf_types:
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
@@ -510,6 +524,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
+  @test_util.disable_control_flow_v2("b/122315751 (concat)")
   def testTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
@@ -557,6 +572,7 @@ class TensorArrayTest(xla_test.XLATestCase):
   def testTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
+  @test_util.disable_control_flow_v2("b/122315751(concat), b/122315872(split)")
   def testTensorArrayGradientSplitConcat(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -903,6 +919,7 @@ class TensorArrayTest(xla_test.XLATestCase):
           "zero-size TensorArrays."):
         ta.stack().eval()
 
+  @test_util.disable_control_flow_v2("b/124335246")
   def testTensorArrayEvalEmpty(self):
     self._testTensorArrayEvalEmpty()
 
@@ -916,7 +933,9 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
-      self.assertAllEqual([0, 5], ta.concat().eval().shape)
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        # TODO(b/122315751): Enable this.
+        self.assertAllEqual([0, 5], ta.concat().eval().shape)
 
   def testTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
@@ -944,11 +963,13 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(convert([1.0, -1.0]), read_vals[0])
       self.assertAllEqual(convert([10.0, -10.0]), read_vals[1])
 
+  @test_util.disable_control_flow_v2("b/122315734 (scatter)")
   def testTensorArrayScatterRead(self):
     for dtype in self.numeric_tf_types:
       self._testTensorArrayScatterRead(dtype)
     self._testTensorArrayScatterRead(dtypes.bool)
 
+  @test_util.disable_control_flow_v2("b/122315734 (scatter)")
   def testTensorArrayScatterReadAndGradients(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -977,6 +998,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
+  @test_util.disable_control_flow_v2("b/122315378 (gather)")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -1052,4 +1074,6 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertEqual(size1_v, 4)
 
 if __name__ == "__main__":
+  os.environ["TF_XLA_FLAGS"] = ("--tf_xla_min_cluster_size=2 " +
+                                os.environ.get("TF_XLA_FLAGS", ""))
   test.main()
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index 1ecdb22cd0bc7e42d7ff67d20544fd26a65f6204..e07b150d6013582a9a9d3bb14e82cf9a4e962bc1 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -18,29 +18,26 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import os
 import numpy as np
 from tensorflow.compiler.tests import xla_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.platform import test
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
-
-
 class ListOpsTest(xla_test.XLATestCase):
 
   def testElementShape(self):
     with self.cached_session() as sess, self.test_scope():
       dim = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(dim, 15), num_elements=20,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(dim, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       e32 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
       e64 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int64)
       self.assertAllEqual(sess.run(e32, {dim: 10}), (10, 15))
@@ -48,8 +45,10 @@ class ListOpsTest(xla_test.XLATestCase):
 
   def testPushPop(self):
     with self.cached_session() as sess, self.test_scope():
-      l = list_ops.tensor_list_reserve(
-          element_shape=(7, 15), num_elements=10, element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
       l = list_ops.tensor_list_push_back(
           l, constant_op.constant(1.0, shape=(7, 15)))
       l = list_ops.tensor_list_push_back(
@@ -62,8 +61,10 @@ class ListOpsTest(xla_test.XLATestCase):
   def testDoNotConstantFoldVariants(self):
     with self.cached_session() as sess, self.test_scope():
       val = array_ops.placeholder(dtype=dtypes.float32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(7, 15), num_elements=10, element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
       # Note: Pushing a Placeholder will force the constant folding code
       # to build a Const node with a DT_VARIANT output. This tests that XLA
       # passes a cf_consider_fn which prevent folding such nodes.
@@ -78,10 +79,10 @@ class ListOpsTest(xla_test.XLATestCase):
 
   def testPushPopSeparateLists(self):
     with self.cached_session() as sess, self.test_scope():
-      l = list_ops.tensor_list_reserve(
-          element_shape=scalar_shape(),
-          num_elements=20,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=[],
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
       l2 = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
       l3 = list_ops.tensor_list_push_back(l, constant_op.constant(3.0))
@@ -102,7 +103,7 @@ class ListOpsTest(xla_test.XLATestCase):
       _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Set the max number of elements"):
-        self.assertEqual(sess.run(e), 1.0 * np.ones((7, 15)))
+        self.assertAllEqual(sess.run(e), 1.0 * np.ones((7, 15)))
 
   def testEmptyTensorListMax(self):
     with self.cached_session() as sess, self.test_scope():
@@ -114,6 +115,103 @@ class ListOpsTest(xla_test.XLATestCase):
       _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       self.assertAllEqual(sess.run(e), 3.0 * np.ones((10, 15)))
 
+  def testListFromTensor(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l, e0 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 2.0)
+      l, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e1, 1.0)
+      self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+
+  def testGetSet(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 1.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 2.0])
+
+  def testSetDoesNotUpdatePushIndex(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_shape=[], element_dtype=dtypes.float32, max_num_elements=2)
+      # SetItem should not change the push index.
+      l = list_ops.tensor_list_set_item(l, 1, 3.)
+      l = list_ops.tensor_list_push_back(l, 5.)
+      l = list_ops.tensor_list_push_back(l, 7.)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [5., 7.])
+
+  def testGetSetReserved(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 0.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 0.0])
+
+  def testSetStackReservedUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=None, num_elements=2)
+      l = list_ops.tensor_list_set_item(l, 0, [3.0, 4.0])
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [[3.0, 4.0], [0., 0.]])
+
+  def testPushInEmptyListWithUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None, max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, [3.0, 4.0])
+      # Pushing an element with a different shape should raise an error.
+      with self.assertRaisesRegexp(errors.InternalError, "shape"):
+        l = list_ops.tensor_list_push_back(l, 5.)
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32))
+
+  def testGetSetReservedNonScalar(self):
+    with self.cached_session() as sess, self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32,
+          element_shape=(7, 15),
+          num_elements=2)
+      l = list_ops.tensor_list_set_item(
+          l, 0, constant_op.constant(1.0, shape=(7, 15)))
+      e1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      e2 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e1), np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e2), np.zeros((7, 15)))
+
+  def testStack(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[],
+          max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t.shape.as_list(), [None])
+      self.assertAllEqual(t, [1.0, 2.0])
+
+  def testStackWithUninitializedTensors(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [0., 0., 0.])
 
 if __name__ == "__main__":
+  os.environ['TF_XLA_FLAGS'] = ('--tf_xla_min_cluster_size=2 ' +
+                                os.environ.get('TF_XLA_FLAGS', ''))
   test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 3c2875ba477fa71e9e56a18d10efe0808533dd03..159fa6685b5c333c4669f1f141a4d41f267255f4 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -72,8 +72,9 @@ class UnaryOpsTest(xla_test.XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
+        self.assertEqual(output.dtype, expected.dtype)
         self.assertAllCloseAccordingToType(
-            result, expected, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
+            expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
       else:
         equality_test(result, expected, rtol=rtol, atol=atol)
 
@@ -260,7 +261,8 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.log1p,
           np.array([[1e-14, 1e-15, 0.6]], dtype=dtype),
-          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)),
+          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]],
+                                     dtype=dtype)).astype(dtype),
           rtol=1e-4,
           atol=1e-6)
 
@@ -391,6 +393,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
           expected=np.array(
               [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0]], dtype=dtype),
+          expected=np.array([[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.is_finite,
           np.array(
@@ -705,7 +712,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
           np.array([[2, -1]], dtype=dtype),
-          expected=np.array([[2, 1]], dtype=dtype))
+          expected=np.array([[2, 1]], dtype=np.real(dtype(0)).dtype))
 
       self._assertOpOutputMatchesExpected(
           math_ops.negative,
@@ -743,6 +750,10 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array(
               [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype),
           expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[np.nan]], dtype=dtype),
+          expected=np.array([[0.0]], dtype=dtype))
 
   def testLogicalOps(self):
     self._assertOpOutputMatchesExpected(
@@ -760,7 +771,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
         lambda x: gen_nn_ops.bias_add_grad(x, data_format="NCHW"),
         np.array(
             [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]], dtype=np.float32),
-        expected=np.array([10., 26.], dtype=np.float32))
+        expected=np.array([14., 22.], dtype=np.float32))
 
   def testCast(self):
     shapes = [[], [4], [2, 3], [2, 0, 4]]
@@ -811,6 +822,12 @@ class UnaryOpsTest(xla_test.XLATestCase):
         np.array([1, 2, 0], np.int32),
         expected=np.array([2, 0, 1], dtype=np.int32))
 
+  def testInvertPermutationTwiceIsNoop(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.invert_permutation(array_ops.invert_permutation(x)),
+        np.array([1, 2, 0], np.int32),
+        expected=np.array([1, 2, 0], dtype=np.int32))
+
   def testRank(self):
     rank_op = lambda x: array_ops.rank_internal(x, optimize=False)
     for dtype in self.numeric_types:
@@ -865,6 +882,17 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-1], [1], [4]], dtype=dtype),
           expected=np.int32(3))
 
+  def testSizeWithInt64OutType(self):
+
+    def size_op(x):
+      return array_ops.size_internal(x, optimize=False, out_type=np.int64)
+
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          size_op,
+          np.array([[-1], [1], [4]], dtype=dtype),
+          expected=np.int64(3))
+
   def testUnpack(self):
     self._assertOpOutputMatchesExpected(
         array_ops.unstack,
@@ -928,6 +956,15 @@ class UnaryOpsTest(xla_test.XLATestCase):
                       [[9], [10], [13], [14]], [[11], [12], [15], [16]]]],
                     dtype=dtype), data_format))
 
+      self._assertOpOutputMatchesExpected(
+          make_op("NCHW_VECT_C"),
+          np.arange(32, dtype=dtype).reshape((1, 8, 1, 1, 4)),
+          expected=np.array([[[[[0, 1], [8, 9]], [[16, 17], [24, 25]]],
+                              [[[2, 3], [10, 11]], [[18, 19], [26, 27]]],
+                              [[[4, 5], [12, 13]], [[20, 21], [28, 29]]],
+                              [[[6, 7], [14, 15]], [[22, 23], [30, 31]]]]],
+                            dtype=dtype))
+
   def testSpaceToDepth(self):
 
     def make_op(data_format):
@@ -971,10 +1008,19 @@ class UnaryOpsTest(xla_test.XLATestCase):
                                                      [13, 14, 15, 16]]]],
                     dtype=dtype), data_format))
 
+      self._assertOpOutputMatchesExpected(
+          make_op("NCHW_VECT_C"),
+          np.arange(32, dtype=dtype).reshape((1, 2, 2, 2, 4)),
+          expected=np.array([[[[[0, 1, 2, 3, 16, 17, 18, 19]]],
+                              [[[4, 5, 6, 7, 20, 21, 22, 23]]],
+                              [[[8, 9, 10, 11, 24, 25, 26, 27]]],
+                              [[[12, 13, 14, 15, 28, 29, 30, 31]]]]],
+                            dtype=dtype))
+
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
     zero = np.asarray(0).astype(dtype)
-    expected = np.logaddexp(zero, features)
+    expected = np.logaddexp(zero, features).astype(dtype)
     self._assertOpOutputMatchesExpected(
         nn_ops.softplus, features, expected=expected, rtol=1e-6, atol=9.1e-6)
 
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index fcd7ac5ba1ca5049246e93e6f5f76746fb28c6b8..18c5870e0decb686f4df1c16bbb4a340c93ad21d 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -485,7 +485,7 @@ class SliceAssignTest(xla_test.XLATestCase):
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+    with self.assertRaisesRegexp(errors.FailedPreconditionError,
                                  "uninitialized variable"):
       with self.test_session() as sess, self.test_scope():
         v = resource_variable_ops.ResourceVariable([1, 2])
diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py
index 4ee144beb7f3243be069d59ee4a613484fe183b3..15a31111cb6b51f6d6e501b86d906d9ba53d1c22 100644
--- a/tensorflow/compiler/tests/while_test.py
+++ b/tensorflow/compiler/tests/while_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -25,7 +26,12 @@ from tensorflow.compiler.tf2xla.python import xla
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -125,6 +131,121 @@ class WhileTest(xla_test.XLATestCase):
       result = sess.run(loop_outputs, {init_index: 0})
       self.assertAllClose(result, [10, 7], rtol=1e-3)
 
-
-if __name__ == '__main__':
+  def _testMaxItersSimple(self):
+    if is_compile_on_demand():
+      self.skipTest("list_ops are not supported in cpu_ondemand")
+    with self.cached_session() as sess, self.test_scope():
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      v = constant_op.constant(1.0)
+      p = array_ops.placeholder(dtype=dtypes.int32)
+
+      def create_while_loop():
+        iterations = array_ops.size(p, name="iterations")
+        r = control_flow_ops.while_loop(
+            lambda *_: True,
+            lambda i, x: (i + 1, v * x), (0, 1.0),
+            maximum_iterations=iterations,
+            name="outer")
+        return array_ops.identity(r[1])
+
+      output = create_while_loop()
+      output = gradients_impl.gradients(output, v)[0]
+
+      result = sess.run(output, feed_dict={p: [0, 0, 0]})
+      print(result)
+      xla_context.Exit()
+
+  def testMaxItersSimple(self):
+    self.skipTest("Fails with v1 control flow")
+    # This fails with old control.
+    # self._testMaxItersSimple()
+
+  @test_util.enable_control_flow_v2
+  def testMaxItersSimpleV2(self):
+    self._testMaxItersSimple()
+
+  def _testNestedWhileLoopWithMaxItersFromOuterContext(self):
+    if is_compile_on_demand():
+      self.skipTest("list_ops are not supported in cpu_ondemand")
+    with self.cached_session() as sess, self.test_scope():
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      v = constant_op.constant(1.0)
+      p = array_ops.placeholder(dtype=dtypes.int32)
+
+      def mid_body_builder(iterations):
+
+        def mid_body(i, x):
+          r = control_flow_ops.while_loop(
+              lambda *_: True,
+              lambda i, x: (i + 1, v * x), (0, x),
+              maximum_iterations=iterations,
+              name="inner")
+          return (i + 1, gradients_impl.gradients(x + r[1], v)[0])
+
+        return mid_body
+
+      def outer_body(i, x):
+        iterations = array_ops.size(p, name="iterations")
+        return (i + 1, x + control_flow_ops.while_loop(
+            lambda *_: True,
+            mid_body_builder(iterations), (0, x),
+            maximum_iterations=iterations,
+            name="mid")[1])
+
+      def create_while_loop():
+        r = control_flow_ops.while_loop(
+            lambda *_: True,
+            outer_body, (0, 1.0),
+            maximum_iterations=5,
+            name="outer")
+        return array_ops.identity(r[1])
+
+      # p:placeholder
+      # j = 0
+      # i, x = 0, 1.
+      # while j++ < 5:
+      #   i1, x1 = 0, x
+      #   while i1++ < len(p):
+      #     i2, x2 = 0, x1
+      #     while i2++ < len(p):
+      #       x2 = v * x2
+      #     x1 = grad(x1 + x2, v)
+      #   x = x1
+      # output = x
+      output = create_while_loop()
+      sess.run(output, feed_dict={p: [0, 0, 0]})
+      xla_context.Exit()
+
+  def testNestedWhileLoopWithMaxItersFromOuterContext(self):
+    self._testNestedWhileLoopWithMaxItersFromOuterContext()
+
+  @test_util.enable_control_flow_v2
+  def testNestedWhileLoopWithMaxItersFromOuterContextV2(self):
+    self._testNestedWhileLoopWithMaxItersFromOuterContext()
+
+  @test_util.enable_control_flow_v2
+  def testMap(self):
+    if is_compile_on_demand():
+      self.skipTest("list_ops are not supported in cpu_ondemand")
+    with self.cached_session(), self.test_scope():
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      nums = [1, 2, 3, 4, 5, 6]
+      elems = constant_op.constant(nums, name="data")
+      r = map_fn.map_fn(lambda x: math_ops.multiply(math_ops.add(x, 3), 2),
+                        elems)
+      self.assertAllEqual(r, np.array([(x + 3) * 2 for x in nums]))
+      xla_context.Exit()
+
+
+def is_compile_on_demand():
+  return ("TF_XLA_FLAGS" in os.environ and
+          "tf_xla_compile_on_demand" in os.environ["TF_XLA_FLAGS"])
+
+
+if __name__ == "__main__":
+  os.environ["TF_XLA_FLAGS"] = ("--tf_xla_min_cluster_size=2 " +
+                                os.environ.get("TF_XLA_FLAGS", ""))
   test.main()
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..12a51f7d32dc08f9ed40bdf57350ae451c90ce83
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -0,0 +1,447 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_shared_object",
+    "tf_cc_test",
+    "tf_copts",
+    "tf_cuda_library",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        "//tensorflow/core:gpu_init",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "trt_op_kernels",
+    srcs = [
+        "kernels/get_serialized_resource_op.cc",
+        "kernels/trt_engine_op.cc",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":trt_allocator",
+        ":trt_conversion",
+        ":trt_logging",
+        ":trt_plugins",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+    alwayslink = 1,
+)
+
+tf_cc_shared_object(
+    name = "python/ops/libtftrt.so",
+    copts = tf_copts(is_external = True),
+    linkopts = ["-lm"],
+    deps = [
+        ":trt_op_kernels",
+        ":trt_op_libs",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_cuda_cc_test(
+    name = "get_serialized_resource_op_test",
+    size = "small",
+    srcs = ["kernels/get_serialized_resource_op_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        # TODO(laigd): consider splitting get_serialized_resource_op out from
+        # TF-TRT.
+        ":trt_op_kernels",
+        ":trt_op_libs",
+        ":trt_resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "trt_engine_op",
+        "get_serialized_resource_op",
+    ],
+)
+
+cc_library(
+    name = "trt_op_libs",
+    deps = [
+        ":get_serialized_resource_op_op_lib",
+        ":trt_engine_op_op_lib",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_logging",
+    srcs = ["utils/trt_logger.cc"],
+    hdrs = ["utils/trt_logger.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_gen_op_wrapper_py(
+    name = "trt_ops",
+    deps = [
+        ":trt_op_libs",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "trt_ops_loader",
+    srcs = ["python/ops/trt_ops.py"],
+    dso = [
+        "python/ops/libtftrt.so",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+    kernels = [
+        ":trt_op_kernels",
+        ":trt_op_libs",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trt_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_resources",
+    srcs = [
+        "utils/trt_int8_calibrator.cc",
+        "utils/trt_resources.cc",
+    ],
+    hdrs = [
+        "utils/trt_int8_calibrator.h",
+        "utils/trt_lru_cache.h",
+        "utils/trt_resources.h",
+    ],
+    deps = [
+        ":trt_allocator",
+        ":trt_logging",
+        ":utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_library(
+    name = "trt_allocator",
+    srcs = ["utils/trt_allocator.cc"],
+    hdrs = ["utils/trt_allocator.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cc_test(
+    name = "trt_allocator_test",
+    size = "small",
+    srcs = ["utils/trt_allocator_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "trt_lru_cache_test",
+    size = "small",
+    srcs = ["utils/trt_lru_cache_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_resources",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Library for the node-level conversion portion of TensorRT operation creation
+tf_cuda_library(
+    name = "trt_conversion",
+    srcs = [
+        "convert/convert_graph.cc",
+        "convert/convert_nodes.cc",
+        "convert/trt_optimization_pass.cc",
+    ],
+    hdrs = [
+        "convert/convert_graph.h",
+        "convert/convert_nodes.h",
+        "convert/trt_optimization_pass.h",
+    ],
+    deps = [
+        ":segment",
+        ":trt_allocator",
+        ":trt_plugins",
+        ":trt_logging",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "convert_graph_test",
+    size = "medium",
+    srcs = ["convert/convert_graph_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_conversion",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "convert_nodes_test",
+    size = "medium",
+    srcs = ["convert/convert_nodes_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_logging",
+        ":trt_conversion",
+        ":trt_plugins",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+# Library for the segmenting portion of TensorRT operation creation
+cc_library(
+    name = "segment",
+    srcs = ["segment/segment.cc"],
+    hdrs = [
+        "segment/segment.h",
+        "segment/union_find.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "segment_test",
+    size = "small",
+    srcs = ["segment/segment_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":segment",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Library for the plugin factory
+tf_cuda_library(
+    name = "trt_plugins",
+    srcs = [
+        "plugin/trt_plugin.cc",
+        "plugin/trt_plugin_factory.cc",
+        "plugin/trt_plugin_utils.cc",
+    ],
+    hdrs = [
+        "plugin/trt_plugin.h",
+        "plugin/trt_plugin_factory.h",
+        "plugin/trt_plugin_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "trt_plugin_factory_test",
+    size = "small",
+    srcs = ["plugin/trt_plugin_factory_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_plugins",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "utils",
+    srcs = ["convert/utils.cc"],
+    hdrs = ["convert/utils.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_proto_parsing",
+    ],
+)
+
+cc_library(
+    name = "py_utils",
+    srcs = ["utils/py_utils.cc"],
+    hdrs = ["utils/py_utils.h"],
+    copts = tf_copts(),
+    deps = if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
similarity index 73%
rename from tensorflow/contrib/tensorrt/convert/convert_graph.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index e2350f69a6ace005c319aa97efa99f5e36e6a831..f2a6b74ec248cf9f2ee66de830900a9d7841b55e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <fstream>
 #include <list>
@@ -24,13 +24,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
@@ -63,100 +62,15 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-
-// Returns compiled TRT version information {Maj, Min, Patch}
-std::vector<int> GetLinkedTensorRTVersion() {
-  return {NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
-}
-
-// Returns loaded TRT library version {Maj, Min, Patch}
-std::vector<int> GetLoadedTensorRTVersion() {
-  int ver = getInferLibVersion();
-  int ver_major = ver / 1000;
-  ver = ver - ver_major * 1000;
-  int ver_minor = ver / 100;
-  int ver_patch = ver - ver_minor * 100;
-  return {ver_major, ver_minor, ver_patch};
-}
+using absl::StrAppend;
+using absl::StrCat;
 
 TrtCandidateSelector::TrtCandidateSelector(
-    const grappler::GraphProperties& graph_properties, int precision_mode)
+    const grappler::GraphProperties& graph_properties,
+    TrtPrecisionMode precision_mode)
     : graph_properties_(graph_properties), precision_mode_(precision_mode) {}
 
-Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
-  // TODO(laigd): move this set to TrtNodeValidator where it should belong.
-  // LINT.IfChange
-  static const std::set<string> candidate_ops = {
-      "Abs",
-      "Add",
-      "AvgPool",
-      "BatchMatMul",
-      "BiasAdd",
-      "ConcatV2",
-      "Const",
-      "Conv2D",
-      "DepthwiseConv2dNative",
-      "Div",
-      "Exp",
-      "ExpandDims",
-      "FusedBatchNorm",
-      "FusedBatchNormV2",
-      "Identity",
-      "Log",
-      "MatMul",
-      "Max",
-      "MaxPool",
-      "Maximum",
-      "Mean",
-      "Min",
-      "Minimum",
-      "Mul",
-      "Neg",
-      "Pad",
-      "Prod",
-      "RealDiv",
-      "Reciprocal",
-      "Relu",
-      "Relu6",
-      "Reshape",
-      "Rsqrt",
-      "Rsqrt",
-      "Sigmoid",
-      "Snapshot",
-      "Softmax",
-      "Sqrt",
-      "Square",
-      "Squeeze",
-      "StridedSlice",
-      "Sub",
-      "Sum",
-      "Tanh",
-      "TopKV2",
-      "Transpose",
-  };
-  bool is_supported_op_type =
-      (candidate_ops.count(node->type_string()) ||
-       PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
-  static const std::set<string> quantize_ops = {
-      "QuantizeAndDequantizeV2",
-      "QuantizeAndDequantizeV3",
-      "FakeQuantWithMinMaxVars",
-      "FakeQuantWithMinMaxArgs",
-  };
-  // In INT8 mode, we will always apply the quantization ranges provided by
-  // these ops to the relevant tensors. This happens regardless of the value of
-  // use_calibration.
-  if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
-    is_supported_op_type = true;
-  }
-  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
-  if (!is_supported_op_type) {
-    return errors::Unimplemented("Op type ", node->type_string(),
-                                 " is not supported");
-  }
-
+Status TrtCandidateSelector::IsTensorRTCandidate(const Node* node) {
   std::vector<const Edge*> input_edges;
   TF_RETURN_IF_ERROR(node->input_edges(&input_edges));
   std::vector<std::pair<const NodeDef*, int>> input_node_and_ports;
@@ -166,83 +80,32 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
                                       input_edge->src_output());
   }
   return validator_.ValidateNode(node->def(), input_node_and_ports,
-                                 graph_properties_);
+                                 precision_mode_, graph_properties_);
 }
 
 namespace {
 
-tensorflow::Status BuildNodeMap(
-    const tensorflow::Graph& graph,
-    std::unordered_map<string, tensorflow::Node*>* node_map) {
+Status BuildNodeMap(const Graph& graph,
+                    std::unordered_map<string, Node*>* node_map) {
   for (auto* node : graph.op_nodes()) {
     if (!node_map->insert({node->name(), node}).second) {
-      return tensorflow::errors::AlreadyExists(
-          "Node name is not unique in graph: " + node->name());
+      return errors::AlreadyExists("Node name is not unique in graph: " +
+                                   node->name());
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace
 
-// Function to get calibration from ResourceMgr and put them into nodedef.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
-    bool is_dyn_op) {
-  LOG(INFO) << "Starting Calib Conversion";
-  infer_graph->CopyFrom(graph_def);
-  auto trt_rm = TRTResourceManager::instance();
-  auto calib_rm = trt_rm->getManager("TRTCalibration");
-  int num_nodes = infer_graph->node_size();
-  if (!is_dyn_op) {
-    LOG(WARNING) << "Construction of static int8 engine is not implemented "
-                    "yet!. Dynamic engine will be constructed";
-  }
-  for (int i = 0; i < num_nodes; ++i) {
-    auto n = infer_graph->mutable_node(i);
-    if (n->op() == "TRTEngineOp") {
-      VLOG(1) << "Processing " << n->name();
-      const string& container_name = n->attr().at("segment_funcdef_name").s();
-      TRTCalibrationResource* cres = nullptr;
-      auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
-      if (!status.ok()) {
-        LOG(ERROR) << "Could not get Calibration information. Did you run with "
-                      "calibration data?";
-        return tensorflow::errors::FailedPrecondition(
-            "Need to run graph with calibration data first!");
-      }
-      if (cres->calibrator_) {
-        cres->calibrator_->waitAndSetDone();
-        cres->thr_->join();
-        const auto& calibration_table =
-            cres->calibrator_->getCalibrationTableAsString();
-        if (!calibration_table.size()) {
-          LOG(ERROR) << "Calibration table is empty";
-          return tensorflow::errors::Unknown(
-              "Calibration table is missing. This shouldn't have happened!");
-        }
-        n->mutable_attr()->at("calibration_data").set_s(calibration_table);
-      } else {
-        LOG(ERROR) << "Can't get TRTCalibrator from resource manager!";
-        return tensorflow::errors::Unknown(
-            "Can't get TRTCalibrator from resource manager!");
-      }
-      cres->Unref();
-      TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertGraphDefToTensorRT(
-    const tensorflow::GraphDef& graph_def,
-    const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode, int minimum_segment_size, bool is_dyn_op,
-    int max_cached_engines, std::vector<int> cached_engine_batches,
-    bool use_calibration) {
+Status ConvertGraphDefToTensorRT(
+    const GraphDef& graph_def, const std::vector<string>& output_names,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    GraphDef* new_graph_def, TrtPrecisionMode precision_mode,
+    int minimum_segment_size, bool is_dyn_op, int max_cached_engines,
+    std::vector<int> cached_engine_batches, bool use_calibration) {
   // Create GrapplerItem.
-  tensorflow::grappler::GrapplerItem item;
+  grappler::GrapplerItem item;
   item.fetch = output_names;
   item.graph = graph_def;
 
@@ -256,13 +119,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // Create single machine cluster. Note that this will create a session and
   // initialize the gpu devices.
   const int num_cpu_cores =
-      tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  const int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
+      grappler::GetNumAvailableLogicalCPUCores();
+  const int num_gpus = grappler::GetNumAvailableGPUs();
   VLOG(2) << "cpu_cores: " << num_cpu_cores;
   VLOG(2) << "gpus: " << num_gpus;
   const int timeout_s = 60 * 10;
-  std::unique_ptr<tensorflow::grappler::Cluster> cluster(
-      new tensorflow::grappler::SingleMachine(
+  std::unique_ptr<grappler::Cluster> cluster(
+      new grappler::SingleMachine(
           timeout_s, num_cpu_cores, num_gpus));
   // These settings are the defaults in tensorflow/python/grappler/cluster.py.
   cluster->DisableDetailedStats(true);
@@ -273,18 +136,17 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // Create virtual cluster. Grappler requires a virtual cluster with a proper
   // GPU device in order to calculate flops>0 or fails with FATAL in dbg mode.
   // We add numbers from a Pascal card here to have flops>0.
-  tensorflow::DeviceProperties device_properties;
+  DeviceProperties device_properties;
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
   device_properties.set_num_cores(3584);
   device_properties.set_frequency(1531);
-  std::unique_ptr<tensorflow::grappler::Cluster> cluster(
-      new tensorflow::grappler::VirtualCluster(
-          {{"/GPU:0", device_properties}}));
+  std::unique_ptr<grappler::Cluster> cluster(
+      new grappler::VirtualCluster({{"/GPU:0", device_properties}}));
 #endif
 
   // Create RewriterConfig.
-  tensorflow::ConfigProto config_proto;
+  ConfigProto config_proto;
   auto& rw_cfg =
       *config_proto.mutable_graph_options()->mutable_rewrite_options();
   // TODO(aaroey): use only const folding and layout for the time being since
@@ -298,7 +160,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   parameters["max_batch_size"].set_i(max_batch_size);
   parameters["is_dynamic_op"].set_b(is_dyn_op);
   parameters["max_workspace_size_bytes"].set_i(max_workspace_size_bytes);
-  TF_RETURN_IF_ERROR(GetPrecisionModeName(
+  TF_RETURN_IF_ERROR(TrtPrecisionModeToName(
       precision_mode, parameters["precision_mode"].mutable_s()));
   parameters["maximum_cached_engines"].set_i(max_cached_engines);
   if (!cached_engine_batches.empty()) {
@@ -310,7 +172,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   parameters["use_calibration"].set_b(use_calibration);
 
   // Run optimizer.
-  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto);
+  grappler::MetaOptimizer meta_opt(nullptr, config_proto);
   TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
 
   if (VLOG_IS_ON(5)) {
@@ -324,20 +186,18 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 }
 
 struct EdgePtrCompare {
-  bool operator()(const tensorflow::Edge* lhs,
-                  const tensorflow::Edge* rhs) const {
+  bool operator()(const Edge* lhs, const Edge* rhs) const {
     return lhs->id() < rhs->id();
   }
 };
 
 // Function to get subsegment information structure.
-tensorflow::Status GetEngineInfo(
-    const tensorflow::Graph* g,
-    const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<const Node*>& segment_nodes,
-    const std::unordered_map<string, tensorflow::Node*>& node_map,
-    const std::vector<tensorflow::Node*>& reverse_topo_order,
-    EngineInfo* info) {
+Status GetEngineInfo(const Graph* g,
+                     const grappler::GraphProperties& graph_properties,
+                     const std::set<const Node*>& segment_nodes,
+                     const std::unordered_map<string, Node*>& node_map,
+                     const std::vector<Node*>& reverse_topo_order,
+                     EngineInfo* info) {
   std::vector<const Node*> subgraph_nodes;  // Topologically sorted nodes.
   std::set<const Node*> added_const_nodes;  // Used to prevent double insertion.
   std::set<string> segment_devices;
@@ -384,8 +244,8 @@ tensorflow::Status GetEngineInfo(
 
     // Create input connections. Sort edges first to make determnistic since
     // in_edges is a set of pointers.
-    std::vector<const tensorflow::Edge*> in_edges(node->in_edges().begin(),
-                                                  node->in_edges().end());
+    std::vector<const Edge*> in_edges(node->in_edges().begin(),
+                                      node->in_edges().end());
     std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
     for (const auto edge : in_edges) {
       auto input_node = edge->src();
@@ -436,8 +296,8 @@ tensorflow::Status GetEngineInfo(
     }
     // Create output connections. Sort edges first to make determnistic since
     // out_edges is a set of pointers.
-    std::vector<const tensorflow::Edge*> out_edges(node->out_edges().begin(),
-                                                   node->out_edges().end());
+    std::vector<const Edge*> out_edges(node->out_edges().begin(),
+                                       node->out_edges().end());
     std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
     for (const auto edge : out_edges) {
       auto output_node = edge->dst();
@@ -471,9 +331,13 @@ tensorflow::Status GetEngineInfo(
   // Construct the const nodes first.
   subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
                         added_const_nodes.end());
+  string scope_name;
   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
       g, graph_properties, subgraph_nodes, &info->connections,
-      &info->segment_graph_def, &info->engine_name));
+      &info->segment_graph_def, &scope_name));
+  info->engine_name = StrCat(scope_name, info->engine_name);
+  VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name
+          << "' to a GraphDef";
   // TODO(sami): This should not happen once segmenter is updated.
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
@@ -496,7 +360,7 @@ void UpdateToEngineNode(const std::vector<EngineInfo>& infos,
                         const size_t my_engine_id,
                         const std::vector<Node*>& engine_nodes,
                         const bool is_input_edge, const string& node_name,
-                        tensorflow::Node** node, int* port) {
+                        Node** node, int* port) {
   for (size_t t = 0; t < infos.size(); ++t) {
     if (t == my_engine_id) {
       continue;
@@ -533,20 +397,20 @@ void UpdateToEngineNode(const std::vector<EngineInfo>& infos,
 //         one). Connect to the pre-existing engine node instead.
 // 3. In this way, we ensure the graph is topologically sort-able after each
 //    invocation of CreateTRTNode().
-tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
-                                 int max_batch_size, tensorflow::Graph* graph,
-                                 nvinfer1::IGpuAllocator* alloc,
-                                 std::vector<Node*>* engine_nodes) {
+Status CreateTRTNode(const ConversionParams& params,
+                     const std::vector<EngineInfo>& infos, int pos,
+                     int max_batch_size, Graph* graph,
+                     nvinfer1::IGpuAllocator* alloc,
+                     std::vector<Node*>* engine_nodes) {
   const auto& info = infos.at(pos);
-  TRT_RETURN_IF_TEST_VALUE(StrCat(info.engine_name, ":CreateTRTNode"), "fail");
-  std::vector<tensorflow::TensorShapeProto> output_shape_protos;
-  std::vector<tensorflow::TensorShapeProto> input_shape_protos;
-  std::vector<tensorflow::PartialTensorShape> input_shapes;
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
-  std::vector<tensorflow::Node*> input_nodes;
-  std::vector<tensorflow::Node*> control_input_nodes;
+  std::vector<TensorShapeProto> output_shape_protos;
+  std::vector<TensorShapeProto> input_shape_protos;
+  std::vector<PartialTensorShape> input_shapes;
+  std::vector<NodeDefBuilder::NodeOut> inputs;
+  std::vector<Node*> input_nodes;
+  std::vector<Node*> control_input_nodes;
   std::unordered_set<string> control_input_names;
-  std::vector<tensorflow::DataType> out_types;
+  std::vector<DataType> out_types;
 
   VLOG(1) << "Processing " << info.engine_name;
   // Collect needed info for creating the engine node in the graph
@@ -558,8 +422,8 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       if (!conn.is_input_edge) continue;
 
       // Rewrire control input if it's not found in original graph.
-      tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id);
-      int port = tensorflow::Graph::kControlSlot;
+      Node* input_node = graph->FindNodeId(conn.outside_id);
+      int port = Graph::kControlSlot;
       if (!input_node) {
         UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
                            conn.outside_node_name, &input_node, &port);
@@ -575,7 +439,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       // Data edges
       if (!conn.is_input_edge) {
         // Set the shapes and data types of output edge.
-        tensorflow::TensorShapeProto out_shape;
+        TensorShapeProto out_shape;
         // shape of the output node inside segment
         conn.inside_shape.AsProto(&out_shape);
         if (output_shape_protos.size() <= conn.port_number) {
@@ -586,7 +450,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
         out_types.at(conn.port_number) = conn.connection_type;
       } else {
         // Set the shapes and data types of input edge.
-        tensorflow::TensorShapeProto in_shape;
+        TensorShapeProto in_shape;
         conn.outside_shape.AsProto(&in_shape);
         if (input_shape_protos.size() <= conn.port_number) {
           input_shape_protos.resize(conn.port_number + 1);
@@ -599,7 +463,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
         if (info.engine_type == EngineInfo::EngineType::TRTStatic) {
           for (int i = 1; i < conn.outside_shape.dims(); i++) {
             if (conn.outside_shape.dim_size(i) <= 0) {
-              return tensorflow::errors::Internal(
+              return errors::Internal(
                   "Input shapes must be fully defined when in static mode. "
                   "Please try is_dynamic_op=True (shape was ",
                   conn.outside_shape.DebugString(), ")");
@@ -608,7 +472,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
         }
 
         // Rewrire data input if it's not found in original graph.
-        tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id);
+        Node* input_node = graph->FindNodeId(conn.outside_id);
         int port = conn.outside_port;
         if (!input_node) {
           UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
@@ -631,13 +495,12 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   // avoid crash later. Constant folding should've folded the ops that make up
   // these segments.
   if (inputs.empty()) {
-    return tensorflow::errors::Internal(
-        "Segment has no inputs (possible "
-        "constfold failure)");
+    return errors::Internal(
+        "Segment has no inputs (possible constfold failure)");
   }
 
   const bool calibrate_int8 =
-      (info.precision_mode == INT8MODE && info.use_calibration);
+      (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration);
   // Build the engine and get its serialized representation.
   string segment_string;
   if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) {
@@ -650,14 +513,15 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     // TODO(sami): What happens if 1st dim is not batch?
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
-        info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode,
+        info.segment_graph_def,
+        calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode,
         max_batch_size, info.max_workspace_size_bytes, input_shapes,
         &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
         info.use_calibration,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
-    segment_string =
-        string((const char*)engine_data->data(), engine_data->size());
+    segment_string = string(static_cast<const char*>(engine_data->data()),
+                            engine_data->size());
     if (calibrate_int8) {
       // See above comment about why not putting this inside the 'else' branch.
       segment_string = info.segment_graph_def.SerializeAsString();
@@ -666,15 +530,9 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     segment_string = info.segment_graph_def.SerializeAsString();
   }
 
-  // TODO(aaroey): use enum instead, and add a helper method to do the
-  // conversion.
   string prec_string;
-  TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
-  if (info.precision_mode == INT8MODE && calibrate_int8 &&
-      !TRTResourceManager::instance()->getManager("TRTCalibration")) {
-    LOG(ERROR) << "Failed to construct calibration storage";
-  }
-  tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
+  TF_RETURN_IF_ERROR(TrtPrecisionModeToName(info.precision_mode, &prec_string));
+  NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
   if (!info.device.empty()) node_builder.Device(info.device);
   if (VLOG_IS_ON(1)) {
     string ins = StrCat(info.engine_name, " inputs= ");
@@ -692,14 +550,16 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       !info.cached_engine_batches.empty()) {
     LOG(WARNING) << "Cached engine batches are ignored for static engines";
   }
-  tensorflow::NodeDef trt_node;
-  tensorflow::Status status =
+  NodeDef trt_node;
+  Status status =
       node_builder.Attr("input_shapes", input_shape_protos)
           .Attr("output_shapes", output_shape_protos)
           .Attr("static_engine",
                 info.engine_type == EngineInfo::EngineType::TRTStatic)
           .Attr("segment_funcdef_name",
-                StrCat(info.engine_name, "_native_segment"))
+                params.use_function_backup
+                    ? StrCat(info.engine_name, "_native_segment")
+                    : "")
           .Attr("serialized_segment", segment_string)
           .Attr("calibration_data", "")
           .Attr("max_cached_engines_count", info.maximum_cached_engines)
@@ -718,7 +578,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   // here, this segment will be skipped
   // TODO(aaroey): let it return proper error status for the following logic
   // instead of checking fail.
-  tensorflow::Node* engine_node = graph->AddNode(trt_node, &status);
+  Node* engine_node = graph->AddNode(trt_node, &status);
   (*engine_nodes)[pos] = engine_node;
   if (!status.ok()) {
     LOG(ERROR) << "Adding node failed " << status;
@@ -745,7 +605,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     if (conn.is_input_edge) {
       continue;
     }
-    tensorflow::Node* output_node = graph->FindNodeId(conn.outside_id);
+    Node* output_node = graph->FindNodeId(conn.outside_id);
     int port = conn.outside_port;
     if (!output_node) {
       UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/false,
@@ -768,20 +628,19 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
 }
 
 // Function to construct a funcdef from the segment and add it to the graph.
-tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
-    tensorflow::Graph* graph, const tensorflow::GraphDef& segment,
-    const string& engine_name) {
-  tensorflow::Graph sgraph(graph->flib_def());
-  tensorflow::GraphConstructorOptions gcopts;
-  TF_RETURN_IF_ERROR(
-      tensorflow::ConvertGraphDefToGraph(gcopts, segment, &sgraph));
-  std::map<string, tensorflow::Node*> io_nodes;
+Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph,
+                                                const GraphDef& segment,
+                                                const string& engine_name) {
+  Graph sgraph(graph->flib_def());
+  GraphConstructorOptions gcopts;
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, &sgraph));
+  std::map<string, Node*> io_nodes;
   int num_inputs = 0;
   for (auto n : sgraph.op_nodes()) {
-    if (tensorflow::str_util::StartsWith(n->name(), kInputPHName)) {
+    if (str_util::StartsWith(n->name(), kInputPHName)) {
       num_inputs++;
       io_nodes.insert({n->name(), n});
-    } else if (tensorflow::str_util::StartsWith(n->name(), kOutputPHName)) {
+    } else if (str_util::StartsWith(n->name(), kOutputPHName)) {
       io_nodes.insert({n->name(), n});
     }
   }
@@ -789,14 +648,14 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
   for (int i = 0; i < num_inputs; ++i) {
     auto name = StrCat(kInputPHName, i);
     auto node = io_nodes[name];
-    tensorflow::NodeDef nd;
-    tensorflow::NodeDefBuilder node_builder(
-        StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp);
+    NodeDef nd;
+    NodeDefBuilder node_builder(StrCat(name, "_Arg"),
+                                FunctionLibraryDefinition::kArgOp);
     VLOG(1) << "Adding " << StrCat(name, "_Arg");
     TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
                            .Attr("index", i)
                            .Finalize(&nd));
-    tensorflow::Status s;
+    Status s;
     auto node_arg = sgraph.AddNode(nd, &s);
     if (!s.ok()) {
       LOG(ERROR) << "Couldn't add _Arg node for " << name;
@@ -816,15 +675,14 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
   for (int i = 0; i < io_nodes.size() - num_inputs; ++i) {
     auto name = StrCat(kOutputPHName, i);
     auto node = io_nodes[name];
-    tensorflow::NodeDef nd;
-    tensorflow::NodeDefBuilder node_builder(
-        StrCat(name, "_Ret"), tensorflow::FunctionLibraryDefinition::kRetOp);
+    NodeDef nd;
+    NodeDefBuilder node_builder(StrCat(name, "_Ret"),
+                                FunctionLibraryDefinition::kRetOp);
     auto edge = *(node->in_edges().begin());
-    tensorflow::NodeDefBuilder::NodeOut nout(
-        edge->src()->name(), edge->src_output(),
-        edge->src()->output_type(edge->src_output()));
+    NodeDefBuilder::NodeOut nout(edge->src()->name(), edge->src_output(),
+                                 edge->src()->output_type(edge->src_output()));
     VLOG(1) << " input " << nout.node << ":" << nout.index
-            << " dtype=" << tensorflow::DataTypeString(nout.data_type);
+            << " dtype=" << DataTypeString(nout.data_type);
     // nvcc complains that Input(<brace-enclosed initializer list>) is
     // ambiguous, so do not use Input({nout}).
     node_builder.Input(nout);
@@ -834,7 +692,7 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
     if (VLOG_IS_ON(3)) {
       VLOG(3) << nd.DebugString();
     }
-    tensorflow::Status s;
+    Status s;
     auto node_ret = sgraph.AddNode(nd, &s);
     if (!s.ok()) {
       LOG(ERROR) << "Couldn't add _Ret node for " << name;
@@ -850,23 +708,29 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
     }
     sgraph.RemoveNode(node);
   }
-  tensorflow::FunctionDefLibrary fdeflib;
+  FunctionDefLibrary fdeflib;
   auto native_segment = fdeflib.add_function();
-  TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef(
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(
       sgraph, StrCat(engine_name, "_native_segment"), native_segment));
+  // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on
+  // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32
+  // would be on host if the op generating the tensor has host memory tag set.
+  (*native_segment
+        ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr]
+      .set_b(true);
   if (VLOG_IS_ON(7)) {
     VLOG(7) << engine_name << " Function_Def ";
     VLOG(7) << native_segment->DebugString();
   }
   VLOG(1) << "Adding funcdef to graphlib";
   TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
-    const ConversionParams& params, const EngineInfo& engine) {
+std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
+                                                 const EngineInfo& engine) {
   int cuda_device_id = -1;
-  tensorflow::Allocator* dev_allocator = nullptr;
+  Allocator* dev_allocator = nullptr;
   if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
       engine.device.empty()) {
     // If device is not set, use the first found GPU device for the conversion.
@@ -894,7 +758,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 
   // Use the device requested by the engine.
   auto device_set = params.cluster->GetDeviceSet();
-  std::vector<tensorflow::Device*> devices;
+  std::vector<Device*> devices;
   DeviceNameUtils::ParsedName parsed_name;
   if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
       parsed_name.has_id) {
@@ -908,7 +772,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
       StrAppend(&msg, ". Will get the allocator from first one.");
       LOG(WARNING) << msg;
     }
-    tensorflow::AllocatorAttributes alloc_attr;
+    AllocatorAttributes alloc_attr;
     cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
     dev_allocator = devices[0]->GetAllocator(alloc_attr);
     VLOG(1) << "Using allocator " << dev_allocator->Name()
@@ -921,26 +785,38 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 }
 
 // Entry function from optimization pass.
-// TODO(aaeory): parameter should use pointer type.
-tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
+Status ConvertAfterShapes(const ConversionParams& params) {
+  // Sanity checks.
+  if (params.precision_mode == TrtPrecisionMode::INT8) {
+    if (params.use_calibration && !params.use_function_backup) {
+      return errors::InvalidArgument(
+          "Calibration requires enabling fallback to TF function execution.");
+    }
+  } else {
+    if (params.use_calibration) {
+      return errors::InvalidArgument(
+          "Calibration with FP32 or FP16 is not supported.");
+    }
+  }
+
   // Convert graphdef to graph.
-  tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
-                                             params.input_graph_def->library());
-  tensorflow::Graph graph(flib);
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
-      tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph));
+  FunctionLibraryDefinition flib(OpRegistry::Global(),
+                                 params.input_graph_def->library());
+  Graph graph(flib);
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
+                                            *params.input_graph_def, &graph));
 
   // Segment the graph into subgraphs that can be converted to TensorRT
-  tensorflow::tensorrt::segment::SegmentOptions segment_options;
+  segment::SegmentOptions segment_options;
   // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
   for (auto node : *(params.output_names)) {
     segment_options.exclude_node_list.insert(node);
   }
   segment_options.minimum_segment_size = params.minimum_segment_size;
-  tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
+  segment::SegmentNodesVector initial_segments;
   TrtCandidateSelector candidate_selector(*params.graph_properties,
                                           params.precision_mode);
-  TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
+  TF_RETURN_IF_ERROR(segment::SegmentGraph(
       &graph,
       std::bind(&TrtCandidateSelector::IsTensorRTCandidate, &candidate_selector,
                 std::placeholders::_1),
@@ -952,20 +828,21 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
             << initial_segments.size();
 
   // Get the EngineInfo for each segment.
-  std::unordered_map<string, tensorflow::Node*> node_map;
+  std::unordered_map<string, Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
   float total_num_nodes_in_segments = 0.;
   std::vector<EngineInfo> engine_segments;
   engine_segments.reserve(initial_segments.size());
-  std::vector<tensorflow::Node*> reverse_topo_order;
-  tensorflow::GetPostOrder(graph, &reverse_topo_order);
+  std::vector<Node*> reverse_topo_order;
+  GetPostOrder(graph, &reverse_topo_order);
   size_t total_engine_bytes_size = 0;
   std::vector<size_t> engine_bytes_size;
-  tensorflow::tensorrt::segment::SegmentNodesVector converted_segments;
+  segment::SegmentNodesVector converted_segments;
   converted_segments.reserve(initial_segments.size());
   for (size_t t = 0; t < initial_segments.size(); t++) {
     auto& curr_segment = initial_segments.at(t);
     EngineInfo curr_engine;
+    curr_engine.engine_name = StrCat("TRTEngineOp_", t);
     Status status =
         GetEngineInfo(&graph, *params.graph_properties, curr_segment.first,
                       node_map, reverse_topo_order, &curr_engine);
@@ -975,23 +852,20 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       continue;
     }
     curr_engine.precision_mode = params.precision_mode;
-    if (params.use_calibration && params.precision_mode != INT8MODE) {
-      return errors::InvalidArgument(
-          "Calibration with FP32 or FP16 is not supported.");
-    }
     curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration)
                                    ? EngineInfo::EngineType::TRTDynamic
                                    : EngineInfo::EngineType::TRTStatic);
     curr_engine.use_calibration = params.use_calibration;
     curr_engine.cached_engine_batches = params.cached_engine_batches;
     curr_engine.maximum_cached_engines = params.max_cached_engines;
-    StrAppend(&curr_engine.engine_name, "TRTEngineOp_", t);
-    status = RegisterSegmentFunctionToFunctionLibrary(
-        &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
-    if (!status.ok()) {
-      LOG(WARNING) << "Failed to register segment graphdef as a function " << t
-                   << ": " << status;
-      continue;
+    if (params.use_function_backup) {
+      status = RegisterSegmentFunctionToFunctionLibrary(
+          &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
+      if (!status.ok()) {
+        LOG(WARNING) << "Failed to register segment graphdef as a function "
+                     << t << ": " << status;
+        continue;
+      }
     }
 
     engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong());
@@ -1042,8 +916,9 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       LOG(WARNING) << "Can't identify the cuda device. Running on device 0 ";
     }
     cudaSetDevice(cuda_device_id);
-    auto status = CreateTRTNode(engine_segments, i, params.max_batch_size,
-                                &graph, alloc.get(), &engine_nodes);
+    auto status =
+        CreateTRTNode(params, engine_segments, i, params.max_batch_size, &graph,
+                      alloc.get(), &engine_nodes);
 
     string msg = StrCat("TensorRT node ", engine.engine_name,
                         " added for segment ", i, " consisting of ",
@@ -1073,7 +948,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   cudaSetDevice(old_cuda_device);
   graph.ToGraphDef(params.output_graph_def);
   VLOG(1) << "Returning from conversion";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace convert
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..88fda49d38ef38522b8d21b73c0fd0061509c8d1
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+// Helper class for the segmenter to determine whether given TF node is
+// supported by TRT.
+class TrtCandidateSelector {
+ public:
+  TrtCandidateSelector(const grappler::GraphProperties& graph_properties,
+                       TrtPrecisionMode precision_mode);
+
+  // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added
+  // to TRT subgraph and later converted into TRT engine.
+  Status IsTensorRTCandidate(const Node* node);
+
+ private:
+  // The TF-TRT node converter used to verify whether individual node is
+  // supported. It will operate in validation-only mode.
+  TrtNodeValidator validator_;
+
+  // GraphProperties of the graph whose nodes are to be validated by
+  // IsTensorRTCandidate().
+  const grappler::GraphProperties& graph_properties_;
+
+  // Quantization ops are only converted when using quantized precisions.
+  const TrtPrecisionMode precision_mode_;
+};
+
+struct ConversionParams {
+  const GraphDef* input_graph_def = nullptr;
+  const std::vector<string>* output_names = nullptr;
+  size_t max_batch_size = 1;
+  size_t max_workspace_size_bytes = 1 << 30;
+  GraphDef* output_graph_def = nullptr;
+  TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32;
+  int minimum_segment_size = 3;
+  const grappler::GraphProperties* graph_properties = nullptr;
+  const grappler::Cluster* cluster = nullptr;
+  // Whether to create engine on conversion or execution time
+  bool is_dyn_op = false;
+  // maximum number of cached engines
+  int max_cached_engines = 1;
+  bool use_calibration = true;
+  // list of cached engines
+  std::vector<int> cached_engine_batches;
+  // Whether to use function fallback for TRTEngineOp
+  bool use_function_backup = true;
+};
+
+// - max_batch_size: maximum batch size which can be used for inference for
+//   optimization targets inference run with max batch size.
+// - max_workspace_size_bytes: The upper bound of memory allowance for engine
+//   building.
+Status ConvertGraphDefToTensorRT(
+    const GraphDef& graph_def, const std::vector<string>& output_names,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    GraphDef* new_graph_def,
+    TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32,
+    int minimum_segment_size = 3, bool is_dyn_op = false,
+    int max_cached_engines = 1, std::vector<int> cached_engine_batches = {},
+    bool use_calibration = true);
+
+// Method to call from optimization pass
+Status ConvertAfterShapes(const ConversionParams& params);
+
+// Helper method for the conversion, expose for testing.
+std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
+                                                 const EngineInfo& engine);
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
similarity index 95%
rename from tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index 2d2bfeb192c1893824c7b30bfad593c62c203392..1a754181debf41865190aa7f9ca6a76efea98181 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -75,7 +75,7 @@ TEST(TrtCandidateSelector, Basics) {
                                          feed, const_1, matmul_attrs);
 
   // Unsupported op.
-  auto unsupported_op = ops::Sin(s.WithOpName("sin"), feed);
+  auto unsupported_op = ops::Erf(s.WithOpName("sin"), feed);
 
   // Incompatible input.
   auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE);
@@ -98,7 +98,8 @@ TEST(TrtCandidateSelector, Basics) {
   grappler::GraphProperties graph_properties(item);
   TF_EXPECT_OK(graph_properties.InferStatically(true));
 
-  for (const int precision_mode : {FP32MODE, INT8MODE}) {
+  for (const TrtPrecisionMode precision_mode :
+       {TrtPrecisionMode::FP32, TrtPrecisionMode::INT8}) {
     TrtCandidateSelector selector(graph_properties, precision_mode);
     TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
     ExpectStatus(
@@ -107,13 +108,13 @@ TEST(TrtCandidateSelector, Basics) {
         "transpose_a is not supported for TensorRT FullyConnected "
         "(op: MatMul), at: incompatible_matmul");
     ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
-                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+                 error::UNIMPLEMENTED, "Op type Erf is not supported");
     ExpectStatus(
         selector.IsTensorRTCandidate(
             matmul_with_incompatible_input.operation.node()),
         error::INTERNAL,
         "Failed to convert input with index 0 to a TRT_TensorOrWeights");
-    if (precision_mode == INT8MODE) {
+    if (precision_mode == TrtPrecisionMode::INT8) {
       TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node()));
     } else {
       ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()),
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
similarity index 68%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 8b7279ad0336b3c791c130a840b7926e6b99df94..8aeecaff92531ffceda4ae878bb104830c8649d9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <algorithm>
 #include <cstring>
@@ -24,11 +24,13 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"        // NOLINT
@@ -43,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -54,10 +57,10 @@ limitations under the License.
 // would work!
 #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 
-#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                                \
-  do {                                                                    \
-    return tensorflow::errors::Internal(                                  \
-        "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \
+#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                           \
+  do {                                                               \
+    return errors::Internal("TFTRT::", __FUNCTION__,                 \
+                            " failed to add TRT layer, at: ", node); \
   } while (0)
 
 #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \
@@ -80,32 +83,119 @@ namespace tensorrt {
 const char* const kInputPHName = "TensorRTInputPH_";
 const char* const kOutputPHName = "TensorRTOutputPH_";
 
+bool IsEngineInput(absl::string_view name) {
+  return absl::StartsWith(name, kInputPHName);
+}
+bool IsEngineOutput(absl::string_view name) {
+  return absl::StartsWith(name, kOutputPHName);
+}
+
 namespace convert {
-using ::tensorflow::str_util::Split;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
-inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
-                                       nvinfer1::DataType* trt_dtype) {
+inline Status ConvertDType(DataType tf_dtype, nvinfer1::DataType* trt_dtype) {
   switch (tf_dtype) {
-    case tensorflow::DataType::DT_FLOAT:
+    case DataType::DT_FLOAT:
       *trt_dtype = nvinfer1::DataType::kFLOAT;
       break;
     // TODO(aaroey): this should be DT_QINT8 which is not a well supported type.
-    case tensorflow::DataType::DT_INT8:
+    case DataType::DT_INT8:
       *trt_dtype = nvinfer1::DataType::kINT8;
       break;
-    case tensorflow::DataType::DT_HALF:
+    case DataType::DT_HALF:
       *trt_dtype = nvinfer1::DataType::kHALF;
       break;
-    case tensorflow::DataType::DT_INT32:
+    case DataType::DT_INT32:
       *trt_dtype = nvinfer1::DataType::kINT32;
       break;
     default:
-      return tensorflow::errors::InvalidArgument(
-          "Unsupported data type ", tensorflow::DataTypeString(tf_dtype));
+      return errors::InvalidArgument("Unsupported data type ",
+                                     DataTypeString(tf_dtype));
+  }
+  return Status::OK();
+}
+
+class TFAttrs {
+ public:
+  explicit TFAttrs(const NodeDef& tf_node) {
+    for (const auto& attr : tf_node.attr()) {
+      attrs_.insert({attr.first, &attr.second});
+    }
+  }
+
+  bool count(const string& key) const { return attrs_.count(key); }
+
+  AttrValue const* at(const string& key) const {
+    if (!attrs_.count(key)) {
+      LOG(FATAL) << "Attribute not found: " << key;
+    }
+    return attrs_.at(key);
+  }
+
+  template <typename T>
+  T get(const string& key) const;
+
+  template <typename T>
+  T get(const string& key, const T& default_value) const {
+    return attrs_.count(key) ? this->get<T>(key) : default_value;
+  }
+
+  std::vector<string> GetAllAttrKeys() const {
+    std::vector<string> attr_list;
+    for (const auto& attr_item : attrs_) {
+      attr_list.emplace_back(attr_item.first);
+    }
+    return attr_list;
   }
-  return tensorflow::Status::OK();
+
+ private:
+  typedef std::map<string, AttrValue const*> AttrMap;
+  AttrMap attrs_;
+};
+
+template <>
+string TFAttrs::get<string>(const string& key) const {
+  return this->at(key)->s();
+}
+
+template <>
+std::vector<int64> TFAttrs::get<std::vector<int64>>(const string& key) const {
+  auto attr = this->at(key)->list().i();
+  return std::vector<int64>(attr.begin(), attr.end());
+}
+
+template <>
+std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
+  auto attr = this->at(key)->list().f();
+  return std::vector<float>(attr.begin(), attr.end());
+}
+
+template <>
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
+  nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
+  TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
+  return trt_dtype;
+}
+
+template <>
+DataType TFAttrs::get<DataType>(const string& key) const {
+  return this->at(key)->type();
+}
+
+template <>
+float TFAttrs::get<float>(const string& key) const {
+  return this->at(key)->f();
+}
+
+template <>
+bool TFAttrs::get<bool>(const string& key) const {
+  return this->at(key)->b();
+}
+
+template <>
+int64 TFAttrs::get<int64>(const string& key) const {
+  return this->at(key)->i();
 }
 
 template <typename TensorShapeType>
@@ -126,13 +216,23 @@ Status TensorShapeArrayToTrtDims(const std::vector<int>& shape,
   PartialTensorShape tensor_shape;
   TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(shape, &tensor_shape));
   *out = TensorShapeToTrtDims(tensor_shape, ignore_first_dim);
-  return tensorflow::Status::OK();
+  return Status::OK();
+}
+
+// TODO(laigd): use this utility function in more places.
+Status RemoveBatchDimension(nvinfer1::Dims* dims) {
+  if (dims->nbDims < 2) {
+    return errors::InvalidArgument(
+        "Dropping batch dimension requires dims with rank>=2.");
+  }
+  std::copy(dims->d + 1, dims->d + dims->nbDims, dims->d);
+  dims->nbDims--;
+  return Status::OK();
 }
 
 void GetOutputProperties(const grappler::GraphProperties& graph_properties,
                          const Node* node, const int out_port,
-                         PartialTensorShape* shape,
-                         tensorflow::DataType* dtype) {
+                         PartialTensorShape* shape, DataType* dtype) {
   if (graph_properties.HasOutputProperties(node->name())) {
     auto output_params = graph_properties.GetOutputProperties(node->name());
     auto out_shape = output_params.at(out_port);
@@ -146,8 +246,7 @@ void GetOutputProperties(const grappler::GraphProperties& graph_properties,
 
 void GetInputProperties(const grappler::GraphProperties& graph_properties,
                         const Node* node, const int in_port,
-                        PartialTensorShape* shape,
-                        tensorflow::DataType* dtype) {
+                        PartialTensorShape* shape, DataType* dtype) {
   if (graph_properties.HasInputProperties(node->name())) {
     auto input_params = graph_properties.GetInputProperties(node->name());
     auto in_shape = input_params.at(in_port);
@@ -159,7 +258,7 @@ void GetInputProperties(const grappler::GraphProperties& graph_properties,
 }
 
 Status ValidateTensorProperties(const string& producer_node_type,
-                                const tensorflow::DataType dtype,
+                                const DataType dtype,
                                 const PartialTensorShape& shape,
                                 bool validation_only,
                                 nvinfer1::DataType* trt_dtype,
@@ -183,6 +282,15 @@ Status ValidateTensorProperties(const string& producer_node_type,
   *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true);
   *batch_size = shape.dim_size(0);
 
+  // Don't convert empty tensors (dim value of 0).
+  for (int d = 1; d < shape.dims(); ++d) {
+    if (shape.dim_size(d) == 0) {
+      return errors::Unimplemented(
+          "Input tensor with shape ", shape.DebugString(),
+          " is an empty tensor, which is not supported by TRT");
+    }
+  }
+
   if (validation_only) return Status::OK();
   // Following are validations at runtime.
 
@@ -285,31 +393,31 @@ Status Converter::GetTrtBroadcastShape(
   }
 
   const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
-  auto compute_output_dims =
-      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
-                    int* output_dims_array, nvinfer1::Dims* output_dims) {
-        const nvinfer1::Dims input_dims = input.GetTrtDims();
-        std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
-        std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
-                  output_dims_array + broadcast_num_dims - input_dims.nbDims);
-        if (input.is_tensor()) {
-          const int true_input_dims = input_dims.nbDims + 1;
-          if (true_input_dims < broadcast_num_dims) {
-            return errors::InvalidArgument(
-                "Broadcasting beyond batch dimension is not supported ",
-                "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
-                broadcast_num_dims, ")");
-          }
-          // Set the batch dimension to -1, since batch size is not supposed to
-          // be broadcasted.
-          output_dims_array[0] = -1;
-        }
-        // Copy to output dimensions (stripping the batch dimension).
-        output_dims->nbDims = broadcast_num_dims - 1;
-        std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
-                  output_dims->d);
-        return Status::OK();
-      };
+  auto compute_output_dims = [](const TRT_TensorOrWeights& input,
+                                int broadcast_num_dims, int* output_dims_array,
+                                nvinfer1::Dims* output_dims) {
+    const nvinfer1::Dims input_dims = input.GetTrtDims();
+    std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
+    std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
+              output_dims_array + broadcast_num_dims - input_dims.nbDims);
+    if (input.is_tensor()) {
+      const int true_input_dims = input_dims.nbDims + 1;
+      if (true_input_dims < broadcast_num_dims) {
+        return errors::InvalidArgument(
+            "Broadcasting beyond batch dimension is not supported ",
+            "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
+            broadcast_num_dims, ")");
+      }
+      // Set the batch dimension to -1, since batch size is not supposed to
+      // be broadcasted.
+      output_dims_array[0] = -1;
+    }
+    // Copy to output dimensions (stripping the batch dimension).
+    output_dims->nbDims = broadcast_num_dims - 1;
+    std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
+              output_dims->d);
+    return Status::OK();
+  };
 
   // Compute the output dimensions.
   const int broadcast_num_dims =
@@ -341,14 +449,80 @@ nvinfer1::ITensor* Converter::CreateConstantLayer(
   if (!layer) return nullptr;
   const nvinfer1::DataType trt_dtype = trt_weights.type;
   nvinfer1::ITensor* trt_tensor = layer->getOutput(0);
+#if !IS_TRT_VERSION_GE(5, 1, 3)
   // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set
   // the data type below, it will always be kFLOAT regardless what the data type
   // of the weights is. Once NVIDIA fixes this bug, we should remove the data
   // type setting logic below and test should still pass.
   trt_tensor->setType(trt_dtype);
+#endif
   return trt_tensor;
 }
 
+Status CreateBroadcastableScalarConstant(OpConverterParams* params, float value,
+                                         const nvinfer1::Dims& dims,
+                                         const nvinfer1::ITensor** tensor,
+                                         const char* dtype_attr_name = "T") {
+  TFAttrs attrs(params->node_def);
+  DataType dtype;
+  if (attrs.count(dtype_attr_name)) {
+    dtype = attrs.get<DataType>(dtype_attr_name);
+  } else {
+    dtype = DT_FLOAT;  // Default to FP32.
+  }
+
+  // In order to be broadcastable, the number of dims has to match.
+  nvinfer1::Dims broadcastable_dims(dims);
+  for (int i = 0; i < broadcastable_dims.nbDims; i++) {
+    broadcastable_dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights =
+      params->weight_store->GetTempWeights(dtype, broadcastable_dims);
+  void* raw_ptr = const_cast<void*>(weights.GetValues());
+  switch (dtype) {
+    case DataType::DT_FLOAT:
+      static_cast<float*>(raw_ptr)[0] = value;
+      break;
+    case DataType::DT_HALF:
+      static_cast<Eigen::half*>(raw_ptr)[0] = Eigen::half(value);
+      break;
+    default:
+      return errors::InvalidArgument("Unsupported data type ",
+                                     DataTypeString(dtype));
+  }
+  *tensor = params->converter->CreateConstantLayer(weights, broadcastable_dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name());
+  params->converter->ProvideQuantizationRange(
+      const_cast<nvinfer1::ITensor*>(*tensor), value, value);
+  return Status::OK();
+}
+
+// Convert an axis from TF format to TRT format while validating. TF format
+// includes the batch dimension, while TRT does not. TF can also use negative
+// indices.
+// TODO(tmorris): Use this method in more ops.
+Status ConvertAxis(int tf_axis, int trt_nb_dims, absl::string_view node_name,
+                   int* trt_axis) {
+  const int tf_nb_dims = trt_nb_dims + 1;
+  // Check bounds.
+  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) {
+    return errors::InvalidArgument(
+        "Axis value of ", tf_axis, " is out of bounds, must be in range [",
+        -tf_nb_dims, ", ", tf_nb_dims, "), at ", node_name);
+  }
+  // Make negative axis positive.
+  if (tf_axis < 0) tf_axis += tf_nb_dims;
+  // Don't allow axis to be the batch dimension.
+  if (tf_axis == 0) {
+    return errors::Unimplemented(
+        "TensorRT does not allow manipulation of the batch dimension, at ",
+        node_name);
+  }
+  // Remove batch dimension.
+  *trt_axis = tf_axis - 1;
+  return Status::OK();
+}
+
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                       const nvinfer1::Dims& dim_r) {
   if (dim_l.nbDims != dim_r.nbDims) {
@@ -362,7 +536,16 @@ inline bool DimsEqual(const nvinfer1::Dims& dim_l,
   return true;
 }
 
-inline nvinfer1::Dims GetTrtDimsForTensor(const tensorflow::Tensor& tensor) {
+bool AllLengthsEqual(const std::vector<std::vector<int>>& inputs) {
+  if (inputs.size() == 0) return true;
+  int length = inputs.at(0).size();
+  for (int i = 1; i < inputs.size(); i++) {
+    if (inputs.at(i).size() != length) return false;
+  }
+  return true;
+}
+
+inline nvinfer1::Dims GetTrtDimsForTensor(const Tensor& tensor) {
   nvinfer1::Dims dims;
   dims.nbDims = tensor.dims();
   for (int i = 0; i < dims.nbDims; i++) {
@@ -445,7 +628,7 @@ nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const {
 }
 
 size_t TRT_ShapedWeights::size_bytes() const {
-  return this->count() * tensorflow::DataTypeSize(this->type_);
+  return this->count() * DataTypeSize(this->type_);
 }
 
 string TRT_ShapedWeights::DebugString() const {
@@ -493,12 +676,22 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
 
   void setLocation(nvinfer1::TensorLocation location) override {}
 
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   bool setDynamicRange(float min, float max) override { return true; }
 
   float getDynamicRange() const override { return 0; }
 #endif
 
+#if IS_TRT_VERSION_GE(5, 1, 0)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   nvinfer1::DataType trt_dtype_;
   nvinfer1::Dims trt_dims_;
@@ -569,89 +762,6 @@ string TRT_TensorOrWeights::DebugString() const {
   return output;
 }
 
-class TFAttrs {
- public:
-  explicit TFAttrs(const tensorflow::NodeDef& tf_node) {
-    for (const auto& attr : tf_node.attr()) {
-      attrs_.insert({attr.first, &attr.second});
-    }
-  }
-
-  bool count(const string& key) const { return attrs_.count(key); }
-
-  tensorflow::AttrValue const* at(const string& key) const {
-    if (!attrs_.count(key)) {
-      LOG(FATAL) << "Attribute not found: " << key;
-    }
-    return attrs_.at(key);
-  }
-
-  template <typename T>
-  T get(const string& key) const;
-
-  template <typename T>
-  T get(const string& key, const T& default_value) const {
-    return attrs_.count(key) ? this->get<T>(key) : default_value;
-  }
-
-  std::vector<string> GetAllAttrKeys() const {
-    std::vector<string> attr_list;
-    for (const auto& attr_item : attrs_) {
-      attr_list.emplace_back(attr_item.first);
-    }
-    return attr_list;
-  }
-
- private:
-  typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
-  AttrMap attrs_;
-};
-
-template <>
-string TFAttrs::get<string>(const string& key) const {
-  return this->at(key)->s();
-}
-
-template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(const string& key) const {
-  auto attr = this->at(key)->list().i();
-  return std::vector<int>(attr.begin(), attr.end());
-}
-
-template <>
-std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
-  auto attr = this->at(key)->list().f();
-  return std::vector<float>(attr.begin(), attr.end());
-}
-
-template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
-  nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
-  TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
-  return trt_dtype;
-}
-
-template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(
-    const string& key) const {
-  return this->at(key)->type();
-}
-
-template <>
-float TFAttrs::get<float>(const string& key) const {
-  return this->at(key)->f();
-}
-
-template <>
-bool TFAttrs::get<bool>(const string& key) const {
-  return this->at(key)->b();
-}
-
-template <>
-int TFAttrs::get<int>(const string& key) const {
-  return this->at(key)->i();
-}
-
 // TODO(jie): reorder4 & reorder2 should be merged?
 // TODO(aaroey): fix the order of parameters.
 template <typename T>
@@ -693,7 +803,7 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
   const nvinfer1::DimsHW istrides = {1, k};
   const nvinfer1::DimsHW ostrides = {c, 1};
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case DataType::DT_FLOAT: {
       Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
                istrides,
                // TODO(aaroey): get rid of all the const_cast like this.
@@ -701,7 +811,7 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
                ostrides);
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
+    case DataType::DT_HALF: {
       Reorder2(
           {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
           istrides,
@@ -737,14 +847,14 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   const nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
   const nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case DataType::DT_FLOAT: {
       Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
                istrides,
                static_cast<float*>(const_cast<void*>(oweights->GetValues())),
                ostrides);
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
+    case DataType::DT_HALF: {
       Reorder4(
           {k, c, r, s}, static_cast<Eigen::half const*>(iweights.GetValues()),
           istrides,
@@ -759,7 +869,7 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   }
 }
 
-TRT_ShapedWeights TrtWeightStore::GetTempWeights(tensorflow::DataType type,
+TRT_ShapedWeights TrtWeightStore::GetTempWeights(DataType type,
                                                  const nvinfer1::Dims& dims) {
   TensorShape shape;
   // TODO(laigd): make it return a status.
@@ -771,6 +881,13 @@ TRT_ShapedWeights TrtWeightStore::GetTempWeights(tensorflow::DataType type,
   return weights;
 }
 
+const std::set<string>* TrtNodeValidator::quantize_ops = new std::set<string>{
+    "QuantizeAndDequantizeV2",
+    "QuantizeAndDequantizeV3",
+    "FakeQuantWithMinMaxVars",
+    "FakeQuantWithMinMaxArgs",
+};
+
 TrtNodeValidator::TrtNodeValidator() { RegisterOpValidators(); }
 
 Status TrtNodeValidator::ConvertToTensorOrWeights(
@@ -816,9 +933,27 @@ Status TrtNodeValidator::ConvertToTensorOrWeights(
 }
 
 Status TrtNodeValidator::ValidateNode(
-    const tensorflow::NodeDef& node_def,
+    const NodeDef& node_def,
     const std::vector<std::pair<const NodeDef*, int>>& input_node_and_ports,
+    const TrtPrecisionMode precision_mode,
     const grappler::GraphProperties& graph_properties) {
+  const string& op = node_def.op();
+  // It doesn't support validation of plugins.
+  if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) return Status::OK();
+
+  // In INT8 mode, we will always apply the quantization ranges provided by
+  // these ops to the relevant tensors. This happens regardless of the value of
+  // use_calibration.
+  bool is_supported_op = false;
+  if (quantize_ops->count(op)) {
+    is_supported_op = (precision_mode == TrtPrecisionMode::INT8);
+  } else {
+    is_supported_op = op_validators_.count(node_def.op());
+  }
+  if (!is_supported_op) {
+    return errors::Unimplemented("Op type ", op, " is not supported.");
+  }
+
   // Convert input NodeDef and corresponding output ports to
   // TRT_TensorOrWeights.
   std::vector<TRT_TensorOrWeights> inputs;
@@ -835,14 +970,7 @@ Status TrtNodeValidator::ValidateNode(
     inputs.push_back(tensor_or_weights);
   }
 
-  // Validate the node.
-  const auto iter = op_validators_.find(node_def.op());
-  if (iter == op_validators_.end()) {
-    // If validator is not registered, it means no validation is needed.
-    return Status::OK();
-  }
-
-  OpConverter validator = iter->second;
+  OpConverter validator = op_validators_[node_def.op()];
   OpConverterParams params(
       /*arg_converter=*/nullptr, node_def, inputs, /*arg_outputs=*/nullptr,
       /*arg_validation_only=*/true, &weight_store_);
@@ -863,7 +991,7 @@ Status TrtNodeValidator::ConvertConstToWeights(
 }
 
 Converter::Converter(nvinfer1::INetworkDefinition* trt_network,
-                     int precision_mode, bool use_calibration)
+                     TrtPrecisionMode precision_mode, bool use_calibration)
     : trt_network_(trt_network),
       precision_mode_(precision_mode),
       use_calibration_(use_calibration) {
@@ -881,7 +1009,7 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
     TF_RETURN_IF_ERROR(plugin_converter_(&params));
   } else {
     if (!op_registry_.count(op)) {
-      return errors::Unimplemented("No converter registered for op: " + op);
+      return errors::Unimplemented("No converter registered for op: ", op);
     }
     OpConverter op_converter = op_registry_.at(op);
     TF_RETURN_IF_ERROR(op_converter(&params));
@@ -890,7 +1018,7 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
   for (size_t i = 0; i < outputs.size(); ++i) {
     TRT_TensorOrWeights& output = outputs[i];
     string output_name = node_def.name();
-    if (i != 0) output_name = StrCat(output_name, ":", i);
+    if (i != 0) absl::StrAppend(&output_name, ":", i);
     // We need to check the name before setting it. If the input is one of the
     // engine input, setting the name here will overwrite engine input
     // bindings which will cause runtime error.
@@ -898,7 +1026,7 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
     // in ConvertIdentity.
     if (output.is_tensor()) {
       const char* tensor_name = output.tensor()->getName();
-      if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
+      if (!IsEngineInput(tensor_name)) {
         // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
         // them to match their corresponding TensorFlow name.
         // Note: ITensors that we create internally within TF-TRT which are
@@ -944,27 +1072,31 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype,
 }
 
 Status Converter::RenameAndMarkOutputTensors(
-    const std::vector<std::pair<string, string>>& output_tensors) {
+    const std::vector<Converter::EngineOutputInfo>& output_tensors) {
   for (const auto& output : output_tensors) {
     TRT_TensorOrWeights tensor_or_weights;
-    TF_RETURN_IF_ERROR(GetTensorOrWeights(output.first, &tensor_or_weights));
+    TF_RETURN_IF_ERROR(
+        GetTensorOrWeights(output.source_tensor_name, &tensor_or_weights));
     if (!tensor_or_weights.is_tensor()) {
-      return errors::InvalidArgument("Output ", output.first,
+      return errors::InvalidArgument("Output ", output.source_tensor_name,
                                      " is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (tensor == nullptr) {
-      return errors::NotFound("Output tensor not found: ", output.first);
+      return errors::NotFound("Output tensor not found: ",
+                              output.source_tensor_name);
     }
-    // Check if this tensor has already been marked as an output.
+    // Check if this tensor has already been marked as an input or output.
+    //
     // ConvertIdentity can cause the same tensor to be repeated in
     // output_tensors, which can cause us to overwrite the name of the output
     // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
     // we won't be able to locate OutputPH_0 during runtime. To fix this,
     // duplicate the tensor using no-op shuffle.
+    //
     // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
     // in ConvertIdentity.
-    if (tensorflow::str_util::StartsWith(tensor->getName(), kOutputPHName)) {
+    if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) {
       // Using shuffle layer for identity by not setting reshape or transpose.
       nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor);
       TFTRT_RETURN_ERROR_IF_NULLPTR(
@@ -972,10 +1104,13 @@ Status Converter::RenameAndMarkOutputTensors(
       MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0));
       tensor = layer->getOutput(0);
     }
-    tensor->setName(output.second.c_str());
-    VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
-            << output.second;
+    tensor->setName(output.dest_node_name.c_str());
     network()->markOutput(*tensor);
+    // Set type after marking as output. TRT only supports setType for engine
+    // outputs and inputs (type is inferred otherwise).
+    tensor->setType(output.trt_dtype);
+    VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name
+            << ", which feeds TF node " << output.dest_node_name;
   }
   return Status::OK();
 }
@@ -1022,11 +1157,11 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
   const auto dims = input_tensor->getDimensions();
 
   if (order_with_batch_dim.size() - 1 != size_t(dims.nbDims)) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Rank of perm for transpose does not match with that of the input.");
   }
   if (order_with_batch_dim[0] != 0) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Transpose at batch dimension is not supported.");
   }
 
@@ -1052,7 +1187,7 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
   layer->setReshapeDimensions(reshape_dims);
 
   *output_tensor = layer->getOutput(0);
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
@@ -1089,6 +1224,7 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
 
 Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
                                         const nvinfer1::Dims& dims,
+                                        const bool validation_only,
                                         const nvinfer1::ITensor** tensor) {
   // If -1 is not used for one of the dims, we can check if the shapes are
   // compatible.
@@ -1105,6 +1241,10 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
                                    DebugString(input.GetTrtDims()), " vs ",
                                    DebugString(dims), ")");
   }
+  if (validation_only) {
+    *tensor = nullptr;
+    return Status::OK();
+  }
 
   if (input.is_tensor()) {
     if (DimsEqual(input.GetTrtDims(), dims)) {
@@ -1121,7 +1261,7 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
   } else {
     *tensor = CreateConstantLayer(input.weights(), dims);
     TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape");
-    if (precision_mode() == INT8MODE && !use_calibration()) {
+    if (precision_mode() == TrtPrecisionMode::INT8 && !use_calibration()) {
       // If we are in int8 mode and not calibrating, we need to explicitly set a
       // quantization range for the output tensor of the IConstantLayer. Here we
       // set the range to [min(weights), max(weights)].
@@ -1140,7 +1280,7 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
                                min_range, max_range);
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 void Converter::MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
@@ -1156,12 +1296,12 @@ void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor,
 }
 
 void Converter::MaybeApplyQuantizationRanges() {
-  if (precision_mode() != INT8MODE) return;
+  if (precision_mode() != TrtPrecisionMode::INT8) return;
 
   // Infer ranges across marked ops.
   PropagateQuantizationRanges();
   // Apply ranges.
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   for (auto pair : quantization_ranges_) {
     nvinfer1::ITensor* tensor = pair.first;
     const float range = pair.second;
@@ -1238,7 +1378,7 @@ void Converter::PropagateQuantizationRanges() {
   }
 }
 
-Status Converter::GetInputs(const tensorflow::NodeDef& node_def,
+Status Converter::GetInputs(const NodeDef& node_def,
                             std::vector<TRT_TensorOrWeights>* inputs) const {
   for (auto const& input_name : node_def.input()) {
     /*************************************************************************
@@ -1273,15 +1413,73 @@ Status Converter::GetInputs(const tensorflow::NodeDef& node_def,
       StrAppend(&msg, node_def.name(), " should have an input named '", name,
                 "' but it is not available");
       LOG(ERROR) << msg;
-      return tensorflow::errors::InvalidArgument(msg);
+      return errors::InvalidArgument(msg);
+    }
+  }
+  return Status::OK();
+}
+
+// Checks that the number of inputs match, and enforces that the inputs marked
+// as true are constant weights. true means that the input must be a weight,
+// while false means the input must be a tensor. In the future, false will mean
+// the input can be a tensor or weight.
+Status CheckInputsWeights(
+    const OpConverterParams& params,
+    const std::vector<std::pair<string, bool>>& inputs_is_weight) {
+  const auto& inputs = params.inputs;
+  const auto& node_def = params.node_def;
+  if (inputs.size() != inputs_is_weight.size()) {
+    return errors::InvalidArgument(
+        node_def.op(), " got ", inputs.size(), " inputs but expected ",
+        inputs_is_weight.size(), ", at ", node_def.name());
+  }
+  for (int i = 0; i < inputs.size(); i++) {
+    if (inputs_is_weight[i].second && inputs.at(i).is_tensor()) {
+      return errors::Unimplemented("The input \"", inputs_is_weight[i].first,
+                                   "\" for ", node_def.op(),
+                                   " must be a constant, at ", node_def.name());
+    }
+    // TODO(tmorris): Remove this check and provide a method to automatically
+    // retrive an input as a tensor, converting via CreateConstantLayer if it
+    // was originally a weight. We will want a caching mechanism to prevent many
+    // duplicate constants from being created.
+    if (!inputs_is_weight[i].second && inputs.at(i).is_weights()) {
+      return errors::Unimplemented("The input \"", inputs_is_weight[i].first,
+                                   "\" for ", node_def.op(),
+                                   " must be a tensor, at ", node_def.name());
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
+}
+
+Status AllowDataTypes(const OpConverterParams& params,
+                      const std::set<DataType>& allowed_dtypes,
+                      const char* dtype_attr_name = "T") {
+  const auto& node_def = params.node_def;
+  TFAttrs attrs(node_def);
+  if (!attrs.count(dtype_attr_name)) {
+    return errors::InvalidArgument("Attribute with name ", dtype_attr_name,
+                                   " not found.");
+  }
+  const auto op_dtype = attrs.get<DataType>(dtype_attr_name);
+  if (!allowed_dtypes.count(op_dtype)) {
+    // Build string list of allowed types.
+    std::ostringstream ss;
+    for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) {
+      if (it != allowed_dtypes.begin()) ss << ", ";
+      ss << DataTypeString(*it);
+    }
+    return errors::Unimplemented("Data type ", DataTypeString(op_dtype),
+                                 " is not supported for ", node_def.op(),
+                                 ", must be one of [", ss.str(), "], at ",
+                                 node_def.name());
+  }
+  return Status::OK();
 }
 
 TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store,
                                     const TRT_ShapedWeights& weights_src) {
-  auto dtype_new = tensorflow::DataType::DT_HALF;
+  auto dtype_new = DataType::DT_HALF;
   TRT_ShapedWeights weights =
       store->GetTempWeights(dtype_new, weights_src.shape_);
   const float* src = static_cast<const float*>(weights_src.GetValues());
@@ -1340,18 +1538,17 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
   }
 }
 
-tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
-                                TRT_ShapedWeights* oweights,
-                                LambdaFactory unary_op) {
+Status UnaryCompute(const TRT_ShapedWeights& iweights,
+                    TRT_ShapedWeights* oweights, LambdaFactory unary_op) {
   CHECK_EQ(iweights.type_, oweights->type_);
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case DataType::DT_FLOAT: {
       auto inp = static_cast<float const*>(iweights.GetValues());
       auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
+    case DataType::DT_HALF: {
       auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
       auto oup =
           static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
@@ -1360,11 +1557,10 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
       break;
     }
     default:
-      return tensorflow::errors::Unimplemented(
-          "Data type not supported: " +
-          tensorflow::DataTypeString(iweights.type_));
+      return errors::Unimplemented("Data type not supported: " +
+                                   DataTypeString(iweights.type_));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 // If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the
@@ -1471,7 +1667,7 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
         const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
   }
 
-  if (params->converter->precision_mode() == FP16MODE) {
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
 
@@ -1514,7 +1710,7 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
       // Because of this issue, fall back to BinaryTensorOpTensor if we are
       // doing INT8 with no calibration. There is most likely no performance
       // penalty by falling back here.
-      if (params->converter->precision_mode() == INT8MODE &&
+      if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
           !params->converter->use_calibration()) {
         return errors::Unimplemented(
             "Intermediate quantization range cannot be determined without"
@@ -1561,67 +1757,71 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
   // Pass the output
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
-
-tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
+Status ConvertConv2DHelper(OpConverterParams* params, int group,
+                           bool is_conv2d_backprop_input) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2) {
-    return tensorflow::errors::InvalidArgument("Two inputs are expected for ",
-                                               node_def.op(), ", at ",
-                                               node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        node_def.op(), " is only implemented for tensors, not weights, at ",
-        node_def.name());
-  }
-  if (inputs.at(1).is_tensor()) {
-    return tensorflow::errors::Unimplemented("Kernel for ", node_def.op(),
-                                             " must be constant weights, at ",
-                                             node_def.name());
+  TRT_TensorOrWeights backprop_output_size;
+  const nvinfer1::ITensor* tensor = nullptr;
+  if (is_conv2d_backprop_input) {
+    // In the case when Conv2dBackpropInput is used for conv2d_transpose, these
+    // inputs correspond to: output size, filter, and input.
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params,
+        {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
+    backprop_output_size = inputs.at(0);
+    tensor = inputs.at(2).tensor();
+  } else {
+    TF_RETURN_IF_ERROR(
+        CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
+    tensor = inputs.at(0).tensor();
   }
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
   if (weights_rsck.shape_.nbDims != 4) {
-    return tensorflow::errors::InvalidArgument(
-        "Conv2D expects kernel of dimension 4, at " + node_def.name());
+    return errors::InvalidArgument("Conv2D expects kernel of dimension 4, at " +
+                                   node_def.name());
   }
   TFAttrs attrs(node_def);
   auto data_format = attrs.get<string>("data_format");
   int c_index = (data_format == "NHWC") ? 3 : 1;
   int h_index = (data_format == "NHWC") ? 1 : 2;
   int w_index = (data_format == "NHWC") ? 2 : 3;
-  auto tf_dilations = attrs.get<std::vector<int>>("dilations");
+  auto tf_dilations = attrs.get<std::vector<int64>>("dilations");
   if (tf_dilations.size() != 4) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Convolution dilations field must specify 4 dimensions, at ",
         node_def.name());
   }
   if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Dilation rate must be 1 for batch and channel dimensions, at ",
         node_def.name());
   }
   const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
-
-  const auto tf_stride = attrs.get<std::vector<int>>("strides");
+  if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) {
+    return errors::Unimplemented(
+        "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported",
+        ", at ", node_def.name());
+  }
+
+  const auto tf_stride = attrs.get<std::vector<int64>>("strides");
   if (tf_stride.size() != 4) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Convolution strides field must specify 4 dimensions, at ",
         node_def.name());
   }
   if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Stride must be 1 for batch and channel dimensions, at ",
         node_def.name());
   }
   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
-  if (params->validation_only) return tensorflow::Status::OK();
-
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  if (params->validation_only) return Status::OK();
 
   // Transpose to NCHW (NCHW is required for IConvLayer).
   const bool need_transpose = (data_format == "NHWC");
@@ -1632,19 +1832,23 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   // Dimensions of transposed tensor.
   const auto tensor_dim = tensor->getDimensions();
 
-  // For depthwise convolution, group will be 0 so set num_groups to size of
-  // input's channel dim. For a non-depthwise conv, num_groups will be 1.
+  // group == 0 signifies that this is a depthwise convolution, so set
+  // num_groups to size of input's channel dim. For a non-depthwise conv,
+  // num_groups will be 1.
   const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
 
-  if (params->converter->precision_mode() == FP16MODE) {
-    weights_rsck =
-        ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
+    weights_rsck = ConvertFP32ToFP16(params->weight_store, weights_rsck);
   }
+  // For conv, TF weights are RSCK, and TRT expects KCRS.
+  // For backprop, TF weights are RSKC, and TRT expects CKRS.
+  // Therefore, this reorder will work for both cases.
   TRT_ShapedWeights weights =
       params->weight_store->GetTempWeights(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
   TRT_ShapedWeights biases(weights.type_);
-  const int noutput = weights.shape_.d[0] * num_groups;
+  const int output_axis = is_conv2d_backprop_input ? 1 : 0;
+  const int noutput = weights.shape_.d[output_axis] * num_groups;
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
@@ -1655,9 +1859,23 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
     nvinfer1::DimsHW effective_kernel_size = kernel_size;
     effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
     effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
-    padding = CreateSamePadding(
-        stride, effective_kernel_size,
-        {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
+    std::vector<int64_t> input_dims;
+    if (is_conv2d_backprop_input) {
+      // For backprop, calculate padding based on "input_sizes" input, which
+      // actually corresponds to output size. ("input_sizes" makes sense in the
+      // context of Conv2DBackpropInput).
+      // We use h_index and w_index instead of 1 and 2 because we havent
+      // transposed backprop_output_size along with the input.
+      auto output_size_weights = static_cast<int*>(
+          const_cast<void*>(backprop_output_size.weights().GetValues()));
+      input_dims = {output_size_weights[h_index], output_size_weights[w_index]};
+    } else {
+      // Use 1 and 2 because tensor_dim has the dimensions of the transposed
+      // input.
+      input_dims = {static_cast<int>(tensor_dim.d[1]),
+                    static_cast<int>(tensor_dim.d[2])};
+    }
+    padding = CreateSamePadding(stride, effective_kernel_size, input_dims);
   } else {
     padding = {{0, 0}, {0, 0}};
   }
@@ -1676,17 +1894,32 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   }
 
   // Add convolution.
-  nvinfer1::IConvolutionLayer* layer =
-      params->converter->network()->addConvolution(
-          *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
-          weights.GetTrtWeights(), biases.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-  layer->setStride(stride);
-  layer->setPadding({padding[0].first, padding[1].first});
-  layer->setName(node_def.name().c_str());
-  layer->setNbGroups(num_groups);
-  layer->setDilation(dilation);
-  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  nvinfer1::ILayer* conv_layer = nullptr;
+  if (is_conv2d_backprop_input) {
+    nvinfer1::IDeconvolutionLayer* layer =
+        params->converter->network()->addDeconvolution(
+            *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
+            weights.GetTrtWeights(), biases.GetTrtWeights());
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+    layer->setStride(stride);
+    layer->setPadding({padding[0].first, padding[1].first});
+    layer->setName(node_def.name().c_str());
+    layer->setNbGroups(num_groups);
+    conv_layer = layer;
+  } else {
+    nvinfer1::IConvolutionLayer* layer =
+        params->converter->network()->addConvolution(
+            *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
+            weights.GetTrtWeights(), biases.GetTrtWeights());
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+    layer->setStride(stride);
+    layer->setPadding({padding[0].first, padding[1].first});
+    layer->setName(node_def.name().c_str());
+    layer->setNbGroups(num_groups);
+    layer->setDilation(dilation);
+    conv_layer = layer;
+  }
+  const nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0);
 
   // Restore transpose.
   if (need_transpose) {
@@ -1696,19 +1929,7 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   }
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
-                                       ConvolutionType type) {
-  switch (type) {
-    case ConvolutionType::DEFAULT:
-      return ConvertConv2DHelper(params, 1);
-    case ConvolutionType::DEPTHWISE_CONV:
-      return ConvertConv2DHelper(params, 0);
-  }
-  return tensorflow::errors::Unimplemented("Unsupported convolution type, at ",
-                                           params->node_def.name());
+  return Status::OK();
 }
 
 Status BinaryTensorOpTensor(OpConverterParams* params,
@@ -1723,6 +1944,7 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
       {"RealDiv", nvinfer1::ElementWiseOperation::kDIV},
       {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
       {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
+      {"Pow", nvinfer1::ElementWiseOperation::kPOW},
   };
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end()) {
@@ -1750,10 +1972,10 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
   const nvinfer1::ITensor* tensor_l = nullptr;
   const nvinfer1::ITensor* tensor_r = nullptr;
   status = params->converter->PrepareTensorForShape(
-      operand_l, broadcasted_dims_l, &tensor_l);
+      operand_l, broadcasted_dims_l, /*validation_only=*/false, &tensor_l);
   if (status.ok()) {
     status = params->converter->PrepareTensorForShape(
-        operand_r, broadcasted_dims_r, &tensor_r);
+        operand_r, broadcasted_dims_r, /*validation_only=*/false, &tensor_r);
   }
   if (!status.ok()) {
     return errors::Internal("Failed to convert binary op ", node_def.name(),
@@ -1776,10 +1998,10 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
 
   // Pass the output
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertPlugin(OpConverterParams* params) {
+Status ConvertPlugin(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   // prepare input
@@ -1804,7 +2026,7 @@ tensorflow::Status ConvertPlugin(OpConverterParams* params) {
     size_t size_data = data.size() * sizeof(float);
     if (!plugin->SetAttribute(attr_key, static_cast<void*>(data.data()),
                               size_data)) {
-      return tensorflow::errors::InvalidArgument("plugin SetAttribute failed");
+      return errors::InvalidArgument("plugin SetAttribute failed");
     }
   }
 
@@ -1815,17 +2037,15 @@ tensorflow::Status ConvertPlugin(OpConverterParams* params) {
     nvinfer1::ITensor* output_tensor = layer->getOutput(i);
     params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertTranspose(OpConverterParams* params) {
+Status ConvertTranspose(OpConverterParams* params) {
   const auto& inputs = params->inputs;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at ", params->node_def.name());
-  }
-
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"x", false}, {"perm", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
   // Get the permutation from weights.
   TRT_ShapedWeights weights = inputs.at(1).weights();
   const int* weights_ptr =
@@ -1852,22 +2072,21 @@ tensorflow::Status ConvertTranspose(OpConverterParams* params) {
       params->converter->TransposeTensor(input_tensor, perm, &output_tensor));
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertReshape(OpConverterParams* params) {
+Status ConvertReshape(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects weights for shape, at ", node_def.name());
-  }
-
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"tensor", false}, {"shape", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   TRT_ShapedWeights weights = inputs.at(1).weights();
   if (weights.count() == 0) {
-    return tensorflow::errors::Unimplemented(
-        "Reshape to shape=[] is not supported, at ", node_def.name());
+    return errors::Unimplemented("Reshape to shape=[] is not supported, at ",
+                                 node_def.name());
   }
 
   const int* weights_ptr =
@@ -1949,27 +2168,19 @@ tensorflow::Status ConvertReshape(OpConverterParams* params) {
   // Start conversion.
   const nvinfer1::ITensor* output_tensor = nullptr;
   TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      input_tensor, reshape_dims, &output_tensor));
+      input_tensor, reshape_dims, /*validation_only=*/false, &output_tensor));
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
+Status ConvertExpandDims(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2) {
-    return tensorflow::errors::InvalidArgument(
-        "Two inputs expected for ExpandDims, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "ExpandDims expects tensor for input, at ", node_def.name());
-  }
-  if (!inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "ExpandDims expects weights for axis, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
   // Get input shape as vector.
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
@@ -1980,15 +2191,15 @@ tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
   // Get axis to expand on.
   TRT_ShapedWeights weights = inputs.at(1).weights();
   if (weights.count() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        "ExpandDims axis must be a scalar, at ", node_def.name());
+    return errors::InvalidArgument("ExpandDims axis must be a scalar, at ",
+                                   node_def.name());
   }
   const int* weights_ptr =
       static_cast<int*>(const_cast<void*>(weights.GetValues()));
   int axis = weights_ptr[0];
   // Make sure axis is valid.
   if ((axis < (-input_rank - 1)) || (axis > input_rank)) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Axis for ExpandDims is invalid, must be in the range "
         "[-rank(input) - 1, rank(input)], at ",
         node_def.name());
@@ -1996,7 +2207,7 @@ tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
   // Convert negative axis to corresponding positive axis.
   if (axis < 0) axis += input_rank + 1;
   if (axis == 0) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Modifying batch dimension is not supported for ExpandDims, at ",
         node_def.name());
   }
@@ -2010,23 +2221,18 @@ tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
                                                /*ignore_first_dim=*/true));
   const nvinfer1::ITensor* output_tensor = nullptr;
   TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      input_tensor, new_dims, &output_tensor));
+      input_tensor, new_dims, /*validation_only=*/false, &output_tensor));
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
+Status ConvertSqueeze(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        "One input expected for Squeeze, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Squeeze expects tensor for input, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
   // Get input shape.
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
@@ -2036,15 +2242,15 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   const int input_rank = input_dims.size();
   // Mark axes to remove by setting them to 0.
   TFAttrs attrs(node_def);
-  auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
-  if (squeeze_dims.size() == 0) {
-    return tensorflow::errors::Unimplemented(
+  auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");
+  if (squeeze_dims.empty()) {
+    return errors::Unimplemented(
         "Squeeze is only implemented for explicit dims, at ", node_def.name());
   }
   for (int axis : squeeze_dims) {
     // Make sure axis is valid.
     if ((axis < -input_rank) || (axis >= input_rank)) {
-      return tensorflow::errors::InvalidArgument(
+      return errors::InvalidArgument(
           "Axis for Squeeze is invalid, must be in the range "
           "[-rank(input), rank(input)), at ",
           node_def.name());
@@ -2053,14 +2259,14 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
     if (axis < 0) axis += input_rank;
     // Don't squeeze batch dim.
     if (axis == 0) {
-      return tensorflow::errors::Unimplemented(
-          "Cannot squeeze batch dimension, at ", node_def.name());
+      return errors::Unimplemented("Cannot squeeze batch dimension, at ",
+                                   node_def.name());
     }
     // Make sure target dimension is size 1.
     if (input_dims[axis] != 1) {
-      return tensorflow::errors::InvalidArgument(
-          "Cannot squeeze a dimension which isn't size 1, at ",
-          node_def.name());
+      return errors::InvalidArgument(
+          "Cannot squeeze ", axis, "th dimension ", input_dims[axis],
+          " which isn't size 1, at ", node_def.name());
     }
     // Mark dim for removal by setting to 0.
     input_dims[axis] = 0;
@@ -2076,117 +2282,79 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
                                                /*ignore_first_dim=*/true));
   const nvinfer1::ITensor* output_tensor = nullptr;
   TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      input_tensor, new_dims, &output_tensor));
+      input_tensor, new_dims, /*validation_only=*/false, &output_tensor));
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
-}
-
-// Gets the bounds (start or end) from the weights of a StridedSlice op.
-tensorflow::Status GetStridedSliceBound(const std::vector<int>& input_dims,
-                                        const TRT_ShapedWeights& bound_weights,
-                                        int mask, bool begin, string node_name,
-                                        std::vector<int>* output_bound) {
-  const string bound_name = (begin) ? "begin" : "end";
-  const int* weights_ptr = static_cast<int*>(bound_weights.GetValues());
-  *output_bound =
-      std::vector<int>(weights_ptr, weights_ptr + bound_weights.count());
-  if (output_bound->size() != input_dims.size()) {
-    return tensorflow::errors::InvalidArgument(
-        "StridedSlice \"", bound_name, "\" specified ",
-        std::to_string(output_bound->size()), " dimensions, but input rank is ",
-        std::to_string(input_dims.size()), ", at ", node_name);
-  }
-  for (int i = 0; i < output_bound->size(); i++) {
-    if ((1 << i) & mask) {
-      // Apply mask.
-      (*output_bound)[i] = (begin) ? 0 : input_dims[i];
-      // Masked bound will always result in a valid, non-negative bound, so we
-      // don't need the following checks. For the common case of using masks on
-      // a undefined batch dim (-1), we specifically don't want to do the
-      // following checks because they will erroneously detect an out of range
-      // bound or try to correct the negative value.
-      continue;
-    }
-    // Make sure bound is valid.
-    if (((*output_bound)[i] < -input_dims[i]) ||
-        ((*output_bound)[i] > input_dims[i])) {
-      return tensorflow::errors::InvalidArgument(
-          bound_name, " value of ", std::to_string((*output_bound)[i]),
-          " for StridedSlice is invalid, must be in the range "
-          "[-dim_size(i), dim_size(i)], at ",
-          node_name);
-    }
-    // Convert negative values to their positive equivalent.
-    if ((*output_bound)[i] < 0) {
-      (*output_bound)[i] += input_dims[i];
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
-  const auto& inputs = params->inputs;
+  return Status::OK();
+}
+
+Status ConvertStridedSliceHelper(OpConverterParams* params,
+                                 const TRT_TensorOrWeights& input,
+                                 std::vector<int> begin, std::vector<int> size,
+                                 const std::vector<int>& stride) {
   const auto& node_def = params->node_def;
-  if (inputs.size() != 4) {
-    return tensorflow::errors::InvalidArgument(
-        "StridedSlice expects 4 inputs, at ", node_def.name());
-  }
-  if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights() ||
-      !inputs.at(3).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "StridedSlice expects weights for begin, end, and strides, at ",
-        node_def.name());
-  }
-  if (!inputs.at(0).is_tensor()) {
-    return tensorflow::errors::Unimplemented(
-        "StridedSlice is only implemented for tensors, at ", node_def.name());
-  }
   // Get input dims.
-  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  nvinfer1::Dims dims = input.GetTrtDims();
   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
-  if (inputs.at(0).is_tensor()) {
-    // Temporarily add batch dimension so that indexes line up properly.
-    input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
-  }
-  if (input_dims.size() > 4) {
-    return tensorflow::errors::Unimplemented(
-        "StridedSlice is not implemented for tensors with rank > 4, at ",
-        node_def.name());
+  // Temporarily add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), -1);
+  // Check bounds.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (begin[i] < 0 || begin[i] > input_dims[i]) {
+      return errors::InvalidArgument("\"begin\" for dimension ",
+                                     std::to_string(i), " in ", node_def.op(),
+                                     " is out of range, at ", node_def.name());
+    }
+    const int end = begin[i] + size[i];
+    if (end < 0 || end > input_dims[i]) {
+      return errors::InvalidArgument("\"begin\" + \"size\" for dimension ",
+                                     std::to_string(i), " in ", node_def.op(),
+                                     " is out of range, at ", node_def.name());
+    }
+    if (size[i] <= 0) {
+      return errors::InvalidArgument("\"size\" cannot be negative or zero for ",
+                                     node_def.op(), ", at ", node_def.name());
+    }
   }
-  TFAttrs attrs(node_def);
-  // Get begin and end bounds per axis.
-  std::vector<int> begin, end;
-  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(1).weights(),
-                                          attrs.get<int>("begin_mask"), true,
-                                          node_def.name(), &begin));
-  TF_RETURN_IF_ERROR(GetStridedSliceBound(input_dims, inputs.at(2).weights(),
-                                          attrs.get<int>("end_mask"), false,
-                                          node_def.name(), &end));
-  // Get strides per axis (must all be 1).
-  TRT_ShapedWeights stride_weights = inputs.at(3).weights();
-  const int* stride_weights_ptr = static_cast<int*>(stride_weights.GetValues());
-  std::vector<int> strides(stride_weights_ptr,
-                           stride_weights_ptr + stride_weights.count());
-  for (int x : strides) {
+// TRT 5.1 adds a slice layer. For older versions, we attempt to use the
+// padding layer with negative padding.
+#if IS_TRT_VERSION_GE(5, 1, 0) && 0
+  // TODO(laigd): TRT 5.1 RC has a bug when ISliceLayer is used along with
+  // IConcatenationLayer, so disable ISliceLayer for now until it's fixed.
+  // Use ISliceLayer.
+  nvinfer1::Dims begin_dims, size_dims, stride_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(begin, &begin_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(size, &size_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(stride, &stride_dims,
+                                               /*ignore_first_dim=*/true));
+  if (params->validation_only) return Status::OK();
+
+  nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice(
+      *const_cast<nvinfer1::ITensor*>(input.tensor()), begin_dims, size_dims,
+      stride_dims);
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return Status::OK();
+#else
+  // Use IPaddingLayer.
+  // Strides must be 1 in this case.
+  for (int x : stride) {
     if (x != 1) {
-      return tensorflow::errors::Unimplemented(
-          "StridedSlice is only implemented for stride of 1, at ",
+      return errors::Unimplemented(
+          "Strides other than 1 are not supported with this version of TRT, "
+          "at ",
           node_def.name());
     }
   }
-  // Unsupported mask options.
-  for (const string& attr :
-       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
-    int attr_val = attrs.get<int>(attr);
-    if (attr_val != 0) {
-      return tensorflow::errors::Unimplemented(
-          attr, " is not supported for StridedSlice, at ", node_def.name());
-    }
+  // Rank must be 2, 3 or 4.
+  if (input_dims.size() > 4) {
+    return errors::Unimplemented(node_def.op(),
+                                 " for tensors with rank > 4 is "
+                                 "not supported in this version of "
+                                 "TRT, at ",
+                                 node_def.name());
   }
-
-  nvinfer1::ITensor* tensor =
-      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
   // Reshape if necessary to 4-D, since IPaddingLayer requires a 4-D input.
   const bool need_reshape = (input_dims.size() != 4);
   int reshape_dims_added = 0;
@@ -2196,7 +2364,7 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
     while (input_dims.size() < 4) {
       input_dims.insert(input_dims.begin() + 1, 1);
       begin.insert(begin.begin() + 1, 0);
-      end.insert(end.begin() + 1, 1);
+      size.insert(size.begin() + 1, 1);
       reshape_dims_added++;
     }
     TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &reshape_dims,
@@ -2204,24 +2372,23 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   }
   // Find dimensions which need to be sliced.
   std::vector<int> pad_dims;
-  for (int i = 0; i < input_dims.size(); i++) {
-    if ((begin[i] != 0) || (end[i] != input_dims[i])) {
-      if (i == 0) {
-        return tensorflow::errors::Unimplemented(
-            "StridedSlice can't modify batch dim, at ", node_def.name());
-      } else if ((end[i] - begin[i]) < 0) {
-        return tensorflow::errors::InvalidArgument(
-            "New size of sliced dimension is negative, at ", node_def.name());
-      }
+  for (int i = 1; i < input_dims.size(); i++) {
+    if ((begin[i] != 0) || (begin[i] + size[i] != input_dims[i])) {
       pad_dims.push_back(i);
     }
   }
-  if (pad_dims.size() == 0) {
-    // No dimensions are changed. We could create a padding layer anyway with
-    // values of 0.
+  if (pad_dims.empty()) {
+    // No dimensions are changed, so this is a no-op. We could just return the
+    // input without creating a new layer. TRT will crash if an empty engine
+    // with no layers is attempted to be created, so we add a no-op shuffle to
+    // prevent our unit tests from breaking.
+    // TODO(tmorris): Allow empty engines in the unit tests and return the input
+    // as output here.
     if (params->validation_only) return Status::OK();
-    params->outputs->push_back(inputs.at(0));
-    return tensorflow::Status::OK();
+    nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(
+        *const_cast<nvinfer1::ITensor*>(input.tensor()));
+    params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+    return Status::OK();
   } else if (pad_dims.size() == 1) {
     // Only one dim is modified but we have to have 2, mark a second dim which
     // will have padding of 0. The dim we add is chosen to avoid an unecessary
@@ -2232,17 +2399,20 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
       pad_dims.push_back(3);
     }
   } else if (pad_dims.size() > 2) {
-    return tensorflow::errors::Unimplemented(
-        "StridedSlice can only modify 2 dimensions, at ", node_def.name());
+    return errors::Unimplemented(
+        node_def.op(),
+        " can only modify up to 2 dimensions in this version of TRT, at ",
+        node_def.name());
   }
   std::sort(pad_dims.begin(), pad_dims.end());
   // Convert to pre/post padding values. Since TRT does not have a StridedSlice
-  // or Slice layer, we instead create an IPaddingLayer with negative padding.
+  // or Slice layer prior to 5.1, we instead create an IPaddingLayer with
+  // negative padding.
   nvinfer1::DimsHW pre_padding, post_padding;
   for (int i = 0; i < pad_dims.size(); i++) {
     const int axis = pad_dims[i];
     pre_padding.d[i] = -begin[axis];
-    post_padding.d[i] = end[axis] - input_dims[axis];
+    post_padding.d[i] = (begin[axis] + size[axis]) - input_dims[axis];
   }
 
   // IPaddingLayer will always apply the padding to dims 2,3 (input format is
@@ -2262,10 +2432,11 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   if (params->validation_only) return Status::OK();
 
   // Start conversion.
+  nvinfer1::ITensor* tensor = const_cast<nvinfer1::ITensor*>(input.tensor());
   if (need_reshape) {
     const nvinfer1::ITensor* output_tensor = nullptr;
     TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-        inputs.at(0), reshape_dims, &output_tensor));
+        input, reshape_dims, /*validation_only=*/false, &output_tensor));
     tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
   }
   if (need_transpose) {
@@ -2274,7 +2445,6 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
         tensor, transpose_order, &output_tensor));
     tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
   }
-
   // Add padding layer
   nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
       *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
@@ -2282,7 +2452,6 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
   params->converter->MarkQuantizationRangesAsInferrable(tensor,
                                                         layer->getOutput(0));
   tensor = layer->getOutput(0);
-
   // Restore transpose
   if (need_transpose) {
     const nvinfer1::ITensor* output_tensor = nullptr;
@@ -2295,14 +2464,14 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
     // Calculate output dimensions
     for (int i = 0; i < pad_dims.size(); i++) {
       const int axis = pad_dims[i];
-      input_dims[axis] = end[axis] - begin[axis];
+      input_dims[axis] = size[axis];
     }
     // Remove added 1 dimensions
     for (int i = 0; i < reshape_dims_added; i++) {
       int value = input_dims[1];
       if (value != 1) {
-        return tensorflow::errors::Internal(
-            "StridedSlice error when reshaping, at ", node_def.name());
+        return errors::Internal("StridedSlice error when reshaping, at ",
+                                node_def.name());
       }
       input_dims.erase(input_dims.begin() + 1);
     }
@@ -2312,45 +2481,181 @@ tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
                                                  /*ignore_first_dim=*/true));
     const nvinfer1::ITensor* output_tensor = nullptr;
     TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-        TRT_TensorOrWeights(tensor), new_dims, &output_tensor));
+        TRT_TensorOrWeights(tensor), new_dims, /*validation_only=*/false,
+        &output_tensor));
     tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
   }
 
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(tensor)));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertConv2D(OpConverterParams* params) {
-  return ConvertConv2DHelper(params, ConvolutionType::DEFAULT);
+  return Status::OK();
+#endif
 }
 
-tensorflow::Status ConvertConv2DDepthwise(OpConverterParams* params) {
-  return ConvertConv2DHelper(params, ConvolutionType::DEPTHWISE_CONV);
+Status ConvertSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"input", false}, {"begin", true}, {"size", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> size = inputs.at(2).weights().ToVector<int>();
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  if (!AllLengthsEqual({input_dims, begin, size})) {
+    return errors::InvalidArgument(
+        "Length of begin and size arguments must equal rank of input for "
+        "Slice, at ",
+        node_def.name());
+  }
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = begin[0] != 0;
+  // If size[0]s is not -1, we can only know if the batch dimension is
+  // unmodified when the batch size is defined. When the batch size is
+  // undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool size_is_modified =
+      size[0] != -1 && (!batch_size_is_defined ||
+                        (batch_size_is_defined && size[0] != input_dims[0]));
+  if (begin_is_modified || size_is_modified) {
+    return errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Size of -1 signifies to take all remaining elements.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (size[i] == -1) {
+      size[i] = input_dims[i] - begin[i];
+    }
+  }
+  // Stride is 1 for all dims.
+  std::vector<int> stride(begin.size(), 1);
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
 }
 
-tensorflow::Status ConvertPool(OpConverterParams* params) {
+Status ConvertStridedSlice(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        node_def.op(), " is only implemented for tensors, not weights, at ",
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params,
+      {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  // Get begin and end bounds per axis.
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> end = inputs.at(2).weights().ToVector<int>();
+  std::vector<int> stride = inputs.at(3).weights().ToVector<int>();
+  if (!AllLengthsEqual({input_dims, begin, end, stride})) {
+    return errors::InvalidArgument(
+        "Length of begin, end, and stride arguments must equal rank of input "
+        "for StridedSlice, at ",
         node_def.name());
   }
+  // Unsupported mask options.
+  TFAttrs attrs(node_def);
+  for (const string& attr :
+       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
+    int attr_val = attrs.get<int64>(attr);
+    if (attr_val != 0) {
+      return errors::Unimplemented(
+          attr, " is not supported for StridedSlice, at ", node_def.name());
+    }
+  }
+  const int begin_mask = attrs.get<int64>("begin_mask");
+  const int end_mask = attrs.get<int64>("end_mask");
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = !(begin_mask & 1) && begin[0] != 0;
+  const bool stride_is_modified = stride[0] != 1;
+  // If the batch size is -1 and the end mask is not set, we can only know if
+  // the batch dimension is unmodified when the batch size is defined. When the
+  // batch size is undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool end_is_modified =
+      !(end_mask & 1) && (!batch_size_is_defined ||
+                          (batch_size_is_defined && end[0] != input_dims[0]));
+  if (begin_is_modified || stride_is_modified || end_is_modified) {
+    return errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Standarize begin and end bounds by applying masks, making negative values
+  // positive, and correcting out of bounds ranges (StridedSlice does this
+  // silently).
+  for (int i = 1; i < input_dims.size(); i++) {
+    // Begin
+    if ((1 << i) & begin_mask) {
+      begin[i] = 0;
+    } else if (begin[i] < 0) {
+      begin[i] += input_dims[i];
+    }
+    begin[i] = std::max(0, std::min(begin[i], input_dims[i]));
+    // End
+    if ((1 << i) & end_mask) {
+      end[i] = input_dims[i];
+    } else if (end[i] < 0) {
+      end[i] += input_dims[i];
+    }
+    end[i] = std::max(0, std::min(end[i], input_dims[i]));
+  }
+  // Negative or zero strides currently not supported.
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (stride[i] <= 0) {
+      return errors::Unimplemented(
+          "Negative or zero stride values are not supported for StridedSlice, "
+          "at ",
+          node_def.name());
+    }
+  }
+  // TRT Slice layer uses (begin, size) instead of (begin, end)
+  std::vector<int> size(input_dims.size());
+  for (int i = 0; i < input_dims.size(); i++) {
+    // Divide by stride (round up)
+    size[i] = (end[i] - begin[i] + stride[i] - 1) / stride[i];
+  }
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
+}
+
+Status ConvertConv2D(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false);
+}
+
+Status ConvertConv2DDepthwise(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false);
+}
+
+Status ConvertConv2DBackpropInput(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true);
+}
+
+Status ConvertPool(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   nvinfer1::PoolingType type;
   if (node_def.op() == "MaxPool") {
     type = nvinfer1::PoolingType::kMAX;
   } else if (node_def.op() == "AvgPool") {
     type = nvinfer1::PoolingType::kAVERAGE;
   } else {
-    return tensorflow::errors::Unimplemented(
-        "Unsupported pooling type: ", node_def.op(), ", at ", node_def.name());
+    return errors::Unimplemented("Unsupported pooling type: ", node_def.op(),
+                                 ", at ", node_def.name());
   }
   TFAttrs attrs(node_def);
   const string padding_type = attrs.get<string>("padding");
   if ((padding_type != "SAME") && (padding_type != "VALID")) {
-    return tensorflow::errors::Unimplemented(
-        "Unsupported padding type: ", padding_type, ", at ", node_def.name());
+    return errors::Unimplemented("Unsupported padding type: ", padding_type,
+                                 ", at ", node_def.name());
   }
   if (params->validation_only) return Status::OK();
 
@@ -2365,10 +2670,10 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
   }
 
-  const auto tf_stride = attrs.get<std::vector<int>>("strides");
+  const auto tf_stride = attrs.get<std::vector<int64>>("strides");
   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
-  const auto tf_kernel = attrs.get<std::vector<int>>("ksize");
+  const auto tf_kernel = attrs.get<std::vector<int64>>("ksize");
   const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]);
 
   auto tensor_dim = tensor->getDimensions();
@@ -2420,21 +2725,61 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
   }
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertActivation(OpConverterParams* params) {
+// TODO(tmorris): Use ActivationType::kLEAKY_RELU in TRT 5.1+ once perf
+// improves.
+Status ConvertLeakyRelu(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        node_def.op(), " expects one input, at ", node_def.name());
-  }
-  if (!inputs.at(0).is_tensor()) {
-    return tensorflow::errors::Unimplemented(
-        node_def.op(), " is only implemented for tensors, at ",
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+
+  TFAttrs attrs(node_def);
+  const float alpha = attrs.get<float>("alpha");
+  if (alpha < 0.0f || alpha > 1.0f) {
+    return errors::Unimplemented(
+        "Alpha value for LeakyRelu must be between 0 and 1, at ",
         node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+
+  // Input Tensor
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  // Create const for alpha.
+  const nvinfer1::ITensor* const_alpha_tensor = nullptr;
+  TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
+      params, alpha, tensor->getDimensions(), &const_alpha_tensor));
+  // alpha * x
+  nvinfer1::IElementWiseLayer* mul_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          *const_cast<nvinfer1::ITensor*>(const_alpha_tensor),
+          nvinfer1::ElementWiseOperation::kPROD);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(mul_layer, node_def.name());
+  // max(x, alpha * x)
+  nvinfer1::IElementWiseLayer* max_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          *const_cast<nvinfer1::ITensor*>(mul_layer->getOutput(0)),
+          nvinfer1::ElementWiseOperation::kMAX);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(max_layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = max_layer->getOutput(0);
+  params->converter->MarkQuantizationRangesAsInferrable(
+      output_tensor, const_cast<nvinfer1::ITensor*>(mul_layer->getOutput(0)));
+
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
+
+Status ConvertActivation(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   static const std::unordered_map<string, nvinfer1::ActivationType> ops{
       {"Relu", nvinfer1::ActivationType::kRELU},
       {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
@@ -2442,11 +2787,10 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) {
   };
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end()) {
-    return tensorflow::errors::Unimplemented(
-        "Activation op: ", node_def.op(),
-        " not supported at: ", node_def.name());
+    return errors::Unimplemented("Activation op: ", node_def.op(),
+                                 " not supported at: ", node_def.name());
   }
-  if (params->validation_only) return tensorflow::Status::OK();
+  if (params->validation_only) return Status::OK();
 
   // Start conversion.
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
@@ -2462,25 +2806,25 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) {
     params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
   }
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ConvertQuantize(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if ((inputs.size() == 0) ||
-      (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) ||
-      (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) ||
-      (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) ||
-      (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) {
-    return errors::InvalidArgument("Invalid number of inputs for ",
-                                   node_def.op(), ", at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    // TensorRT will automatically quantize weights, so we will ignore ranges
-    // for weights.
-    params->outputs->push_back(inputs.at(0));
-    return Status::OK();
+  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  } else if (node_def.op() == "FakeQuantWithMinMaxVars") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params, {{"input", false}, {"min", true}, {"max", true}}));
+  } else if (node_def.op() == "QuantizeAndDequantizeV2") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params, {{"input", false}, {"input_min", true}, {"input_max", true}}));
+  } else if (node_def.op() == "QuantizeAndDequantizeV3") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
+                                                    {"input_min", true},
+                                                    {"input_max", true},
+                                                    {"num_bits", true}}));
   }
   float min_range = 0.0f;
   float max_range = 0.0f;
@@ -2497,11 +2841,6 @@ Status ConvertQuantize(OpConverterParams* params) {
              node_def.op() == "QuantizeAndDequantizeV2" ||
              node_def.op() == "QuantizeAndDequantizeV3") {
     // Get ranges via inputs.
-    if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) {
-      return errors::InvalidArgument("Min and max inputs for ", node_def.op(),
-                                     " must be weights not tensors, at ",
-                                     node_def.name());
-    }
     auto get_weights_value = [&inputs](int index) {
       auto raw_weights = static_cast<float*>(
           const_cast<void*>(inputs.at(index).weights().GetValues()));
@@ -2532,20 +2871,13 @@ Status ConvertQuantize(OpConverterParams* params) {
   return Status::OK();
 }
 
-// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports
-// Relu6 natively.
-tensorflow::Status ConvertRelu6(OpConverterParams* params) {
+// TODO(tmorris): Use ActivationType::kCLIP in TRT 5.1+ once perf improves.
+Status ConvertRelu6(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        "Invalid number of inputs for Relu6, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Relu6 is only implemented for tensors, not weights, at ",
-        node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   if (params->validation_only) return Status::OK();
   // ***************************************************************************
   // TensorRT does not implement Relu6 natively. This function converts Relu6 op
@@ -2569,24 +2901,10 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f,
                                               6.0f);
 
-  // Create a constant layer to store the floating point weight i.e. 6.0f This
-  // tensor will be broadcasted uniformly during elementwise `min` operation.
-  // The constant has to have the same rank as the input in order for TRT to
-  // broadcast
-  nvinfer1::Dims dims;
-  dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    dims.d[i] = 1;
-  }
-  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
-      tensorflow::DataType::DT_FLOAT, dims);
-  auto weights_ptr =
-      static_cast<float*>(const_cast<void*>(weights.GetValues()));
-  weights_ptr[0] = 6.0f;
-  nvinfer1::ITensor* const6_tensor =
-      params->converter->CreateConstantLayer(weights, dims);
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_tensor, node_def.name());
-  params->converter->ProvideQuantizationRange(const6_tensor, 0.0f, 6.0f);
+  // Create a constant layer to store the floating point weight i.e. 6.0f
+  const nvinfer1::ITensor* const6_tensor = nullptr;
+  TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
+      params, 6.0f, relu_layer->getOutput(0)->getDimensions(), &const6_tensor));
 
   // ElementWise Min Operation
   // Min op is a nop for INT8 execution path, as the input tensor
@@ -2594,7 +2912,8 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   nvinfer1::IElementWiseLayer* relu6_layer =
       params->converter->network()->addElementWise(
           *const_cast<nvinfer1::ITensor*>(relu_layer->getOutput(0)),
-          *const6_tensor, nvinfer1::ElementWiseOperation::kMIN);
+          *const_cast<nvinfer1::ITensor*>(const6_tensor),
+          nvinfer1::ElementWiseOperation::kMIN);
   TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name());
   nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0);
   params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
@@ -2603,26 +2922,19 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   return Status::OK();
 }
 
-tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
+Status ConvertBiasAdd(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return errors::InvalidArgument("Input expects tensor and weights, at ",
-                                   node_def.name());
-  }
-  TFAttrs attrs(node_def);
-  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
-  if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
-    return errors::Unimplemented("Data type is not supported, for node ",
-                                 node_def.name(), " got ",
-                                 DataTypeString(tf_dtype));
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"value", false}, {"bias", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   if (params->validation_only) return Status::OK();
 
   nvinfer1::ITensor* tensor =
       const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
   const nvinfer1::Dims original_dims = tensor->getDimensions();
+  TFAttrs attrs(node_def);
   const string data_format = attrs.get<string>("data_format");
   const int channel_index =
       (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
@@ -2668,7 +2980,7 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   }
 
   TRT_ShapedWeights weights = inputs.at(1).weights();
-  if (params->converter->precision_mode() == FP16MODE) {
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
@@ -2712,43 +3024,69 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   return Status::OK();
 }
 
-Status GetTensorDimsWithProtoShape(const Tensor& tensor,
-                                   int tensor_proto_array_len,
-                                   nvinfer1::Dims* dims) {
+void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) {
   if (tensor.dims() > 0) {
     *dims = GetTrtDimsForTensor(tensor);
-    if (TrtDimsNumElements(*dims) != tensor_proto_array_len &&
-        tensor_proto_array_len != 1) {
-      return errors::InvalidArgument(
-          "Broadcast on weights only supports kCHANNEL and kUNIFORM");
-    }
   } else {
     dims->nbDims = 1;
     // No dimension provided. Flatten it.
-    dims->d[0] = tensor_proto_array_len;
+    dims->d[0] = tensor.NumElements();
     dims->type[0] = nvinfer1::DimensionType::kSPATIAL;
     for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
       dims->d[i] = 0;
     }
   }
-  return Status::OK();
 }
 
-template <typename CType>
-Status TfTensorToTrtWeights(const DataType dtype, const Tensor& tensor,
-                            const CType* tensor_proto_array,
-                            int tensor_proto_array_len, TrtWeightStore* store,
+Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store,
                             TRT_ShapedWeights* weights) {
+  const DataType dtype = tensor.dtype();
+
+  // We always convert the integer constants to INT32, since TRT INT8 is for
+  // quantized inference.
+  //
+  // TODO(aaroey): FP16 will remain in half format and is not converted to
+  // FP32, but the converter currently uses all float weights as FP32. Fix
+  // this.
+  const DataType converted_dtype =
+      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
+                                                                  : dtype);
+
+  // Verify that the dtype is supported by TensorRT. Otherwise, return an error.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
+
+  if (tensor.NumElements() == 0) {
+    // Return empty weights having converted dtype.
+    *weights = TRT_ShapedWeights(converted_dtype);
+    return Status::OK();
+  }
+
   nvinfer1::Dims weight_dims;
-  TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(tensor, tensor_proto_array_len,
-                                                 &weight_dims));
-  *weights = store->GetTempWeights(dtype, weight_dims);
-  void* dst = const_cast<void*>(weights->GetValues());
-  if (tensor_proto_array_len == 1) {
-    std::fill_n((CType*)dst, TrtDimsNumElements(weight_dims),
-                *tensor_proto_array);
+  GetTensorDimsWithProtoShape(tensor, &weight_dims);
+  *weights = weight_store->GetTempWeights(converted_dtype, weight_dims);
+
+  // Copy the tensor directly if the tensor does not require cast to the
+  // supported type.
+  if (converted_dtype == dtype) {
+    char* dst = static_cast<char*>(const_cast<void*>(weights->GetValues()));
+    memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes());
+    return Status::OK();
+  }
+
+  // Copy tensor elements after casting them to the converted DataType.
+  int32* dst = static_cast<int32*>(const_cast<void*>(weights->GetValues()));
+  if (dtype == DT_INT16) {
+    const int16* src = tensor.flat<int16>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
+  } else if (dtype == DT_INT8) {
+    const int8* src = tensor.flat<int8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   } else {
-    memcpy(dst, tensor_proto_array, weights->size_bytes());
+    // dtype can only be DT_UINT8 at this point.
+    TFTRT_CHECK_EQ_TYPE(dtype, DT_UINT8);
+    const uint8* src = tensor.flat<uint8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   }
   return Status::OK();
 }
@@ -2758,7 +3096,7 @@ Status TfTensorToTrtWeights(const DataType dtype, const Tensor& tensor,
 // weights to params->outputs. We did this since TrtNodeValidator needs the
 // weights as input to other nodes, and use it to determine whether those nodes
 // are supported by TRT.
-tensorflow::Status ConvertConst(OpConverterParams* params) {
+Status ConvertConst(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if (!inputs.empty()) {
@@ -2766,117 +3104,55 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
         "Constant node is expected to have empty input list: ",
         node_def.name());
   }
-  TFAttrs attrs(node_def);
-  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
-  // We always convert the integer constants to kINT32, since TRT kINT8 is for
-  // quantized inference.
-  const DataType converted_dtype =
-      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
-                                                                  : dtype);
-  nvinfer1::DataType trt_dtype;
-  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
 
   // Create shaped weights as output
   const auto& tensor_proto = node_def.attr().at("value").tensor();
-  tensorflow::Tensor tensor;
+  Tensor tensor;
   if (!tensor.FromProto(tensor_proto)) {
-    return tensorflow::errors::Internal("Cannot parse weight tensor proto: ",
-                                        node_def.name());
+    return errors::Internal("Cannot parse weight tensor proto: ",
+                            node_def.name());
   }
 
-  TRT_ShapedWeights weights(converted_dtype);
-  if (tensor.NumElements() == 0) {
-    // Do nothing.
-  } else if (!tensor_proto.float_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.float_val().begin(),
-        tensor_proto.float_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.int_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.int_val().begin(),
-        tensor_proto.int_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.half_val().empty()) {
-    // TODO(aaroey): implement fp16 conversion.
-    return errors::Unimplemented("fp16 constant is not supported yet.");
-  } else if (!tensor_proto.tensor_content().empty()) {
-    // TODO(aaroey): fp16 will remain in half format and is not converted to
-    // fp32, but the converter currently uses all float weights as fp32. Fix
-    // this.
-    const auto& content = tensor_proto.tensor_content();
-    if (content.size() > 0) {
-      const int dtype_size = tensorflow::DataTypeSize(dtype);
-      if (content.size() % dtype_size != 0) {
-        return errors::FailedPrecondition("Tensor content size ",
-                                          content.size(),
-                                          " is not a multiple of ", dtype_size);
-      }
-      nvinfer1::Dims weights_dim;
-      TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(
-          tensor, content.size() / dtype_size, &weights_dim));
-      const int64_t size_bytes = TrtDimsNumElements(weights_dim) * dtype_size;
-      if (content.size() != size_bytes) {
-        return errors::FailedPrecondition(
-            "Tensor size and TensorProto content size mismatch: ", size_bytes,
-            " vs ", content.size());
-      } else if (tensor.NumElements() != content.size() / dtype_size) {
-        return errors::FailedPrecondition(
-            "Tensor elements count and TensorProto content size mismatch: ",
-            tensor.NumElements(), " vs ", content.size() / dtype_size);
-      }
-      weights =
-          params->weight_store->GetTempWeights(converted_dtype, weights_dim);
-      if (dtype_size == tensorflow::DataTypeSize(converted_dtype)) {
-        port::CopyToArray(content, static_cast<char*>(
-                                       const_cast<void*>(weights.GetValues())));
-      } else {
-        // Copy out the weights as original data type.
-        std::vector<uint8_t> temp_weights(content.size());
-        port::CopyToArray(content,
-                          reinterpret_cast<char*>(temp_weights.data()));
-        int32* dst =
-            static_cast<int32*>(const_cast<void*>(weights.GetValues()));
-        // Copy to the weight store as converted data type.
-        if (dtype == DT_INT16) {
-          int16* data = reinterpret_cast<int16*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_INT8) {
-          int8* data = reinterpret_cast<int8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_UINT8) {
-          uint8* data = reinterpret_cast<uint8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else {
-          return errors::FailedPrecondition(
-              "Unexpected data type: ", DataTypeString(dtype),
-              " at: ", node_def.name());
-        }
-      }
-    }
-  } else {
-    return errors::Unimplemented("Not supported constant type, at ",
-                                 node_def.name());
+  TFAttrs attrs(node_def);
+  const DataType dtype = attrs.get<DataType>("dtype");
+  if (dtype != tensor.dtype()) {
+    return errors::InvalidArgument("DataType mismatch between attr (",
+                                   DataTypeString(dtype), ") and tensor (",
+                                   DataTypeString(tensor.dtype()), ")");
   }
+
+  TRT_ShapedWeights weights;
+  TF_RETURN_IF_ERROR(
+      TfTensorToTrtWeights(tensor, params->weight_store, &weights));
+
   if (params->outputs != nullptr) {
     params->outputs->push_back(TRT_TensorOrWeights(weights));
   }
   return Status::OK();
 }
 
-tensorflow::Status ConvertIdentity(OpConverterParams* params) {
+Status ConvertIdentity(OpConverterParams* params) {
   // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT
   // 5.0, however once we know that it does it would be nice to use that
   // instead.
+  if (params->validation_only) return Status::OK();
   params->outputs->push_back(params->inputs.at(0));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status ConvertBinary(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  // TODO(tmorris): Enable once false is updated to mean either tensor or weight
+  // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y",
+  // false}}));
   if (inputs.size() != 2) {
-    return errors::InvalidArgument("Binary ops require two inputs, at ",
+    return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
+                                   " inputs but expected 2, at ",
                                    node_def.name());
   }
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
 
   // Constant folding should have been done by TensorFlow
   if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
@@ -2908,135 +3184,165 @@ Status ConvertBinary(OpConverterParams* params) {
   // If both input are tensors, or one of them is weights but the conversion
   // above failed, try the conversion using BinaryTensorOpTensor.
   if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) {
-    if (!status.ok()) VLOG(1) << status;
+    if (!status.ok()) VLOG(2) << status;
     status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1));
   }
   return status;
 }
 
-tensorflow::Status ConvertUnary(OpConverterParams* params) {
+Status ConvertRsqrt(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  static const std::unordered_map<string, nvinfer1::UnaryOperation> ops{
-      {"Neg", nvinfer1::UnaryOperation::kNEG},
-      {"Exp", nvinfer1::UnaryOperation::kEXP},
-      {"Log", nvinfer1::UnaryOperation::kLOG},
-      {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
-      {"Abs", nvinfer1::UnaryOperation::kABS},
-      {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
-  };
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+  if (params->validation_only) return Status::OK();
 
-  if (inputs.size() != 1) {
-    return tensorflow::errors::FailedPrecondition(
-        "Unary ops require single tensor input, at ", node_def.name());
+  // TODO(tmorris): params->converter is null during validation. Allow
+  // precision_mode and use_calibration to be accessed during validation and
+  // include this check in validation.
+  // We will need a quantization range for intermediate tensor if not using
+  // calibration.
+  //
+  //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+  //                     ^
+  //               need range here
+  if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
+      !params->converter->use_calibration()) {
+    return errors::Unimplemented(
+        "Intermediate quantization range cannot be determined without"
+        " calibration for Rsqrt, consider replacing with "
+        "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+        node_def.name());
   }
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  // Sqrt
+  nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::UnaryOperation::kSQRT);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name());
+  // Recip
+  nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary(
+      *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name());
+  params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0)));
+  return Status::OK();
+}
 
-  // TODO(jie): check type
-  const nvinfer1::ITensor* tensor = nullptr;
-  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      inputs.at(0), inputs.at(0).GetTrtDims(), &tensor));
+const std::unordered_map<string, nvinfer1::UnaryOperation>*
+UnaryOperationMap() {
+  static auto* const m =
+      new std::unordered_map<string, nvinfer1::UnaryOperation>({
+        {"Neg", nvinfer1::UnaryOperation::kNEG},
+            {"Exp", nvinfer1::UnaryOperation::kEXP},
+            {"Log", nvinfer1::UnaryOperation::kLOG},
+            {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
+            {"Abs", nvinfer1::UnaryOperation::kABS},
+            {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
+#if IS_TRT_VERSION_GE(5, 1, 0)
+            {"Sin", nvinfer1::UnaryOperation::kSIN},
+            {"Cos", nvinfer1::UnaryOperation::kCOS},
+            {"Tan", nvinfer1::UnaryOperation::kTAN},
+            {"Sinh", nvinfer1::UnaryOperation::kSINH},
+            {"Cosh", nvinfer1::UnaryOperation::kCOSH},
+            {"Asin", nvinfer1::UnaryOperation::kASIN},
+            {"Acos", nvinfer1::UnaryOperation::kACOS},
+            {"Atan", nvinfer1::UnaryOperation::kATAN},
+            {"Asinh", nvinfer1::UnaryOperation::kASINH},
+            {"Acosh", nvinfer1::UnaryOperation::kACOSH},
+            {"Atanh", nvinfer1::UnaryOperation::kATANH},
+            {"Ceil", nvinfer1::UnaryOperation::kCEIL},
+            {"Floor", nvinfer1::UnaryOperation::kFLOOR},
+#endif
+      });
+  return m;
+}
 
-  nvinfer1::IUnaryLayer* layer;
-  if (node_def.op() == "Rsqrt") {
-    // We will need a quantization range for intermediate tensor if not using
-    // calibration.
-    //
-    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
-    //                     ^
-    //               need range here
-    if (params->converter->precision_mode() == INT8MODE &&
-        !params->converter->use_calibration()) {
-      return errors::Unimplemented(
-          "Intermediate quantization range cannot be determined without"
-          " calibration for Rsqrt, consider replacing with "
-          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
-          node_def.name());
-    }
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kSQRT);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-    tensor = layer->getOutput(0);
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kRECIP);
-  } else if (ops.count(node_def.op()) != 0) {
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor), ops.at(node_def.op()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op: ", node_def.op(), " not supported, at ", node_def.name());
+Status ConvertUnary(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+  auto op_pair = UnaryOperationMap()->find(node_def.op());
+  if (op_pair == UnaryOperationMap()->end()) {
+    return errors::Unimplemented("Unary op: ", node_def.op(),
+                                 " not supported at: ", node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Set quantization ranges.
+  if (node_def.op() == "Sin" || node_def.op() == "Cos") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  } else if (node_def.op() == "Asin" || node_def.op() == "Atan") {
+    params->converter->ProvideQuantizationRange(output_tensor, -M_PI_2, M_PI_2);
+  } else if (node_def.op() == "Acos") {
+    params->converter->ProvideQuantizationRange(output_tensor, 0.0f, M_PI);
+  } else if (node_def.op() == "Neg" || node_def.op() == "Abs") {
+    // Neg and Abs will have same range as input since TRT uses symmetric
+    // quantization.
+    // TODO(tmorris): Should we infer ranges for Ceil and Floor as well?
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), output_tensor);
+  }
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertSquare(OpConverterParams* params) {
+Status ConvertSquare(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument("Square expects one input, at ",
-                                               node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Square is only implemented for tensors, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   if (params->validation_only) return Status::OK();
 
   // Constant 2 with same rank as input
-  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
-  for (int i = 0; i < dims.nbDims; i++) {
-    dims.d[i] = 1;
-  }
-  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
-      tensorflow::DataType::DT_FLOAT, dims);
-  auto weights_ptr =
-      static_cast<float*>(const_cast<void*>(weights.GetValues()));
-  weights_ptr[0] = 2.f;
-  nvinfer1::ITensor* const2_tensor =
-      params->converter->CreateConstantLayer(weights, dims);
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_tensor, node_def.name());
+  const nvinfer1::ITensor* const2_tensor = nullptr;
+  TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
+      params, 2.0f, inputs.at(0).GetTrtDims(), &const2_tensor));
 
   // ElementWise Pow Operation
   nvinfer1::IElementWiseLayer* layer =
       params->converter->network()->addElementWise(
           *const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
-          *const2_tensor, nvinfer1::ElementWiseOperation::kPOW);
+          *const_cast<nvinfer1::ITensor*>(const2_tensor),
+          nvinfer1::ElementWiseOperation::kPOW);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertReduce(OpConverterParams* params) {
+Status ConvertReduce(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
 
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   TRT_ShapedWeights index_list = inputs.at(1).weights();
 
   TFAttrs attrs(node_def);
-  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
-
   // Only expect to handle INT32 as attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32) {
-    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
+  if (attrs.get<DataType>("Tidx") != DataType::DT_INT32) {
+    return errors::Unimplemented("Tidx supports only DT_INT32");
   }
 
   int axes = 0;
   if (index_list.count() == 0) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "TRT cannot support reduce on all (batch) dimensions, at",
         node_def.name());
   } else {
@@ -3046,7 +3352,7 @@ tensorflow::Status ConvertReduce(OpConverterParams* params) {
       int axis = index_list_data[i];
       if (axis < 0) axis += tensor->getDimensions().nbDims + 1;
       if (axis == 0) {
-        return tensorflow::errors::InvalidArgument(
+        return errors::InvalidArgument(
             "TRT cannot reduce at batch dimension, at", node_def.name());
       }
       axes |= (1 << (axis - 1));
@@ -3065,9 +3371,10 @@ tensorflow::Status ConvertReduce(OpConverterParams* params) {
   } else if (node_def.op() == "Mean") {
     reduce_operation = nvinfer1::ReduceOperation::kAVG;
   } else {
-    return tensorflow::errors::Unimplemented("Op not supported ", node_def.op(),
-                                             " , at ", node_def.name());
+    return errors::Unimplemented("Op not supported ", node_def.op(), ", at ",
+                                 node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
   const auto keep_dims = attrs.get<bool>("keep_dims");
   nvinfer1::ILayer* layer = params->converter->network()->addReduce(
@@ -3076,18 +3383,16 @@ tensorflow::Status ConvertReduce(OpConverterParams* params) {
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertPad(OpConverterParams* params) {
+Status ConvertPad(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  // TODO(aaroey): make a routine for this check and reuse it.
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
 
   // Implement tensor binaryOp weight [channel wise] for now;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
@@ -3100,19 +3405,18 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
   TFAttrs attrs(node_def);
   // Padding type here is done through TF type
   //   so I can leverage their EnumToDataType for my cast
-  auto padding_type = attrs.get<tensorflow::DataType>("Tpaddings");
+  auto padding_type = attrs.get<DataType>("Tpaddings");
   // TODO(jie): handle data type conversion for TRT?
 
   if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Pad only supports explicit padding on 4 dimensional tensor, at ",
         node_def.name());
   }
 
   // Only expect to handle INT32 as attributes for now
-  if (padding_type != tensorflow::DataType::DT_INT32) {
-    return tensorflow::errors::Unimplemented(
-        "Tpaddings supports only DT_INT32");
+  if (padding_type != DataType::DT_INT32) {
+    return errors::Unimplemented("Tpaddings supports only DT_INT32");
   }
   auto pad_data = static_cast<int*>(const_cast<void*>(pads.GetValues()));
 
@@ -3124,27 +3428,27 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
   }
 
   // No padding at all, we should exit
-  if (pad_index.size() == 0) {
+  if (pad_index.empty()) {
     params->outputs->push_back(inputs.at(0));
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 
   // Only supports padding on less than 2 axis GIE-2579
   if (pad_index.size() > 2) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Padding layer does not support padding on > 2");
   }
 
   // Padding on batch dimension is not supported
   if (pad_index[0] == 0) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Padding layer does not support padding on batch dimension");
   }
 
   // Not doing the legit thing here. ignoring padding on dim 1 and 3;
   // TODO(jie): implement pad as uff parser
   if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Padding layer does not support padding on dimension 1 and 3 yet");
   }
   if (params->validation_only) return Status::OK();
@@ -3185,17 +3489,21 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
 
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertConcat(OpConverterParams* params) {
+Status ConvertConcat(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  // TODO(tmorris): There is a bug with Concat and INT32 in TRT - it is supposed
+  // to be supported.
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   // not including the last input (axis) here
   int input_size = static_cast<int>(inputs.size()) - 1;
 
   if (!inputs.at(0).is_tensor()) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Concat in TRT support only Tensor input, at ", node_def.name());
   }
 
@@ -3203,13 +3511,13 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
   TRT_ShapedWeights axis = inputs.at(input_size).weights();
 
   TFAttrs attrs(node_def);
-  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
+  auto index_type = attrs.get<DataType>("Tidx");
 
   // TODO(jie): handle data type
   // Only expect to handle INT32 as index attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32, at ",
-                                             node_def.name());
+  if (index_type != DataType::DT_INT32)
+    return errors::Unimplemented("Tidx supports only DT_INT32, at ",
+                                 node_def.name());
 
   int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
 
@@ -3218,11 +3526,11 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
   auto dim = inputs.at(0).tensor()->getDimensions();
   // dimension check
   if (index > dim.nbDims + 1) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Concatenate on axis out of dimension range, at ", node_def.name());
   }
   if (index == 0) {
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Concatenate on batch dimension not supported, at ", node_def.name());
   }
   if (index < 0) {
@@ -3236,14 +3544,14 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
     auto tensor_i = inputs.at(i).tensor();
     auto dim_i = tensor_i->getDimensions();
     if (dim_i.nbDims != dim.nbDims) {
-      return tensorflow::errors::InvalidArgument(
+      return errors::InvalidArgument(
           "Concatenate receives inputs with inconsistent dimensions, at ",
           node_def.name());
     }
     for (int j = 0; j < dim.nbDims; j++) {
       // check dimension consistency on non-concatenate axis
       if (j != index - 1 && dim_i.d[j] != dim.d[j]) {
-        return tensorflow::errors::InvalidArgument(
+        return errors::InvalidArgument(
             "Concatenate receives inputs with inconsistent shape, at",
             node_def.name());
       }
@@ -3251,7 +3559,7 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
 
     inputs_vec.push_back(tensor_i);
   }
-  if (params->validation_only) return tensorflow::Status::OK();
+  if (params->validation_only) return Status::OK();
 
   // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   nvinfer1::IConcatenationLayer* layer =
@@ -3262,17 +3570,24 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
   layer->setAxis(index - 1);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
+Status ConvertFusedBatchNorm(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false},
+                                                  {"scale", true},
+                                                  {"offset", true},
+                                                  {"mean", true},
+                                                  {"variance", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   TFAttrs attrs(node_def);
   float epsilon = attrs.get<float>("epsilon");
   auto data_format = attrs.get<string>("data_format");
   if (data_format != "NCHW") {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         node_def.op(), " only supports data_format=NCHW, at ", node_def.name());
   }
   bool is_training = attrs.get<bool>("is_training");
@@ -3284,38 +3599,23 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
                  << "are using Keras, please call "
                  << "keras.backend.set_learning_phase(0) before constructing "
                  << "your model. At " << node_def.name();
-    return tensorflow::errors::Unimplemented(
-        node_def.op(), " only supports is_training=false, at ",
-        node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        node_def.op(),
-        " is only implemented for tensor inputs, not weights, at ",
-        node_def.name());
-  }
-  for (int i = 1; i < 5; i++) {
-    if (inputs.at(i).is_tensor()) {
-      return tensorflow::errors::Unimplemented(
-          node_def.op(),
-          " must have constant inputs for scale, offset, mean and variance, "
-          "at ",
-          node_def.name());
-    }
+    return errors::Unimplemented(node_def.op(),
+                                 " only supports is_training=false, at ",
+                                 node_def.name());
   }
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
   //  Check parameter types
   auto parameter_type = inputs.at(1).weights().type_;
-  if ((parameter_type != tensorflow::DataType::DT_FLOAT) &&
-      (parameter_type != tensorflow::DataType::DT_HALF)) {
-    return tensorflow::errors::Unimplemented(
+  if ((parameter_type != DataType::DT_FLOAT) &&
+      (parameter_type != DataType::DT_HALF)) {
+    return errors::Unimplemented(
         "only float32 or float16 weight data type is supported, for node " +
-        node_def.name() + " got " + tensorflow::DataTypeString(parameter_type));
+        node_def.name() + " got " + DataTypeString(parameter_type));
   }
   for (int i = 1; i < 5; i++) {
     if (inputs.at(i).weights().type_ != parameter_type) {
-      return tensorflow::errors::Unimplemented(
+      return errors::Unimplemented(
           "Inconsistent parameter type for batchnorm is not supported, at: " +
           node_def.name());
     }
@@ -3324,7 +3624,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   TRT_ShapedWeights dummy_power_weights(parameter_type);
   size_t nweight = 0;
   for (int i = 1; i < 5; i++) {
-    nweight = std::max(nweight, (size_t)inputs.at(i).weights().count());
+    nweight = std::max<size_t>(nweight, inputs.at(i).weights().count());
   }
   TRT_ShapedWeights* ptr_shape_weights = nullptr;
   for (int i = 1; i < 5; i++) {
@@ -3332,7 +3632,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
       ptr_shape_weights =
           const_cast<TRT_ShapedWeights*>(&(inputs.at(i).weights()));
     } else if (inputs.at(i).weights().count() != 1) {
-      return tensorflow::errors::InvalidArgument(
+      return errors::InvalidArgument(
           "Inconsistent batchnorm parameter count, at: " + node_def.name());
     }
   }
@@ -3366,16 +3666,16 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
     float batchnorm_data[4];
     for (int j = 0; j < 4; j++) {
       if (inputs.at(j + 1).weights().count() != 1) {
-        if (parameter_type == tensorflow::DT_FLOAT) {
+        if (parameter_type == DT_FLOAT) {
           batchnorm_data[j] = vals_array[j][i];
-        } else if (parameter_type == tensorflow::DT_HALF) {
+        } else if (parameter_type == DT_HALF) {
           batchnorm_data[j] =
               Eigen::half_impl::half_to_float(cast_vals_array[j][i]);
         }
       } else {
-        if (parameter_type == tensorflow::DT_FLOAT) {
+        if (parameter_type == DT_FLOAT) {
           batchnorm_data[j] = vals_array[j][0];
-        } else if (parameter_type == tensorflow::DT_HALF) {
+        } else if (parameter_type == DT_HALF) {
           batchnorm_data[j] =
               Eigen::half_impl::half_to_float(cast_vals_array[j][0]);
         }
@@ -3387,10 +3687,10 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
     float variance = batchnorm_data[3];
     float combined_scale_val = scale / sqrtf(variance + epsilon);
     float combined_offset_val = offset - mean * combined_scale_val;
-    if (parameter_type == tensorflow::DT_FLOAT) {
+    if (parameter_type == DT_FLOAT) {
       combined_scale_vals[i] = combined_scale_val;
       combined_offset_vals[i] = combined_offset_val;
-    } else if (parameter_type == tensorflow::DT_HALF) {
+    } else if (parameter_type == DT_HALF) {
       cast_combined_scale_vals[i] = Eigen::half(combined_scale_val);
       cast_combined_offset_vals[i] = Eigen::half(combined_offset_val);
     }
@@ -3406,17 +3706,88 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
-                                       TRT_TensorOrWeights tensor_input,
-                                       TRT_ShapedWeights weights_raw,
-                                       bool transpose_weight,
-                                       string node_name) {
+Status ConvertGather(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"params", false}, {"indices", false}, {"axis", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32},
+      /*dtype_attr_name=*/"Tparams"));
+  absl::Span<const int> axis = inputs.at(2).weights().GetSpan<int>();
+  if (axis.size() != 1) {
+    return errors::InvalidArgument("Axis for GatherV2 must be a scalar, at ",
+                                   node_def.name());
+  }
+  int trt_axis = 0;
+  TF_RETURN_IF_ERROR(ConvertAxis(axis[0], inputs.at(0).GetTrtDims().nbDims,
+                                 node_def.name(), &trt_axis));
+  TRT_TensorOrWeights params_tensor = inputs.at(0);
+  TRT_TensorOrWeights indices_tensor = inputs.at(1);
+  if (indices_tensor.batch_size() != 1) {
+    return errors::InvalidArgument("Only indices with batch 1 are supported.");
+  }
+  // Both input are tensors, and the TF gather result will have rank:
+  // (params.nbDims + 1) + (indices.nbDims + 1) - 1,
+  // where "+ 1" adds the batch dim.
+  const int tf_gather_output_rank = params_tensor.GetTrtDims().nbDims +
+                                    indices_tensor.GetTrtDims().nbDims + 1;
+  if (tf_gather_output_rank > nvinfer1::Dims::MAX_DIMS + 1) {
+    return errors::InvalidArgument(
+        "Result of gather has dimension greater than ",
+        nvinfer1::Dims::MAX_DIMS + 1);
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Note on how IGatherLayer works: if both the data and indices tensors have
+  // a batch size dimension of size N, it performs:
+  // for batchid in xrange(N):
+  //   output[batchid, a0, ..., an, i, ..., j, b0, ..., bn] = (
+  //       data[batchid, a0, ..., an, indices[batchid, i, ..., j] b0, ..., bn])
+  nvinfer1::IGatherLayer* layer = params->converter->network()->addGather(
+      *const_cast<nvinfer1::ITensor*>(params_tensor.tensor()),
+      *const_cast<nvinfer1::ITensor*>(indices_tensor.tensor()), trt_axis);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+
+  nvinfer1::ITensor* gather_output = layer->getOutput(0);
+  nvinfer1::Dims trt_gather_output_dims = gather_output->getDimensions();
+  // Note for the "- 2": one is for the output batch dim encapsulated by TF-TRT,
+  // and the other is for the output dimension that is squeezed by IGatherLayer
+  // because of the implicit batch dim in the indices (see the above note).
+  if (trt_gather_output_dims.nbDims != tf_gather_output_rank - 2) {
+    return errors::Internal(
+        "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ",
+        tf_gather_output_rank - 2,
+        ", actual nbDims: ", trt_gather_output_dims.nbDims);
+  }
+  // Reshape the output so after adding the implicit batch dim it'll match the
+  // output shape of TF GatherV2.
+  for (int i = trt_gather_output_dims.nbDims; i > trt_axis; --i) {
+    trt_gather_output_dims.d[i] = trt_gather_output_dims.d[i - 1];
+  }
+  trt_gather_output_dims.d[trt_axis] = 1;
+  ++trt_gather_output_dims.nbDims;
+
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      TRT_TensorOrWeights(gather_output), trt_gather_output_dims,
+      /*validation_only=*/false, &output_tensor));
+
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return Status::OK();
+}
+
+Status ConvertMatMulHelper(OpConverterParams* params,
+                           TRT_TensorOrWeights tensor_input,
+                           TRT_ShapedWeights weights_raw, bool transpose_weight,
+                           string node_name) {
   nvinfer1::ITensor* output_tensor;
   if (!tensor_input.is_tensor()) {
-    return tensorflow::errors::InvalidArgument("Input 0 expects tensor");
+    return errors::InvalidArgument("Input 0 expects tensor");
   }
   const nvinfer1::ITensor* tensor = tensor_input.tensor();
 
@@ -3436,7 +3807,7 @@ tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
     input_dim.d[input_dim.nbDims++] = 1;
   }
   TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      tensor_input, input_dim, &tensor));
+      tensor_input, input_dim, /*validation_only=*/false, &tensor));
 
   nvinfer1::IFullyConnectedLayer* layer =
       params->converter->network()->addFullyConnected(
@@ -3449,29 +3820,22 @@ tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
   auto output_dim = output_tensor->getDimensions();
   output_dim.nbDims = 1;
   TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      TRT_TensorOrWeights(output_tensor), output_dim, &temp_tensor));
+      TRT_TensorOrWeights(output_tensor), output_dim, /*validation_only=*/false,
+      &temp_tensor));
   output_tensor = const_cast<nvinfer1::ITensor*>(temp_tensor);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-// inputs are both two dimensional (tensorflow::ops::MatMul)
-tensorflow::Status ConvertMatMul(OpConverterParams* params) {
+// inputs are both two dimensional (ops::MatMul)
+Status ConvertMatMul(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return errors::InvalidArgument("Input expects tensor and weights, at ",
-                                   node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"a", false}, {"b", true}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
 
   TFAttrs attrs(node_def);
-  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
-  if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
-    return errors::Unimplemented("Data type is not supported, for node ",
-                                 node_def.name(), " got ",
-                                 DataTypeString(tf_dtype));
-  }
   bool transpose_a = attrs.get<bool>("transpose_a");
   bool transpose_b = attrs.get<bool>("transpose_b");
 
@@ -3486,66 +3850,64 @@ tensorflow::Status ConvertMatMul(OpConverterParams* params) {
                              transpose_b, node_def.name());
 }
 
-tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
+Status ConvertBatchMatMul(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  TFAttrs attrs(node_def);
-
-  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
-  if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
-      tf_dtype != tensorflow::DataType::DT_HALF) {
-    return tensorflow::errors::Unimplemented(
-        "data type is not supported, for node " + node_def.name() + " got " +
-        tensorflow::DataTypeString(tf_dtype));
+  // TODO(tmorris): Enable once false is updated to mean either tensor or weight
+  // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y",
+  // false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
+  if (inputs.size() != 2) {
+    return errors::InvalidArgument(node_def.op(), " got ", inputs.size(),
+                                   " inputs but expected 2, at ",
+                                   node_def.name());
   }
-
-  bool transpose_a = attrs.get<bool>("adj_x");
-  bool transpose_b = attrs.get<bool>("adj_y");
-
-  auto dims = inputs.at(0).GetTrtDims();
+  if (inputs[0].is_weights() && inputs[1].is_weights()) {
+    return errors::InvalidArgument(
+        "All inputs are weights, but Grappler is expected to fold them.");
+  }
+  TFAttrs attrs(node_def);
+  const bool transpose_a = attrs.get<bool>("adj_x");
+  const bool transpose_b = attrs.get<bool>("adj_y");
+  const auto dims = inputs.at(0).GetTrtDims();
   if (dims.nbDims == 1) {  // NC * CK is only supported through fully connected
     if (transpose_a == false && inputs.at(0).is_tensor() &&
         inputs.at(1).is_weights()) {
       return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1).weights(),
                                  transpose_b, node_def.name());
     } else {
-      return tensorflow::errors::InvalidArgument(
-          "Invalid configuration for MatMul, at: " + node_def.name());
+      return errors::InvalidArgument("Invalid configuration for MatMul, at: ",
+                                     node_def.name());
     }
   }
 
-  const nvinfer1::ITensor* tensor_l;
-  const nvinfer1::ITensor* tensor_r;
-  auto dims_l = inputs.at(0).GetTrtDims();
-  auto dims_r = inputs.at(1).GetTrtDims();
-  if (inputs.at(0).is_weights()) {
-    if (inputs.at(0).GetTrtDims().d[0] != 1) {
-      return tensorflow::errors::InvalidArgument(
-          "Input 0 as weight assumes broadcast across batch for MatMul, at: " +
-          node_def.name());
-    } else {
-      for (int i = 0; i < dims_l.nbDims - 1; i++) {
-        dims_l.d[i] = dims_l.d[i + 1];
+  auto get_tensor_with_proper_dims = [params](
+                                         const TRT_TensorOrWeights& input,
+                                         const nvinfer1::ITensor** tensor) {
+    auto dims = input.GetTrtDims();
+    if (input.is_weights()) {
+      // The other operand must be a tensor, this is ensured by earlier checks.
+      // Checks that the batch dimension is not changed by broadcasting.
+      if (dims.d[0] != 1) {
+        return errors::InvalidArgument(
+            "Input weight attempts to broadcast across batch dimension for "
+            "BatchMatMul, at ",
+            params->node_def.name());
       }
-      dims_l.nbDims--;
+      // Remove the batch dimension from the weights.
+      TF_RETURN_IF_ERROR(RemoveBatchDimension(&dims));
     }
-  }
-  if (inputs.at(1).is_weights()) {
-    if (inputs.at(1).GetTrtDims().d[0] != 1) {
-      return tensorflow::errors::InvalidArgument(
-          "Input 1 as weight assumes broadcast across batch for MatMul, at: " +
-          node_def.name());
-    } else {
-      for (int i = 0; i < dims_r.nbDims - 1; i++) {
-        dims_r.d[i] = dims_r.d[i + 1];
-      }
-      dims_r.nbDims--;
-    }
-  }
-  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      inputs.at(0), dims_l, &tensor_l));
-  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      inputs.at(1), dims_r, &tensor_r));
+    // Create tensor and reshape if necessary.
+    TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+        input, dims, params->validation_only, tensor));
+    return Status::OK();
+  };
+  const nvinfer1::ITensor* tensor_l;
+  const nvinfer1::ITensor* tensor_r;
+  TF_RETURN_IF_ERROR(get_tensor_with_proper_dims(inputs.at(0), &tensor_l));
+  TF_RETURN_IF_ERROR(get_tensor_with_proper_dims(inputs.at(1), &tensor_r));
+  if (params->validation_only) return Status::OK();
 
   nvinfer1::IMatrixMultiplyLayer* layer =
       params->converter->network()->addMatrixMultiply(
@@ -3554,20 +3916,25 @@ tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
+Status ConvertSoftmax(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}}));
+  TF_RETURN_IF_ERROR(
+      AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF}));
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   int nbDims = tensor->getDimensions().nbDims;
   if (nbDims == 0) {
-    return tensorflow::errors::InvalidArgument(
-        "TensorRT Softmax cannot apply on batch dimension, at" +
+    return errors::InvalidArgument(
+        "TensorRT Softmax cannot apply on batch dimension, at",
         node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+
   nvinfer1::ISoftMaxLayer* layer = params->converter->network()->addSoftMax(
       *const_cast<nvinfer1::ITensor*>(tensor));
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
@@ -3578,65 +3945,80 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
   // Quantization range for SoftMax is always (0, 1)
   params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertTopK(OpConverterParams* params) {
+Status ConvertTopK(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"k", true}}));
+  TF_RETURN_IF_ERROR(AllowDataTypes(
+      *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}));
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-
-  int nbDims = tensor->getDimensions().nbDims;
-  if (nbDims == 0) {
-    return tensorflow::errors::InvalidArgument(
-        "TensorRT TopK cannot apply on batch dimension, at" + node_def.name());
+  const int num_dims = tensor->getDimensions().nbDims;
+  if (num_dims == 0) {
+    return errors::InvalidArgument(
+        "TensorRT TopK cannot apply on batch dimension, at", node_def.name());
   }
 
   TRT_ShapedWeights k_w = inputs.at(1).weights();
-  int k = *(static_cast<int*>(const_cast<void*>(k_w.GetValues())));
-
-  nvinfer1::TopKOperation op;
-  uint32_t reducedAxes = 0;
-  if (node_def.op() == "TopKV2") {
-    op = nvinfer1::TopKOperation::kMAX;
-    reducedAxes |= 1 << (nbDims - 1);
-  } else {
-    return tensorflow::errors::Unimplemented(
-        "Operation: " + node_def.op() +
-        " not implemented, at: " + node_def.name());
+  if (k_w.count() != 1) {
+    return errors::InvalidArgument("k value of TopK should be a scalar, at",
+                                   node_def.name());
   }
+  // Note that ITopKLayer always have sorted outputs, so we don't need to handle
+  // the 'sorted' attribute of the node.
+  if (params->validation_only) return Status::OK();
 
+  const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX;
+  const int k = *(static_cast<int*>(const_cast<void*>(k_w.GetValues())));
+  const uint32_t reduce_axes = 1 << (num_dims - 1);
   nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
-      *const_cast<nvinfer1::ITensor*>(tensor), op, k, reducedAxes);
+      *const_cast<nvinfer1::ITensor*>(tensor), op, k, reduce_axes);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_value_tensor = layer->getOutput(0);
   nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1);
-  // Tensor type for network output is not inferred. Indices should be INT32
-  // (default is float).
-  output_indices_tensor->setType(nvinfer1::DataType::kINT32);
   params->outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
   params->outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 static void RegisterValidatableOpConverters(
     std::unordered_map<string, OpConverter>* registration) {
-  // TODO(laigd): support all op types.
   (*registration)["BiasAdd"] = ConvertBiasAdd;
   (*registration)["ConcatV2"] = ConvertConcat;
   (*registration)["Const"] = ConvertConst;
   (*registration)["Conv2D"] = ConvertConv2D;
+  (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput;
   (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   (*registration)["ExpandDims"] = ConvertExpandDims;
+  (*registration)["GatherV2"] = ConvertGather;
+  (*registration)["LeakyRelu"] = ConvertLeakyRelu;
   (*registration)["MatMul"] = ConvertMatMul;
   (*registration)["Pad"] = ConvertPad;
   (*registration)["Relu6"] = ConvertRelu6;
   (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["Rsqrt"] = ConvertRsqrt;
+  (*registration)["Slice"] = ConvertSlice;
   (*registration)["Square"] = ConvertSquare;
   (*registration)["Squeeze"] = ConvertSqueeze;
   (*registration)["StridedSlice"] = ConvertStridedSlice;
   (*registration)["Transpose"] = ConvertTranspose;
+  (*registration)["TopKV2"] = ConvertTopK;
+
+  // TODO(ben,jie): this is a temp hack.
+  (*registration)["Identity"] = ConvertIdentity;  // Identity should be removed
+  (*registration)["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
+
+  (*registration)["Sum"] = ConvertReduce;
+  (*registration)["Prod"] = ConvertReduce;
+  (*registration)["Max"] = ConvertReduce;
+  (*registration)["Min"] = ConvertReduce;
+  (*registration)["Mean"] = ConvertReduce;
+  (*registration)["Softmax"] = ConvertSoftmax;
+  (*registration)["BatchMatMul"] = ConvertBatchMatMul;
 
   for (auto quantization_op_type :
        {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
@@ -3644,7 +4026,7 @@ static void RegisterValidatableOpConverters(
     (*registration)[quantization_op_type] = ConvertQuantize;
   }
   for (auto binary_op_type :
-       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) {
+       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum", "Pow"}) {
     (*registration)[binary_op_type] = ConvertBinary;
   }
   for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
@@ -3656,6 +4038,9 @@ static void RegisterValidatableOpConverters(
   for (auto normalization_op_type : {"FusedBatchNorm", "FusedBatchNormV2"}) {
     (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
   }
+  for (auto unary_op_pair : *UnaryOperationMap()) {
+    (*registration)[unary_op_pair.first] = ConvertUnary;
+  }
 }
 
 void TrtNodeValidator::RegisterOpValidators() {
@@ -3664,36 +4049,14 @@ void TrtNodeValidator::RegisterOpValidators() {
 
 void Converter::RegisterOpConverters() {
   RegisterValidatableOpConverters(&op_registry_);
-  // TODO(ben,jie): this is a temp hack.
-  op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
-  op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
-
-  op_registry_["Rsqrt"] = ConvertUnary;
-  op_registry_["Reciprocal"] = ConvertUnary;
-  op_registry_["Exp"] = ConvertUnary;
-  op_registry_["Log"] = ConvertUnary;
-  op_registry_["Sqrt"] = ConvertUnary;
-  op_registry_["Abs"] = ConvertUnary;
-  op_registry_["Neg"] = ConvertUnary;
-
-  op_registry_["Sum"] = ConvertReduce;
-  op_registry_["Prod"] = ConvertReduce;
-  op_registry_["Max"] = ConvertReduce;
-  op_registry_["Min"] = ConvertReduce;
-  op_registry_["Mean"] = ConvertReduce;
-  op_registry_["Softmax"] = ConvertSoftmax;
-  op_registry_["BatchMatMul"] = ConvertBatchMatMul;
-  op_registry_["TopKV2"] = ConvertTopK;
-
   plugin_converter_ = ConvertPlugin;
 }
 
-tensorflow::Status ConvertGraphDefToEngine(
-    const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
+Status ConvertGraphDefToEngine(
+    const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size,
     size_t max_workspace_size_bytes,
-    const std::vector<tensorflow::PartialTensorShape>& input_shapes,
-    Logger* logger, nvinfer1::IGpuAllocator* allocator,
-    TRTInt8Calibrator* calibrator,
+    const std::vector<PartialTensorShape>& input_shapes, Logger* logger,
+    nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator,
     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully) {
   engine->reset();
@@ -3705,9 +4068,13 @@ tensorflow::Status ConvertGraphDefToEngine(
   builder->setMaxBatchSize(max_batch_size);
   builder->setMaxWorkspaceSize(max_workspace_size_bytes);
   builder->setGpuAllocator(allocator);
-  if (precision_mode == FP16MODE) {
-    builder->setHalf2Mode(true);
-  } else if (precision_mode == INT8MODE) {
+  if (precision_mode == TrtPrecisionMode::FP16) {
+    builder->setFp16Mode(true);
+  } else if (precision_mode == TrtPrecisionMode::INT8) {
+    // Setting FP16 mode as well allows TRT to also consider FP16 kernels and
+    // use them in situations where they are faster than INT8 or where INT8 is
+    // not supported for a given layer.
+    builder->setFp16Mode(true);
     builder->setInt8Mode(true);
     if (use_calibration) {
       builder->setInt8Calibrator(calibrator);
@@ -3720,25 +4087,23 @@ tensorflow::Status ConvertGraphDefToEngine(
   auto trt_network =
       TrtUniquePtrType<nvinfer1::INetworkDefinition>(builder->createNetwork());
   if (!trt_network) {
-    return tensorflow::errors::Internal(
-        "Failed to create TensorRT network object");
+    return errors::Internal("Failed to create TensorRT network object");
   }
 
   // Build the network
   VLOG(1) << "Starting engine conversion ";
   Converter converter(trt_network.get(), precision_mode, use_calibration);
-  std::vector<std::pair<string, string>> output_tensors;
+  std::vector<Converter::EngineOutputInfo> output_tensors;
   // Graph nodes are already topologically sorted during construction
   for (const auto& node_def : gdef.node()) {
     string node_name = node_def.name();
     VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op();
-    if (tensorflow::str_util::StartsWith(node_name, kInputPHName) &&
-        (node_def.op() == "Placeholder")) {
+    if (IsEngineInput(node_name) && (node_def.op() == "Placeholder")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kInputPHName), &slot_number)) {
-        return tensorflow::errors::InvalidArgument(
-            "Failed to parse slot number from ", node_name);
+        return errors::InvalidArgument("Failed to parse slot number from ",
+                                       node_name);
       }
       nvinfer1::DataType trt_dtype;
       nvinfer1::Dims trt_dims;
@@ -3761,18 +4126,23 @@ tensorflow::Status ConvertGraphDefToEngine(
       // engines offline, by calling sess.run() and cache/serialize the engines.
       TF_RETURN_IF_ERROR(
           converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size));
-    } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
-               (node_def.op() == "Identity")) {
+    } else if (IsEngineOutput(node_name) && (node_def.op() == "Identity")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kOutputPHName), &slot_number)) {
-        return tensorflow::errors::InvalidArgument(
-            "Failed to parse slot number from ", node_name);
+        return errors::InvalidArgument("Failed to parse slot number from ",
+                                       node_name);
       }
+      // Get output type that TensorFlow expects
+      TFAttrs attrs(node_def);
+      DataType tf_dtype = attrs.get<DataType>("T");
+      nvinfer1::DataType trt_dtype;
+      TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
       if (output_tensors.size() <= slot_number) {
         output_tensors.resize(slot_number + 1);
       }
-      output_tensors.at(slot_number) = {node_def.input(0), node_name};
+      output_tensors.at(slot_number) = {node_def.input(0), node_name,
+                                        trt_dtype};
     } else {
       VLOG(2) << "Converting node: " << node_def.name() << " , "
               << node_def.op();
@@ -3789,18 +4159,17 @@ tensorflow::Status ConvertGraphDefToEngine(
   VLOG(1) << "Starting engine creation";
   engine->reset(builder->buildCudaEngine(*converter.network()));
   if (engine->get() == nullptr) {
-    return tensorflow::errors::Internal("Failed to build TensorRT engine");
+    return errors::Internal("Failed to build TensorRT engine");
   }
   VLOG(1) << "Finished conversion";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status ConvertSegmentToGraphDef(
-    const tensorflow::Graph* graph,
-    const tensorflow::grappler::GraphProperties& graph_properties,
+Status ConvertSegmentToGraphDef(
+    const Graph* graph, const grappler::GraphProperties& graph_properties,
     const std::vector<const Node*>& subgraph_nodes,  // In topological order
-    std::vector<EngineConnection>* connections,
-    tensorflow::GraphDef* segment_def, string* common_scope) {
+    std::vector<EngineConnection>* connections, GraphDef* segment_def,
+    string* scope_name) {
   std::set<string> marker_nodes;
   // Update connection shapes/data types and add corresponding input/output
   // nodes in the segment graphdef.
@@ -3810,12 +4179,12 @@ tensorflow::Status ConvertSegmentToGraphDef(
     auto outside_node = graph->FindNodeId(connection.outside_id);
     if (!outside_node) {
       // This should never happen, unless the original graph is problematic.
-      return tensorflow::errors::NotFound(
-          "Cannot find node with id ", connection.outside_id, " in the graph.");
+      return errors::NotFound("Cannot find node with id ",
+                              connection.outside_id, " in the graph.");
     }
     // Updates the shape and data types of input/output connections.
-    tensorflow::DataType dtype;
-    tensorflow::PartialTensorShape partial_shape;
+    DataType dtype;
+    PartialTensorShape partial_shape;
     if (connection.is_input_edge) {
       GetOutputProperties(graph_properties,
                           graph->FindNodeId(connection.outside_id),
@@ -3841,7 +4210,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
       }
       marker_nodes.insert(node_name);
       auto seg_node = segment_def->add_node();
-      tensorflow::NodeDefBuilder builder(node_name, "Placeholder");
+      NodeDefBuilder builder(node_name, "Placeholder");
       auto status = builder.Attr("shape", partial_shape)
                         .Attr("dtype", dtype)
                         .Finalize(seg_node);
@@ -3860,7 +4229,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
       }
       marker_nodes.insert(node_name);
       auto seg_node = segment_def->add_node();
-      tensorflow::NodeDefBuilder builder(node_name, "Identity");
+      NodeDefBuilder builder(node_name, "Identity");
       auto status =
           builder
               .Input(connection.inside_node_name, connection.inside_port, dtype)
@@ -3879,7 +4248,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
     local_scope = GetCommonNameScope(local_scope, node->name());
     old_to_new_id_map[node->id()] = segment_def->node_size();
     auto snode = segment_def->add_node();
-    snode->CopyFrom(node->def());
+    *snode = node->def();
     VLOG(2) << "Copying " << snode->name() << " to subgraph";
   }
   // Update the inputs of the new input nodes to point to placeholder nodes.
@@ -3910,14 +4279,14 @@ tensorflow::Status ConvertSegmentToGraphDef(
       TensorId input = ParseTensorName(snode->input(input_idx));
       if (!subgraph_node_names.count(
               string(input.first.data(), input.first.size())) &&
-          !str_util::StartsWith(input.first, kInputPHName)) {
+          !IsEngineInput(input.first)) {
         if (input.second == Graph::kControlSlot) {
           VLOG(1) << "... removing control inputs " << input.first
                   << " from subgraph.";
           ++input_idx;
           continue;
         } else {
-          return tensorflow::errors::InvalidArgument(
+          return errors::InvalidArgument(
               "Found non control input outside the segment that is not an "
               "engine connection to ",
               snode->name(), ": ", input.first);
@@ -3933,13 +4302,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
       snode->mutable_input()->RemoveLast();
     }
   }
-  *common_scope = local_scope;
-  VLOG(1) << "Converted TensorRT candidate segment @scope '" << local_scope
-          << "' to a GraphDef";
-  return tensorflow::Status::OK();
+  *scope_name = local_scope;
+  return Status::OK();
 }
 
-bool OutputEdgeValidator::operator()(const tensorflow::Edge* out_edge) const {
+bool OutputEdgeValidator::operator()(const Edge* out_edge) const {
   if (out_edge->IsControlEdge()) return true;
   if (out_edge->src()->type_string() == "Const") {
     VLOG(1) << "--> Need to remove output node " << out_edge->src()->name()
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
similarity index 82%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.h
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 4ea5775f04ca8faa9e5019d43ba146a8e5ff41b7..068482a3f64b474199ca73b4e46a3938316e6880 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
 #include <string>
@@ -22,11 +22,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -43,6 +43,12 @@ extern const char* const kOutputPHName;
 
 namespace convert {
 
+#define IS_TRT_VERSION_GE(major, minor, patch)                  \
+  ((NV_TENSORRT_MAJOR > major) ||                               \
+   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \
+   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \
+    NV_TENSORRT_PATCH >= patch))
+
 struct EngineConnection {
   // Constructs a non-control edge.
   EngineConnection(const string& outside, int out_id, int out_port,
@@ -74,14 +80,14 @@ struct EngineConnection {
   const string outside_node_name;
   const int outside_id;
   const int outside_port;
-  tensorflow::PartialTensorShape outside_shape;  // Only set for input edge.
+  PartialTensorShape outside_shape;  // Only set for input edge.
 
   const string inside_node_name;
   const int inside_id;
   const int inside_port;
-  tensorflow::PartialTensorShape inside_shape;  // Only set for output edge.
+  PartialTensorShape inside_shape;  // Only set for output edge.
 
-  tensorflow::DataType connection_type;
+  DataType connection_type;
   const bool is_input_edge;
 
   // The port number of the TRT node connected with this edge.
@@ -92,12 +98,12 @@ struct EngineInfo {
   EngineInfo()
       : engine_type(EngineType::TRTStatic),
         max_workspace_size_bytes(0),
-        precision_mode(FP32MODE),
+        precision_mode(TrtPrecisionMode::FP32),
         use_calibration(true) {}
 
   string engine_name;
   string device;
-  tensorflow::GraphDef segment_graph_def;
+  GraphDef segment_graph_def;
 
   // Non-control input connections inside this vector are sorted in a way such
   // that, the segment nodes connecting to them are topological sorted.
@@ -109,7 +115,7 @@ struct EngineInfo {
   int64 max_workspace_size_bytes;
   int maximum_cached_engines;
   std::vector<int> cached_engine_batches;
-  int precision_mode;
+  TrtPrecisionMode precision_mode;
   bool use_calibration;
 };
 
@@ -123,14 +129,14 @@ struct EngineInfo {
 //   topological order.
 // - segment_def: the output GraphDef, whose non-input/output nodedefs will be
 //   sorted in topological order.
+// - scope_name: the name of the scope where the TRTEngineOp will be placed.
 //
 // TODO(aaroey): add tests to validate these properties.
-tensorflow::Status ConvertSegmentToGraphDef(
-    const tensorflow::Graph* graph,
-    const tensorflow::grappler::GraphProperties& graph_properties,
+Status ConvertSegmentToGraphDef(
+    const Graph* graph, const grappler::GraphProperties& graph_properties,
     const std::vector<const Node*>& subgraph_nodes,
-    std::vector<EngineConnection>* connections,
-    tensorflow::GraphDef* segment_def, string* common_scope);
+    std::vector<EngineConnection>* connections, GraphDef* segment_def,
+    string* scope_name);
 
 // Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff
 // 'builder' successfully build the engine. If the result is not ok, 'engine'
@@ -140,12 +146,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
 // - convert_successfully: indicates whether the converson to TensorRT network
 //   is successful. This is different than successfully building the engine:
 //   building can still fail afterwards.
-tensorflow::Status ConvertGraphDefToEngine(
-    const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
+Status ConvertGraphDefToEngine(
+    const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size,
     size_t max_workspace_size_bytes,
-    const std::vector<tensorflow::PartialTensorShape>& input_shapes,
-    Logger* logger, nvinfer1::IGpuAllocator* allocator,
-    TRTInt8Calibrator* calibrator,
+    const std::vector<PartialTensorShape>& input_shapes, Logger* logger,
+    nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator,
     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully);
 
@@ -155,7 +160,7 @@ class OutputEdgeValidator {
  public:
   // Return true if the specified edge is eligible to be an output edge of the
   // TRT segment.
-  bool operator()(const tensorflow::Edge* out_edge) const;
+  bool operator()(const Edge* out_edge) const;
 };
 
 string DebugString(const nvinfer1::DimensionType type);
@@ -178,6 +183,8 @@ class TRT_ShapedWeights {
 
   nvinfer1::Weights GetTrtWeights() const;
 
+  // Returns the raw pointer to the underlying buffer which holds the weights
+  // value.
   void* GetValues() const {
     return const_cast<char*>(tensor_.tensor_data().data());
   }
@@ -188,15 +195,30 @@ class TRT_ShapedWeights {
 
   string DebugString() const;
 
+  template <typename T>
+  absl::Span<const T> GetSpan() const {
+    return absl::Span<const T>(tensor_.flat<T>().data(), count());
+  }
+
+  template <typename T>
+  std::vector<T> ToVector() const {
+    auto span = GetSpan<T>();
+    return std::vector<T>(span.data(), span.data() + span.size());
+  }
+
   // TODO(aaroey): make these private.
   nvinfer1::Dims shape_;  // Note: shape.type[] is not used.
-  tensorflow::DataType type_;
+  DataType type_;
 
  private:
   // This constructor is only used by TrtWeightStore, which creates the
   // underlying buffer.
   TRT_ShapedWeights(DataType type, nvinfer1::Dims dims, Tensor tensor);
 
+  // All weights should be stored inside TrtWeightStore to make sure lifetime of
+  // all the underlying tensors are available until the engine is built. For
+  // this reason, tensor_ should never be reassigned to a different value that
+  // is not already present in the TrtWeightStore.
   Tensor tensor_;
 
   friend class TrtWeightStore;
@@ -212,8 +234,7 @@ class TRT_ShapedWeights {
 class TrtWeightStore {
  public:
   // Get a TRT_ShapedWeights with 'type' and 'dims'.
-  TRT_ShapedWeights GetTempWeights(tensorflow::DataType type,
-                                   const nvinfer1::Dims& dims);
+  TRT_ShapedWeights GetTempWeights(DataType type, const nvinfer1::Dims& dims);
 
   // Get a TRT_ShapedWeights with the same data type and dimensions as
   // 'weights'.
@@ -324,8 +345,7 @@ class Converter;
 
 // Parameters for each op converter.
 struct OpConverterParams {
-  OpConverterParams(Converter* arg_converter,
-                    const tensorflow::NodeDef& arg_node_def,
+  OpConverterParams(Converter* arg_converter, const NodeDef& arg_node_def,
                     const std::vector<TRT_TensorOrWeights>& arg_inputs,
                     std::vector<TRT_TensorOrWeights>* arg_outputs,
                     bool arg_validation_only, TrtWeightStore* arg_weight_store)
@@ -337,7 +357,7 @@ struct OpConverterParams {
         weight_store(arg_weight_store) {}
 
   Converter* converter;
-  const tensorflow::NodeDef& node_def;
+  const NodeDef& node_def;
   const std::vector<TRT_TensorOrWeights>& inputs;
   std::vector<TRT_TensorOrWeights>* outputs;
   const bool validation_only;
@@ -362,9 +382,12 @@ class TrtNodeValidator {
   Status ValidateNode(
       const NodeDef& node_def,
       const std::vector<std::pair<const NodeDef*, int>>& input_node_and_ports,
+      const TrtPrecisionMode precision_mode,
       const grappler::GraphProperties& graph_properties);
 
  private:
+  static const std::set<string>* quantize_ops;
+
   void RegisterOpValidators();
 
   // Convert a Const node to a TRT_TensorOrWeights.
@@ -396,28 +419,38 @@ class TrtNodeValidator {
 // Class to convert TF nodes to TRT network.
 class Converter {
  public:
-  Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode,
-            bool use_calibration);
+  // Used for Converter::RenameAndMarkOutputTensors()
+  struct EngineOutputInfo {
+    // The TRT tensor name which produces the output.
+    string source_tensor_name;
+    // The TensorFlow node name which is receiving the output from the TRT
+    // engine. This should always be the Identity node created in
+    // ConvertSegmentToGraphDef.
+    string dest_node_name;
+    // Output type. TensorRT requires this to be explicitly set for engine
+    // outputs.
+    nvinfer1::DataType trt_dtype;
+  };
+
+  Converter(nvinfer1::INetworkDefinition* trt_network,
+            TrtPrecisionMode precision_mode, bool use_calibration);
 
   //////////////////////////////////////////////////////////////////////////////
   // Methods used by the TRT engine builder to build a TRT network from a TF
   // function/subgraph.
 
   // Convert the node to TRT network.
-  Status ConvertNode(const tensorflow::NodeDef& node_def);
+  Status ConvertNode(const NodeDef& node_def);
 
   // Add input tensor to the TRT network with given 'name', 'dtype', 'dims' and
   // 'batch_size'.
   Status AddInputTensor(const string& name, nvinfer1::DataType dtype,
                         const nvinfer1::Dims& dims, int batch_size);
 
-  // Mark the tensors with names specified by output_tensors[i].first as output
-  // of the TRT network, and set their names in the TRT network as
-  // output_tensors[i].second. The tensor names (output_tensors[i].first) are
-  // standard TF tensor names, i.e. node names followed by output slot number
-  // (or just the node name if the tensor is the first output of the node).
+  // Mark the tensors with names specified by source_tensor_name as output of
+  // the TRT network, and set their names in the TRT network as dest_node_name.
   Status RenameAndMarkOutputTensors(
-      const std::vector<std::pair<string, string>>& output_tensors);
+      const std::vector<EngineOutputInfo>& output_tensors);
 
   //////////////////////////////////////////////////////////////////////////////
   // Methods used by op converters to convert individual TF node and add layers
@@ -428,7 +461,7 @@ class Converter {
   nvinfer1::INetworkDefinition* network() { return trt_network_; }
 
   // What precision are we targeting?
-  int precision_mode() const { return precision_mode_; }
+  TrtPrecisionMode precision_mode() const { return precision_mode_; }
 
   // Calibration will be or was previously performed on this network?
   bool use_calibration() const { return use_calibration_; }
@@ -460,8 +493,13 @@ class Converter {
                          const nvinfer1::ITensor** output_tensor);
 
   // Converts 'input' into 'tensor' with shape specified by 'dims'.
+  //
+  // If validation_only is true, it doesn't do the conversion but only do some
+  // minimum validation for the eligibility of the conversion, and *tensor will
+  // be set to nullptr.
   Status PrepareTensorForShape(const TRT_TensorOrWeights& input,
                                const nvinfer1::Dims& dims,
+                               const bool validation_only,
                                const nvinfer1::ITensor** tensor);
 
   // Return OK if the broadcast scheme is supported and compute the shapes after
@@ -488,7 +526,7 @@ class Converter {
   Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output);
 
   // Get the inputs of 'node_def' from trt_tensors_.
-  Status GetInputs(const tensorflow::NodeDef& node_def,
+  Status GetInputs(const NodeDef& node_def,
                    std::vector<TRT_TensorOrWeights>* inputs) const;
 
   void RegisterOpConverters();
@@ -530,7 +568,7 @@ class Converter {
   std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
       quantization_infer_;
 
-  const int precision_mode_;
+  const TrtPrecisionMode precision_mode_;
 
   const bool use_calibration_;
 
@@ -544,6 +582,9 @@ class Converter {
   friend class OpConverterTest;
 };
 
+// Map of all supported UnaryOperations
+const std::unordered_map<string, nvinfer1::UnaryOperation>* UnaryOperationMap();
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
@@ -551,4 +592,4 @@ class Converter {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
similarity index 62%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index c8739b46c150b04c0e28bf01b821ae5f6ea5c1d2..853b313367c9439c81ecb57b33e97bf8a1c1c481 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <memory>
 #include <unordered_map>
@@ -21,11 +21,17 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/nn_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
@@ -35,7 +41,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
 #include "tensorflow/core/public/session.h"
@@ -50,9 +58,10 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
+using ::testing::NanSensitiveFloatNear;
 
 // TODO(laigd): put this into some test utils file.
 void ExpectStatus(Status status, error::Code code = error::OK,
@@ -101,13 +110,17 @@ DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) {
 }
 
 NodeDef MakeNodeDef(const string& name, const string& op,
-                    const std::vector<string>& inputs) {
+                    const std::vector<string>& inputs,
+                    const std::map<string, AttrValue> attrs = {}) {
   NodeDef node_def;
   node_def.set_name(name);
   node_def.set_op(op);
   for (const string& input : inputs) {
     node_def.add_input(input);
   }
+  for (const auto& attr : attrs) {
+    (*node_def.mutable_attr())[attr.first] = attr.second;
+  }
   return node_def;
 }
 
@@ -115,7 +128,7 @@ template <typename T>
 NodeDef MakeConstNodeDef(const string& name, const std::vector<T>& vals,
                          const TensorShape& shape) {
   Scope s = Scope::NewRootScope();
-  Tensor t = ::tensorflow::test::AsTensor<T>(vals, shape);
+  Tensor t = test::AsTensor<T>(vals, shape);
   auto const_op = ops::Const(s.WithOpName(name), t);
   return const_op.node()->def();
 }
@@ -152,7 +165,7 @@ void ExpectTrtDimsEqualsArray(const std::vector<int>& lhs,
 }
 
 template <typename T>
-void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
+void ExpectArrayNear(const std::vector<T>& lhs, absl::Span<const T> rhs) {
   ASSERT_EQ(lhs.size(), rhs.size());
   for (int i = 0; i < lhs.size(); i++) {
     EXPECT_FLOAT_EQ(lhs[i], rhs[i]);
@@ -163,7 +176,7 @@ void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
 // EXPECT_FLOAT_EQ.
 template <>
 void ExpectArrayNear(const std::vector<Eigen::half>& lhs,
-                     const std::vector<Eigen::half>& rhs) {
+                     absl::Span<const Eigen::half> rhs) {
   ASSERT_EQ(lhs.size(), rhs.size());
   for (int i = 0; i < lhs.size(); i++) {
     EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]),
@@ -225,7 +238,7 @@ class FakeITensor : public nvinfer1::ITensor {
     location_ = location;
   }
 
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   bool setDynamicRange(float min, float max) override {
     dynamic_range_ = std::max(std::abs(min), std::abs(max));
     return true;
@@ -234,6 +247,16 @@ class FakeITensor : public nvinfer1::ITensor {
   float getDynamicRange() const override { return dynamic_range_; }
 #endif
 
+#if IS_TRT_VERSION_GE(5, 1, 0)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   string name_;
   nvinfer1::Dims dims_;
@@ -371,8 +394,8 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
 
 class ValidatorTest : public ::testing::Test {
  public:
-  void AddOpValidator(const string& op_name, OpConverter op_validator) {
-    validator_.op_validators_[op_name] = op_validator;
+  std::unordered_map<string, OpConverter>& op_validators() {
+    return validator_.op_validators_;
   }
 
   Status ConvertToTensorOrWeights(
@@ -383,10 +406,18 @@ class ValidatorTest : public ::testing::Test {
         node_def, output_port, graph_properties, tensor_or_weights);
   }
 
+  const std::set<string>* GetQuantizeOps() { return validator_.quantize_ops; }
+
  protected:
   TrtNodeValidator validator_;
 };
 
+TEST_F(ValidatorTest, QuantizeOpsAreRegistered) {
+  for (const string& quantize_op : *GetQuantizeOps()) {
+    QCHECK(op_validators().count(quantize_op));
+  }
+}
+
 TEST_F(ValidatorTest, ConvertToTensorOrWeights) {
   // Convert Const.
   {
@@ -459,18 +490,30 @@ TEST_F(ValidatorTest, ValidateNode) {
   };
   NodeDef node_def = MakeNodeDef("my_op", "MyOp", {});
 
-  // Validator not registered, validation should pass.
-  TF_EXPECT_OK(validator_.ValidateNode(node_def, {}, graph_properties));
+  // Validator not registered.
+  ExpectStatus(validator_.ValidateNode(node_def, {}, TrtPrecisionMode::FP32,
+                                       graph_properties),
+               error::UNIMPLEMENTED, "Op type MyOp is not supported.");
 
   // Register validator.
-  AddOpValidator("MyOp", op_converter);
-  TF_EXPECT_OK(validator_.ValidateNode(node_def, {}, graph_properties));
+  op_validators()["MyOp"] = op_converter;
+  TF_EXPECT_OK(validator_.ValidateNode(node_def, {}, TrtPrecisionMode::FP32,
+                                       graph_properties));
   EXPECT_EQ(false, start_conversion);
 
   // Let the converter return error.
   should_fail = true;
-  ExpectStatus(validator_.ValidateNode(node_def, {}, graph_properties),
+  ExpectStatus(validator_.ValidateNode(node_def, {}, TrtPrecisionMode::FP32,
+                                       graph_properties),
                error::INVALID_ARGUMENT);
+
+  // Test quantization ops, they're only supported in INT8 mode. The success
+  // case is tested in OpConverterTest.ConvertQuantize.
+  node_def = MakeNodeDef("my_op", "FakeQuantWithMinMaxArgs", {});
+  ExpectStatus(validator_.ValidateNode(node_def, {}, TrtPrecisionMode::FP32,
+                                       graph_properties),
+               error::UNIMPLEMENTED,
+               "Op type FakeQuantWithMinMaxArgs is not supported.");
 }
 
 class ConverterTest : public ::testing::Test {
@@ -478,8 +521,7 @@ class ConverterTest : public ::testing::Test {
   ConverterTest() {
     builder_.reset(nvinfer1::createInferBuilder(logger_));
     network_.reset(builder_->createNetwork());
-    converter_.reset(new Converter(network_.get(),
-                                   /*precision_mode=*/FP32MODE,
+    converter_.reset(new Converter(network_.get(), TrtPrecisionMode::FP32,
                                    /*use_calibration=*/false));
     weight_store_ = &converter_->weight_store_;
   }
@@ -674,23 +716,34 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
   TRT_TensorOrWeights tw(input_tensor);
   const nvinfer1::ITensor* output_tensor = nullptr;
 
-  // Shape size doesn't match.
-  ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}),
-                                                 &output_tensor),
-               error::INVALID_ARGUMENT, "Reshape shapes are not compatible");
-
-  // TODO(aaroey): we should check the case where uninferred dimensions are not
-  // an exact divisor of input dim ensions, e.g. for dims {-1, 7}.
-
-  // Infer shape, ok.
-  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}),
-                                                 &output_tensor));
-  ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions());
+  for (bool validation_only : {false, true}) {
+    // Shape size doesn't match.
+    ExpectStatus(
+        converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}),
+                                          validation_only, &output_tensor),
+        error::INVALID_ARGUMENT, "Reshape shapes are not compatible");
+
+    // TODO(aaroey): we should check the case where uninferred dimensions are
+    // not an exact divisor of input dim ensions, e.g. for dims {-1, 7}.
+
+    // Infer shape, ok.
+    TF_EXPECT_OK(converter_->PrepareTensorForShape(
+        tw, GetTestDims({-1, 2}), validation_only, &output_tensor));
+    if (validation_only) {
+      EXPECT_EQ(nullptr, output_tensor);
+    } else {
+      ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions());
+    }
 
-  // Regular shape.
-  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
-                                                 &output_tensor));
-  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+    // Regular shape.
+    TF_EXPECT_OK(converter_->PrepareTensorForShape(
+        tw, GetTestDims({10, 3}), validation_only, &output_tensor));
+    if (validation_only) {
+      EXPECT_EQ(nullptr, output_tensor);
+    } else {
+      ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+    }
+  }
 }
 
 TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
@@ -698,9 +751,15 @@ TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
       weight_store_->GetTempWeights(DT_FLOAT, GetTestDims({2, 3, 5}));
   TRT_TensorOrWeights tw(weights);
   const nvinfer1::ITensor* output_tensor = nullptr;
-  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
-                                                 &output_tensor));
-  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+  for (bool validation_only : {false, true}) {
+    TF_EXPECT_OK(converter_->PrepareTensorForShape(
+        tw, GetTestDims({10, 3}), validation_only, &output_tensor));
+    if (validation_only) {
+      EXPECT_EQ(nullptr, output_tensor);
+    } else {
+      ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+    }
+  }
 }
 
 TEST_F(ConverterTest, MaybeUpdateBatchSize) {
@@ -781,7 +840,7 @@ TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
   // input -> infer1 -> infer2 -> infer3
   FakeITensor input, infer_1, infer_2, infer_3;
   FakeITensor not_infer;
-  Converter int8_converter(/*trt_network=*/nullptr, INT8MODE,
+  Converter int8_converter(/*trt_network=*/nullptr, TrtPrecisionMode::INT8,
                            /*use_calibration=*/true);
   int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f);
   int8_converter.ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
@@ -791,7 +850,7 @@ TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
 
   // Input range should be inferred along the chain and applied to tensors.
   int8_converter.MaybeApplyQuantizationRanges();
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   EXPECT_EQ(input.getDynamicRange(), 5.0f);
   EXPECT_EQ(infer_1.getDynamicRange(), 5.0f);
   EXPECT_EQ(infer_2.getDynamicRange(), 5.0f);
@@ -926,6 +985,83 @@ TEST_F(ConverterTest, CreateConstantLayer) {
   }
 }
 
+class ConvertGraphDefToEngineTest : public ::testing::Test {
+ public:
+  Status RunConvertGraphDefToEngine(Scope* s) {
+    GraphDef gdef;
+    TF_EXPECT_OK(s->ToGraphDef(&gdef));
+    std::vector<PartialTensorShape> input_shapes;
+    int batch_size = -1;
+    for (const NodeDef& node : gdef.node()) {
+      absl::string_view node_name(node.name());
+      if (str_util::ConsumePrefix(&node_name, kInputPHName)) {
+        int port = -1;
+        EXPECT_TRUE(absl::SimpleAtoi(node_name, &port)) << node.name();
+        if (input_shapes.size() < port + 1) input_shapes.resize(port + 1);
+        input_shapes[port] =
+            PartialTensorShape(node.attr().at("shape").shape());
+        if (batch_size == -1) {
+          batch_size = input_shapes[port].dim_size(0);
+        } else {
+          EXPECT_EQ(batch_size, input_shapes[port].dim_size(0));
+        }
+      }
+    }
+    // TODO(laigd): execute the engine and get outputs.
+    return ConvertGraphDefToEngine(
+        gdef, TrtPrecisionMode::FP32, /*max_batch_size=*/1,
+        /*max_workspace_size_bytes=*/64 << 20, input_shapes, &logger_,
+        /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine_,
+        /*use_calibration=*/false, /*convert_successfully=*/nullptr);
+  }
+
+ protected:
+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+
+ private:
+  Logger logger_;
+};
+
+TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) {
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName(StrCat(kInputPHName, 0)), DT_FLOAT,
+                                ops::Placeholder::Shape({1, 1}));
+  auto output = ops::Identity(s.WithOpName("identity1"), input);
+  output = ops::Identity(s.WithOpName("identity2"), output);
+  output = ops::Identity(s.WithOpName(StrCat(kOutputPHName, 0)), output);
+  // If the converter marks the input tensor as output tensor, the conversion
+  // below will fail with:
+  // > TensorRTOutputPH_0 cannot be both input and output
+  // > Network must have at least one output
+  TF_EXPECT_OK(RunConvertGraphDefToEngine(&s));
+}
+
+// Input/output data format for OpConverterTest::BuildAndRun().
+struct InputOutputData {
+  void* Buffer() const {
+    return const_cast<char*>(tensor.tensor_data().data());
+  }
+
+  size_t TotalBytes() const { return tensor.TotalBytes(); }
+
+  const char* name;
+  Tensor tensor;
+};
+
+template <typename T>
+Tensor ConstructTensor(int data_size, const T& value = T()) {
+  std::vector<T> values(data_size, value);
+  return test::AsTensor<T>(values);
+}
+
+using DataVec = std::vector<InputOutputData>;
+
+template <typename T>
+inline absl::Span<const T> GetSpanForData(const InputOutputData& data) {
+  const auto& tensor_map = data.tensor.flat<T>();
+  return absl::Span<const T>(tensor_map.data(), tensor_map.size());
+}
+
 // Class to test various op converters, using both a TrtNodeValidator and
 // Converter.
 class OpConverterTest : public ::testing::Test {
@@ -951,11 +1087,11 @@ class OpConverterTest : public ::testing::Test {
     builder_.reset(nvinfer1::createInferBuilder(logger_));
     network_.reset(builder_->createNetwork());
     builder_->setMaxBatchSize(1);
+    builder_->setMaxWorkspaceSize(1 << 26);
 
     // Reset the validator and converter.
     validator_.reset(new TrtNodeValidator);
-    converter_.reset(new Converter(network_.get(),
-                                   /*precision_mode=*/FP32MODE,
+    converter_.reset(new Converter(network_.get(), precision_mode_to_test_,
                                    /*use_calibration=*/false));
 
     // Reset other related artifacts.
@@ -963,47 +1099,85 @@ class OpConverterTest : public ::testing::Test {
     validator_inputs_.clear();
   }
 
-  // TODO(laigd): test fp16 and int8 support.
-  template <typename T>
-  void BuildAndRun(
-      const std::vector<std::pair<const char*, const std::vector<T>>>&
-          input_data,
-      const char* output_name, std::vector<T>* output_data) {
+  void CheckDataTypeMatches(const DataVec& datas) {
+    for (const auto& data : datas) {
+      const int input_index = engine_->getBindingIndex(data.name);
+      ASSERT_NE(-1, input_index);
+      const nvinfer1::DataType trt_dtype =
+          engine_->getBindingDataType(input_index);
+      const DataType tf_dtype = TrtDataTypeToTf(trt_dtype);
+      ASSERT_EQ(data.tensor.dtype(), tf_dtype)
+          << DataTypeString(data.tensor.dtype()) << " vs. "
+          << DataTypeString(tf_dtype);
+    }
+  }
+
+  // TODO(laigd): test fp16 and int8 support for more converters.
+  void BuildAndRun(const DataVec& input_data, DataVec* output_data,
+                   TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32) {
     // Mark the output tensor as TRT engine output.
-    TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
-        {{string(output_name), string(output_name)}}));
+    std::vector<Converter::EngineOutputInfo> output_info;
+    for (const auto& data : *output_data) {
+      output_info.push_back(
+          {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())});
+    }
+    TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info));
 
     // Build the TRT engine.
+    if (precision_mode == TrtPrecisionMode::FP16) {
+      builder_->setFp16Mode(true);
+    } else if (precision_mode == TrtPrecisionMode::INT8) {
+      // Setting FP16 mode as well allows TRT to also consider FP16 kernels and
+      // use them in situations where they are faster than INT8 or where INT8 is
+      // not supported for a given layer.
+      builder_->setFp16Mode(true);
+      builder_->setInt8Mode(true);
+    }
     ASSERT_EQ(nullptr, engine_.get());
     engine_.reset(builder_->buildCudaEngine(*converter_->network()));
     CHECK_NOTNULL(engine_.get());
+    CheckDataTypeMatches(input_data);
+    CheckDataTypeMatches(*output_data);
 
     // Execute the TRT engine.
-    ASSERT_LE(input_data.size() + 1, 3);
-    void* buffers[3];
-    for (const auto name_and_data : input_data) {
-      const int input_size = name_and_data.second.size() * sizeof(T);
-      const int input_index = engine_->getBindingIndex(name_and_data.first);
-      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
-      ASSERT_EQ(
-          0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(),
-                             input_size, cudaMemcpyHostToDevice, stream_));
+    const int num_bindings = input_data.size() + output_data->size();
+    std::vector<void*> buffers(num_bindings);
+
+    for (const auto& data : input_data) {
+      const int input_index = engine_->getBindingIndex(data.name);
+      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], data.TotalBytes()));
+      ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], data.Buffer(),
+                                   data.TotalBytes(), cudaMemcpyHostToDevice,
+                                   stream_));
+    }
+    struct SizeAndIndex {
+      SizeAndIndex(int in_size, int in_index)
+          : size(in_size), index(in_index) {}
+      int size;
+      int index;
+    };
+    std::vector<SizeAndIndex> output_infos;
+    for (const auto& data : *output_data) {
+      const int output_index = engine_->getBindingIndex(data.name);
+      output_infos.emplace_back(data.TotalBytes(), output_index);
+      ASSERT_EQ(0, cudaMalloc(&buffers[output_index], data.TotalBytes()));
     }
 
-    const int output_size = output_data->size() * sizeof(T);
-    const int output_index = engine_->getBindingIndex(output_name);
-    ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size));
-
-    ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1);
-
+    ASSERT_EQ(engine_->getNbBindings(), num_bindings);
     TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context(
         engine_->createExecutionContext());
-    execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr);
-    ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index],
-                                 output_size, cudaMemcpyDeviceToHost, stream_));
+    execution_context->enqueue(/*batchSize=*/1, buffers.data(), stream_,
+                               nullptr);
+
+    for (int i = 0; i < output_infos.size(); ++i) {
+      const auto& output_info = output_infos[i];
+      ASSERT_EQ(0, cudaMemcpyAsync(output_data->at(i).Buffer(),
+                                   buffers[output_info.index], output_info.size,
+                                   cudaMemcpyDeviceToHost, stream_));
+    }
     cudaStreamSynchronize(stream_);
 
-    for (int i = 0; i < input_data.size() + 1; ++i) {
+    for (int i = 0; i < num_bindings; ++i) {
       ASSERT_EQ(0, cudaFree(buffers[i]));
     }
   }
@@ -1075,9 +1249,10 @@ class OpConverterTest : public ::testing::Test {
     grappler::GraphProperties graph_properties(item);
     TF_EXPECT_OK(graph_properties.InferStatically(true));
 
-    ExpectStatus(validator_->ValidateNode(node_def, input_node_and_ports,
-                                          graph_properties),
-                 expected_code, expected_msg_substr);
+    ExpectStatus(
+        validator_->ValidateNode(node_def, input_node_and_ports,
+                                 precision_mode_to_test_, graph_properties),
+        expected_code, expected_msg_substr);
   }
 
   void RunConversion(const NodeDef& node_def,
@@ -1107,6 +1282,10 @@ class OpConverterTest : public ::testing::Test {
   std::unique_ptr<Converter> converter_;
   std::unique_ptr<TrtNodeValidator> validator_;
 
+ protected:
+  // TODO(laigd): parameterize the test and make the precision mode a parameter.
+  TrtPrecisionMode precision_mode_to_test_ = TrtPrecisionMode::FP32;
+
  private:
   Logger logger_;
   TrtUniquePtrType<nvinfer1::IBuilder> builder_;
@@ -1122,6 +1301,30 @@ class OpConverterTest : public ::testing::Test {
   std::unordered_map<string, NodeDef> validator_inputs_;
 };
 
+template <typename T>
+void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField<T>* out) {
+  out->Clear();
+  if (tensor.NumElements() == 0) return;
+
+  // TensorProto does not need to have all the elements present and can truncate
+  // trailing elements with the same value for compressed representation. Such
+  // elements are derived based on the tensor shape.
+  const auto flat = tensor.flat<T>();
+  int64 last_index = 0;
+  for (int64 i = 0; i < tensor.NumElements(); ++i) {
+    if (flat(i) != flat(last_index)) {
+      last_index = i;
+    }
+  }
+
+  int num_out_elements = last_index + 1;
+  out->Reserve(num_out_elements);
+  out->AddNAlreadyReserved(num_out_elements);
+  const T* src = flat.data();
+  T* dst = out->mutable_data();
+  std::copy(src, src + num_out_elements, dst);
+}
+
 template <DataType dtype, typename InputCType, typename OutputCType>
 void TestConvertConst(OpConverterTest* test) {
   NodeDef node_def;
@@ -1134,11 +1337,23 @@ void TestConvertConst(OpConverterTest* test) {
                             const std::vector<OutputCType>& expected_value) {
     test->Reset();
 
-    auto& attr = *node_def.mutable_attr();
+    TensorProto* tensor_attr =
+        (*node_def.mutable_attr())["value"].mutable_tensor();
+    tensor_attr->Clear();
+
     if (as_tensor_content) {
-      tensor.AsProtoTensorContent(attr["value"].mutable_tensor());
+      tensor.AsProtoTensorContent(tensor_attr);
     } else {
-      tensor.AsProtoField(attr["value"].mutable_tensor());
+      tensor.shape().AsProto(tensor_attr->mutable_tensor_shape());
+      tensor_attr->set_dtype(tensor.dtype());
+
+      if (tensor.dtype() == DT_FLOAT) {
+        CopyTensorElements<float>(tensor, tensor_attr->mutable_float_val());
+      } else if (tensor.dtype() == DT_INT32) {
+        CopyTensorElements<int32>(tensor, tensor_attr->mutable_int_val());
+      } else {
+        tensor.AsProtoField(tensor_attr);
+      }
     }
     test->RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
@@ -1151,26 +1366,41 @@ void TestConvertConst(OpConverterTest* test) {
   {
     // By default empty tensor will pick DT_FLOAT as data type and we fix it
     // here.
-    attr["value"].mutable_tensor()->set_dtype(dtype);
-    Tensor t;  // Empty tensor.
+    Tensor t(dtype);  // Empty tensor.
     reset_and_test(t, false, {}, {});
   }
   {
-    Tensor t = ::tensorflow::test::AsScalar<InputCType>(12);
+    Tensor t = test::AsScalar<InputCType>(12);
     reset_and_test(t, false, {1}, {12});
     reset_and_test(t, true, {1}, {12});
   }
   {
-    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 2});
+    Tensor t = test::AsTensor<InputCType>({1, 2});
     reset_and_test(t, false, {2}, {1, 2});
     reset_and_test(t, true, {2}, {1, 2});
   }
   {
-    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 2, 3, 4, 5, 6},
-                                                        TensorShape({2, 3}));
+    Tensor t =
+        test::AsTensor<InputCType>({1, 2, 3, 4, 5, 6}, TensorShape({2, 3}));
     reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6});
     reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6});
   }
+  {
+    // Set all tensor elements to the same value. Such tensors are encoded
+    // using a single element list in tensor proto.
+    Tensor t =
+        test::AsTensor<InputCType>({1, 1, 1, 1, 1, 1}, TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1});
+  }
+  {
+    // Set trailing tensor elements to the same value. Such tensors are
+    // encoded by truncating all equal elements except the first one.
+    Tensor t =
+        test::AsTensor<InputCType>({2, 2, 1, 1, 1, 1}, TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1});
+  }
 }
 
 TEST_F(OpConverterTest, ConvertConst) {
@@ -1200,7 +1430,7 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     NodeDef node_def = MakeNodeDef("my_transpose", "Transpose", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_transpose");
+        "Transpose got 0 inputs but expected 2, at my_transpose");
   }
 
   // Get the NodeDef for Transpose.
@@ -1216,8 +1446,8 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_transpose");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"perm\" for Transpose must be a constant, at my_transpose");
   }
   {
     // Transpose at batch dimension, should fail.
@@ -1247,10 +1477,12 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_transpose", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 4, 2, 5, 3, 6));
   }
 }
 
@@ -1260,7 +1492,7 @@ TEST_F(OpConverterTest, ConvertReshape) {
     NodeDef node_def = MakeNodeDef("my_reshape", "Reshape", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects weights for shape, at my_reshape");
+        "Reshape got 0 inputs but expected 2, at my_reshape");
   }
 
   // Get the NodeDef for Reshape.
@@ -1276,8 +1508,8 @@ TEST_F(OpConverterTest, ConvertReshape) {
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Input expects weights for shape, at my_reshape");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"shape\" for Reshape must be a constant, at my_reshape");
   }
   {
     // Reshape to scalar, should fail.
@@ -1290,11 +1522,6 @@ TEST_F(OpConverterTest, ConvertReshape) {
   }
 
   struct TestParams {
-    TestParams(int input_batch_size, const std::vector<int>& input_tensor_dims,
-               const std::vector<int>& input_shape)
-        : batch_size(input_batch_size),
-          tensor_dims(input_tensor_dims),
-          shape(input_shape) {}
     int batch_size;
     std::vector<int> tensor_dims;
     std::vector<int> shape;
@@ -1337,10 +1564,12 @@ TEST_F(OpConverterTest, ConvertReshape) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_reshape", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
 
@@ -1350,7 +1579,7 @@ TEST_F(OpConverterTest, ConvertMatMul) {
     NodeDef node_def = MakeNodeDef("my_matmul", "MatMul", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_matmul");
+        "MatMul got 0 inputs but expected 2, at my_matmul");
   }
 
   // Get the NodeDef for MatMul.
@@ -1372,9 +1601,9 @@ TEST_F(OpConverterTest, ConvertMatMul) {
     NodeDef node_def = get_matmul_nodedef(DT_INT32, false, false);
     AddTestTensor("input", {2}, /*batch_size=*/1, nvinfer1::DataType::kINT32);
     AddTestWeights<int32>("weights", {2, 1}, {3, 5});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "Data type is not supported, for node my_matmul got int32");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Data type int32 is not supported for MatMul, "
+                               "must be one of [float, half], at my_matmul");
   }
   // transpose_a is set.
   for (bool transpose_b : {false, true}) {
@@ -1400,12 +1629,13 @@ TEST_F(OpConverterTest, ConvertMatMul) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(2);
-    BuildAndRun<float>({{"input", {0, 1}}}, "my_matmul", &output_data);
+    const DataVec input_data{{"input", test::AsTensor<float>({0, 1})}};
+    DataVec output_data{{"my_matmul", ConstructTensor<float>(2)}};
+    BuildAndRun(input_data, &output_data);
     if (transpose_b) {
-      EXPECT_THAT(output_data, ElementsAre(1, 3));
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]), ElementsAre(1, 3));
     } else {
-      EXPECT_THAT(output_data, ElementsAre(2, 3));
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]), ElementsAre(2, 3));
     }
   }
 }
@@ -1459,23 +1689,28 @@ void TestConvertBiasAdd(OpConverterTest* test) {
       const int num_input = TrtDimsNumElements(GetTestDims(dims_array));
       ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2),
                 num_input);
-      std::vector<CType> output_data(num_input);
-      test->BuildAndRun<CType>(
-          {{"input", std::vector<CType>(num_input, CType(0))}}, "my_biasadd",
-          &output_data);
+
+      const DataVec input_data{
+          {"input", ConstructTensor<CType>(num_input, CType(0))}};
+      DataVec output_data{{"my_biasadd", ConstructTensor<CType>(num_input)}};
+      test->BuildAndRun(input_data, &output_data);
       if (trt_input_rank == 1) {
         if (data_format == "NHWC") {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2), CType(3)));
         } else {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2)));
         }
       } else {
         if (data_format == "NHWC") {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3),
-                                               CType(1), CType(2), CType(3)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2), CType(3), CType(1),
+                                  CType(2), CType(3)));
         } else {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1),
-                                               CType(2), CType(2), CType(2)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(1), CType(1), CType(2),
+                                  CType(2), CType(2)));
         }
       }
     }
@@ -1488,7 +1723,7 @@ TEST_F(OpConverterTest, ConvertBiasAdd) {
     NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_biasadd");
+        "BiasAdd got 0 inputs but expected 2, at my_biasadd");
   }
 
   // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test
@@ -1553,21 +1788,27 @@ void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<CType> output_data(2);
-    test->BuildAndRun<CType>(
-        {{"input",
-          /*input_data=*/swap_inputs ? operand2 : operand1}},
-        "my_binary", &output_data);
+    const DataVec input_data{
+        {"input", test::AsTensor<CType>(swap_inputs ? operand2 : operand1)}};
+    DataVec output_data{{"my_binary", ConstructTensor<CType>(2)}};
+    test->BuildAndRun(
+        input_data, &output_data,
+        dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32);
     if (node_def.op() == "Add") {
-      EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(5), CType(10.5)));
     } else if (node_def.op() == "Sub") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1), CType(4.5)));
     } else if (node_def.op() == "Mul") {
-      EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(6), CType(22.5)));
     } else if (node_def.op() == "Div") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1.5), CType(2.5)));
     } else if (node_def.op() == "RealDiv") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1.5), CType(2.5)));
     } else {
       ASSERT_TRUE(false);
     }
@@ -1602,13 +1843,14 @@ void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<CType> output_data(4);
-    test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+    const DataVec input_data{{"input", test::AsTensor<CType>(input)}};
+    DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
+    test->BuildAndRun(input_data, &output_data);
     if (weights_dims.size() == 1) {
-      EXPECT_THAT(output_data,
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                   ElementsAre(CType(11), CType(22), CType(13), CType(24)));
     } else {
-      EXPECT_THAT(output_data,
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                   ElementsAre(CType(11), CType(12), CType(23), CType(24)));
     }
   }
@@ -1636,9 +1878,10 @@ void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) {
   EXPECT_TRUE(output.is_tensor());
   ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
 
-  std::vector<CType> output_data(4);
-  test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
-  EXPECT_THAT(output_data,
+  const DataVec input_data{{"input", test::AsTensor<CType>(input)}};
+  DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
+  test->BuildAndRun(input_data, &output_data);
+  EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
               ElementsAre(CType(11), CType(12), CType(13), CType(14)));
 }
 
@@ -1686,17 +1929,19 @@ void TestBinaryTensorOpWeightFallback(OpConverterTest* test,
   // Check the result of running the engine.
   const int expected_num_outputs =
       TrtDimsNumElements(GetTestDims(expected_output_dims));
-  std::vector<CType> output_data(expected_num_outputs);
-  test->BuildAndRun<CType>(
-      {{"input",
-        /*input_data=*/std::vector<CType>(num_inputs, CType(2))}},
-      "my_binary", &output_data);
+  const DataVec input_data{
+      {"input", ConstructTensor<CType>(num_inputs, CType(2))}};
+  DataVec output_data{
+      {"my_binary", ConstructTensor<CType>(expected_num_outputs)}};
+  test->BuildAndRun(input_data, &output_data);
   if (node_def.op() == "Add") {
-    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
-                                 expected_num_outputs, CType(3))));
+    EXPECT_THAT(
+        GetSpanForData<CType>(output_data[0]),
+        ElementsAreArray(std::vector<CType>(expected_num_outputs, CType(3))));
   } else if (node_def.op() == "Minimum") {
-    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
-                                 expected_num_outputs, CType(1))));
+    EXPECT_THAT(
+        GetSpanForData<CType>(output_data[0]),
+        ElementsAreArray(std::vector<CType>(expected_num_outputs, CType(1))));
   } else {
     ASSERT_TRUE(false);
   }
@@ -1723,51 +1968,64 @@ void TestBinaryTensorOpTensor(OpConverterTest* test) {
   EXPECT_TRUE(output.is_tensor());
   ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions());
 
-  std::vector<CType> output_data(4);
+  const DataVec input_data{
+      {"input1", test::AsTensor<CType>({CType(3), CType(6)})},
+      {"input2", test::AsTensor<CType>({CType(2), CType(3)})}};
+  DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
   // After broadcasting first input becomes {3, 6, 3, 6} and second input
   // becomes {2, 3, 2, 3}.
-  test->BuildAndRun<CType>(
-      {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}},
-      "my_binary", &output_data);
+  test->BuildAndRun(
+      input_data, &output_data,
+      dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32);
   if (node_def.op() == "Add") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(5), CType(8), CType(6), CType(9)));
   } else if (node_def.op() == "Sub") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1), CType(4), CType(0), CType(3)));
   } else if (node_def.op() == "Mul") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(6), CType(12), CType(9), CType(18)));
   } else if (node_def.op() == "Div") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
   } else if (node_def.op() == "RealDiv") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
   } else if (node_def.op() == "Minimum") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(2), CType(2), CType(3), CType(3)));
   } else if (node_def.op() == "Maximum") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(3), CType(6), CType(3), CType(6)));
+  } else if (node_def.op() == "Pow") {
+    ExpectArrayNear(
+        std::vector<CType>{CType(9), CType(36), CType(27), CType(216)},
+        GetSpanForData<CType>(output_data[0]));
   } else {
     ASSERT_TRUE(false);
   }
 }
 
 TEST_F(OpConverterTest, ConvertBinary) {
+  AttrValue dtype;
+  dtype.set_type(DT_FLOAT);
   // Input size doesn't match, should fail.
   for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) {
     Reset();
-    NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"});
+    NodeDef node_def =
+        MakeNodeDef("my_add", "Add", {num_inputs, "input"}, {{"T", dtype}});
     AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT);
     RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Binary ops require two inputs, at my_add");
+                               StrCat("Add got ", std::to_string(num_inputs),
+                                      " inputs but expected 2, at my_add")
+                                   .c_str());
   }
   {
     // Both inputs are weights.
     Reset();
-    NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"});
+    NodeDef node_def =
+        MakeNodeDef("my_add", "Add", {"weights1", "weights2"}, {{"T", dtype}});
     AddTestWeights<float>("weights1", {1}, {1});
     AddTestWeights<float>("weights2", {1}, {1});
     RunValidationAndConversion(
@@ -1782,15 +2040,12 @@ TEST_F(OpConverterTest, ConvertBinary) {
   TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_FLOAT>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_FLOAT>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_FLOAT>(this);
-#if 0
-  // TODO(b/119560144): it doesn't support FP16 constants and the following test
-  // will fail.
+
   TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_HALF>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_HALF>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_HALF>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_HALF>(this);
   TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_HALF>(this);
-#endif
 
   // Test BinaryTensorOpWeight() with channel-wise broadcasting.
   TestBinaryTensorOpWeightWithChannelWiseBroadcast<DT_FLOAT>(this);
@@ -1821,6 +2076,7 @@ TEST_F(OpConverterTest, ConvertBinary) {
   TestBinaryTensorOpTensor<ops::RealDiv, DT_FLOAT>(this);
   TestBinaryTensorOpTensor<ops::Minimum, DT_FLOAT>(this);
   TestBinaryTensorOpTensor<ops::Maximum, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Pow, DT_FLOAT>(this);
 
   TestBinaryTensorOpTensor<ops::Add, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Sub, DT_HALF>(this);
@@ -1829,17 +2085,23 @@ TEST_F(OpConverterTest, ConvertBinary) {
   TestBinaryTensorOpTensor<ops::RealDiv, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Minimum, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Maximum, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Pow, DT_HALF>(this);
 }
 
 TEST_F(OpConverterTest, ConvertQuantize) {
-  for (const string& op :
-       {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars",
-        "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) {
+  precision_mode_to_test_ = TrtPrecisionMode::INT8;
+  const std::pair<string, int> op_with_num_inputs[4] = {
+      {"FakeQuantWithMinMaxArgs", 1},
+      {"FakeQuantWithMinMaxVars", 3},
+      {"QuantizeAndDequantizeV2", 3},
+      {"QuantizeAndDequantizeV3", 4}};
+  for (const auto& pair : op_with_num_inputs) {
     // Input list is empty, should fail.
-    NodeDef node_def = MakeNodeDef("my_quantize", op, {});
+    NodeDef node_def = MakeNodeDef("my_quantize", pair.first, {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        StrCat("Invalid number of inputs for ", op, ", at my_quantize")
+        StrCat(pair.first, " got 0 inputs but expected ",
+               std::to_string(pair.second), ", at my_quantize")
             .c_str());
   }
   {
@@ -1926,9 +2188,9 @@ TEST_F(OpConverterTest, ConvertQuantize) {
     AddTestTensor("weights_min", {1});
     AddTestTensor("weights_max", {1});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
-        "tensors, at my_quantize");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"input_min\" for QuantizeAndDequantizeV2 must be a constant"
+        ", at my_quantize");
   }
   {
     // QuantizeAndDequantizeV3 ranges set via inputs, ok.
@@ -1955,46 +2217,6 @@ TEST_F(OpConverterTest, ConvertQuantize) {
   }
 }
 
-TEST_F(OpConverterTest, ConvertRelu6) {
-  {
-    // Input list is empty, should fail.
-    NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Invalid number of inputs for Relu6, at my_relu6");
-  }
-
-  // Get the NodeDef for Relu6.
-  Scope s = Scope::NewRootScope();
-  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-  auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input);
-  const NodeDef node_def = relu6.operation.node()->def();
-  {
-    // Input is weights, should fail.
-    Reset();
-    AddTestWeights<float>("input", {1}, {1.0f});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "Relu6 is only implemented for tensors, not weights, at my_relu6");
-  }
-  {
-    // Clip tensor values and set quantization ranges, ok.
-    Reset();
-    AddTestTensor("input", {1, 2, 3});
-    RunValidationAndConversion(node_def);
-    TRT_TensorOrWeights output;
-    TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output));
-    EXPECT_TRUE(output.is_tensor());
-    auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
-
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6));
-  }
-}
-
 template <DataType dtype>
 void TestConvertSquare(OpConverterTest* test) {
   test->Reset();
@@ -2005,7 +2227,8 @@ void TestConvertSquare(OpConverterTest* test) {
   auto square = ops::Square(s.WithOpName("my_square"), input);
   NodeDef node_def = square.operation.node()->def();
 
-  test->AddTestTensor("input", {1, 20});
+  test->AddTestTensor("input", {1, 20}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
   test->RunValidationAndConversion(node_def);
   TRT_TensorOrWeights output;
   TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output));
@@ -2013,24 +2236,30 @@ void TestConvertSquare(OpConverterTest* test) {
   ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions());
 
   const int num_inputs = 20;
-  std::vector<CType> input_data(num_inputs);
-  std::vector<CType> expected_output_data(num_inputs);
-  for (int i = 0; i < 20; i++) {
+  std::vector<CType> inputs(num_inputs);
+  std::vector<CType> expected_outputs(num_inputs);
+  for (int i = 0; i < num_inputs; ++i) {
     const CType value = CType(i - 9);
-    input_data[i] = value;
-    expected_output_data[i] = value * value;
-  }
-  std::vector<CType> output_data(num_inputs);
-  test->BuildAndRun<CType>({{"input", input_data}}, "my_square", &output_data);
-  ExpectArrayNear(expected_output_data, output_data);
+    inputs[i] = value;
+    expected_outputs[i] = value * value;
+  }
+  const DataVec input_data{{"input", test::AsTensor<CType>(inputs)}};
+  // Engine outputs are converted to FP16 automatically if we set FP16 mode in
+  // the builder.
+  DataVec output_data{{"my_square", ConstructTensor<CType>(num_inputs)}};
+  test->BuildAndRun(
+      input_data, &output_data,
+      dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32);
+  ExpectArrayNear(expected_outputs, GetSpanForData<CType>(output_data[0]));
 }
 
 TEST_F(OpConverterTest, ConvertSquare) {
   {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_square", "Square", {});
-    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Square expects one input, at my_square");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Square got 0 inputs but expected 1, at my_square");
   }
   {
     // Input is weights, should fail.
@@ -2042,15 +2271,13 @@ TEST_F(OpConverterTest, ConvertSquare) {
     AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Square is only implemented for tensors, at my_square");
+        "The input \"x\" for Square must be a tensor, at my_square");
   }
 
   // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't
   // test DT_INT32 type here.
   TestConvertSquare<DT_FLOAT>(this);
-  // TODO(tmorris): Looks like there may be a bug with this layer for FP16
-  // inputs. Disabling for now.
-  // TestConvertSquare<DT_HALF>(this);
+  TestConvertSquare<DT_HALF>(this);
 }
 
 TEST_F(OpConverterTest, ConvertActivation) {
@@ -2058,7 +2285,7 @@ TEST_F(OpConverterTest, ConvertActivation) {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_act", "Relu", {});
     RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Relu expects one input, at my_act");
+                               "Relu got 0 inputs but expected 1, at my_act");
   }
   {
     // Input is weights, should fail.
@@ -2070,16 +2297,26 @@ TEST_F(OpConverterTest, ConvertActivation) {
     AddTestWeights<int32>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Relu is only implemented for tensors, at my_act");
+        "The input \"input\" for Relu must be a tensor, at my_act");
   }
 
+  constexpr float kAlpha = 0.2f;
+
   // Get nodedef for activation layer.
   auto get_act_nodedef = [](string op_name) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-    if (op_name == "Relu") {
+    if (op_name == "LeakyRelu") {
+      auto act =
+          ops::internal::LeakyRelu(s.WithOpName("my_act"), input,
+                                   ops::internal::LeakyRelu::Alpha(kAlpha));
+      return act.operation.node()->def();
+    } else if (op_name == "Relu") {
       auto act = ops::Relu(s.WithOpName("my_act"), input);
       return act.operation.node()->def();
+    } else if (op_name == "Relu6") {
+      auto act = ops::Relu6(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
     } else if (op_name == "Sigmoid") {
       auto act = ops::Sigmoid(s.WithOpName("my_act"), input);
       return act.operation.node()->def();
@@ -2092,8 +2329,12 @@ TEST_F(OpConverterTest, ConvertActivation) {
   };
   // Get expected output for activation layer.
   auto get_act_output = [](string op_name, float input) -> float {
-    if (op_name == "Relu") {
+    if (op_name == "LeakyRelu") {
+      return (input > 0.0f) ? input : input * kAlpha;
+    } else if (op_name == "Relu") {
       return (input > 0.0f) ? input : 0.0f;
+    } else if (op_name == "Relu6") {
+      return std::min(std::max(input, 0.0f), 6.0f);
     } else if (op_name == "Sigmoid") {
       return 1.0f / (1.0f + std::exp(-input));
     } else if (op_name == "Tanh") {
@@ -2104,7 +2345,8 @@ TEST_F(OpConverterTest, ConvertActivation) {
   };
 
   // Ok.
-  for (string op_name : {"Relu", "Sigmoid", "Tanh"}) {
+  for (const string& op_name :
+       {"LeakyRelu", "Relu", "Relu6", "Sigmoid", "Tanh"}) {
     Reset();
     NodeDef node_def = get_act_nodedef(op_name);
     AddTestTensor("input", {1, 2, 3});
@@ -2113,13 +2355,20 @@ TEST_F(OpConverterTest, ConvertActivation) {
     TF_EXPECT_OK(GetTensorOrWeights("my_act", &output));
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+    if (op_name == "Relu6") {
+      // Relu6 should set quantization range automatically.
+      auto ranges = quantization_ranges();
+      EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    }
 
-    const std::vector<float> input_data = {-100, -2, -1, 0, 1, 100};
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", input_data}}, "my_act", &output_data);
-    for (int i = 0; i < input_data.size(); i++) {
-      const float expected_output = get_act_output(op_name, input_data[i]);
-      EXPECT_FLOAT_EQ(output_data[i], expected_output);
+    const std::vector<float> input = {-100, -2, -1, 0, 1, 100};
+    const DataVec input_data{{"input", test::AsTensor<float>(input)}};
+    DataVec output_data{{"my_act", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    for (int i = 0; i < input.size(); i++) {
+      const float expected_output = get_act_output(op_name, input[i]);
+      EXPECT_FLOAT_EQ(GetSpanForData<float>(output_data[0])[i],
+                      expected_output);
     }
   }
 }
@@ -2130,7 +2379,7 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
     NodeDef node_def = MakeNodeDef("my_expanddims", "ExpandDims", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Two inputs expected for ExpandDims, at my_expanddims");
+        "ExpandDims got 0 inputs but expected 2, at my_expanddims");
   }
 
   // Get the NodeDef for ExpandDims.
@@ -2145,18 +2394,18 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
     Reset();
     AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     AddTestWeights<int32>("weights", {1}, {1});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "ExpandDims expects tensor for input, at my_expanddims");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"input\" for ExpandDims must be a "
+                               "tensor, at my_expanddims");
   }
   {
     // Axis is a tensor, should fail.
     Reset();
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "ExpandDims expects weights for axis, at my_expanddims");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"axis\" for ExpandDims must be a "
+                               "constant, at my_expanddims");
   }
   {
     // Add dim at batch dimension, should fail.
@@ -2203,11 +2452,6 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, int axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     int axis;
     std::vector<int> expected_output_dims;
@@ -2232,10 +2476,12 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
     ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
                              output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_expanddims",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_expanddims", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
 
@@ -2243,8 +2489,9 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
   {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_squeeze", "Squeeze", {});
-    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "One input expected for Squeeze, at my_squeeze");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Squeeze got 0 inputs but expected 1, at my_squeeze");
   }
   {
     // No attrs, should fail.
@@ -2264,7 +2511,7 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     ops::Squeeze::Attrs squeeze_attrs;
-    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);  // non-absl ok
     auto squeeze =
         ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
     return squeeze.operation.node()->def();
@@ -2277,7 +2524,7 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Squeeze expects tensor for input, at my_squeeze");
+        "The input \"input\" for Squeeze must be a tensor, at my_squeeze");
   }
   {
     // Squeeze batch dim, should fail.
@@ -2317,11 +2564,6 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     std::vector<int> axis;
     std::vector<int> expected_output_dims;
@@ -2352,10 +2594,12 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
                              output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_squeeze",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_squeeze", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
 
@@ -2365,13 +2609,13 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     NodeDef node_def = MakeNodeDef("my_strided_slice", "StridedSlice", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "StridedSlice expects 4 inputs, at my_strided_slice");
+        "StridedSlice got 0 inputs but expected 4, at my_strided_slice");
   }
 
   // Get nodedef for StridedSlice layer.
   auto get_strided_slice_nodedef =
-      [](int begin_mask = 0, int end_mask = 0, int ellipsis_mask = 0,
-         int new_axis_mask = 0, int shrink_axis_mask = 0) -> NodeDef {
+      [](int64 begin_mask = 0, int64 end_mask = 0, int64 ellipsis_mask = 0,
+         int64 new_axis_mask = 0, int64 shrink_axis_mask = 0) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
@@ -2396,9 +2640,9 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
     AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
     AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "StridedSlice is only implemented for tensors, at my_strided_slice");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"input\" for StridedSlice must be a "
+                               "tensor, at my_strided_slice");
   }
   {
     // Begin, end, strides are tensors, should fail.
@@ -2409,8 +2653,8 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestTensor("end", {4});
     AddTestTensor("strides", {4});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "StridedSlice expects weights for begin, end, and strides, at "
+        node_def, error::UNIMPLEMENTED,
+        "The input \"begin\" for StridedSlice must be a constant, at "
         "my_strided_slice");
   }
   {
@@ -2438,46 +2682,62 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "StridedSlice can't modify batch dim, at my_strided_slice");
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
   }
   {
-    // Stride is not 1, should fail.
+    // Dynamic batch size without end_mask, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
-    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
     AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
-    AddTestWeights<int32>("strides", {4}, {1, 2, -1, 3});
-    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
-                               "StridedSlice is only implemented for stride of "
-                               "1, at my_strided_slice");
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
+  }
+  {
+    // Dynamic batch size but using end_mask, ok.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(/*begin_mask=*/0,
+                                                 /*end_mask=*/1);
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 2});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(node_def);
   }
+// TRT 5.1+ supports strides
+#if IS_TRT_VERSION_GE(5, 1, 0)
   {
-    // Begin out of bounds, should fail.
+    // Negative strides, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestTensor("input", {1, 2, 3});
-    AddTestWeights<int32>("begin", {4}, {1, 2, 3, 4});
-    AddTestWeights<int32>("end", {4}, {0, 1, 2, 3});
-    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "begin value of 2 for StridedSlice is invalid, must be in the range "
-        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, -1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Negative or zero stride values are not "
+                               "supported for StridedSlice, at "
+                               "my_strided_slice");
   }
+#else
   {
-    // End out of bounds, should fail.
+    // Stride is not 1, should fail.
     Reset();
     NodeDef node_def = get_strided_slice_nodedef();
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
-    AddTestWeights<int32>("end", {4}, {1, 2, 3, 4});
-    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "end value of 2 for StridedSlice is invalid, must be in the range "
-        "[-dim_size(i), dim_size(i)], at my_strided_slice");
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 2, 1, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Strides other than 1 are not supported with "
+                               "this version of TRT, at my_strided_slice");
   }
+#endif
   {
     // Size of sliced dim is negative, should fail.
     Reset();
@@ -2486,126 +2746,183 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
     AddTestWeights<int32>("begin", {4}, {0, 0, 2, 0});
     AddTestWeights<int32>("end", {4}, {1, 1, 0, 3});
     AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "New size of sliced dimension is negative, at my_strided_slice");
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "\"size\" cannot be negative or zero for "
+                               "StridedSlice, at my_strided_slice");
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims,
-               const std::vector<int>& expected_output_dims,
-               const std::vector<int>& begin, const std::vector<int>& end,
-               const std::vector<int>& begin_mask,
-               const std::vector<int>& end_mask,
-               const std::vector<int>& expected_output)
-        : input_dims(input_dims),
-          expected_output_dims(expected_output_dims),
-          begin(begin),
-          end(end),
-          expected_output(expected_output) {
-      // Masks are provided in terms of vectors for readability. Convert them to
-      // binary here.
-      this->begin_mask = 0;
-      for (int i = 0; i < begin_mask.size(); i++) {
-        if (begin_mask[i]) this->begin_mask |= (1 << i);
-      }
-      this->end_mask = 0;
-      for (int i = 0; i < end_mask.size(); i++) {
-        if (end_mask[i]) this->end_mask |= (1 << i);
-      }
-    }
-
     std::vector<int> input_dims;
-    std::vector<int> expected_output_dims;
     std::vector<int> begin;
     std::vector<int> end;
+    std::vector<int> strides;
     int begin_mask;
     int end_mask;
-    std::vector<int> expected_output;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  auto get_mask = [](const std::vector<int>& mask) {
+    int result = 0;
+    for (int i = 0; i < mask.size(); i++) {
+      if (mask[i]) result += (1 << i);
+    }
+    return result;
   };
 
+  // Same input is used for all tests.
+  const std::vector<float> ok_input = {1, 2, 3, 4, 5, 6};
+
+#if IS_TRT_VERSION_GE(5, 1, 0)
+  const int kStridedSliceOKCases = 23;
+#else
+  const int kStridedSliceOKCases = 19;
+#endif
   // Ok.
-  const int kStridedSliceOKCases = 18;
   TestParams ok_params[kStridedSliceOKCases] = {
-      // 2D Crop.
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 1, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{5, 6}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      // 2D Crop, with transpose.
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 2, 1},
-                 /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 1, 3}, /*expected_output_dims=*/{1, 1, 2},
-                 /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 0, 0},
-                 /*expected_output=*/{5, 6}},
-      // 2D Crop, with reshape.
-      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
-                 /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 0},
-                 /*expected_output=*/{1, 2}},
-      TestParams{/*input_dims=*/{2, 3}, /*expected_output_dims=*/{1, 2},
-                 /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 1},
-                 /*expected_output=*/{5, 6}},
-      // 1D Crop.
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 2, 2},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 0},
-                 /*expected_output=*/{1, 2, 4, 5}},
-      TestParams{/*input_dims=*/{1, 2, 3}, /*expected_output_dims=*/{1, 1, 3},
-                 /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{4, 5, 6}},
-      // 1D Crop, with transpose.
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
-                 /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 0, 1, 1},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{2, 3, 1}, /*expected_output_dims=*/{1, 3, 1},
-                 /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0},
-                 /*begin_mask=*/{0, 0, 0, 0}, /*end_mask=*/{1, 1, 1, 1},
-                 /*expected_output=*/{4, 5, 6}},
-      // 1D Crop, with reshape.
-      TestParams{/*input_dims=*/{6}, /*expected_output_dims=*/{3},
-                 /*begin=*/{0, 0}, /*end=*/{0, 3},
-                 /*begin_mask=*/{0, 0}, /*end_mask=*/{1, 0},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{1, 6}, /*expected_output_dims=*/{1, 3},
-                 /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 1, 0},
-                 /*expected_output=*/{3, 4, 5}},
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
-                 /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{3, 4, 5}},
-      // Negative axis.
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{3, 1},
-                 /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{1, 2, 3}},
-      TestParams{/*input_dims=*/{6, 1}, /*expected_output_dims=*/{5, 1},
-                 /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0},
-                 /*begin_mask=*/{0, 0, 0}, /*end_mask=*/{1, 0, 1},
-                 /*expected_output=*/{1, 2, 3, 4, 5}},
+    // 2D Crop.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, 0, 0},
+               /*end=*/{0, 0, 1, 2}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0, 0}),
+               /*expected_output_dims=*/{1, 1, 2}, /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with reshape.
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{1, 2}},
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 1}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{5, 6}},
+    // 1D Crop.
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 0}), /*expected_output_dims=*/{1, 2, 2},
+        /*expected_output=*/{1, 2, 4, 5}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 3},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{1, 2, 3}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with reshape.
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 3}, /*strides=*/{1, 1},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{1, 6},
+               /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0}),
+               /*expected_output_dims=*/{1, 3},
+               /*expected_output=*/{3, 4, 5}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{3, 4, 5}},
+    // Negative axis.
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{5, 1},
+               /*expected_output=*/{1, 2, 3, 4, 5}},
+    // Clamp out of bounds begin and end.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, -9999, -9},
+               /*end=*/{0, 1, 1000, 4}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0, 0}),
+               /*expected_output_dims=*/{1, 2, 3},
+               /*expected_output=*/{1, 2, 3, 4, 5, 6}},
+#if IS_TRT_VERSION_GE(5, 1, 0)
+    // Strides
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 5}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 1}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{2, 4, 6}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 2}, /*end=*/{0, 6}, /*strides=*/{1, 3},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{2},
+               /*expected_output=*/{3, 6}},
+#endif
   };
 
   for (int i = 0; i < kStridedSliceOKCases; i++) {
@@ -2618,17 +2935,166 @@ TEST_F(OpConverterTest, ConvertStridedSlice) {
                           ok_params[i].begin);
     AddTestWeights<int32>("end", {static_cast<int>(ok_params[i].end.size())},
                           ok_params[i].end);
-    std::vector<int> strides(ok_params[i].input_dims.size(), 1);
-    AddTestWeights<int32>("strides", {static_cast<int>(strides.size())},
-                          strides);
+    AddTestWeights<int32>("strides",
+                          {static_cast<int>(ok_params[i].strides.size())},
+                          ok_params[i].strides);
     RunValidationAndConversion(node_def);
 
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output));
-    std::vector<float> output_data(ok_params[i].expected_output.size());
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_strided_slice",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAreArray(ok_params[i].expected_output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{{"input", test::AsTensor<float>(ok_input)}};
+    DataVec output_data{
+        {"my_strided_slice",
+         ConstructTensor<float>(ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSlice) {
+  // Get nodedef for Slice layer.
+  auto get_slice_nodedef = []() -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
+    auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32);
+    auto slice = ops::Slice(s.WithOpName("my_slice"), input, begin, size);
+    return slice.operation.node()->def();
+  };
+
+  {
+    // Begin is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, -1, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Begin is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 3, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Size is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, -2});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 3 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Size is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 3, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 2 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Modify batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size with size[0] not -1, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size but using size[0] of -1, ok.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {-1, 1, 2, 2});
+    RunValidationAndConversion(node_def);
+  }
+
+  struct TestParams {
+    std::vector<int> input_dims;
+    std::vector<int> begin;
+    std::vector<int> size;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Ok.
+  const int kSliceOKCases = 5;
+  TestParams ok_params[kSliceOKCases] = {
+      TestParams{{1, 2, 3},
+                 {0, 0, 0, 0},
+                 {-1, -1, -1, -1},
+                 {1, 2, 3},
+                 {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, 1, 2, 3}, {1, 2, 3}, {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, -1, 2, 2}, {1, 2, 2}, {1, 2, 4, 5}},
+      TestParams{{6}, {0, 1}, {1, 5}, {5}, {2, 3, 4, 5, 6}},
+      TestParams{{6}, {0, 1}, {-1, 3}, {3}, {2, 3, 4}},
+  };
+
+  for (int i = 0; i < kSliceOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("begin",
+                          {static_cast<int>(ok_params[i].begin.size())},
+                          ok_params[i].begin);
+    AddTestWeights<int32>("size", {static_cast<int>(ok_params[i].size.size())},
+                          ok_params[i].size);
+    RunValidationAndConversion(node_def);
+
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_slice", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_slice", ConstructTensor<float>(
+                                         ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
   }
 }
 
@@ -2638,22 +3104,34 @@ TEST_F(OpConverterTest, ConvertConv2D) {
     NodeDef node_def = MakeNodeDef("my_conv2d", "Conv2D", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Two inputs are expected for Conv2D, at my_conv2d");
+        "Conv2D got 0 inputs but expected 2, at my_conv2d");
   }
 
   // Get nodedef for Conv2D layer.
   auto get_conv2d_nodedef =
       [](std::vector<int> strides = {1, 1, 1, 1}, string padding = "SAME",
-         string data_format = "NCHW",
-         std::vector<int> dilations = {1, 1, 1, 1}) -> NodeDef {
+         string data_format = "NCHW", std::vector<int> dilations = {1, 1, 1, 1},
+         bool is_conv2d_backprop_input = false) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT);
-    ops::Conv2D::Attrs attrs =
-        ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations);
-    auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides,
-                              padding, attrs);
-    return conv2d.operation.node()->def();
+    if (is_conv2d_backprop_input) {
+      auto input_sizes =
+          ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32);
+      ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs()
+                                                  .DataFormat(data_format)
+                                                  .Dilations(dilations);
+      auto conv2d =
+          ops::Conv2DBackpropInput(s.WithOpName("my_conv2d"), input_sizes,
+                                   filter, input, strides, padding, attrs);
+      return conv2d.operation.node()->def();
+    } else {
+      ops::Conv2D::Attrs attrs =
+          ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations);
+      auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter,
+                                strides, padding, attrs);
+      return conv2d.operation.node()->def();
+    }
   };
 
   {
@@ -2664,7 +3142,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
     AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Conv2D is only implemented for tensors, not weights, at my_conv2d");
+        "The input \"input\" for Conv2D must be a tensor, at my_conv2d");
   }
   {
     // Filter is tensor, should fail.
@@ -2674,7 +3152,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
     AddTestTensor("weights", {3, 3, 1, 1});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Kernel for Conv2D must be constant weights, at my_conv2d");
+        "The input \"filter\" for Conv2D must be a constant, at my_conv2d");
   }
   {
     // Filter is not 4D, should fail.
@@ -2719,6 +3197,19 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                                "Dilation rate must be 1 for batch and channel "
                                "dimensions, at my_conv2d");
   }
+  {
+    // Dilation + Conv2DBackpropInput, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 2, 1}, true);
+    AddTestTensor("input", {2, 3, 1});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    AddTestWeights<int>("input_sizes", {4}, {1, 2, 3, 1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation with Conv2DBackpropInput "
+                               "(conv2d_transpose) is not supported, "
+                               "at my_conv2d");
+  }
   {
     // Strides is not 4D, should fail.
     Reset();
@@ -2743,25 +3234,6 @@ TEST_F(OpConverterTest, ConvertConv2D) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims,
-               const std::vector<float>& input,
-               const std::vector<int>& filter_dims,
-               const std::vector<float>& filter,
-               const std::vector<int>& strides, const string& padding,
-               const string& data_format, const std::vector<int>& dilations,
-               const std::vector<int>& expected_output_dims,
-               const std::vector<float>& expected_output)
-        : input_dims(input_dims),
-          input(input),
-          filter_dims(filter_dims),
-          filter(filter),
-          strides(strides),
-          padding(padding),
-          data_format(data_format),
-          dilations(dilations),
-          expected_output_dims(expected_output_dims),
-          expected_output(expected_output) {}
-
     std::vector<int> input_dims;
     std::vector<float> input;
     std::vector<int> filter_dims;
@@ -2770,12 +3242,13 @@ TEST_F(OpConverterTest, ConvertConv2D) {
     string padding;
     string data_format;
     std::vector<int> dilations;
+    bool is_conv2d_backprop_input;
     std::vector<int> expected_output_dims;
     std::vector<float> expected_output;
   };
 
   // Ok.
-  const int kConv2DOKCases = 6;
+  const int kConv2DOKCases = 7;
   TestParams ok_params[kConv2DOKCases] = {
       // Basic
       TestParams{/*input_dims=*/{1, 2, 3},
@@ -2786,6 +3259,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"VALID",
                  /*data_format=*/"NCHW",
                  /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{1, 2, 2},
                  /*expected_output=*/{1, 1, 0, 1}},
       // SAME padding (Asymmetric)
@@ -2797,6 +3271,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"SAME",
                  /*data_format=*/"NCHW",
                  /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{1, 2, 3},
                  /*expected_output=*/{1, 1, -2, 0, 1, -4}},
       // SAME padding (Symmetric)
@@ -2808,6 +3283,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"SAME",
                  /*data_format=*/"NCHW",
                  /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{1, 2, 3},
                  /*expected_output=*/{1, 2, -1, 3, 1, -3}},
       // NHWC
@@ -2819,6 +3295,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"VALID",
                  /*data_format=*/"NHWC",
                  /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{2, 2, 1},
                  /*expected_output=*/{1, 1, 0, 1}},
       // Dilated
@@ -2830,6 +3307,7 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"VALID",
                  /*data_format=*/"NCHW",
                  /*dilations=*/{1, 1, 1, 2},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{1, 2, 1},
                  /*expected_output=*/{2, 1}},
       // Strided
@@ -2841,28 +3319,400 @@ TEST_F(OpConverterTest, ConvertConv2D) {
                  /*padding=*/"VALID",
                  /*data_format=*/"NCHW",
                  /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
                  /*expected_output_dims=*/{1, 2, 2},
                  /*expected_output=*/{1, 0, 1, 3}},
+      // Transpose Strided
+      TestParams{/*input_dims=*/{1, 2, 2},
+                 /*input=*/{0, 1, 2, 3},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 2},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/true,
+                 /*expected_output_dims=*/{1, 2, 4},
+                 /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}},
   };
 
   for (int i = 0; i < kConv2DOKCases; i++) {
     Reset();
-    NodeDef node_def =
-        get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding,
-                           ok_params[i].data_format, ok_params[i].dilations);
+    NodeDef node_def = get_conv2d_nodedef(
+        ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format,
+        ok_params[i].dilations, ok_params[i].is_conv2d_backprop_input);
     AddTestTensor("input", ok_params[i].input_dims);
     AddTestWeights<float>("weights", ok_params[i].filter_dims,
                           ok_params[i].filter);
+    if (ok_params[i].is_conv2d_backprop_input) {
+      AddTestWeights<float>(
+          "input_sizes",
+          {static_cast<int>(ok_params[i].expected_output.size())},
+          ok_params[i].expected_output);
+    }
     RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output));
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
                              output.tensor()->getDimensions());
-    std::vector<float> output_data(ok_params[i].expected_output.size());
-    BuildAndRun<float>({{"input", ok_params[i].input}}, "my_conv2d",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAreArray(ok_params[i].expected_output));
+
+    const DataVec input_data{
+        {"input", test::AsTensor<float>(ok_params[i].input)}};
+    DataVec output_data{
+        {"my_conv2d",
+         ConstructTensor<float>(ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertTopK) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_topk", "TopKV2", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "TopKV2 got 0 inputs but expected 2, at my_topk");
+  }
+
+  for (const auto dtype : {DT_FLOAT, DT_INT32}) {
+    // Get the NodeDef for TopKV2.
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+    auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights);
+    const NodeDef& node_def = topk.operation.node()->def();
+    {
+      // K is a tensor, should fail.
+      Reset();
+      AddTestTensor("input", {1, 2, 3}, /*batch_size=*/1,
+                    /*trt_dtype=*/TfDataTypeToTrt(dtype));
+      AddTestTensor("weights", {2});
+      RunValidationAndConversion(
+          node_def, error::UNIMPLEMENTED,
+          "The input \"k\" for TopKV2 must be a constant, at my_topk");
+    }
+    {
+      // Ok.
+      Reset();
+      AddTestTensor("input", {1, 2, 5});
+      AddTestWeights<int32>("weights", {1}, {2});
+      RunValidationAndConversion(node_def);
+      TRT_TensorOrWeights outputs[2];
+      TF_EXPECT_OK(GetTensorOrWeights("my_topk", &outputs[0]));
+      TF_EXPECT_OK(GetTensorOrWeights("my_topk:1", &outputs[1]));
+      for (auto& output : outputs) {
+        EXPECT_TRUE(output.is_tensor());
+        ExpectTrtDimsEqualsArray({1, 2, 2}, output.tensor()->getDimensions());
+      }
+
+      const DataVec input_data{
+          {"input", test::AsTensor<float>({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}};
+      DataVec output_data{{"my_topk", ConstructTensor<float>(4)},
+                          {"my_topk:1", ConstructTensor<int32>(4)}};
+      BuildAndRun(input_data, &output_data);
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                  ElementsAre(6, 5, 7, 1));
+      EXPECT_THAT(GetSpanForData<int32>(output_data[1]),
+                  ElementsAre(4, 2, 1, 2));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestConvertGather(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), dtype);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+
+  struct TestParams {
+    std::vector<int> params_dims;
+    std::vector<int> indices_dims;
+    std::vector<int> indices;
+    int axis;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Input is the same {1, 2, 3, 4, 5, 6} for all cases.
+  const int kGatherOKCases = 5;
+  const std::vector<CType> params_input = {CType(1), CType(2), CType(3),
+                                           CType(4), CType(5), CType(6)};
+  TestParams ok_params[kGatherOKCases] = {
+      // Indices are always of rank>1, and output rank is
+      // rank(params) + rank(indices) - 1.
+      // TODO(laigd): do we support 0-rank ITensor as indices?
+      TestParams{{1, 2, 3}, {1}, {0}, 3, {1, 2, 1, 1}, {1, 4}},
+      TestParams{{1, 2, 3}, {1}, {1}, 3, {1, 2, 1, 1}, {2, 5}},
+      TestParams{{1, 2, 3}, {1}, {2}, -1, {1, 2, 1, 1}, {3, 6}},
+      TestParams{
+          {1, 2, 3}, {3}, {2, 0, 1}, 3, {1, 2, 1, 3}, {3, 1, 2, 6, 4, 5}},
+      TestParams{{3, 2},
+                 {2, 2},
+                 {0, 0, 1, 0},
+                 2,
+                 {3, 1, 2, 2},
+                 {1, 1, 2, 1, 3, 3, 4, 3, 5, 5, 6, 5}},
+  };
+
+  // Ok.
+  for (int i = 0; i < kGatherOKCases; i++) {
+    test->Reset();
+    test->AddTestTensor("params", ok_params[i].params_dims, 1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestTensor("indices", ok_params[i].indices_dims, 1,
+                        nvinfer1::DataType::kINT32);
+    test->AddTestWeights<int32>("axis", {1}, {ok_params[i].axis});
+    test->RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_gather", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    // Create input in CType and convert expected output to CType.
+    std::vector<CType> converted_expected_output(
+        ok_params[i].expected_output.begin(),
+        ok_params[i].expected_output.end());
+
+    const DataVec input_data{
+        {"params", test::AsTensor<CType>(params_input)},
+        {"indices", test::AsTensor<int32>(ok_params[i].indices)}};
+    DataVec output_data{
+        {"my_gather",
+         ConstructTensor<CType>(ok_params[i].expected_output.size())}};
+    test->BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                ElementsAreArray(converted_expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertGather) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_gather", "GatherV2", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "GatherV2 got 0 inputs but expected 3, at my_gather");
+  }
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), DT_FLOAT);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestTensor("axis", {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"axis\" for GatherV2 must be a constant, at my_gather");
+  }
+  {
+    // Axis is out of bounds, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {4});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Axis value of 4 is out of bounds, must be in "
+                               "range [-4, 4), at my_gather");
+  }
+  {
+    // Axis is batch dimension, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {0});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "TensorRT does not allow manipulation of the "
+                               "batch dimension, at my_gather");
+  }
+
+  Reset();
+  TestConvertGather<DT_FLOAT>(this);
+  TestConvertGather<DT_HALF>(this);
+  TestConvertGather<DT_INT32>(this);
+}
+
+TEST_F(OpConverterTest, ConvertUnary) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_unary", "Neg", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Neg got 0 inputs but expected 1, at my_unary");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto neg = ops::Neg(s.WithOpName("my_unary"), input);
+    const NodeDef& node_def = neg.operation.node()->def();
+    AddTestWeights<float>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"x\" for Neg must be a tensor, at my_unary");
+  }
+
+  // Get nodedef for unary layer.
+  auto get_unary_nodedef = [](string op_name) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    if (op_name == "Abs") {
+      auto unary = ops::Abs(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acos") {
+      auto unary = ops::Acos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acosh") {
+      auto unary = ops::Acosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asin") {
+      auto unary = ops::Asin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asinh") {
+      auto unary = ops::Asinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atan") {
+      auto unary = ops::Atan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atanh") {
+      auto unary = ops::Atanh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Ceil") {
+      auto unary = ops::Ceil(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cos") {
+      auto unary = ops::Cos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cosh") {
+      auto unary = ops::Cosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Exp") {
+      auto unary = ops::Exp(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Floor") {
+      auto unary = ops::Floor(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Log") {
+      auto unary = ops::Log(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Neg") {
+      auto unary = ops::Neg(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Reciprocal") {
+      auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Rsqrt") {
+      auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sin") {
+      auto unary = ops::Sin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sinh") {
+      auto unary = ops::Sinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sqrt") {
+      auto unary = ops::Sqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Tan") {
+      auto unary = ops::Tan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    }
+    EXPECT_TRUE(false);
+    return NodeDef();
+  };
+  // Get expected output for unary layer.
+  auto get_unary_output = [](string op_name, float input) -> float {
+    if (op_name == "Abs") {
+      return std::abs(input);
+    } else if (op_name == "Acos") {
+      return std::acos(input);
+    } else if (op_name == "Acosh") {
+      return std::acosh(input);
+    } else if (op_name == "Asin") {
+      return std::asin(input);
+    } else if (op_name == "Asinh") {
+      return std::asinh(input);
+    } else if (op_name == "Atan") {
+      return std::atan(input);
+    } else if (op_name == "Atanh") {
+      return std::atanh(input);
+    } else if (op_name == "Ceil") {
+      return std::ceil(input);
+    } else if (op_name == "Cos") {
+      return std::cos(input);
+    } else if (op_name == "Cosh") {
+      return std::cosh(input);
+    } else if (op_name == "Exp") {
+      return std::exp(input);
+    } else if (op_name == "Floor") {
+      return std::floor(input);
+    } else if (op_name == "Log") {
+      return std::log(input);
+    } else if (op_name == "Neg") {
+      return -input;
+    } else if (op_name == "Reciprocal") {
+      return 1.0 / input;
+    } else if (op_name == "Rsqrt") {
+      return 1.0 / std::sqrt(input);
+    } else if (op_name == "Sin") {
+      return std::sin(input);
+    } else if (op_name == "Sinh") {
+      return std::sinh(input);
+    } else if (op_name == "Sqrt") {
+      return std::sqrt(input);
+    } else if (op_name == "Tan") {
+      return std::tan(input);
+    }
+    EXPECT_TRUE(false);
+    return 0;
+  };
+
+  // Get list of ops to test.
+  std::vector<string> ops_to_test;
+  // Add all ops supported by ConvertUnary.
+  auto* map = UnaryOperationMap();
+  ops_to_test.reserve(map->size());
+  for (auto& pair : *map) {
+    ops_to_test.push_back(pair.first);
+  }
+  // Add other unary ops to test.
+  ops_to_test.push_back("Rsqrt");
+  // Ok.
+  for (string op_name : ops_to_test) {
+    Reset();
+    NodeDef node_def = get_unary_nodedef(op_name);
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    const std::vector<float> input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f};
+    const DataVec input_data{{"input", test::AsTensor<float>(input)}};
+    DataVec output_data{{"my_unary", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    for (int i = 0; i < input.size(); ++i) {
+      const float expected_output = get_unary_output(op_name, input[i]);
+      EXPECT_THAT(GetSpanForData<float>(output_data[0])[i],
+                  NanSensitiveFloatNear(expected_output, 0.0001));
+    }
   }
 }
 
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
similarity index 82%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index d57f2300f8e6e6ce79c538133da6bc5cf5ead2f5..d325d11dfff54cc4e4d282bc513ee056fdf97271 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -12,9 +12,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
@@ -30,15 +32,15 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 // TODO(sami): Remove VLOG messages once the code matures
-using tensorflow::str_util::Uppercase;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
+using str_util::Uppercase;
 
-tensorflow::Status TRTOptimizationPass::Init(
-    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+Status TRTOptimizationPass::Init(
+    const RewriterConfig_CustomGraphOptimizer* config) {
   VLOG(1) << "Called INIT for " << name_ << " with config = " << config;
   if (config == nullptr) {
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
   const auto params = config->parameter_map();
   if (params.count("minimum_segment_size")) {
@@ -64,18 +66,20 @@ tensorflow::Status TRTOptimizationPass::Init(
     max_workspace_size_bytes_ = params.at("max_workspace_size_bytes").i();
   }
   if (params.count("precision_mode")) {
-    TF_RETURN_IF_ERROR(GetPrecisionMode(
+    TF_RETURN_IF_ERROR(TrtPrecisionModeFromName(
         Uppercase(params.at("precision_mode").s()), &precision_mode_));
   }
   if (params.count("use_calibration")) {
     use_calibration_ = params.at("use_calibration").b();
   }
-  return tensorflow::Status::OK();
+  if (params.count("use_function_backup")) {
+    use_function_backup_ = params.at("use_function_backup").b();
+  }
+  return Status::OK();
 }
 
-void TRTOptimizationPass::PrintDebugInfo(
-    tensorflow::grappler::Cluster* cluster,
-    const tensorflow::grappler::GrapplerItem& item) {
+void TRTOptimizationPass::PrintDebugInfo(grappler::Cluster* cluster,
+                                         const grappler::GrapplerItem& item) {
   LOG(INFO) << "Cluster = " << cluster;
   string offset("  ");
   string offset2 = StrCat(offset, offset);
@@ -85,7 +89,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "type             = " << cluster->type();
     LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
     const auto dev_names = cluster->GetDeviceNames();
-    if (dev_names.size()) {
+    if (!dev_names.empty()) {
       LOG(INFO) << offset << " Device names:";
       for (const auto s : dev_names) {
         LOG(INFO) << offset2 << s;
@@ -93,7 +97,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
     std::unordered_map<string, uint64> peak_mem;
     auto status = cluster->GetPeakMemoryUsage(&peak_mem);
-    if (status == tensorflow::Status::OK()) {
+    if (status == Status::OK()) {
       LOG(INFO) << offset << "Peak Memory Usage :";
       for (auto s : peak_mem) {
         LOG(INFO) << offset2 << s.first << " = " << s.second;
@@ -101,7 +105,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
 
     const auto dev_props = cluster->GetDevices();
-    if (dev_props.size()) {
+    if (!dev_props.empty()) {
       LOG(INFO) << offset << "Device properties:";
       for (auto k : dev_props) {
         LOG(INFO) << offset2 << k.first;
@@ -129,7 +133,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
   }
   LOG(INFO) << "item: " << item.id;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     LOG(INFO) << offset << "Feeds  :";
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
@@ -138,7 +142,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   } else {
     LOG(INFO) << offset << "No Feeds";
   }
-  if (item.fetch.size()) {
+  if (!item.fetch.empty()) {
     LOG(INFO) << offset << "Fetches  :";
     for (const auto& f : item.fetch) {
       LOG(INFO) << offset2 << f;
@@ -147,7 +151,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "No Fetches";
   }
 
-  if (item.init_ops.size()) {
+  if (!item.init_ops.empty()) {
     LOG(INFO) << offset << "init ops  :";
     for (const auto& f : item.init_ops) {
       LOG(INFO) << offset2 << f;
@@ -158,7 +162,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   LOG(INFO) << "Save Op = " << item.save_op;
   LOG(INFO) << "Restore Op = " << item.restore_op;
   LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
-  if (item.keep_ops.size()) {
+  if (!item.keep_ops.empty()) {
     LOG(INFO) << offset << "keep ops  :";
     for (const auto& f : item.keep_ops) {
       LOG(INFO) << offset2 << f;
@@ -175,9 +179,9 @@ void TRTOptimizationPass::PrintDebugInfo(
   }
 }
 
-tensorflow::Status TRTOptimizationPass::Optimize(
-    tensorflow::grappler::Cluster* cluster,
-    const tensorflow::grappler::GrapplerItem& item, GraphDef* optimized_graph) {
+Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster,
+                                     const grappler::GrapplerItem& item,
+                                     GraphDef* optimized_graph) {
   VLOG(1) << "Called TRTOptimization Pass " << name_;
   // This is a hack to workaround optimizer issue. MetaOptimizer calls
   // optimization passes on function objects as well, we should not modify
@@ -188,14 +192,14 @@ tensorflow::Status TRTOptimizationPass::Optimize(
                  << " is probably called on funcdef! This optimizer must *NOT* "
                     "be called on function objects.";
     *optimized_graph = item.graph;
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
   if (VLOG_IS_ON(3)) {
     LOG(INFO) << CurrentStackTrace();
     PrintDebugInfo(cluster, item);
   }
   int max_dim = -1;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
       if (shape.dims() > 0) {
@@ -221,11 +225,11 @@ tensorflow::Status TRTOptimizationPass::Optimize(
                    << " adjusting maximum batch size to match input batch size";
     }
   }
-  tensorflow::grappler::GraphProperties static_graph_properties(item);
+  grappler::GraphProperties static_graph_properties(item);
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
-  tensorflow::tensorrt::convert::ConversionParams cp;
+  ConversionParams cp;
 
-  if (use_calibration_ && precision_mode_ != INT8MODE) {
+  if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) {
     VLOG(1) << "Calibration with FP32 or FP16 is not implemented. "
             << "Falling back to use_calibration = False."
             << "Note that the default value of use_calibration is True.";
@@ -243,7 +247,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     // If the last token is not an integer, it must be part of the name.
     // Otherwise it is port number.
     if (tokens.size() > 1 &&
-        !strings::safe_strto32(tokens.back(), &dumm_port)) {
+        !strings::safe_strto32(tokens.back(), &dumm_port)) {  // non-absl ok
       StrAppend(&s, ":", tokens.back());
     }
     nodes_to_preserve.push_back(s);
@@ -261,27 +265,24 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   cp.cached_engine_batches = batches_;
   cp.max_cached_engines = max_cached_batches_;
   cp.use_calibration = use_calibration_;
-  auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp);
+  cp.use_function_backup = use_function_backup_;
+  auto status = ConvertAfterShapes(cp);
   VLOG(1) << "Returning from " << name_;
   return status;
 }
 
-void TRTOptimizationPass::Feedback(
-    tensorflow::grappler::Cluster* cluster,
-    const tensorflow::grappler::GrapplerItem& item,
-    const GraphDef& optimized_graph, double result) {}
-
-}  // namespace convert
-}  // namespace tensorrt
-}  // namespace tensorflow
+void TRTOptimizationPass::Feedback(grappler::Cluster* cluster,
+                                   const grappler::GrapplerItem& item,
+                                   const GraphDef& optimized_graph,
+                                   double result) {}
 
 class VerboseCustomGraphOptimizerRegistrar
-    : public tensorflow::grappler::CustomGraphOptimizerRegistrar {
+    : public grappler::CustomGraphOptimizerRegistrar {
  public:
   VerboseCustomGraphOptimizerRegistrar(
-      const tensorflow::grappler::CustomGraphOptimizerRegistry::Creator& cr,
-      const tensorflow::string& name)
-      : tensorflow::grappler::CustomGraphOptimizerRegistrar(cr, name) {
+      const grappler::CustomGraphOptimizerRegistry::Creator& cr,
+      const string& name)
+      : grappler::CustomGraphOptimizerRegistrar(cr, name) {
     VLOG(1) << "Constructing a CustomOptimizationPass registration object for "
             << name;
   }
@@ -291,10 +292,13 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar(
     []() {
       VLOG(1)
           << "Instantiating CustomOptimizationPass object TensorRTOptimizer";
-      return new tensorflow::tensorrt::convert::TRTOptimizationPass(
-          "TensorRTOptimizer");
+      return new TRTOptimizationPass("TensorRTOptimizer");
     },
     ("TensorRTOptimizer"));
 
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
 #endif
 #endif
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
similarity index 63%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
index 3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7..d3fd914b30210e10b211cfc6281964af620c1427 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
 
 #include <string>
 
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/platform/logging.h"
@@ -29,46 +30,49 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
+class TRTOptimizationPass : public grappler::CustomGraphOptimizer {
  public:
   TRTOptimizationPass(const string& name = "TRTOptimizationPass")
       : name_(name),
         minimum_segment_size_(3),
-        precision_mode_(0),
+        precision_mode_(TrtPrecisionMode::FP32),
         maximum_batch_size_(-1),
         is_dynamic_op_(false),
         max_cached_batches_(1),
         max_workspace_size_bytes_(256LL << 20),
-        use_calibration_(true) {
+        use_calibration_(true),
+        use_function_backup_(true) {
     VLOG(1) << "Constructing " << name_;
   }
 
   string name() const override { return name_; };
 
-  tensorflow::Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
-                              config = nullptr) override;
+  Status Init(
+      const RewriterConfig_CustomGraphOptimizer* config = nullptr) override;
 
-  tensorflow::Status Optimize(tensorflow::grappler::Cluster* cluster,
-                              const tensorflow::grappler::GrapplerItem& item,
-                              GraphDef* optimized_graph) override;
+  Status Optimize(grappler::Cluster* cluster,
+                  const grappler::GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
 
-  void Feedback(tensorflow::grappler::Cluster* cluster,
-                const tensorflow::grappler::GrapplerItem& item,
+  void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
 
-  void PrintDebugInfo(tensorflow::grappler::Cluster* cluster,
-                      const tensorflow::grappler::GrapplerItem& item);
+  void PrintDebugInfo(grappler::Cluster* cluster,
+                      const grappler::GrapplerItem& item);
 
  private:
   const string name_;
   int minimum_segment_size_;
-  int precision_mode_;
+  TrtPrecisionMode precision_mode_;
   int maximum_batch_size_;
   bool is_dynamic_op_;
   std::vector<int> batches_;
   int max_cached_batches_;
   int64_t max_workspace_size_bytes_;
   bool use_calibration_;
+
+  // Whether to allow TF function fallback path in TRTEngineOp.
+  bool use_function_backup_;
 };
 
 }  // namespace convert
@@ -77,4 +81,4 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
 
 #endif  // GOOGLE_CUDA
 #endif  // GOOGLE_TENSORRT
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/contrib/tensorrt/test/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
similarity index 50%
rename from tensorflow/contrib/tensorrt/test/utils.h
rename to tensorflow/compiler/tf2tensorrt/convert/utils.cc
index 4bb4120206cfaae70107e55d1818e3af2f02717a..ca21c193d6313ec1883788486f05d49f889a7145 100644
--- a/tensorflow/contrib/tensorrt/test/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
@@ -13,32 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace tensorrt {
-namespace test {
-
-// Helper methods to inject values used by testing tools.
-void EnableTestValue();
-void ClearTestValues(const string& pattern);
-void AddTestValue(const string& label, const string& value);
-string GetTestValue(const string& label);
-
-#define TRT_RETURN_IF_TEST_VALUE(label, value_to_return)     \
-  do {                                                       \
-    if (::tensorflow::tensorrt::test::GetTestValue(label) == \
-        value_to_return) {                                   \
-      return errors::Internal("Injected manually");          \
-    }                                                        \
-  } while (0)
-
-}  // namespace test
+
+Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name) {
+  switch (mode) {
+    case TrtPrecisionMode::FP32:
+      *name = "FP32";
+      break;
+    case TrtPrecisionMode::FP16:
+      *name = "FP16";
+      break;
+    case TrtPrecisionMode::INT8:
+      *name = "INT8";
+      break;
+    default:
+      return errors::OutOfRange("Unknown precision mode");
+  }
+  return Status::OK();
+}
+
+Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) {
+  if (name == "FP32") {
+    *mode = TrtPrecisionMode::FP32;
+  } else if (name == "FP16") {
+    *mode = TrtPrecisionMode::FP16;
+  } else if (name == "INT8") {
+    *mode = TrtPrecisionMode::INT8;
+  } else {
+    return errors::InvalidArgument("Invalid precision mode name: ", name);
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorrt
 }  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h
similarity index 69%
rename from tensorflow/contrib/tensorrt/convert/utils.h
rename to tensorflow/compiler/tf2tensorrt/convert/utils.h
index 0592f31462af2b20f3a13fe5119e89c2ba42dd8a..91c8c660f85dcea9ad4d6b33a7c0fd979be0f819 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
 
 #include <memory>
 
@@ -33,18 +33,13 @@ struct TrtDestroyer {
 template <typename T>
 using TrtUniquePtrType = std::unique_ptr<T, TrtDestroyer<T>>;
 
-bool IsGoogleTensorRTEnabled();
+enum class TrtPrecisionMode { FP32, FP16, INT8 };
 
-// TODO(aaroey): use an enum instead.
-const int FP32MODE = 0;
-const int FP16MODE = 1;
-const int INT8MODE = 2;
+Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name);
 
-Status GetPrecisionModeName(const int precision_mode, string* name);
-
-Status GetPrecisionMode(const string& name, int* precision_mode);
+Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode);
 
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
similarity index 87%
rename from tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op.cc
rename to tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
index f68bc2b48583904a2b5b7ef7139505b3c141c165..e252f9111d61dce0b0821f72b3c56f2516fc20f3 100644
--- a/tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
@@ -13,13 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
-
 #include <memory>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -49,7 +46,7 @@ class GetSerializedResourceOp : public OpKernel {
     SerializableResourceBase* resource = nullptr;
     OP_REQUIRES_OK(context, context->resource_manager()->Lookup(
                                 container, resource_name, &resource));
-    ::tensorflow::core::ScopedUnref sc(resource);
+    core::ScopedUnref sc(resource);
 
     // Serialize the resource as output.
     string serialized_resource;
@@ -70,4 +67,3 @@ REGISTER_KERNEL_BUILDER(Name("GetSerializedResourceOp").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_GET_SERIALIZED_RESOURCE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op_test.cc
rename to tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
index a91228e4c420ad241ab673a254364203e324a282..ec038ebda073c8050321d5668b15a2c6faa72a4b 100644
--- a/tensorflow/contrib/tensorrt/kernels/get_serialized_resource_op_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <fstream>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
similarity index 70%
rename from tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index 7548c8ccda4571843f4b4792c9d97f7972963274..30f29902d73487de1a59d70518a7bab06448b578 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -12,40 +12,47 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
-
 #include <algorithm>
+#include <memory>
+#include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
 static Logger logger;
+using absl::StrAppend;
+using absl::StrCat;
 using ::nvinfer1::IRuntime;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
 
 // A helper class to call done() when destructed for asynchronous execution.
 // Helps simultaneous execution of native and TRT engines.
-class AsyncHelper : public tensorflow::core::RefCounted {
+class AsyncHelper : public core::RefCounted {
  public:
   AsyncHelper(AsyncOpKernel::DoneCallback done) { done_ = done; }
   ~AsyncHelper() override { done_(); }
@@ -54,37 +61,115 @@ class AsyncHelper : public tensorflow::core::RefCounted {
   AsyncOpKernel::DoneCallback done_;
 };
 
-#define TYPECASE(dt, X, Y)                                                \
-  case dt: {                                                              \
-    return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \
+//  This OP can construct TRTEngine on the fly and if construction of engine
+//  fails, executes equivalent subgraph as a TensorFlow function.
+class TRTEngineOp : public AsyncOpKernel {
+ public:
+  explicit TRTEngineOp(OpKernelConstruction* context);
+
+  void ComputeAsync(OpKernelContext* context,
+                    AsyncOpKernel::DoneCallback done) override;
+
+ private:
+  // Execute calibration
+  void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
+
+  // Construct a function handle for executing native funcdef graph
+  Status ConstructFunctionHandle(OpKernelContext* ctx);
+
+  // Execute replaced native segment as function Op.
+  void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
+
+  // Execute the tensorrt engine. Returns whether we need to retry by running
+  // the native segment.
+  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context);
+
+  // Allocate necessary resources for calibration
+  Status AllocateCalibrationResources(OpKernelContext* ctx,
+                                      SerializableResourceBase** cr);
+
+  // Get engine for the input shape
+  EngineContext* GetEngine(const std::vector<TensorShape>& input_shapes,
+                           OpKernelContext* ctx);
+
+  // Return engine batch in cached_engne_batch_sizes_ which is closest to input
+  // batch.
+  bool GetCompatibleCachedEngine(
+      const std::vector<TensorShape>& actual_input_shapes,
+      std::vector<TensorShape>* engine_input_shapes);
+
+  std::vector<string> input_nodes_;
+  std::vector<string> output_nodes_;
+
+  // serialized protobuf segment or trt engine depending on static_engine_ flag.
+  string serialized_segment_;
+
+  // Name of the function for TF native execution of the segment. If empty, it
+  // means TF native execution is not allowed, and if TRT engine fails to run
+  // an error will be returned.
+  string funcdef_name_;
+
+  // GraphDef representation of the segment.
+  GraphDef segment_graph_;
+
+  // Engine Precision mode.
+  TrtPrecisionMode precision_mode_;
+
+  // Whether engine is constructed during the conversion or needs to be
+  // constructed from protobuf segment.
+  bool static_engine_;
+
+  // Whether to calibrate INT8 engine.
+  bool calibration_mode_;
+
+  // Batches of the cached engines
+  std::vector<int> cached_engine_batches_;
+
+  // Maximum number of cached engines
+  int max_cached_engines_;
+
+  int64 workspace_size_;
+  mutex engine_mutex_;
+  FunctionLibraryRuntime::Handle native_func_;
+
+  // The finalized calibrator for inference.
+  std::unique_ptr<TRTInt8Calibrator> calibrator_;
+
+  // If true, create calibration graph for INT8 mode. Otherwise, we are using
+  // user-provided quantization ranges.
+  bool use_calibration_;
+};
+
+#define TYPECASE(dt, X, Y)                                    \
+  case dt: {                                                  \
+    return (void*)X->flat<EnumToDataType<dt>::Type>().data(); \
   }
 
 void* GetTensorAddress(const Tensor* tensor_ptr) {
   auto tensor_type = tensor_ptr->dtype();
   switch (tensor_type) {
-    TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr);
-    TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr);
-    TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr);
+    TYPECASE(DT_FLOAT, tensor_ptr, dest_ptr);
+    TYPECASE(DT_HALF, tensor_ptr, dest_ptr);
+    TYPECASE(DT_INT8, tensor_ptr, dest_ptr);
     default: {
-      LOG(ERROR) << "Unsupported Data type "
-                 << tensorflow::DataTypeString(tensor_type);
+      LOG(ERROR) << "Unsupported Data type " << DataTypeString(tensor_type);
       return nullptr;
     }
   }
 }
 
-tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) {
+Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) {
   VLOG(1) << "Constructing function handle";
   auto lib = ctx->function_library();
   if (lib == nullptr) {
-    return tensorflow::errors::Internal("Context function library is null");
+    return errors::Internal("Context function library is null");
   }
   auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_);
   if (fdef == nullptr) {
-    return tensorflow::errors::Internal("Native FunctionDef ", funcdef_name_,
-                                        " can't be found in function library");
+    return errors::Internal("Native FunctionDef ", funcdef_name_,
+                            " can't be found in function library");
   }
-  tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops;
+  FunctionLibraryRuntime::InstantiateOptions inst_ops;
   inst_ops.overlay_lib = nullptr;
   inst_ops.state_handle = "";
   inst_ops.target = ctx->device()->name();
@@ -109,11 +194,15 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   if (!static_engine_) {
     if (!segment_graph_.ParseFromString(serialized_segment_)) {
       LOG(ERROR) << "Parsing segment graph failed!";
-      context->SetStatus(tensorflow::errors::InvalidArgument(
-          "Failed to parse segment graphdef!"));
+      context->SetStatus(
+          errors::InvalidArgument("Failed to parse segment graphdef!"));
       return;
     }
-    serialized_segment_.resize(0);
+    VLOG(1) << "Size of serialized GraphDef: "
+            << serialized_segment_.capacity();
+    string tmp;
+    // Swap with temporary empty string to deallocate the CPU memory.
+    serialized_segment_.swap(tmp);
   }
   VLOG(1) << "Constructing " << name();
   string precision_string;
@@ -124,16 +213,18 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
                  context->GetAttr("calibration_data", &calibration_data));
   OP_REQUIRES_OK(context,
                  context->GetAttr("segment_funcdef_name", &funcdef_name_));
-  OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_));
+  OP_REQUIRES_OK(context,
+                 TrtPrecisionModeFromName(precision_string, &precision_mode_));
   OP_REQUIRES_OK(context,
                  context->GetAttr("use_calibration", &use_calibration_));
-  calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE &&
-                       calibration_data.size() == 0);
-  if (calibration_data.size()) {
+  calibration_mode_ =
+      (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 &&
+       calibration_data.empty());
+  if (!calibration_data.empty()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
   }
-  native_func_ = tensorflow::kInvalidHandle;
+  native_func_ = kInvalidHandle;
   OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count",
                                            &max_cached_engines_));
   OP_REQUIRES_OK(context, context->GetAttr("cached_engine_batches",
@@ -150,9 +241,15 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
 
 void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
                                        AsyncHelper* helper) {
+  if (funcdef_name_.empty()) {
+    const string err_msg = StrCat("Fallback path is disabled, for ", name());
+    LOG(WARNING) << err_msg;
+    ctx->SetStatus(errors::Internal(err_msg));
+    return;
+  }
   std::vector<Tensor> inputs;
   std::vector<Tensor>* outputs = new std::vector<Tensor>();
-  if (native_func_ == tensorflow::kInvalidHandle) {
+  if (native_func_ == kInvalidHandle) {
     auto status = ConstructFunctionHandle(ctx);
     if (!status.ok()) {
       LOG(ERROR) << "Couldn't construct function handle " << funcdef_name_;
@@ -161,19 +258,20 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
     }
   }
   auto lib = ctx->function_library();
-  tensorflow::FunctionLibraryRuntime::Options opts;
+  FunctionLibraryRuntime::Options opts;
   opts.step_id = ctx->step_id();
   opts.rendezvous = ctx->rendezvous();
   opts.cancellation_manager = ctx->cancellation_manager();
   opts.runner = ctx->runner();
+  inputs.reserve(ctx->num_inputs());
   for (int i = 0; i < ctx->num_inputs(); i++) {
     inputs.push_back(ctx->input(i));
   }
   helper->Ref();  // Increment count for calculating native graph
   VLOG(1) << "Executing native segment: " << name();
   lib->Run(opts, native_func_, inputs, outputs,
-           [this, ctx, outputs, helper](const tensorflow::Status& s) {
-             tensorflow::core::ScopedUnref sc(helper);
+           [this, ctx, outputs, helper](const Status& s) {
+             core::ScopedUnref sc(helper);
              if (!s.ok()) {
                LOG(ERROR) << "Failed to execute native segment " << this->name()
                           << ": " << s;
@@ -184,8 +282,6 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
              for (size_t t = 0; t < outputs->size(); ++t) {
                ctx->set_output(t, outputs->at(t));
              }
-             test::AddTestValue(StrCat(this->name(), ":ExecuteNativeSegment"),
-                                "done");
              delete outputs;
            });
 }
@@ -194,20 +290,17 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
                                      AsyncHelper* helper) {
   VLOG(1) << "Executing TRT calibration: " << name();
   helper->Ref();
-  tensorflow::core::ScopedUnref sc(helper);
-  // TODO(aaroey): remove the ResourceMgr singleton.
-  auto trt_rm = TRTResourceManager::instance();
-  auto res_mgr = trt_rm->getManager("TRTCalibration");
+  core::ScopedUnref sc(helper);
+  auto res_mgr = ctx->resource_manager();
   TRTCalibrationResource* calib_res = nullptr;
-  auto status = res_mgr->LookupOrCreate(
-      funcdef_name_, "Calibrator", &calib_res,
-      {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status {
-        return this->AllocateCalibrationResources(ctx, cr);
-      }});
-  if (!status.ok()) {
-    ctx->SetStatus(status);
-    return;
-  }
+  OP_REQUIRES_OK(ctx,
+                 res_mgr->LookupOrCreate(
+                     "TF_TRT_Calibration", name(),
+                     reinterpret_cast<SerializableResourceBase**>(&calib_res),
+                     {[ctx, this](SerializableResourceBase** cr) -> Status {
+                       return this->AllocateCalibrationResources(ctx, cr);
+                     }}));
+  core::ScopedUnref calib_sc(calib_res);
   int num_inputs = ctx->num_inputs();
   // Pass input data to calibrator
   std::unordered_map<string, void*> input_data;
@@ -215,7 +308,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
     const Tensor& t = ctx->input(i);
     void* data_address = GetTensorAddress(&t);
     if (data_address == nullptr) {
-      ctx->SetStatus(tensorflow::errors::InvalidArgument(
+      ctx->SetStatus(errors::InvalidArgument(
           "Unsupported data type encountered in input ", i));
       return;
     }
@@ -233,7 +326,6 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
                                                 ->implementation()
                                                 ->GpuStreamMemberHack()));
   calib_res->calibrator_->setBatch(input_data, *stream);
-  test::AddTestValue(StrCat(name(), ":ExecuteCalibration"), "done");
   VLOG(2) << "Passed calibration data";
   ExecuteNativeSegment(ctx, helper);
 }
@@ -271,21 +363,22 @@ bool TRTEngineOp::GetCompatibleCachedEngine(
 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
                                AsyncOpKernel::DoneCallback done) {
   auto helper = new AsyncHelper(done);
-  tensorflow::core::ScopedUnref sc(helper);
+  core::ScopedUnref sc(helper);
   if (calibration_mode_) {
     ExecuteCalibration(ctx, helper);
     return;
   }
   // Get shapes of inputs to engine.
-  std::vector<tensorflow::TensorShape> input_shapes;
+  std::vector<TensorShape> input_shapes;
+  input_shapes.reserve(ctx->num_inputs());
   for (int i = 0; i < ctx->num_inputs(); ++i) {
-    input_shapes.emplace_back(ctx->input(i).shape());
+    input_shapes.push_back(ctx->input(i).shape());
   }
   EngineContext* engine_context = GetEngine(input_shapes, ctx);
   if (!engine_context->cuda_engine) {
-    LOG(WARNING) << "Engine retrieval for input shapes: "
-                 << TensorShapeUtils::ShapeListString(input_shapes)
-                 << " failed. Running native segment for " << name();
+    VLOG(1) << "Engine retrieval for input shapes: "
+            << TensorShapeUtils::ShapeListString(input_shapes)
+            << " failed. Running native segment for " << name();
     ExecuteNativeSegment(ctx, helper);
     return;
   }
@@ -312,8 +405,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     const string input_name = StrCat(kInputPHName, i);
     const int binding_index = cuda_engine->getBindingIndex(input_name.c_str());
     if (binding_index == -1) {
-      LOG(ERROR) << "Input node not found, at " << input_name;
-      return kRetry;
+      const string msg =
+          StrCat("Input node ", input_name, " not found, at ", name());
+      LOG(ERROR) << msg;
+      ctx->SetStatus(errors::NotFound(msg));
+      return !kRetry;
     }
 
     const Tensor& input_tensor = ctx->input(i);
@@ -326,7 +422,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
+        buffers[binding_index] =
+            const_cast<float*>(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(ERROR) << "FP16 inputs are not supported yet!";
@@ -335,10 +432,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
         LOG(ERROR) << "INT8 inputs are not supported yet!";
         return kRetry;
       case nvinfer1::DataType::kINT32:
-        buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
+        buffers[binding_index] =
+            const_cast<int32*>(input_tensor.flat<int32>().data());
         break;
       default:
-        LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
+        LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
         return kRetry;
     }
   }
@@ -362,8 +460,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
         return kRetry;
       }
     } else {
-      LOG(ERROR) << "Output node not found, at " << output_name;
-      return kRetry;
+      const string msg =
+          StrCat("Ouput node ", output_name, " not found, at ", name());
+      LOG(ERROR) << msg;
+      ctx->SetStatus(errors::NotFound(msg));
+      return !kRetry;
     }
     auto status = ctx->allocate_output(i, output_shape, &output_tensor);
     if (!status.ok()) {
@@ -377,7 +478,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<float>().data());
+            const_cast<float*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(WARNING) << "half size is not supported yet!";
@@ -387,7 +488,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
         return kRetry;
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<int32>().data());
+            const_cast<int32*>(output_tensor->flat<int32>().data());
         break;
       default:
         LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
@@ -403,7 +504,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
 
   // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex
   // for it.
-  tensorflow::mutex_lock lock(engine_context->mu);
+  mutex_lock lock(engine_context->mu);
   // TODO(jie): trt enqueue does not return error
   auto ret = engine_context->execution_context->enqueue(num_batch, &buffers[0],
                                                         *stream, nullptr);
@@ -411,7 +512,6 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
     LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name();
     return kRetry;
   }
-  test::AddTestValue(StrCat(name(), ":ExecuteTrtEngine"), "done");
   // Synchronization will be done by TF.
   return !kRetry;
 }
@@ -419,15 +519,15 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
 EngineContext* TRTEngineOp::GetEngine(
     const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
   static EngineContext empty_context;
-  tensorflow::mutex_lock lock(engine_mutex_);
+  mutex_lock lock(engine_mutex_);
   // TODO(tmorris): using first input to get batch size - is this reliable?
   const int batch_size = input_shapes[0].dim_size(0);
 
   // Get engine cache
   TRTEngineCacheResource* cache_res = nullptr;
   auto status = ctx->resource_manager()->LookupOrCreate(
-      "TRTEngineCache", funcdef_name_, &cache_res,
-      {[this, ctx](TRTEngineCacheResource** cr) -> tensorflow::Status {
+      "TRTEngineCache", name(), &cache_res,
+      {[this, ctx](TRTEngineCacheResource** cr) -> Status {
         *cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_);
         return Status::OK();
       }});
@@ -435,7 +535,7 @@ EngineContext* TRTEngineOp::GetEngine(
     ctx->SetStatus(status);
     return &empty_context;
   }
-  tensorflow::core::ScopedUnref sc(cache_res);
+  core::ScopedUnref sc(cache_res);
   auto& cache = cache_res->cache_;
   auto allocator = cache_res->allocator_.get();
   if (allocator == nullptr) {
@@ -477,7 +577,11 @@ EngineContext* TRTEngineOp::GetEngine(
                       TrtUniquePtrType<nvinfer1::IExecutionContext>(
                           raw_static_engine->createExecutionContext())));
     // Runtime is safe to delete after engine creation
-    serialized_segment_.clear();
+    VLOG(1) << "Size of serialized TRT engine: "
+            << serialized_segment_.capacity();
+    string tmp;
+    // Swap with temporary empty string to deallocate the CPU memory.
+    serialized_segment_.swap(tmp);
     if (max_batch_size < batch_size) {
       return &empty_context;
     }
@@ -487,7 +591,7 @@ EngineContext* TRTEngineOp::GetEngine(
   // Handle the dynamic engine case.
   // See if there is a compatible engine cached. The batch size should be <= the
   // cached batch size.
-  std::vector<tensorflow::TensorShape> engine_input_shapes;
+  std::vector<TensorShape> engine_input_shapes;
   const bool matched_successfully =
       GetCompatibleCachedEngine(input_shapes, &engine_input_shapes);
   // If matched, use that engine. Otherwise, we will look in cache for that
@@ -509,11 +613,11 @@ EngineContext* TRTEngineOp::GetEngine(
     LOG(INFO) << "Building a new TensorRT engine for " << name()
               << " input shapes: "
               << TensorShapeUtils::ShapeListString(engine_input_shapes);
+
     // Convert to partial shapes
-    std::vector<PartialTensorShape> partial_shapes;
-    for (int i = 0; i < engine_input_shapes.size(); i++) {
-      partial_shapes.emplace_back(engine_input_shapes[i]);
-    }
+    std::vector<PartialTensorShape> partial_shapes(engine_input_shapes.begin(),
+                                                   engine_input_shapes.end());
+
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
@@ -521,14 +625,12 @@ EngineContext* TRTEngineOp::GetEngine(
         partial_shapes, &logger, allocator, calibrator_.get(), &engine,
         use_calibration_, &convert_successfully);
     if (!status.ok()) {
-      if (convert_successfully) {
-        // This means it fail to build the engine even when the network is built
-        // successfully, probably due to internal issues. In this case we don't
-        // retry in the future.
-        cache.emplace(engine_input_shapes, absl::make_unique<EngineContext>());
-      }
-      LOG(WARNING) << "Engine creation for batch size " << batch_size
-                   << " failed " << status;
+      LOG(WARNING) << "Engine creation for " << name() << " failed. "
+                   << "The native segment will be used instead. "
+                   << "Reason: " << status;
+      // Store an empty engine in the cache for these input shapes so we don't
+      // try to build the same failing engine again.
+      cache.emplace(engine_input_shapes, absl::make_unique<EngineContext>());
       return &empty_context;
     }
     VLOG(1) << "Conversion is done";
@@ -541,12 +643,12 @@ EngineContext* TRTEngineOp::GetEngine(
   return cache.at(engine_input_shapes).get();
 }
 
-tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
-    OpKernelContext* ctx, TRTCalibrationResource** cr) {
+Status TRTEngineOp::AllocateCalibrationResources(
+    OpKernelContext* ctx, SerializableResourceBase** cr) {
   auto cres = new TRTCalibrationResource();
   *cr = cres;
   // Get the allocator.
-  auto alloc = ctx->device()->GetAllocator(tensorflow::AllocatorAttributes());
+  auto alloc = ctx->device()->GetAllocator(AllocatorAttributes());
   if (!alloc) {
     LOG(WARNING) << "Can't get device allocator will not be able to "
                     "allocate memory from TensorFlow memory pool";
@@ -557,12 +659,12 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
   // Get the input shapes.
   const int batch_size = ctx->input(0).dim_size(0);
   const int num_inputs = ctx->num_inputs();
-  std::vector<tensorflow::PartialTensorShape> shapes;
+  std::vector<PartialTensorShape> shapes;
   cres->device_tensors_.resize(num_inputs);
   VLOG(1) << " Constructing calibrator";
   for (int i = 0; i < num_inputs; i++) {
     // allocate workspace on device for inputs
-    const tensorflow::Tensor& t = ctx->input(i);
+    const Tensor& t = ctx->input(i);
     shapes.emplace_back(t.shape());
     Tensor* device_tensor;
     TF_RETURN_IF_ERROR(ctx->allocate_persistent(
@@ -570,7 +672,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     void* device_address = GetTensorAddress(device_tensor);
     if (device_address == nullptr) {
-      return tensorflow::errors::InvalidArgument(
+      return errors::InvalidArgument(
           "Unsupported data type encountered in input ", i);
     }
     cres->device_buffers_.emplace(
@@ -585,7 +687,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
       ctx->device()->tensorflow_gpu_device_info()->gpu_id;
   if (platform_gpu_id < 0) {
     LOG(ERROR) << "Can't get gpu_device_info from context->device()";
-    return tensorflow::errors::InvalidArgument(
+    return errors::InvalidArgument(
         "Context->device doesn't contain device info!");
   }
   const int64 workspace_size_bytes = workspace_size_;
@@ -607,9 +709,10 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     // TODO(aaroey): maybe setting the max batch size using the python
     // calibration wrapper class.
     auto s = convert::ConvertGraphDefToEngine(
-        *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(),
-        workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(),
-        cres->calibrator_.get(), &cres->engine_,
+        *segment_graph, TrtPrecisionMode::INT8,
+        cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes,
+        &cres->logger_, cres->allocator_.get(), cres->calibrator_.get(),
+        &cres->engine_,
         /*use_calibration=*/true,
         /*convert_successfully=*/nullptr);
     if (!s.ok()) {
@@ -619,7 +722,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     VLOG(1) << "Calibration loop terminated " << label;
   }));
   VLOG(1) << "initialized calibrator resource";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
diff --git a/tensorflow/contrib/tensorrt/ops/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
similarity index 100%
rename from tensorflow/contrib/tensorrt/ops/get_serialized_resource_op.cc
rename to tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
similarity index 86%
rename from tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
index b84d2fe0b8cef3475f2a7d0f5383d5e11cde099a..791ddc41b4ff7cadb80618a1f017d8af32c862df 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
@@ -24,12 +24,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace shape_inference {
-extern Status TRTEngineOpShapeInference(InferenceContext* c);
-}
-
-// NOTE: please try NOT to add/modify/remove attributes or inputs/outputs to the
-// list below, this will break backward compatibility!
+// NOTE: when making changes please follow
+// https://www.tensorflow.org/guide/extend/op#backwards_compatibility to not
+// break backward compatibility.
 //
 // TODO(laigd): consider making this op stateful. The only problem is it uses TF
 // function which has to be stateless, but we can use function library as the
@@ -41,8 +38,6 @@ REGISTER_OP("TRTEngineOp")
     .Attr("segment_funcdef_name: string")
     .Attr("InT: list({int8,float16,float32,int32})")
     .Attr("OutT: list({int8,float16,float32,int32})")
-    .Attr("static_engine: bool = true")
-    .Attr("fixed_input_size: bool = true")
     .Attr("cached_engine_batches: list(int) >= 0 = []")
     .Attr("max_cached_engines_count: int = 1")
     .Attr("workspace_size_bytes: int")
@@ -57,8 +52,10 @@ REGISTER_OP("TRTEngineOp")
     // implementation, we do require all input tensor to carry the same batch
     // size, but this could change in the future). Hence we disable shape
     // inference function as a workaround.
-    // .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn(shape_inference::UnknownShape)
+    // Deprecated attributes.
+    .Attr("fixed_input_size: bool = true")
+    .Attr("static_engine: bool = true");
 }  // namespace tensorflow
 
 #endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 062f86e8bb4dc753925e4e2baf0bc80a5312a94f..a4341c530fffca88c82813cc2ace2c0ae1df5345 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+
 #include <cassert>
 #include <cstring>
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
index 754920b60ca7439513a91ad0354833a2482b29c1..f495d857037c79a1783f8eb232fb57c20e229169 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
 
 #include <iostream>
 #include <unordered_map>
@@ -71,4 +71,4 @@ class PluginTensorRT : public nvinfer1::IPlugin {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
similarity index 89%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
index cccc91226265ed139fb8db0b71c40b868f729562..dd73d15029d6fe5515c823223ffe743e52dde6e9 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -33,7 +33,7 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
     return nullptr;
   }
 
-  tensorflow::mutex_lock lock(instance_m_);
+  mutex_lock lock(instance_m_);
   auto plugin_ptr =
       plugin_registry_[encoded_op_name].first(serial_data, serial_length);
   owned_plugins_.emplace_back(plugin_ptr);
@@ -44,7 +44,7 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
 PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string& op_name) {
   if (!IsPlugin(op_name)) return nullptr;
 
-  tensorflow::mutex_lock lock(instance_m_);
+  mutex_lock lock(instance_m_);
   auto plugin_ptr = plugin_registry_[op_name].second();
   owned_plugins_.emplace_back(plugin_ptr);
 
@@ -56,7 +56,7 @@ bool PluginFactoryTensorRT::RegisterPlugin(
     PluginConstructFunc construct_func) {
   if (IsPlugin(op_name)) return false;
 
-  tensorflow::mutex_lock lock(instance_m_);
+  mutex_lock lock(instance_m_);
   auto ret = plugin_registry_.emplace(
       op_name, std::make_pair(deserialize_func, construct_func));
 
@@ -64,7 +64,7 @@ bool PluginFactoryTensorRT::RegisterPlugin(
 }
 
 void PluginFactoryTensorRT::DestroyPlugins() {
-  tensorflow::mutex_lock lock(instance_m_);
+  mutex_lock lock(instance_m_);
   owned_plugins_.clear();
 }
 
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
similarity index 85%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
index bbae9fb65c22cf69d2e7954436fd04dd16f7f6c8..cce4f52d9f1080fe0174b5fcb5dd0afdaf6e7769 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
 
 #include <memory>
 #include <unordered_map>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -69,7 +69,7 @@ class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
   // TODO(jie): Owned plugin should be associated with different sessions;
   //            should really hand ownership of plugins to resource management;
   std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
-  tensorflow::mutex instance_m_;
+  mutex instance_m_;
 };
 
 class TrtPluginRegistrar {
@@ -89,9 +89,8 @@ class TrtPluginRegistrar {
                                         construct_func)              \
   REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
 #define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
-  static ::tensorflow::tensorrt::TrtPluginRegistrar trt_plugin_registrar##ctr \
-      TF_ATTRIBUTE_UNUSED = ::tensorflow::tensorrt::TrtPluginRegistrar(       \
-          name, deserialize_func, construct_func)
+  static TrtPluginRegistrar trt_plugin_registrar##ctr TF_ATTRIBUTE_UNUSED =   \
+      TrtPluginRegistrar(name, deserialize_func, construct_func)
 
 }  // namespace tensorrt
 }  // namespace tensorflow
@@ -99,4 +98,4 @@ class TrtPluginRegistrar {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
index 129bdcdbc2f8d9d5215f45f381bcadf35e4fa75e..7d9c465c22beed0e252cbc26d6c533a0789d4f49 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
index a8f60886c03c174a612e7a135b6eb7bb7cb9997a..f3d6b4ff476139693a5251ddf58a3200d8af8efc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include <cassert>
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
similarity index 82%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
index 274ce42fec9283c643004d45fba461879fc5f2dc..e5eff15c19694093c7a5ea933a41375e8e01c8b9 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
 
 #include <functional>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
@@ -43,4 +43,4 @@ string ExtractOpName(const void* serial_data, size_t serial_length,
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..92aae7bb6b4520be97ca70fbe99586aab912e598
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper of TRTEngineOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import platform
+from tensorflow.python.framework import errors
+
+_tf_trt_so = None
+_module_lock = threading.Lock()
+
+
+def load_trt_ops():
+  """Load TF-TRT op libraries so if it hasn't been loaded already."""
+  global _tf_trt_so
+
+  if platform.system() == "Windows":
+    raise RuntimeError("Windows platforms are not supported")
+
+  with _module_lock:
+    if _tf_trt_so:
+      return
+
+    try:
+      # pylint: disable=g-import-not-at-top,unused-variable
+      # This will call register_op_list() in
+      # tensorflow/python/framework/op_def_registry.py, but it doesn't register
+      # the op or the op kernel in C++ runtime.
+      from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import trt_engine_op
+      # pylint: enable=g-import-not-at-top,unused-variable
+    except ImportError as e:
+      print("**** Failed to import TF-TRT ops. This is because the binary was "
+            "not built with CUDA or TensorRT enabled. ****")
+      raise e
+
+    try:
+      # pylint: disable=g-import-not-at-top
+      from tensorflow.python.framework import load_library
+      from tensorflow.python.platform import resource_loader
+      # pylint: enable=g-import-not-at-top
+
+      # Loading the shared object will cause registration of the op and the op
+      # kernel if we link TF-TRT dynamically.
+      _tf_trt_so = load_library.load_op_library(
+          resource_loader.get_path_to_datafile("libtftrt.so"))
+    except errors.NotFoundError as e:
+      no_trt_message = (
+          "**** Failed to initialize TensorRT. This is either because the "
+          "TensorRT installation path is not in LD_LIBRARY_PATH, or because "
+          "you do not have it installed. If not installed, please go to "
+          "https://developer.nvidia.com/tensorrt to download and install "
+          "TensorRT ****")
+      print(no_trt_message)
+      raise e
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
similarity index 89%
rename from tensorflow/contrib/tensorrt/segment/segment.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment.cc
index ecaffa3023bc8f317d956181b44639bc80efda29..593b991d09494fac503022f5c016274175f2b250 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include <queue>
 #include <set>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/segment/union_find.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -29,13 +30,16 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
-// A simple graph representation to mirror tensorflow::Graph. This structure
+// A simple graph representation to mirror Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
 // the need to create a copy of the graph. It is composed of edges and nodes.
 // Nodes keep pointers to original TF nodes.
@@ -71,7 +75,7 @@ class SimpleEdge {
 
 class SimpleNode {
  public:
-  SimpleNode(const tensorflow::Node* node, const int id);
+  SimpleNode(const Node* node, const int id);
 
   const std::vector<SimpleEdge*>& in_edges() const { return in_edges_; }
   const std::vector<SimpleEdge*>& out_edges() const { return out_edges_; }
@@ -95,11 +99,11 @@ class SimpleNode {
   }
 
   const string& name() const { return node_->name(); }
-  const tensorflow::Node* tf_node() const { return node_; }
+  const Node* tf_node() const { return node_; }
   int id() const { return id_; }
 
  private:
-  const tensorflow::Node* node_;
+  const Node* node_;
   std::vector<SimpleEdge*> in_edges_;
   std::vector<SimpleEdge*> out_edges_;
   int id_;
@@ -109,7 +113,7 @@ class SimpleNode {
 
 class SimpleGraph {
  public:
-  explicit SimpleGraph(const tensorflow::Graph* g);
+  explicit SimpleGraph(const Graph* g);
   ~SimpleGraph();
 
   void AddControlEdge(SimpleNode* src, SimpleNode* dst);
@@ -122,15 +126,11 @@ class SimpleGraph {
     return nodes_[node_id];
   }
   int num_node_ids() const { return nodes_.size(); }
-  const SimpleNode* source_node() const {
-    return nodes_[tensorflow::Graph::kSourceId];
-  }
-  const SimpleNode* sink_node() const {
-    return nodes_[tensorflow::Graph::kSinkId];
-  }
+  const SimpleNode* source_node() const { return nodes_[Graph::kSourceId]; }
+  const SimpleNode* sink_node() const { return nodes_[Graph::kSinkId]; }
 
  private:
-  const tensorflow::Graph* g_;
+  const Graph* g_;
   std::vector<SimpleNode*> nodes_;
   std::vector<SimpleEdge*> edges_;
   // free_edge_ids_ and free_node_ids_ contain freed indices.
@@ -138,15 +138,14 @@ class SimpleGraph {
   std::set<int> free_node_ids_;
 };
 
-SimpleNode::SimpleNode(const tensorflow::Node* node, const int id)
-    : node_(node), id_(id) {
+SimpleNode::SimpleNode(const Node* node, const int id) : node_(node), id_(id) {
   if (node_) {
     in_edges_.reserve(node_->in_edges().size());
     out_edges_.reserve(node_->out_edges().size());
   }
 }
 
-SimpleGraph::SimpleGraph(const tensorflow::Graph* g) : g_(g) {
+SimpleGraph::SimpleGraph(const Graph* g) : g_(g) {
   int n_nodes = g_->num_node_ids();
   nodes_.resize(n_nodes, nullptr);
   nodes_[g->kSourceId] = new SimpleNode(g->source_node(), g->kSourceId);
@@ -190,8 +189,8 @@ void SimpleGraph::AddEdge(SimpleNode* src, int out_port, SimpleNode* dst,
   } else {
     edges_.push_back(nullptr);
   }
-  bool is_control = (out_port == tensorflow::Graph::kControlSlot);
-  is_control |= (in_port == tensorflow::Graph::kControlSlot);
+  bool is_control = (out_port == Graph::kControlSlot);
+  is_control |= (in_port == Graph::kControlSlot);
   auto edge = new SimpleEdge(i, src, out_port, dst, in_port, is_control);
   edges_[i] = edge;
   src->out_edges_.push_back(edge);
@@ -199,8 +198,7 @@ void SimpleGraph::AddEdge(SimpleNode* src, int out_port, SimpleNode* dst,
 }
 
 void SimpleGraph::AddControlEdge(SimpleNode* src, SimpleNode* dst) {
-  AddEdge(src, tensorflow::Graph::kControlSlot, dst,
-          tensorflow::Graph::kControlSlot);
+  AddEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot);
 }
 
 void SimpleGraph::RemoveEdge(const SimpleEdge* edge) {
@@ -237,15 +235,14 @@ struct SimpleEdgePtrCompare {
 };
 
 struct NodePtrCompare {
-  bool operator()(const tensorflow::Node* lhs,
-                  const tensorflow::Node* rhs) const {
+  bool operator()(const Node* lhs, const Node* rhs) const {
     return lhs->name() < rhs->name();
   }
 };
 
 namespace {
 
-// Copied from TF ReverseDFS, which only works for tensorflow::Graph.
+// Copied from TF ReverseDFS, which only works for Graph.
 void StableDFS(const SimpleGraph& g, bool reverse,
                const std::vector<const SimpleNode*>& start,
                const std::function<bool(const SimpleNode*)>& enter,
@@ -367,8 +364,7 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
       if (in_edge->src() != src) {
         SimpleEdge* e = const_cast<SimpleEdge*>(in_edge);
         if (e->src() == graph->source_node()) {
-          graph->AddEdge(e->src(), e->src_output(), src,
-                         tensorflow::Graph::kControlSlot);
+          graph->AddEdge(e->src(), e->src_output(), src, Graph::kControlSlot);
         } else {
           graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
         }
@@ -387,8 +383,7 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
       if (e->dst() == graph->sink_node()) {
         VLOG(1) << " edge to sink node " << src->name() << " -> "
                 << e->dst()->name();
-        graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
-                       e->dst_input());
+        graph->AddEdge(src, Graph::kControlSlot, e->dst(), e->dst_input());
       } else {
         graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
       }
@@ -406,12 +401,12 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
   }
 }
 
-tensorflow::Status SegmentGraph(
-    const tensorflow::Graph* tf_graph,
-    const std::function<Status(const tensorflow::Node*)>& candidate_fn,
-    const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
-    const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
-    const SegmentOptions& options, SegmentNodesVector* segments) {
+Status SegmentGraph(const Graph* tf_graph,
+                    const std::function<Status(const Node*)>& candidate_fn,
+                    const std::function<bool(const Edge*)>& input_candidate_fn,
+                    const std::function<bool(const Edge*)>& output_candidate_fn,
+                    const SegmentOptions& options,
+                    SegmentNodesVector* segments) {
   // Steps:
   // 1. run the segmentation algorithm to find all the segments, which uses
   //    candidate_fn to determine the candidates segment nodes;
@@ -548,7 +543,7 @@ tensorflow::Status SegmentGraph(
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the segment nodes set.
-  std::map<string, std::set<const tensorflow::Node*, NodePtrCompare>> sg_map;
+  std::map<string, std::set<const Node*, NodePtrCompare>> sg_map;
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the device names that the nodes in the segment are
@@ -574,7 +569,7 @@ tensorflow::Status SegmentGraph(
         device_maps[u.ParentValue()->name()].insert(
             tf_node->requested_device());
       } else {
-        VLOG(1) << "Node " << tf_node->name()
+        VLOG(2) << "Node " << tf_node->name()
                 << " has no device assigned requested device is: "
                 << tf_node->requested_device();
       }
@@ -584,17 +579,16 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 2 ---------------------------------
   // Remove ineligible input/output nodes.
   for (auto& itr : sg_map) {
-    std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
-        itr.second;
+    std::set<const Node*, NodePtrCompare>& segment_nodes = itr.second;
     VLOG(1) << "Segment original size: " << segment_nodes.size();
     while (true) {
-      std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
+      std::deque<const Node*> in_nodes_que, out_nodes_que;
       // Find an input node that is not eligible and add it to the queue.
       // Nodes that has no incoming edges should not be treated as "input",
       // as there are really no inputs to them. Similar for output nodes.
       for (auto node : segment_nodes) {
         bool added = false;
-        for (const tensorflow::Edge* edge : node->in_edges()) {
+        for (const Edge* edge : node->in_edges()) {
           if (!edge->IsControlEdge() && !edge->src()->IsSource() &&
               !segment_nodes.count(edge->src())) {  // 'node' is an input node.
             if (!input_candidate_fn(edge)) {
@@ -605,7 +599,7 @@ tensorflow::Status SegmentGraph(
           }
         }
         if (added) continue;  // Only adding the node once to either queue.
-        for (const tensorflow::Edge* edge : node->out_edges()) {
+        for (const Edge* edge : node->out_edges()) {
           if (!edge->dst()->IsSink() && !edge->IsControlEdge() &&
               !segment_nodes.count(edge->dst())) {  // 'node' is an output node.
             if (!output_candidate_fn(edge)) {
@@ -633,13 +627,11 @@ tensorflow::Status SegmentGraph(
       // remove all their inputs, and for non-const output nodes remove all
       // their outputs. In this way, for common cases the number of removed
       // nodes should be minimum.
-      auto remove_nodes = [&segment_nodes](
-                              bool is_input_nodes,
-                              std::deque<const tensorflow::Node*>* que) {
+      auto remove_nodes = [&segment_nodes](bool is_input_nodes,
+                                           std::deque<const Node*>* que) {
         // Run a BFS on the queue to find all the input/output nodes.
-        std::set<const tensorflow::Node*, NodePtrCompare> visited;
-        std::set<const tensorflow::Node*, NodePtrCompare> logged(que->begin(),
-                                                                 que->end());
+        std::set<const Node*, NodePtrCompare> visited;
+        std::set<const Node*, NodePtrCompare> logged(que->begin(), que->end());
         while (!que->empty()) {
           auto node = que->front();
           que->pop_front();
@@ -676,10 +668,13 @@ tensorflow::Status SegmentGraph(
     const string& segment_root = itr.first;
     // Return format does not require set comparator.
     std::set<const Node*> segment_nodes(itr.second.begin(), itr.second.end());
-    if (VLOG_IS_ON(1)) {
-      string s = "parent=" + segment_root + ":";
-      for (auto node : segment_nodes) s += " " + node->name();
-      VLOG(1) << "Segment " << segments->size() << ": " << s;
+    if (VLOG_IS_ON(1) && !segment_nodes.empty()) {
+      string s;
+      for (auto node : segment_nodes) {
+        StrAppend(&s, "\n[Op type: ", node->type_string(), "] ", node->name());
+      }
+      VLOG(1) << "Nodes in segment " << segments->size()
+              << " with parent=" << segment_root << ":" << s;
     }
 
     // Don't use small segments.
@@ -718,9 +713,12 @@ tensorflow::Status SegmentGraph(
       VLOG(1) << "Devices " << s;
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
similarity index 72%
rename from tensorflow/contrib/tensorrt/segment/segment.h
rename to tensorflow/compiler/tf2tensorrt/segment/segment.h
index 6cc92cdb5df396a6bca26119f152487bc3685a6d..e31f1a989d9d9f203554811093e830ee8b139a6e 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
 
 #include <set>
 #include <vector>
@@ -24,8 +24,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
@@ -42,22 +44,25 @@ struct SegmentOptions {
 
 // Get the subgraphs of a graph that can be handled by TensorRT.
 //
-// @param graph tensorflow::Graph of the network
+// @param graph Graph of the network
 // @param candidate_fn A function that returns OK for a Node* if
 // that node can be handled by TensorRT.
 // @param segments Returns the TensorRT segments/subgraphs. Each entry
 // in the vector describes a subgraph by giving a set of the names of
 // all the NodeDefs in that subgraph.
 // @return the status.
-tensorflow::Status SegmentGraph(
-    const tensorflow::Graph* tf_graph,
-    const std::function<Status(const tensorflow::Node*)>& candidate_fn,
-    const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
-    const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
-    const SegmentOptions& options, SegmentNodesVector* segments);
+Status SegmentGraph(const Graph* tf_graph,
+                    const std::function<Status(const Node*)>& candidate_fn,
+                    const std::function<bool(const Edge*)>& input_candidate_fn,
+                    const std::function<bool(const Edge*)>& output_candidate_fn,
+                    const SegmentOptions& options,
+                    SegmentNodesVector* segments);
 
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
similarity index 91%
rename from tensorflow/contrib/tensorrt/segment/segment_test.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index 4ac02327ae68069278066b6e7e931bb9449c2603..84b690ecba6fcb9718a1008ee61383a84a381a46 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -26,17 +26,19 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 namespace test {
-namespace ops = ::tensorflow::ops;
 
 class SegmentTest : public ::testing::Test {
  protected:
-  std::function<Status(const tensorflow::Node*)> MakeCandidateFn(
+  std::function<Status(const Node*)> MakeCandidateFn(
       const std::set<string>& node_names) {
-    return [node_names](const tensorflow::Node* node) -> Status {
+    return [node_names](const Node* node) -> Status {
       if (node_names.find(node->name()) != node_names.end()) {
         return Status::OK();
       }
@@ -44,22 +46,21 @@ class SegmentTest : public ::testing::Test {
     };
   }
 
-  std::function<bool(const tensorflow::Edge*)> MakeInputEdgeCandidateFn(
+  std::function<bool(const Edge*)> MakeInputEdgeCandidateFn(
       const std::set<string>& node_names) {
-    return [node_names](const tensorflow::Edge* in_edge) -> bool {
+    return [node_names](const Edge* in_edge) -> bool {
       return node_names.find(in_edge->dst()->name()) != node_names.end();
     };
   }
 
-  std::function<bool(const tensorflow::Edge*)> MakeOutputEdgeCandidateFn(
+  std::function<bool(const Edge*)> MakeOutputEdgeCandidateFn(
       const std::set<string>& node_names) {
-    return [node_names](const tensorflow::Edge* out_edge) -> bool {
+    return [node_names](const Edge* out_edge) -> bool {
       return node_names.find(out_edge->src()->name()) != node_names.end();
     };
   }
 
-  void RunTest(const tensorflow::Graph* graph,
-               const std::set<string>& candidates,
+  void RunTest(const Graph* graph, const std::set<string>& candidates,
                const std::set<string>& input_candidates,
                const std::set<string>& output_candidates,
                const std::vector<std::set<string>>& expected_segments) {
@@ -103,7 +104,7 @@ std::set<string> operator-(const std::set<string>& lhs, const string& rhs) {
 
 TEST_F(SegmentTest, Empty) {
   Scope s = Scope::NewRootScope();
-  tensorflow::Graph g(OpRegistry::Global());
+  Graph g(OpRegistry::Global());
   TF_EXPECT_OK(s.ToGraph(&g));
   // Expect no segments/subgraphs.
   RunTest(&g, {}, {}, {}, {});
@@ -126,7 +127,7 @@ TEST_F(SegmentTest, Simple) {
   auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
   auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
   auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
-  tensorflow::Graph g(OpRegistry::Global());
+  Graph g(OpRegistry::Global());
   TF_EXPECT_OK(s.ToGraph(&g));
 
   // All Add operations are candidates, and we expect all of them to be
@@ -173,7 +174,7 @@ TEST_F(SegmentTest, AvoidCycle) {
   auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
   auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
   auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
-  tensorflow::Graph g(OpRegistry::Global());
+  Graph g(OpRegistry::Global());
   TF_EXPECT_OK(s.ToGraph(&g));
 
   // add2 is not a TRT candidate so there should be no segments generated.
@@ -204,7 +205,7 @@ TEST_F(SegmentTest, Multiple) {
   auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
   auto add4 = ops::Add(s.WithOpName("add4"), add2, add5);
   auto add6 = ops::Add(s.WithOpName("add6"), add5, add8);
-  tensorflow::Graph g(OpRegistry::Global());
+  Graph g(OpRegistry::Global());
   TF_EXPECT_OK(s.ToGraph(&g));
 
   const std::set<string> all_adds = {"add0", "add1", "add2", "add3", "add4",
@@ -251,7 +252,7 @@ TEST_F(SegmentTest, BigIfElse) {
   auto add5 = ops::Add(s.WithOpName("add5"), add4, add4);
   auto add6 = ops::Add(s.WithOpName("add6"), add5, add5);
   auto add7 = ops::Add(s.WithOpName("add7"), add3, add6);
-  tensorflow::Graph g(OpRegistry::Global());
+  Graph g(OpRegistry::Global());
   TF_EXPECT_OK(s.ToGraph(&g));
 
   // Make add2 not a TRT candidate, and we expect 2 segments.
@@ -265,3 +266,6 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/segment/union_find.h
rename to tensorflow/compiler/tf2tensorrt/segment/union_find.h
index 1c64ebbb0ae532a4776ab8963515d19fd3b23b4c..6458ae692fd7c922b5fc3bea2e55b613447dbde0 100644
--- a/tensorflow/contrib/tensorrt/segment/union_find.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
 
 namespace tensorflow {
 namespace tensorrt {
@@ -76,4 +76,4 @@ UnionFind<T>* UnionFind<T>::FindRoot() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
similarity index 100%
rename from tensorflow/contrib/tensorrt/tensorrt_test.cc
rename to tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
similarity index 53%
rename from tensorflow/contrib/tensorrt/convert/utils.cc
rename to tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
index e7a1febb8c076891596741fe30721e7acca15a73..1b8ab1e2720e2eba3654a51beb972303fd55e029 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h"
 
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+#endif
 
 namespace tensorflow {
 namespace tensorrt {
@@ -34,35 +35,30 @@ bool IsGoogleTensorRTEnabled() {
 #endif
 }
 
-Status GetPrecisionModeName(const int precision_mode, string* name) {
-  switch (precision_mode) {
-    case FP32MODE:
-      *name = "FP32";
-      break;
-    case FP16MODE:
-      *name = "FP16";
-      break;
-    case INT8MODE:
-      *name = "INT8";
-      break;
-    default:
-      return tensorflow::errors::OutOfRange("Unknown precision mode");
-  }
-  return Status::OK();
+void GetLinkedTensorRTVersion(int* major, int* minor, int* patch) {
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+  *major = NV_TENSORRT_MAJOR;
+  *minor = NV_TENSORRT_MINOR;
+  *patch = NV_TENSORRT_PATCH;
+#else
+  *major = 0;
+  *minor = 0;
+  *patch = 0;
+#endif
 }
 
-Status GetPrecisionMode(const string& name, int* precision_mode) {
-  if (name == "FP32") {
-    *precision_mode = FP32MODE;
-  } else if (name == "FP16") {
-    *precision_mode = FP16MODE;
-  } else if (name == "INT8") {
-    *precision_mode = INT8MODE;
-  } else {
-    return tensorflow::errors::InvalidArgument("Invalid precision mode name: ",
-                                               name);
-  }
-  return Status::OK();
+void GetLoadedTensorRTVersion(int* major, int* minor, int* patch) {
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+  int ver = getInferLibVersion();
+  *major = ver / 1000;
+  ver = ver - *major * 1000;
+  *minor = ver / 100;
+  *patch = ver - *minor * 100;
+#else
+  *major = 0;
+  *minor = 0;
+  *patch = 0;
+#endif
 }
 
 }  // namespace tensorrt
diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f52bb6f1badfa44f35878d788c85b998cb99b472
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_
+
+namespace tensorflow {
+namespace tensorrt {
+
+bool IsGoogleTensorRTEnabled();
+
+// Return compile time TensorRT library version information {Maj, Min, Patch}.
+void GetLinkedTensorRTVersion(int* major, int* minor, int* patch);
+
+// Return runtime time TensorRT library version information {Maj, Min, Patch}.
+void GetLoadedTensorRTVersion(int* major, int* minor, int* patch);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_PY_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
similarity index 95%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
index 7a2e93414aed56525eaeac876cdac20404bcf6ab..a18f758a5512141ef180844dd4fabe960cbed4f2 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -72,7 +72,7 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
                                    uint32_t flags) {
   if (size == 0) return nullptr;
   // WAR for allocator alignment requirement. Certain cuda API calls require GPU
-  // memory with alignemtn to cudaDeviceProp::textureAlignment.
+  // memory with alignment to cudaDeviceProp::textureAlignment.
   // See issue #20856
   alignment = 512;
   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
@@ -94,7 +94,7 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
   return mem;
 }
 
-TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
+TRTDeviceAllocator::TRTDeviceAllocator(Allocator* allocator)
     : allocator_(allocator) {
   VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
 }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
similarity index 89%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
index f857a9de055ee7668f0bf9bc97e030354505081b..8ec06d7456c28505fe45859e42d83cc569d90dc5 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
 
 #include <unordered_map>
 
@@ -59,7 +59,7 @@ class TRTCudaAllocator : public TRTBaseAllocator {
 class TRTDeviceAllocator : public TRTBaseAllocator {
   // Allocator implementation wrapping TF device allocators.
  public:
-  TRTDeviceAllocator(tensorflow::Allocator* allocator);
+  TRTDeviceAllocator(Allocator* allocator);
 
   // TODO(aaroey): base class doesn't have a virtual destructor, work with
   // Nvidia to fix it.
@@ -70,7 +70,7 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
   void free(void* memory) override;
 
  private:
-  tensorflow::Allocator* allocator_;
+  Allocator* allocator_;
 
   // supporting alignment from allocation request requires a map to free;
   std::unordered_map<void*, void*> mem_map_;
@@ -81,4 +81,4 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
index beb1284208e4c10ffe1d36ef411cf08f11dbcb78..e457c64928e5df84c7e2726ba3621420f013dbc9 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index dab1dd9343be7d5b033a3e04bf0b49fbbf37e9e5..33a5c719ba9d750fc5ab173435512ef73ff3fce8 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
 
 #include <atomic>
 #include <unordered_map>
@@ -50,7 +50,7 @@ TRTInt8Calibrator::TRTInt8Calibrator(const string& calib_data)
 
 bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
                                  const cudaStream_t stream) {
-  tensorflow::mutex_lock lock(cond_mtx_);
+  mutex_lock lock(cond_mtx_);
 
   // Wait while the queue is full or calibration is running.
   while ((calib_running_ || batch_is_set_) && !done_) cond_.wait(lock);
@@ -87,7 +87,7 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
 
 bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
                                  int num_bindings) {
-  tensorflow::mutex_lock lock(cond_mtx_);
+  mutex_lock lock(cond_mtx_);
   // Notify finish of last round of calibration.
   calib_running_ = false;
   cond_.notify_all();
@@ -111,7 +111,7 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
 }
 
 void TRTInt8Calibrator::waitAndSetDone() {
-  tensorflow::mutex_lock lock(cond_mtx_);
+  mutex_lock lock(cond_mtx_);
   // Wait while the queue is full or calibration is running, so we don't miss
   // the last batch.
   while ((calib_running_ || batch_is_set_) && !done_) cond_.wait(lock);
@@ -128,14 +128,14 @@ const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
 }
 
 void TRTInt8Calibrator::setDone() {
-  tensorflow::mutex_lock lock(cond_mtx_);
+  mutex_lock lock(cond_mtx_);
   done_ = true;
   cond_.notify_all();
 }
 
 void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
                                               std::size_t length) {
-  calibration_table_ = string((const char*)ptr, length);
+  calibration_table_ = string(static_cast<const char*>(ptr), length);
   VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr
           << " length=" << length;
 }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
similarity index 86%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 65466c9741989fda5f82fc27d813d026f35fe386..d34e244f6c7fe201915cb4b52808d3e0e3c57fa0 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
 
 #include <atomic>
 #include <string>
@@ -34,7 +34,12 @@ namespace tensorrt {
 // TRTs pull model for calibration. When TRT implements a means for
 // a push calibration This class should be updated accordingly
 
+// IInt8EntropyCalibrator2 is prefferred for TRT 5.1+.
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
+#else
 struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+#endif
  public:
   // Construct a calibrator for future calibration.
   TRTInt8Calibrator(
@@ -73,10 +78,10 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
   const int batch_size_;
 
   // mutex for condition_variable
-  tensorflow::mutex cond_mtx_;
+  mutex cond_mtx_;
 
   // condition variable to implement producer-consumer queue for calibration
-  tensorflow::condition_variable cond_;
+  condition_variable cond_;
 
   // Is calibration finished?
   bool done_;
@@ -96,4 +101,4 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 
 #endif
 #endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
similarity index 90%
rename from tensorflow/contrib/tensorrt/log/trt_logger.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index dda0dc9e712eb726800abfb6084f4f708d04825b..6bc842ed5ca7e03018157060a332338cdc926f14 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -26,6 +26,9 @@ namespace tensorrt {
 void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+    case Severity::kVERBOSE:
+#endif
     case Severity::kINFO: {  // Mark TRT info messages as debug!
       VLOG(2) << name_ << " " << msg;
       break;
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
similarity index 86%
rename from tensorflow/contrib/tensorrt/log/trt_logger.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
index 96ccacb791e40143c5c4d9d691bb353702f9a28b..22f4de970a80765b0e1e7e8816134d83aaec7c73 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -41,4 +41,4 @@ class Logger : public nvinfer1::ILogger {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
similarity index 87%
rename from tensorflow/contrib/tensorrt/resources/trt_lru_cache.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
index afd1b83e744907debc6df0c5acb219369ff89bdb..8ece326446d9f3cb20d5ea02406e71e6e346446e 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_lru_cache.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_LRU_CACHE_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_LRU_CACHE_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
 
 #include <list>
 #include <unordered_map>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/errors.h"
 
@@ -100,26 +100,24 @@ class LRUCache {
   }
 
   // Creates n free positions in cache
-  tensorflow::Status DiscardOld(size_t n = 0) {
+  Status DiscardOld(size_t n = 0) {
     if (n > capacity_) {
-      return tensorflow::errors::Internal(
-          "Insufficient capacity in cache (capacity = ", capacity_,
-          ", requested ", n, ")");
+      return errors::Internal("Insufficient capacity in cache (capacity = ",
+                              capacity_, ", requested ", n, ")");
     }
     while (objects_.size() > (capacity_ - n)) {
       key_type discard_key = keys_.back();
       keys_.pop_back();
       objects_.erase(discard_key);
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   }
 };
 
 // Define a hash function for vector<TensorShape> because it is used as the key
 // for the engine cache.
 struct VectorTensorShapeHasher {
-  std::size_t operator()(
-      const std::vector<tensorflow::TensorShape>& key) const {
+  std::size_t operator()(const std::vector<TensorShape>& key) const {
     return std::hash<std::string>()(TensorShapeUtils::ShapeListString(key));
   }
 };
@@ -141,12 +139,12 @@ struct EngineContext {
       GUARDED_BY(mu);
 };
 
-class TRTEngineCacheResource : public tensorflow::ResourceBase {
+class TRTEngineCacheResource : public ResourceBase {
  public:
   TRTEngineCacheResource(OpKernelContext* ctx, size_t capacity)
       : cache_(capacity) {
     auto device = ctx->device();
-    auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
+    auto alloc = device->GetAllocator(AllocatorAttributes());
     if (!alloc) {
       LOG(ERROR) << "Can't find device allocator for gpu device "
                  << device->name();
@@ -189,4 +187,4 @@ class TRTEngineCacheResource : public tensorflow::ResourceBase {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_LRU_CACHE_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_lru_cache_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/resources/trt_lru_cache_test.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
index a0959479ceebf3b05908cd8f6af896821b635672..0aa5eb8f7d4ad062c2d8622fa5aa55f823f80dd5 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_lru_cache_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_lru_cache.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
 
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
similarity index 92%
rename from tensorflow/contrib/tensorrt/resources/trt_resources.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
index c19eb34dab8d25fc6b32dba2a7da667aafd8c205..534e59f06b7d8f6768d1fc58e6a96cfe692fa14f 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -48,8 +48,8 @@ Status TRTCalibrationResource::SerializeToString(string* serialized) {
   calibrator_->waitAndSetDone();
   thr_->join();
   *serialized = calibrator_->getCalibrationTableAsString();
-  if (!serialized->size()) {
-    return tensorflow::errors::Unknown("Calibration table is empty.");
+  if (serialized->empty()) {
+    return errors::Unknown("Calibration table is empty.");
   }
   return Status::OK();
 }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
similarity index 78%
rename from tensorflow/contrib/tensorrt/resources/trt_resources.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
index d0a87f2c3136cb98e86f37c8d248d7e66c9d2ebe..abfed2c1816732a6e7d7ef396d1923edf0d90f32 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
 
 #include <list>
 #include <sstream>
@@ -23,10 +23,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 
@@ -37,7 +37,7 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
-class SerializableResourceBase : public tensorflow::ResourceBase {
+class SerializableResourceBase : public ResourceBase {
  public:
   virtual Status SerializeToString(string* serialized) = 0;
 };
@@ -60,7 +60,7 @@ class TRTCalibrationResource : public SerializableResourceBase {
   TrtUniquePtrType<nvinfer1::IBuilder> builder_;
   TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
   std::unique_ptr<TRTBaseAllocator> allocator_;
-  tensorflow::tensorrt::Logger logger_;
+  Logger logger_;
   // TODO(sami): Use threadpool threads!
   std::unique_ptr<std::thread> thr_;
 };
@@ -70,4 +70,4 @@ class TRTCalibrationResource : public SerializableResourceBase {
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 0366ec45fb75a21b98ebfc4bdaa903bfa908de7a..e1df032ba937f8c19b4a3cbfa16cddc08165a3a8 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test")
 
 package_group(
     name = "internal",
@@ -24,7 +24,7 @@ package(
 )
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library")
 
 cc_library(
     name = "tf2xla_supported_ops_lib",
@@ -60,6 +60,14 @@ xla_proto_library(
     ],
 )
 
+xla_py_proto_library(
+    name = "tf2xla_py",
+    has_services = False,
+    api_version = 2,
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_proto"],
+)
+
 xla_proto_library(
     name = "host_compute_metadata_proto",
     srcs = ["host_compute_metadata.proto"],
@@ -76,7 +84,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":common",
-        ":dump_graph",
         ":functionalize_control_flow",
         ":tf2xla_proto",
         ":tf2xla_util",
@@ -191,7 +198,6 @@ cc_library(
     visibility = [":friends"],
     deps = [
         ":common",
-        ":dump_graph",
         ":host_compute_metadata_proto",
         ":sharding_util",
         ":side_effect_util",
@@ -204,6 +210,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
@@ -224,6 +231,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@com_google_absl//absl/types:variant",
     ],
     alwayslink = 1,
 )
@@ -281,6 +289,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -315,11 +324,13 @@ tf_cc_test(
         ":tf2xla_util",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -436,22 +447,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "dump_graph",
-    srcs = [
-        "dump_graph.cc",
-    ],
-    hdrs = [
-        "dump_graph.h",
-    ],
-    deps = [
-        "//tensorflow/compiler/jit:flags",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "functionalize_control_flow_util",
     srcs = [
@@ -483,7 +478,6 @@ cc_library(
         ":functionalize_control_flow_util",
         ":tf2xla_util",
         "//tensorflow/compiler/jit:union_find",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
@@ -511,7 +505,6 @@ cc_library(
         ":functionalize_while",
         ":tf2xla_util",
         "//tensorflow/compiler/jit:union_find",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
@@ -548,7 +541,6 @@ cc_library(
         ":functionalize_control_flow_util",
         ":tf2xla_util",
         "//tensorflow/compiler/jit:union_find",
-        "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
@@ -675,3 +667,25 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+tf_cuda_cc_test(
+    name = "fused_batchnorm_reserve_space_test",
+    size = "medium",
+    srcs = ["fused_batchnorm_reserve_space_test.cc"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/compiler/jit",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index a57095f91e43f6b31b58e5a5f36331241451b545..6aff436da4f613a399c006b922b8aba3ce65a2e5 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -20,15 +20,26 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
+
+Status GetCompileTimeConstInputs(const Node* node,
+                                 std::vector<int>* const_input_idxs,
+                                 FunctionLibraryRuntime* flib_runtime);
+
 // Backwards dataflow analysis that finds arguments to a graph that must be
 // compile-time constants.
 Status BackwardsConstAnalysis(const Graph& g,
                               std::vector<bool>* compile_time_const_arg_indices,
                               std::vector<bool>* compile_time_const_nodes,
+                              FunctionLibraryRuntime* flib_runtime,
                               std::function<bool(const Edge&)> edge_filter) {
   std::vector<bool> compile_time_const_nodes_impl;
   if (compile_time_const_nodes) {
@@ -61,7 +72,18 @@ Status BackwardsConstAnalysis(const Graph& g,
       }
       for (const Edge* pred : node->in_edges()) {
         if (!pred->IsControlEdge() && edge_filter(*pred)) {
-          (*compile_time_const_nodes)[pred->src()->id()] = true;
+          // If the src node of the `pred` is an IdentityN do not mark it as a
+          // compile-time const. Only mark the corresponding input to the
+          // IdentityN node as a const.
+          // Note: XLA IdentityN op simply forwards its inputs so this is safe.
+          while (edge_filter(*pred) &&
+                 pred->src()->type_string() == "IdentityN") {
+            status = pred->src()->input_edge(pred->src_output(), &pred);
+            if (!status.ok()) return;
+          }
+          if (edge_filter(*pred)) {
+            (*compile_time_const_nodes)[pred->src()->id()] = true;
+          }
         }
       }
       return;
@@ -69,17 +91,29 @@ Status BackwardsConstAnalysis(const Graph& g,
 
     // Mark any compile-time constant operator arguments as const.
     std::vector<int> const_input_idxs;
-    status = XlaOpRegistry::CompileTimeConstantInputs(
-        node->def(), node->op_def(), &const_input_idxs);
+    status = GetCompileTimeConstInputs(node, &const_input_idxs, flib_runtime);
 
     if (!status.ok()) {
       return;
     }
 
     for (Edge const* edge : node->in_edges()) {
-      if (absl::c_binary_search(const_input_idxs, edge->dst_input()) &&
+      if (!edge->IsControlEdge() &&
+          absl::c_binary_search(const_input_idxs, edge->dst_input()) &&
           edge_filter(*edge)) {
-        (*compile_time_const_nodes)[edge->src()->id()] = true;
+        // Do not mark IdentityN nodes as compile-time const.
+        // If the src node of the `pred` is an IdentityN do not mark it as a
+        // compile-time const. Only mark the corresponding input to the
+        // IdentityN node as a const.
+        // Note: XLA IdentityN op simply forwards its inputs so this is safe.
+        while (edge_filter(*edge) &&
+               edge->src()->type_string() == "IdentityN") {
+          status = edge->src()->input_edge(edge->src_output(), &edge);
+          if (!status.ok()) return;
+        }
+        if (edge_filter(*edge)) {
+          (*compile_time_const_nodes)[edge->src()->id()] = true;
+        }
       }
     }
   };
@@ -91,4 +125,61 @@ Status BackwardsConstAnalysis(const Graph& g,
   return status;
 }
 
+Status GetCompileTimeConstInputs(const Node* node,
+                                 std::vector<int>* const_input_idxs,
+                                 FunctionLibraryRuntime* flib_runtime) {
+  if (node->type_string() != "While") {
+    return XlaOpRegistry::CompileTimeConstantInputs(node->def(), node->op_def(),
+                                                    const_input_idxs);
+  }
+  // For While nodes, recurse into the body and cond graphs.
+  // TODO(b/124403063): Implement similar functionality for cond nodes and other
+  // functional ops.
+  NameAttrList cond_function;
+  TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "cond", &cond_function));
+  NameAttrList body_function;
+  TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "body", &body_function));
+  FunctionLibraryRuntime::Handle cond_handle;
+  FunctionLibraryRuntime::Handle body_handle;
+  TF_RETURN_IF_ERROR(flib_runtime->Instantiate(
+      cond_function.name(), AttrSlice(&cond_function.attr()), &cond_handle));
+  TF_RETURN_IF_ERROR(flib_runtime->Instantiate(
+      body_function.name(), AttrSlice(&body_function.attr()), &body_handle));
+  const FunctionBody* fcond = flib_runtime->GetFunctionBody(cond_handle);
+  const FunctionBody* fbody = flib_runtime->GetFunctionBody(body_handle);
+  TF_RET_CHECK(fcond);
+  TF_RET_CHECK(fbody);
+  int num_inputs = fbody->fdef.signature().input_arg_size();
+
+  // Stores which of the loop inputs are expected to be compile time constants.
+  std::vector<bool> compile_time_const_arg_indices(num_inputs);
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
+      *(fcond->graph), &compile_time_const_arg_indices,
+      /*compile_time_const_nodes=*/nullptr, flib_runtime));
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
+      *(fbody->graph), &compile_time_const_arg_indices,
+      /*compile_time_const_nodes=*/nullptr, flib_runtime));
+  for (int i = 0; i < num_inputs; i++) {
+    if (compile_time_const_arg_indices[i]) {
+      // Check that this input is actually a loop invariant.
+      // NOTE(srbs): Ideally this should raise an error if the loop body
+      // requires the input at this index to be a compile time const but it is
+      // not a loop invariant. However, that causes problems because const
+      // analysis is performed for the entire graph (in the
+      // MarkForCompilationPass for example) and not just for the ops
+      // that will actually be run using XLA kernels. So we silently return here
+      // and let the error be raised during the actual compilation of the
+      // XLA graph.
+      Node* arg_i = fbody->arg_nodes[i];
+      Node* ret_i = fbody->ret_nodes[i];
+      const Node* ret_i_input_0;
+      TF_RETURN_IF_ERROR(ret_i->input_node(0, &ret_i_input_0));
+      if (ret_i_input_0->id() == arg_i->id()) {
+        const_input_idxs->push_back(i);
+      }
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/const_analysis.h b/tensorflow/compiler/tf2xla/const_analysis.h
index 49b3c6d413c6b637fa825bf182be7cc36e49b6c8..1663cbff41c3e10ba586c60eca475b760dee4896 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.h
+++ b/tensorflow/compiler/tf2xla/const_analysis.h
@@ -34,11 +34,13 @@ namespace tensorflow {
 // `compile_time_const_nodes`, if `compile_time_const_nodes` is not null.
 //
 // Only propagate const-ness along edges for which `edge_filter` returns true.
-Status BackwardsConstAnalysis(const Graph& g,
-                              std::vector<bool>* compile_time_const_arg_indices,
-                              std::vector<bool>* compile_time_const_nodes,
-                              std::function<bool(const Edge&)> edge_filter =
-                                  [](const Edge& e) { return true; });
+Status BackwardsConstAnalysis(
+    const Graph& g, std::vector<bool>* compile_time_const_arg_indices,
+    std::vector<bool>* compile_time_const_nodes,
+    FunctionLibraryRuntime* flib_runtime,
+    std::function<bool(const Edge&)> edge_filter = [](const Edge& e) {
+      return true;
+    });
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc
index 40c6d0e01701d9104a200d9ea27706a0a7c12146..ed5f004550f0cb57e1545436c90bb6a9e8c19652 100644
--- a/tensorflow/compiler/tf2xla/const_analysis_test.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc
@@ -44,8 +44,8 @@ TEST(ConstAnalysisTest, Basics) {
 
   std::vector<bool> const_args(4, false);
   std::vector<bool> const_nodes(root.graph()->num_node_ids(), false);
-  TF_ASSERT_OK(
-      BackwardsConstAnalysis(*root.graph(), &const_args, &const_nodes));
+  TF_ASSERT_OK(BackwardsConstAnalysis(*root.graph(), &const_args, &const_nodes,
+                                      /*flib_runtime=*/nullptr));
 
   // Arg 0 doesn't need to be constant since the graph only uses its shape.
   // Arg 1 must be constant because it flows to the shape argument of a Reshape.
@@ -82,7 +82,8 @@ TEST(ConstAnalysisTest, TopologicalOrder) {
 
     std::vector<bool> const_args(3, false);
     TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
-                                        /*compile_time_const_nodes=*/nullptr));
+                                        /*compile_time_const_nodes=*/nullptr,
+                                        /*flib_runtime=*/nullptr));
 
     EXPECT_EQ(const_args, std::vector<bool>({true, true, false}));
   }
@@ -103,7 +104,8 @@ TEST(ConstAnalysisTest, DontFollowControlDependencies) {
 
   std::vector<bool> const_args(2, false);
   TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
-                                      /*compile_time_const_nodes=*/nullptr));
+                                      /*compile_time_const_nodes=*/nullptr,
+                                      /*flib_runtime=*/nullptr));
 
   EXPECT_EQ(const_args, std::vector<bool>({false, true}));
 }
@@ -128,7 +130,8 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_0) {
 
   std::vector<bool> const_args(2, false);
   TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
-                                      /*compile_time_const_nodes=*/nullptr));
+                                      /*compile_time_const_nodes=*/nullptr,
+                                      /*flib_runtime=*/nullptr));
 
   EXPECT_EQ(const_args, std::vector<bool>({false, false}));
 }
@@ -152,7 +155,8 @@ TEST(ConstAnalysisTest, RespectExplicitAttr_1) {
 
   std::vector<bool> const_args(1, false);
   TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
-                                      /*compile_time_const_nodes=*/nullptr));
+                                      /*compile_time_const_nodes=*/nullptr,
+                                      /*flib_runtime=*/nullptr));
 
   EXPECT_EQ(const_args, std::vector<bool>({true}));
 }
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
deleted file mode 100644
index 64fdbbebc65bff4ed0b965fcdd534cc9696472b6..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
-// debugging.
-
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
-
-#include "tensorflow/compiler/jit/flags.h"
-#include "tensorflow/core/util/dump_graph.h"
-
-namespace tensorflow {
-namespace dump_graph {
-
-string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  return tensorflow::DumpGraphDefToFile(
-      name, graph_def, GetDumpGraphFlags()->tf_dump_graph_prefix);
-}
-
-string DumpGraphToFile(const string& name, Graph const& graph,
-                       const FunctionLibraryDefinition* flib_def) {
-  return tensorflow::DumpGraphToFile(name, graph, flib_def,
-                                     GetDumpGraphFlags()->tf_dump_graph_prefix);
-}
-
-string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  return tensorflow::DumpFunctionDefToFile(
-      name, fdef, GetDumpGraphFlags()->tf_dump_graph_prefix);
-}
-
-}  // namespace dump_graph
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/dump_graph.h b/tensorflow/compiler/tf2xla/dump_graph.h
deleted file mode 100644
index bbf01eb90dbd0478e873da785c3dcb6145096f17..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
-// debugging.
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_H_
-#define TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_H_
-
-#include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/graph/graph.h"
-
-namespace tensorflow {
-namespace dump_graph {
-
-// Dumps 'graph_def' to a file, as a GraphDef text proto. Returns the file name
-// chosen.
-//
-// Automatically picks a file name. Prefixes 'name' with the value of the
-// --tf_dump_graph_prefix flag and suffixes it with ".pbtxt" to form a name.
-// If a graph has already been dumped by this process with the same name,
-// suffixes with "_n.pbtxt", where 'n' is a sequence number.
-string DumpGraphDefToFile(const string& name, GraphDef const& graph_def);
-
-// Similar to DumpGraphDefToFile, but builds the GraphDef to dump from a 'graph'
-// and an optional function library 'flib_def'. Returns the file name chosen.
-string DumpGraphToFile(const string& name, Graph const& graph,
-                       const FunctionLibraryDefinition* flib_def = nullptr);
-
-// Similar to DumpGraphDefToFile, but dumps a function as a FunctionDef text
-// proto. Returns the file name chosen.
-string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef);
-
-}  // namespace dump_graph
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index 7ae96e1d484900e28e8c23c3bb2232401144ad82..6e093400e475ec0316ac072f4ddf1de5bdeec876 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/union_find.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -34,13 +33,53 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 using xla::StatusOr;
 
 namespace tensorflow {
 namespace functionalize_cond {
 
+bool AncestorNode::operator<(const AncestorNode& other) const {
+  return (output_tensor.node->id() < other.output_tensor.node->id()) ||
+         (output_tensor.node->id() == other.output_tensor.node->id() &&
+          output_tensor.index < other.output_tensor.index) ||
+         (output_tensor.node->id() == other.output_tensor.node->id() &&
+          output_tensor.index == other.output_tensor.index &&
+          type < other.type);
+}
+
+bool AncestorNode::operator==(const AncestorNode& other) const {
+  return output_tensor.node->id() == other.output_tensor.node->id() &&
+         output_tensor.index == other.output_tensor.index && type == other.type;
+}
+
+size_t AncestorNode::Hash::operator()(const AncestorNode& ancestor) const {
+  size_t h = std::hash<int>()(ancestor.output_tensor.node->id());
+  h = Hash64Combine(h, std::hash<int>()(ancestor.output_tensor.index));
+  return Hash64Combine(h, std::hash<int>()(static_cast<int>(ancestor.type)));
+}
+
+typedef std::tuple<StateMap::CondId, StateMap::AncestorId, OutputTensor>
+    ClusterTuple;
+
+struct ClusterTupleLessThan {
+  bool operator()(const ClusterTuple& a, const ClusterTuple& b) const {
+    if (std::tie(std::get<0>(a), std::get<1>(a)) <
+        std::tie(std::get<0>(b), std::get<1>(b))) {
+      return true;
+    } else if (std::tie(std::get<0>(a), std::get<1>(a)) ==
+               std::tie(std::get<0>(b), std::get<1>(b))) {
+      return StateMap::OutputTensorLess()(std::get<2>(a), std::get<2>(b));
+    } else {
+      return false;
+    }
+  }
+};
+
 // TODO(jpienaar): Move to OutputTensor.
 string DebugString(const OutputTensor& tensor) {
   return absl::StrCat(tensor.node->name(), ":", tensor.index);
@@ -145,10 +184,10 @@ size_t StateMap::Hash::operator()(const StateMap::AncestorState& map) const {
   if (map.empty()) return 0;
   // Compute hash of the front element.
   auto it = map.begin();
-  size_t h = hash<Node*>()(*it);
+  size_t h = AncestorNode::Hash()(*it);
   for (++it; it != map.end(); ++it) {
     // Combine the has with the different elements in the map.
-    h = Hash64Combine(h, hash<Node*>()(*it));
+    h = Hash64Combine(h, AncestorNode::Hash()(*it));
   }
   return h;
 }
@@ -229,7 +268,17 @@ string StateMap::CondStateToString(StateMap::CondId id) const {
 }
 
 string StateMap::AncestorStateToString(const Node* node) const {
-  if (auto id = LookupAncestorId(node)) return NodesToString(*id);
+  if (auto id = LookupAncestorId(node)) {
+    return absl::StrCat(
+        "{",
+        absl::StrJoin(*id, ",",
+                      [](string* output, const AncestorNode& ancestor) {
+                        absl::StrAppend(output,
+                                        ancestor.output_tensor.node->name(),
+                                        ":", ancestor.output_tensor.index);
+                      }),
+        "}");
+  }
   return "{}";
 }
 
@@ -247,7 +296,9 @@ class Conditional {
   Status AddMerge(Node* m);
 
   // Constructs an If node from the merge nodes.
-  Status BuildAndReplace(Graph* graph, FunctionLibraryDefinition* library);
+  Status BuildAndReplace(
+      Graph* graph, FunctionLibraryDefinition* library,
+      std::unordered_map<Node*, OutputTensor>* merge_to_replacement);
 
  private:
   // Extracts the then/else bodies: creates new graphs with the nodes
@@ -262,10 +313,15 @@ class Conditional {
   Status BuildIfNode(Graph* graph, FunctionLibraryDefinition* library);
 
   // Adds input edges to If node.
-  Status AddInputEdges(Graph* graph);
+  Status AddInputEdges(
+      Graph* graph,
+      const std::unordered_map<Node*, OutputTensor>& merge_to_replacement);
 
   // Adds output edges from If node.
-  Status AddOutputEdges(Graph* graph);
+  // Record new output tensor for all Merge nodes in 'merge_to_replacement'.
+  Status AddOutputEdges(
+      Graph* graph,
+      std::unordered_map<Node*, OutputTensor>* merge_to_replacement);
 
   // Adds switch node that is part of this conditional.
   Status AddSwitch(Node* s);
@@ -564,7 +620,32 @@ Status Conditional::ExtractBodies(Graph* graph) {
             stack.push_back(src);
           }
         } else if (e->IsControlEdge()) {
-          external_control_inputs_.push_back(src);
+          // Here we have a control flow edge between src and dst that are not
+          // in the same context. This is an external control dependency except
+          // for one case: where the only difference between CondId of e->src()
+          // and CondId of e->dst() is that e->src() has {PRED, kNeither} and
+          // e->dst() has {PRED, kThenBranch/kElseBranch}. This happens in
+          // gradients code for tf.cond(), where e->src() is a control pivot
+          // node for a branch and e->dst() is a data node in that branch.
+          bool is_external_control_input = true;
+          if (!state_map_->IsEmpty(src_id) && !state_map_->IsEmpty(dst_id)) {
+            std::vector<StateMap::CondState::value_type> diff;
+            std::set_symmetric_difference(
+                src_id->begin(), src_id->end(), dst_id->begin(), dst_id->end(),
+                std::back_inserter(diff), CondStateLess());
+            if (diff.size() == 2 && diff[0].first == diff[1].first &&
+                (diff[0].second == BranchType::kNeither ||
+                 diff[1].second == BranchType::kNeither)) {
+              auto src_branch = src_id->find(diff[0].first);
+              if (src_branch != src_id->end() &&
+                  src_branch->second == BranchType::kNeither) {
+                is_external_control_input = false;
+              }
+            }
+          }
+          if (is_external_control_input) {
+            external_control_inputs_.push_back(src);
+          }
         } else {
           // This shouldn't happen, this means we have an external data input
           // not entering via a switch node. Work around this by for
@@ -654,7 +735,7 @@ Status Conditional::BuildIfNode(Graph* graph,
 
     VLOG(3) << "FunctionalizeControlFlow (" << branch_name[branch_index]
             << "): "
-            << dump_graph::DumpGraphToFile(
+            << DumpGraphToFile(
                    "functionalize_cond_body_" + branch_name[branch_index],
                    *bodies_[branch_index], nullptr);
 
@@ -705,9 +786,9 @@ Status Conditional::BuildIfNode(Graph* graph,
   }
   builder.Device(predicate_.node->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(),
-                                        predicate_.index,
-                                        predicate_.node->output_type(0)));
+  builder.Input(
+      NodeDefBuilder::NodeOut(predicate_.node->name(), predicate_.index,
+                              predicate_.node->output_type(predicate_.index)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -720,12 +801,29 @@ Status Conditional::BuildIfNode(Graph* graph,
   return Status::OK();
 }
 
-Status Conditional::AddInputEdges(Graph* graph) {
+Status Conditional::AddInputEdges(
+    Graph* graph,
+    const std::unordered_map<Node*, OutputTensor>& merge_to_replacement) {
   VLOG(2) << "AddInputEdges for " << if_node_->name();
   int index = 0;
   // Add predicate input.
-  graph->AddEdge(const_cast<Node*>(predicate_.node), predicate_.index, if_node_,
-                 index++);
+  if (predicate_.node->IsMerge()) {
+    // If the predicate is a Merge node, we should not use Merge output as
+    // predicate. Instead, we should use the corresponding If output in
+    // 'merge_to_replacement'. Otherwise, this Conditional's If node is still
+    // connected to the predicate Merge node; and when we call
+    // DeleteReachableAndDeadNodes(), the predicate Merge node and this
+    // Conditional's If node will be removed.
+    auto iter = merge_to_replacement.find(predicate_.node);
+    if (iter == merge_to_replacement.end()) {
+      return errors::Internal("Cannot find replacement for Merge node ",
+                              predicate_.node->name());
+    }
+    graph->AddEdge(iter->second.node, iter->second.index, if_node_, index++);
+  } else {
+    graph->AddEdge(const_cast<Node*>(predicate_.node), predicate_.index,
+                   if_node_, index++);
+  }
   // Add function body inputs.
   for (auto& arg : cond_arg_nodes_) {
     if (arg.src_output == Graph::kControlSlot) {
@@ -740,7 +838,9 @@ Status Conditional::AddInputEdges(Graph* graph) {
   return Status::OK();
 }
 
-Status Conditional::AddOutputEdges(Graph* graph) {
+Status Conditional::AddOutputEdges(
+    Graph* graph,
+    std::unordered_map<Node*, OutputTensor>* merge_to_replacement) {
   VLOG(2) << "AddOutputEdges for " << if_node_->name();
   int i = 0;
   for (Node* node : merges_) {
@@ -764,6 +864,10 @@ Status Conditional::AddOutputEdges(Graph* graph) {
         graph->AddEdge(if_node_, i, dst, dst_input);
       }
     }
+
+    // Record corresponding output tensor in 'merge_to_replacement'.
+    (*merge_to_replacement)[node] = OutputTensor{if_node_, i};
+
     ++i;
   }
   for (Node* n : external_control_outputs_) {
@@ -773,8 +877,9 @@ Status Conditional::AddOutputEdges(Graph* graph) {
   return Status::OK();
 }
 
-Status Conditional::BuildAndReplace(Graph* graph,
-                                    FunctionLibraryDefinition* library) {
+Status Conditional::BuildAndReplace(
+    Graph* graph, FunctionLibraryDefinition* library,
+    std::unordered_map<Node*, OutputTensor>* merge_to_replacement) {
   VLOG(1) << "Build If and replace merge nodes "
           << NodesToString(this->merges_);
   if (replaced_) return Status::OK();
@@ -793,8 +898,8 @@ Status Conditional::BuildAndReplace(Graph* graph,
   }
 
   TF_RETURN_IF_ERROR(BuildIfNode(graph, library));
-  TF_RETURN_IF_ERROR(AddInputEdges(graph));
-  TF_RETURN_IF_ERROR(AddOutputEdges(graph));
+  TF_RETURN_IF_ERROR(AddInputEdges(graph, *merge_to_replacement));
+  TF_RETURN_IF_ERROR(AddOutputEdges(graph, merge_to_replacement));
   TF_RETURN_IF_ERROR(parent_->PropagateUpdatedState(if_node_));
 
   // Check that the if_node doesn't feed into itself.
@@ -913,10 +1018,18 @@ StatusOr<StateMap::CondId> FunctionalizeCond::JoinCondStatesNonMerge(
       both.insert(kv);
     } else {
       if (it->second != kv.second) {
-        return errors::InvalidArgument(
-            "Graph contains node with inputs predicated on incompatible "
-            "predicates: ",
-            DebugString(src), " and ", DebugString(dst));
+        if (it->second == BranchType::kNeither) {
+          // BranchType for 'src' is kNeither. Use the BranchType in 'dst'.
+          it->second = kv.second;
+        } else if (kv.second == BranchType::kNeither) {
+          // BranchType for 'dst' is kNeither. Use the BranchType in 'src'.
+          // No need to change it->second.
+        } else {
+          return errors::InvalidArgument(
+              "Graph contains node with inputs predicated on incompatible "
+              "predicates: ",
+              DebugString(src), " and ", DebugString(dst));
+        }
       }
     }
   }
@@ -936,6 +1049,10 @@ StatusOr<StateMap::CondId> FunctionalizeCond::JoinCondStatesMerge(
   VLOG(4) << "Joining (for merge) " << DebugString(src) << " and "
           << DebugString(dst);
   if (state_map_.IsEmpty(dst)) return src;
+  if (state_map_.IsEmpty(src)) {
+    return errors::Internal("Merge node ", merge->name(),
+                            " has input that's not in any CondContext.");
+  }
 
   if (state_map_.IsDead(src)) return src;
   if (state_map_.IsDead(dst)) return dst;
@@ -981,7 +1098,17 @@ StateMap::CondId FunctionalizeCond::StateAlongEdge(const Edge* e) {
     if (id != nullptr) state = *id;
     OutputTensor predicate;
     TF_CHECK_OK(GetSwitchPredicate(*src, &predicate));
-    if (!e->IsControlEdge()) {
+    if (e->IsControlEdge()) {
+      // In gradients of tf.cond(), in each branch, we have a NoOp node as
+      // control pivot. These NoOp nodes have control dependency from Switch
+      // node. If we don't record this into CondState, branches might have
+      // incorrect CondState (e.g. if the branch only has a Const data node).
+      // We set it to kNeither because there is no way to tell whether it's
+      // for true branch or false branch. This node's desendents might have
+      // other incoming edges with defined BranchType, and we correctly handle
+      // merging kNeither with other defined BranchType in StateAlongEdge().
+      state[predicate] = BranchType::kNeither;
+    } else {
       state[predicate] = BranchType(e->src_output());
     }
     return state_map_.GetCondId(state);
@@ -1170,8 +1297,17 @@ Status FunctionalizeCond::DetermineAncestorState(Node* dst) {
     if (other_id != id && other_id != nullptr) {
       state.insert(other_id->begin(), other_id->end());
     }
-    if (IsSwitch(src) || IsMerge(src)) {
-      state.insert(src);
+    if (IsMerge(src)) {
+      state.insert({{src, 0}, AncestorNode::AncestorNodeType::kMerge});
+    } else if (IsSwitch(src)) {
+      OutputTensor pred;
+      // For dead switch nodes, GetSwitchPredicate() will fail, and we use
+      // the switch node directly as ancestor.
+      if (GetSwitchPredicate(*src, &pred).ok()) {
+        state.insert({pred, AncestorNode::AncestorNodeType::kPred});
+      } else {
+        state.insert({{src, 0}, AncestorNode::AncestorNodeType::kSwitch});
+      }
     }
     return state_map_.GetAncestorId(state);
   };
@@ -1317,16 +1453,30 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   // Sort the merge nodes from innermost outwards.
   SortMergeNodes(&merge_order);
 
-  // Cluster merge nodes by CondId and AncestorId in order of nesting.
-  using ClusterPair = std::pair<StateMap::CondId, StateMap::AncestorId>;
+  // Cluster merge nodes by (CondId, AncestorId, predicate) in order of
+  // nesting. (CondId, AncestorId) is not enough, e.g.
+  //   pred1 = array_ops.placeholder(dtypes.bool, name='pred1')
+  //   pred2 = array_ops.placeholder(dtypes.bool, name='pred2')
+  //   cond1 = control_flow_ops.cond(pred1, ...)
+  //   cond2 = control_flow_ops.cond(pred2, ...)
+  //   cond3 = control_flow_ops.cond(pred1, use cond1 and cond2)
+  //   cond4 = control_flow_ops.cond(pred2, use cond1 and cond2)
+  // cond3 and cond4 have the same (CondId, AncestorId), but they should not
+  // be merged into one "If" node (because they have different predicates).
   std::deque<std::vector<Node*>> merge_clusters;
-  std::map<ClusterPair, int> merge_cluster_index;
+  std::map<ClusterTuple, int, ClusterTupleLessThan> merge_cluster_index;
   for (Node* merge : merge_order) {
     auto cond_id = state_map_.LookupCondId(merge);
     if (state_map_.IsDead(cond_id)) continue;
 
-    ClusterPair key =
-        std::make_pair(cond_id, state_map_.LookupAncestorId(merge));
+    auto predicate = merge_to_predicate_.find(merge);
+    if (predicate == merge_to_predicate_.end()) {
+      return errors::Internal("Cannot find predicate for Merge node ",
+                              merge->name());
+    }
+
+    ClusterTuple key = std::make_tuple(
+        cond_id, state_map_.LookupAncestorId(merge), predicate->second);
     auto idx = merge_cluster_index.find(key);
     if (idx == merge_cluster_index.end()) {
       merge_cluster_index[key] = merge_clusters.size();
@@ -1345,7 +1495,8 @@ Status FunctionalizeCond::FunctionalizeInternal() {
     Conditional cond(merge_to_predicate_.at(cluster.front()), this,
                      &state_map_);
     for (Node* merge : cluster) TF_RETURN_IF_ERROR(cond.AddMerge(merge));
-    TF_RETURN_IF_ERROR(cond.BuildAndReplace(graph_, library_));
+    TF_RETURN_IF_ERROR(
+        cond.BuildAndReplace(graph_, library_, &merge_to_replacement_));
 
     if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract");
   }
@@ -1365,9 +1516,8 @@ void FunctionalizeCond::DumpGraphWithCondState(const string& name) {
                             state_map_.AncestorStateToString(n)));
   }
   LOG(INFO) << "FunctionalizeControlFlow (" << name << "): "
-            << dump_graph::DumpGraphToFile(
-                   absl::StrCat("functionalize_cond_", name), *graph_,
-                   library_);
+            << DumpGraphToFile(absl::StrCat("functionalize_cond_", name),
+                               *graph_, library_);
 }
 
 void FunctionalizeCond::AddSwitchId(int switch_id) {
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h
index 8525d7af61b4471e53a9ae16b081060bfd234c9c..d85800fb8ee65a354716bf6601c6bc40eca9a10d 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.h
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.h
@@ -43,6 +43,33 @@ enum class BranchType {
   kNeither = 3,
 };
 
+// When we keep track of which switch/merge node's feed into a node, we record
+// 1) predicate for non-dead switch node,
+// 2) the switch node itself for dead switch node,
+// 3) the merge node itself for merge node.
+// Case 1) is an optimization. With this optimization, if there are nodes from
+// different switch nodes but those switch nodes have the same predicate, the
+// nodes will still have same AncestorState, and they will be clustered into a
+// single "If".
+struct AncestorNode {
+  enum class AncestorNodeType {
+    kPred = 0,
+    kSwitch = 1,
+    kMerge = 2,
+  };
+
+  OutputTensor output_tensor;
+  AncestorNodeType type;
+
+  // Compare two AncestorNodes by (node id, index, type).
+  bool operator<(const AncestorNode& other) const;
+  bool operator==(const AncestorNode& other) const;
+
+  struct Hash {
+    size_t operator()(const AncestorNode&) const;
+  };
+};
+
 // StateMap is responsible for mapping from each graph Node to
 // * a CondState, where each CondState is a map from predicate to branch (i,e.,
 //   what predicates have to hold or not hold).
@@ -68,7 +95,7 @@ class StateMap {
   using CondId = const CondState*;
 
   // Keep track of which switch/merge node's feed into a node's values.
-  using AncestorState = std::set<Node*>;
+  using AncestorState = std::set<AncestorNode>;
 
   // Every unique ID is mapped to a AncestorState.
   using AncestorId = const AncestorState*;
@@ -232,6 +259,9 @@ class FunctionalizeCond {
   // Mapping from merge nodes to predicate.
   std::unordered_map<Node*, OutputTensor> merge_to_predicate_;
 
+  // Mapping from merge nodes to corresponding If node outputs.
+  std::unordered_map<Node*, OutputTensor> merge_to_replacement_;
+
   FunctionLibraryDefinition* library_;
   Graph* graph_;
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
index b0aabd63bbda784b3b7103a438ce025eea0cd93b..05fa1ee92dc172bd11cec9f99e3884996e00791f 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
@@ -101,6 +101,17 @@ TEST_F(FunctionalizeCondTest, JoinCondStates) {
   TF_EXPECT_OK(t.status());
 }
 
+TEST_F(FunctionalizeCondTest, JoinCondStatesMergeWithInputNotInCondContext) {
+  Tensor val_tensor(DT_INT32, TensorShape());
+  val_tensor.flat<int>().setZero();
+  Node* val = test::graph::Constant(graph_.get(), val_tensor, "val");
+  Node* m = test::graph::Merge(graph_.get(), val, val);
+
+  StateMap::CondState cond_state;
+  auto joined_or = JoinCondStatesMerge(m, /*src=*/nullptr, &cond_state);
+  EXPECT_FALSE(joined_or.ok());
+}
+
 }  // namespace
 }  // namespace functionalize_cond
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 3dfd3f854c8646ebbf06d3378201d22e8741b7eb..9fe25dfe3e7d3cf6970851fb9a6b0c56c044da94 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/union_find.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_cond.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
 #include "tensorflow/compiler/tf2xla/functionalize_while.h"
@@ -43,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -50,8 +50,7 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
                                 Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "FunctionalizeControlFlow (initial): "
-          << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
-                                         library);
+          << DumpGraphToFile("functionalize_initial", *graph, library);
 
   // Functionalize and remove while loops from graph.
   TF_RETURN_IF_ERROR(FunctionalizeWhileLoop(lookup_library, graph, library));
@@ -62,8 +61,7 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
   TF_RETURN_IF_ERROR(FunctionalizeCond(graph, library));
 
   VLOG(2) << "FunctionalizeControlFlow (final): "
-          << dump_graph::DumpGraphToFile("functionalize_final", *graph,
-                                         library);
+          << DumpGraphToFile("functionalize_final", *graph, library);
 
   return Status::OK();
 }
@@ -200,13 +198,13 @@ Status FunctionalizeControlFlowForFunction(
 
     // Functionalize the function body.
     if (VLOG_IS_ON(4)) {
-      dump_graph::DumpGraphToFile(
+      DumpGraphToFile(
           absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
           *g, fld);
     }
     TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld));
     if (VLOG_IS_ON(4)) {
-      dump_graph::DumpGraphToFile(
+      DumpGraphToFile(
           absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g,
           fld);
     }
@@ -234,8 +232,8 @@ Status FunctionalizeControlFlowPass::Run(
     const GraphOptimizationPassOptions& options) {
   Graph* graph = options.graph->get();
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("functionalize_control_flow_before", *graph,
-                                options.flib_def);
+    DumpGraphToFile("functionalize_control_flow_before", *graph,
+                    options.flib_def);
   }
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
       new ProcessFunctionLibraryRuntime(
@@ -279,8 +277,8 @@ Status FunctionalizeControlFlowPass::Run(
   }
 
   if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("functionalize_control_flow_after", *graph,
-                                options.flib_def);
+    DumpGraphToFile("functionalize_control_flow_after", *graph,
+                    options.flib_def);
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc
index d87436a7b4ac37c74d0f0df921779c8716290013..517924bfc71d977bdf4222dbfbbfcb692f544f5b 100644
--- a/tensorflow/compiler/tf2xla/functionalize_while.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_while.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/union_find.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_cond.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
@@ -36,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace {
@@ -293,8 +293,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
                          Graph* graph, Frame* frame,
                          FunctionLibraryDefinition* library) {
   VLOG(2) << "Frame " << frame->name << " before: "
-          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
-                                         library);
+          << DumpGraphToFile("functionalize_before", *graph, library);
 
   // Split loop-varying Enter nodes with multiple successors. If the same
   // Tensor is fed as input to multiple loop arguments, we may end up with a
@@ -490,8 +489,8 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
   TF_RETURN_IF_ERROR(FunctionalizeCond(body_graph.get(), library));
 
   VLOG(2) << "Frame " << frame->name << " condition: "
-          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
-          << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
+          << DumpGraphToFile("loop_condition", *cond_graph, library)
+          << " body: " << DumpGraphToFile("loop_body", *body_graph);
 
   static std::atomic<int64> sequence_num(0LL);
   int64 id = ++sequence_num;
@@ -585,8 +584,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
   frame->parent->nodes.insert(while_node);
 
   VLOG(2) << "Frame " << frame->name << " after: "
-          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
-                                         library);
+          << DumpGraphToFile("functionalize_after", *graph, library);
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4535ece374ceb801e450af98a21d5a4c5e8f2a29
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace {
+Status GetTestDevice(Session* session, string* test_device) {
+  std::vector<DeviceAttributes> devices;
+  TF_RETURN_IF_ERROR(session->ListDevices(&devices));
+
+  bool found_cpu = absl::c_any_of(devices, [&](const DeviceAttributes& device) {
+    return device.device_type() == "CPU";
+  });
+
+  bool found_gpu = absl::c_any_of(devices, [&](const DeviceAttributes& device) {
+    return device.device_type() == "GPU";
+  });
+
+  if (!found_gpu && !found_cpu) {
+    return errors::Internal("Expected at least one CPU or GPU!");
+  }
+
+  *test_device = found_gpu ? "GPU" : "CPU";
+  VLOG(2) << "Using test device " << *test_device;
+  return Status::OK();
+}
+
+void FillZeros(Tensor* tensor) {
+  auto flat = tensor->flat<float>();
+  for (int i = 0; i < flat.size(); i++) {
+    flat.data()[i] = 0.0f;
+  }
+}
+
+// This tests check that the implementation outputs from FusedBatchnorm
+// training, reserve_space_{1|2}, are what we assume them to be in the TF/XLA
+// lowering.
+//
+// If this test starts failing then it doesn't indicate that TF/cudnn have
+// violated their contract, but it indicates that we need to update the TF/XLA
+// lowering for FusedBatchnorm training to match the new implementation defined
+// behavior.
+TEST(FusedBatchnormReserveSpaceTest, Test) {
+  using ::tensorflow::ops::Const;
+  using ::tensorflow::ops::FusedBatchNorm;
+
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions{}));
+
+  string test_device;
+  TF_ASSERT_OK(GetTestDevice(session.get(), &test_device));
+
+  Scope root = tensorflow::Scope::NewRootScope();
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+
+  Tensor scale_data(DT_FLOAT, TensorShape({10}));
+  FillZeros(&scale_data);
+  Output scale =
+      Const(root.WithOpName("scale"), Input::Initializer(scale_data));
+
+  Tensor offset_data(DT_FLOAT, TensorShape({10}));
+  FillZeros(&offset_data);
+  Output offset =
+      Const(root.WithOpName("offset"), Input::Initializer(offset_data));
+
+  Tensor mean_data(DT_FLOAT, TensorShape({0}));
+  Output mean = Const(root.WithOpName("offset"), Input::Initializer(mean_data));
+
+  Tensor variance_data(DT_FLOAT, TensorShape({0}));
+  Output variance =
+      Const(root.WithOpName("variance"), Input::Initializer(variance_data));
+
+  string tf_device = absl::StrCat("/device:", test_device, ":0");
+  string xla_device = absl::StrCat("/device:XLA_", test_device, ":0");
+
+  FusedBatchNorm fused_batch_norm_tf(
+      root.WithOpName("fused_batch_norm_tf").WithDevice(tf_device), input,
+      scale, offset, mean, variance, FusedBatchNorm::Attrs{}.IsTraining(true));
+  FusedBatchNorm fused_batch_norm_xla(
+      root.WithOpName("fused_batch_norm_xla").WithDevice(xla_device), input,
+      scale, offset, mean, variance, FusedBatchNorm::Attrs{}.IsTraining(true));
+
+  tensorflow::GraphDef graph;
+  TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+  TF_ASSERT_OK(session->Create(graph));
+
+  Tensor input_data(DT_FLOAT, TensorShape({10, 10, 10, 10}));
+  auto flat_input = input_data.flat<float>();
+  for (int i = 0; i < flat_input.size(); i++) {
+    flat_input.data()[i] = (i - 5) / 1000.0f;
+  }
+
+  std::vector<Tensor> results;
+  TF_ASSERT_OK(session->Run({{"input", input_data}},
+                            {fused_batch_norm_tf.reserve_space_1.name(),
+                             fused_batch_norm_xla.reserve_space_1.name(),
+                             fused_batch_norm_tf.reserve_space_2.name(),
+                             fused_batch_norm_xla.reserve_space_2.name()},
+                            {}, &results));
+
+  test::ExpectClose(results[0], results[1], /*atol=*/1e-4);
+  test::ExpectClose(results[2], results[3], /*atol=*/1e-4);
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 0c2bb0223905b22613a64ad54f07151f7f8590b2..e80b6f50ac37f71e7cb15289b471ccc1310b922b 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
@@ -34,7 +33,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -56,9 +58,9 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
   auto client = ctx->compiler()->client();
   std::vector<bool> arg_must_be_compile_time_constant(expressions.size());
 
-  TF_RETURN_IF_ERROR(
-      BackwardsConstAnalysis(*graph, &arg_must_be_compile_time_constant,
-                             /*compile_time_const_nodes=*/nullptr));
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
+      *graph, &arg_must_be_compile_time_constant,
+      /*compile_time_const_nodes=*/nullptr, ctx->function_library()));
 
   args->resize(expressions.size());
   for (int i = 0; i < args->size(); ++i) {
@@ -87,8 +89,13 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
         }
         break;
       case XlaExpression::Kind::kResource:
+        // TODO(b/126601755): This is a fairly common use case in TF 2.0 that
+        // we can hit when inlining is disabled or fails.
         return errors::Unimplemented(
             "Resource as function argument is not yet implemented.");
+      case XlaExpression::Kind::kTensorList:
+        return errors::Unimplemented(
+            "TensorList as function argument is not yet implemented.");
       case XlaExpression::Kind::kInvalid:
         return errors::InvalidArgument("Invalid function argument");
     }
@@ -121,6 +128,8 @@ Status GraphCompiler::Compile() {
 
   for (Node* n : topo_sorted_nodes) {
     OpKernel* op_kernel_raw = nullptr;
+    // The kernel is not actually run for functional ops, we just need it
+    // for metadata.
     Status s = flib_->CreateKernel(n->def(), &op_kernel_raw);
     // Transfer ownership of the kernel to a local smart pointer.
     std::unique_ptr<OpKernel> op_kernel(op_kernel_raw);
@@ -154,7 +163,7 @@ Status GraphCompiler::Compile() {
 
     OpKernelContext op_context(&params, n->num_outputs());
     VLOG(3) << "Translating " << params.op_kernel->name();
-    if (IsFunctional(n)) {
+    if (IsFunctionCall(*flib_->GetFunctionLibraryDefinition(), *n)) {
       TF_RETURN_IF_ERROR(CompileFunctionalNode(n, &op_context));
     } else {
       device_->Compute(CHECK_NOTNULL(params.op_kernel), &op_context);
@@ -179,15 +188,37 @@ Status GraphCompiler::Compile() {
   return Status::OK();
 }
 
-bool GraphCompiler::IsFunctional(Node* n) {
-  return n->type_string() == FunctionLibraryDefinition::kGradientOp ||
-         (flib_->GetFunctionLibraryDefinition()->Find(n->def().op()) !=
-          nullptr);
+namespace {
+
+Status GetFunctionNameAndAttr(const FunctionLibraryRuntime& flib,
+                              const Node& node, NameAttrList* func) {
+  if (node.IsPartitionedCall()) {
+    const AttrValue* attr_value;
+    TF_RETURN_IF_ERROR(
+        node.attrs().Find(FunctionLibraryDefinition::kFuncAttr, &attr_value));
+    if (!attr_value->has_func()) {
+      return errors::InvalidArgument(
+          "The attribute value for attribute 'f' in node ", node.DebugString(),
+          " does not have 'func' field set");
+    }
+    *func = attr_value->func();
+    return Status::OK();
+  }
+
+  if (flib.GetFunctionLibraryDefinition()->Find(node.def().op())) {
+    func->set_name(node.type_string());
+  } else {
+    func->set_name(FunctionLibraryDefinition::kGradientOp);
+  }
+  *func->mutable_attr() = node.def().attr();
+  return Status::OK();
 }
 
+}  // namespace
+
 Status GraphCompiler::CompileFunctionalNode(Node* n,
                                             OpKernelContext* op_context) {
-  TF_RET_CHECK(IsFunctional(n));
+  TF_RET_CHECK(IsFunctionCall(*flib_->GetFunctionLibraryDefinition(), *n));
   // For functional nodes, compile them using compiler from the context and call
   // into the functions.
   XlaOpKernelContext xla_op_context(op_context);
@@ -198,12 +229,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   XlaCompiler* compiler = xla_op_context.compiler();
 
   NameAttrList func;
-  if (flib_->GetFunctionLibraryDefinition()->Find(n->def().op())) {
-    func.set_name(n->def().op());
-  } else {
-    func.set_name(FunctionLibraryDefinition::kGradientOp);
-  }
-  *func.mutable_attr() = n->def().attr();
+  TF_RETURN_IF_ERROR(GetFunctionNameAndAttr(*flib_, *n, &func));
 
   std::vector<const XlaExpression*> expressions;
 
@@ -224,7 +250,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
       PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments));
 
   bool add_token_input_output =
-      HasNodeAttr(n->def(), kXlaTokenInputNodesAttrName);
+      func.attr().find(kXlaTokenInputNodesAttrName) != func.attr().end();
 
   XlaCompiler::CompileOptions compile_options;
   compile_options.is_entry_computation = false;
@@ -244,8 +270,9 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   }
   if (add_token_input_output) {
     std::vector<string> token_input_nodes;
-    TF_RETURN_IF_ERROR(
-        GetNodeAttr(n->def(), kXlaTokenInputNodesAttrName, &token_input_nodes));
+    TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(&func.attr()),
+                                   kXlaTokenInputNodesAttrName,
+                                   &token_input_nodes));
     std::vector<xla::XlaOp> token_inputs;
     for (const string& node_name : token_input_nodes) {
       auto token_or = compiler->GetNodeToken(node_name);
@@ -281,6 +308,7 @@ void GraphCompiler::PartiallySetupParams(OpKernelContext::Params* params) {
   params->inputs = &tensor_inputs_;
   params->step_container = step_container_;
   params->resource_manager = device_->resource_manager();
+  params->function_library = flib_;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.h b/tensorflow/compiler/tf2xla/graph_compiler.h
index e9f02201cf6bed5495dff7dff76c5bafe7771516..eb02534e7fb42cb5c4f1df710debcafd76594c07 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.h
+++ b/tensorflow/compiler/tf2xla/graph_compiler.h
@@ -73,10 +73,6 @@ class GraphCompiler {
   // across multiple nodes visit.
   void PartiallySetupParams(OpKernelContext::Params* params);
 
-  // Tests if a node is a functional node. A functional node represents a
-  // defined computation and should be compiled using `compiler_`.
-  bool IsFunctional(Node* n);
-
   // Compiles a functional node and writes result to OpkernelContext. A
   // functional node represents a defined computation and should be compiled
   // using `compiler_`.
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 47209d285f1a077fd80f779a406e6980892f1646..cf29778688887e9b787f4865b88db4438a1ac88d 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -33,7 +33,9 @@ tf_kernel_library(
         "diag_op.cc",
         "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
+        "einsum_op.cc",
         "elu_op.cc",
+        "empty_op.cc",
         "extract_image_patches_op.cc",
         "fake_param_op.cc",
         "fake_quantize_ops.cc",
@@ -63,6 +65,7 @@ tf_kernel_library(
         "qr_op.cc",
         "quantize_and_dequantize_op.cc",
         "random_ops.cc",
+        "random_ops_util.h",
         "reduce_window_op.cc",
         "reduction_ops.cc",
         "reduction_ops.h",
@@ -88,6 +91,7 @@ tf_kernel_library(
         "sparse_to_dense_op.cc",
         "split_op.cc",
         "stack_ops.cc",
+        "stateful_random_ops.cc",
         "stateless_random_ops.cc",
         "strided_slice_op.cc",
         "tensor_array_ops.cc",
@@ -106,18 +110,24 @@ tf_kernel_library(
         "xla_pad_op.cc",
         "xla_reduce_op.cc",
         "xla_select_and_scatter_op.cc",
+        "xla_self_adjoint_eig_op.cc",
+        "xla_svd_op.cc",
     ],
     hdrs = [
         "index_ops.h",
         "shape_util.h",
     ],
+    tags = ["optonly"],
     deps = [
+        ":case_op",
         ":conv_op_helpers",
         ":if_op",
+        ":tensor_list_utils",
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:broadcast",
+        "//tensorflow/compiler/tf2xla/lib:data_format",
         "//tensorflow/compiler/tf2xla/lib:random",
         "//tensorflow/compiler/tf2xla/lib:scatter",
         "//tensorflow/compiler/tf2xla/lib:util",
@@ -133,7 +143,6 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:loops",
         "//tensorflow/compiler/xla/client/lib:math",
@@ -142,32 +151,46 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client/lib:prng",
         "//tensorflow/compiler/xla/client/lib:qr",
         "//tensorflow/compiler/xla/client/lib:quantize",
+        "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
+        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/compiler/xla/client/lib:sorting",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/compiler/xla/client/lib:svd",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_bounds_check",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:linalg_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:sparse_ops_op_lib",
         "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
+        "//tensorflow/core:stateful_random_ops_op_lib",
         "//tensorflow/core:stateless_random_ops_op_lib",
-        "//tensorflow/core/kernels:bounds_check",
-        "//tensorflow/core/kernels:concat_lib",
+        "//tensorflow/core:training_ops_op_lib",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:control_flow_ops",
-        "//tensorflow/core/kernels:conv_ops",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:list_kernels",
-        "//tensorflow/core/kernels:no_op",
-        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels:partitioned_function_ops",
         "//tensorflow/core/kernels:pooling_ops",
         "//tensorflow/core/kernels:random_op",
         "//tensorflow/core/kernels:resource_variable_ops",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:sparse_to_dense_op",
         "//tensorflow/core/kernels:stack_ops",
+        "//tensorflow/core/kernels:stateful_random_ops",
         "//tensorflow/core/kernels:training_ops",
-        "//tensorflow/core/kernels:transpose_op",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -212,28 +235,46 @@ cc_library(
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/core:framework",
-        "//tensorflow/core/kernels:bounds_check",
+        "//tensorflow/core:framework_bounds_check",
         "//tensorflow/core/kernels:conv_ops",
-        "//tensorflow/core/kernels:ops_util",
         "@com_google_absl//absl/types:span",
     ],
 )
 
+cc_library(
+    name = "tensor_list_utils",
+    srcs = ["tensor_list_utils.cc"],
+    hdrs = ["tensor_list_utils.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_kernel_library(
     name = "while_op",
     srcs = ["while_op.cc"],
     hdrs = ["while_op.h"],
     deps = [
+        ":tensor_list_utils",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -254,6 +295,23 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "case_op",
+    srcs = ["case_op.cc"],
+    hdrs = ["case_op.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:side_effect_util",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 # Kernels that have a dummy (no-op) implementation.
 tf_kernel_library(
     name = "xla_dummy_ops",
@@ -287,9 +345,8 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_bounds_check",
         "//tensorflow/core:lib",
-        "//tensorflow/core/kernels:argmax_op",
-        "//tensorflow/core/kernels:bounds_check",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
index 41a453da80dec6b6f57a4d222e2c33ef6b786a10..b8d853bc36bf1682326c9728f9262522a6606a32 100644
--- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace {
@@ -30,9 +32,39 @@ class AddNOp : public XlaOpKernel {
     OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
                 errors::InvalidArgument("AddN requires at least one argument"));
 
-    xla::XlaOp sum = ctx->Input(0);
-    for (int i = 1; i < ctx->num_inputs(); ++i) {
-      sum = xla::Add(sum, ctx->Input(i));
+    XlaExpression::Kind kind = ctx->InputExpression(0).kind();
+    xla::XlaOp sum;
+    switch (kind) {
+      case XlaExpression::Kind::kTensorList: {
+        OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(0), &sum));
+        TensorShape sum_shape;
+        OP_REQUIRES_OK(ctx,
+                       GetTensorListBufferShape(ctx->Input(0), &sum_shape));
+        for (int i = 1; i < ctx->num_inputs(); ++i) {
+          xla::XlaOp operand;
+          OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(i), &operand));
+          // Check that the shapes match.
+          TensorShape operand_shape;
+          OP_REQUIRES_OK(
+              ctx, GetTensorListBufferShape(ctx->Input(i), &operand_shape));
+          OP_REQUIRES(
+              ctx, sum_shape.dim_sizes() == operand_shape.dim_sizes(),
+              errors::InvalidArgument(
+                  "TensorList arguments to AddN must all have the same ",
+                  "shape.\n", "Expected: ", sum_shape.DebugString(), "\n",
+                  "Found: ", operand_shape.DebugString()));
+          sum = xla::Add(sum, operand);
+        }
+        xla::XlaOp push_index;
+        OP_REQUIRES_OK(ctx, GetTensorListPushIndex(ctx->Input(0), &push_index));
+        OP_REQUIRES_OK(ctx, BuildTensorList(sum, push_index, &sum));
+        break;
+      }
+      default:
+        sum = ctx->Input(0);
+        for (int i = 1; i < ctx->num_inputs(); ++i) {
+          sum = xla::Add(sum, ctx->Input(i));
+        }
     }
 
     ctx->SetOutput(0, sum);
@@ -42,7 +74,7 @@ class AddNOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(AddNOp);
 };
 
-REGISTER_XLA_OP(Name("AddN"), AddNOp);
+REGISTER_XLA_OP(Name("AddN").AllowVariantTypes(), AddNOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 795ea09831e183a26fb3498b9bbaf9c3adaef9ed..5554d7a377d38554058aa731770ee10e400bc535 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -53,7 +53,11 @@ class XlaArgOp : public XlaOpKernel {
     const XlaExpression& arg = ctx->xla_context()->args()[index_];
     OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid,
                 errors::InvalidArgument("Invalid/missing argument expression"));
-    ctx->SetOutputExpression(0, arg);
+    if (ctx->expected_output_dtype(0) == DT_VARIANT) {
+      ctx->SetTensorListOutput(0, arg.handle());
+    } else {
+      ctx->SetOutputExpression(0, arg);
+    }
   }
 
  private:
@@ -63,6 +67,8 @@ class XlaArgOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp);
 };
 
-REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes().CompilationOnly(), XlaArgOp);
+REGISTER_XLA_OP(
+    Name("_Arg").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    XlaArgOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 0e2f335f3354e3ae6008bdc0ac0b80683fe479c1..f1d78c87527eb5f818dcf92209feabe33653a625 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -34,6 +36,7 @@ class FusedBatchNormOp : public XlaOpKernel {
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
         errors::InvalidArgument("Invalid data format: ", data_format_str));
+    is_on_gpu_ = ctx->device_type().type_string() == DEVICE_GPU_XLA_JIT;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -71,7 +74,18 @@ class FusedBatchNormOp : public XlaOpKernel {
       // variance to the gradient. Here we maintain the same behavior by setting
       // them to the mean and variance calculated by BatchNormTraining.
       ctx->SetOutput(3, xla::GetTupleElement(output, 1));
-      ctx->SetOutput(4, xla::GetTupleElement(output, 2));
+      if (is_on_gpu_) {
+        // The last two outputs from the FusedBatchNorm training TensorFlow GPU
+        // op are implementation defined.  For now we rely on the in-practice
+        // behavior of the op:
+        //   output 3 is the mean
+        //   output 4 is rsqrt(variance + epsilon)
+        xla::XlaOp variance = xla::GetTupleElement(output, 2);
+        ctx->SetOutput(4, xla::Rsqrt(xla::Add(
+                              variance, xla::ScalarLike(variance, epsilon_))));
+      } else {
+        ctx->SetOutput(4, xla::GetTupleElement(output, 2));
+      }
     } else {
       xla::XlaOp output = xla::BatchNormInference(
           input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
@@ -89,6 +103,7 @@ class FusedBatchNormOp : public XlaOpKernel {
   float epsilon_;
   TensorFormat data_format_;
   bool is_training_;
+  bool is_on_gpu_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
@@ -104,6 +119,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
         errors::InvalidArgument("Invalid data format: ", data_format_str));
+    is_on_gpu_ = ctx->device_type().type_string() == DEVICE_GPU_XLA_JIT;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -130,6 +146,22 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     xla::XlaOp scale_backprop;
     xla::XlaOp offset_backprop;
     if (is_training_) {
+      if (is_on_gpu_) {
+        // The last two inputs to the FusedBatchNormGrad training TensorFlow GPU
+        // op are implementation defined.  For now we rely on the in-practice
+        // behavior of the op: input 3 is the mean input 4 is rsqrt(variance +
+        // epsilon)
+        //
+        // The XLA op expects:
+        //   input 3 is the mean
+        //   input 4 is the variance
+        //
+        // so we adjust input 4 here.
+        xla::XlaOp one = xla::ScalarLike(var, 1.0f);
+        xla::XlaOp epsilon = xla::ScalarLike(var, epsilon_);
+        var = xla::Sub(one / (var * var), epsilon);
+      }
+
       xla::XlaOp output =
           xla::BatchNormGrad(activations, scale, mean, var, grad_backprop,
                              epsilon_, feature_index);
@@ -158,9 +190,8 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       offset_backprop = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
-      auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
-      auto scratch1 = xla::Pow(
-          xla::Add(var, xla::ConstantR0<float>(b, epsilon_)), neg_half);
+      auto epsilon = XlaHelpers::FloatLiteral(b, scale_dtype, epsilon_);
+      auto scratch1 = xla::Rsqrt(xla::Add(var, epsilon));
 
       // scratch2 = sum(y_backprop * (x - mean))
       auto mul =
@@ -187,6 +218,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
   TensorFormat data_format_;
   float epsilon_;
   bool is_training_;
+  bool is_on_gpu_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index e7f369b761f36a717ea5fb536780af91a8955b1e..33bdf9aec3167b0277f3c1db18c9e247ed9bb5d1 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -48,8 +48,11 @@ class BiasOp : public XlaOpKernel {
     OP_REQUIRES(ctx, TensorShapeUtils::IsVector(bias_shape),
                 errors::InvalidArgument("Biases must be 1D: ",
                                         bias_shape.DebugString()));
-    int feature_dim = (data_format_ == FORMAT_NHWC) ? input_shape.dims() - 1
-                                                    : input_shape.dims() - 3;
+
+    // feature_dim is the channel (C) dimension of the data.
+    int feature_dim = (data_format_ == FORMAT_NHWC)
+                          ? input_shape.dims() - 1
+                          : /*data_format == FORMAT_NCHW*/ 1;
     OP_REQUIRES(
         ctx, feature_dim >= 0,
         errors::InvalidArgument("Input tensor does not have enough dimensions "
@@ -91,9 +94,10 @@ class BiasAddGradOp : public XlaOpKernel {
                 errors::InvalidArgument("Input tensor must be at least 2D: ",
                                         out_backprop_shape.DebugString()));
 
+    // feature_dim is the channel (C) dimension of the data.
     int feature_dim = (data_format_ == FORMAT_NHWC)
                           ? out_backprop_shape.dims() - 1
-                          : out_backprop_shape.dims() - 3;
+                          : /*data_format == FORMAT_NCHW*/ 1;
     OP_REQUIRES(
         ctx, feature_dim >= 0,
         errors::InvalidArgument("Input tensor does not have enough dimensions "
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 5e9280c1fe692037b0a842a92ef5a8c28b854a54..f69b5dc022287825f05353349cddc6cd0f497a0e 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -77,7 +79,28 @@ static xla::XlaOp DivNoNanImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
 XLA_MAKE_BINARY(DivNoNan,
                 DivNoNanImpl(b, input_type(0), lhs, rhs, broadcast_helper));
 
-// Implementation of FloorDiv. Pseudo-code:
+// Implementation of MulNoNan. Pseudo-code:
+// if (y == 0) {
+//   return 0
+// } else {
+//   return x * y;
+// }
+static xla::XlaOp MulNoNanImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
+                               xla::XlaOp y, const BCast& broadcast_helper) {
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
+  auto zero = XlaHelpers::Zero(b, dtype);
+  auto y_equals_0 = xla::Eq(y, zero);
+  auto zeros = xla::ZerosLike(x);
+  auto result = xla::Select(y_equals_0, zeros, xla::Mul(x, y));
+  return result;
+}
+XLA_MAKE_BINARY(MulNoNan,
+                MulNoNanImpl(b, input_type(0), lhs, rhs, broadcast_helper));
+
+// Implementation of FloorDiv.
+//
+// For floating-point values, simply returns floor(x / y).  For integers, does:
+//
 // if ((x < 0) != (y < 0)) {
 //   T abs_x = std::abs(x);
 //   T abs_y = std::abs(y);
@@ -88,6 +111,9 @@ XLA_MAKE_BINARY(DivNoNan,
 static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
   std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
+  if (DataTypeIsFloating(dtype)) {
+    return xla::Floor(xla::Div(x, y));
+  }
   if (DataTypeIsUnsigned(dtype)) {
     return xla::Div(x, y);
   }
@@ -97,11 +123,7 @@ static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
   auto abs_x = xla::Abs(x);
   auto abs_y = xla::Abs(y);
   auto t = xla::Neg(xla::Sub(xla::Add(abs_x, abs_y), one));
-  auto result = xla::Select(different_sign, xla::Div(t, abs_y), xla::Div(x, y));
-  if (DataTypeIsFloating(dtype)) {
-    result = xla::Floor(result);
-  }
-  return result;
+  return xla::Select(different_sign, xla::Div(t, abs_y), xla::Div(x, y));
 }
 XLA_MAKE_BINARY(FloorDiv,
                 FloorDivImpl(b, input_type(0), lhs, rhs, broadcast_helper));
@@ -157,7 +179,7 @@ XLA_MAKE_BINARY(RealDiv, xla::Div(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(ReciprocalGrad, xla::Neg(xla::Mul(rhs, xla::Mul(lhs, lhs))));
 XLA_MAKE_BINARY(
     RsqrtGrad,
-    xla::Mul(xla::Pow(lhs, XlaHelpers::IntegerLiteral(b, input_type(0), 3)),
+    xla::Mul((lhs * lhs) * lhs,
              xla::Div(rhs, XlaHelpers::IntegerLiteral(b, input_type(0), -2)),
              extend_dimensions));
 XLA_MAKE_BINARY(
@@ -165,12 +187,8 @@ XLA_MAKE_BINARY(
     xla::Div(xla::Mul(rhs, XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
              lhs, extend_dimensions));
 
-static xla::XlaOp Square(xla::XlaBuilder* builder, const xla::XlaOp& x) {
-  return xla::Mul(x, x);
-}
-
 XLA_MAKE_BINARY(SquaredDifference,
-                Square(b, xla::Sub(lhs, rhs, extend_dimensions)));
+                xla::Square(xla::Sub(lhs, rhs, extend_dimensions)));
 
 XLA_MAKE_BINARY(TruncateDiv, xla::Div(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(TruncateMod, xla::Rem(lhs, rhs, extend_dimensions));
@@ -195,8 +213,8 @@ XLA_MAKE_BINARY(SoftplusGrad,
 // softsigngrad(gradients, features) = gradients / (1 + abs(features)) ** 2
 XLA_MAKE_BINARY(SoftsignGrad,
                 xla::Div(lhs,
-                         Square(b, xla::Add(XlaHelpers::One(b, input_type(0)),
-                                            xla::Abs(rhs)))));
+                         xla::Square(xla::Add(XlaHelpers::One(b, input_type(0)),
+                                              xla::Abs(rhs)))));
 
 XLA_MAKE_BINARY(TanhGrad,
                 xla::Mul(rhs, xla::Sub(XlaHelpers::One(b, input_type(0)),
@@ -204,6 +222,8 @@ XLA_MAKE_BINARY(TanhGrad,
 
 XLA_MAKE_BINARY(Pow, xla::Pow(lhs, rhs, extend_dimensions));
 
+XLA_MAKE_BINARY(NextAfter, xla::NextAfter(lhs, rhs));
+
 #undef XLA_MAKE_BINARY
 
 class ApproximateEqualOp : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24623768f3897179575fe4cec6190a9a877a5202
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc
@@ -0,0 +1,297 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/case_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+XlaCaseOp::XlaCaseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branches_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_types_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_types_));
+  if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &token_input_nodes_).ok()) {
+    has_token_input_output_ = false;
+  } else {
+    has_token_input_output_ = !token_input_nodes_.empty();
+  }
+}
+
+// TODO(b/35949885): There is duplication here with the handling of the
+// while_op. Refactor the common code out/rework.
+void XlaCaseOp::Compile(XlaOpKernelContext* ctx) {
+  xla::XlaBuilder* b = ctx->builder();
+  int num_branches = branches_.size();
+  OP_REQUIRES(ctx, num_branches >= 1,
+              errors::InvalidArgument("Must provide at least one case branch"));
+  OP_REQUIRES(ctx, input_type(0) == DT_INT32,
+              errors::InvalidArgument(
+                  "branch_index argument must be a int32 for XLA compilation"));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(0)),
+              errors::InvalidArgument(
+                  "branch_index argument must be scalar for XLA compilation"));
+
+  VLOG(1) << "Building Case: " << input_types_.size() << " inputs";
+
+  std::vector<XlaCompiler::Argument> arguments(input_types_.size());
+  int num_resource_args = 0;
+  for (int i = 0; i < input_types_.size(); ++i) {
+    XlaCompiler::Argument& arg = arguments[i];
+    DataType type = ctx->input_type(i + 1);
+
+    if (type == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource));
+
+      arg.initialized = resource->initialized();
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = resource->kind();
+
+      arg.type = resource->type();
+      arg.shape = resource->shape();
+      OP_REQUIRES(ctx, arg.initialized,
+                  errors::Unimplemented("Uninitialized arguments: ", arg.name));
+      arg.max_array_size = resource->max_array_size();
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+      arg.name = resource->name();
+      VLOG(2) << "Resource " << resource->name()
+              << " type: " << DataTypeString(arg.type)
+              << " shape: " << arg.HumanString()
+              << " initialized: " << arg.initialized;
+
+      num_resource_args++;
+    } else {
+      arg.kind = XlaCompiler::Argument::kParameter;
+      arg.type = input_types_[i];
+      arg.shape = ctx->InputShape(i + 1);
+      VLOG(2) << "Arg type: " << DataTypeString(arg.type)
+              << " shape: " << arg.HumanString();
+    }
+  }
+
+  // Compile each branch of the conditional.
+  XlaCompiler::CompileOptions options;
+  options.use_tuple_arg = true;
+  options.resolve_compile_time_constants = false;
+  options.return_updated_values_for_all_resources = true;
+  options.is_entry_computation = false;
+  options.add_token_input_output = has_token_input_output_;
+  XlaCompiler* compiler = ctx->compiler();
+
+  std::vector<XlaCompiler::CompilationResult> branch_results(num_branches);
+  std::vector<XlaCompiler::CompilationResult*> branch_results_p(num_branches);
+  for (int j = 0; j < num_branches; ++j) {
+    OP_REQUIRES_OK(ctx,
+                   compiler->CompileFunction(options, branches_[j], arguments,
+                                             &branch_results[j]));
+    branch_results_p[j] = &branch_results[j];
+  }
+
+  bool has_tensor_array_gradients = false;
+  for (XlaCompiler::CompilationResult* result : branch_results_p) {
+    for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      XlaCompiler::Argument& arg = arguments[update.input_index];
+
+      // Add any TensorArray gradients touched by the then/else computation to
+      // the enclosing graph.
+      for (const string& grad_source : update.tensor_array_gradients_accessed) {
+        VLOG(5) << "TensorArray " << resource->name() << " accessed gradient "
+                << grad_source;
+        XlaResource* gradient;
+        OP_REQUIRES_OK(ctx, resource->GetOrCreateTensorArrayGradient(
+                                grad_source, b, &gradient));
+      }
+      // Add all of the TensorArray gradients to the argument. For simplicity,
+      // we always pass all known gradients.
+      for (const auto& gradient : resource->tensor_array_gradients()) {
+        arg.tensor_array_gradients.insert(gradient.first);
+      }
+      if (!resource->tensor_array_gradients().empty()) {
+        has_tensor_array_gradients = true;
+      }
+    }
+  }
+
+  // Recompile the functions to update the argument shapes for tensor arrays.
+  if (has_tensor_array_gradients) {
+    for (int j = 0; j < num_branches; ++j) {
+      branch_results[j] = {};
+      OP_REQUIRES_OK(ctx,
+                     compiler->CompileFunction(options, branches_[j], arguments,
+                                               &branch_results[j]));
+    }
+  }
+
+  xla::Shape branch0_input_shape;
+  std::vector<const xla::XlaComputation*> result_computations(num_branches);
+  for (int j = 0; j < num_branches; ++j) {
+    // Check that all branches have identical input shapes.
+    OP_REQUIRES(ctx, branch_results[j].xla_input_shapes.size() == 1,
+                errors::FailedPrecondition("Expected one input shape"));
+    xla::Shape branch_input_shape = branch_results[j].xla_input_shapes[0];
+    if (j == 0) {
+      branch0_input_shape = branch_input_shape;
+    }
+    OP_REQUIRES(ctx, branch_input_shape.IsTuple(),
+                errors::FailedPrecondition("Expected tuple shape"));
+    OP_REQUIRES(ctx, branch_results[j].xla_input_shapes.size() == 1,
+                errors::FailedPrecondition("Expected one input shape"));
+    OP_REQUIRES(
+        ctx,
+        xla::ShapeUtil::Compatible(branch0_input_shape, branch_input_shape),
+        errors::InvalidArgument(
+            "Input shapes of 0 and ", j, " branches do not match: ",
+            xla::ShapeUtil::HumanString(branch0_input_shape), " vs. ",
+            xla::ShapeUtil::HumanString(branch_input_shape)));
+
+    // Check that all branches have identical output shapes.
+    OP_REQUIRES(
+        ctx,
+        xla::ShapeUtil::Compatible(branch_results[0].xla_output_shape,
+                                   branch_results[j].xla_output_shape),
+        errors::InvalidArgument(
+            "Output shapes of 0 and ", j, " branches do not match: ",
+            xla::ShapeUtil::HumanString(branch_results[0].xla_output_shape),
+            " vs. ",
+            xla::ShapeUtil::HumanString(branch_results[j].xla_output_shape)));
+
+    if (j == 0) {
+      VLOG(2) << "Input shape: "
+              << xla::ShapeUtil::HumanString(branch0_input_shape);
+      VLOG(2) << "Output shape: "
+              << xla::ShapeUtil::HumanString(
+                     branch_results[0].xla_output_shape);
+    }
+
+    // We set return_updated_values_for_all_resources=true and we pass the same
+    // arguments to both computations, so the resource update count must match.
+    OP_REQUIRES(ctx,
+                branch_results[0].resource_updates.size() ==
+                    branch_results[j].resource_updates.size(),
+                errors::FailedPrecondition(
+                    "Different number of resources in 0 and ", j, " branch"));
+    for (int i = 0; i < branch_results[0].resource_updates.size(); ++i) {
+      const auto& lhs = branch_results[0].resource_updates[i];
+      const auto& rhs = branch_results[j].resource_updates[i];
+      bool equal = lhs.input_index == rhs.input_index &&
+                   lhs.shape == rhs.shape &&
+                   lhs.tensor_array_gradients_accessed ==
+                       rhs.tensor_array_gradients_accessed;
+      OP_REQUIRES(ctx, equal,
+                  errors::FailedPrecondition("Mismatch in resource of 0 and ",
+                                             j, " branch for resource ", i));
+    }
+    result_computations[j] = branch_results[j].computation.get();
+  }
+
+  // Prepare the input arg Tuple.
+  int num_inputs = branch_results[0].input_mapping.size();
+  std::vector<xla::XlaOp> inputs(num_inputs);
+  for (int i = 0; i < num_inputs; ++i) {
+    int input_num = branch_results[0].input_mapping[i] + 1;
+    if (has_token_input_output_ && i == num_inputs - 1) {
+      // Set token input for this "case" op.
+      std::vector<xla::XlaOp> token_inputs;
+      for (const string& node_name : token_input_nodes_) {
+        auto token_or = compiler->GetNodeToken(node_name);
+        OP_REQUIRES_OK(ctx, token_or.status());
+        token_inputs.push_back(token_or.ValueOrDie());
+      }
+      inputs[i] = xla::AfterAll(b, token_inputs);
+    } else if (ctx->input_type(input_num) == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource));
+      OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b));
+    } else {
+      inputs[i] = ctx->Input(i + 1);
+    }
+  }
+  auto input_tuple = xla::Tuple(b, inputs);
+
+  xla::XlaOp outputs =
+      xla::Conditional(ctx->Input(0), absl::MakeSpan(result_computations),
+                       std::vector<xla::XlaOp>(num_branches, input_tuple));
+  // Sets non-variable outputs.
+  for (int i = 0; i < output_types_.size(); ++i) {
+    xla::XlaOp output_handle = xla::GetTupleElement(outputs, i);
+    if (VLOG_IS_ON(2)) {
+      LOG(INFO) << "Setting output " << i;
+      auto shape_or = b->GetShape(output_handle);
+      if (shape_or.ok()) {
+        LOG(INFO) << "Shape for output " << i << ": "
+                  << xla::ShapeUtil::HumanString(shape_or.ValueOrDie());
+      } else {
+        LOG(INFO) << "Shape unknown for output " << i;
+      }
+    }
+    ctx->SetOutput(i, output_handle);
+  }
+  if (has_token_input_output_) {
+    // Set token output for this "Case" op. Token output is the last output of
+    // XLA computation, which comes after all "normal" TF outputs and resource
+    // updates. For "Case" node, num of resource updates equals to number of
+    // resource args because we set `return_updated_values_for_all_resources`
+    // to true in XlaCompiler option.
+    xla::XlaOp token_output =
+        xla::GetTupleElement(outputs, output_types_.size() + num_resource_args);
+    auto shape_or = b->GetShape(token_output);
+    OP_REQUIRES_OK(ctx, shape_or.status());
+    OP_REQUIRES(ctx, shape_or.ValueOrDie().IsToken(),
+                errors::FailedPrecondition(
+                    "Token output is not token type: ",
+                    xla::ShapeUtil::HumanString(shape_or.ValueOrDie())));
+    OP_REQUIRES_OK(ctx, compiler->SetNodeToken(name(), token_output));
+  }
+
+  // Updates the values of any resource variables modified by the conditional
+  // bodies.
+  for (const XlaCompiler::CompilationResult& result : branch_results) {
+    for (int i = 0; i < result.resource_updates.size(); ++i) {
+      const XlaCompiler::ResourceUpdate& update = result.resource_updates[i];
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx,
+                     ctx->GetResourceInput(update.input_index + 1, &resource));
+      if (update.modified) {
+        int pos = static_cast<int>(result.outputs.size()) + i;
+        OP_REQUIRES_OK(ctx,
+                       resource->SetFromPack(
+                           arguments[update.input_index].tensor_array_gradients,
+                           xla::GetTupleElement(outputs, pos), b));
+      }
+      VLOG(2) << "Case variable: pos: " << update.input_index
+              << " name: " << resource->name()
+              << " modified: " << update.modified
+              << " type: " << DataTypeString(update.type)
+              << " shape: " << update.shape.DebugString();
+    }
+  }
+  VLOG(1) << "Done building Case";
+}
+
+REGISTER_XLA_OP(Name("Case").AllowResourceTypes(), XlaCaseOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea14b18149cb5bc9162d42b384eb3a5e943ad8be
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/case_op.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+// This TensorFlow op provides a functional switch/case primitive.
+//
+// The outputs of the branches must agree on the number, types, and
+// shapes of the Tensors carried around the two bodies.
+//
+// Computations in branch bodies may read from and write to resource variables.
+// Resource variables may be passed as arguments to the branch function's
+// bodies. The XlaCompiler converts resource variable arguments
+// into parameters to the XLA computation and moves them to the end of the
+// parameter list, and by using the `return_updated_values_for_all_variables`
+// we ensure that all variables that appear in the input also appear at the
+// end of the branch bodies output. This ensures the branch bodies output
+// signatures match.
+//
+// It is the user's responsibility to ensure that each non-variable _Arg matches
+// the corresponding _Retval.
+class XlaCaseOp : public XlaOpKernel {
+ public:
+  explicit XlaCaseOp(OpKernelConstruction* ctx);
+
+  void Compile(XlaOpKernelContext* ctx) override;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaCaseOp);
+
+  std::vector<NameAttrList> branches_;
+  DataTypeVector input_types_;
+  DataTypeVector output_types_;
+  bool has_token_input_output_;
+  std::vector<string> token_input_nodes_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index db58c2e651f0acf0dd4330575b5ce63fd0a1fb69..ca2152d6c103e05c06809d85d9529720ff112217 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -36,19 +36,6 @@ class CastOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Truncate", &use_truncation_));
   }
 
-  xla::PrimitiveType GetUnsignedIntTypeOfSameWidth(int64 src_bitwidth) {
-    switch (src_bitwidth) {
-      case 16:
-        return xla::U16;
-      case 32:
-        return xla::U32;
-      case 64:
-        return xla::U64;
-      default:
-        return xla::PRIMITIVE_TYPE_INVALID;
-    }
-  }
-
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* builder = ctx->builder();
     xla::XlaOp input = ctx->Input(0);
@@ -85,7 +72,7 @@ class CastOp : public XlaOpKernel {
         // source datatype.
         int64 mask = ~((1L << mantissa_difference) - 1);
         xla::PrimitiveType same_width_int =
-            GetUnsignedIntTypeOfSameWidth(src_bitwidth);
+            xla::primitive_util::UnsignedIntegralTypeForBitWidth(src_bitwidth);
         OP_REQUIRES(ctx, same_width_int != xla::PRIMITIVE_TYPE_INVALID,
                     errors::Unimplemented("Unexpected type bitwidth"));
         input = xla::BitcastConvertType(
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index c2b4c28d1566f5429c5d8109db94af0c3762b131..a99c6ee4431852166eec0a71bb7ad74fd5c135d9 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -112,9 +113,12 @@ class CategoricalOp : public XlaOpKernel {
                                     xla::PrimitiveType type,
                                     XlaOpKernelContext* ctx) {
     xla::XlaBuilder* builder = ctx->builder();
-    auto uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
+    auto uniforms = xla::RngUniform(
+        xla::MinPositiveNormalValue(builder, type),
+        xla::One(builder, uniform_shape.element_type()), uniform_shape);
     return xla::Log(-xla::Log(uniforms));
   }
 
@@ -143,9 +147,13 @@ class StatelessCategoricalOp : public CategoricalOp {
     if (uniform_shape.element_type() == xla::BF16) {
       uniform_shape.set_element_type(xla::F32);
     }
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
     auto uniforms = xla::StatelessRngUniform(
-        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
-        XlaHelpers::One(builder, DT_FLOAT));
+        {seed0, seed1}, uniform_shape,
+        xla::MinPositiveNormalValue(builder, uniform_shape.element_type()),
+        xla::One(builder, uniform_shape.element_type()));
     return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 0ed3044efa5b1060d2b0ad2d5563b0e02ebf66ec..e6b30a38e0379fc09af07af686f4c5f3a737ecda 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -15,7 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/cholesky.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
@@ -24,7 +25,9 @@ class CholeskyOp : public XlaOpKernel {
  public:
   explicit CholeskyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
-    ctx->SetOutput(0, xla::Cholesky(ctx->Input(0)));
+    ctx->SetOutput(0,
+                   xla::Triangle(xla::Cholesky(ctx->Input(0), /*lower=*/true),
+                                 /*lower=*/true));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index cd7c7f4a82df7a65829787efcb1fd2f77870e945..09c97de13eb2ed951ca705cda89b7f293808cdf0 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -24,14 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index 5b4f863f7418ecda0db502ce25fed2d0042bf3ca..6512ba25ce63a80b89f6148fce5444a7c0fee925 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -26,15 +26,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -203,7 +203,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
     StringPiece label, int num_spatial_dims, const xla::Shape& input_shape,
     const xla::Shape& filter_shape, const xla::Shape& out_backprop_shape,
     absl::Span<const int32> dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims,
+    absl::Span<const int64> explicit_paddings) {
   TensorShape input_tensor_shape, filter_tensor_shape,
       out_backprop_tensor_shape;
   TF_RETURN_IF_ERROR(XLAShapeToTensorShape(input_shape, &input_tensor_shape));
@@ -212,8 +213,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
       XLAShapeToTensorShape(out_backprop_shape, &out_backprop_tensor_shape));
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_tensor_shape, filter_tensor_shape,
-      out_backprop_tensor_shape, dilations, strides, padding,
-      /*explicit_paddings=*/{}, data_format, dims);
+      out_backprop_tensor_shape, dilations, strides, padding, explicit_paddings,
+      data_format, dims);
 }
 
 }  // anonymous namespace
@@ -227,10 +228,9 @@ xla::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
   TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", &attrs.dilations));
   TF_RETURN_IF_ERROR(ctx->GetAttr("strides", &attrs.strides));
   TF_RETURN_IF_ERROR(ctx->GetAttr("padding", &attrs.padding));
-  // TODO(reedwm): Support explicit padding.
   if (attrs.padding == EXPLICIT) {
-    return errors::Unimplemented(
-        "XLA does not yet support Conv2D with explicit padding.");
+    TF_RETURN_IF_ERROR(
+        ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings));
   }
 
   string data_format;
@@ -303,6 +303,11 @@ xla::StatusOr<xla::XlaOp> MakeXlaForwardConvOp(StringPiece /*type_string*/,
     window_strides[i] = attrs.strides.at(dim);
     rhs_dilation[i] = attrs.dilations.at(dim);
 
+    if (attrs.padding == EXPLICIT) {
+      padding[i] = {attrs.explicit_paddings.at(dim * 2),
+                    attrs.explicit_paddings.at(dim * 2 + 1)};
+    }
+
     int64 unused_output_size;
     TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
         input_shape.dimensions(dim), filter_shape.dimensions(i),
@@ -337,7 +342,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropInputConvOp(
   TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
       type_string, attrs.num_spatial_dims, input_shape, expanded_filter_shape,
       out_backprop_shape, attrs.dilations, attrs.strides, attrs.padding,
-      attrs.data_format, &dims));
+      attrs.data_format, &dims, attrs.explicit_paddings));
 
   // The input gradients are computed by a convolution of the output
   // gradients and the filter, with some appropriate padding. See the
@@ -420,7 +425,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
       type_string, attrs.num_spatial_dims, activations_shape,
       expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
-      attrs.padding, attrs.data_format, &dims));
+      attrs.padding, attrs.data_format, &dims, attrs.explicit_paddings));
 
   // The activations (inputs) form the LHS of the convolution.
   // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
@@ -441,12 +446,6 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   std::vector<int64> window_strides(attrs.num_spatial_dims);
   std::vector<int64> ones(attrs.num_spatial_dims, 1);
 
-  // The activations (inputs) form the LHS of the convolution.
-  // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
-  // For the gradient computation, we flip the roles of the batch and
-  // feature dimensions.
-  // Each spatial entry has size in_depth * batch
-
   // Swap n_dim and c_dim in the activations.
   dnums.set_input_batch_dimension(c_dim);
   dnums.set_input_feature_dimension(n_dim);
@@ -475,12 +474,14 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     int64 dim = GetTensorSpatialDimIndex(num_dims, attrs.data_format, i);
     dnums.add_input_spatial_dimensions(dim);
     dnums.add_kernel_spatial_dimensions(dim);
+    rhs_dilation[i] = dims.spatial_dims[i].stride;
+    window_strides[i] = attrs.dilations[dim];
 
     // We will also need to pad the input with zeros such that after the
     // convolution, we get the right size for the filter.
     // The padded_in_rows should be such that when we convolve this with the
     // expanded_out_rows as a filter, we should get filter_rows back.
-    //
+
     const int64 padded_in_size =
         dims.spatial_dims[i].expanded_output_size +
         (dims.spatial_dims[i].filter_size - 1) * attrs.dilations[dim];
@@ -501,6 +502,8 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // We apply negative padding in this case.
     const int64 pad_total = padded_in_size - dims.spatial_dims[i].input_size;
 
+    // + For the EXPLICIT padding, we pad the top/left side with the explicit
+    //   padding and pad the bottom/right side with the remaining space.
     // + For the VALID padding, we don't pad anything on the top/left side
     //   and pad the bottom/right side with the remaining space.
     // + For the SAME padding, we pad top/left side the same as bottom/right
@@ -509,12 +512,12 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // In addition, if the padded input size is smaller than the input size,
     // we need to ignore some training elements of the input. We do this by
     // applying negative padding on the right/bottom.
-    const int64 pad_before =
-        attrs.padding == Padding::SAME ? std::max<int64>(pad_total / 2, 0) : 0;
-
+    const int64 pad_before = attrs.padding == Padding::EXPLICIT
+                                 ? attrs.explicit_paddings[2 * dim]
+                                 : attrs.padding == Padding::SAME
+                                       ? std::max<int64>(pad_total / 2, 0)
+                                       : 0;
     padding[i] = {pad_before, pad_total - pad_before};
-    rhs_dilation[i] = dims.spatial_dims[i].stride;
-    window_strides[i] = attrs.dilations[dim];
   }
 
   // Besides padding the input, we will also expand output_rows to
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
index 6e1b70a47850ae5c05939f8dfb7ec129c031df21..d893eca7f9ba07dded76eb215af4779080fa66b9 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
@@ -47,6 +47,7 @@ struct ConvOpAttrs {
   std::vector<int32> dilations;
   std::vector<int32> strides;
   Padding padding;
+  std::vector<int64> explicit_paddings;
   TensorFormat data_format;
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index eafdba876ae9e2c38694f065cf83bb3725b8460e..e74ada47517d1ee98cad5d8523872bbb4eab6e7e 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -25,15 +25,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/conv_grad_ops.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index e96a1adce43c750314715107b4a1954d4a5b4e40..9fe91d16d77d601f94fe35b48cbe97452d7a6a72 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/lib/data_format.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -30,11 +31,6 @@ class DepthToSpaceOp : public XlaOpKernel {
     OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
 
-    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument("Unsupported data format ",
-                                        ToString(data_format_),
-                                        "; expected formats NHWC or NCHW"));
-
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -42,19 +38,36 @@ class DepthToSpaceOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_tensor_shape = ctx->InputShape(0);
-    int input_rank = input_tensor_shape.dims();
+    xla::XlaOp input = ctx->Input(0);
+
+    TensorFormat data_format = data_format_;
+    // If the data is in a vectorized format, reformat it into a non-vectorized
+    // version first. We'll undo the transformation later.
+    if (data_format == FORMAT_NCHW_VECT_C) {
+      data_format = FORMAT_NCHW;
+      auto input_reshaped = NCHW_VECT_CToNCHW(input);
+      OP_REQUIRES_OK(ctx, input_reshaped.status());
+      input = input_reshaped.ValueOrDie();
+    }
+
+    OP_REQUIRES(ctx, data_format == FORMAT_NCHW || data_format == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_)));
+
+    xla::XlaBuilder* builder = input.builder();
+    auto input_xla_shape = builder->GetShape(input);
+    OP_REQUIRES_OK(ctx, input_xla_shape.status());
+    const std::vector<int64>& input_shape =
+        input_xla_shape.ValueOrDie().dimensions();
+    int input_rank = input_shape.size();
+
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
                 errors::InvalidArgument("Input rank should be ", kRequiredDims,
                                         "; got: ", input_rank));
-    const absl::InlinedVector<int64, 4> input_shape =
-        input_tensor_shape.dim_sizes();
-
-    xla::XlaOp input = ctx->Input(0);
 
-    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
-    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format);
 
     std::vector<int64> reshaped_shape;
     std::vector<int64> transpose_order;
@@ -62,7 +75,7 @@ class DepthToSpaceOp : public XlaOpKernel {
     reshaped_shape.reserve(input_rank);
     transpose_order.reserve(input_rank);
     output_shape.reserve(input_rank);
-    if (data_format_ == FORMAT_NHWC) {
+    if (data_format == FORMAT_NHWC) {
       reshaped_shape.push_back(input_shape[0]);
       for (int i = 0; i < num_spatial_dims; ++i) {
         reshaped_shape.push_back(input_shape[1 + i]);
@@ -153,6 +166,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //
     xla::XlaOp output = xla::Reshape(permuted_reshaped, output_shape);
 
+    // If this used to be a vectorized format turn it back now.
+    if (data_format != data_format_) {
+      DCHECK(data_format == FORMAT_NCHW && data_format_ == FORMAT_NCHW_VECT_C);
+      auto output_reshaped = NCHWToNCHW_VECT_C(output);
+      OP_REQUIRES_OK(ctx, output_reshaped.status());
+      output = output_reshaped.ValueOrDie();
+    }
+
     ctx->SetOutput(0, output);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 6e6ba21daf5bf3eab5bfc15378e77b6dd253da7c..b119997cf39e210ed8e0ae730a08829e72b238b4 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/einsum_op.cc b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3334dc1de826d4946eb362223d4428858b23f0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc
@@ -0,0 +1,54 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr std::array<DataType, 2> kEinsumTypes = {{DT_BFLOAT16, DT_FLOAT}};
+
+class EinsumOp : public XlaOpKernel {
+ public:
+  explicit EinsumOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("equation", &equation_));
+  }
+
+  ~EinsumOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp lhs = ctx->Input(0);
+    xla::XlaOp rhs = ctx->Input(1);
+    const TensorShape a_shape = ctx->InputShape(0);
+    const TensorShape b_shape = ctx->InputShape(1);
+    ctx->SetOutput(0, xla::Einsum(lhs, rhs, equation_));
+  }
+
+ private:
+  string equation_;
+  TF_DISALLOW_COPY_AND_ASSIGN(EinsumOp);
+};
+
+REGISTER_XLA_OP(Name("XlaEinsum").TypeConstraint("T", kEinsumTypes), EinsumOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
index 5fdb1d972c55efb876972d3f472b53a1f7cde1c2..87bb9d49c0c97181bac33da01ec7e0b10cf5d6fc 100644
--- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/empty_op.cc b/tensorflow/compiler/tf2xla/kernels/empty_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00d2ce7c12fdc96483612059d1c792c847df04f3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/empty_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA-specific Empty Op.
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class EmptyOp : public XlaOpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // The output of this Op is a tensor of shape 'shape' with each
+    // element set to the default value of 'dtype'. If 'init' is false then
+    // the result values may be left undefined, though we don't do that here.
+    const TensorShape shape_shape = ctx->InputShape("shape");
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape_shape),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape_shape.DebugString()));
+
+    std::vector<int64> shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("shape", &shape));
+
+    auto default_value = xla::Zero(ctx->builder(), type_);
+    auto result = xla::Broadcast(default_value, shape);
+    ctx->SetOutput(0, result);
+  }
+
+ private:
+  DataType dtype_;
+  xla::PrimitiveType type_;
+  bool init_;
+};
+
+REGISTER_XLA_OP(Name("Empty").CompileTimeConstantInput("shape"), EmptyOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 6df8b5367d2390e65995beb1583b225755e6ee9f..5ac288d8a346b0119892d804941608a286d7b721 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -21,14 +21,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/conv_grad_ops.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -68,6 +67,13 @@ class GenericFftOp : public XlaOpKernel {
       }
       for (int i = 0; i < fft_rank_; i++) {
         int index = input_shape.dims() - fft_rank_ + i;
+        OP_REQUIRES(
+            ctx,
+            input_shape.dim_size(index) == 0 ||
+                input_shape.dim_size(index) >= expected_sizes[i],
+            errors::InvalidArgument(
+                "Input dimension ", index, " must have length of at least ",
+                expected_sizes[i], " but got: ", input_shape.dim_size(index)));
         if (input_shape.dim_size(index) > expected_sizes[i]) {
           slice_sizes[index] = expected_sizes[i];
         } else {
diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
index af1085d5b35077b7ebd144bfb2473485e3b3de6b..a4a786e8a0608f3deeee7a0bd0bca5d631d461ad 100644
--- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/kernels/partitioned_function_ops.h"
 
 namespace tensorflow {
 namespace {
@@ -107,6 +108,10 @@ class SymbolicGradientOp : public AsyncOpKernel {
 };
 
 REGISTER_XLA_OP(Name(kGradientOp), SymbolicGradientOp);
+REGISTER_XLA_OP(Name("PartitionedCall").AllowResourceTypes(),
+                PartitionedCallOp);
+REGISTER_XLA_OP(Name("StatefulPartitionedCall").AllowResourceTypes(),
+                PartitionedCallOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 41c31d0ed58fe9bc9bbde0bd58993c975f04fd60..6472045265e4d930a5da770a68f5c502192201ae 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -167,13 +167,13 @@ class GatherOp : public XlaOpKernel {
 
       OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
       const auto params_dims = input_shape.dims();
-      if (axis < 0) {
-        axis += params_dims;
-      }
       OP_REQUIRES(
-          context, 0 <= axis && axis < params_dims,
+          context, -params_dims <= axis && axis < params_dims,
           errors::InvalidArgument("Expected axis in the range [", -params_dims,
                                   ", ", params_dims, "), but got ", axis));
+      if (axis < 0) {
+        axis += params_dims;
+      }
     }
 
     DataType index_type = input_type(1);
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index 19dd38c46ef154ea74bcbb6721dd04924702efcc..8b27e8e85a37bd5aa757b0cdd7e00e9fa3c0cf6e 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -38,9 +38,13 @@ class IdentityOp : public XlaOpKernel {
 
 // XLA_* devices also register a "real" Identity operator so we suppress the
 // dummy operator using CompilationOnly().
-REGISTER_XLA_OP(Name("Identity").AllowResourceTypes().CompilationOnly(),
-                IdentityOp);
-REGISTER_XLA_OP(Name("IdentityN").AllowResourceTypes().CompilationOnly(),
+REGISTER_XLA_OP(
+    Name("Identity").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    IdentityOp);
+REGISTER_XLA_OP(Name("IdentityN")
+                    .AllowResourceTypes()
+                    .AllowVariantTypes()
+                    .CompilationOnly(),
                 IdentityOp);
 REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 954ae0b596f33243fad1374473c689adb580f6a4..aa5637e2669555da17af8bb05ab08beeba6a89c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -80,7 +80,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.name = resource->name();
       VLOG(2) << "Resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.HumanString()
               << " initialized: " << arg.initialized;
 
       num_resource_args++;
@@ -89,7 +89,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.type = input_types_[i];
       arg.shape = ctx->InputShape(i + 1);
       VLOG(2) << "Arg type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString();
+              << " shape: " << arg.HumanString();
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index b96d45316f626e678a64392a4315979eeeb6e83c..d19d48e5dd95962fe4a4e4026eaf6b06b7898564 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -134,14 +135,15 @@ int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
 // If the 2D kernel would be very large, the 1D kernel can be applied once in
 // each dimension due to the symmetry of the kernel along all axis to reduce the
 // computational intensity.
-xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder,
+                                xla::PrimitiveType type, int64 n) {
   std::vector<float> kernel(n * 2 - 1);
   for (int64 i = 0; i < n; ++i) {
     float v = (i + 1.0f) / n;
     kernel[i] = v;
     kernel[n * 2 - 2 - i] = v;
   }
-  return xla::ConstantR1<float>(builder, kernel);
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
 }
 
 // Unlike the bilinear kernel, which is triangular, the nearest neighbor
@@ -153,11 +155,12 @@ xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder, int64 n) {
 // to the right (because an existing non TPU kernel
 // for nearest neighbor resize already chose to default to the right,
 // so we want to be consistent).
-xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder,
+                                       xla::PrimitiveType type, int64 n) {
   std::vector<float> kernel(n * 2 - 1, 0.0f);
   std::fill(&kernel[n / 2], &kernel[(3 * n) / 2], 1.0f);
 
-  return xla::ConstantR1<float>(builder, kernel);
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
 }
 
 // Kernels with more than 16 spatial elements are considered intense and the
@@ -165,42 +168,66 @@ xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder, int64 n) {
 const int64 kMax2DKernelSize = 16;
 
 xla::XlaOp MakeGeneralResizeKernel(xla::XlaBuilder* builder,
+                                   xla::PrimitiveType type,
                                    absl::Span<const int64> kernel_size,
                                    int64 channels, bool is_kernel_bilinear) {
   auto make_kernel_func =
       is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
 
-  auto depthwise_kernel = xla::Broadcast(
-      xla::Zero(builder, xla::F32),
-      {(2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1});
+  std::vector<int64> depthwise_kernel_sizes = {
+      (2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1};
+  auto depthwise_kernel =
+      xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[1]),
+                          depthwise_kernel_sizes, /*broadcast_dimensions=*/{1});
 
-  return xla::Mul(
-      xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[1]),
-               /*broadcast_dimensions=*/{1}),
-      make_kernel_func(builder, kernel_size[0]),
-      /*broadcast_dimensions=*/{0});
+  return xla::Mul(depthwise_kernel,
+                  make_kernel_func(builder, type, kernel_size[0]),
+                  /*broadcast_dimensions=*/{0});
 }
 
 xla::XlaOp MakeGeneralResizeKernelInDim(xla::XlaBuilder* builder,
+                                        xla::PrimitiveType type,
                                         absl::Span<const int64> kernel_size,
                                         int64 channels, int64 dim,
                                         bool is_kernel_bilinear) {
   auto make_kernel_func =
       is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
 
-  auto depthwise_kernel =
-      xla::Broadcast(xla::Zero(builder, xla::F32),
-                     {dim == 0 ? (2 * kernel_size[0] - 1) : 1,
-                      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1});
-  return xla::Add(depthwise_kernel, make_kernel_func(builder, kernel_size[dim]),
-                  /*broadcast_dimensions=*/{dim});
+  std::vector<int64> depthwise_kernel_sizes = {
+      dim == 0 ? (2 * kernel_size[0] - 1) : 1,
+      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1};
+  return xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[dim]),
+                             depthwise_kernel_sizes,
+                             /*broadcast_dimensions=*/{dim});
+}
+
+xla::XlaOp BroadcastSpatialDimensions(xla::XlaBuilder* builder,
+                                      const xla::XlaOp& input,
+                                      int32 spatial_dimensions_offset,
+                                      absl::Span<const int64> in_size,
+                                      absl::Span<const int64> out_size) {
+  // Add broadcasts to handle expanding from a size == 1 dimension to a
+  // size > 1 dimension.
+  auto broadcast_shape_or_status = builder->GetShape(input);
+  if (!broadcast_shape_or_status.ok()) {
+    return builder->ReportError(broadcast_shape_or_status.status());
+  }
+  xla::Shape broadcast_shape = broadcast_shape_or_status.ValueOrDie();
+  for (int32 i = 0; i < in_size.size(); ++i) {
+    if (in_size[i] == 1 && out_size[i] > 1) {
+      broadcast_shape.set_dimensions(spatial_dimensions_offset + i,
+                                     out_size[i]);
+    }
+  }
+  return xla::BroadcastInDim(input, broadcast_shape.dimensions(),
+                             /*broadcast_dimensions=*/{0, 1, 2, 3});
 }
 
 xla::XlaOp ResizeUsingDilationAndConvolution(
-    xla::XlaBuilder* builder, const xla::XlaOp& input,
-    const int num_spatial_dims, std::vector<int64> in_size,
-    std::vector<int64> out_size, const int64 channels, const bool align_corners,
-    bool is_kernel_bilinear) {
+    xla::XlaBuilder* builder, const xla::XlaOp& input, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> out_size, const int64 channels,
+    const bool align_corners, bool is_kernel_bilinear) {
   // Picture for a 1x3 to 1x4 bilinear resize:
   // stride = 2, kernel size = 3
   // Input:
@@ -287,7 +314,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
   // Split convolutions into independent dimensions if they would be a very
   // large kernel.
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
                                                 channels, is_kernel_bilinear);
     output =
         xla::ConvGeneralDilated(input_data, kernel, dims.stride,
@@ -299,7 +326,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
                                 /*feature_group_count=*/channels);
   } else {
     xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         input_data, kernel0, {dims.stride[0], 1},
         /*padding=*/
@@ -308,7 +335,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
     xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         output, kernel1, {1, dims.stride[1]},
         /*padding=*/
@@ -320,19 +347,14 @@ xla::XlaOp ResizeUsingDilationAndConvolution(
 
   // Add broadcasts to handle expanding from a size == 1 dimension to a
   // size > 1 dimension.
-  for (int i = 0; i < num_spatial_dims; ++i) {
-    if (in_size[i] == 1 && out_size[i] > 1) {
-      output = xla::Add(output, xla::ConstantR1<float>(builder, out_size[i], 0),
-                        /*broadcast_dimensions=*/{1 + i});
-    }
-  }
-  return output;
+  return BroadcastSpatialDimensions(
+      builder, output, /*spatial_dimensions_offset=*/1, in_size, out_size);
 }
 
 xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
-    xla::XlaBuilder* builder, const xla::XlaOp& grad,
-    const int num_spatial_dims, std::vector<int64> in_size,
-    std::vector<int64> grad_size, const int64 channels,
+    xla::XlaBuilder* builder, const xla::XlaOp& grad, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> grad_size, const int64 channels,
     const bool align_corners, bool is_kernel_bilinear) {
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, grad_size, align_corners);
@@ -353,19 +375,14 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
   xla::XlaOp output;
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, dims.kernel_size,
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
                                                 channels, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a size == 1
     // dimension to a size > 1 dimension. This has the effect of summing the
     // gradient contributions in that dimension.
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      if (in_size[i] == 1 && grad_size[i] > 1) {
-        kernel =
-            xla::Add(kernel, xla::ConstantR1<float>(builder, grad_size[i], 0),
-                     /*broadcast_dimensions=*/{i});
-      }
-    }
+    kernel = BroadcastSpatialDimensions(
+        builder, kernel, /*spatial_dimensions_offset=*/0, in_size, grad_size);
 
     output = xla::ConvGeneralDilated(
         grad, kernel, /*window_strides=*/dims.kernel_size,
@@ -377,22 +394,22 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
         /*feature_group_count=*/channels);
   } else {
     xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 0, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
     xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
-        builder, dims.kernel_size, channels, 1, is_kernel_bilinear);
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a
     // size == 1 dimension to a size > 1 dimension. This has the effect of
     // summing the gradient contributions in that dimension.
     if (in_size[0] == 1 && grad_size[0] > 1) {
-      kernel0 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[0], 0),
-                   /*broadcast_dimensions=*/{0});
+      kernel0 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0, {1},
+                                           {grad_size[0]});
     }
     if (in_size[1] == 1 && grad_size[1] > 1) {
-      kernel1 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[1], 0),
-                   /*broadcast_dimensions=*/{1});
+      kernel1 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0,
+                                           in_size, grad_size);
     }
 
     output = xla::ConvGeneralDilated(
@@ -423,7 +440,7 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
     }
   }
   if (pad_output) {
-    output = xla::Pad(output, xla::ConstantR0<float>(builder, 0.0f), padding);
+    output = xla::Pad(output, xla::Zero(builder, type), padding);
   }
   return output;
 }
@@ -458,6 +475,7 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
   const int num_spatial_dims = 2;
 
   xla::XlaOp input = ctx->Input(0);
+  xla::PrimitiveType input_type = ctx->input_xla_type(0);
 
   // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
   // dimension i.
@@ -475,8 +493,11 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
                        {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
   }
 
-  // Output is always type float.
-  input = xla::ConvertElementType(input, xla::F32);
+  // Output is always type float if 'is_kernel_bilinear' is true.
+  if (is_kernel_bilinear) {
+    input = xla::ConvertElementType(input, xla::F32);
+    input_type = xla::F32;
+  }
 
   // Special Case:
   // Instead of doing a ResizeUsingDilationAndConvolution directly,
@@ -504,19 +525,19 @@ void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
         std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
                                             (in_size[1] - 1) * 2 + 1};
         output = ResizeUsingDilationAndConvolution(
-            b, input, num_spatial_dims, in_size, next_out_size, channels,
-            align_corners_, is_kernel_bilinear);
+            b, input, input_type, num_spatial_dims, in_size, next_out_size,
+            channels, align_corners_, is_kernel_bilinear);
         input = output;
         in_size = next_out_size;
       } else {
         output = ResizeUsingDilationAndConvolution(
-            b, input, num_spatial_dims, in_size, out_size, channels,
+            b, input, input_type, num_spatial_dims, in_size, out_size, channels,
             align_corners_, is_kernel_bilinear);
         in_size = out_size;
       }
     } else {
       output = ResizeUsingDilationAndConvolution(
-          b, input, num_spatial_dims, in_size, out_size, channels,
+          b, input, input_type, num_spatial_dims, in_size, out_size, channels,
           align_corners_, is_kernel_bilinear);
       in_size = out_size;
     }
@@ -631,19 +652,19 @@ class ResizeBilinearGradOp : public XlaOpKernel {
           std::vector<int64> next_grad_size = {(in_size[0] - 1) * 2 + 1,
                                                (in_size[1] - 1) * 2 + 1};
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, next_grad_size, channels,
-              align_corners_, true);
+              b, grad, xla::F32, num_spatial_dims, in_size, next_grad_size,
+              channels, align_corners_, true);
           grad = output;
           in_size = next_grad_size;
         } else {
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, grad_size, channels,
+              b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
               align_corners_, true);
           in_size = grad_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolutionGradOp(
-            b, grad, num_spatial_dims, in_size, grad_size, channels,
+            b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
             align_corners_, true);
         in_size = grad_size;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index 978e9480eac5b522d1ee2d51a61841c6f1bbba0c..c1539f48d4f729510b2d930de91666a7c31f1ef0 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -22,12 +22,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 XlaArgMinMaxOp::XlaArgMinMaxOp(OpKernelConstruction* ctx, bool is_min)
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index 30b993045c86c6d01f8eabe55986f132f8938643..e4bbdef6480104a1051acfc647644deb65c80171 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -20,12 +20,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 47cf8c6675bc120653c2a5ab6d4b07376dc382ee..39d96e748b3a2a852c03c0dd53ec175f0c66a43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -25,9 +25,6 @@ limitations under the License.
 namespace tensorflow {
 
 EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
-  // data is managed by the JIT code so msan can't tell it's initialized.
-  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 2 * sizeof(void*));
-
   float* input = static_cast<float*>(data[0]);
   int64 input_size = *static_cast<int64*>(data[1]);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index 93f029731c34e84000a3dc00df8af05654cccf2d..7f25d34c3ef82e5360fd2d7c1cd12dd8c6f40507 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index 90c0ebefb24ec2c4378782e9b15d3f57c33032a4..5a6569c8954d1686dc9d7577a66feb720241ea13 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -15,7 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -31,7 +32,10 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
-        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+        /*lower=*/lower_, /*unit_diagonal=*/false,
+        /*transpose_a=*/
+        adjoint_ ? xla::TriangularSolveOptions::ADJOINT
+                 : xla::TriangularSolveOptions::NO_TRANSPOSE);
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc
index 65ab9da8d7ca0509a4a69c43727a0e6c0435908a..da50b75251beb2f97400cc7d2ffb5f4d05a3fb6e 100644
--- a/tensorflow/compiler/tf2xla/kernels/no_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc
@@ -13,12 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {
 
+namespace {
+
+class NoOp : public OpKernel {
+ public:
+  explicit NoOp(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+  bool IsExpensive() override { return false; }
+};
+
+}  // namespace
+
 // XLA_* devices also register a "real" NoOp operator so we suppress the
 // dummy operator using CompilationOnly().
 REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
index a9b519d8928cc2807831fd6b4f12e60b7d58ea55..6ca100a2f2bf90e1d61829aa45a44cbc97090ed1 100644
--- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
@@ -24,13 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 06c6cc37ec90192486ba15010bfeb763a9ffb987..85223795aa8da93964efc0252eb34df3ebb6df3f 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -26,11 +26,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/pooling_ops_common.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 01b047f732f0e9fb3b45b272e7886e2f8cf4fff4..d6c70d4af1c2e921b70b0869f0163c8481017c7d 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -279,9 +280,9 @@ class TruncatedNormalOp : public XlaOpKernel {
 
     xla::XlaBuilder* b = ctx->builder();
 
-    xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype, 1.0);
+    xla::XlaOp one = xla::One(b, xla_shape.element_type());
     xla::XlaOp min_positive =
-        XlaHelpers::FloatLiteral(b, dtype, std::numeric_limits<float>::min());
+        xla::MinPositiveNormalValue(b, xla_shape.element_type());
     auto uniform = xla::RngUniform(min_positive, one, xla_shape);
     ctx->SetOutput(0, TruncatedNormal(uniform));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.h b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d107be6f13c48e26a4ba67fefa641c6ce811aa80
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_
+
+#include <cmath>
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+// Converts to bfloat16 if `dtype` equals DT_BFLOAT16, no-op otherwise.
+// It masks the last 16 bit. With normal rounding, values near "maxval" would be
+// converted to "maxval" which is out of range ["minval", "maxval"). In
+// addition, the distribution near the limit is not uniform.
+xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_RANDOM_OPS_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index e4046c795577983bff1a8053743bf4d3a258e583..1f417037284c87753b219ea5ce1d4edce0ce6336 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -37,10 +37,14 @@ class RetvalOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     const Tensor& input = ctx->op_kernel_context()->input(0);
 
-    OP_REQUIRES(ctx, input.dtype() == dtype_,
-                errors::InvalidArgument(
-                    "Type mismatch: actual ", DataTypeString(input.dtype()),
-                    " vs. expect ", DataTypeString(dtype_)));
+    // DT_VARIANT types represent Tensor Lists and are wrapped in a DT_UINT8
+    // tensor so we skip the check here.
+    if (dtype_ != DT_VARIANT) {
+      OP_REQUIRES(ctx, input.dtype() == dtype_,
+                  errors::InvalidArgument(
+                      "Type mismatch: actual ", DataTypeString(input.dtype()),
+                      " vs. expect ", DataTypeString(dtype_)));
+    }
     auto frame = ctx->call_frame();
     if (frame) {
       // If 'frame' is non-null, this is an inner function call inside a JIT
@@ -59,8 +63,9 @@ class RetvalOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
 };
 
-REGISTER_XLA_OP(Name("_Retval").AllowResourceTypes().CompilationOnly(),
-                RetvalOp);
+REGISTER_XLA_OP(
+    Name("_Retval").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    RetvalOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index d7b38e86cc985d608116488f9e76756a8e904f9c..4d73469fb1858a252906bca190402ab8743e4cfb 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -67,84 +67,59 @@ class ReverseSequenceOp : public XlaOpKernel {
       return;
     }
 
-    // Given the input
-    //
-    // 012345
-    // 6789AB
-    //
-    // and sequence lens {2, 3} we:
-    //
-    // 1. Reverse and pad each row to get
-    //
-    //    543210XXXXXX
-    //    BA9876XXXXXX
-    //
-    // 2. Gather out the suffix from each row to get
-    //
-    //    10XXXX
-    //    876XXX
-    //
-    // 3. Select from the input and the array created by (2) to get the result.
-    //
-    //    102345
-    //    8769AB
-    const xla::PrimitiveType input_type = context->input_xla_type(0);
     const xla::PrimitiveType seq_lens_type = context->input_xla_type(1);
     const int64 max_seq_len = input_shape.dim_size(seq_dim_);
 
-    xla::XlaOp rev = xla::Rev(input, {seq_dim_});
-
-    auto padding_config = xla::MakeNoPaddingConfig(input_shape.dims());
-    padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high(
-        max_seq_len);
-    xla::XlaOp padded =
-        xla::Pad(rev, xla::Zero(builder, input_type), padding_config);
-
-    // Form a start indices tensor with shape [2, batch_size]. For each batch
-    // entry we have a (batch offset, seq offset) pair.
-    xla::XlaOp start_indices = xla::ConcatInDim(
+    // Create [batch, sequence, 2] tensor that contains the indices where the
+    // real data belongs
+    xla::XlaOp back = xla::Sub(seq_lens, xla::ScalarLike(seq_lens, 1));
+    xla::XlaOp batch_idx = xla::Iota(
         builder,
-        {
-            xla::Iota(builder,
-                      xla::ShapeUtil::MakeShape(seq_lens_type, {1, batch_size}),
-                      /*iota_dimension=*/1),
-            xla::Reshape(xla::ScalarLike(seq_lens, max_seq_len) - seq_lens,
-                         {1, batch_size}),
-        },
-        /*dimension=*/0);
+        xla::ShapeUtil::MakeShape(seq_lens_type, {batch_size, max_seq_len, 1}),
+        /*iota_dimension=*/0);
+    xla::XlaOp forward_idx = xla::Iota(
+        builder,
+        xla::ShapeUtil::MakeShape(seq_lens_type, {batch_size, max_seq_len, 1}),
+        /*iota_dimension=*/1);
+    xla::XlaOp reverse_idx = xla::Sub(back, forward_idx, {0});
+    reverse_idx = xla::Select(xla::Lt(reverse_idx, xla::ZerosLike(reverse_idx)),
+                              forward_idx, reverse_idx);
+    if (batch_dim_ > seq_dim_) {
+      // The output of the XLA gather op keeps indices dimensions in the same
+      // order as they appear in the input. If the batch_dim_ needs to be after
+      // the seq_dim_ in the output, it also needs to be that way in the input
+      // so we transpose.
+      batch_idx = xla::Transpose(batch_idx, {1, 0, 2});
+      forward_idx = xla::Transpose(forward_idx, {1, 0, 2});
+      reverse_idx = xla::Transpose(reverse_idx, {1, 0, 2});
+    }
+    xla::XlaOp start_indices =
+        xla::ConcatInDim(builder, {batch_idx, reverse_idx},
+                         /*dimension=*/2);
 
     xla::GatherDimensionNumbers dnums;
-    // The first dimension of start_indices contains the batch/seq dim choice.
-    dnums.set_index_vector_dim(0);
+    dnums.set_index_vector_dim(2);
+    // The first and second element in the third dimension of reverse_idx are
+    // the batch_dim_ offset and the seq_dim_ offset respectively.
     dnums.add_start_index_map(batch_dim_);
     dnums.add_start_index_map(seq_dim_);
 
-    // All other dimensions other than the batch dim are offset dimensions.
+    // batch_dim_ and seq_dim_ are collapsed and the other dimensions are kept
+    // in the gather.
     for (int i = 0; i < input_shape.dims(); ++i) {
-      if (i != batch_dim_) {
+      if (i != batch_dim_ && i != seq_dim_) {
         dnums.add_offset_dims(i);
+      } else {
+        dnums.add_collapsed_slice_dims(i);
       }
     }
-    dnums.add_collapsed_slice_dims(batch_dim_);
 
     auto slice_sizes = input_shape.dim_sizes();
     slice_sizes[batch_dim_] = 1;
+    slice_sizes[seq_dim_] = 1;
 
-    xla::XlaOp output = xla::Gather(padded, start_indices, dnums, slice_sizes);
-
-    // Mask out elements after the sequence length, and copy the corresponding
-    // elements from the input.
-    xla::XlaOp iota = xla::Iota(builder, seq_lens_type, max_seq_len);
-    std::vector<int64> dims(input_shape.dims(), 1);
-    dims[batch_dim_] = batch_size;
-    auto mask = xla::Lt(iota, xla::Reshape(seq_lens, dims), {seq_dim_});
-
-    // Broadcast the mask up to the input shape.
-    mask = xla::Or(mask, xla::Broadcast(xla::ConstantR0<bool>(builder, false),
-                                        input_shape.dim_sizes()));
-
-    output = xla::Select(mask, output, input);
-    context->SetOutput(0, output);
+    context->SetOutput(0,
+                       xla::Gather(input, start_indices, dnums, slice_sizes));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index 4b9e1a578be2445091228953df7e5c5e82b42c28..8431724f438f67c07740212e1e31926777fef3ae 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -23,14 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index a95e7adacf194ba6eb33cbeb56abe1a5a2479337..a1c18bed3f94008af8038f32324c79aa5b2abded 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -110,10 +110,16 @@ class ScatterNdOp : public XlaOpKernel {
     auto updates = context->Input(1);
     auto result =
         XlaScatter(buffer, updates, indices,
-                   /*indices_are_vectors=*/true, /*combiner=*/{}, builder);
+                   /*indices_are_vectors=*/true, /*combiner=*/Combine, builder);
     OP_REQUIRES_OK(context, result.status());
     context->SetOutput(0, result.ValueOrDie());
   }
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Add(x, y);
+  }
 };
 
 REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstantInput("shape"),
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 9e4c57c9bf73369662274f6b783418e18ff860c2..aaf8c6075dd292e33e70683774a6c1bf374183e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index b1fa2915d59e4e5e2f2523e20e9a37898d087117..7a620d2a6518f8686ef570b33aac971d1dccb6c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -157,9 +157,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const float step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
@@ -171,9 +173,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const double step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 85b0367f73cf31b95e2cd2297e9c1476cfac9d50..280b68383c28d1b9d88f7b2ac0f8fab47244c05d 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -104,7 +104,7 @@ class SizeOp : public XlaOpKernel {
     for (int64 i = 0; i < rank; ++i) {
       size = xla::Mul(size, xla::GetDimensionSize(ctx->Input(0), i));
     }
-    size = xla::ConvertElementType(size, xla::S32);
+    size = xla::ConvertElementType(size, ctx->output_xla_type(0));
     ctx->SetOutput(0, size);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
index 76ea5f525598f511f295eb5a30f3cf603fbf57aa..b18e3f965c427aec456ce2b188dad79485df23cc 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 88da64e5a217a0c026106f03cb26958f6738446c..1be651da4704d5be1ce0a33312b6a67158a60285 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mem.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 3293c13b21bc4825c83f494b7f2d48a9b3000f9e..96863d6d1bae77dedfd02fe6469e53b311b4269a 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/lib/data_format.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -30,11 +31,6 @@ class SpaceToDepthOp : public XlaOpKernel {
     OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
 
-    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument("Unsupported data format ",
-                                        ToString(data_format_),
-                                        "; expected formats NHWC or NCHW"));
-
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -42,19 +38,36 @@ class SpaceToDepthOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_tensor_shape = ctx->InputShape(0);
-    int input_rank = input_tensor_shape.dims();
+    xla::XlaOp input = ctx->Input(0);
+
+    TensorFormat data_format = data_format_;
+    // If the data is in a vectorized format, reformat it into a non-vectorized
+    // version first. We'll undo the transformation later.
+    if (data_format == FORMAT_NCHW_VECT_C) {
+      data_format = FORMAT_NCHW;
+      auto input_reshaped = NCHW_VECT_CToNCHW(input);
+      OP_REQUIRES_OK(ctx, input_reshaped.status());
+      input = input_reshaped.ValueOrDie();
+    }
+
+    OP_REQUIRES(ctx, data_format == FORMAT_NCHW || data_format == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_)));
+
+    xla::XlaBuilder* builder = input.builder();
+    auto input_xla_shape = builder->GetShape(input);
+    OP_REQUIRES_OK(ctx, input_xla_shape.status());
+    const std::vector<int64>& input_shape =
+        input_xla_shape.ValueOrDie().dimensions();
+    int input_rank = input_shape.size();
+
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
                 errors::InvalidArgument("Input rank should be ", kRequiredDims,
                                         "; got ", input_rank));
-    const absl::InlinedVector<int64, 4> input_shape =
-        input_tensor_shape.dim_sizes();
-
-    xla::XlaOp input = ctx->Input(0);
 
-    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
-    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format);
 
     std::vector<int64> reshaped_shape;
     std::vector<int64> transpose_order;
@@ -62,7 +75,7 @@ class SpaceToDepthOp : public XlaOpKernel {
     reshaped_shape.reserve(input_rank);
     transpose_order.reserve(input_rank);
     output_shape.reserve(input_rank);
-    if (data_format_ == FORMAT_NHWC) {
+    if (data_format == FORMAT_NHWC) {
       int64 block_elems = 1;
       for (int i = 0; i < num_spatial_dims; ++i) {
         OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
@@ -157,6 +170,14 @@ class SpaceToDepthOp : public XlaOpKernel {
     //
     xla::XlaOp output = xla::Reshape(permuted_reshaped, output_shape);
 
+    // If this used to be a vectorized format turn it back now.
+    if (data_format != data_format_) {
+      DCHECK(data_format == FORMAT_NCHW && data_format_ == FORMAT_NCHW_VECT_C);
+      auto output_reshaped = NCHWToNCHW_VECT_C(output);
+      OP_REQUIRES_OK(ctx, output_reshaped.status());
+      output = output_reshaped.ValueOrDie();
+    }
+
     ctx->SetOutput(0, output);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index d0c5231e843aefa68490e29475ee96bd92859aac..a93d137e96519837ae289f08ff4d32960970aad9 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -24,14 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1d68835e1271b9e83d98eff6c3973d2a2593e5e
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc
@@ -0,0 +1,362 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+
+#include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h"
+#include "tensorflow/compiler/tf2xla/lib/random.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/prng.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+std::pair<xla::ThreeFry2x32State, xla::XlaOp> GetInputsFromCounter(
+    xla::XlaOp counter, const int64 size) {
+  auto builder = counter.builder();
+  auto input_u64 = Iota(builder, xla::U64, size);
+  input_u64 = input_u64 + counter;
+  counter = counter + xla::ConstantR0<uint64>(builder, size);
+  return std::make_pair(xla::Uint64ToUint32s(input_u64), counter);
+}
+
+// `StatelessRngUniformU32` uses ThreeFry2x32’s counter space too
+// wastefully, only able to generate 2^32*2 int32 numbers for each key, while
+// the real capacity is 2^64*2. Counter-space efficiency is important for
+// stateful ops, hence the following 2 new functions.
+std::pair<xla::XlaOp, xla::XlaOp> StatefulRngUniformU32(
+    xla::XlaOp key, xla::XlaOp counter, const xla::Shape& shape) {
+  auto builder = key.builder();
+  const int64 size = xla::ShapeUtil::ElementsIn(shape);
+  const int64 half_size = xla::CeilOfRatio<int64>(size, 2);
+  const bool size_is_odd = (half_size * 2 != size);
+  auto inputs_counter = GetInputsFromCounter(counter, half_size);
+  auto inputs = inputs_counter.first;
+  counter = inputs_counter.second;
+  auto outputs = xla::ThreeFry2x32(inputs, xla::Uint64ToUint32s(key));
+  if (size_is_odd) {
+    outputs[1] = Slice(outputs[1], {0}, {half_size - 1}, {1});
+  }
+  auto result = ConcatInDim(builder, outputs, 0);
+  return std::make_pair(Reshape(result, xla::AsInt64Slice(shape.dimensions())),
+                        counter);
+}
+
+std::pair<xla::XlaOp, xla::XlaOp> StatefulRngUniformU64(
+    xla::XlaOp key, xla::XlaOp counter, const xla::Shape& shape) {
+  const int64 size = xla::ShapeUtil::ElementsIn(shape);
+  auto inputs_counter = GetInputsFromCounter(counter, size);
+  auto inputs = inputs_counter.first;
+  counter = inputs_counter.second;
+  auto outputs = ThreeFry2x32(inputs, Uint64ToUint32s(key));
+  auto result = Uint32sToUint64(outputs);
+  return std::make_pair(Reshape(result, xla::AsInt64Slice(shape.dimensions())),
+                        counter);
+}
+
+std::pair<xla::XlaOp, xla::XlaOp> StatefulRngUniform(xla::XlaOp key,
+                                                     xla::XlaOp counter,
+                                                     const xla::Shape& shape,
+                                                     xla::XlaOp minval,
+                                                     xla::XlaOp maxval) {
+  auto builder = key.builder();
+  xla::PrimitiveType type = shape.element_type();
+  switch (type) {
+    case xla::F32: {
+      auto bits_counter = StatefulRngUniformU32(key, counter, shape);
+      auto bits = bits_counter.first;
+      counter = bits_counter.second;
+      return std::make_pair(xla::StatelessRngUniformF32(bits, minval, maxval),
+                            counter);
+    }
+    case xla::U32:  // fall through
+    case xla::S32: {
+      auto bits_counter = StatefulRngUniformU32(key, counter, shape);
+      auto bits = bits_counter.first;
+      counter = bits_counter.second;
+      return std::make_pair(
+          xla::StatelessRngUniformInt(bits, minval, maxval, type, xla::U32),
+          counter);
+    }
+    case xla::U64:  // fall through
+    case xla::S64: {
+      auto bits_counter = StatefulRngUniformU64(key, counter, shape);
+      auto bits = bits_counter.first;
+      counter = bits_counter.second;
+      return std::make_pair(
+          xla::StatelessRngUniformInt(bits, minval, maxval, type, xla::U64),
+          counter);
+    }
+    default:
+      return std::make_pair(builder->ReportError(xla::Unimplemented(
+                                "Types other than F32, U32, S32, U64 and S64 "
+                                "are not implemented by "
+                                "StatefulRngUniform.")),
+                            counter);
+  }
+}
+
+template <typename A, typename B, typename A2>
+std::pair<A2, B> map_first(std::function<A2(A)> f, std::pair<A, B> p) {
+  return std::make_pair(f(p.first), p.second);
+}
+
+std::pair<xla::XlaOp, xla::XlaOp> StatefulRngUniformFullInt(
+    xla::XlaOp key, xla::XlaOp counter, const xla::Shape& shape) {
+  xla::PrimitiveType type = shape.element_type();
+  switch (type) {
+    case xla::U32:
+      return StatefulRngUniformU32(key, counter, shape);
+    case xla::S32: {
+      // Needs explicit function type because of type-inference failure.
+      std::function<xla::XlaOp(xla::XlaOp)> f = [](xla::XlaOp x) {
+        return BitcastConvertType(x, xla::S32);
+      };
+      return map_first(f, StatefulRngUniformU32(key, counter, shape));
+    }
+    case xla::U64:
+      return StatefulRngUniformU64(key, counter, shape);
+    case xla::S64: {
+      std::function<xla::XlaOp(xla::XlaOp)> f = [](xla::XlaOp x) {
+        return BitcastConvertType(x, xla::S64);
+      };
+      return map_first(f, StatefulRngUniformU64(key, counter, shape));
+    }
+    default:
+      auto builder = key.builder();
+      return std::make_pair(
+          builder->ReportError(xla::Unimplemented(
+              "Types other than U32, S32, U64 and S64 are not implemented by "
+              "StatefulRngUniformFullInt; got: %s",
+              xla::primitive_util::LowercasePrimitiveTypeName(type))),
+          counter);
+  }
+}
+
+template <typename ListB, typename ListA, typename F>
+ListB Map(F f, ListA const& list_a) {
+  ListB list_b;
+  for (auto a : list_a) {
+    list_b.push_back(f(a));
+  }
+  return list_b;
+}
+
+xla::XlaOp ConcatScalars(xla::XlaBuilder* builder,
+                         absl::Span<const xla::XlaOp> scalars) {
+  return ConcatInDim(
+      builder,
+      Map<std::vector<xla::XlaOp>>(
+          [](xla::XlaOp x) { return xla::Reshape(x, {1}); }, scalars),
+      0);
+}
+
+using sampler_return_type = xla::StatusOr<std::pair<xla::XlaOp, xla::XlaOp>>;
+
+// A helper function containing the common part of several kernels below.
+// Precondition: 'algorithm' and 'shape' are compile-time constants.
+Status CompileImpl(XlaOpKernelContext* ctx, int state_input_idx,
+                   int alg_input_idx, int shape_input_idx,
+                   std::function<sampler_return_type(xla::XlaOp, xla::XlaOp,
+                                                     TensorShape)> const&
+                       sample_with_threefry) {
+  auto alg_shape = ctx->InputShape(alg_input_idx);
+  if (alg_shape.dims() != 0) {
+    return errors::InvalidArgument("algorithm must be of shape [], not ",
+                                   alg_shape.DebugString());
+  }
+  xla::Literal alg_literal;
+  TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal));
+  auto alg = alg_literal.Get<Algorithm>({});
+
+  if (alg == RNG_ALG_THREEFRY) {
+    xla::XlaOp var;
+    TensorShape var_shape;
+    TF_RETURN_IF_ERROR(ctx->ReadVariableInput(
+        state_input_idx, STATE_ELEMENT_DTYPE, &var_shape, &var));
+    if (var_shape.dims() != 1) {
+      return errors::InvalidArgument(
+          "RNG state must have one and only one dimension, not ",
+          var_shape.dims());
+    }
+    auto state_size = var_shape.dim_size(0);
+    if (state_size < THREEFRY_MIN_STATE_SIZE) {
+      return errors::InvalidArgument(
+          "For the ThreeFry algorithm, the size of state"
+          " must be at least ",
+          THREEFRY_MIN_STATE_SIZE, "; got ", state_size);
+    }
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(ctx->ConstantInputAsShape(shape_input_idx, &shape));
+
+    static constexpr int COUNTER_SIZE = 1;
+    auto counter = BitcastConvertType(
+        xla::Reshape(xla::Slice(var, {0}, {COUNTER_SIZE}, {1}), {}), xla::U64);
+    auto key = BitcastConvertType(
+        xla::Reshape(xla::Slice(var, {COUNTER_SIZE}, {COUNTER_SIZE + 1}, {1}),
+                     {}),
+        xla::U64);
+
+    auto status_or_value = sample_with_threefry(counter, key, shape);
+    if (!status_or_value.ok()) {
+      return status_or_value.status();
+    }
+    auto output_counter = status_or_value.ConsumeValueOrDie();
+    auto output = output_counter.first;
+    counter = output_counter.second;
+    ctx->SetOutput(0, output);
+    auto builder = ctx->builder();
+    var = ConcatScalars(builder, {counter, key});
+    xla::PrimitiveType state_element_type;
+    TF_RETURN_IF_ERROR(
+        DataTypeToPrimitiveType(STATE_ELEMENT_DTYPE, &state_element_type));
+    var = BitcastConvertType(var, state_element_type);
+    TF_RETURN_IF_ERROR(
+        ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var));
+    return Status::OK();
+  } else {
+    return errors::InvalidArgument("Unsupported algorithm id: ", alg);
+  }
+}
+
+class StatefulStandardNormalOp : public XlaOpKernel {
+ public:
+  explicit StatefulStandardNormalOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto builder = ctx->builder();
+    auto sample_with_threefry =
+        // Needs explicit lambda return type because it fails to be inferred.
+        [builder, this](xla::XlaOp counter, xla::XlaOp key,
+                        TensorShape shape) -> sampler_return_type {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
+
+      auto uniform_counter = StatefulRngUniform(
+          key, counter, xla_shape,
+          xla::ConstantR0<float>(builder, std::nextafter(-1.0f, 0.0f)),
+          xla::ConstantR0<float>(builder, 1.0));
+      auto uniform = uniform_counter.first;
+      counter = uniform_counter.second;
+      // Convert uniform distribution to normal distribution by computing
+      // sqrt(2) * erfinv(x)
+      auto normal =
+          xla::ScalarLike(uniform, std::sqrt(2.0)) * xla::ErfInv(uniform);
+      normal = MaybeConvertF32ToBF16(normal, dtype_);
+      return {{normal, counter}};
+    };
+    OP_REQUIRES_OK(ctx,
+                   CompileImpl(ctx, /*state_input_idx=*/0, /*alg_input_idx=*/1,
+                               /*shape_input_idx=*/2, sample_with_threefry));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatefulStandardNormalOp);
+};
+
+// TODO(wangpeng): Support plain float16 and float64 to get rid of the
+//   `TypeConstraint`.
+REGISTER_XLA_OP(Name("StatefulStandardNormalV2")
+                    .CompileTimeConstantInput("algorithm")
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype", {DT_FLOAT, DT_BFLOAT16}),
+                StatefulStandardNormalOp);
+
+class StatefulUniformIntOp : public XlaOpKernel {
+ public:
+  explicit StatefulUniformIntOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp minval = ctx->Input(3);
+    xla::XlaOp maxval = ctx->Input(4);
+    auto sample_with_threefry = [minval, maxval, this](
+                                    xla::XlaOp counter, xla::XlaOp key,
+                                    TensorShape shape) -> sampler_return_type {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype_, shape, &xla_shape));
+      return StatefulRngUniform(key, counter, xla_shape, minval, maxval);
+    };
+    OP_REQUIRES_OK(ctx,
+                   CompileImpl(ctx, /*state_input_idx=*/0, /*alg_input_idx=*/1,
+                               /*shape_input_idx=*/2, sample_with_threefry));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatefulUniformIntOp);
+};
+
+REGISTER_XLA_OP(Name("StatefulUniformInt")
+                    .CompileTimeConstantInput("algorithm")
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype",
+                                    {DT_INT32, DT_UINT32, DT_INT64, DT_UINT64}),
+                StatefulUniformIntOp);
+
+class StatefulUniformFullIntOp : public XlaOpKernel {
+ public:
+  explicit StatefulUniformFullIntOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto sample_with_threefry = [this](
+                                    xla::XlaOp counter, xla::XlaOp key,
+                                    TensorShape shape) -> sampler_return_type {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype_, shape, &xla_shape));
+      return StatefulRngUniformFullInt(key, counter, xla_shape);
+    };
+    OP_REQUIRES_OK(ctx,
+                   CompileImpl(ctx, /*state_input_idx=*/0, /*alg_input_idx=*/1,
+                               /*shape_input_idx=*/2, sample_with_threefry));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatefulUniformFullIntOp);
+};
+
+REGISTER_XLA_OP(Name("StatefulUniformFullInt")
+                    .CompileTimeConstantInput("algorithm")
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype",
+                                    {DT_INT32, DT_UINT32, DT_INT64, DT_UINT64}),
+                StatefulUniformFullIntOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 50653d7b3973b73d580cdeec5d71943b575d7cc9..e143a711730720c0566f079e00965d876c869a99 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 
+#include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h"
 #include "tensorflow/compiler/tf2xla/lib/random.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -31,12 +32,8 @@ limitations under the License.
 #include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
-namespace {
 
 xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) {
-  // Mask the last 16 bit. With normal rounding, values near "maxval" would be
-  // converted to "maxval" which is out of range ["minval", "maxval"). In
-  // addition, the distribution near the limit is not uniform.
   if (dtype == DT_BFLOAT16) {
     xla::XlaBuilder* builder = input.builder();
     auto output = xla::BitcastConvertType(input, xla::U32) &
@@ -48,6 +45,26 @@ xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) {
   }
 }
 
+xla::XlaOp Uniform2NormalUsingSqrtErfinv(xla::XlaOp uniform) {
+  // Convert uniform distribution to normal distribution by computing
+  // sqrt(2) * erfinv(x)
+  return xla::ScalarLike(uniform, std::sqrt(2.0)) * xla::ErfInv(uniform);
+}
+
+// A wrapper of xla::StatelessRngUniform. Returns an op that produces random
+// values with uniform distribution in the range [minval, maxval) for the given
+// shape and given two 32-bit seeds. Currently only shapes of type F32, S32 and
+// S64 are implemented.
+xla::XlaOp StatelessRandomUniformImpl(const xla::Shape& shape, DataType dtype,
+                                      xla::XlaOp seed, xla::XlaOp minval,
+                                      xla::XlaOp maxval) {
+  xla::XlaOp seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
+  xla::XlaOp seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
+  return xla::StatelessRngUniform({seed0, seed1}, shape, minval, maxval);
+}
+
+namespace {
+
 class StatelessRandomUniformOp : public XlaOpKernel {
  public:
   explicit StatelessRandomUniformOp(OpKernelConstruction* ctx)
@@ -69,12 +86,8 @@ class StatelessRandomUniformOp : public XlaOpKernel {
 
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
-
-    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
-    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
-
-    auto uniform = xla::StatelessRngUniform(
-        {seed0, seed1}, xla_shape, xla::ConstantR0<float>(builder, 0.0),
+    xla::XlaOp uniform = StatelessRandomUniformImpl(
+        xla_shape, dtype_, seed, xla::ConstantR0<float>(builder, 0.0),
         xla::ConstantR0<float>(builder, 1.0));
     uniform = MaybeConvertF32ToBF16(uniform, dtype_);
     ctx->SetOutput(0, uniform);
@@ -123,12 +136,8 @@ class StatelessRandomUniformIntOp : public XlaOpKernel {
 
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape));
-
-    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
-    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
-
-    auto uniform =
-        xla::StatelessRngUniform({seed0, seed1}, xla_shape, minval, maxval);
+    xla::XlaOp uniform =
+        StatelessRandomUniformImpl(xla_shape, dtype_, seed, minval, maxval);
     ctx->SetOutput(0, uniform);
   }
 
@@ -164,18 +173,11 @@ class StatelessRandomNormalOp : public XlaOpKernel {
     xla::XlaBuilder* builder = ctx->builder();
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
-
-    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
-    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
-
-    auto uniform = xla::StatelessRngUniform(
-        {seed0, seed1}, xla_shape,
+    xla::XlaOp uniform = StatelessRandomUniformImpl(
+        xla_shape, dtype_, seed,
         xla::ConstantR0<float>(builder, std::nextafter(-1.0f, 0.0f)),
         xla::ConstantR0<float>(builder, 1.0));
-    // Convert uniform distribution to normal distribution by computing
-    // sqrt(2) * erfinv(x)
-    auto normal =
-        xla::ScalarLike(uniform, std::sqrt(2.0)) * xla::ErfInv(uniform);
+    xla::XlaOp normal = Uniform2NormalUsingSqrtErfinv(uniform);
     normal = MaybeConvertF32ToBF16(normal, dtype_);
     ctx->SetOutput(0, normal);
   }
@@ -211,18 +213,15 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
     xla::XlaOp seed = ctx->Input(1);
     xla::XlaBuilder* builder = ctx->builder();
 
-    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
-    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
-
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
-    auto uniform = xla::StatelessRngUniform(
-        {seed0, seed1}, xla_shape,
-        xla::ConstantR0<float>(builder, std::numeric_limits<float>::min()),
-        xla::ConstantR0<float>(builder, 1.0));
-    auto output = TruncatedNormal(uniform);
-    output = MaybeConvertF32ToBF16(output, dtype_);
-    ctx->SetOutput(0, output);
+    xla::XlaOp uniform = StatelessRandomUniformImpl(
+        xla_shape, dtype_, seed,
+        xla::MinPositiveNormalValue(builder, xla_shape.element_type()),
+        xla::One(builder, xla_shape.element_type()));
+    xla::XlaOp truncated_normal = TruncatedNormal(uniform);
+    truncated_normal = MaybeConvertF32ToBF16(truncated_normal, dtype_);
+    ctx->SetOutput(0, truncated_normal);
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 2273b592466431f59abcc43fcac4c37eecd53bff..9da1504bff12b54c9ae10cb5c2fa00214642b551 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mem.h"
 
@@ -291,7 +291,7 @@ class StridedSliceAssignOp : public XlaOpKernel {
     absl::InlinedVector<xla::XlaOp, 4> slice_begin;
     absl::InlinedVector<int64, 4> slice_dims;
     for (int i = 0; i < begin.size(); ++i) {
-      // TODO(phawkins): implement strides != 1
+      // TODO(b/121179231): implement strides != 1
       OP_REQUIRES(
           ctx, strides[i] == 1 || strides[i] == -1,
           errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 22e64789a3a0db104834baa8a2a914d2cd1742d1..b98b98ce50af2cb811297989899b06d33296bf13 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -27,14 +27,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 5dda62fd492a2e3bf9caf502c04f8ba8674a6510..9bc565e5d638bca682fadfc93044adb550c12893 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -18,36 +18,81 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+
 namespace {
 
-Status GetTensorListShape(xla::XlaBuilder* builder, xla::XlaOp op,
-                          TensorShape* tensor_list_shape) {
-  auto shape_or_status = builder->GetShape(op);
-  if (!shape_or_status.ok()) {
-    return shape_or_status.status();
+class TensorListLengthOp : public XlaOpKernel {
+ public:
+  explicit TensorListLengthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp index;
+    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(ctx->Input(0), &index));
+    ctx->SetOutput(0, index);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListLengthOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListLength"), TensorListLengthOp);
+
+// Creates an empty list with size (leading_dim, *element_shape) if
+// element_shape is known at compile time. Otherwise creates one with size
+// (leading_dim, 0) which gets initialized later in `GetInitializedList`.
+Status CreateZerosList(XlaOpKernelContext* ctx, int element_shape_index,
+                       int64 leading_dim, DataType dtype, xla::XlaOp* list) {
+  TensorShape list_shape;
+  list_shape.AddDim(leading_dim);
+  xla::XlaOp element_shape_handle = ctx->Input(element_shape_index);
+  TF_ASSIGN_OR_RETURN(
+      bool is_element_shape_compile_time_const,
+      element_shape_handle.builder()->IsConstant(element_shape_handle));
+  PartialTensorShape partial_element_shape;
+  if (is_element_shape_compile_time_const) {
+    TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(
+        element_shape_index, &partial_element_shape));
+  }
+  if (is_element_shape_compile_time_const &&
+      partial_element_shape.IsFullyDefined()) {
+    TensorShape element_shape;
+    partial_element_shape.AsTensorShape(&element_shape);
+    list_shape.AppendShape(element_shape);
+  } else {
+    // If element_shape is not a compile time constant or if it is not fully
+    // defined we will have to wait for the first write call to fully allocate
+    // the array.
+    // TODO(srbs): We are using element_shape of [0] as a proxy to denote an
+    // uninitialized list. A better implementation may be to represent the
+    // list as a 3-tuple containining an explicit "initialized" flag. However,
+    // we would still need to create a dummy tensor for the first tuple
+    // element.
+    list_shape.AddDim(0);
   }
-  xla::Shape shape = shape_or_status.ValueOrDie();
-  TF_RET_CHECK(shape.IsTuple());
-  return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
-                               tensor_list_shape);
+  *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
+                         list_shape.dim_sizes());
+  return Status::OK();
 }
 
 class TensorListReserveOp : public XlaOpKernel {
@@ -57,19 +102,18 @@ class TensorListReserveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
     int64 num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements));
 
-    TensorShape tensor_shape;
-    tensor_shape.AddDim(num_elements);
-    tensor_shape.AppendShape(element_shape);
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx, CreateZerosList(ctx, 0, num_elements, dtype_, &buffer));
 
-    xla::XlaBuilder* b = ctx->builder();
-    ctx->SetOutput(0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                                    tensor_shape.dim_sizes()),
-                                     xla::ConstantR0<int32>(b, 0)}));
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(
+        ctx, BuildTensorList(
+                 buffer, xla::ConstantR0<int32>(ctx->builder(), num_elements),
+                 &output_list));
+    ctx->SetTensorListOutput(0, output_list);
   }
 
  private:
@@ -90,8 +134,6 @@ class EmptyTensorListOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
     int64 max_num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements));
     OP_REQUIRES(
@@ -99,14 +141,15 @@ class EmptyTensorListOp : public XlaOpKernel {
         errors::InvalidArgument("XLA compilation requires a fixed tensor list "
                                 "size. Set the max number of elements."));
 
-    TensorShape tensor_shape;
-    tensor_shape.AddDim(max_num_elements);
-    tensor_shape.AppendShape(element_shape);
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx,
+                   CreateZerosList(ctx, 0, max_num_elements, dtype_, &buffer));
 
-    xla::XlaBuilder* b = ctx->builder();
-    ctx->SetOutput(0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                                    tensor_shape.dim_sizes()),
-                                     xla::ConstantR0<int32>(b, 0)}));
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(
+        ctx, BuildTensorList(buffer, xla::ConstantR0<int32>(ctx->builder(), 0),
+                             &output_list));
+    ctx->SetTensorListOutput(0, output_list);
   }
 
  private:
@@ -130,7 +173,7 @@ class TensorListElementShapeOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
     TensorShape shape;
-    OP_REQUIRES_OK(ctx, GetTensorListShape(b, ctx->Input(0), &shape));
+    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
     shape.RemoveDim(0);
 
     switch (shape_type_) {
@@ -160,6 +203,185 @@ class TensorListElementShapeOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("TensorListElementShape"), TensorListElementShapeOp);
 
+class TensorListGetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListGetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp state = ctx->Input(0);
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
+
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx, GetTensorListBuffer(state, &buffer));
+    xla::XlaOp index = ctx->Input(1);
+
+    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+    auto slice_shape = shape.dim_sizes();
+    slice_shape[0] = 1LL;
+
+    xla::XlaOp read = xla::DynamicSlice(buffer, start_indices, slice_shape);
+    // Remove the leading '1' dimension.
+    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
+
+    ctx->SetOutput(0, xla::Reshape(read, value_shape));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListGetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp);
+
+class TensorListStackOp : public XlaOpKernel {
+ public:
+  explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx, GetTensorListBuffer(ctx->Input(0), &buffer));
+    ctx->SetOutput(0, buffer);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListStackOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListStack"), TensorListStackOp);
+
+class TensorListFromTensorOp : public XlaOpKernel {
+ public:
+  explicit TensorListFromTensorOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsPartialShape(1, &element_shape));
+
+    const TensorShape tensor_shape = ctx->InputShape(0);
+    // Ensure that tensor_shape is compatible with element_shape.
+    PartialTensorShape unused;
+    OP_REQUIRES_OK(
+        ctx,
+        element_shape.MergeWith(
+            PartialTensorShape(
+                absl::Span<const int64>(tensor_shape.dim_sizes()).subspan(1)),
+            &unused));
+    OP_REQUIRES(ctx, tensor_shape.dims() > 0,
+                errors::InvalidArgument("Input value must be at least a "
+                                        "vector but received shape: ",
+                                        tensor_shape.DebugString()));
+    const int num_elements = tensor_shape.dim_size(0);
+
+    xla::XlaBuilder* b = ctx->builder();
+    const xla::XlaOp tensor = ctx->Input(0);
+
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(
+        ctx, BuildTensorList(tensor, xla::ConstantR0<int32>(b, num_elements),
+                             &output_list));
+    ctx->SetTensorListOutput(0, output_list);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListFromTensorOp);
+};
+
+REGISTER_XLA_OP(
+    Name("TensorListFromTensor").CompileTimeConstantInput("element_shape"),
+    TensorListFromTensorOp);
+
+// Returns the 0'th element of `tuple` containing the list tensor if it has been
+// initialized already else creates one lazily. This allows lazy initialization
+// of the list on the first call to SetItem or PushBack.
+Status GetInitializedList(const xla::XlaOp& input_list,
+                          const TensorShape& element_shape, DataType dtype,
+                          xla::XlaOp* output_list_buffer) {
+  bool is_already_initialized;
+  TF_RETURN_IF_ERROR(
+      IsTensorListInitialized(input_list, &is_already_initialized));
+  TensorShape input_list_shape;
+  TF_RETURN_IF_ERROR(GetTensorListBufferShape(input_list, &input_list_shape));
+  TensorShape input_list_element_shape = input_list_shape;
+  input_list_element_shape.RemoveDim(0);
+
+  if (is_already_initialized) {
+    TF_RET_CHECK(element_shape == input_list_element_shape);
+    TF_RETURN_IF_ERROR(GetTensorListBuffer(input_list, output_list_buffer));
+    return Status::OK();
+  }
+
+  int64 leading_dim = input_list_shape.dim_size(0);
+  TensorShape output_list_shape = element_shape;
+  output_list_shape.InsertDim(0, leading_dim);
+
+  xla::XlaOp output_list;
+  TF_RETURN_IF_ERROR(
+      InitializeTensorList(input_list, output_list_shape, &output_list));
+  TF_RETURN_IF_ERROR(GetTensorListBuffer(output_list, output_list_buffer));
+  return Status::OK();
+}
+
+class TensorListSetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp tl = ctx->Input(0);
+    TensorShape elem_shape = ctx->InputShape(2);
+
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx, GetInitializedList(tl, elem_shape, dtype_, &buffer));
+    xla::XlaOp push_index;
+    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(tl, &push_index));
+
+    xla::XlaOp index = ctx->Input(1);
+    xla::XlaOp value = ctx->Input(2);
+
+    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+
+    TensorShape slice_shape = elem_shape;
+    slice_shape.InsertDim(0, 1LL);
+    auto update = xla::Reshape(value, slice_shape.dim_sizes());
+
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(ctx, BuildTensorList(xla::DynamicUpdateSlice(buffer, update,
+                                                                start_indices),
+                                        push_index, &output_list));
+    ctx->SetTensorListOutput(0, output_list);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListSetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListSetItem"), TensorListSetItemOp);
+
 class TensorListPushBackOp : public XlaOpKernel {
  public:
   explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -168,11 +390,15 @@ class TensorListPushBackOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp list = ctx->Input(0);
+    xla::XlaOp list_tuple = ctx->Input(0);
     TensorShape elem_shape = ctx->InputShape(1);
 
-    xla::XlaOp ta = xla::GetTupleElement(list, 0);
-    xla::XlaOp index = xla::GetTupleElement(list, 1);
+    xla::XlaOp buffer;
+    OP_REQUIRES_OK(ctx,
+                   GetInitializedList(list_tuple, elem_shape, dtype_, &buffer));
+
+    xla::XlaOp index;
+    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(list_tuple, &index));
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
@@ -184,11 +410,12 @@ class TensorListPushBackOp : public XlaOpKernel {
     slice_shape.InsertDim(0, 1LL);
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
-    ctx->SetOutput(
-        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
-                          index + xla::ConstantR0<int32>(b, 1)}));
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(
+        ctx,
+        BuildTensorList(xla::DynamicUpdateSlice(buffer, update, start_indices),
+                        index + xla::ConstantR0<int32>(b, 1), &output_list));
+    ctx->SetTensorListOutput(0, output_list);
   }
 
  private:
@@ -210,10 +437,12 @@ class TensorListPopBackOp : public XlaOpKernel {
     xla::XlaOp state = ctx->Input(0);
 
     TensorShape shape;
-    OP_REQUIRES_OK(ctx, GetTensorListShape(b, state, &shape));
+    OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(0), &shape));
 
-    xla::XlaOp ta = xla::GetTupleElement(state, 0);
-    xla::XlaOp index = xla::GetTupleElement(state, 1);
+    xla::XlaOp ta;
+    OP_REQUIRES_OK(ctx, GetTensorListBuffer(state, &ta));
+    xla::XlaOp index;
+    OP_REQUIRES_OK(ctx, GetTensorListPushIndex(state, &index));
 
     index = index - xla::ConstantR0<int32>(b, 1);
 
@@ -224,13 +453,13 @@ class TensorListPopBackOp : public XlaOpKernel {
     auto slice_shape = shape.dim_sizes();
     slice_shape[0] = 1LL;
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
     xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
     // Remove the leading '1' dimension.
     std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
 
-    ctx->SetOutput(0, xla::Tuple(b, {ta, index}));
+    xla::XlaOp output_list;
+    OP_REQUIRES_OK(ctx, BuildTensorList(ta, index, &output_list));
+    ctx->SetTensorListOutput(0, output_list);
     ctx->SetOutput(1, xla::Reshape(read, value_shape));
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa6ee2ac35e3584ce5580d06fc02c5fb97f54edd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
@@ -0,0 +1,100 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+bool IsTensorListInput(XlaOpKernelContext* ctx, int index) {
+  return ctx->InputExpression(index).kind() == XlaExpression::Kind::kTensorList;
+}
+
+Status BuildTensorList(const xla::XlaOp& buffer, const xla::XlaOp& push_index,
+                       xla::XlaOp* output_list) {
+  TF_RET_CHECK(buffer.builder());
+  *output_list = xla::Tuple(buffer.builder(), {buffer, push_index});
+  return Status::OK();
+}
+
+Status GetTensorListBuffer(const xla::XlaOp& op, xla::XlaOp* buffer) {
+  TF_RET_CHECK(op.builder());
+  *buffer = xla::GetTupleElement(op, 0);
+  return Status::OK();
+}
+
+Status GetTensorListPushIndex(const xla::XlaOp& op, xla::XlaOp* push_index) {
+  TF_RET_CHECK(op.builder());
+  *push_index = xla::GetTupleElement(op, 1);
+  return Status::OK();
+}
+
+Status GetTensorListBufferShape(const xla::XlaOp& op,
+                                TensorShape* buffer_shape) {
+  TF_RET_CHECK(op.builder());
+  TensorShape shape;
+  TF_ASSIGN_OR_RETURN(const xla::Shape& list_tuple_shape,
+                      op.builder()->GetShape(op));
+  return GetTensorListBufferShape(list_tuple_shape, buffer_shape);
+}
+
+Status GetTensorListBufferShape(const xla::Shape& list_shape,
+                                TensorShape* buffer_shape) {
+  TF_RET_CHECK(list_shape.IsTuple());
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(
+      xla::ShapeUtil::GetTupleElementShape(list_shape, 0), buffer_shape));
+  return Status::OK();
+}
+
+Status IsTensorListInitialized(const xla::XlaOp& op, bool* is_initialized) {
+  TensorShape list_shape;
+  TF_RETURN_IF_ERROR(GetTensorListBufferShape(op, &list_shape));
+  *is_initialized = !(list_shape.dims() == 2 && list_shape.dim_size(1) == 0);
+  return Status::OK();
+}
+
+Status InitializeTensorList(const xla::XlaOp& uninitialized_list,
+                            const TensorShape& buffer_shape,
+                            xla::XlaOp* output_list) {
+  TensorShape input_buffer_shape;
+  TF_RETURN_IF_ERROR(
+      GetTensorListBufferShape(uninitialized_list, &input_buffer_shape));
+  if (input_buffer_shape.dim_size(0) != buffer_shape.dim_size(0)) {
+    return errors::InvalidArgument(
+        "Number of elements in input list does not match buffer size. ",
+        "input list size: ", input_buffer_shape.dim_size(0),
+        "buffer size: ", buffer_shape.dim_size(0));
+  }
+  xla::XlaBuilder* builder = uninitialized_list.builder();
+  xla::XlaOp input_buffer;
+  TF_RETURN_IF_ERROR(GetTensorListBuffer(uninitialized_list, &input_buffer));
+  TF_ASSIGN_OR_RETURN(const xla::Shape& input_buffer_xla_shape,
+                      builder->GetShape(input_buffer));
+  auto new_buffer = xla::Broadcast(
+      xla::ConstantLiteral(builder, xla::LiteralUtil::Zero(
+                                        input_buffer_xla_shape.element_type())),
+      buffer_shape.dim_sizes());
+  xla::XlaOp push_index;
+  TF_RETURN_IF_ERROR(GetTensorListPushIndex(uninitialized_list, &push_index));
+  return BuildTensorList(new_buffer, push_index, output_list);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..937af6f8d77499248b46069822cac291aae0d60b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_
+
+// TensorList utilities.
+//
+// Tensor lists are represented as tuple consisting of a pre-allocated buffer
+// consisting of the tensors (and where dim 0 is the list index), along with a
+// scalar telling us the next index to push a value at.
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+// Whether the input expression at `index` corresponds to a TensorList.
+bool IsTensorListInput(XlaOpKernelContext* ctx, int index);
+
+// Builds a TensorList from its constituents, `buffer` and `push_index`.
+Status BuildTensorList(const xla::XlaOp& buffer, const xla::XlaOp& push_index,
+                       xla::XlaOp* output_list);
+
+// Returns the buffer for the TensorList.
+Status GetTensorListBuffer(const xla::XlaOp& op, xla::XlaOp* buffer);
+
+// Returns the push_index for the TensorList.
+Status GetTensorListPushIndex(const xla::XlaOp& op, xla::XlaOp* push_index);
+
+// Returns the shape of the TensorList buffer.
+Status GetTensorListBufferShape(const xla::XlaOp& op,
+                                TensorShape* buffer_shape);
+
+// Inputs the TensorList shape and returns the buffer shape.
+Status GetTensorListBufferShape(const xla::Shape& list_shape,
+                                TensorShape* buffer_shape);
+
+// Returns whether the TensorList has been initialized.
+//
+// A TensorList is considered initialized if its element_shape is completely
+// known.
+Status IsTensorListInitialized(const xla::XlaOp& op, bool* is_initialized);
+
+// Inputs an uninitialized list and a buffer_shape and returns an initialized
+// list. The initialized list uses the dtype and push index of the uninitialized
+// list and is filled with zeros.
+Status InitializeTensorList(const xla::XlaOp& uninitialized_list,
+                            const TensorShape& buffer_shape,
+                            xla::XlaOp* output_list);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index ee3bdf3394e37c757f31724e73e95417becaa534..22cfd16008899c1ad3c73453bec34a0b0d2e8c78 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 26d4214099d1d07c1b2e275d783654d9cd948e28..247db8d5d172b04e414b1ff0e53f12b533f36944 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
 namespace {
@@ -856,15 +855,12 @@ class ResourceApplyAdadelta : public XlaOpKernel {
     xla::XlaOp grad = ctx->Input(6);
 
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp neg_half = XlaHelpers::FloatLiteral(b, dtype_, -0.5);
-    xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5);
     xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0);
-    xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
 
-    accum = rho * accum + (one - rho) * xla::Pow(grad, two);
-    xla::XlaOp update = xla::Pow(accum_update + epsilon, half) *
-                        xla::Pow(accum + epsilon, neg_half) * grad;
-    accum_update = rho * accum_update + (one - rho) * xla::Pow(update, two);
+    accum = rho * accum + (one - rho) * xla::Square(grad);
+    xla::XlaOp update =
+        xla::Sqrt(accum_update + epsilon) * xla::Rsqrt(accum + epsilon) * grad;
+    accum_update = rho * accum_update + (one - rho) * xla::Square(update);
     var = var - update * lr;
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum));
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index c9b324a243e4cc3ec64daa3ca0d285336a0d0154..65569576d4146dfdd1464251c9b42a7621d3ebc8 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -18,15 +18,15 @@ limitations under the License.
 // handles all transposes, while Eigen needs a restricted DoTranspose
 // helper.
 
-#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -128,29 +128,46 @@ class InvertPermutationOp : public XlaOpKernel {
                 errors::InvalidArgument("permutation of nonnegative int32s "
                                         "must have <= int32 max elements"));
 
-    std::vector<int64> perm;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
-
-    int size = perm.size();
+    auto e = ctx->InputExpression(0);
+    auto tensor_or_status = e.ResolveConstant(ctx->compiler()->client());
+    OP_REQUIRES_OK(ctx, tensor_or_status.status());
+    // If the input is a constant, we also want the output to be a constant.
+    // Some models rely on the result of InvertPermutation being a constant.
+    // TODO(b/32495713): Remove this when we can check whether Scatter is
+    // constant. Right now, we always assume it is non-constant because we don't
+    // check the embedded computation.
+    if (tensor_or_status.ValueOrDie().has_value()) {
+      std::vector<int64> perm;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
+
+      int size = perm.size();
+
+      std::vector<int32> output(size);
+      std::fill_n(output.data(), size, -1);
+      for (int i = 0; i < size; ++i) {
+        const int64 d = perm[i];
+        OP_REQUIRES(ctx, FastBoundsCheck(d, size),
+                    errors::InvalidArgument(d, " is not between 0 and ", size));
+        OP_REQUIRES(ctx, output[d] == -1,
+                    errors::InvalidArgument(d, " is duplicated in the input."));
+        output[d] = i;
+      }
 
-    std::vector<int32> output(size);
-    std::fill_n(output.data(), size, -1);
-    for (int i = 0; i < size; ++i) {
-      const int64 d = perm[i];
-      OP_REQUIRES(ctx, FastBoundsCheck(d, size),
-                  errors::InvalidArgument(d, " is not between 0 and ", size));
-      OP_REQUIRES(ctx, output[d] == -1,
-                  errors::InvalidArgument(d, " is duplicated in the input."));
-      output[d] = i;
+      ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
+    } else {
+      auto indices = ctx->Input(0);
+      int size = ctx->InputShape(0).num_elements();
+      auto iota = xla::Iota(ctx->builder(), xla::S32, size);
+      auto result = XlaScatter(iota, iota, indices,
+                               /*indices_are_vectors=*/false, /*combiner=*/{},
+                               ctx->builder());
+      OP_REQUIRES_OK(ctx, result.status());
+      ctx->SetOutput(0, result.ValueOrDie());
     }
-
-    ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
   }
 };
 
-REGISTER_XLA_OP(Name("InvertPermutation")
-                    .TypeConstraint("T", DT_INT32)
-                    .CompileTimeConstantInput("x"),
+REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32),
                 InvertPermutationOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a0ea6422d732b00fc1b8cf855d9c9ad603b87c82..7c4176eb839f85e6d68565d22e04f982354a7282 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -65,11 +65,8 @@ XLAJIT_MAKE_UNARY(Exp, xla::Exp(x));
 XLAJIT_MAKE_UNARY(Expm1, xla::Expm1(x));
 XLAJIT_MAKE_UNARY(Floor, xla::Floor(x));
 XLAJIT_MAKE_UNARY(IsFinite, xla::IsFinite(x));
-XLAJIT_MAKE_UNARY(
-    IsInf,
-    xla::Eq(xla::Abs(x),
-            xla::ScalarLike(x, std::numeric_limits<double>::infinity())));
-XLAJIT_MAKE_UNARY(IsNan, xla::Ne(x, x));
+XLAJIT_MAKE_UNARY(IsInf, xla::IsInf(x));
+XLAJIT_MAKE_UNARY(IsNan, xla::IsNan(x));
 // Return 1/x
 XLAJIT_MAKE_UNARY(Inv, xla::ScalarLike(x, 1.0) / x);
 XLAJIT_MAKE_UNARY(Reciprocal, xla::ScalarLike(x, 1.0) / x);
@@ -92,8 +89,9 @@ xla::XlaOp Sigmoid(xla::XlaOp x) {
 }
 XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(x));
 
-// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0.
-XLAJIT_MAKE_UNARY(Sign, xla::Sign(x));
+// Returns 0 if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0.
+XLAJIT_MAKE_UNARY(Sign,
+                  xla::Select(xla::Ne(x, x), xla::ZerosLike(x), xla::Sign(x)));
 XLAJIT_MAKE_UNARY(Sinh, xla::Sinh(x));
 
 // softplus(x) = log(1 + exp(x))
@@ -116,82 +114,10 @@ XLAJIT_MAKE_UNARY(Tanh, xla::Tanh(x));
 
 XLAJIT_MAKE_UNARY(Real, xla::Real(x));
 XLAJIT_MAKE_UNARY(Imag, xla::Imag(x));
-
-#undef XLAJIT_MAKE_UNARY
-
-// Erf/Erfc.  For x in (-1, 1), the erf approximation is used; erfc polynomial
-// is used outside of this range.
-class ErfOp : public XlaOpKernel {
- public:
-  explicit ErfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp x = ctx->Input(0);
-    xla::XlaOp one = xla::ScalarLike(x, 1.0);
-    auto y =
-        xla::Select(xla::Gt(xla::Abs(x), one), one - xla::Erfc(x), xla::Erf(x));
-    ctx->SetOutput(0, y);
-  }
-};
-REGISTER_XLA_OP(Name("Erf"), ErfOp);
-
-class ErfcOp : public XlaOpKernel {
- public:
-  explicit ErfcOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp x = ctx->Input(0);
-    xla::XlaOp one = xla::ScalarLike(x, 1.0);
-    auto y =
-        xla::Select(xla::Lt(xla::Abs(x), one), one - xla::Erf(x), xla::Erfc(x));
-    ctx->SetOutput(0, y);
-  }
-};
-REGISTER_XLA_OP(Name("Erfc"), ErfcOp);
-
-class LgammaOp : public XlaOpKernel {
- public:
-  explicit LgammaOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  // Calculate lgamma using the Lanczos approximation
-  // (https://en.wikipedia.org/wiki/Lanczos_approximation).
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp input = ctx->Input(0);
-    xla::PrimitiveType input_type = ctx->input_xla_type(0);
-
-    if (input_type == xla::F16 || input_type == xla::BF16) {
-      // The approximation works better with at least 32-bits of accuracy.
-      xla::XlaOp input_f32 = xla::ConvertElementType(input, xla::F32);
-      xla::XlaOp result_f32 = xla::Lgamma(input_f32);
-      xla::XlaOp result_x16 = xla::ConvertElementType(result_f32, input_type);
-      ctx->SetOutput(0, result_x16);
-    } else {
-      xla::XlaOp result = xla::Lgamma(input);
-      ctx->SetOutput(0, result);
-    }
-  }
-};  // namespace
-REGISTER_XLA_OP(Name("Lgamma"), LgammaOp);
-
-class DigammaOp : public XlaOpKernel {
- public:
-  explicit DigammaOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  // Calculate lgamma using the Lanczos approximation
-  // (https://en.wikipedia.org/wiki/Lanczos_approximation).
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp input = ctx->Input(0);
-    xla::PrimitiveType input_type = ctx->input_xla_type(0);
-
-    if (input_type == xla::F16 || input_type == xla::BF16) {
-      // The approximation works better with at least 32-bits of accuracy.
-      xla::XlaOp input_f32 = xla::ConvertElementType(input, xla::F32);
-      xla::XlaOp result_f32 = xla::Digamma(input_f32);
-      xla::XlaOp result_x16 = xla::ConvertElementType(result_f32, input_type);
-      ctx->SetOutput(0, result_x16);
-    } else {
-      xla::XlaOp result = xla::Digamma(input);
-      ctx->SetOutput(0, result);
-    }
-  }
-};  // namespace
-REGISTER_XLA_OP(Name("Digamma"), DigammaOp);
+XLAJIT_MAKE_UNARY(Erf, xla::Erf(x));
+XLAJIT_MAKE_UNARY(Erfc, xla::Erfc(x));
+XLAJIT_MAKE_UNARY(Lgamma, xla::Lgamma(x));
+XLAJIT_MAKE_UNARY(Digamma, xla::Digamma(x));
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index 8671632976023fded04c26a9780c1a67638b0916..2d95f2f30a86f3a9c95e528858c53ab48d7a02e8 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -24,13 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 941b04363f8386a7bdbe8c91ea34c9754592a52d..885031ca0b8a57731f8020937307bd37624d41f1 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/while_op.h"
 
+#include "absl/strings/str_split.h"
+#include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -25,21 +27,27 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 
+const char kPropagateCompileTimeConsts[] = "_xla_propagate_compile_time_consts";
+
 namespace {
 
 // Builds XlaCompiler argument descriptions `args` from `ctx`.
 Status MakeXlaCompilerArgumentsFromInputs(
     XlaOpKernelContext* ctx, std::vector<XlaCompiler::Argument>* args,
-    bool* has_uninitialized_vars, bool* has_tensor_arrays) {
+    bool* has_uninitialized_vars, bool* has_tensor_arrays,
+    bool* has_uninitialized_tensor_lists) {
   VLOG(2) << "Num inputs " << ctx->num_inputs();
   args->resize(ctx->num_inputs());
   *has_uninitialized_vars = false;
   *has_tensor_arrays = false;
+  *has_uninitialized_tensor_lists = false;
   for (int i = 0; i < ctx->num_inputs(); ++i) {
     VLOG(2) << " Input " << i << " type: " << DataTypeString(ctx->input_type(i))
             << " shape: " << ctx->InputShape(i).DebugString();
@@ -70,18 +78,177 @@ Status MakeXlaCompilerArgumentsFromInputs(
       arg.name = resource->name();
       VLOG(2) << "    resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.ShapeHumanString()
               << " initialized: " << arg.initialized;
 
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
-      arg.type = ctx->input_type(i);
-      arg.shape = ctx->InputShape(i);
+      arg.type = type;
+      TF_ASSIGN_OR_RETURN(arg.shape, ctx->builder()->GetShape(ctx->Input(i)));
+      if (IsTensorListInput(ctx, i)) {
+        // arg.initialized == false means that the element_shape of the list
+        // was not available at the time of building the list so an empty list
+        // was created instead. If so, the body function of While is run once
+        // to infer the shape of the list before actually building the While op.
+        TF_RETURN_IF_ERROR(
+            IsTensorListInitialized(ctx->Input(i), &arg.initialized));
+        if (!arg.initialized) {
+          *has_uninitialized_tensor_lists = true;
+        }
+      }
     }
   }
   return Status::OK();
 }
 
+// Populates loop invariant indices to true in `loop_invariants`.
+void GetLoopInvariants(XlaOpKernelContext* ctx,
+                       const NameAttrList& body_name_attr,
+                       std::vector<bool>* const loop_invariants) {
+  const FunctionBody* body;
+  OP_REQUIRES_OK(ctx, ctx->compiler()->FindFunctionBody(body_name_attr, &body));
+  for (int i = 0; i < body->ret_nodes.size(); i++) {
+    const Node* arg = body->arg_nodes[i];
+    const Node* ret = body->ret_nodes[i];
+    const Node* ret_input_0;
+    OP_REQUIRES_OK(ctx, ret->input_node(0, &ret_input_0));
+    (*loop_invariants)[i] = ret_input_0->id() == arg->id();
+  }
+}
+
+// Converts entries in `args` which are loop invariants and have compile
+// time constant inputs to constants so that they can be propagated in the loop
+// body.
+Status ConvertLoopInvariantsToConst(
+    XlaOpKernelContext* ctx, const NameAttrList& body_name_attr,
+    std::vector<XlaCompiler::Argument>* args,
+    std::vector<bool>* compile_time_const_arg_indices,
+    int* num_compile_time_const_args, xla::Client* client) {
+  std::vector<bool> loop_invariants(ctx->num_inputs());
+  GetLoopInvariants(ctx, body_name_attr, &loop_invariants);
+  for (int i = 0; i < ctx->num_inputs(); i++) {
+    XlaCompiler::Argument& arg = (*args)[i];
+    const XlaExpression& expression = ctx->InputExpression(i);
+    // If this is a loop invariant and the input tensor is a compile time
+    // constant build a kConstant type argument.
+    if (arg.kind != XlaCompiler::Argument::kResource && loop_invariants[i]) {
+      // NOTE: We can not simple check that this is Kind::kConstant because
+      // this could be the output of a MetadataOnly op e.g. Size.
+      xla::StatusOr<absl::optional<Tensor>> maybe_constant =
+          expression.ResolveConstant(client);
+      if (maybe_constant.ok() && maybe_constant.ValueOrDie().has_value()) {
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.type = expression.dtype();
+        arg.constant_value = std::move(maybe_constant.ValueOrDie().value());
+        arg.shape = expression.GetShape().ValueOrDie();
+        compile_time_const_arg_indices->at(i) = true;
+        (*num_compile_time_const_args)++;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status VerifyBodyInputAndOutputShapeMatch(
+    XlaOpKernelContext* ctx,
+    const std::vector<bool>& compile_time_const_arg_indices,
+    const XlaCompiler::CompilationResult& body, bool has_token_input_output) {
+  xla::Shape body_input_shape = body.xla_input_shapes[0];
+  xla::Shape body_output_shape;
+  body_output_shape.set_element_type(xla::TUPLE);
+  for (int i = 0; i < ctx->num_outputs(); i++) {
+    if (!compile_time_const_arg_indices[i]) {
+      *(body_output_shape.add_tuple_shapes()) =
+          body.xla_output_shape.tuple_shapes(i);
+    }
+  }
+  // If `body` has a token output, append its shape to `body_output_shape`.
+  if (has_token_input_output) {
+    *(body_output_shape.add_tuple_shapes()) =
+        body.xla_output_shape.tuple_shapes(ctx->num_inputs());
+  }
+  if (!xla::ShapeUtil::Compatible(body_input_shape, body_output_shape)) {
+    return errors::InvalidArgument(
+        "Input and output shapes of loop body do not match: ",
+        xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
+        xla::ShapeUtil::HumanString(body_output_shape));
+  }
+  return Status::OK();
+}
+
+xla::StatusOr<xla::XlaComputation> BuildWrappedCond(
+    XlaOpKernelContext* ctx, const XlaCompiler::CompilationResult& cond) {
+  xla::Shape cond_input_shape = cond.xla_input_shapes[0];
+  std::unique_ptr<xla::XlaBuilder> cb =
+      ctx->builder()->CreateSubBuilder("cond_wrapper");
+  auto inputs = xla::Parameter(cb.get(), 0, cond_input_shape, "inputs");
+  auto outputs = xla::Call(cb.get(), *cond.computation, {inputs});
+  xla::GetTupleElement(outputs, 0);
+  return cb->Build();
+}
+
+xla::StatusOr<xla::XlaComputation> BuildWrappedBody(
+    XlaOpKernelContext* ctx, const XlaCompiler::CompilationResult& body,
+    const std::vector<bool>& compile_time_const_arg_indices,
+    int num_compile_time_const_args, bool has_token_input_output) {
+  if (num_compile_time_const_args <= 0) {
+    return xla::XlaComputation(body.computation->proto());
+  }
+  xla::XlaComputation body_wrapper;
+  std::unique_ptr<xla::XlaBuilder> cb =
+      ctx->builder()->CreateSubBuilder("body_wrapper");
+  xla::Shape body_input_shape = body.xla_input_shapes[0];
+  auto inputs = xla::Parameter(cb.get(), 0, body_input_shape, "inputs");
+  // Call the original body function which has mismatched inputs and outputs
+  // and strip the compile time consts from the list of outputs. While requires
+  // the inputs and outputs of its body function to match.
+  auto outputs = xla::Call(cb.get(), *body.computation, {inputs});
+  std::vector<xla::XlaOp> non_compile_time_const_outputs;
+  for (int i = 0; i < compile_time_const_arg_indices.size(); i++) {
+    if (!compile_time_const_arg_indices[i]) {
+      non_compile_time_const_outputs.push_back(
+          xla::GetTupleElement(outputs, i));
+    }
+  }
+  // If `body` has a token output, append it to
+  // `non_compile_time_const_outputs`.
+  if (has_token_input_output) {
+    non_compile_time_const_outputs.push_back(
+        xla::GetTupleElement(outputs, ctx->num_outputs()));
+  }
+  xla::Tuple(cb.get(), non_compile_time_const_outputs);
+  return cb->Build();
+}
+
+xla::XlaOp BuildWhile(XlaOpKernelContext* ctx,
+                      const xla::XlaComputation& wrapped_cond,
+                      const xla::XlaComputation& wrapped_body,
+                      const xla::XlaOp& initial_values,
+                      const std::vector<int>& input_mapping,
+                      const std::vector<bool>& compile_time_const_arg_indices,
+                      int num_compile_time_const_args,
+                      bool has_token_input_output) {
+  xla::XlaOp while_result =
+      xla::While(wrapped_cond, wrapped_body, initial_values);
+  std::vector<xla::XlaOp> padded_while_outputs(ctx->num_outputs());
+  int while_result_index = 0;
+  for (int i = 0; i < ctx->num_inputs(); i++) {
+    if (!compile_time_const_arg_indices[i]) {
+      padded_while_outputs[input_mapping[while_result_index]] =
+          xla::GetTupleElement(while_result, while_result_index);
+      while_result_index++;
+    } else {
+      padded_while_outputs[i] = ctx->Input(i);
+    }
+  }
+  // If `body` has a token output, append it to `padded_while_outputs`.
+  if (has_token_input_output) {
+    padded_while_outputs.push_back(xla::GetTupleElement(
+        while_result, ctx->num_inputs() - num_compile_time_const_args));
+  }
+  return xla::Tuple(ctx->builder(), padded_while_outputs);
+}
+
 }  // anonymous namespace
 
 XlaWhileOp::XlaWhileOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -95,6 +262,10 @@ XlaWhileOp::XlaWhileOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
   } else {
     has_token_input_output_ = !token_input_nodes_.empty();
   }
+  if (ctx->HasAttr(kPropagateCompileTimeConsts)) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr(kPropagateCompileTimeConsts,
+                                     &propagate_compile_time_consts_));
+  }
 }
 
 void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
@@ -103,13 +274,33 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   std::vector<XlaCompiler::Argument> arguments;
   bool has_uninitialized_vars;
   bool has_tensor_arrays;
-  OP_REQUIRES_OK(
-      ctx, MakeXlaCompilerArgumentsFromInputs(
-               ctx, &arguments, &has_uninitialized_vars, &has_tensor_arrays));
+  bool has_uninitialized_tensor_lists;
+  OP_REQUIRES_OK(ctx, MakeXlaCompilerArgumentsFromInputs(
+                          ctx, &arguments, &has_uninitialized_vars,
+                          &has_tensor_arrays, &has_uninitialized_tensor_lists));
 
   xla::XlaBuilder* builder = ctx->builder();
   XlaCompiler* compiler = ctx->compiler();
 
+  // Indices of loop vars which satisfy the following conditions:
+  // 1. They are loop invariants.
+  // 2. The op inputs at these indices are compile time constants.
+  //
+  // These compile time consts do not appear as _Args in the cond/body functions
+  // and are replaced by kConstant nodes instead. As as result, the compiled
+  // body function does not have matching input and output shape. We fix this
+  // by rewriting the body computation (see body_wrapper below) to output
+  // just the non compile-time-const values and later pad up the while output
+  // with the const args.
+  std::vector<bool> compile_time_const_arg_indices(ctx->num_inputs());
+  int num_compile_time_const_args = 0;
+  if (propagate_compile_time_consts_) {
+    OP_REQUIRES_OK(ctx, ConvertLoopInvariantsToConst(
+                            ctx, body_name_attr_, &arguments,
+                            &compile_time_const_arg_indices,
+                            &num_compile_time_const_args, compiler->client()));
+  }
+
   VLOG(1) << "Compiling body";
 
   // All resource that are inputs to the loop's body must also be
@@ -145,10 +336,13 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   //    Hence we can use the output shapes and TensorArray gradients of each
   //    resource as the "true" shapes.
   // 2) again with the "correct" resource information determined by (1).
-  if (has_uninitialized_vars || has_tensor_arrays) {
+  if (has_uninitialized_vars || has_tensor_arrays ||
+      has_uninitialized_tensor_lists) {
     VLOG(2) << "Recompiling loop body: has_uninitialized_vars: "
             << has_uninitialized_vars
-            << " has_tensor_arrays: " << has_tensor_arrays;
+            << " has_tensor_arrays: " << has_tensor_arrays
+            << " has_uninitialized_tensor_lists: "
+            << has_uninitialized_tensor_lists;
     // Initializes any uninitialized resource with zero values of the
     // shape determined by the first compilation.
     for (int i = 0; i < body.resource_updates.size(); ++i) {
@@ -185,6 +379,23 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
     }
+
+    // Set the shape of any uninitialized TensorLists to the shape determined by
+    // the first compilation. Note that, unlike resources, we do not initialize
+    // the input list with zeros here, that is done later.
+    xla::Shape body_output_shape = body.xla_output_shape;
+    OP_REQUIRES(ctx, body_output_shape.IsTuple(),
+                errors::FailedPrecondition(
+                    "xla_output_shape of while body must be a tuple."));
+    for (int i = 0; i < arguments.size(); i++) {
+      XlaCompiler::Argument& arg = arguments[i];
+      if (arg.initialized || !IsTensorListInput(ctx, i)) {
+        continue;
+      }
+      arg.shape = body_output_shape.tuple_shapes(i);
+      arg.initialized = true;
+    }
+
     // Recompile the body with the "correct" resource shapes.
     VLOG(1) << "Recompiling body with corrected resource shapes";
     body = {};
@@ -225,12 +436,13 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
                   "Input shapes of loop body and condition do not match: ",
                   xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
                   xla::ShapeUtil::HumanString(cond_input_shape)));
-  OP_REQUIRES(
-      ctx, xla::ShapeUtil::Compatible(body_input_shape, body.xla_output_shape),
-      errors::InvalidArgument(
-          "Input and output shapes of loop body do not match: ",
-          xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
-          xla::ShapeUtil::HumanString(body.xla_output_shape)));
+
+  // Check that the shape of the body outputs excluding the compile time const
+  // args (which are pruned from the body outputs in body_wapper) matches the
+  // shape of the inputs.
+  OP_REQUIRES_OK(ctx, VerifyBodyInputAndOutputShapeMatch(
+                          ctx, compile_time_const_arg_indices, body,
+                          has_token_input_output_));
 
   xla::Shape expected_cond_output_shape_without_side_effect =
       xla::ShapeUtil::MakeTupleShape(
@@ -267,8 +479,28 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
       XlaResource* resource;
       OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource));
       OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], builder));
+    } else if (IsTensorListInput(ctx, input_num)) {
+      // If the list received as input is uninitialized but its shape was
+      // inferred in the first compilation pass we create a new list filled
+      // with zeros and used that as the input to the while op.
+      TensorShape input_list_shape;
+      OP_REQUIRES_OK(ctx, GetTensorListBufferShape(ctx->Input(input_num),
+                                                   &input_list_shape));
+      TensorShape body_arg_shape;
+      OP_REQUIRES_OK(ctx,
+                     GetTensorListBufferShape(body_input_shape.tuple_shapes(i),
+                                              &body_arg_shape));
+      // Shape of the input list may differ from the shape of the body/cond
+      // input if the list's shape was inferred after the first compilation and
+      // the body/cond was recompiled with the updated shape of the list.
+      if (input_list_shape != body_arg_shape) {
+        OP_REQUIRES_OK(ctx, InitializeTensorList(ctx->Input(input_num),
+                                                 body_arg_shape, &inputs[i]));
+      } else {
+        inputs[i] = ctx->Input(input_num);
+      }
     } else {
-      inputs[i] = ctx->Input(i);
+      inputs[i] = ctx->Input(input_num);
     }
   }
 
@@ -277,26 +509,28 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   VLOG(1) << "Building while loop";
 
   // Wraps the condition in a computation that unpacks the output tuple.
-  xla::XlaComputation cond_wrapper;
-  {
-    std::unique_ptr<xla::XlaBuilder> cb =
-        builder->CreateSubBuilder("cond_wrapper");
-    auto inputs = xla::Parameter(cb.get(), 0, cond_input_shape, "inputs");
-    auto outputs = xla::Call(cb.get(), *cond.computation, {inputs});
-    xla::GetTupleElement(outputs, 0);
-    xla::StatusOr<xla::XlaComputation> result = cb->Build();
-    OP_REQUIRES_OK(ctx, result.status());
-    cond_wrapper = std::move(result.ValueOrDie());
-  }
-
-  xla::XlaOp while_result = xla::While(cond_wrapper, *body.computation, init);
+  xla::StatusOr<xla::XlaComputation> cond_result = BuildWrappedCond(ctx, cond);
+  OP_REQUIRES_OK(ctx, cond_result.status());
+  xla::XlaComputation wrapped_cond = std::move(cond_result.ValueOrDie());
+
+  // Remove compile time const args from the list of body outputs.
+  xla::StatusOr<xla::XlaComputation> body_result =
+      BuildWrappedBody(ctx, body, compile_time_const_arg_indices,
+                       num_compile_time_const_args, has_token_input_output_);
+  OP_REQUIRES_OK(ctx, body_result.status());
+  xla::XlaComputation wrapped_body = std::move(body_result.ValueOrDie());
+
+  // Builds the While op and pads its output with the compile time const args.
+  xla::XlaOp while_result =
+      BuildWhile(ctx, wrapped_cond, wrapped_body, init, body.input_mapping,
+                 compile_time_const_arg_indices, num_compile_time_const_args,
+                 has_token_input_output_);
 
   // Sets non-variable outputs and determine when resource variables start.
   int resource_index = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
     if (ctx->input_type(i) != DT_RESOURCE) {
-      ctx->SetOutput(body.input_mapping[i],
-                     xla::GetTupleElement(while_result, i));
+      ctx->SetOutput(i, xla::GetTupleElement(while_result, i));
       ++resource_index;
     } else {
       break;
@@ -341,8 +575,11 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   VLOG(1) << "Done building while loop";
 }
 
-REGISTER_XLA_OP(Name("While").AllowResourceTypes(), XlaWhileOp);
-REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes(), XlaWhileOp);
-REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp);
+REGISTER_XLA_OP(Name("While").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
+REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
+REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h
index aeeff40e68f8b778628b9e85bd9b4ddcb73883a5..16ec8d0e520b5a282318f8e5225bcec65818e3e8 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.h
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.h
@@ -21,6 +21,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+extern const char kPropagateCompileTimeConsts[];
+
 // This TensorFlow op provides a functional iteration primitive.
 //
 // The inputs and outputs of the loop body must agree on the number, types, and
@@ -58,6 +60,10 @@ class XlaWhileOp : public XlaOpKernel {
   NameAttrList body_name_attr_;
   bool has_token_input_output_;
   std::vector<string> token_input_nodes_;
+  // Whether to propagate compile time consts into the loop body.
+  // This is not supported by default now since it may cause HBM memory
+  // overheads.
+  bool propagate_compile_time_consts_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaWhileOp);
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
index b20adc592a0d3d2129c897218ddbfc891b4cd40a..0b5b66ae52f9b9e8251813a1af2fc35eff4a42ea 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
@@ -35,10 +35,9 @@ class XlaConvOp : public XlaOpKernel {
     string precision_config_attr;
     OP_REQUIRES_OK(
         context, context->GetAttr("precision_config", &precision_config_attr));
-    OP_REQUIRES(
-        context,
-        precision_config_.ParsePartialFromString(precision_config_attr),
-        errors::InvalidArgument("Error parsing convolution dimension numbers"));
+    OP_REQUIRES(context,
+                precision_config_.ParsePartialFromString(precision_config_attr),
+                errors::InvalidArgument("Error parsing precison config."));
   }
 
   void Compile(XlaOpKernelContext* context) override {
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..233ac8e7b455403f8ee65b95b1403ecefdb92c6b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+#include "tensorflow/core/lib/core/bits.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaSelfAdjointEigOp : public XlaOpKernel {
+ public:
+  explicit XlaSelfAdjointEigOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result =
+        xla::SelfAdjointEig(ctx->Input(0), lower_, max_iter_, epsilon_);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+
+ private:
+  bool lower_;
+  int32 max_iter_;
+  float epsilon_;
+};
+
+class SelfAdjointEigV2Op : public XlaOpKernel {
+ public:
+  explicit SelfAdjointEigV2Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape("input");
+    int n = input_shape.dim_size(input_shape.dims() - 1);
+    // This is based on heuristics that approx log(n) sweep updates are needed.
+    // Note: the heuristics provides no theoretical guarantee, max_iter=100 and
+    // epsilon should be used to determine exit condition.
+    int max_iter = 2 * tensorflow::Log2Ceiling(n);
+    auto result = xla::SelfAdjointEig(ctx->Input(0), true, max_iter, 1e-6);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaSelfAdjointEig").TypeConstraint("T", kFloatTypes),
+                XlaSelfAdjointEigOp);
+REGISTER_XLA_OP(Name("SelfAdjointEigV2").TypeConstraint("T", kFloatTypes),
+                SelfAdjointEigV2Op);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de627b8d1b32b1af0ef834daab9550a80f1c3fa0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc
@@ -0,0 +1,95 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/lib/svd.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaSvdOp : public XlaOpKernel {
+ public:
+  explicit XlaSvdOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+    string precision_config_attr;
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("precision_config", &precision_config_attr));
+    OP_REQUIRES(ctx,
+                precision_config_.ParsePartialFromString(precision_config_attr),
+                errors::InvalidArgument("Error parsing precison config."));
+    if (precision_config_.operand_precision_size() == 0) {
+      precision_config_.mutable_operand_precision()->Add(
+          xla::PrecisionConfig::HIGHEST);
+    }
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result = xla::SVD(ctx->Input(0), max_iter_, epsilon_,
+                           precision_config_.operand_precision(0));
+    ctx->SetOutput(0, result.d);
+    ctx->SetOutput(1, result.u);
+    ctx->SetOutput(2, result.v);
+  }
+
+ private:
+  int32 max_iter_;
+  float epsilon_;
+  xla::PrecisionConfig precision_config_;
+};
+
+class SvdOp : public XlaOpKernel {
+ public:
+  explicit SvdOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("compute_uv", &compute_uv_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("full_matrices", &full_matrices_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape("input");
+    int m = input_shape.dim_size(input_shape.dims() - 2);
+    int n = input_shape.dim_size(input_shape.dims() - 1);
+    // This is based on heuristics that approx log(n) sweep updates are needed.
+    // Note: the heuristics provides no theoretical guarantee, max_iter=100 and
+    // epsilon should be used to determine exit condition.
+    int max_iter = 2 * tensorflow::Log2Ceiling(std::max(m, n));
+    auto result = xla::SVD(ctx->Input(0), max_iter, 1e-6);
+    ctx->SetOutput(0, result.d);
+    if (compute_uv_) {
+      int p = std::min(m, n);
+      if (!full_matrices_) {
+        if (p < m) {
+          result.u = xla::SliceInMinorDims(result.u, {0, 0}, {m, p});
+        }
+        if (p < n) {
+          result.v = xla::SliceInMinorDims(result.v, {0, 0}, {n, p});
+        }
+      }
+      ctx->SetOutput(1, result.u);
+      ctx->SetOutput(2, result.v);
+    }
+  }
+
+ private:
+  bool compute_uv_;
+  bool full_matrices_;
+};
+
+REGISTER_XLA_OP(Name("XlaSvd").TypeConstraint("T", kFloatTypes), XlaSvdOp);
+REGISTER_XLA_OP(Name("Svd").TypeConstraint("T", kFloatTypes), SvdOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 3d7b0bc959f9dbf3c1b9749379e2ea0d285b302b..f9ce50be6e343144ab0691872381fa6c5aa45f6c 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -82,3 +82,15 @@ cc_library(
         "@com_google_absl//absl/types:span",
     ],
 )
+
+cc_library(
+    name = "data_format",
+    srcs = ["data_format.cc"],
+    hdrs = ["data_format.h"],
+    deps = [
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/lib/data_format.cc b/tensorflow/compiler/tf2xla/lib/data_format.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0253bcdc5f922a970e24782aba67fbd1a907e091
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/data_format.cc
@@ -0,0 +1,87 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/data_format.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+xla::StatusOr<xla::XlaOp> Contract(xla::XlaOp input, int64 dim) {
+  xla::XlaBuilder* builder = input.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
+
+  if (input_shape.dimensions().back() != 4) {
+    return errors::InvalidArgument("Expected last dimension to be 4; got ",
+                                   input_shape.dimensions().back());
+  }
+
+  // Transpose the input so C is directly followed by VECT_C.
+  std::vector<int64> permutation;
+  for (int64 i = 0; i != input_shape.rank() - 1; ++i) {
+    permutation.push_back(i);
+    if (i == dim) {
+      permutation.push_back(input_shape.rank() - 1);
+    }
+  }
+
+  // Now merge the adjacent dimensions with a reshape.
+  std::vector<int64> contracted_shape(input_shape.dimensions().begin(),
+                                      input_shape.dimensions().end() - 1);
+  contracted_shape[dim] *= 4;
+
+  return xla::Reshape(xla::Transpose(input, permutation), contracted_shape);
+}
+
+xla::StatusOr<xla::XlaOp> Expand(xla::XlaOp input, int64 dim) {
+  xla::XlaBuilder* builder = input.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
+
+  if (input_shape.dimensions(dim) % 4 != 0) {
+    return errors::InvalidArgument(
+        "Expected vectorized dimension to be evenly divisible by 4; got ",
+        input_shape.dimensions(dim));
+  }
+
+  // Split the `dim` into two dimensions with a reshape. The size of the new
+  // dimension is always 4.
+  std::vector<int64> expanded_shape(input_shape.dimensions());
+  expanded_shape[dim] /= 4;
+  expanded_shape.insert(expanded_shape.begin() + dim, 4);
+
+  // Move the newly created dimension to the end with a transpose.
+  std::vector<int64> permutation;
+  for (int64 i = 0; i != expanded_shape.size(); ++i) {
+    permutation.push_back(i);
+    if (i == dim) {
+      ++i;
+    }
+  }
+  permutation.push_back(dim + 1);
+
+  return xla::Transpose(xla::Reshape(input, expanded_shape), permutation);
+}
+
+}  // namespace
+
+xla::StatusOr<xla::XlaOp> NCHW_VECT_CToNCHW(xla::XlaOp input) {
+  return Contract(input, 1);
+}
+
+xla::StatusOr<xla::XlaOp> NCHWToNCHW_VECT_C(xla::XlaOp input) {
+  return Expand(input, 1);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/data_format.h b/tensorflow/compiler/tf2xla/lib/data_format.h
new file mode 100644
index 0000000000000000000000000000000000000000..839723b0ea87ffff67b3d79e362759a42b8f88fe
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/data_format.h
@@ -0,0 +1,37 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+// Reformat from NCHW_VECT_C to NCHW.
+//
+// Prerequisites: the last dimension of the input must be of size 4.
+xla::StatusOr<xla::XlaOp> NCHW_VECT_CToNCHW(xla::XlaOp input);
+
+// Reformat from NCHW to NCHW_VECT_C.
+//
+// Prerequisites: the vectorized dimension `C` must be a multiple of 4.
+xla::StatusOr<xla::XlaOp> NCHWToNCHW_VECT_C(xla::XlaOp input);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_DATA_FORMAT_H_
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 7140b6a1227a53290c3747892a55886a7f48513b..cb6e0fbef4b5e0a43514a7e42135d5a1c5cc9f17 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -17,6 +17,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index af641131ed76a8d6a7291c360302fa17c94af014..b82c04bd5a8795314465f67ceb3485648eeaf453 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -56,6 +58,75 @@ lhs_output: the broadcasted LHS tensor
 rhs_output: the broadcasted RHS tensor
 )doc");
 
+REGISTER_OP("XlaSelfAdjointEig")
+    .Input("a: T")
+    .Attr("lower: bool")
+    .Attr("max_iter: int")
+    .Attr("epsilon: float")
+    .Output("w: T")
+    .Output("v: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Computes the eigen decomposition of a batch of self-adjoint matrices
+(Note: Only real inputs are supported).
+
+Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices in
+tensor such that tensor[...,:,:] * v[..., :,i] = e[..., i] * v[...,:,i], for
+i=0...N-1.
+
+a: the input tensor.
+
+lower: a boolean specifies whether the calculation is done with the lower
+  triangular part or the upper triangular part.
+
+max_iter: maximum number of sweep update, i.e., the whole lower triangular
+  part or upper triangular part based on parameter lower. Heuristically, it has
+  been argued that approximatly logN sweeps are needed in practice (Ref: Golub &
+  van Loan "Matrix Computation").
+
+epsilon: the tolerance ratio.
+
+w: The eigenvalues in ascending order, each repeated according to its
+  multiplicity.
+v: The column v[..., :, i] is the normalized eigenvector corresponding to the
+  eigenvalue w[..., i].
+)doc");
+
+REGISTER_OP("XlaSvd")
+    .Input("a: T")
+    .Attr("max_iter: int")
+    .Attr("epsilon: float")
+    .Attr("precision_config: string")
+    .Output("s: T")
+    .Output("u: T")
+    .Output("v: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Computes the eigen decomposition of a batch of self-adjoint matrices
+(Note: Only real inputs are supported).
+
+Computes the eigenvalues and eigenvectors of the innermost M-by-N matrices in
+tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[...,:,:]).
+
+a: the input tensor.
+
+max_iter: maximum number of sweep update, i.e., the whole lower triangular
+  part or upper triangular part based on parameter lower. Heuristically, it has
+  been argued that approximatly log(min (M, N)) sweeps are needed in practice
+  (Ref: Golub & van Loan "Matrix Computation").
+
+epsilon: the tolerance ratio.
+
+precision_config: a serialized xla::PrecisionConfig proto.
+
+s: Singular values. The values are sorted in reverse order of magnitude, so
+  s[..., 0] is the largest value, s[..., 1] is the second largest, etc.
+u: Left singular vectors.
+v: Right singular vectors.
+)doc");
+
 REGISTER_OP("XlaConv")
     .Input("lhs: T")
     .Input("rhs: T")
@@ -437,5 +508,86 @@ transpose_output: Boolean to determine if output is transposed. transpose_output
      is faster when input is large and rank of input is higher than 1.
 )doc");
 
+REGISTER_OP("XlaEinsum")
+    .Input("a: T")
+    .Input("b: T")
+    .Output("product: T")
+    .Attr("equation: string")
+    .Attr("T: {bfloat16, float}")
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      shape_inference::ShapeHandle input_a = context->input(0);
+      shape_inference::ShapeHandle input_b = context->input(1);
+
+      int64 rank_a, rank_b;
+      if (context->RankKnown(input_a)) {
+        rank_a = context->Rank(input_a);
+      } else {
+        return errors::InvalidArgument("input 0's rank is unknown.");
+      }
+      if (context->RankKnown(input_b)) {
+        rank_b = context->Rank(input_b);
+      } else {
+        return errors::InvalidArgument("input 1's rank is unknown.");
+      }
+      string equation;
+      TF_RETURN_IF_ERROR(context->GetAttr("equation", &equation));
+
+      std::map<char, shape_inference::DimensionHandle> left_map;
+      std::map<char, shape_inference::DimensionHandle> right_map;
+      std::vector<shape_inference::DimensionHandle> dims;
+
+      std::vector<string> equation_split = absl::StrSplit(equation, "->");
+
+      if (equation_split.size() != 2) {
+        return errors::InvalidArgument("Expected one \"->\" in equation. Got: ",
+                                       equation);
+      }
+
+      std::vector<string> lhs_rhs_split =
+          absl::StrSplit(equation_split[0], ',');
+      if (lhs_rhs_split.size() != 2) {
+        return errors::InvalidArgument("Expected one \",\" in equation. Got: ",
+                                       equation);
+      }
+
+      if (rank_a != lhs_rhs_split[0].size()) {
+        return errors::InvalidArgument(absl::StrCat(
+            "Expected equation[0] with size: ", rank_a, " Got '",
+            lhs_rhs_split[0], "'", " with size: ", lhs_rhs_split[0].size()));
+      }
+
+      if (rank_b != lhs_rhs_split[1].size()) {
+        return errors::InvalidArgument(absl::StrCat(
+            "Expected equation[1] with size: ", rank_b, " Got '",
+            lhs_rhs_split[1], "'", " with size: ", lhs_rhs_split[1].size()));
+      }
+
+      for (const char& c : lhs_rhs_split[0]) {
+        left_map[c] = context->Dim(input_a, left_map.size());
+      }
+      for (const char& c : lhs_rhs_split[1]) {
+        right_map[c] = context->Dim(input_b, right_map.size());
+      }
+
+      for (const char& c : equation_split[1]) {
+        if (left_map.count(c)) {
+          dims.push_back(left_map[c]);
+        } else if (right_map.count(c)) {
+          dims.push_back(right_map[c]);
+        } else {
+          return errors::InvalidArgument("Invalid equation: ", equation);
+        }
+      }
+
+      context->set_output(0, context->MakeShape(dims));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+An op which supports basic einsum op with 2 inputs and 1 output.
+
+This op has better TPU performnce since it doesn't have explicitly reshape and
+transpose operations as tf.einsum does.
+)doc");
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 345193c936a885e5a9e468979c4b73b5b0c9e5c2..8732ee04d7577f83ff427548e271a90fa827c5f4 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -291,8 +291,40 @@ def dot_general(lhs, rhs, dimension_numbers, precision_config=None, name=None):
       name=name)
 
 
+def self_adjoint_eig(a, lower, max_iter, epsilon):
+  return gen_xla_ops.xla_self_adjoint_eig(a, lower, max_iter, epsilon)
+
+
+def svd(a, max_iter, epsilon, precision_config=None):
+  precision_config_proto = ""
+  if precision_config:
+    precision_config_proto = precision_config.SerializeToString()
+  return gen_xla_ops.xla_svd(a, max_iter, epsilon, precision_config_proto)
+
+
 dynamic_slice = gen_xla_ops.xla_dynamic_slice
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
+einsum = gen_xla_ops.xla_einsum
+
+
+@ops.RegisterGradient('XlaEinsum')
+def _einsum_grad(op, grad):
+  equation = op.get_attr('equation')
+  inputs, output = equation.split('->')
+  left, right = inputs.split(',')
+
+  return [
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[1],
+          equation='{},{}->{}'.format(output, right, left),
+          name=None),
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[0],
+          equation='{},{}->{}'.format(output, left, right),
+          name=None)
+  ]
 
 # TODO(phawkins): generalize tf.pad to support interior padding, and then remove
 # the XLA-specific pad operator.
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index c20d6a5fd1f3bd7dad30cb3359d13ed4609a2250..29ebf46e4bf72c650d3768acd997de1fe3c1322c 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -82,6 +82,9 @@ CreateResourceOpInfoMap() {
   add("ResourceScatterSub"                   , kReadWrite, kVariable);
   add("ResourceScatterUpdate"                , kReadWrite, kVariable);
   add("ResourceStridedSliceAssign"           , kReadWrite, kVariable);
+  add("StatefulStandardNormalV2"             , kReadWrite, kVariable);
+  add("StatefulUniformFullInt"               , kReadWrite, kVariable);
+  add("StatefulUniformInt"                   , kReadWrite, kVariable);
   add("VarIsInitializedOp"                   , kRead,      kVariable);
   add("VariableShape"                        , kRead,      kVariable);
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 9fac16a9700419b189bf5393c2b8bd7d76c6c1cc..fb44ae0391a3e02b0584436d6e95dcac3777b320 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -24,7 +24,6 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
@@ -45,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -164,12 +164,10 @@ Status RewriteAndPruneGraph(
   std::unordered_set<const Node*> retval_nodes;
   TF_RETURN_IF_ERROR(
       AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes));
-  VLOG(2) << "Post rewrite: "
-          << dump_graph::DumpGraphToFile("tf2xla_post_rewrite", *graph);
+  VLOG(2) << "Post rewrite: " << DumpGraphToFile("tf2xla_post_rewrite", *graph);
   PruneForReverseReachability(graph, retval_nodes);
   FixupSourceAndSinkEdges(graph);
-  VLOG(2) << "Post prune: "
-          << dump_graph::DumpGraphToFile("tfcompile_post_prune", *graph);
+  VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph);
   // Sanity-check, to make sure the feeds and fetches still exist post-pruning.
   std::set<string> missing_feeds, missing_fetches;
   for (const tf2xla::Feed& feed : config.feed()) {
@@ -243,7 +241,9 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
+    arg.shape = shape;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
@@ -252,7 +252,8 @@ Status CreateXlaArgs(const Graph& graph,
 
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
+Status ConvertGraphToXla(std::unique_ptr<Graph> graph,
+                         const tf2xla::Config& config, xla::Client* client,
                          xla::XlaComputation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
@@ -262,6 +263,29 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   std::vector<XlaCompiler::Argument> xla_args;
   TF_RETURN_IF_ERROR(CreateXlaArgs(*graph, &xla_args));
 
+  std::vector<xla::XlaBuilder::InputOutputAlias> xla_aliases;
+  // Populate arguments with resource variables from the config. The variables
+  // get turned into inputs and outputs.
+  int64 input_num = xla_args.size();
+  int64 output_num = config.fetch_size();
+  for (const tf2xla::Variable& variable : config.variable()) {
+    XlaCompiler::Argument arg;
+    arg.type = variable.type();
+    arg.kind = XlaCompiler::Argument::kResource;
+    arg.shape = variable.shape();
+    arg.name = variable.node_name();
+    arg.resource_kind = XlaResource::kVariable;
+    arg.initialized = true;
+    xla_args.push_back(std::move(arg));
+
+    // We want to alias the input and output of the variable, so the updates are
+    // carried out in-place.
+    xla_aliases.push_back({/*output_index=*/{output_num},
+                           /*param_number=*/input_num, /*param_index=*/{}});
+    ++input_num;
+    ++output_num;
+  }
+
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
@@ -274,7 +298,7 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   XlaCompiler::CompilationResult result;
   TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
                                            "tfcompile", std::move(graph),
-                                           xla_args, &result));
+                                           xla_args, xla_aliases, &result));
   *computation = std::move(*result.computation);
 
   int num_const_results = 0;
@@ -359,7 +383,8 @@ Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             xla::XlaComputation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphToXla(std::move(graph), config, client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.proto b/tensorflow/compiler/tf2xla/tf2xla.proto
index 18c9089f5fa0e9792a4763d9bfac4c4e826eb5b2..5627af7452b99da594c1c214d0b556d8d70544d5 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.proto
+++ b/tensorflow/compiler/tf2xla/tf2xla.proto
@@ -39,6 +39,15 @@ message Fetch {
   string name = 2;  // Optional name for generated code.
 };
 
+// Variable represents a resource variable with the given name, shape and type.
+message Variable {
+  string node_name = 1;
+  string name =
+      2;  // Optional name for generated code. If empty, node_name will be used.
+  TensorShapeProto shape = 3;
+  DataType type = 4;
+}
+
 // Config represents configuration information for tf2xla conversion.
 message Config {
   // Each feed is a positional input argument for the generated computation.
@@ -47,4 +56,6 @@ message Config {
   // Each fetch is a positional output argument for the generated computation.
   // The order of each entry matches the order of each output argument.
   repeated Fetch fetch = 2;
+  // Each variable is a named input and output of the generated computation.
+  repeated Variable variable = 3;
 };
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 18d87727c500619bf386be7d8c7085724f44aba3..e5e4bf8bcf43e80d0579672bb172643654e38343 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
+#include <functional>
 #include <queue>
 #include <random>
 #include <set>
@@ -113,7 +114,7 @@ Status ReplaceArgUsageWithConstNode(
   // Collect all _Arg nodes.
   std::unordered_map<int, Node*> arg_nodes;
   for (Node* n : g->op_nodes()) {
-    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       int index;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       arg_nodes[index] = n;
@@ -122,7 +123,12 @@ Status ReplaceArgUsageWithConstNode(
 
   for (const auto& iter : const_input_index_to_node) {
     int arg_index = iter.first;
-    Node* const_node = g->CopyNode(iter.second);
+    NodeDef const_def = iter.second->def();
+    const_def.set_name(g->NewName(const_def.name()));
+    Status s;
+    Node* const_node = g->AddNode(const_def, &s);
+    TF_RETURN_IF_ERROR(s);
+
     Node* arg_node = arg_nodes[arg_index];
 
     // Collect all usages of the _Arg node.
@@ -265,6 +271,13 @@ Status PropagateConstIntoWhileNode(Graph* g, Node* while_node,
     }
 
     // Check if i-th retval's input comes from i-th arg directly.
+    // For resource variable input of While nodes, TF2XLA convention is to place
+    // them at the end of all inputs (after all data inputs), and *not* return
+    // them. So number of While node inputs might be larger than number of its
+    // outputs.
+    if (i >= body_func->signature().output_arg_size()) {
+      continue;
+    }
     const OpDef_ArgDef& output_arg = body_func->signature().output_arg(i);
     auto output_arg_input = body_func->ret().find(output_arg.name());
     if (output_arg_input == body_func->ret().end()) {
@@ -543,7 +556,9 @@ uint32 GetXLARandomSeed() {
   // after an overflow. When seeded with zero, some XLA backends
   // can return all zeros instead of random numbers.
   static std::atomic<uint32> counter(InitialRandomSeed());
-  return counter.fetch_add(2);
+  uint32 seed = counter.fetch_add(2);
+  std::srand(seed);
+  return std::rand() | 1;
 }
 
 // TODO(b/77601805): add tests for associated function related stuff.
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 202e929315cacd4d6cdfc69d50639d8a427ec6c2..28b4744470e7d28863b5f7275f829b9bd59641e1 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -21,11 +21,13 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/data_flow_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -329,5 +331,90 @@ TEST(CachedFunctionHandles, Basic) {
   TF_EXPECT_OK(cached_function_handles.ReleaseAllHandles());
 }
 
+TEST(PropagateConstIntoFunctionalNodes, WhileLoopWithResourceInput) {
+  FunctionLibraryDefinition fld(OpRegistry::Global(), {});
+  {
+    // Cond graph & body graph.
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto pred = ops::_Arg(scope.WithOpName("pred"), DT_BOOL, 0);
+    auto input = ops::_Arg(scope.WithOpName("input"), DT_RESOURCE, 1);
+    auto ret = ops::_Retval(scope.WithOpName("ret"), pred, 0);
+    Graph graph(OpRegistry::Global());
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+    FunctionDef cond_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "cond", &cond_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(cond_fdef));
+    FunctionDef body_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "body", &body_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(body_fdef));
+  }
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto pred = ops::Const(scope.WithOpName("pred"), false, TensorShape({}));
+  auto input = ops::Const(scope.WithOpName("input"), 0, TensorShape({}));
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op =
+      ops::While(scope.WithOpName("while"),
+                 std::initializer_list<Input>{pred, input}, cond_fn, body_fn);
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(scope.ToGraph(&graph));
+
+  TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
+}
+
+TEST(PropagateConstIntoFunctionalNodes, CopiedConstNodeHasUniqueName) {
+  FunctionLibraryDefinition fld(OpRegistry::Global(), {});
+  {
+    // Cond graph & body graph.
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto pred = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0);
+    auto input = ops::_Arg(scope.WithOpName("arg1"), DT_BOOL, 1);
+    auto duplicate_name = ops::NoOp(scope.WithOpName("duplicate_name"));
+    auto ret = ops::_Retval(scope.WithOpName("ret"), pred, 0);
+    Graph graph(OpRegistry::Global());
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+    FunctionDef cond_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "cond", &cond_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(cond_fdef));
+    FunctionDef body_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "body", &body_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(body_fdef));
+  }
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto pred =
+      ops::Const(scope.WithOpName("duplicate_name"), false, TensorShape({}));
+  auto input = ops::Const(scope.WithOpName("input"), false, TensorShape({}));
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op =
+      ops::While(scope.WithOpName("while"),
+                 std::initializer_list<Input>{pred, input}, cond_fn, body_fn);
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(scope.ToGraph(&graph));
+
+  TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
+
+  // Check that in rewritten body function, the NoOp node still has name
+  // "duplicate_name", and the copied Const node has name "duplicate_name/_0".
+  auto node_name_index = graph.BuildNodeNameIndex();
+  Node* while_node = node_name_index["while"];
+  ASSERT_NE(while_node, nullptr);
+  TF_ASSERT_OK(GetNodeAttr(while_node->def(), "body", &body_fn));
+  const FunctionDef* rewritten_body_fn = fld.Find(body_fn.name());
+  ASSERT_NE(rewritten_body_fn, nullptr);
+  std::unordered_map<string, NodeDef> nodes;
+  for (const NodeDef& node_def : rewritten_body_fn->node_def()) {
+    nodes[node_def.name()] = node_def;
+  }
+  auto noop_def = nodes.find("duplicate_name");
+  ASSERT_NE(noop_def, nodes.end());
+  EXPECT_EQ(noop_def->second.op(), "NoOp");
+  auto const_def = nodes.find("duplicate_name/_0");
+  ASSERT_NE(const_def, nodes.end());
+  EXPECT_EQ(const_def->second.op(), "Const");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index ddb284966eeb97cc7c9d3ed77fb313e567975e59..f98d07d196ea8551f1a5b53fa2e88e7bc43639de 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -42,7 +42,7 @@ class XlaCompilationAllocator : public Allocator {
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     // Regardless of the size requested, always allocates an XlaExpression.
-    // Respects the aligment request because there is alignment checking even
+    // Respects the alignment request because there is alignment checking even
     // for Tensors whose data is never accessed.
     void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
@@ -60,8 +60,6 @@ class XlaCompilationAllocator : public Allocator {
   // buffers, so they get ids to track.
   bool ShouldAllocateEmptyTensors() override { return true; }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   // Don't run any constructors or destructors for complex objects,
   // since there is no backing store for the tensor to run them
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 15fd2656862e43532e33066414c5eac73593a5c0..86a25177d271c49d185de4f37a892297ba8b22c9 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/graph_compiler.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
@@ -31,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -42,9 +42,12 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace {
@@ -57,7 +60,11 @@ Status CheckSignature(const DataTypeVector& types,
                             " elements while function has ", types.size());
   }
   for (int i = 0; i < types.size(); ++i) {
-    if (types[i] != args[i].type && types[i] != DT_RESOURCE) {
+    // Don't perform type checks on resource variables and tensor
+    // lists (DT_VARIANT) as we have to trick the type system in order to
+    // plumb them through. DT_VARIANTS are wrapped in a DT_UINT8 tensor.
+    if (types[i] != args[i].type && types[i] != DT_RESOURCE &&
+        types[i] != DT_VARIANT) {
       return errors::Internal(
           "Argument ", i, " has declared type ", DataTypeString(args[i].type),
           " but function parameter has type ", DataTypeString(types[i]));
@@ -85,14 +92,14 @@ ComputeArgAndRetvalCores(const Graph& graph) {
   std::map<int, int> arg_cores;
   std::map<int, int> retval_cores;
   for (const Node* n : graph.nodes()) {
-    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
       if (core < 0) continue;
       int index;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       TF_RET_CHECK(index >= 0) << "Negative _Arg index";
       arg_cores[index] = core;
-    } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) {
+    } else if (n->IsRetval()) {
       TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
       if (core < 0) continue;
       int index;
@@ -178,9 +185,10 @@ Status BuildComputation(
   std::vector<xla::XlaOp> elems;
   elems.reserve(retvals.size());
 
-  // Keeps track of which retvals have layout to update. The first element is
-  // the output index, second element is the new layout.
-  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
+  // Keeps track of the layout of each retval. If a retval is not in this list,
+  // a descending layout is used. The first element is the output index, second
+  // element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_index_and_layout;
   for (int i = 0; i < retvals.size(); ++i) {
     XlaCompiler::OutputDescription& output = (*outputs)[i];
     const XlaExpression& retval = retvals[i];
@@ -192,6 +200,8 @@ Status BuildComputation(
         output.shape = output.constant_value.shape();
         break;
 
+      case XlaExpression::Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case XlaExpression::Kind::kXlaOp: {
         output.is_constant = false;
         TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape());
@@ -207,7 +217,7 @@ Status BuildComputation(
           TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
                                                     output.shape, output.type));
           value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
-          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
+          retval_index_and_layout.emplace_back(elems.size(), shape.layout());
         } else if (it != retval_cores.end()) {
           // Apply the sharding to the output, if there is a core assignment.
           value = identity_op(value);
@@ -280,6 +290,11 @@ Status BuildComputation(
       // Ensures the correct sharding is applied to the output.
       handle = identity_op(handle);
 
+      // Set layout of the retval to device representation layout.
+      if (resource->representation_shape().has_value()) {
+        retval_index_and_layout.emplace_back(
+            elems.size(), resource->representation_shape()->layout());
+      }
       elems.push_back(handle);
     }
   }
@@ -309,15 +324,15 @@ Status BuildComputation(
                       computation->GetProgramShape());
   *output_shape = program_shape.result();
   // Update the output layout to the layout of retval.
-  for (auto& update : retval_to_update_layout) {
+  for (auto& index_and_layout : retval_index_and_layout) {
     if (!always_return_tuple && elems.size() == 1) {
-      *output_shape->mutable_layout() = update.second;
+      *output_shape->mutable_layout() = index_and_layout.second;
       continue;
     }
 
-    xla::Shape* output_sub_shape =
-        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
-    *output_sub_shape->mutable_layout() = update.second;
+    xla::Shape* output_sub_shape = xla::ShapeUtil::GetMutableSubshape(
+        output_shape, {index_and_layout.first});
+    *output_sub_shape->mutable_layout() = index_and_layout.second;
   }
   return Status::OK();
 }
@@ -333,8 +348,21 @@ bool XlaCompiler::Argument::operator==(
                other.tensor_array_gradients)) {
     return false;
   }
-  if (shape != other.shape) {
-    return false;
+  if (absl::holds_alternative<xla::Shape>(shape)) {
+    if (!absl::holds_alternative<xla::Shape>(other.shape)) {
+      return false;
+    }
+    if (!xla::Shape::Equal()(absl::get<xla::Shape>(shape),
+                             absl::get<xla::Shape>(other.shape))) {
+      return false;
+    }
+  } else {
+    if (!absl::holds_alternative<TensorShape>(other.shape)) {
+      return false;
+    }
+    if (absl::get<TensorShape>(shape) != absl::get<TensorShape>(other.shape)) {
+      return false;
+    }
   }
   if (constant_value.shape() != other.constant_value.shape()) {
     return false;
@@ -348,7 +376,7 @@ string XlaCompiler::Argument::HumanString() const {
     common = absl::StrCat(" name=", name);
   }
   absl::StrAppend(&common, " type=", DataTypeString(type),
-                  " shape=", shape.DebugString());
+                  " shape=", ShapeHumanString());
   switch (kind) {
     case kInvalid:
       return "invalid";
@@ -375,6 +403,23 @@ string XlaCompiler::Argument::HumanString() const {
   }
 }
 
+std::vector<int64> XlaCompiler::Argument::DimensionSizes() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return xla::InlinedVectorToVector(
+        absl::get<TensorShape>(shape).dim_sizes());
+  } else {
+    return absl::get<xla::Shape>(shape).dimensions();
+  }
+}
+
+string XlaCompiler::Argument::ShapeHumanString() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return absl::get<TensorShape>(shape).DebugString();
+  } else {
+    return absl::get<xla::Shape>(shape).DebugString();
+  }
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
@@ -536,29 +581,27 @@ Status XlaCompiler::CompileFunction(
   // lowest-numbered core that consumes the argument. We choose the
   // lowest-numbered core so the assignment is deterministic.
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) ==
-        FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true));
     }
   }
   // Do _Retval as a second loop, in case the retval's input is an _Arg (which
   // may have gotten a device assignment from the first loop).
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) ==
-        FunctionLibraryDefinition::kRetOp) {
+    if (n->IsRetval()) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false));
     }
   }
 
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "XlaCompiler::CompileFunction: "
-            << dump_graph::DumpGraphToFile(
+            << DumpGraphToFile(
                    absl::StrCat("xla_compile_function_", function_id), *graph);
   }
 
   VLOG(1) << "====================================================";
   TF_RETURN_IF_ERROR(
-      CompileGraph(options, function_id, std::move(graph), args, result));
+      CompileGraph(options, function_id, std::move(graph), args, {}, result));
   VLOG(1) << "====================================================";
 
   cache_[{function_id, arg_vector}] = *result;
@@ -574,11 +617,22 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
       LOG(FATAL) << "Unreachable case";
     case XlaCompiler::Argument::kParameter: {
       if (is_entry_computation) {
-        TF_ASSIGN_OR_RETURN(
-            *xla_shape, options_.shape_representation_fn(arg.shape, arg.type));
+        TensorShape shape;
+        if (absl::holds_alternative<TensorShape>(arg.shape)) {
+          shape = absl::get<TensorShape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(
+              XLAShapeToTensorShape(absl::get<xla::Shape>(arg.shape), &shape));
+        }
+        TF_ASSIGN_OR_RETURN(*xla_shape,
+                            options_.shape_representation_fn(shape, arg.type));
       } else {
-        TF_RETURN_IF_ERROR(
-            TensorShapeToXLAShape(arg.type, arg.shape, xla_shape));
+        if (absl::holds_alternative<xla::Shape>(arg.shape)) {
+          *xla_shape = absl::get<xla::Shape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(TensorShapeToXLAShape(
+              arg.type, absl::get<TensorShape>(arg.shape), xla_shape));
+        }
       }
       return Status::OK();
     }
@@ -587,8 +641,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
-          TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn(
-                                              arg.shape, arg.type));
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
+          TF_ASSIGN_OR_RETURN(*xla_shape,
+                              options_.shape_representation_fn(
+                                  absl::get<TensorShape>(arg.shape), arg.type));
 
           return Status::OK();
         }
@@ -597,9 +653,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
 
           if (!arg.tensor_array_gradients.empty()) {
@@ -614,9 +671,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           xla::Shape buffer_shape;
           TF_RETURN_IF_ERROR(
               TensorShapeToXLAShape(arg.type, shape, &buffer_shape));
@@ -646,14 +704,15 @@ Status XlaCompiler::BuildArguments(
     bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context,
     const std::map<int, int>& arg_cores,
     std::vector<XlaExpression>* arg_expressions,
-    std::vector<int>* input_mapping, std::vector<xla::Shape>* input_shapes,
+    std::vector<int>* input_to_args, std::vector<xla::Shape>* input_shapes,
     bool is_entry_computation) {
   arg_expressions->resize(args.size());
 
   // Argument numbers of arguments and resources that are to be passed to the
-  // XLA computation as runtime parameters.
-  input_mapping->clear();
-  input_mapping->reserve(args.size());
+  // XLA computation as runtime parameters. `input_to_args[a] = b` means that
+  // the a'th XLA input corresponds to the b'th original arg indexes.
+  input_to_args->clear();
+  input_to_args->reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
   for (std::vector<XlaCompiler::Argument>::size_type i = 0; i < args.size();
@@ -663,24 +722,25 @@ Status XlaCompiler::BuildArguments(
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.resource_kind != XlaResource::kInvalid);
+        TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
         XlaResource* resource =
             context->AddResource(absl::make_unique<XlaResource>(
-                arg.resource_kind, i, arg.name, arg.type, arg.shape,
-                xla::XlaOp(),
+                arg.resource_kind, i, arg.name, arg.type,
+                absl::get<TensorShape>(arg.shape), xla::XlaOp(),
                 /*max_array_size=*/arg.max_array_size,
                 /*tensor_array_gradients=*/arg.tensor_array_gradients,
                 /*tensor_array_multiple_writes_aggregate=*/true));
         arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
-          input_mapping->push_back(i);
+          input_to_args->push_back(i);
         }
         break;
       }
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kToken: {
-        input_mapping->push_back(i);
+        input_to_args->push_back(i);
         break;
       }
       case XlaCompiler::Argument::kConstant:
@@ -692,15 +752,23 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
-  if (input_mapping->empty()) {
+  if (input_to_args->empty()) {
     return Status::OK();
   }
 
-  std::vector<xla::Shape> arg_shapes(input_mapping->size());
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+  // `arg_to_inputs[c] = d` means that the c'th original arg index corresponds
+  // to the d'th XLA input. Note that the value -1 corresponds to constants, or
+  // other args that don't correspond to an input.
+  std::vector<int> arg_to_inputs(args.size(), -1);
+  for (int i = 0; i < input_to_args->size(); i++) {
+    arg_to_inputs[input_to_args->at(i)] = i;
+  }
+
+  std::vector<xla::Shape> arg_shapes(input_to_args->size());
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
     // Computes the shapes of non-constant arguments.
     TF_RETURN_IF_ERROR(XLAShapeForArgument(
-        args[(*input_mapping)[i]], is_entry_computation, &arg_shapes[i]));
+        args[(*input_to_args)[i]], is_entry_computation, &arg_shapes[i]));
   }
 
   if (use_tuple_arg) {
@@ -717,13 +785,13 @@ Status XlaCompiler::BuildArguments(
   builder->SetOpMetadata(arg_metadata);
 
   // Build parameter handles for non-constant arguments.
-  std::vector<xla::XlaOp> arg_handles(input_mapping->size());
+  std::vector<xla::XlaOp> arg_handles(input_to_args->size());
   if (use_tuple_arg) {
     xla::XlaOp tuple;
     if (is_entry_computation) {
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
-      for (int64 parameter : *input_mapping) {
+      for (int64 parameter : *input_to_args) {
         auto it = arg_cores.find(parameter);
         const int core = it == arg_cores.end() ? 0 : it->second;
         *tuple_sharding.add_tuple_shardings() =
@@ -735,7 +803,19 @@ Status XlaCompiler::BuildArguments(
     } else {
       tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple");
     }
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/0, {dynamic_size_param_index},
+            /*target_param_num=*/0, /*target_param_index=*/{i},
+            dim_and_arg_num.first));
+      }
+    }
+
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -744,7 +824,7 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::GetTupleElement(tuple, i);
     }
   } else {
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -753,6 +833,17 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::Parameter(builder, i, (*input_shapes)[i],
                                       absl::StrCat("arg", i));
     }
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/dynamic_size_param_index, {},
+            /*target_param_num=*/i, /*target_param_index=*/{},
+            dim_and_arg_num.first));
+      }
+    }
   }
 
   builder->ClearOpMetadata();
@@ -760,12 +851,12 @@ Status XlaCompiler::BuildArguments(
   // Fill in the handles in non-constant arguments, and reshape parameters
   // back to their correct shapes.
   VLOG(2) << "XLA computation inputs:";
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-    const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
+    const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
     VLOG(2) << "  XLA arg " << i
             << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i])
-            << " name: " << arg.name << " TF arg " << input_mapping->at(i);
-    XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)];
+            << " name: " << arg.name << " TF arg " << input_to_args->at(i);
+    XlaExpression& arg_expression = (*arg_expressions)[input_to_args->at(i)];
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.initialized);
@@ -782,7 +873,7 @@ Status XlaCompiler::BuildArguments(
         // return values of functions, and then reshape unconditionally.
         if (is_entry_computation) {
           arg_expression = XlaExpression::XlaOp(
-              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type);
+              xla::Reshape(arg_handles[i], arg.DimensionSizes()), arg.type);
         } else {
           arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         }
@@ -843,7 +934,8 @@ Status XlaCompiler::CompileSingleOp(
   }
   FixupSourceAndSinkEdges(graph.get());
 
-  return CompileGraph(options, node_def.name(), std::move(graph), args, result);
+  return CompileGraph(options, node_def.name(), std::move(graph), args, {},
+                      result);
 }
 
 namespace {
@@ -862,6 +954,28 @@ Status ValidateFunctionDef(const FunctionDef* fdef,
   return Status::OK();
 }
 
+// If node is PartitionedCall or StatefulPartitionedCall, returns the
+// name from the "f" attr, else returns node.def().op().
+// Returned pointer points to the internal string either in node's attributes
+// or in its NodeDef. This pointer is valid as long as the node has not been
+// modified.
+Status GetPotentialFunctionName(const Node& node, const string** name) {
+  if (node.IsPartitionedCall()) {
+    const AttrValue* attr_value;
+    TF_RETURN_IF_ERROR(
+        node.attrs().Find(FunctionLibraryDefinition::kFuncAttr, &attr_value));
+    if (!attr_value->has_func()) {
+      return errors::InvalidArgument(
+          "The attribute value for attribute 'f' in node ", node.DebugString(),
+          " does not have 'func' field set");
+    }
+    *name = &attr_value->func().name();
+    return Status::OK();
+  }
+  *name = &node.type_string();
+  return Status::OK();
+}
+
 // Check that the graph doesn't have any invalid nodes (e.g. incompatible with
 // given device_type, invalid data type, missing attributes...)
 Status ValidateGraph(const Graph* graph,
@@ -881,7 +995,9 @@ Status ValidateGraph(const Graph* graph,
     if (node->type_string() == FunctionLibraryDefinition::kGradientOp) {
       continue;
     }
-    const FunctionDef* fdef = flib_def.Find(node->def().op());
+    const string* function_name;
+    TF_RETURN_IF_ERROR(GetPotentialFunctionName(*node, &function_name));
+    const FunctionDef* fdef = flib_def.Find(*function_name);
     Status s;
     if (fdef) {
       s = ValidateFunctionDef(fdef, flib_def);
@@ -926,20 +1042,19 @@ void ConvertConstantsToExpressions(xla::XlaBuilder* builder,
 
 }  // namespace
 
-Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
-                                 string const& name,
-                                 std::unique_ptr<Graph> graph,
-                                 absl::Span<const XlaCompiler::Argument> args,
-                                 CompilationResult* result) {
+Status XlaCompiler::CompileGraph(
+    const XlaCompiler::CompileOptions& options, string const& name,
+    std::unique_ptr<Graph> graph, absl::Span<const XlaCompiler::Argument> args,
+    absl::Span<const xla::XlaBuilder::InputOutputAlias> user_aliases,
+    CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate XlaBuilder.";
 
   TF_RETURN_IF_ERROR(PropagateConstIntoFunctionalNodes(
       graph.get(), options_.flib_def, local_flib_def_.get()));
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "XlaCompiler::CompileGraph: "
-            << dump_graph::DumpGraphToFile(
-                   absl::StrCat("xla_compile_graph_", name), *graph,
-                   flib_runtime_->GetFunctionLibraryDefinition());
+            << DumpGraphToFile(absl::StrCat("xla_compile_graph_", name), *graph,
+                               flib_runtime_->GetFunctionLibraryDefinition());
   }
 
   // Report the error here if initialization failed.
@@ -978,6 +1093,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       options.is_entry_computation));
   context->set_args(std::move(arg_expressions));
 
+  // Propagate any aliases given to us by the user.
+  for (const xla::XlaBuilder::InputOutputAlias& alias : user_aliases) {
+    builder.SetUpAlias(alias.output_index, alias.param_number,
+                       alias.param_index);
+  }
+
   PushNodeTokenMapping();
   // Use std::set instead of std::unordered_set to ensure determinism.
   std::set<std::string> output_node_token_inputs;
@@ -1023,8 +1144,17 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   result->outputs.resize(context->retvals().size());
   std::vector<XlaExpression> retvals = context->retvals();
   if (options.resolve_compile_time_constants) {
-    TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants(
-        client(), absl::Span<XlaExpression>(retvals)));
+    Status status = ResolveConstantExpressionsToConstants(
+        client(), absl::Span<XlaExpression>(retvals));
+
+    // If the HloEvaluator has not implemented an expression, just evaluate it
+    // at runtime.
+    if (status.code() == error::UNIMPLEMENTED) {
+      ConvertConstantsToExpressions(&builder,
+                                    absl::Span<XlaExpression>(retvals));
+    } else {
+      TF_RETURN_IF_ERROR(status);
+    }
   } else {
     ConvertConstantsToExpressions(&builder, absl::Span<XlaExpression>(retvals));
   }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 0d801b73a8c2651305328384377751254ecaa41d..0b0908e9d6913f2664e4d976611b1218be44ff2b 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <stack>
 
 #include "absl/types/span.h"
+#include "absl/types/variant.h"
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_expression.h"
@@ -124,7 +125,8 @@ class XlaCompiler {
     DataType type = DT_INVALID;
 
     // The shape of the argument. For:
-    // * a parameter: the shape of the parameter.
+    // * a parameter: the shape of the parameter. We allow setting the xla shape
+    //   if known. This helps avoid conversions to and from TensorShape.
     // * a constant: ignored; the shape given by constant_value is used
     //     instead.
     // * an uninitialized resource: ignored. We don't yet know the shape of an
@@ -133,7 +135,7 @@ class XlaCompiler {
     // * an initialized TensorArray or Stack resource: the shape of an entry in
     //   the TensorArray/Stack. Note this is the size of a single entry, not the
     //   XLA data structure that represents the complete stack/array.
-    TensorShape shape;
+    absl::variant<TensorShape, xla::Shape> shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -157,10 +159,20 @@ class XlaCompiler {
     // as `tensor_array_gradients`.
     std::set<string> tensor_array_gradients;
 
+    // dynamic dims to arg number map. Empty if no dynamic shapes.
+    std::map<int32, int32> dynamic_dim_to_arg_num_map;
+    bool is_pad_arg = false;
+
     bool operator==(const Argument& other) const;
 
     // Returns a human-readable summary of the argument.
     string HumanString() const;
+
+    // Returns the dimension sizes for either TensorShape or xla::Shape.
+    std::vector<int64> DimensionSizes() const;
+
+    // Returns the human-readable string for either TensorShape or xla::Shape.
+    string ShapeHumanString() const;
   };
 
   // Options pertaining to an individual call to CompileGraph() or
@@ -327,10 +339,11 @@ class XlaCompiler {
   // Compiles a tensorflow::Graph into an xla::XlaComputation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
-  Status CompileGraph(const CompileOptions& options, string const& name,
-                      std::unique_ptr<Graph> graph,
-                      absl::Span<const Argument> args,
-                      CompilationResult* result);
+  Status CompileGraph(
+      const CompileOptions& options, string const& name,
+      std::unique_ptr<Graph> graph, absl::Span<const Argument> args,
+      absl::Span<const xla::XlaBuilder::InputOutputAlias> user_aliases,
+      CompilationResult* result);
 
   // Compiles a single Op, given by `node_def`, into an
   // xla::XlaComputation. Similar to CompileFunction but takes a single Op as
@@ -404,11 +417,11 @@ class XlaCompiler {
   Status SetNodeToken(const string& node_name, const xla::XlaOp& op);
   xla::StatusOr<xla::XlaOp> GetNodeToken(const string& node_name);
 
- private:
   // Sets the function body `fbody` to the one registered as `function`.
   Status FindFunctionBody(const NameAttrList& function,
                           const FunctionBody** fbody);
 
+ private:
   // Returns the optimized graph object in this function body.
   std::unique_ptr<Graph> GetGraph(const FunctionBody* fbody);
 
@@ -420,7 +433,7 @@ class XlaCompiler {
                         XlaContext* context,
                         const std::map<int, int>& arg_cores,
                         std::vector<XlaExpression>* arg_expressions,
-                        std::vector<int>* input_mapping,
+                        std::vector<int>* input_to_args,
                         std::vector<xla::Shape>* input_shapes,
                         bool is_entry_computation);
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 492010f7317d32a8a620147cd2cd9356d4f13fde..1818d4290324aa398f8f90ff11725dc48948b621 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -175,9 +175,9 @@ TEST_F(XlaCompilerTest, EmptyReturnValues) {
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph),
-                                     /*args=*/{}, &result));
+  TF_ASSERT_OK(compiler.CompileGraph(
+      XlaCompiler::CompileOptions(), "add", std::move(graph),
+      /*args=*/{}, /*user_aliases=*/{}, &result));
 
   TF_ASSERT_OK(client_->Execute(*result.computation, {}).status());
 }
@@ -207,7 +207,8 @@ TEST_F(XlaCompilerTest, Simple) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   // Tests that the generated computation works.
   xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
@@ -258,7 +259,7 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
   compile_options.always_return_tuple = false;
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
-                                     args, &result));
+                                     args, /*user_aliases=*/{}, &result));
 
   // Tests that the generated computation works.
   xla::Literal param0_literal = xla::LiteralUtil::CreateR1<int32>({7, 42});
@@ -277,6 +278,99 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(param0_literal, actual_literal));
 }
 
+// Tests that the compiler can correctly propagate the layout assigned by
+// shape_representation_fn_ to return types.
+TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType dt) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dt, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
+// The layout of resource variable shouldn't change after transpose
+TEST_F(XlaCompilerTest, TransposeVariables) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto transposed_read = ops::Transpose(scope, read, {1, 0});
+  auto reshape = ops::Reshape(scope, transposed_read, {2, 3});
+  auto d = ops::_Retval(scope.WithOpName("D"), reshape, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+  // Compiles the graph.
+  XlaCompiler compiler(DefaultOptions());
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "transpose",
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {1, 0});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
 // Tests that the compiler doesn't reorder the parameters.
 TEST_F(XlaCompilerTest, MixedOrderArguments) {
   for (bool swap_order : {false, true}) {
@@ -319,7 +413,7 @@ TEST_F(XlaCompilerTest, MixedOrderArguments) {
     compile_options.always_return_tuple = false;
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
-                                       args, &result));
+                                       args, /*user_aliases=*/{}, &result));
 
     EXPECT_THAT(result.input_mapping, ::testing::ElementsAre(0, 1));
   }
@@ -349,9 +443,9 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
   XlaCompiler compiler(DefaultOptions());
 
   XlaCompiler::CompilationResult result;
-  Status status =
-      compiler.CompileGraph(XlaCompiler::CompileOptions(), "reshape",
-                            std::move(graph), args, &result);
+  Status status = compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                        "reshape", std::move(graph), args,
+                                        /*user_aliases=*/{}, &result);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(
       absl::StrContains(status.error_message(), "depends on a parameter"))
@@ -395,7 +489,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     compile_options.resolve_compile_time_constants = true;
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
-                                       std::move(graph_copy), args, &result));
+                                       std::move(graph_copy), args,
+                                       /*user_aliases=*/{}, &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_TRUE(result.outputs[0].is_constant);
@@ -428,7 +523,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     compile_options.resolve_compile_time_constants = false;
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
-                                       std::move(graph_copy), args, &result));
+                                       std::move(graph_copy), args,
+                                       /*user_aliases=*/{}, &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_FALSE(result.outputs[0].is_constant);
@@ -514,7 +610,8 @@ TEST_F(XlaCompilerTest, ConstantOutputsOfFunctionalNode) {
   compile_options.resolve_compile_time_constants = true;
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   ASSERT_EQ(2, result.outputs.size());
   EXPECT_TRUE(result.outputs[0].is_constant);
@@ -556,7 +653,8 @@ TEST_F(XlaCompilerTest, ResourceManager) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "dummy",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   EXPECT_EQ(1, resource->Get());
 
@@ -592,7 +690,8 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) {
     XlaCompiler compiler(options);
 
     TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "dummy",
-                                       std::move(graph), args, &results[i]));
+                                       std::move(graph), args,
+                                       /*user_aliases=*/{}, &results[i]));
   }
 
   for (int64 i = 1; i < test_count; ++i) {
@@ -658,7 +757,8 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   ASSERT_EQ(1, result.resource_updates.size());
   const XlaCompiler::ResourceUpdate& update = result.resource_updates[0];
@@ -717,7 +817,8 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   EXPECT_EQ(0, result.resource_updates.size());
 }
@@ -749,7 +850,8 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   EXPECT_EQ(1, result.resource_updates.size());
 }
@@ -824,7 +926,8 @@ TEST_F(XlaCompilerTest, FunctionCallWithConstants) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 }
 
 // Tests CompileFunction with a local function lookup failing, fails with
@@ -907,7 +1010,8 @@ TEST_F(XlaCompilerTest, Variables) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
   RunAndCheckVariablesComputation(client_, result);
 }
 
@@ -942,7 +1046,7 @@ TEST_F(XlaCompilerTest, ResultLayoutSingle) {
   auto compile_options = XlaCompiler::CompileOptions();
   compile_options.always_return_tuple = false;
   TF_ASSERT_OK(compiler.CompileGraph(compile_options, "id", std::move(graph),
-                                     args, &result));
+                                     args, /*user_aliases=*/{}, &result));
   EXPECT_TRUE(xla::ShapeUtil::Equal(
       result.xla_output_shape,
       xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1})));
@@ -978,7 +1082,8 @@ TEST_F(XlaCompilerTest, ResultLayoutMultiple) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "id",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
   xla::Shape result_shape =
       xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
 
@@ -1008,7 +1113,8 @@ TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
 
   // Tests that the generated computation works.
   xla::Literal param1_literal = xla::LiteralUtil::CreateR1<int32>({-3, 101});
@@ -1058,7 +1164,8 @@ TEST_F(XlaCompilerTest, ReturnResourceHandle) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
-                                     std::move(graph), args, &result));
+                                     std::move(graph), args,
+                                     /*user_aliases=*/{}, &result));
   RunAndCheckVariablesComputation(client_, result);
 }
 
@@ -1109,7 +1216,7 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
-                                     args, &result));
+                                     args, /*user_aliases=*/{}, &result));
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::ProgramShape> program_shape,
                           client_->GetComputationShape(*result.computation));
@@ -1179,7 +1286,7 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
 
   XlaCompiler::CompilationResult result;
   TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
-                                     args, &result));
+                                     args, /*user_aliases=*/{}, &result));
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::ProgramShape> program_shape,
                           client_->GetComputationShape(*result.computation));
@@ -1260,7 +1367,8 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
   std::vector<XlaCompiler::Argument> args;
   XlaCompiler::CompilationResult result;
   status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill",
-                                 std::move(graph), args, &result);
+                                 std::move(graph), args, /*user_aliases=*/{},
+                                 &result);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(absl::StrContains(status.error_message(), "InvalidOp"))
       << status.error_message();
@@ -1285,7 +1393,8 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
   XlaCompiler::CompilationResult result;
   XlaCompiler compiler(DefaultOptions());
   status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type",
-                                 std::move(graph), args, &result);
+                                 std::move(graph), args, /*user_aliases=*/{},
+                                 &result);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(absl::StrContains(status.error_message(),
                                 "is not in the list of allowed values"))
@@ -1311,7 +1420,8 @@ TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
     CopyGraph(*graph, graph_copy.get());
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp",
-                                       std::move(graph_copy), args, &result));
+                                       std::move(graph_copy), args,
+                                       /*user_aliases=*/{}, &result));
   }
 }
 
@@ -1360,7 +1470,7 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     CopyGraph(*graph, graph_copy.get());
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
-                                       args, &result));
+                                       args, /*user_aliases=*/{}, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 1);
     EXPECT_TRUE(result.xla_output_shape.IsTuple());
     EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 1);
@@ -1378,7 +1488,7 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     CopyGraph(*graph, graph_copy.get());
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
-                                       args, &result));
+                                       args, /*user_aliases=*/{}, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 2);
     EXPECT_TRUE(result.xla_input_shapes[1].IsToken());
     EXPECT_TRUE(result.xla_output_shape.IsTuple());
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 6139bf3cea0790c2697130a993e92be96c81848b..3f787fd86c9f7366a7728dcf146a3797ba672bc3 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -76,7 +76,7 @@ XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
-  return LookupOrCreate(type, &max_func_, [this, type] {
+  return LookupOrCreate(type, &max_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Max() for " << type_string;
     xla::XlaBuilder b("max<" + type_string + ">");
@@ -92,7 +92,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
-  return LookupOrCreate(type, &min_func_, [this, type] {
+  return LookupOrCreate(type, &min_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Min() for " << type_string;
     xla::XlaBuilder b("min<" + type_string + ">");
@@ -108,7 +108,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
-  return LookupOrCreate(type, &add_func_, [this, type] {
+  return LookupOrCreate(type, &add_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Add() for " << type_string;
     xla::XlaBuilder b("add<" + type_string + ">");
@@ -124,7 +124,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) {
-  return LookupOrCreate(type, &mul_func_, [this, type] {
+  return LookupOrCreate(type, &mul_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Mul() for " << type_string;
     xla::XlaBuilder b("mul<" + type_string + ">");
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
index ca0309166b7c73d1a5a818091e2a30fa112a4de4..3d228c92adcbe3d093a4fe70d157e57ab3e80c80 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.cc
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -46,6 +46,14 @@ XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) {
   return e;
 }
 
+XlaExpression XlaExpression::TensorList(xla::XlaOp tensor_list) {
+  XlaExpression e;
+  e.kind_ = Kind::kTensorList;
+  e.dtype_ = DT_VARIANT;
+  e.handle_ = tensor_list;
+  return e;
+}
+
 XlaExpression XlaExpression::Resource(XlaResource* resource) {
   XlaExpression e;
   e.kind_ = Kind::kResource;
@@ -64,6 +72,8 @@ string XlaExpression::HumanString() const {
       return "xla_op";
     case Kind::kResource:
       return "resource";
+    case Kind::kTensorList:
+      return "tensor_list";
   }
 }
 
@@ -76,6 +86,8 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const {
             HostTensorToBorrowingLiteral(constant_value_, &literal));
         return xla::ConstantLiteral(builder, literal);
       }
+      case Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case Kind::kXlaOp:
         if (builder != handle_.builder()) {
           return errors::InvalidArgument(
@@ -96,7 +108,10 @@ xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant(
       return {constant_value()};
     case Kind::kXlaOp:
       break;
+    case Kind::kTensorList:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kResource:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kInvalid:
       return errors::InvalidArgument(
           "ResolveConstant called on XlaExpression: ", HumanString());
@@ -134,6 +149,8 @@ xla::StatusOr<TensorShape> XlaExpression::GetShape() const {
       TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape));
       return shape;
     }
+    case Kind::kTensorList:
+      return TensorShape({});
     case Kind::kResource:
       return TensorShape({});
     case Kind::kInvalid:
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
index bed6761d362a98d344003c1edea342e68c31ef07..ac0232d8924cf2c9e35ad3f0772a3a2adc18af87 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.h
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -32,11 +32,16 @@ namespace tensorflow {
 // * a constant tensor.
 // * an xla::XlaOp, representing a symbolic XLA value.
 // * a resource, e.g., a variable, represented as an XlaResource pointer.
+// * a tensor list, represented by a tuple of tensors and the list length.
 //
 // Constant tensors are mostly an optimization to avoid passing large constants
 // to XLA, but are also sometimes used to represent tensors that have no XLA
 // representation, for example, DT_STRING tensors. A canonical use case might be
 // an error message string.
+//
+// Tensor lists are very similar to xla::XlaOp, however they require some
+// specific logic around shape management since the tuples are not supported by
+// TensorFlow.
 class XlaExpression {
  public:
   enum class Kind {
@@ -44,6 +49,7 @@ class XlaExpression {
     kConstant,
     kXlaOp,
     kResource,
+    kTensorList,
   };
 
   XlaExpression();
@@ -62,6 +68,9 @@ class XlaExpression {
   // be derived from the XLA type.
   static XlaExpression XlaOp(xla::XlaOp value, DataType dtype);
 
+  // Builds a tensor list expression.
+  static XlaExpression TensorList(xla::XlaOp tensor_list);
+
   // Builds a resource expression.
   static XlaExpression Resource(XlaResource* resource);
 
@@ -100,7 +109,8 @@ class XlaExpression {
 
   DataType dtype_ = DT_INVALID;
 
-  // The XLA handle of the expression's computation, if kind_ == kXlaOp.
+  // The XLA handle of the expression's computation, if kind_ == kXlaOp or
+  // a tuple expression if kind_ == kTensorList.
   xla::XlaOp handle_;
 
   // The value of the constant, if kind_ == kConstant.
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 04a5d934064a9083a41cc210b48df65bbc862fff..7bb1ad27467a5b281626de4203169e575288f9ee 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -81,61 +81,27 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
   return Status::OK();
 }
 
-template <typename T>
-static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
-  Tensor linspace(DataTypeToEnum<T>::v(), shape);
-  auto linspace_flat = linspace.flat<T>();
-  for (int64 i = 0; i < depth; ++i) {
-    linspace_flat(i) = i;
-  }
-  return linspace;
-}
-
 Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
                           DataType index_type, const TensorShape& indices_shape,
                           const xla::XlaOp& indices, const xla::XlaOp& on_value,
                           const xla::XlaOp& off_value, xla::XlaOp* one_hot) {
-  const int indices_dims = indices_shape.dims();
-  const int output_dims = indices_dims + 1;
-
-  TensorShape output_shape = indices_shape;
-  output_shape.InsertDim(axis, depth);
-
-  // Build a Tensor populated with values 0, 1, 2, ... depth.
-  std::vector<int64> linspace_dims(output_dims, 1);
-  linspace_dims[axis] = depth;
-  TensorShape linspace_shape(linspace_dims);
-  Tensor linspace;
-  switch (index_type) {
-    case DT_UINT8:
-      linspace = MakeLinspaceTensor<uint8>(linspace_shape, depth);
-      break;
-    case DT_INT32:
-      linspace = MakeLinspaceTensor<int32>(linspace_shape, depth);
-      break;
-    case DT_INT64:
-      linspace = MakeLinspaceTensor<int64>(linspace_shape, depth);
-      break;
-    default:
-      return errors::InvalidArgument("Invalid argument type ",
-                                     DataTypeString(index_type));
-  }
-
-  xla::BorrowingLiteral linspace_literal;
-  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal));
-
   // Broadcast the linspace constant across the indices along the new axis,
   // and test equality at each position.
   std::vector<int64> broadcast_dims(indices_shape.dims());
   std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
   std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-  xla::XlaOp one_hot_bool = xla::Eq(
-      indices, xla::ConstantLiteral(builder, linspace_literal), broadcast_dims);
+
+  TensorShape output_shape = indices_shape;
+  output_shape.InsertDim(axis, depth);
+  xla::Shape iota_shape;
+  TF_RETURN_IF_ERROR(
+      TensorShapeToXLAShape(index_type, output_shape, &iota_shape));
 
   // Selects the user-provided off_value and on_value values.
-  *one_hot = xla::Select(one_hot_bool,
-                         xla::Broadcast(on_value, output_shape.dim_sizes()),
-                         xla::Broadcast(off_value, output_shape.dim_sizes()));
+  *one_hot = xla::Select(
+      xla::Eq(indices, xla::Iota(builder, iota_shape, axis), broadcast_dims),
+      xla::Broadcast(on_value, output_shape.dim_sizes()),
+      xla::Broadcast(off_value, output_shape.dim_sizes()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 58bd173e61aa3263fae4b494914707833c7a624f..ee11f3a3de658c7e5108605122b84fbc3e1cd963 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -93,7 +93,7 @@ TensorShape XlaOpKernelContext::InputShape(absl::string_view name) {
 }
 
 DataType XlaOpKernelContext::input_type(int index) const {
-  return context_->input(index).dtype();
+  return context_->input_dtype(index);
 }
 
 DataType XlaOpKernelContext::InputType(absl::string_view name) {
@@ -229,7 +229,8 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) {
 static Status LiteralToInt64Vector(const xla::LiteralSlice& literal,
                                    std::vector<int64>* out) {
   if (literal.shape().rank() != 1) {
-    return errors::InvalidArgument("value is not 1D");
+    return errors::InvalidArgument("value is not 1D, rank: ",
+                                   literal.shape().rank());
   }
   int64 size = xla::ShapeUtil::ElementsIn(literal.shape());
   if (literal.shape().element_type() == xla::S32) {
@@ -318,6 +319,27 @@ Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) {
   return Status::OK();
 }
 
+Status XlaOpKernelContext::ConstantInputAsPartialShape(
+    int index, PartialTensorShape* shape) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  // If `literal` is a scalar it's value must be -1.
+  if (literal.shape().rank() == 0) {
+    int64 shape_val;
+    TF_RETURN_IF_ERROR(LiteralToInt64Scalar(literal, &shape_val));
+    if (shape_val != -1) {
+      return errors::InvalidArgument(
+          "Cannot convert value to PartialTensorShape: ", shape_val);
+    }
+    *shape = PartialTensorShape();  // Shape with unknown rank.
+    return Status::OK();
+  }
+  std::vector<int64> dims;
+  TF_RETURN_IF_ERROR(LiteralToInt64Vector(literal, &dims));
+  *shape = PartialTensorShape(dims);
+  return Status::OK();
+}
+
 Status XlaOpKernelContext::InputList(absl::string_view name,
                                      std::vector<xla::XlaOp>* handles,
                                      std::vector<TensorShape>* shapes) {
@@ -353,8 +375,8 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
   TF_RET_CHECK(variable != nullptr);
   TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
   if (!variable->initialized()) {
-    return errors::InvalidArgument("Read of uninitialized variable ",
-                                   variable->name());
+    return errors::FailedPrecondition("Read of uninitialized variable ",
+                                      variable->name());
   }
   if (variable->type() != type) {
     return errors::InvalidArgument(
@@ -446,6 +468,16 @@ void XlaOpKernelContext::SetOutputExpression(int index,
   }
 }
 
+xla::PrimitiveType XlaOpKernelContext::output_xla_type(int index) {
+  xla::PrimitiveType type;
+  Status status = DataTypeToPrimitiveType(expected_output_dtype(index), &type);
+  if (!status.ok()) {
+    SetStatus(status);
+    return xla::PRIMITIVE_TYPE_INVALID;
+  }
+  return type;
+}
+
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
   SetOutputExpression(
       index,
@@ -456,6 +488,11 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   SetOutputExpression(index, XlaExpression::Constant(constant));
 }
 
+void XlaOpKernelContext::SetTensorListOutput(int index,
+                                             const xla::XlaOp& handle) {
+  SetOutputExpression(index, XlaExpression::TensorList(handle));
+}
+
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
   SetOutputExpression(index, XlaExpression::Resource(resource));
 }
@@ -497,6 +534,7 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
     handle = xla::Reshape(handle,
                           xla::AsInt64Slice(representation_shape.dimensions()));
   }
+  variable->SetRepresentationShape(representation_shape);
   return variable->SetValue(handle);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 1858844bc05a6e12abbf07af83cad816590ddd03..cc2d5e8de3eb020ba41dfed7d730b48cd0534b4c 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -138,6 +138,10 @@ class XlaOpKernelContext {
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
 
+  // Converts a constant 1D int32 or int64 tensor, or a scalar with value -1
+  // into a PartialTensorShape.
+  Status ConstantInputAsPartialShape(int index, PartialTensorShape* shape);
+
   // Returns the named list-valued immutable input in "list", as
   // defined in the OpDef.  If the named output is not list-valued,
   // returns a one-element list.
@@ -155,6 +159,11 @@ class XlaOpKernelContext {
     return context_->expected_output_dtype(index);
   }
 
+  // Returns the type of output `index` as an xla::PrimitiveType. If the type
+  // is not representable as an XLA type, sets an error status and returns
+  // xla::PRIMITIVE_TYPE_INVALID.
+  xla::PrimitiveType output_xla_type(int index);
+
   // Sets output `index` to the XlaOp `handle`.
   // All outputs should be set using SetOutput and SetConstantOutput, not
   // via the underlying OpKernelContext.
@@ -168,6 +177,9 @@ class XlaOpKernelContext {
   // Returns an XlaExpression describing the value of 'index'.
   void SetOutputExpression(int index, const XlaExpression& expression);
 
+  // Sets output `index` to the Tensor List `handle`.
+  void SetTensorListOutput(int index, const xla::XlaOp& handle);
+
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 14237df69081016817fbd1a5332f22996e7f264d..1106c027c034938b5d454a4b26e2925f65100a3d 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -73,6 +73,11 @@ XlaOpRegistry::~XlaOpRegistry() = default;
                  << " have incompatible allow_resource_types settings.";
     return false;
   }
+  if (x.allow_variant_types != y.allow_variant_types) {
+    LOG(WARNING) << "Registrations of " << x.name
+                 << " have incompatible allow_variant_types settings.";
+    return false;
+  }
   if (!x.has_device_whitelist && !y.has_device_whitelist) {
     LOG(WARNING) << "Duplicate registrations of " << x.name
                  << "with no device whitelists.";
@@ -132,6 +137,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
   static void* registration_init = [&registry]() {
     MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
     bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+    VLOG(2) << "tf_xla_cpu_global_jit = " << cpu_global_jit;
 
     mutex_lock lock(registry.mutex_);
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) {
@@ -142,7 +148,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
           cpu_global_jit
               ? XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally
               : XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested;
-      registration.compile_resource_ops = false;
+      registration.compile_all_resource_ops = false;
     }
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) {
       DeviceRegistration& registration =
@@ -150,7 +156,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
       registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
       registration.autoclustering_policy =
           XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally;
-      registration.compile_resource_ops = false;
+      registration.compile_all_resource_ops = false;
     }
     return nullptr;
   }();
@@ -289,6 +295,9 @@ void XlaOpRegistry::RegisterCompilationKernels() {
           if (op_registration->allow_resource_types) {
             allowed_values->add_type(DT_RESOURCE);
           }
+          if (op_registration->allow_variant_types) {
+            allowed_values->add_type(DT_VARIANT);
+          }
           // Don't build KernelDefs that have unsatisfiable type constraints.
           if (allowed_values->type().empty()) {
             unsatisfiable_type_constraint = true;
@@ -485,6 +494,11 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowResourceTypes() {
   return *this;
 }
 
+XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowVariantTypes() {
+  registration_->allow_variant_types = true;
+  return *this;
+}
+
 XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
     absl::string_view attr_name, DataType allowed) {
   std::set<DataType>& types =
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index ce3b6b298c6dc5a08e7b794bbab3a28575967d28..bf4d2e1a9ddb2f89889d2ec15f4e685fa39b51cf 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -51,10 +51,10 @@ constexpr std::array<DataType, 12> kNumericTypes = {
     {DT_UINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_INT32, DT_INT64, DT_HALF,
      DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 15> kCpuAllTypes = {
+constexpr std::array<DataType, 16> kCpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
      DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
-     DT_COMPLEX128, DT_BOOL}};
+     DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}};
 
 constexpr std::array<DataType, 15> kGpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
@@ -89,7 +89,7 @@ class XlaOpRegistry {
     AutoclusteringPolicy autoclustering_policy;
 
     // Enable compilation of operators that use DT_RESOURCE types?
-    bool compile_resource_ops = false;
+    bool compile_all_resource_ops = false;
   };
 
   // Registers an XLA backend. `compilation_device_name` is the name of the
@@ -212,6 +212,10 @@ class XlaOpRegistry {
     // allow DT_RESOURCE.
     bool allow_resource_types = false;
 
+    // Should we allow variant types for type attributes? Used by While to
+    // allow TensorList which is of type DT_VARIANT.
+    bool allow_variant_types = false;
+
     // Mapping from attribute name to a list of supported types.
     std::unordered_map<string, std::set<DataType>> type_constraints;
 
@@ -233,9 +237,9 @@ class XlaOpRegistry {
 
   // Returns true if registrations x and y can both be added to the registry.
   // This is always the case if they refer to different ops. If they refer to
-  // the same op name, they must: have the same values for compilation_only and
-  // allow_resource_types; use a device_whitelist; and their
-  // whitelists must not intersect.
+  // the same op name, they must: have the same values for compilation_only,
+  // allow_resource_types and allow_variant_types; use a device_whitelist; and
+  // their whitelists must not intersect.
   static bool IsCompatible(const OpRegistration& x, const OpRegistration& y);
 
   static Status CompileTimeConstantInputs(const NodeDef& node_def,
@@ -293,6 +297,9 @@ class XlaOpRegistrationBuilder {
   // Allow DT_RESOURCE types for type parameters.
   XlaOpRegistrationBuilder& AllowResourceTypes();
 
+  // Allow DT_VARIANT types for type parameters.
+  XlaOpRegistrationBuilder& AllowVariantTypes();
+
   // Mark 'input_name' as an argument whose value must be known at compile-time.
   XlaOpRegistrationBuilder& CompileTimeConstantInput(
       absl::string_view input_name);
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 736588bb8b89ba756cdce77eeebff8d1fcf4774c..ab3a5bdd9bc580c16d65d35c3be3ba8204511f83 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -86,6 +86,12 @@ class XlaResource {
   // variables have new values that need to be written back.
   const xla::XlaOp& initial_value() const { return initial_value_; }
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  const absl::optional<xla::Shape>& representation_shape() const {
+    return representation_shape_;
+  }
+
   // A variable is initialized if it has a value.
   bool initialized() const { return value_.valid(); }
 
@@ -100,6 +106,11 @@ class XlaResource {
   // Sets the current value of the resource to an all-zero value.
   Status SetZeroValue(xla::XlaBuilder* builder);
 
+  // Sets the representational shape of the resource on device.
+  void SetRepresentationShape(const xla::Shape& shape) {
+    representation_shape_ = absl::make_optional(shape);
+  }
+
   // Looks up the gradient for `source`, or creates it if it does not already
   // exist. The call target must be an initialized TensorArray resource. A
   // TensorArray can have multiple named gradients; see the operator
@@ -160,6 +171,10 @@ class XlaResource {
   xla::XlaOp value_;
   xla::XlaOp initial_value_;
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  absl::optional<xla::Shape> representation_shape_;
+
   int64 max_array_size_ = -1;
   bool tensor_array_multiple_writes_aggregate_ = false;
 
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 636e5ef721f58c009566c10a653d09a7667619c0..ea3378085b94af466972128f7c18fd6b8b0984bc 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -57,6 +57,24 @@ xla_proto_library(
     ],
 )
 
+cc_library(
+    name = "comparison_util",
+    srcs = [
+        "comparison_util.cc",
+    ],
+    hdrs = [
+        "comparison_util.h",
+    ],
+    visibility = [":friends"],
+    deps = [
+        ":statusor",
+        ":types",
+        ":util",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 cc_library(
     name = "execution_options_util",
     srcs = [
@@ -150,8 +168,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":status",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor/lib",
     ],
 )
@@ -194,7 +210,7 @@ cc_library(
         ":types",
         ":util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
     ],
 )
 
@@ -830,10 +846,10 @@ cc_library(
     deps =
         [
             ":parse_flags_from_env",
+            ":status",
             "//tensorflow/compiler/xla:xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
             "@com_google_absl//absl/strings",
         ],
 )
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 27c075e8f13f6777af4e837501d97a33034313f5..d5ade8f626205b0c60dadd2216c9a49d1112832b 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -118,6 +118,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:local_service",
@@ -212,6 +213,7 @@ cc_library(
         ":padding",
         ":sharding_builder",
         ":xla_computation",
+        "//tensorflow/compiler/xla:comparison_util",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -246,6 +248,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 1f594e551af381d7537e947892cbf7e0b5b3b861..f2d124d099b09af0612c6b23d899467c170edcd6 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -58,6 +58,12 @@ const Shape* ExecutableBuildOptions::result_layout() const {
   return result_layout_set_ ? &result_layout_ : nullptr;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_num_replicas(
+    int num_replicas) {
+  num_replicas_ = num_replicas;
+  return *this;
+}
+
 string ExecutableBuildOptions::ToString() const {
   string result_layout = "nullopt";
   if (result_layout_set_) {
@@ -65,8 +71,8 @@ string ExecutableBuildOptions::ToString() const {
   }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
-      "generate_hlo_graph=%s}",
-      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph());
+      "num_replicas=%d}",
+      device_ordinal_, result_layout, num_replicas_);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index a58090253bfac7779e4b61bc7231a0f0d945cc00..1d85fb34304b95d1fccdb0b0d6a7a65e739fae18 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -67,12 +67,18 @@ class ExecutableBuildOptions {
   // debugging.
   string ToString() const;
 
+  // The number of replicas of this computation that are to be executed.
+  // Defaults to 1.
+  int num_replicas() const { return num_replicas_; }
+  ExecutableBuildOptions& set_num_replicas(int num_replicas);
+
  private:
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
+  int num_replicas_ = 1;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 27f9c61848dfb0a2c6739081f2893713412b975a..1ddd3c2a4550605dd78a9d0079bea58f7d3a6582 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -49,48 +49,6 @@ xla_test(
     ],
 )
 
-cc_library(
-    name = "cholesky",
-    srcs = ["cholesky.cc"],
-    hdrs = ["cholesky.h"],
-    deps = [
-        ":math",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:loops",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
-        "//tensorflow/core:lib",
-    ],
-)
-
-xla_test(
-    name = "cholesky_test",
-    srcs = ["cholesky_test.cc"],
-    tags = ["optonly"],
-    deps = [
-        ":arithmetic",
-        ":cholesky",
-        ":matrix",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "comparators",
     srcs = ["comparators.cc"],
@@ -184,6 +142,7 @@ cc_library(
     srcs = ["math.cc"],
     hdrs = ["math.h"],
     deps = [
+        ":arithmetic",
         ":constants",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -195,8 +154,10 @@ xla_test(
     name = "math_test",
     srcs = ["math_test.cc"],
     deps = [
+        ":constants",
         ":math",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -213,13 +174,18 @@ cc_library(
     deps = [
         ":arithmetic",
         ":constants",
+        ":slicing",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -230,12 +196,16 @@ xla_test(
     deps = [
         ":matrix",
         ":slicing",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -274,7 +244,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
     ],
 )
@@ -326,12 +295,7 @@ cc_library(
     srcs = ["slicing.cc"],
     hdrs = ["slicing.h"],
     deps = [
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
     ],
@@ -345,7 +309,6 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -357,6 +320,7 @@ cc_library(
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
+        ":comparators",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -372,7 +336,6 @@ xla_test(
         ":sorting",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -433,51 +396,105 @@ cc_library(
 )
 
 cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
+    name = "self_adjoint_eig",
+    srcs = ["self_adjoint_eig.cc"],
+    hdrs = ["self_adjoint_eig.h"],
     deps = [
-        "//tensorflow/compiler/xla:literal",
+        ":arithmetic",
+        ":comparators",
+        ":constants",
+        ":loops",
+        ":math",
+        ":matrix",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/core:lib",
     ],
 )
 
 xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-        "noasan",  # sometimes times out, http://b/78650012
+    name = "self_adjoint_eig_test",
+    srcs = ["self_adjoint_eig_test.cc"],
+    blacklisted_backends = [
+        "cpu",
+        "gpu",
     ],
+    real_hardware_only = True,
+    shard_count = 10,
+    tags = ["optonly"],
     deps = [
+        ":arithmetic",
+        ":constants",
+        ":matrix",
+        ":self_adjoint_eig",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "svd",
+    srcs = ["svd.cc"],
+    hdrs = ["svd.h"],
+    deps = [
+        ":arithmetic",
+        ":comparators",
+        ":constants",
+        ":loops",
         ":math",
         ":matrix",
-        ":triangular_solve",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "svd_test",
+    srcs = ["svd_test.cc"],
+    blacklisted_backends = [
+        "cpu",
+        "gpu",
+    ],
+    real_hardware_only = True,
+    shard_count = 10,
+    tags = ["optonly"],
+    deps = [
+        ":arithmetic",
+        ":constants",
+        ":matrix",
+        ":slicing",
+        ":svd",
         "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/cholesky.h b/tensorflow/compiler/xla/client/lib/cholesky.h
deleted file mode 100644
index 0bae26837c0f14dd0cfab82cf426becc787ec11c..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/cholesky.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// Computes the Cholesky decompositions of a batch of symmetric positive
-// definite matrices.
-// `a` must be a (batched) square matrix; i.e., it must have rank >= 2 with the
-// two minor dimensions equal.
-// The algorithm implements a blocked Cholesky decomposition; `block_size` is
-// the block size to use.
-// TODO(phawkins): check for negative values on the diagonal and return an
-// error, instead of silently yielding NaNs.
-// TODO(znado): handle the complex Hermitian case
-xla::XlaOp Cholesky(
-    xla::XlaOp a, int64 block_size = 256,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc
index 1ada7b4a964ccf7ca400b937abbe425bef083468..6bd56a8df0a5d0417f747a158664ed0daa8a7b40 100644
--- a/tensorflow/compiler/xla/client/lib/constants.cc
+++ b/tensorflow/compiler/xla/client/lib/constants.cc
@@ -80,6 +80,24 @@ XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type) {
+  switch (type) {
+    case F16:
+      return ConstantR0<Eigen::half>(builder,
+                                     std::numeric_limits<Eigen::half>::min());
+    case BF16:
+      return ConstantR0<bfloat16>(builder, bfloat16::min_positive_normal());
+    case F32:
+      return ConstantR0<float>(builder, std::numeric_limits<float>::min());
+    case F64:
+      return ConstantR0<double>(builder, std::numeric_limits<double>::min());
+    default:
+      return builder->ReportError(
+          InvalidArgument("Invalid type for MinPositiveNormalValue (%s).",
+                          PrimitiveType_Name(type)));
+  }
+}
+
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type) {
   return ConstantLiteral(builder, LiteralUtil::MaxValue(type));
 }
@@ -100,4 +118,28 @@ XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type) {
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    switch (type) {
+      case F16:
+        return ConstantR0<Eigen::half>(
+            builder, Eigen::NumTraits<Eigen::half>::quiet_NaN());
+      case BF16:
+        return ConstantR0<bfloat16>(
+            builder, bfloat16(std::numeric_limits<float>::quiet_NaN()));
+      case F32:
+        return ConstantR0<float>(builder,
+                                 std::numeric_limits<float>::quiet_NaN());
+      case F64:
+        return ConstantR0<double>(builder,
+                                  std::numeric_limits<double>::quiet_NaN());
+      default:
+        return InvalidArgument(
+            "Operand to NanValue was %s, but must be a real-valued "
+            "floating-point type.",
+            PrimitiveType_Name(type));
+    }
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index a38282e8dbd7c8ac247a9c16f1e756c6e23a1360..47b8f1b44ffa12b2b15be0e865d693a709962e6e 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -90,6 +90,27 @@ XlaOp ScalarLike(XlaOp prototype, T value) {
   });
 }
 
+// Returns an array or scalar containing copies of `value` cast to the same
+// run-type type as `prototype` and broadcast to the same dimensions as
+// `prototype`.
+//
+// If `prototype` is not a scalar or array, returns an error.
+template <typename T>
+XlaOp FullLike(XlaOp prototype, T value) {
+  XlaBuilder* builder = prototype.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(prototype));
+    if (ShapeUtil::IsScalar(shape) || shape.IsArray()) {
+      return Broadcast(ScalarLike(prototype, value), shape.dimensions());
+    } else {
+      return InvalidArgument(
+          "Prototype shape for BroadcastConstantLike must be a scalar or "
+          "array, but was %s",
+          shape.ToString());
+    }
+  });
+}
+
 // Returns a scalar with value '0' of 'type'.
 XlaOp Zero(XlaBuilder* builder, PrimitiveType type);
 
@@ -114,6 +135,9 @@ XlaOp MinValue(XlaBuilder* builder, PrimitiveType type);
 // point type, this is equal to -MaxFiniteValue().
 XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns the minimum positive normal value for floating-point type `type`.
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type);
+
 // Returns the maximum representable finite or infinite value for 'type'.
 // Returns 'inf' for floating-point types.
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
@@ -121,6 +145,9 @@ XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
 // Returns the maximum representable finite value for 'type'.
 XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns a nan for the given type.  Only valid for real-valued fp types.
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_
diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc
index f4320f65c1f76d4d4c384110b39d6606773aaf01..180175b7495b32250af8ae77c8c7fba804703885 100644
--- a/tensorflow/compiler/xla/client/lib/constants_test.cc
+++ b/tensorflow/compiler/xla/client/lib/constants_test.cc
@@ -155,5 +155,12 @@ XLA_TEST_F(ConstantsTest, MaxValueF32) {
                              {});
 }
 
+XLA_TEST_F(ConstantsTest, NanValueF32) {
+  XlaBuilder builder(TestName());
+  NanValue(&builder, F32);
+  ComputeAndCompareR0<float>(&builder, std::numeric_limits<float>::quiet_NaN(),
+                             {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 3d0e3a2b93fe7347597be7f0fb6ee8147948a3e5..20d3c0fc549d9cbb14c8d8e271ff386a06b5ecab 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -13,8 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// This macro is required to make MSVC defines math constants in math.h
+#define _USE_MATH_DEFINES
+#include <math.h>
+
 #include "tensorflow/compiler/xla/client/lib/math.h"
 
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -22,51 +27,112 @@ limitations under the License.
 
 namespace xla {
 
-XlaOp Sqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, 0.5)); }
+// Returns operation(operand), except if `operand` is one of the types in
+// upcast_types, in which case first converts it to F32, and then converts the
+// result down to the original type.
+static XlaOp DoWithUpcastToF32(XlaOp operand,
+                               absl::Span<const PrimitiveType> upcast_types,
+                               const std::function<XlaOp(XlaOp)>& operation) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+    PrimitiveType elem_ty = shape.element_type();
+    bool needs_upcast = absl::c_linear_search(upcast_types, elem_ty);
+
+    if (needs_upcast) {
+      operand = ConvertElementType(operand, F32);
+    }
+    XlaOp result = operation(operand);
+    if (needs_upcast) {
+      result = ConvertElementType(result, elem_ty);
+    }
+    return result;
+  });
+}
 
-XlaOp Rsqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, -0.5)); }
+// TODO(jlebar): Use this function in more places in this file to restrict the
+// domain of other functions.
+static Status EnsureOperandIsRealFp(absl::string_view op_name, XlaOp operand) {
+  auto& b = *operand.builder();
+  TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+  auto elem_ty = shape.element_type();
+  if (!primitive_util::IsFloatingPointType(elem_ty)) {
+    return InvalidArgument(
+        "Operands to %s must be real-valued floating-point, but got %s",
+        op_name, PrimitiveType_Name(elem_ty));
+  }
+  return Status::OK();
+}
 
-XlaOp Square(XlaOp operand) { return operand * operand; }
+XlaOp IsPosInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsPosInf", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+    // Note that this is only correct for floating-point types.  If we wanted it
+    // to be correct for all types, we'd need to Gt(MaxFiniteValue).
+    return Eq(operand, MaxValue(&b, shape.element_type()));
+  });
+}
 
-XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; }
+XlaOp IsNegInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNegInf", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+    // Note that this is only correct for floating-point types.  If we wanted it
+    // to be correct for all types, we'd need to Lt(MinFiniteValue).
+    return Eq(operand, MinValue(&b, shape.element_type()));
+  });
+}
 
-namespace {
+XlaOp IsInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsInf", operand));
+    return IsPosInf(Abs(operand));
+  });
+}
 
-// Polynomials for computing erf/erfc.  Originally from cephes.
-// Note we use float for compatibility across devices, at the cost of some
-// precision for 64 bit computations.
-//
-// Coefficients are in descending order.
-std::array<float, 9> kErfcPCoefficient = {
-    2.46196981473530512524E-10, 5.64189564831068821977E-1,
-    7.46321056442269912687E0,   4.86371970985681366614E1,
-    1.96520832956077098242E2,   5.26445194995477358631E2,
-    9.34528527171957607540E2,   1.02755188689515710272E3,
-    5.57535335369399327526E2};
-std::array<float, 9> kErfcQCoefficient = {
-    1.00000000000000000000E0, 1.32281951154744992508E1,
-    8.67072140885989742329E1, 3.54937778887819891062E2,
-    9.75708501743205489753E2, 1.82390916687909736289E3,
-    2.24633760818710981792E3, 1.65666309194161350182E3,
-    5.57535340817727675546E2};
-std::array<float, 6> kErfcRCoefficient = {
-    5.64189583547755073984E-1, 1.27536670759978104416E0,
-    5.01905042251180477414E0,  6.16021097993053585195E0,
-    7.40974269950448939160E0,  2.97886665372100240670E0};
-std::array<float, 7> kErfcSCoefficient = {
-    1.00000000000000000000E0, 2.26052863220117276590E0,
-    9.39603524938001434673E0, 1.20489539808096656605E1,
-    1.70814450747565897222E1, 9.60896809063285878198E0,
-    3.36907645100081516050E0};
-std::array<float, 5> kErfTCoefficient = {
-    9.60497373987051638749E0, 9.00260197203842689217E1,
-    2.23200534594684319226E3, 7.00332514112805075473E3,
-    5.55923013010394962768E4};
-std::array<float, 6> kErfUCoefficient = {
-    1.00000000000000000000E0, 3.35617141647503099647E1,
-    5.21357949780152679795E2, 4.59432382970980127987E3,
-    2.26290000613890934246E4, 4.92673942608635921086E4};
-}  // namespace
+XlaOp IsNan(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNan", operand));
+    return Ne(operand, operand);
+  });
+}
+
+XlaOp IsNegZero(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNegZero", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+
+    // The bitwise representation of -0 in bfloat16 and IEEE 754 is 0x80...0
+    // (sign bit on, all other bits off).
+    switch (shape.element_type()) {
+      case F64:
+        return Eq(BitcastConvertType(operand, U64),
+                  ConstantR0WithType(&b, U64, uint64{1} << 63));
+      case F32:
+        return Eq(BitcastConvertType(operand, U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      case F16:
+      case BF16:
+        // Not all XLA backends handle U16 well, so we convert to F32/U32.
+        // TODO(jlebar): It would be nice if we could stay in (B)F16/U16 for
+        // backends that *do* support it.
+        return Eq(BitcastConvertType(ConvertElementType(operand, F32), U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      default:
+        LOG(FATAL) << "Expected real fp type.";
+    }
+  });
+}
+
+XlaOp Square(XlaOp operand) { return operand * operand; }
+
+XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; }
 
 // Evaluate the polynomial given coefficients and `x`.
 // N.B. Coefficients should be supplied in decreasing order.
@@ -78,27 +144,97 @@ XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients) {
   return poly;
 }
 
-// Compute an approximation of the error function complement (1 - erf(x)).
-XlaOp Erfc(XlaOp x) {
+// Computes an approximation of the error function complement (1 - erf(x)).
+//
+// Precondition: abs(x) >= 1.  Otherwise, use ErfImpl.
+//
+// This follows Cephes's f32 implementation of erfc, and so it may have errors
+// for double precision.
+//
+// See also these alternate implementations of erf and erfc:
+//
+//   https://stackoverflow.com/questions/35148198
+//   https://stackoverflow.com/questions/35966695
+//
+static XlaOp ErfcImpl(XlaOp x) {
+  // Coefficients for erfc(f32), from Cephes.
+  //
+  // erfc(x) = exp(-x^2) P(1/x), 1 < x < 2
+  static std::array<float, 9> kErfcPCoefficient{
+      +2.326819970068386E-2, -1.387039388740657E-1, +3.687424674597105E-1,
+      -5.824733027278666E-1, +6.210004621745983E-1, -4.944515323274145E-1,
+      +3.404879937665872E-1, -2.741127028184656E-1, +5.638259427386472E-1,
+  };
+  // erfc(x) = exp(-x^2) 1/x P(1/x^2), 2 < x < 14
+  static std::array<float, 8> kErfcRCoefficient{
+      -1.047766399936249E+1, +1.297719955372516E+1, -7.495518717768503E+0,
+      +2.921019019210786E+0, -1.015265279202700E+0, +4.218463358204948E-1,
+      -2.820767439740514E-1, +5.641895067754075E-1,
+  };
+
   XlaOp abs_x = Abs(x);
   XlaOp z = Exp(-x * x);
+  XlaOp q = ScalarLike(x, 1) / abs_x;
+  XlaOp y = q * q;
+  XlaOp p = Select(Lt(abs_x, ScalarLike(x, 2.0)),
+                   EvaluatePolynomial(y, kErfcPCoefficient),
+                   EvaluatePolynomial(y, kErfcRCoefficient));
+  y = z * q * p;
+  return Select(Lt(x, ScalarLike(x, 0)), ScalarLike(x, 2.0) - y, y);
+}
 
-  XlaOp pp = EvaluatePolynomial(abs_x, kErfcPCoefficient);
-  XlaOp pq = EvaluatePolynomial(abs_x, kErfcQCoefficient);
-  XlaOp pr = EvaluatePolynomial(abs_x, kErfcRCoefficient);
-  XlaOp ps = EvaluatePolynomial(abs_x, kErfcSCoefficient);
-
-  XlaOp y = Select(Lt(abs_x, ScalarLike(x, 8.0)), z * pp / pq, z * pr / ps);
+// Compute a polynomial approximation of the error function.
+//
+// Precondition: abs(x) <= 1.  Otherwise, use ErfcImpl.
+//
+// This follows Cephes's f32 implementation of erf, so it may have errors for
+// double precision.
+static XlaOp ErfImpl(XlaOp x) {
+  // Coefficients for by erf(f32), from Cephes.
+  //
+  // erf(x) = x P(x^2), 0 < x < 1
+  static std::array<float, 7> kErfTCoefficient{
+      +7.853861353153693E-5, -8.010193625184903E-4, +5.188327685732524E-3,
+      -2.685381193529856E-2, +1.128358514861418E-1, -3.761262582423300E-1,
+      +1.128379165726710E+0,
+  };
+
+  return x * EvaluatePolynomial(x * x, kErfTCoefficient);
+}
 
-  return Select(Lt(x, ScalarLike(x, 0.0)), ScalarLike(x, 2.0) - y, y);
+XlaOp Erfc(XlaOp x) {
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erfc", x));
+
+    // erfc(x) =
+    //   erfc_impl(x)           if x > 1
+    //   1 - erf_impl(x)        otherwise
+    //
+    // Erf(c)Impl don't have enough precision when run with bf16 intermediates
+    // (not surprising!), so upcast to f32 in this case.
+    return DoWithUpcastToF32(x, {BF16}, [](XlaOp x) {
+      return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl(x),
+                    ScalarLike(x, 1) - ErfImpl(x));
+    });
+  });
 }
 
-// Compute a polynomial approximation of the error function.
 XlaOp Erf(XlaOp x) {
-  XlaOp z = x * x;
-  XlaOp pt = EvaluatePolynomial(z, kErfTCoefficient);
-  XlaOp pu = EvaluatePolynomial(z, kErfUCoefficient);
-  return x * pt / pu;
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erf", x));
+    // erf(x) =
+    //   erf_impl(x)            if x < 1
+    //   1 - erfc_impl(x)       otherwise
+    //
+    // Erf(c)Impl don't have enough precision when run with bf16 intermediates
+    // (not surprising!), so upcast to f32 in this case.
+    return DoWithUpcastToF32(x, {BF16}, [](XlaOp x) {
+      return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl(x),
+                    ScalarLike(x, 1) - ErfcImpl(x));
+    });
+  });
 }
 
 // Approximation for the inverse error function from
@@ -114,36 +250,40 @@ XlaOp Erf(XlaOp x) {
 //   }
 //   return p*x
 XlaOp ErfInv(XlaOp x) {
-  XlaBuilder* b = x.builder();
-  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, b->GetShape(x));
-    constexpr int kDegree = 9;
-    constexpr std::array<float, 9> w_less_than_5_constants = {
-        2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
-        -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
-        -0.00417768164f,  0.246640727f,    1.50140941f};
-    constexpr std::array<float, 9> w_greater_than_5_constants = {
-        -0.000200214257f, 0.000100950558f, 0.00134934322f,
-        -0.00367342844f,  0.00573950773f,  -0.0076224613f,
-        0.00943887047f,   1.00167406f,     2.83297682f};
+  constexpr int kDegree = 9;
+  constexpr std::array<float, 9> w_less_than_5_constants = {
+      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
+      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
+      -0.00417768164f,  0.246640727f,    1.50140941f};
+  constexpr std::array<float, 9> w_greater_than_5_constants = {
+      -0.000200214257f, 0.000100950558f, 0.00134934322f,
+      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
+      0.00943887047f,   1.00167406f,     2.83297682f};
 
-    auto one = ScalarLike(x, 1.0);
-    auto w = -Log((one - x) * (one + x));
-
-    auto lt = Lt(w, ScalarLike(x, 5.0));
-    auto coefficient = [&](int i) {
-      return Select(lt,
-                    Broadcast(ScalarLike(x, w_less_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())),
-                    Broadcast(ScalarLike(x, w_greater_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())));
-    };
-    w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
-    auto p = coefficient(0);
-    for (int i = 1; i < kDegree; ++i) {
-      p = coefficient(i) + p * w;
-    }
-    return p * x;
+  auto one = ScalarLike(x, 1.0);
+  auto w = -Log((one - x) * (one + x));
+
+  auto lt = Lt(w, ScalarLike(x, 5.0));
+  auto coefficient = [&](int i) {
+    return Select(lt, FullLike(x, w_less_than_5_constants[i]),
+                  FullLike(x, w_greater_than_5_constants[i]));
+  };
+  w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
+  auto p = coefficient(0);
+  for (int i = 1; i < kDegree; ++i) {
+    p = coefficient(i) + p * w;
+  }
+
+  // Result modulo edge cases.
+  XlaOp result = p * x;
+
+  // Handle edge cases, namely erfinv(+/-1) = +/-inf.  (The above computation is
+  // indeterminate, and can give nan or -/+inf.)
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, b.GetShape(x));
+    return Select(Eq(Abs(x), ScalarLike(x, 1)),
+                  x * MaxValue(&b, shape.element_type()), result);
   });
 }
 
@@ -171,49 +311,108 @@ static constexpr std::array<double, 8> kLanczosCoefficients = {
 // t(z) = z + kLanczosGamma + 1/2
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 XlaOp Lgamma(XlaOp input) {
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
-
-  XlaOp pi = ScalarLike(input, M_PI);
-  XlaOp log_pi = ScalarLike(input, std::log(M_PI));
-  XlaOp log_sqrt_two_pi = ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
-
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
-
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
-
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // gamma(x) = pi / sin(pi * x) * gamma(1 - x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
-
-  XlaOp x = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    x = x + lanczos_coefficient / (z + index + one);
-  }
+  auto do_it = [](XlaOp input) {
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
+
+    XlaOp pi = ScalarLike(input, M_PI);
+    XlaOp log_pi = ScalarLike(input, std::log(M_PI));
+    XlaOp log_sqrt_two_pi =
+        ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
+
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
+
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp x = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      x = x + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
-
-  // If z = a + 0j, the analytic continuation of log reduces to taking the
-  // absolute value of the real part.
-  // Re(log(z)) = Re(log|z| + arg(z)j)
-  //            = log|a|
-  XlaOp reflection = log_pi - Log(Abs(Sin(pi * input))) - log_y;
-  XlaOp result = Select(need_to_reflect, reflection, log_y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    // Compute the final result (modulo reflection).  t(z) may be large, and we
+    // need to be careful not to overflow to infinity in the first term of
+    //
+    //   (z + 1/2) * log(t(z)) - t(z).
+    //
+    // Therefore we compute this as
+    //
+    //   (z + 1/2 - t(z) / log(t(z))) * log(t(z)).
+    //
+    XlaOp log_y = log_sqrt_two_pi + (z + one_half - t / log_t) * log_t + Log(x);
+
+    // Compute the reflected value, used when x < 0.5:
+    //
+    //   lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))).
+    //
+    // (The abs is because lgamma is the log of the absolute value of the gamma
+    // function.)
+    //
+    // We have to be careful when computing the final term above. gamma(x) goes
+    // to +/-inf at every integer x < 0, and this is controlled by the
+    // sin(pi * x) term.  The slope is large, so precision is particularly
+    // important.
+    //
+    // Because abs(sin(pi * x)) has period 1, we can equivalently use
+    // abs(sin(pi * frac(x))), where frac(x) is the fractional part of x.  This
+    // is more numerically accurate: It doesn't overflow to inf like pi * x can,
+    // and if x is an integer, it evaluates to 0 exactly, which is significant
+    // because we then take the log of this value, and log(0) is inf.
+    //
+    // We don't have a frac(x) primitive in XLA and computing it is tricky, but
+    // because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for
+    // our purposes to use abs(frac(x)) = abs(x) - floor(abs(x)).
+    //
+    // Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close
+    // to 1.  To remedy this, we can use the fact that sin(pi * x) in the domain
+    // [0, 1] is symmetric across the line Y=0.5.
+    //
+    XlaOp abs_input = Abs(input);
+    XlaOp abs_frac_input = abs_input - Floor(abs_input);
+    // Convert values of abs_frac_input > 0.5 to (1 - frac_input) to improve
+    // precision of pi * abs_frac_input for values of abs_frac_input close to 1.
+    XlaOp reduced_frac_input =
+        Select(Gt(abs_frac_input, ScalarLike(abs_frac_input, 0.5)),
+               ScalarLike(abs_frac_input, 1) - abs_frac_input, abs_frac_input);
+    XlaOp reflection_denom = Log(Sin(pi * reduced_frac_input));
+
+    // Avoid computing -inf - inf, which is nan.  If reflection_denom is +/-inf,
+    // then it "wins" and the result is +/-inf.
+    XlaOp reflection =
+        Select(IsFinite(reflection_denom), log_pi - reflection_denom - log_y,
+               -reflection_denom);
+    XlaOp result = Select(need_to_reflect, reflection, log_y);
+
+    // lgamma(+/-inf) = +inf.
+    XlaOp inf_bcast = FullLike(input, std::numeric_limits<float>::infinity());
+    return Select(IsInf(input), inf_bcast, result);
+  };
+
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Lgamma", input));
+    // F16 and BF16 don't provide sufficient precision for intermediate results
+    // here (although it's better than you might expect!), so do the
+    // computations in F32.
+    return DoWithUpcastToF32(input, {BF16, F16}, do_it);
+  });
 }
 
 // Compute the Digamma function using Lanczos' approximation from "A Precision
@@ -224,69 +423,101 @@ XlaOp Lgamma(XlaOp input) {
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 // A'(z) = sigma(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
 XlaOp Digamma(XlaOp input) {
-  XlaOp zero = ScalarLike(input, 0);
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
-
-  XlaOp pi = ScalarLike(input, M_PI);
-
-  XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
-
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
-
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
-
-  XlaOp num = zero;
-  XlaOp denom = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
-    denom = denom + lanczos_coefficient / (z + index + one);
-  }
+  auto do_it = [](XlaOp input) {
+    XlaOp zero = ScalarLike(input, 0);
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
+
+    XlaOp pi = ScalarLike(input, M_PI);
+
+    XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
+
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp num = zero;
+    XlaOp denom = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
+      denom = denom + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp y = log_t + num / denom - lanczos_gamma / t;
-  XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input);
-  XlaOp result = Select(need_to_reflect, reflection, y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    XlaOp y = log_t + num / denom - lanczos_gamma / t;
+
+    // We need to be careful how we compute cot(pi * input) below: For
+    // near-integral values of `input`, pi * input can lose precision.
+    //
+    // Input is already known to be less than 0.5 (otherwise we don't have to
+    // reflect).  We shift values smaller than -0.5 into the range [-.5, .5] to
+    // increase precision of pi * input and the resulting cotangent.
+    XlaOp reduced_input = input + Abs(Floor(input + ScalarLike(input, 0.5)));
+    XlaOp reflection =
+        y - pi * Cos(pi * reduced_input) / Sin(pi * reduced_input);
+    XlaOp real_result = Select(need_to_reflect, reflection, y);
+
+    // Digamma has poles at negative integers and zero; return nan for those.
+    return Select(And(Le(input, zero), Eq(input, Floor(input))),
+                  FullLike(input, std::numeric_limits<float>::quiet_NaN()),
+                  real_result);
+  };
+
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Digamma", input));
+    return DoWithUpcastToF32(input, {BF16, F16}, do_it);
+  });
 }
 
 // Implements Banker's rounding: numbers that are equidistant between two
 // integers are rounded towards even.
 XlaOp RoundToEven(XlaOp x) {
-  auto half = ScalarLike(x, 0.5);
-  auto one = ScalarLike(x, 1.0);
-  auto two = ScalarLike(x, 2.0);
-
-  auto round_val = Floor(x);
-  auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * Floor(half * x);
-  auto is_odd = Eq(nearest_even_int, one);
-  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
-                round_val + one, round_val);
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs (What does it even mean to round a complex
+    // number?  Do you round each component equally?  In that case, you should
+    // just ask for that explicitly.)
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("RoundToEven", x));
+
+    auto half = ScalarLike(x, 0.5);
+    auto one = ScalarLike(x, 1.0);
+    auto two = ScalarLike(x, 2.0);
+
+    auto round_val = Floor(x);
+    auto fraction = x - round_val;
+    auto nearest_even_int = round_val - two * Floor(half * x);
+    auto is_odd = Eq(nearest_even_int, one);
+    return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                  round_val + one, round_val);
+  });
 }
 
 // Trigonometric functions.
 
-// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1
+//           pi                                if x == -1
 XlaOp Acos(XlaOp x) {
-  return ScalarLike(x, 2.0) *
-         Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), ScalarLike(x, 1.0) + x);
+  return Select(Ne(x, FullLike(x, -1)),
+                ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x),
+                                           ScalarLike(x, 1.0) + x),
+                FullLike(x, M_PI));
 }
 
 // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
@@ -330,4 +561,82 @@ XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
   });
 }
 
+XlaOp NextAfter(XlaOp from, XlaOp to) {
+  auto builder = from.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(from));
+    int bitwidth = primitive_util::BitWidth(shape.element_type());
+    auto int_type = primitive_util::UnsignedIntegralTypeForBitWidth(bitwidth);
+    auto from_as_int = BitcastConvertType(from, int_type);
+    auto to_as_int = BitcastConvertType(to, int_type);
+
+    // The result is NaN if either "from" or "to" are NaN.
+    auto from_is_nan = Ne(from, from);
+    auto to_is_nan = Ne(to, to);
+    auto nan_input = Or(from_is_nan, to_is_nan);
+    auto result_for_nan =
+        Broadcast(ScalarLike(from, std::numeric_limits<double>::quiet_NaN()),
+                  shape.dimensions());
+    result_for_nan = BitcastConvertType(result_for_nan, int_type);
+
+    // The sign bit is the MSB.
+    const int64 sign_mask = int64{1} << (bitwidth - 1);
+    // Discard the sign bit to make the result non-negative.
+    auto from_abs = And(from_as_int, ScalarLike(from_as_int, ~sign_mask));
+    auto to_abs = And(to_as_int, ScalarLike(to_as_int, ~sign_mask));
+
+    // When both "from" and "to" are equal, the result is "to".
+    // N.B. It would not make a difference if we chose the result to be "from".
+    auto from_and_to_are_equal = Eq(from_as_int, to_as_int);
+    auto result_for_equal = to_as_int;
+
+    // When both "from" and "to" are both 0, the result is "to". This ensures we
+    // get a zero signed like "to".
+    auto from_is_zero = Eq(from_abs, ZerosLike(from_abs));
+    auto to_is_zero = Eq(to_abs, ZerosLike(to_abs));
+    auto result_for_both_zero = to_as_int;
+
+    auto from_sign = And(from_as_int, ScalarLike(from_as_int, sign_mask));
+    auto to_sign = And(to_as_int, ScalarLike(to_as_int, sign_mask));
+
+    // If from == 0 && to != 0, we need to return the smallest subnormal number
+    // signed like "to".
+    auto result_for_from_zero_to_non_zero =
+        Or(to_sign, ScalarLike(from_as_int, 1));
+
+    // If the sign of "from" and "to" disagree:
+    // - we need to make the magnitude of "from" smaller so that it is closer to
+    //   zero.
+    //
+    // Otherwise the signs agree:
+    // - "from" with a magnitude larger than "to" means we need to make the
+    //   magnitude smaller.
+    // - "from" with a magnitude smaller than "to" means we need to make the
+    //   magnitude larger.
+    // - "from" with the same magnitude and sign as "to" has already been
+    //   handled.
+    auto signs_disagree = Ne(from_sign, to_sign);
+    auto from_magnitude_larger_than_to = Gt(from_abs, to_abs);
+    auto result_has_smaller_magnitude =
+        Or(from_magnitude_larger_than_to, signs_disagree);
+    auto magnitude_adjustment =
+        Select(result_has_smaller_magnitude,
+               Broadcast(ScalarLike(from_as_int, -1), shape.dimensions()),
+               Broadcast(ScalarLike(from_as_int, 1), shape.dimensions()));
+    auto result = Add(from_as_int, magnitude_adjustment);
+    // Handle from == ±0.
+    result = Select(from_is_zero,
+                    Select(to_is_zero, result_for_both_zero,
+                           result_for_from_zero_to_non_zero),
+                    result);
+    // Handle from == to.
+    result = Select(from_and_to_are_equal, result_for_equal, result);
+    // Handle isnan(from) || isnan(to).
+    result = Select(nan_input, result_for_nan, result);
+
+    // Cast back to the original type.
+    return BitcastConvertType(result, shape.element_type());
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 17612bf9fdc0f1eabb338671c93c025c5b268872..71a3acedcec0a8e65561d4139baeaf532ec8bf46 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -20,11 +20,22 @@ limitations under the License.
 
 namespace xla {
 
-// Computes the square root of 'operand'.
-XlaOp Sqrt(XlaOp operand);
-
-// Computes the reciprocal of the square root of 'operand'.
-XlaOp Rsqrt(XlaOp operand);
+// Determines whether operand is +/-inf or nan.
+//
+// Raises an error if called on integral or complex values.
+XlaOp IsPosInf(XlaOp operand);
+XlaOp IsNegInf(XlaOp operand);
+XlaOp IsInf(XlaOp operand);
+XlaOp IsNan(XlaOp operand);
+
+// Determines whether operand is equal to -0.
+//
+// Raises an error for integral or complex values.
+XlaOp IsNegZero(XlaOp operand);
+
+// Returns the next number after 'from' in the direction of 'to' the same way
+// std::nextafter(from, to) would.
+XlaOp NextAfter(XlaOp from, XlaOp to);
 
 // Computes the square of 'operand'.
 XlaOp Square(XlaOp operand);
@@ -32,7 +43,7 @@ XlaOp Square(XlaOp operand);
 // Computes the reciprocal of 'operand'.
 XlaOp Reciprocal(XlaOp operand);
 
-// Evaluates a polynomial given coefficients and `x`.
+// Evaluates a polynomial given coefficients and 'x'.
 // N.B. Coefficients should be supplied in decreasing order.
 XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients);
 
@@ -86,7 +97,7 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
-// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// Applies a complex conjugation operation if 'a' is complex and 'conjugate'
 // is true, otherwise returns its argument.
 xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
 
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index ae2ea225d1aadd7b3a794eabeca866c498f34760..50613ce50255b8e211f6e64afbe0add290dfc647 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -14,8 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -30,6 +32,185 @@ class MathTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
+// Write TYPED_TESTs within the class definition so that we don't have to litter
+// "this->" everywhere.
+template <typename T>
+class MathTypedTest : public MathTest {
+ public:
+  void TestLogEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}}), &b));
+    ComputeAndCompareR1<T>(&b,
+                           {-std::numeric_limits<T>::infinity(),
+                            -std::numeric_limits<T>::infinity()},
+                           {}, error_spec_);
+  }
+
+  void TestLog1pEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log1p(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}, T{-1.0}}), &b));
+    ComputeAndCompareR1<T>(
+        &b, {T{0.0}, T{-0.0}, -std::numeric_limits<T>::infinity()}, {},
+        error_spec_);
+  }
+
+  void TestIsInfOrNan() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    auto x =
+        ConstantR1<T>(&b, {
+                              T{0},
+                              T{100},
+                              T{-1000},
+                              T{std::numeric_limits<T>::max()},
+                              T{std::numeric_limits<T>::lowest()},
+                              T{std::numeric_limits<float>::infinity()},
+                              T{-std::numeric_limits<float>::infinity()},
+                              T{std::numeric_limits<float>::quiet_NaN()},
+                              T{std::numeric_limits<float>::signaling_NaN()},
+                          });
+    Tuple(&b, {IsFinite(x), IsInf(x), IsPosInf(x), IsNegInf(x), IsNan(x)});
+
+    auto expected = LiteralUtil::MakeTupleOwned(
+        LiteralUtil::CreateR1<bool>(
+            {true, true, true, true, true, false, false, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, true, true, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, true, false, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, false, true, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, false, false, true, true}));
+    ComputeAndCompareLiteral(&b, expected, {});
+  }
+
+  void TestIsNegZero() {
+    SetFastMathDisabled(true);
+    XlaBuilder b(TestName());
+    T inf(std::numeric_limits<float>::infinity());
+    T nan(std::numeric_limits<float>::quiet_NaN());
+    IsNegZero(AddParam(
+        LiteralUtil::CreateR1<T>({T{-0.0}, T{0}, T{1}, T{-1}, inf, -inf, nan}),
+        &b));
+
+    ComputeAndCompareLiteral(
+        &b,
+        LiteralUtil::CreateR1<bool>(
+            {true, false, false, false, false, false, false}),
+        {}, error_spec_);
+  }
+
+  // sqrt(x) == pow(x, 0.5) except that
+  //
+  //   pow(-inf, 0.5) == inf, while
+  //   sqrt(-inf)     == nan.
+  //
+  // Check that none of our backends are incorrectly assuming that sqrt(x) ==
+  // pow(x, 0.5) without checking this edge case.
+  //
+  // For good measure, we also check pow with an exponent other than 0.5.
+  void TestSqrtPowInequivalence() {
+    SetFastMathDisabled(true);
+
+    // Tests disable constant folding by default, but this test needs it
+    // enabled, otherwise we don't tickle the bug we're trying to catch.
+    // Specifically, without constant folding, the constants we pass to Pow
+    // below are hidden behind a reshape that's never folded away!
+    mutable_debug_options()->clear_xla_disable_hlo_passes();
+
+    const T inf(std::numeric_limits<float>::infinity());
+    const T nan(std::numeric_limits<float>::quiet_NaN());
+
+    XlaBuilder b(TestName());
+    auto x = AddParam(LiteralUtil::CreateR1<T>({-inf}), &b);
+    ConcatInDim(
+        &b, {Sqrt(x), Pow(x, ScalarLike(x, 0.5)), Pow(x, ScalarLike(x, 0.3))},
+        0);
+    std::vector<T> expected = {nan, inf, inf};
+    ComputeAndCompareR1<T>(&b, expected, {}, error_spec_);
+  }
+
+  void TestErfEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    auto x = AddParam(LiteralUtil::CreateR1<T>({T{-1}, T{1}, T{0}}), &b);
+    ErfInv(x);
+
+    const T inf(std::numeric_limits<float>::infinity());
+    std::vector<T> expected = {-inf, inf, T{0}};
+
+    ComputeAndCompareR1<T>(&b, expected, {}, error_spec_);
+  }
+};
+
+// TODO(b/123355973): Add bfloat16 to TestTypes once it's working.
+#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+using TestTypes = ::testing::Types<float>;
+#else
+using TestTypes = ::testing::Types<float, Eigen::half>;
+#endif
+
+TYPED_TEST_CASE(MathTypedTest, TestTypes);
+
+XLA_TYPED_TEST(MathTypedTest, LogEdgeCases) { this->TestLogEdgeCases(); }
+XLA_TYPED_TEST(MathTypedTest, Log1pEdgeCases) { this->TestLog1pEdgeCases(); }
+XLA_TYPED_TEST(MathTypedTest, IsInfOrNan) { this->TestIsInfOrNan(); }
+XLA_TYPED_TEST(MathTypedTest, IsNegZero) { this->TestIsNegZero(); }
+XLA_TYPED_TEST(MathTypedTest, SqrtPowInequivalence) {
+  this->TestSqrtPowInequivalence();
+}
+XLA_TYPED_TEST(MathTypedTest, ErfInvEdgeCases) { this->TestErfEdgeCases(); }
+
+// Check that certain ops only support real, floating-point inputs.
+//
+// TODO(jlebar): Expand this test to cover more ops.
+XLA_TEST_F(MathTest, RealFpOnlyOps) {
+  for (int64 i = PrimitiveType_MIN; i <= PrimitiveType_MAX; ++i) {
+    auto ty = static_cast<PrimitiveType>(i);
+    SCOPED_TRACE(PrimitiveType_Name(ty));
+    Shape shape;
+    if (primitive_util::IsArrayType(ty)) {
+      shape = ShapeUtil::MakeShape(ty, {42});
+    } else if (ty == PrimitiveType::TUPLE) {
+      shape = ShapeUtil::MakeTupleShape({});
+    } else if (ty == PrimitiveType::OPAQUE) {
+      shape = ShapeUtil::MakeOpaqueShape();
+    } else if (ty == PrimitiveType::TOKEN) {
+      shape = ShapeUtil::MakeTokenShape();
+    } else {
+      continue;
+    }
+
+    for (const auto& test :
+         std::vector<std::pair<std::function<XlaOp(XlaOp)>, string>>({
+             {IsFinite, "is_finite"},
+             {IsInf, "is_inf"},
+             {IsPosInf, "is_pos_inf"},
+             {IsNegInf, "is_neg_inf"},
+             {IsNan, "is_nan"},
+             {Erf, "erf"},
+             {Erfc, "erfc"},
+             {Lgamma, "lgamma"},
+             {Digamma, "digamma"},
+             {RoundToEven, "round_to_even"},
+         })) {
+      SCOPED_TRACE(test.second);
+      XlaBuilder b(TestName());
+      XlaOp p = Parameter(&b, 0, shape, "p0");
+      test.first(p);
+
+      EXPECT_EQ(b.first_error().ok(), primitive_util::IsFloatingPointType(ty));
+    }
+  }
+}
+
 XLA_TEST_F(MathTest, SqrtF32) {
   XlaBuilder builder(TestName());
   Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F32);
@@ -106,6 +287,29 @@ XLA_TEST_F(MathTest, Lgamma) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16)
+XLA_TEST_F(MathTest, LgammaF16) {
+  SetFastMathDisabled(true);
+
+  XlaBuilder b(TestName());
+
+  // These seemingly arbitrary inputs came from debugging the lgamma
+  // implementation against a test which tried all possible f16 values.
+  auto x = ConstantR1<half>(&b, {
+                                    half(-7360.0),
+                                    half(-4066.0),
+                                    half(-5.9605e-08),
+                                });
+  Lgamma(x);
+  std::vector<half> expected = {
+      std::numeric_limits<half>::infinity(),
+      std::numeric_limits<half>::infinity(),
+      half(16.64),
+  };
+  ComputeAndCompareR1<half>(&b, expected, {}, ErrorSpec{0.1});
+}
+#endif
+
 XLA_TEST_F(MathTest, Digamma) {
   XlaBuilder builder(TestName());
   auto x = ConstantR1<float>(&builder, {1.0, 0.5, 1 / 3.0, 0.25, 1 / 6.0, 0.125,
@@ -148,5 +352,40 @@ XLA_TEST_F(MathTest, RoundToEven) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, ErfRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erf(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, ErfcRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erfc(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, LgammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Lgamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, DigammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Digamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, RoundToEvenRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  RoundToEven(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
index dcec2139e47fc86d81a8877b4dccc43eb2b7207f..a055a8e625c680cf5232896c95cd35b78cb172bc 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -15,17 +15,26 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include <array>
 #include <numeric>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
@@ -37,7 +46,7 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
   return ConvertElementType(indicator, type);
 }
 
-XlaOp GetMatrixDiagonal(XlaOp x) {
+XlaOp GetMatrixDiagonal(XlaOp x, int k) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
@@ -45,10 +54,13 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
+
+    auto offset = ConstantR0WithType(builder, S32, k);
+
     absl::Span<const int64> major_dims =
         AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
+    auto a = Iota(builder, S32, n);
+    auto b = Iota(builder, S32, m) + offset;
     auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
     auto mask = Broadcast(indicator, major_dims);
 
@@ -58,9 +70,21 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
         primitive_util::IsIntegralType(shape.element_type())
             ? CreateScalarOrComputation(shape.element_type(), builder)
             : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    // k == 0, we can save one slice op.
+    if (k == 0) {
+      return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                    reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    } else if (k > 0) {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 2});
+      return SliceInMinorDims(result, {std::min<int64>(k, n)},
+                              {std::min(m + k, n)});
+    } else {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 1});
+      return SliceInMinorDims(result, {std::min<int64>(-k, m)},
+                              {std::min(m, n - k)});
+    }
   });
 }
 
@@ -91,77 +115,224 @@ XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
 
 XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
 
-XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config) {
+  for (auto dim : output_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(y_config, dim)) {
+      if (absl::c_count(output_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated output dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has output dimension without corresponding input dimension.");
+  }
+  for (auto dim : x_config) {
+    if (absl::c_linear_search(y_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(x_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated lhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has lhs dimension without corresponding rhs or output "
+        "dimension.");
+  }
+  for (auto dim : y_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(y_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated rhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has rhs dimension without corresponding lhs or output "
+        "dimension.");
+  }
+  return Status::OK();
+}
+
+xla::XlaOp Einsum(xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+                  absl::Span<const int64> y_config,
+                  absl::Span<const int64> output_config,
+                  xla::PrecisionConfig::Precision precision) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+    TF_RETURN_IF_ERROR(
+        ValidateEinsumNumericDimensions(x_config, y_config, output_config));
+    const int64 x_rank = x_config.size();
+    const int64 y_rank = y_config.size();
+    const int64 output_rank = output_config.size();
+    absl::flat_hash_set<int64> x_map;
+    absl::flat_hash_set<int64> y_map;
+    absl::flat_hash_set<int64> output_map;
+
+    auto find = [&](const absl::flat_hash_set<int64>& map, int64 d) {
+      return map.count(d) != 0;
+    };
 
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (x_shape.rank() != y_shape.rank()) {
-      return InvalidArgument(
-          "Arguments to BatchDot have different ranks: %s vs. %s",
-          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    auto insert = [&](absl::flat_hash_set<int64>& map, char d) {
+      CHECK(!find(map, d));
+      map.insert(d);
+    };
+
+    for (auto d : x_config) {
+      insert(x_map, d);
     }
-    const int ndims = x_shape.rank();
-    if (ndims < 2) {
-      return InvalidArgument(
-          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+
+    for (auto d : y_config) {
+      insert(y_map, d);
     }
 
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return InvalidArgument(
-            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
-            i, ShapeUtil::HumanString(x_shape),
-            ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
+    for (auto d : output_config) {
+      insert(output_map, d);
     }
 
-    int x_inner_dim = ndims - 1;
-    int y_inner_dim = ndims - 2;
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return InvalidArgument(
-          "Dimensions %d and %d of arguments to BatchDot must be equal: "
-          "shapes %s vs %s",
-          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
-          ShapeUtil::HumanString(y_shape));
+    DotDimensionNumbers dnums;
+    std::vector<int64> lhs_outer_dims;
+    auto is_batch_dim = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d) && find(output_map, d);
+    };
+    auto is_contracting = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d);
+    };
+    auto rhs_dimension_number = [&](int64 d) {
+      return absl::c_find(y_config, d) - y_config.begin();
+    };
+    for (int64 i = 0; i < x_rank; ++i) {
+      auto dim_name = x_config[i];
+      if (is_batch_dim(dim_name)) {
+        dnums.add_lhs_batch_dimensions(i);
+        dnums.add_rhs_batch_dimensions(rhs_dimension_number(dim_name));
+      } else if (is_contracting(dim_name)) {
+        dnums.add_lhs_contracting_dimensions(i);
+        dnums.add_rhs_contracting_dimensions(rhs_dimension_number(dim_name));
+      } else {
+        lhs_outer_dims.push_back(i);
+      }
     }
 
-    // Check for zero lhs/rhs dim size.
-    if (ShapeUtil::IsZeroElementArray(x_shape) ||
-        ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+    std::vector<int64> rhs_outer_dims;
+    for (int64 i = 0; i < y_rank; ++i) {
+      auto dim_name = y_config[i];
+      if (!is_batch_dim(dim_name) && !is_contracting(dim_name)) {
+        rhs_outer_dims.push_back(i);
       }
-      int x_outer_dim = ndims - 2;
-      int y_outer_dim = ndims - 1;
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return Broadcast(
-          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
+    }
+
+    auto output_dimension_number = [&](char d) {
+      return absl::c_find(output_config, d) - output_config.begin();
+    };
+
+    std::vector<int64> output_dims;
+    output_dims.reserve(output_rank);
+    for (auto d : dnums.lhs_batch_dimensions()) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : lhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : rhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(y_config[d]));
+    }
+
+    std::vector<int64> transpose_dims(output_rank);
+    for (int64 i = 0; i < output_rank; ++i) {
+      transpose_dims[output_dims[i]] = i;
     }
 
     PrecisionConfig precision_proto;
     precision_proto.add_operand_precision(precision);
     precision_proto.add_operand_precision(precision);
+    return Transpose(DotGeneral(x, y, dnums, &precision_proto), transpose_dims);
+  });
+}
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    const int ndims = x_shape.rank();
+    batch_dimension_numbers.reserve(ndims - 2);
+    for (int i = 0; i < ndims - 2; ++i) {
+      batch_dimension_numbers.push_back(i);
+    }
+    std::vector<int64> x_config = batch_dimension_numbers;
+    x_config.push_back(ndims - 2);
+    x_config.push_back(ndims);
+    std::vector<int64> y_config = batch_dimension_numbers;
+    y_config.push_back(ndims);
+    y_config.push_back(ndims - 1);
+    std::vector<int64> output_config = batch_dimension_numbers;
+    output_config.push_back(ndims - 2);
+    output_config.push_back(ndims - 1);
+    return Einsum(x, x_config, y, y_config, output_config, precision);
+  });
+}
+
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config) {
+  std::array<std::vector<int64>, 3> einsum_config_numeric;
+  std::vector<absl::string_view> main_split =
+      absl::StrSplit(einsum_config, ',');
 
-    DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+  if (main_split.size() != 2) {
+    return InvalidArgument("Expected one \",\" in einsum_config.");
+  }
+
+  auto maybe_invalid_character = [](char d) {
+    if (absl::ascii_isalpha(d)) {
+      return Status::OK();
+    }
+    if (d == '.') {
+      return InvalidArgument("Unsupported \"...\" or \".\" in einsum config.");
     }
+    return InvalidArgument("Unexpected character in einsum config.");
+  };
+
+  auto& x_config = einsum_config_numeric[0];
+  x_config.reserve(main_split[0].size());
+  for (auto d : main_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    x_config.push_back(static_cast<int64>(d));
+  }
+  std::vector<absl::string_view> y_output_split =
+      absl::StrSplit(main_split[1], "->");
+  if (y_output_split.size() != 2) {
+    return InvalidArgument("Expected one \"->\" in einsum_config.");
+  }
+  auto& y_config = einsum_config_numeric[1];
+  y_config.reserve(y_output_split[0].size());
+  for (auto d : y_output_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    y_config.push_back(static_cast<int64>(d));
+  }
+  auto& output_config = einsum_config_numeric[2];
+  output_config.reserve(y_output_split[1].size());
+  for (auto d : y_output_split[1]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    output_config.push_back(static_cast<int64>(d));
+  }
+  return einsum_config_numeric;
+}
 
-    return DotGeneral(x, y, dot_dnums, &precision_proto);
+XlaOp Einsum(XlaOp x, XlaOp y, absl::string_view einsum_config,
+             PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto einsum_config_numeric,
+                        ParseEinsumString(einsum_config));
+    return Einsum(x, einsum_config_numeric[0], y, einsum_config_numeric[1],
+                  einsum_config_numeric[2], precision);
   });
 }
 
@@ -181,4 +352,5 @@ XlaOp TransposeInMinorDims(XlaOp x) {
 XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
   return transpose ? TransposeInMinorDims(x) : x;
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
index 916cd83748e7028c474065b86bf02d85166d2c9c..60c41ec45a086726086dac7227fc432a9c62d0c8 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
+#include <array>
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -26,10 +30,15 @@ namespace xla {
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
 
-// Get the diagonals of the last two dimensions. If 'x' has shape
-// [..., M, N], then the output has shape [..., min(M, N)], containing the
-// diagonal elements (i.e., with indices [..., i, i]).
-XlaOp GetMatrixDiagonal(XlaOp x);
+// Get the diagonals of the last two dimensions. Use k>0 for diagonals above the
+// main diagonal, and k<0 for diagonals below the main diagonal.
+//
+// If 'x' has shape [..., M, N]
+//  If k >= 0: then the output has shape [..., min(M, N - k)], containing the
+//            diagonal elements (i.e., with indices [..., i, i + k]).
+//  If k < 0: then the output has shape [..., min(M + k, N)], containing the
+//            diagonal elements (i.e., with indices [..., i - k, i]).
+XlaOp GetMatrixDiagonal(XlaOp x, int k = 0);
 
 // Returns a lower-triangular mask, i.e., true below the `diagonal`-th diagonal
 // and false above that diagonal.
@@ -65,6 +74,40 @@ xla::XlaOp BatchDot(
     xla::XlaOp x, xla::XlaOp y,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
 
+// Parse an einsum string into dimension numbers:
+//   "ab,cb->ac"
+// becomes:
+//   {{0, 1},{2, 1},{0, 2}}
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config);
+
+// Determine if each dimension label is in at least two inputs.
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config);
+
+// Supports two operand einsum notation like "ab,cb->ac".
+xla::XlaOp Einsum(
+    xla::XlaOp x, xla::XlaOp y, absl::string_view einsum_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Same as above but supporting numeric labels on dimensins. So "ab,cb->ac"
+// becomes:
+//   x_config = {0, 1}
+//   y_config = {2, 1}
+//   output_config = {0, 2}
+xla::XlaOp Einsum(
+    xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+    absl::Span<const int64> y_config, absl::Span<const int64> output_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
 // Transposes a stack of matrices `x` by swapping the last two dimensions.
 xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
 
diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
index 0593a7517ac125ca8dc5395cee76f6bc23232cd3..a93fc2ccb92912a10b9b6c2192b81cd73566f2a0 100644
--- a/tensorflow/compiler/xla/client/lib/matrix_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -15,13 +15,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
@@ -51,13 +53,24 @@ void MatrixTest::TestMatrixDiagonal() {
   XlaBuilder builder("GetMatrixDiagonal");
   Array3D<T> input(2, 3, 4);
   input.FillIota(0);
-
-  XlaOp a;
-  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
-  GetMatrixDiagonal(a);
-  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
-
-  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
+  std::map<int, Array2D<T>> k_and_expected = {
+      {0, {{0, 5, 10}, {12, 17, 22}}},
+      {1, {{1, 6, 11}, {13, 18, 23}}},
+      {2, {{2, 7}, {14, 19}}},
+      {3, {{3}, {15}}},
+      {4, {{}, {}}},
+      {-1, {{4, 9}, {16, 21}}},
+      {-2, {{8}, {20}}},
+      {-3, {{}, {}}},
+      {-4, {{}, {}}},
+  };
+  for (const auto& kv : k_and_expected) {
+    XlaOp a;
+    auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
+    GetMatrixDiagonal(a, kv.first);
+
+    ComputeAndCompareR2<T>(&builder, kv.second, {a_data.get()});
+  }
 }
 
 XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
@@ -101,5 +114,78 @@ XLA_TEST_F(MatrixTest, RowBatchDot) {
   ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
                              {a_data.get(), row_data.get(), index_data.get()});
 }
+
+XLA_TEST_F(MatrixTest, Einsum) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
+
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  Einsum(l_index, row, "abc,adc->abd");
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(MatrixTest, ParseEinsumString) {
+  auto to_vec = [](absl::string_view s) {
+    std::vector<int64> v;
+    v.reserve(s.size());
+    for (auto c : s) {
+      v.push_back(int64{c});
+    }
+    return v;
+  };
+
+  auto to_string = [&](absl::string_view x, absl::string_view y,
+                       absl::string_view o) {
+    return absl::StrCat(x, ",", y, "->", o);
+  };
+
+  std::vector<std::vector<string>> good_test_cases = {{"ab", "bc", "ac"},
+                                                      {"Bab", "Bbc", "Bac"},
+                                                      {"ab", "cd", "dcba"},
+                                                      {"abc", "abd", "cbd"}};
+  for (auto test_case : good_test_cases) {
+    auto parse_result_or_status =
+        ParseEinsumString(to_string(test_case[0], test_case[1], test_case[2]));
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    for (int i = 0; i < 3; ++i) {
+      EXPECT_EQ(parse_result[i], to_vec(test_case[i]));
+    }
+    EXPECT_TRUE(ValidateEinsumNumericDimensions(
+                    parse_result[0], parse_result[1], parse_result[2])
+                    .ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_parsing = {
+      "", "a", "ab->ba", "ab,bc,cd->ad", "a...b,bc->a...c"};
+  for (auto test_case : einsum_strings_that_fail_parsing) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_FALSE(parse_result_or_status.status().ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_numeric_validation = {
+      "a,b->c", "ab,bc->acd", "abz,bc->ac", "ab,bcz->ac"};
+  for (auto test_case : einsum_strings_that_fail_numeric_validation) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    EXPECT_FALSE(ValidateEinsumNumericDimensions(
+                     parse_result[0], parse_result[1], parse_result[2])
+                     .ok());
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index 85b9e1827dcef5ed907d893277deb5a52f8f30e9..63b3b07ddc2a64aad4c3b14853958f2bcfa08b59 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/xla/client/lib/prng.h"
+
 #include <cmath>
 
 #include "absl/base/casts.h"
@@ -30,11 +32,8 @@ XlaOp RotateLeftU32(XlaOp v, int distance) {
          ShiftRightLogical(v, ConstantR0<uint32>(v.builder(), 32 - distance));
 }
 
-using ThreeFry2x32State = std::array<XlaOp, 2>;
+}  // namespace
 
-// Implements the ThreeFry counter-based PRNG algorithm.
-// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
-// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
 ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) {
   XlaBuilder* builder = input[0].builder();
   key[0] = BitcastConvertType(key[0], U32);
@@ -127,15 +126,28 @@ XlaOp StatelessRngUniformU32(std::array<XlaOp, 2> key, const Shape& shape) {
   return Reshape(result, AsInt64Slice(shape.dimensions()));
 }
 
+ThreeFry2x32State Uint64ToUint32s(XlaOp u64) {
+  auto builder = u64.builder();
+  auto const32 = ConstantR0WithType(builder, U64, 32);
+  auto fst = ConvertElementType(u64, U32);
+  auto snd = ConvertElementType(ShiftRightLogical(u64, const32), U32);
+  return {fst, snd};
+}
+
+XlaOp Uint32sToUint64(ThreeFry2x32State u32s) {
+  auto builder = u32s[0].builder();
+  return ConvertElementType(u32s[0], U64) |
+         ShiftLeft(ConvertElementType(u32s[1], U64),
+                   ConstantR0WithType(builder, U64, 32));
+}
+
 XlaOp StatelessRngUniformU64(std::array<XlaOp, 2> key, const Shape& shape) {
   XlaBuilder* builder = key[0].builder();
   const int64 size = ShapeUtil::ElementsIn(shape);
   ThreeFry2x32State inputs = GetInputs(size, builder);
   ThreeFry2x32State outputs = ThreeFry2x32(inputs, key);
   // low 32 bit: outputs[0], high 32 bit: outputs[1]
-  auto result = ConvertElementType(outputs[0], U64) |
-                ShiftLeft(ConvertElementType(outputs[1], U64),
-                          ConstantR0WithType(builder, U64, 32));
+  auto result = Uint32sToUint64(outputs);
   return Reshape(result, AsInt64Slice(shape.dimensions()));
 }
 
@@ -161,10 +173,6 @@ XlaOp StatelessRngUniformF32(XlaOp bits, XlaOp minval, XlaOp maxval) {
 XlaOp StatelessRngUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval,
                              PrimitiveType type, PrimitiveType unsigned_type) {
   XlaBuilder* builder = bits.builder();
-  // TODO(b/72573764): Generate real uniform integer distribution.
-  // The following algorithm is the same one that TF uses right now, but it's
-  // uniform only when maxval - minval is a divisor of the range that bits is
-  // generated from.
   auto range = BitcastConvertType(maxval, unsigned_type) -
                BitcastConvertType(minval, unsigned_type);
   auto dist = Rem(bits, range);
@@ -175,8 +183,6 @@ XlaOp StatelessRngUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval,
          BitcastConvertType(dist - dist_div_2, type);
 }
 
-}  // namespace
-
 XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
                           XlaOp minval, XlaOp maxval) {
   XlaBuilder* builder = seeds[0].builder();
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h
index 2603818de26888566a533334e49b039b126db66e..7b0b4c2439e538f004c8b9d6e5eb2553e485ee72 100644
--- a/tensorflow/compiler/xla/client/lib/prng.h
+++ b/tensorflow/compiler/xla/client/lib/prng.h
@@ -23,12 +23,38 @@ limitations under the License.
 
 namespace xla {
 
+// Implements the ThreeFry counter-based PRNG algorithm.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+using ThreeFry2x32State = std::array<XlaOp, 2>;
+ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key);
+
 // Returns a tensor containing 'shape' random values uniformly distributed in
 // the range [minval, maxval). Requires 2 32-bit integer seeds.
 // Currently only 'shape's of type F32, S32 and S64 are implemented.
 XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
                           XlaOp minval, XlaOp maxval);
 
+// Converts a 32-bit (signed or unsigned) integer random number `bits` into a
+// float32 in the range [minval, maxval).
+XlaOp StatelessRngUniformF32(XlaOp bits, XlaOp minval, XlaOp maxval);
+
+// Converts an integer random number 'bits' of type 'type' to a random number
+// in the range [minval, maxval), of the same type. 'unsigned_type' is the
+// unsigned version of 'type' (could be the same) with the same bit width.
+// The algorithm is the same one that TF uses right now, but it's
+// uniform only when maxval - minval is a divisor of the range that bits is
+// generated from.
+// TODO(b/72573764): Generate real uniform integer distribution.
+XlaOp StatelessRngUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval,
+                             PrimitiveType type, PrimitiveType unsigned_type);
+
+// The following 2 functions, for converting between one uint64 and two uint32s,
+// use the contract "lower 32 bits for the first uint32, higher 32 bits for the
+// second".
+ThreeFry2x32State Uint64ToUint32s(XlaOp u64);
+XlaOp Uint32sToUint64(ThreeFry2x32State u32s);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c0680b883acdfd93290558fe324e049d458b799
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
@@ -0,0 +1,466 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// Jacobi rotation (also known as Givens rotation):
+// G = [[ c, s],
+//      [-s, c]]
+// matmul(G_T, G) = I
+struct JacobiRotation {
+  XlaOp c;          // cosine.
+  XlaOp s;          // sine.
+};
+
+// JacobiUpdate holds the intermediate orthogonal matrix, Jacobi-rotated matrix.
+struct JacobiUpdate {
+  XlaOp v;
+  XlaOp w;
+};
+
+struct FrobeniusNorms {
+  XlaOp off_diagonal_norm;
+  XlaOp total_norm;
+};
+
+// Given an n-by-n symmetric A and integers p and q that satisfy 0 <= p < q < n,
+// it computes a rotation matrix G = [[c, s], [-s, c]], such that
+//                        G_T * A[[p, q], [p, q]] * G
+// is diagonalized.
+//
+//  def sym_schur2x2(A, p, q):
+//      if np.abs(A[p, q]) > 1e-6:
+//          tau = (A[q, q] - A[p, p]) / (2 * A[p, q])
+//          if tau >= 0:
+//              t = 1.0 / (tau + np.sqrt(1 + tau ** 2))
+//          else:
+//              t = -1.0 / (-tau + np.sqrt(1 + tau ** 2))
+//          c = 1.0 / np.sqrt(1.0 + t ** 2)
+//          s = t * c
+//      else:
+//          c = 1.0
+//          s = 0.0
+//      return c, s
+StatusOr<JacobiRotation> SymmetricShurDecomposition2x2(XlaOp a, XlaOp p,
+                                                       XlaOp q, XlaOp tol) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+
+  auto zero = ScalarLike(a, 0.0);
+  auto one = ScalarLike(a, 1.0);
+  auto two = ScalarLike(a, 2.0);
+
+  auto pqs = DynamicSliceInMinorDims(a, {p, q}, {1, 1});
+
+  auto ps = DynamicSliceInMinorDims(a, {p, p}, {1, 1});
+  auto qs = DynamicSliceInMinorDims(a, {q, q}, {1, 1});
+
+  auto tau = (qs - ps) / (pqs * two);
+  auto t_pos = one / (tau + Sqrt(one + Square(tau)));
+  auto t_neg = -one / (-tau + Sqrt(one + Square(tau)));
+  auto t = Select(Ge(tau, zero), t_pos, t_neg);
+
+  auto c_temp = Rsqrt(one + Square(t));
+  auto s_temp = t * c_temp;
+
+  auto c = Select(Ge(Abs(pqs), tol), c_temp, ZerosLike(c_temp) + one);
+  auto s = Select(Ge(Abs(pqs), tol), s_temp, ZerosLike(s_temp));
+  // Renormalize c and s to compensate for low precision arithmetic, this step
+  // is redundant if high precision float is used, like float64.
+  auto rnorm = Rsqrt(Square(c) + Square(s));
+
+  JacobiRotation schur;
+
+  schur.c = c * rnorm;
+  schur.s = s * rnorm;
+
+  return schur;
+}
+
+StatusOr<JacobiUpdate> Update(JacobiUpdate jacobi_update, XlaOp p, XlaOp q,
+                              XlaOp tol, int64 n) {
+  XlaBuilder* builder = jacobi_update.w.builder();
+  TF_ASSIGN_OR_RETURN(JacobiRotation schur, SymmetricShurDecomposition2x2(
+                                                jacobi_update.w, p, q, tol));
+
+  TF_ASSIGN_OR_RETURN(Shape w_shape, builder->GetShape(jacobi_update.w));
+  const std::vector<int64> batch_dims(w_shape.dimensions().begin(),
+                                      w_shape.dimensions().end() - 2);
+  const int64 num_dims = w_shape.rank();
+
+  auto zero = ScalarLike(p, 0);
+
+  XlaOp c = schur.c;
+  XlaOp s = schur.s;
+
+  auto slice_p = DynamicSliceInMinorDims(jacobi_update.w, {p, zero}, {1, n});
+  auto slice_q = DynamicSliceInMinorDims(jacobi_update.w, {q, zero}, {1, n});
+
+  auto slice_p_new = c * slice_p - s * slice_q;
+  auto slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {p, zero});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {q, zero});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.w, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.w, {zero, q}, {n, 1});
+
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {zero, p});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {zero, q});
+
+  // Zero out a_{pq} explicitly.
+  std::vector<int64> pq_dims(batch_dims.begin(), batch_dims.end());
+  pq_dims.push_back(1);
+  pq_dims.push_back(1);
+  auto pq_zero = ScalarLike(jacobi_update.w, 0.0);
+  auto pq_zeros = Broadcast(pq_zero, pq_dims);
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {p, q});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {q, p});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.v, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.v, {zero, q}, {n, 1});
+
+  std::vector<int64> broadcast_dims(batch_dims.size());
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims.push_back(num_dims - 1);
+
+  // Renormalize the p-th and q-th columns. This step is redundant if high
+  // precision floats are used, like 64-bit float. But for 32-bit float, it
+  // becomes necessary. This step will not increase the overall complexity.
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_p_new = Mul(
+      slice_p_new,
+      Rsqrt(Reduce(Square(slice_p_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+  slice_q_new = s * slice_p + c * slice_q;
+  slice_q_new = Mul(
+      slice_q_new,
+      Rsqrt(Reduce(Square(slice_q_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_p_new, {zero, p});
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_q_new, {zero, q});
+
+  return jacobi_update;
+}
+
+StatusOr<FrobeniusNorms> ComputeFrobeniusNorms(XlaOp w) {
+  XlaBuilder* builder = w.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(w));
+  const int64 num_dims = shape.rank();
+  auto frobenius_norm =
+      Sqrt(Reduce(Square(w), ScalarLike(w, 0.0),
+                  CreateScalarAddComputation(shape.element_type(), builder),
+                  {num_dims - 2, num_dims - 1}));
+  auto diag = GetMatrixDiagonal(w);
+  auto diag_square =
+      Reduce(Square(diag), ScalarLike(w, 0.0),
+             CreateScalarAddComputation(shape.element_type(), builder),
+             {num_dims - 2});
+
+  FrobeniusNorms frobenius_norms;
+
+  frobenius_norms.off_diagonal_norm =
+      Sqrt(Max(Square(frobenius_norm) - diag_square, ScalarLike(w, 0.0)));
+  frobenius_norms.total_norm = frobenius_norm;
+
+  return frobenius_norms;
+}
+
+StatusOr<std::vector<XlaOp>> WhileLoopFn(
+    absl::Span<const XlaOp> initial_values,  //
+    int matrix_dimension,                    //
+    int max_sweep_updates,                   //
+    PrimitiveType index_type,                //
+    absl::string_view name,                  //
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    auto k = values[0];
+    auto max_sweeps = ScalarLike(k, max_sweep_updates);
+    auto sweep_update_cond = Gt(max_sweeps, k);
+
+    auto norms = ComputeFrobeniusNorms(values[2]).ValueOrDie();
+    auto tol = norms.total_norm * values[3];
+    auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm),
+                              xla::ConstantR0<bool>(cond_builder, false),
+                              CreateScalarOrComputation(PRED, cond_builder));
+
+    return And(sweep_update_cond, tol_cond);
+  };
+
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    auto while_cond_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_cond_builder) -> StatusOr<XlaOp> {
+      auto p = values_inner[0];
+      return Lt(p, ScalarLike(p, matrix_dimension - 1));
+    };
+
+    auto while_body_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_body_builder) -> StatusOr<std::vector<XlaOp>> {
+      auto while_cond_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_cond_builder) -> StatusOr<XlaOp> {
+        auto q = values_innermost[1];
+        return Lt(q, ScalarLike(q, matrix_dimension));
+      };
+      auto while_body_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_body_builder)
+          -> StatusOr<std::vector<XlaOp>> {
+        auto p = values_innermost[0];
+        auto q = values_innermost[1];
+
+        JacobiUpdate jacobi_update;
+        jacobi_update.v = values_innermost[2];
+        jacobi_update.w = values_innermost[3];
+
+        auto tol = values_innermost[4];
+
+        TF_ASSIGN_OR_RETURN(jacobi_update,
+                            Update(jacobi_update, p, q, tol, matrix_dimension));
+
+        std::vector<XlaOp> updated_values_innermost;
+        updated_values_innermost.reserve(values_innermost.size());
+
+        updated_values_innermost.push_back(p);
+        updated_values_innermost.push_back(q + ScalarLike(q, 1));
+        updated_values_innermost.push_back(jacobi_update.v);
+        updated_values_innermost.push_back(jacobi_update.w);
+        updated_values_innermost.push_back(tol);
+
+        return updated_values_innermost;
+      };
+
+      std::vector<XlaOp> values_innermost(5);
+      auto p = values_inner[0];
+      auto q = p + ScalarLike(p, 1);
+      values_innermost[0] = p;                // index p.
+      values_innermost[1] = q;                // index q.
+      values_innermost[2] = values_inner[1];  // v.
+      values_innermost[3] = values_inner[2];  // w.
+      values_innermost[4] = values_inner[3];  // tol.
+      TF_ASSIGN_OR_RETURN(
+          values_innermost,
+          WhileLoopHelper(while_cond_fn_innermost, while_body_fn_innermost,
+                          values_innermost, absl::StrCat(name, "-Innermost"),
+                          inner_body_builder));
+
+      std::vector<XlaOp> updated_values_inner;
+      updated_values_inner.reserve(values_inner.size());
+
+      updated_values_inner.push_back(p + ScalarLike(p, 1));
+      updated_values_inner.push_back(values_innermost[2]);
+      updated_values_inner.push_back(values_innermost[3]);
+      updated_values_inner.push_back(values_innermost[4]);
+      return updated_values_inner;
+    };
+    // Indexes.
+    XlaOp k = values[0];
+
+    std::vector<XlaOp> values_inner(4);
+    values_inner[0] = ScalarLike(k, 0);  // index p.
+    values_inner[1] = values[1];         // v.
+    values_inner[2] = values[2];         // w.
+    values_inner[3] = values[3];         // tol.
+    TF_ASSIGN_OR_RETURN(
+        values_inner,
+        WhileLoopHelper(while_cond_fn_inner, while_body_fn_inner, values_inner,
+                        absl::StrCat(name, "-Inner"), body_builder));
+
+    std::vector<XlaOp> updated_values;
+    updated_values.reserve(values_inner.size());
+
+    updated_values.push_back(k + ScalarLike(k, 1));
+    updated_values.push_back(values_inner[1]);
+    updated_values.push_back(values_inner[2]);
+    updated_values.push_back(values_inner[3]);
+
+    return updated_values;
+  };
+  std::vector<XlaOp> values;
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              initial_values, name, builder));
+
+  return values;
+}
+
+StatusOr<SelfAdjointEigResult> SortByEigenvalues(SelfAdjointEigResult result) {
+  XlaBuilder* builder = result.v.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(result.v));
+  const int64 num_dims = shape.rank();
+  auto dimensions = shape.dimensions();
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims[num_dims - 2] = num_dims - 1;
+  result.w = BroadcastInDim(result.w, dimensions, broadcast_dims);
+
+  XlaOp sort_result =
+      Sort({result.w, result.v},
+           CreateScalarLtComputation(
+               {shape.element_type(), shape.element_type()}, builder),
+           num_dims - 1);
+  result.w = GetMatrixDiagonal(GetTupleElement(sort_result, 0));
+  result.v = GetTupleElement(sort_result, 1);
+  return result;
+}
+
+}  // namespace
+
+// This is the cyclic Jacobi iteration. Please note that the eigenvalues are
+// possibly not ordered.
+//
+//  def jacobi(A):
+//      n, _ = A.shape
+//      V = np.eye(n)
+//      frobenius_norm = np.linalg.norm(A)
+//      diag_norm = np.linalg.norm(np.diag(A))
+//      off_diag_norm = np.sqrt(
+//          frobenius_norm - diag_norm) * np.sqrt(frobenius_norm + diag_norm)
+//      while off_diag_norm > 1e-6 * frobenius_norm:
+//          for p in range(n - 1):
+//              for q in range(p + 1, n):
+//                  c, s = sym_schur2x2(A, p, q)
+//                  A[[p, q], :] = np.matmul(np.array([[c, -s], [s, c]]),
+//                                           A[[p, q], :])
+//                  A[:, [p, q]] = np.matmul(A[:, [p, q]],
+//                                           np.array([[c, s], [-s, c]]))
+//                  V[:, [p, q]] = np.matmul(V[:, [p, q]],
+//                                               np.array([[c, s], [-s, c]]))
+//          frobenius_norm = np.linalg.norm(A)
+//          diag_norm = np.linalg.norm(np.diag(A))
+//          off_diag_norm = np.sqrt(
+//              frobenius_norm - diag_norm) * np.sqrt(
+//                  frobenius_norm + diag_norm)
+//
+//      return A, V
+//
+// TODO(kuny): Implement parallel order Jacobi.
+//
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter,
+                                    float epsilon) {
+  XlaBuilder* builder = a.builder();
+  auto return_error = [&](const Status& status) {
+    SelfAdjointEigResult result;
+    result.v = builder->ReportError(status);
+    result.w = builder->ReportError(status);
+    return result;
+  };
+  auto shape_with_status = builder->GetShape(a);
+  if (!shape_with_status.status().ok()) {
+    return return_error(shape_with_status.status());
+  }
+  Shape a_shape = shape_with_status.ValueOrDie();
+  const int64 num_dims = a_shape.rank();
+  if (num_dims < 2) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must have rank >= 2: got shape %s.",
+        a_shape.ToString()));
+  }
+  PrimitiveType type = a_shape.element_type();
+  if (!primitive_util::IsFloatingPointType(type)) {
+    return return_error(InvalidArgument(
+        "Type of the input matrix must be float: got %s.", a_shape.ToString()));
+  }
+
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+
+  if (m != n) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must be square matrices: got shape "
+        "(%d, %d).",
+        m, n));
+  }
+
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
+  }
+
+  auto tol = ScalarLike(a, epsilon);
+
+  auto v_init = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims);
+  auto w_init = Triangle(a, lower);
+  w_init = w_init + TransposeInMinorDims(w_init) - w_init * v_init;
+
+  auto output_with_status = WhileLoopFn(
+      {
+          Zero(builder, S32),  // k
+          v_init,              // v
+          w_init,              // w
+          tol,                 //
+      },                       //
+      n,                       //
+      max_iter,                //
+      S32,                     //
+      "CyclicJacobi",          //
+      builder);
+  if (!output_with_status.status().ok()) {
+    return return_error(output_with_status.status());
+  }
+
+  auto output = output_with_status.ValueOrDie();
+
+  SelfAdjointEigResult result;
+  result.v = output[1];
+  result.w = GetMatrixDiagonal(output[2]);
+
+  return SortByEigenvalues(result).ValueOrDie();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a089891d6a2d80c0c265a3310539b4f1c5db4d5
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The eigenvalue decomposition of a symmetric matrix, the original matrix is
+// recovered by v * w * v_t.
+struct SelfAdjointEigResult {
+  // The i-th column is the normalized eigenvector corresponding to the
+  // eigenvalue w[i]. Will return a matrix object if a is a matrix object.
+  XlaOp v;
+  // The eigenvalues in ascending order, each repeated according to its
+  // multiplicity.
+  XlaOp w;
+};
+
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower = true,
+                                    int64 max_iter = 100, float epsilon = 1e-6);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99bec8a9ab5396c3414e4b79b42169099b66ac1a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
@@ -0,0 +1,315 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+
+class SelfAdjointEigTest : public ClientLibraryTestBase {
+ protected:
+  void SetUp() override {
+    ClientLibraryTestBase::SetUp();
+    batch_3d_4x4_ = Array3D<float>{
+        {
+            {4, 6, 8, 10},
+            {6, 45, 54, 63},
+            {8, 54, 146, 166},
+            {10, 63, 166, 310},
+        },
+        {
+            {16, 24, 8, 12},
+            {24, 61, 82, 48},
+            {8, 82, 100, 6},
+            {12, 48, 6, 62},
+        },
+    };
+    matrix2d_8x8_ = Array2D<float>{
+        {14., 123., 49., 112., 115., 173., 182., 125.},
+        {123., 14., 60., 118., 150., 130., 91., 72.},
+        {49., 60., 138., 111., 106., 101., 115., 142.},
+        {112., 118., 111., 142., 91., 130., 25., 61.},
+        {115., 150., 106., 91., 116., 121., 128., 85.},
+        {173., 130., 101., 130., 121., 70., 151., 132.},
+        {182., 91., 115., 25., 128., 151., 66., 92.},
+        {125., 72., 142., 61., 85., 132., 92., 156.},
+    };
+    low_rank_4x4_ = Array2D<float>{
+        // x = [[1, 2, 3, 4], [1, -1, 1, -1]]
+        // matmul(x.T, x)
+        {2, 1, 4, 3},
+        {1, 5, 5, 9},
+        {4, 5, 10, 11},
+        {3, 9, 11, 17},
+    };
+  }
+  void TearDown() override { ClientLibraryTestBase::TearDown(); }
+
+  Array3D<float> GetUnitMatrix3D(const Array3D<float>& matrix) {
+    Array3D<float> result(matrix.n1(), matrix.n2(), matrix.n3(), 0.0);
+    for (int i = 0; i < matrix.n1(); ++i) {
+      for (int j = 0; j < matrix.n2(); ++j) {
+        result({i, j, j}) = 1.0;
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> ExtractTriangularMatrix(const Array3D<float>& matrix,
+                                         bool lower) {
+    Array3D<float> result(matrix);
+    for (int i = 0; i < result.n1(); ++i) {
+      for (int j = 0; j < result.n2(); ++j) {
+        if (lower) {
+          for (int k = j + 1; k < result.n3(); ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        } else {
+          for (int k = 0; k < j; ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  XlaOp ComputeMatmulVWVt(SelfAdjointEigResult result, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(result.v).ValueOrDie();
+    std::vector<int64> out_dims = shape.dimensions();
+    std::vector<int64> broadcast_dims(shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+
+    broadcast_dims[shape.rank() - 2] = shape.rank() - 1;
+    auto vw = Mul(result.v, BroadcastInDim(result.w, out_dims, broadcast_dims));
+    return BatchDot(vw, TransposeInMinorDims(result.v),
+                    PrecisionConfig::HIGHEST);
+  }
+
+  XlaOp GetAverageAbsoluteError(XlaOp m1, XlaOp m2, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(m1).ValueOrDie();
+    int64 size = 1;
+    for (auto d : shape.dimensions()) {
+      size *= d;
+    }
+    return ReduceAll(Abs(m1 - m2), ConstantR0WithType(builder, F32, 0),
+                     CreateScalarAddComputation(F32, builder)) /
+           ConstantR0WithType(builder, F32, size);
+  }
+
+  Array2D<float> GenerateRandomSymmetricMatrix(int size) {
+    Array2D<float> result{size, size, 0.0};
+    // TODO(b/128001705): This seed should not be needed but makes the test
+    // avoid inputs which trigger numerical instability.
+    result.FillRandom(10 /* stddev */, 2 /* mean */, 12346 /* seed */);
+    for (int i = 0; i < size; ++i) {
+      for (int j = 0; j < i; ++j) {
+        result({j, i}) = result({i, j});
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> batch_3d_4x4_;
+  Array2D<float> matrix2d_8x8_;
+  Array2D<float> low_rank_4x4_;
+  Array2D<int> wrong_type_4x4_;
+};
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Lower_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, true), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Upper_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, false), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a, false);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  BatchDot(result.v, TransposeInMinorDims(result.v), PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, GetUnitMatrix3D(batch_3d_4x4_),
+                             {a_data.get()}, ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VtWV_EQ_A_Rank_Deficient_4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(low_rank_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR2<float>(&builder, low_rank_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Eigen_8x8) {
+  XlaBuilder builder(TestName());
+
+  // This is computed by numpy.linalg.eigh with float32.
+  std::vector<float> expected{-182.69205, -116.86245, -105.74489, -9.545369,
+                              37.81711,   104.732285, 120.29153,  868.00385};
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  Add(result.w, ZerosLike(result.w));
+
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_8x8) {
+  XlaBuilder builder(TestName());
+
+  float expected_vals = 1e-3;
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  // np.sum(norm(eye(n) - matmul(conj(T(v)), v)) / n**2
+  GetAverageAbsoluteError(IdentityMatrix(&builder, F32, 8, 8),
+                          BatchDot(TransposeInMinorDims(result.v), result.v),
+                          &builder);
+
+  ComputeAndCompareR0<float>(&builder, expected_vals, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Wrong_Type_Int) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<int>(wrong_type_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  EXPECT_FALSE(result.v.valid());
+  EXPECT_FALSE(result.w.valid());
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_8x8) {
+  XlaBuilder builder(TestName());
+  int size = 8;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_16x16) {
+  XlaBuilder builder(TestName());
+  int size = 16;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_32x32) {
+  XlaBuilder builder(TestName());
+  int size = 32;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_256x256) {
+  XlaBuilder builder(TestName());
+  int size = 256;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_512x512) {
+  XlaBuilder builder(TestName());
+  int size = 512;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
index 77145ba7d4c72435450d3e33d57b2507eb84d2fc..0878cbeaf9ae1d85051ea3b5844f5837286c7dc2 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -134,4 +134,51 @@ XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
   });
 }
 
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index));
+    ShapeUtil::AppendMajorDimension(1, &index_shape);
+    std::vector<XlaOp> to_concat;
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    to_concat.reserve(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      if (i == dim) {
+        to_concat.push_back(Reshape(index, index_shape.dimensions()));
+      } else {
+        to_concat.push_back(Iota(builder, index_shape, i));
+      }
+    }
+    XlaOp gather_indices = ConcatInDim(builder, to_concat, input_shape.rank());
+    std::vector<int64> slice_sizes(input_shape.rank(), 1);
+    GatherDimensionNumbers gather_dnums;
+    gather_dnums.set_index_vector_dim(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      gather_dnums.add_collapsed_slice_dims(i);
+      gather_dnums.add_start_index_map(i);
+    }
+    return Gather(input, gather_indices, gather_dnums, slice_sizes);
+  });
+}
+
+XlaOp TorchIndexSelect(XlaOp input, XlaOp index, int64 dim) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index));
+    std::vector<int64> slice_sizes = input_shape.dimensions();
+    slice_sizes[dim] = 1;
+    GatherDimensionNumbers gather_dnums;
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      if (i != dim) {
+        gather_dnums.add_offset_dims(i);
+      }
+    }
+    gather_dnums.set_index_vector_dim(index_shape.rank());
+    gather_dnums.add_collapsed_slice_dims(dim);
+    gather_dnums.add_start_index_map(dim);
+    return Gather(input, index, gather_dnums, slice_sizes);
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
index 6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890..bb6191df7c442f23a63f0d0b80c9b534c31e30fc 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.h
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -43,6 +43,28 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts);
 
+// Gathers values along an axis specified by dim.
+//
+// For a 3-D tensor the output is specified by:
+//
+// out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+// out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+// out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+//
+// If `input` is an n-dimensional tensor with size
+// [X0,X1,X2,..XN] and dim = i `index` must be an n-dimensional tensor with size
+// [X0,X1,...Y,Xi+1,...,X[N] where y >= 1 and `out` will have the same sizes as
+// `index`.
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim);
+
+// Returns a new tensor which indexes the input tensor along dimension dim using
+// the entries in index.
+//
+// The returned tensor has the same number of dimensions as the original tensor
+// (input). The dimth dimension has the same size as the length of index; other
+// dimensions have the same size as in the original tensor.
+XlaOp TorchIndexSelect(XlaOp input, XlaOp index, int64 dim);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
index 8d362119e01006555db0f82d02626175936e1d05..408a82ca3c6eeeae7edac8511769ec9c0d5a5f44 100644
--- a/tensorflow/compiler/xla/client/lib/slicing_test.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -102,5 +102,56 @@ XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
       {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
 }
 
+XLA_TEST_F(SlicingTest, TorchGather) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp input, index;
+  auto input_data =
+      CreateR2Parameter<int>({{1, 2}, {3, 4}}, 0, "input", &builder, &input);
+  auto index_data =
+      CreateR2Parameter<int>({{0, 0}, {1, 0}}, 1, "index", &builder, &index);
+  TorchGather(input, index, 1);
+
+  ComputeAndCompareR2<int>(&builder, {{1, 1}, {4, 3}},
+                           {input_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(SlicingTest, TorchIndexSelectOn0) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp input, index;
+  auto input_data =
+      CreateR2Parameter<float>({{0.1427, 0.0231, -0.5414, -1.0009},
+                                {-0.4664, 0.2647, -0.1228, -1.1068},
+                                {-1.1734, -0.6571, 0.7230, -0.6004}},
+                               0, "input", &builder, &input);
+  auto index_data =
+      CreateR1Parameter<int>({0, 2}, 1, "index", &builder, &index);
+  TorchIndexSelect(input, index, 0);
+
+  ComputeAndCompareR2<float>(
+      &builder,
+      {{0.1427, 0.0231, -0.5414, -1.0009}, {-1.1734, -0.6571, 0.7230, -0.6004}},
+      {input_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(SlicingTest, TorchIndexSelectOn1) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp input, index;
+  auto input_data =
+      CreateR2Parameter<float>({{0.1427, 0.0231, -0.5414, -1.0009},
+                                {-0.4664, 0.2647, -0.1228, -1.1068},
+                                {-1.1734, -0.6571, 0.7230, -0.6004}},
+                               0, "input", &builder, &input);
+  auto index_data =
+      CreateR1Parameter<int>({0, 2}, 1, "index", &builder, &index);
+  TorchIndexSelect(input, index, 1);
+
+  ComputeAndCompareR2<float>(
+      &builder, {{0.1427, -0.5414}, {-0.4664, -0.1228}, {-1.1734, 0.7230}},
+      {input_data.get(), index_data.get()});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index e8553a08bb014e790822a14e128686b60b8d6b7c..ddc39f4d874cd3613a763b969091e7e65ff1c783 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -30,7 +31,13 @@ XlaOp TopK(XlaOp input, int64 k) {
         ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
     XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    XlaOp sort_result = Sort(Neg(input), {iota_s32});
+    // TODO(b/122298745): Get rid of Neg() and use CreateScalarGtComputation
+    // once the TPU backend supports the comparison computations.
+    XlaOp sort_result =
+        Sort({Neg(input), iota_s32},
+             CreateScalarLtComputation({input_shape.element_type(), S32},
+                                       iota_s32.builder()),
+             last_dim, /*is_stable=*/true);
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/svd.cc b/tensorflow/compiler/xla/client/lib/svd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd4547dbab6e49a502a0d0e9afa67b509fe4d1e6
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/svd.cc
@@ -0,0 +1,884 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/client/lib/svd.h"
+
+#include <memory>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// Given a matrix A, define H,
+//   H = A * (I - beta * v_T * v) if v is a row vector, or
+//   H = (I - beta * v * v_T) if v is column vector.
+// A * H or H * A zeros out trailing part of some row or column of A.
+//
+// [x0, ..., x_{k-1}, xk, x_{k+1}, ..., x_{n-1}] * H
+//       = [x0, ..., x_{k-1}, xnorm, 0, ..., 0]
+//
+// Here xnorm = norm([x_k, x_{k+1}, ..., x_{n - 1}])
+struct HouseHolderResult {
+  XlaOp v;
+  XlaOp beta;
+  XlaOp a;
+};
+
+// Jacobi rotation (also known as Givens rotation):
+// G = [[ c, s],
+//      [-s, c]]
+// matmul(G_T, G) = I
+struct JacobiRotation {
+  XlaOp c;  // cosine.
+  XlaOp s;  // sine.
+};
+
+// JacobiUpdate holds the intermediate orthogonal matrix, Jacobi-rotated matrix.
+struct JacobiUpdate {
+  XlaOp v;
+  XlaOp w;
+};
+
+// OneSidedJacobiRotation holds the left and right Jacobi rotations. Refer to
+// GetOneSidedJacobiRotation for the effect of applying OneSidedJacobiRotation
+// to a matrix.
+struct OneSidedJacobiRotation {
+  JacobiRotation rot_l;
+  JacobiRotation rot_r;
+};
+
+struct FrobeniusNorms {
+  XlaOp off_diagonal_norm;
+  XlaOp total_norm;
+};
+
+// Householder reflection on the trailing elements of a vector.
+//
+// H = I - beta * [1, v]' * [1, v]
+//
+// H * x = [..., xnorm, 0, ..., 0]
+//          ..., j, j + 1, ..., n
+//
+// def house(x, j, eps):
+//    sigma = np.linalg.norm(x[(j + 1):])
+//    v = np.zeros_like(x)
+//    v[(j + 1):] = x[(j + 1):]
+//    if sigma < eps:
+//        beta = 0
+//    else:
+//        mu = sigma * np.sqrt((x[j]/sigma)**2 + 1)
+//        if x[j] <= 0:
+//            v[j] = x[j] - mu
+//        else:
+//            v[j] = -sigma / (x[j] + mu) * sigma
+//        beta = 2 / ((sigma / v[j])**2 + 1)
+//        v = v / v[j]
+//    v[j] = 1
+//    return v, beta
+//
+// Householder reflection on the trailing elements of a row of a matrix. After
+// applying it on the matrix, all elements in [i, (j+1):] become zeros, i.e.,
+//
+// H = I - beta * [1, v]' * [1, v], then,
+//
+// A[i, j:] * H = [sigma, 0, 0, ..., 0]
+//
+StatusOr<HouseHolderResult> HouseRow(XlaOp a, XlaOp i, XlaOp j, XlaOp eps,
+                                     PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int64 num_dims = a_shape.rank();
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+  XlaOp zero = ScalarLike(i, 0);
+  XlaOp x = DynamicSliceInMinorDims(a, {i, zero}, {1, n});
+
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int k = 0; k < num_batch_dims; ++k) {
+    batch_dims[k] = ShapeUtil::GetDimension(a_shape, k);
+  }
+
+  TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+  auto idx = Iota(builder, ShapeUtil::MakeShape(S32, x_shape.dimensions()),
+                  num_dims - 1);
+  auto zeros = ZerosLike(x);
+  auto v = Select(Gt(idx, j), x, zeros);
+
+  auto one = ScalarLike(v, 1.0);
+
+  auto sigma =
+      Sqrt(Reduce(Square(v), ScalarLike(v, 0.0),
+                  CreateScalarAddComputation(x_shape.element_type(), builder),
+                  {num_dims - 1}));
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  auto x_0j = DynamicSliceInMinorDims(x, {zero, j}, {1, 1});
+  auto mu = Mul(sigma, Sqrt(Square(Div(x_0j, sigma, broadcast_dims)) + one),
+                broadcast_dims);
+
+  auto v_0j = Select(
+      Le(x_0j, ScalarLike(x_0j, 0.0)), Sub(x_0j, mu),
+      -Mul(sigma, Div(sigma, Add(x_0j, mu), broadcast_dims), broadcast_dims));
+
+  auto beta = Div(ScalarLike(v_0j, 2.0),
+                  (Square(Div(sigma, v_0j, broadcast_dims)) + one));
+
+  v = Select(
+      BroadcastInDim(Lt(sigma, eps), x_shape.dimensions(), broadcast_dims), v,
+      v / v_0j);
+  v = Select(Eq(idx, j), zeros + one, v);
+
+  beta = Select(Lt(Add(sigma, ZerosLike(beta), broadcast_dims), eps),
+                ZerosLike(beta), beta);
+
+  HouseHolderResult result;
+  result.v = v;
+  result.beta = beta;
+  result.a =
+      Sub(a, Mul(beta, BatchDot(BatchDot(a, TransposeInMinorDims(v), precision),
+                                v, precision)));
+
+  return result;
+}
+
+// Householder reflection on the trailing elements of a col of a matrix. After
+// applying it on the matrix, all elements in [(i+1):, j] become zeros, i.e.,
+//
+// H = I - beta * [1; v] * [1; v]', then,
+//
+// H * A[i:, j] = [xnorm, 0, 0, ..., 0]
+//
+StatusOr<HouseHolderResult> HouseCol(XlaOp a, XlaOp i, XlaOp j, XlaOp eps,
+                                     PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int64 num_dims = a_shape.rank();
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  XlaOp zero = ScalarLike(i, 0);
+  XlaOp x = DynamicSliceInMinorDims(a, {zero, j}, {m, 1});
+
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int k = 0; k < num_batch_dims; ++k) {
+    batch_dims[k] = ShapeUtil::GetDimension(a_shape, k);
+  }
+
+  TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+  auto idx = Iota(builder, ShapeUtil::MakeShape(S32, x_shape.dimensions()),
+                  num_dims - 2);
+  auto zeros = ZerosLike(x);
+  auto v = Select(Gt(idx, i), x, zeros);
+
+  auto one = ScalarLike(v, 1.0);
+
+  auto sigma =
+      Sqrt(Reduce(Square(v), ScalarLike(v, 0.0),
+                  CreateScalarAddComputation(x_shape.element_type(), builder),
+                  {num_dims - 2}));
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims[num_dims - 2] = num_dims - 1;
+  auto x_0i = DynamicSliceInMinorDims(x, {i, zero}, {1, 1});
+  auto mu = Mul(sigma, Sqrt(Square(Div(x_0i, sigma, broadcast_dims)) + one),
+                broadcast_dims);
+
+  auto v_0i = Select(
+      Le(x_0i, ScalarLike(x_0i, 0.0)), Sub(x_0i, mu),
+      -Mul(sigma, Div(sigma, Add(x_0i, mu), broadcast_dims), broadcast_dims));
+
+  auto beta = Div(ScalarLike(v_0i, 2.0),
+                  (Square(Div(sigma, v_0i, broadcast_dims)) + one));
+
+  v = Select(
+      BroadcastInDim(Lt(sigma, eps), x_shape.dimensions(), broadcast_dims), v,
+      v / v_0i);
+  v = Select(Eq(idx, i), zeros + one, v);
+
+  beta = Select(Lt(Add(sigma, ZerosLike(beta), broadcast_dims), eps),
+                ZerosLike(beta), beta);
+
+  HouseHolderResult result;
+  result.v = v;
+  result.beta = beta;
+  result.a = Sub(
+      a, Mul(beta, BatchDot(v, BatchDot(TransposeInMinorDims(v), a, precision),
+                            precision)));
+
+  return result;
+}
+
+// Apply column and row householder reflections for bidiagonalization.
+//
+// def house_bidiag(A):
+//    xz, yz = A.shape
+//    LL = np.eye(xz)
+//    RR = np.eye(yz)
+//    for i in range(yz - 1):
+//        v, beta = house_col(A, i, i, 1e-8)
+//        L = np.eye(xz) - beta * np.outer(v, v)
+//        LL = np.matmul(LL, L)
+//        A = np.matmul(L, A)
+//        if i < yz - 2:
+//            v, beta = house_row(A, i, i + 1, 1e-8)
+//            R = np.eye(yz) - beta * np.outer(v, v)
+//            RR = np.matmul(RR, R)
+//            A = np.matmul(A, R)
+//    return LL, A, RR
+//
+StatusOr<SVDResult> HouseHolderBidiagonalization(
+    XlaOp a, XlaOp eps, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int64 num_dims = a_shape.rank();
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
+  }
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+  XlaOp u_init = Broadcast(
+      IdentityMatrix(builder, a_shape.element_type(), m, m), batch_dims);
+  XlaOp v_init = Broadcast(
+      IdentityMatrix(builder, a_shape.element_type(), n, n), batch_dims);
+
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    auto i = values[0];
+    return Lt(i, ScalarLike(i, n - 2));
+  };
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    auto i = values[0];
+    auto one = ScalarLike(i, 1);
+
+    auto u = values[1];
+    auto v = values[2];
+    auto a = values[3];
+    auto eps = values[4];
+
+    TF_ASSIGN_OR_RETURN(HouseHolderResult house_col,
+                        HouseCol(a, i, i, eps, precision));
+    u = Sub(u, Mul(house_col.beta,
+                   BatchDot(BatchDot(u, house_col.v, precision),
+                            TransposeInMinorDims(house_col.v), precision)));
+    a = house_col.a;
+
+    TF_ASSIGN_OR_RETURN(HouseHolderResult house_row,
+                        HouseRow(a, i, i + one, eps, precision));
+    v = Sub(
+        v,
+        Mul(house_row.beta,
+            BatchDot(BatchDot(v, TransposeInMinorDims(house_row.v), precision),
+                     house_row.v, precision)));
+    a = house_row.a;
+
+    std::vector<XlaOp> updated_values;
+    updated_values.reserve(values.size());
+
+    updated_values.push_back(i + one);
+    updated_values.push_back(u);
+    updated_values.push_back(v);
+    updated_values.push_back(a);
+    updated_values.push_back(eps);
+    return updated_values;
+  };
+
+  std::vector<XlaOp> values(5);
+  values[0] = Zero(builder, S32);
+  values[1] = u_init;
+  values[2] = v_init;
+  values[3] = a;
+  values[4] = eps;
+
+  TF_ASSIGN_OR_RETURN(values,
+                      WhileLoopHelper(while_cond_fn, while_body_fn, values,
+                                      "HouseHolderBidiagonalization", builder));
+
+  for (int k = 2; k > 0; --k) {
+    if (n - k >= 0) {
+      XlaOp index = ScalarLike(values[0], n - k);
+      TF_ASSIGN_OR_RETURN(HouseHolderResult house_col,
+                          HouseCol(values[3], index, index, eps, precision));
+      values[1] =
+          Sub(values[1],
+              Mul(house_col.beta,
+                  BatchDot(BatchDot(values[1], house_col.v, precision),
+                           TransposeInMinorDims(house_col.v), precision)));
+      values[3] = house_col.a;
+    }
+  }
+
+  SVDResult result;
+  result.u = values[1];
+  result.v = values[2];
+  result.d = values[3];
+  return result;
+}
+
+// MakeJacobi computes a rotation matrix G = [[c, s], [-s, c]], such that
+//                        G_T * [[ps, pqs], [pqs, qs]] * G
+// is diagonalized.
+//
+//  def make_jacobi(ps, qs, pqs, eps):
+//     if np.abs(a_pq) > eps:
+//         tau = (a_qq - a_pp) / (2 * a_pq)
+//         if tau >= 0:
+//             t = 1.0 / (tau + np.sqrt(1 + tau ** 2))
+//         else:
+//             t = -1.0 / (-tau + np.sqrt(1 + tau ** 2))
+//         c = 1.0 / np.sqrt(1.0 + t ** 2)
+//         s = t * c
+//     else:
+//         c = 1.0
+//         s = 0.0
+//     return c, s
+//
+StatusOr<JacobiRotation> MakeJacobi(XlaOp ps, XlaOp qs, XlaOp pqs, XlaOp eps) {
+  auto zero = ScalarLike(ps, 0.0);
+  auto one = ScalarLike(ps, 1.0);
+  auto two = ScalarLike(ps, 2.0);
+
+  auto tau = (qs - ps) / (pqs * two);
+  auto t_pos = one / (tau + Sqrt(one + Square(tau)));
+  auto t_neg = -one / (-tau + Sqrt(one + Square(tau)));
+  auto t = Select(Ge(tau, zero), t_pos, t_neg);
+
+  auto c_temp = Rsqrt(one + Square(t));
+  auto s_temp = t * c_temp;
+
+  auto c = Select(Ge(Abs(pqs), eps), c_temp, ZerosLike(c_temp) + one);
+  auto s = Select(Ge(Abs(pqs), eps), s_temp, ZerosLike(s_temp));
+  // Renormalize c and s to compensate for low precision arithmetic, this step
+  // is redundant if high precision float is used, like float64.
+  auto rnorm = Rsqrt(Square(c) + Square(s));
+
+  JacobiRotation rot;
+
+  rot.c = c * rnorm;
+  rot.s = s * rnorm;
+
+  return rot;
+}
+
+// One sided Jacobi rotations. For a matrix,
+//  [a_pp, a_pq]
+//  [a_qp, a_qq]
+// After applying Jacobi rotations on both sides, the matrix is diagonalized.
+//  [b_pp, 0]
+//  [0, b_qq]
+//
+// def jacobi_rot(a, p, q, eps):
+//     t = a[p, p] + a[q, q]
+//     d = a[q, p] - a[p, q]
+//
+//     if np.abs(d) < eps:
+//         s = 0.0
+//         c = 1.0
+//     else:
+//         u = t / d
+//         tmp = np.sqrt(1.0 + u**2)
+//         s = -1.0 / tmp
+//         c = u / tmp
+//
+//     rot = np.array([[c, s], [-s, c]])
+//     m_tmp = rot.T @ a[[p, q], [p, q]]
+//     c_r, s_r = make_jacobi(m_tmp[0, 0], m_tmp[1, 1], m_tmp[0, 1])
+//     rot_r = np.array([[c_r, s_r], [-s_r, c_r]])
+//     rot_l = rot @ rot_r
+//    return rot_l, rot_r
+//
+StatusOr<OneSidedJacobiRotation> GetOneSidedJacobiRotation(XlaOp a, XlaOp p,
+                                                           XlaOp q, XlaOp eps) {
+  XlaOp a_pp = DynamicSliceInMinorDims(a, {p, p}, {1, 1});
+  XlaOp a_pq = DynamicSliceInMinorDims(a, {p, q}, {1, 1});
+  XlaOp a_qp = DynamicSliceInMinorDims(a, {q, p}, {1, 1});
+  XlaOp a_qq = DynamicSliceInMinorDims(a, {q, q}, {1, 1});
+
+  XlaOp one = ScalarLike(a, 1.0);
+
+  XlaOp t = a_pp + a_qq;
+  XlaOp d = a_qp - a_pq;
+
+  XlaOp u = Div(t, d);
+  XlaOp tmp = Rsqrt(one + Square(u));
+
+  JacobiRotation rot;
+
+  XlaOp zeros = ZerosLike(tmp);
+  XlaOp ones = zeros + one;
+
+  rot.s = Select(Lt(Abs(d), eps), zeros, -tmp);
+  rot.c = Select(Lt(Abs(d), eps), ones, Mul(u, tmp));
+
+  XlaOp a_pp_new = rot.c * a_pp - rot.s * a_qp;
+  XlaOp a_pq_new = rot.c * a_pq - rot.s * a_qq;
+  XlaOp a_qq_new = rot.s * a_pq + rot.c * a_qq;
+
+  OneSidedJacobiRotation rots;
+  TF_ASSIGN_OR_RETURN(rots.rot_r,
+                      MakeJacobi(a_pp_new, a_qq_new, a_pq_new, eps));
+
+  rots.rot_l.c = rot.c * rots.rot_r.c - rot.s * rots.rot_r.s;
+  rots.rot_l.s = rot.s * rots.rot_r.c + rot.c * rots.rot_r.s;
+
+  return rots;
+}
+
+// Apply one-sided Jacobi on elements at indices pp, pq, qp, qq.
+StatusOr<SVDResult> OneSidedJacobiUpdate(SVDResult svd_result, XlaOp p, XlaOp q,
+                                         XlaOp eps) {
+  XlaOp u = svd_result.u;
+  XlaOp v = svd_result.v;
+  XlaOp d = svd_result.d;
+  XlaBuilder* builder = d.builder();
+  TF_ASSIGN_OR_RETURN(Shape d_shape, builder->GetShape(d));
+  const int64 num_dims = d_shape.rank();
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(d_shape, i);
+  }
+  const int64 m = ShapeUtil::GetDimension(d_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(d_shape, -1);
+
+  TF_ASSIGN_OR_RETURN(OneSidedJacobiRotation onesided_jacobi,
+                      GetOneSidedJacobiRotation(d, p, q, eps));
+
+  auto zero = ScalarLike(p, 0);
+
+  // Zero out a_{pq} explicitly.
+  std::vector<int64> pq_dims(batch_dims.begin(), batch_dims.end());
+  pq_dims.push_back(1);
+  pq_dims.push_back(1);
+  auto pq_zero = ScalarLike(d, 0.0);
+  auto pq_zeros = Broadcast(pq_zero, pq_dims);
+
+  std::vector<int64> broadcast_dims(batch_dims.size());
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims.push_back(num_dims - 1);
+
+  // Apply Jacobi Rotation on the left.
+  auto slice_p = DynamicSliceInMinorDims(d, {p, zero}, {1, n});
+  auto slice_q = DynamicSliceInMinorDims(d, {q, zero}, {1, n});
+  auto slice_p_new =
+      onesided_jacobi.rot_l.c * slice_p - onesided_jacobi.rot_l.s * slice_q;
+  auto slice_q_new =
+      onesided_jacobi.rot_l.s * slice_p + onesided_jacobi.rot_l.c * slice_q;
+  d = DynamicUpdateSliceInMinorDims(d, slice_p_new, {p, zero});
+  d = DynamicUpdateSliceInMinorDims(d, slice_q_new, {q, zero});
+
+  // Apply Jacobi Rotation on the right.
+  slice_p = DynamicSliceInMinorDims(d, {zero, p}, {m, 1});
+  slice_q = DynamicSliceInMinorDims(d, {zero, q}, {m, 1});
+  slice_p_new =
+      onesided_jacobi.rot_r.c * slice_p - onesided_jacobi.rot_r.s * slice_q;
+  slice_q_new =
+      onesided_jacobi.rot_r.s * slice_p + onesided_jacobi.rot_r.c * slice_q;
+  d = DynamicUpdateSliceInMinorDims(d, slice_p_new, {zero, p});
+  d = DynamicUpdateSliceInMinorDims(d, slice_q_new, {zero, q});
+
+  d = DynamicUpdateSliceInMinorDims(d, pq_zeros, {p, q});
+  d = DynamicUpdateSliceInMinorDims(d, pq_zeros, {q, p});
+
+  // Apply left Jacobi Rotation on U.
+  slice_p = DynamicSliceInMinorDims(u, {zero, p}, {m, 1});
+  slice_q = DynamicSliceInMinorDims(u, {zero, q}, {m, 1});
+  slice_p_new =
+      onesided_jacobi.rot_l.c * slice_p - onesided_jacobi.rot_l.s * slice_q;
+
+  slice_p_new = Mul(
+      slice_p_new,
+      Rsqrt(Reduce(Square(slice_p_new), pq_zero,
+                   CreateScalarAddComputation(d_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  slice_q_new =
+      onesided_jacobi.rot_l.s * slice_p + onesided_jacobi.rot_l.c * slice_q;
+
+  slice_q_new = Mul(
+      slice_q_new,
+      Rsqrt(Reduce(Square(slice_q_new), pq_zero,
+                   CreateScalarAddComputation(d_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  u = DynamicUpdateSliceInMinorDims(u, slice_p_new, {zero, p});
+  u = DynamicUpdateSliceInMinorDims(u, slice_q_new, {zero, q});
+
+  // Apply right Jacobi Rotation on V.
+  slice_p = DynamicSliceInMinorDims(v, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(v, {zero, q}, {n, 1});
+  slice_p_new =
+      onesided_jacobi.rot_r.c * slice_p - onesided_jacobi.rot_r.s * slice_q;
+
+  slice_p_new = Mul(
+      slice_p_new,
+      Rsqrt(Reduce(Square(slice_p_new), pq_zero,
+                   CreateScalarAddComputation(d_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  slice_q_new =
+      onesided_jacobi.rot_r.s * slice_p + onesided_jacobi.rot_r.c * slice_q;
+
+  slice_q_new = Mul(
+      slice_q_new,
+      Rsqrt(Reduce(Square(slice_q_new), pq_zero,
+                   CreateScalarAddComputation(d_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  v = DynamicUpdateSliceInMinorDims(v, slice_p_new, {zero, p});
+  v = DynamicUpdateSliceInMinorDims(v, slice_q_new, {zero, q});
+
+  svd_result.d = d;
+  svd_result.u = u;
+  svd_result.v = v;
+
+  return svd_result;
+}
+
+StatusOr<FrobeniusNorms> ComputeFrobeniusNorms(XlaOp w) {
+  XlaBuilder* builder = w.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(w));
+  const int64 num_dims = shape.rank();
+  auto frobenius_norm =
+      Sqrt(Reduce(Square(w), ScalarLike(w, 0.0),
+                  CreateScalarAddComputation(shape.element_type(), builder),
+                  {num_dims - 2, num_dims - 1}));
+  auto diag = GetMatrixDiagonal(w);
+  auto diag_square =
+      Reduce(Square(diag), ScalarLike(w, 0.0),
+             CreateScalarAddComputation(shape.element_type(), builder),
+             {num_dims - 2});
+
+  FrobeniusNorms frobenius_norms;
+
+  frobenius_norms.off_diagonal_norm =
+      Sqrt(Max(Square(frobenius_norm) - diag_square, ScalarLike(w, 0.0)));
+  frobenius_norms.total_norm = frobenius_norm;
+
+  return frobenius_norms;
+}
+
+// Main boby of One-sided Jacobi Method.
+StatusOr<std::vector<XlaOp>> WhileLoopFn(
+    absl::Span<const XlaOp> initial_values,  //
+    int matrix_dimension,                    //
+    int max_sweep_updates,                   //
+    absl::string_view name,                  //
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    auto k = values[0];
+    auto max_sweeps = ScalarLike(k, max_sweep_updates);
+    auto sweep_update_cond = Gt(max_sweeps, k);
+
+    auto norms = ComputeFrobeniusNorms(values[3]).ValueOrDie();
+    auto tol = norms.total_norm * values[4];
+    auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm),
+                              xla::ConstantR0<bool>(cond_builder, false),
+                              CreateScalarOrComputation(PRED, cond_builder));
+
+    return And(sweep_update_cond, tol_cond);
+  };
+
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    auto while_cond_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_cond_builder) -> StatusOr<XlaOp> {
+      auto p = values_inner[0];
+      return Lt(p, ScalarLike(p, matrix_dimension - 1));
+    };
+
+    auto while_body_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_body_builder) -> StatusOr<std::vector<XlaOp>> {
+      auto while_cond_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_cond_builder) -> StatusOr<XlaOp> {
+        auto q = values_innermost[1];
+        return Lt(q, ScalarLike(q, matrix_dimension));
+      };
+      auto while_body_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_body_builder)
+          -> StatusOr<std::vector<XlaOp>> {
+        auto p = values_innermost[0];
+        auto q = values_innermost[1];
+
+        SVDResult onesided_jacobi_update;
+        onesided_jacobi_update.u = values_innermost[2];
+        onesided_jacobi_update.v = values_innermost[3];
+        onesided_jacobi_update.d = values_innermost[4];
+
+        auto eps = values_innermost[5];
+
+        TF_ASSIGN_OR_RETURN(
+            onesided_jacobi_update,
+            OneSidedJacobiUpdate(onesided_jacobi_update, p, q, eps));
+
+        std::vector<XlaOp> updated_values_innermost;
+        updated_values_innermost.reserve(values_innermost.size());
+
+        updated_values_innermost.push_back(p);
+        updated_values_innermost.push_back(q + ScalarLike(q, 1));
+        updated_values_innermost.push_back(onesided_jacobi_update.u);
+        updated_values_innermost.push_back(onesided_jacobi_update.v);
+        updated_values_innermost.push_back(onesided_jacobi_update.d);
+        updated_values_innermost.push_back(eps);
+
+        return updated_values_innermost;
+      };
+
+      std::vector<XlaOp> values_innermost(6);
+      auto p = values_inner[0];
+      auto q = p + ScalarLike(p, 1);
+      values_innermost[0] = p;                // index p.
+      values_innermost[1] = q;                // index q.
+      values_innermost[2] = values_inner[1];  // u.
+      values_innermost[3] = values_inner[2];  // v.
+      values_innermost[4] = values_inner[3];  // d.
+      values_innermost[5] = values_inner[4];  // eps.
+      TF_ASSIGN_OR_RETURN(
+          values_innermost,
+          WhileLoopHelper(while_cond_fn_innermost, while_body_fn_innermost,
+                          values_innermost, absl::StrCat(name, "-Innermost"),
+                          inner_body_builder));
+
+      std::vector<XlaOp> updated_values_inner;
+      updated_values_inner.reserve(values_inner.size());
+
+      updated_values_inner.push_back(p + ScalarLike(p, 1));
+      updated_values_inner.push_back(values_innermost[2]);
+      updated_values_inner.push_back(values_innermost[3]);
+      updated_values_inner.push_back(values_innermost[4]);
+      updated_values_inner.push_back(values_innermost[5]);
+      return updated_values_inner;
+    };
+    // Indexes.
+    XlaOp k = values[0];
+
+    std::vector<XlaOp> values_inner(5);
+    values_inner[0] = ScalarLike(k, 0);  // index p.
+    values_inner[1] = values[1];         // u.
+    values_inner[2] = values[2];         // v.
+    values_inner[3] = values[3];         // d.
+    values_inner[4] = values[4];         // eps.
+    TF_ASSIGN_OR_RETURN(
+        values_inner,
+        WhileLoopHelper(while_cond_fn_inner, while_body_fn_inner, values_inner,
+                        absl::StrCat(name, "-Inner"), body_builder));
+
+    std::vector<XlaOp> updated_values;
+    updated_values.reserve(values_inner.size());
+
+    updated_values.push_back(k + ScalarLike(k, 1));
+    updated_values.push_back(values_inner[1]);
+    updated_values.push_back(values_inner[2]);
+    updated_values.push_back(values_inner[3]);
+    updated_values.push_back(values_inner[4]);
+
+    return updated_values;
+  };
+  std::vector<XlaOp> values;
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              initial_values, name, builder));
+
+  return values;
+}
+
+// Sort singular values in decending order, and make sure they are non-negative
+// by flipping the signs of negative diagonal values and transferring the signs
+// to V. And for numeric stability, renormalize U and V.
+StatusOr<SVDResult> SortBySingularValuesAndPostProcessing(SVDResult result) {
+  XlaBuilder* builder = result.d.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(result.d));
+  const int64 num_dims = shape.rank();
+  auto dimensions = shape.dimensions();
+  const int64 m = ShapeUtil::GetDimension(shape, -2);
+  const int64 n = ShapeUtil::GetDimension(shape, -1);
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims[num_dims - 2] = num_dims - 1;
+
+  auto d = GetMatrixDiagonal(result.d);
+
+  auto zeros = ZerosLike(d);
+  auto one = ScalarLike(d, 1.0);
+
+  // Make all the singular values to be non-negative by transferring the signs
+  // to V.
+  auto sign = Select(Ge(d, zeros), zeros + one, zeros - one);
+  d = Select(Ge(d, zeros), d, -d);
+  result.v = Mul(result.v, sign, broadcast_dims);
+
+  d = BroadcastInDim(d, dimensions, broadcast_dims);
+
+  // As m >= n, only first m columns vectors are needed to be permuted, and the
+  // rest of m - n vectors are appended after the sorting is done.
+  XlaOp sort_u_result =
+      Sort({-d, SliceInMinorDims(result.u, {0, 0}, {m, n})},
+           CreateScalarLtComputation(
+               {shape.element_type(), shape.element_type()}, builder),
+           num_dims - 1);
+
+  // TODO(kuny): using CreateScalarGtComputation after b/124862300 is fixed.
+  XlaOp sort_v_result =
+      Sort({SliceInMinorDims(-d, {0, 0}, {n, n}), result.v},
+           CreateScalarLtComputation(
+               {shape.element_type(), shape.element_type()}, builder),
+           num_dims - 1);
+  // Make sure all the signular values are non-negative.
+  result.d = Max(-GetMatrixDiagonal(GetTupleElement(sort_v_result, 0)),
+                 ScalarLike(d, 0.0));
+
+  result.v = GetTupleElement(sort_v_result, 1);
+  result.v = Mul(
+      result.v,
+      Rsqrt(Reduce(Square(result.v), ScalarLike(d, 0.0),
+                   CreateScalarAddComputation(shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  // Append the rest of m - n vectors.
+  result.u = ConcatInDim(builder,
+                         {GetTupleElement(sort_u_result, 1),
+                          SliceInMinorDims(result.u, {0, n}, {m, m})},
+                         num_dims - 1);
+  result.u = Mul(
+      result.u,
+      Rsqrt(Reduce(Square(result.u), ScalarLike(d, 0.0),
+                   CreateScalarAddComputation(shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  return result;
+}
+
+}  // namespace
+
+// def jacobi_svd(A):
+//    U, D, V = house_bidiag(A)
+//    m, n = D.shape
+//    iter, max_iter = 0, 100
+//    frobenius_norm = np.linalg.norm(D)
+//    diag_norm = np.linalg.norm(np.diag(D))
+//    off_diag_norm = np.sqrt(
+//        frobenius_norm - diag_norm) * np.sqrt(frobenius_norm + diag_norm)
+//    while off_diag_norm > 1e-6 * frobenius_norm and iter < max_iter:
+//        iter += 1
+//        for p in range(m - 1):
+//            for q in range(p + 1, n):
+//                rot_l, rot_r = jacobi_rot(D[p][p], D[p][q], D[q][p], D[q][q])
+//                D[[p, q], :] = np.matmul(rot_l.T, D[[p, q], :])
+//                D[:, [p, q]] = np.matmul(D[:, [p, q]], rot_r)
+//                U[:, [p, q]] = np.matmul(U[:, [p, q]], rot_l)
+//                V[:, [p, q]] = np.matmul(V[:, [p, q]], rot_r)
+//        frobenius_norm = np.linalg.norm(D)
+//        diag_norm = np.linalg.norm(np.diag(D))
+//        off_diag_norm = np.sqrt(
+//            frobenius_norm - diag_norm) * np.sqrt(frobenius_norm + diag_norm)
+//
+//    return U, np.diag(D), V
+//
+SVDResult SVD(XlaOp a, int64 max_iter, float epsilon,
+              PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  auto return_error = [&](const Status& status) {
+    SVDResult result;
+    result.u = builder->ReportError(status);
+    result.v = builder->ReportError(status);
+    result.d = builder->ReportError(status);
+    return result;
+  };
+  auto shape_with_status = builder->GetShape(a);
+  if (!shape_with_status.status().ok()) {
+    return return_error(shape_with_status.status());
+  }
+  Shape a_shape = shape_with_status.ValueOrDie();
+  const int64 num_dims = a_shape.rank();
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
+  }
+  int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  int64 n = ShapeUtil::GetDimension(a_shape, -1);
+  bool maybe_transpose = m < n;
+
+  if (maybe_transpose) {
+    a = TransposeInMinorDims(a);
+    std::swap(m, n);
+  }
+
+  auto eps = ScalarLike(a, epsilon);
+
+  SVDResult svd_result =
+      HouseHolderBidiagonalization(a, eps, precision).ValueOrDie();
+
+  auto output_with_status = WhileLoopFn(
+      {
+          Zero(builder, S32),  // k
+          svd_result.u,        // u
+          svd_result.v,        // v
+          svd_result.d,        // d
+          eps,                 // epsilon
+      },                       //
+      n,                       //
+      max_iter,                //
+      "CyclicOneSidedJacobi",  //
+      builder);
+  if (!output_with_status.status().ok()) {
+    return return_error(output_with_status.status());
+  }
+
+  auto output = output_with_status.ValueOrDie();
+
+  svd_result.u = output[1];
+  svd_result.v = output[2];
+  svd_result.d = output[3];
+  svd_result = SortBySingularValuesAndPostProcessing(svd_result).ValueOrDie();
+  if (maybe_transpose) {
+    std::swap(svd_result.u, svd_result.v);
+  }
+  return svd_result;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/svd.h b/tensorflow/compiler/xla/client/lib/svd.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a29539d9fc635d085bd7ccc48e693fe72307213
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/svd.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SVD_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SVD_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The singular value decomposition of a given matrix A[..., M, N], the original
+// matrix is recovered by u * diag(d) * v_t, where the first dims(A) - 2
+// dimensions are batch dimensions.
+struct SVDResult {
+  // The columns of U are the left-singular vectors, e.g.,
+  // U[..., :, :]_T * U[..., :, :] = I.
+  XlaOp u;
+  // Vector(s) with the singular values, within each vector sorted in descending
+  // order. The first dims(D) - 1 dimensions have the same size as the batch
+  // dimensions of A. And U[..., :, i] * D[..., i] = A[..., :, :] * V[..., :,
+  // i].
+  XlaOp d;
+  // The columns of V are the right-singular vectors. e.g.,
+  // V[..., :, :]_T * U[..., :, :] = I.
+  XlaOp v;
+};
+
+// TODO(kuny): Add a bool flag that supports SVD with economy (reduced)
+// representation, which is more memory efficient, especially in the case of
+// tall-skinny matrices.
+SVDResult SVD(XlaOp a, int64 max_iter = 100, float epsilon = 1e-6,
+              PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SVD_H_
diff --git a/tensorflow/compiler/xla/client/lib/svd_test.cc b/tensorflow/compiler/xla/client/lib/svd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c6ae93d8107b83b2884b851fdb44b3fe77a3c4
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/svd_test.cc
@@ -0,0 +1,278 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/svd.h"
+#include <utility>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+
+class SVDTest : public ClientLibraryTestBase {
+ protected:
+  void SetUp() override {
+    ClientLibraryTestBase::SetUp();
+    batch_3d_4x5_ = Array3D<float>{
+        {
+            {4, 6, 8, 10, 1},
+            {6, 45, 54, 63, 1},
+            {8, 54, 146, 166, 1},
+            {10, 63, 166, 310, 1},
+        },
+        {
+            {16, 24, 8, 12, 6},
+            {24, 61, 82, 48, 5},
+            {8, 82, 100, 6, 4},
+            {12, 48, 6, 62, 3},
+        },
+    };
+  }
+  void TearDown() override { ClientLibraryTestBase::TearDown(); }
+
+  Array3D<float> GetUnitMatrix3D(int32 batch_dim, int32 mat_dim) {
+    Array3D<float> result(batch_dim, mat_dim, mat_dim, 0.0);
+    for (int i = 0; i < batch_dim; ++i) {
+      for (int j = 0; j < mat_dim; ++j) {
+        result({i, j, j}) = 1.0;
+      }
+    }
+    return result;
+  }
+
+  XlaOp ComputeMatmulUDVT(SVDResult result, XlaBuilder* builder) {
+    Shape u_shape = builder->GetShape(result.u).ValueOrDie();
+    Shape v_shape = builder->GetShape(result.v).ValueOrDie();
+
+    int64 m = ShapeUtil::GetDimension(u_shape, -1);
+    int64 n = ShapeUtil::GetDimension(v_shape, -1);
+
+    auto v = result.v;
+    auto u = result.u;
+    auto d = result.d;
+
+    auto zero = Zero(builder, S32);
+    if (m > n) {
+      u = DynamicSliceInMinorDims(u, {zero, zero}, {m, n});
+    } else if (m < n) {
+      v = DynamicSliceInMinorDims(v, {zero, zero}, {n, m});
+    }
+
+    int num_dims = u_shape.rank();
+    std::vector<int64> broadcast_dims(num_dims - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+    broadcast_dims[num_dims - 2] = num_dims - 1;
+    return BatchDot(Mul(u, d, broadcast_dims), TransposeInMinorDims(v),
+                    PrecisionConfig::HIGHEST);
+  }
+
+  Array3D<float> ExtractTriangularMatrix(const Array3D<float>& matrix,
+                                         bool lower) {
+    Array3D<float> result(matrix);
+    for (int i = 0; i < result.n1(); ++i) {
+      for (int j = 0; j < result.n2(); ++j) {
+        if (lower) {
+          for (int k = j + 1; k < result.n3(); ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        } else {
+          for (int k = 0; k < j; ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  XlaOp GetAverageAbsoluteError(XlaOp m1, XlaOp m2, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(m1).ValueOrDie();
+    int64 size = 1;
+    for (auto d : shape.dimensions()) {
+      size *= d;
+    }
+    return ReduceAll(Abs(m1 - m2), ConstantR0WithType(builder, F32, 0),
+                     CreateScalarAddComputation(F32, builder)) /
+           ConstantR0WithType(builder, F32, size);
+  }
+
+  Array2D<float> GenerateRandomMatrix(int xsize, int ysize) {
+    Array2D<float> result{xsize, ysize, 0.0};
+    result.FillRandom(10 /* stddev */, 2 /* mean */);
+    return result;
+  }
+
+  Array3D<float> batch_3d_4x5_;
+};
+
+XLA_TEST_F(SVDTest, Simple2D) {
+  XlaBuilder builder(TestName());
+
+  Array2D<float> simple_2d_4x4_ = Array2D<float>{
+      {4, 6, 8, 10},
+      {6, 45, 54, 63},
+      {8, 54, 146, 166},
+      {10, 63, 166, 310},
+  };
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(simple_2d_4x4_, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  ComputeMatmulUDVT(result, &builder);
+
+  ComputeAndCompareR2<float>(&builder, simple_2d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Test_VWVt_EQ_A_2x4x5) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x5_, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-8);
+  ComputeMatmulUDVT(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x5_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Test_Orthogonality_U) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x5_, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-8);
+  ComputeMatmulUDVT(result, &builder);
+  BatchDot(result.u, TransposeInMinorDims(result.u));
+
+  ComputeAndCompareR3<float>(&builder, GetUnitMatrix3D(2, 4), {a_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(SVDTest, Test_Orthogonality_V) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x5_, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-8);
+  BatchDot(result.v, TransposeInMinorDims(result.v), PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, GetUnitMatrix3D(2, 5), {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, TestSingleValuesMatchNumpy) {
+  XlaBuilder builder(TestName());
+
+  auto singular_values = Array2D<float>{
+      {431.05153007, 49.88334164, 20.94464584, 3.24845468},
+      {179.73128591, 68.05162245, 21.77679503, 13.94319712},
+  };
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x5_, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-8);
+  Add(result.d, ZerosLike(result.d));
+
+  ComputeAndCompareR2<float>(&builder, singular_values, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x128) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(512, 128);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_128x256) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(128, 256);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_256x128) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(256, 128);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_128x512) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(128, 512);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x256) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(512, 256);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x512) {
+  XlaBuilder builder(TestName());
+  Array2D<float> a_val = GenerateRandomMatrix(512, 512);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SVD(a, 100, 1e-6);
+  GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
deleted file mode 100644
index 50a3b30ebd1c15eb6d2ace4e351cb41f21db7093..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// Solves systems of linear equations with lower or upper triangular coefficient
-// matrices by forward- or back-substitution. Broadcasting along leading
-// dimensions, this routine solves one of the matrix systems
-//   `op(a) * x = b`,  or `x * op(a) = b`,
-// for the variable `x` given `a` and `b`, where `op(a)` is either
-//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
-// That is, the innermost matrices in the output satisfy a scalar system
-// depending on the value of the value of (left_side, transpose_a, conjugate_a)
-// according to:
-//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
-//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
-//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
-//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
-//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
-//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
-// where * denotes complex conjugation and where the index `k` is summed over.
-//
-// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
-// square matrices. If lower is true (false), then the strictly upper (lower)
-// triangular part of each innermost matrix in `a` is assumed to be zero and is
-// not accessed.
-// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
-// tensor of shape `[..., K, M]`.
-// `left_side` is a boolean, indicating whether to solve a system of the form
-// op(a) * x = b (true) or x * op(a) = b (false).
-// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
-// (true) or upper-triangular (false).
-// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
-// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
-// conjugated (independently of whether they are transposed), so that when both
-// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
-//
-// Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
-// blocking is used.
-XlaOp TriangularSolve(
-    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
-    bool conjugate_a, int64 block_size = 128,
-    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 48b5f94538f453785194bc434a91ee0a10c020c2..ae7d3d9f9920385476aecb2ce7c2a02b76adc93f 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "llvm/ADT/Triple.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
@@ -185,7 +186,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
   TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot()));
-  TF_RETURN_IF_ERROR(executable_->DumpHloSnapshot());
+  DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot());
   return std::move(result);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index ddb36680e8b185b053368baffa6f1d5cac50dc07..2dd8c130e26e75a7c2690d6848d4b081234eea37 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -67,10 +67,10 @@ class LocalExecutable {
       const ExecutableRunOptions& run_options, const Backend& backend);
 
   // Records the computation in a SessionModule proto with the arguments used to
-  // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
+  // invoke it, and the result. Enabled by flag: --xla_dump_hlo_snapshots.
   //
-  // The given ServiceExecutableRunOptions override any values from TF_XLA_FLAGS
-  // environment variable.
+  // The given ServiceExecutableRunOptions override any values from the
+  // XLA_FLAGS environment variable.
   StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const absl::Span<const ShapedBuffer* const> arguments);
@@ -114,7 +114,7 @@ class LocalClient : public Client {
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given XlaComputation, argument layouts and options.
   //
-  // The given ExecutableBuildOptions override any values from TF_XLA_FLAGS
+  // The given ExecutableBuildOptions overrides any values from XLA_FLAGS
   // environment variable.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(
       const XlaComputation& computation,
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 20298d175dc83abc4fe16212300eb587385ae583..2f574366694a61ee18876596df23eaacec1b1129 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -267,8 +267,8 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
       for (int64 index : target_param_index) {
         param_shape_ptr = param_shape_ptr->mutable_tuple_shapes(index);
       }
-      param_shape_ptr->set_dynamic_dimension(target_dim_num,
-                                             /*is_dynamic=*/true);
+      // TODO(b/121223198): Set `is_dynamic` to the parameter shape when XLA
+      // backend can handle dynamic dimensions.
       *instr.mutable_shape() = param_shape.ToProto();
     }
   }
@@ -299,46 +299,51 @@ XlaComputation XlaBuilder::BuildAndNoteError() {
   return build_status.ConsumeValueOrDie();
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build() {
+Status XlaBuilder::GetCurrentStatus() const {
   if (!first_error_.ok()) {
     string backtrace;
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
-  return Build(instructions_.back().id());
+  return Status::OK();
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
+  return Build(instructions_.back().id(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root) {
+StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root,
+                                           bool remove_dynamic_dimensions) {
   if (root.builder_ != this) {
     return InvalidArgument("Given root operation is not in this computation.");
   }
-  return Build(root.handle());
+  return Build(root.handle(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
-  if (!first_error_.ok()) {
-    string backtrace;
-    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
-    return AppendStatus(first_error_, backtrace);
-  }
+StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id,
+                                           bool remove_dynamic_dimensions) {
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
 
   // TODO(b/121223198): XLA backend cannot handle dynamic dimensions yet, remove
   // all dynamic dimensions before building xla program until we have support in
   // the backend.
-  std::function<void(ShapeProto*)> remove_dynamic_dimension =
-      [&](ShapeProto* shape) {
-        if (shape->tuple_shapes_size() != 0) {
-          for (int64 i = 0; i < shape->tuple_shapes_size(); ++i) {
-            remove_dynamic_dimension(shape->mutable_tuple_shapes(i));
+  if (remove_dynamic_dimensions) {
+    std::function<void(ShapeProto*)> remove_dynamic_dimension =
+        [&](ShapeProto* shape) {
+          if (shape->tuple_shapes_size() != 0) {
+            for (int64 i = 0; i < shape->tuple_shapes_size(); ++i) {
+              remove_dynamic_dimension(shape->mutable_tuple_shapes(i));
+            }
           }
-        }
-        for (int64 i = 0; i < shape->dimensions_size(); ++i) {
-          shape->set_is_dynamic_dimension(i, false);
-        }
-      };
+          for (int64 i = 0; i < shape->dimensions_size(); ++i) {
+            shape->set_is_dynamic_dimension(i, false);
+          }
+        };
 
-  for (auto& instruction : instructions_) {
-    remove_dynamic_dimension(instruction.mutable_shape());
+    for (auto& instruction : instructions_) {
+      remove_dynamic_dimension(instruction.mutable_shape());
+    }
   }
 
   HloComputationProto entry;
@@ -475,7 +480,8 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
 }
 
 XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
-                           absl::Span<const int64> broadcast_dimensions) {
+                           absl::Span<const int64> broadcast_dimensions,
+                           absl::optional<ComparisonDirection> direction) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
@@ -484,6 +490,17 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
                         ShapeInference::InferBinaryOpShape(
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
     *instr.mutable_shape() = shape.ToProto();
+    if (binop == HloOpcode::kCompare) {
+      if (!direction.has_value()) {
+        return InvalidArgument(
+            "kCompare expects a ComparisonDirection, but none provided.");
+      }
+      instr.set_comparison_direction(ComparisonDirectionToString(*direction));
+    } else if (direction.has_value()) {
+      return InvalidArgument(
+          "A comparison direction is provided for a non-compare opcode: %s.",
+          HloOpcodeString(binop));
+    }
 
     const int64 lhs_rank = lhs_shape.rank();
     const int64 rhs_rank = rhs_shape.rank();
@@ -497,16 +514,19 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
       const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
 
       std::vector<int64> to_size;
-      for (int64 size : shape.dimensions()) {
-        to_size.push_back(size);
+      std::vector<bool> to_size_is_dynamic;
+      for (int i = 0; i < shape.rank(); i++) {
+        to_size.push_back(shape.dimensions(i));
+        to_size_is_dynamic.push_back(shape.is_dynamic_dimension(i));
       }
       for (int64 from_dim = 0; from_dim < from_shape.rank(); from_dim++) {
         int64 to_dim = broadcast_dimensions[from_dim];
         to_size[to_dim] = from_shape.dimensions(from_dim);
+        to_size_is_dynamic[to_dim] = from_shape.is_dynamic_dimension(from_dim);
       }
 
-      const Shape& broadcasted_shape =
-          ShapeUtil::MakeShape(from_shape.element_type(), to_size);
+      const Shape& broadcasted_shape = ShapeUtil::MakeShape(
+          from_shape.element_type(), to_size, to_size_is_dynamic);
       TF_ASSIGN_OR_RETURN(
           XlaOp broadcasted_operand,
           InDimBroadcast(broadcasted_shape, from, broadcast_dimensions));
@@ -566,16 +586,6 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
   });
 }
 
-XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -665,8 +675,17 @@ XlaOp XlaBuilder::BroadcastInDim(
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     // Output shape, in the case of degenerate broadcast, the out_dim_size is
     // not necessarily the same as the dimension sizes of the output shape.
-    const auto& output_shape =
+    auto output_shape =
         ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size);
+    for (int i = 0; i < broadcast_dimensions.size(); i++) {
+      if (broadcast_dimensions[i] < 0 ||
+          broadcast_dimensions[i] > out_dim_size.size()) {
+        return InvalidArgument("Broadcast dimension %lld is out of bound",
+                               broadcast_dimensions[i]);
+      }
+      output_shape.set_dynamic_dimension(broadcast_dimensions[i],
+                                         operand_shape.is_dynamic_dimension(i));
+    }
 
     TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(
                            operand_shape, output_shape, broadcast_dimensions)
@@ -988,36 +1007,6 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
   });
 }
 
-XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs,
                       const PrecisionConfig* precision_config) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1038,6 +1027,18 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    // If one operand is a scalar, just multiply the two operands.
+    if (ShapeUtil::IsScalar(lhs_shape) || ShapeUtil::IsScalar(rhs_shape)) {
+      if (dimension_numbers.rhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.lhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.rhs_contracting_dimensions_size() != 0 ||
+          dimension_numbers.lhs_contracting_dimensions_size() != 0) {
+        return InvalidArgument(
+            "Dots with scalar operands must have no contracting or batch "
+            "dimensions");
+      }
+      return xla::Mul(lhs, rhs);
+    }
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
@@ -1533,147 +1534,6 @@ XlaOp XlaBuilder::CustomCall(
   });
 }
 
-XlaOp XlaBuilder::Complex(const XlaOp& real, const XlaOp& imag,
-                          absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Conj(const XlaOp& operand) {
-  return Complex(Real(operand), Neg(Imag(operand)));
-}
-
-XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kXor, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Not(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNot, operand);
-}
-
-XlaOp XlaBuilder::ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                            absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightArithmetic(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightLogical(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Abs(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kAbs, operand);
-}
-
-XlaOp XlaBuilder::Atan2(const XlaOp& y, const XlaOp& x,
-                        absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Exp(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExp, operand);
-}
-
-XlaOp XlaBuilder::Expm1(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExpm1, operand);
-}
-
-XlaOp XlaBuilder::Floor(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kFloor, operand);
-}
-
-XlaOp XlaBuilder::Ceil(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCeil, operand);
-}
-
-XlaOp XlaBuilder::Round(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kRoundNearestAfz, operand);
-}
-
-XlaOp XlaBuilder::Log(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog, operand);
-}
-
-XlaOp XlaBuilder::Log1p(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog1p, operand);
-}
-
-XlaOp XlaBuilder::Sign(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSign, operand);
-}
-
-XlaOp XlaBuilder::Clz(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kClz, operand);
-}
-
-XlaOp XlaBuilder::Cos(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCos, operand);
-}
-
-XlaOp XlaBuilder::Sin(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSin, operand);
-}
-
-XlaOp XlaBuilder::Tanh(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kTanh, operand);
-}
-
-XlaOp XlaBuilder::Real(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kReal, operand);
-}
-
-XlaOp XlaBuilder::Imag(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kImag, operand);
-}
-
-XlaOp XlaBuilder::IsFinite(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kIsFinite, operand);
-}
-
 XlaOp XlaBuilder::Transpose(const XlaOp& operand,
                             absl::Span<const int64> permutation) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1704,36 +1564,146 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   });
 }
 
+namespace {
+// Switch from a floating point value to a integer value in such a way that when
+// using the integer value to compare, we get the same result for normal values,
+// and -Nan is treated as the smallest value, and Nan is treated as the largest
+// value.
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? numeric_limits<int32>::max() - x : x;
+// then y is ordered as an int32 such that finite values have the obvious order,
+// -0 is ordered before 0, and -NaN and NaN appear at the beginning and end of
+// the ordering.
+// Note that in order to avoid -x to overflow, we calculate
+// numeric_limits<int32>::max() - x as unsigned, and then convert back to
+// signed.
+XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
+                                            int64 bit_width) {
+  PrimitiveType signed_type;
+  PrimitiveType unsigned_type;
+  XlaOp max_value;
+  switch (bit_width) {
+    case 16:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint16>(std::numeric_limits<int16>::max()));
+      signed_type = S16;
+      unsigned_type = U16;
+      break;
+    case 32:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint32>(std::numeric_limits<int32>::max()));
+      signed_type = S32;
+      unsigned_type = U32;
+      break;
+    case 64:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint64>(std::numeric_limits<int64>::max()));
+      signed_type = S64;
+      unsigned_type = U64;
+      break;
+    default:
+      return value.builder()->ReportError(
+          InvalidArgument("Invalid bit width %lld for Comparator floating "
+                          "point parameter.",
+                          bit_width));
+  }
+  auto signed_value = BitcastConvertType(value, signed_type);
+  auto unsigned_value = BitcastConvertType(value, unsigned_type);
+  auto flipped_value =
+      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
+  auto is_negative =
+      Lt(signed_value,
+         ConstantLiteral(value.builder(), LiteralUtil::Zero(signed_type)));
+  return Select(is_negative, flipped_value, signed_value);
+}
+}  // namespace
+
 XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                        int64 dimension) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    std::vector<XlaOp> operands{keys};
+    for (const XlaOp& value : values) {
+      operands.push_back(value);
+    }
+    // Build the default less-than comparator (copied from lib/comparators.cc).
+    // TODO(b/122298745): Remove the deprecated API method so that this code
+    // duplication can be deleted.
+    auto b = this->CreateSubBuilder("comparator");
+    std::vector<PrimitiveType> operand_types;
+    for (const XlaOp& operand : operands) {
+      TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(operand));
+      operand_types.push_back(operand_shape.element_type());
+    }
+
+    int64 parameter_count = 0;
+    XlaOp first_lhs_param;
+    XlaOp first_rhs_param;
+
+    for (auto operand_type : operand_types) {
+      auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
+      auto lhs_param =
+          b->Parameter(parameter_count * 2, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".lhs"));
+      auto rhs_param =
+          b->Parameter(parameter_count * 2 + 1, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".rhs"));
+      if (parameter_count == 0) {
+        first_lhs_param = lhs_param;
+        first_rhs_param = rhs_param;
+      }
+      ++parameter_count;
+    }
+    if (primitive_util::IsFloatingPointType(operand_types[0])) {
+      PrimitiveType compare_type = operand_types[0];
+      // Special-case handling for BF16. We currently do not support direct
+      // comparisons with BF16, so we convert to F32 and then use the F32
+      // comparison logic.
+      if (compare_type == BF16) {
+        compare_type = F32;
+        first_lhs_param = b->ConvertElementType(first_lhs_param, F32);
+        first_rhs_param = b->ConvertElementType(first_rhs_param, F32);
+      }
+      int64 bit_width = primitive_util::BitWidth(compare_type);
+      first_lhs_param =
+          BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
+      first_rhs_param =
+          BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
+    }
+    Lt(first_lhs_param, first_rhs_param);
+
+    TF_ASSIGN_OR_RETURN(auto comparator, b->Build());
+    return Sort(operands, comparator, dimension, /*is_stable=*/false);
+  });
+}
+
+XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
+                       const XlaComputation& comparator, int64 dimension,
+                       bool is_stable) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
+    instr.set_is_stable(is_stable);
     std::vector<const Shape*> operand_shape_ptrs;
-    TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
-    operand_shape_ptrs.push_back(&keys_shape);
-    TF_ASSIGN_OR_RETURN(std::vector<Shape> values_shapes,
-                        GetOperandShapes(values));
-    absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs),
+    TF_ASSIGN_OR_RETURN(std::vector<Shape> operand_shapes,
+                        GetOperandShapes(operands));
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape(
                                          HloOpcode::kSort, operand_shape_ptrs));
     *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
-      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
+      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(operands[0]));
       dimension = keys_shape.rank() - 1;
     }
     instr.add_dimensions(dimension);
-    std::vector<XlaOp> operands{keys};
-    operands.insert(operands.end(), values.begin(), values.end());
+    AddCalledComputation(comparator, &instr);
     return AddInstruction(std::move(instr), HloOpcode::kSort, operands);
   });
 }
 
-XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1759,10 +1729,6 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
   });
 }
 
-XlaOp XlaBuilder::Neg(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNegate, operand);
-}
-
 XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand,
                         const XlaOp& max) {
   return TernaryOp(HloOpcode::kClamp, min, operand, max);
@@ -1926,32 +1892,46 @@ XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                               const XlaComputation& true_computation,
                               const XlaOp& false_operand,
                               const XlaComputation& false_computation) {
+  // The index of true_computation must be 0 and that of false computation
+  // must be 1.
+  return Conditional(predicate, {&true_computation, &false_computation},
+                     {true_operand, false_operand});
+}
+
+XlaOp XlaBuilder::Conditional(
+    const XlaOp& branch_index,
+    absl::Span<const XlaComputation* const> branch_computations,
+    absl::Span<const XlaOp> branch_operands) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
-    TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate));
-    TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape,
-                        GetShape(true_operand));
-    TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape,
-                        true_computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape,
-                        GetShape(false_operand));
-    TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
-                        false_computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(
-        Shape shape,
-        ShapeInference::InferConditionalShape(
-            predicate_shape, true_operand_shape, false_operand_shape,
-            true_computation_shape, false_computation_shape));
+    TF_ASSIGN_OR_RETURN(const Shape& branch_index_shape,
+                        GetShape(branch_index));
+    std::vector<Shape> branch_operand_shapes(branch_operands.size());
+    std::vector<ProgramShape> branch_computation_shapes(
+        branch_computations.size());
+    for (int j = 0; j < branch_operands.size(); ++j) {
+      TF_ASSIGN_OR_RETURN(branch_operand_shapes[j],
+                          GetShape(branch_operands[j]));
+      TF_ASSIGN_OR_RETURN(branch_computation_shapes[j],
+                          branch_computations[j]->GetProgramShape());
+    }
+    TF_ASSIGN_OR_RETURN(const Shape shape,
+                        ShapeInference::InferConditionalShape(
+                            branch_index_shape, branch_computation_shapes,
+                            branch_operand_shapes));
     *instr.mutable_shape() = shape.ToProto();
 
-    // The index of true_computation must be 0 and that of false computation
-    // must be 1.
-    AddCalledComputation(true_computation, &instr);
-    AddCalledComputation(false_computation, &instr);
+    for (const XlaComputation* branch_computation : branch_computations) {
+      AddCalledComputation(*branch_computation, &instr);
+    }
 
+    std::vector<XlaOp> operands(1, branch_index);
+    for (const XlaOp branch_operand : branch_operands) {
+      operands.emplace_back(branch_operand);
+    }
     return AddInstruction(std::move(instr), HloOpcode::kConditional,
-                          {predicate, true_operand, false_operand});
+                          absl::MakeSpan(operands));
   });
 }
 
@@ -2143,8 +2123,8 @@ XlaOp XlaBuilder::CrossReplicaSum(
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
     const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {});
     auto b = CreateSubBuilder("sum");
-    b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
-           b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
+    Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
+        b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
     TF_ASSIGN_OR_RETURN(auto computation, b->Build());
     return CrossReplicaSum(operand, computation, replica_groups,
                            /*channel_id=*/absl::nullopt);
@@ -2253,6 +2233,14 @@ XlaOp XlaBuilder::CollectivePermute(
   });
 }
 
+XlaOp XlaBuilder::ReplicaId() {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeShape(U32, {}).ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kReplicaId, {});
+  });
+}
+
 XlaOp XlaBuilder::SelectAndScatter(const XlaOp& operand,
                                    const XlaComputation& select,
                                    absl::Span<const int64> window_dimensions,
@@ -2932,32 +2920,39 @@ XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index) {
 
 XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Eq(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kEq);
 }
 
 XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ne(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kNe);
 }
 
 XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ge(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGe);
 }
 
 XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Gt(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kGt);
 }
 
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Lt(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLe);
 }
 
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Le(lhs, rhs, broadcast_dimensions);
+  return Compare(lhs, rhs, broadcast_dimensions, ComparisonDirection::kLt);
+}
+
+XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+              absl::Span<const int64> broadcast_dimensions,
+              ComparisonDirection direction) {
+  return lhs.builder()->BinaryOp(HloOpcode::kCompare, lhs, rhs,
+                                 broadcast_dimensions, direction);
 }
 
 XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
@@ -3031,6 +3026,44 @@ XlaOp Fft(const XlaOp& operand, FftType fft_type,
   return operand.builder()->Fft(operand, fft_type, fft_length);
 }
 
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(const Shape& b_shape, builder->GetShape(b));
+    xla::TriangularSolveOptions& options =
+        *instr.mutable_triangular_solve_options();
+    options.set_left_side(left_side);
+    options.set_lower(lower);
+    options.set_unit_diagonal(unit_diagonal);
+    options.set_transpose_a(transpose_a);
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTriangularSolveShape(
+                                         a_shape, b_shape, options));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return builder->AddInstruction(std::move(instr),
+                                   HloOpcode::kTriangularSolve, {a, b});
+  });
+}
+
+XlaOp Cholesky(XlaOp a, bool lower) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& a_shape, builder->GetShape(a));
+    xla::CholeskyOptions& options = *instr.mutable_cholesky_options();
+    options.set_lower(lower);
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferCholeskyShape(a_shape));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return builder->AddInstruction(std::move(instr), HloOpcode::kCholesky, {a});
+  });
+}
+
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape, const string& config) {
   return builder->Infeed(shape, config);
 }
@@ -3060,78 +3093,96 @@ XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
                              operand_shapes_with_layout);
 }
 
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+XlaOp Complex(const XlaOp& lhs, const XlaOp& rhs,
               absl::Span<const int64> broadcast_dimensions) {
-  return real.builder()->Complex(real, imag, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kComplex, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Conj(const XlaOp& operand) { return operand.builder()->Conj(operand); }
+XlaOp Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Add(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAdd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Sub(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kSubtract, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Mul(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMultiply, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Div(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kDivide, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Rem(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kRemainder, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Max(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMaximum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Min(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMinimum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->And(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAnd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Or(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kOr, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Xor(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kXor, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Not(const XlaOp& operand) { return operand.builder()->Not(operand); }
+XlaOp Not(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNot, operand);
+}
 
 XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
                 absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftLeft(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftLeft, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightArithmetic(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                         absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightLogical(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
@@ -3203,6 +3254,8 @@ XlaOp CollectivePermute(
   return operand.builder()->CollectivePermute(operand, source_target_pairs);
 }
 
+XlaOp ReplicaId(XlaBuilder* builder) { return builder->ReplicaId(); }
+
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                        absl::Span<const int64> window_dimensions,
                        absl::Span<const int64> window_strides, Padding padding,
@@ -3224,48 +3277,73 @@ XlaOp SelectAndScatterWithGeneralPadding(
       init_value, scatter);
 }
 
-XlaOp Abs(const XlaOp& operand) { return operand.builder()->Abs(operand); }
+XlaOp Abs(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kAbs, operand);
+}
 
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+XlaOp Atan2(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions) {
-  return y.builder()->Atan2(y, x, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAtan2, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Exp(const XlaOp& operand) { return operand.builder()->Exp(operand); }
-
-XlaOp Expm1(const XlaOp& operand) { return operand.builder()->Expm1(operand); }
-
-XlaOp Floor(const XlaOp& operand) { return operand.builder()->Floor(operand); }
-
-XlaOp Ceil(const XlaOp& operand) { return operand.builder()->Ceil(operand); }
-
-XlaOp Round(const XlaOp& operand) { return operand.builder()->Round(operand); }
-
-XlaOp Log(const XlaOp& operand) { return operand.builder()->Log(operand); }
-
-XlaOp Log1p(const XlaOp& operand) { return operand.builder()->Log1p(operand); }
-
-XlaOp Sign(const XlaOp& operand) { return operand.builder()->Sign(operand); }
-
-XlaOp Clz(const XlaOp& operand) { return operand.builder()->Clz(operand); }
-
-XlaOp Cos(const XlaOp& operand) { return operand.builder()->Cos(operand); }
-
-XlaOp Sin(const XlaOp& operand) { return operand.builder()->Sin(operand); }
-
-XlaOp Tanh(const XlaOp& operand) { return operand.builder()->Tanh(operand); }
-
-XlaOp Real(const XlaOp& operand) { return operand.builder()->Real(operand); }
-
-XlaOp Imag(const XlaOp& operand) { return operand.builder()->Imag(operand); }
+XlaOp Exp(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExp, operand);
+}
+XlaOp Expm1(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExpm1, operand);
+}
+XlaOp Floor(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kFloor, operand);
+}
+XlaOp Ceil(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCeil, operand);
+}
+XlaOp Round(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kRoundNearestAfz, operand);
+}
+XlaOp Log(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog, operand);
+}
+XlaOp Log1p(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog1p, operand);
+}
+XlaOp Sign(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSign, operand);
+}
+XlaOp Clz(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kClz, operand);
+}
+XlaOp Cos(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCos, operand);
+}
+XlaOp Sin(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSin, operand);
+}
+XlaOp Tanh(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kTanh, operand);
+}
+XlaOp Real(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kReal, operand);
+}
+XlaOp Imag(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kImag, operand);
+}
+XlaOp Sqrt(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSqrt, operand);
+}
+XlaOp Rsqrt(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kRsqrt, operand);
+}
 
 XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Pow(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kPower, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp IsFinite(const XlaOp& operand) {
-  return operand.builder()->IsFinite(operand);
+  return operand.builder()->UnaryOp(HloOpcode::kIsFinite, operand);
 }
 
 XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type) {
@@ -3276,7 +3354,9 @@ XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type) {
   return operand.builder()->BitcastConvertType(operand, new_element_type);
 }
 
-XlaOp Neg(const XlaOp& operand) { return operand.builder()->Neg(operand); }
+XlaOp Neg(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNegate, operand);
+}
 
 XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation) {
   return operand.builder()->Transpose(operand, permutation);
@@ -3290,6 +3370,12 @@ XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values, int64 dimension) {
   return keys.builder()->Sort(keys, values, dimension);
 }
 
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension, bool is_stable) {
+  return operands[0].builder()->Sort(operands, comparator, dimension,
+                                     is_stable);
+}
+
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) {
   return min.builder()->Clamp(min, operand, max);
 }
@@ -3322,6 +3408,13 @@ XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                                           false_computation);
 }
 
+XlaOp Conditional(const XlaOp& branch_index,
+                  absl::Span<const XlaComputation* const> branch_computations,
+                  absl::Span<const XlaOp> branch_operands) {
+  return branch_index.builder()->Conditional(branch_index, branch_computations,
+                                             branch_operands);
+}
+
 XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                       const int mantissa_bits) {
   return operand.builder()->ReducePrecision(operand, exponent_bits,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 8908d172fa89632ead48f954de12066af12411c7..80f93a8b6de98e124bf9fed3969ffcec7c4a95c4 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/comparison_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
@@ -56,6 +57,9 @@ class XlaOp {
   }
   ~XlaOp() = default;
 
+  XlaOp(const XlaOp& other) = default;
+  XlaOp& operator=(const XlaOp& other) = default;
+
   // Precondition: !IsUninitialized().
   //
   // It's very common to do foo.builder()->bar().  Without this precondition, if
@@ -197,11 +201,19 @@ class XlaBuilder {
   // status. Note that all ops that have been enqueued will be moved to the
   // computation being returned. The root of the computation will be the last
   // added operation.
-  StatusOr<XlaComputation> Build();
+  //
+  // `remove_dynamic_dimensions` tells the builder whether to remove the
+  // dyanmic dimensions information in all ops.
+  //
+  // TODO(b/121223198): Delete `remove_dynamic_dimensions` and keeps the
+  // dynamic dimensions information when XLA backend can handle dynamic
+  // dimensions.
+  StatusOr<XlaComputation> Build(bool remove_dynamic_dimensions = true);
 
   // Overload of Build which specifies a particular root instruction for the
   // computation.
-  StatusOr<XlaComputation> Build(XlaOp root);
+  StatusOr<XlaComputation> Build(XlaOp root,
+                                 bool remove_dynamic_dimensions = true);
 
   // Builds the computation with the requested operations, or notes an error in
   // the parent XlaBuilder and returns an empty computation if building failed.
@@ -227,6 +239,10 @@ class XlaBuilder {
   // See also set_die_immediately_on_error().
   Status first_error() const { return first_error_; }
 
+  // Returns the current status of the builder, complete with the stack trace
+  // information.
+  Status GetCurrentStatus() const;
+
   // Returns the shape of the given op.
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
@@ -269,6 +285,10 @@ class XlaBuilder {
   // and its real dynamic size is represented by `dynamic_param_index` in
   // parameter `dynamic_param_num`.
   //
+  // Note that this should be called before the dynamic parameters are used to
+  // create other operations, otherwise created operations won't have the
+  // dynamic dimensions information.
+  //
   // TODO(b/119520625): Remove this API once we have more dynamic shape infra
   // ready.
   Status SetDynamicBinding(int64 dynamic_size_param_num,
@@ -284,16 +304,19 @@ class XlaBuilder {
     input_output_aliases_.push_back({output_index, param_number, param_index});
   }
 
- private:
   // Describes an input/output alias as inserted by the SetUpAlias() API.
   struct InputOutputAlias {
+    // Specifies the index of the aliased buffer in the result tuple.
     ShapeIndex output_index;
+    // Specifies the parameter containing the buffer to be aliased.
     int64 param_number;
+    // Specifies the index of the aliased buffer in the parameter
     ShapeIndex param_index;
   };
 
+ private:
   // Build helper which takes the id of the root operation..
-  StatusOr<XlaComputation> Build(int64 root_id);
+  StatusOr<XlaComputation> Build(int64 root_id, bool remove_dynamic_dimensions);
 
   // Description for the methods below can be found in the corresponding public
   // functions section in this file.
@@ -303,38 +326,6 @@ class XlaBuilder {
 
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  template <typename NativeT>
-  XlaOp ConstantR0(NativeT value);
-  template <typename NativeT>
-  XlaOp ConstantR1(absl::Span<const NativeT> values);
-  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  XlaOp ConstantR2(
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                    const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  XlaOp ConstantR1(int64 length, NativeT value);
-
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
@@ -382,24 +373,6 @@ class XlaBuilder {
 
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
@@ -464,50 +437,6 @@ class XlaBuilder {
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-                absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Conj(const XlaOp& operand);
-
-  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Not(const XlaOp& operand);
-
-  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                  absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
-                             absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
-                          absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
@@ -550,6 +479,8 @@ class XlaBuilder {
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+  XlaOp ReplicaId();
+
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                          absl::Span<const int64> window_dimensions,
                          absl::Span<const int64> window_strides,
@@ -564,44 +495,6 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  XlaOp Abs(const XlaOp& operand);
-
-  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-              absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Exp(const XlaOp& operand);
-
-  XlaOp Expm1(const XlaOp& operand);
-
-  XlaOp Floor(const XlaOp& operand);
-
-  XlaOp Ceil(const XlaOp& operand);
-
-  XlaOp Round(const XlaOp& operand);
-
-  XlaOp Log(const XlaOp& operand);
-
-  XlaOp Log1p(const XlaOp& operand);
-
-  XlaOp Sign(const XlaOp& operand);
-
-  XlaOp Clz(const XlaOp& operand);
-
-  XlaOp Cos(const XlaOp& operand);
-
-  XlaOp Sin(const XlaOp& operand);
-
-  XlaOp Tanh(const XlaOp& operand);
-
-  XlaOp Real(const XlaOp& operand);
-
-  XlaOp Imag(const XlaOp& operand);
-
-  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp IsFinite(const XlaOp& operand);
-
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
   XlaOp Iota(PrimitiveType type, int64 size);
@@ -612,14 +505,15 @@ class XlaBuilder {
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  XlaOp Neg(const XlaOp& operand);
-
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
+  ABSL_DEPRECATED("Use form with comparator computation instead")
   XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
+  XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+             int64 dimension = -1, bool is_stable = false);
 
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -639,6 +533,10 @@ class XlaBuilder {
                     const XlaOp& false_operand,
                     const XlaComputation& false_computation);
 
+  XlaOp Conditional(const XlaOp& branch_index,
+                    absl::Span<const XlaComputation* const> branch_computations,
+                    absl::Span<const XlaOp> branch_operands);
+
   XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                         const int mantissa_bits);
 
@@ -699,9 +597,11 @@ class XlaBuilder {
 
   // Internal helper method that does the building for an arbitrary binary op.
   // broadcast_dimensions specifies which dimensions to use for broadcasting
-  // when the operation is between tensors of different ranks.
+  // when the operation is between tensors of different ranks. The direction is
+  // only used if opcode is kCompare.
   XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
-                 absl::Span<const int64> broadcast_dimensions);
+                 absl::Span<const int64> broadcast_dimensions,
+                 absl::optional<ComparisonDirection> direction = absl::nullopt);
 
   // Internal helper method that does the building for an arbitrary ternary op.
   XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
@@ -811,48 +711,6 @@ class XlaBuilder {
                          const Shape& shape, const string& name);
   friend XlaOp ConstantLiteral(XlaBuilder* builder,
                                const LiteralSlice& literal);
-  template <typename NativeT>
-  friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          absl::Span<const NativeT> values);
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2(
-      XlaBuilder* builder,
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                           const Array<NativeT>& values,
-                                           const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArray(XlaBuilder* builder,
-                                 const Array<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                               const Array2D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                                     const Array2D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                               const Array3D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                                     const Array3D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                               const Array4D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                                     const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
 
   friend XlaOp Broadcast(const XlaOp& operand,
                          absl::Span<const int64> broadcast_sizes);
@@ -912,6 +770,9 @@ class XlaBuilder {
                   absl::Span<const int64> broadcast_dimensions);
   friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
                   absl::Span<const int64> broadcast_dimensions);
+  friend XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+                       absl::Span<const int64> broadcast_dimensions,
+                       ComparisonDirection direction);
   friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
                    const PrecisionConfig* precision_config);
   friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
@@ -950,6 +811,10 @@ class XlaBuilder {
       const PrecisionConfig* precision_config);
   friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
                    absl::Span<const int64> fft_length);
+  friend XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                               bool unit_diagonal,
+                               TriangularSolveOptions::Transpose transpose_a);
+  friend XlaOp Cholesky(XlaOp a, bool lower);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
                       const string& config);
   friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
@@ -1028,6 +893,7 @@ class XlaBuilder {
   friend XlaOp CollectivePermute(
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
+  friend XlaOp ReplicaId(XlaBuilder* builder);
   friend XlaOp SelectAndScatter(const XlaOp& operand,
                                 const XlaComputation& select,
                                 absl::Span<const int64> window_dimensions,
@@ -1058,6 +924,8 @@ class XlaBuilder {
   friend XlaOp Tanh(const XlaOp& operand);
   friend XlaOp Real(const XlaOp& operand);
   friend XlaOp Imag(const XlaOp& operand);
+  friend XlaOp Sqrt(const XlaOp& operand);
+  friend XlaOp Rsqrt(const XlaOp& operand);
   friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
                    absl::Span<const int64> broadcast_dimensions);
   friend XlaOp IsFinite(const XlaOp& operand);
@@ -1074,6 +942,9 @@ class XlaBuilder {
   friend XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
   friend XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                     int64 dimension);
+  friend XlaOp Sort(absl::Span<const XlaOp> operands,
+                    const XlaComputation& comparator, int64 dimension,
+                    bool is_stable);
   friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
   friend XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                    const XlaComputation& computation,
@@ -1088,6 +959,10 @@ class XlaBuilder {
                            const XlaComputation& true_computation,
                            const XlaOp& false_operand,
                            const XlaComputation& false_computation);
+  friend XlaOp Conditional(
+      const XlaOp& branch_index,
+      absl::Span<const XlaComputation* const> branch_computations,
+      absl::Span<const XlaOp> branch_operands);
   friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                const int mantissa_bits);
   friend XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
@@ -1410,6 +1285,11 @@ XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
+// Enqueues a comparison instruction onto the computation.
+XlaOp Compare(const XlaOp& lhs, const XlaOp& rhs,
+              absl::Span<const int64> broadcast_dimensions,
+              ComparisonDirection direction);
+
 // Enqueues a dot instruction onto the computation.
 XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
           const PrecisionConfig* precision_config = nullptr);
@@ -1469,6 +1349,45 @@ XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
           absl::Span<const int64> fft_length);
 
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves for x in one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+//
+// * `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+//   square matrices. If `lower` is true (false), then the strictly upper
+//   (lower) triangular part of each innermost matrix in `a` is assumed to be
+//   zero and is not accessed.
+// * `b` is a tensor of shape `[..., M, K]` if `left_side` is true, otherwise a
+//   tensor of shape `[..., K, M]`.
+// * `left_side` is a boolean, indicating whether to solve a system of the form
+//   op(a) * x = b (true) or x * op(a) = b (false).
+// * `lower` is a boolean, indicating whether the argument `a` is
+//   lower-triangular (true) or upper-triangular (false).
+// * If `unit_diagonal` is true, the diagonal elements of `a` are assumed to be
+//   1 and not accessed.
+// * `transpose_a` indicates which function `op` we use to transform the tensor
+//   `a`: the identity function, transpose(a), or conjugate(transpose(a))
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a);
+
+// Computes the Cholesky decompositions of a batch of symmetric (Hermitian)
+// positive definite matrices.
+// `a` must be a (batched) square matrix; i.e., it must have rank >= 2 with the
+// two minor dimensions equal.
+// If `lower` is true, the data from the lower triangle is used; if false, the
+// upper triangle is used. The input data in the other triangle of the input
+// does not affect the output. Returns the output in the same lower/uppper
+// triangle. The data returned in the other output triangle is arbitrary and
+// implementation-defined.
+//
+// The value returned if `a` is not Hermitian positive definite is
+// implementation-defined.
+XlaOp Cholesky(XlaOp a, bool lower);
+
 // Enqueues an infeed instruction onto the computation, which writes data of
 // the given shape to the infeed buffer of the device.
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
@@ -1568,9 +1487,33 @@ XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
+// Overload to call And with 3 or more operands.  We need the following somewhat
+// convoluted overload set to disambiguate with the overload that takes the
+// `broadcast_dimensions` optional param.
+inline XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+  return And(op1, And(op2, op3));
+}
+template <typename... XlaOpTs>
+XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
+          const XlaOpTs&... operands) {
+  return And(op1, And(op2, And(op3, operands...)));
+}
+
 XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
+// Overload to call Or with 3 or more operands.  As with `And`, we need the
+// following complicated overload set to handle the default arg in the `Or`
+// overload above.
+inline XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+  return Or(op1, Or(op2, op3));
+}
+template <typename... XlaOpTs>
+XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
+         const XlaOpTs&... operands) {
+  return Or(op1, Or(op2, Or(op3, operands...)));
+}
+
 XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
@@ -1663,6 +1606,9 @@ XlaOp CollectivePermute(
     const XlaOp& operand,
     const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+// Enqueues an operation that returns the replica ID.
+XlaOp ReplicaId(XlaBuilder* builder);
+
 // Enqueues an operation that scatters the `source` array to the selected
 // indices of each window.
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
@@ -1730,14 +1676,24 @@ XlaOp Real(const XlaOp& operand);
 // Enqueues an imaginary-part instruction onto the computation.
 XlaOp Imag(const XlaOp& operand);
 
+// Enqueues a sqrt computation onto the computation.
+XlaOp Sqrt(const XlaOp& operand);
+
+// Enqueues a rsqrt computation onto the computation.
+XlaOp Rsqrt(const XlaOp& operand);
+
 // Enqueues a lhs^rhs computation onto the computation.
 XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
-// Enqueues an operator that tests if the operand's values are finite, i.e.,
-// not Inf or NaN. Defined only for floating-point types. Returns an array of
-// booleans with the same shape where entries are true iff the corresponding
-// entry was NaN.
+// Enqueues an operator that tests if the operand's values are finite, i.e., not
+// +/-Inf or NaN.  Returns an array of booleans with the same shape where
+// entries are true iff the corresponding entry was not infinite or NaN.
+//
+// Defined only for real-valued (i.e. not complex) floating-point types; raises
+// an error for other types.
+//
+// See also IsInf, IsPosInf, IsNegInf, and IsNan in lib/math.h.
 XlaOp IsFinite(const XlaOp& operand);
 
 // Enqueues an iota operation onto the computation.
@@ -1773,7 +1729,7 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // of keys, in ascending order.
 // * If the keys have higher rank, the keys are sorted along the provided
 // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will indepenently sort every column, and a dimension value of 1
+// value of 0 will independently sort every column, and a dimension value of 1
 // will independently sort each row. If no dimension number is provided, then
 // the last dimension is chosen by default.
 //
@@ -1783,9 +1739,39 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // * The result is a tuple that consists of a sorted tensor of keys (along the
 // provided dimension, as above) as the first element, and tensors with their
 // corresponding values as the other elements.
+ABSL_DEPRECATED("Use form with comparator computation instead")
 XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
            int64 dimension = -1);
 
+// Enqueues a sort instruction onto the computation, using 'comparator' for
+// comparisons. 'comparator' needs to define a strict weak order. 'is_stable'
+// determines whether the stable sorting should be used.
+// If only one operand is provided:
+// * If the operand is a rank-1 tensor (an array), the result is a sorted array.
+//   The resulting sorting order has the property that for all index positions
+//   i, j with i < j, either
+//   comparator(value[i], value[j]) = comparator(value[j], value[i]) = false or
+//   comparator(value[i], value[j]) = true.
+// * If the operand has higher rank, the operand is sorted along the provided
+//   dimension. For example, for a rank-2 tensor (a matrix), a dimension value
+//   of 0 will independently sort every column, and a dimension value of 1 will
+//   independently sort each row. If no dimension number is provided, then the
+//   last dimension is chosen by default. For the dimension which is sorted, the
+//   same sorting order applies as in the rank-1 case.
+//
+// If more than one operand is provided:
+// * All operands must be tensors with the same dimensions. The element types of
+//   the tensors may be different.
+// * The result is a tuple that consists of the operands in sorted order (along
+//   the provided dimension, as above). The same permutation as implied by the
+//   comparison computation is applied to all operand tensors. When comparing
+//   two index positions, 'comparator' is called with 2 * n scalar parameters,
+//   where parameter 2 * i and 2 * i + 1 correspond to the value of operand i at
+//   two index positions.
+// Default comparator computations can be found in lib/comparators.h
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension = -1, bool is_stable = false);
+
 // Enqueues a clamp instruction onto the computation.
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -1812,6 +1798,15 @@ XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                   const XlaOp& false_operand,
                   const XlaComputation& false_computation);
 
+// Enqueues either a predicated (if/else) or indexed (switch/case/default)
+// conditional node onto the computation. N >= 1 branch_computations and
+// branch_operands are matched by index. branch_index selects the branch that
+// will be executed. Out of range branch_index uses the N-1'th
+// branch_computation as default.
+XlaOp Conditional(const XlaOp& branch_index,
+                  absl::Span<const XlaComputation* const> branch_computations,
+                  absl::Span<const XlaOp> branch_operands);
+
 // Enqueues a ReducePrecision node onto the computation.
 XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                       const int mantissa_bits);
@@ -1924,81 +1919,6 @@ XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 // Implementation details below this point.
 //
 
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR0(NativeT value) {
-  return ConstantLiteral(LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(absl::Span<const NativeT> values) {
-  return ConstantLiteral(LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(literal);
-}
-
-inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                              const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
-    const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
-    const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
-    const Array4D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
 // Free function template implementations.
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index feee8187c7db846b04bf763c28476f0d71f3dd30..12656a89943d6cef213a714df5e29289e7b1ada3 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -39,7 +40,8 @@ using ::testing::HasSubstr;
 class XlaBuilderTest : public ::testing::Test {
  protected:
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(/*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -50,7 +52,8 @@ class XlaBuilderTest : public ::testing::Test {
   // Overload which explicitly specifies the root instruction.
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b,
                                                       XlaOp root) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root));
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(root, /*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -132,6 +135,38 @@ TEST_F(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) {
       op::ShiftRightLogical(op::Constant(), op::Constant()));
 }
 
+TEST_F(XlaBuilderTest, VariadicAnd) {
+  XlaBuilder b(TestName());
+  Shape s = ShapeUtil::MakeShape(PRED, {});
+  And(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
+      Parameter(&b, 2, s, "p2"));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  // Don't specify in the test whether And(x, y, z) is right- or
+  // left-associative; accept either one.
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      ::testing::AnyOf(op::And(op::Parameter(0),
+                               op::And(op::Parameter(1), op::Parameter(2))),
+                       op::And(op::And(op::Parameter(0), op::Parameter(1)),
+                               op::Parameter(2))));
+}
+
+TEST_F(XlaBuilderTest, VariadicOr) {
+  XlaBuilder b(TestName());
+  Shape s = ShapeUtil::MakeShape(PRED, {});
+  Or(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
+     Parameter(&b, 2, s, "p2"));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  // Don't specify in the test whether Or(x, y, z) is right- or
+  // left-associative; accept either one.
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      ::testing::AnyOf(
+          op::Or(op::Parameter(0), op::Or(op::Parameter(1), op::Parameter(2))),
+          op::Or(op::Or(op::Parameter(0), op::Parameter(1)),
+                 op::Parameter(2))));
+}
+
 TEST_F(XlaBuilderTest, ShiftRightOperatorOnNonIntegerProducesError) {
   XlaBuilder b(TestName());
   ConstantR0<float>(&b, 1) >> ConstantR0<float>(&b, 2);
@@ -447,10 +482,9 @@ TEST_F(XlaBuilderTest, ProtoMatches) {
 }
 
 TEST_F(XlaBuilderTest, DynamicParameter) {
-  std::vector<XlaComputation> computations;
-  XlaBuilder b("builder");
+  XlaBuilder b(TestName());
   Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
-      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6})});
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6}, {true})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   Parameter(&b, 1, ShapeUtil::MakeShape(U32, {}), "p1");
   ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/1,
@@ -463,9 +497,450 @@ TEST_F(XlaBuilderTest, DynamicParameter) {
                                  ->parameter_instruction(0)
                                  ->shape()
                                  .tuple_shapes(1);
-  // TODO(b/121223198): The dynamic dimension should be set once we enable
-  // dynamic dimensions in xla builder.
-  EXPECT_FALSE(param_shape.is_dynamic_dimension(0));
+  EXPECT_TRUE(param_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicUnary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Neg(gte);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}, {true}),
+       ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinaryHasBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
+       ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  BroadcastInDim(gte, /*out_dim_size=*/{3, 5, 4},
+                 /*broadcast_dimensions=*/{1, 2});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicBinaryHasDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {10}, {true}),
+       ShapeUtil::MakeShape(F32, {1, 15}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1, /*broadcast_dimensions=*/{0});  // f32[<=10, 15]
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectOnlyPredDynamic) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(PRED, {10}, {true}),
+       ShapeUtil::MakeShape(F32, {10}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+
+  Select(gte0, gte1, gte1);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicPad) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pad_val = ConstantR0<float>(&b, -1);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  PaddingConfig padding_config;
+  for (int i = 0; i < 2; i++) {
+    auto dimension = padding_config.add_dimensions();
+    dimension->set_edge_padding_low(0);
+    dimension->set_edge_padding_high(0);
+    dimension->set_interior_padding(0);
+  }
+  Pad(gte, pad_val, padding_config);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicConvolution) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {1, 2, 2, 128}, {true, false, false, false}),
+       ShapeUtil::MakeShape(F32, {2, 2, 128, 8}, {false, false, true, false}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto input = GetTupleElement(p0, 0);
+  auto filter = GetTupleElement(p0, 1);
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.set_input_feature_dimension(3);
+  dnums.set_output_feature_dimension(3);
+  dnums.add_kernel_spatial_dimensions(0);
+  dnums.add_kernel_spatial_dimensions(1);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.set_kernel_output_feature_dimension(3);
+  ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                            /*feature_group_count=*/1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {true, false, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicDot) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4}, {true, true, false}),
+       ShapeUtil::MakeShape(F32, {2, 4, 5}, {true, false, false}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+
+  auto lhs = GetTupleElement(p0, 0);
+  auto rhs = GetTupleElement(p0, 1);
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+  DotGeneral(lhs, rhs, dnums);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduce) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4, 3}, {false, true, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  Reduce(gte, init, sum, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduceWindow) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}, {true, false, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  ReduceWindow(gte, init, sum, /*window_dimensions=*/{1, 2, 4},
+               /*window_strides=*/{1, 1, 1}, Padding::kValid);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectAndScatter) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}, {true, false, false}),
+       ShapeUtil::MakeShape(F32, {2, 2, 2}, {true, false, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  XlaBuilder bge(TestName());
+  Ge(Parameter(&bge, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+     Parameter(&bge, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto ge, bge.Build());
+
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto source = GetTupleElement(p0, 1);
+  SelectAndScatter(gte0, ge, {1, 2, 4}, {1, 2, 4}, Padding::kValid, source,
+                   init, sum);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReshape) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6},
+                            {false, false, true, true, false}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/2));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/3));
+  auto gte = GetTupleElement(p0, 0);  // f32[2, 3, <=4, <=5, 6]
+  Reshape(gte, /*new_sizes=*/{6, 4, 1, 5, 2, 3});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(3));
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {false, true, false, true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelect) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/1));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Select(pred, gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_FALSE(result_shape.is_dynamic_dimension(2));
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectNotCompatible) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, false, true}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto gte0 = GetTupleElement(p0, 0);  // f32[4,<=5,6]
+  auto gte1 = GetTupleElement(p0, 1);  // f32[4,5,<=6]
+  Select(pred, gte0, gte1);
+  Status status = BuildHloModule(&b).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("Operands to select must be the same shape; "
+                                   "got f32[4,<=5,6] and f32[4,5,<=6]"));
+}
+
+TEST_F(XlaBuilderTest, DynamicTranspose) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {3, 5}, {true, false}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Transpose(gte, /*permutation=*/{1, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {false, true}))
+      << result_shape;
 }
 
 TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
diff --git a/tensorflow/compiler/xla/comparison_util.cc b/tensorflow/compiler/xla/comparison_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de34ad678e799dad8f8404ac92e5f4830d85368a
--- /dev/null
+++ b/tensorflow/compiler/xla/comparison_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/comparison_util.h"
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+std::string ComparisonDirectionToString(ComparisonDirection direction) {
+  switch (direction) {
+    case ComparisonDirection::kEq:
+      return "EQ";
+    case ComparisonDirection::kNe:
+      return "NE";
+    case ComparisonDirection::kGe:
+      return "GE";
+    case ComparisonDirection::kGt:
+      return "GT";
+    case ComparisonDirection::kLe:
+      return "LE";
+    case ComparisonDirection::kLt:
+      return "LT";
+  }
+}
+
+StatusOr<ComparisonDirection> StringToComparisonDirection(
+    absl::string_view direction_name) {
+  static auto* direction_map =
+      new absl::flat_hash_map<string, ComparisonDirection>({
+          {"EQ", ComparisonDirection::kEq},
+          {"NE", ComparisonDirection::kNe},
+          {"GE", ComparisonDirection::kGe},
+          {"GT", ComparisonDirection::kGt},
+          {"LE", ComparisonDirection::kLe},
+          {"LT", ComparisonDirection::kLt},
+      });
+  auto it = direction_map->find(direction_name);
+  if (it == direction_map->end()) {
+    return InvalidArgument("Unknown comparison direction: %s", direction_name);
+  }
+  return it->second;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/comparison_util.h b/tensorflow/compiler/xla/comparison_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b150c3cfadfa4ad22459c7f1f7514c73f393ded
--- /dev/null
+++ b/tensorflow/compiler/xla/comparison_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_COMPARISON_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_COMPARISON_UTIL_H_
+
+#include "absl/base/macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+
+// Represents different comparison operations.
+enum class ComparisonDirection : uint8 {
+  kEq,
+  kNe,
+  kGe,
+  kGt,
+  kLe,
+  kLt,
+};
+
+string ComparisonDirectionToString(ComparisonDirection direction);
+
+StatusOr<ComparisonDirection> StringToComparisonDirection(
+    absl::string_view direction_name);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_COMPARISON_UTIL_H_
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index a9a91648ac377987e7f226116e11c9c697ace103..2437bf04b0f931c577c5d122df8fa99b2d49c08c 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -33,7 +33,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_cpu_multi_thread_eigen(true);
   opts.set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   opts.set_xla_eliminate_hlo_implicit_broadcast(true);
-  opts.set_xla_hlo_dump_as_html(false);
+  opts.set_xla_dump_hlo_as_html(false);
 #ifdef INTEL_MKL
   opts.set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
@@ -84,6 +84,14 @@ static void AllocateFlags() {
     };
   };
 
+  auto string_setter_for =
+      [](void (DebugOptions::*member_setter)(const string& value)) {
+        return [member_setter](const string& value) {
+          (flag_values->*member_setter)(value);
+          return true;
+        };
+      };
+
   // Custom "sub-parser" lambda for xla_disable_hlo_passes.
   auto setter_for_xla_disable_hlo_passes = [](string comma_separated_values) {
     std::vector<string> disabled_passes =
@@ -114,56 +122,26 @@ static void AllocateFlags() {
       };
 
   flag_objects = new std::vector<tensorflow::Flag>({
-      tensorflow::Flag(
-          "xla_generate_hlo_graph",
-          flag_values->mutable_xla_generate_hlo_graph(),
-          "HLO modules matching this regex will be dumped to a .dot file "
-          "throughout various stages in compilation."),
-      tensorflow::Flag(
-          "xla_hlo_graph_addresses",
-          bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
-          flag_values->xla_hlo_graph_addresses(),
-          "With xla_generate_hlo_graph, show addresses of HLO ops in "
-          "graph dump."),
-      tensorflow::Flag(
-          "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
-          "With xla_generate_hlo_graph, dump the graphs into this path."),
-      tensorflow::Flag(
-          "xla_hlo_dump_as_graphdef",
-          bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
-          flag_values->xla_hlo_dump_as_graphdef(),
-          "Dump HLO graphs as TensorFlow GraphDefs."),
-      tensorflow::Flag("xla_hlo_dump_as_html",
-                       bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_html),
-                       flag_values->xla_hlo_dump_as_html(),
-                       "Dump HLO graphs as an HTML (DOT rendered into SVG "
-                       "inlined in HTML)."),
-      tensorflow::Flag(
-          "xla_hlo_graph_sharding_color",
-          bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
-          flag_values->xla_hlo_graph_sharding_color(),
-          "Assign colors based on sharding assignments when generating the "
-          "HLO graphs."),
-      tensorflow::Flag(
-          "xla_hlo_tfgraph_device_scopes",
-          bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
-          flag_values->xla_hlo_tfgraph_device_scopes(),
-          "When generating TensorFlow HLO graphs, if the HLO instructions "
-          "are assigned to a specific device, prefix the name scope with "
-          "\"devX\" with X being the device ordinal."),
-      tensorflow::Flag(
-          "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
-          "HLO modules matching this regex will be dumped to LOG(INFO)."),
-      tensorflow::Flag(
-          "xla_generate_hlo_text_to",
-          flag_values->mutable_xla_generate_hlo_text_to(),
-          "Dump all HLO modules as text into the provided directory path."),
       tensorflow::Flag(
           "xla_cpu_enable_fast_math",
           bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
           flag_values->xla_cpu_enable_fast_math(),
           "Enable unsafe fast-math optimizations in the CPU compiler; "
           "this may produce faster code at the expense of some accuracy."),
+      tensorflow::Flag(
+          "xla_cpu_fast_math_honor_nans",
+          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_nans),
+          flag_values->xla_cpu_fast_math_honor_nans(),
+          "When xla_cpu_enable_fast_math is true then this controls whether we "
+          "allow operations to produce NaNs.  Ignored when "
+          "xla_cpu_enable_fast_math is false."),
+      tensorflow::Flag(
+          "xla_cpu_fast_math_honor_infs",
+          bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_infs),
+          flag_values->xla_cpu_fast_math_honor_infs(),
+          "When xla_cpu_enable_fast_math is true then this controls whether we "
+          "allow operations to produce infinites.  Ignored when "
+          "xla_cpu_enable_fast_math is false."),
       tensorflow::Flag(
           "xla_gpu_enable_fast_min_max",
           bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max),
@@ -222,9 +200,6 @@ static void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
           flag_values->xla_embed_ir_in_executable(),
           "Embed the compiler IR as a string in the executable."),
-      tensorflow::Flag(
-          "xla_dump_ir_to", flag_values->mutable_xla_dump_ir_to(),
-          "Dump the compiler IR into this directory as individual files."),
       tensorflow::Flag(
           "xla_eliminate_hlo_implicit_broadcast",
           bool_setter_for(
@@ -259,20 +234,6 @@ static void AllocateFlags() {
           int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
           flag_values->xla_gpu_max_kernel_unroll_factor(),
           "Specify the maximum kernel unroll factor for the GPU backend."),
-      tensorflow::Flag(
-          "xla_dump_optimized_hlo_proto_to",
-          flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
-          "Dump Hlo after all hlo passes are executed as proto binary into "
-          "this directory."),
-      tensorflow::Flag(
-          "xla_dump_unoptimized_hlo_proto_to",
-          flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(),
-          "Dump HLO before any hlo passes are executed as proto binary into "
-          "this directory."),
-      tensorflow::Flag("xla_dump_per_pass_hlo_proto_to",
-                       flag_values->mutable_xla_dump_per_pass_hlo_proto_to(),
-                       "Dump HLO after each pass as an HloProto in binary file "
-                       "format into this directory."),
       tensorflow::Flag(
           "xla_test_all_output_layouts",
           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
@@ -295,14 +256,6 @@ static void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_hlo_profile),
           flag_values->xla_hlo_profile(),
           "Instrument the computation to collect per-HLO cycle counts"),
-      tensorflow::Flag("xla_dump_computations_to",
-                       flag_values->mutable_xla_dump_computations_to(),
-                       "Dump computations that XLA executes into the provided "
-                       "directory path"),
-      tensorflow::Flag("xla_dump_executions_to",
-                       flag_values->mutable_xla_dump_executions_to(),
-                       "Dump parameters and results of computations that XLA "
-                       "executes into the provided directory path"),
       tensorflow::Flag("xla_backend_extra_options",
                        setter_for_xla_backend_extra_options, "",
                        "Extra options to pass to a backend; "
@@ -355,6 +308,79 @@ static void AllocateFlags() {
               &DebugOptions::set_xla_gpu_disable_ptxas_optimizations),
           flag_values->xla_gpu_disable_ptxas_optimizations(),
           "In XLA:GPU run ptxas in -O0 (default is -O3)."),
+
+      tensorflow::Flag(
+          "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to),
+          flag_values->xla_dump_to(),
+          "Directory into which debugging data is written.  If not specified "
+          "but another dumping flag is passed, data will be written to stdout. "
+          " To explicitly write to stdout, set this to \"-\".  The values "
+          "\"sponge\" and \"test_undeclared_outputs_dir\" have a special "
+          "meaning: They cause us to dump into the directory specified by the "
+          "environment variable TEST_UNDECLARED_OUTPUTS_DIR."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_text",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text),
+          flag_values->xla_dump_hlo_as_text(),
+          "Dumps HLO modules as text before and after optimizations.  Results "
+          "are written to the --xla_dump_to dir, or, if no dir is specified, "
+          "to stdout."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_proto",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto),
+          flag_values->xla_dump_hlo_as_proto(),
+          "Dumps HLO modules as HloProtos to the directory specified by "
+          "--xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_dot",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot),
+          flag_values->xla_dump_hlo_as_dot(),
+          "Dumps HLO modules rendered as dot files to the directory "
+          "specified by --xla_dump_to."),
+      tensorflow::Flag("xla_dump_hlo_as_html",
+                       bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_html),
+                       flag_values->xla_dump_hlo_as_html(),
+                       "Dumps HLO modules rendered as HTML files to the "
+                       "directory specified by --xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_url",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url),
+          flag_values->xla_dump_hlo_as_url(),
+          "Tries to dump HLO modules rendered as URLs to stdout (and also to "
+          "the directory specified by --xla_dump_to). This is not implemented "
+          "by default; you need to add a plugin which calls "
+          "RegisterGraphToURLRenderer()."),
+      tensorflow::Flag(
+          "xla_dump_hlo_snapshots",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots),
+          flag_values->xla_dump_hlo_snapshots(),
+          "Every time an HLO module is run, dumps an HloSnapshot to the "
+          "directory specified by --xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_module_re",
+          string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re),
+          flag_values->xla_dump_hlo_module_re(),
+          "Limits dumping only to modules which match this regular expression. "
+          " Default is to dump all modules."),
+      tensorflow::Flag(
+          "xla_dump_hlo_pass_re",
+          string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re),
+          flag_values->xla_dump_hlo_pass_re(),
+          "If specified, dumps HLO before and after optimization passes which "
+          "match this regular expression, in addition to dumping at the very "
+          "beginning and end of compilation."),
+      tensorflow::Flag(
+          "xla_hlo_graph_addresses",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
+          flag_values->xla_hlo_graph_addresses(),
+          "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays "
+          "the address in memory of each HloInstruction object."),
+      tensorflow::Flag(
+          "xla_hlo_graph_sharding_color",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
+          flag_values->xla_hlo_graph_sharding_color(),
+          "Assign colors based on sharding assignments when generating the "
+          "HLO graphs."),
   });
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
diff --git a/tensorflow/compiler/xla/error_spec.h b/tensorflow/compiler/xla/error_spec.h
index a1463aa15941b9c265db94e2eb3cc176fab6695b..4359f3b7deb8e585494cb2a9c7115eac6a312c8e 100644
--- a/tensorflow/compiler/xla/error_spec.h
+++ b/tensorflow/compiler/xla/error_spec.h
@@ -30,6 +30,19 @@ struct ErrorSpec {
   // In effect, this allows the tested operation to produce incorrect results
   // for inputs outside its mathematical domain.
   bool relaxed_nans;
+
+  // If this is true, then we treat each +/-inf in the actual result as
+  // equivalent to our choice of either +/-inf or the min/max floating-point
+  // value.
+  //
+  // If the expected result is +/-inf, the actual result must still be +/-inf.
+  //
+  // In effect, this allows the tested operation to overflow, so long as it's
+  // overflowing on "large" values.
+  //
+  // (We could have a symmetric more_infs_ok flag if necessary; right now it
+  // appears not to be.)
+  bool fewer_infs_ok = false;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 0f9b591c70d4fd96147958d18bd5fb7dd78a7f3f..230f3b202a4b531c381665471c3856c3feba5a3a 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -77,7 +77,7 @@ ExecutionProfile* ExecutableRunOptions::execution_profile() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_assignment(
-    DeviceAssignment* device_assignment) {
+    const DeviceAssignment* device_assignment) {
   device_assignment_ = device_assignment;
   return *this;
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index ba3217f31b55bd1428f67da6154a46c8bc304053..1e744953bd3be58afba5b81c0e2a8ba26665f9c4 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,9 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Pulls in the ::stream_executor -> ::xla::se namespace alias.
-#include "tensorflow/compiler/xla/types.h"
-
 // These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -28,12 +25,6 @@ class Stream;
 class Platform;
 }  // namespace stream_executor
 
-namespace tensorflow {
-namespace thread {
-class ThreadPool;
-}  // namespace thread
-}  // namespace tensorflow
-
 namespace Eigen {
 struct ThreadPoolDevice;
 }  // namespace Eigen
@@ -83,7 +74,7 @@ class ExecutableRunOptions {
   ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile);
 
   ExecutableRunOptions& set_device_assignment(
-      DeviceAssignment* device_assignment);
+      const DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
   ExecutableRunOptions& set_rng_seed(int rng_seed);
@@ -92,7 +83,7 @@ class ExecutableRunOptions {
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
-  DeviceAssignment* device_assignment_ = nullptr;
+  const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index 267701e9c0e42a21d2cda6238520f6a9692e7e76..d756cd74c98b98a6fda099690d966562bd694e2c 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -25,6 +25,8 @@ upper_tabs:
         path: /xla/operation_semantics
       - title: Shapes and layout
         path: /xla/shapes
+      - title: Tiled layout
+        path: /xla/tiled_layout
       - title: Using AOT compilation
         path: /xla/tfcompile
       - heading: Tutorials
diff --git a/tensorflow/compiler/xla/g3doc/_project.yaml b/tensorflow/compiler/xla/g3doc/_project.yaml
index 33d8bdb27a664d9e282d1d65c007ebf5838b196a..1cacee703dca30f9c4af6a4964839bb9fa4b0140 100644
--- a/tensorflow/compiler/xla/g3doc/_project.yaml
+++ b/tensorflow/compiler/xla/g3doc/_project.yaml
@@ -8,3 +8,4 @@ use_site_branding: true
 hide_from_products_list: true
 content_license: cc3-apache2
 buganizer_id: 171704
+include: /_project_included.yaml
diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md
index 85fa16ccc7f48a3dce840564e79097c9e136767f..d7ce5ee1ba6abbe60507f99a82ef2839a9f8a213 100644
--- a/tensorflow/compiler/xla/g3doc/jit.md
+++ b/tensorflow/compiler/xla/g3doc/jit.md
@@ -144,7 +144,8 @@ Execute the python script to train the model with XLA and turn on a debugging
 feature of XLA via an environmental variable that outputs the XLA graph.
 
 ```shell
-XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
+XLA_FLAGS="--xla_hlo_profile --xla_dump_to=/tmp/foo --xla_dump_hlo_as_text"
+python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
@@ -153,28 +154,10 @@ should look similar to the picture below with one long bar labeled `XlaLaunch`.
   <img style="width:100%" src="./images/jit_timeline_gpu_xla.png">
 </div>
 
-To understand what is happening in `XlaLaunch`, look at the console output for
-statements similar to the following:
+To understand what is happening in `XlaLaunch`, look at the console output. Each
+XLA cluster that's launched will have a corresponding profile (from
+`--xla_hlo_profile`) showing how long each HLO took to run.
 
-```shell
-computation cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=1].v82 [CPU:
-pipeline start, before inline]: /tmp/hlo_graph_0.dot
-
-```
-
-The console statements point to the location of `hlo_graph_xx.dot` files that
-contain information about the graph created by XLA. The process that XLA takes
-to fuse Ops is visible by starting at `hlo_graph_0.dot` and viewing each diagram
-in succession.
-
-To Render the .dot file into a png, install
-[GraphViz](https://www.graphviz.org/download/) and run:
-
-```shell
-dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
-```
-
-The result will look like the following:
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/jit_gpu_xla_graph.png">
-</div>
+`/tmp/foo` will contain the HLO before and after optimizations for each HLO
+module that's run. You can read this as-is, or you can visualize it using
+`tensorflow/compiler/xla/tools:interactive_graphviz`.
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index c5f9377f98868cdf6d5c711cf80ede5d41fd8305..7d718c5301018980522e0d09835da0c6c40239c5 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -322,6 +322,37 @@ Invokes a computation with the given arguments.
 The arity and types of the `args` must match the parameters of the
 `computation`. It is allowed to have no `args`.
 
+## Cholesky
+
+See also
+[`XlaBuilder::Cholesky`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Computes the
+[Cholesky decomposition](https://en.wikipedia.org/wiki/Cholesky_decomposition)
+of a batch of symmetric (Hermitian) positive definite matrices.
+
+<b> `Cholesky(a, lower)` </b>
+
+Arguments | Type    | Semantics
+--------- | ------- | -----------------------------------------------------
+`a`       | `XlaOp` | a rank > 2 array of a complex or floating-point type.
+`lower`   | `bool`  | whether to use the upper or lower triangle of `a`.
+
+If `lower` is `true`, computes lower-triangular matrices `l` such that $$ a = l
+. l^T $$. If `lower` is `false`, computes upper-triangular matrices `u` such
+that $$ a = u^T . u $$.
+
+Input data is read only from the lower/upper triangle of `a`, depending on the
+value of `lower`. Values from the other triangle are ignored. Output data is
+returned in the same triangle; the values in the other triangle are
+implementation-defined and may be anything.
+
+If the rank of `a` is greater than 2, `a` is treated as a batch of matrices,
+where all except the minor 2 dimensions are batch dimensions.
+
+If `a` is not symmetric (Hermitian) positive definite, the result is
+implementation-defined.
+
 ## Clamp
 
 See also
@@ -510,25 +541,49 @@ See also
 false_computation)` </b>
 
 Arguments           | Type             | Semantics
-------------------- | ---------------- | ---------------------------------
+------------------- | ---------------- | --------------------------------------
 `pred`              | `XlaOp`          | Scalar of type `PRED`
-`true_operand`      | `XlaOp`          | Argument of type `T_0`
-`true_computation`  | `XlaComputation` | XlaComputation of type `T_0 -> S`
-`false_operand`     | `XlaOp`          | Argument of type `T_1`
-`false_computation` | `XlaComputation` | XlaComputation of type `T_1 -> S`
+`true_operand`      | `XlaOp`          | Argument of type $$ T_0 $$
+`true_computation`  | `XlaComputation` | XlaComputation of type $$ T_0 \to S$$
+`false_operand`     | `XlaOp`          | Argument of type $$ T_1 $$
+`false_computation` | `XlaComputation` | XlaComputation of type $$ T_1 \to S $$
 
 Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
 is `false`, and returns the result.
 
-The `true_computation` must take in a single argument of type `T_0` and will be
-invoked with `true_operand` which must be of the same type. The
-`false_computation` must take in a single argument of type `T_1` and will be
+The `true_computation` must take in a single argument of type $$ T_0 $$ and will
+be invoked with `true_operand` which must be of the same type. The
+`false_computation` must take in a single argument of type $$ T_1 $$ and will be
 invoked with `false_operand` which must be of the same type. The type of the
 returned value of `true_computation` and `false_computation` must be the same.
 
 Note that only one of `true_computation` and `false_computation` will be
 executed depending on the value of `pred`.
 
+<b> `Conditional(branch_index, branch_computations, branch_operands)` </b>
+
+| Arguments             | Type                  | Semantics                    |
+| --------------------- | --------------------- | ---------------------------- |
+| `branch_index`        | `XlaOp`               | Scalar of type `PRED` or     |
+:                       :                       : `S32`                        :
+| `branch_computations` | sequence of N         | XlaComputations of type $$   |
+:                       : `XlaComputation`      : T_0 \to S , T_1 \to S , ..., :
+:                       :                       : T_{N-1} \to S $$             :
+| `branch_operands`     | sequence of N `XlaOp` | Arguments of type $$ T_0 ,   |
+:                       :                       : T_1 , ..., T_{N-1} $$        :
+
+Executes `branch_computations[branch_index]`, and returns the result. If
+`branch_index` is a `PRED`, then the `true` branch is in position 0 and the
+`false` branch is in position 1. If `branch_index` is an `S32` which is < 0
+or >= N, then `branch_computations[N-1]` is executed as the default branch.
+
+Each `branch_computations[b]` must take in a single argument of type `T_b` and
+will be invoked with `branch_operands[b]` which must be of the same type. The
+type of the returned value of each `branch_computations[b]` must be the same.
+
+Note that only one of the `branch_computations` will be executed depending on
+the value of `branch_index`.
+
 ## Conv (convolution)
 
 See also
@@ -1186,7 +1241,7 @@ if and only if the corresponding input element is finite.
 
 <b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ -0 & x = -0\\ NaN & x = NaN\\ +0 & x = +0\\ 1 & x > 0 \end{cases}$$
 
 using the comparison operator of the element type of `operand`.
 
@@ -1608,15 +1663,18 @@ Applies a reduction function to one or more arrays in parallel.
 
 <b> `Reduce(operands..., init_values..., computation, dimensions)` </b>
 
-Arguments     | Type                  | Semantics
-------------- | --------------------- | ---------------------------------------
-`operands`    | Sequence of N `XlaOp` | N arrays of types `T_0, ..., T_N`.
-`init_values` | Sequence of N `XlaOp` | N scalars of types `T_0, ..., T_N`.
-`computation` | `XlaComputation`      | computation of type
-              :                       : `T_0, ..., T_N, T_0, ..., T_N -> Collate(T_0, ..., T_N)`
-`dimensions`  | `int64` array         | unordered array of dimensions to reduce
+| Arguments     | Type                  | Semantics                            |
+| ------------- | --------------------- | ------------------------------------ |
+| `operands`    | Sequence of N `XlaOp` | N arrays of types `T_0, ..., T_N`.   |
+| `init_values` | Sequence of N `XlaOp` | N scalars of types `T_0, ..., T_N`.  |
+| `computation` | `XlaComputation`      | computation of type `T_0, ..., T_N,  |
+:               :                       : T_0, ..., T_N ->` `Collate(T_0, ..., :
+:               :                       : T_N)`                                :
+| `dimensions`  | `int64` array         | unordered array of dimensions to     |
+:               :                       : reduce                               :
 
 Where:
+
 * N is required to be greater or equal to 1.
 * All input arrays must have the same dimensions.
 * If `N = 1`, `Collate(T)` is `T`.
@@ -1626,10 +1684,10 @@ The output of the op is `Collate(Q_0, ..., Q_N)` where `Q_i` is an array of type
 `T_i`, the dimensions of which are described below.
 
 This operation reduces one or more dimensions of each input array into scalars.
-The rank of each returned array is `rank(operand) - len(dimensions)`.
-`init_value` is the initial value used for every reduction and may be inserted
+The rank of each returned array is `rank(operand) - len(dimensions)`. The
+initial value used for every reduction is `init_value`, and it may be inserted
 anywhere during computation by the back-end. In most cases, `init_value` is an
-identity of the reduction function (for example, 0 for addition). The applied
+identity of the reduction function (for example, `0` for addition). The applied
 `computation` is always passed the `init_value` on the left-hand side.
 
 The evaluation order of the reduction function is arbitrary and may be
@@ -1640,10 +1698,10 @@ Some reduction functions like addition are not strictly associative for floats.
 However, if the range of the data is limited, floating-point addition is close
 enough to being associative for most practical uses. It is possible to conceive
 of some completely non-associative reductions, however, and these will produce
-incorrect or unpredictable results in XLA reductions.
+incorrect or unpredictable results in XLA.
 
 As an example, when reducing across one dimension in a single 1D array with
-values [10, 11, 12, 13], with reduction function `f` (this is `computation`)
+values `[10, 11, 12, 13]`, with reduction function `f` (this is `computation`)
 then that could be computed as
 
 `f(10, f(11, f(12, f(init_value, 13)))`
@@ -1722,16 +1780,27 @@ preserved in the output, but some dimensions may get assigned new numbers (since
 the rank changes).
 
 We can also reduce multiple dimensions. Add-reducing dimensions 0 and 1 produces
-the 1D array `| 20 28 36 |`.
+the 1D array `[20, 28, 36]`.
 
 Reducing the 3D array over all its dimensions produces the scalar `84`.
 
+### Variadic Reduce
+
 When `N > 1`, reduce function application is slightly more complex, as it is
-applied simultaneously to all inputs. For example, consider the following
-reduction function, which can be used to compute the max and the argmax of a a
-1-D array in parallel:
+applied simultaneously to all inputs. The operands are supplied to the
+computation in the following order:
 
-```
+*   Running reduced value for the first operand
+*   ...
+*   Running reduced value for the N'th operand
+*   Input value for the first operand
+*   ...
+*   Input value for the N'th operand
+
+For example, consider the following reduction function, which can be used to
+compute the max and the argmax of a 1-D array in parallel:
+
+```python
 f: (Float, Int, Float, Int) -> Float, Int
 f(max, argmax, value, index):
   if value >= argmax:
@@ -1743,6 +1812,7 @@ f(max, argmax, value, index):
 For 1-D Input arrays `V = Float[N], K = Int[N]`, and init values
 `I_V = Float, I_K =  Int`, the result `f_(N-1)` of reducing across the only
 input dimension is equivalent to the following recursive application:
+
 ```
 f_0 = f(I_V, I_K, V_0, K_0)
 f_1 = f(f_0.first, f_0.second, V_1, K_1)
@@ -1873,6 +1943,20 @@ non-deterministic. Therefore, the reduction function should not be overly
 sensitive to reassociation. See the discussion about associativity in the
 context of [`Reduce`](#reduce) for more details.
 
+## ReplicaId
+
+See also
+[`XlaBuilder::ReplicaId`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Returns the unique ID (U32 scalar) of the replica.
+
+<b> `ReplicaId()` </b>
+
+The unique ID of each replica is an unsigned integer in the interval `[0, N)`,
+where `N` is the number of replicas. Since all the replicas are running the same
+program, a `ReplicaId()` call in the program will return a different value on
+each replica.
+
 ## Reshape
 
 See also
@@ -2425,6 +2509,46 @@ Permutes the operand dimensions with the given permutation, so
 This is the same as Reshape(operand, permutation,
                             Permute(permutation, operand.shape.dimensions)).
 
+## TriangularSolve
+
+See also
+[`XlaBuilder::TriangularSolve`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Solves systems of linear equations with lower or upper triangular coefficient
+matrices by forward- or back-substitution. Broadcasting along leading
+dimensions, this routine solves one of the matrix systems `op(a) * x =
+b`, or `x * op(a) = b`, for the variable `x`, given `a` and `b`, where `op(a)` is
+either `op(a) = a`, or `op(a) = Transpose(a)`, or `op(a) = Conj(Transpose(a))`.
+
+<b> `TriangularSolve(a, b, left_side, lower, unit_diagonal, transpose_a)` </b>
+
+| Arguments       | Type        | Semantics                                    |
+| --------------- | ----------- | -------------------------------------------- |
+| `a`             | `XlaOp`     | a rank > 2 array of a complex or             |
+:                 :             : floating-point type with shape `[..., M,     :
+:                 :             : M]`.                                         :
+| `b`             | `XlaOp`     | a rank > 2 array of the same type with shape |
+:                 :             : `[..., M, K]` if `left_side` is true, `[..., :
+:                 :             : K, M]` otherwise.                            :
+| `left_side`     | `bool`      | indicates whether to solve a system of the   |
+:                 :             : form `op(a) * x = b` (`true`) or `x *        :
+:                 :             : op(a) = b` (`false`).                        :
+| `lower`         | `bool`      | whether to use the upper or lower triangle   |
+:                 :             : of `a`.                                      :
+| `unit_diagonal` | `bool`      | if `true`, the diagonal elements of `a` are  |
+:                 :             : assumed to be `1` and not accessed.          :
+| `transpose_a`   | `Transpose` | whether to use `a` as is, transpose it or    |
+:                 :             : take its conjugate transpose.                :
+
+Input data is read only from the lower/upper triangle of `a`, depending on the
+value of `lower`. Values from the other triangle are ignored. Output data is
+returned in the same triangle; the values in the other triangle are
+implementation-defined and may be anything.
+
+If the rank of `a` and `b` are greater than 2, they are treated as batches of
+matrices, where all except the minor 2 dimensions are batch dimensions. `a` and
+`b` must have equal batch dimensions.
+
 ## Tuple
 
 See also
diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/tiled_layout.md
similarity index 96%
rename from tensorflow/compiler/xla/g3doc/layout_with_tiling.md
rename to tensorflow/compiler/xla/g3doc/tiled_layout.md
index 5e990851af7495ebd4417e44f1d955fcc14dadf1..21e88ceab6208cdf940826d769fd93713044d5a0 100644
--- a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
+++ b/tensorflow/compiler/xla/g3doc/tiled_layout.md
@@ -1,9 +1,7 @@
 # Tiled layout
 
-*Note: This doc describes how tiled layout is intended to work. Tiling is being
-implemented, but this is an early effort and it is currently not even guaranteed
-to get an Unimplemented error if one tries to use tiling - it may be just
-silently ignored.*
+Caution: Tiled layout is *pre-release* and this describes how it's intended to
+work. Errors may be silently ignored.
 
 <center> ![](images/xla_array_layout_figure1.png)
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 7e22a32e545e4155545ffcfb9582187eadec3a82..eebd8245abe759b71b3fe732943761325ea04b81 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/layout.cc b/tensorflow/compiler/xla/layout.cc
index e3b5fcd5274881cec31ecf906e3461685f82a1f4..000c4fdc40519214fa9fa721a8987b77b534442b 100644
--- a/tensorflow/compiler/xla/layout.cc
+++ b/tensorflow/compiler/xla/layout.cc
@@ -30,7 +30,19 @@ TileProto Tile::ToProto() const {
 }
 
 string Tile::ToString() const {
-  return absl::StrCat("(", absl::StrJoin(dimensions(), ","), ")");
+  std::vector<string> elements;
+  for (auto dim : dimensions()) {
+    if (dim >= 0) {
+      elements.push_back(std::to_string(dim));
+    } else {
+      if (dim == kCombineDimension) {
+        elements.push_back("*");
+      } else {
+        elements.push_back(absl::StrCat("Invalid value ", dim));
+      }
+    }
+  }
+  return absl::StrCat("(", absl::StrJoin(elements, ","), ")");
 }
 
 /* static */ Layout Layout::CreateFromProto(const LayoutProto& proto) {
@@ -64,23 +76,43 @@ LayoutProto Layout::ToProto() const {
 }
 
 string Layout::ToString() const {
-  // TODO(b/119839262): Emit tiles in string.
   if (format() == SPARSE) {
+    CHECK_EQ(tiles_size(), 0) << "Sparse layout should not be tiled.";
     return absl::StrCat("sparse{", max_sparse_elements(), "}");
   } else if (format() == DENSE) {
-    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","), "}");
+    string colon_string = tiles().empty() ? "" : "T";
+    for (Tile tile : tiles()) {
+      absl::StrAppend(&colon_string, tile.ToString());
+    }
+    if (element_size_in_bits() != 0) {
+      absl::StrAppend(&colon_string, "E(", element_size_in_bits(), ")");
+    }
+    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","),
+                        colon_string.empty() ? "" : ":", colon_string, "}");
   } else {
     CHECK_EQ(format(), INVALID_FORMAT);
     return "invalid{}";
   }
 }
 
+bool Layout::Equal::operator()(const Layout& lhs, const Layout& rhs) {
+  if (lhs.format() != rhs.format() ||
+      lhs.minor_to_major() != rhs.minor_to_major() ||
+      lhs.max_sparse_elements() != rhs.max_sparse_elements()) {
+    return false;
+  }
+  if (!ignore_tiles_ && lhs.tiles() != rhs.tiles()) {
+    return false;
+  }
+  if (!ignore_element_size_ &&
+      lhs.element_size_in_bits() != rhs.element_size_in_bits()) {
+    return false;
+  }
+  return true;
+}
+
 bool Layout::operator==(const Layout& other) const {
-  return (other.format() == format() &&
-          other.minor_to_major() == minor_to_major() &&
-          other.element_size_in_bits() == element_size_in_bits() &&
-          other.max_sparse_elements() == max_sparse_elements() &&
-          other.tiles() == tiles());
+  return Equal()(*this, other);
 }
 
 std::ostream& operator<<(std::ostream& out, const Tile& tile) {
diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h
index 313368c39e4c976fc481941eb17325101f2ba69a..acc449b781b503142b24ed7229e3559230bb1599 100644
--- a/tensorflow/compiler/xla/layout.h
+++ b/tensorflow/compiler/xla/layout.h
@@ -55,6 +55,20 @@ class Tile {
   // Returns the dimensions of the tile.
   const std::vector<int64>& dimensions() const { return dimensions_; }
 
+  Tile& add_dimensions(int64 value) {
+    dimensions_.push_back(value);
+    return *this;
+  }
+
+  Tile& clear_dimensions() {
+    dimensions_.clear();
+    return *this;
+  }
+
+  // This dimension size means the corresponding dimension in the shape is
+  // combined with the next minor dimension before tiling is applied.
+  static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min();
+
  private:
   // The bounds of the tile.
   std::vector<int64> dimensions_;
@@ -71,10 +85,12 @@ class Layout {
 
   // Constructs a dense tiled layout with the given minor-to-major order and
   // tiles.
-  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles)
+  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+         int64 element_size_in_bits = 0)
       : format_(DENSE),
         minor_to_major_(minor_to_major.begin(), minor_to_major.end()),
-        tiles_(tiles.begin(), tiles.end()) {}
+        tiles_(tiles.begin(), tiles.end()),
+        element_size_in_bits_(element_size_in_bits) {}
 
   // Construct a shape from a LayoutProto.
   static Layout CreateFromProto(const LayoutProto& proto);
@@ -85,6 +101,37 @@ class Layout {
   // Returns a human-readable string that represents this layout.
   string ToString() const;
 
+  // Equal is a configurable functor to check the equality of two layouts.
+  //
+  // Examples:
+  //
+  // - Comparing two layouts ignoring their difference in tiles:
+  //   Equal().IgnoreTiles()(layout1, layout2);
+  //
+  // - Comparing two layouts ignoring their difference in tiles and element
+  //   size:
+  //   Equal().IgnoreTiles().IgnoreElementSize()(layout1, layout2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Layout& lhs, const Layout& rhs);
+
+    Equal& IgnoreTiles() {
+      ignore_tiles_ = true;
+      return *this;
+    }
+
+    Equal& IgnoreElementSize() {
+      ignore_element_size_ = true;
+      return *this;
+    }
+
+   private:
+    bool ignore_tiles_ = false;
+    bool ignore_element_size_ = false;
+  };
+
   bool operator==(const Layout& other) const;
   bool operator!=(const Layout& other) const { return !(*this == other); }
 
@@ -159,7 +206,7 @@ class Layout {
     element_size_in_bits_ = 0;
   }
 
- public:
+ private:
   // The format of this layout.
   Format format_ = INVALID_FORMAT;
 
@@ -172,11 +219,11 @@ class Layout {
   // memory.  This field must be zero unless the format is SPARSE.
   int64 max_sparse_elements_ = 0;
 
-  // The number of bits used to store an individual array element.
-  int64 element_size_in_bits_ = 0;
-
   // The tiles used in tiling-based layout.
   std::vector<Tile> tiles_;
+
+  // The number of bits used to store an individual array element.
+  int64 element_size_in_bits_ = 0;
 };
 
 std::ostream& operator<<(std::ostream& out, const Tile& Tile);
diff --git a/tensorflow/compiler/xla/layout_test.cc b/tensorflow/compiler/xla/layout_test.cc
index fb6abd3f6523b978e72b21ec082ae06973e86243..f5d71c553ed2e0cfd5d5945144dd476557582b5f 100644
--- a/tensorflow/compiler/xla/layout_test.cc
+++ b/tensorflow/compiler/xla/layout_test.cc
@@ -38,10 +38,13 @@ TEST_F(LayoutTest, ToString) {
             "sparse{123}");
   EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
   EXPECT_EQ(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}).ToString(),
-            "{3,2,1,0}");
+            "{3,2,1,0:T(42,123)(4,5)}");
   EXPECT_EQ(
       Layout({1, 0}, {Tile({2, 55})}).set_element_size_in_bits(42).ToString(),
-      "{1,0}");
+      "{1,0:T(2,55)E(42)}");
+  EXPECT_EQ(
+      Layout({1, 0}, {Tile({-2, 55})}).set_element_size_in_bits(42).ToString(),
+      "{1,0:T(Invalid value -2,55)E(42)}");
 }
 
 TEST_F(LayoutTest, StreamOut) {
@@ -84,6 +87,15 @@ TEST_F(LayoutTest, Equality) {
             Layout().set_format(SPARSE).set_max_sparse_elements(42));
   EXPECT_NE(Layout().set_format(SPARSE).set_max_sparse_elements(42),
             Layout().set_format(SPARSE).set_max_sparse_elements(24));
+
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {Tile({42, 44})}), Layout({0, 1, 2})));
+  EXPECT_TRUE(Layout::Equal().IgnoreTiles()(Layout({0, 1, 2}, {Tile({42, 44})}),
+                                            Layout({0, 1, 2})));
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {}, 32), Layout({0, 1, 2}, {}, 1)));
+  EXPECT_TRUE(Layout::Equal().IgnoreElementSize()(Layout({0, 1, 2}, {}, 32),
+                                                  Layout({0, 1, 2}, {}, 1)));
 }
 
 TEST_F(LayoutTest, LayoutToFromProto) {
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 2fe9b56c6bdffb931726f60ab75081361b43ebb4..62314118ca9713a04cb4e3cf6ad261b966d85f15 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -54,12 +54,24 @@ void SetDefaultLayoutToContainer(std::vector<int64>* minor_to_major) {
 }  // namespace
 
 /* static */ Layout LayoutUtil::MakeLayout(
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   Layout layout;
   layout.set_format(DENSE);
   for (int64 dimension_number : minor_to_major) {
     layout.add_minor_to_major(dimension_number);
   }
+  for (Tile tile : tiles) {
+    for (int64 dim : tile.dimensions()) {
+      if (dim < 0 && dim != Tile::kCombineDimension) {
+        LOG(FATAL) << "Tile dimension size needs to be mininum int64 value if "
+                      "it's negative. Value is "
+                   << dim;
+      }
+    }
+    *layout.add_tiles() = tile;
+  }
+  layout.set_element_size_in_bits(element_size_in_bits);
   return layout;
 }
 
@@ -235,6 +247,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       }
       dimensions_in_layout[dim] = true;
     }
+  } else {
+    if (layout.tiles_size() != 0) {
+      return InvalidArgument("Only dense layouts can be tiled.");
+    }
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 609dba67bcdbcb11be0906b7d87a52a17ba0dfbd..9997aef465daa48ee77050e03d97cde0ea2425cc 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -36,7 +36,9 @@ class LayoutUtil {
  public:
   // Creates a layout with the given minor-to-major dimension order. (This is a
   // convenience function for protobuf construction.)
-  static Layout MakeLayout(absl::Span<const int64> minor_to_major);
+  static Layout MakeLayout(absl::Span<const int64> minor_to_major,
+                           absl::Span<const Tile> tiles = {},
+                           int64 element_size_in_bits = 0);
 
   // Similar to MakeLayout, but take indices in reverse order.
   static Layout MakeLayoutFromMajorToMinor(
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 4cc94c270cd64eb19761cc1044861c7d185b7888..12da214063676717aa075e66aa54974f4cc2b31b 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -317,6 +317,81 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
+TEST_F(LayoutUtilTest, HumanStringWithTiling) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3, 4}, {0, 1, 2});
+  Tile* tile;
+
+  // No tiling.
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape), "f32[2,3,4]{0,1,2}");
+
+  // 2D tile.
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  tile->add_dimensions(1024);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512,1024)}");
+
+  // 1D tile.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512)}");
+
+  // 2 tiles.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 4}, {1, 2, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(16);
+  tile->add_dimensions(256);
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(1);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,4]{1,2,0:T(16,256)(2,1)}");
+
+  // PRED with element size of 8 bits.
+  shape = ShapeUtil::MakeShapeWithLayout(PRED, {8, 8, 8}, {0, 2, 1});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)}");
+
+  // PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)E(32)}");
+
+  // No tile. PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:E(32)}");
+
+  // Tile with negative dimension size for combining dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 1004}, {2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,1004]{2,1,0:T(2,*,128)}");
+
+  // Tile with two negative dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {8, 2, 3, 1004}, {3, 2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[8,2,3,1004]{3,2,1,0:T(2,*,*,128)}");
+}
+
 TEST_F(LayoutUtilTest, ValidateLayout_ValidArrayLayout) {
   Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
   auto status =
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 8600e8752cfbe072407391559d210d0b49bea511..5cd738d0f7769ceac7eb3bdbc5abd3196d9cf99c 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -44,7 +44,6 @@ namespace xla {
 namespace {
 
 using absl::StrCat;
-using absl::StrFormat;
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
@@ -1628,26 +1627,20 @@ bool LiteralBase::IsAllFloat(float value) const {
           return true;
         }
 
-        auto piece_is_all = [&]() {
-          switch (shape().element_type()) {
-            case F32:
-              return AllElementsEqualValue<float>(piece.data<float>(), value);
-            case F64:
-              return AllElementsEqualValue<double>(piece.data<double>(), value);
-            case F16:
-              return AllElementsEqualValue<half>(piece.data<half>(),
-                                                 static_cast<half>(value));
-            case BF16:
-              return AllElementsEqualValue<bfloat16>(
-                  piece.data<bfloat16>(), static_cast<bfloat16>(value));
-            default:
-              return false;
-          }
-        };
-        if (!piece_is_all()) {
-          return false;
+        switch (shape().element_type()) {
+          case F32:
+            return AllElementsEqualValue<float>(piece.data<float>(), value);
+          case F64:
+            return AllElementsEqualValue<double>(piece.data<double>(), value);
+          case F16:
+            return AllElementsEqualValue<half>(piece.data<half>(),
+                                               static_cast<half>(value));
+          case BF16:
+            return AllElementsEqualValue<bfloat16>(
+                piece.data<bfloat16>(), static_cast<bfloat16>(value));
+          default:
+            return false;
         }
-        return true;
       });
 }
 
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index 041151fda1280d6ae7b35d5857ca79788d4f7203..c418be895d6c3faa6a85ca2c73c6f42b0a021104 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -963,6 +963,10 @@ void MutableLiteralBase::AppendSparseElement(
   CHECK(LayoutUtil::IsSparseArray(subshape));
   int64 rank = subshape.rank();
   CHECK_EQ(multi_index.size(), rank);
+  for (int64 i = 0; i < rank; ++i) {
+    CHECK_GE(multi_index[i], 0);
+    CHECK_LT(multi_index[i], subshape.dimensions(i));
+  }
   int64 last_element = p.sparse_indices()->index_count();
   CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout()));
   p.sparse_indices()->Append(multi_index);
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 69efa06d39a7f13e10004bec4470bb0937e73afd..9b3de75dd4e9d495778af86fb8fc07909ab4ba81 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -200,32 +200,26 @@ int64 RecursiveElementCount(const Shape& shape) {
   }
 }
 
-// Returns whether the actual and expected values are mismatched with respect to
-// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+// Returns whether the given value is infinity.
 template <typename NativeT>
-bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-  if (relaxed_nans) {
-    return !std::isnan(expected) && std::isnan(actual);
-  } else {
-    return std::isnan(expected) != std::isnan(actual);
-  }
+bool IsInf(NativeT val) {
+  return std::isinf(val);
 }
 
 template <>
-bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
-  return NanMismatch<float>(static_cast<float>(expected),
-                            static_cast<float>(actual), relaxed_nans);
+bool IsInf<half>(half val) {
+  return std::isinf(static_cast<float>(val));
 }
 
-// Returns whether the given value is infinity.
+// Returns whether the given value is nan.
 template <typename NativeT>
-bool IsInf(NativeT val) {
-  return std::isinf(val);
+float IsNan(NativeT value) {
+  return std::isnan(value);
 }
 
 template <>
-bool IsInf<half>(half val) {
-  return std::isinf(static_cast<float>(val));
+float IsNan(half value) {
+  return IsNan<float>(static_cast<float>(value));
 }
 
 // Converts the given floating-point value to a string.
@@ -376,21 +370,39 @@ class NearComparator {
   // the given literal_index and keeps track of various mismatch statistics.
   template <typename T>
   void CompareValues(T expected, T actual, int64 linear_index) {
-    const bool is_nan_mismatch =
-        NanMismatch(expected, actual, error_.relaxed_nans);
     float abs_error;
     float rel_error;
     if (CompareEqual<T>(expected, actual, {linear_index})) {
       abs_error = 0;
       rel_error = 0;
-    } else if (is_nan_mismatch) {
-      num_nan_mismatches_++;
-      // A nan mismatch is considered to have infinite error. rel_error is used
-      // for sorting a std::set of the top mismatchs, and a nan value here will
-      // result in undefined behavior because nan's do not satisfy the strict
-      // weak ordering requirement of std containers.
-      abs_error = std::numeric_limits<float>::infinity();
-      rel_error = std::numeric_limits<float>::infinity();
+    } else if (IsNan(expected) || IsNan(actual)) {
+      if ((!error_.relaxed_nans && IsNan(expected) != IsNan(actual)) ||
+          (error_.relaxed_nans && !IsNan(expected) && IsNan(actual))) {
+        num_nan_mismatches_++;
+        // A nan mismatch is considered to have infinite error. rel_error is
+        // used for sorting a std::set of the top mismatchs, and a nan value
+        // here will result in undefined behavior because nan's do not satisfy
+        // the strict weak ordering requirement of std containers.
+        abs_error = std::numeric_limits<float>::infinity();
+        rel_error = std::numeric_limits<float>::infinity();
+      } else {
+        abs_error = 0;
+        rel_error = 0;
+      }
+    } else if (IsInf(actual) && !IsInf(expected) && error_.fewer_infs_ok) {
+      // `fewer_infs_ok` gives us the option of comparing as though `actual`
+      // were float_max/min rather than inf.
+      T actual_finite = actual > T{0} ? std::numeric_limits<T>::max()
+                                      : std::numeric_limits<T>::lowest();
+      abs_error = FpAbsoluteValue(actual_finite - expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     } else if (IsInf(expected) || IsInf(actual)) {
       // If either the expected or actual value is infinity but not both,
       // then both absolute and relative error are regarded as inifity.
@@ -410,8 +422,7 @@ class NearComparator {
     }
     const bool is_abs_mismatch = abs_error > error_.abs;
     const bool is_rel_mismatch = rel_error > error_.rel;
-    const bool is_mismatch =
-        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+    const bool is_mismatch = is_abs_mismatch && is_rel_mismatch;
 
     // Update the error of the relative bucket only if the *absolute* error
     // bound is exceeded and vice versa.
@@ -725,7 +736,7 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
 // via recursion. shape_index is the ShapeIndex of expected (or actual)
 // currently being compared.
 Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
-                  const ErrorSpec& error, bool detailed_message,
+                  const ErrorSpec& error, absl::optional<bool> detailed_message,
                   const MiscompareCallback& miscompare_callback,
                   const ShapeIndex& shape_index) {
   TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
@@ -766,30 +777,32 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
+    bool use_detailed_message = detailed_message.value_or(
+        ShapeUtil::ElementsIn(expected.shape()) >= 64);
     switch (expected.shape().element_type()) {
       case BF16:
         return NearComparator<bfloat16>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F16:
         return NearComparator<half>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F32:
         return NearComparator<float>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F64:
         return NearComparator<double>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case C64:
         return NearComparator<complex64>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case C128:
         return NearComparator<complex128>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       default:
         LOG(FATAL) << "Unsupported primitive type in near comparator: "
@@ -880,7 +893,7 @@ Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) {
 }
 
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback) {
   VLOG(1) << "Expected literal:";
   XLA_VLOG_LINES(1, expected.ToString());
diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h
index 9e5bf7c1d062ef0f25d07a80d6ded8106df5dacc..23fff3fa348f1652eaec344da4c40ccf3ad1079a 100644
--- a/tensorflow/compiler/xla/literal_comparison.h
+++ b/tensorflow/compiler/xla/literal_comparison.h
@@ -55,9 +55,10 @@ using MiscompareCallback =
 // being compared.
 //
 // If detailed_message is true, then the error message in the assertion result
-// will contain a more detailed breakdown of mismatches.
+// will contain a more detailed breakdown of mismatches.  By default, we display
+// a detailed message only for "large" inputs.
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback);
 
 // Calling ToString on a literal with over 100 million elements takes around
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index ad1699a1ae65180d56617b069d8b2e1d7d81c38c..bad65ac32018fafcc7634b989f1b4b0867aa5c0d 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/metric_table_report.h"
 
-#include <cctype>
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/core/platform/logging.h"
@@ -249,7 +249,7 @@ string MetricTableReport::MetricString(double metric) {
   string output;
   // Copy leading non-digit characters unconditionally.
   // This picks up the leading sign.
-  while (!sp1.empty() && !isdigit(sp1[0])) {
+  while (!sp1.empty() && !absl::ascii_isdigit(sp1[0])) {
     output.push_back(sp1[0]);
     sp1.remove_prefix(1);
   }
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
index 5b568888d14f21c1330556d017eafba6c8dd2228..e1e22f784172b5f3850f0bc510322dfad9e7f1bb 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -37,7 +38,7 @@ limitations under the License.
 
 namespace xla {
 
-static const char kWS[] = " \t\r\n";           // whitespace
+static const char kWS[] = " \t\r\n";  // whitespace
 
 // The following struct represents an argv[]-style array, parsed
 // from data gleaned from the environment.
@@ -104,7 +105,8 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
     // Set e to the index just past the end of the flag.
     size_t e = b;
     while (e != flag_str.size() && isascii(flag_str[e]) &&
-           (strchr("-_", flag_str[e]) != nullptr || isalnum(flag_str[e]))) {
+           (strchr("-_", flag_str[e]) != nullptr ||
+            absl::ascii_isalnum(flag_str[e]))) {
       e++;
     }
     if (e != flag_str.size() && flag_str[e] == '=' &&
@@ -184,6 +186,14 @@ bool ParseFlagsFromEnvAndDieIfUnknown(
   tensorflow::mutex_lock lock(env_argv_mu);
   auto* env_argv = &EnvArgvs()[string(envvar)];
   SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << "For env var " << envvar << " found arguments:";
+    for (int i = 0; i < env_argv->argc; i++) {
+      VLOG(1) << "  argv[" << i << "] = " << env_argv->argv[i];
+    }
+  }
+
   bool result =
       tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
 
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 3386d2e09758192a32d981f94533b534b7399766..1eedddf72c1d393cb1b88e589881e24de02ad802 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -97,6 +97,21 @@ int BitWidth(PrimitiveType type) {
   }
 }
 
+xla::PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth) {
+  switch (src_bitwidth) {
+    case 8:
+      return xla::U8;
+    case 16:
+      return xla::U16;
+    case 32:
+      return xla::U32;
+    case 64:
+      return xla::U64;
+    default:
+      return xla::PRIMITIVE_TYPE_INVALID;
+  }
+}
+
 PrimitiveType ComplexComponentType(PrimitiveType complex_type) {
   switch (complex_type) {
     case C64:
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index d32505335daa429c459b948d20f387713ac2a1d7..295d353003276b4c1731f7d6a378fd1ae0288d3c 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -151,6 +151,8 @@ bool IsArrayType(PrimitiveType primitive_type);
 // Returns the number of bits in the representation for a given type.
 int BitWidth(PrimitiveType type);
 
+PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth);
+
 // Returns the real, imag component type underlying the given complex type.
 // LOG(FATAL)'s if complex_type is not complex.
 PrimitiveType ComplexComponentType(PrimitiveType complex_type);
diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc
index ac342bf40fbc0052acbb09a346b9d062561ed06b..e476015f94ffdd5225cf75fc845b1e8ba2067ce8 100644
--- a/tensorflow/compiler/xla/protobuf_util.cc
+++ b/tensorflow/compiler/xla/protobuf_util.cc
@@ -38,42 +38,14 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1,
   return (serialized1 == serialized2);
 }
 
-namespace {
-
-std::pair<tensorflow::mutex*, std::vector<std::function<string(string)>>*>
-GetDirectoryExpanders() {
-  static auto* mutex = new tensorflow::mutex;
-  static auto* singleton = new std::vector<std::function<string(string)>>;
-  return {mutex, singleton};
-}
-
-// Runs all the directory expanders over x and returns the result.
-string Expand(string x) {
-  auto pair = GetDirectoryExpanders();
-  tensorflow::mutex_lock lock(*pair.first);
-  for (const auto& f : *pair.second) {
-    x = f(x);
-  }
-  return x;
-}
-
-}  // namespace
-
 Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
                             const string& directory, const string& file_name) {
   tensorflow::Env* env = tensorflow::Env::Default();
-  string expanded_dir = Expand(directory);
-  TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(expanded_dir));
+  TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory));
   string safe_file_name = SanitizeFileName(file_name) + ".pb";
-  const string path = tensorflow::io::JoinPath(expanded_dir, safe_file_name);
+  const string path = tensorflow::io::JoinPath(directory, safe_file_name);
   return tensorflow::WriteBinaryProto(env, path, message);
 }
 
-void RegisterDirectoryExpander(const std::function<string(string)>& expander) {
-  auto pair = GetDirectoryExpanders();
-  tensorflow::mutex_lock lock(*pair.first);
-  pair.second->push_back(expander);
-}
-
 }  // namespace protobuf_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h
index f22fc8b8499dd4a5329276040331a2ed9e89bea9..e20a7e95a63e15fb375f6f6da5b3a75843bc5396 100644
--- a/tensorflow/compiler/xla/protobuf_util.h
+++ b/tensorflow/compiler/xla/protobuf_util.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 
+#include "absl/time/time.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index ddffafa9017a565f01c3214360a958e6840e9148..f84e87d0c0a324a2194f6fe527358c6183e287be 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -3,8 +3,8 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins")
 
 py_library(
     name = "xla_client",
@@ -13,8 +13,6 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":pywrap_xla",
-        "//tensorflow/compiler/xla:xla_data_proto_py",
-        "//tensorflow/compiler/xla/service:hlo_proto_py",
     ],
 )
 
@@ -33,6 +31,7 @@ py_test(
     deps = [
         ":custom_call_for_test",
         ":xla_client",
+        "//tensorflow/compiler/xla:xla_data_proto_py",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -59,10 +58,6 @@ cc_library(
     srcs = ["local_computation_builder.cc"],
     hdrs = ["local_computation_builder.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -74,18 +69,42 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:qr",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
+        "//tensorflow/compiler/xla/client/lib:svd",
+        "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/core:lib",
+        "//third_party/python_runtime:headers",  # buildcleaner: keep
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "xrt",
+    srcs = ["xrt.cc"],
+    hdrs = ["xrt.h"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt/cc:xrt_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/python_runtime:headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -93,11 +112,19 @@ cc_library(
 
 tf_py_wrap_cc(
     name = "pywrap_xla",
-    srcs = ["xla.i"],
+    srcs = [
+        "xla.i",
+    ],
     swig_includes = [
         "local_computation_builder.i",
+        "xla_data.i",
         "//tensorflow/python:platform/base.i",
     ],
+    version_script = select({
+        "//tensorflow:macos": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
     deps = [
         ":local_computation_builder",
         ":numpy_bridge",
@@ -105,7 +132,29 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-    ] + if_cuda_is_configured([
-        "//tensorflow/compiler/xla/service:gpu_plugin",
-    ]),
+    ] + xla_python_default_plugins(),
+)
+
+tf_py_wrap_cc(
+    name = "pywrap_xrt",
+    srcs = [
+        "xrt.i",
+    ],
+    swig_includes = [
+        "xla_data.i",
+        "//tensorflow/python:platform/base.i",
+    ],
+    version_script = select({
+        "//tensorflow:macos": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":numpy_bridge",
+        ":xrt",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+    ],
 )
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index a1bb9d6b67e32552f6fb5d6523b5ba04b590f808..ffbfa7aaffe756ecb5580955822f3c06f1fb80f5 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -20,29 +20,23 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/xla/client/lib/cholesky.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/qr.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+#include "tensorflow/compiler/xla/client/lib/svd.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
-#include "tensorflow/compiler/xrt/xrt.pb.h"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -51,72 +45,6 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// TODO(b/118641336): Factor out XRT parts into a small c++ library of their
-// own.
-
-// TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of
-// device handles instead of needing to set the number of replicas at XLA
-// service initialization time.
-tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
-int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
-LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
-
-string* GetPlatformNameString() {
-  static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) =
-      new string("Host");
-  return platform_name_string;
-}
-
-Status InitializeReplicaCount(int replica_count) {
-  if (replica_count < 1) {
-    return InvalidArgument("Replica count must be >= 1; got %d.",
-                           replica_count);
-  }
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the replica count to %d, but a local XLA service was "
-        "previously created with a replica count of %d.",
-        replica_count, g_replica_count);
-  }
-  g_replica_count = replica_count;
-  return Status::OK();
-}
-
-Status InitializePlatformName(const string& platform_name) {
-  string* g_platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the platform name to %s, but a local XLA service was "
-        "previously created with a platform name of %s.",
-        platform_name, *g_platform_name);
-  }
-  TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status());
-  *g_platform_name = platform_name;
-  return Status::OK();
-}
-
-int GetReplicaCount() {
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  return g_replica_count;
-}
-
-StatusOr<LocalClient*> GetOrCreateLocalClient() {
-  string* platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return g_local_client;
-  }
-  LocalClientOptions options;
-  options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
-  options.set_number_of_replicas(g_replica_count);
-  TF_ASSIGN_OR_RETURN(g_local_client,
-                      ClientLibrary::GetOrCreateLocalClient(options));
-  CHECK(g_local_client != nullptr);
-  return g_local_client;
-}
-
 Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
   const char* name = "xla._CPU_CUSTOM_CALL_TARGET";
   if (!PyCapsule_IsValid(capsule, name)) {
@@ -131,62 +59,66 @@ Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
   return Status::OK();
 }
 
-Status TransferToInfeedLocal(const Literal& literal) {
-  VLOG(1) << "Infeeding literal without replica number; shape: "
-          << literal.shape();
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  return client->TransferToInfeedLocal(literal, /*device_ordinal=*/0);
-}
+LocalClient::LocalClient(xla::LocalClient* client) : client_(client) {}
 
-Status TransferToInfeedLocalReplica(const Literal& literal,
-                                    int replica_number) {
-  VLOG(1) << "Infeeding shape " << literal.shape()
-          << " to replica number: " << replica_number;
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferToInfeedLocal(literal, device_ordinal);
+/* static */ StatusOr<LocalClient> LocalClient::Get(
+    const string& platform_name) {
+  TF_ASSIGN_OR_RETURN(se::Platform * platform,
+                      PlatformUtil::GetPlatform(platform_name));
+  if (platform->VisibleDeviceCount() <= 0) {
+    return InvalidArgument("Platform %s has no visible devices.",
+                           platform_name);
+  }
+  LocalClientOptions options;
+  options.set_platform(platform);
+  TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
+                      ClientLibrary::GetOrCreateLocalClient(options));
+  CHECK(client != nullptr);
+  return LocalClient(client);
 }
 
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number) {
-  VLOG(1) << "Outfeeding literal from replica number: " << replica_number
-          << " shape: " << shape;
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferFromOutfeedLocal(shape, device_ordinal);
+// Returns the number of devices known to the XLA client.
+int LocalClient::DeviceCount() const { return client_->device_count(); }
+
+Status LocalClient::TransferToInfeed(const Literal& literal,
+                                     int device_ordinal) {
+  VLOG(1) << "Infeeding literal to device " << device_ordinal
+          << "; shape: " << literal.shape();
+  return client_->TransferToInfeed(literal, device_ordinal);
 }
 
-static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
-                                             int device_ordinal,
-                                             const Literal& arg) {
-  return client->LiteralToShapedBuffer(arg, device_ordinal,
-                                       client->backend().memory_allocator());
+StatusOr<Literal> LocalClient::TransferFromOutfeed(const Shape& shape,
+                                                   int device_ordinal) {
+  VLOG(1) << "Outfeeding literal from device " << device_ordinal
+          << "; shape: " << shape;
+  return client_->TransferFromOutfeed(&shape, device_ordinal);
 }
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
     const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-    int replica_number) {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
-          << replica_number << "/" << device_ordinal;
+    const LocalClient& client, int device_ordinal) {
+  VLOG(1) << "Creating shaped buffer from literal on device ordinal: "
+          << device_ordinal;
+  auto literal_to_buffer = [&](const Literal& arg) {
+    return client.client()->LiteralToShapedBuffer(
+        arg, device_ordinal, client.client()->backend().memory_allocator());
+  };
+
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
       Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, device_ordinal, relaid);
+      return literal_to_buffer(relaid);
     }
-    return ToBuffer(client, device_ordinal, argument);
+    return literal_to_buffer(argument);
   }();
   TF_RETURN_IF_ERROR(buf.status());
-  return new LocalShapedBuffer(std::move(buf).ValueOrDie());
+  return new LocalShapedBuffer(std::move(buf).ValueOrDie(), client.client());
 }
 
-LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
-    : shaped_buffer_(std::move(shaped_buffer)) {}
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer,
+                                     xla::LocalClient* client)
+    : shaped_buffer_(std::move(shaped_buffer)), client_(client) {}
 
 const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
   return &shaped_buffer_;
@@ -199,8 +131,7 @@ const Shape& LocalShapedBuffer::shape() const {
 }
 
 StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  return client->ShapedBufferToLiteral(*shaped_buffer());
+  return client_->ShapedBufferToLiteral(*shaped_buffer());
 }
 
 LocalShapedBufferTuple::LocalShapedBufferTuple(
@@ -231,140 +162,94 @@ StatusOr<LocalShapedBuffer*> LocalShapedBufferTuple::Release(int i) {
 
 int64 LocalShapedBufferTuple::size() const { return elements_.size(); }
 
-XrtAllocation::XrtAllocation(int64 handle, Shape shape,
-                             const string& session_target)
-    : handle_(handle), shape_(shape), session_target_(session_target) {}
-
-XrtAllocation::~XrtAllocation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
+StatusOr<LocalShapedBufferTuple*> LocalShapedBuffer::DestructureTuple() {
+  const Shape tuple_shape = shape();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
   }
-}
-
-/* static */
-StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
-    const Literal& argument, const string& session_target) {
-  xrt::XLAAllocation alloc;
-  *alloc.mutable_value() = argument.ToProto();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto literal_string =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
-  TF_RETURN_IF_ERROR(root.status());
 
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({literal_string, alloc.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+  DeviceMemoryAllocator* allocator = shaped_buffer()->memory_allocator();
+  ShapedBuffer tuple_buffer = Release();
 
-  int64 handle = outputs[0].scalar<int64>()();
-  return new XrtAllocation(handle, argument.shape(), session_target);
-}
-
-const int64 XrtAllocation::handle() const { return handle_; }
-
-const Shape& XrtAllocation::shape() const { return shape_; }
-
-StatusOr<Literal> XrtAllocation::ToLiteral() const {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
-  TF_RETURN_IF_ERROR(root.status());
+  // Extract some metadata we use to construct scoped buffers.
+  const se::Platform* platform = tuple_buffer.platform();
+  int device_ordinal = tuple_buffer.device_ordinal();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
+  std::vector<LocalShapedBuffer*> results;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    // Create a shaped buffer for this destructured tuple element.
+    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
+    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
+    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
 
-  xla::LiteralProto response;
-  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
-  return Literal::CreateFromProto(response);
-}
+    ShapeUtil::ForEachSubshape(
+        subshape, [&](const Shape& s, const ShapeIndex& index) {
+          ShapeIndex original(index);
+          original.push_front(i);
+          se::DeviceMemoryBase* device_memory =
+              shape_tree.mutable_element(original);
+          shaped_buffer.set_buffer(*device_memory, index);
+          *device_memory = se::DeviceMemoryBase();
+        });
 
-XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
-    : elements_(std::move(elements)) {
-  for (auto* element : elements_) {
-    CHECK(element != nullptr);
+    VLOG(3) << "Completed tuple element: " << i;
+    results.push_back(new LocalShapedBuffer(
+        ScopedShapedBuffer(std::move(shaped_buffer), allocator), client_));
   }
+  // Deallocate the root buffer.
+  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
+  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
+  return new LocalShapedBufferTuple(std::move(results));
 }
 
-XrtAllocationTuple::~XrtAllocationTuple() {
-  for (XrtAllocation* element : elements_) {
-    if (element != nullptr) {
-      delete element;
-    }
-  }
-}
+LocalExecutable::LocalExecutable(
+    std::unique_ptr<xla::LocalExecutable> executable,
+    xla::DeviceAssignment device_assignment, xla::LocalClient* client)
+    : executable_(std::move(executable)),
+      device_assignment_(std::move(device_assignment)),
+      client_(client) {}
 
-StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
-  XrtAllocation* element = elements_[i];
-  if (element == nullptr) {
-    return InvalidArgument("Attempted to release already-released element %d.",
-                           i);
+std::vector<int> LocalExecutable::DeviceOrdinals() const {
+  int num_replicas = device_assignment_.replica_count();
+  std::vector<int> device_ordinals;
+  device_ordinals.reserve(num_replicas);
+  for (int i = 0; i < num_replicas; ++i) {
+    device_ordinals.push_back(device_assignment_(i, 0));
   }
-  elements_[i] = nullptr;
-  return element;
+  return device_ordinals;
 }
 
-int64 XrtAllocationTuple::size() const { return elements_.size(); }
-
-CompiledLocalComputation::CompiledLocalComputation(
-    std::unique_ptr<LocalExecutable> executable)
-    : executable_(std::move(executable)) {}
-
-StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
+StatusOr<LocalShapedBuffer*> LocalExecutable::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  if (num_replicas() != 1) {
+    return InvalidArgument(
+        "Attempted to execute computation with %d replicas using Execute()",
+        num_replicas());
+  }
   StatusOr<ScopedShapedBuffer> result_buffer_status;
-  if (!device_ordinal_status.ok()) {
-    result_buffer_status = device_ordinal_status.status();
-  } else {
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
-    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
-            << device_ordinal;
+  const int device_ordinal = device_assignment_(0, 0);
+  VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+          << device_ordinal;
 
-    std::vector<const ShapedBuffer*> argument_buffers;
-    argument_buffers.reserve(argument_handles.size());
-    for (auto& handle : argument_handles) {
-      argument_buffers.push_back(handle->shaped_buffer());
-    }
-
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(1, /*computation_count=*/1)
-            .ConsumeValueOrDie();
+  std::vector<const ShapedBuffer*> argument_buffers;
+  argument_buffers.reserve(argument_handles.size());
+  for (auto& handle : argument_handles) {
+    argument_buffers.push_back(handle->shaped_buffer());
+  }
 
-    ExecutableRunOptions options;
-    options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
-    options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+  ExecutableRunOptions options;
+  options.set_device_ordinal(device_ordinal);
+  options.set_allocator(client_->backend().memory_allocator());
+  options.set_intra_op_thread_pool(
+      client_->backend().eigen_intra_op_thread_pool_device());
+  options.set_device_assignment(&device_assignment_);
 
-    result_buffer_status = executable_->Run(argument_buffers, options);
-  }
+  result_buffer_status = executable_->Run(argument_buffers, options);
 
   if (!result_buffer_status.ok()) {
     return InternalError(
@@ -372,34 +257,30 @@ StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
         "%s.",
         result_buffer_status.status().ToString());
   }
-  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie(),
+                               client_);
 }
 
-StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+StatusOr<LocalShapedBufferTuple*> LocalExecutable::ExecutePerReplica(
     absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
-  const int num_replicas = GetReplicaCount();
+  const int num_devices = client_->device_count();
 
-  if (argument_handles.size() != num_replicas) {
+  if (argument_handles.size() != num_replicas()) {
     return InvalidArgument(
         "Attempted to execute with %d replicas when replica count is %d",
-        argument_handles.size(), num_replicas);
+        argument_handles.size(), num_devices);
+  }
+  if (argument_handles.size() > num_devices) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when device count is %d",
+        argument_handles.size(), num_devices);
   }
 
-  VLOG(1) << "Executing with " << num_replicas << " replicas.";
-
-  // Each replica populates a StatusOr result, but only the output value of
-  // replica zero is returned.
-  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
-  auto execute = [this, client, num_replicas, &argument_handles,
-                  &results](int replica) {
-    StatusOr<int> device_ordinal_status =
-        client->ReplicaNumberToDeviceOrdinal(replica);
-    if (!device_ordinal_status.ok()) {
-      results[replica] = device_ordinal_status.status();
-      return;
-    }
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
+  VLOG(1) << "Executing with " << num_replicas() << " replicas.";
+
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas());
+  auto execute = [this, &argument_handles, &results](int replica) {
+    const int device_ordinal = device_assignment_(replica, 0);
     VLOG(3) << "Replica " << replica
             << " mapped to device ordinal for execution: " << device_ordinal;
 
@@ -409,41 +290,35 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
       argument_buffers.push_back(handle->shaped_buffer());
     }
 
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(num_replicas, /*computation_count=*/1)
-            .ConsumeValueOrDie();
-
     ExecutableRunOptions options;
     options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
+    options.set_allocator(client_->backend().memory_allocator());
     options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+        client_->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment_);
     StatusOr<ScopedShapedBuffer> result_buffer_status =
         executable_->Run(argument_buffers, options);
 
     results[replica] = std::move(result_buffer_status);
   };
 
-  if (num_replicas == 1) {
+  if (num_replicas() == 1) {
     // Fast-path if there is only one replica — run the computation on the
     // current thread.
     execute(0);
   } else {
     // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        num_replicas - 1);
+                                        num_replicas() - 1);
 
-    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+    for (int replica = 0; replica < num_replicas() - 1; ++replica) {
       pool.Schedule([&execute, replica] { execute(replica); });
     }
-    execute(num_replicas - 1);
+    execute(num_replicas() - 1);
   }
 
-  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
-  for (int replica = 0; replica < num_replicas; ++replica) {
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas());
+  for (int replica = 0; replica < num_replicas(); ++replica) {
     auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
@@ -452,151 +327,43 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
           replica, statusor.status().ToString());
     }
     wrapped_results[replica] =
-        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie(), client_);
   }
 
   return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
-static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation.GetProgramShape());
-  return std::move(*program_shape.mutable_result());
-}
-
-CompiledXrtComputation::CompiledXrtComputation(
-    const ProgramShape& program_shape, int64 handle,
-    const string& session_target)
-    : program_shape_(program_shape),
-      handle_(handle),
-      session_target_(session_target) {}
-
-CompiledXrtComputation::~CompiledXrtComputation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({computation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
-  }
-}
-
-StatusOr<XrtAllocation*> CompiledXrtComputation::Execute(
-    absl::Span<XrtAllocation* const> argument_handles) {
-  const int num_expected_arguments = program_shape().parameters().size();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  std::vector<tensorflow::Output> arguments;
-  arguments.reserve(num_expected_arguments);
-  for (int i = 0; i < num_expected_arguments; ++i) {
-    arguments.push_back(
-        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
-  }
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto execution_config =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
-                                             execution_config, arguments);
-  TF_RETURN_IF_ERROR(root.status());
-
-  TF_RET_CHECK(argument_handles.size() == arguments.size());
-
-  xrt::XRTExecutionConfig e;
-  e.set_release_input_handles(false);
-  e.set_release_compilation_handle(false);
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  for (int i = 0; i < arguments.size(); ++i) {
-    inputs.insert({arguments[i], argument_handles[i]->handle()});
-  }
-  inputs.insert({computation_handle, handle()});
-  inputs.insert({execution_config, e.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
-
-  int64 output = outputs[0].scalar<int64>()();
-  return new XrtAllocation(output, program_shape().result(), session_target_);
-}
-
-const ProgramShape& CompiledXrtComputation::program_shape() const {
-  return program_shape_;
-}
-
-int64 CompiledXrtComputation::handle() const { return handle_; }
-
-LocalComputation::LocalComputation(XlaComputation computation)
+Computation::Computation(XlaComputation computation)
     : computation_(std::move(computation)) {}
 
-StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
+StatusOr<LocalExecutable*> Computation::Compile(
     const std::vector<Shape>& argument_shapes,
-    const ExecutableBuildOptions* build_options) {
+    const ExecutableBuildOptions* build_options, const LocalClient& client) {
   std::vector<const Shape*> argument_shape_pointers;
   argument_shape_pointers.reserve(argument_shapes.size());
   for (auto& argument_shape : argument_shapes) {
     argument_shape_pointers.push_back(&argument_shape);
   }
 
-  TF_ASSIGN_OR_RETURN(LocalClient * client, GetOrCreateLocalClient());
   ExecutableBuildOptions options;
   if (build_options != nullptr) {
     options = *build_options;
   }
   TF_ASSIGN_OR_RETURN(
       auto local_executable,
-      client->Compile(computation_, argument_shape_pointers, options));
-  return new CompiledLocalComputation(std::move(local_executable));
-}
-
-StatusOr<CompiledXrtComputation*> LocalComputation::CompileForXrt(
-    const std::vector<Shape>& argument_shapes, const string& session_target) {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto compile = tensorflow::ops::XRTCompile(root, program);
-  TF_RETURN_IF_ERROR(root.status());
-
-  xrt::XLAComputation c;
-  auto config = c.mutable_config();
-  ProgramShape shapes;
-  for (auto& shape : argument_shapes) {
-    *shapes.add_parameters() = shape;
-  }
-  TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape());
-  LayoutUtil::SetToDefaultLayout(&shapes);
-  *config->mutable_program_shape() = shapes.ToProto();
-  auto snapshot = computation().Snapshot().ValueOrDie();
-  *c.mutable_hlo_snapshot() = *snapshot;
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({program, c.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+      client.client()->Compile(computation_, argument_shape_pointers, options));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      client.client()->backend().computation_placer()->AssignDevices(
+          options.num_replicas(), /*computation_count=*/1));
 
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation().GetProgramShape());
-  int64 handle = outputs[0].scalar<int64>()();
-  return new CompiledXrtComputation(program_shape, handle, session_target);
+  return new LocalExecutable(std::move(local_executable),
+                             std::move(device_assignment), client.client());
 }
 
-const XlaComputation& LocalComputation::computation() const {
-  return computation_;
-}
+const XlaComputation& Computation::computation() const { return computation_; }
 
-string LocalComputation::GetSerializedProto() const {
+string Computation::GetSerializedProto() const {
   string result;
   if (!computation_.proto().SerializeToString(&result)) {
     LOG(ERROR) << "Failed to serialize the HloModuleProto.";
@@ -605,132 +372,173 @@ string LocalComputation::GetSerializedProto() const {
   return result;
 }
 
-StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
-  return swig::GetReturnValueShape(computation_);
+StatusOr<string> Computation::GetHloText() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  HloPrintOptions options;
+  options = HloPrintOptions::ShortParsable();
+  options.set_print_large_constants(false);
+  return hlo_module->ToString(options);
+}
+
+StatusOr<string> Computation::GetHloDotGraph() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  return RenderGraph(*hlo_module->entry_computation(), /*label=*/"",
+                     hlo_module->config().debug_options(),
+                     RenderedGraphFormat::kDot);
+}
+
+StatusOr<ProgramShape> Computation::GetProgramShape() const {
+  return computation_.GetProgramShape();
+}
+
+StatusOr<Shape> Computation::GetReturnValueShape() const {
+  TF_ASSIGN_OR_RETURN(ProgramShape shape, computation_.GetProgramShape());
+  return std::move(*shape.mutable_result());
 }
 
 LocalOp::LocalOp(const XlaOp& op) : op_(op) {}
 
 const XlaOp& LocalOp::op() const { return op_; }
 
-LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
+ComputationBuilder::ComputationBuilder(const string& computation_name)
     : builder_(computation_name) {}
 
-void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
+void ComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
   builder_.SetOpMetadata(metadata);
 }
 
-void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
+void ComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
+StatusOr<Computation*> ComputationBuilder::Build() {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build());
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-LocalOp LocalComputationBuilder::Parameter(int64 parameter_number,
-                                           const Shape& shape,
-                                           const string& name) {
+LocalOp ComputationBuilder::Parameter(int64 parameter_number,
+                                      const Shape& shape, const string& name) {
   return xla::Parameter(&builder_, parameter_number, shape, name);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildWithRoot(
-    const LocalOp& root) {
+StatusOr<Computation*> ComputationBuilder::BuildWithRoot(const LocalOp& root) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build(root.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetShape(const LocalOp& operand) {
+StatusOr<Shape> ComputationBuilder::GetShape(const LocalOp& operand) {
   return builder_.GetShape(operand.op());
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
+StatusOr<Shape> ComputationBuilder::GetReturnValueShape() {
   TF_ASSIGN_OR_RETURN(ProgramShape program_shape, builder_.GetProgramShape());
   return program_shape.result();
 }
 
-LocalOp LocalComputationBuilder::Infeed(const Shape& shape) {
+LocalOp ComputationBuilder::ReplicaId() { return xla::ReplicaId(&builder_); }
+
+LocalOp ComputationBuilder::Infeed(const Shape& shape) {
   return xla::Infeed(&builder_, shape);
 }
 
-void LocalComputationBuilder::Outfeed(const LocalOp& operand,
-                                      const Shape& shape,
-                                      const string& outfeed_config) {
+void ComputationBuilder::Outfeed(const LocalOp& operand, const Shape& shape,
+                                 const string& outfeed_config) {
   xla::Outfeed(operand.op(), shape, outfeed_config);
 }
 
-LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) {
+LocalOp ComputationBuilder::ConstantLiteral(const Literal& literal) {
   return xla::ConstantLiteral(&builder_, literal);
 }
 
-LocalOp LocalComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
+LocalOp ComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
   return xla::Iota(&builder_, element_type, size);
 }
 
-LocalOp LocalComputationBuilder::BroadcastedIota(const Shape& shape,
-                                                 int64 dimension) {
+LocalOp ComputationBuilder::BroadcastedIota(const Shape& shape,
+                                            int64 dimension) {
   return xla::Iota(&builder_, shape, dimension);
 }
 
-LocalOp LocalComputationBuilder::Broadcast(
-    const LocalOp& operand, absl::Span<const int64> broadcast_sizes) {
+LocalOp ComputationBuilder::Broadcast(const LocalOp& operand,
+                                      absl::Span<const int64> broadcast_sizes) {
   return xla::Broadcast(operand.op(), broadcast_sizes);
 }
 
-LocalOp LocalComputationBuilder::BroadcastInDim(
+LocalOp ComputationBuilder::BroadcastInDim(
     const LocalOp& operand, absl::Span<const int64> out_dim_sizes,
     absl::Span<const int64> broadcast_dimensions) {
   return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions);
 }
 
-LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
-                                     const LocalOp& padding_value,
-                                     const PaddingConfig& padding_config) {
+LocalOp ComputationBuilder::Pad(const LocalOp& operand,
+                                const LocalOp& padding_value,
+                                const PaddingConfig& padding_config) {
   return xla::Pad(operand.op(), padding_value.op(), padding_config);
 }
 
-LocalOp LocalComputationBuilder::Reshape(const LocalOp& operand,
-                                         absl::Span<const int64> dimensions,
-                                         absl::Span<const int64> new_sizes) {
+LocalOp ComputationBuilder::Reshape(const LocalOp& operand,
+                                    absl::Span<const int64> dimensions,
+                                    absl::Span<const int64> new_sizes) {
   return xla::Reshape(operand.op(), dimensions, new_sizes);
 }
 
-LocalOp LocalComputationBuilder::Collapse(const LocalOp& operand,
-                                          absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Collapse(const LocalOp& operand,
+                                     absl::Span<const int64> dimensions) {
   return xla::Collapse(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) {
-  return xla::CrossReplicaSum(operand.op());
+LocalOp ComputationBuilder::AllToAll(
+    const LocalOp& operand, int64 split_dimension, int64 concat_dimension,
+    int64 split_count, absl::Span<const ReplicaGroup> replica_groups) {
+  std::vector<ReplicaGroup> rg;
+  rg.reserve(replica_groups.size());
+  for (int i = 0; i < replica_groups.size(); ++i) {
+    rg.push_back(replica_groups[i]);
+  }
+  return xla::AllToAll(operand.op(), split_dimension, concat_dimension,
+                       split_count, rg);
+}
+
+LocalOp ComputationBuilder::CrossReplicaSum(
+    const LocalOp& operand, absl::Span<const ReplicaGroup> replica_groups) {
+  return xla::CrossReplicaSum(operand.op(), replica_groups);
 }
 
-LocalOp LocalComputationBuilder::Slice(const LocalOp& operand,
-                                       absl::Span<const int64> start_indices,
-                                       absl::Span<const int64> limit_indices,
-                                       absl::Span<const int64> strides) {
+LocalOp ComputationBuilder::Slice(const LocalOp& operand,
+                                  absl::Span<const int64> start_indices,
+                                  absl::Span<const int64> limit_indices,
+                                  absl::Span<const int64> strides) {
   return xla::Slice(operand.op(), start_indices, limit_indices, strides);
 }
 
-LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand,
-                                            int64 start_index,
-                                            int64 limit_index, int64 stride,
-                                            int64 dimno) {
+LocalOp ComputationBuilder::SliceInDim(const LocalOp& operand,
+                                       int64 start_index, int64 limit_index,
+                                       int64 stride, int64 dimno) {
   return xla::SliceInDim(operand.op(), start_index, limit_index, stride, dimno);
 }
 
-LocalOp LocalComputationBuilder::DynamicSlice(
-    const LocalOp& operand, const LocalOp& start_indices,
-    absl::Span<const int64> slice_sizes) {
+LocalOp ComputationBuilder::DynamicSlice(const LocalOp& operand,
+                                         const LocalOp& start_indices,
+                                         absl::Span<const int64> slice_sizes) {
   return xla::DynamicSlice(operand.op(), start_indices.op(), slice_sizes);
 }
 
-LocalOp LocalComputationBuilder::DynamicUpdateSlice(
-    const LocalOp& operand, const LocalOp& update,
-    const LocalOp& start_indices) {
+LocalOp ComputationBuilder::DynamicUpdateSlice(const LocalOp& operand,
+                                               const LocalOp& update,
+                                               const LocalOp& start_indices) {
   return xla::DynamicUpdateSlice(operand.op(), update.op(), start_indices.op());
 }
 
-LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
-                                             int64 dimension) {
+LocalOp ComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
+                                        int64 dimension) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -739,18 +547,18 @@ LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
   return xla::ConcatInDim(&builder_, xla_ops, dimension);
 }
 
-LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
-    const LocalOp& operand, const LocalComputation& select,
+LocalOp ComputationBuilder::SelectAndScatterWithGeneralPadding(
+    const LocalOp& operand, const Computation& select,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding, const LocalOp& source,
-    const LocalOp& init_value, const LocalComputation& scatter) {
+    const LocalOp& init_value, const Computation& scatter) {
   return xla::SelectAndScatterWithGeneralPadding(
       operand.op(), select.computation(), window_dimensions, window_strides,
       padding, source.op(), init_value.op(), scatter.computation());
 }
 
-LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
+LocalOp ComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(elements.size());
   for (const auto& op : elements) {
@@ -760,22 +568,22 @@ LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   return xla::Tuple(&builder_, xla_ops);
 }
 
-LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
-                                                 int64 index) {
+LocalOp ComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
+                                            int64 index) {
   return xla::GetTupleElement(tuple_data.op(), index);
 }
 
-LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
+LocalOp ComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
   return xla::Dot(lhs.op(), rhs.op());
 }
 
-LocalOp LocalComputationBuilder::DotGeneral(
+LocalOp ComputationBuilder::DotGeneral(
     const LocalOp& lhs, const LocalOp& rhs,
     const DotDimensionNumbers& dimension_numbers) {
   return xla::DotGeneral(lhs.op(), rhs.op(), dimension_numbers);
 }
 
-LocalOp LocalComputationBuilder::ConvGeneralDilated(
+LocalOp ComputationBuilder::ConvGeneralDilated(
     const LocalOp& lhs, const LocalOp& rhs,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding,
@@ -787,18 +595,18 @@ LocalOp LocalComputationBuilder::ConvGeneralDilated(
                                  feature_group_count);
 }
 
-LocalOp LocalComputationBuilder::ConvertElementType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::ConvertElementType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::ConvertElementType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::BitcastConvertType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::BitcastConvertType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::BitcastConvertType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
-                                      absl::Span<const LocalOp> operands) {
+LocalOp ComputationBuilder::Call(const Computation& local_computation,
+                                 absl::Span<const LocalOp> operands) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -807,7 +615,7 @@ LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
   return xla::Call(&builder_, local_computation.computation(), xla_ops);
 }
 
-LocalOp LocalComputationBuilder::CustomCall(
+LocalOp ComputationBuilder::CustomCall(
     const string& call_target_name, absl::Span<const LocalOp> operands,
     const Shape& shape_with_layout,
     const std::vector<Shape>& operand_shapes_with_layout,
@@ -822,19 +630,19 @@ LocalOp LocalComputationBuilder::CustomCall(
                                    operand_shapes_with_layout, opaque);
 }
 
-LocalOp LocalComputationBuilder::Transpose(
-    const LocalOp& operand, absl::Span<const int64> permutation) {
+LocalOp ComputationBuilder::Transpose(const LocalOp& operand,
+                                      absl::Span<const int64> permutation) {
   return xla::Transpose(operand.op(), permutation);
 }
 
-LocalOp LocalComputationBuilder::Rev(const LocalOp& operand,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Rev(const LocalOp& operand,
+                                absl::Span<const int64> dimensions) {
   return xla::Rev(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
-                                     const LocalComputation& local_computation,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Map(absl::Span<const LocalOp> operands,
+                                const Computation& local_computation,
+                                absl::Span<const int64> dimensions) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -845,17 +653,17 @@ LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
                   dimensions);
 }
 
-LocalOp LocalComputationBuilder::Reduce(
+LocalOp ComputationBuilder::Reduce(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> dimensions_to_reduce) {
   return xla::Reduce(operand.op(), init_value.op(),
                      local_computation.computation(), dimensions_to_reduce);
 }
 
-LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
+LocalOp ComputationBuilder::ReduceWindowWithGeneralPadding(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const int64> base_dilations,
@@ -867,51 +675,50 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
       padding);
 }
 
-LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
-                                           const LocalOp& sigma,
-                                           const Shape& shape) {
+LocalOp ComputationBuilder::RngNormal(const LocalOp& mu, const LocalOp& sigma,
+                                      const Shape& shape) {
   return xla::RngNormal(mu.op(), sigma.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
-                                            const Shape& shape) {
+LocalOp ComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
+                                       const Shape& shape) {
   return xla::RngUniform(a.op(), b.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::While(const LocalComputation& condition,
-                                       const LocalComputation& body,
-                                       const LocalOp& init) {
+LocalOp ComputationBuilder::While(const Computation& condition,
+                                  const Computation& body,
+                                  const LocalOp& init) {
   return xla::While(condition.computation(), body.computation(), init.op());
 }
 
-LocalOp LocalComputationBuilder::Conditional(
-    const LocalOp& predicate, const LocalOp& true_operand,
-    const LocalComputation& true_computation, const LocalOp& false_operand,
-    const LocalComputation& false_computation) {
+LocalOp ComputationBuilder::Conditional(const LocalOp& predicate,
+                                        const LocalOp& true_operand,
+                                        const Computation& true_computation,
+                                        const LocalOp& false_operand,
+                                        const Computation& false_computation) {
   return xla::Conditional(predicate.op(), true_operand.op(),
                           true_computation.computation(), false_operand.op(),
                           false_computation.computation());
 }
 
-StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
+StatusOr<bool> ComputationBuilder::IsConstant(const LocalOp& operand) {
   return builder_.IsConstant(operand.op());
 }
 
-LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
+LocalOp ComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
   return xla::Sort(operand.op(), {}, dimension);
 }
 
-LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
-                                            const LocalOp& values,
-                                            int64 dimension) {
+LocalOp ComputationBuilder::SortKeyVal(const LocalOp& keys,
+                                       const LocalOp& values, int64 dimension) {
   return xla::Sort(keys.op(), {values.op()}, dimension);
 }
 
-LocalOp LocalComputationBuilder::Cholesky(const LocalOp& a) {
-  return xla::Cholesky(a.op());
+LocalOp ComputationBuilder::Cholesky(const LocalOp& a, bool lower) {
+  return xla::Cholesky(a.op(), lower);
 }
 
-LocalOp LocalComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
+LocalOp ComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
   XlaBuilder* builder = a.op().builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(auto qr, xla::QRDecomposition(a.op(), full_matrices));
@@ -919,16 +726,32 @@ LocalOp LocalComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
   });
 }
 
-LocalOp LocalComputationBuilder::TriangularSolve(const LocalOp& a,
-                                                 const LocalOp& b,
-                                                 bool left_side, bool lower,
-                                                 bool transpose_a,
-                                                 bool conjugate_a) {
-  return xla::TriangularSolve(a.op(), b.op(), left_side, lower, transpose_a,
-                              conjugate_a);
+LocalOp ComputationBuilder::Eigh(const LocalOp& a, bool lower) {
+  XlaBuilder* builder = a.op().builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    auto eigh = xla::SelfAdjointEig(a.op(), lower);
+    return xla::Tuple(builder, {eigh.v, eigh.w});
+  });
 }
 
-LocalOp LocalComputationBuilder::Gather(
+LocalOp ComputationBuilder::SVD(const LocalOp& a) {
+  XlaBuilder* builder = a.op().builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    auto svd = xla::SVD(a.op());
+    return xla::Tuple(builder, {svd.u, svd.d, svd.v});
+  });
+}
+
+LocalOp ComputationBuilder::TriangularSolve(const LocalOp& a, const LocalOp& b,
+                                            bool left_side, bool lower,
+                                            bool unit_diagonal,
+                                            int transpose_a) {
+  return xla::TriangularSolve(
+      a.op(), b.op(), left_side, lower, unit_diagonal,
+      xla::TriangularSolveOptions::Transpose(transpose_a));
+}
+
+LocalOp ComputationBuilder::Gather(
     const LocalOp& input, const LocalOp& start_indices,
     const GatherDimensionNumbers& dimension_numbers,
     absl::Span<const int64> slice_sizes) {
@@ -936,24 +759,24 @@ LocalOp LocalComputationBuilder::Gather(
                      slice_sizes);
 }
 
-LocalOp LocalComputationBuilder::Scatter(
+LocalOp ComputationBuilder::Scatter(
     const LocalOp& input, const LocalOp& scatter_indices,
-    const LocalOp& updates, const LocalComputation& update_computation,
+    const LocalOp& updates, const Computation& update_computation,
     const ScatterDimensionNumbers& dimension_numbers) {
   return xla::Scatter(input.op(), scatter_indices.op(), updates.op(),
                       update_computation.computation(), dimension_numbers);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
+StatusOr<Computation*> ComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
                       builder_.BuildConstantSubGraph(operand.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-#define _FORWARD(method_name, return_sig, args_sig, args)    \
-  return_sig LocalComputationBuilder::method_name args_sig { \
-    return xla::method_name args;                            \
+#define _FORWARD(method_name, return_sig, args_sig, args) \
+  return_sig ComputationBuilder::method_name args_sig {   \
+    return xla::method_name args;                         \
   }
 
 #define _FORWARD_UNOP(method_name) \
@@ -995,6 +818,7 @@ _FORWARD_BINOP(Atan2)
 _FORWARD_BINOP(Pow)
 _FORWARD_BINOP(Complex)
 _FORWARD_UNOP(Not)
+_FORWARD_UNOP(Clz)
 _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
 _FORWARD_UNOP(Expm1)
@@ -1040,108 +864,9 @@ void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
 }
 
-void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
-
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation) {
-  delete computation;
-}
-
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation) {
-  delete computation;
-}
-
-void DeleteLocalComputation(LocalComputation* computation) {
-  delete computation;
-}
-
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer) {
-  const Shape tuple_shape = local_shaped_buffer->shape();
+void DeleteLocalExecutable(LocalExecutable* computation) { delete computation; }
 
-  if (!tuple_shape.IsTuple()) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
-
-  DeviceMemoryAllocator* allocator =
-      local_shaped_buffer->shaped_buffer()->memory_allocator();
-  ShapedBuffer tuple_buffer = local_shaped_buffer->Release();
-
-  // Extract some metadata we use to construct scoped buffers.
-  const se::Platform* platform = tuple_buffer.platform();
-  int device_ordinal = tuple_buffer.device_ordinal();
-
-  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
-  std::vector<LocalShapedBuffer*> results;
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    // Create a shaped buffer for this destructured tuple element.
-    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
-    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
-    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
-
-    ShapeUtil::ForEachSubshape(
-        subshape, [&](const Shape& s, const ShapeIndex& index) {
-          ShapeIndex original(index);
-          original.push_front(i);
-          se::DeviceMemoryBase* device_memory =
-              shape_tree.mutable_element(original);
-          shaped_buffer.set_buffer(*device_memory, index);
-          *device_memory = se::DeviceMemoryBase();
-        });
-
-    VLOG(3) << "Completed tuple element: " << i;
-    results.push_back(new LocalShapedBuffer(
-        ScopedShapedBuffer(std::move(shaped_buffer), allocator)));
-  }
-  // Deallocate the root buffer.
-  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
-  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
-  return new LocalShapedBufferTuple(std::move(results));
-}
-
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target) {
-  const Shape& tuple_shape = allocation->shape();
-
-  if (!tuple_shape.IsTuple()) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
-  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
-  TF_RETURN_IF_ERROR(root.status());
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  std::vector<XrtAllocation*> results;
-  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    inputs.clear();
-    inputs.insert({base_handle, allocation->handle()});
-    inputs.insert({shape_index, {i}});
-    std::vector<tensorflow::Tensor> outputs;
-    auto status = session.Run(inputs, {subtuple}, &outputs);
-    if (!status.ok()) {
-      // Clean up before returning non-ok status.
-      for (int j = 0; j < results.size(); ++j) {
-        delete results[j];
-      }
-      return status;
-    }
-    const int64 subtuple_handle = outputs[0].scalar<int64>()();
-    const Shape& subtuple_shape =
-        ShapeUtil::GetTupleElementShape(tuple_shape, i);
-    results.push_back(
-        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
-  }
-  return new XrtAllocationTuple(std::move(results));
-}
+void DeleteComputation(Computation* computation) { delete computation; }
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 98759cf984751d2cef8df4449d392ace786a8ebc..5046c1ec011032cb7166c281a297388a8e02c4e8 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -22,9 +22,6 @@ limitations under the License.
 #include <Python.h>
 
 #include "absl/types/span.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -35,42 +32,42 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// Initializes the number of replicas that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializeReplicaCount(int replica_count);
-
-// Initializes the platform name that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializePlatformName(const string& platform_name);
-
-// Returns the replica count that is currently set, regardless of whether the
-// local XLA service has been instantiated yet or not.
-int GetReplicaCount();
-
 // Registers a 'fn_capsule' as a CPU custom call target.
 // 'fn_capsule' is a void* pointer encapsulated in a PyCapsule object, with name
 // "xla._CPU_CUSTOM_CALL_TARGET".
 Status RegisterCpuCustomCallTarget(const string& name, PyObject* fn_capsule);
 
-// Wraps the local client's infeed-transfer function.
-//
-// The default device ordinal (0) is used.
-Status TransferToInfeedLocal(const Literal& literal);
+// Wrapper around an xla::LocalClient.
+class LocalClient {
+ public:
+  // Initializes a local XLA client for `platform_name`. Returns an error if no
+  /// such platform exists, or if the platform has no visible devices.
+  static StatusOr<LocalClient> Get(const string& platform_name);
+
+  // Copyable and moveable; the class is just a wrapper around a
+  // xla::LocalClient pointer for convenient SWIG wrapping.
+
+  // Returns the number of devices known to the XLA client.
+  int DeviceCount() const;
+
+  // Wraps the local client's infeed-transfer function.
+  //
+  // The default device ordinal (0) is used.
+  Status TransferToInfeed(const Literal& literal, int device_ordinal);
 
-// Transfers the given literal to the infeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-Status TransferToInfeedLocalReplica(const Literal& literal, int replica_number);
+  // Transfers a literal of the given shape from the outfeed of the given
+  // replica.
+  StatusOr<Literal> TransferFromOutfeed(const Shape& shape, int device_ordinal);
 
-// Transfers a literal of the given shape from the outfeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number);
+  xla::LocalClient* client() const { return client_; }
+
+ private:
+  LocalClient(xla::LocalClient* client);
+
+  xla::LocalClient* client_;
+};
+
+class LocalShapedBufferTuple;
 
 // Represents a reference to literals that live in a device-allocated buffer via
 // XLA. Specifically, wraps a ScopedShapedBuffer produced by transferring a
@@ -79,9 +76,9 @@ class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
       const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-      int replica_number);
+      const LocalClient& client, int device_ordinal);
 
-  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer, xla::LocalClient* client);
   StatusOr<Literal> ToLiteral() const;
   const Shape& shape() const;
   const ScopedShapedBuffer* shaped_buffer() const;
@@ -90,8 +87,13 @@ class LocalShapedBuffer {
   // analogous to std::unique_ptr::release().
   ShapedBuffer Release();
 
+  // Destructures a tuple-valued LocalShapedBuffer into its constituent
+  // elements in LocalShapedBufferTuple form.
+  StatusOr<LocalShapedBufferTuple*> DestructureTuple();
+
  private:
   ScopedShapedBuffer shaped_buffer_;
+  xla::LocalClient* client_;
 };
 
 // Result of a tuple destructuring operation on a LocalShapedBuffer -- this
@@ -117,68 +119,20 @@ class LocalShapedBufferTuple {
   std::vector<LocalShapedBuffer*> elements_;
 };
 
-// Destructures a tuple-valued LocalShapedBuffer into its constitutent elements
-// in LocalShapedBufferTuple form.
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer);
-
-// Represents a reference to literals that live in a device-allocated buffer via
-// XRT. Specifically, wraps an int64 handle produced by running the allocation
-// graph, and an XLA shape to track the referent's shape.
-class XrtAllocation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which allocation and deallocation
-  // graphs are run.
-  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
-                                              const string& session_target);
-
-  XrtAllocation(int64 handle, Shape shape, const string& session_target);
-  ~XrtAllocation();
-  StatusOr<Literal> ToLiteral() const;
-  const Shape& shape() const;
-  const int64 handle() const;
-
- private:
-  const int64 handle_;
-  const Shape shape_;
-  const string session_target_;
-};
-
-// Result of a tuple destructuring operation on an XrtAllocation.
-class XrtAllocationTuple {
- public:
-  // Note: any XrtAllocation elements that are not Release()'d will be
-  // deallocated in the destructor.
-  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
-
-  ~XrtAllocationTuple();
-
-  // Releases the ith element to the caller. Further attempts to release the ith
-  // element will return an invalid argument error.
-  StatusOr<XrtAllocation*> Release(int i);
-
-  // Returns the number of elements in the destructured tuple.
-  int64 size() const;
-
- private:
-  std::vector<XrtAllocation*> elements_;
-};
-
-// Destructures a tuple-valued XrtAllocation into its constitutent elements
-// in XrtAllocationTuple form.
-//
-// Accepts a `session_target` argument, used in constructing the
-// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
-// and passed along in constructing each constituent XrtAllocation.
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target);
-
 // Represents a compiled computation that can be executed given handles to
 // device-allocated literals. Specifically, wraps an XLA LocalExecutable.
-class CompiledLocalComputation {
+class LocalExecutable {
  public:
-  CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+  LocalExecutable(std::unique_ptr<xla::LocalExecutable> executable,
+                  xla::DeviceAssignment device_assignment,
+                  xla::LocalClient* client);
+
+  int num_replicas() const {
+    return executable_->build_options().num_replicas();
+  }
+
+  // Returns the device ordinals to which each replica is assigned.
+  std::vector<int> DeviceOrdinals() const;
 
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
@@ -190,47 +144,22 @@ class CompiledLocalComputation {
       absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
 
  private:
-  std::unique_ptr<LocalExecutable> executable_;
+  const std::unique_ptr<xla::LocalExecutable> executable_;
+  const xla::DeviceAssignment device_assignment_;
+  xla::LocalClient* const client_;
 };
 
-// Represents a compiled computation that can be executed given handles to
-// device-allocated literals. Specifically, wraps an XRT computation handle.
-class CompiledXrtComputation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the execution graph is run.
-  CompiledXrtComputation(const ProgramShape& program_shape, int64 handle,
-                         const string& session_target);
-  ~CompiledXrtComputation();
-
-  StatusOr<XrtAllocation*> Execute(
-      absl::Span<XrtAllocation* const> argument_handles);
-
-  const ProgramShape& program_shape() const;
-  int64 handle() const;
-
- private:
-  const ProgramShape program_shape_;
-  const int64 handle_;
-  const string session_target_;
-};
-
-// Wraps a XlaComputation produced by a LocalComputationBuilder. The
+// Wraps a XlaComputation produced by a ComputationBuilder. The
 // Compile method compiles the computation to a (local) executable via
 // the client library's local client. This class is intended to be
 // made available to Python via SWIG.
-class LocalComputation {
+class Computation {
  public:
-  LocalComputation(XlaComputation computation);
+  Computation(XlaComputation computation);
 
-  StatusOr<CompiledLocalComputation*> Compile(
+  StatusOr<LocalExecutable*> Compile(
       const std::vector<Shape>& argument_shapes,
-      const ExecutableBuildOptions* build_options);
-
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the compilation graph is run.
-  StatusOr<CompiledXrtComputation*> CompileForXrt(
-      const std::vector<Shape>& argument_shapes, const string& session_target);
+      const ExecutableBuildOptions* build_options, const LocalClient& client);
 
   const XlaComputation& computation() const;
 
@@ -239,6 +168,15 @@ class LocalComputation {
   // string on failure.
   string GetSerializedProto() const;
 
+  // Returns the computation in human-readable HLO text format.
+  StatusOr<string> GetHloText() const;
+
+  // Returns the computation in graphviz dot format.
+  StatusOr<string> GetHloDotGraph() const;
+
+  // Returns the program shape for this computation.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
   // Returns the return-value shape for this computation.
   StatusOr<Shape> GetReturnValueShape() const;
 
@@ -246,7 +184,7 @@ class LocalComputation {
   XlaComputation computation_;
 };
 
-// Wraps a XlaOp produced by a LocalComputationBuilder. This class is intended
+// Wraps a XlaOp produced by a ComputationBuilder. This class is intended
 // to be made available to Python via SWIG.
 class LocalOp {
  public:
@@ -263,20 +201,20 @@ class LocalOp {
 //   Python.
 // - Set up the underlying builder to use the client library's
 //   LocalClient.
-// - Wrap Computations in LocalComputations for Python access.
-// - Correspondingly unwrap incoming LocalComputations.
-class LocalComputationBuilder {
+// - Wrap Computations in Computations for Python access.
+// - Correspondingly unwrap incoming Computations.
+class ComputationBuilder {
  public:
-  LocalComputationBuilder(const string& computation_name);
+  ComputationBuilder(const string& computation_name);
 
   void SetOpMetadata(const OpMetadata& metadata);
   void ClearOpMetadata();
 
-  // Returns an owned LocalComputation to the caller on success.
-  StatusOr<LocalComputation*> Build();
+  // Returns an owned Computation to the caller on success.
+  StatusOr<Computation*> Build();
 
-  // Returns an owned LocalComputation to the caller on success with given root.
-  StatusOr<LocalComputation*> BuildWithRoot(const LocalOp& root);
+  // Returns an owned Computation to the caller on success with given root.
+  StatusOr<Computation*> BuildWithRoot(const LocalOp& root);
 
   LocalOp Parameter(int64 parameter_number, const Shape& shape,
                     const string& name);
@@ -286,6 +224,8 @@ class LocalComputationBuilder {
   // Returns the shape of the current return value for the computation.
   StatusOr<Shape> GetReturnValueShape();
 
+  LocalOp ReplicaId();
+
   LocalOp Infeed(const Shape& shape);
 
   void Outfeed(const LocalOp& operand, const Shape& shape,
@@ -312,7 +252,12 @@ class LocalComputationBuilder {
 
   LocalOp Collapse(const LocalOp& operand, absl::Span<const int64> dimensions);
 
-  LocalOp CrossReplicaSum(const LocalOp& operand);
+  LocalOp AllToAll(const LocalOp& operand, int64 split_dimension,
+                   int64 concat_dimension, int64 split_count,
+                   absl::Span<const ReplicaGroup> replica_groups);
+
+  LocalOp CrossReplicaSum(const LocalOp& operand,
+                          absl::Span<const ReplicaGroup> replica_groups);
 
   LocalOp Slice(const LocalOp& operand, absl::Span<const int64> start_indices,
                 absl::Span<const int64> limit_indices,
@@ -330,11 +275,11 @@ class LocalComputationBuilder {
   LocalOp ConcatInDim(absl::Span<const LocalOp> operands, int64 dimension);
 
   LocalOp SelectAndScatterWithGeneralPadding(
-      const LocalOp& operand, const LocalComputation& select,
+      const LocalOp& operand, const Computation& select,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64> > padding, const LocalOp& source,
-      const LocalOp& init_value, const LocalComputation& scatter);
+      const LocalOp& init_value, const Computation& scatter);
 
   LocalOp Tuple(absl::Span<const LocalOp> elements);
 
@@ -360,7 +305,7 @@ class LocalComputationBuilder {
   LocalOp BitcastConvertType(const LocalOp& operand,
                              PrimitiveType new_element_type);
 
-  LocalOp Call(const LocalComputation& local_computation,
+  LocalOp Call(const Computation& local_computation,
                absl::Span<const LocalOp> operands);
 
   LocalOp CustomCall(const string& call_target_name,
@@ -375,16 +320,16 @@ class LocalComputationBuilder {
   LocalOp Rev(const LocalOp& operand, absl::Span<const int64> dimensions);
 
   LocalOp Map(absl::Span<const LocalOp> operands,
-              const LocalComputation& local_computation,
+              const Computation& local_computation,
               absl::Span<const int64> dimensions);
 
   LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value,
-                 const LocalComputation& local_computation,
+                 const Computation& local_computation,
                  absl::Span<const int64> dimensions_to_reduce);
 
   LocalOp ReduceWindowWithGeneralPadding(
       const LocalOp& operand, const LocalOp& init_value,
-      const LocalComputation& local_computation,
+      const Computation& local_computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const int64> base_dilations,
@@ -396,13 +341,13 @@ class LocalComputationBuilder {
 
   LocalOp RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape);
 
-  LocalOp While(const LocalComputation& condition, const LocalComputation& body,
+  LocalOp While(const Computation& condition, const Computation& body,
                 const LocalOp& init);
 
   LocalOp Conditional(const LocalOp& predicate, const LocalOp& true_operand,
-                      const LocalComputation& true_computation,
+                      const Computation& true_computation,
                       const LocalOp& false_operand,
-                      const LocalComputation& false_computation);
+                      const Computation& false_computation);
 
   StatusOr<bool> IsConstant(const LocalOp& operand);
 
@@ -413,21 +358,27 @@ class LocalComputationBuilder {
 
   LocalOp QR(const LocalOp& a, bool full_matrices);
 
-  LocalOp Cholesky(const LocalOp& a);
+  LocalOp Cholesky(const LocalOp& a, bool lower);
+
+  LocalOp Eigh(const LocalOp& a, bool lower);
+
+  LocalOp SVD(const LocalOp& a);
 
+  // `transpose_a` is the integer value of a TriangularSolveOptions::Transpose
+  // enum. We use an integer here so we don't have to teach SWIG about the
+  // enum.
   LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
-                          bool lower, bool transpose_a, bool conjugate_a);
+                          bool lower, bool unit_diagonal, int transpose_a);
 
   LocalOp Gather(const LocalOp& input, const LocalOp& start_indices,
                  const GatherDimensionNumbers& dimension_numbers,
                  absl::Span<const int64> slice_sizes);
 
   LocalOp Scatter(const LocalOp& input, const LocalOp& scatter_indices,
-                  const LocalOp& updates,
-                  const LocalComputation& update_computation,
+                  const LocalOp& updates, const Computation& update_computation,
                   const ScatterDimensionNumbers& dimension_numbers);
 
-  StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
+  StatusOr<Computation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
@@ -469,6 +420,7 @@ class LocalComputationBuilder {
   _FORWARD_BINOP(Pow)
   _FORWARD_BINOP(Complex)
   _FORWARD_UNOP(Not)
+  _FORWARD_UNOP(Clz)
   _FORWARD_UNOP(Abs)
   _FORWARD_UNOP(Exp)
   _FORWARD_UNOP(Expm1)
@@ -516,10 +468,8 @@ class LocalComputationBuilder {
 
 // Functions for freeing resources from the Python side.
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer);
-void DeleteXrtAllocation(XrtAllocation* allocation);
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation);
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation);
-void DeleteLocalComputation(LocalComputation* computation);
+void DeleteLocalExecutable(LocalExecutable* computation);
+void DeleteComputation(Computation* computation);
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 39bff3f5719fd6b2ee1856848ade7152a8e7ee3e..a1a4f007f2fc9ce730f9c7fd11dadae85250edb5 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -16,89 +16,6 @@ limitations under the License.
 // SWIG typemaps and declarations for building, compiling, and
 // executing XLA computations, wrapping most of what is declared in
 // local_computation_builder.h.
-//
-// The typemaps below implement/assert the following correspondences
-// (with elaborations below):
-//
-//    C++                                  Python
-// -------------------------------------+---------------------------------------
-//  Span<int64>                        <-  sequence of int
-//  Span<LocalOp>                      <-  sequence of LocalOp
-//  Literal                            <-> (nested tuple of) numpy ndarray
-//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
-//  Shape                               -> pair holding (dtype, dimensions)
-//                                     <-  object duck-typed as xla_client.Shape
-//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
-//  PrimitiveType                      <-  int
-//  Span<pair<int64, in64>>            <-  sequence of int pairs
-//  PaddingConfig proto                <-  corresponding Python proto
-//  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
-//  DotDimensionNumbers proto          <-  corresponding Python proto
-//  GatherDimensionNumbers proto       <-  corresponding Python proto
-//  ScatterDimensionNumbers proto      <-  corresponding Python proto
-//
-// Arrows indicate whether a conversion only ever occurs in one
-// direction, or whether it is maintained bidirectionally.
-//
-// The Python objects corresponding to C++ Literals have the type:
-//
-//   T = ndarray | (T, ...)
-//
-// where a terminal numpy ndarray translates to a Literal with a
-// non-tuple Shape, an XLA primitive element type corresponding to the
-// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
-// to a tuple-shaped Literal whose tuple components are translated
-// recursively. For example, if x is a numpy ndarray in Python, with
-// shape (2, 3) and dtype of dtype('float32'), then x translates to a
-// Literal with rank 2, dimension 2 and 3, and XLA primitive type
-// F32. Meanwhile,
-//
-//   (x, (x, x), (x,)),
-//
-// translates to a tuple-shaped XLA Literal, whose component subshapes
-// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
-//
-// Shapes output by C++ become Python objects with the type:
-//
-//   T            = (dtype, S)
-//   S            = DIMENSIONS | TUPLE_SHAPES
-//   DIMENSIONS   = (int, ...)
-//   TUPLE_SHAPES = (T, ...)
-//
-// In the pair described by the T rule, the terminal dtype determines
-// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
-// dtype('O'), numpy's object dtype, the structure represents a tuple
-// shape and the expansion of the non-terminal S is
-// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
-// and S expands into DIMENSIONS giving dimension sizes. For example:
-//
-//   (dtype('float32'), (3, 5, 7))
-//
-// describes a 3x5x7 array of F32s, and
-//
-//   (dtype('O'), ((dtype('float32'), (2, 3)),
-//                 (dtype('float64'), (4, 5))))
-//
-// describes a tuple shape with two subshapes: the first a 2x3 F32,
-// and the other a 4x5 F64.
-//
-// The Python int corresponding to a PrimitiveType enum must be valid
-// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
-//
-// The SWIG object wrappers generated by this file are not intended
-// for end use, but rather for internal use in the Python XLA client,
-// xla_client.py.
-//
-// One central reason for the Python-side indirection is that the
-// Python-side objects produced by the typemaps in this file are
-// further packaged up by xla_client before being passed on. For
-// instance, the Python pair produced for a C++ Shape is further
-// wrapped in a Python class (xla_client.Shape) so as not to expose
-// the raw pair externally.
-//
-// Other SWIG object wrappers (e.g. of LocalComputation) are further
-// wrapped by xla_client in order to set up a custom destructor that
-// triggers memory deallocation on the C++ side.
 
 %module(threads="1") local_computation_builder
 
@@ -106,6 +23,7 @@ limitations under the License.
 %nothread;
 
 %include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
 
 %{
 // Must be included first
@@ -123,87 +41,6 @@ limitations under the License.
 using namespace xla;
 using namespace xla::swig;
 
-namespace xla {
-
-namespace swig {
-
-bool GetIntAttr(PyObject* o, const char* field, int64* result) {
-  PyObject* fo = PyObject_GetAttrString(o, field);
-  if (!fo) {
-    return false;
-  }
-  const int64 value = numpy::PyIntOrPyLongToLong(fo);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(fo);
-    return false;
-  }
-  Py_DECREF(fo);
-  *result = value;
-  return true;
-}
-
-// Returns "ok"; true if there is no error, false if there was an error.
-bool HandleStringAttribute(PyObject* o,
-                           const char* attr_name,
-                           std::function<void(string s)> f) {
-  if (!PyObject_HasAttrString(o, attr_name)) {
-    return true;  // It's ok for the object to not have the attribute.
-  }
-  PyObject* attr = PyObject_GetAttrString(o, attr_name);
-  if (attr == nullptr) {
-    return false;  // An error occurred getting the attribute.
-  }
-  if (attr == Py_None) {
-    Py_DECREF(attr);
-    return true;  // The attribute is None, which we consider ok.
-  }
-  if (!PyString_Check(attr)) {
-    string message = absl::StrFormat("%s must be a string or none; got %s",
-        attr_name, numpy::PyObjectCppRepr(attr));
-    PyErr_SetString(PyExc_TypeError, message.c_str());
-    Py_DECREF(attr);
-    return false;  // Type error, not ok.
-  }
-  f(PyString_AsString(attr));
-  Py_DECREF(attr);
-  return true;  // Handled string attribute, ok!
-}
-
-bool HandleRepeatedInt64Attribute(
-    PyObject* o, const char* attr_name,
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
-  PyObject* seq = PyObject_GetAttrString(o, attr_name);
-  if (!seq) {
-    return false;
-  }
-
-  int length = PySequence_Size(seq);
-  if (length == -1) {
-    Py_DECREF(seq);
-    return false;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(seq, i);
-    if (!item) {
-      Py_DECREF(seq);
-      return false;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(seq);
-      return false;
-    }
-    *field->Add() = dimension;
-    Py_DECREF(item);
-  }
-  Py_DECREF(seq);
-  return true;
-}
-
-}  // namespace swig
-}  // namespace xla
 %}
 
 // Required to use PyArray_* functions.
@@ -211,57 +48,6 @@ bool HandleRepeatedInt64Attribute(
 tensorflow::ImportNumpy();
 %}
 
-// Basic types
-
-%typemap(out) StatusOr<bool> {
-  if ($1.ok()) {
-    $result = PyBool_FromLong($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) Status {
-  if (!$1.ok()) {
-    PyErr_SetString(
-        PyExc_RuntimeError, $1.ToString().c_str());
-    SWIG_fail;
-  }
-  Py_INCREF(Py_None);
-  $result = Py_None;
-}
-
-%typemap(in) absl::Span<const int64>
-    (std::vector<int64> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.resize(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    PyObject* py_int = numpy::PyNumberToPyInt(o);
-    if (!py_int) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Argument sequence element cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
-    if (temps[i] == -1 && PyErr_Occurred()) {
-      Py_DECREF(py_int);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    Py_DECREF(py_int);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
 // Computation builder types
 
 %typemap(in) absl::Span<const xla::swig::LocalOp>(
@@ -286,12 +72,12 @@ tensorflow::ImportNumpy();
 
 // Computation and buffer/allocation types
 
-%typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalClient> {
   if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
+    xla::swig::LocalClient value = $1.ValueOrDie();
     {
-      auto* $1 = value;
-      $typemap(out, xla::swig::CompiledLocalComputation*)
+      auto $1 = value;
+      $typemap(out, xla::swig::LocalClient)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -299,12 +85,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::CompiledXrtComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalExecutable*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::CompiledXrtComputation*)
+      $typemap(out, xla::swig::LocalExecutable*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -338,38 +124,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
+%typemap(out) StatusOr<xla::swig::Computation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocation*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocationTuple*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::LocalComputation*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::LocalComputation*)
+      $typemap(out, xla::swig::Computation*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -429,448 +189,6 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
-%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
-    (std::vector<XrtAllocation*> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    XrtAllocation* xrta;
-    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
-                         SWIG_POINTER_EXCEPTION)) == -1) {
-      SWIG_fail;
-    }
-    temps.push_back(xrta);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// Literal
-
-%typemap(out) StatusOr<Literal> {
-  if ($1.ok()) {
-    Literal value = $1.ConsumeValueOrDie();
-    $result = numpy::PyObjectFromXlaLiteral(*value);
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
-  literal_status = numpy::XlaLiteralFromPyObject($input);
-  if (!literal_status.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $1 = &literal_status.ValueOrDie();
-}
-
-%typemap(out) Literal {
-  $result = numpy::PyObjectFromXlaLiteral(*$1);
-}
-
-%typemap(out) StatusOr<Literal> {
-  if (!$1.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $result = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
-}
-
-%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
-    if (!literal_status.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps.push_back(literal_status.ConsumeValueOrDie());
-    Py_DECREF(o);
-  }
-  $1 = &temps;
-}
-
-// OpMetadata
-
-%typemap(in) const OpMetadata& (OpMetadata temp) {
-  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-// Shape
-
-%typemap(out) const Shape& {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(out) StatusOr<Shape> {
-  if ($1.ok()) {
-    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Shape& (Shape temp) {
-  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-%typemap(in) const absl::optional<Shape>& (
-    absl::optional<Shape> temp) {
-  if ($input == Py_None) {
-    temp = absl::nullopt;
-    $1 = &temp;
-  } else {
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temp = std::move(statusor).ValueOrDie();
-    $1 = &temp;
-  }
-}
-
-%typemap(out) std::unique_ptr<Shape> {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-    Py_DECREF(o);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temps.push_back(statusor.ConsumeValueOrDie());
-  }
-  $1 = &temps;
-}
-
-%typemap(in) const std::vector<absl::optional<Shape> >& (
-    std::vector<absl::optional<Shape> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (o == Py_None) {
-      temps.push_back(absl::nullopt);
-    } else {
-      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-      Py_DECREF(o);
-      if (!statusor.ok()) {
-        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        SWIG_fail;
-      }
-      temps.push_back(statusor.ConsumeValueOrDie());
-    }
-  }
-  $1 = &temps;
-}
-
-// PrimitiveType
-
-%typemap(in) PrimitiveType {
-  PyObject* py_int = numpy::PyNumberToPyInt($input);
-  if (!py_int) {
-    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    SWIG_fail;
-  }
-  const long value = numpy::PyIntOrPyLongToLong(py_int);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  if (!PrimitiveType_IsValid(value)) {
-    PyErr_SetString(
-        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  $1 = static_cast<PrimitiveType>(value);
-}
-
-// Span<pair<int64, in64>>
-
-%typemap(in) absl::Span<const std::pair<int64, int64> >
-    (std::vector<std::pair<int64, int64> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (!o) {
-      SWIG_fail;
-    }
-    PyObject* first = PyTuple_GetItem(o, 0);
-    if (!first) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
-    if (!first_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "First pair item cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* second = PyTuple_GetItem(o, 1);
-    if (!second) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
-    if (!second_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Second pair item cannot be converted to int");
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
-    if (first_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
-    if (second_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    temps.push_back(std::make_pair(first_value, second_value));
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// DotDimensionNumbers
-
-%typemap(in) const DotDimensionNumbers&
-    (DotDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "lhs_contracting_dimensions",
-        dimension_numbers.mutable_lhs_contracting_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "rhs_contracting_dimensions",
-        dimension_numbers.mutable_rhs_contracting_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "lhs_batch_dimensions",
-        dimension_numbers.mutable_lhs_batch_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "rhs_batch_dimensions",
-        dimension_numbers.mutable_rhs_batch_dimensions())) {
-    SWIG_fail;
-  }
-
-  $1 = &dimension_numbers;
-}
-
-// PaddingConfig
-
-%typemap(in) const PaddingConfig&
-    (PaddingConfig padding_config) {
-  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
-  if (!dimensions) {
-    SWIG_fail;
-  }
-
-  int length = PySequence_Size(dimensions);
-  if (length == -1) {
-    Py_DECREF(dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(dimensions, i);
-    if (!item) {
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    int64 edge_padding_low, edge_padding_high, interior_padding;
-    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
-        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
-        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
-      Py_DECREF(item);
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    Py_DECREF(item);
-
-    PaddingConfig::PaddingConfigDimension* dimension =
-        padding_config.add_dimensions();
-    dimension->set_edge_padding_low(edge_padding_low);
-    dimension->set_edge_padding_high(edge_padding_high);
-    dimension->set_interior_padding(interior_padding);
-  }
-  Py_DECREF(dimensions);
-
-  $1 = &padding_config;
-}
-
-// ConvolutionDimensionNumbers
-
-%typemap(in) const ConvolutionDimensionNumbers&
-    (ConvolutionDimensionNumbers dimension_numbers) {
-  int64 value;
-
-  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_batch_dimension(value);
-
-  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_feature_dimension(value);
-
-  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_batch_dimension(value);
-
-  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_input_feature_dimension(value);
-
-  if (!HandleRepeatedInt64Attribute(
-        $input, "input_spatial_dimensions",
-        dimension_numbers.mutable_input_spatial_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "kernel_spatial_dimensions",
-        dimension_numbers.mutable_kernel_spatial_dimensions())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "output_spatial_dimensions",
-        dimension_numbers.mutable_output_spatial_dimensions())) {
-    SWIG_fail;
-  }
-
-  $1 = &dimension_numbers;
-}
-
-// GatherDimensionNumbers
-
-%typemap(in) const GatherDimensionNumbers&
-    (GatherDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "offset_dims",
-        dimension_numbers.mutable_offset_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "collapsed_slice_dims",
-        dimension_numbers.mutable_collapsed_slice_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "start_index_map",
-        dimension_numbers.mutable_start_index_map())) {
-    SWIG_fail;
-  }
-
-  int64 value;
-  if (!GetIntAttr($input, "index_vector_dim", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_index_vector_dim(value);
-
-  $1 = &dimension_numbers;
-}
-
-// ScatterDimensionNumbers
-
-%typemap(in) const ScatterDimensionNumbers&
-    (ScatterDimensionNumbers dimension_numbers) {
-  if (!HandleRepeatedInt64Attribute(
-        $input, "update_window_dims",
-        dimension_numbers.mutable_update_window_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "inserted_window_dims",
-        dimension_numbers.mutable_inserted_window_dims())) {
-    SWIG_fail;
-  }
-  if (!HandleRepeatedInt64Attribute(
-        $input, "scatter_dims_to_operand_dims",
-        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
-    SWIG_fail;
-  }
-
-  int64 value;
-  if (!GetIntAttr($input, "index_vector_dim", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_index_vector_dim(value);
-
-  $1 = &dimension_numbers;
-}
-
 // ExecutableBuildOptions
 
 %typemap(in) const ExecutableBuildOptions*
@@ -878,41 +196,38 @@ tensorflow::ImportNumpy();
   if ($input == Py_None) {
     $1 = NULL;
   } else {
-    if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_generate_hlo_graph(std::move(s));
+    if (!HandleStringAttribute($input, "dump_to", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_to(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_optimized_hlo_proto_to(std::move(s));
+    if (!HandleStringAttribute($input, "dump_hlo_pass_re", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_pass_re(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_unoptimized_hlo_proto_to(std::move(s));
+    if (!HandleStringAttribute($input, "dump_hlo_module_re", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_module_re(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_per_pass_hlo_proto_to(std::move(s));
+    if (!HandleBoolAttribute($input, "dump_hlo_as_text", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_as_text(b);
     })) {
       return nullptr;
     }
-
-    PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
-    if (o == NULL) {
-      SWIG_fail;
+    if (!HandleBoolAttribute($input, "dump_hlo_as_proto", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_as_proto(b);
+    })) {
+      return nullptr;
     }
-    if (o != Py_None) {
-      if (!PyBool_Check(o)) {
-        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
-        SWIG_fail;
-      }
-      build_options.mutable_debug_options()->set_xla_hlo_profile(o == Py_True);
+    if (!HandleBoolAttribute($input, "hlo_profile", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_hlo_profile(b);
+    })) {
+      return nullptr;
     }
-    Py_DECREF(o);
 
-    o = PyObject_GetAttrString($input, "result_shape");
+    PyObject* o = PyObject_GetAttrString($input, "result_shape");
     if (o == nullptr) {
       return nullptr;
     }
@@ -927,6 +242,12 @@ tensorflow::ImportNumpy();
     }
     Py_DECREF(o);
 
+    int64 num_replicas;
+    if (!GetIntAttr($input, "num_replicas", &num_replicas)) {
+      SWIG_fail;
+    }
+    build_options.set_num_replicas(num_replicas);
+
     $1 = &build_options;
   }
 }
@@ -934,159 +255,154 @@ tensorflow::ImportNumpy();
 %ignoreall
 %unignore xla;
 %unignore xla::swig;
-%unignore xla::swig::InitializeReplicaCount;
-%unignore xla::swig::InitializePlatformName;
-%unignore xla::swig::GetReplicaCount;
 %unignore xla::swig::RegisterCpuCustomCallTarget;
-%unignore xla::swig::TransferToInfeedLocal;
-%unignore xla::swig::TransferToInfeedLocalReplica;
-%unignore xla::swig::TransferFromOutfeedLocalReplica;
+%unignore xla::swig::LocalClient;
+%unignore xla::swig::LocalClient::Get;
+%unignore xla::swig::LocalClient::DeviceCount;
+%unignore xla::swig::LocalClient::TransferToInfeed;
+%unignore xla::swig::LocalClient::TransferFromOutfeed;
 %unignore xla::swig::LocalShapedBuffer;
 %unignore xla::swig::LocalShapedBuffer::FromLiteral;
 %unignore xla::swig::LocalShapedBuffer::ToLiteral;
 %unignore xla::swig::LocalShapedBuffer::shape;
+%unignore xla::swig::LocalShapedBuffer::DestructureTuple;
 %unignore xla::swig::LocalShapedBufferTuple;
 %unignore xla::swig::LocalShapedBufferTuple::Release;
 %unignore xla::swig::LocalShapedBufferTuple::size;
-%unignore xla::swig::XrtAllocation;
-%unignore xla::swig::XrtAllocation::FromLiteral;
-%unignore xla::swig::XrtAllocation::ToLiteral;
-%unignore xla::swig::XrtAllocation::shape;
-%unignore xla::swig::XrtAllocationTuple;
-%unignore xla::swig::XrtAllocationTuple::Release;
-%unignore xla::swig::XrtAllocationTuple::size;
-%unignore xla::swig::CompiledLocalComputation;
-%unignore xla::swig::CompiledLocalComputation::Execute;
-%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
-%unignore xla::swig::CompiledXrtComputation;
-%unignore xla::swig::CompiledXrtComputation::Execute;
-%unignore xla::swig::LocalComputation;
-%unignore xla::swig::LocalComputation::Compile;
-%unignore xla::swig::LocalComputation::CompileForXrt;
-%unignore xla::swig::LocalComputation::GetReturnValueShape;
-%unignore xla::swig::LocalComputation::GetSerializedProto;
+%unignore xla::swig::LocalExecutable;
+%unignore xla::swig::LocalExecutable::DeviceOrdinals;
+%unignore xla::swig::LocalExecutable::Execute;
+%unignore xla::swig::LocalExecutable::ExecutePerReplica;
+%unignore xla::swig::Computation;
+%unignore xla::swig::Computation::Compile;
+%unignore xla::swig::Computation::GetProgramShape;
+%unignore xla::swig::Computation::GetReturnValueShape;
+%unignore xla::swig::Computation::GetSerializedProto;
+%unignore xla::swig::Computation::GetHloText;
+%unignore xla::swig::Computation::GetHloDotGraph;
 %unignore xla::swig::LocalOp;
-%unignore xla::swig::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::Build;
-%unignore xla::swig::LocalComputationBuilder::BuildWithRoot;
-%unignore xla::swig::LocalComputationBuilder::SetOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::ClearOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::Parameter;
-%unignore xla::swig::LocalComputationBuilder::GetShape;
-%unignore xla::swig::LocalComputationBuilder::GetReturnValueShape;
-%unignore xla::swig::LocalComputationBuilder::Infeed;
-%unignore xla::swig::LocalComputationBuilder::Outfeed;
-%unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
-%unignore xla::swig::LocalComputationBuilder::ConstantR0;
-%unignore xla::swig::LocalComputationBuilder::Iota;
-%unignore xla::swig::LocalComputationBuilder::BroadcastedIota;
-%unignore xla::swig::LocalComputationBuilder::Broadcast;
-%unignore xla::swig::LocalComputationBuilder::BroadcastInDim;
-%unignore xla::swig::LocalComputationBuilder::Pad;
-%unignore xla::swig::LocalComputationBuilder::Reshape;
-%unignore xla::swig::LocalComputationBuilder::Collapse;
-%unignore xla::swig::LocalComputationBuilder::CrossReplicaSum;
-%unignore xla::swig::LocalComputationBuilder::Slice;
-%unignore xla::swig::LocalComputationBuilder::SliceInDim;
-%unignore xla::swig::LocalComputationBuilder::DynamicSlice;
-%unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
-%unignore xla::swig::LocalComputationBuilder::ConcatInDim;
-%unignore xla::swig::LocalComputationBuilder::SelectAndScatterWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::Select;
-%unignore xla::swig::LocalComputationBuilder::Tuple;
-%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
-%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
-%unignore xla::swig::LocalComputationBuilder::BitcastConvertType;
-%unignore xla::swig::LocalComputationBuilder::Call;
-%unignore xla::swig::LocalComputationBuilder::Transpose;
-%unignore xla::swig::LocalComputationBuilder::Rev;
-%unignore xla::swig::LocalComputationBuilder::Clamp;
-%unignore xla::swig::LocalComputationBuilder::Map;
-%unignore xla::swig::LocalComputationBuilder::Reduce;
-%unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::RngNormal;
-%unignore xla::swig::LocalComputationBuilder::RngUniform;
-%unignore xla::swig::LocalComputationBuilder::RngBernoulli;
-%unignore xla::swig::LocalComputationBuilder::While;
-%unignore xla::swig::LocalComputationBuilder::Conditional;
-%unignore xla::swig::LocalComputationBuilder::IsConstant;
-%unignore xla::swig::LocalComputationBuilder::Eq;
-%unignore xla::swig::LocalComputationBuilder::Ne;
-%unignore xla::swig::LocalComputationBuilder::Ge;
-%unignore xla::swig::LocalComputationBuilder::Gt;
-%unignore xla::swig::LocalComputationBuilder::Lt;
-%unignore xla::swig::LocalComputationBuilder::Le;
-%unignore xla::swig::LocalComputationBuilder::Dot;
-%unignore xla::swig::LocalComputationBuilder::DotGeneral;
-%unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
-%unignore xla::swig::LocalComputationBuilder::Add;
-%unignore xla::swig::LocalComputationBuilder::Sub;
-%unignore xla::swig::LocalComputationBuilder::Mul;
-%unignore xla::swig::LocalComputationBuilder::Div;
-%unignore xla::swig::LocalComputationBuilder::Rem;
-%unignore xla::swig::LocalComputationBuilder::Max;
-%unignore xla::swig::LocalComputationBuilder::Min;
-%unignore xla::swig::LocalComputationBuilder::And;
-%unignore xla::swig::LocalComputationBuilder::Or;
-%unignore xla::swig::LocalComputationBuilder::Xor;
-%unignore xla::swig::LocalComputationBuilder::ShiftLeft;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightArithmetic;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightLogical;
-%unignore xla::swig::LocalComputationBuilder::Not;
-%unignore xla::swig::LocalComputationBuilder::Abs;
-%unignore xla::swig::LocalComputationBuilder::Exp;
-%unignore xla::swig::LocalComputationBuilder::Expm1;
-%unignore xla::swig::LocalComputationBuilder::Floor;
-%unignore xla::swig::LocalComputationBuilder::Ceil;
-%unignore xla::swig::LocalComputationBuilder::Round;
-%unignore xla::swig::LocalComputationBuilder::Log;
-%unignore xla::swig::LocalComputationBuilder::Log1p;
-%unignore xla::swig::LocalComputationBuilder::Sign;
-%unignore xla::swig::LocalComputationBuilder::Cos;
-%unignore xla::swig::LocalComputationBuilder::Sin;
-%unignore xla::swig::LocalComputationBuilder::Tanh;
-%unignore xla::swig::LocalComputationBuilder::Atan2;
-%unignore xla::swig::LocalComputationBuilder::IsFinite;
-%unignore xla::swig::LocalComputationBuilder::Pow;
-%unignore xla::swig::LocalComputationBuilder::Neg;
-%unignore xla::swig::LocalComputationBuilder::Sort;
-%unignore xla::swig::LocalComputationBuilder::SortKeyVal;
-%unignore xla::swig::LocalComputationBuilder::Sqrt;
-%unignore xla::swig::LocalComputationBuilder::Rsqrt;
-%unignore xla::swig::LocalComputationBuilder::Square;
-%unignore xla::swig::LocalComputationBuilder::Reciprocal;
-%unignore xla::swig::LocalComputationBuilder::Erfc;
-%unignore xla::swig::LocalComputationBuilder::Erf;
-%unignore xla::swig::LocalComputationBuilder::ErfInv;
-%unignore xla::swig::LocalComputationBuilder::Lgamma;
-%unignore xla::swig::LocalComputationBuilder::Digamma;
-%unignore xla::swig::LocalComputationBuilder::Acos;
-%unignore xla::swig::LocalComputationBuilder::Asin;
-%unignore xla::swig::LocalComputationBuilder::Atan;
-%unignore xla::swig::LocalComputationBuilder::Tan;
-%unignore xla::swig::LocalComputationBuilder::Acosh;
-%unignore xla::swig::LocalComputationBuilder::Asinh;
-%unignore xla::swig::LocalComputationBuilder::Atanh;
-%unignore xla::swig::LocalComputationBuilder::Cosh;
-%unignore xla::swig::LocalComputationBuilder::Sinh;
-%unignore xla::swig::LocalComputationBuilder::Real;
-%unignore xla::swig::LocalComputationBuilder::Imag;
-%unignore xla::swig::LocalComputationBuilder::Conj;
-%unignore xla::swig::LocalComputationBuilder::Complex;
-%unignore xla::swig::LocalComputationBuilder::Cholesky;
-%unignore xla::swig::LocalComputationBuilder::QR;
-%unignore xla::swig::LocalComputationBuilder::TriangularSolve;
-%unignore xla::swig::LocalComputationBuilder::CustomCall;
-%unignore xla::swig::LocalComputationBuilder::Gather;
-%unignore xla::swig::LocalComputationBuilder::Scatter;
-%unignore xla::swig::DeleteLocalComputation;
-%unignore xla::swig::DestructureLocalShapedBufferTuple;
-%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::Build;
+%unignore xla::swig::ComputationBuilder::BuildWithRoot;
+%unignore xla::swig::ComputationBuilder::SetOpMetadata;
+%unignore xla::swig::ComputationBuilder::ClearOpMetadata;
+%unignore xla::swig::ComputationBuilder::Parameter;
+%unignore xla::swig::ComputationBuilder::GetShape;
+%unignore xla::swig::ComputationBuilder::GetReturnValueShape;
+%unignore xla::swig::ComputationBuilder::ReplicaId;
+%unignore xla::swig::ComputationBuilder::Infeed;
+%unignore xla::swig::ComputationBuilder::Outfeed;
+%unignore xla::swig::ComputationBuilder::ConstantLiteral;
+%unignore xla::swig::ComputationBuilder::ConstantR0;
+%unignore xla::swig::ComputationBuilder::Iota;
+%unignore xla::swig::ComputationBuilder::BroadcastedIota;
+%unignore xla::swig::ComputationBuilder::Broadcast;
+%unignore xla::swig::ComputationBuilder::BroadcastInDim;
+%unignore xla::swig::ComputationBuilder::Pad;
+%unignore xla::swig::ComputationBuilder::Reshape;
+%unignore xla::swig::ComputationBuilder::Collapse;
+%unignore xla::swig::ComputationBuilder::AllToAll;
+%unignore xla::swig::ComputationBuilder::CrossReplicaSum;
+%unignore xla::swig::ComputationBuilder::Slice;
+%unignore xla::swig::ComputationBuilder::SliceInDim;
+%unignore xla::swig::ComputationBuilder::DynamicSlice;
+%unignore xla::swig::ComputationBuilder::DynamicUpdateSlice;
+%unignore xla::swig::ComputationBuilder::ConcatInDim;
+%unignore xla::swig::ComputationBuilder::SelectAndScatterWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::Select;
+%unignore xla::swig::ComputationBuilder::Tuple;
+%unignore xla::swig::ComputationBuilder::GetTupleElement;
+%unignore xla::swig::ComputationBuilder::ConvertElementType;
+%unignore xla::swig::ComputationBuilder::BitcastConvertType;
+%unignore xla::swig::ComputationBuilder::Call;
+%unignore xla::swig::ComputationBuilder::Transpose;
+%unignore xla::swig::ComputationBuilder::Rev;
+%unignore xla::swig::ComputationBuilder::Clamp;
+%unignore xla::swig::ComputationBuilder::Map;
+%unignore xla::swig::ComputationBuilder::Reduce;
+%unignore xla::swig::ComputationBuilder::ReduceWindowWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::RngNormal;
+%unignore xla::swig::ComputationBuilder::RngUniform;
+%unignore xla::swig::ComputationBuilder::RngBernoulli;
+%unignore xla::swig::ComputationBuilder::While;
+%unignore xla::swig::ComputationBuilder::Conditional;
+%unignore xla::swig::ComputationBuilder::IsConstant;
+%unignore xla::swig::ComputationBuilder::Eq;
+%unignore xla::swig::ComputationBuilder::Ne;
+%unignore xla::swig::ComputationBuilder::Ge;
+%unignore xla::swig::ComputationBuilder::Gt;
+%unignore xla::swig::ComputationBuilder::Lt;
+%unignore xla::swig::ComputationBuilder::Le;
+%unignore xla::swig::ComputationBuilder::Dot;
+%unignore xla::swig::ComputationBuilder::DotGeneral;
+%unignore xla::swig::ComputationBuilder::ConvGeneralDilated;
+%unignore xla::swig::ComputationBuilder::Add;
+%unignore xla::swig::ComputationBuilder::Sub;
+%unignore xla::swig::ComputationBuilder::Mul;
+%unignore xla::swig::ComputationBuilder::Div;
+%unignore xla::swig::ComputationBuilder::Rem;
+%unignore xla::swig::ComputationBuilder::Max;
+%unignore xla::swig::ComputationBuilder::Min;
+%unignore xla::swig::ComputationBuilder::And;
+%unignore xla::swig::ComputationBuilder::Or;
+%unignore xla::swig::ComputationBuilder::Xor;
+%unignore xla::swig::ComputationBuilder::ShiftLeft;
+%unignore xla::swig::ComputationBuilder::ShiftRightArithmetic;
+%unignore xla::swig::ComputationBuilder::ShiftRightLogical;
+%unignore xla::swig::ComputationBuilder::Not;
+%unignore xla::swig::ComputationBuilder::Clz;
+%unignore xla::swig::ComputationBuilder::Abs;
+%unignore xla::swig::ComputationBuilder::Exp;
+%unignore xla::swig::ComputationBuilder::Expm1;
+%unignore xla::swig::ComputationBuilder::Floor;
+%unignore xla::swig::ComputationBuilder::Ceil;
+%unignore xla::swig::ComputationBuilder::Round;
+%unignore xla::swig::ComputationBuilder::Log;
+%unignore xla::swig::ComputationBuilder::Log1p;
+%unignore xla::swig::ComputationBuilder::Sign;
+%unignore xla::swig::ComputationBuilder::Cos;
+%unignore xla::swig::ComputationBuilder::Sin;
+%unignore xla::swig::ComputationBuilder::Tanh;
+%unignore xla::swig::ComputationBuilder::Atan2;
+%unignore xla::swig::ComputationBuilder::IsFinite;
+%unignore xla::swig::ComputationBuilder::Pow;
+%unignore xla::swig::ComputationBuilder::Neg;
+%unignore xla::swig::ComputationBuilder::Sort;
+%unignore xla::swig::ComputationBuilder::SortKeyVal;
+%unignore xla::swig::ComputationBuilder::Sqrt;
+%unignore xla::swig::ComputationBuilder::Rsqrt;
+%unignore xla::swig::ComputationBuilder::Square;
+%unignore xla::swig::ComputationBuilder::Reciprocal;
+%unignore xla::swig::ComputationBuilder::Erfc;
+%unignore xla::swig::ComputationBuilder::Erf;
+%unignore xla::swig::ComputationBuilder::ErfInv;
+%unignore xla::swig::ComputationBuilder::Lgamma;
+%unignore xla::swig::ComputationBuilder::Digamma;
+%unignore xla::swig::ComputationBuilder::Acos;
+%unignore xla::swig::ComputationBuilder::Asin;
+%unignore xla::swig::ComputationBuilder::Atan;
+%unignore xla::swig::ComputationBuilder::Tan;
+%unignore xla::swig::ComputationBuilder::Acosh;
+%unignore xla::swig::ComputationBuilder::Asinh;
+%unignore xla::swig::ComputationBuilder::Atanh;
+%unignore xla::swig::ComputationBuilder::Cosh;
+%unignore xla::swig::ComputationBuilder::Sinh;
+%unignore xla::swig::ComputationBuilder::Real;
+%unignore xla::swig::ComputationBuilder::Imag;
+%unignore xla::swig::ComputationBuilder::Conj;
+%unignore xla::swig::ComputationBuilder::Complex;
+%unignore xla::swig::ComputationBuilder::Cholesky;
+%unignore xla::swig::ComputationBuilder::QR;
+%unignore xla::swig::ComputationBuilder::Eigh;
+%unignore xla::swig::ComputationBuilder::SVD;
+%unignore xla::swig::ComputationBuilder::TriangularSolve;
+%unignore xla::swig::ComputationBuilder::CustomCall;
+%unignore xla::swig::ComputationBuilder::Gather;
+%unignore xla::swig::ComputationBuilder::Scatter;
+%unignore xla::swig::DeleteComputation;
 %unignore xla::swig::DeleteLocalShapedBuffer;
-%unignore xla::swig::DeleteXrtAllocation;
-%unignore xla::swig::DeleteCompiledLocalComputation;
-%unignore xla::swig::DeleteCompiledXrtComputation;
+%unignore xla::swig::DeleteLocalExecutable;
 
 %thread;
 %include "tensorflow/compiler/xla/python/local_computation_builder.h"
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 52c5c621f7294c5da341879d15b77559fe870551..de7b1e48a8dbb8f2cdb1709258a218f4a07e7688 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -26,6 +26,10 @@ namespace swig {
 
 namespace numpy {
 
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
+}
+
 int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case PRED:
@@ -123,28 +127,42 @@ bool NumpyTypeIsValid(int np_type) {
   }
 }
 
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape) {
   int np_typenum = PrimitiveTypeToNumpyType(shape.element_type());
   PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
 
-  PyObject* dimensions;
+  Safe_PyObjectPtr dimensions;
   if (shape.IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(shape);
-    dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
+    dimensions = make_safe(PyTuple_New(ShapeUtil::TupleElementCount(shape)));
     for (int i = 0; i < num_elements; ++i) {
       PyTuple_SET_ITEM(
-          dimensions, i,
-          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
+          dimensions.get(), i,
+          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i))
+              .release());
     }
   } else {
     int rank = shape.rank();
-    dimensions = PyTuple_New(rank);
+    dimensions = make_safe(PyTuple_New(rank));
     for (int i = 0; i < rank; ++i) {
-      PyTuple_SET_ITEM(dimensions, i,
+      PyTuple_SET_ITEM(dimensions.get(), i,
                        LongToPyIntOrPyLong(ShapeUtil::GetDimension(shape, i)));
     }
   }
-  return PyTuple_Pack(2, np_dtype, dimensions);
+  return make_safe(PyTuple_Pack(2, np_dtype, dimensions.release()));
+}
+
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape) {
+  Safe_PyObjectPtr arg_shapes = make_safe(PyTuple_New(shape.parameters_size()));
+  for (int i = 0; i < shape.parameters_size(); ++i) {
+    PyTuple_SET_ITEM(arg_shapes.get(), i,
+                     PyShapeInfoFromXlaShape(shape.parameters(i)).release());
+  }
+
+  Safe_PyObjectPtr result_shape = PyShapeInfoFromXlaShape(shape.result());
+  return make_safe(
+      PyTuple_Pack(2, arg_shapes.release(), result_shape.release()));
 }
 
 // Precondition: o->ob_type == &PyArrayDescr_Type
@@ -349,13 +367,17 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
   return result;
 }
 
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal) {
   if (literal.shape().IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(literal.shape());
-    PyObject* tuple = PyTuple_New(num_elements);
+    std::vector<Safe_PyObjectPtr> elems(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      TF_ASSIGN_OR_RETURN(elems[i],
+                          PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+    }
+    Safe_PyObjectPtr tuple = make_safe(PyTuple_New(num_elements));
     for (int i = 0; i < num_elements; i++) {
-      PyTuple_SET_ITEM(tuple, i,
-                       PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+      PyTuple_SET_ITEM(tuple.get(), i, elems[i].release());
     }
     return tuple;
   } else {
@@ -365,10 +387,10 @@ PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
       dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
     }
     int np_type = PrimitiveTypeToNumpyType(literal.shape().element_type());
-    PyObject* array =
-        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0);
-    CopyLiteralToNumpyArray(np_type, literal,
-                            reinterpret_cast<PyArrayObject*>(array));
+    Safe_PyObjectPtr array = make_safe(
+        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0));
+    TF_RETURN_IF_ERROR(CopyLiteralToNumpyArray(
+        np_type, literal, reinterpret_cast<PyArrayObject*>(array.get())));
     return array;
   }
 }
@@ -408,6 +430,12 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_BOOL:
       CopyNumpyArrayToLiteral<bool>(py_array, literal);
       break;
+    case NPY_INT8:
+      CopyNumpyArrayToLiteral<int8>(py_array, literal);
+      break;
+    case NPY_INT16:
+      CopyNumpyArrayToLiteral<int16>(py_array, literal);
+      break;
     case NPY_INT32:
       CopyNumpyArrayToLiteral<int32>(py_array, literal);
       break;
@@ -417,6 +445,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_UINT8:
       CopyNumpyArrayToLiteral<uint8>(py_array, literal);
       break;
+    case NPY_UINT16:
+      CopyNumpyArrayToLiteral<uint16>(py_array, literal);
+      break;
     case NPY_UINT32:
       CopyNumpyArrayToLiteral<uint32>(py_array, literal);
       break;
@@ -445,12 +476,18 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
   return Status::OK();
 }
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array) {
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array) {
   switch (np_type) {
     case NPY_BOOL:
       CopyLiteralToNumpyArray<bool>(literal, py_array);
       break;
+    case NPY_INT8:
+      CopyLiteralToNumpyArray<int8>(literal, py_array);
+      break;
+    case NPY_INT16:
+      CopyLiteralToNumpyArray<int16>(literal, py_array);
+      break;
     case NPY_INT32:
       CopyLiteralToNumpyArray<int32>(literal, py_array);
       break;
@@ -460,6 +497,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
     case NPY_UINT8:
       CopyLiteralToNumpyArray<uint8>(literal, py_array);
       break;
+    case NPY_UINT16:
+      CopyLiteralToNumpyArray<uint16>(literal, py_array);
+      break;
     case NPY_UINT32:
       CopyLiteralToNumpyArray<uint32>(literal, py_array);
       break;
@@ -482,8 +522,10 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
       CopyLiteralToNumpyArray<complex128>(literal, py_array);
       break;
     default:
-      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+      return InvalidArgument(
+          "No XLA literal container for Numpy type number: %d", np_type);
   }
+  return Status::OK();
 }
 
 PyObject* LongToPyIntOrPyLong(long x) {  // NOLINT
@@ -525,6 +567,118 @@ PyObject* PyNumberToPyInt(PyObject* o) {
 
 }  // namespace numpy
 
+bool GetIntAttr(PyObject* o, const char* field, int64* result) {
+  PyObject* fo = PyObject_GetAttrString(o, field);
+  if (!fo) {
+    return false;
+  }
+  const int64 value = numpy::PyIntOrPyLongToLong(fo);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(fo);
+    return false;
+  }
+  Py_DECREF(fo);
+  *result = value;
+  return true;
+}
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+#if PY_MAJOR_VERSION < 3
+  if (!PyString_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyString_AsString(attr));
+#else
+  if (!PyBytes_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyBytes_AsString(attr));
+#endif
+
+  Py_DECREF(attr);
+  return true;  // Handled string attribute, ok!
+}
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleBoolAttribute(PyObject* o, const char* attr_name,
+                         std::function<void(bool b)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+  if (!PyBool_Check(attr)) {
+    string message = absl::StrFormat("%s must be a boolean or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyObject_IsTrue(attr));
+  Py_DECREF(attr);
+  return true;  // Handled boolean attribute, ok!
+}
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
+  PyObject* seq = PyObject_GetAttrString(o, attr_name);
+  if (!seq) {
+    return false;
+  }
+
+  int length = PySequence_Size(seq);
+  if (length == -1) {
+    Py_DECREF(seq);
+    return false;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(seq, i);
+    if (!item) {
+      Py_DECREF(seq);
+      return false;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(seq);
+      return false;
+    }
+    *field->Add() = dimension;
+    Py_DECREF(item);
+  }
+  Py_DECREF(seq);
+  return true;
+}
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 40ff2d9ad214cc4dcad42234fa296834cbc92882..d7a611d7fd5c708b313db04cce8e05f1a72c5e47 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -36,6 +36,16 @@ namespace swig {
 
 namespace numpy {
 
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+// Safe container for an owned PyObject. On destruction, the reference count of
+// the contained object will be decremented.
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, PyDecrefDeleter>;
+
+Safe_PyObjectPtr make_safe(PyObject* object);
+
 // Maps XLA primitive types (PRED, S8, F32, ..., and TUPLE) to numpy
 // dtypes (NPY_BOOL, NPY_INT8, NPY_FLOAT32, ..., and NPY_OBJECT), and
 // vice versa.
@@ -54,7 +64,13 @@ bool NumpyTypeIsValid(int np_type);
 // providing the array dimensions.
 //
 // The return value is a new reference.
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape);
+
+// Returns a pair of (arg_shapes, result_shape), where arg_shapes is a tuple
+// of argument shapes and result_shape is the result shape. Each shape is as
+// described in in PyShapeInfoFromXlaShape's comment.
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape);
 
 // Converts a Python object with a method interface mathing that of
 // xla_client.Shape into an XLA Shape object.
@@ -74,7 +90,7 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o);
 // array data.
 //
 // The return value is a new reference.
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal);
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal);
 
 // Converts a Numpy ndarray or a nested Python tuple thereof to a
 // corresponding XLA literal.
@@ -90,8 +106,8 @@ StatusOr<Literal> XlaLiteralFromPyObject(PyObject* o);
 Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
                                Literal* literal);
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array);
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array);
 
 template <typename NativeT>
 void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
@@ -120,6 +136,20 @@ PyObject* PyNumberToPyInt(PyObject* o);
 
 }  // namespace numpy
 
+// Miscellaneous swig helpers that don't have a better home.
+
+bool GetIntAttr(PyObject* o, const char* field, int64* result);
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f);
+bool HandleBoolAttribute(PyObject* o, const char* attr_name,
+                         std::function<void(bool b)> f);
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field);
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..ef77ed3d95850fdfc7145e6fe1df4833d20bb7df
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
@@ -0,0 +1,2 @@
+_PyInit__pywrap_xla
+_init_pywrap_xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..d31cfce7be7b6accf05ef77f3485904099965afc
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
@@ -0,0 +1,6 @@
+xla {
+  global:
+    PyInit_*;
+  local:
+    *;
+};
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 1684cb20e6d2de8bf4a2545092f82cc42394790a..cb7d19d39b1d50a5bec564b59779c6f93018ed81 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An in-process, local XLA client in Python, supporting AOT compilation."""
+"""An XLA client in Python, supporting AOT compilation."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 import enum  # pylint: disable=g-bad-import-order
 import inspect
@@ -29,17 +30,39 @@ import numpy as np
 import six
 from six.moves import xrange
 
-from tensorflow.compiler.xla import xla_data_pb2
+# Note this module does *not* depend on any Python protocol buffers. The XLA
+# Python bindings are currently packaged both as part of jaxlib and as part
+# of TensorFlow. If we use protocol buffers here, then importing both jaxlib
+# and TensorFlow may fail with duplicate protocol buffer message definitions.
+
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
-from tensorflow.compiler.xla.service import hlo_pb2
+
+# Import the XRT backend, if available.
+try:
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.compiler.xla.python import pywrap_xrt as xrt_api
+except ImportError:
+  xrt_api = None
 
 
 # Most functions are snake_case for consistency with other modules, whereas
-# method names of ComputationBuilder and LocalComputation are CamelCase for
+# method names of ComputationBuilder and Computation are CamelCase for
 # consistency with XLA.
 # pylint: disable=invalid-name
 
 
+# Version of the XLA Python client.
+#
+# JAX packages the XLA python plugin as a binary pip module (jaxlib) that is
+# packaged separately from the Python code that consumes it (jax).
+#
+# We occasionally need to make backwards-incompatible changes to jaxlib, in
+# which case we need to be able to detect when incompatible versions are
+# installed.
+def version():
+  return (0, 1, 8)
+
+
 _OP_METADATA_FIELDS = [
     'op_type',
     'op_name',
@@ -49,22 +72,163 @@ _OP_METADATA_FIELDS = [
 OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
 
 
+@six.add_metaclass(abc.ABCMeta)
+class Backend(object):
+  """Abstract base class for XLA backends."""
+
+  @abc.abstractmethod
+  def device_count(self):
+    """Returns the number of devices known to the backend."""
+
+  @abc.abstractmethod
+  def buffer_from_pyval(self, pyval, device=0):
+    """Allocates a fresh buffer and populates it with `pyval`."""
+
+  @abc.abstractmethod
+  def delete_buffer(self, c_buffer):
+    """Deletes buffer `c_buffer`."""
+
+  @abc.abstractmethod
+  def destructure_tuple(self, c_buffer):
+    """Destructures a tuple buffer into a sequence of buffers."""
+
+  @abc.abstractmethod
+  def compile(self, computation, argument_shapes, result_shape,
+              compile_options):
+    """Compiles a computation. Returns an executable."""
+
+  @abc.abstractmethod
+  def delete_executable(self, executable):
+    """Deletes an executable."""
+
+  @abc.abstractmethod
+  def execute(self, executable, args):
+    """Runs an executable without replication."""
+
+  @abc.abstractmethod
+  def execute_replicated(self, executable, per_replica_args):
+    """Runs an executable in a replicated manner."""
+
+
+def _maybe_encode_string(s):
+  if six.PY3:
+    return s.encode('utf-8')
+  else:
+    return s
+
+
+class XlaLocalBackend(Backend):
+  """XLA backend implemented using the in-process xla::LocalClient API."""
+
+  def __init__(self, platform=None):
+    platform = platform or _get_default_platform_name()
+    self.client = c_api.LocalClient.Get(_maybe_encode_string(platform))
+    self._delete_buffer = c_api.DeleteLocalShapedBuffer
+    self._delete_executable = c_api.DeleteLocalExecutable
+
+  def device_count(self):
+    return self.client.DeviceCount()
+
+  def buffer_from_pyval(self, pyval, device=0):
+    return c_api.LocalShapedBuffer.FromLiteral(pyval, None, self.client, device)
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = c_buffer.DestructureTuple()
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return c_computation.Compile(argument_shapes, compile_options, self.client)
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    output_buffer_tup = executable.ExecutePerReplica(per_replica_args)
+    size = output_buffer_tup.size()
+    return [output_buffer_tup.Release(i) for i in xrange(size)]
+
+
+class XrtBackend(Backend):
+  """XLA backend implemented using XRT."""
+
+  def __init__(self, target):
+    self.target = target
+    self._delete_buffer = xrt_api.DeleteXrtAllocation
+    self._delete_executable = xrt_api.DeleteXrtExecutable
+
+  def device_count(self):
+    return 1  # Multidevice execution not implemented.
+
+  def buffer_from_pyval(self, pyval, device=0):
+    if device != 0:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return xrt_api.XrtAllocation.FromLiteral(pyval,
+                                             _maybe_encode_string(self.target))
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = xrt_api.DestructureXrtAllocationTuple(
+        c_buffer, _maybe_encode_string(self.target))
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return xrt_api.XrtExecutable.CompileForXrt(
+        c_computation.GetSerializedProto(), argument_shapes, result_shape,
+        _maybe_encode_string(self.target))
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    if len(per_replica_args) != 1:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return [executable.Execute(per_replica_args[0])]
+
+
+_default_platform_name = 'Host'
+_default_backend = None
+
+
+def _get_default_platform_name():
+  return _default_platform_name
+
+
+def _get_default_local_backend():
+  global _default_backend
+  global _default_platform_name
+  if _default_backend is None:
+    _default_backend = XlaLocalBackend(_default_platform_name)
+  return _default_backend
+
+
 class BackendType(enum.Enum):
   XLA_LOCAL = 1
   XRT = 2
 
 
-BackendSpec = collections.namedtuple('Backend', ('backend_type', 'target'))
-XLA_LOCAL_BACKEND = BackendSpec(BackendType.XLA_LOCAL, 'local')
-
-
-def OpMetadataToProto(pyobj):
-  proto = xla_data_pb2.OpMetadata()
-  for field in _OP_METADATA_FIELDS:
-    attr = getattr(pyobj, field)
-    if attr is not None:
-      setattr(proto, field, attr)
-  return proto
+def BackendSpec(backend, target):
+  """Compatibility wrapper to support older clients. Do not use in new code."""
+  if backend == BackendType.XLA_LOCAL:
+    return _get_default_local_backend()
+  elif backend == BackendType.XRT:
+    return XrtBackend(target)
+  else:
+    raise ValueError('Unknown backend {}'.format(backend))
 
 
 def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
@@ -78,13 +242,6 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
       source_line=lineno)
 
 
-def _maybe_encode_string(s):
-  if six.PY3:
-    return s.encode('utf-8')
-  else:
-    return s
-
-
 class PaddingType(enum.Enum):
   VALID = 1
   SAME = 2
@@ -113,8 +270,7 @@ def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
     pad_sizes = [max((out_size - 1) * stride + filter_size - in_size, 0)
                  for out_size, stride, filter_size, in_size
                  in zip(out_shape, window_strides, rhs_dims, lhs_dims)]
-    return [(pad_size // 2, pad_size - pad_size // 2)
-            for pad_size in pad_sizes]
+    return [(pad_size // 2, pad_size - pad_size // 2) for pad_size in pad_sizes]
   else:
     msg = 'Unexpected PaddingType value: {}'
     raise ValueError(msg.format(padding_type))
@@ -122,6 +278,7 @@ def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
 
 _UNARY_OPS = [
     'Not',
+    'Clz',
     'Abs',
     'Exp',
     'Expm1',
@@ -185,29 +342,56 @@ _BINARY_OPS = [
 ]
 
 
+class PrimitiveType(enum.IntEnum):
+  """Python copy of the XLA PrimitiveType enum.
+
+  Must match the corresponding protocol buffer.
+  """
+  PRIMITIVE_TYPE_INVALID = 0
+  PRED = 1
+  S8 = 2
+  S16 = 3
+  S32 = 4
+  S64 = 5
+  U8 = 6
+  U16 = 7
+  U32 = 8
+  U64 = 9
+  BF16 = 16
+  F16 = 10
+  F32 = 11
+  F64 = 12
+  C64 = 15
+  C128 = 18
+  TUPLE = 13
+  OPAQUE = 14
+  TOKEN = 17
+
+
 XLA_ELEMENT_TYPE_TO_DTYPE = {
-    xla_data_pb2.PRED: np.dtype('bool'),
-    xla_data_pb2.S8: np.dtype('int8'),
-    xla_data_pb2.S16: np.dtype('int16'),
-    xla_data_pb2.S32: np.dtype('int32'),
-    xla_data_pb2.S64: np.dtype('int64'),
-    xla_data_pb2.U8: np.dtype('uint8'),
-    xla_data_pb2.U16: np.dtype('uint16'),
-    xla_data_pb2.U32: np.dtype('uint32'),
-    xla_data_pb2.U64: np.dtype('uint64'),
-    xla_data_pb2.F16: np.dtype('float16'),
-    xla_data_pb2.F32: np.dtype('float32'),
-    xla_data_pb2.F64: np.dtype('float64'),
-    xla_data_pb2.C64: np.dtype('complex64'),
-    xla_data_pb2.C128: np.dtype('complex128'),
-    xla_data_pb2.TUPLE: np.dtype(np.object),
+    PrimitiveType.PRED: np.dtype('bool'),
+    PrimitiveType.S8: np.dtype('int8'),
+    PrimitiveType.S16: np.dtype('int16'),
+    PrimitiveType.S32: np.dtype('int32'),
+    PrimitiveType.S64: np.dtype('int64'),
+    PrimitiveType.U8: np.dtype('uint8'),
+    PrimitiveType.U16: np.dtype('uint16'),
+    PrimitiveType.U32: np.dtype('uint32'),
+    PrimitiveType.U64: np.dtype('uint64'),
+    PrimitiveType.F16: np.dtype('float16'),
+    PrimitiveType.F32: np.dtype('float32'),
+    PrimitiveType.F64: np.dtype('float64'),
+    PrimitiveType.C64: np.dtype('complex64'),
+    PrimitiveType.C128: np.dtype('complex128'),
+    PrimitiveType.TUPLE: np.dtype(np.object),
 }
 
 # Note the conversion on the key. Numpy has a known issue wherein dtype hashing
 # doesn't work as expected (https://github.com/numpy/numpy/issues/7242). Thus,
 # when keying by dtype in this dict, we use the string form of dtypes.
-DTYPE_TO_XLA_ELEMENT_TYPE = {str(dt): et
-                             for et, dt in XLA_ELEMENT_TYPE_TO_DTYPE.items()}
+DTYPE_TO_XLA_ELEMENT_TYPE = {
+    str(dt): et for et, dt in XLA_ELEMENT_TYPE_TO_DTYPE.items()
+}
 
 
 def dtype_to_etype(dtype):
@@ -223,33 +407,18 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_buffer, backend, replica):
+  def __init__(self, c_buffer, backend, device):
     self.c_buffer = c_buffer
     self._backend = backend
-    self._replica = replica
-    if backend.backend_type == BackendType.XRT:
-      self._delete = c_api.DeleteXrtAllocation
-    else:
-      self._delete = c_api.DeleteLocalShapedBuffer
+    self._device = device
 
   @staticmethod
-  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
+  def from_pyval(pyval, device=0, backend=None):
     """Allocate and copy to XLA the given python value."""
+    backend = backend or _get_default_local_backend()
     pyval = require_numpy_array_layout(pyval)
-    num_replicas = get_replica_count()
-    if not 0 <= replica < num_replicas:
-      raise ValueError(
-          'Attempt to place buffer on replica {} when the replica count is {}'
-          .format(replica, num_replicas))
-    if backend.backend_type == BackendType.XRT:
-      if replica != 0:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      cbuf = c_api.XrtAllocation.FromLiteral(
-          pyval, _maybe_encode_string(backend.target))
-    else:
-      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
-    return LocalBuffer(cbuf, backend, replica)
+    cbuf = backend.buffer_from_pyval(pyval, device)
+    return LocalBuffer(cbuf, backend, device)
 
   def to_py(self):
     return self.c_buffer.ToLiteral()
@@ -257,29 +426,24 @@ class LocalBuffer(object):
   def shape(self):
     return _wrap_shape(self.c_buffer.shape())
 
-  def replica(self):
-    return self._replica
+  def device(self):
+    return self._device
 
   def delete(self):
     if self.c_buffer is not None:
-      self._delete(self.c_buffer)
+      # Python may have freed c_api first.
+      if c_api:
+        self._backend.delete_buffer(self.c_buffer)
       self.c_buffer = None
 
   def destructure(self):
     """Assuming a tuple buffer, unpack it into constituent tuple elements."""
     assert self.c_buffer is not None
-    if self._backend.backend_type == BackendType.XRT:
-      result = c_api.DestructureXrtAllocationTuple(
-          self.c_buffer, _maybe_encode_string(self._backend.target))
-    else:
-      result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer)
+    result = self._backend.destructure_tuple(self.c_buffer)
     self.delete()
-    size = result.size()
-    destructured = tuple(
-        LocalBuffer(
-            result.Release(i), replica=self._replica, backend=self._backend)
-        for i in xrange(size))
-    return destructured
+    return tuple(
+        LocalBuffer(sub_buffer, device=self._device, backend=self._backend)
+        for sub_buffer in result)
 
   def is_deleted(self):
     return self.c_buffer is None
@@ -288,6 +452,13 @@ class LocalBuffer(object):
     self.delete()
 
 
+class Format(enum.IntEnum):
+  """Python copy of the Format protocol buffer enum."""
+  INVALID_FORMAT = 0
+  DENSE = 1
+  SPARSE = 2
+
+
 class Shape(object):
   """Represents an XLA shape.
 
@@ -317,8 +488,8 @@ class Shape(object):
     if (not isinstance(dimensions, tuple) or
         not all(isinstance(i, int) for i in dimensions)):
       dimensions = tuple(int(i) for i in dimensions)
-    return Shape(dimensions, np.dtype(element_type),
-                 minor_to_major=minor_to_major)
+    return Shape(
+        dimensions, np.dtype(element_type), minor_to_major=minor_to_major)
 
   @staticmethod
   def from_pyval(pyval):
@@ -397,8 +568,8 @@ class Shape(object):
     """Map f over each leaf-level array subshape.
 
     Args:
-      f: The function to apply. Whenever f returns None, the identity is
-        applied instead.
+      f: The function to apply. Whenever f returns None, the identity is applied
+        instead.
 
     Returns:
       A new Shape with the mapped leaves.
@@ -423,22 +594,56 @@ class Shape(object):
       raise ValueError('not an array shape')
     if not isinstance(minor_to_major, tuple):
       raise TypeError('minor_to_major must be a tuple')
-    updated = Shape.array_shape(
-        self.element_type(), self.dimensions(), minor_to_major)
+    updated = Shape.array_shape(self.element_type(), self.dimensions(),
+                                minor_to_major)
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
+  def with_major_to_minor_layout_if_absent(self):
+    """Returns a copy of a shape with missing layouts set to major-to-minor."""
+
+    def f(a):
+      if a.minor_to_major():
+        return None
+      return a.update_minor_to_major(tuple(xrange(a.rank() - 1, -1, -1)))
+
+    return self.map_leaves(f)
+
+  def serialize(self, proto):
+    """Serializes 'shape' into proto."""
+    if self.is_tuple():
+      proto.element_type = PrimitiveType.TUPLE
+      for shape in self.tuple_shapes():
+        shape.serialize(proto.tuple_shapes.add())
+    else:
+      proto.element_type = dtype_to_etype(self.element_type())
+      proto.dimensions.extend(self.dimensions())
+      proto.is_dynamic_dimension.extend([False for _ in self.dimensions()])
+      if self.minor_to_major():
+        proto.layout.format = Format.DENSE
+        proto.layout.minor_to_major.extend(self.minor_to_major())
+
+
+ProgramShape = collections.namedtuple('ProgramShape',
+                                      ('parameter_shapes', 'result_shape'))
+
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
-  if element_type == xla_data_pb2.TUPLE:
+  if element_type == PrimitiveType.TUPLE:
     shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
     return Shape.tuple_shape(shapes)
   else:
     return Shape.array_shape(dtype, dims)
 
 
+def _wrap_program_shape(shape_info):
+  arg_shapes, result_shape = shape_info
+  return ProgramShape([_wrap_shape(arg) for arg in arg_shapes],
+                      _wrap_shape(result_shape))
+
+
 def require_numpy_array_layout(value):
   if isinstance(value, tuple):
     return tuple(require_numpy_array_layout(x) for x in value)
@@ -454,14 +659,16 @@ class CompileOptions(object):
   """
 
   def __init__(self):
-    self.generate_hlo_graph = None
-    self.dump_optimized_hlo_proto_to = None
-    self.dump_unoptimized_hlo_proto_to = None
-    self.dump_per_pass_hlo_proto_to = None
-    self.hlo_profile = False
+    self.xla_dump_to = None
+    self.dump_hlo_pass_re = None
+    self.dump_hlo_module_re = None
+    self.dump_hlo_as_text = None
+    self.dump_hlo_as_proto = None
+    self.hlo_profile = None
+    self.num_replicas = get_replica_count()
 
 
-def transfer_to_infeed(value, replica_number=None):
+def transfer_to_infeed(value, device_ordinal=0):
   """Transfers the given value into the XLA infeed queue.
 
   XLA's infeed queue is a single queue that feeds the "XLA virtual machine" with
@@ -471,96 +678,92 @@ def transfer_to_infeed(value, replica_number=None):
   Args:
     value: the value that the caller would like to enqueue into the XLA infeed
       queue
-    replica_number: the replica number to infeed the value to -- if not
-      provided, then the default replica (trivially replica 0) is used.
+    device_ordinal: the device to infeed the value to. Each device has a
+      distinct infeed queue.
   """
-  if replica_number is None:
-    c_api.TransferToInfeedLocal(require_numpy_array_layout(value))
-  else:
-    c_api.TransferToInfeedLocalReplica(
-        require_numpy_array_layout(value), replica_number)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  backend.client.TransferToInfeed(
+      require_numpy_array_layout(value), device_ordinal)
 
 
-def transfer_from_outfeed(shape, replica_number=None):
-  """Transfers a literal of the given shape from replica_number's outfeed.
+def transfer_from_outfeed(shape, device_ordinal=0):
+  """Transfers a literal of the given shape from `device_ordinal`'s outfeed.
 
   Args:
     shape: The shape of the value to transfer from outfeed.
-    replica_number: The replica number ordinal to transfer the outfeed value
-      from. (Each replica has a distinct outfeed queue.)
+    device_ordinal: The device ordinal to transfer the outfeed value from. Each
+      device has a distinct outfeed queue..
 
   Returns:
     The literal value that is produced from the outfeed queue.
   """
-  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  return backend.client.TransferFromOutfeed(shape, device_ordinal)
 
 
-class LocalComputation(object):
-  """Python wrapper for a local XLA Computation.
+class Computation(object):
+  """Python wrapper for an XLA Computation.
 
-  A LocalComputation can be executed if it is compiled. Otherwise, it
-  can still be used as a Computation where required by the
-  ComputationBuilder methods.
+  A Computation can be compiled to form an Executable, or used as a
+  subcomputation in ComputationBuilder methods.
   """
 
-  def __init__(self, c_computation, is_compiled, backend=XLA_LOCAL_BACKEND):
+  def __init__(self, c_computation, backend=None):
     self._c_computation = c_computation
+    # The backend argument is deprecated. Pass a backend to Compile() instead.
     self._backend = backend
-    self._is_compiled = is_compiled
-
-    # Ensure a reference to C-based destructor for use in __del__.
-    if is_compiled:
-      if backend.backend_type == BackendType.XRT:
-        assert isinstance(c_computation, c_api.CompiledXrtComputation)
-        self._delete = c_api.DeleteCompiledXrtComputation
-      else:
-        assert isinstance(c_computation, c_api.CompiledLocalComputation)
-        self._delete = c_api.DeleteCompiledLocalComputation
-    else:
-      assert isinstance(c_computation, c_api.LocalComputation)
-      self._delete = c_api.DeleteLocalComputation
+    self._delete_computation = c_api.DeleteComputation
 
   @property
   def computation(self):
-    if self._is_compiled:
-      raise ValueError(
-          'Attempt to read the XLA computation of a compiled LocalComputation.')
     return self._c_computation
 
-  def GetProto(self):
-    """Get the HloModuleProto proto object in this local computation.
+  def GetSerializedProto(self):
+    """Gets the serialized HloModuleProto proto object in this computation.
 
     Returns:
-       An HloModuleProto proto object that has the whole-graph information.
+       A string containing a serialized HloModuleProto proto containing the
+       computation and its dependencies.
     """
-    serialized = self.computation.GetSerializedProto()
-    proto = hlo_pb2.HloModuleProto.FromString(serialized)
-    return proto
+    return self.computation.GetSerializedProto()
+
+  def GetHloText(self):
+    """Get the textual HLO representation of this computation.
 
-  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
-    """Compiles an un-compiled local computation.
+    Returns:
+       A string containing the textual HLO.
+    """
+    return self.computation.GetHloText()
+
+  def GetHloDotGraph(self):
+    """Get a Graphviz Dot representation of this computation.
+
+    Returns:
+       A string containing the graphviz dot graph.
+    """
+    return self.computation.GetHloDotGraph()
 
-    Local computations are the result of a "LocalComputationBuild'ing" process
-    -- they start in uncompiled form, and via a call to Compile() turn into a
-    compiled local computation.
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None,
+              backend=None):
+    """Compiles a computation.
 
-    Raises:
-      ValueError: if this is already a compiled local computation.
+    Computations are the result of a "ComputationBuild'ing" process.
 
     Arguments:
       argument_shapes: parameter shapes -- they are first laid out by layout_fn
         if layout_fn is provided. Otherwise, the default layout for those shapes
         will be used.
-      compile_options: options to use for compilation, includes an optional
-        laid out result shape for the computation.
+      compile_options: options to use for compilation, includes an optional laid
+        out result shape for the computation.
       layout_fn: lambda that is used to lay out the argument/result shapes.
+      backend: a `Backend` for which an executable should be generated.
 
     Returns:
-      A newly *compiled* local computation instance.
+      A Executable instance.
     """
-    if self._is_compiled:
-      raise ValueError('Attempt to compile a compiled local XLA computation.')
-
+    backend = backend or self._backend or _get_default_local_backend()
     result_shape = _wrap_shape(self.computation.GetReturnValueShape())
 
     if layout_fn:
@@ -573,32 +776,52 @@ class LocalComputation(object):
 
     compile_options = compile_options or CompileOptions()
     compile_options.result_shape = result_shape
-    if self._backend.backend_type == BackendType.XRT:
-      c = self.computation.CompileForXrt(
-          argument_shapes, _maybe_encode_string(self._backend.target))
-    else:
-      c = self.computation.Compile(argument_shapes, compile_options)
-    return LocalComputation(c, is_compiled=True, backend=self._backend)
+    c = backend.compile(self.computation, argument_shapes, result_shape,
+                        compile_options)
+    return Executable(c, backend=backend)
 
   def CompileWithExampleArguments(self,
                                   arguments=(),
                                   compile_options=None,
-                                  layout_fn=None):
+                                  layout_fn=None,
+                                  backend=None):
     return self.Compile(
         argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
-        layout_fn=layout_fn)
+        layout_fn=layout_fn,
+        backend=backend)
+
+  def GetProgramShape(self):
+    return _wrap_program_shape(self._c_computation.GetProgramShape())
 
   def GetReturnValueShape(self):
     return _wrap_shape(self._c_computation.GetReturnValueShape())
 
+  def __del__(self):
+    if self._c_computation:
+      self._delete_computation(self._c_computation)
+
+
+class Executable(object):
+  """Python wrapper for an XLA Executable."""
+
+  def __init__(self, c_executable, backend=None):
+    self._c_executable = c_executable
+    self._device_ordinals = c_executable.DeviceOrdinals()
+    self._backend = backend
+
+  def DeviceOrdinals(self):
+    """Returns a list containing the device ordinals for each replica."""
+    return self._device_ordinals
+
   def Execute(self, arguments=(), check_for_deleted_args=True):
     """Execute on one replica with LocalBuffer arguments and return value."""
     if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
       raise ValueError('Executing with deleted local buffer argument')
     raw_args = [arg.c_buffer for arg in arguments]
-    output_buffer = self._c_computation.Execute(raw_args)
-    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+    output_buffer = self._backend.execute(self._c_executable, raw_args)
+    return LocalBuffer(
+        output_buffer, backend=self._backend, device=self._device_ordinals[0])
 
   def ExecutePerReplica(self, arguments=None):
     """Execute on many replicas with LocalBuffer arguments and return value.
@@ -608,14 +831,12 @@ class LocalComputation(object):
         sequence comprises the arguments for execution on the i'th replica.
 
     Returns:
-      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      A list of the computation's outputs for each replica, as a LocalBuffer. If
       a shallow sequence of arguments was passed in for `arguments`, then the
       sole, zero'th replica's output is returned instead, as a LocalBuffer.
     """
-    if not self._is_compiled:
-      raise ValueError('Cannot execute an uncompiled local XLA computation.')
     if arguments is None:
-      arguments = ((),) * get_replica_count()
+      arguments = ((),) * len(self._device_ordinals)
     else:
       arguments = [list(replica_args) for replica_args in arguments]
 
@@ -624,37 +845,35 @@ class LocalComputation(object):
       for arg in replica_args:
         if arg.is_deleted():
           raise ValueError('Executing with deleted local buffer argument')
-        if arg.replica() != replica:
+        if arg.device() != self._device_ordinals[replica]:
           raise ValueError(
-              'Executing on replica {} with argument from replica {}'.format(
-                  replica, arg.replica()))
+              'Executing on device {} with argument from device {}'.format(
+                  self._device_ordinals[replica], arg.device()))
 
     # Pull out argument buffer handles
+    # pylint: disable=g-complex-comprehension
     stripped_args = [
         [arg.c_buffer for arg in replica_args] for replica_args in arguments
     ]
 
     # Execute
-    if self._backend.backend_type == BackendType.XRT:
-      if len(stripped_args) > 1:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      output_buffers = [self._c_computation.Execute(stripped_args[0])]
-    else:
-      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
-      size = output_buffer_tup.size()
-      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
+    output_buffers = self._backend.execute_replicated(self._c_executable,
+                                                      stripped_args)
 
     # Wrap output handles in LocalBuffer instances
     return tuple(
-        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        LocalBuffer(
+            output_buffer,
+            backend=self._backend,
+            device=self._device_ordinals[replica])
         for replica, output_buffer in enumerate(output_buffers))
 
   def ExecuteWithPythonValues(self, arguments=()):
     """Execute on one replica with Python values as arguments and output."""
 
     def put(arg):
-      return LocalBuffer.from_pyval(arg, backend=self._backend)
+      return LocalBuffer.from_pyval(
+          arg, device=self._device_ordinals[0], backend=self._backend)
 
     arguments = [put(arg) for arg in arguments]
     return self.Execute(arguments).to_py()
@@ -662,24 +881,27 @@ class LocalComputation(object):
   def ExecuteWithPythonValuesPerReplica(self, arguments):
     """Execute on many replicas with Python values as arguments and output."""
 
-    def put(arg, replica):
-      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+    def put(arg, device):
+      return LocalBuffer.from_pyval(arg, device, backend=self._backend)
 
-    arguments = [[put(arg, replica)
-                  for arg in replica_args]
-                 for replica, replica_args in enumerate(arguments)]
+    # pylint: disable=g-complex-comprehension
+    arguments = [[
+        put(arg, self._device_ordinals[replica]) for arg in replica_args
+    ] for replica, replica_args in enumerate(arguments)]
     return [out.to_py() for out in self.ExecutePerReplica(arguments)]
 
   def __del__(self):
-    self._delete(self._c_computation)
+    # Python may have freed c_api first.
+    if c_api and self._c_executable:
+      self._backend.delete_executable(self._c_executable)
 
 
 class ComputationBuilder(object):
   """XLA computation builder.
 
   Enqueues XLA ops in sequence and in order to build a
-  LocalComputation, which in turn can be compiled into a
-  CompiledLocalComputation, which in turn can be locally executed.
+  Computation, which in turn can be compiled into a
+  LocalExecutable, which in turn can be locally executed.
   """
 
   # The methods of this class map 1-to-1 onto the XLA C++
@@ -690,16 +912,24 @@ class ComputationBuilder(object):
   # pylint: disable=g-doc-args
 
   def __init__(self, name):
-    self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
+    self._client = c_api.ComputationBuilder(name.encode('utf8'))
     self._parameter_numbering = itertools.count()
 
-  def Build(self, root=None, backend=XLA_LOCAL_BACKEND):
+  def Build(self, root=None, backend=None):
+    """Builds a `Computation` from the contents of the builder.
+
+    Args:
+      root: if not None, the operator containing the return value of the
+        computation.
+      backend: deprecated. Pass a `backend` to `Computation.Compile` instead.
+
+    Returns:
+      A `Computation`.
+    """
     if root is not None:
-      return LocalComputation(
-          self._client.BuildWithRoot(root), is_compiled=False, backend=backend)
+      return Computation(self._client.BuildWithRoot(root), backend=backend)
     else:
-      return LocalComputation(
-          self._client.Build(), is_compiled=False, backend=backend)
+      return Computation(self._client.Build(), backend=backend)
 
   def SetOpMetadata(self, op_metadata):
     """Set metadata for operations that are about to be enqueued."""
@@ -732,8 +962,8 @@ class ComputationBuilder(object):
     """Enqueues a constant op onto the computation.
 
     Args:
-      value: value for the constant, as a np.array with an explicit dtype set
-             to one of the supported types.
+      value: value for the constant, as a np.array with an explicit dtype set to
+        one of the supported types.
 
     Returns:
       A LocalOp.
@@ -802,9 +1032,9 @@ class ComputationBuilder(object):
     Args:
       shape: the parameter's shape as a Shape object.
       name: optional string name for the parameter.
-      parameter_num: parameter number in the computation function. If None,
-        the next linear parameter number is used. The default value capability
-        can be used for auto-numbering. If you're using auto-numbering for some
+      parameter_num: parameter number in the computation function. If None, the
+        next linear parameter number is used. The default value capability can
+        be used for auto-numbering. If you're using auto-numbering for some
         parameters, use it for *all* parameters to avoid clashes.
 
     Returns:
@@ -821,8 +1051,8 @@ class ComputationBuilder(object):
     """Enqueues a Parameter op onto the computation.
 
     Args:
-      value: a Numpy array, or a nested tuple thereof, from which the
-        shape is inferred.
+      value: a Numpy array, or a nested tuple thereof, from which the shape is
+        inferred.
       name: as in ParameterWithShape.
       parameter_num: as in ParameterWithShape.
 
@@ -877,8 +1107,8 @@ class ComputationBuilder(object):
     Args:
       operand: the operand LocalOp to broadcast.
       shape: tuple of integers, the expected output shape.
-      broadcast_dimensions: tuple of integers identifying which dimensions
-        of the output are to be broadcast into.
+      broadcast_dimensions: tuple of integers identifying which dimensions of
+        the output are to be broadcast into.
 
     Returns:
       A LocalOp representing the added broadcast-in-dimensions op.
@@ -930,20 +1160,28 @@ class ComputationBuilder(object):
   def GetComputationStats(self):
     raise NotImplementedError()
 
+  def ReplicaId(self):
+    """Enqueues a ReplicaId operation onto the computation.
+
+    Returns:
+      A LocalOp representing the replica id.
+    """
+    return self._client.ReplicaId()
+
   def Pad(self, operand, padding_value, padding_config):
     """Enqueues a Pad operation onto the computation.
 
     Args:
       operand: LocalOp representing the array to pad.
       padding_value: LocalOp representing the scalar pad value.
-      padding_config: either an xla_data_pb2.PaddingConfig or a list of integer
-        triples (edge_padding_low, edge_padding_high, interior_padding)
-        representing the configuration of the padding operation.
+      padding_config: either a PaddingConfig or a list of integer triples
+        (edge_padding_low, edge_padding_high, interior_padding) representing the
+        configuration of the padding operation.
 
     Returns:
       A LocalOp representing the added Pad op.
     """
-    if not isinstance(padding_config, xla_data_pb2.PaddingConfig):
+    if isinstance(padding_config, tuple) or isinstance(padding_config, list):
       padding_config = GetPaddingConfigFromTriples(padding_config)
     return self._client.Pad(operand, padding_value, padding_config)
 
@@ -964,16 +1202,62 @@ class ComputationBuilder(object):
       dimensions = tuple(range(ndim))
     return self._client.Reshape(operand, dimensions, new_sizes)
 
-  def CrossReplicaSum(self, operand):
+  def AllToAll(self,
+               operand,
+               split_dimension,
+               concat_dimension,
+               replica_groups=None):
+    """AllToAll op.
+
+    Args:
+      operand: LocalOp representing the input array
+      split_dimension: the dimension along which the operand is split
+      concat_dimension: the dimension along which the split blocks are
+        concatenated
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas} into equally-sized replica groups
+        within which the all-to-all is performed. If not supplied or None (the
+        default), all replicas belong to the same group.
+
+    Returns:
+      A LocalOp that represents the all-to-all concatenation.
+    """
+    if replica_groups is None:
+      replica_groups_protos = []  # special value for XLA API
+    else:
+      replica_groups = list(replica_groups)
+      replica_groups_protos = [
+          _make_replica_group_proto(group) for group in replica_groups
+      ]
+    if not replica_groups:
+      split_count = get_replica_count()
+    else:
+      split_count = len(replica_groups[0])
+      if not all(split_count == len(g) for g in replica_groups):
+        raise ValueError('Replica groups must be equally sized')
+    return self._client.AllToAll(operand, split_dimension, concat_dimension,
+                                 split_count, replica_groups_protos)
+
+  def CrossReplicaSum(self, operand, replica_groups=None):
     """CrossReplicaSum op.
 
     Args:
       operand: the operand to sum across replica instances.
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas} into equally-sized replica groups
+        within which the cross-replica sum is performed. If not supplied or None
+        (the default), all replicas belong to the same group.
 
     Returns:
-      A LocalOp that has the sum of the value among all replicas.
+      A LocalOp that represents on each replica the sum of its group's values.
     """
-    return self._client.CrossReplicaSum(operand)
+    if replica_groups is None:
+      replica_groups = []  # special value for XLA API
+    else:
+      replica_groups = [
+          _make_replica_group_proto(group) for group in replica_groups
+      ]
+    return self._client.CrossReplicaSum(operand, replica_groups)
 
   def Collapse(self, operand, dimensions):
     """Collapse op."""
@@ -1000,8 +1284,8 @@ class ComputationBuilder(object):
     """Select and scatter op, used by the gradient of ReduceWindow.
 
     Args:
-      operand: LocalOp for array of dimension N and type T over
-        which the windows slide.
+      operand: LocalOp for array of dimension N and type T over which the
+        windows slide.
       select: Computation of type (T, T) -> Pred to apply to the elements of
         each window to indicate which element is selected.
       window_dimensions: sequence of N integers for dimensions of the window.
@@ -1016,8 +1300,8 @@ class ComputationBuilder(object):
       A LocalOp representing the added SelectAndScatter op.
     """
     pads = _convert_padding_type_to_pad_values(
-        padding, self.GetShape(operand).dimensions(),
-        window_dimensions, window_strides)
+        padding, self.GetShape(operand).dimensions(), window_dimensions,
+        window_strides)
     return self._client.SelectAndScatterWithGeneralPadding(
         operand, select.computation, window_dimensions, window_strides, pads,
         source, init_value, scatter.computation)
@@ -1071,8 +1355,8 @@ class ComputationBuilder(object):
 
     Args:
       operand: LocalOp for the N dimensional array to be sliced.
-      start_indices: LocalOp for the 1D array of N integers
-        containing the starting indices of the slice.
+      start_indices: LocalOp for the 1D array of N integers containing the
+        starting indices of the slice.
       slice_sizes: iterable of N integers containing the slice sizes in each
         dimension.
 
@@ -1089,6 +1373,7 @@ class ComputationBuilder(object):
       update: N dimensional array comprising the slice update.
       start_indices: Rank-1 array of N integers comprising the starting indices
         of the slice along each dimension.
+
     Returns:
       A LocalOp representing the added DynamicUpdateSlice op.
     """
@@ -1122,8 +1407,8 @@ class ComputationBuilder(object):
 
     Args:
       computation_to_apply: a Computation object.
-      operands: an iterable of LocalOp. The number and types of
-        operands must match the arity of computation_to_apply.
+      operands: an iterable of LocalOp. The number and types of operands must
+        match the arity of computation_to_apply.
 
     Returns:
       A LocalOp representing the added call op.
@@ -1200,8 +1485,8 @@ class ComputationBuilder(object):
       A LocalOp representing the added ReduceWindow op.
     """
     pads = _convert_padding_type_to_pad_values(
-        padding, self.GetShape(operand).dimensions(), window_dimensions,
-        window_strides)
+        padding,
+        self.GetShape(operand).dimensions(), window_dimensions, window_strides)
     return self._client.ReduceWindowWithGeneralPadding(
         operand, init_value, computation_to_apply.computation,
         window_dimensions, window_strides, (), (), pads)
@@ -1234,10 +1519,8 @@ class ComputationBuilder(object):
 
     Args:
       mu: A LocalOp to an F32 scalar specifying the mean.
-      sigma: A LocalOp to an F32 scalar specifying the standard
-        deviation.
+      sigma: A LocalOp to an F32 scalar specifying the standard deviation.
       dims: A 1D array-like of nonnegative integers specifying the dimensions.
-
     Returns: a LocalOp to the generated array of F32 values.
     """
     shape = Shape.array_shape(self.GetShape(mu).element_type(), dims)
@@ -1247,16 +1530,15 @@ class ComputationBuilder(object):
     """Enqueues an RngUniform operation onto the computation.
 
     Args:
-      a: a LocalOp to an F32, S32, or U32 scalar (consistent with
-        the type of b) specifying the low end of the interval [a, b) over which
-        values are generated.
-      b: a LocalOp to an F32, S32, or U32 scalar (consistent with
-        the type of a) specifying the high end of the interval [a, b) over which
-        values are generated.
+      a: a LocalOp to an F32, S32, or U32 scalar (consistent with the type of b)
+        specifying the low end of the interval [a, b) over which values are
+        generated.
+      b: a LocalOp to an F32, S32, or U32 scalar (consistent with the type of a)
+        specifying the high end of the interval [a, b) over which values are
+        generated.
       dims: A 1D array-like of nonnegative integers specifying the dimensions.
-
-    Returns: a LocalOp to the generated array of values with the
-      same numeric type (F32, S32, or U32) as the arguments a and b.
+    Returns: a LocalOp to the generated array of values with the same numeric
+      type (F32, S32, or U32) as the arguments a and b.
     """
     shape = Shape.array_shape(self.GetShape(a).element_type(), dims)
     return self._client.RngUniform(a, b, shape)
@@ -1268,7 +1550,6 @@ class ComputationBuilder(object):
       cond: a Computation for the loop condition, which has type T -> PRED
       body: a Computation for the loop body, which has type T -> T
       init: a LocalOp for the initial parameter, which has type T
-
     Returns: a LocalOp representing the While operation.
     """
     return self._client.While(cond.computation, body.computation, init)
@@ -1283,19 +1564,17 @@ class ComputationBuilder(object):
       true_computation: a Computation to apply to true_operand, type T_0 -> S
       false_operand: a ComputationDatahandle of type T_1
       false_computation: a Computation to apply to false_operand, type T_1 -> S
-
     Returns: a LocalOp representing the Conditional operation.
     """
-    return self._client.Conditional(
-        pred, true_operand, true_computation.computation, false_operand,
-        false_computation.computation)
+    return self._client.Conditional(pred, true_operand,
+                                    true_computation.computation, false_operand,
+                                    false_computation.computation)
 
   def IsConstant(self, operand):
     """Checks whether the given operand is a compile-time constant.
 
     Args:
       operand: a ComputationDataHandle to test.
-
     Returns: bool indicating whether `operand` is a compile-time constant,
       meaning its value does not depend on any parametersor, or on stateful
       operators such as `RngNormal` or `Infeed`.
@@ -1307,7 +1586,7 @@ class ComputationBuilder(object):
 
     Args:
       operand: a LocalOp to test.
-    Returns: a LocalComputation that is rooted on the given `operand` which is a
+    Returns: a Computation that is rooted on the given `operand` which is a
       compile-time constant.
     """
     return self._client.BuildConstantSubGraph(operand)
@@ -1318,7 +1597,6 @@ class ComputationBuilder(object):
     Args:
       lhs: LocalOp for the rank 1 or rank 2 left-hand-side array.
       rhs: LocalOp for the rank 1 or rank 2 right-hand-side array.
-
     Returns: a LocalOp representing the Dot operation.
     """
     return self._client.Dot(lhs, rhs)
@@ -1329,14 +1607,13 @@ class ComputationBuilder(object):
     Args:
       lhs: LocalOp for the left-hand-side array.
       rhs: LocalOp for the right-hand-side array.
-      dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested
-        tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
+      dimension_numbers: either a DotDimensionNumbers or a nested tuple
+        ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
         integers representing the dimensions to treat as contracting dimensions
         and batch dimensions on each input operand.
-
     Returns: a LocalOp representing the DotGeneral operation.
     """
-    if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers):
+    if isinstance(dimension_numbers, tuple):
       dimension_numbers = GetDotDimensionsFromLists(dimension_numbers)
     return self._client.DotGeneral(lhs, rhs, dimension_numbers)
 
@@ -1349,15 +1626,15 @@ class ComputationBuilder(object):
       window_strides: length-N array-like of integer kernel strides.
       padding: PaddingType representing either 'SAME' or 'VALID' padding.
       feature_group_count: number of feature groups for grouped convolution.
-
     Returns: a LocalOp representing the Conv operation.
     """
     pads = _convert_padding_type_to_pad_values(
-        padding, self.GetShape(lhs).dimensions()[2:],
+        padding,
+        self.GetShape(lhs).dimensions()[2:],
         self.GetShape(rhs).dimensions()[2:], window_strides)
     return self.ConvGeneralDilated(
-        lhs, rhs, window_strides, pads, (), (),
-        dimension_numbers=None, feature_group_count=feature_group_count)
+        lhs, rhs, window_strides, pads, (), (), dimension_numbers=None,
+        feature_group_count=feature_group_count)
 
   def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding,
                              lhs_dilation, rhs_dilation, feature_group_count=1):
@@ -1382,7 +1659,7 @@ class ComputationBuilder(object):
   def _GetConvDimensionNumbers(self, num_spatial_dims):
     """Create ConvolutionDimensionNumbers proto for convolutions."""
     nd = num_spatial_dims
-    dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers()
+    dimension_numbers = ConvolutionDimensionNumbers()
     dimension_numbers.input_batch_dimension = 0
     dimension_numbers.input_feature_dimension = 1
     dimension_numbers.output_batch_dimension = 0
@@ -1406,35 +1683,33 @@ class ComputationBuilder(object):
       padding: length-N array-like of pairs of integers of (low, high) padding.
       lhs_dilation: length-N array-like of integer dilation factors.
       rhs_dilation: length-N array-like of integer dilation factors.
-      dimension_numbers: optional, either an
-        xla_data_pb2.ConvolutionDimensionNumbers proto instance or a tuple
-        (lhs_spec, rhs_spec, out_spec) where each element is a string of length
-        N+2 identifying by position (1) batch dimensions in lhs, rhs, and the
-        output with the character 'N', (2) feature dimensions in lhs and the
-        output with the character 'C', (3) input and output feature dimensions
-        in rhs with the characters 'I' and 'O' respectively, and (4) spatial
-        dimension correspondences between lhs, rhs, and the output using any
-        distinct characters. For example, to indicate dimension numbers
-        consistent with the Conv operation with two spatial dimensions, one
-        could use ('NCHW', 'OIHW', 'NCHW'). As another example, to indicate
-        dimension numbers consistent with the TensorFlow Conv2D operation, one
-        could use ('NHWC', 'HWIO', 'NHWC'). When using the latter form of
-        convolution dimension specification, window strides are associated with
-        spatial dimension character labels according to the order in which the
-        labels appear in the rhs_spec string, so that window_strides[0] is
-        matched with the dimension corresponding to the first character
-        appearing in rhs_spec that is not 'I' or 'O'. By default, use the same
-        dimension numbering as Conv and ConvWithGeneralPadding.
+      dimension_numbers: optional, either a ConvolutionDimensionNumbers object
+        or a tuple (lhs_spec, rhs_spec, out_spec). Each element is a string of
+        length N+2 identifying by position: (1) batch dimensions in lhs, rhs,
+          and the output with the character 'N', (2) feature dimensions in lhs
+          and the output with the character 'C', (3) input and output feature
+          dimensions in rhs with the characters 'I' and 'O' respectively, and
+          (4) spatial dimension correspondences between lhs, rhs, and the output
+          using any distinct characters. For example, to indicate dimension
+          numbers consistent with the Conv operation with two spatial
+          dimensions, one could use ('NCHW', 'OIHW', 'NCHW'). As another
+          example, to indicate dimension numbers consistent with the TensorFlow
+          Conv2D operation, one could use ('NHWC', 'HWIO', 'NHWC'). When using
+          the latter form of convolution dimension specification, window strides
+          are associated with spatial dimension character labels according to
+          the order in which the labels appear in the rhs_spec string, so that
+          window_strides[0] is matched with the dimension corresponding to the
+          first character appearing in rhs_spec that is not 'I' or 'O'. By
+          default, use the same dimension numbering as Conv and
+          ConvWithGeneralPadding.
       feature_group_count: number of feature groups for grouped convolution.
-
     Returns: a LocalOp representing the ConvGenralDilated operation.
     """
     if dimension_numbers is None:
       dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
-    elif not isinstance(dimension_numbers,
-                        xla_data_pb2.ConvolutionDimensionNumbers):
+    elif isinstance(dimension_numbers, tuple):
       lhs_spec, rhs_spec, out_spec = dimension_numbers
-      dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers()
+      dimension_numbers = ConvolutionDimensionNumbers()
 
       dimension_numbers.input_batch_dimension = lhs_spec.index('N')
       dimension_numbers.input_feature_dimension = lhs_spec.index('C')
@@ -1451,10 +1726,9 @@ class ComputationBuilder(object):
       dimension_numbers.output_spatial_dimensions.extend(
           sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}),
                  key=lambda i: rhs_spec.index(out_spec[i])))
-    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding,
-                                           lhs_dilation, rhs_dilation,
-                                           dimension_numbers,
-                                           feature_group_count)
+    return self._client.ConvGeneralDilated(
+        lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation,
+        dimension_numbers, feature_group_count)
 
   def Sort(self, operand, dimension=-1):
     """Enqueues a sort operation onto the computation."""
@@ -1464,31 +1738,50 @@ class ComputationBuilder(object):
     """Enqueues a key-value sort operation onto the computation."""
     return self._client.SortKeyVal(keys, values, dimension)
 
-  def Cholesky(self, a):
+  def Cholesky(self, a, lower=True):
     """Enqueues a Cholesky decomposition onto the computation."""
-    return self._client.Cholesky(a)
+    return self._client.Cholesky(a, lower)
 
   def QR(self, a, full_matrices=True):
     """Enqueues a QR decomposition onto the computation."""
     return self._client.QR(a, full_matrices)
 
-  def TriangularSolve(self, a, b, left_side=False, lower=False,
-                      transpose_a=False, conjugate_a=False):
+  def TriangularSolve(self,
+                      a,
+                      b,
+                      left_side=False,
+                      lower=False,
+                      transpose_a=False,
+                      conjugate_a=False,
+                      unit_diagonal=False):
     """Enqueues a triangular-solve operation onto the computation."""
-    return self._client.TriangularSolve(
-        a, b, left_side, lower, transpose_a, conjugate_a)
+    if not transpose_a:
+      transpose = 1
+      if conjugate_a:
+        a = self.Conj(a)
+    else:
+      transpose = 3 if conjugate_a else 2
+    return self._client.TriangularSolve(a, b, left_side, lower, unit_diagonal,
+                                        transpose)
+
+  def Eigh(self, a, full_matrices=True):
+    """Enqueues a symmetric/Hermitian eigendecomposition."""
+    return self._client.Eigh(a, full_matrices)
+
+  def SVD(self, a):
+    """Enqueues a singular value decomposition."""
+    return self._client.SVD(a)
 
   def Gather(self, a, start_indices, dimension_numbers, slice_sizes):
     """Enqueues a Gather operation onto the computation."""
-    return self._client.Gather(a, start_indices, dimension_numbers,
-                               slice_sizes)
+    return self._client.Gather(a, start_indices, dimension_numbers, slice_sizes)
 
   def Scatter(self, a, scatter_indices, updates, update_computation,
               dimension_numbers):
     """Enqueues a Scatter operation onto the computation."""
     return self._client.Scatter(
         a, scatter_indices, updates, update_computation.computation,
-        dimension_numbers,)
+        dimension_numbers)
 
 
 def _forward_methods_to_local_builder():
@@ -1496,7 +1789,7 @@ def _forward_methods_to_local_builder():
 
   Set up methods, corresponding to unary and binary XLA operations,
   whose calls are forwarded in a boilerplate manner to the underlying
-  LocalComputationBuilder C-extension API.
+  ComputationBuilder C-extension API.
   """
 
   def forward_to_local_builder_with_handles(target_method, is_binop=False):
@@ -1516,22 +1809,27 @@ def _forward_methods_to_local_builder():
 
   for method_name in _UNARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name))
+        getattr(c_api.ComputationBuilder, method_name))
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
   for method_name in _BINARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name), is_binop=True)
+        getattr(c_api.ComputationBuilder, method_name), is_binop=True)
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
 
 _forward_methods_to_local_builder()
 
+_default_replica_count = 1
+
 
 def initialize_replica_count(replica_count):
-  """Initializes the desired replica count to use on XLA service init.
+  """Initializes the default replica count to use.
+
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
 
   Args:
     replica_count: number of replicas that are desired for set up during XLA
@@ -1540,29 +1838,30 @@ def initialize_replica_count(replica_count):
   Raises:
     A runtime exception if the XLA service has already been initialized.
   """
-  c_api.InitializeReplicaCount(replica_count)
-
+  global _default_replica_count
+  _default_replica_count = replica_count
 
-def initialize_platform_name(platform_name):
-  """Initializes the desired platform name to use on XLA service init.
 
-  Args:
-    platform_name: string name of platform.
+def get_replica_count():
+  """Returns the default replica count.
 
-  Raises:
-    A runtime exception if the XLA service has already been initialized.
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
   """
-  platform_name = _maybe_encode_string(platform_name)
-  c_api.InitializePlatformName(platform_name)
+  return _default_replica_count
 
 
-def get_replica_count():
-  """Returns the current replica count used for the XLA service.
+def initialize_platform_name(platform_name):
+  """Initializes the default platform name to use for XLA.
 
-  Note: this will return a value whether the XLA service has been initialized
-  yet or not.
+  Args:
+    platform_name: string name of platform.
   """
-  return c_api.GetReplicaCount()
+  global _default_platform_name
+  _default_platform_name = platform_name
+
+  # Make sure the platform is valid by trying to instantiate it.
+  _get_default_local_backend()
 
 
 def register_cpu_custom_call_target(name, fn):
@@ -1575,22 +1874,111 @@ def register_cpu_custom_call_target(name, fn):
   c_api.RegisterCpuCustomCallTarget(name, fn)
 
 
+class PaddingConfigDimension(object):
+  """Python representation of a xla.PaddingConfigDimension protobuf."""
+  __slots__ = ('edge_padding_low', 'edge_padding_high', 'interior_padding')
+
+  def __init__(self):
+    self.edge_padding_low = []
+    self.edge_padding_high = []
+    self.interior_padding = []
+
+
+class PaddingConfig(object):
+  """Python representation of a xla.PaddingConfig protobuf."""
+  __slots__ = ('dimensions',)
+
+  def __init__(self):
+    self.dimensions = []
+
+
 def GetPaddingConfigFromTriples(triples):
   """Create PaddingConfig proto from list of triples of integers."""
-  padding_config = xla_data_pb2.PaddingConfig()
+  padding_config = PaddingConfig()
   for lo, hi, interior in triples:
-    dimension = padding_config.dimensions.add()
+    dimension = PaddingConfigDimension()
     dimension.edge_padding_low = lo
     dimension.edge_padding_high = hi
     dimension.interior_padding = interior
+    padding_config.dimensions.append(dimension)
   return padding_config
 
 
+class DotDimensionNumbers(object):
+  """Python representation of a xla.DotDimensionNumbers protobuf."""
+  __slots__ = ('lhs_contracting_dimensions', 'rhs_contracting_dimensions',
+               'lhs_batch_dimensions', 'rhs_batch_dimensions')
+
+  def __init__(self):
+    self.lhs_contracting_dimensions = []
+    self.rhs_contracting_dimensions = []
+    self.lhs_batch_dimensions = []
+    self.rhs_batch_dimensions = []
+
+
 def GetDotDimensionsFromLists(dimension_numbers):
   (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers
-  dot_dims_proto = xla_data_pb2.DotDimensionNumbers()
+  dot_dims_proto = DotDimensionNumbers()
   dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract)
   dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract)
   dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch)
   dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch)
   return dot_dims_proto
+
+
+class ConvolutionDimensionNumbers(object):
+  """Python representation of a xla.ConvolutionDimensionNumbers protobuf."""
+  __slots__ = ('input_batch_dimension', 'input_feature_dimension',
+               'input_spatial_dimensions', 'kernel_input_feature_dimension',
+               'kernel_output_feature_dimension', 'kernel_spatial_dimensions',
+               'output_batch_dimension', 'output_feature_dimension',
+               'output_spatial_dimensions')
+
+  def __init__(self):
+    self.input_batch_dimension = 0
+    self.input_feature_dimension = 0
+    self.input_spatial_dimensions = []
+    self.kernel_input_feature_dimension = 0
+    self.kernel_output_feature_dimension = 0
+    self.kernel_spatial_dimensions = []
+    self.output_batch_dimension = 0
+    self.output_feature_dimension = 0
+    self.output_spatial_dimensions = []
+
+
+class GatherDimensionNumbers(object):
+  """Python representation of a xla.GatherDimensionNumbers protobuf."""
+  __slots__ = ('offset_dims', 'collapsed_slice_dims', 'start_index_map',
+               'index_vector_dim')
+
+  def __init__(self):
+    self.offset_dims = []
+    self.collapsed_slice_dims = []
+    self.start_index_map = []
+    self.index_vector_dim = 0
+
+
+class ScatterDimensionNumbers(object):
+  """Python representation of a xla.ScatterDimensionNumbers protobuf."""
+  __slots__ = ('update_window_dims', 'inserted_window_dims',
+               'scatter_dims_to_operand_dims', 'index_vector_dim')
+
+  def __init__(self):
+    self.update_window_dims = []
+    self.inserted_window_dims = []
+    self.scatter_dims_to_operand_dims = []
+    self.index_vector_dim = 0
+
+
+class ReplicaGroup(object):
+  """Python representation of a xla.ReplicaGroup protobuf."""
+  __slots__ = ('replica_ids',)
+
+  def __init__(self):
+    self.replica_ids = []
+
+
+def _make_replica_group_proto(replica_group):
+  replica_group_proto = ReplicaGroup()
+  replica_group_proto.replica_ids.extend(replica_group)
+  return replica_group_proto
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 874e087eb6d4b785066edae21b1d11ebb024cd3e..1aedc43c02f1c127de5f1f971637203c3434cc91 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -24,12 +24,25 @@ import threading
 
 import numpy as np
 
+from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import custom_call_for_test
 from tensorflow.compiler.xla.python import xla_client
 import unittest
 
 
-class LocalComputationTest(unittest.TestCase):
+class EnumTest(unittest.TestCase):
+  """Verifies Python enumerations match their protocol buffer equivalents."""
+
+  def testPrimitiveType(self):
+    for name, value in xla_client.PrimitiveType.__members__.items():
+      self.assertEqual(value, getattr(xla_data_pb2, name))
+
+  def testFormat(self):
+    for name, value in xla_client.Format.__members__.items():
+      self.assertEqual(value, getattr(xla_data_pb2, name))
+
+
+class ComputationTest(unittest.TestCase):
   """Base class for running an XLA Computation through the local client."""
 
   def _NewComputation(self, name=None):
@@ -85,9 +98,35 @@ def NumpyArrayBool(*args, **kwargs):
   return np.array(*args, dtype=np.bool, **kwargs)
 
 
-class ComputationsWithConstantsTest(LocalComputationTest):
+class ComputationPrinting(unittest.TestCase):
+
+  def ExampleComputation(self):
+    builder = xla_client.ComputationBuilder("acomputation")
+    p0 = builder.ParameterFromNumpy(np.float32(0))
+    p1 = builder.ParameterFromNumpy(np.zeros((4,), np.float32))
+    builder.Mul(p0, p1)
+    return builder.Build()
+
+  def testComputationToHloText(self):
+    computation = self.ExampleComputation()
+    hlo_text = computation.GetHloText()
+    self.assertTrue(hlo_text.startswith("HloModule acomputation"))
+
+  def testComputationToHloGraph(self):
+    computation = self.ExampleComputation()
+    hlo_dot_graph = computation.GetHloDotGraph()
+    self.assertTrue(hlo_dot_graph.startswith("digraph "))
+
+
+class ComputationsWithConstantsTest(ComputationTest):
   """Tests focusing on Constant ops."""
 
+  def testConstantScalarSumS8(self):
+    c = self._NewComputation()
+    root = c.Add(c.Constant(np.int8(1)), c.Constant(np.int8(2)))
+    self.assertEqual(c.GetShape(root), c.GetReturnValueShape())
+    self._ExecuteAndCompareExact(c, expected=np.int8(3))
+
   def testConstantScalarSumF32(self):
     c = self._NewComputation()
     root = c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
@@ -204,16 +243,6 @@ class ComputationsWithConstantsTest(LocalComputationTest):
                         c.Constant(NumpyArrayS32([1])))
     self._ExecuteAndCompareClose(c, expected=[2**31 - 1])
 
-  def testGetProto(self):
-    c = self._NewComputation()
-    c.Add(
-        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])),
-        c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]])))
-    built = c.Build()
-    proto = built.GetProto()  # HloModuleProto
-    self.assertTrue(len(proto.computations) == 1)
-    self.assertTrue(len(proto.computations[0].instructions) == 3)
-
   def testSum2DF64(self):
     c = self._NewComputation()
     c.Add(
@@ -298,7 +327,7 @@ class ComputationsWithConstantsTest(LocalComputationTest):
     self._ExecuteAndCompareClose(c, expected=0.75)
 
 
-class ParametersTest(LocalComputationTest):
+class ParametersTest(ComputationTest):
   """Tests focusing on Parameter ops and argument-passing."""
 
   def setUp(self):
@@ -378,7 +407,7 @@ class ParametersTest(LocalComputationTest):
         expected=[-4.3, 1.3, -6.3, 3.3])
 
 
-class LocalBufferTest(LocalComputationTest):
+class LocalBufferTest(ComputationTest):
   """Tests focusing on execution with LocalBuffers."""
 
   def _Execute(self, c, arguments):
@@ -476,7 +505,7 @@ class LocalBufferTest(LocalComputationTest):
     self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32))
 
 
-class SingleOpTest(LocalComputationTest):
+class SingleOpTest(ComputationTest):
   """Tests for single ops.
 
   The goal here is smoke testing - to exercise the most basic functionality of
@@ -502,11 +531,11 @@ class SingleOpTest(LocalComputationTest):
 
   def testConvertElementType(self):
     xla_types = {
-        np.bool: xla_client.xla_data_pb2.PRED,
-        np.int32: xla_client.xla_data_pb2.S32,
-        np.int64: xla_client.xla_data_pb2.S64,
-        np.float32: xla_client.xla_data_pb2.F32,
-        np.float64: xla_client.xla_data_pb2.F64,
+        np.bool: xla_client.PrimitiveType.PRED,
+        np.int32: xla_client.PrimitiveType.S32,
+        np.int64: xla_client.PrimitiveType.S64,
+        np.float32: xla_client.PrimitiveType.F32,
+        np.float64: xla_client.PrimitiveType.F64,
     }
 
     def _ConvertAndTest(template, src_dtype, dst_dtype):
@@ -527,13 +556,13 @@ class SingleOpTest(LocalComputationTest):
 
   def testBitcastConvertType(self):
     xla_x32_types = {
-        np.int32: xla_client.xla_data_pb2.S32,
-        np.float32: xla_client.xla_data_pb2.F32,
+        np.int32: xla_client.PrimitiveType.S32,
+        np.float32: xla_client.PrimitiveType.F32,
     }
 
     xla_x64_types = {
-        np.int64: xla_client.xla_data_pb2.S64,
-        np.float64: xla_client.xla_data_pb2.F64,
+        np.int64: xla_client.PrimitiveType.S64,
+        np.float64: xla_client.PrimitiveType.F64,
     }
 
     def _ConvertAndTest(template, src_dtype, dst_dtype, dst_etype):
@@ -553,6 +582,18 @@ class SingleOpTest(LocalComputationTest):
       for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
         _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype])
 
+  # TODO(b/123523486) implement AllToAll on CPU
+  def DISABLED_testAllToAllOneReplica(self):
+    samples = [
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples[:1]:
+      c = self._NewComputation()
+      c.AllToAll(c.Constant(lhs), 0, 0)
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
   def testCrossReplicaSumOneReplica(self):
     samples = [
         NumpyArrayF32(42.0),
@@ -565,6 +606,23 @@ class SingleOpTest(LocalComputationTest):
       c.CrossReplicaSum(c.Constant(lhs))
       self._ExecuteAndCompareExact(c, expected=lhs)
 
+  def testReplicaId(self):
+    c = self._NewComputation()
+    _ = c.ReplicaId()
+    self._ExecuteAndCompareExact(c, expected=0)
+
+  def testCrossReplicaSumOneReplicaWithSingletonGroup(self):
+    samples = [
+        NumpyArrayF32(42.0),
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples:
+      c = self._NewComputation()
+      c.CrossReplicaSum(c.Constant(lhs), [[0]])
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
   def testDotMatrixVectorF32(self):
     c = self._NewComputation()
     lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
@@ -608,7 +666,7 @@ class SingleOpTest(LocalComputationTest):
     lhs = NumpyArrayF32(rng.randn(10, 3, 4))
     rhs = NumpyArrayF32(rng.randn(10, 4, 5))
 
-    dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers()
+    dimension_numbers = xla_client.DotDimensionNumbers()
     dimension_numbers.lhs_contracting_dimensions.append(2)
     dimension_numbers.rhs_contracting_dimensions.append(1)
     dimension_numbers.lhs_batch_dimensions.append(0)
@@ -727,6 +785,12 @@ class SingleOpTest(LocalComputationTest):
     c.Not(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=~arr)
 
+  def testCountLeadingZeros(self):
+    c = self._NewComputation()
+    arr = NumpyArrayS32([0x7FFF, 0x12345678])
+    c.Clz(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=[17, 3])
+
   def testExp(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -914,12 +978,13 @@ class SingleOpTest(LocalComputationTest):
 
   def testPadWithPaddingConfig(self):
     c = self._NewComputation()
-    padding_config = xla_client.xla_data_pb2.PaddingConfig()
+    padding_config = xla_client.PaddingConfig()
     for lo, hi, interior in [(1, 2, 1), (0, 1, 0)]:
-      dimension = padding_config.dimensions.add()
+      dimension = xla_client.PaddingConfigDimension()
       dimension.edge_padding_low = lo
       dimension.edge_padding_high = hi
       dimension.interior_padding = interior
+      padding_config.dimensions.append(dimension)
     c.Pad(
         c.Constant(NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])),
         c.Constant(NumpyArrayF32(0.0)),
@@ -962,14 +1027,13 @@ class SingleOpTest(LocalComputationTest):
         c.Constant(NumpyArrayF32(2)))
     self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2])
 
-  # TODO(b/72689392): re-enable when bug S32 resolved
-  def DISABLED_testClampS32(self):
+  def testClampS32(self):
     c = self._NewComputation()
     c.Clamp(
         c.Constant(NumpyArrayS32(-1)),
         c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])),
         c.Constant(NumpyArrayS32(2)))
-    self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2])
+    self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2])
 
   def testSelect(self):
     c = self._NewComputation()
@@ -1102,6 +1166,26 @@ class SingleOpTest(LocalComputationTest):
     q, r = self._Execute(c, ())
     np.testing.assert_allclose(np.dot(q, r), a, rtol=1e-4)
 
+  def testEigh(self):
+    a = np.array(
+        [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]],
+        dtype=np.float32)
+    a = (a + a.T) / 2
+
+    c = self._NewComputation()
+    c.Eigh(c.Constant(a), full_matrices=True)
+    v, w = self._Execute(c, ())
+    self.assertLess(np.linalg.norm(np.dot(a, v) - w * v), 1e-3)
+
+  def testSVD(self):
+    a = np.array(
+        [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]],
+        dtype=np.float32)
+    c = self._NewComputation()
+    c.SVD(c.Constant(a))
+    u, d, v = self._Execute(c, ())
+    self.assertLess(np.linalg.norm(a - np.matmul(u * d, v.T)), 1e-3)
+
   def testTriangularSolve(self):
     a_vals = np.array(
         [[2, 0, 0, 0], [3, 6, 0, 0], [4, 7, 9, 0], [5, 8, 10, 11]],
@@ -1132,7 +1216,7 @@ class SingleOpTest(LocalComputationTest):
   def testGather(self):
     a = np.arange(9).astype(np.int32).reshape((3, 3))
     indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32)
-    dnums = xla_client.xla_data_pb2.GatherDimensionNumbers()
+    dnums = xla_client.GatherDimensionNumbers()
     dnums.offset_dims.append(1)
     dnums.offset_dims.append(2)
     dnums.start_index_map.append(0)
@@ -1145,7 +1229,7 @@ class SingleOpTest(LocalComputationTest):
     np.testing.assert_allclose(g, expected, rtol=1e-4)
 
 
-class EmbeddedComputationsTest(LocalComputationTest):
+class EmbeddedComputationsTest(ComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
 
   def _CreateConstantS32Computation(self):
@@ -1596,7 +1680,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
     scatter_indices = np.array([0, 2], dtype=np.int32)
     updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32)
 
-    dnums = xla_client.xla_data_pb2.ScatterDimensionNumbers()
+    dnums = xla_client.ScatterDimensionNumbers()
     dnums.update_window_dims.append(1)
     dnums.inserted_window_dims.append(0)
     dnums.scatter_dims_to_operand_dims.append(0)
@@ -1609,7 +1693,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
     self._ExecuteAndCompareClose(c, expected=expected)
 
 
-class ErrorTest(LocalComputationTest):
+class ErrorTest(ComputationTest):
 
   def setUp(self):
     self.f32_scalar_2 = NumpyArrayF32(2.0)
@@ -1626,7 +1710,7 @@ class ErrorTest(LocalComputationTest):
         lambda: c.Build().CompileWithExampleArguments([self.f32_scalar_2]))
 
 
-class ComputationRootTest(LocalComputationTest):
+class ComputationRootTest(ComputationTest):
   """Tests related to setting the root of the computation."""
 
   def testComputationRootDifferentFromLastOp(self):
diff --git a/tensorflow/compiler/xla/python/xla_data.i b/tensorflow/compiler/xla/python/xla_data.i
new file mode 100644
index 0000000000000000000000000000000000000000..b18583c64d400bdb7b3bc50b3548df23f4a8c469
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_data.i
@@ -0,0 +1,654 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SWIG typemaps for building, compiling, and executing XLA computations.
+//
+// The typemaps below implement/assert the following correspondences
+// (with elaborations below):
+//
+//    C++                                  Python
+// -------------------------------------+---------------------------------------
+//  Span<int64>                        <-  sequence of int
+//  vector<int>                        ->  sequence of int
+//  Span<LocalOp>                      <-  sequence of LocalOp
+//  Literal                            <-> (nested tuple of) numpy ndarray
+//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  ProgramShape                       ->  pair of ([arg_shapes], ret_shape)
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
+//  PrimitiveType                      <-  int
+//  Span<pair<int64, in64>>            <-  sequence of int pairs
+//  PaddingConfig proto                <-  ducktyped Python proto
+//  ConvolutionDimensionNumbers proto  <-  ducktyped Python proto
+//  DotDimensionNumbers proto          <-  ducktyped Python proto
+//  GatherDimensionNumbers proto       <-  ducktyped Python proto
+//  ScatterDimensionNumbers proto      <-  ducktyped Python proto
+//  Span<ReplicaGroup proto>           <-  sequence of ReplicaGroup Python proto
+//
+// Arrows indicate whether a conversion only ever occurs in one
+// direction, or whether it is maintained bidirectionally.
+//
+// The Python objects corresponding to C++ Literals have the type:
+//
+//   T = ndarray | (T, ...)
+//
+// where a terminal numpy ndarray translates to a Literal with a
+// non-tuple Shape, an XLA primitive element type corresponding to the
+// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
+// to a tuple-shaped Literal whose tuple components are translated
+// recursively. For example, if x is a numpy ndarray in Python, with
+// shape (2, 3) and dtype of dtype('float32'), then x translates to a
+// Literal with rank 2, dimension 2 and 3, and XLA primitive type
+// F32. Meanwhile,
+//
+//   (x, (x, x), (x,)),
+//
+// translates to a tuple-shaped XLA Literal, whose component subshapes
+// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
+//
+// Shapes output by C++ become Python objects with the type:
+//
+//   T            = (dtype, S)
+//   S            = DIMENSIONS | TUPLE_SHAPES
+//   DIMENSIONS   = (int, ...)
+//   TUPLE_SHAPES = (T, ...)
+//
+// In the pair described by the T rule, the terminal dtype determines
+// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
+// dtype('O'), numpy's object dtype, the structure represents a tuple
+// shape and the expansion of the non-terminal S is
+// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
+// and S expands into DIMENSIONS giving dimension sizes. For example:
+//
+//   (dtype('float32'), (3, 5, 7))
+//
+// describes a 3x5x7 array of F32s, and
+//
+//   (dtype('O'), ((dtype('float32'), (2, 3)),
+//                 (dtype('float64'), (4, 5))))
+//
+// describes a tuple shape with two subshapes: the first a 2x3 F32,
+// and the other a 4x5 F64.
+//
+// The Python int corresponding to a PrimitiveType enum must be valid
+// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
+//
+// The SWIG object wrappers generated by this file are not intended
+// for end use, but rather for internal use in the Python XLA client,
+// xla_client.py.
+//
+// One central reason for the Python-side indirection is that the
+// Python-side objects produced by the typemaps in this file are
+// further packaged up by xla_client before being passed on. For
+// instance, the Python pair produced for a C++ Shape is further
+// wrapped in a Python class (xla_client.Shape) so as not to expose
+// the raw pair externally.
+//
+// Other SWIG object wrappers (e.g. of Computation) are further
+// wrapped by xla_client in order to set up a custom destructor that
+// triggers memory deallocation on the C++ side.
+//
+
+
+%module(threads="1") xla_data
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+// Must be included first
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Basic types
+
+
+%typemap(out) std::vector<int> {
+  PyObject* out = PyList_New($1.size());
+  for (int i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM(out, i, PyInt_FromLong($1[i]));
+  }
+  $result = out;
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<string> {
+  if ($1.ok()) {
+    $result = PyString_FromString($1.ConsumeValueOrDie().c_str());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) Status {
+  if (!$1.ok()) {
+    PyErr_SetString(
+        PyExc_RuntimeError, $1.ToString().c_str());
+    SWIG_fail;
+  }
+  Py_INCREF(Py_None);
+  $result = Py_None;
+}
+
+%typemap(in) absl::Span<const int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Literal
+
+%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
+  literal_status = numpy::XlaLiteralFromPyObject($input);
+  if (!literal_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $1 = &literal_status.ValueOrDie();
+}
+
+%typemap(out) Literal (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  obj_status = numpy::PyObjectFromXlaLiteral(*$1);
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(out) StatusOr<Literal> (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  if (!$1.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+  obj_status = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
+    if (!literal_status.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps.push_back(literal_status.ConsumeValueOrDie());
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// OpMetadata
+
+%typemap(in) const OpMetadata& (OpMetadata temp) {
+  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+// Shape
+
+%typemap(out) const Shape& {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(out) StatusOr<Shape> {
+  if ($1.ok()) {
+    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(out) StatusOr<ProgramShape> {
+  if ($1.ok()) {
+    $result = numpy::PyProgramShapeInfoFromXlaProgramShape(
+        $1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) const Shape& (Shape temp) {
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+%typemap(in) const absl::optional<Shape>& (
+    absl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = absl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
+%typemap(out) std::unique_ptr<Shape> {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<absl::optional<Shape> >& (
+    std::vector<absl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(absl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        SWIG_fail;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
+  }
+  $1 = &temps;
+}
+
+// PrimitiveType
+
+%typemap(in) PrimitiveType {
+  PyObject* py_int = numpy::PyNumberToPyInt($input);
+  if (!py_int) {
+    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
+    SWIG_fail;
+  }
+  const long value = numpy::PyIntOrPyLongToLong(py_int);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  if (!PrimitiveType_IsValid(value)) {
+    PyErr_SetString(
+        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  $1 = static_cast<PrimitiveType>(value);
+}
+
+// Span<pair<int64, in64>>
+
+%typemap(in) absl::Span<const std::pair<int64, int64> >
+    (std::vector<std::pair<int64, int64> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (!o) {
+      SWIG_fail;
+    }
+    PyObject* first = PyTuple_GetItem(o, 0);
+    if (!first) {
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
+    if (!first_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "First pair item cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* second = PyTuple_GetItem(o, 1);
+    if (!second) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
+    if (!second_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Second pair item cannot be converted to int");
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
+    if (first_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
+    if (second_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    temps.push_back(std::make_pair(first_value, second_value));
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_contracting_dimensions",
+        dimension_numbers.mutable_lhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_contracting_dimensions",
+        dimension_numbers.mutable_rhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_batch_dimensions",
+        dimension_numbers.mutable_lhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_batch_dimensions",
+        dimension_numbers.mutable_rhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// PaddingConfig
+
+%typemap(in) const PaddingConfig&
+    (PaddingConfig padding_config) {
+  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
+  if (!dimensions) {
+    SWIG_fail;
+  }
+
+  int length = PySequence_Size(dimensions);
+  if (length == -1) {
+    Py_DECREF(dimensions);
+    SWIG_fail;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(dimensions, i);
+    if (!item) {
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    int64 edge_padding_low, edge_padding_high, interior_padding;
+    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
+        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
+        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
+      Py_DECREF(item);
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    Py_DECREF(item);
+
+    PaddingConfig::PaddingConfigDimension* dimension =
+        padding_config.add_dimensions();
+    dimension->set_edge_padding_low(edge_padding_low);
+    dimension->set_edge_padding_high(edge_padding_high);
+    dimension->set_interior_padding(interior_padding);
+  }
+  Py_DECREF(dimensions);
+
+  $1 = &padding_config;
+}
+
+// ConvolutionDimensionNumbers
+
+%typemap(in) const ConvolutionDimensionNumbers&
+    (ConvolutionDimensionNumbers dimension_numbers) {
+  int64 value;
+
+  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_batch_dimension(value);
+
+  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_feature_dimension(value);
+
+  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_batch_dimension(value);
+
+  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(value);
+
+  if (!HandleRepeatedInt64Attribute(
+        $input, "input_spatial_dimensions",
+        dimension_numbers.mutable_input_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "kernel_spatial_dimensions",
+        dimension_numbers.mutable_kernel_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "output_spatial_dimensions",
+        dimension_numbers.mutable_output_spatial_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// GatherDimensionNumbers
+
+%typemap(in) const GatherDimensionNumbers&
+    (GatherDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "offset_dims",
+        dimension_numbers.mutable_offset_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "collapsed_slice_dims",
+        dimension_numbers.mutable_collapsed_slice_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "start_index_map",
+        dimension_numbers.mutable_start_index_map())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// ScatterDimensionNumbers
+
+%typemap(in) const ScatterDimensionNumbers&
+    (ScatterDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "update_window_dims",
+        dimension_numbers.mutable_update_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "inserted_window_dims",
+        dimension_numbers.mutable_inserted_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "scatter_dims_to_operand_dims",
+        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// Span<const ReplicaGroup>
+
+%typemap(in) absl::Span<const ReplicaGroup >
+    (std::vector<ReplicaGroup > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    ReplicaGroup rgrp;
+    if (!HandleRepeatedInt64Attribute(
+            o, "replica_ids",
+            rgrp.mutable_replica_ids())) {
+        SWIG_fail;
+    }
+    temps.push_back(rgrp);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c55abc17f87c369e3d5b2140a84014e07921a9a
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.cc
@@ -0,0 +1,297 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace swig {
+
+XrtAllocation::XrtAllocation(int64 handle, Shape shape,
+                             const string& session_target)
+    : handle_(handle), shape_(shape), session_target_(session_target) {}
+
+XrtAllocation::~XrtAllocation() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+/* static */
+StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
+    const Literal& argument, const string& session_target) {
+  xrt::XLAAllocation alloc;
+  *alloc.mutable_value() = argument.ToProto();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto literal_string =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({literal_string, alloc.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtAllocation(handle, argument.shape(), session_target);
+}
+
+const int64 XrtAllocation::handle() const { return handle_; }
+
+const Shape& XrtAllocation::shape() const { return shape_; }
+
+StatusOr<Literal> XrtAllocation::ToLiteral() const {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+
+  xla::LiteralProto response;
+  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
+  return Literal::CreateFromProto(response);
+}
+
+XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
+    : elements_(std::move(elements)) {
+  for (auto* element : elements_) {
+    CHECK(element != nullptr);
+  }
+}
+
+XrtAllocationTuple::~XrtAllocationTuple() {
+  for (XrtAllocation* element : elements_) {
+    if (element != nullptr) {
+      delete element;
+    }
+  }
+}
+
+StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
+  XrtAllocation* element = elements_[i];
+  if (element == nullptr) {
+    return InvalidArgument("Attempted to release already-released element %d.",
+                           i);
+  }
+  elements_[i] = nullptr;
+  return element;
+}
+
+int64 XrtAllocationTuple::size() const { return elements_.size(); }
+
+StatusOr<XrtExecutable*> XrtExecutable::CompileForXrt(
+    const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+    const Shape& result_shape, const string& session_target) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto compile = tensorflow::ops::XRTCompile(root, program);
+  TF_RETURN_IF_ERROR(root.status());
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  ProgramShape program_shape;
+  for (auto& shape : argument_shapes) {
+    *program_shape.add_parameters() = shape;
+  }
+  *program_shape.mutable_result() = result_shape;
+
+  LayoutUtil::SetToDefaultLayout(&program_shape);
+  *config->mutable_program_shape() = program_shape.ToProto();
+  c.mutable_hlo_snapshot()
+      ->mutable_hlo()
+      ->mutable_hlo_module()
+      ->ParsePartialFromString(hlo_module_proto);
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({program, c.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtExecutable(program_shape, handle, session_target);
+}
+
+XrtExecutable::XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                             const string& session_target)
+    : program_shape_(program_shape),
+      handle_(handle),
+      session_target_(session_target) {}
+
+XrtExecutable::~XrtExecutable() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({computation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+StatusOr<XrtAllocation*> XrtExecutable::Execute(
+    absl::Span<XrtAllocation* const> argument_handles) {
+  const int num_expected_arguments = program_shape().parameters().size();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  std::vector<tensorflow::Output> arguments;
+  arguments.reserve(num_expected_arguments);
+  for (int i = 0; i < num_expected_arguments; ++i) {
+    arguments.push_back(
+        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
+  }
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto execution_config =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
+                                             execution_config, arguments);
+  TF_RETURN_IF_ERROR(root.status());
+
+  TF_RET_CHECK(argument_handles.size() == arguments.size());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(false);
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  for (int i = 0; i < arguments.size(); ++i) {
+    inputs.insert({arguments[i], argument_handles[i]->handle()});
+  }
+  inputs.insert({computation_handle, handle()});
+  inputs.insert({execution_config, e.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
+
+  int64 output = outputs[0].scalar<int64>()();
+  return new XrtAllocation(output, program_shape().result(), session_target_);
+}
+
+const ProgramShape& XrtExecutable::program_shape() const {
+  return program_shape_;
+}
+
+int64 XrtExecutable::handle() const { return handle_; }
+
+void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
+
+void DeleteXrtExecutable(XrtExecutable* computation) { delete computation; }
+
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target) {
+  const Shape& tuple_shape = allocation->shape();
+
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
+  }
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
+  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  std::vector<XrtAllocation*> results;
+  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    inputs.clear();
+    inputs.insert({base_handle, allocation->handle()});
+    inputs.insert({shape_index, {i}});
+    std::vector<tensorflow::Tensor> outputs;
+    auto status = session.Run(inputs, {subtuple}, &outputs);
+    if (!status.ok()) {
+      // Clean up before returning non-ok status.
+      for (int j = 0; j < results.size(); ++j) {
+        delete results[j];
+      }
+      return status;
+    }
+    const int64 subtuple_handle = outputs[0].scalar<int64>()();
+    const Shape& subtuple_shape =
+        ShapeUtil::GetTupleElementShape(tuple_shape, i);
+    results.push_back(
+        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
+  }
+  return new XrtAllocationTuple(std::move(results));
+}
+
+}  // namespace swig
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/xrt.h b/tensorflow/compiler/xla/python/xrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..710c3af3fa6b407127643797dbabad201cf076d4
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.h
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape.h"
+
+namespace xla {
+namespace swig {
+
+// Represents a reference to literals that live in a device-allocated buffer via
+// XRT. Specifically, wraps an int64 handle produced by running the allocation
+// graph, and an XLA shape to track the referent's shape.
+class XrtAllocation {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which allocation and deallocation
+  // graphs are run.
+  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
+                                              const string& session_target);
+
+  XrtAllocation(int64 handle, Shape shape, const string& session_target);
+  ~XrtAllocation();
+  StatusOr<Literal> ToLiteral() const;
+  const Shape& shape() const;
+  const int64 handle() const;
+
+ private:
+  const int64 handle_;
+  const Shape shape_;
+  const string session_target_;
+};
+
+// Result of a tuple destructuring operation on an XrtAllocation.
+class XrtAllocationTuple {
+ public:
+  // Note: any XrtAllocation elements that are not Release()'d will be
+  // deallocated in the destructor.
+  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
+
+  ~XrtAllocationTuple();
+
+  // Releases the ith element to the caller. Further attempts to release the ith
+  // element will return an invalid argument error.
+  StatusOr<XrtAllocation*> Release(int i);
+
+  // Returns the number of elements in the destructured tuple.
+  int64 size() const;
+
+ private:
+  std::vector<XrtAllocation*> elements_;
+};
+
+// Destructures a tuple-valued XrtAllocation into its constituent elements
+// in XrtAllocationTuple form.
+//
+// Accepts a `session_target` argument, used in constructing the
+// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
+// and passed along in constructing each constituent XrtAllocation.
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target);
+
+// Represents a compiled computation that can be executed given handles to
+// device-allocated literals. Specifically, wraps an XRT computation handle.
+class XrtExecutable {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the compilation graph is run.
+  static StatusOr<XrtExecutable*> CompileForXrt(
+      const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+      const Shape& result_shape, const string& session_target);
+
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the execution graph is run.
+  XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                const string& session_target);
+  ~XrtExecutable();
+
+  std::vector<int> DeviceOrdinals() const { return {0}; }
+
+  StatusOr<XrtAllocation*> Execute(
+      absl::Span<XrtAllocation* const> argument_handles);
+
+  const ProgramShape& program_shape() const;
+  int64 handle() const;
+
+ private:
+  const ProgramShape program_shape_;
+  const int64 handle_;
+  const string session_target_;
+};
+
+// Functions for freeing resources from the Python side.
+void DeleteXrtAllocation(XrtAllocation* allocation);
+void DeleteXrtExecutable(XrtExecutable* computation);
+
+}  // namespace swig
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
diff --git a/tensorflow/compiler/xla/python/xrt.i b/tensorflow/compiler/xla/python/xrt.i
new file mode 100644
index 0000000000000000000000000000000000000000..456dd7be86e479b46815fc16b51a10431fe2060d
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.i
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Wrappers for XRT ops.
+
+%module(threads="1") xrt
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
+
+%{
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Computation and buffer/allocation types
+
+%typemap(out) StatusOr<xla::swig::XrtExecutable*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtExecutable*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocation*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocationTuple*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
+    (std::vector<XrtAllocation*> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    XrtAllocation* xrta;
+    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
+      SWIG_fail;
+    }
+    temps.push_back(xrta);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+
+%ignoreall
+%unignore xla;
+%unignore xla::swig;
+%unignore xla::swig::XrtAllocation;
+%unignore xla::swig::XrtAllocation::FromLiteral;
+%unignore xla::swig::XrtAllocation::ToLiteral;
+%unignore xla::swig::XrtAllocation::shape;
+%unignore xla::swig::XrtAllocationTuple;
+%unignore xla::swig::XrtAllocationTuple::Release;
+%unignore xla::swig::XrtAllocationTuple::size;
+%unignore xla::swig::XrtExecutable;
+%unignore xla::swig::XrtExecutable::CompileForXrt;
+%unignore xla::swig::XrtExecutable::DeviceOrdinals;
+%unignore xla::swig::XrtExecutable::Execute;
+%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::DeleteXrtAllocation;
+%unignore xla::swig::DeleteXrtExecutable;
+
+%thread;
+%include "tensorflow/compiler/xla/python/xrt.h"
+%nothread;
+
+%unignoreall
diff --git a/tensorflow/compiler/xla/python_api/xla_literal.py b/tensorflow/compiler/xla/python_api/xla_literal.py
index 757e41a78ad2b57d2ef6e1f3055160be22c7b3ed..19bd685ab2260485d2a86f0a682d0cdd36712fdb 100644
--- a/tensorflow/compiler/xla/python_api/xla_literal.py
+++ b/tensorflow/compiler/xla/python_api/xla_literal.py
@@ -69,7 +69,7 @@ def _ConvertNumpyArrayToLiteral(ndarray):
 
   if ndarray.ndim == 0:
     getattr(literal, type_record.literal_field_name).append(
-        _np.asscalar(ndarray.astype(type_record.literal_field_type)))
+        ndarray.astype(type_record.literal_field_type).item())
   else:
     # Ndarrays with boolean dtypes need special type conversion with protobufs
     if ndarray.dtype in {_np.bool_, _np.dtype('bool')}:
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index a1b0f4045ff071454451f9fe3942ac974f4f47ac..4d4500d840500f6a521a20fe8c94919af65ce31b 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -94,7 +94,7 @@ TEST_F(ReferenceUtilTest, Reduce4Dto1DZeroSizedArray) {
 }
 
 TEST_F(ReferenceUtilTest, MapArray2D) {
-  auto identity = [](float value) { return log(exp(value)); };
+  auto identity = [](float value) { return log(std::exp(value)); };
   auto result = ReferenceUtil::MapArray2D(*matrix_, identity);
   auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2NearArray2D(*matrix_, actual_literal,
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index e39e17c11068047a9b0bb97d296504281bdf5c23..64d2da499db04b4eed32da6335f142e5cff3495a 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -114,6 +114,7 @@ tf_cc_test(
         ":bfloat16_normalization",
         ":bfloat16_support",
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_verifier",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -166,6 +167,23 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "dump",
+    srcs = ["dump.cc"],
+    hdrs = ["dump.h"],
+    deps = [
+        ":hlo",
+        ":hlo_graph_dumper",
+        ":hlo_proto_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "shape_inference",
     srcs = ["shape_inference.cc"],
@@ -272,6 +290,7 @@ tf_cc_test(
     srcs = ["hlo_evaluator_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_element_type_converter",
         ":hlo_evaluator",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:reference_util",
@@ -284,7 +303,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -333,6 +351,7 @@ cc_library(
         ":hlo_proto",
         ":name_uniquer",
         "//tensorflow/compiler/xla:array",
+        "//tensorflow/compiler/xla:comparison_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -679,7 +698,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
@@ -701,7 +719,9 @@ cc_library(
         ":compilation_cache",
         ":compiler",
         ":computation_layout",
+        ":computation_placer",
         ":device_memory_allocator",
+        ":dump",
         ":dynamic_dimension_inference",
         ":executable",
         ":execution_tracker",
@@ -781,6 +801,7 @@ cc_library(
         ":backend",
         ":compiler",
         ":computation_layout",
+        ":dump",
         ":platform_util",
         ":service",
         "//tensorflow/compiler/xla:debug_options_flags",
@@ -810,8 +831,8 @@ cc_library(
     name = "gpu_plugin",
     deps = [
         ":service",
-        "//tensorflow/compiler/xla/service/gpu:gpu_compiler",
         "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager",
+        "//tensorflow/compiler/xla/service/gpu:nvptx_compiler",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",
     ],
@@ -880,6 +901,7 @@ cc_library(
     deps = [
         ":computation_layout",
         ":device_memory_allocator",
+        ":dump",
         ":hlo",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
@@ -912,6 +934,7 @@ cc_library(
     hdrs = ["compiler.h"],
     deps = [
         ":buffer_value",
+        ":computation_placer",
         ":executable",
         ":hlo",
         ":hlo_module_config",
@@ -1203,7 +1226,6 @@ cc_library(
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -1351,6 +1373,7 @@ cc_library(
     deps = [
         ":heap_simulator",
         ":hlo",
+        ":hlo_casting_utils",
         ":hlo_ordering",
         ":hlo_pass",
         ":logical_buffer",
@@ -1461,11 +1484,15 @@ cc_library(
     hdrs = ["hlo_creation_utils.h"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":shape_inference",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:comparators",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1510,6 +1537,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "op_expander_pass",
+    srcs = ["op_expander_pass.cc"],
+    hdrs = ["op_expander_pass.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 cc_library(
     name = "gather_expander",
     srcs = ["gather_expander.cc"],
@@ -1518,6 +1559,7 @@ cc_library(
         ":hlo",
         ":hlo_creation_utils",
         ":hlo_pass",
+        ":op_expander_pass",
         ":while_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
@@ -1541,6 +1583,51 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "triangular_solve_expander",
+    srcs = ["triangular_solve_expander.cc"],
+    hdrs = ["triangular_solve_expander.h"],
+    deps = [
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
+cc_library(
+    name = "cholesky_expander",
+    srcs = ["cholesky_expander.cc"],
+    hdrs = ["cholesky_expander.h"],
+    deps = [
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:loops",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 tf_cc_test(
     name = "batchnorm_expander_test",
     size = "small",
@@ -1602,7 +1689,7 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":hlo",
         ":hlo_casting_utils",
-        ":hlo_matchers",
+        ":hlo_creation_utils",
         ":hlo_parser",
         ":hlo_pass",
         ":pattern_matcher",
@@ -1748,6 +1835,8 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_evaluator",
+        ":pattern_matcher",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/types:optional",
     ],
 )
@@ -1808,55 +1897,57 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "defuser",
-    srcs = ["defuser.cc"],
-    hdrs = ["defuser.h"],
+    name = "while_loop_trip_count_annotator",
+    srcs = ["while_loop_trip_count_annotator.cc"],
+    hdrs = ["while_loop_trip_count_annotator.h"],
     deps = [
-        ":call_graph",
         ":hlo",
         ":hlo_pass",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
+        ":while_loop_analysis",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
     ],
 )
 
 tf_cc_test(
-    name = "defuser_test",
-    srcs = ["defuser_test.cc"],
+    name = "while_loop_trip_count_annotator_test",
+    srcs = ["while_loop_trip_count_annotator_test.cc"],
     deps = [
-        ":defuser",
-        ":hlo_matchers",
+        ":pattern_matcher",
+        ":while_loop_simplifier",
+        ":while_loop_trip_count_annotator",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:test",
     ],
 )
 
 cc_library(
-    name = "implicit_broadcast_remover",
-    srcs = ["implicit_broadcast_remover.cc"],
-    hdrs = ["implicit_broadcast_remover.h"],
+    name = "defuser",
+    srcs = ["defuser.cc"],
+    hdrs = ["defuser.h"],
     deps = [
+        ":call_graph",
         ":hlo",
-        ":hlo_dce",
         ":hlo_pass",
-        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
 tf_cc_test(
-    name = "implicit_broadcast_remover_test",
-    srcs = ["implicit_broadcast_remover_test.cc"],
+    name = "defuser_test",
+    srcs = ["defuser_test.cc"],
     deps = [
+        ":defuser",
         ":hlo_matchers",
-        ":implicit_broadcast_remover",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1930,9 +2021,11 @@ cc_library(
     hdrs = ["dynamic_dimension_inference.h"],
     deps = [
         ":hlo",
+        ":while_util",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
@@ -1964,16 +2057,18 @@ tf_cc_test(
     srcs = ["dynamic_padder_test.cc"],
     deps = [
         ":dynamic_padder",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":hlo_runner",
         "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -1984,6 +2079,9 @@ tf_cc_test(
     srcs = ["dynamic_dimension_inference_test.cc"],
     deps = [
         ":dynamic_dimension_inference",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_runner",
         "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1991,9 +2089,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -2105,6 +2200,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -2161,30 +2257,15 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
-tf_cc_binary(
-    name = "graphviz_example",
-    srcs = ["graphviz_example.cc"],
-    deps = [
-        ":hlo",
-        ":hlo_graph_dumper",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
 tf_cc_test(
     name = "hlo_module_test",
     srcs = ["hlo_module_test.cc"],
@@ -2282,6 +2363,7 @@ cc_library(
     deps = [
         ":call_graph",
         ":hlo",
+        ":hlo_casting_utils",
         ":hlo_value",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status",
@@ -2303,6 +2385,7 @@ tf_cc_test(
     srcs = ["hlo_dataflow_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_dataflow_analysis",
         ":hlo_graph_dumper",
         ":hlo_matchers",
@@ -2473,6 +2556,7 @@ tf_cc_test(
     srcs = ["tuple_points_to_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_matchers",
         ":instruction_fusion",
         ":tuple_points_to_analysis",
@@ -2545,6 +2629,7 @@ cc_library(
     hdrs = ["copy_insertion.h"],
     deps = [
         ":buffer_liveness",
+        ":dump",
         ":hlo",
         ":hlo_alias_analysis",
         ":hlo_dce",
@@ -2800,6 +2885,7 @@ cc_library(
         "hlo_pass_pipeline.h",
     ],
     deps = [
+        ":dump",
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_pass",
@@ -2809,6 +2895,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -2848,7 +2935,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -3026,8 +3112,6 @@ cc_library(
         ":hlo_pass",
         ":shape_inference",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
     ],
 )
@@ -3143,6 +3227,7 @@ cc_library(
     hdrs = ["hlo_module_config.h"],
     deps = [
         ":computation_layout",
+        ":computation_placer",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -3189,48 +3274,19 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "hlo_tfgraph_builder",
-    srcs = ["hlo_tfgraph_builder.cc"],
-    hdrs = ["hlo_tfgraph_builder.h"],
-    deps = [
-        ":hlo",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_tfgraph_builder_test",
-    srcs = ["hlo_tfgraph_builder_test.cc"],
-    deps = [
-        ":hlo_tfgraph_builder",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "hlo_graph_dumper",
-    srcs = [
-        "hlo_graph_dumper.cc",
-        "hlo_graph_html_renderer.cc",
-    ],
+    srcs = ["hlo_graph_dumper.cc"],
     hdrs = ["hlo_graph_dumper.h"],
     deps = [
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_execution_profile",
-        ":hlo_tfgraph_builder",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
@@ -3253,6 +3309,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
@@ -3270,7 +3327,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -3422,10 +3478,13 @@ cc_library(
     srcs = ["hlo_runner.cc"],
     hdrs = ["hlo_runner.h"],
     deps = [
+        ":backend",
+        ":compiler",
         ":computation_placer",
         ":executable",
         ":hlo",
         ":hlo_module_group",
+        ":hlo_parser",
         ":transfer_manager",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -3433,11 +3492,9 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:backend",
-        "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
         "@com_google_absl//absl/memory",
@@ -3486,6 +3543,37 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "stable_sort_expander",
+    srcs = ["stable_sort_expander.cc"],
+    hdrs = ["stable_sort_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_casting_utils",
+        ":hlo_pass",
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "stable_sort_expander_test",
+    srcs = ["stable_sort_expander_test.cc"],
+    deps = [
+        ":algebraic_simplifier",
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":stable_sort_expander",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "tuple_util",
     srcs = ["tuple_util.cc"],
@@ -3582,7 +3670,6 @@ cc_library(
         ":while_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:inlined_vector",
     ],
@@ -3612,7 +3699,6 @@ cc_library(
         ":hlo_memory_scheduler",
         ":hlo_pass",
         ":hlo_pass_pipeline",
-        ":implicit_broadcast_remover",
         "//tensorflow/compiler/xla:statusor",
     ],
 )
@@ -3638,7 +3724,6 @@ cc_library(
         ":hlo_evaluator",
         ":hlo_pass",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -3655,12 +3740,13 @@ tf_cc_test(
     extra_copts = ["-Wno-string-plus-int"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3677,11 +3763,14 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+        "@com_google_absl//absl/types:variant",
     ],
 )
 
@@ -3745,6 +3834,47 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "optimize_input_output_buffer_alias",
+    srcs = ["optimize_input_output_buffer_alias.cc"],
+    hdrs = ["optimize_input_output_buffer_alias.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
+tf_cc_test(
+    name = "optimize_input_output_buffer_alias_test",
+    srcs = ["optimize_input_output_buffer_alias_test.cc"],
+    deps = [
+        ":optimize_input_output_buffer_alias",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
 cc_library(
     name = "ar_crs_combiner",
     srcs = ["ar_crs_combiner.cc"],
@@ -3758,7 +3888,6 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 8a02c48146ea264fdcba85325c84e27b70090170..b223fc8b1b50851f786a0a725ecc584b97b9838f 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -52,6 +53,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -107,7 +109,7 @@ bool IsAllFpConstantPowerOf2(const HloInstruction* op) {
 
   int exp;
   double mantissa = std::frexp(*val, &exp);
-  // frexp returns a value in the range (-1; -0.5] U [0.5, 1).  A return value
+  // frexp returns a value in the range (-1, -0.5] U [0.5, 1).  A return value
   // of +/-0.5 therefore indicates that the floating point value is a power of
   // 2.
   return mantissa == 0.5 || mantissa == -0.5;
@@ -219,6 +221,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandlePower(HloInstruction* power) override;
 
+  Status HandleRemainder(HloInstruction* remainder) override;
+
   Status HandleReshape(HloInstruction* reshape) override;
 
   Status HandleReduce(HloInstruction* reduce) override;
@@ -276,15 +280,51 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
         hlo));
   }
 
-  // Helper method to perform and add reduction in a single dimension.
-  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+  // Converts to primitive type if the input hlo is not that type, otherwise
+  // returns the original hlo.
+  HloInstruction* AsType(HloInstruction* hlo,
+                         const PrimitiveType element_type) {
+    if (hlo->shape().element_type() == element_type) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateConvert(
+        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
+  }
+
+  // Transposes a dot operand such that the batch dimensions are the msot major,
+  // and the contracting dimensions are most minor.
+  StatusOr<HloInstruction*> NormalizeDotOperandToBatchMajorAndContractingMinor(
+      HloInstruction* dot_operand, absl::Span<const int64> batch_dimensions,
+      absl::Span<const int64> contracting_dimensions) {
+    std::vector<int64> transpose_dimensions(batch_dimensions.begin(),
+                                            batch_dimensions.end());
+    for (int64 i = 0; i < dot_operand->shape().rank(); ++i) {
+      if (!(absl::c_linear_search(batch_dimensions, i) ||
+            absl::c_linear_search(contracting_dimensions, i))) {
+        transpose_dimensions.push_back(i);
+      }
+    }
+    transpose_dimensions.insert(transpose_dimensions.end(),
+                                contracting_dimensions.begin(),
+                                contracting_dimensions.end());
+    return MakeTransposeHlo(dot_operand, transpose_dimensions);
+  }
+
+  // Helper method to perform and add reduction on a list of dimensions.
+  HloInstruction* AddReduce(HloInstruction* hlo, absl::Span<const int64> dims) {
     HloInstruction* zero =
         computation_->AddInstruction(HloInstruction::CreateConstant(
             LiteralUtil::Zero(hlo->shape().element_type()).Clone()));
     HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation();
-    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    Shape shape = ShapeUtil::FilterDimensions(
+        [&](int64 dim) { return !absl::c_linear_search(dims, dim); },
+        hlo->shape());
     return computation_->AddInstruction(HloInstruction::CreateReduce(
-        shape, hlo, zero, {dim}, AddReduce_computation));
+        shape, hlo, zero, dims, AddReduce_computation));
+  }
+
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    return AddReduce(hlo, std::vector<int64>{dim});
   }
 
   // Convenience method for replacing an instruction with a bitcast. If operand
@@ -812,10 +852,82 @@ Status InvertConstant(const HloInstruction& constant, Literal* result) {
     return T{1.0} / constant.literal().Get<T>(indices);
   });
 }
+
+template <typename T>
+std::unique_ptr<HloInstruction> TryDivideToShift(HloInstruction* divide,
+                                                 HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(divide->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(divide->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateCompare(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), a, zero_like_a,
+              ComparisonDirection::kLt));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+
+      auto* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, abs_dividend,
+          shift_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(divide->shape(), HloOpcode::kSelect,
+                                           dividend_is_negative,
+                                           neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+      HloInstruction* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+      return HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, a, shift_amount);
+    }
+  }
+
+  return nullptr;
+}
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
-  Shape* shape;
   HloInstruction *a, *b, *c, *d;
   CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
   // A/1 => A
@@ -824,6 +936,61 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
     return Status::OK();
   }
 
+  // A / B => A >> log2(B) if B is a power of 2.
+  switch (divide->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
+  Shape* shape;
   // exp(A)/exp(B) => exp(A-B)
   if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
                         .WithShape(m::Shape(&shape)))) {
@@ -864,6 +1031,24 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
                     divide->shape(), HloOpcode::kMultiply, a, new_power));
   }
 
+  // A/sqrt(B) => A*rsqrt(X).
+  if (Match(divide, m::Divide(m::Op(&a), m::Sqrt(m::Op(&b))))) {
+    auto* rsqrt = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kRsqrt, b));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(rsqrt->shape(),
+                                             HloOpcode::kMultiply, a, rsqrt));
+  }
+
+  // A/rsqrt(B) => A*sqrt(B).
+  if (Match(divide, m::Divide(m::Op(&a), m::Rsqrt(m::Op(&b))))) {
+    auto* sqrt = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kSqrt, b));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(sqrt->shape(),
+                                             HloOpcode::kMultiply, a, sqrt));
+  }
+
   // Simplifying integral division would produce unexpected results.
   if (ShapeUtil::ElementIsIntegral(divide->shape())) {
     return Status::OK();
@@ -874,8 +1059,9 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   // (Backends can do this transformation, but generally only if the constant is
   // a scalar.)
   if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) {
-    Literal new_literal(b->shape());
-    switch (b->shape().element_type()) {
+    Shape result_shape = b->literal().shape();
+    Literal new_literal(result_shape);
+    switch (result_shape.element_type()) {
       case F16:
         TF_RETURN_IF_ERROR(InvertConstant<half>(*b, &new_literal));
         break;
@@ -958,7 +1144,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   const int64 rhs_rank = rhs->shape().rank();
   const int64 lhs_rank = lhs->shape().rank();
   const auto& dnums = dot->dot_dimension_numbers();
-  if (dnums.rhs_contracting_dimensions_size() > 1) {
+  if (dnums.rhs_contracting_dimensions_size() != 1) {
     return false;
   }
   if (dot_rank > 2 && (lhs_rank != rhs_rank || lhs_rank != dot_rank)) {
@@ -988,16 +1174,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     std::swap(rhs_collapsing_dim, rhs_kept_dim);
   }
 
-  auto as_type = [&](HloInstruction* hlo, const PrimitiveType element_type) {
-    if (hlo->shape().element_type() == element_type) {
-      return hlo;
-    }
-    return computation_->AddInstruction(HloInstruction::CreateConvert(
-        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
-  };
-
   auto reshape_if_necessary = [&](HloInstruction* hlo) {
-    hlo = as_type(hlo, dot->shape().element_type());
+    hlo = AsType(hlo, dot->shape().element_type());
     if (!ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
       hlo = computation_->AddInstruction(
           HloInstruction::CreateReshape(dot->shape(), hlo));
@@ -1006,7 +1184,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   };
 
   auto add_reduce_in_f32 = [&](HloInstruction* hlo, const int64 dim) {
-    return AddReduce(as_type(hlo, F32), dim);
+    return AddReduce(AsType(hlo, F32), dim);
   };
 
   auto broadcast = [&](HloInstruction* hlo, const Shape& shape,
@@ -1041,7 +1219,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     return true;
   }
 
-  // Simplify outer product into multiply with implicit broadcasting.
+  // Simplify outer product into multiply with broadcasting.
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
   if (rhs_rank == 2 && rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
@@ -1115,8 +1293,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     return dims;
   };
 
-  // If the contracting dimension is 1, remove the degnerate dimnesions from the
-  // lhs and rhs, broadcast each to the result shape and multiply.
+  // If the contracting dimension is 1, remove the degnerate dimnensions from
+  // the lhs and rhs, broadcast each to the result shape and multiply.
   if (lhs->shape().dimensions(lhs_collapsing_dim) == 1 &&
       (rhs_kept_dim == rhs_rank - 1 ||
        (rhs_collapsing_dim == rhs_rank - 1 && rhs_kept_dim == rhs_rank - 2))) {
@@ -1453,7 +1631,9 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-
+  if (options_.is_layout_sensitive()) {
+    return Status::OK();
+  }
   // Replace a zero element dot with a broadcast of the constant 0.
   if (ShapeUtil::IsZeroElementArray(dot->shape()) ||
       ShapeUtil::IsZeroElementArray(lhs->shape()) ||
@@ -1470,6 +1650,117 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
       dot->shape().element_type() != BF16) {
     return Status::OK();
   }
+
+  // If there are no contracting dimensions, a dot can be rewritten as
+  // mul(broadcast(transpose(x)),broadcast(transpose(y)))
+  if (dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 0) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    if (dot->shape().rank() != lhs->shape().rank()) {
+      std::vector<int64> lhs_broadcast_dims(lhs->shape().rank());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_lhs, lhs_broadcast_dims));
+    }
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+    if (dot->shape().rank() != rhs->shape().rank()) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      for (int64 i = lhs->shape().rank(); i < dot->shape().rank(); ++i) {
+        rhs_broadcast_dims.push_back(i);
+      }
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_rhs, rhs_broadcast_dims));
+    }
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
+                                          new_lhs, new_rhs));
+  }
+
+  // If the lhs or rhs have only batch and contracting dimensions, a dot can be
+  // rewritten as reduce(mul(broadcast(transpose(x)),broadcast(transpose(y))))
+  if ((dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+           dot->dot_dimension_numbers().lhs_contracting_dimensions_size() ==
+       lhs->shape().rank()) ||
+      (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() +
+           dot->dot_dimension_numbers().rhs_batch_dimensions_size() ==
+       rhs->shape().rank())) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+
+    int64 lhs_outer_dims =
+        lhs->shape().rank() -
+        (dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    int64 rhs_outer_dims =
+        rhs->shape().rank() -
+        (dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().rhs_contracting_dimensions_size());
+    CHECK(lhs_outer_dims == 0 || rhs_outer_dims == 0);
+    if (rhs_outer_dims > 0) {
+      std::vector<int64> lhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      lhs_broadcast_dims.resize(lhs->shape().rank());
+      std::iota(lhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().lhs_batch_dimensions_size(),
+                lhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+                    rhs_outer_dims);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_rhs->shape(), new_lhs, lhs_broadcast_dims));
+    } else if (lhs_outer_dims > 0) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().rhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      rhs_broadcast_dims.resize(rhs->shape().rank());
+      std::iota(rhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().rhs_batch_dimensions_size(),
+                rhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+                    lhs_outer_dims);
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_lhs->shape(), new_rhs, rhs_broadcast_dims));
+    }
+
+    TF_ASSIGN_OR_RETURN(HloInstruction * new_dot,
+                        MakeBinaryHlo(HloOpcode::kMultiply, new_lhs, new_rhs));
+    std::vector<int64> reduce_dims(
+        dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    new_dot = AsType(new_dot, F32);
+    const int64 outer_dims = std::max(rhs_outer_dims, lhs_outer_dims);
+    absl::c_iota(
+        reduce_dims,
+        outer_dims + dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+    new_dot = AddReduce(new_dot, reduce_dims);
+    new_dot = AsType(new_dot, dot->shape().element_type());
+    return ReplaceInstruction(dot, new_dot);
+  }
+
   if (lhs->shape().rank() > 2 || rhs->shape().rank() > 2 ||
       dot->shape().rank() > 2) {
     if (options_.enable_dot_strength_reduction() &&
@@ -1508,7 +1799,11 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   }
 
   // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+  if (dot->dot_dimension_numbers().lhs_batch_dimensions_size() == 0 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 1 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0) == 1 &&
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0) == 0 &&
+      lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
     DotDimensionNumbers dot_dimension_numbers;
     dot_dimension_numbers.add_lhs_contracting_dimensions(1);
     dot_dimension_numbers.add_rhs_contracting_dimensions(0);
@@ -2159,14 +2454,151 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(
   return changed;
 }
 
+namespace {
+template <typename T>
+std::unique_ptr<HloInstruction> TryRemainderToAnd(HloInstruction* remainder,
+                                                  HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(remainder->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(remainder->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateCompare(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), a, zero_like_a,
+              ComparisonDirection::kLt));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      auto* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          remainder->shape(), HloOpcode::kAnd, abs_dividend, mask_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(
+          remainder->shape(), HloOpcode::kSelect, dividend_is_negative,
+          neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      HloInstruction* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+      return HloInstruction::CreateBinary(remainder->shape(), HloOpcode::kAnd,
+                                          a, mask_amount);
+    }
+  }
+  return nullptr;
+}
+}  // namespace
+
+Status AlgebraicSimplifierVisitor::HandleRemainder(HloInstruction* remainder) {
+  HloInstruction *a, *b;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  // A % B => A & (B - 1) if B is a power of 2.
+  switch (remainder->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   auto operand = reshape->mutable_operand(0);
 
   // Reshape directly to empty constant if the shape contains zero-element
   // dimension.
   if (ShapeUtil::IsZeroElementArray(reshape->shape())) {
+    // If the instruction doesn't have a layout, use a default layout for
+    // the literal result.
+    Shape reshaped_shape = reshape->shape();
+    if (!LayoutUtil::HasLayout(reshaped_shape)) {
+      LayoutUtil::SetToDefaultLayout(&reshaped_shape);
+    }
     auto empty_constant = HloInstruction::CreateConstant(
-        Literal::CreateFromShape(reshape->shape()));
+        Literal::CreateFromShape(reshaped_shape));
 
     return ReplaceWithNewInstruction(reshape, std::move(empty_constant));
   }
@@ -2261,11 +2693,11 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
         int64 start = slice->slice_starts(i);
         int64 low = padding_config.dimensions(i).edge_padding_low();
         int64 data = pad->operand(0)->shape().dimensions(i);
-        if (start >= low && start < low + data) {
-          return false;
+        if (start < low || start >= low + data) {
+          return true;
         }
       }
-      return true;
+      return false;
     }();
 
     if (in_padding) {
@@ -2406,6 +2838,27 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
                    new_slice_starts, new_slice_limits, slice->slice_strides()));
   }
 
+  auto only_broadcast_dims_sliced = [&] {
+    if (slice->operand(0)->opcode() != HloOpcode::kBroadcast) {
+      return false;
+    }
+    for (int64 dim : slice->operand(0)->dimensions()) {
+      if (slice->slice_starts(dim) != 0 || slice->slice_strides(dim) != 1 ||
+          slice->slice_limits(dim) !=
+              slice->operand(0)->shape().dimensions(dim)) {
+        return false;
+      }
+    }
+    return true;
+  };
+  if (only_broadcast_dims_sliced()) {
+    return ReplaceWithNewInstruction(
+        slice,
+        HloInstruction::CreateBroadcast(
+            slice->shape(), slice->mutable_operand(0)->mutable_operand(0),
+            slice->mutable_operand(0)->dimensions()));
+  }
+
   TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifyScalarSlice(slice));
   if (replaced) {
     return Status::OK();
@@ -2452,28 +2905,72 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Most of those optimizations can be done for multi-output
-  // reduces.
-  if (reduce->shape().IsTuple()) {
-    return Status::OK();
-  }
+Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* hlo) {
+  HloReduceInstruction* reduce = Cast<HloReduceInstruction>(hlo);
+  bool multi_output_reduce = reduce->shape().IsTuple();
+
+  // For tuple reduce, we require all reduce shapes to be the same, up to the
+  // element types, so we can just the first operand and the first result as a
+  // representative.
+  auto arg = reduce->inputs()[0];
+  auto init_value = reduce->init_values()[0];
+  const Shape& reduce_result_shape =
+      multi_output_reduce ? reduce->shape().tuple_shapes(0) : reduce->shape();
 
-  auto arg = reduce->mutable_operand(0);
-  auto init_value = reduce->mutable_operand(1);
   absl::Span<const int64> dimensions(reduce->dimensions());
   HloComputation* function = reduce->to_apply();
   if (ShapeUtil::IsZeroElementArray(arg->shape()) ||
-      ShapeUtil::IsZeroElementArray(reduce->shape())) {
-    return ReplaceWithNewInstruction(
-        reduce,
-        HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
+      ShapeUtil::IsZeroElementArray(reduce_result_shape)) {
+    if (multi_output_reduce) {
+      std::vector<HloInstruction*> broadcast_inits;
+      int64 inputs = reduce->input_count();
+      for (int64 i = 0; i < inputs; ++i) {
+        broadcast_inits.push_back(computation_->AddInstruction(
+            HloInstruction::CreateBroadcast(reduce->shape().tuple_shapes(i),
+                                            reduce->init_values()[i], {})));
+      }
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateTuple(broadcast_inits));
+    } else {
+      return ReplaceWithNewInstruction(
+          reduce,
+          HloInstruction::CreateBroadcast(reduce_result_shape, init_value, {}));
+    }
+  }
+
+  // If the reduction results in the same number of elements, then the only
+  // possible side effect would be a reshape. Since the init_value is an
+  // identity of the reduction function, we can therefore replace the reduce
+  // with a simple reshape, ignoring the reduction function completely.
+  if (ShapeUtil::ElementsIn(reduce_result_shape) ==
+      ShapeUtil::ElementsIn(arg->shape())) {
+    if (multi_output_reduce) {
+      std::vector<HloInstruction*> reshaped_args;
+      int64 inputs = reduce->input_count();
+      for (int64 i = 0; i < inputs; ++i) {
+        reshaped_args.push_back(
+            computation_->AddInstruction(HloInstruction::CreateReshape(
+                reduce->shape().tuple_shapes(i), reduce->inputs()[i])));
+      }
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateTuple(reshaped_args));
+    } else {
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateReshape(reduce_result_shape, arg));
+    }
+  }
+
+  // TODO(b/112040122): Most of those optimizations below can be done for
+  // multi-output reduces.
+  if (multi_output_reduce) {
+    return Status::OK();
   }
 
   // A Transpose feeding a reduce can simply permute the reduction dimensions
   // field if the output of the reduce is a vector or scalar. Higher ranked
   // result may require a transpose of the output.
-  if (reduce->shape().rank() <= 1 && arg->opcode() == HloOpcode::kTranspose) {
+  if (reduce_result_shape.rank() <= 1 &&
+      arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
     for (auto dim : dimensions) {
@@ -2481,20 +2978,10 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     }
     return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateReduce(
-                    reduce->shape(), arg->mutable_operand(0), init_value,
+                    reduce_result_shape, arg->mutable_operand(0), init_value,
                     new_reduce_dimensions, function));
   }
 
-  // If the reduction results in the same number of elements, then the only
-  // possible side effect would be a reshape. Since the init_value is an
-  // identity of the reduction function, we can therefore replace the reduce
-  // with a simple reshape, ignoring the reduction function completely.
-  if (ShapeUtil::ElementsIn(reduce->shape()) ==
-      ShapeUtil::ElementsIn(arg->shape())) {
-    return ReplaceWithNewInstruction(
-        reduce, HloInstruction::CreateReshape(reduce->shape(), arg));
-  }
-
   // If a reduce feeds a reduce with the same computation and initial value,
   // they can be combined into a single reduce.
   if (arg->opcode() == HloOpcode::kReduce &&
@@ -2520,9 +3007,9 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     std::merge(arg_dims.begin(), arg_dims.end(), reduce_dims.begin(),
                reduce_dims.end(), std::back_inserter(new_dimensions));
     return ReplaceWithNewInstruction(
-        reduce,
-        HloInstruction::CreateReduce(reduce->shape(), arg->mutable_operand(0),
-                                     init_value, new_dimensions, function));
+        reduce, HloInstruction::CreateReduce(
+                    reduce_result_shape, arg->mutable_operand(0), init_value,
+                    new_dimensions, function));
   }
 
   // A reshape that collapses multiple dimensions into a dimension being
@@ -2565,7 +3052,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
       }
       return ReplaceWithNewInstruction(
           reduce, HloInstruction::CreateReduce(
-                      reduce->shape(), arg->mutable_operand(0), init_value,
+                      reduce_result_shape, arg->mutable_operand(0), init_value,
                       new_reduce_dimensions, function));
     }
   }
@@ -2580,11 +3067,11 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     HloInstruction* old_reduce = nullptr;
     for (HloInstruction* operand : arg->operands()) {
       HloInstruction* new_reduce = computation_->AddInstruction(
-          HloInstruction::CreateReduce(reduce->shape(), operand, init_value,
+          HloInstruction::CreateReduce(reduce_result_shape, operand, init_value,
                                        reduce->dimensions(), function));
       if (old_reduce != nullptr) {
         new_reduce = computation_->AddInstruction(HloInstruction::CreateMap(
-            reduce->shape(), {old_reduce, new_reduce}, function));
+            reduce_result_shape, {old_reduce, new_reduce}, function));
       }
       old_reduce = new_reduce;
     }
@@ -2784,7 +3271,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   }
 
   if (is_effective_broadcast()) {
-    VLOG(10) << "Replacing pad/reduce-window with (implicit) broadcast.";
+    VLOG(10) << "Replacing pad/reduce-window with broadcast.";
     auto fadd = [this](std::unique_ptr<HloInstruction> x) {
       return computation_->AddInstruction(std::move(x));
     };
@@ -2846,109 +3333,6 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
-
-  if (!options_.enable_permutation_sort_replacement()) {
-    return Status::OK();
-  }
-  // Check if we are sorting a permutation. In that case, we know that the keys
-  // will be sorted to the identity permutation, and we can represent the
-  // changes to the 'values' parameter as a scatter.
-  if (sort->operand_count() == 2 &&
-      operand->opcode() == HloOpcode::kGetTupleElement) {
-    const HloInstruction* other_sort = operand->operand(0);
-    // Check whether the 'values' parameter is the result of another sort with
-    // the same sort dimension.
-    if (other_sort->opcode() == HloOpcode::kSort &&
-        other_sort->operand_count() >= 2 &&
-        other_sort->dimensions(0) == dimension_to_sort &&
-        other_sort->operand(operand->tuple_index())->opcode() ==
-            HloOpcode::kIota) {
-      auto* iota =
-          Cast<HloIotaInstruction>(other_sort->operand(operand->tuple_index()));
-      // The sort operand needs to be an integral iota, and the iota dimension
-      // needs to be the dimension that was sorted.
-      if (iota->iota_dimension() == dimension_to_sort &&
-          ShapeUtil::ElementIsIntegral(iota->shape())) {
-        // We use the following construction method for a Scatter that applies
-        // the permutation from 'keys' to the 'values' parameter.
-        // - Take the "keys" parameter of the second sort and reshape it to have
-        //   another "1" dimension at the end.
-        // - Concatenate it with iotas of the same extended shape with all
-        //   different iota_dimensions except the dimension_to_sort in the order
-        //   of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and
-        //   dimension_to_sort = 1, we would have concatenate of (iota with
-        //   iota_dimension=0, keys, iota with iota_dimension = 2)
-        // - Use this as the indices parameter of scatter, and set updates
-        //   of the scatter to be a reshaped 'values' parameter of sort (adding
-        //   'rank' many 1 dimensions at the end).
-        int64 rank = operand->shape().rank();
-        Shape extended_shape = operand->shape();
-        extended_shape.add_dimensions(1);
-        extended_shape.mutable_layout()->add_minor_to_major(rank);
-        auto reshaped_permutation = computation_->AddInstruction(
-            HloInstruction::CreateReshape(extended_shape, operand));
-        std::vector<HloInstruction*> concat_operands;
-        for (int64 i = 0; i < rank; ++i) {
-          if (i == dimension_to_sort) {
-            concat_operands.push_back(reshaped_permutation);
-          } else {
-            concat_operands.push_back(computation_->AddInstruction(
-                HloInstruction::CreateIota(extended_shape, i)));
-          }
-        }
-        Shape concat_shape = operand->shape();
-        concat_shape.add_dimensions(rank);
-        concat_shape.mutable_layout()->add_minor_to_major(rank);
-        auto scatter_indices =
-            rank > 1 ? computation_->AddInstruction(
-                           HloInstruction::CreateConcatenate(
-                               concat_shape, concat_operands, rank))
-                     : reshaped_permutation;
-
-        // We don't care about the operand, it will be completely overridden by
-        // the updates.
-        auto scatter_operand = computation_->AddInstruction(
-            HloInstruction::CreateIota(sort->operand(1)->shape(), 0));
-
-        // Construct the updates operand of scatter.
-        Shape update_shape = sort->operand(1)->shape();
-        for (int64 i = 0; i < rank; ++i) {
-          update_shape.add_dimensions(1);
-          update_shape.mutable_layout()->add_minor_to_major(rank + i);
-        }
-        auto scatter_updates =
-            computation_->AddInstruction(HloInstruction::CreateReshape(
-                update_shape, sort->mutable_operand(1)));
-
-        // Construct the updates computation, which simply replaces the operand
-        // values with the update values.
-        HloComputation::Builder b("update_replace_computation");
-        Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
-        b.AddInstruction(
-            HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs"));
-        auto scalar_rhs = b.AddInstruction(
-            HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs"));
-        auto update_replace_computation =
-            computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs));
-
-        ScatterDimensionNumbers dim_numbers;
-        dim_numbers.set_index_vector_dim(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          dim_numbers.add_update_window_dims(rank + i);
-          dim_numbers.add_scatter_dims_to_operand_dims(i);
-        }
-        auto scatter =
-            computation_->AddInstruction(HloInstruction::CreateScatter(
-                sort->operand(1)->shape(), scatter_operand, scatter_indices,
-                scatter_updates, update_replace_computation, dim_numbers));
-        return ReplaceWithNewInstruction(
-            sort, HloInstruction::CreateTuple(
-                      {computation_->AddInstruction(HloInstruction::CreateIota(
-                           operand->shape(), dimension_to_sort)),
-                       scatter}));
-      }
-    }
-  }
   return Status::OK();
 }
 
@@ -3325,6 +3709,11 @@ Status AlgebraicSimplifierVisitor::HandleMap(HloInstruction* map) {
         HloInstruction::CreateBroadcast(
             map->shape(), computation_->AddInstruction(std::move(clone)), {}));
   }
+  // Inline the map if the map computation only contains an elementwise
+  // operation that can accept arbitrary shapes.
+  if (map_root->opcode() == HloOpcode::kFusion || !map_root->IsElementwise()) {
+    return Status::OK();
+  }
   std::vector<HloInstruction*> new_operands;
   for (auto* root_operand : map_root->operands()) {
     if (root_operand->opcode() != HloOpcode::kParameter) {
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index ff3f638b22e290f6f6237a5a72a257aa23ecd78b..df5a8c2ec141458a95fafb76b1e99e4b04a61b28 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -71,17 +71,6 @@ class AlgebraicSimplifierOptions {
     return enable_conv_simplification_;
   }
 
-  // If enable_permutation_sort_replacement is true, a sort op that is known to
-  // sort a permutation will be replaced with a scatter op.
-  void set_enable_permutation_sort_replacement(
-      bool enable_permutation_sort_replacement) {
-    enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
-  }
-
-  bool enable_permutation_sort_replacement() const {
-    return enable_permutation_sort_replacement_;
-  }
-
   // If enable_window_reduce_replacement is true, the kReduceWindow instruction
   // can be optimized by replacement with simpler operations.
   void set_enable_window_reduce_to_reduce_replacement(
@@ -99,7 +88,6 @@ class AlgebraicSimplifierOptions {
   bool is_layout_sensitive_{false};
   bool enable_dot_strength_reduction_{true};
   bool enable_conv_simplification_{true};
-  bool enable_permutation_sort_replacement_{false};
   bool enable_window_reduce_to_reduce_replacement_{true};
 };
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 0dea498456eda3a6dc070493312e9825e6787c32..7f399ce0f112822bd476054cc848b5c3fe26b389 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -194,6 +195,86 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
                   m::Broadcast(m::ConstantScalar(0.125)))));
 }
 
+TEST_F(AlgebraicSimplifierTest, UnsignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT d = u32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::ShiftRightLogical(
+                  m::Parameter(0), m::Broadcast(m::ConstantScalar(3)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT d = s32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_shift =
+      m::ShiftRightLogical(match_abs, m::Broadcast(m::ConstantScalar(3)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_shift), match_shift)));
+}
+
+TEST_F(AlgebraicSimplifierTest, UnsignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT r = u32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::AndAnyOrder(m::Parameter(0),
+                                        m::Broadcast(m::ConstantScalar(7)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT r = s32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_and =
+      m::AndAnyOrder(match_abs, m::Broadcast(m::ConstantScalar(7)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_and), match_and)));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto m = CreateNewVerifiedModule();
@@ -424,6 +505,30 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
                                       m::Broadcast(m::Op().Is(zero)))));
 }
 
+TEST_F(AlgebraicSimplifierTest, KeepNontrivialMap) {
+  const char* kModuleStr = R"(
+    HloModule m
+    fusion {
+      x = f32[] parameter(0)
+      c = f32[] constant(42)
+      m = f32[] multiply(x, x)
+      ROOT a = f32[] add(m, c)
+    }
+
+    map {
+      x = f32[] parameter(0)
+      ROOT f = f32[] fusion(x), kind=kLoop, calls=fusion
+    }
+
+    ENTRY test {
+      p = f32[2,2] parameter(0)
+      ROOT map = f32[2,2] map(p), dimensions={0,1}, to_apply=map
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
@@ -2147,9 +2252,8 @@ TEST_F(AlgebraicSimplifierTest, TransposeIsReshape) {
       ROOT reshaped_again = f32[10] reshape(f32[10,1,1] transposed)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2614,6 +2718,33 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
   EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfBroadcastToBroadcast) {
+  HloComputation::Builder builder(TestName());
+  const int64 dim0 = 11;
+  const int64 dim1 = 12;
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {dim0}), "param"));
+  HloInstruction* broadcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(
+          ShapeUtil::MakeShape(F32, {dim0, dim1}), param, {0}));
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {dim0, dim1 - 9}), broadcast,
+      /*start_indices=*/{0, 3},
+      /*limit_indices=*/{dim0, dim1 - 6}, /*strides=*/{1, 1}));
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Broadcast(m::Parameter(0)))));
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) {
   HloComputation::Builder builder(TestName());
   const int64 dim0 = 11;
@@ -2668,93 +2799,23 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {1});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false, &builder,
+                           module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), keys);
 }
 
-TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
-  const char* hlo_string = R"(
-    HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options;
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root,
-              GmockMatch(m::Tuple(
-                  m::Iota(),
-                  m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()),
-                             m::Reshape()))));
-}
-
-TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
-  // Same as ReplacePermutationSortWithScatter except that the iota has F32
-  // type.
-  const char* hlo_string = R"(
-    HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = f32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options;
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-}
-
-TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
-  // Same as ReplacePermutationSortWithScatter except that the sort dimensions
-  // don't match.
-  const char* hlo_string = R"(
-   HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options;
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-}
-
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {5, 0});
   Shape values_shape = ShapeUtil::MakeShape(S32, {5, 0});
@@ -2764,10 +2825,11 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
       HloInstruction::CreateParameter(1, values_shape, "values0"));
   auto values1 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, values_shape, "values1"));
-  builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0,
-      keys, {values0, values1}));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(ShapeUtil::MakeTupleShape(
+                               {keys_shape, values_shape, values_shape}),
+                           {keys, values0, values1}, 0, /*is_stable=*/false,
+                           &builder, module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -3703,8 +3765,8 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(0);
   builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums,
                                                    DefaultPrecisionConfig(2)));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
@@ -3949,7 +4011,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
       param = f32[3,4] parameter(0)
       constant = f32[] constant(0.0)
       pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
-      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[4:5]}
     }
   )";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
@@ -3960,6 +4022,27 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   const char* hlo_string = R"(
     HloModule module
@@ -3981,6 +4064,29 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   EXPECT_THAT(root, GmockMatch(m::Parameter()));
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadSomeDimsInPadding) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY entry () -> f32[1]{0} {
+      constant.val = f32[] constant(4)
+      constant.pad = f32[] constant(-7)
+      reshape.1 = f32[1,1,1]{2,1,0} reshape(f32[] constant.val)
+      pad = f32[3,3,3]{2,1,0} pad(f32[1,1,1]{2,1,0} reshape.1, f32[] constant.pad), padding=0_2x0_2x2_0
+      slice = f32[1,1,1]{2,1,0} slice(f32[3,3,3]{2,1,0} pad), slice={[0:1], [0:1], [0:1]}
+      ROOT reshape.2 = f32[1]{0} reshape(f32[1,1,1]{2,1,0} slice)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::ConstantScalar(-7.0))));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   const char* hlo_string = R"(
     HloModule module
@@ -4211,10 +4317,24 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   int m, k, n;
   PrimitiveType element_type;
   std::tie(m, k, n, element_type) = GetParam();
-
-  Shape dot_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, n});
-  Shape lhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, m, k});
-  Shape rhs_shape = ShapeUtil::MakeShape(element_type, {1, 3, 5, k, n});
+  std::vector<int64> lhs_dims = {1, 3, 5};
+  std::vector<int64> rhs_dims = lhs_dims;
+  std::vector<int64> output_dims = lhs_dims;
+  if (m > 0) {
+    lhs_dims.push_back(m);
+    output_dims.push_back(m);
+  }
+  if (k > 0) {
+    lhs_dims.push_back(k);
+    rhs_dims.push_back(k);
+  }
+  if (n > 0) {
+    rhs_dims.push_back(n);
+    output_dims.push_back(n);
+  }
+  Shape dot_shape = ShapeUtil::MakeShape(element_type, output_dims);
+  Shape lhs_shape = ShapeUtil::MakeShape(element_type, lhs_dims);
+  Shape rhs_shape = ShapeUtil::MakeShape(element_type, rhs_dims);
   HloComputation::Builder builder(TestName());
 
   auto lhs = builder.AddInstruction(
@@ -4228,16 +4348,18 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   dot_dnums.add_rhs_batch_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
   dot_dnums.add_rhs_batch_dimensions(2);
-  dot_dnums.add_lhs_contracting_dimensions(4);
-  dot_dnums.add_rhs_contracting_dimensions(3);
+  if (k > 0) {
+    dot_dnums.add_lhs_contracting_dimensions(m > 0 ? 4 : 3);
+    dot_dnums.add_rhs_contracting_dimensions(3);
+  }
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
   auto computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
-  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
-  const bool computation_should_be_modified = dot_should_be_transformed;
-  EXPECT_EQ(changed, computation_should_be_modified);
+  const bool dot_should_be_transformed =
+      m == 1 || k == 1 || n == 1 || m == -1 || k == -1 || n == -1;
+  EXPECT_EQ(changed, dot_should_be_transformed);
   bool has_no_dot = true;
   for (const auto& hlo : computation->instructions()) {
     if (hlo->opcode() == HloOpcode::kDot) {
@@ -4248,10 +4370,12 @@ TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
   EXPECT_EQ(has_no_dot, dot_should_be_transformed);
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    BatchDotStrengthReductionTestInstantiation, BatchDotStrengthReductionTest,
-    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
-                       ::testing::Values(1, 2), ::testing::Values(F32, BF16)));
+INSTANTIATE_TEST_SUITE_P(BatchDotStrengthReductionTestInstantiation,
+                         BatchDotStrengthReductionTest,
+                         ::testing::Combine(::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(F32, BF16)));
 
 class DotStrengthReductionTest
     : public AlgebraicSimplifierTest,
@@ -4691,5 +4815,156 @@ INSTANTIATE_TEST_SUITE_P(
     DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest,
     ::testing::ValuesIn(DotOfGatherPositiveNegativeTests()));
 
+TEST_F(AlgebraicSimplifierTest, TupleReduceReshape) {
+  const char* hlo_string = R"(
+HloModule module
+
+reducer {
+  parameter.1 = f32[] parameter(0)
+  parameter.3 = f32[] parameter(2)
+  add.2 = f32[] add(parameter.1, parameter.3)
+  parameter.0 = f32[] parameter(1)
+  parameter.2 = f32[] parameter(3)
+  add.3 = f32[] add(parameter.0, parameter.2)
+  ROOT tuple.4 = (f32[], f32[]) tuple(add.2, add.3)
+}
+
+ENTRY entry {
+  parameter.6 = (f32[], f32[]) parameter(0)
+  get-tuple-element.10 = f32[] get-tuple-element(parameter.6), index=0
+  get-tuple-element.11 = f32[] get-tuple-element(parameter.6), index=1
+  constant = f32[] constant(0)
+  ROOT reduce = (f32[], f32[]) reduce(get-tuple-element.10, get-tuple-element.11, constant, constant), dimensions={}, to_apply=reducer
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Tuple(
+                        m::Reshape(m::GetTupleElement(m::Parameter(), 0)),
+                        m::Reshape(m::GetTupleElement(m::Parameter(), 1)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, TupleReduceBroadcast) {
+  const char* hlo_string = R"(
+HloModule module
+
+reducer {
+  parameter.1 = f32[] parameter(0)
+  parameter.3 = f32[] parameter(2)
+  mul.2 = f32[] add(parameter.1, parameter.3)
+  parameter.0 = f32[] parameter(1)
+  parameter.2 = f32[] parameter(3)
+  add.3 = f32[] add(parameter.0, parameter.2)
+  ROOT tuple.4 = (f32[], f32[]) tuple(mul.2, add.3)
+}
+
+ENTRY entry {
+  parameter.6 = (f32[0, 10, 10], f32[0, 10, 10]) parameter(0)
+  get-tuple-element.10 = f32[0, 10, 10] get-tuple-element(parameter.6), index=0
+  get-tuple-element.11 = f32[0, 10, 10] get-tuple-element(parameter.6), index=1
+  constant.0 = f32[] constant(0)
+  constant.1 = f32[] constant(1)
+  ROOT reduce = (f32[10, 10], f32[10, 10]) reduce(get-tuple-element.10, get-tuple-element.11, constant.0, constant.1), dimensions={0}, to_apply=reducer
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Tuple(m::Broadcast(m::ConstantScalar(0)),
+                                        m::Broadcast(m::ConstantScalar(1)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, ZeroSizedReshapeWithoutLayout) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1}), "param"));
+  HloInstruction* broadcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(
+          ShapeUtil::MakeShape(F32, {0, 1}), param, {1}));
+
+  // Create a reshape with zero sized result and without layout.
+  Shape reshaped_shape = ShapeUtil::MakeShape(F32, {0});
+  reshaped_shape.clear_layout();
+  builder.AddInstruction(
+      HloInstruction::CreateReshape(reshaped_shape, broadcast));
+
+  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+}
+
+TEST_F(AlgebraicSimplifierTest, DividedByConstantInstructionWithoutLayout) {
+  Shape shape = ShapeUtil::MakeShape(F32, {});
+  shape.clear_layout();
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+
+  HloInstruction* const_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(20.0f)));
+  builder.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                                      param, const_value));
+
+  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Multiply()));
+}
+
+// Test that 1/sqrt(X) is simplified to rsqrt(X).
+TEST_F(AlgebraicSimplifierTest, RecipSqrt) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      sqrt = f32[] sqrt(p0)
+      ROOT div = f32[] divide(p1, sqrt)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(m::Parameter(1),
+                                             m::Rsqrt(m::Parameter(0)))));
+}
+
+// Test that 1/rsqrt(X) is simplified to sqrt(X).
+TEST_F(AlgebraicSimplifierTest, RecipRsqrt) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      rsqrt = f32[] rsqrt(p0)
+      ROOT div = f32[] divide(p1, rsqrt)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(m::Parameter(1),
+                                             m::Sqrt(m::Parameter(0)))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index 8e327ee84b4c1cdab12a837aa684f14dd0c172c4..52d6982c70f7962ea9f54db0a4b1f2089a122c1c 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -29,18 +29,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
-namespace {
-
 namespace m = match;
 
-// Returns true iff the argument instruction is an AllReduce, followed by a
-// certain sequence of instructions and then a CRS. It must be possible to move
-// the AR past each instruction in the sequence.
-bool MatchesArCrsPattern(HloInstruction* instruction) {
+// Checks if the argument instruction is an AllReduce, followed by a certain
+// sequence of instructions and then a CRS. It must be possible to move
+// the AR past each instruction in the sequence. Returns the CRS, which is the
+// last instruction in the sequence.
+absl::optional<ArCrsCombiner::ArCrsPair> ArCrsCombiner::MatchesArCrsPattern(
+    HloInstruction* instruction) {
   auto can_ar_move_past_instruction = [](HloInstruction* instruction) -> bool {
     if (instruction->user_count() != 1) {
       return false;
@@ -73,21 +72,26 @@ bool MatchesArCrsPattern(HloInstruction* instruction) {
   if (!instruction->IsCrossModuleAllReduce() ||
       !computation_is_addition(instruction->called_computations()[0]) ||
       instruction->user_count() != 1) {
-    return false;
+    return absl::nullopt;
   }
   auto next = instruction->users()[0];
+  int64 distance = 1;
   while (!next->IsCrossReplicaAllReduce()) {
     if (can_ar_move_past_instruction(next)) {
       next = next->users()[0];
     } else {
-      return false;
+      return absl::nullopt;
     }
+    ++distance;
+  }
+  if (!Cast<HloAllReduceInstruction>(next)->IsNoop() &&
+      computation_is_addition(next->called_computations()[0])) {
+    return absl::optional<ArCrsPair>(ArCrsPair(instruction, next, distance));
+  } else {
+    return absl::nullopt;
   }
-  return computation_is_addition(next->called_computations()[0]);
 }
 
-}  // namespace
-
 absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
     HloInstruction* instruction) {
   CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
@@ -99,7 +103,7 @@ absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
       return caller_instruction;
     }
   }
-  return absl::optional<HloInstruction*>();
+  return absl::nullopt;
 }
 
 std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
@@ -229,10 +233,56 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
 }
 
 void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  // Say that two or more ARs lead to the same CRS: (AR1, CRS), (AR2, CRS),
+  // ... , (ARn, CRS).
+  // If as we traverse the HLO graph we start tracking the pair (AR2, CRS),
+  // and later find that AR1's distance from the CRS is longer, we discard
+  // AR2 and start tracking AR1. We put the discarded ids in this set, in order
+  // to skip processing of short paths when we encounter the other ARs that
+  // have the same id as AR2.
+  absl::flat_hash_set<int64> discarded_ar_ids;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (MatchesArCrsPattern(instruction)) {
-        all_reduce_map_[*(instruction->all_reduce_id())].push_back(instruction);
+      auto maybe_pair = MatchesArCrsPattern(instruction);
+      if (maybe_pair) {
+        auto pair = *maybe_pair;
+        int64 ar_id = *(instruction->all_reduce_id());
+        if (discarded_ar_ids.find(ar_id) != discarded_ar_ids.end()) {
+          continue;
+        }
+        auto it = crs_reserved_map_.find(pair.crs);
+        if (it != crs_reserved_map_.end()) {
+          auto prev_ar_id = it->second;
+          // Since there is another AR paired with CRS,
+          // all_reduce_map_[prev_ar_id] should exist, but
+          // all_reduce_map_[ar_id] shouldn't.
+          CHECK(all_reduce_map_.find(ar_id) == all_reduce_map_.end());
+          CHECK_NE(prev_ar_id, ar_id);
+          auto prev_pair = all_reduce_map_[prev_ar_id].back();
+          int64 prev_distance = prev_pair.distance;
+          if (prev_distance < pair.distance) {
+            // The current AR's distance to CRS is longer than the previously
+            // tracked AR, so we discard the previous AR.
+            all_reduce_map_.erase(prev_ar_id);
+            discarded_ar_ids.insert(prev_ar_id);
+            all_reduce_map_[ar_id].push_back(pair);
+            crs_reserved_map_[pair.crs] = ar_id;
+          } else {
+            // Discard the current AR id because we are keeping the previously
+            // tracked AR.
+            discarded_ar_ids.insert(ar_id);
+          }
+        } else {
+          if (all_reduce_map_.find(ar_id) != all_reduce_map_.end()) {
+            int64 prev_distance = all_reduce_map_[ar_id].back().distance;
+            CHECK_EQ(prev_distance, pair.distance)
+                << "All ARs with the same AR ID must have the same distance "
+                   "from the corresponding CRSs. Found: "
+                << prev_distance << " and " << pair.distance;
+          }
+          all_reduce_map_[ar_id].push_back(pair);
+          crs_reserved_map_[pair.crs] = ar_id;
+        }
       }
     }
   }
@@ -241,11 +291,11 @@ void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
 void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
   for (auto it : all_reduce_map_) {
     auto all_reduce_id = it.first;
-    auto instruction_vec = it.second;
-    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
-    auto instr_0 = instruction_vec[0];
-    for (int i = 1; i < instruction_vec.size(); ++i) {
-      auto instr_i = instruction_vec[i];
+    auto pairs_vec = it.second;
+    CHECK_EQ(pairs_vec.size(), num_spatial_partitions_);
+    auto instr_0 = pairs_vec[0].ar;
+    for (int i = 1; i < pairs_vec.size(); ++i) {
+      auto instr_i = pairs_vec[i].ar;
       auto next_0 = instr_0->users()[0];
       auto next_i = instr_i->users()[0];
       absl::flat_hash_map<int64, int64> visited_pairs;
@@ -269,8 +319,9 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
     return false;
   }
   for (auto it : all_reduce_map_) {
-    auto instruction_vec = it.second;
-    for (auto all_reduce : instruction_vec) {
+    auto pairs_vec = it.second;
+    for (auto pair : pairs_vec) {
+      auto all_reduce = pair.ar;
       auto parent_computation = all_reduce->parent();
       auto all_reduce_id = all_reduce->all_reduce_id();
       auto prev = all_reduce->mutable_operand(0);
@@ -291,16 +342,23 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
                                      ? next->operands()[1]
                                      : next->operands()[0];
             // To move the AR past the addition/subtraction, we need to divide
-            // other_operand by the number of spatial partitions.
-            auto shape = other_operand->shape();
-            Literal lit(shape);
-            lit.PopulateWithValue<float>(num_spatial_partitions_);
-            auto divisor = parent_computation->AddInstruction(
-                HloInstruction::CreateConstant(lit.Clone()));
-            auto division =
-                parent_computation->AddInstruction(HloInstruction::CreateBinary(
-                    shape, HloOpcode::kDivide, other_operand, divisor));
-            TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            // other_operand by the number of spatial partitions, except if
+            // other_operand is a cross-module AR, which can be eliminated.
+            if (other_operand->IsCrossModuleAllReduce() &&
+                other_operand->user_count() == 1) {
+              TF_CHECK_OK(other_operand->ReplaceAllUsesWith(
+                  other_operand->mutable_operand(0)));
+            } else {
+              auto shape = other_operand->shape();
+              Literal lit(shape);
+              lit.PopulateWithValue<float>(num_spatial_partitions_);
+              auto divisor = parent_computation->AddInstruction(
+                  HloInstruction::CreateConstant(lit.Clone()));
+              auto division = parent_computation->AddInstruction(
+                  HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                               other_operand, divisor));
+              TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            }
             break;
           }
           default:
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index 6f54b97615b270bc6b180dd47d9aff6473752b47..f503e1d5f2b519687e40818a61f0c0be9dfd3ab0 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -26,11 +26,47 @@ limitations under the License.
 namespace xla {
 
 // When the HLO graph contains a cross-module AllReduce, followed by some simple
-// linear operations, followed by a cross-replica AllReduce, we can combine the
-// CMAR and the CRAR, to use an efficient AllReduce implementation that fully
-// utilizes the interconnect bandwidth.
+// linear operations, followed by a cross-replica AllReduce (also known as
+// cross-replica sum, or CRS), we can combine the CMAR and the CRAR, to use an
+// efficient AllReduce implementation that fully utilizes the interconnect
+// bandwidth.
 // Such sequences appear in spatially partitioned models.
-// This pass must run right after spatial partitioning.
+// This pass must run right after spatial partitioning, when the code is still
+// in a single HLO module.
+//
+// The steps are:
+// 1) Find CMARs followed by simple ops followed by CRARs.
+// 2) Group CMARs by all_reduce_id. They must all be rewritten.
+// 3) Prove that the CMAR patterns in each core produce the same result.
+// 4) Eliminate the CMAR, and if it feeds an addition/subtraction, divide the
+//    other operand by the number of spatial partitions.
+// 5) Turn the CRAR into an all-core AllReduce.
+//
+// The pass also handles the case where multiple CMARs lead to the same CRAR,
+// and eliminates all CMARs. This graph:
+//
+//        Y
+//        |
+//  X   CMAR_2   Z
+//  |      \    /
+// CMAR_1     +
+//    \     /
+//       +
+//       |
+//     CRAR
+//
+// gets rewritten to:
+//
+//           Z   num_partitions
+//            \  /
+//       Y    div
+//        \   /
+//    X     +
+//     \   /
+//       +
+//       |
+//  all-core AR
+//
 class ArCrsCombiner : public HloModulePass {
  public:
   ArCrsCombiner(int num_spatial_partitions)
@@ -43,6 +79,28 @@ class ArCrsCombiner : public HloModulePass {
                                                HloInstruction* i2);
 
  private:
+  // We used this struct because multiple ARs could be paired with the same CRS.
+  // In this case, we want to select the AR that is furthest from the CRS,
+  // because it makes it easier to eliminate all ARs during RewriteGraph.
+  struct ArCrsPair {
+    HloInstruction* ar;
+    HloInstruction* crs;
+    // The length of the path from AR to CRS in the HLO graph.
+    int64 distance;
+
+    ArCrsPair(HloInstruction* all_reduce, HloInstruction* cross_replica_sum,
+              int64 dist)
+        : ar(all_reduce), crs(cross_replica_sum), distance(dist) {}
+
+    string ToString() {
+      return absl::StrCat("(AR: ", ar->name(), ", CRS: ", crs->name(),
+                          ", distance: ", distance, ")");
+    }
+  };
+
+  absl::optional<ArCrsCombiner::ArCrsPair> MatchesArCrsPattern(
+      HloInstruction* instruction);
+
   // If the passed instruction is a while parameter, and the while body is only
   // called by a single while instruction, return the while instruction.
   absl::optional<HloInstruction*> WhileFromBodyParameter(
@@ -80,8 +138,13 @@ class ArCrsCombiner : public HloModulePass {
 
   int num_spatial_partitions_;
 
-  // Map from all-reduce ids to the all reduce instructions.
-  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+  // Map from all-reduce ids to the AR/CRS pairs.
+  absl::flat_hash_map<int64, std::vector<ArCrsPair>> all_reduce_map_;
+
+  // Map from a CRS instruction to the all-reduce ID of the AR paired with the
+  // CRS. Sometimes, several ARs in the code could be paired with the same CRS.
+  // We use this map to pick a single AR/CRS path to rewrite.
+  absl::flat_hash_map<HloInstruction*, int64> crs_reserved_map_;
 
   std::unique_ptr<CallGraph> call_graph_;
 };
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index 08eb77f0320ff47099873bcdeebff7844aaac125..b972b1289b92a8f29818ff74512f679b9f44a131 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -221,7 +221,7 @@ HloModule foobar
   %x = (f32[2,2], f32[2,2]) parameter(0)
   %constant.0 = s32[] constant(0)
   %constant.1 = s32[] constant(1)
-  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+  ROOT %greater-than = pred[] compare(s32[] %constant.1, s32[] %constant.0), direction=GT
 }
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
@@ -258,7 +258,7 @@ HloModule foobar
   %x = (f32[2,2], f32[2,2]) parameter(0)
   %constant.0 = s32[] constant(0)
   %constant.1 = s32[] constant(1)
-  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+  ROOT %greater-than = pred[] compare(s32[] %constant.1, s32[] %constant.0), direction=GT
 }
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
@@ -296,7 +296,7 @@ HloModule foobar
   %x = (f32[2,2], f32[2,2]) parameter(0)
   %constant.0 = s32[] constant(0)
   %constant.1 = s32[] constant(1)
-  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+  ROOT %greater-than = pred[] compare(s32[] %constant.1, s32[] %constant.0), direction=GT
 }
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
@@ -773,5 +773,405 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
   CompareReplicaGroups(replica_groups_before, replica_groups_after);
 }
 
+TEST_F(ArCrsCombinerTest, RewriteMultipleAdds) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.1 = f32[] constant(1)
+  %constant.2 = f32[] constant(2)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.11 = f32[]
+      add(%constant.1, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %add.12 = f32[]
+      add(%constant.2, %add.11),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%add.12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.21 = f32[]
+      add(%constant.1, %all-reduce.ar.2),
+      sharding={maximal device=0}
+  %add.22 = f32[]
+      add(%constant.2, %add.21),
+      sharding={maximal device=0}
+  %all-reduce.2 = f32[]
+      all-reduce(%add.22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter()))),
+                        op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter())))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArSubtractCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+  %sub.1 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%sub.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+  %sub.2 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%sub.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter())),
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsLeft) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar11, %const1),
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%add11, %ar12),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar21, %const1),
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%add21, %ar22),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Parameter())),
+                        op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsRight) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar12, %const1),
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%ar11, %add11),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar22, %const1),
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%ar21, %add21),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant())))),
+                op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant()))))));
+
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, OneReplicaDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
+  %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%p),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%convert.1),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=1}
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%convert.2),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 215e8ced4bb3f98a26ac4eb9912a7fd4d917852f..d016d3e03d5e994841b81cda6214b6ff7cb550be 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
@@ -67,18 +66,38 @@ const absl::optional<std::set<int>>& BackendOptions::allowed_devices() const {
   return allowed_devices_;
 }
 
+namespace {
+
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
+ public:
+  explicit EigenThreadPoolWrapper(tensorflow::thread::ThreadPool* pool)
+      : pool_(pool) {}
+  ~EigenThreadPoolWrapper() override {}
+
+  void Schedule(std::function<void()> fn) override {
+    pool_->Schedule(std::move(fn));
+  }
+  int NumThreads() const override { return pool_->NumThreads(); }
+  int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+ private:
+  tensorflow::thread::ThreadPool* pool_ = nullptr;
+};
+
+}  // namespace
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
-struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper(const int num_threads)
+struct Backend::IntraOpThreadPool {
+  explicit IntraOpThreadPool(const int num_threads)
       : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
                                                 "XLAEigen", num_threads)),
-        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        wrapper(new EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
 
   std::unique_ptr<tensorflow::thread::ThreadPool> pool;
-  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<EigenThreadPoolWrapper> wrapper;
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
@@ -146,8 +165,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler,
     const int num_threads = intra_op_parallelism_threads > 0
                                 ? intra_op_parallelism_threads
                                 : tensorflow::port::NumSchedulableCPUs();
-    intra_op_thread_pool_wrapper_.reset(
-        new EigenThreadPoolWrapper(num_threads));
+    intra_op_thread_pool_.reset(new IntraOpThreadPool(num_threads));
   }
 }
 
@@ -159,17 +177,17 @@ int Backend::default_device_ordinal() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->device.get();
+  return intra_op_thread_pool_->device.get();
 }
 
 tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->pool.get();
+  return intra_op_thread_pool_->pool.get();
 }
 
 StatusOr<se::StreamExecutor*> Backend::stream_executor(
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index c35f033dc0180409ae3888c2050021da83f5c72a..e7f29a044b95015aa7e547373c24971646833280 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -156,7 +156,6 @@ class Backend {
   Status ResetDevices();
 
  private:
-  struct EigenThreadPoolWrapper;
   Backend(se::Platform* platform, Compiler* compiler,
           absl::Span<se::StreamExecutor* const> stream_executors,
           TransferManager* transfer_manager,
@@ -183,7 +182,8 @@ class Backend {
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
-  std::unique_ptr<EigenThreadPoolWrapper> intra_op_thread_pool_wrapper_;
+  struct IntraOpThreadPool;
+  std::unique_ptr<IntraOpThreadPool> intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
index eda026ac5685dc469a6230094eb28b3618e36400..dbabd82dd55465dd4c85a56aea849a3e3702d6bf 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
@@ -28,6 +28,13 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
                  *rhs = batch_dot->mutable_operand(1);
   const Shape& lhs_shape = lhs->shape();
 
+  // A dot with no contracting dims will be rewritten into a multiply by
+  // AlgebraicSimplifier. Dots with multiple contracting dims are currently
+  // unsupported.
+  if (dim_numbers.lhs_contracting_dimensions_size() != 1) {
+    return false;
+  }
+
   std::vector<int64> degenerate_dims;
   for (int64 batch_dim : dim_numbers.lhs_batch_dimensions()) {
     if (lhs_shape.dimensions(batch_dim) == 1) {
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 52ec1a794c5e9f4452a4bf2b648f453d8acfe976..a81f394a38f091b89b7f1e4d26653ff549f35b75 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -169,5 +169,47 @@ main {
                   /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/2)));
 }
 
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsNonContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,101] parameter(0)
+  b = f32[1,101] parameter(1)
+  ROOT dot = f32[1,101,101] dot(a,b), lhs_batch_dims={0},
+                                      lhs_contracting_dims={},
+                                      rhs_batch_dims={0},
+                                      rhs_contracting_dims={}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsMultipleContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  lhs = f32[1,5,17,10,13] parameter(0)
+  rhs = f32[1,9,10,13,6,5] parameter(1)
+  ROOT dot = f32[10,1,17,9,6] dot(lhs,rhs), lhs_batch_dims={3,0},
+                                            rhs_batch_dims={2,0},
+                                            lhs_contracting_dims={1,4},
+                                            rhs_contracting_dims={5,3}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index e5f5c3edb2ac0c217317fbf809463aa31af9af59..d14e803be6ad6d0b7a7e22442de7e6da77f93577 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -95,15 +95,8 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
       HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    HloInstruction* exponent = add_instruction(HloInstruction::CreateBroadcast(
-        operand->shape(),
-        add_instruction(HloInstruction::CreateConvert(
-            ShapeUtil::MakeShape(operand->shape().element_type(), {}),
-            add_instruction(HloInstruction::CreateConstant(
-                LiteralUtil::CreateR0<float>(-0.5f))))),
-        {}));
-    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kPower,
-                                        operand, exponent);
+    return HloInstruction::CreateUnary(operand->shape(), HloOpcode::kRsqrt,
+                                       operand);
   }
 
   std::unique_ptr<HloInstruction> Mean(
@@ -524,7 +517,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted);
 
   // Grad[Y] * (X - E[X]).
-  auto grad_output_times_activiation_minus_mean =
+  auto grad_output_times_activation_minus_mean =
       add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
                  activation_minus_mean);
 
@@ -532,9 +525,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       GetOrCreateScalarAddComputation(ptype);
 
   // sum(Grad[Y] * (X - E[X])).
-  auto sum_grad_output_times_activiation_minus_mean =
+  auto sum_grad_output_times_activation_minus_mean =
       add(HloInstruction::CreateReduce(
-          feature_shape, grad_output_times_activiation_minus_mean, zero,
+          feature_shape, grad_output_times_activation_minus_mean, zero,
           dimensions_without_feature, add_reduce_computation));
 
   // Grad[beta] = Sum(Grad[Y]).
@@ -544,7 +537,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
   auto grad_scale = add_binary(feature_shape, HloOpcode::kMultiply,
-                               sum_grad_output_times_activiation_minus_mean,
+                               sum_grad_output_times_activation_minus_mean,
                                rsqrt_var_add_epsilon);
 
   // I2 = Sum(Grad[Y])
@@ -553,7 +546,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   // I3 = Sum(Grad[Y] * (X - E[X]))
   auto i3 = add(HloInstruction::CreateBroadcast(
-      activation_shape, sum_grad_output_times_activiation_minus_mean,
+      activation_shape, sum_grad_output_times_activation_minus_mean,
       {feature_index}));
 
   // I4 = (X - E[X]) * I3
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index 8e8fbbd935b154e5a77d68e60d861601d740bf03..34b516184fa861bd71f99f70a32782d242f11914 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -60,7 +60,7 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) {
 
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, input_shape, "activiation"));
+      HloInstruction::CreateParameter(0, input_shape, "activation"));
 
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, scale_shape, "scale"));
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index d1b14d604f0559b6b18f7d1fba127669c241c8a3..72459961485f77b690eed6b8bde2cd03ebe770f1 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -84,7 +84,12 @@ Status BFloat16NormalizationVisitor::InsertConvertAfterOutput(
   auto convert = computation->AddInstruction(
       HloInstruction::CreateConvert(hlo->shape(), hlo));
   for (auto* user : materialized_users) {
-    TF_RETURN_IF_ERROR(hlo->ReplaceUseWith(user, convert));
+    if (user->opcode() == HloOpcode::kConvert &&
+        user->shape().element_type() == F32) {
+      TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(hlo));
+    } else {
+      TF_RETURN_IF_ERROR(hlo->ReplaceUseWith(user, convert));
+    }
   }
   if (is_root) {
     computation->set_root_instruction(convert);
@@ -205,6 +210,28 @@ Status BFloat16NormalizationVisitor::HandleMultipleOutputs(
     return Status::OK();
   }
 
+  std::vector<HloComputation*> bf16_called_comps;
+  for (auto* comp : hlo->called_computations()) {
+    bool comp_has_bf16 = false;
+    if (comp->root_instruction()->shape().element_type() == F32) {
+      f32_count += 1;
+    } else if (comp->root_instruction()->shape().element_type() == BF16) {
+      bf16_count += 1;
+      comp_has_bf16 = true;
+    }
+    for (auto* param : comp->parameter_instructions()) {
+      if (param->shape().element_type() == F32) {
+        f32_count += 1;
+      } else if (param->shape().element_type() == BF16) {
+        bf16_count += 1;
+        comp_has_bf16 = true;
+      }
+    }
+    if (comp_has_bf16) {
+      bf16_called_comps.push_back(comp);
+    }
+  }
+
   std::vector<HloInstruction*> materialized_users = hlo->users();
   std::vector<HloInstruction*> output_elements(hlo->operand_count());
   auto original_shape = hlo->shape();
@@ -236,7 +263,7 @@ Status BFloat16NormalizationVisitor::HandleMultipleOutputs(
     computation_->set_root_instruction(tuple);
   }
   *tuple->mutable_shape() = original_shape;
-  return Status::OK();
+  return ConvertCalledComputations(hlo, bf16_called_comps);
 }
 
 Status BFloat16NormalizationVisitor::HandleInstruction(HloInstruction* hlo) {
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index 551ac4be73a7630d213a53ca3606aa7f890cd794..7dd46ca4e048210843e227c79f639be1bd34fe30 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -282,8 +283,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, s32_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, sort, 0));
 
@@ -308,8 +312,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, bf16_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, bf16_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, f32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
 
   auto computation = module->AddEntryComputation(builder.Build());
 
@@ -319,6 +326,14 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
   EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32);
   EXPECT_NE(computation->root_instruction(), sort);
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kTuple);
+  EXPECT_EQ(sort->to_apply()->parameter_instruction(1)->shape().element_type(),
+            F32);
+  // Make sure that no convert to BF16 was added to the 'to_apply' comparison
+  // computation.
+  auto users = sort->to_apply()->parameter_instruction(1)->users();
+  for (auto user : users) {
+    EXPECT_NE(user->opcode(), HloOpcode::kConvert);
+  }
 }
 
 // Tests that the normalization should not cause unsupported mixed precision due
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index a9b5d9916e400b39039248098c22a715e44ccfd2..357d38a5548b2aaa120f06eed26fe54c9f3f46ac 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -109,8 +109,8 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
   HloInstruction* add1 = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, add0, b));
-  HloInstruction* pred = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {2, 4}), HloOpcode::kEq, a, b));
+  HloInstruction* pred = builder.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {2, 4}), a, b, ComparisonDirection::kEq));
   HloInstruction* sel = builder.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, pred, c, add1));
   HloInstruction* xpose =
@@ -574,8 +574,8 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
       HloInstruction::CreateParameter(0, shape, "cond_param"));
   auto cond_dot =
       builder_cond.AddInstruction(CreateDot(shape, cond_param, cond_param));
-  auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+  auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {}),
       builder_cond.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
           builder_cond.AddInstruction(
@@ -583,9 +583,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
                                           cond_dot, {0, 0}, {1, 1}, {1, 1})))),
       builder_cond.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
-          builder_cond.AddInstruction(HloInstruction::CreateSlice(
-              ShapeUtil::MakeShape(F32, {1, 1}), cond_dot, {1, 1}, {2, 2},
-              {1, 1}))))));
+          builder_cond.AddInstruction(
+              HloInstruction::CreateSlice(ShapeUtil::MakeShape(F32, {1, 1}),
+                                          cond_dot, {1, 1}, {2, 2}, {1, 1})))),
+      ComparisonDirection::kGt));
   auto cond = module->AddEmbeddedComputation(builder_cond.Build());
 
   auto builder_body = HloComputation::Builder("body");
@@ -631,8 +632,8 @@ TEST_F(BFloat16PropagationTest,
   auto builder_cond = HloComputation::Builder("cond");
   auto cond_param = builder_cond.AddInstruction(
       HloInstruction::CreateParameter(0, shape, "cond_param"));
-  builder_cond.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+  builder_cond.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {}),
       builder_cond.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
           builder_cond.AddInstruction(HloInstruction::CreateSlice(
@@ -642,7 +643,8 @@ TEST_F(BFloat16PropagationTest,
           ShapeUtil::MakeShape(F32, {}),
           builder_cond.AddInstruction(HloInstruction::CreateSlice(
               ShapeUtil::MakeShape(F32, {1, 1}), cond_param, {1, 1}, {2, 2},
-              {1, 1}))))));
+              {1, 1})))),
+      ComparisonDirection::kGt));
   auto cond = module->AddEmbeddedComputation(builder_cond.Build());
 
   auto builder_body = HloComputation::Builder("body");
@@ -705,8 +707,8 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, cond_rhs, cond_rhs));
   auto cond_dot =
       builder_cond.AddInstruction(CreateDot(shape, cond_lhs, cond_add_rhs));
-  builder_cond.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+  builder_cond.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {}),
       builder_cond.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
           builder_cond.AddInstruction(
@@ -714,9 +716,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
                                           cond_dot, {0, 0}, {1, 1}, {1, 1})))),
       builder_cond.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
-          builder_cond.AddInstruction(HloInstruction::CreateSlice(
-              ShapeUtil::MakeShape(F32, {1, 1}), cond_dot, {1, 1}, {2, 2},
-              {1, 1}))))));
+          builder_cond.AddInstruction(
+              HloInstruction::CreateSlice(ShapeUtil::MakeShape(F32, {1, 1}),
+                                          cond_dot, {1, 1}, {2, 2}, {1, 1})))),
+      ComparisonDirection::kGt));
   auto cond = module->AddEmbeddedComputation(builder_cond.Build());
 
   auto builder_body = HloComputation::Builder("body");
@@ -800,8 +803,8 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
           shape, HloOpcode::kAdd, cond0_rhs, cond0_rhs));
   auto cond0_dot =
       builder_cond0.AddInstruction(CreateDot(shape, cond0_lhs, cond0_add_rhs));
-  builder_cond0.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+  builder_cond0.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {}),
       builder_cond0.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
           builder_cond0.AddInstruction(
@@ -809,9 +812,10 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
                                           cond0_dot, {0, 0}, {1, 1}, {1, 1})))),
       builder_cond0.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
-          builder_cond0.AddInstruction(HloInstruction::CreateSlice(
-              ShapeUtil::MakeShape(F32, {1, 1}), cond0_dot, {1, 1}, {2, 2},
-              {1, 1}))))));
+          builder_cond0.AddInstruction(
+              HloInstruction::CreateSlice(ShapeUtil::MakeShape(F32, {1, 1}),
+                                          cond0_dot, {1, 1}, {2, 2}, {1, 1})))),
+      ComparisonDirection::kGt));
   auto cond0 = module->AddEmbeddedComputation(builder_cond0.Build());
 
   // Condition computation for the second while.
@@ -828,8 +832,8 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
           shape, HloOpcode::kAdd, cond1_lhs, cond1_lhs));
   auto cond1_dot =
       builder_cond1.AddInstruction(CreateDot(shape, cond1_add_lhs, cond1_rhs));
-  builder_cond1.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+  builder_cond1.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {}),
       builder_cond1.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
           builder_cond1.AddInstruction(
@@ -837,9 +841,10 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
                                           cond1_dot, {0, 0}, {1, 1}, {1, 1})))),
       builder_cond1.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {}),
-          builder_cond1.AddInstruction(HloInstruction::CreateSlice(
-              ShapeUtil::MakeShape(F32, {1, 1}), cond1_dot, {1, 1}, {2, 2},
-              {1, 1}))))));
+          builder_cond1.AddInstruction(
+              HloInstruction::CreateSlice(ShapeUtil::MakeShape(F32, {1, 1}),
+                                          cond1_dot, {1, 1}, {2, 2}, {1, 1})))),
+      ComparisonDirection::kGt));
   auto cond1 = module->AddEmbeddedComputation(builder_cond1.Build());
 
   // Body computation shared by both whiles.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index d07615b828990f80e2f905837c46f5f2e15d5a63..cb682f49a5c8097b2fa5ce15ea9fdbbcf46668b4 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -191,6 +191,7 @@ Status GatherComputationsByAllocationType(
           case HloOpcode::kReduceWindow:
           case HloOpcode::kScatter:
           case HloOpcode::kSelectAndScatter:
+          case HloOpcode::kSort:
           case HloOpcode::kFusion:
             // Map/reduce etc computations are always thread-local.
             worklist.push_back(std::make_pair(subcomputation,
@@ -752,7 +753,8 @@ namespace {
 bool MayInterfereAcrossSubcomputations(BufferAssignment* assignment,
                                        const LogicalBuffer& a_buffer,
                                        const LogicalBuffer& b_buffer) {
-  auto call_graph = assignment->liveness().hlo_ordering().call_graph();
+  const CallGraph& call_graph =
+      assignment->liveness().hlo_ordering().call_graph();
   const HloInstruction* a_ancestor;
   const HloInstruction* b_ancestor;
   std::tie(a_ancestor, b_ancestor) =
@@ -1011,10 +1013,14 @@ Status BufferAssigner::AssignBuffersForComputation(
       // callers.
       BufferAllocation* allocation =
           assignment->NewAllocation(*buffer, buffer_size);
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              instruction->parameter_number(), buffer->index());
       allocation->set_entry_computation_parameter(
-          instruction->parameter_number(), buffer->index());
-      VLOG(3) << "New allocation #" << allocation->index()
-              << " for entry computation parameter: " << *buffer;
+          instruction->parameter_number(), buffer->index(),
+          parameter_has_alias);
+      VLOG(3) << "Mark allocation #" << allocation->index()
+              << " as entry computation parameter: " << *buffer;
       continue;
     }
 
@@ -1416,12 +1422,14 @@ BufferAssigner::MergeColocatedBufferSets(
           << colocated_buffer_sets.size();
 
   // Returns true if the given buffer is for the entry parameter.
-  auto is_entry_parameter = [](const LogicalBuffer& buffer) {
+  auto is_readonly_entry_parameter = [](const LogicalBuffer& buffer) {
     auto* instruction = buffer.instruction();
     auto* computation = instruction->parent();
     auto* module = computation->parent();
     return instruction->opcode() == HloOpcode::kParameter &&
-           computation == module->entry_computation();
+           computation == module->entry_computation() &&
+           !module->input_output_alias_config().ParameterHasAlias(
+               instruction->parameter_number(), buffer.index());
   };
 
   std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
@@ -1443,7 +1451,7 @@ BufferAssigner::MergeColocatedBufferSets(
   for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
     for (auto& buffer : colocated_buffer_sets[i]) {
       if (buffer_liveness.MaybeLiveOut(*buffer) ||
-          is_entry_parameter(*buffer) ||
+          is_readonly_entry_parameter(*buffer) ||
           buffer->instruction()->opcode() == HloOpcode::kConstant) {
         set_can_be_merged[i] = false;
         break;
@@ -1612,62 +1620,46 @@ void BufferAssigner::BuildColocatedBufferSets(
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
       } else if (opcode == HloOpcode::kConditional) {
-        const HloInstruction* conditional_hlo = instruction;
+        const HloInstruction* conditional = instruction;
         ShapeUtil::ForEachSubshape(
-            conditional_hlo->shape(),
-            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+            conditional->shape(),
+            [this, conditional, &points_to_analysis, colocated_buffer_sets](
                 const Shape& /*subshape*/, const ShapeIndex& index) {
               std::vector<const LogicalBuffer*> colocated_set;
-              // Add conditional.result.
-              AddBufferToColocatedSet(conditional_hlo, index,
-                                      points_to_analysis, &colocated_set);
-              // Add conditional.true_computation.root.
-              AddBufferToColocatedSet(
-                  conditional_hlo->true_computation()->root_instruction(),
-                  index, points_to_analysis, &colocated_set);
-              // Add conditional.false_computation.root.
-              AddBufferToColocatedSet(
-                  conditional_hlo->false_computation()->root_instruction(),
-                  index, points_to_analysis, &colocated_set);
+              // Add cond.result.
+              AddBufferToColocatedSet(conditional, index, points_to_analysis,
+                                      &colocated_set);
+              for (int j = 0; j < conditional->branch_count(); ++j) {
+                // Add each cond.branch_computation[j].root.
+                AddBufferToColocatedSet(
+                    conditional->branch_computation(j)->root_instruction(),
+                    index, points_to_analysis, &colocated_set);
+              }
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
 
-        // Add true_operand and conditional.true_computation.parameter(0) as a
-        // colocated buffer set. Note that this has to be done for each subshape
-        // in the true_operand of the conditional.
-        ShapeUtil::ForEachSubshape(
-            conditional_hlo->operand(1)->shape(),
-            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
-                const Shape& /*subshape*/, const ShapeIndex& index) {
-              std::vector<const LogicalBuffer*> true_set;
-              // Add conditional.true_operand.
-              AddBufferToColocatedSet(conditional_hlo->operand(1), index,
-                                      points_to_analysis, &true_set);
-              // Add conditional.true_computation.parameter_instruction(0).
-              AddBufferToColocatedSet(
-                  conditional_hlo->true_computation()->parameter_instruction(0),
-                  index, points_to_analysis, &true_set);
-              AddSetToColocatedBufferSets(true_set, colocated_buffer_sets);
-            });
-
-        // Add false_operand and conditional.false_computation.parameter(0) as a
-        // colocated buffer set. Note that this has to be done for each subshape
-        // in the false_operand of the conditional.
-        ShapeUtil::ForEachSubshape(
-            conditional_hlo->operand(2)->shape(),
-            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
-                const Shape& /*subshape*/, const ShapeIndex& index) {
-              std::vector<const LogicalBuffer*> false_set;
-              // Add conditional.false_operand.
-              AddBufferToColocatedSet(conditional_hlo->operand(2), index,
-                                      points_to_analysis, &false_set);
-              // Add conditional.false_computation.parameter_instruction(0).
-              AddBufferToColocatedSet(
-                  conditional_hlo->false_computation()->parameter_instruction(
-                      0),
-                  index, points_to_analysis, &false_set);
-              AddSetToColocatedBufferSets(false_set, colocated_buffer_sets);
-            });
+        for (int j = 0; j < conditional->branch_count(); ++j) {
+          // Add branch_operand[j] (which is operand[j+1]) and
+          // cond.branch_computation[j].parameter(0) as a colocated
+          // buffer set. Note that this has to be done for each subshape in the
+          // branch_operand of the case.
+          ShapeUtil::ForEachSubshape(
+              conditional->operand(j + 1)->shape(),
+              [this, j, conditional, &points_to_analysis,
+               colocated_buffer_sets](const Shape& /*subshape*/,
+                                      const ShapeIndex& index) {
+                std::vector<const LogicalBuffer*> branch_set;
+                // Add cond.operand[j+1].
+                AddBufferToColocatedSet(conditional->operand(j + 1), index,
+                                        points_to_analysis, &branch_set);
+                // Add cond.branch_computation[j].parameter_instruction(0).
+                AddBufferToColocatedSet(
+                    conditional->branch_computation(j)->parameter_instruction(
+                        0),
+                    index, points_to_analysis, &branch_set);
+                AddSetToColocatedBufferSets(branch_set, colocated_buffer_sets);
+              });
+        }
       }
     }
   }
@@ -1733,10 +1725,6 @@ void BufferAssigner::AssignColocatedBufferSets(
         // module-level scope, we can allow buffers to be shared across
         // computations (in some cases).
         allocation = assignment->NewAllocation(*buffer, buffer_size);
-        if (entry_parameter_number >= 0) {
-          allocation->set_entry_computation_parameter(
-              entry_parameter_number, *entry_parameter_shape_idx);
-        }
         if (is_constant) {
           allocation->set_constant(true);
         }
@@ -1750,6 +1738,16 @@ void BufferAssigner::AssignColocatedBufferSets(
       }
       colocated_buffers->insert(buffer);
     }
+
+    // If an allocation contains a parameter, set corresponding fields.
+    if (entry_parameter_number >= 0) {
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              entry_parameter_number, *entry_parameter_shape_idx);
+      allocation->set_entry_computation_parameter(entry_parameter_number,
+                                                  *entry_parameter_shape_idx,
+                                                  parameter_has_alias);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 4baab9b6ad71293d48d5ed70c2922fdf40ef119a..448dec3b1aa0c0f85e1060a70e965fcf3952c320 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -96,7 +96,11 @@ class BufferAllocation {
   // Whether this allocation is readonly i.e. backed by memory we cannot write
   // to.
   bool is_readonly() const {
-    return is_entry_computation_parameter() || is_constant();
+    // Entry parameters are generally readonly, except when they are aliased
+    // with any output.
+    return (is_entry_computation_parameter() &&
+            !is_parameter_aliased_with_output_) ||
+           is_constant();
   }
 
   bool is_tuple() const { return is_tuple_; }
@@ -274,8 +278,10 @@ class BufferAllocation {
   void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size);
 
   void set_entry_computation_parameter(int64 parameter_number,
-                                       ShapeIndex param_shape_index) {
+                                       ShapeIndex param_shape_index,
+                                       bool parameter_aliased_with_output) {
     is_entry_computation_parameter_ = true;
+    is_parameter_aliased_with_output_ = parameter_aliased_with_output;
     parameter_number_ = parameter_number;
     param_shape_index_ = std::move(param_shape_index);
   }
@@ -305,6 +311,9 @@ class BufferAllocation {
   // outlast the computation.
   bool is_entry_computation_parameter_ = false;
 
+  // Whether this entry computation parameter is aliased with output.
+  bool is_parameter_aliased_with_output_ = false;
+
   // If this allocation holds an entry computation parameter, this field
   // indicates the index (starting from 0) of the parameter.
   int64 parameter_number_ = 0;
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 1b4e93a2f303e5aad3e4081f36e2417277f62c71..704585033f076972dbd359fcd832834374566fa8 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -190,8 +190,9 @@ class BufferAssignmentTest : public HloTestBase {
         HloInstruction::CreateParameter(0, t_s32_f32v4_, "x"));
     auto index = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(const4->shape(), param, 0));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, index, const4));
+    builder.AddInstruction(
+        HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), index,
+                                      const4, ComparisonDirection::kLt));
     return builder.Build();
   }
 
@@ -465,6 +466,40 @@ TEST_F(BufferAssignmentTest, Basic) {
   GetAssignedOutputAllocation(*buffers, sub);
 }
 
+TEST_F(BufferAssignmentTest, AliasedParamCanBeReused) {
+  // If an input buffer and output buffer aliases, the input buffer can be
+  // reused for other intermediate results.
+  //
+  // param0[100] ----- (neg1) -- (neg2)
+  //    |                           |
+  //    + -------- Aliased ---------+
+
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32vec100_, "p0"));
+  auto neg_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param));
+  auto neg_2 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, neg_1));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias(
+      {}, 0, {}, HloInputOutputAliasConfig::kUserAlias));
+
+  auto buffers = RunBufferAssignment(module.get());
+
+  BufferAllocation param_buffer = GetAssignedInputAllocation(*buffers, param);
+  BufferAllocation neg_1_buffer = GetAllocation(*buffers, neg_1, {});
+  BufferAllocation neg_2_buffer = GetAllocation(*buffers, neg_2, {});
+
+  // Everything use one buffer.
+  EXPECT_EQ(param_buffer.index(), neg_1_buffer.index());
+  EXPECT_EQ(neg_2_buffer.index(), neg_1_buffer.index());
+}
+
 TEST_F(BufferAssignmentTest, AddCannotReuse) {
   // Pass in a special rule to indicate that "add" cannot reuse any buffer.
   //
@@ -1829,8 +1864,8 @@ class WhileBufferAssignmentTest : public HloTestBase {
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(0)));
     auto ten = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(10)));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        ShapeUtil::MakeShape(PRED, {}), zero, ten, ComparisonDirection::kLt));
     return builder.Build();
   }
 
@@ -2101,8 +2136,9 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(4)));
     auto param =
         builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x"));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param, const4));
+    builder.AddInstruction(
+        HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), param,
+                                      const4, ComparisonDirection::kLt));
     return builder.Build();
   };
 
@@ -2496,7 +2532,7 @@ while_condition {
   state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0)
   get-tuple-element = s32[] get-tuple-element(state), index=0
   get-tuple-element.1 = s32[] constant(3)
-  ROOT less-than.339.338 = pred[] less-than(get-tuple-element, get-tuple-element.1)
+  ROOT less-than.339.338 = pred[] compare(get-tuple-element, get-tuple-element.1), direction=LT
 }
 
 ENTRY entry_computation {
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 94af788c54f6c722997311bec50da3ed93aa3cee..98304757cae91d22466ed25f8c6e36ce90a848db 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -64,6 +64,7 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return CallContext::kParallel;
     default:
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index f41367914c0cb5fe66b1dbbc5ec6f8b7a67d592c..57a636fd740995d6cce933fe19d5592a64bde5cf 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -30,7 +30,7 @@ namespace xla {
 
 // The context in which a computation is called by another computation.
 enum class CallContext {
-  // In a parallel contex the computation is applied to each element of the
+  // In a parallel context the computation is applied to each element of the
   // array argument(s). kMap and kReduce instructions call computations in
   // parallel context.
   kParallel,
@@ -256,6 +256,10 @@ class CallGraph {
  private:
   CallGraph(const HloModule* module);
 
+  // Not copyable.
+  CallGraph(const CallGraph&) = delete;
+  CallGraph& operator=(const CallGraph&) = delete;
+
   // Sets the call contexts for every node in the graph.
   void SetCallContexts();
 
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 5de724f8924b78008ba4c56603b61bf93fbc5e7c..458aef1499954c5852486b015cb1474a5e70457f 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -83,8 +83,9 @@ class CallGraphTest : public HloTestBase {
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     HloInstruction* zero = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
+    builder.AddInstruction(
+        HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), param0,
+                                      zero, ComparisonDirection::kGt));
     return builder.Build();
   }
 
diff --git a/tensorflow/compiler/xla/client/lib/cholesky.cc b/tensorflow/compiler/xla/service/cholesky_expander.cc
similarity index 74%
rename from tensorflow/compiler/xla/client/lib/cholesky.cc
rename to tensorflow/compiler/xla/service/cholesky_expander.cc
index 414bd1494cd32f32a5c37e84119de930678a776b..1c39cf9bc0a093ec54715d4180b49094ca6266a0 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/service/cholesky_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/cholesky.h"
+#include "tensorflow/compiler/xla/service/cholesky_expander.h"
 
 #include <memory>
 #include <vector>
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
@@ -135,10 +135,8 @@ XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
   });
 }
 
-}  // namespace
-
-XlaOp Cholesky(XlaOp a, int64 block_size,
-               PrecisionConfig::Precision precision) {
+XlaOp BuildCholesky(XlaOp a, int64 block_size,
+                    PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
@@ -194,12 +192,12 @@ XlaOp Cholesky(XlaOp a, int64 block_size,
         // l[i+k:, i:i+k] =
         //     trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k])
         auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k});
-        auto update = TriangularSolve(factorized, panel,
-                                      /*left_side=*/false,
-                                      /*lower=*/true,
-                                      /*transpose_a=*/true,
-                                      /*conjugate_a=*/false,
-                                      /*block_size=*/block_size);
+        auto update =
+            TriangularSolve(factorized, panel,
+                            /*left_side=*/false,
+                            /*lower=*/true,
+                            /*unit_diagonal=*/false,
+                            /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
         l = UpdateSliceInMinorDims(l, update, {i + k, i});
       }
     }
@@ -207,4 +205,55 @@ XlaOp Cholesky(XlaOp a, int64 block_size,
   });
 }
 
+}  // namespace
+
+bool CholeskyExpander::InstructionMatchesPattern(HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kCholesky;
+}
+
+StatusOr<HloInstruction*> CholeskyExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  const CholeskyOptions& options = instruction->cholesky_options();
+  const string name = absl::StrFormat(
+      "xla.cholesky_%s_%s", instruction->operand(0)->shape().ToString(),
+      options.lower() ? "lower" : "upper");
+
+  HloModule* module = instruction->parent()->parent();
+
+  HloComputation*& computation =
+      computation_cache_.emplace(name, nullptr).first->second;
+  if (!computation) {
+    // Builds a new expansion.
+    //
+    // TODO(b/62327888): We do something unusual here: we build the computation
+    // using the XlaBuilder API, which is nominally an XLA client API. We do
+    // this because the external APIs for building complicated computations
+    // (XlaBuilder) are much more ergonomic than the internal ones. As it turns
+    // out, XlaBuilder isn't really a client API—what it does is build a
+    // HloModuleProto protocol buffer, that we can then deserialize and clone
+    // into our HloModule. Ideally we would avoid the protocol buffer step;
+    // that is left as an exercise for future work.
+    XlaBuilder builder(name);
+    XlaOp a = Parameter(&builder, 0, instruction->operand(0)->shape(), "a");
+    XlaOp l = BuildCholesky(MaybeTransposeInMinorDims(a, !options.lower()),
+                            /*block_size=*/128,
+                            /*precision=*/PrecisionConfig::HIGHEST);
+    MaybeTransposeInMinorDims(l, !options.lower());
+
+    TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build());
+
+    TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                        xla_computation.GetProgramShape());
+    HloModuleConfig config(program_shape);
+    TF_ASSIGN_OR_RETURN(auto new_module, HloModule::CreateFromProto(
+                                             xla_computation.proto(), config));
+    HloCloneContext context(module);
+    computation =
+        module->DeepCloneComputation(new_module->entry_computation(), &context);
+  }
+
+  return instruction->parent()->AddInstruction(HloInstruction::CreateCall(
+      instruction->shape(), instruction->operands(), computation));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cholesky_expander.h b/tensorflow/compiler/xla/service/cholesky_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2958db1b8ca676f3872016ac6a62b872a6b6649
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cholesky_expander.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CHOLESKY_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CHOLESKY_EXPANDER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+namespace xla {
+
+class CholeskyExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override { return "cholesky_expander"; }
+
+ protected:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+
+ private:
+  // Mapping from op signatures to existing computations.
+  absl::flat_hash_map<string, HloComputation*> computation_cache_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CHOLESKY_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 1965925fa7f6d50b1d7af918bc3468d4b4d5d0a2..a4758c2b9dbba8a1c560c8f2dc7a182e456f5e69 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -70,26 +71,14 @@ CompileOnlyService::CompileAheadOfTime(
     TF_RET_CHECK(instance.computation.has_host_program_shape());
 
     const DebugOptions& debug_options = options.debug_options();
-
-    // Dump computation proto if flag is set.
-    const string& directory_path = debug_options.xla_dump_computations_to();
-    if (!directory_path.empty()) {
-      HloSnapshot hlo_snapshot;
-      *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = instance.computation;
-      string filename =
-          absl::StrCat("computation_", instance.computation.id(), "__",
-                       instance.computation.entry_computation_name());
-      const string& per_host_path = tensorflow::io::JoinPath(
-          directory_path, tensorflow::port::Hostname());
-
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot));
-    }
-
     ExecutionOptions execution_options;
     *execution_options.mutable_debug_options() = debug_options;
     *execution_options.mutable_shape_with_output_layout() =
         instance.result_layout->ToProto();
+    if (options.has_static_device_assignment()) {
+      TF_RETURN_IF_ERROR(options.static_device_assignment().Serialize(
+          execution_options.mutable_device_assignment()));
+    }
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(
@@ -99,7 +88,7 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModule> hlo_module,
         HloModule::CreateFromProto(instance.computation, *module_config));
-    TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*hlo_module));
+    DumpHloModuleIfEnabled(*hlo_module, "before_optimizations");
     hlo_modules.push_back(std::move(hlo_module));
   }
 
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index d4db95da8eb901af8a6675f2991def73ccfe8ee6..9b483bd97e91720ded089abca593541ae532dedd 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -82,12 +83,24 @@ class AotCompilationOptions {
   const DebugOptions& debug_options() const { return debug_options_; }
   DebugOptions* mutable_debug_options() { return &debug_options_; }
 
+  bool has_static_device_assignment() const {
+    return static_device_assignment_.has_value();
+  }
+  const DeviceAssignment& static_device_assignment() const {
+    CHECK(static_device_assignment_.has_value());
+    return *static_device_assignment_;
+  }
+  void set_static_device_assignment(const DeviceAssignment& device_assignment) {
+    static_device_assignment_ = device_assignment;
+  }
+
  protected:
   AotCompilationOptions();
 
  private:
   DeviceMemoryAllocator* device_allocator_ = nullptr;
   DebugOptions debug_options_;
+  absl::optional<DeviceAssignment> static_device_assignment_;
 };
 
 // Abstract superclass describing metadata produced during ahead-of-time
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index 4ea3a13f2835c5fef99c274f14d7d683c9ff5fc8..f1d0ca44f08688ccda5b4385d65eabc0fc2fc5e6 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -33,8 +33,8 @@ limitations under the License.
 namespace xla {
 
 // Tries to replace a conditional with a call operation of the corresponding
-// computation. If the given conditional has a constant predicate, tries to
-// replace it with a call to its true/false computation as appropriate and then
+// computation. If the given conditional has a constant branch_index, tries to
+// replace it with a call to its corresponding branch computation and then
 // inline that computation.
 //
 // Returns true if it made a change to the graph.
@@ -50,24 +50,30 @@ static StatusOr<bool> TryRemoveConditional(HloInstruction* conditional) {
     return false;
   }
 
-  if (conditional->operand(0)->opcode() != HloOpcode::kConstant) {
-    VLOG(2) << "Not attempting to remove conditional as its predicate is not a "
-               "compile-time constant: "
-            << conditional->ToShortString();
-    return false;
-  }
+  // We can always inline a 1-branch conditional due to default branch fallback.
+  int branch_index = 0;
+  if (conditional->branch_count() > 1) {
+    if (conditional->operand(0)->opcode() != HloOpcode::kConstant) {
+      VLOG(2) << "Not attempting to remove conditional as its branch_index is "
+                 "not a compile-time constant: "
+              << conditional->ToShortString();
+      return false;
+    }
 
+    if (conditional->operand(0)->shape().element_type() == PRED) {
+      branch_index = conditional->operand(0)->literal().Get<bool>({}) ? 0 : 1;
+    } else {
+      branch_index = conditional->operand(0)->literal().Get<int32>({});
+      if (branch_index < 0 || branch_index >= conditional->branch_count()) {
+        branch_index = conditional->branch_count() - 1;
+      }
+    }
+  }
   auto computation = conditional->parent();
   HloInstruction* call_op;
-  if (conditional->operand(0)->literal().Get<bool>({})) {
-    call_op = computation->AddInstruction(HloInstruction::CreateCall(
-        conditional->shape(), {conditional->mutable_operand(1)},
-        conditional->true_computation()));
-  } else {
-    call_op = computation->AddInstruction(HloInstruction::CreateCall(
-        conditional->shape(), {conditional->mutable_operand(2)},
-        conditional->false_computation()));
-  }
+  call_op = computation->AddInstruction(HloInstruction::CreateCall(
+      conditional->shape(), {conditional->mutable_operand(branch_index + 1)},
+      conditional->branch_computation(branch_index)));
   conditional->SetupDerivedInstruction(call_op);
   TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op));
   TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status());
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc
index b3ed27d9a849eced006eb3b01977ad2fe7ed7367..434bbe9ffd5da58901a65d1c51b77c33b9afa81c 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc
@@ -191,8 +191,9 @@ HloInstruction* GetExpandedFilterMask(
   // linspace to create a diagonal predicate.
   Shape predicate_shape = ShapeUtil::MakeShape(
       PRED, AsInt64Slice(expanded_filter_shape.dimensions()));
-  return add_instruction(HloInstruction::CreateBinary(
-      predicate_shape, HloOpcode::kEq, broadcasted_mask1, broadcasted_mask2));
+  return add_instruction(HloInstruction::CreateCompare(
+      predicate_shape, broadcasted_mask1, broadcasted_mask2,
+      ComparisonDirection::kEq));
 }
 
 // This function handles batch_group_counts which are relevant only for
@@ -215,81 +216,15 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
   };
 
   int64 input_batch_dimension = dim_numbers.input_batch_dimension();
-  int64 input_feature_dimension = dim_numbers.input_feature_dimension();
   int64 output_batch_dimension = dim_numbers.output_batch_dimension();
   int64 output_feature_dimension = dim_numbers.output_feature_dimension();
-  int64 kernel_input_feature_dimension =
-      dim_numbers.kernel_input_feature_dimension();
 
   int64 input_batch = activation->shape().dimensions(input_batch_dimension);
 
   // We are not yet supporting batch_group of sizes greater than 1.
   TF_RET_CHECK(input_batch == batch_group_count);
 
-  if (is_cost_viable_(convolution)) {
-    // Add a dimension to the activation, and reshape.
-    Shape reshaped_activation_shape = activation->shape();
-    ShapeUtil::AppendMajorDimension(1, &reshaped_activation_shape);
-
-    activation = add(
-        HloInstruction::CreateReshape(reshaped_activation_shape, activation));
-
-    // Add a dimension to the filter, and reshape.
-    Shape reshaped_filter_shape = filter->shape();
-    ShapeUtil::AppendMajorDimension(1, &reshaped_filter_shape);
-
-    filter = add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
-
-    int64 new_spatial_dim = reshaped_activation_shape.dimensions().size() - 1;
-
-    Shape new_output_shape = convolution->shape();
-    ShapeUtil::AppendMajorDimension(1, &new_output_shape);
-
-    int64 input_feature =
-        activation->shape().dimensions(input_feature_dimension);
-
-    // The code below edits convolution dimension numbers. Please refer to
-    // conv_op_helpers.cc to find how the dimensions were set up originally.
-
-    // Effectively, the new input batch becomes 1, and so does the kernel
-    // input feature. The original input batch now becomes a spatial dimension.
-    // The output batch (remember that the output is the new kernel for in
-    // backprop) becomes a spatial dimension too.
-
-    dim_numbers.set_input_batch_dimension(new_spatial_dim);
-    dim_numbers.set_input_feature_dimension(input_batch_dimension);
-    dim_numbers.set_kernel_input_feature_dimension(new_spatial_dim);
-
-    dim_numbers.add_input_spatial_dimensions(input_feature_dimension);
-    dim_numbers.add_kernel_spatial_dimensions(kernel_input_feature_dimension);
-
-    dim_numbers.add_output_spatial_dimensions(output_batch_dimension);
-    dim_numbers.set_output_batch_dimension(new_spatial_dim);
-
-    // Add window for the new spatial dimension.
-    Window new_window = convolution->window();
-    auto* dim = new_window.add_dimensions();
-    dim->set_window_dilation(1);
-    dim->set_base_dilation(1);
-    dim->set_stride(1);
-    dim->set_size(input_feature);
-
-    auto new_convolution = add(HloInstruction::CreateConvolve(
-        new_output_shape, activation, filter,
-        /*feature_group_count=*/batch_group_count, /*batch_group_count=*/1,
-        new_window, dim_numbers, convolution->precision_config()));
-
-    // Delete the extra spatial dimension, and reshape.
-    Shape reshaped_convolution_shape = ShapeUtil::DeleteDimension(
-        new_spatial_dim - 1, new_convolution->shape());
-    auto reshaped_convolution = HloInstruction::CreateReshape(
-        reshaped_convolution_shape, new_convolution);
-
-    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-        convolution, std::move(reshaped_convolution)));
-
-    changed_ = true;
-  } else {
+  if (!is_cost_viable_(convolution) || filter_expansion_) {
     // We first obtain the expanded the filter (which is the convolution
     // output). The batch dimension is the expanded one (which originally
     // represents kernel input feature dimension). We mask the filter to zero
@@ -316,14 +251,27 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
         expanded_filter_shape, HloOpcode::kSelect, filter_mask, new_convolution,
         zero_filter));
 
-    auto zero_literal = LiteralUtil::CreateR0(0.0f);
-    TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(F32));
+    PrimitiveType reduce_type = new_filter->shape().element_type();
+    auto reduce_window_shape = new_convolution->shape();
+    reduce_window_shape.set_dimensions(output_batch_dimension, 1);
+
+    // Ensure that data input to reduce window uses at least 32 bits.
+    if (primitive_util::BitWidth(reduce_type) < primitive_util::BitWidth(F32)) {
+      reduce_type = F32;
+      reduce_window_shape.set_element_type(F32);
+      Shape convert_shape = new_filter->shape();
+      convert_shape.set_element_type(F32);
+      new_filter =
+          add(HloInstruction::CreateConvert(convert_shape, new_filter));
+    }
+
+    auto zero_literal = LiteralUtil::Zero(reduce_type);
     auto zero_scalar =
         add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
     auto reduce_function = [&]() -> HloComputation* {
       HloComputation::Builder b("add_computation");
-      Shape shape = ShapeUtil::MakeShape(F32, {});
+      Shape shape = ShapeUtil::MakeShape(reduce_type, {});
       auto lhs =
           b.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
       auto rhs =
@@ -333,19 +281,6 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
       return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
     };
 
-    auto reduce_window_shape = new_convolution->shape();
-    reduce_window_shape.set_dimensions(output_batch_dimension, 1);
-
-    // Ensure that data input to reduce window is of type F32.
-    if (primitive_util::BitWidth(new_filter->shape().element_type()) <
-        primitive_util::BitWidth(F32)) {
-      reduce_window_shape.set_element_type(F32);
-      Shape convert_shape = new_filter->shape();
-      convert_shape.set_element_type(F32);
-      new_filter =
-          add(HloInstruction::CreateConvert(convert_shape, new_filter));
-    }
-
     // Create the reduce window.
     Window window;
     for (int64 i = 0; i < new_convolution->shape().dimensions_size(); ++i) {
@@ -375,6 +310,7 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
 
     TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
         convolution, std::move(reduce_window_converted)));
+    changed_ = true;
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
index 585b81a5db632901be863893bf723fcba19388ea..9cee3eda95252d6c7d725fbb03030bd58f52e71f 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
@@ -109,16 +109,16 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[16,19,19,512]{3,2,1,0}, filter: f32[16
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
-  auto cost_model = [](HloInstruction* conv) { return true; };
+  auto cost_model = [](HloInstruction* conv) { return false; };
   ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
                                       true);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
-  // Make sure the convolution is converted to one with batch_group_count = 1.
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConvolution);
-  EXPECT_EQ(root->operand(0)->batch_group_count(), 1);
-  // Verify that the convolution is replaced by a reshape.
-  EXPECT_EQ(root->opcode(), HloOpcode::kReshape);
+
+  // Verify that the convolution is replaced by a convert.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvert);
+  // Make sure the convert is being fed by a reduce window.
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kReduceWindow);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 5e26a63cebfa9b2e50f4b13335c10c246999d4df..8cb64a335301cd8b340ead9ad7d6ec868d9b2065 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
@@ -193,8 +194,8 @@ bool IndicesToCopyForWhile(const HloDataflowAnalysis& dataflow,
 // Add kCopy instructions around the given kWhile instruction to eliminate any
 // possible live range interference of HLO values assuming a dependency-based
 // ordering (HloDependencyOrdering). Copies are added conservatively. There
-// likely are copies which are not strictly necessary, but there are removed
-// later in the pass via CopyRemover.
+// likely are copies which are not strictly necessary, but they are removed
+// later in the pass via RemoveUnnecessaryCopies.
 //
 //
 // Elements (each ShapeIndex) in the loop state are considered independently.  A
@@ -310,17 +311,16 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
   return Status::OK();
 }
 
-// We add copies for all the indices of the true and false computation roots,
-// in order to resolve interference. We later rely on the CopyRemover to drop
-// the unnecessary ones.
+// We add copies for all the indices of the true and false computation roots, in
+// order to resolve interference. We later rely on RemoveUnnecessaryCopies to
+// drop the unnecessary ones.
 Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
                                HloInstruction* conditional) {
   VLOG(2) << "Adding copies for kConditional instruction "
           << conditional->name();
   TF_RET_CHECK(conditional->opcode() == HloOpcode::kConditional);
 
-  for (HloComputation* computation :
-       {conditional->true_computation(), conditional->false_computation()}) {
+  for (HloComputation* computation : conditional->branch_computations()) {
     HloInstruction* root = computation->root_instruction();
     std::vector<HloInstruction*> users = root->users();
     TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy,
@@ -335,7 +335,8 @@ Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
 
 // Conservatively adds copies before root instruction of entry computation and
 // each aliased parameter to resolve interference of aliased input and output
-// buffer. We later rely on the CopyRemover to drop the unnecessary ones.
+// buffer. We later rely on RemoveUnnecessaryCopies to drop the unnecessary
+// ones.
 Status AddCopiesForAliasedInputOutputs(HloModule* module) {
   HloComputation* entry = module->entry_computation();
   HloInstruction* root = entry->root_instruction();
@@ -433,587 +434,528 @@ Status StripControlDependenciesFrom(HloInstruction* instruction) {
   return Status::OK();
 }
 
-// Class for removing unnecessary copies from the module.
+// Class which tracks the HLO values within each HLO buffer in the module
+// during copy removal.
 //
-// kCopy instructions are added conservatively to guarantee no live range
-// interference between HLO values. This class uses a more fine-grained analysis
-// to remove some of these added copies which are not strictly necessary.
+// The values are held in a linked list where there is one list for each
+// buffer. Removing a copy instruction merges together the values in the
+// source buffer of the copy to the destination buffer of the copy. This class
+// tracks these value lists as copies are removed from the graph (and value
+// lists are merged).
+//
+// The CopyRemover object is initialized to match the state of
+// HloAliasAnalysis. However, as copies are removed this state diverges. The
+// values-to-buffer mapping is maintained outside of HloAliasAnalysis because
+// a fully updatable alias analysis is very slow.
 class CopyRemover {
  public:
-  CopyRemover(const HloAliasAnalysis& alias_analysis,
-              const HloOrdering& ordering, HloModule* module)
-      : module_(module),
-        alias_analysis_(alias_analysis),
-        buffer_value_tracker_(*module, alias_analysis, ordering) {}
-
-  // Try to elide the given copy. The copy is elided if the instruction is not
-  // necessary to prevent live-range interference of HLO values. Returns true if
-  // copy was elided.
+  // The values held in a single HLO buffer are represented using a linked
+  // list. An element type in this list is ValueNode.
   //
-  // The copy instruction is not actually removed here. Instead it is left for
-  // dead in the graph. Later calls to DCE will remove the instruction.
-  StatusOr<bool> TryElideCopy(HloInstruction* copy) {
-    if (buffer_value_tracker_.TryElideCopy(copy)) {
-      TF_RETURN_IF_ERROR(StripControlDependenciesFrom(copy));
-      TF_RETURN_IF_ERROR(copy->ReplaceAllUsesWith(copy->mutable_operand(0)));
-      return true;
-    }
-    return false;
-  }
-
-  string ToString() const {
-    string out = absl::StrCat("CopyRemover, module ", module_->name(), "\n");
-    StrAppend(&out, "  Buffer values, in dependency order:\n");
-    for (const HloBuffer& buffer : alias_analysis_.buffers()) {
-      StrAppend(&out, "    HloBuffer ", buffer.id(), ":\n");
-    }
-    return out;
-  }
-
- private:
-  // Class which tracks the HLO values within each HLO buffer in the module
-  // during copy removal.
-  //
-  // The values are held in a linked list where there is one list for each
-  // buffer. Removing a copy instruction merges together the values in the
-  // source buffer of the copy to the destination buffer of the copy. This class
-  // tracks these value lists as copies are removed from the graph (and value
-  // lists are merged).
-  //
-  // The BufferValueTracker object is initialized to match the state of
-  // HloAliasAnalysis. However, as copies are removed this state diverges. The
-  // values-to-buffer mapping is maintained outside of HloAliasAnalysis because
-  // a fully updatable alias analysis is very slow.
-  class BufferValueTracker {
-   public:
-    // The values held in a single HLO buffer are represented using a linked
-    // list. An element type in this list is ValueNode.
-    //
-    // This linked list is hand-rolled to enable efficient splicing of lists
-    // using only references to list elements without knowing which lists are
-    // being spliced. std::list requires a reference to the list object to
-    // splice.
-    struct ValueNode {
-      explicit ValueNode(const HloValue* v) : value(v) {}
-
-      const HloValue* value;
-
-      // The uses are maintained outside of HloValue::uses() because
-      // HloValue::uses() is not updatable (a fully updatable dataflow analysis
-      // is slow).
-      std::vector<const HloUse*> uses;
-
-      // next/prev elements in the linked list. The list is circularly linked so
-      // these values are never null for elements in the list.
-      ValueNode* prev = nullptr;
-      ValueNode* next = nullptr;
-    };
+  // This linked list is hand-rolled to enable efficient splicing of lists
+  // using only references to list elements without knowing which lists are
+  // being spliced. std::list requires a reference to the list object to
+  // splice.
+  struct ValueNode {
+    explicit ValueNode(const HloValue* v) : value(v) {}
+
+    const HloValue* value;
+
+    // The uses are maintained outside of HloValue::uses() because
+    // HloValue::uses() is not updatable (a fully updatable dataflow analysis
+    // is slow).
+    std::vector<const HloUse*> uses;
+
+    // next/prev elements in the linked list. The list is circularly linked so
+    // these values are never null for elements in the list.
+    ValueNode* prev = nullptr;
+    ValueNode* next = nullptr;
+  };
 
-    BufferValueTracker(const HloModule& module,
-                       const HloAliasAnalysis& alias_analysis,
-                       const HloOrdering& ordering)
-        : dataflow_(alias_analysis.dataflow_analysis()), ordering_(ordering) {
-      // Construct a list for each HLO buffer in the alias analysis. Maintain a
-      // map from HloValue to the respective list element representing that
-      // value. The map is used to construct the copy info map below.
-      absl::flat_hash_map<const HloValue*, ValueNode*> value_to_node;
-      for (const HloBuffer& buffer : alias_analysis.buffers()) {
-        // Verify values contained in the buffer are strictly ordered. This
-        // should always be the case after adding copies to eliminate
-        // interference. Specifically, the addition of the control flow edges
-        // between copies added around aliased operations (kWhile) guarantees
-        // this strict order.
-        for (const HloValue* value_a : buffer.values()) {
-          if (value_a->shape().IsToken()) {
-            // Token values have no representation and cannot interfere.
-            continue;
-          }
-          for (const HloValue* value_b : buffer.values()) {
-            if (value_a != value_b) {
-              DCHECK(ordering_.LiveRangeStrictlyBefore(*value_a, *value_b,
-                                                       dataflow_) ||
-                     ordering_.LiveRangeStrictlyBefore(*value_b, *value_a,
-                                                       dataflow_))
-                  << value_a->ToShortString() << " and "
-                  << value_b->ToShortString() << " are not ordered";
-            }
+  CopyRemover(const HloModule& module, const HloAliasAnalysis& alias_analysis,
+              const HloOrdering& ordering)
+      : dataflow_(alias_analysis.dataflow_analysis()), ordering_(ordering) {
+    // Construct a list for each HLO buffer in the alias analysis. Maintain a
+    // map from HloValue to the respective list element representing that
+    // value. The map is used to construct the copy info map below.
+    absl::flat_hash_map<const HloValue*, ValueNode*> value_to_node;
+    for (const HloBuffer& buffer : alias_analysis.buffers()) {
+      // Verify values contained in the buffer are strictly ordered. This
+      // should always be the case after adding copies to eliminate
+      // interference. Specifically, the addition of the control flow edges
+      // between copies added around aliased operations (kWhile) guarantees
+      // this strict order.
+      for (const HloValue* value_a : buffer.values()) {
+        if (value_a->shape().IsToken()) {
+          // Token values have no representation and cannot interfere.
+          continue;
+        }
+        for (const HloValue* value_b : buffer.values()) {
+          if (value_a != value_b) {
+            DCHECK(ordering_.LiveRangeStrictlyBefore(*value_a, *value_b,
+                                                     dataflow_) ||
+                   ordering_.LiveRangeStrictlyBefore(*value_b, *value_a,
+                                                     dataflow_))
+                << value_a->ToShortString() << " and "
+                << value_b->ToShortString() << " are not ordered";
           }
         }
-
-        std::vector<const HloValue*> values = buffer.values();
-        absl::c_sort(values, [this](const HloValue* a, const HloValue* b) {
-          return ordering_.IsDefinedBefore(*a, *b);
-        });
-
-        // Create a list containing all of the values in the buffer.
-        AddValueList(values, &value_to_node);
       }
 
-      // Create copy_map_ which contains the source and destination values
-      // of all copies.
-      CreateCopyMap(module, value_to_node);
+      std::vector<const HloValue*> values = buffer.values();
+      absl::c_sort(values, [this](const HloValue* a, const HloValue* b) {
+        return ordering_.IsDefinedBefore(*a, *b);
+      });
 
-      XLA_VLOG_LINES(3, ToString());
-      TF_DCHECK_OK(Verify());
+      // Create a list containing all of the values in the buffer.
+      AddValueList(values, &value_to_node);
     }
 
-    // Add a list containing the given values to BufferValueTracker. This
-    // represents the values contained in a single buffer. For each value in
-    // 'values' an entry is created in value_to_node which indicates the
-    // respective ValueNode representing that value.
-    void AddValueList(
-        absl::Span<const HloValue* const> values,
-        absl::flat_hash_map<const HloValue*, ValueNode*>* value_to_node) {
-      ValueNode* tail = nullptr;
-      ValueNode* head = nullptr;
-      for (const HloValue* value : values) {
-        auto new_node = new ValueNode(value);
-        (*value_to_node)[value] = new_node;
-
-        // Copy the HLO values's uses into the ValueNode for the value. These
-        // uses in ValueNode are updated as copies are removed.
-        new_node->uses.reserve(value->uses().size());
-        for (const HloUse& use : value->uses()) {
-          new_node->uses.push_back(&use);
-        }
+    // Create copy_map_ which contains the source and destination values
+    // of all copies.
+    CreateCopyMap(module, value_to_node);
 
-        // Connect the new node into the linked list.
-        if (tail == nullptr) {
-          head = new_node;
-        } else {
-          tail->next = new_node;
-          new_node->prev = tail;
-        }
-        tail = new_node;
+    XLA_VLOG_LINES(3, ToString());
+    TF_DCHECK_OK(Verify());
+  }
+
+  // Add a list containing the given values to CopyRemover. This
+  // represents the values contained in a single buffer. For each value in
+  // 'values' an entry is created in value_to_node which indicates the
+  // respective ValueNode representing that value.
+  void AddValueList(
+      absl::Span<const HloValue* const> values,
+      absl::flat_hash_map<const HloValue*, ValueNode*>* value_to_node) {
+    ValueNode* tail = nullptr;
+    ValueNode* head = nullptr;
+    for (const HloValue* value : values) {
+      auto new_node = new ValueNode(value);
+      (*value_to_node)[value] = new_node;
+
+      // Copy the HLO values's uses into the ValueNode for the value. These
+      // uses in ValueNode are updated as copies are removed.
+      new_node->uses.reserve(value->uses().size());
+      for (const HloUse& use : value->uses()) {
+        new_node->uses.push_back(&use);
       }
 
-      // The linked list is circular so connect the head and tail.
-      tail->next = head;
-      head->prev = tail;
-      value_lists_.insert(head);
+      // Connect the new node into the linked list.
+      if (tail == nullptr) {
+        head = new_node;
+      } else {
+        tail->next = new_node;
+        new_node->prev = tail;
+      }
+      tail = new_node;
     }
 
-    // This method also fills in copy_map_ which indicates which nodes
-    // in the value lists corresponding to the source and destination values of
-    // kCopy instructions. value_to_node should map each HloValue to its
-    // respective ValueNode.
-    void CreateCopyMap(
-        const HloModule& module,
-        const absl::flat_hash_map<const HloValue*, ValueNode*>& value_to_node) {
-      for (HloComputation* computation : module.computations()) {
-        for (HloInstruction* instruction : computation->instructions()) {
-          // Add copies with unambiguous source values to the map. Copies with
-          // ambiguous sources are not removable.
-          if (instruction->opcode() == HloOpcode::kCopy) {
-            const HloValueSet& src_value_set =
-                dataflow_.GetValueSet(instruction->operand(0));
-            if (src_value_set.values().size() == 1) {
-              CopyNodes& copy_node = copy_map_[instruction];
-              copy_node.dest =
-                  value_to_node.at(&dataflow_.GetUniqueValueAt(instruction));
-              copy_node.src = value_to_node.at(&src_value_set.GetUniqueValue());
-            }
+    // The linked list is circular so connect the head and tail.
+    tail->next = head;
+    head->prev = tail;
+    value_lists_.insert(head);
+  }
+
+  // This method also fills in copy_map_ which indicates which nodes
+  // in the value lists corresponding to the source and destination values of
+  // kCopy instructions. value_to_node should map each HloValue to its
+  // respective ValueNode.
+  void CreateCopyMap(
+      const HloModule& module,
+      const absl::flat_hash_map<const HloValue*, ValueNode*>& value_to_node) {
+    for (HloComputation* computation : module.computations()) {
+      for (HloInstruction* instruction : computation->instructions()) {
+        // Add copies with unambiguous source values to the map. Copies with
+        // ambiguous sources are not removable.
+        if (instruction->opcode() == HloOpcode::kCopy) {
+          const HloValueSet& src_value_set =
+              dataflow_.GetValueSet(instruction->operand(0));
+          if (src_value_set.values().size() == 1) {
+            CopyNodes& copy_node = copy_map_[instruction];
+            copy_node.dest =
+                value_to_node.at(&dataflow_.GetUniqueValueAt(instruction));
+            copy_node.src = value_to_node.at(&src_value_set.GetUniqueValue());
           }
         }
       }
     }
+  }
 
-    ~BufferValueTracker() {
-      for (const ValueNode* head : value_lists_) {
-        const ValueNode* p = head;
-        do {
-          const ValueNode* tmp = p->next;
-          delete p;
-          p = tmp;
-        } while (p != head);
-      }
+  ~CopyRemover() {
+    for (const ValueNode* head : value_lists_) {
+      const ValueNode* p = head;
+      do {
+        const ValueNode* tmp = p->next;
+        delete p;
+        p = tmp;
+      } while (p != head);
     }
+  }
 
-    // Verify invariants within the linked lists.
-    Status Verify() const {
-      for (const ValueNode* head : value_lists_) {
-        const ValueNode* p = head;
-        do {
-          // Verify links between elements are consistent.
-          TF_RET_CHECK(p->prev->next == p);
-          TF_RET_CHECK(p->next->prev == p);
-
-          const HloInstruction* def = p->value->defining_instruction();
-          if (def->opcode() == HloOpcode::kCopy &&
-              ContainsKey(copy_map_, def)) {
-            TF_RET_CHECK(copy_map_.at(def).dest == p);
-          }
-          for (const HloUse* use : p->uses) {
-            if (use->instruction->opcode() == HloOpcode::kCopy &&
-                ContainsKey(copy_map_, use->instruction)) {
-              TF_RET_CHECK(copy_map_.at(use->instruction).src == p);
-            }
+  // Verify invariants within the linked lists.
+  Status Verify() const {
+    for (const ValueNode* head : value_lists_) {
+      const ValueNode* p = head;
+      do {
+        // Verify links between elements are consistent.
+        TF_RET_CHECK(p->prev->next == p);
+        TF_RET_CHECK(p->next->prev == p);
+
+        const HloInstruction* def = p->value->defining_instruction();
+        if (def->opcode() == HloOpcode::kCopy && ContainsKey(copy_map_, def)) {
+          TF_RET_CHECK(copy_map_.at(def).dest == p);
+        }
+        for (const HloUse* use : p->uses) {
+          if (use->instruction->opcode() == HloOpcode::kCopy &&
+              ContainsKey(copy_map_, use->instruction)) {
+            TF_RET_CHECK(copy_map_.at(use->instruction).src == p);
           }
+        }
 
-          p = p->next;
-        } while (p != head);
-      }
-      return Status::OK();
+        p = p->next;
+      } while (p != head);
     }
+    return Status::OK();
+  }
 
-    // Try to elide the given copy. Elision of a copy is possible only if no
-    // live range interference is introduced by the copy's elimination. If
-    // elision is possible, then the internal state (value lists) are updated,
-    // and true is returned. Returns false otherwise.
-    bool TryElideCopy(const HloInstruction* copy) {
-      VLOG(2) << "Trying to remove " << copy->name();
+  // Try to elide the given copy. Elision of a copy is possible only if no
+  // live range interference is introduced by the copy's elimination. If
+  // elision is possible, then the internal state (value lists) are updated,
+  // and true is returned. Returns false otherwise.
+  bool TryElideCopy(const HloInstruction* copy) {
+    VLOG(2) << "Trying to remove " << copy->name();
 
-      if (!ContainsKey(copy_map_, copy)) {
-        VLOG(2) << copy->name() << " is not removable";
-        return false;
-      }
-      if (!ShapeUtil::Equal(copy->shape(), copy->operand(0)->shape())) {
-        VLOG(2) << copy->name() << " is not removable (shape mismatch)";
+    if (!ContainsKey(copy_map_, copy)) {
+      VLOG(2) << copy->name() << " is not removable";
+      return false;
+    }
+    if (!ShapeUtil::Equal(copy->shape(), copy->operand(0)->shape())) {
+      VLOG(2) << copy->name() << " is not removable (shape mismatch)";
+      return false;
+    }
+    const CopyNodes& copy_node = copy_map_.at(copy);
+    ValueNode* src = copy_node.src;
+    ValueNode* dest = copy_node.dest;
+    DCHECK(src != nullptr);
+    DCHECK(dest != nullptr);
+
+    auto is_live_range_before = [this](const ValueNode& a, const ValueNode& b) {
+      VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value;
+      if (LiveRangeBefore(a, b)) {
+        VLOG(2) << "  Live range of " << a.value->ToShortString()
+                << " is before " << b.value->ToShortString();
+        return true;
+      } else {
+        VLOG(2) << "  Live range of " << a.value->ToShortString()
+                << " is not before " << b.value->ToShortString();
         return false;
       }
-      const CopyNodes& copy_node = copy_map_.at(copy);
-      ValueNode* src = copy_node.src;
-      ValueNode* dest = copy_node.dest;
-      DCHECK(src != nullptr);
-      DCHECK(dest != nullptr);
-
-      auto is_live_range_before = [this](const ValueNode& a,
-                                         const ValueNode& b) {
-        VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value;
-        if (LiveRangeBefore(a, b)) {
-          VLOG(2) << "  Live range of " << a.value->ToShortString()
-                  << " is before " << b.value->ToShortString();
-          return true;
-        } else {
-          VLOG(2) << "  Live range of " << a.value->ToShortString()
-                  << " is not before " << b.value->ToShortString();
-          return false;
-        }
-      };
+    };
 
-      VLOG(3) << copy->name() << " copies value "
-              << src->value->ToShortString();
-      VLOG(3) << "Source buffer values: " << ValueListToString(src);
-      VLOG(3) << "Dest buffer values: " << ValueListToString(dest);
+    VLOG(3) << copy->name() << " copies value " << src->value->ToShortString();
+    VLOG(3) << "Source buffer values: " << ValueListToString(src);
+    VLOG(3) << "Dest buffer values: " << ValueListToString(dest);
 
-      // A kCopy instruction copies an HLO value from a source buffer and
-      // defines an HLO value in a destination buffer. Most generally, the
-      // source and destination buffers may each hold more than one value at
-      // different points in the computation so we define the following:
-      //
-      //   Values in source buffer:      {s_0, ..., s_n}
-      //   Values in destination buffer: {d_0, ..., d_m}
-      //
-      // A kCopy instruction between these buffers copies a value s_x in the
-      // source buffer and defines a value d_y in the destination buffer. The
-      // elision of a copy merges the source and destination buffers together,
-      // so the list of values for the source and destination buffers are
-      // merged.
-      //
-      // We handle two different cases for copy elision:
-      //
-      //  (1) the kCopy defines the first value in the destination buffer (d_0).
+    // A kCopy instruction copies an HLO value from a source buffer and
+    // defines an HLO value in a destination buffer. Most generally, the
+    // source and destination buffers may each hold more than one value at
+    // different points in the computation so we define the following:
+    //
+    //   Values in source buffer:      {s_0, ..., s_n}
+    //   Values in destination buffer: {d_0, ..., d_m}
+    //
+    // A kCopy instruction between these buffers copies a value s_x in the
+    // source buffer and defines a value d_y in the destination buffer. The
+    // elision of a copy merges the source and destination buffers together,
+    // so the list of values for the source and destination buffers are
+    // merged.
+    //
+    // We handle two different cases for copy elision:
+    //
+    //  (1) the kCopy defines the first value in the destination buffer (d_0).
+    //
+    //  (2) the kCopy copies the last value in the source buffer (s_n).
+    //
+    // For the remaining case where the kCopy copies a not-last value from the
+    // source buffer to a not-first value of the destination buffer, the kCopy
+    // instruction cannot be removed. This case is generated, for example, if
+    // the kCopy copies a while body parameter of the loop state at one tuple
+    // index to a different tuple index in the while body root. Removal of the
+    // copy necessarily results in live range interference of values in the
+    // loop state at the two different tuple indices.
+    //
+    //  We can only perform copy elision if the resulting merged values have
+    //  totally ordered live ranges; otherwise the merged buffer would have
+    //  live range interference.
+    if (src->next == dest) {
+      // In the process of eliding copies, its possible for a copy to have the
+      // same source and destination buffer. In this case, the copy can be
+      // safely removed.
+      VLOG(2) << copy->name() << " source and destination buffers are same.";
+    } else if (IsHead(*dest)) {
+      // The copy copies an arbitrary value in the source buffer (call it s_x)
+      // and defines d_0, the first value in the destination buffer. After
+      // merging, the values in the combined buffer must be strictly ordered
+      // as follows** to elide the copy:
       //
-      //  (2) the kCopy copies the last value in the source buffer (s_n).
+      // {s_0, ..., s_x, d_1, ..., d_m, s_{x+1}, ..., s_n}
       //
-      // For the remaining case where the kCopy copies a not-last value from the
-      // source buffer to a not-first value of the destination buffer, the kCopy
-      // instruction cannot be removed. This case is generated, for example, if
-      // the kCopy copies a while body parameter of the loop state at one tuple
-      // index to a different tuple index in the while body root. Removal of the
-      // copy necessarily results in live range interference of values in the
-      // loop state at the two different tuple indices.
+      // Removing the copy eliminates d_0, and uses of d_0 become uses of
+      // s_x. In the above ordering, the live range of d_m must be ordered
+      // before the live range of s_{x+1} and the definition and all uses of
+      // s_x must be ordered before the definition of d_1. These conditions
+      // are checked below prior to elision.
       //
-      //  We can only perform copy elision if the resulting merged values have
-      //  totally ordered live ranges; otherwise the merged buffer would have
-      //  live range interference.
-      if (src->next == dest) {
-        // In the process of eliding copies, its possible for a copy to have the
-        // same source and destination buffer. In this case, the copy can be
-        // safely removed.
-        VLOG(2) << copy->name() << " source and destination buffers are same.";
-      } else if (IsHead(*dest)) {
-        // The copy copies an arbitrary value in the source buffer (call it s_x)
-        // and defines d_0, the first value in the destination buffer. After
-        // merging, the values in the combined buffer must be strictly ordered
-        // as follows** to elide the copy:
-        //
-        // {s_0, ..., s_x, d_1, ..., d_m, s_{x+1}, ..., s_n}
-        //
-        // Removing the copy eliminates d_0, and uses of d_0 become uses of
-        // s_x. In the above ordering, the live range of d_m must be ordered
-        // before the live range of s_{x+1} and the definition and all uses of
-        // s_x must be ordered before the definition of d_1. These conditions
-        // are checked below prior to elision.
-        //
-        // ** Technically it might be possible to have a non-interfering
-        //    non-trivial interleaving of the values of the source and
-        //    destination buffers in the resulting order. However, this case is
-        //    slow and complicated to check and likely not worth it. So instead
-        //    we simply check for the case where *all* values of the destination
-        //    buffer (d_1 through d_m) are spliced into the point where the copy
-        //    used to be.
-        VLOG(2) << copy->name() << " defines the first value in its buffer";
-        ValueNode* next_dest = Next(*dest);
-        if (next_dest != nullptr) {
-          // Live range of 'from' value (s_x) must be before 'next_dest' (d_1);
-          if (!is_live_range_before(*src, *next_dest)) {
-            return false;
-          }
-        }
-        ValueNode* next_src = Next(*src);
-
-        if (next_src != nullptr) {
-          // Live range of 'last_dest' (d_m) must be before 'next_src' s_{x+1}.
-          ValueNode* last_dest = dest->prev;
-          DCHECK(IsTail(*last_dest));
-          if (!is_live_range_before(*last_dest, *next_src)) {
-            return false;
-          }
-        }
-
-        // Splice in destination buffer values list right after 'src'.
-        SpliceAfter(dest, src);
-      } else if (IsTail(*src)) {
-        // The copy copies the last value in the source buffer, s_n, and defines
-        // an arbitrary value in the destination buffer, d_y.  After
-        // merging, the values in the combined buffer must be strictly ordered
-        // as follows** to elide the copy:
-        //
-        // {d_0, ..., d_{y-1}, s_0, ..., s_n, d_{y+1}, ..., d_m}
-        //
-        // Removing the copy eliminates d_y, and uses of d_y become uses of
-        // s_n. To enforce the above order, the live range of d_{y-1} must be
-        // before the live range of s_0, and the live range of s_n must be
-        // before the live range of d_{y+1}.
-        //
-        // ** See comment above in the code handling Case (1).
-        VLOG(2) << copy->name() << " copies the last value ("
-                << src->value->ToShortString() << ") in its buffer";
-
-        ValueNode* prev_dest = Prev(*dest);
-        // nullptr condition handled above in the first 'if' case.
-        DCHECK(prev_dest != nullptr);
-        ValueNode* first_src = src->next;
-        DCHECK(IsHead(*first_src));
-        if (!is_live_range_before(*prev_dest, *first_src)) {
-          // Live range of value d_{y-1} is not before s_0.
+      // ** Technically it might be possible to have a non-interfering
+      //    non-trivial interleaving of the values of the source and
+      //    destination buffers in the resulting order. However, this case is
+      //    slow and complicated to check and likely not worth it. So instead
+      //    we simply check for the case where *all* values of the destination
+      //    buffer (d_1 through d_m) are spliced into the point where the copy
+      //    used to be.
+      VLOG(2) << copy->name() << " defines the first value in its buffer";
+      ValueNode* next_dest = Next(*dest);
+      if (next_dest != nullptr) {
+        // Live range of 'from' value (s_x) must be before 'next_dest' (d_1);
+        if (!is_live_range_before(*src, *next_dest)) {
           return false;
         }
-        ValueNode* next_dest = Next(*dest);
-        if (next_dest != nullptr) {
-          if (!is_live_range_before(*src, *next_dest)) {
-            // Live range of value s_n is not before d_{y+1}.
-            return false;
-          }
-        }
-
-        // Splice source buffer values list right after 'prev_dest'.
-        SpliceAfter(first_src, prev_dest);
-      } else {
-        VLOG(2)
-            << copy->name()
-            << " copies value in middle of source buffer to value in middle "
-               "of destination buffer";
-        return false;
       }
+      ValueNode* next_src = Next(*src);
 
-      RemoveCopyValue(dest);
-
-      XLA_VLOG_LINES(4, ToString());
-      TF_DCHECK_OK(Verify());
-
-      return true;
-    }
-
-    // Delete the given ValueNode associated with a elided kCopy
-    // instruction. This should be called after splicing the value lists of the
-    // source and destination buffers together.
-    void RemoveCopyValue(ValueNode* copy_value_node) {
-      CHECK_EQ(copy_value_node->value->defining_instruction()->opcode(),
-               HloOpcode::kCopy);
-      ValueNode* operand_node = copy_value_node->prev;
-      CHECK(operand_node != copy_value_node);
-
-      VLOG(2) << "Removing copy " << operand_node->value->ToShortString()
-              << " => " << copy_value_node->value->ToShortString();
-
-      // Splice out the copy value node.
-      operand_node->next = copy_value_node->next;
-      copy_value_node->next->prev = operand_node;
-
-      // Patch up uses. Remove use of copy from operand_node uses.
-      auto it = absl::c_find_if(
-          operand_node->uses, [copy_value_node](const HloUse* use) {
-            return use->instruction ==
-                   copy_value_node->value->defining_instruction();
-          });
-      CHECK(it != operand_node->uses.end());
-      operand_node->uses.erase(it);
-
-      // If the elided copy has any uses which are themselves kCopy instructions
-      // then patch up the copy info to reflect the that this kCopy instruction
-      // has a different operand (the operand of the elided copy).
-      for (const HloUse* copy_use : copy_value_node->uses) {
-        operand_node->uses.push_back(copy_use);
-        if (copy_use->instruction->opcode() == HloOpcode::kCopy &&
-            ContainsKey(copy_map_, copy_use->instruction)) {
-          copy_map_.at(copy_use->instruction).src = operand_node;
+      if (next_src != nullptr) {
+        // Live range of 'last_dest' (d_m) must be before 'next_src' s_{x+1}.
+        ValueNode* last_dest = dest->prev;
+        DCHECK(IsTail(*last_dest));
+        if (!is_live_range_before(*last_dest, *next_src)) {
+          return false;
         }
       }
 
-      // Delete the copy info and the value node.
-      copy_map_.erase(copy_value_node->value->defining_instruction());
-      delete copy_value_node;
-    }
-
-    // Returns true if the live range of given value 'a' is before the live
-    // range of 'b'.
-    //
-    // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not
-    // updated as copies are removed.
-    bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
-      if (a.uses.empty()) {
-        VLOG(2) << "Empty uses for " << *a.value;
-        return ordering_.IsDefinedBefore(*a.value, *b.value);
+      // Splice in destination buffer values list right after 'src'.
+      SpliceAfter(dest, src);
+    } else if (IsTail(*src)) {
+      // The copy copies the last value in the source buffer, s_n, and defines
+      // an arbitrary value in the destination buffer, d_y.  After
+      // merging, the values in the combined buffer must be strictly ordered
+      // as follows** to elide the copy:
+      //
+      // {d_0, ..., d_{y-1}, s_0, ..., s_n, d_{y+1}, ..., d_m}
+      //
+      // Removing the copy eliminates d_y, and uses of d_y become uses of
+      // s_n. To enforce the above order, the live range of d_{y-1} must be
+      // before the live range of s_0, and the live range of s_n must be
+      // before the live range of d_{y+1}.
+      //
+      // ** See comment above in the code handling Case (1).
+      VLOG(2) << copy->name() << " copies the last value ("
+              << src->value->ToShortString() << ") in its buffer";
+
+      ValueNode* prev_dest = Prev(*dest);
+      // nullptr condition handled above in the first 'if' case.
+      DCHECK(prev_dest != nullptr);
+      ValueNode* first_src = src->next;
+      DCHECK(IsHead(*first_src));
+      if (!is_live_range_before(*prev_dest, *first_src)) {
+        // Live range of value d_{y-1} is not before s_0.
+        return false;
       }
-      for (const HloUse* use : a.uses) {
-        VLOG(2) << "Checking use " << *use << " against " << *b.value;
-        if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
-          VLOG(2) << "Use " << *use << " is NOT before " << *b.value;
+      ValueNode* next_dest = Next(*dest);
+      if (next_dest != nullptr) {
+        if (!is_live_range_before(*src, *next_dest)) {
+          // Live range of value s_n is not before d_{y+1}.
           return false;
         }
-        VLOG(2) << "Use " << *use << " is before " << *b.value;
       }
-      return true;
-    }
 
-    // Returns whether 'node' is the last node in its list.
-    bool IsTail(const ValueNode& node) const {
-      return ContainsKey(value_lists_, node.next);
+      // Splice source buffer values list right after 'prev_dest'.
+      SpliceAfter(first_src, prev_dest);
+    } else {
+      VLOG(2) << copy->name()
+              << " copies value in middle of source buffer to value in middle "
+                 "of destination buffer";
+      return false;
     }
 
-    // Returns whether 'node' is the first node in its list.
-    bool IsHead(const ValueNode& node) const {
-      return ContainsKey(value_lists_, &node);
-    }
+    RemoveCopyValue(dest);
 
-    // Returns the next node in the list after 'node'. If 'node' is the
-    // tail, then nullptr is returned.
-    ValueNode* Next(const ValueNode& node) const {
-      if (IsTail(node)) {
-        return nullptr;
-      } else {
-        return node.next;
+    XLA_VLOG_LINES(4, ToString());
+    TF_DCHECK_OK(Verify());
+
+    return true;
+  }
+
+  // Delete the given ValueNode associated with a elided kCopy
+  // instruction. This should be called after splicing the value lists of the
+  // source and destination buffers together.
+  void RemoveCopyValue(ValueNode* copy_value_node) {
+    CHECK_EQ(copy_value_node->value->defining_instruction()->opcode(),
+             HloOpcode::kCopy);
+    ValueNode* operand_node = copy_value_node->prev;
+    CHECK(operand_node != copy_value_node);
+
+    VLOG(2) << "Removing copy " << operand_node->value->ToShortString()
+            << " => " << copy_value_node->value->ToShortString();
+
+    // Splice out the copy value node.
+    operand_node->next = copy_value_node->next;
+    copy_value_node->next->prev = operand_node;
+
+    // Patch up uses. Remove use of copy from operand_node uses.
+    auto it = absl::c_find_if(operand_node->uses, [copy_value_node](
+                                                      const HloUse* use) {
+      return use->instruction == copy_value_node->value->defining_instruction();
+    });
+    CHECK(it != operand_node->uses.end());
+    operand_node->uses.erase(it);
+
+    // If the elided copy has any uses which are themselves kCopy instructions
+    // then patch up the copy info to reflect the that this kCopy instruction
+    // has a different operand (the operand of the elided copy).
+    for (const HloUse* copy_use : copy_value_node->uses) {
+      operand_node->uses.push_back(copy_use);
+      if (copy_use->instruction->opcode() == HloOpcode::kCopy &&
+          ContainsKey(copy_map_, copy_use->instruction)) {
+        copy_map_.at(copy_use->instruction).src = operand_node;
       }
     }
 
-    // Returns the previous node in the list before 'node'. If 'node'
-    // is the head, then nullptr is returned.
-    ValueNode* Prev(const ValueNode& node) const {
-      if (IsHead(node)) {
-        return nullptr;
-      } else {
-        return node.prev;
+    // Delete the copy info and the value node.
+    copy_map_.erase(copy_value_node->value->defining_instruction());
+    delete copy_value_node;
+  }
+
+  // Returns true if the live range of given value 'a' is before the live
+  // range of 'b'.
+  //
+  // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not
+  // updated as copies are removed.
+  bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
+    if (a.uses.empty()) {
+      VLOG(2) << "Empty uses for " << *a.value;
+      return ordering_.IsDefinedBefore(*a.value, *b.value);
+    }
+    for (const HloUse* use : a.uses) {
+      VLOG(2) << "Checking use " << *use << " against " << *b.value;
+      if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
+        VLOG(2) << "Use " << *use << " is NOT before " << *b.value;
+        return false;
       }
+      VLOG(2) << "Use " << *use << " is before " << *b.value;
     }
+    return true;
+  }
 
-    // Splices the entire linked list with 'head' as its head right after the
-    // node 'insert_after' in another linked list.
-    void SpliceAfter(ValueNode* head, ValueNode* insert_after) {
-      DCHECK(IsHead(*head));
-      value_lists_.erase(head);
+  // Returns whether 'node' is the last node in its list.
+  bool IsTail(const ValueNode& node) const {
+    return ContainsKey(value_lists_, node.next);
+  }
 
-      ValueNode* tail = head->prev;
-      tail->next = insert_after->next;
-      insert_after->next->prev = tail;
+  // Returns whether 'node' is the first node in its list.
+  bool IsHead(const ValueNode& node) const {
+    return ContainsKey(value_lists_, &node);
+  }
 
-      insert_after->next = head;
-      head->prev = insert_after;
+  // Returns the next node in the list after 'node'. If 'node' is the
+  // tail, then nullptr is returned.
+  ValueNode* Next(const ValueNode& node) const {
+    if (IsTail(node)) {
+      return nullptr;
+    } else {
+      return node.next;
     }
+  }
 
-    string ValueListToString(const ValueNode* element) {
-      const ValueNode* head = element;
-      while (!IsHead(*head)) {
-        head = Prev(*head);
-      }
-      std::vector<const HloValue*> values;
-      for (const ValueNode* p = head; p != nullptr; p = Next(*p)) {
-        values.push_back(p->value);
-      }
-      return absl::StrCat("{",
-                          absl::StrJoin(values, ", ",
-                                        [](string* s, const HloValue* value) {
-                                          StrAppend(s, value->ToShortString());
-                                        }),
-                          "}");
+  // Returns the previous node in the list before 'node'. If 'node'
+  // is the head, then nullptr is returned.
+  ValueNode* Prev(const ValueNode& node) const {
+    if (IsHead(node)) {
+      return nullptr;
+    } else {
+      return node.prev;
     }
+  }
 
-    string ToString() const {
-      string out = absl::StrCat("BufferValueTracker:\n");
-      StrAppend(&out, "  Def-use chains in each buffer:\n");
-      for (const ValueNode* head : value_lists_) {
-        StrAppend(&out, "    Buffer defined by ", head->value->ToShortString(),
-                  ":\n");
-        const ValueNode* p = head;
-        do {
-          StrAppend(&out, "      ", p->value->ToShortString(), ", uses: ",
-                    absl::StrJoin(p->uses, "; ",
-                                  [](string* s, const HloUse* use) {
-                                    StrAppend(s, use->ToString());
-                                  }),
-                    "\n");
-
-          p = p->next;
-        } while (p != head);
-      }
-      StrAppend(&out, "  Potentially removable copies:\n");
-      for (const auto& pair : copy_map_) {
-        const HloInstruction* copy = pair.first;
-        const CopyNodes& copy_info = pair.second;
-
-        StrAppend(&out, "    ", copy->name(), " : ",
-                  copy_info.src->value->ToShortString(), " => ",
-                  copy_info.dest->value->ToShortString(), "\n");
-      }
-      return out;
-    }
+  // Splices the entire linked list with 'head' as its head right after the
+  // node 'insert_after' in another linked list.
+  void SpliceAfter(ValueNode* head, ValueNode* insert_after) {
+    DCHECK(IsHead(*head));
+    value_lists_.erase(head);
 
-   private:
-    const HloDataflowAnalysis& dataflow_;
-    const HloOrdering& ordering_;
-
-    // The heads of all the value lists. Each value list represents the HLO
-    // values contained in a particular HLO buffer. The values in the list are
-    // in dependency order.
-    absl::flat_hash_set<const ValueNode*> value_lists_;
-
-    // Copy removal requires fast access to the value list elements
-    // corresponding to the source and destination values of the kCopy
-    // instruction. This data structure holds pointers to these elements for
-    // each kCopy instruction in the graph.
-    struct CopyNodes {
-      // The source and destinations values of the kCopy instruction.
-      ValueNode* src = nullptr;
-      ValueNode* dest = nullptr;
-    };
-    absl::flat_hash_map<const HloInstruction*, CopyNodes> copy_map_;
-  };
+    ValueNode* tail = head->prev;
+    tail->next = insert_after->next;
+    insert_after->next->prev = tail;
 
-  HloModule* module_;
-  const HloAliasAnalysis& alias_analysis_;
+    insert_after->next = head;
+    head->prev = insert_after;
+  }
 
-  // Object tracking the HLO values contained in each HLO buffer.
-  BufferValueTracker buffer_value_tracker_;
-};
+  string ValueListToString(const ValueNode* element) {
+    const ValueNode* head = element;
+    while (!IsHead(*head)) {
+      head = Prev(*head);
+    }
+    std::vector<const HloValue*> values;
+    for (const ValueNode* p = head; p != nullptr; p = Next(*p)) {
+      values.push_back(p->value);
+    }
+    return absl::StrCat("{",
+                        absl::StrJoin(values, ", ",
+                                      [](string* s, const HloValue* value) {
+                                        StrAppend(s, value->ToShortString());
+                                      }),
+                        "}");
+  }
 
-void MaybeDumpModule(const string& message, const HloModule& module) {
-  if (VLOG_IS_ON(3)) {
-    VLOG(3) << message;
-    XLA_VLOG_LINES(3, module.ToString());
-    hlo_graph_dumper::MaybeDumpHloModule(module, message);
+  string ToString() const {
+    string out = absl::StrCat("CopyRemover:\n");
+    StrAppend(&out, "  Def-use chains in each buffer:\n");
+    for (const ValueNode* head : value_lists_) {
+      StrAppend(&out, "    Buffer defined by ", head->value->ToShortString(),
+                ":\n");
+      const ValueNode* p = head;
+      do {
+        StrAppend(&out, "      ", p->value->ToShortString(), ", uses: ",
+                  absl::StrJoin(p->uses, "; ",
+                                [](string* s, const HloUse* use) {
+                                  StrAppend(s, use->ToString());
+                                }),
+                  "\n");
+
+        p = p->next;
+      } while (p != head);
+    }
+    StrAppend(&out, "  Potentially removable copies:\n");
+    for (const auto& pair : copy_map_) {
+      const HloInstruction* copy = pair.first;
+      const CopyNodes& copy_info = pair.second;
+
+      StrAppend(&out, "    ", copy->name(), " : ",
+                copy_info.src->value->ToShortString(), " => ",
+                copy_info.dest->value->ToShortString(), "\n");
+    }
+    return out;
   }
-}
+
+ private:
+  const HloDataflowAnalysis& dataflow_;
+  const HloOrdering& ordering_;
+
+  // The heads of all the value lists. Each value list represents the HLO
+  // values contained in a particular HLO buffer. The values in the list are
+  // in dependency order.
+  absl::flat_hash_set<const ValueNode*> value_lists_;
+
+  // Copy removal requires fast access to the value list elements
+  // corresponding to the source and destination values of the kCopy
+  // instruction. This data structure holds pointers to these elements for
+  // each kCopy instruction in the graph.
+  struct CopyNodes {
+    // The source and destinations values of the kCopy instruction.
+    ValueNode* src = nullptr;
+    ValueNode* dest = nullptr;
+  };
+  absl::flat_hash_map<const HloInstruction*, CopyNodes> copy_map_;
+};
 
 }  // namespace
 
@@ -1156,23 +1098,29 @@ Status CopyInsertion::VerifyNoLiveRangeInterference(const HloOrdering& ordering,
 
 Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
                                               HloModule* module) {
-  MaybeDumpModule("after adding copies to resolve interference", *module);
-
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
                       HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
-  CopyRemover copy_remover(*alias_analysis, ordering, module);
-  XLA_VLOG_LINES(3, copy_remover.ToString());
+
+  CopyRemover copy_remover(*module, *alias_analysis, ordering);
+  if (VLOG_IS_ON(3)) {
+    LOG(INFO) << "Removing unnecessary copies in " << module->name();
+    LOG(INFO) << "Buffer values, in dependency order: ";
+    for (const HloBuffer& buffer : alias_analysis->buffers()) {
+      LOG(INFO) << "    HloBuffer " << buffer.id();
+    }
+  }
 
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kCopy) {
-        TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status());
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          copy_remover.TryElideCopy(instruction)) {
+        TF_RETURN_IF_ERROR(StripControlDependenciesFrom(instruction));
+        TF_RETURN_IF_ERROR(
+            instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
       }
     }
   }
-  MaybeDumpModule("after removing unnecessary copies", *module);
-
   return Status::OK();
 }
 
@@ -1201,8 +1149,6 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
   // interference. If all copies were added in step (1) then copy removal would
   // also have to reason about things like constants and parameters live out of
   // the computation.
-  MaybeDumpModule("before copy insertion", *module);
-
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
   if (!call_graph->IsFlattened()) {
     return FailedPrecondition(
@@ -1231,23 +1177,25 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
   HloDCE dce;
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
+  DumpHloModuleDuringPassIfEnabled(
+      name(), "after adding copies to resolve interference", *module);
 
   DependencyHloOrdering dep_ordering(module);
   TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module));
 
   TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module));
+  DumpHloModuleDuringPassIfEnabled(name(), "after removing unnecessary copies",
+                                   *module);
 
   TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module));
-
-  MaybeDumpModule("after adding special-case copies", *module);
+  DumpHloModuleDuringPassIfEnabled(name(), "after adding special-case copies",
+                                   *module);
 
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
   TF_DCHECK_OK(
       VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module));
 
-  MaybeDumpModule("after copy insertion", *module);
-
   if (VLOG_IS_ON(1)) {
     int64 num_total_copies = 0;
     for (HloComputation* computation : module->computations()) {
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 4d4074943e3bf9f6f2a37abc63f037c2dab06e0f..6fa3161e57814ffa2c0030f607a05bda2a4e121f 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -420,9 +420,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             limit_const->shape(), loop_state, 0));
-    builder.AddInstruction(
-        HloInstruction::CreateBinary(condition_result_shape_, HloOpcode::kLt,
-                                     induction_variable, limit_const));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        condition_result_shape_, induction_variable, limit_const,
+        ComparisonDirection::kLt));
     return builder.Build();
   }
 
@@ -1842,7 +1842,7 @@ HloModule TokensShouldNotBeCopied
   %param = (s32[], token[]) parameter(0)
   %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0
   %constant = s32[] constant(42)
-  ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant)
+  ROOT %less-than = pred[] compare(s32[] %get-tuple-element, s32[] %constant), direction=LT
 }
 
 ENTRY %TokensShouldNotBeCopied () -> s32[] {
@@ -1855,8 +1855,7 @@ ENTRY %TokensShouldNotBeCopied () -> s32[] {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          HloRunner::CreateModuleFromString(
-                              module_string, GetDebugOptionsForTest()));
+                          ParseAndReturnVerifiedModule(module_string));
   InsertCopies(module.get());
 
   // There should be no copies added because tokens should not be copied.
@@ -2061,7 +2060,7 @@ if-condition.v4 {
   p.2 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
   get-tuple-element.67 = s32[] get-tuple-element(p.2), index=0
   constant.4 = s32[] constant(0)
-  ROOT equal-to = pred[] equal-to(get-tuple-element.67, constant.4)
+  ROOT equal-to = pred[] compare(get-tuple-element.67, constant.4), direction=EQ
 }
 
 _functionalize_body_1__.v28 {
@@ -2071,7 +2070,7 @@ _functionalize_body_1__.v28 {
   add.4 = s32[] add(get-tuple-element.68, constant.7)
   get-tuple-element.69 = s32[] get-tuple-element(arg_tuple.4), index=1
   get-tuple-element.70 = s32[] get-tuple-element(arg_tuple.4), index=2
-  less-than-or-equal-to = pred[] less-than-or-equal-to(get-tuple-element.69, get-tuple-element.70)
+  less-than-or-equal-to = pred[] compare(get-tuple-element.69, get-tuple-element.70), direction=LE
   constant.8 = s32[] constant(0)
   select = s32[] select(less-than-or-equal-to, constant.8, constant.7)
   get-tuple-element.71 = s32[] get-tuple-element(arg_tuple.4), index=3
@@ -2088,7 +2087,7 @@ cond_wrapper.v3.1 {
   inputs.1 = (s32[], s32[], s32[], s32[]) parameter(0)
   get-tuple-element.75 = s32[] get-tuple-element(inputs.1), index=0
   constant.11 = s32[] constant(7)
-  ROOT less-than.2 = pred[] less-than(get-tuple-element.75, constant.11)
+  ROOT less-than.2 = pred[] compare(get-tuple-element.75, constant.11), direction=LT
 }
 
 _functionalize_body_2__.v25 {
@@ -2111,7 +2110,7 @@ cond_wrapper.v3.2 {
   inputs.2 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
   get-tuple-element.83 = s32[] get-tuple-element(inputs.2), index=1
   constant.13 = s32[] constant(5)
-  ROOT less-than.3 = pred[] less-than(get-tuple-element.83, constant.13)
+  ROOT less-than.3 = pred[] compare(get-tuple-element.83, constant.13), direction=LT
 }
 
 ENTRY TestComputation {
@@ -2119,8 +2118,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2144,7 +2142,7 @@ if-condition.v4 {
   p.2 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
   get-tuple-element.67 = s32[] get-tuple-element(p.2), index=0
   constant.4 = s32[] constant(0)
-  ROOT equal-to = pred[] equal-to(get-tuple-element.67, constant.4)
+  ROOT equal-to = pred[] compare(get-tuple-element.67, constant.4), direction=EQ
 }
 
 if-body.v5.1 {
@@ -2161,7 +2159,7 @@ if-condition.v4.1 {
   p.4 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
   get-tuple-element.71 = s32[] get-tuple-element(p.4), index=0
   constant.6 = s32[] constant(1)
-  ROOT equal-to.1 = pred[] equal-to(get-tuple-element.71, constant.6)
+  ROOT equal-to.1 = pred[] compare(get-tuple-element.71, constant.6), direction=EQ
 }
 
 _functionalize_body_1__.v28 {
@@ -2171,7 +2169,7 @@ _functionalize_body_1__.v28 {
   add.4 = s32[] add(get-tuple-element.72, constant.7)
   get-tuple-element.73 = s32[] get-tuple-element(arg_tuple.4), index=1
   get-tuple-element.74 = s32[] get-tuple-element(arg_tuple.4), index=2
-  less-than-or-equal-to = pred[] less-than-or-equal-to(get-tuple-element.73, get-tuple-element.74)
+  less-than-or-equal-to = pred[] compare(get-tuple-element.73, get-tuple-element.74), direction=LE
   constant.8 = s32[] constant(0)
   select = s32[] select(less-than-or-equal-to, constant.8, constant.7)
   get-tuple-element.75 = s32[] get-tuple-element(arg_tuple.4), index=3
@@ -2189,7 +2187,7 @@ cond_wrapper.v3.1 {
   inputs.1 = (s32[], s32[], s32[], s32[]) parameter(0)
   get-tuple-element.78 = s32[] get-tuple-element(inputs.1), index=0
   constant.11 = s32[] constant(7)
-  ROOT less-than.2 = pred[] less-than(get-tuple-element.78, constant.11)
+  ROOT less-than.2 = pred[] compare(get-tuple-element.78, constant.11), direction=LT
 }
 
 _functionalize_body_2__.v25 {
@@ -2212,7 +2210,7 @@ cond_wrapper.v3.2 {
   inputs.2 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
   get-tuple-element.86 = s32[] get-tuple-element(inputs.2), index=1
   constant.13 = s32[] constant(5)
-  ROOT less-than.3 = pred[] less-than(get-tuple-element.86, constant.13)
+  ROOT less-than.3 = pred[] compare(get-tuple-element.86, constant.13), direction=LT
 }
 
 ENTRY TestComputation {
@@ -2220,8 +2218,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2238,7 +2235,7 @@ cond.inner {
 
 body.inner {
   param.body.inner = pred[] parameter(0)
-  ROOT neg = pred[] negate(param.body.inner)
+  ROOT not = pred[] not(param.body.inner)
 }
 
 cond.outer {
@@ -2255,9 +2252,8 @@ ENTRY TestComputation {
   ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   InsertCopies(module.get());
 
   // There should only be a single copy inserted, and it's in the entry
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index d4535b204d7f3ad8d4e24beea5d0dd79e7a15ab0..66ceb57227cf20139aa355ffd0caea48fa1ddc50 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -95,6 +95,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
@@ -111,6 +112,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:cholesky_expander",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
@@ -136,6 +138,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:transpose_folding",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
         "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion",
@@ -336,15 +339,15 @@ cc_library(
     srcs = ["ir_function.cc"],
     hdrs = ["ir_function.h"],
     deps = [
+        ":cpu_runtime",
         ":ir_emission_utils",
         ":shape_partition",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -375,6 +378,7 @@ cc_library(
         ":vector_support_library",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
@@ -746,6 +750,7 @@ cc_library(
         ":ir_emission_utils",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:instruction_fusion",
+        "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/cpu/build_defs.bzl b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
index e78330b21689fdd818cd97128bbcaaa9e0118602..ffa1cd4ec8e26e7dbe92e7b99cf65e99db5400b9 100644
--- a/tensorflow/compiler/xla/service/cpu/build_defs.bzl
+++ b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
@@ -1,12 +1,11 @@
 """build_defs for service/cpu."""
 
-
 def runtime_copts():
-  """Returns copts used for CPU runtime libraries."""
-  return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
-      "//tensorflow:android_arm": ["-mfpu=neon"],
-      "//conditions:default": []
-  }) + select({
-      "//tensorflow:android": ["-O2"],
-      "//conditions:default": []
-  }))
+    """Returns copts used for CPU runtime libraries."""
+    return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
+        "//tensorflow:android_arm": ["-mfpu=neon"],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:android": ["-O2"],
+        "//conditions:default": [],
+    }))
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 796a7cf94d02b0ad42366387a9d3f8d589b8840a..39926182894590e469c3cbb8d8620f3553fce873 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -66,9 +66,14 @@ class FilteredPassManager : public llvm::legacy::PassManager {
   explicit FilteredPassManager(bool disable_expensive_passes)
       : disable_expensive_passes_(disable_expensive_passes) {}
   void add(llvm::Pass* p) override {
+    llvm::StringRef PassName = p->getPassName();
+    if (PassName.contains("Warn about non-applied transformations")) {
+      delete p;
+      return;
+    }
     if (disable_expensive_passes_) {
-      llvm::StringRef PassName = p->getPassName();
       if (PassName.contains("Unroll loops")) {
+        delete p;
         return;
       }
     }
@@ -89,7 +94,7 @@ std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
 
   if (pre_optimization_hook_) {
-    TF_CHECK_OK(pre_optimization_hook_(module));
+    pre_optimization_hook_(module);
   }
 
   // Add the appropriate TargetLibraryInfo and TargetTransformInfo.
@@ -133,7 +138,7 @@ std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
 
   if (post_optimization_hook_) {
-    TF_CHECK_OK(post_optimization_hook_(module));
+    post_optimization_hook_(module);
   }
 
   // Generate code.
@@ -145,17 +150,11 @@ std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
   std::unique_ptr<llvm::MemoryBuffer> memory_buffer(
       new llvm::SmallVectorMemoryBuffer(std::move(stream_buffer)));
 
-  if (VLOG_IS_ON(2)) {
+  if (post_codegen_hook_) {
     llvm::Expected<std::unique_ptr<llvm::object::ObjectFile>> obj_file =
         llvm::object::ObjectFile::createObjectFile(*memory_buffer);
     if (obj_file) {
-      StatusOr<DisassemblerResult> disasm_result =
-          disassembler_->DisassembleObjectFile(*obj_file.get());
-      if (disasm_result.ok()) {
-        XLA_VLOG_LINES(2, disasm_result.ValueOrDie().text);
-      } else {
-        LOG(WARNING) << "Could not disassemble object file!";
-      }
+      post_codegen_hook_(*obj_file.get());
     } else {
       LOG(WARNING) << "Could convert memory buffer to object file!";
     }
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
index c38b896c5019b48fd2a16a51abd59e12ebdb29eb..edcd47e9e89748a1df814619f10870afb531bc7e 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -32,19 +31,21 @@ namespace cpu {
 class CompilerFunctor {
  public:
   explicit CompilerFunctor(
-      llvm::TargetMachine* target_machine, const Disassembler* disassembler,
-      int opt_level, bool optimize_for_size, bool enable_fast_math,
+      llvm::TargetMachine* target_machine, int opt_level,
+      bool optimize_for_size, bool enable_fast_math,
       bool disable_expensive_passes,
       LLVMCompiler::ModuleHook pre_optimization_hook = nullptr,
-      LLVMCompiler::ModuleHook post_optimization_hook = nullptr)
+      LLVMCompiler::ModuleHook post_optimization_hook = nullptr,
+      std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook =
+          nullptr)
       : target_machine_(target_machine),
-        disassembler_(CHECK_NOTNULL(disassembler)),
         opt_level_(opt_level),
         optimize_for_size_(optimize_for_size),
         enable_fast_math_(enable_fast_math),
         disable_expensive_passes_(disable_expensive_passes),
-        pre_optimization_hook_(pre_optimization_hook),
-        post_optimization_hook_(post_optimization_hook) {}
+        pre_optimization_hook_(std::move(pre_optimization_hook)),
+        post_optimization_hook_(std::move(post_optimization_hook)),
+        post_codegen_hook_(std::move(post_codegen_hook)) {}
 
   // Compile a Module to an ObjectFile.
   std::unique_ptr<llvm::MemoryBuffer> operator()(
@@ -61,13 +62,13 @@ class CompilerFunctor {
                              unsigned opt_level, unsigned size_level) const;
 
   llvm::TargetMachine* target_machine_;
-  const Disassembler* disassembler_;
   const unsigned opt_level_;
   const bool optimize_for_size_;
   const bool enable_fast_math_;
   const bool disable_expensive_passes_;
   LLVMCompiler::ModuleHook pre_optimization_hook_;
   LLVMCompiler::ModuleHook post_optimization_hook_;
+  std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index eafda68510d93ee54f2aead60a84f3e97b3fe1f4..7de159cf647190c18d4c02a1acbff31419d9e759 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -50,6 +50,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/cholesky_expander.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 #include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
@@ -69,6 +70,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -95,6 +97,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
@@ -105,6 +108,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace xla {
 namespace cpu {
@@ -255,6 +259,9 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
 
   pipeline.AddPass<MapInliner>();
 
+  pipeline.AddPass<CholeskyExpander>();
+  pipeline.AddPass<TriangularSolveExpander>();
+
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
@@ -312,6 +319,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
       },
       TransposeFolding::NeverFoldTranspose);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
+
+  pipeline.AddPass<CpuLayoutAssignment>(
+      module->mutable_entry_computation_layout(),
+      LayoutAssignment::InstructionCanChangeLayout, target_machine_features);
+
   pipeline.AddPass<CpuInstructionFusion>();
 
   pipeline.AddPass<ScatterExpander>();
@@ -319,10 +331,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
-
-  pipeline.AddPass<CpuLayoutAssignment>(
-      module->mutable_entry_computation_layout(),
-      LayoutAssignment::InstructionCanChangeLayout, target_machine_features);
   return pipeline.Run(module).status();
 }
 
@@ -342,13 +350,10 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   {
     auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
         "simplification after layout assignement");
-    // TODO(b/117156505): When the bug is fixed, the CPU backend should not
-    // produce layout changing elementwise operations. We will then pass
-    // LayoutAssignment::InstructionCanChangeLayout to the HLO verifier to
-    // enable stricter verification.
     pass.AddInvariantChecker<HloVerifier>(
         /*layout_sensitive=*/true,
-        /*allow_mixed_precision=*/false);
+        /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
     AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
     options.set_enable_dot_strength_reduction(false);
@@ -404,10 +409,20 @@ auto memory_alignment = [](LogicalBuffer::Color) { return kMemoryAlignment; };
 llvm::TargetOptions CompilerTargetOptions(
     const HloModuleConfig& module_config) {
   llvm::TargetOptions target_options;
-  llvm_ir::SetTargetOptions(
-      /*fast_math_enabled=*/module_config.debug_options()
-          .xla_cpu_enable_fast_math(),
-      &target_options);
+  // In LLVM backend flags, UnsafeFPMath does not explicitly imply NoInfs, etc.
+  if (module_config.debug_options().xla_cpu_enable_fast_math()) {
+    target_options.UnsafeFPMath = true;
+    target_options.NoInfsFPMath =
+        module_config.debug_options().xla_cpu_fast_math_honor_infs();
+    target_options.NoNaNsFPMath =
+        module_config.debug_options().xla_cpu_fast_math_honor_nans();
+    target_options.NoSignedZerosFPMath = true;
+  } else {
+    target_options.UnsafeFPMath = false;
+    target_options.NoInfsFPMath = false;
+    target_options.NoNaNsFPMath = false;
+    target_options.NoSignedZerosFPMath = false;
+  }
   return target_options;
 }
 
@@ -426,53 +441,32 @@ llvm::CodeGenOpt::Level CodeGenOptLevel(const HloModuleConfig& module_config) {
   }
 }
 
-Status InitializeModuleHooks(
+std::pair<LLVMCompiler::ModuleHook, LLVMCompiler::ModuleHook> GetIRModuleHooks(
     const HloModule& hlo_module,
     const LLVMCompiler::ModuleHook& user_pre_optimization_hook,
-    const LLVMCompiler::ModuleHook& user_post_optimization_hook,
-    LLVMCompiler::ModuleHook* pre_optimization_ir_hook,
-    LLVMCompiler::ModuleHook* post_optimization_ir_hook) {
-  const string& ir_dump_directory =
-      hlo_module.config().debug_options().xla_dump_ir_to();
-  if (ir_dump_directory.empty()) {
-    *pre_optimization_ir_hook = user_pre_optimization_hook;
-    *post_optimization_ir_hook = user_post_optimization_hook;
-    return Status::OK();
-  }
-
-  const string& hlo_module_name = hlo_module.name();
-
+    const LLVMCompiler::ModuleHook& user_post_optimization_hook) {
   // Create the IR hooks. If applicable, each IR hook does the following:
   //
   //  * Calls the user supplied module hook.
   //  * Writes out the IR to a file in the output directory designated by
-  //    --xla_dump_ir_to
-
-  *pre_optimization_ir_hook =
-      [user_pre_optimization_hook, ir_dump_directory,
-       hlo_module_name](const llvm::Module& llvm_module) {
-        if (user_pre_optimization_hook) {
-          TF_RETURN_IF_ERROR(user_pre_optimization_hook(llvm_module));
-        }
-        return llvm_ir::DumpIRToDirectory(/*directory_name=*/ir_dump_directory,
-                                          /*hlo_module_name=*/hlo_module_name,
-                                          llvm_module,
-                                          /*optimized=*/false);
-      };
-
-  *post_optimization_ir_hook =
-      [user_post_optimization_hook, ir_dump_directory,
-       hlo_module_name](const llvm::Module& llvm_module) {
-        if (user_post_optimization_hook) {
-          TF_RETURN_IF_ERROR(user_post_optimization_hook(llvm_module));
-        }
-        return llvm_ir::DumpIRToDirectory(/*directory_name=*/ir_dump_directory,
-                                          /*hlo_module_name=*/hlo_module_name,
-                                          llvm_module,
-                                          /*optimized=*/true);
-      };
-
-  return Status::OK();
+  //    --xla_dump_to
+  const HloModule* hlo_module_ptr = &hlo_module;
+  auto hook = [user_pre_optimization_hook, user_post_optimization_hook,
+               hlo_module_ptr](bool optimized,
+                               const llvm::Module& llvm_module) {
+    const auto& user_hook =
+        !optimized ? user_pre_optimization_hook : user_post_optimization_hook;
+    if (user_hook) {
+      user_hook(llvm_module);
+    }
+    llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized);
+  };
+  return {[hook](const llvm::Module& llvm_module) {
+            return hook(/*optimized=*/false, llvm_module);
+          },
+          [hook](const llvm::Module& llvm_module) {
+            return hook(/*optimized=*/true, llvm_module);
+          }};
 }
 
 Status VerifyLlvmModule(const llvm::Module& llvm_module) {
@@ -486,7 +480,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
       << "Invalid LLVM IR before optimizations:\n"
       << err_stream.str()
       << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
-         "Rerun with --xla_dump_ir_to to get the IR. ";
+         "Rerun with --xla_dump_to to get the IR. ";
   return Status::OK();
 }
 
@@ -530,9 +524,6 @@ Status CreateHloProfilingArtifacts(
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
     DeviceMemoryAllocator* /*device_allocator*/) {
-  VLOG(2) << "Before optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
-
   std::unique_ptr<llvm::TargetMachine> jit_target_machine =
       SimpleOrcJIT::InferTargetMachineForJIT(
           CompilerTargetOptions(module->config()),
@@ -540,29 +531,72 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 
   TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false,
                                   jit_target_machine.get()));
-
-  VLOG(2) << "After optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
   return std::move(module);
 }
 
+namespace {
+
+// Post-compilation callback functor for use by SimpleOrcJIT.
+//
+// Dumps disassembled machine code if dumping is enabled for the module.
+struct OrcJITPostCompilationHook {
+  // Gets an std::function that implements this hook.
+  static std::function<void(const llvm::object::ObjectFile& obj_file)> Create(
+      const HloModule* module) {
+    // This struct is not copyable, but std::functions must be.  So to create an
+    // std::function out of this struct, we have to wrap it in a shared_ptr.
+    auto wrapped = std::make_shared<OrcJITPostCompilationHook>(module);
+    return [wrapped](const llvm::object::ObjectFile& obj_file) {
+      (*wrapped)(obj_file);
+    };
+  }
+
+  // Constructor can't be private because we want to call it from
+  // std::make_shared, but users should call Create() instead.
+  explicit OrcJITPostCompilationHook(const HloModule* module)
+      : module(module),
+        target_machine(SimpleOrcJIT::InferTargetMachineForJIT(
+            CompilerTargetOptions(module->config()),
+            CodeGenOptLevel(module->config()))),
+        disassembler(*target_machine) {}
+
+ private:
+  void operator()(const llvm::object::ObjectFile& obj_file) {
+    if (!DumpingEnabledForHloModule(*module)) {
+      return;
+    }
+    StatusOr<DisassemblerResult> disasm_or =
+        disassembler.DisassembleObjectFile(obj_file);
+    string text = disasm_or.ok() ? std::move(disasm_or).ValueOrDie().text
+                                 : absl::StrCat("Error disassembling: ",
+                                                disasm_or.status().ToString());
+    DumpToFileInDirOrStdout(*module, /*file_suffix=*/"s", text);
+  }
+
+  const HloModule* module;
+  // disassembler keeps references to data inside of target_machine.
+  std::unique_ptr<llvm::TargetMachine> target_machine;
+  Disassembler disassembler;
+};
+
+}  // namespace
+
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
-  const string timer_message =
-      "Compiling [" + module->name() + "] for CPU using JIT";
-  XLA_SCOPED_LOGGING_TIMER(timer_message);
-
   VLOG(1) << "Compiling: " << module->name();
+  XLA_SCOPED_LOGGING_TIMER(
+      absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
+
   TF_RET_CHECK(stream_exec != nullptr);
   std::call_once(llvm_command_line_options_initialized,
                  &llvm_ir::InitializeLLVMCommandLineOptions, module->config());
 
   ModuleHook pre_optimization_ir_hook;
   ModuleHook post_optimization_ir_hook;
-  TF_RETURN_IF_ERROR(InitializeModuleHooks(
-      *module, user_pre_optimization_hook_, user_post_optimization_hook_,
-      &pre_optimization_ir_hook, &post_optimization_ir_hook));
+  std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) =
+      GetIRModuleHooks(*module, user_pre_optimization_hook_,
+                       user_post_optimization_hook_);
 
   // Compile must be thread-safe so create a new LLVM context for the module.
   auto llvm_context = absl::make_unique<llvm::LLVMContext>();
@@ -575,7 +609,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
       options::OptimizeForSizeRequested(module->config()),
       module->config().debug_options().xla_cpu_enable_fast_math(),
       module->config().debug_options().xla_llvm_disable_expensive_passes(),
-      pre_optimization_ir_hook, post_optimization_ir_hook);
+      pre_optimization_ir_hook, post_optimization_ir_hook,
+      OrcJITPostCompilationHook::Create(module.get()));
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
@@ -596,8 +631,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // ownership is std::moved.
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  const string xla_dump_optimized_hlo_proto_to =
-      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
   // Select an order for emitting the HLO instructions for each
   // computation. Using this sequence enables tighter buffer liveness analysis
@@ -616,13 +649,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
                           /*allocate_buffers_for_constants=*/true));
   // BufferAssignment::ToString() includes a header, so no need for us to
   // print one ourselves.
-  XLA_VLOG_LINES(2, assignment->ToString());
-
-  if (!xla_dump_optimized_hlo_proto_to.empty()) {
-    HloProto proto = MakeHloProto(*module, *assignment);
-    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                            assignment->ToString());
   }
+  DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
 
   // Each computation is a single function.  Emit all embedded computations
   // before the entry computation. The order of computations returned from
@@ -633,7 +664,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
                        std::move(instruction_to_profile_idx),
                        std::move(computation_to_profile_idx),
-                       &target_machine_features);
+                       &target_machine_features,
+#ifdef MEMORY_SANITIZER
+                       /*emit_code_for_msan=*/true
+#else
+                       /*emit_code_for_msan=*/false
+#endif
+  );
 
   TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
@@ -670,9 +707,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   if (embed_ir_in_executable) {
     ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
   }
-  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
-  XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
+  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   // JIT compile the LLVM IR module to in-memory machine code.
   jit->AddModule(std::move(llvm_module));
@@ -719,8 +755,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
   }
   const CpuAotCompilationOptions& options =
       static_cast<const CpuAotCompilationOptions&>(aot_options);
-  llvm::StringRef target_triple = llvm_ir::AsStringRef(options.triple());
-  llvm::Triple triple(llvm::Triple::normalize(target_triple));
+  llvm::Triple triple(llvm::Triple::normalize(options.triple()));
   std::string error;
   const llvm::Target* target =
       llvm::TargetRegistry::lookupTarget(triple.getTriple(), error);
@@ -758,13 +793,12 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
       pie_level = llvm::PIELevel::Large;
       break;
   }
-  llvm::StringRef cpu_name = llvm_ir::AsStringRef(options.cpu_name());
-  llvm::StringRef features = llvm_ir::AsStringRef(options.features());
   llvm::CodeGenOpt::Level opt_level = CodeGenOptLevel(modules[0]->config());
-  std::unique_ptr<llvm::TargetMachine> target_machine = absl::WrapUnique(
-      target->createTargetMachine(triple.getTriple(), cpu_name, features,
-                                  CompilerTargetOptions(modules[0]->config()),
-                                  reloc_model, llvm::None, opt_level));
+  std::unique_ptr<llvm::TargetMachine> target_machine =
+      absl::WrapUnique(target->createTargetMachine(
+          triple.getTriple(), options.cpu_name(), options.features(),
+          CompilerTargetOptions(modules[0]->config()), reloc_model, llvm::None,
+          opt_level));
 
   // Compile must be thread-safe so create a new LLVM context for the module.
   llvm::LLVMContext llvm_context;
@@ -783,15 +817,9 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     HloModule* module = modules[i].get();
     VLOG(1) << "Compiling ahead-of-time: " << module->name();
 
-    VLOG(2) << "Before optimization:";
-    XLA_VLOG_LINES(2, module->ToString());
-
     TF_RETURN_IF_ERROR(
         RunHloPasses(module, /*is_aot_compile=*/true, target_machine.get()));
 
-    VLOG(2) << "After optimization:";
-    XLA_VLOG_LINES(2, module->ToString());
-
     TF_ASSIGN_OR_RETURN(HloSchedule schedule,
                         ScheduleModule(module, BufferSizeBytesFunction()));
 
@@ -806,15 +834,11 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                             /*allocate_buffers_for_constants=*/true));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    const string xla_dump_optimized_hlo_proto_to =
-        module->config().debug_options().xla_dump_optimized_hlo_proto_to();
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
+    if (DumpingEnabledForHloModule(*module)) {
+      DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                              assignment->ToString());
     }
+    DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
 
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
@@ -831,7 +855,9 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
                          std::move(instruction_to_profile_idx),
                          std::move(computation_to_profile_idx),
-                         &target_machine_features);
+                         &target_machine_features,
+                         // TODO(b/66051036): Run full msan for AOT.
+                         /*emit_code_for_msan=*/false);
 
     TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
@@ -856,33 +882,43 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                             /*is_top_level_computation=*/true,
                             schedule.sequence(computation).instructions()));
 
-    CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
+    CHECK(entry_function->getName() == entry_point_name);
 
-    ModuleHook pre_optimization_ir_dump_hook;
-    ModuleHook post_optimization_ir_dump_hook;
-    TF_RETURN_IF_ERROR(InitializeModuleHooks(
-        *module, user_pre_optimization_hook_, user_post_optimization_hook_,
-        &pre_optimization_ir_dump_hook, &post_optimization_ir_dump_hook));
+    ModuleHook pre_optimization_ir_hook;
+    ModuleHook post_optimization_ir_hook;
+    std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) =
+        GetIRModuleHooks(*module, user_pre_optimization_hook_,
+                         user_post_optimization_hook_);
 
     // Run the LLVM verifier over the unoptimized LLVM IR.  If it fails, run the
     // pre-optimization IR dump hook before returning.
     {
       Status verify_status = VerifyLlvmModule(llvm_module);
-      if (!verify_status.ok() && pre_optimization_ir_dump_hook) {
-        pre_optimization_ir_dump_hook(llvm_module).IgnoreError();
+      if (!verify_status.ok() && pre_optimization_ir_hook) {
+        pre_optimization_ir_hook(llvm_module);
       }
       TF_RETURN_IF_ERROR(verify_status);
     }
 
-    XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(llvm_module));
+    auto post_codegen_hook = [&](const llvm::object::ObjectFile& obj_file) {
+      if (!DumpingEnabledForHloModule(*module)) {
+        return;
+      }
+      StatusOr<DisassemblerResult> disasm_or =
+          Disassembler(*target_machine).DisassembleObjectFile(obj_file);
+      string text = disasm_or.ok()
+                        ? std::move(disasm_or).ValueOrDie().text
+                        : absl::StrCat("Error disassembling: ",
+                                       disasm_or.status().ToString());
+      DumpToFileInDirOrStdout(*module, /*file_suffix=*/"s", text);
+    };
 
-    Disassembler disassembler(*target_machine);
     CompilerFunctor compiler_functor(
-        target_machine.get(), &disassembler, opt_level,
+        target_machine.get(), opt_level,
         options::OptimizeForSizeRequested(module->config()),
         module->config().debug_options().xla_cpu_enable_fast_math(),
         module->config().debug_options().xla_llvm_disable_expensive_passes(),
-        pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook);
+        pre_optimization_ir_hook, post_optimization_ir_hook, post_codegen_hook);
     std::unique_ptr<llvm::MemoryBuffer> object_file =
         compiler_functor(llvm_module);
     ObjectFileData object_file_data(object_file->getBufferStart(),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
index 7fbe0fa157c57eb0c274662a1de95cf5328ccfa8..4ac61f44d9f38425da2d1fc6b9495cb4deba5047 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index 6f79ad7c1468f27c74d84770ec6358fbcd1c1f09..5793f0080b64cf8d576760e274586ac88cdab5d0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 
 namespace xla {
 namespace cpu {
@@ -42,9 +43,10 @@ bool CanBeLoopFused(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kTranspose;
 }
 
-bool IsMatrixVectorDot(const HloInstruction* hlo) {
+bool IsNonComplexMatrixVectorDot(const HloInstruction* hlo) {
   const Shape& hlo_shape = hlo->shape();
-  return hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() == 2 &&
+  return !ShapeUtil::ElementIsComplex(hlo_shape) &&
+         hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() == 2 &&
          (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1);
 }
 
@@ -55,7 +57,8 @@ bool HasExactlyOneUse(const HloInstruction& hlo_instr) {
 
 bool CanBeOutputFused(const HloInstruction* producer,
                       const HloInstruction* consumer) {
-  return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) &&
+  return consumer->opcode() == HloOpcode::kAdd &&
+         IsNonComplexMatrixVectorDot(producer) &&
          HasExactlyOneUse(*producer) == 1;
 }
 
@@ -96,12 +99,16 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
-  // TODO(b/28644064): see if the "producer->operand_count() == 0" check is
-  // necessary.
-  if (producer->operand_count() == 0 ||
-      !InstructionFusion::ShouldFuse(consumer, operand_index)) {
-    VLOG(2)
-        << "Not fusing: producer has no operands, or !ShouldFuse(consumer).";
+  if (!InstructionFusion::ShouldFuse(consumer, operand_index)) {
+    VLOG(2) << "Not fusing: !ShouldFuse(consumer).";
+    return false;
+  }
+
+  // Fuse constants in general but avoid creating 2-instruction fusions with
+  // just a constant and another node.
+  if (producer->opcode() == HloOpcode::kConstant &&
+      consumer->opcode() != HloOpcode::kFusion) {
+    VLOG(2) << "Not fusing: insufficient non-constant nodes.";
     return false;
   }
 
@@ -111,6 +118,14 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
+  // Don't fuse if fusing would cause too much code duplication because of
+  // inefficiencies in the fusion emitter.
+  // TODO(b/119692968): Remove this once the fusion emitter can handle
+  // arbitrary fusion nodes.
+  if (FusedIrEmitter::IsFusedIrEmitterInefficient(consumer, producer)) {
+    return false;
+  }
+
   if (consumer->opcode() == HloOpcode::kDot) {
     // In the general case we call out to optimized "black box" GEMM routines
     // for Dot, which precludes fusion.  However, in very specific cases, we try
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index c4bde837e57e82584c2a007858ed8d55608acd3c..cdb52d1dd987a47ffcb6b5cad40821d0aca011ee 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -623,65 +623,10 @@ TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
       module.get(),
       {HloOpcode::kDynamicSlice, HloOpcode::kDynamicSlice,
        HloOpcode::kDynamicUpdateSlice, HloOpcode::kReshape,
-       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kConstant, HloOpcode::kParameter, HloOpcode::kParameter,
        HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
-// Tests that we do not fuse instructions in cases where instructions in the
-// fusion would reuse elements from its operand due to an implicit broadcast.
-TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastUnary) {
-  Shape small_shape = ShapeUtil::MakeShape(F32, {1, 4});
-  Shape large_shape = ShapeUtil::MakeShape(F32, {3, 4});
-
-  HloComputation::Builder builder(TestName());
-
-  HloInstruction* small_param =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          /*parameter_number=*/0, small_shape, "param"));
-  HloInstruction* small_exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(small_shape, HloOpcode::kExp, small_param));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(large_shape, HloOpcode::kExp, small_exp));
-
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
-  module->AddEntryComputation(builder.Build());
-
-  auto did_fusion = CpuInstructionFusion().Run(module.get());
-  ASSERT_TRUE(did_fusion.ok());
-  EXPECT_FALSE(did_fusion.ValueOrDie());
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
-              Not(op::Fusion()));
-}
-
-// Like ReuseViaImplicitBroadcastUnary but with a binary operation.
-TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) {
-  Shape small_shape = ShapeUtil::MakeShape(F32, {1, 4});
-  Shape large_shape = ShapeUtil::MakeShape(F32, {3, 4});
-
-  HloComputation::Builder builder(TestName());
-
-  HloInstruction* small_param =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          /*parameter_number=*/0, small_shape, "param"));
-  HloInstruction* large_param =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          /*parameter_number=*/1, large_shape, "param"));
-  HloInstruction* small_exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(small_shape, HloOpcode::kExp, small_param));
-
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      large_shape, HloOpcode::kAdd, small_exp, large_param));
-
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
-  module->AddEntryComputation(builder.Build());
-
-  auto did_fusion = CpuInstructionFusion().Run(module.get());
-  ASSERT_TRUE(did_fusion.ok());
-  EXPECT_FALSE(did_fusion.ValueOrDie());
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
-              Not(op::Fusion()));
-}
-
 void CreateComputationForDotAddOutputFusionTest(const string& test_name,
                                                 HloModule* module, int m, int k,
                                                 int n,
@@ -811,7 +756,7 @@ TEST_P(GatherLoopFusionTest, GatherLoopFusion) {
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
       {HloOpcode::kGather, HloOpcode::kAdd, HloOpcode::kBroadcast,
-       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter});
+       HloOpcode::kConstant, HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
 std::vector<GatherLoopFusionTestSpec> GetGatherLoopFusionTestSpecs() {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index a9febe891b5e9d1eb9e6b297952b50d1d26a3396..d8878e622c0500fc5328aa6c295a9e24a3a037f7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -84,31 +84,8 @@ extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName =
     "__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";
 extern const char* const kParallelForkJoinSymbolName =
     "__xla_cpu_runtime_ParallelForkJoin";
-extern const char* const kKeyValueSortPREDSymbolName =
-    "__xla_cpu_runtime_KeyValueSortPRED";
-extern const char* const kKeyValueSortS8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS8";
-extern const char* const kKeyValueSortU8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU8";
-extern const char* const kKeyValueSortS16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS16";
-extern const char* const kKeyValueSortU16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU16";
-extern const char* const kKeyValueSortF16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF16";
-extern const char* const kKeyValueSortS32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS32";
-extern const char* const kKeyValueSortU32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU32";
-extern const char* const kKeyValueSortF32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF32";
-extern const char* const kKeyValueSortS64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS64";
-extern const char* const kKeyValueSortU64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU64";
-extern const char* const kKeyValueSortF64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF64";
-
+extern const char* const kKeyValueSortSymbolName =
+    "__xla_cpu_runtime_KeyValueSort";
 extern const char* const kXlaCpuRuntimeSymbolNamePrefix = "__xla_cpu_runtime_";
 }  // namespace runtime
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index b2e760a224ad8eaa61dae57b0f9cece04a7e54ae..3a2b44d8c1a80128d3577c374e751e73a89e9d59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -64,18 +64,7 @@ extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
 extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
 extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;
 extern const char* const kParallelForkJoinSymbolName;
-extern const char* const kKeyValueSortPREDSymbolName;
-extern const char* const kKeyValueSortS8SymbolName;
-extern const char* const kKeyValueSortU8SymbolName;
-extern const char* const kKeyValueSortS16SymbolName;
-extern const char* const kKeyValueSortU16SymbolName;
-extern const char* const kKeyValueSortF16SymbolName;
-extern const char* const kKeyValueSortS32SymbolName;
-extern const char* const kKeyValueSortU32SymbolName;
-extern const char* const kKeyValueSortF32SymbolName;
-extern const char* const kKeyValueSortS64SymbolName;
-extern const char* const kKeyValueSortU64SymbolName;
-extern const char* const kKeyValueSortF64SymbolName;
+extern const char* const kKeyValueSortSymbolName;
 
 // All symbol names for XLA CPU runtime functions need to start with this
 // prefix.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 3361a5973f5e8c91802b26d68477347b196d3cac..fae9670051a654f38f09856368ffb700b0c7a085 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 48510181bd01c87c9db764396b556fdf34e6c8c4..6107d40ab51bf7fb510d65194d52b8072aff9c85 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -250,11 +250,6 @@ void DotOpEmitter::EmitTiledLlvmIrGemm() {
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
       GetGemmTileSize();
 
-  const bool enable_fast_math =
-      hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
-  const bool optimize_for_size =
-      options::OptimizeForSizeRequested(hlo_module_config_);
-
   EmitSmallGemm(
       /*scalar_type=*/primitive_type,
       /*m=*/m, /*k=*/k, /*n=*/n,
@@ -262,9 +257,7 @@ void DotOpEmitter::EmitTiledLlvmIrGemm() {
       /*max_vector_count=*/tile_size_n_in_vector_width,
       /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
       /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k, /*lhs=*/lhs,
-      /*rhs=*/rhs, /*result=*/target, b_,
-      /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size);
+      /*rhs=*/rhs, /*result=*/target, b_, hlo_module_config_);
 }
 
 void DotOpEmitter::EmitTiledLlvmIrGemv() {
@@ -323,11 +316,6 @@ void DotOpEmitter::EmitTiledLlvmIrGemv() {
   llvm::Value* rhs_op =
       swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer();
 
-  const bool enable_fast_math =
-      hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
-  const bool optimize_for_size =
-      options::OptimizeForSizeRequested(hlo_module_config_);
-
   const int target_vector_register_element_size =
       target_machine_features_.vector_register_num_elements(
           *b_->GetInsertBlock()->getParent(), primitive_type);
@@ -349,9 +337,7 @@ void DotOpEmitter::EmitTiledLlvmIrGemv() {
         /*tile_rows=*/vector_register_element_size, /*tile_cols=*/tiling_factor,
         /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
         /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
-        /*result=*/result_op, b_,
-        /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size);
+        /*result=*/result_op, b_, hlo_module_config_);
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
@@ -361,9 +347,7 @@ void DotOpEmitter::EmitTiledLlvmIrGemv() {
         /*tile_cols=*/vector_register_element_size,
         /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
         /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
-        /*result=*/result_op, b_,
-        /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size);
+        /*result=*/result_op, b_, hlo_module_config_);
   }
 }
 
@@ -445,10 +429,12 @@ void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
   llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(dot_hlo_name_), b_);
-  llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
-      lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
-  llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
-      rhs_array_, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
+  std::vector<llvm::Value*> lhs_multi_index =
+      loop_nest.EmitOperandArrayLoopNest(
+          lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
+  std::vector<llvm::Value*> rhs_multi_index =
+      loop_nest.EmitOperandArrayLoopNest(
+          rhs_array_, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
 
   // Create the loop which does the sum of products reduction.
   //
@@ -468,8 +454,12 @@ void DotOpEmitter::EmitNaiveLlvmIrGemm() {
 
   // The final entry in the rhs and lhs indexes is the indvar of the
   // reduction loop.
-  lhs_index[lhs_reduction_dimension] = reduction_loop->GetIndVarValue();
-  rhs_index[rhs_reduction_dimension] = reduction_loop->GetIndVarValue();
+  lhs_multi_index[lhs_reduction_dimension] = reduction_loop->GetIndVarValue();
+  llvm_ir::IrArray::Index lhs_index(lhs_multi_index, lhs_shape,
+                                    b_->getInt64Ty());
+  rhs_multi_index[rhs_reduction_dimension] = reduction_loop->GetIndVarValue();
+  llvm_ir::IrArray::Index rhs_index(rhs_multi_index, rhs_shape,
+                                    b_->getInt64Ty());
 
   // For computing the sum of products we alloca a single location to store the
   // dot product result as we accumulate it within the reduction loop. After the
@@ -532,18 +522,20 @@ void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   // the rhs and lhs indexes with the reduction dimensions removed. The terms
   // from the rhs index are the lower dimensions in the index so we add them
   // first.
-  llvm_ir::IrArray::Index target_index(lhs_index.GetType());
+  std::vector<llvm::Value*> target_multi_index;
   for (int dimension = 0; dimension < lhs_index.size(); ++dimension) {
     if (dimension != lhs_reduction_dimension) {
-      target_index.push_back(lhs_index[dimension]);
+      target_multi_index.push_back(lhs_index[dimension]);
     }
   }
   for (int dimension = 0; dimension < rhs_index.size(); ++dimension) {
     if (dimension != rhs_reduction_dimension) {
-      target_index.push_back(rhs_index[dimension]);
+      target_multi_index.push_back(rhs_index[dimension]);
     }
   }
 
+  llvm_ir::IrArray::Index target_index(
+      target_multi_index, target_array_.GetShape(), lhs_index.GetType());
   target_array_.EmitWriteArrayElement(target_index, result, b_);
 
   // Set the IR builder insert point to the exit basic block of the outer most
@@ -643,11 +635,13 @@ Status DotOpEmitter::EmitCallToRuntime() {
   llvm::Function* function = b_->GetInsertBlock()->getParent();
   llvm::Module* module = function->getParent();
 
-  llvm::Function* matmul_func = llvm::cast<llvm::Function>(
-      module->getOrInsertFunction(fn_name, matmul_type));
-  matmul_func->setCallingConv(llvm::CallingConv::C);
-  matmul_func->setDoesNotThrow();
-  matmul_func->setOnlyAccessesArgMemory();
+  llvm::FunctionCallee matmul_func =
+      module->getOrInsertFunction(fn_name, matmul_type);
+  if (auto* fn = llvm::dyn_cast<llvm::Function>(matmul_func.getCallee())) {
+    fn->setCallingConv(llvm::CallingConv::C);
+    fn->setDoesNotThrow();
+    fn->setOnlyAccessesArgMemory();
+  }
 
   // The Eigen runtime function expects column-major layout. If the matrices are
   // row major, then use the following identity to compute the product:
@@ -919,11 +913,11 @@ llvm_ir::IrArray SliceOutInnerArray(llvm_ir::IrArray outer_array,
   llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
 
   Shape inner_shape = DropFirstDim(outer_array.GetShape());
-  llvm_ir::IrArray::Index slice_index(b->getInt64Ty());
-  slice_index.push_back(batch_index);
-  slice_index.InsertAt(
-      /*index=*/1, outer_array.GetShape().dimensions_size() - 1,
-      b->getInt64(0));
+  std::vector<llvm::Value*> multidim_index(inner_shape.rank() + 1,
+                                           b->getInt64(0));
+  multidim_index[0] = batch_index;
+  llvm_ir::IrArray::Index slice_index(multidim_index, outer_array.GetShape(),
+                                      batch_index->getType());
   llvm::Value* slice_ptr = outer_array.EmitArrayElementAddress(slice_index, b);
   llvm::Type* slice_ptr_type =
       llvm_ir::ShapeToIrType(inner_shape, module)->getPointerTo();
@@ -961,8 +955,8 @@ Status EmitBatchDotOperation(
   KernelSupportLibrary ksl(b);
 
   return ksl.ForWithStatus(
-      "bdot", /*start=*/0, /*end=*/batch_count, /*step=*/1,
-      [&](llvm::Value* indvar) {
+      llvm_ir::IrName(&dot, "bdot"), /*start=*/0, /*end=*/batch_count,
+      /*step=*/1, [&](llvm::Value* indvar) {
         DotDimensionNumbers adjusted_dim_numbers = dot.dot_dimension_numbers();
         adjusted_dim_numbers.clear_lhs_batch_dimensions();
         adjusted_dim_numbers.clear_rhs_batch_dimensions();
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index c8312d80bd5012e5bcb42a410db18a7fa77a2eb6..fb021f277b0c796c4126e36da5dea405c603ccd1 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
+using xla::llvm_ir::IrArray;
+
 namespace xla {
 namespace cpu {
 
@@ -51,10 +53,11 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
       return Unimplemented("atan2");
   }
   // Create a function declaration.
-  llvm::Function* function =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          llvm_ir::AsStringRef(function_name), lhs->getType(), lhs->getType(),
-          rhs->getType()));
+  llvm::Function* function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(function_name, lhs->getType(), lhs->getType(),
+                                rhs->getType())
+          .getCallee());
   function->setCallingConv(llvm::CallingConv::C);
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
@@ -85,9 +88,11 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type,
       return Unimplemented("tanh");
   }
   // Create a function declaration.
-  llvm::Function* function = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(llvm_ir::AsStringRef(function_name),
-                                   value->getType(), value->getType()));
+  llvm::Function* function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(function_name, value->getType(),
+                                value->getType())
+          .getCallee());
   function->setCallingConv(llvm::CallingConv::C);
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
@@ -102,21 +107,43 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type,
 llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
     const HloInstruction* hlo,
     const HloToElementGeneratorMap& operand_to_generator) {
-  if (hlo->opcode() == HloOpcode::kMap) {
-    return [this, hlo, &operand_to_generator](
-               const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
-      std::vector<llvm::Value*> operands;
-      for (int i = 0; i < hlo->operand_count(); i++) {
-        TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
-                            operand_to_generator.at(hlo->operand(i))(
-                                ElementwiseSourceIndex(index, *hlo, i)));
-        operands.push_back(operand_value);
-      }
-      return ir_emitter_->EmitElementalMap(*Cast<HloMapInstruction>(hlo),
-                                           operands, llvm_ir::IrName(hlo));
-    };
+  switch (hlo->opcode()) {
+    case HloOpcode::kMap:
+      return [this, hlo, &operand_to_generator](
+                 const IrArray::Index& index) -> StatusOr<llvm::Value*> {
+        std::vector<llvm::Value*> operands;
+        for (int i = 0; i < hlo->operand_count(); i++) {
+          TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
+                              operand_to_generator.at(hlo->operand(i))(index));
+          operands.push_back(operand_value);
+        }
+        return ir_emitter_->EmitElementalMap(*Cast<HloMapInstruction>(hlo),
+                                             operands, llvm_ir::IrName(hlo));
+      };
+    case HloOpcode::kReduceWindow:
+      return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
+        return ir_emitter_->EmitElementalReduceWindow(
+            Cast<HloReduceWindowInstruction>(hlo),
+            operand_to_generator.at(hlo->operand(0)), index);
+      };
+    case HloOpcode::kConvolution:
+      return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
+        return ir_emitter_->EmitElementalConvolution(
+            Cast<HloConvolutionInstruction>(hlo),
+            operand_to_generator.at(hlo->operand(0)),
+            operand_to_generator.at(hlo->operand(1)), index);
+      };
+    case HloOpcode::kReduce:
+      return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
+        return ir_emitter_->EmitElementalReduce(
+            Cast<HloReduceInstruction>(hlo),
+            operand_to_generator.at(hlo->operand(0)),
+            operand_to_generator.at(hlo->operand(1)), index);
+      };
+    default:
+      return ElementalIrEmitter::MakeElementGenerator(hlo,
+                                                      operand_to_generator);
   }
-  return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator);
 }
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index a8b139aec9e96b6bb580baf74789df7c998cebf8..2cc618e430215e26cb41c0a24a9c01b1ae33cec1 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -72,7 +72,8 @@ bool PotentiallyImplementedAsEigenConvolution(
   CHECK(
       ShapeUtil::SameElementTypeIgnoringFpPrecision(input_shape, kernel_shape));
   // TODO(b/65408531): Explore using Eigen dot for complex64 type.
-  if (ShapeUtil::ElementIsComplex(input_shape)) {
+  PrimitiveType primitive_type = input_shape.element_type();
+  if (primitive_type != F16 && primitive_type != F32) {
     return false;
   }
   if (window_util::HasWindowReversal(convolution.window())) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index f8a997045a66545bca9a35b2e582bf015e659b48..19e7b13bb3ac3c08c0131510e97cb90ead1c9437 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -74,10 +74,8 @@ limitations under the License.
 namespace xla {
 
 namespace {
-using llvm_ir::AsStringRef;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
-namespace gtl = tensorflow::gtl;
 }  // namespace
 
 namespace cpu {
@@ -87,7 +85,8 @@ IrEmitter::IrEmitter(
     llvm::Module* llvm_module,
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx,
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx,
-    const TargetMachineFeatures* target_machine_features)
+    const TargetMachineFeatures* target_machine_features,
+    bool emit_code_for_msan)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
@@ -97,10 +96,9 @@ IrEmitter::IrEmitter(
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       is_top_level_computation_(false),
-      target_machine_features_(*target_machine_features) {
-  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
-      /*fast_math_enabled=*/hlo_module_config_.debug_options()
-          .xla_cpu_enable_fast_math()));
+      target_machine_features_(*target_machine_features),
+      emit_code_for_msan_(emit_code_for_msan) {
+  b_.setFastMathFlags(llvm_ir::GetCpuFastMathFlags(hlo_module_config_));
   Status s = GatherComputationsByAllocationType(
       &hlo_module, &thread_local_computations_, &global_computations_);
   absl::c_sort(thread_local_computations_);
@@ -159,11 +157,9 @@ void IrEmitter::InitializeIrFunction(const string& function_name) {
       is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage
                                 : llvm::GlobalValue::InternalLinkage;
   // Create and initialize new IrFunction.
-  compute_function_.reset(new IrFunction(
-      function_name, linkage,
-      options::OptimizeForSizeRequested(hlo_module_config_),
-      hlo_module_config_.debug_options().xla_cpu_enable_fast_math(), module_,
-      &b_, num_dynamic_loop_bounds_));
+  compute_function_.reset(new IrFunction(function_name, linkage,
+                                         hlo_module_config_, module_, &b_,
+                                         num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -172,8 +168,7 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
   VLOG(2) << "HandleBitcast: " << bitcast->ToString();
   emitted_value_[bitcast] =
       BitCast(GetEmittedValueFor(bitcast->operand(0)),
-              IrShapeType(bitcast->shape())->getPointerTo(),
-              AsStringRef(IrName(bitcast)));
+              IrShapeType(bitcast->shape())->getPointerTo(), IrName(bitcast));
   return Status::OK();
 }
 
@@ -188,6 +183,7 @@ llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
       /*Initializer=*/initializer,
       /*Name=*/"");
   result_global->setAlignment(MinimumAlignmentForShape(literal.shape()));
+  result_global->setUnnamedAddr(llvm::GlobalVariable::UnnamedAddr::Global);
   return llvm::ConstantExpr::getBitCast(
       result_global, IrShapeType(literal.shape())->getPointerTo());
 }
@@ -302,7 +298,7 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   const Shape& shape = get_tuple_element->shape();
   emitted_value_[get_tuple_element] = llvm_ir::EmitGetTupleElement(
       shape, get_tuple_element->tuple_index(), MinimumAlignmentForShape(shape),
-      GetEmittedValueFor(operand), &b_, module_);
+      GetEmittedValueFor(operand), &b_);
   return Status::OK();
 }
 
@@ -322,7 +318,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select));
   llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred),
                            GetEmittedValueFor(on_true),
-                           GetEmittedValueFor(on_false), &b_, module_);
+                           GetEmittedValueFor(on_false), &b_);
   return Status::OK();
 }
 
@@ -345,8 +341,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
                       assignment_.GetUniqueSlice(infeed, {1}));
   llvm::Value* token_address = EmitBufferPointer(
       token_slice, ShapeUtil::GetTupleElementShape(infeed->shape(), 1));
-  llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_,
-                     module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_);
 
   if (data_shape.IsTuple()) {
     TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape));
@@ -377,7 +372,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
     }
 
     llvm_ir::EmitTuple(llvm_ir::IrArray(data_address, data_shape),
-                       tuple_element_addresses, &b_, module_);
+                       tuple_element_addresses, &b_);
   } else {
     TF_RETURN_IF_ERROR(
         EmitXfeedTransfer(XfeedKind::kInfeed, data_shape, data_address));
@@ -412,11 +407,18 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   llvm::Function* acquire_func;
   if (kind == XfeedKind::kInfeed) {
-    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type));
+    acquire_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type)
+            .getCallee());
   } else {
-    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kAcquireOutfeedBufferForPopulationSymbolName, acquire_type));
+    acquire_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kAcquireOutfeedBufferForPopulationSymbolName,
+                acquire_type)
+            .getCallee());
   }
   acquire_func->setCallingConv(llvm::CallingConv::C);
 
@@ -429,11 +431,19 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   llvm::Function* release_func;
   if (kind == XfeedKind::kInfeed) {
-    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kReleaseInfeedBufferAfterDequeueSymbolName, release_type));
+    release_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kReleaseInfeedBufferAfterDequeueSymbolName,
+                release_type)
+            .getCallee());
   } else {
-    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kReleaseOutfeedBufferAfterPopulationSymbolName, release_type));
+    release_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kReleaseOutfeedBufferAfterPopulationSymbolName,
+                release_type)
+            .getCallee());
   }
   release_func->setCallingConv(llvm::CallingConv::C);
 
@@ -483,7 +493,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
         ShapeUtil::GetTupleElementShape(operand_shape, i);
     llvm::Value* tuple_element = llvm_ir::EmitGetTupleElement(
         tuple_element_shape, i, MinimumAlignmentForShape(tuple_element_shape),
-        value, &b_, module_);
+        value, &b_);
     TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kOutfeed,
                                          tuple_element_shape, tuple_element));
   }
@@ -495,6 +505,27 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
   const HloSortInstruction* sort = Cast<HloSortInstruction>(hlo);
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort));
   Shape keys_shape = sort->keys()->shape();
+  PrimitiveType keys_type = keys_shape.element_type();
+  switch (keys_type) {
+    case PRED:
+    case S8:
+    case U8:
+    case S16:
+    case U16:
+    case BF16:
+    case F16:
+    case S32:
+    case U32:
+    case F32:
+    case S64:
+    case U64:
+    case F64:
+      break;
+    default:
+      return Unimplemented(
+          "Element type %s not supported in the Sort op on CPU.",
+          PrimitiveType_Name(keys_type));
+  }
   std::vector<llvm::Value*> destination_addresses(sort->operand_count());
   for (int64 i = 0; i < sort->operand_count(); ++i) {
     ShapeIndex shape_index =
@@ -542,109 +573,50 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
     lower_dimensions *= normalized_keys_shape.dimensions(i);
   }
 
-  PrimitiveType keys_type = keys_shape.element_type();
-  const char* fn_name = nullptr;
-  llvm::Type* keys_native_type = nullptr;
-  switch (keys_type) {
-    case PRED:
-      fn_name = runtime::kKeyValueSortPREDSymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S8:
-      fn_name = runtime::kKeyValueSortS8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case U8:
-      fn_name = runtime::kKeyValueSortU8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S16:
-      fn_name = runtime::kKeyValueSortS16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case U16:
-      fn_name = runtime::kKeyValueSortU16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case F16:
-      fn_name = runtime::kKeyValueSortF16SymbolName;
-      keys_native_type = b_.getHalfTy()->getPointerTo();
-      break;
-    case S32:
-      fn_name = runtime::kKeyValueSortS32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case U32:
-      fn_name = runtime::kKeyValueSortU32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case F32:
-      fn_name = runtime::kKeyValueSortF32SymbolName;
-      keys_native_type = b_.getFloatTy()->getPointerTo();
-      break;
-    case S64:
-      fn_name = runtime::kKeyValueSortS64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case U64:
-      fn_name = runtime::kKeyValueSortU64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case F64:
-      fn_name = runtime::kKeyValueSortF64SymbolName;
-      keys_native_type = b_.getDoubleTy()->getPointerTo();
-      break;
-    default:
-      return Unimplemented(
-          "Element type %s not supported in the Sort op on CPU.",
-          PrimitiveType_Name(keys_type));
-  }
-
+  auto less_than_function = FindOrDie(emitted_functions_, sort->to_apply());
+  CHECK(absl::c_binary_search(thread_local_computations_, sort->to_apply()));
   llvm::FunctionType* key_value_sort_type = llvm::FunctionType::get(
       b_.getVoidTy(),
-      {keys_native_type, b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
+      {b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
        b_.getInt8PtrTy()->getPointerTo(), b_.getInt32Ty(),
-       b_.getInt32Ty()->getPointerTo()},
+       b_.getInt32Ty()->getPointerTo(), b_.getInt1Ty(), b_.getInt8PtrTy(),
+       b_.getInt64Ty()->getPointerTo(), less_than_function->getType()},
       /*isVarArg=*/false);
-  auto* key_value_sort_func = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(fn_name, key_value_sort_type));
+  auto* key_value_sort_func = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(runtime::kKeyValueSortSymbolName,
+                                key_value_sort_type)
+          .getCallee());
   key_value_sort_func->setCallingConv(llvm::CallingConv::C);
   key_value_sort_func->setDoesNotThrow();
-  llvm::Value* values;
-  llvm::Value* sizes;
-  if (sort->values_count() == 0) {
-    values = llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo());
-    sizes = llvm::Constant::getNullValue(b_.getInt32Ty()->getPointerTo());
-  } else {
-    values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt8PtrTy(), b_.getInt32(sort->values_count()),
-        "cc_values_alloca", &b_);
-    sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt32Ty(), b_.getInt32(sort->values_count()), "cc_sizes_alloca",
-        &b_);
-    for (int64 i = 0; i < sort->values_count(); ++i) {
-      llvm::Value* value_as_i8ptr =
-          PointerCast(destination_addresses[i + 1], b_.getInt8PtrTy());
-      llvm::Value* slot_in_values_alloca =
-          ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
-      Store(value_as_i8ptr, slot_in_values_alloca);
-      llvm::Value* slot_in_sizes_alloca =
-          ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
-      llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
-          sort->operand(i + 1)->shape().element_type()));
-      Store(size, slot_in_sizes_alloca);
-    }
+  llvm::Value* values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt8PtrTy(), b_.getInt32(sort->operand_count()), "cc_values_alloca",
+      &b_);
+  llvm::Value* sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt32Ty(), b_.getInt32(sort->operand_count()), "cc_sizes_alloca",
+      &b_);
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    llvm::Value* value_as_i8ptr =
+        PointerCast(destination_addresses[i], b_.getInt8PtrTy());
+    llvm::Value* slot_in_values_alloca =
+        ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
+    Store(value_as_i8ptr, slot_in_values_alloca);
+    llvm::Value* slot_in_sizes_alloca =
+        ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
+    llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
+        sort->operand(i)->shape().element_type()));
+    Store(size, slot_in_sizes_alloca);
   }
 
   Call(key_value_sort_func,
-       {PointerCast(destination_addresses[0], keys_native_type),
-        b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
+       {b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
         b_.getInt64(lower_dimensions), values,
-        b_.getInt32(sort->values_count()), sizes});
+        b_.getInt32(sort->operand_count()), sizes,
+        b_.getInt1(sort->is_stable()), GetExecutableRunOptionsArgument(),
+        GetProfileCountersArgument(), less_than_function});
 
   if (sort->values_count() > 0) {
-    llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_,
-                       module_);
+    llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_);
   }
   return Status::OK();
 }
@@ -655,7 +627,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
   for (auto operand : tuple->operands()) {
     base_ptrs.push_back(GetEmittedValueFor(operand));
   }
-  llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &b_, module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &b_);
   return Status::OK();
 }
 
@@ -665,8 +637,9 @@ llvm::Value* IrEmitter::EmitElementalMap(
   return EmitThreadLocalCall(*map_instr.to_apply(), elemental_operands, name);
 }
 
-StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
-    HloReduceWindowInstruction* reduce_window,
+StatusOr<llvm::Value*> IrEmitter::EmitElementalReduceWindow(
+    const HloReduceWindowInstruction* reduce_window,
+    const llvm_ir::ElementGenerator& input_generator,
     const llvm_ir::IrArray::Index& index) {
   const HloInstruction* operand = reduce_window->operand(0);
   const Window& window = reduce_window->window();
@@ -692,21 +665,22 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
 
   SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
-  llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), index.size());
+  std::vector<llvm::Value*> input_multi_index(index.size());
   llvm::Value* in_bounds_condition = nullptr;
   for (size_t i = 0; i < index.size(); ++i) {
     llvm::Value* strided_index =
         NSWMul(index[i], b_.getInt64(window.dimensions(i).stride()));
-    input_index[i] = NSWSub(
+    input_multi_index[i] = NSWSub(
         NSWAdd(strided_index,
                NSWMul(window_index[i],
                       b_.getInt64(window.dimensions(i).window_dilation()))),
         b_.getInt64(window.dimensions(i).padding_low()));
 
     // We need to verify that we are not in the dilated base area.
-    llvm::Value* dilation_condition = ICmpEQ(
-        SRem(input_index[i], b_.getInt64(window.dimensions(i).base_dilation())),
-        b_.getInt64(0));
+    llvm::Value* dilation_condition =
+        ICmpEQ(SRem(input_multi_index[i],
+                    b_.getInt64(window.dimensions(i).base_dilation())),
+               b_.getInt64(0));
     if (in_bounds_condition == nullptr) {
       in_bounds_condition = dilation_condition;
     } else {
@@ -714,15 +688,16 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
     }
 
     // Apply base dilation to the index.
-    input_index[i] =
-        SDiv(input_index[i], b_.getInt64(window.dimensions(i).base_dilation()));
-
-    // We need to check if 0 <= input_index[i] < bound, as otherwise we are in
-    // the padding so that we can skip the computation. That is equivalent to
-    // input_index[i] < bound as an *unsigned* comparison, since a negative
-    // value will wrap to a large positive value.
+    input_multi_index[i] =
+        SDiv(input_multi_index[i],
+             b_.getInt64(window.dimensions(i).base_dilation()));
+
+    // We need to check if 0 <= input_multi_index[i] < bound, as otherwise we
+    // are in the padding so that we can skip the computation. That is
+    // equivalent to input_multi_index[i] < bound as an *unsigned* comparison,
+    // since a negative value will wrap to a large positive value.
     llvm::Value* index_condition =
-        ICmpULT(input_index[i],
+        ICmpULT(input_multi_index[i],
                 b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
     if (in_bounds_condition == nullptr) {
       in_bounds_condition = index_condition;
@@ -737,8 +712,10 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
   SetToFirstInsertPoint(if_data.true_block, &b_);
 
   // We are not in the padding, so carry out the computation.
-  llvm_ir::IrArray input_array(GetIrArrayFor(operand));
-  llvm::Value* input_value = input_array.EmitReadArrayElement(input_index, &b_);
+  llvm_ir::IrArray::Index input_index(input_multi_index, operand->shape(),
+                                      b_.getInt64Ty());
+  TF_ASSIGN_OR_RETURN(llvm::Value* const input_value,
+                      input_generator(input_index));
   llvm::Value* result = EmitThreadLocalCall(
       *reduce_window->to_apply(), {Load(accumulator_address), input_value},
       "reducer_function");
@@ -749,11 +726,6 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
 }
 
 Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
-  TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
-      /*instruction=*/*reduce_window,
-      /*operands=*/{reduce_window->operand(0)},
-      /*supported_types=*/{F32, BF16, S32, F16}));
-
   // Pseudo code for reduce window:
   //
   //   for (coordinates O in the output)
@@ -767,11 +739,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   //
   // This is completely un-optimized and just here to have something
   // that works.
-  return EmitTargetElementLoop(
-      reduce_window, [&](const llvm_ir::IrArray::Index& index) {
-        return EmitTargetElementLoopBodyForReduceWindow(
-            Cast<HloReduceWindowInstruction>(reduce_window), index);
-      });
+  return DefaultAction(reduce_window);
 }
 
 Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
@@ -851,15 +819,16 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // Compute the operand index to visit and evaluate the condition whether the
   // operand index is within the bounds. The unsigned comparison includes
   // checking whether the operand index >= 0.
-  llvm_ir::IrArray::Index operand_index(b_.getInt64Ty(), source_index.size());
+  std::vector<llvm::Value*> operand_multi_index(source_index.size());
   llvm::Value* in_bounds_condition = b_.getTrue();
   for (int64 i = 0; i < rank; ++i) {
     llvm::Value* strided_index =
         NSWMul(source_index[i], b_.getInt64(window.dimensions(i).stride()));
-    operand_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]),
-                              b_.getInt64(window.dimensions(i).padding_low()));
+    operand_multi_index[i] =
+        NSWSub(NSWAdd(strided_index, window_index[i]),
+               b_.getInt64(window.dimensions(i).padding_low()));
     llvm::Value* index_condition =
-        ICmpULT(operand_index[i],
+        ICmpULT(operand_multi_index[i],
                 b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
     in_bounds_condition = And(in_bounds_condition, index_condition);
   }
@@ -885,6 +854,8 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
         }
       };
   llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
+  llvm_ir::IrArray::Index operand_index(
+      operand_multi_index, operand_array.GetShape(), b_.getInt64Ty());
   llvm::Value* operand_data =
       operand_array.EmitReadArrayElement(operand_index, &b_);
   Store(operand_data, selected_value_address);
@@ -918,16 +889,18 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // location is computed by calling the `scatter` function with the source
   // value and the current output value.
   SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), &b_);
-  llvm_ir::IrArray::Index selected_index(source_index.GetType());
+  std::vector<llvm::Value*> selected_multi_index;
   for (int64 i = 0; i < rank; ++i) {
     llvm::Value* selected_index_address_slot =
         InBoundsGEP(selected_index_address, {b_.getInt32(i)});
-    selected_index.push_back(Load(selected_index_address_slot));
+    selected_multi_index.push_back(Load(selected_index_address_slot));
   }
   llvm_ir::IrArray source_array(GetIrArrayFor(source));
   llvm::Value* source_value =
       source_array.EmitReadArrayElement(source_index, &b_);
   llvm_ir::IrArray output_array(GetIrArrayFor(select_and_scatter));
+  llvm_ir::IrArray::Index selected_index(
+      selected_multi_index, output_array.GetShape(), source_index.GetType());
   llvm::Value* output_value =
       output_array.EmitReadArrayElement(selected_index, &b_);
   llvm::Value* scatter_value =
@@ -974,8 +947,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
                           hlo_module_config_, target_machine_features_);
 }
 
-StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
-    HloConvolutionInstruction* convolution,
+StatusOr<llvm::Value*> IrEmitter::EmitElementalConvolution(
+    const HloConvolutionInstruction* convolution,
+    const llvm_ir::ElementGenerator& input_generator,
+    const llvm_ir::ElementGenerator& kernel_generator,
     const llvm_ir::IrArray::Index& index) {
   const HloInstruction* lhs = convolution->operand(0);
   const HloInstruction* rhs = convolution->operand(1);
@@ -1080,30 +1055,34 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
 
   // We are not in the padding, so carry out the computation.
   int num_dims = num_spatial_dims + 2;
-  llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), num_dims);
+  std::vector<llvm::Value*> input_multi_index(num_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
+    input_multi_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
   }
-  input_index[dnums.input_feature_dimension()] = input_feature;
-  input_index[dnums.input_batch_dimension()] = batch;
+  input_multi_index[dnums.input_feature_dimension()] = input_feature;
+  input_multi_index[dnums.input_batch_dimension()] = batch;
 
-  llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs));
-  llvm_ir::IrArray::Index kernel_index(b_.getInt64Ty(), num_dims);
+  std::vector<llvm::Value*> kernel_multi_index(num_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    kernel_index[dnums.kernel_spatial_dimensions(i)] =
+    kernel_multi_index[dnums.kernel_spatial_dimensions(i)] =
         window.dimensions(i).window_reversal()
             ? NSWSub(b_.getInt64(window.dimensions(i).size() - 1),
                      kernel_spatial[i])
             : kernel_spatial[i];
   }
 
-  kernel_index[dnums.kernel_input_feature_dimension()] = input_feature;
-  kernel_index[dnums.kernel_output_feature_dimension()] = output_feature;
+  kernel_multi_index[dnums.kernel_input_feature_dimension()] = input_feature;
+  kernel_multi_index[dnums.kernel_output_feature_dimension()] = output_feature;
 
-  llvm_ir::IrArray input_array(GetIrArrayFor(lhs));
-  llvm::Value* product =
-      FMul(input_array.EmitReadArrayElement(input_index, &b_),
-           kernel_array.EmitReadArrayElement(kernel_index, &b_));
+  llvm_ir::IrArray::Index input_index(input_multi_index, lhs->shape(),
+                                      b_.getInt64Ty());
+  TF_ASSIGN_OR_RETURN(llvm::Value* const input_value,
+                      input_generator(input_index));
+  llvm_ir::IrArray::Index kernel_index(kernel_multi_index, rhs->shape(),
+                                       b_.getInt64Ty());
+  TF_ASSIGN_OR_RETURN(llvm::Value* const kernel_value,
+                      kernel_generator(kernel_index));
+  llvm::Value* product = FMul(input_value, kernel_value);
   llvm::Value* sum = FAdd(Load(sum_address), product);
   Store(sum, sum_address);
 
@@ -1116,7 +1095,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   auto rhs = convolution->operand(1);
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*convolution, /*operands=*/{lhs, rhs},
-      /*supported_types=*/{F16, F32, C64, C128}));
+      /*supported_types=*/{F16, F32, F64, C64, C128}));
 
   // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support
   // different data layouts.
@@ -1229,8 +1208,8 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         LOG(WARNING) << "Using Eigen instead of MKL-DNN for single-threaded "
                         "conv2d function.";
       }
-      llvm::Function* conv_func = llvm::cast<llvm::Function>(
-          module_->getOrInsertFunction(fn_name, conv_type));
+      llvm::Function* conv_func = llvm::dyn_cast<llvm::Function>(
+          module_->getOrInsertFunction(fn_name, conv_type).getCallee());
       conv_func->setCallingConv(llvm::CallingConv::C);
       conv_func->setDoesNotThrow();
       conv_func->setOnlyAccessesArgMemory();
@@ -1271,11 +1250,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   //
   // See the description of convolution in the XLA documentation for the pseudo
   // code for convolution.
-  return EmitTargetElementLoop(
-      convolution, [&](const llvm_ir::IrArray::Index& index) {
-        return EmitTargetElementLoopBodyForConvolution(
-            Cast<HloConvolutionInstruction>(convolution), index);
-      });
+  return DefaultAction(convolution);
 }
 
 Status IrEmitter::HandleFft(HloInstruction* fft) {
@@ -1313,8 +1288,8 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
                             ? runtime::kEigenFftSymbolName
                             : runtime::kEigenSingleThreadedFftSymbolName;
 
-  llvm::Function* fft_func = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(fn_name, fft_type));
+  llvm::Function* fft_func = llvm::dyn_cast<llvm::Function>(
+      module_->getOrInsertFunction(fn_name, fft_type).getCallee());
   fft_func->setCallingConv(llvm::CallingConv::C);
   fft_func->setDoesNotThrow();
   fft_func->setOnlyAccessesInaccessibleMemOrArgMem();
@@ -1368,7 +1343,7 @@ Status IrEmitter::HandleAllReduce(HloInstruction* crs) {
     MemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
            /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape));
   }
-  llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_, module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_);
   return Status::OK();
 }
 
@@ -1617,22 +1592,23 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
 
   llvm_ir::ForLoopNest reduction_loop_nest(IrName(arg, "vectorized_inner"),
                                            &b_);
-  llvm_ir::IrArray::Index reduced_dims_index =
+  std::vector<llvm::Value*> input_multi_index =
       reduction_loop_nest.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                                        "reduction_dim");
 
   SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(), &b_);
 
   llvm_ir::IrArray arg_array(GetIrArrayFor(arg));
-  llvm_ir::IrArray::Index input_index = reduced_dims_index;
   llvm_ir::IrArray::Index::const_iterator it = output_index.begin();
 
-  for (size_t i = 0; i < input_index.size(); ++i) {
-    if (input_index[i] == nullptr) {
-      input_index[i] = *it++;
+  for (auto& i : input_multi_index) {
+    if (i == nullptr) {
+      i = *it++;
     }
   }
   CHECK(output_index.end() == it);
+  llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(),
+                                      b_.getInt64Ty());
 
   llvm::Value* input_address = BitCast(
       arg_array.EmitArrayElementAddress(input_index, &b_), b_.getInt8PtrTy());
@@ -1744,8 +1720,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
   //  }
 
   llvm_ir::ForLoopNest loop_nest(IrName(reduce), &b_);
-  llvm_ir::IrArray::Index array_index(b_.getInt64Ty(),
-                                      reduce->shape().dimensions_size());
+  std::vector<llvm::Value*> array_multi_index(
+      reduce->shape().dimensions_size());
   for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0;
        --i) {
     int64 dimension = LayoutUtil::Minor(reduce->shape().layout(), i);
@@ -1753,7 +1729,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     int64 end_index = reduce->shape().dimensions(dimension);
     std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
         start_index, end_index, absl::StrFormat("dim.%d", dimension));
-    array_index[dimension] = loop->GetIndVarValue();
+    array_multi_index[dimension] = loop->GetIndVarValue();
   }
 
   int64 innermost_dimension = LayoutUtil::Minor(reduce->shape().layout(), 0);
@@ -1774,12 +1750,14 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     std::unique_ptr<llvm_ir::ForLoop> loop =
         loop_nest.AddLoop(start_index, end_index, vectorization_factor,
                           absl::StrFormat("dim.%d", innermost_dimension));
-    array_index[innermost_dimension] = loop->GetIndVarValue();
+    array_multi_index[innermost_dimension] = loop->GetIndVarValue();
 
     SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &b_);
 
     ShardedVectorType vector_type = CreateShardedVectorType(
         reduce->shape().element_type(), vectorization_factor);
+    llvm_ir::IrArray::Index array_index(array_multi_index, reduce->shape(),
+                                        b_.getInt64Ty());
     TF_ASSIGN_OR_RETURN(std::vector<llvm::Value*> accumulator,
                         EmitInnerLoopForVectorizedReduction(
                             reduction_generator, array_index, vector_type,
@@ -1805,13 +1783,15 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
   // in the following case:
   if (innermost_dimension_size % vectorization_factor) {
     // TODO(b/63775531): Consider using a scalar loop here to save on code size.
-    array_index[innermost_dimension] =
+    array_multi_index[innermost_dimension] =
         b_.getInt64(innermost_dimension_size -
                     (innermost_dimension_size % vectorization_factor));
 
     ShardedVectorType vector_type = CreateShardedVectorType(
         reduce->shape().element_type(),
         innermost_dimension_size % vectorization_factor);
+    llvm_ir::IrArray::Index array_index(array_multi_index, reduce->shape(),
+                                        b_.getInt64Ty());
     TF_ASSIGN_OR_RETURN(std::vector<llvm::Value*> accumulator,
                         EmitInnerLoopForVectorizedReduction(
                             reduction_generator, array_index, vector_type,
@@ -1831,10 +1811,12 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
   return true;
 }
 
-StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
-    HloReduceInstruction* reduce, const llvm_ir::IrArray::Index& index) {
-  const HloInstruction* arg = reduce->mutable_operand(0);
-  const HloInstruction* init_value = reduce->mutable_operand(1);
+StatusOr<llvm::Value*> IrEmitter::EmitElementalReduce(
+    const HloReduceInstruction* reduce,
+    const llvm_ir::ElementGenerator& input_generator,
+    const llvm_ir::ElementGenerator& initial_value_generator,
+    const llvm_ir::IrArray::Index& index) {
+  const HloInstruction* arg = reduce->operand(0);
   absl::Span<const int64> dimensions(reduce->dimensions());
 
   // Initialize an accumulator with init_value.
@@ -1842,9 +1824,10 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
   llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), "accumulator",
       &b_, MinimumAlignmentForPrimitiveType(accumulator_type));
-  llvm::Value* init_value_addr = GetEmittedValueFor(init_value);
-  llvm::Value* load_init_value = Load(init_value_addr);
-  Store(load_init_value, accumulator_addr);
+  TF_ASSIGN_OR_RETURN(
+      llvm::Value* const init_value,
+      initial_value_generator(llvm_ir::IrArray::Index(index.GetType())));
+  Store(init_value, accumulator_addr);
 
   // The enclosing loops go over all the target elements. Now we have to compute
   // the actual target element. For this, we build a new loop nest to iterate
@@ -1852,7 +1835,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
   // AddLoopsForShapeOnDimensions will return an Index where induction Value*s
   // are placed for each dimension in dimensions, and all the rest are nullptrs.
   llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
-  const llvm_ir::IrArray::Index reduced_dims_index =
+  std::vector<llvm::Value*> input_multi_index =
       loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                          "reduction_dim");
 
@@ -1863,19 +1846,20 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
   // fill in the rest of the dimensions with induction Value*s taken from
   // 'index' which iterates over the target array.  See the high-level
   // description in the XLA documentation for details.
-  llvm_ir::IrArray arg_array(GetIrArrayFor(arg));
-  llvm_ir::IrArray::Index input_index = reduced_dims_index;
   llvm_ir::IrArray::Index::const_iterator it = index.begin();
 
-  for (size_t i = 0; i < input_index.size(); ++i) {
-    if (input_index[i] == nullptr) {
-      input_index[i] = *it++;
+  for (auto& i : input_multi_index) {
+    if (i == nullptr) {
+      i = *it++;
     }
   }
   CHECK(index.end() == it);
+  llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(),
+                                      b_.getInt64Ty());
 
   // Apply the reduction function to the loaded value.
-  llvm::Value* input_element = arg_array.EmitReadArrayElement(input_index, &b_);
+  TF_ASSIGN_OR_RETURN(llvm::Value* const input_element,
+                      input_generator(input_index));
   llvm::Value* result = EmitThreadLocalCall(
       *reduce->to_apply(), {Load(accumulator_addr), input_element},
       "reduce_function");
@@ -1886,7 +1870,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
+  // TODO(b/118333695): Support variadic reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on CPU");
   }
@@ -1910,11 +1894,11 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
     }
   }
 
-  return EmitTargetElementLoop(reduce,
-                               [&](const llvm_ir::IrArray::Index& index) {
-                                 return EmitTargetElementLoopBodyForReduce(
-                                     Cast<HloReduceInstruction>(reduce), index);
-                               });
+  return DefaultAction(reduce);
+}
+
+Status IrEmitter::HandleAllToAll(HloInstruction*) {
+  return Unimplemented("AllToAll is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSend(HloInstruction* send) {
@@ -2021,15 +2005,17 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
 
   const int64 num_outer_loops = outer_dims.size();
   llvm_ir::ForLoopNest loops(IrName(slice), &b_);
-  llvm_ir::IrArray::Index target_index =
+  std::vector<llvm::Value*> target_multi_index =
       loops.AddLoopsForShapeOnDimensions(slice->shape(), outer_dims, "slice");
 
   // Only the indices for the outer dimensions have been initialized in
   // target_index. The rest of the indices should get initialized to 0, since
   // for the rest of the dimensions the copy writes to the full dimension.
-  std::replace(target_index.begin(), target_index.end(),
+  std::replace(target_multi_index.begin(), target_multi_index.end(),
                static_cast<llvm::Value*>(nullptr),
                static_cast<llvm::Value*>(b_.getInt64(0)));
+  llvm_ir::IrArray::Index target_index(target_multi_index, slice->shape(),
+                                       b_.getInt64Ty());
 
   if (num_outer_loops > 0) {
     SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
@@ -2037,7 +2023,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
 
   llvm_ir::IrArray source_array = GetIrArrayFor(operand);
   const llvm_ir::IrArray::Index source_index = target_index.SourceIndexOfSlice(
-      /*shape=*/slice->shape(), /*starts=*/slice->slice_starts(),
+      /*operand_shape=*/operand->shape(), /*starts=*/slice->slice_starts(),
       /*strides=*/slice->slice_strides(), /*builder=*/&b_);
 
   llvm::Value* memcpy_dest =
@@ -2140,18 +2126,20 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   // Compute the output index the operand element should be assigned to.
   // output_index := edge_padding_low + operand_index * (interior_padding + 1)
   const PaddingConfig& padding_config = pad->padding_config();
-  llvm_ir::IrArray::Index output_index(operand_index.GetType());
+  std::vector<llvm::Value*> output_multi_index;
   for (size_t i = 0; i < operand_index.size(); ++i) {
     llvm::Value* offset =
         Mul(operand_index[i],
             b_.getInt64(padding_config.dimensions(i).interior_padding() + 1));
     llvm::Value* index = Add(
         offset, b_.getInt64(padding_config.dimensions(i).edge_padding_low()));
-    output_index.push_back(index);
+    output_multi_index.push_back(index);
   }
 
   // Store the operand element to the computed output location.
   llvm_ir::IrArray output_array(GetIrArrayFor(pad));
+  llvm_ir::IrArray::Index output_index(
+      output_multi_index, output_array.GetShape(), operand_index.GetType());
   output_array.EmitWriteArrayElement(output_index, operand_data, &b_);
 
   SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
@@ -2240,7 +2228,6 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 
 Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
   absl::Span<HloInstruction* const> operands(custom_call->operands());
-  absl::string_view custom_call_target(custom_call->custom_call_target());
   llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
   llvm::AllocaInst* operands_alloca =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
@@ -2253,13 +2240,34 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
         InBoundsGEP(operands_alloca, {b_.getInt64(i)});
     Store(operand_as_i8ptr, slot_in_operands_alloca);
   }
-  auto* custom_call_ir_function =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          AsStringRef(custom_call_target),
-          llvm::FunctionType::get(
-              /*Result=*/b_.getVoidTy(),
-              /*Params=*/{i8_ptr_type, operands_alloca->getType()},
-              /*isVarArg=*/false)));
+  if (emit_code_for_msan_) {
+    // Mark the alloca as initialized for msan. The buffer gets read by the
+    // custom callee, which might be msan-instrumented.
+    // TODO(b/66051036): Run the msan instrumentation pass instead.
+    const llvm::DataLayout& dl = module_->getDataLayout();
+    llvm::Type* intptr_type = b_.getIntPtrTy(dl);
+    auto* msan_unpoison_ir_function = llvm::cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                "__msan_unpoison",
+                llvm::FunctionType::get(
+                    /*Result=*/b_.getVoidTy(),
+                    /*Params=*/{i8_ptr_type, intptr_type}, /*isVarArg=*/false))
+            .getCallee());
+    Call(msan_unpoison_ir_function,
+         {PointerCast(operands_alloca, i8_ptr_type),
+          llvm::ConstantInt::get(
+              intptr_type, *operands_alloca->getAllocationSizeInBits(dl) / 8)});
+  }
+  auto* custom_call_ir_function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(
+              custom_call->custom_call_target(),
+              llvm::FunctionType::get(
+                  /*Result=*/b_.getVoidTy(),
+                  /*Params=*/{i8_ptr_type, operands_alloca->getType()},
+                  /*isVarArg=*/false))
+          .getCallee());
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
   // Write the tuple table if the output is a tuple.
@@ -2275,7 +2283,7 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
       llvm::Value* addr = EmitBufferPointer(slice, elem_shape);
       base_ptrs.push_back(addr);
     }
-    llvm_ir::EmitTuple(GetIrArrayFor(custom_call), base_ptrs, &b_, module_);
+    llvm_ir::EmitTuple(GetIrArrayFor(custom_call), base_ptrs, &b_);
   }
   auto* output_address_arg =
       PointerCast(GetEmittedValueFor(custom_call), i8_ptr_type);
@@ -2337,7 +2345,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
 
   // Terminates the current block with a branch to a while header.
   llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
-      module_->getContext(), AsStringRef(IrName(xla_while, "header")),
+      module_->getContext(), IrName(xla_while, "header"),
       compute_function_->function());
   Br(header_bb);
   b_.SetInsertPoint(header_bb);
@@ -2350,11 +2358,11 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
       llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0));
 
   // Branches to the body or to the while exit depending on the condition.
-  llvm::BasicBlock* body_bb = llvm::BasicBlock::Create(
-      module_->getContext(), AsStringRef(IrName(xla_while, "body")),
-      compute_function_->function());
+  llvm::BasicBlock* body_bb =
+      llvm::BasicBlock::Create(module_->getContext(), IrName(xla_while, "body"),
+                               compute_function_->function());
   llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
-      module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
+      module_->getContext(), IrName(xla_while, "exit"));
   CondBr(while_predicate, body_bb, exit_bb);
 
   // Calls the body function from the body block.
@@ -2409,11 +2417,13 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   llvm_ir::IrArray target_array = GetIrArrayFor(concatenate);
 
   llvm_ir::ForLoopNest loops(IrName(concatenate), &b_);
-  llvm_ir::IrArray::Index outer_dims_index =
+  std::vector<llvm::Value*> target_multi_index =
       loops.AddLoopsForShapeOnDimensions(output_shape, outer_dims, "concat");
-  std::replace(outer_dims_index.begin(), outer_dims_index.end(),
+  std::replace(target_multi_index.begin(), target_multi_index.end(),
                static_cast<llvm::Value*>(nullptr),
                static_cast<llvm::Value*>(b_.getInt64(0)));
+  llvm_ir::IrArray::Index target_index(target_multi_index, output_shape,
+                                       b_.getInt64Ty());
 
   if (!outer_dims.empty()) {
     SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
@@ -2425,10 +2435,9 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
 
   // Contiguous subregions from each operand to the concatenate contribute to a
   // contiguous subregion in the target buffer starting at target_region_begin.
-  llvm::Value* target_region_begin =
-      BitCast(target_array.EmitArrayElementAddress(outer_dims_index, &b_,
-                                                   "target_region"),
-              i8_ptr_type);
+  llvm::Value* target_region_begin = BitCast(
+      target_array.EmitArrayElementAddress(target_index, &b_, "target_region"),
+      i8_ptr_type);
   int64 byte_offset_into_target_region = 0;
 
   int64 inner_dims_product =
@@ -2443,7 +2452,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
     const Shape& input_shape = operand->shape();
     llvm_ir::IrArray source_array = GetIrArrayFor(operand);
     llvm::Value* copy_source_address = BitCast(
-        source_array.EmitArrayElementAddress(outer_dims_index, &b_, "src_addr"),
+        source_array.EmitArrayElementAddress(target_index, &b_, "src_addr"),
         i8_ptr_type);
 
     llvm::Value* copy_target_address =
@@ -2520,53 +2529,109 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
 }
 
 Status IrEmitter::HandleConditional(HloInstruction* conditional) {
-  auto pred = conditional->operand(0);
-  TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) &&
-               pred->shape().element_type() == PRED)
-      << "Predicate on a Conditional must be bool; got: "
-      << ShapeUtil::HumanString(pred->shape());
-
-  HloComputation* true_computation = conditional->true_computation();
-  HloComputation* false_computation = conditional->false_computation();
-  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
-                                true_computation->root_instruction()->shape()))
-      << "Shape of conditional should be same as the shape of the true "
-      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
-      << " and "
-      << ShapeUtil::HumanString(true_computation->root_instruction()->shape());
-
-  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
-                                false_computation->root_instruction()->shape()))
-      << "Shape of conditional should be same as the shape of the false "
-      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
-      << " and "
-      << ShapeUtil::HumanString(false_computation->root_instruction()->shape());
+  auto branch_index = conditional->operand(0);
+  int num_branches = conditional->branch_count();
+  TF_RET_CHECK(ShapeUtil::IsScalar(branch_index->shape()) &&
+               (branch_index->shape().element_type() == PRED ||
+                branch_index->shape().element_type() == S32))
+      << "Branch index on a conditional must be scalar bool or int32; got: "
+      << ShapeUtil::HumanString(branch_index->shape());
+
+  for (int b = 0; b < num_branches; ++b) {
+    HloComputation* br_computation = conditional->branch_computation(b);
+    TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                  br_computation->root_instruction()->shape()))
+        << "Shape of conditional should be same as the shape of the " << b
+        << "th branch computation; got: "
+        << ShapeUtil::HumanString(conditional->shape()) << " and "
+        << ShapeUtil::HumanString(br_computation->root_instruction()->shape());
+  }
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional));
 
-  // Generating:
-  //   if (pred)
-  //     cond_result = true_computation(true_operand)
-  //   else
-  //     cond_result = false_computation(false_operand)
-  llvm::LoadInst* pred_value =
-      Load(GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
-  llvm::Value* pred_cond = ICmpNE(
-      pred_value,
-      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
-      "boolean_predicate");
-  llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_);
-
-  SetToFirstInsertPoint(if_data.true_block, &b_);
-  EmitGlobalCall(*conditional->true_computation(),
-                 IrName(conditional, "_true"));
-
-  SetToFirstInsertPoint(if_data.false_block, &b_);
-  EmitGlobalCall(*conditional->false_computation(),
-                 IrName(conditional, "_false"));
-
-  SetToFirstInsertPoint(if_data.after_block, &b_);
+  if (branch_index->shape().element_type() == PRED) {
+    // Emit an if-else to LLVM:
+    //   if (pred)
+    //     cond_result = true_computation(true_operand)
+    //   else
+    //     cond_result = false_computation(false_operand)
+    llvm::LoadInst* pred_value = Load(
+        GetIrArrayFor(branch_index).GetBasePointer(), "load_predicate_value");
+    llvm::Value* pred_cond =
+        ICmpNE(pred_value,
+               llvm::ConstantInt::get(
+                   llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+               "boolean_predicate");
+    llvm_ir::LlvmIfData if_data =
+        llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_);
+
+    SetToFirstInsertPoint(if_data.true_block, &b_);
+    EmitGlobalCall(*conditional->branch_computation(0),
+                   IrName(conditional, "_true"));
+
+    SetToFirstInsertPoint(if_data.false_block, &b_);
+    EmitGlobalCall(*conditional->branch_computation(1),
+                   IrName(conditional, "_false"));
+
+    SetToFirstInsertPoint(if_data.after_block, &b_);
+    return Status::OK();
+  }
+  // We emit a switch statement to LLVM:
+  // switch (branch_index) {
+  //   default:
+  //     result = branch_computations[num_branches-1](operands[num_branches-1]);
+  //     break;
+  //   case 0:
+  //     result = branch_computations[0](operands[0]); break;
+  //   case 1:
+  //     result = branch_computations[1](operands[1]); break;
+  //   ...
+  //   case [[num_branches-2]]:
+  //     result = branch_computations[num_branches-2](operands[num_branches-2]);
+  //     break;
+  // }
+  llvm::LoadInst* branch_index_value = Load(
+      GetIrArrayFor(branch_index).GetBasePointer(), "load_branch_index_value");
+
+  auto case_block = b_.GetInsertBlock();
+  llvm::BasicBlock* after_block;
+  // Add a terminator to the case block, if necessary.
+  if (case_block->getTerminator() == nullptr) {
+    after_block = llvm_ir::CreateBasicBlock(nullptr, "case-after", &b_);
+    b_.SetInsertPoint(case_block);
+    b_.CreateBr(after_block);
+  } else {
+    after_block =
+        case_block->splitBasicBlock(b_.GetInsertPoint(), "case-after");
+  }
+  // Our basic block should now end with an unconditional branch.  Remove it;
+  // we're going to replace it with a switch based branch.
+  case_block->getTerminator()->eraseFromParent();
+
+  // Lower the default branch computation.
+  auto default_block = llvm_ir::CreateBasicBlock(nullptr, "case-default", &b_);
+  b_.SetInsertPoint(default_block);
+  EmitGlobalCall(*conditional->branch_computation(num_branches - 1),
+                 IrName(conditional, "_default"));
+  b_.CreateBr(after_block);
+
+  // Prepare the switch (branch_index) { ... } instruction.
+  b_.SetInsertPoint(case_block);
+  llvm::SwitchInst* case_inst =
+      b_.CreateSwitch(branch_index_value, default_block, num_branches - 1);
+  // Lower each branch's computation.
+  for (int b = 0; b < num_branches - 1; ++b) {  // last branch is default
+    // Lower the case b: { ... ; break; } computation.
+    auto branch_block =
+        llvm_ir::CreateBasicBlock(nullptr, absl::StrCat("case-branch", b), &b_);
+    b_.SetInsertPoint(branch_block);
+    EmitGlobalCall(*conditional->branch_computation(b),
+                   IrName(conditional, absl::StrCat("_branch", b)));
+    b_.CreateBr(after_block);
+    case_inst->addCase(b_.getInt32(b), branch_block);
+  }
+
+  SetToFirstInsertPoint(after_block, &b_);
   return Status::OK();
 }
 
@@ -2641,7 +2706,7 @@ llvm::Value* IrEmitter::GetProfileCounterCommon(
   int64 prof_counter_idx = it->second;
   string counter_name = IrName("prof_counter", hlo.name());
   return GEP(GetProfileCountersArgument(), b_.getInt64(prof_counter_idx),
-             AsStringRef(counter_name));
+             counter_name);
 }
 
 void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b,
@@ -2685,7 +2750,7 @@ llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(llvm::IRBuilder<>* b) {
 void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* b,
                                                  HloInstruction* hlo) {
   auto* cycle_start = ReadCycleCounter(b);
-  cycle_start->setName(AsStringRef(IrName(hlo, "cycle_start")));
+  cycle_start->setName(IrName(hlo, "cycle_start"));
   cycle_starts_[hlo] = cycle_start;
   if (first_read_cycle_start_ == nullptr) {
     first_read_cycle_start_ = cycle_start;
@@ -2696,7 +2761,7 @@ void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* b,
                                                  HloInstruction* hlo,
                                                  llvm::Value* prof_counter) {
   auto* cycle_end = ReadCycleCounter(b);
-  cycle_end->setName(AsStringRef(IrName(hlo, "cycle_end")));
+  cycle_end->setName(IrName(hlo, "cycle_end"));
   auto* cycle_start = cycle_starts_[hlo];
   UpdateProfileCounter(b, prof_counter, cycle_end, cycle_start);
   last_read_cycle_end_ = cycle_end;
@@ -2869,7 +2934,7 @@ Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
   TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
                       assignment_.GetUniqueTopLevelSlice(op));
   llvm::Value* addr = EmitBufferPointer(slice, target_shape);
-  addr->setName(AsStringRef(IrName(op)));
+  addr->setName(IrName(op));
   emitted_value_[op] = addr;
   return Status::OK();
 }
@@ -2909,7 +2974,7 @@ Status IrEmitter::EmitTargetElementLoop(
     for (int64 i = 0; i < output_arrays.size(); ++i) {
       tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
     }
-    llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &b_, module_);
+    llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &b_);
 
   } else {
     if (ShouldEmitParallelLoopFor(*target_op)) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index a6fb11dcbf9bb201ba8837866e2f509c48bfd061..e183ae01070e7d42701a3a32d5ddb8667e163663 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -72,13 +72,15 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   //              index in the profiling array.
   // computation_to_profile_idx: the mapping from HLO computations to their
   //              index in the profiling array.
+  // emit_code_for_msan: whether emitted code should be compatible with msan.
   IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
             llvm::Module* llvm_module,
             std::unordered_map<const HloInstruction*, int64>
                 instruction_to_profile_idx,
             std::unordered_map<const HloComputation*, int64>
                 computation_to_profile_idx,
-            const TargetMachineFeatures* target_machine);
+            const TargetMachineFeatures* target_machine,
+            bool emit_code_for_msan);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -116,6 +118,23 @@ class IrEmitter : public DfsHloVisitorWithDefault,
       const HloMapInstruction& map_instr,
       absl::Span<llvm::Value* const> elemental_operands,
       absl::string_view name);
+  // Emit code to emit the element at `index` for a reduce window instruction.
+  StatusOr<llvm::Value*> EmitElementalReduceWindow(
+      const HloReduceWindowInstruction* reduce_window,
+      const llvm_ir::ElementGenerator& input_generator,
+      const llvm_ir::IrArray::Index& index);
+  // Emit code to emit the element at `index` for a convolution instruction.
+  StatusOr<llvm::Value*> EmitElementalConvolution(
+      const HloConvolutionInstruction* convolution,
+      const llvm_ir::ElementGenerator& input_generator,
+      const llvm_ir::ElementGenerator& kernel_generator,
+      const llvm_ir::IrArray::Index& index);
+  // Emit code to emit the element at `index` for a reduce instruction.
+  StatusOr<llvm::Value*> EmitElementalReduce(
+      const HloReduceInstruction* reduce,
+      const llvm_ir::ElementGenerator& input_generator,
+      const llvm_ir::ElementGenerator& initial_value_generator,
+      const llvm_ir::IrArray::Index& index);
 
  protected:
   //
@@ -125,6 +144,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // special in some way are handled explicitly in HandleFoo methods.
   Status DefaultAction(HloInstruction* hlo) override;
 
+  Status HandleAllToAll(HloInstruction* instruction) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleConstant(HloInstruction* constant) override;
   Status HandleCopy(HloInstruction* copy) override;
@@ -250,14 +270,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   llvm::Value* EmitBufferPointer(const BufferAllocation::Slice& slice,
                                  const Shape& target_shape);
 
-  // Emits a function into the current module. This can be used for
-  // computations embedded inside other computations, such as the
-  // function that a map operation applies.
-  StatusOr<llvm::Function*> EmitFunction(
-      HloComputation* function,  // The function to emit.
-      absl::string_view
-          function_name_suffix);  // Used for LLVM IR register names.
-
   // Emits a call to a thread local function (e.g. to the computation nested
   // within a reduce or a map).  Thread local callees (by definition) only write
   // to and read from thread local allocations.
@@ -532,17 +544,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // Returns the number of bytes within the shape.
   int64 ByteSizeOf(const Shape& shape) const;
 
-  StatusOr<llvm::Value*> EmitTargetElementLoopBodyForMap(
-      HloMapInstruction* map, const llvm_ir::IrArray::Index& index);
-  StatusOr<llvm::Value*> EmitTargetElementLoopBodyForReduceWindow(
-      HloReduceWindowInstruction* reduce_window,
-      const llvm_ir::IrArray::Index& index);
-  StatusOr<llvm::Value*> EmitTargetElementLoopBodyForConvolution(
-      HloConvolutionInstruction* convolution,
-      const llvm_ir::IrArray::Index& index);
-  StatusOr<llvm::Value*> EmitTargetElementLoopBodyForReduce(
-      HloReduceInstruction* reduce, const llvm_ir::IrArray::Index& index);
-
   enum class XfeedKind {
     kInfeed,
     kOutfeed,
@@ -582,6 +583,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   std::vector<const HloComputation*> thread_local_computations_;
   std::vector<const HloComputation*> global_computations_;
 
+  bool emit_code_for_msan_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index adfb8392bf6fa356f0a5cdab3ff74036eca8918e..42acd72f9661df8e6687cf544957dce112954dc5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -24,11 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace xla {
-
-namespace {
-using llvm_ir::AsStringRef;
-}  // namespace
-
 namespace cpu {
 
 static std::vector<llvm::Type*> GetComputeFunctionParams(
@@ -48,15 +43,14 @@ static std::vector<llvm::Type*> GetComputeFunctionParams(
 
 IrFunction::IrFunction(const string& function_name,
                        llvm::Function::LinkageTypes linkage,
-                       const bool optimize_for_size_requested,
-                       const bool enable_fast_math, llvm::Module* llvm_module,
-                       llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds)
+                       const HloModuleConfig& module_config,
+                       llvm::Module* llvm_module, llvm::IRBuilder<>* b,
+                       int64 num_dynamic_loop_bounds)
     : b_(b),
       llvm_module_(llvm_module),
       caller_insert_point_guard_(*b),
       num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
-  Initialize(function_name, linkage, optimize_for_size_requested,
-             enable_fast_math);
+  Initialize(function_name, linkage, module_config);
 }
 
 IrFunction::~IrFunction() {
@@ -75,8 +69,7 @@ DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
 
 void IrFunction::Initialize(const string& function_name,
                             llvm::Function::LinkageTypes linkage,
-                            const bool optimize_for_size_requested,
-                            const bool enable_fast_math) {
+                            const HloModuleConfig& module_config) {
   // The function signature is:
   //   void function(i8* retval, i8* run_options, i8** params, i8**
   //   buffer_table,
@@ -147,11 +140,8 @@ void IrFunction::Initialize(const string& function_name,
   // Functions with local linkage get an inlining bonus.  Because we know
   // a-priori that embedded functions (non-entry functions) will not have its
   // name resolved, give it local linkage.
-  function_ =
-      llvm_ir::CreateFunction(function_type, linkage,
-                              /*enable_fast_math=*/enable_fast_math,
-                              /*optimize_for_size=*/optimize_for_size_requested,
-                              function_name, llvm_module_);
+  function_ = llvm_ir::CreateCpuFunction(function_type, linkage, module_config,
+                                         function_name, llvm_module_);
 
   // Set meaningful names for the function's arguments: useful for debugging.
   llvm::Function::arg_iterator arg_iter = function_->arg_begin();
@@ -193,7 +183,7 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
   CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
   string name = absl::StrCat("dynamic_loop_bound_", offset);
   return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
-                                      b_->getInt64(offset), AsStringRef(name)));
+                                      b_->getInt64(offset), name));
 }
 
 // Emits code to allocate an array of parameter address pointers, and store
@@ -216,10 +206,9 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
         absl::StrCat(name, "_parameter_addresses"), b);
 
     for (size_t i = 0; i < parameter_addresses.size(); ++i) {
-      llvm::Value* parameter_as_i8ptr =
-          b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(),
-                           AsStringRef(absl::StrCat(name, "_parameter_", i,
-                                                    "_address_as_i8ptr")));
+      llvm::Value* parameter_as_i8ptr = b->CreateBitCast(
+          parameter_addresses[i], b->getInt8PtrTy(),
+          absl::StrCat(name, "_parameter_", i, "_address_as_i8ptr"));
       llvm::Value* slot_in_param_addresses =
           b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)});
       b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
@@ -266,9 +255,11 @@ Status EmitCallToParallelForkJoin(
       /*Params=*/compute_function_params,
       /*isVarArg=*/false);
 
-  llvm::Function* fork_join_func =
-      llvm::cast<llvm::Function>(module->getOrInsertFunction(
-          runtime::kParallelForkJoinSymbolName, fork_join_type));
+  llvm::Function* fork_join_func = llvm::dyn_cast<llvm::Function>(
+      module
+          ->getOrInsertFunction(runtime::kParallelForkJoinSymbolName,
+                                fork_join_type)
+          .getCallee());
   fork_join_func->setCallingConv(llvm::CallingConv::C);
   fork_join_func->setDoesNotThrow();
 
@@ -322,7 +313,7 @@ Status EmitCallToParallelForkJoin(
       /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
       /*Initializer=*/partitions_array,
       /*Name=*/
-      AsStringRef(absl::StrCat(name, "_parallel_dimension_partitions")));
+      absl::StrCat(name, "_parallel_dimension_partitions"));
 
   // Add argument specifying parallel dimension partitions.
   fork_join_arguments.push_back(
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index 623a5f185fa1fd0526bc8664e2ba11c9dde79b1d..02bcec9dfc783d4eea653d6d74e903909b666b98 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -52,8 +53,7 @@ namespace cpu {
 class IrFunction {
  public:
   IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
-             const bool optimize_for_size_requested,
-             const bool enable_fast_math, llvm::Module* llvm_module,
+             const HloModuleConfig& module_config, llvm::Module* llvm_module,
              llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds);
   ~IrFunction();
 
@@ -92,7 +92,7 @@ class IrFunction {
   // Initialize an llvm::Function with standard signature based on arguments.
   void Initialize(const string& function_name,
                   llvm::Function::LinkageTypes linkage,
-                  bool optimize_for_size_requested, bool enable_fast_math);
+                  const HloModuleConfig& module_config);
 
   // Emit ir to read and return the ir value for the dynamic loop bound at
   // 'offset' from the "dynamic_loop_bounds" argument of this function.
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index f9722ffadac801521ddcbb568dd4435fd02e951b..93ef51754d21ad3ff4e24298c89649ef4c2742fb 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -36,57 +36,88 @@ const char* const kLogV4F32SymbolName = "__xla_cpu_runtime_LogV4F32AVX";
 const char* const kLogV8F32SymbolName = "__xla_cpu_runtime_LogV8F32AVX";
 
 namespace {
-llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
-                                          llvm::StringRef function_name,
-                                          int vector_width,
-                                          bool enable_fast_math) {
-  llvm::Function* vector_tanh_function = module->getFunction(function_name);
-  if (vector_tanh_function == nullptr) {
+
+// Replaces calls to the function `fn_name` with the code generated by
+// fn_body_generator.
+//
+// We assume that fn_name accepts either a scalar f32 or a vector of
+// vector_width f32s, and that fn_body_generator generates a function body with
+// the same inputs/outputs as fn_name.
+void RewriteCalls(
+    llvm::Module* module, const char* fn_name,
+    std::function<llvm::Value*(llvm::IRBuilder<>* b, llvm::Value* input,
+                               int32 vector_width)>
+        fn_body_generator,
+    int32 vector_width, bool enable_fast_math) {
+  llvm::Function* fn = module->getFunction(fn_name);
+  if (fn == nullptr) {
     // If the function declaration is not present in the module, there can't be
     // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
+    return;
   }
 
-  llvm::LLVMContext* context = &module->getContext();
+  // Our task is to generate a function body for `fn`, but we can't generate a
+  // function body for an LLVM intrinsic. So if fn is an intrinsic, replace it
+  // with a new function.
+  if (fn->isIntrinsic()) {
+    llvm::Function* new_fn = llvm::Function::Create(
+        fn->getFunctionType(), llvm::GlobalValue::InternalLinkage,
+        llvm::Twine("xla_impl.") + fn_name, module);
+    fn->replaceAllUsesWith(new_fn);
+    fn->eraseFromParent();
+    fn = new_fn;
+  }
 
-  llvm::BasicBlock* vector_tanh_body =
-      llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
+  llvm::LLVMContext* context = &module->getContext();
 
-  llvm::IRBuilder<> b(vector_tanh_body);
+  llvm::BasicBlock* fn_body = llvm::BasicBlock::Create(*context, "body", fn);
+  llvm::IRBuilder<> b(fn_body);
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast(enable_fast_math);
   b.setFastMathFlags(fast_math_flags);
 
-  llvm::Value* input = &*vector_tanh_function->arg_begin();
-  CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
-  b.CreateRet(llvm_ir::EmitFastTanh(&b, input));
-
-  DCHECK(!llvm::verifyFunction(*vector_tanh_function));
-  return vector_tanh_function;
-}
+  llvm::Value* input = &*fn->arg_begin();
 
-llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
-                                         llvm::StringRef function_name,
-                                         int vector_width,
-                                         bool enable_fast_math) {
-  llvm::Function* vector_exp_function = module->getFunction(function_name);
-  if (vector_exp_function == nullptr) {
-    // If the function declaration is not present in the module, there can't be
-    // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
+  // Upcast to vector type if input is a scalar.
+  if (vector_width == 1) {
+    llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1);
+    input = b.CreateInsertElement(llvm::UndefValue::get(v1_type), input,
+                                  uint64_t{0});
   }
 
-  llvm::LLVMContext* context = &module->getContext();
+  // Generate the vectorized code.
+  CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
+  llvm::Value* result = fn_body_generator(&b, input, vector_width);
+
+  // Downcast result to scalar type if necessary.
+  if (vector_width == 1) {
+    result = b.CreateExtractElement(result, uint64_t{0});
+  }
+  b.CreateRet(result);
+  DCHECK(!llvm::verifyFunction(*fn));
 
-  llvm::BasicBlock* vector_exp_body =
-      llvm::BasicBlock::Create(*context, "body", vector_exp_function);
+  // Force-inline `fn` into all of its callers and then delete `fn`.
+  //
+  // TODO(b/73081976): Should we avoid inlining these in some cases?
+  std::vector<llvm::CallInst*> calls_to_inline;
+  for (auto* user : fn->users()) {
+    calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
+  }
+  for (auto* call_to_inline : calls_to_inline) {
+    llvm::InlineFunctionInfo inline_function_info;
+    CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
+  }
+  fn->eraseFromParent();
+}
 
-  llvm::IRBuilder<> b(vector_exp_body);
-  llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setFast();
-  b.setFastMathFlags(fast_math_flags);
+llvm::Value* GenerateVF32Tanh(llvm::IRBuilder<>* b, llvm::Value* input,
+                              int32 /*vector_width*/) {
+  return llvm_ir::EmitFastTanh(b, input);
+}
 
-  VectorSupportLibrary vsl(F32, vector_width, &b, "exp_f32");
+llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input,
+                             int32 vector_width) {
+  VectorSupportLibrary vsl(F32, vector_width, b, "exp_f32");
 
   // This implements the same polynomial approximation as implemented in Eigen3.
 
@@ -107,7 +138,6 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   const llvm::APFloat cephes_exp_p4 = GetIeeeF32(1.6666665459E-1);
   const llvm::APFloat cephes_exp_p5 = GetIeeeF32(5.0000001201E-1);
 
-  llvm::Value* input = &*vector_exp_function->arg_begin();
   llvm::Value* input_clamped =
       vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi);
   llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, half));
@@ -128,49 +158,24 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
+      b->CreateVectorSplat(vector_width, b->getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      b.CreateVectorSplat(vector_width, b.getInt32(23));
+      b->CreateVectorSplat(vector_width, b->getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(b.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b->getInt32Ty(), vector_width);
   // fx is clamped so we don't have to worry about it being out of range for
   // i32.
-  llvm::Value* emm0 = b.CreateFPToSI(fx, i32_vector_type);
-  emm0 = b.CreateAdd(emm0, vector_constant_0x7f);
-  emm0 = b.CreateShl(emm0, vector_constant_23);
-  llvm::Value* emm0_f32 = b.CreateBitCast(emm0, vsl.vector_type());
-
-  llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
+  llvm::Value* emm0 = b->CreateFPToSI(fx, i32_vector_type);
+  emm0 = b->CreateAdd(emm0, vector_constant_0x7f);
+  emm0 = b->CreateShl(emm0, vector_constant_23);
+  llvm::Value* emm0_f32 = b->CreateBitCast(emm0, vsl.vector_type());
 
-  b.CreateRet(result);
-
-  DCHECK(!llvm::verifyFunction(*vector_exp_function));
-  return vector_exp_function;
+  return vsl.Max(vsl.Mul(y, emm0_f32), input);
 }
 
-llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
-                                         llvm::StringRef function_name,
-                                         int vector_width,
-                                         bool enable_fast_math) {
-  llvm::Function* vector_log_function = module->getFunction(function_name);
-  if (vector_log_function == nullptr) {
-    // If the function declaration is not present in the module, there can't be
-    // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
-  }
-
-  llvm::LLVMContext* context = &module->getContext();
-
-  llvm::BasicBlock* vector_log_body =
-      llvm::BasicBlock::Create(*context, "body", vector_log_function);
-
-  llvm::IRBuilder<> b(vector_log_body);
-  llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setFast();
-  b.setFastMathFlags(fast_math_flags);
-
-  llvm::Value* input = &*vector_log_function->arg_begin();
-  VectorSupportLibrary vsl(F32, vector_width, &b, "log_f32");
+llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input,
+                             int32 vector_width) {
+  VectorSupportLibrary vsl(F32, vector_width, b, "log_f32");
 
   const llvm::APFloat half = GetIeeeF32(0.5);
   const llvm::APFloat one = GetIeeeF32(1.0);
@@ -193,129 +198,107 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
   // The smallest non denormalized float number.
   const llvm::APFloat min_norm_pos = GetIeeeF32FromBitwiseRep(0x00800000);
   const llvm::APFloat minus_inf = GetIeeeF32FromBitwiseRep(0xff800000);
+  const llvm::APFloat pos_inf = GetIeeeF32FromBitwiseRep(0x7f800000);
   const llvm::APFloat inv_mant_mask = GetIeeeF32FromBitwiseRep(~0x7f800000);
 
   // invalid_mask is set if x is negative or NaN (and therefore output
   // must be NaN).
   llvm::Value* invalid_mask = vsl.FCmpULEMask(input, vsl.GetZeroVector());
-  llvm::Value* iszero_mask = vsl.FCmpEQMask(input, vsl.GetZeroVector());
+  llvm::Value* is_zero_mask = vsl.FCmpEQMask(input, vsl.GetZeroVector());
+  llvm::Value* is_pos_inf_mask = vsl.FCmpEQMask(input, pos_inf);
 
   // Cut off denormalized stuff.
-  input = vsl.Max(min_norm_pos, input);
+  llvm::Value* tmp0 = vsl.Max(min_norm_pos, input);
 
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
+      b->CreateVectorSplat(vector_width, b->getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      b.CreateVectorSplat(vector_width, b.getInt32(23));
+      b->CreateVectorSplat(vector_width, b->getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(b.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b->getInt32Ty(), vector_width);
 
-  llvm::Value* emm0 =
-      b.CreateLShr(b.CreateBitCast(input, i32_vector_type), vector_constant_23);
+  llvm::Value* emm0 = b->CreateLShr(b->CreateBitCast(tmp0, i32_vector_type),
+                                    vector_constant_23);
 
   // Keep only the fractional part.
-  input = vsl.FloatAnd(input, inv_mant_mask);
-  input = vsl.FloatOr(input, half);
+  tmp0 = vsl.FloatAnd(tmp0, inv_mant_mask);
+  tmp0 = vsl.FloatOr(tmp0, half);
 
-  emm0 = b.CreateSub(emm0, vector_constant_0x7f);
-  llvm::Value* e = vsl.Add(one, b.CreateSIToFP(emm0, vsl.vector_type()));
+  emm0 = b->CreateSub(emm0, vector_constant_0x7f);
+  llvm::Value* e = vsl.Add(one, b->CreateSIToFP(emm0, vsl.vector_type()));
 
   // part2:
   //   if( x < SQRTHF ) {
   //     e -= 1;
   //     x = x + x - 1.0;
   //   } else { x = x - 1.0; }
-  llvm::Value* mask = vsl.FCmpOLTMask(input, cephes_SQRTHF);
-  llvm::Value* tmp = vsl.FloatAnd(input, mask);
-  input = vsl.Sub(input, one);
+  llvm::Value* mask = vsl.FCmpOLTMask(tmp0, cephes_SQRTHF);
+  llvm::Value* tmp1 = vsl.FloatAnd(tmp0, mask);
+  tmp0 = vsl.Sub(tmp0, one);
   e = vsl.Sub(e, vsl.FloatAnd(mask, one));
-  input = vsl.Add(input, tmp);
+  tmp0 = vsl.Add(tmp0, tmp1);
 
-  llvm::Value* x2 = vsl.Mul(input, input);
-  llvm::Value* x3 = vsl.Mul(x2, input);
+  llvm::Value* x2 = vsl.Mul(tmp0, tmp0);
+  llvm::Value* x3 = vsl.Mul(x2, tmp0);
 
   llvm::Value *y, *y1, *y2;
-  y = vsl.MulAdd(input, cephes_log_p0, cephes_log_p1);
-  y1 = vsl.MulAdd(input, cephes_log_p3, cephes_log_p4);
-  y2 = vsl.MulAdd(input, cephes_log_p6, cephes_log_p7);
-  y = vsl.MulAdd(y, input, cephes_log_p2);
-  y1 = vsl.MulAdd(y1, input, cephes_log_p5);
-  y2 = vsl.MulAdd(y2, input, cephes_log_p8);
+  y = vsl.MulAdd(tmp0, cephes_log_p0, cephes_log_p1);
+  y1 = vsl.MulAdd(tmp0, cephes_log_p3, cephes_log_p4);
+  y2 = vsl.MulAdd(tmp0, cephes_log_p6, cephes_log_p7);
+  y = vsl.MulAdd(y, tmp0, cephes_log_p2);
+  y1 = vsl.MulAdd(y1, tmp0, cephes_log_p5);
+  y2 = vsl.MulAdd(y2, tmp0, cephes_log_p8);
   y = vsl.MulAdd(y, x3, y1);
   y = vsl.MulAdd(y, x3, y2);
   y = vsl.Mul(y, x3);
 
   y1 = vsl.Mul(cephes_log_q1, e);
-  tmp = vsl.Mul(half, x2);
+  llvm::Value* tmp2 = vsl.Mul(half, x2);
   y = vsl.Add(y, y1);
-  input = vsl.Sub(input, tmp);
+  tmp0 = vsl.Sub(tmp0, tmp2);
   y2 = vsl.Mul(cephes_log_q2, e);
-  input = vsl.Add(input, y);
-  input = vsl.Add(input, y2);
+  tmp0 = vsl.Add(tmp0, y);
+  tmp0 = vsl.Add(tmp0, y2);
 
-  // Negative arg will be NAN, 0 will be -INF.
-  llvm::Value* or_lhs =
-      vsl.FloatAndNot(iszero_mask, vsl.FloatOr(input, invalid_mask));
-  llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf);
-  llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs);
+  // Contains +/-inf where +/-inf is the correct answer, otherwise 0.
+  llvm::Value* result_inf = vsl.FloatOr(vsl.FloatAnd(is_zero_mask, minus_inf),
+                                        vsl.FloatAnd(is_pos_inf_mask, pos_inf));
 
-  b.CreateRet(result);
+  // Contains a finite result or nan.  This is the correct answer only if both
+  // result_minus_inf and result_pos_inf are both 0.
+  //
+  // (This implementation works because 0xffffffff is a nan.)
+  llvm::Value* result_finite_or_nan = vsl.FloatOr(tmp0, invalid_mask);
 
-  DCHECK(!llvm::verifyFunction(*vector_log_function));
-  return vector_log_function;
+  // Combine the above into a final result.
+  return vsl.FloatOr(result_inf,
+                     vsl.FloatAndNot(vsl.FloatOr(is_zero_mask, is_pos_inf_mask),
+                                     result_finite_or_nan));
 }
 }  // namespace
 
 void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
-  auto* tanh_v4f32 =
-      EmitVectorF32TanhIfNeeded(module, kTanhV4F32SymbolName,
-                                /*vector_width=*/4, enable_fast_math);
-  auto* tanh_v8f32 =
-      EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName,
-                                /*vector_width=*/8, enable_fast_math);
-
-  auto* exp_v4f32 =
-      EmitVectorF32ExpIfNeeded(module, kExpV4F32SymbolName,
-                               /*vector_width=*/4, enable_fast_math);
-  auto* exp_v8f32 =
-      EmitVectorF32ExpIfNeeded(module, kExpV8F32SymbolName,
-                               /*vector_width=*/8, enable_fast_math);
-
-  auto* log_v4f32 =
-      EmitVectorF32LogIfNeeded(module, kLogV4F32SymbolName,
-                               /*vector_width=*/4, enable_fast_math);
-  auto* log_v8f32 =
-      EmitVectorF32LogIfNeeded(module, kLogV8F32SymbolName,
-                               /*vector_width=*/8, enable_fast_math);
-
-  // Gather all the call sites, force inline them and then delete the vector
-  // function bodies.
-  //
-  // TODO(b/73081976): Should we avoid inlining these intrinsics in some cases?
-
-  std::vector<llvm::CallInst*> calls_to_inline;
-  for (auto* function :
-       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
-    if (function != nullptr) {
-      for (auto* user : function->users()) {
-        calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
-      }
-    }
-  }
-
-  for (auto* call_to_inline : calls_to_inline) {
-    llvm::InlineFunctionInfo inline_function_info;
-    CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
-  }
-
-  for (auto* function :
-       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
-    if (function != nullptr) {
-      function->eraseFromParent();
-    }
-  }
+  // Curry some params to RewriteCalls.
+  auto rewrite_calls =
+      std::bind(RewriteCalls, module, std::placeholders::_1,
+                std::placeholders::_2, std::placeholders::_3, enable_fast_math);
+
+  rewrite_calls("tanhf", GenerateVF32Tanh, /*vector_width=*/1);
+  rewrite_calls("llvm.tanh.f32", GenerateVF32Tanh, /*vector_width=*/1);
+  rewrite_calls(kTanhV4F32SymbolName, GenerateVF32Tanh, /*vector_width=*/4);
+  rewrite_calls(kTanhV8F32SymbolName, GenerateVF32Tanh, /*vector_width=*/8);
+
+  rewrite_calls("expf", GenerateVF32Exp, /*vector_width=*/1);
+  rewrite_calls("llvm.exp.f32", GenerateVF32Exp, /*vector_width=*/1);
+  rewrite_calls(kExpV4F32SymbolName, GenerateVF32Exp, /*vector_width=*/4);
+  rewrite_calls(kExpV8F32SymbolName, GenerateVF32Exp, /*vector_width=*/8);
+
+  rewrite_calls("logf", GenerateVF32Log, /*vector_width=*/1);
+  rewrite_calls("llvm.log.f32", GenerateVF32Log, /*vector_width=*/1);
+  rewrite_calls(kLogV4F32SymbolName, GenerateVF32Log, /*vector_width=*/4);
+  rewrite_calls(kLogV8F32SymbolName, GenerateVF32Log, /*vector_width=*/8);
 }
 
 }  // namespace runtime
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index a6f4273a5a70aab0bc88383283d2a55b1ecb1681..ffbd0d68ce9b5677d96761a5b10caed8335ef56a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -39,7 +39,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
 
   llvm_ir::ForLoopNest loop_nest(loop_name, b_);
   const int64 num_dims = shape_.dimensions_size();
-  llvm_ir::IrArray::Index array_index(index_type, num_dims);
+  std::vector<llvm::Value*> array_multi_index(num_dims);
 
   // Add loops from outer-most to inner-most dimensions.
   for (int i = LayoutUtil::MinorToMajor(shape_).size() - 1; i >= 0; --i) {
@@ -54,14 +54,14 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
       std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
           /*suffix=*/absl::StrFormat("dim.%d", dimension), start_index,
           end_index);
-      array_index[dimension] = loop->GetIndVarValue();
+      array_multi_index[dimension] = loop->GetIndVarValue();
     } else {
       // Emit static loop bounds for this dimension.
       std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
           /*start_index=*/0,
           /*end_index=*/shape_.dimensions(dimension),
           /*suffix=*/absl::StrFormat("dim.%d", dimension));
-      array_index[dimension] = loop->GetIndVarValue();
+      array_multi_index[dimension] = loop->GetIndVarValue();
     }
   }
   // Point IR builder at inner loop BB.
@@ -71,6 +71,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK(exit_bb_ != nullptr);
 
+  llvm_ir::IrArray::Index array_index(array_multi_index, shape_, index_type);
   return {array_index};
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
index 2d9492eacfea34bec3b0f1115e171a5328b7cdc3..6f72ddadf94d4c5b9add2ee66e0f4ac9a8ae9099 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
@@ -69,8 +69,13 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_ParallelForkJoin(
   CHECK_EQ(params, nullptr);
   CHECK_GT(num_partitions, 1);
   CHECK_GT(num_partitioned_dims, 0);
+  CHECK_NE(function_ptr, nullptr);
+  CHECK_NE(partitions, nullptr);
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  CHECK_NE(run_options, nullptr);
+  CHECK_NE(run_options->intra_op_thread_pool(), nullptr);
+
   ComputeFunctionType function =
       reinterpret_cast<ComputeFunctionType>(function_ptr);
   // Compute partition stride in 'partitions' array.
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index 722aa3120ef4d8c957873ac58c361f19632dde1f..70a6d0af02c0c2db7208db561cf29e35a74707b2 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -15,12 +15,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstring>
-#include <limits>
 #include <memory>
+#include <numeric>
 #include <string>
-#include <utility>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/platform/dynamic_annotations.h"
@@ -28,80 +26,15 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace {
-using tensorflow::int16;
 using tensorflow::int32;
 using tensorflow::int64;
-using tensorflow::int8;
-using tensorflow::uint16;
-using tensorflow::uint32;
-using tensorflow::uint64;
-using tensorflow::uint8;
-
-template <typename KeyType>
-void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements);
-}
-
-// We would like a total order of floating point numbers so that the
-// sort has a predictable behavior in the presence of NaNs. Rather
-// than using floating point comparison, we use the following trick:
-// If f is a float, and
-// x = bit_cast<int32>(f);
-// y = x < 0 ? 0x7FFFFFFF - x : x;
-// then y is ordered as an int32 such that finite values have the
-// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-// the beginning and end of the ordering.
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-CastType Convert(KeyType value) {
-  CastType casted_value;
-  memcpy(&casted_value, &value, sizeof(CastType));
-  if (casted_value < 0) {
-    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
-           casted_value;
-  }
-  return casted_value;
-}
-
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-bool LessThan(KeyType lhs, KeyType rhs) {
-  return Convert<CastType, UnsignedCastType>(lhs) <
-         Convert<CastType, UnsignedCastType>(rhs);
-}
-
-template <>
-void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<double, int64>& lhs,
-                      const std::pair<double, int64>& rhs) -> bool {
-                     return LessThan<int64, uint64>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<float, int64>& lhs,
-                      const std::pair<float, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
-                  int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<Eigen::half, int64>& lhs,
-                      const std::pair<Eigen::half, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(
-                         Eigen::half_impl::half_to_float(lhs.first),
-                         Eigen::half_impl::half_to_float(rhs.first));
-                   });
-}
+}  // namespace
 
-template <typename KeyType>
-void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
-                      int32 values_count,
-                      int32* values_primitive_type_size_in_bytes) {
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
+    int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*)) {
   // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT
   // code, so msan can't tell they are initialized.
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*));
@@ -121,8 +54,9 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
   int64 num_iteration_elements = a * c;
   int64 sort_dimension_offset = c;
 
-  std::unique_ptr<std::pair<KeyType, int64>[]> row_to_sort(
-      new std::pair<KeyType, int64>[sort_dimension_elements]);
+  std::unique_ptr<int64[]> indices(new int64[sort_dimension_elements]);
+  std::unique_ptr<char*[]> comparison_values(new char*[2 * values_count]);
+  std::iota(indices.get(), indices.get() + sort_dimension_elements, 0);
   std::unique_ptr<std::string[]> reordered_values(
       new std::string[sort_dimension_elements]);
   for (int64 index = 0; index < num_iteration_elements; ++index) {
@@ -135,24 +69,33 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     int64 base_offset =
         index % sort_dimension_offset +
         (index - index % sort_dimension_offset) * sort_dimension_elements;
-    // TODO(b/26783907): We could define a custom iterator class that references
-    // all arrays. Then we could avoid the intermediate copy. However this
-    // would become more complicated, and it is not clear if the benefit is high
-    // enough.
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      row_to_sort[i] =
-          std::make_pair(keys[base_offset + i * sort_dimension_offset], i);
-    }
-    KeyValueSort(row_to_sort.get(), sort_dimension_elements);
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      keys[base_offset + i * sort_dimension_offset] = row_to_sort[i].first;
+    auto compare_function = [&](int64 a, int64 b) -> bool {
+      int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      for (int32 i = 0; i < values_count; ++i) {
+        comparison_values[i * 2] = values[i] + memory_index_lhs;
+        comparison_values[i * 2 + 1] = values[i] + memory_index_rhs;
+      }
+      char result = 0;  // Overwritten by less_than.
+      less_than(&result, run_options, comparison_values.get(), nullptr,
+                prof_counters);
+      return result != 0u;
+    };
+    if (is_stable) {
+      std::stable_sort(indices.get(), indices.get() + sort_dimension_elements,
+                       compare_function);
+    } else {
+      std::sort(indices.get(), indices.get() + sort_dimension_elements,
+                compare_function);
     }
 
-    // Reorder the values according to the order defined by the keys.
+    // Reorder the values according to the order defined by 'indices'.
     for (int32 idx = 0; idx < values_count; ++idx) {
       for (int64 i = 0; i < sort_dimension_elements; ++i) {
         int64 memory_index =
-            (base_offset + row_to_sort[i].second * sort_dimension_offset) *
+            (base_offset + indices[i] * sort_dimension_offset) *
             values_primitive_type_size_in_bytes[idx];
 
         reordered_values[i] =
@@ -168,88 +111,3 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     }
   }
 }
-}  // namespace
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS8(
-    int8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU8(
-    uint8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS16(
-    int16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU16(
-    uint16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, int64 a, int64 b, int64 c, char** values,
-    int32 values_count, int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS32(
-    int32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU32(
-    uint32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS64(
-    int64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU64(
-    uint64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
index 7821099386969e855ea1737cf53ef49c15c6e93b..50c2911c3bd392b6df12717c34d250ce86ad26e0 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
@@ -21,76 +21,26 @@ limitations under the License.
 
 extern "C" {
 
-// 'keys' represents a 3-dimensional shape with dimensions [a, b, c]. The 'b'
-// dimension of 'keys' is sorted into ascending order. If 'values_count' is <=
-// 0, 'values' and 'values_primitive_type_size_in_bytes' can be nullptr.
-// If 'values_count' > 0, they contain exactly 'values_count' many elements.
-// Each element of 'values' also represents a 3-dimensional shape with
-// dimensions [a, b, c], and the size of the primitive type of the i-th shape
-// has exactly 'values_primitive_type_size_in_bytes[i]' bytes. The elements in
-// each 'values' shape are reordered in such a way that if the element at index
-// 'i' in 'keys' was moved to index 'j', the element at index 'i' in a 'values'
-// shape is also moved to index 'j' (which means that the same elements
-// correspond to each other as before).
-extern void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
+// Each entry in 'values' represents a 3-dimensional shape with dimensions
+// [a, b, c]. The 'b' dimension of each shape is sorted into ascending order
+// according to the results of comparisons using the provided 'less_than'
+// function. 'values_count' must be > 0 and specifies the number of entries in
+// 'values' and 'values_primitive_type_size_in_bytes'. The size of the primitive
+// type of the i-th shape has exactly 'values_primitive_type_size_in_bytes[i]'
+// bytes. 'is_stable' specifies whether the sorting should be stable.
+// 'run_options' and 'prof_counters' are passed through to the less-than
+// function, which expects the following arguments:
+// - pointer to the return value buffer (char*)
+// - xla::ExecutableRunOptions = 'run_options' (char*)
+// - pointers to the parameter buffers (char**)
+// - pointers to the buffer tables = nullptr for thread local functions (char**)
+// - profile counters = 'prof_counters' (int64*)
+extern void __xla_cpu_runtime_KeyValueSort(
+    tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
     char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS8(
-    tensorflow::int8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU8(
-    tensorflow::uint8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS16(
-    tensorflow::int16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU16(
-    tensorflow::uint16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS32(
-    tensorflow::int32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU32(
-    tensorflow::uint32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS64(
-    tensorflow::int64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU64(
-    tensorflow::uint64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
+    tensorflow::int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, tensorflow::int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*));
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_KEY_VALUE_SORT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 12ab6360c560d31fb70c416b8519006ea8675d41..f5d3d840e23cfe26f269e7569ce5e6f7a10f7db6 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -91,14 +91,14 @@ SimpleOrcJIT::InferTargetMachineForJIT(
   return target_machine;
 }
 
-SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
-                           llvm::CodeGenOpt::Level opt_level,
-                           bool optimize_for_size, bool enable_fast_math,
-                           bool disable_expensive_passes,
-                           LLVMCompiler::ModuleHook pre_optimization_hook,
-                           LLVMCompiler::ModuleHook post_optimization_hook)
+SimpleOrcJIT::SimpleOrcJIT(
+    const llvm::TargetOptions& target_options,
+    llvm::CodeGenOpt::Level opt_level, bool optimize_for_size,
+    bool enable_fast_math, bool disable_expensive_passes,
+    LLVMCompiler::ModuleHook pre_optimization_hook,
+    LLVMCompiler::ModuleHook post_optimization_hook,
+    std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook)
     : target_machine_(InferTargetMachineForJIT(target_options, opt_level)),
-      disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
           execution_session_,
@@ -128,12 +128,13 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
           [this](VModuleKeyT, const llvm::object::ObjectFile& object) {
             this->NotifyObjectFreed(object);
           }),
-      compile_layer_(object_layer_,
-                     CompilerFunctor(target_machine_.get(), &disassembler_,
-                                     opt_level, optimize_for_size,
-                                     enable_fast_math, disable_expensive_passes,
-                                     std::move(pre_optimization_hook),
-                                     std::move(post_optimization_hook))),
+      compile_layer_(
+          object_layer_,
+          CompilerFunctor(target_machine_.get(), opt_level, optimize_for_size,
+                          enable_fast_math, disable_expensive_passes,
+                          std::move(pre_optimization_hook),
+                          std::move(post_optimization_hook),
+                          std::move(post_codegen_hook))),
       gdb_jit_event_listener_(
           llvm::JITEventListener::createGDBRegistrationListener()) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
@@ -240,18 +241,7 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortPRED);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSort);
 
   registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee));
   registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee));
@@ -348,6 +338,11 @@ bool RegisterKnownJITSymbols() {
                      reinterpret_cast<void*>(memset_pattern16));
 #endif
 
+#ifdef MEMORY_SANITIZER
+  registry->Register("__msan_unpoison",
+                     reinterpret_cast<void*>(__msan_unpoison));
+#endif
+
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 3307c2f93d796bbdcd49af7f68e9f6c388e402ca..075a018987d70feccc56bc8cc376791ec66ea0c9 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
-#include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
@@ -51,29 +50,20 @@ class SimpleOrcJIT {
   using VModuleKeyT = llvm::orc::VModuleKey;
 
   // Create a new JIT, targeting the host architecture.
-  // The |target_options| parameter allows customization of certain code
-  // generation properties of the TargetMachine (whether or not float point math
-  // can be reassociated, etc.).
-  // The |opt_level| parameter controls the optimization level of the code
-  // generator.
-  // The |optimize_for_size| parameter specifies that the code generator should
-  // optimize to reduce code size, potentially at the cost of performance.
-  // The |disable_expensive_passes| parameter will disable certain optimization
-  // passes
-  // The |pre_optimization_hook| is invoked on the module before any IR
-  // level optimizations are applied.
-  // The |post_optimization_hook| is invoked on the module after all IR
-  // level optimizations are applied.
-  SimpleOrcJIT(const llvm::TargetOptions& target_options,
-               llvm::CodeGenOpt::Level opt_level, bool optimize_for_size,
-               bool enable_fast_math, bool disable_expensive_passes,
-               LLVMCompiler::ModuleHook pre_optimization_hook,
-               LLVMCompiler::ModuleHook post_optimization_hook);
-
-  // Data layout this JIT was created with.
+  //
+  // {pre,post}_optimization_hook is invoked on the module before/after all
+  // LLVM IR-level optimizations.  post_codegen_hook is invoked after
+  // compiling to machine code.
+  SimpleOrcJIT(
+      const llvm::TargetOptions& target_options,
+      llvm::CodeGenOpt::Level opt_level, bool optimize_for_size,
+      bool enable_fast_math, bool disable_expensive_passes,
+      LLVMCompiler::ModuleHook pre_optimization_hook,
+      LLVMCompiler::ModuleHook post_optimization_hook,
+      std::function<void(const llvm::object::ObjectFile&)> post_codegen_hook);
+
   const llvm::DataLayout& data_layout() const { return data_layout_; }
 
-  // Target triple (host) this JIT was created with.
   const llvm::Triple& target_triple() const {
     return target_machine_->getTargetTriple();
   }
@@ -107,7 +97,6 @@ class SimpleOrcJIT {
 
   std::vector<VModuleKeyT> module_keys_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
-  const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
   llvm::orc::ExecutionSession execution_session_;
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index 0b4ac9dc29f88a19d967b7f04ffe42879711b54e..8b7f843582b697058fe328fe69990122d868ada4 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 // Tests that we call into Eigen for dot operations as needed.
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
 #include "absl/strings/str_cat.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index e30f95311fce229f9c559d3bb40142151e8bf3e3..f4da6856940f5f810d2d724c2f0607e780b06bf2 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -56,8 +56,8 @@ class CpuExternalConstantsTest : public CpuCodegenTest {
 
 TEST_F(CpuExternalConstantsTest, Basic) {
   TestWithArray(/*rows=*/1024, /*cols=*/1024, R"(
-CHECK-NOT: @constant_global_0 = external constant [1024 x [1024 x float]], align 16
-CHECK: @0 = private constant [4194304 x i8] {{.*}}, align 16
+CHECK-NOT: @constant_global_0 = external unnamed_addr constant [1024 x [1024 x float]], align 16
+CHECK: @0 = private unnamed_addr constant [4194304 x i8] {{.*}}, align 16
 )");
 }
 
@@ -65,8 +65,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) {
   // The constant array in this test case is small enough that there is no need
   // to externalize it.
   TestWithArray(/*rows=*/4, /*cols=*/4, R"(
-CHECK-NOT: @constant_global_0 = external constant [16 x float], align 8
-CHECK: @0 = private constant [64 x i8] {{.*}}, align 8
+CHECK-NOT: @constant_global_0 = external unnamed_addr constant [16 x float], align 8
+CHECK: @0 = private unnamed_addr constant [64 x i8] {{.*}}, align 8
 )");
 }
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
index 04a81dfd35f459ff1fdb3181dc8fc65c62a37d4f..a72ebe2beea9be59f10e45dc8b296690d47aaa3b 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
@@ -39,6 +39,13 @@ class CpuFusionTest : public HloTestBase {
   CpuFusionTest() {}
 
   ErrorSpec error_spec_{0.0001, 1e-5};
+
+ private:
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
+    debug_options.add_xla_disable_hlo_passes("layout-assignment");
+    return debug_options;
+  }
 };
 
 TEST_F(CpuFusionTest, FuseTwoElementwiseOps) {
@@ -267,12 +274,9 @@ TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) {
   EXPECT_EQ(4, fusion1->fused_instruction_count());
   EXPECT_EQ(4, fusion2->fused_instruction_count());
 
-  // Each fusion instruction should have one parameter and the parameter should
-  // be the constant.
-  EXPECT_EQ(1, fusion1->operand_count());
-  EXPECT_EQ(constant, fusion1->operand(0));
-  EXPECT_EQ(1, fusion2->operand_count());
-  EXPECT_EQ(constant, fusion2->operand(0));
+  // The fusion has no parameters, everything is fused including constants.
+  EXPECT_EQ(0, fusion1->operand_count());
+  EXPECT_EQ(0, fusion2->operand_count());
 }
 
 TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) {
@@ -324,10 +328,9 @@ TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) {
 
   auto fusion_inst = tuple->operand(0);
   // There should be three fused instructions: negate2, exp2, and the fused
-  // parameter.
+  // constant.
   EXPECT_EQ(3, fusion_inst->fused_instruction_count());
-  EXPECT_EQ(1, fusion_inst->operand_count());
-  EXPECT_EQ(constant, fusion_inst->operand(0));
+  EXPECT_EQ(0, fusion_inst->operand_count());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index 3fb0e3cd91fd2088884a0792f882fd7de72f0584..9078b8fd1ff6cb0ddac89d5fcd13a9ccfae07763 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
@@ -59,8 +59,9 @@ class CpuUnaryIntrinsicTest
 
     string features{spec.features.data(), spec.features.size()};
     if (!features.empty()) {
-      std::replace_if(features.begin(), features.end(),
-                      [](char c) { return c != '_' && !isalnum(c); }, '_');
+      std::replace_if(
+          features.begin(), features.end(),
+          [](char c) { return c != '_' && !absl::ascii_isalnum(c); }, '_');
     } else {
       features = "";
     }
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
index 3934c03a04c978009282b3cd0d39bacf9b12a356..e07ac9edc89de85f36dfdbbaa29886bc44b4c4a9 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
@@ -26,10 +26,16 @@ TEST_F(CpuKeyValueSortTest, SortR1) {
   const string hlo_text = R"(
 HloModule KeyValueSort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY main {
   a = f32[10] parameter(0)
 
-  ROOT result = f32[10] sort(f32[10] a), dimensions={0}
+  ROOT result = f32[10] sort(f32[10] a), dimensions={0}, to_apply=compare
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 0584c0484f810a03ccccd522163f54535440ef8b..fc670201125c1c1a9182ddd9667b8d2bb76b7a03 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -56,8 +56,8 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [48 x i8]
-CHECK-NOT: private constant [48 x i8]
+CHECK: private unnamed_addr constant [48 x i8]
+CHECK-NOT: private unnamed_addr constant [48 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -102,10 +102,10 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK-DAG: private constant [4 x i8]
-CHECK-DAG: private constant [8 x i8]
-CHECK-NOT: private constant [4 x i8]
-CHECK-NOT: private constant [8 x i8]
+CHECK-DAG: private unnamed_addr constant [4 x i8]
+CHECK-DAG: private unnamed_addr constant [8 x i8]
+CHECK-NOT: private unnamed_addr constant [4 x i8]
+CHECK-NOT: private unnamed_addr constant [8 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index a7702c2aeeaff8a46a2c4f2785ccb873ea2c08e5..030bd41c2fc73eac41fe43c1acdf862d5dc97f98 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -75,8 +75,9 @@ TEST_F(CpuNoAliasTest, Concat) {
   // the buffers in the HLO module.  We'll inspect these loads to ensure that
   // they have the expected alias information.
   llvm::Module ir_module("test", context);
-  llvm::Function* func = llvm::cast<llvm::Function>(
-      ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context)));
+  llvm::Function* func = llvm::dyn_cast<llvm::Function>(
+      ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context))
+          .getCallee());
   llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func);
   llvm::IRBuilder<> b(bb);
   auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0);
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index aab7f0b393881642437f1891256bd138823a3b87..76727c481bc394b24581e46afdb157ba6041a019 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -38,7 +38,7 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [48 x i8]
+CHECK: private unnamed_addr constant [48 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
index eb6c44b70ab34d0a294880b5de4fe0b3ba5e19e5..7668f364bad050a60f74db4e9054b4f9c6df04d2 100644
--- a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
@@ -938,24 +938,76 @@ void TiledSmallGemmEmitter::EmitTiledGemm(
   });
 }
 
+llvm::Type* GetPointerToElementType(llvm::Type* pointer_type) {
+  llvm::Type* type =
+      llvm::cast<llvm::PointerType>(pointer_type)->getElementType();
+  while (auto* array_type = llvm::dyn_cast<llvm::ArrayType>(type)) {
+    type = array_type->getElementType();
+  }
+
+  return type->getPointerTo();
+}
+
+struct GemvBuffersWithCanonicalType {
+  llvm::Value* lhs_canonicalized;
+  llvm::Value* rhs_canonicalized;
+  llvm::Value* addend_canonicalized;
+  llvm::Value* result_canonicalized;
+};
+
+GemvBuffersWithCanonicalType GetGemvBuffersWithCanonicalType(
+    llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+    llvm::Value* result, llvm::IRBuilder<>* b) {
+  // We characterize a GEMV operation via M and K, since N is implicitly 1.
+  // This means the GEMV that multiplies (say) [5,6] with [6,1] is implemented
+  // by the same GEMV that multiplies [5,6] with [1,6].  However, the
+  // `llvm::Types` for the inputs to the two GEMVs don't match (in a trivial
+  // sense -- the in memory representations are the same) since they're computed
+  // from the `xla::Shape`s.  Since we want to be able to call the same
+  // `llvm::Function` for the two GEMVs we canonicalize the types of the GEMV
+  // inputs here into the same type.
+  GemvBuffersWithCanonicalType buffers_with_canonical_type;
+  llvm::Type* lhs_type = lhs->getType();
+  llvm::Type* rhs_type = rhs->getType();
+  llvm::Type* addend_type = addend ? addend->getType() : nullptr;
+  llvm::Type* result_type = result->getType();
+
+  buffers_with_canonical_type.lhs_canonicalized =
+      b->CreateBitCast(lhs, GetPointerToElementType(lhs_type));
+  buffers_with_canonical_type.rhs_canonicalized =
+      b->CreateBitCast(rhs, GetPointerToElementType(rhs_type));
+  buffers_with_canonical_type.addend_canonicalized =
+      addend ? b->CreateBitCast(addend, GetPointerToElementType(addend_type))
+             : nullptr;
+  buffers_with_canonical_type.result_canonicalized =
+      b->CreateBitCast(result, GetPointerToElementType(result_type));
+
+  return buffers_with_canonical_type;
+}
+
 }  // namespace
 
 void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
                       int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
                       llvm::Value* rhs, llvm::Value* addend,
                       llvm::Value* result, llvm::IRBuilder<>* b,
-                      bool enable_fast_math, bool optimize_for_size) {
+                      const HloModuleConfig& module_config) {
   RowMajorMatrixVectorProductEmitter::Config config(
       /*scalar_type=*/scalar_type,
       /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
       /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
 
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
-      /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
-      rhs, addend, result,
-      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
-          llvm::Value* result) {
+      module_config, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
         RowMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
                                                    result, b);
         emitter.Emit();
@@ -966,18 +1018,23 @@ void EmitColumnMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
                          int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
                          llvm::Value* rhs, llvm::Value* addend,
                          llvm::Value* result, llvm::IRBuilder<>* b,
-                         bool enable_fast_math, bool optimize_for_size) {
+                         const HloModuleConfig& module_config) {
   ColumnMajorMatrixVectorProductEmitter::Config config(
       /*scalar_type=*/scalar_type,
       /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
       /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
 
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
-      /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
-      rhs, addend, result,
-      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
-          llvm::Value* result) {
+      module_config, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
         ColumnMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
                                                       result, b);
         emitter.Emit();
@@ -989,7 +1046,7 @@ void EmitSmallGemm(PrimitiveType scalar_type, int64 m, int64 k, int64 n,
                    int64 min_vectorization_width, int64 tile_size_m,
                    int64 tile_size_k, llvm::Value* lhs, llvm::Value* rhs,
                    llvm::Value* result, llvm::IRBuilder<>* b,
-                   bool enable_fast_math, bool optimize_for_size) {
+                   const HloModuleConfig& module_config) {
   TiledSmallGemmEmitter::Config config(
       /*scalar_type=*/scalar_type,
       TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
@@ -999,9 +1056,7 @@ void EmitSmallGemm(PrimitiveType scalar_type, int64 m, int64 k, int64 n,
       /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
 
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
-      /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
-      rhs, result,
+      module_config, b, config.GetCacheKey(), lhs, rhs, result,
       [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result) {
         TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
                                                  /*rhs=*/rhs,
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
index 0a82326cc3704bce8c122261383249c60eda1f3a..77581a53cfb9bc6330a38f0029486a708d837d4f 100644
--- a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
 
 #include "llvm/IR/IRBuilder.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -29,15 +30,15 @@ void EmitRowMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
                       tensorflow::int64 tile_cols, tensorflow::int64 m,
                       tensorflow::int64 k, llvm::Value* lhs, llvm::Value* rhs,
                       llvm::Value* addend, llvm::Value* result,
-                      llvm::IRBuilder<>* b, bool enable_fast_math,
-                      bool optimize_for_size);
+                      llvm::IRBuilder<>* b,
+                      const HloModuleConfig& module_config);
 
 void EmitColumnMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
                          tensorflow::int64 tile_cols, tensorflow::int64 m,
                          tensorflow::int64 k, llvm::Value* lhs,
                          llvm::Value* rhs, llvm::Value* addend,
                          llvm::Value* result, llvm::IRBuilder<>* b,
-                         bool enable_fast_math, bool optimize_for_size);
+                         const HloModuleConfig& module_config);
 
 void EmitSmallGemm(PrimitiveType scalar_type, tensorflow::int64 m,
                    tensorflow::int64 k, tensorflow::int64 n,
@@ -46,8 +47,7 @@ void EmitSmallGemm(PrimitiveType scalar_type, tensorflow::int64 m,
                    tensorflow::int64 min_vectorization_width,
                    tensorflow::int64 tile_size_m, tensorflow::int64 tile_size_k,
                    llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result,
-                   llvm::IRBuilder<>* b, bool enable_fast_math,
-                   bool optimize_for_size);
+                   llvm::IRBuilder<>* b, const HloModuleConfig& module_config);
 
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 5690d2be2fe3e21c96b51a5226e0b29148217fd1..c444fd7d4aa88fa21b1aa2b2f058bd689b234b15 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -114,6 +114,9 @@ class VectorSupportLibrary {
   // raison d'etre) less cluttered.
 
   llvm::Value* FCmpEQMask(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FCmpEQMask(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return FCmpEQMask(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
   llvm::Value* FCmpULEMask(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* FCmpOLTMask(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* FCmpOLTMask(llvm::Value* lhs, const llvm::APFloat& rhs) {
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
index ed37099a5428075928ec98b134632867d58bbfe7..490e057fcbcae66e90873fd0009fc80af431d901 100644
--- a/tensorflow/compiler/xla/service/despecializer.cc
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/defuser.h"
 #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
-#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
 
 namespace xla {
 
@@ -49,7 +48,6 @@ Despecializer::Despecializer() : pipeline_("despecializer") {
   pipeline_.AddPass<HloDescheduler>();
   pipeline_.AddPass<ControlDepRemover>();
   pipeline_.AddPass<Defuser>();
-  pipeline_.AddPass<ImplicitBroadcastRemover>();
   pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
 }
 
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
index 46dcc3a438cbdf3ff1b3c99fa15b35ee7a4e280e..b6afaa17aa24608189bd29bd0371bc95709a5aaf 100644
--- a/tensorflow/compiler/xla/service/despecializer.h
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -28,8 +28,8 @@ namespace xla {
 // optimized for one specific platform on a different platform (undoing platform
 // specific passes) with matching numerics for comparison.
 //
-// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
-// and BFloat16MixedPrecisionRemoval.
+// Current despecialization passes are HloDescheduler, ControlDepRemover,
+// Defuser and BFloat16MixedPrecisionRemoval.
 class Despecializer : public HloModulePass {
  public:
   Despecializer();
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 2132468b9067ad4d5644d6cf3908a488a20ced05..246f2af09b5539612ef0e75929833f532dfa4083 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -103,11 +103,20 @@ class DfsHloVisitorBase {
   virtual Status HandlePower(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
   }
+  virtual Status HandleSqrt(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
+  virtual Status HandleRsqrt(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
   virtual Status HandleFft(HloInstructionPtr fft) = 0;
+  virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0;
+  virtual Status HandleCholesky(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0;
   virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 680dd256bb15bd3a9eaff7241174c1d2833002c6..79ce3f82e8c1fe91d590ea7c47fa219ce8e8a80f 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -91,6 +91,12 @@ class DfsHloVisitorWithDefaultBase
   Status HandleFft(HloInstructionPtr fft) override {
     return DefaultAction(fft);
   }
+  Status HandleTriangularSolve(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
+  Status HandleCholesky(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
   Status HandleAllReduce(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
@@ -100,6 +106,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleCollectivePermute(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
+  Status HandleReplicaId(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
index 825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062..bd638917ccf4398d478d8b465a2029c1f6e3cc02 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -73,15 +73,14 @@ ENTRY TestComputation {
   abs = f32[] abs(arg)
   add = f32[] add(arg, gte)
   broadcast = f32[42] broadcast(add), dimensions={}
-  slice = f32[0] slice(broadcast), slice={[1:2]}
+  slice = f32[1] slice(broadcast), slice={[1:2]}
   copy = f32[] copy(arg)
-  eq = pred[] equal-to(arg, gte)
+  eq = pred[] compare(arg, gte), direction=EQ
   neg = f32[] negate(arg)
   ROOT convert = f64[] convert(f32[] arg)
 })";
   std::unique_ptr<HloModule> module =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
-          .ConsumeValueOrDie();
+      ParseAndReturnVerifiedModule(hlo_string).ConsumeValueOrDie();
   ElementwiseTestVisitor visitor;
   TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
 }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index 855424067d26d4968270e5f24b11f5a053b70a55..559b9c1f2c9f341293ca89adc61e3312fd9f313c 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -297,7 +297,12 @@ StatusOr<bool> DotDecomposer::Run(HloModule* module) {
       const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
       // A dot it not canonical if there are more than one contracting
       // dimension.
-      if (dnums.lhs_contracting_dimensions_size() > 1) {
+      if (dnums.lhs_contracting_dimensions_size() != 1) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty() &&
+          dnums.lhs_contracting_dimensions().empty()) {
         non_canonical_dots.push_back(instruction);
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06d045650297a1efa52742e3a06066376235de5e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dump.cc
@@ -0,0 +1,407 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dump.h"
+#include "absl/strings/ascii.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace xla {
+
+namespace {
+
+using absl::StrCat;
+using absl::StrFormat;
+using absl::string_view;
+
+struct CanonicalDebugOptions {
+  explicit CanonicalDebugOptions(const DebugOptions& opts)
+      : dump_to(opts.xla_dump_to()),
+        dump_as_text(opts.xla_dump_hlo_as_text()),
+        dump_as_proto(opts.xla_dump_hlo_as_proto()),
+        dump_as_dot(opts.xla_dump_hlo_as_dot()),
+        dump_as_html(opts.xla_dump_hlo_as_html()),
+        dump_as_url(opts.xla_dump_hlo_as_url()),
+        dump_snapshots(opts.xla_dump_hlo_snapshots()) {
+    // This constructor examines the values in `opts` and turns on other flags
+    // based on what we think is the user's intent.  To reduce confusion about
+    // what was a user-specified value versus an extrapolated value, within this
+    // function we treat this struct's members as write-only, and read only from
+    // `opts`.
+
+    // If dump_to is empty, default to dumping to stdout.
+    if (opts.xla_dump_to().empty()) {
+      dump_to = "-";
+    }
+
+    // Did the user specifiy an explicit format for dumping?
+    bool output_format_specified =
+        opts.xla_dump_hlo_as_text() || opts.xla_dump_hlo_as_proto() ||
+        opts.xla_dump_hlo_as_dot() || opts.xla_dump_hlo_as_html() ||
+        opts.xla_dump_hlo_as_url() || opts.xla_dump_hlo_snapshots();
+
+    // If we haven't specified an output format, default to dumping as text.
+    if (!output_format_specified) {
+      dump_as_text = true;
+    }
+
+    // If we specified a regular expression restricting which modules to dump,
+    // respect that.
+    //
+    // If we didn't specify which modules to dump but we passed some other flag
+    // which implies dumping modules, dump all modules.
+    //
+    // Otherwise, don't dump any HLO modules.
+    if (!opts.xla_dump_hlo_module_re().empty()) {
+      // RE2 object is not copyable, and we can't capture "by move", so we
+      // resort to this hack.
+      string pattern = opts.xla_dump_hlo_module_re();
+      should_dump_module = [pattern](string_view module_name) {
+        return RE2::PartialMatch(string(module_name), pattern);
+      };
+    } else if (!opts.xla_dump_hlo_pass_re().empty() ||
+               !opts.xla_dump_to().empty() || output_format_specified) {
+      should_dump_module = [](string_view) { return true; };
+    } else {
+      should_dump_module = [](string_view) { return false; };
+    }
+
+    // Initialize should_dump_pass.  This one is easy: We only dump per-pass
+    // data if the user asked for it explicitly.
+    if (!opts.xla_dump_hlo_pass_re().empty()) {
+      string pattern = opts.xla_dump_hlo_pass_re();
+      should_dump_pass = [pattern](string_view pass_name) {
+        return RE2::PartialMatch(string(pass_name), pattern);
+      };
+    } else {
+      should_dump_pass = [](string_view) { return false; };
+    }
+
+    // Output dirs "sponge" and "test_undeclared_outputs_dir" (case-insensitive)
+    // have a special meaning: Dump into the directory specified by the
+    // environment variable TEST_UNDECLARED_OUTPUTS_DIR.
+    string dump_to_lower = absl::AsciiStrToLower(opts.xla_dump_to());
+    if (dump_to_lower == "sponge" ||
+        dump_to_lower == "test_undeclared_outputs_dir") {
+      const char* dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR");
+      if (dir != nullptr) {
+        dump_to = dir;
+      } else {
+        LOG(ERROR) << "--xla_dump_to=" << opts.xla_dump_to()
+                   << ", but environment variable TEST_UNDECLARED_OUTPUTS_DIR "
+                      "is not set, so cannot dump anywhere.";
+        should_dump_module = [](string_view) { return false; };
+        should_dump_pass = [](string_view) { return false; };
+      }
+    }
+  }
+
+  bool dumping_to_stdout() const { return dump_to == "-"; }
+
+  string dump_to;
+  std::function<bool(string_view module_name)> should_dump_module;
+  std::function<bool(string_view pass_name)> should_dump_pass;
+
+  // dump_ir isn't present here because this file is mostly concerned with
+  // dumping HLO.
+  bool dump_as_text;
+  bool dump_as_proto;
+  bool dump_as_dot;
+  bool dump_as_html;
+  bool dump_as_url;
+  bool dump_snapshots;
+};
+
+string FilenameFor(const HloModule& module, string_view suffix) {
+  return StrFormat("module_%04d.%s", module.unique_id(), suffix);
+}
+
+void DumpToFileInDirImpl(string_view filename, string_view contents,
+                         const CanonicalDebugOptions& opts) {
+  if (opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+
+  const string& dir = opts.dump_to;
+  VLOG(1) << "Dumping " << filename << " to " << dir;
+
+  tensorflow::Env* env = tensorflow::Env::Default();
+  // Two threads can race to observe the absence of the dump directory and
+  // simultaneously try to create it, causing the "losing" thread to get a
+  // "directory already exists" error.  We can work around this by checking
+  // again whether the dir exists.
+  if (!env->IsDirectory(dir).ok()) {
+    auto status = env->RecursivelyCreateDir(dir);
+    if (!status.ok() && !env->IsDirectory(dir).ok()) {
+      LOG(ERROR) << "Could not create directory " << dir
+                 << " for dumping XLA debug data: " << status;
+      return;
+    }
+  }
+
+  string file_path =
+      tensorflow::io::JoinPath(dir, SanitizeFileName(string(filename)));
+  auto status = tensorflow::WriteStringToFile(env, file_path, contents);
+  if (!status.ok()) {
+    LOG(ERROR) << "Could not write XLA debug data to " << file_path << ": "
+               << status;
+  }
+}
+
+void DumpToFileInDirOrStdoutImpl(string_view filename, string_view contents,
+                                 const CanonicalDebugOptions& opts) {
+  // Dump to stdout if that's called for.
+  if (opts.dumping_to_stdout()) {
+    std::cout << "*** Begin " << filename << " ***\n"
+              << contents << "\n*** End " << filename << " ***" << std::endl;
+    return;
+  }
+
+  // Otherwise, dump to a file.
+  DumpToFileInDirImpl(filename, contents, opts);
+}
+
+void DumpHloModuleImpl(const HloModule& module,
+                       const BufferAssignment* buffer_assn,
+                       const HloExecutionProfile* profile, string_view suffix,
+                       const CanonicalDebugOptions& opts) {
+  string filename = FilenameFor(module, suffix);
+
+  if (opts.dump_as_text) {
+    DumpToFileInDirOrStdoutImpl(StrCat(filename, ".txt"), module.ToString(),
+                                opts);
+  }
+
+  if (opts.dump_as_proto) {
+    HloProto module_proto =
+        buffer_assn ? MakeHloProto(module, *buffer_assn) : MakeHloProto(module);
+    string pb;
+    if (!tensorflow::SerializeToStringDeterministic(module_proto, &pb)) {
+      pb = "Failed to serialize HLO module proto.";
+    }
+    DumpToFileInDirImpl(StrCat(filename, ".hlo.pb"), pb, opts);
+  }
+
+  auto render_graph = [&](RenderedGraphFormat format) {
+    StatusOr<string> rendered_graph = RenderGraph(
+        *module.entry_computation(),
+        /*label=*/filename, module.config().debug_options(), format, profile);
+    if (rendered_graph.ok()) {
+      return std::move(rendered_graph).ValueOrDie();
+    }
+    return StrFormat("Error rendering graph: %s",
+                     rendered_graph.status().ToString());
+  };
+
+  if (opts.dump_as_dot) {
+    DumpToFileInDirImpl(StrFormat("%s.dot", filename),
+                        render_graph(RenderedGraphFormat::kDot), opts);
+  }
+
+  if (opts.dump_as_html) {
+    DumpToFileInDirImpl(StrFormat("%s.html", filename),
+                        render_graph(RenderedGraphFormat::kHtml), opts);
+  }
+
+  // Special case for rendering graphs as URLs.  We'll dump them to a file
+  // because why not, but we always log them to stdout as well.
+  if (opts.dump_as_url) {
+    string url = render_graph(RenderedGraphFormat::kUrl);
+    std::cout << filename << " --> " << url << std::endl;
+    if (!opts.dumping_to_stdout()) {
+      DumpToFileInDirImpl(StrFormat("%s.url", filename), url, opts);
+    }
+  }
+}
+
+static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+
+// Maps a module's unique ID to a counter indicating how many times we've dumped
+// this module during the compilation pipeline.  This lets us keep the filenames
+// ordered nicely.
+//
+// Entries added here leak forever; we have no way to GC them when a module
+// dies.  But we only add an entry if dumping is enabled for this module, and
+// dumping a module leaks buffer space in stdout or bytes on disk *way* faster
+// than this hashtable leaks memory.
+static auto& module_id_to_step_number GUARDED_BY(mu) =
+    *new absl::flat_hash_map<int64, int64>();
+
+}  // namespace
+
+void DumpToFileInDir(const HloModule& module, string_view suffix,
+                     string_view contents) {
+  DumpToFileInDirImpl(FilenameFor(module, suffix), contents,
+                      CanonicalDebugOptions(module.config().debug_options()));
+}
+
+void DumpToFileInDirOrStdout(const HloModule& module, string_view suffix,
+                             string_view contents) {
+  DumpToFileInDirOrStdoutImpl(
+      FilenameFor(module, suffix), contents,
+      CanonicalDebugOptions(module.config().debug_options()));
+}
+
+void DumpHloModuleIfEnabled(const HloModule& module, string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                      name, opts);
+  }
+}
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const BufferAssignment& buffer_assn,
+                            string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, &buffer_assn, /*profile=*/nullptr, name, opts);
+  }
+}
+
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const HloExecutionProfile& profile,
+                            string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, &profile, name, opts);
+  }
+}
+
+bool DumpingEnabledForHloModule(string_view hlo_module_name,
+                                const DebugOptions& opts) {
+  return CanonicalDebugOptions(opts).should_dump_module(hlo_module_name);
+}
+
+bool DumpingToStdout(const DebugOptions& opts) {
+  return CanonicalDebugOptions(opts).dumping_to_stdout();
+}
+
+void DumpHloModuleBetweenPassesIfEnabled(string_view pipeline_name,
+                                         string_view before_pass_name,
+                                         string_view after_pass_name,
+                                         const HloModule& module) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name())) {
+    return;
+  }
+
+  if (!opts.should_dump_pass(before_pass_name) &&
+      !opts.should_dump_pass(after_pass_name)) {
+    return;
+  }
+
+  int64 step_number;
+  {
+    tensorflow::mutex_lock lock(mu);
+    step_number = module_id_to_step_number[module.unique_id()]++;
+  }
+
+  string filename_suffix =
+      StrFormat("%04d.%s.after_%s.before_%s", step_number, pipeline_name,
+                after_pass_name, before_pass_name);
+  DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                    filename_suffix, opts);
+}
+
+void DumpHloModuleDuringPassIfEnabled(string_view pass_name,
+                                      string_view step_name,
+                                      const HloModule& module) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name()) ||
+      !opts.should_dump_pass(pass_name)) {
+    return;
+  }
+
+  int64 step_number;
+  {
+    tensorflow::mutex_lock lock(mu);
+    step_number = module_id_to_step_number[module.unique_id()]++;
+  }
+
+  string filename_suffix =
+      StrFormat("%04d.%s.%s", step_number, pass_name, step_name);
+  DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                    filename_suffix, opts);
+}
+
+void DumpHloSnapshotIfEnabled(const HloModule& module,
+                              const HloSnapshot& snapshot) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name()) || !opts.dump_snapshots) {
+    return;
+  }
+  int64 execution_count;
+  {
+    static auto& module_id_to_execution_count GUARDED_BY(mu) =
+        *new absl::flat_hash_map<int64, int64>();
+    tensorflow::mutex_lock lock(mu);
+    execution_count = module_id_to_execution_count[module.unique_id()]++;
+  }
+  string filename =
+      StrCat(FilenameFor(module, StrFormat("execution_%04d", execution_count)),
+             ".hlo_snapshot.pb");
+  if (opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+  string pb;
+  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
+    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
+  }
+  DumpToFileInDirImpl(filename, pb, opts);
+}
+
+void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
+                              const DebugOptions& opts) {
+  CanonicalDebugOptions canonical_opts(opts);
+  string name = snapshot.hlo().hlo_module().name();
+  if (!canonical_opts.should_dump_module(name) ||
+      !canonical_opts.dump_snapshots) {
+    return;
+  }
+
+  // We don't have a unique id for an HloSnapshot, so in this overload we just
+  // have to use its name.
+  int64 execution_count;
+  {
+    static auto& module_name_to_execution_count GUARDED_BY(mu) =
+        *new absl::flat_hash_map<string, int64>();
+    tensorflow::mutex_lock lock(mu);
+    execution_count = module_name_to_execution_count[name]++;
+  }
+  string filename = StrFormat("module_%s.execution_%04d.hlo_snapshot.pb", name,
+                              execution_count);
+  if (canonical_opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+  string pb;
+  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
+    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
+  }
+  DumpToFileInDirImpl(filename, pb, canonical_opts);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dump.h b/tensorflow/compiler/xla/service/dump.h
new file mode 100644
index 0000000000000000000000000000000000000000..6edc9b28ddeb73a453bca4b96f1e34e3538c1e50
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dump.h
@@ -0,0 +1,109 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
+
+// Consolidated utilities for logging information during compilation, usually
+// based on the options specified in the DebugOptions proto.
+//
+// Most functions here take an HloModule and read the DebugOptions from the
+// module's config.
+
+namespace xla {
+
+class BufferAssignment;
+class HloExecutionProfile;
+class HloSnapshot;
+
+// Writes the given string to a file in the xla_dump_to directory specified by
+// module's DebugOptions.
+//
+// If module doesn't have an xla_dump_to directory, does nothing.
+void DumpToFileInDir(const HloModule& module, absl::string_view file_suffix,
+                     absl::string_view contents);
+
+// Like DumpToFileInDir, except if module doesn't have an xla_dump_to directory
+// specified, or if that directory is equal to "-", writes to stdout instead.
+void DumpToFileInDirOrStdout(const HloModule& module,
+                             absl::string_view file_suffix,
+                             absl::string_view contents);
+
+// Dumps the given HLO module if dumping is enabled for the module.  Exactly
+// where and in what formats it's dumped is determined by the module's config.
+//
+// If you pass an HloExecutionProfile, note that currently only DOT-based output
+// formats (i.e. --xla_dump_as_{dot,html,url}) are able to incorporate it into
+// their output.  Other formats will just ignore the profile.
+void DumpHloModuleIfEnabled(const HloModule& module, absl::string_view name);
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const BufferAssignment& buffer_assn,
+                            absl::string_view name);
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const HloExecutionProfile& profile,
+                            absl::string_view name);
+
+// Dumps the given HLO module after running one HLO pass and before running
+// another, if that's enabled.
+void DumpHloModuleBetweenPassesIfEnabled(absl::string_view pipeline_name,
+                                         absl::string_view before_pass_name,
+                                         absl::string_view after_pass_name,
+                                         const HloModule& module);
+
+// Dumps the given HLO module during the given HLO pass, if that's enabled.
+//
+// "step" is a human-readable description of where we are in the middle of this
+// pass.  For example, "before-assigning-layouts".
+void DumpHloModuleDuringPassIfEnabled(absl::string_view pass_name,
+                                      absl::string_view step,
+                                      const HloModule& module);
+
+// Dumps the given HloSnapshot to the module's xla_dump_dir, if this is enabled.
+//
+// Prefer the first overload below, as this will give filenames that are
+// consistent with the other methods here.  The second overload (which doesn't
+// take an HloModule) is useful in the cases when you're dumping an HloSnapshot
+// and simply don't have an HloModule.
+void DumpHloSnapshotIfEnabled(const HloModule& module,
+                              const HloSnapshot& snapshot);
+void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
+                              const DebugOptions& opts);
+
+// Returns true if we should dump data for an HloModule.  This is useful if you
+// want to check if DumpToFileInDir{,OrStdout} will do anything before
+// generating an expensive string.
+bool DumpingEnabledForHloModule(absl::string_view hlo_module_name,
+                                const DebugOptions& opts);
+inline bool DumpingEnabledForHloModule(const HloModule& module) {
+  return DumpingEnabledForHloModule(module.name(),
+                                    module.config().debug_options());
+}
+
+// Returns true if DumpToFileInDirOrStdout and DumpHloModuleIfEnabled will write
+// to stdout, rather than to a file on disk.
+//
+// This is useful if you want to do something different when writing to stdout.
+// For example, maybe you have (almost-)duplicate data that you wouldn't mind
+// writing to two files, but you don't want to print twice.
+bool DumpingToStdout(const DebugOptions& opts);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index 2b158d7a6ec510ce4cbc56bddc5cca71ac4f14f4..5f7d8a761252c03d43acb4869da6b1f59c823357 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -18,19 +18,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 
 namespace xla {
 
-namespace {
-bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
-  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
-         window_dimension.padding_low() == 0 &&
-         window_dimension.padding_high() == 0 &&
-         window_dimension.window_dilation() == 1 &&
-         window_dimension.base_dilation() == 1;
-}
-}  // namespace
-
 class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
  public:
   explicit DynamicDimensionInferenceVisitor(
@@ -53,6 +45,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleDot(HloInstruction* hlo) override;
 
+  Status HandleTuple(HloInstruction* hlo) override;
+
   Status HandleTranspose(HloInstruction* hlo) override;
 
   Status HandleReshape(HloInstruction* hlo) override;
@@ -77,6 +71,10 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleElementwiseBinary(HloInstruction* hlo) override;
 
+  Status HandleWhile(HloInstruction* hlo) override;
+
+  Status HandleSlice(HloInstruction* hlo) override;
+
  private:
   using OperandDynamicDimensionFn = std::function<Status(
       HloInstruction* operand, ShapeIndex index, int64 dimension,
@@ -122,12 +120,22 @@ Status DynamicDimensionInferenceVisitor::HandleGetTupleElement(
       });
 }
 
+Status DynamicDimensionInferenceVisitor::HandleTuple(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        index.push_front(operand_index);
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
 Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) {
   return ForEachOperandDynamicDimension(
       hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
                int64 operand_index, HloInstruction* dynamic_size) {
         int64 broadcast_dim = hlo->dimensions(dimension);
-        parent_->SetDynamicSize(hlo, index, broadcast_dim, dynamic_size);
+        parent_->SetDynamicSize(hlo, {}, broadcast_dim, dynamic_size);
         return Status::OK();
       });
 }
@@ -349,7 +357,7 @@ Status DynamicDimensionInferenceVisitor::HandleReduceWindow(
         const WindowDimension& window_dimension =
             reduce_window->window().dimensions(dimension);
 
-        if (!IsTrivialWindowDimension(window_dimension)) {
+        if (!window_util::IsTrivialWindowDimension(window_dimension)) {
           return Unimplemented(
               "Dynamic Spatial reduce window is not supported: %s",
               reduce_window->ToString());
@@ -370,7 +378,7 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
         const WindowDimension& window_dimension =
             select_and_scatter->window().dimensions(dimension);
 
-        if (!IsTrivialWindowDimension(window_dimension)) {
+        if (!window_util::IsTrivialWindowDimension(window_dimension)) {
           return Unimplemented(
               "Dynamic Spatial select and scatter is not supported: %s",
               select_and_scatter->ToString());
@@ -383,6 +391,140 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
       });
 }
 
+Status DynamicDimensionInferenceVisitor::HandleSlice(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex /*index*/, int64 dimension,
+               int64 /*operand_index*/, HloInstruction* dynamic_size) {
+        if (hlo->slice_starts(dimension) != 0 ||
+            hlo->slice_strides(dimension) != 1 ||
+            hlo->slice_limits(dimension) !=
+                operand->shape().dimensions(dimension)) {
+          return Unimplemented(
+              "Dynamic dimension propagation on Slice where it doesn't slice "
+              "out an entire dimension is not supported %s",
+              hlo->ToString());
+        }
+
+        parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) {
+  // While loop is handled by passing dynamic size hlos as parameters into the
+  // hlo while loop. This is done by replacing the original while with a new
+  // one.
+  //
+  // Before:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op1_x = ... // dynamic dimension size of op1
+  // while = while(op1, op2)
+  //
+  //
+  // After:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op1_x = ... // dynamic dimension size of op1
+  // while = while(op1, op2, op1_x)
+  //
+  // In the above graph, op_x is the bound of the dynamic dimension size of op1
+  // and is wired into the while loop as new parameter.
+  //
+  // TODO(b/119843103): Once we implement dynamic bounds in XLA backend, dynamic
+  // bound can be propagated through native xla values instead of relying on
+  // additional parameter.
+
+  // dynamic_size_to_operand_id_index_map keeps track of dynamic size operations
+  // to their operand ids in the new while loop.
+  absl::flat_hash_map<HloInstruction*, int64>
+      dynamic_size_to_operand_id_index_map;
+
+  // operands_to_add collects dynamic sizes that need to be added to the while
+  // loop as parameters. Note that a dynamic size is ignored if it is already
+  // part of the parameter. i.e.:
+  //
+  // We don't do:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op_x = ... // dynamic dimension size of both op1 and op2
+  // while = while(op1, op2, op_x, op_x) // 4 parameters
+  //
+  // But we do:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op_x = ... // dynamic dimension size of both op1 and op2
+  // while = while(op1, op2, op_x)
+  //
+  // An alternative is to do this in a while loop CSE pass.
+  //
+  std::vector<HloInstruction*> operands_to_add;
+  int64 operand_count = hlo->shape().tuple_shapes_size();
+  TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex, int64, int64,
+               HloInstruction* dynamic_size) {
+        const HloInstruction* tuple_operand = hlo->operand(0);
+        for (int64 i = 0; i < tuple_operand->operand_count(); ++i) {
+          if (dynamic_size == tuple_operand->operand(i)) {
+            dynamic_size_to_operand_id_index_map[dynamic_size] = i;
+            return Status::OK();
+          }
+        }
+        auto iter = dynamic_size_to_operand_id_index_map.find(dynamic_size);
+        if (iter == dynamic_size_to_operand_id_index_map.end()) {
+          operands_to_add.push_back(dynamic_size);
+          dynamic_size_to_operand_id_index_map[dynamic_size] = operand_count++;
+        }
+        return Status::OK();
+      }));
+
+  if (!operands_to_add.empty()) {
+    // Only replace the while loop if there are new parameters to add.
+    HloInstruction* old_tuple_operand = hlo->mutable_operand(0);
+    TF_ASSIGN_OR_RETURN(
+        WhileUtil::MakeInstructionsLiveInResult result,
+        WhileUtil::MakeInstructionsLiveIn(hlo, operands_to_add));
+    // WhileUtil creates a new while hlo and tuple. Update the dynamic size
+    // mapping for the newly created tuple.
+    HloInstruction* new_tuple_operand =
+        result.new_while_instr->mutable_operand(0);
+    parent_->CopyMapping(/*from=*/old_tuple_operand, /*to=*/new_tuple_operand);
+    hlo = result.new_while_instr;
+  }
+
+  // We have replaced the while loop, now set the dynamic dimensions for the
+  // newly created while loop so that the hlos that consumes the while loop can
+  // see the dynamic dimensions. Also sets the dynamic parameter binding for
+  // running inference in the while loop.
+  DynamicParameterBinding binding_for_while;
+  TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        DynamicParameterBinding::DynamicParameter dynamic_parameter{
+            operand_index,
+            {dynamic_size_to_operand_id_index_map[dynamic_size]}};
+        DynamicParameterBinding::DynamicDimension dynamic_dimension{
+            operand_index, index, dimension};
+        TF_RETURN_IF_ERROR(
+            binding_for_while.Bind(dynamic_parameter, dynamic_dimension));
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      }));
+
+  // Run inference in while body and condition.
+  TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run(
+      hlo->while_body(), binding_for_while, parent_));
+  TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run(
+      hlo->while_condition(), binding_for_while, parent_));
+
+  return Status::OK();
+}
+
 Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) {
   return param_bindings_.ForEachBinding(
       [&](const DynamicParameterBinding::DynamicParameter& dynamic_parameter,
@@ -430,6 +572,20 @@ Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
   return Status::OK();
 }
 
+void DynamicDimensionInference::CopyMapping(HloInstruction* from,
+                                            HloInstruction* to) {
+  auto iter = per_hlo_dynamic_dimensions_.find(from);
+  if (iter != per_hlo_dynamic_dimensions_.end()) {
+    for (auto& dynamic_dimension : iter->second) {
+      HloInstruction* dynamic_size =
+          GetDynamicSize(dynamic_dimension.inst, dynamic_dimension.index,
+                         dynamic_dimension.dim);
+      SetDynamicSize(to, dynamic_dimension.index, dynamic_dimension.dim,
+                     dynamic_size);
+    }
+  }
+}
+
 /* static */
 StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
     HloModule* module) {
@@ -439,6 +595,20 @@ StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
   return inference;
 }
 
+string DynamicDimensionInference::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("DynamicDimensionInference: ");
+  for (const auto& mapping : dynamic_mapping_) {
+    const DynamicDimension& dynamic_dimension = mapping.first;
+    pieces.push_back(absl::StrFormat(
+        " -- instruction %s at %s has dim %lld as dynamic"
+        " dimension, which is represented by instruction %s",
+        dynamic_dimension.inst->ToString(), dynamic_dimension.index.ToString(),
+        dynamic_dimension.dim, mapping.second->ToString()));
+  }
+  return absl::StrJoin(pieces, "\n");
+}
+
 DynamicDimensionInference::DynamicDimensionInference(HloModule* module)
     : module_(module) {}
 
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
index 164d15bf111a92e3da957f609b54ee0662ef18b1..d0f2998328f3028ccbd5b33690a514371a03b5a1 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -88,6 +88,11 @@ class DynamicDimensionInference {
     iter.first->second.emplace(DynamicDimension{inst, index, dim});
   }
 
+  // Copies the internal mapping from instruction `from` to instruction `to`.
+  // This is useful when an instruction is replaced by the other during the
+  // inferencing process.
+  void CopyMapping(HloInstruction* from, HloInstruction* to);
+
   // AnalyzeDynamicDimensions starts the analysis of the dynamic dimensions in
   // module_.
   Status AnalyzeDynamicDimensions();
@@ -101,6 +106,8 @@ class DynamicDimensionInference {
   using DynamicMapping = absl::flat_hash_map<DynamicDimension, HloInstruction*>;
   DynamicMapping dynamic_mapping_;
 
+  // A convenient mapping from an hlo to the set of dynamic dimensions that it
+  // holds.
   using PerHloDynamicDimensions =
       absl::flat_hash_map<HloInstruction*,
                           absl::flat_hash_set<DynamicDimension>>;
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
index b42e67b4bbcf731d89dd8af9e46b405235a92d8a..36456e5649ad53b3996d41d8659feaf6205b48b3 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -43,7 +43,6 @@ class DynamicDimensionInferenceTest : public HloTestBase {
   }
 
   Status RunInference() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
     TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
                         DynamicDimensionInference::Run(module_.get()));
 
@@ -68,8 +67,8 @@ class DynamicDimensionInferenceTest : public HloTestBase {
         0, ShapeUtil::MakeShape(F32, {}), "lhs"));
     auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
         1, ShapeUtil::MakeShape(F32, {}), "rhs"));
-    embedded_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGe, lhs, rhs));
+    embedded_builder.AddInstruction(HloInstruction::CreateCompare(
+        ShapeUtil::MakeShape(PRED, {}), lhs, rhs, ComparisonDirection::kGe));
     return module_->AddEmbeddedComputation(embedded_builder.Build());
   }
 
@@ -88,6 +87,8 @@ TEST_F(DynamicDimensionInferenceTest, ParamTest) {
       HloInstruction::CreateParameter(1, scalar_shape_, "param"));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   // Set up dynamic parameter binding.
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
       DynamicParameterBinding::DynamicParameter{1, {}},
@@ -112,6 +113,7 @@ TEST_F(DynamicDimensionInferenceTest, ParamTestTuple) {
       DynamicParameterBinding::DynamicParameter{0, {1}},
       DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
               op::GetTupleElement(param, 1));
@@ -137,6 +139,7 @@ TEST_F(DynamicDimensionInferenceTest, GetTupleElement) {
       DynamicParameterBinding::DynamicParameter{0, {1}},
       DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
               op::GetTupleElement(param, 1));
@@ -167,6 +170,7 @@ TEST_F(DynamicDimensionInferenceTest, ElementwiseTest) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(negate, {}, 1), size_param);
 }
@@ -197,6 +201,7 @@ TEST_F(DynamicDimensionInferenceTest, ReduceTestI) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), size_param);
 }
@@ -228,6 +233,7 @@ TEST_F(DynamicDimensionInferenceTest, ReduceTestII) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 2}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 1), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
@@ -271,6 +277,7 @@ TEST_F(DynamicDimensionInferenceTest, DotTest) {
       DynamicParameterBinding::DynamicParameter{2, {}},
       DynamicParameterBinding::DynamicDimension{1, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr);
@@ -319,6 +326,7 @@ TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) {
       DynamicParameterBinding::DynamicParameter{2, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 1), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 0), nullptr);
@@ -356,6 +364,7 @@ TEST_F(DynamicDimensionInferenceTest, TransposeTest) {
       DynamicParameterBinding::DynamicParameter{3, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 2}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 0), size_param_3);
   EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 1), size_param_2);
@@ -386,6 +395,7 @@ TEST_F(DynamicDimensionInferenceTest, ReshapeTest) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 3}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 0), nullptr);
   EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 1), size_param);
@@ -415,6 +425,7 @@ TEST_F(DynamicDimensionInferenceTest, ReshapeTestUnimplemented) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   Status status = RunInference();
   EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
 }
@@ -439,12 +450,103 @@ TEST_F(DynamicDimensionInferenceTest, BroadcastTest) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 0), nullptr);
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 1), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 2), nullptr);
 }
 
+TEST_F(DynamicDimensionInferenceTest, WhileTest) {
+  // Test the ability to trace into while loops.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  auto tuple_shape = ShapeUtil::MakeTupleShape({input_shape, input_shape});
+
+  // Body:
+  //
+  //   Param
+  //   |  |
+  // GTE1 GTE2
+  //   |  |
+  //    ADD
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto gte_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, body_param, 0));
+  auto gte_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, body_param, 1));
+  auto add = body_builder.AddInstruction(
+      HloInstruction::CreateBinary(input_shape, HloOpcode::kAdd, gte_0, gte_1));
+  body_builder.AddInstruction(HloInstruction::CreateTuple({add, add}));
+
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_->AddEmbeddedComputation(cond_builder.Build());
+
+  // Entry:
+  //
+  //  Param
+  //   |
+  //  While
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, tuple_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+  builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 0}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  // Test that dynamic dimension inference does the right thing. A lambda is
+  // used here since we want to test twice by running inference again
+  // (idempotency).
+  auto test_dynamic_dimension = [&]() {
+    HloInstruction* while_hlo = nullptr;
+    // The while hlo has been replaced, find the new one.
+    for (HloInstruction* inst : module_->entry_computation()->instructions()) {
+      if (inst->opcode() == HloOpcode::kWhile) {
+        while_hlo = inst;
+      }
+    }
+    ASSERT_NE(while_hlo, nullptr);
+    // The original while shape has 2 parameters. With dynamic size passed in
+    // as an extra parameter, the tuple should have 3 elements.
+    EXPECT_EQ(while_hlo->shape().tuple_shapes_size(), 3);
+    HloInstruction* add = nullptr;
+    for (HloInstruction* inst : while_hlo->while_body()->instructions()) {
+      if (inst->opcode() == HloOpcode::kAdd) {
+        add = inst;
+      }
+    }
+    EXPECT_NE(add, nullptr);
+    EXPECT_NE(inference_->GetDynamicSize(add, {}, 0), nullptr);
+    EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {0}, 0), size_param);
+    EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {1}, 0), size_param);
+  };
+
+  TF_ASSERT_OK(RunInference());
+  test_dynamic_dimension();
+  TF_ASSERT_OK(RunInference());
+  test_dynamic_dimension();
+}
+
 TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
   // Test the ability to trace reduce window batch dimensions.
   auto builder = HloComputation::Builder(TestName());
@@ -490,6 +592,7 @@ TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
 }
@@ -543,9 +646,32 @@ TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{2, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(sns, {}, 0), size_param);
 }
 
+TEST_F(DynamicDimensionInferenceTest, SliceTest) {
+  auto builder = HloComputation::Builder(TestName());
+
+  auto data_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {5, 7}), "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto* slice = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {5, 7}), data_param, /*start_indices=*/{0, 0},
+      /*limit_indices=*/{5, 7}, /*strides=*/{1, 1}));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(slice, {}, 1), size_param);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc
index 4db280f817141bd52e3a5b9564600a618f81aeac..9e1efa44299609803f168832360eaa2f3e579fe5 100644
--- a/tensorflow/compiler/xla/service/dynamic_padder.cc
+++ b/tensorflow/compiler/xla/service/dynamic_padder.cc
@@ -67,12 +67,18 @@ StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
     case HloOpcode::kPad: {
       return inst->mutable_operand(1);
     }
+
+    case HloOpcode::kSelectAndScatter: {
+      return inst->mutable_operand(2);
+    }
     case HloOpcode::kParameter:
     case HloOpcode::kGetDimensionSize:
     case HloOpcode::kReshape:
     case HloOpcode::kTuple:
     case HloOpcode::kAllReduce:
     case HloOpcode::kBroadcast:
+    case HloOpcode::kTranspose:
+    case HloOpcode::kSlice:
       return nullptr;
     default:
       return UnimplementedStrCat("Unimplimented padding for instruction: ",
@@ -80,6 +86,22 @@ StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
   }
 }
 
+bool ShouldSkipPadOnOperand(const HloInstruction* inst, int64 operand_num,
+                            int64 dimension) {
+  if ((inst->opcode() == HloOpcode::kReduceWindow ||
+       inst->opcode() == HloOpcode::kSelectAndScatter) &&
+      operand_num == 0 && inst->window().dimensions(dimension).size() == 1) {
+    return true;
+  }
+
+  if (operand_num == 0 && inst->opcode() == HloOpcode::kConvolution &&
+      inst->convolution_dimension_numbers().input_batch_dimension() ==
+          dimension) {
+    return true;
+  }
+  return false;
+}
+
 }  // namespace
 
 StatusOr<bool> DynamicPadder::Run(HloModule* module) {
@@ -105,6 +127,11 @@ StatusOr<bool> DynamicPadder::Run(HloModule* module) {
           }
           VLOG(1) << "Has dynamic dimension of operand" << operand_num << " @"
                   << dim;
+
+          if (ShouldSkipPadOnOperand(inst, operand_num, dim)) {
+            continue;
+          }
+
           TF_ASSIGN_OR_RETURN(HloInstruction * identity_value,
                               ChooseIdentityValue(inst));
           if (identity_value == nullptr) {
@@ -133,9 +160,10 @@ StatusOr<bool> DynamicPadder::Run(HloModule* module) {
           HloInstruction* broadcasted_effective_size =
               computation->AddInstruction(HloInstruction::CreateBroadcast(
                   mask_shape, dynamic_size, {}));
-          HloInstruction* pred = computation->AddInstruction(
-              HloInstruction::CreateBinary(pred_shape, HloOpcode::kLt, iota,
-                                           broadcasted_effective_size));
+          HloInstruction* pred =
+              computation->AddInstruction(HloInstruction::CreateCompare(
+                  pred_shape, iota, broadcasted_effective_size,
+                  ComparisonDirection::kLt));
 
           HloInstruction* broadcasted_identity_value =
               computation->AddInstruction(HloInstruction::CreateBroadcast(
diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
index 55a11286e4596d87c330315322cae704fc5cd707..2963deaa317e1b04346046bde35a6025abe0924e 100644
--- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
@@ -22,8 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -41,10 +43,7 @@ class DynamicPadderTest : public HloTestBase {
   DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); }
 
   StatusOr<bool> RunPadder() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before padder");
-
     DynamicPadder padder;
-
     return padder.Run(module_.get());
   }
 
@@ -133,19 +132,84 @@ TEST_F(DynamicPadderTest, ConvolutionTest) {
 
   module_->AddEntryComputation(builder.Build());
 
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(conv->operand(0));
+}
+
+TEST_F(DynamicPadderTest, ConvolutionNoPad) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
   // Set up dynamic parameter binding for non-contracting dimension.
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
       DynamicParameterBinding::DynamicParameter{2, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
 
-  // Set up binding for contracting dimensions.
+  TF_ASSERT_OK(RunPadder().status());
+
+  EXPECT_THAT(conv->operand(0), op::Parameter());
+}
+
+TEST_F(DynamicPadderTest, ReduceWindowNoPadForTrivialWindow) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {4, 5});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {3, 5});
+
+  auto input = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "input"));
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  TF_ASSERT_OK_AND_ASSIGN(Window window, ParseWindow("size=2x1 pad=0_0x0_0"));
+  auto output = builder.AddInstruction(HloInstruction::CreateReduceWindow(
+      reduce_shape, input, init, window, GetScalarAddComputation()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
-      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
   TF_ASSERT_OK(RunPadder().status());
 
-  ExpectPadded(conv->operand(0));
+  EXPECT_THAT(output->operand(0), op::Parameter());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
index 5549cccfa86f9445ae0aa68748fde2b131ee5a5e..7f0ae692f7414dbdcccda8b287c9059bcf920df1 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -29,7 +29,8 @@ Status DynamicParameterBinding::Bind(
 }
 
 absl::optional<DynamicParameterBinding::DynamicParameter>
-DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+DynamicParameterBinding::GetBinding(
+    const DynamicDimension& dynamic_dimension) const {
   auto param_iter = bindings_.find(dynamic_dimension);
   if (param_iter == bindings_.end()) {
     return absl::nullopt;
@@ -111,7 +112,8 @@ Status DynamicParameterBinding::Verify(const HloModule& module) const {
   return ForEachBinding([&](const DynamicParameter& dynamic_parameter,
                             const DynamicDimension& dynamic_dimension)
                             -> Status {
-    TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(dynamic_parameter.parameter_num >= 0 &&
+                 dynamic_parameter.parameter_num < entry->num_parameters());
     TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters());
     TF_RET_CHECK(ShapeUtil::IndexIsValid(
         entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(),
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
index dd474d8eed1b2c30ddb8f624a864198c74eacaba..57af2c43d3c65f7340e6a9f04e5abbf052ebceea 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -89,7 +89,7 @@ class DynamicParameterBinding {
   //
   // Returns nullopt if the binding is not set.
   absl::optional<DynamicParameter> GetBinding(
-      const DynamicDimension& dynamic_dimension);
+      const DynamicDimension& dynamic_dimension) const;
 
   using BindingFn =
       std::function<Status(const DynamicParameter& dynamic_parameter,
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
index 83a6d83dffde7995bd8e43917d13c5fd2705ba6f..b5d57cda4f469a384dc0affdae9e5f93a70ac418 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -33,7 +33,15 @@ limitations under the License.
 
 namespace xla {
 namespace {
-class DynamicParameterBindingTest : public HloTestBase {};
+class DynamicParameterBindingTest : public HloTestBase {
+ protected:
+  // Serialize and then deserialize a binding.
+  void SerializeAndDeserialize(DynamicParameterBinding* binding) {
+    DynamicParameterBindingProto proto = binding->ToProto();
+    TF_ASSERT_OK_AND_ASSIGN(*binding,
+                            DynamicParameterBinding::CreateFromProto(proto));
+  }
+};
 
 TEST_F(DynamicParameterBindingTest, SimpleBinding) {
   // 'b' is a dynamic shape; 'a' represents the real size of b's first
@@ -56,15 +64,20 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
                    DynamicParameterBinding::DynamicDimension{1, {}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
-                                                    /*parameter_index=*/{},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                      /*parameter_index=*/{},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBinding) {
@@ -89,16 +102,21 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
@@ -127,26 +145,35 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-
-  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param2);
-  EXPECT_EQ(param2->parameter_num, 0);
-  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
-
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+    absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param2);
+    EXPECT_EQ(param2->parameter_num, 0);
+    EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+
+  test(binding);
+
+  SerializeAndDeserialize(&binding);
+
+  // Test the binding again after deserialization.
+  test(binding);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 727e0bfa52d45b6f8c67d7d04613e4865f18a53c..53513fa52268dc00de75f644ac3ed77648238337 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -49,7 +49,6 @@ limitations under the License.
 namespace xla {
 
 using absl::StrCat;
-using llvm_ir::AsStringRef;
 using llvm_ir::IrArray;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
@@ -208,10 +207,8 @@ llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) {
-  if (op->opcode() == HloOpcode::kCopy) {
-    return operand_value;
-  } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
-             op->operand(0)->shape().element_type() == PRED) {
+  if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+      op->operand(0)->shape().element_type() == PRED) {
     return EmitIntegerUnaryOp(op, operand_value);
   } else if (ShapeUtil::ElementIsComplex(op->operand(0)->shape())) {
     return EmitComplexUnaryOp(op, operand_value);
@@ -423,6 +420,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
       return EmitSin(op->shape().element_type(), operand_value);
     case HloOpcode::kTanh:
       return EmitTanh(op->shape().element_type(), operand_value);
+    case HloOpcode::kSqrt:
+      return EmitSqrt(op->shape().element_type(), operand_value);
+    case HloOpcode::kRsqrt:
+      return EmitRsqrt(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor,
                                           {operand_value},
@@ -436,18 +437,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                                           {operand_value},
                                           {operand_value->getType()}, b_);
     case HloOpcode::kRoundNearestAfz:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round,
-                                          {operand_value},
-                                          {operand_value->getType()}, b_);
+      return EmitRoundNearestAfz(op->shape().element_type(), operand_value);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = FCmpOEQ(operand_value, zero);
-      auto olt = FCmpOLT(operand_value, zero);
-      return Select(oeq, zero,
-                    Select(olt, llvm::ConstantFP::get(type, -1.0),
-                           llvm::ConstantFP::get(type, 1.0)));
+      auto ne0_i1 = FCmpONE(operand_value, zero);
+      auto ne0_float = UIToFP(ne0_i1, type);
+      llvm::Value* result = llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::copysign, {ne0_float, operand_value},
+          {operand_value->getType()}, b_);
+      auto is_nan = FCmpUNO(operand_value, operand_value);
+      result = Select(is_nan, operand_value, result);
+      return result;
     }
     case HloOpcode::kIsFinite: {
       // abs(x) o!= inf, this works because the comparison returns false if
@@ -653,6 +654,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           EmitComposeComplex(op, FDiv(EmitExtractReal(operand_value), cplx_abs),
                              FDiv(EmitExtractImag(operand_value), cplx_abs)));
     }
+    case HloOpcode::kSqrt: {
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      auto c = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto d = llvm::ConstantFP::get(b->getType(), 0.0);
+      return EmitComplexPower(op, a, b, c, d);
+    }
+    case HloOpcode::kRsqrt: {
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      auto c = llvm::ConstantFP::get(a->getType(), -0.5);
+      auto d = llvm::ConstantFP::get(b->getType(), 0.0);
+      return EmitComplexPower(op, a, b, c, d);
+    }
     case HloOpcode::kNegate:
       return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)),
                                 FNeg(EmitExtractImag(operand_value)));
@@ -703,25 +718,28 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     // We use ordered comparisons for everything except kNe, where we use an
     // unordered comparison.  This makes x != y equivalent to !(x == y), and
     // matches C++'s semantics.
-    case HloOpcode::kEq:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kNe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kLt:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kGt:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGT, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kLe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLE, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kGe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGE, lhs_value,
-                                     rhs_value, b_);
-
+    case HloOpcode::kCompare: {
+      switch (op->comparison_direction()) {
+        case ComparisonDirection::kEq:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kNe:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kLt:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kGt:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGT, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kLe:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLE, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kGe:
+          return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGE, lhs_value,
+                                         rhs_value, b_);
+      }
+    }
     case HloOpcode::kMaximum:
       return EmitFloatMax(lhs_value, rhs_value);
     case HloOpcode::kMinimum:
@@ -736,6 +754,43 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
   }
 }
 
+// (a+bi)^(c+di) =
+//    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
+//    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexPower(
+    const HloInstruction* op, llvm::Value* a, llvm::Value* b, llvm::Value* c,
+    llvm::Value* d) {
+  PrimitiveType component_type =
+      primitive_util::ComplexComponentType(op->shape().element_type());
+  auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b));
+  auto zero = llvm::ConstantFP::get(a->getType(), 0);
+  auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+  auto one = llvm::ConstantFP::get(a->getType(), 1);
+  auto half_c = FMul(one_half, c);
+
+  TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
+                      EmitPow(component_type, aa_p_bb, half_c));
+
+  auto neg_d = FNeg(d);
+  TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
+  auto neg_d_arg_lhs = FMul(neg_d, arg_lhs);
+  TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
+                      EmitExp(component_type, neg_d_arg_lhs));
+  auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+  TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
+  auto half_d = FMul(one_half, d);
+  auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb));
+  TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
+  TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
+  // d^c is 0 if d is 0 and c > 0. 0^0 is defined to be 1.0, see
+  // Branch Cuts for Complex Elementary Functions or Much Ado About
+  // Nothing's Sign Bit, W. Kahan, Section 10.
+  return Select(
+      And(And(FCmpOEQ(aa_p_bb, zero), FCmpOEQ(d, zero)), FCmpOLE(zero, c)),
+      EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero),
+      EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)));
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) {
   switch (op->opcode()) {
@@ -786,58 +841,34 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     // We use ordered comparisons for everything except kNe, where we use an
     // unordered comparison.  This makes x != y equivalent to !(x == y), and
     // matches C++'s semantics.
-    case HloOpcode::kEq:
-      return And(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
-                                         EmitExtractReal(lhs_value),
-                                         EmitExtractReal(rhs_value), b_),
-                 llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
-                                         EmitExtractImag(lhs_value),
-                                         EmitExtractImag(rhs_value), b_));
-    case HloOpcode::kNe:
-      return Or(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
-                                        EmitExtractReal(lhs_value),
-                                        EmitExtractReal(rhs_value), b_),
-                llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
-                                        EmitExtractImag(lhs_value),
-                                        EmitExtractImag(rhs_value), b_));
-
+    case HloOpcode::kCompare: {
+      switch (op->comparison_direction()) {
+        case ComparisonDirection::kEq:
+          return And(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
+                                             EmitExtractReal(lhs_value),
+                                             EmitExtractReal(rhs_value), b_),
+                     llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
+                                             EmitExtractImag(lhs_value),
+                                             EmitExtractImag(rhs_value), b_));
+        case ComparisonDirection::kNe:
+          return Or(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
+                                            EmitExtractReal(lhs_value),
+                                            EmitExtractReal(rhs_value), b_),
+                    llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
+                                            EmitExtractImag(lhs_value),
+                                            EmitExtractImag(rhs_value), b_));
+        default:
+          return Unimplemented(
+              "complex comparison '%s'",
+              ComparisonDirectionToString(op->comparison_direction()));
+      }
+    }
     case HloOpcode::kPower: {
-      // (a+bi)^(c+di) =
-      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
-      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
-      PrimitiveType component_type =
-          primitive_util::ComplexComponentType(op->shape().element_type());
       auto a = EmitExtractReal(lhs_value);
       auto b = EmitExtractImag(lhs_value);
       auto c = EmitExtractReal(rhs_value);
       auto d = EmitExtractImag(rhs_value);
-      auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b));
-      auto zero = llvm::ConstantFP::get(a->getType(), 0);
-      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
-      auto one = llvm::ConstantFP::get(a->getType(), 1);
-      auto half_c = FMul(one_half, c);
-
-      TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
-                          EmitPow(component_type, aa_p_bb, half_c));
-
-      auto neg_d = FNeg(d);
-      TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
-      auto neg_d_arg_lhs = FMul(neg_d, arg_lhs);
-      TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
-                          EmitExp(component_type, neg_d_arg_lhs));
-      auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
-      TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
-      auto half_d = FMul(one_half, d);
-      auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb));
-      TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
-      TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
-      // 0^c is 0 if d is 0 and c > 0. 0^0 is defined to be 1.0, see
-      // Branch Cuts for Complex Elementary Functions or Much Ado About
-      // Nothing's Sign Bit, W. Kahan, Section 10.
-      return Select(
-          And(And(FCmpOEQ(aa_p_bb, zero), FCmpOEQ(d, zero)), FCmpOLE(zero, c)),
-          EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero),
-          EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)));
+      return EmitComplexPower(op, a, b, c, d);
     }
     default:
       return Unimplemented("binary complex op '%s'",
@@ -855,6 +886,9 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
   return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_);
 }
 
+// TODO(b/123355973): We have an implementation of erfinv in math.cc.  We
+// shouldn't have two implementations, especially since this one isn't testable
+// (it's only observable via a normally-distributed RNG).
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) {
   if (prim_type != F16 && prim_type != F32 && prim_type != F64) {
@@ -1047,6 +1081,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
   return Select(x_is_small, for_small_x, for_large_x);
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitSqrt(PrimitiveType prim_type,
+                                                    llvm::Value* value) {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {value},
+                                      {value->getType()}, b_);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitRsqrt(PrimitiveType prim_type,
+                                                     llvm::Value* value) {
+  TF_ASSIGN_OR_RETURN(auto sqrt, EmitSqrt(prim_type, value));
+  return FDiv(llvm::ConstantFP::get(sqrt->getType(), 1.0), sqrt);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
                                                    llvm::Value* value) {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
@@ -1088,6 +1134,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
   return Select(x_is_small, for_small_x, for_large_x);
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitRoundNearestAfz(
+    PrimitiveType /*prim_type*/, llvm::Value* value) {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round, {value},
+                                      {value->getType()}, b_);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                                    llvm::Value* lhs,
                                                    llvm::Value* rhs) {
@@ -1235,28 +1287,32 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
       return EmitIntegerDivide(lhs_value, rhs_value, is_signed);
     case HloOpcode::kRemainder:
       return EmitIntegerRemainder(lhs_value, rhs_value, is_signed);
-    case HloOpcode::kEq:
-      return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kNe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_NE, lhs_value,
-                                     rhs_value, b_);
-    case HloOpcode::kLt:
-      return llvm_ir::EmitComparison(
-          is_signed ? llvm::CmpInst::ICMP_SLT : llvm::CmpInst::ICMP_ULT,
-          lhs_value, rhs_value, b_);
-    case HloOpcode::kGt:
-      return llvm_ir::EmitComparison(
-          is_signed ? llvm::CmpInst::ICMP_SGT : llvm::CmpInst::ICMP_UGT,
-          lhs_value, rhs_value, b_);
-    case HloOpcode::kLe:
-      return llvm_ir::EmitComparison(
-          is_signed ? llvm::CmpInst::ICMP_SLE : llvm::CmpInst::ICMP_ULE,
-          lhs_value, rhs_value, b_);
-    case HloOpcode::kGe:
-      return llvm_ir::EmitComparison(
-          is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
-          lhs_value, rhs_value, b_);
+    case HloOpcode::kCompare: {
+      switch (op->comparison_direction()) {
+        case ComparisonDirection::kEq:
+          return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kNe:
+          return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_NE, lhs_value,
+                                         rhs_value, b_);
+        case ComparisonDirection::kLt:
+          return llvm_ir::EmitComparison(
+              is_signed ? llvm::CmpInst::ICMP_SLT : llvm::CmpInst::ICMP_ULT,
+              lhs_value, rhs_value, b_);
+        case ComparisonDirection::kGt:
+          return llvm_ir::EmitComparison(
+              is_signed ? llvm::CmpInst::ICMP_SGT : llvm::CmpInst::ICMP_UGT,
+              lhs_value, rhs_value, b_);
+        case ComparisonDirection::kLe:
+          return llvm_ir::EmitComparison(
+              is_signed ? llvm::CmpInst::ICMP_SLE : llvm::CmpInst::ICMP_ULE,
+              lhs_value, rhs_value, b_);
+        case ComparisonDirection::kGe:
+          return llvm_ir::EmitComparison(
+              is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
+              lhs_value, rhs_value, b_);
+      }
+    }
     case HloOpcode::kMinimum:
       return EmitIntegralMin(lhs_value, rhs_value, is_signed);
     case HloOpcode::kMaximum:
@@ -1309,46 +1365,6 @@ llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
                 lhs_value, rhs_value);
 }
 
-llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
-    const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo,
-    int64 operand_no) {
-  CHECK(hlo.IsElementwise())
-      << "HLO " << hlo.ToString() << " is not elementwise.";
-
-  const Shape& operand_shape = hlo.operand(operand_no)->shape();
-  // If the operand is scalar, the source index is always {}.
-  if (ShapeUtil::IsScalar(operand_shape)) {
-    return llvm_ir::IrArray::Index(target_index.GetType());
-  }
-
-  // If no implicit broadcast is needed for this operand, returns the target
-  // index as the source index.
-  //
-  // `IrArray::Index` may contain a physical linear which we can propagate to
-  // our operand only if our layouts match.  "only if" is a bit strong since
-  // e.g. we can still forward the linear index if the operand shape is
-  // [5,1,1,5]{3,2,1,0} and the HLO shape is[5,1,1,5]{3,1,2,0}, but those cases
-  // are probably not worth handling here for now.
-  if (ShapeUtil::CompatibleIgnoringElementType(operand_shape, hlo.shape()) &&
-      LayoutUtil::Equal(operand_shape.layout(), hlo.shape().layout())) {
-    return target_index;
-  }
-
-  // If implicit broadcast is needed, the source dimensions that are broadcast
-  // have index 0.
-  CHECK_EQ(operand_shape.rank(), hlo.shape().rank());
-  llvm_ir::IrArray::Index source_index(target_index.GetType());
-  for (int64 i = 0; i < hlo.shape().rank(); ++i) {
-    if (hlo.shape().dimensions(i) == operand_shape.dimensions(i)) {
-      source_index.push_back(target_index[i]);
-    } else {
-      CHECK_EQ(1, operand_shape.dimensions(i));
-      source_index.push_back(target_index.GetConstantWithIndexType(0));
-    }
-  }
-  return source_index;
-}
-
 StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
     const HloInstruction* hlo,
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
@@ -1362,26 +1378,69 @@ StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
       llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_);
   llvm::Type* raw_value_ty = raw_value->getType();
 
-  // Convert raw integer to float in range [0, 1) if the element is a float.
+  // If we're generating a floating-point value, convert the raw integer R (i.e.
+  // `raw_value`) to a float in the range [0, 1).
+  //
+  // The basic approach is to choose a significand and exponent such that the
+  // significand is uniformly distributed and the exponent is distributed, well,
+  // exponentially (it's more likely to be close to 0 than far from 0).
+  //
+  // An easy way to do this is to say that the significand is the first S bits
+  // of R, and the exponent is determined by the number of trailing zeroes in R,
+  // exp = 2^-(cttz(R) + 1).  (+1 because the largest exponent should be -1;
+  // this way the largest value we can return is 1.999... * 2^-1 = 1-ε.)
+  //
+  // This results in a small bias.  Namely, if R has enough trailing zeroes, the
+  // significand and exponent will "overlap".  As a concrete example, consider
+  //
+  //         20 X's                 12 zeroes
+  //   R = 0bXXXXXXXXXXXXXXXXXXXX000000000000
+  //
+  // Here the exponent is 2^-13 because R has 12 trailing zeroes.  The
+  // significand is made up of the first 23 most-significant bits of R, which we
+  // observe contain 3 zeroes.  This is biased because any random value with
+  // exponent 2^-12 will have a significand which ends in `000`.
+  //
+  // For f32s, this problem occurs only when there are more than 32-23 = 9
+  // trailing zeros, which happens with probability 0.5^10 = ~0.1%. Moreover the
+  // probability of a large bias (i.e. many trailing 0s in the significand) is
+  // exponentially low.  So we deem this acceptable.
   llvm::Value* elem_value = raw_value;
   if (elem_ir_ty->isFloatingPointTy()) {
-    unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits();
-    CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64);
-    // Perform the division using the float type with the same number of bits
-    // as the raw value to avoid overflow.
-    if (raw_value_size_in_bits == 32) {
-      elem_value = UIToFP(elem_value, b_->getFloatTy());
-      elem_value = FDiv(elem_value,
-                        llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32)));
-    } else {
-      elem_value = UIToFP(elem_value, b_->getDoubleTy());
-      elem_value = FDiv(
-          elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64)));
-    }
-
-    if (elem_ir_ty != elem_value->getType()) {
-      elem_value = FPTrunc(elem_value, elem_ir_ty);
-    }
+    const auto& dest_flt_semantics = elem_ir_ty->getFltSemantics();
+    const int bits = raw_value_ty->getPrimitiveSizeInBits();
+    CHECK_GE(bits, llvm::APFloat::semanticsSizeInBits(dest_flt_semantics));
+
+    // Subtract 1 because semanticsPrecision includes the "hidden bit", i.e. the
+    // implicit "1." at the beginning of the significand.
+    const int significand_bits =
+        llvm::APFloat::semanticsPrecision(dest_flt_semantics) - 1;
+
+    llvm::Value* cttz = llvm_ir::EmitCallToIntrinsic(
+        llvm::Intrinsic::cttz, {raw_value, /*is_zero_undef=*/b_->getFalse()},
+        {raw_value->getType()}, b_);
+    llvm::Value* significand = LShr(raw_value, bits - significand_bits);
+
+    // Exponent bias is -127 for f32, meaning that if the exponent is E and the
+    // significand is S, then the value of the number is 2^(E - 127) * (1.S).
+    //
+    // We want cttz == 0 to correspond to 2^-1, so our exponent is computed as
+    // E = 126 - cttz.
+    //
+    // For f64, this is all the same, except the bias is -1023.
+    //
+    // In IEEE floating point, the absolute value of the exponent bias equals
+    // the value of the largest possible exponent.
+    const int bias = -llvm::APFloat::semanticsMaxExponent(dest_flt_semantics);
+    llvm::Value* exponent =
+        Sub(llvm::ConstantInt::get(cttz->getType(), -bias - 1), cttz);
+
+    // Now just slot everything into place!  The `Trunc` is here because
+    // raw_value may be larger than our float destination.
+    elem_value =
+        BitCast(Trunc(Or(Shl(exponent, significand_bits), significand),
+                      b_->getIntNTy(elem_ir_ty->getPrimitiveSizeInBits())),
+                elem_ir_ty);
   }
 
   // Convert the value for the requested distribution.
@@ -1611,14 +1670,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalSelect(
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
     const llvm_ir::IrArray::Index& index) {
   TF_ASSIGN_OR_RETURN(llvm::Value * pred_value,
-                      operand_to_generator.at(hlo->operand(0))(
-                          ElementwiseSourceIndex(index, *hlo, 0)));
+                      operand_to_generator.at(hlo->operand(0))(index));
   TF_ASSIGN_OR_RETURN(llvm::Value * on_true_value,
-                      operand_to_generator.at(hlo->operand(1))(
-                          ElementwiseSourceIndex(index, *hlo, 1)));
+                      operand_to_generator.at(hlo->operand(1))(index));
   TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
-                      operand_to_generator.at(hlo->operand(2))(
-                          ElementwiseSourceIndex(index, *hlo, 2)));
+                      operand_to_generator.at(hlo->operand(2))(index));
   return Select(Trunc(pred_value, b_->getInt1Ty()), on_true_value,
                 on_false_value);
 }
@@ -1628,14 +1684,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalClamp(
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
     const llvm_ir::IrArray::Index& index) {
   TF_ASSIGN_OR_RETURN(llvm::Value * min_value,
-                      operand_to_generator.at(hlo->operand(0))(
-                          ElementwiseSourceIndex(index, *hlo, 0)));
+                      operand_to_generator.at(hlo->operand(0))(index));
   TF_ASSIGN_OR_RETURN(llvm::Value * arg_value,
-                      operand_to_generator.at(hlo->operand(1))(
-                          ElementwiseSourceIndex(index, *hlo, 1)));
+                      operand_to_generator.at(hlo->operand(1))(index));
   TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
-                      operand_to_generator.at(hlo->operand(2))(
-                          ElementwiseSourceIndex(index, *hlo, 2)));
+                      operand_to_generator.at(hlo->operand(2))(index));
   PrimitiveType prim_type = hlo->shape().element_type();
   if (primitive_util::IsFloatingPointType(prim_type)) {
     return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
@@ -1668,8 +1721,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
     exit_block = llvm_ir::CreateBasicBlock(
         /*insert_before=*/nullptr, IrName(hlo, "merge"), b_);
   } else {
-    exit_block = init_block->splitBasicBlock(b_->GetInsertPoint(),
-                                             AsStringRef(IrName(hlo, "merge")));
+    exit_block =
+        init_block->splitBasicBlock(b_->GetInsertPoint(), IrName(hlo, "merge"));
     init_block->getTerminator()->eraseFromParent();
   }
 
@@ -1715,37 +1768,40 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
     llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_);
     source_index_phis[operand_id] =
         PHI(source_index.GetType(), operand_usage_count[operand_id]);
-    auto operand_index = source_index;
-    operand_index[concat_dim] = source_index_phis[operand_id];
+    std::vector<llvm::Value*> operand_multi_index = source_index.multidim();
+    operand_multi_index[concat_dim] = source_index_phis[operand_id];
 
     // Create the terminator of the block before calling operand generators,
     // because they require non-degenerate basic blocks.
     b_->SetInsertPoint(llvm::BranchInst::Create(
         exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id]));
+    llvm_ir::IrArray::Index operand_index(operand_multi_index, operand->shape(),
+                                          source_index.GetType());
     TF_ASSIGN_OR_RETURN(llvm::Value * value,
                         operand_to_generator.at(operand)(operand_index));
     output->addIncoming(value, b_->GetInsertBlock());
     b_->SetInsertPoint(init_block, saved_insert_point);
   }
 
+  std::vector<llvm::Value*> source_multi_index = source_index.multidim();
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
        ++operand_idx) {
     const HloInstruction* operand = hlo->operand(operand_idx);
     auto false_block = llvm_ir::CreateBasicBlock(
         exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
-    auto concat_dim_size =
-        llvm::ConstantInt::get(source_index[concat_dim]->getType(),
-                               operand->shape().dimensions(concat_dim));
+    auto concat_dim_size = source_index.GetConstantWithIndexType(
+        operand->shape().dimensions(concat_dim));
     int64 operand_id = to_unique_operand_id[operand];
-    source_index_phis[operand_id]->addIncoming(source_index[concat_dim],
+    source_index_phis[operand_id]->addIncoming(source_multi_index[concat_dim],
                                                b_->GetInsertBlock());
-    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size),
+    CondBr(ICmpULT(source_multi_index[concat_dim], concat_dim_size),
            emit_operand_blocks[operand_id], false_block);
 
     // Subtract the size of the concat dimension of the current operand
     // from the source index.
     b_->SetInsertPoint(false_block);
-    source_index[concat_dim] = Sub(source_index[concat_dim], concat_dim_size);
+    source_multi_index[concat_dim] =
+        Sub(source_multi_index[concat_dim], concat_dim_size);
   }
 
   Unreachable();
@@ -1762,23 +1818,15 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
   const int64 rank = input_hlo->shape().rank();
   // Use the same index type for all tensor accesses in the same kernel.
   llvm::Type* index_type = index.GetType();
-  llvm_ir::IrArray::Index slice_start_index(index_type, rank);
+  std::vector<llvm::Value*> slice_start_multi_index(rank);
   for (int64 i = 0; i < rank; ++i) {
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    // TODO(b/118437727): Remove the R1 path.
-    llvm::Value* start_index_value;
-    if (hlo->operand(1)->shape().rank() == 1) {
-      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-      TF_ASSIGN_OR_RETURN(start_index_value,
-                          operand_to_generator.at(hlo->operand(1))(dim_index));
-    } else {
-      llvm_ir::IrArray::Index zero_index(index_type);
-      TF_ASSIGN_OR_RETURN(
-          start_index_value,
-          operand_to_generator.at(hlo->operand(1 + i))(zero_index));
-    }
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(1 + i))(zero_index));
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
@@ -1793,17 +1841,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
         EmitIntegralMax(index_typed_const(0), start_index_value, is_signed),
         is_signed);
 
-    start_index_value->setName(
-        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
-    slice_start_index[i] = start_index_value;
+    start_index_value->setName(IrName(hlo, StrCat("start_idx", i)));
+    slice_start_multi_index[i] = start_index_value;
   }
 
-  llvm_ir::IrArray::Index input_index(index_type, rank);
+  std::vector<llvm::Value*> input_multi_index(rank);
   for (int64 i = 0; i < rank; ++i) {
     // Emit IR which computes:
     //   input_index = start_index + offset_index
-    input_index[i] = Add(slice_start_index[i], index[i]);
+    input_multi_index[i] = Add(slice_start_multi_index[i], index[i]);
   }
+  llvm_ir::IrArray::Index input_index(input_multi_index, input_hlo->shape(),
+                                      index_type);
   return operand_to_generator.at(input_hlo)(input_index);
 }
 
@@ -1825,7 +1874,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
   llvm::Type* index_type = index.GetType();
   // This is the index into `operand` that holds the element we want to
   // generate.
-  IrArray::Index operand_index(index_type);
+  std::vector<llvm::Value*> operand_multi_index;
 
   // First copy in the window indices to operand_index. Also collect a mapping
   // from operand dimension to output window dimension. Elided window dimensions
@@ -1834,26 +1883,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
   for (int64 i = 0, e = operand_shape.dimensions_size(), operand_index_dim = 0;
        i < e; i++) {
     if (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
-      operand_index.push_back(index.GetConstantWithIndexType(0));
+      operand_multi_index.push_back(index.GetConstantWithIndexType(0));
     } else {
       int64 output_window_dim = dim_numbers.offset_dims(operand_index_dim++);
       operand_to_output_dim[i] = output_window_dim;
-      operand_index.push_back(index[output_window_dim]);
+      operand_multi_index.push_back(index[output_window_dim]);
     }
   }
 
   // This is the index of the index vector in the start_indices tensor.
-  IrArray::Index gather_index_index(index_type);
+  std::vector<llvm::Value*> gather_index_index_components;
   {
-    std::vector<llvm::Value*> gather_index_index_components;
     for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) {
       if (!absl::c_binary_search(dim_numbers.offset_dims(), i)) {
-        gather_index_index.push_back(index[i]);
+        gather_index_index_components.push_back(index[i]);
       }
     }
 
-    if (gather_index_index.size() != indices_shape.dimensions_size()) {
-      gather_index_index.InsertAt(dim_numbers.index_vector_dim(), nullptr);
+    if (gather_index_index_components.size() !=
+        indices_shape.dimensions_size()) {
+      gather_index_index_components.insert(
+          gather_index_index_components.begin() +
+              dim_numbers.index_vector_dim(),
+          nullptr);
     }
   }
 
@@ -1881,11 +1933,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
                         gather_dim_component_extended, is_signed),
         is_signed);
 
-    operand_index[operand_dim] =
-        Add(operand_index[operand_dim], gather_dim_component_extended_inbound);
+    operand_multi_index[operand_dim] =
+        Add(operand_multi_index[operand_dim],
+            gather_dim_component_extended_inbound);
   };
 
   if (indices_shape.dimensions_size() == dim_numbers.index_vector_dim()) {
+    IrArray::Index gather_index_index(gather_index_index_components,
+                                      indices_shape, index_type);
     TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component,
                         indices_generator(gather_index_index));
     add_to_operand_index(gather_dim_component, 0);
@@ -1893,13 +1948,16 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
     int64 index_vector_size =
         indices_shape.dimensions(dim_numbers.index_vector_dim());
     for (int64 i = 0; i < index_vector_size; i++) {
-      gather_index_index[dim_numbers.index_vector_dim()] =
+      gather_index_index_components[dim_numbers.index_vector_dim()] =
           index.GetConstantWithIndexType(i);
+      IrArray::Index gather_index_index(gather_index_index_components,
+                                        indices_shape, index_type);
       TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component,
                           indices_generator(gather_index_index));
       add_to_operand_index(gather_dim_component, i);
     }
   }
+  IrArray::Index operand_index(operand_multi_index, operand_shape, index_type);
   return operand_generator(operand_index);
 }
 
@@ -1912,8 +1970,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   const HloInstruction* start_hlo = hlo->operand(2);
   // Calculate slice start/end indices.
   const int64 rank = input_hlo->shape().rank();
-  llvm_ir::IrArray::Index slice_start_index(index.GetType(), rank);
-  llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank);
+  std::vector<llvm::Value*> slice_start_multi_index(rank);
+  std::vector<llvm::Value*> slice_limit_multi_index(rank);
   // Slice intersection gathers (ANDs) conditions on all ranks for which
   // 'input' is set to 'update'
   llvm::Value* slice_intersection = b_->getTrue();
@@ -1924,18 +1982,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
       return llvm::ConstantInt::get(index_type, c);
     };
 
-    llvm::Value* start_index_value;
-    // TODO(b/118437727): Remove the R1 path.
-    if (hlo->operand(2)->shape().rank() == 1) {
-      llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-      TF_ASSIGN_OR_RETURN(start_index_value,
-                          operand_to_generator.at(hlo->operand(2))(dim_index));
-    } else {
-      llvm_ir::IrArray::Index zero_index(index_type);
-      TF_ASSIGN_OR_RETURN(
-          start_index_value,
-          operand_to_generator.at(hlo->operand(2 + i))(zero_index));
-    }
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(2 + i))(zero_index));
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
@@ -1952,16 +2002,16 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
         EmitIntegralMax(index_typed_const(0), start_index_value, is_signed),
         is_signed);
 
-    start_index_value->setName(
-        AsStringRef(IrName(hlo, StrCat("start_idx", i))));
-    slice_start_index[i] = start_index_value;
-    slice_limit_index[i] = Add(slice_start_index[i], update_dim_size);
+    start_index_value->setName(IrName(hlo, StrCat("start_idx", i)));
+    slice_start_multi_index[i] = start_index_value;
+    slice_limit_multi_index[i] =
+        Add(slice_start_multi_index[i], update_dim_size);
 
     slice_intersection =
-        And(slice_intersection, ICmpSGE(index[i], slice_start_index[i]),
+        And(slice_intersection, ICmpSGE(index[i], slice_start_multi_index[i]),
             "slice_intersection");
     slice_intersection =
-        And(slice_intersection, ICmpSLT(index[i], slice_limit_index[i]),
+        And(slice_intersection, ICmpSLT(index[i], slice_limit_multi_index[i]),
             "slice_intersection");
   }
 
@@ -1977,10 +2027,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   // Handle true BB (return data from 'update')
   SetToFirstInsertPoint(if_data.true_block, b_);
   // Compute update index for intersection case.
-  llvm_ir::IrArray::Index update_index(index.GetType(), rank);
+  std::vector<llvm::Value*> update_multi_index(rank);
   for (int64 i = 0; i < rank; ++i) {
-    update_index[i] = Sub(index[i], slice_start_index[i]);
+    update_multi_index[i] = Sub(index[i], slice_start_multi_index[i]);
   }
+  llvm_ir::IrArray::Index update_index(update_multi_index, update_hlo->shape(),
+                                       index.GetType());
   TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
                       operand_to_generator.at(update_hlo)(update_index));
   Store(true_value, ret_value_addr);
@@ -1999,27 +2051,28 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
     const HloInstruction* hlo,
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
     const llvm_ir::IrArray::Index& padded_index) {
-  auto index = padded_index;
+  std::vector<llvm::Value*> multi_index = padded_index.multidim();
   llvm::Value* in_bounds = b_->getTrue();
-  for (size_t i = 0; i < index.size(); ++i) {
+  for (size_t i = 0; i < multi_index.size(); ++i) {
     auto index_typed_const = [=](int64 n) {
-      return llvm::ConstantInt::get(index[i]->getType(), n);
+      return padded_index.GetConstantWithIndexType(n);
     };
     const auto& pad_dim = hlo->padding_config().dimensions(i);
-    index[i] = Sub(index[i], index_typed_const(pad_dim.edge_padding_low()));
+    multi_index[i] =
+        Sub(multi_index[i], index_typed_const(pad_dim.edge_padding_low()));
+    in_bounds = And(in_bounds, ICmpSGE(multi_index[i], index_typed_const(0)),
+                    "in_bounds");
     in_bounds =
-        And(in_bounds, ICmpSGE(index[i], index_typed_const(0)), "in_bounds");
-    in_bounds = And(
-        in_bounds,
-        ICmpEQ(
-            index_typed_const(0),
-            URem(index[i], index_typed_const(pad_dim.interior_padding() + 1))),
-        "in_bounds");
-    index[i] =
-        SDiv(index[i], index_typed_const(pad_dim.interior_padding() + 1));
+        And(in_bounds,
+            ICmpEQ(index_typed_const(0),
+                   URem(multi_index[i],
+                        index_typed_const(pad_dim.interior_padding() + 1))),
+            "in_bounds");
+    multi_index[i] =
+        SDiv(multi_index[i], index_typed_const(pad_dim.interior_padding() + 1));
     in_bounds =
         And(in_bounds,
-            ICmpSLT(index[i],
+            ICmpSLT(multi_index[i],
                     index_typed_const(hlo->operand(0)->shape().dimensions(i))),
             "in_bounds");
   }
@@ -2035,6 +2088,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
   llvm_ir::LlvmIfData if_data =
       llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_);
   SetToFirstInsertPoint(if_data.true_block, b_);
+  llvm_ir::IrArray::Index index(multi_index, hlo->operand(0)->shape(),
+                                padded_index.GetType());
   TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
                       operand_to_generator.at(hlo->operand(0))(index));
   Store(operand_value, ret_value_addr);
@@ -2094,17 +2149,27 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
   // Given an output index [a,b,c,d,e] in the result, we compute:
   //   sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T))
 
-  IrArray::Index lhs_index(index_type), rhs_index(index_type);
-
+  std::vector<llvm::Value*> lhs_multi_index, rhs_multi_index;
   for (int64 i = 0; i < lhs_dims - 1; i++) {
-    lhs_index.push_back(dot_result_index[i]);
+    lhs_multi_index.push_back(dot_result_index[i]);
   }
-  lhs_index.InsertAt(lhs_contracting_dim, inner_loop->GetIndVarValue());
-
-  for (int64 i = 0; i < rhs_dims - 1; i++) {
-    rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]);
+  lhs_multi_index.insert(lhs_multi_index.begin() + lhs_contracting_dim,
+                         inner_loop->GetIndVarValue());
+  IrArray::Index lhs_index(lhs_multi_index, hlo->operand(0)->shape(),
+                           index_type);
+
+  int64 num_batch_dims = dim_numbers.rhs_batch_dimensions_size();
+  for (int64 i = 0; i < num_batch_dims; i++) {
+    rhs_multi_index.push_back(
+        dot_result_index[dim_numbers.rhs_batch_dimensions(i)]);
+  }
+  for (int64 i = 0; i < rhs_dims - 1 - num_batch_dims; i++) {
+    rhs_multi_index.push_back(dot_result_index[lhs_dims - 1 + i]);
   }
-  rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue());
+  rhs_multi_index.insert(rhs_multi_index.begin() + rhs_contracting_dim,
+                         inner_loop->GetIndVarValue());
+  IrArray::Index rhs_index(rhs_multi_index, hlo->operand(1)->shape(),
+                           index_type);
 
   llvm::Value* current_accumulator = Load(accumulator_alloca);
   TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index));
@@ -2144,7 +2209,6 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kClz:
     case HloOpcode::kConvert:
     case HloOpcode::kBitcastConvert:
-    case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
@@ -2156,30 +2220,26 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
-                            operand_to_generator.at(hlo->operand(0))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
+                            operand_to_generator.at(hlo->operand(0))(index));
         return EmitUnaryOp(hlo, operand_value);
       };
     case HloOpcode::kAdd:
     case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
+    case HloOpcode::kCompare:
     case HloOpcode::kComplex:
     case HloOpcode::kDivide:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kOr:
     case HloOpcode::kXor:
     case HloOpcode::kPower:
@@ -2193,11 +2253,9 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         const HloInstruction* lhs = hlo->operand(0);
         const HloInstruction* rhs = hlo->operand(1);
         TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value,
-                            operand_to_generator.at(lhs)(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
+                            operand_to_generator.at(lhs)(index));
         TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value,
-                            operand_to_generator.at(rhs)(
-                                ElementwiseSourceIndex(index, *hlo, 1)));
+                            operand_to_generator.at(rhs)(index));
         return EmitBinaryOp(hlo, lhs_value, rhs_value);
       };
     case HloOpcode::kSelect:
@@ -2214,8 +2272,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
-                            operand_to_generator.at(hlo->operand(0))(
-                                ElementwiseSourceIndex(index, *hlo, 0)));
+                            operand_to_generator.at(hlo->operand(0))(index));
         return EmitReducePrecision(hlo, operand_value);
       };
     case HloOpcode::kConcatenate:
@@ -2228,13 +2285,14 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& target_index) -> StatusOr<llvm::Value*> {
         const HloInstruction* operand = hlo->operand(0);
-        auto source_index = target_index;
+        std::vector<llvm::Value*> source_multi_index = target_index.multidim();
         for (int64 dim : hlo->dimensions()) {
-          source_index[dim] =
-              Sub(llvm::ConstantInt::get(target_index[dim]->getType(),
-                                         hlo->shape().dimensions(dim) - 1),
-                  target_index[dim]);
+          source_multi_index[dim] = Sub(target_index.GetConstantWithIndexType(
+                                            hlo->shape().dimensions(dim) - 1),
+                                        target_index[dim]);
         }
+        llvm_ir::IrArray::Index source_index(
+            source_multi_index, operand->shape(), target_index.GetType());
         return operand_to_generator.at(operand)(source_index);
       };
     case HloOpcode::kBroadcast:
@@ -2308,7 +2366,8 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         IrArray::Index sliced_index = index.SourceIndexOfSlice(
-            /*shape=*/hlo->shape(), /*starts=*/hlo->slice_starts(),
+            /*operand_shape=*/hlo->operand(0)->shape(),
+            /*starts=*/hlo->slice_starts(),
             /*strides=*/hlo->slice_strides(), /*builder=*/b_);
         return operand_to_generator.at(hlo->operand(0))(sliced_index);
       };
@@ -2345,6 +2404,17 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         return operand_to_generator.at(operand)(
             index.SourceIndexOfReshape(hlo->shape(), operand->shape(), b_));
       };
+    case HloOpcode::kCopy:
+      return [hlo, &operand_to_generator](
+                 const IrArray::Index& target_index) -> StatusOr<llvm::Value*> {
+        IrArray::Index source_index(target_index.multidim(),
+                                    hlo->operand(0)->shape(),
+                                    target_index.GetType());
+        TF_ASSIGN_OR_RETURN(
+            llvm::Value * operand_value,
+            operand_to_generator.at(hlo->operand(0))(source_index));
+        return operand_value;
+      };
     case HloOpcode::kTranspose:
       return [this, hlo,
               &operand_to_generator](const IrArray::Index& target_index) {
@@ -2366,6 +2436,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                  -> StatusOr<llvm::Value*> {
         return EmitElementalDot(hlo, operand_to_generator, dot_result_index);
       };
+    case HloOpcode::kReplicaId:
+      return [this, hlo](const IrArray::Index&) -> StatusOr<llvm::Value*> {
+        if (hlo_module_config_.replica_count() != 1) {
+          return Unimplemented("Replication is not implemented on CPU/GPU.");
+        }
+        llvm::Type* type = llvm_ir::PrimitiveTypeToIrType(
+            hlo->shape().element_type(), module_);
+        return llvm::ConstantInt::getNullValue(type);
+      };
     default:
       return [hlo](const IrArray::Index& index) {
         return Unimplemented("Unhandled opcode for elemental IR emission: %s",
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index d3e2acaabd4f602171def70ccd3d4fd5adce0d0d..6b1c85b7e5f45037a84b0fc97570dc61bdcd3313 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -119,6 +119,12 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   virtual StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
                                          llvm::Value* value);
 
+  virtual StatusOr<llvm::Value*> EmitSqrt(PrimitiveType prim_type,
+                                          llvm::Value* value);
+
+  virtual StatusOr<llvm::Value*> EmitRsqrt(PrimitiveType prim_type,
+                                           llvm::Value* value);
+
   virtual StatusOr<llvm::Value*> EmitLog1p(PrimitiveType prim_type,
                                            llvm::Value* value);
 
@@ -140,6 +146,9 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   virtual StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
                                           llvm::Value* value);
 
+  virtual StatusOr<llvm::Value*> EmitRoundNearestAfz(PrimitiveType prim_type,
+                                                     llvm::Value* value);
+
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x);
 
@@ -150,15 +159,6 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real,
                                   llvm::Value* imag);
 
-  // A helper method for MakeElementGenerator. Given an elementwise op `hlo` and
-  // the target array index, computes the source array index of its
-  // `operand_no`-th operand.
-  //
-  // Precondition: `hlo` is an elementwise op.
-  llvm_ir::IrArray::Index ElementwiseSourceIndex(
-      const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo,
-      int64 operand_no);
-
   // Identifier of the thread unique among all threads on the device
   virtual llvm::Value* EmitThreadId() { return b_->getIntN(128, 0); }
 
@@ -211,13 +211,21 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   const HloModuleConfig& hlo_module_config_;
 
  private:
+  // Computes the complex power function, returns (a + i*b)^(c + i*d).
+  StatusOr<llvm::Value*> EmitComplexPower(const HloInstruction* op,
+                                          llvm::Value* a, llvm::Value* b,
+                                          llvm::Value* c, llvm::Value* d);
+
   // Returns a ElementGenerator for an RNG HloInstruction using the Philox
   // random number generation algorithm.
   llvm_ir::ElementGenerator MakePhiloxRngElementGenerator(
       const HloInstruction* hlo,
       const HloToElementGeneratorMap& operand_to_generator);
+
   // Converts the raw value generated by a random number generation algorithm
   // to the distribution requested by the RNG HloInstruction.
+  //
+  // Precondition: raw_value has at least as many bits as hlo's element type.
   StatusOr<llvm::Value*> ConvertValueForDistribution(
       const HloInstruction* hlo,
       const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
index 852f34e06df35242b13110ae4411b8c969c26019..ac18346faa120f2d08ad6eba437266198aa6aff7 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
@@ -32,7 +32,7 @@ class ElementalIrEmitterExecutionTest : public HloTestBase {
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsForTest());
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                            ParseHloString(hlo_text, config));
+                            ParseAndReturnVerifiedModule(hlo_text, config));
     EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), args, nullopt));
   }
 };
@@ -60,5 +60,38 @@ ENTRY main {
   Literal rhs = LiteralUtil::CreateR3<int32>({{{3}, {4}}});
   RunTest(hlo_text, {&lhs, &rhs});
 }
+
+XLA_TEST_F(ElementalIrEmitterExecutionTest, BatchDot) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+fused_computation.1 {
+  param_0 = f64[1,1,8]{2,1,0} parameter(0)
+  r.1 = f64[2,4]{1,0} reshape(param_0)
+  param_1 = f64[1,2,2,2,1]{4,3,2,1,0} parameter(1)
+  r.2 = f64[2,4,1]{2,1,0} reshape(param_1)
+  ROOT dot = f64[2,1]{1,0} dot(r.1, r.2), lhs_batch_dims={0},
+                                          lhs_contracting_dims={1},
+                                          rhs_batch_dims={0},
+                                          rhs_contracting_dims={1}
+}
+
+ENTRY resampler_Resampler.49 {
+  p0 = f64[1,1,8]{2,1,0} parameter(0)
+  p1 = f64[1,2,2,2,1]{4,3,2,1,0} parameter(1)
+  ROOT f = f64[2,1]{1,0} fusion(p0, p1), kind=kLoop, calls=fused_computation.1
+}
+)";
+
+  HloModuleConfig config;
+  auto debug_options = GetDebugOptionsForTest();
+  // Disable the layout assignment pass because it would throw away the layouts
+  // in the fusion computation, but not recreate them.
+  debug_options.add_xla_disable_hlo_passes("layout-assignment");
+  config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_text, config));
+  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{4e-3, 4e-3}));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 10b8c01ff1383658fcfb2271c177ba54347f985a..7b60c983b3093f1e8f91cacef1c45b50e0d9ac38 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/env.h"
 
-
 namespace xla {
 
 StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
@@ -138,8 +138,6 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
     XLA_LOG_LINES(
         tensorflow::INFO,
         profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
-    hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
-                                         profile_ptr.get());
   }
 
   return return_value;
@@ -147,37 +145,4 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
 
 int64 Executable::SizeInBytes() { return -1; }
 
-Status Executable::DumpHloSnapshot() {
-  TF_RET_CHECK(dumping_snapshot());
-  TF_RET_CHECK(hlo_snapshot_->has_hlo() &&
-               hlo_snapshot_->hlo().has_hlo_module());
-  const string& directory_path =
-      module_config().debug_options().xla_dump_executions_to();
-  const auto& module = hlo_snapshot_->hlo().hlo_module();
-  string filename =
-      absl::StrFormat("computation_%d__%s__execution_%d", module.id(),
-                      module.entry_computation_name(), ++execution_count_);
-  return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_);
-}
-
-/* static */ Status Executable::DumpToDirectory(
-    const string& directory_path, string filename,
-    const HloSnapshot& hlo_session) {
-  tensorflow::Env* env = tensorflow::Env::Default();
-  if (!env->IsDirectory(directory_path).ok()) {
-    // NB! CreateDir does not work reliably with multiple XLA threads -- two
-    // threads can race to observe the absence of the dump directory and
-    // simultaneously try to create it, causing the "losing" thread to get a
-    // "directory already exists" error.
-    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
-  }
-  filename = SanitizeFileName(std::move(filename));
-  string file_path = tensorflow::io::JoinPath(directory_path, filename);
-  string result;
-  TF_RET_CHECK(
-      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
-  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
-                                       result);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index b34bca55a48b113c325dbf28c03f7a0f5b71f658..a08ec181d494cb575b291d3aabbec6c1e3d56226 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -184,11 +184,6 @@ class Executable {
   }
   bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; }
   HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); }
-  Status DumpHloSnapshot();
-
-  // Dump hlo snapshot to directory_path/filename.
-  static Status DumpToDirectory(const string& directory_path, string filename,
-                                const HloSnapshot& hlo_session);
 
  protected:
   mutable tensorflow::mutex mutex_;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 85409b330b11537158059dcce8c2a96c98d38f30..f16a4485550a4262be8089c7d6c7c8252830dc1b 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -26,7 +26,7 @@ namespace xla {
 
 namespace {
 
-// Helper to replace the called computation at a while-, call-, or
+// Helper to replace the called computation at a while-, call-, case-, or
 // conditional-instruction. This function replaces exactly one instance of
 // 'computation' with 'new_computation' even if 'instruction' calls
 // 'computation' more than once.
@@ -49,11 +49,14 @@ void ReplaceCalledComputation(HloInstruction* instruction,
       break;
     }
     case HloOpcode::kConditional: {
-      if (computation == instruction->true_computation()) {
-        instruction->set_true_computation(new_computation);
-      } else {
-        CHECK_EQ(computation, instruction->false_computation());
-        instruction->set_false_computation(new_computation);
+      for (int b = 0; b < instruction->branch_count(); ++b) {
+        if (b == instruction->branch_count() - 1) {
+          CHECK_EQ(computation, instruction->branch_computation(b));
+        }
+        if (computation == instruction->branch_computation(b)) {
+          instruction->set_branch_computation(b, new_computation);
+          break;
+        }
       }
       break;
     }
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index 8eeb930b48165a2e3c622581e05cb5f7063fa1fa..ef35311b08b8ada098b3dd1f3c70692b8f148add 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -81,8 +81,9 @@ class FlattenCallGraphTest : public HloTestBase {
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     HloInstruction* zero = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
+    builder.AddInstruction(
+        HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), param0,
+                                      zero, ComparisonDirection::kGt));
     return builder.Build();
   }
 
@@ -158,9 +159,9 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
             0, ShapeUtil::MakeShape(PRED, {}), "param0"));
     HloInstruction* false_constant = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
-    builder.AddInstruction(
-        HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
-                                     HloOpcode::kEq, param0, false_constant));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        ShapeUtil::MakeShape(PRED, {}), param0, false_constant,
+        ComparisonDirection::kEq));
     cond_computation = module->AddEmbeddedComputation(builder.Build());
   }
 
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 01cef499665c050d4453382289168276028e1d26..1838f65e6eae3108b8c5cbd43e221140c3777a2b 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -112,6 +112,14 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
     int64 operand_rank) {
   HloComputation* computation = index_vector->parent();
   const Shape& index_shape = index_vector->shape();
+
+  if (operand_rank == 0) {
+    // This is Gather from a scalar. So, the index vector in operand space must
+    // be a zero-sized vector.
+    return computation->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateFromDimensions(index_shape.element_type(), {0})));
+  }
+
   HloInstruction* zero =
       computation->AddInstruction(HloInstruction::CreateConstant(
           LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1})));
@@ -153,10 +161,9 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
            dim_numbers.index_vector_dim() ==
                gather.operand(1)->shape().dimensions_size());
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   HloInstruction* index_vector;
 
@@ -222,7 +229,7 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
       {operand, start_indices, updated_accumulator}};
 }
 
-static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
+static HloInstruction* CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
     absl::Span<const int64> slice_sizes, int64 gather_loop_trip_count,
     const GatherDimensionNumbers& dim_numbers) {
@@ -297,7 +304,7 @@ static StatusOr<HloInstruction*> PermuteBatchAndOffsetDims(
 // [3,1] out of operand into an accumulator of shape [4,3,1].  We then
 // reshape this result to [2,2,3] and finally transpose it to [2,3,2].
 
-StatusOr<HloInstruction*> GatherExpander::ExpandGather(
+StatusOr<HloInstruction*> GatherExpander::ExpandInstruction(
     HloInstruction* gather_instr) {
   CHECK(!ShapeUtil::IsZeroElementArray(gather_instr->shape()));
 
@@ -332,12 +339,10 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
   CHECK_EQ(gather_loop_trip_count,
            canonical_start_indices->shape().dimensions(0));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_init,
-      CreateGatherLoopAccumulatorInitValue(
-          computation, output_shape.element_type(),
-          gather_instr->gather_slice_sizes(), gather_loop_trip_count,
-          gather_instr->gather_dimension_numbers()));
+  HloInstruction* accumulator_init = CreateGatherLoopAccumulatorInitValue(
+      computation, output_shape.element_type(),
+      gather_instr->gather_slice_sizes(), gather_loop_trip_count,
+      gather_instr->gather_dimension_numbers());
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
@@ -364,25 +369,11 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
                                    output_rank);
 }
 
-StatusOr<bool> GatherExpander::Run(HloModule* module) {
-  auto is_nontrivial_gather = [](HloInstruction* inst) {
-    return inst->opcode() == HloOpcode::kGather &&
-           // Avoid expanding gather ops that produce zero sized tensors,
-           // instead punt these to ZeroSizedHloElimination.
-           !ShapeUtil::IsZeroElementArray(inst->shape());
-  };
-
-  std::vector<HloInstruction*> gather_instrs;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    absl::c_copy_if(computation->instructions(),
-                    std::back_inserter(gather_instrs), is_nontrivial_gather);
-  }
-
-  for (HloInstruction* inst : gather_instrs) {
-    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandGather(inst));
-    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
-  }
-
-  return !gather_instrs.empty();
+bool GatherExpander::InstructionMatchesPattern(HloInstruction* inst) {
+  return inst->opcode() == HloOpcode::kGather &&
+         // Avoid expanding gather ops that produce zero sized tensors,
+         // instead punt these to ZeroSizedHloElimination.
+         !ShapeUtil::IsZeroElementArray(inst->shape());
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
index 8af9c6b71fbc391bf7c0e9809e979b65135a6df3..5625a37cb46ca5b70f69d86bc424f6512bfb293f 100644
--- a/tensorflow/compiler/xla/service/gather_expander.h
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -16,20 +16,22 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
 
 namespace xla {
 
 // This pass rewrites gather operations into (roughly) while loops of dynamic
 // slices.  This lets backends that don't support gather directly to
 // nevertheless have a minimum level of support.
-class GatherExpander : public HloModulePass {
+class GatherExpander : public OpExpanderPass {
  public:
   absl::string_view name() const override { return "gather_expander"; }
-  StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  StatusOr<HloInstruction*> ExpandGather(HloInstruction* gather_instr);
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* gather_inst) override;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 7d450f4b53cdea209f2ef10ba785be6ec3b8bf8d..d6a7ec90b59adf72d1e9b447e419cfc8d79fcf4e 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -58,7 +57,8 @@ Status GenericTransferManager::WriteSingleTupleIndexTable(
 
 void GenericTransferManager::TransferLiteralFromDevice(
     se::Stream* stream, const ShapedBuffer& device_buffer,
-    MutableBorrowingLiteral literal, std::function<void(Status)> done) {
+    MutableBorrowingLiteral literal, std::function<void(Status)> done,
+    const TransferMetadata* /*transfer_metadata*/) {
   Status status = stream->BlockHostUntilDone();
   if (!status.ok()) {
     return done(status);
@@ -98,7 +98,8 @@ Status GenericTransferManager::TransferLiteralFromDeviceInternal(
 
 Status GenericTransferManager::TransferLiteralToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
-    const ShapedBuffer& device_buffer) {
+    const ShapedBuffer& device_buffer,
+    const TransferMetadata* /*transfer_metadata*/) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
           << ShapeUtil::HumanString(shape)
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 86c8b1c145a25149a25e7b272babc5c858d476af..acfd8dd64c1a907977bd8d893c9e9dc87425eab5 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -40,14 +40,15 @@ class GenericTransferManager : public TransferManager {
 
   se::Platform::Id PlatformId() const override;
 
-  void TransferLiteralFromDevice(se::Stream* stream,
-                                 const ShapedBuffer& device_buffer,
-                                 MutableBorrowingLiteral literal,
-                                 std::function<void(Status)> done) override;
+  void TransferLiteralFromDevice(
+      se::Stream* stream, const ShapedBuffer& device_buffer,
+      MutableBorrowingLiteral literal, std::function<void(Status)> done,
+      const TransferMetadata* transfer_metadata) override;
 
   Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
-      const ShapedBuffer& device_buffer) override;
+      const ShapedBuffer& device_buffer,
+      const TransferMetadata* transfer_metadata) override;
 
   Status TransferLiteralToInfeed(se::StreamExecutor* executor,
                                  const LiteralSlice& literal) override;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index dc17aa4426236f54e5f03c28634278d45f462158..53cb8c4f49e67d5cc2c37a8c8b7b95d470bce919 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -7,7 +7,7 @@ load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library", "if_cuda")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -156,7 +156,6 @@ cc_library(
         "ir_emitter_unnested.h",
     ],
     deps = [
-        ":backend_configs",
         ":buffer_allocations",
         ":cudnn_conv_runner",
         ":elemental_ir_emitter",
@@ -164,8 +163,10 @@ cc_library(
         ":gpu_executable",
         ":hlo_to_ir_bindings",
         ":ir_emission_utils",
+        ":nccl_all_reduce_thunk",
         ":parallel_loop_emitter",
         ":partition_assignment",
+        ":thunk",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -179,6 +180,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:name_uniquer",
+        "//tensorflow/compiler/xla/service:pattern_matcher",
         "//tensorflow/compiler/xla/service:while_loop_analysis",
         "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
@@ -287,9 +289,44 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "thunk",
+    srcs = ["thunk.cc"],
+    hdrs = ["thunk.h"],
+    deps = [
+        ":buffer_allocations",
+        ":hlo_execution_profiler",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+tf_cuda_library(
+    name = "nccl_all_reduce_thunk",
+    srcs = ["nccl_all_reduce_thunk.cc"],
+    hdrs = ["nccl_all_reduce_thunk.h"],
+    deps = [
+        ":buffer_allocations",
+        ":hlo_execution_profiler",
+        ":thunk",
+        "@com_google_absl//absl/synchronization",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:buffer_assignment",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/cuda:cuda_activation",
+        "//tensorflow/stream_executor/cuda:cuda_gpu_executor",
+    ] + if_cuda([
+        "@local_config_nccl//:nccl",
+    ]),
+)
+
 cc_library(
     name = "gpu_executable",
     srcs = [
+        "cholesky_thunk.cc",
         "conditional_thunk.cc",
         "convolution_thunk.cc",
         "copy_thunk.cc",
@@ -303,12 +340,13 @@ cc_library(
         "memset_thunk.cc",
         "outfeed_thunk.cc",
         "sequential_thunk.cc",
-        "thunk.cc",
         "thunk_schedule.cc",
+        "triangular_solve_thunk.cc",
         "tuple_thunk.cc",
         "while_thunk.cc",
     ],
     hdrs = [
+        "cholesky_thunk.h",
         "conditional_thunk.h",
         "convolution_thunk.h",
         "copy_thunk.h",
@@ -322,20 +360,23 @@ cc_library(
         "memset_thunk.h",
         "outfeed_thunk.h",
         "sequential_thunk.h",
-        "thunk.h",
         "thunk_schedule.h",
+        "triangular_solve_thunk.h",
         "tuple_thunk.h",
         "while_thunk.h",
     ],
     deps = [
         ":buffer_allocations",
         ":cudnn_conv_runner",
+        ":cusolver_context",
         ":hlo_execution_profiler",
         ":infeed_manager",
         ":ir_emission_utils",
+        ":nccl_all_reduce_thunk",  # fixdeps: keep
         ":outfeed_manager",
         ":partition_assignment",
         ":stream_assignment",
+        ":thunk",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_tree",
@@ -364,6 +405,10 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor:device_memory",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -399,16 +444,21 @@ cc_library(
         ":backend_configs",
         ":buffer_comparator",
         ":cudnn_conv_runner",
+        ":gpu_autotuning_proto",
         ":gpu_executable",
         ":ir_emission_utils",
+        ":scratch_allocator",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:autotuning_proto_cc",
         "//tensorflow/core:lib",
+        "//tensorflow/core:logger",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/util/proto:proto_utils",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
@@ -416,6 +466,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "scratch_allocator",
+    srcs = ["scratch_allocator.cc"],
+    hdrs = ["scratch_allocator.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
 cc_library(
     name = "cudnn_conv_runner",
     srcs = ["cudnn_conv_runner.cc"],
@@ -472,6 +534,43 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "cusolver_context",
+    srcs = ["cusolver_context.cc"],
+    hdrs = ["cusolver_context.h"],
+    deps = [
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:blas",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cusolver",
+    ],
+)
+
+cc_library(
+    name = "cusolver_rewriter",
+    srcs = ["cusolver_rewriter.cc"],
+    hdrs = ["cusolver_rewriter.h"],
+    deps = [
+        ":cusolver_context",
+        ":ir_emission_utils",
+        ":scratch_allocator",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:blas",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 cc_library(
     name = "instruction_fusion",
     srcs = ["instruction_fusion.cc"],
@@ -484,6 +583,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:instruction_fusion",
         "//tensorflow/compiler/xla/service:pattern_matcher",
+        "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -555,6 +655,44 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gpu_sanitize_constant_names",
+    srcs = ["gpu_sanitize_constant_names.cc"],
+    hdrs = ["gpu_sanitize_constant_names.h"],
+    deps = [
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_sanitize_constant_names_test",
+    srcs = ["gpu_sanitize_constant_names_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":gpu_sanitize_constant_names",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:computation_layout",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "fusion_merger",
     srcs = ["fusion_merger.cc"],
@@ -567,6 +705,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
@@ -640,7 +779,8 @@ cc_library(
     srcs = ["gpu_transfer_manager.cc"],
     hdrs = ["gpu_transfer_manager.h"],
     deps = [
-        ":gpu_compiler",
+        ":infeed_manager",
+        ":nvptx_compiler",
         ":outfeed_manager",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -653,7 +793,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
-        "//tensorflow/compiler/xla/service/gpu:infeed_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/memory",
@@ -663,15 +802,17 @@ cc_library(
 )
 
 cc_library(
-    name = "gpu_compiler",
+    name = "nvptx_compiler",
     srcs = ["nvptx_compiler.cc"],
     hdrs = ["nvptx_compiler.h"],
     deps = [
+        ":cudnn_batchnorm_rewriter",
         ":cudnn_conv_algorithm_picker",
         ":cudnn_conv_pad_for_tensor_cores",
         ":cudnn_conv_padding_legalization",
         ":cudnn_conv_rewriter",
         ":cudnn_fused_conv_rewriter",
+        ":cusolver_rewriter",
         ":fusion_merger",
         ":gpu_constants",
         ":gpu_copy_insertion",
@@ -679,6 +820,7 @@ cc_library(
         ":gpu_hlo_schedule",
         ":gpu_hlo_support_checker",
         ":gpu_layout_assignment",
+        ":gpu_sanitize_constant_names",
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
@@ -700,6 +842,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
@@ -719,12 +862,13 @@ cc_library(
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:sort_simplifier",
+        "//tensorflow/compiler/xla/service:stable_sort_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service:while_loop_trip_count_annotator",
         "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination",
-        "//tensorflow/compiler/xla/service/gpu:cudnn_batchnorm_rewriter",
         "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:cuda_libdevice_path",
@@ -960,6 +1104,7 @@ cc_library(
     hdrs = ["gpu_fusible.h"],
     deps = [
         ":ir_emission_utils",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/service:hlo",
     ],
 )
@@ -998,8 +1143,8 @@ tf_cc_test(
     srcs = ["cudnn_fused_conv_rewriter_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
+        ":ir_emission_utils",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
@@ -1042,3 +1187,13 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
+
+xla_proto_library(
+    name = "gpu_autotuning_proto",
+    srcs = ["gpu_autotuning.proto"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/core:autotuning_proto_cc",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7daef16cb62338cfa5b027136ecd4262288eec8d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc
@@ -0,0 +1,119 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace xla {
+namespace gpu {
+
+CholeskyThunk::CholeskyThunk(const CholeskyOptions& options,
+                             BufferAllocation::Slice a_buffer,
+                             BufferAllocation::Slice workspace_buffer,
+                             BufferAllocation::Slice info_buffer,
+                             PrimitiveType type, int64 batch_size, int64 n,
+                             const HloInstruction* hlo)
+    : Thunk(Kind::kCholesky, hlo),
+      uplo_(options.lower() ? se::blas::UpperLower::kLower
+                            : se::blas::UpperLower::kUpper),
+      a_buffer_(a_buffer),
+      workspace_buffer_(workspace_buffer),
+      info_buffer_(info_buffer),
+      type_(type),
+      batch_size_(batch_size),
+      a_batch_stride_(n * n *
+                      ShapeUtil::ByteSizeOfPrimitiveType(
+                          hlo->operand(0)->shape().element_type())),
+      n_(n) {}
+
+Status CholeskyThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  VLOG(3) << "type=" << PrimitiveType_Name(type_)
+          << " uplo=" << se::blas::UpperLowerString(uplo_)
+          << " batch_size=" << batch_size_ << " n=" << n_
+       << " a=" << a_buffer_.ToString()
+       << " workspace=" << workspace_buffer_.ToString()
+       << " info=" << info_buffer_.ToString();
+
+  CusolverContext* context;
+  {
+    tensorflow::mutex_lock lock(mu_);
+    auto result = contexts_.emplace(stream, CusolverContext());
+    if (result.second) {
+      TF_ASSIGN_OR_RETURN(result.first->second,
+                          CusolverContext::Create(stream));
+    }
+    context = &result.first->second;
+  }
+
+  char* a_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(a_buffer_).opaque());
+  int* info_base = static_cast<int*>(
+      buffer_allocations.GetDeviceAddress(info_buffer_).opaque());
+  se::DeviceMemoryBase workspace_data =
+      buffer_allocations.GetDeviceAddress(workspace_buffer_);
+  for (int64 i = 0; i < batch_size_; ++i) {
+    se::DeviceMemoryBase a_data =
+        se::DeviceMemoryBase(a_base + i * a_batch_stride_, a_batch_stride_);
+    se::DeviceMemory<int> info_data(
+        se::DeviceMemoryBase(info_base + i, sizeof(int)));
+    switch (type_) {
+      case F32: {
+        TF_RETURN_IF_ERROR(
+            context->Potrf(uplo_, n_, se::DeviceMemory<float>(a_data), n_,
+                           info_data, se::DeviceMemory<float>(workspace_data)));
+        break;
+      }
+      case F64: {
+        TF_RETURN_IF_ERROR(context->Potrf(
+            uplo_, n_, se::DeviceMemory<double>(a_data), n_, info_data,
+            se::DeviceMemory<double>(workspace_data)));
+        break;
+      }
+      case C64: {
+        TF_RETURN_IF_ERROR(context->Potrf(
+            uplo_, n_, se::DeviceMemory<std::complex<float>>(a_data), n_,
+            info_data, se::DeviceMemory<std::complex<float>>(workspace_data)));
+        break;
+      }
+      case C128: {
+        TF_RETURN_IF_ERROR(context->Potrf(
+            uplo_, n_, se::DeviceMemory<std::complex<double>>(a_data), n_,
+            info_data, se::DeviceMemory<std::complex<double>>(workspace_data)));
+        break;
+      }
+      default:
+        return InvalidArgument("Invalid type for cholesky %s",
+                               PrimitiveType_Name(type_));
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..cde245a7e8bc0909059d4643cae3de138bddcdec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_
+
+#include "absl/base/thread_annotations.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+// This class stores everything that StreamExecutor needs to launch a Cholesky
+// decomposition (LAPACK potrf). It is generated by IrEmitter.
+//
+// Thread-compatible.
+class CholeskyThunk : public Thunk {
+ public:
+  static StatusOr<int64> ScratchBufferSize(int64 n);
+  CholeskyThunk(const CholeskyOptions& options,
+                BufferAllocation::Slice a_buffer,
+                BufferAllocation::Slice workspace_buffer,
+                BufferAllocation::Slice info_buffer,
+                PrimitiveType type,
+                int64 batch_size, int64 n, const HloInstruction* hlo);
+
+  CholeskyThunk(const CholeskyThunk&) = delete;
+  CholeskyThunk& operator=(const CholeskyThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  se::blas::UpperLower uplo_;
+
+  const BufferAllocation::Slice a_buffer_;
+  const BufferAllocation::Slice workspace_buffer_;
+  const BufferAllocation::Slice info_buffer_;
+
+  const PrimitiveType type_;
+  const int64 batch_size_;
+  const int64 a_batch_stride_;
+  const int64 n_;
+
+  tensorflow::mutex mu_;
+  absl::flat_hash_map<se::Stream*, CusolverContext> contexts_ GUARDED_BY(mu_);
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index 9ed523998bf07567133fdac0e40b12b8ce4ea3b0..ea6392498264f25d53bec2309bfdf7bdcf6a2a2e 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -24,25 +24,35 @@ namespace xla {
 namespace gpu {
 
 ConditionalThunk::ConditionalThunk(
-    const BufferAllocation::Slice& predicate_buffer_index,
-    const BufferAllocation::Slice& true_operand_buffer_index,
-    const BufferAllocation::Slice& false_operand_buffer_index,
-    ThunkSequence true_thunk_sequence, ThunkSequence false_thunk_sequence,
+    const BufferAllocation::Slice& branch_index_buffer_index,
+    absl::Span<const BufferAllocation::Slice> branch_operand_buffer_indexes,
+    std::vector<ThunkSequence> branch_thunk_sequences,
     const HloInstruction* hlo)
     : Thunk(Kind::kConditional, hlo),
-      predicate_buffer_index_(predicate_buffer_index),
-      true_operand_buffer_index_(true_operand_buffer_index),
-      false_operand_buffer_index_(false_operand_buffer_index),
-      // Pass nullptr as the HloInstruction* to the true_thunk_ and false_thunk_
-      // constructors because these SequentialThunks are logically "part of"
-      // this ConditionalThunk, and shouldn't be profiled separately from it.
-      true_thunk_(std::move(true_thunk_sequence), nullptr),
-      false_thunk_(std::move(false_thunk_sequence), nullptr) {}
+      branch_index_is_bool_(hlo->operand(0)->shape().element_type() == PRED),
+      branch_index_buffer_index_(branch_index_buffer_index),
+      branch_operand_buffer_indexes_(branch_operand_buffer_indexes.begin(),
+                                     branch_operand_buffer_indexes.end()) {
+  // Pass nullptr as the HloInstruction* to the branch_thunks_
+  // constructors because these SequentialThunks are logically "part of"
+  // this ConditionalThunk, and shouldn't be profiled separately from it.
+  branch_thunks_.reserve(branch_thunk_sequences.size());
+  for (auto& branch_thunk_sequence : branch_thunk_sequences) {
+    branch_thunks_.emplace_back(
+        new SequentialThunk(std::move(branch_thunk_sequence), nullptr));
+  }
+}
 
 Status ConditionalThunk::Initialize(const GpuExecutable& executable,
                                     se::StreamExecutor* executor) {
-  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable, executor));
-  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable, executor));
+  if (branch_index_is_bool_) {
+    TF_RET_CHECK(branch_thunks_.size() == 2);
+  } else {
+    TF_RET_CHECK(!branch_thunks_.empty());
+  }
+  for (auto& branch_thunk : branch_thunks_) {
+    TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor));
+  }
   return Status::OK();
 }
 
@@ -51,31 +61,38 @@ Status ConditionalThunk::ExecuteOnStream(
     HloExecutionProfiler* profiler) {
   auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction());
   // Copy the predicate value from device.
-  bool predicate;
-  se::DeviceMemoryBase predicate_address =
-      buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
-  stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
+  int32 branch_index = -1;
+  bool pred = false;
+  se::DeviceMemoryBase branch_index_address =
+      buffer_allocations.GetDeviceAddress(branch_index_buffer_index_);
+  if (branch_index_is_bool_) {
+    stream->ThenMemcpy(&pred, branch_index_address, sizeof(bool));
+  } else {
+    stream->ThenMemcpy(&branch_index, branch_index_address, sizeof(int32));
+  }
 
   Status block_status = stream->BlockHostUntilDone();
   if (!block_status.ok()) {
-    return InternalError("Failed to retrieve predicate value on stream %p: %s.",
-                         stream, block_status.error_message());
+    return InternalError(
+        "Failed to retrieve branch_index value on stream %p: %s.", stream,
+        block_status.error_message());
   }
-
-  // Execute the true or the false computation depending on the value of the
-  // predicate.
-  if (predicate) {
-    profiler->StartHloComputation();
-    TF_RETURN_IF_ERROR(
-        true_thunk_.ExecuteOnStream(buffer_allocations, stream, profiler));
-    profiler->FinishHloComputation(hlo_instruction()->true_computation());
+  if (branch_index_is_bool_) {
+    branch_index = pred ? 0 : 1;
   } else {
-    profiler->StartHloComputation();
-    TF_RETURN_IF_ERROR(
-        false_thunk_.ExecuteOnStream(buffer_allocations, stream, profiler));
-    profiler->FinishHloComputation(hlo_instruction()->false_computation());
+    // Handle default scenario for branch_index not in [0, num_branches).
+    if (branch_index < 0 || branch_index >= hlo_instruction()->branch_count()) {
+      branch_index = hlo_instruction()->branch_count() - 1;
+    }
   }
 
+  // Execute the branch computation corresponding to the value of branch_index.
+  profiler->StartHloComputation();
+  TF_RETURN_IF_ERROR(branch_thunks_[branch_index]->ExecuteOnStream(
+      buffer_allocations, stream, profiler));
+  profiler->FinishHloComputation(
+      hlo_instruction()->branch_computation(branch_index));
+
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
index aef24342c9fe182eb54b1c2beff840a76e7b8115..c0093ca6397e636bee953ddf0af8c48caaaadae0 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -16,6 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
 
+#include <memory>
+#include <vector>
+
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
@@ -38,12 +42,11 @@ namespace gpu {
 // false computation share the same allocation.
 class ConditionalThunk : public Thunk {
  public:
-  ConditionalThunk(const BufferAllocation::Slice& predicate_buffer_index,
-                   const BufferAllocation::Slice& true_operand_buffer_index,
-                   const BufferAllocation::Slice& false_operand_buffer_index,
-                   ThunkSequence true_thunk_sequence,
-                   ThunkSequence false_thunk_sequence,
-                   const HloInstruction* hlo);
+  ConditionalThunk(
+      const BufferAllocation::Slice& branch_index_buffer_index,
+      absl::Span<const BufferAllocation::Slice> branch_operand_buffer_indexes,
+      std::vector<ThunkSequence> branch_thunk_sequences,
+      const HloInstruction* hlo);
 
   ConditionalThunk(const ConditionalThunk&) = delete;
   ConditionalThunk& operator=(const ConditionalThunk&) = delete;
@@ -55,11 +58,10 @@ class ConditionalThunk : public Thunk {
                          HloExecutionProfiler* profiler) override;
 
  private:
-  BufferAllocation::Slice predicate_buffer_index_;
-  BufferAllocation::Slice true_operand_buffer_index_;
-  BufferAllocation::Slice false_operand_buffer_index_;
-  SequentialThunk true_thunk_;
-  SequentialThunk false_thunk_;
+  const bool branch_index_is_bool_;
+  BufferAllocation::Slice branch_index_buffer_index_;
+  std::vector<BufferAllocation::Slice> branch_operand_buffer_indexes_;
+  std::vector<std::unique_ptr<SequentialThunk>> branch_thunks_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
index 60289506524759580dbb9b82147c78c4ce1cb25e..2cceb0422d08ff7951308b0727941f5437785447 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
@@ -188,13 +188,8 @@ Status Visitor::HandleBatchNormGrad(HloInstruction* batch_norm) {
           computation_->AddInstruction(HloInstruction::CreateBroadcast(
               batch_norm->operand(3)->shape(), epsilon, {}))));
   HloInstruction* inverse_stddev =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          var_plus_epsilon->shape(), HloOpcode::kPower, var_plus_epsilon,
-          computation_->AddInstruction(HloInstruction::CreateBroadcast(
-              var_plus_epsilon->shape(),
-              computation_->AddInstruction(HloInstruction::CreateConstant(
-                  LiteralUtil::CreateR0<float>(-.5))),
-              {}))));
+      computation_->AddInstruction(HloInstruction::CreateUnary(
+          var_plus_epsilon->shape(), HloOpcode::kRsqrt, var_plus_epsilon));
 
   std::vector<HloInstruction*> operands(batch_norm->operands().begin(),
                                         batch_norm->operands().end());
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 309b0aca64954e64509d731dce28ce9d8da4ee43..02eb191cf58a2e7723de2c37bdcf3875b350be9a 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -14,17 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
+#include "google/protobuf/any.pb.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 
 namespace xla {
 namespace gpu {
@@ -32,49 +38,8 @@ namespace {
 
 using absl::optional;
 using se::DeviceMemoryBase;
-using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
-
-class ScratchAllocator : public se::ScratchAllocator {
- public:
-  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
-      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
-
-  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
-    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
-  }
-  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
-
-  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(se::Stream* stream,
-                                                  int64 byte_size) override;
-
- private:
-  const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
-  int64 total_allocated_bytes_ = 0;
-};
-
-StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
-    se::Stream* stream, int64 byte_size) {
-  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
-  if (byte_size > GetMemoryLimitInBytes(stream)) {
-    return se::port::Status(
-        se::port::error::RESOURCE_EXHAUSTED,
-        absl::StrFormat(
-            "Allocating %d bytes exceeds the memory limit of %d bytes.",
-            byte_size, GetMemoryLimitInBytes(stream)));
-  }
-
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
-                      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                                  /*retry_on_failure=*/false));
-  total_allocated_bytes_ += byte_size;
-
-  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
-  allocated_buffers_.push_back(std::move(allocated_buffer));
-  return se::DeviceMemory<uint8>(buffer_addr);
-}
+using tensorflow::AutotuneResult;
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          se::StreamExecutor* stream_exec) {
@@ -132,6 +97,31 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
   return tensorflow::mutex_lock{it->second};
 }
 
+tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  tensorflow::CudnnVersion cudnn_version;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      cudnn_version.set_major(version.major_version());
+      cudnn_version.set_minor(version.minor_version());
+      cudnn_version.set_patch(version.patch());
+    }
+  }
+  return cudnn_version;
+}
+
+tensorflow::ComputeCapability GetComputeCapability(
+    se::StreamExecutor* stream_executor) {
+  tensorflow::ComputeCapability cc;
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  cc.set_major(cc_major);
+  cc.set_minor(cc_minor);
+  return cc;
+}
+
 }  // anonymous namespace
 
 // We could have caching here so that we don't redo this work for two identical
@@ -145,8 +135,7 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
-CudnnConvAlgorithmPicker::PickBestAlgorithm(
+StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
     const HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
   // with some work on the HLO routines.
@@ -233,8 +222,6 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
           &stream, ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
   initialize_buffer(result_buffer);
 
-  se::dnn::ProfileResult best_result;
-  int64 best_result_bytes_used = 0;
   TF_ASSIGN_OR_RETURN(auto backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
 
@@ -244,6 +231,7 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
   // this algorithm considered correct, though.
   optional<AlgorithmDesc> first_algorithm;
   TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
+  std::vector<AutotuneResult> profile_results;
   for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
     se::dnn::ProfileResult profile_result;
@@ -254,73 +242,111 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(
     RunConvOptions options;
     options.profile_result = &profile_result;
     options.algo_override = alg;
-    bool launch_ok =
+    Status launch_status =
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
-                     &scratch_allocator, &stream, options)
-            .ok();
-
-    if (launch_ok && profile_result.is_valid()) {
-      const bool crash_on_checking_failure =
-          instr->GetModule()
-              ->config()
-              .debug_options()
-              .xla_gpu_crash_on_verification_failures();
-      if (comparator.has_value()) {
-        StatusOr<bool> result = comparator->CompareEqual(
-            se::DeviceMemory<Eigen::half>(result_buffer));
-        if (!result.ok()) {
-          LOG(ERROR) << "Unable to compare "
-                     << AlgorithmToString(*first_algorithm) << " against "
-                     << AlgorithmToString(alg) << " for " << instr->ToString()
-                     << ": " << result.status();
-          CHECK(!crash_on_checking_failure);
-        } else if (!result.ValueOrDie()) {
-          LOG(ERROR) << "Results mismatch between different convolution "
-                        "algorithms. This is likely a bug in convolution, or "
-                        "an excessive loss of precision in convolution. "
-                     << instr->ToString() << " for "
-                     << AlgorithmToString(*first_algorithm) << " vs "
-                     << AlgorithmToString(alg);
-          CHECK(!crash_on_checking_failure);
-        }
-      } else if (cross_check_enabled) {
-        auto comp = F16BufferComparator::Create(
-            se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
-            &stream);
-        if (comp.ok()) {
-          comparator.emplace(comp.ConsumeValueOrDie());
-          first_algorithm.emplace(alg);
-        } else {
-          LOG(ERROR) << "Fail to initialize buffer comparator: "
-                     << comp.status() << ", instruction: " << instr->ToString();
-          CHECK(!crash_on_checking_failure);
-        }
+                     &scratch_allocator, &stream, options);
+
+    if (!launch_status.ok()) {
+      continue;
+    }
+
+    if (!profile_result.is_valid()) {
+      continue;
+    }
+
+    profile_results.emplace_back();
+    AutotuneResult& result = profile_results.back();
+    result.mutable_conv()->set_algorithm(alg.algo_id());
+    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
+
+    int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+    result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
+    *result.mutable_success()->mutable_run_time() =
+        tensorflow::proto_utils::ToDurationProto(
+            absl::Milliseconds(profile_result.elapsed_time_in_ms()));
+
+    const bool crash_on_checking_failure =
+        instr->GetModule()
+            ->config()
+            .debug_options()
+            .xla_gpu_crash_on_verification_failures();
+
+    if (comparator.has_value()) {
+      StatusOr<bool> compare_result = comparator->CompareEqual(
+          se::DeviceMemory<Eigen::half>(result_buffer));
+      if (!compare_result.ok()) {
+        LOG(ERROR) << "Unable to compare "
+                   << AlgorithmToString(*first_algorithm) << " against "
+                   << AlgorithmToString(alg) << " for " << instr->ToString()
+                   << ": " << compare_result.status();
+        CHECK(!crash_on_checking_failure);
+      } else if (!compare_result.ValueOrDie()) {
+        LOG(ERROR) << "Results mismatch between different convolution "
+                      "algorithms. This is likely a bug in convolution, or "
+                      "an excessive loss of precision in convolution. "
+                   << instr->ToString() << " for "
+                   << AlgorithmToString(*first_algorithm) << " vs "
+                   << AlgorithmToString(alg);
+        CHECK(!crash_on_checking_failure);
+        auto* failure = result.mutable_reference_conv();
+        failure->set_algorithm(first_algorithm->algo_id());
+        failure->set_tensor_ops_enabled(first_algorithm->tensor_ops_enabled());
       }
-      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
-              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
-              << "ms and using " << NumBytesToString(scratch_bytes_used)
-              << " of scratch (Best result: "
-              << best_result.elapsed_time_in_ms() << "ms, "
-              << NumBytesToString(best_result_bytes_used) << " of scratch)";
-      if (profile_result.elapsed_time_in_ms() <
-          best_result.elapsed_time_in_ms()) {
-        best_result = profile_result;
-        best_result_bytes_used = scratch_bytes_used;
+    } else if (cross_check_enabled) {
+      auto comp = F16BufferComparator::Create(
+          se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
+          &stream);
+      if (comp.ok()) {
+        comparator.emplace(comp.ConsumeValueOrDie());
+        first_algorithm.emplace(alg);
+      } else {
+        LOG(ERROR) << "Fail to initialize buffer comparator: " << comp.status()
+                   << ", instruction: " << instr->ToString();
+        CHECK(!crash_on_checking_failure);
       }
-    } else {
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
     }
   }
-  if (best_result.is_valid()) {
-    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
-            << AlgorithmToString(best_result.algorithm()) << ", takes "
-            << best_result.elapsed_time_in_ms() << "ms, and uses "
-            << best_result_bytes_used << "B of scratch memory.";
-    return AutotuneResult{best_result.algorithm().algo_id(),
-                          best_result.algorithm().tensor_ops_enabled(),
-                          best_result_bytes_used,
-                          absl::Milliseconds(best_result.elapsed_time_in_ms())};
+
+  // Log the autotuning result.
+  {
+    tensorflow::AutotuningLog log;
+    {
+      ConvInstructionLog instr_log;
+      *instr_log.mutable_instruction() = instr->ToProto();
+      for (const auto* op : instr->operands()) {
+        *instr_log.add_operand_shapes() = op->shape().ToProto();
+      }
+      log.mutable_instr()->PackFrom(instr_log);
+    }
+    for (const auto& profile : profile_results) {
+      *log.add_results() = profile;
+    }
+    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
+    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
+    VLOG(2) << "Autotuning result:\n" << log.DebugString();
+    tensorflow::Logger::Singleton()->LogProto(log);
+  }
+
+  auto* profile_results_end = profile_results.data() + profile_results.size();
+
+  const AutotuneResult* best_result = std::min_element(
+      profile_results.data(), profile_results_end,
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        // The successful one should have a smaller key, since we are doing
+        // min_element. If they are both unsuccessful, keep the earlier one in
+        // the vector by comparing pointers.
+        return std::make_tuple(!lhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   lhs.success().run_time()),
+                               &lhs) <
+               std::make_tuple(!rhs.has_success(),
+                               tensorflow::proto_utils::FromDurationProto(
+                                   rhs.success().run_time()),
+                               &rhs);
+      });
+
+  if (best_result != profile_results_end && best_result->has_success()) {
+    return *best_result;
   }
 
   return InternalError(
@@ -341,22 +367,23 @@ StatusOr<bool> CudnnConvAlgorithmPicker::RunOnInstruction(
   }
 
   auto best_algo = std::move(best_algo_or).ValueOrDie();
-  VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
-          << " and " << NumBytesToString(best_algo.scratch_bytes)
+  VLOG(1) << "Setting cudnn conv to use algorithm "
+          << best_algo.conv().algorithm() << " and "
+          << NumBytesToString(best_algo.success().scratch_bytes())
           << " of scratch memory: " << instr->ToString()
-          << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
+          << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();
 
   // Replace instr with a new CustomCall which has the correct algorithm, and
   // whose output shape has the appropriate amount of scratch memory.
   HloComputation* computation = instr->parent();
   Shape new_call_shape = ShapeUtil::MakeTupleShape(
       {instr->shape().tuple_shapes(0),
-       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
+       ShapeUtil::MakeShape(U8, {best_algo.success().scratch_bytes()})});
 
   TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
-  backend_config.set_algorithm(best_algo.algorithm);
-  backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
+  backend_config.set_algorithm(best_algo.conv().algorithm());
+  backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());
 
   HloInstruction* new_call = computation->AddInstruction(
       instr->CloneWithNewOperands(new_call_shape, instr->operands()));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 4991db0948589e479a202f4082d96df275f6e088..6ab9c7a9ecec98c9a70ea0578cdb3eb4f1d3c12d 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -47,16 +48,9 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  struct AutotuneResult {
-    int64 algorithm;
-    bool tensor_ops_enabled;
-    int64 scratch_bytes;
-    absl::Duration runtime;
-  };
-
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(
+  StatusOr<tensorflow::AutotuneResult> PickBestAlgorithm(
       const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.cc b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..923b7bc452870f47505711e8abd4ce236be7815a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc
@@ -0,0 +1,159 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
+
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+// Type traits to get CUDA complex types from std::complex<T>.
+template <typename T>
+struct CUDAComplexT {
+  typedef T type;
+};
+template <>
+struct CUDAComplexT<std::complex<float>> {
+  typedef cuComplex type;
+};
+template <>
+struct CUDAComplexT<std::complex<double>> {
+  typedef cuDoubleComplex type;
+};
+
+template <typename T>
+inline typename CUDAComplexT<T>::type* ToDevicePointer(se::DeviceMemory<T> p) {
+  return static_cast<typename CUDAComplexT<T>::type*>(p.opaque());
+}
+
+cublasFillMode_t CUDABlasUpperLower(se::blas::UpperLower uplo) {
+  switch (uplo) {
+    case se::blas::UpperLower::kUpper:
+      return CUBLAS_FILL_MODE_UPPER;
+    case se::blas::UpperLower::kLower:
+      return CUBLAS_FILL_MODE_LOWER;
+    default:
+      LOG(FATAL) << "Invalid value of blas::UpperLower.";
+  }
+}
+
+// Converts a cuSolver status to a Status.
+Status CusolverStatusToStatus(cusolverStatus_t status) {
+  switch (status) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return Status::OK();
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return FailedPrecondition("cuSolver has not been initialized");
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return ResourceExhausted("cuSolver allocation failed");
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return InvalidArgument("cuSolver invalid value error");
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return FailedPrecondition("cuSolver architecture mismatch error");
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return Unknown("cuSolver mapping error");
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return Unknown("cuSolver execution failed");
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return Internal("cuSolver internal error");
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return Unimplemented("cuSolver matrix type not supported error");
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return Unimplemented("cuSolver not supported error");
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return InvalidArgument("cuSolver zero pivot error");
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return FailedPrecondition("cuSolver invalid license error");
+    default:
+      return Unknown("Unknown cuSolver error");
+  }
+}
+
+}  // namespace
+
+StatusOr<CusolverContext> CusolverContext::Create(se::Stream* stream) {
+  cusolverDnHandle_t handle;
+  TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnCreate(&handle)));
+  CusolverContext context(stream, handle);
+
+  // StreamExecutor really should just expose the Cuda stream to clients...
+  const cudaStream_t* cuda_stream =
+      CHECK_NOTNULL(reinterpret_cast<const cudaStream_t*>(
+          stream->implementation()->GpuStreamMemberHack()));
+  TF_RETURN_IF_ERROR(
+      CusolverStatusToStatus(cusolverDnSetStream(handle, *cuda_stream)));
+
+  return std::move(context);
+}
+
+CusolverContext::CusolverContext(se::Stream* stream, cusolverDnHandle_t handle)
+    : stream_(stream), handle_(handle) {}
+
+CusolverContext::CusolverContext(CusolverContext&& other) {
+  handle_ = other.handle_;
+  stream_ = other.stream_;
+  other.handle_ = nullptr;
+  other.stream_ = nullptr;
+}
+
+CusolverContext& CusolverContext::operator=(CusolverContext&& other) {
+  std::swap(handle_, other.handle_);
+  std::swap(stream_, other.stream_);
+  return *this;
+}
+
+CusolverContext::~CusolverContext() {
+  if (handle_) {
+    Status status = CusolverStatusToStatus(cusolverDnDestroy(handle_));
+    if (!status.ok()) {
+      LOG(ERROR) << "cusolverDnDestroy failed: " << status;
+    }
+  }
+}
+
+#define CALL_LAPACK_TYPES(m) \
+  m(float, S) m(double, D) m(std::complex<float>, C) m(std::complex<double>, Z)
+
+#define DN_SOLVER_FN(method, type_prefix) cusolverDn##type_prefix##method
+
+#define POTRF_BUFFER_SIZE_INSTANCE(T, type_prefix)                            \
+  StatusOr<int64> CusolverContext::PotrfBufferSize(                           \
+      se::blas::UpperLower uplo, int n, se::DeviceMemory<T> A, int lda) {     \
+    int size = -1;                                                            \
+    TF_RETURN_IF_ERROR(CusolverStatusToStatus(DN_SOLVER_FN(                   \
+        potrf_bufferSize, type_prefix)(handle(), CUDABlasUpperLower(uplo), n, \
+                                       ToDevicePointer(A), lda, &size)));     \
+    return size;                                                              \
+  }
+
+CALL_LAPACK_TYPES(POTRF_BUFFER_SIZE_INSTANCE);
+
+#define POTRF_INSTANCE(T, type_prefix)                                    \
+  Status CusolverContext::Potrf(                                          \
+      se::blas::UpperLower uplo, int n, se::DeviceMemory<T> A, int lda,   \
+      se::DeviceMemory<int> lapack_info, se::DeviceMemory<T> workspace) { \
+    return CusolverStatusToStatus(DN_SOLVER_FN(potrf, type_prefix)(       \
+        handle(), CUDABlasUpperLower(uplo), n, ToDevicePointer(A), lda,   \
+        ToDevicePointer(workspace), workspace.ElementCount(),             \
+        ToDevicePointer(lapack_info)));                                   \
+  }
+
+CALL_LAPACK_TYPES(POTRF_INSTANCE);
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.h b/tensorflow/compiler/xla/service/gpu/cusolver_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdd89c3a8d599e2291b60abcd67e267a96d3ac8f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.h
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_CONTEXT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_CONTEXT_H_
+
+#include <complex>
+
+#include "cuda/include/cublas_v2.h"
+#include "cuda/include/cusolverDn.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+class CusolverContext {
+ public:
+  static StatusOr<CusolverContext> Create(se::Stream* stream);
+  CusolverContext() = default;
+  ~CusolverContext();
+
+  CusolverContext(const CusolverContext&) = delete;
+  CusolverContext(CusolverContext&&);
+  CusolverContext& operator=(const CusolverContext&) = delete;
+  CusolverContext& operator=(CusolverContext&&);
+
+  se::Stream* stream() const { return stream_; }
+  cusolverDnHandle_t handle() const { return handle_; }
+
+  // Computes the Cholesky factorization A = L * L^T for a single matrix.
+  // Returns Status::OK() if the kernel was launched successfully. See:
+  // http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrf
+  Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory<float> dev_A,
+               int lda, se::DeviceMemory<int> dev_lapack_info,
+               se::DeviceMemory<float> workspace);
+  Status Potrf(se::blas::UpperLower uplo, int n, se::DeviceMemory<double> dev_A,
+               int lda, se::DeviceMemory<int> dev_lapack_info,
+               se::DeviceMemory<double> workspace);
+  Status Potrf(se::blas::UpperLower uplo, int n,
+               se::DeviceMemory<std::complex<float>> dev_A, int lda,
+               se::DeviceMemory<int> dev_lapack_info,
+               se::DeviceMemory<std::complex<float>> workspace);
+  Status Potrf(se::blas::UpperLower uplo, int n,
+               se::DeviceMemory<std::complex<double>> dev_A, int lda,
+               se::DeviceMemory<int> dev_lapack_info,
+               se::DeviceMemory<std::complex<double>> workspace);
+
+  // Returns the size of the `workspace` required by Potrf, in number of
+  // elements of size T.
+  StatusOr<int64> PotrfBufferSize(se::blas::UpperLower uplo, int n,
+                                  se::DeviceMemory<float> dev_A, int lda);
+  StatusOr<int64> PotrfBufferSize(se::blas::UpperLower uplo, int n,
+                                  se::DeviceMemory<double> dev_A, int lda);
+  StatusOr<int64> PotrfBufferSize(se::blas::UpperLower uplo, int n,
+                                  se::DeviceMemory<std::complex<float>> dev_A,
+                                  int lda);
+  StatusOr<int64> PotrfBufferSize(se::blas::UpperLower uplo, int n,
+                                  se::DeviceMemory<std::complex<double>> dev_A,
+                                  int lda);
+
+ private:
+  CusolverContext(se::Stream* stream, cusolverDnHandle_t handle);
+
+  se::Stream* stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_CONTEXT_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7861eb1ef04d4fa5ba5690ee388b77a3f354f88e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc
@@ -0,0 +1,216 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h"
+
+#include <cstdlib>
+#include <numeric>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+void SetFortranLayout(Shape* shape) {
+  LayoutUtil::SetToDefaultLayout(shape);
+  int n = shape->mutable_layout()->minor_to_major_size();
+  CHECK_GE(n, 2);
+  std::swap(shape->mutable_layout()->mutable_minor_to_major()->at(0),
+            shape->mutable_layout()->mutable_minor_to_major()->at(1));
+}
+
+StatusOr<HloInstruction*> CreateCholesky(CusolverContext* context,
+                                         ScratchAllocator* allocator,
+                                         HloInstruction* operand,
+                                         const CholeskyOptions& options,
+                                         const OpMetadata& metadata) {
+  HloComputation* computation = operand->parent();
+
+  Shape a_shape = operand->shape();
+  int ndim = a_shape.dimensions_size();
+  CHECK_GE(ndim, 2);
+  int64 n = a_shape.dimensions(ndim - 1);
+
+  int64 batch_size = std::accumulate(a_shape.dimensions().begin(),
+                                     a_shape.dimensions().end() - 2, int64{1},
+                                     [](int64 a, int64 b) { return a * b; });
+
+  // Find the workspace size.
+  se::blas::UpperLower uplo = options.lower() ? se::blas::UpperLower::kLower
+                                              : se::blas::UpperLower::kUpper;
+  int64 workspace_size;  // Number of elements of size a_shape.element_type()
+  switch (a_shape.element_type()) {
+    case F32: {
+      TF_ASSIGN_OR_RETURN(auto a,
+                          allocator->Allocate<float>(context->stream(), n * n));
+      TF_ASSIGN_OR_RETURN(workspace_size,
+                          context->PotrfBufferSize(uplo, n, a, n));
+      break;
+    }
+    case F64: {
+      TF_ASSIGN_OR_RETURN(
+          auto a, allocator->Allocate<double>(context->stream(), n * n));
+      TF_ASSIGN_OR_RETURN(workspace_size,
+                          context->PotrfBufferSize(uplo, n, a, n));
+      break;
+    }
+    case C64: {
+      TF_ASSIGN_OR_RETURN(auto a, allocator->Allocate<std::complex<float>>(
+                                      context->stream(), n * n));
+      TF_ASSIGN_OR_RETURN(workspace_size,
+                          context->PotrfBufferSize(uplo, n, a, n));
+      break;
+    }
+    case C128: {
+      TF_ASSIGN_OR_RETURN(auto a, allocator->Allocate<std::complex<double>>(
+                                      context->stream(), n * n));
+      TF_ASSIGN_OR_RETURN(workspace_size,
+                          context->PotrfBufferSize(uplo, n, a, n));
+      break;
+    }
+    default:
+      return InvalidArgument("Invalid type for cholesky decomposition: %s",
+                             a_shape.ToString());
+  }
+
+  // TODO(phawkins): Ideally we would relax this constraint. What we actually
+  // want is that:
+  // a) the batch dimensions are major, in no particular order.
+  // b) the two minor dimensions are in fortran (column-major) order,
+
+  SetFortranLayout(&a_shape);
+
+  // This call returns a tuple of (cholesky_result, workspace, info) where:
+  // * cholesky_result is the result of the Cholesky decomposition,
+  // * workspace is temporary scratch memory used by cuSolver.
+  // * info contains the Potrf success/failure status.
+  // Currently we have no meaningful way to report an error, so we simply
+  // discard the success/failure information. Obviously this is suboptimal.
+  Shape call_shape = ShapeUtil::MakeTupleShape(
+      {a_shape,
+       ShapeUtil::MakeShape(operand->shape().element_type(), {workspace_size}),
+       ShapeUtil::MakeShape(S32, {batch_size})});
+
+  HloInstruction* custom_call =
+      computation->AddInstruction(HloInstruction::CreateCustomCall(
+          call_shape, {operand}, kCusolverCholeskyCallTarget, {a_shape}));
+  custom_call->set_metadata(metadata);
+  TF_RETURN_IF_ERROR(custom_call->set_backend_config(options));
+  return custom_call;
+}
+
+}  // namespace
+
+// Tries to rewrite a single convolution into a call to cudnn.
+StatusOr<bool> RunOnInstruction(CusolverContext* context,
+                                ScratchAllocator* allocator,
+                                HloInstruction* instruction) {
+  if (instruction->opcode() != HloOpcode::kCholesky) {
+    return false;
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * custom_call,
+      CreateCholesky(context, allocator, instruction->mutable_operand(0),
+                     instruction->cholesky_options(), instruction->metadata()));
+
+  VLOG(1) << "Replacing " << instruction->ToString() << " with "
+          << custom_call->ToString();
+
+  // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
+  // the conv result and replace `conv` with it.
+  TF_RETURN_IF_ERROR(instruction->parent()->ReplaceWithNewInstruction(
+      instruction, HloInstruction::CreateGetTupleElement(instruction->shape(),
+                                                         custom_call, 0)));
+  return true;
+}
+
+// Rewrites the convolutions in the given computation into calls to cudnn.
+// Returns true if it made any changes.
+StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) {
+  std::vector<HloInstruction*> cusolver_calls;
+  for (auto* hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kCholesky) {
+      cusolver_calls.push_back(hlo);
+    }
+  }
+
+  if (cusolver_calls.empty()) {
+    return false;
+  }
+
+  // Create a stream for us to do our work on. We don't really need to do any
+  // work, just allocate memory, but that's the cuSolver API.
+  se::Stream stream{stream_exec_};
+  stream.Init();
+  const auto device_ordinal = stream_exec_->device_ordinal();
+
+  // allocator either points to this->allocator_ or, if that's null, to a
+  // StreamExecutorMemoryAllocator for stream_exec_.
+  DeviceMemoryAllocator* allocator;
+  absl::optional<StreamExecutorMemoryAllocator> se_allocator;
+  if (allocator_ != nullptr) {
+    allocator = allocator_;
+  } else {
+    se_allocator.emplace(stream_exec_->platform(),
+                         absl::Span<se::StreamExecutor* const>({stream_exec_}));
+    allocator = &*se_allocator;
+  }
+  ScratchAllocator scratch_allocator(device_ordinal, allocator);
+
+  TF_ASSIGN_OR_RETURN(CusolverContext context,
+                      CusolverContext::Create(&stream));
+
+  bool changed = false;
+  for (HloInstruction* instruction : cusolver_calls) {
+    TF_ASSIGN_OR_RETURN(
+        bool result,
+        RunOnInstruction(&context, &scratch_allocator, instruction));
+    changed |= result;
+  }
+  return changed;
+}
+
+CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec,
+                                   DeviceMemoryAllocator* allocator)
+    : stream_exec_(stream_exec), allocator_(allocator) {}
+
+StatusOr<bool> CusolverRewriter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c82233188f7de1e188876f13465f7face76a0a8b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// Rewrites Cholesky calls into CustomCall HLOs that call into cuSolver.
+class CusolverRewriter : public HloModulePass {
+ public:
+  CusolverRewriter(se::StreamExecutor* stream_exec,
+                   DeviceMemoryAllocator* allocator);
+  absl::string_view name() const override { return "cusolver-rewriter"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
+
+  se::StreamExecutor* stream_exec_;   // never null
+  DeviceMemoryAllocator* allocator_;  // may be null
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 2ab754a471070d5f90a3eaebd0600ff180d2fe5d..551f7d773aad0356d9e31c4f952908592936f2b0 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "llvm/IR/DerivedTypes.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 // IWYU pragma: no_include "llvm/IR/Attributes.gen.inc"
@@ -191,39 +192,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPowerOp(
   PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
-  llvm::Type* llvm_ty = lhs_value->getType();
-
-  auto make_sqrt = [&, this]() -> StatusOr<llvm::Value*> {
-    // NVPTX has four relevant square root instructions:
-    //   sqrt.approx{.ftz}.f32
-    //   sqrt.rn{.ftz}.f32
-    //   sqrt.rn.f64
-    //   rsqrt.approx.f64
-    // We rely on LLVM's NVPTX backend to pick the right one based on our
-    // fast-math options.  (If fast-math is enabled, llvm may compute the 64-bit
-    // sqrt from the rsqrt approximation.)
-    return EmitLlvmIntrinsicMathCall("llvm.sqrt", {lhs_value}, {lhs_input_type},
-                                     output_type);
-  };
-
-  const HloInstruction* rhs = op->operand(1);
-  if (IsFPLiteralWithValue(rhs, .5)) {
-    VLOG(10) << "emitting pow(A, .5) as sqrt(A): " << op->ToString();
-    return make_sqrt();
-  }
-
-  if (IsFPLiteralWithValue(rhs, -.5)) {
-    VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString();
-    // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
-    // rsqrt.approx instruction.
-    //
-    // TODO(jlebar): Does this happen with fastmath disabled?  If not, should
-    // we force-enable it?
-    TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt());
-    return FDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
-  }
-
-  VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString();
   return EmitLibdeviceMathCall("__nv_pow", {lhs_value, rhs_value},
                                {lhs_input_type, rhs_input_type}, output_type);
 }
@@ -270,6 +238,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitSqrt(PrimitiveType prim_type,
+                                                       llvm::Value* value) {
+  return EmitLibdeviceMathCall("__nv_sqrt", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitRsqrt(PrimitiveType prim_type,
+                                                        llvm::Value* value) {
+  return EmitLibdeviceMathCall("__nv_rsqrt", {value}, {prim_type}, prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
                                                         llvm::Value* lhs,
                                                         llvm::Value* rhs) {
@@ -293,6 +271,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type,
   return FPCast(fast_tanh, value->getType());
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitRoundNearestAfz(
+    PrimitiveType prim_type, llvm::Value* value) {
+  // Use libdevice __nv_round instead of llvm.round. This is to workaround a
+  // bug in the PTX backend, which implements llvm.round with PTX cvt.rni.
+  // When the llvm.round is fixed, we may still want to use __nv_round here as
+  // expanding the non-trivial implementation early while inlining allows better
+  // optimizations.
+  return EmitLibdeviceMathCall("__nv_round", {value}, {prim_type}, prim_type);
+}
+
 llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
     const string& callee_name, absl::Span<llvm::Value* const> operands,
     absl::Span<const PrimitiveType> input_types, PrimitiveType output_type,
@@ -308,9 +296,11 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
       false);  // No variadic arguments.
 
   // Declares the callee if it is not declared already.
-  llvm::Function* callee = llvm::cast<llvm::Function>(
-      b_->GetInsertBlock()->getModule()->getOrInsertFunction(
-          llvm_ir::AsStringRef(callee_name), callee_type));
+  llvm::Function* callee = llvm::dyn_cast<llvm::Function>(
+      b_->GetInsertBlock()
+          ->getModule()
+          ->getOrInsertFunction(callee_name, callee_type)
+          .getCallee());
 
   for (auto attribute : attributes) {
     callee->addFnAttr(attribute);
@@ -395,12 +385,12 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
 
         SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_);
 
-        IrArray::Index input_index(index_type, index.size());
+        std::vector<llvm::Value*> input_multi_index(index.size());
         llvm::Value* in_bounds = b_->getInt1(true);
         for (size_t i = 0; i < index.size(); ++i) {
           llvm::Value* stridden_index = NSWMul(
               index[i], index_typed_const(window.dimensions(i).stride()));
-          input_index[i] = NSWSub(
+          input_multi_index[i] = NSWSub(
               NSWAdd(stridden_index,
                      NSWMul(window_index[i],
                             index_typed_const(
@@ -409,24 +399,24 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
 
           // We need to verify that we are not in the dilated base area.
           llvm::Value* dilation_condition = ICmpEQ(
-              SRem(input_index[i],
+              SRem(input_multi_index[i],
                    index_typed_const(window.dimensions(i).base_dilation())),
               index_typed_const(0));
           in_bounds = And(in_bounds, dilation_condition);
 
           // Apply base dilation to the index.
-          input_index[i] =
-              SDiv(input_index[i],
+          input_multi_index[i] =
+              SDiv(input_multi_index[i],
                    index_typed_const(window.dimensions(i).base_dilation()));
 
-          // We must check whether 0 ≤ input_index[i] < bound, as otherwise
-          // we are in the pad and so can skip the computation. This
+          // We must check whether 0 ≤ input_multi_index[i] < bound, as
+          // otherwise we are in the pad and so can skip the computation. This
           // comparison is equivalent to the unsigned comparison
-          // input_index[i] < bound, as a negative value wraps to a large
+          // input_multi_index[i] < bound, as a negative value wraps to a large
           // positive value.
           in_bounds =
               And(in_bounds,
-                  ICmpULT(input_index[i],
+                  ICmpULT(input_multi_index[i],
                           index_typed_const(operand->shape().dimensions(i))));
         }
 
@@ -435,6 +425,8 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         SetToFirstInsertPoint(if_data.true_block, b_);
 
         // We are not in pad, so do the computation.
+        IrArray::Index input_index(input_multi_index, operand->shape(),
+                                   index_type);
         TF_ASSIGN_OR_RETURN(llvm::Value * input_value,
                             operand_to_generator.at(operand)(input_index));
         TF_ASSIGN_OR_RETURN(
@@ -446,7 +438,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         return Load(accum_ptr);
       };
     case HloOpcode::kReduce:
-      // TODO(b/112040122): This should be supported.
+      // TODO(b/118332391): This should be supported.
       CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce";
       return [=, &operand_to_generator](
                  const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
@@ -461,19 +453,22 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         b()->CreateStore(init_value, accum_ptr);
 
         llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type);
-        IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions(
-            operand->shape(), hlo->dimensions(), "reduction_dim");
+        std::vector<llvm::Value*> input_multi_index =
+            loops.AddLoopsForShapeOnDimensions(
+                operand->shape(), hlo->dimensions(), "reduction_dim");
         if (!ShapeUtil::IsScalar(hlo->shape())) {
-          // Here only input_index[hlo->dimensions()] are non-null, so we must
-          // set the rest.
+          // Here only input_multi_index[hlo->dimensions()] are non-null, so we
+          // must set the rest.
           size_t j = 0;
-          for (size_t i = 0; i < input_index.size(); ++i) {
-            if (input_index[i] == nullptr) {
-              input_index[i] = output_index[j++];
+          for (auto& i : input_multi_index) {
+            if (i == nullptr) {
+              i = output_index[j++];
             }
           }
           CHECK_EQ(output_index.size(), j);
         }
+        llvm_ir::IrArray::Index input_index(
+            input_multi_index, hlo->operand(0)->shape(), index_type);
 
         SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b());
         TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index e8b56a39ce58b6aab35c1c977553c7ff7e753273..e9d08177ad979871890a32374657d8479c0cf669 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -76,6 +76,12 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitExpm1(PrimitiveType prim_type,
                                    llvm::Value* value) override;
 
+  StatusOr<llvm::Value*> EmitSqrt(PrimitiveType prim_type,
+                                  llvm::Value* value) override;
+
+  StatusOr<llvm::Value*> EmitRsqrt(PrimitiveType prim_type,
+                                   llvm::Value* value) override;
+
   StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type, llvm::Value* lhs,
                                  llvm::Value* rhs) override;
 
@@ -85,6 +91,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitTanh(PrimitiveType prim_type,
                                   llvm::Value* value) override;
 
+  StatusOr<llvm::Value*> EmitRoundNearestAfz(PrimitiveType prim_type,
+                                             llvm::Value* value) override;
+
   llvm::Value* EmitThreadId() override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 91930eccdff94bb2fc85636f3a4b2d661c618d87..0649f42e54e552bded9d4f5a7f9c01c90b0e46fa 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -62,7 +63,7 @@ double CalculateBytesReadByFusionParameter(HloInstruction* param) {
 
   // Iterate through 'instructions' accumulating byte sizes of each instruction
   // shape. For each 'instruction' in 'instructions', if all users of
-  // 'instruction' are Slice instructions, accumuates the byte sizes of each
+  // 'instruction' are Slice instructions, accumulates the byte sizes of each
   // Slice for a more accurate estimate of bytes read.
   double bytes = 0.0;
   for (auto& instruction : instructions) {
@@ -95,27 +96,6 @@ double CalculateBytesReadByFusionInstruction(HloInstruction* fusion) {
   return bytes;
 }
 
-// Returns the flops to bytes transferred ratio of instruction 'fusion'.
-double CalculateFlopsToBytesRatio(HloInstruction* fusion) {
-  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
-  // Calculate total bytes transferred in/out.
-  double bytes = CalculateBytesReadByFusionInstruction(fusion);
-  // Add bytes written to root instructions buffer.
-  if (fusion->IsMultiOutputFusion()) {
-    for (auto& operand : fusion->fused_expression_root()->operands()) {
-      bytes += ShapeUtil::ByteSizeOf(operand->shape());
-    }
-  } else {
-    bytes += ShapeUtil::ByteSizeOf(fusion->fused_expression_root()->shape());
-  }
-  // Calculate flops for all fused instructions. Use a null shape size function
-  // because we don't care about bytes accessed by the ops.
-  HloCostAnalysis analysis([](const Shape& shape) { return 0; });
-  TF_CHECK_OK(fusion->fused_expression_root()->Accept(&analysis));
-  // Return flops / bytes.
-  return bytes > 0.0 ? analysis.flop_count() / bytes : analysis.flop_count();
-}
-
 // Returns bytes transferred by instruction 'fusion', including the bytes
 // that would be read by all users.
 double GetCurrentBytesTransferred(HloInstruction* fusion) {
@@ -169,8 +149,8 @@ class FusionInstructionMerger {
   int num_fail_not_loop_fusion_ = 0;
   int num_fail_merge_all_users_ = 0;
   int num_fail_expensive_fused_instruction_ = 0;
-  int num_fail_flops_to_byte_ratio_ = 0;
   int num_fail_net_bytes_transferred_ratio_ = 0;
+  int num_fail_inefficient_fusion_emitter_ = 0;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FusionInstructionMerger);
 };
@@ -190,15 +170,13 @@ Status FusionInstructionMerger::Run() {
           << " not_loop_fusion: " << num_fail_not_loop_fusion_
           << " merge_all_users: " << num_fail_merge_all_users_
           << " expensive_instruction: " << num_fail_expensive_fused_instruction_
-          << " flops_to_byte_ratio: " << num_fail_flops_to_byte_ratio_
           << " net_bytes_transferred: " << num_fail_net_bytes_transferred_ratio_
-          << " }";
+          << " inefficient_fusion_emitter: "
+          << num_fail_inefficient_fusion_emitter_ << " }";
   return Status::OK();
 }
 
 Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
-  VLOG(3) << "FusionInstructionMerger ENTRY fusion: " << fusion->name()
-          << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion);
   ++total_visited_;
   // Skip 'fusion' instruction if there are no users into which we can merge.
   if (fusion->users().empty()) {
@@ -256,15 +234,6 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
     return Status::OK();
   }
 
-  // Skip 'fusion' instruction if its flops to bytes transferred ratio
-  // exceeds the threshold value.
-  if (CalculateFlopsToBytesRatio(fusion) >
-      FusionMerger::GetThresholdFlopsToBytesRatio()) {
-    VLOG(3) << "Not merging " << fusion->name()
-            << ": flops-to-bytes ratio is not favorable.";
-    ++num_fail_flops_to_byte_ratio_;
-    return Status::OK();
-  }
   // Skip 'fusion' instruction if merging it into all users would result in a
   // net increase in bytes transferred (currently allowing the net bytes
   // transferred to be exceeded up to ~10% in exhange for eliminating the
@@ -280,6 +249,23 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
     ++num_fail_net_bytes_transferred_ratio_;
     return Status::OK();
   }
+
+  // Skip 'fusion' instruction if merging it into at least one of the users
+  // would cause too much code duplication because of inefficiencies in the
+  // fusion emitter.
+  // TODO(b/119692968): Remove this once the fusion emitter can handle arbitrary
+  // fusion nodes.
+  if (absl::c_any_of(fusion->users(), [fusion](const HloInstruction* user) {
+        return FusedIrEmitter::IsFusedIrEmitterInefficient(/*consumer=*/user,
+                                                           /*producer=*/fusion);
+      })) {
+    VLOG(3) << "Not merging " << fusion->name()
+            << ": Contains one or more users where fusing would cause "
+               "inefficiencies in the fusion emitter.";
+    ++num_fail_inefficient_fusion_emitter_;
+    return Status::OK();
+  }
+
   // Merge fused instructions from 'fusion' into each user.
   std::vector<HloInstruction*> users = fusion->users();
   for (HloInstruction* user : users) {
@@ -288,7 +274,6 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   }
   ++total_merged_;
   VLOG(2) << "Merged fusion instruction: " << fusion->name()
-          << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion)
           << " merged_to_current_bytes_ratio: " << merged_to_current_bytes_ratio
           << " into users { "
           << absl::StrJoin(users, ", ",
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
index f19996edfe3dd923aa686a19621ce28a4aed5a45..a49d68002f8de5bb5640731f3cd31572593ee837 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -37,8 +37,6 @@ class FusionMerger : public HloModulePass {
   absl::string_view name() const override { return "fusion merger"; }
 
   StatusOr<bool> Run(HloModule* module) override;
-
-  static double GetThresholdFlopsToBytesRatio() { return 1.0; }
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
index 7cc869ed9e89688d6ea06428a7bade3ebe55ea23..31b73fd250cc812807149f6d1028761cf1f35ebf 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
@@ -99,62 +99,6 @@ ENTRY MergeSharedFusionInstruction.Computation0 {
   EXPECT_EQ(7, operand2->fused_instruction_count());
 }
 
-// Tests that we do not merge a fusion instruction that above flops to bytes
-// threshold.
-//
-// Fusion2 is not merged because it exceeds the threshold flops-to-bytes ratio.
-TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) {
-  auto module = ParseHloString(R"(
-HloModule FlopsToBytesRatioThresholdExceeded
-
-comp.2 {
-  state.param_1.1 = (f32[4]{0}, f32[4]{0}) parameter(0)
-  get-tuple-element.3 = f32[4]{0} get-tuple-element(state.param_1.1), index=0
-  get-tuple-element.4 = f32[4]{0} get-tuple-element(state.param_1.1), index=2
-  multiply.29 = f32[4]{0} multiply(get-tuple-element.3, get-tuple-element.4)
-  multiply.30 = f32[4]{0} multiply(get-tuple-element.3, multiply.29)
-  multiply.31 = f32[4]{0} multiply(get-tuple-element.3, multiply.30)
-  multiply.32 = f32[4]{0} multiply(get-tuple-element.3, multiply.31)
-  multiply.33 = f32[4]{0} multiply(get-tuple-element.3, multiply.32)
-  multiply.34 = f32[4]{0} multiply(get-tuple-element.3, multiply.33)
-  multiply.35 = f32[4]{0} multiply(get-tuple-element.3, multiply.34)
-  multiply.36 = f32[4]{0} multiply(get-tuple-element.3, multiply.35)
-  multiply.37 = f32[4]{0} multiply(get-tuple-element.3, multiply.36)
-  multiply.38 = f32[4]{0} multiply(get-tuple-element.3, multiply.37)
-  multiply.39 = f32[4]{0} multiply(get-tuple-element.3, multiply.38)
-  multiply.40 = f32[4]{0} multiply(get-tuple-element.3, multiply.39)
-  ROOT multiply.41 = f32[4]{0} multiply(get-tuple-element.3, multiply.40)
-}
-
-comp.1 {
-  multiply.12.param_1.1 = f32[4]{0} parameter(1)
-  constant.param_1.3 = f32[4]{0} parameter(0)
-  add.3 = f32[4]{0} add(multiply.12.param_1.1, constant.param_1.3)
-  ROOT multiply.16 = f32[4]{0} multiply(add.3, constant.param_1.3)
-}
-
-comp {
-  multiply.12.param_1 = f32[4]{0} parameter(1)
-  constant.param_1.1 = f32[4]{0} parameter(0)
-  multiply.15 = f32[4]{0} multiply(multiply.12.param_1, constant.param_1.1)
-  ROOT add.2 = f32[4]{0} add(multiply.15, constant.param_1.1)
-}
-
-ENTRY FlopsToBytesRatioThresholdExceeded.Computation1 {
-  constant = f32[4]{0} constant({1, 1, 1, 1})
-  state = (f32[4]{0}, f32[4]{0}) parameter(0)
-  fusion.2 = f32[4]{0} fusion(state), kind=kLoop, calls=comp.2
-  fusion.3 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp.1
-  fusion.4 = f32[4]{0} fusion(constant, fusion.2), kind=kLoop, calls=comp
-  ROOT tuple = (f32[4]{0}, f32[4]{0}) tuple(fusion.3, fusion.4)
-})")
-                    .ValueOrDie();
-  // Run fusion merger pass, which should detect that the flops/bytes of the
-  // shared fusion instruction exceeds the threshold ratio, and therefore
-  // cannot be merged with other fusion instructions.
-  EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie());
-}
-
 // Tests that threshold for bytes transferred if merged is exceeded.
 //
 // Fusion2 is not merged because it exceeds the threshold bytes transferred.
@@ -319,6 +263,62 @@ TEST_F(FusionMergerTest, WillNotMergeReduceUnfriendlyLayouts) {
   EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie());
 }
 
+// TODO(b/119692968): Remove this test once fusion emitter is fixed.
+TEST_F(FusionMergerTest, WillNotMergeIfFusionEmitterIsInefficient) {
+  auto module = ParseHloString(R"(
+    HloModule m
+
+    %fused_computation (param_0.10: f32[6]) -> f32[1] {
+      %param_0.10 = f32[6]{0} parameter(0)
+      %add.7 = f32[6]{0} add(%param_0.10, %param_0.10)
+      %slice.21 = f32[5]{0} slice(%add.7), slice={[0:5]}
+      %slice.18 = f32[5]{0} slice(%add.7), slice={[1:6]}
+      %add.5 = f32[5]{0} add(%slice.21, %slice.18)
+      %slice.15 = f32[4]{0} slice(%add.5), slice={[0:4]}
+      %slice.12 = f32[4]{0} slice(%add.5), slice={[1:5]}
+      %add.4 = f32[4]{0} add(%slice.15, %slice.12)
+      %slice.9 = f32[3]{0} slice(%add.4), slice={[0:3]}
+      %slice.6 = f32[3]{0} slice(%add.4), slice={[1:4]}
+      %add.2 = f32[3]{0} add(%slice.9, %slice.6)
+      %slice.3 = f32[2]{0} slice(%add.2), slice={[0:2]}
+      %slice.2 = f32[2]{0} slice(%add.2), slice={[1:3]}
+      %add.1 = f32[2]{0} add(%slice.3, %slice.2)
+      %slice.1 = f32[1]{0} slice(%add.1), slice={[0:1]}
+      %slice.0 = f32[1]{0} slice(%add.1), slice={[1:2]}
+      ROOT %add.0 = f32[1]{0} add(%slice.1, %slice.0)
+    }
+
+    %fused_computation.1 (param_0.21: f32[11], param_1.21: f32[11]) -> f32[6] {
+      %param_0.21 = f32[11]{0} parameter(0)
+      %param_1.21 = f32[11]{0} parameter(1)
+      %add.16 = f32[11]{0} add(%param_0.21, %param_1.21)
+      %slice.51 = f32[10]{0} slice(%add.16), slice={[0:10]}
+      %slice.48 = f32[10]{0} slice(%add.16), slice={[1:11]}
+      %add.14 = f32[10]{0} add(%slice.51, %slice.48)
+      %slice.45 = f32[9]{0} slice(%add.14), slice={[0:9]}
+      %slice.42 = f32[9]{0} slice(%add.14), slice={[1:10]}
+      %add.13 = f32[9]{0} add(%slice.45, %slice.42)
+      %slice.39 = f32[8]{0} slice(%add.13), slice={[0:8]}
+      %slice.36 = f32[8]{0} slice(%add.13), slice={[1:9]}
+      %add.11 = f32[8]{0} add(%slice.39, %slice.36)
+      %slice.33 = f32[7]{0} slice(%add.11), slice={[0:7]}
+      %slice.30 = f32[7]{0} slice(%add.11), slice={[1:8]}
+      %add.10 = f32[7]{0} add(%slice.33, %slice.30)
+      %slice.27 = f32[6]{0} slice(%add.10), slice={[0:6]}
+      %slice.24 = f32[6]{0} slice(%add.10), slice={[1:7]}
+      ROOT %add.8 = f32[6]{0} add(%slice.27, %slice.24)
+    }
+
+    ENTRY entry {
+      p0 = f32[11]{0} parameter(0)
+      p1 = f32[11]{0} parameter(1)
+      f1 = f32[6]{0} fusion(p0, p1), kind=kLoop, calls=%fused_computation.1
+      ROOT f2 = f32[1] fusion(f1), kind=kLoop, calls=%fused_computation
+    })")
+                    .ValueOrDie();
+  EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 86c9bc6a345047fb5329af0be45c8981cc427f50..a7053e6a013be3ccf5725cbe003558be77104af1 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -428,7 +428,8 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
         scratch_data = scratch_mem->device_memory();
       }
       const MatrixDescriptor scratch_descriptor(
-          scratch_data, false, output_num_cols, output_num_rows, batch_size);
+          scratch_data, false, output_matrix.num_rows, output_matrix.num_cols,
+          batch_size);
 
       StatusOr<se::blas::AlgorithmType> best_algorithm = GetGemmAutotuneFn(
           element_type)(lhs_matrix, rhs_matrix, scratch_descriptor, alpha_,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
new file mode 100644
index 0000000000000000000000000000000000000000..ec4f6e9c91331f0142ed5434949871b5edc27462
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_autotuning.proto
@@ -0,0 +1,13 @@
+// This is used for convolution logging. Also see
+// tensorflow/core/protobuf/autotuing.h
+syntax = "proto3";
+
+package xla.gpu;
+
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+message ConvInstructionLog {
+  xla.HloInstructionProto instruction = 1;
+  repeated xla.ShapeProto operand_shapes = 2;
+}
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 842ba2fdcd31a451cec1be543e102e0a46077f38..0cbd92a9553fca3cf73fdef30ad6a58c5c82b923 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -15,7 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 
+#include <iterator>
+#include <vector>
+
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 
 namespace xla {
 namespace gpu {
@@ -131,5 +138,51 @@ bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
                                              get_loop_shape(instr_2));
 }
 
+bool IsInputFusibleScatter(const HloInstruction& instr) {
+  if (instr.opcode() == HloOpcode::kScatter ||
+      (instr.opcode() == HloOpcode::kFusion &&
+       instr.fusion_kind() == HloInstruction::FusionKind::kInput &&
+       instr.fused_expression_root()->opcode() == HloOpcode::kScatter)) {
+    return true;
+  }
+  return false;
+}
+
+bool IsInputFusible(const HloInstruction& instr) {
+  // Input fusion only handles non-elemental reduction and scatter operations.
+  return IsInputFusibleReduction(instr) || IsInputFusibleScatter(instr);
+}
+
+bool IsLoopFusible(const HloInstruction& instr) {
+  // Don't fuse get-tuple-element on GPU: We can, but it's slower than not
+  // fusing.  We never generate kernels for unfused GTEs.  Instead, if an
+  // unfused GTE is an input to a kernel (including a fusion kernel), we
+  // compute the address of the GTE at the top of the kernel.  Often we know the
+  // address of the GTE result statically, so we can do this without chasing any
+  // pointers.
+  return (instr.IsElementwise() && instr.operand_count() > 0) ||
+         instr.opcode() == HloOpcode::kBitcast ||
+         instr.opcode() == HloOpcode::kBroadcast ||
+         instr.opcode() == HloOpcode::kConcatenate ||
+         instr.opcode() == HloOpcode::kDynamicSlice ||
+         instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
+         (instr.opcode() == HloOpcode::kFusion &&
+          instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
+         instr.opcode() == HloOpcode::kGather ||
+         instr.opcode() == HloOpcode::kIota ||
+         instr.opcode() == HloOpcode::kPad ||
+         (instr.opcode() == HloOpcode::kReduce &&
+          !IsReductionToVector(instr)) ||
+         instr.opcode() == HloOpcode::kReduceWindow ||
+         instr.opcode() == HloOpcode::kReshape ||
+         instr.opcode() == HloOpcode::kReverse ||
+         instr.opcode() == HloOpcode::kSlice ||
+         instr.opcode() == HloOpcode::kTranspose;
+}
+
+bool IsFusible(const HloInstruction& instr) {
+  return IsInputFusible(instr) || IsLoopFusible(instr);
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index e9d7ba1c4cfa865532a0d06c2ed883a2fea4e2cd..3a59c74f64b5486c52e9cdac43f343d70f0d2558 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -24,6 +24,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+// Whether 'instr' can occur inside fusions, i.e. whether it is a candidate
+// for being fused. Note that further restrictions apply, e.g. Scatter must
+// be the root of an input fusion.
+bool IsFusible(const HloInstruction& instr);
+
+bool IsInputFusible(const HloInstruction& instr);
+
+bool IsLoopFusible(const HloInstruction& instr);
+
 // The code emitted for reduce-rooted input fusions (EmitReductionToVector)
 // suffers from poor data locality if the layouts of input parameters differ. In
 // such situtations it is better not to fuse. Only input params with
@@ -46,9 +55,13 @@ bool IsReduceInputFusion(const HloInstruction& instr);
 // is either an unfused reduction-to-vector op or a reduce input fusion.
 bool IsInputFusibleReduction(const HloInstruction& instr);
 
+// Whether `instr` is fusible as root of a scatter input fusions, i.e. `instr`
+// is either an unfused scatter op or a scatter input fusion.
+bool IsInputFusibleScatter(const HloInstruction& instr);
+
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
-// This function works for both, sibling and producer-conumser multi-output
+// This function works for both, sibling and producer-consumer multi-output
 // fusion.
 // So far, multi-output fusion is supported for loop fusions and reduce
 // input fusions only. It is up to the caller to ensure the instructions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index 15d4ee206ce8debcb8a5dbc6ec65d29ba257d302..ee64b3a7596170a2bbf0a430cd6c7de5cebf2da1 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -62,7 +62,7 @@ TEST_F(GpuFusibleTest,
       copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1)
       c0 = f16[] constant(0)
       broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={}
-      greater-than = pred[128,1024,32,32]{1,3,2,0} greater-than(copy, broadcast)
+      greater-than = pred[128,1024,32,32]{1,3,2,0} compare(copy, broadcast), direction=GT
       ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1, broadcast)
     }
     fused_reduce {
@@ -122,7 +122,7 @@ TEST_F(GpuFusibleTest,
       p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1)
       c0 = f16[] constant(0)
       broadcast = f16[128,1024,32,32]{3,2,1,0} broadcast(c0), dimensions={}
-      greater-than = pred[128,1024,32,32]{3,2,1,0} greater-than(p1.1, broadcast)
+      greater-than = pred[128,1024,32,32]{3,2,1,0} compare(p1.1, broadcast), direction=GT
       select = f16[128,1024,32,32]{3,2,1,0} select(greater-than, p0.1, broadcast)
       ROOT root = f16[128,1024,32,32]{1,3,2,0} copy(select)
     }
@@ -507,7 +507,7 @@ TEST_F(GpuFusibleTest,
       p1.1 = f32[2,2,2]{2,1,0} parameter(1)
       c0 = f32[] constant(0)
       broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={}
-      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      greater-than = pred[2,2,2]{2,1,0} compare(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast), direction=GT
       p0.1 = f32[2,2,2]{2,1,0} parameter(0)
       ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast)
     }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
index 4268fb2c7a813b3b53e4cd48746028a7b369f28e..4765f67c4b17e97419182e341573f75ad3d6ac30 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 58bdd4209a2315cdb7d29e920faded4d1a6a5876..a6d80f0b6dddb3d8d0fd00c639e11c71da6a9f09 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -240,6 +240,32 @@ Status GpuLayoutAssignment::AddBackendConstraints(
         TF_RETURN_IF_ERROR(
             constraints->SetBufferLayout(keys_layout, *output_buffer));
       }
+    } else if (instruction->opcode() == HloOpcode::kTriangularSolve) {
+      // TODO(phawkins): Ideally we would relax this constraint. What we
+      // actually want is that:
+      // a) the batch dimensions are major, in no particular order.
+      // b) the two minor dimensions are in fortran (column-major) order,
+      // although for the 'a' argument we could potentially accept row-major
+      // order and fold the transpose into the operator.
+      auto set_fortran_layout = [](Shape* shape) {
+        LayoutUtil::SetToDefaultLayout(shape);
+        int n = shape->mutable_layout()->minor_to_major_size();
+        CHECK_GE(n, 2);
+        std::swap(shape->mutable_layout()->mutable_minor_to_major()->at(0),
+                  shape->mutable_layout()->mutable_minor_to_major()->at(1));
+      };
+      Shape op0_shape = instruction->operand(0)->shape();
+      Shape op1_shape = instruction->operand(1)->shape();
+      Shape output_shape = instruction->shape();
+      set_fortran_layout(&op0_shape);
+      set_fortran_layout(&op1_shape);
+      set_fortran_layout(&output_shape);
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op0_shape, instruction, 0));
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op1_shape, instruction, 1));
+      TF_RETURN_IF_ERROR(
+          constraints->SetInstructionLayout(output_shape, instruction));
     }
   }
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 29756d27260b0f41b2dd4b649ea9b1610ff90268..3630c3e38c59c8a2557befe38f3415d6bab1ee38 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -368,12 +368,21 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
 TEST_F(LayoutAssignmentTest, SortLayout) {
   const char* hlo_text = R"(
   HloModule SortLayout
+
+  compare {
+    p.0.lhs = f32[] parameter(0)
+    p.0.rhs = f32[] parameter(1)
+    p.1.lhs = f32[] parameter(2)
+    p.1.rhs = f32[] parameter(3)
+    ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+  }
+
   ENTRY sort {
     keys = f32[3,2]{0,1} constant({{0,1},{0,1},{0,1}})
     values = f32[2,3]{1,0} parameter(0)
     transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
     ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}) sort(keys, transpose),
-      dimensions={1}
+      dimensions={1}, to_apply=compare
   })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e38ceca18de30e0e1fa75a7a4bd865e000b7d22
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc
@@ -0,0 +1,70 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace gpu {
+
+StatusOr<bool> GpuSanitizeConstantNames::Run(HloModule* module) {
+  bool changed = false;
+
+  NameUniquer instr_name_uniquer(/*separator=*/"_");
+  // Collect the names used for the non-constant HLO instructions.+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() == HloOpcode::kConstant) {
+        continue;
+      }
+
+      const string& old_name = instr->name();
+      instr->UniquifyName(&instr_name_uniquer);
+      CHECK_EQ(old_name, instr->name());
+    }
+  }
+
+  // Sanitize the names for the constant HLO instructions and make them unique.
+  // This is not merged into the above loop because we don't want this pass to
+  // change the names of non-constant instructions, that is, if a constant HLO
+  // conflicts with a non-constant HLO, we change the name of the constant HLO
+  // even though the non-constant HLO comes after in the HLO module.
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() != HloOpcode::kConstant) {
+        continue;
+      }
+      string sanitized_name = llvm_ir::SanitizeConstantName(*instr);
+      instr->SetAndSanitizeName(sanitized_name);
+      instr->UniquifyName(&instr_name_uniquer);
+      changed = true;
+    }
+  }
+
+  return changed;
+}  // namespace gpu
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h
similarity index 56%
rename from tensorflow/compiler/xla/service/implicit_broadcast_remover.h
rename to tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h
index 9c48b7db613b049536c76237b4cfebbbc47448f3..8d583d047e25698e86032020b7fc20df87f5ab68 100644
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,30 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
-
-#include <utility>
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
 namespace xla {
+namespace gpu {
 
-// Pass which replaces all implicit broadcasts with their equivalent sequence of
-// explicit broadcast and reshape instructions.
-class ImplicitBroadcastRemover : public HloModulePass {
+// Sanitizes HLO instruction names for the GPU backend. Currently, it only
+// replaces . and - in the HLO constant instruction names with _ to please the
+// LLVM PTX backend.
+class GpuSanitizeConstantNames : public HloModulePass {
  public:
-  ImplicitBroadcastRemover() {}
-  ~ImplicitBroadcastRemover() override {}
-
-  absl::string_view name() const override {
-    return "implicit-broadcast-remover";
-  }
+  absl::string_view name() const override { return "sanitize-constant-names"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 };
 
+}  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5adee8cc61f18f356406d8c089dd43565957739
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+using SanitizeConstantNamesTest = HloTestBase;
+
+TEST_F(SanitizeConstantNamesTest, InstructionNameWithHyphenSanitized) {
+  const char *const kHloString = R"(
+    HloModule HyphenInInstructionName
+      ENTRY kernelEntry {
+        ROOT equal-to = s32[2]{0} constant({42, 73})
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  HloInstruction *root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->name(), "equal_to");
+}
+
+TEST_F(SanitizeConstantNamesTest, InstructionNameWithDotSanitized) {
+  const char *const kHloString = R"(
+    HloModule HyphenInInstructionName
+      ENTRY kernelEntry {
+        ROOT equal.to = s32[2]{0} constant({42, 73})
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  HloInstruction *root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->name(), "equal_to");
+}
+
+TEST_F(SanitizeConstantNamesTest, BufferSanitizedNameCollisionResolved) {
+  const char *const kHloString = R"(
+    HloModule BufferSanitizedName
+      ENTRY kernelEntry {
+      equal.to = s32[2]{0} constant({42, 73})
+      equal-to = s32[2]{0} constant({67, 3})
+      ROOT equal_to = s32[2]{0} add(equal.to, equal-to)
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  EXPECT_THAT(FindInstruction(module.get(), "equal_to_1"), op::Constant());
+  EXPECT_THAT(FindInstruction(module.get(), "equal_to_2"), op::Constant());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index 8c6a6914792a96ab517fa5f20ff2215e4785490e..e593f535642e15f28a4a1c1f321881ba3c694548 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 69aaaceca112364a4fd562f6a5eff1629fd3fc54..3c50c2b1d8e185975958ba08527aeb21686050e1 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -113,10 +113,9 @@ void HloToIrBindings::EmitBasePointersForHlos(
             BindHloToIrValue(*non_io_hlo, b_->CreateAlloca(pointee_type),
                              index);
           } else if (slice.allocation()->is_constant()) {
-            llvm::Value* global_for_constant =
-                module_->getGlobalVariable(llvm_ir::AsStringRef(
-                    llvm_ir::ConstantBufferAllocationToGlobalName(
-                        *slice.allocation())));
+            llvm::Value* global_for_constant = module_->getGlobalVariable(
+                llvm_ir::ConstantBufferAllocationToGlobalName(
+                    *slice.allocation()));
             BindHloToIrValue(*non_io_hlo, global_for_constant);
           } else {
             const int64 offset = slice.offset();
@@ -136,11 +135,11 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte,
   if (gte->operand(0)->opcode() != HloOpcode::kGetTupleElement) {
     return llvm_ir::EmitGetTupleElement(
         gte->shape(), gte->tuple_index(), /*alignment=*/1,
-        GetTypedIrValue(*gte->operand(0), {}, base_ptr), b_, module_);
+        GetTypedIrValue(*gte->operand(0), {}, base_ptr), b_);
   }
   return llvm_ir::EmitGetTupleElement(
       gte->shape(), gte->tuple_index(), /*alignment=*/1,
-      EmitGetTupleElement(gte->operand(0), base_ptr), b_, module_);
+      EmitGetTupleElement(gte->operand(0), base_ptr), b_);
 }
 
 // Returns true if `value` has a name that should not be changed.
@@ -166,11 +165,10 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
     typed_ir_value = b_->CreateBitCast(ir_value, pointee_type->getPointerTo());
   }
   if (!HasMeaningfulName(ir_value)) {
-    ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw")));
+    ir_value->setName(llvm_ir::IrName(&hlo, "raw"));
   }
   if (!HasMeaningfulName(typed_ir_value)) {
-    typed_ir_value->setName(
-        llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed")));
+    typed_ir_value->setName(llvm_ir::IrName(&hlo, "typed"));
   }
   return typed_ir_value;
 }
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 6151dd8ff4c92bb81bd756c68cc9377633c8c9d5..62f625defc3e3eb3b1dd01e0992281edc810454d 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -28,31 +29,6 @@ namespace gpu {
 
 namespace {
 
-bool IsFusible(const HloInstruction& hlo) {
-  // Don't fuse get-tuple-element on GPU: We can, but it's slower than not
-  // fusing.  We never generate kernels for unfused GTEs.  Instead, if an
-  // unfused GTE is an input to a kernel (including a fusion kernel), we
-  // compute the address of the GTE at the top of the kernel.  Often we know the
-  // address of the GTE result statically, so we can do this without chasing any
-  // pointers.
-  return (hlo.IsElementwise() && hlo.operand_count() > 0) ||
-         hlo.opcode() == HloOpcode::kBitcast ||
-         hlo.opcode() == HloOpcode::kBroadcast ||
-         hlo.opcode() == HloOpcode::kConcatenate ||
-         hlo.opcode() == HloOpcode::kDynamicSlice ||
-         hlo.opcode() == HloOpcode::kDynamicUpdateSlice ||
-         hlo.opcode() == HloOpcode::kFusion ||
-         hlo.opcode() == HloOpcode::kGather ||
-         hlo.opcode() == HloOpcode::kIota || hlo.opcode() == HloOpcode::kPad ||
-         hlo.opcode() == HloOpcode::kReduce ||
-         hlo.opcode() == HloOpcode::kReduceWindow ||
-         hlo.opcode() == HloOpcode::kReshape ||
-         hlo.opcode() == HloOpcode::kReverse ||
-         hlo.opcode() == HloOpcode::kScatter ||
-         hlo.opcode() == HloOpcode::kSlice ||
-         hlo.opcode() == HloOpcode::kTranspose;
-}
-
 bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   if (constant->opcode() != HloOpcode::kConstant ||
       !ShapeUtil::IsScalar(constant->shape())) {
@@ -138,8 +114,8 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
 }
 
-bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
-                                      int64 operand_index) {
+bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
+                                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
   // Check if we can use output fusion for (A @ B) * alpha
@@ -275,29 +251,29 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
       !InstructionFusion::ShouldFuse(consumer, operand_index)) {
     return false;
   }
-
-  // We put this check last because it's potentially expensive.
-  return !FusionWouldBeTooLarge(consumer, producer);
+  return true;
 }
 
-bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
-                                                     int64 operand_index) {
-  const HloInstruction* producer = consumer->operand(operand_index);
-  // The IR emitter has limited support for non-loop fusions with multi output
-  // at present.
-  // TODO(tjoerg): Relax this constraint to allow for arbitraty kinds of fusion.
-  if (consumer->opcode() == HloOpcode::kFusion &&
-      consumer->fusion_kind() != HloInstruction::FusionKind::kLoop) {
+bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
+                                      int64 operand_index) {
+  if (!ShouldFuseInexpensiveChecks(consumer, operand_index)) {
     return false;
   }
-  // Multi-output fusion requires instructions with compatible shapes.
-  if (!ShapeUtil::Compatible(producer->shape(), consumer->shape())) {
+  auto producer = consumer->operand(operand_index);
+  // The following checks are potentially expensive.
+  if (FusionWouldBeTooLarge(consumer, producer)) {
     return false;
   }
-  // TODO(tjoerg): Stop calling `ShouldFuse` to relax the criteria for
-  // multi-output fusion. In particular, do not check whether an instruction is
-  // expensive to duplicate, since this doesn't matter here.
-  return GpuInstructionFusion::ShouldFuse(consumer, operand_index);
+  // Also check that our emitter can handle the fusion node. We currently can
+  // have exponential time/memory requirements for emitting certain fusion
+  // kernels, in which case we don't want to fuse.
+  // TODO(b/119692968): Remove this once we have fixed our fusion emitter.
+  return !FusedIrEmitter::IsFusedIrEmitterInefficient(consumer, producer);
+}
+
+bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
+                                                     int64 operand_index) {
+  return false;
 }
 
 HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index c91f6343a69268ca687004dbe0ffbb863271a95c..2f8f40b4b5ef4f0d203f8d476ebfd21032c27b62 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -49,6 +49,12 @@ class GpuInstructionFusion : public InstructionFusion {
 
   HloInstruction::FusionKind ChooseKind(
       const HloInstruction* producer, const HloInstruction* consumer) override;
+
+ private:
+  // This method is called by ShouldFuse() to do all the computationally
+  // inexpensive checks whether we should fuse the operand into 'consumer'.
+  bool ShouldFuseInexpensiveChecks(HloInstruction* consumer,
+                                   int64 operand_index);
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 688604cd36e5a45debf855aacd29d05ecda92341..a05ab86cf77a134a1fc387d93cb482aa1ff5345b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -506,202 +506,11 @@ TEST_F(InstructionFusionTest, MultiOutputFusion) {
     })")
                     .ValueOrDie();
 
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(
-      fusion->fused_expression_root(),
-      op::Tuple(op::Add(op::Subtract(), op::Parameter()), op::Subtract()));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) {
-  // tanh --> add --> tuple
-  //  \---------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     tanh = f32[4,3]{1,0} tanh(p0)
-     add = f32[4,3]{1,0} add(tanh, p1)
-     ROOT tuple = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(tanh, add)
-    })")
-                    .ValueOrDie();
-
-  // TODO(tjoerg): Allow multi-output fusion for expensive operations like tanh.
+  // Multi-output fusion is disabled here and performed in the
+  // GpuMultiOutputFusion pass instead.
   ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
                    .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion2) {
-  // sub --> add1 --\--------\
-  //  \----------> add2 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(sub, add1)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add1, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Subtract(), op::Add()),
-                        op::Add(op::Subtract(), op::Parameter())));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion3) {
-  // sub --> add1 ----\--------\
-  //  \ --> add2 --> add3 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     p3 = f32[4,3]{1,0} parameter(3)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(p2, sub)
-     add3 = f32[4,3]{1,0} add(add1, add2)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add3, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Add(), op::Add()),
-                        op::Add(op::Parameter(), op::Subtract())));
-}
-
-TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) {
-  // sub --> mul ---\
-  //  \--> call --> add --> tuple
-  auto module = ParseHloString(R"(
-  HloModule test_module
-  ENTRY OutputFusion {
-    c = f32[] constant(42)
-    p0 = f32[4,3]{1,0} parameter(0)
-    p1 = f32[4,3]{1,0} parameter(1)
-    sub = f32[4,3]{1,0} subtract(p0, p1)
-    mul = f32[4,3]{1,0} multiply(sub, c)
-    call = f32[4,3]{1,0} custom-call(sub), custom_call_target="foo"
-    add = f32[4,3]{1,0} add(mul, call)
-    ROOT tuple = (f32[4,3]{1,0}) tuple(add)
-  })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  // Visit instructions in post order to detect cycles.
-  // TODO(tjoerg): Add cycle detection to the HloVerifier.
-  class DummyVisitor : public DfsHloVisitorWithDefault {
-   public:
-    DummyVisitor() {}
-    Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
-      return Status::OK();
-    }
-  } visitor;
-  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
-    // Accept will return a FailedPrecondition when a cycle is detected.
-    EXPECT_TRUE(computation->root_instruction()->Accept(&visitor).ok());
-  }
-}
-
-TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) {
-  // sub[2,3] --> add[4,3] --> tuple([2,3], [4,3])
-  //  \-------------------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[2,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[2,3]{1,0} parameter(2)
-     sub = f32[2,3]{1,0} subtract(p0, p2)
-     add = f32[4,3]{1,0} add(sub, p1)
-     ROOT tuple = (f32[2,3]{1,0}, f32[4,3]{1,0}) tuple(sub, add)
-    })")
-                    .ValueOrDie();
-
-  // Multi-output fusion requires shapes to be compatible. Since `sub` and `add`
-  // have incompatible shapes, expect that no multi-output fusion happens.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) {
-  auto module = ParseHloString(R"(
-  HloModule test_module
-
-  add_computation {
-    add_lhs = f32[] parameter(0)
-    add_rhs = f32[] parameter(1)
-    ROOT add_root = f32[] add(add_lhs, add_rhs)
-  }
-
-  fused_computation {
-    p1 = f32[10] parameter(0)
-    zero = f32[] constant(0)
-    ROOT f2_root = f32[] reduce(p1, zero), dimensions={0},
-           to_apply=add_computation
-  }
-
-  ENTRY entry {
-    p0 = f32[10] parameter(0)
-    mul = f32[10] multiply(p0, p0)
-    fusion = f32[] fusion(mul), kind=kInput, calls=fused_computation
-    ROOT tuple = (f32[10], f32[]) tuple(fusion, mul)
-  })")
-                    .ValueOrDie();
-
-  // Multi-output fusion is not supported for non-loop fusions at present. Since
-  // `fused_computation` is a input fusion, expect no multi-output fusion to
-  // happen.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
+                   .ValueOrDie());
 }
 
 TEST_F(InstructionFusionTest, FuseScalarConstant) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 82bdd677d96d3d0826bb4127b32d074eb632b1a3..6b9cbdd94b334ab7a4f61a4e3e43250ed9648cd0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -143,6 +142,16 @@ bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) {
          target == kCudnnConvBiasActivationForwardCallTarget;
 }
 
+const char* const kCusolverCholeskyCallTarget = "__cusolver$cholesky";
+
+bool IsCustomCallToCusolver(const HloInstruction& hlo) {
+  if (hlo.opcode() != HloOpcode::kCustomCall) {
+    return false;
+  }
+  const auto& target = hlo.custom_call_target();
+  return target == kCusolverCholeskyCallTarget;
+}
+
 bool ImplementedAsLibraryCall(const HloInstruction& hlo) {
   return ImplementedAsGemm(hlo) || IsCustomCallToDnnBatchNorm(hlo) ||
          IsCustomCallToDnnConvolution(hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index ebf4d926b7a280e10b09a2532caba7ad6ab3ceb2..f1a7aabb4db57b6818b29bdde73d87f0706f2827 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -131,6 +131,19 @@ extern const char* const kCudnnConvBiasActivationForwardCallTarget;
 // kConvolution opcode.
 bool IsCustomCallToDnnConvolution(const HloInstruction& hlo);
 
+// Returns true if `hlo` will be implemented as a call to a cuSolver routine.
+//
+// This returns true if `hlo` is a CustomCall HLO with a call target equal to
+// one of the kCusolver... constants, but returns *false* for HLOs with
+// say, a kCholesky opcode.
+bool IsCustomCallToCusolver(const HloInstruction& hlo);
+
+// Cholesky decomposition. Takes a (batched) matrix as input, and returns a
+// tuple of (result, workspace, info), where result is the result of the
+// Cholesky decomposition, workspace is scratch space for cuSolver, and info
+// is a success/failure code per batch element.
+extern const char* const kCusolverCholeskyCallTarget;
+
 // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm
 // or cuDNN convolution.
 bool ImplementedAsLibraryCall(const HloInstruction& hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 0007a9a8a3369d8ac010640127e1561615a6d813..f04e8241e5767da539964e93ebf48a79a2b5024c 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -115,7 +115,7 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
           get_tuple_element->shape(), get_tuple_element->tuple_index(),
           // TODO(b/26344050): tighten the alignment here
           // based on the real element type.
-          /*alignment=*/1, GetBasePointer(*operand), &b_, module_));
+          /*alignment=*/1, GetBasePointer(*operand), &b_));
   return Status::OK();
 }
 
@@ -144,7 +144,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
   for (const HloInstruction* operand : tuple->operands()) {
     base_ptrs.push_back(GetBasePointer(*operand));
   }
-  llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &b_, module_);
+  llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &b_);
   return Status::OK();
 }
 
@@ -434,7 +434,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select),
                            GetIrArray(*pred, *tuple_select),
                            GetBasePointer(*on_true), GetBasePointer(*on_false),
-                           &b_, module_);
+                           &b_);
   return Status::OK();
 }
 
@@ -492,8 +492,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
       result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
       result = InsertValue(result, value.first, {0});
       result = InsertValue(result, value.second, {1});
-    } else {
+    } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
       result = FMul(lhs_value, rhs_value);
+    } else {
+      TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+      result = Mul(lhs_value, rhs_value);
     }
     target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_);
     return Status::OK();
@@ -525,16 +528,18 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
   llvm_ir::ForLoopNest loop_nest(IrName(dot), &b_);
-  llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
-      lhs_array, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
-  llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
-      rhs_array, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
+  std::vector<llvm::Value*> lhs_multi_index =
+      loop_nest.EmitOperandArrayLoopNest(
+          lhs_array, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
+  std::vector<llvm::Value*> rhs_multi_index =
+      loop_nest.EmitOperandArrayLoopNest(
+          rhs_array, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
 
   // We don't have to iterate over the batch dimensions in both arrays, simplify
   // the loop nest of the rhs.
   for (int i = 0; i != dnums.lhs_batch_dimensions_size(); ++i) {
     DCHECK(absl::c_linear_search(dnums.lhs_batch_dimensions(), i));
-    rhs_index[i] = lhs_index[i];
+    rhs_multi_index[i] = lhs_multi_index[i];
   }
 
   // Create the reduction loop which does the sum of products reduction.
@@ -545,8 +550,8 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
 
   // The final entry in the rhs and lhs indexes is the indvar of the reduction
   // loop.
-  lhs_index[lhs_reduction_dimension] = reduction_loop->GetIndVarValue();
-  rhs_index[rhs_reduction_dimension] = reduction_loop->GetIndVarValue();
+  lhs_multi_index[lhs_reduction_dimension] = reduction_loop->GetIndVarValue();
+  rhs_multi_index[rhs_reduction_dimension] = reduction_loop->GetIndVarValue();
 
   // For computing the sum of products we alloca a single location to store the
   // dot product result as we accumulate it within the reduction loop. After the
@@ -571,7 +576,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   TF_RET_CHECK(!reduction_loop->GetBodyBasicBlock()->empty());
   b_.SetInsertPoint(
       &*reduction_loop->GetBodyBasicBlock()->getFirstInsertionPt());
+  llvm_ir::IrArray::Index lhs_index(lhs_multi_index, lhs_array.GetShape(),
+                                    b_.getInt64Ty());
   llvm::Value* lhs_element = lhs_array.EmitReadArrayElement(lhs_index, &b_);
+  llvm_ir::IrArray::Index rhs_index(rhs_multi_index, rhs_array.GetShape(),
+                                    b_.getInt64Ty());
   llvm::Value* rhs_element = rhs_array.EmitReadArrayElement(rhs_index, &b_);
   llvm::Value* accum = Load(accum_address);
   llvm::Value* updated_accum;
@@ -583,9 +592,13 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
     llvm::Value* accum_imag = Imag(accum, &b_);
     llvm::Value* imag_sum = FAdd(accum_imag, value.second);
     updated_accum = InsertValue(updated_accum, imag_sum, {1});
-  } else {
+  } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
     llvm::Value* product = FMul(lhs_element, rhs_element);
     updated_accum = FAdd(accum, product);
+  } else {
+    TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+    llvm::Value* product = Mul(lhs_element, rhs_element);
+    updated_accum = Add(accum, product);
   }
   Store(updated_accum, accum_address);
 
@@ -593,20 +606,22 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // address. The index into the target address is the concatenation of the rhs
   // and lhs indexes with the reduction dimensions removed. The terms from the
   // rhs index are the lower dimensions in the index so we add them first.
-  llvm_ir::IrArray::Index target_index(index_type);
+  std::vector<llvm::Value*> target_multi_index;
   for (size_t dimension = 0; dimension < lhs_index.size(); ++dimension) {
     if (dimension != lhs_reduction_dimension) {
-      target_index.push_back(lhs_index[dimension]);
+      target_multi_index.push_back(lhs_index[dimension]);
     }
   }
   // Skip over the batch dimensions to not have them in the index twice.
   for (size_t dimension = dnums.lhs_batch_dimensions_size();
        dimension < rhs_index.size(); ++dimension) {
     if (dimension != rhs_reduction_dimension) {
-      target_index.push_back(rhs_index[dimension]);
+      target_multi_index.push_back(rhs_index[dimension]);
     }
   }
   SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &b_);
+  llvm_ir::IrArray::Index target_index(target_multi_index,
+                                       target_array.GetShape(), index_type);
   target_array.EmitWriteArrayElement(
       target_index,
       Load(accum_address),  // The value written to the target array.
@@ -647,7 +662,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
+  // TODO(b/118332391): Support variadic reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on GPU");
   }
@@ -671,7 +686,7 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
         // Value*s are placed for each dimension in dimensions, and all the rest
         // are nullptrs.
         llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
-        const llvm_ir::IrArray::Index reduced_dims_index =
+        std::vector<llvm::Value*> input_multi_index =
             loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                                "reduction_dim");
 
@@ -682,17 +697,18 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
         // filled in. We fill in the rest of the dimensions with induction
         // Value*s taken from 'index' which iterates over the target array.
         // See the high-level description in the XLA documentation for details.
-        llvm_ir::IrArray::Index input_index = reduced_dims_index;
         llvm_ir::IrArray::Index::const_iterator it = index.begin();
 
-        for (size_t i = 0; i < input_index.size(); ++i) {
-          if (input_index[i] == nullptr) {
-            input_index[i] = *it++;
+        for (auto& i : input_multi_index) {
+          if (i == nullptr) {
+            i = *it++;
           }
         }
         CHECK(index.end() == it);
 
         // Apply the reduction function to the loaded value.
+        llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(),
+                                            b_.getInt64Ty());
         llvm::Value* input_address =
             GetIrArray(*arg, *reduce).EmitArrayElementAddress(input_index, &b_);
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index 66c65f69758e5a2f4420935279835eaf086fea45..8c02416eef452c932e2adeebf0da7ff245f87447 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -77,10 +77,10 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation(
   llvm::Function* function = llvm::Function::Create(
       function_type,                       // The function type.
       llvm::GlobalValue::InternalLinkage,  // The linkage type.
-      llvm_ir::AsStringRef(ir_emitter_context_->name_uniquer()->GetUniqueName(
+      ir_emitter_context_->name_uniquer()->GetUniqueName(
           llvm_ir::SanitizeFunctionName(
-              nested_computation.name()))),  // The name of the function.
-      ir_emitter_context_->llvm_module());   // The parent LLVM module.
+              nested_computation.name())),  // The name of the function.
+      ir_emitter_context_->llvm_module());  // The parent LLVM module.
   for (size_t arg_no = 0; arg_no < argument_dereferenceable_bytes.size();
        ++arg_no) {
     int64 arg_size = argument_dereferenceable_bytes[arg_no];
@@ -123,7 +123,7 @@ Status IrEmitterNested::EmitTargetElementLoop(
         ConstructIrArrayForOutputs(hlo);
     TF_RETURN_IF_ERROR(
         llvm_ir::LoopEmitter(element_generator, target_arrays, &b_).EmitLoop());
-    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), target_arrays, &b_, module_);
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), target_arrays, &b_);
     return Status::OK();
   }
   return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), &b_)
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 294a454931b5cfa368bf094c428a1e942f4556b8..07038607bdc783238eb2f349bf636e420e6ae20b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -38,8 +38,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
@@ -55,11 +55,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/memset_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -74,6 +76,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -102,6 +105,8 @@ using absl::StrCat;
 using llvm_ir::IrArray;
 using llvm_ir::IrName;
 
+namespace m = match;
+
 // If a dimensions is smaller than this, untiled transposition may be more
 // efficient.
 const int64 kMinDimensionToTransposeTiled = 16;
@@ -226,7 +231,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
     if (alloc->IsPreallocatedTempBuffer()) {
       fn_arg->setName("temp_buf");
     } else {
-      fn_arg->setName(llvm_ir::AsStringRef(StrCat("alloc", alloc->index())));
+      fn_arg->setName(StrCat("alloc", alloc->index()));
     }
   }
 
@@ -476,6 +481,51 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     return Status::OK();
   }
 
+  if (custom_call->custom_call_target() == kCusolverCholeskyCallTarget) {
+    TF_ASSIGN_OR_RETURN(CholeskyOptions options,
+                        custom_call->backend_config<CholeskyOptions>());
+
+    const Shape& shape = custom_call->operand(0)->shape();
+    int ndim = shape.dimensions_size();
+    CHECK_GE(ndim, 2);
+    int64 n = shape.dimensions(ndim - 1);
+
+    const auto& dims = shape.dimensions();
+    int64 batch_size = std::accumulate(dims.begin(), dims.end() - 2, int64{1},
+                                       [](int64 a, int64 b) { return a * b; });
+
+    auto operand_buffer = GetAllocationSlice(*custom_call->operand(0));
+
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    auto a_buffer = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
+    auto workspace_buffer = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
+    auto info_buffer = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
+
+    std::vector<std::unique_ptr<Thunk>> thunks;
+
+    if (operand_buffer != a_buffer) {
+      thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+          /*source_address=*/operand_buffer,
+          /*destination_buffer=*/a_buffer,
+          /*mem_size=*/ShapeUtil::ByteSizeOf(shape), custom_call));
+    }
+
+    thunks.push_back(absl::make_unique<CholeskyThunk>(
+        options, a_buffer, workspace_buffer, info_buffer,
+        custom_call->operand(0)->shape().element_type(), batch_size, n,
+        custom_call));
+
+    // Elide the sequential thunk if there's no copy.
+    if (thunks.size() == 1) {
+      AddThunkToThunkSequence(std::move(thunks[0]));
+    } else {
+      AddThunkToThunkSequence(
+          absl::make_unique<SequentialThunk>(std::move(thunks), custom_call));
+    }
+
+    return Status::OK();
+  }
+
   return IrEmitter::HandleCustomCall(custom_call);
 }
 
@@ -487,6 +537,41 @@ Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
   return Status::OK();
 }
 
+Status IrEmitterUnnested::HandleTriangularSolve(HloInstruction* hlo) {
+  auto has_fortran_layout = [](const Layout& layout) {
+    int n = layout.minor_to_major_size();
+    return layout.minor_to_major(0) == n - 2 &&
+           layout.minor_to_major(1) == n - 1;
+  };
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(0)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(1)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->shape().layout()));
+
+  std::vector<std::unique_ptr<Thunk>> thunks;
+
+  // Triangular solve is in-place on 'b', so copy 'b' to the output if they
+  // aren't the same buffer.
+  auto operand_buffer = GetAllocationSlice(*hlo->operand(1));
+  auto destination_buffer = GetAllocationSlice(*hlo);
+  if (operand_buffer != destination_buffer) {
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/operand_buffer,
+        /*destination_buffer=*/destination_buffer,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()), hlo));
+  }
+
+  thunks.push_back(BuildTriangularSolveThunk(hlo));
+
+  // Elide the sequential thunk if there's no copy.
+  if (thunks.size() == 1) {
+    AddThunkToThunkSequence(std::move(thunks[0]));
+  } else {
+    AddThunkToThunkSequence(
+        absl::make_unique<SequentialThunk>(std::move(thunks), hlo));
+  }
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   HloInstruction* root = fusion->fused_expression_root();
   if (HloInstruction::FusionKind::kInput == fusion->fusion_kind()) {
@@ -546,7 +631,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         // a 1D array. The specialized version requires a initializer thunk that
         // initializes the output array to the initial value of the reduce.
         if (root->opcode() == HloOpcode::kReduce && root->shape().IsTuple()) {
-          // TODO(b/112040122): Support variadic reduce.
+          // TODO(b/118332391): Support variadic reduce.
           return Unimplemented("Variadic reduce is not supported on GPU");
         }
         return EmitReductionToVector(fusion);
@@ -635,7 +720,7 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
 }
 
 Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support multi-output reduce.
+  // TODO(b/118332391): Support multi-output reduce.
   if (!reduce->shape().IsArray()) {
     return Unimplemented("Multi-output reduce is not supported on GPU");
   }
@@ -778,16 +863,16 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     // Compute the operand index to visit and evaluate the condition whether the
     // operand index is within the bounds. The unsigned comparison includes
     // checking whether the operand index >= 0.
-    IrArray::Index operand_index(index_type, source_index.size());
+    std::vector<llvm::Value*> operand_multi_index(source_index.size());
     llvm::Value* in_bounds_condition = b_.getInt1(true);
     for (int64 i = 0; i < rank; ++i) {
       llvm::Value* strided_index = NSWMul(
           source_index[i], index_typed_constant(window.dimensions(i).stride()));
-      operand_index[i] =
+      operand_multi_index[i] =
           NSWSub(NSWAdd(strided_index, window_index[i]),
                  index_typed_constant(window.dimensions(i).padding_low()));
       llvm::Value* index_condition = ICmpULT(
-          operand_index[i],
+          operand_multi_index[i],
           index_typed_constant(ShapeUtil::GetDimension(operand->shape(), i)));
       in_bounds_condition = And(in_bounds_condition, index_condition);
     }
@@ -812,6 +897,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
       }
     };
     IrArray operand_array = GetIrArray(*operand, *select_and_scatter);
+    IrArray::Index operand_index(operand_multi_index, operand->shape(),
+                                 index_type);
     llvm::Value* operand_data =
         operand_array.EmitReadArrayElement(operand_index, &b_);
     Store(operand_data, selected_value_address);
@@ -822,7 +909,6 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     // potentially update the selected value and index with the currently
     // visiting operand.
     llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &b_);
-    const Shape output_shape = ShapeUtil::MakeShape(PRED, {});
     llvm::Value* operand_address =
         operand_array.EmitArrayElementAddress(operand_index, &b_);
     llvm::Value* select_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
@@ -854,15 +940,18 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     // value and the current output value.
     llvm_ir::SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(),
                                    &b_);
-    IrArray::Index selected_index(operand_index.GetType());
+    std::vector<llvm::Value*> selected_multi_index;
     for (int64 i = 0; i < rank; ++i) {
       llvm::Value* selected_index_address_slot =
           InBoundsGEP(selected_index_address, {b_.getInt32(i)});
-      selected_index.push_back(Load(selected_index_address_slot));
+      selected_multi_index.push_back(Load(selected_index_address_slot));
     }
     llvm::Value* source_value_address =
         GetIrArray(*source, *select_and_scatter)
             .EmitArrayElementAddress(source_index, &b_);
+    IrArray::Index selected_index(selected_multi_index,
+                                  select_and_scatter->shape(),
+                                  operand_index.GetType());
     llvm::Value* output_value_address =
         GetIrArray(*select_and_scatter, *select_and_scatter)
             .EmitArrayElementAddress(selected_index, &b_);
@@ -891,13 +980,12 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) {
                condition->root_instruction()->shape().element_type() == PRED)
       << "While condition computation must return bool";
   // Build ForThunk for conformant while loops, otherwise build WhileThunk.
-  // TODO(b/112163966): Move trip count computation earlier in the pipeline.
-  if (auto loop_trip_count = ComputeWhileLoopTripCount(xla_while)) {
-    AddThunkToThunkSequence(BuildForThunk(xla_while, *loop_trip_count));
-    VLOG(3) << "Built ForThunk for while: " << xla_while->name();
+  auto config = xla_while->backend_config<WhileLoopBackendConfig>();
+  if (config.ok() && config.ValueOrDie().has_known_trip_count()) {
+    AddThunkToThunkSequence(
+        BuildForThunk(xla_while, config.ValueOrDie().known_trip_count().n()));
   } else {
     AddThunkToThunkSequence(BuildWhileThunk(xla_while));
-    VLOG(3) << "Built WhileThunk for while: " << xla_while->name();
   }
   return Status::OK();
 }
@@ -959,18 +1047,18 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
       BuildKernelThunk(scatter,
                        /*implements_whole_instruction=*/thunks.empty()));
 
-  TF_RETURN_IF_ERROR(
-      EmitScatter(thunks.back().get(), scatter,
-                  /*scatter_indices_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*scatter_indices, *scatter)
-                        .EmitReadArrayElement(index, &b_, "scatter_index");
-                  },
-                  /*updates_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*updates, *scatter)
-                        .EmitReadArrayElement(index, &b_, "update");
-                  }));
+  TF_RETURN_IF_ERROR(EmitScatter(
+      thunks.back().get(), scatter,
+      /*scatter_indices_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*scatter_indices, *scatter)
+            .EmitReadArrayElement(index, &b_, "scatter_index");
+      },
+      /*updates_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*updates, *scatter)
+            .EmitReadArrayElement(index, &b_, "update");
+      }));
 
   // Elide the sequential thunk if there's no copy.
   if (thunks.size() == 1) {
@@ -1041,16 +1129,20 @@ Status IrEmitterUnnested::EmitScatter(
 
     // Now load the indices corresponding to the current window from
     // scatter_indices.
-    llvm_ir::IrArray::Index raw_scatter_index_index(input_scatter_multidim,
-                                                    index.GetType());
-    raw_scatter_index_index.InsertAt(dim_numbers.index_vector_dim(), nullptr);
+    std::vector<llvm::Value*> raw_scatter_index_multidim =
+        input_scatter_multidim;
+    raw_scatter_index_multidim.insert(
+        raw_scatter_index_multidim.begin() + dim_numbers.index_vector_dim(),
+        nullptr);
     llvm::Value* is_in_bounds = b_.getTrue();
     for (int64 i = 0, e = dim_numbers.scatter_dims_to_operand_dims_size();
          i != e; ++i) {
       // Our index is stored along index_vector_dim, insert that into the lookup
       // index into scatter_indices.
-      raw_scatter_index_index[dim_numbers.index_vector_dim()] =
-          raw_scatter_index_index.GetConstantWithIndexType(i);
+      raw_scatter_index_multidim[dim_numbers.index_vector_dim()] =
+          index.GetConstantWithIndexType(i);
+      llvm_ir::IrArray::Index raw_scatter_index_index(
+          raw_scatter_index_multidim, scatter_indices_shape, index.GetType());
 
       int64 operand_dim = dim_numbers.scatter_dims_to_operand_dims(i);
       TF_ASSIGN_OR_RETURN(
@@ -1118,17 +1210,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   Shape keys_shape = sort->operand(0)->shape();
   int64 dimension_to_sort = sort->dimensions(0);
-  // In case there is a 'values' parameter that is a iota, we take note and use
-  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
-  // sort.
-  int64 iota_values_parameter_index = -1;
   for (int64 i = 0; i < sort->operand_count(); ++i) {
-    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
-        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
-        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
-            dimension_to_sort) {
-      iota_values_parameter_index = i;
-    }
     ShapeIndex shape_index =
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and outputs is the
@@ -1241,25 +1323,23 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
                                              : standard_launch_dimensions;
     UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
                            ir_emitter_context_->llvm_module());
-    IrArray keys_array;
     std::vector<IrArray> values_arrays;
-    values_arrays.reserve(sort->operand_count() - 1);
+    values_arrays.reserve(sort->operand_count());
     for (int64 i = 0; i < sort->operand_count(); ++i) {
       ShapeIndex shape_index =
           sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
-      if (i == 0) {
-        keys_array = GetIrArray(*sort, *sort, shape_index);
-      } else {
-        values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
-      }
+      values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
     }
     return llvm_ir::EmitSortInPlace(
-        dimension_to_sort, keys_array, values_arrays,
-        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        dimension_to_sort, values_arrays, IrName(sort), xor_masks, &b_,
         launch_dimensions,
         xor_masks.size() > 1 ? num_iterations_in_sort_dim
                              : standard_num_iterations_in_sort_dim,
-        kTileSize);
+        kTileSize,
+        [&](absl::Span<llvm::Value* const> operands, llvm::Value* output) {
+          return EmitCallToNestedComputation(*sort->to_apply(), operands,
+                                             output);
+        });
   };
   std::vector<int64> xor_masks;
   for (int64 stage = 0; stage < num_stages; ++stage) {
@@ -1296,11 +1376,55 @@ Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
   return IrEmitter::HandleTupleSelect(tuple_select);
 }
 
+namespace {
+
+bool IsScalarAddComputation(HloComputation* computation) {
+  return Match(computation->root_instruction(),
+               m::AddAnyOrder(m::Parameter(0), m::Parameter(1))
+                   .WithShape(m::Shape().IsEffectiveScalar()));
+}
+
+}  // namespace
+
 Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) {
+  VLOG(2) << "AllReduce; replica count: " << hlo_module_config_.replica_count()
+          << "; operand count: " << crs->operand_count()
+          << "; NCCL is enabled: " << NcclAllReduceThunk::NcclIsEnabled();
+
+  // Note the replica_count == 1 case is handled via device-to-device copy
+  // below.
+  bool should_use_nccl_thunk =
+      hlo_module_config_.replica_count() > 1 &&
+      crs->IsCrossReplicaAllReduce() &&
+      crs->operand_count() == 1 &&  // One array to reduce.
+      crs->operand(0)->shape().element_type() == F32 &&
+      // Check the computation is a summation.
+      IsScalarAddComputation(crs->to_apply());
+
+  if (should_use_nccl_thunk) {
+    CHECK(crs->operand(0)->shape().IsArray())
+        << "Operands to all-reduce must be arrays: " << crs->ToString();
+    AddThunkToThunkSequence(absl::make_unique<NcclAllReduceThunk>(
+        /*replica_count=*/hlo_module_config_.replica_count(),
+        /*elements=*/ShapeUtil::ElementsIn(crs->operand(0)->shape()),
+        /*source_address=*/GetAllocationSlice(*crs->operand(0)),
+        /*destination_buffer=*/GetAllocationSlice(*crs), crs));
+    return Status::OK();
+  }
+
   if (hlo_module_config_.replica_count() != 1) {
-    // TODO(b/33011107): Support nontrivial cross replica sum on GPU.
-    return Unimplemented(
-        "AllReduce with >1 replica is not implemented on GPU.");
+    // TODO(b/33011107): Support more AllReduce configurations on GPU.
+    string message = absl::StrFormat(
+        "Requested AllReduce not implemented on GPU; replica_count: %d; "
+        "operand_count: %d; IsCrossReplicaAllReduce: %d; NCCL support: %d",
+        hlo_module_config_.replica_count(), crs->operand_count(),
+        crs->IsCrossReplicaAllReduce(), NcclAllReduceThunk::NcclIsEnabled());
+    if (crs->operand_count() > 0) {
+      absl::StrAppendFormat(
+          &message, "; first operand array element-type: %s",
+          PrimitiveType_Name(crs->operand(0)->shape().element_type()));
+    }
+    return Unimplemented("%s", message);
   }
 
   // CRS with one operand and one replica is simply the identity function.
@@ -1543,8 +1667,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
     llvm::Value* loc;
     if (slice.allocation()->is_constant()) {
       loc = ir_emitter_context_->llvm_module()->getGlobalVariable(
-          llvm_ir::AsStringRef(llvm_ir::ConstantBufferAllocationToGlobalName(
-              *slice.allocation())));
+          llvm_ir::ConstantBufferAllocationToGlobalName(*slice.allocation()));
       CHECK_NE(loc, nullptr);
     } else {
       loc = InBoundsGEP(kernel_args.at(slice.allocation()),
@@ -1573,7 +1696,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   }
 
   return absl::make_unique<KernelThunk>(
-      non_constant_buffers, llvm_ir::AsString(kernel->getName()),
+      non_constant_buffers, kernel->getName(),
       implements_whole_instruction ? inst : nullptr, unroll_factor);
 }
 
@@ -1758,6 +1881,29 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
       /*output_shape=*/inst->shape(), inst);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildTriangularSolveThunk(
+    const HloInstruction* inst) {
+  const HloInstruction* a = inst->operand(0);
+  const HloInstruction* b = inst->operand(1);
+  int64 m = b->shape().dimensions(b->shape().rank() - 2);
+  int64 n = b->shape().dimensions(b->shape().rank() - 1);
+  int64 batch_size = std::accumulate(
+      b->shape().dimensions().begin(), b->shape().dimensions().end() - 2,
+      int64{1}, [](int64 a, int64 b) { return a * b; });
+  int64 elem_size =
+      ShapeUtil::ByteSizeOfPrimitiveType(inst->shape().element_type());
+  int64 a_batch_stride = inst->triangular_solve_options().left_side()
+                             ? m * m * elem_size
+                             : n * n * elem_size;
+  int64 b_batch_stride = m * n * elem_size;
+  return absl::make_unique<TriangularSolveThunk>(
+      inst->triangular_solve_options(),
+      /*a_input_buffer=*/GetAllocationSlice(*a),
+      /*b_input_buffer=*/GetAllocationSlice(*inst),
+      inst->shape().element_type(), batch_size, m, n, a_batch_stride,
+      b_batch_stride, inst);
+}
+
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     HloInstruction* hlo, const ShapeIndex& index) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
@@ -1931,41 +2077,32 @@ Status CheckWhileBuffersShareAllocation(
 // Checks that the buffers used in a conditional instruction are shared with the
 // operands and result as follows:
 //   * The result buffer of the conditional should share the allocation with the
-//     result buffers of the true and false computations.
-//   * The buffer of operand 1 should share the allocation with the buffer of
-//     the parameter 0 instruction of the true computation.
-//   * The buffer of operand 2 should share the allocation with the buffer of
-//     the parameter 0 instruction of the false computation.
+//     result buffers of each branch computation.
+//   * The buffer of operand b+1 should share the allocation with the buffer of
+//     the parameter 0 instruction of the b'th computation.
 Status CheckConditionalBuffersShareAllocation(
     const HloInstruction* conditional,
     const BufferAssignment& buffer_assignment) {
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       conditional->shape(),
       [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
-        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
-            conditional, conditional->true_computation()->root_instruction(),
-            index, buffer_assignment));
-        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
-            conditional, conditional->false_computation()->root_instruction(),
-            index, buffer_assignment));
+        for (auto branch_computation : conditional->branch_computations()) {
+          TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+              conditional, branch_computation->root_instruction(), index,
+              buffer_assignment));
+        }
         return Status::OK();
       }));
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      conditional->operand(1)->shape(),
-      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
-        return CheckHloBuffersShareAllocation(
-            conditional->operand(1),
-            conditional->true_computation()->parameter_instruction(0), index,
-            buffer_assignment);
-      }));
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      conditional->operand(2)->shape(),
-      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
-        return CheckHloBuffersShareAllocation(
-            conditional->operand(2),
-            conditional->false_computation()->parameter_instruction(0), index,
-            buffer_assignment);
-      }));
+  for (int j = 0; j < conditional->branch_count(); ++j) {
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+        conditional->operand(j + 1)->shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+          return CheckHloBuffersShareAllocation(
+              conditional->operand(j + 1),
+              conditional->branch_computation(j)->parameter_instruction(0),
+              index, buffer_assignment);
+        }));
+  }
   return Status::OK();
 }
 
@@ -2018,22 +2155,20 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
   TF_CHECK_OK(CheckConditionalBuffersShareAllocation(
       hlo, ir_emitter_context_->buffer_assignment()));
 
-  HloComputation* true_computation = hlo->true_computation();
-  IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
-                                    ir_emitter_context_);
-  TF_CHECK_OK(true_computation->Accept(&ir_emitter_true));
-
-  HloComputation* false_computation = hlo->false_computation();
-  IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
-                                     ir_emitter_context_);
-  TF_CHECK_OK(false_computation->Accept(&ir_emitter_false));
+  std::vector<BufferAllocation::Slice> branch_operands;
+  std::vector<ThunkSequence> branch_thunks;
+  for (int j = 0; j < hlo->branch_count(); ++j) {
+    branch_operands.emplace_back(GetAllocationSlice(*hlo->operand(j + 1)));
+    HloComputation* branch_computation = hlo->branch_computation(j);
+    IrEmitterUnnested ir_emitter(hlo_module_config_, branch_computation,
+                                 ir_emitter_context_);
+    TF_CHECK_OK(branch_computation->Accept(&ir_emitter));
+    branch_thunks.push_back(std::move(*ir_emitter.ConsumeThunkSequence()));
+  }
 
   return absl::make_unique<ConditionalThunk>(
-      GetAllocationSlice(*hlo->operand(0)),
-      GetAllocationSlice(*hlo->operand(1)),
-      GetAllocationSlice(*hlo->operand(2)),
-      std::move(*ir_emitter_true.ConsumeThunkSequence()),
-      std::move(*ir_emitter_false.ConsumeThunkSequence()), hlo);
+      GetAllocationSlice(*hlo->operand(0)), branch_operands,
+      std::move(branch_thunks), hlo);
 }
 
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
@@ -2066,7 +2201,7 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   // kernel *anyway*.
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(hlo);
   KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
-    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_);
   });
 
   // For multioutput fusion, we need to emit each operand and the root.
@@ -2133,7 +2268,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
     const std::vector<llvm::Value*>& param_buffers,
@@ -2782,13 +2916,6 @@ void IrEmitterUnnested::EmitTileElementForReduction(
       reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
           index,
           GetFirstReduceInstruction(output_instructions)->operand(0)->shape());
-  int num_partial_results = reduction_info->GetNumberOfPartialResults();
-  if (num_partial_results > 1) {
-    // Clear the linear index field of the IrArray::Index to enable the use of
-    // GetElementPointer with array types. This enables the vectorization of
-    // the computation for different partial results.
-    input_index.ClearLinearIndex();
-  }
   absl::Span<llvm::AllocaInst* const> partial_reduction_result_addresses =
       reduction_info->GetPartialResultAddresses();
   absl::Span<llvm::AllocaInst* const> reduction_input_addresses =
@@ -2967,12 +3094,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
   // *anyway*.
   if (!reduction_info && unnested_hlo->IsMultiOutputFusion()) {
-    KernelSupportLibrary{&b_}.If(
-        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
-          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
-                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
-                             module_);
-        });
+    KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+      llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                         ConstructIrArrayForOutputs(*unnested_hlo), &b_);
+    });
   }
 
   // For each tiled parameter, cast its input IrArray to the corresponding
@@ -3710,8 +3835,7 @@ Status IrEmitterUnnested::EmitConstantGlobals() {
         global_type, /*isConstant=*/should_emit_initializer,
         llvm::GlobalValue::ExternalLinkage,
         /*Initializer=*/initializer,
-        llvm_ir::AsStringRef(
-            llvm_ir::ConstantBufferAllocationToGlobalName(allocation)));
+        llvm_ir::ConstantBufferAllocationToGlobalName(allocation));
     global_for_const->setAlignment(kConstantBufferAlignBytes);
     ir_emitter_context_->llvm_module()->getGlobalList().push_back(
         global_for_const);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 21b842bb2cd63ac454f85556df20ae5877cecbe1..9890ce122dfdc7444d769b6eb695a7c0932408c3 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -176,6 +176,7 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleSelect(HloInstruction* select) override;
   Status HandleSort(HloInstruction* sort) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
   Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAfterAll(HloInstruction* after_all) override;
@@ -319,6 +320,12 @@ class IrEmitterUnnested : public IrEmitter {
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
 
+  // Returns a CholeskyThunk that calls cuSolver to implement `inst`.
+  std::unique_ptr<Thunk> BuildCholeskyThunk(const HloInstruction* inst);
+
+  // Returns a TriangularSolveThunk that calls cuBlas to implement `inst`.
+  std::unique_ptr<Thunk> BuildTriangularSolveThunk(const HloInstruction* inst);
+
   // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
   // to make sure `inst` outlives the lifetime of the returned Thunk object.
   std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
@@ -352,9 +359,9 @@ class IrEmitterUnnested : public IrEmitter {
   std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
                                        const int64 loop_limit);
 
-  // Returns a ConditionalThunk that executes the thunk sequence for
-  // 'true_computation' or 'false_computation' depending on the value of the
-  // predicate in the given conditional instruction.
+  // Returns a ConditionalThunk which executes the thunk sequence for the
+  // 'branch_computation' corresponding to the predicate/branch_index of the
+  // given conditional instruction.
   std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
 
   Status Postprocess(HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 153aab97d9eb971734c5ea95564895631bc2a9fa..275a638e8c02d8f841790d39cac0cba175d31fab 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -140,10 +140,9 @@ static string GetSmName(std::pair<int, int> compute_capability) {
 
 // Convenience function for producing a name of a temporary compilation product
 // from the input filename.
-string MakeNameForTempProduct(const std::string& input_filename,
+string MakeNameForTempProduct(absl::string_view input_filename,
                               absl::string_view extension) {
-  return ReplaceFilenameExtension(absl::string_view(tensorflow::io::Basename(
-                                      llvm_ir::AsString(input_filename))),
+  return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
                                   extension);
 }
 
@@ -254,11 +253,8 @@ string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
     llvm::buffer_ostream pstream(stream);
     // The extension is stripped by IrDumpingPassManager, so we need to
     // get creative to add a suffix.
-    string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
     IrDumpingPassManager codegen_passes(
-        ReplaceFilenameExtension(
-            absl::string_view(tensorflow::io::Basename(module_id)),
-            "-nvptx.dummy"),
+        MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
         "", false);
     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
         llvm::Triple(module->getTargetTriple())));
@@ -336,7 +332,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   // If the module has no functions or globals, there's nothing to compile. Just
   // return an empty string.
   if (module->empty() && module->global_empty()) {
-    VLOG(2) << "Module '" << llvm_ir::AsString(module->getName())
+    VLOG(2) << "Module '" << module->getName().str()
             << "' is empty. Skipping compilation.";
     return string();
   }
@@ -492,11 +488,10 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
 
   string ptx;
   {
-    tensorflow::tracing::ScopedActivity activity(
-        "Compiling IR", llvm_ir::AsString(module->getName()),
-        /*is_expensive=*/true);
-    XLA_SCOPED_LOGGING_TIMER("Compile module " +
-                             llvm_ir::AsString(module->getName()));
+    tensorflow::tracing::ScopedActivity activity("Compiling IR",
+                                                 module->getName().str(),
+                                                 /*is_expensive=*/true);
+    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
     TF_ASSIGN_OR_RETURN(
         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
                                 libdevice_dir_path));
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 40b87b16a195564c9b98497f79a70f1db0539d87..4b78d48210a5d65713d3ff63ef943335d0ddd4db 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -437,7 +437,7 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduceFusion) {
       p1.1 = f32[2,2,2]{2,1,0} parameter(1)
       c0 = f32[] constant(0)
       broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={}
-      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      greater-than = pred[2,2,2]{2,1,0} compare(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast), direction=GT
       p0.1 = f32[2,2,2]{2,1,0} parameter(0)
       ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast)
     }
@@ -505,7 +505,7 @@ TEST_F(MultiOutputFusionTest,
       p1.1 = f16[2,2,2]{2,1,0} parameter(1)
       c0 = f16[] constant(0)
       broadcast = f16[2,2,2]{2,1,0} broadcast(f16[] c0), dimensions={}
-      greater-than = pred[2,2,2]{2,1,0} greater-than(f16[2,2,2]{2,1,0} p1.1, f16[2,2,2]{2,1,0} broadcast)
+      greater-than = pred[2,2,2]{2,1,0} compare(f16[2,2,2]{2,1,0} p1.1, f16[2,2,2]{2,1,0} broadcast), direction=GT
       p0.1 = f16[2,2,2]{2,1,0} parameter(0)
       ROOT select = f16[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f16[2,2,2]{2,1,0} p0.1, f16[2,2,2]{2,1,0} broadcast)
     }
@@ -548,7 +548,7 @@ TEST_F(MultiOutputFusionTest,
       copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1)
       c0 = f16[] constant(0)
       broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={}
-      greater-than = pred[128,1024,32,32]{1,3,2,0} greater-than(copy, broadcast)
+      greater-than = pred[128,1024,32,32]{1,3,2,0} compare(copy, broadcast), direction=GT
       ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1, broadcast)
     }
     fused_reduce {
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3051db3af4ae4380e4a38f50ad8ebc89642e645f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
@@ -0,0 +1,356 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h"
+
+#include "tensorflow/compiler/xla/util.h"
+
+#if GOOGLE_CUDA
+#include "absl/synchronization/blocking_counter.h"
+#include "third_party/nccl/nccl.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#endif
+
+namespace xla {
+namespace gpu {
+
+/* static */ bool NcclAllReduceThunk::NcclIsEnabled() {
+#if GOOGLE_CUDA
+  return true;
+#else
+  return false;
+#endif
+}
+
+#if GOOGLE_CUDA
+namespace {
+
+// GPU-replica-driving host threads (i.e. the threads that call
+// GpuExecutable::Execute) build up this structure to describe their
+// participating replica, and then call to
+// GlobalRendezvousManager::SubmitParticipant.
+struct ParticipantData {
+  // Number of replicas particiating in the AllReduce.
+  int64 replica_count;
+
+  int64 element_count;
+  int64 device_ordinal;
+  int64 generation_counter;
+
+  // TODO(b/125951860): We should vet that we're buffer allocating such that
+  // source_buffer == destination_buffer if that avoids a NCCL copy (will depend
+  // on how well the NCCL in-place implementation performs vs the out-of-place
+  // implementation).
+  se::DeviceMemoryBase source_data;
+  se::DeviceMemoryBase destination_data;
+  se::Stream* stream;
+
+  NcclAllReduceThunk* originator;
+
+  string ToString() const {
+    return absl::StrFormat(
+        "ParticipantData{replica_count=%d, element_count=%d, "
+        "device_ordinal=%d, generation_counter=%d, stream=%p, originator=%p}",
+        replica_count, element_count, device_ordinal, generation_counter,
+        stream, originator);
+  }
+};
+
+// Class that gets instantiated as a singleton in GetGlobalRendezvous() to
+// coordinate participating threads in performing an AllReduce operation.
+//
+// This manager is responsible for establishing communication channels and
+// ultimately enqueueing the NCCL library operation onto the participating
+// streams.
+class GlobalRendezvousManager {
+ public:
+  // The GpuExecutable-executing threads call this in order to a) establish the
+  // all-reduce rendezvous and b) enqueue the AllReduce operation on the caller
+  // thread's associated stream (given in "participant").
+  //
+  // Implementation note: since the rendezvous we're creating here is global, we
+  // try to be paranoid about the fact that the *correct* one is happening.  In
+  // an ideal world we'd have some StreamExecutor se::Platform level construct
+  // that we could use for cross-device networking primitives (e.g. via a
+  // NetworkSupport interface) that could be shared between TensorFlow and XLA,
+  // but this is a reasonable stopgap measure to get multi-GPU-replica up and
+  // running properly for single-host, single-concurrent-XLA-module usage.
+  Status SubmitParticipant(ParticipantData participant);
+
+  // Returns the current generation number of AllReduce operations.
+  // (Currently one AllReduce operation occurs per generation.)
+  int64 GetCurrentGeneration() {
+    tensorflow::mutex_lock lock(mutex_);
+    return current_generation_;
+  }
+
+ private:
+  // Called by the primary thread to set up the communication links.
+  //
+  // TODO(b/125951860): This performs lots of (presumably) unnecessary host-side
+  // synchronization so that we can be paranoid about semantics in the earliest
+  // implementation. In the limit we should only need to synchronize host
+  // replica threads when the "number of replicas" or "participating device
+  // ordinals" change, to set up a new NCCL "communication" context, at which
+  // point we can enqueue onto device streams without host synchronization in
+  // our code -- this will likely be helpful for "lots of little AllReduce"
+  // cases.
+  Status InitializeCommunicationChannels() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Called when all necessary participants are present, the functionality
+  // that's implemented by all executing threads lives in here.
+  Status DoAllReduce(ParticipantData data, ncclComm_t comm);
+
+  // Puts all state back into a "reset" state for the next generation of
+  // AllReduce requests.
+  void DeinitializeGeneration() EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    for (ncclComm_t& comm : comms_) {
+      ncclCommDestroy(comm);
+    }
+    comms_.clear();
+    participants_.clear();
+    current_generation_++;
+    initialized_ = false;
+    done_ = absl::nullopt;
+  }
+
+  tensorflow::mutex mutex_;
+  tensorflow::condition_variable all_participants_present_;
+  tensorflow::condition_variable deinitialized_;
+
+  // Communication handles that correspond to the participants below.
+  std::vector<ncclComm_t> comms_ GUARDED_BY(mutex_);
+
+  Status initialize_status_ GUARDED_BY(mutex_);
+  std::vector<ParticipantData> participants_ GUARDED_BY(mutex_);
+  int64 current_generation_ GUARDED_BY(mutex_) = 0;
+  bool initialized_ GUARDED_BY(mutex_) = false;
+
+  // The participating threads wait for this to count down in order to know we
+  // can begin the teardown process.
+  absl::optional<tensorflow::BlockingCounter> done_;
+};
+
+Status GlobalRendezvousManager::SubmitParticipant(ParticipantData participant) {
+  auto all_participants_present = [this, &participant]()
+                                      EXCLUSIVE_LOCKS_REQUIRED(mutex_) -> bool {
+    return participants_.size() >= participant.replica_count;
+  };
+
+  // We remember the participant index at which we are inserted and use that
+  // same index for referring to auxiliary metadata (e.g. the ncclComm_t handle
+  // index) below.
+  int64 index;
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+
+    // Spot check for consistent replica counts among submitting threads.
+    if (!participants_.empty() &&
+        (participants_.back().replica_count != participant.replica_count ||
+         participants_.back().originator != participant.originator)) {
+      return InvalidArgument(
+          "Running two XLA modules with AllReduces in parallel is not "
+          "supported. It is possible this is due to a bug where were try to "
+          "run two different AllReduces from the same module at once. "
+          "(Attempted a rendezvous with a different replica count from other "
+          "participants; existing: %s; submitted: %s)",
+          participants_.back().ToString(), participant.ToString());
+    }
+    index = participants_.size();
+    participants_.push_back(participant);
+
+    if (all_participants_present()) {
+      all_participants_present_.notify_all();
+    }
+  }
+
+  // We pull into our thread a) the communication handle and b) whether we're
+  // the "primary" thread for this rendezvous -- the "primary" thread has some
+  // additional responsibilities for setup/teardown.
+  ncclComm_t comm;
+  bool primary;
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    while (!all_participants_present()) {
+      // Once all the participants have arrived, all participating threads will
+      // cross this barrier, though only (the first) one will be the "primary".
+      all_participants_present_.wait(lock);
+    }
+
+    // Somebody will be the first -- that thread has some additional
+    // responsibilities.
+    primary = !initialized_;
+
+    CHECK_EQ(participant.generation_counter, current_generation_);
+
+    // Bump the generation counter so the other threads know we've completed the
+    // global rendezvous and have set up the AllReduce.
+    if (primary) {
+      VLOG(3) << "Primary initializing accounting data.";
+      initialized_ = true;
+      done_.emplace(participant.replica_count);
+      initialize_status_ = InitializeCommunicationChannels();
+      VLOG(3) << "Done initializing communication channels; status: "
+              << initialize_status_;
+      if (!initialize_status_.ok()) {
+        DeinitializeGeneration();
+      }
+    }
+
+    if (!initialize_status_.ok()) {
+      // TODO(b/125951860): If this fails once, it will fail forever.
+      return initialize_status_;
+    }
+
+    comm = comms_[index];
+
+    // Drop the lock at the end of scope so other participants may enter.
+  }
+
+  VLOG(3) << "Performing all reduce from device ordinal: "
+          << participant.device_ordinal;
+
+  Status all_reduce_status = DoAllReduce(participant, comm);
+
+  VLOG(3) << "Waiting for all participants to complete enqueue.";
+
+  done_->DecrementCount();
+
+  if (primary) {
+    // Primary thread clears out the AllReduce state when everybody is done to
+    // make it clean-slate for any subsequent AllReduce request (e.g. number of
+    // replicas may change in the next request).
+    //
+    // Note surrounding TODOs for only reinitializing this when the replica
+    // count / participants actually change -- lots of "playing it safe"
+    // happening in this first cut.
+    done_->Wait();
+    VLOG(3) << "All participants completed enqueue.";
+    VLOG(3) << "Primary thread clearing.";
+    tensorflow::mutex_lock lock(mutex_);
+    DeinitializeGeneration();
+    VLOG(3) << "Generation is now: " << current_generation_;
+    deinitialized_.notify_all();
+  } else {
+    VLOG(3) << "Waiting to deinitialize.";
+    tensorflow::mutex_lock lock(mutex_);
+    while (initialized_) {
+      deinitialized_.wait(lock);
+    }
+  }
+
+  VLOG(3) << "Returning status: " << all_reduce_status;
+  return all_reduce_status;
+}
+
+Status GlobalRendezvousManager::InitializeCommunicationChannels() {
+  std::vector<int> ordinals;
+  for (ParticipantData& data : participants_) {
+    ordinals.push_back(data.device_ordinal);
+  }
+  comms_.resize(ordinals.size());
+  VLOG(3) << "Participants: " << participants_.size()
+          << "; initializing comms.";
+  ncclResult_t result = ncclCommInitAll(comms_.data(), comms_.size(),
+                                        /*devlist=*/ordinals.data());
+  if (result != ncclSuccess) {
+    comms_.clear();
+    return InternalError(
+        "Failed to initialize NCCL communication channels for %d participants: "
+        "%s",
+        participants_.size(), ncclGetErrorString(result));
+  }
+  return Status::OK();
+}
+
+Status GlobalRendezvousManager::DoAllReduce(ParticipantData participant,
+                                            ncclComm_t comm) {
+  se::StreamExecutor* executor = participant.stream->parent();
+  se::cuda::ScopedActivateExecutorContext scoped_context(executor);
+  cudaStream_t* cu_stream = reinterpret_cast<cudaStream_t*>(
+      participant.stream->implementation()->GpuStreamMemberHack());
+  VLOG(3) << "Using stream pointer: " << cu_stream
+          << " on device: " << participant.device_ordinal;
+  void* send_buffer = participant.source_data.opaque();
+  void* recv_buffer = participant.destination_data.opaque();
+  ncclResult_t result = ncclAllReduce(send_buffer, recv_buffer,
+                                      /*count=*/participant.element_count,
+                                      /*datatype=*/ncclFloat,
+                                      /*op=*/ncclSum,
+                                      /*comm=*/comm,
+                                      /*stream=*/*cu_stream);
+  TF_RET_CHECK(ncclSuccess == result)
+      << "Failed to perform all-reduce: " << ncclGetErrorString(result);
+
+  VLOG(3) << "Done performing all reduce for ordinal: "
+          << participant.device_ordinal;
+
+  return Status::OK();
+}
+
+static GlobalRendezvousManager* GetGlobalRendezvous() {
+  static auto* manager = new GlobalRendezvousManager;
+  return manager;
+}
+
+}  // namespace
+
+Status NcclAllReduceThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  auto* global_rendezvous = GetGlobalRendezvous();
+
+  ParticipantData participant;
+  participant.replica_count = replica_count_;
+  participant.element_count = element_count_;
+  participant.device_ordinal = stream->parent()->device_ordinal();
+  participant.generation_counter = global_rendezvous->GetCurrentGeneration();
+  participant.source_data = buffer_allocations.GetDeviceAddress(source_buffer_);
+  participant.destination_data =
+      buffer_allocations.GetDeviceAddress(destination_buffer_);
+  participant.stream = stream;
+  participant.originator = this;
+
+  return GetGlobalRendezvous()->SubmitParticipant(std::move(participant));
+}
+#else
+
+Status NcclAllReduceThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  return Unimplemented(
+      "NCCL support is not available: this binary was not built with a CUDA "
+      "compiler, which is necessary to build the NCCL source library.");
+}
+
+#endif  // GOOGLE_CUDA
+
+NcclAllReduceThunk::NcclAllReduceThunk(
+    int64 replica_count, int64 element_count,
+    const BufferAllocation::Slice& source_buffer,
+    const BufferAllocation::Slice& destination_buffer,
+    const HloInstruction* all_reduce)
+    : Thunk(Thunk::kNcclAllReduce, all_reduce),
+      replica_count_(replica_count),
+      element_count_(element_count),
+      source_buffer_(source_buffer),
+      destination_buffer_(destination_buffer) {}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a8d1356c0023e2c7f49c3731693e10beba54a6d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NCCL_ALL_REDUCE_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NCCL_ALL_REDUCE_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace gpu {
+
+// Thunk that performs a NCCL-based All-Reduce among CUDA GPU-based replicas.
+class NcclAllReduceThunk : public Thunk {
+ public:
+  // Returns whether NCCL operations appear possible to perform; e.g. if we
+  // haven't done a build with the CUDA compiler enabled, we can't compile the
+  // NCCL header, and thus this will be false.
+  //
+  // When this is false, the ExecuteOnStream() call will simply return a status
+  // error.
+  static bool NcclIsEnabled();
+
+  // TODO(b/125951860): Plumb more datatypes / reduction operators. Initial
+  // implementation is simply F32 summation.
+  NcclAllReduceThunk(int64 replica_count, int64 element_count,
+                     const BufferAllocation::Slice& source_buffer,
+                     const BufferAllocation::Slice& destination_buffer,
+                     const HloInstruction* all_reduce);
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  const int64 replica_count_;
+  const int64 element_count_;
+  const BufferAllocation::Slice source_buffer_;
+  const BufferAllocation::Slice destination_buffer_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NCCL_ALL_REDUCE_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 48f718b514cc9809d4100627f85af7aa05445d36..39cb71c09133164f299f56f755d31bd74ebc3c5b 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h"
+#include "tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
@@ -53,6 +55,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
@@ -81,10 +84,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h"
 #include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -117,6 +122,9 @@ std::vector<string> GetCudaRootCandidates(
     const HloModuleConfig& hlo_module_config) {
   std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
 
+  // "." is our last resort, even though it probably won't work.
+  potential_cuda_roots.push_back(".");
+
   // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
   // highest priority.
   string xla_gpu_cuda_data_dir =
@@ -128,9 +136,23 @@ std::vector<string> GetCudaRootCandidates(
   return potential_cuda_roots;
 }
 
+void PrintCantFindCudaMessage(absl::string_view msg,
+                              const HloModuleConfig& hlo_module_config) {
+  LOG(WARNING) << msg;
+  LOG(WARNING) << "Searched in the following directories:";
+  for (const auto& dir : GetCudaRootCandidates(hlo_module_config)) {
+    LOG(WARNING) << "  " << dir;
+  }
+  LOG(WARNING)
+      << "You can choose the search directory by setting xla_gpu_cuda_data_dir "
+         "in HloModule's DebugOptions.  For most apps, setting the environment "
+         "variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.";
+}
+
 // Returns the directory containing nvvm libdevice files.
 string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
-  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+  const auto& candidate_dirs = GetCudaRootCandidates(hlo_module_config);
+  for (const string& cuda_root : candidate_dirs) {
     string libdevice_dir =
         tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
     VLOG(2) << "Looking for libdevice at " << libdevice_dir;
@@ -139,8 +161,14 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
       return libdevice_dir;
     }
   }
-  LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
-  // Last resort: maybe in the current folder.
+  PrintCantFindCudaMessage(
+      "Can't find directory containing CUDA libevice.  This may result in "
+      "compilation or runtime failures, if the program we try to run uses "
+      "routines from libdevice.",
+      hlo_module_config);
+
+  // GetCudaRotCandidates always inclues ".", but but if everything fails, we
+  // return it anyway.  Better than returning the empty string.
   return ".";
 }
 
@@ -171,6 +199,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     pipeline.AddPass<ConvolutionGroupConverter>(
         cost_model,
         /*convert_batch_groups_only=*/true);
+    // Expand the sort op to support stable sorting if required.
+    pipeline.AddPass<StableSortExpander>();
     // Convert BF16 operations to F32 operations so that the GPU backend can
     // support BF16 operations without directly implementing a BF16 lowering for
     // most ops.
@@ -200,7 +230,6 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       pipeline.AddPass<ZeroSizedHloElimination>();
 
       AlgebraicSimplifierOptions options;
-      options.set_enable_permutation_sort_replacement(true);
       pass.AddPass<AlgebraicSimplifier>(options);
       pass.AddPass<SortSimplifier>();
       pass.AddPass<TupleSimplifier>();
@@ -221,15 +250,27 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
         TransposeFolding::NeverFoldTranspose);
     pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
     pipeline.AddPass<HloDCE>();
+
+    // Run WhileLoopTripCountAnnotator at the end of the simplification
+    // pipeline, before layout assignment and fusion.  This pass does some
+    // pattern-matching on while bodies/conditions, and this is where the HLO is
+    // "nicest".
+    //
+    // It's important that we don't make semantic changes (e.g. unrolling) to
+    // any `while` loops after this point, because otherwise the trip-count
+    // annotations added by this pass may not be correct after the
+    // modifications.
+    pipeline.AddPass<WhileLoopTripCountAnnotator>();
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
   {
     // Convert convolutions into CustomCalls to cudnn, then canonicalize them
-    // (CudnnConvPaddingLegalization).
+    // (CudnnConvPaddingLegalization). Also expand cuSolver calls.
     HloPassPipeline pipeline("conv_canonicalization");
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
+    pipeline.AddPass<CusolverRewriter>(stream_exec, device_allocator);
     pipeline.AddPass<CudnnConvRewriter>();
     pipeline.AddPass<CudnnFusedConvRewriter>();
     pipeline.AddPass<CudnnConvPaddingLegalization>();
@@ -273,7 +314,6 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
     AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
-    options.set_enable_permutation_sort_replacement(true);
     pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
     // Choose the fastest algorithm for each conv.
@@ -303,6 +343,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // wouldn't be able to simplify away the new_tuple bits.
     pipeline.AddPass<CudnnConvAlgorithmPicker>(stream_exec, device_allocator,
                                                compiler);
+
     // Clean up new_tuple described above.
     pipeline.AddPass<TupleSimplifier>();
 
@@ -377,6 +418,7 @@ Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
   pipeline.AddPass<GpuCopyInsertion>();
+  pipeline.AddPass<GpuSanitizeConstantNames>();
   return pipeline.Run(hlo_module).status();
 }
 
@@ -577,9 +619,6 @@ StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
   // We dump the post-optimization HLO in RunBackend so no need to dump it here.
-  VLOG(3) << "*** HLO Before Optimization";
-  XLA_VLOG_LINES(3, module->ToString());
-
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tracing::ScopedActivity activity("HLO Transforms", module->name(),
                                    /*is_expensive=*/true);
@@ -633,19 +672,11 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
           [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; },
           /*allow_input_output_aliasing=*/false,
           /*allocate_buffers_for_constants=*/true));
-  // BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
-  // include headers, so no need for us to print them ourselves.
-  XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
-  XLA_VLOG_LINES(2, buffer_assignment->ToString());
-  VLOG(3) << "*** HLO After Optimization";
-  XLA_VLOG_LINES(3, module->ToString());
-  const string xla_dump_optimized_hlo_proto_to =
-      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
-  if (!xla_dump_optimized_hlo_proto_to.empty()) {
-    HloProto proto = MakeHloProto(*module, *buffer_assignment);
-    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                            buffer_assignment->ToString());
   }
+  DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations");
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
                                       &stream_exec->GetDeviceDescription(),
@@ -663,26 +694,16 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   }
 
   if (user_pre_optimization_hook_) {
-    TF_CHECK_OK(user_pre_optimization_hook_(llvm_module));
+    user_pre_optimization_hook_(llvm_module);
   }
   string ir_module_string_before_opt;
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  if (VLOG_IS_ON(3) || embed_ir_in_executable) {
+  if (embed_ir_in_executable) {
     ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module);
-    VLOG(3) << "LLVM module before optimizations:";
-    XLA_VLOG_LINES(3, ir_module_string_before_opt);
   }
 
-  const string& ir_dump_directory =
-      module->config().debug_options().xla_dump_ir_to();
-
-  if (!ir_dump_directory.empty()) {
-    TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
-        /*directory_name=*/ir_dump_directory,
-        /*hlo_module_name=*/module->name(), llvm_module,
-        /*optimized=*/false));
-  }
+  llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/false);
 
   {
     XLA_SCOPED_LOGGING_TIMER(
@@ -696,7 +717,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
         << "Invalid LLVM IR before optimizations:\n"
         << err_stream.str()
         << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
-           "Rerun with --xla_dump_ir_to to get the IR. ";
+           "Rerun with --xla_dump_to to get the IR. ";
   }
 
   string libdevice_dir;
@@ -729,35 +750,14 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
                                           module->config(), libdevice_dir));
   }
 
-  if (!ir_dump_directory.empty()) {
-    TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
-        /*directory_name=*/ir_dump_directory,
-        /*hlo_module_name=*/module->name(), llvm_module,
-        /*optimized=*/true));
-  }
+  llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true);
 
   if (user_post_optimization_hook_) {
-    TF_CHECK_OK(user_post_optimization_hook_(llvm_module));
+    user_post_optimization_hook_(llvm_module);
   }
-  VLOG(3) << "LLVM module after optimizations:";
-  XLA_VLOG_LINES(3, llvm_ir::DumpModuleToString(llvm_module));
-  VLOG(3) << "PTX:";
-  XLA_VLOG_LINES(3, ptx);
-
   // Write PTX to IR dump directory, if IR dumping was requested.
-  if (!ir_dump_directory.empty()) {
-    const string ptx_outfile = tensorflow::io::JoinPath(
-        ir_dump_directory, absl::StrCat(module->name(), ".ptx"));
-    auto status = [&] {
-      auto* env = tensorflow::Env::Default();
-      TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory));
-      TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_outfile, ptx));
-      return Status::OK();
-    }();
-    if (!status.ok()) {
-      LOG(WARNING) << "Couldn't dump PTX for module " << module->name()
-                   << " to " << ptx_outfile << ": " << status;
-    }
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "ptx", ptx);
   }
 
   const std::vector<uint8> cubin =
@@ -766,20 +766,27 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
       hlo_schedule->ThunkLaunchOrder());
-  VLOG(3) << "Printing the thunk schedule...";
-  XLA_VLOG_LINES(3, thunk_schedule->ToString());
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "thunk_schedule",
+                            thunk_schedule->ToString());
+  }
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
   std::unique_ptr<HloProfilePrinterData> profile_printer;
 
-  if (module->config().hlo_profiling_enabled()) {
+  if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
     cost_analysis.set_bytes_per_second(
         stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
-    profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
-    profile_printer = CreateHloProfilePrinterData(
-        *profile_index_map, cost_analysis, entry_computation->name());
+    VLOG(1) << "HLO memory read+written: "
+            << tensorflow::strings::HumanReadableNumBytes(
+                   cost_analysis.bytes_accessed());
+    if (module->config().hlo_profiling_enabled()) {
+      profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
+      profile_printer = CreateHloProfilePrinterData(
+          *profile_index_map, cost_analysis, entry_computation->name());
+    }
   }
 
   auto* gpu_executable = new GpuExecutable(
@@ -843,10 +850,11 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
             log_warning = !warning_done.exchange(true);
           }
           if (log_warning) {
-            LOG(WARNING)
-                << "Failed to compile ptx to cubin.  Will attempt to let "
-                   "GPU driver compile the ptx. "
-                << maybe_cubin.status();
+            PrintCantFindCudaMessage(
+                "Can't find ptxas binary.  Will back to the GPU driver "
+                "for PTX -> sass compilation.  This is OK so long as you don't "
+                "see a warning below about an out-of-date driver version.",
+                hlo_module_config);
           }
 
           // We're going to use the driver to JIT our PTX->SASS, so warn if
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 8154d75d23a6d49153ccb6824402aff73f365617..cb012649200c6386d3ae25d088aa3b16bd40be82 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index bfed4f5230dfe37bca48560ce83a2dd82c8950a4..10bc82488ff56135f4585e62c2f71c11a359e542 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -41,7 +41,7 @@ std::ostream& operator<<(std::ostream& out,
 
 int64 ThreadsPerBlockLimit(const se::DeviceDescription& device_desc) {
   int64 threads_per_block = device_desc.threads_per_block_limit();
-  if (threads_per_block == 0) {
+  if (threads_per_block <= 0) {
     static std::atomic<int64> log_count{0};
     if (log_count.fetch_add(1) < 8) {
       LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
@@ -71,18 +71,17 @@ LaunchDimensions CalculateLaunchDimensions(
   num_elements = num_elements / unroll_factor;
 
   // Since we don't do any inter-warp communication, we're free to choose any
-  // block size we want, subject to hardware constraints.  We choose the
-  // smallest block size that allows the GPU to reach full occupancy (assuming
-  // the kernel uses sufficiently few registers).  This gives us max performance
-  // when the kernel uses few registers, and lets us scale down gracefully as
-  // the kernel uses more registers.
+  // block size we want, subject to hardware constraints.  We choose the largest
+  // block size allowed, as empirically, this is a performance win on almost
+  // (but not all) benchmarks.
   //
-  // Specifically, we choose the number of threads per block such that
+  // My guess is that using a larger block size encourages ptxas to decrease
+  // per-thread register usage, thus allowing for higher occupancy, but I
+  // haven't verified this.
   //
-  //   <num threads per block> * <max blocks per core> = <max threads per core>
-
+  // TODO(jlebar): Investigate this further, and tune this heuristic so we can
+  // run faster on the few benchmarks where smaller block size helps.
   int64 threads_per_block = ThreadsPerBlockLimit(device_desc);
-
   if (num_elements < threads_per_block) {
     threads_per_block = num_elements;
     VLOG(2) << "Update # of threads per block to the element count ("
diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..197367e81687eeddea8778267075e66ef1819341
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h"
+
+namespace xla {
+namespace gpu {
+
+StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
+    se::Stream* stream, int64 byte_size) {
+  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+  if (byte_size > GetMemoryLimitInBytes(stream)) {
+    return se::port::Status(
+        se::port::error::RESOURCE_EXHAUSTED,
+        absl::StrFormat(
+            "Allocating %d bytes exceeds the memory limit of %d bytes.",
+            byte_size, GetMemoryLimitInBytes(stream)));
+  }
+
+  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+                      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                                  /*retry_on_failure=*/false));
+  total_allocated_bytes_ += byte_size;
+
+  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
+  allocated_buffers_.push_back(std::move(allocated_buffer));
+  return se::DeviceMemory<uint8>(buffer_addr);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..620c7e78912eb7d9730bae02aab8f85b5fd2c096
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+class ScratchAllocator : public se::ScratchAllocator {
+ public:
+  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
+    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
+  }
+  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
+
+  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(se::Stream* stream,
+                                                  int64 byte_size) override;
+
+  template <typename T>
+  StatusOr<se::DeviceMemory<T>> Allocate(se::Stream* stream,
+                                         int64 num_elements) {
+    TF_ASSIGN_OR_RETURN(se::DeviceMemory<uint8> bytes,
+                        AllocateBytes(stream, num_elements * sizeof(T)));
+    return se::DeviceMemory<T>(bytes);
+  }
+
+ private:
+  const int device_ordinal_;
+  DeviceMemoryAllocator* memory_allocator_;
+  std::vector<OwningDeviceMemory> allocated_buffers_;
+  int64 total_allocated_bytes_ = 0;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_SCRATCH_ALLOCATOR_H_
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index a1ed8499040359fe7265a7317b0577a990a2234c..d33e9cf714ee3810b1fb2fa8c05c3ed399d27bfb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index 6814be779e0b02c38e3bc7008f036b845d88cb6f..963716e70500e1e4eac930e13937ae547ace0c1e 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -48,8 +48,9 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
       HloInstruction::CreateParameter(0, param_shape, "x"));
   HloInstruction* param_y = builder.AddInstruction(
       HloInstruction::CreateParameter(1, param_shape, "y"));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
+  builder.AddInstruction(HloInstruction::CreateCompare(
+      ShapeUtil::MakeShape(PRED, {5, 7, 2}), param_x, param_y,
+      ComparisonDirection::kGe));
 
   auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
@@ -73,7 +74,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) {
       x = f32[5,7,2]{2,1,0} parameter(0)
       y = f32[5,14]{1,0} parameter(1)
       reshape = f32[5,7,2]{2,1,0} reshape(y)
-      ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, reshape)
+      ROOT gte = pred[5,7,2]{2,1,0} compare(x, reshape), direction=GE
     })",
                                config)
                     .ValueOrDie();
@@ -98,7 +99,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
       y = f32[14]{0} parameter(1)
       reshape = f32[7,2]{1,0} reshape(y)
       broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2}
-      ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, broadcast)
+      ROOT gte = pred[5,7,2]{2,1,0} compare(x, broadcast), direction=GE
     })",
                                config)
                     .ValueOrDie();
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
index f91a22d482bc8bc046977870a7a4d18ca1acde68..06b06a5b1ee1fb9996be3ebe326893c4160a7e29 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
index c78605cebbc671272b8df9faf0e0cc54be2f5b1c..f43e05904dd6ffdc37c77a08d5ddc49bf08c8feb 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -18,42 +18,52 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
+absl::string_view ThunkKindToString(Thunk::Kind kind) {
   switch (kind) {
+    case Thunk::kCholesky:
+      return "kCholesky";
     case Thunk::kConditional:
-      return os << "kConditional";
+      return "kConditional";
     case Thunk::kConvolution:
-      return os << "kConvolution";
+      return "kConvolution";
     case Thunk::kCopy:
-      return os << "kCopy";
+      return "kCopy";
     case Thunk::kCudnnBatchNormBackward:
-      return os << "kCudnnBatchNormBackward";
+      return "kCudnnBatchNormBackward";
     case Thunk::kCudnnBatchNormForwardInference:
-      return os << "kCudnnBatchNormForwardInference";
+      return "kCudnnBatchNormForwardInference";
     case Thunk::kCudnnBatchNormForwardTraining:
-      return os << "kCudnnBatchNormForwardTraining";
+      return "kCudnnBatchNormForwardTraining";
+    case Thunk::kNcclAllReduce:
+      return "kNcclAllReduce";
     case Thunk::kFft:
-      return os << "kFft";
+      return "kFft";
     case Thunk::kGemm:
-      return os << "kGemm";
+      return "kGemm";
     case Thunk::kInfeed:
-      return os << "kInfeed";
+      return "kInfeed";
     case Thunk::kKernel:
-      return os << "kKernel";
+      return "kKernel";
     case Thunk::kMemset32BitValue:
-      return os << "kMemset32BitValue";
+      return "kMemset32BitValue";
     case Thunk::kMemzero:
-      return os << "kMemzero";
+      return "kMemzero";
     case Thunk::kOutfeed:
-      return os << "kOutfeed";
+      return "kOutfeed";
     case Thunk::kSequential:
-      return os << "kSequential";
+      return "kSequential";
+    case Thunk::kTriangularSolve:
+      return "kTriangularSolve";
     case Thunk::kTuple:
-      return os << "kTuple";
+      return "kTuple";
     case Thunk::kWhile:
-      return os << "kWhile";
+      return "kWhile";
   }
 }
 
+std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
+  return os << ThunkKindToString(kind);
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index e68bee035a029178844282995429eaa960cc4817..56d1176ff4ed3deede4006d16fe8f8a4c66a2a92 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -42,12 +42,14 @@ class GpuExecutable;
 class Thunk {
  public:
   enum Kind {
+    kCholesky,
     kConditional,
     kConvolution,
     kCopy,
     kCudnnBatchNormBackward,
     kCudnnBatchNormForwardInference,
     kCudnnBatchNormForwardTraining,
+    kNcclAllReduce,
     kFft,
     kGemm,
     kInfeed,
@@ -56,6 +58,7 @@ class Thunk {
     kMemzero,
     kOutfeed,
     kSequential,
+    kTriangularSolve,
     kTuple,
     kWhile,
   };
@@ -103,6 +106,7 @@ class Thunk {
 // A sequence of thunks.
 using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;
 
+absl::string_view ThunkKindToString(Thunk::Kind);
 std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 25bad67bab9375559c431466571c62acd0452b01..daa5f33e5604c434aaab7fa454fc3e89d68599d5 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -14,7 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
+#include <algorithm>
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -144,11 +147,32 @@ const std::list<const Thunk*>& ThunkSchedule::DependsOn(
 }
 
 string ThunkSchedule::ToString() const {
+  if (thunk_total_order_.empty()) {
+    return "No thunks.";
+  }
+
+  const Thunk* thunk_with_longest_kind = *absl::c_max_element(
+      thunk_total_order_, [](const Thunk* a, const Thunk* b) {
+        return ThunkKindToString(a->kind()).length() <
+               ThunkKindToString(b->kind()).length();
+      });
+  int64 max_thunk_kind_len =
+      ThunkKindToString(thunk_with_longest_kind->kind()).length();
+
   string result = "Total order:\n";
   for (Thunk* thunk : thunk_total_order_) {
-    absl::StrAppend(&result, "\t", thunk->hlo_instruction()->ToString(), "\n");
+    // Write out the thunk kind, padded out to max_thunk_kind_len.
+    absl::string_view kind_str = ThunkKindToString(thunk->kind());
+    absl::StrAppend(&result, kind_str,
+                    string(max_thunk_kind_len - kind_str.length(), ' '), "\t");
+    if (thunk->hlo_instruction() != nullptr) {
+      absl::StrAppend(&result, thunk->hlo_instruction()->ToString());
+    } else {
+      absl::StrAppend(&result, "(no HloInstruction)");
+    }
+    absl::StrAppend(&result, "\n");
   }
-  absl::StrAppend(&result, "Dependencies:\n");
+  absl::StrAppend(&result, "\nDependencies:\n");
   for (const auto& entry : depends_on_) {
     const Thunk* dependent = entry.first;
     for (const Thunk* dependency : entry.second) {
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5200a2af412979c7e38d95c5a9bd5bc2ab64f086
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
@@ -0,0 +1,149 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace xla {
+namespace gpu {
+
+TriangularSolveThunk::TriangularSolveThunk(
+    const TriangularSolveOptions& options,
+    const BufferAllocation::Slice& a_buffer,
+    const BufferAllocation::Slice& b_buffer, PrimitiveType type,
+    int64 batch_size, int64 m, int64 n, int64 a_batch_stride,
+    int64 b_batch_stride, const HloInstruction* hlo)
+    : Thunk(Kind::kTriangularSolve, hlo),
+      uplo_(options.lower() ? se::blas::UpperLower::kLower
+                            : se::blas::UpperLower::kUpper),
+      side_(options.left_side() ? se::blas::Side::kLeft
+                                : se::blas::Side::kRight),
+      unit_diagonal_(options.unit_diagonal() ? se::blas::Diagonal::kUnit
+                                             : se::blas::Diagonal::kNonUnit),
+      a_buffer_(a_buffer),
+      b_buffer_(b_buffer),
+      type_(type),
+      batch_size_(batch_size),
+      m_(m),
+      n_(n),
+      a_batch_stride_(a_batch_stride),
+      b_batch_stride_(b_batch_stride) {
+  transpose_a_ = [&] {
+    switch (options.transpose_a()) {
+      case TriangularSolveOptions::NO_TRANSPOSE:
+        return se::blas::Transpose::kNoTranspose;
+      case TriangularSolveOptions::TRANSPOSE:
+        return se::blas::Transpose::kTranspose;
+      case TriangularSolveOptions::ADJOINT:
+        return se::blas::Transpose::kConjugateTranspose;
+      default:
+        LOG(ERROR) << "Invalid triangular solve transpose value "
+                   << options.transpose_a();
+        return se::blas::Transpose::kNoTranspose;
+    }
+  }();
+}
+
+Status TriangularSolveThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  VLOG(3) << "uplo=" << se::blas::UpperLowerString(uplo_)
+          << " side=" << se::blas::SideString(side_)
+          << " diagonal=" << se::blas::DiagonalString(unit_diagonal_)
+          << " batch_size=" << batch_size_ << " m=" << m_ << " n=" << n_
+          << " a_batch_stride=" << a_batch_stride_
+          << " b_batch_stride=" << b_batch_stride_;
+
+  const int lda = side_ == se::blas::Side::kLeft ? m_ : n_;
+  const int ldb = m_;
+
+  char* a_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(a_buffer_).opaque());
+  char* b_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(b_buffer_).opaque());
+  for (int64 i = 0; i < batch_size_; ++i) {
+    bool launch_ok;
+    se::DeviceMemoryBase a_data =
+        se::DeviceMemoryBase(a_base + i * a_batch_stride_, a_batch_stride_);
+    se::DeviceMemoryBase b_data =
+        se::DeviceMemoryBase(b_base + i * b_batch_stride_, b_batch_stride_);
+    switch (type_) {
+      case F32: {
+        se::DeviceMemory<float> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0f,
+                                       se::DeviceMemory<float>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case F64: {
+        se::DeviceMemory<double> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0,
+                                       se::DeviceMemory<double>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case C64: {
+        se::DeviceMemory<std::complex<float>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0f,
+                               se::DeviceMemory<std::complex<float>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      case C128: {
+        se::DeviceMemory<std::complex<double>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0,
+                               se::DeviceMemory<std::complex<double>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      default:
+        return InvalidArgument("Invalid type for triangular solve %d", type_);
+    }
+    if (!launch_ok) {
+      return InternalError("Unable to launch triangular solve for thunk %p",
+                           this);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..c947162ea32f197f808d099859eadbbc55a65ab1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
@@ -0,0 +1,75 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+// This class stores everything that StreamExecutor needs to launch a triangular
+// solve (BlasTrsm). It is generated by IrEmitter.
+//
+// Thread-compatible.
+class TriangularSolveThunk : public Thunk {
+ public:
+  TriangularSolveThunk(const TriangularSolveOptions& options,
+                       const BufferAllocation::Slice& a_buffer,
+                       const BufferAllocation::Slice& b_buffer,
+                       PrimitiveType type, int64 batch_size, int64 m, int64 n,
+                       int64 a_batch_stride, int64 b_batch_stride,
+                       const HloInstruction* hlo);
+
+  TriangularSolveThunk(const TriangularSolveThunk&) = delete;
+  TriangularSolveThunk& operator=(const TriangularSolveThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  const se::blas::UpperLower uplo_;
+  const se::blas::Side side_;
+  const se::blas::Diagonal unit_diagonal_;
+  se::blas::Transpose transpose_a_;
+
+  const BufferAllocation::Slice a_buffer_;
+  const BufferAllocation::Slice b_buffer_;
+
+  const PrimitiveType type_;
+  const int64 batch_size_;
+  const int64 m_;
+  const int64 n_;
+  const int64 a_batch_stride_;
+  const int64 b_batch_stride_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
index c552c2925497f1c4808d74a615d35cdbeeba1858..bbbcc2dbb0f71d08462a1aad6d97e7fd07b2a1fb 100644
--- a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index 2dce7749bbd8da2673ae607eee3d731d9917e8fe..64a5fe5fdd2ebb5430767973c2ba3d1bf498455c 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -44,9 +44,9 @@ class WhileTransformerTest : public HloTestBase {
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             limit_const->shape(), loop_state, tuple_index));
-    builder.AddInstruction(
-        HloInstruction::CreateBinary(condition_result_shape_, HloOpcode::kLt,
-                                     induction_variable, limit_const));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        condition_result_shape_, induction_variable, limit_const,
+        ComparisonDirection::kLt));
     return builder.Build();
   }
 
@@ -106,24 +106,6 @@ class WhileTransformerTest : public HloTestBase {
     return while_hlo;
   }
 
-  void RunFusionPasses() {
-    // Run standard fusion passes.
-    TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/false)
-                     .Run(module_.get())
-                     .status());
-    TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/true)
-                     .Run(module_.get())
-                     .status());
-  }
-
-  void RunCopyInsertionPass() {
-    HloVerifier verifier(/*layout_sensitive=*/false,
-                         /*allow_mixed_precision=*/false);
-    TF_ASSERT_OK(verifier.Run(module_.get()).status());
-    CopyInsertion copy_insertion;
-    TF_ASSERT_OK(copy_insertion.Run(module_.get()).status());
-  }
-
   Shape GetLoopStateShape(const int64 ind_var_tuple_index) {
     if (ind_var_tuple_index == 0) {
       return ShapeUtil::MakeTupleShape(
@@ -146,10 +128,6 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
   auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1));
   auto while_hlo = BuildWhileInstruction(condition, body, 0, 0);
-  // Run HLO Optimization passes.
-  RunFusionPasses();
-  RunCopyInsertionPass();
-
   auto result = ComputeWhileLoopTripCount(while_hlo);
   ASSERT_TRUE(result);
   EXPECT_EQ(10, *result);
@@ -161,10 +139,6 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
       module_->AddEmbeddedComputation(BuildConditionComputation(1, 10));
   auto body = module_->AddEmbeddedComputation(BuildBodyComputation(1, 0, 1));
   auto while_hlo = BuildWhileInstruction(condition, body, 1, 0);
-  // Run HLO Optimization passes.
-  RunFusionPasses();
-  RunCopyInsertionPass();
-
   auto result = ComputeWhileLoopTripCount(while_hlo);
   ASSERT_TRUE(result);
   EXPECT_EQ(10, *result);
@@ -176,10 +150,6 @@ TEST_F(WhileTransformerTest, ImpossibleLoopLimit) {
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 5));
   auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1));
   auto while_hlo = BuildWhileInstruction(condition, body, 0, 10);
-  // Run HLO Optimization passes.
-  RunFusionPasses();
-  RunCopyInsertionPass();
-
   auto result = ComputeWhileLoopTripCount(while_hlo);
   ASSERT_TRUE(result);
   EXPECT_EQ(0, *result);
@@ -191,10 +161,6 @@ TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
   auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, -1));
   auto while_hlo = BuildWhileInstruction(condition, body, 0, 0);
-  // Run HLO Optimization passes.
-  RunFusionPasses();
-  RunCopyInsertionPass();
-
   auto result = ComputeWhileLoopTripCount(while_hlo);
   ASSERT_FALSE(result);
 }
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
deleted file mode 100644
index ef70b688778df5115e2b5fe572d253a6948d076f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Example HLO graph which demonstrates Graphviz dumper for HLO
-// computations. When run, pushes the example DOT graph to the Graphviz service
-// and prints the URL. Useful for seeing effect of changes to the graph
-// generation code.
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/memory/memory.h"
-#include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace {
-
-// Adds a computation to the given HLO module which adds a scalar constant to
-// its parameter and returns the result.
-HloComputation* AddScalarConstantComputation(int64 addend, HloModule* module) {
-  auto builder = HloComputation::Builder(absl::StrCat("add_", addend));
-  auto x_value = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {}), "x_value"));
-  auto half = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.5)));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      half->shape(), HloOpcode::kAdd, x_value, half));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Adds a computation to the given HLO module which sums its two parameters and
-// returns the result.
-HloComputation* ScalarSumComputation(HloModule* module) {
-  auto builder = HloComputation::Builder("add");
-  auto lhs = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "lhs"));
-  auto rhs = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "rhs"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Adds a computation to the given HLO module which forwards its argument to a
-// kCall instruction which then calls the given computation.
-HloComputation* CallForwardingComputation(HloComputation* computation,
-                                          HloModule* module) {
-  auto builder = HloComputation::Builder("call_forward");
-  auto arg = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "arg"));
-  builder.AddInstruction(
-      HloInstruction::CreateCall(arg->shape(), {arg}, computation));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Create a large, arbitrary computation with many different kinds of
-// instructions. Sets the computation as the entry to an HLO module and returns
-// the module.
-std::unique_ptr<HloModule> MakeBigGraph() {
-  HloModuleConfig config;
-  auto module = absl::make_unique<HloModule>("BigGraph", config);
-
-  auto builder = HloComputation::Builder("TestBigGraphvizGraph");
-
-  // Shapes used in the computation.
-  auto mshape = ShapeUtil::MakeShape(F32, {3, 5});
-  auto vshape = ShapeUtil::MakeShape(F32, {3});
-  auto sshape = ShapeUtil::MakeShape(F32, {3});
-
-  // Create a set of parameter instructions.
-  auto param_v0 =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, vshape, "foo"));
-  auto param_v1 =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, vshape, "bar"));
-  auto param_v2 =
-      builder.AddInstruction(HloInstruction::CreateParameter(2, vshape, "baz"));
-  auto param_s =
-      builder.AddInstruction(HloInstruction::CreateParameter(3, sshape, "qux"));
-  auto param_m =
-      builder.AddInstruction(HloInstruction::CreateParameter(4, mshape, "zzz"));
-
-  // Add an arbitrary expression of different instructions.
-  auto copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(vshape, HloOpcode::kCopy, param_v0));
-  auto clamp = builder.AddInstruction(HloInstruction::CreateTernary(
-      vshape, HloOpcode::kClamp, copy, param_v1, param_v2));
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  PrecisionConfig precision_config;
-  precision_config.mutable_operand_precision()->Resize(
-      /*new_size=*/2, PrecisionConfig::DEFAULT);
-  auto dot = builder.AddInstruction(HloInstruction::CreateDot(
-      vshape, clamp, param_v0, dot_dnums, precision_config));
-  auto tuple = builder.AddInstruction(
-      HloInstruction::CreateTuple({dot, param_s, clamp}));
-  auto scalar = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(sshape, tuple, 2));
-  auto add_one = AddScalarConstantComputation(1.0, module.get());
-  auto rng = builder.AddInstruction(
-      HloInstruction::CreateRng(vshape, RNG_UNIFORM, {param_m, param_m}));
-  auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
-  auto add_computation = ScalarSumComputation(module.get());
-  builder.AddInstruction(
-      HloInstruction::CreateReduce(vshape, rng, one, {1}, add_computation));
-  auto map1 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {scalar}, add_one));
-  auto map2 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {map1}, add_one));
-  auto map3 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {map2}, add_one));
-
-  // Create a fusion instruction containing the chain of map instructions.
-  auto fusion = builder.AddInstruction(HloInstruction::CreateFusion(
-      sshape, HloInstruction::FusionKind::kLoop, map3));
-  fusion->FuseInstruction(map2);
-  fusion->FuseInstruction(map1);
-
-  // Add a random trace instruction.
-  builder.AddInstruction(HloInstruction::CreateTrace("trace", dot));
-
-  // Add a call instruction will calls the call-forwarding computation to call
-  // another computation.
-  auto call_computation = CallForwardingComputation(add_one, module.get());
-  builder.AddInstruction(
-      HloInstruction::CreateCall(fusion->shape(), {fusion}, call_computation));
-
-  module->AddEntryComputation(builder.Build());
-  return module;
-}
-
-}  // namespace
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  auto module = xla::MakeBigGraph();
-
-  printf("Graph URL: %s\n", xla::hlo_graph_dumper::DumpGraph(
-                                *module->entry_computation(),
-                                "Example computation", xla::DebugOptions())
-                                .c_str());
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index dbbf43082f2c1d21f5ef42f53804bf0969903a58..3e0631aeb4aa374cb5748650e1c7529e26e10b34 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -158,7 +158,7 @@ class HeapSimulator {
   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                       const BufferValue* buffer,
                       const HloInstruction* instruction,
-                      const BufferValue* shared_with_canonical);
+                      const BufferValue* share_with_canonical);
 
   // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
   // in which case we are calculating the same allocs/frees twice in the
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index dc40b9446ad1bffcb757543e52fc9ab20de6d52e..2f1628038204833c1aa1061e81878c07d7f7529d 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -54,8 +54,8 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
       HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
   // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte)
   HloInstruction* cond_lt = cond_builder.AddInstruction(
-      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
-                                   HloOpcode::kLt, cond_iter, cond_data));
+      HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), cond_iter,
+                                    cond_data, ComparisonDirection::kLt));
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(cond_builder.Build());
 
@@ -113,7 +113,8 @@ TEST_F(MinimumMemoryForSequenceTest, SubcomputationAccounting) {
   //   %slice = f32[1]{0} slice(f32[4]{0} %cond_param), slice={[0:1]}
   //   %reshape = f32[] reshape(f32[1]{0} %slice)
   //   %constant = f32[] constant(0)
-  //   ROOT %not-equal-to = pred[] not-equal-to(f32[] %reshape, f32[] %constant)
+  //   ROOT %not-equal-to = pred[] compare(f32[] %reshape, f32[] %constant),
+  //   direction=NE
   // }
 
   // ENTRY %SubcomputationAccounting () -> f32[2,4] {
@@ -143,9 +144,9 @@ TEST_F(MinimumMemoryForSequenceTest, SubcomputationAccounting) {
       cond_builder.AddInstruction(HloInstruction::CreateReshape(r0f32, slice));
   HloInstruction* zero = cond_builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
-  HloInstruction* cond_comparison =
-      cond_builder.AddInstruction(HloInstruction::CreateBinary(
-          ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, reshape, zero));
+  HloInstruction* cond_comparison = cond_builder.AddInstruction(
+      HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), reshape,
+                                    zero, ComparisonDirection::kNe));
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
 
   // param - 1
@@ -703,8 +704,8 @@ TEST_F(HeapSimulatorTest, WholeModule) {
   HloInstruction* cond_data = cond_builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
   HloInstruction* cond_lt = cond_builder.AddInstruction(
-      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
-                                   HloOpcode::kLt, cond_iter, cond_data));
+      HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), cond_iter,
+                                    cond_data, ComparisonDirection::kLt));
   HloComputation* cond_computation =
       tracker.module()->AddEmbeddedComputation(cond_builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 263b42a29dbb0dbc0fb6eca7968674ff242f45ed..54ee92943cc6eef1d5961ab9cdc529bab15083d7 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -29,12 +29,13 @@ limitations under the License.
 syntax = "proto3";
 
 package xla;
+
 import "tensorflow/compiler/xla/xla_data.proto";
 
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 59
+// Next ID: 64
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -145,6 +146,9 @@ message HloInstructionProto {
   // FFT length.
   repeated int64 fft_length = 32;
 
+  // Comparison direction only used for kCompare.
+  string comparison_direction = 63;
+
   // Gather dimension numbers.
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
   repeated int64 gather_slice_sizes = 34;
@@ -175,6 +179,9 @@ message HloInstructionProto {
   // partners.
   bool is_host_transfer = 47;
 
+  // Whether this Sort instruction should be stable.
+  bool is_stable = 60;
+
   xla.ScatterDimensionNumbers scatter_dimension_numbers = 48;
 
   // Precision configuration for the instruction. Has backend-specific meaning.
@@ -193,6 +200,15 @@ message HloInstructionProto {
   // operand.
   bool constrain_layout = 56;
   repeated xla.ShapeProto operand_shapes_with_layout = 57;
+
+  // Options for TriangularSolve
+  xla.TriangularSolveOptions triangular_solve_options = 59;
+
+  // Options for Cholesky
+  xla.CholeskyOptions cholesky_options = 62;
+
+  // Describes how parameters behave with regards to replicas.
+  xla.ParameterReplication parameter_replication = 61;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index e511f1951c5dd07ebb64fa38fd5b7f6a0e87b429..7d02f4b3d756df9d1fcbddfa85df2a41a62d9169 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -293,7 +293,7 @@ class BufferValueMap {
             VLOG(3)
                 << "  value @ " << position << " is root of "
                 << callsite.instruction()->name()
-                << "; true/false branch roots must share buffer among them : "
+                << "; branch computation roots must share buffer among them : "
                 << cond_value.ToShortString();
             aliased_buffers->push_back(GetBufferForValue(cond_value));
           }
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index b6dbf07959c541bceaa8eda5a0101503970ee832..e344fbc54a8a72c6195d1d8590f0f1c56428f641 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -48,7 +47,6 @@ class HloAliasAnalysisTest : public HloTestBase {
   // Run alias analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   HloAliasAnalysis& RunAnalysis() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
     analysis_ = HloAliasAnalysis::Run(module_.get(),
                                       /*fusion_can_share_buffer=*/nullptr)
                     .ConsumeValueOrDie();
@@ -126,6 +124,7 @@ TEST_F(HloAliasAnalysisTest, BinaryOperation) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -160,6 +159,7 @@ TEST_F(HloAliasAnalysisTest, TupleAndGtes) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -203,6 +203,7 @@ TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({param0, param1, param0}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -237,6 +238,8 @@ TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -281,6 +284,8 @@ TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -370,6 +375,8 @@ TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -421,6 +428,7 @@ TEST_F(HloAliasAnalysisTest, SingleCall) {
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -462,6 +470,7 @@ TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) {
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -547,6 +556,7 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -647,6 +657,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
 
   FlattenCallGraph flattener;
   TF_ASSERT_OK(flattener.Run(module_.get()).status());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -738,6 +749,7 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) {
   auto entry_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition2, outer_body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -811,6 +823,7 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -872,6 +885,7 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) {
       tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -960,6 +974,7 @@ TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) {
       HloInstruction::CreateWhile(tuple_shape, condition, body, select));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -997,6 +1012,7 @@ TEST_F(HloAliasAnalysisTest, Bitcast) {
       scalar_shape_, HloOpcode::kBitcast, constant));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -1017,6 +1033,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) {
   builder.AddInstruction(HloInstruction::CreateTuple({constant, bitcast}));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -1056,6 +1073,7 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate, xla_while}));
 
   HloComputation* entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index f9b64d12ae83139efa21ca67e565908bd78f9780..48a51d302bbf054d904c54ab933d87fc910d0714 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -124,6 +124,24 @@ HloInstruction* HloComputation::AddParameter(
   return instructions_.back().get();
 }
 
+HloInstruction* HloComputation::AddEntryComputationParameter(
+    std::unique_ptr<HloInstruction> instruction) {
+  CHECK_EQ(instruction->opcode(), HloOpcode::kParameter);
+  CHECK_EQ(instruction->parameter_number(), num_parameters());
+  CHECK(parent()->entry_computation() == this);
+
+  HloModuleConfig config = parent()->config();
+  config.mutable_entry_computation_layout()->add_parameter_layout(
+      ShapeLayout(instruction->shape()));
+  parent()->set_config(config);
+
+  instruction->set_parent(this);
+  param_instructions_.push_back(instruction.get());
+  AddInstructionInternal(std::move(instruction));
+
+  return instructions_.back().get();
+}
+
 Status HloComputation::RemoveParameter(int64 param_no) {
   CHECK_GE(param_no, 0);
   CHECK_LT(param_no, param_instructions_.size());
@@ -296,7 +314,7 @@ void ComputeComputationPostOrder(HloComputation* computation,
 }  // namespace
 
 void HloComputation::ComputeInstructionPostOrder(
-    const HloComputation::ChannelDependencyMap& channel_dependency_map,
+    const HloComputation::ChannelDependencyGroup& channel_dependency_group,
     std::vector<HloInstruction*>* post_order, HloInstruction* root,
     absl::flat_hash_map<HloInstruction*, VisitState>* visited) const {
   std::vector<HloInstruction*> dfs_stack;
@@ -320,66 +338,75 @@ void HloComputation::ComputeInstructionPostOrder(
 
     visited->insert({current, kVisiting});
 
-    // Add the operands to the stack in reverse order so the first operand is
-    // processed first. This will produce a more natural ordering and a nicer
-    // result for things like HLO stringification.
-    const auto& operands = current->operands();
-    for (int64 i = operands.size() - 1; i >= 0; --i) {
-      dfs_stack.emplace_back(operands[i]);
-    }
-
-    for (HloInstruction* op : current->control_predecessors()) {
-      dfs_stack.emplace_back(op);
-    }
-
-    // Add inputs for send->recv_done dependencies and all-reduce
-    // dependencies.
-    switch (current->opcode()) {
-      case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(current->channel_id());
-        if (it != channel_dependency_map.end()) {
-          for (HloInstruction* op : it->second) {
-            dfs_stack.emplace_back(op);
-          }
-        }
-        break;
+    const auto get_channel_id =
+        [](HloInstruction* inst) -> absl::optional<int64> {
+      switch (inst->opcode()) {
+        case HloOpcode::kRecvDone:
+          return inst->channel_id();
+        case HloOpcode::kAllReduce:
+          return inst->all_reduce_id();
+        default:
+          return absl::nullopt;
       }
-      case HloOpcode::kAllReduce: {
-        auto all_reduce_id = current->all_reduce_id();
-        if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            for (HloInstruction* op : it->second) {
-              dfs_stack.emplace_back(op);
-            }
-          }
+    };
+
+    // When adding a predecessor to the dfs_stack, we need to also add its
+    // associated channel dependencies.
+    const auto add_dfs_stack = [&](HloInstruction* inst) {
+      auto channel_id = get_channel_id(inst);
+      if (channel_id && channel_dependency_group.count(*channel_id)) {
+        auto it = channel_dependency_group.find(*channel_id);
+        for (HloInstruction* cinst : it->second) {
+          dfs_stack.emplace_back(cinst);
         }
-        break;
+      } else {
+        dfs_stack.emplace_back(inst);
       }
-      default:
-        break;
+    };
+
+    const auto add_predecessors = [&](HloInstruction* inst) {
+      // Add the operands to the stack in reverse order so the first operand is
+      // processed first. This will produce a more natural ordering and a nicer
+      // result for things like HLO stringification.
+      const auto& operands = inst->operands();
+      for (int64 i = operands.size() - 1; i >= 0; --i) {
+        add_dfs_stack(operands[i]);
+      }
+
+      for (HloInstruction* op : inst->control_predecessors()) {
+        add_dfs_stack(op);
+      }
+    };
+
+    // If the current instruction is a channel instruction, add the dependencies
+    // from all associated instructions of the channel.
+    auto channel_id = get_channel_id(current);
+    if (channel_id && channel_dependency_group.count(*channel_id)) {
+      auto it = channel_dependency_group.find(*channel_id);
+      for (HloInstruction* cinst : it->second) {
+        add_predecessors(cinst);
+      }
+    } else {
+      add_predecessors(current);
     }
   }
 }
 
-HloComputation::ChannelDependencyMap
+HloComputation::ChannelDependencyGroup
 HloComputation::ComputeChannelDependencies() const {
-  ChannelDependencyMap channel_dependency_map;
+  ChannelDependencyGroup channel_dependency_group;
   for (const auto& instruction : instructions_) {
     switch (instruction->opcode()) {
-      case HloOpcode::kSend: {
-        channel_dependency_map[instruction->channel_id()].push_back(
+      case HloOpcode::kSend:
+      case HloOpcode::kRecvDone:
+        channel_dependency_group[instruction->channel_id()].push_back(
             instruction.get());
         break;
-      }
       case HloOpcode::kAllReduce: {
         auto all_reduce_id = instruction->all_reduce_id();
         if (all_reduce_id) {
-          auto& dependencies = channel_dependency_map[all_reduce_id.value()];
-          absl::c_copy(instruction->operands(),
-                       std::back_inserter(dependencies));
-          absl::c_copy(instruction->control_predecessors(),
-                       std::back_inserter(dependencies));
+          channel_dependency_group[all_reduce_id.value()].push_back(
+              instruction.get());
         }
         break;
       }
@@ -387,11 +414,11 @@ HloComputation::ComputeChannelDependencies() const {
         break;
     }
   }
-  return channel_dependency_map;
+  return channel_dependency_group;
 }
 
 std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
-  auto channel_dependency_map = ComputeChannelDependencies();
+  auto channel_dependency_group = ComputeChannelDependencies();
   std::vector<HloInstruction*> post_order;
   post_order.reserve(instruction_count());
   std::vector<HloInstruction*> trace_instructions;
@@ -404,7 +431,7 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
       // users).
       trace_instructions.push_back(instruction.get());
     } else if (instruction->users().empty()) {
-      ComputeInstructionPostOrder(channel_dependency_map, &post_order,
+      ComputeInstructionPostOrder(channel_dependency_group, &post_order,
                                   instruction.get(), &visited);
     }
   }
@@ -695,21 +722,34 @@ bool HloComputation::operator==(const HloComputation& other) const {
   }
   absl::flat_hash_set<std::pair<const HloInstruction*, const HloInstruction*>>
       visited;
-  std::function<bool(const HloInstruction*, const HloInstruction*)> eq =
-      [&visited, &eq](const HloInstruction* a, const HloInstruction* b) {
-        // If <a,b> are visited but not identical, the recursion should have
-        // been aborted. So, if <a,b> are visited at this point, they must be
-        // identical.
-        if (visited.contains(std::make_pair(a, b))) {
-          return true;
-        }
-        visited.emplace(a, b);
-        return a->Identical(
-            *b, eq, [](const HloComputation* a, const HloComputation* b) {
-              return *a == *b;
-            });
-      };
-  return eq(root_instruction(), other.root_instruction());
+  std::vector<std::pair<const HloInstruction*, const HloInstruction*>> worklist;
+
+  worklist.push_back({root_instruction(), other.root_instruction()});
+
+  while (!worklist.empty()) {
+    auto pair = worklist.back();
+    worklist.pop_back();
+
+    if (visited.contains(pair)) {
+      continue;
+    }
+    visited.emplace(pair);
+    // TODO(b/123082518): Avoid recursively invoking == becasue it may
+    // cause a stack overflow with deeply nested subcomputations.
+    bool identical_ignoring_operands = pair.first->Identical(
+        *pair.second,
+        [](const HloInstruction*, const HloInstruction*) { return true; },
+        [](const HloComputation* a, const HloComputation* b) {
+          return *a == *b;
+        });
+    if (!identical_ignoring_operands) {
+      return false;
+    }
+    for (size_t i = 0; i < pair.first->operands().size(); ++i) {
+      worklist.push_back({pair.first->operand(i), pair.second->operand(i)});
+    }
+  }
+  return true;
 }
 
 Status HloComputation::ReplaceWithNewInstruction(
@@ -844,15 +884,15 @@ Status HloComputation::Accept(
 std::unique_ptr<HloComputation> HloComputation::Clone(
     const string& suffix, HloCloneContext* context) {
   return CloneWithReplacements(
-      /*replacements=*/std::unordered_map<const HloInstruction*,
-                                          std::unique_ptr<HloInstruction>>(),
+      /*replacements=*/absl::flat_hash_map<const HloInstruction*,
+                                           std::unique_ptr<HloInstruction>>(),
       /*extra_parameters=*/{}, context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
@@ -863,7 +903,7 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
@@ -876,7 +916,7 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
@@ -886,7 +926,7 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
     absl::Span<const HloInstruction* const> extra_parameters,
     HloCloneContext* context, const string& suffix) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index e6a1eb89cfdb474f79c184ea0eb77dba8ccd5f03..a48cfa1f1b22ffd748fe9fe3ddb7f36d8d0dee4d 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -118,11 +117,20 @@ class HloComputation {
   // instruction.
   Status RemoveUnusedParameters();
 
-  // Add new parameter instruction to the computation.
+  // Adds a new parameter instruction to a fusion computation.
+  //
   // This should be a new parameter. Instruction will be appended to parameters
   // and inserted to the instruction list.
   HloInstruction* AddParameter(std::unique_ptr<HloInstruction> instruction);
 
+  // Adds a new parameter instruction to the entry computation and update
+  // the parent module config to reflect the change.
+  //
+  // This should be a new parameter. Instruction will be appended to parameters
+  // and inserted to the instruction list.
+  HloInstruction* AddEntryComputationParameter(
+      std::unique_ptr<HloInstruction> instruction);
+
   // Remove an instruction from the computation. The instruction must have no
   // users. Instruction is deallocated with this call.
   Status RemoveInstruction(HloInstruction* instruction);
@@ -329,7 +337,8 @@ class HloComputation {
   // All relevant instructions are cloned, *including* unique_ptr in the
   // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
-      std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      absl::flat_hash_map<const HloInstruction*,
+                          std::unique_ptr<HloInstruction>>
           replacements,
       absl::Span<const HloInstruction* const> extra_parameters = {},
       HloCloneContext* context = nullptr, const string& suffix = "clone");
@@ -369,13 +378,13 @@ class HloComputation {
   // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
-  // Returns a map from channel-id to directed dependencies of the channel
-  // instructions. For send&recv pairs it means the send instruction and for
-  // all-reduce the union of the dependencies for all participating
-  // instructions.
-  using ChannelDependencyMap =
+  // Returns a map from channel-id to the group of instructions associated with
+  // the channel. These instructions will be considered as a single node for
+  // dependency purposes. Send and RecvDone are in the group, and AllReduces
+  // with the same channel id are in the group.
+  using ChannelDependencyGroup =
       absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
-  ChannelDependencyMap ComputeChannelDependencies() const;
+  ChannelDependencyGroup ComputeChannelDependencies() const;
 
   // Returns true if this computation has a side effect. A computation has a
   // side effect if it contains one or more instructions with a side effect.
@@ -391,6 +400,10 @@ class HloComputation {
     fusion_instruction_ = fusion_instruction;
   }
 
+  // Clear the unique ID of the computation so that it can be re-assigned, such
+  // as for the purpose of compacting the unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // The id of this computation should be unique within the module.
   void SetUniqueId(int64 id) {
     CHECK_EQ(unique_id_, -1);
@@ -434,7 +447,7 @@ class HloComputation {
 
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
-      const HloComputation::ChannelDependencyMap& channel_dependency_map,
+      const HloComputation::ChannelDependencyGroup& channel_dependency_map,
       std::vector<HloInstruction*>* post_order, HloInstruction* root,
       absl::flat_hash_map<HloInstruction*, VisitState>* visited) const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 251c7bbec418d8c3e8b27277160e608840726996..3fa6f80b1b9d604bcf299e8e6694f852cca8e765 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -17,14 +17,16 @@ limitations under the License.
 
 #include <memory>
 #include <set>
-#include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -37,6 +39,7 @@ namespace xla {
 namespace {
 
 namespace m = match;
+namespace op = xla::testing::opcode_matchers;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -506,12 +509,13 @@ TEST_F(HloComputationTest, CloneWithReplacements) {
       HloInstruction::CreateParameter(1, r0f32_, "p.0.rhs"));
   auto param2 =
       builder.AddInstruction(HloInstruction::CreateParameter(2, r0s64, "p.1"));
-  auto lt = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param0, param1));
+  auto lt = builder.AddInstruction(
+      HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), param0,
+                                    param1, ComparisonDirection::kLt));
   auto module = CreateNewVerifiedModule();
   auto computation =
       module->AddEntryComputation(builder.Build(/*root_instruction=*/lt));
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(param2,
                        HloInstruction::CreateParameter(2, r0s32, "p.1"));
@@ -645,5 +649,57 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
+std::unique_ptr<HloComputation> MakeAddNComputation(int n) {
+  auto builder = HloComputation::Builder("add_n");
+  auto result = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "x_value"));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  for (int i = 0; i < n; ++i) {
+    result = builder.AddInstruction(HloInstruction::CreateBinary(
+        one->shape(), HloOpcode::kAdd, result, one));
+  }
+  return builder.Build();
+}
+
+TEST_F(HloComputationTest, DeepEquality) {
+  auto computation_a = MakeAddNComputation(200000);
+  auto computation_b = MakeAddNComputation(200000);
+  EXPECT_TRUE(*computation_a == *computation_b);
+
+  auto computation_c = MakeAddNComputation(199999);
+  EXPECT_FALSE(*computation_a == *computation_c);
+  EXPECT_FALSE(*computation_c == *computation_b);
+}
+
+// Tests that cross-module AllReduce instructions are ordered before all their
+// predecessors and after all their successors.
+TEST_F(HloComputationTest, InstructionPostOrderWithAllReduce) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY entry {
+  param = f32[128] parameter(0), sharding={maximal device=0}
+  crs0 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=0}
+  crs1 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=1}
+  add = f32[128] add(crs0, crs0), sharding={maximal device=0}
+  ROOT t = (f32[128], f32[128]) tuple(add, crs1)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+  EXPECT_THAT(module->entry_computation()->MakeInstructionPostOrder(),
+              ElementsAre(op::Parameter(), op::AllReduce(), op::AllReduce(),
+                          op::Add(), op::Tuple()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index c4b4fa62ddcb46b8ac46567da5ab32a6a1f4914c..13b1c82709523fc98b02551d14bc9a9cdacc5fc1 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -91,9 +91,10 @@ Status HloCostAnalysis::HandleElementwiseOp(
   auto opcode = hlo_instruction->opcode();
   // We treat transcendental operations separately since one transcendental
   // operation can correspond to several floating point ops.
-  if (opcode == HloOpcode::kExp || opcode == HloOpcode::kPower ||
-      opcode == HloOpcode::kTanh || opcode == HloOpcode::kSin ||
-      opcode == HloOpcode::kCos) {
+  if (opcode == HloOpcode::kExp || opcode == HloOpcode::kLog ||
+      opcode == HloOpcode::kPower || opcode == HloOpcode::kSqrt ||
+      opcode == HloOpcode::kRsqrt || opcode == HloOpcode::kTanh ||
+      opcode == HloOpcode::kSin || opcode == HloOpcode::kCos) {
     current_properties_[kTranscendentalsKey] = computation_count;
   } else {
     // Note: transcendental operations are considered a separate category from
@@ -237,24 +238,17 @@ Status HloCostAnalysis::HandleDomain(const HloInstruction* domain) {
 
 Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
-  const Shape& rhs_shape = dot->operand(1)->shape();
+  const Shape& dot_shape = dot->shape();
   const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width =
-      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
-  // First divide by reduction width before multiplying by rhs elements to avoid
-  // overflow.
-  int64 fma_count;
-  if (reduction_width == 0) {
-    fma_count = 0;
-  } else {
-    fma_count = (ShapeUtil::ElementsIn(lhs_shape) / reduction_width) *
-                ShapeUtil::ElementsIn(rhs_shape);
+  int64 reduction_width = 1;
+  for (auto dim : dnums.lhs_contracting_dimensions()) {
+    reduction_width *= lhs_shape.dimensions(dim);
   }
-
-  // We count an FMA operation as 2 floating point operations.
-  current_properties_[kFlopsKey] = kFmaFlops * fma_count;
+  // Each output elment requires reduction_width FMA operations.
+  current_properties_[kFlopsKey] =
+      kFmaFlops * ShapeUtil::ElementsIn(dot_shape) * reduction_width;
   return Status::OK();
 }
 
@@ -531,7 +525,8 @@ Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
   }
 
   const int64 fma_count = (input_feature / convolution->feature_group_count()) *
-                          output_feature * batch *
+                          output_feature *
+                          (batch / convolution->batch_group_count()) *
                           Product(valid_position_counts);
   current_properties_[kFlopsKey] = fma_count * kFmaFlops;
   return Status::OK();
@@ -552,6 +547,32 @@ Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleTriangularSolve(const HloInstruction* hlo) {
+  float bytes_accessed = GetShapeSize(hlo->operand(0)->shape()) / 2.0f;
+  bytes_accessed += GetShapeSize(hlo->operand(1)->shape());
+  current_properties_[kBytesAccessedKey] = bytes_accessed;
+
+  const Shape& a_shape = hlo->operand(0)->shape();
+  const Shape& b_shape = hlo->operand(1)->shape();
+  // Estimate as batch * mn^2 / 2 flops.
+  int64 elems = a_shape.dimensions(a_shape.dimensions_size() - 1);
+  elems *= ShapeUtil::ElementsIn(b_shape);
+  current_properties_[kFlopsKey] = kFmaFlops * elems;
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleCholesky(const HloInstruction* hlo) {
+  float bytes_accessed = GetShapeSize(hlo->operand(0)->shape()) / 2.0f;
+  current_properties_[kBytesAccessedKey] = bytes_accessed;
+
+  const Shape& a_shape = hlo->operand(0)->shape();
+  // Estimate as batch * n^3 / 3 flops.
+  int64 elems = a_shape.dimensions(a_shape.dimensions_size() - 1);
+  elems *= ShapeUtil::ElementsIn(a_shape);
+  current_properties_[kFlopsKey] = elems / 3;
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) {
   // We assume 2 replicas, so that each output element is the sum of two input
   // elements.
@@ -577,6 +598,10 @@ Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleReplicaId(const HloInstruction* /*hlo*/) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
   // TODO(b/26346211): Implement better estimates for the RNG cost, since the
   // cost changes with the implementation and the distribution. For now, assume
@@ -659,19 +684,22 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
 }
 
 Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
-  // Compute the cost of the true and false computations and take the maximum
-  // from those for each property.
+  // Compute the cost of the branch computations and take the maximum from those
+  // for each property.
   TF_ASSIGN_OR_RETURN(
-      const Properties true_computation_properties,
-      ProcessUnnestedSubcomputation(conditional->true_computation()));
-  TF_ASSIGN_OR_RETURN(
-      const Properties false_computation_properties,
-      ProcessUnnestedSubcomputation(conditional->false_computation()));
-  current_properties_ = true_computation_properties;
-  for (const auto& property : false_computation_properties) {
-    if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_, property)) {
-      current_properties_[property.first] =
-          std::max(current_properties_[property.first], property.second);
+      const Properties branch0_computation_properties,
+      ProcessUnnestedSubcomputation(conditional->branch_computation(0)));
+  current_properties_ = branch0_computation_properties;
+  for (int j = 1; j < conditional->branch_count(); ++j) {
+    TF_ASSIGN_OR_RETURN(
+        const Properties branch_computation_properties,
+        ProcessUnnestedSubcomputation(conditional->branch_computation(j)));
+    for (const auto& property : branch_computation_properties) {
+      if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_,
+                                               property)) {
+        auto& current_property = current_properties_[property.first];
+        current_property = std::max(current_property, property.second);
+      }
     }
   }
   current_should_compute_bottleneck_time_ = false;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index b52305626dd67336eb31098d086ad357f12d96c7..4480554de507f20b5d44b87a19e58236252bad1d 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -71,9 +71,12 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleDot(const HloInstruction* dot) override;
   Status HandleConvolution(const HloInstruction* convolution) override;
   Status HandleFft(const HloInstruction* fft) override;
+  Status HandleTriangularSolve(const HloInstruction* hlo) override;
+  Status HandleCholesky(const HloInstruction* hlo) override;
   Status HandleAllReduce(const HloInstruction* crs) override;
   Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleCollectivePermute(const HloInstruction* hlo) override;
+  Status HandleReplicaId(const HloInstruction* hlo) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
   Status HandleRng(const HloInstruction* random) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 82dd57d3c656ef7a61f6ab9e110f44ef551fac30..4d42770ba784ba15fae9518b40a75d8a2f038e66 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include "tensorflow/compiler/xla/statusor.h"
@@ -157,6 +158,87 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) {
             sizeof(float) * (10 * 5 + 5 * 30 + 10 * 30));
 }
 
+TEST_F(HloCostAnalysisTest, DotGeneral) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_contracting_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral2) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_batch_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 5 * 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral3) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs");
+  auto rhs = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 + 5 * 30 + 5 * 5 * 10 * 30));
+}
+
 TEST_F(HloCostAnalysisTest, Map) {
   XlaBuilder builder("map");
   auto input = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10}), "in");
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index bb5d21c654c73da257d53e4f8486b2e83019b534..d9c5f7c66de03a50f6566092ab274a6bb99d4229 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -17,10 +17,15 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -37,6 +42,18 @@ StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
       HloInstruction::CreateBinary(binary_op_shape, opcode, lhs, rhs));
 }
 
+StatusOr<HloInstruction*> MakeCompareHlo(ComparisonDirection direction,
+                                         HloInstruction* lhs,
+                                         HloInstruction* rhs) {
+  HloComputation* computation = lhs->parent();
+  CHECK_EQ(computation, rhs->parent());
+  TF_ASSIGN_OR_RETURN(
+      Shape binary_op_shape,
+      ShapeInference::InferBinaryOpShape(HloOpcode::kCompare, lhs, rhs));
+  return computation->AddInstruction(
+      HloInstruction::CreateCompare(binary_op_shape, lhs, rhs, direction));
+}
+
 StatusOr<HloInstruction*> MakePadHlo(HloInstruction* operand,
                                      HloInstruction* padding_value,
                                      const PaddingConfig& padding_config) {
@@ -156,9 +173,9 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
       dynamic_update_slice_shape, operand, update, scalar_start_indices));
 }
 
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds) {
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds) {
   HloComputation* computation = operand->parent();
   Shape broadcast_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
                                                result_shape_bounds);
@@ -268,6 +285,29 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
       select_shape, HloOpcode::kSelect, pred, on_true, on_false));
 }
 
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module) {
+  CHECK(!operands.empty()) << "Sort Hlo requires at least one operand.";
+  HloComputation* compare_computation;
+  XlaBuilder b("Sort.Compare");
+  std::vector<PrimitiveType> operand_types(operands.size());
+  for (int64 i = 0; i < operands.size(); ++i) {
+    operand_types[i] = operands[i]->shape().element_type();
+  }
+  XlaComputation comparator = CreateScalarLtComputation(operand_types, &b);
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape());
+  HloModuleConfig config(program_shape);
+  TF_ASSIGN_OR_RETURN(auto new_module,
+                      HloModule::CreateFromProto(comparator.proto(), config));
+  HloCloneContext context(module);
+  compare_computation =
+      module->DeepCloneComputation(new_module->entry_computation(), &context);
+  return builder->AddInstruction(HloInstruction::CreateSort(
+      sort_shape, dimension_to_sort, operands, compare_computation, is_stable));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
@@ -394,9 +434,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
   return MakePadHlo(operand, zero, padding_config);
 }
 
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions) {
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions) {
   HloInstruction* zero = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
   return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 8e5ddbbd503a501bd493aec43a2ccd4db883ef0c..f163112f7ff54bd525f24c235bd8b0d195c33a5f 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -32,6 +32,12 @@ namespace xla {
 StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
                                         HloInstruction* rhs);
 
+// Creates a compare HLO instruction and adds it to the computation containing
+// `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation).
+StatusOr<HloInstruction*> MakeCompareHlo(ComparisonDirection direction,
+                                         HloInstruction* lhs,
+                                         HloInstruction* rhs);
+
 // Creates a pad HLO instruction and adds it to the computation containing
 // `operand` and `padding_value` (`operand` and `padding_value` must be in the
 // same computation).
@@ -82,9 +88,9 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
 
 // Creates a broadcast HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds);
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds);
 
 // Creates a GetTupleElement HLO instruction and adds it to the computation
 // containing `operand`.
@@ -123,6 +129,15 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
                                         HloInstruction* on_true,
                                         HloInstruction* on_false);
 
+// Creates a Sort HLO instruction and adds it to the computation containing the
+// operands. All operands must be in the same computation. Also creates a
+// default compare sub-computation which sorts the first operand into ascending
+// order. 'is_stable' specifies whether the sorting should be stable.
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module);
+
 // Creates an R1 Constant HLO instruction of the given PrimitiveType with the
 // given values and adds it to the given computation.
 template <typename NativeT>
@@ -198,9 +213,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
 // Broadcasts a zero value of type `element_type` into a tensor with element
 // type `element_type` and dimension bounds `broadcast_dimensions`.  The
 // broadcast instruction is emitted into `computation`.
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions);
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions);
 
 // Creates a HLO computation that takes arguments of type `domain` and produces
 // a value of type `range`.
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index 3715e12b4e2baf7bc2149237457c16c3919c5083..6025e6a77941369f75ebaa98bdf0979669b3a03c 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -191,9 +191,8 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), S32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
@@ -211,9 +210,8 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), F32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index e602107cbe64320a8e8e740168cb294ec6be9667..849cac278ee379122ba1ff9fade3bf003969b8a7 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 3144a84805454488f417391f40ed6b9e9facc752..06a832743864896c8feca3c807dfc4faa6445a03 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -24,8 +24,10 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -35,48 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
-namespace {
-
-// We have this pattern in dynamaic update slice fusion, which should be
-// supported:
-//
-// Parameters: p0, p1
-// Fusion
-//   ds = DynamicSlice(p0, p1)
-//   ROOT DynamicUpdateslice(p0, ds, p1)
-//
-// In this case, we should be able to reuse p0 and output, although p0 has
-// multiple uses.
-bool MultiDynamicSliceUseShareSameIndices(absl::Span<const HloUse> uses) {
-  if (uses.empty()) {
-    return false;
-  }
-  const HloInstruction* indices = nullptr;
-  for (HloUse use : uses) {
-    auto user = use.instruction;
-    if (user->opcode() == HloOpcode::kDynamicUpdateSlice) {
-      if (indices == nullptr) {
-        indices = user->operand(2);
-      } else if (indices != user->operand(2)) {
-        return false;
-      }
-      if (use.operand_number != 0) {
-        return false;
-      }
-    } else if (user->opcode() == HloOpcode::kDynamicSlice) {
-      if (indices == nullptr) {
-        indices = user->operand(1);
-      } else if (indices != user->operand(1)) {
-        return false;
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-}  // namespace
 
 using absl::StrAppend;
 using absl::StrCat;
@@ -414,11 +374,11 @@ bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
 bool HloDataflowAnalysis::UpdateConditionalValueSet(
     HloInstruction* conditional) {
   CHECK_EQ(conditional->opcode(), HloOpcode::kConditional);
-  const InstructionValueSet* const inputs[] = {
-      &GetInstructionValueSet(
-          conditional->true_computation()->root_instruction()),
-      &GetInstructionValueSet(
-          conditional->false_computation()->root_instruction())};
+  std::vector<const InstructionValueSet*> inputs(conditional->branch_count());
+  for (int j = 0; j < conditional->branch_count(); ++j) {
+    inputs[j] = &GetInstructionValueSet(
+        conditional->branch_computation(j)->root_instruction());
+  }
   if (ssa_form_) {
     return Phi(conditional, inputs);
   } else {
@@ -546,20 +506,23 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
     } else if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
       CHECK_EQ(parameter->parameter_number(), 0);
       auto conditional = callsite.instruction();
-      // Conditional has 3 operands. Operand 0 is the predicate, operand 1 is
-      // the argument to the true computation and operand 2 is the argument to
-      // the false computation.
+      // Conditional has branch_count+1 operands. Operand 0 is the branch_index,
+      // operands 1 and onward are the arguments to the branch computations.
       //
-      // If the parameter belongs to conditional's true computation, then
+      // If the parameter belongs to conditional's branch 0 computation, then
       // operand 1 is forwarded to this parameter instruction. If the parameter
-      // belongs to conditional's false computation, then operand 2 is forwarded
-      // to this parameter instruction.
-      if (parameter->parent() == conditional->true_computation()) {
-        inputs.push_back(&GetInstructionValueSet(conditional->operand(1)));
-      } else {
-        CHECK_EQ(parameter->parent(), conditional->false_computation());
-        inputs.push_back(&GetInstructionValueSet(conditional->operand(2)));
+      // belongs to conditional's branch 5 computation, then operand 6 is
+      // forwarded to this parameter instruction.
+      bool found_parent = false;
+      for (int j = 0; j < conditional->branch_count(); ++j) {
+        if (parameter->parent() == conditional->branch_computation(j)) {
+          inputs.push_back(
+              &GetInstructionValueSet(conditional->operand(j + 1)));
+          found_parent = true;
+          break;
+        }
       }
+      CHECK(found_parent);
       need_phi = true;
     } else {
       LOG(FATAL) << "CallContext::kSequential computations should only be "
@@ -710,19 +673,17 @@ void HloDataflowAnalysis::Propagate() {
       // parameter(s) of the computation need to be updated.
       if (user->opcode() == HloOpcode::kConditional) {
         // If operand 0 is the use of instruction, then no parameters need to be
-        // updated, since that is the predicate of the conditional.
-        // If operand 1 is the use of instruction, then the true_computation's
-        // parameter need to be updated.
-        // If operand 2 is the use of instruction, then the false_computation's
-        // parameter need to be updated.
+        // updated, since that is the branch_index of the conditional.
+        // If operand n+1 is the use of instruction, then the branch_computation
+        // n's parameter need to be updated.
         //
-        // Note that the same instruction can be used in both operand 1 and
-        // operand 2.
-        if (user->operand(1) == instruction) {
-          add_to_worklist(user->true_computation()->parameter_instruction(0));
-        }
-        if (user->operand(2) == instruction) {
-          add_to_worklist(user->false_computation()->parameter_instruction(0));
+        // Note that the same instruction can be used in multiple branches'
+        // operands.
+        for (int j = 0; j < user->branch_count(); ++j) {
+          if (user->operand(j + 1) == instruction) {
+            add_to_worklist(
+                user->branch_computation(j)->parameter_instruction(0));
+          }
         }
       } else {
         for (HloComputation* called_computation : user->called_computations()) {
@@ -744,8 +705,8 @@ void HloDataflowAnalysis::Propagate() {
       const CallGraphNode& call_graph_node =
           call_graph_->GetNode(instruction->parent());
       for (const CallSite& callsite : call_graph_node.caller_callsites()) {
-        if ((callsite.instruction()->opcode() == HloOpcode::kCall) ||
-            (callsite.instruction()->opcode() == HloOpcode::kConditional)) {
+        if (callsite.instruction()->opcode() == HloOpcode::kCall ||
+            callsite.instruction()->opcode() == HloOpcode::kConditional) {
           add_to_worklist(callsite.instruction());
         } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
           // Add the while itself, and the body and condition parameters.
@@ -983,6 +944,79 @@ bool HloDataflowAnalysis::DoesNotUseOperandBuffer(
   return true;
 }
 
+// Given a fusion whose root is a dynamic-update-slice op, determines whether
+// the fusion's output buffer can be shared with the buffer of fusion_param,
+// which must be a fused parameter of the fusion.
+//
+// Preconditions:
+//
+//  - fusion's root is a dynamic-update-slice op.
+//  - fusion_param is a parameter within the fusion.
+//
+// fusion_param may point to a subelement of the actual parameter instruction if
+// the param is a tuple; i.e. fusion_param->index() need not be the empty list.
+//
+// Returns true if:
+//
+//  * fusion is a loop or input fusion, AND
+//  * fusion_param is used by the root of dynamic-update-slice as the "base" of
+//    the update, i.e. the thing being updated, AND
+//  * all other uses of fusion_param are dynamic-slices that slice the same
+//    indices as are overwritten in the dynamic-update-slice.
+//
+// In the case that there are no other uses of fusion_param (last bullet point
+// is vacuously true) it's easy to see why an in-place DUS is safe; this is just
+// the "natural" implementation of DUS.  If there are other users, in-place DUS
+// is safe on the assumption that the thread which writes element i of the
+// output will be the only one to read element i of fusion_param (via the
+// dynamic-slice ops).
+static bool CanDoInPlaceDynamicUpdateSlice(HloInstruction* fusion,
+                                           const HloValue& fusion_param_value) {
+  auto* root =
+      Cast<HloDynamicUpdateSliceInstruction>(fusion->fused_expression_root());
+  auto* fusion_param = fusion_param_value.instruction();
+  CHECK_EQ(fusion_param->opcode(), HloOpcode::kParameter);
+  CHECK_EQ(fusion_param->parent(), fusion->fused_instructions_computation());
+
+  // fusion must be a loop or input fusion.
+  auto kind = fusion->fusion_kind();
+  if (kind != HloInstruction::FusionKind::kLoop &&
+      kind != HloInstruction::FusionKind::kInput) {
+    return false;
+  }
+
+  // fusion_param must be used by the root as the "base" of the
+  // dynamic-update-slice.  The natural way to check this would be
+  //
+  //   `if (root->operand(0) != fusion_param)`
+  //
+  // but we also have to handle the case where the fusion parameter is
+  // tuple-shaped and we're considering just one element of that tuple, i.e.
+  // fusion_param.index() != {}.
+  if (absl::c_count_if(fusion_param_value.uses(), [&](const HloUse& use) {
+        return use.instruction == root;
+      }) != 1) {
+    return false;
+  }
+
+  // All other uses of fusion_param must be dynamic-slices that slice the same
+  // indices as are overwritten by the dynamic-update-slice.
+  for (const HloUse& use : fusion_param_value.uses()) {
+    auto* user = use.instruction;
+    if (user == root) {
+      continue;
+    }
+
+    // Check that `user` is a dynamic-slice op and has the same slice indices as
+    // `root`.
+    auto* ds = DynCast<HloDynamicSliceInstruction>(user);
+    if (!ds || ds->index_operands() != root->index_operands()) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index) const {
@@ -999,35 +1033,34 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
   }
 
   if (user->opcode() == HloOpcode::kFusion) {
-    if (fusion_can_share_buffer_ != nullptr) {
-      return fusion_can_share_buffer_(user, operand);
-    }
     // Get the parameter associated with 'operand';
     HloInstruction* fusion_param =
         user->fused_parameter(user->operand_index(operand));
 
-    const HloValue& value = GetValueDefinedAt(fusion_param, operand_index);
-    if (MultiDynamicSliceUseShareSameIndices(value.uses())) {
-      return true;
+    const HloValue& fusion_param_value =
+        GetValueDefinedAt(fusion_param, operand_index);
+
+    // TODO(b/80315712): This code is in a bit of a weird intermediate state
+    // at the moment. The in-place DUS check really needs to be common to all
+    // backends, so it runs first. Then we run the backend-specific check if
+    // provided, or go through the target-indepdendent check if not.
+    // Unfortunately, the notionally "target-independent" path actually contains
+    // some target-specific code, so we can't run all of it *in addition* to the
+    // target-specific function, like the interface documentation says.
+    if (user->fused_expression_root()->opcode() ==
+        HloOpcode::kDynamicUpdateSlice) {
+      return CanDoInPlaceDynamicUpdateSlice(user, fusion_param_value);
     }
+
+    if (fusion_can_share_buffer_ != nullptr) {
+      return fusion_can_share_buffer_(user, operand);
+    }
+
     if (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
         user->fusion_kind() == HloInstruction::FusionKind::kInput) {
-      if (user->fused_expression_root()->opcode() ==
-          HloOpcode::kDynamicUpdateSlice) {
-        // Loop fusion with kDynamicUpdateSlice fused root.
-        //
-        // Returns true iff there is exactly one use of 'operand' at shape index
-        // 'operand_index', and this singleton use is the fused root at operand
-        // index 0.
-        if (value.uses().size() == 1) {
-          const HloUse& use = value.uses()[0];
-          return use.instruction == user->fused_expression_root() &&
-                 use.operand_number == 0;
-        }
-        return false;
-      }
       return AreTransitiveUsesElementwiseOrTuple(fusion_param);
     }
+
     if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
         user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
       // Output fusion with kAdd fused root.
@@ -1049,8 +1082,8 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
       // Returns true iff there is exactly one use of 'operand' at shape index
       // 'operand_index', and this singleton use is the fused root (at operand
       // index 'other_add_operand_index').
-      if (value.uses().size() == 1) {
-        const HloUse& use = value.uses()[0];
+      if (fusion_param_value.uses().size() == 1) {
+        const HloUse& use = fusion_param_value.uses()[0];
         return use.instruction == user->fused_expression_root() &&
                use.operand_number == other_add_operand_index;
       }
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 4a7c4963b7b399e625da907b3810c42df7ee2bd3..f0b18d6fc9554e53fd2f4712fe9787f660d25b8a 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -49,7 +50,6 @@ class HloDataflowAnalysisTest : public HloTestBase,
   // reference to the generated analysis stored in analysis_.
   const HloDataflowAnalysis& RunAnalysis(bool ssa_form,
                                          bool bitcast_defines_value = false) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before dataflow analysis");
     analysis_ =
         HloDataflowAnalysis::Run(*module_, ssa_form, bitcast_defines_value)
             .ConsumeValueOrDie();
@@ -108,6 +108,7 @@ TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -156,6 +157,7 @@ TEST_P(HloDataflowAnalysisTest, TupleAndGtes) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -211,6 +213,7 @@ TEST_P(HloDataflowAnalysisTest, NestedTuple) {
   auto gte_out = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, gte_tuple, 0));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -266,6 +269,7 @@ TEST_P(HloDataflowAnalysisTest, SingleCall) {
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -319,6 +323,7 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kSubtract, call1, call2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -371,6 +376,7 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) {
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -433,6 +439,7 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, outer_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -508,6 +515,7 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -613,6 +621,7 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) {
   auto xla_while2 = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -700,6 +709,7 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) {
   auto entry_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -795,6 +805,7 @@ TEST_P(HloDataflowAnalysisTest, SwizzlingWhile) {
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -850,6 +861,7 @@ TEST_P(HloDataflowAnalysisTest, ArraySelect) {
       scalar_shape_, HloOpcode::kSelect, pred, constant1, constant2));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -892,6 +904,7 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) {
       tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -963,6 +976,7 @@ TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) {
       tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1052,6 +1066,7 @@ TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) {
       HloInstruction::CreateWhile(tuple->shape(), condition, body, tuple));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1094,6 +1109,7 @@ TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) {
       scalar_shape_, HloOpcode::kBitcast, constant));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   {
@@ -1130,6 +1146,7 @@ TEST_P(HloDataflowAnalysisTest, TupleCopy) {
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(tuple->shape(), HloOpcode::kCopy, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1163,6 +1180,7 @@ TEST_P(HloDataflowAnalysisTest, SendAndSendDone) {
       HloInstruction::CreateSend(param, token, /*channel_id=*/0));
   auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1188,6 +1206,7 @@ TEST_P(HloDataflowAnalysisTest, RecvAndRecvDone) {
       HloInstruction::CreateRecv(scalar_shape_, token, /*channel_id=*/0));
   auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1223,6 +1242,7 @@ TEST_P(HloDataflowAnalysisTest, ElementwiseChainInterference) {
       HloInstruction::CreateUnary(vector_shape_, HloOpcode::kLog, exp));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1260,6 +1280,7 @@ TEST_P(HloDataflowAnalysisTest, MultipleEntryParameters_Sequential) {
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   HloSchedule schedule(module_.get());
@@ -1338,6 +1359,7 @@ TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) {
       HloInstruction::CreateWhile(scalar_shape_, condition, body, param));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   bool ssa_form = GetParam();
   RunAnalysis(ssa_form);
 
@@ -1408,6 +1430,7 @@ TEST_P(HloDataflowAnalysisTest, NonElementwiseOperand) {
       HloInstruction::CreateReverse(vector_shape_, negate, {0}));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1439,6 +1462,7 @@ TEST_P(HloDataflowAnalysisTest, OverlappedValues) {
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1478,6 +1502,7 @@ TEST_P(HloDataflowAnalysisTest, OverlappedValuesSequentialOrder) {
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   HloSchedule schedule(module_.get());
@@ -1536,6 +1561,7 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       vector_shape_, HloOpcode::kAdd, negate, call));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1588,6 +1614,7 @@ TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
       scalar_shape_, pred, constant1, true_computation, constant2,
       false_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
@@ -1681,6 +1708,7 @@ TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
       scalar_shape_, pred, tuple_operand, true_computation, tuple_operand,
       false_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
@@ -1815,6 +1843,7 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
       scalar_shape_, pred1, tuple_operand, inner_conditional_computation,
       constant3, computation3));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
@@ -2169,6 +2198,66 @@ TEST_F(CanShareOperandBufferWithUserTest,
       dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithDifferentIndices) {
+  const char* kModule = R"(
+    HloModule test
+
+    fused_computation {
+      p0 = f32[10,20,30] parameter(0)
+      p1 = s32[] parameter(1)
+      p2 = s32[] parameter(2)
+      p3 = s32[] parameter(3)
+      slice = f32[1,1,30] dynamic-slice(p0, p1, p2, p3), dynamic_slice_sizes={1,1,30}
+      ROOT dus = f32[10,20,30] dynamic-update-slice(p0, slice, p1, p3, p2)
+    }
+
+    ENTRY test {
+      p0 = f32[10,20,30] parameter(0)
+      p1 = s32[] parameter(1)
+      p2 = s32[] parameter(2)
+      p3 = s32[] parameter(3)
+      ROOT fusion = f32[10,20,30] fusion(p0, p1, p2, p3), kind=kLoop, calls=fused_computation
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(kModule));
+  auto* fusion = module_->entry_computation()->root_instruction();
+  auto* param = module_->entry_computation()->parameter_instruction(0);
+
+  RunAnalysis();
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithSameIndices) {
+  const char* kModule = R"(
+    HloModule test
+
+    fused_computation {
+      p0 = f32[10,20,30] parameter(0)
+      p1 = s32[] parameter(1)
+      p2 = s32[] parameter(2)
+      p3 = s32[] parameter(3)
+      slice = f32[1,1,30] dynamic-slice(p0, p1, p2, p3), dynamic_slice_sizes={1,1,30}
+      ROOT dus = f32[10,20,30] dynamic-update-slice(p0, slice, p1, p2, p3)
+    }
+
+    ENTRY test {
+      p0 = f32[10,20,30] parameter(0)
+      p1 = s32[] parameter(1)
+      p2 = s32[] parameter(2)
+      p3 = s32[] parameter(3)
+      ROOT fusion = f32[10,20,30] fusion(p0, p1, p2, p3), kind=kLoop, calls=fused_computation
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(kModule));
+  auto* fusion = module_->entry_computation()->root_instruction();
+  auto* param = module_->entry_computation()->parameter_instruction(0);
+
+  RunAnalysis();
+  EXPECT_TRUE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {}));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2178,8 +2267,8 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
       HloInstruction::CreateParameter(0, in_shape, "param0"));
   auto param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, in_shape, "param1"));
-  auto result = builder.AddInstruction(
-      HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1));
+  auto result = builder.AddInstruction(HloInstruction::CreateCompare(
+      out_shape, param0, param1, ComparisonDirection::kEq));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -2356,14 +2445,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, -1, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -2371,6 +2463,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -2378,11 +2471,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
@@ -2495,8 +2591,8 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     auto builder = HloComputation::Builder(TestName() + ".Cond");
     auto data = builder.AddInstruction(
         HloInstruction::CreateParameter(0, data_shape, "data"));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        ShapeUtil::MakeShape(PRED, {}), data, data, ComparisonDirection::kEq));
     return builder.Build();
   };
 
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index b5d72b386f89568cc3066b2e497be98428d1ed0c..d0073237ac2aab565175896e42c4503e26a9966b 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -223,8 +223,9 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) {
         HloInstruction::CreateParameter(0, shape, "cond_param"));
     auto constant = cond_builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-    cond_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param, constant));
+    cond_builder.AddInstruction(
+        HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), param,
+                                      constant, ComparisonDirection::kLt));
   }
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
index 19b5734825df833fd34d634e4c1630dd75e96c4c..3746fbbda02b09d0660c209d09698359375b9e02 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -27,8 +27,6 @@ namespace {
 
 StatusOr<bool> RunInternal(HloModule* module,
                            HloDomainIsolator::DomainCreator* creator) {
-  hlo_graph_dumper::MaybeDumpHloModule(*module, "Before Domain Isolator");
-
   int64 added_domains = 0;
   for (HloComputation* computation : module->computations()) {
     // Walk in post order and place all the required kDomain instructions.
@@ -56,9 +54,6 @@ StatusOr<bool> RunInternal(HloModule* module,
     }
   }
   VLOG(3) << "Added " << added_domains << " kDomain instructions";
-  if (added_domains > 0) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module, "After Domain Isolator");
-  }
   return added_domains > 0;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.cc b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
index 67fad0769f5eb5ceca64ebd2aa78c6469f2c813d..4975c3fbb93da266bbb542793953d4e365325d93 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_remover.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
@@ -59,8 +59,6 @@ Status HloDomainRemover::RunContext::VerifyAndNormalizeDomain(
 
 StatusOr<bool> HloDomainRemover::RunContext::Run() {
   VLOG(4) << "Processing metadata domain: '" << remover_->kind_ << "'";
-  hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Remover");
-
   int64 removed_domains = 0;
   for (HloComputation* computation : module_->computations()) {
     // First create the domain instruciton sets. A domain instruction set is
@@ -97,9 +95,6 @@ StatusOr<bool> HloDomainRemover::RunContext::Run() {
   }
   VLOG(3) << "Removed " << removed_domains << " kDomain instructions of '"
           << remover_->kind_ << "' kind";
-  if (removed_domains > 0) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Remover");
-  }
   return removed_domains > 0;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index 9b0f2b2a0f4dd5d1d1191e9ab0637cc3034b50da..7d6b86056af3fc2128fe1642bbfa0ca6f9ef1da0 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -127,6 +127,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       // These are ops where it does not make sense to convert them.
       if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
           opcode == HloOpcode::kTuple || opcode == HloOpcode::kConvert ||
+          opcode == HloOpcode::kBitcastConvert ||
           opcode == HloOpcode::kGetTupleElement ||
           opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed) {
         continue;
@@ -145,7 +146,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
           opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce ||
           opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kScatter ||
           opcode == HloOpcode::kSelectAndScatter ||
-          opcode == HloOpcode::kConditional) {
+          opcode == HloOpcode::kSort || opcode == HloOpcode::kConditional) {
         continue;
       }
       TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index a3b56a44a0b02923585c1dcb69571479236188a3..4171f738620dbf545e5883b8c26169fae4b93643 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -28,15 +28,7 @@ using ::testing::Eq;
 using ::testing::Not;
 using ::testing::ResultOf;
 
-class HloElementTypeConverterTest : public HloTestBase {
- public:
-  std::unique_ptr<HloModule> CreateModuleFromHloString(
-      const string& hlo_string) {
-    return HloRunner::CreateModuleFromString(hlo_string,
-                                             GetDebugOptionsForTest())
-        .ValueOrDie();
-  }
-};
+using HloElementTypeConverterTest = HloTestBase;
 
 TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
   const string& hlo_string = R"(
@@ -47,7 +39,7 @@ TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
            custom_call_target="foo"
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -63,7 +55,7 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
       outfeed = token[] outfeed(infeed.data, token0)
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -73,17 +65,16 @@ TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
   const string& hlo_string = R"(
     HloModule NestedTuples
     ENTRY NestedTuples.v5 {
-      constant.4 = bf16[] constant(42)
       constant.2 = f32[2]{0} constant({1, 2})
-      constant.3 = bf16[] constant(42)
-      add = bf16[] add(constant.2, constant.3)
-      tuple = (f32[2]{0}, bf16[]) tuple(constant.2, add)
+      constant.3 = bf16[2]{0} constant({42, 42})
+      add = bf16[2]{0} add(constant.2, constant.3)
+      tuple = (f32[2]{0}, bf16[2]{0}) tuple(constant.2, add)
       constant.5 = bf16[2]{0} constant({22, 44})
-      ROOT tuple.1 = ((f32[2]{0}, bf16[]), bf16[2]{0}) tuple(tuple, constant.5)
+      ROOT tuple.1 = ((f32[2]{0}, bf16[2]{0}), bf16[2]{0}) tuple(tuple, constant.5)
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -111,7 +102,7 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -135,7 +126,7 @@ ENTRY main {
   ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -161,7 +152,7 @@ ENTRY main {
   ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
@@ -185,5 +176,19 @@ ENTRY main {
   EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
 }
 
+TEST_F(HloElementTypeConverterTest, BitcastConvertIsUnmodified) {
+  const string& hlo_string = R"(
+  HloModule test
+
+  ENTRY test {
+    p = bf16[] parameter(0)
+    ROOT c = u16[] bitcast-convert(p)
+  })";
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  HloElementTypeConverter converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, RunHloPass(&converter, module.get()));
+  EXPECT_FALSE(converted);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 7589c992025d84355a7b19adeecfa93822196d5e..71c745149f1774ad83f4d1a41b67105255dddde9 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdlib>
 #include <functional>
+#include <iterator>
 #include <string>
 #include <type_traits>
-#include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
@@ -29,11 +29,11 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -56,43 +56,40 @@ namespace xla {
 namespace {
 
 template <typename OperandT>
-StatusOr<Literal> Compare(const Shape& shape, HloOpcode opcode,
+StatusOr<Literal> Compare(const Shape& shape, ComparisonDirection direction,
                           LiteralSlice lhs_literal, LiteralSlice rhs_literal) {
   std::function<bool(OperandT, OperandT)> compare_op;
-  switch (opcode) {
-    case HloOpcode::kEq:
+  switch (direction) {
+    case ComparisonDirection::kEq:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el == rhs_el;
       };
       break;
-    case HloOpcode::kNe:
+    case ComparisonDirection::kNe:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el != rhs_el;
       };
       break;
-    case HloOpcode::kGe:
+    case ComparisonDirection::kGe:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el >= rhs_el;
       };
       break;
-    case HloOpcode::kGt:
+    case ComparisonDirection::kGt:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el > rhs_el;
       };
       break;
-    case HloOpcode::kLe:
+    case ComparisonDirection::kLe:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el <= rhs_el;
       };
       break;
-    case HloOpcode::kLt:
+    case ComparisonDirection::kLt:
       compare_op = [](OperandT lhs_el, OperandT rhs_el) {
         return lhs_el < rhs_el;
       };
       break;
-    default:
-      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
-                 << HloOpcodeString(opcode);
   }
 
   Literal result(shape);
@@ -106,24 +103,25 @@ StatusOr<Literal> Compare(const Shape& shape, HloOpcode opcode,
 }
 
 template <>
-StatusOr<Literal> Compare<complex64>(const Shape& shape, HloOpcode opcode,
+StatusOr<Literal> Compare<complex64>(const Shape& shape,
+                                     ComparisonDirection direction,
                                      LiteralSlice lhs_literal,
                                      LiteralSlice rhs_literal) {
   std::function<bool(complex64, complex64)> compare_op;
-  switch (opcode) {
-    case HloOpcode::kEq:
+  switch (direction) {
+    case ComparisonDirection::kEq:
       compare_op = [](complex64 lhs_el, complex64 rhs_el) {
         return lhs_el == rhs_el;
       };
       break;
-    case HloOpcode::kNe:
+    case ComparisonDirection::kNe:
       compare_op = [](complex64 lhs_el, complex64 rhs_el) {
         return lhs_el != rhs_el;
       };
       break;
     default:
-      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
-                 << HloOpcodeString(opcode);
+      LOG(FATAL) << "unhandled direction for conversion to Comparison: "
+                 << ComparisonDirectionToString(direction);
   }
 
   Literal result(shape);
@@ -137,24 +135,25 @@ StatusOr<Literal> Compare<complex64>(const Shape& shape, HloOpcode opcode,
 }
 
 template <>
-StatusOr<Literal> Compare<complex128>(const Shape& shape, HloOpcode opcode,
+StatusOr<Literal> Compare<complex128>(const Shape& shape,
+                                      ComparisonDirection direction,
                                       LiteralSlice lhs_literal,
                                       LiteralSlice rhs_literal) {
   std::function<bool(complex128, complex128)> compare_op;
-  switch (opcode) {
-    case HloOpcode::kEq:
+  switch (direction) {
+    case ComparisonDirection::kEq:
       compare_op = [](complex128 lhs_el, complex128 rhs_el) {
         return lhs_el == rhs_el;
       };
       break;
-    case HloOpcode::kNe:
+    case ComparisonDirection::kNe:
       compare_op = [](complex128 lhs_el, complex128 rhs_el) {
         return lhs_el != rhs_el;
       };
       break;
     default:
-      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
-                 << HloOpcodeString(opcode);
+      LOG(FATAL) << "unhandled direction for conversion to Comparison: "
+                 << ComparisonDirectionToString(direction);
   }
 
   Literal result(shape);
@@ -491,15 +490,52 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
 
 Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite) {
   auto operand = is_finite->operand(0);
-  if (!ShapeUtil::ElementIsFloating(operand->shape())) {
-    return InvalidArgument(
-        "expected element type in shape to be float for IsFinite op, got: %s",
-        PrimitiveType_Name(operand->shape().element_type()));
-  }
+  auto elem_ty = operand->shape().element_type();
+  switch (elem_ty) {
+    case PRED:
+    case TUPLE:
+    case OPAQUE:
+    case TOKEN:
+    case S8:
+    case S16:
+    case S32:
+    case S64:
+    case U8:
+    case U16:
+    case U32:
+    case U64:
+    case C64:
+    case C128:
+    // Explicitly enumerate all types in this switch so that when we add a new
+    // type, we'll get a compile error here.
+    case PRIMITIVE_TYPE_INVALID:
+    case PrimitiveType_INT_MIN_SENTINEL_DO_NOT_USE_:
+    case PrimitiveType_INT_MAX_SENTINEL_DO_NOT_USE_:
+      return InvalidArgument(
+          "expected element type in shape to be floating point, but "
+          "got: %s",
+          PrimitiveType_Name(elem_ty));
 
-  switch (operand->shape().element_type()) {
-    case F16:
-      return Unimplemented("unhandled primitive type: F16.");
+    case F16: {
+      auto result_or = ElementWiseUnaryOpImpl<bool, Eigen::half>(
+          is_finite,
+          [](Eigen::half elem_operand) {
+            return std::isfinite(static_cast<float>(elem_operand));
+          },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
+      break;
+    }
+    case BF16: {
+      auto result_or = ElementWiseUnaryOpImpl<bool, bfloat16>(
+          is_finite,
+          [](bfloat16 elem_operand) {
+            return std::isfinite(static_cast<float>(elem_operand));
+          },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
+      break;
+    }
     case F32: {
       auto result_or = ElementWiseUnaryOpImpl<bool, float>(
           is_finite,
@@ -516,9 +552,6 @@ Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite) {
       TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
       break;
     }
-    default:
-      LOG(FATAL) << "HandleIsFinite: unknown/unhandled primitive type: "
-                 << PrimitiveType_Name(operand->shape().element_type());
   }
 
   return Status::OK();
@@ -542,7 +575,7 @@ Status HloEvaluator::HandleReal(HloInstruction* real) {
       break;
     }
     case C128: {
-      auto result_or = ElementWiseUnaryOpImpl<float, complex128>(
+      auto result_or = ElementWiseUnaryOpImpl<double, complex128>(
           real, [](complex128 elem_operand) { return std::real(elem_operand); },
           GetEvaluatedLiteralFor(operand));
       TF_ASSIGN_OR_RETURN(evaluated_[real], std::move(result_or));
@@ -637,20 +670,11 @@ Status HloEvaluator::HandleComplex(HloInstruction* complex) {
 }
 
 Status HloEvaluator::HandleCompare(HloInstruction* compare) {
-  HloOpcode opcode = compare->opcode();
+  ComparisonDirection direction = compare->comparison_direction();
   auto lhs = compare->operand(0);
   auto rhs = compare->operand(1);
-  // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-  // removed.
-  if (!(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) &&
-        ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
-    return Unimplemented(
-        "Implicit broadcasting is currently unsupported in HLO evaluator "
-        "Shape Mismatch: %s vs %s vs %s",
-        ShapeUtil::HumanString(compare->shape()),
-        ShapeUtil::HumanString(lhs->shape()),
-        ShapeUtil::HumanString(rhs->shape()));
-  }
+  DCHECK(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) &&
+         ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
 
   TF_RET_CHECK(lhs->shape().element_type() == rhs->shape().element_type());
 
@@ -662,76 +686,76 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
     case PRED: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
-          Compare<bool>(compare->shape(), opcode, lhs_literal, rhs_literal));
+          Compare<bool>(compare->shape(), direction, lhs_literal, rhs_literal));
     } break;
     case U8: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<uint8>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<uint8>(compare->shape(), direction,
+                                         lhs_literal, rhs_literal));
     } break;
     case U16: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<uint16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<uint16>(compare->shape(), direction,
+                                          lhs_literal, rhs_literal));
     } break;
     case U32: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<uint32>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<uint32>(compare->shape(), direction,
+                                          lhs_literal, rhs_literal));
     } break;
     case U64: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<uint64>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<uint64>(compare->shape(), direction,
+                                          lhs_literal, rhs_literal));
     } break;
     case S8: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
-          Compare<int8>(compare->shape(), opcode, lhs_literal, rhs_literal));
+          Compare<int8>(compare->shape(), direction, lhs_literal, rhs_literal));
     } break;
     case S16: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<int16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<int16>(compare->shape(), direction,
+                                         lhs_literal, rhs_literal));
     } break;
     case S32: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<int32>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<int32>(compare->shape(), direction,
+                                         lhs_literal, rhs_literal));
     } break;
     case S64: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<int64>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<int64>(compare->shape(), direction,
+                                         lhs_literal, rhs_literal));
     } break;
     case F16: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
-          Compare<half>(compare->shape(), opcode, lhs_literal, rhs_literal));
+          Compare<half>(compare->shape(), direction, lhs_literal, rhs_literal));
     } break;
     case BF16: {
       TF_ASSIGN_OR_RETURN(evaluated_[compare],
-                          Compare<bfloat16>(compare->shape(), opcode,
+                          Compare<bfloat16>(compare->shape(), direction,
                                             lhs_literal, rhs_literal));
     } break;
     case F32: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<float>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<float>(compare->shape(), direction,
+                                         lhs_literal, rhs_literal));
     } break;
     case F64: {
-      TF_ASSIGN_OR_RETURN(
-          evaluated_[compare],
-          Compare<double>(compare->shape(), opcode, lhs_literal, rhs_literal));
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<double>(compare->shape(), direction,
+                                          lhs_literal, rhs_literal));
     } break;
     case C64: {
       TF_ASSIGN_OR_RETURN(evaluated_[compare],
-                          Compare<complex64>(compare->shape(), opcode,
+                          Compare<complex64>(compare->shape(), direction,
                                              lhs_literal, rhs_literal));
     } break;
     case C128: {
       TF_ASSIGN_OR_RETURN(evaluated_[compare],
-                          Compare<complex128>(compare->shape(), opcode,
+                          Compare<complex128>(compare->shape(), direction,
                                               lhs_literal, rhs_literal));
     } break;
     default:
@@ -1192,8 +1216,8 @@ Status HloEvaluator::HandleCall(HloInstruction* call) {
   HloEvaluator embedded_evaluator;
   embedded_evaluator.set_dynamic_dimension_inference(
       dynamic_dimension_inference_);
-  Literal result = embedded_evaluator.Evaluate(*computation, arg_literals)
-                       .ConsumeValueOrDie();
+  TF_ASSIGN_OR_RETURN(Literal result,
+                      embedded_evaluator.Evaluate(*computation, arg_literals));
 
   evaluated_[call] = std::move(result);
   return Status::OK();
@@ -1227,37 +1251,35 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
   HloEvaluator embedded_evaluator;
   embedded_evaluator.set_dynamic_dimension_inference(
       dynamic_dimension_inference_);
-  Literal result =
-      embedded_evaluator.Evaluate(*readded_computation, arg_literals)
-          .ConsumeValueOrDie();
+  TF_ASSIGN_OR_RETURN(Literal result, embedded_evaluator.Evaluate(
+                                          *readded_computation, arg_literals));
 
   evaluated_[fusion] = std::move(result);
   return Status::OK();
 }
 
 Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
-  const auto& pred = GetEvaluatedLiteralFor(conditional->operand(0));
-  const auto& true_computation_arg =
-      GetEvaluatedLiteralFor(conditional->operand(1));
-  const auto& false_computation_arg =
-      GetEvaluatedLiteralFor(conditional->operand(2));
-
-  auto* true_computation = conditional->true_computation();
-  auto* false_computation = conditional->false_computation();
+  const auto& branch_index_literal =
+      GetEvaluatedLiteralFor(conditional->operand(0));
+  int branch_index;
+  if (conditional->operand(0)->shape().element_type() == PRED) {
+    branch_index = branch_index_literal.Get<bool>({}) ? 0 : 1;
+  } else {
+    branch_index = branch_index_literal.Get<int32>({});
+    if (branch_index < 0 || branch_index >= conditional->branch_count()) {
+      branch_index = conditional->branch_count() - 1;
+    }
+  }
+  const auto& branch_computation_arg =
+      GetEvaluatedLiteralFor(conditional->operand(1 + branch_index));
 
   HloEvaluator embedded_evaluator;
   embedded_evaluator.set_dynamic_dimension_inference(
       dynamic_dimension_inference_);
-  Literal result;
-  if (pred.Get<bool>({})) {
-    result =
-        embedded_evaluator.Evaluate(*true_computation, {&true_computation_arg})
-            .ConsumeValueOrDie();
-  } else {
-    result = embedded_evaluator
-                 .Evaluate(*false_computation, {&false_computation_arg})
-                 .ConsumeValueOrDie();
-  }
+  TF_ASSIGN_OR_RETURN(Literal result,
+                      embedded_evaluator.Evaluate(
+                          *conditional->branch_computation(branch_index),
+                          {&branch_computation_arg}));
 
   evaluated_[conditional] = std::move(result);
   return Status::OK();
@@ -1327,168 +1349,212 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   return Status::OK();
 }
 
-// Key-value sort is a special snowflake: it's templated on two different
-// element types, one for the keys, and one for the values. Jump through some
-// hoops to make this work.
 namespace {
-template <typename KeyType, typename ValueType>
-StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
-                                       const Literal& keys_literal,
-                                       const Literal& values_literal) {
-  auto rank = keys_literal.shape().rank();
-  TF_RET_CHECK(
-      ShapeUtil::SameDimensions(keys_literal.shape(), values_literal.shape()))
-      << "Sort keys and values must have the same dimensions";
-  TF_RET_CHECK(sort->operand_count() >= 2) << "Expected key-value sort";
-  // We need to sort an array of keys and an array of values, where the
-  // sorted order of the values is determined by the keys. The simplest(?)
-  // way to do this is to go to an array-of-pairs representation, sort the
-  // array using the keys, and then go back to pair-of-arrays.
-  VLOG(3) << "HandleSort keys_literal: " << keys_literal.ToString();
-  VLOG(3) << "HandleSort values_literal: " << values_literal.ToString();
-
-  if (rank == 0) {
-    // Nothing to sort.
-    return LiteralUtil::MakeTuple({&keys_literal, &values_literal});
+template <typename NativeT>
+Literal ExtractLiteralFromIndexPositions(const Literal& from,
+                                         absl::Span<int64 const> indices,
+                                         bool extract_as_scalar) {
+  if (extract_as_scalar) {
+    return LiteralUtil::CreateR0<NativeT>(from.Get<NativeT>({indices[0]}));
+  }
+  // We use a InlinedVector here because we need to convert it to an
+  // absl::Span later, and this would not work with std::vector<bool>.
+  absl::InlinedVector<NativeT, 10> values;
+  for (int64 index : indices) {
+    values.push_back(from.Get<NativeT>({index}));
+  }
+  return LiteralUtil::CreateR1<NativeT>(values);
+}
+
+StatusOr<Literal> ExtractFromIndexPositions(const Literal& from,
+                                            absl::Span<int64 const> indices,
+                                            bool extract_as_scalar = false) {
+  if (extract_as_scalar) {
+    CHECK_EQ(indices.size(), 1);
+  }
+  PrimitiveType type = from.shape().element_type();
+  switch (type) {
+    case PRED: {
+      return ExtractLiteralFromIndexPositions<bool>(from, indices,
+                                                    extract_as_scalar);
+    }
+    case U8: {
+      return ExtractLiteralFromIndexPositions<uint8>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case S8: {
+      return ExtractLiteralFromIndexPositions<int8>(from, indices,
+                                                    extract_as_scalar);
+    }
+    case BF16: {
+      return ExtractLiteralFromIndexPositions<bfloat16>(from, indices,
+                                                        extract_as_scalar);
+    }
+    case F16: {
+      return ExtractLiteralFromIndexPositions<Eigen::half>(from, indices,
+                                                           extract_as_scalar);
+    }
+    case U16: {
+      return ExtractLiteralFromIndexPositions<uint16>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S16: {
+      return ExtractLiteralFromIndexPositions<int16>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case F32: {
+      return ExtractLiteralFromIndexPositions<float>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case U32: {
+      return ExtractLiteralFromIndexPositions<uint32>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S32: {
+      return ExtractLiteralFromIndexPositions<int32>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case F64: {
+      return ExtractLiteralFromIndexPositions<double>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case U64: {
+      return ExtractLiteralFromIndexPositions<uint64>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S64: {
+      return ExtractLiteralFromIndexPositions<int64>(from, indices,
+                                                     extract_as_scalar);
+    }
+    default:
+      return InvalidArgument("Unsupported type for Sort: %s",
+                             PrimitiveType_Name(type));
   }
+}
+}  // namespace
 
-  Literal keys_result_literal(keys_literal.shape());
-  Literal values_result_literal(values_literal.shape());
+Status HloEvaluator::HandleSort(HloInstruction* sort) {
+  TF_RET_CHECK(sort->operand_count() >= 1)
+      << "Expected at least 1 operand for sort";
+  for (int64 i = 1; i < sort->operand_count(); ++i) {
+    TF_RET_CHECK(ShapeUtil::SameDimensions(sort->operand(0)->shape(),
+                                           sort->operand(i)->shape()))
+        << "All Sort operands must have the same dimensions";
+  }
+
+  if (VLOG_IS_ON(3)) {
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      VLOG(3) << "HandleSort operand " << i << " literal: "
+              << GetEvaluatedLiteralFor(sort->operand(i)).ToString();
+    }
+  }
+  Shape key_shape = sort->operand(0)->shape();
+  auto rank = key_shape.rank();
+  std::vector<Literal> result_literals;
+  result_literals.reserve(sort->operand_count());
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    result_literals.emplace_back(sort->operand(i)->shape());
+  }
   std::vector<int64> zero_base(rank, 0);
   std::vector<int64> increment(rank, 1);
   int64 sort_dim = sort->dimensions(0);
-  int64 sort_dim_elements = keys_literal.shape().dimensions(sort_dim);
+  int64 sort_dim_elements = key_shape.dimensions(sort_dim);
   increment[sort_dim] = sort_dim_elements;
+  HloEvaluator embedded_evaluator(max_loop_iterations_);
   // Iterate through each dimension except 'sort_dim'.
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-      keys_literal.shape(), zero_base,
-      AsInt64Slice(keys_literal.shape().dimensions()), increment,
+      key_shape, zero_base, AsInt64Slice(key_shape.dimensions()), increment,
       [&](absl::Span<const int64> indices) -> StatusOr<bool> {
-        // Extract a slice from the keys and values literals that correspond to
+        // Extract a slice from each operand literal that corresponds to
         // exactly the row in dimension 'sort_dim'.
         std::vector<int64> limit_indices(indices.begin(), indices.end());
         absl::c_for_each(limit_indices, [](int64& index) { ++index; });
         limit_indices[sort_dim] = sort_dim_elements;
-        TF_ASSIGN_OR_RETURN(auto keys_to_sort,
-                            keys_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& keys_data = keys_to_sort.data<KeyType>();
-        TF_ASSIGN_OR_RETURN(auto values_to_sort,
-                            values_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& values_data = values_to_sort.data<ValueType>();
-        using kv_pair = std::pair<KeyType, ValueType>;
-        std::vector<kv_pair> key_value_vector;
-        key_value_vector.reserve(keys_data.size());
-        for (int i = 0; i < keys_data.size(); ++i) {
-          key_value_vector.push_back(
-              std::make_pair(keys_data[i], values_data[i]));
+        std::vector<Literal> literals_to_sort;
+        literals_to_sort.reserve(sort->operand_count());
+        for (int64 i = 0; i < sort->operand_count(); ++i) {
+          TF_ASSIGN_OR_RETURN(auto literal_to_sort,
+                              GetEvaluatedLiteralFor(sort->operand(i))
+                                  .Slice(indices, limit_indices)
+                                  .Reshape({sort_dim_elements}));
+          literals_to_sort.push_back(std::move(literal_to_sort));
         }
-        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
-                         [](const kv_pair& a, const kv_pair& b) {
-                           return SafeLess<KeyType>(a.first, b.first);
-                         });
-        std::vector<KeyType> result_keys;
-        // We use a InlinedVector here because we need to convert it to an
-        // absl::Span later, and this would not work with std::vector<bool>.
-        absl::InlinedVector<ValueType, 10> result_values;
-        for (const auto& key_value : key_value_vector) {
-          result_keys.push_back(key_value.first);
-          result_values.push_back(key_value.second);
+        std::vector<int64> indices_to_sort(sort_dim_elements);
+        std::iota(indices_to_sort.begin(), indices_to_sort.end(), 0);
+        Status compare_status = Status::OK();
+        auto comparator = [sort, &compare_status, &embedded_evaluator,
+                           &literals_to_sort](int64 a, int64 b) {
+          std::vector<Literal> literals;
+          literals.reserve(2 * sort->operand_count());
+          for (int64 i = 0; i < sort->operand_count(); ++i) {
+            auto lhs = ExtractFromIndexPositions(literals_to_sort[i], {a},
+                                                 /*extract_as_scalar=*/true);
+            if (!lhs.ok()) {
+              compare_status = lhs.status();
+              return false;
+            }
+            literals.push_back(std::move(lhs.ValueOrDie()));
+            auto rhs = ExtractFromIndexPositions(literals_to_sort[i], {b},
+                                                 /*extract_as_scalar=*/true);
+            if (!rhs.ok()) {
+              compare_status = rhs.status();
+              return false;
+            }
+            literals.push_back(std::move(rhs.ValueOrDie()));
+          }
+          std::vector<const Literal*> literal_ptrs;
+          absl::c_transform(literals, std::back_inserter(literal_ptrs),
+                            [](const Literal& literal) { return &literal; });
+
+          auto computed_result =
+              embedded_evaluator.Evaluate(*sort->to_apply(), literal_ptrs);
+          // Clear visit states so that we can use the evaluator again
+          // on the same computation.
+          embedded_evaluator.ResetVisitStates();
+          if (!computed_result.ok()) {
+            compare_status = computed_result.status();
+            return false;
+          }
+          return computed_result.ValueOrDie().Get<bool>({});
+        };
+        if (Cast<HloSortInstruction>(sort)->is_stable()) {
+          std::stable_sort(indices_to_sort.begin(), indices_to_sort.end(),
+                           comparator);
+        } else {
+          std::sort(indices_to_sort.begin(), indices_to_sort.end(), comparator);
+        }
+        if (!compare_status.ok()) {
+          return compare_status;
         }
-        Literal sorted_keys(ShapeUtil::MakeShape(
-            keys_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_keys.PopulateR1(absl::Span<const KeyType>(result_keys));
-        Literal sorted_values(ShapeUtil::MakeShape(
-            values_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_values.PopulateR1(absl::Span<const ValueType>(result_values));
         std::vector<int64> slice_dimensions(rank, 1);
         slice_dimensions[sort_dim] = sort_dim_elements;
         std::vector<int64> start_indices(rank, 0);
-        TF_ASSIGN_OR_RETURN(auto sorted_keys_reshaped,
-                            sorted_keys.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(keys_result_literal.CopySliceFrom(
-            sorted_keys_reshaped, start_indices, indices, slice_dimensions));
-        TF_ASSIGN_OR_RETURN(auto sorted_values_reshaped,
-                            sorted_values.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(values_result_literal.CopySliceFrom(
-            sorted_values_reshaped, start_indices, indices, slice_dimensions));
+        for (int64 i = 0; i < sort->operand_count(); ++i) {
+          TF_ASSIGN_OR_RETURN(
+              Literal sorted_literal,
+              ExtractFromIndexPositions(literals_to_sort[i], indices_to_sort));
+          TF_ASSIGN_OR_RETURN(auto sorted_literal_reshaped,
+                              sorted_literal.Reshape(slice_dimensions));
+          TF_RETURN_IF_ERROR(result_literals[i].CopySliceFrom(
+              sorted_literal_reshaped, start_indices, indices,
+              slice_dimensions));
+        }
         return true;
       }));
 
-  Literal result_tuple;
-  result_tuple =
-      LiteralUtil::MakeTuple({&keys_result_literal, &values_result_literal});
-  VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
-  return std::move(result_tuple);
-}
-
-template <typename KeyType>
-StatusOr<Literal> EvaluateSortCurried(HloInstruction* sort,
-                                      const Literal& keys_literal,
-                                      const Literal& values_literal) {
-  switch (values_literal.shape().element_type()) {
-    case PRED:
-      return EvaluateSortInternal<KeyType, bool>(sort, keys_literal,
-                                                 values_literal);
-    case F32:
-      return EvaluateSortInternal<KeyType, float>(sort, keys_literal,
-                                                  values_literal);
-    case U32:
-      return EvaluateSortInternal<KeyType, uint32>(sort, keys_literal,
-                                                   values_literal);
-    case S32:
-      return EvaluateSortInternal<KeyType, int32>(sort, keys_literal,
-                                                  values_literal);
-    case BF16:
-      return EvaluateSortInternal<KeyType, bfloat16>(sort, keys_literal,
-                                                     values_literal);
-    default:
-      return InvalidArgument("Unsupported type for Sort");
-  }
-}
-
-StatusOr<Literal> EvaluateSort(HloInstruction* sort,
-                               const Literal& keys_literal,
-                               const Literal& values_literal) {
-  switch (sort->operand(0)->shape().element_type()) {
-    case F32:
-      return EvaluateSortCurried<float>(sort, keys_literal, values_literal);
-    case U32:
-      return EvaluateSortCurried<uint32>(sort, keys_literal, values_literal);
-    case S32:
-      return EvaluateSortCurried<int32>(sort, keys_literal, values_literal);
-    case BF16:
-      return EvaluateSortCurried<bfloat16>(sort, keys_literal, values_literal);
-    default:
-      return InvalidArgument("Unsupported type for Sort");
-  }
-}
-}  // namespace
-
-Status HloEvaluator::HandleSort(HloInstruction* sort) {
-  if (!sort->shape().IsTuple()) {
-    return DefaultAction(sort);
+  if (sort->operand_count() == 1) {
+    evaluated_[sort] = std::move(result_literals[0]);
   } else {
-    // This is a really stupid work-around for the fact it's hard to support a
-    // multi-value sort directly, due to the fact we need to template the
-    // evaluation function on all of the value types.
-    std::vector<Literal> sort_results_backing;
-    for (int64 i = 0; i < sort->operand_count(); ++i) {
-      auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)),
-                                 GetEvaluatedLiteralFor(sort->operand(i)));
-      if (!result.ok()) {
-        return result.status();
-      }
-      sort_results_backing.push_back(
-          std::move(result.ValueOrDie().DecomposeTuple()[1]));
-    }
-    std::vector<const Literal*> sort_results;
-    absl::c_transform(sort_results_backing, std::back_inserter(sort_results),
+    std::vector<const Literal*> literal_ptrs;
+    absl::c_transform(result_literals, std::back_inserter(literal_ptrs),
                       [](const Literal& literal) { return &literal; });
-    evaluated_[sort] = LiteralUtil::MakeTuple(sort_results);
-    return Status::OK();
+
+    Literal result_tuple = LiteralUtil::MakeTuple(literal_ptrs);
+    VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
+
+    evaluated_[sort] = std::move(result_tuple);
   }
+  return Status::OK();
 }
 
 Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
@@ -1507,6 +1573,27 @@ Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
   }
 }
 
+Status HloEvaluator::HandleCustomCall(HloInstruction* custom_call) {
+  if (!custom_call_handler_) {
+    // No handler is registered; this means custom-calls are not allowed.
+    return DefaultAction(custom_call);
+  }
+
+  // Evaluate input operands so the handler has access to the operand data.
+  std::vector<const Literal*> operands;
+  operands.reserve(custom_call->operand_count());
+  for (const HloInstruction* operand : custom_call->operands()) {
+    operands.push_back(&GetEvaluatedLiteralFor(operand));
+  }
+
+  // Synchronously issue the handler to populate the instruction output literal.
+  TF_ASSIGN_OR_RETURN(
+      auto output, custom_call_handler_(custom_call, absl::MakeSpan(operands)));
+
+  evaluated_[custom_call] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::Preprocess(HloInstruction* hlo) {
   VLOG(2) << "About to visit HLO: " << hlo->ToString();
   return ShapeUtil::ValidateShape(hlo->shape());
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index ccb8af4fb07fedb054693b78e8bab49527d38700..357975a131d0c7e63c06e96852468b43d97a37f2 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -16,12 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 
+#include <functional>
 #include <memory>
 
 #include "absl/container/node_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -132,6 +134,23 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Enable the fast path for certain operations like dot or convolution.
   void set_use_fast_path(bool value) { use_fast_path_ = value; }
 
+  // Handles evaluation of a custom-call op.
+  // Operand literals are provided in |operands| and implementations must
+  // populate |output| before returning.
+  using CustomCallHandler = std::function<StatusOr<Literal>(
+      HloInstruction* custom_call, absl::Span<const Literal*> operands)>;
+
+  // Sets a handler that is called during evaluation for custom-call ops.
+  // If no handler is defined the default error behavior will occur. The handler
+  // will be provided evaluated literals for all operands and is expected to
+  // return an output literal of the appropriate shape.
+  void set_custom_call_handler(
+      std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                      absl::Span<const Literal*> operands)>
+          handler) {
+    custom_call_handler_ = std::move(handler);
+  }
+
   // Returns the result of a matrix multiply `lhs x rhs`.
   static std::unique_ptr<Array2D<Eigen::half>> MatmulArray2D(
       const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs);
@@ -219,6 +238,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleReduce(HloInstruction* reduce) override;
 
+  Status HandleCustomCall(HloInstruction* custom_call) override;
+
   // Unsupported HLOs, note some of them (such as BatchNorm*) are typically
   // expanded in a semantic-preserving way into other HLOs by adding exanpsion
   // HLO pass to the HLO optimization pass during compilation, which can then be
@@ -310,16 +331,21 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   std::vector<const Literal*> arg_literals_;
 
   // Max loop iterations to execute with no maximum if negative.
-  int64 max_loop_iterations_;
+  int64 max_loop_iterations_ = 0;
 
   // Module-level seed handle.
-  uint64 seed_;
+  uint64 seed_ = 0;
   // RNG engine.
   std::minstd_rand0 engine_;
 
   // DynamicDimensionInference is used to evaluate GetDimensionSize, which
   // returns the dynamic dimension size of its operand.
-  DynamicDimensionInference* dynamic_dimension_inference_;
+  DynamicDimensionInference* dynamic_dimension_inference_ = nullptr;
+
+  // Optional handler for custom_call ops.
+  std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                  absl::Span<const Literal*> operands)>
+      custom_call_handler_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index d34fa48efb2e8ec2d35cdbb24441964f3d7c8b92..335859fb52961aa0f88aec69b73bf073409db451 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -58,12 +58,12 @@ class HloEvaluatorTest : public HloTestBase {
  public:
   HloEvaluatorTest() : use_bfloat16_(false) {}
 
-  Literal Evaluate(absl::Span<const Literal* const> arg_literals = {}) {
+  StatusOr<Literal> Evaluate(
+      absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
       HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_.Evaluate(*m_->entry_computation(), arg_literals)
-        .ConsumeValueOrDie();
+    return evaluator_.Evaluate(*m_->entry_computation(), arg_literals);
   }
 
   // Evaluate function that takes in a local module instead of using m_
@@ -86,7 +86,7 @@ class HloEvaluatorTest : public HloTestBase {
     b.AddInstruction(HloInstruction::CreateUnary(expected.shape(), opcode, c1));
     m_->AddEntryComputation(b.Build());
 
-    Literal result = Evaluate();
+    TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
     auto element_type = expected.shape().element_type();
     if (element_type == F32 || element_type == F64) {
@@ -106,7 +106,25 @@ class HloEvaluatorTest : public HloTestBase {
         HloInstruction::CreateBinary(expected.shape(), opcode, c1, c2));
     m_->AddEntryComputation(b.Build());
 
-    Literal result = Evaluate();
+    TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
+
+    EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+  }
+
+  void TestTernaryOp(HloOpcode opcode, Literal expected, Literal src0,
+                     Literal src1, Literal src2) {
+    HloComputation::Builder b(TestName());
+    auto operand0 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src0)));
+    auto operand1 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src1)));
+    auto operand2 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src2)));
+    b.AddInstruction(HloInstruction::CreateTernary(
+        expected.shape(), opcode, operand0, operand1, operand2));
+    m_->AddEntryComputation(b.Build());
+
+    TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
     EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
   }
@@ -145,13 +163,40 @@ TEST_P(HloEvaluatorBf16Test, DoesClamp) {
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({{0, 4}, {2, 4}});
 
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+// Verifies that clamping of int64 does not cause loss of precision
+TEST_P(HloEvaluatorBf16Test, DoesClampInt64) {
+  auto ones = [](int bits) { return (int64{1} << bits) - 1; };
+
+  auto low =
+      LiteralUtil::CreateR2<int64>({{0, ones(54)}, {ones(54), ones(58)}});
+  auto value = LiteralUtil::CreateR2<int64>({{0, ones(56)}, {0, ones(58)}});
+  auto high = LiteralUtil::CreateR2<int64>(
+      {{ones(54), ones(55)}, {ones(56), ones(58)}});
+
+  Shape shape = low.shape();
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  b.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
+  m_->AddEntryComputation(b.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
+
+  auto expected =
+      LiteralUtil::CreateR2<int64>({{0, ones(55)}, {ones(54), ones(58)}});
+
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
 TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
   auto low = LiteralUtil::CreateR0<float>(0.f);
   auto value = LiteralUtil::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
@@ -166,7 +211,7 @@ TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({{0, 0}, {1, 1}});
 
@@ -191,7 +236,7 @@ TEST_P(HloEvaluatorBf16Test, DoesSelect) {
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate({});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({}));
 
   auto expected = LiteralUtil::CreateR2<float>({{2, 5}, {0, 4}});
 
@@ -254,6 +299,20 @@ TEST_F(HloEvaluatorTest, DoesDivideInt64) {
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
+
+TEST_F(HloEvaluatorTest, DoesClampS64) {
+  auto low = LiteralUtil::CreateR1<int64>(
+      {-8616761059752331528LL, 6780561065411491190LL, -8616761059752331528LL});
+  auto value = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491180LL, 4241131823772864090LL});
+  auto high = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491180LL, 8616761059752331528LL, 3832151243857508051LL});
+  auto expected = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491190LL, 3832151243857508051LL});
+  TestTernaryOp(HloOpcode::kClamp, std::move(expected), std::move(low),
+                std::move(value), std::move(high));
+}
+
 TEST_P(HloEvaluatorBf16Test, DoesDivideDouble) {
   auto lhs = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
@@ -308,6 +367,19 @@ TEST_F(HloEvaluatorTest, DoesNotR2) {
                                     {0, std::numeric_limits<int>::min()}});
   TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand));
 }
+
+TEST_F(HloEvaluatorTest, DoesRealC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_real = LiteralUtil::CreateR1<double>({1, -100});
+  TestUnaryOp(HloOpcode::kReal, std::move(expected_real), std::move(x));
+}
+
+TEST_F(HloEvaluatorTest, DoesImagC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_imag = LiteralUtil::CreateR1<double>({0, 4});
+  TestUnaryOp(HloOpcode::kImag, std::move(expected_imag), std::move(x));
+}
+
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
 TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
@@ -332,7 +404,7 @@ TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
                                                 lhs_instruction, param_rhs2));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate(args);
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate(args));
 
   auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
 
@@ -356,7 +428,7 @@ TEST_F(HloEvaluatorTest, DoesReshape) {
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate({});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({}));
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
   result.EachCell<NativeT>([&](absl::Span<const int64> indices, NativeT value) {
@@ -377,7 +449,7 @@ TEST_F(HloEvaluatorTest, DoesBroadcast) {
       output_literal.shape(), literal_instruction, {1, 2}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate({});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({}));
 
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
@@ -396,7 +468,7 @@ TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
       /*broadcast_dimensions=*/{}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate({});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({}));
 
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
@@ -416,7 +488,7 @@ TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<int64>(
       {{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}});
@@ -438,7 +510,7 @@ TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR1<int64>({100, 200});
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
@@ -458,7 +530,7 @@ TEST_P(HloEvaluatorBf16Test, ConvertWithSameLayout) {
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
@@ -478,7 +550,7 @@ TEST_P(HloEvaluatorBf16Test, ConvertWithDifferentLayout) {
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
@@ -513,7 +585,7 @@ TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
       shape, operand_instruction, padding_value_instruction, padding_config));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<int32>(
       {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}});
@@ -540,7 +612,7 @@ TEST_P(HloEvaluatorBf16Test, Pad4DFloatArrayWithInteriorPadding) {
       shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected_array = absl::make_unique<Array4D<float>>(8, 5, 1, 1);
   expected_array->Fill(kPadValue);
@@ -584,7 +656,7 @@ TEST_P(HloEvaluatorBf16Test, NegativePadding2D) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 }
   auto expected_array = absl::make_unique<Array2D<float>>(1, 5);
@@ -629,7 +701,7 @@ TEST_P(HloEvaluatorBf16Test, NegativeAndInteriorPadding2D) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected_array = absl::make_unique<Array2D<float>>(0, 9);
   auto expected = LiteralUtil::CreateR2FromArray2D<float>(*expected_array);
@@ -668,7 +740,7 @@ TEST_P(HloEvaluatorBf16Test, DotRank2AndRank1) {
                                              DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   // clang-format off
   auto expected_array = Array2D<float>({
@@ -714,7 +786,7 @@ TEST_P(HloEvaluatorBf16Test, DotRank1AndRank2) {
                                              DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR1<float>({22.f, 28.f});
 
@@ -758,7 +830,7 @@ TEST_P(HloEvaluatorBf16Test, DotRank2AndRank2) {
                                              DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected_array = Array2D<float>({
       {22.f, 28.f},
@@ -800,7 +872,8 @@ TEST_P(HloEvaluatorBf16Test, DotRank4AndRank4) {
                                              DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
+
   float expected_1 = 0;
   for (float i = 1.0f; i < 7.0f; ++i) {
     expected_1 += i * i + i;
@@ -856,7 +929,7 @@ TEST_P(HloEvaluatorBf16Test, SimpleConv1D) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array3D<float> expected_array = {{{11.f, 18.f, 9.f}}};
   auto expected = LiteralUtil::CreateR3FromArray3D<float>(expected_array);
@@ -911,7 +984,7 @@ TEST_P(HloEvaluatorBf16Test, Simple4x4Conv2DWith2x2Kernel) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array4D<float> expected_array(1, 1, 4, 4);
   // clang-format off
@@ -995,7 +1068,7 @@ TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensionsReversed) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   // clang-format off
   // Result dimensions: [feature=1, height=1, batch=1, width=2]
@@ -1073,7 +1146,7 @@ TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensions) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   // clang-format off
   // Result dimensions: [feature=1, height=1, batch=1, width=2]
@@ -1133,7 +1206,7 @@ TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithHighPadding) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array4D<float> expected_array(1, 1, 7, 7);
   expected_array.FillWithYX(Array2D<float>({
@@ -1197,7 +1270,7 @@ TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithLowAndHighPadding) {
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array4D<float> expected_array(1, 1, 8, 8);
   expected_array.FillWithYX(Array2D<float>({
@@ -1269,7 +1342,7 @@ TEST_P(HloEvaluatorBf16Test,
       /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array4D<float> expected_array(1, 1, 9, 3);
   expected_array.FillWithYX(Array2D<float>({
@@ -1341,7 +1414,7 @@ TEST_P(HloEvaluatorBf16Test, Conv2DGroupedConvolution) {
       DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   Array4D<float> expected_array(1, 1, 1, 8);
   expected_array.FillWithYX(
@@ -1459,7 +1532,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceAdd) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR1<float>({6, 18});
 
@@ -1511,7 +1584,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({{6, 7}});
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
@@ -1563,7 +1636,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxWindowDilation) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({{11}});
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
@@ -1620,7 +1693,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({{1, 3, 5}, {5, 11, 13}});
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
@@ -1681,7 +1754,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd6D) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   std::vector<int64> output_dims = {4, 3, 3, 3, 4, 4};
   Literal result_literal =
@@ -1713,7 +1786,7 @@ TEST_P(HloEvaluatorBf16Test, StridedSlice) {
                                                /*strides=*/{2, 3}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({
       {3},
@@ -1749,7 +1822,7 @@ TEST_P(HloEvaluatorBf16Test, DynamicSlice) {
       HloInstruction::CreateDynamicSlice(shape, operand, {zero, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({
       {2, 3, 4},
@@ -1787,7 +1860,7 @@ TEST_P(HloEvaluatorBf16Test, DynamicSliceModSlice) {
       HloInstruction::CreateDynamicSlice(shape, operand, {two, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<float>({
       {2, 3, 4},
@@ -1826,7 +1899,7 @@ TEST_P(HloEvaluatorBf16Test, DynamicSliceUpdate) {
       shape, operand, update, {zero, one}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<double>({
       {1, -2, -3},
@@ -1862,7 +1935,7 @@ TEST_P(HloEvaluatorBf16Test, SetAndGetTuples) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto expected = LiteralUtil::CreateR2<double>({
       {1, 2, 3},
@@ -1901,7 +1974,7 @@ TEST_P(HloEvaluatorBf16Test, SetAndGetNestedTuples) {
 
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   auto result_inner_literal =
       LiteralUtil::CreateR2FromArray2D<double>(*operand_array);
@@ -1939,7 +2012,7 @@ TEST_P(HloEvaluatorBf16Test, Reverse) {
   b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1}));
   m_->AddEntryComputation(b.Build());
 
-  Literal result = Evaluate();
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
 
   // clang-format off
   auto expected = LiteralUtil::CreateR4FromArray4D<float>({
@@ -1979,11 +2052,12 @@ TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutions) {
   HloEvaluator evaluator;
   Literal param0_literal = LiteralUtil::CreateR1<float>({1, 2, 3, 4});
   Literal square_literal = LiteralUtil::CreateR1<float>({10, 20, 30, 40});
-  auto result = evaluator.EvaluateWithSubstitutions(
-      add, {{param0, &param0_literal}, {square, &square_literal}});
-  TF_ASSERT_OK(result.status());
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result,
+      evaluator.EvaluateWithSubstitutions(
+          add, {{param0, &param0_literal}, {square, &square_literal}}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result.ValueOrDie()));
+      LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result));
 }
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
@@ -2004,11 +2078,11 @@ TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutionsWithConstantOperand) {
   // Evaluate add with square = {10, 20, 30, 40}.
   HloEvaluator evaluator;
   Literal square_literal = LiteralUtil::CreateR1<float>({10, 20, 30, 40});
-  auto result =
-      evaluator.EvaluateWithSubstitutions(add, {{square, &square_literal}});
-  TF_ASSERT_OK(result.status());
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result,
+      evaluator.EvaluateWithSubstitutions(add, {{square, &square_literal}}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result.ValueOrDie()));
+      LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
@@ -2030,9 +2104,9 @@ ENTRY main {
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
-      Evaluate({&operand, &start_indices})));
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
@@ -2054,9 +2128,9 @@ ENTRY main {
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}),
-      Evaluate({&operand, &start_indices})));
+      LiteralUtil::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
@@ -2078,10 +2152,11 @@ ENTRY main {
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR3<int32>(
           {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}),
-      Evaluate({&operand, &start_indices})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
@@ -2105,9 +2180,9 @@ ENTRY main {
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  EXPECT_TRUE(
-      LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{-1, 1}, {-4, 4}}),
-                             Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      LiteralUtil::CreateR2<int32>({{-1, 1}, {-4, 4}}), result));
 }
 
 TEST_F(HloEvaluatorTest,
@@ -2132,9 +2207,9 @@ ENTRY main {
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  EXPECT_TRUE(
-      LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{-2, 2}, {-1, 1}}),
-                             Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      LiteralUtil::CreateR2<int32>({{-2, 2}, {-1, 1}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
@@ -2156,8 +2231,9 @@ ENTRY main {
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({1, 1});
-  EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{5}}),
-                                     Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{5}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
@@ -2179,9 +2255,9 @@ ENTRY main {
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
-  EXPECT_TRUE(
-      LiteralTestUtil::Equal(LiteralUtil::CreateR3<int32>({{{8}}, {{5}}}),
-                             Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      LiteralUtil::CreateR3<int32>({{{8}}, {{5}}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
@@ -2202,8 +2278,9 @@ ENTRY main {
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
-  EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{}, {}}),
-                                     Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{}, {}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
@@ -2226,9 +2303,9 @@ ENTRY main {
   Literal operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
   Literal start_indices =
       LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
-  EXPECT_TRUE(
-      LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{0, 1}, {2, 1}}),
-                             Evaluate({&operand, &start_indices})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&operand, &start_indices}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      LiteralUtil::CreateR2<int32>({{0, 1}, {2, 1}}), result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
@@ -2257,9 +2334,11 @@ ENTRY main {
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{10, 20, 30}, {4, 5, 6}, {70, 80, 90}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
@@ -2289,9 +2368,11 @@ ENTRY main {
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates =
       LiteralUtil::CreateR2<int32>({{10, 30}, {40, 60}, {70, 90}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{10, 2, 30}, {40, 5, 60}, {70, 8, 90}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
@@ -2321,9 +2402,11 @@ ENTRY main {
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{11, 22, 33}, {4, 5, 6}, {77, 88, 99}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
@@ -2353,9 +2436,11 @@ ENTRY main {
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{10, 40, 90}, {4, 5, 6}, {490, 640, 810}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_P(HloEvaluatorBf16Test, EvaluateScatter_TensorFlowScatter_F32) {
@@ -2386,10 +2471,12 @@ ENTRY main {
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({2, 1});
   Literal updates =
       LiteralUtil::CreateR2<float>({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Near(
       LiteralUtil::CreateR2<float>(
           {{1.1, 2.2, 3.3}, {6.7, 8.6, 8.2}, {8.1, 9.9, 10.6}}),
-      Evaluate({&operand, &scatter_indices, &updates}), ErrorSpec{0.1, 0.01}));
+      result, ErrorSpec{0.1, 0.01}));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
@@ -2419,9 +2506,11 @@ ENTRY main {
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({1, 1});
   Literal updates = LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {84, 105, 126}, {7, 8, 9}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
@@ -2452,9 +2541,11 @@ ENTRY main {
   Literal scatter_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
   Literal updates = LiteralUtil::CreateR3<int32>(
       {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}});
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR2<int32>({{11, 7, 38}, {44, 10, 71}, {77, 13, 104}}),
-      Evaluate({&operand, &scatter_indices, &updates})));
+      result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
@@ -2489,8 +2580,9 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-10, 10}, {-2, 2}, {-3, 3}},  //
                                     {{-40, 40}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      expected, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_F(HloEvaluatorTest,
@@ -2526,8 +2618,9 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-20, 20}, {-10, 10}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},      //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      expected, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
@@ -2558,8 +2651,9 @@ ENTRY main {
   Literal updates = LiteralUtil::CreateR2<int32>({{10}});
   Literal expected =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 10, 6}, {7, 8, 9}});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      expected, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
@@ -2590,8 +2684,9 @@ ENTRY main {
   Literal updates = LiteralUtil::CreateR3<int32>({{{10}}, {{20}}});
   Literal expected =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 20, 6}, {7, 10, 9}});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      expected, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
@@ -2619,8 +2714,9 @@ ENTRY main {
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{}, {}});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      operand, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(operand, result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
@@ -2652,8 +2748,9 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
   Literal updates = LiteralUtil::CreateR2<int32>({{10, 20}, {30, 40}});
   Literal expected = LiteralUtil::CreateR1<int32>({10, 61, 32});
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      expected, Evaluate({&operand, &scatter_indices, &updates})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result,
+                          Evaluate({&operand, &scatter_indices, &updates}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_F(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
@@ -2776,8 +2873,16 @@ TEST_F(HloEvaluatorTest, DoesCompareBF16) {
        {bfloat16(0.25), bfloat16(-0.375), bfloat16(-0.127)}});
   auto expected =
       LiteralUtil::CreateR2<bool>({{false, true, true}, {false, true, true}});
-  TestBinaryOp(HloOpcode::kGe, std::move(expected), std::move(lhs),
-               std::move(rhs));
+
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
+  b.AddInstruction(HloInstruction::CreateCompare(expected.shape(), c1, c2,
+                                                 ComparisonDirection::kGe));
+  m_->AddEntryComputation(b.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate());
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
 TEST_P(HloEvaluatorBf16Test, Bf16Reduction) {
@@ -2801,7 +2906,48 @@ ENTRY main {
   Literal arg = LiteralUtil::CreateR1<bfloat16>(
       {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)});
   Literal expected = LiteralUtil::CreateR0<bfloat16>(bfloat16(44.0f));
-  EXPECT_TRUE(LiteralTestUtil::Equal(expected, Evaluate({&arg})));
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&arg}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
+TEST_F(HloEvaluatorTest, DontFailOnCallUnimplementedOps) {
+  // Infeed triggers unimplemented error within HandleCall, and we verify that
+  // the Evaluator does fail in such case.
+  const string hlo_text = R"(
+HloModule DontFailOnCall
+
+call {
+  token0 = token[] after-all()
+  ROOT infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
+}
+
+ENTRY main {
+  ROOT result = ((u32[3]{0}, pred[]), token[]) call(), to_apply=call
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto statusor = Evaluate();
+  EXPECT_FALSE(statusor.status().ok());
+}
+
+TEST_F(HloEvaluatorTest, DontFailOnFusionWithUnimplementedOps) {
+  // Infeed triggers unimplemented error within HandleFusion, and we verify that
+  // the Evaluator does fail in such case.
+  const string hlo_text = R"(
+HloModule DontFailOnFusion
+
+fused_computation {
+  token0 = token[] after-all()
+  ROOT infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
+}
+
+ENTRY main {
+  ROOT result = ((u32[3]{0}, pred[]), token[]) fusion(), kind=kLoop, calls=fused_computation
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto statusor = Evaluate();
+  EXPECT_FALSE(statusor.status().ok());
 }
 
 TEST_P(HloEvaluatorBf16Test, SliceWithDifferentLayout) {
@@ -2819,7 +2965,7 @@ ENTRY main {
   Literal arg = LiteralUtil::CreateR3WithLayout<float>(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}},
       LayoutUtil::MakeLayout({0, 1, 2}));
-  Literal actual = Evaluate({&arg});
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&arg}));
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
@@ -2841,7 +2987,7 @@ ENTRY main {
   }
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
-  Literal actual = Evaluate({&args[0]});
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&args[0]}));
   if (use_bfloat16_) {
     EXPECT_TRUE(
         absl::c_equal(args[0].data<bfloat16>(), actual.data<bfloat16>()));
@@ -2867,7 +3013,8 @@ ENTRY main {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
-  std::vector<Literal> actual = Evaluate({}).DecomposeTuple();
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, Evaluate({}));
+  std::vector<Literal> actual = literal.DecomposeTuple();
   ASSERT_EQ(actual.size(), 3);
 
   uint32 pow30 = uint32{1} << 30;
@@ -2907,7 +3054,7 @@ ENTRY main {
   Literal size_arg = LiteralUtil::CreateR0<uint32>(3);
   Literal data_arg = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
 
-  Literal actual = Evaluate({&size_arg, &data_arg});
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&size_arg, &data_arg}));
 
   EXPECT_EQ(actual.GetFirstElement<uint32>(), static_cast<uint32>(3));
 }
@@ -2979,7 +3126,8 @@ TEST_F(HloEvaluatorTest, PreserveFusionInputLayout) {
 
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
-  Literal actual = Evaluate({&args[0]});
+
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&args[0]}));
   EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
 }
 
@@ -3000,7 +3148,7 @@ TEST_F(HloEvaluatorTest, PreserveFusionOutputLayout) {
 
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
-  Literal actual = Evaluate({&args[0]});
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual, Evaluate({&args[0]}));
   EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
 }
 
@@ -3022,11 +3170,123 @@ TEST_F(HloEvaluatorTest, PreserveMOFusionOutputLayout) {
 
   TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
-  Literal actual_tuple = Evaluate({&args[0]});
+  TF_ASSERT_OK_AND_ASSIGN(Literal actual_tuple, Evaluate({&args[0]}));
   std::vector<Literal> actual_literals = actual_tuple.DecomposeTuple();
   EXPECT_TRUE(
       absl::c_equal(args[0].data<float>(), actual_literals[0].data<float>()));
 }
 
+// Tests that custom_calls fail to evaluate when no handler is specified.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_NoHandler) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_NoHandler
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  EXPECT_EQ(HloEvaluator().Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::UNIMPLEMENTED);
+}
+
+// Tests when a custom_call handler returns an error.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_HandlerError) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_HandlerError
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        return InternalError("Test error");
+      });
+  EXPECT_EQ(evaluator.Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::INTERNAL);
+}
+
+// Tests the custom_call handler on calls with many inputs.
+// We sum the operands so that we can verify the operand and output literals
+// are properly mapped for access.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_ManyInputs) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_ManyInputs
+    ENTRY kernel_entry {
+      parameter.0 = u32[1]{0} parameter(0)
+      parameter.1 = u32[1]{0} parameter(1)
+      ROOT test_root = u32[1]{0} custom-call(parameter.0, parameter.1),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        EXPECT_EQ(HloOpcode::kCustomCall, custom_call->opcode());
+        EXPECT_EQ("_my_custom_call", custom_call->custom_call_target());
+        EXPECT_EQ(2, custom_call->operand_count());
+        EXPECT_EQ(2, operands.size());
+        auto output = Literal::CreateFromShape(custom_call->shape());
+        auto operand0_data = operands[0]->data<uint32>();
+        auto operand1_data = operands[1]->data<uint32>();
+        auto output_data = output.data<uint32>();
+        output_data[0] = operand0_data[0] + operand1_data[0];
+        return output;
+      });
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      evaluator.Evaluate(*m_->entry_computation(), {&args[0], &args[1]}));
+  auto arg0_data = args[0].data<uint32>();
+  auto arg1_data = args[1].data<uint32>();
+  std::vector<uint32> expected_data = {arg0_data[0] + arg1_data[0]};
+  EXPECT_TRUE(absl::c_equal(expected_data, actual_literal.data<uint32>()));
+}
+
+TEST_F(HloEvaluatorTest, IsFiniteF16) {
+  constexpr absl::string_view hlo_text = R"(
+  HloModule test
+
+  ENTRY IsFiniteTest {
+    c = f16[6] constant({nan, 7, nan, -1, inf, -inf})
+    ROOT is-finite = pred[6] is-finite(c)
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      HloEvaluator().Evaluate(*m_->entry_computation(), {}));
+  EXPECT_THAT(actual_literal.data<bool>(),
+              ::testing::ElementsAre(false, true, false, true, false, false));
+}
+
+TEST_F(HloEvaluatorTest, IsFiniteBf16) {
+  constexpr absl::string_view hlo_text = R"(
+  HloModule test
+
+  ENTRY IsFiniteTest {
+    c = bf16[6] constant({nan, 7, nan, -1, inf, -inf})
+    ROOT is-finite = pred[6] is-finite(c)
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      HloEvaluator().Evaluate(*m_->entry_computation(), {}));
+  EXPECT_THAT(actual_literal.data<bool>(),
+              ::testing::ElementsAre(false, true, false, true, false, false));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index 95a78408b0ba188487cc53acd5f641d0306cd8af..2d8a578985e8f603d4056bee8619725095ebc7bb 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
 
 #include <cmath>
+#include <type_traits>
 
 #include "absl/algorithm/container.h"
 #include "absl/base/casts.h"
@@ -43,46 +44,6 @@ template <typename T>
 using is_complex_t =
     absl::disjunction<std::is_same<T, complex64>, std::is_same<T, complex128>>;
 
-// It's UB to use std::sort with std::less<float>, because of NaNs. Define
-// "safe" less functions which are actually strict weak orders. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0.
-template <
-    typename NativeT,
-    typename std::enable_if<std::is_integral<NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return a < b;
-}
-
-template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  bool lhs_is_negative = std::signbit(a);
-  bool rhs_is_negative = std::signbit(b);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(a);
-  bool rhs_nan = std::isnan(b);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
-  }
-  return a < b;
-}
-
-template <typename NativeT,
-          typename std::enable_if<
-              std::is_same<NativeT, bfloat16>::value ||
-              std::is_same<NativeT, Eigen::half>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return SafeLess(static_cast<float>(a), static_cast<float>(b));
-}
-
 // ToArithmeticSafeType(T t):
 //  - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed
 //    integer, and
@@ -368,10 +329,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleLog1p(HloInstruction* expm1) {
+  Status HandleLog1p(HloInstruction* log1p) {
     TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[expm1],
-        ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) {
+        parent_->evaluated_[log1p],
+        ElementWiseUnaryOp(log1p, [](ElementwiseT elem_operand) {
           return std::log1p(elem_operand);
         }));
     return Status::OK();
@@ -462,9 +423,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleNegate<ReturnT>(negate);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
                         ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
@@ -474,6 +435,23 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, bfloat16>::value ||
+                std::is_same<NativeT, Eigen::half>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleSign(HloInstruction* sign) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return std::isnan(elem_operand)
+                                     ? elem_operand
+                                     : std::copysign(
+                                           elem_operand != ElementwiseT(0),
+                                           elem_operand);
+                        }));
+    return Status::OK();
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
@@ -686,6 +664,23 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleSqrt(HloInstruction* sqrt) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sqrt],
+                        ElementWiseUnaryOp(sqrt, [](ElementwiseT elem_operand) {
+                          return std::sqrt(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleRsqrt(HloInstruction* rsqrt) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[rsqrt],
+        ElementWiseUnaryOp(rsqrt, [](ElementwiseT elem_operand) {
+          return static_cast<ElementwiseT>(1) / std::sqrt(elem_operand);
+        }));
+    return Status::OK();
+  }
+
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
@@ -916,9 +911,29 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleShiftRightLogical<ElementwiseT>(shrl);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  // Special case for integral type due to MSVC's std::isnan being unable to
+  // handle integral type.
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleClamp(HloInstruction* clamp) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
+          return static_cast<ElementwiseT>(
+              std::min(high, std::max(value, low)));
+        };
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    !std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
@@ -926,7 +941,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             return static_cast<ElementwiseT>(NAN);
           }
           return static_cast<ElementwiseT>(
-              std::fmin(high, std::fmax(value, low)));
+              std::min<NativeT>(high, std::max<NativeT>(value, low)));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -1187,7 +1202,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleDot(HloInstruction* dot) override {
-    if (parent_->use_fast_path_) {
+    if (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() == 1 &&
+        parent_->use_fast_path_) {
       return HandleDot<ReturnT>(dot);
     }
     return HandleDotSlowPath(dot);
@@ -1349,12 +1365,17 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                 static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
                 static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
 
-            for (int64 i = accumulate_index_sizes.size() - 1; i >= 0; --i) {
-              int64 value = ++accumulate_index[i];
-              if (value != accumulate_index_sizes[i]) {
-                break;
+            // If there are no contracting dimension accumulate_index_sizes is
+            // empty, do not try to count down from -1 to 0 since it is and
+            // infinite loop.
+            if (!accumulate_index_sizes.empty()) {
+              for (int64 i = accumulate_index_sizes.size() - 1; i >= 0; --i) {
+                int64 value = ++accumulate_index[i];
+                if (value != accumulate_index_sizes[i]) {
+                  break;
+                }
+                accumulate_index[i] = 0;
               }
-              accumulate_index[i] = 0;
             }
           }
 
@@ -1656,73 +1677,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                !is_complex_t<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleSort(HloInstruction* sort) {
-    auto keys = sort->operand(0);
-    TF_RET_CHECK(sort->operand_count() == 1)
-        << "Typed visitor does not support key-value sort";
-
-    const Literal& keys_literal = parent_->GetEvaluatedLiteralFor(keys);
-    int64 sort_dim = sort->dimensions(0);
-    int64 sort_dim_elements = keys->shape().dimensions(sort_dim);
-    int64 rank = keys->shape().rank();
-    if (rank == 0) {
-      // Nothing to sort.
-      parent_->evaluated_[sort] = keys_literal.Clone();
-      return Status::OK();
-    }
-    Literal result_literal(keys_literal.shape());
-    std::vector<int64> zero_base(rank, 0);
-    std::vector<int64> increment(rank, 1);
-    increment[sort_dim] = sort_dim_elements;
-    // Iterate through each dimension except 'sort_dim'.
-    TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-        keys->shape(), zero_base, AsInt64Slice(keys->shape().dimensions()),
-        increment, [&](absl::Span<const int64> indices) -> StatusOr<bool> {
-          // Extract a slice from the literal that corresponds to exactly the
-          // row in dimension 'sort_dim'.
-          std::vector<int64> limit_indices(indices.begin(), indices.end());
-          absl::c_for_each(limit_indices, [](int64& index) { ++index; });
-          limit_indices[sort_dim] = sort_dim_elements;
-          TF_ASSIGN_OR_RETURN(auto row_to_sort,
-                              keys_literal.Slice(indices, limit_indices)
-                                  .Reshape({sort_dim_elements}));
-          const auto& row_data = row_to_sort.data<NativeT>();
-
-          std::vector<NativeT> result_data(row_data.begin(), row_data.end());
-          std::stable_sort(result_data.begin(), result_data.end(),
-                           [](const NativeT& a, const NativeT& b) {
-                             return SafeLess<NativeT>(a, b);
-                           });
-          Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(),
-                                                  {sort_dim_elements}));
-          sorted_row.PopulateR1(absl::Span<const NativeT>(result_data));
-          std::vector<int64> slice_dimensions(rank, 1);
-          slice_dimensions[sort_dim] = sort_dim_elements;
-          TF_ASSIGN_OR_RETURN(auto sorted_row_reshaped,
-                              sorted_row.Reshape(slice_dimensions));
-          std::vector<int64> start_indices(rank, 0);
-          TF_RETURN_IF_ERROR(result_literal.CopySliceFrom(
-              sorted_row_reshaped, start_indices, indices, slice_dimensions));
-          return true;
-        }));
-    parent_->evaluated_[sort] = std::move(result_literal);
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<is_complex_t<NativeT>::value ||
-                                    std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleSort(HloInstruction* sort) {
-    return UnsupportedTypeError(sort);
-  }
-
   Status HandleSort(HloInstruction* sort) override {
-    return HandleSort<ReturnT>(sort);
+    return UnsupportedTypeError(sort);
   }
 
   Status HandleReduce(HloInstruction* hlo) override {
@@ -2752,12 +2708,25 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         const Literal& high =
             parent_->GetEvaluatedLiteralFor(random->operand(1));
 
-        std::uniform_real_distribution<NativeT> generator(
-            low.Get<NativeT>({}), high.Get<NativeT>({}));
-
+        // std::uniform_real_distribution(a, b) can sometimes return a value
+        // equal to b.  Unclear if this is a spec bug or an implementation bug
+        // or WAI [0] [1] [2].  Anyway for our purposes we want a half-open
+        // interval, so we have to re-sample if we get `b` out.
+        //
+        // [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63176
+        // [1] https://bugs.llvm.org/show_bug.cgi?id=18767
+        // [2] http://open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#2524
+        auto low_val = low.Get<NativeT>({});
+        auto high_val = high.Get<NativeT>({});
+        std::uniform_real_distribution<NativeT> generator(low_val, high_val);
         TF_RETURN_IF_ERROR(
             result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
-              return generator(parent_->engine_);
+              while (true) {
+                NativeT v = generator(parent_->engine_);
+                if (v != high_val) {
+                  return v;
+                }
+              }
             }));
         break;
       }
@@ -2891,21 +2860,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       absl::Span<HloInstruction* const> start_indices,
       const Shape& result_shape) {
     std::vector<int64> start;
-    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
-    // between the cases, this currently assumes there is at least 1 index. That
-    // is wrong in the general case, because for scalar indices, if the operand
-    // is scalar, then there are no indices. This problem with resolve itself.
-    const HloInstruction* first_index = start_indices[0];
-    if (first_index->shape().rank() == 1) {
-      auto start_indices_typed =
-          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
-      start = std::vector<int64>(start_indices_typed.begin(),
-                                 start_indices_typed.end());
-    } else {
-      for (HloInstruction* index : start_indices) {
-        start.push_back(
-            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
-      }
+
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
     }
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
@@ -2938,22 +2896,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto result = operand_literal.Clone();
     const auto rank = result.shape().rank();
     std::vector<int64> start;
-    // TODO(b/118437727): Remove the R1 code-path. Note that to distinguish
-    // between the cases, this currently assumes there is at least 1 index. That
-    // is wrong in the general case, because for scalar indices, if the operand
-    // is scalar, then there are no indices. This problem with resolve itself.
-    const HloInstruction* first_index = start_indices[0];
-    if (first_index->shape().rank() == 1) {
-      auto start_indices_typed =
-          parent_->GetEvaluatedLiteralFor(first_index).data<IndexT>();
-      start = std::vector<int64>(start_indices_typed.begin(),
-                                 start_indices_typed.end());
-    } else {
-      for (HloInstruction* index : start_indices) {
-        start.push_back(
-            parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
-      }
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
     }
+
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
     for (int64 i = 0; i < rank; ++i) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 4c7f5e9e7dfb12a8cb699bdf397eab21983342a1..116b32f5f4c772b6a9771e6cf9e5095c7c959775 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -38,21 +38,21 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
-namespace hlo_graph_dumper {
 namespace {
 
 using absl::nullopt;
@@ -259,14 +259,16 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
   // param0), check that the operation being performed is commutative.
   if (root->operand(0) == param1) {
     CHECK_EQ(root->operand(1), param0);
-    switch (root->opcode()) {
-      case HloOpcode::kLe:
-      case HloOpcode::kGe:
-      case HloOpcode::kGt:
-      case HloOpcode::kLt:
-        return nullopt;
-      default:
-        break;
+    if (root->opcode() == HloOpcode()) {
+      switch (root->comparison_direction()) {
+        case ComparisonDirection::kLe:
+        case ComparisonDirection::kGe:
+        case ComparisonDirection::kGt:
+        case ComparisonDirection::kLt:
+          return nullopt;
+        default:
+          break;
+      }
     }
   }
 
@@ -280,18 +282,22 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
       return "min";
     case HloOpcode::kMaximum:
       return "max";
-    case HloOpcode::kLe:
-      return "less-or-equal";
-    case HloOpcode::kGe:
-      return "greater-or-equal";
-    case HloOpcode::kGt:
-      return "greater-than";
-    case HloOpcode::kLt:
-      return "less-than";
-    case HloOpcode::kEq:
-      return "equal-to";
-    case HloOpcode::kNe:
-      return "not-equal-to";
+    case HloOpcode::kCompare: {
+      switch (root->comparison_direction()) {
+        case ComparisonDirection::kLe:
+          return "less-or-equal";
+        case ComparisonDirection::kGe:
+          return "greater-or-equal";
+        case ComparisonDirection::kGt:
+          return "greater-than";
+        case ComparisonDirection::kLt:
+          return "less-than";
+        case ComparisonDirection::kEq:
+          return "equal-to";
+        case ComparisonDirection::kNe:
+          return "not-equal-to";
+      }
+    }
     default:
       return nullopt;
   }
@@ -536,7 +542,12 @@ stylesheet=<
     }
   }
 
-  return StrFormat(fmt, graph_label, StrJoin(edge_css_rules, "\n"));
+  // Browsers require that we URI-encode the contents of our data URI.  (It
+  // seems this was a relatively recent change?) In practice, this means that we
+  // need to escape '#'.
+  return StrFormat(
+      fmt, graph_label,
+      absl::StrReplaceAll(StrJoin(edge_css_rules, "\n"), {{"#", "%23"}}));
 }
 
 string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); }
@@ -825,8 +836,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
     // collected from profiling tools. Those constants may not have a valid
     // literal.
     if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) {
-      return StrFormat("%s (%s)", constant->literal().ToString(),
-                       ShapeUtil::HumanString(constant->shape()));
+      return constant->literal().ToString();
     }
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
@@ -919,27 +929,22 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kClz:
+    case HloOpcode::kCompare:
     case HloOpcode::kComplex:
     case HloOpcode::kConvert:
     case HloOpcode::kCos:
     case HloOpcode::kDivide:
-    case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
     case HloOpcode::kImag:
     case HloOpcode::kIota:
     case HloOpcode::kIsFinite:
-    case HloOpcode::kLe:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kOr:
@@ -949,6 +954,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSelect:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
@@ -957,6 +963,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kSin:
     case HloOpcode::kSlice:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
       // De-emphasize scalar-shaped elementwise ops -- they're generally
@@ -1011,6 +1018,8 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kConvolution:
     case HloOpcode::kDot:
     case HloOpcode::kFft:
+    case HloOpcode::kTriangularSolve:
+    case HloOpcode::kCholesky:
       return kDarkBlue;
     case HloOpcode::kReducePrecision:
       return kRed;
@@ -1037,6 +1046,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kRecvDone:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
+    case HloOpcode::kReplicaId:
       return kBrown;
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
@@ -1248,40 +1258,11 @@ const HloInstruction* HloDotDumper::GetNodeForEdge(
   return instr;
 }
 
-class GraphRendererRegistry {
- public:
-  void SetRenderer(std::shared_ptr<GraphRendererInterface> graph_renderer) {
-    tensorflow::mutex_lock lock(mu_);
-    graph_renderer_ = graph_renderer;
-  }
-
-  std::shared_ptr<GraphRendererInterface> GetDefaultRenderer() {
-    tensorflow::mutex_lock lock(mu_);
-    return graph_renderer_;
-  }
-
-  static GraphRendererRegistry* Default() {
-    static GraphRendererRegistry* registry = new GraphRendererRegistry();
-    return registry;
-  }
-
- private:
-  tensorflow::mutex mu_;
-  std::shared_ptr<GraphRendererInterface> graph_renderer_ GUARDED_BY(mu_);
-};
-
-}  // namespace
-
-Registrar::Registrar(std::shared_ptr<GraphRendererInterface> dumper) {
-  GraphRendererRegistry::Default()->SetRenderer(dumper);
-}
-
-namespace {
-
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
-                                      int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(
+    const HloInstruction* root, int64 radius,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
   absl::flat_hash_map<const HloInstruction*, NodeFilterResult> nodes;
@@ -1297,6 +1278,9 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     if (depth == radius) {
       continue;
     }
+    if (boundary.contains(instr)) {
+      continue;
+    }
 
     // Traverse into instr's operands.
     //
@@ -1436,157 +1420,7 @@ NodeFilter MakeNodeFromToFilter(const HloInstruction* from,
   });
 }
 
-string SaveGraph(const string& graph,
-                 GraphRendererInterface::GraphKind graph_kind,
-                 const string& dest_path) {
-  static std::atomic<int> output_num(0);
-  string file_extension;
-  switch (graph_kind) {
-    case GraphRendererInterface::DOT_GRAPH:
-      file_extension = ".dot";
-      break;
-    case GraphRendererInterface::TF_GRAPHDEF:
-      file_extension = ".pbtxt";
-      break;
-  }
-  string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
-  auto status = Status::OK();
-  auto env = tensorflow::Env::Default();
-  if (!env->CreateUniqueFileName(&path, file_extension)) {
-    status =
-        Status(tensorflow::error::Code::UNKNOWN,
-               StrCat("Failed to create temporary file to dump HLO graph: ",
-                      strerror(errno)));
-  } else {
-    status = tensorflow::WriteStringToFile(env, path, graph);
-  }
-  if (!status.ok()) {
-    LOG(WARNING) << "Saving HLO graph failed: " << status;
-  }
-  return path;
-}
-
-string ExportGraph(const string& graph,
-                   GraphRendererInterface::GraphKind graph_kind,
-                   const DebugOptions& debug_options) {
-  string path = debug_options.xla_hlo_graph_path();
-  if (!path.empty() && !debug_options.xla_hlo_dump_as_html()) {
-    return SaveGraph(graph, graph_kind, path);
-  } else {
-    auto graph_renderer =
-        GraphRendererRegistry::Default()->GetDefaultRenderer();
-    CHECK(graph_renderer != nullptr)
-        << "No registered renderer for the HLO graph. "
-           "Use --xla_hlo_graph_path=PATH --xla_hlo_dump_as_html=false to "
-           "export to local file system";
-    return graph_renderer->RenderGraph(graph, graph_kind, debug_options);
-  }
-}
-
-}  // namespace
-
-string DumpGraph(const HloComputation& computation, const string& label,
-                 const DebugOptions& debug_options,
-                 const HloExecutionProfile* hlo_execution_profile,
-                 bool show_backend_config) {
-  GraphRendererInterface::GraphKind graph_kind;
-  string graph;
-  if (debug_options.xla_hlo_dump_as_graphdef()) {
-    HloTfGraphBuilder builder(debug_options);
-    TF_CHECK_OK(builder.AddComputation(computation));
-    CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
-                                                          &graph));
-    graph_kind = GraphRendererInterface::TF_GRAPHDEF;
-  } else {
-    graph =
-        HloDotDumper(&computation, label, debug_options, show_backend_config,
-                     hlo_execution_profile, NodeFilter())
-            .Dump();
-    graph_kind = GraphRendererInterface::DOT_GRAPH;
-  }
-
-  string graph_url = ExportGraph(graph, graph_kind, debug_options);
-  LOG(INFO) << "computation " << computation.name() << " [" << label
-            << "]: " << graph_url;
-  return graph_url;
-}
-
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config) {
-  auto debug_options = node.GetModule()->config().debug_options();
-  string label =
-      StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
-  string graph =
-      HloDotDumper(node.parent(), label, debug_options, show_backend_config,
-                   /*profile=*/nullptr, filter)
-          .Dump();
-  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
-}
-
-string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
-                          int64 max_nodes, bool show_backend_config) {
-  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
-  auto debug_options = from.GetModule()->config().debug_options();
-
-  bool hit_limit = false;
-  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
-  string label;
-  if (!hit_limit) {
-    label = StrCat("All paths from ", from.name(), " to ", to.name());
-  } else {
-    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
-                   " to ", to.name(),
-                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
-                   "NODES***<br/><br/>");
-  }
-  string graph =
-      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
-                   /*profile=*/nullptr, filter)
-          .Dump();
-  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
-}
-
-void DumpText(const HloModule& module, const string& label,
-              const string& directory_path, bool do_prefix) {
-  Env* env = Env::Default();
-  TF_CHECK_OK(env->RecursivelyCreateDir(directory_path));
-  string prefix = StrCat(env->NowMicros());
-  string filename =
-      do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
-  string path = JoinPath(directory_path, filename);
-  TF_CHECK_OK(WriteStringToFile(
-      env, path,
-      module.ToString(HloPrintOptions().set_print_large_constants(true))));
-  LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
-}
-
-string MaybeDumpHloModule(const HloModule& module, const string& label,
-                          const HloExecutionProfile* profile) {
-  const DebugOptions& debug_options = module.config().debug_options();
-  VLOG(2) << "MaybeDumpHloModule called on module " << module.name()
-          << " with generate_hlo_graph regex \""
-          << debug_options.xla_generate_hlo_graph() << "\"";
-  string graph_url;
-  if (!debug_options.xla_generate_hlo_graph().empty() &&
-      RE2::PartialMatch(module.name(),
-                        debug_options.xla_generate_hlo_graph())) {
-    graph_url =
-        DumpGraph(*module.entry_computation(), label, debug_options, profile);
-  }
-  if (!debug_options.xla_log_hlo_text().empty() &&
-      RE2::PartialMatch(module.name(), debug_options.xla_log_hlo_text())) {
-    LOG(INFO) << "HLO for module " << module.name();
-    LOG(INFO) << "Label: " << label;
-    XLA_LOG_LINES(2, module.ToString());
-  }
-  if (!debug_options.xla_generate_hlo_text_to().empty()) {
-    DumpText(module, label, debug_options.xla_generate_hlo_text_to());
-  }
-  return graph_url;
-}
-
-string WrapDotInHTML(const string& dot) {
+string WrapDotInHtml(absl::string_view dot) {
   static const char html_prefix[] = R"html(
 <!DOCTYPE html>
 <html>
@@ -1627,6 +1461,9 @@ string WrapDotInHTML(const string& dot) {
     var css_data = ''
     if (results !== null) {
         css_data = results[1].replace(/\s*data:.*\s*,/,''); // Strip content-type field.
+        // CSS inside DOT is URL-escaped, so we must unescape it
+        // before we can insert it into SVG.
+        css_data = unescape(css_data);
         dot_data = data.replace(cssregex, ''); // Remove the stylesheet
     }
 
@@ -1694,37 +1531,117 @@ string WrapDotInHTML(const string& dot) {
 </html>
 )html";
 
-  return html_prefix + dot + html_suffix;
+  return absl::StrCat(html_prefix, dot, html_suffix);
 }
 
-string RenderDotAsHTMLFile(const string& dot,
-                           const DebugOptions& debug_options) {
-  string html = WrapDotInHTML(dot);
+tensorflow::mutex url_renderer_mu(tensorflow::LINKER_INITIALIZED);
+std::function<StatusOr<string>(absl::string_view)>* url_renderer
+    GUARDED_BY(url_renderer_mu) = nullptr;
 
-  auto env = tensorflow::Env::Default();
-  std::vector<string> dirs;
-  string output_dir = debug_options.xla_hlo_graph_path();
-  if (output_dir.empty()) {
-    env->GetLocalTempDirectories(&dirs);
-  } else {
-    dirs.push_back(output_dir);
+// Precondition: url_renderer != nullptr.
+//
+// (We specify this as a precondition rather than checking it in here and
+// returning an error because we want to fail quickly when there's no URL
+// renderer available, and this function runs only after we've done all the work
+// of producing dot for the graph.)
+StatusOr<string> WrapDotInFormat(absl::string_view dot,
+                                 RenderedGraphFormat format)
+    EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) {
+  switch (format) {
+    case RenderedGraphFormat::kUrl:
+      CHECK(url_renderer != nullptr)
+          << "Should have checked url_renderer != null before calling.";
+      return (*url_renderer)(dot);
+    case RenderedGraphFormat::kHtml:
+      return WrapDotInHtml(dot);
+    case RenderedGraphFormat::kDot:
+      return string(dot);
   }
-  // Try each directory, as they might be full, have inappropriate
-  // permissions or have different problems at times.
-  string output;
-  for (const string& dir : dirs) {
-    string filename = tensorflow::io::JoinPath(dir, "graph-");
-    if (env->CreateUniqueFileName(&filename, ".html")) {
-      output = filename;
-      break;
-    }
+}
+
+}  // namespace
+
+void RegisterGraphToURLRenderer(
+    std::function<StatusOr<string>(absl::string_view)> renderer) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (url_renderer != nullptr) {
+    LOG(WARNING) << "Multiple calls to RegisterGraphToURLRenderer.  Last call "
+                    "wins, but because order of initialization in C++ is "
+                    "nondeterministic, this may not be what you want.";
+  }
+  delete url_renderer;
+  url_renderer = new std::function<StatusOr<string>(absl::string_view)>(
+      std::move(renderer));
+}
+
+StatusOr<string> RenderGraph(const HloComputation& computation,
+                             absl::string_view label,
+                             const DebugOptions& debug_options,
+                             RenderedGraphFormat format,
+                             const HloExecutionProfile* hlo_execution_profile,
+                             bool show_backend_config) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return Unavailable("Can't render as URL; no URL renderer was registered.");
+  }
+
+  string rendered_dot =
+      HloDotDumper(&computation, label, debug_options, show_backend_config,
+                   hlo_execution_profile, NodeFilter())
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
+}
+
+StatusOr<string> RenderNeighborhoodAround(
+    const HloInstruction& node, int radius, RenderedGraphFormat format,
+    bool show_backend_config,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return FailedPrecondition(
+        "Can't render as URL; no URL renderer was registered.");
   }
-  if (output.empty()) {
-    LOG(FATAL) << "Failed to create unique output file name.";
+
+  string label =
+      StrCat("Neighborhood of ", radius, " nodes around ", node.name());
+  string rendered_dot =
+      HloDotDumper(node.parent(), label,
+                   node.GetModule()->config().debug_options(),
+                   show_backend_config, /*profile=*/nullptr,
+                   MakeNodeRadiusAroundFilter(&node, radius, boundary))
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
+}
+
+StatusOr<string> RenderAllPathsFromTo(const HloInstruction& from,
+                                      const HloInstruction& to, int64 max_nodes,
+                                      RenderedGraphFormat format,
+                                      bool show_backend_config) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return FailedPrecondition(
+        "Can't render as URL; no URL renderer was registered.");
   }
-  TF_CHECK_OK(tensorflow::WriteStringToFile(env, output, html));
-  return "file://" + output;
+
+  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
+  auto debug_options = from.GetModule()->config().debug_options();
+
+  bool hit_limit = false;
+  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
+  string label;
+  if (!hit_limit) {
+    label = StrCat("All paths from ", from.name(), " to ", to.name());
+  } else {
+    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
+                   " to ", to.name(),
+                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
+                   "NODES***<br/><br/>");
+  }
+  string rendered_dot =
+      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
 }
 
-}  // namespace hlo_graph_dumper
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 8e51454ef1cf992386cc7325e32705c08bf7712f..324ac67a6dd565f45bcb32455212ae08c925bc66 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -23,94 +23,76 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 
-namespace xla {
-namespace hlo_graph_dumper {
-
-// Abstract interface for classes that render HLO graphs (e.g. DOT graph,
-// tensorflow GraphDef).
-class GraphRendererInterface {
- public:
-  enum GraphKind {
-    DOT_GRAPH,
-    TF_GRAPHDEF,
-  };
+// This file contains routines for rendering HLO computations into a
+// human-readable graphical format.
+//
+// Fundamentally all graphs are rendered using the DOT language, but they can be
+// packaged three different ways:
+//
+//  - as a raw DOT file, which can be rendered using `graphviz`.
+//
+//  - as an HTML file with an embedded DOT file, which can be viewed in a
+//    browser using a version of graphviz compiled to JavaScript
+//
+//  - as a URL hosted somewhere which somehow embeds the DOT file.
+//
+// This last option is not implemented by default, but you can add a plugin to
+// implement it via RegisterGraphToURLRenderer.
+//
+// TODO(jlebar): Rename this file to hlo_graph_renderer.
 
-  virtual ~GraphRendererInterface() = default;
+namespace xla {
 
-  // Renders a DOT graph, returning a description of the rendered output
-  // (e.g., a URL)
-  virtual string RenderGraph(const string& graph, GraphKind graph_kind,
-                             const DebugOptions& debug_options) = 0;
+// Different formats that a graph can be packaged as.
+enum class RenderedGraphFormat {
+  kDot,
+  kHtml,
+  kUrl,
 };
 
-// Dump the given HLO module if a dump is requested in its debug options. Based
-// on the debug options, either a graph dump, a text dump or both may be
-// generated. If a graph dump is generated, the description (e.g. an URL) is
-// returned; otherwise an empty string is returned.
-string MaybeDumpHloModule(const HloModule& module, const string& label,
-                          const HloExecutionProfile* profile = nullptr);
-
-// Dumps a graph of the computation and returns a description of the rendered
-// graph (e.g., a URL) based on the renderer. The "best" renderer in the
-// registry is used.
-string DumpGraph(const HloComputation& computation, const string& label,
-                 const DebugOptions& debug_options,
-                 const HloExecutionProfile* hlo_execution_profile = nullptr,
-                 bool show_backend_config = false);
-
-// Like DumpGraph, but renders only nodes "near" the given node in the graph.
+// Renders an HLO module as a human-readable visual graph.
+//
+// Note that this only works well for relatively small graphs (no more than a
+// few hundred nodes).  Beyond that, the dot is usually unrenderable,
+// unreadable, or both.  To view such graphs, use a tool such as
+// interactive_graphviz, which calls RenderNeighborhoodAround to render subsets
+// of a graph.
+StatusOr<string> RenderGraph(
+    const HloComputation& computation, absl::string_view label,
+    const DebugOptions& debug_options, RenderedGraphFormat format,
+    const HloExecutionProfile* hlo_execution_profile = nullptr,
+    bool show_backend_config = false);
+
+// Like RenderGraph, but renders only nodes "near" the given node in the graph.
 //
 // The number of nodes dumped is controlled by the radius parameter, which
 // (roughly) corresponds to the max distance a node may be from the primary node
 // before it's omitted from the graph.
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config = false);
-
-// Dumps nodes on any of the paths from `from` to `to`.  If there are more than
-// max_nodes on all paths, restricts to the max_nodes nodes on the shortest
+//
+// The optional boundary specifies a set of boundary nodes, beyond which nodes
+// will be omitted even if they are within the radius.
+StatusOr<string> RenderNeighborhoodAround(
+    const HloInstruction& node, int radius, RenderedGraphFormat format,
+    bool show_backend_config = false,
+    const absl::flat_hash_set<const HloInstruction*>& boundary = {});
+
+// Renders nodes on any of the paths from `from` to `to`.  If there are more
+// than max_nodes on all paths, restricts to the max_nodes nodes on the shortest
 // paths.
-string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
-                          int64 max_nodes, bool show_backend_config = false);
+StatusOr<string> RenderAllPathsFromTo(const HloInstruction& from,
+                                      const HloInstruction& to, int64 max_nodes,
+                                      RenderedGraphFormat format,
+                                      bool show_backend_config = false);
 
-// Dumps the HloModule::ToString() as a file into the provided directory path
-// suffixed with the provided label.
+// Registers a function which implements RenderedGraphFormat::kUrl.
 //
-// If do_prefix is true, a timestamp will be prepended onto the label to
-// construct a filename in the directory path; otherwise, the label is used
-// as the filename directly.
-void DumpText(const HloModule& module, const string& label,
-              const string& directory_path, bool do_prefix = true);
-
-// Renders DOT graph as inline SVG and saves it in an HTML file in a temprary
-// directory or directory specified via --xla_hlo_graph_path. Returns the file
-// URI pointing to the file.
-string RenderDotAsHTMLFile(const string& dot,
-                           const DebugOptions& debug_options);
-
-// Graph renderers may be added using a registration mechanism, e.g.:
-// XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100)
-// The renderer with the highest numeric priority value is used.
-
-#define XLA_REGISTER_GRAPH_RENDERER(factory, ...) \
-  XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, __COUNTER__, ##__VA_ARGS__)
-
-// Internal implementation details below this point.
-
-// Class that registers a graph renderer.
-class Registrar {
- public:
-  Registrar(std::shared_ptr<GraphRendererInterface> dumper);
-};
-
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \
-  static ::xla::hlo_graph_dumper::Registrar                     \
-      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(           \
-          std::make_shared<factory>(), ##__VA_ARGS__)
-
-// __COUNTER__ must go through another macro to be properly expanded
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr) ___##ctr##__object_
+// The input to the function is dot, and the output should be a URL or an error.
+//
+// There can only be one active renderer, and the last call to this function
+// wins.
+void RegisterGraphToURLRenderer(
+    std::function<StatusOr<string>(absl::string_view dot)> renderer);
 
-}  // namespace hlo_graph_dumper
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GRAPH_DUMPER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 064c53252c0ac4d4e7b93169ad7cbee4807cb963..fa1ff49de876ea21073c09616412e535438c8a02 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 
@@ -31,24 +32,13 @@ namespace {
 using absl::StrCat;
 using ::testing::HasSubstr;
 
+using HloGraphDumperTest = HloTestBase;
+
 string TestName() {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
-class DotRenderer : public hlo_graph_dumper::GraphRendererInterface {
- public:
-  string RenderGraph(const string& graph, GraphKind graph_kind,
-                     const DebugOptions& debug_options) override {
-    return graph;
-  }
-
- private:
-  string last_graph_;
-};
-
-XLA_REGISTER_GRAPH_RENDERER(DotRenderer);
-
-TEST(HloGraphDumperTest, NestedFusion) {
+TEST_F(HloGraphDumperTest, NestedFusion) {
   HloComputation::Builder b("b");
 
   // Build param0 + param1 + param2 + param3 + param4.
@@ -90,8 +80,9 @@ TEST(HloGraphDumperTest, NestedFusion) {
           {fused_sums[1], fused_sums[0]}, HloInstruction::FusionKind::kLoop);
 
   // Generate the graph; all nodes should be present.
-  string graph = hlo_graph_dumper::DumpGraph(*root_computation, /*label=*/"",
-                                             DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"", DebugOptions(),
+                                RenderedGraphFormat::kDot));
   for (const HloComputation* computation :
        {root_computation,  //
         inner_fusion->fused_instructions_computation(),
@@ -113,12 +104,13 @@ TEST(HloGraphDumperTest, NestedFusion) {
     }
   }
   ASSERT_NE(inner_sum, nullptr);
-  EXPECT_THAT(
-      hlo_graph_dumper::DumpNeighborhoodAround(*inner_sum, /*radius=*/1),
-      HasSubstr(inner_sum->name()));
+  TF_ASSERT_OK_AND_ASSIGN(string neighborhood_graph,
+                          RenderNeighborhoodAround(*inner_sum, /*radius=*/1,
+                                                   RenderedGraphFormat::kDot));
+  EXPECT_THAT(neighborhood_graph, HasSubstr(inner_sum->name()));
 }
 
-TEST(HloGraphDumperTest, Constant) {
+TEST_F(HloGraphDumperTest, Constant) {
   HloComputation::Builder b("b");
   auto instruction = b.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(-42)));
@@ -126,13 +118,14 @@ TEST(HloGraphDumperTest, Constant) {
   HloModuleConfig config;
   HloModule m(TestName(), config);
   HloComputation* root_computation = m.AddEntryComputation(b.Build());
-  string graph = hlo_graph_dumper::DumpGraph(
-      *root_computation, /*label=*/"an_empty_graph", DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"an_empty_graph",
+                                DebugOptions(), RenderedGraphFormat::kDot));
   EXPECT_THAT(graph, HasSubstr("an_empty_graph"));
   EXPECT_THAT(graph, Not(HasSubstr("i_am_a_constant_root_instruction")));
 }
 
-TEST(HloGraphDumperTest, TupleConstant) {
+TEST_F(HloGraphDumperTest, TupleConstant) {
   Shape tuple_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(S32, {4, 5})});
   HloComputation::Builder b("b");
@@ -144,11 +137,30 @@ TEST(HloGraphDumperTest, TupleConstant) {
   HloModuleConfig config;
   HloModule m(TestName(), config);
   HloComputation* root_computation = m.AddEntryComputation(b.Build(gte));
-  string graph = hlo_graph_dumper::DumpGraph(
-      *root_computation, /*label=*/"tuple_constant", DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"tuple_constant",
+                                DebugOptions(), RenderedGraphFormat::kDot));
   EXPECT_THAT(graph, HasSubstr("tuple_constant"));
   EXPECT_THAT(graph, HasSubstr("constant (f32[3,2], s32[4,5])"));
 }
 
+TEST_F(HloGraphDumperTest, Compare) {
+  const char* hlo_string = R"(
+    HloModule comp
+
+    ENTRY comp {
+      param.0 = f32[10] parameter(0)
+      param.1 = f32[10] parameter(1)
+      ROOT lt = pred[10] compare(param.0, param.1), direction=LT
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph,
+      RenderGraph(*module->entry_computation(), /*label=*/"tuple_constant",
+                  DebugOptions(), RenderedGraphFormat::kDot));
+  EXPECT_THAT(graph, HasSubstr("direction=LT"));
+}
+
 }  // anonymous namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
deleted file mode 100644
index 84c4cf18df69816c611f4eb159ba247320ebc20e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Implementation of an DOT graph renderer that uses Javascript to render DOT to
-// SVG in a browser.
-
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-class GraphHtmlRenderer : public GraphRendererInterface {
- public:
-  string RenderGraph(const string& graph, GraphKind graph_kind,
-                     const DebugOptions& debug_options) override {
-    switch (graph_kind) {
-      case DOT_GRAPH:
-        return RenderDotAsHTMLFile(graph, debug_options);
-      default:
-        LOG(FATAL) << "Only DOT graphs can be rendered";
-    }
-  }
-};
-
-XLA_REGISTER_GRAPH_RENDERER(GraphHtmlRenderer);
-
-}  // namespace
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
index b0b71dece81b561f492767db8c1ccbe3fde442d4..cd13c7a3ac7afe03fb99ed3114bdc6ac0f8ad6a7 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -56,7 +56,8 @@ class HloInputOutputAliasConfig {
 
   HloInputOutputAliasConfig() = default;
 
-  explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
+  explicit HloInputOutputAliasConfig(Shape output_shape)
+      : alias_(output_shape) {}
 
   virtual ~HloInputOutputAliasConfig() = default;
 
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
index a46a107723de30176241aae01b268a8c10d991d3..265bfdf7f989b0821a98c1f774cb408b78f348fe 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 3c92554ad4ec48686d64c74a00f732a3bfee87bc..fe8a178f80fa3469f193aca467fc1bd9a9c0c6bc 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -64,7 +64,35 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     const absl::flat_hash_map<int64, HloInstruction*>& instruction_map,
     const absl::flat_hash_map<int64, HloComputation*>& computation_map) {
   TF_RET_CHECK(!proto.opcode().empty());
-  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
+  HloOpcode opcode;
+  auto opcode_or = StringToHloOpcode(proto.opcode());
+  absl::optional<ComparisonDirection> comparison_direction;
+  if (opcode_or.ok()) {
+    opcode = opcode_or.ConsumeValueOrDie();
+  } else {
+    // Unknown opcode. Try auto-upgrading deprecated "less-than",
+    // "greater-than", etc opcodes, which are now rolled into the kCompare
+    // opcode.
+    if (proto.opcode() == "equal-to") {
+      comparison_direction = ComparisonDirection::kEq;
+    } else if (proto.opcode() == "not-equal-to") {
+      comparison_direction = ComparisonDirection::kNe;
+    } else if (proto.opcode() == "greater-than-or-equal-to") {
+      comparison_direction = ComparisonDirection::kGe;
+    } else if (proto.opcode() == "greater-than") {
+      comparison_direction = ComparisonDirection::kGt;
+    } else if (proto.opcode() == "less-than-or-equal-to") {
+      comparison_direction = ComparisonDirection::kLe;
+    } else if (proto.opcode() == "less-than") {
+      comparison_direction = ComparisonDirection::kLt;
+    }
+    if (comparison_direction) {
+      opcode = HloOpcode::kCompare;
+    } else {
+      return InvalidArgument("Unknown opcode: %s", proto.opcode());
+    }
+  }
+
   TF_RET_CHECK(proto.has_shape());
 
   std::unique_ptr<HloInstruction> instruction;
@@ -82,6 +110,15 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   const auto computations = [&computation_map, &proto](int index) {
     return computation_map.at(proto.called_computation_ids(index));
   };
+  const auto all_computations = [&computation_map, &proto]() {
+    std::vector<HloComputation*> result(proto.called_computation_ids_size());
+    std::transform(proto.called_computation_ids().begin(),
+                   proto.called_computation_ids().end(), result.begin(),
+                   [&computation_map](int64 computation_id) {
+                     return computation_map.at(computation_id);
+                   });
+    return result;
+  };
 
   TF_RET_CHECK(
       absl::c_all_of(proto.operand_ids(),
@@ -96,72 +133,73 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   Shape shape(proto.shape());
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
+  absl::optional<int> arity = HloOpcodeArity(opcode);
+  if (arity) {
+    TF_RET_CHECK(proto.operand_ids_size() == *arity)
+        << proto.opcode() << " instruction should have " << *arity
+        << " operands but sees " << proto.operand_ids_size();
+  }
+
   switch (opcode) {
     // Ops migrated to subclasses.
     case HloOpcode::kBatchNormTraining:
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "BatchNormTraining instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateBatchNormTraining(shape, operands(0), operands(1), operands(2),
                                   proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormInference:
-      TF_RET_CHECK(proto.operand_ids_size() == 5)
-          << "BatchNormInference instruction should have 5 operands but sees "
-          << proto.operand_ids_size();
       instruction = CreateBatchNormInference(
           shape, operands(0), operands(1), operands(2), operands(3),
           operands(4), proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormGrad:
-      TF_RET_CHECK(proto.operand_ids_size() == 5)
-          << "BatchNormGrad instruction should have 5 operands but sees "
-          << proto.operand_ids_size();
       instruction = CreateBatchNormGrad(shape, operands(0), operands(1),
                                         operands(2), operands(3), operands(4),
                                         proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kFft: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Fft instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<int64> fft_length(proto.fft_length().begin(),
                                     proto.fft_length().end());
       instruction = CreateFft(shape, operands(0), proto.fft_type(),
                               absl::Span<const int64>(fft_length));
       break;
     }
+    case HloOpcode::kCompare: {
+      // Auto-upgraded from deprecated opcode skips the following.
+      if (!comparison_direction) {
+        TF_ASSIGN_OR_RETURN(
+            comparison_direction,
+            StringToComparisonDirection(proto.comparison_direction()));
+      }
+      instruction =
+          CreateCompare(shape, operands(0), operands(1), *comparison_direction);
+      break;
+    }
+    case HloOpcode::kTriangularSolve: {
+      instruction = CreateTriangularSolve(shape, operands(0), operands(1),
+                                          proto.triangular_solve_options());
+      break;
+    }
+    case HloOpcode::kCholesky: {
+      instruction =
+          CreateCholesky(shape, operands(0), proto.cholesky_options());
+      break;
+    }
     case HloOpcode::kSend:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Send instruction should have 2 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateSend(operands(0), operands(1), proto.channel_id(),
                                proto.is_host_transfer());
       break;
     case HloOpcode::kSendDone:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "SendDone instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateSendDone(operands(0), proto.is_host_transfer());
       break;
     case HloOpcode::kRecv:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Recv instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateRecv(shape.tuple_shapes(0), operands(0),
                                proto.channel_id(), proto.is_host_transfer());
       break;
     case HloOpcode::kRecvDone:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "RecvDone instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateRecvDone(operands(0), proto.is_host_transfer());
       break;
     case HloOpcode::kReverse:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Reverse instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateReverse(shape, operands(0),
                                   std::vector<int64>(proto.dimensions().begin(),
                                                      proto.dimensions().end()));
@@ -173,6 +211,26 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       instruction =
           CreateConcatenate(shape, all_operands(), proto.dimensions(0));
       break;
+    case HloOpcode::kConditional: {
+      TF_RET_CHECK(proto.called_computation_ids_size() > 0)
+          << "conditional should have at least 1 called computation";
+      if (operands(0)->shape().element_type() == PRED) {
+        TF_RET_CHECK(proto.called_computation_ids_size() == 2)
+            << "conditional should have exactly 2 called computations but got "
+            << proto.called_computation_ids_size();
+      }
+      TF_RET_CHECK(proto.operand_ids_size() ==
+                   proto.called_computation_ids_size() + 1)
+          << "conditional should have one branch_index operand plus one "
+             "operand per called computation but got "
+          << proto.operand_ids_size() << " operands for "
+          << proto.called_computation_ids_size() << " branch computations";
+      auto cond_operands = all_operands();
+      instruction =
+          CreateConditional(shape, cond_operands[0], all_computations(),
+                            absl::MakeSpan(cond_operands).subspan(1));
+      break;
+    }
     case HloOpcode::kReduce:
       TF_RET_CHECK(proto.operand_ids_size() % 2 == 0)
           << "Reduce instruction should have an even number of operands but "
@@ -201,26 +259,21 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.dimensions().size() == 1)
           << "Sort instruction should have 1 dimension";
+      TF_RET_CHECK(proto.called_computation_ids_size() == 1)
+          << "Sort instruction should one called computation but sees "
+          << proto.called_computation_ids_size();
       auto sort_operands = all_operands();
-      HloInstruction* keys = sort_operands[0];
-      instruction = CreateSort(
-          shape, proto.dimensions(0), keys,
-          absl::Span<HloInstruction* const>(sort_operands).subspan(1));
+      instruction = CreateSort(shape, proto.dimensions(0), all_operands(),
+                               computations(0), proto.is_stable());
       break;
     }
     case HloOpcode::kTranspose:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Transpose instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateTranspose(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
     case HloOpcode::kBroadcast:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Broadcast instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateBroadcast(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
@@ -233,9 +286,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       instruction = CreateMap(shape, all_operands(), computations(0));
       break;
     case HloOpcode::kSlice: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Slice instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<int64> slice_starts, slice_limits, slice_strides;
       for (const HloInstructionProto::SliceDimensions& slice_dimensions :
            proto.slice_dimensions()) {
@@ -259,9 +309,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kTrace: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Trace instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_literal());
       TF_ASSIGN_OR_RETURN(auto literal,
                           Literal::CreateFromProto(proto.literal()));
@@ -295,18 +342,16 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kParameter:
       instruction =
           CreateParameter(proto.parameter_number(), shape, proto.name());
+      if (!proto.parameter_replication().replicated_at_leaf_buffers().empty()) {
+        instruction->set_parameter_replicated_at_leaf_buffers(
+            proto.parameter_replication().replicated_at_leaf_buffers());
+      }
       break;
     case HloOpcode::kGetTupleElement:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "GetTupleElement instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateGetTupleElement(shape, operands(0), proto.tuple_index());
       break;
     case HloOpcode::kReducePrecision:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "ReducePrecision instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateReducePrecision(
           shape, operands(0), proto.exponent_bits(), proto.mantissa_bits());
       break;
@@ -316,16 +361,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Infeed should have a tuple shape with 2 operands, but has: "
           << shape;
       const Shape& data_shape = ShapeUtil::GetTupleElementShape(shape, 0);
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Infeed instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
     case HloOpcode::kOutfeed: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Outfeed instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       Shape outfeed_shape(proto.outfeed_shape());
       TF_RETURN_IF_ERROR(
           ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape));
@@ -359,9 +398,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kCollectivePermute: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "CollectivePermute instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<std::pair<int64, int64>> source_target_pairs(
           proto.source_target_pairs_size());
       for (int i = 0; i < source_target_pairs.size(); i++) {
@@ -372,10 +408,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           CreateCollectivePermute(shape, operands(0), source_target_pairs);
       break;
     }
+    case HloOpcode::kReplicaId: {
+      instruction = CreateReplicaId();
+      break;
+    }
     case HloOpcode::kConvolution: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Convolution instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_window());
       TF_RET_CHECK(proto.has_convolution_dimension_numbers());
       PrecisionConfig precision_config = proto.precision_config();
@@ -389,9 +426,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kReduceWindow:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "ReduceWindow instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "ReduceWindow should have 1 called computation but sees "
           << proto.called_computation_ids_size();
@@ -399,9 +433,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                        proto.window(), computations(0));
       break;
     case HloOpcode::kSelectAndScatter:
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "SelectAndScatter instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.called_computation_ids_size() == 2)
           << "SelectAndScatter should have 2 called computations but sees "
           << proto.called_computation_ids_size();
@@ -444,9 +475,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
               std::max(static_cast<int64>(proto.batch_group_count()), 1LL));
       break;
     case HloOpcode::kPad:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Pad instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_padding_config());
       instruction =
           CreatePad(shape, operands(0), operands(1), proto.padding_config());
@@ -492,9 +520,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kGather: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Gather instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_gather_dimension_numbers())
           << "Gather instruction should have GatherDimensionNumbers set.";
       std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers =
@@ -509,9 +534,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kScatter: {
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "Scatter instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_scatter_dimension_numbers())
           << "Scatter instruction should have ScatterDimensionNumbers set.";
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
@@ -533,9 +555,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kDot: {
       TF_RET_CHECK(proto.has_dot_dimension_numbers())
           << "Dot instruction should have dot_dimension_numbers.";
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Dot instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       PrecisionConfig precision_config = proto.precision_config();
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
@@ -545,9 +564,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kDomain: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Domain instruction should have 1 operands but sees "
-          << proto.operand_ids_size();
       std::shared_ptr<const HloSharding> entry_hlo_sharding;
       std::shared_ptr<const HloSharding> exit_hlo_sharding;
       if (proto.has_domain_entry_sharding()) {
@@ -569,7 +585,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kGetDimensionSize:
-      TF_RET_CHECK(proto.operand_ids_size() == 1);
       TF_RET_CHECK(proto.dimensions_size() == 1);
       instruction =
           CreateGetDimensionSize(shape, operands(0), proto.dimensions(0));
@@ -689,8 +704,10 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       break;
     default:
@@ -710,15 +727,9 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kAtan2:
     case HloOpcode::kDivide:
     case HloOpcode::kComplex:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kPower:
     case HloOpcode::kRemainder:
     case HloOpcode::kSubtract:
@@ -783,6 +794,24 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
                                               fft_length);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCompare(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+    ComparisonDirection direction) {
+  return absl::make_unique<HloCompareInstruction>(shape, lhs, rhs, direction);
+}
+
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateTriangularSolve(const Shape& shape, HloInstruction* a,
+                                      HloInstruction* b,
+                                      const TriangularSolveOptions& options) {
+  return absl::make_unique<HloTriangularSolveInstruction>(shape, a, b, options);
+}
+
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCholesky(
+    const Shape& shape, HloInstruction* a, const CholeskyOptions& options) {
+  return absl::make_unique<HloCholeskyInstruction>(shape, a, options);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
     const DotDimensionNumbers& dimension_numbers,
@@ -825,6 +854,11 @@ HloInstruction::CreateCollectivePermute(
       shape, operand, source_target_pairs);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReplicaId() {
+  return absl::WrapUnique(
+      new HloInstruction(HloOpcode::kReplicaId, ShapeUtil::MakeShape(U32, {})));
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
     const Shape& infeed_shape, HloInstruction* token_operand,
     const string& config) {
@@ -932,6 +966,21 @@ HloInstruction::CreateAddDependency(HloInstruction* data_operand,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConditional(
+    const Shape& shape, HloInstruction* branch_index,
+    absl::Span<HloComputation* const> branch_computations,
+    absl::Span<HloInstruction* const> branch_computation_args) {
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kConditional, shape));
+  instruction->AppendOperand(branch_index);
+  CHECK_EQ(branch_computations.size(), branch_computation_args.size());
+  for (int i = 0; i < branch_computations.size(); ++i) {
+    instruction->called_computations_.push_back(branch_computations[i]);
+    instruction->AppendOperand(branch_computation_args[i]);
+  }
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
     const Shape& shape, HloInstruction* operand,
     absl::Span<const int64> start_indices,
@@ -1141,9 +1190,11 @@ HloInstruction::CreateBroadcastSequence(
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSort(
-    const Shape& shape, int64 dimension, HloInstruction* keys,
-    absl::Span<HloInstruction* const> values) {
-  return absl::make_unique<HloSortInstruction>(shape, dimension, keys, values);
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable) {
+  return absl::make_unique<HloSortInstruction>(shape, dimension, operands,
+                                               compare, is_stable);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
@@ -1299,6 +1350,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
     case HloOpcode::kFft:
+    case HloOpcode::kCompare:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
@@ -1335,6 +1387,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
+    case HloOpcode::kCholesky:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1355,8 +1409,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateUnary(shape, opcode_, new_operands[0]);
@@ -1368,12 +1424,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kDivide:
     case HloOpcode::kMultiply:
     case HloOpcode::kSubtract:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kNe:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -1425,10 +1475,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
           CreateWhile(shape, while_condition(), while_body(), new_operands[0]);
       break;
     case HloOpcode::kConditional:
-      CHECK_EQ(new_operands.size(), 3);
-      clone = CreateConditional(shape, new_operands[0], new_operands[1],
-                                true_computation(), new_operands[2],
-                                false_computation());
+      CHECK_EQ(new_operands.size(), branch_count() + 1);
+      clone = CreateConditional(shape, new_operands[0],
+                                absl::MakeSpan(branch_computations()),
+                                new_operands.subspan(1));
       break;
     case HloOpcode::kAfterAll:
       if (new_operands.empty()) {
@@ -1441,6 +1491,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 2);
       clone = CreateAddDependency(new_operands[0], new_operands[1]);
       break;
+    case HloOpcode::kReplicaId:
+      CHECK_EQ(new_operands.size(), 0);
+      clone = CreateReplicaId();
+      break;
   }
   // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
@@ -1685,38 +1739,35 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCos:
     case HloOpcode::kDivide:
     case HloOpcode::kDynamicUpdateSlice:
-    case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
-    case HloOpcode::kLe:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
     case HloOpcode::kAnd:
     case HloOpcode::kNot:
     case HloOpcode::kOr:
     case HloOpcode::kXor:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kNegate:
     case HloOpcode::kPower:
     case HloOpcode::kReal:
     case HloOpcode::kRemainder:
     case HloOpcode::kReshape:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSelect:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
     case HloOpcode::kTuple:
@@ -1732,16 +1783,16 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCall:
       return eq_computations(to_apply(), other.to_apply());
     case HloOpcode::kConditional:
-      return eq_computations(true_computation(), other.true_computation()) &&
-             eq_computations(false_computation(), other.false_computation());
-
-    case HloOpcode::kWhile: {
-      if (eq_computations(while_body(), other.while_body()) &&
-          eq_computations(while_condition(), other.while_condition())) {
-        return true;
+      for (int j = 0; j < branch_count(); ++j) {
+        if (!eq_computations(branch_computation(j),
+                             other.branch_computation(j))) {
+          return false;
+        }
       }
-      return false;
-    }
+      return true;
+    case HloOpcode::kWhile:
+      return (eq_computations(while_body(), other.while_body()) &&
+              eq_computations(while_condition(), other.while_condition()));
 
     // Ops migrated to subclasses should never come to this line.
     // TODO(b/80131774): Remove this switch when migration is complete.
@@ -1749,6 +1800,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
     case HloOpcode::kFft:
+    case HloOpcode::kCompare:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
@@ -1785,6 +1837,8 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
+    case HloOpcode::kCholesky:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -1839,7 +1893,11 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
       << "this shape: " << ShapeUtil::HumanString(shape())
       << ", replacement shape: "
       << ShapeUtil::HumanString(new_producer->shape());
+  return ReplaceUseWithDifferentShape(user, new_producer);
+}
 
+Status HloInstruction::ReplaceUseWithDifferentShape(
+    HloInstruction* user, HloInstruction* new_producer) {
   VLOG(3) << "Replacing uses of " << name() << " in " << user->name()
           << " with " << new_producer->name();
 
@@ -1935,6 +1993,7 @@ HloComputation* HloInstruction::to_apply() const {
     case HloOpcode::kReduce:
     case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       return called_computations_[0];
     default:
@@ -1954,6 +2013,7 @@ void HloInstruction::set_to_apply(HloComputation* computation) {
     case HloOpcode::kReduce:
     case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       called_computations_[0] = computation;
       break;
@@ -1996,28 +2056,41 @@ HloInstruction* HloInstruction::while_init() const {
 
 HloComputation* HloInstruction::true_computation() const {
   CHECK_EQ(HloOpcode::kConditional, opcode_);
+  CHECK_EQ(PRED, operand(0)->shape().element_type());
   return called_computations_[kTrueComputationIndex];
 }
 
 HloComputation* HloInstruction::false_computation() const {
   CHECK_EQ(HloOpcode::kConditional, opcode_);
+  CHECK_EQ(PRED, operand(0)->shape().element_type());
   return called_computations_[kFalseComputationIndex];
 }
 
-void HloInstruction::set_true_computation(HloComputation* true_computation) {
-  // Don't allow changing the computation for fused instructions so we don't
-  // have to recompute called_instructions for the entire fusion instruction.
-  CHECK(!IsFused());
-  CHECK_EQ(HloOpcode::kConditional, opcode_);
-  called_computations_[kTrueComputationIndex] = true_computation;
+const std::vector<HloComputation*>& HloInstruction::branch_computations()
+    const {
+  CHECK(HloOpcode::kConditional == opcode_);
+  return called_computations_;
+}
+
+int HloInstruction::branch_count() const {
+  CHECK(HloOpcode::kConditional == opcode_);
+  return called_computations_.size();
+}
+
+HloComputation* HloInstruction::branch_computation(int b) const {
+  CHECK(HloOpcode::kConditional == opcode_);
+  CHECK_GE(b, 0);
+  CHECK_LT(b, called_computations_.size());
+  return called_computations_[b];
 }
 
-void HloInstruction::set_false_computation(HloComputation* false_computation) {
+void HloInstruction::set_branch_computation(int b,
+                                            HloComputation* computation) {
   // Don't allow changing the computation for fused instructions so we don't
   // have to recompute called_instructions for the entire fusion instruction.
   CHECK(!IsFused());
   CHECK_EQ(HloOpcode::kConditional, opcode_);
-  called_computations_[kFalseComputationIndex] = false_computation;
+  called_computations_[b] = computation;
 }
 
 string HloInstruction::SignatureString() const {
@@ -2064,8 +2137,10 @@ bool HloInstruction::IsElementwiseImpl(
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
     case HloOpcode::kReducePrecision:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       CHECK_EQ(1, operand_count());
       return true;
@@ -2073,17 +2148,12 @@ bool HloInstruction::IsElementwiseImpl(
     // Binary elementwise operations, the same as in IsElementwiseBinary().
     case HloOpcode::kAdd:
     case HloOpcode::kAtan2:
+    case HloOpcode::kCompare:
     case HloOpcode::kComplex:
     case HloOpcode::kDivide:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kPower:
     case HloOpcode::kRemainder:
     case HloOpcode::kSubtract:
@@ -2218,15 +2288,27 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
       extra.push_back(
           StrCat("scatter=", PrintName(scatter()->name(), options)));
     } else if (opcode() == HloOpcode::kConditional) {
-      extra.push_back(StrCat("true_computation=",
-                             PrintName(true_computation()->name(), options)));
-      extra.push_back(StrCat("false_computation=",
-                             PrintName(false_computation()->name(), options)));
+      if (operand(0)->shape().element_type() == PRED) {
+        extra.push_back(StrCat("true_computation=",
+                               PrintName(true_computation()->name(), options)));
+        extra.push_back(
+            StrCat("false_computation=",
+                   PrintName(false_computation()->name(), options)));
+      } else {
+        extra.push_back(StrCat(
+            "branch_computations={",
+            StrJoin(branch_computations(), ", ",
+                    [&](string* out, const HloComputation* computation) {
+                      StrAppend(out, PrintName(computation->name(), options));
+                    }),
+            "}"));
+      }
     } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
                opcode() == HloOpcode::kReduceWindow ||
                opcode() == HloOpcode::kReduce ||
                opcode() == HloOpcode::kAllReduce ||
-               opcode() == HloOpcode::kScatter) {
+               opcode() == HloOpcode::kScatter ||
+               opcode() == HloOpcode::kSort) {
       extra.push_back(
           StrCat("to_apply=", PrintName(to_apply()->name(), options)));
     } else if (!called_computations().empty()) {
@@ -2252,10 +2334,20 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
         extra.push_back(StrCat("scatter=\n", scatter()->ToString(new_options)));
         break;
       case HloOpcode::kConditional:
-        extra.push_back(StrCat("true_computation=\n",
-                               true_computation()->ToString(new_options)));
-        extra.push_back(StrCat("false_computation=\n",
-                               false_computation()->ToString(new_options)));
+        if (operand(0)->shape().element_type() == PRED) {
+          extra.push_back(StrCat("true_computation=\n",
+                                 true_computation()->ToString(new_options)));
+          extra.push_back(StrCat("false_computation=\n",
+                                 false_computation()->ToString(new_options)));
+        } else {
+          extra.push_back(StrCat(
+              "branch_computations={\n",
+              StrJoin(branch_computations(), ",\n",
+                      [&](string* out, const HloComputation* computation) {
+                        StrAppend(out, computation->ToString(new_options));
+                      }),
+              "\n}"));
+        }
         break;
       case HloOpcode::kCall:
       case HloOpcode::kMap:
@@ -2263,6 +2355,7 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
       case HloOpcode::kReduce:
       case HloOpcode::kAllReduce:
       case HloOpcode::kScatter:
+      case HloOpcode::kSort:
         extra.push_back(
             StrCat("to_apply=\n", to_apply()->ToString(new_options)));
         break;
@@ -2403,12 +2496,7 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleGetTupleElement(this);
     case HloOpcode::kParameter:
       return visitor->HandleParameter(this);
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kNe:
+    case HloOpcode::kCompare:
       return visitor->HandleCompare(this);
     case HloOpcode::kComplex:
       return visitor->HandleComplex(this);
@@ -2464,6 +2552,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleAllToAll(this);
     case HloOpcode::kCollectivePermute:
       return visitor->HandleCollectivePermute(this);
+    case HloOpcode::kReplicaId:
+      return visitor->HandleReplicaId(this);
     case HloOpcode::kTuple:
       return visitor->HandleTuple(this);
     case HloOpcode::kMap:
@@ -2498,6 +2588,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleCos(this);
     case HloOpcode::kSin:
       return visitor->HandleSin(this);
+    case HloOpcode::kSqrt:
+      return visitor->HandleSqrt(this);
+    case HloOpcode::kRsqrt:
+      return visitor->HandleRsqrt(this);
     case HloOpcode::kReal:
       return visitor->HandleReal(this);
     case HloOpcode::kImag:
@@ -2566,6 +2660,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleIota(this);
     case HloOpcode::kGetDimensionSize:
       return visitor->HandleGetDimensionSize(this);
+    case HloOpcode::kTriangularSolve:
+      return visitor->HandleTriangularSolve(this);
+    case HloOpcode::kCholesky:
+      return visitor->HandleCholesky(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -2756,11 +2854,6 @@ bool HloInstruction::IsElementwise() const {
   return IsElementwiseImpl(absl::nullopt);
 }
 
-bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const {
-  CHECK(IsElementwise());
-  return !ShapeUtil::SameDimensions(shape(), operand(operand_idx)->shape());
-}
-
 bool HloInstruction::IsElementwiseOnOperand(int64 operand_idx) const {
   return IsElementwiseImpl(operand_idx);
 }
@@ -2870,9 +2963,7 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const {
       }
       return UseKind::kReuse;
     default:
-      return IsElementwise() && !ImplicitlyBroadcastsOperand(i)
-                 ? UseKind::kUse
-                 : UseKind::kReuse;
+      return IsElementwise() ? UseKind::kUse : UseKind::kReuse;
   }
 }
 
@@ -3277,6 +3368,19 @@ int64 HloInstruction::parameter_number() const {
   return Cast<HloParameterInstruction>(this)->parameter_number();
 }
 
+void HloInstruction::set_parameter_replicated_at_leaf_buffers(
+    absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+  return Cast<HloParameterInstruction>(this)
+      ->set_parameter_replicated_at_leaf_buffers(
+          parameter_replicated_at_leaf_buffers);
+}
+
+const absl::optional<std::vector<bool>>&
+HloInstruction::parameter_replicated_at_leaf_buffers() const {
+  return Cast<HloParameterInstruction>(this)
+      ->parameter_replicated_at_leaf_buffers();
+}
+
 int64 HloInstruction::tuple_index() const {
   return Cast<HloGetTupleElementInstruction>(this)->tuple_index();
 }
@@ -3433,4 +3537,17 @@ const DomainMetadata& HloInstruction::operand_side_metadata() const {
 const DomainMetadata& HloInstruction::user_side_metadata() const {
   return Cast<HloDomainInstruction>(this)->user_side_metadata();
 }
+
+ComparisonDirection HloInstruction::comparison_direction() const {
+  return Cast<HloCompareInstruction>(this)->direction();
+}
+
+const TriangularSolveOptions& HloInstruction::triangular_solve_options() const {
+  return Cast<HloTriangularSolveInstruction>(this)->triangular_solve_options();
+}
+
+const CholeskyOptions& HloInstruction::cholesky_options() const {
+  return Cast<HloCholeskyInstruction>(this)->cholesky_options();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 2c29b6c243bffccc346af12277dd4fc061250cbe..6f6a1b8505ede690ca3926db79be7bdd1f34c51a 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -37,6 +37,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/comparison_util.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -47,6 +48,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -384,6 +386,14 @@ class HloInstruction {
 
   // Creates a random number generation instruction that fills a shape with
   // random numbers from a given distribution.
+  //
+  // The parameters to the instruction are interpreted as follows:
+  //
+  //  - If `distribution` is RNG_UNIFORM, generates a number in range
+  //    [param0, param1).
+  //
+  //  - If `distribution` is RNG_NORMAL, generates a normally-distributed value
+  //    with mean `param0` and standard deviation `param1`.
   static std::unique_ptr<HloInstruction> CreateRng(
       const Shape& shape, RandomDistribution distribution,
       absl::Span<HloInstruction* const> parameters);
@@ -435,6 +445,18 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand, FftType fft_type,
       absl::Span<const int64> fft_length);
 
+  // Creates a compare op, performing the comparison specified in direction.
+  static std::unique_ptr<HloInstruction> CreateCompare(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+      ComparisonDirection direction);
+
+  static std::unique_ptr<HloInstruction> CreateTriangularSolve(
+      const Shape& shape, HloInstruction* a, HloInstruction* b,
+      const TriangularSolveOptions& options);
+
+  static std::unique_ptr<HloInstruction> CreateCholesky(
+      const Shape& shape, HloInstruction* a, const CholeskyOptions& options);
+
   // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
   // dimensions specified in 'dimension_numbers'.
   static std::unique_ptr<HloInstruction> CreateDot(
@@ -489,11 +511,14 @@ class HloInstruction {
   // Data is sent/received according to the (source_replica_id,
   // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a
   // target_replica_id in any pair, the output on that replica is a tensor
-  // conssits of 0(s) in `shape`.
+  // consists of 0(s) in `shape`.
   static std::unique_ptr<HloInstruction> CreateCollectivePermute(
       const Shape& shape, HloInstruction* operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+  // Creates an instruction that returns a U32 replica ID.
+  static std::unique_ptr<HloInstruction> CreateReplicaId();
+
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
@@ -595,7 +620,6 @@ class HloInstruction {
   // f_2 = f(f_1.tuple_element(0), ..., f_1.tuple_element(N), input0.value1,
   // ..., inputN.value1)
   // ...
-  // TODO(b/112040122): Add support to this in HLO passes and in backends.
   static std::unique_ptr<HloInstruction> CreateReduce(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       absl::Span<HloInstruction* const> init_values,
@@ -668,10 +692,15 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       absl::Span<const int64> dimensions);
 
-  // Creates a sort op, with a keys operand, and optional values operands.
+  // Creates a n-ary sort op with a 'compare' computation which is used for
+  // comparisons in the sorting algorithm. 'compare' gets 2 * n parameters,
+  // where parameters 2 * i and 2 * i + 1 are the values of the i-th operand at
+  // specific index positions which should be compared, and should return a
+  // PRED. 'is_stable' specifies whether stable sorting is required.
   static std::unique_ptr<HloInstruction> CreateSort(
-      const Shape& shape, int64 dimension, HloInstruction* keys,
-      absl::Span<HloInstruction* const> values = {});
+      const Shape& shape, int64 dimension,
+      absl::Span<HloInstruction* const> operands, HloComputation* compare,
+      bool is_stable);
 
   // Creates a while instruction, given a condition computation, a body
   // computation, and the initial value for the input of the computations. For
@@ -688,6 +717,11 @@ class HloInstruction {
       HloInstruction* true_computation_arg, HloComputation* true_computation,
       HloInstruction* false_computation_arg, HloComputation* false_computation);
 
+  static std::unique_ptr<HloInstruction> CreateConditional(
+      const Shape& shape, HloInstruction* branch_index,
+      absl::Span<HloComputation* const> branch_computations,
+      absl::Span<HloInstruction* const> branch_computation_args);
+
   static std::unique_ptr<HloInstruction> CreateGather(
       const Shape& shape, HloInstruction* operand,
       HloInstruction* start_indices,
@@ -929,6 +963,10 @@ class HloInstruction {
   // operands of it which could be created due to this replacement.
   Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer);
 
+  // Same as ReplaceUseWith(), but new_producer can have a different shape.
+  Status ReplaceUseWithDifferentShape(HloInstruction* user,
+                                      HloInstruction* new_producer);
+
   // Replaces the specified operand with new_operand. The old and new operands
   // must have compatible shapes ignoring floating-point precision.
   //
@@ -1030,14 +1068,23 @@ class HloInstruction {
 
   HloInstruction* while_init() const;
 
-  // Gets/sets the true and false HloComputation for Conditional. The setters
-  // should only be called by HloModule or HloComputation methods.
+  // Gets/sets the true and false HloComputation for Conditional.
   //
-  // Precondition: The instruction is a Conditional instruction.
+  // Precondition: The instruction is a predicated Conditional instruction.
   HloComputation* true_computation() const;
   HloComputation* false_computation() const;
-  void set_true_computation(HloComputation* true_computation);
-  void set_false_computation(HloComputation* false_computation);
+
+  // Gets the branch HloComputations for Conditional.
+  //
+  // Precondition: The instruction is a Conditional instruction.
+  const std::vector<HloComputation*>& branch_computations() const;
+  int branch_count() const;
+  HloComputation* branch_computation(int b) const;
+  // Sets a branch HloComputation for Conditional.
+  // The setter should only be called by HloModule or HloComputation methods.
+  //
+  // Precondition: The instruction is a Conditional instruction.
+  void set_branch_computation(int b, HloComputation* computation);
 
   // Returns a string for the signature of this instruction if considered as a
   // function, e.g. the signature of an F32 add is (F32, F32) -> F32.
@@ -1179,10 +1226,8 @@ class HloInstruction {
 
   // Returns true if this instruction performs an elementwise operation on
   // `operand_idx`-th operand. An instruction is elementwise on an operand iff,
-  // after performing necessary implicit broadcast
-  // (cs/IrArray::EmitArrayElementAddress), to compute the output at index
-  // {i_0,i_1,...,i_n}, the only element required from the operand (if any) is
-  // the element at {i_0,i_1,...,i_n}.
+  // to compute the output at index {i_0,i_1,...,i_n}, the only element required
+  // from the operand (if any) is the element at {i_0,i_1,...,i_n}.
   //
   // Note on performance: when this instruction is kFusion, this method, in the
   // worst case, scans all fused instructions. We could speed this up by
@@ -1198,12 +1243,6 @@ class HloInstruction {
   // Returns true if this is a cross-replica all-reduce instruction.
   bool IsCrossReplicaAllReduce() const;
 
-  // Returns true if this elementwise instruction implicitly broadcasts operand
-  // `operand_idx`.
-  //
-  // Precondition: this instruction should be an elementwise operation.
-  bool ImplicitlyBroadcastsOperand(int64 operand_idx) const;
-
   // Returns true if this instruction is binary and elementwise.
   bool IsElementwiseBinary() const;
 
@@ -1239,6 +1278,10 @@ class HloInstruction {
   // on the instruction's existing name.
   void UniquifyName(NameUniquer* name_uniquer);
 
+  // Clear the unique ID of the instruction so that it can be re-assigned, such
+  // as for the purpose of compacting the instruction unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // Set the unique id for this instruction to "id"
   void SetUniqueId(int id) {
     CHECK_EQ(unique_id_, -1);  // Should not be assigned already
@@ -1272,6 +1315,9 @@ class HloInstruction {
     backend_config_ = std::move(config_str);
   }
 
+  bool is_default_config() const { return is_default_config_; }
+  void set_default_config() { is_default_config_ = true; }
+
   // Returns a string representation of a proto in the format used by
   // raw_backend_config_string.
   //
@@ -1442,6 +1488,15 @@ class HloInstruction {
   // Delegates to HloParameterInstruction::parameter_number.
   int64 parameter_number() const;
 
+  // Delegates to
+  // HloParameterInstruction::set_parameter_replicated_at_leaf_buffers.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers);
+
+  // Delegates to HloParameterInstruction::parameter_replicated_at_leaf_buffers.
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const;
+
   // Delegates to HloGetTupleElementInstruction::tuple_index.
   int64 tuple_index() const;
 
@@ -1551,6 +1606,15 @@ class HloInstruction {
   // Delegates to HloDomainInstruction::user_side_metadata().
   const DomainMetadata& user_side_metadata() const;
 
+  // Delegates to HloCompareInstruction::direction().
+  ComparisonDirection comparison_direction() const;
+
+  // Delegates to HloTriangularSolveInstruction::triangular_solve_options().
+  const TriangularSolveOptions& triangular_solve_options() const;
+
+  // Delegates to HloCholeskyInstruction::cholesky_options().
+  const CholeskyOptions& cholesky_options() const;
+
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1717,6 +1781,10 @@ class HloInstruction {
   // HLO. See the documentation on backend_config().
   string backend_config_;
 
+  // This field is assigned to true when backend_config_ is assigned to
+  // a default configuration.
+  bool is_default_config_ = false;
+
   // String identifier for instruction.
   string name_;
 
@@ -1730,6 +1798,10 @@ class HloInstruction {
   TF_DISALLOW_COPY_AND_ASSIGN(HloInstruction);
 };
 
+// Explicit instantiations in hlo_instruction.cc.
+extern template Status HloInstruction::Accept(DfsHloVisitor*, bool, bool);
+extern template Status HloInstruction::Accept(ConstDfsHloVisitor*, bool, bool);
+
 string ToString(HloInstruction::FusionKind kind);
 StatusOr<HloInstruction::FusionKind> StringToFusionKind(
     const string& kind_name);
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 35f031f29a7aca8db7ebe2fbcfdcebb7a778d703..85f2ddba8d303a74b8b72c97dd99952a5a57bcb0 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1655,7 +1655,7 @@ body (bparam: s32[]) -> s32[] {
 condition (cparam: s32[]) -> pred[] {
   xconstant = s32[] constant(5)
   cparam = s32[] parameter(0)
-  ROOT greater-than = pred[] greater-than(xconstant, cparam)
+  ROOT greater-than = pred[] compare(xconstant, cparam), direction=GT
 }
 
 ENTRY entry (param: s32[]) -> s32[] {
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 3a0d71dd88b6f16eb5b8492f87ac6a5136584424..41b4ba2138061de7e31037d1f9ae49ec9137fff4 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace xla {
 namespace {
@@ -201,6 +202,164 @@ std::unique_ptr<HloInstruction> HloFftInstruction::CloneWithNewOperandsImpl(
                                               fft_length_);
 }
 
+HloCompareInstruction::HloCompareInstruction(const Shape& shape,
+                                             HloInstruction* lhs,
+                                             HloInstruction* rhs,
+                                             ComparisonDirection direction)
+    : HloInstruction(HloOpcode::kCompare, shape), direction_(direction) {
+  AppendOperand(lhs);
+  AppendOperand(rhs);
+}
+
+HloInstructionProto HloCompareInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.set_comparison_direction(ComparisonDirectionToString(direction_));
+  return proto;
+}
+
+std::vector<string> HloCompareInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("direction=", ComparisonDirectionToString(direction()))};
+}
+
+bool HloCompareInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloCompareInstruction&>(other);
+  return direction() == casted_other.direction();
+}
+
+std::unique_ptr<HloInstruction> HloCompareInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 2);
+  return absl::make_unique<HloCompareInstruction>(shape, new_operands[0],
+                                                  new_operands[1], direction());
+}
+
+namespace {
+
+// Converts a protocol buffer message (e.g., TriangularSolveOptions) to a vector
+// of "key=value" attribute strings generically, using protocol buffer
+// reflection.
+//
+// Currently implements a small subset of cases; feel free to add more as
+// needed.
+std::vector<string> AttributeProtoToStringVector(
+    const tensorflow::protobuf::Message& message) {
+  const tensorflow::protobuf::Reflection* reflection = message.GetReflection();
+  std::vector<const tensorflow::protobuf::FieldDescriptor*> fields;
+  reflection->ListFields(message, &fields);
+
+  std::vector<string> output;
+  for (const tensorflow::protobuf::FieldDescriptor* field : fields) {
+    string s = absl::StrCat(field->name(), "=");
+    CHECK(!field->is_repeated()) << "Repeated fields aren't implemented";
+    switch (field->type()) {
+      case tensorflow::protobuf::FieldDescriptor::TYPE_BOOL: {
+        bool val = reflection->GetBool(message, field);
+        absl::StrAppend(&s, val ? "true" : "false");
+        break;
+      }
+      case tensorflow::protobuf::FieldDescriptor::TYPE_ENUM: {
+        const tensorflow::protobuf::EnumValueDescriptor* evd =
+            reflection->GetEnum(message, field);
+        absl::StrAppend(&s, evd->name());
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unimplemented field type: " << field->DebugString();
+    }
+    output.push_back(std::move(s));
+  }
+  return output;
+}
+
+}  // namespace
+
+HloTriangularSolveInstruction::HloTriangularSolveInstruction(
+    const Shape& shape, HloInstruction* a, HloInstruction* b,
+    const TriangularSolveOptions& options)
+    : HloInstruction(HloOpcode::kTriangularSolve, shape),
+      triangular_solve_options_(options) {
+  AppendOperand(a);
+  AppendOperand(b);
+}
+
+HloInstructionProto HloTriangularSolveInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_triangular_solve_options() = triangular_solve_options_;
+  return proto;
+}
+
+std::vector<string> HloTriangularSolveInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return AttributeProtoToStringVector(triangular_solve_options_);
+}
+
+bool HloTriangularSolveInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other =
+      static_cast<const HloTriangularSolveInstruction&>(other);
+  const auto& options = triangular_solve_options();
+  const auto& other_options = casted_other.triangular_solve_options();
+
+  return options.left_side() == other_options.left_side() &&
+         options.lower() == other_options.lower() &&
+         options.unit_diagonal() == other_options.unit_diagonal() &&
+         options.transpose_a() == other_options.transpose_a();
+}
+
+std::unique_ptr<HloInstruction>
+HloTriangularSolveInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 2);
+  return absl::make_unique<HloTriangularSolveInstruction>(
+      shape, new_operands[0], new_operands[1], triangular_solve_options());
+}
+
+HloCholeskyInstruction::HloCholeskyInstruction(const Shape& shape,
+                                               HloInstruction* a,
+                                               const CholeskyOptions& options)
+    : HloInstruction(HloOpcode::kCholesky, shape), cholesky_options_(options) {
+  AppendOperand(a);
+}
+
+HloInstructionProto HloCholeskyInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_cholesky_options() = cholesky_options_;
+  return proto;
+}
+
+std::vector<string> HloCholeskyInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return AttributeProtoToStringVector(cholesky_options_);
+}
+
+bool HloCholeskyInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloCholeskyInstruction&>(other);
+  const auto& options = cholesky_options();
+  const auto& other_options = casted_other.cholesky_options();
+
+  return options.lower() == other_options.lower();
+}
+
+std::unique_ptr<HloInstruction>
+HloCholeskyInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return absl::make_unique<HloCholeskyInstruction>(shape, new_operands[0],
+                                                   cholesky_options());
+}
+
 HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
                                                const Shape& shape,
                                                int64 channel_id,
@@ -383,6 +542,15 @@ HloInstructionProto HloAllReduceInstruction::ToProto() const {
   return proto;
 }
 
+bool HloAllReduceInstruction::IsNoop() const {
+  for (auto replica_group : replica_groups()) {
+    if (replica_group.replica_ids().size() != 1) {
+      return false;
+    }
+  }
+  return !all_reduce_id();
+}
+
 std::vector<string> HloAllReduceInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   std::vector<string> result =
@@ -600,14 +768,17 @@ std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
                                                  dimensions(), to_apply());
 }
 
-HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension,
-                                       HloInstruction* keys,
-                                       absl::Span<HloInstruction* const> values)
-    : HloInstruction(HloOpcode::kSort, shape), dimensions_({dimension}) {
-  AppendOperand(keys);
-  for (auto* value : values) {
+HloSortInstruction::HloSortInstruction(
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable)
+    : HloInstruction(HloOpcode::kSort, shape),
+      dimensions_({dimension}),
+      is_stable_(is_stable) {
+  for (auto* value : operands) {
     AppendOperand(value);
   }
+  AppendComputation(compare);
 }
 
 HloInstructionProto HloSortInstruction::ToProto() const {
@@ -615,12 +786,18 @@ HloInstructionProto HloSortInstruction::ToProto() const {
   for (int64 dimension : dimensions_) {
     proto.add_dimensions(dimension);
   }
+  proto.set_is_stable(is_stable());
   return proto;
 }
 
 std::vector<string> HloSortInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
+  std::vector<string> attrs;
+  attrs.push_back(StrCat("dimensions={", StrJoin(dimensions(), ","), "}"));
+  if (is_stable()) {
+    attrs.push_back("is_stable=true");
+  }
+  return attrs;
 }
 
 bool HloSortInstruction::IdenticalSlowPath(
@@ -628,15 +805,20 @@ bool HloSortInstruction::IdenticalSlowPath(
     const std::function<bool(const HloComputation*, const HloComputation*)>&
         eq_computations) const {
   const auto& casted_other = static_cast<const HloSortInstruction&>(other);
-  return dimensions() == casted_other.dimensions();
+  if (dimensions() != casted_other.dimensions()) {
+    return false;
+  }
+  if (is_stable() != casted_other.is_stable()) {
+    return false;
+  }
+  return eq_computations(to_apply(), other.to_apply());
 }
 
 std::unique_ptr<HloInstruction> HloSortInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  HloInstruction* keys = new_operands[0];
-  return absl::make_unique<HloSortInstruction>(shape, dimensions(0), keys,
-                                               new_operands.subspan(1));
+  return absl::make_unique<HloSortInstruction>(
+      shape, dimensions(0), new_operands, to_apply(), is_stable());
 }
 
 HloTransposeInstruction::HloTransposeInstruction(
@@ -1464,9 +1646,30 @@ HloParameterInstruction::HloParameterInstruction(int64 parameter_number,
 HloInstructionProto HloParameterInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_parameter_number(parameter_number_);
+  if (parameter_replicated_at_leaf_buffers_) {
+    for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+      proto.mutable_parameter_replication()->add_replicated_at_leaf_buffers(
+          replicated);
+    }
+  }
   return proto;
 }
 
+std::vector<string> HloParameterInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  std::vector<string> result;
+  if (!parameter_replicated_at_leaf_buffers_) {
+    return result;
+  }
+  std::vector<string> buffers_replicated_strs;
+  for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+    buffers_replicated_strs.push_back(replicated ? "true" : "false");
+  }
+  result.push_back(StrCat("parameter_replication={",
+                          StrJoin(buffers_replicated_strs, ","), "}"));
+  return result;
+}
+
 string HloParameterInstruction::OperandsToStringWithCanonicalNameMap(
     const HloPrintOptions& options,
     CanonicalNameMap* canonical_name_map) const {
@@ -1686,6 +1889,7 @@ HloInstructionProto HloConvolutionInstruction::ToProto() const {
   *proto.mutable_convolution_dimension_numbers() =
       convolution_dimension_numbers_;
   proto.set_feature_group_count(feature_group_count_);
+  proto.set_batch_group_count(batch_group_count_);
   *proto.mutable_precision_config() = precision_config_;
   return proto;
 }
@@ -1723,6 +1927,9 @@ bool HloConvolutionInstruction::IdenticalSlowPath(
   if (feature_group_count_ != other.feature_group_count()) {
     return false;
   }
+  if (batch_group_count_ != other.batch_group_count()) {
+    return false;
+  }
   return protobuf_util::ProtobufEquals(window(), casted_other.window()) &&
          protobuf_util::ProtobufEquals(
              convolution_dimension_numbers(),
@@ -1841,6 +2048,7 @@ HloCustomCallInstruction::HloCustomCallInstruction(
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
       feature_group_count_(1),
+      batch_group_count_(1),
       layout_constrained_(false) {
   for (auto operand : operands) {
     AppendOperand(operand);
@@ -1855,6 +2063,7 @@ HloCustomCallInstruction::HloCustomCallInstruction(
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
       feature_group_count_(1),
+      batch_group_count_(1),
       layout_constrained_(true),
       operand_shapes_with_layout_(operand_shapes_with_layout.begin(),
                                   operand_shapes_with_layout.end()) {
@@ -1875,6 +2084,7 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
   proto.set_custom_call_target(custom_call_target_);
   proto.set_custom_call_opaque(opaque_);
   proto.set_feature_group_count(feature_group_count_);
+  proto.set_batch_group_count(batch_group_count_);
   if (layout_constrained()) {
     proto.set_constrain_layout(true);
     for (const Shape& shape : operand_shapes_with_layout_) {
@@ -1898,6 +2108,9 @@ std::vector<string> HloCustomCallInstruction::ExtraAttributesToStringImpl(
   if (feature_group_count_ != 1) {
     extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   }
+  if (batch_group_count_ != 1) {
+    extra.push_back(StrCat("batch_group_count=", batch_group_count_));
+  }
   // By contract, we print the custom call target even if
   // options.print_subcomputation_mode() == kOff, because the call target is not
   // an HloComputation.
@@ -1941,6 +2154,20 @@ bool HloCustomCallInstruction::IdenticalSlowPath(
   if (feature_group_count_ != casted_other.feature_group_count_) {
     return false;
   }
+  if (batch_group_count_ != casted_other.batch_group_count_) {
+    return false;
+  }
+  if (layout_constrained() != casted_other.layout_constrained()) {
+    return false;
+  }
+  if (layout_constrained()) {
+    for (int64 i = 0; i < operand_shapes_with_layout_.size(); ++i) {
+      if (!ShapeUtil::Equal(operand_shapes_with_layout_[i],
+                            casted_other.operand_shapes_with_layout_[i])) {
+        return false;
+      }
+    }
+  }
   return custom_call_target_ == casted_other.custom_call_target_ &&
          opaque_ == casted_other.opaque_;
 }
@@ -1951,6 +2178,10 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   auto cloned = absl::make_unique<HloCustomCallInstruction>(
       shape, new_operands, custom_call_target(), opaque());
+  if (layout_constrained()) {
+    cloned->layout_constrained_ = true;
+    cloned->operand_shapes_with_layout_ = operand_shapes_with_layout();
+  }
   if (window_ != nullptr) {
     cloned->set_window(*window_);
   }
@@ -1958,6 +2189,7 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     cloned->set_convolution_dimension_numbers(*convolution_dimension_numbers_);
   }
   cloned->set_feature_group_count(feature_group_count_);
+  cloned->set_batch_group_count(batch_group_count_);
   return std::move(cloned);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index e6111cfb57581589070b8e34556bdfe8239b4fd3..0bc0db41c0a1751589415cb03289794ba111c5e8 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -131,6 +131,81 @@ class HloFftInstruction : public HloInstruction {
   std::vector<int64> fft_length_;
 };
 
+class HloCompareInstruction : public HloInstruction {
+ public:
+  explicit HloCompareInstruction(const Shape& shape, HloInstruction* lhs,
+                                 HloInstruction* rhs,
+                                 ComparisonDirection direction);
+  ComparisonDirection direction() const { return direction_; }
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  ComparisonDirection direction_;
+};
+
+class HloTriangularSolveInstruction : public HloInstruction {
+ public:
+  explicit HloTriangularSolveInstruction(const Shape& shape, HloInstruction* a,
+                                         HloInstruction* b,
+                                         const TriangularSolveOptions& options);
+  const TriangularSolveOptions& triangular_solve_options() const {
+    return triangular_solve_options_;
+  }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  TriangularSolveOptions triangular_solve_options_;
+};
+
+class HloCholeskyInstruction : public HloInstruction {
+ public:
+  explicit HloCholeskyInstruction(const Shape& shape, HloInstruction* a,
+                                  const CholeskyOptions& options);
+  const CholeskyOptions& cholesky_options() const { return cholesky_options_; }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  CholeskyOptions cholesky_options_;
+};
+
 class HloSendRecvInstruction : public HloInstruction {
  public:
   // Returns the channel id associated with the instruction. The id is
@@ -253,6 +328,10 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns true if the AllReduce does no communication, so it's equivalent
+  // to a mem copy.
+  bool IsNoop() const;
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -414,8 +493,8 @@ class HloReduceInstruction : public HloInstruction {
 class HloSortInstruction : public HloInstruction {
  public:
   explicit HloSortInstruction(const Shape& shape, int64 dimension,
-                              HloInstruction* keys,
-                              absl::Span<HloInstruction* const> values = {});
+                              absl::Span<HloInstruction* const> operands,
+                              HloComputation* compare, bool is_stable);
   // Returns the dimension sizes or numbers associated with this instruction.
   const std::vector<int64>& dimensions() const override { return dimensions_; }
   int64 dimensions(int64 index) const override { return dimensions()[index]; }
@@ -428,6 +507,7 @@ class HloSortInstruction : public HloInstruction {
   HloInstruction* mutable_keys() { return mutable_operand(0); }
   // Returns the number of value operands.
   int64 values_count() const { return operand_count() - 1; }
+  bool is_stable() const { return is_stable_; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -442,6 +522,7 @@ class HloSortInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   std::vector<int64> dimensions_;
+  bool is_stable_;
 };
 
 class HloTransposeInstruction : public HloInstruction {
@@ -783,10 +864,28 @@ class HloParameterInstruction : public HloInstruction {
   explicit HloParameterInstruction(int64 parameter_number, const Shape& shape,
                                    const string& name);
   int64 parameter_number() const { return parameter_number_; }
+
+  // Sets and gets the whether all replicas will receive the same parameter data
+  // for each leaf buffer in data parallelism.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+    CHECK_EQ(ShapeUtil::GetLeafCount(shape()),
+             parameter_replicated_at_leaf_buffers.size());
+    parameter_replicated_at_leaf_buffers_.emplace(
+        parameter_replicated_at_leaf_buffers.begin(),
+        parameter_replicated_at_leaf_buffers.end());
+  }
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const {
+    return parameter_replicated_at_leaf_buffers_;
+  }
+
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
  private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
@@ -800,6 +899,10 @@ class HloParameterInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   int64 parameter_number_ = 0;
+
+  // Specifies whether each buffer has the same parameter value on all replicas
+  // in data parallelism.
+  absl::optional<std::vector<bool>> parameter_replicated_at_leaf_buffers_;
 };
 
 class HloGetTupleElementInstruction : public HloInstruction {
@@ -899,9 +1002,7 @@ class HloOutfeedInstruction : public HloInstruction {
                                  HloInstruction* token_operand,
                                  absl::string_view outfeed_config);
   // Returns the shape for the Outfeed instruction.
-  const Shape& outfeed_shape() const {
-    return outfeed_shape_;
-  }
+  const Shape& outfeed_shape() const { return outfeed_shape_; }
   // Returns the config for the Outfeed instruction.
   const string& outfeed_config() const { return outfeed_config_; }
   // Returns a serialized representation of this instruction.
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index 5e81515134256a3ec4b790b38af3f42f68a79b56..2255383322873a39c7076e0f4f0dd541bc79014d 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/escaping.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
@@ -37,8 +38,8 @@ constexpr int kError = -2;
 
 // [a-zA-Z0-9_.-]
 bool IsIdentifierChar(char c) {
-  return isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '.' ||
-         c == '_';
+  return absl::ascii_isalnum(static_cast<unsigned char>(c)) || c == '-' ||
+         c == '.' || c == '_';
 }
 
 }  // namespace
@@ -105,7 +106,7 @@ TokKind HloLexer::LexToken() {
     switch (current_char) {
       default:
         // [a-zA-Z_]
-        if (isalpha(static_cast<unsigned char>(current_char)) ||
+        if (absl::ascii_isalpha(static_cast<unsigned char>(current_char)) ||
             current_char == '_') {
           return LexIdentifier();
         }
@@ -152,6 +153,8 @@ TokKind HloLexer::LexToken() {
         return LexPercent();
       case ':':
         return TokKind::kColon;
+      case '*':
+        return TokKind::kAsterisk;
       case '[':
         return TokKind::kLsquare;
       case ']':
@@ -211,6 +214,15 @@ TokKind HloLexer::LexToken() {
         // A lone '/' is an error.
         return TokKind::kError;
       }
+      case '.':
+        if (PeekCurrentChar() == '.') {
+          current_ptr_++;
+          if (PeekCurrentChar() == '.') {
+            current_ptr_++;
+            return TokKind::kDots;
+          }
+        }
+        return TokKind::kError;
       case '"':
         return LexString();
     }
@@ -300,7 +312,7 @@ TokKind HloLexer::LexIdentifier() {
 // name ::= [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexPercent() {
   const char* name_start = current_ptr_;
-  if (isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
+  if (absl::ascii_isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
       PeekCurrentChar() == '_') {
     current_ptr_++;
     while (IsIdentifierChar(PeekCurrentChar())) {
@@ -454,6 +466,8 @@ string TokKindToString(TokKind kind) {
       return "kComma";
     case TokKind::kColon:
       return "kColon";
+    case TokKind::kAsterisk:
+      return "kAsterisk";
     case TokKind::kLsquare:
       return "kLsquare";
     case TokKind::kRsquare:
@@ -512,6 +526,8 @@ string TokKindToString(TokKind kind) {
       return "kInt";
     case TokKind::kDecimal:
       return "kDecimal";
+    case TokKind::kDots:
+      return "kDots";
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 94fac3cd8e9da7f273e7e521e21510f5188702e6..383fb4e862b8e32771879d055e663dc821a5c839 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -38,15 +38,17 @@ enum class TokKind {
   kError,
 
   // Tokens with no info.
-  kEqual,  // =
-  kComma,  // ,
-  kColon,  // :
+  kEqual,     // =
+  kComma,     // ,
+  kColon,     // :
+  kAsterisk,  // *
   kLsquare,
   kRsquare,  // [  ]
   kLbrace,
   kRbrace,  // {  }
   kLparen,
   kRparen,  // (  )
+  kDots,    // ...
 
   kArrow,  // ->
   kLeq,    // <=
@@ -107,7 +109,7 @@ class HloLexer {
         LOG(FATAL) << "This token does not have string value";
     }
   }
-  tensorflow::int64 GetInt64Val() const {
+  int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return token_state_.int64_val;
   }
@@ -170,7 +172,7 @@ class HloLexer {
     const char* token_start = nullptr;
     TokKind current_kind;
     string str_val;
-    tensorflow::int64 int64_val;
+    int64 int64_val;
     double decimal_val;
     PrimitiveType primitive_type_val;
   };
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
index 436cccb1fb9ecf6f4efad772c700c611b28ce628..45d3e9c460e51ac0c1fe613b3765583113d6acc1 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
@@ -255,7 +255,7 @@ TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) {
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -308,7 +308,7 @@ TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) {
     get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=1
     add.1 = s32[] add(get-tuple-element.3, get-tuple-element.4)
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(add.1, constant.2)
+    ROOT less-than = pred[] compare(add.1, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -360,7 +360,7 @@ TEST_F(HloLivenessAnalysisTest, WhileWithLiveTupleElements) {
     loop_var.2 = (s32[], s32[], s32[]) parameter(0)
     get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=0
     constant.1 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.4, constant.1)
+    ROOT less-than = pred[] compare(get-tuple-element.4, constant.1), direction=LT
   }
   ENTRY SimpleLoop {
     constant.2 = s32[] constant(0)
@@ -415,7 +415,7 @@ TEST_F(HloLivenessAnalysisTest, WhileWithOutfeed) {
     cond_param = (s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(cond_param), index=0
     constant.2 = s32[] constant(10)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -448,13 +448,13 @@ TEST_F(HloLivenessAnalysisTest, NestedWhileWithOutfeed) {
     cond_param = (s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(cond_param), index=0
     constant.2 = s32[] constant(10)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   OuterWhileCondition {
     cond_param.2 = (s32[]) parameter(0)
     get-tuple-element.5 = s32[] get-tuple-element(cond_param.2), index=0
     constant.5 = s32[] constant(5)
-    ROOT less-than.2 = pred[] less-than(get-tuple-element.5, constant.5)
+    ROOT less-than.2 = pred[] compare(get-tuple-element.5, constant.5), direction=LT
   }
   OuterWhileBody {
     body_param.2 = (s32[]) parameter(0)
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index d28e79d41ad5d58a8881cfb80d488684af26564f..47ed85be1967f2a8f8f397021a85b806449ca98a 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -89,6 +89,22 @@ bool HloParameterMatcher::MatchAndExplain(
   return true;
 }
 
+bool HloComparisonMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  if (instruction->comparison_direction() != direction_) {
+    *listener << "has wrong comparison direction (got "
+              << ComparisonDirectionToString(
+                     instruction->comparison_direction())
+              << ", want " << ComparisonDirectionToString(direction_) << ")";
+    return false;
+  }
+  return true;
+}
+
 bool HloGetTupleElementMatcher::MatchAndExplain(
     const HloInstruction* instruction,
     ::testing::MatchResultListener* listener) const {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 67488a6a9a0c9cba7f576f9036c3a0cbe1900fff..756f4d2c6bc65b04edea242ba15e3ee492d8cdb7 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -54,6 +54,21 @@ class HloParameterMatcher : public HloMatcher {
   int64 parameter_number_;
 };
 
+// Custom matcher for comparisons, which accepts a comparison direction.
+class HloComparisonMatcher : public HloMatcher {
+ public:
+  explicit HloComparisonMatcher(
+      ComparisonDirection direction,
+      std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : HloMatcher(HloOpcode::kCompare, operands), direction_(direction) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+
+ private:
+  ComparisonDirection direction_;
+};
+
 // Custom matcher for get-tuple-element instructions, which accepts a tuple
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
@@ -172,6 +187,7 @@ HLO_MATCHER(BatchNormGrad);
 HLO_MATCHER(Call);
 HLO_MATCHER(Ceil);
 HLO_MATCHER(Clamp);
+HLO_MATCHER(Compare);
 HLO_MATCHER(Concatenate);
 HLO_MATCHER(Conditional);
 HLO_MATCHER(Constant);
@@ -184,28 +200,22 @@ HLO_MATCHER(Divide);
 HLO_MATCHER(Domain);
 HLO_MATCHER(DynamicSlice);
 HLO_MATCHER(DynamicUpdateSlice);
-HLO_MATCHER(Eq);
 HLO_MATCHER(Exp);
 HLO_MATCHER(Floor);
 HLO_MATCHER(Fusion);
-HLO_MATCHER(Ge);
 HLO_MATCHER(AfterAll);
-HLO_MATCHER(Gt);
 HLO_MATCHER(Iota);
 HLO_MATCHER(Infeed);
 HLO_MATCHER(IsFinite);
-HLO_MATCHER(Le);
 HLO_MATCHER(Log);
 HLO_MATCHER(And);
 HLO_MATCHER(Not);
 HLO_MATCHER(Or);
 HLO_MATCHER(Xor);
-HLO_MATCHER(Lt);
 HLO_MATCHER(Map);
 HLO_MATCHER(Maximum);
 HLO_MATCHER(Minimum);
 HLO_MATCHER(Multiply);
-HLO_MATCHER(Ne);
 HLO_MATCHER(Negate);
 HLO_MATCHER(Outfeed);
 HLO_MATCHER(Pad);
@@ -256,6 +266,38 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Parameter() {
       new ::xla::testing::HloMatcher(HloOpcode::kParameter, {}));
 }
 
+// Comparison matchers below do not require any additional arguments.
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Eq(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kEq, {operands...}));
+}
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Ne(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kNe, {operands...}));
+}
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Ge(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kGe, {operands...}));
+}
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Gt(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kGt, {operands...}));
+}
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Le(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kLe, {operands...}));
+}
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> Lt(M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloComparisonMatcher(
+      ComparisonDirection::kLt, {operands...}));
+}
+
 // GetTupleElement(operand, N) matches a GTE instruction which gets the N'th
 // tuple element of operand, while GetTupleElement(operand) matches any GTE
 // operation on operand, and GetTupleElement() matches any GTE operation at all.
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 7961aece541faeb66875885b380158756c503250..549fc603c705d1403da6e96f2ac6703079810623 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -220,5 +220,33 @@ ENTRY DotOperationFusion_TransposeFusion {
       "rhs_contracting_dimensions (got {0} want {1})");
 }
 
+TEST(HloMatchersTest, ComparisonMatcher) {
+  auto shape = ShapeUtil::MakeShape(F32, {1});
+  auto p0 = HloInstruction::CreateParameter(0, shape, "param.0");
+  auto p1 = HloInstruction::CreateParameter(1, shape, "param.1");
+  auto eq = HloInstruction::CreateCompare(shape, p0.get(), p1.get(),
+                                          ComparisonDirection::kEq);
+  auto ne = HloInstruction::CreateCompare(shape, p0.get(), p1.get(),
+                                          ComparisonDirection::kNe);
+  auto add =
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0.get(), p1.get());
+  auto le = HloInstruction::CreateCompare(shape, p0.get(), add.get(),
+                                          ComparisonDirection::kLe);
+
+  EXPECT_THAT(eq.get(), op::Compare());
+  EXPECT_THAT(eq.get(), op::Eq());
+  EXPECT_THAT(ne.get(), op::Compare());
+  EXPECT_THAT(ne.get(), op::Ne());
+  EXPECT_THAT(le.get(),
+              op::Compare(op::Parameter(0),
+                          op::Add(op::Parameter(0), op::Parameter(1))));
+  EXPECT_THAT(le.get(), op::Le(op::Parameter(0),
+                               op::Add(op::Parameter(0), op::Parameter(1))));
+
+  EXPECT_THAT(Explain(eq.get(), op::Add()), Eq(""));
+  EXPECT_THAT(Explain(eq.get(), op::Ne()),
+              Eq("has wrong comparison direction (got EQ, want NE)"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index d2740bcce26f04c5d7c8b64cfdaea53e3c697855..ca1046856d12b4ae870bc7e267dca34329ada665 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -23,7 +23,9 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -205,6 +207,18 @@ class ListScheduler {
   // than not taking subcomputations into account at all. In the future, we may
   // improve accounting for subcomputation memory (b/65409243).
   int64 BytesFreedIfScheduled(const ReadyListEntry& entry) {
+    auto instruction = entry.instruction;
+    auto opcode = instruction->opcode();
+    // To keep the device busy between a host send and send-done, we schedule
+    // the send done as late as possible. Same for host recv-done. This is a
+    // hack because packing of computation between channel instructions
+    // normally happens in the module group scheduler, and the memory scheduler
+    // only tries to minimize memory.
+    if ((opcode == HloOpcode::kSendDone || opcode == HloOpcode::kRecvDone) &&
+        DynCast<HloSendRecvInstruction>(instruction)->is_host_transfer()) {
+      return INT_MIN;
+    }
+
     int64 freed_bytes = 0;
     for (const auto& kv : entry.used_buffer_unscheduled_use_counts) {
       auto buffer = kv->first;
@@ -216,7 +230,7 @@ class ListScheduler {
     // We only count the memory usage of the largest subcomputation, instead of
     // adding them all, because subcomputations won't execute in parallel.
     int64 max_subcomputation_bytes = 0;
-    for (const auto* c : entry.instruction->called_computations()) {
+    for (const auto* c : instruction->called_computations()) {
       auto it = memory_by_computation_.find(c);
       if (it != memory_by_computation_.end()) {
         int64 subcomputation_bytes = it->second;
@@ -227,9 +241,8 @@ class ListScheduler {
     }
     int64 bytes_defined;
     if (max_subcomputation_bytes > 0 &&
-        (entry.instruction->opcode() == HloOpcode::kWhile ||
-         entry.instruction->opcode() == HloOpcode::kCall ||
-         entry.instruction->opcode() == HloOpcode::kConditional)) {
+        (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall ||
+         opcode == HloOpcode::kConditional)) {
       // The output buffer of while/call/conditional is always aliased with the
       // output buffer of the root instruction in the body. Don't double count.
       bytes_defined = max_subcomputation_bytes;
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index 7227bfb27c74758d2b79e404afc9eb97a1ca894d..76cc29cbb7848eb424d07abf11a95ffd59e9eed6 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -118,7 +118,7 @@ class HloTrivialScheduler : public HloModulePass {
 };
 
 // A trivial pass which clears the schedule currently set on the
-// HloModule. After this pass runs HloModudle::has_schedule will return false.
+// HloModule. After this pass runs HloModule::has_schedule will return false.
 class HloDescheduler : public HloModulePass {
  public:
   HloDescheduler() = default;
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index bc0d7e2bc00eab014f2660c95a51b966642eaee9..200d08c562e0e334cf49e0d3c57caf1d67399387 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -254,8 +254,9 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   HloInstruction* zero_vector =
       cond_builder.AddInstruction(HloInstruction::CreateConstant(
           LiteralUtil::CreateR1<float>({0, 0, 0, 0})));
-  cond_builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), cond_param,
+                                    zero_vector, ComparisonDirection::kNe));
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
 
   // param - 1
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 258f918f47a313b4b89fb260457b1b119dc16177..edcda8f9a7b974b95a12348577c335a3e6d8fcce 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -158,17 +158,12 @@ void HloModule::ReplaceComputations(
           break;
         }
         case HloOpcode::kConditional: {
-          HloComputation* new_true_computation =
-              tensorflow::gtl::FindWithDefault(
-                  replacements, instruction->true_computation(), nullptr);
-          if (new_true_computation != nullptr) {
-            instruction->set_true_computation(new_true_computation);
-          }
-          HloComputation* new_false_computation =
-              tensorflow::gtl::FindWithDefault(
-                  replacements, instruction->false_computation(), nullptr);
-          if (new_false_computation != nullptr) {
-            instruction->set_false_computation(new_false_computation);
+          for (int b = 0; b < instruction->branch_count(); ++b) {
+            HloComputation* new_computation = tensorflow::gtl::FindWithDefault(
+                replacements, instruction->branch_computation(b), nullptr);
+            if (new_computation != nullptr) {
+              instruction->set_branch_computation(b, new_computation);
+            }
           }
           break;
         }
@@ -246,11 +241,39 @@ HloModuleProto HloModule::ToProto() const {
   return proto;
 }
 
+Status HloModule::CheckUniqueNamesAndIdsForComputationsAndInstructions() const {
+  absl::flat_hash_set<string> computation_names;
+  absl::flat_hash_set<int> computation_ids;
+  absl::flat_hash_set<string> instruction_names;
+  absl::flat_hash_set<int> instruction_ids;
+
+  for (const HloComputation* computation : computations()) {
+    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
+        << "Computation name is not unique: " << computation->name();
+    computation_names.insert(computation->name());
+
+    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
+        << "Computation id is not unique: " << computation->unique_id();
+    computation_ids.insert(computation->unique_id());
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
+          << "Instruction name is not unique: " << instruction->name();
+      instruction_names.insert(instruction->name());
+
+      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
+          << "Instruction id is not unique: " << instruction->unique_id();
+      instruction_ids.insert(instruction->unique_id());
+    }
+  }
+  return Status::OK();
+}
+
 /* static */
 StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config) {
   VLOG(2) << "CreateFromProto()";
-  XLA_VLOG_LINES(2, proto.DebugString());
+  XLA_VLOG_LINES(3, proto.DebugString());
 
   // The ProgramShape in the passed in module config must match the shapes of
   // the entry parameters and root.
@@ -329,28 +352,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
                       DynamicParameterBinding::CreateFromProto(
                           proto.dynamic_parameter_binding()));
 
-  absl::flat_hash_set<string> computation_names;
-  absl::flat_hash_set<string> instruction_names;
-  absl::flat_hash_set<int> computation_ids;
-  absl::flat_hash_set<int> instruction_ids;
-  for (HloComputation* computation : module->computations()) {
-    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
-        << "Computation name is not unique: " << computation->name();
-    computation_names.insert(computation->name());
-
-    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
-        << "Computation id is not unique: " << computation->unique_id();
-    computation_ids.insert(computation->unique_id());
-    for (HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
-          << "Instruction name is not unique: " << instruction->name();
-      instruction_names.insert(instruction->name());
-
-      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
-          << "Instruction id is not unique: " << instruction->unique_id();
-      instruction_ids.insert(instruction->unique_id());
-    }
-  }
+  TF_RETURN_IF_ERROR(
+      module->CheckUniqueNamesAndIdsForComputationsAndInstructions());
 
   if (proto.has_schedule()) {
     TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index f1310e4b270898a21dbb4f86123edde4ba8993d0..2c63247eea8292f52e95b6171100221336450c13 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -167,6 +167,12 @@ class HloModule {
   // Gets the number of computations in this module.
   int64 computation_count() const { return computations_.size(); }
 
+  // Returns the mutable computation for the given index.
+  HloComputation* mutable_computation(int64 idx) {
+    CHECK(idx >= 0 && idx < computations_.size());
+    return computations_[idx].get();
+  }
+
   // Gets the number of instructions in this module.
   int64 instruction_count() const;
 
@@ -187,6 +193,7 @@ class HloModule {
   std::vector<HloComputation*> MakeNonfusionComputations() const;
 
   const HloModuleConfig& config() const { return config_; }
+  void set_config(HloModuleConfig& config) { config_ = config; }
 
   // Return a string representation of the module.
   //
@@ -264,6 +271,18 @@ class HloModule {
   const HloSchedule& schedule() const { return *schedule_; }
   HloSchedule& schedule() { return *schedule_; }
 
+  HloComputation* AddComputationAndUnifyNamesAndIds(
+      std::unique_ptr<HloComputation> computation, bool is_entry) {
+    computation->ClearUniqueIdInternal();
+    for (auto* instruction : computation->instructions()) {
+      instruction->ClearUniqueIdInternal();
+    }
+    return AddComputationInternal(std::move(computation), is_entry,
+                                  /*uniquify_identifiers=*/true);
+  }
+
+  Status CheckUniqueNamesAndIdsForComputationsAndInstructions() const;
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index 68c18836eb01484b819e7b7bd26f099dcf56e7ba..cee46fe10a2376555d82d2fcbce756aefaf4b982 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -101,6 +102,20 @@ class HloModuleConfig {
     return intra_op_parallelism_threads_;
   }
 
+  // Checks if this config has a static device assignment.
+  bool has_static_device_assignment() const {
+    return static_device_assignment_.has_value();
+  }
+
+  // Getter and setter of the compile-time known device assignment.
+  const DeviceAssignment& static_device_assignment() const {
+    CHECK(static_device_assignment_.has_value());
+    return *static_device_assignment_;
+  }
+  void set_static_device_assignment(const DeviceAssignment& device_assignment) {
+    static_device_assignment_ = device_assignment;
+  }
+
  private:
   // If you add new members, be sure to update compilation_cache_key.
 
@@ -117,6 +132,9 @@ class HloModuleConfig {
   int64 intra_op_parallelism_threads_ = -1;
 
   DebugOptions debug_options_;
+
+  // Compile-time known device assignment.
+  absl::optional<DeviceAssignment> static_device_assignment_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index f6e2866204955ac024c2b6f972de449cc3df4c15..84988a9ecb31f3e5058a2c7aa3a44954bd9c9ac9 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -86,7 +86,7 @@ TEST_F(HloModuleDceTest, WhileWithLiveOutputs) {
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -125,7 +125,7 @@ TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) {
     loop_var.2 = (s32[], f32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.3 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.3)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.3), direction=LT
   }
   ENTRY SimpleLoop {
     constant.4 = s32[] constant(0)
@@ -163,7 +163,7 @@ TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) {
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -206,7 +206,7 @@ TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) {
     loop_var.2 = (s32[], s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -248,7 +248,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) {
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   SimpleLoop.body1 {
     loop_var.3 = (s32[], s32[3]{0}) parameter(0)
@@ -263,7 +263,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) {
     loop_var.4 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0
     constant.4 = s32[] constant(5)
-    ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4)
+    ROOT less-than.1 = pred[] compare(get-tuple-element.6, constant.4), direction=LT
   }
   ENTRY SimpleLoop {
     constant.5 = s32[] constant(0)
@@ -316,7 +316,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) {
     loop_var.2 = (s32[3]{0}, s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1
     constant.2 = s32[] constant(5)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   SimpleLoop.body1 {
     loop_var.3 = (s32[], s32[3]{0}) parameter(0)
@@ -331,7 +331,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) {
     loop_var.4 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0
     constant.4 = s32[] constant(5)
-    ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4)
+    ROOT less-than.1 = pred[] compare(get-tuple-element.6, constant.4), direction=LT
   }
   ENTRY SimpleLoop {
     constant.5 = s32[] constant(0)
@@ -383,7 +383,7 @@ TEST_F(HloModuleDceTest, WhileWithOutfeed) {
     cond_param = (s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(cond_param), index=0
     constant.2 = s32[] constant(10)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(0)
@@ -418,7 +418,7 @@ TEST_F(HloModuleDceTest, WhileWithOnlyLoopVariableBumping) {
     cond_param = (s32[], s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(cond_param), index=0
     constant.2 = s32[] constant(10)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     p0 = (s32[]) parameter(0)
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 47734bc55cc00d605f4e318400be88639450343c..bc258a77000d17cdb6b1d1005b6dac70e300e398 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -45,11 +45,8 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
     case ComputationKind::kWhileBody:
       repr += ":WHILE_BODY";
       break;
-    case ComputationKind::kConditionalTrue:
-      repr += ":CONDITIONAL_TRUE";
-      break;
-    case ComputationKind::kConditionalFalse:
-      repr += ":CONDITIONAL_FALSE";
+    case ComputationKind::kConditionalBranch:
+      repr += absl::StrCat(":CONDITIONAL_BRANCH_", index_);
       break;
     case ComputationKind::kCallFunction:
       repr += ":CALL";
@@ -307,10 +304,10 @@ Status HloModuleGroupMetadata::RecordInstructions() {
       tracked_instructions_[hlo->while_body()] =
           TrackedInstruction(hlo, ComputationKind::kWhileBody);
     } else if (hlo->opcode() == HloOpcode::kConditional) {
-      tracked_instructions_[hlo->true_computation()] =
-          TrackedInstruction(hlo, ComputationKind::kConditionalTrue);
-      tracked_instructions_[hlo->false_computation()] =
-          TrackedInstruction(hlo, ComputationKind::kConditionalFalse);
+      for (int b = 0; b < hlo->branch_count(); ++b) {
+        tracked_instructions_[hlo->branch_computation(b)] =
+            TrackedInstruction(hlo, ComputationKind::kConditionalBranch, b);
+      }
     } else if (hlo->opcode() == HloOpcode::kCall) {
       tracked_instructions_[hlo->to_apply()] =
           TrackedInstruction(hlo, ComputationKind::kCallFunction);
@@ -389,9 +386,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
                instruction1->opcode() == HloOpcode::kCall);
   VLOG(2) << "adding as companions:" << instruction1->ToString() << " and "
           << instruction2->ToString();
-
-  if (!ContainsKey(companion_set_index_, instruction1) &&
-      !ContainsKey(companion_set_index_, instruction2)) {
+  if (instruction1 == instruction2) {
+    return Status::OK();
+  } else if (!ContainsKey(companion_set_index_, instruction1) &&
+             !ContainsKey(companion_set_index_, instruction2)) {
     companion_sets_.push_back(
         absl::make_unique<std::vector<HloInstruction*>>());
     auto companion_set = companion_sets_.back().get();
@@ -419,7 +417,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
     for (HloInstruction* hlo : Companions(instruction2)) {
       companion_set_index_[hlo] = companion_set_index_[instruction1];
     }
-    companion_sets_.erase(companion_sets_.begin() + index_to_remove);
+    // We can't remove the set from the vector because companion_set_index_
+    // references sets by their index in this vector, so we reset to nullptr
+    // instead.
+    companion_sets_[index_to_remove].reset(nullptr);
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 3ed95c10504141139d83eb8679a0b8144b15ad0d..07becfc3638a550b661e2ee0d4f10ac5e836e481 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -67,8 +67,7 @@ class HloModuleGroupMetadata {
     kInvalid,
     kWhileCondition,
     kWhileBody,
-    kConditionalTrue,
-    kConditionalFalse,
+    kConditionalBranch,
     kCallFunction,
   };
 
@@ -80,12 +79,13 @@ class HloModuleGroupMetadata {
   class TrackedInstruction {
    public:
     TrackedInstruction() = default;
-    TrackedInstruction(HloInstruction* instruction, ComputationKind kind)
-        : instruction_(instruction), kind_(kind) {}
+    TrackedInstruction(HloInstruction* instruction, ComputationKind kind,
+                       int index = -1)
+        : instruction_(instruction), kind_(kind), index_(index) {}
 
     bool operator==(const TrackedInstruction& rhs) const {
       return instruction_->opcode() == rhs.instruction_->opcode() &&
-             kind_ == rhs.kind_;
+             kind_ == rhs.kind_ && index_ == rhs.index_;
     }
     bool operator!=(const TrackedInstruction& rhs) const {
       return !operator==(rhs);
@@ -98,6 +98,7 @@ class HloModuleGroupMetadata {
    private:
     HloInstruction* instruction_ = nullptr;
     ComputationKind kind_ = ComputationKind::kInvalid;
+    int index_ = -1;
   };
 
   // Represents a channel and the instructions that form the channel.
@@ -173,7 +174,8 @@ class HloModuleGroupMetadata {
   // Returns the number of modules for devices (excluding the host module).
   int64 GetDeviceModulesCount() const;
 
-  // Returns the companion instructions for the given instruction.
+  // Returns the companion set for the given instruction, including the
+  // instruction itself.
   //
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::vector<HloInstruction*>& Companions(
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index 91417bd2d9a6ca8a5192a37302e6a91e49a94d77..b2a361f504cb341bea04f50557099e46da3610d4 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -207,6 +207,39 @@ std::vector<HloInstruction*> HloModuleGroupUtil::RootInstructions(
   return roots;
 }
 
+string HloModuleGroupUtil::CycleToString(HloInstruction* init_instruction) {
+  std::vector<string> names;
+  absl::flat_hash_set<HloInstruction*> seen;
+
+  std::function<bool(HloInstruction*)> helper =
+      [&](HloInstruction* instruction) {
+        if (seen.find(instruction) != seen.end()) {
+          if (instruction == init_instruction) {
+            names.push_back(instruction->name());
+            return true;
+          }
+          return false;
+        }
+        seen.insert(instruction);
+        for (HloInstruction* predecessor : GlobalPredecessors(instruction)) {
+          bool init_found = helper(predecessor);
+          if (init_found) {
+            names.push_back(instruction->name());
+            return true;
+          }
+        }
+        return false;
+      };
+
+  helper(init_instruction);
+  std::vector<string> pieces;
+  pieces.reserve(names.size());
+  for (auto name : names) {
+    pieces.push_back(name);
+  }
+  return absl::StrJoin(pieces, " --> ");
+}
+
 Status HloModuleGroupUtil::VisitTopologicalOrder(
     VisitStates* visit_state, const VisitFunction& visit_function,
     HloInstruction* root) {
@@ -269,22 +302,9 @@ Status HloModuleGroupUtil::VisitTopologicalOrder(
         // a cycle. Generate an error with the list of instructions in the
         // cycle.
         if ((*visit_state)[predecessor] == VisitState::kVisiting) {
-          string cyclic_instructions;
-          for (const auto& state : *visit_state) {
-            if (state.second == VisitState::kVisiting) {
-              absl::StrAppend(&cyclic_instructions, state.first->ToString(),
-                              "\n");
-            }
-          }
-          // TODO(b/64305524): Improve the error message to print out the
-          // instructions in a deterministic order that forms the cycle.
           return FailedPrecondition(
-              "Cross-computation cycle detected via communicating nodes. The "
-              "cycle contains the node %s. The cycle is found among the "
-              "following nodes. Note that the order of the nodes is arbitrary "
-              "and that the list may include nodes that are not part of the "
-              "cycle.\n%s",
-              predecessor->ToString(), cyclic_instructions);
+              "Cross-computation cycle detected via communicating nodes.\n%s",
+              CycleToString(predecessor));
         }
         stack.push(predecessor);
       }
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h
index 862666b48c9aa423ba4eeea3052c17fcc1064fd2..d388fe51d0ddf8cebca678d13f2cea96ba8f6114 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h
@@ -108,6 +108,8 @@ class HloModuleGroupUtil {
       HloInstruction* instruction, HloReachabilityMap* reachability_map);
 
  private:
+  string CycleToString(HloInstruction* instruction);
+
   const HloModuleGroupMetadata& metadata_;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index 4551a1c2e259b06818f913cb6a9e782436b7e594..8f459107b3262d6b110258d784e101e52b6f8f0d 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -44,36 +44,29 @@ StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name) {
   return it->second;
 }
 
-#define CHECK_DEFAULT(property_name, opcode_name) false
-#define CHECK_PROPERTY(property_name, opcode_name, value) \
-  (value & property_name)
-#define RESOLVE(_1, _2, target, ...) target
-#define HAS_PROPERTY(property, ...) \
-  RESOLVE(__VA_ARGS__, CHECK_PROPERTY, CHECK_DEFAULT)(property, __VA_ARGS__)
-
 bool HloOpcodeIsComparison(HloOpcode opcode) {
-  switch (opcode) {
-#define CASE_IS_COMPARISON(enum_name, ...) \
-  case HloOpcode::enum_name:               \
-    return HAS_PROPERTY(kHloOpcodeIsComparison, __VA_ARGS__);
-    HLO_OPCODE_LIST(CASE_IS_COMPARISON)
-#undef CASE_IS_COMPARISON
-  }
+  return opcode == HloOpcode::kCompare;
 }
 
 bool HloOpcodeIsVariadic(HloOpcode opcode) {
   switch (opcode) {
-#define CASE_IS_VARIADIC(enum_name, ...) \
-  case HloOpcode::enum_name:             \
-    return HAS_PROPERTY(kHloOpcodeIsVariadic, __VA_ARGS__);
+#define CASE_IS_VARIADIC(enum_name, opcode_name, arity, ...) \
+  case HloOpcode::enum_name:                                 \
+    return arity == kHloOpcodeIsVariadic;
     HLO_OPCODE_LIST(CASE_IS_VARIADIC)
 #undef CASE_IS_VARIADIC
   }
 }
 
-#undef HAS_PROPERTY
-#undef RESOLVE
-#undef CHECK_DEFAULT
-#undef CHECK_PROPERTY
+absl::optional<int> HloOpcodeArity(HloOpcode opcode) {
+  switch (opcode) {
+#define CASE_ARITY(enum_name, opcode_name, arity, ...)   \
+  case HloOpcode::enum_name:                             \
+    return arity == kHloOpcodeIsVariadic ? absl::nullopt \
+                                         : absl::make_optional(arity);
+    HLO_OPCODE_LIST(CASE_ARITY)
+#undef CASE_ARITY
+  }
+}
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 94122ac38ff2a3f7053b19e55f9a400c80ae2134..c5ccd49552a87512547b72b6ae1830e582521125 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -18,8 +18,11 @@ limitations under the License.
 
 #include <iosfwd>
 #include <string>
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/comparison_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
@@ -30,9 +33,9 @@ namespace xla {
 // See the XLA documentation for the semantics of each opcode.
 //
 // Each entry has the format:
-// (enum_name, opcode_name)
+// (enum_name, opcode_name, arity)
 // or
-// (enum_name, opcode_name, p1 | p2 | ...)
+// (enum_name, opcode_name, arity, p1 | p2 | ...)
 //
 // with p1, p2, ... are members of HloOpcodeProperty. They are combined
 // using bitwise-or.
@@ -44,102 +47,102 @@ namespace xla {
 // - In fully qualified names (HloInstruction::FullyQualifiedName()), to
 //   separate the qualifiers (name of the computation and potentially the
 //   fusion instruction) from the name
-#define HLO_OPCODE_LIST(V)                                   \
-  V(kAbs, "abs")                                             \
-  V(kAdd, "add")                                             \
-  V(kAddDependency, "add-dependency")                        \
-  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
-  V(kAllReduce, "all-reduce")                                \
-  V(kAllToAll, "all-to-all")                                 \
-  V(kAtan2, "atan2")                                         \
-  V(kBatchNormGrad, "batch-norm-grad")                       \
-  V(kBatchNormInference, "batch-norm-inference")             \
-  V(kBatchNormTraining, "batch-norm-training")               \
-  V(kBitcast, "bitcast")                                     \
-  V(kBitcastConvert, "bitcast-convert")                      \
-  V(kBroadcast, "broadcast")                                 \
-  V(kCall, "call", kHloOpcodeIsVariadic)                     \
-  V(kCeil, "ceil")                                           \
-  V(kClamp, "clamp")                                         \
-  V(kCollectivePermute, "collective-permute")                \
-  V(kClz, "count-leading-zeros")                             \
-  V(kComplex, "complex")                                     \
-  V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
-  V(kConditional, "conditional")                             \
-  V(kConstant, "constant")                                   \
-  V(kConvert, "convert")                                     \
-  V(kConvolution, "convolution")                             \
-  V(kCopy, "copy")                                           \
-  V(kCos, "cosine")                                          \
-  V(kCustomCall, "custom-call")                              \
-  V(kDivide, "divide")                                       \
-  V(kDomain, "domain")                                       \
-  V(kDot, "dot")                                             \
-  V(kDynamicSlice, "dynamic-slice")                          \
-  V(kDynamicUpdateSlice, "dynamic-update-slice")             \
-  V(kEq, "equal-to", kHloOpcodeIsComparison)                 \
-  V(kExp, "exponential")                                     \
-  V(kExpm1, "exponential-minus-one")                         \
-  V(kFft, "fft")                                             \
-  V(kFloor, "floor")                                         \
-  V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
-  V(kGather, "gather")                                       \
-  V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
-  V(kGetDimensionSize, "get-dimension-size")                 \
-  V(kGetTupleElement, "get-tuple-element")                   \
-  V(kGt, "greater-than", kHloOpcodeIsComparison)             \
-  V(kImag, "imag")                                           \
-  V(kInfeed, "infeed")                                       \
-  V(kIota, "iota")                                           \
-  V(kIsFinite, "is-finite")                                  \
-  V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison)    \
-  V(kLog, "log")                                             \
-  V(kLog1p, "log-plus-one")                                  \
-  V(kAnd, "and")                                             \
-  V(kNot, "not")                                             \
-  V(kOr, "or")                                               \
-  V(kXor, "xor")                                             \
-  V(kLt, "less-than", kHloOpcodeIsComparison)                \
-  V(kMap, "map", kHloOpcodeIsVariadic)                       \
-  V(kMaximum, "maximum")                                     \
-  V(kMinimum, "minimum")                                     \
-  V(kMultiply, "multiply")                                   \
-  V(kNe, "not-equal-to", kHloOpcodeIsComparison)             \
-  V(kNegate, "negate")                                       \
-  V(kOutfeed, "outfeed")                                     \
-  V(kPad, "pad")                                             \
-  V(kParameter, "parameter")                                 \
-  V(kPower, "power")                                         \
-  V(kReal, "real")                                           \
-  V(kRecv, "recv")                                           \
-  V(kRecvDone, "recv-done")                                  \
-  V(kReduce, "reduce")                                       \
-  V(kReducePrecision, "reduce-precision")                    \
-  V(kReduceWindow, "reduce-window")                          \
-  V(kRemainder, "remainder")                                 \
-  V(kReshape, "reshape")                                     \
-  V(kReverse, "reverse")                                     \
-  V(kRng, "rng")                                             \
-  V(kRoundNearestAfz, "round-nearest-afz")                   \
-  V(kScatter, "scatter")                                     \
-  V(kSelect, "select")                                       \
-  V(kSelectAndScatter, "select-and-scatter")                 \
-  V(kSend, "send")                                           \
-  V(kSendDone, "send-done")                                  \
-  V(kShiftLeft, "shift-left")                                \
-  V(kShiftRightArithmetic, "shift-right-arithmetic")         \
-  V(kShiftRightLogical, "shift-right-logical")               \
-  V(kSign, "sign")                                           \
-  V(kSin, "sine")                                            \
-  V(kSlice, "slice")                                         \
-  V(kSort, "sort")                                           \
-  V(kSubtract, "subtract")                                   \
-  V(kTanh, "tanh")                                           \
-  V(kTrace, "trace")                                         \
-  V(kTranspose, "transpose")                                 \
-  V(kTuple, "tuple", kHloOpcodeIsVariadic)                   \
-  V(kTupleSelect, "tuple-select")                            \
-  V(kWhile, "while")
+#define HLO_OPCODE_LIST(V)                                             \
+  V(kAbs, "abs", 1)                                                    \
+  V(kAdd, "add", 2)                                                    \
+  V(kAddDependency, "add-dependency", 2)                               \
+  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)                      \
+  V(kAllReduce, "all-reduce", kHloOpcodeIsVariadic)                    \
+  V(kAllToAll, "all-to-all", kHloOpcodeIsVariadic)                     \
+  V(kAtan2, "atan2", 2)                                                \
+  V(kBatchNormGrad, "batch-norm-grad", 5)                              \
+  V(kBatchNormInference, "batch-norm-inference", 5)                    \
+  V(kBatchNormTraining, "batch-norm-training", 3)                      \
+  V(kBitcast, "bitcast", 1)                                            \
+  V(kBitcastConvert, "bitcast-convert", 1)                             \
+  V(kBroadcast, "broadcast", 1)                                        \
+  V(kCall, "call", kHloOpcodeIsVariadic)                               \
+  V(kCeil, "ceil", 1)                                                  \
+  V(kCholesky, "cholesky", 1)                                          \
+  V(kClamp, "clamp", 3)                                                \
+  V(kCollectivePermute, "collective-permute", 1)                       \
+  V(kClz, "count-leading-zeros", 1)                                    \
+  V(kCompare, "compare", 2)                                            \
+  V(kComplex, "complex", 2)                                            \
+  V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)                 \
+  V(kConditional, "conditional", kHloOpcodeIsVariadic)                 \
+  V(kConstant, "constant", 0)                                          \
+  V(kConvert, "convert", 1)                                            \
+  V(kConvolution, "convolution", 2)                                    \
+  V(kCopy, "copy", 1)                                                  \
+  V(kCos, "cosine", 1)                                                 \
+  V(kCustomCall, "custom-call", kHloOpcodeIsVariadic)                  \
+  V(kDivide, "divide", 2)                                              \
+  V(kDomain, "domain", 1)                                              \
+  V(kDot, "dot", 2)                                                    \
+  V(kDynamicSlice, "dynamic-slice", kHloOpcodeIsVariadic)              \
+  V(kDynamicUpdateSlice, "dynamic-update-slice", kHloOpcodeIsVariadic) \
+  V(kExp, "exponential", 1)                                            \
+  V(kExpm1, "exponential-minus-one", 1)                                \
+  V(kFft, "fft", 1)                                                    \
+  V(kFloor, "floor", 1)                                                \
+  V(kFusion, "fusion", kHloOpcodeIsVariadic)                           \
+  V(kGather, "gather", 2)                                              \
+  V(kGetDimensionSize, "get-dimension-size", 1)                        \
+  V(kGetTupleElement, "get-tuple-element", 1)                          \
+  V(kImag, "imag", 1)                                                  \
+  V(kInfeed, "infeed", 1)                                              \
+  V(kIota, "iota", 0)                                                  \
+  V(kIsFinite, "is-finite", 1)                                         \
+  V(kLog, "log", 1)                                                    \
+  V(kLog1p, "log-plus-one", 1)                                         \
+  V(kAnd, "and", 2)                                                    \
+  V(kNot, "not", 1)                                                    \
+  V(kOr, "or", 2)                                                      \
+  V(kXor, "xor", 2)                                                    \
+  V(kMap, "map", kHloOpcodeIsVariadic)                                 \
+  V(kMaximum, "maximum", 2)                                            \
+  V(kMinimum, "minimum", 2)                                            \
+  V(kMultiply, "multiply", 2)                                          \
+  V(kNegate, "negate", 1)                                              \
+  V(kOutfeed, "outfeed", 2)                                            \
+  V(kPad, "pad", 2)                                                    \
+  V(kParameter, "parameter", 0)                                        \
+  V(kPower, "power", 2)                                                \
+  V(kReal, "real", 1)                                                  \
+  V(kRecv, "recv", 1)                                                  \
+  V(kRecvDone, "recv-done", 1)                                         \
+  V(kReduce, "reduce", kHloOpcodeIsVariadic)                           \
+  V(kReducePrecision, "reduce-precision", 1)                           \
+  V(kReduceWindow, "reduce-window", 2)                                 \
+  V(kRemainder, "remainder", 2)                                        \
+  V(kReplicaId, "replica-id", 0)                                       \
+  V(kReshape, "reshape", 1)                                            \
+  V(kReverse, "reverse", 1)                                            \
+  V(kRng, "rng", kHloOpcodeIsVariadic)                                 \
+  V(kRoundNearestAfz, "round-nearest-afz", 1)                          \
+  V(kRsqrt, "rsqrt", 1)                                                \
+  V(kScatter, "scatter", 3)                                            \
+  V(kSelect, "select", 3)                                              \
+  V(kSelectAndScatter, "select-and-scatter", 3)                        \
+  V(kSend, "send", 2)                                                  \
+  V(kSendDone, "send-done", 1)                                         \
+  V(kShiftLeft, "shift-left", 2)                                       \
+  V(kShiftRightArithmetic, "shift-right-arithmetic", 2)                \
+  V(kShiftRightLogical, "shift-right-logical", 2)                      \
+  V(kSign, "sign", 1)                                                  \
+  V(kSin, "sine", 1)                                                   \
+  V(kSlice, "slice", 1)                                                \
+  V(kSort, "sort", kHloOpcodeIsVariadic)                               \
+  V(kSqrt, "sqrt", 1)                                                  \
+  V(kSubtract, "subtract", 2)                                          \
+  V(kTanh, "tanh", 1)                                                  \
+  V(kTrace, "trace", 1)                                                \
+  V(kTranspose, "transpose", 1)                                        \
+  V(kTriangularSolve, "triangular-solve", 2)                           \
+  V(kTuple, "tuple", kHloOpcodeIsVariadic)                             \
+  V(kTupleSelect, "tuple-select", 3)                                   \
+  V(kWhile, "while", 1)
 
 enum class HloOpcode {
 #define DECLARE_ENUM(enum_name, opcode_name, ...) enum_name,
@@ -147,12 +150,16 @@ enum class HloOpcode {
 #undef DECLARE_ENUM
 };
 
+// Arity value that denotes that an operator is variadic.
+enum {
+  kHloOpcodeIsVariadic = -1,
+};
+
 // List of properties associated with opcodes.
 // Properties are defined as increasing powers of two, so that we can use
 // bitwise-or to combine properties, and bitwise-and to test for them.
 enum HloOpcodeProperty {
   kHloOpcodeIsComparison = 1 << 0,
-  kHloOpcodeIsVariadic = 1 << 1,
 };
 
 // Returns a string representation of the opcode.
@@ -171,6 +178,10 @@ bool HloOpcodeIsComparison(HloOpcode opcode);
 // Returns true iff the given opcode has variadic operands.
 bool HloOpcodeIsVariadic(HloOpcode opcode);
 
+// Returns the arity of opcode. If the opcode is variadic,
+// returns nullopt.
+absl::optional<int> HloOpcodeArity(HloOpcode opcode);
+
 // Returns the number of HloOpcode values.
 inline const uint32_t HloOpcodeCount() {
 #define HLO_COUNT_ONE(...) +1
diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
index 6f3f83f63a05fafaa3f3ddcff8a7cac7cb7b06d5..136e6702b21c05583abaf0e328800cedfd778aa8 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
@@ -42,23 +42,27 @@ TEST(HloOpcodeTest, OpcodeProperties) {
 
     // Test some properties.
     switch (opcode) {
-      case HloOpcode::kEq:
-      case HloOpcode::kNe:
-      case HloOpcode::kGt:
-      case HloOpcode::kLt:
-      case HloOpcode::kGe:
-      case HloOpcode::kLe:
+      case HloOpcode::kCompare:
         EXPECT_TRUE(HloOpcodeIsComparison(opcode));
         break;
       default:
         EXPECT_FALSE(HloOpcodeIsComparison(opcode));
     }
     switch (opcode) {
+      case HloOpcode::kAfterAll:
+      case HloOpcode::kAllReduce:
+      case HloOpcode::kAllToAll:
       case HloOpcode::kCall:
       case HloOpcode::kConcatenate:
+      case HloOpcode::kConditional:
+      case HloOpcode::kCustomCall:
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
       case HloOpcode::kFusion:
       case HloOpcode::kMap:
-      case HloOpcode::kAfterAll:
+      case HloOpcode::kReduce:
+      case HloOpcode::kRng:
+      case HloOpcode::kSort:
       case HloOpcode::kTuple:
         EXPECT_TRUE(HloOpcodeIsVariadic(opcode));
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 0cec61c257bb84e467290fb52ec9063a32ed558d..831771fe63b8dd4c276ad3ec05ea90b4d475e7e0 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -66,24 +66,31 @@ bool HloOrdering::ExecutesBefore(const HloInstruction* a,
     }
   }
 
-  // If the common ancestor is a conditional instruction, even though the true
-  // and false computations are not really ordered per-se, we define the true
-  // computation to be ordered before the false one.
-  // This ensures that buffers can still be shared among the two computations
+  // If the common ancestor is a conditional instruction, even though the branch
+  // computations are not really ordered per-se, we define the 0th branch
+  // computation to be ordered before the 1st one, before the 2nd and so forth.
+  // This ensures that buffers can still be shared among branch computations
   // as they will forcibly have disjoint liveness.
   if (a_ancestor == b_ancestor &&
-      a_ancestor->opcode() == HloOpcode::kConditional) {
-    const HloComputation* true_computation = a_ancestor->true_computation();
-    const HloComputation* false_computation = a_ancestor->false_computation();
-    if (call_graph_->InstructionIsNestedIn(a, true_computation) &&
-        call_graph_->InstructionIsNestedIn(b, false_computation)) {
+      (a_ancestor->opcode() == HloOpcode::kConditional)) {
+    int a_branch = -1;
+    int b_branch = -1;
+    for (int j = 0; j < a_ancestor->branch_count(); ++j) {
+      if (call_graph_->InstructionIsNestedIn(
+              a, a_ancestor->branch_computation(j))) {
+        a_branch = j;
+      }
+      if (call_graph_->InstructionIsNestedIn(
+              b, a_ancestor->branch_computation(j))) {
+        b_branch = j;
+      }
+    }
+    if (a_branch != -1 && a_branch < b_branch) {
       return true;
     }
-    // If 'b' is the conditional ancestor, and 'a' is within the true or false
-    // computations, 'a' executes before 'b'.
-    if (b == a_ancestor &&
-        (call_graph_->InstructionIsNestedIn(a, true_computation) ||
-         call_graph_->InstructionIsNestedIn(a, false_computation))) {
+    // If 'b' is the conditional ancestor, and 'a' is within a branch
+    // computation, 'a' executes before 'b'.
+    if (b == a_ancestor && a_branch != -1) {
       return true;
     }
   }
@@ -144,17 +151,17 @@ bool HloOrdering::IsDefinedBefore(const HloValue& a, const HloValue& b) const {
            b.defining_instruction()->while_condition()))) {
     return true;
   }
-  // If 'b' is a conditional phi and 'a' is in the true or false computation,
-  // then 'a' executes before 'b'.
+  // If 'b' is a conditional phi and 'a' is in some branch computation, then 'a'
+  // executes before 'b'.
   if (b.is_phi() &&
-      b.defining_instruction()->opcode() == HloOpcode::kConditional &&
-      (call_graph_->InstructionIsNestedIn(
-           a.defining_instruction(),
-           b.defining_instruction()->true_computation()) ||
-       call_graph_->InstructionIsNestedIn(
-           a.defining_instruction(),
-           b.defining_instruction()->false_computation()))) {
-    return true;
+      b.defining_instruction()->opcode() == HloOpcode::kConditional) {
+    for (int j = 0; j < b.defining_instruction()->branch_count(); ++j) {
+      if (call_graph_->InstructionIsNestedIn(
+              a.defining_instruction(),
+              b.defining_instruction()->branch_computation(j))) {
+        return true;
+      }
+    }
   }
   return ExecutesBefore(a.defining_instruction(), b.defining_instruction());
 }
@@ -225,17 +232,14 @@ bool HloOrdering::UseIsBeforeValueDefinition(
 
   if (use.instruction->opcode() == HloOpcode::kConditional) {
     const HloInstruction* conditional = use.instruction;
-    if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
-                                           conditional->true_computation())) {
-      VLOG(4) << "  use is conditional " << use.instruction->name()
-              << " and def is in TRUE computation";
-      return true;
-    }
-    if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
-                                           conditional->false_computation())) {
-      VLOG(4) << "  use is conditional " << use.instruction->name()
-              << " and def is in FALSE computation";
-      return true;
+    for (int j = 0; j < conditional->branch_count(); ++j) {
+      if (call_graph_->InstructionIsNestedIn(
+              value.defining_instruction(),
+              conditional->branch_computation(j))) {
+        VLOG(4) << "  use is conditional " << use.instruction->name()
+                << " and def is in " << j << "th branch computation";
+        return true;
+      }
     }
     if (value.defining_instruction() == use.instruction) {
       VLOG(4) << "  use is conditional " << use << " and def is "
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 3ca77e60cd5275c22eb0e338cd5437fc44b49958..8e8b9d663ea2540dec0b1011c32204c52ef6beca 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -306,7 +306,7 @@ condition.v4 {
   constant.2 = s32[] constant(2)
   prev.2 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) parameter(0)
   get-tuple-element.8 = s32[] get-tuple-element(prev.2), index=0
-  ROOT greater-than = pred[] greater-than(constant.2, get-tuple-element.8)
+  ROOT greater-than = pred[] compare(constant.2, get-tuple-element.8), direction=GT
 }
 
 fused_computation {
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 638396308c2a9c1f20e47f78b594d54f07c0c4e5..8e76a1f262e988d19a0689f4c691844d9f28a559 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include <type_traits>
 
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
@@ -21,17 +22,21 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
+#include "absl/types/span.h"
+#include "absl/types/variant.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_lexer.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace xla {
 
@@ -44,8 +49,6 @@ using absl::StrCat;
 using absl::StrFormat;
 using absl::StrJoin;
 
-const double kF16max = 65504;
-
 // Creates and returns a schedule created using the order of the instructions in
 // the HloComputation::instructions() vectors in the module.
 HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
@@ -60,6 +63,10 @@ HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
   return schedule;
 }
 
+// Some functions accept either a linear index or a multi-dimensional index
+// (used for indexing into sparse literals).
+using LinearOrMultiIndex = absl::variant<int64, absl::Span<const int64>>;
+
 // Parser for the HloModule::ToString() format text.
 class HloParser {
  public:
@@ -77,6 +84,7 @@ class HloParser {
   // Stand alone parsing utils for various aggregate data types.
   StatusOr<Shape> ParseShapeOnly();
   StatusOr<HloSharding> ParseShardingOnly();
+  StatusOr<std::vector<bool>> ParseParameterReplicationOnly();
   StatusOr<Window> ParseWindowOnly();
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
   StatusOr<PaddingConfig> ParsePaddingConfigOnly();
@@ -102,7 +110,7 @@ class HloParser {
   // Parse a single instruction worth of text.
   bool ParseSingleInstruction(HloModule* module);
 
-  // ParseXXX returns false if an error occurred.
+  // Parses a module, returning false if an error occurred.
   bool ParseHloModule(HloModule* module);
 
   bool ParseComputations(HloModule* module);
@@ -118,21 +126,30 @@ class HloParser {
   bool ParseNonTupleLiteral(Literal* literal, const Shape& shape);
   bool ParseDenseLiteral(Literal* literal, const Shape& shape);
   bool ParseSparseLiteral(Literal* literal, const Shape& shape);
-  template <typename LiteralNativeT>
-  bool ParseSparseLiteralHelper(Literal* literal, const Shape& shape);
 
-  // Sets the sub-value of literal at the given index to the given value. The
-  // literal's shape must have the default layout.
-  bool SetValueInLiteral(tensorflow::int64 value,
-                         tensorflow::int64 linear_index, Literal* literal);
-  bool SetValueInLiteral(double value, tensorflow::int64 linear_index,
+  // Sets the sub-value of literal at the given linear or sparse index to the
+  // given value. If the literal is dense, it myst have the default layout.
+  //
+  // `loc` should be the source location of the value.
+  bool SetValueInLiteral(LocTy loc, int64 value, LinearOrMultiIndex index,
                          Literal* literal);
-  bool SetValueInLiteral(bool value, tensorflow::int64 linear_index,
+  bool SetValueInLiteral(LocTy loc, double value, LinearOrMultiIndex index,
                          Literal* literal);
+  bool SetValueInLiteral(LocTy loc, bool value, LinearOrMultiIndex index,
+                         Literal* literal);
+  bool SetValueInLiteral(LocTy loc, std::complex<double> value,
+                         LinearOrMultiIndex index, Literal* literal);
+  // `loc` should be the source location of the value.
   template <typename LiteralNativeT, typename ParsedElemT>
-  bool SetValueInLiteralHelper(ParsedElemT value,
-                               tensorflow::int64 linear_index,
-                               Literal* literal);
+  bool SetValueInLiteralHelper(LocTy loc, ParsedElemT value,
+                               LinearOrMultiIndex index, Literal* literal);
+
+  // Checks whether the given value is within the range of LiteralNativeT.
+  // `loc` should be the source location of the value.
+  template <typename LiteralNativeT, typename ParsedElemT>
+  bool CheckParsedValueIsInRange(LocTy loc, ParsedElemT value);
+  template <typename LiteralNativeT>
+  bool CheckParsedValueIsInRange(LocTy loc, std::complex<double> value);
 
   bool ParseOperands(std::vector<HloInstruction*>* operands);
   // Fills parsed operands into 'operands' and expects a certain number of
@@ -143,9 +160,9 @@ class HloParser {
   // Describes the start, limit, and stride on every dimension of the operand
   // being sliced.
   struct SliceRanges {
-    std::vector<tensorflow::int64> starts;
-    std::vector<tensorflow::int64> limits;
-    std::vector<tensorflow::int64> strides;
+    std::vector<int64> starts;
+    std::vector<int64> limits;
+    std::vector<int64> strides;
   };
 
   // The data parsed for the kDomain instruction.
@@ -164,10 +181,13 @@ class HloParser {
     kBracedInt64List,
     kBracedInt64ListList,
     kHloComputation,
+    kBracedHloComputationList,
     kFftType,
+    kComparisonDirection,
     kWindow,
     kConvolutionDimensionNumbers,
     kSharding,
+    kParameterReplication,
     kInstructionList,
     kSliceRanges,
     kPaddingConfig,
@@ -220,6 +240,21 @@ class HloParser {
   bool ParseAttributeHelper(const std::unordered_map<string, AttrConfig>& attrs,
                             std::unordered_set<string>* seen_attrs);
 
+  // Parses an attribute string into a protocol buffer `message`.
+  // Since proto3 has no notion of mandatory fields, `required_attrs` gives the
+  // set of mandatory attributes.
+  bool ParseAttributesAsProtoMessage(
+      const std::unordered_set<string>& required_attrs,
+      tensorflow::protobuf::Message* message);
+
+  // Parses one attribute. If it has already been seen, return error. Returns
+  // true and adds to seen_attrs on success.
+  //
+  // Do not call this except in ParseAttributesAsProtoMessage.
+  bool ParseAttributeAsProtoMessageHelper(
+      tensorflow::protobuf::Message* message,
+      std::unordered_set<string>* seen_attrs);
+
   // Parses a name and finds the corresponding hlo computation.
   bool ParseComputationName(HloComputation** value);
   // Parses a list of names and finds the corresponding hlo instructions.
@@ -232,21 +267,23 @@ class HloParser {
   bool ParseMetadata(OpMetadata* metadata);
   bool ParseSharding(OpSharding* sharding);
   bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
+  bool ParseParameterReplication(ParameterReplication* parameter_replication);
 
   // Parses the metadata behind a kDOmain instruction.
   bool ParseDomain(DomainData* domain);
 
   // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
-  bool ParseDxD(const string& name, std::vector<tensorflow::int64>* result);
+  bool ParseDxD(const string& name, std::vector<int64>* result);
   // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
-  bool ParseWindowPad(std::vector<std::vector<tensorflow::int64>>* pad);
+  bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParsePrecisionList(std::vector<PrecisionConfig::Precision>* result);
+  bool ParseHloComputation(HloComputation** result);
+  bool ParseHloComputationList(std::vector<HloComputation*>* result);
   bool ParseShapeList(std::vector<Shape>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
-                      const TokKind delim,
-                      std::vector<tensorflow::int64>* result);
+                      const TokKind delim, std::vector<int64>* result);
   // 'parse_and_add_item' is an lambda to parse an element in the list and add
   // the parsed element to the result. It's supposed to capture the result.
   bool ParseList(const TokKind start, const TokKind end, const TokKind delim,
@@ -261,13 +298,16 @@ class HloParser {
                            std::vector<bool>* dynamic_dimensions);
   bool ParseShape(Shape* result);
   bool ParseLayout(Layout* layout);
+  bool ParseTiles(std::vector<Tile>* tiles);
   bool ParseOpcode(HloOpcode* result);
   bool ParseFftType(FftType* result);
+  bool ParseComparisonDirection(ComparisonDirection* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
   bool ParseRandomDistribution(RandomDistribution* result);
   bool ParsePrecision(PrecisionConfig::Precision* result);
-  bool ParseInt64(tensorflow::int64* result);
+  bool ParseInt64(int64* result);
   bool ParseDouble(double* result);
+  bool ParseComplex(std::complex<double>* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
 
@@ -627,6 +667,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   std::unordered_map<string, AttrConfig> attrs;
   optional<OpSharding> sharding;
   attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding};
+  optional<ParameterReplication> parameter_replication;
+  attrs["parameter_replication"] = {/*required=*/false,
+                                    AttrTy::kParameterReplication,
+                                    &parameter_replication};
   optional<std::vector<HloInstruction*>> predecessors;
   attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList,
                                    &predecessors};
@@ -640,11 +684,17 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
-      tensorflow::int64 parameter_number;
+      int64 parameter_number;
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before parameter number") ||
-          !ParseInt64(&parameter_number) ||
-          !ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
+          !ParseInt64(&parameter_number)) {
+        return false;
+      }
+      if (parameter_number < 0) {
+        Error(lexer_.GetLoc(), "parameter number must be >= 0");
+        return false;
+      }
+      if (!ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
           !ParseAttributes(attrs)) {
         return false;
       }
@@ -666,7 +716,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kIota: {
-      optional<tensorflow::int64> iota_dimension;
+      optional<int64> iota_dimension;
       attrs["iota_dimension"] = {/*required=*/true, AttrTy::kInt64,
                                  &iota_dimension};
       if (!ParseOperands(&operands, /*expected_size=*/0) ||
@@ -695,8 +745,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -713,12 +765,6 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kSubtract:
     case HloOpcode::kAtan2:
     case HloOpcode::kComplex:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kNe:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -830,6 +876,14 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           HloInstruction::CreateCollectivePermute(shape, operands[0], pairs));
       break;
     }
+    case HloOpcode::kReplicaId: {
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateReplicaId());
+      break;
+    }
     case HloOpcode::kReshape: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -861,17 +915,21 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSort: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
+      optional<bool> is_stable = false;
+      attrs["is_stable"] = {/*required=*/false, AttrTy::kBool, &is_stable};
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
           dimensions->size() != 1) {
         return false;
       }
-      instruction = builder->AddInstruction(HloInstruction::CreateSort(
-          shape, dimensions->at(0),
-          /*keys=*/operands[0],
-          /*values=*/absl::Span<HloInstruction* const>(operands).subspan(1)));
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateSort(shape, dimensions->at(0), operands,
+                                     to_apply.value(), is_stable.value()));
       break;
     }
     case HloOpcode::kTuple: {
@@ -897,7 +955,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecv: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -913,7 +971,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecvDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -931,7 +989,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSend: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -946,7 +1004,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSendDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -964,7 +1022,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGetTupleElement: {
-      optional<tensorflow::int64> index;
+      optional<int64> index;
       attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -1047,7 +1105,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     }
     case HloOpcode::kFft: {
       optional<FftType> fft_type;
-      optional<std::vector<tensorflow::int64>> fft_length;
+      optional<std::vector<int64>> fft_length;
       attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type};
       attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &fft_length};
@@ -1059,8 +1117,43 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           shape, operands[0], *fft_type, *fft_length));
       break;
     }
+    case HloOpcode::kTriangularSolve: {
+      TriangularSolveOptions options;
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributesAsProtoMessage(
+              /*required_attrs=*/std::unordered_set<string>(), &options)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateTriangularSolve(
+              shape, operands[0], operands[1], options));
+      break;
+    }
+    case HloOpcode::kCompare: {
+      optional<ComparisonDirection> direction;
+      attrs["direction"] = {/*required=*/true, AttrTy::kComparisonDirection,
+                            &direction};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateCompare(
+          shape, operands[0], operands[1], *direction));
+      break;
+    }
+    case HloOpcode::kCholesky: {
+      CholeskyOptions options;
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributesAsProtoMessage(
+              /*required_attrs=*/std::unordered_set<string>(), &options)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateCholesky(shape, operands[0], options));
+      break;
+    }
     case HloOpcode::kBroadcast: {
-      optional<std::vector<tensorflow::int64>> broadcast_dimensions;
+      optional<std::vector<int64>> broadcast_dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &broadcast_dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1072,7 +1165,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kConcatenate: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
@@ -1087,7 +1180,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> to_apply;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &to_apply};
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1103,7 +1196,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> reduce_computation;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &reduce_computation};
-      optional<std::vector<tensorflow::int64>> dimensions_to_reduce;
+      optional<std::vector<int64>> dimensions_to_reduce;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions_to_reduce};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1124,7 +1217,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReverse: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1168,7 +1261,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDynamicSlice: {
-      optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
+      optional<std::vector<int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
       LocTy loc = lexer_.GetLoc();
@@ -1207,7 +1300,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kTranspose: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1221,7 +1314,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormTraining: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/3) ||
@@ -1237,7 +1330,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormInference: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1254,7 +1347,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormGrad: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1336,8 +1429,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReducePrecision: {
-      optional<tensorflow::int64> exponent_bits;
-      optional<tensorflow::int64> mantissa_bits;
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
       attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
                                 &exponent_bits};
       attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
@@ -1355,18 +1448,36 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kConditional: {
       optional<HloComputation*> true_computation;
       optional<HloComputation*> false_computation;
-      attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation,
-                                   &true_computation};
-      attrs["false_computation"] = {/*required=*/true, AttrTy::kHloComputation,
-                                    &false_computation};
-      if (!ParseOperands(&operands, /*expected_size=*/3) ||
-          !ParseAttributes(attrs)) {
+      optional<std::vector<HloComputation*>> branch_computations;
+      if (!ParseOperands(&operands)) {
+        return false;
+      }
+      const bool branch_index_is_bool =
+          operands[0]->shape().element_type() == PRED;
+      if (branch_index_is_bool) {
+        attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation,
+                                     &true_computation};
+        attrs["false_computation"] = {
+            /*required=*/true, AttrTy::kHloComputation, &false_computation};
+      } else {
+        attrs["branch_computations"] = {/*required=*/true,
+                                        AttrTy::kBracedHloComputationList,
+                                        &branch_computations};
+      }
+      if (!ParseAttributes(attrs)) {
+        return false;
+      }
+      if (branch_index_is_bool) {
+        branch_computations.emplace({*true_computation, *false_computation});
+      }
+      if (branch_computations->empty() ||
+          operands.size() != branch_computations->size() + 1) {
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateConditional(
-          shape, /*pred=*/operands[0],
-          /*true_computation_arg=*/operands[1], *true_computation,
-          /*false_computation_arg=*/operands[2], *false_computation));
+          shape, /*branch_index=*/operands[0],
+          absl::MakeSpan(*branch_computations),
+          absl::MakeSpan(operands).subspan(1)));
       break;
     }
     case HloOpcode::kCustomCall: {
@@ -1375,6 +1486,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
+      optional<int64> batch_group_count;
       optional<std::vector<Shape>> operand_layout_constraints;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
@@ -1384,6 +1496,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
                                       &feature_group_count};
+      attrs["batch_group_count"] = {/*required=*/false, AttrTy::kInt64,
+                                    &batch_group_count};
       attrs["operand_layout_constraints"] = {
           /*required=*/false, AttrTy::kShapeList, &operand_layout_constraints};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1439,19 +1553,22 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       if (feature_group_count.has_value()) {
         instruction->set_feature_group_count(*feature_group_count);
       }
+      if (batch_group_count.has_value()) {
+        instruction->set_batch_group_count(*batch_group_count);
+      }
       break;
     }
     case HloOpcode::kDot: {
-      optional<std::vector<tensorflow::int64>> lhs_contracting_dims;
+      optional<std::vector<int64>> lhs_contracting_dims;
       attrs["lhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> rhs_contracting_dims;
+      optional<std::vector<int64>> rhs_contracting_dims;
       attrs["rhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> lhs_batch_dims;
+      optional<std::vector<int64>> lhs_batch_dims;
       attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &lhs_batch_dims};
-      optional<std::vector<tensorflow::int64>> rhs_batch_dims;
+      optional<std::vector<int64>> rhs_batch_dims;
       attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &rhs_batch_dims};
       optional<std::vector<PrecisionConfig::Precision>> operand_precision;
@@ -1495,19 +1612,19 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<tensorflow::int64>> offset_dims;
+      optional<std::vector<int64>> offset_dims;
       attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &offset_dims};
-      optional<std::vector<tensorflow::int64>> collapsed_slice_dims;
+      optional<std::vector<int64>> collapsed_slice_dims;
       attrs["collapsed_slice_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims};
-      optional<std::vector<tensorflow::int64>> start_index_map;
+      optional<std::vector<int64>> start_index_map;
       attrs["start_index_map"] = {/*required=*/true, AttrTy::kBracedInt64List,
                                   &start_index_map};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<tensorflow::int64>> slice_sizes;
+      optional<std::vector<int64>> slice_sizes;
       attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &slice_sizes};
 
@@ -1529,17 +1646,17 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kScatter: {
-      optional<std::vector<tensorflow::int64>> update_window_dims;
+      optional<std::vector<int64>> update_window_dims;
       attrs["update_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &update_window_dims};
-      optional<std::vector<tensorflow::int64>> inserted_window_dims;
+      optional<std::vector<int64>> inserted_window_dims;
       attrs["inserted_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &inserted_window_dims};
-      optional<std::vector<tensorflow::int64>> scatter_dims_to_operand_dims;
+      optional<std::vector<int64>> scatter_dims_to_operand_dims;
       attrs["scatter_dims_to_operand_dims"] = {/*required=*/true,
                                                AttrTy::kBracedInt64List,
                                                &scatter_dims_to_operand_dims};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
 
@@ -1580,7 +1697,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
     case HloOpcode::kGetDimensionSize:
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1605,6 +1722,18 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     instruction->set_sharding(
         HloSharding::FromProto(sharding.value()).ValueOrDie());
   }
+  if (parameter_replication) {
+    int leaf_count = ShapeUtil::GetLeafCount(instruction->shape());
+    const auto& replicated =
+        parameter_replication->replicated_at_leaf_buffers();
+    if (leaf_count != replicated.size()) {
+      return Error(lexer_.GetLoc(),
+                   StrCat("parameter has ", leaf_count,
+                          " leaf buffers, but parameter_replication has ",
+                          replicated.size(), " elements."));
+    }
+    instruction->set_parameter_replicated_at_leaf_buffers(replicated);
+  }
   if (predecessors) {
     for (auto* pre : *predecessors) {
       Status status = pre->AddControlDependencyTo(instruction);
@@ -1669,8 +1798,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
-  std::vector<tensorflow::int64> devices;
-  std::vector<tensorflow::int64> tile_assignment_dimensions;
+  std::vector<int64> devices;
+  std::vector<int64> tile_assignment_dimensions;
   while (lexer_.GetKind() != TokKind::kRbrace) {
     switch (lexer_.GetKind()) {
       case TokKind::kw_maximal:
@@ -1696,7 +1825,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           }
 
           do {
-            tensorflow::int64 dim;
+            int64 dim;
             if (!ParseInt64(&dim)) {
               return false;
             }
@@ -1708,7 +1837,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
             return false;
           }
           do {
-            tensorflow::int64 device;
+            int64 device;
             if (!ParseInt64(&device)) {
               return false;
             }
@@ -1752,10 +1881,10 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           "dimensions");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    for (tensorflow::int64 dim : tile_assignment_dimensions) {
+    for (int64 dim : tile_assignment_dimensions) {
       sharding->add_tile_assignment_dimensions(dim);
     }
-    for (tensorflow::int64 device : devices) {
+    for (int64 device : devices) {
       sharding->add_tile_assignment_devices(device);
     }
   }
@@ -1764,6 +1893,32 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   return true;
 }
 
+// parameter_replication ::=
+//   '{' ('true' | 'false')* (',' ('true' | 'false'))*  '}'
+bool HloParser::ParseParameterReplication(
+    ParameterReplication* parameter_replication) {
+  if (!ParseToken(TokKind::kLbrace,
+                  "expected '{' to start parameter_replication attribute")) {
+    return false;
+  }
+
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    do {
+      if (lexer_.GetKind() == TokKind::kw_true) {
+        parameter_replication->add_replicated_at_leaf_buffers(true);
+      } else if (lexer_.GetKind() == TokKind::kw_false) {
+        parameter_replication->add_replicated_at_leaf_buffers(false);
+      } else {
+        return false;
+      }
+      lexer_.Lex();
+    } while (EatIfPresent(TokKind::kComma));
+  }
+
+  return ParseToken(TokKind::kRbrace,
+                    "expected '}' to end parameter_replication attribute");
+}
+
 // domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ','
 //            'exit=' exit_sharding '}'
 bool HloParser::ParseDomain(DomainData* domain) {
@@ -1816,130 +1971,146 @@ bool HloParser::ParseInstructionNames(
                     "expects '}' at the end of instruction name list");
 }
 
-bool HloParser::SetValueInLiteral(tensorflow::int64 value,
-                                  tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, int64 value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case S8:
-      return SetValueInLiteralHelper<tensorflow::int8>(value, linear_index,
-                                                       literal);
+      return SetValueInLiteralHelper<int8>(loc, value, index, literal);
     case S16:
-      return SetValueInLiteralHelper<tensorflow::int16>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int16>(loc, value, index, literal);
     case S32:
-      return SetValueInLiteralHelper<tensorflow::int32>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int32>(loc, value, index, literal);
     case S64:
-      return SetValueInLiteralHelper<tensorflow::int64>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int64>(loc, value, index, literal);
     case U8:
-      return SetValueInLiteralHelper<tensorflow::uint8>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint8>(loc, value, index,
                                                         literal);
     case U16:
-      return SetValueInLiteralHelper<tensorflow::uint16>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint16>(loc, value, index,
                                                          literal);
     case U32:
-      return SetValueInLiteralHelper<tensorflow::uint32>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint32>(loc, value, index,
                                                          literal);
     case U64:
-      return SetValueInLiteralHelper<tensorflow::uint64>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint64>(loc, value, index,
                                                          literal);
     case PRED:
       // Bool type literals with rank >= 1 are printed in 0s and 1s.
-      return SetValueInLiteralHelper<bool>(static_cast<bool>(value),
-                                           linear_index, literal);
+      return SetValueInLiteralHelper<bool>(loc, static_cast<bool>(value), index,
+                                           literal);
     default:
       LOG(FATAL) << "unknown integral primitive type "
                  << PrimitiveType_Name(shape.element_type());
   }
 }
 
-bool HloParser::SetValueInLiteral(double value, tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, double value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case F16:
-      return SetValueInLiteralHelper<Eigen::half>(value, linear_index, literal);
+      return SetValueInLiteralHelper<Eigen::half>(loc, value, index, literal);
     case BF16:
-      return SetValueInLiteralHelper<tensorflow::bfloat16>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::bfloat16>(loc, value, index,
                                                            literal);
     case F32:
-      return SetValueInLiteralHelper<float>(value, linear_index, literal);
+      return SetValueInLiteralHelper<float>(loc, value, index, literal);
     case F64:
-      return SetValueInLiteralHelper<double>(value, linear_index, literal);
+      return SetValueInLiteralHelper<double>(loc, value, index, literal);
     default:
       LOG(FATAL) << "unknown floating point primitive type "
                  << PrimitiveType_Name(shape.element_type());
   }
 }
 
-bool HloParser::SetValueInLiteral(bool value, tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, bool value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case PRED:
-      return SetValueInLiteralHelper<bool>(value, linear_index, literal);
+      return SetValueInLiteralHelper<bool>(loc, value, index, literal);
     default:
       LOG(FATAL) << PrimitiveType_Name(shape.element_type())
                  << " is not PRED type";
   }
 }
 
+bool HloParser::SetValueInLiteral(LocTy loc, std::complex<double> value,
+                                  LinearOrMultiIndex index, Literal* literal) {
+  const Shape& shape = literal->shape();
+  switch (shape.element_type()) {
+    case C64:
+      return SetValueInLiteralHelper<std::complex<float>>(loc, value, index,
+                                                          literal);
+    case C128:
+      return SetValueInLiteralHelper<std::complex<double>>(loc, value, index,
+                                                           literal);
+    default:
+      LOG(FATAL) << PrimitiveType_Name(shape.element_type())
+                 << " is not a complex type type";
+  }
+}
+
+template <typename T>
+string StringifyValue(T val) {
+  return StrCat(val);
+}
+template <>
+string StringifyValue(std::complex<double> val) {
+  return StrFormat("(%f, %f)", std::real(val), std::imag(val));
+}
+
 template <typename LiteralNativeT, typename ParsedElemT>
-bool HloParser::SetValueInLiteralHelper(ParsedElemT value,
-                                        tensorflow::int64 linear_index,
+bool HloParser::SetValueInLiteralHelper(LocTy loc, ParsedElemT value,
+                                        LinearOrMultiIndex index,
                                         Literal* literal) {
-  // Check that linear_index is in range.
-  if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
-    return TokenError(
-        StrCat("trys to set value ", value, " to a literal in shape ",
-               ShapeUtil::HumanString(literal->shape()), " at linear index ",
-               linear_index, ", but the index is out of range"));
+  if (!CheckParsedValueIsInRange<LiteralNativeT>(loc, value)) {
+    return false;
   }
 
-  if (std::isnan(value) ||
-      (std::numeric_limits<ParsedElemT>::has_infinity &&
-       (std::numeric_limits<ParsedElemT>::infinity() == value ||
-        -std::numeric_limits<ParsedElemT>::infinity() == value))) {
-    // Skip range checking for non-finite value.
-  } else if (literal->shape().element_type() == F16 ||
-             literal->shape().element_type() == BF16) {
-    if (value > kF16max || value < -kF16max) {
-      return TokenError(StrCat(
-          "value ", value, " is out of range for literal's primitive type ",
-          PrimitiveType_Name(literal->shape().element_type())));
+  // Check that the index is in range and assign into the literal
+  if (auto* linear_index = absl::get_if<int64>(&index)) {
+    if (*linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
+      return Error(loc, StrCat("trys to set value ", StringifyValue(value),
+                               " to a literal in shape ",
+                               ShapeUtil::HumanString(literal->shape()),
+                               " at linear index ", *linear_index,
+                               ", but the index is out of range"));
     }
-  } else if (std::is_unsigned<LiteralNativeT>::value) {
-    CHECK((std::is_same<ParsedElemT, tensorflow::int64>::value ||
-           std::is_same<ParsedElemT, bool>::value))
-        << "Unimplemented checking for ParsedElemT";
-
-    ParsedElemT upper_bound;
-    if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) {
-      upper_bound = std::numeric_limits<ParsedElemT>::max();
-    } else {
-      upper_bound =
-          static_cast<ParsedElemT>(std::numeric_limits<LiteralNativeT>::max());
+    literal->data<LiteralNativeT>().at(*linear_index) =
+        static_cast<LiteralNativeT>(value);
+  } else {
+    auto* multi_index = absl::get_if<absl::Span<const int64>>(&index);
+    CHECK(multi_index != nullptr);
+
+    auto invalid_idx = [&](string msg) {
+      return Error(loc, StrFormat("Invalid sparse index [%s]. %s",
+                                  absl::StrJoin(*multi_index, ", "), msg));
+    };
+
+    const auto& shape = literal->shape();
+    if (shape.rank() != multi_index->size()) {
+      return invalid_idx(
+          StrFormat("Has rank %d, but constant has shape %s, which has rank %d",
+                    multi_index->size(), shape.ToString(), shape.rank()));
     }
-    if (value > upper_bound || value < 0) {
-      // Value is out of range for LiteralNativeT.
-      return TokenError(StrCat(
-          "value ", value, " is out of range for literal's primitive type ",
-          PrimitiveType_Name(literal->shape().element_type())));
-    }
-  } else if (value > static_cast<ParsedElemT>(
-                         std::numeric_limits<LiteralNativeT>::max()) ||
-             value < static_cast<ParsedElemT>(
-                         std::numeric_limits<LiteralNativeT>::lowest())) {
-    // Value is out of range for LiteralNativeT.
-    return TokenError(StrCat(
-        "value ", value, " is out of range for literal's primitive type ",
-        PrimitiveType_Name(literal->shape().element_type())));
+    for (int64 i = 0; i < shape.rank(); ++i) {
+      auto idx = (*multi_index)[i];
+      if (idx < 0) {
+        return invalid_idx(StrFormat(
+            "Sub-index value at %d, namely %d, cannot be negative.", i, idx));
+      }
+      if (idx >= shape.dimensions(i)) {
+        return invalid_idx(
+            StrFormat("Sub-index at %d, namely %d, doesn't fit within shape "
+                      "dimension %d in %s",
+                      i, idx, shape.dimensions(i), shape.ToString()));
+      }
+    }
+    literal->AppendSparseElement(*multi_index,
+                                 static_cast<LiteralNativeT>(value));
   }
-
-  literal->data<LiteralNativeT>().at(linear_index) =
-      static_cast<LiteralNativeT>(value);
   return true;
 }
 
@@ -1996,12 +2167,16 @@ bool HloParser::ParseNonTupleLiteral(Literal* literal, const Shape& shape) {
 }
 
 bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
-  const tensorflow::int64 rank = shape.rank();
+  // Cast `rank` to int because we call shape.dimensions(int rank) below, and if
+  // `rank` is an int64, that's an implicit narrowing conversion, which is
+  // implementation-defined behavior.
+  const int rank = static_cast<int>(shape.rank());
+
   // Create a literal with the given shape in default layout.
   *literal = LiteralUtil::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
-  tensorflow::int64 nest_level = 0;
-  tensorflow::int64 linear_index = 0;
+  int64 nest_level = 0;
+  int64 linear_index = 0;
   // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for
   // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}},
   // when we are parsing the 2nd '{' (right before '1'), we are seeing a
@@ -2009,17 +2184,35 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
   // the first '}' (right after '3'), it means the sub-array ends, and the
   // sub-array is supposed to contain exactly 3 elements, so check if
   // elems_seen_per_dim[1] is 3.
-  std::vector<tensorflow::int64> elems_seen_per_dim(rank);
+  std::vector<int64> elems_seen_per_dim(rank);
   auto get_index_str = [&elems_seen_per_dim](int dim) -> string {
-    std::vector<tensorflow::int64> elems_seen_until_dim(
-        elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim);
+    std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
+                                            elems_seen_per_dim.begin() + dim);
     return StrCat("[",
                   StrJoin(elems_seen_until_dim, ",",
-                          [](string* out, const tensorflow::int64& num_elems) {
+                          [](string* out, const int64& num_elems) {
                             StrAppend(out, num_elems - 1);
                           }),
                   "]");
   };
+
+  auto add_one_elem_seen = [&] {
+    if (rank > 0) {
+      if (nest_level != rank) {
+        return TokenError(absl::StrFormat(
+            "expects nested array in rank %d, but sees %d", rank, nest_level));
+      }
+      elems_seen_per_dim[rank - 1]++;
+      if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) {
+        return TokenError(absl::StrFormat(
+            "expects %d elements on the minor-most dimension, but "
+            "sees more",
+            shape.dimensions(rank - 1)));
+      }
+    }
+    return true;
+  };
+
   do {
     switch (lexer_.GetKind()) {
       default:
@@ -2055,6 +2248,31 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         lexer_.Lex();
         break;
       }
+      case TokKind::kLparen: {
+        if (!primitive_util::IsComplexType(shape.element_type())) {
+          return TokenError(
+              absl::StrFormat("unexpected '(' in literal.  Parens are only "
+                              "valid for complex literals"));
+        }
+
+        std::complex<double> value;
+        LocTy loc = lexer_.GetLoc();
+        if (!add_one_elem_seen() || !ParseComplex(&value) ||
+            !SetValueInLiteral(loc, value, linear_index++, literal)) {
+          return false;
+        }
+        break;
+      }
+      case TokKind::kDots: {
+        if (nest_level != 1) {
+          return TokenError(absl::StrFormat(
+              "expects `...` at nest level 1, but sees it at nest level %d",
+              nest_level));
+        }
+        elems_seen_per_dim[0] = shape.dimensions(0);
+        lexer_.Lex();
+        break;
+      }
       case TokKind::kComma:
         // Skip.
         lexer_.Lex();
@@ -2066,23 +2284,11 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
       case TokKind::kw_nan:
       case TokKind::kw_inf:
       case TokKind::kNegInf: {
-        if (rank > 0) {
-          if (nest_level != rank) {
-            return TokenError(
-                absl::StrFormat("expects nested array in rank %d, but sees %d",
-                                rank, nest_level));
-          }
-          elems_seen_per_dim[rank - 1]++;
-          if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) {
-            return TokenError(absl::StrFormat(
-                "expects %d elements on the minor-most dimension, but "
-                "sees more",
-                shape.dimensions(rank - 1)));
-          }
-        }
+        add_one_elem_seen();
         if (lexer_.GetKind() == TokKind::kw_true ||
             lexer_.GetKind() == TokKind::kw_false) {
-          if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true,
+          if (!SetValueInLiteral(lexer_.GetLoc(),
+                                 lexer_.GetKind() == TokKind::kw_true,
                                  linear_index++, literal)) {
             return false;
           }
@@ -2090,12 +2296,12 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         } else if (primitive_util::IsIntegralType(shape.element_type()) ||
                    shape.element_type() == PRED) {
           LocTy loc = lexer_.GetLoc();
-          tensorflow::int64 value;
+          int64 value;
           if (!ParseInt64(&value)) {
             return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
           }
-          if (!SetValueInLiteral(value, linear_index++, literal)) {
+          if (!SetValueInLiteral(loc, value, linear_index++, literal)) {
             return false;
           }
         } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
@@ -2106,7 +2312,7 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
                 loc, StrCat("expect floating point value for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
           }
-          if (!SetValueInLiteral(value, linear_index++, literal)) {
+          if (!SetValueInLiteral(loc, value, linear_index++, literal)) {
             return false;
           }
         } else {
@@ -2123,48 +2329,7 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
 }
 
 bool HloParser::ParseSparseLiteral(Literal* literal, const Shape& shape) {
-  switch (shape.element_type()) {
-    case PRED:
-      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
-    case S8:
-      return ParseSparseLiteralHelper<tensorflow::int8>(literal, shape);
-    case S16:
-      return ParseSparseLiteralHelper<tensorflow::int16>(literal, shape);
-    case S32:
-      return ParseSparseLiteralHelper<tensorflow::int32>(literal, shape);
-    case S64:
-      return ParseSparseLiteralHelper<tensorflow::int64>(literal, shape);
-    case U8:
-      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
-    case U16:
-      return ParseSparseLiteralHelper<tensorflow::uint16>(literal, shape);
-    case U32:
-      return ParseSparseLiteralHelper<tensorflow::uint32>(literal, shape);
-    case U64:
-      return ParseSparseLiteralHelper<tensorflow::uint64>(literal, shape);
-    case F16:
-      return ParseSparseLiteralHelper<Eigen::half>(literal, shape);
-    case F32:
-      return ParseSparseLiteralHelper<float>(literal, shape);
-    case BF16:
-      return ParseSparseLiteralHelper<tensorflow::bfloat16>(literal, shape);
-    case F64:
-      return ParseSparseLiteralHelper<double>(literal, shape);
-    default:
-      return Error(lexer_.GetLoc(),
-                   StrCat("invalid primitive type for sparse literal: ",
-                          PrimitiveType_Name(shape.element_type())));
-  }
-}
-
-template <typename LiteralNativeT>
-bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
-  std::vector<tensorflow::int64> index;
-
-  tensorflow::int64 rank = shape.rank();
-
   *literal = Literal(shape);
-
   if (!ParseToken(TokKind::kLbrace,
                   "expects '{' at the beginning of a sparse literal")) {
     return false;
@@ -2176,61 +2341,66 @@ bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
       break;
     }
 
-    LocTy index_loc = lexer_.GetLoc();
-    index.clear();
+    std::vector<int64> index;
     if (lexer_.GetKind() == TokKind::kInt) {
-      tensorflow::int64 single_index = lexer_.GetInt64Val();
+      int64 single_index = lexer_.GetInt64Val();
       lexer_.Lex();
-      if (rank != 1) {
-        return Error(
-            index_loc,
-            StrCat("invalid single-dimensional index for shape with rank ",
-                   rank, ": ", single_index));
-      }
       index.push_back(single_index);
     } else {
       if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
                           &index)) {
         return false;
       }
-      if (index.size() != rank) {
-        return Error(
-            index_loc,
-            StrCat("invalid multi-dimension index for shape with rank ", rank,
-                   ": [", StrJoin(index, ", "), "]"));
-      }
     }
     if (!ParseToken(TokKind::kColon,
                     "expects ':' after after the sparse array index and before "
                     "the sparse array value")) {
       return false;
     }
+
     LocTy value_loc = lexer_.GetLoc();
-    LiteralNativeT value;
     if (lexer_.GetKind() == TokKind::kw_true ||
         lexer_.GetKind() == TokKind::kw_false) {
-      value = static_cast<LiteralNativeT>(lexer_.GetKind() == TokKind::kw_true);
+      bool value = lexer_.GetKind() == TokKind::kw_true;
+      if (!SetValueInLiteral(lexer_.GetLoc(), value, index, literal)) {
+        return false;
+      }
       lexer_.Lex();
     } else if (primitive_util::IsIntegralType(shape.element_type())) {
-      tensorflow::int64 value_s64;
-      if (!ParseInt64(&value_s64)) {
+      int64 value;
+      if (!ParseInt64(&value)) {
         return Error(value_loc,
                      StrCat("expects integer for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
       }
-      value = static_cast<LiteralNativeT>(value_s64);
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
     } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
-      double value_f64;
-      if (!ParseDouble(&value_f64)) {
+      double value;
+      if (!ParseDouble(&value)) {
         return Error(value_loc,
                      StrCat("expects floating point value for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
       }
-      value = static_cast<LiteralNativeT>(value_f64);
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
+    } else if (primitive_util::IsComplexType(shape.element_type())) {
+      std::complex<double> value;
+      if (!ParseComplex(&value)) {
+        return Error(value_loc,
+                     StrCat("expects complex value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
+      }
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
     } else {
       LOG(FATAL) << "Unexpected element type: "
                  << PrimitiveType_Name(shape.element_type());
     }
+
     if (lexer_.GetKind() != TokKind::kRbrace &&
         !ParseToken(TokKind::kComma,
                     "expects ',' separator between sparse array elements")) {
@@ -2244,14 +2414,114 @@ bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
           StrCat("number of sparse elements exceeds maximum for layout: ",
                  ShapeUtil::HumanStringWithLayout(shape)));
     }
-
-    literal->AppendSparseElement(index, value);
   }
 
   literal->SortSparseElements();
   return true;
 }
 
+// MaxFiniteValue is a type-traits helper used by
+// HloParser::CheckParsedValueIsInRange.
+template <typename T>
+struct MinMaxFiniteValue {
+  static T max() { return std::numeric_limits<T>::max(); }
+  static T min() { return std::numeric_limits<T>::lowest(); }
+};
+
+template <>
+struct MinMaxFiniteValue<Eigen::half> {
+  static double max() {
+    // Sadly this is not constexpr, so this forces `value` to be a method.
+    return static_cast<double>(Eigen::NumTraits<Eigen::half>::highest());
+  }
+  static double min() { return -max(); }
+};
+
+template <>
+struct MinMaxFiniteValue<bfloat16> {
+  static double max() { return static_cast<double>(bfloat16::highest()); }
+  static double min() { return -max(); }
+};
+
+template <typename LiteralNativeT, typename ParsedElemT>
+bool HloParser::CheckParsedValueIsInRange(LocTy loc, ParsedElemT value) {
+  PrimitiveType literal_ty =
+      primitive_util::NativeToPrimitiveType<LiteralNativeT>();
+  if (std::isnan(value) ||
+      (std::numeric_limits<ParsedElemT>::has_infinity &&
+       (std::numeric_limits<ParsedElemT>::infinity() == value ||
+        -std::numeric_limits<ParsedElemT>::infinity() == value))) {
+    // Skip range checking for non-finite value.
+  } else if (std::is_unsigned<LiteralNativeT>::value) {
+    CHECK((std::is_same<ParsedElemT, int64>::value ||
+           std::is_same<ParsedElemT, bool>::value))
+        << "Unimplemented checking for ParsedElemT";
+
+    ParsedElemT upper_bound;
+    if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) {
+      upper_bound = std::numeric_limits<ParsedElemT>::max();
+    } else {
+      upper_bound =
+          static_cast<ParsedElemT>(std::numeric_limits<LiteralNativeT>::max());
+    }
+    if (value > upper_bound || value < 0) {
+      // Value is out of range for LiteralNativeT.
+      return Error(loc, StrCat("value ", value,
+                               " is out of range for literal's primitive type ",
+                               PrimitiveType_Name(literal_ty), " namely [0, ",
+                               upper_bound, "]."));
+    }
+  } else if (value > MinMaxFiniteValue<LiteralNativeT>::max() ||
+             value < MinMaxFiniteValue<LiteralNativeT>::min()) {
+    // Value is out of range for LiteralNativeT.
+    return Error(loc, StrCat("value ", value,
+                             " is out of range for literal's primitive type ",
+                             PrimitiveType_Name(literal_ty), " namely [",
+                             MinMaxFiniteValue<LiteralNativeT>::min(), ", ",
+                             MinMaxFiniteValue<LiteralNativeT>::max(), "]."));
+  }
+  return true;
+}
+
+template <typename LiteralNativeT>
+bool HloParser::CheckParsedValueIsInRange(LocTy loc,
+                                          std::complex<double> value) {
+  // e.g. `float` for std::complex<float>
+  using LiteralComplexComponentT =
+      decltype(std::real(std::declval<LiteralNativeT>()));
+
+  // We could do simply
+  //
+  //   return CheckParsedValueIsInRange<LiteralNativeT>(std::real(value)) &&
+  //          CheckParsedValueIsInRange<LiteralNativeT>(std::imag(value));
+  //
+  // but this would give bad error messages on failure.
+
+  auto check_component = [&](absl::string_view name, double v) {
+    if (std::isnan(v) || v == std::numeric_limits<double>::infinity() ||
+        v == -std::numeric_limits<double>::infinity()) {
+      // Skip range-checking for non-finite values.
+      return true;
+    }
+
+    double min = MinMaxFiniteValue<LiteralComplexComponentT>::min();
+    double max = MinMaxFiniteValue<LiteralComplexComponentT>::max();
+    if (v < min || v > max) {
+      // Value is out of range for LitearlComplexComponentT.
+      return Error(
+          loc,
+          StrCat(name, " part ", v,
+                 " is out of range for literal's primitive type ",
+                 PrimitiveType_Name(
+                     primitive_util::NativeToPrimitiveType<LiteralNativeT>()),
+                 ", namely [", min, ", ", max, "]."));
+    }
+    return true;
+  };
+  return check_component("real", std::real(value)) &&
+         check_component("imaginary", std::imag(value));
+}
+
 // operands ::= '(' operands1 ')'
 // operands1
 //   ::= /*empty*/
@@ -2409,24 +2679,23 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kInt64: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        static_cast<optional<tensorflow::int64>*>(attr_out_ptr)
-            ->emplace(result);
+        static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
         return true;
       }
       case AttrTy::kInt32: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        if (result != static_cast<tensorflow::int32>(result)) {
+        if (result != static_cast<int32>(result)) {
           return Error(attr_loc, "value out of range for int32");
         }
-        static_cast<optional<tensorflow::int32>*>(attr_out_ptr)
-            ->emplace(static_cast<tensorflow::int32>(result));
+        static_cast<optional<int32>*>(attr_out_ptr)
+            ->emplace(static_cast<int32>(result));
         return true;
       }
       case AttrTy::kFloat: {
@@ -2444,20 +2713,21 @@ bool HloParser::ParseAttributeHelper(
       }
       case AttrTy::kHloComputation: {
         HloComputation* result = nullptr;
-        if (lexer_.GetKind() == TokKind::kLbrace) {
-          // This means it is a nested computation.
-          if (!ParseInstructionList(&result, /*computation_name=*/"_")) {
-            return false;
-          }
-        } else {
-          // This means it is a computation name.
-          if (!ParseComputationName(&result)) {
-            return false;
-          }
+        if (!ParseHloComputation(&result)) {
+          return false;
         }
         static_cast<optional<HloComputation*>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kBracedHloComputationList: {
+        std::vector<HloComputation*> result;
+        if (!ParseHloComputationList(&result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<HloComputation*>>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
       case AttrTy::kFftType: {
         FftType result;
         if (!ParseFftType(&result)) {
@@ -2466,6 +2736,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<FftType>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kComparisonDirection: {
+        ComparisonDirection result;
+        if (!ParseComparisonDirection(&result)) {
+          return false;
+        }
+        static_cast<optional<ComparisonDirection>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
       case AttrTy::kWindow: {
         Window result;
         if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) {
@@ -2491,6 +2770,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<OpSharding>*>(attr_out_ptr)->emplace(sharding);
         return true;
       }
+      case AttrTy::kParameterReplication: {
+        ParameterReplication parameter_replication;
+        if (!ParseParameterReplication(&parameter_replication)) {
+          return false;
+        }
+        static_cast<optional<ParameterReplication>*>(attr_out_ptr)
+            ->emplace(parameter_replication);
+        return true;
+      }
       case AttrTy::kInstructionList: {
         std::vector<HloInstruction*> result;
         if (!ParseInstructionNames(&result)) {
@@ -2510,19 +2798,19 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kBracedInt64List: {
-        std::vector<tensorflow::int64> result;
+        std::vector<int64> result;
         if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
                             &result)) {
           return false;
         }
-        static_cast<optional<std::vector<tensorflow::int64>>*>(attr_out_ptr)
+        static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
       case AttrTy::kBracedInt64ListList: {
-        std::vector<std::vector<tensorflow::int64>> result;
+        std::vector<std::vector<int64>> result;
         auto parse_and_add_item = [&]() {
-          std::vector<tensorflow::int64> item;
+          std::vector<int64> item;
           if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace,
                               TokKind::kComma, &item)) {
             return false;
@@ -2534,8 +2822,7 @@ bool HloParser::ParseAttributeHelper(
                        parse_and_add_item)) {
           return false;
         }
-        static_cast<optional<std::vector<std::vector<tensorflow::int64>>>*>(
-            attr_out_ptr)
+        static_cast<optional<std::vector<std::vector<int64>>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
@@ -2610,6 +2897,95 @@ bool HloParser::ParseAttributeHelper(
   return true;
 }
 
+// attributes ::= (',' attribute)*
+bool HloParser::ParseAttributesAsProtoMessage(
+    const std::unordered_set<string>& required_attrs,
+    tensorflow::protobuf::Message* message) {
+  LocTy loc = lexer_.GetLoc();
+  std::unordered_set<string> seen_attrs;
+  while (EatIfPresent(TokKind::kComma)) {
+    if (!ParseAttributeAsProtoMessageHelper(message, &seen_attrs)) {
+      return false;
+    }
+  }
+  // Check that all required attrs were seen.
+  for (const string& attr : required_attrs) {
+    if (seen_attrs.find(attr) == seen_attrs.end()) {
+      return Error(loc,
+                   StrFormat("attribute %s is expected but not seen", attr));
+    }
+  }
+  return true;
+}
+
+bool HloParser::ParseAttributeAsProtoMessageHelper(
+    tensorflow::protobuf::Message* message,
+    std::unordered_set<string>* seen_attrs) {
+  LocTy loc = lexer_.GetLoc();
+  string name;
+  if (!ParseAttributeName(&name)) {
+    return Error(loc, "error parsing attributes");
+  }
+  VLOG(1) << "Parsing attribute " << name;
+  if (!seen_attrs->insert(name).second) {
+    return Error(loc, StrFormat("attribute %s already exists", name));
+  }
+  const tensorflow::protobuf::Descriptor* descriptor = message->GetDescriptor();
+  const tensorflow::protobuf::FieldDescriptor* fd =
+      descriptor->FindFieldByName(name);
+  if (!fd) {
+    string allowed_attrs = "Allowed attributes: ";
+
+    for (int i = 0; i < descriptor->field_count(); ++i) {
+      if (i == 0) {
+        absl::StrAppend(&allowed_attrs, descriptor->field(i)->name());
+      } else {
+        absl::StrAppend(&allowed_attrs, ", ", descriptor->field(i)->name());
+      }
+    }
+    return Error(loc, StrFormat("unexpected attribute \"%s\".  %s", name,
+                                allowed_attrs));
+  }
+  const tensorflow::protobuf::Reflection* reflection = message->GetReflection();
+  CHECK(!fd->is_repeated());  // Repeated fields not implemented.
+  bool success = [&] {
+    switch (fd->type()) {
+      case tensorflow::protobuf::FieldDescriptor::TYPE_BOOL: {
+        bool result;
+        if (!ParseBool(&result)) {
+          return false;
+        }
+        reflection->SetBool(message, fd, result);
+        return true;
+      }
+      case tensorflow::protobuf::FieldDescriptor::TYPE_ENUM: {
+        if (lexer_.GetKind() != TokKind::kIdent) {
+          return TokenError(
+              StrFormat("expects %s type", fd->enum_type()->name()));
+        }
+        string val = lexer_.GetStrVal();
+        const tensorflow::protobuf::EnumValueDescriptor* evd =
+            fd->enum_type()->FindValueByName(val);
+        if (evd == nullptr) {
+          return TokenError(StrFormat("expects %s type but sees: %s",
+                                      fd->enum_type()->name(), val));
+        }
+        reflection->SetEnum(message, fd, evd);
+        lexer_.Lex();
+        return true;
+      }
+      default:
+        LOG(ERROR) << "Unimplemented protocol buffer type "
+                   << fd->DebugString();
+        return false;
+    }
+  }();
+  if (!success) {
+    return Error(loc, StrFormat("error parsing attribute %s", name));
+  }
+  return true;
+}
+
 bool HloParser::ParseComputationName(HloComputation** value) {
   string name;
   LocTy loc = lexer_.GetLoc();
@@ -2736,7 +3112,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   absl::string_view rhs = split2[0];
   absl::string_view out = split2[1];
 
-  const tensorflow::int64 rank = lhs.length();
+  const int64 rank = lhs.length();
   if (rank != rhs.length() || rank != out.length()) {
     return TokenError(
         "convolution lhs, rhs, and output must have the same rank");
@@ -2847,7 +3223,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
     return false;
   }
-  std::vector<std::vector<tensorflow::int64>> ranges;
+  std::vector<std::vector<int64>> ranges;
   if (lexer_.GetKind() == TokKind::kRbrace) {
     // empty
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
@@ -2894,6 +3270,29 @@ bool HloParser::ParsePrecisionList(
                    parse_and_add_item);
 }
 
+bool HloParser::ParseHloComputation(HloComputation** result) {
+  if (lexer_.GetKind() == TokKind::kLbrace) {
+    // This means it is a nested computation.
+    return ParseInstructionList(result, /*computation_name=*/"_");
+  }
+  // This means it is a computation name.
+  return ParseComputationName(result);
+}
+
+bool HloParser::ParseHloComputationList(std::vector<HloComputation*>* result) {
+  auto parse_and_add_item = [&]() {
+    HloComputation* computation;
+    if (!ParseHloComputation(&computation)) {
+      return false;
+    }
+    LOG(INFO) << "parsed computation " << computation->name();
+    result->push_back(computation);
+    return true;
+  };
+  return ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                   parse_and_add_item);
+}
+
 // shapelist ::= '{' shapes '}'
 // precision_elements
 //   ::= /*empty*/
@@ -2917,9 +3316,9 @@ bool HloParser::ParseShapeList(std::vector<Shape>* result) {
 //   ::= int64_val (delim int64_val)*
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
-                               std::vector<tensorflow::int64>* result) {
+                               std::vector<int64>* result) {
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     if (!ParseInt64(&i)) {
       return false;
     }
@@ -2995,7 +3394,7 @@ bool HloParser::ParseParamList() {
 bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
                                     std::vector<bool>* dynamic_dimensions) {
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     bool is_dynamic = false;
     if (lexer_.GetKind() == TokKind::kLeq) {
       is_dynamic = true;
@@ -3012,22 +3411,108 @@ bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
                    parse_and_add_item);
 }
 
-// layout ::= '{' int64_list '}'
+// tiles
+//   ::= /*empty*/
+//   ::= 'T' '(' dim_list ')'
+// dim_list
+//   ::= /*empty*/
+//   ::= (int64 | '*') (',' (int64 | '*'))*
+bool HloParser::ParseTiles(std::vector<Tile>* tiles) {
+  auto parse_and_add_tile_dimension = [&]() {
+    tensorflow::int64 i;
+    if (ParseInt64(&i)) {
+      tiles->back().add_dimensions(i);
+      return true;
+    }
+    if (lexer_.GetKind() == TokKind::kAsterisk) {
+      tiles->back().add_dimensions(Tile::kCombineDimension);
+      lexer_.Lex();
+      return true;
+    }
+    return false;
+  };
+
+  do {
+    tiles->push_back(Tile());
+    if (!ParseList(TokKind::kLparen, TokKind::kRparen, TokKind::kComma,
+                   parse_and_add_tile_dimension)) {
+      return false;
+    }
+  } while (lexer_.GetKind() == TokKind::kLparen);
+  return true;
+}
+
+// layout ::= '{' int64_list (':' tiles element_size_in_bits)? '}'
+// element_size_in_bits
+//   ::= /*empty*/
+//   ::= 'E' '(' int64 ')'
 bool HloParser::ParseLayout(Layout* layout) {
   std::vector<int64> minor_to_major;
+  std::vector<Tile> tiles;
+  tensorflow::int64 element_size_in_bits = 0;
+
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     if (!ParseInt64(&i)) {
       return false;
     }
     minor_to_major.push_back(i);
     return true;
   };
-  if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
-                 parse_and_add_item)) {
+
+  if (!ParseToken(TokKind::kLbrace,
+                  StrCat("expects layout to start with ",
+                         TokKindToString(TokKind::kLbrace)))) {
     return false;
   }
-  *layout = LayoutUtil::MakeLayout(minor_to_major);
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    if (lexer_.GetKind() == TokKind::kInt) {
+      // Parse minor to major.
+      do {
+        if (!parse_and_add_item()) {
+          return false;
+        }
+      } while (EatIfPresent(TokKind::kComma));
+    }
+
+    if (lexer_.GetKind() == TokKind::kColon) {
+      lexer_.Lex();
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "T") {
+        lexer_.Lex();
+        ParseTiles(&tiles);
+      }
+
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "E") {
+        // Parse element size in bits.
+        lexer_.Lex();
+        if (!ParseToken(TokKind::kLparen,
+                        StrCat("expects element size in bits to start with ",
+                               TokKindToString(TokKind::kLparen)))) {
+          return false;
+        }
+        if (!ParseInt64(&element_size_in_bits)) {
+          return false;
+        }
+        if (!ParseToken(TokKind::kRparen,
+                        StrCat("expects element size in bits to end with ",
+                               TokKindToString(TokKind::kRparen)))) {
+          return false;
+        }
+      }
+    }
+  }
+  if (!ParseToken(TokKind::kRbrace,
+                  StrCat("expects layout to end with ",
+                         TokKindToString(TokKind::kRbrace)))) {
+    return false;
+  }
+
+  std::vector<Tile> vec_tiles(tiles.size());
+  for (int i = 0; i < tiles.size(); i++) {
+    vec_tiles[i] = Tile(tiles[i]);
+  }
+  *layout =
+      LayoutUtil::MakeLayout(minor_to_major, vec_tiles, element_size_in_bits);
   return true;
 }
 
@@ -3079,7 +3564,7 @@ bool HloParser::ParseShape(Shape* result) {
     lexer_.Lex();
     const string message =
         "expects a brace-bracketed integer for sparse layout";
-    tensorflow::int64 max_sparse_elements;
+    int64 max_sparse_elements;
     if (!ParseToken(TokKind::kLbrace, message) ||
         !ParseInt64(&max_sparse_elements) ||
         !ParseToken(TokKind::kRbrace, message)) {
@@ -3099,13 +3584,20 @@ bool HloParser::ParseShape(Shape* result) {
   //
   // The open brace could either be the start of a computation or the start of a
   // layout for the f32[123] shape. We consider it the start of a layout if the
-  // next token after the open brace is a integer
+  // next token after the open brace is an integer or a colon.
   if (lexer_.GetKind() == TokKind::kLbrace &&
-      lexer_.LookAhead() == TokKind::kInt) {
+      (lexer_.LookAhead() == TokKind::kInt ||
+       lexer_.LookAhead() == TokKind::kColon)) {
     Layout layout;
     if (!ParseLayout(&layout)) {
       return false;
     }
+    if (layout.minor_to_major_size() != result->rank()) {
+      return Error(
+          lexer_.GetLoc(),
+          StrFormat("Dimensions size is %ld, but minor to major size is %ld.",
+                    result->rank(), layout.minor_to_major_size()));
+    }
     *result->mutable_layout() = layout;
   }
   return true;
@@ -3148,15 +3640,14 @@ bool HloParser::ParseString(string* result) {
   return true;
 }
 
-bool HloParser::ParseDxD(const string& name,
-                         std::vector<tensorflow::int64>* result) {
+bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
     return Error(loc, StrFormat("sub-attribute '%s=' already exists", name));
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
-    tensorflow::int64 number;
+    int64 number;
     if (!ParseInt64(&number)) {
       return Error(loc, StrFormat("expects sub-attribute '%s=i'", name));
     }
@@ -3175,8 +3666,7 @@ bool HloParser::ParseDxD(const string& name,
   return TokenError("expects token type kInt or kDxD");
 }
 
-bool HloParser::ParseWindowPad(
-    std::vector<std::vector<tensorflow::int64>>* pad) {
+bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
   LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
     return Error(loc, "sub-attribute 'pad=' already exists");
@@ -3186,7 +3676,7 @@ bool HloParser::ParseWindowPad(
   }
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> low_high;
+    std::vector<int64> low_high;
     if (!SplitToInt64s(padding_dim_str, '_', &low_high) ||
         low_high.size() != 2) {
       return Error(loc,
@@ -3209,7 +3699,7 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> padding_dim;
+    std::vector<int64> padding_dim;
     if (!SplitToInt64s(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
       return Error(loc,
@@ -3231,7 +3721,7 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
   optional<string> op_type;
   optional<string> op_name;
   optional<string> source_file;
-  optional<tensorflow::int32> source_line;
+  optional<int32> source_line;
   attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type};
   attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name};
   attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file};
@@ -3283,6 +3773,22 @@ bool HloParser::ParseFftType(FftType* result) {
   return true;
 }
 
+bool HloParser::ParseComparisonDirection(ComparisonDirection* result) {
+  VLOG(1) << "ParseComparisonDirection";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects comparison direction");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToComparisonDirection(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        StrFormat("expects comparison direction but sees: %s", val));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
   VLOG(1) << "ParseFusionKind";
   if (lexer_.GetKind() != TokKind::kIdent) {
@@ -3334,7 +3840,7 @@ bool HloParser::ParsePrecision(PrecisionConfig::Precision* result) {
   return true;
 }
 
-bool HloParser::ParseInt64(tensorflow::int64* result) {
+bool HloParser::ParseInt64(int64* result) {
   VLOG(1) << "ParseInt64";
   if (lexer_.GetKind() != TokKind::kInt) {
     return TokenError("expects integer");
@@ -3346,9 +3852,18 @@ bool HloParser::ParseInt64(tensorflow::int64* result) {
 
 bool HloParser::ParseDouble(double* result) {
   switch (lexer_.GetKind()) {
-    case TokKind::kDecimal:
-      *result = lexer_.GetDecimalVal();
+    case TokKind::kDecimal: {
+      double val = lexer_.GetDecimalVal();
+      // If GetDecimalVal returns +/-inf, that means that we overflowed
+      // `double`.
+      if (std::isinf(val)) {
+        return TokenError(StrCat("Constant is out of range for double (+/-",
+                                 std::numeric_limits<double>::max(),
+                                 ") and so is unparsable."));
+      }
+      *result = val;
       break;
+    }
     case TokKind::kInt:
       *result = static_cast<double>(lexer_.GetInt64Val());
       break;
@@ -3368,6 +3883,42 @@ bool HloParser::ParseDouble(double* result) {
   return true;
 }
 
+bool HloParser::ParseComplex(std::complex<double>* result) {
+  if (lexer_.GetKind() != TokKind::kLparen) {
+    return TokenError("expects '(' before complex number");
+  }
+  lexer_.Lex();
+
+  double real;
+  LocTy loc = lexer_.GetLoc();
+  if (!ParseDouble(&real)) {
+    return Error(loc,
+                 "expect floating-point value for real part of complex number");
+  }
+
+  if (lexer_.GetKind() != TokKind::kComma) {
+    return TokenError(
+        absl::StrFormat("expect comma after real part of complex literal"));
+  }
+  lexer_.Lex();
+
+  double imag;
+  loc = lexer_.GetLoc();
+  if (!ParseDouble(&imag)) {
+    return Error(
+        loc,
+        "expect floating-point value for imaginary part of complex number");
+  }
+
+  if (lexer_.GetKind() != TokKind::kRparen) {
+    return TokenError(absl::StrFormat("expect ')' after complex number"));
+  }
+
+  *result = std::complex<double>(real, imag);
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseBool(bool* result) {
   if (lexer_.GetKind() != TokKind::kw_true &&
       lexer_.GetKind() != TokKind::kw_false) {
@@ -3441,6 +3992,21 @@ StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   return HloSharding::FromProto(op_sharding);
 }
 
+StatusOr<std::vector<bool>> HloParser::ParseParameterReplicationOnly() {
+  lexer_.Lex();
+  ParameterReplication parameter_replication;
+  if (!ParseParameterReplication(&parameter_replication)) {
+    return InvalidArgument("Syntax error:\n%s", GetError());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument(
+        "Syntax error:\nExtra content after parameter replication");
+  }
+  return std::vector<bool>(
+      parameter_replication.replicated_at_leaf_buffers().begin(),
+      parameter_replication.replicated_at_leaf_buffers().end());
+}
+
 StatusOr<Window> HloParser::ParseWindowOnly() {
   lexer_.Lex();
   Window window;
@@ -3556,6 +4122,11 @@ StatusOr<HloSharding> ParseSharding(absl::string_view str) {
   return parser.ParseShardingOnly();
 }
 
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str) {
+  HloParser parser(str);
+  return parser.ParseParameterReplicationOnly();
+}
+
 StatusOr<Window> ParseWindow(absl::string_view str) {
   HloParser parser(str);
   return parser.ParseWindowOnly();
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 450a54c54c156c2ae27475d145a8e83dc841b431..a96260b4d75e515a4cb23d315444142cae1b9587 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -44,11 +44,16 @@ Status ParseHloString(absl::string_view str, HloModule* module);
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
-// ParseHloString sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string,
-// e.g., "{replicated}".
+// Parses sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string, e.g.,
+// "{replicated}".
 StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
+// Parses parameter replication from str. str is supposed to contain the body of
+// the parameter replication, i.e. just the rhs of the
+// "parameter_replication={...}" attribute string, e.g., "{true, false}".
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str);
+
 // Parses the result of window_util::ToString(const Window&).
 StatusOr<Window> ParseWindow(absl::string_view str);
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 6ba16cc82ac1da2a30610d9dfb56cacc100ae05f..6f4171bca82b1c287cfe1fef16e95f96215ac702 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -63,6 +63,19 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// parameter replication
+{
+"ParamReplication",
+R"(HloModule param_replication_module
+
+ENTRY %param_replication (a: f32[], b: (f32[2,4], (f32[2,4]))) -> (f32[], (f32[2,4], (f32[2,4]))) {
+  %a = f32[] parameter(0), parameter_replication={true}
+  %b = (f32[2,4]{1,0}, (f32[2,4]{1,0})) parameter(1), parameter_replication={false,true}
+  ROOT %tuple = (f32[], (f32[2,4]{1,0}, (f32[2,4]{1,0}))) tuple(f32[] %a, (f32[2,4]{1,0}, (f32[2,4]{1,0})) %b)
+}
+
 )"
 },
 // pred constant
@@ -209,7 +222,7 @@ R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module
 ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] {
   %v1 = f32[4]{0} parameter(0), sharding={maximal device=1}
   %v2 = f32[4]{0} parameter(1), sharding={maximal device=1}
-  %greater-than = pred[4]{0} greater-than(f32[4]{0} %v1, f32[4]{0} %v2), sharding={replicated}
+  %greater-than = pred[4]{0} compare(f32[4]{0} %v1, f32[4]{0} %v2), direction=GT, sharding={replicated}
   ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={}
 }
 
@@ -279,7 +292,7 @@ R"(HloModule WhileWithScalarS32Result_module
 %condition.v3 (prev.2: s32[]) -> pred[] {
   %constant.1 = s32[] constant(5)
   %prev.2 = s32[] parameter(0)
-  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %prev.2)
+  ROOT %greater-than = pred[] compare(s32[] %constant.1, s32[] %prev.2), direction=GT
 }
 
 ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
@@ -461,7 +474,7 @@ R"(HloModule R4F32OverlapSmall_module
 %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
   %rhs = f32[] parameter(1)
-  ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
+  ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE
 }
 
 %add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
@@ -487,7 +500,7 @@ R"(HloModule select_and_scatter_scalar
 %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
   %rhs = f32[] parameter(1)
-  ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
+  ROOT %greater-than-or-equal-to = pred[] compare(f32[] %lhs, f32[] %rhs), direction=GE
 }
 
 %add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
@@ -562,6 +575,19 @@ ENTRY %Transpose.v3 (input: c128[1,2,3]) -> c128[1,2,3] {
   ROOT %transpose = c128[1,2,3]{2,1,0} transpose(c128[1,2,3]{2,1,0} %input), dimensions={0,1,2}
 }
 
+)"
+},
+// Triangular solve
+{
+"TriangularSolve",
+R"(HloModule TriangularSolve_module
+
+ENTRY %SimpleRightLowerNotranspose.4 (a.1: f32[4,4], b.2: f32[3,4]) -> f32[3,4] {
+  %a.1 = f32[4,4]{1,0} parameter(0)
+  %b.2 = f32[3,4]{1,0} parameter(1)
+  ROOT %triangular-solve.3 = f32[3,4]{1,0} triangular-solve(f32[4,4]{1,0} %a.1, f32[3,4]{1,0} %b.2), lower=true, transpose_a=NO_TRANSPOSE
+}
+
 )"
 },
 // Dynamic slice
@@ -782,7 +808,17 @@ ENTRY %fusion.v3 () -> f32[3,2,1,1] {
 R"(HloModule sparse_f32
 
 ENTRY %sparse () -> f32[2,3,4] {
-  ROOT %foo = f32[2,3,4]sparse{10} constant({[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3})
+  ROOT %foo = f32[2,3,4]sparse{10} constant({[0, 1, 2]: 1, [1, 2, 2]: 2, [1, 2, 3]: 3})
+}
+
+)"
+},
+{
+"SparseC128",
+R"(HloModule sparse_c128
+
+ENTRY %sparse () -> c128[2,3,4] {
+  ROOT %foo = c128[2,3,4]sparse{10} constant({[0, 1, 2]: (1, 0), [1, 2, 2]: (2, 5), [1, 2, 3]: (3, 10)})
 }
 
 )"
@@ -894,6 +930,58 @@ ENTRY %CustomCallWithLayoutConstraints (p0: (f32[2,2], f32[42,2,3]), p1: f32[123
   ROOT %custom-call = (f32[1,2,3]{0,2,1}, f32[1,2,3]{1,2,0}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={(f32[2,2]{1,0}, f32[42,2,3]{2,0,1}), f32[123,4]{1,0}}
 }
 
+)"
+},
+// Parse c64 literal
+{
+"ParseC64Literal",
+R"(HloModule ParseC64Literal
+
+ENTRY %ParseC64Literal () -> c64[2] {
+  ROOT %c = c64[2]{0} constant({(1, 2), (-inf, nan)})
+}
+
+)"
+},
+// Parse c128 literal
+{
+"ParseC128Literal",
+R"(HloModule ParseC128Literal
+
+ENTRY %ParseC128Literal () -> c128[2] {
+  ROOT %c = c128[2]{0} constant({(1, 2), (-inf, nan)})
+}
+
+)"
+},
+// Indexed Conditional
+{
+"IndexedConditional",
+R"(HloModule indexed_conditional
+
+%Negate (x: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  ROOT %negate = f32[] negate(f32[] %x)
+}
+
+%Identity (y: f32[]) -> f32[] {
+  %y = f32[] parameter(0)
+  ROOT %copy = f32[] copy(f32[] %y)
+}
+
+%Floor (z: f32[]) -> f32[] {
+  %z = f32[] parameter(0)
+  ROOT %floor = f32[] floor(f32[] %z)
+}
+
+ENTRY %Parameters1.v4 () -> f32[] {
+  %constant = s32[] constant(1)
+  %constant.1 = f32[] constant(56)
+  %constant.2 = f32[] constant(12)
+  %constant.3 = f32[] constant(13)
+  ROOT %conditional = f32[] conditional(s32[] %constant, f32[] %constant.1, f32[] %constant.2, f32[] %constant.3), branch_computations={%Negate, %Identity, %Floor}
+}
+
 )"
 },
   });
@@ -949,7 +1037,7 @@ R"(HloModule TupleReduce
 max_argmax {
   value = f32[] parameter(2)
   prev_max = f32[] parameter(0)
-  is_next_larger = pred[] greater-than-or-equal-to(value, prev_max)
+  is_next_larger = pred[] compare(value, prev_max), direction=GE
   max = f32[] select(is_next_larger, value, prev_max)
   index = s32[] parameter(3)
   prev_argmax = s32[] parameter(1)
@@ -1015,9 +1103,15 @@ ENTRY ReducePrecision {
 "SortKey",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   x = f32[1024]{0} parameter(0)
-  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1027,10 +1121,18 @@ ENTRY Sort {
 "SortKeyValue",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1040,9 +1142,15 @@ ENTRY Sort {
 "SortKeyR2",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   x = f32[1024,16]{0,1} parameter(0)
-  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}
+  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1052,10 +1160,18 @@ ENTRY Sort {
 "SortKeyValueR2",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values = s32[1024,16]{0,1} parameter(1)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1065,20 +1181,80 @@ ENTRY Sort {
 "SortManyValues",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.2.lhs = u32[] parameter(4)
+  p.2.rhs = u32[] parameter(5)
+  p.3.lhs = f32[] parameter(6)
+  p.3.rhs = f32[] parameter(7)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values.0 = s32[1024,16]{0,1} parameter(1)
   values.1 = u32[1024,16]{0,1} parameter(2)
   values.2 = f32[1024,16]{0,1} parameter(3)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}, to_apply=compare
+}
+
+)"
+},
+// Sort (Key) is_stable=true
+{
+"SortKeyStable",
+R"(HloModule sort
+
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
+ENTRY Sort {
+  x = f32[1024]{0} parameter(0)
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, is_stable=true, to_apply=compare
 }
 
 )"
 },
-// Conditional
+// Indexed Conditional
 {
-"Conditional",
-R"(HloModule conditional
+"IndexedConditional",
+R"(HloModule indexed_conditional
+
+Negate {
+  x = f32[] parameter(0)
+  ROOT negate = f32[] negate(x)
+}
+
+Identity {
+  y = f32[] parameter(0)
+  ROOT copy = f32[] copy(y)
+}
+
+Floor {
+  z = f32[] parameter(0)
+  ROOT floor = f32[] floor(z)
+}
+
+ENTRY Parameters1.v4 {
+  constant = s32[] constant(1)
+  constant.1 = f32[] constant(56)
+  constant.2 = f32[] constant(12)
+  constant.3 = f32[] constant(13)
+  ROOT conditional = f32[] conditional(constant, constant.1, constant.2, constant.3), branch_computations={Negate, Identity, Floor}
+}
+
+)"
+},
+// Predicated Conditional
+{
+"PredicatedConditional",
+R"(HloModule pred_conditional
 
 Negate {
   x = f32[] parameter(0)
@@ -1248,6 +1424,17 @@ ENTRY CollectivePermute {
   ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}}
 }
 
+)"
+},
+// replica-id
+{
+"ReplicaId",
+R"(HloModule replica-id
+
+ENTRY Replica-id {
+  ROOT replica-id = u32[] replica-id()
+}
+
 )"
 },
 // Iota
@@ -1277,10 +1464,18 @@ ENTRY Computation {
 "ScheduledModule",
 R"(HloModule scheduled_module, is_scheduled=true
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lhs = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1314,6 +1509,30 @@ ENTRY AddDependency {
   ROOT sum = f32[] add(neg, exp)
 }
 
+)"
+},
+
+// A module containing constants equal to the min/max values of various data
+// types.
+{
+"MinMaxValues",
+R"(HloModule MinMaxValues
+
+ENTRY MinMaxValues {
+  x.s8 = s8[2]{0} constant({-128, 127})
+  x.s16 = s16[2]{0} constant({-32768, 32767})
+  x.s32 = s32[2]{0} constant({-2147483648, 2147483647})
+  x.u8 = u8[2]{0} constant({0, 255})
+  x.u16 = u16[2]{0} constant({0, 65535})
+  x.u32 = u32[2]{0} constant({0, 4294967295})
+  x.f16 = f16[2]{0} constant({-65504, 65504})
+  x.bf16 = bf16[2]{0} constant({-3.38953e+38, 3.38953e+38})
+  x.f32 = f32[2]{0} constant({-3.40282e+38, 3.40282e+38})
+  x.f64 = f64[2]{0} constant({-1.79769e+308, 1.79769e+308})
+  x.c64 = c64[2]{0} constant({(-3.40282e+38, 3.40282e+38), (3.40282e+38, -3.40282e+38)})
+  ROOT c.c128 = c128[2]{0} constant({(-1.79769e+308, 1.79769e+308), (1.79769e+308, -1.79769e+308)})
+}
+
 )"
 },
 });
@@ -1340,7 +1559,7 @@ class HloParameterizedParserTest
  protected:
   // Expects "ToString(ParseHloString(string)) == string", that is, parses the
   // string, asserts that it succeeded, stringifies the parsed module, and
-  // checks that the it equals the original string.
+  // checks that it equals the original string.
   void ExpectEqual() {
     const string& original = GetParam().module_string;
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -1437,7 +1656,7 @@ TEST_F(HloParserTest, WrongOperandsSize) {
 
 ENTRY %blabla (x: f32[]) -> pred[] {
   %x = f32[]{} parameter(0)
-  %eq = pred[]{} equal-to(f32[]{} %x)
+  %eq = pred[]{} compare(f32[]{} %x), direction=EQ
 }
 
 )";
@@ -1449,7 +1668,7 @@ TEST_F(HloParserTest, OperandNotFound) {
   const string original = R"(HloModule operand_not_found:
 ENTRY %blabla (x: f32[]) -> pred[] {
   %x = f32[]{} parameter(0)
-  %eq = pred[]{} equal-to(f32[]{} %x, f32[]{} %y)
+  %eq = pred[]{} compare(f32[]{} %x, f32[]{} %y), direction=EQ
 }
 )";
   auto result = ParseHloString(original);
@@ -1543,6 +1762,37 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
                   "is out of range for literal's primitive type F16");
 }
 
+TEST_F(HloParserTest, ConstantBf16NoOverflow) {
+  // 65505 is in range for bf16.
+  const string original = R"(
+  HloModule test_module
+  ENTRY test {
+    ROOT c = bf16[] constant(-65505)
+  })";
+  EXPECT_EQ(Status::OK(), ParseHloString(original).status());
+}
+
+TEST_F(HloParserTest, ConstantBf16Overflow) {
+  // 1e100 is out of range for bf16.
+  const string original = R"(
+  HloModule test_module
+  ENTRY test {
+    ROOT c = bf16[] constant(1e100)
+  })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "out of range");
+}
+
+TEST_F(HloParserTest, ConstantF16OverflowInSparseArray) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[0]: 0, [1]: -65505})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "is out of range for literal's primitive type F16");
+}
+
 TEST_F(HloParserTest, ConstantUnsignedUnderflow) {
   const string original = R"(
       HloModule ConstantUnsignedUnderflow_module
@@ -1577,6 +1827,46 @@ TEST_F(HloParserTest, ConstantUnsignedInt64Overflow) {
   EXPECT_NE(Status::OK(), result.status());
 }
 
+TEST_F(HloParserTest, ConstantC64Overflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test () -> c64[] {
+        ROOT c = c64[] constant((1e100, 0))
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantC64Underflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test () -> c64[] {
+        ROOT c = c64[] constant((0, -1e100))
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantF64Overflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test {
+        ROOT c = f64[] constant(1.8e308)
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantF64Underflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test {
+        ROOT c = f64[] constant(-1.8e308)
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
 TEST_F(HloParserTest, ConstantWithExp) {
   const string original = R"(HloModule ConstantWithExp_module
 
@@ -1592,6 +1882,19 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
   // printed as "300".
 }
 
+TEST_F(HloParserTest, ShortConstant) {
+  const string original = R"(HloModule ShortCOnstant_module
+
+ENTRY %ShortConstant.v4 () -> f32[67,89] {
+  ROOT %constant.1 = f32[67,89]{1,0} constant({...})
+}
+
+)";
+  auto result = ParseHloString(original);
+  TF_EXPECT_OK(result.status());
+  EXPECT_EQ(result.ValueOrDie()->ToString(HloPrintOptions()), original);
+}
+
 TEST_F(HloParserTest, AttibutesAnyOrder) {
   const string original = R"(HloModule any_order_module
 
@@ -2074,6 +2377,31 @@ TEST(HloParserSingleOpTest, CanonicalOpWithNested) {
       text);
 }
 
+TEST(HloParserSingleOpTest, CanonicalOpIndexedConditionalInlinedBranches) {
+  const string text =
+      R"(f32[5,10]{1,0} conditional(s32[], f32[5,10]{1,0}, f32[5,10]{1,0}, f32[5,10]{1,0}), branch_computations={
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  ROOT tmp_1 = f32[5,10]{1,0} ceil(f32[5,10]{1,0} tmp_0)
+},
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  ROOT tmp_1 = f32[5,10]{1,0} floor(f32[5,10]{1,0} tmp_0)
+},
+{
+  tmp_0 = f32[5,10]{1,0} parameter(0)
+  ROOT tmp_1 = f32[5,10]{1,0} copy(f32[5,10]{1,0} tmp_0)
+}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_EQ(
+      computation->root_instruction()->ToString(HloPrintOptions::Canonical()),
+      text);
+}
+
 TEST(HloParserSingleOpTest, SingleOpWithNested) {
   const string text =
       R"(%fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %p0, f32[2]{0} %p1), kind=kLoop, calls=
@@ -2291,6 +2619,46 @@ ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
                   " with the shape of the operand instruction f32[2,2]{1,0}.");
 }
 
+TEST_F(HloParserTest, OutOfRangeSparseIndex) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[100]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, NegativeSparseIndex) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({-1: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, SparseIndexWithRankTooLarge) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[0, 0]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, SparseIndexWithRankTooSmall) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5, 5]sparse{10} constant({[0]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
 TEST_F(HloParserTest, ParseShapeStringR2F32) {
   string shape_string = "f32[123,456]";
   TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
@@ -2335,6 +2703,60 @@ TEST_F(HloParserTest, ParseShapeStringWithLayout) {
       << "actual:   " << ShapeUtil::HumanString(actual);
 }
 
+TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
+  // One tile.
+  string shape_string = "f32[123,456]{0,1:T(2,128)}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1}, {Tile({2, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with negative dimension size for combining dimensions.
+  shape_string = "f32[123,456,789]{0,1,2:T(2, * , 128)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456, 789}, {0, 1, 2},
+                                     {Tile({2, Tile::kCombineDimension, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Two tiles.
+  shape_string = "bf16[123,456,789]{2,1,0:T(2,*,128)(2,1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(
+      BF16, {123, 456, 789}, {2, 1, 0},
+      {Tile({2, Tile::kCombineDimension, 128}), Tile({2, 1})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with element size in bits.
+  shape_string = "pred[123,456]{1,0:T(2,128)E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0},
+                                            {Tile({2, 128})}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Element size in bits without tile.
+  shape_string = "pred[123,456]{1,0:E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0}, {}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Wrong minor_to_major.
+  shape_string = "f32[123,456,789]{1:T(2, * , 128)}";
+  auto result = ParseShape(shape_string);
+  ExpectHasSubstr(result.status().error_message(),
+                  "Dimensions size is 3, but minor to major size is 1.");
+}
+
 TEST_F(HloParserTest, ParseShapeStringWithSparseLayout) {
   string shape_string = "f32[123,456]sparse{10}";
   TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
@@ -2391,5 +2813,24 @@ TEST_F(HloParserTest, ParseDynamicTuple) {
       << "actual:   " << ShapeUtil::HumanString(actual);
 }
 
+TEST_F(HloParserTest, NegativeParameterNumber) {
+  const string hlo_string = "par0 = f32[3,5] parameter(-1)";
+  auto result = ParseHloString(hlo_string);
+  ASSERT_FALSE(result.status().ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("parameter number must be >= 0"));
+}
+
+TEST_F(HloParserTest, WrongNumberOfParameterLeafBuffersInReplication) {
+  const string hlo_string =
+      "par0 = (f32[3,5], f32[]) parameter(0), "
+      "parameter_replication={true,false,true}";
+  auto result = ParseHloString(hlo_string);
+  ASSERT_FALSE(result.status().ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("parameter has 2 leaf buffers, but "
+                                   "parameter_replication has 3 elements"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h
index 791b1a97b0b82edf19ff1588fd8d5d996ac0fef4..35dc9c0029f9871334cb500c6b71f0c86ab136d7 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_fix.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,9 +40,36 @@ class HloPassFix : public Pass {
     int64 iteration_count = 0;
     int64 limit =
         std::max(static_cast<int64>(1000), module->instruction_count());
+    VLOG(3) << "Running HloPassFix.";
     while (changed_this_iteration) {
       TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module));
       changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
+      ++iteration_count;
+      if (iteration_count == limit) {
+        LOG(ERROR)
+            << "Unexpectedly high number of iterations in HLO passes ("
+            << iteration_count
+            << ")\nIf compilation hangs here, please file a bug with XLA.";
+      }
+    }
+    return changed;
+  }
+
+  StatusOr<bool> RunOnModuleGroup(HloModuleGroup* module_group) override {
+    bool changed = false;
+    bool changed_this_iteration = true;
+    int64 iteration_count = 0;
+    int64 limit = 1000;
+    for (const HloModule* module : module_group->modules()) {
+      limit = std::max<int64>(limit, module->instruction_count());
+    }
+    VLOG(3) << "Running HloPassFix.";
+    while (changed_this_iteration) {
+      TF_ASSIGN_OR_RETURN(changed_this_iteration,
+                          Pass::RunOnModuleGroup(module_group));
+      changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
       ++iteration_count;
       if (iteration_count == limit) {
         LOG(ERROR)
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index ae8c08cf1d16ad6738962f3be7c1b5512110b1d1..0ca04cf8c58fd835449df035f9c486825b842942 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -99,30 +100,8 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
 void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
                                    absl::string_view after_pass_name,
                                    absl::string_view before_pass_name) {
-  const string& proto_dump_path =
-      module.config().debug_options().xla_dump_per_pass_hlo_proto_to();
-  if (!proto_dump_path.empty()) {
-    static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-    static auto* const module_id_to_pass_number =
-        new absl::flat_hash_map<int64, int64>();
-
-    tensorflow::mutex_lock lock(mu);
-    const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
-
-    const string filename = SanitizeFileName(
-        absl::StrFormat("module_%04d.%04d.%s.after_%s", module.unique_id(),
-                        pass_number, name(), after_pass_name));
-
-    TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(
-        MakeHloProto(module), proto_dump_path, filename));
-  }
-
-  const string message =
-      absl::StrCat("after ", after_pass_name, ", before ", before_pass_name);
-  hlo_graph_dumper::MaybeDumpHloModule(module, message);
-  VLOG(3) << "HLO " << message << ":";
-  VLOG(3) << module.entry_computation_layout().ToString();
-  XLA_VLOG_LINES(3, module.ToString());
+  DumpHloModuleBetweenPassesIfEnabled(name(), before_pass_name, after_pass_name,
+                                      module);
 }
 
 void HloPassPipeline::MaybeDumpHlo(const HloModuleGroup& module_group,
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index 0fced7f15bdaf1dbe349e3b0fc6ada68393c6512..af07eb83a5c48d3380bf527ff3292e80bb441698 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -54,7 +54,9 @@ void HloReachabilityMap::SetReachabilityToUnionHelper(
   }
   bit_vector->Set(GetIndex(instruction));
   for (const HloInstruction* input : inputs) {
-    bit_vector->OrWith(GetBitVector(input));
+    if (input != instruction) {
+      bit_vector->OrWith(GetBitVector(input));
+    }
   }
 }
 
@@ -77,28 +79,51 @@ std::unique_ptr<HloReachabilityMap> HloReachabilityMap::Build(
     const HloComputation* computation) {
   const auto& all = computation->MakeInstructionPostOrder();
   auto result = absl::make_unique<HloReachabilityMap>(all);
-  auto channel_dependency_map = computation->ComputeChannelDependencies();
+  auto channel_group = computation->ComputeChannelDependencies();
 
-  std::vector<HloInstruction*> inputs;
   for (const HloInstruction* hlo : all) {
-    inputs.assign(hlo->operands().begin(), hlo->operands().end());
-    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
-                  hlo->control_predecessors().end());
+    std::vector<HloInstruction*> inputs;
+    const auto add_input = [&channel_group, &inputs](HloInstruction* input) {
+      inputs.push_back(input);
+      if (input->opcode() == HloOpcode::kAllReduce && input->all_reduce_id()) {
+        auto it = channel_group.find(*input->all_reduce_id());
+        if (it != channel_group.end()) {
+          inputs.insert(inputs.end(), it->second.begin(), it->second.end());
+        }
+      }
+    };
+
+    const auto add_dependencies = [&add_input](const HloInstruction* hlo) {
+      for (HloInstruction* operand : hlo->operands()) {
+        add_input(operand);
+      }
+      for (HloInstruction* predecessor : hlo->control_predecessors()) {
+        add_input(predecessor);
+      }
+    };
+
+    add_dependencies(hlo);
 
     switch (hlo->opcode()) {
       case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(hlo->channel_id());
-        if (it != channel_dependency_map.end()) {
-          absl::c_copy(it->second, std::back_inserter(inputs));
+        auto it = channel_group.find(hlo->channel_id());
+        if (it != channel_group.end()) {
+          for (HloInstruction* channel : it->second) {
+            if (channel->opcode() == HloOpcode::kSend) {
+              add_input(channel);
+            }
+          }
         }
         break;
       }
       case HloOpcode::kAllReduce: {
         auto all_reduce_id = hlo->all_reduce_id();
         if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            absl::c_copy(it->second, std::back_inserter(inputs));
+          auto it = channel_group.find(all_reduce_id.value());
+          if (it != channel_group.end()) {
+            for (HloInstruction* all_reduce : it->second) {
+              add_dependencies(all_reduce);
+            }
           }
         }
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 5c1793933ca2f927d3172de7135a0e583f70d8f9..a175e4643de2ac6ce07ac00da914d7ab7acca541 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -188,7 +188,8 @@ class InstructionList {
   Item* CreateItem(HloInstruction* inst) {
     Item* item = new Item;
     item->instruction = inst;
-    CHECK(item_map_.insert({inst, item}).second) << "inserting inst twice";
+    CHECK(item_map_.insert({inst, item}).second)
+        << "inserting inst twice " << inst->name();
     return item;
   }
 
@@ -426,11 +427,12 @@ class MemoryUsageTracker {
   // the given uses.
   Buffer& RematerializeBuffer(const Buffer& original_buffer, Item* remat_item,
                               ItemList&& rematerialized_uses) {
-    CHECK(original_buffer.defining_instruction->placed);
-    CHECK(!original_buffer.has_indirect_uses);
-    CHECK(!original_buffer.live_out);
+    CHECK(original_buffer.defining_instruction->placed)
+        << original_buffer.defining_instruction->instruction->name();
+    CHECK(!original_buffer.has_indirect_uses) << original_buffer.ToString();
+    CHECK(!original_buffer.live_out) << original_buffer.ToString();
     for (Item* use : rematerialized_uses) {
-      CHECK(!use->placed);
+      CHECK(!use->placed) << use->instruction->name();
     }
     return NewBuffer(remat_item, original_buffer.size,
                      std::move(rematerialized_uses), /*live_out=*/false,
@@ -683,8 +685,8 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
           << ", remat_instruction = " << remat_item->instruction->name();
 
   TF_RET_CHECK(in_progress_item_ != nullptr);
-  TF_RET_CHECK(original_item->placed);
-  TF_RET_CHECK(!remat_item->placed);
+  TF_RET_CHECK(original_item->placed) << original_item->instruction->name();
+  TF_RET_CHECK(!remat_item->placed) << remat_item->instruction->name();
 
   // Construct the list of buffers used and defined by the rematerialization.
   remat_item->buffers_used = original_item->buffers_used;
@@ -713,7 +715,7 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
     ItemList unplaced_users;
     for (Item* user : old_buffer.users) {
       if (user->placed) {
-        CHECK(IsFinished(user));
+        CHECK(IsFinished(user)) << user->instruction->name();
         placed_users.push_back(user);
       } else {
         unplaced_users.push_back(user);
@@ -1098,7 +1100,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         Item* successor_item = instruction_list.GetItem(successor);
         // Assert to make sure we never remat an operation with control
         // successor already placed.
-        CHECK(!successor_item->placed);
+        CHECK(!successor_item->placed) << successor_item->instruction->name();
         place_before.push_back(successor_item);
       }
       instruction_list.InsertBeforeInstructions(remat_item, place_before);
@@ -1168,7 +1170,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // Verify some invariants on the memory tracker.
   CHECK_EQ(memory_tracker.memory_usage(), 0);
   for (auto* instruction : computation->instructions()) {
-    CHECK(memory_tracker.IsPlaced(instruction));
+    CHECK(memory_tracker.IsPlaced(instruction)) << instruction->name();
   }
 
   VLOG(1) << "In computation " << computation->name() << " rematerialized "
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index d7d66ae1c4592723ca991d5ee971fa72cc1af90a..837367745866386921368cc7d49105029aadaf98 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -168,6 +169,35 @@ StatusOr<Literal> HloRunner::Execute(std::unique_ptr<HloModule> module,
       /*profile=*/profile);
 }
 
+StatusOr<Literal> HloRunner::Execute(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const Literal* const> arguments,
+    ExecutionProfile* profile) {
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
+                      TransferLiteralsToDevice(arguments));
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      ExecuteWithDeviceBuffers(
+                          /*executable=*/executable.get(),
+                          /*arguments=*/argument_buffers,
+                          /*profile=*/profile));
+  return TransferLiteralFromDevice(result);
+}
+
+StatusOr<Literal> HloRunner::Execute(std::unique_ptr<Executable> executable,
+                                     const absl::Span<const Literal> arguments,
+                                     ExecutionProfile* profile) {
+  // Construct a vector of plain pointers for the arguments.
+  std::vector<const Literal*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return Execute(
+      /*module=*/std::move(executable),
+      /*arguments=*/argument_pointers,
+      /*profile=*/profile);
+}
+
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
     std::unique_ptr<HloModule> module,
     const absl::Span<const ShapedBuffer* const> arguments, bool run_hlo_passes,
@@ -206,7 +236,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutionProfile* profile) {
   // Get service run options.
@@ -225,7 +255,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ScopedShapedBuffer> arguments,
     ExecutionProfile* profile) {
   std::vector<const ShapedBuffer*> argument_pointers;
@@ -240,14 +270,11 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
-    std::unique_ptr<HloModule> module,
-    const ReplicatedExecuteOptions& options) {
+    std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
+    DeviceAssignment* device_assignment, bool use_threads) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
-  TF_ASSIGN_OR_RETURN(
-      DeviceAssignment device_assignment,
-      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
 
@@ -264,13 +291,13 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   std::vector<absl::Span<const ShapedBuffer* const>> argument_buffer_slices;
   int64 index = 0;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    int64 device = device_assignment(i, 0);
+    int64 device = (*device_assignment)(i, 0);
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                         backend().stream_executor(device));
     streams.push_back(absl::make_unique<se::Stream>(executor));
     streams.back()->Init();
     service_run_options.emplace_back(GetServiceRunOptionsForDevice(
-        device, streams.back().get(), &device_assignment));
+        device, streams.back().get(), device_assignment));
 
     // Copy arguments to device.
     for (const Literal* argument : options.arguments) {
@@ -300,7 +327,7 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   }
   if (options.infeed != nullptr) {
     for (int64 i = 0; i < options.num_replicas; ++i) {
-      int64 device = device_assignment(i, 0);
+      int64 device = (*device_assignment)(i, 0);
       pool->Schedule([this, device, &options]() {
         se::StreamExecutor* executor =
             backend().stream_executor(device).ValueOrDie();
@@ -318,7 +345,7 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   }
   if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
     for (int64 i = 0; i < options.num_replicas; ++i) {
-      int64 device = device_assignment(i, 0);
+      int64 device = (*device_assignment)(i, 0);
       pool->Schedule([this, device, &options]() {
         se::StreamExecutor* executor =
             backend().stream_executor(device).ValueOrDie();
@@ -340,9 +367,39 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   }
 
   LOG(INFO) << "Replicated execution started";
-  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> results,
-                      executable->ExecuteOnStreams(service_run_options,
-                                                   argument_buffer_slices));
+  std::vector<ScopedShapedBuffer> results;
+  if (!use_threads) {
+    TF_ASSIGN_OR_RETURN(results,
+                        executable->ExecuteOnStreams(service_run_options,
+                                                     argument_buffer_slices));
+  } else {
+    tensorflow::mutex mutex;
+    std::vector<StatusOr<ScopedShapedBuffer>> thread_results(
+        options.num_replicas);
+    {
+      LOG(INFO) << "Creating thread pool for " << options.num_replicas
+                << " replicas";
+      tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
+                                          "replicas", options.num_replicas);
+      for (int64 i = 0; i < options.num_replicas; ++i) {
+        pool.Schedule([&, i] {
+          auto result = executable->ExecuteOnStream(
+              &service_run_options[i], argument_buffer_slices[i], nullptr);
+          tensorflow::mutex_lock lock(mutex);
+          thread_results[i] = std::move(result);
+        });
+      }
+
+      // Note: the thread pool destructor guarantees it completes all work
+      // before we leave this scope.
+    }
+    for (auto& thread_result : thread_results) {
+      if (!thread_result.ok()) {
+        return thread_result.status();
+      }
+      results.push_back(std::move(thread_result).ValueOrDie());
+    }
+  }
   LOG(INFO) << "Replicated execution terminated";
 
   std::vector<Literal> exec_results;
@@ -356,6 +413,16 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   return std::move(exec_results);
 }
 
+StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
+    bool use_threads) {
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+  return ExecuteReplicated(std::move(module), options, &device_assignment,
+                           use_threads);
+}
+
 StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
     std::unique_ptr<HloModule> module, bool run_hlo_passes) {
   if (run_hlo_passes) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index bb792cf8c9825ff67ca33bbcf2c3c32b1a0ecb85..0c1ae3a0abbd1be8e92de79c88ca4073ca09f569 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -60,7 +60,7 @@ class HloRunner {
     // The number of times the infeed literal should be fed to the HLO module.
     // For a clean exit, this should match the iterations-per-loop parameter
     // used when generating the HLO module proto (that is usually the main
-    // while bounary counter). A value higher then iterations-per-loop would
+    // while boundary counter). A value higher then iterations-per-loop would
     // lead to infeed threads feeding to a gone computation, while a lower
     // value would trigger a stuck ExecuteReplicated() call (the computation
     // will be trying to infeed data which will never come).
@@ -124,6 +124,14 @@ class HloRunner {
                             bool run_hlo_passes = true,
                             ExecutionProfile* profile = nullptr);
 
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal* const> arguments,
+                            ExecutionProfile* profile = nullptr);
+
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal> arguments,
+                            ExecutionProfile* profile = nullptr);
+
   // As Execute(), but accepts and returns device buffers instead of host
   // buffers.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
@@ -136,13 +144,16 @@ class HloRunner {
       const absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  // In the following two calls, "executable" is not a unique_ptr to allow
+  // reuse of the Executable.  This call may update the profile information in
+  // *executable.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ShapedBuffer* const> arguments,
       ExecutionProfile* profile = nullptr);
 
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ScopedShapedBuffer> arguments,
       ExecutionProfile* profile = nullptr);
 
@@ -154,9 +165,19 @@ class HloRunner {
   // Executes a given HLO module into a set of replicas, and returns a map
   // with the replica number as key, and the corresponding returned literal as
   // value.
+  //
+  // use_threads indicates whether this replicated computation will be executed
+  // with a thread-per-replica, vs using an implicitly async call such as
+  // Executable::ExecuteOnStreams.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module,
+      const ReplicatedExecuteOptions& options, bool use_threads = false);
+
+  // Same as above, but with specified device assignment.
   StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
-      const ReplicatedExecuteOptions& options);
+      const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment, bool use_threads = false);
 
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
index 0e56e6f760e35ddcb45c6f58771d78405a09acfe..ecc8dbe6560fdfaa4b7801b9f482b3482f2eb083 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
@@ -228,7 +228,7 @@ HloModule UpdateScheduleWithMultipleComputations
   %param = (s32[], token[]) parameter(0)
   %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0
   %constant = s32[] constant(42)
-  ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant)
+  ROOT %less-than = pred[] compare(s32[] %get-tuple-element, s32[] %constant), direction=LT
 }
 
 ENTRY %WhileLoop () -> s32[] {
@@ -297,7 +297,7 @@ HloModule UpdateScheduleWithMultipleComputations
   %param = (s32[], token[]) parameter(0)
   %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0
   %constant = s32[] constant(42)
-  ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant)
+  ROOT %less-than = pred[] compare(s32[] %get-tuple-element, s32[] %constant), direction=LT
 }
 
 ENTRY %WhileLoop () -> s32[] {
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 37cc146bd7a6f2aef9373bd4afd8572ffac6473c..f1d7e60f2b5a68408f6d428a0ec47fba3c9c4f12 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -96,13 +96,13 @@ string HloSharding::ToString() const {
 
   if (replicated_) {
     return "{replicated}";
-  } else if (maximal_) {
+  }
+  if (maximal_) {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
-  } else {
-    return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","),
-                  "]", StrJoin(tile_assignment_, ","), "}");
   }
+  return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]",
+                StrJoin(tile_assignment_, ","), "}");
 }
 
 bool HloSharding::UsesDevice(int64 device) const {
@@ -328,8 +328,8 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
             status = tensorflow::errors::InvalidArgument(
                 StrCat("core ", core, " is not unique in tile assignment"));
           }
+          seen_cores.insert(core);
         }
-        seen_cores.insert(core);
       });
   if (!status.ok()) {
     return status;
@@ -347,7 +347,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
         ToString(), ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The correct constructor have to be used to create tile maximal shardings.
+  // The correct constructor has to be used to create tile maximal shardings.
   if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
         "Tile assignment only contains a single device. If a replicated "
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 5789ae09988d2a85247c5b8c037a172b3699f3b7..dd57ea83f1cb33aa052facb607bc040d2e708633 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -118,7 +118,7 @@ class HloSharding {
   // Returns true if the sharding defines an operation on the given device.
   bool UsesDevice(int64 device) const;
 
-  // Retrieves an histogram of the devices used by the sharding. The returned
+  // Retrieves a histogram of the devices used by the sharding. The returned
   // map has the device number as key, and the occurrence count as value.
   // If a sharding does not have a device, it will not be incuded in the
   // histogram. The count argument, if not nullptr, will receive the total
@@ -260,6 +260,19 @@ class HloSharding {
   bool replicated_;
   bool maximal_;
   bool tuple_;
+  // This field is only used if replicated_ is false. If maximal_ is true, then
+  // the field contains a rank 1 array with a single element, which is the
+  // device the HLO is assigned to. If maximal_ is false, the field contains an
+  // array with the same rank as the corresponding HLO. The dimension sizes of
+  // the array describe the number of ways the HLO is partitioned along each
+  // dimension. The values of the array specify which device each tile of
+  // the HLO is assigned to. The index of each value determines which tile it
+  // takes.
+  // For example, {{{2, 3}}, {{5, 7}}} (whose ToString representation is
+  // "{devices=[2,1,2]2,3,5,7}"), means that dimension 1 is split two way and
+  // dimension 3 is split 2 way. Core 5, whose index is [2,1,1] will take the
+  // tile that contains the 2nd half of dimension 1 and the 1st half of
+  // dimension 3.
   Array<int64> tile_assignment_;
   // Only non-empty when tuple_ is true. If a tuple is empty then one entry is
   // present for the root. This is a flattened list of all the leaf shardings in
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 80634677e78e4a35dcb9bf7de018a88122c3c030..9e234e025586ff14f99da73afc5610c627303a36 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -84,7 +84,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should fail because of more devices used then `num_device`.
+    // Test should fail because of more devices used than `num_device`.
     HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3}));
     EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}),
                                        /*num_devices=*/2));
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index c1073911ea9dc3811c195e27bcbae9b00929ad17..6c0a1926c414819933cdf4c142702bcf0b65f2f4 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -87,17 +86,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(2, module->computation_count());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
@@ -126,17 +115,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(2, module->computation_count());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
@@ -166,17 +145,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_FALSE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
 }
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
deleted file mode 100644
index c1f69db74eafb7743e85f499f2f4828ed0375501..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-LIcensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using absl::StrAppend;
-using absl::StrCat;
-using tensorflow::GraphDef;
-using tensorflow::NodeDef;
-using tensorflow::TensorShapeProto;
-
-string GetOpDefName(const HloInstruction* instruction) {
-  string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
-  tensorflow::str_util::TitlecaseString(&name, "-");  // non-absl ok
-  name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
-
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    string fusion_name = ToString(instruction->fusion_kind());
-    StrAppend(&name, absl::string_view(fusion_name).substr(1));
-  }
-  return name;
-}
-
-TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
-  TensorShapeProto tensor_shape;
-  const Shape& shape = instruction->shape();
-  for (auto dim : shape.dimensions()) {
-    tensor_shape.add_dim()->set_size(dim);
-  }
-  return tensor_shape;
-}
-
-string GetDeviceName(int device) { return StrCat("/device/XLA:", device); }
-
-void CleanNodeName(string* name) {
-  name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
-  const string chars_to_replace = "<>[]";
-  auto pred = [&](char c) {
-    return absl::c_linear_search(chars_to_replace, c);
-  };
-  std::replace_if(name->begin(), name->end(), pred, '_');
-}
-
-}  // namespace
-
-HloTfGraphBuilder::HloTfGraphBuilder(const DebugOptions& debug_options)
-    : debug_options_(debug_options) {}
-
-Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
-  VLOG(2) << "Adding computation " << computation.name();
-  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
-    for (auto* instruction : embedded->instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(instruction));
-    }
-  }
-  for (auto* instruction : computation.instructions()) {
-    TF_RETURN_IF_ERROR(AddInstruction(instruction));
-  }
-  return Status::OK();
-}
-
-const GraphDef& HloTfGraphBuilder::GetGraphDef() const { return graph_def_; }
-
-const string& HloTfGraphBuilder::GetNodeNameForInstruction(
-    const HloInstruction* instruction) {
-  if (ContainsKey(instruction_to_node_name_, instruction)) {
-    return instruction_to_node_name_[instruction];
-  }
-  auto append = [](string* str, const string& other) {
-    if (str->empty()) {
-      *str = other;
-    } else if (!other.empty()) {
-      StrAppend(str, "/", other);
-    }
-  };
-  string node_name;
-  if (debug_options_.xla_hlo_tfgraph_device_scopes()) {
-    auto device = instruction->sharding_unique_device();
-    if (device) {
-      node_name = StrCat("dev", *device);
-    }
-  }
-  // If an instruction is fused, put it in the subgraph of the fusion;
-  // otherwise, put it in the computation subgraph.
-  const HloComputation* computation = instruction->parent();
-  if (computation->IsFusionComputation()) {
-    append(&node_name,
-           GetNodeNameForInstruction(computation->FusionInstruction()));
-  } else {
-    append(&node_name, computation->name());
-    if (!instruction->metadata().op_name().empty()) {
-      // Always make computations contain TF ops but not the other way around.
-      append(&node_name, instruction->metadata().op_name());
-    }
-  }
-  string instruction_name = instruction->name();
-  if (instruction->opcode() == HloOpcode::kParameter) {
-    StrAppend(&instruction_name, ".", instruction->parameter_number());
-  }
-  append(&node_name, instruction_name);
-  CleanNodeName(&node_name);
-  auto ret =
-      instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
-  CHECK(ret.second);
-  return ret.first->second;
-}
-
-void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
-                                     NodeDef* node_def) const {
-  auto& attrs = *node_def->mutable_attr();
-
-  // Set the number of arguments for instructions that have variadic operands.
-  if (HloOpcodeIsVariadic(instruction->opcode())) {
-    tensorflow::AttrValue attr_value;
-    attr_value.set_i(instruction->operands().size());
-    attrs["arg_num"] = attr_value;
-  }
-
-  // Set the node type.
-  attrs["type"].set_s(
-      xla::PrimitiveType_Name(instruction->shape().element_type()));
-
-  // Set the framework op (e.g. Tensorflow op) that generated this XLA op.
-  attrs["tf_op_type"].set_s(instruction->metadata().op_type());
-  attrs["tf_op_name"].set_s(instruction->metadata().op_name());
-
-  // Set the shape of the output tensor. "_output_shapes" is a special attribute
-  // name used by Tensorboard for shapes of output tensors.
-  tensorflow::AttrValue shapes;
-  *shapes.mutable_list()->add_shape() = GetTensorShape(instruction);
-  attrs["_output_shapes"] = shapes;
-
-  // Set the layout.
-  if (LayoutUtil::HasLayout(instruction->shape())) {
-    string layout_string;
-    if (instruction->shape().IsTuple()) {
-      // For tuples, emit the full shape because the layout of a tuple is not
-      // represented in a single Layout field.
-      layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
-    } else {
-      layout_string = StrCat(
-          "{",
-          absl::StrJoin(LayoutUtil::MinorToMajor(instruction->shape()), ","),
-          "}");
-    }
-    attrs["layout"].set_s(layout_string);
-  }
-
-  // Set op-specific attributes.
-  switch (instruction->opcode()) {
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kBroadcast:
-    case HloOpcode::kReduce:
-    case HloOpcode::kReverse:
-    case HloOpcode::kTranspose:
-      for (auto dim : instruction->dimensions()) {
-        attrs["dims"].mutable_list()->add_i(dim);
-      }
-      break;
-    case HloOpcode::kGetTupleElement:
-      attrs["index"].set_i(instruction->tuple_index());
-      break;
-    case HloOpcode::kRng:
-      attrs["dist"].set_s(
-          RandomDistribution_Name(instruction->random_distribution()));
-      break;
-    case HloOpcode::kConstant:
-      if (ShapeUtil::IsScalar(instruction->shape())) {
-        attrs["value"].set_s(instruction->literal().GetAsString({}));
-      }
-      break;
-    case HloOpcode::kCustomCall:
-      attrs["custom_call_target"].set_s(instruction->custom_call_target());
-      break;
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
-      attrs["channel_id"].set_i(instruction->channel_id());
-      break;
-    default:
-      break;
-  }
-}
-
-Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) {
-  if (!visited_instructions_.insert(instruction).second) {
-    // Skip instructions that have already been added.
-    return Status::OK();
-  }
-
-  NodeDef* node_def = graph_def_.add_node();
-  node_def->set_name(GetNodeNameForInstruction(instruction));
-  node_def->set_op(GetOpDefName(instruction));
-
-  auto device = instruction->sharding_unique_device();
-  if (device) {
-    node_def->set_device(GetDeviceName(*device));
-  }
-  SetNodeAttrs(instruction, node_def);
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    for (auto* fused_instruction : instruction->fused_instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(fused_instruction));
-    }
-  }
-  // Add all edges including control edges.
-  for (unsigned i = 0; i < instruction->operands().size(); ++i) {
-    *node_def->add_input() = GetNodeNameForInstruction(instruction->operand(i));
-  }
-  // Called computations are control dependencies.
-  for (const auto* called_computation : instruction->called_computations()) {
-    *node_def->add_input() = StrCat(
-        "^", GetNodeNameForInstruction(called_computation->root_instruction()));
-  }
-  return Status::OK();
-}
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
deleted file mode 100644
index c4876b852e32d34693202f4023aa20ad2b301ffd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-
-// This constructs a tensorflow graph for HLO computations.
-class HloTfGraphBuilder {
- public:
-  HloTfGraphBuilder(const DebugOptions& debug_options = DebugOptions());
-
-  // Adds a computation to the graph.
-  Status AddComputation(const HloComputation& computation);
-
-  const tensorflow::GraphDef& GetGraphDef() const;
-
- private:
-  // Gets the node name of an instruction. The node name is hierarchical. For
-  // example, if an instruction is fused, it will be put in a subgraph of the
-  // fusion instruction.
-  const string& GetNodeNameForInstruction(const HloInstruction* instruction);
-
-  void SetNodeAttrs(const HloInstruction* instruction,
-                    tensorflow::NodeDef* node_def) const;
-
-  Status AddInstruction(const HloInstruction* instruction);
-
-  DebugOptions debug_options_;
-  tensorflow::GraphDef graph_def_;
-  // This records instructions that have been visited.
-  std::unordered_set<const HloInstruction*> visited_instructions_;
-  // A cache that maps instruction to the node name.
-  std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
-};
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
deleted file mode 100644
index 1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using ::tensorflow::GraphDef;
-
-class HloTfGraphBuilderTest : public HloTestBase {
- protected:
-  HloTfGraphBuilderTest() {}
-  HloTfGraphBuilder generator_;
-
-  // Create a computation which takes a scalar and returns its negation.
-  std::unique_ptr<HloComputation> CreateNegateComputation() {
-    auto builder = HloComputation::Builder("Negate");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-    return builder.Build();
-  }
-
-  // Creates a computation which calls map with the given computation.
-  std::unique_ptr<HloComputation> CreateMapComputation(
-      HloComputation *map_computation) {
-    auto builder = HloComputation::Builder("Map");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateMap(r0f32_, {param}, map_computation));
-    return builder.Build();
-  }
-  Shape r0f32_ = ShapeUtil::MakeShape(PrimitiveType::F32, {});
-};
-
-static const tensorflow::AttrValue &GetNodeAttr(const tensorflow::NodeDef &node,
-                                                const string &attr_name) {
-  auto attr = node.attr().find(attr_name);
-  CHECK(attr != node.attr().end());
-  return attr->second;
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
-  auto builder = HloComputation::Builder("Concatenate");
-  Shape shape = ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2});
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, shape, "param1"));
-  builder.AddInstruction(HloInstruction::CreateConcatenate(
-      ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), {param_1, param_2}, 1));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  const auto &node = graph_def.node(2);
-  EXPECT_EQ(node.name(), "Concatenate/concatenate");
-
-  // Check dimensions.
-  auto dims_value = GetNodeAttr(node, "dims");
-  EXPECT_EQ(dims_value.list().i_size(), 1);
-  EXPECT_EQ(dims_value.list().i(0), 1);
-
-  // Check shapes.
-  auto shape_value = GetNodeAttr(node, "_output_shapes");
-  EXPECT_EQ(shape_value.list().shape_size(), 1);
-  EXPECT_EQ(shape_value.list().shape(0).dim_size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(0).size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(1).size(), 4);
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
-  auto builder = HloComputation::Builder("Const");
-  HloInstruction *instruction = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(123)));
-  OpMetadata metadata;
-  metadata.set_op_name("x");
-  metadata.set_op_type("y");
-  instruction->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 1);
-  const auto &node = graph_def.node(0);
-  EXPECT_EQ(GetNodeAttr(node, "value").s(), "123");
-  EXPECT_EQ(GetNodeAttr(node, "type").s(), "S32");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_name").s(), "x");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_type").s(), "y");
-}
-
-TEST_F(HloTfGraphBuilderTest, SimpleNegateComputation) {
-  auto negate_computation = CreateNegateComputation();
-  TF_CHECK_OK(generator_.AddComputation(*negate_computation));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 2);
-  EXPECT_EQ(graph_def.node(0).name(), "Negate/param0.0");
-  EXPECT_EQ(graph_def.node(0).op(), "HloParameter");
-  EXPECT_EQ(graph_def.node(1).name(), "Negate/negate");
-  EXPECT_EQ(graph_def.node(1).op(), "HloNegate");
-  EXPECT_EQ(graph_def.node(1).input_size(), 1);
-  EXPECT_EQ(graph_def.node(1).input(0), "Negate/param0.0");
-}
-
-TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  auto ge = builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  OpMetadata metadata;
-  metadata.set_op_name("x/y");
-  metadata.set_op_type("Y");
-  ge->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
-  // Create computations with a diamond-shaped callgraph.
-  auto negate_computation = CreateNegateComputation();
-  auto map1_computation = CreateMapComputation(negate_computation.get());
-  auto map2_computation = CreateMapComputation(negate_computation.get());
-
-  auto builder = HloComputation::Builder(TestName());
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto map1 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map1_computation.get()));
-  auto map2 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map2_computation.get()));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, map1, map2));
-  auto computation = builder.Build();
-  TF_CHECK_OK(generator_.AddComputation(*computation));
-  EXPECT_GT(generator_.GetGraphDef().node_size(), 0);
-}
-
-}  // namespace
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 36340d3d78e059aae7ac8b341cf1b87384818ba9..375ae2c477d7a0aea8445d9c237991eee3353a04 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -50,6 +50,7 @@ bool IsCallerInstruction(HloInstruction* hlo) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return true;
     default:
@@ -57,15 +58,6 @@ bool IsCallerInstruction(HloInstruction* hlo) {
   }
 }
 
-Status ShapeVerifier::Preprocess(HloInstruction* hlo) {
-  if (!hlo->called_computations().empty() && !IsCallerInstruction(hlo)) {
-    return InternalError(
-        "Called computations specified for non-caller instruction  %s",
-        hlo->ToString());
-  }
-  return VerifyNotSparse(hlo->shape());
-}
-
 namespace {
 
 Status CheckOperandCount(const HloInstruction* hlo, int expected) {
@@ -90,6 +82,21 @@ Status CheckParameterCount(const HloInstruction* calling_instruction,
 
 }  // namespace
 
+Status ShapeVerifier::Preprocess(HloInstruction* hlo) {
+  if (!hlo->called_computations().empty() && !IsCallerInstruction(hlo)) {
+    return InternalError(
+        "Called computations specified for non-caller instruction  %s",
+        hlo->ToString());
+  }
+  TF_RETURN_IF_ERROR(VerifyNotSparse(hlo->shape()));
+
+  absl::optional<int> arity = HloOpcodeArity(hlo->opcode());
+  if (arity) {
+    TF_RETURN_IF_ERROR(CheckOperandCount(hlo, *arity));
+  }
+  return Status::OK();
+}
+
 Status ShapeVerifier::HandleElementwiseUnary(HloInstruction* hlo) {
   return CheckUnaryShape(hlo);
 }
@@ -121,14 +128,12 @@ Status ShapeVerifier::HandleConcatenate(HloInstruction* concatenate) {
 }
 
 Status ShapeVerifier::HandleConvert(HloInstruction* convert) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
 }
 
 Status ShapeVerifier::HandleBitcastConvert(HloInstruction* convert) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
@@ -139,7 +144,6 @@ Status ShapeVerifier::HandleCopy(HloInstruction* copy) {
 }
 
 Status ShapeVerifier::HandleDot(HloInstruction* dot) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dot, 2));
   TF_ASSIGN_OR_RETURN(const Shape expected,
                       ShapeInference::InferDotOpShape(
                           dot->operand(0)->shape(), dot->operand(1)->shape(),
@@ -148,7 +152,6 @@ Status ShapeVerifier::HandleDot(HloInstruction* dot) {
 }
 
 Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convolution, 2));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferConvolveShape(
@@ -159,7 +162,6 @@ Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
 }
 
 Status ShapeVerifier::HandleFft(HloInstruction* fft) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(fft, 1));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferFftShape(fft->operand(0)->shape(), fft->fft_type(),
@@ -167,6 +169,21 @@ Status ShapeVerifier::HandleFft(HloInstruction* fft) {
   return CheckShape(fft, expected);
 }
 
+Status ShapeVerifier::HandleTriangularSolve(HloInstruction* hlo) {
+  TF_ASSIGN_OR_RETURN(const Shape expected,
+                      ShapeInference::InferTriangularSolveShape(
+                          hlo->operand(0)->shape(), hlo->operand(1)->shape(),
+                          hlo->triangular_solve_options()));
+  return CheckShape(hlo, expected);
+}
+
+Status ShapeVerifier::HandleCholesky(HloInstruction* hlo) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 1));
+  TF_ASSIGN_OR_RETURN(const Shape expected, ShapeInference::InferCholeskyShape(
+                                                hlo->operand(0)->shape()));
+  return CheckShape(hlo, expected);
+}
+
 Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) {
   std::vector<const Shape*> operand_shapes;
   for (const HloInstruction* operand : crs->operands()) {
@@ -184,14 +201,16 @@ Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) {
                     ShapeInference::InferAllToAllTupleShape(operand_shapes));
 }
 
+Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) {
+  return CheckShape(hlo, ShapeUtil::MakeShape(U32, {}));
+}
+
 Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 1));
   return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape(
                              hlo->operand(0)->shape()));
 }
 
 Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_precision, 1));
   return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape(
                                           reduce_precision->operand(0)->shape(),
                                           reduce_precision->exponent_bits(),
@@ -225,7 +244,6 @@ Status ShapeVerifier::CheckOperandAndParameter(
 }
 
 Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   HloInfeedInstruction* infeed = Cast<HloInfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
 
@@ -236,7 +254,6 @@ Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   HloOutfeedInstruction* outfeed = Cast<HloOutfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
 
@@ -312,7 +329,6 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reverse, 1));
   return CheckShape(
       reverse, ShapeInference::InferReverseShape(reverse->operand(0)->shape(),
                                                  reverse->dimensions()));
@@ -323,13 +339,48 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
     return InternalError("Expected at least 1 operand for %s instruction: %s",
                          HloOpcodeString(sort->opcode()), sort->ToString());
   }
+  HloComputation* compare = sort->to_apply();
+
+  // Check that the 'compare' computation returns a PRED.
+  Shape compare_shape = compare->root_instruction()->shape();
+  if (!ShapesSame(compare_shape, ShapeUtil::MakeShape(PRED, {}))) {
+    return InternalError(
+        "The Sort compare computation shape does not lead to a scalar "
+        "predicate shape: %s",
+        StringifyShape(compare_shape));
+  }
+
+  // Check that the number of parameters of the 'compare' computation is
+  // correct.
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(sort, compare, sort->operand_count() * 2));
+
+  // Verify that the operands of the compare computation have the correct scalar
+  // shapes.
+  for (int64 parameter_idx = 0; parameter_idx < compare->num_parameters();
+       ++parameter_idx) {
+    int64 operand_idx = parameter_idx / 2;
+    Shape expected_scalar_shape = ShapeUtil::MakeShape(
+        sort->operand(operand_idx)->shape().element_type(), {});
+    Shape actual_parameter_shape =
+        compare->parameter_instruction(parameter_idx)->shape();
+    if (!ShapeUtil::CompatibleIgnoringFpPrecision(expected_scalar_shape,
+                                                  actual_parameter_shape)) {
+      return InternalError(
+          "Expected the %lld-th parameter of the compare computation of sort "
+          "to have shape %s, but got %s",
+          parameter_idx, StringifyShape(expected_scalar_shape),
+          StringifyShape(actual_parameter_shape));
+    }
+  }
+
+  // Verify that all operand shapes have the same dimensions.
   for (int64 operand = 1; operand < sort->operand_count(); ++operand) {
     if (!ShapeUtil::SameDimensions(sort->operand(0)->shape(),
                                    sort->operand(operand)->shape())) {
       return InternalError(
-          "Expected sort to have to have the same dimensions for the keys "
-          "and the values. Keys shape is: %s\n, Values shape (operand index "
-          "%lld) is: %s",
+          "Expected sort to have to have the same dimensions for all operands. "
+          "First operand shape is: %s\n, shape (operand index %lld) is: %s",
           StringifyShape(sort->operand(0)->shape()), operand,
           StringifyShape(sort->operand(operand)->shape()));
     }
@@ -338,7 +389,6 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
 }
 
 Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(constant, 0));
   if (!Cast<HloConstantInstruction>(constant)->HasLiteral()) {
     return InternalError("Constant is required to have a valid literal: %s",
                          constant->ToString());
@@ -347,8 +397,10 @@ Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
 }
 
 Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 0));
   auto* iota = Cast<HloIotaInstruction>(instruction);
+  if (!iota->shape().IsArray()) {
+    return InternalError("Iota does not support non-array result.");
+  }
   const int64 rank = iota->shape().rank();
   if (rank == 0) {
     return InternalError("Iota does not support scalars.");
@@ -362,13 +414,30 @@ Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(get_tuple_element, 1));
   return CheckShape(get_tuple_element,
                     ShapeInference::InferGetTupleElementShape(
                         get_tuple_element->operand(0)->shape(),
                         get_tuple_element->tuple_index()));
 }
 
+namespace {
+Status SameElementTypesForOperandsAndToApplyParameters(
+    const HloInstruction& instruction, int64 num_operands_to_check) {
+  const ProgramShape& to_apply = instruction.to_apply()->ComputeProgramShape();
+  for (int i = 0; i < num_operands_to_check; ++i) {
+    const Shape& parameter_shape = to_apply.parameters(i);
+    const Shape& operand_shape = instruction.operands()[i]->shape();
+    if (!ShapeUtil::SameElementType(parameter_shape, operand_shape)) {
+      return InvalidArgument(
+          "Shape mismatch between to_apply computation"
+          " parameter and operand %d in %s.",
+          i, instruction.ToString().c_str());
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
 Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   if (reduce->operand_count() % 2 != 0) {
     return InternalError(
@@ -380,13 +449,18 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   for (const HloInstruction* operand : reduce->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(reduce, ShapeInference::InferReduceShape(
-                                operand_shapes, reduce->dimensions(),
-                                reduce->to_apply()->ComputeProgramShape()));
+  TF_RETURN_IF_ERROR(
+      CheckShape(reduce, ShapeInference::InferReduceShape(
+                             operand_shapes, reduce->dimensions(),
+                             reduce->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *reduce, reduce->operands().size() - 1);
 }
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(bitcast, 1));
   // Bitcasts are not allowed to change the element type.
   if (bitcast->operand(0)->shape().element_type() !=
       bitcast->shape().element_type()) {
@@ -399,7 +473,6 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
 }
 
 Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(broadcast, 1));
   // HLO broadcast has no exact analog at the proto level so there is no
   // ShapeInference method. Check the output shape explicitly.
   const Shape& operand_shape = broadcast->operand(0)->shape();
@@ -419,7 +492,6 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
 }
 
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reshape, 1));
   // Check for mixed precision.
   const Shape& operand_shape = reshape->operand(0)->shape();
   TF_RET_CHECK(SameElementType(reshape->shape(), operand_shape));
@@ -429,14 +501,12 @@ Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
 }
 
 Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(transpose, 1));
   return CheckShape(
       transpose, ShapeInference::InferTransposeShape(
                      transpose->operand(0)->shape(), transpose->dimensions()));
 }
 
 Status ShapeVerifier::HandleParameter(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 0));
   return Status::OK();
 }
 
@@ -496,7 +566,6 @@ Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(slice, 1));
   return CheckShape(slice,
                     ShapeInference::InferSliceShape(
                         slice->operand(0)->shape(), slice->slice_starts(),
@@ -538,23 +607,33 @@ Status ShapeVerifier::HandleMap(HloInstruction* map) {
   // arbitrary map dimensions.
   std::vector<int64> map_dims(max_operand_rank);
   std::iota(map_dims.begin(), map_dims.end(), 0);
-  return CheckShape(map, ShapeInference::InferMapShape(
-                             operand_shapes,
-                             map->to_apply()->ComputeProgramShape(), map_dims));
+
+  TF_RETURN_IF_ERROR(CheckShape(
+      map,
+      ShapeInference::InferMapShape(
+          operand_shapes, map->to_apply()->ComputeProgramShape(), map_dims)));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *map, map->operands().size());
 }
 
 Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_window, 2));
-  return CheckShape(
+  TF_RETURN_IF_ERROR(CheckShape(
       reduce_window,
       ShapeInference::InferReduceWindowShape(
           reduce_window->operand(0)->shape(),
           reduce_window->operand(1)->shape(), reduce_window->window(),
-          reduce_window->to_apply()->ComputeProgramShape()));
+          reduce_window->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(*reduce_window,
+                                                               1);
 }
 
 Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(
       instruction,
       ShapeInference::InferSelectAndScatterShape(
@@ -565,7 +644,6 @@ Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(xla_while, 1));
   TF_RETURN_IF_ERROR(
       CheckParameterCount(xla_while, xla_while->while_body(), 1));
   TF_RETURN_IF_ERROR(
@@ -589,33 +667,32 @@ Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
 }
 
 Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(conditional, 3));
-  TF_RETURN_IF_ERROR(
-      CheckParameterCount(conditional, conditional->true_computation(), 1));
-  TF_RETURN_IF_ERROR(
-      CheckParameterCount(conditional, conditional->false_computation(), 1));
-  TF_RETURN_IF_ERROR(CheckOperandAndParameter(
-      conditional, 1, conditional->true_computation(), 0));
-  TF_RETURN_IF_ERROR(CheckOperandAndParameter(
-      conditional, 2, conditional->false_computation(), 0));
-  TF_RETURN_IF_ERROR(
-      CheckShape(conditional,
-                 conditional->true_computation()->root_instruction()->shape()));
-  TF_RETURN_IF_ERROR(CheckShape(
-      conditional,
-      conditional->false_computation()->root_instruction()->shape()));
+  const int num_branches = conditional->branch_count();
+  if (conditional->operand(0)->shape().element_type() == PRED) {
+    TF_RET_CHECK(num_branches == 2);
+  } else {
+    TF_RET_CHECK(num_branches >= 1);
+  }
+  TF_RETURN_IF_ERROR(CheckOperandCount(conditional, num_branches + 1));
+  for (int j = 0; j < num_branches; ++j) {
+    TF_RETURN_IF_ERROR(CheckParameterCount(
+        conditional, conditional->branch_computation(j), 1));
+    TF_RETURN_IF_ERROR(CheckOperandAndParameter(
+        conditional, j + 1, conditional->branch_computation(j), 0));
+    TF_RETURN_IF_ERROR(CheckShape(
+        conditional,
+        conditional->branch_computation(j)->root_instruction()->shape()));
+  }
   return Status::OK();
 }
 
 Status ShapeVerifier::HandlePad(HloInstruction* pad) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(pad, 2));
   return CheckShape(pad, ShapeInference::InferPadShape(pad->operand(0)->shape(),
                                                        pad->operand(1)->shape(),
                                                        pad->padding_config()));
 }
 
 Status ShapeVerifier::HandleSend(HloInstruction* send) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(send, 2));
   return CheckShape(send,
                     ShapeUtil::MakeTupleShape({send->operand(0)->shape(),
                                                ShapeUtil::MakeShape(U32, {}),
@@ -623,12 +700,10 @@ Status ShapeVerifier::HandleSend(HloInstruction* send) {
 }
 
 Status ShapeVerifier::HandleSendDone(HloInstruction* send_done) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(send_done, 1));
   return CheckShape(send_done, ShapeUtil::MakeTokenShape());
 }
 
 Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(recv, 1));
   return CheckShape(
       recv, ShapeUtil::MakeTupleShape(
                 {ShapeUtil::GetTupleElementShape(recv->shape(), 0),
@@ -636,7 +711,6 @@ Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
 }
 
 Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(recv_done, 1));
   return CheckShape(
       recv_done,
       ShapeUtil::MakeTupleShape(
@@ -646,7 +720,6 @@ Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
 
 Status ShapeVerifier::HandleBatchNormTraining(
     HloInstruction* batch_norm_training) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_training, 3));
   return CheckShape(batch_norm_training,
                     ShapeInference::InferBatchNormTrainingShape(
                         batch_norm_training->operand(0)->shape(),
@@ -657,7 +730,6 @@ Status ShapeVerifier::HandleBatchNormTraining(
 
 Status ShapeVerifier::HandleBatchNormInference(
     HloInstruction* batch_norm_inference) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_inference, 5));
   return CheckShape(batch_norm_inference,
                     ShapeInference::InferBatchNormInferenceShape(
                         batch_norm_inference->operand(0)->shape(),
@@ -669,7 +741,6 @@ Status ShapeVerifier::HandleBatchNormInference(
 }
 
 Status ShapeVerifier::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_grad, 5));
   return CheckShape(batch_norm_grad, ShapeInference::InferBatchNormGradShape(
                                          batch_norm_grad->operand(0)->shape(),
                                          batch_norm_grad->operand(1)->shape(),
@@ -737,7 +808,6 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
 }  // namespace
 
 Status ShapeVerifier::HandleGather(HloInstruction* gather) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(gather, 2));
   return CheckShape(
       gather,
       ShapeInference::InferGatherShape(
@@ -746,7 +816,6 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) {
 }
 
 Status ShapeVerifier::HandleScatter(HloInstruction* scatter) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(scatter, 3));
   return CheckShape(
       scatter, ShapeInference::InferScatterShape(
                    scatter->operand(0)->shape(), scatter->operand(1)->shape(),
@@ -764,7 +833,6 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) {
 }
 
 Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2));
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1));
   return CheckShape(add_dependency, add_dependency->operand(0)->shape());
 }
@@ -846,14 +914,12 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
 }
 
 Status ShapeVerifier::CheckUnaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   return CheckShape(instruction,
                     ShapeInference::InferUnaryOpShape(instruction->opcode(),
                                                       instruction->operand(0)));
 }
 
 Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   return CheckShape(
       instruction, ShapeInference::InferBinaryOpShape(instruction->opcode(),
                                                       instruction->operand(0),
@@ -861,7 +927,6 @@ Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::CheckTernaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(instruction,
                     ShapeInference::InferTernaryOpShape(
                         instruction->opcode(), instruction->operand(0),
@@ -1237,8 +1302,8 @@ Status CheckFusionInstruction(HloInstruction* fusion) {
   return Status::OK();
 }
 
-// Checks that the non-scalar operand shapes are compatible to the output
-// shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+// Checks that the operand shapes are compatible to the output shape, i.e.,
+// that there are no implicit broadcasts.
 Status CheckElementwiseInstruction(HloInstruction* instruction) {
   const Shape& out_shape = instruction->shape();
   for (HloInstruction* operand : instruction->operands()) {
@@ -1307,17 +1372,13 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
   }
 
   Status HandleConditional(HloInstruction* conditional) override {
-    if (conditional->true_computation()->num_parameters() != 1) {
-      return FailedPrecondition(
-          "True computation %s of %s must have 1 parameter insted of %d",
-          conditional->true_computation()->name(), conditional->ToString(),
-          conditional->true_computation()->num_parameters());
-    }
-    if (conditional->false_computation()->num_parameters() != 1) {
-      return FailedPrecondition(
-          "False computation %s of %s must have 1 parameter insted of %d",
-          conditional->false_computation()->name(), conditional->ToString(),
-          conditional->false_computation()->num_parameters());
+    for (int b = 0; b < conditional->branch_count(); ++b) {
+      if (conditional->branch_computation(b)->num_parameters() != 1) {
+        return FailedPrecondition(
+            "Branch computation %s of %s must have 1 parameter insted of %d",
+            conditional->branch_computation(b)->name(), conditional->ToString(),
+            conditional->branch_computation(b)->num_parameters());
+      }
     }
     return Status::OK();
   }
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 479905b317d5639ff2cebc4d1044e21b527693f6..d427a1586c3cd1d1abbd6606f33067e36cabad98 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -52,9 +52,12 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
+  Status HandleCholesky(HloInstruction* hlo) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
   Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAllToAll(HloInstruction* hlo) override;
   Status HandleCollectivePermute(HloInstruction* hlo) override;
+  Status HandleReplicaId(HloInstruction* hlo) override;
   Status HandleReducePrecision(HloInstruction* reduce_precision) override;
   Status HandleInfeed(HloInstruction*) override;
   Status HandleOutfeed(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index de0335a5e8f18d1321e9ca6e6cf5057999f9adc4..523890b3c7268c06cdb6aaa67749f26a1cb62855 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -535,5 +535,84 @@ TEST_F(HloVerifierTestAllowMixedPrecision, SelectMixedPrecisionAllowed) {
   ASSERT_TRUE(status.ok());
 }
 
+TEST_F(HloVerifierTest, IotaNonArrayResult) {
+  const char* const hlo_string = R"(
+  HloModule IotaTupleResult
+
+  ENTRY  kernelEntry {
+    ROOT iota = () iota(), iota_dimension=24
+  }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("does not support non-array result"));
+}
+
+static const char* const kMapOperandComputationMismatchHlo = R"(
+  HloModule MapOperandComputationMismatch
+
+  Computation {
+    param0 = f32[] parameter(0)
+    constant = f32[] constant(1)
+    ROOT add = f32[] add(param0, constant)
+  }
+
+  ENTRY kernelEntry {
+  param = f64[] parameter(0)
+  ROOT map = f32[] map(param), dimensions={}, to_apply=Computation
+})";
+
+TEST_F(HloVerifierTest, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(
+      status.error_message(),
+      HasSubstr(
+          "Shape mismatch between to_apply computation parameter and operand"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+static const char* const kReduceOperandComputationMismatchHlo = R"(
+  HloModule ReduceOperandComputationMismatch
+  computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernelEntry {
+    arg0 = f16[64,64,224,224]{3,2,1,0} parameter(0)
+    constant = f16[] constant(0)
+    reduce = f16[64]{0} reduce(arg0, constant), dimensions={0,2,3}, to_apply=computation
+  })";
+
+TEST_F(HloVerifierTest, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Expected instruction to have shape equal to f32[64]"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
deleted file mode 100644
index ada21345014dac70d61129aaf7bbc7466a7db914..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
-
-#include <algorithm>
-#include <memory>
-#include <numeric>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_dce.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-namespace {
-
-// Visitor for removing implicit broadcasts.
-class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault {
- public:
-  Status DefaultAction(HloInstruction* hlo_instruction) override {
-    return Status::OK();
-  }
-
-  Status HandleElementwiseBinary(HloInstruction* hlo) override {
-    return ReplaceImplicitBroadcastOperands(hlo);
-  }
-
-  Status HandleClamp(HloInstruction* hlo) override {
-    // Clamp is the only element-wise ternary operation.
-    return ReplaceImplicitBroadcastOperands(hlo);
-  }
-
-  // Returns whether any modification has been made to any visited instruction.
-  bool changed() const { return changed_; }
-
- private:
-  // Iterates through the operands of 'hlo' and replace any operands which are
-  // implicitly broadcast with the equivalent sequence of broadcast and reshape
-  // instructions. An operand is considered to be implicitly broadcast if the
-  // operand shape does have the same dimensions as the shape of 'hlo'.
-  Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) {
-    auto fadd = [hlo](std::unique_ptr<HloInstruction> x) {
-      return hlo->parent()->AddInstruction(std::move(x));
-    };
-    std::vector<HloInstruction*> operands;
-    bool operands_changed = false;
-    for (int i = 0; i < hlo->operand_count(); ++i) {
-      HloInstruction* operand = hlo->mutable_operand(i);
-      if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) {
-        HloInstruction* new_operand = hlo->parent()->AddInstruction(
-            HloInstruction::CreateBroadcastSequence(hlo->shape(), operand,
-                                                    fadd));
-        operands.push_back(new_operand);
-        operands_changed = true;
-      } else {
-        operands.push_back(operand);
-      }
-    }
-    if (operands_changed) {
-      // Create a new HLO instruction because the HloInstruction::Replace*
-      // methods check that the shape does not change with the replacement.
-      HloInstruction* new_hlo = hlo->parent()->AddInstruction(
-          hlo->CloneWithNewOperands(hlo->shape(), operands));
-      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
-      changed_ = true;
-    }
-    return Status::OK();
-  }
-
-  bool changed_ = false;
-};
-
-}  // namespace
-
-StatusOr<bool> ImplicitBroadcastRemover::Run(HloModule* module) {
-  VLOG(1) << "Removing implicit broadcast from module " << module->name();
-  XLA_VLOG_LINES(2,
-                 "Before removing implicit broadcasts:\n" + module->ToString());
-
-  ImplicitBroadcastVisitor visitor;
-  for (HloComputation* computation : module->computations()) {
-    TF_RETURN_IF_ERROR(computation->Accept(&visitor));
-  }
-
-  if (visitor.changed()) {
-    // HLO instructions with implicitly broadcast operands are cloned and left
-    // for dead. Remove them.
-    HloDCE dce;
-    TF_RETURN_IF_ERROR(dce.Run(module).status());
-  }
-
-  XLA_VLOG_LINES(2,
-                 "After removing implicit broadcasts:\n" + module->ToString());
-
-  return visitor.changed();
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
deleted file mode 100644
index cf6cf897fe11eda01ba6b22119bba34ac2bef8fe..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
-
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-
-namespace op = xla::testing::opcode_matchers;
-
-namespace xla {
-namespace {
-
-class ImplicitBroadcastRemoverTest : public HloTestBase {
- protected:
-  ImplicitBroadcastRemover remover_;
-};
-
-TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
-  auto param0 =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
-  auto param1 =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-
-  EXPECT_FALSE(remover_.Run(m.get()).ValueOrDie());
-
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Parameter(), op::Parameter()));
-}
-
-TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
-  auto param1 =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-
-  EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-
-  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
-  root = computation->root_instruction();
-
-  EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
-
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-}
-
-TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
-  auto param0 =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
-  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1"));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kSubtract, param0, param1));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-
-  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
-
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Subtract(op::Parameter(),
-                                 op::Broadcast(op::Reshape(op::Parameter()))));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-}
-
-TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
-  auto param1 =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kSubtract, param0, param1));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-
-  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
-
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root,
-              op::Subtract(op::Broadcast(op::Parameter()), op::Parameter()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-}
-
-TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
-  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0"));
-  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1"));
-  auto param2 = builder.AddInstruction(HloInstruction::CreateParameter(
-      2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2"));
-  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
-                                                       param0, param1, param2));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-
-  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
-
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
-                              op::Broadcast(op::Reshape(op::Parameter())),
-                              op::Broadcast(op::Reshape(op::Parameter()))));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
-}
-
-TEST_F(ImplicitBroadcastRemoverTest,
-       TernaryScalarAndDegenerateDimensionBroadcast) {
-  auto m = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
-  auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
-  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1"));
-  auto param2 =
-      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2"));
-  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
-                                                       param0, param1, param2));
-
-  HloComputation* computation = m->AddEntryComputation(builder.Build());
-
-  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
-
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
-                              op::Broadcast(op::Reshape(op::Parameter())),
-                              op::Parameter()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
-  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 76bf48870d55e82497ba5f63e9e2e2a322cb330e..c5d32a4b9ad8c708ec0870173fa72320238e8464 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
-namespace gtl = ::tensorflow::gtl;
 
 namespace {
 using Analysis = IndexedArrayAnalysis;
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 295465c8481bcb7d1385192febe0d09614e393b3..62107b5a88d4e37552fa5a6384700a9291a9c655 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <ctype.h>
-
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "absl/strings/ascii.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
@@ -43,7 +42,7 @@ class IndexedArrayAnalysisTest : public HloTestBase {
     string result;
 
     for (char c : text) {
-      if (!isspace(c)) {
+      if (!absl::ascii_isspace(c)) {
         result.push_back(c);
       } else if (!result.empty() && result.back() != ' ') {
         result.push_back(' ');
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index b97060535d998e174639dceca5cde517cef01e30..4868cf961aa1bb02e37b2207c4231ca52f3d28a7 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -65,6 +65,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kClz:
+    case HloOpcode::kCompare:
     case HloOpcode::kComplex:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConstant:
@@ -72,21 +73,15 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kCopy:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
-    case HloOpcode::kEq:
     case HloOpcode::kFloor:
-    case HloOpcode::kGe:
     case HloOpcode::kGetTupleElement:
-    case HloOpcode::kGt:
     case HloOpcode::kImag:
     case HloOpcode::kInfeed:
     case HloOpcode::kIota:
     case HloOpcode::kIsFinite:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kOr:
@@ -95,6 +90,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kPad:
     case HloOpcode::kReal:
     case HloOpcode::kReducePrecision:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kReshape:
     case HloOpcode::kReverse:
     case HloOpcode::kRoundNearestAfz:
@@ -125,6 +121,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kCall:
+    case HloOpcode::kCholesky:
     case HloOpcode::kConditional:
     case HloOpcode::kConvolution:
     case HloOpcode::kAllReduce:
@@ -150,13 +147,16 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
+    case HloOpcode::kTriangularSolve:
     case HloOpcode::kWhile:
     case HloOpcode::kGetDimensionSize:
       return true;
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 545662543cca40e42b0f0302e14152e5283f9e4f..599489b3785be50ba7a145f298a13d6bb995a1cf 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -32,6 +32,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
+        "//tensorflow/compiler/xla/service:cholesky_expander",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:dynamic_index_splitter",
@@ -50,7 +51,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 4818b2dae0a9951346600a9b2906488c3ef7e06e..a8f8ab4f725d904a529dbd50c1c199972a1c0895 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -20,7 +20,9 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/cholesky_expander.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
@@ -34,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/map_inliner.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -42,10 +45,44 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
+namespace {
+
+// Handles custom_call ops during evaluation by routing them through the global
+// CPU registry used by other CPU-based backends.
+StatusOr<Literal> HandleEvaluatorCustomCall(
+    HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+  // Find the target C function in the global registry.
+  auto* registry = xla::cpu::CustomCallTargetRegistry::Global();
+  void* target_fn = registry->Lookup(custom_call->custom_call_target());
+  if (!target_fn) {
+    return NotFound("Custom call target '%s' was not registered",
+                    custom_call->custom_call_target());
+  }
+
+  // Populate pointers to operand and output literal data.
+  std::vector<const void*> operand_data;
+  operand_data.reserve(operands.size());
+  for (const auto* literal : operands) {
+    operand_data.push_back(literal->untyped_data());
+  }
+  auto output = Literal::CreateFromShape(custom_call->shape());
+  void* output_data = output.untyped_data();
+
+  // Call the target function matching the C ABI used by the CPU backends.
+  auto* typed_fn = reinterpret_cast<void (*)(void*, const void**)>(target_fn);
+  (*typed_fn)(output_data, operand_data.data());
+
+  return std::move(output);
+}
+
+}  // namespace
+
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
   pipeline.AddPass<DynamicIndexSplitter>();
+  pipeline.AddPass<CholeskyExpander>();
+  pipeline.AddPass<TriangularSolveExpander>();
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout);
@@ -83,10 +120,12 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
   // In this case we are using an HloEvaluator at execution time, so we don't
   // need to compile anything
 
-  // Create executable from only the Hlo module.
   auto evaluator = absl::make_unique<HloEvaluator>();
   evaluator->set_use_fast_path(
       hlo_module->config().debug_options().xla_hlo_evaluator_use_fast_path());
+  evaluator->set_custom_call_handler(HandleEvaluatorCustomCall);
+
+  // Create executable from only the Hlo module.
   std::unique_ptr<Executable> executable =
       absl::make_unique<InterpreterExecutable>(std::move(hlo_module),
                                                std::move(evaluator));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 10ff7bb6d46ee3b2cd1228b4b7a49269be8c65d3..039954a1837abfc6d5205a39b448238403d74c02 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -588,48 +587,56 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           body_layout.result_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kConditional) {
-      // The layout of the true and false computations must match, and must
-      // be the layout of the kConditional instruction.
-      TF_RET_CHECK(instruction->operand_count() == 3);
-
-      HloComputation* true_computation = instruction->true_computation();
-      HloComputation* false_computation = instruction->false_computation();
-      const HloInstruction* true_operand = instruction->operand(1);
-      const HloInstruction* false_operand = instruction->operand(2);
-
-      TF_RET_CHECK(true_computation->num_parameters() == 1);
-      TF_RET_CHECK(false_computation->num_parameters() == 1);
-      ComputationLayout& true_computation_layout =
-          FindOrDie(computation_layouts_, true_computation);
-      ComputationLayout& false_computation_layout =
-          FindOrDie(computation_layouts_, false_computation);
-
-      DCHECK(ShapeUtil::Compatible(true_operand->shape(),
-                                   true_computation_layout.parameter_shape(0)));
-      DCHECK(ShapeUtil::Compatible(
-          false_operand->shape(), false_computation_layout.parameter_shape(0)));
-      if (true_computation_layout.result_layout() !=
-          false_computation_layout.result_layout()) {
-        // We assign layouts in DFS fashion, so the true and false computations
-        // might have negotiated a different layout. But for the conditional
-        // instruction POV the layout must match, so we run again on the false
-        // computation, this time with proper computation layout.
-        VLOG(2) << "Reset %conditional false computation result layout: "
-                   "false_computation="
-                << false_computation->name()
-                << " conditional=" << instruction->name() << " shape="
-                << true_computation_layout.result_layout().ToString();
-        *false_computation_layout.mutable_result_layout() =
-            true_computation_layout.result_layout();
+      // Find the conditional branch with the most instructions and force all
+      // other computations to match that layout. A potentially better decison
+      // could count the number FLOPs or how constrained the layouts are.
+      int64 largest_branch = 0;
+      int64 largest_instruction_count =
+          instruction->branch_computation(0)->instruction_count();
+      for (int j = 1; j < instruction->branch_count(); ++j) {
+        const int64 instruction_count =
+            instruction->branch_computation(j)->instruction_count();
+        if (instruction_count > largest_instruction_count) {
+          largest_branch = j;
+          largest_instruction_count = instruction_count;
+        }
+      }
+      ComputationLayout& best_branch_computation_layout =
+          FindOrDie(computation_layouts_,
+                    instruction->branch_computation(largest_branch));
+      for (int k = 0; k < instruction->branch_count(); ++k) {
+        // Visit the best branch first.
+        int j = (k + largest_branch) % instruction->branch_count();
+        TF_RET_CHECK(instruction->branch_computation(j)->num_parameters() == 1);
+        ComputationLayout& branch_computation_layout =
+            FindOrDie(computation_layouts_, instruction->branch_computation(j));
+
+        DCHECK(ShapeUtil::Compatible(
+            instruction->operand(j + 1)->shape(),
+            branch_computation_layout.parameter_shape(0)));
+        if (best_branch_computation_layout.result_layout() !=
+            branch_computation_layout.result_layout()) {
+          // We assign layouts in DFS fashion, so the largest_branch and current
+          // branch computations might have negotiated a different layout. But
+          // for the case instruction POV the layout must match, so we run again
+          // on the branch j computation, this time with proper computation
+          // layout.
+          VLOG(2) << "Reset %conditional branch " << j
+                  << " computation result layout: branch_computation="
+                  << instruction->branch_computation(j)->name()
+                  << " case=" << instruction->name() << " shape="
+                  << best_branch_computation_layout.result_layout().ToString();
+          *branch_computation_layout.mutable_result_layout() =
+              best_branch_computation_layout.result_layout();
+        }
+        if (k == 0) {
+          TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+              best_branch_computation_layout.result_shape(), instruction));
+        }
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            branch_computation_layout.parameter_shape(0), instruction, j + 1,
+            /*mandatory=*/true));
       }
-      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
-          true_computation_layout.result_shape(), instruction));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          true_computation_layout.parameter_shape(0), instruction, 1,
-          /*mandatory=*/true));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          false_computation_layout.parameter_shape(0), instruction, 2,
-          /*mandatory=*/true));
     }
   }
   // Finally set the result layout to match ComputationLayout, if there is one.
@@ -699,28 +706,21 @@ Status CheckWhileLayout(HloInstruction* while_inst,
 
 Status CheckConditionalLayout(
     HloInstruction* instruction,
-    const ComputationLayout& true_computation_layout,
-    const ComputationLayout& false_computation_layout) {
-  HloComputation* true_computation = instruction->true_computation();
-  HloComputation* false_computation = instruction->false_computation();
-  const HloInstruction* true_operand = instruction->operand(1);
-  const HloInstruction* false_operand = instruction->operand(2);
-
-  TF_RET_CHECK(true_computation_layout.result_layout() ==
-               false_computation_layout.result_layout());
-  TF_RET_CHECK(true_computation_layout.result_layout().MatchesLayoutInShape(
-      instruction->shape()));
-  TF_RET_CHECK(true_computation_layout.result_layout().MatchesLayoutInShape(
-      true_computation->root_instruction()->shape()));
-  TF_RET_CHECK(false_computation_layout.result_layout().MatchesLayoutInShape(
-      instruction->shape()));
-  TF_RET_CHECK(false_computation_layout.result_layout().MatchesLayoutInShape(
-      false_computation->root_instruction()->shape()));
-  TF_RET_CHECK(true_computation_layout.parameter_layout(0).MatchesLayoutInShape(
-      true_operand->shape()));
-  TF_RET_CHECK(
-      false_computation_layout.parameter_layout(0).MatchesLayoutInShape(
-          false_operand->shape()));
+    absl::Span<const ComputationLayout> branch_computation_layouts) {
+  for (int j = 0; j < instruction->branch_count(); ++j) {
+    const HloInstruction* branch_operand = instruction->operand(j + 1);
+    TF_RET_CHECK(branch_computation_layouts[0].result_layout() ==
+                 branch_computation_layouts[j].result_layout());
+    TF_RET_CHECK(
+        branch_computation_layouts[j].result_layout().MatchesLayoutInShape(
+            instruction->shape()));
+    TF_RET_CHECK(
+        branch_computation_layouts[j].result_layout().MatchesLayoutInShape(
+            instruction->branch_computation(j)->root_instruction()->shape()));
+    TF_RET_CHECK(
+        branch_computation_layouts[j].parameter_layout(0).MatchesLayoutInShape(
+            branch_operand->shape()));
+  }
   return Status::OK();
 }
 
@@ -937,13 +937,16 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
               FindOrDie(computation_layouts_, instruction->while_condition()),
               FindOrDie(computation_layouts_, instruction->while_body())));
           break;
-        case HloOpcode::kConditional:
+        case HloOpcode::kConditional: {
+          std::vector<ComputationLayout> branch_computation_layouts;
+          for (auto branch_computation : instruction->branch_computations()) {
+            branch_computation_layouts.emplace_back(
+                FindOrDie(computation_layouts_, branch_computation));
+          }
           TF_RETURN_IF_ERROR(CheckConditionalLayout(
-              instruction,
-              FindOrDie(computation_layouts_, instruction->true_computation()),
-              FindOrDie(computation_layouts_,
-                        instruction->false_computation())));
+              instruction, absl::MakeSpan(branch_computation_layouts)));
           break;
+        }
         default:
           break;
       }
@@ -1008,7 +1011,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the operand's layout to the output.
     if (ShapeUtil::TrueRank(operand->shape()) == 1 &&
-        instruction->shape().rank() == 1) {
+        ShapeUtil::TrueRank(instruction->shape()) == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
     }
@@ -1019,16 +1022,6 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     Shape operand_shape = operand->shape();
     *operand_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(operand_shape);
-    if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) {
-      return absl::make_unique<Layout>(operand_shape.layout());
-    }
-    if (operand_shape.rank() == output_shape.rank()) {
-      *operand_shape.mutable_layout() = output_layout;
-      if (ShapeUtil::ReshapeIsBitcast(operand_shape,
-                                      output_shape_with_layout)) {
-        return absl::make_unique<Layout>(output_layout);
-      }
-    }
     auto aligned_operand_shape =
         ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
     if (aligned_operand_shape) {
@@ -1078,7 +1071,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     // reshape is a bitcast when using the same layout. This may avoid copy
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the outputs's layout to the operand.
-    if (operand->shape().rank() == 1 &&
+    if (ShapeUtil::TrueRank(operand->shape()) == 1 &&
         ShapeUtil::TrueRank(user->shape()) == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
@@ -1090,16 +1083,6 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     Shape output_shape = user->shape();
     *output_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(output_shape);
-    if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) {
-      return absl::make_unique<Layout>(output_shape.layout());
-    }
-    if (operand->shape().rank() == output_shape.rank()) {
-      *output_shape.mutable_layout() = operand_layout;
-      if (ShapeUtil::ReshapeIsBitcast(output_shape,
-                                      operand_shape_with_layout)) {
-        return absl::make_unique<Layout>(operand_layout);
-      }
-    }
     auto aligned_user_shape =
         ShapeUtil::AlignLayouts(operand_shape_with_layout, output_shape);
     if (aligned_user_shape) {
@@ -1585,8 +1568,9 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           fused_instruction->mutable_shape()));
     } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
       // Nop; leave the infeed layout alone.
-    } else {
+    } else if (fusion->fusion_kind() != HloInstruction::FusionKind::kCustom) {
       // Other instructions don't have layouts inside of fusion nodes.
+      // But do not clear layouts for other instructions in custom fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1932,12 +1916,6 @@ Status LayoutAssignment::PropagateComputationLayouts(
 
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
-  XLA_VLOG_LINES(3, module->ToString());
-  if (VLOG_IS_ON(10)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before layout assignment",
-                                module->config().debug_options());
-  }
   TF_RETURN_IF_ERROR(Init());
 
   // Verify computation layout is sane.
@@ -1992,13 +1970,6 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
                                                  entry_computation_layout_));
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
-  VLOG(3) << "After layout assignment:";
-  XLA_VLOG_LINES(3, module->ToString());
-  if (VLOG_IS_ON(10)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after layout assignment",
-                                module->config().debug_options());
-  }
   // All layouts are reset then reassigned by this pass.
   return true;
 }
@@ -2016,6 +1987,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kClz:
+    case HloOpcode::kCompare:
     case HloOpcode::kComplex:
     case HloOpcode::kConcatenate:
     case HloOpcode::kConditional:
@@ -2027,24 +1999,18 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kDivide:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
-    case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kFft:
     case HloOpcode::kFloor:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
     case HloOpcode::kImag:
     case HloOpcode::kIsFinite:
-    case HloOpcode::kLe:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
-    case HloOpcode::kLt:
     case HloOpcode::kMap:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kOr:
@@ -2057,6 +2023,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kRemainder:
     case HloOpcode::kReverse:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kScatter:
     case HloOpcode::kSelect:
     case HloOpcode::kSelectAndScatter:
@@ -2067,8 +2034,11 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kSin:
     case HloOpcode::kSlice:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
+    case HloOpcode::kTriangularSolve:
+    case HloOpcode::kCholesky:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kWhile:
       return false;
@@ -2094,6 +2064,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kReduce:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kReshape:
     case HloOpcode::kRng:
     case HloOpcode::kSend:
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index c8cf3c47d380012fdb0206c0d20d67e6a13017ae..efca6be331ee93f69e506ff1240db1ee089924a2 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -1084,7 +1084,7 @@ TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
       tup.1 = (s32[], token[], f32[512,1024]{0,1}) parameter(0)
       counter.1 = s32[] get-tuple-element(tup.1), index=0
       five = s32[] constant(5)
-      ROOT lt = pred[] less-than(counter.1, five)
+      ROOT lt = pred[] compare(counter.1, five), direction=LT
     }
 
     body.2 (tup: (s32[], token[], f32[512,1024]{0,1})) -> (s32[], token[], f32[512,1024]{0,1}) {
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index 182d8edbe30da292f28aeab53be646ce6651839f..afd9f37038387e2ba11d7f3c5e184dc4ad163584 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -37,7 +37,7 @@ class LLVMCompiler : public Compiler {
   // A callback of this type can be run before and/or after IR-level
   // optimization to e.g. dump out the generated IR to disk or gather some
   // statistics.
-  using ModuleHook = std::function<Status(const llvm::Module&)>;
+  using ModuleHook = std::function<void(const llvm::Module&)>;
 
   void SetPreOptimizationHook(ModuleHook hook) {
     CHECK(!user_pre_optimization_hook_)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index c5d59fb28e02ce229967fb3856012d608fb83c5d..e51a394827d1b7c145c54f4202bc6403d0fa517b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -67,9 +67,11 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
+        "//tensorflow/compiler/xla/service/cpu:cpu_options",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
@@ -161,6 +163,7 @@ cc_library(
         ":llvm_util",
         ":loop_emitter",
         ":tuple_ops",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
@@ -169,6 +172,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -236,7 +240,7 @@ cc_library(
     hdrs = ["kernel_support_library.h"],
     deps = [
         ":llvm_loop",
-        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        ":llvm_util",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index ce3d922ca7a9bdea3a520959a8b8d284bc3e0d64..c915a472707f8e591af6edcb3ce84e837b96016b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -115,7 +115,7 @@ llvm::MDNode* AliasAnalysis::GetAliasScopeMetadataForBuffer(
 
   llvm::MDBuilder metadata_builder(domain->getContext());
   llvm::MDNode* scope = metadata_builder.createAliasScope(
-      AsStringRef("buffer: " + buffer_slice.ToString()), domain);
+      "buffer: " + buffer_slice.ToString(), domain);
   llvm::MDNode* scope_list = llvm::MDNode::get(domain->getContext(), scope);
   return scope_list;
 }
@@ -197,7 +197,7 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer(
   std::vector<llvm::Metadata*> scopes;
   for (const BufferAllocation::Slice noalias_slice : buffers) {
     llvm::MDNode* scope = metadata_builder.createAliasScope(
-        AsStringRef("buffer: " + noalias_slice.ToString()), domain);
+        "buffer: " + noalias_slice.ToString(), domain);
     scopes.push_back(scope);
   }
   llvm::MDNode* noalias_list =
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
index b6ae4932f5707f1d15af1e09a735a7de2e48fac5..db900856993b3a22f79767b2c98d79e051571194 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
@@ -46,7 +46,7 @@ condition {
   condition.state = f32[] parameter(0)
   addend = f32[] custom-call(condition.state), custom_call_target="FakeCustomCallTarget"
   add = f32[] add(addend, condition.state)
-  ROOT greater-than = pred[] greater-than(const.100, add)
+  ROOT greater-than = pred[] compare(const.100, add), direction=GT
 }
 
 ENTRY while3 {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
index bdce4a171b8a58f617f1d56e6cf6db5354846703..1ea5a42b0b398818b0946eaa9e214100007bada4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -41,14 +41,26 @@ static const HloInstruction& InstrForConstantBufferAllocation(
   return *const_instr;
 }
 
-string ConstantBufferAllocationToGlobalName(
-    const BufferAllocation& allocation) {
-  string instr_name = InstrForConstantBufferAllocation(allocation).name();
+string SanitizeConstantName(const HloInstruction& instr) {
+  CHECK_EQ(instr.opcode(), HloOpcode::kConstant);
+  string instr_name = instr.name();
   for (char& c : instr_name) {
-    if (c == '.') {
+    // Having a hyphen or a dot in a global variable name can crash the LLVM PTX
+    // backend.
+    if (c == '.' || c == '-') {
       c = '_';
     }
   }
+  return instr_name;
+}
+
+string ConstantBufferAllocationToGlobalName(
+    const BufferAllocation& allocation) {
+  const HloInstruction& instr = InstrForConstantBufferAllocation(allocation);
+  string instr_name = instr.name();
+  // Check that names are sanitized and stored in the HLO instructions
+  // before constant buffer allocation.
+  DCHECK_EQ(instr_name, SanitizeConstantName(instr));
   return absl::StrCat("buffer_for_", instr_name);
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
index bfb6eecb87f6a1b756b3a8da3377f608dd7f0be7..03e98a66900095889292cbff9d9924a9abe83ab0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
@@ -20,6 +20,10 @@ limitations under the License.
 
 namespace xla {
 namespace llvm_ir {
+// Sanitizes the HLO constant instruction name so that it can be used for the
+// name of the corresponding constant buffer. In particular, it replaces . and
+// - with _.
+string SanitizeConstantName(const HloInstruction& instr);
 // In XLA:GPU we map constant buffer allocations to globals in the generated
 // LLVM IR.  This function gives us the name of the global variable a constant
 // buffer is mapped to.  Not used on XLA:CPU.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index c66eaec8fb0e4c03f6967fec0cf0ae9661cdf470..4974cb57db3bb85f90382bfeb7794fe414befdbd 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -47,29 +47,30 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
 
   // Read start indices from start_indices_generator.
   const int64 rank = output_shape.rank();
-  IrArray::Index start_index(b->getInt64Ty(), rank);
+  std::vector<llvm::Value*> start_multi_index(rank);
   for (int64 i = 0; i < rank; ++i) {
-    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(i));
+    TF_ASSIGN_OR_RETURN(start_multi_index[i], start_indices_generator(i));
     llvm::Value* output_dim_size = llvm::ConstantInt::get(
-        start_index[i]->getType(), output_shape.dimensions(i));
+        start_multi_index[i]->getType(), output_shape.dimensions(i));
     llvm::Value* update_dim_size = llvm::ConstantInt::get(
-        start_index[i]->getType(), update_shape.dimensions(i));
+        start_multi_index[i]->getType(), update_shape.dimensions(i));
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, output_dim_size - update_dim_size)
     llvm::Value* max_bound = b->CreateSub(output_dim_size, update_dim_size);
-    llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
-    start_index[i] =
+    llvm::Value* zero =
+        llvm::ConstantInt::get(start_multi_index[i]->getType(), 0);
+    start_multi_index[i] =
         b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
                                                 : llvm::ICmpInst::ICMP_UGE,
-                                      zero, start_index[i]),
-                        zero, start_index[i]);
+                                      zero, start_multi_index[i]),
+                        zero, start_multi_index[i]);
 
-    start_index[i] =
+    start_multi_index[i] =
         b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
                                                 : llvm::ICmpInst::ICMP_ULE,
-                                      max_bound, start_index[i]),
-                        max_bound, start_index[i]);
+                                      max_bound, start_multi_index[i]),
+                        max_bound, start_multi_index[i]);
   }
 
   auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
@@ -78,14 +79,16 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
     //
     //   output_index[dim] = start_index[dim] + update_index[dim]
     //
-    IrArray::Index output_index(start_index.GetType(), rank);
+    std::vector<llvm::Value*> output_multi_index(rank);
     for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* start_index0 =
-          b->CreateSExtOrBitCast(start_index[i], update_index[i]->getType());
-      output_index[i] = b->CreateAdd(start_index0, update_index[i]);
+      llvm::Value* start_index0 = b->CreateSExtOrBitCast(
+          start_multi_index[i], update_index[i]->getType());
+      output_multi_index[i] = b->CreateAdd(start_index0, update_index[i]);
     }
 
     // Do output[output_index] = update[update_index].
+    IrArray::Index output_index(output_multi_index, output_shape,
+                                b->getInt64Ty());
     TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
                         update_array_generator(update_index));
     output_array.EmitWriteArrayElement(output_index, update_data, b);
@@ -113,20 +116,10 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
   Shape output_shape = output_array.GetShape();
   Shape update_shape = update_array.GetShape();
 
-  IndexGenerator start_indices_generator;
-  // TODO(b/118437727): Remove the R1 path, and rename the variables.
-  if (start_indices_array.GetShape().rank() == 1) {
-    start_indices_generator = [&](int64 index) {
-      return start_indices_array.EmitReadArrayElement(
-          IrArray::Index({b->getInt64(index)}), b);
-    };
-  } else {
-    start_indices_generator = [&](int64 index) {
-      return operand_arrays[2 + index].EmitReadArrayElement(
-          IrArray::Index(b->getInt64Ty()), b);
-    };
-  }
-
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    return operand_arrays[2 + index].EmitReadArrayElement(
+        IrArray::Index(b->getInt64Ty()), b);
+  };
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
     return update_array.EmitReadArrayElement(index, b);
   };
@@ -178,21 +171,11 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
 
-  // TODO(b/118437727): Remove the R1 path, and rename the variables.
-  IndexGenerator start_indices_generator;
-  if (start_indices->shape().rank() == 1) {
-    start_indices_generator = [&](int64 index) {
-      return fused_emitter.GetGenerator(start_indices)(
-          IrArray::Index({b->getInt64(index)}));
-    };
-  } else {
-    start_indices_generator = [&](int64 index) {
-      ElementGenerator element_generator =
-          fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
-      return element_generator(IrArray::Index(b->getInt64Ty()));
-    };
-  }
-
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    ElementGenerator element_generator =
+        fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
+    return element_generator(IrArray::Index(b->getInt64Ty()));
+  };
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
       update_shape, start_indices_generator, is_signed, update_array_generator,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index e440f05e2b2f0d4a2a4c7b326b4881183de4d235..cf9df95ad575425404e4f88919078f158ec5c1de 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -15,14 +15,22 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 
+#include <algorithm>
 #include <functional>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
+#include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -58,9 +66,9 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
       }
       VLOG(3) << "The cached generated value can't be reused, because it is in "
                  "a different BB ("
-              << llvm_ir::AsString(generated_value_bb->getName())
+              << generated_value_bb->getName().str()
               << ") from the current insertion block ("
-              << llvm_ir::AsString(b_->GetInsertBlock()->getName()) << ").";
+              << b_->GetInsertBlock()->getName().str() << ").";
     }
 
     TF_ASSIGN_OR_RETURN(generated_value_cache_[hlo][index.multidim()],
@@ -78,8 +86,11 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
         llvm_ir::ConvertLiteralToIrConstant(literal, module_);
     llvm::GlobalVariable* global = new llvm::GlobalVariable(
         *b_->GetInsertBlock()->getModule(), initializer->getType(),
-        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
+        /*isConstant=*/true,
+        /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+        /*Initializer=*/initializer,
         /*Name=*/"");
+    global->setUnnamedAddr(llvm::GlobalVariable::UnnamedAddr::Global);
     llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
         global,
         llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
@@ -110,9 +121,9 @@ Status FusedIrEmitter::HandleGetTupleElement(
     }
 
     // Lookup tuple element pointer.
-    return llvm_ir::EmitGetTupleElement(
-        get_tuple_element->shape(), get_tuple_element->tuple_index(),
-        /*alignment=*/1, tuple_ptr, b_, module_);
+    return llvm_ir::EmitGetTupleElement(get_tuple_element->shape(),
+                                        get_tuple_element->tuple_index(),
+                                        /*alignment=*/1, tuple_ptr, b_);
   };
 
   if (!get_tuple_element->shape().IsTuple()) {
@@ -192,4 +203,101 @@ FusedIrEmitter::IndexedGenerator FusedIrEmitter::GetGenerator(
   return indexed_generators_.at(instruction);
 }
 
+bool FusedIrEmitter::IsFusedIrEmitterInefficient(
+    const HloInstruction* consumer, const HloInstruction* producer) {
+  if (consumer->opcode() != HloOpcode::kFusion) {
+    return false;
+  }
+  // Collects for each instruction in the fusion node from which (indirect)
+  // users newly created index values are passed. Roughly speaking, we reuse
+  // index values if the shapes are equal when ignoring the element type (we may
+  // reuse also if the shape change is a bitcast, but we don't consider that
+  // here). By ignoring potential reuses our estimate whether the fusion emitter
+  // is inefficient is a bit more conservative than necessary.
+  absl::flat_hash_map<const HloInstruction*,
+                      absl::flat_hash_set<const HloInstruction*>>
+      indexing_users;
+  // Stores the number of different index accesses for each instruction in the
+  // fusion node. The fusion emitter caches access with the same index, so this
+  // value indicates how many times a specific instruction will be emitted.
+  absl::flat_hash_map<const HloInstruction*, int64> index_usage_count;
+  index_usage_count[consumer] = 1;
+
+  auto evaluate_fusion_computation = [&indexing_users, &index_usage_count](
+                                         const HloInstruction* fusion) {
+    auto postorder =
+        fusion->fused_instructions_computation()->MakeInstructionPostOrder();
+    std::reverse(postorder.begin(), postorder.end());
+    for (const auto* instruction : postorder) {
+      if (instruction->opcode() == HloOpcode::kParameter) {
+        continue;
+      }
+      int64& total = index_usage_count[instruction];
+      if (indexing_users[instruction].empty()) {
+        total = index_usage_count[fusion];
+      } else {
+        total = 0;
+        for (const auto* user : indexing_users[instruction]) {
+          int64 weight = 1;
+          // Concatenate is special: the index differs for each operand, so
+          // in the worst case we have to deal with as many index values as
+          // the number of operands of Concatenate. By considering the worst
+          // case, we are more conservative than necessary regarding
+          // refusing to fuse.
+          if (user->opcode() == HloOpcode::kConcatenate) {
+            weight = user->operand_count();
+          }
+          total += index_usage_count[user] * weight;
+        }
+      }
+      for (const auto* operand : instruction->operands()) {
+        // For simplicity we assume that all shape and layout changing
+        // operations invalidate index reuse.
+        if (Shape::Equal().IgnoreElementType()(operand->shape(),
+                                               instruction->shape())) {
+          // If the index is reused, it means the operand gets index values
+          // from the same set of (indirect) users as 'instruction' itself.
+          indexing_users[operand].insert(indexing_users[instruction].begin(),
+                                         indexing_users[instruction].end());
+        } else {
+          // If the index is not reused, it means 'instruction' computes a
+          // new index derived from the index it gets.
+          indexing_users[operand].insert(instruction);
+        }
+      }
+    }
+  };
+  evaluate_fusion_computation(consumer);
+
+  // Also account for the 'producer' if it would be fused. Find the operand it
+  // corresponds to.
+  for (int64 operand_num = 0; operand_num < consumer->operand_count();
+       ++operand_num) {
+    if (consumer->operand(operand_num) == producer) {
+      auto instruction = consumer->fused_parameter(operand_num);
+      int64& total = index_usage_count[producer];
+      total = 0;
+      for (const auto* user : indexing_users[instruction]) {
+        total += index_usage_count[user];
+      }
+      break;
+    }
+  }
+
+  // If 'producer' is a fusion node as well, also evaluate it.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    evaluate_fusion_computation(producer);
+  }
+
+  // Sum up the total number of emitted ops.
+  int64 total = 0;
+  for (const auto& entry : index_usage_count) {
+    total += entry.second;
+  }
+
+  // Check that the code duplication has at most a factor of 8 (where 8 is an
+  // arbitrary constant that seems to work).
+  return total > 8 * index_usage_count.size();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index e6d52a580c04a920d3f0e8ed6f39c1cae587cf1b..b1aa6d59634463956491b586d84fb6a6945a3fdf 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -91,6 +91,14 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
     tiled_parameter_info_ = info;
   }
 
+  // Evaluates whether fusing 'producer' into 'consumer' might cause exponential
+  // behavior in FusedIrEmitter. We currently can have exponential time/memory
+  // requirements for emitting certain fusion kernels, in which case we don't
+  // want to fuse.
+  // TODO(b/119692968): Remove this once we have fixed our fusion emitter.
+  static bool IsFusedIrEmitterInefficient(const HloInstruction* consumer,
+                                          const HloInstruction* producer);
+
  protected:
   // Returns the IrArrays for the fusion instruction operands.
   llvm_ir::IrArray& GetIrArrayForFusedParameter(int64 parameter_number) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 8ee07ae8331e986f9d271be5e39065f0d87853b1..7bf8bde9dededbfefed48449f19987798f62c434 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -29,6 +29,14 @@ limitations under the License.
 namespace xla {
 namespace llvm_ir {
 
+IrArray::Index::Index(absl::Span<llvm::Value* const> multidim,
+                      llvm::Value* linear, const Shape& shape,
+                      llvm::Type* index_type)
+    : Index(multidim, shape, index_type) {
+  CHECK_NE(linear, nullptr);
+  linear_ = linear;
+}
+
 void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
                                  llvm::Value* linear, const Shape& shape,
                                  llvm::IRBuilder<>* b) const {
@@ -74,36 +82,22 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
 }
 
 IrArray::Index::Index(absl::Span<llvm::Value* const> multidim,
-                      llvm::Value* linear, const Shape& shape)
+                      const Shape& shape, llvm::Type* index_type)
     : multidim_(multidim.begin(), multidim.end()),
-      linear_(linear),
+      linear_(nullptr),
       layout_(shape.layout()),
-      dims_(shape.dimensions().begin(), shape.dimensions().end()) {
-  if (size()) {
-    index_type_ = multidim_[0]->getType();
-  } else {
-    CHECK_NE(linear_, nullptr);
-    index_type_ = linear_->getType();
-  }
+      dims_(shape.dimensions().begin(), shape.dimensions().end()),
+      index_type_(index_type) {
   CHECK_NE(index_type_, nullptr);
   CHECK_EQ(shape.dimensions_size(), multidim.size());
+  for (const auto* dim : multidim) {
+    CHECK_NE(dim, nullptr);
+  }
   CHECK(LayoutUtil::HasLayout(shape))
       << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
       << " should have a layout.";
 }
 
-IrArray::Index::Index(absl::Span<llvm::Value* const> multidim,
-                      const Shape& shape, llvm::IRBuilder<>* b)
-    : multidim_(multidim.begin(), multidim.end()),
-      layout_(shape.layout()),
-      dims_(shape.dimensions().begin(), shape.dimensions().end()) {
-  CHECK_GT(multidim_.size(), 0);
-  index_type_ = multidim[0]->getType();
-  CHECK_NE(index_type_, nullptr);
-  CHECK_EQ(shape.dimensions_size(), multidim.size());
-  CHECK(LayoutUtil::HasLayout(shape));
-}
-
 IrArray::IrArray(llvm::Value* base_ptr, Shape shape)
     : base_ptr_(base_ptr), shape_(std::move(shape)) {
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
@@ -117,10 +111,10 @@ IrArray::IrArray(llvm::Value* base_ptr, Shape shape)
     ++depth;
   }
 
-  if (!shape_->IsArray() || ShapeUtil::IsScalar(*shape_)) {
+  if (!shape_.IsArray() || ShapeUtil::IsScalar(shape_)) {
     DCHECK(depth == 1 || depth == 0) << depth;
   } else {
-    DCHECK_EQ(depth, shape_->rank()) << shape.ShortDebugString();
+    DCHECK_EQ(depth, shape_.rank()) << shape.ShortDebugString();
   }
 }
 
@@ -178,30 +172,30 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape(
   if (linear() != nullptr && LayoutUtil::HasLayout(input_shape) &&
       LayoutUtil::HasLayout(output_shape) &&
       ShapeUtil::ReshapeIsBitcast(input_shape, output_shape)) {
-    return Index(source_multidim_index, linear(), input_shape);
+    return Index(source_multidim_index, linear(), input_shape, index_type_);
   }
   return Index(source_multidim_index, index_type_);
 }
 
 IrArray::Index IrArray::Index::SourceIndexOfSlice(
-    const Shape& shape, absl::Span<const int64> starts,
+    const Shape& operand_shape, absl::Span<const int64> starts,
     absl::Span<const int64> strides, llvm::IRBuilder<>* builder) const {
-  Index source_index(index_type_, multidim_.size());
+  std::vector<llvm::Value*> source_multi_index(multidim_.size());
   for (int i = 0; i < multidim_.size(); ++i) {
     int64 stride = strides[i];
     auto type = multidim_[i]->getType();
 
     if (stride != 1) {
-      source_index[i] = builder->CreateAdd(
+      source_multi_index[i] = builder->CreateAdd(
           builder->CreateMul(multidim_[i],
                              llvm::ConstantInt::get(type, stride)),
           llvm::ConstantInt::get(type, starts[i]));
     } else {
-      source_index[i] = builder->CreateAdd(
+      source_multi_index[i] = builder->CreateAdd(
           multidim_[i], llvm::ConstantInt::get(type, starts[i]));
     }
   }
-  return source_index;
+  return Index(source_multi_index, operand_shape, index_type_);
 }
 
 IrArray::Index IrArray::Index::SourceIndexOfTranspose(
@@ -214,7 +208,7 @@ IrArray::Index IrArray::Index::SourceIndexOfTranspose(
   if (linear() != nullptr && LayoutUtil::HasLayout(operand_shape) &&
       LayoutUtil::HasLayout(shape) &&
       ShapeUtil::TransposeIsBitcast(operand_shape, shape, dimension_mapping)) {
-    return Index(operand_multidim_index, linear(), operand_shape);
+    return Index(operand_multidim_index, linear(), operand_shape, index_type_);
   }
 
   return Index(operand_multidim_index);
@@ -250,7 +244,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBitcast(
   std::vector<llvm::Value*> multi_index(operand_shape.dimensions_size());
   Delinearize(&multi_index, linear_index, operand_shape, builder);
 
-  return Index(multi_index, linear_index, operand_shape);
+  return Index(multi_index, linear_index, operand_shape, index_type_);
 }
 
 IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
@@ -303,9 +297,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
     divisor *= shape.dimensions(LayoutUtil::Major(shape.layout(), i));
   }
   if (divisor > 1) {
-    linear = builder->CreateUDiv(
-        linear,
-        IrArray::Index(linear->getType()).GetConstantWithIndexType(divisor));
+    linear = builder->CreateUDiv(linear, GetConstantWithIndexType(divisor));
   }
   if (min_broadcasted_dimension > 0) {
     int64 mod = 1;
@@ -313,11 +305,9 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
          ++i) {
       mod *= shape.dimensions(LayoutUtil::Major(shape.layout(), i));
     }
-    linear = builder->CreateURem(
-        linear,
-        IrArray::Index(linear->getType()).GetConstantWithIndexType(mod));
+    linear = builder->CreateURem(linear, GetConstantWithIndexType(mod));
   }
-  return Index(source_index, linear, operand_shape);
+  return Index(source_index, linear, operand_shape, index_type_);
 }
 
 llvm::Value* IrArray::Index::Linearize(absl::Span<const int64> dimensions,
@@ -341,20 +331,21 @@ llvm::Value* IrArray::Index::Linearize(absl::Span<const int64> dimensions,
 
 llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
                                               llvm::IRBuilder<>* b,
-                                              absl::string_view name) const {
-  if (ShapeUtil::IsScalar(*shape_)) {
+                                              absl::string_view name,
+                                              bool use_linear_index) const {
+  if (ShapeUtil::IsScalar(shape_)) {
     // Special handling of scalars: a scalar pretends to have the same value for
     // every index, thus effectively implementing broadcasting of its value
     // over higher-rank arrays.
     return base_ptr_;
   }
-  CHECK_EQ(index.size(), shape_->rank());
+  CHECK_EQ(index.size(), shape_.rank());
 
-  if (index.LinearValidOnShape(*shape_)) {
+  if (use_linear_index && index.LinearValidOnShape(shape_)) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
     return b->CreateInBoundsGEP(
         b->CreateBitCast(base_ptr_,
-                         PrimitiveTypeToIrType(shape_->element_type(), module)
+                         PrimitiveTypeToIrType(shape_.element_type(), module)
                              ->getPointerTo()),
         {index.linear()}, llvm_ir::AsStringRef(name));
   }
@@ -364,7 +355,7 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
     // When dimension i is of size 1, LLVM optimization is able to replace
     // index[i] with 0. However, setting index[i] to 0 here still allows LLVM to
     // produce better code in some cases.
-    auto dim = shape_->dimensions(i);
+    auto dim = shape_.dimensions(i);
     actual_index.push_back(
         dim == 1 ? llvm::ConstantInt::get(index[i]->getType(), 0) : index[i]);
   }
@@ -377,8 +368,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
   CHECK_GT(index.size(), 0);
   std::vector<llvm::Value*> gep_indices(
       1, llvm::ConstantInt::get(index[0]->getType(), 0));
-  for (int64 i = 0; i < LayoutUtil::MinorToMajor(*shape_).size(); ++i) {
-    int64 dimension = LayoutUtil::Major(shape_->layout(), i);
+  for (int64 i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape_.layout(), i);
     gep_indices.push_back(actual_index[dimension]);
   }
   return b->CreateInBoundsGEP(base_ptr_, gep_indices,
@@ -399,16 +390,20 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
 
 llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
                                            llvm::IRBuilder<>* b,
-                                           absl::string_view name) const {
-  llvm::Value* element_address = EmitArrayElementAddress(index, b, name);
+                                           absl::string_view name,
+                                           bool use_linear_index) const {
+  llvm::Value* element_address =
+      EmitArrayElementAddress(index, b, name, use_linear_index);
   llvm::LoadInst* load = b->CreateLoad(element_address);
   AnnotateLoadStoreInstructionWithMetadata(load);
   return load;
 }
 
 void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value,
-                                    llvm::IRBuilder<>* b) const {
-  llvm::Value* element_address = EmitArrayElementAddress(index, b);
+                                    llvm::IRBuilder<>* b,
+                                    bool use_linear_index) const {
+  llvm::Value* element_address =
+      EmitArrayElementAddress(index, b, "", use_linear_index);
   llvm::StoreInst* store = b->CreateStore(value, element_address);
   AnnotateLoadStoreInstructionWithMetadata(store);
 }
@@ -423,18 +418,5 @@ IrArray IrArray::CastToShape(const Shape& new_shape,
   return new_irarray;
 }
 
-/* static */ IrArray::Index IrArray::BumpIndex(const Index& index,
-                                               int64 which_dimension,
-                                               int64 addend,
-                                               llvm::IRBuilder<>* b) {
-  Index new_index = index;
-  new_index[which_dimension] = b->CreateAdd(
-      index[which_dimension],
-      llvm::ConstantInt::get(index[which_dimension]->getType(), addend), "",
-      /*HasNUW=*/true,
-      /*HasNSW=*/true);
-  return new_index;
-}
-
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index b706ebd311cbb706e7e4698b93319e37e664d10a..d0a8b6356d6d76022e801520a1436481672bf678 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -55,17 +55,8 @@ class IrArray {
   // multidimensional index, which LLVM DCE can delete.
   class Index {
    public:
-    // Constructs an index of rank "size". Each dimension of the index is
-    // initialized to "value".
-    explicit Index(size_t size, llvm::Value* value)
-        : multidim_(size, value), index_type_(value->getType()) {
-      CHECK_NE(index_type_, nullptr);
-    }
-
-    // Constructs an index of rank "size". Each dimension of the index is
-    // initialized to nullptr.
-    explicit Index(llvm::Type* index_ty, size_t size = 0)
-        : multidim_(size, nullptr), index_type_(index_ty) {
+    // Constructs an index for a scalar shape.
+    explicit Index(llvm::Type* index_ty) : index_type_(index_ty) {
       CHECK(index_ty->isIntegerTy());
     }
 
@@ -77,7 +68,10 @@ class IrArray {
       if (size() == 0) {
         index_type_ = index_ty;
       } else {
-        index_type_ = (*this)[0]->getType();
+        for (const auto* dim : multidim) {
+          CHECK_NE(dim, nullptr);
+        }
+        index_type_ = multidim[0]->getType();
         if (index_ty != nullptr) {
           CHECK_EQ(index_type_, index_ty);
         }
@@ -96,25 +90,20 @@ class IrArray {
     // Precondition: "shape" has a layout.
     Index(llvm::Value* linear, const Shape& shape, llvm::IRBuilder<>* b);
 
-    // Constructs an index from the given multi-dimensional index and the shape
-    // that it indexes into.
+    // Constructs an index from a multi-dimensional index. 'shape' is the shape
+    // for which the multi-dimensional index is used. 'index_type' is the type
+    // of the index.
     //
     // Precondition: "shape" has a layout.
     Index(absl::Span<llvm::Value* const> multidim, const Shape& shape,
-          llvm::IRBuilder<>* b);
-
-    // Constructs an index from both a multi-dimensional index and a linear
-    // index. "shape" has the same meaning as that in the constructor that takes
-    // only a linear index.
-    Index(absl::Span<llvm::Value* const> multidim, llvm::Value* linear,
-          const Shape& shape);
+          llvm::Type* index_type);
 
     // Returns an index that adds `addend` to the given `dim` of the object.
     Index AddOffsetToDim(llvm::Value* addend, int64 dim,
                          llvm::IRBuilder<>* b) const {
-      IrArray::Index index = *this;
-      index[dim] = b->CreateAdd(index[dim], addend);
-      return index;
+      std::vector<llvm::Value*> multi_index = multidim();
+      multi_index[dim] = b->CreateAdd(multi_index[dim], addend);
+      return Index(multi_index, index_type_);
     }
 
     const std::vector<llvm::Value*>& multidim() const { return multidim_; }
@@ -123,43 +112,28 @@ class IrArray {
     size_t size() const { return multidim().size(); }
 
     llvm::Value* operator[](size_t i) const { return multidim()[i]; }
-    llvm::Value*& operator[](size_t i) { return mutable_multidim()[i]; }
-
-    void push_back(llvm::Value* value) { mutable_multidim().push_back(value); }
-    void InsertAt(int64 index, llvm::Value* value) {
-      CHECK_LE(index, size());
-      mutable_multidim().insert(mutable_multidim().begin() + index, value);
-    }
-    void InsertAt(int64 index, int64 count, llvm::Value* value) {
-      CHECK_LE(index, size());
-      mutable_multidim().insert(mutable_multidim().begin() + index, count,
-                                value);
-    }
 
-    using iterator = std::vector<llvm::Value*>::iterator;
     using const_iterator = std::vector<llvm::Value*>::const_iterator;
 
-    iterator begin() { return mutable_multidim().begin(); }
-    iterator end() { return mutable_multidim().end(); }
-
     const_iterator begin() const { return multidim().begin(); }
     const_iterator end() const { return multidim().end(); }
 
-    llvm::Value* back() const { return multidim().back(); }
-
     bool LinearValidOnShape(const Shape& a) const;
 
     // Given that "this" is the target index of a reshape from `operand_shape`
     // to `shape`, returns the source index.
-    Index SourceIndexOfReshape(const Shape& shape, const Shape& operand_shape,
+    Index SourceIndexOfReshape(const Shape& output_shape,
+                               const Shape& input_shape,
                                llvm::IRBuilder<>* builder) const;
 
     // Returns the index into the source operand from which a slice operation
     // selects a value to be placed into index "this". The slice is described
     // by starting indices `starts` and stride values `strides`.
     //
-    // Precondition: "this" is an index into a slice whose shape is `shape`.
-    Index SourceIndexOfSlice(const Shape& shape, absl::Span<const int64> starts,
+    // Precondition: "this" is an index into a slice whose operand shape is
+    // `operand_shape`.
+    Index SourceIndexOfSlice(const Shape& operand_shape,
+                             absl::Span<const int64> starts,
                              absl::Span<const int64> strides,
                              llvm::IRBuilder<>* builder) const;
 
@@ -194,14 +168,14 @@ class IrArray {
       return llvm::ConstantInt::get(index_type_, c);
     }
 
-    void ClearLinearIndex() { linear_ = nullptr; }
-
    private:
-    // Changing the multi-dimensional index invalidates the linear index.
-    std::vector<llvm::Value*>& mutable_multidim() {
-      linear_ = nullptr;
-      return multidim_;
-    }
+    // Constructs an index from both a multi-dimensional index and a linear
+    // index. 'shape' is the shape on which the index is used. 'index_type' is
+    // the type of the index.
+    //
+    // Precondition: "shape" has a layout.
+    Index(absl::Span<llvm::Value* const> multidim, llvm::Value* linear,
+          const Shape& shape, llvm::Type* index_type);
 
     void Delinearize(std::vector<llvm::Value*>* multidim, llvm::Value* linear,
                      const Shape& shape, llvm::IRBuilder<>* b) const;
@@ -242,9 +216,7 @@ class IrArray {
   llvm::Value* GetBasePointer() const { return base_ptr_; }
   llvm::Type* GetElementLlvmType() const { return element_type_; }
 
-  const Shape& GetShape() const {
-    return *shape_;
-  }
+  const Shape& GetShape() const { return shape_; }
 
   // Emit a sequence of instructions to compute the address of the element in
   // the given array at the given index. Returns the address of the element as
@@ -253,7 +225,8 @@ class IrArray {
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
   llvm::Value* EmitArrayElementAddress(const Index& index, llvm::IRBuilder<>* b,
-                                       absl::string_view name = "") const;
+                                       absl::string_view name = "",
+                                       bool use_linear_index = true) const;
 
   // Attach metadata this IrArray instance knows about to "instruction".
   void AnnotateLoadStoreInstructionWithMetadata(
@@ -266,15 +239,23 @@ class IrArray {
   //
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
+  // 'use_linear_index' can be used to specify whether the linear index (if
+  // available) or the multi-dimensional index should be used.
   llvm::Value* EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* b,
-                                    absl::string_view name = "") const;
+                                    absl::string_view name = "",
+                                    bool use_linear_index = true) const;
 
   // Emit IR to write the given value to the array element at the given index.
+  // 'use_linear_index' can be used to specify whether the linear index (if
+  // available) or the multi-dimensional index should be used.
   void EmitWriteArrayElement(const Index& index, llvm::Value* value,
-                             llvm::IRBuilder<>* b) const;
+                             llvm::IRBuilder<>* b,
+                             bool use_linear_index = true) const;
 
   // Returns a new IrArray whose shape is "new_shape" and base pointer is a
   // bitcast of the base pointer of "this" IrArray.
+  // 'use_linear_index' can be used to specify whether the linear index (if
+  // available) or the multi-dimensional index should be used.
   IrArray CastToShape(const Shape& new_shape, llvm::IRBuilder<>* b) const;
 
   void AddAliasScopeMetadata(llvm::MDNode* alias_scope) {
@@ -318,11 +299,6 @@ class IrArray {
 
   const std::map<int, llvm::MDNode*>& metadata() const { return metadata_; }
 
-  // Bumps the "which_dimension" value within the provided index by the provided
-  // addend.
-  static Index BumpIndex(const Index& index, int64 which_dimension,
-                         int64 addend, llvm::IRBuilder<>* b);
-
  private:
   // Add the specified LLVM IR metadata to loads/stores associated with this
   // IrArray.
@@ -337,7 +313,7 @@ class IrArray {
   llvm::Type* element_type_;
 
   // Shape of the XLA array.
-  absl::optional<Shape> shape_;
+  Shape shape_;
 
   // The list of key/value pairs used when attaching metadata to emitted
   // loads/stores for this array.  They keys are the metadata kinds and the
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
index cf5083e8c13b9485035923895cec1ad05049c644..02c719502ee7b0a732ae74acec364f89d51ae0c1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
@@ -269,6 +269,11 @@ class IrBuilderMixin {
     return mixin_builder()->CreateFCmpUNE(std::forward<Args>(args)...);
   }
 
+  template <class... Args>
+  llvm::Value* FCmpUNO(Args&&... args) {
+    return mixin_builder()->CreateFCmpUNO(std::forward<Args>(args)...);
+  }
+
   template <class... Args>
   llvm::Value* FDiv(Args&&... args) {
     return mixin_builder()->CreateFDiv(std::forward<Args>(args)...);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 5eeb29c478a371dae83251771f2dc4844672d3e9..e1dc7e747654a2539c3216d418226003e89de3b1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -70,7 +70,7 @@ Status KernelSupportLibrary::IfWithStatus(
 }
 
 void KernelSupportLibrary::EmitAndCallOutlinedKernel(
-    bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+    const HloModuleConfig& module_config, llvm::IRBuilder<>* b,
     absl::string_view kernel_name,
     KernelSupportLibrary::ArgumentVector arguments,
     const std::function<void(KernelSupportLibrary::ArgumentVector)>&
@@ -101,10 +101,9 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
     auto* function_type =
         llvm::FunctionType::get(b->getVoidTy(), arg_types, /*isVarArg=*/false);
 
-    function = llvm_ir::CreateFunction(
-        function_type, llvm::GlobalValue::InternalLinkage,
-        /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, kernel_name, module);
+    function = llvm_ir::CreateCpuFunction(function_type,
+                                          llvm::GlobalValue::InternalLinkage,
+                                          module_config, kernel_name, module);
 
     llvm::IRBuilder<>::InsertPointGuard guard(*b);
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 612b839cfa15711061e1ae53358a72d5220e1801..b66ce6b835e6bdecd606cc9919575c11e32f6a0a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -263,33 +263,33 @@ class KernelSupportLibrary {
   // in a nullptr llvm::Value* in its position to `kernel_body_generator`.
   // Currently we only support at most one nullptr value in `arguments`.
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      const HloModuleConfig& module_config, llvm::IRBuilder<>* b,
       absl::string_view kernel_name, ArgumentVector arguments,
       const std::function<void(ArgumentVector)>& kernel_body_generator);
 
   // Thin wrappers around the more general EmitAndCallOutlinedKernel above.
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      const HloModuleConfig& module_config, llvm::IRBuilder<>* b,
       absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1,
       llvm::Value* arg2,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
           kernel_body_generator) {
-    EmitAndCallOutlinedKernel(
-        enable_fast_math, optimize_for_size, b, kernel_name, {arg0, arg1, arg2},
-        [&](ArgumentVector args) {
-          kernel_body_generator(args[0], args[1], args[2]);
-        });
+    EmitAndCallOutlinedKernel(module_config, b, kernel_name, {arg0, arg1, arg2},
+                              [&](ArgumentVector args) {
+                                kernel_body_generator(args[0], args[1],
+                                                      args[2]);
+                              });
   }
 
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      const HloModuleConfig& module_config, llvm::IRBuilder<>* b,
       absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1,
       llvm::Value* arg2, llvm::Value* arg3,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
                                llvm::Value*)>& kernel_body_generator) {
     EmitAndCallOutlinedKernel(
-        enable_fast_math, optimize_for_size, b, kernel_name,
-        {arg0, arg1, arg2, arg3}, [&](ArgumentVector args) {
+        module_config, b, kernel_name, {arg0, arg1, arg2, arg3},
+        [&](ArgumentVector args) {
           kernel_body_generator(args[0], args[1], args[2], args[3]);
         });
   }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index cd8dd72cd775d5e0b52f96a2326367da0775e7eb..3c4aee70bca657e4a6232834ad1bfcbfa460571b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -185,15 +185,15 @@ IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
 
 IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
     const IrArray::Index& tile_index) {
-  IrArray::Index elem_index = tile_index;
+  std::vector<llvm::Value*> elem_multi_index = tile_index.multidim();
   for (int i = DimY; i < DimTot; ++i) {
-    elem_index[i] =
+    elem_multi_index[i] =
         b_->CreateMul(tile_index[i],
                       llvm::ConstantInt::get(tile_index[i]->getType(),
                                              GetTileSizeForDimension(i)),
                       "tile_origin." + std::to_string(i));
   }
-  return elem_index;
+  return IrArray::Index(elem_multi_index, tile_index.GetType());
 }
 
 llvm::GlobalVariable* KernelMappingScheme::GetSharedMemoryBufferForElementType(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index fe320bbe727111fbc986cc1fbc217feed74d30f1..e08248b9cc46134675311a82f69f02572a495fe4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -74,8 +73,8 @@ void ForLoop::Emit(llvm::IRBuilder<>* b) {
 
     // Split the preheader to create an exit basic block. The exit basic block
     // will contain all instructions at or after insert_point.
-    exit_bb_ = preheader_bb_->splitBasicBlock(
-        insert_point, AsStringRef(GetQualifiedName("loop_exit")));
+    exit_bb_ = preheader_bb_->splitBasicBlock(insert_point,
+                                              GetQualifiedName("loop_exit"));
 
     // splitBasicBlock adds an unconditional branch between the split basic
     // blocks. Remove it. An unconditional branch will be added below from the
@@ -95,9 +94,8 @@ void ForLoop::Emit(llvm::IRBuilder<>* b) {
   llvm::Function* func = preheader_bb_->getParent();
   b->SetInsertPoint(&func->getEntryBlock(),
                     func->getEntryBlock().getFirstInsertionPt());
-  llvm::Value* indvar_address =
-      b->CreateAlloca(start_index_->getType(), nullptr,
-                      AsStringRef(GetQualifiedName("invar_address")));
+  llvm::Value* indvar_address = b->CreateAlloca(
+      start_index_->getType(), nullptr, GetQualifiedName("invar_address"));
 
   // Preheader basic block.
   // Initialize induction variable starting index. Create branch to the header.
@@ -111,8 +109,7 @@ void ForLoop::Emit(llvm::IRBuilder<>* b) {
   // Emit the loop conditional branch. Load and compare indvar with ending
   // index and jump to loop exit if equal. Jump to body otherwise.
   b->SetInsertPoint(header_bb_);
-  indvar_ =
-      b->CreateLoad(indvar_address, AsStringRef(GetQualifiedName("indvar")));
+  indvar_ = b->CreateLoad(indvar_address, GetQualifiedName("indvar"));
   llvm::Value* exit_cond = b->CreateICmpUGE(indvar_, end_index_);
   b->CreateCondBr(/*Cond=*/exit_cond,
                   /*True=*/exit_bb_, /*False=*/body_bb_);
@@ -237,25 +234,26 @@ IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
                                              absl::string_view suffix) {
   std::vector<int64> dimensions(shape.rank());
   std::iota(dimensions.begin(), dimensions.end(), 0);
-  return AddLoopsForShapeOnDimensions(shape, dimensions, suffix);
+  return IrArray::Index(AddLoopsForShapeOnDimensions(shape, dimensions, suffix),
+                        shape, index_type_);
 }
 
-IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
+std::vector<llvm::Value*> ForLoopNest::AddLoopsForShapeOnDimensions(
     const Shape& shape, absl::Span<const int64> dimensions,
     absl::string_view suffix) {
-  llvm_ir::IrArray::Index index(index_type_, shape.dimensions_size());
+  std::vector<llvm::Value*> multi_index(shape.dimensions_size());
   for (int64 dimension : dimensions) {
     std::unique_ptr<llvm_ir::ForLoop> loop = AddLoop(
         /*start_index=*/0,
         /*end_index=*/shape.dimensions(dimension),
         /*suffix=*/
         llvm_ir::IrName(suffix, absl::StrCat(dimension)));
-    index[dimension] = loop->GetIndVarValue();
+    multi_index[dimension] = loop->GetIndVarValue();
   }
-  return index;
+  return multi_index;
 }
 
-IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
+std::vector<llvm::Value*> ForLoopNest::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
     absl::string_view name_suffix) {
   // Prepares the dimension list we will use to emit the loop nest. Outermost
@@ -271,18 +269,18 @@ IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
 
   // Create loop nest with one for-loop for each dimension of the
   // output.
-  llvm_ir::IrArray::Index index =
+  std::vector<llvm::Value*> multi_index =
       AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
   // Verify every dimension except the 'dimension_to_skip' dimension was set in
   // the index.
-  for (size_t dimension = 0; dimension < index.size(); ++dimension) {
+  for (size_t dimension = 0; dimension < multi_index.size(); ++dimension) {
     if (dimension == dimension_to_skip) {
-      DCHECK_EQ(nullptr, index[dimension]);
+      DCHECK_EQ(nullptr, multi_index[dimension]);
     } else {
-      DCHECK_NE(nullptr, index[dimension]);
+      DCHECK_NE(nullptr, multi_index[dimension]);
     }
   }
-  return index;
+  return multi_index;
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index ac3bba3c9fd6a9eb4e7822474963fcc5a394baf7..1dbc9745c080e845140e0976b788d8465deaa2b4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -43,6 +43,9 @@ enum class UnrollMode {
 // A class for constructing a for-loop in LLVM IR.
 class ForLoop {
  public:
+  ForLoop(const ForLoop&) = delete;
+  ForLoop& operator=(const ForLoop&) = delete;
+
   // Emit a for-loop at the current insert point of the given IRBuilder.
   //
   // start_index and end_index are the loop bounds (end_index is not inclusive).
@@ -169,18 +172,11 @@ class ForLoop {
   llvm::Value* indvar_;
   UnrollMode unroll_mode_;
   bool prevent_vectorization_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ForLoop);
 };
 
 // A simple class for constructing nested for-loops.
 class ForLoopNest {
  public:
-  explicit ForLoopNest(llvm::IRBuilder<>* b, llvm::Type* index_ty = nullptr)
-      : ForLoopNest(/*name=*/"", b) {
-    SetIndexType(index_ty);
-  }
-
   ForLoopNest(absl::string_view name, llvm::IRBuilder<>* b,
               llvm::Type* index_ty = nullptr)
       : name_(name),
@@ -190,6 +186,8 @@ class ForLoopNest {
         b_(b) {
     SetIndexType(index_ty);
   }
+  ForLoopNest(const ForLoopNest&) = delete;
+  ForLoopNest& operator=(const ForLoopNest&) = delete;
 
   // Adds a loop to the nest. If no loop has been added yet then emit a loop at
   // the current insert point of the given builder. If one or more loops have
@@ -241,7 +239,7 @@ class ForLoopNest {
   // The return value is an index with the induction variables. The
   // size equals the rank of shape and there is a null for each
   // dimension that is not in "dimensions".
-  IrArray::Index AddLoopsForShapeOnDimensions(
+  std::vector<llvm::Value*> AddLoopsForShapeOnDimensions(
       const Shape& shape, absl::Span<const int64> dimensions,
       absl::string_view suffix);
 
@@ -252,9 +250,9 @@ class ForLoopNest {
   // dimensions of the index are filled except for 'dimension_to_skip'.
   // name_suffix is the string to append to the names of LLVM constructs (eg,
   // basic blocks) constructed by this method.
-  IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array,
-                                          int64 dimension_to_skip,
-                                          absl::string_view name_suffix);
+  std::vector<llvm::Value*> EmitOperandArrayLoopNest(
+      const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
+      absl::string_view name_suffix);
 
   // Convenience methods which return particular basic blocks of the outermost
   // or innermost loops. These methods return nullptr if no loops have been
@@ -289,8 +287,6 @@ class ForLoopNest {
   llvm::IRBuilder<>* b_;
 
   llvm::Type* index_type_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ForLoopNest);
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 807296329c07b8e4ac630486a1e1f59e4fdfa009..66219c156674318f074d7342f9f2a8a218b4f7e8 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -31,6 +31,8 @@ limitations under the License.
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -58,14 +60,6 @@ llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* b) {
 
 }  // namespace
 
-string AsString(const std::string& str) {
-  return string(str.data(), str.length());
-}
-
-llvm::StringRef AsStringRef(absl::string_view str) {
-  return llvm::StringRef(str.data(), str.size());
-}
-
 std::unique_ptr<llvm::Module> DropConstantInitializers(
     const llvm::Module& module) {
   std::unique_ptr<llvm::Module> cloned_module = CloneModule(module);
@@ -81,7 +75,7 @@ string DumpModuleToString(const llvm::Module& module) {
   llvm::raw_string_ostream ostream(buffer_string);
   module.print(ostream, nullptr);
   ostream.flush();
-  return AsString(buffer_string);
+  return buffer_string;
 }
 
 llvm::CallInst* EmitCallToIntrinsic(
@@ -248,7 +242,7 @@ StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
     return InternalError("Encoded shape size exceeded int32 size limit.");
   }
   *shape_size = static_cast<int32>(encoded_shape.size());
-  return b->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
+  return b->CreateGlobalStringPtr(encoded_shape);
 }
 
 StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
@@ -293,7 +287,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type,
                                                      absl::string_view name,
                                                      llvm::IRBuilder<>* b,
                                                      int alignment) {
-  llvm::IRBuilder<>::InsertPoint insert_point = b->saveIP();
+  llvm::IRBuilder<>::InsertPointGuard guard(*b);
   llvm::Function* function = b->GetInsertBlock()->getParent();
   b->SetInsertPoint(&function->getEntryBlock(),
                     function->getEntryBlock().getFirstInsertionPt());
@@ -302,7 +296,6 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type,
   if (alignment != 0) {
     alloca->setAlignment(alignment);
   }
-  b->restoreIP(insert_point);
   return alloca;
 }
 
@@ -334,7 +327,7 @@ LlvmIfData EmitIfThenElse(llvm::Value* condition, absl::string_view name,
     b->CreateBr(if_data.after_block);
   } else {
     if_data.after_block = if_data.if_block->splitBasicBlock(
-        b->GetInsertPoint(), AsStringRef(absl::StrCat(name, "-after")));
+        b->GetInsertPoint(), absl::StrCat(name, "-after"));
   }
 
   // Our basic block should now end with an unconditional branch.  Remove it;
@@ -507,24 +500,25 @@ int64 ByteSizeOf(const Shape& shape, const llvm::DataLayout& data_layout) {
   return ShapeUtil::ByteSizeOf(shape, pointer_size);
 }
 
-llvm::FastMathFlags GetFastMathFlags(bool fast_math_enabled) {
+llvm::FastMathFlags GetCpuFastMathFlags(const HloModuleConfig& module_config) {
   llvm::FastMathFlags flags;
-  if (fast_math_enabled) {
-    // Fast implies AllowReassoc, NoInfs, NoNaNs, NoSignedZeros,
-    // AllowReciprocal, AllowContract, and ApproxFunc.
-    flags.setFast();
+  if (!module_config.debug_options().xla_cpu_enable_fast_math()) {
+    return flags;
   }
-  return flags;
-}
 
-void SetTargetOptions(bool fast_math_enabled,
-                      llvm::TargetOptions* target_options) {
-  // In LLVM backend flags, UnsafeFPMath does not explicitly imply
-  // NoInfs, etc.
-  target_options->UnsafeFPMath = fast_math_enabled;
-  target_options->NoInfsFPMath = fast_math_enabled;
-  target_options->NoNaNsFPMath = fast_math_enabled;
-  target_options->NoSignedZerosFPMath = fast_math_enabled;
+  // Fast implies AllowReassoc, NoInfs, NoNaNs, NoSignedZeros, AllowReciprocal,
+  // AllowContract, and ApproxFunc.
+  flags.setFast();
+
+  if (module_config.debug_options().xla_cpu_fast_math_honor_nans()) {
+    flags.setNoNaNs(false);
+  }
+
+  if (module_config.debug_options().xla_cpu_fast_math_honor_infs()) {
+    flags.setNoInfs(false);
+  }
+
+  return flags;
 }
 
 std::map<int, llvm::MDNode*> MergeMetadata(
@@ -575,14 +569,6 @@ std::map<int, llvm::MDNode*> MergeMetadata(
   return result;
 }
 
-static string GetProcessUniqueIrFileName(absl::string_view prefix) {
-  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-  static NameUniquer* uniquer = new NameUniquer(/*separator=*/"-");
-
-  tensorflow::mutex_lock lock(mu);
-  return uniquer->GetUniqueName(prefix);
-}
-
 static Status CreateAndWriteStringToFile(const string& directory_name,
                                          const string& file_name,
                                          const string& text) {
@@ -596,35 +582,34 @@ static Status CreateAndWriteStringToFile(const string& directory_name,
   return Status::OK();
 }
 
-Status DumpIRToDirectory(const string& directory_name,
-                         const string& hlo_module_name,
-                         const llvm::Module& llvm_module, bool optimized) {
+void DumpIrIfEnabled(const HloModule& hlo_module,
+                     const llvm::Module& llvm_module, bool optimized) {
+  const auto& debug_opts = hlo_module.config().debug_options();
+  if (!DumpingEnabledForHloModule(hlo_module)) {
+    return;
+  }
   // We can end up compiling different modules with the same name when using
   // XlaJitCompiledCpuFunction::Compile.  Avoid overwriting IR files previously
   // dumped from the same process in such cases.
-  string unique_and_safe_file_name = GetProcessUniqueIrFileName(
-      absl::StrCat("ir-", SanitizeFileName(hlo_module_name), "-",
-                   optimized ? "with" : "no", "-opt"));
-
-  string ir_file_name = tensorflow::io::JoinPath(
-      directory_name, absl::StrCat(unique_and_safe_file_name, ".ll"));
+  string suffix = absl::StrCat("ir-", optimized ? "with" : "no", "-opt");
+  DumpToFileInDirOrStdout(hlo_module, absl::StrCat(suffix, ".ll"),
+                          DumpModuleToString(llvm_module));
 
   // For some models the embedded constants can be huge, so also dump the module
-  // with the constants stripped to get IR that is easier to manipulate.
-  string ir_no_constant_initializers_file_name = tensorflow::io::JoinPath(
-      directory_name, absl::StrCat(unique_and_safe_file_name, "-noconst.ll"));
-
-  TF_RETURN_IF_ERROR(CreateAndWriteStringToFile(
-      directory_name, ir_file_name, DumpModuleToString(llvm_module)));
-  return CreateAndWriteStringToFile(
-      directory_name, ir_no_constant_initializers_file_name,
-      DumpModuleToString(*DropConstantInitializers(llvm_module)));
+  // with the constants stripped to get IR that is easier to manipulate.  Skip
+  // this if we're dumping to stdout; there's no point in duplicating everything
+  // when writing to the terminal.
+  if (!DumpingToStdout(debug_opts)) {
+    DumpToFileInDir(hlo_module, absl::StrCat(suffix, "-noconst.ll"),
+                    DumpModuleToString(*DropConstantInitializers(llvm_module)));
+  }
 }
 
-llvm::Function* CreateFunction(llvm::FunctionType* function_type,
-                               llvm::GlobalValue::LinkageTypes linkage,
-                               bool enable_fast_math, bool optimize_for_size,
-                               absl::string_view name, llvm::Module* module) {
+llvm::Function* CreateCpuFunction(llvm::FunctionType* function_type,
+                                  llvm::GlobalValue::LinkageTypes linkage,
+                                  const HloModuleConfig& module_config,
+                                  absl::string_view name,
+                                  llvm::Module* module) {
   llvm::Function* function =
       llvm::Function::Create(function_type, linkage, AsStringRef(name), module);
   function->setCallingConv(llvm::CallingConv::C);
@@ -634,17 +619,23 @@ llvm::Function* CreateFunction(llvm::FunctionType* function_type,
   // created by the JIT compiled code.
   function->setHasUWTable();
 
-  if (enable_fast_math) {
+  if (module_config.debug_options().xla_cpu_enable_fast_math()) {
     function->addFnAttr("unsafe-fp-math", "true");
-    function->addFnAttr("no-infs-fp-math", "true");
-    function->addFnAttr("no-nans-fp-math", "true");
     function->addFnAttr("no-signed-zeros-fp-math", "true");
+
+    if (!module_config.debug_options().xla_cpu_fast_math_honor_nans()) {
+      function->addFnAttr("no-nans-fp-math", "true");
+    }
+
+    if (!module_config.debug_options().xla_cpu_fast_math_honor_infs()) {
+      function->addFnAttr("no-infs-fp-math", "true");
+    }
   }
 
   // Add the optize attribute to the function if optimizing for size. This
   // controls internal behavior of some optimization passes (e.g. loop
   // unrolling).
-  if (optimize_for_size) {
+  if (cpu::options::OptimizeForSizeRequested(module_config)) {
     function->addFnAttr(llvm::Attribute::OptimizeForSize);
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index c604c7c870adf734a29017e6accbd159317a9548..7b7d86364e2786f5989b9fa8c13de257fc1a80e5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -45,14 +45,13 @@ class TargetOptions;
 namespace xla {
 namespace llvm_ir {
 
-// Convert a std::string (used by LLVM's interfaces) to string.
-string AsString(const std::string& str);
-
 // Convert a absl::string_view to a llvm::StringRef. Note: both
 // absl::string_view and llvm::StringRef are non-owning pointers into a
 // string in memory. This method is used to feed strings to LLVM
 // & Clang APIs that expect llvm::StringRef.
-llvm::StringRef AsStringRef(absl::string_view str);
+inline llvm::StringRef AsStringRef(absl::string_view str) {
+  return llvm::StringRef(str.data(), str.size());
+}
 
 template <typename T>
 llvm::ArrayRef<T> AsArrayRef(const std::vector<T>& vec) {
@@ -71,7 +70,7 @@ string DumpToString(const T& entity) {
   llvm::raw_string_ostream ostream(buffer_string);
   entity.print(ostream);
   ostream.flush();
-  return AsString(buffer_string);
+  return buffer_string;
 }
 
 // Dump the given LLVM module to a string. This requires a function distinct
@@ -264,12 +263,7 @@ int64 ByteSizeOf(const Shape& shape, const llvm::DataLayout& data_layout);
 
 // Gets an llvm::FastMathFlags that reflects the settings in the given
 // module config.
-llvm::FastMathFlags GetFastMathFlags(bool fast_math_enabled);
-
-// Sets values in the given TargetOptions struct according to the given
-// compilation options.
-void SetTargetOptions(bool fast_math_enabled,
-                      llvm::TargetOptions* target_options);
+llvm::FastMathFlags GetCpuFastMathFlags(const HloModuleConfig& module_config);
 
 // Computes a conservative union of the metadata in "a" and "b".  For
 // aliasing-related metadata, this means the result can be applied to
@@ -279,19 +273,19 @@ std::map<int, llvm::MDNode*> MergeMetadata(
     llvm::LLVMContext* context, const std::map<int, llvm::MDNode*>& a,
     const std::map<int, llvm::MDNode*>& b);
 
-// Dumps out `llvm_module` to a file in the directory named `directory_name`,
-// creating the directory if necessary.  A sanitized version of
-// `hlo_module_name` is incorporated into the file name.  If `optimized` is true
-// then a suffix of "-with-opt.ll" is used, else a suffix of "-no-opt.ll" is
-// used.
-Status DumpIRToDirectory(const string& directory_name,
-                         const string& hlo_module_name,
-                         const llvm::Module& llvm_module, bool optimized);
-
-llvm::Function* CreateFunction(llvm::FunctionType* function_type,
-                               llvm::GlobalValue::LinkageTypes linkage,
-                               bool enable_fast_math, bool optimize_for_size,
-                               absl::string_view name, llvm::Module* module);
+// Dumps out `llvm_module` to the path specified in DebugOptions, if dumping is
+// enabled for the given HLO module.
+//
+// A sanitized version of `hlo_module_name` is incorporated into the file name.
+// If `optimized` is true then a suffix of "-with-opt.ll" is used, else a suffix
+// of "-no-opt.ll" is used.
+void DumpIrIfEnabled(const HloModule& hlo_module,
+                     const llvm::Module& llvm_module, bool optimized);
+
+llvm::Function* CreateCpuFunction(llvm::FunctionType* function_type,
+                                  llvm::GlobalValue::LinkageTypes linkage,
+                                  const HloModuleConfig& module_config,
+                                  absl::string_view name, llvm::Module* module);
 
 // Extracts the xla_backend_extra_options from `config` and passes those that
 // don't start with xla_ to LLVM.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 0dc120e0b0df47f261435f490a8459b49d989b53..638ff2c6e0b38ec15b8ee4a06098d0442dc1d203 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -99,15 +98,16 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
   // class so emit loops in order from most-major dimension down to most-minor
   // dimension (of the target shape).
   ForLoopNest loop_nest(loop_name, b_);
-  IrArray::Index array_index(index_type, shape_.dimensions_size());
+  std::vector<llvm::Value*> array_multi_index(shape_.dimensions_size());
   for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
     int64 dimension = LayoutUtil::Major(shape_.layout(), i);
     std::unique_ptr<ForLoop> loop = loop_nest.AddLoop(
         /*start_index=*/0,
         /*end_index=*/shape_.dimensions(dimension),
         /*suffix=*/absl::StrFormat("dim.%d", dimension));
-    array_index[dimension] = loop->GetIndVarValue();
+    array_multi_index[dimension] = loop->GetIndVarValue();
   }
+  IrArray::Index array_index(array_multi_index, shape_, index_type);
 
   // Set IR builder insertion point to the loop body basic block of the
   // innermost loop.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 89b6a36f96beedbcb7322e6164ac59221650d3d8..3ac8eb78cb03b216fc487c18c0a8d91caea1bc2a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -45,13 +46,14 @@ namespace llvm_ir {
 namespace {
 
 // Adds the inner comparison loop body where we compare elements.
-void EmitCompareLoopBody(
-    int64 iteration_bound, PrimitiveType key_type, int64 num_values,
-    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+Status EmitCompareLoopBody(
+    int64 iteration_bound, int64 num_values, llvm::Value* element_pair_index,
     int64 xor_mask, llvm::Type* index_type,
-    std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
+    std::function<llvm::Value*(int64 operand, llvm::Value* index)>
+        element_address,
     std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
         write_element,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
     llvm::IRBuilder<>* b, bool needs_bounds_checks = true) {
   auto index_typed_constant = [&](int64 value) {
     return llvm::ConstantInt::get(index_type, value);
@@ -108,74 +110,44 @@ void EmitCompareLoopBody(
 
   // if (is_smaller_index && index_is_inbounds)
   KernelSupportLibrary ksl(b);
-  ksl.If("smaller_comparison_index", do_comparison, [&]() {
-    auto key1 = read_element(0, current_keys_index);
-    auto key2 = read_element(0, compare_keys_index);
-    auto compare_key1 = key1;
-    auto compare_key2 = key2;
-    bool is_signed_comparison = true;
-    if (primitive_util::IsFloatingPointType(key_type)) {
-      // We would like a total order of floating point numbers so that the
-      // sort has a predictable behavior in the presence of NaNs. Rather
-      // than using floating point comparison, we use the following trick:
-      // If f is a float, and
-      // x = bit_cast<int32>(f);
-      // y = x < 0 ? 0x7FFFFFFF - x : x;
-      // then y is ordered as an int32 such that finite values have the
-      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-      // the beginning and end of the ordering.
-      auto k = b->getInt(llvm::APInt::getSignedMaxValue(
-          key1->getType()->getPrimitiveSizeInBits()));
-      auto comparison_type = k->getType();
-      auto zero = llvm::ConstantInt::get(comparison_type, 0);
-      auto maybe_flip = [&](llvm::Value* v) {
-        return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                               b->CreateSub(k, v), v);
-      };
-      compare_key1 = b->CreateBitCast(key1, comparison_type);
-      compare_key2 = b->CreateBitCast(key2, comparison_type);
-      compare_key1 = maybe_flip(compare_key1);
-      compare_key2 = maybe_flip(compare_key2);
-    } else if (!primitive_util::IsSignedIntegralType(key_type)) {
-      is_signed_comparison = false;
-    }
-    // If key2 < key1
-    auto is_smaller_than =
-        b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                           : llvm::ICmpInst::ICMP_ULT,
-                      compare_key2, compare_key1);
-    if (iota_values_parameter_index >= 0) {
-      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
-      auto key_index1 =
-          read_element(iota_values_parameter_index, current_keys_index);
-      auto key_index2 =
-          read_element(iota_values_parameter_index, compare_keys_index);
-      auto index_is_smaller_than =
-          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
-      is_smaller_than = b->CreateOr(
-          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+  return ksl.IfWithStatus("smaller_comparison_index", do_comparison, [&]() {
+    std::vector<llvm::Value*> values_to_compare;
+    for (int i = 0; i < num_values; ++i) {
+      values_to_compare.push_back(element_address(i, compare_keys_index));
+      values_to_compare.push_back(element_address(i, current_keys_index));
     }
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    llvm::Value* compare_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
+        llvm_ir::PrimitiveTypeToIrType(PRED, module), "compare_return_buffer",
+        b);
+    TF_RETURN_IF_ERROR(
+        emit_compare_callback(values_to_compare, compare_return_buffer));
+    llvm::Value* result = b->CreateLoad(compare_return_buffer);
+
+    // Check if the 'compare' function returns true.
+    llvm::Value* is_smaller_than =
+        b->CreateICmpNE(result, llvm::ConstantInt::get(result->getType(), 0),
+                        "boolean_predicate");
     ksl.If("is_smaller_than", is_smaller_than, [&]() {
-      // Swap key1 with key2.
-      write_element(0, current_keys_index, key2);
-      write_element(0, compare_keys_index, key1);
-      for (int64 i = 1; i <= num_values; ++i) {
-        // Also swap the values.
-        auto value1 = read_element(i, current_keys_index);
-        auto value2 = read_element(i, compare_keys_index);
-        write_element(i, current_keys_index, value2);
-        write_element(i, compare_keys_index, value1);
+      for (int64 i = 0; i < num_values; ++i) {
+        // Swap the values.
+        auto value1 = b->CreateLoad(values_to_compare[i * 2]);
+        auto value2 = b->CreateLoad(values_to_compare[i * 2 + 1]);
+        write_element(i, current_keys_index, value1);
+        write_element(i, compare_keys_index, value2);
       }
     });
+    return Status::OK();
   });
 }
 
-void EmitTiledCompareLoop(
+Status EmitTiledCompareLoop(
     const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
-    int64 dimension_to_sort_bound, PrimitiveType keys_type,
-    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
-    const std::vector<llvm::Value*>& param_shmem_buffers,
-    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
+    int64 dimension_to_sort_bound, absl::Span<const int64> xor_masks,
+    const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
+    llvm::IRBuilder<>* b) {
   KernelSupportLibrary ksl(b);
   llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
@@ -200,7 +172,7 @@ void EmitTiledCompareLoop(
             [&]() {
               auto cache_index = b->CreateShl(thread_id, value_one);
               read_or_write(cache_index, current_keys_index);
-              // Increment to go the next index position.
+              // Increment to go to the next index position.
               current_keys_index = b->CreateAdd(current_keys_index, value_one);
               // Here we check whether the next index position is within bounds.
               ksl.If("inner_smaller_keys_index",
@@ -215,10 +187,12 @@ void EmitTiledCompareLoop(
       };
 
   // Copy operand tiles from the operand buffers to shared memory.
-  IrArray::Index keys_index = tiled_keys_index;
+  std::vector<llvm::Value*> keys_multi_index = tiled_keys_index.multidim();
   for (int64 i = 0; i < params.size(); ++i) {
     copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
-      keys_index[dimension_to_sort] = index;
+      keys_multi_index[dimension_to_sort] = index;
+      IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
+                                tiled_keys_index.GetType());
       auto value = params[i].EmitReadArrayElement(keys_index, b);
       b->CreateStore(value,
                      b->CreateGEP(param_shmem_buffers[i],
@@ -230,10 +204,18 @@ void EmitTiledCompareLoop(
   llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
 
   // Now emit the bodies of the comparison loops.
-  auto read_element = [&](int64 operand, llvm::Value* index) {
-    return b->CreateLoad(
+  auto element_address = [&](int64 operand, llvm::Value* index) {
+    auto shared_memory_address =
         b->CreateGEP(param_shmem_buffers[operand],
-                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+                     {tiled_keys_index.GetConstantWithIndexType(0), index});
+    auto ptr_type = shared_memory_address->getType();
+    // We need a generic pointer with address space 0 instead of a pointer to
+    // shared memory (address space 3) so that we can pass it to the comparison
+    // computation.
+    return b->CreateAddrSpaceCast(
+        shared_memory_address,
+        llvm::PointerType::get(ptr_type->getPointerElementType(),
+                               /*AddressSpace=*/0));
   };
   auto write_element = [&](int64 operand, llvm::Value* index,
                            llvm::Value* value) {
@@ -252,7 +234,7 @@ void EmitTiledCompareLoop(
     if (dimension_to_sort_bound % tile_size) {
       // Otherwise we need a bounds check for the last tile. The last tile has
       // size 'dimension_to_sort_bound' % 'tile_size'.
-      ksl.If(
+      TF_RETURN_IF_ERROR(ksl.IfWithStatus(
           "is_last_tile",
           b->CreateICmpUGE(
               b->CreateMul(tiled_keys_index[dimension_to_sort],
@@ -260,24 +242,24 @@ void EmitTiledCompareLoop(
               tiled_keys_index.GetConstantWithIndexType(
                   RoundDownToNearest(dimension_to_sort_bound, tile_size))),
           [&]() {
-            EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
-                                params.size() - 1, iota_values_parameter_index,
-                                element_pair_index, xor_mask,
-                                tiled_keys_index.GetType(), read_element,
-                                write_element, b);
+            return EmitCompareLoopBody(
+                dimension_to_sort_bound % tile_size, params.size(),
+                element_pair_index, xor_mask, tiled_keys_index.GetType(),
+                element_address, write_element, emit_compare_callback, b);
           },
           [&]() {
-            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                                iota_values_parameter_index, element_pair_index,
-                                xor_mask, tiled_keys_index.GetType(),
-                                read_element, write_element, b,
-                                /*needs_bounds_checks=*/false);
-          });
+            return EmitCompareLoopBody(
+                tile_size, params.size(), element_pair_index, xor_mask,
+                tiled_keys_index.GetType(), element_address, write_element,
+                emit_compare_callback, b,
+                /*needs_bounds_checks=*/false);
+          }));
     } else {
-      EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                          iota_values_parameter_index, element_pair_index,
-                          xor_mask, tiled_keys_index.GetType(), read_element,
-                          write_element, b, /*needs_bounds_checks=*/false);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          tile_size, params.size(), element_pair_index, xor_mask,
+          tiled_keys_index.GetType(), element_address, write_element,
+          emit_compare_callback, b,
+          /*needs_bounds_checks=*/false));
     }
     // Wait until all comparisons have happened.
     llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
@@ -286,7 +268,9 @@ void EmitTiledCompareLoop(
   // Copy the operand tiles back from shared memory to the operand buffers.
   for (int64 i = 0; i < params.size(); ++i) {
     copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
-      keys_index[dimension_to_sort] = index;
+      keys_multi_index[dimension_to_sort] = index;
+      IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
+                                tiled_keys_index.GetType());
       auto value = b->CreateLoad(b->CreateGEP(
           param_shmem_buffers[i],
           {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
@@ -301,17 +285,16 @@ void EmitTiledCompareLoop(
   // same location in shared memory because we have exactly tile_size / 2 many
   // threads, and the linear index calculated by ParallelLoopEmitter uses
   // linear_index = blockIdx.x * blockDim.x + threadIdx.x;
+  return Status::OK();
 }
 }  // namespace
 
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim,
-                       const int64 tile_size) {
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, const int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback) {
   // Iterate through the keys shape in physical order, but skip the dimension to
   // sort and make it the innermost loop which is the loop where the comparisons
   // happen. In the dimension to sort, if we use tiling, we iterate through it
@@ -321,7 +304,7 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
   // within those 64 elements and are therefore independent of the other
   // comparisons).
 
-  const Shape& keys_shape = keys_array.GetShape();
+  const Shape& keys_shape = values_arrays[0].GetShape();
   int64 rank = keys_shape.rank();
   int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   std::vector<int64> dimensions_in_iteration_order(rank);
@@ -338,18 +321,16 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
 
   Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
                                                dimensions_in_iteration_order);
-  std::vector<IrArray> params(1, keys_array);
-  params.insert(params.end(), values_arrays.begin(), values_arrays.end());
 
   // Allocate shared memory for the tiled compare loop.
-  std::vector<llvm::Value*> param_shmem_buffers(params.size(), nullptr);
+  std::vector<llvm::Value*> param_shmem_buffers(values_arrays.size(), nullptr);
   if (xor_masks.size() > 1) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
-    for (int64 i = 0; i < params.size(); ++i) {
-      llvm::Type* tile_type =
-          llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                   params[i].GetShape().element_type(), module),
-                               tile_size);
+    for (int64 i = 0; i < values_arrays.size(); ++i) {
+      llvm::Type* tile_type = llvm::ArrayType::get(
+          llvm_ir::PrimitiveTypeToIrType(
+              values_arrays[i].GetShape().element_type(), module),
+          tile_size);
       param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile(
           module, tile_type, absl::StrCat(name, "_tile_param_", i));
     }
@@ -371,30 +352,37 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
     //
     // This follows the algorithm described on Wikipedia:
     // https://en.wikipedia.org/wiki/Bitonic_sorter
-    IrArray::Index keys_index(tiles_index.GetType(), rank);
+    std::vector<llvm::Value*> keys_multi_index(rank);
     for (int64 i = 0; i < rank; ++i) {
-      keys_index[iteration_order_to_logical_order[i]] = tiles_index[i];
+      keys_multi_index[iteration_order_to_logical_order[i]] = tiles_index[i];
     }
     if (xor_masks.size() > 1) {
-      EmitTiledCompareLoop(keys_index, dimension_to_sort,
-                           dimension_to_sort_bound, keys_shape.element_type(),
-                           xor_masks, params, param_shmem_buffers,
-                           iota_values_parameter_index, tile_size, b);
+      IrArray::Index keys_index(keys_multi_index, values_arrays[0].GetShape(),
+                                tiles_index.GetType());
+      TF_RETURN_IF_ERROR(EmitTiledCompareLoop(
+          keys_index, dimension_to_sort, dimension_to_sort_bound, xor_masks,
+          values_arrays, param_shmem_buffers, tile_size, emit_compare_callback,
+          b));
     } else {
-      auto read_element = [&](int64 operand, llvm::Value* index) {
-        keys_index[dimension_to_sort] = index;
-        return params[operand].EmitReadArrayElement(keys_index, b);
+      auto element_address = [&](int64 operand, llvm::Value* index) {
+        keys_multi_index[dimension_to_sort] = index;
+        IrArray::Index keys_index(keys_multi_index,
+                                  values_arrays[operand].GetShape(),
+                                  tiles_index.GetType());
+        return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
       };
       auto write_element = [&](int64 operand, llvm::Value* index,
                                llvm::Value* value) {
-        keys_index[dimension_to_sort] = index;
-        params[operand].EmitWriteArrayElement(keys_index, value, b);
+        keys_multi_index[dimension_to_sort] = index;
+        IrArray::Index keys_index(keys_multi_index,
+                                  values_arrays[operand].GetShape(),
+                                  tiles_index.GetType());
+        values_arrays[operand].EmitWriteArrayElement(keys_index, value, b);
       };
-      EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
-                          values_arrays.size(), iota_values_parameter_index,
-                          tiles_index[rank - 1], xor_masks[0],
-                          tiles_index.GetType(), read_element, write_element,
-                          b);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          dimension_to_sort_bound, values_arrays.size(), tiles_index[rank - 1],
+          xor_masks[0], tiles_index.GetType(), element_address, write_element,
+          emit_compare_callback, b));
     }
     return Status::OK();
   };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 685f9383acba416f51681270e4037d56abb4b6ea..b9341a34d1f2203db6e02c3df5d607174b6d0f74 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -28,19 +28,18 @@ limitations under the License.
 
 namespace xla {
 namespace llvm_ir {
+using EmitCallToNestedComputationCallback =
+    std::function<Status(absl::Span<llvm::Value* const>, llvm::Value*)>;
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
-// dimension of 'keys_array'. All other dimensions are kept as-is. This
-// implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
-// contains only powers of 2, or values 2^k - 1 (k > 0). If
-// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
-// that is a iota and can be used to make the sorting stable.
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim, int64 tile_size);
+// dimension of each array in 'values_arrays'. All other dimensions are kept
+// as-is. This implements the inner loop of BitonicSort. It is assumed that
+// 'xor_masks' contains only powers of 2, or values 2^k - 1 (k > 0).
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index d8d2700e1934fd202d44a1dc60e71a99913d4537..3a4814b1857ec6e1fec3b79ef75d40db4fb99269 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -29,9 +29,14 @@ limitations under the License.
 namespace xla {
 namespace llvm_ir {
 
+static llvm::Module* getModuleFromBuilder(llvm::IRBuilder<>* b) {
+  return b->GetInsertBlock()->getModule();
+}
+
 void EmitTupleSelect(const IrArray& select, const IrArray& pred,
                      llvm::Value* on_true, llvm::Value* on_false,
-                     llvm::IRBuilder<>* b, llvm::Module* module) {
+                     llvm::IRBuilder<>* b) {
+  llvm::Module* module = getModuleFromBuilder(b);
   CHECK(ShapeUtil::IsScalar(pred.GetShape()));
 
   llvm::LoadInst* pred_value =
@@ -65,7 +70,8 @@ void EmitTupleSelect(const IrArray& select, const IrArray& pred,
 }
 
 void EmitTuple(const IrArray& tuple, absl::Span<llvm::Value* const> operands,
-               llvm::IRBuilder<>* b, llvm::Module* module) {
+               llvm::IRBuilder<>* b) {
+  llvm::Module* module = getModuleFromBuilder(b);
   for (size_t i = 0; i < operands.size(); ++i) {
     auto* store = b->CreateStore(
         b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)),
@@ -76,18 +82,19 @@ void EmitTuple(const IrArray& tuple, absl::Span<llvm::Value* const> operands,
 }
 
 void EmitTuple(const IrArray& tuple, absl::Span<const IrArray> buffers,
-               llvm::IRBuilder<>* b, llvm::Module* module) {
+               llvm::IRBuilder<>* b) {
   std::vector<llvm::Value*> buffer_ptrs;
   buffer_ptrs.reserve(buffers.size());
   absl::c_transform(
       buffers, std::back_inserter(buffer_ptrs),
       [](const llvm_ir::IrArray& buffer) { return buffer.GetBasePointer(); });
-  llvm_ir::EmitTuple(tuple, buffer_ptrs, b, module);
+  llvm_ir::EmitTuple(tuple, buffer_ptrs, b);
 }
 
 llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
                                  int alignment, llvm::Value* operand,
-                                 llvm::IRBuilder<>* b, llvm::Module* module) {
+                                 llvm::IRBuilder<>* b) {
+  llvm::Module* module = getModuleFromBuilder(b);
   llvm::Value* element_ptr =
       b->CreateInBoundsGEP(operand, {b->getInt64(0), b->getInt64(index)});
   llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
index 94340b91d8eeea1ba4681c2e49c0894eab2f6cc0..67d6323aba2f0bc10e8099014a214fc3025893ac 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
@@ -61,17 +61,17 @@ namespace llvm_ir {
 //   output[i] = pred ? tuple_on_true[i] : tuple_on_false[i]
 void EmitTupleSelect(const IrArray& select, const IrArray& pred,
                      llvm::Value* on_true, llvm::Value* on_false,
-                     llvm::IRBuilder<>* b, llvm::Module* module);
+                     llvm::IRBuilder<>* b);
 
 // A tuple is an array of pointers, one for each operand. Each pointer points to
 // the output buffer of its corresponding operand.
 void EmitTuple(const IrArray& tuple, absl::Span<llvm::Value* const> operands,
-               llvm::IRBuilder<>* b, llvm::Module* module);
+               llvm::IRBuilder<>* b);
 
 // Similar to EmitTuple above, except that the output buffers are provided in
 // the form of IrArray.
 void EmitTuple(const IrArray& tuple, absl::Span<const IrArray> buffers,
-               llvm::IRBuilder<>* b, llvm::Module* module);
+               llvm::IRBuilder<>* b);
 
 // A tuple is an array of pointers, one for each operand. Each pointer points to
 // the output buffer of its corresponding operand. A GetTupleElement instruction
@@ -79,7 +79,7 @@ void EmitTuple(const IrArray& tuple, absl::Span<const IrArray> buffers,
 // Returns an llvm value representing a pointer to the tuple element buffer.
 llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
                                  int alignment, llvm::Value* operand,
-                                 llvm::IRBuilder<>* b, llvm::Module* module);
+                                 llvm::IRBuilder<>* b);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 600b069ecdbabf6b05e6abb3a6b8d9b1a4b0ecf4..3470fe5b2c34bf832207ed546fad176319446f31 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -110,6 +110,7 @@ ExecutionOptions CreateExecutionOptions(
     *execution_options.mutable_shape_with_output_layout() =
         result_shape.ToProto();
   }
+  execution_options.set_num_replicas(build_options.num_replicas());
   return execution_options;
 }
 
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index f6feed29935a1446499559d947dff0a8eefe5d2e..e55b83d17e90bc2ca0053a0421cf80ef6edd5bca 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -28,7 +29,7 @@ namespace {
 
 bool IsAllowed(char character) {
   auto c = static_cast<unsigned char>(character);
-  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+  return (absl::ascii_isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
 }
 
 }  // namespace
@@ -46,7 +47,7 @@ NameUniquer::NameUniquer(const string& separator) {
 
   string result = name;
   char c = static_cast<unsigned char>(result[0]);
-  if (!isalpha(c) && c != '_') {
+  if (!absl::ascii_isalpha(c) && c != '_') {
     result[0] = '_';
   }
   for (int i = 1; i < result.length(); i++) {
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.cc b/tensorflow/compiler/xla/service/op_expander_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02c9d4b387b112be39c204d35fe4fa1013ed064c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.cc
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+#include <utility>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+StatusOr<bool> OpExpanderPass::Run(HloModule* module) {
+  std::vector<HloInstruction*> matching_instructions;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    absl::c_copy_if(
+        computation->instructions(), std::back_inserter(matching_instructions),
+        [&](HloInstruction* inst) { return InstructionMatchesPattern(inst); });
+  }
+
+  for (HloInstruction* inst : matching_instructions) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root,
+                        ExpandInstruction(inst));
+    if (expanded_root == nullptr) {
+      continue;
+    }
+    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
+  }
+
+  return !matching_instructions.empty();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.h b/tensorflow/compiler/xla/service/op_expander_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..276e3d70b8ecd8742e0b277698765063198fe872
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This pass is an abstract superclass for passes that replace operations that
+// match a pattern. It is intended to be subclassed, not used directly.
+//
+// This pass is useful for legalizing HLO instructions that a particular backend
+// does not support into other HLO instructions.
+class OpExpanderPass : public HloModulePass {
+ public:
+  StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Returns `true` if `instruction` should be expanded by this pass.
+  virtual bool InstructionMatchesPattern(HloInstruction* instruction) = 0;
+
+  // Returns a replacement for `instruction`, or nullptr if no replacement is
+  // neeeded (e.g. only the to_apply subcomputation of the instruction was
+  // modified).
+  virtual StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc
new file mode 100644
index 0000000000000000000000000000000000000000..701c629add52a217f16877a085b9ef2d096623d9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc
@@ -0,0 +1,106 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h"
+
+#include <queue>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+// Returns true if the given shape is a non-nested tuple.
+bool IsNonNestedTuple(const Shape& shape) {
+  return shape.IsTuple() && !ShapeUtil::IsNestedTuple(shape);
+}
+
+}  // namespace
+
+StatusOr<bool> OptimizeInputOutputBufferAlias::Build(
+    const Shape& input_shape, const Shape& output_shape,
+    HloInputOutputAliasConfig* alias_config) {
+  bool changed = false;
+  TF_RET_CHECK(LayoutUtil::HasLayout(input_shape));
+  TF_RET_CHECK(LayoutUtil::HasLayout(output_shape));
+  VLOG(1) << "input_shape:" << input_shape.ToString();
+  VLOG(1) << "output_shape:" << output_shape.ToString();
+
+  // For all buffers defined by the parameter, build a map from the byte
+  // size to the list of the buffers of that size.
+  absl::flat_hash_map<int64, std::queue<ShapeIndex>> size_to_input_index;
+  ShapeUtil::ForEachSubshape(
+      input_shape, [&](const Shape& subshape, const ShapeIndex& index) {
+        if (subshape.IsTuple()) {
+          return;
+        }
+        int64 bytes = size_func_(subshape);
+        size_to_input_index[bytes].push(index);
+      });
+
+  // For each result buffer shape index, take the first unused parameter
+  // buffer that matches the size.
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      output_shape, [&](const Shape& subshape, const ShapeIndex& index) {
+        if (subshape.IsTuple()) {
+          return Status::OK();
+        }
+        int64 bytes = size_func_(subshape);
+
+        auto it = size_to_input_index.find(bytes);
+        if (it != size_to_input_index.end() && !it->second.empty()) {
+          changed = true;
+          const ShapeIndex& input_index = it->second.front();
+          const ShapeIndex& output_index = index;
+          if (!alias_config->ParameterHasAlias(0, input_index) &&
+              !alias_config->OutputHasAlias(output_index)) {
+            TF_RETURN_IF_ERROR(alias_config->SetUpAlias(
+                output_index, 0, input_index,
+                HloInputOutputAliasConfig::AliasKind::kSystemAlias));
+          }
+          VLOG(3) << "Set up alias from with param index "
+                  << it->second.front().ToString() << ", shape size " << bytes
+                  << " and result subshape "
+                  << ShapeUtil::HumanStringWithLayout(subshape) << " at index "
+                  << index.ToString();
+          it->second.pop();
+        }
+        return Status::OK();
+      }));
+  return changed;
+}
+
+StatusOr<bool> OptimizeInputOutputBufferAlias::Run(HloModule* module) {
+  // User buffer alias only work for modules with 1 parameter.
+  if (module->entry_computation()->num_parameters() != 1) {
+    return false;
+  }
+
+  HloInputOutputAliasConfig* alias_config =
+      &module->input_output_alias_config();
+
+  return Build(module->entry_computation()->parameter_instruction(0)->shape(),
+               module->entry_computation()->root_instruction()->shape(),
+               alias_config);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h
new file mode 100644
index 0000000000000000000000000000000000000000..79ce468e975300ed703ae0fd780f4b9d5328a4b3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h
@@ -0,0 +1,71 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
+
+#include <memory>
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// This pass opportunistically finds input and output buffers that can be
+// aliased, and writes the alias config into the HloModule.
+//
+// The input and the output buffers can be in any shape, and each output buffer
+// can alias with an input buffer with the same size. Each input buffer may only
+// alias with a single output buffer. For example, for the following parameter
+// and the output buffers,
+//
+//  Parameters : { P1(2MiB), P2(4MiB), P3(8MiB), P4(4MiB), P5(4MiB), ... }
+//  Outputs    : { O1(4MiB), O2(2MiB), O3(4MiB), O4(6MiB), O5(4MiB), ... }
+//
+// one potential aliasing would be (O1, P2), (O2, P1), (O3, P4), (O5, P5), ..
+class OptimizeInputOutputBufferAlias : public HloModulePass {
+  using ShapeSizeFunction = std::function<int64(const Shape&)>;
+
+ public:
+  OptimizeInputOutputBufferAlias(ShapeSizeFunction size_func)
+      : size_func_(size_func) {}
+  ~OptimizeInputOutputBufferAlias() override = default;
+
+  absl::string_view name() const override {
+    return "optimize_input_output_buffer_alias.h";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  friend class OptimizeInputOutputBufferAliasTest;
+
+  StatusOr<bool> Build(const Shape& input_shape, const Shape& output_shape,
+                       HloInputOutputAliasConfig* alias_config);
+  ShapeSizeFunction size_func_ = nullptr;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..41e90f9b6931619fd9824e2eda25e12e4c7197b0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h"
+
+#include <memory>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+
+// Tests that UserBufferAlias properly maps input and output buffer indices of
+// various shapes for aliasing.
+class OptimizeInputOutputBufferAliasTest : public HloTestBase {
+ protected:
+  OptimizeInputOutputBufferAliasTest() {
+    r1f32_ = ShapeUtil::MakeShape(F32, {4});
+    r2f32_ = ShapeUtil::MakeShape(F32, {4, 5});
+    r3f32_ = ShapeUtil::MakeShape(F32, {4, 5, 6});
+    r4f32_ = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
+
+    auto size_func = [](const Shape& shape) {
+      return ShapeUtil::ByteSizeOf(shape);
+    };
+
+    optimize_pass_ =
+        absl::make_unique<OptimizeInputOutputBufferAlias>(size_func);
+  }
+
+  // Returns the number of output indices that aliases with the input.
+  int64 AliasCount() {
+    int64 count = 0;
+
+    config_.ForEachAlias(
+        [&](const ShapeIndex&, const HloInputOutputAliasConfig::Alias&) {
+          count++;
+        });
+    return count;
+  }
+
+  bool BuildAliasConfig(const Shape& input_shape, const Shape& output_shape) {
+    config_ = HloInputOutputAliasConfig(output_shape);
+    auto changed = optimize_pass_->Build(input_shape, output_shape, &config_);
+    TF_CHECK_OK(changed.status());
+
+    return changed.ValueOrDie();
+  }
+
+  std::unique_ptr<OptimizeInputOutputBufferAlias> optimize_pass_;
+
+  HloInputOutputAliasConfig config_;
+
+  Shape r1f32_;
+  Shape r2f32_;
+  Shape r3f32_;
+  Shape r4f32_;
+};
+
+// All shapes are different, so no aliasing is available.
+TEST_F(OptimizeInputOutputBufferAliasTest, AllDifferentBufferSizes) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_FALSE(changed);
+  EXPECT_EQ(AliasCount(), 0);
+}
+
+// Input and output shapes are equal, so buffers can alias at the same index.
+TEST_F(OptimizeInputOutputBufferAliasTest, OrderedNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(AliasCount(), 4);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex{1});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{2});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {3}), ShapeIndex{3});
+}
+
+// Only a subset of the tuple element shapes match between the input and the
+// output.
+TEST_F(OptimizeInputOutputBufferAliasTest, PartialReuseNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r1f32_, r2f32_, r2f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 2);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{1});
+}
+
+// The output shape is reverse of the input shape, but we can still reuse all
+// the buffers.
+TEST_F(OptimizeInputOutputBufferAliasTest, UnorderedNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r4f32_, r3f32_, r2f32_, r1f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 4);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{3});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex{2});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{1});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {3}), ShapeIndex{0});
+}
+
+TEST_F(OptimizeInputOutputBufferAliasTest, UnorderedNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeTupleShape({r1f32_}), r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape(
+      {r1f32_, ShapeUtil::MakeTupleShape({r3f32_, r2f32_}), r2f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 3);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0, 0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex({1, 1}));
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex({1, 0}));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 9e3d1060210790f60243195a1c1dff13f1fc7fc5..ae1df60d350babda12f0dc37aea41e01b8a51561 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -67,6 +67,7 @@ namespace xla {
 //     - WithOneUse: Instruction is used as an operand exactly once.
 //     - WithOneUser: Instruction is used by exactly one other instruction, but
 //       is possibly used more than once as an operand (e.g. multiply(x,x)).
+//     - WithComparisonDirection: instr has the given direction
 //
 //   Shape():
 //     - EqualTo
@@ -1671,6 +1672,40 @@ class HloInstructionPatternOneUserImpl
   }
 };
 
+class HloInstructionPatternComparisonDirectionImpl {
+ public:
+  explicit constexpr HloInstructionPatternComparisonDirectionImpl(
+      ComparisonDirection direction)
+      : direction_(direction) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has comparison direction "
+        << ComparisonDirectionToString(direction_);
+  }
+
+ private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kCompare ||
+        inst->comparison_direction() != direction_) {
+      EXPLAIN << "HloInstruction is not comparison "
+              << ComparisonDirectionToString(direction_);
+      return false;
+    }
+    return true;
+  }
+
+  ComparisonDirection direction_;
+};
+
 // Matches a constant scalar or effective scalar, optionally with a given value.
 template <typename ScalarTy>
 class HloConstantScalarImpl {
@@ -1956,6 +1991,14 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternOneUserImpl());
   }
 
+  // Modifies the pattern to match only if the instruction has the given
+  // comparison direction.
+  auto WithComparisonDirection(ComparisonDirection direction) const
+      -> decltype(this->AppendImpl(
+          HloInstructionPatternComparisonDirectionImpl(direction))) {
+    return AppendImpl(HloInstructionPatternComparisonDirectionImpl(direction));
+  }
+
   void DescribeTo(std::ostream* os, int64 indent = 0) const {
     impl_.DescribeTo(os, indent);
   }
@@ -2053,10 +2096,12 @@ XLA_UNOP_PATTERN(RecvDone)
 XLA_UNOP_PATTERN(ReducePrecision)
 XLA_UNOP_PATTERN(Reshape)
 XLA_UNOP_PATTERN(Reverse)
+XLA_UNOP_PATTERN(Rsqrt)
 XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
 XLA_UNOP_PATTERN(Slice)
+XLA_UNOP_PATTERN(Sqrt)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
 #undef XLA_UNOP_PATTERN
@@ -2116,18 +2161,13 @@ XLA_COMMUTATIVE_BINOP_PATTERN(Add)
 XLA_BINOP_PATTERN(Atan2)
 XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Compare)
 XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
-XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
-XLA_BINOP_PATTERN(Ge)
-XLA_BINOP_PATTERN(Gt)
-XLA_BINOP_PATTERN(Le)
-XLA_BINOP_PATTERN(Lt)
 XLA_COMMUTATIVE_BINOP_PATTERN(Maximum)
 XLA_COMMUTATIVE_BINOP_PATTERN(Minimum)
 XLA_COMMUTATIVE_BINOP_PATTERN(Multiply)
-XLA_COMMUTATIVE_BINOP_PATTERN(Ne)
 XLA_BINOP_PATTERN(Outfeed)
 XLA_BINOP_PATTERN(Pad)
 XLA_BINOP_PATTERN(Power)
@@ -2240,6 +2280,73 @@ XLA_VARIADIC_OP_PATTERN(Reduce);
 XLA_VARIADIC_OP_PATTERN(Sort);
 XLA_VARIADIC_OP_PATTERN(Tuple);
 
+// Helpers for comparison instructions.
+#define XLA_COMPARE_PATTERN(NAME)                                              \
+  inline auto NAME()->decltype(                                                \
+      Op().WithOpcode(HloOpcode::kCompare)                                     \
+          .WithComparisonDirection(ComparisonDirection::k##NAME)) {            \
+    return Op()                                                                \
+        .WithOpcode(HloOpcode::kCompare)                                       \
+        .WithComparisonDirection(ComparisonDirection::k##NAME);                \
+  }                                                                            \
+                                                                               \
+  template <typename Lhs, typename Rhs>                                        \
+  inline auto NAME(Lhs&& lhs, Rhs&& rhs)                                       \
+      ->decltype(Op().WithOpcode(HloOpcode::kCompare)                          \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                   \
+                     .WithOperand(1, std::forward<Rhs>(rhs))                   \
+                     .WithComparisonDirection(ComparisonDirection::k##NAME)) { \
+    return Op()                                                                \
+        .WithOpcode(HloOpcode::kCompare)                                       \
+        .WithOperand(0, std::forward<Lhs>(lhs))                                \
+        .WithOperand(1, std::forward<Rhs>(rhs))                                \
+        .WithComparisonDirection(ComparisonDirection::k##NAME);                \
+  }                                                                            \
+                                                                               \
+  template <typename HloInstructionType, typename Lhs, typename Rhs>           \
+  inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs)    \
+      ->decltype(Op(matched_inst)                                              \
+                     .WithOpcode(HloOpcode::kCompare)                          \
+                     .WithOperand(0, std::forward<Lhs>(lhs))                   \
+                     .WithOperand(1, std::forward<Rhs>(rhs))                   \
+                     .WithComparisonDirection(ComparisonDirection::k##NAME)) { \
+    return Op(matched_inst)                                                    \
+        .WithOpcode(HloOpcode::kCompare)                                       \
+        .WithOperand(0, std::forward<Lhs>(lhs))                                \
+        .WithOperand(1, std::forward<Rhs>(rhs))                                \
+        .WithComparisonDirection(ComparisonDirection::k##NAME);                \
+  }
+
+#define XLA_COMMUTATIVE_COMPARE_PATTERN(NAME)                               \
+  XLA_COMPARE_PATTERN(NAME)                                                 \
+                                                                            \
+  template <typename HloInstructionType, typename Lhs, typename Rhs>        \
+  inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs,  \
+                             Rhs&& rhs)                                     \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::kCompare)                       \
+                     .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),    \
+                                                 std::forward<Rhs>(rhs))) { \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::kCompare)                                    \
+        .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),                 \
+                                    std::forward<Rhs>(rhs));                \
+  }                                                                         \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
+      ->decltype(NAME##AnyOrder<const HloInstruction>(                      \
+          nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs))) {       \
+    return NAME##AnyOrder<const HloInstruction>(                            \
+        nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs));           \
+  }
+
+XLA_COMMUTATIVE_COMPARE_PATTERN(Eq);
+XLA_COMMUTATIVE_COMPARE_PATTERN(Ne);
+XLA_COMPARE_PATTERN(Ge);
+XLA_COMPARE_PATTERN(Gt);
+XLA_COMPARE_PATTERN(Le);
+XLA_COMPARE_PATTERN(Lt);
+
 // Helpers for matching non-constant instructions.
 inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
   return Op().IsNonConstant();
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
index 9ca2fb05c1f7ef093c58237cf21fbc7c813a592a..f51a18b13894d75300c46835fabd82a4ce0699af 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
@@ -23,7 +23,6 @@ namespace xla {
 namespace {
 
 namespace m = ::xla::match;
-using ::testing::Eq;
 using ::testing::Not;
 
 template <typename MatchedTy>
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 5c3c009a68bffbda8642fceedfb724879fbf1530..cbe8c4a2410d3f569933fe86ae0a8056b6b5ed85 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -931,5 +931,48 @@ TEST(PatternMatcherTest, OneUseAndOneUser) {
             "in p0 = f32[] parameter(0)");
 }
 
+TEST(HloMatchersTest, Comparison) {
+  auto shape = ShapeUtil::MakeShape(F32, {1});
+  auto p0 = HloInstruction::CreateParameter(0, shape, "param.0");
+  auto p1 = HloInstruction::CreateParameter(1, shape, "param.1");
+  auto eq = HloInstruction::CreateCompare(shape, p0.get(), p1.get(),
+                                          ComparisonDirection::kEq);
+  auto ne = HloInstruction::CreateCompare(shape, p0.get(), p1.get(),
+                                          ComparisonDirection::kNe);
+  auto add =
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0.get(), p1.get());
+  auto le = HloInstruction::CreateCompare(shape, p0.get(), add.get(),
+                                          ComparisonDirection::kLe);
+
+  EXPECT_TRUE(Match(eq.get(), m::Compare()));
+  EXPECT_TRUE(Match(eq.get(), m::Eq()));
+  EXPECT_TRUE(Match(eq.get(), m::Eq(m::Parameter(0), m::Parameter(1))));
+  EXPECT_TRUE(Match(eq.get(), m::EqAnyOrder(m::Parameter(1), m::Parameter(0))));
+  EXPECT_TRUE(Match(ne.get(), m::Compare()));
+  EXPECT_TRUE(Match(ne.get(), m::Ne()));
+  EXPECT_TRUE(Match(
+      le.get(),
+      m::Compare(m::Parameter(0), m::Add(m::Parameter(0), m::Parameter(1)))));
+  EXPECT_TRUE(Match(le.get(), m::Le(m::Parameter(0),
+                                    m::Add(m::Parameter(0), m::Parameter(1)))));
+
+  EXPECT_FALSE(Match(eq.get(), m::Add()));
+  EXPECT_FALSE(Match(eq.get(), m::Ne()));
+  EXPECT_FALSE(
+      Match(le.get(),
+            m::Eq(m::Parameter(0), m::Add(m::Parameter(0), m::Parameter(1)))));
+  EXPECT_FALSE(Match(eq.get(), m::Eq(m::Parameter(1), m::Parameter(0))));
+  EXPECT_DESC_AND_EXPLANATION(
+      eq, m::Ne().WithOneUser(),
+      "an HloInstruction:\n"
+      " * with opcode compare AND\n"
+      " * which has comparison direction NE AND\n"
+      " * which has exactly one user (but possibly is used "
+      "multiple times by that instruction)",
+      "HloInstruction is not comparison NE\n"
+      "in compare = f32[1]{0} compare(f32[1]{0} param.0, f32[1]{0} param.1), "
+      "direction=EQ");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index a62118df157edf67114ff41befbdce3da129fe93..9e2d740694012b05510fc098048c762b3057da0d 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -29,11 +29,6 @@ limitations under the License.
 //
 // Where the instruction must be elementwise, and both reshapes and transposes
 // are moved.
-//
-// Most elementwise instructions support implicit broadcast of scalar operands,
-// but select is a special-case.  The signature is Select(Pred, A, B), and the
-// only implicit scalar broadcast is on Pred, not on A or B. Since reshapes or
-// transposes to a scalar should be cheap, we simply never move them.
 
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 
@@ -64,20 +59,14 @@ bool CanTriviallyChangeShape(const HloInstruction* instruction) {
   //
   // But it's not that simple. E.g. reshape(reshape(rng)) is only trivially
   // reshapable if *all* instructions in the chain have user_count == 1. And
-  // reshape(scalar) isn't trivial at all if the reshape itself isn't scalar; we
-  // rely on implicit scalar broadcast for scalars to be trivial. In addition,
-  // these cases make it harder to maintain correctness of the UpdateOperand
-  // logic below.
+  // reshape(scalar) isn't trivial at all if the reshape itself isn't scalar.
+  // In addition, these cases make it harder to maintain correctness of the
+  // UpdateOperand logic below.
   //
   // So don't handle these chains, unless you update the tests and code to deal
   // with these properly. One idea is to add a pass immediately beforehand that
   // collapses trivial runs of reshapes / transposes.
 
-  // Scalars can operate with any shape.
-  if (ShapeUtil::IsScalar(instruction->shape())) {
-    return true;
-  }
-
   // A constant can trivially reshape the literal it holds.
   if (instruction->opcode() == HloOpcode::kConstant) {
     return true;
@@ -143,8 +132,8 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
 
 // This function is called once we've decided to sink reshape/transpose operands
 // across an instruction. It returns an updated `operand` with a shape that
-// plays nicely with `new_operand_shape`; either it has the same shape (of the
-// correct type), or it is a scalar that may be implicitly broadcast.
+// plays nicely with `new_operand_shape`; it has the same shape (of the
+// correct type).
 HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
                               const Shape& new_operand_shape,
                               HloInstruction* operand) {
@@ -221,9 +210,8 @@ StatusOr<bool> PerformSinkReshapeOrTranspose(
         UpdateOperand(first_reshape_operand, new_operand_shape, operands[i]);
   }
   if (HloOpcode::kFusion == instruction->opcode()) {
-    // Here we already know `instruction` is elementwise, and no operand is
-    // implicit broadcast as if it were the operands would not have easy shape
-    // changes, so all the fused instructions have the same dimensions.
+    // Here we already know `instruction` is elementwise, and all the fused
+    // instructions have the same dimensions.
     for (const auto& fused_instruction : instruction->fused_instructions()) {
       Shape* shape = fused_instruction->mutable_shape();
       shape->clear_dimensions();
@@ -287,21 +275,17 @@ bool IsReshapeMoveCandidate(HloInstruction* instruction) {
   }
 
   // Check whether all operands:
-  //    0. Have the same dimensions as the output -- if not, they may be
-  //       implicitly broadcast, which can confound the movement's
-  //       correctness.
+  //    0. Have the same dimensions as the output.
   //
   // And one of the following:
   //    1. Are reshapes or transposes that have the same input and
   //       output shapes as all other reshaped or transposed operands.
   //     or
-  //    2. Are one of kConstant, kRng, broadcast of a scalar value, and scalars
-  //     that can change shape trivially.
+  //    2. Are one of kConstant, kRng, broadcast of a scalar value.
   const HloInstruction* first_reshape_operand = nullptr;
   for (const HloInstruction* operand : instruction->operands()) {
     if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
-      VLOG(5) << "Operand shape differs from output shape; may be "
-                 "implicitly broadcast, so preventing "
+      VLOG(5) << "Operand shape differs from output shape; so preventing "
                  "movement\n\toperand: "
               << operand->ToString(print_no_metadata) << "\n\tinstruction: "
               << instruction->ToString(print_no_metadata);
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
index 036c3c36f648daf8963a6b25e300b93c1bdf78d9..e3a3feb86404634f0114f4cb8aa9b1c883e78f95 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.cc
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -134,6 +134,13 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
     int64 operand_rank) {
   HloComputation* computation = index_vector->parent();
   const Shape& index_shape = index_vector->shape();
+
+  // Scatter of a scalar. Return a zero-sized vector of indices.
+  if (operand_rank == 0) {
+    return computation->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateFromDimensions(index_shape.element_type(), {0})));
+  }
+
   HloInstruction* zero =
       computation->AddInstruction(HloInstruction::CreateConstant(
           LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1})));
@@ -171,12 +178,12 @@ static StatusOr<HloInstruction*> CheckIndexValidity(
   // Valid range for the index: [0, operand_dims - window_sizes]
 
   // Check if the index has any negative values.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * zero_index,
+  HloInstruction* zero_index =
       BroadcastZeros(computation, index->shape().element_type(),
-                     AsInt64Slice(index->shape().dimensions())));
-  TF_ASSIGN_OR_RETURN(HloInstruction * negative_index_check,
-                      MakeBinaryHlo(HloOpcode::kLe, zero_index, index));
+                     AsInt64Slice(index->shape().dimensions()));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * negative_index_check,
+      MakeCompareHlo(ComparisonDirection::kLe, zero_index, index));
 
   // Check if the index is OOB w.r.t. the operand dimensions and window sizes.
   std::vector<int64> max_valid_index(operand_dims.size());
@@ -187,9 +194,9 @@ static StatusOr<HloInstruction*> CheckIndexValidity(
       HloInstruction * max_valid_index_constant,
       MakeR1ConstantHlo<int64>(computation, index->shape().element_type(),
                                max_valid_index));
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * oob_index_check,
-      MakeBinaryHlo(HloOpcode::kGe, max_valid_index_constant, index));
+  TF_ASSIGN_OR_RETURN(HloInstruction * oob_index_check,
+                      MakeCompareHlo(ComparisonDirection::kGe,
+                                     max_valid_index_constant, index));
 
   // Combine the results of the two checks above.
   TF_ASSIGN_OR_RETURN(
@@ -222,10 +229,9 @@ static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
   bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
 
   // Build a vector form of the induction variable of the while loop.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   // Pick the index to scatter from scatter_indices based on the induction_var
   // and transform that to an index into the `operand` space.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 32707de700b5926105e15922f67c19e0ed7bd7b8..49c346d87fcbafb1e6780b0207f6250077c1e297 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -28,7 +28,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -63,6 +65,10 @@ namespace {
 using absl::StrCat;
 using absl::StrFormat;
 
+// Argument used when calling DumpHloModuleIfEnabled before optimizations are
+// performed on an HloModule.
+constexpr char kBeforeOptimizationsDumpName[] = "before_optimizations";
+
 // Records the arguments used to invoke a computation in an HloSnapshot proto.
 Status RecordArguments(const absl::Span<const ShapedBuffer* const> arguments,
                        se::Stream* stream, TransferManager* transfer_manager,
@@ -296,11 +302,16 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
 
-  config->set_replica_count(options_.number_of_replicas());
   if (execution_options != nullptr) {
+    if (execution_options->num_replicas() > 0) {
+      config->set_replica_count(execution_options->num_replicas());
+    } else {
+      config->set_replica_count(options_.number_of_replicas());
+    }
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
   } else {
+    config->set_replica_count(options_.number_of_replicas());
     config->set_debug_options(GetDebugOptionsFromFlags());
   }
 
@@ -309,6 +320,15 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     config->set_intra_op_parallelism_threads(
         execute_backend_->eigen_intra_op_thread_pool()->NumThreads());
   }
+
+  if (execution_options != nullptr &&
+      execution_options->has_device_assignment()) {
+    TF_ASSIGN_OR_RETURN(
+        auto device_assignment,
+        DeviceAssignment::Deserialize(execution_options->device_assignment()));
+    config->set_static_device_assignment(*device_assignment);
+  }
+
   return std::move(config);
 }
 
@@ -333,21 +353,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   // Dump computation proto state if flag is set.
   std::vector<std::unique_ptr<HloSnapshot>> hlo_snapshots;
   for (int64 i = 0; i < module_protos.size(); ++i) {
-    const string& directory_path =
-        module_configs[i]->debug_options().xla_dump_computations_to();
-    const string& execution_directory_path =
-        module_configs[i]->debug_options().xla_dump_executions_to();
-    if (directory_path.empty() && execution_directory_path.empty()) {
-      continue;
-    }
     auto hlo_snapshot = absl::make_unique<HloSnapshot>();
     *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i];
-    if (!directory_path.empty()) {
-      string filename = StrFormat("computation_%d__%s", module_protos[i]->id(),
-                                  module_protos[i]->entry_computation_name());
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot));
-    }
     hlo_snapshots.push_back(std::move(hlo_snapshot));
   }
 
@@ -363,6 +370,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     const HloModuleProto* proto = module_protos[i];
     const HloModuleConfig& config = *module_configs[i];
     TF_ASSIGN_OR_RETURN(auto module, CreateModuleFromProto(*proto, config));
+    DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
     module_group->push_back(std::move(module));
   }
 
@@ -372,7 +380,9 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
                                    std::move(executors), device_allocator));
 
   for (size_t i = 0; i < module_protos.size(); ++i) {
-    if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
+    const auto& debug_opts = module_configs[i]->debug_options();
+    if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) &&
+        debug_opts.xla_dump_hlo_snapshots()) {
       executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i]));
     }
   }
@@ -470,24 +480,6 @@ Service::ExecuteParallelAndRegisterResult(
     }
   }
 
-  // For every stream that had profiling enabled, obtain and debug-dump the HLO
-  // profile.
-  for (auto& index_to_profiled_stream : index_to_profiled_streams) {
-    int64 device = index_to_profiled_stream.first;
-    se::Stream* stream = index_to_profiled_stream.second;
-    Executable* executable = executables[device];
-    const HloModule& module = executable->module();
-    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(),
-                                    &executable->hlo_profile_index_map());
-    TF_RETURN_IF_ERROR(
-        executable->PopulateExecutionProfile(&hlo_profile, stream));
-    XLA_LOG_LINES(
-        tensorflow::INFO,
-        hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
-    hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
-                                         &hlo_profile);
-  }
-
   if (profile != nullptr) {
     CHECK(!timers.empty());
     std::vector<uint64> timer_nanoseconds;
@@ -746,16 +738,17 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
   }
 
   for (int i = 0; i < executable_ptrs.size(); i++) {
-    if (executable_ptrs[i]->dumping_snapshot()) {
+    Executable* executable = executable_ptrs[i];
+    if (executable->dumping_snapshot()) {
       TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer,
                           allocation_tracker_.ResolveForReplica(outputs[i], 0));
       TF_ASSIGN_OR_RETURN(auto stream,
                           execute_backend_->BorrowStream(all_executors[i][0]));
       TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(),
                                       execute_backend_->transfer_manager(),
-                                      executable_ptrs[i]->hlo_snapshot()));
-      // Dump out the ith snapshot.
-      TF_RETURN_IF_ERROR(executable_ptrs[i]->DumpHloSnapshot());
+                                      executable->hlo_snapshot()));
+      DumpHloSnapshotIfEnabled(executable->module(),
+                               *executable->hlo_snapshot());
     }
   }
 
@@ -795,26 +788,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
       "BuildExecutable on service %p with serialized module proto: %s", this,
       module_proto.name());
 
-  // Dump computation proto state if flag is set.
-  auto hlo_snapshot = absl::make_unique<HloSnapshot>();
-  const string& directory_path =
-      module_config->debug_options().xla_dump_computations_to();
-  const string& execution_directory_path =
-      module_config->debug_options().xla_dump_executions_to();
-  if (!directory_path.empty() || !execution_directory_path.empty()) {
-    *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto;
-    if (!directory_path.empty()) {
-      string filename = StrFormat("computation_%d__%s", module_proto.id(),
-                                  module_proto.entry_computation_name());
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot));
-    }
-  }
-
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(module_proto, *module_config));
-
-  TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*module));
+  DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
 
   TF_ASSIGN_OR_RETURN(
       module, backend->compiler()->RunHloPasses(std::move(module), executor,
@@ -824,7 +800,11 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
                       backend->compiler()->RunBackend(
                           std::move(module), executor, device_allocator));
 
-  if (!execution_directory_path.empty()) {
+  const auto& debug_opts = module_config->debug_options();
+  if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) &&
+      debug_opts.xla_dump_hlo_snapshots()) {
+    auto hlo_snapshot = absl::make_unique<HloSnapshot>();
+    *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto;
     executable->set_hlo_snapshot(std::move(hlo_snapshot));
   }
 
@@ -934,7 +914,7 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
     TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(),
                                     execute_backend_->transfer_manager(),
                                     executable->hlo_snapshot()));
-    TF_RETURN_IF_ERROR(executable->DumpHloSnapshot());
+    DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot());
   }
 
   VLOG(1) << "successfully completed 'execute' request";
@@ -1156,9 +1136,7 @@ Status Service::GetComputationGraphStats(
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
-
-  hlo_graph_dumper::MaybeDumpHloModule(*module,
-                                       "computation statistics subject");
+  DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
 
   // Run HLO analysis to get the computation statistics.
   HloCostAnalysis analysis(
@@ -1197,16 +1175,4 @@ StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
   return replicas;
 }
 
-Status Service::MaybeDumpUnoptimizedHloModule(const HloModule& module) const {
-  const string xla_dump_unoptimized_hlo_proto_to =
-      module.config().debug_options().xla_dump_unoptimized_hlo_proto_to();
-  if (xla_dump_unoptimized_hlo_proto_to.empty()) {
-    return Status::OK();
-  }
-  HloProto proto = MakeHloProto(module);
-  return protobuf_util::DumpProtoToDirectory(
-      proto, xla_dump_unoptimized_hlo_proto_to,
-      StrCat(module.name(), ".unoptimized"));
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 6e31bea7283d5808adf90b06fe7ef927c2f7fbdc..f127e340b5950ae77bcfa22b638c1d9fc8a2024b 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -53,7 +53,7 @@ class ServiceOptions {
   ServiceOptions& set_platform(se::Platform* platform);
   se::Platform* platform() const;
 
-  // Set the number of replicas to use when compiling replicated
+  // Set the default number of replicas to use when compiling replicated
   // programs.
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
@@ -275,10 +275,6 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<se::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
-  // Dumps the (unoptimized) module given if the corresponding DebugOptions
-  // field has been set.
-  Status MaybeDumpUnoptimizedHloModule(const HloModule& module) const;
-
   // Returns the device handle that represents the replicated device for a
   // single computation that is not model-parallelized.
   DeviceHandle SingleComputationDeviceHandle() const;
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index fad7afefafd86d56f0d60b7ecfa1742d53d9452b..e1536684c066a1e29df6ad956ec1bebcfbccba72 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -167,6 +167,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   }
 
   std::vector<int64> output_dimensions(window.dimensions_size());
+  std::vector<bool> output_is_dynamic(window.dimensions_size());
   for (int64 i = 0; i < window.dimensions_size(); ++i) {
     const auto& dim = window.dimensions(i);
     if (dim.size() <= 0) {
@@ -196,6 +197,13 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           window.DebugString());
     }
 
+    if (base_shape.is_dynamic_dimension(i) &&
+        !window_util::IsTrivialWindowDimension(dim)) {
+      return Unimplemented(
+          "Dynamic shape is not supported for non trivial window: %s",
+          window_util::ToString(window));
+    }
+
     const int64 dilated_base = window_util::DilatedBound(
         ShapeUtil::GetDimension(base_shape, i), dim.base_dilation());
     const int64 padded_dilated_base =
@@ -205,9 +213,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
     output_dimensions[i] = window_util::StridedBound(
         padded_dilated_base, dilated_window, dim.stride());
+    output_is_dynamic[i] = base_shape.is_dynamic_dimension(i);
   }
 
-  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions);
+  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions,
+                                       output_is_dynamic);
 }
 
 }  // namespace
@@ -245,6 +255,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case HloOpcode::kExpm1:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
+    case HloOpcode::kRsqrt:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       if (!ShapeUtil::ElementIsFloating(shape) &&
           !ShapeUtil::ElementIsComplex(shape)) {
@@ -500,17 +512,33 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            padding_config.ShortDebugString());
   }
 
+  if (!padding_value_shape.is_static()) {
+    return InvalidArgument("Dynamic padding value is not supported");
+  }
+
   std::vector<int64> dimensions(operand_shape.rank());
+  std::vector<bool> is_dynamic(operand_shape.rank());
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
     const auto& p = padding_config.dimensions(i);
+    if (operand_shape.is_dynamic_dimension(i) && p.edge_padding_high() != 0 &&
+        p.edge_padding_low() != 0 && p.interior_padding() != 0) {
+      return InvalidArgument(
+          "Dynamic dimension on padding dimension is not supported.");
+    }
     dimensions[i] = operand_shape.dimensions(i) + p.edge_padding_low() +
                     p.edge_padding_high() +
                     std::max<int64>(operand_shape.dimensions(i) - 1, 0LL) *
                         p.interior_padding();
+    if (dimensions[i] < 0) {
+      return InvalidArgument("Padding result in negative size for dimension %d",
+                             i);
+    }
+    is_dynamic[i] = operand_shape.is_dynamic_dimension(i);
   }
+
   return ShapeUtil::MakeShape(
       ShapeUtil::HigherPrecisionElementType(operand_shape, padding_value_shape),
-      dimensions);
+      dimensions, is_dynamic);
 }
 
 // Current DotDimensionNumbers Requirements:
@@ -620,7 +648,9 @@ Status ValidateDotDimensionNumbers(
     const int64 rhs_contracting_dimension =
         dimension_numbers.rhs_contracting_dimensions(i);
     if (lhs.dimensions(lhs_contracting_dimension) !=
-        rhs.dimensions(rhs_contracting_dimension)) {
+            rhs.dimensions(rhs_contracting_dimension) ||
+        lhs.is_dynamic_dimension(lhs_contracting_dimension) !=
+            rhs.is_dynamic_dimension(rhs_contracting_dimension)) {
       return fail("Contracting dimension sizes do not match.");
     }
   }
@@ -634,7 +664,10 @@ Status ValidateDotDimensionNumbers(
   // Check that batch dimension numbers and sizes match.
   for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
     if (lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
-        rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i)) ||
+        lhs.is_dynamic_dimension(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.is_dynamic_dimension(
+                dimension_numbers.rhs_batch_dimensions(i))) {
       return fail("Batch dimension sizes must match for lhs/rhs.");
     }
   }
@@ -645,14 +678,17 @@ Status ValidateDotDimensionNumbers(
   // Generate the result dimensions in order, rhs dimensions followed by lhs
   // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
+  std::vector<bool> is_dynamic;
   for (int64 lhs_dim : dimension_numbers.lhs_batch_dimensions()) {
     dimensions.push_back(lhs.dimensions(lhs_dim));
+    is_dynamic.push_back(lhs.is_dynamic_dimension(lhs_dim));
   }
   for (int64 i = 0; i < lhs.rank(); i++) {
     if (!absl::c_linear_search(dimension_numbers.lhs_contracting_dimensions(),
                                i) &&
         !absl::c_linear_search(dimension_numbers.lhs_batch_dimensions(), i)) {
       dimensions.push_back(lhs.dimensions(i));
+      is_dynamic.push_back(lhs.is_dynamic_dimension(i));
     }
   }
   for (int64 i = 0; i < rhs.rank(); i++) {
@@ -660,10 +696,11 @@ Status ValidateDotDimensionNumbers(
                                i) &&
         !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(), i)) {
       dimensions.push_back(rhs.dimensions(i));
+      is_dynamic.push_back(rhs.is_dynamic_dimension(i));
     }
   }
   Shape result = ShapeUtil::MakeShape(
-      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions);
+      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions, is_dynamic);
 
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(result));
   VLOG(2) << "inferred dot shape: " << ShapeUtil::HumanString(result);
@@ -681,13 +718,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   // dimension). In that case, the output shape has the non-1 dimension size
   // from the lhs/rhs pair in every index.
   std::vector<int64> output_dimensions(lhs.rank());
+  std::vector<bool> output_dimensions_is_dynamic(lhs.rank());
   for (int64 i = 0; i < lhs.rank(); ++i) {
     if (lhs.dimensions(i) == rhs.dimensions(i)) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else if (lhs.dimensions(i) == 1) {
       output_dimensions[i] = rhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = rhs.is_dynamic_dimension(i);
     } else if (rhs.dimensions(i) == 1) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else {
       return InvalidArgument(
           "Binary op %s with incompatible shapes: %s and %s.",
@@ -696,7 +737,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
   }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              output_dimensions);
+                              output_dimensions, output_dimensions_is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferInDimBroadcastShape(
@@ -775,6 +816,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
     int64 small_dimension_size = smaller_shape.dimensions(i);
     int64 large_dimension_size = larger_shape.dimensions(dimension_to_match);
+    bool small_is_dynamic = smaller_shape.is_dynamic_dimension(i);
+    bool large_is_dynamic =
+        larger_shape.is_dynamic_dimension(dimension_to_match);
     // Dimension sizes must be compatible: match or be degenerate (degenerate
     // case is handled by degenerate dimension broadcasting which occurs after
     // InDim broadcasting).
@@ -786,6 +830,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
           ShapeUtil::HumanString(smaller_shape),
           ShapeUtil::HumanString(larger_shape));
     }
+    if (small_is_dynamic != large_is_dynamic) {
+      if (small_dimension_size == large_dimension_size ||
+          (small_dimension_size == 1 && !small_is_dynamic) ||
+          (large_dimension_size == 1 && !large_is_dynamic)) {
+        // Do nothing. It's OK when the size-1 dimension is not static.
+      } else {
+        return InvalidArgument(
+            "Broadcast dimension %d dynamism mismatch: %s and %s.", i,
+            ShapeUtil::HumanString(smaller_shape),
+            ShapeUtil::HumanString(larger_shape));
+      }
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) {
@@ -795,6 +851,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
 
     output_shape.set_dimensions(dimension_to_match, small_dimension_size);
+    output_shape.set_dynamic_dimension(dimension_to_match, small_is_dynamic);
   }
 
   return output_shape;
@@ -924,12 +981,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
       }
       return InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                            broadcast_dimensions);
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kNe: {
+    case HloOpcode::kCompare: {
       TF_ASSIGN_OR_RETURN(const Shape& shape,
                           InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                                         broadcast_dimensions));
@@ -1227,16 +1279,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RETURN_IF_ERROR(
       ExpectArray(scale_shape, "scale input of batch norm inference"));
 
-  TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) ==
-               Status::OK());
-  TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape) ==
-               Status::OK());
-  TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) ==
-               Status::OK());
-  TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(mean_shape) ==
-               Status::OK());
-  TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape) ==
-               Status::OK());
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(mean_shape));
+  TF_RETURN_IF_ERROR(
+      ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape));
 
   if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
@@ -1661,11 +1709,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
   if (batch_group_count > 1 && input_batch % kernel_output_features != 0) {
     return InvalidArgument(
-        "Expected output feature dimension (value %d) to be divisible by "
-        "input_batch (value %d) for batch group count %d; "
+        "Expected input batch (value %d) to be divisible by output feature "
+        "dimension size (value %d) for batch group count %d; "
         "got <conv>(%s, %s)\n"
         "Dimension numbers: {%s}.",
-        kernel_output_features, input_batch, batch_group_count,
+        input_batch, kernel_output_features, batch_group_count,
         ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
         dnums.DebugString());
   }
@@ -1682,7 +1730,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
         dnums.DebugString());
   }
+
   if (kernel_output_features % feature_group_count > 0) {
+    // A depthwise/grouped filter has the shape
+    // [space0, .. spaceN, GROUP_SIZE, NUM_OUTPUT_FEATURES]. When
+    // [space0, .. spaceN, GROUP_SIZE] is convolved with the input, a shape
+    // [space0, .. spaceN, feature_group_count] is formed. Therefore, the output
+    // feature count (which is equal to kernel output features) has to be a
+    // multiple of feature_group_count.
     return InvalidArgument(
         "Expected output feature dimension (value %d) to be divisible by "
         "feature_group_count (value %d); "
@@ -1731,8 +1786,33 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     dimensions[dnums.output_spatial_dimensions(i)] =
         window_output_shape.dimensions(i);
   }
+  std::vector<bool> is_dynamic(num_dims);
+  for (int i = 0; i < num_dims; i++) {
+    if (lhs.is_dynamic_dimension(i)) {
+      if (i == dnums.input_batch_dimension()) {
+        is_dynamic[dnums.output_batch_dimension()] = true;
+      } else if (i == dnums.input_feature_dimension()) {
+        // Input feature dimension is a contracting dimension, which does not
+        // affect the output dimension size. So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: lhs shape is %s ",
+            lhs.ToString());
+      }
+    }
+    if (rhs.is_dynamic_dimension(i)) {
+      if (i == dnums.kernel_input_feature_dimension()) {
+        // Kernel feature dimension does not affect the output dimension size.
+        // So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: rhs shape is %s ",
+            rhs.ToString());
+      }
+    }
+  }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              dimensions);
+                              dimensions, is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferFftShape(
@@ -1776,6 +1856,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
               fft_length[i]);
         }
       }
+      if (ShapeUtil::IsZeroElementArray(in)) {
+        return in;
+      }
       Shape result = ShapeUtil::ChangeElementType(in, C64);
       result.set_dimensions(result.dimensions_size() - 1,
                             fft_length[fft_rank - 1] / 2 + 1);
@@ -1817,6 +1900,78 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 #undef RET_CHECK_RANK
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferTriangularSolveShape(
+    const Shape& a, const Shape& b, const TriangularSolveOptions& options) {
+  if ((!ShapeUtil::ElementIsFloating(a) && !ShapeUtil::ElementIsComplex(a)) ||
+      a.element_type() != b.element_type()) {
+    return InvalidArgument(
+        "Expected element types in shape to be floating or complex and "
+        "identical for TriangularSolve; got %s and %s.",
+        PrimitiveType_Name(a.element_type()),
+        PrimitiveType_Name(b.element_type()));
+  }
+  if (a.rank() < 2) {
+    return InvalidArgument(
+        "The 'a' argument to TriangularSolve must have rank >= 2, got shape %s",
+        a.ToString());
+  }
+  if (b.rank() != a.rank()) {
+    return InvalidArgument(
+        "Arguments to triangular solve must have equal rank; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (a.dimensions(a.rank() - 2) != a.dimensions(a.rank() - 1)) {
+    return InvalidArgument(
+        "The two minor dimensions of 'a' must have equal size, got %s.",
+        a.ToString());
+  }
+  if (a.dimensions(a.rank() - 1) !=
+      b.dimensions(b.rank() - (options.left_side() ? 2 : 1))) {
+    return InvalidArgument(
+        "The shared dimension of 'a' and 'b' does not match, got shapes %s and "
+        "%s",
+        a.ToString(), b.ToString());
+  }
+  absl::Span<const int64> a_batch_dims(a.dimensions());
+  absl::Span<const int64> b_batch_dims(b.dimensions());
+  a_batch_dims.remove_suffix(2);
+  b_batch_dims.remove_suffix(2);
+  if (a_batch_dims != b_batch_dims) {
+    return InvalidArgument(
+        "The leading batch dimensions of the arguments to triangular solve "
+        "must be equal; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (!TriangularSolveOptions_Transpose_IsValid(options.transpose_a()) ||
+      options.transpose_a() == TriangularSolveOptions::TRANSPOSE_INVALID) {
+    return InvalidArgument(
+        "Invalid transpose option value for triangular solve (%d).\n",
+        options.transpose_a());
+  }
+  return b;
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferCholeskyShape(
+    const Shape& a) {
+  if (!ShapeUtil::ElementIsFloating(a) && !ShapeUtil::ElementIsComplex(a)) {
+    return InvalidArgument(
+        "Expected element type in shape to be floating or complex for "
+        "Cholesky; got %s.",
+        PrimitiveType_Name(a.element_type()));
+  }
+  if (a.rank() < 2) {
+    return InvalidArgument(
+        "The 'a' argument to Cholesky must have rank >= 2, got shape %s",
+        a.ToString());
+  }
+  if (a.dimensions(a.rank() - 2) != a.dimensions(a.rank() - 1)) {
+    return InvalidArgument(
+        "The two minor dimensions of 'a' must have equal size, got %s.",
+        a.ToString());
+  }
+  return a;
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferAllReduceShape(
     absl::Span<const Shape* const> operand_shapes) {
   for (const Shape* operand_shape : operand_shapes) {
@@ -1904,7 +2059,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   for (int64 i = 1; i < num_reduced_args; ++i) {
     if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) {
       return InvalidArgument(
-          "All reduced tensors must have the sime dimension. Tensor 0 has "
+          "All reduced tensors must have the same dimension. Tensor 0 has "
           "shape %s, Tensor %d has shape %s",
           ShapeUtil::HumanString(*reduced_args[0]), i,
           ShapeUtil::HumanString(*reduced_args[i]));
@@ -1933,20 +2088,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   std::set<int64> dimensions_to_reduce_set(dimensions_to_reduce.begin(),
                                            dimensions_to_reduce.end());
   std::vector<int64> new_dimensions;
+  std::vector<bool> new_is_dynamic;
   for (int i = 0; i < arg.rank(); ++i) {
     if (dimensions_to_reduce_set.find(i) == dimensions_to_reduce_set.end()) {
       new_dimensions.push_back(arg.dimensions(i));
+      new_is_dynamic.push_back(arg.is_dynamic_dimension(i));
     }
   }
 
   if (ShapeUtil::IsScalar(to_apply.result())) {
     return ShapeUtil::MakeShape(to_apply.result().element_type(),
-                                new_dimensions);
+                                new_dimensions, new_is_dynamic);
   } else {
     std::vector<Shape> result_subshapes;
     for (const Shape& subshape : to_apply.result().tuple_shapes()) {
-      result_subshapes.push_back(
-          ShapeUtil::MakeShape(subshape.element_type(), new_dimensions));
+      result_subshapes.push_back(ShapeUtil::MakeShape(
+          subshape.element_type(), new_dimensions, new_is_dynamic));
     }
     return ShapeUtil::MakeTupleShape(result_subshapes);
   }
@@ -2020,6 +2177,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(source_shape),
         ShapeUtil::HumanString(window_result_shape));
   }
+
   return operand_shape;
 }
 
@@ -2260,8 +2418,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
     if (operand_shape.rank() != number_of_indices) {
       return InvalidArgument(
-          "Dynamic update slice start number of dimensions %d must match rank "
-          "%d of slice input (%s).",
+          "Dynamic update slice start number of dimensions %d must match "
+          "rank %d of slice input (%s).",
           number_of_indices, operand_shape.rank(),
           ShapeUtil::HumanString(operand_shape));
     }
@@ -2348,7 +2506,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(arg));
   }
 
-  if (index >= arg.tuple_shapes_size()) {
+  if (index < 0 || index >= arg.tuple_shapes_size()) {
     return InvalidArgument(
         "Cannot infer shape: attempt to index out of tuple bounds: %d "
         ">= %d in shape %s.",
@@ -2395,59 +2553,55 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferConditionalShape(
-    const Shape& predicate, const Shape& true_operand,
-    const Shape& false_operand, const ProgramShape& true_computation,
-    const ProgramShape& false_computation) {
-  if (!ShapeUtil::Equal(predicate, ShapeUtil::MakeShape(PRED, {}))) {
-    return InvalidArgument("Predicate must be a boolean; got %s.",
-                           ShapeUtil::HumanString(predicate));
-  }
-
-  if (true_computation.parameters_size() != 1) {
-    return InvalidArgument("true_computation must take 1 argument; got %d.",
-                           true_computation.parameters_size());
-  }
-  if (!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) {
-    auto true_shape_string = [&]() {
-      return StrFormat("true_operand: %s; true_computation: %s",
-                       ShapeUtil::HumanString(true_operand),
-                       ShapeUtil::HumanString(true_computation));
-    };
-    return InvalidArgument(
-        "true_operand must match the shape of the only parameter of "
-        "true_computation: got %s.",
-        true_shape_string());
+    const Shape& branch_index,
+    absl::Span<const ProgramShape> branch_computations,
+    absl::Span<const Shape> branch_operands) {
+  if (!ShapeUtil::Equal(branch_index, ShapeUtil::MakeShape(PRED, {})) &&
+      !ShapeUtil::Equal(branch_index, ShapeUtil::MakeShape(S32, {}))) {
+    return InvalidArgument("branch_index must be bool or int32; got %s.",
+                           ShapeUtil::HumanString(branch_index));
+  }
+  if (branch_index.element_type() == PRED) {
+    TF_RET_CHECK(2 == branch_computations.size());
+  } else {
+    TF_RET_CHECK(!branch_computations.empty());
   }
+  TF_RET_CHECK(branch_computations.size() == branch_operands.size());
 
-  if (false_computation.parameters_size() != 1) {
-    return InvalidArgument("false_computation must take 1 argument; got %d.",
-                           false_computation.parameters_size());
-  }
-  if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) {
-    auto false_shape_string = [&]() {
-      return StrFormat("false_operand: %s; false_computation: %s",
-                       ShapeUtil::HumanString(false_operand),
-                       ShapeUtil::HumanString(false_computation));
-    };
-    return InvalidArgument(
-        "false_operand must match the shape of the only parameter of "
-        "false_computation: got %s.",
-        false_shape_string());
-  }
-  if (!ShapeUtil::Compatible(true_computation.result(),
-                             false_computation.result())) {
-    auto shape_string = [&]() {
-      return StrFormat(
-          "true_computation result: %s; false_computation result: %s.",
-          ShapeUtil::HumanString(true_computation.result()),
-          ShapeUtil::HumanString(false_computation.result()));
-    };
-    return InvalidArgument(
-        "the result of true_computation and false_computation must have the "
-        "same shape: got %s.",
-        shape_string());
+  for (int j = 0; j < branch_computations.size(); ++j) {
+    if (branch_computations[j].parameters_size() != 1) {
+      return InvalidArgument(
+          "branch computation %d must take 1 argument; got %d.", j,
+          branch_computations[j].parameters_size());
+    }
+    if (!ShapeUtil::Compatible(branch_computations[j].parameters(0),
+                               branch_operands[j])) {
+      auto shape_string = [&]() {
+        return StrFormat("operand: %s; computation: %s",
+                         ShapeUtil::HumanString(branch_operands[j]),
+                         ShapeUtil::HumanString(branch_computations[j]));
+      };
+      return InvalidArgument(
+          "branch operand %d must match the shape of the only parameter of "
+          "branch computation %d: got %s.",
+          j, j, shape_string());
+    }
+
+    if (!ShapeUtil::Compatible(branch_computations[0].result(),
+                               branch_computations[j].result())) {
+      auto shape_string = [&]() {
+        return StrFormat(
+            "branch 0 computation result: %s; branch %d computation result: %s",
+            ShapeUtil::HumanString(branch_computations[0].result()), j,
+            ShapeUtil::HumanString(branch_computations[j].result()));
+      };
+      return InvalidArgument(
+          "the result of branch 0 computation and branch %d computation must "
+          "have the same shape: got %s.",
+          j, shape_string());
+    }
   }
-  return true_computation.result();
+  return branch_computations[0].result();
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferBroadcastShape(
@@ -2497,11 +2651,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         operand_shape.dimensions(i) != 1) {
       return InvalidArgument(
           "Input dimension should be either 1 or equal to the output dimension "
-          "it's broadcasting into; the %lldth operand dimension is %lld, the "
+          "it is broadcasting into; the %lldth operand dimension is %lld, the "
           "%lldth output dimension is %lld.",
           i, operand_shape.dimensions(i), broadcast_dimensions[i],
           output_shape.dimensions(broadcast_dimensions[i]));
     }
+    if (operand_shape.is_dynamic_dimension(i) !=
+        output_shape.is_dynamic_dimension(broadcast_dimensions[i])) {
+      return InvalidArgument(
+          "Broadcast input and output dynamism mismatch: %s and %s",
+          operand_shape.ToString(), output_shape.ToString());
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions[i - 1] >= broadcast_dimensions[i]) {
@@ -2544,6 +2704,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         StrJoin(dimensions, ","), ShapeUtil::HumanString(operand));
   }
 
+  std::vector<std::pair<int64, int64>> unmodified_dims =
+      ShapeUtil::DimensionsUnmodifiedByReshape(operand, inferred_shape);
+  for (auto& unmodified : unmodified_dims) {
+    if (operand.is_dynamic_dimension(unmodified.first)) {
+      inferred_shape.set_dynamic_dimension(unmodified.second, true);
+    }
+  }
+
   return inferred_shape;
 }
 
@@ -2551,11 +2719,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& operand, absl::Span<const int64> dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand, "transpose"));
 
-  std::vector<int64> indices(operand.rank());
-  std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != operand.rank() ||
-      !std::is_permutation(dimensions.begin(), dimensions.end(),
-                           indices.begin())) {
+  if (!IsPermutation(dimensions, operand.rank())) {
     return InvalidArgument(
         "Transpose dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
@@ -2617,19 +2781,31 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         "Select's pred operand must have PRED element type; got %s.",
         ShapeUtil::HumanString(pred));
   }
-  if (ShapeUtil::CompatibleIgnoringElementType(pred, on_true) ||
+  if (Shape::Equal()
+          .IgnoreElementType()
+          .IgnoreLayout()
+          .IgnoreDynamicDimension()(pred, on_true) ||
       ShapeUtil::IsScalar(pred)) {
     // By this stage we know that pred's element type is PRED. Therefore, this
     // check restricts pred to be a PRED scalar, or a PRED array with the same
     // dimensions as on_true and on_false.
-    return ShapeUtil::ChangeElementType(
+    Shape inferred_shape = ShapeUtil::ChangeElementType(
         on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
-  } else {
-    return InvalidArgument(
-        "Select operation with non-scalar predicate with dimensionality "
-        " different from the other operands: %s.",
-        ShapeUtil::HumanString(pred));
+
+    // Propagate dynamic dimensions if pred is not a scalar.
+    if (!ShapeUtil::IsScalar(pred)) {
+      for (int i = 0; i < inferred_shape.rank(); i++) {
+        if (pred.is_dynamic_dimension(i)) {
+          inferred_shape.set_dynamic_dimension(i, true);
+        }
+      }
+    }
+    return inferred_shape;
   }
+  return InvalidArgument(
+      "Select operation with non-scalar predicate with dimensionality "
+      "different from the other operands: %s.",
+      ShapeUtil::HumanString(pred));
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTupleSelectShape(
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 7d39ef38e05abf0a81683c1fb0f3999908b27d23..590a664224e6786bf387494139c66a69a43a5247 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -116,6 +116,13 @@ class ShapeInference {
   static StatusOr<Shape> InferFftShape(const Shape& in, FftType fft_type,
                                        absl::Span<const int64> fft_length);
 
+  // Infers the shape produced by the given triangular solve operation.
+  static StatusOr<Shape> InferTriangularSolveShape(
+      const Shape& a, const Shape& b, const TriangularSolveOptions& options);
+
+  // Infers the shape produced by the given triangular solve operation.
+  static StatusOr<Shape> InferCholeskyShape(const Shape& a);
+
   // Infers the shape produced by a cross replica sum with the given operand
   // shapes.
   static StatusOr<Shape> InferAllReduceShape(
@@ -201,11 +208,11 @@ class ShapeInference {
                                          const ProgramShape& body,
                                          const Shape& init);
 
-  // Infers the shape produced by a conditional operation.
+  // Infers the shape produced by a predicated or indexed conditional operation.
   static StatusOr<Shape> InferConditionalShape(
-      const Shape& predicate, const Shape& true_operand,
-      const Shape& false_operand, const ProgramShape& true_computation,
-      const ProgramShape& false_computation);
+      const Shape& branch_index,
+      absl::Span<const ProgramShape> branch_computations,
+      absl::Span<const Shape> branch_operands);
 
   // Infers the shape produced by a broadcast operation.
   static StatusOr<Shape> InferBroadcastShape(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 26120a06b823c9fddf378991cec434a880fb888d..a9cab3f3e694d3b03fbdf57484ca1b584a0b55bf 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -252,7 +252,7 @@ TEST_F(ShapeInferenceTest, ClampBadShapes) {
 
 TEST_F(ShapeInferenceTest, Complex) {
   auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
-                           const absl::Span<const int64>& bcast) {
+                           absl::Span<const int64> bcast) {
     return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs,
                                               bcast);
   };
@@ -896,6 +896,20 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) {
   ASSERT_TRUE(ShapeUtil::Equal(s32_, inferred1_status.ValueOrDie()));
 }
 
+TEST_F(ShapeInferenceTest, InferTupleElementShapeOutOfBound) {
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
+  auto inferredNegative_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, -1);
+  auto inferred2_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, 2);
+  ASSERT_FALSE(inferredNegative_status.ok());
+  ASSERT_FALSE(inferred2_status.ok());
+  EXPECT_THAT(inferredNegative_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+  EXPECT_THAT(inferred2_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+}
+
 TEST_F(ShapeInferenceTest, InferPowShape) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status = ShapeInference::InferBinaryOpShape(
@@ -904,55 +918,10 @@ TEST_F(ShapeInferenceTest, InferPowShape) {
   ASSERT_TRUE(ShapeUtil::Equal(ten_floats, inferred_status.ValueOrDie()));
 }
 
-TEST_F(ShapeInferenceTest, InferCompareShapeEq) {
+TEST_F(ShapeInferenceTest, InferCompareShape) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kEq, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
-                               inferred_status.ValueOrDie()));
-}
-
-TEST_F(ShapeInferenceTest, InferCompareShapeGe) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kGe, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
-                               inferred_status.ValueOrDie()));
-}
-
-TEST_F(ShapeInferenceTest, InferCompareShapeGt) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kGt, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
-                               inferred_status.ValueOrDie()));
-}
-
-TEST_F(ShapeInferenceTest, InferCompareShapeLe) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kLe, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
-                               inferred_status.ValueOrDie()));
-}
-
-TEST_F(ShapeInferenceTest, InferCompareShapeLt) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kLt, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
-                               inferred_status.ValueOrDie()));
-}
-
-TEST_F(ShapeInferenceTest, InferCompareShapeNe) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(HloOpcode::kNe, ten_floats, f32_, {});
+  auto inferred_status = ShapeInference::InferBinaryOpShape(
+      HloOpcode::kCompare, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -1467,6 +1436,14 @@ TEST_F(ShapeInferenceTest, Pad) {
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(
       ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {39, 31}), inferred_shape));
+
+  dimension1->set_edge_padding_low(-20);
+  dimension1->set_edge_padding_high(-10);
+  auto negative_dimension_size = ShapeInference::InferPadShape(
+      input_shape, padding_value_shape, padding_config);
+  ASSERT_FALSE(negative_dimension_size.ok());
+  ASSERT_THAT(negative_dimension_size.status().error_message(),
+              HasSubstr("negative size for dimension 1"));
 }
 
 TEST_F(ShapeInferenceTest, Reverse) {
@@ -1550,79 +1527,176 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
-TEST_F(ShapeInferenceTest, Conditional) {
+TEST_F(ShapeInferenceTest, Rank1Transpose) {
+  Shape a_shape = ShapeUtil::MakeShape(F32, {5});
+  auto inferred_shape_and_status =
+      ShapeInference::InferTransposeShape(a_shape, {0});
+  EXPECT_IS_OK(inferred_shape_and_status);
+  Shape inferred_shape = inferred_shape_and_status.ValueOrDie();
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(inferred_shape, ShapeUtil::MakeShape(F32, {5})));
+}
+
+TEST_F(ShapeInferenceTest, ConditionalPred) {
   auto inferred_status0 = ShapeInference::InferConditionalShape(
-      pred_, vector_32_, vector_64_,
-      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+      {vector_32_, vector_64_});
   EXPECT_IS_OK(inferred_status0.status());
   EXPECT_TRUE(ShapeUtil::Equal(f32_, inferred_status0.ValueOrDie()));
 
   auto inferred_status1 = ShapeInference::InferConditionalShape(
-      pred_, matrix_32_48_, vector_32_,
-      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
-      ShapeUtil::MakeProgramShape({vector_32_}, vector_64_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+       ShapeUtil::MakeProgramShape({vector_32_}, vector_64_)},
+      {matrix_32_48_, vector_32_});
   EXPECT_IS_OK(inferred_status1.status());
   EXPECT_TRUE(ShapeUtil::Equal(vector_64_, inferred_status1.ValueOrDie()));
 
   auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
   auto inferred_status2 = ShapeInference::InferConditionalShape(
-      pred_, matrix_32_48_, tuple_f32_v32,
-      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
-      ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+       ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
+      {matrix_32_48_, tuple_f32_v32});
   EXPECT_IS_OK(inferred_status2.status());
   EXPECT_TRUE(ShapeUtil::Equal(vector_32_, inferred_status2.ValueOrDie()));
 
   auto inferred_status_error0 = ShapeInference::InferConditionalShape(
-      s32_, vector_32_, vector_64_,
-      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+      f32_,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+      {vector_32_, vector_64_});
   EXPECT_FALSE(inferred_status_error0.ok());
   EXPECT_THAT(inferred_status_error0.status().error_message(),
-              HasSubstr("Predicate must be a boolean"));
+              HasSubstr("must be bool or int32"));
 
   auto inferred_status_error1 = ShapeInference::InferConditionalShape(
-      pred_, ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_,
-      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
-      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
+      {ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_});
   EXPECT_FALSE(inferred_status_error1.ok());
   EXPECT_THAT(inferred_status_error1.status().error_message(),
-              HasSubstr("true_computation must take 1 argument"));
+              HasSubstr("branch computation 0 must take 1 argument"));
 
   auto inferred_status_error2 = ShapeInference::InferConditionalShape(
-      pred_, vector_32_, vector_64_,
-      ShapeUtil::MakeProgramShape({vector_64_}, f32_),
-      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+      {vector_32_, vector_64_});
   EXPECT_FALSE(inferred_status_error2.ok());
   EXPECT_THAT(inferred_status_error2.status().error_message(),
-              HasSubstr("true_operand must match the shape of the only "
-                        "parameter of true_computation"));
+              HasSubstr("branch operand 0 must match the shape of the only "
+                        "parameter of branch computation 0"));
 
   auto inferred_status_error3 = ShapeInference::InferConditionalShape(
-      pred_, matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
-      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
-      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+       ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_)},
+      {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_})});
   EXPECT_FALSE(inferred_status_error3.ok());
   EXPECT_THAT(inferred_status_error3.status().error_message(),
-              HasSubstr("false_computation must take 1 argument"));
+              HasSubstr("branch computation 1 must take 1 argument"));
 
   auto inferred_status_error4 = ShapeInference::InferConditionalShape(
-      pred_, vector_32_, vector_64_,
-      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-      ShapeUtil::MakeProgramShape({vector_32_}, f32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
+      {vector_32_, vector_64_});
   EXPECT_FALSE(inferred_status_error4.ok());
   EXPECT_THAT(inferred_status_error4.status().error_message(),
-              HasSubstr("false_operand must match the shape of the only "
-                        "parameter of false_computation"));
+              HasSubstr("branch operand 1 must match the shape of the only "
+                        "parameter of branch computation 1"));
 
   auto inferred_status_error5 = ShapeInference::InferConditionalShape(
-      pred_, vector_32_, vector_64_,
-      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-      ShapeUtil::MakeProgramShape({vector_64_}, vector_32_));
+      pred_,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
+      {vector_32_, vector_64_});
   EXPECT_FALSE(inferred_status_error5.ok());
   EXPECT_THAT(inferred_status_error5.status().error_message(),
-              HasSubstr("the result of true_computation and false_computation "
-                        "must have the same shape"));
+              HasSubstr("the result of branch 0 computation and branch 1 "
+                        "computation must have the same shape"));
+}
+
+TEST_F(ShapeInferenceTest, ConditionalIndexed) {
+  auto r0s32 = ShapeUtil::MakeShape(S32, {});
+  auto inferred_status0 = ShapeInference::InferConditionalShape(
+      r0s32,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+      {vector_32_, vector_64_, vector_64_});
+  EXPECT_IS_OK(inferred_status0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, inferred_status0.ValueOrDie()));
+
+  auto inferred_status1 = ShapeInference::InferConditionalShape(
+      r0s32,
+      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+       ShapeUtil::MakeProgramShape({vector_32_}, vector_64_),
+       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_)},
+      {matrix_32_48_, vector_32_, matrix_32_48_});
+  EXPECT_IS_OK(inferred_status1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, inferred_status1.ValueOrDie()));
+
+  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  auto inferred_status2 = ShapeInference::InferConditionalShape(
+      r0s32, {ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
+      {tuple_f32_v32});
+  EXPECT_IS_OK(inferred_status2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, inferred_status2.ValueOrDie()));
+
+  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
+      pred_,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+      {vector_32_, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_status_error0.ok());
+  EXPECT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("2 == branch_computations.size()"));
+
+  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
+      r0s32,
+      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+       ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
+      {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
+       matrix_32_48_});
+  EXPECT_FALSE(inferred_status_error1.ok());
+  EXPECT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("branch computation 1 must take 1 argument"));
+
+  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
+      r0s32,
+      {ShapeUtil::MakeProgramShape({r0s32}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
+      {r0s32, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_status_error2.ok());
+  EXPECT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("branch operand 2 must match the shape of the only "
+                        "parameter of branch computation 2"));
+
+  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
+      r0s32,
+      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+       ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
+      {vector_32_, vector_32_, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_status_error3.ok());
+  EXPECT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("the result of branch 0 computation and branch 3 "
+                        "computation must have the same shape"));
+
+  auto inferred_status_error4 =
+      ShapeInference::InferConditionalShape(r0s32, {}, {});
+  EXPECT_FALSE(inferred_status_error4.ok());
+  EXPECT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("!branch_computations.empty()"));
 }
 
 TEST_F(ShapeInferenceTest, BadSlice) {
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.cc b/tensorflow/compiler/xla/service/sort_simplifier.cc
index 4a00e8d7b227f14d462ca53f695189f3f48754ee..122366a0f322a66963b364e1b19629cbd2d9aabe 100644
--- a/tensorflow/compiler/xla/service/sort_simplifier.cc
+++ b/tensorflow/compiler/xla/service/sort_simplifier.cc
@@ -14,12 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/sort_simplifier.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/statusor.h"
+
+#include <memory>
+#include <vector>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
 namespace {
@@ -39,8 +42,7 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
     return false;
   }
 
-  // Index 0 is the sorting key used by the sort HLO itself.
-  absl::flat_hash_set<int64> used_indices{0};
+  absl::flat_hash_set<int64> used_indices;
   for (const HloInstruction* user : sort->users()) {
     if (user->opcode() != HloOpcode::kGetTupleElement) {
       // Can't analyse users other then get-tuple-element.
@@ -49,15 +51,25 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
     used_indices.insert(user->tuple_index());
   }
 
+  // Also note which parameters are used by the comparator computation.
+  auto comparator = sort->to_apply();
+  for (int64 i = 0; i < sort->operand_count() * 2; ++i) {
+    if (comparator->parameter_instruction(i)->user_count() > 0) {
+      // operand i corresponds to parameters 2 * i and 2 * i + 1 of the
+      // computation.
+      used_indices.insert(i / 2);
+    }
+  }
+
   if (used_indices.size() == sort->operand_count()) {
     // All operands are used.
     return false;
   }
 
-  std::vector<HloInstruction*> operands{sort->mutable_operand(0)};
-  std::vector<Shape> new_shapes{sort->operand(0)->shape()};
-  for (int64 i = 1; i < sort->operand_count(); ++i) {
-    if (used_indices.count(i)) {
+  std::vector<HloInstruction*> operands;
+  std::vector<Shape> new_shapes;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (used_indices.contains(i)) {
       operands.push_back(sort->mutable_operand(i));
       new_shapes.push_back(sort->operand(i)->shape());
     }
@@ -68,6 +80,32 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
                              : ShapeUtil::MakeTupleShape(new_shapes);
   HloInstruction* new_sort = computation->AddInstruction(
       sort->CloneWithNewOperands(new_sort_shape, operands));
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  int64 parameter_number = 0;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    auto* old_lhs_parameter = comparator->parameter_instruction(i * 2);
+    auto* old_rhs_parameter = comparator->parameter_instruction(i * 2 + 1);
+    if (used_indices.contains(i)) {
+      Shape scalar_shape =
+          ShapeUtil::MakeShape(sort->operand(i)->shape().element_type(), {});
+      replacements[old_lhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".lhs"));
+      ++parameter_number;
+      replacements[old_rhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".rhs"));
+      ++parameter_number;
+    } else {
+      replacements[old_lhs_parameter] = nullptr;
+      replacements[old_rhs_parameter] = nullptr;
+    }
+  }
+  HloModule* module = sort->GetModule();
+  HloComputation* new_compare = module->AddEmbeddedComputation(
+      comparator->CloneWithReplacements(std::move(replacements)));
+  new_sort->set_to_apply(new_compare);
 
   // Map from original get-tuple-element tuple index to new HLO instruction
   absl::flat_hash_map<int64, HloInstruction*> result_map;
@@ -83,7 +121,8 @@ StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
       }
     }
   } else {
-    result_map[0] = new_sort;
+    CHECK_EQ(used_indices.size(), 1);
+    result_map[*used_indices.begin()] = new_sort;
   }
   std::vector<HloInstruction*> users(sort->users().begin(),
                                      sort->users().end());
diff --git a/tensorflow/compiler/xla/service/sort_simplifier_test.cc b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
index cd05fcf830d32e8bac4f8b260d3dd143ab98ad7b..284d50952776bd21c8131cbec7de3e4b6692ebc5 100644
--- a/tensorflow/compiler/xla/service/sort_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
@@ -34,13 +34,21 @@ TEST_F(SortSimplifierTest, RemoveUnusedSortOperandArrayResult) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} parameter(1)
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
-        dimensions={1}
-      ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
@@ -58,17 +66,27 @@ TEST_F(SortSimplifierTest, RemoveUnusedSortOperandTuple) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,87] parameter(0)
-      values.0 = s32[64,87] parameter(1)
-      values.1 = u32[64,87] parameter(2)
-      sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
-          keys, values.0, values.1),
-        dimensions={1}
-      gte.0 = f32[64,87] get-tuple-element(sort), index=0
-      gte.1 = u32[64,87] get-tuple-element(sort), index=2
-      ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     p.2.lhs = u32[] parameter(4)
+     p.2.rhs = u32[] parameter(5)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,87] parameter(0)
+     values.0 = s32[64,87] parameter(1)
+     values.1 = u32[64,87] parameter(2)
+     sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
+         keys, values.0, values.1),
+       dimensions={1}, to_apply=compare
+     gte.0 = f32[64,87] get-tuple-element(sort), index=0
+     gte.1 = u32[64,87] get-tuple-element(sort), index=2
+     ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
@@ -86,17 +104,57 @@ TEST_F(SortSimplifierTest, DontRemoveUnusedSortKey) {
   const char* hlo_string = R"(
    HloModule permutation_sort
 
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} parameter(1)
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-    })";
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
   SortSimplifier simplifier;
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
+
+TEST_F(SortSimplifierTest, RemoveUnusedFirstOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  uint64 num_executions = 0;
+  do {
+    num_executions++;
+  } while (simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(num_executions, 2);
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(1))));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.cc b/tensorflow/compiler/xla/service/stable_sort_expander.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae4ce32569a3e3aa56100837621994336445f088
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.cc
@@ -0,0 +1,204 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Looks for a iota operand that can be used as tie breaker in the computation.
+// If no matching iota operand is found, a iota operand is added to Sort. The
+// comparison computation is adjusted to break ties using the values from the
+// iota operand.
+StatusOr<HloInstruction*> StableSortExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  auto* sort = Cast<HloSortInstruction>(instruction);
+  HloComputation* computation = sort->parent();
+
+  HloInstruction* expanded_sort = nullptr;
+  absl::flat_hash_set<int64> used_indices;
+  int64 iota_index = -1;
+  for (const HloInstruction* operand : sort->operands()) {
+    // We can only use the iota operand if it has an iota dimension which is the
+    // same as the dimension to sort. Also it should have an integral type that
+    // is large enough for the number of elements in the sort dimension. For
+    // now, we only allow S32, because we expect to find a S32 iota operand for
+    // all Sort ops which are created by TopK.
+    // TODO(b/122298745): Also support other types.
+    if (operand->opcode() == HloOpcode::kIota &&
+        Cast<HloIotaInstruction>(operand)->iota_dimension() ==
+            sort->sort_dimension() &&
+        operand->shape().element_type() == S32) {
+      iota_index = sort->operand_index(operand);
+      break;
+    }
+  }
+
+  // If there is currently no iota operand which we could use for making the
+  // sort stable, we will have to add a new such operand.
+  if (iota_index == -1) {
+    Shape iota_shape = sort->operand(0)->shape();
+    // We might need to use S64 if the number of elements in the sort dimension
+    // is bigger than 2^31 - 1.
+    // TODO(b/122298745): Handle Sort ops where S32 is too small for the number
+    // of elements in the sort dimension.
+    if (iota_shape.dimensions(sort->sort_dimension()) >
+        std::numeric_limits<int32>::max()) {
+      return Unimplemented(
+          "Stable sorting of more than 2^31-1 elements is not implemented");
+    }
+    iota_shape.set_element_type(S32);
+    auto iota = computation->AddInstruction(
+        HloInstruction::CreateIota(iota_shape, sort->sort_dimension()));
+
+    // Create a new comparator.
+    auto comparator = sort->to_apply();
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+        replacements;
+    std::vector<std::unique_ptr<HloInstruction>> extra_parameters;
+    std::vector<HloInstruction*> extra_parameter_ptrs;
+    Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".lhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2 + 1, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".rhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    sort->set_to_apply(sort->GetModule()->AddEmbeddedComputation(
+        comparator->CloneWithReplacements(std::move(replacements),
+                                          extra_parameter_ptrs)));
+
+    // Replace the original sort op.
+    std::vector<HloInstruction*> new_operands(sort->operands().begin(),
+                                              sort->operands().end());
+    new_operands.push_back(iota);
+    std::vector<Shape> new_shapes = sort->operand_count() == 1
+                                        ? std::vector<Shape>{sort->shape()}
+                                        : sort->shape().tuple_shapes();
+    new_shapes.push_back(iota_shape);
+    Shape new_sort_shape = ShapeUtil::MakeTupleShape(new_shapes);
+    HloInstruction* new_sort = computation->AddInstruction(
+        sort->CloneWithNewOperands(new_sort_shape, new_operands));
+
+    // Add a "wrapper" around the new sort op to make sure we have the same
+    // shape as before. For the rank 1 case, we only need a GetTupleElement,
+    // otherwise we create a Tuple consisting of GetTupleElements of the new
+    // sort.
+    std::vector<HloInstruction*> tuple_elements;
+    tuple_elements.reserve(sort->operand_count());
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      tuple_elements.push_back(
+          computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+              sort->operand(i)->shape(), new_sort, i)));
+    }
+    expanded_sort = tuple_elements[0];
+    if (tuple_elements.size() > 1) {
+      expanded_sort = computation->AddInstruction(
+          HloInstruction::CreateTuple(tuple_elements));
+    }
+    sort = Cast<HloSortInstruction>(new_sort);
+    iota_index = sort->operand_count() - 1;
+  }
+
+  // Modify the computation to break ties using the iota operand.
+  auto comparator = sort->to_apply();
+  std::vector<HloInstruction*> instructions_postorder =
+      comparator->MakeInstructionPostOrder();
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> replacements;
+  // Look up instr in the replacements map, and return either the replacement,
+  // or instr, if the replacement isn't present.
+  auto replace = [&](HloInstruction* instr) {
+    auto it = replacements.find(instr);
+    if (it == replacements.end()) {
+      return instr;
+    }
+    return it->second;
+  };
+  HloInstruction* old_root = comparator->root_instruction();
+  // The comparison computation gets 2 * n parameters (n being the number of
+  // operands of Sort), where parameters 2 * i and 2 * i + 1 correspond to two
+  // different scalars of operand i of Sort which are to be compared. The
+  // comparison computation should induce a strict weak order, so if
+  // to_apply(p1.lhs, p1.rhs, ..., pn.lhs, pn.rhs) is equal to
+  // to_apply(p1.rhs, p1.lhs, ..., pn.rhs, pn.lhs), we can conclude that the
+  // values to be compared are equivalent, and perform a tie-breaker comparison.
+  //
+  // We clone each instruction with at least one operand, but use as new
+  // operands of the instruction the replacements of the original operands.
+  // Parameter 2 * i is replaced by parameter 2 * i + 1 and vice versa. This
+  // should make sure that the cloned root instruction gives the result of the
+  // comparison computation when being called with each scalar pair reversed.
+  // parameters corresponding to the iota operand.
+  for (int64 i = 0; i < comparator->num_parameters(); ++i) {
+    replacements[comparator->parameter_instruction(i)] =
+        comparator->parameter_instruction(i ^ 1);
+  }
+  HloInstruction* cloned_root = nullptr;
+  for (HloInstruction* inst : instructions_postorder) {
+    if (inst->operand_count() == 0) {
+      continue;
+    }
+    std::vector<HloInstruction*> new_operands;
+    new_operands.reserve(inst->operand_count());
+    for (HloInstruction* operand : inst->operands()) {
+      new_operands.push_back(replace(operand));
+    }
+    auto new_instruction =
+        inst->CloneWithNewOperands(inst->shape(), new_operands);
+    replacements[inst] = new_instruction.get();
+    if (inst == old_root) {
+      cloned_root = new_instruction.get();
+    }
+    comparator->AddInstruction(std::move(new_instruction));
+  }
+  CHECK_NE(cloned_root, nullptr);
+  Shape scalar_pred = ShapeUtil::MakeShape(PRED, {});
+  HloInstruction* same =
+      comparator->AddInstruction(HloInstruction::CreateCompare(
+          scalar_pred, old_root, cloned_root, ComparisonDirection::kEq));
+  HloInstruction* tie_breaker =
+      comparator->AddInstruction(HloInstruction::CreateCompare(
+          scalar_pred, comparator->parameter_instruction(2 * iota_index),
+          comparator->parameter_instruction(2 * iota_index + 1),
+          ComparisonDirection::kLt));
+  HloInstruction* new_root =
+      comparator->AddInstruction(HloInstruction::CreateTernary(
+          ShapeUtil::MakeShape(PRED, {}), HloOpcode::kSelect, same, tie_breaker,
+          old_root));
+  comparator->set_root_instruction(new_root);
+
+  return expanded_sort;
+}
+
+bool StableSortExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kSort &&
+         Cast<HloSortInstruction>(instruction)->is_stable();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.h b/tensorflow/compiler/xla/service/stable_sort_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..31b6fd92d25370218017c58072f1aa5e64df00c3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which expands Sort ops that have the is_stable field set to true
+// into equivalent Sort ops which guarantee stable sorting without relying on
+// the is_stable field.
+class StableSortExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override { return "stable-sort-expander"; }
+
+ private:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander_test.cc b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61fb4392a32b73e912b6878b0aceed0f4e88a140
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
@@ -0,0 +1,359 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using StableSortExpanderTest = HloTestBase;
+
+// Checks whether 'a' and 'b' are roots of equivalent computations, except that
+// parameters 2 * i and 2 * i + 1 are switched.
+bool IsSameComputationExceptParams(const HloInstruction* a,
+                                   const HloInstruction* b) {
+  if (a->opcode() != b->opcode() || a->operand_count() != b->operand_count()) {
+    return false;
+  }
+  if (a->opcode() == HloOpcode::kParameter) {
+    // Check that parameters were switched.
+    return a->parameter_number() == (b->parameter_number() ^ 1);
+  }
+  // If the operation has no operands, it should actually be the same.
+  if (a->operand_count() == 0) {
+    return a == b;
+  }
+  // Otherwise recursively compare all operands.
+  for (int64 i = 0; i < a->operand_count(); ++i) {
+    if (!IsSameComputationExceptParams(a->operand(i), b->operand(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Check that the comparison computation has been modified to add a tie breaker
+// using 'iota_parameter'.
+void CheckComputationHasTieBreaker(const HloInstruction* root,
+                                   int64 iota_parameter) {
+  // With the tie breaker, the root instruction should be
+  //   Select(Eq(Comp(), CompReverse()), Lt(), Comp())
+  // with Comp() being the original comparison function, and CompReverse() being
+  // the copied comparison function where the parameters are reversed. Lt() is
+  // the tie breaker comparison using the Iota operand.
+  ASSERT_EQ(root->opcode(), HloOpcode::kSelect);
+  ASSERT_EQ(root->operand(0)->opcode(), HloOpcode::kCompare);
+  ASSERT_EQ(root->operand(0)->comparison_direction(), ComparisonDirection::kEq);
+
+  // Check that the tie breaker instruction is correct.
+  EXPECT_THAT(root->operand(1),
+              GmockMatch(m::Lt(m::Parameter(iota_parameter * 2),
+                               m::Parameter(iota_parameter * 2 + 1))));
+  EXPECT_EQ(root->operand(2), root->operand(0)->operand(0));
+
+  // Check that Comp() and CompReverse() are equivalent except that
+  // CompReverse() has reversed parameters.
+  EXPECT_TRUE(IsSameComputationExceptParams(root->operand(0)->operand(0),
+                                            root->operand(0)->operand(1)));
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortReuseIotaOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortReuseIotaOperandComplicatedComparison) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     max = u32[] constant(2147483647)
+     zero = s32[] constant(0)
+     lhs.signed = s32[] bitcast-convert(p.0.lhs)
+     lhs.unsigned = u32[] bitcast-convert(p.0.lhs)
+     lhs.flipped = u32[] subtract(max, lhs.unsigned)
+     lhs.flipped.signed = s32[] bitcast-convert(lhs.flipped)
+     lhs.is_negative = pred[] compare(lhs.flipped.signed, zero), direction=LT
+     lhs.converted = s32[] select(lhs.is_negative, lhs.flipped.signed, lhs.signed)
+     rhs.signed = s32[] bitcast-convert(p.0.rhs)
+     rhs.unsigned = u32[] bitcast-convert(p.0.rhs)
+     rhs.flipped = u32[] subtract(max, rhs.unsigned)
+     rhs.flipped.signed = s32[] bitcast-convert(rhs.flipped)
+     rhs.is_negative = pred[] compare(rhs.flipped.signed, zero), direction=LT
+     rhs.converted = s32[] select(rhs.is_negative, rhs.flipped.signed, rhs.signed)
+     ROOT lt = pred[] compare(lhs.converted, rhs.converted), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortAddIotaOperandAndChangeRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     ROOT sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root, GmockMatch(m::Tuple(
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 0),
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 1))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, HonorIsStableFlag) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=false
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_FALSE(stabilizer.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortDontReuseIotaOperandWrongDimension) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=0
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortDontReuseIotaOperandWrongType) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = f32[] parameter(2)
+     p.1.rhs = f32[] parameter(3)
+     ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = f32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] compare(lhs, rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     ROOT sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1NoRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] compare(lhs, rhs), direction=LT
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+     ROOT neg = s32[64,8732]{1,0} negate(sort)
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Negate(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/1);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 15ef623cc7b2dbc31e9cba5c4783c39b8805a5aa..29a9d26ce2ccdd9fa4352b81c2be14950725124e 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -42,8 +42,11 @@ TransferManager::GetPlatformTransferManagers() {
   return r;
 }
 
+TransferManager::TransferMetadata::~TransferMetadata() {}
+
 StatusOr<Literal> TransferManager::TransferLiteralFromDevice(
-    se::Stream* stream, const ShapedBuffer& device_buffer) {
+    se::Stream* stream, const ShapedBuffer& device_buffer,
+    const TransferMetadata* transfer_metadata) {
   StatusOr<Literal> ret;
 
   se::Stream* substream = stream->GetOrCreateSubStream();
@@ -54,11 +57,13 @@ StatusOr<Literal> TransferManager::TransferLiteralFromDevice(
   tensorflow::Notification n;
   Status s;
   Literal literal(device_buffer.on_host_shape());
-  TransferLiteralFromDevice(substream, device_buffer, literal,
-                            [&](Status status) {
-                              s = status;
-                              n.Notify();
-                            });
+  TransferLiteralFromDevice(
+      substream, device_buffer, literal,
+      [&](Status status) {
+        s = status;
+        n.Notify();
+      },
+      transfer_metadata);
   n.WaitForNotification();
   if (!s.ok()) {
     return s;
@@ -68,25 +73,29 @@ StatusOr<Literal> TransferManager::TransferLiteralFromDevice(
 
 Status TransferManager::TransferLiteralFromDevice(
     se::Stream* stream, const ShapedBuffer& device_buffer,
-    const MutableBorrowingLiteral& literal) {
+    const MutableBorrowingLiteral& literal,
+    const TransferMetadata* transfer_metadata) {
   se::Stream* substream = stream->GetOrCreateSubStream();
   auto cleanup = tensorflow::gtl::MakeCleanup(
       [&]() { stream->ReturnSubStream(substream); });
 
   Status ret;
   tensorflow::Notification n;
-  TransferLiteralFromDevice(substream, device_buffer, literal,
-                            [&](Status status) {
-                              ret = status;
-                              n.Notify();
-                            });
+  TransferLiteralFromDevice(
+      substream, device_buffer, literal,
+      [&](Status status) {
+        ret = status;
+        n.Notify();
+      },
+      transfer_metadata);
   n.WaitForNotification();
   return ret;
 }
 
 Status TransferManager::TransferLiteralToDevice(
     se::Stream* stream, const LiteralSlice& literal,
-    const ShapedBuffer& device_buffer) {
+    const ShapedBuffer& device_buffer,
+    const TransferMetadata* transfer_metadata) {
   // Implement the synchronous version by waiting on the asynchronous version.
   // Use a substream so that if we are called from a HostCallback we don't
   // deadlock.
@@ -94,14 +103,14 @@ Status TransferManager::TransferLiteralToDevice(
   substream->ThenWaitFor(stream);
   auto cleanup = tensorflow::gtl::MakeCleanup(
       [&]() { stream->ReturnSubStream(substream); });
-  TF_RETURN_IF_ERROR(
-      TransferLiteralToDeviceAsync(substream, literal, device_buffer));
+  TF_RETURN_IF_ERROR(TransferLiteralToDeviceAsync(
+      substream, literal, device_buffer, transfer_metadata));
   return substream->BlockHostUntilDone();
 }
 
 StatusOr<Literal> TransferManager::TransferArrayFromDevice(
-    se::Stream* stream, const Shape& shape,
-    const se::DeviceMemoryBase& source) {
+    se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source,
+    const TransferMetadata* transfer_metadata) {
   StatusOr<Literal> ret;
   // Implement the synchronous version by waiting on the asynchronous version.
   // Use a substream so that if we are called from a HostCallback we don't
@@ -113,11 +122,13 @@ StatusOr<Literal> TransferManager::TransferArrayFromDevice(
   tensorflow::Notification n;
   Literal literal(shape);
   Status s;
-  TransferArrayFromDevice(substream, shape, source, literal,
-                          [&](Status status) {
-                            s = status;
-                            n.Notify();
-                          });
+  TransferArrayFromDevice(
+      substream, shape, source, literal,
+      [&](Status status) {
+        s = status;
+        n.Notify();
+      },
+      transfer_metadata);
   n.WaitForNotification();
   if (!s.ok()) {
     return s;
@@ -127,20 +138,23 @@ StatusOr<Literal> TransferManager::TransferArrayFromDevice(
 
 Status TransferManager::TransferArrayToDevice(
     se::Stream* stream, const LiteralSlice& literal,
-    const se::DeviceMemoryBase& dest) {
+    const se::DeviceMemoryBase& dest,
+    const TransferMetadata* transfer_metadata) {
   // Implement the synchronous version by waiting on the asynchronous version.
   // Use a substream so that if we are called from a HostCallback we don't
   // deadlock.
   se::Stream* substream = stream->GetOrCreateSubStream();
   auto cleanup = tensorflow::gtl::MakeCleanup(
       [&]() { stream->ReturnSubStream(substream); });
-  TF_RETURN_IF_ERROR(TransferArrayToDeviceAsync(substream, literal, dest));
+  TF_RETURN_IF_ERROR(
+      TransferArrayToDeviceAsync(substream, literal, dest, transfer_metadata));
   return substream->BlockHostUntilDone();
 }
 
 Status TransferManager::TransferArrayToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
-    const se::DeviceMemoryBase& dest) {
+    const se::DeviceMemoryBase& dest,
+    const TransferMetadata* transfer_metadata) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
   TF_RET_CHECK(on_device_shape.IsArray())
       << "On-device representation of "
@@ -156,12 +170,14 @@ Status TransferManager::TransferArrayToDeviceAsync(
                              stream->parent()->platform(),
                              stream->parent()->device_ordinal());
   shaped_buffer.set_buffer(dest, /*index=*/{});
-  return TransferLiteralToDevice(stream, literal, shaped_buffer);
+  return TransferLiteralToDevice(stream, literal, shaped_buffer,
+                                 transfer_metadata);
 }
 
 void TransferManager::TransferArrayFromDevice(
     se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source,
-    const MutableBorrowingLiteral& literal, std::function<void(Status)> done) {
+    const MutableBorrowingLiteral& literal, std::function<void(Status)> done,
+    const TransferMetadata* transfer_metadata) {
   if (!ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) {
     auto error = StrCat("Shape ", ShapeUtil::HumanString(shape),
                         " has a differently shaped representation on-device: ",
@@ -179,7 +195,7 @@ void TransferManager::TransferArrayFromDevice(
                              stream->parent()->device_ordinal());
   shaped_buffer.set_buffer(source, /*index=*/{});
   return TransferLiteralFromDevice(stream, shaped_buffer, literal,
-                                   std::move(done));
+                                   std::move(done), transfer_metadata);
 }
 
 /* static */ void TransferManager::RegisterTransferManager(
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 43a50487c636da75224547286a31625db3f91330..2a934563b0b0a0cc095ca6a964a8f00de39f161b 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -52,16 +52,38 @@ class TransferManager {
     return host_shape;
   }
 
+  // Base class for specifying platform specific transfer metadata that can be
+  // used to tell the underlying implementation to perform specific optimization
+  // to a transfer. Actual metadata passed to supported transfer methods should
+  // subclass this class.
+  class TransferMetadata {
+   public:
+    virtual ~TransferMetadata() = 0;
+  };
   // Returns a literal containing the data held in the given ShapedBuffer
   // using the provided executor. This operation is performed synchronously
   // without waiting for any other operation on a stream to complete.
   //
   // This function should be avoided in favor of the asynchronous version below.
+  //
+  // Optionally caller can specify platform-specific transfer metadata that
+  // tells the actual implementation to do something special.
   virtual StatusOr<Literal> TransferLiteralFromDevice(
-      se::Stream* stream, const ShapedBuffer& device_buffer);
+      se::Stream* stream, const ShapedBuffer& device_buffer,
+      const TransferMetadata* transfer_metadata);
+  StatusOr<Literal> TransferLiteralFromDevice(
+      se::Stream* stream, const ShapedBuffer& device_buffer) {
+    return TransferLiteralFromDevice(stream, device_buffer, nullptr);
+  }
   virtual Status TransferLiteralFromDevice(
       se::Stream* stream, const ShapedBuffer& device_buffer,
-      const MutableBorrowingLiteral& literal);
+      const MutableBorrowingLiteral& literal,
+      const TransferMetadata* transfer_metadata);
+  Status TransferLiteralFromDevice(se::Stream* stream,
+                                   const ShapedBuffer& device_buffer,
+                                   const MutableBorrowingLiteral& literal) {
+    return TransferLiteralFromDevice(stream, device_buffer, literal, nullptr);
+  }
 
   // Begins transferring a literal containing the data held in the given
   // ShapedBuffer using the provided executor.
@@ -72,10 +94,20 @@ class TransferManager {
   //
   // device_buffer is copied by reference and must live at least until done() is
   // invoked.
-  virtual void TransferLiteralFromDevice(se::Stream* stream,
-                                         const ShapedBuffer& device_buffer,
-                                         MutableBorrowingLiteral literal,
-                                         std::function<void(Status)> done) = 0;
+  //
+  // Optionally caller can specify platform-specific transfer metadata that
+  // tells the actual implementation to do something special.
+  virtual void TransferLiteralFromDevice(
+      se::Stream* stream, const ShapedBuffer& device_buffer,
+      MutableBorrowingLiteral literal, std::function<void(Status)> done,
+      const TransferMetadata* transfer_metadata) = 0;
+  void TransferLiteralFromDevice(se::Stream* stream,
+                                 const ShapedBuffer& device_buffer,
+                                 MutableBorrowingLiteral literal,
+                                 std::function<void(Status)> done) {
+    return TransferLiteralFromDevice(stream, device_buffer, literal, done,
+                                     nullptr);
+  }
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
@@ -85,9 +117,18 @@ class TransferManager {
   // This operation is performed synchronously without waiting for any other
   // operation on a stream to complete. This function should be avoided in favor
   // of the asynchronous version below.
-  virtual Status TransferLiteralToDevice(se::Stream* stream,
-                                         const LiteralSlice& literal,
-                                         const ShapedBuffer& device_buffer);
+  //
+  // Optionally caller can specify platform-specific transfer metadata that
+  // tells the actual implementation to do something special.
+  virtual Status TransferLiteralToDevice(
+      se::Stream* stream, const LiteralSlice& literal,
+      const ShapedBuffer& device_buffer,
+      const TransferMetadata* transfer_metadata);
+  Status TransferLiteralToDevice(se::Stream* stream,
+                                 const LiteralSlice& literal,
+                                 const ShapedBuffer& device_buffer) {
+    return TransferLiteralToDevice(stream, literal, device_buffer, nullptr);
+  }
 
   // Transfers the given literal into the previously allocated device memory
   // represented by the given ShapedBuffer using the given executor. The shape
@@ -102,26 +143,44 @@ class TransferManager {
   // immediately after this function returns, however their constituent buffers
   // on both host and device must remain valid until the enqueued transfer has
   // completed on 'stream'.
+  //
+  // Optionally caller can specify platform-specific transfer metadata that
+  // tells the actual implementation to do something special.
   virtual Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
-      const ShapedBuffer& device_buffer) = 0;
+      const ShapedBuffer& device_buffer,
+      const TransferMetadata* transfer_metadata) = 0;
+  Status TransferLiteralToDeviceAsync(se::Stream* stream,
+                                      const LiteralSlice& literal,
+                                      const ShapedBuffer& device_buffer) {
+    return TransferLiteralToDeviceAsync(stream, literal, device_buffer,
+                                        nullptr);
+  }
 
   // Convenience methods for transferring an array to or from the device at a
   // known address. This avoids having to construct a ShapedBuffer just to
   // transfer an array at a known address.
-  Status TransferArrayToDevice(se::Stream* stream, const LiteralSlice& literal,
-                               const se::DeviceMemoryBase& dest);
-  void TransferArrayFromDevice(se::Stream* stream, const Shape& shape,
-                               const se::DeviceMemoryBase& source,
-                               const MutableBorrowingLiteral& literal,
-                               std::function<void(Status)> done);
-
-  Status TransferArrayToDeviceAsync(se::Stream* stream,
-                                    const LiteralSlice& literal,
-                                    const se::DeviceMemoryBase& dest);
-  StatusOr<Literal> TransferArrayFromDevice(se::Stream* stream,
-                                            const Shape& shape,
-                                            const se::DeviceMemoryBase& source);
+  //
+  // Optionally caller can specify platform-specific transfer metadata that
+  // tells the actual implementation to do something special.
+  Status TransferArrayToDevice(
+      se::Stream* stream, const LiteralSlice& literal,
+      const se::DeviceMemoryBase& dest,
+      const TransferMetadata* transfer_metadata = nullptr);
+  void TransferArrayFromDevice(
+      se::Stream* stream, const Shape& shape,
+      const se::DeviceMemoryBase& source,
+      const MutableBorrowingLiteral& literal, std::function<void(Status)> done,
+      const TransferMetadata* transfer_metadata = nullptr);
+
+  Status TransferArrayToDeviceAsync(
+      se::Stream* stream, const LiteralSlice& literal,
+      const se::DeviceMemoryBase& dest,
+      const TransferMetadata* transfer_metadata = nullptr);
+  StatusOr<Literal> TransferArrayFromDevice(
+      se::Stream* stream, const Shape& shape,
+      const se::DeviceMemoryBase& source,
+      const TransferMetadata* transfer_metadata = nullptr);
 
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
similarity index 81%
rename from tensorflow/compiler/xla/client/lib/triangular_solve.cc
rename to tensorflow/compiler/xla/service/triangular_solve_expander.cc
index c2f31742e9eff9f325fb71160b4ec3aea928d15e..790074ab834cd057a54b55c150a208e2b49d67b6 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 
 #include <memory>
 #include <vector>
@@ -33,6 +33,8 @@ limitations under the License.
 
 namespace xla {
 
+namespace {
+
 // Get the diagonal blocks of the coefficient matrix
 XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
   XlaBuilder* builder = a.builder();
@@ -140,9 +142,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // zero (which can happen if the last block was padded) otherwise it will
     // introduce nans which will propagate
     auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
-    auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
+    auto ones = FullLike(diags, 1);
     diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
     auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
 
@@ -347,9 +347,10 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
   });
 }
 
-XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
-                      bool transpose_a, bool conjugate_a, int64 block_size,
-                      PrecisionConfig::Precision precision) {
+XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                           bool transpose_a, bool conjugate_a,
+                           bool unit_diagonal, int64 block_size,
+                           PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
@@ -402,12 +403,29 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
           block_size);
     }
 
+    block_size = std::max(
+        int64{1}, std::min(block_size, ShapeUtil::GetDimension(a_shape, -1)));
+
     if (ShapeUtil::IsZeroElementArray(b_shape)) {
       // The output has the same shape as 'b', and since the output has zero
       // elements, any such array will do.
       return b;
     }
 
+    // TODO(phawkins): consider pushing triangle masking into
+    // InvertDiagonalBlocks.
+    if (unit_diagonal) {
+      // Mask everything but the subdiagonal/superdiagonal elements.
+      a = lower ? Select(TriangleMask(a, -1), a, ZerosLike(a))
+                : Select(TriangleMask(a, 0), ZerosLike(a), a);
+      int64 k = ShapeUtil::GetDimension(a_shape, -1);
+      a = xla::Add(a, IdentityMatrix(builder, a_shape.element_type(), k, k),
+                   /*broadcast_dimensions=*/{ndims - 2, ndims - 1});
+    } else {
+      // Mask off the ignored elements of the triangular matrix a.
+      a = Triangle(a, lower);
+    }
+
     // We find the diagonal blocks of the coefficient matrix
     auto diag_blocks = DiagonalBlocks(a, block_size);
 
@@ -415,11 +433,6 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
     auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
                                                 conjugate_a, precision);
 
-    // Mask off the ignored elements of the triangular matrix a.
-    // TODO(phawkins): it would probably be preferable to perform this masking
-    // block by block inside SolveWithInvertedDiagonalBlocks.
-    a = Triangle(a, lower);
-
     // We now find the solution using GEMMs
     auto x =
         SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
@@ -429,4 +442,66 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
   });
 }
 
+}  // namespace
+
+bool TriangularSolveExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kTriangularSolve;
+}
+
+StatusOr<HloInstruction*> TriangularSolveExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  const TriangularSolveOptions& options =
+      instruction->triangular_solve_options();
+  const string name = absl::StrFormat(
+      "xla.triangular_solve_%s_%s_%s_%s_%s_%s",
+      instruction->operand(0)->shape().ToString(),
+      instruction->operand(1)->shape().ToString(),
+      options.left_side() ? "left" : "right",
+      options.lower() ? "lower" : "upper",
+      TriangularSolveOptions_Transpose_Name(options.transpose_a()),
+      options.unit_diagonal() ? "unit" : "nonunit");
+
+  HloModule* module = instruction->parent()->parent();
+
+  HloComputation*& computation =
+      computation_cache_.emplace(name, nullptr).first->second;
+  if (!computation) {
+    // Builds a new expansion.
+    //
+    // We do something unusual here: we build the computation using the
+    // XlaBuilder API, which is nominally an XLA client API. We do this because
+    // the external APIs for building complicated computations (XlaBuilder)
+    // are much more ergonomic than the internal ones. As it turns out,
+    // XlaBuilder isn't really a client API—what it does is build a
+    // HloModuleProto protocol buffer, that we can then deserialize and clone
+    // into our HloModule. Ideally we would avoid the protocol buffer step;
+    // that is left as an exercise for future work.
+    XlaBuilder builder(name);
+    XlaOp a = Parameter(&builder, 0, instruction->operand(0)->shape(), "a");
+    XlaOp b = Parameter(&builder, 1, instruction->operand(1)->shape(), "b");
+    bool transpose_a =
+        options.transpose_a() != TriangularSolveOptions::NO_TRANSPOSE;
+    bool conjugate_a = options.transpose_a() == TriangularSolveOptions::ADJOINT;
+
+    BuildTriangularSolve(a, b, options.left_side(), options.lower(),
+                         transpose_a, conjugate_a, options.unit_diagonal(),
+                         /*block_size=*/128,
+                         /*precision=*/PrecisionConfig::HIGHEST);
+    TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build());
+
+    TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                        xla_computation.GetProgramShape());
+    HloModuleConfig config(program_shape);
+    TF_ASSIGN_OR_RETURN(auto new_module, HloModule::CreateFromProto(
+                                             xla_computation.proto(), config));
+    HloCloneContext context(module);
+    computation =
+        module->DeepCloneComputation(new_module->entry_computation(), &context);
+  }
+
+  return instruction->parent()->AddInstruction(HloInstruction::CreateCall(
+      instruction->shape(), instruction->operands(), computation));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..be2374ef8c86254d8db5ac1acac385aa0de7d3a5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+namespace xla {
+
+class TriangularSolveExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override {
+    return "triangular_solve_expander";
+  }
+
+ protected:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+
+ private:
+  // Mapping from op signatures to existing computations.
+  absl::flat_hash_map<string, HloComputation*> computation_cache_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 5e505aaf02f157d0cba9dff42b1a9b89a6691504..cc82e9bb0287b5a586fb21fee35d3124a6d6f121 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -699,6 +699,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
 // (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
 //     0.
 // (5) The 'user' of 'operand' is Sort, and it is the only user.
+// (6) The 'user' of 'operand' is TriangularSolve, it is the second operand,
+//     and it is the only user.
 //
 // (2) and (3) can only be determined if points-to analysis is available.
 bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
@@ -779,6 +781,14 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && user_index[0] == operand_indices[0];
   }
+  if (user->opcode() == HloOpcode::kTriangularSolve) {
+    // Only valid if there are no other users.
+    if (operand->users().size() != 1) {
+      return false;
+    }
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 1;
+  }
   if (user->opcode() == HloOpcode::kCall) {
     // TODO(b/62548313): Remove when buffer assignment is module scoped and
     // does not assign buffers to calls.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index fd5759e44230db8223822d6ae0f511027f73d8f9..61b98673cbef0d1d9e588f2888a07373405feb9e 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
@@ -933,8 +934,8 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
       HloInstruction::CreateParameter(0, in_shape, "param0"));
   auto param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, in_shape, "param1"));
-  auto result = builder.AddInstruction(
-      HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1));
+  auto result = builder.AddInstruction(HloInstruction::CreateCompare(
+      out_shape, param0, param1, ComparisonDirection::kEq));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -1065,14 +1066,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -1080,6 +1084,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -1087,11 +1092,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
@@ -1177,8 +1185,8 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     auto builder = HloComputation::Builder(TestName() + ".Cond");
     auto data = builder.AddInstruction(
         HloInstruction::CreateParameter(0, data_shape, "data"));
-    builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
+    builder.AddInstruction(HloInstruction::CreateCompare(
+        ShapeUtil::MakeShape(PRED, {}), data, data, ComparisonDirection::kEq));
     return builder.Build();
   };
 
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index c93a9ba3176002a34fe84a29e62075de4d19168f..ffa89b6a797673240e139b90052351cd825bf09b 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -14,15 +14,18 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
+#include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 
 namespace xla {
 
 using absl::nullopt;
 using absl::optional;
+namespace m = match;
 
 // Finds and returns the non-constant operand in instr.
 //
@@ -48,41 +51,36 @@ static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
                                           const HloInstruction* gte_operand) {
   VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", "
           << gte_operand->ToString() << ")";
-  optional<int64> tuple_idx;
+
+  // Among the operands of `instr`, find one that is a get-tuple-element op.
+  auto gte_it = c_find_if(instr->operands(), [](const HloInstruction* instr) {
+    return instr->opcode() == HloOpcode::kGetTupleElement;
+  });
+  if (gte_it == instr->operands().end()) {
+    VLOG(2) << "instr does not have a gte operand.";
+    return nullopt;
+  }
+
+  // All operands of `instr` must be either constants or of the form
+  //   get-tuple-element(gte_operand, tuple_idx)
+  // for the same value tuple_idx.
+  int64 tuple_idx = (*gte_it)->tuple_index();
   for (const HloInstruction* operand : instr->operands()) {
-    if (operand->IsConstant()) {
-      continue;
-    }
-    // Look through copies.
-    // TODO(b/68830972): We wouldn't need this if for loop matching on the GPU
-    // would run before copy insertion.
-    if (operand->opcode() == HloOpcode::kCopy) {
-      operand = operand->operand(0);
-    }
-    if (operand->opcode() != HloOpcode::kGetTupleElement) {
-      VLOG(2) << "instr uses something other than gte(gte_operand): "
-              << operand->ToString();
-      return nullopt;
-    }
-    if (operand->operand(0) != gte_operand) {
-      VLOG(2) << "instr has gte whose operand is not gte_operand: "
-              << operand->ToString();
+    if (!Match(operand, m::Constant()) &&
+        !Match(operand,
+               m::GetTupleElement(m::Op().Is(gte_operand), tuple_idx))) {
+      VLOG(2)
+          << "instr uses something other than a constant or gte(gte_operand, "
+          << tuple_idx << "): " << operand->ToString();
       return nullopt;
     }
-    if (tuple_idx && tuple_idx != operand->tuple_index()) {
-      VLOG(2) << "instr has operands with conflicting gte indices, "
-              << *tuple_idx << " vs " << operand->tuple_index();
-      return nullopt;
-    }
-
-    tuple_idx = operand->tuple_index();
   }
   return tuple_idx;
 }
 
 // Tries to get the tuple index of the induction variable of a while loop.
 //
-// Checks that the loop condition and root both plumb the induction variable
+// Checks that the loop condition and body both plumb the induction variable
 // through the same tuple index, and that they both apply exactly one op to the
 // induction variable before  deciding whether to do another loop iteration (in
 // the loop condition's case) or packing the induction variable into the result
@@ -98,8 +96,7 @@ static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
 //   root = tuple(..., inc, ...)  // inc is N'th operand of tuple().
 //
 // If so, returns N.  Otherwise, returns nullopt.
-static optional<int64> GetLoopInductionVarTupleIdx(
-    const HloInstruction* while_op) {
+optional<int64> GetLoopInductionVarTupleIdx(const HloInstruction* while_op) {
   CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
   VLOG(2) << "Finding induction variable for loop "
           << while_op->ToShortString();
@@ -166,8 +163,171 @@ static optional<int64> GetLoopInductionVarTupleIdx(
   return indvar_tuple_idx;
 }
 
+// Converts the given literal to a scalar int64, if possible.
+//
+// Fails if the literal is not an integral type or if the value it contains
+// cannot be represented in an int64.
+static optional<int64> LiteralAsScalarInt64(const Literal& l) {
+  if (!ShapeUtil::IsEffectiveScalar(l.shape())) {
+    VLOG(2) << "literal is not an effective scalar: " << l.ToString();
+    return nullopt;
+  }
+  switch (l.shape().element_type()) {
+    case S8:
+      return l.GetFirstElement<int8>();
+    case S16:
+      return l.GetFirstElement<int16>();
+    case S32:
+      return l.GetFirstElement<int32>();
+    case S64:
+      return l.GetFirstElement<int64>();
+    case U8:
+      return l.GetFirstElement<uint8>();
+    case U16:
+      return l.GetFirstElement<uint16>();
+    case U32:
+      return l.GetFirstElement<uint32>();
+    case U64: {
+      uint64 v = l.GetFirstElement<uint64>();
+      if (v > static_cast<uint64>(std::numeric_limits<int64>::max())) {
+        VLOG(2) << "uint64 literal is out of range for int64: " << v;
+        return nullopt;
+      }
+      return v;
+    }
+    default:
+      VLOG(2) << "literal is of non-integral type " << l.shape().ToString();
+      return nullopt;
+  }
+}
+
+// Computes a + b, returning nullopt if it overflows.
+optional<int64> CheckedAdd(int64 a, int64 b) {
+  // Overflow occurred iff `a` and `b` have the same sign and `a + b` has a
+  // different sign, see Hacker's Delignt 2nd Ed. pp 28.
+  uint64 aa = absl::bit_cast<uint64>(a);
+  uint64 bb = absl::bit_cast<uint64>(b);
+  int64 result = absl::bit_cast<int64>(aa + bb);
+  if (a >= 0 == b >= 0 && result >= 0 != a >= 0) {
+    return nullopt;
+  }
+  return result;
+}
+
+// Computes a - b, returning nullopt if it overflows.
+optional<int64> CheckedSubtract(int64 a, int64 b) {
+  uint64 aa = absl::bit_cast<uint64>(a);
+  uint64 bb = absl::bit_cast<uint64>(b);
+  int64 result = absl::bit_cast<int64>(aa - bb);
+  // Overflow occurred iff `a` and `b` have different signs and the sign of
+  // `a - b` is the same as that of `b`, see Hacker's Delight 2nd Ed. pp 29.
+  if (a >= 0 != b >= 0 && result >= 0 == b >= 0) {
+    return nullopt;
+  }
+  return result;
+}
+
+// Check if
+//  - `i` is initialized to a scalar constant K (namely, `indvar_init`),
+//  - the while condition does `i < N` or `i <= N`, and
+//  - the while body does `i++`.
+// If so, it's trivial to compute the loop bound.
+static optional<int64> PatternMatchLoopTripCount(HloInstruction* while_op,
+                                                 int64 indvar_tuple_idx,
+                                                 const Literal& indvar_init) {
+  // First, find the scalar constant K that `i` is initialized to.
+  optional<int64> indvar_init_val = LiteralAsScalarInt64(indvar_init);
+  if (!indvar_init_val) {
+    VLOG(2) << "Pattern-match failed: induction variable init is not a "
+               "constant scalar representable as an int64: "
+            << indvar_init.ToString();
+    return nullopt;
+  }
+
+  // Check that `i` goes as `i++` in the while body.
+  //
+  // TODO(jlebar): We could also handle i-- and other idioms.
+  auto* while_body = while_op->while_body();
+  auto* while_body_indvar_update =
+      while_body->root_instruction()->operand(indvar_tuple_idx);
+  auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
+  if (!Match(while_body_indvar_update,
+             m::AddAnyOrder(m::Op().Is(while_body_indvar),
+                            m::ConstantEffectiveScalar(1)))) {
+    VLOG(2) << "Pattern-match failed: induction variable does not go as i++: "
+            << while_body_indvar_update->ToString();
+    return nullopt;
+  }
+
+  // Check that we do op(i, N) or op(N, i) as the while condition.  Capture the
+  // value N.
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_root = while_cond->root_instruction();
+  auto* while_cond_indvar = NonConstantOperand(while_cond_root);
+  HloInstruction* while_cond_bound = nullptr;
+  if (!Match(while_cond_root,
+             m::Op().WithBinaryOperandsAnyOrder(
+                 m::Op().Is(while_cond_indvar),
+                 m::ConstantEffectiveScalar(&while_cond_bound)))) {
+    VLOG(2) << "Pattern-match failed: while condition is not of the form "
+               "op(i, N) or op(N, i).";
+    return nullopt;
+  }
+  // Note: If this succeeds, the constant `N` is representable as an int64 --
+  // that is, if it's an XLA U64, it fits within an int64.
+  optional<int64> while_cond_bound_val =
+      LiteralAsScalarInt64(while_cond_bound->literal());
+  if (!while_cond_bound_val) {
+    VLOG(2) << "Pattern-match failed: while condition induction variable is "
+               "not a constant scalar representable as an int64.";
+    return nullopt;
+  }
+
+  // Handle `i = K; i < N; ++i`.
+  if (Match(while_cond_root,
+            m::Op()
+                .WithComparisonDirection(ComparisonDirection::kLt)
+                .WithOperand(0, m::Op().Is(while_cond_indvar)))) {
+    VLOG(2) << "Pattern-match succeeded: loop condition is i < N: "
+            << while_cond_root->ToString();
+    optional<int64> trips =
+        CheckedSubtract(*while_cond_bound_val, *indvar_init_val);
+    if (trips) {
+      return std::max(int64{0}, *trips);
+    } else {
+      VLOG(2) << "Pattern-match failed: Trip count exceeds INT64_MAX.";
+      return nullopt;
+    }
+  }
+
+  // Handle `i = K; i <= N; ++i`.
+  if (Match(while_cond_root,
+            m::Op()
+                .WithComparisonDirection(ComparisonDirection::kLe)
+                .WithOperand(0, m::Op().Is(while_cond_indvar)))) {
+    VLOG(2) << "Pattern-match succeeded: loop condition is i <= N: "
+            << while_cond_root->ToString();
+    optional<int64> trips =
+        CheckedSubtract(*while_cond_bound_val, *indvar_init_val);
+    if (!trips) {
+      VLOG(2) << "Pattern-match failed: Trip count exceeds INT64_MAX";
+      return nullopt;
+    }
+    trips = CheckedAdd(*trips, 1);
+    if (!trips) {
+      VLOG(2) << "Pattern-match failed: Trip count exceeds INT64_MAX";
+      return nullopt;
+    }
+    return std::max<int64>(0, *trips);
+  }
+
+  VLOG(2) << "Pattern-match failed: while condition follows unknown pattern: "
+          << while_cond_root->ToString();
+  return nullopt;
+}
+
 optional<int64> ComputeWhileLoopTripCount(HloInstruction* while_op,
-                                          int64 max_value_returned) {
+                                          int64 max_brute_force_iters) {
   VLOG(2) << "Getting trip count for loop " << while_op->ToString();
 
   // The loop's induction variable is found at
@@ -188,23 +348,30 @@ optional<int64> ComputeWhileLoopTripCount(HloInstruction* while_op,
   auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
   StatusOr<Literal> indvar_init_result = evaluator.Evaluate(indvar_init);
   if (!indvar_init_result.ok()) {
-    VLOG(2) << "Couldn't evaluate induction variable init: "
-            << indvar_init_result.status();
+    VLOG(2) << "Couldn't evaluate induction variable init, "
+            << indvar_init_result.status() << ", " << indvar_init->ToString();
     return nullopt;
   }
+  Literal indvar_iter_val = std::move(indvar_init_result).ValueOrDie();
+
+  // First, try to pattern-match.
+  if (auto trip_count = PatternMatchLoopTripCount(while_op, *indvar_tuple_idx,
+                                                  indvar_iter_val)) {
+    return trip_count;
+  }
 
+  // If our pattern-match failed, try brute-forcing the loop trip count.
   auto* while_body = while_op->while_body();
   auto* while_body_indvar_update =
       while_body->root_instruction()->operand(*indvar_tuple_idx);
   auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
 
-  // The initial value of the induction variable.
-  Literal indvar_iter_val = std::move(indvar_init_result).ValueOrDie();
-  for (int64 trip_count = 0; trip_count != max_value_returned + 1;
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_root = while_cond->root_instruction();
+  auto* while_cond_indvar = NonConstantOperand(while_cond_root);
+
+  for (int64 trip_count = 0; trip_count != max_brute_force_iters + 1;
        ++trip_count) {
-    auto* while_cond = while_op->while_condition();
-    auto* while_cond_root = while_cond->root_instruction();
-    auto* while_cond_indvar = NonConstantOperand(while_cond_root);
     StatusOr<Literal> result = evaluator.EvaluateWithSubstitutions(
         while_cond_root, {{while_cond_indvar, &indvar_iter_val}});
     if (!result.ok()) {
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h
index ac69a727bd6b403672a676400993fb7d8afc0a55..10b644599742a17b9d7ea08284abbd1c03b3a08d 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.h
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.h
@@ -22,16 +22,24 @@ limitations under the License.
 namespace xla {
 
 // Returns the precise trip count of the loop if it's statically known,
-// nullopt otherwise. max_value_returned limits the number of steps that are
-// evaluated while trying to brute force a loop trip count, trip counts larger
-// than max_value_returned result in nullopt.
-absl::optional<int64> ComputeWhileLoopTripCount(HloInstruction *while_op,
-                                                int64 max_value_returned = 128);
+// nullopt otherwise.
+//
+// max_brute_force_iters limits the number of steps that are evaluated while
+// trying to brute force a loop trip count. trip counts larger than
+// max_brute_force_iters may be returned if we can pattern-match the loop
+// condition.
+absl::optional<int64> ComputeWhileLoopTripCount(
+    HloInstruction *while_op, int64 max_brute_force_iters = 128);
 
 // Returns an upper bound on the trip count of the loop if it's statically
 // known, nullopt otherwise.
 absl::optional<int64> ComputeWhileLoopTripCountUpperBound(
     HloInstruction *while_op);
+
+// Returns the tuple index of the loop induction variable if there is such an
+// induction variable detected. Otherwise returns nullopt.
+absl::optional<int64> GetLoopInductionVarTupleIdx(
+    const HloInstruction *while_op);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis_test.cc b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
index 1da0fbeac89a93eaaef893e5f25dd3b87cc1d5d5..5a5dc742c0304d28d7fca3937fcdef5e7aea7ce9 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
@@ -40,7 +40,7 @@ TEST_F(WhileLoopAnalysisTest, SingleIterationUpperBound) {
       p_cond = (f32[2], s32[]) parameter(0)
       gte = s32[] get-tuple-element(p_cond), index=1
       const = s32[] constant(42)
-      ROOT result = pred[] equal-to(gte, const)
+      ROOT result = pred[] compare(gte, const), direction=EQ
     }
 
     ENTRY entry {
@@ -71,7 +71,7 @@ TEST_F(WhileLoopAnalysisTest, NoUpperBound) {
       p_cond = (f32[2], s32[]) parameter(0)
       gte = s32[] get-tuple-element(p_cond), index=1
       const = s32[] constant(42)
-      ROOT result = pred[] equal-to(gte, const)
+      ROOT result = pred[] compare(gte, const), direction=EQ
     }
 
     ENTRY entry {
@@ -104,7 +104,7 @@ TEST_F(WhileLoopAnalysisTest, ExactBound) {
       p_cond = (f32[2], s32[]) parameter(0)
       gte = s32[] get-tuple-element(p_cond), index=1
       const = s32[] constant(42)
-      ROOT result = pred[] less-than(gte, const)
+      ROOT result = pred[] compare(gte, const), direction=LT
     }
 
     ENTRY entry {
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 3bcf5c38309a86e9e3cab3268f3f065005f7a923..8ab5e433e0f2893e93d7a83e4114352c8a3d82fd 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -260,7 +260,7 @@ condition {
   p_cond = (f32[],f32[]) parameter(0)
   p_cond.0 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=0
   p_cond.1 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=1
-  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+  ROOT result = pred[] compare(p_cond.0, p_cond.1), direction=LT
 }
 
 ENTRY entry {
@@ -300,7 +300,7 @@ condition {
   p_c.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_c), index=0
   p_c.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_c), index=1
   p_c.1.1 = f32[] get-tuple-element((f32[],f32[]) p_c.1), index=1
-  ROOT result = pred[] less-than(p_c.0, p_c.1.1)
+  ROOT result = pred[] compare(p_c.0, p_c.1.1), direction=LT
 }
 
 ENTRY entry {
@@ -342,7 +342,7 @@ condition {
   p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
   p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
   p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
-  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+  ROOT result = pred[] compare(p_cond.0, p_cond.1), direction=LT
 }
 
 ENTRY entry {
@@ -389,10 +389,10 @@ condition {
   p_cond = (f32[],f32[],f32[]) parameter(0)
   p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
   p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
-  lt.0 = pred[] less-than(p_cond.0, p_cond.2)
+  lt.0 = pred[] compare(p_cond.0, p_cond.2), direction=LT
   p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
   p_cond.2.c = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
-  lt.1 = pred[] less-than(p_cond.1, p_cond.2.c)
+  lt.1 = pred[] compare(p_cond.1, p_cond.2.c), direction=LT
   ROOT result = pred[] and(lt.0, lt.1)
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 3587c016b4420163a607422b1acc838646fab83a..f0bb646d9c0e3f563641f459b9e30a5133191305 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -556,7 +556,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) {
       p_cond = (f32[2], f32[2], f32[2], s32[]) parameter(0)
       gte = s32[] get-tuple-element(p_cond), index=3
       const = s32[] constant(42)
-      ROOT result = pred[] equal-to(gte, const)
+      ROOT result = pred[] compare(gte, const), direction=EQ
     }
 
     ENTRY entry {
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 09d54095718029541a7a25aa62f9a2e9a177960d..386ffb995477ff1b4aef73080b6a6fd988dd1980 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -180,7 +180,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // replace the old instructions after we remove unused elements from the while
   // tuple.
   auto make_while_computation_replacements = [&](const HloComputation* comp) {
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements;
 
     auto* param = comp->parameter_instruction(0);
@@ -232,7 +232,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       while_cond->CloneWithReplacements(
           make_while_computation_replacements(while_cond));
 
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
   std::vector<HloInstruction*> new_while_body_root_elems;
   new_while_body_root_elems.reserve(new_to_old_tuple_idx.size());
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index ecca76b1e86d833c73fbb9bad6a341660a7d2669..65175fb6ab38a3b07ce81ec6b24a070529305f55 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -72,7 +72,7 @@ WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant({{LOOP_BOUND}})
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(42)
@@ -107,7 +107,7 @@ WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
     loop_var.2 = (s32[], s32[3]{0}, s32[]) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=2
-    ROOT less-than = pred[] less-than(get-tuple-element.3, get-tuple-element.4)
+    ROOT less-than = pred[] compare(get-tuple-element.3, get-tuple-element.4), direction=LT
   }
   ENTRY SimpleLoopWithIndirectLoopBound {
     constant.3 = s32[] constant(42)
@@ -237,7 +237,7 @@ TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) {
  NonTupleShapedLoop.condition {
    loop_var = s32[] parameter(0)
    constant = s32[] constant(100)
-   ROOT less-than = pred[] less-than(s32[] loop_var, s32[] constant)
+   ROOT less-than = pred[] compare(s32[] loop_var, s32[] constant), direction=LT
  }
  ENTRY INonTupleShapedLoop {
    constant.2 = s32[] constant(42)
@@ -387,7 +387,7 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
     param0 = (s32[], s32[], s32[]) parameter(0)
     get-tuple-element = s32[] get-tuple-element((s32[], s32[], s32[]) param0),
       index=2
-    ROOT equal-to = pred[] equal-to(s32[] constant.2, s32[] get-tuple-element)
+    ROOT equal-to = pred[] compare(s32[] constant.2, s32[] get-tuple-element), direction=EQ
   }
   ENTRY RemoveUnusedOperands {
     x = s32[] parameter(0)
@@ -471,7 +471,7 @@ TEST_F(WhileLoopSimplifierTest,
     loop_var.2 = (s32[], s32[3]{0}) parameter(0)
     get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(44)
-    ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(42)
@@ -503,7 +503,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
     loop_var.2 = (s32[], s32[3]{0}, s32[3]{0}) parameter(0)
     get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=0
     constant.2 = s32[] constant(47)
-    ROOT less-than = pred[] less-than(get-tuple-element.4, constant.2)
+    ROOT less-than = pred[] compare(get-tuple-element.4, constant.2), direction=LT
   }
   ENTRY SimpleLoop {
     constant.3 = s32[] constant(42)
@@ -679,7 +679,7 @@ const char* const kSimpleMergeInductionVariablesModule = R"(
     b = TYPE[] get-tuple-element(param), index=1
     sum = TYPE[] power(a, b)
     ten = TYPE[] constant(10)
-    ROOT cond = pred[] less-than(sum, ten)
+    ROOT cond = pred[] compare(sum, ten), direction=LT
   }
   ENTRY Loop {
     a = TYPE[] constant(10)
diff --git a/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.cc b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03bb6792fe74e9eb90278cbd4152e609a7904c80
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.cc
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+StatusOr<bool> WhileLoopTripCountAnnotator::Run(HloModule* module) {
+  bool changed = false;
+  for (const HloComputation* comp : module->computations()) {
+    for (HloInstruction* instr : comp->instructions()) {
+      if (instr->opcode() != HloOpcode::kWhile) {
+        continue;
+      }
+      if (auto trip_count = ComputeWhileLoopTripCount(instr)) {
+        WhileLoopBackendConfig config;
+        config.mutable_known_trip_count()->set_n(*trip_count);
+        TF_RETURN_IF_ERROR(instr->set_backend_config(config));
+        changed = true;
+      }
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cda2f10cefba821bccc1b5d3b5a33cd7a68e004
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_TRIP_COUNT_ANNOTATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_TRIP_COUNT_ANNOTATOR_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Pass that annotates `while` loops with known trip counts.
+//
+// The annotation is stored as a backend-config on the while loop node.
+//
+// This pass should run after all passes that might semantically modify a while
+// loop, e.g. by unrolling it.  Otherwise, a loop could end up with a
+// backend-config that doesn't match its true trip-count.
+//
+// This pass does some pattern-matching on loop bodies and conditions, so it
+// should run after most HLO simplifications and before fusion and layout
+// assignment, which make pattern matching much more difficult by e.g.
+// introducing `copy` nodes.
+class WhileLoopTripCountAnnotator : public HloModulePass {
+ public:
+  ~WhileLoopTripCountAnnotator() override {}
+  absl::string_view name() const override {
+    return "while-loop-trip-count-annotator";
+  }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_TRIP_COUNT_ANNOTATOR_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_trip_count_annotator_test.cc b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1e18bbdef6b84332e0187ae1bf73b67396eaf80
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_trip_count_annotator_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_trip_count_annotator.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class TripCountAnnotatorTest : public HloTestBase {};
+
+TEST_F(TripCountAnnotatorTest, KnownSmallTripCount) {
+  const char* kModuleStr = R"(
+    HloModule test
+    Body {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      one = s32[] constant(1)
+      i_plus_one = s32[] add(i, one)
+      ROOT tuple = (s32[]) tuple(i_plus_one)
+    }
+
+    Cond {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      trip_count = s32[] constant(10)
+      ROOT done = pred[] compare(i, trip_count), direction=LT
+    }
+
+    ENTRY test {
+      i_start = s32[] constant(0)
+      initial_tuple = (s32[]) tuple(i_start)
+      ROOT while = (s32[]) while(initial_tuple), condition=Cond, body=Body
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  WhileLoopTripCountAnnotator pass;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, m.get()));
+  ASSERT_TRUE(changed);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto config,
+                          m->entry_computation()
+                              ->root_instruction()
+                              ->backend_config<WhileLoopBackendConfig>());
+  EXPECT_EQ(10, config.known_trip_count().n());
+}
+
+TEST_F(TripCountAnnotatorTest, KnownLargeTripCount) {
+  const char* kModuleStr = R"(
+    HloModule test
+    Body {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      one = s32[] constant(1)
+      i_plus_one = s32[] add(i, one)
+      ROOT tuple = (s32[]) tuple(i_plus_one)
+    }
+
+    Cond {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      trip_count = s32[] constant(1000000)
+      ROOT done = pred[] compare(i, trip_count), direction=LT
+    }
+
+    ENTRY test {
+      i_start = s32[] constant(0)
+      initial_tuple = (s32[]) tuple(i_start)
+      ROOT while = (s32[]) while(initial_tuple), condition=Cond, body=Body
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  WhileLoopTripCountAnnotator pass;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, m.get()));
+  ASSERT_TRUE(changed);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto config,
+                          m->entry_computation()
+                              ->root_instruction()
+                              ->backend_config<WhileLoopBackendConfig>());
+  EXPECT_EQ(1000000, config.known_trip_count().n());
+}
+
+TEST_F(TripCountAnnotatorTest, NonzeroStart) {
+  const char* kModuleStr = R"(
+    HloModule test
+    Body {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      one = s32[] constant(1)
+      i_plus_one = s32[] add(i, one)
+      ROOT tuple = (s32[]) tuple(i_plus_one)
+    }
+
+    Cond {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      trip_count = s32[] constant(1000000)
+      ROOT done = pred[] compare(i, trip_count), direction=LT
+    }
+
+    ENTRY test {
+      i_start = s32[] constant(10)
+      initial_tuple = (s32[]) tuple(i_start)
+      ROOT while = (s32[]) while(initial_tuple), condition=Cond, body=Body
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  WhileLoopTripCountAnnotator pass;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, m.get()));
+  ASSERT_TRUE(changed);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto config,
+                          m->entry_computation()
+                              ->root_instruction()
+                              ->backend_config<WhileLoopBackendConfig>());
+  EXPECT_EQ(999990, config.known_trip_count().n());
+}
+
+TEST_F(TripCountAnnotatorTest, LessThanOrEqualTo) {
+  const char* kModuleStr = R"(
+    HloModule test
+    Body {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      one = s32[] constant(1)
+      i_plus_one = s32[] add(i, one)
+      ROOT tuple = (s32[]) tuple(i_plus_one)
+    }
+
+    Cond {
+      param = (s32[]) parameter(0)
+      i = s32[] get-tuple-element(param), index=0
+      trip_count = s32[] constant(1000000)
+      ROOT done = pred[] compare(i, trip_count), direction=LE
+    }
+
+    ENTRY test {
+      i_start = s32[] constant(10)
+      initial_tuple = (s32[]) tuple(i_start)
+      ROOT while = (s32[]) while(initial_tuple), condition=Cond, body=Body
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  WhileLoopTripCountAnnotator pass;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, m.get()));
+  ASSERT_TRUE(changed);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto config,
+                          m->entry_computation()
+                              ->root_instruction()
+                              ->backend_config<WhileLoopBackendConfig>());
+  EXPECT_EQ(999991, config.known_trip_count().n());
+}
+
+TEST_F(TripCountAnnotatorTest, Int64Overflow) {
+  // for(i = INT64_MIN; i < INT64_MAX; ++i)
+  //
+  // We store the trip count as an int64, so this loop is unanalyzable.
+  const char* kModuleStr = R"(
+    HloModule test
+    Body {
+      param = (s64[]) parameter(0)
+      i = s64[] get-tuple-element(param), index=0
+      one = s64[] constant(1)
+      i_plus_one = s64[] add(i, one)
+      ROOT tuple = (s64[]) tuple(i_plus_one)
+    }
+
+    Cond {
+      param = (s64[]) parameter(0)
+      i = s64[] get-tuple-element(param), index=0
+      trip_count = s64[] constant(9223372036854775807) // 2^63-1
+      ROOT done = pred[] compare(i, trip_count), direction=LE
+    }
+
+    ENTRY test {
+      i_start = s64[] constant(-9223372036854775808)  // -2^63
+      initial_tuple = (s64[]) tuple(i_start)
+      ROOT while = (s64[]) while(initial_tuple), condition=Cond, body=Body
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  WhileLoopTripCountAnnotator pass;
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, m.get()));
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index d77386497a14b3e52be2ea7f655fa330f60e4a97..b6f65c763ea3c489f385ffe3e49b53a36a2877ba 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -166,7 +166,7 @@ MakeCountedLoopConditionComputation(const Shape& loop_state_shape,
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * compare,
-      MakeBinaryHlo(HloOpcode::kLt, indvar, trip_count_constant));
+      MakeCompareHlo(ComparisonDirection::kLt, indvar, trip_count_constant));
   cond_computation->set_root_instruction(compare);
   return std::move(cond_computation);
 }
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
index a76fbf3f66adae0a5e5357178bc576bbc74701c7..661b7aa7d99ca549da6a509812760a1665d60919 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
@@ -37,9 +37,15 @@ StatusOr<bool> ZeroSizedHloElimination::Run(HloModule* module) {
       }
       if (comp->IsRemovable(instruction) &&
           ShapeUtil::IsZeroElementArray(instruction->shape())) {
+        // If the instruction doesn't have a layout, use a default layout for
+        // the literal.
+        Shape shape = instruction->shape();
+        if (!LayoutUtil::HasLayout(shape)) {
+          LayoutUtil::SetToDefaultLayout(&shape);
+        }
         TF_RETURN_IF_ERROR(comp->ReplaceWithNewInstruction(
-            instruction, HloInstruction::CreateConstant(
-                             Literal::CreateFromShape(instruction->shape()))));
+            instruction,
+            HloInstruction::CreateConstant(Literal::CreateFromShape(shape))));
         changed = true;
       }
     }
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index a546a6d39cc55d1f327b8449c7d26cd4c95dbf98..572a79609e7a912277af0fd2ba43f9a1e14a6f52 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -82,5 +82,18 @@ TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateConstant) {
   EXPECT_FALSE(changed);
 }
 
+TEST_F(ZeroSizedHloEliminationTest, ZeroSizedInstructionWithoutLayoutFolded) {
+  Shape op_shape = ShapeUtil::MakeShape(F32, {4, 0});
+  op_shape.clear_layout();
+  HloInstruction* param1 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(1, op_shape, "zero sized param 1"));
+  HloInstruction* param2 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(2, op_shape, "zero sized param 2"));
+  builder_.AddInstruction(
+      HloInstruction::CreateBinary(op_shape, HloOpcode::kAdd, param1, param2));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_TRUE(changed);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index 1a029efe8543b5433ef5fe7923e1e804019ba0c0..94854047e530babe2234381a615aeb805f0d5933 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -34,8 +34,12 @@ Shape::Shape(const ShapeProto& shape_proto) {
   // instead of a constructor.
   if (shape_proto.dimensions_size() !=
       shape_proto.is_dynamic_dimension_size()) {
-    LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
-                  "fields does not match number of dimension fields";
+    if (shape_proto.is_dynamic_dimension_size() != 0) {
+      LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
+                    "fields does not match number of dimension fields";
+    } else {
+      LOG(WARNING) << "Malformed shape proto: is_dynamic_dimension is empty";
+    }
   }
   int64 num_dynamic_dimension_fields = std::min(
       shape_proto.dimensions_size(), shape_proto.is_dynamic_dimension_size());
@@ -112,6 +116,68 @@ void Shape::DeleteDimension(int64 dim_to_delete) {
   }
 }
 
+bool Shape::Equal::operator()(const Shape& lhs, const Shape& rhs) {
+  if (lhs.IsTuple()) {
+    return rhs.IsTuple() &&
+           absl::c_equal(
+               lhs.tuple_shapes(), rhs.tuple_shapes(),
+               [=](const Shape& l, const Shape& r) { return (*this)(l, r); });
+  } else if (!lhs.IsArray()) {
+    // Non-tuple, non-array tupes such as opaque and token types are trivially
+    // the same.
+    return lhs.element_type() == rhs.element_type();
+  }
+
+  if (!rhs.IsArray()) {
+    return false;
+  }
+
+  if (!ignore_element_type_) {
+    if ((ignore_fp_precision_ &&
+         !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
+        (!ignore_fp_precision_ && !ShapeUtil::SameElementType(lhs, rhs))) {
+      VLOG(3) << "CompareShapes: lhs element type != rhs element type";
+      return false;
+    }
+  }
+
+  if (!ignore_layout_) {
+    if (lhs.layout().format() != rhs.layout().format()) {
+      VLOG(3) << "CompareShapes: lhs layout format != rhs layout format";
+      return false;
+    }
+    if (LayoutUtil::IsDenseArray(lhs)) {
+      Layout::Equal equal;
+      if (ignore_tiles_in_layout_) {
+        equal.IgnoreTiles();
+      }
+      if (ignore_element_size_in_layout_) {
+        equal.IgnoreElementSize();
+      }
+      if (!equal(lhs.layout(), rhs.layout())) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+        return false;
+      }
+    }
+  }
+
+  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
+    return false;
+  }
+
+  if (!ignore_dynamic_dimension_) {
+    for (int i = 0; i < lhs.rank(); ++i) {
+      if (lhs.is_dynamic_dimension(i) != rhs.is_dynamic_dimension(i)) {
+        VLOG(3)
+            << "CompareShapes: lhs and rhs have different dynamic dimensions.";
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 std::ostream& operator<<(std::ostream& out, const Shape& shape) {
   out << shape.ToString(/*print_layout=*/true);
   return out;
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index dc4cdc31a74d43471b72a71d9d436408e0e62deb..78cea83c6d71e5965f10cd3a917ffccabd630462 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -72,6 +72,10 @@ class Shape {
     dynamic_dimensions_[dimension] = is_dynamic;
   }
 
+  const std::vector<bool>& dynamic_dimensions() const {
+    return dynamic_dimensions_;
+  }
+
   // Add dimension_upper_bound().
 
   // Removes the given dimension form the shape. Layout, if it exists, is
@@ -138,6 +142,59 @@ class Shape {
   string ShortDebugString() const { return ToProto().ShortDebugString(); }
   string DebugString() const { return ToProto().DebugString(); }
 
+  // Equal is a configurable functor to check the equality of two shapes.
+  //
+  // Examples:
+  //
+  // - Comparing two shapes ignoring their layout difference:
+  //   Equal().IgnoreLayout()(shape1, shape2);
+  //
+  // - Comparing two shapes ignoring their layout and element type difference:
+  //   Equal().IgnoreLayout().IgnoreElementType()(shape1, shape2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Shape& lhs, const Shape& rhs);
+
+    Equal& IgnoreLayout() {
+      ignore_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreTilesInLayout() {
+      ignore_tiles_in_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementSizeInLayout() {
+      ignore_element_size_in_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementType() {
+      ignore_element_type_ = true;
+      return *this;
+    }
+    Equal& IgnoreFpPrecision() {
+      ignore_fp_precision_ = true;
+      return *this;
+    }
+    Equal& IgnoreDynamicDimension() {
+      ignore_dynamic_dimension_ = true;
+      return *this;
+    }
+
+   private:
+    bool ignore_layout_ = false;
+    bool ignore_tiles_in_layout_ = false;
+    bool ignore_element_size_in_layout_ = false;
+    bool ignore_element_type_ = false;
+    bool ignore_fp_precision_ = false;
+    bool ignore_dynamic_dimension_ = false;
+  };
+
+  // Test that all fields of the shape are the same, equivalent to Equal().
+  bool operator==(const Shape& other) const { return Equal()(*this, other); }
+  bool operator!=(const Shape& other) const { return !(*this == other); }
+
  private:
   // The element type of this shape (tuple, array, etc).
   PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc
index 55ce5fe884e98e474253be9ef694f1b8137b4b01..dbdafcc0a1f7348af8394598363d570118cdd87e 100644
--- a/tensorflow/compiler/xla/shape_test.cc
+++ b/tensorflow/compiler/xla/shape_test.cc
@@ -35,6 +35,8 @@ class ShapeTest : public ::testing::Test {
   const Shape opaque_ = ShapeUtil::MakeOpaqueShape();
   const Shape token_ = ShapeUtil::MakeTokenShape();
   const Shape scalar_ = ShapeUtil::MakeShape(F32, {});
+  const Shape scalar_with_tile_ =
+      ShapeUtil::MakeShapeWithLayout(F32, {}, {}, {Tile({256})});
   const Shape matrix_ = ShapeUtil::MakeShape(U32, {1, 2});
   const Shape matrix2_ = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
   const Shape tuple_ =
@@ -66,6 +68,8 @@ TEST_F(ShapeTest, ShapeToString) {
 
   EXPECT_EQ("opaque[]", opaque_.ToString(/*print_layout=*/true));
   EXPECT_EQ("f32[]", scalar_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("f32[]{:T(256)}",
+            scalar_with_tile_.ToString(/*print_layout=*/true));
   EXPECT_EQ("u32[1,2]{1,0}", matrix_.ToString(/*print_layout=*/true));
   EXPECT_EQ("s32[3,4]{0,1}", matrix2_.ToString(/*print_layout=*/true));
   EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
@@ -85,6 +89,24 @@ TEST_F(ShapeTest, DynamicShapeToString) {
   EXPECT_EQ("f32[<=23,44,55]", array_shape.ToString());
 }
 
+TEST_F(ShapeTest, EqualityTest) {
+  // Different layouts.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {0, 1}));
+
+  // Different dims.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(F32, {44, 23}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+
+  // Different elements.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(S32, {44, 23}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+
+  // Equal shapes.
+  EXPECT_EQ(ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+}
+
 TEST_F(ShapeTest, IsStatic) {
   EXPECT_TRUE(opaque_.is_static());
   EXPECT_TRUE(token_.is_static());
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index f1ef202da0971926f608b4594099d8b4c3a6baae..acaa9cae7c2c2745a3ed413ca9f00b5bf0187a0c 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -85,82 +86,12 @@ bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
 }
 
 namespace {
-
-// Recursive helper for comparing the equality of two shapes. Returns true if
-// the shapes are the same. If compare_layouts is true, then layouts must also
-// match.
-bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
-                   bool ignore_fp_precision) {
-  if ((ignore_fp_precision &&
-       !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
-      (!ignore_fp_precision && !ShapeUtil::SameElementType(lhs, rhs))) {
-    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
-    return false;
-  }
-
-  if (lhs.IsTuple()) {
-    return absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         [=](const Shape& l, const Shape& r) {
-                           return CompareShapes(l, r, compare_layouts,
-                                                ignore_fp_precision);
-                         });
-  } else if (!lhs.IsArray()) {
-    // Non-tuple, non-array tupes such as opaque and token types are trivially
-    // the same.
-    return true;
-  }
-
-  if (compare_layouts) {
-    if (lhs.layout().format() != rhs.layout().format()) {
-      VLOG(3) << "CompareShapes: lhs layout format != rhs layout format";
-      return false;
-    }
-    if (LayoutUtil::IsDenseArray(lhs)) {
-      if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs),
-                         LayoutUtil::MinorToMajor(rhs))) {
-        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
-        return false;
-      }
-
-      const auto& lhs_tiles = lhs.layout().tiles();
-      const auto& rhs_tiles = rhs.layout().tiles();
-      if (lhs_tiles.size() != rhs_tiles.size()) {
-        return false;
-      }
-      for (int64 i = 0; i < lhs_tiles.size(); i++) {
-        if (!absl::c_equal(lhs_tiles[i].dimensions(),
-                           rhs_tiles[i].dimensions())) {
-          return false;
-        }
-      }
-
-      if (lhs.layout().element_size_in_bits() !=
-          rhs.layout().element_size_in_bits()) {
-        return false;
-      }
-    }
-  }
-
-  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
-    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
-    return false;
-  }
-
-  for (int i = 0; i < lhs.rank(); ++i) {
-    if (lhs.is_dynamic_dimension(i) != rhs.is_dynamic_dimension(i)) {
-      VLOG(3)
-          << "CompareShapes: lhs and rhs have different dynamic dimensions.";
-      return false;
-    }
-  }
-  return true;
-}
-
 // Constructs and returns the new shape with the given minor_to_major order in
 // its Layout.
 StatusOr<Shape> MakeShapeWithLayoutInternal(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   if (dimensions.size() != minor_to_major.size()) {
     return InvalidArgument("Dimensions size is %ld, but layout size is %ld.",
                            dimensions.size(), minor_to_major.size());
@@ -171,23 +102,19 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   }
   TF_ASSIGN_OR_RETURN(Shape shape,
                       ShapeUtil::MakeValidatedShape(element_type, dimensions));
-  auto min2maj = shape.mutable_layout()->mutable_minor_to_major();
-  min2maj->clear();
-  for (int64 value : minor_to_major) {
-    min2maj->push_back(value);
-  }
+  *shape.mutable_layout() =
+      LayoutUtil::MakeLayout(minor_to_major, tiles, element_size_in_bits);
   if (!shape.has_layout()) {
     return InvalidArgument("Shape has no layout.");
   }
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
   return shape;
 }
-
 }  // namespace
 
 /* static */ bool ShapeUtil::Equal(const Shape& lhs, const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/false);
+  bool equal = Shape::Equal()(lhs, rhs);
+
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::Equal differ: lhs = " << lhs.ShortDebugString()
             << ", rhs = " << rhs.ShortDebugString();
@@ -198,8 +125,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ bool ShapeUtil::EqualIgnoringFpPrecision(const Shape& lhs,
                                                       const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/true);
+  bool equal = Shape::Equal().IgnoreFpPrecision()(lhs, rhs);
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::EqualIgnoringFpPrecision differ: lhs = "
             << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString();
@@ -262,8 +188,10 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ Shape ShapeUtil::MakeShapeWithLayout(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
-  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major)
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
+  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major,
+                                     tiles, element_size_in_bits)
       .ValueOrDie();
 }
 
@@ -534,10 +462,14 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
               shape.is_dynamic_dimension(i) ? "<=" : "", shape.dimensions(i));
   }
   result += "]";
-  if (!IsScalar(shape) && shape.IsArray()) {
-    if (LayoutUtil::HasLayout(shape)) {
-      StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
+  if (IsScalar(shape)) {
+    string layout_str = LayoutUtil::HumanString(shape.layout());
+    // Don't print "{}" as layout for scalars.
+    if (layout_str != "{}") {
+      StrAppend(&result, layout_str);
     }
+  } else if (shape.IsArray() && LayoutUtil::HasLayout(shape)) {
+    StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
   }
   return result;
 }
@@ -563,37 +495,17 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) {
-  return CompareShapes(lhs, rhs, /*compare_layouts=*/false,
-                       /*ignore_fp_precision=*/false);
+  return Shape::Equal().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (lhs.IsArray()) {
-    return rhs.IsArray() && SameDimensions(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringElementType);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreElementType().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (lhs.IsArray()) {
-    return rhs.IsArray() && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
-           CompatibleIgnoringElementType(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringFpPrecision);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreFpPrecision().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
@@ -1022,6 +934,10 @@ Status ForEachMutableSubshapeHelper(
   for (auto dim : Permute(permutation, shape.dimensions())) {
     new_shape.add_dimensions(dim);
   }
+  for (int64 i = 0; i < shape.rank(); i++) {
+    new_shape.set_dynamic_dimension(permutation[i],
+                                    shape.is_dynamic_dimension(i));
+  }
 
   // If `shape` has a layout, by contract we choose a new layout such that the
   // transpose defined by this permutation is a bitcast.
@@ -1345,6 +1261,43 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     const Shape& input_shape, const Shape& output_shape) {
   CHECK(input_shape.IsArray());
   CHECK(output_shape.IsArray());
+  // Removing trivial dimensions from the shape simplifies the alignment
+  // algorithm since ones can go in any position.
+  if (HasDegenerateDimensions(input_shape) ||
+      HasDegenerateDimensions(output_shape)) {
+    auto simple_output_shape =
+        AlignLayouts(DropDegenerateDimensions(input_shape),
+                     DropDegenerateDimensions(output_shape));
+    if (!simple_output_shape) {
+      return absl::nullopt;
+    }
+
+    auto layout = simple_output_shape->layout().minor_to_major();
+    // For each one sized dimension in the output, increment the dimension
+    // numbers in layout that are more minor than the one.
+    absl::InlinedVector<int64, 8> dim_map;
+    dim_map.reserve(simple_output_shape->rank());
+    for (int64 i = 0; i < output_shape.rank(); ++i) {
+      if (output_shape.dimensions(i) != 1) {
+        dim_map.push_back(i);
+      }
+    }
+    for (int64& d : layout) {
+      d = dim_map[d];
+    }
+
+    // Add the ones in descending order to the layout. Descending layouts tend
+    // to reduce the number of copies inserted in layout assignment.
+    for (int64 i = output_shape.rank() - 1; i >= 0; --i) {
+      if (output_shape.dimensions(i) == 1) {
+        layout.push_back(i);
+      }
+    }
+    Shape output_shape_with_layout = output_shape;
+    *output_shape_with_layout.mutable_layout()->mutable_minor_to_major() =
+        layout;
+    return output_shape_with_layout;
+  }
 
   int64 input_rank = input_shape.rank();
   int64 output_rank = output_shape.rank();
@@ -1393,10 +1346,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   if (input_dimension_product != output_dimension_product) {
     return absl::nullopt;
   }
+
   // We also need to store an end element so that we know where the last
   // alignment part ends.
   alignment.push_back({input_rank, output_rank});
-
   // Now check if the physical layout can potentially be aligned to the output
   // shape by changing the physical layout of the output shape. We need to check
   // that all dimension numbers that belong to the same alignment part appear
@@ -1408,40 +1361,23 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   for (int64 i = 0; i < input_rank;) {
     int64 current_dimension_number = input_dimension_numbers[i];
 
-    // Skip trivial dimensions with a bound of 1.
-    if (input_shape.dimensions(current_dimension_number) == 1) {
-      ++i;
-      continue;
-    }
-
-    // Calculate the number of non-trivial dimension bounds in the input shape
-    // belonging to the current alignment part.
+    // Trivial dimensions are stripped.
+    CHECK_NE(input_shape.dimensions(current_dimension_number), 1);
     const int64 current_alignment_index =
         dimension_to_alignment_index[current_dimension_number];
     // Because of the special end element that we added, we can be sure that
     // 'current_alignment_index' is < alignment.size() - 1.
     CHECK_LT(current_alignment_index, alignment.size() - 1);
-    int64 num_non_trivial_dimensions_in_alignment_part = 0;
-    for (int64 j = alignment[current_alignment_index].first;
-         j < alignment[current_alignment_index + 1].first; ++j) {
-      if (input_shape.dimensions(j) != 1) {
-        ++num_non_trivial_dimensions_in_alignment_part;
-      }
-    }
 
     // Check that the following 'num_non_trivial_dimensions_in_alignment_part'
     // dimension numbers (ignoring dimension numbers with dimension bound 1) are
     // in descending order and belong to the current alignment part.
-    for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
+    for (int64 j = 0; j < alignment[current_alignment_index + 1].first -
+                              alignment[current_alignment_index].first;
          ++i, ++j) {
       if (i == input_rank) {
         return absl::nullopt;
       }
-      // Skip trivial dimensions with a bound of 1.
-      if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
-        --j;
-        continue;
-      }
       // If the current dimension number belongs to a different alignment part,
       // or the dimension numbers are not in descending order, we can return
       // early.
@@ -1452,22 +1388,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
       }
       current_dimension_number = input_dimension_numbers[i];
     }
-
     // The output dimension numbers that belong to the current alignment part
-    // need to appear in the same descending order as in the input. Again, we
-    // can skip dimensions with a bound of 1.
+    // need to appear in the same descending order as in the input.
     for (int64 j = alignment[current_alignment_index + 1].second - 1;
          j >= alignment[current_alignment_index].second; --j) {
-      if (output_shape.dimensions(j) != 1) {
-        output_layout.push_back(j);
-      }
-    }
-  }
-  // Now add all the dimensions with dimension bound 1 at the end of
-  // 'output_layout'.
-  for (int64 i = 0; i < output_rank; ++i) {
-    if (output_shape.dimensions(i) == 1) {
-      output_layout.push_back(i);
+      output_layout.push_back(j);
     }
   }
   CHECK_EQ(output_layout.size(), output_rank);
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index e98c6e024bec1f6db5c40d3cd3215ca44eb13698..7f610a6085d6fbe3d3143d5027cdc43d4b07bcbf 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -398,7 +398,9 @@ class ShapeUtil {
   // Returns a value shape such that shape.has_layout().
   static Shape MakeShapeWithLayout(PrimitiveType element_type,
                                    absl::Span<const int64> dimensions,
-                                   absl::Span<const int64> minor_to_major);
+                                   absl::Span<const int64> minor_to_major,
+                                   absl::Span<const Tile> tiles = {},
+                                   int64 element_size_in_bits = 0);
 
   static Shape MakeShapeWithSparseLayout(PrimitiveType element_type,
                                          absl::Span<const int64> dimensions,
@@ -675,11 +677,9 @@ class ShapeUtil {
 
   template <typename FnType>
   static void ForEachIndex(const Shape& shape, const FnType& visitor_function) {
-    ForEachIndexWithStatus(shape,
-                           [&](absl::Span<const int64> indices) {
-                             return StatusOr<bool>(visitor_function(indices));
-                           })
-        .IgnoreError();
+    ForEachIndexWithStatus(shape, [&](absl::Span<const int64> indices) {
+      return StatusOr<bool>(visitor_function(indices));
+    }).IgnoreError();
   }
 
   // A parallel version of ForEachIndex(WithStatus). This can only be used if
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 61b4e73e060c18a3d0108e68d1117607d6c11c0f..020b062f6b1b032bab958772d3a6a1e35daee38b 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -710,6 +710,26 @@ TEST(ShapeUtilTest, PermuteDimensionsLayout) {
   } while (std::next_permutation(layout.begin(), layout.end()));
 }
 
+TEST(ShapeUtilTest, PermuteDynamicDimensions) {
+  Shape shape =
+      ShapeUtil::MakeShape(F32, {10, 100, 1000},
+                           /*dynamic_dimensions*/ {false, true, true});
+  SCOPED_TRACE(absl::StrCat("shape=", shape.ToString()));
+
+  std::vector<int64> permutation(3);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  do {
+    SCOPED_TRACE(absl::StrCat("permutation=", absl::StrJoin(permutation, ",")));
+
+    auto permuted = ShapeUtil::PermuteDimensions(permutation, shape);
+    for (int i = 0; i < shape.rank(); i++) {
+      EXPECT_EQ(permuted.dimensions(permutation[i]), shape.dimensions(i));
+      EXPECT_EQ(permuted.is_dynamic_dimension(permutation[i]),
+                shape.is_dynamic_dimension(i));
+    }
+  } while (std::next_permutation(permutation.begin(), permutation.end()));
+}
+
 TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
   EXPECT_FALSE(ShapeUtil::ReshapeIsBitcast(
       ShapeUtil::MakeShapeWithLayout(F32, {3, 2, 2}, {0, 1, 2}),
@@ -741,8 +761,15 @@ TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) {
   auto aligned_shape = ShapeUtil::AlignLayouts(
       input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1}));
   EXPECT_TRUE(aligned_shape);
-  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
-              ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+TEST(AlignmentTest, AlignLayoutsWithAllTrivialDimensions) {
+  Shape input =
+      ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 1, 1, 1}, {0, 1, 3, 2});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {1, 1, 1, 1, 1}));
+  EXPECT_TRUE(aligned_shape);
   EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
 }
 
diff --git a/tensorflow/compiler/xla/status_macros.cc b/tensorflow/compiler/xla/status_macros.cc
index b88fe367d7416a26c1147fd5e10fb20772814fe5..aa7238f07d432aabb44d2cbed66786217e6a846c 100644
--- a/tensorflow/compiler/xla/status_macros.cc
+++ b/tensorflow/compiler/xla/status_macros.cc
@@ -25,6 +25,13 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+ABSL_CONST_INIT const char kPossibleAutoJitAlternative[] =
+    "This error might be occurring with the use of xla.compile. If it is not "
+    "necessary that every Op be compiled with XLA, an alternative is to use "
+    "auto_jit with OptimizerOptions.global_jit_level = ON_2 or the environment "
+    "variable TF_XLA_FLAGS=\"tf_xla_auto_jit=2\" which will attempt to use xla "
+    "to compile as much of the graph as the compiler is able to.";
+
 static Status MakeStatus(tensorflow::error::Code code, const string& message) {
   return Status(code, message);
 }
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h
index e51dd64e2a3dc7c359918cb08c6c94b2b4d9e91b..315136acc71670fa3ad48da4dc064e384ddadaa9 100644
--- a/tensorflow/compiler/xla/status_macros.h
+++ b/tensorflow/compiler/xla/status_macros.h
@@ -30,6 +30,10 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+// This is a useful error message when encountering XLA Compiler errors that
+// could be handled with the non-strict AutoJit mode.
+extern const char kPossibleAutoJitAlternative[];
+
 // Stream object used to collect error messages in MAKE_ERROR macros
 // or append error messages with APPEND_ERROR.  It accepts any
 // arguments with operator<< to build an error string, and then has an
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 4e7480c9af46beb95f49f3db4ff764326e5e9882..ab875c1c4bc7464e4b397838a6ad580eb18cdcc6 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -317,7 +317,12 @@ xla_test(
     name = "conv_depthwise_backprop_filter_test",
     timeout = "long",
     srcs = ["conv_depthwise_backprop_filter_test.cc"],
-    shard_count = 1,
+    # these backends do not natively handle batch group counts.
+    blacklisted_backends = [
+        "gpu",
+        "cpu",
+    ],
+    shard_count = 6,
     deps = [
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -547,6 +552,7 @@ xla_test(
 xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
+    shard_count = 2,
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -669,23 +675,21 @@ xla_test(
 )
 
 xla_test(
-    name = "exhaustive_f32_elementwise_op_test",
-    size = "enormous",
-    srcs = ["exhaustive_f32_elementwise_op_test.cc"],
-    backends = [
-        "cpu",
-        "gpu",
-    ],
+    name = "exhaustive_op_test",
+    srcs = ["exhaustive_op_test.cc"],
+    real_hardware_only = True,  # Very slow on the interpreter.
     shard_count = 48,
     tags = [
-        "broken",
-        "manual",
-        "notap",
+        "optonly",
+        # This is a big test that we skip for capacity reasons in OSS testing.
+        "no_oss",
     ],
     deps = [
         ":client_library_test_base",
         ":literal_test_util",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
@@ -730,6 +734,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -788,6 +793,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -832,6 +838,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1141,7 +1148,7 @@ xla_test(
 xla_test(
     name = "reduce_test",
     srcs = ["reduce_test.cc"],
-    shard_count = 40,
+    shard_count = 31,
     tags = [
         "optonly",
     ],
@@ -1389,8 +1396,8 @@ xla_test(
 )
 
 xla_test(
-    name = "fmax_test",
-    srcs = ["fmax_test.cc"],
+    name = "fmax_fmin_test",
+    srcs = ["fmax_fmin_test.cc"],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1595,6 +1602,39 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "multi_device_all_reduce_test",
+    srcs = ["multi_device_all_reduce_test.cc"],
+    backends = ["gpu"],
+    tags = [
+        "manual",
+        "multi_gpu",
+        "no_oss",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
@@ -1783,7 +1823,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
-        "//tensorflow/compiler/xla/service/gpu:gpu_compiler",
+        "//tensorflow/compiler/xla/service/gpu:nvptx_compiler",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/stream_executor",
@@ -2155,3 +2195,46 @@ xla_test(
         "//tensorflow/compiler/xla:test",
     ],
 )
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "noasan",  # sometimes times out, http://b/78650012
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+xla_test(
+    name = "cholesky_test",
+    srcs = ["cholesky_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 7379fbcc22745f46f2a29732c4bda46f352d07e7..21458b40b10858599f15d1fcaf5acb617eafae0e 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -350,9 +349,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
                              error_spec_);
 }
 
-// TODO(b/119692968): This test runs OOM on the GPU and CPU backend.
-XLA_TEST_F(ArrayElementwiseOpTest,
-           DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) {
+XLA_TEST_F(ArrayElementwiseOpTest, DeeplyNestedAddWithSlices) {
   XlaBuilder builder(TestName());
   std::vector<float> values(30, 0.0);
   auto a_literal = LiteralUtil::CreateR1<float>(values);
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 05d4d04034bf50c8bb840e59b28a590fce048c19..c14d279ac560db33066ae4fc68b6290f7499bb39 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -34,6 +34,7 @@ def xla_test(
         xla_test_library_deps = [],
         backends = [],
         blacklisted_backends = [],
+        real_hardware_only = False,
         args = [],
         tags = [],
         copts = [],
@@ -108,6 +109,10 @@ def xla_test(
         use for that target.
       **kwargs: Additional keyword arguments to pass to native.cc_test.
     """
+
+    # All of the backends in all_backends are real hardware.
+    _ignore = [real_hardware_only]
+
     test_names = []
     if not backends:
         backends = all_backends
diff --git a/tensorflow/compiler/xla/client/lib/cholesky_test.cc b/tensorflow/compiler/xla/tests/cholesky_test.cc
similarity index 54%
rename from tensorflow/compiler/xla/client/lib/cholesky_test.cc
rename to tensorflow/compiler/xla/tests/cholesky_test.cc
index 095dd4fbf8b7c90047c4428b50c626c16e9c1e94..272d5784362dd347061e7178ff48f9fab4ffd822 100644
--- a/tensorflow/compiler/xla/client/lib/cholesky_test.cc
+++ b/tensorflow/compiler/xla/tests/cholesky_test.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/cholesky.h"
-
+#include <limits>
 #include <memory>
 #include <numeric>
 #include <vector>
@@ -32,27 +31,27 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
+namespace xla {
 namespace {
 
-using xla::int64;
-
-using CholeskyTest = xla::ClientLibraryTestBase;
+using CholeskyTest = ClientLibraryTestBase;
 
-XLA_TEST_F(CholeskyTest, Simple) {
-  xla::XlaBuilder builder(TestName());
+XLA_TEST_F(CholeskyTest, Lower) {
+  XlaBuilder builder(TestName());
 
-  xla::Array2D<float> a_vals({
-      {4, 6, 8, 10},
-      {6, 45, 54, 63},
-      {8, 54, 146, 166},
+  float nan = std::numeric_limits<float>::quiet_NaN();
+  Array2D<float> a_vals({
+      {4, nan, nan, nan},
+      {6, 45, nan, nan},
+      {8, 54, 146, nan},
       {10, 63, 166, 310},
   });
 
-  xla::XlaOp a;
+  XlaOp a;
   auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
-  xla::Cholesky(a, /*block_size=*/2);
+  LowerTriangle(Cholesky(a, /*lower=*/true));
 
-  xla::Array2D<float> expected({
+  Array2D<float> expected({
       {2, 0, 0, 0},
       {3, 6, 0, 0},
       {4, 7, 9, 0},
@@ -60,34 +59,62 @@ XLA_TEST_F(CholeskyTest, Simple) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
-                             xla::ErrorSpec(1e-4, 1e-4));
+                             ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(CholeskyTest, Upper) {
+  XlaBuilder builder(TestName());
+
+  float nan = std::numeric_limits<float>::quiet_NaN();
+  Array2D<float> a_vals({
+      {4, 6, 8, 10},
+      {nan, 45, 54, 63},
+      {nan, nan, 146, 166},
+      {nan, nan, nan, 310},
+  });
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  UpperTriangle(Cholesky(a, /*lower=*/false));
+
+  Array2D<float> expected({
+      {2, 3, 4, 5},
+      {0, 6, 7, 8},
+      {0, 0, 9, 10},
+      {0, 0, 0, 11},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
+                             ErrorSpec(1e-4, 1e-4));
 }
 
 XLA_TEST_F(CholeskyTest, Simple2) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::Array2D<float> a_vals({
+  Array2D<float> a_vals({
       {16, 24, 8, 12},
       {24, 61, 82, 48},
       {8, 82, 456, 106},
       {12, 48, 106, 62},
   });
 
-  xla::XlaOp a;
+  XlaOp a;
   auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
-  xla::Cholesky(a);
+  LowerTriangle(Cholesky(a, /*lower=*/true));
 
-  xla::Array2D<float> expected(
-      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}});
+  Array2D<float> expected({{4, 0, 0, 0},    //
+                           {6, 5, 0, 0},    //
+                           {2, 14, 16, 0},  //
+                           {3, 6, 1, 4}});
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
-                             xla::ErrorSpec(1e-4, 1e-4));
+                             ErrorSpec(1e-4, 1e-4));
 }
 
 XLA_TEST_F(CholeskyTest, SimpleBatched) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
-  xla::Array3D<float> a_vals({
+  Array3D<float> a_vals({
       {
           {4, 6, 8, 10},
           {6, 45, 54, 63},
@@ -102,65 +129,78 @@ XLA_TEST_F(CholeskyTest, SimpleBatched) {
       },
   });
 
-  xla::XlaOp a;
+  XlaOp a;
   auto a_data = CreateR3Parameter<float>(a_vals, 0, "a", &builder, &a);
-  xla::Cholesky(a);
+  LowerTriangle(Cholesky(a, /*lower=*/true));
 
-  xla::Array3D<float> expected({
+  Array3D<float> expected({
       {
           {2, 0, 0, 0},
           {3, 6, 0, 0},
           {4, 7, 9, 0},
           {5, 8, 10, 11},
       },
-      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}},
+      {{4, 0, 0, 0},    //
+       {6, 5, 0, 0},    //
+       {2, 14, 16, 0},  //
+       {3, 6, 1, 4}},
   });
 
   ComputeAndCompareR3<float>(&builder, expected, {a_data.get()},
-                             xla::ErrorSpec(1e-4, 1e-4));
+                             ErrorSpec(1e-4, 1e-4));
 }
 
-using CholeskyTestCase = std::tuple<int64, int64>;
+using CholeskyTestCase = std::tuple<int64, int64, bool>;
 
 class RandomCholeskyTest
-    : public xla::ClientLibraryTestBase,
+    : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<CholeskyTestCase> {};
 
 XLA_TEST_P(RandomCholeskyTest, Random) {
-  xla::XlaBuilder builder(TestName());
+  XlaBuilder builder(TestName());
 
   auto test_params = GetParam();
   std::vector<int64> dimensions = {std::get<0>(test_params),
                                    std::get<1>(test_params),
                                    std::get<1>(test_params)};
-  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, dimensions);
+  bool lower = std::get<2>(test_params);
+  Shape shape = ShapeUtil::MakeShape(F32, dimensions);
   TF_ASSERT_OK_AND_ASSIGN(
-      auto literal,
-      xla::LiteralUtil::CreateRandomLiteral<xla::F32>(shape, 0.0, 1.0));
+      auto literal, LiteralUtil::CreateRandomLiteral<F32>(shape, 0.0, 1.0));
 
-  auto input = xla::Parameter(&builder, 0, shape, "input");
+  auto input = Parameter(&builder, 0, shape, "input");
   // Form a random positive definite matrix.
-  auto matrix = xla::BatchDot(input, TransposeInMinorDims(input),
-                              xla::PrecisionConfig::HIGHEST);
+  auto matrix =
+      BatchDot(input, TransposeInMinorDims(input), PrecisionConfig::HIGHEST);
 
-  auto cholesky = xla::Cholesky(matrix, /*block_size=*/4);
+  auto cholesky = Triangle(Cholesky(matrix, lower), lower);
 
   // Verify that ||matrix - cholesky * cholesky_t||_2 ~= 0
-  auto verification = xla::BatchDot(cholesky, TransposeInMinorDims(cholesky),
-                                    xla::PrecisionConfig::HIGHEST);
+  XlaOp verification;
+  if (lower) {
+    verification = BatchDot(cholesky, TransposeInMinorDims(cholesky),
+                            PrecisionConfig::HIGHEST);
+  } else {
+    verification = BatchDot(TransposeInMinorDims(cholesky), cholesky,
+                            PrecisionConfig::HIGHEST);
+  }
   auto delta = matrix - verification;
-  xla::Reduce(delta * delta, xla::ConstantR0<float>(&builder, 0.0),
-              CreateScalarAddComputation(xla::F32, &builder), {0, 1, 2});
+  Reduce(delta * delta, ConstantR0<float>(&builder, 0.0),
+         CreateScalarAddComputation(F32, &builder), {0, 1, 2});
 
   TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(literal));
   ComputeAndCompareR0<float>(&builder, 0.0, {input_data.get()},
-                             xla::ErrorSpec(1e-4, 1e-4));
+                             ErrorSpec(1e-4, 1e-4));
 }
 
 INSTANTIATE_TEST_SUITE_P(RandomCholeskyTestInstance, RandomCholeskyTest,
-                         ::testing::Values(CholeskyTestCase{1, 1},
-                                           CholeskyTestCase{1, 2},
-                                           CholeskyTestCase{10, 5},
-                                           CholeskyTestCase{2, 20}));
+                         ::testing::Values(CholeskyTestCase{1, 1, true},
+                                           CholeskyTestCase{1, 2, true},
+                                           CholeskyTestCase{1, 50, true},
+                                           CholeskyTestCase{1, 50, false},
+                                           CholeskyTestCase{10, 5, true},
+                                           CholeskyTestCase{5, 10, false},
+                                           CholeskyTestCase{2, 20, true}));
 
 }  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index edb95c973b70e30702ed8490c15a48d4d5604170..0e99ede5d01fcfa88c54c9cbc5a6a85bf8f15ddf 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -41,8 +41,9 @@ constexpr char kInterpreter[] = "interpreter";
 
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
-Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
-  StatusOr<Client*> result =
+LocalClient* GetOrCreateLocalClientOrDie(
+    const LocalClientOptions& client_options) {
+  StatusOr<LocalClient*> result =
       ClientLibrary::GetOrCreateLocalClient(client_options);
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 3f65ed7fce4ff4b5c3781ac2581935bfacc69ce1..d700437ed355c144639f76d683055e211975fde9 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -385,8 +385,8 @@ class ClientLibraryTestBase : public ::testing::Test {
   StatusOr<std::pair<Literal, Literal>> ComputeValueAndReference(
       XlaBuilder* builder, absl::Span<const Literal> arguments);
 
-  Client* client_;
-  Client* ref_client_;  // To compute reference result.
+  LocalClient* client_;
+  LocalClient* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index 32cac499c7439af80bafb88ac61b0b078f589599..f75c3fb01e2c854475537ca4b413f381cf74355c 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <random>
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -169,6 +170,11 @@ class ConditionalOpTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.001};
 };
 
+// Test fixture to run indexed conditional (switch/case) tests with varying
+// number of branches.
+class CaseOpTest : public ConditionalOpTest,
+                   public ::testing::WithParamInterface<int> {};
+
 // Test true and false computations that do not take any parameters.
 XLA_TEST_F(ConditionalOpTest, Parameters0) {
   XlaBuilder builder(TestName());
@@ -182,6 +188,36 @@ XLA_TEST_F(ConditionalOpTest, Parameters0) {
   ComputeAndCompareR0<float>(&builder, 56.0f, {pred_arg.get()}, error_spec_);
 }
 
+// Test branch computations that do not take any parameters.
+XLA_TEST_P(CaseOpTest, Parameters0) {
+  int num_branches = GetParam();
+  for (int bi = -1; bi <= num_branches; ++bi) {
+    SCOPED_TRACE(bi);
+    XlaBuilder builder(TestName());
+    XlaOp branch_index;
+    auto branch_index_arg = CreateR0Parameter<int32>(bi, 0, "branch_index_arg",
+                                                     &builder, &branch_index);
+    auto operand = Tuple(&builder, {});
+
+    std::vector<XlaOp> operands(num_branches, operand);
+    std::vector<XlaComputation> branches;
+    branches.reserve(num_branches);
+    std::vector<const XlaComputation*> branches_p(num_branches);
+    for (int i = 0; i < num_branches; ++i) {
+      branches.emplace_back(
+          CreateR0ConstantComputation(static_cast<float>(i) * 10));
+      branches_p[i] = &branches[i];
+    }
+    Conditional(branch_index, branches_p, operands);
+
+    float expected = 10 * static_cast<float>((bi < 0 || bi >= num_branches)
+                                                 ? num_branches - 1
+                                                 : bi);
+    ComputeAndCompareR0<float>(&builder, expected, {branch_index_arg.get()},
+                               error_spec_);
+  }
+}
+
 // Test true and false computations that take in 1 parameter.
 XLA_TEST_F(ConditionalOpTest, Parameters1) {
   XlaBuilder builder(TestName());
@@ -195,6 +231,45 @@ XLA_TEST_F(ConditionalOpTest, Parameters1) {
   ComputeAndCompareR0<float>(&builder, 12.0f, {pred_arg.get()}, error_spec_);
 }
 
+// Test branch computations that take in 1 parameter.
+XLA_TEST_P(CaseOpTest, Parameters1) {
+  int num_branches = GetParam();
+  for (int bi = -1; bi <= num_branches; ++bi) {
+    SCOPED_TRACE(bi);
+    XlaBuilder builder(TestName());
+    XlaOp branch_index;
+    auto branch_index_arg = CreateR0Parameter<int32>(bi, 0, "branch_index_arg",
+                                                     &builder, &branch_index);
+
+    auto make_branch = [&builder, this](int i) {
+      auto sb = builder.CreateSubBuilder(absl::StrCat("branch_", i));
+      Add(ConstantR0<float>(sb.get(), static_cast<float>(i)),
+          Parameter(sb.get(), 0, r0f32_, "p0"));
+      return sb->BuildAndNoteError();
+    };
+    std::vector<XlaComputation> branches;
+    branches.reserve(num_branches);
+    std::vector<const XlaComputation*> branches_p(num_branches);
+    std::vector<XlaOp> operands;
+    operands.reserve(num_branches);
+    std::vector<float> expecteds(num_branches);
+    for (int i = 0; i < num_branches; ++i) {
+      branches.emplace_back(make_branch(i));
+      branches_p[i] = &branches[i];
+      auto fi = static_cast<float>(i);
+      operands.emplace_back(ConstantR0<float>(&builder, 10 * fi + 7));
+      expecteds[i] = 10 * fi + 7 + fi;
+    }
+
+    Conditional(branch_index, branches_p, operands);
+    float expected = (bi < 0 || bi >= num_branches)
+                         ? expecteds[num_branches - 1]
+                         : expecteds[bi];
+    ComputeAndCompareR0<float>(&builder, expected, {branch_index_arg.get()},
+                               error_spec_);
+  }
+}
+
 // Test conditional with two different computations in the true and false cases
 // that take in different arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
@@ -331,6 +406,46 @@ XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
                              error_spec_);
 }
 
+// Test branch computations that take in 2 array parameters.
+XLA_TEST_P(CaseOpTest, Parameters2Array) {
+  int num_branches = GetParam();
+  for (int bi = -1; bi <= num_branches; ++bi) {
+    SCOPED_TRACE(bi);
+    XlaBuilder builder(TestName());
+    XlaOp branch_index;
+    auto branch_index_arg =
+        CreateR0Parameter<int32>(bi, 0, "pred", &builder, &branch_index);
+    auto operand1 = ConstantR1<float>(&builder, {24.0f, 56.0f});
+    auto operand2 = ConstantR1<float>(&builder, {10.0f, 11.0f});
+    auto operands = Tuple(&builder, {operand1, operand2});
+    auto make_branch = [&builder, this](int i) {
+      auto sb = builder.CreateSubBuilder(absl::StrCat("branch_", i));
+      auto p = Parameter(sb.get(), 0, tuple_2_r1s2f32_, "p0");
+      Add(Mul(ConstantR0<float>(sb.get(), static_cast<float>(i)),
+              GetTupleElement(p, 0)),
+          GetTupleElement(p, 1));
+      return sb->BuildAndNoteError();
+    };
+    std::vector<XlaComputation> branches;
+    branches.reserve(num_branches);
+    std::vector<const XlaComputation*> branches_p(num_branches);
+    for (int i = 0; i < num_branches; ++i) {
+      branches.emplace_back(make_branch(i));
+      branches_p[i] = &branches[i];
+    }
+    Conditional(branch_index, branches_p,
+                std::vector<XlaOp>(num_branches, operands));
+    auto modified_bi = static_cast<float>(
+        (bi < 0 || bi >= num_branches) ? num_branches - 1 : bi);
+    ComputeAndCompareR1<float>(
+        &builder, {24.0f * modified_bi + 10, 56.0f * modified_bi + 11},
+        {branch_index_arg.get()}, error_spec_);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(CaseOpTest_Instantiation, CaseOpTest,
+                         ::testing::Values(1, 2, 3, 4, 5));
+
 // Test true and false computations that take in 2 array parameters and
 // predicate is false.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
@@ -582,8 +697,8 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
   auto result = builder.Build();
   EXPECT_FALSE(result.ok());
   EXPECT_THAT(result.status().error_message(),
-              ::testing::HasSubstr("true_operand must match the shape of the "
-                                   "only parameter of true_computation"));
+              ::testing::HasSubstr("operand 0 must match the shape of the "
+                                   "only parameter of branch computation 0"));
 }
 
 XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 9174f2651cb90b364f869364fe108cf208c11a84..6530007871ced1d0bbffe2b44ccc8cf9bddd79e1 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -180,6 +181,29 @@ TEST_F(ConstantsTest, Token) {
   TF_ASSERT_OK(Execute(&builder, {}).status());
 }
 
+TEST_F(ConstantsTest, FullLike) {
+  XlaBuilder b(TestName());
+  auto val1 = Iota(&b, F32, 3);
+  auto val2 = FullLike(val1, 10);
+  val1 + val2;
+  ComputeAndCompareR1<float>(&b, {10, 11, 12}, {}, error_spec_);
+}
+
+TEST_F(ConstantsTest, IllegalFullLikeOnTuple) {
+  XlaBuilder b(TestName());
+  auto tuple = Tuple(&b, {Iota(&b, F32, 3), Iota(&b, F32, 1)});
+  FullLike(tuple, 10);  // Illegal; can't do FullLike on a tuple.
+  EXPECT_FALSE(b.Build().ok());
+}
+
+TEST_F(ConstantsTest, FullLikeScalar) {
+  XlaBuilder b(TestName());
+  auto scalar1 = ConstantR0WithType(&b, F32, 1);
+  auto scalar2 = FullLike(scalar1, 2);
+  scalar1 - scalar2;
+  ComputeAndCompareR0<float>(&b, -1, {}, error_spec_);
+}
+
 class ConstantsHloTest : public HloTestBase {};
 
 // TODO(b/121147351): Fails on GPU. Not clear if this is expected behavior.
@@ -200,9 +224,7 @@ XLA_TEST_F(ConstantsHloTest, DISABLED_ON_GPU(BitcastOfConstant)) {
       ROOT result = s32[] call(parameter.0, constant-as-scalar), to_apply=func
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR0<int32>(1);
   auto result = ExecuteNoHloPasses(std::move(module), {&param});
   EXPECT_TRUE(LiteralTestUtil::Equal(param, result));
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
index 90c197140359d0021d08931b73f221d659e71144..dfbf0478e62713635446d11557367cfac6ab0dce 100644
--- a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
@@ -32,25 +32,26 @@ string GetFloatDataType(bool use_bfloat16) {
   return use_bfloat16 ? "bf16" : "f32";
 }
 
-struct DepthwiseConvolution2DSpec {
-  int64 output_batch, window;
+struct BatchGroupedConvolution2DSpec {
+  int64 output_batch, window, window_dilation;
   std::vector<int64> activation_dims;
-  std::vector<int64> activation_layout;
   std::vector<int64> kernel_dims;
-  std::vector<int64> kernel_layout;
   std::vector<int64> output_dims;
+  std::vector<int64> activation_and_kernel_layout;
   std::vector<int64> output_layout;
 };
 
-class DepthwiseConvolution2DTest
+class BatchGroupedConvolution2DTest
     : public HloTestBase,
       public ::testing::WithParamInterface<
-          ::testing::tuple<DepthwiseConvolution2DSpec, bool>> {};
+          ::testing::tuple<BatchGroupedConvolution2DSpec, bool>> {};
 
-static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
-  std::vector<DepthwiseConvolution2DSpec> config_set;
+static std::vector<BatchGroupedConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<BatchGroupedConvolution2DSpec> config_set;
   std::vector<std::vector<int64>> config_options = {
-      {16, 5, 5, 2}, {64, 4, 4, 16}, {2, 5, 5, 256}};
+      {8, 5, 3, 2},      {4, 5, 5, 2},    {8, 7, 4, 128},
+      {16, 20, 20, 256}, {256, 7, 5, 4},  {256, 6, 6, 4},
+      {256, 8, 8, 512},  {64, 7, 7, 960}, {64, 14, 14, 576}};
 
   for (auto option : config_options) {
     int64 feature = option[3];
@@ -58,93 +59,120 @@ static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
     int64 kernel_size = option[2];
     int64 batch = option[0];
 
-    std::vector<int64> kernel_layout = {3, 2, 1, 0};
-    DepthwiseConvolution2DSpec config;
+    BatchGroupedConvolution2DSpec config;
+    config.window_dilation = 1;
     config.output_batch = feature;
     config.window = kernel_size;
 
     config.activation_dims = {batch, activation_size, activation_size, feature};
-    config.activation_layout = {0, 3, 2, 1};
 
     config.kernel_dims = {batch, kernel_size, kernel_size, feature};
-    config.kernel_layout = {0, 2, 3, 1};
 
-    config.output_dims = {3, 3, feature, 1};
+    int64 output_space_size = 3 + activation_size - kernel_size;
+    config.output_dims = {output_space_size, output_space_size, feature, 1};
 
-    // Try this layout for all kernel shapes.
-    config.output_layout = {3, 2, 0, 1};
+    config.activation_and_kernel_layout = {0, 3, 1, 2};
+    config.output_layout = {2, 3, 0, 1};
     config_set.push_back(config);
+
+    BatchGroupedConvolution2DSpec different_layout_config = config;
+    different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+    config_set.push_back(different_layout_config);
+
+    // Add configurations for window dilation cases.
+    if (activation_size % 2 == 0 && activation_size == kernel_size) {
+      BatchGroupedConvolution2DSpec config;
+      config.window_dilation = 2;
+      config.output_batch = feature;
+      config.window = kernel_size / 2;
+      config.activation_dims = {batch, activation_size, activation_size,
+                                feature};
+      config.kernel_dims = {batch, kernel_size / 2, kernel_size / 2, feature};
+      config.activation_and_kernel_layout = {0, 3, 1, 2};
+      config.output_layout = {2, 3, 0, 1};
+
+      int64 output_space_size = 5;
+      config.output_dims = {output_space_size, output_space_size, feature, 1};
+
+      config_set.push_back(config);
+
+      BatchGroupedConvolution2DSpec different_layout_config = config;
+      different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+      config_set.push_back(different_layout_config);
+    }
   }
 
   return config_set;
 }
 
-string DepthwiseConvolution2DTestDataToString(
+string BatchGroupedConvolution2DTestDataToString(
     const ::testing::TestParamInfo<
-        ::testing::tuple<DepthwiseConvolution2DSpec, bool>>& data) {
+        ::testing::tuple<BatchGroupedConvolution2DSpec, bool>>& data) {
   const auto& spec = ::testing::get<0>(data.param);
   const string data_type = GetFloatDataType(::testing::get<1>(data.param));
   string str = absl::StrCat(
       "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
-      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
-      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
-      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
-      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
-      absl::StrJoin(spec.output_layout, "_"), data_type);
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"),
+      "_activation_layout_",
+      absl::StrJoin(spec.activation_and_kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), data_type, "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"));
 
   // Test names are not allowed to contain the '-' character.
   absl::c_replace(str, '-', 'n');
   return str;
 }
 
-string BuildHloTextDepthwiseConvolution2D(
-    const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) {
+string BuildHloTextBatchGroupedConvolution2D(
+    const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16) {
   const string data_type = GetFloatDataType(use_bfloat16);
   return absl::StrFormat(
       R"(
-    HloModule TensorFlowDepthwiseConv
+    HloModule TensorFlowDepthwiseConv, is_scheduled=true
 
     ENTRY main {
       activation = %s[%s]{%s} parameter(0)
       kernel = %s[%s]{%s} parameter(1)
       ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
-          window={size=%dx%d pad=1_1x1_1}, dim_labels=f01b_i01o->01fb,
+          window={size=%dx%d pad=1_%dx1_%d rhs_dilate=%dx%d}, dim_labels=f01b_i01o->01fb,
           batch_group_count=%d
     }
     )",
       data_type, absl::StrJoin(spec.activation_dims, ","),
-      absl::StrJoin(spec.activation_layout, ","), data_type,
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
       absl::StrJoin(spec.kernel_dims, ","),
-      absl::StrJoin(spec.kernel_layout, ","), data_type,
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
       absl::StrJoin(spec.output_dims, ","),
       absl::StrJoin(spec.output_layout, ","), data_type,
       absl::StrJoin(spec.activation_dims, ","),
-      absl::StrJoin(spec.activation_layout, ","), data_type,
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
       absl::StrJoin(spec.kernel_dims, ","),
-      absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
-      spec.output_batch);
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), spec.window,
+      spec.window, spec.window_dilation, spec.window_dilation,
+      spec.window_dilation, spec.window_dilation, spec.output_batch);
 }
 
-XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
-  const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) {
+  const BatchGroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam());
   bool use_bfloat16 = ::testing::get<1>(GetParam());
   const string hlo_text =
-      BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
-
-  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
-                            [](HloModule* module) -> Status {
-                              BFloat16MixedPrecisionRemoval remover;
-                              TF_RETURN_IF_ERROR(remover.Run(module).status());
-                              Despecializer despecializer;
-                              return despecializer.Run(module).status();
-                            }));
+      BuildHloTextBatchGroupedConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(
+      hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status {
+        BFloat16MixedPrecisionRemoval remover;
+        TF_RETURN_IF_ERROR(remover.Run(module).status());
+        Despecializer despecializer;
+        return despecializer.Run(module).status();
+      }));
 }
 
 INSTANTIATE_TEST_CASE_P(
-    DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest,
+    BatchGroupedConvolution2DTestWithRandomIndices,
+    BatchGroupedConvolution2DTest,
     ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
                        ::testing::Bool()),
-    DepthwiseConvolution2DTestDataToString);
+    BatchGroupedConvolution2DTestDataToString);
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 9db9f2563b636c4f929585eb13a9c7f809833eda..cfee9c0f8a4c908d5dbdd5345ed7f839dfa4dee2 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -1945,7 +1945,7 @@ XLA_TEST_F(ConvolutionTest, ConvolveF32BackwardInputGroupedConvolution) {
 
 class ConvolutionHloTest : public HloTestBase {};
 
-XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64Forward)) {
+XLA_TEST_F(ConvolutionHloTest, ConvolveF64Forward) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
 
@@ -1957,7 +1957,7 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
-XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF32ForwardReversed)) {
+XLA_TEST_F(ConvolutionHloTest, ConvolveF32ForwardReversed) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
 
@@ -1969,7 +1969,7 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
-XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
+XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardFilter) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
 
@@ -1981,7 +1981,7 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
-XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardInput)) {
+XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardInput) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index cad43d1b5547d74701760fa623e50466fc15c263..4687ed61a7de91bc1bce0efeadf1965ad7d52d55 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -172,8 +172,10 @@ XLA_TEST_F(CustomCallTest, LayoutConstrained) {
 
   const Shape& r2f32_dim0_major =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
-  b.AddInstruction(HloInstruction::CreateCustomCall(
+  auto custom_call = b.AddInstruction(HloInstruction::CreateCustomCall(
       r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
+  b.AddInstruction(
+      custom_call->CloneWithNewOperands(r2f32_dim0_major, {custom_call}));
 
   module->AddEntryComputation(b.Build());
   ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
@@ -182,7 +184,7 @@ XLA_TEST_F(CustomCallTest, LayoutConstrained) {
   Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
 
   Literal result = ExecuteAndTransfer(std::move(module), {&argument});
-  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
+  LiteralTestUtil::ExpectR2Equal<float>({{3.f, 4.f}, {5.f, 6.f}}, result);
 }
 
 XLA_TEST_F(CustomCallTest, TupleOutput) {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 33d816292c5c391d2ca5a0ebaf4c80e9cc0dc88c..414d0b14a6b4f0307851fcc717c5e8a74a33782b 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -1157,6 +1158,53 @@ XLA_TEST_F(DotOperationTest, DotRank2AndRank2NonDefaultContractionDims) {
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
+using EinsumParamType =
+    std::tuple<std::vector<int64>, std::vector<int64>, string>;
+class EinsumTest : public DotOperationTest,
+                   public ::testing::WithParamInterface<EinsumParamType> {};
+XLA_TEST_P(EinsumTest, SimpleEinsumTest) {
+  XlaBuilder builder(TestName());
+  auto x = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<0>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  auto y = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<1>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  Einsum(x, y, std::get<2>(GetParam()));
+  ComputeAndCompare(&builder, {}, ErrorSpec{1e-3, 1e-3});
+}
+
+std::vector<EinsumParamType> GetEinsumTestCases() {
+  using v = std::vector<int64>;
+  using p = EinsumParamType;
+  std::vector<p> test_cases = {
+      p{v{5, 6}, v{6, 7}, "mk,kn->mn"},
+      p{v{5, 6}, v{6, 7}, "mk,kn->nm"},
+      p{v{5, 6, 11}, v{6, 11, 7}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->Bnm"},
+      p{v{8, 55, 11, 3}, v{55, 11, 3, 29}, "mkBC,kBCn->BCnm"},
+      p{v{5, 6}, v{6, 7}, "ab,cd->dcba"},
+      p{v{6}, v{6, 7}, "b,bc->c"},
+      p{v{5, 6, 7}, v{5, 6, 7}, "abc,abc->ab"},
+      p{v{5, 6, 7}, v{7, 6, 5}, "abc,cba->ca"},
+      p{v{77}, v{77}, "a,a->a"},
+      p{v{77}, v{77, 55}, "a,ab->ba"},
+      p{v{2, 3, 77}, v{77, 2, 3, 55}, "ija,aijb->baij"},
+      p{v{55}, v{}, "a,->a"},
+      p{v{11, 111}, v{11}, "ab,a->ab"},
+      p{v{16, 34}, v{16, 34}, "ab,ab->ab"},
+      p{v{16, 3, 34}, v{3, 16, 34}, "abc,bac->abc"},
+      p{v{5, 19}, v{}, "ab,->ab"},
+  };
+  return test_cases;
+}
+
+INSTANTIATE_TEST_CASE_P(Einsum, EinsumTest,
+                        ::testing::ValuesIn(GetEinsumTestCases()));
+
 class DotOperationTextTest : public HloTestBase {};
 
 XLA_TEST_F(DotOperationTextTest, DotReorderedDotDims) {
@@ -1189,5 +1237,129 @@ ENTRY %test {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
 }
 
+XLA_TEST_F(DotOperationTextTest, DotWithNoDnums) {
+  absl::string_view hlo_string =
+      R"(
+HloModule DotWithNoDnums
+
+ENTRY %test {
+  %lhs = f32[2,3]{1,0} parameter(0)
+  %rhs = f32[4,5]{1,0} parameter(1)
+  ROOT %dot = f32[2,3,4,5]{3,2,1,0} dot(%lhs, %rhs)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, Einsum) {
+  absl::string_view hlo_string =
+      R"(
+HloModule Einsum
+
+ENTRY %test {
+  %lhs = f32[8,64,96]{2,1,0} parameter(0)
+  %rhs = f32[96,32,4]{2,1,0} parameter(1)
+  ROOT %dot = f32[8,64,32,4]{3,2,1,0}  dot(%lhs, %rhs), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_1) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(2)
+  rhs_1 = f32[1,40] parameter(1)
+
+  dot_0 = f32[20,1] dot(lhs, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[20,1] dot(lhs, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  ROOT result = f32[20,1] divide(dot_0, dot_1)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_2) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs_0 = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(1)
+  lhs_1 = f32[1,40] parameter(2)
+  rhs_1 = f32[20,40] parameter(3)
+
+  dot_0 = f32[20,1] dot(lhs_0, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[1,20] dot(lhs_1, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  dot_0_reshaped = f32[20] reshape(dot_0)
+  dot_1_reshaped = f32[20] reshape(dot_1)
+
+  ROOT result = f32[20] divide(dot_0_reshaped, dot_1_reshaped)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuIntegerDotCodegen)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule SmallIntegerDot
+
+ENTRY SmallIntegerDot {
+  arg0 = s32[1,2,2] parameter(0)
+  arg1 = s32[1,2,1] parameter(1)
+  ROOT dot = s32[1,2,1] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuTransposeOutput)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule TransposeOutput
+
+ENTRY TransposeOutput {
+  p0 = f32[32,32] parameter(0)
+  p1 = f32[32,64] parameter(1)
+  dot = f32[32,64] dot(p0, p1), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  ROOT tr = f32[64,32] transpose(dot), dimensions={1,0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, MatrixVectorComplex) {
+  absl::string_view hlo_string =
+      R"(
+HloModule MatrixVectorComplex
+
+ENTRY MatrixVectorComplex {
+  p0 = c64[5,5] parameter(0)
+  p1 = c64[5,1] parameter(1)
+  p2 = c64[5,1] parameter(2)
+  dot = c64[5,1] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT add = c64[5,1] add(dot, p2)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
deleted file mode 100644
index c84973e17b234c24c84f02a369ce0185f5772cca..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "absl/base/casts.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-
-namespace xla {
-namespace {
-class ExhaustiveF32ElementwiseOpTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<std::pair<int64, int64>> {
- protected:
-  ErrorSpec error_spec_{0.0001, 0.0001, /*relaxed_nans=*/true};
-
-  template <typename EnqueueOpTy>
-  void ExhaustivelyTestF32Op(EnqueueOpTy enqueue_op,
-                             float (*evaluate_op)(float),
-                             std::pair<int64, int64> known_incorrect_range) {
-    int64 begin, end;
-    std::tie(begin, end) = GetParam();
-    int64 input_size = end - begin;
-    LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
-
-    XlaBuilder builder(TestName());
-
-    Literal input_literal =
-        LiteralUtil::CreateFromDimensions(F32, {input_size});
-    for (int64 i = begin; i < end; i++) {
-      if (i >= known_incorrect_range.first &&
-          i < known_incorrect_range.second) {
-        // If the operation is known to be buggy on a specific input clamp that
-        // input to 0 under the assumption that the op is at least correct on 0.
-        input_literal.Set({i - begin}, 0.0f);
-      } else {
-        input_literal.Set({i - begin}, absl::bit_cast<float, int>(i));
-      }
-    }
-
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                            client_->TransferToServer(input_literal));
-
-    auto input = Parameter(&builder, 0, input_literal.shape(), "input");
-    enqueue_op(&builder, input);
-
-    std::vector<float> expected_result;
-    expected_result.reserve(input_size);
-    for (int64 i = 0; i < input_size; i++) {
-      expected_result.push_back(evaluate_op(input_literal.Get<float>({i})));
-    }
-
-    ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
-                               error_spec_);
-  }
-};
-
-XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
-#ifdef XLA_TEST_BACKEND_CPU
-  // TODO(b/73141998): The vectorized Log implementation gives results outside
-  // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64).
-  std::pair<int64, int64> known_incorrect_range = {1, 8388608};
-#else
-  std::pair<int64, int64> known_incorrect_range = {0, 0};
-#endif
-
-  ExhaustivelyTestF32Op(
-      [](XlaBuilder* builder, const XlaOp& input) { Log(input); }, std::log,
-      known_incorrect_range);
-}
-
-XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) {
-#ifdef XLA_TEST_BACKEND_CPU
-  // TODO(b/73142289): The vectorized Exp implementation gives results outside
-  // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64):
-  std::pair<int64, int64> known_incorrect_range = {1107296256 + 11583654,
-                                                   1107296256 + 11629080};
-#else
-  std::pair<int64, int64> known_incorrect_range = {0, 0};
-#endif
-
-  ExhaustivelyTestF32Op(
-      [](XlaBuilder* builder, const XlaOp& input) { Exp(input); }, std::exp,
-      known_incorrect_range);
-}
-
-XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) {
-  ExhaustivelyTestF32Op(
-      [](XlaBuilder* builder, const XlaOp& input) { Tanh(input); }, std::tanh,
-      /*known_incorrect_range=*/{0, 0});
-}
-
-std::vector<std::pair<int64, int64>> CreateExhaustiveParameters() {
-  // We break up the 2^32-element space into small'ish chunks to keep peak
-  // memory usage low.
-  std::vector<std::pair<int64, int64>> result;
-  const int64 step = 1 << 25;
-  for (int64 i = 0; i < (1l << 32); i += step) {
-    result.push_back({i, i + step});
-  }
-  return result;
-}
-
-INSTANTIATE_TEST_CASE_P(ExhaustiveF32ElementwiseOpTestInstance,
-                        ExhaustiveF32ElementwiseOpTest,
-                        ::testing::ValuesIn(CreateExhaustiveParameters()));
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58bb9a217b805a142869149c19d7bcfc91a1aee1
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc
@@ -0,0 +1,646 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include "absl/base/casts.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+using Eigen::half;
+
+template <typename T, size_t N>
+T EvaluatePolynomial(T x, const std::array<T, N>& coeffs) {
+  T result = 0;
+  for (T c : coeffs) {
+    result = result * x + c;
+  }
+  return result;
+}
+
+// There's no std::erfinv, so we have to implement it ourselves.  This follows
+// Wichura 1998, https://www.jstor.org/stable/2347330 which, notably, is a
+// different implementation from that in math.cc.
+float HostErfInv(float x) {
+  std::array<double, 8> kPolyA = {
+      8.8709406962545514830200e2, 1.1819493347062294404278e4,
+      2.3782041382114385731252e4, 1.6235862515167575384252e4,
+      4.8548868893843886794648e3, 6.9706266534389598238465e2,
+      4.7072688112383978012285e1, 1.1975323115670912564578e0,
+  };
+  std::array<double, 8> kPolyB = {
+      5.2264952788528545610e3, 2.8729085735721942674e4, 3.9307895800092710610e4,
+      2.1213794301586595867e4, 5.3941960214247511077e3, 6.8718700749205790830e2,
+      4.2313330701600911252e1, 1.0000000000000000000e0,
+  };
+  std::array<double, 8> kPolyC = {
+      7.74545014278341407640e-4, 2.27238449892691845833e-2,
+      2.41780725177450611770e-1, 1.27045825245236838258e0,
+      3.64784832476320460504e0,  5.76949722146069140550e0,
+      4.63033784615654529590e0,  1.42343711074968357734e0,
+  };
+  std::array<double, 8> kPolyD = {
+      1.4859850019840355905497876e-9, 7.7441459065157709165577218e-4,
+      2.1494160384252876777097297e-2, 2.0945065210512749128288442e-1,
+      9.7547832001787427186894837e-1, 2.3707661626024532365971225e0,
+      2.9036514445419946173133295e0,  1.4142135623730950488016887e0,
+  };
+  std::array<double, 8> kPolyE = {
+      2.01033439929228813265e-7, 2.71155556874348757815e-5,
+      1.24266094738807843860e-3, 2.65321895265761230930e-2,
+      2.96560571828504891230e-1, 1.78482653991729133580e0,
+      5.46378491116411436990e0,  6.65790464350110377720e0,
+  };
+  std::array<double, 8> kPolyF = {
+      2.891024605872965461538222e-15, 2.010321207683943062279931e-7,
+      2.611088405080593625138020e-5,  1.112800997078859844711555e-3,
+      2.103693768272068968719679e-2,  1.936480946950659106176712e-1,
+      8.482908416595164588112026e-1,  1.414213562373095048801689e0,
+  };
+
+  if (std::abs(x) > 1 || std::isnan(x)) {
+    return std::numeric_limits<float>::quiet_NaN();
+  }
+  if (std::abs(x) == 1) {
+    return std::copysign(std::numeric_limits<float>::infinity(), x);
+  }
+
+  float unsigned_result = [&] {
+    float y = std::abs(x);
+    if (y <= 0.85) {
+      double r = 0.180625 - 0.25 * y * y;
+      return (y * EvaluatePolynomial(r, kPolyA)) /
+             EvaluatePolynomial(r, kPolyB);
+    } else {
+      double r = std::sqrt(std::log(2.0) - std::log1p(-y));
+      if (r <= 5.0) {
+        r -= 1.6;
+        return EvaluatePolynomial(r, kPolyC) / EvaluatePolynomial(r, kPolyD);
+      } else {
+        r -= 5;
+        return EvaluatePolynomial(r, kPolyE) / EvaluatePolynomial(r, kPolyF);
+      }
+    }
+  }();
+  return std::copysign(unsigned_result, x);
+}
+
+// Digamma implementation using a polynomial from Cephes.  Notably this is a
+// different implementation from the one in math.cc.
+float HostDigamma(float x) {
+  // Euler-Mascheroni constant
+  float kGamma = 0.57721566490153286061;
+  float kPi = M_PI;
+
+  std::array<float, 4> kPoly = {
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2,
+  };
+
+  float reflection = 0;
+  if (x <= 0) {
+    float floor = std::floor(x);
+    if (x == floor) {
+      return std::numeric_limits<float>::quiet_NaN();
+    }
+    // Compute reflection term, pi * cot(pi * x).
+    reflection = x - floor;
+    if (reflection == 0.5) {
+      reflection = 0;
+    } else {
+      if (reflection > 0.5) {
+        reflection = x - (floor + 1.0f);
+      }
+      reflection = kPi / std::tan(kPi * reflection);
+    }
+    x = 1 - x;
+  }
+
+  float result = 0;
+  if (x <= 10 && x == std::floor(x)) {
+    // Special case for integers <= 10.
+    for (int i = 1; i < x; ++i) {
+      result += 1.0f / i;
+    }
+    result -= kGamma;
+  } else {
+    float w = 0;
+    for (; x < 10; ++x) {
+      w += 1.0f / x;
+    }
+    if (x < 1e8) {
+      float z = 1.0f / (x * x);
+      result = z * EvaluatePolynomial(z, kPoly);
+    }
+    result = std::log(x) - 0.5f / x - result - w;
+  }
+
+  // Compute the final, reflected value.
+  return result - reflection;
+}
+
+// For f32, f16, and bf16, we need 9, 5, and 4 decimal places of precision to be
+// guaranteed that we're printing the full number.
+//
+// (The general formula is, given a floating-point number with S significand
+// bits, the number of decimal digits needed to print it to full precision is
+//
+//   ceil(1 + S * log_10(2)) ~= ceil(1 + S * 0.30103).
+//
+// See https://people.eecs.berkeley.edu/~wkahan/Math128/BinDecBin.pdf.)
+string StringifyNum(float x) {
+  return absl::StrFormat("%0.9g (0x%08x)", x, absl::bit_cast<uint32>(x));
+}
+
+string StringifyNum(half x) {
+  return absl::StrFormat("%0.5g (0x%04x)", static_cast<float>(x),
+                         absl::bit_cast<uint16>(x));
+}
+
+string StringifyNum(bfloat16 x) {
+  return absl::StrFormat("%0.4g (0x%04x)", static_cast<float>(x),
+                         absl::bit_cast<uint16>(x));
+}
+
+// Test parameter is a tuple containing
+//   - primitive type under test,
+//   - (begin, end) range under test, as zero-extended int64s bitcast to the
+//     primtive type under test.
+class ExhaustiveOpTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<PrimitiveType, std::pair<int64, int64>>> {
+ public:
+  ExhaustiveOpTest()
+      : ty_(std::get<0>(GetParam())), platform_(client_->platform()->Name()) {}
+
+  void Run(std::function<XlaOp(XlaOp)> enqueue_op,
+           float (*evaluate_op)(float)) {
+    SetFastMathDisabled(true);
+
+    // Run all HLO passes.  In particular, constant folding is disabled by
+    // default for tests, but we need to run it in order to tickle some bugs.
+    mutable_debug_options()->clear_xla_disable_hlo_passes();
+
+    PrimitiveType ty;
+    std::tie(ty, std::ignore) = GetParam();
+
+    switch (ty) {
+      case F32:
+        SetDefaultErrSpec(0.0001, 0.0001);
+        RunImpl<float, uint32>(enqueue_op, evaluate_op);
+        break;
+      case F16:
+        SetDefaultErrSpec(0.001, 0.001);
+        RunImpl<half, uint16>(enqueue_op, evaluate_op);
+        break;
+      case BF16:
+        SetDefaultErrSpec(0.001, 0.01);
+        RunImpl<bfloat16, uint16>(enqueue_op, evaluate_op);
+        break;
+      default:
+        LOG(FATAL) << "Unhandled type.";
+    }
+  }
+
+  void SetDefaultErrSpec(float abs_err, float rel_err) {
+    if (!abs_err_.has_value()) {
+      abs_err_ = abs_err;
+    }
+    if (!rel_err_.has_value()) {
+      rel_err_ = rel_err;
+    }
+  }
+
+  template <typename T, typename IntegralT>
+  void RunImpl(std::function<XlaOp(XlaOp)> enqueue_op,
+               float (*evaluate_op)(float)) {
+    static_assert(
+        sizeof(T) == sizeof(IntegralT),
+        "IntegralT must be an unsigned integer type of the same width as T.");
+
+    PrimitiveType ty;
+    std::pair<int64, int64> test_range;
+    std::tie(ty, test_range) = GetParam();
+    int64 begin, end;
+    std::tie(begin, end) = test_range;
+
+    if (begin >= known_incorrect_begin_ && end <= known_incorrect_end_) {
+      LOG(INFO) << absl::StreamFormat(
+          "Skipping this shard, as the range under test, [%d, %d), falls "
+          "entirely within the known-incorrect range [%d, %d).",
+          begin, end, known_incorrect_begin_, known_incorrect_end_);
+      return;
+    }
+
+    LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
+
+    int64 input_size = end - begin;
+    Literal input_literal = LiteralUtil::CreateFromDimensions(ty, {input_size});
+    absl::Span<T> input_arr = input_literal.data<T>();
+    for (int64 i = 0; i < input_size; i++) {
+      IntegralT input_val = i + begin;
+      // If the operation is known to be buggy on a specific input clamp that
+      // input to 0 under the assumption that the op is at least correct on 0.
+      if (input_val >= known_incorrect_begin_ &&
+          input_val < known_incorrect_end_) {
+        input_arr[i] = T{0};
+      } else {
+        input_arr[i] = absl::bit_cast<T>(input_val);
+      }
+    }
+
+    TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
+                            BuildAndRunComputation(enqueue_op, input_literal));
+    ExpectNear<T>(input_literal, result_literal, evaluate_op);
+  }
+
+  StatusOr<Literal> BuildAndRunComputation(
+      const std::function<XlaOp(XlaOp)>& enqueue_op,
+      const Literal& input_literal) {
+    XlaBuilder builder(TestName());
+    auto input = Parameter(&builder, 0, input_literal.shape(), "input");
+    enqueue_op(input);
+    TF_ASSIGN_OR_RETURN(XlaComputation comp, builder.Build());
+
+    // Build and run the computation using the LocalClient API, rather than the
+    // plain Client API, which is used by ClientLibraryTestBase.  This is
+    // because the plain Client API results does more memcpys to/from Literals,
+    // and that's slow given that we're touching a lot of data here.
+    //
+    // Copy debug options from ClientLibraryTestBase.  In particular, we're
+    // interested in disabling constant folding.
+    ExecutableBuildOptions build_opts;
+    *build_opts.mutable_debug_options() = *mutable_debug_options();
+    TF_ASSIGN_OR_RETURN(
+        auto executable,
+        client_->Compile(comp, {&input_literal.shape()}, build_opts));
+
+    TF_ASSIGN_OR_RETURN(
+        ScopedShapedBuffer input_data,
+        client_->LiteralToShapedBuffer(input_literal, /*device_ordinal=*/0));
+
+    ExecutableRunOptions run_opts;
+    run_opts.set_allocator(client_->backend().memory_allocator());
+    run_opts.set_intra_op_thread_pool(
+        client_->backend().eigen_intra_op_thread_pool_device());
+    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                        executable->Run({&input_data}, run_opts));
+
+    TF_ASSIGN_OR_RETURN(Literal result_literal,
+                        client_->ShapedBufferToLiteral(result));
+    return std::move(result_literal);
+  }
+
+  template <typename T>
+  bool IsClose(T expected, T actual) {
+    float expected_f32 = static_cast<float>(expected);
+    float actual_f32 = static_cast<float>(actual);
+    float abs_err = std::abs(expected_f32 - actual_f32);
+    float rel_err = abs_err / std::abs(expected_f32);
+    if (strict_signed_zeros_ && actual == T{0} && expected == T{0}) {
+      // Check sign of zero.
+      return std::signbit(actual_f32) == std::signbit(expected_f32);
+    }
+    return abs_err < *abs_err_ || rel_err < *rel_err_ ||
+           (std::isnan(expected_f32) && std::isnan(actual_f32)) ||
+           (std::isinf(expected_f32) && std::isinf(actual_f32) &&
+            (expected_f32 > 0) == (actual_f32 > 0));
+  }
+
+  template <typename T>
+  void ExpectNear(const Literal& input_literal, const Literal& result_literal,
+                  float (*evaluate_op)(float)) {
+    // We essentially reimplement LiteralTestUtil::Near here because
+    //  a) this streamlined implementation is much faster, and
+    //  b) we can print out better error messages (namely, we can print out
+    //     which floating-point value input failed, while LiteralTestUtil::Near
+    //     can only print out the input index that failed).
+    //  c) we need special handling of certain inputs.  For example, we say that
+    //     a denormal input has multiple correct outputs (namely, f(x) and f(0))
+    //     and just needs to be close to one of them.
+    absl::Span<const T> input_arr = input_literal.data<T>();
+    absl::Span<const T> result_arr = result_literal.data<T>();
+    ASSERT_EQ(result_arr.size(), input_arr.size());
+    int64 mismatches = 0;
+    // Hoisting these out of the loop is a nice speedup on shards that have many
+    // denormals.
+    const T expected_at_pos_zero = static_cast<T>(evaluate_op(0));
+    const T expected_at_neg_zero = static_cast<T>(evaluate_op(-0.0));
+    for (int64 i = 0; i < input_arr.size(); ++i) {
+      T input = input_arr[i];
+      float input_f32 = static_cast<float>(input);
+      T actual = result_arr[i];
+      T expected = static_cast<T>(evaluate_op(input_f32));
+
+      if (IsClose(expected, actual)) {
+        continue;
+      }
+
+      // Easy case: If `input` is not denormal and !IsClose(expected, actual),
+      // print an error.
+      //
+      // (This doesn't correctly detect f16 and bfloat16 denormals!  This seems
+      // to be OK for now, but at some point we may need to implement fpclassify
+      // for half and bfloat.)
+      if (std::fpclassify(input_f32) != FP_SUBNORMAL) {
+        PrintMismatch(&mismatches, [&] {
+          return absl::StrFormat("Mismatch on %s. Expected %s, but got %s.",
+                                 StringifyNum(input), StringifyNum(expected),
+                                 StringifyNum(actual));
+        });
+        continue;
+      }
+
+      // Otherwise, `input` is denormal.  For denormal inputs, we accept answers
+      // that are close to any of:
+      //
+      //   - evaluate_op(input)
+      //   - evaluate_op(+/-0), where the sign of 0 equal to the sign of
+      //     `input`,
+      //   - if relaxed_denormal_signs_, evaluate_op(-/+0), where the sign of
+      //     0 is the opposite of `input`.
+      T sign_preserving_ftz_expected =
+          std::signbit(input_f32) ? expected_at_neg_zero : expected_at_pos_zero;
+      T sign_nonpreserving_ftz_expected =
+          std::signbit(input_f32) ? expected_at_pos_zero : expected_at_neg_zero;
+      if (IsClose(sign_preserving_ftz_expected, actual) ||
+          (relaxed_denormal_signs_ &&
+           IsClose(sign_nonpreserving_ftz_expected, actual))) {
+        continue;
+      }
+
+      if (relaxed_denormal_signs_) {
+        PrintMismatch(&mismatches, [&] {
+          return absl::StrFormat(
+              "Mismatch on denormal value %s.  Expected one of:\n"
+              "  %10s (evaluated at full-precision value)\n"
+              "  %10s (evaluated after flushing to sign-preserving zero)\n"
+              "  %10s (evaluated after flushing to non-sign-preserving "
+              "zero)\n"
+              "but got %s.",
+              StringifyNum(input), StringifyNum(expected),
+              StringifyNum(sign_preserving_ftz_expected),
+              StringifyNum(sign_nonpreserving_ftz_expected),
+              StringifyNum(actual));
+        });
+      } else {
+        PrintMismatch(&mismatches, [&] {
+          return absl::StrFormat(
+              "Mismatch on denormal value %s.  Expected one of:\n"
+              "  %10s (evaluated at full-precision value)\n"
+              "  %10s (evaluated after flushing to sign-preserving zero)\n"
+              "but got %s.",
+              StringifyNum(input), StringifyNum(expected),
+              StringifyNum(sign_preserving_ftz_expected), StringifyNum(actual));
+        });
+      }
+    }
+    EXPECT_EQ(mismatches, 0);
+  }
+
+  template <typename ErrorGenerator>
+  void PrintMismatch(int64* mismatches, const ErrorGenerator& err_generator) {
+    // We send a few mismatches to gunit so they show up nicely in test logs.
+    // Then we send more to LOG(ERROR).  The remainder we squelch unless we're
+    // at vlog level 2.
+    constexpr int64 kMaxMismatchesLoggedToGunit = 10;
+    constexpr int64 kMaxMismatchesLoggedToErr = 1000;
+
+    (*mismatches)++;
+    if (*mismatches < kMaxMismatchesLoggedToGunit) {
+      FAIL() << err_generator();
+    } else if (*mismatches < kMaxMismatchesLoggedToErr || VLOG_IS_ON(2)) {
+      LOG(ERROR) << err_generator();
+    } else if (*mismatches == kMaxMismatchesLoggedToErr) {
+      LOG(ERROR) << "Not printing any more mismatches; pass "
+                    "--vmodule=exhaustive_f32__op_test=2 to see "
+                    "all of them.";
+    }
+  }
+
+  // The following members are set during construction so testcases can read
+  // these values and use them e.g. to influence the values given to the mutable
+  // members below.
+
+  // The primitive type under test.
+  const PrimitiveType ty_;
+
+  // The platform under test.
+  const string platform_;
+
+  // Tests can set the following variables for control over execution.  This is
+  // safe because each XLA_TEST_P instantiates a new instance of this class.
+
+  // Testing will ignore the given range (encoded as bitwise representations of
+  // the type under test zero-extended to int64).
+  int64 known_incorrect_begin_ = 0;
+  int64 known_incorrect_end_ = 0;
+
+  // If unset, reasonable defaults will be used depending on the type under
+  // test.
+  absl::optional<float> abs_err_;
+  absl::optional<float> rel_err_;
+
+  // If true, will consider -0 not near to +0 and vice versa.  Note that
+  // +epsilon may still be considered close to -0, depending on the error spec;
+  // this only covers the case when both `expected` and `actual` are equal to 0.
+  bool strict_signed_zeros_ = false;
+
+  // If true, allows denormals to be flushed to non-sign-preserving 0.
+  //
+  // For example, normally we'd expect sqrt(-denormal) to be either nan (sqrt of
+  // a negative number) or -inf (flush the denormal to sign-perserving zero,
+  // then sqrt(-0)).  But with this as true, we'll also accept 0 (sqrt(0)).
+  //
+  // XLA:GPU preserves denormal signs, but other backends don't.
+  bool relaxed_denormal_signs_ = platform_ != "CUDA";
+};
+
+XLA_TEST_P(ExhaustiveOpTest, Log) {
+  if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) {
+    abs_err_ = 0.001;
+    rel_err_ = 0.001;
+  }
+
+  Run(Log, std::log);
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Log1p) {
+  if (platform_ != "Host" && platform_ != "CUDA" && ty_ == F32) {
+    abs_err_ = 0.001;
+    rel_err_ = 0.001;
+  }
+
+  Run(Log1p, std::log1p);
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Exp) {
+  if (platform_ == "Host" && ty_ == F32) {
+    // TODO(b/73142289): The vectorized Exp implementation gives results outside
+    // our error spec in this range.
+    known_incorrect_begin_ = 1107296256 + 11583654;
+    known_incorrect_end_ = 1107296256 + 11629080;
+  } else if (platform_ == "Host" && ty_ == BF16) {
+    // TODO(jlebar): Is this a rounding error?  Why doesn't it occur on XLA:GPU?
+    //
+    // Mismatch on 88.5 (0x42b1).
+    //   Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80).
+    known_incorrect_begin_ = 0x42b1;
+    known_incorrect_end_ = 0x42b2;
+  }
+
+  Run(Exp, std::exp);
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Expm1) {
+  // Expm1 has the same erroneous behavior on CPU as Exp.
+  if (platform_ == "Host" && ty_ == F32) {
+    // TODO(b/73142289): The vectorized Exp implementation gives results outside
+    // our error spec in this range.
+    known_incorrect_begin_ = 1107296256 + 11583654;
+    known_incorrect_end_ = 1107296256 + 11629080;
+  } else if (platform_ == "Host" && ty_ == BF16) {
+    // TODO(jlebar): Is this a rounding error?  Why doesn't it occur on XLA:GPU?
+    //
+    // Mismatch on 88.5 (0x42b1).
+    //   Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80).
+    known_incorrect_begin_ = 0x42b1;
+    known_incorrect_end_ = 0x42b2;
+  }
+
+  Run(Expm1, std::expm1);
+}
+
+// It feels a little overkill to exhaustively test sqrt and pow(x, 0.5), but
+// this *did* find a bug, namely that some backends were assuming sqrt(x) ==
+// pow(x, 0.5), but this is not true for x == -inf.
+XLA_TEST_P(ExhaustiveOpTest, PowOneHalf) {
+  Run([](XlaOp x) { return Pow(x, ScalarLike(x, 0.5)); },
+      +[](float x) { return std::pow(x, 0.5f); });
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Rsqrt) {
+  Run(
+      Rsqrt, +[](float x) { return 1 / std::sqrt(x); });
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Sqrt) {
+  if (platform_ == "Host" || platform_ == "CUDA") {
+    strict_signed_zeros_ = true;
+  }
+
+  Run(Sqrt, std::sqrt);
+}
+
+// TODO(jlebar): Add remaining trig functions.  Don't forget Atan2!
+// TODO(jlebar): Test trig functions over complex inputs.
+XLA_TEST_P(ExhaustiveOpTest, Tanh) { Run(Tanh, std::tanh); }
+
+XLA_TEST_P(ExhaustiveOpTest, Erf) { Run(Erf, std::erf); }
+XLA_TEST_P(ExhaustiveOpTest, Erfc) { Run(Erfc, std::erfc); }
+XLA_TEST_P(ExhaustiveOpTest, ErfInv) { Run(ErfInv, HostErfInv); }
+XLA_TEST_P(ExhaustiveOpTest, Digamma) {
+  if (platform_ != "Host" && platform_ != "CUDA") {
+    // TODO(b/123956399): This is a fairly high error, significantly higher than
+    // we see on CPU/GPU.
+    rel_err_ = 0.01;
+    abs_err_ = 0.01;
+  }
+
+  if (platform_ == "CUDA") {
+    // On GPU we get a wrong answer for the denormal inputs +/-2.93873588e-39
+    // (0x00200000 and 0x80200000).  These should return -/+inf (at least
+    // according to our reference implementation!) but XLA:GPU returns
+    // -/+3.40282326e+38 (0xff7ffffe and 0x7f7ffffe).
+    //
+    // I deem this an acceptable result, as XLA:GPU flushes denormals, and as
+    // the results we get here are very close to MAX_FLOAT.  We just hardcode
+    // these results, as this is better than ignoring these inputs altogether.
+    auto host_digamma_with_gpu_ftz_errors = +[](float x) {
+      if (absl::bit_cast<uint32>(x) == 0x00200000 ||
+          absl::bit_cast<uint32>(x) == 0x80200000) {
+        return std::copysign(std::numeric_limits<float>::max(), -x);
+      }
+      return HostDigamma(x);
+    };
+    Run(Digamma, host_digamma_with_gpu_ftz_errors);
+  } else {
+    Run(Digamma, HostDigamma);
+  }
+}
+XLA_TEST_P(ExhaustiveOpTest, Lgamma) {
+  // Our implementation gets within 0.0001 rel error except for ~20 denormal
+  // inputs on GPU.  Anyway 0.001 rel error should be good enough for lgamma.
+  if (platform_ == "CUDA" && (ty_ == F32 || ty_ == F16)) {
+    rel_err_ = 0.001;
+  }
+  if (platform_ != "Host" && platform_ != "CUDA") {
+    // TODO(b/123956399): This is a fairly high error, significantly higher than
+    // we see on CPU/GPU.
+    rel_err_ = 0.01;
+    abs_err_ = 0.01;
+
+    // Overflows for to inf for input 4.08500343e+36 (0x7c44af8e).
+    if (ty_ == F32) {
+      known_incorrect_begin_ = 0x7c44af8e;
+      known_incorrect_end_ = 0x7c44af8e + 1;
+    }
+  }
+  Run(Lgamma, std::lgamma);
+}
+
+XLA_TEST_P(ExhaustiveOpTest, Round) { Run(Round, std::round); }
+
+std::vector<std::pair<int64, int64>> CreateExhaustiveF32Ranges() {
+  // We break up the 2^32-element space into small'ish chunks to keep peak
+  // memory usage low.
+  std::vector<std::pair<int64, int64>> result;
+  const int64 step = 1 << 25;
+  for (int64 i = 0; i < (1l << 32); i += step) {
+    result.push_back({i, i + step});
+  }
+  return result;
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    F32, ExhaustiveOpTest,
+    ::testing::Combine(::testing::Values(F32),
+                       ::testing::ValuesIn(CreateExhaustiveF32Ranges())));
+
+#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16)
+INSTANTIATE_TEST_SUITE_P(
+    F16, ExhaustiveOpTest,
+    ::testing::Combine(::testing::Values(F16),
+                       ::testing::Values(std::make_pair(0, 1 << 16))));
+#endif
+
+#if defined(XLA_BACKEND_SUPPORTS_BFLOAT16)
+INSTANTIATE_TEST_SUITE_P(
+    BF16, ExhaustiveOpTest,
+    ::testing::Combine(::testing::Values(BF16),
+                       ::testing::Values(std::make_pair(0, 1 << 16))));
+#endif
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc
index 1b0bebe2d03a9a153cd0c80329ed0c49c91333a3..5d91326aad0671b484341db3aa6d618aa646e8c3 100644
--- a/tensorflow/compiler/xla/tests/filecheck.cc
+++ b/tensorflow/compiler/xla/tests/filecheck.cc
@@ -47,8 +47,9 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
   }
 
   tensorflow::SubProcess file_check_process;
-  file_check_process.SetProgram(file_check_path,
-                                {file_check_path, "-v", pattern_path});
+  file_check_process.SetProgram(
+      file_check_path,
+      {file_check_path, "-v", "-dump-input=always", pattern_path});
   file_check_process.SetChannelAction(tensorflow::CHAN_STDIN,
                                       tensorflow::ACTION_PIPE);
   file_check_process.SetChannelAction(tensorflow::CHAN_STDERR,
diff --git a/tensorflow/compiler/xla/tests/fmax_fmin_test.cc b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7423ac0bcdb0bc305ee384fb98bd17413404ecef
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class FmaxSimpleTest : public ClientLibraryTestBase {};
+
+TEST_F(FmaxSimpleTest, FmaxTenValues) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto x = ConstantR1<float>(
+      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
+  auto y = ConstantR1<float>(
+      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
+  Max(x, y);
+
+  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
+                                 5.0,  6.0, 7.0, 8.0, 9.0};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FmaxEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Max(param0, param1);
+  std::vector<float> expected = {INFINITY, INFINITY, NAN,      NAN, INFINITY,
+                                 -5,       NAN,      INFINITY, 8,   NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FminEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Min(param0, param1);
+  std::vector<float> expected = {INFINITY,  -INFINITY, NAN, NAN,       -4,
+                                 -INFINITY, NAN,       7,   -INFINITY, NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
deleted file mode 100644
index c5bbbe778df15d63a2586bd6291a7a33fc82aa52..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace {
-
-class FmaxSimpleTest : public ClientLibraryTestBase {};
-
-TEST_F(FmaxSimpleTest, FmaxTenValues) {
-  XlaBuilder builder(TestName());
-  auto x = ConstantR1<float>(
-      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
-  auto y = ConstantR1<float>(
-      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
-  Max(x, y);
-
-  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
-                                 5.0,  6.0, 7.0, 8.0, 9.0};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 2178c9b3f3d39ac034c59585c6836d2bc59162c1..c410f1f6d8fbcd3be228c23614d1d69888c781b8 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -63,7 +63,11 @@ const float test_float_vals[3][test_width][test_height] = {
 class FusionTest : public HloTestBase {
  protected:
   template <typename T, int Arity>
-  void TestElementwise2D(HloOpcode opcode) {
+  void TestElementwise2D(
+      HloOpcode opcode,
+      absl::optional<ComparisonDirection> direction = absl::nullopt) {
+    // Create a variable for comparisons since they require the direction.
+    bool is_compare = std::is_same<T, bool>::value;
     Array2D<float> operand_data[Arity];
     for (int i = 0; i < Arity; ++i) {
       new (&operand_data[i]) Array2D<float>(test_width, test_height);
@@ -76,12 +80,16 @@ class FusionTest : public HloTestBase {
           xs[k] = test_float_vals[k][i][j];
           operand_data[k](i, j) = xs[k];
         }
-        answer_data(i, j) = ComputeElementwiseAnswer<T>(opcode, xs);
+        if (is_compare) {
+          answer_data(i, j) = ComputeElementwiseAnswerCompare(*direction, xs);
+        } else {
+          answer_data(i, j) = ComputeElementwiseAnswerFloat(opcode, xs);
+        }
       }
     }
 
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewUnverifiedModule();
+    auto hlo_module = CreateNewVerifiedModule();
 
     auto prim_type = primitive_util::NativeToPrimitiveType<T>();
 
@@ -98,8 +106,13 @@ class FusionTest : public HloTestBase {
         root_hlo = HloInstruction::CreateUnary(answer_shape, opcode, hlos[1]);
         break;
       case 2:
-        root_hlo = HloInstruction::CreateBinary(answer_shape, opcode, hlos[1],
-                                                hlos[2]);
+        if (is_compare) {
+          root_hlo = HloInstruction::CreateCompare(answer_shape, hlos[1],
+                                                   hlos[2], *direction);
+        } else {
+          root_hlo = HloInstruction::CreateBinary(answer_shape, opcode, hlos[1],
+                                                  hlos[2]);
+        }
         break;
       case 3:
         root_hlo = HloInstruction::CreateTernary(answer_shape, opcode, hlos[1],
@@ -124,13 +137,19 @@ class FusionTest : public HloTestBase {
   }
 
  private:
-  template <typename T>
-  T ComputeElementwiseAnswer(HloOpcode opcode, absl::Span<const float> xs);
+  float ComputeElementwiseAnswerFloat(HloOpcode opcode,
+                                      absl::Span<const float> xs);
+  bool ComputeElementwiseAnswerCompare(ComparisonDirection direction,
+                                       absl::Span<const float> xs);
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
+    debug_options.add_xla_disable_hlo_passes("layout-assignment");
+    return debug_options;
+  }
 };
 
-template <>
-float FusionTest::ComputeElementwiseAnswer<float>(HloOpcode opcode,
-                                                  absl::Span<const float> xs) {
+float FusionTest::ComputeElementwiseAnswerFloat(HloOpcode opcode,
+                                                absl::Span<const float> xs) {
   switch (opcode) {
     case HloOpcode::kAdd:
       return xs[0] + xs[1];
@@ -153,24 +172,21 @@ float FusionTest::ComputeElementwiseAnswer<float>(HloOpcode opcode,
   }
 }
 
-template <>
-bool FusionTest::ComputeElementwiseAnswer<bool>(HloOpcode opcode,
-                                                absl::Span<const float> xs) {
-  switch (opcode) {
-    case HloOpcode::kEq:
+bool FusionTest::ComputeElementwiseAnswerCompare(ComparisonDirection direction,
+                                                 absl::Span<const float> xs) {
+  switch (direction) {
+    case ComparisonDirection::kEq:
       return xs[0] == xs[1];
-    case HloOpcode::kNe:
+    case ComparisonDirection::kNe:
       return xs[0] != xs[1];
-    case HloOpcode::kGt:
+    case ComparisonDirection::kGt:
       return xs[0] > xs[1];
-    case HloOpcode::kLt:
+    case ComparisonDirection::kLt:
       return xs[0] < xs[1];
-    case HloOpcode::kGe:
+    case ComparisonDirection::kGe:
       return xs[0] >= xs[1];
-    case HloOpcode::kLe:
+    case ComparisonDirection::kLe:
       return xs[0] <= xs[1];
-    default:
-      LOG(FATAL) << "No comparatory opcode: " << opcode;
   }
 }
 
@@ -183,7 +199,7 @@ XLA_TEST_F(FusionTest, Test) {
   //                     (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}),
   //              {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}}
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -231,7 +247,7 @@ XLA_TEST_F(FusionTest, Parameter) {
   // Build a computation and fuse part of it so the fusion instruction has an
   // operand parameter.
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}})));
   auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
@@ -266,7 +282,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
       ShapeUtil::MakeShapeWithLayout(F32, {rand_dim0_size, dim1_size}, {1, 0});
   // Build simple fusion computation: y = x^2 (elementwise).
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
 
   auto two = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
@@ -290,7 +306,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
 
 XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
   auto const_array = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -314,7 +330,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
 
 XLA_TEST_F(FusionTest, ReshapeToScalar) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto single_element_array = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR2<int32>({{5}})));
   auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -329,7 +345,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) {
 
 XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -344,7 +360,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
 
 XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}})));
   auto reshape1 = builder.AddInstruction(
@@ -359,7 +375,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
 
 XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR3<int32>({{{7}}})));
   auto reshape1 = builder.AddInstruction(
@@ -374,7 +390,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
 
 XLA_TEST_F(FusionTest, Reshape__1by1by1) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -389,7 +405,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) {
 
 XLA_TEST_F(FusionTest, Reshape__) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(
@@ -404,7 +420,7 @@ XLA_TEST_F(FusionTest, Reshape__) {
 
 XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(
@@ -419,7 +435,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
 
 XLA_TEST_F(FusionTest, Transpose_2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -434,7 +450,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
 
 XLA_TEST_F(FusionTest, Transpose_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -449,7 +465,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
 
 XLA_TEST_F(FusionTest, Reverse) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -465,7 +481,7 @@ XLA_TEST_F(FusionTest, Reverse) {
 
 XLA_TEST_F(FusionTest, ReverseNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -483,7 +499,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) {
 
 XLA_TEST_F(FusionTest, BroadcastNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
   auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -501,7 +517,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) {
 
 XLA_TEST_F(FusionTest, SliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -519,7 +535,7 @@ XLA_TEST_F(FusionTest, SliceNegate) {
 
 XLA_TEST_F(FusionTest, DynamicSliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto const1 = builder.AddInstruction(
@@ -541,7 +557,7 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
 
 XLA_TEST_F(FusionTest, ReshapeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto reshape1 = builder.AddInstruction(
@@ -559,7 +575,7 @@ XLA_TEST_F(FusionTest, ReshapeNegate) {
 
 XLA_TEST_F(FusionTest, TransposeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}})));
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -587,7 +603,7 @@ std::unique_ptr<HloComputation> MakeReduceTestComputation() {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -606,8 +622,8 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
                              ExecuteAndTransfer(std::move(hlo_module), {})));
 }
 
-XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
-  auto hlo_module = CreateNewUnverifiedModule();
+XLA_TEST_F(FusionTest, ReduceImplicitBroadcast) {
+  auto hlo_module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -630,7 +646,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
   auto const1 = builder.AddInstruction(
@@ -682,7 +698,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
 // into a fusion, it should remain shared, rather than being duplicated
 // within the fusion.
 XLA_TEST_F(FusionTest, SharedConstant) {
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
@@ -740,64 +756,34 @@ XLA_TEST_F(FusionTest, Maximum2D) {
   TestElementwise2D<float, 2>(HloOpcode::kMaximum);
 }
 
-XLA_TEST_F(FusionTest, Equal2D) { TestElementwise2D<bool, 2>(HloOpcode::kEq); }
+XLA_TEST_F(FusionTest, Equal2D) {
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kEq);
+}
 
 XLA_TEST_F(FusionTest, Inequal2D) {
-  TestElementwise2D<bool, 2>(HloOpcode::kNe);
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kNe);
 }
 
 XLA_TEST_F(FusionTest, Greater2D) {
-  TestElementwise2D<bool, 2>(HloOpcode::kGt);
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kGt);
 }
 
-XLA_TEST_F(FusionTest, Lesser2D) { TestElementwise2D<bool, 2>(HloOpcode::kLt); }
+XLA_TEST_F(FusionTest, Lesser2D) {
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kLt);
+}
 
 XLA_TEST_F(FusionTest, GreaterOrEqual2D) {
-  TestElementwise2D<bool, 2>(HloOpcode::kGe);
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kGe);
 }
 
 XLA_TEST_F(FusionTest, LesserOrEqual2D) {
-  TestElementwise2D<bool, 2>(HloOpcode::kLe);
+  TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kLe);
 }
 
 XLA_TEST_F(FusionTest, Clamp2D) {
   TestElementwise2D<float, 3>(HloOpcode::kClamp);
 }
 
-// TODO(b/117156505): Remove this test when the bug is fixed and the CPU backend
-// should not generate layout changing elementwise operations.
-#ifdef XLA_TEST_BACKEND_CPU
-XLA_TEST_F(FusionTest, LayoutChangingElementWiseOp) {
-  const string hlo_text = R"(
-HloModule Cluster
-
-fusion_c {
-  fusion.arg = f32[2,2]{1,0} parameter(0)
-  bitcast.0 = f32[2,2,1]{2,1,0} bitcast(fusion.arg)
-  tanh.0 = f32[2,2,1]{0,2,1} tanh(bitcast.0)
-  ROOT bitcast.2 = f32[2,2,1]{1,2,0} bitcast(tanh.0)
-}
-
-ENTRY main {
-  arg = f32[2,2]{1,0} parameter(0)
-  ROOT fusion = f32[2,2,1]{1,2,0} fusion(arg), kind=kLoop, calls=fusion_c
-}
-)";
-
-  Literal operand = LiteralUtil::CreateR2<float>({{0., 0.}, {1., 0.}});
-  HloModuleConfig config;
-  config.set_debug_options(GetDebugOptionsForTest());
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text, config));
-  TF_ASSERT_OK_AND_ASSIGN(Literal result,
-                          test_runner_.Execute(std::move(module), {&operand},
-                                               /*run_hlo_passes=*/false));
-  EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR3<float>({{{0.}, {0.76159415595}}, {{0.}, {0.}}}),
-      result));
-}
-#endif
-
 class FusionClientLibraryTest : public ClientLibraryTestBase {};
 
 XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index d65b67a535d43553a3a94f76482ad4618f9b8aab..16a1371ec8da5377bb64a202988f118852aa12e2 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -598,6 +598,26 @@ ENTRY main {
   RunTest(hlo_text, &operand, &start_indices);
 }
 
+XLA_TEST_F(GatherOperationTest, GatherFromScalar) {
+  const string hlo_text = R"(
+HloModule GatherFromScalar
+
+ENTRY main {
+  operand = f32[] parameter(0)
+  indices = s32[0]{0} parameter(1)
+  ROOT gather = f32[] gather(operand, indices),
+      offset_dims={},
+      collapsed_slice_dims={},
+      start_index_map={},
+      index_vector_dim=0,
+      slice_sizes={}
+}
+)";
+  Literal operand = LiteralUtil::CreateR0<float>(1);
+  Literal start_indices = LiteralUtil::CreateR1<int32>({});
+  RunTest(hlo_text, &operand, &start_indices);
+}
+
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
 
 // Disabled on interpreter since ExectuteAsyncOnStream is not supported.
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
index 1115e50fe3120b7dbd891f07dedcacefa5ecf3ea..97b10083029c944d07422544277d8b52a2cea24b 100644
--- a/tensorflow/compiler/xla/tests/half_test.cc
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -73,7 +73,7 @@ half sign_imp(half value) {
 }
 
 half round_imp(half value) {
-  return half(round(static_cast<float>(std::move(value))));
+  return half(std::round(static_cast<float>(std::move(value))));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 66f72ba8d20b8ef1f436da4425b2bb6518ee9a94..a12fa04c01b2654c0970faff643d64c608dadf0d 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -205,6 +205,32 @@ Literal HloTestBase::ExecuteAndTransfer(std::unique_ptr<HloModule> module,
   return test_runner_.Execute(std::move(module), arguments).ValueOrDie();
 }
 
+StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+    int64 num_replicas, bool use_threads) {
+  HloRunner::ReplicatedExecuteOptions options;
+  options.num_replicas = num_replicas;
+  for (auto argument : arguments) {
+    options.arguments.push_back(argument);
+  }
+  return test_runner_.ExecuteReplicated(std::move(module), options,
+                                        use_threads);
+}
+
+StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+    int64 num_replicas, DeviceAssignment* device_assignment,
+    bool run_hlo_passes, bool use_threads) {
+  HloRunner::ReplicatedExecuteOptions options;
+  options.num_replicas = num_replicas;
+  options.run_hlo_passes = run_hlo_passes;
+  for (auto argument : arguments) {
+    options.arguments.push_back(argument);
+  }
+  return test_runner_.ExecuteReplicated(std::move(module), options,
+                                        device_assignment, use_threads);
+}
+
 StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
     const HloModule& test_module,
     const std::function<void(HloModule*)>& reference_preprocessor) {
@@ -313,7 +339,10 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
                        reference_preprocessor);
 }
 
-::testing::AssertionResult HloTestBase::Run(string_view hlo_string) {
+::testing::AssertionResult HloTestBase::Run(string_view hlo_string,
+                                            bool run_hlo_passes,
+                                            ExecutionProfile* profile,
+                                            string backend_config) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
@@ -321,19 +350,108 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
            << "Error while parsing HLO text format: "
            << module_or_status.status().ToString();
   }
+
+  std::unique_ptr<HloModule> module = std::move(module_or_status.ValueOrDie());
   const auto& fake_arguments =
-      MakeFakeArguments(module_or_status.ValueOrDie().get())
-          .ConsumeValueOrDie();
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
   std::vector<Literal*> fake_argument_ptrs;
   absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const Literal& literal) { return const_cast<Literal*>(&literal); });
-  return test_runner_
-                 .Execute(std::move(module_or_status.ValueOrDie()),
-                          fake_argument_ptrs, /*run_hlo_passes=*/true)
-                 .ok()
+
+  if (profile != nullptr) {
+    // We have to enable HLO profiling since otherwise currently the
+    // ExecutionProfile is not correct.
+    //
+    // TODO(b/119432044): Fix collection of the ExecutionProfile
+    // so that this is not necessary.
+    HloModuleConfig config = module->config();
+    DebugOptions debug_options = config.debug_options();
+    debug_options.set_xla_hlo_profile(true);
+    config.set_debug_options(debug_options);
+    module->set_config(config);
+  }
+
+  if (!backend_config.empty()) {
+    // Set backend configuration if it is given.
+    HloInstruction* instruction =
+        module->entry_computation()->root_instruction();
+    instruction->set_raw_backend_config_string(backend_config);
+  }
+
+  // return ::testing::AssertionSuccess();
+  auto output = test_runner_.Execute(std::move(module), fake_argument_ptrs,
+                                     /*run_hlo_passes=*/run_hlo_passes,
+                                     /*profile=*/profile);
+
+  return output.ok()
              ? ::testing::AssertionSuccess()
-             : ::testing::AssertionFailure();
+             : ::testing::AssertionFailure() << output.status().error_message();
+}
+
+::testing::AssertionResult HloTestBase::RunMultipleTimes(
+    string_view hlo_string, bool run_hlo_passes,
+    std::vector<ExecutionProfile>* profiles, string backend_config) {
+  int n = profiles->size();
+  std::vector<std::vector<Literal*>> fake_argument_ptrs(n);
+  std::vector<std::vector<Literal>> fake_arguments(n);
+  std::vector<std::unique_ptr<Executable>> executables(n);
+
+  for (int i = 0; i < n; ++i) {
+    auto module_or_status =
+        HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+    if (!module_or_status.ok()) {
+      return ::testing::AssertionFailure()
+             << "Error while parsing HLO text format: "
+             << module_or_status.status().ToString();
+    }
+    std::unique_ptr<HloModule> module =
+        std::move(module_or_status.ValueOrDie());
+
+    fake_arguments[i] = MakeFakeArguments(module.get()).ConsumeValueOrDie();
+    absl::c_transform(
+        fake_arguments[i], std::back_inserter(fake_argument_ptrs[i]),
+        [](const Literal& literal) { return const_cast<Literal*>(&literal); });
+
+    if (profiles != nullptr) {
+      // We have to enable HLO profiling since otherwise currently the
+      // ExecutionProfile is not correct.
+      //
+      // TODO(b/119432044): Fix collection of the ExecutionProfile
+      // so that this is not necessary.
+      HloModuleConfig config = module->config();
+      DebugOptions debug_options = config.debug_options();
+      debug_options.set_xla_hlo_profile(true);
+      config.set_debug_options(debug_options);
+      module->set_config(config);
+    }
+
+    if (!backend_config.empty()) {
+      // Set backend configuration if it is given.
+      HloInstruction* instruction =
+          module->entry_computation()->root_instruction();
+      instruction->set_raw_backend_config_string(backend_config);
+    }
+
+    auto executable =
+        test_runner_.CreateExecutable(std::move(module), run_hlo_passes);
+    if (!executable.ok()) {
+      return ::testing::AssertionFailure()
+             << executable.status().error_message();
+    }
+    executables[i] = std::move(executable.ValueOrDie());
+  }
+
+  for (int i = 0; i < n; ++i) {
+    auto output =
+        test_runner_.Execute(std::move(executables[i]), fake_argument_ptrs[i],
+                             /*profile=*/&((*profiles)[i]));
+    if (!output.ok()) {
+      return ::testing::AssertionFailure() << output.status().error_message();
+    }
+  }
+
+  return ::testing::AssertionSuccess();
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 69a4f96288c7285010e9adbdc33f1b394f58d8d2..6c6fe34d1cc67b83efc8f9ea9b8e05e56d58fb8c 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -173,6 +173,21 @@ class HloTestBase : public ::testing::Test {
   Literal ExecuteAndTransfer(std::unique_ptr<HloModule> module,
                              absl::Span<Literal* const> arguments);
 
+  // Executes the given module on multiple replicas.
+  //
+  // use_threads indicates whether this replicated computation will be executed
+  // with a thread-per-replica, vs using an implicitly async call such as
+  // Executable::ExecuteOnStreams.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+      int64 num_replicas, bool use_threads);
+
+  // Same as above, but uses specified device assignment.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+      int64 num_replicas, DeviceAssignment* device_assignment,
+      bool run_hlo_passes, bool use_threads);
+
   // Executes the given hlo module on two backends and compares results.
   //
   // 'arguments': the input of the hlo module.
@@ -221,8 +236,14 @@ class HloTestBase : public ::testing::Test {
       const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
-  ::testing::AssertionResult Run(const absl::string_view hlo_string)
-      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult Run(const absl::string_view hlo_string,
+                                 bool run_hlo_passes = true,
+                                 ExecutionProfile* profile = nullptr,
+                                 string backend_config = "") TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunMultipleTimes(
+      const absl::string_view hlo_string, bool run_hlo_passes,
+      std::vector<ExecutionProfile>* profiles,
+      string backend_config = "") TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareFromFile(
       const string& filename, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 554eb24d44168caa7d7252015e3d99f2d567df9b..a2fd6070731943f15c773265f428b16f520d02ee 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -86,7 +86,7 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const LiteralSlice& expected, const LiteralSlice& actual,
-    const ErrorSpec& error_spec, bool detailed_message) {
+    const ErrorSpec& error_spec, absl::optional<bool> detailed_message) {
   return StatusToAssertion(literal_comparison::Near(
       expected, actual, error_spec, detailed_message, &OnMiscompare));
 }
@@ -97,7 +97,8 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
   if (error.has_value()) {
     VLOG(1) << "Expects near";
     return StatusToAssertion(literal_comparison::Near(
-        expected, actual, *error, /*detailed_message=*/false, &OnMiscompare));
+        expected, actual, *error, /*detailed_message=*/absl::nullopt,
+        &OnMiscompare));
   }
   VLOG(1) << "Expects equal";
   return StatusToAssertion(literal_comparison::Equal(expected, actual));
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 43cca91f64b2c0fbfde5054a361cf0f95302c23d..d7cf9bed98a3eb7479b6deb6838dc388a0869360 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -93,7 +93,7 @@ class LiteralTestUtil {
   static ::testing::AssertionResult Near(
       const LiteralSlice& expected, const LiteralSlice& actual,
       const ErrorSpec& error_spec,
-      bool detailed_message = false) TF_MUST_USE_RESULT;
+      absl::optional<bool> detailed_message = absl::nullopt) TF_MUST_USE_RESULT;
 
   // Asserts the given literal are within the given error bound of the given
   // expected values. Only supported for floating point values.
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 60eb21aafd23a8d724d1f08d5c87098b7c3dcd6b..f1779c856bb4fdb1c4de453d9270a75dfcb8682b 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -69,12 +69,12 @@ int main(int argc, char** argv) {
   } else if (target_cpu == "arm") {
     triple_string = "aarch64-none-linux-gnu";
   } else if (target_cpu == "local") {
-    triple_string = xla::llvm_ir::AsString(llvm::sys::getDefaultTargetTriple());
+    triple_string = llvm::sys::getDefaultTargetTriple();
   } else {
     LOG(FATAL) << "unsupported TARGET_CPU: " << target_cpu;
   }
 
-  llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));
+  llvm::Triple triple(triple_string);
 
   xla::XlaComputation computation = builder.Build().ConsumeValueOrDie();
   xla::CompileOnlyClient::AotXlaComputationInstance instance{
diff --git a/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1513d89ba9c95b3097229b268d22832dee3e98cd
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class MultiDeviceAllReduceTest : public HloTestBase {};
+
+XLA_TEST_F(MultiDeviceAllReduceTest, TwoReplicasOneOperand) {
+  const char* module_str = R"(
+  HloModule test
+
+  add {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    add = f32[] add(x, y)
+  }
+
+  ENTRY test_computation {
+    p = f32[3] parameter(0)
+    ROOT crs = f32[3] all-reduce(p), to_apply=add
+  })";
+  auto config = GetModuleConfigForTest();
+  config.set_replica_count(2);
+  auto module = ParseHloString(module_str, config).ValueOrDie();
+  auto literal = LiteralUtil::CreateR1<float>({1, 2, 3});
+  auto expected = LiteralUtil::CreateR1<float>({2, 4, 6});
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> results,
+                          ExecuteReplicated(std::move(module), {&literal}, 2,
+                                            /*use_threads=*/true));
+  EXPECT_EQ(expected, results[0]);
+  EXPECT_EQ(expected, results[1]);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 3f5135438fc59bea98527b1be30ee49339edd455..73c9d7ed4b09c2f78dc7226b5d755eb017ea664b 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -208,9 +208,7 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
       ROOT fusion = (s32[]) fusion(x), kind=kLoop, calls=fused_computation
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::MakeTupleOwned(
       LiteralUtil::MakeTupleOwned(
           LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0<int32>(42)),
@@ -229,7 +227,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
     fused_computation {
       p = f32[4] parameter(0)
       multiply = f32[4] multiply(p, p)
-      less-than = pred[4] less-than(p, multiply)
+      less-than = pred[4] compare(p, multiply), direction=LT
       ROOT tuple = (pred[4], f32[4]) tuple(less-than, multiply)
     }
 
@@ -241,9 +239,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
       const = f32[4] constant({0, 0, 0, 0})
       ROOT select = f32[4] select(gte0, gte1, const)
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, -1.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0, 1.0}, result);
@@ -256,7 +252,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
     fused_computation {
       p = f32[] parameter(0)
       multiply = f32[] multiply(p, p)
-      less-than = pred[] less-than(p, multiply)
+      less-than = pred[] compare(p, multiply), direction=LT
       ROOT tuple = (pred[], f32[]) tuple(less-than, multiply)
     }
 
@@ -273,9 +269,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
       p1 = f32[3] parameter(0)
       ROOT map = f32[3] map(p1), to_apply=map_computation
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0}, result);
@@ -315,9 +309,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -346,9 +338,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -378,9 +368,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2]{0}, f32[2]{0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -410,9 +398,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -443,9 +429,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -478,9 +462,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -513,9 +495,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p, i, j), kind=kInput,
                                                               calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{0, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   auto init1 = LiteralUtil::CreateR0<float>(5);
@@ -549,9 +529,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) fusion(p),
                     kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR3<Eigen::half>(
       {{{Eigen::half(1), Eigen::half(2)}, {Eigen::half(3), Eigen::half(4)}},
        {{Eigen::half(5), Eigen::half(6)}, {Eigen::half(7), Eigen::half(8)}}});
diff --git a/tensorflow/compiler/xla/tests/plugin.bzl b/tensorflow/compiler/xla/tests/plugin.bzl
index 8a5d91363b619c6b214a96ad96e92742e3052541..107869fe59d43d0a9a3e2b14af2c09e4906d9f15 100644
--- a/tensorflow/compiler/xla/tests/plugin.bzl
+++ b/tensorflow/compiler/xla/tests/plugin.bzl
@@ -33,4 +33,3 @@
 # }
 
 plugins = {}
-
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index f80d29b9de440b11c36e8c9bc65d4a93353a6267..e2cf4c0be289b52d5cc581ea07752ed6e98da76f 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 18c99490a387923aaf68e06041cd11ed3b954aa5..6d2c2fc79cec82a0e37c772657d94fb7e2a27866 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 
 #include <stdlib.h>
 #include <algorithm>
+#include <cmath>
 #include <memory>
 #include <string>
 #include <utility>
@@ -455,7 +456,7 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) {
   for (int64 colno = 0; colno < cols; ++colno) {
     float column_sum = 0;
     for (int64 rowno = 0; rowno < rows; ++rowno) {
-      column_sum += log(input_data(rowno, colno));
+      column_sum += std::log(input_data(rowno, colno));
     }
     expected.push_back(column_sum);
   }
@@ -486,7 +487,7 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
   for (int64 colno = 0; colno < cols; ++colno) {
     float column_sum = 0;
     for (int64 rowno = 0; rowno < rows; ++rowno) {
-      column_sum += log(input_data(rowno, colno));
+      column_sum += std::log(input_data(rowno, colno));
     }
     expected.push_back(column_sum);
   }
@@ -533,7 +534,7 @@ XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
     for (int64 colno = 0; colno < cols / 2; ++colno) {
       float column_sum = 0;
       for (int64 rowno = 0; rowno < rows; ++rowno) {
-        column_sum += tanh(input_data(rowno, major, colno));
+        column_sum += std::tanh(input_data(rowno, major, colno));
       }
       expected.push_back(column_sum);
     }
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 16c67d94c76bcf8984a2b3e4cb092026a6924aeb..352b59f248b86234c46a9b89e17a68e347d7af6c 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -611,6 +611,12 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
     // values. (Technically, the requirement is that the iota length is
     // relatively prime to all of the dimensions involved in the reduce-window.)
     input.FillRepeatedIota(0, 137);
+    // Floating point sum reduction requires higher localized precision. We need
+    // the following normalization in order to enable testing of kAdd on large
+    // windows.
+    input.Each([&](absl::Span<const int64> /*indices*/, float* value) {
+      *value = *value / 10000000000.f;
+    });
     Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout(
         input, LayoutUtil::MakeLayout(param.layout));
     XlaOp parameter;
@@ -626,12 +632,6 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
         CreateConstantFromLiteral(LiteralUtil::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto reducer = param.reducer;
-    if (use_bfloat16()) {
-      // To avoid numerical issues, force the reducer to be kMax for bf16
-      // inputs.
-      reducer = kMax;
-    }
-
     auto computation = reducer == kAdd
                            ? CreateScalarAddComputation(FloatType(), &b)
                            : CreateScalarMaxComputation(FloatType(), &b);
@@ -697,15 +697,6 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
-    // With non-1x1 window.
-    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
-                           /*window_bounds=*/{2, 3, 1, 1},
-                           /*strides=*/{1, 1, 1, 1},
-                           /*pad_low=*/{0, 0, 0, 0},
-                           /*pad_high=*/{0, 0, 0, 0},
-                           /*layout=*/{3, 2, 1, 0},
-                           /*reducer=*/kAdd},
-
     // With max instead of add.
     R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
                            /*window_bounds=*/{2, 3, 1, 1},
@@ -778,15 +769,6 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
-    // With second minor dimension == 9.
-    R4ReduceWindowTestData{/*base_bounds=*/{2, 3, 9, 127},
-                           /*window_bounds=*/{1, 1, 1, 1},
-                           /*strides=*/{1, 1, 1, 1},
-                           /*pad_low=*/{0, 0, 0, 0},
-                           /*pad_high=*/{0, 0, 0, 0},
-                           /*layout=*/{3, 2, 1, 0},
-                           /*reducer=*/kAdd},
-
     // With minor dimension == 129.
     R4ReduceWindowTestData{/*base_bounds=*/{3, 2, 7, 129},
                            /*window_bounds=*/{1, 1, 1, 1},
@@ -814,7 +796,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
-    R4ReduceWindowTestData{/*base_bounds=*/{8, 256, 256, 3},
+    R4ReduceWindowTestData{/*base_bounds=*/{8, 100, 100, 3},
                            /*window_bounds=*/{1, 64, 64, 1},
                            /*strides=*/{1, 64, 64, 1},
                            /*pad_low=*/{0, 0, 0, 0},
@@ -828,6 +810,32 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
                            /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kMax},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 4, 5},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    // With 0321 layout.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 4, 5},
+                           /*strides=*/{1, 2, 3, 4},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{0, 3, 2, 1},
+                           /*reducer=*/kAdd},
+
+    // With 0123 layout.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 13, 17},
+                           /*window_bounds=*/{2, 3, 7, 9},
+                           /*strides=*/{1, 2, 5, 8},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{0, 1, 2, 3},
                            /*reducer=*/kAdd},
 };
 
@@ -866,58 +874,60 @@ const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
                            /*pad_high=*/{0, 0, 2, 0},
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kMax},
-};
-
-INSTANTIATE_TEST_CASE_P(
-    R4ReduceWindowLargeTestInstantiation, R4ReduceWindowLargeTest,
-    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
-                       ::testing::ValuesIn(use_bfloat16_params)),
-    R4ReduceWindowTestDataToString);
 
-class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {};
+    // Patterns generated by cumsum/cumprod.
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1021, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{1020, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
 
-// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
-XLA_TEST_P(R4ReduceWindowAnyDimsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
-  DoIt();
-}
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1, 1, 1021, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 1020, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
 
-const R4ReduceWindowTestData kR4ReduceWindowAnyDimsTestValues[] = {
-    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
-                           /*window_bounds=*/{2, 3, 4, 5},
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 16, 1021},
+                           /*window_bounds=*/{1, 1, 1, 1021},
                            /*strides=*/{1, 1, 1, 1},
-                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_low=*/{0, 0, 0, 1020},
                            /*pad_high=*/{0, 0, 0, 0},
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
-    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
-                           /*window_bounds=*/{2, 3, 1, 1},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1021, 1, 1, 1},
                            /*strides=*/{1, 1, 1, 1},
-                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_low=*/{1021, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
                            /*layout=*/{3, 2, 1, 0},
-                           /*reducer=*/kMax},
-    // With 0321 layout.
-    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
-                           /*window_bounds=*/{2, 3, 4, 5},
-                           /*strides=*/{1, 2, 3, 4},
-                           /*pad_low=*/{0, 0, 0, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 1021, 16},
+                           /*window_bounds=*/{1, 1, 1021, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 1021, 0},
                            /*pad_high=*/{0, 0, 0, 0},
-                           /*layout=*/{0, 3, 2, 1},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
-    // With 0123 layout.
-    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 23},
-                           /*window_bounds=*/{2, 3, 7, 9},
-                           /*strides=*/{1, 2, 5, 8},
-                           /*pad_low=*/{0, 0, 0, 0},
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 16, 1021},
+                           /*window_bounds=*/{1, 1, 1, 1021},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 1021},
                            /*pad_high=*/{0, 0, 0, 0},
-                           /*layout=*/{0, 1, 2, 3},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 };
 
 INSTANTIATE_TEST_CASE_P(
-    R4ReduceWindowAnyDimsTestInstantiation, R4ReduceWindowAnyDimsTest,
-    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowAnyDimsTestValues),
+    R4ReduceWindowLargeTestInstantiation, R4ReduceWindowLargeTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
                        ::testing::ValuesIn(use_bfloat16_params)),
     R4ReduceWindowTestDataToString);
 
@@ -1113,6 +1123,11 @@ struct R2ReduceWindowTestData {
     {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4},
      /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
      /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
+    // Regression test for b/72234705: bf16 lacks precision to store incremental
+    // results on very large windows. Using smaller window with minor dim 128.
+    {/*base_bounds=*/{8, 128}, /*window_bounds=*/{2, 128},
+     /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0},
+     /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd},
 };
 
 string R2ReduceWindowTestDataToString(
@@ -1191,27 +1206,6 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::ValuesIn(use_bfloat16_params)),
     R2ReduceWindowTestDataToString);
 
-class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
-
-// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
-XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
-           DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) {
-  DoIt();
-}
-
-const R2ReduceWindowTestData kR2FailingValuesCpuGpuBf16Test[] = {
-    {/*base_bounds=*/{8, 128}, /*window_bounds=*/{8, 128},
-     /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0},
-     /*layout=*/{1, 0},
-     /*reducer=*/Reducer::kAdd},
-};
-
-INSTANTIATE_TEST_CASE_P(
-    R2ReduceWindowFailingInstantiation, R2ReduceWindowFailingCpuGpuBf16Test,
-    ::testing::Combine(::testing::ValuesIn(kR2FailingValuesCpuGpuBf16Test),
-                       ::testing::ValuesIn(use_bfloat16_params)),
-    R2ReduceWindowTestDataToString);
-
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
@@ -1321,9 +1315,9 @@ struct R1ReduceWindowTestData {
      /*reducer=*/Reducer::kMax},
 
     // The pattern generated by exclusive scan (cumsum/cumprod).
-    {/*base_bounds=*/{4096}, /*window_bounds=*/{4096},
+    {/*base_bounds=*/{4095}, /*window_bounds=*/{4095},
      /*strides=*/{1},
-     /*pad_low=*/{4096},
+     /*pad_low=*/{4095},
      /*pad_high=*/{0},
      /*reducer=*/Reducer::kMax},
 };
@@ -1532,6 +1526,25 @@ ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] {
   EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
 }
 
+XLA_TEST_F(HloTestBase, ReduceWindowS64) {
+  const string hlo_string = R"(
+HloModule reduce-window
+
+%identity.pad_to_reduce_window (param0: s64[], param1: s64[]) -> s64[] {
+  %param0 = s64[] parameter(0)
+  ROOT %param1 = s64[] parameter(1)
+}
+
+ENTRY %reduce-window (parameter.0: s64[81,8], parameter.1: s64[]) -> s64[82,8] {
+  %parameter.0 = s64[81,8]{1,0} parameter(0)
+  %parameter.1 = s64[] parameter(1)
+  ROOT %reduce-window = s64[82,8]{1,0} reduce-window(s64[81,8]{1,0} %parameter.0, s64[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
+}
+
 XLA_TEST_F(HloTestBase, ReduceWindowF16) {
   const string hlo_string = R"(
 HloModule reduce-window
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
index 32de0fdf78f9c442e17c55e1b951e39122dac5ef..86d9999b4a4844b51db7fb56529ea4580797b809 100644
--- a/tensorflow/compiler/xla/tests/scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -718,5 +718,32 @@ ENTRY main {
   RunTest(hlo_text, &operand, &scatter_indices, &updates);
 }
 
+XLA_TEST_F(ScatterTest, ScatterIntoScalar) {
+  const char* hlo_text = R"(
+HloModule ScatterIntoScalar
+
+update_s32 {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  parameter.1 = s32[] parameter(0)
+  parameter.2 = s32[0]{0} parameter(1)
+  parameter.3 = s32[] parameter(2)
+  ROOT scatter = s32[] scatter(parameter.1, parameter.2, parameter.3),
+      update_window_dims={},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={},
+      index_vector_dim=0,
+      to_apply=update_s32
+}
+)";
+  Literal operand = LiteralUtil::CreateR0<int32>(1);
+  Literal scatter_indices = LiteralUtil::CreateR1<int32>({});
+  Literal updates = LiteralUtil::CreateR0<int32>(2);
+  RunTest(hlo_text, &operand, &scatter_indices, &updates);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index f737b5158b3622d677aea5bf64a421a56e2c42dd..0dcb1c42db1b0884d80adeaf88d55eeda79fb7d8 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -146,6 +146,12 @@ INSTANTIATE_TEST_CASE_P(
                                   Padding::kValid,
                                   {3, 3, 1, 1},
                                   {3, 3, 1, 1}},
+        // Uncovered by b/126212776.
+        SelectAndScatterTestParam{{15, 1, 1, 1},
+                                  {2, 1, 1, 1},
+                                  Padding::kValid,
+                                  {14, 1, 1, 1},
+                                  {1, 1, 1, 1}},
         SelectAndScatterTestParam{{7, 3, 4, 4},
                                   {3, 1, 4, 4},
                                   Padding::kValid,
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 95c89b0ba6f29c453abab88e29bca13ee006455a..4ac3dbd80cfaf2340d8f79cef3e9e02058cf919c 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -112,6 +112,31 @@ void PopulateWithFloatingPointData(Literal* literal, std::minstd_rand0* engine,
   }
 }
 
+template <typename ComplexT>
+void PopulateWithComplexData(Literal* result, std::minstd_rand0* engine,
+                             bool no_duplicates) {
+  using InnerFloatT = typename ComplexT::value_type;
+  CHECK(engine != nullptr);
+  CHECK_EQ(result->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<ComplexT>());
+  Shape floating_point_shape = ShapeUtil::ChangeElementType(
+      result->shape(), primitive_util::NativeToPrimitiveType<InnerFloatT>());
+  Literal real_lit(floating_point_shape);
+  Literal imaginary_lit(floating_point_shape);
+
+  PopulateWithFloatingPointData<InnerFloatT>(&real_lit, engine, no_duplicates);
+  PopulateWithFloatingPointData<InnerFloatT>(&imaginary_lit, engine,
+                                             no_duplicates);
+
+  absl::Span<const InnerFloatT> real_data = real_lit.data<InnerFloatT>();
+  absl::Span<const InnerFloatT> imaginary_data =
+      imaginary_lit.data<InnerFloatT>();
+  absl::Span<ComplexT> result_data = result->data<ComplexT>();
+  for (int i = 0; i < real_lit.data<InnerFloatT>().size(); i++) {
+    result_data[i] = ComplexT(real_data[i], imaginary_data[i]);
+  }
+}
+
 template <>
 void PopulateWithFloatingPointData<half>(Literal* literal,
                                          std::minstd_rand0* engine,
@@ -220,6 +245,12 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
     case U64:
       PopulateWithRandomIntegralData<uint64>(&literal, engine, no_duplicates);
       break;
+    case C64:
+      PopulateWithComplexData<complex64>(&literal, engine, no_duplicates);
+      break;
+    case C128:
+      PopulateWithComplexData<complex128>(&literal, engine, no_duplicates);
+      break;
     case PRED: {
       std::uniform_int_distribution<int> generator(0, 1);
       TF_CHECK_OK(
@@ -238,6 +269,79 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
   return std::move(literal);
 }
 
+template <typename IntT>
+void PopulateWithRandomIntegralDataWithBounds(Literal* literal,
+                                              std::minstd_rand0* engine,
+                                              IntT min, IntT max) {
+  CHECK(engine != nullptr);
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<IntT>());
+  std::uniform_int_distribution<IntT> generator(min, max);
+  for (IntT& value : literal->data<IntT>()) {
+    value = generator(*engine);
+  }
+}
+
+// Same as MakeFakeLiteralInternal but generates random numbers in the given
+// range [min, max]. Currently this works only for INT types.
+StatusOr<Literal> MakeFakeLiteralInternalWithBounds(const Shape& shape,
+                                                    std::minstd_rand0* engine,
+                                                    int64 min, int64 max) {
+  if (shape.IsTuple()) {
+    std::vector<Literal> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(
+          Literal element,
+          MakeFakeLiteralInternalWithBounds(element_shape, engine, min, max));
+      elements.push_back(std::move(element));
+    }
+    return LiteralUtil::MakeTupleOwned(std::move(elements));
+  }
+  if (engine == nullptr) {
+    return Literal::CreateFromShape(shape);
+  }
+  Literal literal(shape);
+  switch (shape.element_type()) {
+    case S8:
+      PopulateWithRandomIntegralDataWithBounds<int8>(
+          &literal, engine, static_cast<int8>(min), static_cast<int8>(max));
+      break;
+    case U8:
+      PopulateWithRandomIntegralDataWithBounds<uint8>(
+          &literal, engine, static_cast<uint8>(min), static_cast<uint8>(max));
+      break;
+    case S16:
+      PopulateWithRandomIntegralDataWithBounds<int16>(
+          &literal, engine, static_cast<int16>(min), static_cast<int16>(max));
+      break;
+    case U16:
+      PopulateWithRandomIntegralDataWithBounds<uint16>(
+          &literal, engine, static_cast<uint16>(min), static_cast<uint16>(max));
+      break;
+    case S32:
+      PopulateWithRandomIntegralDataWithBounds<int32>(
+          &literal, engine, static_cast<int32>(min), static_cast<int32>(max));
+      break;
+    case U32:
+      PopulateWithRandomIntegralDataWithBounds<uint32>(
+          &literal, engine, static_cast<uint32>(min), static_cast<uint32>(max));
+      break;
+    case S64:
+      PopulateWithRandomIntegralDataWithBounds<int64>(
+          &literal, engine, static_cast<int64>(min), static_cast<int64>(max));
+      break;
+    case U64:
+      PopulateWithRandomIntegralDataWithBounds<uint64>(
+          &literal, engine, static_cast<uint64>(min), static_cast<uint64>(max));
+      break;
+    default:
+      return Unimplemented(
+          "Unsupported type for fake random literal generation with bounds: %s",
+          ShapeUtil::HumanString(shape));
+  }
+  return std::move(literal);
+}
+
 enum class ConstantType { kUnknown, kZero, kOne };
 
 // Return the constant type required by this computation, if known.
@@ -297,6 +401,10 @@ std::vector<HloInstruction*> FindConstrainedUses(
       if ((opcode == HloOpcode::kDynamicSlice && op_num >= 1) ||
           (opcode == HloOpcode::kDynamicUpdateSlice && op_num >= 2)) {
         constrained_uses.push_back(instruction);
+      } else if ((opcode == HloOpcode::kGather ||
+                  opcode == HloOpcode::kScatter) &&
+                 op_num == 1) {
+        constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kFusion) {
         const HloInstruction* const to_analyze =
             instruction->fused_parameter(op_num);
@@ -356,6 +464,22 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
         }
         break;
       }
+      case HloOpcode::kGather:
+      case HloOpcode::kScatter: {
+        const Shape& operand_shape = use->operand(0)->shape();
+        if (use->operand(1) == &param) {
+          auto index_map =
+              use->opcode() == HloOpcode::kGather
+                  ? use->gather_dimension_numbers().start_index_map()
+                  : use->scatter_dimension_numbers()
+                        .scatter_dims_to_operand_dims();
+          for (const auto dim_in_operand : index_map) {
+            index_bound =
+                std::min(index_bound, operand_shape.dimensions(dim_in_operand));
+          }
+        }
+        break;
+      }
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
         needs_constant = true;
@@ -385,8 +509,8 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
     return Unimplemented("Conflicting operand generation constraints.");
   }
   if (index_bound != INT64_MAX) {
-    return MakeRandomIndex(index_bound, engine)
-        .Reshape(param.shape().dimensions());
+    return MakeFakeLiteralInternalWithBounds(param.shape(), engine, -1,
+                                             index_bound);
   } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 591d6c19228a313f530cdae18f4be37e7b517601..4337aa4bf9a071ecad268e1c4233fa705c6ed491 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -92,12 +92,13 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
                           MakeFakeArguments(module.get()));
   ASSERT_EQ(args.size(), 5);
 
-  EXPECT_EQ(args[0].Get<int32>({}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(args[1].Get<int32>({}), 0);
-  EXPECT_LE(args[0].Get<int32>({}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
   EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
@@ -122,12 +123,13 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
                           MakeFakeArguments(module.get()));
   ASSERT_EQ(args.size(), 7);
 
-  EXPECT_EQ(args[0].Get<int32>({}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(args[1].Get<int32>({}), 0);
-  EXPECT_LE(args[0].Get<int32>({}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(args[2].Get<int32>({}), 0);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
   EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
@@ -136,10 +138,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) {
   %parameter.0 = f32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -159,10 +169,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = s32[] parameter(0)
+  p.0.rhs = s32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) {
   %parameter.0 = s32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -182,10 +200,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) {
   auto module = ParseHloString(R"(
 HloModule sort, is_scheduled=true
 
+compare {
+  p.0.lhs = bf16[] parameter(0)
+  p.0.rhs = bf16[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+}
+
 ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) {
   %parameter.0 = bf16[2,1452]{1,0} parameter(0)
   %parameter.1 = s32[2,1452]{1,0} parameter(1)
-  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}
+  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -228,5 +254,77 @@ ENTRY %module (parameter.0: s32[], parameter.1: f32[20,20]) -> f32[] {
       << ShapeUtil::HumanString(args[1].shape());
 }
 
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForGather) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+ENTRY %module(paramater.0: f32[200,100,300], parameter.1: s32[10,2]) ->
+                                                          f32[10,300] {
+  %parameter.0 = f32[200,100,300] parameter(0)
+  %parameter.1 = s32[10,2] parameter(1)
+  ROOT gather = f32[10,300] gather(f32[200,100,300] %parameter.0,
+                                   s32[10,2] %parameter.1),
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
+      index_vector_dim=1,
+      slice_sizes={1,1,300}
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForScatter) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+scatter_update (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  ROOT rhs = f32[] parameter(1)
+}
+
+ENTRY main {
+  operand = f32[200,100,300] parameter(0)
+  indices = s32[10,2] parameter(1)
+  updates = f32[10,300] parameter(2)
+  ROOT scatter = f32[200,100,300] scatter(operand, indices, updates),
+    to_apply=scatter_update,
+    update_window_dims={1},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=1
+  }
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 3);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index b77cf38ed8e29973985406015c0a3936916ad5e6..38a2a9b8fba280ed17f6c26688fba94b4ec6ff0c 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -129,7 +129,7 @@ HloModule TokenInWhileLoop
   %param = (s32[], token[]) parameter(0)
   %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0
   %constant = s32[] constant(42)
-  ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant)
+  ROOT %less-than = pred[] compare(s32[] %get-tuple-element, s32[] %constant), direction=LT
 }
 
 ENTRY %TokenInWhileLoop () -> s32[] {
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
similarity index 77%
rename from tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
rename to tensorflow/compiler/xla/tests/triangular_solve_test.cc
index 284a2e9d183a6a7923fb59ac134ce3b3a3a96e35..24ab12136ff396bd9ac37bb058311b0d2d6f2515 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
-
 #include <memory>
 #include <numeric>
 #include <vector>
@@ -54,6 +52,20 @@ Array2D<float> AValsUpper() {
           {kNan, kNan, kNan, 11}};
 }
 
+Array2D<float> AValsLowerUnitDiagonal() {
+  return {{kNan, kNan, kNan, kNan},
+          {3, kNan, kNan, kNan},
+          {4, 7, kNan, kNan},
+          {5, 8, 10, kNan}};
+}
+
+Array2D<float> AValsUpperUnitDiagonal() {
+  return {{kNan, 3, 4, 5},
+          {kNan, kNan, 7, 8},
+          {kNan, kNan, kNan, 10},
+          {kNan, kNan, kNan, kNan}};
+}
+
 Array2D<float> BValsRight() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
 }
@@ -96,8 +108,8 @@ XLA_TEST_F(TriangularSolveTest, EmptyArrays) {
       CreateR2Parameter<float>(Array2D<float>(0, 10), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 10),
                              {a_data.get(), b_data.get()});
@@ -111,8 +123,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
@@ -132,8 +144,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
@@ -153,8 +165,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
@@ -174,8 +186,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 0.08333334, 0.04629629, 0.03367003},
@@ -195,8 +207,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
@@ -217,8 +229,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -231,6 +243,25 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
                              ErrorSpec(1e-2, 1e-2));
 }
 
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNoTransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsLowerUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected(
+      {{1., 2., 3.}, {1., -1., -3.}, {-4., 7., 18.}, {37., -61., -159.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
 XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
   XlaBuilder builder(TestName());
 
@@ -239,8 +270,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/3);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -261,8 +292,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<float> expected({
       {0.5, 1.0, 1.5},
@@ -283,8 +314,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
   auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
 
   Array2D<float> expected({
       {-0.89646465, -0.69444444, -0.49242424},
@@ -297,6 +328,27 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
                              ErrorSpec(1e-2, 1e-2));
 }
 
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsUpperUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({{-1402., -1538., -1674.},
+                           {575., 631., 687.},
+                           {-93., -102., -111.},
+                           {10., 11., 12.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
 XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
   XlaBuilder builder(TestName());
 
@@ -307,8 +359,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
       CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/true,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::ADJOINT);
 
   Array2D<complex64> expected({
       {0.5, complex64(0.08333333, 0.08333333),
@@ -333,8 +385,8 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
       CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
   TriangularSolve(a, b,
                   /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
 
   Array2D<complex64> expected({
       {0.5, 1., 1.5},
@@ -368,11 +420,12 @@ XLA_TEST_F(TriangularSolveTest, BatchedLeftUpper) {
   XlaOp a, b;
   auto a_data = CreateR3Parameter<float>(avals, 0, "a", &builder, &a);
   auto b_data = CreateR3Parameter<float>(bvals, 1, "b", &builder, &b);
-  BatchDot(ConstantR3FromArray3D(&builder, avals),
-           TriangularSolve(a, b,
-                           /*left_side=*/true, /*lower=*/false,
-                           /*transpose_a=*/false, /*conjugate_a=*/false,
-                           /*block_size=*/2));
+  BatchDot(
+      ConstantR3FromArray3D(&builder, avals),
+      TriangularSolve(a, b,
+                      /*left_side=*/true, /*lower=*/false,
+                      /*unit_diagonal=*/false,
+                      /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE));
 
   ComputeAndCompareR3<float>(&builder, bvals, {a_data.get(), b_data.get()},
                              ErrorSpec(1e-2, 1e-2));
@@ -382,7 +435,7 @@ struct TriangularSolveTestSpec {
   int m, n;  // A is mxm, B is mxn
   bool left_side;
   bool lower;
-  bool transpose_a;
+  TriangularSolveOptions::Transpose transpose_a;
 };
 
 class TriangularSolveParametricTest
@@ -408,11 +461,11 @@ XLA_TEST_P(TriangularSolveParametricTest, Random) {
   XlaOp a, b;
   auto a_data = CreateR2Parameter<float>(avals, 0, "a", &builder, &a);
   auto b_data = CreateR2Parameter<float>(bvals, 1, "b", &builder, &b);
-  auto x = TriangularSolve(a, b, spec.left_side, spec.lower, spec.transpose_a,
-                           /*conjugate_a=*/false,
-                           /*block_size=*/3);
+  auto x = TriangularSolve(a, b, spec.left_side, spec.lower,
+                           /*unit_diagonal=*/false, spec.transpose_a);
   auto a_tri = Triangle(a, spec.lower);
-  a_tri = MaybeTransposeInMinorDims(a_tri, spec.transpose_a);
+  a_tri = MaybeTransposeInMinorDims(
+      a_tri, spec.transpose_a != TriangularSolveOptions::NO_TRANSPOSE);
   if (spec.left_side) {
     BatchDot(a_tri, x);
   } else {
@@ -429,7 +482,9 @@ std::vector<TriangularSolveTestSpec> TriangularSolveTests() {
     for (int n : {5, 10}) {
       for (bool left_side : {false, true}) {
         for (bool lower : {false, true}) {
-          for (bool transpose_a : {false, true}) {
+          for (TriangularSolveOptions::Transpose transpose_a :
+               {TriangularSolveOptions::NO_TRANSPOSE,
+                TriangularSolveOptions::TRANSPOSE}) {
             specs.push_back({m, n, left_side, lower, transpose_a});
           }
         }
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 0def0577e09609ecb82a635ba2e1681a1cc27b1c..cdf2c34fcc3cc005e84626c39c8ab301a9040529 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -525,9 +525,7 @@ XLA_TEST_F(TupleHloTest, BitcastAfterGTE) {
       ROOT tuple.4 = (f32[1,3]{1,0}) tuple(copy)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({1, 2, 3}));
   auto result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -559,9 +557,7 @@ XLA_TEST_F(TupleHloTest,
       ROOT outfeed = token[] outfeed(tuple, token0)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param0 = LiteralUtil::CreateR1<float>({1, 2});
   auto param1 = LiteralUtil::CreateR1<float>({2, 3});
   auto param4 = LiteralUtil::CreateR0<bool>(false);
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 4fbd7f2fb174ac899c1e3b23801986cb52db96a2..c51f30f3b5db95962a719ec226dd03f41142a782 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -64,7 +64,9 @@ class UnaryOpTest : public ClientLibraryTestBase {
         &builder, {-2, 25, 0, static_cast<T>(-0.0), -123, inf<T>(), -inf<T>()});
     Sign(arg);
 
-    ComputeAndCompareR1<T>(&builder, {-1, 1, 0, 0, -1, 1, -1}, {});
+    ComputeAndCompareR1<T>(
+        &builder,
+        {-1, 1, static_cast<T>(+0.0), static_cast<T>(-0.0), -1, 1, -1}, {});
   }
 
   template <typename T>
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index c7337e8caae8f2ee25f4b25dc22439e08d2ecc25..7b7b8f5d02dc99607b30f898e18c5b448d421e07 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -40,8 +40,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace gtl = ::tensorflow::gtl;
-
 class HloProfileTest : public ClientLibraryTestBase {};
 
 struct ParsedProfileOutputLine {
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index 7289ae7df65e56652eeeb67e536e4c721d97d999..fc7949d889dc8ed9fac425982cc555a6c42a7f1d 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 4412a6ec69ca3bb98f7b67e68802b262deaf9b8c..e9244ecf9f149ed439dc9beeb56a9442ebad6821 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -177,26 +177,6 @@ tf_cc_binary(
     ],
 )
 
-tf_cc_binary(
-    name = "dumped_computation_to_tf_graphdef",
-    srcs = ["dumped_computation_to_tf_graphdef.cc"],
-    deps = [
-        "//tensorflow/compiler/xla:debug_options_flags",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:interpreter_plugin",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 tf_cc_binary(
     name = "hlo_proto_to_json",
     srcs = ["hlo_proto_to_json.cc"],
@@ -251,7 +231,14 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
 )
+
+sh_test(
+    name = "interactive_graphviz_build_only_test",
+    srcs = ["interactive_graphviz_test.sh"],
+    data = [":interactive_graphviz"],
+)
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 4375e7c138c9e8d193feaa7a39d63946c4ea3086..df2d3d18b9ff86c0dd2047c2415527aeb1c1f154 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 723569862c7550387e95003e3a673743464b67b8..35bb82ca22f46d2cdeaac3b9a87b253efe9a07d9 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
deleted file mode 100644
index f8bb9a6b1e217fc4e6e15c8a3302be61ed339c82..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Usage: dumped_computation_to_tf_graph some_binary_snapshot_proto*
-//
-// Dumps a tensorflow GraphDef in text format for a snapshot computation. The
-// dumped graph is an HLO computation with HLO instructions as nodes and can be
-// visualized on Tensorboard. Upload the dumped files on Tensorboard.
-//
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
-// ServiceInterface::SnapshotComputation to disk.
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/debug_options_flags.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-
-using tensorflow::Env;
-
-namespace xla {
-namespace tools {
-
-void RealMain(absl::Span<char* const> args) {
-  Client* client = ClientLibrary::LocalClientOrDie();
-  for (char* arg : args) {
-    HloSnapshot module;
-    TF_CHECK_OK(
-        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    XlaComputation computation =
-        client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = GetDebugOptionsFromFlags();
-    debug_options.set_xla_generate_hlo_graph(".*");
-    debug_options.set_xla_hlo_dump_as_graphdef(true);
-    ComputationStats stats =
-        client->GetComputationStats(computation, debug_options)
-            .ConsumeValueOrDie();
-    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
-  }
-}
-
-}  // namespace tools
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  absl::Span<char* const> args(argv, argc);
-  args.remove_prefix(1);  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/tools/hlo_extractor_test.cc b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
index c187222a11ee721b006194a68620c58749707193..4beb099b330cadf4540944979f38681bae07103c 100644
--- a/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
+++ b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
@@ -36,9 +36,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
@@ -75,9 +74,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
@@ -120,9 +118,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> hlo_module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
   {
     auto extracted_module =
diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
index 8460ae3e4991ee091af72d2553a8491f627c722e..88f3a8bdde244bb16d54d13f1022e9b4be1ef893 100644
--- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
+++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
@@ -19,7 +19,9 @@ limitations under the License.
 //
 // Reads one serilized Hlo module, convert it into JSON format and dump into
 // some output directory. some_binaray_proto is obtained by serializing Hlo
-// module to disk using --xla_dump_optimized_hlo_proto_to debug option.
+// module to disk using the debug options
+//
+//   --xla_dump_to=DIR --xla_dump_hlo_as_proto
 
 #include <stdio.h>
 #include <string>
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
index 6c90cde5a75a93837ee149fd9b5a60e6413c2ac4..5652d303f0256713a05331cfbc1a8c4f0009c3fe 100644
--- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,8 +29,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "absl/strings/string_view_utils.h"
-#include "absl/strings/util.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -39,6 +38,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/tools/hlo_extractor.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/subprocess.h"
@@ -56,7 +57,8 @@ bool ReadLine(const char *prompt, string *line) {
   return util::ReadLine(prompt, line);
 #else
   std::cout << prompt;
-  return std::getline(std::cin, *line);
+  std::getline(std::cin, *line);
+  return std::cin.good();
 #endif
 }
 
@@ -139,9 +141,10 @@ HloComputation* FindComputation(const HloModule& module,
 // Print a help message describing the various available commands.
 void DoHelpCommand() {
   std::cout << R"(Commands:
-  <instruction> [<width>]
-    Renders a neighborhood of <width> nodes around <instruction>.  If <width>
-    is not provided, the default value is )"
+  <instruction> [<width>] [/ <boundary_instruction>+]
+    Renders a neighborhood of <width> nodes around <instruction>, without going
+    beyond the optional boundary instructions.  If <width> is not provided, 
+    the default value is )"
             << kDefaultWidth << R"(.
   allpaths <instruction> <instruction> [<n>]
     Renders a subset of all paths from one instruction to the other.  Either
@@ -387,22 +390,18 @@ bool ExistsPathFromTo(const HloInstruction* from, const HloInstruction* to) {
   return false;
 }
 
-void DisplayGraphHandle(const Options &opts, const string& handle) {
-  std::cout << handle << std::endl;
+void OpenUrl(const Options& opts, absl::string_view url) {
+  std::cout << url << std::endl;
 
   // If it is a url, try to open it up in the user's browser too.
-  if (strings::StartsWithIgnoreCase(handle, "http://") ||
-      strings::StartsWithIgnoreCase(handle, "https://") ||
-      strings::StartsWithIgnoreCase(handle, "file://")) {
+  if (absl::StartsWithIgnoreCase(url, "http://") ||
+      absl::StartsWithIgnoreCase(url, "https://") ||
+      absl::StartsWithIgnoreCase(url, "file://")) {
     const char* browser_bin = opts.browser.empty() ? "/usr/bin/sensible-browser"
                                                    : opts.browser.c_str();
     tensorflow::SubProcess p;
-    p.SetProgram(browser_bin, {browser_bin, handle});
+    p.SetProgram(browser_bin, {browser_bin, string(url)});
     p.Start();
-  } else if (handle.empty()) {
-    std::cerr << "Unable to render graph, perhaps due to graphviz server "
-                 "timeout.  Run with --logtostderr to see."
-              << std::endl;
   } else {
     std::cerr << "\nExpected a URL, but got strange graph result (dumped "
                  "above).  If this isn't what you expected, maybe file a bug?"
@@ -410,6 +409,65 @@ void DisplayGraphHandle(const Options &opts, const string& handle) {
   }
 }
 
+// Renders a graph by calling `renderer`, and then tries to open it.
+//
+// `renderer` is a callback so we can try various formats.  In particular, the
+// URL format doesn't work out of the box; it requires you to register a plugin.
+void RenderAndDisplayGraph(
+    const Options& opts,
+    const std::function<StatusOr<string>(RenderedGraphFormat)>& renderer) {
+  StatusOr<string> url_result = renderer(RenderedGraphFormat::kUrl);
+  if (url_result.ok()) {
+    string url = url_result.ValueOrDie();
+    OpenUrl(opts, url);
+    return;
+  }
+
+  // Ignore UNAVAILABLE errors; these are expected when there's no URL renderer
+  // plugin registered.
+  if (url_result.status().code() != tensorflow::error::UNAVAILABLE) {
+    std::cerr << "Unable to render graph as URL: " << url_result.status()
+              << std::endl;
+    std::cerr << "Trying as HTML..." << std::endl;
+  }
+
+  auto* env = tensorflow::Env::Default();
+  StatusOr<string> html_result = renderer(RenderedGraphFormat::kHtml);
+  if (!html_result.ok()) {
+    std::cerr << "Failed to render graph as HTML: " << html_result.status()
+              << std::endl;
+    return;
+  }
+
+  std::vector<string> temp_dirs;
+  env->GetLocalTempDirectories(&temp_dirs);
+  if (temp_dirs.empty()) {
+    std::cerr << "Can't render graph as HTML because we can't find a suitable "
+                 "temp directory.  Try setting $TMPDIR?"
+              << std::endl;
+    return;
+  }
+
+  // Try to create a unique file inside of temp_dirs.front().  Notably, this
+  // file's name must end with ".html", otherwise web browsers will treat it as
+  // plain text, so we can't use Env::CreateUniqueFileName().
+  string temp_file_path = tensorflow::io::JoinPath(
+      temp_dirs.front(),
+      absl::StrFormat("interactive_graphviz.%d.html", env->NowMicros()));
+  auto status = tensorflow::WriteStringToFile(
+      env, temp_file_path, std::move(html_result).ValueOrDie());
+  if (status.ok()) {
+    OpenUrl(opts, absl::StrCat("file://", temp_file_path));
+    return;
+  }
+
+  std::cerr << "Failed to write rendered HTML graph to " << temp_file_path
+            << ": " << status;
+
+  // We don't bother trying kDot, because kHTML should always work (or if it
+  // doesn't, we don't have any reason to believe kDot will work better).
+}
+
 void DoAllPathsCommand(const Options& opts, const HloModule& module,
                        const std::vector<string>& tokens) {
   if (tokens.size() > 4) {
@@ -450,19 +508,15 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module,
     std::cerr << "No path from/to " << tokens[1] << " to/from " << tokens[2];
     return;
   }
-  DisplayGraphHandle(opts, hlo_graph_dumper::DumpAllPathsFromTo(
-      *from, *to, max_nodes, /*show_backend_config=*/show_backend_config));
+  RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+    return RenderAllPathsFromTo(*from, *to, max_nodes, format,
+                                /*show_backend_config=*/show_backend_config);
+  });
 }
 
 // Plot a given instruction neighborhood or computation with graphviz.
 void DoPlotCommand(const Options& opts, const HloModule& module,
                    const std::vector<string>& tokens) {
-  if (tokens.size() > 2) {
-    std::cerr << R"(Illegal input.  Enter e.g. "%fusion.1 42" or "%fusion.1".)"
-              << std::endl;
-    return;
-  }
-
   string node_name = tokens[0];
 
   // Find the node with the given name.
@@ -475,28 +529,62 @@ void DoPlotCommand(const Options& opts, const HloModule& module,
   }
 
   uint64 graph_width = kDefaultWidth;
-  if (tokens.size() == 2) {
+  absl::flat_hash_set<const HloInstruction*> boundary;
+  if (tokens.size() >= 2) {
     if (comp) {
       std::cerr << "Can only use graph-size parameter with instructions, but "
                 << node_name << " is a computation." << std::endl;
       return;
     }
-    if (!absl::SimpleAtoi(tokens[1], &graph_width)) {
-      std::cerr << "Can't parse '" << tokens[1] << "' as an integer."
-                << std::endl;
-      return;
+
+    int bound_index = 1;
+    // Get the <width> if present.
+    if (absl::SimpleAtoi(tokens[bound_index], &graph_width)) {
+      bound_index++;
+    } else {
+      // <width> not found, need to reset graph_width.
+      graph_width = kDefaultWidth;
+    }
+    // Get the '/'.
+    if (bound_index < tokens.size()) {
+      // This token must be a '/'.
+      if (tokens[bound_index] != "/") {
+        std::cerr << "Expect a /, but get a '" << tokens[bound_index] << "'."
+                  << std::endl;
+        return;
+      }
+      bound_index++;
+    }
+    // Get the boundary nodes.
+    while (bound_index < tokens.size()) {
+      string bnode_name = tokens[bound_index];
+      const HloInstruction* binstr = FindInstruction(module, bnode_name);
+      if (!binstr) {
+        std::cerr << "Couldn't find HloInstruction named " << bnode_name << "."
+                  << std::endl;
+        return;
+      }
+      boundary.insert(binstr);
+      bound_index++;
     }
   }
 
   // Generate the graph and print the resulting string, which should be a
   // graphviz url.
   if (comp) {
-    DisplayGraphHandle(opts, hlo_graph_dumper::DumpGraph(
-        *comp, "", comp->parent()->config().debug_options(), nullptr,
-        /*show_backend_config=*/show_backend_config));
+    RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+      return RenderGraph(*comp, /*label=*/"",
+                         comp->parent()->config().debug_options(), format,
+                         /*hlo_execution_profile=*/nullptr,
+                         /*show_backend_config=*/show_backend_config);
+    });
   } else {
-    DisplayGraphHandle(opts, hlo_graph_dumper::DumpNeighborhoodAround(
-        *instr, graph_width, /*show_backend_config=*/show_backend_config));
+    RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+      return RenderNeighborhoodAround(
+          *instr, graph_width, format,
+          /*show_backend_config=*/show_backend_config,
+          /*boundary=*/boundary);
+    });
   }
 }
 
@@ -515,7 +603,7 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) {
                 << std::endl;
       continue;
     }
-    std::vector<string> tokens = strings::Split(line, ' ');
+    std::vector<string> tokens = absl::StrSplit(line, ' ', absl::SkipEmpty());
     if (tokens[0] == "quit" || tokens[0] == "exit") {
       break;
     } else if (tokens[0] == "help") {
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3e43aa7da062547fb5f187b885e997fc44bbb65
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+# /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================*/
+
+# This is a placeholder for a compile-only test for intractive_graphviz tool.
+
+exit 0
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index c01a47b510c0e4252e350960b995643b39b70d4a..d66561315b4ad7a5e3f1f7b1bc1e557b71da6705 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -90,8 +90,8 @@ struct Options {
   int num_runs = 1;
 };
 
-std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
-                                                   LocalClient* client) {
+StatusOr<std::unique_ptr<LocalExecutable>> CompileExecutable(
+    const HloSnapshot& module, LocalClient* client) {
   XlaComputation computation(module.hlo().hlo_module());
   std::vector<Shape> argument_layouts;
   argument_layouts.reserve(
@@ -102,9 +102,9 @@ std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
     argument_layouts.push_back(Shape(param));
     argument_layout_ptrs.push_back(&argument_layouts.back());
   }
-  return client
-      ->Compile(computation, argument_layout_ptrs, ExecutableBuildOptions())
-      .ValueOrDie();
+  ExecutableBuildOptions exec_build_options;
+  *exec_build_options.mutable_debug_options() = GetDebugOptionsFromFlags();
+  return client->Compile(computation, argument_layout_ptrs, exec_build_options);
 }
 
 absl::optional<Shape> GetXfeedShape(bool is_infeed,
@@ -329,7 +329,10 @@ StatusOr<HloSnapshot> ParseInputFile(const string& filename,
   fprintf(stderr, "%s: is not HloProto. Trying HLO text.\n", filename.c_str());
   string contents;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, filename, &contents));
-  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(contents);
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsFromFlags());
+  StatusOr<std::unique_ptr<HloModule>> module =
+      ParseHloString(contents, config);
   if (module.ok()) {
     *snapshot.mutable_hlo()->mutable_hlo_module() =
         module.ValueOrDie()->ToProto();
@@ -357,7 +360,7 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
 
   // Compile all the modules in parallel.
   LOG(INFO) << "Compiling " << snapshots.size() << " modules in parallel.";
-  std::vector<std::unique_ptr<LocalExecutable>> executables;
+  std::vector<StatusOr<std::unique_ptr<LocalExecutable>>> executables;
   {
     // ThreadPool CHECK-fails if we give it 0 threads.
     tensorflow::thread::ThreadPool thread_pool(
@@ -374,7 +377,12 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
   LOG(INFO) << "Done compiling; now running the modules.";
 
   for (int64 i = 0; i < executables.size(); ++i) {
-    LocalExecutable* executable = executables[i].get();
+    if (!executables[i].ok()) {
+      LOG(ERROR) << "Compilation failed: " << executables[i].status();
+      exit_status = EXIT_FAILURE;
+      continue;
+    }
+    LocalExecutable* executable = executables[i].ValueOrDie().get();
     LOG(ERROR) << "Running iteration " << i;
     StatusOr<Literal> result_status =
         ReplayComputation(snapshots[i], executable, client, opts);
diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc
index cdf306dfd1027cf6022c5d8ae844b4308f580e8d..b80d0db8d812380d8144713109d1c05168713c77 100644
--- a/tensorflow/compiler/xla/tools/show_signature.cc
+++ b/tensorflow/compiler/xla/tools/show_signature.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 34b73b5206fa20d6dff7567afd78fd89897c8c33..bb8bbf57c4252b16836553334901a3c896a17f39 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdarg.h>
 #include <numeric>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -80,13 +81,9 @@ bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
   if (rank != permutation.size()) {
     return false;
   }
-  std::vector<int64> output(permutation.size(), -1);
-  for (auto index : permutation) {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, rank);
-    output[index] = 0;
-  }
-  return !absl::c_linear_search(output, -1);
+  absl::InlinedVector<int64, 8> trivial_permutation(rank);
+  absl::c_iota(trivial_permutation, 0);
+  return absl::c_is_permutation(permutation, trivial_permutation);
 }
 
 std::vector<int64> InversePermutation(
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index f2fd17dc99455a921bf875aad2a3661b4d456823..1754ae0e44f3420bf7eb7cfb3b558dd476b31455 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -260,6 +260,16 @@ Status Unavailable(const absl::FormatSpec<Args...>& format,
   return WithLogBacktrace(
       tensorflow::errors::Unavailable(absl::StrFormat(format, args...)));
 }
+template <typename... Args>
+Status Unknown(const absl::FormatSpec<Args...>& format, const Args&... args) {
+  return WithLogBacktrace(
+      tensorflow::errors::Unknown(absl::StrFormat(format, args...)));
+}
+template <typename... Args>
+Status Internal(const absl::FormatSpec<Args...>& format, const Args&... args) {
+  return WithLogBacktrace(
+      tensorflow::errors::Internal(absl::StrFormat(format, args...)));
+}
 
 template <typename... Args>
 Status InvalidArgumentStrCat(Args&&... concat) {
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index e001cc35f9fcea2783b3952e825838af6bbece72..f2e183110393c359c421031417117b79976bdab4 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -204,6 +204,14 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) {
          window_dim.padding_low() == 0 && window_dim.padding_high() == 0;
 }
 
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 099d7ecdd5c732ffc8c6ff6370288a2fc4144fa2..e7099285c340523c7d4e6240c7b039fd39443100 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -62,6 +62,10 @@ bool AllOrNoneReversed(const Window& window);
 // has window bound 1, no striding and no padding.
 bool IsInactiveWindowDimension(const Window& window, int64 logical_dim);
 
+// Returns true if the provided window dimension is trivial (inactive and has no
+// dilation)
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension);
+
 // Returns the new bound after dilation.
 //
 // If a window with the given bound in some dimension is dilated with the given
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 1439f1bcc5cec39203a7cb4b1f8604e7349382c6..cda2d7c7c6b2403868f6d01a485753fa29a8d95f 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -1,30 +1,47 @@
 """Wrapper around cc_proto_library used inside the XLA codebase."""
 
-load("//tensorflow/core:platform/default/build_config.bzl",
-     "cc_proto_library")
-load("//tensorflow/core:platform/default/build_config_root.bzl",
-     "if_static")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "cc_proto_library",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
+load("//tensorflow:tensorflow.bzl", "if_cuda_is_configured")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
-  if kwargs.get('use_grpc_plugin'):
-    kwargs['use_grpc_namespace'] = True
-  cc_proto_library(name=name,
-                   srcs=srcs,
-                   deps=deps,
-                   cc_libs = if_static(
-                       ["@protobuf_archive//:protobuf"],
-                       otherwise=["@protobuf_archive//:protobuf_headers"],
-                   ),
-                   protoc="@protobuf_archive//:protoc",
-                   testonly=testonly,
-                   visibility=visibility,
-                   **kwargs)
+def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly = 0, **kwargs):
+    if kwargs.get("use_grpc_plugin"):
+        kwargs["use_grpc_namespace"] = True
+    cc_proto_library(
+        name = name,
+        srcs = srcs,
+        # Append well-known proto dep. As far as I know this is the only way
+        # for xla_proto_library to access google.protobuf.{Any,Duration,...}.
+        deps = deps + ["@protobuf_archive//:cc_wkt_protos"],
+        cc_libs = if_static(
+            ["@protobuf_archive//:protobuf"],
+            otherwise = ["@protobuf_archive//:protobuf_headers"],
+        ),
+        protoc = "@protobuf_archive//:protoc",
+        testonly = testonly,
+        visibility = visibility,
+        **kwargs
+    )
 
-def xla_py_grpc_library(**kwargs):
-  # Note: we don't currently define any special targets for Python GRPC in OSS.
-  _ignore = kwargs
-  pass
+def xla_py_proto_library(**kwargs):
+    # Note: we don't currently define a proto library target for Python in OSS.
+    _ignore = kwargs
+    pass
 
+def xla_py_grpc_library(**kwargs):
+    # Note: we don't currently define any special targets for Python GRPC in OSS.
+    _ignore = kwargs
+    pass
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
+
+# We link the GPU plugin into the XLA Python extension if CUDA is enabled.
+def xla_python_default_plugins():
+    return if_cuda_is_configured(["//tensorflow/compiler/xla/service:gpu_plugin"])
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index e2d7b6ef4666c533951960fd3dcf6869ec2b52c5..6155f3698ebaa1f4ebdd4e5eeb062181169de71c 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -15,11 +15,11 @@ limitations under the License.
 
 syntax = "proto3";
 
-import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
-
 package xla;
 
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
 // Options for the HLO insert-reduce-precision-operations pass.
 message HloReducePrecisionOptions {
   // Where and when the reduce-precision operations will be added.
@@ -61,41 +61,12 @@ message HloReducePrecisionOptions {
 // Debugging options for XLA. These options may change at any time - there are
 // no guarantees about backward or forward compatibility for these fields.
 message DebugOptions {
-  // HLO modules matching this regex will be dumped to a .dot file throughout
-  // various stages in compilation (file names are LOG(INFO)'d). Set to ".*" to
-  // dump *all* HLO modules.
-  string xla_generate_hlo_graph = 1;
-
   // Show addresses of HLO ops in graph dump.
   bool xla_hlo_graph_addresses = 2;
 
-  // Path to dump HLO graphs to.
-  string xla_hlo_graph_path = 4;
-
-  // Dump HLO graphs as TensorFlow GraphDefs.
-  bool xla_hlo_dump_as_graphdef = 5;
-
-  // HLO modules matching this regex will be dumped to LOG(INFO). Set to ".*" to
-  // dump *all* HLO modules.
-  string xla_log_hlo_text = 6;
-
-  // Dump all HLO modules as text into the provided directory path.
-  string xla_generate_hlo_text_to = 7;
-
-  // Dump Hlo after all hlo passes are executed as proto binary into this
-  // directory.
-  string xla_dump_optimized_hlo_proto_to = 8;
-
   // Instrument the computation to collect per-HLO cycle counts.
   bool xla_hlo_profile = 9;
 
-  // Dumps computations that XLA executes into the provided directory path.
-  string xla_dump_computations_to = 10;
-
-  // Dumps parameters and results of computations that XLA executes into the
-  // provided directory path.
-  string xla_dump_executions_to = 11;
-
   // List of HLO passes to disable. These names must exactly match the pass
   // names as specified by the HloPassInterface::name() method.
   repeated string xla_disable_hlo_passes = 30;
@@ -115,9 +86,6 @@ message DebugOptions {
   // Embed the compiler IR as a string in the executable.
   bool xla_embed_ir_in_executable = 33;
 
-  // Dump the compiler IR into this directory as individual files.
-  string xla_dump_ir_to = 34;
-
   // Eliminate implicit broadcasts when lowering user computations to HLO
   // instructions; use explicit broadcast instead.
   bool xla_eliminate_hlo_implicit_broadcast = 35;
@@ -171,22 +139,12 @@ message DebugOptions {
   // HLO graph.
   bool xla_hlo_graph_sharding_color = 92;
 
-  // Prefix the name scopes of the TF graph exports with "devX" device
-  // assignments, if available.
-  bool xla_hlo_tfgraph_device_scopes = 93;
+  reserved 93;  // Was xla_hlo_tfgraph_device_scopes
 
   // If true, the GPU backend is free to use cudnn for HLO batch normalization
   // ops.
   bool xla_gpu_use_cudnn_batchnorm = 94;
 
-  // Dump HLO before any hlo passes are executed as proto binary into this
-  // directory.
-  string xla_dump_unoptimized_hlo_proto_to = 95;
-
-  // Dump HLO after each pass as an HloProto in binary file format into this
-  // directory.
-  string xla_dump_per_pass_hlo_proto_to = 96;
-
   // Generate calls to MKL-DNN in the CPU backend.
   bool xla_cpu_use_mkl_dnn = 97;
 
@@ -198,10 +156,21 @@ message DebugOptions {
   //
   //  - Reducing the precision of operations (e.g. using an approximate sin
   //    function, or transforming x/y into x * (1/y)).
-  //  - Assuming that operations never produce or consume NaN or +/- Inf.
+  //  - Assuming that operations never produce or consume NaN or +/- Inf (this
+  //    behavior can be adjusted using xla_cpu_fast_math_allow_{nans|infs}).
   //  - Assuming that +0 and -0 are indistinguishable.
   bool xla_cpu_enable_fast_math = 99;
 
+  // When xla_cpu_enable_fast_math is true then this controls whether we allow
+  // operations to produce NaNs.  Ignored when xla_cpu_enable_fast_math is
+  // false.
+  bool xla_cpu_fast_math_honor_nans = 120;
+
+  // When xla_cpu_enable_fast_math is true then this controls whether we allow
+  // operations to produce infinites.  Ignored when xla_cpu_enable_fast_math is
+  // false.
+  bool xla_cpu_fast_math_honor_infs = 121;
+
   // When true we lower the Minimum and Maximum hlos in the GPU backend such
   // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if flag
   // this is true we don't propagate NaNs through Min and Max.
@@ -224,9 +193,6 @@ message DebugOptions {
   // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
   bool xla_gpu_disable_ptxas_optimizations = 103;
 
-  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
-  bool xla_hlo_dump_as_html = 105;
-
   // Enable fast math with eigen in the HLO evaluator.
   bool xla_hlo_evaluator_use_fast_path = 106;
 
@@ -234,11 +200,75 @@ message DebugOptions {
   // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
   bool xla_allow_scalar_index_dynamic_ops = 107;
 
-  // Next id: 108
+  enum StepMarkerLocation {
+    // Generate step mark at each iteration of top level while loop, which
+    // is assumed to be a training loop. This is the default.
+    STEP_MARK_AT_ENTRY = 0;
+    // Generate step mark at program entry. This handles the case where each
+    // step are done by one or multiple programs execution. Only the first
+    // program will be tagged for generating step mark at program entry.
+    STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
+    // No step mark.
+    STEP_MARK_NONE = 2;
+  }
+  // Option to emit a target-specific marker to indicate the start of a training
+  // step. The location of the marker (if any) is determined by the option
+  // value.
+  StepMarkerLocation xla_step_marker_location = 108;
+
+  //
+  // BEGIN flags controlling dumping HLO modules for debugging.
+  //
+  // When dumping is enabled, HLO modules dumped at the very beginning and end
+  // of compilation, and optionally also during the pass pipeline.
+  //
+  // In general, if you set one of these flags, we will try to infer reasonable
+  // defaults for the others.  For example:
+  //
+  //  * Setting --xla_dump_to=/tmp/foo without specifying a format
+  //    with --xla_dump_hlo_as_* will turn on --xla_dump_hlo_as_text.
+  //
+  //  * Setting --xla_dump_hlo_as_text without specifying --xla_dump_to will
+  //    dump to stdout.
+  //
+
+  // Directory to dump into.
+  string xla_dump_to = 109;
+
+  // If specified, will only dump modules which match this regexp.
+  string xla_dump_hlo_module_re = 110;
+
+  // If this flag is specified, will also HLO before and after passes that match
+  // this regular expression.  Set to .* to dump before/after all passes.
+  string xla_dump_hlo_pass_re = 111;
+
+  // Specifies the format that HLO is dumped in.  Multiple of these may be
+  // specified.
+  bool xla_dump_hlo_as_text = 112;
+  bool xla_dump_hlo_as_proto = 113;
+  bool xla_dump_hlo_as_dot = 114;
+  bool xla_dump_hlo_as_url = 115;
+
+  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
+  bool xla_dump_hlo_as_html = 116;
+
+  // If true, every time an HLO module is run, we will dump an HloSnapshot
+  // (essentially, a serialized module plus its inputs) to the --xla_dump_to
+  // directory.
+  bool xla_dump_hlo_snapshots = 118;
+
+  //
+  // END flags controlling dumping HLO modules.
+  //
+
+  // Next id: 121
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
+
+  reserved 117;  // was xla_dump_to
+  reserved 5;    // Was xla_hlo_dump_as_graphdef
 }
 
 // These settings control how XLA compiles and/or runs code.  Not all settings
@@ -265,6 +295,14 @@ message ExecutionOptions {
   // computation on. The computation will be partitioned across these devices.
   // If not provided, the default device will be chosen.
   repeated DeviceHandle device_handles = 5;
+
+  // Number of replicas of the computation to run. If zero, uses the default
+  // number of replicas for the XLA service.
+  int32 num_replicas = 6;
+
+  // This optional field specifies the device assignment if known at compile
+  // time.
+  DeviceAssignmentProto device_assignment = 7;
 }
 
 message GetDeviceHandlesRequest {
@@ -302,8 +340,7 @@ message TransferToInfeedRequest {
   DeviceHandle device_handle = 3;
 }
 
-message TransferToInfeedResponse {
-}
+message TransferToInfeedResponse {}
 
 message TransferFromOutfeedRequest {
   // This optional field directs the service to return the literal in this
@@ -322,8 +359,7 @@ message ResetDeviceRequest {
   DeviceHandle device_handle = 1;
 }
 
-message ResetDeviceResponse {
-}
+message ResetDeviceResponse {}
 
 message ComputationGraphStatsRequest {
   HloModuleProto computation = 1;
@@ -346,8 +382,7 @@ message UnregisterRequest {
   repeated GlobalDataHandle data = 1;
 }
 
-message UnregisterResponse {
-}
+message UnregisterResponse {}
 
 message CompileRequest {
   // The graph to be compiled.
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index a64e2f5df5cacca05e83f31c941c57abd5ccf4de..6e5772a7396bae1674ec4e7393ba03506c9381e4 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -16,6 +16,7 @@ limitations under the License.
 syntax = "proto3";
 
 package xla;
+
 option cc_enable_arenas = true;
 
 // Primitive types are the individual values that can be held in rectangular
@@ -55,7 +56,7 @@ enum PrimitiveType {
   F64 = 12;
 
   // Complex values of fixed width.
-  C64 = 15;  // Paired F32 (real, imag), as in std::complex<float>.
+  C64 = 15;   // Paired F32 (real, imag), as in std::complex<float>.
   C128 = 18;  // Paired F64 (real, imag), as in std::complex<double>.
 
   // A tuple is a polymorphic sequence; e.g. a shape that holds different
@@ -199,7 +200,7 @@ message ShapeProto {
   // in this field represents an upper bound on the size of the dimension.
   repeated int64 dimensions = 3;
 
-  // For tuples only, the shapes of constitutent shapes in the tuple sequence.
+  // For tuples only, the shapes of constituent shapes in the tuple sequence.
   repeated ShapeProto tuple_shapes = 4;
 
   // The layout used to back this shape.
@@ -367,7 +368,7 @@ message LiteralProto {
   repeated uint64 u64s = 7;
   repeated float f32s = 8;
   repeated double f64s = 9;
-  repeated float c64s = 12;  // Stored as interleaved real, imag floats.
+  repeated float c64s = 12;    // Stored as interleaved real, imag floats.
   repeated double c128s = 18;  // Stored as interleaved real, imag doubles.
   repeated LiteralProto tuple_literals = 10;
   // The F16s, BF16s, U16s and S16s are encoded in little endian byte order
@@ -510,7 +511,7 @@ message ConvolutionDimensionNumbers {
   repeated int64 output_spatial_dimensions = 12;
 
   // Next = 13
-};
+}
 
 enum FftType {
   FFT = 0;    // Forward FFT; complex in, complex out.
@@ -529,7 +530,7 @@ message DotDimensionNumbers {
   repeated int64 lhs_batch_dimensions = 3;
   // The dimension numbers that represent the 'rhs' batch dimensions.
   repeated int64 rhs_batch_dimensions = 4;
-};
+}
 
 enum RandomDistribution {
   RNG_INVALID = 0;
@@ -545,6 +546,32 @@ enum RandomDistribution {
   // Next: 4
 }
 
+message TriangularSolveOptions {
+  // If true, solves ax = b. If false, solves xa = b.
+  bool left_side = 1;
+
+  // If true, 'a' is lower triangular. If false, 'a' is upper triangular.
+  bool lower = 2;
+
+  // If true, the diagonal elements of 'a' are assumed to be 1 and not accessed.
+  bool unit_diagonal = 3;
+
+  // Should we transpose or use the adjoint of 'a'?
+  enum Transpose {
+    TRANSPOSE_INVALID = 0;
+    NO_TRANSPOSE = 1;  // Don't transpose 'a'.
+    TRANSPOSE = 2;     // Transpose 'a'.
+    ADJOINT = 3;       // Complex conjugate and transpose 'a'.
+  };
+  Transpose transpose_a = 4;
+}
+
+message CholeskyOptions {
+  // If true, uses the lower triangle of `a`. If false, uses the upper triangle
+  // of `a`.
+  bool lower = 1;
+}
+
 message OpSharding {
   enum Type {
     // This sharding is replicated across all devices (implies maximal,
@@ -604,3 +631,32 @@ message PrecisionConfig {
 
   // Next: 2
 }
+
+// Describes whether all data-parallelism replicas will receive the same
+// parameter data at each buffer.
+message ParameterReplication {
+  // A list of boolean values for the flattened leaf buffers. Each value
+  // indicates whether the corresponding leaf buffer is replicated.
+  //
+  // If this field is empty, it means no buffer is replicated. Otherwise, the
+  // number of elements in this field must match the number of leaf buffers in
+  // the HLO instruction's shape.
+  repeated bool replicated_at_leaf_buffers = 1;
+}
+
+// A backend-config for kWhile loops that stores the loop's trip count, if it is
+// known.
+//
+// This is useful for backends that can implement a `for i in 0..N` loop more
+// efficiently than a `while` loop.  For example, on GPUs, we can implement a
+// `for i in 0..N` loop by enqueueing the kernels for the loop body N times,
+// whereas implementing a `while` loop requires a host-device sync on each
+// iteration.
+message WhileLoopBackendConfig {
+  message KnownTripCount {
+    int64 n = 1;
+  }
+  // This indirection lets us distinguish between known-trip-count == 0 and
+  // unknown-trip-count.
+  KnownTripCount known_trip_count = 1;
+}
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 2dae746d034a1bf52e84de74dfb0c6e23aaed4d1..b2718c5c283358d98da175a8d3b21bb1f2b01c75 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -11,9 +11,15 @@ package(
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_py_library",
     "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
 )
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 xla_proto_library(
     name = "xrt_proto",
@@ -27,6 +33,12 @@ xla_proto_library(
     ],
 )
 
+tf_proto_library_py(
+    name = "xrt_proto",  # bzl adds a _py suffix
+    srcs = ["xrt.proto"],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "xrt_utils",
     srcs = [
@@ -78,6 +90,25 @@ tf_gen_op_libs(
     ],
 )
 
+tf_gen_op_wrapper_py(
+    name = "xrt_ops_wrapper_py",
+    out = "xrt_ops.py",
+    deps = [
+        ":xrt_compile_ops_op_lib",
+        ":xrt_execute_op_op_lib",
+        ":xrt_state_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "xrt_ops",
+    kernels = ["//tensorflow/compiler/xrt/kernels:xrt_ops"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xrt_ops_wrapper_py",
+    ],
+)
+
 cc_library(
     name = "xrt_server",
     visibility = ["//visibility:public"],
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index dc02fd272fd8700c7f8fa64adf7ab57c88bab706..1e325191bba828e3d5e4599f87dcf4f4d0674945 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -51,7 +51,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xrt:xrt_compile_ops_op_lib",
+        "//tensorflow/compiler/xrt:xrt_execute_op_op_lib",
         "//tensorflow/compiler/xrt:xrt_proto",
+        "//tensorflow/compiler/xrt:xrt_state_ops_op_lib",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 2ee1a6cd1aebcdbd65892b33e5044489070ab5c4..b791519c09758a4f4124c95add5351a9433ecb8f 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -68,9 +68,11 @@ class XRTCompileOp : public OpKernel {
 
 Status CompilationCacheKey(const xrt::XLAComputation& computation,
                            string* key) {
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(computation, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
+  const size_t size = computation.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(
+      SerializeToBufferDeterministic(computation, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
   *key = absl::StrCat(fingerprint);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 116c193cab65410a5a7c3058f98cc2be2cbe9e67..42ef88168af4b6f391ffc2e69ab4c4000d1cbee1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index 6a7f10652533920ba3fa48fba1d5161f7c4d4530..343f43b7159b55bad184eed2cada55c76085ffa0 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -122,6 +122,17 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<true, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle"),
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index e2c223b3dbb2311d0f42e1a36e316fd9d5f66040..6af73ecc85351a9b38ba526db076e9176d1cb2f1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -40,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -215,27 +217,29 @@ class XRTAllocateFromTensorOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &tf_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("make_tuple", &make_tuple));
+    std::vector<int64> minor_to_major;
     if (ctx->HasAttr("layouts")) {
-      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major));
     }
     OP_REQUIRES(
         ctx, tf_shapes_.size() == dtypes_.size(),
         errors::InvalidArgument("shapes and dtypes must be the same length"));
     std::vector<xla::Shape> xla_shapes;
+    xla_shapes.reserve(tf_shapes_.size());
     for (int i = 0; i < tf_shapes_.size(); i++) {
       xla::Shape xla_shape;
       OP_REQUIRES_OK(
           ctx, TensorShapeToXLAShape(dtypes_[i], tf_shapes_[i], &xla_shape));
-      xla_shapes.push_back(xla_shape);
+      xla_shapes.push_back(std::move(xla_shape));
     }
     if (xla_shapes.size() > 1 || make_tuple) {
       shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes);
     } else {
       shape_.Swap(&xla_shapes.front());
     }
-    if (!minor_to_major_.empty()) {
+    if (!minor_to_major.empty()) {
       xla::Shape shape_with_layouts;
-      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major_,
+      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major,
                                              /*layout_func=*/nullptr,
                                              &shape_with_layouts));
       shape_.Swap(&shape_with_layouts);
@@ -304,7 +308,6 @@ class XRTAllocateFromTensorOp : public OpKernel {
  private:
   std::vector<TensorShape> tf_shapes_;
   DataTypeVector dtypes_;
-  std::vector<int64> minor_to_major_;
   xla::Shape shape_;
 };
 
@@ -487,7 +490,7 @@ class XRTReadLiteralOp : public OpKernel {
     OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
                             ctx, allocation->device_ordinal(), &device_ref));
 
-    xla::Literal literal;
+    xla::Literal literal(allocation->on_host_shape());
     OP_REQUIRES_OK(
         ctx, allocation->ToLiteral(device_ref.backend(),
                                    device_ref.device_ordinal(), &literal));
@@ -499,6 +502,96 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that reads a device-resident tuple to host memory and returns it as a
+// literal.
+template <class DeviceAccessor>
+class XRTReadToTensorOp : public OpKernel {
+ public:
+  explicit XRTReadToTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("release_handles", &discard_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
+  }
+  ~XRTReadToTensorOp() override = default;
+  XRTReadToTensorOp(const XRTReadToTensorOp&) = delete;
+  XRTReadToTensorOp& operator=(const XRTReadToTensorOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTReadToTensorOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not
+    // just scalars.)
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+
+    if (discard_) {
+      VLOG(2) << "Releasing handle " << allocation_handle;
+      OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(
+                              rm, allocation_handle));
+    }
+
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+
+    xla::Shape shape = allocation->on_host_shape();
+    int output = 0;
+    Status status = xla::ShapeUtil::ForEachMutableSubshapeWithStatus(
+        &shape,
+        [&](xla::Shape* subshape, const xla::ShapeIndex& index) -> Status {
+          if (subshape->IsTuple()) return Status::OK();
+
+          xla::PrimitiveType xla_type;
+          TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(
+              ctx->expected_output_dtype(output), &xla_type));
+          if (xla_type != subshape->element_type()) {
+            return errors::InvalidArgument(
+                "Type mismatch between buffer type (", subshape->ToString(),
+                ") and tensor type (",
+                DataTypeString(ctx->expected_output_dtype(output)),
+                ") for output tensor ", output);
+          }
+
+          TensorShape output_shape;
+          TF_RETURN_IF_ERROR(XLAShapeToTensorShape(*subshape, &output_shape));
+
+          Tensor* output_tensor;
+          TF_RETURN_IF_ERROR(
+              ctx->allocate_output(output, output_shape, &output_tensor));
+
+          XRTTupleAllocation* sub;
+          TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+              allocation, index, &sub, /*alias_parent_allocation=*/true));
+          core::ScopedUnref sub_unref(sub);
+
+          xla::MutableBorrowingLiteral literal;
+          TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral(
+              xla::LayoutUtil::GetWithDefaultLayout(*subshape), output_tensor,
+              &literal));
+          TF_RETURN_IF_ERROR(sub->ToLiteral(
+              device_ref.backend(), device_ref.device_ordinal(), &literal));
+
+          ++output;
+          return Status::OK();
+        });
+    OP_REQUIRES_OK(ctx, status);
+  }
+  bool discard_;
+  DataTypeVector dtypes_;
+};
+
 // Op that writes a new literal value into device-resident memory.
 template <class DeviceAccessor>
 class XRTWriteLiteralOp : public OpKernel {
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 2e743fec4963a52ee1abf64525f26e3d89479670..87546fce4e4e7e38ef934d32ff95a60a4ad5492a 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -53,7 +53,7 @@ The shapes can differ from the corresponding input one, as long as the total
 number of elements matches. In other words, it is possible to feed an input
 tensor with shape {8} and have a corresponding shape {2,2,2}.
 layouts: A vector holding the requested layout in minor-to-major sequence.
-If empty, the default layout wil be used.
+If empty, the default layout will be used.
 For a tuple, the layouts vector holds a linearized minor-to-major numbers
 for all the tuple leaves, in the order they appear within the tuple.
 The elements within the layouts sequence corresponding to a given tuple
@@ -151,6 +151,27 @@ releases the handle.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTReadToTensor")
+    .Input("handles: int64")
+    .Attr("release_handles: bool = False")
+    .Attr("dtypes: list(type)")
+    .Output("tensors: dtypes")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .Doc(
+        R"(
+Copies allocated values from device memory and returns them as zero or more
+Tensors. If a handle refers to a non-tuple buffer, a single tensor is returned.
+In general, the tensors returned for a handle correspond to an in-order traversal
+of a the tuple-tree value referenced by the handle.
+
+'handles' contains ids returned from Ops that produced on-device allocations.
+At present, only a single (scalar) handle is supported.
+'dtypes' are the expected types for each `Tensor` to be returned. If the
+expected and actual tensor types do not match, an error is returned.
+'release_handles': if True, `handles` are released.
+'tensors' are the output Tensors.
+)");
+
 REGISTER_OP("XRTReleaseAllocationHandle")
     .Input("handle: int64")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 1e2a9584f88b73d7c92a929e93af60376a59170b..1b3bcbea4c1228944a6604fc923228024e74d700 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -221,7 +220,7 @@ XRTTupleAllocation::~XRTTupleAllocation() {
 }
 
 Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
-                                     xla::Literal* literal) {
+                                     xla::MutableLiteralBase* literal) {
   auto transfer_manager = backend->transfer_manager();
   TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal));
 
@@ -235,9 +234,8 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
                                      " has been released");
     }
   }
-  TF_ASSIGN_OR_RETURN(*literal, transfer_manager->TransferLiteralFromDevice(
-                                    stream.get(), shaped_buffer));
-  return Status::OK();
+  return transfer_manager->TransferLiteralFromDevice(stream.get(),
+                                                     shaped_buffer, *literal);
 }
 
 Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index ddf2656e6f51775024a6d1cd0d7a387605faae6f..6519da30d02e41da5a862cadd2133bd8dd8b42d7 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -147,7 +147,7 @@ class XRTTupleAllocation : public ResourceBase {
 
   // Copies the allocation from device to host and returns it in literal.
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
-                   xla::Literal* literal);
+                   xla::MutableLiteralBase* literal);
 
   // Write a new literal value to the allocation.
   Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc
index 3ef8bedc7324696cd255c72a851f0f2410e03848..8b7749b1919710296bb5b5ec2f7cb43b189830d2 100644
--- a/tensorflow/compiler/xrt/xrt_util.cc
+++ b/tensorflow/compiler/xrt/xrt_util.cc
@@ -55,21 +55,14 @@ xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) {
     return ref_options;
   }
   xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
-  options.set_xla_generate_hlo_text_to(
-      SafeDebugPath(ref_options.xla_generate_hlo_text_to()));
-  options.set_xla_dump_optimized_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_optimized_hlo_proto_to()));
-  options.set_xla_dump_computations_to(
-      SafeDebugPath(ref_options.xla_dump_computations_to()));
-  options.set_xla_dump_executions_to(
-      SafeDebugPath(ref_options.xla_dump_executions_to()));
+  options.set_xla_dump_to(SafeDebugPath(ref_options.xla_dump_to()));
+  options.set_xla_dump_hlo_as_proto(ref_options.xla_dump_hlo_as_proto());
+  options.set_xla_dump_hlo_as_text(ref_options.xla_dump_hlo_as_text());
+  options.set_xla_dump_hlo_snapshots(ref_options.xla_dump_hlo_snapshots());
+  options.set_xla_dump_hlo_pass_re(ref_options.xla_dump_hlo_pass_re());
   for (auto& pass : ref_options.xla_disable_hlo_passes()) {
     options.add_xla_disable_hlo_passes(pass);
   }
-  options.set_xla_dump_unoptimized_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_unoptimized_hlo_proto_to()));
-  options.set_xla_dump_per_pass_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_per_pass_hlo_proto_to()));
   return options;
 }
 
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index a4c3d9623adfe3133af0c6ea055586b9544e659b..fb8dc070ad2437f7ece9dd5037089f972b988a38 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -59,6 +59,7 @@ py_library(
         "//tensorflow/contrib/labeled_tensor",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
+        "//tensorflow/contrib/learn:head_test_lib",
         "//tensorflow/contrib/legacy_seq2seq:seq2seq_py",
         "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
@@ -218,7 +219,6 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:stats_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
-        "//tensorflow/contrib/tpu:all_ops",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
@@ -238,7 +238,7 @@ cc_library(
             "//tensorflow/contrib/kinesis:dataset_ops_op_lib",
         ],
     }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
+        "//tensorflow/compiler/tf2tensorrt:trt_op_libs",
     ]) + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index f0b1c92cf7e4b760381da38febd9682ce2a4f27c..5608e7ddafa25757484d8c845c8c84a5691e143c 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -73,8 +73,7 @@ cc_binary(
         "-z defs",
         "-s",
         "-Wl,--gc-sections",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ]),
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 648f3ebb05646a66144bcb118347cbc391909409..5174afe0a63d37e3ea3e19ac9bab644d1d83ecf1 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -37,6 +37,7 @@ py_library(
 cc_library(
     name = "batch_ops_kernels",
     deps = [
+        "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core/kernels:batch_kernels",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 6138d7912601344ef7422fd50fb35c8401fd2e63..c6e1bc22baa4fe26621a0d31c9a7df64dfbe62fe 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
-
 namespace {
 
 class BigtableClientOp : public OpKernel {
@@ -277,9 +276,7 @@ class ToBigtableOp : public AsyncOpKernel {
             LOG(ERROR) << "Failure applying mutation on row ("
                        << failure.original_index()
                        << "): " << failure.mutation().row_key()
-                       << " - error: " << failure.status().error_message()
-                       << " (Details: " << failure.status().error_details()
-                       << ").";
+                       << " - error: " << failure.status().message() << ".";
           }
         }
         OP_REQUIRES_ASYNC(
@@ -341,8 +338,8 @@ class ToBigtableOp : public AsyncOpKernel {
   }
 
   template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
+  Status ParseScalarArgument(OpKernelContext* ctx, StringPiece argument_name,
+                             T* output) {
     const Tensor* argument_t;
     TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
     if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
@@ -360,5 +357,4 @@ REGISTER_KERNEL_BUILDER(Name("DatasetToBigtable").Device(DEVICE_CPU),
 
 }  // namespace
 }  // namespace data
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
index 98f906408c230a4382ffafe412ee9990d4384930..3a46e6e85d5dd12ed0adcec259e30d493add6232 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
@@ -27,11 +27,62 @@ Status GrpcStatusToTfStatus(const ::grpc::Status& status) {
       status.error_code() == ::grpc::StatusCode::OUT_OF_RANGE) {
     grpc_code = ::grpc::StatusCode::INTERNAL;
   }
-  return Status(static_cast<::tensorflow::error::Code>(status.error_code()),
+  return Status(static_cast<::tensorflow::error::Code>(grpc_code),
                 strings::StrCat("Error reading from Cloud Bigtable: ",
                                 status.error_message()));
 }
 
+namespace {
+::tensorflow::error::Code GcpErrorCodeToTfErrorCode(
+    ::google::cloud::StatusCode code) {
+  switch (code) {
+    case ::google::cloud::StatusCode::kOk:
+      return ::tensorflow::error::OK;
+    case ::google::cloud::StatusCode::kCancelled:
+      return ::tensorflow::error::CANCELLED;
+    case ::google::cloud::StatusCode::kUnknown:
+      return ::tensorflow::error::UNKNOWN;
+    case ::google::cloud::StatusCode::kInvalidArgument:
+      return ::tensorflow::error::INVALID_ARGUMENT;
+    case ::google::cloud::StatusCode::kDeadlineExceeded:
+      return ::tensorflow::error::DEADLINE_EXCEEDED;
+    case ::google::cloud::StatusCode::kNotFound:
+      return ::tensorflow::error::NOT_FOUND;
+    case ::google::cloud::StatusCode::kAlreadyExists:
+      return ::tensorflow::error::ALREADY_EXISTS;
+    case ::google::cloud::StatusCode::kPermissionDenied:
+      return ::tensorflow::error::PERMISSION_DENIED;
+    case ::google::cloud::StatusCode::kUnauthenticated:
+      return ::tensorflow::error::UNAUTHENTICATED;
+    case ::google::cloud::StatusCode::kResourceExhausted:
+      return ::tensorflow::error::RESOURCE_EXHAUSTED;
+    case ::google::cloud::StatusCode::kFailedPrecondition:
+      return ::tensorflow::error::FAILED_PRECONDITION;
+    case ::google::cloud::StatusCode::kAborted:
+      return ::tensorflow::error::ABORTED;
+    case ::google::cloud::StatusCode::kOutOfRange:
+      return ::tensorflow::error::OUT_OF_RANGE;
+    case ::google::cloud::StatusCode::kUnimplemented:
+      return ::tensorflow::error::UNIMPLEMENTED;
+    case ::google::cloud::StatusCode::kInternal:
+      return ::tensorflow::error::INTERNAL;
+    case ::google::cloud::StatusCode::kUnavailable:
+      return ::tensorflow::error::UNAVAILABLE;
+    case ::google::cloud::StatusCode::kDataLoss:
+      return ::tensorflow::error::DATA_LOSS;
+  }
+}
+}  // namespace
+
+Status GcpStatusToTfStatus(const ::google::cloud::Status& status) {
+  if (status.ok()) {
+    return Status::OK();
+  }
+  return Status(
+      GcpErrorCodeToTfErrorCode(status.code()),
+      strings::StrCat("Error reading from Cloud Bigtable: ", status.message()));
+}
+
 string RegexFromStringSet(const std::vector<string>& strs) {
   CHECK(!strs.empty()) << "The list of strings to turn into a regex was empty.";
   std::unordered_set<string> uniq(strs.begin(), strs.end());
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
index e3b4535bac4a01a1277290e0d1ea6d3c7613731c..f6aa67fb0b5dc95c510bbf799f4f47496d59b00e 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
@@ -26,6 +26,7 @@ limitations under the License.
 namespace tensorflow {
 
 Status GrpcStatusToTfStatus(const ::grpc::Status& status);
+Status GcpStatusToTfStatus(const ::google::cloud::Status& status);
 
 string RegexFromStringSet(const std::vector<string>& strs);
 
@@ -89,22 +90,21 @@ class BigtableReaderDatasetIterator : public DatasetIterator<Dataset> {
  public:
   explicit BigtableReaderDatasetIterator(
       const typename DatasetIterator<Dataset>::Params& params)
-      : DatasetIterator<Dataset>(params), iterator_(nullptr, false) {}
+      : DatasetIterator<Dataset>(params) {}
 
   Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                          bool* end_of_sequence) override {
     mutex_lock l(mu_);
     TF_RETURN_IF_ERROR(EnsureIteratorInitialized());
     if (iterator_ == reader_->end()) {
-      grpc::Status status = reader_->Finish();
-      if (status.ok()) {
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-      return GrpcStatusToTfStatus(status);
+      *end_of_sequence = true;
+      return Status::OK();
+    }
+    if (!*iterator_) {
+      return GcpStatusToTfStatus(iterator_->status());
     }
     *end_of_sequence = false;
-    google::cloud::bigtable::Row& row = *iterator_;
+    google::cloud::bigtable::Row& row = **iterator_;
     Status s = ParseRow(ctx, row, out_tensors);
     // Ensure we always advance.
     ++iterator_;
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
index 2c6317157d25908c1ff66fc10bd188d93f040521..22b711a73d671a6609c45a55e9f6b13e2894b49d 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc
@@ -152,11 +152,11 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel {
         }
         if (input_tensors[0].NumElements() == 1) {
           // Single key lookup.
-          ::grpc::Status status;
+          ::google::cloud::Status status;
           auto pair = dataset()->table_->table().ReadRow(
               input_tensors[0].scalar<string>()(), dataset()->filter_, status);
           if (!status.ok()) {
-            return GrpcStatusToTfStatus(status);
+            return GcpStatusToTfStatus(status);
           }
           if (!pair.first) {
             return errors::DataLoss("Row key '",
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index e6fda9e61757f1441b3691c2a3d57c6f1a5a0d42..d9fce6e09f47ab05074f0b4c03dd8e672ed3d2ce 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -335,6 +335,17 @@ grpc::Status BigtableTestClient::ReadModifyWriteRow(
   return grpc::Status(grpc::StatusCode::UNIMPLEMENTED,
                       "ReadModifyWriteRow not implemented.");
 }
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::ReadModifyWriteRowResponse>>
+BigtableTestClient::AsyncReadModifyWriteRow(
+    grpc::ClientContext* context,
+    google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to AsyncReadModifyWriteRow:" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::unique_ptr<
     grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
 BigtableTestClient::ReadRows(
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index 8e1326f2ce841368ea81fc7194a0588e5d6cd637..63d59b32dd17a2f58d3413932b69f4d704c84e48 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -46,6 +46,13 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
       google::bigtable::v2::ReadModifyWriteRowRequest const& request,
       google::bigtable::v2::ReadModifyWriteRowResponse* response) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::ReadModifyWriteRowResponse>>
+  AsyncReadModifyWriteRow(
+      grpc::ClientContext* context,
+      google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+      grpc::CompletionQueue* cq) override;
+
   std::unique_ptr<
       grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
   ReadRows(grpc::ClientContext* context,
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client_test.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client_test.cc
index 32611e2590d9a81f46d0b9dfc09fe7e0068e9671..cf6e619bfaf25101b7fea7ce59a31f7a688c0452 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client_test.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client_test.cc
@@ -39,7 +39,6 @@ TEST(BigtableTestClientTest, EmptyRowRead) {
       ::google::cloud::bigtable::Filter::Latest(1));
   auto rows = table.ReadRows(std::move(rowset), filter);
   EXPECT_EQ(rows.begin(), rows.end()) << "Some rows were returned in response!";
-  EXPECT_TRUE(rows.Finish().ok()) << "Error reading rows.";
 }
 
 TEST(BigtableTestClientTest, SingleRowWriteAndRead) {
@@ -55,15 +54,15 @@ TEST(BigtableTestClientTest, SingleRowWriteAndRead) {
   auto rows = table.ReadRows(std::move(rowset), filter);
   auto itr = rows.begin();
   EXPECT_NE(itr, rows.end()) << "No rows were returned in response!";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v1");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v1");
 
   ++itr;
   EXPECT_EQ(itr, rows.end());
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, MultiRowWriteAndSingleRowRead) {
@@ -82,15 +81,15 @@ TEST(BigtableTestClientTest, MultiRowWriteAndSingleRowRead) {
   auto itr = rows.begin();
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v1");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v1");
 
   ++itr;
   EXPECT_EQ(itr, rows.end()) << "Extra rows in the response.";
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, MultiRowWriteAndRead) {
@@ -109,33 +108,35 @@ TEST(BigtableTestClientTest, MultiRowWriteAndRead) {
   auto itr = rows.begin();
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v1");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v1");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r2");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v2");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r2");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v2");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r3");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v3");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r3");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v3");
 
   ++itr;
   EXPECT_EQ(itr, rows.end()) << "Extra rows in the response.";
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, MultiRowWriteAndPrefixRead) {
@@ -154,33 +155,35 @@ TEST(BigtableTestClientTest, MultiRowWriteAndPrefixRead) {
   auto itr = rows.begin();
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v1");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v1");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r2");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v2");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r2");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v2");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r3");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v3");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r3");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v3");
 
   ++itr;
   EXPECT_EQ(itr, rows.end()) << "Extra rows in the response.";
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, ColumnFiltering) {
@@ -206,33 +209,35 @@ TEST(BigtableTestClientTest, ColumnFiltering) {
   auto itr = rows.begin();
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v1");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v1");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r2");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v2");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r2");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v2");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r3");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "v3");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r3");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "v3");
 
   ++itr;
   EXPECT_EQ(itr, rows.end()) << "Extra rows in the response.";
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, RowKeys) {
@@ -257,33 +262,35 @@ TEST(BigtableTestClientTest, RowKeys) {
       table.ReadRows(::google::cloud::bigtable::RowRange::Prefix("r"), filter);
   auto itr = rows.begin();
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r1");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r1");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r2");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r2");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "");
 
   ++itr;
 
   EXPECT_NE(itr, rows.end()) << "Missing rows";
-  EXPECT_EQ(itr->row_key(), "r3");
-  EXPECT_EQ(itr->cells().size(), 1);
-  EXPECT_EQ(itr->cells()[0].family_name(), "f1");
-  EXPECT_EQ(itr->cells()[0].column_qualifier(), "c1");
-  EXPECT_EQ(itr->cells()[0].value(), "");
+  EXPECT_TRUE(*itr) << "Error reading row: " << itr->status().message();
+  EXPECT_EQ((*itr)->row_key(), "r3");
+  EXPECT_EQ((*itr)->cells().size(), 1);
+  EXPECT_EQ((*itr)->cells()[0].family_name(), "f1");
+  EXPECT_EQ((*itr)->cells()[0].column_qualifier(), "c1");
+  EXPECT_EQ((*itr)->cells()[0].value(), "");
 
   ++itr;
   EXPECT_EQ(itr, rows.end()) << "Extra rows in the response.";
-  EXPECT_TRUE(rows.Finish().ok());
 }
 
 TEST(BigtableTestClientTest, SampleKeys) {
diff --git a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
index 416b719e30aa5f2504449d151a48e95c9105c68b..39c2a2e775d5d5287b137bf33eef66251738e6d3 100644
--- a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
+++ b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
@@ -59,7 +59,7 @@ REGISTER_OP("BigtablePrefixKeyDataset")
     .Input("table: resource")
     .Input("prefix: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -68,14 +68,14 @@ REGISTER_OP("BigtableRangeKeyDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BigtableSampleKeysDataset")
     .Input("table: resource")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -85,7 +85,7 @@ REGISTER_OP("BigtableSampleKeyPairsDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -100,7 +100,7 @@ REGISTER_OP("BigtableScanDataset")
     .Input("columns: string")
     .Input("probability: float")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index fa64055dfd65a134afdf46cebccb7f7d96106502..736cf3da49e934d49d0587d729cff6eaaed8f254 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -475,15 +475,17 @@ class BigtableTable(object):
     """
     if timestamp is None:
       timestamp = -1  # Bigtable server provided timestamp.
-    for tensor_type in nest.flatten(dataset.output_types):
+    for tensor_type in nest.flatten(
+        dataset_ops.get_legacy_output_types(dataset)):
       if tensor_type != dtypes.string:
         raise ValueError("Not all elements of the dataset were `tf.string`")
-    for shape in nest.flatten(dataset.output_shapes):
+    for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)):
       if not shape.is_compatible_with(tensor_shape.scalar()):
         raise ValueError("Not all elements of the dataset were scalars")
     if len(column_families) != len(columns):
       raise ValueError("len(column_families) != len(columns)")
-    if len(nest.flatten(dataset.output_types)) != len(columns) + 1:
+    if len(nest.flatten(
+        dataset_ops.get_legacy_output_types(dataset))) != len(columns) + 1:
       raise ValueError("A column name must be specified for every component of "
                        "the dataset elements. (e.g.: len(columns) != "
                        "len(dataset.output_types))")
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index d3b23d949ee2c7674c3918d39e8b71d76eefcfec..64e4c4560ba3a1b177db12a09997ff7afe8775a3 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -193,8 +193,9 @@ py_test(
 
 py_test(
     name = "estimator_test",
-    size = "large",
+    size = "medium",
     srcs = ["estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index a178820841c4c8bcb7f5742babdb6d0f4825de31..5ffbb9067081d7440ab5e11290697b822051bee5 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -84,12 +84,10 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -179,8 +177,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         `[batch_size, label_dimension]`).
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
-      label_name: String, name of the key in label dict. Can be null if label
-          is a tensor (single headed models).
+      label_name: String, name of the key in label dict. Can be null if label is
+        a tensor (single headed models).
       weight_column_name: Name of the column for weights, or None if not
         weighted.
       model_dir: Directory for model exports, etc.
@@ -195,11 +193,11 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -286,11 +284,11 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -353,10 +351,9 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -376,12 +373,10 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -417,12 +412,12 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # When using this estimator, make sure to regularize the hessian (at least l2,
 # min_node_weight)!
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -449,8 +444,8 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -469,11 +464,11 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -519,6 +514,7 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
@@ -534,10 +530,8 @@ def core_multiclass_head(
 
   def loss_fn(labels, logits):
     result = losses.per_example_maxent_loss(
-        labels=labels,
-        logits=logits,
-        weights=weight_column,
-        num_classes=n_classes)
+        # Don't pass the weights: head already multiplies by them.
+        labels=labels, logits=logits, weights=None, num_classes=n_classes)
     return result[0]
 
   # pylint:disable=protected-access
@@ -564,7 +558,8 @@ def core_quantile_regression_head(
     result = losses.per_example_quantile_regression_loss(
         labels=labels,
         predictions=logits,
-        weights=weight_column,
+        # Don't pass the weights: head already multiplies by them.
+        weights=None,
         quantile=quantiles)
     return result[0]
 
@@ -623,11 +618,11 @@ class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
 
@@ -685,10 +680,9 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -703,12 +697,10 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
 
     Raises:
@@ -748,8 +740,7 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class CoreGradientBoostedDecisionTreeQuantileRegressor(
     core_estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -775,8 +766,8 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -795,11 +786,11 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
     if len(quantiles) > 1:
@@ -814,7 +805,9 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
           params={
               'head':
                   core_quantile_regression_head(
-                      quantiles[0], label_dimension=label_dimension),
+                      quantiles[0],
+                      label_dimension=label_dimension,
+                      weight_column=weight_column_name),
               'feature_columns':
                   feature_columns,
               'learner_config':
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 47d910d42a27db4b857eeb12209dfbb429dd1be2..5a8b2ba9caf0a9813cb5b3409b8a0dc3de0a45d7 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -399,8 +399,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
   def testQuantileRegression(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -413,7 +413,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
+        num_trees=12,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -428,31 +428,12 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper >= 0.92)
     self.assertTrue(frac_below_upper <= 0.98)
 
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
   # Multi-dimensional quantile regression.
   def testQuantileRegressionMultiDimLabel(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -467,7 +448,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
         quantiles=[0.95],
         learner_config=learner_config,
         label_dimension=2,
-        num_trees=100,
+        num_trees=18,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -490,35 +471,6 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_both_below_upper >= 0.91)
     self.assertTrue(frac_both_below_upper <= 0.99)
 
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        label_dimension=2,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.91)
-    self.assertTrue(frac_both_above_lower <= 0.99)
-
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
@@ -712,11 +664,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
-  # One dimensional quantile regression.
-  def testQuantileRegression(self):
+  # Quantile regression in core is the same as in non core estimator, so we
+  # just check that it does not fail.
+  def testQuantileRegressionDoesNotThroughException(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
+    learner_config.constraints.max_tree_depth = 1
     learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -731,112 +684,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_upper.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper >= 0.92)
-    self.assertTrue(frac_below_upper <= 0.98)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
-  # Multi-dimensional quantile regression.
-  def testQuantileRegressionMultiDimLabel(self):
-    learner_config = learner_pb2.LearnerConfig()
-    learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
-    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.tree_complexity = (
-        1.0 / _QUANTILE_REGRESSION_SIZE)
-
-    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
-        two_dimension=True)
-    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
-
-    # 95% percentile.
-    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.95],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
+        num_trees=1,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
     model_upper.train(input_fn=train_input_fn, steps=1000)
     result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    count_below_upper = np.count_nonzero(upper > y, axis=0)
-    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
-    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
-    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
-    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper_0 >= 0.92)
-    self.assertTrue(frac_below_upper_0 <= 0.98)
-    self.assertTrue(frac_below_upper_1 >= 0.92)
-    self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.91)
-    self.assertTrue(frac_both_below_upper <= 0.99)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.91)
-    self.assertTrue(frac_both_above_lower <= 0.99)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index a6e422847d3914188bca9e6dff797ba1ffb06749..eecf3c5aeb6c6785cae3fd5808954a73db6190d6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_util
@@ -88,6 +89,12 @@ def model_builder(features,
 
   if config is None:
     raise ValueError("Missing estimator RunConfig.")
+  if config.session_config is not None:
+    session_config = config.session_config
+    session_config.allow_soft_placement = True
+  else:
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+  config = config.replace(session_config=session_config)
 
   center_bias = params["center_bias"]
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 6d78e27e8f69ea289b686af8402bd91967f997f4..65276242abaf96de8b1936365278b18f8bba93a9 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -538,7 +538,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
           partition_boundaries[non_empty_partitions[root_idx]];
 
       float best_gain = std::numeric_limits<float>::lowest();
-      int32 best_dimension_idx = 0;
       bool default_right = false;
       int32 best_element_idx = 0;
 
@@ -571,7 +570,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
       // Iterate through dimensions.
       for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
         const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
-        const int32 dimension_id = dimension_and_start.dimension_id;
 
         int start_index = dimension_and_start.start_index;
         // Even for the last dimension, we always have additional dummy
@@ -630,7 +628,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_left;
               best_element_idx = element_idx;
               default_right = false;
-              best_dimension_idx = dimension_id;
             }
           }
           // Consider calculating the default direction only when there were
@@ -648,7 +645,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_right;
               best_element_idx = element_idx;
               default_right = true;
-              best_dimension_idx = dimension_id;
             }
           }
         }
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
index d26af58419752170bbc58bba757ac43349fc2cff..22ad181fc3fb6b0d1b36dd4bb916e63dfb8753ce 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -193,7 +193,8 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
 
     num_minibatches = control_flow_ops.cond(
         ops.convert_to_tensor(self._loss_uses_sum_reduction),
-        lambda: math_ops.to_int64(1), lambda: num_minibatches)
+        lambda: math_ops.cast(1, dtypes.int64),
+        lambda: num_minibatches)
     partition_ids, gains, split_infos = (
         split_handler_ops.build_categorical_equality_splits(
             num_minibatches=num_minibatches,
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 0476bed2cd3f3ea5b47b10c51a819f17d6e37c74..0e6a9f8f3a0126ca9f14c9621c9f91bdbf66b338 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -312,9 +312,10 @@ def _make_dense_split(quantile_accumulator_handle, stats_accumulator_handle,
         gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
             stats_accumulator_handle, stamp_token, next_stamp_token))
   # For sum_reduction, we don't need to divide by number of minibatches.
-  num_minibatches = control_flow_ops.cond(loss_uses_sum_reduction,
-                                          lambda: math_ops.to_int64(1),
-                                          lambda: num_minibatches)
+  num_minibatches = control_flow_ops.cond(
+      loss_uses_sum_reduction,
+      lambda: math_ops.cast(1, dtypes.int64),
+      lambda: num_minibatches)
   # Put quantile and stats accumulator flushing in the dependency path.
   with ops.control_dependencies([flush_quantiles, partition_ids]):
     are_splits_ready = array_ops.identity(are_splits_ready)
@@ -488,9 +489,10 @@ def _make_sparse_split(
     num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
         gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
             stats_accumulator_handle, stamp_token, next_stamp_token))
-  num_minibatches = control_flow_ops.cond(loss_uses_sum_reduction,
-                                          lambda: math_ops.to_int64(1),
-                                          lambda: num_minibatches)
+  num_minibatches = control_flow_ops.cond(
+      loss_uses_sum_reduction,
+      lambda: math_ops.cast(1, dtypes.int64),
+      lambda: num_minibatches)
   # Put quantile and stats accumulator flushing in the dependency path.
   with ops.control_dependencies([flush_quantiles, partition_ids]):
     are_splits_ready = array_ops.identity(are_splits_ready)
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
index 42d69645acaae063fcd46bd1f6c819ccb68f48bd..aa3f24f08a0f762507df83def72e7d595265221f 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -227,7 +227,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
             tree_ensemble_config=tree_ensemble_config.SerializeToString(),
             name="restore_tree")
         resources.initialize_resources(resources.shared_resources()).run()
-        variables.initialize_all_variables().run()
+        variables.global_variables_initializer().run()
         my_saver = saver.Saver()
 
         # Add the second tree and replace the ensemble of the handle.
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index c3685b54e201f73039f6623443c67ba2b217a51e..f9945959812f030f76cb481cfcf91cba1f352fc1 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -33,7 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 ops.NotDifferentiable("TreeEnsembleVariable")
 ops.NotDifferentiable("TreeEnsembleSerialize")
@@ -96,18 +96,18 @@ class TreeEnsembleVariable(tracking.TrackableResource):
     self._init_op = None
     super(TreeEnsembleVariable, self).__init__()
 
-  def create_resource(self):
+  def _create_resource(self):
     return gen_model_ops.decision_tree_ensemble_resource_handle_op(
         self._container, shared_name=self._name, name=self._name)
 
-  def initialize(self):
+  def _initialize(self):
     return gen_model_ops.create_tree_ensemble_variable(
         self.resource_handle, self._stamp_token, self._tree_ensemble_config)
 
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 0c319cc9bd1f720eb404a9da05227c5807ec874f..82f9b17b3308d6a521c79ee7a6f48f6c3813a769 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
@@ -120,8 +120,8 @@ class QuantileAccumulator(tracking.TrackableResource):
     name = _PATTERN.sub("", name)
     with ops.name_scope(name, "QuantileAccumulator") as name:
       self._name = name
-      self._resource_handle = self.create_resource()
-      self._init_op = self.initialize()
+      self._resource_handle = self._create_resource()
+      self._init_op = self._initialize()
       is_initialized_op = self.is_initialized()
     resources.register_resource(self.resource_handle, self._init_op,
                                 is_initialized_op)
@@ -129,11 +129,11 @@ class QuantileAccumulator(tracking.TrackableResource):
                                                  self._init_op, name)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
-  def create_resource(self):
+  def _create_resource(self):
     return gen_quantile_ops.quantile_stream_resource_handle_op(
         container=self._container, shared_name=self._name, name=self._name)
 
-  def initialize(self):
+  def _initialize(self):
     return gen_quantile_ops.create_quantile_accumulator(
         self.resource_handle,
         self._init_stamp_token,
@@ -145,7 +145,7 @@ class QuantileAccumulator(tracking.TrackableResource):
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
index ad1191d41236e71008bff8c8a7fbd42c16e3f9c5..1f6bbbf5740ec3c47697ea600eef030aa257707f 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
@@ -144,8 +144,8 @@ class StatsAccumulator(tracking.TrackableResource):
       name = _PATTERN.sub("", name)
     with ops.name_scope(name, "StatsAccumulator") as name:
       self._name = name
-      self._resource_handle = self.create_resource()
-      self._init_op = self.initialize()
+      self._resource_handle = self._create_resource()
+      self._init_op = self._initialize()
       is_initialized_op = self.is_initialized()
     resources.register_resource(self.resource_handle, self.initializer,
                                 is_initialized_op)
@@ -153,7 +153,7 @@ class StatsAccumulator(tracking.TrackableResource):
         self.resource_handle, self.initializer, self._is_scalar, name)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
-  def create_resource(self):
+  def _create_resource(self):
     if self._is_scalar:
       return (
           gen_stats_accumulator_ops.stats_accumulator_scalar_resource_handle_op(
@@ -163,7 +163,7 @@ class StatsAccumulator(tracking.TrackableResource):
           gen_stats_accumulator_ops.stats_accumulator_tensor_resource_handle_op(
               self._container, self._name, name=self._name))
 
-  def initialize(self):
+  def _initialize(self):
     if self._is_scalar:
       return gen_stats_accumulator_ops.create_stats_accumulator_scalar(
           self.resource_handle, self._stamp_token)
@@ -175,7 +175,7 @@ class StatsAccumulator(tracking.TrackableResource):
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index e78ec476ab3b43e5eb56a2502008bb8020ae97e0..bca850514be943c0fad4a980092dd9ffe313d746 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -228,7 +228,7 @@ def extract_features(features, feature_columns, use_core_columns):
       indices = array_ops.concat([
           array_ops.slice(categorical_tensor.indices, [0, 0], [-1, 1]),
           array_ops.expand_dims(
-              math_ops.to_int64(categorical_tensor.values), -1)
+              math_ops.cast(categorical_tensor.values, dtypes.int64), -1)
       ], 1)
       tensor = sparse_tensor.SparseTensor(
           indices=indices, values=weight_tensor.values, dense_shape=shape)
@@ -611,8 +611,9 @@ class GradientBoostedDecisionTreeModel(object):
         learner_pb2.LearnerConfig.TREE_PER_CLASS and
         self._logits_dimension != 1):
       # Choose the class for which the tree is built (one vs rest).
-      return math_ops.to_int32(
-          predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension)
+      return math_ops.cast(
+          predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension,
+          dtypes.int32)
     return constant_op.constant(-1, dtype=dtypes.int32)
 
   def update_stats(self, loss, predictions_dict, gradients=None, hessians=None):
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 92068e88a76cb8bfdd394c1093347a8fb8a63449..61441b22e908110da60765d53a6968afdeecc502 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -43,7 +43,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keepdims=True)
+      math_ops.squared_difference(predictions, label), 1, keepdims=True)
   return loss
 
 
@@ -1149,9 +1149,9 @@ class GbdtTest(test_util.TensorFlowTestCase):
       expected_leaf_1 = [-3.4480, -3.4429, 13.8490, -3.45, -3.4508]
       expected_leaf_2 = [-1.2547, -1.3145, 1.52, 2.3875, -1.3264]
       self.assertArrayNear(expected_leaf_1,
-                           output.trees[0].nodes[1].leaf.vector.value, 1e-3)
+                           output.trees[0].nodes[1].leaf.vector.value, 3e-3)
       self.assertArrayNear(expected_leaf_2,
-                           output.trees[0].nodes[2].leaf.vector.value, 1e-3)
+                           output.trees[0].nodes[2].leaf.vector.value, 3e-3)
 
   def testTrainFnMulticlassDiagonalHessian(self):
     """Tests the GBDT train for multiclass diagonal hessian."""
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index 220e981618b7c0bfb1e4e98c087d83b451b9b3cf..40fdfcf45ac79ffcbab6ba4fbf8f9077a179b16f 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -43,7 +44,7 @@ def per_example_logistic_loss(labels, weights, predictions):
     loss: A Rank 2 (N, 1) tensor of per-example logistic loss.
     update_op: An update operation to update the loss's internal state.
   """
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
   unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
       labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
@@ -74,7 +75,7 @@ def per_example_quantile_regression_loss(labels, weights, predictions,
     loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
     update_op: An update operation to update the loss's internal state.
   """
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
   error = labels - predictions
   square_loss_right = array_ops.where(error * quantile < 1.0,
                                       math_ops.square(quantile * error),
@@ -112,7 +113,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
     loss: A Rank 2 (N, 1) tensor of per-example maxent loss
     update_op: An update operation to update the loss's internal state.
   """
-  labels = math_ops.to_int64(labels)
+  labels = math_ops.cast(labels, dtypes.int64)
   # If labels are of rank 1, make them rank 2.
   labels_shape = labels.get_shape()
   if len(labels_shape) != 2:
@@ -120,7 +121,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
   # Labels are indices of classes, convert them to one hot encodings.
   target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
   labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1])
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
 
   # Calculate softmax probabilities for each class.
   unnormalized_probs = math_ops.exp(logits)
@@ -166,7 +167,7 @@ def per_example_squared_loss(labels, weights, predictions):
     update_op: An update operation to update the loss's internal state.
   """
   unweighted_loss = math_ops.reduce_sum(
-      math_ops.square(predictions - labels), 1, keepdims=True)
+      math_ops.squared_difference(predictions, labels), 1, keepdims=True)
 
   return unweighted_loss * weights, control_flow_ops.no_op()
 
@@ -253,7 +254,7 @@ def per_example_exp_loss(labels, weights, predictions, name=None, eps=0.1):
     preds_converted = min_res
     return math_ops.exp(-preds_converted * labels_converted)
 
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
   unweighted_loss = exp_with_logits(
       name=name, eps=eps, labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
@@ -312,7 +313,7 @@ def per_example_full_exp_loss(labels, weights, predictions, name=None):
 
     return math_ops.exp(-1.0 * logits * labels_converted)
 
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
   unweighted_loss = full_exp_with_logits(
       name=name, labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 99ed4959fad9699f265183d71a1f3b609d7e6d30..a416588691f580143aa4e5ee53ca1e5cab9c42e0 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -27,7 +27,7 @@ Managing dependencies:
 @@NoDependency
 @@split_dependency
 
-Checkpointable data structures:
+Trackable data structures:
 @@List
 @@Mapping
 @@UniqueNameTracker
@@ -46,20 +46,20 @@ from __future__ import print_function
 
 from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker
 from tensorflow.contrib.checkpoint.python.python_state import NumpyState
-from tensorflow.contrib.checkpoint.python.python_state import PythonStateWrapper
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
-from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
+from tensorflow.core.protobuf.trackable_object_graph_pb2 import TrackableObjectGraph as CheckpointableObjectGraph
 from tensorflow.python.training.checkpoint_management import CheckpointManager
-from tensorflow.python.training.checkpointable.base import Checkpointable as CheckpointableBase
-from tensorflow.python.training.checkpointable.data_structures import List
-from tensorflow.python.training.checkpointable.data_structures import Mapping
-from tensorflow.python.training.checkpointable.data_structures import NoDependency
-from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
-from tensorflow.python.training.checkpointable.util import capture_dependencies
-from tensorflow.python.training.checkpointable.util import list_objects
-from tensorflow.python.training.checkpointable.util import object_metadata
-
+from tensorflow.python.training.tracking.base import Trackable as CheckpointableBase
+from tensorflow.python.training.tracking.data_structures import List
+from tensorflow.python.training.tracking.data_structures import Mapping
+from tensorflow.python.training.tracking.data_structures import NoDependency
+from tensorflow.python.training.tracking.python_state import PythonState as PythonStateWrapper
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import capture_dependencies
+from tensorflow.python.training.tracking.util import list_objects
+from tensorflow.python.training.tracking.util import object_metadata
 from tensorflow.python.util.all_util import remove_undocumented
 
 remove_undocumented(module_name=__name__)
+
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index 4e529322c7c76797938468b405cd175609dc0a73..caedf5b2d1d93dcbc40b0d07607c59597a38131a 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -12,7 +12,7 @@ py_library(
         ":python_state",
         ":split_dependency",
         ":visualize",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
@@ -22,8 +22,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
@@ -36,8 +36,8 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -47,7 +47,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -64,7 +64,7 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -76,7 +76,7 @@ py_library(
     deps = [
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:training",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -89,8 +89,8 @@ tf_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -101,8 +101,8 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -118,6 +118,6 @@ tf_py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 97936d9e9dfd5d6e62fdf8312707a276b63e1267..a25d51980ea760dfb7f323497a397fbd94fd5f23 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.training.checkpointable import base as checkpointable_lib
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base as trackable_lib
+from tensorflow.python.training.tracking import data_structures
 
 
-class UniqueNameTracker(data_structures.CheckpointableDataStructure):
-  """Adds dependencies on checkpointable objects with name hints.
+class UniqueNameTracker(data_structures.TrackableDataStructure):
+  """Adds dependencies on trackable objects with name hints.
 
   Useful for creating dependencies with locally unique names.
 
@@ -43,30 +43,30 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
 
   def __init__(self):
     super(UniqueNameTracker, self).__init__()
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     self._name_counts = {}
 
   @property
   def _values(self):
     return [dep.ref for dep in self._checkpoint_dependencies]
 
-  def track(self, checkpointable, base_name):
-    """Add a dependency on `checkpointable`.
+  def track(self, trackable, base_name):
+    """Add a dependency on `trackable`.
 
     Args:
-      checkpointable: An object to add a checkpoint dependency on.
+      trackable: An object to add a checkpoint dependency on.
       base_name: A name hint, which is uniquified to determine the dependency
         name.
     Returns:
-      `checkpointable`, for chaining.
+      `trackable`, for chaining.
     Raises:
-      ValueError: If `checkpointable` is not a checkpointable object.
+      ValueError: If `trackable` is not a trackable object.
     """
 
-    if not isinstance(checkpointable, checkpointable_lib.Checkpointable):
+    if not isinstance(trackable, trackable_lib.Trackable):
       raise ValueError(
-          ("Expected a checkpointable value, got %s which does not inherit "
-           "from CheckpointableBase.") % (checkpointable,))
+          ("Expected a trackable value, got %s which does not inherit "
+           "from tf.track.Trackable.") % (trackable,))
 
     def _format_name(prefix, number):
       if number > 0:
@@ -80,5 +80,5 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
       count += 1
       candidate = _format_name(base_name, count)
     self._name_counts[base_name] = count + 1
-    self._track_value(checkpointable, name=candidate)
-    return checkpointable
+    self._track_value(trackable, name=candidate)
+    return trackable
diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py
index a2d453ec6eb3dcf9aba4c52fe866756a92673c63..bace21939602666aa48a05d2abfe05ae6aae41e2 100644
--- a/tensorflow/contrib/checkpoint/python/containers_test.py
+++ b/tensorflow/contrib/checkpoint/python/containers_test.py
@@ -26,9 +26,9 @@ from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class UniqueNameTrackerTests(test.TestCase):
@@ -52,7 +52,7 @@ class UniqueNameTrackerTests(test.TestCase):
     save_root = util.Checkpoint(slots=slots)
     save_path = save_root.save(checkpoint_prefix)
 
-    restore_slots = tracking.AutoCheckpointable()
+    restore_slots = tracking.AutoTrackable()
     restore_root = util.Checkpoint(
         slots=restore_slots)
     status = restore_root.restore(save_path)
@@ -68,7 +68,7 @@ class UniqueNameTrackerTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testExample(self):
-    class SlotManager(tracking.AutoCheckpointable):
+    class SlotManager(tracking.AutoTrackable):
 
       def __init__(self):
         self.slotdeps = containers.UniqueNameTracker()
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
index 969c90c78871ebff02b360f8f09623df56c9c077..1ada05227ba566cd3dfbff406e8fed80dccde684 100644
--- a/tensorflow/contrib/checkpoint/python/python_state.py
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -17,13 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import functools
-import six
-
 import numpy
 
-from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import python_state as core_python_state
 
 # pylint: disable=g-import-not-at-top
 try:
@@ -34,8 +31,8 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
-class NumpyState(base.Checkpointable):
-  """A checkpointable object whose NumPy array attributes are saved/restored.
+class NumpyState(base.Trackable):
+  """A trackable object whose NumPy array attributes are saved/restored.
 
   Example usage:
 
@@ -72,7 +69,7 @@ class NumpyState(base.Checkpointable):
     """Create placeholder NumPy arrays for to-be-restored attributes.
 
     Typically `_lookup_dependency` is used to check by name whether a dependency
-    exists. We cheat slightly by creating a checkpointable object for `name` if
+    exists. We cheat slightly by creating a trackable object for `name` if
     we don't already have one, giving us attribute re-creation behavior when
     loading a checkpoint.
 
@@ -85,7 +82,7 @@ class NumpyState(base.Checkpointable):
     value = super(NumpyState, self)._lookup_dependency(name)
     if value is None:
       value = _NumpyWrapper(numpy.array([]))
-      new_reference = base.CheckpointableReference(name=name, ref=value)
+      new_reference = base.TrackableReference(name=name, ref=value)
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._unconditional_dependency_names[name] = value
       super(NumpyState, self).__setattr__(name, value)
@@ -101,7 +98,7 @@ class NumpyState(base.Checkpointable):
   def __setattr__(self, name, value):
     """Automatically wrap NumPy arrays assigned to attributes."""
     # TODO(allenl): Consider supporting lists/tuples, either ad-hoc or by making
-    # ndarrays checkpointable natively and using standard checkpointable list
+    # ndarrays trackable natively and using standard trackable list
     # tracking.
     if isinstance(value, (numpy.ndarray, numpy.generic)):
       try:
@@ -110,48 +107,26 @@ class NumpyState(base.Checkpointable):
         return
       except AttributeError:
         value = _NumpyWrapper(value)
-        self._track_checkpointable(value, name=name, overwrite=True)
+        self._track_trackable(value, name=name, overwrite=True)
     elif (name not in ("_setattr_tracking", "_update_uid")
           and getattr(self, "_setattr_tracking", True)):
-      # Mixing restore()-created attributes with user-added checkpointable
+      # Mixing restore()-created attributes with user-added trackable
       # objects is tricky, since we can't use the `_lookup_dependency` trick to
       # re-create attributes (we might accidentally steal the restoration for
-      # another checkpointable object). For now `NumpyState` objects must be
+      # another trackable object). For now `NumpyState` objects must be
       # leaf nodes. Theoretically we could add some extra arguments to
       # `_lookup_dependency` to figure out whether we should create a NumPy
       # array for the attribute or not.
       raise NotImplementedError(
           ("Assigned %s to the %s property of %s, which is not a NumPy array. "
-           "Currently mixing NumPy arrays and other checkpointable objects is "
+           "Currently mixing NumPy arrays and other trackable objects is "
            "not supported. File a feature request if this limitation bothers "
            "you.")
           % (value, name, self))
     super(NumpyState, self).__setattr__(name, value)
 
 
-@six.add_metaclass(abc.ABCMeta)
-class PythonStateWrapper(base.Checkpointable):
-  """Wraps a Python object for storage in an object-based checkpoint."""
-
-  @abc.abstractmethod
-  def _serialize(self):
-    """Callback for `PythonStringStateSaveable` to serialize the object."""
-
-  @abc.abstractmethod
-  def _deserialize(self, string_value):
-    """Callback for `PythonStringStateSaveable` to deserialize the object."""
-
-  def _gather_saveables_for_checkpoint(self):
-    """Specify callbacks for saving and restoring `array`."""
-    return {
-        "py_state": functools.partial(
-            base.PythonStringStateSaveable,
-            state_callback=self._serialize,
-            restore_callback=self._deserialize)
-        }
-
-
-class _NumpyWrapper(PythonStateWrapper):
+class _NumpyWrapper(core_python_state.PythonState):
   """Wraps a NumPy array for storage in an object-based checkpoint."""
 
   def __init__(self, array):
@@ -162,7 +137,7 @@ class _NumpyWrapper(PythonStateWrapper):
     """
     self.array = array
 
-  def _serialize(self):
+  def serialize(self):
     """Callback to serialize the array."""
     string_file = BytesIO()
     try:
@@ -172,7 +147,7 @@ class _NumpyWrapper(PythonStateWrapper):
       string_file.close()
     return serialized
 
-  def _deserialize(self, string_value):
+  def deserialize(self, string_value):
     """Callback to deserialize the array."""
     string_file = BytesIO(string_value)
     try:
diff --git a/tensorflow/contrib/checkpoint/python/python_state_test.py b/tensorflow/contrib/checkpoint/python/python_state_test.py
index 45494351ff4e6c8c75634d8563c3fb63c6089036..40d8fe836402c8b6c8240ef9f665b753c54ede0d 100644
--- a/tensorflow/contrib/checkpoint/python/python_state_test.py
+++ b/tensorflow/contrib/checkpoint/python/python_state_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class NumpyStateTests(test.TestCase):
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
index 3e9700ad74618e24843181d169f3fb39ac96bff6..d7b02b538909305b14e638761bd8ba67a948d2b4 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -21,7 +21,7 @@ import functools
 
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(checkpointable.Checkpointable):
+class _SplitDependency(trackable.Trackable):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
@@ -81,9 +81,9 @@ class _SplitDependency(checkpointable.Checkpointable):
       return control_flow_ops.no_op()
 
   def _gather_saveables_for_checkpoint(self):
-    """Looks to Checkpointable like a regular variable."""
+    """Looks to Trackable like a regular variable."""
     return {
-        checkpointable.VARIABLE_VALUE_KEY:
+        trackable.VARIABLE_VALUE_KEY:
         functools.partial(_CallbackSaveable,
                           dtype=self._dtype,
                           save_callback=self._save,
@@ -117,7 +117,7 @@ def split_dependency(component_names, component_dtypes,
       may return `None`).
 
   Returns:
-    A dictionary mapping from names to Checkpointable objects. If one is
+    A dictionary mapping from names to Trackable objects. If one is
     reachable from an object as a dependency, the others should be too; adding
     dependencies on some but not all of the objects will result in errors.
   """
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index 664a4e76ab31bf31c7a57924e4af866f2d746804..9bc01059481ff69064e3f9c682a764146b79a250 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -23,9 +23,9 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 def _split_variable_closure(variable):
@@ -44,7 +44,7 @@ def _combine_variable_closure(variable):
   return _consume_restore_buffer_fn
 
 
-class SaveTensorSlicesAsDeps(base.Checkpointable):
+class SaveTensorSlicesAsDeps(base.Trackable):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
@@ -56,17 +56,17 @@ class SaveTensorSlicesAsDeps(base.Checkpointable):
         consume_restore_buffer_fn=_combine_variable_closure(
             self.combined))
     for name, dep in split_dependencies.items():
-      self._track_checkpointable(dep, name=name)
+      self._track_trackable(dep, name=name)
 
 
-class HasRegularDeps(tracking.AutoCheckpointable):
+class HasRegularDeps(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
     self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
 
 
-class OnlyOneDep(tracking.AutoCheckpointable):
+class OnlyOneDep(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
index bac071c4cff383f60b707b6e42c13faf5e0ac948..faf90f018476b3c70a7bfa1346a5b590edbbddcd 100644
--- a/tensorflow/contrib/checkpoint/python/visualize.py
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 def dot_graph_from_checkpoint(save_path):
@@ -51,7 +51,7 @@ def dot_graph_from_checkpoint(save_path):
     A graph in DOT format as a string.
   """
   reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-  object_graph = checkpointable_utils.object_metadata(save_path)
+  object_graph = trackable_utils.object_metadata(save_path)
   shape_map = reader.get_variable_to_shape_map()
   dtype_map = reader.get_variable_to_dtype_map()
   graph = 'digraph {\n'
@@ -63,7 +63,7 @@ def dot_graph_from_checkpoint(save_path):
       slot_ids.add(slot_reference.slot_variable_node_id)
   for node_id, node in enumerate(object_graph.nodes):
     if (len(node.attributes) == 1
-        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+        and node.attributes[0].name == trackable.VARIABLE_VALUE_KEY):
       if node_id in slot_ids:
         color = 'orange'
         tooltip_prefix = 'Slot variable'
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
index 583e3bc442893d825c337d73fb999d1e586738a1..98a22d573fdb6172cde100df461d9ae520c2c483 100644
--- a/tensorflow/contrib/checkpoint/python/visualize_test.py
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import adam
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 try:
   import pydot  # pylint: disable=g-import-not-at-top
@@ -57,7 +57,7 @@ class DotGraphTests(test.TestCase):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
       optimizer_step = resource_variable_ops.ResourceVariable(12)
-      save_checkpoint = checkpointable_utils.Checkpoint(
+      save_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model, optimizer_step=optimizer_step)
       optimizer.minimize(functools.partial(model, input_value))
       checkpoint_directory = self.get_temp_dir()
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index 390b3e7550b3d991269bb84707c3500f2fa33290..a4dea85efd98893c881abbd3f7ebda78755b8189 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -23,7 +23,7 @@ from __future__ import print_function
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
@@ -36,7 +36,7 @@ _allowed_symbols = [
     'ClusterResolver',
     'SimpleClusterResolver',
     'UnionClusterResolver',
-    'GceClusterResolver',
+    'GCEClusterResolver',
     'KubernetesClusterResolver',
     'TFConfigClusterResolver',
     'TPUClusterResolver',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
index 10d93549ebbd4f7e900796d0516b0af1744224af..ef1e9f11a07a5be6c0b181f5e0b80e0e2214f972 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
@@ -25,7 +25,7 @@ from __future__ import print_function
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
@@ -43,7 +43,7 @@ _allowed_symbols = [
     'ClusterResolver',
     'SimpleClusterResolver',
     'UnionClusterResolver',
-    'GceClusterResolver',
+    'GCEClusterResolver',
     'KubernetesClusterResolver',
     'TFConfigClusterResolver',
     'TPUClusterResolver',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index 55e61155c683c928efab9bb018868faec3e3df8c..5b49116ff6a4e17a774ea79b33ae1b948ba9f187 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Stub file for GceClusterResolver to maintain backwards compatibility."""
+"""Stub file for GCEClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,13 +23,14 @@ from __future__ import print_function
 # existing OSS code will not be broken.
 
 # pylint: disable=unused-import
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
+
 _allowed_symbols = [
-    'GceClusterResolver',
+    'GCEClusterResolver',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index b4f4b028f6f5c363e1c791985ad4ba3bcb62e0c6..9e9d85def83850fe61ed9dc36a60d828d46bd10a 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -154,12 +154,12 @@ suitable interface for project configuration and dependency setting.
 5.  Click on `Configure`, a new window will be prompted out, specify the
     generator mode for the project generation. For Windows, choose `Visual
     Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
-    press `Finish`. Wait for a moment, the default project dependecy would
+    press `Finish`. Wait for a moment, the default project dependency would
     automatically generate.
 6.  There are a few options that you can customize your own build. **The setting
     here is crucial for a successful build, please check all items carefully.**
 
-    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_ALL_KERNELS` should always be `on`
     *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
         to test build (optional)
     *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
@@ -278,7 +278,7 @@ suitable interface for project configuration and dependency setting.
     `make -sj<number-of-threads> install`
 
     Where `<number-of-threads>` is the threads used for the compilation, change
-    to any integer less or equal to your computer's maxiumum thread number.
+    to any integer less or equal to your computer's maximum thread number.
 
     Headers are discretely located in the build folders. Tensorflow library can
     be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
@@ -294,11 +294,12 @@ Here we assume that you have basic knowledge on gathering dependency with
     `CMakeLists.txt` and the c++ file `main.cxx`
 2.  Fill in the `main.cxx` with the code provided in
     [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
-3.  Fill in the `CMakeLists.txt` with following code: ``` cmake
+3.  Fill in the `CMakeLists.txt` with following code:
+
+    ```cmake
     cmake_minimum_required (VERSION 2.6) project (tf_hello)
 
     # Tensorflow
-
     find_package(Tensorflow REQUIRED)
     include_directories(${TENSORFLOW_INCLUDE_DIRS})
 
@@ -314,7 +315,8 @@ Here we assume that you have basic knowledge on gathering dependency with
     this CMakeList.txt, under development") endif()
 
     add_executable(tf_hello main.cxx) target_link_libraries(tf_hello
-    ${TENSORFLOW_LIBRARIES}) ```
+    ${TENSORFLOW_LIBRARIES})
+    ```
 
 4.  Configure the folder with cmake-gui, an error should be prompted out,
     requesting you to locate the folder containing `TensorflowConfig.cmake`.
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index e570c09ecb5e64130ed6f3375a51d74850cc3989..30b4e2dbdee1117df12ae7ab8ce902e667234fb0 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
+set(GRPC_TAG 62688b6a05cc85b47fb77dd408611734253e47e2)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 21ae9a08a6bb8f71e5935ddde2d7bb3ed0cd8bbc..fd205a4b9b065a4756fbe3985694bb64b93b85e6 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -13,6 +13,7 @@ tensorflow/core/lib
 tensorflow/core/lib/core
 tensorflow/core/profiler
 tensorflow/core/protobuf
+tensorflow/core/protobuf/tpu
 tensorflow/core/util
 tensorflow/examples
 tensorflow/examples/tutorials
@@ -70,8 +71,9 @@ tensorflow/python/summary/writer
 tensorflow/python/tools
 tensorflow/python/tools/api
 tensorflow/python/tools/api/generator
+tensorflow/python/tpu
 tensorflow/python/training
-tensorflow/python/training/checkpointable
+tensorflow/python/training/tracking
 tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
@@ -437,7 +439,6 @@ tensorflow/contrib/timeseries/python/timeseries/state_space_models
 tensorflow/contrib/tpu
 tensorflow/contrib/tpu/ops
 tensorflow/contrib/tpu/profiler
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/python
 tensorflow/contrib/tpu/python/ops
 tensorflow/contrib/tpu/python/profiler
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 013180c89083748b240ad061b342300e886d3568..b4603206da419f44af0857b9b933eb7df1b255ff 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,6 +1,7 @@
 tensorflow/core
 tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
+tensorflow/core/protobuf/tpu
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
@@ -12,7 +13,6 @@ tensorflow/contrib/mpi_collectives
 tensorflow/contrib/session_bundle
 tensorflow/contrib/tensor_forest/proto
 tensorflow/contrib/tensorboard/plugins/projector
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/profiler
 tensorflow/contrib/training/python/training
 tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index d8d1cc3aa2ca4fff3c950654b7cbd7085c76010c..cc263d7995c01100f1c51436bcb584b600c8c161 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -125,9 +125,9 @@ endfunction()
 
 file(GLOB_RECURSE tf_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/protobuf/tpu/*.proto"
     "${tensorflow_source_dir}/tensorflow/compiler/xla/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
 )
 
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index e32097ceddfec95b8677fc762d641d09078e5343..839682afdc6284b9fea53405f094106c25485e79 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -23,6 +23,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":xla",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
@@ -63,9 +64,9 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:summary_op_util",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/distribute:summary_op_util",
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
diff --git a/tensorflow/contrib/compiler/__init__.py b/tensorflow/contrib/compiler/__init__.py
index c4937dadfb8be3211377f0ae7017b95e7642dab0..797e5e8164e231e8b3806d40b32774711879b050 100644
--- a/tensorflow/contrib/compiler/__init__.py
+++ b/tensorflow/contrib/compiler/__init__.py
@@ -19,3 +19,4 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.compiler import jit
+from tensorflow.contrib.compiler import xla
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index 0f1be500f499ebba7e1907de663f8bbfa889bb17..2ccb27da12fe5692dd4360d5d52eb9950159c484 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -25,11 +25,11 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.compiler.jit.ops import xla_ops
 from tensorflow.compiler.jit.ops import xla_ops_grad  # pylint: disable=unused-import
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.distribute import summary_op_util
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import summary_op_util
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
@@ -144,6 +144,30 @@ class XLACompileContext(control_flow_ops.XLAControlFlowContext):
         logging.warning('... and %d more',
                         len(self._unsupported_ops) - _MAX_WARNING_LINES)
 
+  def _RemoveExternalControlEdges(self, op):
+    """Remove any external control dependency on this op."""
+    internal_control_inputs = []
+    external_control_inputs = []
+    for x in op.control_inputs:
+      # pylint: disable=protected-access
+      is_internal_op = False
+      ctxt = x._get_control_flow_context()
+      while ctxt is not None:
+        if ctxt == self:
+          is_internal_op = True
+          break
+        ctxt = ctxt._outer_context
+      if is_internal_op:
+        internal_control_inputs.append(x)
+      else:
+        external_control_inputs.append(x)
+      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    op._remove_all_control_inputs()
+    op._add_control_inputs(internal_control_inputs)
+    # pylint: enable=protected-access
+    return internal_control_inputs, external_control_inputs
+
   def AddOp(self, op):
     """Create op in XLACompileContext and notifies outer context recursively."""
     # pylint: disable=protected-access
@@ -193,11 +217,14 @@ class XLACompileContext(control_flow_ops.XLAControlFlowContext):
     if external_control_inputs:
       # Use an identity to pull control inputs as data inputs. Note that we
       # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      external_control_inputs = [
-          array_ops.identity(x.outputs[0]).op
-          for x in external_control_inputs
-          if x.outputs
-      ]
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_control_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_control_inputs
+            if x.outputs
+        ]
+        self.Exit()
       # pylint: disable=protected-access
       op._add_control_inputs(external_control_inputs)
       # pylint: enable=protected-access
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
index eee4329acbeb38c9f37f79227aeb3acd46dce5e7..619153df67c90cea5a5082a411972948bac5fe90 100644
--- a/tensorflow/contrib/constrained_optimization/BUILD
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -42,11 +42,6 @@ py_test(
     name = "candidates_test",
     srcs = ["python/candidates_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        # TODO(b/121223093): Re-enable this test after fixing "Distribution
-        # should match known solution" errors.
-        "no_mac",
-    ],
     deps = [
         ":constrained_optimization",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
index cb1dd7d836ae11700b2ffaaff4fda5b7f943f87d..7ffb6894d37444fd78015b6c124c46f2855c1cde 100644
--- a/tensorflow/contrib/constrained_optimization/README.md
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -1,5 +1,10 @@
 <!-- TODO(acotter): Add usage example of non-convex optimization and stochastic classification. -->
 
+**NOTE: As tensorflow.contrib is being
+[deprecated](https://github.com/tensorflow/community/pull/18), TFCO is moving to
+its own repository on
+[github](https://github.com/google-research/tensorflow_constrained_optimization).**
+
 # ConstrainedOptimization (TFCO)
 
 TFCO is a library for optimizing inequality-constrained problems in TensorFlow.
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
index a4c49d48bc5c763489215261a909573af0f19055..280e9acd88638a9385bfd9128ba6d3739879aab2 100644
--- a/tensorflow/contrib/constrained_optimization/python/candidates_test.py
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -52,12 +52,12 @@ class CandidatesTest(test.TestCase):
     distribution = candidates.find_best_candidate_distribution(
         objective_vector, constraints_matrix)
     # Verify that the solution is a probability distribution.
-    self.assertTrue(np.all(distribution >= 0))
+    self.assertTrue(np.all(distribution >= -1e-6))
     self.assertAlmostEqual(np.sum(distribution), 1.0)
     # Verify that the solution satisfies the constraints.
     maximum_constraint_violation = np.amax(
         np.dot(constraints_matrix, distribution))
-    self.assertLessEqual(maximum_constraint_violation, 0)
+    self.assertLessEqual(maximum_constraint_violation, 1e-6)
     # Verify that the solution matches that which we expect.
     expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
     self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 40e159b8fcbd1864284e208cb15d9ed96119f840..5c4c52766604b19864b0849fa23e7a7cafcc56e0 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -283,7 +283,7 @@ def crf_unary_score(tag_indices, sequence_lengths, inputs):
   offsets += array_ops.expand_dims(math_ops.range(max_seq_len) * num_tags, 0)
   # Use int32 or int64 based on tag_indices' dtype.
   if tag_indices.dtype == dtypes.int64:
-    offsets = math_ops.to_int64(offsets)
+    offsets = math_ops.cast(offsets, dtypes.int64)
   flattened_tag_indices = array_ops.reshape(offsets + tag_indices, [-1])
 
   unary_scores = array_ops.reshape(
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index f5219eb134d07c09b16a544f71d4c18986c19681..5ed80953a4930f0e9de75c2a0c96ce9fb4e2ab57 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -69,6 +69,8 @@ def RunLSTM(sess,
             time,
             num_layers=1,
             variable_seq_lengths=False,
+            time_major=True,
+            dynamic_shape_input=False,
             is_training=True,
             dropout=0.,
             num_dirs=True,
@@ -84,11 +86,14 @@ def RunLSTM(sess,
   random_seed.set_random_seed(0)
   np.random.seed(0)
 
-  inputs = variable_scope.get_variable(
-      "inputs",
-      initializer=np.random.rand(time, batch_size,
-                                 input_size).astype(dtype.as_numpy_dtype),
-      dtype=dtype)
+  shape = ([time, batch_size, input_size]
+           if time_major else [batch_size, time, input_size])
+  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
+  inputs_static = variable_scope.get_variable(
+      "inputs", initializer=inputs_np, dtype=dtype)
+  inputs_dynamic = array_ops.placeholder(
+      dtype, shape=[None, None, None], name="inputs")
+  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
   initial_h_op = variable_scope.get_variable(
       "initial_h_op",
       initializer=np.random.rand(batch_size,
@@ -122,12 +127,12 @@ def RunLSTM(sess,
     cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
     outputs_op, state_tuple_op = rnn.dynamic_rnn(
         cell,
-        inputs,
+        inputs_static,
         sequence_length=lengths,
         initial_state=rnn_cell_impl.LSTMStateTuple(
             h=initial_h_op, c=initial_c_op),
         dtype=dtype,
-        time_major=True,
+        time_major=time_major,
         scope=None)
 
   # Convert to cudnn opaque param.
@@ -135,35 +140,38 @@ def RunLSTM(sess,
       num_layers, num_units, input_size)
   opaque_params = format_converter.tf_canonical_to_opaque([w, b])
 
-  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
-  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
+  cu_initial_h_op = array_ops.expand_dims(
+      initial_h_op, axis=(0 if time_major else 1))
+  cu_initial_c_op = array_ops.expand_dims(
+      initial_c_op, axis=(0 if time_major else 1))
   cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
       inputs,
       cu_initial_h_op,
       cu_initial_c_op,
       opaque_params,
       sequence_lengths=lengths,
+      time_major=time_major,
       dropout=dropout,
       is_training=is_training,
       rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
   # Remove the trivial 1st dimension.
   cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
-      c=array_ops.squeeze(cu_c_op, axis=0),
-      h=array_ops.squeeze(cu_h_op, axis=0))
+      c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
+      h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))
 
   if is_training:
     (inp_grad_op, hgrad_op,
      cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
-         outputs_op, [inputs, initial_h_op, initial_c_op, w, b])
+         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])
 
     (cu_inp_grad_op, cu_hgrad_op,
      cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
          cu_outputs_op,
          [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
     # Remove the trivial 1st dimension
-    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
     # Remove the trivial 1st dimension
-    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)
+    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1)
 
     cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
         opaque_grad_op)
@@ -183,10 +191,12 @@ def RunLSTM(sess,
         (hgrad_op, cgrad_op), wgrad_op, bgrad_op
     ])
     (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
-     cu_bgrad) = sess.run([
-         cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
-         (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
-     ])
+     cu_bgrad) = sess.run(
+         [
+             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
+             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
+         ],
+         feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
 
     logging.vlog(1, "outputs: %s" % outputs)
     logging.vlog(1, "cu_outputs: %s" % cu_outputs)
@@ -205,7 +215,10 @@ def RunLSTM(sess,
             cu_bgrad)
   else:
     outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
-    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op])
+    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
+                                          feed_dict=({
+                                              inputs: inputs_np
+                                          } if dynamic_shape_input else None))
 
     logging.vlog(1, "outputs: %s" % outputs)
     logging.vlog(1, "cu_outputs: %s" % cu_outputs)
@@ -336,6 +349,8 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
                             num_layers,
                             dtype,
                             variable_seq_lengths,
+                            time_major,
+                            dynamic_shape_input=False,
                             rtol=3e-6,
                             atol=3e-6):
     with self.session(use_gpu=True) as sess:
@@ -347,7 +362,9 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
            batch_size,
            time,
            num_layers,
-           variable_seq_lengths=variable_seq_lengths)
+           variable_seq_lengths=variable_seq_lengths,
+           time_major=time_major,
+           dynamic_shape_input=dynamic_shape_input)
 
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       for s, cu_s in zip(state_tuple, cu_state_tuple):
@@ -359,13 +376,16 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
       self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training(self, num_units, input_size, batch_size, time, num_layers,
-                    variable_seq_lengths):
+                    variable_seq_lengths, time_major, dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -375,16 +395,22 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
         time,
         num_layers,
         dtypes.float32,
-        variable_seq_lengths=variable_seq_lengths)
+        variable_seq_lengths=variable_seq_lengths,
+        time_major=time_major,
+        dynamic_shape_input=dynamic_shape_input)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training_fp16(self, num_units, input_size, batch_size, time,
-                         num_layers, variable_seq_lengths):
+                         num_layers, variable_seq_lengths, time_major,
+                         dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -396,16 +422,21 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
         dtypes.float16,
         rtol=5e-3,
         atol=5e-4,
-        variable_seq_lengths=variable_seq_lengths)
+        variable_seq_lengths=variable_seq_lengths,
+        time_major=time_major,
+        dynamic_shape_input=dynamic_shape_input)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference(self, num_units, input_size, batch_size, time, num_layers,
-                     variable_seq_lengths):
+                     variable_seq_lengths, time_major, dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -417,7 +448,9 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
           time,
           num_layers,
           is_training=False,
-          variable_seq_lengths=variable_seq_lengths)
+          variable_seq_lengths=variable_seq_lengths,
+          time_major=time_major,
+          dynamic_shape_input=dynamic_shape_input)
 
       self.assertAllClose(outputs, cu_outputs)
       # h
@@ -426,13 +459,17 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
       self.assertAllClose(state_tuple.c, cu_state_tuple.c)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_fp16(self, num_units, input_size, batch_size, time,
-                          num_layers, variable_seq_lengths):
+                          num_layers, variable_seq_lengths, time_major,
+                          dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -445,7 +482,9 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
           num_layers,
           is_training=False,
           dtype=dtypes.float16,
-          variable_seq_lengths=variable_seq_lengths)
+          variable_seq_lengths=variable_seq_lengths,
+          time_major=time_major,
+          dynamic_shape_input=dynamic_shape_input)
 
       rtol, atol = 5e-3, 5e-4
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
@@ -457,13 +496,17 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
           state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
-                                  num_layers, variable_seq_lengths):
+                                  num_layers, variable_seq_lengths, time_major,
+                                  dynamic_shape_input):
     """Validates that dropout does not affect Cudnn Rnn inference."""
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
@@ -480,7 +523,9 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
             num_layers,
             is_training=False,
             dropout=0.,
-            variable_seq_lengths=variable_seq_lengths)
+            variable_seq_lengths=variable_seq_lengths,
+            time_major=time_major,
+            dynamic_shape_input=dynamic_shape_input)
 
     with ops.Graph().as_default() as g:
       with self.session(use_gpu=True, graph=g) as sess:
@@ -493,7 +538,9 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
             num_layers,
             is_training=False,
             dropout=1.,
-            variable_seq_lengths=variable_seq_lengths)
+            variable_seq_lengths=variable_seq_lengths,
+            time_major=time_major,
+            dynamic_shape_input=dynamic_shape_input)
 
     self.assertAllClose(cu_outputs, cu_outputs2)
     # h
@@ -510,6 +557,8 @@ def RunGRU(sess,
            num_layers=1,
            is_training=True,
            variable_seq_lengths=False,
+           time_major=True,
+           dynamic_shape_input=False,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
@@ -524,11 +573,14 @@ def RunGRU(sess,
   random_seed.set_random_seed(0)
   np.random.seed(0)
 
-  inputs = variable_scope.get_variable(
-      "inputs",
-      initializer=np.random.rand(time, batch_size,
-                                 input_size).astype(dtype.as_numpy_dtype),
-      dtype=dtype)
+  shape = ([time, batch_size, input_size]
+           if time_major else [batch_size, time, input_size])
+  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
+  inputs_static = variable_scope.get_variable(
+      "inputs", initializer=inputs_np, dtype=dtype)
+  inputs_dynamic = array_ops.placeholder(
+      dtype, shape=[None, None, None], name="inputs")
+  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
   initial_h_op = variable_scope.get_variable(
       "initial_h_op",
       initializer=np.random.rand(batch_size,
@@ -573,11 +625,11 @@ def RunGRU(sess,
     cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
     outputs_op, h_op = rnn.dynamic_rnn(
         cell,
-        inputs,
+        inputs_static,
         sequence_length=lengths,
         initial_state=initial_h_op,
         dtype=dtype,
-        time_major=True,
+        time_major=time_major,
         scope=None)
 
   ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
@@ -588,13 +640,15 @@ def RunGRU(sess,
   opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
 
 
-  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_initial_h_op = array_ops.expand_dims(
+      initial_h_op, axis=(0 if time_major else 1))
   cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
       inputs,
       cu_initial_h_op,
       array_ops.zeros_like(cu_initial_h_op),  # not used
       opaque_params,
       sequence_lengths=lengths,
+      time_major=time_major,
       dropout=dropout,
       is_training=is_training,
       rnn_mode=cudnn_rnn_ops.CUDNN_GRU)
@@ -602,12 +656,12 @@ def RunGRU(sess,
   if is_training:
     (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
      cib_grad_op, chb_grad_op) = gradients_impl.gradients(
-         outputs_op, [inputs, initial_h_op] + ws + bs)
+         outputs_op, [inputs_static, initial_h_op] + ws + bs)
 
     (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
     # Remove the trivial 1st dimension
-    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
 
     cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
         opaque_grad_op)
@@ -627,13 +681,15 @@ def RunGRU(sess,
         (gk_grad_op, cik_grad_op, chk_grad_op),
         (gb_grad_op, cib_grad_op, chb_grad_op)
     ])
-    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([
-        cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
-        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
-        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
-    ])
+    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
+        [
+            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
+            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
+            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
+        ],
+        feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
     # Remove the trivial 1st dimension
-    cu_h = np.squeeze(cu_h, axis=0)
+    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)
 
     logging.vlog(1, "outputs: %s" % outputs)
     logging.vlog(1, "cu_outputs: %s" % cu_outputs)
@@ -651,9 +707,12 @@ def RunGRU(sess,
             cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
   else:
     outputs, h = sess.run([outputs_op, h_op])
-    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
+    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op],
+                                feed_dict=({
+                                    inputs: inputs_np
+                                } if dynamic_shape_input else None))
     # Remove the trivial 1st dimension.
-    cu_h = np.squeeze(cu_h, axis=0)
+    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)
 
     logging.vlog(1, "outputs: %s" % outputs)
     logging.vlog(1, "cu_outputs: %s" % cu_outputs)
@@ -672,6 +731,8 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
                             num_layers,
                             dtype,
                             variable_seq_lengths,
+                            time_major,
+                            dynamic_shape_input=False,
                             rtol=3e-6,
                             atol=3e-6):
     with self.session(use_gpu=True) as sess:
@@ -683,7 +744,9 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
            batch_size,
            time,
            num_layers,
-           variable_seq_lengths=variable_seq_lengths)
+           variable_seq_lengths=variable_seq_lengths,
+           time_major=time_major,
+           dynamic_shape_input=dynamic_shape_input)
 
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
@@ -695,13 +758,16 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
         self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training(self, num_units, input_size, batch_size, time, num_layers,
-                    variable_seq_lengths):
+                    variable_seq_lengths, time_major, dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -711,16 +777,22 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
         time,
         num_layers,
         dtypes.float32,
-        variable_seq_lengths=variable_seq_lengths)
+        variable_seq_lengths=variable_seq_lengths,
+        time_major=time_major,
+        dynamic_shape_input=dynamic_shape_input)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training_fp16(self, num_units, input_size, batch_size, time,
-                         num_layers, variable_seq_lengths):
+                         num_layers, variable_seq_lengths, time_major,
+                         dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -732,16 +804,21 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
         dtypes.float16,
         rtol=5e-3,
         atol=5e-4,
-        variable_seq_lengths=variable_seq_lengths)
+        variable_seq_lengths=variable_seq_lengths,
+        time_major=time_major,
+        dynamic_shape_input=dynamic_shape_input)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference(self, num_units, input_size, batch_size, time, num_layers,
-                     variable_seq_lengths):
+                     variable_seq_lengths, time_major, dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -753,18 +830,24 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
           time,
           num_layers,
           is_training=False,
-          variable_seq_lengths=variable_seq_lengths)
+          variable_seq_lengths=variable_seq_lengths,
+          time_major=time_major,
+          dynamic_shape_input=dynamic_shape_input)
       self.assertAllClose(outputs, cu_outputs)
       self.assertAllClose(h, cu_h)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_fp16(self, num_units, input_size, batch_size, time,
-                          num_layers, variable_seq_lengths):
+                          num_layers, variable_seq_lengths, time_major,
+                          dynamic_shape_input):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -777,20 +860,26 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
           num_layers,
           is_training=False,
           dtype=dtypes.float16,
-          variable_seq_lengths=variable_seq_lengths)
+          variable_seq_lengths=variable_seq_lengths,
+          time_major=time_major,
+          dynamic_shape_input=dynamic_shape_input)
 
       rtol, atol = 5e-3, 5e-4
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
 
   @parameterized.named_parameters(
-      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
-          "variable_seq_lengths": [True, False],
-      }))
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, **{
+              "variable_seq_lengths": [True, False],
+              "time_major": [True, False],
+              "dynamic_shape_input": [True, False],
+          }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
-                                  num_layers, variable_seq_lengths):
+                                  num_layers, variable_seq_lengths, time_major,
+                                  dynamic_shape_input):
     """Validates that dropout does not affect Cudnn Rnn inference."""
     # Hand-picked dropouts are used below (0. and 1.)
     if not context.context().num_gpus():
@@ -807,7 +896,9 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
             num_layers,
             is_training=False,
             dropout=0.,
-            variable_seq_lengths=variable_seq_lengths)
+            variable_seq_lengths=variable_seq_lengths,
+            time_major=time_major,
+            dynamic_shape_input=dynamic_shape_input)
 
     with ops.Graph().as_default() as g:
       with self.session(use_gpu=True, graph=g) as sess:
@@ -820,7 +911,9 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
             num_layers,
             is_training=False,
             dropout=1.,
-            variable_seq_lengths=variable_seq_lengths)
+            variable_seq_lengths=variable_seq_lengths,
+            time_major=time_major,
+            dynamic_shape_input=dynamic_shape_input)
 
     self.assertAllClose(cu_outputs, cu_outputs2)
     self.assertAllClose(cu_h[0], cu_h2[0])
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index ca92c31236a7a3882415834eb32a994a120b6d2d..403f30909520dc5cd5f5919af843291fe1400b91 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -58,7 +58,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
@@ -709,7 +709,7 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
-class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+class CudnnRNNTestSaveRestoreTrackable(test_util.TensorFlowTestCase):
 
   def _VerifyCheckpoint(
       self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
@@ -718,7 +718,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with ops.device("gpu:0"):
       cudnn_layer = cudnn_cell_fn()
-      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      cudnn_checkpoint = trackable_utils.Checkpoint(cell=cudnn_layer)
       status = cudnn_checkpoint.restore(checkpoint_path)
       inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
                                    dtype=dtypes.float32)
@@ -726,7 +726,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       status.run_restore_ops()
     second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
     restore_layer = compatible_cell_fn()
-    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+    restore_layer_checkpoint = trackable_utils.Checkpoint(
         cell=restore_layer)
     status = restore_layer_checkpoint.restore(second_save_path)
     current_state = restore_layer.zero_state(1, dtypes.float32)
@@ -742,7 +742,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(restore_layer_output),
                         self.evaluate(cudnn_output)[-1, -1:, ...])
 
-  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+  def _TrackableSingleCellUnidirectionalTestTemplate(
       self, single_cell_fn, cudnn_cell_fn):
     # Single-layer cuDNN cells with object-based checkpointing should be
     # checkpoint compatible with either single CudnnCompatible cells or
@@ -759,7 +759,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       value = np.random.normal(size=variable.shape)
       expected_values.append(value)
       self.evaluate(variable.assign(value))
-    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    save_checkpoint = trackable_utils.Checkpoint(cell=save_cell_layer)
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -775,10 +775,10 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testLSTMCheckpointableSingleLayer(self):
+  def testLSTMTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+    self._TrackableSingleCellUnidirectionalTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
@@ -788,19 +788,19 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testGRUCheckpointableSingleLayer(self):
+  def testGRUTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
     with self.assertRaises(NotImplementedError):
       # TODO(allenl): Implement object-based saving for GRUs and other cells.
-      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+      self._TrackableSingleCellUnidirectionalTestTemplate(
           single_cell_fn=functools.partial(
               cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
           cudnn_cell_fn=functools.partial(
               cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
               direction=direction, name="awesome_gru"))
 
-  def _CheckpointableMultiLayerTestTemplate(
+  def _TrackableMultiLayerTestTemplate(
       self, single_cell_fn, cudnn_cell_fn, num_layers):
 
     def _MultiCellFn():
@@ -819,7 +819,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
         value = np.random.normal(size=variable.shape)
         expected_values.append(value)
         self.evaluate(variable.assign(value))
-      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      save_checkpoint = trackable_utils.Checkpoint(cell=save_layer)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -837,7 +837,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     num_units = 2
     num_layers = 3
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableMultiLayerTestTemplate(
+    self._TrackableMultiLayerTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 86ad8ae8073714657c78badb1e0b4a6d8c8ed5f0..c6ab003d6cca6a88a5e9e7971ed9211cab59cee0 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -104,7 +104,7 @@ class _CudnnRNN(base_layer.Layer):
 
   # Inference subgraph for unidirectional RNN on, e.g., CPU or mobile.
   with tf.Graph().as_default():
-    single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTM(num_units)
+    single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
 
     # NOTE: Even if there's only one layer, the cell needs to be wrapped in
     # MultiRNNCell.
@@ -124,7 +124,7 @@ class _CudnnRNN(base_layer.Layer):
 
   # Inference subgraph for bidirectional RNN
   with tf.Graph().as_default():
-    single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTM(num_units)
+    single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
     cells_fw = [single_cell() for _ in range(num_layers)]
     cells_bw = [single_cell() for _ in range(num_layers)]
 
@@ -378,20 +378,33 @@ class _CudnnRNN(base_layer.Layer):
            inputs,
            initial_state=None,
            sequence_lengths=None,
+           time_major=True,
            training=True):
     """Runs the forward step for the RNN model.
 
     Args:
-      inputs: `3-D` tensor with shape `[time_len, batch_size, input_size]`.
+      inputs: `3-D` tensor. If `time_major` is True (default), the Tensor shape
+        is [time_len, batch_size, input_size]. If `time_major` is False, the
+        shape is [batch_size, time_len, input_size].
       initial_state: a tuple of tensor(s) of shape
-        `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use
+        `[num_layers * num_dirs, batch_size, num_units]` if
+        `time_major` is True (default) or `[batch_size, num_layers * num_dirs,
+        num_units]` if `time_major` is False. If not provided, use
         zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs.
       sequence_lengths: an int32 array representing the variable sequence
         lengths in a batch. The size of the array has to equal the
         batch_size. If not provided, the same sequence length will be assumed.
+      time_major: The shape format of the `inputs` and `outputs` Tensors. If
+        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
+        If false, these Tensors must be shaped ['batch_size', 'max_time',
+        'depth']. By default this function accepts input and emits output in
+        time-major form. This param is only effective when 'sequence_lengths'
+        is used.
       training: whether this operation will be used in training or inference.
     Returns:
-      output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`.
+      output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`
+        if `time_major` is True (default) or `[batch_size, time_len,
+        num_dirs * num_units]` if `time_major` is False.
         It is a `concat([fwd_output, bak_output], axis=2)`.
       output_states: a tuple of tensor(s) of the same shape and structure as
         `initial_state`.
@@ -417,8 +430,8 @@ class _CudnnRNN(base_layer.Layer):
     else:
       # For model that doesn't take input_c, replace with a dummy tensor.
       c = array_ops.constant([], dtype=dtype)
-    outputs, (output_h, output_c) = self._forward(inputs, h, c, self.kernel,
-                                                  sequence_lengths, training)
+    outputs, (output_h, output_c) = self._forward(
+        inputs, h, c, self.kernel, sequence_lengths, time_major, training)
     if self._rnn_mode == CUDNN_LSTM:
       return outputs, (output_h, output_c)
     else:
@@ -482,7 +495,8 @@ class _CudnnRNN(base_layer.Layer):
           dropout=self._dropout,
           direction=self._direction)
 
-  def _forward(self, inputs, h, c, opaque_params, sequence_lengths, training):
+  def _forward(self, inputs, h, c, opaque_params, sequence_lengths, time_major,
+               training):
     output, output_h, output_c = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
         inputs,
         h,
@@ -491,6 +505,7 @@ class _CudnnRNN(base_layer.Layer):
         training,
         self._rnn_mode,
         sequence_lengths=sequence_lengths,
+        time_major=time_major,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
@@ -518,8 +533,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
-    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
-        checkpointable=self, dtype=self._plain_dtype)
+    self._saveable._add_trackable_dependencies(  # pylint: disable=protected-access
+        trackable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index f36e8d5022bc7e3f8268a161089153e5510dffc6..77afbeec0c57280fdd09ffe9019c6fe17d7b0de8 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking as checkpointable_lib
+from tensorflow.python.training.tracking import tracking as trackable_lib
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
@@ -737,13 +737,13 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
-  def _checkpointable_save(self, save_buffer):
+  def _trackable_save(self, save_buffer):
     weights, biases = self.format_converter.opaque_to_tf_canonical(
         self._variables)
     for name, tensor in zip(self._param_names, weights + biases):
       save_buffer[name] = array_ops.identity(tensor)
 
-  def _checkpointable_restore(self, restore_buffer):
+  def _trackable_restore(self, restore_buffer):
     tensors = [
         array_ops.identity(restore_buffer[name]) for name in self._param_names
     ]
@@ -752,26 +752,26 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         restored_shapes=None  # Unused
     )
 
-  def _add_checkpointable_dependencies(self, checkpointable, dtype):
-    """Add canonical weight dependencies to `checkpointable`.
+  def _add_trackable_dependencies(self, trackable, dtype):
+    """Add canonical weight dependencies to `trackable`.
 
     When saving or restoring, converts to or from the opaque buffer
     format. Weights are saved and loaded in the configuration expected by
     cuDNN-compatible cells.
 
     Args:
-      checkpointable: An object inheriting from `CheckpointableBase` to add
+      trackable: An object inheriting from `Trackable` to add
         dependencies too (typically the cuDNN `Layer`).
       dtype: The dtype for the canonical parameter Tensors.
     """
     split_dependencies = split_dependency.split_dependency(
         component_names=self._param_names,
         component_dtypes=(dtype,) * len(self._param_names),
-        fill_save_buffer_fn=self._checkpointable_save,
-        consume_restore_buffer_fn=self._checkpointable_restore)
-    self._checkpointable_track_params(checkpointable, split_dependencies)
+        fill_save_buffer_fn=self._trackable_save,
+        consume_restore_buffer_fn=self._trackable_restore)
+    self._trackable_track_params(trackable, split_dependencies)
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Tracks parameters in a canonical configuration."""
     return  # NotImplementedError raised by the Layer.
 
@@ -819,7 +819,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_weights_names.append(prefix + "/kernel")
     tf_bias_names.append(prefix + "/bias")
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
     biases = []
     weights = []
@@ -833,12 +833,12 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
       # wrapping.
       kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
       bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
-      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
-      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+      trackable._track_trackable(kernel, name="kernel")  # pylint: disable=protected-access
+      trackable._track_trackable(bias, name="bias")  # pylint: disable=protected-access
     assert len(biases) == len(weights)
     for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.AutoCheckpointable()
-      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell = trackable_lib.AutoTrackable()
+      trackable._track_trackable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
       cell.bias = bias
       cell.kernel = kernel
 
@@ -956,6 +956,7 @@ def _cudnn_rnn(inputs,
                is_training,
                rnn_mode,
                sequence_lengths=None,
+               time_major=True,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
                direction=CUDNN_RNN_UNIDIRECTION,
                dropout=0.,
@@ -964,10 +965,12 @@ def _cudnn_rnn(inputs,
   """Cudnn RNN.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     input_c: the initial hidden state for c. This is only relevant for LSTM.
       A Tensor of the same shape as input_h.
     params: the parameter buffer created for this model.
@@ -977,6 +980,11 @@ def _cudnn_rnn(inputs,
       in a batch. The size of the array has to equal the batch_size. Default to
       None, in which case sequences in the batch are assumed to have the same
       length, which is inferred from inputs.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
     input_mode: indicate whether there is a linear projection between the
       input and the actual computation before the first layer. It could be
       'linear_input', 'skip_input' or 'auto_select'.
@@ -1017,6 +1025,14 @@ def _cudnn_rnn(inputs,
   }
   if sequence_lengths is not None:
     args["sequence_lengths"] = sequence_lengths
+    args["time_major"] = time_major
+    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
+  elif time_major is False:
+    batch_size = array_ops.shape(inputs)[0]
+    max_time = array_ops.shape(inputs)[1]
+    sequence_lengths = array_ops.fill([batch_size], max_time)
+    args["sequence_lengths"] = sequence_lengths
+    args["time_major"] = time_major
     outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
   elif use_cudnn_v2 != "1":
     outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
@@ -1031,6 +1047,7 @@ def cudnn_lstm(inputs,
                params,
                is_training,
                sequence_lengths=None,
+               time_major=True,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
                direction=CUDNN_RNN_UNIDIRECTION,
                dropout=0.,
@@ -1039,15 +1056,26 @@ def cudnn_lstm(inputs,
   """Cudnn LSTM.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     input_c: the initial hidden state for c. This is only relevant for LSTM.
       A Tensor of the same shape as input_h.
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
-      input_mode: indicate whether there is a linear projection between the
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
+    input_mode: indicate whether there is a linear projection between the
         input and the actual computation before the first layer. It could be
         'linear_input', 'skip_input' or 'auto_select'.
         'linear_input' (default) always applies a linear projection of input
@@ -1060,17 +1088,13 @@ def cudnn_lstm(inputs,
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
     seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
-    sequence_lengths: an int32 array representing the variable sequence lengths
-      in a batch. The size of the array has to equal the batch_size. Default to
-      None, in which case sequences in the batch are assumed to have the same
-      length, which is inferred from inputs.
     name: name of the operation.
   Returns:
     outputs, output_h, output_c
   """
   return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM,
-                    sequence_lengths, input_mode, direction, dropout, seed,
-                    name)
+                    sequence_lengths, time_major, input_mode, direction,
+                    dropout, seed, name)
 
 
 def _cudnn_rnn_no_input_c(inputs,
@@ -1079,6 +1103,7 @@ def _cudnn_rnn_no_input_c(inputs,
                           is_training,
                           rnn_mode,
                           sequence_lengths=None,
+                          time_major=True,
                           input_mode=CUDNN_INPUT_LINEAR_MODE,
                           direction=CUDNN_RNN_UNIDIRECTION,
                           dropout=0.,
@@ -1087,10 +1112,12 @@ def _cudnn_rnn_no_input_c(inputs,
   """Cudnn RNN w/o input_c.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
     rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh').
@@ -1098,6 +1125,11 @@ def _cudnn_rnn_no_input_c(inputs,
       in a batch. The size of the array has to equal the batch_size. Default to
       None, in which case sequences in the batch are assumed to have the same
       length, which is inferred from inputs.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
     input_mode: indicate whether there is a linear projection between the
       input and the actual computation before the first layer. It could be
       'linear_input', 'skip_input' or 'auto_select'.
@@ -1116,9 +1148,9 @@ def _cudnn_rnn_no_input_c(inputs,
     outputs, output_h
   """
   input_c = array_ops.constant([], dtype=input_h.dtype)
-  outputs, output_h, _ = _cudnn_rnn(inputs, input_h, input_c, params,
-                                    is_training, rnn_mode, sequence_lengths,
-                                    input_mode, direction, dropout, seed, name)
+  outputs, output_h, _ = _cudnn_rnn(
+      inputs, input_h, input_c, params, is_training, rnn_mode, sequence_lengths,
+      time_major, input_mode, direction, dropout, seed, name)
   return outputs, output_h
 
 
@@ -1127,6 +1159,7 @@ def cudnn_gru(inputs,
               params,
               is_training,
               sequence_lengths=None,
+              time_major=True,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION,
               dropout=0.,
@@ -1135,10 +1168,12 @@ def cudnn_gru(inputs,
   """Cudnn GRU.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
       input_mode: indicate whether there is a linear projection between the
@@ -1153,6 +1188,11 @@ def cudnn_gru(inputs,
       in a batch. The size of the array has to equal the batch_size. Default to
       None, in which case sequences in the batch are assumed to have the same
       length, which is inferred from inputs.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
@@ -1163,8 +1203,8 @@ def cudnn_gru(inputs,
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU,
-                               sequence_lengths, input_mode, direction, dropout,
-                               seed, name)
+                               sequence_lengths, time_major, input_mode,
+                               direction, dropout, seed, name)
 
 
 def cudnn_rnn_relu(inputs,
@@ -1176,14 +1216,17 @@ def cudnn_rnn_relu(inputs,
                    dropout=0.,
                    seed=0,
                    sequence_lengths=None,
+                   time_major=True,
                    name=None):
   """Cudnn RNN Relu.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
       input_mode: indicate whether there is a linear projection between the
@@ -1201,14 +1244,19 @@ def cudnn_rnn_relu(inputs,
     sequence_lengths: an int32 array representing the variable sequence lengths
       in a batch. The size of the array has to equal the batch_size. If not
       provided, the same sequence length will be assumed.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
     name: name of the operation.
 
   Returns:
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
-                               CUDNN_RNN_RELU, sequence_lengths, input_mode,
-                               direction, dropout, seed, name)
+                               CUDNN_RNN_RELU, sequence_lengths, time_major,
+                               input_mode, direction, dropout, seed, name)
 
 
 def cudnn_rnn_tanh(inputs,
@@ -1216,6 +1264,7 @@ def cudnn_rnn_tanh(inputs,
                    params,
                    is_training,
                    sequence_lengths=None,
+                   time_major=True,
                    input_mode=CUDNN_INPUT_LINEAR_MODE,
                    direction=CUDNN_RNN_UNIDIRECTION,
                    dropout=0.,
@@ -1224,10 +1273,12 @@ def cudnn_rnn_tanh(inputs,
   """Cudnn RNN Tanh.
 
   Args:
-    inputs: the input sequence to the RNN model. A Tensor of shape [?,
-      batch_size, input_size].
-    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-      batch_size, num_units].
+    inputs: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+    input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
       input_mode: indicate whether there is a linear projection between the
@@ -1242,6 +1293,11 @@ def cudnn_rnn_tanh(inputs,
       in a batch. The size of the array has to equal the batch_size. Default to
       None, in which case sequences in the batch are assumed to have the same
       length, which is inferred from inputs.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
+      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
+      By default this function accepts input and emits output in time-major
+      form. This param is only effective when 'sequence_lengths' is used.
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
@@ -1252,8 +1308,8 @@ def cudnn_rnn_tanh(inputs,
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
-                               CUDNN_RNN_TANH, sequence_lengths, input_mode,
-                               direction, dropout, seed, name)
+                               CUDNN_RNN_TANH, sequence_lengths, time_major,
+                               input_mode, direction, dropout, seed, name)
 
 
 def cudnn_rnn_opaque_params_to_canonical(rnn_mode,
@@ -1537,22 +1593,32 @@ class _CudnnRNN(object):
                input_c,
                params,
                is_training=True,
-               sequence_lengths=None):
+               sequence_lengths=None,
+               time_major=True):
     """Runs the forward step for the RNN model.
 
     Args:
-      input_data: the input sequence to the RNN model. A Tensor of shape [?,
-        batch_size, input_size].
-      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-        batch_size, num_units].
-      input_c: the initial hidden state for c. This is only relevant for LSTM.
-        A Tensor of the same shape as input_h.
+      input_data: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+      input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
+      input_c: the initial hidden state for c. This is only relevant for LSTM. A
+        Tensor of the same shape as input_h.
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
       sequence_lengths: an int32 array representing the variable sequence
         lengths in a batch. The size of the array has to equal the batch_size.
         Default to None, in which case sequences in the batch are assumed to
         have the same length, which is inferred from inputs.
+      time_major: The shape format of the `inputs` and `outputs` Tensors. If
+        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
+        If false, these Tensors must be shaped ['batch_size', 'max_time',
+        'depth']. By default this function accepts input and emits output in
+        time-major form. This param is only effective when 'sequence_lengths' is
+        used.
+
     Returns:
       output: the output sequence.
       output_h: the final state for h.
@@ -1566,6 +1632,7 @@ class _CudnnRNN(object):
         is_training,
         self._rnn_mode,
         sequence_lengths=sequence_lengths,
+        time_major=time_major,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
@@ -1666,14 +1733,17 @@ class CudnnLSTM(_CudnnRNN):
                input_c,
                params,
                sequence_lengths=None,
+               time_major=True,
                is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
-      input_data: the input sequence to the LSTM model. A Tensor of shape [?,
-        batch_size, input_size].
-      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-        batch_size, num_units].
+      input_data: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+      input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
       input_c: the initial hidden state for c. A Tensor of the same shape as
         input_h.
       params: the parameter buffer created for this model.
@@ -1681,6 +1751,12 @@ class CudnnLSTM(_CudnnRNN):
         lengths in a batch. The size of the array has to equal the batch_size.
         Default to None, in which case sequences in the batch are assumed to
         have the same length, which is inferred from inputs.
+      time_major: The shape format of the `inputs` and `outputs` Tensors. If
+        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
+        If false, these Tensors must be shaped ['batch_size', 'max_time',
+        'depth']. By default this function accepts input and emits output in
+        time-major form. This param is only effective when 'sequence_lengths'
+        is used.
       is_training: whether this operation will be used in training or inference.
     Returns:
       output: the output sequence.
@@ -1693,6 +1769,7 @@ class CudnnLSTM(_CudnnRNN):
         input_c,
         params,
         sequence_lengths=sequence_lengths,
+        time_major=time_major,
         is_training=is_training)
     return (output, output_h, output_c)
 
@@ -1752,19 +1829,28 @@ class _CudnnRNNNoInputC(_CudnnRNN):
                input_h,
                params,
                sequence_lengths=None,
+               time_major=True,
                is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
-      input_data: the input sequence to the RNN model. A Tensor of shape [?,
-        batch_size, input_size].
-      input_h: the initial hidden state for h. A Tensor of shape [num_layers,
-        batch_size, num_units].
+      input_data: the input sequence to the RNN model. If `time_major` is True
+        (default), the Tensor shape is [max_time, batch_size, input_size]. If
+        `time_major` is False, the shape is [batch_size, max_time, input_size].
+      input_h: the initial hidden state for h. If `time_major` is True
+        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
+        `time_major` is False, the shape is [batch_size, num_layers, num_units].
       params: the parameter buffer created for this model.
       sequence_lengths: an int32 array representing the variable sequence
         lengths in a batch. The size of the array has to equal the batch_size.
         Default to None, in which case sequences in the batch are assumed to
         have the same length, which is inferred from inputs.
+      time_major: The shape format of the `inputs` and `outputs` Tensors. If
+        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
+        If false, these Tensors must be shaped ['batch_size', 'max_time',
+        'depth']. By default this function accepts input and emits output in
+        time-major form. This param is only effective when 'sequence_lengths'
+        is used.
       is_training: whether this operation will be used in training or inference.
     Returns:
       output: the output sequence.
@@ -1777,6 +1863,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         is_training,
         self._rnn_mode,
         sequence_lengths=sequence_lengths,
+        time_major=time_major,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 6c5f8c6b00975b3fba041271309a93cecd9f5057..077571fcd2091b3b7216c57627a11989f3db1fdf 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -25,11 +25,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class AssertElementShapeTest(test_base.DatasetTestBase):
 
   def test_assert_element_shape(self):
@@ -41,10 +43,12 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(5).map(create_dataset)
     expected_shapes = (tensor_shape.TensorShape(2),
                        tensor_shape.TensorShape((3, 4)))
-    self.assertEqual(expected_shapes, dataset.output_shapes)
+    self.assertEqual(expected_shapes,
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
+    self.assertEqual(expected_shapes,
+                     dataset_ops.get_legacy_output_shapes(result))
 
     iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
@@ -81,12 +85,14 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
                       tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
+    self.assertEqual(unknown_shapes,
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     expected_shapes = (tensor_shape.TensorShape(2),
                        tensor_shape.TensorShape((3, 4)))
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
+    self.assertEqual(expected_shapes,
+                     dataset_ops.get_legacy_output_shapes(result))
 
     iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
@@ -111,7 +117,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
                       tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
+    self.assertEqual(unknown_shapes,
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((3, 10)))
@@ -139,7 +146,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     # Partial shapes are merged with actual shapes:
     actual_shapes = (tensor_shape.TensorShape(2),
                      tensor_shape.TensorShape((3, 4)))
-    self.assertEqual(actual_shapes, result.output_shapes)
+    self.assertEqual(actual_shapes,
+                     dataset_ops.get_legacy_output_shapes(result))
 
     iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
@@ -176,12 +184,14 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
                       tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
+    self.assertEqual(unknown_shapes,
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     expected_shapes = (tensor_shape.TensorShape(2),
                        tensor_shape.TensorShape((None, 4)))
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
-    self.assertEqual(expected_shapes, result.output_shapes)
+    self.assertEqual(expected_shapes,
+                     dataset_ops.get_legacy_output_shapes(result))
 
     iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
@@ -206,7 +216,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
     unknown_shapes = (tensor_shape.TensorShape(None),
                       tensor_shape.TensorShape(None))
-    self.assertEqual(unknown_shapes, dataset.output_shapes)
+    self.assertEqual(unknown_shapes,
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((None, 10)))
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index b9840b1ff1a3df5a05db0e64f436637220f49f80..220f9934b67d1d2a97f6c0fd4ba7779f011e1b09 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -27,12 +27,14 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 prefix_path = "tensorflow/core/lib"
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class LMDBDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
index e7281d531870c75c638b5c48fa3fc6dc606a3623..78019fcc7d810da444f1407f3885d54e76a741c6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
@@ -25,10 +25,12 @@ from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index 2527706709fae8e459aca3489324d4db3c784be6..95cf659a84ba560fb9e06ad47477dd69b17cae3a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -26,11 +26,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -230,7 +232,7 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=array_ops.expand_dims(
               math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
+          values=array_ops.fill([math_ops.cast(i, dtypes.int32)], i),
           dense_shape=[i])
 
     iterator = dataset_ops.make_initializable_iterator(
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 8c60459ca81cd7a7e08d90339011c54275ea9c0b..f8bb942c0a54d0892f382b1779ff830ab04b8258 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.util import deprecation
 
@@ -215,14 +216,14 @@ def assert_element_shape(expected_shapes):
     return nest.pack_sequence_as(elements, checked_tensors)
 
   def _apply_fn(dataset):
-    output_shapes = _merge_output_shapes(dataset.output_shapes,
-                                         expected_shapes)
+    output_shapes = _merge_output_shapes(
+        dataset_ops.get_legacy_output_shapes(dataset), expected_shapes)
     # pylint: disable=protected-access
     return batching._RestructuredDataset(
         dataset.map(_check_shape),
-        dataset.output_types,
+        dataset_ops.get_legacy_output_types(dataset),
         output_shapes=output_shapes,
-        output_classes=dataset.output_classes)
+        output_classes=dataset_ops.get_legacy_output_classes(dataset))
 
   return _apply_fn
 
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index 6708e01d08135a132b797e317cd2a241c3428f40..b3c2c984a9d9920cd1501bd1612757b23e92b7de 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
@@ -38,9 +37,7 @@ class _SlideDataset(dataset_ops.UnaryDataset):
     self._window_shift = ops.convert_to_tensor(
         window_shift, dtype=dtypes.int64, name="window_shift")
 
-    input_structure = structure.convert_legacy_structure(
-        input_dataset.output_types, input_dataset.output_shapes,
-        input_dataset.output_classes)
+    input_structure = dataset_ops.get_structure(input_dataset)
     self._structure = input_structure._batch(None)  # pylint: disable=protected-access
     variant_tensor = ged_ops.experimental_sliding_window_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -59,7 +56,7 @@ class _SlideDataset(dataset_ops.UnaryDataset):
     None, "stride is deprecated, use window_shift instead", "stride")
 @deprecation.deprecated(
     None, "Use `tf.data.Dataset.window(size=window_size, shift=window_shift, "
-    "stride=window_stride).flat_map(lambda x: x.batch(window.size))` "
+    "stride=window_stride).flat_map(lambda x: x.batch(window_size))` "
     "instead.")
 def sliding_window_batch(window_size,
                          stride=None,
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 6ae3ec7fb0892db5434d2364064ade574dc21e38..4260cfbc40d416ea654dfcd661b358db2b81db32 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -15,16 +15,26 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-# TODO(priyag): Figure out testonly issues that are preventing us from
-# including our tests in pip for now.
+py_library(
+    name = "distribute_test_lib_pip",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":combinations",
+        ":keras_correctness_test_lib",
+        ":keras_test_lib",
+        ":multi_worker_test_base",
+        ":single_loss_example",
+        ":strategy_test_lib",
+    ],
+)
 
-cuda_py_test(
+distribute_py_test(
     name = "values_test",
     srcs = ["values_test.py"],
-    additional_deps = [
+    main = "values_test.py",
+    deps = [
         ":combinations",
         ":mirrored_strategy",
-        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -37,9 +47,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
-    ],
-    tags = [
-        "no_pip",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -51,18 +59,13 @@ cuda_py_test(
         ":mirrored_strategy",
         ":multi_worker_test_base",
         "@absl_py//absl/testing:parameterized",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 py_library(
@@ -117,7 +120,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -126,17 +128,17 @@ py_library(
     srcs = ["one_device_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/distribute:distribute_lib",
-        "//tensorflow/python/distribute:input_lib",
-        "//tensorflow/python/distribute:numpy_dataset",
-        "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
+        "//tensorflow/python/distribute:one_device_strategy",
+    ],
+)
+
+cuda_py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    additional_deps = [
+        ":strategy_test_lib",
+        ":combinations",
+        "//tensorflow/python/eager:test",
     ],
 )
 
@@ -145,39 +147,34 @@ py_library(
     srcs = ["collective_all_reduce_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":mirrored_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:collective_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/distribute:cross_device_ops",
-        "//tensorflow/python/distribute:cross_device_utils",
-        "//tensorflow/python/distribute:input_lib",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/distribute:numpy_dataset",
-        "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
     ],
 )
 
 py_library(
     name = "strategy_test_lib",
-    testonly = 1,
     srcs = ["strategy_test_lib.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients_impl",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -187,15 +184,12 @@ py_library(
 
 py_library(
     name = "combinations",
-    testonly = 1,
     srcs = ["combinations.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":mirrored_strategy",
         ":one_device_strategy",
+        ":parameter_server_strategy",
         ":tpu_strategy",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/optimizer_v2:training",
@@ -204,6 +198,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras/optimizer_v2",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -211,30 +206,12 @@ py_library(
 py_test(
     name = "combinations_test",
     srcs = ["combinations_test.py"],
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":combinations",
         "//tensorflow/python/eager:test",
     ],
 )
 
-py_test(
-    name = "one_device_strategy_test",
-    srcs = ["one_device_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":one_device_strategy",
-        ":strategy_test_lib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 # TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
@@ -260,18 +237,13 @@ cuda_py_test(
     tags = [
         "guitar",
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "multi_worker_test_base",
-    testonly = 1,
     srcs = ["multi_worker_test_base.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -283,6 +255,33 @@ py_library(
     ],
 )
 
+cuda_py_test(
+    name = "keras_multi_worker_test",
+    srcs = ["keras_multi_worker_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy",
+        "//tensorflow/contrib/distribute/python:combinations",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:multi_worker_test_base",
+        "//tensorflow/contrib/distribute/python:parameter_server_strategy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:distribute_config",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/keras:engine",
+    ],
+    shard_count = 3,
+    tags = [
+        # TODO(b/124344198): Add "multi_and_single_gpu",
+    ],
+)
+
 py_library(
     name = "step_fn",
     srcs = ["step_fn.py"],
@@ -298,18 +297,8 @@ py_library(
     srcs = ["tpu_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":one_device_strategy",
         "//tensorflow/contrib/tpu:tpu_lib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python:util",
-        "//tensorflow/python/distribute:input_lib",
-        "//tensorflow/python/distribute:numpy_dataset",
-        "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute:tpu_strategy",
     ],
 )
 
@@ -340,7 +329,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -350,7 +338,6 @@ distribute_py_test(
     main = "minimize_loss_test.py",
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
     deps = [
         ":combinations",
@@ -384,9 +371,6 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
@@ -404,7 +388,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -427,7 +410,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -438,10 +420,10 @@ cuda_py_test(
     additional_deps = [
         ":keras_test_lib",
     ],
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -471,7 +453,6 @@ cuda_py_test(
     shard_count = 48,
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
         # TODO(b/118768923): Re-enable {a,m,t}san test.
         "noasan",
         "nomsan",
@@ -499,7 +480,6 @@ distribute_py_test(
     main = "step_fn_test.py",
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
     deps = [
         ":combinations",
@@ -529,10 +509,10 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":monitor",
-        ":one_device_strategy",
         ":single_loss_example",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python/distribute:one_device_strategy",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python:framework_ops",
@@ -540,7 +520,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -557,15 +536,13 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
     name = "cross_device_ops_test",
     srcs = ["cross_device_ops_test.py"],
     additional_deps = [
+        ":collective_all_reduce_strategy",
         ":combinations",
         ":multi_worker_test_base",
         ":mirrored_strategy",
@@ -581,16 +558,15 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "keras_test_lib",
-    testonly = 1,
     srcs = [
         "keras_backward_compat_test.py",
         "keras_test.py",
+        "keras_utils_test.py",
     ],
     deps = [
         ":combinations",
@@ -609,16 +585,33 @@ py_library(
 distribute_py_test(
     name = "keras_test",
     srcs = ["keras_test.py"],
+    full_precision = True,
     main = "keras_test.py",
-    shard_count = 16,
+    shard_count = 32,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883)
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_utils_test",
+    srcs = ["keras_utils_test.py"],
+    full_precision = True,
+    main = "keras_utils_test.py",
+    shard_count = 32,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
     deps = [
+        ":keras_test",
         ":keras_test_lib",
     ],
 )
@@ -629,11 +622,9 @@ distribute_py_test(
     srcs = ["keras_backward_compat_test.py"],
     full_precision = True,
     main = "keras_backward_compat_test.py",
-    shard_count = 16,
+    shard_count = 31,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
@@ -644,13 +635,13 @@ distribute_py_test(
 
 py_library(
     name = "keras_correctness_test_lib",
-    testonly = 1,
     srcs = [
         "keras_correctness_test_base.py",
         "keras_dnn_correctness_test.py",
         "keras_embedding_model_correctness_test.py",
         "keras_image_model_correctness_test.py",
         "keras_lstm_model_correctness_test.py",
+        "keras_stateful_lstm_model_correctness_test.py",
     ],
     deps = [
         ":combinations",
@@ -677,8 +668,6 @@ distribute_py_test(
     shard_count = 19,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
@@ -698,8 +687,6 @@ distribute_py_test(
     shard_count = 31,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
@@ -719,8 +706,6 @@ distribute_py_test(
     shard_count = 31,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
@@ -740,7 +725,25 @@ distribute_py_test(
     shard_count = 31,
     tags = [
         "multi_and_single_gpu",
-        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_stateful_lstm_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_stateful_lstm_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_stateful_lstm_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
         "no_pip",
         "no_windows_gpu",
         "notsan",
@@ -756,7 +759,6 @@ distribute_py_test(
     main = "metrics_v1_test.py",
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
     deps = [
         ":combinations",
@@ -783,7 +785,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -794,7 +795,6 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:checkpoint_utils_test",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -802,7 +802,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -817,12 +816,11 @@ tf_xla_py_test(
     ],
     tags = [
         "no_oss",
-        "no_pip",
     ],
     deps = [
         ":tpu_strategy",
         "//tensorflow/compiler/tests:xla_test",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index 3ef8b9574a36730dcc1d8fd42b6c7f364d84bbed..79369fc6b93b4491c9744653d8d64c5c8a4de30d 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -25,6 +25,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
@@ -33,12 +34,34 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import checkpoint_utils_test
+from tensorflow.python.training import saver as saver_lib
+
+
+def _create_checkpoints(sess, checkpoint_dir):
+  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
+  checkpoint_state_name = "checkpoint"
+  v1 = variable_scope.get_variable("var1", [1, 10])
+  v2 = variable_scope.get_variable("var2", [10, 10])
+  sess.run(variables.global_variables_initializer())
+  v1_value, v2_value = sess.run([v1, v2])
+  saver = saver_lib.Saver()
+  saver.save(
+      sess,
+      checkpoint_prefix,
+      global_step=0,
+      latest_filename=checkpoint_state_name)
+  return v1_value, v2_value
 
 
 class CheckpointUtilsWithDistributionStrategyTest(
     test.TestCase, parameterized.TestCase):
 
+  def _get_test_object(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.cached_session() as session:
+      v1, v2 = _create_checkpoints(session, checkpoint_dir)
+    return checkpoint_dir, v1, v2
+
   @combinations.generate(combinations.combine(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
@@ -49,10 +72,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
       in_replica_mode=[True, False],
       mode=["graph"]))
   def testInitFromCheckpoint(self, distribution, in_replica_mode):
-    checkpoint_dir = self.get_temp_dir()
-    with self.cached_session() as session:
-      v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
-          session, checkpoint_dir)
+    checkpoint_dir, v1_value, v2_value = self._get_test_object()
 
     def init_and_verify(g):
       v1 = variable_scope.get_variable("new_var1", [1, 10])
@@ -75,6 +95,39 @@ class CheckpointUtilsWithDistributionStrategyTest(
       else:
         init_and_verify(g)
 
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.default_strategy, combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
+          ],
+          in_replica_mode=[True, False],
+          mode=["graph"]))
+  def testInitFromDifferentNameObject(self, distribution, in_replica_mode):
+    checkpoint_dir, v1_value, _ = self._get_test_object()
+
+    def init_and_verify(g):
+      v1 = variable_scope.get_variable("new_var1", [1, 10])
+      # Use string add to create new object in each replica
+      prefix = "new_"
+      suffix = "var1"
+      new_var1 = prefix + suffix
+      checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
+          "var1": new_var1,
+      })
+      with self.test_session(graph=g) as session:
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(v1_value, self.evaluate(v1))
+
+    with ops.Graph().as_default() as g, distribution.scope():
+      if in_replica_mode:
+        distribution.extended.call_for_each_replica(init_and_verify, [g])
+      else:
+        init_and_verify(g)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/checkpointing_test.py b/tensorflow/contrib/distribute/python/checkpointing_test.py
index aa5b9f57b8a5bc12ee94399ec1fc5a55177a5b5d..eadf7233f2ae5ee50b71836ebfcc895163124ac2 100644
--- a/tensorflow/contrib/distribute/python/checkpointing_test.py
+++ b/tensorflow/contrib/distribute/python/checkpointing_test.py
@@ -30,15 +30,15 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import adam as adam_v1
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -49,8 +49,8 @@ class Subclassed(training.Model):
     super(Subclassed, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -76,7 +76,7 @@ class TrainingCheckpointTests(xla_test.XLATestCase):
       with strategy.scope():
         model = Subclassed()
         optimizer = adam_v1.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             optimizer_step=training_util.get_or_create_global_step())
         root.restore(checkpoint_management.latest_checkpoint(
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index eee07543251321ae0c9eef57851431cf97c65643..d4f76e3e7b937798c978f740e080f44a4a1cb418 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,30 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import collective_all_reduce_strategy
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import cross_device_utils
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import input_lib
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.distribute import numpy_dataset
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import collective_ops
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 
 
 # TODO(yuefengz): support in-graph replication.
 class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
+  *** contrib version ***
+
   It is similar to the MirroredStrategy but it uses collective ops for
   reduction.
 
@@ -53,331 +42,45 @@ class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   distributed environment.
   """
 
-  def __init__(self, num_gpus_per_worker=0):
+  def __init__(self,
+               num_gpus_per_worker=0,
+               communication=cross_device_ops_lib.CollectiveCommunication.AUTO):
     """Initializes the object.
 
     Args:
       num_gpus_per_worker: number of local GPUs or GPUs per worker, the default
         is 0 meaning CPU only.
+      communication: optional Enum of type
+        `distribute.experimental.CollectiveCommunication`.  This provides a way
+        for the user to override the choice of collective op communication.
+        Possible values include `AUTO`, `RING`, and `NCCL`.
     """
     super(CollectiveAllReduceStrategy, self).__init__(
-        CollectiveAllReduceExtended(self, num_gpus_per_worker))
+        CollectiveAllReduceExtended(
+            self,
+            num_gpus_per_worker=num_gpus_per_worker,
+            communication=communication))
 
 
-class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+class CollectiveAllReduceExtended(
+    collective_all_reduce_strategy.CollectiveAllReduceExtended):
   """Implementation of CollectiveAllReduceStrategy."""
 
-  def __init__(self, container_strategy, num_gpus_per_worker):
-    distribute_lib.DistributionStrategyExtended.__init__(
-        self, container_strategy)
-    self._cross_device_ops = None
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local_worker(num_gpus_per_worker)
-    assert isinstance(self._get_cross_device_ops(),
-                      cross_device_ops_lib.CollectiveAllReduce)
-
-  def _initialize_local_worker(self, num_gpus_per_worker):
-    """Initializes the object for local training."""
-    self._is_chief = True
-    self._num_workers = 1
-
-    if num_gpus_per_worker:
-      local_devices = tuple(
-          "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
-      )
-    else:
-      local_devices = ("/device:CPU:0",)
-    self._worker_device = device_util.canonicalize("/device:CPU:0")
-    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
-
-    self._collective_keys = cross_device_utils.CollectiveKeys()
-    self._initialize_local(local_devices)
-    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
-    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
-        num_workers=self._num_workers,
-        num_gpus_per_worker=num_gpus_per_worker,
-        collective_keys=self._collective_keys)
-
-    self._cluster_spec = None
-    self._task_type = None
-    self._task_id = None
-
-    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
-                 local_devices)
-
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
-    """Initializes the object for multi-worker training."""
-    if task_type is None or task_id is None:
-      raise ValueError("When `cluster_spec` is given, you must also specify "
-                       "`task_type` and `task_id`")
-    if task_type not in ("chief", "worker"):
-      raise ValueError(
-          "Unrecognized task_type: %r, valid task types are: \"chief\", "
-          "\"worker\"." % task_type)
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
-    if not self._num_workers:
-      raise ValueError("No `worker` or `chief` tasks can be found in "
-                       "`cluster_spec`.")
-
-    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
-                                                task_id)
-
-    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
-    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
-    if num_gpus_per_worker:
-      local_devices = tuple(
-          "%s/device:GPU:%d" % (self._worker_device, i)
-          for i in range(num_gpus_per_worker)
-      )
-    else:
-      local_devices = (self._worker_device,)
-
-    self._collective_keys = cross_device_utils.CollectiveKeys()
-    self._initialize_local(local_devices)
-    self._input_workers = input_lib.InputWorkers(
-        self._device_map, [(self._worker_device, self.worker_devices)])
-    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
-        num_workers=self._num_workers,
-        num_gpus_per_worker=num_gpus_per_worker,
-        collective_keys=self._collective_keys)
-
-    # Add a default device so that ops without specified devices will not end up
-    # on other workers.
-    self._default_device = "/job:%s/task:%d" % (task_type, task_id)
-
-    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._task_type = task_type
-    self._task_id = task_id
-
-    logging.info(
-        "Multi-worker CollectiveAllReduceStrategy with "
-        "cluster_spec = %r, task_type = %r, task_id = %r, "
-        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
-        task_type, task_id, self._num_workers, local_devices)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    colocate_with = kwargs.pop("colocate_with", None)
-    if colocate_with is None:
-      device_map = self._device_map
-      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
-    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
-      with ops.device(colocate_with.device):
-        return next_creator(*args, **kwargs)
-    else:
-      device_map = colocate_with.device_map
-      logical_device = colocate_with.logical_device
-
-    def _real_mirrored_creator(devices, *args, **kwargs):
-      """Creates one MirroredVariable on the current worker."""
-      unique_var_name = ops.get_default_graph().unique_name(
-          kwargs["name"], mark_as_used=False).rstrip("/")
-      # pylint: disable=protected-access
-      collective_instance_key = self._collective_keys.get_instance_key(
-          key_id=unique_var_name)
-      # Only the first device participles in the broadcast of initial values.
-      group_key = self._collective_keys.get_group_key([devices[0]])
-      group_size = self._num_workers
-      if "initial_value" not in kwargs:
-        raise ValueError("Initial value must be specified.")
-      initial_value = kwargs["initial_value"]
-      if callable(initial_value):
-        initial_value_fn = initial_value
-      else:
-        initial_value_fn = lambda: initial_value
-
-      value_list = []
-      for i, d in enumerate(devices):
-        with ops.init_scope(), ops.device(d):
-          if i == 0:
-            # The initial value fn makes sure variables all initialized to
-            # same values. The first device of the chief worker will send their
-            # variable values to other workers.
-            def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
-              with ops.device(device):
-                initial_value = initial_value_fn()
-                assert not callable(initial_value)
-                initial_value = ops.convert_to_tensor(initial_value)
-
-                assert index == 0, index
-                if self._num_workers > 1:
-                  if self._is_chief:
-                    bcast_send = collective_ops.broadcast_send(
-                        initial_value, initial_value.shape, initial_value.dtype,
-                        group_size, group_key, collective_instance_key)
-                    with ops.control_dependencies([bcast_send]):
-                      return array_ops.identity(initial_value)
-                  else:
-                    return collective_ops.broadcast_recv(
-                        initial_value.shape, initial_value.dtype, group_size,
-                        group_key, collective_instance_key)
-                return initial_value
-          else:
-            # Give replicas meaningful distinct names:
-            var0name = value_list[0].name.split(":")[0]
-            # We append a / to variable names created on replicas with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-
-            # Variables on non-first replica get initial values from the
-            # variables created on the first device of each worker.
-            def _overridden_initial_value_fn(device=d, index=i):
-              assert index > 0
-              with ops.device(device):
-                if context.executing_eagerly():
-                  return array_ops.identity(value_list[0].value())
-                else:
-                  return array_ops.identity(value_list[0].initial_value)
-
-          kwargs["initial_value"] = _overridden_initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            # Don't record operations (e.g. other variable reads) during
-            # variable creation.
-            with tape.stop_recording():
-              v = next_creator(*args, **kwargs)
-
-          if i == 0:
-            actual_var_name = v.name.split(":")[0]
-            assert unique_var_name == actual_var_name, "%r vs %r" % (
-                unique_var_name, actual_var_name)
-          assert not isinstance(v, values.DistributedVariable)
-          value_list.append(v)
-      return value_list
-
-    # pylint: disable=protected-access
-    return mirrored_strategy._create_mirrored_variable(
-        self._container_strategy(), device_map, logical_device,
-        _real_mirrored_creator, *args, **kwargs)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    # TODO(yuefengz): shard the dataset.
-    worker_index = 0
-    return input_lib.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, worker_index,
-        prefetch_on_device=True)
-
-  def _make_dataset_iterator(self, dataset):
-    return input_lib.DatasetIterator(dataset, self._input_workers,
-                                     self._num_replicas_in_sync)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    """Distributes the dataset to each local GPU."""
-    if self._cluster_spec is None:
-      input_pipeline_id = 0
-    else:
-      input_pipeline_id = multi_worker_util.id_in_cluster(
-          self._cluster_spec, self._task_type, self._task_id)
-    input_context = distribute_lib.InputContext(
-        num_input_pipelines=self._num_workers,
-        input_pipeline_id=input_pipeline_id,
-        num_replicas_in_sync=self._num_replicas_in_sync)
-
-    return input_lib.InputFunctionIterator(
-        input_fn, self._input_workers, [input_context])
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    """Configures the object.
-
-    Args:
-      session_config: a `tf.ConfigProto`
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type, such as "worker".
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `task_type` is not in the `cluster_spec`.
-    """
-    if not self._cluster_spec and cluster_spec:
-      # If a `cluster_spec` is already passed in, do nothing here.
-      # TODO(yuefengz): check `cluster_spec` is the same if this object has
-      # already been initialized with a `cluster_spec`.
-      self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
-                                    task_type, task_id)
-      assert isinstance(self._get_cross_device_ops(),
-                        cross_device_ops_lib.CollectiveAllReduce)
-
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    # Enable the scoped allocator optimization for CollectiveOps.  This
-    # optimization converts many small all-reduces into fewer larger
-    # all-reduces.
-    rewrite_options = updated_config.graph_options.rewrite_options
-    rewrite_options.scoped_allocator_optimization = (
-        rewriter_config_pb2.RewriterConfig.ON)
-    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
-    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
-    # clear and then append.
-    del rewrite_options.scoped_allocator_opts.enable_op[:]
-    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
-
-    if not self._cluster_spec:
-      return updated_config
-
-    assert self._task_type
-    assert self._task_id is not None
-
-    # Collective group leader is needed for collective ops to coordinate
-    # workers.
-    if "chief" in self._cluster_spec.jobs:
-      updated_config.experimental.collective_group_leader = (
-          "/job:chief/replica:0/task:0")
-    else:
-      if "worker" not in self._cluster_spec.jobs:
-        raise ValueError(
-            "You must have `chief` or `worker` jobs in the `cluster_spec`.")
-      updated_config.experimental.collective_group_leader = (
-          "/job:worker/replica:0/task:0")
-
-    # The device filters prevent communication between workers.
-    del updated_config.device_filters[:]
-    updated_config.device_filters.append(
-        "/job:%s/task:%d" % (self._task_type, self._task_id))
-
-    return updated_config
-
-  @property
-  def experimental_between_graph(self):
-    return True
-
-  @property
-  def experimental_should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return self._is_chief
-
-  @property
-  def should_save_summary(self):
-    return self._is_chief
-
-  @property
-  def _num_replicas_in_sync(self):
-    return len(self.worker_devices) * self._num_workers
-
-  # TODO(priyag): Delete this once all strategies use global batch size.
-  @property
-  def _global_batch_size(self):
-    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
-
-    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
-    batching.
-
-    Returns:
-      Boolean.
-    """
-    return True
+  def __init__(self,
+               container_strategy,
+               num_gpus_per_worker,
+               communication):
+    # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
+    # the constructor's interface to allow customized cluster resolver. Use
+    # SimpleClusterResolver to override num_accelerators.
+    tfconfig = TFConfigClusterResolver()
+    cluster_resolver = SimpleClusterResolver(
+        cluster_spec=tfconfig.cluster_spec(),
+        task_type=tfconfig.task_type,
+        task_id=tfconfig.task_id,
+        num_accelerators={"GPU": num_gpus_per_worker},
+        rpc_layer=tfconfig.rpc_layer)
+    super(CollectiveAllReduceExtended, self).__init__(
+        container_strategy,
+        communication=communication,
+        cluster_resolver=cluster_resolver)
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 9b6236fd9f89ec30c1234c846930a05d9c32e99d..2dab18791ca3e67c7b2494eaf90295b9f85c0cbb 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -29,9 +29,14 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import collective_all_reduce_strategy as core_collective_all_reduce_strategy
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -49,6 +54,57 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 from tensorflow.python.training import training_util
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+class MockCollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
+  """Mock the strategy to allow cluster resolver as an argument."""
+
+  def __init__(self, cluster_resolver):
+    super(MockCollectiveAllReduceStrategy, self).__init__(
+        core_collective_all_reduce_strategy.CollectiveAllReduceExtended(
+            self,
+            communication=cross_device_ops_lib.CollectiveCommunication.AUTO,
+            cluster_resolver=cluster_resolver))
+
+
+def create_test_objects(cluster_spec=None,
+                        task_type=None,
+                        task_id=None,
+                        num_gpus=None,
+                        use_core_strategy=False):
+  sess_config = config_pb2.ConfigProto()
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  if use_core_strategy:
+    if cluster_spec and task_type and task_id is not None:
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators={'GPU': num_gpus})
+      target = 'grpc://' + cluster_spec[task_type][task_id]
+    else:
+      cluster_resolver = SimpleClusterResolver(
+          ClusterSpec({}), num_accelerators={'GPU': num_gpus})
+      target = ''
+
+    strategy = MockCollectiveAllReduceStrategy(cluster_resolver)
+    sess_config = strategy.update_config_proto(sess_config)
+  else:
+    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+        num_gpus_per_worker=num_gpus)
+    if task_type and task_id is not None:
+      strategy.configure(
+          session_config=sess_config,
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id)
+      target = 'grpc://' + cluster_spec[task_type][task_id]
+    else:
+      target = ''
+
+  return strategy, target, sess_config
 
 
 class CollectiveAllReduceStrategyTestBase(
@@ -64,16 +120,18 @@ class CollectiveAllReduceStrategyTestBase(
     CollectiveAllReduceStrategyTestBase.collective_key_base += 100000
     super(CollectiveAllReduceStrategyTestBase, self).setUp()
 
-  def _get_test_object(self, task_type, task_id, num_gpus=0):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=num_gpus)
-    session_config = config_pb2.ConfigProto()
-    if task_type and task_id is not None:
-      distribution.configure(
-          session_config=session_config,
-          cluster_spec=self._cluster_spec,
-          task_type=task_type,
-          task_id=task_id)
+  def _get_test_object(self,
+                       task_type,
+                       task_id,
+                       num_gpus=0,
+                       use_core_strategy=False):
+    strategy, target, session_config = create_test_objects(
+        cluster_spec=self._cluster_spec,
+        task_type=task_type,
+        task_id=task_id,
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
+
     collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
@@ -81,16 +139,16 @@ class CollectiveAllReduceStrategyTestBase(
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
-    distribution.extended._collective_keys = collective_keys
-    distribution.extended._cross_device_ops._collective_keys = (
-        collective_keys)
-    if task_type and task_id is not None:
-      return distribution, 'grpc://' + self._cluster_spec[task_type][
-          task_id], session_config
-    else:
-      return distribution, '', session_config
+    strategy.extended._collective_keys = collective_keys
+    strategy.extended._cross_device_ops._collective_keys = (collective_keys)
 
-  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
+    return strategy, target, session_config
+
+  def _test_minimize_loss_graph(self,
+                                task_type,
+                                task_id,
+                                num_gpus,
+                                use_core_strategy=False):
     d, master_target, config = self._get_test_object(task_type, task_id,
                                                      num_gpus)
     with ops.Graph().as_default(), \
@@ -118,7 +176,7 @@ class CollectiveAllReduceStrategyTestBase(
       def update(v, g):
         return v.assign_sub(0.05 * g, use_locking=True)
 
-      one = d.broadcast(constant_op.constant([[1.]]))
+      one = constant_op.constant([[1.]])
 
       def step():
         """Perform one optimization step."""
@@ -158,7 +216,11 @@ class CollectiveAllReduceStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  def _test_complex_model(self, task_type, task_id, num_gpus):
+  def _test_complex_model(self,
+                          task_type,
+                          task_id,
+                          num_gpus,
+                          use_core_strategy=False):
     d, master_target, config = self._get_test_object(task_type, task_id,
                                                      num_gpus)
 
@@ -204,13 +266,17 @@ class CollectiveAllReduceStrategyTestBase(
                              target=master_target) as sess:
       with d.scope():
         train_op = d.extended.call_for_each_replica(model_fn)
-        train_op = d.group(d.unwrap(train_op))
+        train_op = d.group(d.experimental_local_results(train_op))
 
       sess.run(variables.global_variables_initializer())
       sess.run(train_op)
       return True
 
-  def _test_variable_initialization(self, task_type, task_id, num_gpus):
+  def _test_variable_initialization(self,
+                                    task_type,
+                                    task_id,
+                                    num_gpus,
+                                    use_core_strategy=False):
     distribution, master_target, config = self._get_test_object(
         task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
@@ -228,7 +294,7 @@ class CollectiveAllReduceStrategyTestBase(
 
       x = distribution.extended.call_for_each_replica(model_fn)
       reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
-      x = distribution.unwrap(x)[0]
+      x = distribution.experimental_local_results(x)[0]
 
       sess.run(variables.global_variables_initializer())
 
@@ -239,8 +305,14 @@ class CollectiveAllReduceStrategyTestBase(
                                                        reduced_x_value)))
     return np.allclose(x_value, reduced_x_value, atol=1e-5)
 
-  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
-                              expected_values):
+  def _test_input_fn_iterator(self,
+                              task_type,
+                              task_id,
+                              num_gpus,
+                              input_fn,
+                              expected_values,
+                              test_reinitialize=True,
+                              use_core_strategy=False):
     distribution, master_target, config = self._get_test_object(
         task_type, task_id, num_gpus)
     devices = distribution.extended.worker_devices
@@ -263,13 +335,14 @@ class CollectiveAllReduceStrategyTestBase(
                   for r in range(len(devices))])
 
       # After re-initializing the iterator, should be able to iterate again.
-      sess.run(iterator.initialize())
+      if test_reinitialize:
+        sess.run(iterator.initialize())
 
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        computed_value = sess.run([values.select_replica(r, next_element)
-                                   for r in range(len(devices))])
-        self.assertEqual(expected_value, computed_value)
+        for expected_value in expected_values:
+          next_element = iterator.get_next()
+          computed_value = sess.run([values.select_replica(r, next_element)
+                                     for r in range(len(devices))])
+          self.assertEqual(expected_value, computed_value)
 
 
 class DistributedCollectiveAllReduceStrategyTest(
@@ -283,71 +356,116 @@ class DistributedCollectiveAllReduceStrategyTest(
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0)
 
-  def test_num_replicas_in_sync(self):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(cluster_spec=self._cluster_spec, task_type='worker',
-                           task_id=0)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def test_num_replicas_in_sync(self, use_core_strategy):
+    distribution, _, _ = create_test_objects(
+        cluster_spec=self._cluster_spec,
+        task_type='worker',
+        task_id=0,
+        num_gpus=2,
+        use_core_strategy=use_core_strategy)
     num_workers = len(self._cluster_spec.get('chief', []) +
                       self._cluster_spec.get('worker', []))
     self.assertEqual(2 * num_workers,
                      distribution.num_replicas_in_sync)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testVariableInitialization(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testVariableInitialization(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_variable_initialization,
         self._cluster_spec,
-        num_gpus=num_gpus)
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testComplexModel(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testComplexModel(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
-        self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
+        self._test_complex_model,
+        self._cluster_spec,
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(yuefengz): Update how we use num_gpus and required_gpus
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testMakeInputFnIterator(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_dataset=[True, False],
+          use_core_strategy=[True, False]))
+  def DISABLED_testMakeInputFnIterator(self, num_gpus, use_dataset,
+                                       use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     # We use CPU as the device when num_gpus = 0
     devices_per_worker = max(1, num_gpus)
     expected_values = [[i+j for j in range(devices_per_worker)]
                        for i in range(0, 100, devices_per_worker)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=3*devices_per_worker,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
-    self._test_input_fn_iterator('worker', 1, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        'worker',
+        1,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
 
-  def testUpdateConfigProto(self):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(
-        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProto(self, use_core_strategy):
+    strategy, _, _ = self._get_test_object(
+        task_type='worker',
+        task_id=1,
+        num_gpus=2,
+        use_core_strategy=use_core_strategy)
 
     config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
     rewrite_options = config_proto.graph_options.rewrite_options
     rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed')
 
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify group leader
     self.assertEqual('/job:worker/replica:0/task:0',
@@ -363,6 +481,24 @@ class DistributedCollectiveAllReduceStrategyTest(
     self.assertEqual(['CollectiveReduce'],
                      new_rewrite_options.scoped_allocator_opts.enable_op)
 
+  @combinations.generate(combinations.combine(mode=['eager']))
+  def testEnableCollectiveOps(self):
+    mock_called = [False]
+
+    # pylint: disable=dangerous-default-value
+    def mock_enable_collective_ops(server_def, mock_called=mock_called):
+      self.assertEqual('worker', server_def.job_name)
+      self.assertEqual(1, server_def.task_index)
+      self.assertEqual('grpc', server_def.protocol)
+      mock_called[0] = True
+
+    with test.mock.patch.object(context.context(), 'enable_collective_ops',
+                                mock_enable_collective_ops):
+      strategy, _, _ = self._get_test_object(
+          task_type='worker', task_id=1, num_gpus=2, use_core_strategy=True)
+    self.assertTrue(strategy.extended._std_server_started)
+    self.assertTrue(mock_called[0])
+
 
 class DistributedCollectiveAllReduceStrategyTestWithChief(
     CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
@@ -406,80 +542,127 @@ class LocalCollectiveAllReduceStrategy(
 
   @combinations.generate(
       combinations.combine(
-          mode=['graph', 'eager'], num_gpus=[2, 4], required_gpus=2))
-  def testMinimizeLoss(self, num_gpus):
+          mode=['graph', 'eager'],
+          num_gpus=[2, 4],
+          required_gpus=2,
+          use_core_strategy=[True, False]))
+  def testMinimizeLoss(self, num_gpus, use_core_strategy):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     if context.executing_eagerly():
-      strategy, _, _ = self._get_test_object(None, None, num_gpus)
+      strategy, _, _ = self._get_test_object(
+          None, None, num_gpus, use_core_strategy=use_core_strategy)
       self._test_minimize_loss_eager(strategy)
     else:
-      self._test_minimize_loss_graph(None, None, num_gpus)
+      self._test_minimize_loss_graph(
+          None, None, num_gpus, use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2))
-  def testComplexModel(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[2, 4],
+          required_gpus=2,
+          use_core_strategy=[True, False]))
+  def testComplexModel(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    self._test_complex_model(None, None, num_gpus)
+    self._test_complex_model(
+        None, None, num_gpus, use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph', 'eager'], required_gpus=2))
-  def testMakeInputFnIterator(self):
+      combinations.combine(
+          mode=['graph', 'eager'],
+          required_gpus=2,
+          use_dataset=[True, False],
+          use_core_strategy=[True, False]))
+  def DISABLED_testMakeInputFnIterator(self, use_dataset, use_core_strategy):
     num_gpus = 2
-    dataset_fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(5 * num_gpus)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     expected_values = [range(i, i + num_gpus) for i in range(0, 10, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    self._test_input_fn_iterator(None, None, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        None,
+        None,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
 
-  def testAllReduceSum(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSum(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_sum(distribution)
 
-  def testAllReduceSumGradients(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSumGradients(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_sum_gradients(distribution)
 
-  def testAllReduceSumGradientTape(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSumGradientTape(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_sum_gradient_tape(distribution)
 
-  def testAllReduceMean(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMean(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_mean(distribution)
 
-  def testAllReduceMeanGradients(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMeanGradients(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_mean_gradients(distribution)
 
-  def testAllReduceMeanGradientTape(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMeanGradientTape(self, use_core_strategy):
     if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
-    distribution, target, config = self._get_test_object(None, None, num_gpus=2)
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
     with self.cached_session(config=config, target=target):
       self._test_all_reduce_mean_gradient_tape(distribution)
 
-  def testNumpyIterator(self):
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testNumpyIterator(self, use_core_strategy):
     num_gpus = 2
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    strategy, _, _ = self._get_test_object(None, None, num_gpus)
+    strategy, _, _ = self._get_test_object(
+        None, None, num_gpus=num_gpus, use_core_strategy=use_core_strategy)
     self._test_numpy_iterator(strategy)
 
 
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 304762e69e383ae51b914eb78e84cb3571cb12ed..250339fad7afb456869568f454f21ff470f7bc9d 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -45,17 +45,23 @@ import types
 import unittest
 from absl.testing import parameterized
 import six
-
-from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
-from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
-from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
+from tensorflow.contrib.distribute.python import parameter_server_strategy
 from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import one_device_strategy as one_device_lib
+from tensorflow.python.distribute import tpu_strategy as tpu_lib
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_keras_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_keras_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_keras_v2
+from tensorflow.python.tpu import device_assignment as device_assignment_lib
+from tensorflow.python.tpu import tpu_strategy_util
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
@@ -226,7 +232,7 @@ def combine(**kwargs):
   if not kwargs:
     return [OrderedDict()]
 
-  sort_by_key = lambda k: k[0][0]
+  sort_by_key = lambda k: k[0]
   kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
   first = list(kwargs.items())[0]
 
@@ -321,11 +327,18 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
-def _get_tpu_strategy_creator(steps_per_run, **kwargs):
+def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs):
   def _create_tpu_strategy():
-    resolver = cluster_resolver.TPUClusterResolver("")
-    tpu_lib.initialize_tpu_system(resolver)
+    resolver = tpu_cluster_resolver.TPUClusterResolver("")
+    topology = tpu_strategy_util.initialize_tpu_system(resolver)
+    device_assignment = None
+    if use_single_core:
+      device_assignment = device_assignment_lib.DeviceAssignment(
+          topology, core_assignment=device_assignment_lib.
+          SINGLE_CORE_ASSIGNMENT)
+
     strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
+                                   device_assignment=device_assignment,
                                    **kwargs)
     return strategy
   return _create_tpu_strategy
@@ -339,20 +352,22 @@ default_strategy = NamedDistribution(
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
+one_device_strategy_gpu = NamedDistribution(
+    "OneDeviceGPU", lambda: one_device_lib.OneDeviceStrategy("/gpu:0"),
+    required_gpus=1)
 tpu_strategy = NamedDistribution(
     "TPU", _get_tpu_strategy_creator(steps_per_run=2),
     required_tpu=True)
 tpu_strategy_one_step = NamedDistribution(
     "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1),
     required_tpu=True)
-# TODO(b/122327153): Remove below two NamedDistributions.
-tpu_strategy_loop_on_device = NamedDistribution(
-    "TPULoopOnDevice", _get_tpu_strategy_creator(
-        steps_per_run=2, _disable_training_loop_on_host=True),
+tpu_strategy_one_core = NamedDistribution(
+    "TPUOneCore", _get_tpu_strategy_creator(
+        steps_per_run=2, use_single_core=True),
     required_tpu=True)
-tpu_strategy_one_step_loop_on_device = NamedDistribution(
-    "TPUOneStepLoopOnDevice", _get_tpu_strategy_creator(
-        steps_per_run=1, _disable_training_loop_on_host=True),
+tpu_strategy_one_step_one_core = NamedDistribution(
+    "TPUOneStepOneCore", _get_tpu_strategy_creator(
+        steps_per_run=1, use_single_core=True),
     required_tpu=True)
 
 mirrored_strategy_with_one_cpu = NamedDistribution(
@@ -385,6 +400,11 @@ core_mirrored_strategy_with_two_gpus = NamedDistribution(
     "CoreMirrored2GPUs",
     lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
     required_gpus=2)
+parameter_server_strategy_with_two_gpus = NamedDistribution(
+    "ParameterServer2GPUs",
+    lambda: parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2),
+    required_gpus=2)
 
 
 gradient_descent_optimizer_v1_fn = NamedObject(
@@ -404,10 +424,20 @@ gradient_descent_optimizer_v2_fn = NamedObject(
 adagrad_optimizer_v2_fn = NamedObject(
     "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))
 adam_optimizer_v2_fn = NamedObject(
-    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1.0))
 
 optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]
 
+gradient_descent_optimizer_keras_v2_fn = NamedObject(
+    "GradientDescentKerasV2",
+    lambda: gradient_descent_keras_v2.SGD(0.2))
+adagrad_optimizer_keras_v2_fn = NamedObject(
+    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
+adam_optimizer_keras_v2_fn = NamedObject(
+    "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0))
+rmsprop_optimizer_keras_v2_fn = NamedObject(
+    "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001))
+
 graph_and_eager_modes = ["graph", "eager"]
 
 
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
index 86aa48cea889c6c2ce169b18bcabb6d08890fbed..9f3deadbec98c4f66061ca29b4d29a74b8de40b1 100644
--- a/tensorflow/contrib/distribute/python/combinations_test.py
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -42,6 +42,14 @@ class TestingCombinationsTest(test.TestCase):
         "b": 3
     }], combinations.combine(a=[1, 2], b=[2, 3]))
 
+  def test_arguments_sorted(self):
+    self.assertEqual([
+        OrderedDict([("aa", 1), ("ab", 2)]),
+        OrderedDict([("aa", 1), ("ab", 3)]),
+        OrderedDict([("aa", 2), ("ab", 2)]),
+        OrderedDict([("aa", 2), ("ab", 3)])
+    ], combinations.combine(ab=[2, 3], aa=[1, 2]))
+
   def test_combine_single_parameter(self):
     self.assertEqual([{
         "a": 1,
diff --git a/tensorflow/contrib/distribute/python/cross_device_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
index 54cce2988383fcf5e063726948fbbf62c7094ce5..2b8e0197961ae37b67dc8958054a03e164242dcd 100644
--- a/tensorflow/contrib/distribute/python/cross_device_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -23,6 +23,7 @@ import itertools
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
@@ -204,15 +205,15 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
   reduction_to_one_combinations = combinations.combine(
       cross_device_ops=[
           combinations.NamedObject(
-              "DefaultReductionToOneDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              "DefaultReductionToOneDevice",
+              cross_device_ops_lib.ReductionToOneDevice()),
           combinations.NamedObject(
               "ReductionToCPUDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   reduce_to_device=_cpu_device)),
           combinations.NamedObject(
               "AccumulateNCrossDeviceOp",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   accumulation_fn=math_ops.accumulate_n)),
       ],
       distribution=[
@@ -228,20 +229,23 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
           combinations.NamedObject(
               "AllReduce",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopy",
-              cross_device_ops_lib.AllReduceCrossDeviceOps(
-                  "hierarchical_copy", 8, 0, 0)),
           combinations.NamedObject(
               "AllReduceNoGradientRepacking",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+          combinations.NamedObject("NcclAllReduce",
+                                   cross_device_ops_lib.NcclAllReduce()),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
           combinations.NamedObject(
               "HierarchicalCopyAggregateSmallTensors",
               cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 0, 100, 10))
       ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus,
-                    combinations.core_mirrored_strategy_with_two_gpus],
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus
+      ],
       mode=["graph", "eager"])
 
   @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
@@ -306,8 +310,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
       combinations.combine(
           cross_device_ops_instance=[
               combinations.NamedObject(
-                  "ReductionToOneDeviceCrossDeviceOps",
-                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+                  "ReductionToOneDevice",
+                  cross_device_ops_lib.ReductionToOneDevice()),
               combinations.NamedObject(
                   "AllReduceCrossDeviceOps",
                   cross_device_ops_lib.AllReduceCrossDeviceOps())
@@ -426,6 +430,9 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
       self._testReductionAndBroadcast(cross_device_ops, distribution)
 
 
+NUM_WORKERS = 3
+
+
 class MultiWorkerCollectiveAllReduceTest(
     multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
 
@@ -433,9 +440,9 @@ class MultiWorkerCollectiveAllReduceTest(
 
   @classmethod
   def setUpClass(cls):
-    """Create a local cluster with 2 workers."""
+    """Create a local cluster with 3 workers."""
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=0)
+        num_workers=NUM_WORKERS, num_ps=0)
 
   def setUp(self):
     super(MultiWorkerCollectiveAllReduceTest, self).setUp()
@@ -443,7 +450,12 @@ class MultiWorkerCollectiveAllReduceTest(
     # collective key base for different tests.
     MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
 
-  def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus=0,
+                        use_strategy_object=False,
+                        local_mode=False):
     collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
@@ -452,16 +464,24 @@ class MultiWorkerCollectiveAllReduceTest(
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          1, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
       else:
         devices = ["/device:CPU:0"]
-      return collective_all_reduce_ops, devices, ""
+
+      if use_strategy_object:
+        # Still using contrib CollectiveAllReduceStrategy because we can specify
+        # num_gpus in its constructor.
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return strategy, devices, ""
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            1, num_gpus, collective_keys=collective_keys)
+        return collective_all_reduce_ops, devices, ""
     else:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          3, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = [
             "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
@@ -469,8 +489,23 @@ class MultiWorkerCollectiveAllReduceTest(
         ]
       else:
         devices = ["/job:%s/task:%d" % (task_type, task_id)]
-      return (collective_all_reduce_ops, devices,
-              "grpc://" + self._cluster_spec[task_type][task_id])
+
+      if use_strategy_object:
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.configure(
+            cluster_spec=self._cluster_spec,
+            task_type=task_type,
+            task_id=task_id)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return (strategy, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            NUM_WORKERS, num_gpus, collective_keys=collective_keys)
+        return (collective_all_reduce_ops, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
 
   def _assert_values_equal(self, left, right, sess):
     if isinstance(left, list):
@@ -490,9 +525,18 @@ class MultiWorkerCollectiveAllReduceTest(
       for l, r in zip(left_values, right_values):
         self.assertEqual(l, r)
 
-  def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
+  def _test_reduction(self,
+                      task_type,
+                      task_id,
+                      num_gpus,
+                      use_strategy_object=False,
+                      local_mode=False):
     collective_all_reduce, devices, master_target = self._get_test_objects(
-        task_type, task_id, num_gpus, local_mode=local_mode)
+        task_type,
+        task_id,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=local_mode)
     if local_mode:
       num_workers = 1
       worker_device = None
@@ -500,6 +544,27 @@ class MultiWorkerCollectiveAllReduceTest(
       num_workers = len(self._cluster_spec.get("chief", [])) + len(
           self._cluster_spec.get("worker", []))
       worker_device = "/job:%s/task:%d" % (task_type, task_id)
+
+    def _reduce(test_object, reduce_op, per_replica, destinations):
+      if use_strategy_object:
+        with test_object.scope():
+          # Mimic the behavior that distribution strategy usually strips the
+          # wrapper if there is only one value.
+          if len(per_replica.values) == 1:
+            per_replica = per_replica.values[0]
+          return test_object.extended.reduce_to(reduce_op, per_replica,
+                                                destinations)
+      else:
+        return test_object.reduce(reduce_op, per_replica, destinations)
+
+    def _batch_reduce(test_object, reduce_op, value_destination_pairs):
+      if use_strategy_object:
+        with test_object.scope():
+          return test_object.extended.batch_reduce_to(reduce_op,
+                                                      value_destination_pairs)
+      else:
+        return test_object.batch_reduce(reduce_op, value_destination_pairs)
+
     with ops.Graph().as_default(), \
          ops.device(worker_device), \
          self.cached_session(target=master_target) as sess:
@@ -524,26 +589,30 @@ class MultiWorkerCollectiveAllReduceTest(
       # test reduce()
       for destinations in all_destinations:
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica,
-                destinations=destinations),
-            _fake_mirrored(mean, destinations), sess)
+                destinations=destinations), _fake_mirrored(mean, destinations),
+            sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica_2,
-                destinations=destinations),
-            _fake_mirrored(mean_2, destinations), sess)
+                destinations=destinations), _fake_mirrored(
+                    mean_2, destinations), sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica,
                 destinations=destinations),
             _fake_mirrored(mean * len(devices) * num_workers, destinations),
             sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica_2,
                 destinations=destinations),
@@ -553,17 +622,13 @@ class MultiWorkerCollectiveAllReduceTest(
       # test batch_reduce()
       for d1, d2 in itertools.product(all_destinations, all_destinations):
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
-            [
-                _fake_mirrored(mean, d1),
-                _fake_mirrored(mean_2, d2)
-            ], sess)
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.MEAN,
+                          [(per_replica, d1), (per_replica_2, d2)]),
+            [_fake_mirrored(mean, d1),
+             _fake_mirrored(mean_2, d2)], sess)
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.SUM,
+                          [(per_replica, d1), (per_replica_2, d2)]),
             [
                 _fake_mirrored(mean * len(devices) * num_workers, d1),
                 _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
@@ -572,18 +637,36 @@ class MultiWorkerCollectiveAllReduceTest(
     return True
 
   @combinations.generate(
-      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
-  def testReductionDistributed(self, num_gpus):
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_strategy_object=[True, False]))
+  def testReductionDistributed(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
-                                    num_gpus)
+    self._run_between_graph_clients(
+        self._test_reduction,
+        self._cluster_spec,
+        num_gpus,
+        use_strategy_object=use_strategy_object)
 
   # Collective ops doesn't support strategy with one device.
-  def testReductionLocal(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[2],
+          required_gpus=2,
+          use_strategy_object=[True, False]))
+  def testReductionLocal(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._test_reduction(None, None, num_gpus, local_mode=True)
+    self._test_reduction(
+        None,
+        None,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index e17085628ba6d1dfc79839fd824801723f07a518..1ff1e7c1d255492e0535175dae7594d2ceb4010b 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -22,7 +22,6 @@ import shutil
 import tempfile
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.optimizer_v2 import adagrad
@@ -117,7 +116,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
       scores = estimator.evaluate(eval_input_fn)
 
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
 
     predictions = np.array([
         x[prediction_keys.PredictionKeys.PREDICTIONS]
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index 3f55a8a1c8b88d1b8e4031547fa3fbe519983630..e8513943e8d3bb0afa7b468528bfc524fc6a5504 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -34,6 +34,7 @@ from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import parameter_server_strategy
 from tensorflow.contrib.optimizer_v2 import adagrad
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.distribute import estimator_training as dc_training
 from tensorflow.python.distribute.distribute_config import DistributeConfig
@@ -287,6 +288,34 @@ class DistributeCoordinatorIntegrationTest(
                                     cluster_spec)
     self._inspect_train_and_eval_events(estimator)
 
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          eval_distribute_class=[
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def test_complete_flow_standalone_client_collective_nccl(
+      self, eval_distribute_class):
+    train_distribute = (
+        collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=context.num_gpus(),
+            communication=cross_device_ops_lib.CollectiveCommunication.NCCL))
+
+    if eval_distribute_class:
+      eval_distribute = self._get_strategy_object(eval_distribute_class)
+    else:
+      eval_distribute = None
+
+    cluster_spec = copy.deepcopy(self._cluster_spec)
+    cluster_spec.pop("ps", None)
+    estimator = self._complete_flow(train_distribute, eval_distribute,
+                                    cluster_spec)
+    self._inspect_train_and_eval_events(estimator)
+
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
@@ -347,7 +376,7 @@ class DistributeCoordinatorIntegrationTest(
               parameter_server_strategy.ParameterServerStrategy,
           ],
           required_gpus=[0, 1]))
-  def test_complete_flow_indepedent_worker_between_graph(
+  def test_complete_flow_independent_worker_between_graph(
       self, train_distribute_cls, eval_distribute_cls):
     if (context.num_gpus() < 2 and eval_distribute_cls ==
         collective_all_reduce_strategy.CollectiveAllReduceStrategy):
@@ -399,8 +428,8 @@ class DistributeCoordinatorIntegrationTest(
               mirrored_strategy.CoreMirroredStrategy
           ],
           required_gpus=[0, 1]))
-  def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
-                                                    eval_distribute_cls):
+  def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
+                                                     eval_distribute_cls):
     train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
index 84b106545e1326fddd3ed299462534af982dc102..58bede801ff13bb60ed4ada4810eb8ce2dbcb0a3 100644
--- a/tensorflow/contrib/distribute/python/examples/BUILD
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -31,6 +31,12 @@ py_binary(
 
 py_binary(
     name = "keras_mnist",
+    srcs = ["keras_mnist.py"],
+    deps = [":keras_mnist_lib"],
+)
+
+py_library(
+    name = "keras_mnist_lib",
     srcs = [
         "keras_mnist.py",
     ],
@@ -39,3 +45,25 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
+
+py_binary(
+    name = "mnist_eager_multigpu",
+    srcs = [
+        "mnist_eager_multigpu.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_binary(
+    name = "mnist_tf1_tpu",
+    srcs = [
+        "mnist_tf1_tpu.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c045a5586b9dad371d8c505f9cac4b792dd157fd
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run MNIST on multiple GPUs on using MirroredStrategy with eager execution.
+
+By default, runs on all available GPUs, or CPU if no GPUs are available.
+
+NOTE: Currently, this takes more time than when running MNIST in eager without
+MirroredStrategy because of a number overheads. Therefore, this is just a
+proof of concept right now and cannot be used to actually scale up training.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+flags.DEFINE_integer("num_gpus", None, "How many GPUs should we run on?"
+                     "Defaults to all available GPUs, otherwise CPU.")
+flags.DEFINE_integer("batch_size", 64,
+                     "What should be the size of each batch?")
+flags.DEFINE_integer("num_epochs", 10, "How many epochs to run?")
+flags.DEFINE_float("learning_rate", 0.01, "Learning Rate")
+flags.DEFINE_float("momentum", 0.5, "SGD momentum")
+flags.DEFINE_boolean("use_function", False,
+                     "Should we wrap the step in a tf.function.")
+
+FLAGS = flags.FLAGS
+NUM_TRAIN_IMAGES = 60000
+
+
+def create_model():
+  max_pool = tf.keras.layers.MaxPooling2D((2, 2), (2, 2), padding="same")
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  return tf.keras.Sequential([
+      tf.keras.layers.Reshape(
+          target_shape=[28, 28, 1],
+          input_shape=(28, 28,)),
+      tf.keras.layers.Conv2D(2, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Conv2D(4, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Flatten(),
+      tf.keras.layers.Dense(32, activation=tf.nn.relu),
+      tf.keras.layers.Dropout(0.4),
+      tf.keras.layers.Dense(10)])
+
+
+def compute_loss(logits, labels):
+  loss = tf.reduce_sum(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+  # Scale loss by global batch size.
+  return loss * (1. / FLAGS.batch_size)
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  # TODO(priyag): `strategy.make_numpy_iterator` can be used directly instead of
+  # converting to datasets.
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def main(unused_argv):
+  """Run a CNN model on MNIST data to demonstrate DistributedStrategies."""
+
+  tf.enable_v2_behavior()
+
+  num_gpus = FLAGS.num_gpus
+  if num_gpus is None:
+    devices = None
+  elif num_gpus == 0:
+    devices = ["/device:CPU:0"]
+  else:
+    devices = ["/device:GPU:{}".format(i) for i in range(num_gpus)]
+  strategy = tf.distribute.MirroredStrategy(devices)
+
+  with strategy.scope():
+    train_ds, test_ds = mnist_datasets()
+    train_ds = train_ds.shuffle(NUM_TRAIN_IMAGES).batch(FLAGS.batch_size)
+    test_ds = test_ds.batch(FLAGS.batch_size)
+
+    model = create_model()
+    optimizer = tf.keras.optimizers.SGD(FLAGS.learning_rate, FLAGS.momentum)
+    training_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
+    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "training_accuracy", dtype=tf.float32)
+    test_loss = tf.keras.metrics.Mean("test_loss", dtype=tf.float32)
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "test_accuracy", dtype=tf.float32)
+
+    def train_step(inputs):
+      images, labels = inputs
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss = compute_loss(logits, labels)
+      grads = tape.gradient(loss, model.variables)
+      optimizer.apply_gradients(zip(grads, model.variables))
+      training_loss.update_state(loss)
+      training_accuracy.update_state(labels, logits)
+
+    def test_step(inputs):
+      images, labels = inputs
+      logits = model(images, training=False)
+      loss = compute_loss(logits, labels)
+      test_loss.update_state(loss)
+      test_accuracy.update_state(labels, logits)
+
+    train_iterator = strategy.make_dataset_iterator(train_ds)
+    test_iterator = strategy.make_dataset_iterator(test_ds)
+
+    for epoch in range(0, FLAGS.num_epochs):
+      # TODO(b/123315763): Create the tf.function outside this loop once we are
+      # able to initialize iterator in eager mode.
+      dist_train = lambda it: strategy.experimental_run(train_step, it)
+      dist_test = lambda it: strategy.experimental_run(test_step, it)
+      if FLAGS.use_function:
+        dist_train = tf.function(dist_train)
+        dist_test = tf.function(dist_test)
+
+      # Train
+      print("Starting epoch {}".format(epoch))
+      train_iterator.initialize()
+      while True:
+        try:
+          dist_train(train_iterator)
+        except tf.errors.OutOfRangeError:
+          break
+      print("Training loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          training_loss.result(), training_accuracy.result() * 100))
+      training_loss.reset_states()
+      training_accuracy.reset_states()
+
+      # Test
+      test_iterator.initialize()
+      while True:
+        try:
+          dist_test(test_iterator)
+        except tf.errors.OutOfRangeError:
+          break
+      print("Test loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          test_loss.result(), test_accuracy.result() * 100))
+      test_loss.reset_states()
+      test_accuracy.reset_states()
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/distribute/python/examples/mnist_tf1_tpu.py b/tensorflow/contrib/distribute/python/examples/mnist_tf1_tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8194c576e67f7ba864f63885c9b028e4136e61
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/mnist_tf1_tpu.py
@@ -0,0 +1,188 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run MNIST on multiple GPUs on using MirroredStrategy with eager execution.
+
+By default, runs on all available GPUs, or CPU if no GPUs are available.
+
+NOTE: Currently, this takes more time than when running MNIST in eager without
+MirroredStrategy because of a number overheads. Therefore, this is just a
+proof of concept right now and cannot be used to actually scale up training.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow as tf
+
+
+flags.DEFINE_string("tpu", None, "Name of the TPU to use.")
+flags.DEFINE_integer("batch_size", 64,
+                     "What should be the size of each batch?")
+flags.DEFINE_integer("num_epochs", 10, "How many epochs to run?")
+flags.DEFINE_float("learning_rate", 0.01, "Learning Rate")
+flags.DEFINE_float("momentum", 0.5, "SGD momentum")
+
+FLAGS = flags.FLAGS
+NUM_TRAIN_IMAGES = 60000
+
+
+def create_model():
+  max_pool = tf.keras.layers.MaxPooling2D((2, 2), (2, 2), padding="same")
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  return tf.keras.Sequential([
+      tf.keras.layers.Reshape(
+          target_shape=[28, 28, 1],
+          input_shape=(28, 28,)),
+      tf.keras.layers.Conv2D(2, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Conv2D(4, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Flatten(),
+      tf.keras.layers.Dense(32, activation=tf.nn.relu),
+      tf.keras.layers.Dropout(0.4),
+      tf.keras.layers.Dense(10)])
+
+
+def compute_loss(logits, labels):
+  loss = tf.reduce_sum(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+  # Scale loss by global batch size.
+  return loss * (1. / FLAGS.batch_size)
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  # TODO(priyag): `strategy.make_numpy_iterator` can be used directly instead of
+  # converting to datasets.
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def main(argv):
+  """Run a CNN model on MNIST data to demonstrate DistributedStrategies."""
+  del argv  # Unused.
+  tf.disable_v2_behavior()
+
+  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
+      tpu=FLAGS.tpu)
+  strategy = tf.contrib.distribute.TPUStrategy(cluster_resolver)
+
+  with strategy.scope():
+    train_ds, test_ds = mnist_datasets()
+    train_ds = train_ds.shuffle(NUM_TRAIN_IMAGES).batch(FLAGS.batch_size)
+    test_ds = test_ds.batch(FLAGS.batch_size)
+
+    model = create_model()
+    optimizer = tf.keras.optimizers.SGD(FLAGS.learning_rate, FLAGS.momentum)
+    training_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
+    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "training_accuracy", dtype=tf.float32)
+    test_loss = tf.keras.metrics.Mean("test_loss", dtype=tf.float32)
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "test_accuracy", dtype=tf.float32)
+
+    def train_step(inputs):  # pylint: disable=missing-docstring
+      images, labels = inputs
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss = compute_loss(logits, labels)
+      grads = tape.gradient(loss, model.variables)
+      update_vars = optimizer.apply_gradients(zip(grads, model.variables))
+      update_loss = training_loss.update_state(loss)
+      update_accuracy = training_accuracy.update_state(labels, logits)
+
+      with tf.control_dependencies([update_vars, update_loss, update_accuracy]):
+        return tf.identity(loss)
+
+    def test_step(inputs):
+      images, labels = inputs
+      logits = model(images, training=False)
+      loss = compute_loss(logits, labels)
+      update_loss = test_loss.update_state(loss)
+      update_accuracy = test_accuracy.update_state(labels, logits)
+
+      with tf.control_dependencies([update_loss, update_accuracy]):
+        return tf.identity(loss)
+
+    train_iterator = strategy.make_dataset_iterator(train_ds)
+    test_iterator = strategy.make_dataset_iterator(test_ds)
+
+    dist_train = strategy.unwrap(
+        strategy.experimental_run(train_step, train_iterator))
+    dist_test = strategy.unwrap(
+        strategy.experimental_run(test_step, test_iterator))
+
+    training_loss_result = training_loss.result()
+    training_accuracy_result = training_accuracy.result()
+    test_loss_result = test_loss.result()
+    test_accuracy_result = test_accuracy.result()
+
+    tf.contrib.distribute.initialize_tpu_system(cluster_resolver)
+
+    train_iterator_init = train_iterator.initialize()
+    test_iterator_init = test_iterator.initialize()
+
+    all_variables = (
+        tf.global_variables() +
+        training_loss.variables +
+        training_accuracy.variables +
+        test_loss.variables +
+        test_accuracy.variables)
+
+    with tf.Session(cluster_resolver.master()) as session:
+      session.run([v.initializer for v in all_variables])
+
+      for epoch in range(0, FLAGS.num_epochs):
+        # Train
+        print("Starting epoch {}".format(epoch))
+        session.run(train_iterator_init)
+        while True:
+          try:
+            session.run(dist_train)
+          except tf.errors.OutOfRangeError:
+            break
+        print("Training loss: {:0.4f}, accuracy: {:0.2f}%".format(
+            session.run(training_loss_result),
+            session.run(training_accuracy_result) * 100))
+        training_loss.reset_states()
+        training_accuracy.reset_states()
+
+        # Test
+        session.run(test_iterator_init)
+        while True:
+          try:
+            session.run(dist_test)
+          except tf.errors.OutOfRangeError:
+            break
+        print("Test loss: {:0.4f}, accuracy: {:0.2f}%".format(
+            session.run(test_loss_result),
+            session.run(test_accuracy_result) * 100))
+        test_loss.reset_states()
+        test_accuracy.reset_states()
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("tpu")
+  app.run(main)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
index 79a9803d75a35445280c006fa023637c9b01fdcc..cfaee03a2003089366a506168be2942c279f45bf 100644
--- a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
+++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
@@ -91,9 +91,10 @@ def main(_):
     predict_features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
     return predict_features
 
-  predictions = estimator.predict(input_fn=predict_input_fn)
-  # TODO(anjalsridhar): This returns a generator object, figure out how to get
-  # meaningful results here.
+  prediction_iterable = estimator.predict(input_fn=predict_input_fn)
+  # Create a list containing each of the prediction dictionaries that map
+  # the key 'logits' to an array of model outputs.
+  predictions = [prediction_iterable.next() for _ in range(10)]
   print("Prediction results: {}".format(predictions))
 
 
diff --git a/tensorflow/contrib/distribute/python/input_lib_test.py b/tensorflow/contrib/distribute/python/input_lib_test.py
index f589cd6ad54ea8f33002cb067ef8d83d3d33036a..80a1c7bae8f34aaf6cfd9357da2b071c200adf8b 100644
--- a/tensorflow/contrib/distribute/python/input_lib_test.py
+++ b/tensorflow/contrib/distribute/python/input_lib_test.py
@@ -22,8 +22,6 @@ from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
@@ -31,275 +29,60 @@ from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.util import nest
 
 
-class PerReplicaDatasetTest(test.TestCase):
-
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = input_lib.InputWorkers(device_map)
-    per_replica_dataset = input_lib.PerReplicaDataset(dataset, input_workers, 0)
-    if context.executing_eagerly():
-      iterator = per_replica_dataset.make_one_shot_iterator()
-    else:
-      iterator = per_replica_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next_as_list()
-      computed_value = self.evaluate(next_element)
-      self.assertEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next_as_list()
-      self.evaluate(next_element)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testOneDevice(self):
-    devices = ["/device:CPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleDevices(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTupleDataset(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset1 = dataset_ops.Dataset.range(10)
-    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnevenDatasetBatches(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(devices, dataset, expected_values)
-
-  def testInitializableIterator(self):
-    with context.graph_mode():
-      devices = ["/device:CPU:0"]
-      # Using random input since that is only allowed with initializable
-      # iterator.
-      dataset = dataset_ops.Dataset.from_tensor_slices(
-          random_ops.random_uniform((10,)))
-
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = input_lib.InputWorkers(device_map)
-      per_replica_dataset = input_lib.PerReplicaDataset(
-          dataset, input_workers, 0)
-      iterator = per_replica_dataset.make_initializable_iterator()
-
-      self.evaluate(iterator.initializer)
-      next_element = iterator.get_next_as_list()
-      for _ in range(10):
-        self.evaluate(next_element)
-
-      # Should fail after the input is finished.
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-      # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(iterator.initializer)
-      for _ in range(10):
-        self.evaluate(next_element)
-
-
-class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
-
-  def _test_iterator(self, sess, iterator, devices, expected_values):
-    next_element = iterator.get_next()
-    for r, device in enumerate(devices):
-      v = values.select_replica(r, next_element)
-      # The `v` here can be a tuple.
-      for element in nest.flatten(v):
-        self.assertTrue(element.device in device)
-
-    for expected_value in expected_values:
-      t = [values.select_replica(r, next_element) for r in range(len(devices))]
-      actual = sess.run(t)
-      self.assertEqual(expected_value, actual)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      sess.run([values.select_replica(r, next_element)
-                for r in range(len(devices))])
-
-  def _test_dataset(self, dataset_fn, worker_devices, devices,
-                    expected_values):
-    device_map = values.ReplicaDeviceMap(devices)
-    input_workers = input_lib.InputWorkers(device_map, worker_devices)
-    multi_worker_dataset = input_lib.MultiWorkerDataset(
-        dataset_fn, input_workers)
-    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-    with self.cached_session() as sess:
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
-
-  def _cpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def _cpu_and_one_gpu_devices(self):
-    worker_devices = (
-        ("/job:worker/replica:0/task:0", (
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        )),
-        ("/job:worker/replica:0/task:1", (
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ))
-    )
-    devices = [
-        "/job:worker/replica:0/task:0/device:GPU:0",
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:GPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def testDataDistributionOneDevicePerWorker(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testDataDistributionTwoDevicePerWorker(self):
-    if context.num_gpus() < 1:
-      self.skipTest("A GPU is not available for this test.")
-    worker_devices, devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(
-          dataset_fn, worker_devices, devices,
-          [[0, 1, 0, 1], [2, 3, 2, 3], [4, 5, 4, 5], [6, 7, 6, 7]])
-
-  def testTupleDataset(self):
-    worker_devices, devices = self._cpu_devices()
-
-    with context.graph_mode():
-
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(8)
-        dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(8)]
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         expected_values)
-
-  def testInitializableIterator(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      device_map = values.ReplicaDeviceMap(devices)
-      input_workers = input_lib.InputWorkers(device_map, worker_devices)
-      multi_worker_dataset = input_lib.MultiWorkerDataset(
-          dataset_fn, input_workers)
-      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-      # After re-initializing the iterator, should be able to iterate again.
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(
-          sess, multi_worker_iterator, devices,
-          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
-
-  def testValueErrorForIterator(self):
-    # Incompatiable arguments.
-    d1 = "/device:GPU:0"
-    d2 = "/device:GPU:1"
-    device_map = values.ReplicaDeviceMap([d1, d2])
-    input_workers = input_lib.InputWorkers(
-        device_map, (("w1", (d1,)), ("w2", (d2,))))
-    with self.assertRaises(ValueError):
-      input_lib.MultiWorkerDataIterator([("w1", None)], input_workers)
-
-  def testDuplicateDevices(self):
-    _, devices = self._cpu_devices()
-    devices.append("/job:worker/replica:0/task:0/device:CPU:0")
-    with self.assertRaises(ValueError):
-      _ = values.ReplicaDeviceMap(devices)
-
-
 class InputIteratorTestBase(test.TestCase):
 
-  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
-                     expected_values, sess=None, split_batch_by=None):
-    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+  def _create_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                       devices, split_batch_by):
     device_map = values.ReplicaDeviceMap(devices)
     input_workers = input_lib.InputWorkers(device_map, worker_device_pairs)
 
     if input_type == "input_fn":
-      input_contexts = [
-          distribute_lib.InputContext() for _ in worker_device_pairs]
-      input_fn = lambda _: dataset_fn()
-      iterator = input_lib.InputFunctionIterator(
-          input_fn, input_workers, input_contexts)
+      input_contexts = []
+      for i in range(input_workers.num_workers):
+        input_contexts.append(
+            distribute_lib.InputContext(
+                num_input_pipelines=input_workers.num_workers,
+                input_pipeline_id=i,
+                num_replicas_in_sync=len(devices)))
+
+      iterator = input_lib.InputFunctionIterator(dataset_fn, input_workers,
+                                                 input_contexts)
     else:
       iterator = input_lib.DatasetIterator(
-          dataset_fn(), input_workers, split_batch_by)
+          dataset_fn(distribute_lib.InputContext()), input_workers,
+          split_batch_by)
+    return iterator
+
+  def _test_iterator(self,
+                     input_type,
+                     dataset_fn,
+                     worker_device_pairs,
+                     expected_values,
+                     sess=None,
+                     split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+    iterator = self._create_iterator(
+        input_type, dataset_fn, worker_device_pairs, devices, split_batch_by)
 
     evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
-
     evaluate(control_flow_ops.group(iterator.initialize()))
 
     for expected_value in expected_values:
       next_element = iterator.get_next()
       computed_value = evaluate(
           [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
+      self.assertEqual(len(expected_value), len(computed_value))
+      for i in range(len(expected_value)):
+        self.assertAllEqual(expected_value[i], computed_value[i])
 
     with self.assertRaises(errors.OutOfRangeError):
       next_element = iterator.get_next()
-      evaluate([values.select_replica(r, next_element)
-                for r in range(len(devices))])
+      evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
 
     # After re-initializing the iterator, should be able to iterate again.
     evaluate(control_flow_ops.group(iterator.initialize()))
@@ -308,7 +91,9 @@ class InputIteratorTestBase(test.TestCase):
       next_element = iterator.get_next()
       computed_value = evaluate(
           [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertAllEqual(expected_value, computed_value)
+      self.assertEqual(len(expected_value), len(computed_value))
+      for i in range(len(expected_value)):
+        self.assertAllEqual(expected_value[i], computed_value[i])
 
 
 class InputIteratorSingleWorkerTest(InputIteratorTestBase,
@@ -319,7 +104,7 @@ class InputIteratorSingleWorkerTest(InputIteratorTestBase,
       input_type=["input_fn", "dataset"]))
   def testOneDeviceCPU(self, input_type):
     worker_device_pairs = [("", ["/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    dataset_fn = lambda _: dataset_ops.Dataset.range(10)
 
     expected_values = [[i] for i in range(10)]
 
@@ -332,7 +117,7 @@ class InputIteratorSingleWorkerTest(InputIteratorTestBase,
       required_gpus=1))
   def testTwoDevicesOneGPUOneCPU(self, input_type):
     worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    dataset_fn = lambda _: dataset_ops.Dataset.range(10)
 
     expected_values = [[i, i+1] for i in range(0, 10, 2)]
 
@@ -345,7 +130,9 @@ class InputIteratorSingleWorkerTest(InputIteratorTestBase,
       required_gpus=1))
   def testTupleDataset(self, input_type):
     worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    def dataset_fn():
+
+    def dataset_fn(ctx):
+      del ctx
       dataset1 = dataset_ops.Dataset.range(10)
       dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
       return dataset_ops.Dataset.zip((dataset1, dataset2))
@@ -355,15 +142,17 @@ class InputIteratorSingleWorkerTest(InputIteratorTestBase,
     self._test_iterator(input_type, dataset_fn, worker_device_pairs,
                         expected_values)
 
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph", "eager"],
+          input_type=["input_fn", "dataset"],
+          required_gpus=1))
   def testUnevenDatasetBatches(self, input_type):
     worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+    dataset_fn = lambda _: dataset_ops.Dataset.range(9).batch(2)
 
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    # The last global batch only contains data for one replica.
+    expected_values = [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[8], []]]
     self._test_iterator(input_type, dataset_fn, worker_device_pairs,
                         expected_values)
 
@@ -375,7 +164,7 @@ class InputIteratorSingleWorkerTest(InputIteratorTestBase,
   def testBatchSplitting(self, input_type, split_batch_by):
     worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
     batch_size = 10
-    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+    dataset_fn = lambda _: dataset_ops.Dataset.range(100).batch(batch_size)
 
     updated_batch_size = (
         batch_size // split_batch_by if split_batch_by else batch_size)
@@ -417,7 +206,7 @@ class InputIteratorMultiWorkerTest(
   def testOneDevicePerWorker(self, input_type):
     worker_devices = self._cpu_devices()
     with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      dataset_fn = lambda _: dataset_ops.Dataset.range(4)
       self._test_iterator(input_type, dataset_fn, worker_devices,
                           [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
 
@@ -428,7 +217,7 @@ class InputIteratorMultiWorkerTest(
   def testTwoDevicesPerWorker(self, input_type):
     worker_devices = self._cpu_and_one_gpu_devices()
     with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      dataset_fn = lambda _: dataset_ops.Dataset.range(4)
       self._test_iterator(input_type, dataset_fn, worker_devices,
                           [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
 
@@ -438,7 +227,9 @@ class InputIteratorMultiWorkerTest(
   def testTupleDataset(self, input_type):
     worker_devices = self._cpu_devices()
     with context.graph_mode(), self.cached_session() as sess:
-      def dataset_fn():
+
+      def dataset_fn(ctx):
+        del ctx
         dataset1 = dataset_ops.Dataset.range(4)
         dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
         return dataset_ops.Dataset.zip((dataset1, dataset2))
@@ -447,33 +238,35 @@ class InputIteratorMultiWorkerTest(
       self._test_iterator(input_type, dataset_fn, worker_devices,
                           expected_values, sess)
 
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"], input_type=["input_fn", "dataset"], required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda _: dataset_ops.Dataset.range(9).batch(2)
+      expected_values = [[[0, 1], [2, 3], [0, 1], [2, 3]],
+                         [[4, 5], [6, 7], [4, 5], [6, 7]], [[8], [], [8], []]]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"], input_type=["input_fn"], required_gpus=1))
+  def testDifferentDatasets(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+
+      def dataset_fn(ctx):
+        if ctx.input_pipeline_id == 0:
+          return dataset_ops.Dataset.range(8).batch(2)
+        else:
+          return dataset_ops.Dataset.range(9).batch(2)
 
-class SplitDatasetBatchTest(test.TestCase):
-
-  def testBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20)
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testMapAndBatchDataset(self):
-    dataset = dataset_ops.Dataset.range(100)
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, 20))
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
-
-  def testPrefetchDataset(self):
-    dataset = dataset_ops.Dataset.range(100).batch(20).prefetch(1)
-    split_batch_by = 2
-    result_dataset = input_lib._split_dataset_batch(dataset, split_batch_by)
-    expected_values = [range(i, i+10) for i in range(0, 100, 10)]
-    result = [self.evaluate(el) for el in result_dataset]
-    self.assertAllEqual(expected_values, result)
+      expected_values = [[[0, 1], [2, 3], [0, 1], [2, 3]],
+                         [[4, 5], [6, 7], [4, 5], [6, 7]], [[], [], [8], []]]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
index 92de8e643e7588365c23dc8513e197c0869c9ecf..2788f342c740f88bf8869fe1b5da7d48b4b31a61 100644
--- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
+++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
@@ -28,9 +28,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.eager import test
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
@@ -316,15 +318,19 @@ def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-# TODO(priyag): Add v2 optimizers here.
 def strategy_and_optimizer_combinations():
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(
-          optimizer=[combinations.adagrad_optimizer_v1_fn,
-                     combinations.adam_optimizer_v1_fn,
-                     combinations.gradient_descent_optimizer_v1_fn,
-                     combinations.rmsprop_optimizer_v1_fn]))
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
 
 
 def strategy_and_input_combinations():
@@ -350,6 +356,7 @@ def strategy_for_numpy_input_combinations():
       mode=['graph'])
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
@@ -458,6 +465,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       self.assertAllEqual([6, 7], outs[1].shape)
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
 
@@ -741,13 +749,16 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
-      grouped_models = distribution.unwrap(model._distributed_model)
+      grouped_models = distribution.experimental_local_results(
+          distributed_training_utils.get_distributed_model(
+              model, ModeKeys.TRAIN))
       with distribution.scope():
         for m in grouped_models:
           self.assertAllClose(0.001, keras.backend.get_value(
               m.optimizer.lr), atol=1e-05, rtol=1e-05)
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(combinations.combine(
@@ -787,16 +798,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
             verbose=0,
             sample_weight=sample_weight)
 
-      # Test with not specifying the `steps` argument.
-      with self.assertRaisesRegexp(
-          ValueError, 'the `steps_per_epoch` argument'):
+      # Test with not specifying the `steps` argument for dataset with
+      # infinite cardinality.
+      dataset = dataset.repeat()
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps_per_epoch` argument'):
         model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'the `steps` argument'):
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps` argument'):
         model.evaluate(dataset, verbose=0)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'the `steps` argument'):
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps` argument'):
         model.predict(dataset, verbose=0)
 
   @combinations.generate(combinations.combine(
@@ -830,6 +846,7 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                   callbacks=[keras.callbacks.ReduceLROnPlateau()])
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyWithLossMasking(test.TestCase,
                                               parameterized.TestCase):
 
@@ -860,6 +877,7 @@ class TestDistributionStrategyWithLossMasking(test.TestCase,
       self.assertEqual(hist.history['loss'][0], 0)
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
@@ -892,6 +910,7 @@ class TestDistributionStrategyWithNormalizationLayer(
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
+@test_util.run_v1_only('model.compile(..distribute=..) only works in TF v1')
 class TestDistributionStrategyCorrectness(test.TestCase,
                                           parameterized.TestCase):
 
diff --git a/tensorflow/contrib/distribute/python/keras_correctness_test_base.py b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
index 08ed933f2986b76bc0f8363a5a81682aa8a24493..c2d840788ca82e05bc8d03eb6d9e7b9f3608d966 100644
--- a/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
+++ b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 from absl.testing import parameterized
 import numpy as np
+import six
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
@@ -77,11 +79,7 @@ def strategies_for_embedding_models():
   and DefaultStrategy in order to prevent testing timeouts.
   """
 
-  strategies = [s for s in all_strategies
-                if not s.required_tpu and s.required_gpus is not None]
-  strategies.append(combinations.tpu_strategy_loop_on_device)
-  strategies.append(combinations.tpu_strategy_one_step_loop_on_device)
-  return strategies
+  return [s for s in all_strategies if s.required_tpu or s.required_gpus]
 
 
 def test_combinations_for_embedding_model():
@@ -93,6 +91,16 @@ def test_combinations_for_embedding_model():
            eager_mode_test_configuration())))
 
 
+def test_combinations_with_tpu_strategies():
+  tpu_strategies = [combinations.tpu_strategy,
+                    combinations.tpu_strategy_one_step]
+
+  return (
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          graph_mode_test_configuration()))
+
+
 class MaybeDistributionScope(object):
   """Provides a context allowing no distribution strategy."""
 
@@ -133,6 +141,19 @@ def get_batch_size(global_batch_size, distribution):
   return batch_size
 
 
+def get_data_size(data):
+  """Gets the size of data in list, tuple, dict, or a numpy array."""
+  assert isinstance(data, (np.ndarray, list, dict, tuple))
+
+  if isinstance(data, np.ndarray):
+    return len(data)
+
+  if isinstance(data, (list, tuple)):
+    return len(data[0])
+
+  return len(six.next(six.itervalues(data)))
+
+
 def get_correctness_test_inputs(use_numpy, use_validation_data,
                                 with_distribution, x_train, y_train, x_predict):
   """Generates the inputs for correctness check when enable Keras with DS."""
@@ -159,11 +180,12 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
           'y': y_train,
       }
     predict_inputs = {
-        'x': np.array(x_predict, dtype=np.float32),
+        'x': x_predict
     }
   else:
-    if len(x_train) < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
-      # Currently, we cannot detech the size of a dataset. So, the eval steps is
+    training_data_size = get_data_size(x_train)
+    if training_data_size < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
+      # Currently, we cannot detect the size of a dataset. So, the eval steps is
       # hard coded.
       raise ValueError('x_train must have at least '
                        '_GLOBAL_BATCH_SIZE * _EVAL_STEPS samples')
@@ -179,7 +201,7 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
         'y': None,
         'epochs': training_epochs,
         'shuffle': False,
-        'steps_per_epoch': len(x_train) // global_batch_size,
+        'steps_per_epoch': training_data_size // global_batch_size,
     }
     if use_validation_data:
       eval_inputs = None  # Remove the eval_inputs
@@ -195,7 +217,8 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
           'steps': _EVAL_STEPS,
       }
 
-    predict_batch_size = get_batch_size(len(x_predict), with_distribution)
+    predict_batch_size = get_batch_size(get_data_size(x_predict),
+                                        with_distribution)
     predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
     predict_dataset = batch_wrapper(predict_dataset, predict_batch_size,
                                     with_distribution)
@@ -207,11 +230,11 @@ def get_correctness_test_inputs(use_numpy, use_validation_data,
   return training_inputs, eval_inputs, predict_inputs
 
 
-def fit_eval_and_predict(
-    initial_weights, input_fn, model_fn, distribution=None):
+def fit_eval_and_predict(initial_weights, input_fn, model_fn,
+                         distribution=None, is_stateful_model=False):
   """Generates results for fit/predict/evaluate for given model."""
   model = model_fn(initial_weights=initial_weights, distribution=distribution)
-  training_inputs, eval_inputs, predict_inputs = input_fn(distribution)
+  training_inputs, eval_inputs, predict_inputs = input_fn()
 
   result = {}
   result['training_history_1'] = model.fit(**training_inputs).history
@@ -222,7 +245,15 @@ def fit_eval_and_predict(
   result['weights_1'] = model.get_weights()
 
   if predict_inputs is not None:
-    result['predict_result_1'] = model.predict(**predict_inputs)
+    # Check correctness of the result of predict() invoked
+    # multiple times -- as for stateful models, result of
+    # predict may differ for each batch.
+    predict_length = 1
+    if is_stateful_model:
+      predict_length = 3
+    for i in range(predict_length):
+      result_key = 'predict_result_{}'.format(i)
+      result[result_key] = model.predict(**predict_inputs)
 
   # Train and eval again to mimic user's flow.
 
@@ -241,19 +272,20 @@ def compare_results(results_with_ds, results_without_ds, distribution,
   """Compares results of model compiled with/without distribution strategy."""
 
   default_tolerance = 1e-5
-  tol_table = {}
-
-  if isinstance(distribution, (
-      mirrored_strategy.MirroredStrategy,
-      mirrored_strategy.CoreMirroredStrategy,
-      distribute_lib._DefaultDistributionStrategy)):  # pylint: disable=protected-access
-    # TODO(b/119257215): Weights are not exactly the same, so use larger
-    # tolerance for now. Predict should be related to weights.
-    tol_table = {
-        'weights_1': 1e-4,
-        'weights_2': 1e-4,
-        'predict_result_1': 1e-4,
-    }
+  relaxed_tolerance = 1e-4
+
+  def _get_compare_result_tolerance(key):
+    """Returns tolerance to compare results."""
+    # TODO(b/119257215): For MirroredStrategy, weights are not exactly the same,
+    # so use larger tolerance for now. Predict should be related to weights.
+    if (isinstance(distribution, (
+        mirrored_strategy.MirroredStrategy,
+        mirrored_strategy.CoreMirroredStrategy,
+        distribute_lib._DefaultDistributionStrategy)) and  # pylint: disable=protected-access
+        key.startswith(('weights_1', 'weights_2', 'predict_result'))):
+      return relaxed_tolerance
+
+    return default_tolerance
 
   for key in results_with_ds:
     if (key.startswith('training_history') and
@@ -263,8 +295,7 @@ def compare_results(results_with_ds, results_without_ds, distribution,
       # underlying bug is fixed.
       continue
 
-    tolerance = tol_table.get(key, default_tolerance)
-
+    tolerance = _get_compare_result_tolerance(key)
     testcase.assertAllClose(
         results_with_ds[key],
         results_without_ds[key],
@@ -315,6 +346,22 @@ class TestDistributionStrategyCorrectnessBase(test.TestCase,
     y_train = x_train
     return (x_train.astype('float32'), y_train.astype('float32'), None)
 
+  def get_input_for_correctness_test(self, **kwargs):
+    """Generates inputs that are dictionaries.
+
+    We only provide a default implementation of this method here. If you need
+    more customized way of providing input to your model, overwrite this method.
+
+    Arguments:
+      **kwargs: key word arguments about how to create the input dictionaries
+
+    Returns:
+      Three dictionaries representing the input for fit(), evalutate() and
+      predict()
+    """
+
+    return get_correctness_test_inputs(**kwargs)
+
   def get_model(self, distribution=None):
     raise NotImplementedError
 
@@ -334,7 +381,8 @@ class TestDistributionStrategyCorrectnessBase(test.TestCase,
                            distribution,
                            use_numpy,
                            use_validation_data,
-                           with_batch_norm=False):
+                           with_batch_norm=False,
+                           is_stateful_model=False):
     with self.cached_session():
       self.set_up_test_config(use_numpy, use_validation_data, with_batch_norm)
       self.skip_unsupported_test_configuration(distribution)
@@ -342,23 +390,42 @@ class TestDistributionStrategyCorrectnessBase(test.TestCase,
       # Train, eval, and predict datasets are created with the same input numpy
       # arrays.
       x_train, y_train, x_predict = self.get_data()
-
       # The model is built once and the initial weights are saved.
       # This is used to initialize the model for both the distribution and
       # non-distribution run.
       model = self.get_model()
       initial_weights = model.get_weights()
 
-      def input_fn(dist):
-        return get_correctness_test_inputs(
-            use_numpy, use_validation_data, dist, x_train, y_train, x_predict)
+      ds_input_fn = functools.partial(
+          self.get_input_for_correctness_test,
+          use_numpy=use_numpy,
+          use_validation_data=use_validation_data,
+          with_distribution=distribution,
+          x_train=x_train,
+          y_train=y_train,
+          x_predict=x_predict)
+
+      nods_input_fn = functools.partial(
+          self.get_input_for_correctness_test,
+          use_numpy=use_numpy,
+          use_validation_data=use_validation_data,
+          with_distribution=None,
+          x_train=x_train,
+          y_train=y_train,
+          x_predict=x_predict)
 
       results_with_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, model_fn=self.get_model,
-          distribution=distribution)
+          initial_weights,
+          input_fn=ds_input_fn,
+          model_fn=self.get_model,
+          distribution=distribution,
+          is_stateful_model=is_stateful_model)
       results_without_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, model_fn=self.get_model,
-          distribution=None)
+          initial_weights,
+          input_fn=nods_input_fn,
+          model_fn=self.get_model,
+          distribution=None,
+          is_stateful_model=is_stateful_model)
 
       # First, special case, for multi-replica distributed training, batch norm
       # is not aggregated globally. So it is expected to have different weights.
@@ -371,6 +438,23 @@ class TestDistributionStrategyCorrectnessBase(test.TestCase,
         compare_results(results_with_ds, results_without_ds, distribution,
                         testcase=self)
 
+  def get_input_for_dynamic_lr_test(self, **kwargs):
+    """Generates inputs that are dictionaries.
+
+    We only provide a default implementation of this method here. If you need
+    more customized way of providing input to your model, overwrite this method.
+
+    Arguments:
+      **kwargs: key word arguments about how to create the input dictionaries
+
+    Returns:
+      Three dictionaries representing the input for fit(), evalutate() and
+      predict()
+    """
+
+    training_input = kwargs
+    return training_input, None, None
+
   def run_dynamic_lr_test(self, distribution):
     with self.cached_session():
       self.set_up_test_config()
@@ -388,30 +472,41 @@ class TestDistributionStrategyCorrectnessBase(test.TestCase,
         # same as TPU.
         update_freq = distribution.extended.steps_per_run
 
-      def input_fn(dist):
-        """Generates training test given test configuration."""
-        training_epochs = 2
-        global_batch_size = 64
-        batch_size = get_batch_size(global_batch_size, dist)
-
-        training_inputs = {
-            'batch_size': batch_size,
-            'x': x_train,
-            'y': y_train,
-            'epochs': training_epochs,
-            'shuffle': False,
-            'callbacks': [LearningRateBatchScheduler(update_freq)],
-            'validation_data': (x_train, y_train)
-        }
-        # In this test case, we do not care eval and predict.
-        eval_inputs, predict_inputs = None, None
-        return training_inputs, eval_inputs, predict_inputs
+      training_epochs = 2
+      global_batch_size = 64
+
+      ds_batch_size = get_batch_size(global_batch_size, distribution)
+      nods_batch_size = get_batch_size(global_batch_size, None)
+
+      ds_input_fn = functools.partial(
+          self.get_input_for_dynamic_lr_test,
+          x=x_train,
+          y=y_train,
+          batch_size=ds_batch_size,
+          shuffle=False,
+          epochs=training_epochs,
+          callbacks=[LearningRateBatchScheduler(update_freq)],
+          validation_data=(x_train, y_train))
+
+      nods_input_fn = functools.partial(
+          self.get_input_for_dynamic_lr_test,
+          x=x_train,
+          y=y_train,
+          batch_size=nods_batch_size,
+          shuffle=False,
+          epochs=training_epochs,
+          callbacks=[LearningRateBatchScheduler(update_freq)],
+          validation_data=(x_train, y_train))
 
       results_with_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          initial_weights,
+          input_fn=ds_input_fn,
+          model_fn=self.get_model,
           distribution=distribution)
       results_without_ds = fit_eval_and_predict(
-          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          initial_weights,
+          input_fn=nods_input_fn,
+          model_fn=self.get_model,
           distribution=None)
       compare_results(results_with_ds, results_without_ds, distribution,
                       testcase=self)
@@ -448,7 +543,7 @@ class TestDistributionStrategyEmbeddingModelCorrectnessBase(
         features, maxlen=max_words)
     x_train = np.asarray(features, dtype=np.float32)
     y_train = np.asarray(labels, dtype=np.int32).reshape((count, 1))
-    x_predict = x_train
+    x_predict = x_train[:_GLOBAL_BATCH_SIZE]
     return x_train, y_train, x_predict
 
 
diff --git a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
index 7afacab0ddbed8d5b448c2ed2b983bfa18d11b80..61202e30c4f33892d2675080fae07cc4d7102337 100644
--- a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
@@ -47,7 +47,9 @@ class TestDistributionStrategyDnnCorrectness(
       # We add few non-linear layers to make it non-trivial.
       model = keras.Sequential()
       model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-      model.add(keras.layers.Dense(10, activation='relu'))
+      model.add(keras.layers.Dense(
+          10, activation='relu',
+          kernel_regularizer=keras.regularizers.l2(1e-4)))
       model.add(keras.layers.Dense(10, activation='relu'))
       model.add(keras.layers.Dense(1))
 
@@ -68,7 +70,7 @@ class TestDistributionStrategyDnnCorrectness(
     y_train = 3 * x_train
     x_train = x_train.astype('float32')
     y_train = y_train.astype('float32')
-    x_predict = [[1.], [2.], [3.], [4.]]
+    x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32)
     return x_train, y_train, x_predict
 
   @combinations.generate(keras_correctness_test_base.
diff --git a/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
index 3913f9bc0cdfff6b562d5727ec33eb4d83f4a619..e881bb70ecc428e3f972cde5f19c1b61b1dc0f0b 100644
--- a/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
@@ -71,5 +71,80 @@ class DistributionStrategyEmbeddingModelCorrectnessTest(
     self.run_correctness_test(distribution, use_numpy, use_validation_data)
 
 
+class DistributionStrategySiameseEmbeddingModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids_a = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_a')
+      word_ids_b = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_b')
+
+      def submodel(embedding, word_ids):
+        word_embed = embedding(word_ids)
+        rep = keras.layers.GlobalAveragePooling1D()(word_embed)
+        return keras.Model(inputs=[word_ids], outputs=[rep])
+
+      word_embed = keras.layers.Embedding(
+          input_dim=20,
+          output_dim=10,
+          input_length=max_words,
+          embeddings_initializer=keras.initializers.RandomUniform(0, 1))
+
+      a_rep = submodel(word_embed, word_ids_a).outputs[0]
+      b_rep = submodel(word_embed, word_ids_b).outputs[0]
+      sim = keras.layers.Dot(axes=1, normalize=True)([a_rep, b_rep])
+
+      model = keras.Model(inputs=[word_ids_a, word_ids_b], outputs=[sim])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='mse',
+          metrics=['mse'])
+    return model
+
+  def get_data(self,
+               count=(keras_correctness_test_base._GLOBAL_BATCH_SIZE *
+                      keras_correctness_test_base._EVAL_STEPS),
+               min_words=5,
+               max_words=10,
+               max_word_id=19,
+               num_classes=2):
+    features_a, labels_a, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    features_b, labels_b, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    y_train = np.zeros((count, 1), dtype=np.float32)
+    y_train[labels_a == labels_b] = 1.0
+    y_train[labels_a != labels_b] = -1.0
+    # TODO(b/123360757): Add tests for using list as inputs for multi-input
+    # models.
+    x_train = {
+        'words_a': features_a,
+        'words_b': features_b,
+    }
+    x_predict = x_train
+
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_siamese_embedding_model_correctness(self, distribution, use_numpy,
+                                               use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
index f625664372dfb6814ccbe9539f6abe018d2a4447..3c2961456b2eede9570ce29f7a8900834f2ccfb7 100644
--- a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
+++ b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
@@ -23,7 +23,7 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import keras_correctness_test_base
 from tensorflow.python import keras
 from tensorflow.python.eager import test
-from tensorflow.python.training import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
 
 
 class DistributionStrategyCnnCorrectnessTest(
@@ -33,7 +33,8 @@ class DistributionStrategyCnnCorrectnessTest(
     with keras_correctness_test_base.MaybeDistributionScope(distribution):
       image = keras.layers.Input(shape=(28, 28, 3), name='image')
       c1 = keras.layers.Conv2D(
-          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4))(
+          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4),
+          kernel_regularizer=keras.regularizers.l2(1e-4))(
               image)
       if self.with_batch_norm:
         c1 = keras.layers.BatchNormalization(name='bn1')(c1)
@@ -47,7 +48,7 @@ class DistributionStrategyCnnCorrectnessTest(
         model.set_weights(initial_weights)
 
       model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(
+          optimizer=gradient_descent.SGD(
               learning_rate=0.1),
           loss='sparse_categorical_crossentropy',
           metrics=['sparse_categorical_accuracy'])
diff --git a/tensorflow/contrib/distribute/python/keras_multi_worker_test.py b/tensorflow/contrib/distribute/python/keras_multi_worker_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a0625a0e4ee928ea49a345b263a4f596b2a3957
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_multi_worker_test.py
@@ -0,0 +1,460 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test multi-worker Keras.
+
+TODO(b/123845258): Move this to tensorflow core.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import functools
+import os
+import sys
+import threading
+
+from absl.testing import parameterized
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy as collective_strategy
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import multi_worker_test_base as test_base
+from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import models
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+def _mnist_synthetic_dataset(batch_size, steps_per_epoch):
+  # train dataset
+  x_train = array_ops.ones([batch_size * steps_per_epoch, 28, 28, 1],
+                           dtype=dtypes.float32)
+  y_train = array_ops.ones([batch_size * steps_per_epoch, 1],
+                           dtype=dtypes.int32)
+  train_ds = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+  train_ds = train_ds.repeat()
+  # train_ds = train_ds.shuffle(100)
+  train_ds = train_ds.batch(64, drop_remainder=True)
+
+  # eval dataset
+  x_test = random_ops.random_uniform([10000, 28, 28, 1], dtype=dtypes.float32)
+  y_test = random_ops.random_uniform([10000, 1],
+                                     minval=0,
+                                     maxval=9,
+                                     dtype=dtypes.int32)
+  eval_ds = dataset_ops.Dataset.from_tensor_slices((x_test, y_test))
+  eval_ds = eval_ds.repeat()
+  eval_ds = eval_ds.batch(64, drop_remainder=True)
+
+  return train_ds, eval_ds
+
+
+def _get_model(input_shape):
+  # Define a deterministically-initialized CNN model to recognize MNIST digits,
+  # commented out several layers to simplify it.
+  model = keras.models.Sequential()
+  model.add(
+      keras.layers.Conv2D(
+          32,
+          kernel_size=(3, 3),
+          activation='relu',
+          input_shape=input_shape,
+          kernel_initializer=keras.initializers.TruncatedNormal(seed=99)))
+  # model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
+  # model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
+  # model.add(keras.layers.Dropout(0.25))
+  model.add(keras.layers.Flatten())
+  # model.add(keras.layers.Dense(128, activation='relu'))
+  # model.add(keras.layers.Dropout(0.5))
+  model.add(
+      keras.layers.Dense(
+          10,
+          activation='softmax',
+          kernel_initializer=keras.initializers.TruncatedNormal(seed=99)))
+
+  # TODO(yuefengz): optimizer with slot variables doesn't work because of
+  # optimizer's bug.
+  # TODO(yuefengz): we should not allow non-v2 optimizer.
+  model.compile(
+      loss=keras.losses.sparse_categorical_crossentropy,
+      optimizer=gradient_descent.SGD(learning_rate=0.001),
+      metrics=['accuracy'])
+  return model
+
+
+def _clone_and_build_model(model, strategy):
+  # The new "original" model in worker 0.
+  with strategy.scope():
+    cloned_model = models.clone_model(model)
+
+  # Compile and build model.
+  if isinstance(model.optimizer, optimizers.TFOptimizer):
+    optimizer = model.optimizer
+    # TODO(yuefengz): figure out why the optimizer here is still a
+    # TFOptimizer.
+    while isinstance(optimizer, optimizers.TFOptimizer):
+      optimizer = optimizer.optimizer
+    optimizer = copy.deepcopy(optimizer)
+  else:
+    optimizer_config = model.optimizer.get_config()
+    optimizer = type(model.optimizer).from_config(optimizer_config)
+
+  cloned_model.compile(
+      optimizer,
+      model.loss,
+      metrics=metrics_module.clone_metrics(model._compile_metrics),
+      loss_weights=model.loss_weights,
+      sample_weight_mode=model.sample_weight_mode,
+      weighted_metrics=metrics_module.clone_metrics(
+          model._compile_weighted_metrics))
+  return cloned_model
+
+
+# TODO(b/123918215): Possibly merge this Callback with keras_test.Counter.
+class MultiWorkerVerificationCallback(callbacks.Callback):
+  """MultiWorkerVerificationCallback verifies the callbacks in multi-worker scheme.
+
+  This Callback is intended to be used for verifying the callback is indeed
+  called the correct number of times in various task types.
+
+  Attributes:
+    _task_dict: A nested dictionary storing the number of times a callback has
+                been called in specific task type, task index, and method name.
+                Look up structure is
+                task_name -> task_id -> tracking_method_name -> invoke_count
+                For example, a _task_dict of
+                {
+                    'ps': {
+                         0: {
+                             'on_epoch_begin': 2
+                         },
+                         1: {
+                             'on_epoch_begin': 2
+                         }
+                    },
+                    'worker': {
+                         0: {
+                             'on_epoch_begin': 2
+                         },
+                         1: {
+                             'on_epoch_begin': 2
+                         }
+                    }
+                }
+                indicates the ps task has 'on_epoch_begin' called twice on each
+                of the two indices, and likewise for worker task.
+  """
+
+  # TODO(rchao): Add other method calls to verify.
+  METHODS_TO_VERIFY = ['on_epoch_begin']
+
+  def __init__(self, num_epoch, num_worker):
+    """Initialize a MultiWorkerVerificationCallback.
+
+    Args:
+      num_epoch: Number of epochs this Callback is expected to be called for.
+      num_worker: Number of workers this Callback is expected to be called from.
+    """
+    super(MultiWorkerVerificationCallback, self).__init__()
+    self._num_epoch = num_epoch
+    self._num_worker = num_worker
+    self._task_dict = {
+        key: collections.defaultdict(lambda: collections.defaultdict(int))
+        for key in ['ps', 'worker']
+    }
+    self._lock = threading.Lock()
+    self._is_between_graph = None
+    self.wrap_methods(self.METHODS_TO_VERIFY)
+
+  @property
+  def is_between_graph(self):
+    return self._is_between_graph
+
+  @is_between_graph.setter
+  def is_between_graph(self, is_between_graph):
+    self._is_between_graph = is_between_graph
+
+  def wrap_methods(self, method_names):
+    """Wrap methods so that the counts of calls are tracked.
+
+    Args:
+      method_names: A list of names of methods to track calls.
+    """
+    for method_name in method_names:
+      method = getattr(self, method_name)
+
+      def wrapped_method(method_to_wrap, name, *arg, **kwargs):
+        # Use lock to ensure += operation is thread-safe.
+        with self._lock:
+          self._task_dict[test_base.get_task_type()][
+              test_base.get_task_index()][name] += 1
+        method_to_wrap(*arg, **kwargs)
+
+      setattr(self, method_name,
+              functools.partial(wrapped_method, method, method_name))
+
+  def verify(self, test_case):
+    method_count_dict = {
+        method_name: self._num_epoch for method_name in self.METHODS_TO_VERIFY
+    }
+    assert self._is_between_graph is not None
+    if self._is_between_graph:
+      # TODO(b/124171024): In between-graph replication, by default only the
+      # chief calls callback. Fix this test to cover that, as well as the rare
+      # cases where all workers call.
+      worker_call_count = {
+          i: method_count_dict for i in range(0, self._num_worker)
+      }
+    else:
+      # If in-graph, only the first worker calls callback methods.
+      worker_call_count = {0: method_count_dict}
+    test_case.assertDictEqual(
+        self._task_dict,
+        {
+            # PS' callback is not supposed to be called.
+            'ps': {},
+            # Each of the Worker should be called num_epoch of times.
+            'worker': worker_call_count
+        })
+
+
+# TODO(yuefengz): right now, fit or evaluate has to be called under distribution
+# strategy's scope.
+def _run_standalone_client(test_obj, strategy, cluster_spec):
+  input_shape = (28, 28, 1)
+  with strategy.scope():
+    orig_model = _get_model(input_shape)
+
+  def worker_fn(strategy):
+    with ops.Graph().as_default():
+      batch_size = 64
+      steps = 10
+
+      with strategy.scope():
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        model = _clone_and_build_model(orig_model, strategy)
+
+        orig_loss, orig_acc = model.evaluate(train_ds, steps=steps)
+
+        # Workaround for the metrics issue (b/122928955) in async training. This
+        # can only be used in standalone client mode.
+        dc_context.get_current_worker_context().wait_for_other_workers()
+
+        model.fit(x=train_ds, epochs=2, steps_per_epoch=steps)
+
+        dc_context.get_current_worker_context().wait_for_other_workers()
+
+        trained_loss, trained_acc = model.evaluate(train_ds, steps=steps)
+
+      test_obj.assertLessEqual(trained_loss, orig_loss)
+      test_obj.assertGreaterEqual(trained_acc, orig_acc)
+
+  dc.run_distribute_coordinator(
+      worker_fn,
+      strategy,
+      mode=dc.CoordinatorMode.STANDALONE_CLIENT,
+      cluster_spec=cluster_spec)
+
+
+def get_strategy_object(strategy_cls):
+  if (strategy_cls == mirrored_strategy.MirroredStrategy or
+      strategy_cls == mirrored_strategy.CoreMirroredStrategy):
+    return strategy_cls(mirrored_strategy.all_local_devices())
+  else:
+    return strategy_cls(num_gpus_per_worker=context.num_gpus())
+
+
+class KerasMultiWorkerTestStandaloneClient(test.TestCase,
+                                           parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 2 workers."""
+    super(KerasMultiWorkerTestStandaloneClient, cls).setUpClass()
+    cls._cluster_spec = test_base.create_in_process_cluster(
+        num_workers=2, num_ps=1, has_eval=False)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy,
+              collective_strategy.CollectiveAllReduceStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def testSimpleModelStandaloneClient(self, strategy_cls):
+    # With standalone client, training_utils.should_run_multi_worker returns
+    # False which means the distribute coordinator won't be called again in
+    # `fit`. This is still correct and intended since session is still
+    # configured under distribute coordinator's worker context and distribution
+    # strategy object is already configured by distribute coordinator for
+    # multi-worker training.
+    # The logic should be much clearer once standalone client is merged into
+    # core Keras as well.
+    strategy = get_strategy_object(strategy_cls)
+
+    _run_standalone_client(self, strategy, self._cluster_spec)
+
+
+class KerasMultiWorkerTestIndependentWorker(test_base.IndependentWorkerTestBase,
+                                            parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              collective_strategy.CollectiveAllReduceStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def testSimpleModelIndependentWorkerSync(self, strategy_cls):
+    num_workers = 2
+    num_epoch = 2
+
+    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
+    self._barrier = dc._Barrier(2)
+
+    # The verification callback will be shared by multiple threads.
+    verification_callback = MultiWorkerVerificationCallback(
+        num_epoch=num_epoch, num_worker=num_workers)
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      """Simulates an Independent Worker inside of a thread."""
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+        strategy = get_strategy_object(strategy_cls)
+        verification_callback.is_between_graph = \
+            strategy.extended.experimental_between_graph
+        batch_size = 64
+        steps = 10
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        with strategy.scope():
+          model = _get_model((28, 28, 1))
+        orig_loss, _ = model.evaluate(train_ds, steps=steps)
+        callbacks_for_fit = nest.flatten(
+            kwargs.get('verification_callback', []))
+        history = model.fit(
+            x=train_ds,
+            epochs=num_epoch,
+            steps_per_epoch=steps,
+            callbacks=callbacks_for_fit)
+        self.assertIsInstance(history, keras.callbacks.History)
+        trained_loss, _ = model.evaluate(train_ds, steps=steps)
+        self.assertLess(trained_loss, orig_loss)
+
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        verification_callback=verification_callback)
+
+    threads_to_join = []
+    strategy = get_strategy_object(strategy_cls)
+    if strategy.extended.experimental_between_graph:
+      for ts in threads.values():
+        threads_to_join.extend(ts)
+    else:
+      threads_to_join = [threads['worker'][0]]
+    self.join_independent_workers(threads_to_join)
+    verification_callback.verify(self)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[parameter_server_strategy.ParameterServerStrategy],
+          required_gpus=[0, 1]))
+  def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
+    num_workers = 2
+    num_epoch = 2
+    cluster_spec = test_base.create_cluster_spec(
+        num_workers=num_workers, num_ps=2)
+    self._barrier = dc._Barrier(4)
+
+    # The verification callback will be shared by multiple threads.
+    verification_callback = MultiWorkerVerificationCallback(
+        num_epoch=num_epoch, num_worker=num_workers)
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      """Simulates an Independent Worker inside of a thread."""
+      # TODO(rchao/yuefengz): The following is run by both worker and ps
+      # threads. The distribute coordinator should run std server immediately
+      # without configuring the session (or building the graph) on PS.
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+        batch_size = 64
+        steps = 10
+        strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
+        verification_callback.is_between_graph = \
+            strategy.extended.experimental_between_graph
+
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        with strategy.scope():
+          model = _get_model((28, 28, 1))
+
+          # TODO(b/123868066): Verify callback for model.evaluate().
+          callbacks_for_fit = nest.flatten(
+              kwargs.get('verification_callback', []))
+          history = model.fit(
+              x=train_ds,
+              epochs=num_epoch,
+              steps_per_epoch=steps,
+              validation_data=val_ds,
+              validation_steps=steps,
+              callbacks=callbacks_for_fit)
+        self.assertIsInstance(history, keras.callbacks.History)
+
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        verification_callback=verification_callback)
+
+    threads_to_join = []
+    for task_type, ts in threads.items():
+      # This test can finish once the worker threads complete, and thus
+      # the ps threads don't need to be joined.
+      if task_type == 'ps':
+        continue
+      threads_to_join.extend(ts)
+    self.join_independent_workers(threads_to_join)
+    verification_callback.verify(self)
+
+
+if __name__ == '__main__':
+  # Enable manual variable initialization to make sure variables are initialized
+  # by `init_restore_or_wait_for_variables`.
+  backend.manual_variable_initialization(True)
+  with test.mock.patch.object(sys, 'exit', os._exit):
+    test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index 5349794334b7f6ea3d718343fa84c693dd3d7a3c..c93d7afa7ceef2c9c272e91997e2871655cea079 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.python import keras
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -44,108 +45,71 @@ def get_model():
 
 class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus,
+              combinations.parameter_server_strategy_with_two_gpus,
+          ],
+          mode=['graph', 'eager']))
   def testKerasOptimizerWithUnequalInput(self, distribution):
-    def create_fn():
+    with distribution.scope():
       var = variables.Variable(
           2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
-      # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
-      def loss():
-        return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
-
       optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
-      train_op = optimizer.minimize(loss, var_list=[var])
-      m = optimizer.get_slot(var, 'm')
-      v = optimizer.get_slot(var, 'v')
-      return (var, m, v, train_op, optimizer.iterations)
+      all_vars = []
 
-    devices = ['/device:GPU:0', '/device:CPU:0']
-    with distribution.scope():
-      (var, m, v, op,
-       counter) = distribution.extended.call_for_each_replica(create_fn)
+      def model_fn():
+
+        def loss_fn():
+          replica_id = _replica_id()
+          return math_ops.cast(replica_id + 1, dtype=dtypes.float32) * 0.5 * var
+
+        train_op = optimizer.minimize(loss_fn, var_list=[var])
+
+        return train_op, optimizer
+
+      def train_fn():
+        train_op, optimizer = distribution.extended.call_for_each_replica(
+            model_fn)
+        if not all_vars:
+          all_vars.append(var)
+          all_vars.append(optimizer.get_slot(var, 'm'))
+          all_vars.append(optimizer.get_slot(var, 'v'))
+        return distribution.group(train_op)
+
+      if not context.executing_eagerly():
+        with self.cached_session() as sess:
+          train_fn = sess.make_callable(train_fn())
       self.evaluate(variables.global_variables_initializer())
-      var_val = [2.0, 2.0, 2.0]
-      self.assertAllClose(
-          var_val,
-          self.evaluate(
-              [distribution.extended.read_var(var),
-               var.get(devices[0]),
-               var.get(devices[1])]))
-      self.assertAllClose([0, 0, 0],
-                          self.evaluate([
-                              distribution.extended.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
-
-      train_op = distribution.unwrap(op)
-      self.evaluate(train_op)
-      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
-      m_val = [1.2, 1.2, 1.2]
-      # assert slot variables in both replicas are the same.
-      self.assertAllClose(
-          m_val,
-          self.evaluate(
-              [distribution.extended.read_var(m),
-               m.get(devices[0]),
-               m.get(devices[1])]))
-      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
-      v_val = [1.8, 1.8, 1.8]
-      self.assertAllClose(
-          v_val,
-          self.evaluate(
-              [distribution.extended.read_var(v),
-               v.get(devices[0]),
-               v.get(devices[1])]))
+
+      # first step.
+      train_fn()
       # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
       #        = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8
-      var_val = [1.99, 1.99, 1.99]
-      self.assertAllClose(
-          var_val,
-          self.evaluate(
-              [distribution.extended.read_var(var),
-               var.get(devices[0]),
-               var.get(devices[1])]))
-      self.assertAllClose([1, 1, 1],
-                          self.evaluate([
-                              distribution.extended.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
-
-      self.evaluate(train_op)
+      self.assertAllClose(1.99, self.evaluate(all_vars[0]))
+      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
+      self.assertAllClose(1.2, self.evaluate(all_vars[1]))
+      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
+      self.assertAllClose(1.8, self.evaluate(all_vars[2]))
+
+      # second step.
+      train_fn()
+      # var(1) = var(0) - lr * 2 = 1.98
+      self.assertAllClose(1.98, self.evaluate(all_vars[0]))
       # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5
-      m_val = [1.44, 1.44, 1.44]
-      self.assertAllClose(
-          m_val,
-          self.evaluate(
-              [distribution.extended.read_var(m),
-               m.get(devices[0]),
-               m.get(devices[1])]))
+      self.assertAllClose(1.44, self.evaluate(all_vars[1]))
       # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
-      v_val = [2.16, 2.16, 2.16]
-      self.assertAllClose(
-          v_val,
-          self.evaluate(
-              [distribution.extended.read_var(v),
-               v.get(devices[0]),
-               v.get(devices[1])]))
-      self.assertAllClose([2, 2, 2],
-                          self.evaluate([
-                              distribution.extended.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+      self.assertAllClose(2.16, self.evaluate(all_vars[2]))
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.parameter_server_strategy_with_two_gpus,
+          ],
+          mode=['graph', 'eager']))
   def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
 
     with self.cached_session():
diff --git a/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5faf6c36b880a72bafc8d082cff2816f3b11a76
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
@@ -0,0 +1,99 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful tf.keras LSTM models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+def strategies_for_stateful_embedding_model():
+  """Returns TPUStrategy with single core device assignment."""
+
+  return [combinations.tpu_strategy_one_core,
+          combinations.tpu_strategy_one_step_one_core]
+
+
+def test_combinations_for_stateful_embedding_model():
+  return (
+      combinations.combine(
+          distribution=strategies_for_stateful_embedding_model(),
+          mode='graph',
+          use_numpy=False,
+          use_validation_data=False
+      ))
+
+
+class DistributionStrategyStatefulLstmModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE
+
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,),
+          batch_size=batch_size,
+          dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      lstm_embed = keras.layers.LSTM(units=4,
+                                     return_sequences=False,
+                                     stateful=True)(word_embed)
+
+      preds = keras.layers.Dense(2, activation='softmax')(lstm_embed)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(test_combinations_for_stateful_embedding_model())
+  def test_stateful_lstm_model_correctness(self,
+                                           distribution,
+                                           use_numpy,
+                                           use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              is_stateful_model=True)
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_with_tpu_strategies())
+  def test_incorrectly_use_multiple_cores_for_stateful_lstm_model(
+      self, distribution, use_numpy, use_validation_data):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Single core must be used for computation '
+                                 'on stateful models. Consider adding '
+                                 '`device_assignment` parameter to '
+                                 'TPUStrategy'):
+      self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                                is_stateful_model=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 8a607dd070f859aca69ee857d5c5f091f107e0ca..2b733c2cfb1609998be413254c2ffa377d4acb73 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -25,17 +25,17 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.python import keras
+from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import values
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -70,6 +70,32 @@ def simple_functional_model():
   return model
 
 
+def simple_subclassed_model(num_labels=_NUM_CLASS):
+
+  class _SimpleMLP(keras.Model):
+
+    def __init__(self, num_labels):
+      super(_SimpleMLP, self).__init__()
+      self.dense = keras.layers.Dense(num_labels)
+
+    def call(self, inputs):
+      return self.dense(inputs)
+
+  return _SimpleMLP(num_labels)
+
+
+def simple_multi_inputs_multi_outputs_model():
+  input_a = keras.layers.Input(shape=(16,), name='input_a')
+  input_b = keras.layers.Input(shape=(16,), name='input_b')
+
+  merged = keras.layers.concatenate([input_a, input_b], name='merge')
+  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
+  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
+  model = keras.models.Model(
+      inputs=[input_a, input_b], outputs=[output_c, output_d])
+  return model
+
+
 def multi_inputs_multi_outputs_model():
   input_a = keras.layers.Input(shape=(16,), name='input_a')
   input_b = keras.layers.Input(shape=(16,), name='input_b')
@@ -202,6 +228,22 @@ def get_predict_dataset(distribution):
   return dataset
 
 
+def convert_numpy_to_dataset_with_unknown_cardinality(inputs,
+                                                      targets=None):
+  if targets is not None:
+    input_slices = (inputs, targets)
+    dummy_op = (lambda inp, target: True)
+  else:
+    input_slices = inputs
+    dummy_op = (lambda inp: True)
+
+  original_dataset = (dataset_ops.Dataset.from_tensor_slices(
+      input_slices))
+  ds_with_unknown_cardinality = (original_dataset.filter(dummy_op).
+                                 batch(10, drop_remainder=True))
+  return ds_with_unknown_cardinality
+
+
 def multi_input_output_model():
   a = keras.layers.Input(shape=(3,), name='input_a')
   b = keras.layers.Input(shape=(5,), name='input_b')
@@ -216,9 +258,12 @@ def multi_input_output_model():
   return model
 
 
+# TODO(josh11b): Add combinations.one_device_strategy_gpu once it works with
+# TestDistributionStrategyWithCallbacks.test_callbacks_in_predict.
 strategies_minus_tpu = [
     combinations.default_strategy,
     combinations.one_device_strategy,
+    combinations.one_device_strategy_gpu,
     combinations.mirrored_strategy_with_gpu_and_cpu,
     combinations.mirrored_strategy_with_two_gpus,
     combinations.core_mirrored_strategy_with_gpu_and_cpu,
@@ -230,54 +275,56 @@ tpu_strategies = [
 
 
 def strategy_minus_tpu_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu,
-      mode=['graph', 'eager'])
+  return combinations.combine(distribution=strategies_minus_tpu,
+                              mode=['graph', 'eager'])
 
 
 def tpu_strategy_combinations():
-  return combinations.combine(
-      distribution=tpu_strategies,
-      mode=['graph'])
+  return combinations.combine(distribution=tpu_strategies,
+                              mode=['graph'])
 
 
 def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-def all_strategy_combinations_minus_default():
-  strategy_minus_default_combinations = combinations.combine(
+def all_strategy_minus_default_and_tpu_combinations():
+  return combinations.combine(
       distribution=[
           combinations.one_device_strategy,
+          combinations.one_device_strategy_gpu,
           combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
           combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph', 'eager'])
-  return strategy_minus_default_combinations + tpu_strategy_combinations()
 
 
-# TODO(priyag): Add v2 optimizers here.
+def all_strategy_combinations_minus_default():
+  return (all_strategy_minus_default_and_tpu_combinations() +
+          tpu_strategy_combinations())
+
+
 def strategy_and_optimizer_combinations():
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(
-          optimizer=[combinations.adagrad_optimizer_v1_fn,
-                     combinations.adam_optimizer_v1_fn,
-                     combinations.gradient_descent_optimizer_v1_fn,
-                     combinations.rmsprop_optimizer_v1_fn]))
-
-
-def strategy_for_numpy_input_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu + tpu_strategies,
-      mode=['graph'])
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
 
 
 class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
                                         parameterized.TestCase):
 
   def setUp(self):
+    super(TestEstimatorDistributionStrategy, self).setUp()
     self._base_dir = os.path.join(self.get_temp_dir(),
                                   'keras_mirrored_strategy_test')
     gfile.MakeDirs(self._base_dir)
@@ -285,6 +332,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
 
   def tearDown(self):
+    super(TestEstimatorDistributionStrategy, self).tearDown()
     writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
@@ -429,7 +477,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
     # that use per_core_batch_size
@@ -460,7 +508,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_63_samples, steps=None, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_no_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -506,7 +554,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
           distributed_training_utils.get_input_params(
               distribution, input_63_samples, steps=1, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_with_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -540,7 +588,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=None, batch_size=3)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_with_batch_size(self,
                                                                distribution):
     with self.cached_session():
@@ -557,7 +605,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=10, batch_size=13)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -588,7 +636,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         # with batch_size
         model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -640,7 +688,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
                 steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_flatten_predict_outputs(self, distribution):
     with self.cached_session():
       with distribution.scope():
@@ -665,6 +713,69 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       self.assertAllEqual([6, 7], outs[0].shape)
       self.assertAllEqual([6, 7], outs[1].shape)
 
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_with_partial_batch(self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+
+      with distribution.scope():
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss)
+
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss)
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+
+      # As sample size is 10, we batch by 4 so that the last batch is
+      # a partial batch. Also `predict()` using numpy array as inputs without
+      # distribution strategy uses entire sample as a single batch. As so,
+      # we remove parameters `batch_size` and `steps`.
+      predict_ground_truth = cpu_model.predict(inputs)
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      self.assertAllClose(
+          model_with_ds_strategy.predict(inputs, batch_size=4, steps=3),
+          predict_ground_truth,
+          atol=1e-5,
+          rtol=1e-5)
+      # Test that `steps` is inferred correctly when final partial batch exists.
+      self.assertAllClose(
+          model_with_ds_strategy.predict(inputs, batch_size=4),
+          predict_ground_truth,
+          atol=1e-5,
+          rtol=1e-5)
+
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_multi_output_model_with_partial_batch(
+      self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+
+      with distribution.scope():
+        model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
+        model_with_ds_strategy.compile(optimizer, loss)
+
+      cpu_model = simple_multi_inputs_multi_outputs_model()
+      cpu_model.compile(optimizer, loss)
+
+      input_data, _ = get_multi_inputs_multi_outputs_data()
+      input_dict = {
+          'input_a': input_data['input_a'],
+          'input_b': input_data['input_b'],
+      }
+
+      # As sample size is 200, we batch by 18 so that the last batch is
+      # a partial batch. Also `fit()` using numpy array as inputs without
+      # distribution strategy uses entire sample as a single batch. As so,
+      # we remove parameters `batch_size` and `steps`.
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      self.assertAllClose(
+          model_with_ds_strategy.predict(input_dict, batch_size=18, steps=12),
+          cpu_model.predict(input_dict),
+          atol=1e-4, rtol=1e-4)
+
 
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
@@ -722,7 +833,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       self.assertEqual(interleaved_output.history['val_loss'],
                        [x[0] for x in user_controlled_output])
-      self.assertEqual(interleaved_output.history['val_mean_absolute_error'],
+      val_mean_absolute_error = interleaved_output.history.get(
+          'val_mean_absolute_error')
+      if not val_mean_absolute_error:
+        # The name of the metric changed in TF2.0
+        val_mean_absolute_error = interleaved_output.history['val_mae']
+      self.assertEqual(val_mean_absolute_error,
                        [x[1] for x in user_controlled_output])
       self.assertEqual(interleaved_output.history['val_categorical_accuracy'],
                        [x[2] for x in user_controlled_output])
@@ -768,6 +884,95 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
+  @combinations.generate(all_strategy_combinations())
+  def test_fit_eval_and_predict_methods_on_dataset_without_steps(
+      self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((1000, 3), dtype=np.float32)
+      targets = np.zeros((1000, 4), dtype=np.float32)
+      # steps/steps_per_epoch are calculated when using numpy arrays as
+      # input data.
+      fit_with_numpy = model.fit(inputs, targets, epochs=1,
+                                 batch_size=10).history
+      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
+      predict_with_numpy = model.predict(inputs, batch_size=10)
+
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.batch(10, drop_remainder=True)
+      fit_with_ds = model.fit(dataset, epochs=1).history
+      eval_with_ds = model.evaluate(dataset)
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+      predict_dataset = predict_dataset.batch(10, drop_remainder=True)
+      predict_with_ds = model.predict(predict_dataset)
+      self.assertAllClose(
+          fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_on_dataset_with_unknown_cardinality_without_steps(
+      self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((1000, 3), dtype=np.float32)
+      targets = np.zeros((1000, 4), dtype=np.float32)
+      # steps/steps_per_epoch are calculated when using numpy arrays as
+      # input data.
+      fit_with_numpy = model.fit(inputs, targets, epochs=1,
+                                 batch_size=10).history
+      fit_with_numpy_multiple_epochs = model.fit(
+          inputs, targets, epochs=2, batch_size=10).history
+      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
+      predict_with_numpy = model.predict(inputs, batch_size=10)
+
+      dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs, targets)
+      predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs)
+
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          dataset)), cardinality.UNKNOWN)
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          predict_dataset)), cardinality.UNKNOWN)
+
+      eval_with_ds = model.evaluate(dataset)
+      predict_with_ds = model.predict(predict_dataset)
+      self.assertAllClose(
+          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
+
+      if (distributed_training_utils.is_tpu_strategy(distribution) and
+          distribution.extended.steps_per_run != 1):
+        with self.assertRaisesRegexp(ValueError, '`steps_per_epoch` '
+                                     'should be specified'):
+          fit_with_ds = model.fit(dataset, epochs=1)
+      else:
+        fit_with_ds = model.fit(dataset,
+                                epochs=1).history
+        fit_with_ds_multiple_epochs = model.fit(dataset,
+                                                epochs=2).history
+        self.assertAllClose(
+            fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
+        self.assertAllClose(
+            fit_with_numpy_multiple_epochs,
+            fit_with_ds_multiple_epochs, atol=1e-4, rtol=1e-4)
+
   @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
@@ -955,230 +1160,163 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
       self.assertAllClose(0.001, keras.backend.get_value(model.optimizer.lr))
 
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_with_dataset_with_partial_batch(self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
 
-class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+      with distribution.scope():
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss)
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2))
-      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      # Removed device and input tensor shape details from the error message
-      # since the order of the device and the corresponding input tensor shape
-      # is not deterministic over different runs.
-      with self.assertRaisesRegexp(ValueError,
-                                   'Input tensor shapes do not match for '
-                                   'distributed tensor inputs '
-                                   'DistributedValues:.+'):
-        with distribution.scope():
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss)
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
-      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
-      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
-      x = values.DistributedValues(device_map, (a, b))
-      y = values.DistributedValues(device_map, (a, a))
-      # Removed device and input tensor dtype details from the error message
-      # since the order of the device and the corresponding input tensor dtype
-      # is not deterministic over different runs.
-      with self.assertRaisesRegexp(ValueError,
-                                   'Input tensor dtypes do not match for '
-                                   'distributed tensor inputs '
-                                   'DistributedValues:.+'):
-        with distribution.scope():
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs))
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_unsupported_features(self, distribution):
+      # As sample size is 10, we batch by 4 so that the last batch is
+      # a partial batch.
+      dataset_with_partial_batch = dataset.batch(4)
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+
+      self.assertAllClose(
+          model_with_ds_strategy.predict(dataset_with_partial_batch, steps=3),
+          cpu_model.predict(dataset_with_partial_batch, steps=3),
+          atol=1e-5, rtol=1e-5)
+
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_multi_output_model_with_dataset_with_partial_batch(
+      self, distribution):
     with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+
       with distribution.scope():
-        model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-        loss = 'mse'
-        metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+        model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
+        model_with_ds_strategy.compile(optimizer, loss)
 
-      dataset = get_dataset(distribution)
+      cpu_model = simple_multi_inputs_multi_outputs_model()
+      cpu_model.compile(optimizer, loss)
 
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not '
-                      'supported when input `x` is a dataset or a '
-                      'dataset iterator.+'):
-        model.fit(dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          ValueError, '`sample_weight` argument is not supported when input '
-                      '`x` is a dataset or a dataset iterator.'):
-        model.fit(
-            dataset,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
-
-      # Test with not specifying the `steps` argument.
-      with self.assertRaisesRegexp(
-          ValueError, 'the `steps_per_epoch` argument'):
-        model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError, 'the `steps` argument'):
-        model.evaluate(dataset, verbose=0)
-
-      with self.assertRaisesRegexp(ValueError, 'the `steps` argument'):
-        model.predict(dataset, verbose=0)
+      input_data, _ = get_multi_inputs_multi_outputs_data()
+      input_dict = {
+          'input_a': input_data['input_a'],
+          'input_b': input_data['input_b'],
+      }
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
-    with self.cached_session():
-      with distribution.scope():
-        model = get_model()
-        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-        loss = 'mse'
-        metrics = ['mae']
-        model.compile(optimizer, loss, metrics=metrics)
+      dataset = dataset_ops.Dataset.from_tensor_slices(input_dict)
 
-      dataset = get_dataset(distribution)
+      # As sample size is 200, we batch by 18 using 12 steps per epoch so
+      # that the last batch is a partial batch.
+      dataset_with_partial_batch = dataset.batch(18)
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
 
-      def schedule(_):
-        return 0.001
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+      self.assertAllClose(
+          model_with_ds_strategy.predict(dataset_with_partial_batch, steps=12),
+          cpu_model.predict(dataset_with_partial_batch, steps=12),
+          atol=1e-4, rtol=1e-4)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
 
+class TestRegularizerLoss(test.TestCase, parameterized.TestCase):
+  class IdentityRegularizer(keras.regularizers.Regularizer):
 
-class TestDistributionStrategyWithLossMasking(test.TestCase,
-                                              parameterized.TestCase):
+    def __call__(self, x):
+      return array_ops.identity(x)
 
-  # TODO(priyag): Enable all strategies for this test. Currently it does not
-  # work for TPU due to some invalid datatype.
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_masking(self, distribution):
-    with self.cached_session():
-      np.random.seed(1337)
-      x = np.array([[[1], [1]], [[0], [0]]])
-      with distribution.scope():
-        model = keras.models.Sequential()
-        model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
-        model.add(
-            keras.layers.TimeDistributed(
-                keras.layers.Dense(1, kernel_initializer='one')))
-        model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
-      self.assertEqual(hist.history['loss'][0], 0)
+  class AddLayer(keras.layers.Layer):
 
+    def build(self, _):
+      self.v = self.add_weight(
+          'v', (), initializer='ones',
+          regularizer=TestRegularizerLoss.IdentityRegularizer())
 
-class TestDistributionStrategyWithNormalizationLayer(
-    test.TestCase, parameterized.TestCase):
+    def call(self, inputs):
+      return inputs + self.v
 
-  @combinations.generate(combinations.times(
-      all_strategy_combinations(),
-      combinations.combine(fused=[True, False])))
-  def test_batchnorm_correctness(self, distribution, fused):
-    with self.cached_session():
-      with distribution.scope():
-        model = keras.models.Sequential()
-        norm = keras.layers.BatchNormalization(
-            input_shape=(10,), momentum=0.8, fused=fused)
-        model.add(norm)
-        model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
-      x = x.astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
-      dataset = dataset.repeat(100)
-      dataset = batch_wrapper(dataset, 32, distribution)
+  @staticmethod
+  def loss_fn(_, y_pred):
+    return math_ops.reduce_mean(y_pred)
+
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_regularizer_loss(self, distribution):
+    batch_size = 2
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      batch_size //= distribution.num_replicas_in_sync
+
+      # Given an input x, which is always 1, and variable v, this model computes
+      # Loss=x+v+regularizer_loss, where regularizer_loss=v and the variable is
+      # initialized to 1. Therefore, this model computes Loss=1+2v, and so the
+      # gradient dLoss/dv = 2. This gradient of 2 is averaged over all examples
+      # in a batch and then multiplied by the learning rate of 1. As a result,
+      # the model update for one batch should subtract 2 from v, resulting in v
+      # being -1. If the regularizer loss is not scaled correctly by number of
+      # replicas, the variable value will be incorrect when number of replicas
+      # >1. For e.g. it will be -2 if num replicas = 2.
+    with distribution.scope():
+      x = keras.layers.Input(shape=(), batch_size=batch_size)
+      y = TestRegularizerLoss.AddLayer()(x)
+      model = keras.models.Model(inputs=x, outputs=y)
+      opt = gradient_descent_keras.SGD(1.)
+      model.compile(opt, loss=TestRegularizerLoss.loss_fn)
+      model.fit(
+          x=np.array([[1.], [1.]], dtype=np.float32),
+          y=np.array([[1.], [1.]], dtype=np.float32),
+          batch_size=batch_size)
+      v = model.get_weights()[0]
+      self.assertEqual(-1.0, v)
+
+
+class TestDistributionStrategyWithKerasModels(test.TestCase,
+                                              parameterized.TestCase):
 
-      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
-      predict_dataset = predict_dataset.repeat(100)
-      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)
+  @combinations.generate(all_strategy_combinations())
+  def test_distribution_strategy_on_sequential_model(self, distribution):
+    with distribution.scope():
+      model = simple_sequential_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
-      out = model.predict(predict_dataset, steps=2)
-      out -= keras.backend.eval(norm.beta)
-      out /= keras.backend.eval(norm.gamma)
-      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+      inputs = np.zeros((20, 10), np.float32)
+      targets = np.zeros((20, 2), np.float32)
 
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
-class TestDistributionStrategyValidation(test.TestCase,
-                                         parameterized.TestCase):
+  @combinations.generate(all_strategy_combinations())
+  def test_distribution_strategy_on_functional_model(self, distribution):
+    with distribution.scope():
+      model = get_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-  @combinations.generate(all_strategy_combinations_minus_default())
-  def test_layer_outside_scope(self, distribution):
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'was not created in the distribution strategy'):
-        x = keras.layers.Input(shape=(3,), name='input')
-        y = keras.layers.Dense(4, name='dense')(x)
-        with distribution.scope():
-          model = keras.Model(x, y)
-          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-          loss = 'mse'
-          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
+
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
   @combinations.generate(all_strategy_combinations_minus_default())
-  def test_model_outside_scope(self, distribution):
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'was not created in the distribution strategy'):
-        x = keras.layers.Input(shape=(3,), name='input')
-        y = keras.layers.Dense(4, name='dense')(x)
-        model = keras.Model(x, y)
-        with distribution.scope():
-          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-          loss = 'mse'
-          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-          model.compile(optimizer, loss, metrics=metrics)
+  def test_distribution_strategy_one_dimensional(self, distribution):
+    with distribution.scope():
+      inp = keras.layers.Input(shape=(10,))
+      out = keras.layers.Dense(3, activation='softmax')(inp)
+      model = keras.Model(inputs=[inp], outputs=[out])
+      model.compile(
+          optimizer='rmsprop',
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'],
+      )
+
+      x = np.random.random((64, 10)).astype('float32')
+      y = np.random.randint(3, size=64)
+
+      model.fit(x, y, epochs=1, steps_per_epoch=2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/keras_utils_test.py b/tensorflow/contrib/distribute/python/keras_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..da17722f4be46479d2614d81fb6b9af2f7e77e77
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_utils_test.py
@@ -0,0 +1,572 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.keras models with callbacks, checkpointing with dist strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_test as keras_test_lib
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rms_prop_keras
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import gradient_descent
+
+
+class Counter(keras.callbacks.Callback):
+  """Counts the number of times each callback method was run.
+
+  Attributes:
+    method_counts: dict. Contains the counts of time  each callback method was
+      run.
+  """
+
+  def __init__(self):
+    self.method_counts = collections.defaultdict(int)
+    methods_to_count = [
+        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
+        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
+        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
+        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
+        'on_train_batch_end', 'on_train_begin', 'on_train_end'
+    ]
+    for method_name in methods_to_count:
+      setattr(self, method_name,
+              self.wrap_with_counts(method_name, getattr(self, method_name)))
+
+  def wrap_with_counts(self, method_name, method):
+
+    def _call_and_count(*args, **kwargs):
+      self.method_counts[method_name] += 1
+      return method(*args, **kwargs)
+
+    return _call_and_count
+
+
+class TestDistributionStrategyWithCallbacks(test.TestCase,
+                                            parameterized.TestCase):
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_fit(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    epochs = 2
+    steps_per_epoch = 5
+    validation_steps = 3
+
+    model.fit(
+        dataset,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        verbose=0,
+        validation_data=dataset,
+        validation_steps=validation_steps,
+        callbacks=[counter])
+
+    if isinstance(distribution, tpu_strategy.TPUStrategy):
+      # TPU Strategy can have multi step training, from extended.steps_per_run
+      # if steps_per_run = 1, then num_batch_call_per_epoch = steps_per_epoch
+      steps_per_run = distribution.extended.steps_per_run
+      num_batch_call_per_epoch = steps_per_epoch // steps_per_run
+      if steps_per_epoch % steps_per_run:
+        num_batch_call_per_epoch += 1
+    else:
+      num_batch_call_per_epoch = steps_per_epoch
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_batch_end': epochs * num_batch_call_per_epoch,
+            'on_epoch_begin': epochs,
+            'on_epoch_end': epochs,
+            'on_test_batch_begin': epochs * validation_steps,
+            'on_test_batch_end': epochs * validation_steps,
+            'on_test_begin': epochs,
+            'on_test_end': epochs,
+            'on_train_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_train_batch_end': epochs * num_batch_call_per_epoch,
+            'on_train_begin': 1,
+            'on_train_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_eval(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.evaluate(dataset, steps=5, callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_test_batch_begin': 5,
+            'on_test_batch_end': 5,
+            'on_test_begin': 1,
+            'on_test_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_predict(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.predict(
+        keras_test_lib.get_predict_dataset(dataset),
+        steps=5,
+        callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_predict_batch_begin': 5,
+            'on_predict_batch_end': 5,
+            'on_predict_begin': 1,
+            'on_predict_end': 1
+        })
+
+
+class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2))
+      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor shape details from the error message
+      # since the order of the device and the corresponding input tensor shape
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor shapes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
+      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor dtype details from the error message
+      # since the order of the device and the corresponding input tensor dtype
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor dtypes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      # Test with validation split
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not '
+          'supported when input `x` is a dataset or a '
+          'dataset iterator.+'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.5,
+            validation_steps=2)
+
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          ValueError, '`sample_weight` argument is not supported when input '
+          '`x` is a dataset or a dataset iterator.'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
+      # Test with not specifying the `steps` argument for dataset with infinite
+      # cardinality.
+      dataset = dataset.repeat()
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps_per_epoch` argument'):
+        model.fit(dataset, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.evaluate(dataset, verbose=0)
+
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.predict(dataset, verbose=0)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.ReduceLROnPlateau()])
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[combinations.one_device_strategy], mode=['graph']))
+  def test_distribution_strategy_with_add_metric_add_loss(self, distribution):
+    with distribution.scope():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+
+      err_msg = (
+          'We currently do not support compiling the model with distribution '
+          r'strategy if `model.add_loss\(tensor\)` or '
+          r'`model.add_metric\(tensor\)` has been called.')
+
+      # Test with add_metric.
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+      with self.assertRaisesRegex(ValueError, err_msg):
+        model.compile('sgd',)
+
+      # Test with add_loss.
+      model = keras.models.Model(x, y)
+      model.add_loss(math_ops.reduce_mean(y))
+      with self.assertRaisesRegex(ValueError, err_msg):
+        model.compile('sgd',)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[combinations.one_device_strategy], mode=['eager']))
+  def test_distribution_strategy_with_run_eagerly(self, distribution):
+    with distribution.scope():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+
+      err_msg = ('We currently do not support enabling `run_eagerly` with '
+                 'distribution strategy.')
+      with self.assertRaisesRegex(ValueError, err_msg):
+        model.compile('sgd', run_eagerly=True)
+
+  # TODO(b/124377929): Remove error assertions once subclassed models
+  # are supported in DistributedStrategy.
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_distribution_strategy_on_subclassed_model(self, distribution):
+    with distribution.scope():
+
+      class _SimpleMLP(keras.Model):
+
+        def __init__(self, num_labels):
+          super(_SimpleMLP, self).__init__()
+          self.dense = keras.layers.Dense(num_labels)
+
+        def call(self, inputs):
+          return self.dense(inputs)
+
+      model = _SimpleMLP(3)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We currently do not support distribution strategy with a '
+          '`Sequential` model that is created without '
+          '`input_shape`/`input_dim` set in its first layer or '
+          'a subclassed model.'):
+        model.compile('sgd')
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_distribution_strategy_on_deferred_sequential_model(
+      self, distribution):
+    with distribution.scope():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(16, activation='relu'))
+      model.add(keras.layers.Dense(3, activation='softmax'))
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We currently do not support distribution strategy with a '
+          '`Sequential` model that is created without '
+          '`input_shape`/`input_dim` set in its first layer or '
+          'a subclassed model.'):
+        model.compile('sgd')
+
+
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
+
+  # TODO(priyag): Enable all strategies for this test. Currently it does not
+  # work for TPU due to some invalid datatype.
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_masking(self, distribution):
+    with self.cached_session():
+      np.random.seed(1337)
+      x = np.array([[[1], [1]], [[0], [0]]])
+      with distribution.scope():
+        model = keras.models.Sequential()
+        model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+        model.add(
+            keras.layers.TimeDistributed(
+                keras.layers.Dense(1, kernel_initializer='one')))
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+      y = np.array([[[1], [1]], [[1], [1]]])
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
+      self.assertEqual(hist.history['loss'][0], 0)
+
+
+class TestDistributionStrategyWithNormalizationLayer(test.TestCase,
+                                                     parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(fused=[True, False])))
+  def test_batchnorm_correctness(self, distribution, fused):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras.models.Sequential()
+        norm = keras.layers.BatchNormalization(
+            input_shape=(
+                10,
+                20,
+                30,
+            ), momentum=0.8, fused=fused)
+        model.add(norm)
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 20, 30))
+      x = x.astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
+      dataset = dataset.repeat(100)
+      dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution)
+
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
+      predict_dataset = predict_dataset.repeat(100)
+      predict_dataset = keras_test_lib.batch_wrapper(predict_dataset, 32,
+                                                     distribution)
+
+      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
+      out = model.predict(predict_dataset, steps=2)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class TestDistributionStrategySaveLoadWeights(test.TestCase,
+                                              parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_h5(self, distribution):
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp('.h5')
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_trackable(self, distribution):
+    # TODO(b/123533246): Enable the test for TPU once bug is fixed
+    if (isinstance(distribution, tpu_strategy.TPUStrategy) and
+        distribution.extended.steps_per_run > 1):
+      self.skipTest('MultiStep TPU Strategy deadlocks with optimizer restore.')
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp()
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+
+class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_layer_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        with distribution.scope():
+          model = keras.Model(x, y)
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_model_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        model = keras.Model(x, y)
+        with distribution.scope():
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index f06c9b75644b2890b7657f75e74e4e20a6f15705..16541c7a1e342f5636e238a301d0946d3e0c4bc4 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -220,7 +220,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       def step_fn(ctx, inputs):
         del ctx  # Unused
-        fetches = distribution.unwrap(
+        fetches = distribution.experimental_local_results(
             distribution.extended.call_for_each_replica(
                 model_fn, args=(inputs,)))
         if update_ops_in_cross_replica_mode:
@@ -418,13 +418,15 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         # tensors. But for non reduced losses, we need to have initial
         # values that are of the same structure as non reduced losses. In
         # MirroredStrategy, this will be a list of losses, in TPUStrategy
-        # it will be single tensor. Using `broadcast` followed by `unwrap`
-        # gives us the desired initial value structure.
+        # it will be single tensor. Using `call_for_each_replica` followed
+        # by `experimental_local_results` gives us the desired initial
+        # value structure.
+        not_reduced = distribution.experimental_local_results(
+            distribution.extended.call_for_each_replica(initial_loss))
         initial_loop_values = {
             "replica_loss_reduced": initial_loss(),
             "cross_replica_loss_reduced": initial_loss(),
-            "cross_replica_loss_not_reduced":
-            distribution.unwrap(distribution.broadcast(initial_loss()))
+            "cross_replica_loss_not_reduced": not_reduced,
         }
         ctx = distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=2,
@@ -468,11 +470,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
   def _verify_loss_output(self, initial_loss, loss_output, reduced,
                           distribution):
     if not reduced:
-      self.assertLen(distribution.unwrap(loss_output),
+      self.assertLen(distribution.experimental_local_results(loss_output),
                      distribution.num_replicas_in_sync)
       loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output)
     else:
-      unwrapped_output = distribution.unwrap(loss_output)
+      unwrapped_output = distribution.experimental_local_results(loss_output)
       self.assertLen(unwrapped_output, 1)
       loss_tensor = unwrapped_output[0]
     self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 2e23a51ee56ed1388a4387a51342aabce6d24bed..5391e083fc9b3ed99cc64bbed11bdeb8dea07f93 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import mirrored_strategy
@@ -192,16 +190,6 @@ class MirroredExtended(CoreMirroredExtended):
     """
     return input_lib.DatasetIterator(dataset, self._input_workers)
 
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      return input_lib.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._input_workers, 0)
-    else:
-      return input_lib.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._input_workers,
-          auto_shard=self._auto_shard_dataset)
-
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index d6337d106fced921b8bda0a2faac99c2a77fab8e..8a1772b7f225bce18a96876f2585eb120c71a979 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import json
 import sys
 
 from absl.testing import parameterized
@@ -29,17 +30,19 @@ from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.keras.layers import core as keras_core
@@ -103,7 +106,7 @@ class MirroredTwoDeviceDistributionTest(
       expected = sum(range(distribution.num_replicas_in_sync))
       self.assertEqual(expected, self.evaluate(reduced))
 
-  def testMakeInputFnIterator(self, distribution):
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
     expected_values = [[i, i+1] for i in range(0, 10, 2)]
 
@@ -116,12 +119,33 @@ class MirroredTwoDeviceDistributionTest(
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+    def fn():
+      dataset = dataset_ops.Dataset.range(2).interleave(
+          (lambda _: dataset_ops.Dataset.range(10)), cycle_length=2)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    expected_values = [[i, i] for i in range(0, 10)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        fn,
+        expected_num_replicas_in_sync=2,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
+                                 expected_values, test_reinitialize=False)
+
   def testNumpyIterator(self, distribution):
     self._test_numpy_iterator(distribution)
 
   def testGlobalStepUpdate(self, distribution):
     self._test_global_step_update(distribution)
 
+  def testRun(self, distribution):
+    self._test_run(distribution)
+
   def testAllReduceSum(self, distribution):
     self._test_all_reduce_sum(distribution)
 
@@ -140,6 +164,9 @@ class MirroredTwoDeviceDistributionTest(
   def testAllReduceMeanGradientTape(self, distribution):
     self._test_all_reduce_mean_gradient_tape(distribution)
 
+  def testSummaryForReplicaZeroOnly(self, distribution):
+    self._test_summary_for_replica_zero_only(distribution)
+
 
 def one_device_combinations():
   return combinations.combine(
@@ -169,6 +196,9 @@ class MirroredOneDeviceDistributionTest(
   def testCallAndMergeExceptions(self, distribution):
     self._test_call_and_merge_exceptions(distribution)
 
+  def testRun(self, distribution):
+    self._test_run(distribution)
+
   def testAllReduceSum(self, distribution):
     self._test_all_reduce_sum(distribution)
 
@@ -219,7 +249,7 @@ class MirroredStrategyVariableCreatorStackTest(
         distribution.scope(), \
         variable_scope.variable_creator_scope(main_thread_creator):
       result = distribution.extended.call_for_each_replica(model_fn)
-      result = distribution.unwrap(result)
+      result = distribution.experimental_local_results(result)
       expected = ("main_thread:thread_0", "main_thread:thread_1")
       self.assertEqual(expected, result)
 
@@ -239,7 +269,7 @@ class MirroredStrategyCallForEachReplicaTest(test.TestCase):
     with distribution.scope():
       in_scope = ops.executing_eagerly_outside_functions()
       in_model_fn = distribution.extended.call_for_each_replica(model_fn)
-      unwrapped = distribution.unwrap(in_model_fn)
+      unwrapped = distribution.experimental_local_results(in_model_fn)
       self.assertEqual(in_scope, unwrapped[0])
       self.assertEqual(in_scope, originally)
 
@@ -247,10 +277,32 @@ class MirroredStrategyCallForEachReplicaTest(test.TestCase):
     with func_graph.FuncGraph("fg").as_default(), distribution.scope():
       in_scope = ops.executing_eagerly_outside_functions()
       in_model_fn = distribution.extended.call_for_each_replica(model_fn)
-      unwrapped = distribution.unwrap(in_model_fn)
+      unwrapped = distribution.experimental_local_results(in_model_fn)
       self.assertEqual(in_scope, unwrapped[0])
       self.assertEqual(in_scope, originally)
 
+  def testFunctionInCallForEachReplicaNoMergeCall(self, distribution):
+    @def_function.function
+    def model_fn():
+      return 0.
+
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual((0., 0.), self.evaluate(result.values))
+
+  def testFunctionInCallForEachReplicaWithMergeCall(self, distribution):
+    def merge_fn(_):
+      pass
+
+    @def_function.function
+    def model_fn():
+      ds_context.get_replica_context().merge_call(merge_fn)
+      return 0.
+
+    with distribution.scope():
+      with self.assertRaisesRegexp(
+          RuntimeError, "`merge_call` called while defining a new graph."):
+        distribution.extended.call_for_each_replica(model_fn)
 
 @combinations.generate(combinations.combine(
     distribution=[
@@ -412,7 +464,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       self.assertEqual("var0:0", v0.name)
       self.assertIsInstance(v1, values.MirroredVariable)
       self.assertEqual("common/var1:0", v1.name)
-      self.assertIsInstance(v2, values.ReplicaLocalVariable)
+      self.assertIsInstance(v2, values.SyncOnReadVariable)
       self.assertEqual("common/var2:0", v2.name)
       self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation)
       self.assertIsInstance(v3, values.MirroredVariable)
@@ -449,7 +501,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         self.assertEqual("main/var0:0", v0.name)
         self.assertIsInstance(v1, values.MirroredVariable)
         self.assertEqual("main/common/var1:0", v1.name)
-        self.assertIsInstance(v2, values.ReplicaLocalVariable)
+        self.assertIsInstance(v2, values.SyncOnReadVariable)
         self.assertEqual("main/common/var2:0", v2.name)
         self.assertEqual(variable_scope.VariableAggregation.SUM,
                          v2.aggregation)
@@ -589,6 +641,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             aggregation="invalid")
 
   def testNonMatchingVariableCreation(self, distribution):
+    self.skipTest("b/123075960")
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
       ds_context.get_replica_context().merge_call(lambda _: _)
@@ -600,7 +653,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       with self.assertRaises(RuntimeError):
         _ = distribution.extended.call_for_each_replica(model_fn, args=(names,))
 
-  def testReplicaLocalVariable(self, distribution):
+  def testSyncOnReadVariable(self, distribution):
     all_v_sum = {}
     all_v_mean = {}
     components_sum = {}
@@ -616,8 +669,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           4.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
           aggregation=variable_scope.VariableAggregation.MEAN)
-      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
-      self.assertTrue(isinstance(v_mean, values.ReplicaLocalVariable))
+      self.assertIsInstance(v_sum, values.SyncOnReadVariable)
+      self.assertIsInstance(v_mean, values.SyncOnReadVariable)
       updates = [v_sum.assign_add(2.0 + replica_id),
                  v_mean.assign(6.0 * replica_id)]
       all_v_sum[replica_id] = v_sum
@@ -631,7 +684,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       return updates, v_sum, v_mean, c_sum, c_mean
 
     with distribution.scope():
-      # Create "sum" and "mean" versions of ReplicaLocalVariables.
+      # Create "sum" and "mean" versions of SyncOnReadVariables.
       ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = (
           distribution.extended.call_for_each_replica(model_fn))
       # Should see the same wrapping instance in all replicas.
@@ -648,7 +701,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       # Apply updates
       self.evaluate(variables.global_variables_initializer())
-      self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)])
+      self.evaluate([y for x in ret_ops  # pylint: disable=g-complex-comprehension
+                     for y in distribution.experimental_local_results(x)])
       expected_sum = 0.0
       expected_mean = 0.0
       for i, d in enumerate(distribution.extended.worker_devices):
@@ -694,16 +748,16 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       self.assertEqual(2, len(result))
       for v in result:
         self.assertIsInstance(v, values.DistributedValues)
-        _, v1 = distribution.unwrap(v)
+        _, v1 = distribution.experimental_local_results(v)
         self.assertStartsWith(v1._op.name, "replica_1/")
 
-  def testReplicaLocalVariableUpdate(self, distribution):
+  def testSyncOnReadVariableUpdate(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
           aggregation=variable_scope.VariableAggregation.SUM)
-      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+      self.assertIsInstance(v_sum, values.SyncOnReadVariable)
       return v_sum
 
     def update(var, value):
@@ -714,7 +768,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       # Initialize variables.
       self.evaluate(variables.global_variables_initializer())
-      # Assert that the aggregated value of the replica local vars is the sum
+      # Assert that the aggregated value of the sync on read var is the sum
       # of the individual values before running the update ops.
       self.assertEqual(1.0, self.evaluate(ret_v_sum.get(
           distribution.extended.worker_devices[0]).read_value()))
@@ -724,7 +778,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       update_ops = distribution.extended.update(
           ret_v_sum, update, args=(5.0,), group=False)
       self.evaluate(update_ops)
-      # Assert that the aggregated value of the replica local vars is the sum
+      # Assert that the aggregated value of the sync on read vars is the sum
       # of the individual values after running the update ops.
       self.assertEqual(5.0, self.evaluate(ret_v_sum.get(
           distribution.extended.worker_devices[0]).read_value()))
@@ -733,11 +787,11 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
   def testVarDistributeStrategy(self, distribution):
     with distribution.scope():
       mirrored = variable_scope.variable(1.0)
-      replica_local = variable_scope.variable(
+      sync_on_read = variable_scope.variable(
           1.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ)
       self.assertIs(distribution, mirrored.distribute_strategy)
-      self.assertIs(distribution, replica_local.distribute_strategy)
+      self.assertIs(distribution, sync_on_read.distribute_strategy)
 
 
 @combinations.generate(combinations.combine(
@@ -763,7 +817,7 @@ class MirroredStrategyNameScopeTest(test.TestCase):
         self.assertEqual(2, len(result))
         for v, name in zip(result, ["a", "b"]):
           self.assertIsInstance(v, values.DistributedValues)
-          v0, v1 = distribution.unwrap(v)
+          v0, v1 = distribution.experimental_local_results(v)
           self.assertEqual("main/foo/" + name + ":0", v0.name)
           self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name)
 
@@ -780,7 +834,7 @@ class MirroredStrategyNameScopeTest(test.TestCase):
       self.assertEqual(2, len(result))
       for v, name in zip(result, ["a", "b"]):
         self.assertIsInstance(v, values.DistributedValues)
-        v0, v1 = distribution.unwrap(v)
+        v0, v1 = distribution.experimental_local_results(v)
         self.assertEqual("foo/" + name + ":0", v0.name)
         self.assertEqual("replica_1/foo/" + name + ":0", v1.name)
 
@@ -807,9 +861,9 @@ class MirroredStrategyNameScopeTest(test.TestCase):
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = distribution.unwrap(a)
-      b0, b1 = distribution.unwrap(result_b)
-      c0, c1 = distribution.unwrap(result_c)
+      a0, a1 = distribution.experimental_local_results(a)
+      b0, b1 = distribution.experimental_local_results(result_b)
+      c0, c1 = distribution.experimental_local_results(result_c)
       self.assertEqual("main/a:0", a0.name)
       self.assertEqual("main/a/replica_1:0", a1.name)
       self.assertEqual("main/b:0", b0.name)
@@ -836,9 +890,9 @@ class MirroredStrategyNameScopeTest(test.TestCase):
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = distribution.unwrap(a)
-      b0, b1 = distribution.unwrap(result_b)
-      c0, c1 = distribution.unwrap(result_c)
+      a0, a1 = distribution.experimental_local_results(a)
+      b0, b1 = distribution.experimental_local_results(result_b)
+      c0, c1 = distribution.experimental_local_results(result_c)
       self.assertEqual("a:0", a0.name)
       self.assertEqual("a/replica_1:0", a1.name)
       self.assertEqual("b:0", b0.name)
@@ -908,7 +962,7 @@ class MirroredVariableUpdateTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError, "You must specify an aggregation method to update a "
                       "MirroredVariable in Replica Context."):
-        self.evaluate(distribution.unwrap(
+        self.evaluate(distribution.experimental_local_results(
             distribution.extended.call_for_each_replica(model_fn)))
 
   def testAssignMirroredVarReplicaContextWithSum(self, distribution):
@@ -930,7 +984,7 @@ class MirroredVariableUpdateTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError, "A non-DistributedValues value 5.0 cannot be reduced "
           "with the given reduce op ReduceOp.SUM."):
-        self.evaluate(distribution.unwrap(
+        self.evaluate(distribution.experimental_local_results(
             distribution.extended.call_for_each_replica(model_fn)))
 
   def testAssignMirroredVarCrossDeviceContext(self, distribution):
@@ -962,7 +1016,7 @@ class MirroredVariableUpdateTest(test.TestCase):
             mirrored_var.dtype)
         return mirrored_var.assign(value)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(0.5, self.evaluate(mirrored_var))
 
@@ -980,7 +1034,7 @@ class MirroredVariableUpdateTest(test.TestCase):
       def model_fn():
         return mirrored_var.assign(5.0)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(5.0, self.evaluate(mirrored_var))
 
@@ -1023,7 +1077,7 @@ class MirroredVariableUpdateTest(test.TestCase):
             mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(1.5, self.evaluate(mirrored_var))
 
@@ -1041,7 +1095,7 @@ class MirroredVariableUpdateTest(test.TestCase):
       def model_fn():
         return mirrored_var.assign_add(5.0)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(6.0, self.evaluate(mirrored_var))
 
@@ -1076,7 +1130,7 @@ class MirroredVariableUpdateTest(test.TestCase):
             mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(4.5, self.evaluate(mirrored_var))
 
@@ -1094,7 +1148,7 @@ class MirroredVariableUpdateTest(test.TestCase):
       def model_fn():
         return mirrored_var.assign_sub(1.0)
 
-      self.evaluate(distribution.unwrap(
+      self.evaluate(distribution.experimental_local_results(
           distribution.extended.call_for_each_replica(model_fn)))
       self.assertEqual(4.0, self.evaluate(mirrored_var))
 
@@ -1104,7 +1158,7 @@ class MirroredVariableUpdateTest(test.TestCase):
         combinations.mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_gpu_and_cpu],
     mode=["graph", "eager"]))
-class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
+class MirroredAndSyncOnReadVariableInitializerTest(test.TestCase):
 
   def testAssignMirroredVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
@@ -1130,17 +1184,16 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
             1.0,
             synchronization=variable_scope.VariableSynchronization.ON_READ,
             aggregation=variable_scope.VariableAggregation.SUM)
-        self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+        self.assertIsInstance(v_sum, values.SyncOnReadVariable)
         return v_sum
 
       with distribution.scope():
-        replica_local_var = distribution.extended.call_for_each_replica(
+        sync_on_read_var = distribution.extended.call_for_each_replica(
             model_fn)
-        self.assertTrue(isinstance(replica_local_var,
-                                   values.ReplicaLocalVariable))
-        self.assertFalse(self.evaluate(replica_local_var.is_initialized()))
-        self.evaluate(replica_local_var.initializer)
-        self.assertTrue(self.evaluate(replica_local_var.is_initialized()))
+        self.assertIsInstance(sync_on_read_var, values.SyncOnReadVariable)
+        self.assertFalse(self.evaluate(sync_on_read_var.is_initialized()))
+        self.evaluate(sync_on_read_var.initializer)
+        self.assertTrue(self.evaluate(sync_on_read_var.is_initialized()))
 
 
 @combinations.generate(combinations.combine(
@@ -1148,7 +1201,7 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         combinations.mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_gpu_and_cpu],
     mode=["graph", "eager"]))
-class ReplicaLocalVariableAssignTest(test.TestCase):
+class SyncOnReadVariableAssignTest(test.TestCase):
 
   def testAssignReplicaLocalVarSumAggregation(self, distribution):
     def model_fn():
@@ -1159,24 +1212,23 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       return v_sum
 
     with distribution.scope():
-      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
-      self.assertTrue(isinstance(replica_local_var,
-                                 values.ReplicaLocalVariable))
+      sync_on_read_var = distribution.extended.call_for_each_replica(model_fn)
+      self.assertIsInstance(sync_on_read_var, values.SyncOnReadVariable)
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the SUM of each of
       # values on each of the replicas.
       self.assertEqual(2.0, self.evaluate(
-          distribution.extended.read_var(replica_local_var)))
+          distribution.extended.read_var(sync_on_read_var)))
       # Assigning 6.0 in cross replica context will assign a value of
       # 6.0/num_replicas to each replica.
-      tlv_ops = replica_local_var.assign(6.0)
+      tlv_ops = sync_on_read_var.assign(6.0)
       self.evaluate(tlv_ops)
-      # On reading the replica local var we should get the assigned value back.
+      # On reading the sync on read var we should get the assigned value back.
       # The value on all the replicas are added before being returned by
       # `read_var`.
       self.assertEqual(6.0, self.evaluate(
-          distribution.extended.read_var(replica_local_var)))
+          distribution.extended.read_var(sync_on_read_var)))
 
   def testAssignReplicaLocalVarMeanAggregation(self, distribution):
     def model_fn():
@@ -1187,21 +1239,20 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       return v_sum
 
     with distribution.scope():
-      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
-      self.assertTrue(isinstance(replica_local_var,
-                                 values.ReplicaLocalVariable))
+      sync_on_read_var = distribution.extended.call_for_each_replica(model_fn)
+      self.assertIsInstance(sync_on_read_var, values.SyncOnReadVariable)
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the MEAN of values
       # on all replicas which is the value assigned in replica context.
       self.assertEqual(1.0, self.evaluate(
-          distribution.extended.read_var(replica_local_var)))
-      tlv_ops = replica_local_var.assign(6.0)
+          distribution.extended.read_var(sync_on_read_var)))
+      tlv_ops = sync_on_read_var.assign(6.0)
       self.evaluate(tlv_ops)
-      # On reading the replica local var we should get the MEAN of all values
+      # On reading the sync on read var we should get the MEAN of all values
       # which is equal to the value assigned.
       self.assertEqual(6.0, self.evaluate(
-          distribution.extended.read_var(replica_local_var)))
+          distribution.extended.read_var(sync_on_read_var)))
 
 
 class MockModel(object):
@@ -1416,7 +1467,7 @@ class MultiWorkerMirroredStrategyTest(
       self.assertEqual(a.device, "/job:worker/task:0")
       self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
 
-  def testMakeInputFnIterator(self, distribution):
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     self._configure_distribution_strategy(distribution)
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
     num_gpus = context.num_gpus()
@@ -1437,6 +1488,32 @@ class MultiWorkerMirroredStrategyTest(
       self._test_input_fn_iterator(
           iterator, distribution.extended.worker_devices, expected_values, sess)
 
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    def fn():
+      dataset = dataset_ops.Dataset.range(100)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    num_gpus = context.num_gpus()
+    num_workers = 2
+
+    expected_values = []
+    for i in range(0, 100, num_gpus):
+      expected_values.append([i+j for j in range(num_gpus)] * num_workers)
+
+    with context.graph_mode(), self.cached_session() as sess:
+      # `expected_input_pipeline_id` is None because the input_fn will be called
+      # multiple times, each with a different input_pipeline_id.
+      input_fn = self._input_fn_to_test_input_context(
+          fn,
+          expected_num_replicas_in_sync=num_workers*num_gpus,
+          expected_num_input_pipelines=num_workers,
+          expected_input_pipeline_id=None)
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      self._test_input_fn_iterator(
+          iterator, distribution.extended.worker_devices, expected_values, sess,
+          test_reinitialize=False)
+
   def testUpdateConfigProto(self, distribution):
     distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]})
 
@@ -1470,6 +1547,31 @@ class MultiWorkerMirroredStrategyTestWithChief(
     strategy.configure(cluster_spec=self._cluster_spec)
     self._test_minimize_loss_graph(strategy, learning_rate=0.05)
 
+  def testMinimizeLossGraphCoreMirroredStrategyWithOneNode(self):
+    cluster_spec = {}
+    cluster_spec["chief"] = self._cluster_spec["chief"]
+    tf_config = {"cluster": cluster_spec}
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(tf_config)}):
+      strategy = mirrored_strategy.CoreMirroredStrategy()
+      self.assertIsInstance(strategy.extended._inferred_cross_device_ops,
+                            cross_device_ops_lib.NcclAllReduce)
+    self._test_minimize_loss_graph(strategy, learning_rate=0.05)
+
+  def testInitializeFromTFConfig(self):
+    tf_config = {"cluster": self._cluster_spec}
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(tf_config)}):
+      strategy = mirrored_strategy.CoreMirroredStrategy()
+      self.assertEqual(
+          max(context.num_gpus(), 1) * 3, strategy.num_replicas_in_sync)
+
+  def testSummaryForReplicaZeroOnly(self):
+    strategy = mirrored_strategy.CoreMirroredStrategy(
+        mirrored_strategy.all_local_devices())
+    strategy.configure(cluster_spec=self._cluster_spec)
+    self._test_summary_for_replica_zero_only(strategy)
+
 
 def _replica_id():
   replica_id = ds_context.get_replica_context().replica_id_in_sync_group
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
index 16be839e1d155003b9490fbe3da6ab85b7d2d78a..c0651610cafc06a6d5f4206f4e64d27020fae30b 100644
--- a/tensorflow/contrib/distribute/python/monitor_test.py
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -23,9 +23,9 @@ import numpy
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import monitor as monitor_lib
-from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
 from tensorflow.python.client import session
+from tensorflow.python.distribute import one_device_strategy
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index c4622cdd2af2f6a9c936fe554bcc2eb76f805fdc..23f976f8fa4705578da2be414c05f1f14e13c9fa 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -56,7 +56,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
-      sess.run(distribution.unwrap(assign))
+      sess.run(distribution.experimental_local_results(assign))
       # Mean of val across calls to replica_fn().
       average_val = [1.0 + 0.5 * (replica_id[0] - 1),
                      2.0 - 0.5 * (replica_id[0] - 1)]
@@ -82,7 +82,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       var, assign_op = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([0.0, 0.0], var.eval())
-      sess.run(distribution.unwrap(assign_op))
+      sess.run(distribution.experimental_local_results(assign_op))
       # Mean of val across calls to replica_fn().
       average_val = [1.0 + 0.5 * (replica_id[0] - 1),
                      2.0 - 0.5 * (replica_id[0] - 1)]
@@ -155,7 +155,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
-      sess.run(distribution.unwrap(assign))
+      sess.run(distribution.experimental_local_results(assign))
       self.assertAllClose(
           [10 * 0.25 + 1. * (1 - 0.25), 11 * 0.25 + 2. * (1 - 0.25)],
           var.eval())
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index b05aac431f65b4281d9ed9c2fa95c210d55f4008..ce448840f14e3816f1d40328239256fd5acd51bf 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -37,12 +37,16 @@ except ImportError as _error:  # pylint: disable=invalid-name
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.estimator import run_config
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import server_lib
 
+
+original_run_std_server = dc._run_std_server  # pylint: disable=protected-access
+
 ASSIGNED_PORTS = set()
 lock = threading.Lock()
 
@@ -343,9 +347,9 @@ class MockOsEnv(collections.Mapping):
   def __iter__(self):
     if not hasattr(self._thread_local, 'dict'):
       self._thread_local.dict = dict()
-    for x in self._thread_local.dict.items():
+    for x in self._thread_local.dict:
       yield x
-    for x in self._dict.items():
+    for x in self._dict:
       yield x
 
   def __len__(self):
@@ -357,6 +361,22 @@ class MockOsEnv(collections.Mapping):
 class IndependentWorkerTestBase(test.TestCase):
   """Testing infra for independent workers."""
 
+  def _make_mock_run_std_server(self):
+    thread_local = threading.local()
+
+    def _mock_run_std_server(*args, **kwargs):
+      ret = original_run_std_server(*args, **kwargs)
+      # Wait for all std servers to be brought up in order to reduce the chance
+      # of remote sessions taking local ports that have been assigned to std
+      # servers. Only call this barrier the first time this function is run for
+      # each thread.
+      if not getattr(thread_local, 'server_started', False):
+        self._barrier.wait()
+      thread_local.server_started = True
+      return ret
+
+    return _mock_run_std_server
+
   def setUp(self):
     self._mock_os_env = MockOsEnv()
     self._mock_context = test.mock.patch.object(os, 'environ',
@@ -409,3 +429,25 @@ class IndependentWorkerTestBase(test.TestCase):
 
   def join_independent_workers(self, worker_threads):
     self._coord.join(worker_threads)
+
+
+def get_tf_config_task():
+  return json.loads(os.environ['TF_CONFIG'])['task']
+
+
+def get_tf_config_cluster_spec():
+  return json.loads(os.environ['TF_CONFIG'])['cluster']
+
+
+def get_task_type():
+  return get_tf_config_task()['type']
+
+
+def get_task_index():
+  return get_tf_config_task()['index']
+
+
+def is_chief():
+  return ('chief' not in get_tf_config_cluster_spec()
+          and get_task_type() == 'worker'
+          and get_task_index() == 0)
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 836cb7cc41b62352fd69a4a209d483ccf0fc498e..13a501394ee1fec2dfc1427f6d16d3a4624d7747 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -18,199 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import input_lib
-from tensorflow.python.distribute import numpy_dataset
-from tensorflow.python.distribute import values
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import one_device_strategy
 
-
-# TODO(josh11b): Replace asserts in this file with if ...: raise ...
-
-
-class OneDeviceStrategy(distribute_lib.DistributionStrategy):
-  """A distribution strategy for running on a single device."""
-  # TODO(josh11b): Do we wrap values in types to generate errors if you are
-  # doing something that won't work with other DistributionStrategy
-  # implementations?
-
-  def __init__(self, device):
-    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
-
-
-class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of OneDeviceStrategy."""
-
-  def __init__(self, container_strategy, device):
-    super(OneDeviceExtended, self).__init__(container_strategy)
-    self._device = device
-    self._default_device = device
-    self._input_device = device_util.canonicalize("/device:CPU:0")
-    worker_device_pairs = [(self._input_device, [self._device])]
-    device_map = values.SingleDeviceMap(device)
-    self._input_workers = input_lib.InputWorkers(
-        device_map, worker_device_pairs)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    colocate_with = kwargs.pop("colocate_with", None)
-    if colocate_with is None:
-      with ops.device(self._device):
-        return next_creator(*args, **kwargs)
-    with ops.colocate_with(colocate_with):
-      return next_creator(*args, **kwargs)
-
-  def _validate_colocate_with_variable(self, colocate_with_variable):
-    values.validate_colocate(colocate_with_variable, self)
-
-  def _make_dataset_iterator(self, dataset):
-    """Make iterator from dataset without splitting the batch."""
-    return input_lib.DatasetIterator(dataset, self._input_workers)
-
-  def _distribute_dataset(self, dataset_fn):
-    return input_lib.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._input_workers, 0)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    return input_lib.InputFunctionIterator(
-        input_fn, self._input_workers, [distribute_lib.InputContext()])
-
-  def _experimental_make_numpy_dataset(self, numpy_input, session):
-    return numpy_dataset.one_host_numpy_dataset(
-        numpy_input, self._input_device, session)
-
-  def _broadcast_to(self, tensor, destinations):
-    del destinations
-    return tensor
-
-  # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
-  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
-                                          initial_loop_values=None):
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-
-    ctx = input_lib.MultiStepContext()
-    def body(i, *args):
-      """A wrapper around `fn` to create the while loop body."""
-      del args
-      fn_result = fn(ctx, iterator.get_next())
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      with ops.control_dependencies([fn_result]):
-        return [i + 1] + flat_last_step_outputs
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop. This is useful in cases where we might need to exit
-    # these contexts and get back to the outer context to do some things, for
-    # e.g. create an op which should be evaluated only once at the end of the
-    # loop on the host. One such usage is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    # TODO(priyag): Use max_iterations instead of an explicit counter.
-    cond = lambda i, *args: i < iterations
-    i = constant_op.constant(0)
-    loop_result = control_flow_ops.while_loop(
-        cond, body, [i] + initial_loop_values, name="",
-        parallel_iterations=1, back_prop=False, swap_memory=False,
-        return_same_structure=True)
-    del self._outer_control_flow_context
-
-    ctx.run_op = control_flow_ops.group(loop_result)
-
-    # Convert the last_step_outputs from a list to the original dict structure
-    # of last_step_outputs.
-    last_step_tensor_outputs = loop_result[1:]
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-    return ctx
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    strategy = self._container_strategy()
-    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
-      return fn(*args, **kwargs)
-
-  def _reduce_to(self, reduce_op, value, destinations):
-    del reduce_op, destinations
-    return value
-
-  def _update(self, var, fn, args, kwargs, group):
-    # The implementations of _update() and _update_non_slot() are identical
-    # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
-
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    del colocate_with
-    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def read_var(self, replica_local_var):
-    """Read the aggregate value of a replica-local variable."""
-    return array_ops.identity(replica_local_var)
-
-  def _unwrap(self, value):
-    return (value,)
-
-  def value_container(self, value):
-    return value
-
-  @property
-  def _num_replicas_in_sync(self):
-    return 1
-
-  @property
-  def worker_devices(self):
-    return (self._device,)
-
-  @property
-  def parameter_devices(self):
-    return (self._device,)
-
-  def non_slot_devices(self, var_list):
-    del var_list
-    return (self._device,)
-
-  @property
-  def experimental_should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  # TODO(priyag): Delete this once all strategies use global batch size.
-  @property
-  def _global_batch_size(self):
-    """Global and per-replica batching are equivalent for OneDeviceStrategy."""
-    return True
-
-
-class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext for OneDeviceStrategy."""
-
-  def __init__(self, strategy):
-    zero = constant_op.constant(0, dtypes.int32)
-    distribute_lib.ReplicaContext.__init__(
-        self, strategy, replica_id_in_sync_group=zero)
-
-  @property
-  def devices(self):
-    return self._strategy.extended.worker_devices
+OneDeviceStrategy = one_device_strategy.OneDeviceStrategy
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index f81466a6c75f1cf287cdb00917872f77383c615e..e7eab1f7d772f66554a9d4646d97bffb0e0d578c 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -18,36 +18,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.eager import test
-from tensorflow.python.framework import test_util
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.one_device_strategy,
+        combinations.one_device_strategy_gpu],
+    mode=["eager", "graph"]))
 class OneDeviceStrategyTest(
     strategy_test_lib.DistributionTestBase,
     strategy_test_lib.OneDeviceDistributionTestBase):
 
-  def _get_distribution_strategy(self):
-    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
-  def testReplicaId(self):
-    self._test_replica_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMakeInputFnIterator(self):
-    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
     expected_values = [[i] for i in range(10)]
     input_fn = self._input_fn_to_test_input_context(
@@ -55,31 +54,49 @@ class OneDeviceStrategyTest(
         expected_num_replicas_in_sync=1,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    iterator = d.make_input_fn_iterator(input_fn)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(
+        iterator, distribution.extended.worker_devices, expected_values)
+
+  def testMakeInputFnIteratorWithCallable(self, distribution):
+    def fn():
+      dataset = dataset_ops.Dataset.range(10)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    expected_values = [[i] for i in range(10)]
+    input_fn = self._input_fn_to_test_input_context(
+        fn,
+        expected_num_replicas_in_sync=1,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(
-        iterator, d.extended.worker_devices, expected_values)
+        iterator, distribution.extended.worker_devices, expected_values,
+        test_reinitialize=False)
+
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testNumpyIterator(self):
-    self._test_numpy_iterator(self._get_distribution_strategy())
+  def testRun(self, distribution):
+    self._test_run(distribution)
 
-  def testAllReduceSum(self):
-    self._test_all_reduce_sum(self._get_distribution_strategy())
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
 
-  def testAllReduceSumGradients(self):
-    self._test_all_reduce_sum_gradients(self._get_distribution_strategy())
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
 
-  def testAllReduceSumGradientTape(self):
-    self._test_all_reduce_sum_gradient_tape(self._get_distribution_strategy())
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
 
-  def testAllReduceMean(self):
-    self._test_all_reduce_mean(self._get_distribution_strategy())
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
 
-  def testAllReduceMeanGradients(self):
-    self._test_all_reduce_mean_gradients(self._get_distribution_strategy())
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
 
-  def testAllReduceMeanGradientTape(self):
-    self._test_all_reduce_mean_gradient_tape(self._get_distribution_strategy())
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index e388061b17a9b92dedbbf9839049b13c8575a22c..01bb7fbedc645cf8892f83445d0635916fd4dcca 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -45,7 +45,7 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
       def run_step():
         return control_flow_ops.group(
-            distribution.unwrap(
+            distribution.experimental_local_results(
                 distribution.extended.call_for_each_replica(
                     model_fn, args=(iterator.get_next(),))))
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 0da269204fcf7d8217867ade7ab42a9690661a42..be863322256f7b5b93d91fa2e7ae1754b2494e3d 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -157,8 +157,8 @@ class ParameterServerExtended(CoreParameterServerExtended):
     cluster_resolver = SimpleClusterResolver(
         cluster_spec=tfconfig.cluster_spec(),
         task_type=tfconfig.task_type,
-        task_index=tfconfig.task_index,
-        num_accelerators=num_gpus_per_worker)
+        task_id=tfconfig.task_id,
+        num_accelerators={'GPU': num_gpus_per_worker})
     super(ParameterServerExtended, self).__init__(
         container_strategy, cluster_resolver=cluster_resolver)
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 9e7e201519e5c08adf3b89973d88244a3d0f07ab..9966f90a84bf986462a008468cd267f801990f73 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -90,12 +90,12 @@ def create_test_objects(cluster_spec=None,
       cluster_resolver = SimpleClusterResolver(
           cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
           task_type=task_type,
-          task_index=task_id,
-          num_accelerators=num_gpus)
+          task_id=task_id,
+          num_accelerators={'GPU': num_gpus})
       target = 'grpc://' + cluster_spec[WORKER][task_id]
     else:
       cluster_resolver = SimpleClusterResolver(
-          ClusterSpec({}), num_accelerators=num_gpus)
+          ClusterSpec({}), num_accelerators={'GPU': num_gpus})
       target = ''
 
     distribution = MockCoreParameterServerStrategy(cluster_resolver)
@@ -514,7 +514,7 @@ class ParameterServerStrategyTestBase(
       def update(v, g):
         return v.assign_sub(0.05 * g, use_locking=True)
 
-      one = d.broadcast(constant_op.constant([[1.]]))
+      one = constant_op.constant([[1.]])
 
       def step():
         """Perform one optimization step."""
@@ -571,6 +571,7 @@ class ParameterServerStrategyTestBase(
                               num_gpus,
                               input_fn,
                               expected_values,
+                              test_reinitialize=True,
                               use_core_strategy=False):
     distribution, master_target, config = self._get_test_objects(
         task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
@@ -594,13 +595,14 @@ class ParameterServerStrategyTestBase(
                   for r in range(len(devices))])
 
       # After re-initializing the iterator, should be able to iterate again.
-      sess.run(iterator.initialize())
+      if test_reinitialize:
+        sess.run(iterator.initialize())
 
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        computed_value = sess.run([values.select_replica(r, next_element)
-                                   for r in range(len(devices))])
-        self.assertEqual(expected_value, computed_value)
+        for expected_value in expected_values:
+          next_element = iterator.get_next()
+          computed_value = sess.run([values.select_replica(r, next_element)
+                                     for r in range(len(devices))])
+          self.assertEqual(expected_value, computed_value)
 
 
 class ParameterServerStrategyTest(
@@ -694,22 +696,31 @@ class ParameterServerStrategyTest(
   def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
     self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
       combinations.combine(
           mode=['graph'],
           num_gpus=[1, 2],
           required_gpus=1,
-          use_core_strategy=[True, False]))
-  def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy):
+          use_core_strategy=[True, False],
+          use_dataset=[True, False]))
+  def DISABLED_testMakeInputFnIteratorDistributed(
+      self, num_gpus, use_core_strategy, use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     expected_values = [[i+j for j in range(num_gpus)]
                        for i in range(0, 100, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
@@ -719,23 +730,33 @@ class ParameterServerStrategyTest(
         num_gpus,
         input_fn,
         expected_values,
+        test_reinitialize=use_dataset,
         use_core_strategy=use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   @combinations.generate(
       combinations.combine(
           mode=['graph'],
           num_gpus=[1, 2],
           required_gpus=1,
-          use_core_strategy=[True, False]))
-  def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy):
+          use_core_strategy=[True, False],
+          use_dataset=[True, False]))
+  def DISABLED_testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
+                                            use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     expected_values = [[i+j for j in range(num_gpus)]
                        for i in range(0, 100, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)  # only one worker and pipeline for local.
@@ -745,6 +766,7 @@ class ParameterServerStrategyTest(
         num_gpus,
         input_fn,
         expected_values,
+        test_reinitialize=use_dataset,
         use_core_strategy=use_core_strategy)
 
   @combinations.generate(
@@ -784,31 +806,37 @@ class ParameterServerStrategyTest(
     # Verify isolate_session_state
     self.assertTrue(new_config.isolate_session_state)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceSum(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
     self._test_all_reduce_sum(distribution)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceSumGradients(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
     self._test_all_reduce_sum_gradients(distribution)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceSumGradientTape(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
     self._test_all_reduce_sum_gradient_tape(distribution)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceMean(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
     self._test_all_reduce_mean(distribution)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceMeanGradients(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
     self._test_all_reduce_mean_gradients(distribution)
 
+  @combinations.generate(combinations.combine(required_gpus=[2]))
   def testAllReduceMeanGradientTape(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=2)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 2e2ee92b6e20471f367895ea53c0864bb3d1dae7..a562a1758d4b90b0f8d1f35a79e2c6415534fb33 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -18,9 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
@@ -33,12 +37,17 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
+from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+from tensorflow.python.util import nest
 
 
 class _TestException(Exception):
@@ -86,6 +95,20 @@ def _merge_call_merge_raises_fn():
   ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
 
+def _events_from_logdir(test_case, logdir):
+  """Reads summary events from log directory."""
+  test_case.assertTrue(gfile.Exists(logdir))
+  files = gfile.ListDirectory(logdir)
+  test_case.assertLen(files, 1)
+  records = list(tf_record.tf_record_iterator(os.path.join(logdir, files[0])))
+  result = []
+  for r in records:
+    event = event_pb2.Event()
+    event.ParseFromString(r)
+    result.append(event)
+  return result
+
+
 class DistributionTestBase(test.TestCase):
   """Some tests that should work with any DistributionStrategy."""
 
@@ -94,9 +117,6 @@ class DistributionTestBase(test.TestCase):
       l = core.Dense(1, use_bias=False)
 
       def loss(x):
-        # TODO(josh11b): What if this constant was instead a captured
-        # value?  Would it need to be a value that has been passed
-        # through d.broadcast()?
         y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
         return y * y
       # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
@@ -107,7 +127,7 @@ class DistributionTestBase(test.TestCase):
       def update(v, g):
         return v.assign_sub(0.2 * g)
 
-      one = d.broadcast(constant_op.constant([[1.]]))
+      one = constant_op.constant([[1.]])
 
       def step():
         """Perform one optimization step."""
@@ -152,9 +172,6 @@ class DistributionTestBase(test.TestCase):
       l = core.Dense(1, use_bias=False)
 
       def loss(x):
-        # TODO(josh11b): What if this constant was instead a captured
-        # value?  Would it need to be a value that has been passed
-        # through d.broadcast()?
         y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
         return y * y
 
@@ -163,7 +180,7 @@ class DistributionTestBase(test.TestCase):
       def update(v, g):
         return v.assign_sub(learning_rate * g)
 
-      one = d.broadcast(constant_op.constant([[1.]]))
+      one = constant_op.constant([[1.]])
 
       def step():
         """Perform one optimization step."""
@@ -197,6 +214,39 @@ class DistributionTestBase(test.TestCase):
       # Error should go down
       self.assertLess(error_after, error_before)
 
+  def _test_summary_for_replica_zero_only(self, d):
+    logdir = tempfile.mkdtemp()
+
+    def run_fn():
+      """Function executed for each replica."""
+      with summary_writer.as_default():
+        replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+        return summary_ops.scalar("a", replica_id)
+
+    with self.cached_session() as sess, d.scope(), \
+        summary_ops.always_record_summaries():
+      # We need global_step because summary writing op *always* has global_step
+      # as input, even when we always record summary or never record summary.
+      global_step = training_util.get_or_create_global_step()
+      if not context.executing_eagerly():
+        # When executing eagerly, variables are initialized immediately after
+        # creation, and its initializer will be None.
+        global_step.initializer.run()
+      summary_writer = summary_ops.create_file_writer(logdir)
+      output = d.extended.call_for_each_replica(run_fn)
+      unwrapped = d.unwrap(output)
+      if not context.executing_eagerly():
+        sess.run(summary_writer.init())
+        sess.run(unwrapped)
+        sess.run(summary_writer.close())
+
+      events = _events_from_logdir(self, logdir)
+      # There will be 2 entries: 1 summary file header entry, and 1 entry
+      # written by replica 0.
+      self.assertLen(events, 2)
+      self.assertEqual(events[1].summary.value[0].tag, "a")
+      self.assertEqual(events[1].summary.value[0].simple_value, 0.0)
+
   def _test_replica_id(self, d):
     with d.scope():
       expected_devices = [False] * len(d.extended.worker_devices)
@@ -224,7 +274,7 @@ class DistributionTestBase(test.TestCase):
         dist.extended.call_for_each_replica(_merge_call_merge_raises_fn)
 
   def _input_fn_to_test_input_context(self,
-                                      dataset_fn,
+                                      dataset_or_callable_fn,
                                       expected_num_replicas_in_sync,
                                       expected_num_input_pipelines,
                                       expected_input_pipeline_id):
@@ -248,12 +298,12 @@ class DistributionTestBase(test.TestCase):
         self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id)
         worker_id_counter[0] += 1
 
-      return dataset_fn()
+      return dataset_or_callable_fn()
 
     return _input_fn
 
   def _test_input_fn_iterator(self, iterator, devices, expected_values,
-                              sess=None):
+                              sess=None, test_reinitialize=True):
     evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
     evaluate(iterator.initialize())
 
@@ -269,13 +319,14 @@ class DistributionTestBase(test.TestCase):
           [values.select_replica(r, next_element) for r in range(len(devices))])
 
     # After re-initializing the iterator, should be able to iterate again.
-    evaluate(iterator.initialize())
+    if test_reinitialize:
+      evaluate(iterator.initialize())
 
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_replica(r, next_element) for r in range(len(devices))])
-      self.assertEqual(expected_value, computed_value)
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = evaluate([values.select_replica(r, next_element)
+                                   for r in range(len(devices))])
+        self.assertEqual(expected_value, computed_value)
 
   def _test_global_step_update(self, strategy):
     with strategy.scope():
@@ -295,7 +346,7 @@ class DistributionTestBase(test.TestCase):
 
       train_ops, value = strategy.extended.call_for_each_replica(model_fn)
       self.evaluate(strategy.group(train_ops))
-      global_step_tensors = strategy.unwrap(value)
+      global_step_tensors = strategy.experimental_local_results(value)
       global_step_values = self.evaluate(global_step_tensors)
       self.assertEqual((1,) * len(global_step_tensors), global_step_values)
 
@@ -314,7 +365,8 @@ class DistributionTestBase(test.TestCase):
 
       def run_and_concatenate(strategy, i):
         x, y = strategy.experimental_run(lambda z: z, i)
-        x, y = self.evaluate((strategy.unwrap(x), strategy.unwrap(y)))
+        x, y = self.evaluate((strategy.experimental_local_results(x),
+                              strategy.experimental_local_results(y)))
         return np.concatenate(x), np.concatenate(y)
 
       x_1, y_1 = run_and_concatenate(strategy, i)
@@ -330,6 +382,19 @@ class DistributionTestBase(test.TestCase):
 class OneDeviceDistributionTestBase(test.TestCase):
   """Some tests that should work with any one-device DistributionStrategy."""
 
+  def _test_run(self, strategy):
+    out1 = strategy.experimental_run_v2(lambda: constant_op.constant(4.))
+    self.assertAllEqual([4.], self.evaluate(strategy.unwrap(out1)))
+
+    out2 = strategy.experimental_run_v2(
+        lambda x: {"a": x * 2, "b": x * x}, args=(out1,))
+    out2_vals = self.evaluate(nest.map_structure(strategy.unwrap, out2))
+    self.assertAllEqual([8.], out2_vals["a"])
+    self.assertAllEqual([16.], out2_vals["b"])
+
+    out3 = strategy.experimental_run_v2(lambda b, a: a + 2 * b + 2, kwargs=out2)
+    self.assertAllEqual([42.], self.evaluate(strategy.unwrap(out3)))
+
   def _test_all_reduce_sum(self, strategy):
     self._test_collective_comms(
         strategy, _all_sum, inputs=(4., [42., 43.]), expected=(4., [42., 43.]))
@@ -360,7 +425,8 @@ class OneDeviceDistributionTestBase(test.TestCase):
 
     self.evaluate(inputs.initialize())
     outputs = self.evaluate(
-        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+        list(map(strategy.experimental_local_results,
+                 strategy.experimental_run(comm_fn, inputs))))
     self.assertAllEqual([expected[0]], outputs[0])
     self.assertAllEqual([expected[1]], outputs[1])
 
@@ -380,7 +446,8 @@ class OneDeviceDistributionTestBase(test.TestCase):
     self.evaluate(inputs.initialize())
     self.assertAllEqual(
         expected_grads,
-        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+        self.evaluate(strategy.experimental_local_results(
+            strategy.experimental_run(step, inputs))))
 
   def _test_collective_comms_gradient_tape(
       self, strategy, comm_fn, inputs, expected_grads):
@@ -397,12 +464,27 @@ class OneDeviceDistributionTestBase(test.TestCase):
     self.evaluate(inputs.initialize())
     self.assertAllEqual(
         expected_grads,
-        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+        self.evaluate(strategy.experimental_local_results(
+            strategy.experimental_run(step, inputs))))
 
 
 class TwoDeviceDistributionTestBase(test.TestCase):
   """Some tests that should work with any two-device DistributionStrategy."""
 
+  def _test_run(self, strategy):
+    out1 = strategy.experimental_run_v2(
+        lambda: ds_context.get_replica_context().replica_id_in_sync_group + 1)
+    self.assertAllEqual([1, 2], self.evaluate(strategy.unwrap(out1)))
+
+    out2 = strategy.experimental_run_v2(
+        lambda x: {"a": x * 2, "b": x * x}, args=(out1,))
+    out2_vals = self.evaluate(nest.map_structure(strategy.unwrap, out2))
+    self.assertAllEqual([2, 4], out2_vals["a"])
+    self.assertAllEqual([1, 4], out2_vals["b"])
+
+    out3 = strategy.experimental_run_v2(lambda b, a: a + 2 * b + 2, kwargs=out2)
+    self.assertAllEqual([6, 14], self.evaluate(strategy.unwrap(out3)))
+
   def _test_all_reduce_sum(self, strategy):
     self._test_collective_comms(
         strategy, _all_sum,
@@ -437,7 +519,8 @@ class TwoDeviceDistributionTestBase(test.TestCase):
 
     self.evaluate(inputs.initialize())
     outputs = self.evaluate(
-        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+        list(map(strategy.experimental_local_results,
+                 strategy.experimental_run(comm_fn, inputs))))
     self.assertAllEqual([expected[0], expected[0]], outputs[0])
     self.assertAllEqual([expected[1], expected[1]], outputs[1])
 
@@ -457,7 +540,8 @@ class TwoDeviceDistributionTestBase(test.TestCase):
     self.evaluate(inputs.initialize())
     self.assertAllEqual(
         expected_grads,
-        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+        self.evaluate(strategy.experimental_local_results(
+            strategy.experimental_run(step, inputs))))
 
   def _test_collective_comms_gradient_tape(
       self, strategy, comm_fn, inputs, expected_grads):
@@ -474,7 +558,8 @@ class TwoDeviceDistributionTestBase(test.TestCase):
     self.evaluate(inputs.initialize())
     self.assertAllEqual(
         expected_grads,
-        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+        self.evaluate(strategy.experimental_local_results(
+            strategy.experimental_run(step, inputs))))
 
 
 def _all_sum(value):
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index f55e6f0b2088f24e74400d330951a0471425f0eb..04e0af767bfaf94ed6a53ba9f8ed71ae4f9cdc4a 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,733 +21,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-import functools
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
-from tensorflow.contrib.tpu.python.tpu import topology
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.contrib.tpu.python.tpu import training_loop
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import input_lib
-from tensorflow.python.distribute import numpy_dataset
-from tensorflow.python.distribute import reduce_util
-from tensorflow.python.distribute import values
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import nest
-
-
-def initialize_tpu_system(cluster_resolver=None):
-  """Initialize the TPU devices in a separate session and graph.
-
-  Args:
-    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
-        which provides information about the TPU cluster.
-  Returns:
-    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
-  """
-  if cluster_resolver is None:
-    cluster_resolver = resolver_lib.TPUClusterResolver("")
-  master = cluster_resolver.master()
-
-  logging.info("Initializing the TPU system.")
-  session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-
-  with ops.Graph().as_default():
-    with session_lib.Session(config=session_config, target=master) as sess:
-      serialized_topology = sess.run(tpu.initialize_system())
-  logging.info("Finished initializing TPU system.")
-  return topology.Topology(serialized=serialized_topology)
-
-
-def get_tpu_system_metadata(tpu_cluster_resolver):
-  """Retrieves TPU system metadata given a TPUClusterResolver."""
-  master = tpu_cluster_resolver.master()
-
-  # pylint: disable=protected-access
-  cluster_spec = tpu_cluster_resolver.cluster_spec()
-  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
-  tpu_system_metadata = (
-      tpu_system_metadata_lib._query_tpu_system_metadata(
-          master,
-          cluster_def=cluster_def,
-          query_topology=False))
-
-  return tpu_system_metadata
-
-
-# TODO(jhseu): Deduplicate with MirroredStrategy?
-def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
-    strategy, device_map, logical_device, real_mirrored_creator,
-    *args, **kwargs):
-  # Figure out what collections this variable should be added to.
-  # We'll add the TPUMirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-  kwargs["collections"] = []
-
-  # TODO(jhseu): Should we have different behavior for different
-  # synchronization settings?
-
-  # Get aggregation value
-  # TODO(jhseu): Support aggregation in a replica context.
-  aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
-  if aggregation not in [
-      vs.VariableAggregation.NONE,
-      vs.VariableAggregation.SUM,
-      vs.VariableAggregation.MEAN,
-      vs.VariableAggregation.ONLY_FIRST_REPLICA,
-  ]:
-    raise ValueError("Invalid variable aggregation mode: {} for variable: {}"
-                     .format(aggregation, kwargs["name"]))
-
-  # Ignore user-specified caching device, not needed for mirrored variables.
-  kwargs.pop("caching_device", None)
-
-  # TODO(josh11b,apassos): It would be better if variable initialization
-  # was never recorded on the tape instead of having to do this manually
-  # here.
-  with tape.stop_recording():
-    devices = device_map.logical_to_actual_devices(logical_device)
-    value_list = real_mirrored_creator(devices, *args, **kwargs)
-    result = values.TPUMirroredVariable(
-        strategy, device_map, value_list, aggregation,
-        logical_device=logical_device)
-
-  if not context.executing_eagerly():
-    g = ops.get_default_graph()
-    # If "trainable" is True, next_creator() will add the member variables
-    # to the TRAINABLE_VARIABLES collection, so we manually remove
-    # them and replace with the MirroredVariable. We can't set
-    # "trainable" to False for next_creator() since that causes functions
-    # like implicit_gradients to skip those variables.
-    if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in value_list:
-        l.remove(v)
-    g.add_to_collections(collections, result)
-  return result
-
-
-class TPUStrategy(distribute_lib.DistributionStrategy):
-  """TPU distribution strategy implementation."""
-
-  def __init__(self,
-               tpu_cluster_resolver=None,
-               steps_per_run=None,
-               device_assignment=None,
-               **kwargs):
-    """Initializes the TPUStrategy object.
-
-    Args:
-      tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
-          which provides information about the TPU cluster.
-      steps_per_run: Number of steps to run on device before returning to the
-          host. Note that this can have side-effects on performance, hooks,
-          metrics, summaries etc.
-          This parameter is only used when Distribution Strategy is used with
-          estimator or keras.
-      device_assignment: Optional `tf.contrib.tpu.DeviceAssignment` to specify
-          the placement of replicas on the TPU cluster. Currently only supports
-          the usecase of using a single core within a TPU cluster.
-      **kwargs: Additional experimental flags. Will be removed in future.
-    """
-    super(TPUStrategy, self).__init__(TPUExtended(
-        self, tpu_cluster_resolver, steps_per_run, device_assignment))
-
-    self._disable_training_loop_on_host = False
-    if len(kwargs) > 1:
-      raise ValueError("TPUStrategy constructor only takes one experimental "
-                       "flag now")
-    if len(kwargs) == 1:
-      if "_disable_training_loop_on_host" not in kwargs:
-        raise ValueError("TPUStrategy constructor does not support arguments: "
-                         "{}".format(kwargs))
-      self._disable_training_loop_on_host = (
-          kwargs["_disable_training_loop_on_host"])
-
-  @property
-  def steps_per_run(self):
-    """DEPRECATED: use .extended.steps_per_run instead."""
-    return self._extended.steps_per_run
-
-
-class TPUExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of TPUStrategy."""
-
-  def __init__(self,
-               container_strategy,
-               tpu_cluster_resolver=None,
-               steps_per_run=None,
-               device_assignment=None):
-    super(TPUExtended, self).__init__(container_strategy)
-
-    if tpu_cluster_resolver is None:
-      tpu_cluster_resolver = resolver_lib.TPUClusterResolver("")
-
-    if steps_per_run is None:
-      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
-      # not specified.
-      steps_per_run = 1
-
-    self._tpu_cluster_resolver = tpu_cluster_resolver
-    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
-    self._device_assignment = device_assignment
-
-    # Device assignment is currently only supported for 1 core case.
-    if self._device_assignment:
-      assert isinstance(self._device_assignment,
-                        device_assignment_lib.DeviceAssignment)
-      if self._device_assignment.num_replicas != 1:
-        raise ValueError("Device assignment is only supported for a single "
-                         "core single replica case currently.")
-      if self._device_assignment.num_cores_per_replica != 1:
-        raise ValueError("Device assignment is only supported for a single "
-                         "core single replica case currently.")
-      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
-        raise ValueError("Device assignment is only supported for a single "
-                         "core single replica case currently.")
-
-    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
-    # parallelism.
-    self._device_index = {
-        d.name: i for i, d in enumerate(self._tpu_metadata.devices)
-        if "device:TPU:" in d.name
-    }
-    self._host_device = self.get_host_cpu_device(0)
-    self._tpu_devices = tuple(sorted(self._device_index.keys()))
-    # Only create variables for the number of replicas we're running.
-    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
-    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)
-
-    # For input:
-    input_device_map = values.ReplicaDeviceMap(tuple(
-        self.get_host_cpu_device(hid) for hid in range(self.num_hosts)))
-    worker_devices = [
-        (self.get_host(hid), [self.get_host_cpu_device(hid)])
-        for hid in range(self.num_hosts)
-    ]
-    self._input_workers = input_lib.InputWorkers(
-        input_device_map, worker_devices)
-
-    # TODO(sourabhbajaj): Remove this once performance of running one step
-    # at a time is comparable to multiple steps.
-    self.steps_per_run = steps_per_run
-    self._require_static_shapes = True
-
-  def _validate_colocate_with_variable(self, colocate_with_variable):
-    values.validate_colocate_tpu_variable(colocate_with_variable, self)
-
-  def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
-                               input_shapes, iterations):
-    """Create an enqueue op for a single host identified using host_id.
-
-    The while_loop op returned will run `iterations` times and in each run
-    enqueue batches for each shard.
-
-    Args:
-      host_id: integer, id of the host to run the enqueue ops on.
-      multi_worker_iterator: MultiWorkerDataIterator to read the input data.
-      input_shapes: shape of inputs to be enqueue on the queue. This is same as
-        the value of `nest.flatten(iterator.output_shapes)`.
-      iterations: integer, number of iterations to be run; determines the
-        number of batches to be enqueued.
-
-    Returns:
-      while_loop_op running `iterations` times; in each run we enqueue a batch
-      on the infeed queue from the host with id `host_id` for each device shard.
-    """
-    host = self.get_host_cpu_device(host_id)
-    # TODO(sourabhbajaj): Possibly make changes to MultiWorkerDataset
-    # to work with TPU Prefetch so clean up this code.
-    iterator = (
-        multi_worker_iterator.get_iterator(self.get_host(host_id))._iterator)  # pylint: disable=protected-access
-
-    def _infeed_enqueue_ops_fn():
-      """Enqueue ops for one iteration."""
-      control_deps = []
-      sharded_inputs = []
-      enqueue_ops = []
-
-      with ops.device(host):
-        for _ in range(self.num_replicas_per_host):
-          # Use control dependencies to ensure a deterministic ordering.
-          with ops.control_dependencies(control_deps):
-            inputs = nest.flatten(iterator.get_next())
-            control_deps.extend(inputs)
-            sharded_inputs.append(inputs)
-
-      for core_id, shard_input in enumerate(sharded_inputs):
-        enqueue_ops.append(
-            tpu_ops.infeed_enqueue_tuple(
-                inputs=shard_input,
-                shapes=input_shapes,
-                device_ordinal=core_id))
-      return enqueue_ops
-
-    def enqueue_ops_loop_body(i):
-      """Callable for the loop body of the while_loop instantiated below."""
-      with ops.control_dependencies(_infeed_enqueue_ops_fn()):
-        return i + 1
-
-    with ops.device(host):
-      enqueue_op_per_host = control_flow_ops.while_loop(
-          lambda i: i < iterations,
-          enqueue_ops_loop_body,
-          [constant_op.constant(0)],
-          parallel_iterations=1)
-
-    return enqueue_op_per_host
-
-  def _make_dataset_iterator(self, dataset):
-    """Make iterators for each of the TPU hosts."""
-    return input_lib.DatasetIterator(dataset, self._input_workers,
-                                     self._num_replicas_in_sync)
-
-  def _distribute_dataset(self, dataset_fn):
-    return input_lib.MultiWorkerDataset(
-        functools.partial(self._call_dataset_fn, dataset_fn),
-        self._input_workers)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    input_contexts = []
-    num_workers = self._input_workers.num_workers
-    for i in range(num_workers):
-      input_contexts.append(distribute_lib.InputContext(
-          num_input_pipelines=num_workers,
-          input_pipeline_id=i,
-          num_replicas_in_sync=self._num_replicas_in_sync))
-    return input_lib.InputFunctionIterator(
-        input_fn, self._input_workers, input_contexts)
-
-  def _experimental_make_numpy_dataset(self, numpy_input, session):
-    return numpy_dataset.one_host_numpy_dataset(
-        numpy_input, numpy_dataset.SingleDevice(self.get_host_cpu_device(0)),
-        session)
-
-  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
-  # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
-  # a mechanism to infer the outputs of `fn`. Pending b/110550782.
-  def _experimental_run_steps_on_iterator(
-      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
-    output_shapes = multi_worker_iterator.output_shapes
-    shapes = nest.flatten(output_shapes)
-    if any(not s.is_fully_defined() for s in shapes):
-      raise ValueError(
-          "TPU currently requires fully defined shapes. Either use "
-          "set_shape() on the input tensors or use "
-          "dataset.batch(..., drop_remainder=True).")
-    types = nest.flatten(multi_worker_iterator.output_types)
-
-    enqueue_ops = [
-        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
-                                      iterations)
-        for host_id in range(self.num_hosts)]
-
-    def dequeue_fn():
-      dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
-      return nest.pack_sequence_as(output_shapes, dequeued)
-
-    # Wrap `fn` for repeat.
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-    ctx = input_lib.MultiStepContext()
-
-    def run_fn(*args, **kwargs):
-      """Single step on the TPU device."""
-      del args, kwargs
-      fn_result = fn(ctx, dequeue_fn())
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      if flat_last_step_outputs:
-        with ops.control_dependencies([fn_result]):
-          return [array_ops.identity(f) for f in flat_last_step_outputs]
-      else:
-        return fn_result
-
-    def iterate_on_tpu():
-      return training_loop.repeat(iterations, run_fn, initial_loop_values)
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop and TPU replicate context. This is useful in cases
-    # where we might need to exit these contexts and get back to the outer
-    # context to do some things, for e.g. create an op which should be
-    # evaluated only once at the end of the loop on the host. One such usage
-    # is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    # pylint: disable=protected-access
-    if self._container_strategy()._disable_training_loop_on_host:
-      replicate_inputs = [[]] * self._num_replicas_in_sync
-      replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
-    else:
-      def rewrite_fn(*args):
-        """The rewritten step fn running on TPU."""
-        del args
-        replicate_inputs = [[]] * self._num_replicas_in_sync
-        replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
-
-        # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
-        # will flatten it in this case. If run_fn has no tensor outputs,
-        # tpu.replicate returns a list of no_ops, we will keep the output as it
-        # is.
-        if isinstance(replicate_outputs[0], list):
-          replicate_outputs = nest.flatten(replicate_outputs)
-
-        return replicate_outputs
-
-      # TODO(sourabhbajaj): The input to while loop should be based on the
-      # output type of the step_fn
-      assert isinstance(initial_loop_values, list)
-      initial_loop_values = initial_loop_values * self._num_replicas_in_sync
-
-      # Put the while loop op on host 0.
-      with ops.device(self.get_host_cpu_device(0)):
-        replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
-                                                 initial_loop_values)
-
-    del self._outer_control_flow_context
-    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
-
-    if self._container_strategy()._disable_training_loop_on_host:
-      # Filter out any ops from the outputs, typically this would be the case
-      # when there were no tensor outputs.
-      last_step_tensor_outputs = [x for x in replicate_outputs
-                                  if not isinstance(x, ops.Operation)]
-
-      # Outputs are currently of the structure (grouped by device)
-      # [[output0_device0, output1_device0, output2_device0],
-      #  [output0_device1, output1_device1, output2_device1]]
-      # Convert this to the following structure instead: (grouped by output)
-      # [[output0_device0, output0_device1],
-      #  [output1_device0, output1_device1],
-      #  [output2_device0, output2_device1]]
-      last_step_tensor_outputs = [list(x) for x in
-                                  zip(*last_step_tensor_outputs)]
-    else:
-      if isinstance(replicate_outputs, list):
-        # Filter out any ops from the outputs, typically this would be the case
-        # when there were no tensor outputs.
-        last_step_tensor_outputs = [
-            x for x in replicate_outputs if not isinstance(x, ops.Operation)
-        ]
-
-        # Outputs are currently of the structure (flattened)
-        # [output0_device0, output1_device0, output2_device0,
-        #  output0_device1, output1_device1, output2_device1,
-        #  ...]
-        # Convert this to the following structure instead: (grouped by output)
-        # [[output0_device0, output0_device1],
-        #  [output1_device0, output1_device1],
-        #  [output2_device0, output2_device1]]
-        output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
-        last_step_tensor_outputs = [
-            last_step_tensor_outputs[i::output_num] for i in range(output_num)
-        ]
-      else:
-        # no tensors returned.
-        last_step_tensor_outputs = []
-
-    # Convert replicate_outputs to the original dict structure of
-    # last_step_outputs.
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been reduced, take the first value
-      # from the list as each value should be the same. Else return the full
-      # list of values.
-      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
-      # value.
-      if reduce_op is not None:
-        # TODO(priyag): Should this return the element or a list with 1 element
-        last_step_tensor_outputs_dict[name] = output[0]
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-
-    return ctx
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    # TODO(jhseu): Consider making it so call_for_each_replica implies that
-    # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
-    with _TPUReplicaContext(self._container_strategy()):
-      return fn(*args, **kwargs)
-
-  def _experimental_initialize_system(self):
-    """Experimental method added to be used by Estimator.
-
-    This is a private method only to be used by Estimator. Other frameworks
-    should directly be calling `tf.contrib.distribute.initialize_tpu_system`
-    """
-    initialize_tpu_system(self._tpu_cluster_resolver)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
-    colocate_with = kwargs.pop("colocate_with", None)
-    if colocate_with is None:
-      device_map = self._device_map
-      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
-    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
-      with ops.device(colocate_with.device):
-        return next_creator(*args, **kwargs)
-    else:
-      device_map = colocate_with.device_map
-      logical_device = colocate_with.logical_device
-
-    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      value_list = []
-      for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
-            # Give replicas meaningful distinct names:
-            var0name = value_list[0].name.split(":")[0]
-            # We append a / to variable names created on replicas with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-            # Initialize replicas with the same value:
-            if context.executing_eagerly():
-              kwargs["initial_value"] = array_ops.identity(
-                  value_list[0].value())
-            else:
-              def initial_value_fn(device=d):
-                with ops.device(device):
-                  return array_ops.identity(value_list[0].initial_value)
-              kwargs["initial_value"] = initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            v = next_creator(*args, **kwargs)
-          assert not isinstance(v, values.TPUMirroredVariable)
-          value_list.append(v)
-      return value_list
-
-    return _create_tpu_mirrored_variable(
-        self._container_strategy(), device_map, logical_device,
-        _real_mirrored_creator, *args, **kwargs)
-
-  def _reduce_to(self, reduce_op, value, destinations):
-    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if reduce_op == reduce_util.ReduceOp.MEAN:
-        # TODO(jhseu):  Revisit once we support model-parallelism.
-        value *= (1. / self._num_replicas_in_sync)
-      elif reduce_op != reduce_util.ReduceOp.SUM:
-        raise NotImplementedError(
-            "Currently only support sum & mean in TPUStrategy.")
-      return tpu_ops.cross_replica_sum(value)
-
-    if not isinstance(value, values.DistributedValues):
-      # This function handles reducing values that are not PerReplica or
-      # Mirrored values. For example, the same value could be present on all
-      # replicas in which case `value` would be a single value or value could
-      # be 0.
-      return cross_device_ops_lib.reduce_non_distributed_value(
-          reduce_op, self._device_map, value, destinations)
-
-    # Validate that the destination is same as the host device
-    # Note we don't do this when in replicate context as the reduction is
-    # performed on the TPU device itself.
-    devices = cross_device_ops_lib.get_devices_from(destinations)
-    if len(devices) == 1:
-      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
-          self._host_device)
-    else:
-      raise ValueError("Multiple devices are not supported for TPUStrategy")
-
-    output = math_ops.add_n(value)
-    if reduce_op == reduce_util.ReduceOp.MEAN:
-      return output * (1. / len(value))
-    return output
-
-  def _update(self, var, fn, args, kwargs, group):
-    assert isinstance(var, values.TPUMirroredVariable)
-    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if group:
-        return fn(var, *args, **kwargs)
-      else:
-        return (fn(var, *args, **kwargs),)
-
-    # Otherwise, we revert to MirroredStrategy behavior and update each variable
-    # directly.
-    updates = []
-    for i, (d, v) in enumerate(zip(var.devices, var.values)):
-      name = "update_%d" % i
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        # If args and kwargs are not mirrored, the value is returned as is.
-        updates.append(fn(v,
-                          *values.select_device_mirrored(d, args),
-                          **values.select_device_mirrored(d, kwargs)))
-    return values.update_regroup(self, self._device_map, updates, group)
-
-  def read_var(self, var):
-    assert isinstance(var, values.TPUMirroredVariable)
-    return var.read_value()
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      return tuple(val.get(device=d) for d in sorted(val.devices))
-    elif isinstance(val, list):
-      # TODO(josh11b): We need to remove this case; per device values should
-      # be represented using a PerReplica wrapper instead of a list with
-      # one entry per device.
-      return tuple(val)
-    elif isinstance(val, values.TPUMirroredVariable):
-      # pylint: disable=protected-access
-      if values._enclosing_tpu_context() is not None:
-        return (val,)
-      return val.values
-    return (val,)
-
-  def value_container(self, value):
-    return value
-
-  def _broadcast_to(self, tensor, destinations):
-    del destinations
-    return tensor
-
-  @property
-  def num_hosts(self):
-    if self._device_assignment is None:
-      return self._tpu_metadata.num_hosts
-
-    return len(set([self._device_assignment.host_device(r)
-                    for r in range(self._device_assignment.num_replicas)]))
-
-  @property
-  def num_replicas_per_host(self):
-    if self._device_assignment is None:
-      return self._tpu_metadata.num_of_cores_per_host
-
-    # TODO(sourabhbajaj): Remove this method we use inputs and remove infeed
-    # as the computation of num_replicas_per_host is not a constant
-    # when using device_assignment. This is a temporary workaround to support
-    # StatefulRNN as everything is 1 in that case.
-    # This method needs to take host_id as input for correct computation.
-    max_models_per_host = (self._tpu_metadata.num_of_cores_per_host //
-                           self._device_assignment.num_cores_per_replica)
-    models_per_host = min(self._device_assignment.num_replicas,
-                          max_models_per_host)
-    return models_per_host * self._device_assignment.num_cores_per_replica
-
-  @property
-  def _num_replicas_in_sync(self):
-    if self._device_assignment is None:
-      return self._tpu_metadata.num_cores
-    return (self._device_assignment.num_replicas *
-            self._device_assignment.num_cores_per_replica)
-
-  @property
-  def experimental_between_graph(self):
-    return False
-
-  @property
-  def experimental_should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  @property
-  def worker_devices(self):
-    return self._tpu_devices
-
-  @property
-  def parameter_devices(self):
-    return self._tpu_devices
-
-  def non_slot_devices(self, var_list):
-    return self._host_device
-
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    del colocate_with
-    with ops.device(self._host_device), distribute_lib.UpdateContext(
-        self._host_device):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def get_host(self, host_id):
-    if self._tpu_cluster_resolver.get_master() in ("", "local"):
-      return "/replica:0/task:0"
-    job_name = self._tpu_cluster_resolver.get_job_name() or "tpu_worker"
-    return "/job:%s/task:%d" % (job_name, host_id)
-
-  def get_host_cpu_device(self, host_id):
-    return self.get_host(host_id) + "/device:CPU:0"
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    del cluster_spec, task_type, task_id
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    updated_config.isolate_session_state = True
-    cluster_spec = self._tpu_cluster_resolver.cluster_spec()
-    if cluster_spec:
-      updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
-    return updated_config
-
-  # TODO(priyag): Delete this once all strategies use global batch size.
-  @property
-  def _global_batch_size(self):
-    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
-
-    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
-    batching.
-
-    Returns:
-      Boolean.
-    """
-    return True
-
-
-class _TPUReplicaContext(distribute_lib.ReplicaContext):
-  """Replication Context class for TPU Strategy."""
-
-  # TODO(sourabhbajaj): Call for each replica should be updating this.
-  def __init__(self, strategy):
-    # TODO(b/118385803): properly initialize replica_id, instead of always 0
-    replica_id = constant_op.constant(0, dtypes.int32)
-    distribute_lib.ReplicaContext.__init__(
-        self, strategy, replica_id_in_sync_group=replica_id)
-
-  @property
-  def devices(self):
-    distribute_lib.require_replica_context(self)
-    ds = self._strategy
-    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
-    return (ds.extended.worker_devices[replica_id],)
+# pylint: disable=unused-import
+from tensorflow.python.distribute.tpu_strategy import TPUStrategy
+from tensorflow.python.tpu.tpu_strategy_util import initialize_tpu_system
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 51c58b0b2f3dc2ab63e22718825a471b8657f892..101c76ed0af4f178a98204b661f8f0a0f62a52fb 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -511,6 +511,34 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       sess.run({"complicated": mirrored})
 
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_cpu,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.tpu_strategy,
+      ],
+      mode=["graph", "eager"]))
+  def testAssignOutOfScope_mirrored(self, distribution):
+    with distribution.scope():
+      mirrored = variables_lib.Variable(1.)
+    if not isinstance(mirrored, values.MirroredVariable):
+      self.assertIsInstance(mirrored, values.TPUMirroredVariable)
+    self.evaluate(mirrored.assign(3.))
+    self.assertEqual(self.evaluate(mirrored.read_value()), 3.)
+    for component in mirrored.values:
+      self.assertEqual(self.evaluate(component.read_value()), 3.)
+
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.parameter_server_strategy_with_two_gpus],
+      mode=["graph", "eager"]))
+  def testAssignOutOfScope_aggregating(self, distribution):
+    with distribution.scope():
+      aggregating = variables_lib.Variable(1.)
+    self.assertIsInstance(aggregating, values.AggregatingVariable)
+    self.evaluate(aggregating.assign(3.))
+    self.assertEqual(self.evaluate(aggregating.read_value()), 3.)
+    self.assertEqual(self.evaluate(aggregating._v.read_value()), 3.)
 
 _devices = ("/device:GPU:0", "/device:CPU:0")
 
@@ -522,11 +550,11 @@ def _make_replica_local(method, strategy=None):
     with ops.device(d):
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
-  replica_local = values.ReplicaLocalVariable(strategy, device_map, v, method)
+  replica_local = values.SyncOnReadVariable(strategy, device_map, v, method)
   return v, replica_local
 
 
-class ReplicaLocalVariablePropertiesTest(test.TestCase):
+class SyncOnReadVariablePropertiesTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -549,7 +577,7 @@ class ReplicaLocalVariablePropertiesTest(test.TestCase):
     v = variable_scope.get_variable(
         name="v", initializer=[1.], use_resource=True)
     device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
-    replica_local = values.ReplicaLocalVariable(
+    replica_local = values.SyncOnReadVariable(
         None, device_map, (v,), variable_scope.VariableAggregation.MEAN)
 
     self.assertEqual(v.name, replica_local.name)
@@ -577,7 +605,7 @@ class ReplicaLocalVariablePropertiesTest(test.TestCase):
         combinations.mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_gpu_and_cpu],
     mode=["graph", "eager"]))
-class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
+class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase):
 
   def _assign_replica_local(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -656,7 +684,8 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, replica_local = _make_replica_local("sum", distribution)
+      v, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 3079175015a9aee1625404902070df8f13b2089c..c2300286d3be4bb757dac588623c47044a1a9db5 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -822,7 +822,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "affine_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/bijectors/affine_test.py"],
     additional_deps = [
         ":bijectors_py",
@@ -837,7 +837,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
+    shard_count = 10,
     tags = ["noasan"],  # times out b/63678675
 )
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
index 7ae98878986eb10570b5e93a4a57d6bad6b38c0c..daab24e4333e1f0524f3016cfe367c20c7d1470a 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -158,10 +159,13 @@ def vector_size_to_square_matrix_size(d, validate_args, name=None):
     return int(n)
   else:
     with ops.name_scope(name, "vector_size_to_square_matrix_size", [d]) as name:
-      n = (-1. + math_ops.sqrt(1 + 8. * math_ops.to_float(d))) / 2.
+      n = (-1. + math_ops.sqrt(1 + 8. * math_ops.cast(d, dtypes.float32))) / 2.
       if validate_args:
-        with ops.control_dependencies([check_ops.assert_equal(
-            math_ops.to_float(math_ops.to_int32(n)), n,
-            message="Vector length is not a triangular number")]):
+        with ops.control_dependencies([
+            check_ops.assert_equal(
+                math_ops.cast(math_ops.cast(n, dtypes.int32), dtypes.float32),
+                n,
+                message="Vector length is not a triangular number")
+        ]):
           n = array_ops.identity(n)
       return math_ops.cast(n, d.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 452628257ea96713453bf2aa32b5baa9d6d0cb86..1006dfac49f36baa7cf5136f6f2982e3fd965298 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -249,9 +249,9 @@ class InverseGamma(distribution.Distribution):
       `self.allow_nan_stats` is `False`, an exception will be raised rather
       than returning `NaN`.""")
   def _variance(self):
-    var = (math_ops.square(self.rate)
-           / math_ops.square(self.concentration - 1.)
-           / (self.concentration - 2.))
+    var = (
+        math_ops.square(self.rate) / math_ops.squared_difference(
+            self.concentration, 1.) / (self.concentration - 2.))
     if self.allow_nan_stats:
       nan = array_ops.fill(
           self.batch_shape_tensor(),
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 19e99e03803e7f4cdfdb023feb04daaba68eceed..ad0f2317c99fc482d52d1bb7a3b3f4779d2ca439 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -304,14 +304,14 @@ def percentile(x,
     x = ops.convert_to_tensor(x, name="x")
     # Double is needed here and below, else we get the wrong index if the array
     # is huge along axis.
-    q = math_ops.to_double(q, name="q")
+    q = math_ops.cast(q, dtypes.float64, name="q")
     _get_static_ndims(q, expect_ndims=0)
 
     if validate_args:
       q = control_flow_ops.with_dependencies([
           check_ops.assert_rank(q, 0),
-          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
-          check_ops.assert_less_equal(q, math_ops.to_double(100.))
+          check_ops.assert_greater_equal(q, math_ops.cast(0., dtypes.float64)),
+          check_ops.assert_less_equal(q, math_ops.cast(100., dtypes.float64))
       ], q)
 
     if axis is None:
@@ -336,7 +336,7 @@ def percentile(x,
       y = _move_dims_to_flat_end(x, axis, x_ndims)
 
     frac_at_q_or_above = 1. - q / 100.
-    d = math_ops.to_double(array_ops.shape(y)[-1])
+    d = math_ops.cast(array_ops.shape(y)[-1], dtypes.float64)
 
     if interpolation == "lower":
       index = math_ops.ceil((d - 1) * frac_at_q_or_above)
@@ -349,7 +349,7 @@ def percentile(x,
     # let's use max/min to avoid out of bounds errors.
     d = array_ops.shape(y)[-1]
     # d - 1 will be distinct from d in int32.
-    index = clip_ops.clip_by_value(math_ops.to_int32(index), 0, d - 1)
+    index = clip_ops.clip_by_value(math_ops.cast(index, dtypes.int32), 0, d - 1)
 
     # Sort everything, not just the top 'k' entries, which allows multiple calls
     # to sort only once (under the hood) and use CSE.
diff --git a/tensorflow/contrib/distributions/python/ops/test_util.py b/tensorflow/contrib/distributions/python/ops/test_util.py
index 15b0820cbdf560e04a304c40a47e541006523b6d..b22ae1eb1543f86319dfcf2d841eb722516b1b34 100644
--- a/tensorflow/contrib/distributions/python/ops/test_util.py
+++ b/tensorflow/contrib/distributions/python/ops/test_util.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import histogram_ops
@@ -125,7 +126,7 @@ class DiscreteScalarDistributionTestHelpers(object):
       atol: Python `float`-type indicating the admissible absolute error between
         analytical and sample statistics.
     """
-    x = math_ops.to_float(dist.sample(num_samples, seed=seed))
+    x = math_ops.cast(dist.sample(num_samples, seed=seed), dtypes.float32)
     sample_mean = math_ops.reduce_mean(x, axis=0)
     sample_variance = math_ops.reduce_mean(
         math_ops.square(x - sample_mean), axis=0)
@@ -180,7 +181,7 @@ class DiscreteScalarDistributionTestHelpers(object):
       lo = value_range[0]
       hi = value_range[1]
       if nbins is None:
-        nbins = math_ops.to_int32(hi - lo)
+        nbins = math_ops.cast(hi - lo, dtypes.int32)
       delta = (hi - lo) / math_ops.cast(
           nbins, dtype=value_range.dtype.base_dtype)
       edges = math_ops.range(
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 8966a9befcd3db4a3f397b319e80f37f84ad236b..d441e4735b64fe1176e77a978d281d46a7b287ab 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -144,7 +144,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 78ab155896cfeda4dd259a8529f4b1f77a12cf0b..48925b1bfacc6b59c210b2fb4b53a9a1a851673f 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -37,7 +37,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class IteratorTest(test.TestCase):
@@ -238,7 +238,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
     dataset = dataset.map(math_ops.square).batch(2)
     iterator = datasets.Iterator(dataset)
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], iterator.get_next().numpy())
@@ -257,7 +257,7 @@ class IteratorTest(test.TestCase):
     dataset_2 = Dataset.range(10)
     iterator_3 = datasets.Iterator(dataset_2)
 
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], iterator_1.get_next().numpy())
     self.assertEqual(0, iterator_3.get_next().numpy())
@@ -279,7 +279,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(3)
     iterator = datasets.Iterator(dataset)
 
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertEqual(0, iterator.get_next().numpy())
     self.assertEqual(1, iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -293,7 +293,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(10)
     for i in range(5):
       iterator = datasets.Iterator(dataset)
-      checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+      checkpoint = trackable_utils.Checkpoint(iterator=iterator)
       checkpoint.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
       for j in range(2):
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index 97c299a911c9180bf69faa0fa46527e80eada790..3e0881754c750f4d36e2e4dd8b80835b031c658c 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -6,16 +6,16 @@ package(default_visibility = ["//tensorflow:internal"])
 py_library(
     name = "examples_pip",
     deps = [
-        "//tensorflow/contrib/eager/python/examples/densenet",
-        "//tensorflow/contrib/eager/python/examples/gan:mnist",
+        "//tensorflow/contrib/eager/python/examples/densenet:densenet_lib",
+        "//tensorflow/contrib/eager/python/examples/gan:mnist_lib",
         "//tensorflow/contrib/eager/python/examples/l2hmc",
         "//tensorflow/contrib/eager/python/examples/l2hmc:neural_nets",
-        "//tensorflow/contrib/eager/python/examples/linear_regression",
+        "//tensorflow/contrib/eager/python/examples/linear_regression:linear_regression_lib",
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/revnet",
         "//tensorflow/contrib/eager/python/examples/revnet:config",
-        "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
-        "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/rnn_colorbot:rnn_colorbot_lib",
+        "//tensorflow/contrib/eager/python/examples/rnn_ptb:rnn_ptb_lib",
         "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index 56a682ec55a0bb07a16fe4d7d65a62f3c67c4292..fbb5daf230bb79f08a3d071062ddc0e8507ab324 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "densenet",
     srcs = ["densenet.py"],
     srcs_version = "PY2AND3",
+    deps = [":densenet_lib"],
+)
+
+py_library(
+    name = "densenet_lib",
+    srcs = ["densenet.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -17,13 +24,14 @@ py_binary(
 
 cuda_py_test(
     name = "densenet_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_test.py"],
     additional_deps = [
         ":densenet",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "optonly",
@@ -33,13 +41,14 @@ cuda_py_test(
 
 cuda_py_test(
     name = "densenet_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_graph_test.py"],
     additional_deps = [
         ":densenet",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "noasan",
diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD
index d64c8eb9ce122fa277567b2fbc632abfbc72df64..d99a519112787bad664232983208279cfb4d0036 100644
--- a/tensorflow/contrib/eager/python/examples/gan/BUILD
+++ b/tensorflow/contrib/eager/python/examples/gan/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "mnist",
     srcs = ["mnist.py"],
     srcs_version = "PY2AND3",
+    deps = [":mnist_lib"],
+)
+
+py_library(
+    name = "mnist_lib",
+    srcs = ["mnist.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,7 +27,7 @@ cuda_py_test(
     name = "mnist_test",
     srcs = ["mnist_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
@@ -30,7 +37,7 @@ cuda_py_test(
     name = "mnist_graph_test",
     srcs = ["mnist_graph_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index 1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95..e1a02db76f705414a34d232022f50124a5a6a3ed 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -13,11 +13,13 @@
         "\n",
         "# Convolutional VAE: An example with tf.keras and eager\n",
         "\n",
+        "This example has moved:\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
@@ -28,604 +30,14 @@
       },
       "source": [
         "![evolution of output during training](https://tensorflow.org/images/autoencoders/cvae.gif)\n",
-        "\n",
-        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) by training a Variational Autoencoder. (VAE, [[1]](https://arxiv.org/abs/1312.6114), [[2]](https://arxiv.org/abs/1401.4082)).\n",
         "\n"
       ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "P-JuIu2N_SQf"
-      },
-      "outputs": [],
-      "source": [
-        "# to generate gifs\n",
-        "!pip install imageio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "source": [
-        "## Import TensorFlow and enable Eager execution"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd"
-      },
-      "outputs": [],
-      "source": [
-        "from __future__ import absolute_import, division, print_function\n",
-        "\n",
-        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "tfe = tf.contrib.eager\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import os\n",
-        "import time\n",
-        "import numpy as np\n",
-        "import glob\n",
-        "import matplotlib.pyplot as plt\n",
-        "import PIL\n",
-        "import imageio\n",
-        "from IPython import display"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "source": [
-        "## Load the MNIST dataset\n",
-        "Each MNIST image is originally a vector of 784 integers, each of which is between 0-255 and represents the intensity of a pixel. We model each pixel with a Bernoulli distribution in our model, and we statically binarize the dataset."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna"
-      },
-      "outputs": [],
-      "source": [
-        "(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE"
-      },
-      "outputs": [],
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "test_images = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "\n",
-        "# Normalizing the images to the range of [0., 1.]\n",
-        "train_images /= 255.\n",
-        "test_images /= 255.\n",
-        "\n",
-        "# Binarization\n",
-        "train_images[train_images \u003e= .5] = 1.\n",
-        "train_images[train_images \u003c .5] = 0.\n",
-        "test_images[test_images \u003e= .5] = 1.\n",
-        "test_images[test_images \u003c .5] = 0."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ"
-      },
-      "outputs": [],
-      "source": [
-        "TRAIN_BUF = 60000\n",
-        "BATCH_SIZE = 100\n",
-        "\n",
-        "TEST_BUF = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "source": [
-        "## Use *tf.data* to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn"
-      },
-      "outputs": [],
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE)\n",
-        "test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TEST_BUF).batch(BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "source": [
-        "## Wire up the generative and inference network with *tf.keras.Sequential*\n",
-        "\n",
-        "In our VAE example, we use two small ConvNets for the generative and inference network. Since these neural nets are small, we use `tf.keras.Sequential` to simplify our code. Let $x$ and $z$ denote the observation and latent variable respectively in the following descriptions. \n",
-        "\n",
-        "### Generative Network\n",
-        "This defines the generative model which takes a latent encoding as input, and outputs the parameters for a conditional distribution of the observation, i.e. $p(x|z)$. Additionally, we use a unit Gaussian prior $p(z)$ for the latent variable.\n",
-        "\n",
-        "### Inference Network\n",
-        "This defines an approximate posterior distribution $q(z|x)$, which takes as input an observation and outputs a set of parameters for the conditional distribution of the latent representation. In this example, we simply model this distribution as a diagonal Gaussian. In this case, the inference network outputs the mean and log-variance parameters of a factorized Gaussian (log-variance instead of the variance directly is for numerical stability).\n",
-        "\n",
-        "### Reparameterization Trick\n",
-        "During optimization, we can sample from $q(z|x)$ by first sampling from a unit Gaussian, and then multiplying by the standard deviation and adding the mean. This ensures the gradients could pass through the sample to the inference network parameters.\n",
-        "\n",
-        "### Network architecture\n",
-        "For the inference network, we use two convolutional layers followed by a fully-connected layer. In the generative network, we mirror this architecture by using a fully-connected layer followed by three convolution transpose layers (a.k.a. deconvolutional layers in some contexts). Note, it's common practice to avoid using batch normalization when training VAEs, since the additional stochasticity due to using mini-batches may aggravate instability on top of the stochasticity from sampling."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "VGLbvBEmjK0a"
-      },
-      "outputs": [],
-      "source": [
-        "class CVAE(tf.keras.Model):\n",
-        "  def __init__(self, latent_dim):\n",
-        "    super(CVAE, self).__init__()\n",
-        "    self.latent_dim = latent_dim\n",
-        "    self.inference_net = tf.keras.Sequential(\n",
-        "      [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=32, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=64, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Flatten(),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Dense(latent_dim + latent_dim),\n",
-        "      ]\n",
-        "    )\n",
-        "\n",
-        "    self.generative_net = tf.keras.Sequential(\n",
-        "        [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n",
-        "          tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Reshape(target_shape=(7, 7, 32)),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=64,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=32,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=1, kernel_size=3, strides=(1, 1), padding=\"SAME\"),\n",
-        "        ]\n",
-        "    )\n",
-        "\n",
-        "  def sample(self, eps=None):\n",
-        "    if eps is None:\n",
-        "      eps = tf.random_normal(shape=(100, self.latent_dim))\n",
-        "    return self.decode(eps, apply_sigmoid=True)\n",
-        "\n",
-        "  def encode(self, x):\n",
-        "    mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)\n",
-        "    return mean, logvar\n",
-        "\n",
-        "  def reparameterize(self, mean, logvar):\n",
-        "    eps = tf.random_normal(shape=mean.shape)\n",
-        "    return eps * tf.exp(logvar * .5) + mean\n",
-        "\n",
-        "  def decode(self, z, apply_sigmoid=False):\n",
-        "    logits = self.generative_net(z)\n",
-        "    if apply_sigmoid:\n",
-        "      probs = tf.sigmoid(logits)\n",
-        "      return probs\n",
-        "\n",
-        "    return logits"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "source": [
-        "## Define the loss function and the optimizer\n",
-        "\n",
-        "VAEs train by maximizing the evidence lower bound (ELBO) on the marginal log-likelihood:\n",
-        "\n",
-        "$$\\log p(x) \\ge \\text{ELBO} = \\mathbb{E}_{q(z|x)}\\left[\\log \\frac{p(x, z)}{q(z|x)}\\right].$$\n",
-        "\n",
-        "In practice, we optimize the single sample Monte Carlo estimate of this expectation:\n",
-        "\n",
-        "$$\\log p(x| z) + \\log p(z) - \\log q(z|x),$$\n",
-        "where $z$ is sampled from $q(z|x)$.\n",
-        "\n",
-        "**Note**: we could also analytically compute the KL term, but here we incorporate all three terms in the Monte Carlo estimator for simplicity."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7"
-      },
-      "outputs": [],
-      "source": [
-        "def log_normal_pdf(sample, mean, logvar, raxis=1):\n",
-        "  log2pi = tf.log(2. * np.pi)\n",
-        "  return tf.reduce_sum(\n",
-        "      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),\n",
-        "      axis=raxis)\n",
-        "\n",
-        "def compute_loss(model, x):\n",
-        "  mean, logvar = model.encode(x)\n",
-        "  z = model.reparameterize(mean, logvar)\n",
-        "  x_logit = model.decode(z)\n",
-        "\n",
-        "  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)\n",
-        "  logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])\n",
-        "  logpz = log_normal_pdf(z, 0., 0.)\n",
-        "  logqz_x = log_normal_pdf(z, mean, logvar)\n",
-        "  return -tf.reduce_mean(logpx_z + logpz - logqz_x)\n",
-        "\n",
-        "def compute_gradients(model, x):\n",
-        "  with tf.GradientTape() as tape:\n",
-        "    loss = compute_loss(model, x)\n",
-        "  return tape.gradient(loss, model.trainable_variables), loss\n",
-        "\n",
-        "optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "def apply_gradients(optimizer, gradients, variables, global_step=None):\n",
-        "  optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We start by iterating over the dataset\n",
-        "* During each iteration, we pass the image to the encoder to obtain a set of mean and log-variance parameters of the approximate posterior $q(z|x)$\n",
-        "* We then apply the *reparameterization trick* to sample from $q(z|x)$\n",
-        "* Finally, we pass the reparameterized samples to the decoder to obtain the logits of the generative distribution $p(x|z)$\n",
-        "* **Note:** Since we use the dataset loaded by keras with 60k datapoints in the training set and 10k datapoints in the test set, our resulting ELBO on the test set is slightly higher than reported results in the literature which uses dynamic binarization of Larochelle's MNIST.\n",
-        "\n",
-        "## Generate Images\n",
-        "\n",
-        "* After training, it is time to generate some images\n",
-        "* We start by sampling a set of latent vectors from the unit Gaussian prior distribution $p(z)$\n",
-        "* The generator will then convert the latent sample $z$ to logits of the observation, giving a distribution $p(x|z)$\n",
-        "* Here we plot the probabilities of Bernoulli distributions\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo"
-      },
-      "outputs": [],
-      "source": [
-        "epochs = 100\n",
-        "latent_dim = 50\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# keeping the random vector constant for generation (prediction) so\n",
-        "# it will be easier to see the improvement.\n",
-        "random_vector_for_generation = tf.random_normal(\n",
-        "    shape=[num_examples_to_generate, latent_dim])\n",
-        "model = CVAE(latent_dim)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy"
-      },
-      "outputs": [],
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  predictions = model.sample(test_input)\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "\n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0], cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "\n",
-        "  # tight_layout minimizes the overlap between 2 sub-plots\n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ"
-      },
-      "outputs": [],
-      "source": [
-        "generate_and_save_images(model, 0, random_vector_for_generation)\n",
-        "\n",
-        "for epoch in range(1, epochs + 1):\n",
-        "  start_time = time.time()\n",
-        "  for train_x in train_dataset:\n",
-        "    gradients, loss = compute_gradients(model, train_x)\n",
-        "    apply_gradients(optimizer, gradients, model.trainable_variables)\n",
-        "  end_time = time.time()\n",
-        "\n",
-        "  if epoch % 1 == 0:\n",
-        "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset:\n",
-        "      loss(compute_loss(model, test_x))\n",
-        "    elbo = -loss.result()\n",
-        "    display.clear_output(wait=False)\n",
-        "    print('Epoch: {}, Test set ELBO: {}, '\n",
-        "          'time elapse for current epoch {}'.format(epoch,\n",
-        "                                                    elbo,\n",
-        "                                                    end_time - start_time))\n",
-        "    generate_and_save_images(\n",
-        "        model, epoch, random_vector_for_generation)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "source": [
-        "### Display an image using the epoch number"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL"
-      },
-      "outputs": [],
-      "source": [
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A"
-      },
-      "outputs": [],
-      "source": [
-        "display_image(epochs)  # Display images"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "source": [
-        "### Generate a GIF of all the saved images."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI"
-      },
-      "outputs": [],
-      "source": [
-        "with imageio.get_writer('cvae.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) \u003e round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp cvae.gif cvae.gif.png')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b"
-      },
-      "outputs": [],
-      "source": [
-        "display.Image(filename=\"cvae.gif.png\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "yQXO_dlXkKsT"
-      },
-      "source": [
-        "To downlod the animation from Colab uncomment the code below:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "4fSJS3m5HLFM"
-      },
-      "outputs": [],
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('cvae.gif')"
-      ]
     }
   ],
   "metadata": {
     "accelerator": "GPU",
     "colab": {
       "collapsed_sections": [],
-      "default_view": {},
       "name": "cvae.ipynb",
       "private_outputs": true,
       "provenance": [
@@ -635,8 +47,7 @@
         }
       ],
       "toc_visible": true,
-      "version": "0.3.2",
-      "views": {}
+      "version": "0.3.2"
     },
     "kernelspec": {
       "display_name": "Python 3",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 78fcd397087fd1fd64aebed7ac3b5c6b2f45c450..53767058838459e56215d286e9f8f8eb66287147 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -1,26 +1,11 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "dcgan.ipynb",
-      "version": "0.3.2",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python2",
-      "display_name": "Python 2"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "0TD5ZrvEMbhZ"
       },
-      "cell_type": "markdown",
       "source": [
         "**Copyright 2018 The TensorFlow Authors**.\n",
         "\n",
@@ -28,851 +13,39 @@
         "\n",
         "# Generating Handwritten Digits with DCGAN\n",
         "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "ITZuApL56Mny"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "This tutorial demonstrates how to generate images of handwritten digits using a Deep Convolutional Generative Adversarial Network ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)). The code is written in [tf.keras](https://www.tensorflow.org/programmers_guide/keras) with [eager execution](https://www.tensorflow.org/programmers_guide/eager) enabled. "
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "toc",
-        "id": "x2McrO9bMyLN"
-      },
-      "cell_type": "markdown",
-      "source": [
-        ">[Generating Handwritten Digits with DCGAN](#scrollTo=0TD5ZrvEMbhZ)\n",
-        "\n",
-        ">>[What are GANs?](#scrollTo=2MbKJY38Puy9)\n",
-        "\n",
-        ">>>[Import TensorFlow and enable eager execution](#scrollTo=e1_Y75QXJS6h)\n",
-        "\n",
-        ">>>[Load the dataset](#scrollTo=iYn4MdZnKCey)\n",
-        "\n",
-        ">>>[Use tf.data to create batches and shuffle the dataset](#scrollTo=PIGN6ouoQxt3)\n",
-        "\n",
-        ">>[Create the models](#scrollTo=THY-sZMiQ4UV)\n",
-        "\n",
-        ">>>[The Generator Model](#scrollTo=-tEyxE-GMC48)\n",
-        "\n",
-        ">>>[The Discriminator model](#scrollTo=D0IKnaCtg6WE)\n",
-        "\n",
-        ">>[Define the loss functions and the optimizer](#scrollTo=0FMYgY_mPfTi)\n",
-        "\n",
-        ">>>[Generator loss](#scrollTo=Jd-3GCUEiKtv)\n",
-        "\n",
-        ">>>[Discriminator loss](#scrollTo=PKY_iPSPNWoj)\n",
-        "\n",
-        ">>[Set up GANs for Training](#scrollTo=Rw1fkAczTQYh)\n",
-        "\n",
-        ">>[Train the GANs](#scrollTo=dZrd4CdjR-Fp)\n",
-        "\n",
-        ">>[Generated images](#scrollTo=P4M_vIbUi7c0)\n",
+        "This example has moved.\n",
         "\n",
-        ">>[Learn more about GANs](#scrollTo=k6qC-SbjK0yW)\n",
-        "\n"
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "2MbKJY38Puy9"
       },
-      "cell_type": "markdown",
       "source": [
-        "## What are GANs?\n",
-        "GANs, or [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661), are a framework for estimating generative models. Two models are trained simultaneously by an adversarial process: a Generator, which is responsible for generating data (say, images), and a Discriminator, which is responsible for estimating the probability that an image was drawn from the training data (the image is real), or was produced by the Generator (the image is fake). During training, the Generator becomes progressively better at generating images, until the Discriminator is no longer able to distinguish real images from fake. \n",
-        "\n",
-        "![alt text](https://github.com/margaretmz/tensorflow/blob/margaret-dcgan/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png?raw=1)\n",
-        "\n",
-        "We will demonstrate this process end-to-end on MNIST. Below is an animation that shows a series of images produced by the Generator as it was trained for 50 epochs. Overtime, the generated images become increasingly difficult to distinguish from the training set.\n",
-        "\n",
-        "To learn more about GANs, we recommend MIT's [Intro to Deep Learning](http://introtodeeplearning.com/) course, which includes a lecture on Deep Generative Models ([video](https://youtu.be/JVb54xhEw6Y) | [slides](http://introtodeeplearning.com/materials/2018_6S191_Lecture4.pdf)). Now, let's head to the code!\n",
-        "\n",
         "![sample output](https://tensorflow.org/images/gan/dcgan.gif)"
       ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "dcgan.ipynb",
+      "provenance": [],
+      "version": "0.3.2"
     },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "u_2z-B3piVsw",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Install imgeio in order to generate an animated gif showing the image generating process\n",
-        "!pip install imageio"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Import TensorFlow and enable eager execution"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import glob\n",
-        "import imageio\n",
-        "import matplotlib.pyplot as plt\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import PIL\n",
-        "import time\n",
-        "\n",
-        "from IPython import display"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Load the dataset\n",
-        "\n",
-        "We are going to use the MNIST dataset to train the generator and the discriminator. The generator will generate handwritten digits resembling the MNIST data."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "BUFFER_SIZE = 60000\n",
-        "BATCH_SIZE = 256"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Use tf.data to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Create the models\n",
-        "\n",
-        "We will use tf.keras [Sequential API](https://www.tensorflow.org/guide/keras#sequential_model) to define the generator and discriminator models."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "-tEyxE-GMC48"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Generator Model\n",
-        "\n",
-        "The generator is responsible for creating convincing images that are good enough to fool the discriminator. The network architecture for the generator consists of [Conv2DTranspose](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2DTranspose) (Upsampling) layers. We start with a fully connected layer and upsample the image two times in order to reach the desired image size of 28x28x1. We increase the width and height, and reduce the depth as we move through the layers in the network. We use [Leaky ReLU](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LeakyReLU) activation for each layer except for the last one where we use a tanh activation."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "6bpTcDqoLWjY",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_generator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Dense(7*7*256, use_bias=False, input_shape=(100,)))\n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Reshape((7, 7, 256)))\n",
-        "    assert model.output_shape == (None, 7, 7, 256) # Note: None is the batch size\n",
-        "    \n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 7, 7, 128)  \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 14, 14, 64)    \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))\n",
-        "    assert model.output_shape == (None, 28, 28, 1)\n",
-        "  \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "D0IKnaCtg6WE"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Discriminator model\n",
-        "\n",
-        "The discriminator is responsible for distinguishing fake images from real images. It's similar to a regular CNN-based image classifier."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "dw2tPLmk2pEP",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_discriminator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "       \n",
-        "    model.add(tf.keras.layers.Flatten())\n",
-        "    model.add(tf.keras.layers.Dense(1))\n",
-        "     \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "gDkA05NE6QMs",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator = make_generator_model()\n",
-        "discriminator = make_discriminator_model()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Define the loss functions and the optimizer\n",
-        "\n",
-        "Let's define the loss functions and the optimizers for the generator and the discriminator.\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Jd-3GCUEiKtv"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Generator loss\n",
-        "The generator loss is a sigmoid cross entropy loss of the generated images and an array of ones, since the generator is trying to generate fake images that resemble the real images."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "90BIcCKcDMxz",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generator_loss(generated_output):\n",
-        "    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PKY_iPSPNWoj"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Discriminator loss\n",
-        "\n",
-        "The discriminator loss function takes two inputs: real images, and generated images. Here is how to calculate the discriminator loss:\n",
-        "1. Calculate real_loss which is a sigmoid cross entropy loss of the real images and an array of ones (since these are the real images).\n",
-        "2. Calculate generated_loss which is a sigmoid cross entropy loss of the generated images and an array of zeros (since these are the fake images).\n",
-        "3. Calculate the total_loss as the sum of real_loss and generated_loss."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "wkMNfBWlT-PV",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def discriminator_loss(real_output, generated_output):\n",
-        "    # [1,1,...,1] with real output since it is true and we want our generated examples to look like it\n",
-        "    real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.ones_like(real_output), logits=real_output)\n",
-        "\n",
-        "    # [0,0,...,0] with generated images since they are fake\n",
-        "    generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.zeros_like(generated_output), logits=generated_output)\n",
-        "\n",
-        "    total_loss = real_loss + generated_loss\n",
-        "\n",
-        "    return total_loss"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "MgIc7i0th_Iu"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "The discriminator and the generator optimizers are different since we will train two networks separately."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator_optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "discriminator_optimizer = tf.train.AdamOptimizer(1e-4)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mWtinsGDPJlV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Checkpoints (Object-based saving)**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "CA1w-7s2POEy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
-        "                                 discriminator_optimizer=discriminator_optimizer,\n",
-        "                                 generator=generator,\n",
-        "                                 discriminator=discriminator)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Set up GANs for Training\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "5QC5BABamh_c"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Now it's time to put together the generator and discriminator to set up the Generative Adversarial Networks, as you see in the diagam at the beginning of the tutorial."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Ff6oN6PZX27n"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training parameters**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 50\n",
-        "noise_dim = 100\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# We'll re-use this random vector used to seed the generator so\n",
-        "# it will be easier to see the improvement over time.\n",
-        "random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
-        "                                                 noise_dim])"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "jylSonrqSWfi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training method**\n",
-        "\n",
-        "We start by iterating over the dataset. The generator is given a random vector as an input which is processed to  output an image looking like a handwritten digit. The discriminator is then shown the real MNIST images as well as the generated images.\n",
-        "\n",
-        "Next, we calculate the generator and the discriminator loss. Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "3t5ibNo05jCB",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train_step(images):\n",
-        "   # generating noise from a normal distribution\n",
-        "      noise = tf.random_normal([BATCH_SIZE, noise_dim])\n",
-        "      \n",
-        "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
-        "        generated_images = generator(noise, training=True)\n",
-        "      \n",
-        "        real_output = discriminator(images, training=True)\n",
-        "        generated_output = discriminator(generated_images, training=True)\n",
-        "         \n",
-        "        gen_loss = generator_loss(generated_output)\n",
-        "        disc_loss = discriminator_loss(real_output, generated_output)\n",
-        "        \n",
-        "      gradients_of_generator = gen_tape.gradient(gen_loss, generator.variables)\n",
-        "      gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.variables)\n",
-        "      \n",
-        "      generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.variables))\n",
-        "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6TSZgwc2BUQ-"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "This model takes about ~30 seconds per epoch to train on a single Tesla K80 on Colab, as of October 2018. \n",
-        "\n",
-        "Eager execution can be slower than executing the equivalent graph as it can't benefit from whole-program optimizations on the graph, and also incurs overheads of interpreting Python code. By using [tf.contrib.eager.defun](https://www.tensorflow.org/api_docs/python/tf/contrib/eager/defun) to create graph functions, we get a ~20 secs/epoch performance boost (from ~50 secs/epoch down to ~30 secs/epoch). This way we get the best of both eager execution (easier for debugging) and graph mode (better performance)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Iwya07_j5p2A",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_step = tf.contrib.eager.defun(train_step)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train(dataset, epochs):  \n",
-        "  for epoch in range(epochs):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    for images in dataset:\n",
-        "      train_step(images)\n",
-        "\n",
-        "    display.clear_output(wait=True)\n",
-        "    generate_and_save_images(generator,\n",
-        "                               epoch + 1,\n",
-        "                               random_vector_for_generation)\n",
-        "    \n",
-        "    # saving (checkpoint) the model every 15 epochs\n",
-        "    if (epoch + 1) % 15 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "    \n",
-        "    print ('Time taken for epoch {} is {} sec'.format(epoch + 1,\n",
-        "                                                      time.time()-start))\n",
-        "  # generating after the final epoch\n",
-        "  display.clear_output(wait=True)\n",
-        "  generate_and_save_images(generator,\n",
-        "                           epochs,\n",
-        "                           random_vector_for_generation)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "2aFF7Hk3XdeW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate and save images**\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  # make sure the training parameter is set to False because we\n",
-        "  # don't want to train the batchnorm layer when doing inference.\n",
-        "  predictions = model(test_input, training=False)\n",
-        "\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "  \n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "        \n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "dZrd4CdjR-Fp"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Train the GANs\n",
-        "We will call the train() method defined above to train the generator and discriminator simultaneously. Note, training GANs can be tricky. It's important that the generator and discriminator do not overpower each other (e.g., that they train at a similar rate).\n",
-        "\n",
-        "At the beginning of the training, the generated images look like random noise. As training progresses, you can see the generated digits look increasingly real. After 50 epochs, they look very much like the MNIST digits."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "Ly3UN0SLLY2l",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "%%time\n",
-        "train(train_dataset, EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "rfM4YcPVPkNO"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Restore the latest checkpoint**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "XhXsd0srPo8c",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Generated images \n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mLskt7EfXAjr"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "After training, its time to generate some images! \n",
-        "The last step is to plot the generated images and voila!\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Display a single image using the epoch number\n",
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display_image(EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate a GIF of all the saved images**\n",
-        "\n",
-        "We will use imageio to create an animated gif using all the images saved during training."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) > round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp dcgan.gif dcgan.gif.png')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "cGhC3-fMWSwl"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Display the animated gif with all the mages generated during the training of GANs."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display.Image(filename=\"dcgan.gif.png\")"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6EEG-wePkmJQ"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Download the animated gif**\n",
-        "\n",
-        "Uncomment the code below to download an animated gif from Colab."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "4UJjSnIMOzOJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('dcgan.gif')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "k6qC-SbjK0yW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Learn more about GANs\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "xjjkT9KAK6H7"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "We hope this tutorial was helpful! As a next step, you might like to experiment with a different dataset, for example the Large-scale Celeb Faces Attributes (CelebA) dataset [available on Kaggle](https://www.kaggle.com/jessicali9530/celeba-dataset/home).\n",
-        "\n",
-        "To learn more about GANs:\n",
-        "\n",
-        "* Check out MIT's lecture (linked above), or [this](http://cs231n.stanford.edu/slides/2018/cs231n_2018_lecture12.pdf) lecture form Stanford's CS231n. \n",
-        "\n",
-        "* We also recommend the [CVPR 2018 Tutorial on GANs](https://sites.google.com/view/cvpr2018tutorialongans/), and the [NIPS 2016 Tutorial: Generative Adversarial Networks](https://arxiv.org/abs/1701.00160).\n"
-      ]
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png b/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png
deleted file mode 100644
index b715bd83ef117641c6429e0ac173dbe9b8d5fd88..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png and /dev/null differ
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 12c5eff2b4aa901bdab52bf545e95b1e4dce7468..979772acd3f823a8cc53ab5e026946ad3bb19353 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1174 +1,71 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "K2s1A9eLRPEj"
-   },
-   "source": [
-    "##### Copyright 2018 The TensorFlow Authors.\n",
-    "\n",
-    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Cffg2i257iMS"
-   },
-   "source": [
-    "# Image Captioning with Attention\n",
-    "\n",
-    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-    "</td><td>\n",
-    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "QASbY_HGo4Lq"
-   },
-   "source": [
-    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-    "\n",
-    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-    "\n",
-    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-    "\n",
-    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-    "\n",
-    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-    "\n",
-    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-    "\n",
-    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-    "\n",
-    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-    "\n",
-    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-    "\n",
-    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "U8l4RJ0XRPEm"
-   },
-   "outputs": [],
-   "source": [
-    "# Import TensorFlow and enable eager execution\n",
-    "# This code requires TensorFlow version >=1.9\n",
-    "import tensorflow as tf\n",
-    "tf.enable_eager_execution()\n",
-    "\n",
-    "# We'll generate plots of attention in order to see which parts of an image\n",
-    "# our model focuses on during captioning\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "# Scikit-learn includes many helpful utilities\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.utils import shuffle\n",
-    "\n",
-    "import re\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import time\n",
-    "import json\n",
-    "from glob import glob\n",
-    "from PIL import Image\n",
-    "import pickle"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "b6qbGw8MRPE5"
-   },
-   "source": [
-    "## Download and prepare the MS-COCO dataset\n",
-    "\n",
-    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-    "\n",
-    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "krQuPYTtRPE7"
-   },
-   "outputs": [],
-   "source": [
-    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-    "                                          cache_subdir=os.path.abspath('.'),\n",
-    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-    "                                          extract = True)\n",
-    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-    "\n",
-    "name_of_zip = 'train2014.zip'\n",
-    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-    "                                      cache_subdir=os.path.abspath('.'),\n",
-    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-    "                                      extract = True)\n",
-    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-    "else:\n",
-    "  PATH = os.path.abspath('.')+'/train2014/'"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "aANEzb5WwSzg"
-   },
-   "source": [
-    "## Optionally, limit the size of the training set for faster training\n",
-    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "4G3b8x8_RPFD"
-   },
-   "outputs": [],
-   "source": [
-    "# read the json file\n",
-    "with open(annotation_file, 'r') as f:\n",
-    "    annotations = json.load(f)\n",
-    "\n",
-    "# storing the captions and the image name in vectors\n",
-    "all_captions = []\n",
-    "all_img_name_vector = []\n",
-    "\n",
-    "for annot in annotations['annotations']:\n",
-    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-    "    image_id = annot['image_id']\n",
-    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-    "    \n",
-    "    all_img_name_vector.append(full_coco_image_path)\n",
-    "    all_captions.append(caption)\n",
-    "\n",
-    "# shuffling the captions and image_names together\n",
-    "# setting a random state\n",
-    "train_captions, img_name_vector = shuffle(all_captions,\n",
-    "                                          all_img_name_vector,\n",
-    "                                          random_state=1)\n",
-    "\n",
-    "# selecting the first 30000 captions from the shuffled set\n",
-    "num_examples = 30000\n",
-    "train_captions = train_captions[:num_examples]\n",
-    "img_name_vector = img_name_vector[:num_examples]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "mPBMgK34RPFL"
-   },
-   "outputs": [],
-   "source": [
-    "len(train_captions), len(all_captions)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "8cSW4u-ORPFQ"
-   },
-   "source": [
-    "## Preprocess the images using InceptionV3\n",
-    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-    "\n",
-    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-    "* Resizing the image to (299, 299)\n",
-    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "zXR0217aRPFR"
-   },
-   "outputs": [],
-   "source": [
-    "def load_image(image_path):\n",
-    "    img = tf.read_file(image_path)\n",
-    "    img = tf.image.decode_jpeg(img, channels=3)\n",
-    "    img = tf.image.resize_images(img, (299, 299))\n",
-    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-    "    return img, image_path"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "MDvIu4sXRPFV"
-   },
-   "source": [
-    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-    "\n",
-    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-    "* We avoid doing this during training so it does not become a bottleneck. \n",
-    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RD3vW4SsRPFW"
-   },
-   "outputs": [],
-   "source": [
-    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-    "                                                weights='imagenet')\n",
-    "new_input = image_model.input\n",
-    "hidden_layer = image_model.layers[-1].output\n",
-    "\n",
-    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "rERqlR3WRPGO"
-   },
-   "source": [
-    "## Caching the features extracted from InceptionV3\n",
-    "\n",
-    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-    "\n",
-    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-    "\n",
-    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-    "\n",
-    "```for img, path in image_dataset:``` \n",
-    "\n",
-    "to:\n",
-    "\n",
-    "```for img, path in tqdm(image_dataset):```."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Dx_fvbVgRPGQ"
-   },
-   "outputs": [],
-   "source": [
-    "# getting the unique images\n",
-    "encode_train = sorted(set(img_name_vector))\n",
-    "\n",
-    "# feel free to change the batch_size according to your system configuration\n",
-    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-    "                                encode_train).map(load_image).batch(16)\n",
-    "\n",
-    "for img, path in image_dataset:\n",
-    "  batch_features = image_features_extract_model(img)\n",
-    "  batch_features = tf.reshape(batch_features, \n",
-    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-    "\n",
-    "  for bf, p in zip(batch_features, path):\n",
-    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-    "    np.save(path_of_feature, bf.numpy())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nyqH3zFwRPFi"
-   },
-   "source": [
-    "## Preprocess and tokenize the captions\n",
-    "\n",
-    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-    "* Finally, we create a word --> index mapping and vice-versa.\n",
-    "* We will then pad all sequences to the be same length as the longest one. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "HZfK8RhQRPFj"
-   },
-   "outputs": [],
-   "source": [
-    "# This will find the maximum length of any caption in our dataset\n",
-    "def calc_max_length(tensor):\n",
-    "    return max(len(t) for t in tensor)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "oJGE34aiRPFo"
-   },
-   "outputs": [],
-   "source": [
-    "# The steps above is a general process of dealing with text processing\n",
-    "\n",
-    "# choosing the top 5000 words from the vocabulary\n",
-    "top_k = 5000\n",
-    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-    "                                                  oov_token=\"<unk>\", \n",
-    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-    "tokenizer.fit_on_texts(train_captions)\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "8Q44tNQVRPFt"
-   },
-   "outputs": [],
-   "source": [
-    "tokenizer.word_index['<pad>'] = 0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "0fpJb5ojRPFv"
-   },
-   "outputs": [],
-   "source": [
-    "# creating the tokenized vectors\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AidglIZVRPF4"
-   },
-   "outputs": [],
-   "source": [
-    "# padding each vector to the max_length of the captions\n",
-    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "gL0wkttkRPGA"
-   },
-   "outputs": [],
-   "source": [
-    "# calculating the max_length \n",
-    "# used to store the attention weights\n",
-    "max_length = calc_max_length(train_seqs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "M3CD75nDpvTI"
-   },
-   "source": [
-    "## Split the data into training and testing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "iS7DDMszRPGF"
-   },
-   "outputs": [],
-   "source": [
-    "# Create training and validation sets using 80-20 split\n",
-    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-    "                                                                    cap_vector, \n",
-    "                                                                    test_size=0.2, \n",
-    "                                                                    random_state=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "XmViPkRFRPGH"
-   },
-   "outputs": [],
-   "source": [
-    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "uEWM9xrYcg45"
-   },
-   "source": [
-    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Q3TnZ1ToRPGV"
-   },
-   "outputs": [],
-   "source": [
-    "# feel free to change these parameters according to your system's configuration\n",
-    "\n",
-    "BATCH_SIZE = 64\n",
-    "BUFFER_SIZE = 1000\n",
-    "embedding_dim = 256\n",
-    "units = 512\n",
-    "vocab_size = len(tokenizer.word_index)\n",
-    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-    "# these two variables represent that\n",
-    "features_shape = 2048\n",
-    "attention_features_shape = 64"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "SmZS2N0bXG3T"
-   },
-   "outputs": [],
-   "source": [
-    "# loading the numpy files \n",
-    "def map_func(img_name, cap):\n",
-    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-    "    return img_tensor, cap"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "FDF_Nm3tRPGZ"
-   },
-   "outputs": [],
-   "source": [
-    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-    "\n",
-    "# using map to load the numpy files in parallel\n",
-    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-    "\n",
-    "# shuffling and batching\n",
-    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-    "dataset = dataset.batch(BATCH_SIZE)\n",
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nrvoDphgRPGd"
-   },
-   "source": [
-    "## Model\n",
-    "\n",
-    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-    "\n",
-    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-    "\n",
-    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-    "* We squash that to a shape of (64, 2048).\n",
-    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-    "* The RNN(here GRU) attends over the image to predict the next word."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AAppCGLKRPGd"
-   },
-   "outputs": [],
-   "source": [
-    "def gru(units):\n",
-    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-    "  # significant speedup).\n",
-    "  if tf.test.is_gpu_available():\n",
-    "    return tf.keras.layers.CuDNNGRU(units, \n",
-    "                                    return_sequences=True, \n",
-    "                                    return_state=True, \n",
-    "                                    recurrent_initializer='glorot_uniform')\n",
-    "  else:\n",
-    "    return tf.keras.layers.GRU(units, \n",
-    "                               return_sequences=True, \n",
-    "                               return_state=True, \n",
-    "                               recurrent_activation='sigmoid', \n",
-    "                               recurrent_initializer='glorot_uniform')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "ja2LFTMSdeV3"
-   },
-   "outputs": [],
-   "source": [
-    "class BahdanauAttention(tf.keras.Model):\n",
-    "  def __init__(self, units):\n",
-    "    super(BahdanauAttention, self).__init__()\n",
-    "    self.W1 = tf.keras.layers.Dense(units)\n",
-    "    self.W2 = tf.keras.layers.Dense(units)\n",
-    "    self.V = tf.keras.layers.Dense(1)\n",
-    "  \n",
-    "  def call(self, features, hidden):\n",
-    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-    "    \n",
-    "    # hidden shape == (batch_size, hidden_size)\n",
-    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-    "    \n",
-    "    # score shape == (batch_size, 64, hidden_size)\n",
-    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-    "    \n",
-    "    # attention_weights shape == (batch_size, 64, 1)\n",
-    "    # we get 1 at the last axis because we are applying score to self.V\n",
-    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-    "    \n",
-    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-    "    context_vector = attention_weights * features\n",
-    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-    "    \n",
-    "    return context_vector, attention_weights"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AZ7R1RxHRPGf"
-   },
-   "outputs": [],
-   "source": [
-    "class CNN_Encoder(tf.keras.Model):\n",
-    "    # Since we have already extracted the features and dumped it using pickle\n",
-    "    # This encoder passes those features through a Fully connected layer\n",
-    "    def __init__(self, embedding_dim):\n",
-    "        super(CNN_Encoder, self).__init__()\n",
-    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-    "        \n",
-    "    def call(self, x):\n",
-    "        x = self.fc(x)\n",
-    "        x = tf.nn.relu(x)\n",
-    "        return x"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "V9UbGQmERPGi"
-   },
-   "outputs": [],
-   "source": [
-    "class RNN_Decoder(tf.keras.Model):\n",
-    "  def __init__(self, embedding_dim, units, vocab_size):\n",
-    "    super(RNN_Decoder, self).__init__()\n",
-    "    self.units = units\n",
-    "\n",
-    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-    "    self.gru = gru(self.units)\n",
-    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-    "    \n",
-    "    self.attention = BahdanauAttention(self.units)\n",
-    "        \n",
-    "  def call(self, x, features, hidden):\n",
-    "    # defining attention as a separate model\n",
-    "    context_vector, attention_weights = self.attention(features, hidden)\n",
-    "    \n",
-    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-    "    x = self.embedding(x)\n",
-    "    \n",
-    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-    "    \n",
-    "    # passing the concatenated vector to the GRU\n",
-    "    output, state = self.gru(x)\n",
-    "    \n",
-    "    # shape == (batch_size, max_length, hidden_size)\n",
-    "    x = self.fc1(output)\n",
-    "    \n",
-    "    # x shape == (batch_size * max_length, hidden_size)\n",
-    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-    "    \n",
-    "    # output shape == (batch_size * max_length, vocab)\n",
-    "    x = self.fc2(x)\n",
-    "\n",
-    "    return x, state, attention_weights\n",
-    "\n",
-    "  def reset_state(self, batch_size):\n",
-    "    return tf.zeros((batch_size, self.units))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Qs_Sr03wRPGk"
-   },
-   "outputs": [],
-   "source": [
-    "encoder = CNN_Encoder(embedding_dim)\n",
-    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "-bYN7xA0RPGl"
-   },
-   "outputs": [],
-   "source": [
-    "optimizer = tf.train.AdamOptimizer()\n",
-    "\n",
-    "# We are masking the loss calculated for padding\n",
-    "def loss_function(real, pred):\n",
-    "    mask = 1 - np.equal(real, 0)\n",
-    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-    "    return tf.reduce_mean(loss_)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PHod7t72RPGn"
-   },
-   "source": [
-    "## Training\n",
-    "\n",
-    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-    "* The decoder returns the predictions and the decoder hidden state.\n",
-    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-    "* Use teacher forcing to decide the next input to the decoder.\n",
-    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Vt4WZ5mhJE-E"
-   },
-   "outputs": [],
-   "source": [
-    "# adding this in a separate cell because if you run the training cell \n",
-    "# many times, the loss_plot array will be reset\n",
-    "loss_plot = []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "UlA4VIQpRPGo"
-   },
-   "outputs": [],
-   "source": [
-    "EPOCHS = 20\n",
-    "\n",
-    "for epoch in range(EPOCHS):\n",
-    "    start = time.time()\n",
-    "    total_loss = 0\n",
-    "    \n",
-    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-    "        loss = 0\n",
-    "        \n",
-    "        # initializing the hidden state for each batch\n",
-    "        # because the captions are not related from image to image\n",
-    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-    "\n",
-    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-    "        \n",
-    "        with tf.GradientTape() as tape:\n",
-    "            features = encoder(img_tensor)\n",
-    "            \n",
-    "            for i in range(1, target.shape[1]):\n",
-    "                # passing the features through the decoder\n",
-    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "                loss += loss_function(target[:, i], predictions)\n",
-    "                \n",
-    "                # using teacher forcing\n",
-    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-    "        \n",
-    "        total_loss += (loss / int(target.shape[1]))\n",
-    "        \n",
-    "        variables = encoder.variables + decoder.variables\n",
-    "        \n",
-    "        gradients = tape.gradient(loss, variables) \n",
-    "        \n",
-    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-    "        \n",
-    "        if batch % 100 == 0:\n",
-    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-    "                                                          batch, \n",
-    "                                                          loss.numpy() / int(target.shape[1])))\n",
-    "    # storing the epoch end loss value to plot later\n",
-    "    loss_plot.append(total_loss / len(cap_vector))\n",
-    "    \n",
-    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-    "                                         total_loss/len(cap_vector)))\n",
-    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "1Wm83G-ZBPcC"
-   },
-   "outputs": [],
-   "source": [
-    "plt.plot(loss_plot)\n",
-    "plt.xlabel('Epochs')\n",
-    "plt.ylabel('Loss')\n",
-    "plt.title('Loss Plot')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "xGvOcLQKghXN"
-   },
-   "source": [
-    "## Caption!\n",
-    "\n",
-    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-    "* Stop predicting when the model predicts the end token.\n",
-    "* And store the attention weights for every time step."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RCWpDtyNRPGs"
-   },
-   "outputs": [],
-   "source": [
-    "def evaluate(image):\n",
-    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-    "\n",
-    "    hidden = decoder.reset_state(batch_size=1)\n",
-    "\n",
-    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-    "    img_tensor_val = image_features_extract_model(temp_input)\n",
-    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-    "\n",
-    "    features = encoder(img_tensor_val)\n",
-    "\n",
-    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-    "    result = []\n",
-    "\n",
-    "    for i in range(max_length):\n",
-    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-    "\n",
-    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-    "        result.append(tokenizer.index_word[predicted_id])\n",
-    "\n",
-    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
-    "            return result, attention_plot\n",
-    "\n",
-    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-    "\n",
-    "    attention_plot = attention_plot[:len(result), :]\n",
-    "    return result, attention_plot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "fD_y7PD6RPGt"
-   },
-   "outputs": [],
-   "source": [
-    "def plot_attention(image, result, attention_plot):\n",
-    "    temp_image = np.array(Image.open(image))\n",
-    "\n",
-    "    fig = plt.figure(figsize=(10, 10))\n",
-    "    \n",
-    "    len_result = len(result)\n",
-    "    for l in range(len_result):\n",
-    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-    "        ax.set_title(result[l])\n",
-    "        img = ax.imshow(temp_image)\n",
-    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-    "\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "K2s1A9eLRPEj"
+      },
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+      ]
     },
-    "colab_type": "code",
-    "id": "io7ws3ReRPGv"
-   },
-   "outputs": [],
-   "source": [
-    "# captions on the validation set\n",
-    "rid = np.random.randint(0, len(img_name_val))\n",
-    "image = img_name_val[rid]\n",
-    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-    "result, attention_plot = evaluate(image)\n",
-    "\n",
-    "print ('Real Caption:', real_caption)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(img_name_val[rid])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Rprk3HEvZuxb"
-   },
-   "source": [
-    "## Try it on your own images\n",
-    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Cffg2i257iMS"
+      },
+      "source": [
+        "# Image Captioning with Attention\n",
+        "\n",
+        "This example has moved:\n",
+        "\n",
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+      ]
     },
-    "colab_type": "code",
-    "id": "9Psd1quzaAWg"
-   },
-   "outputs": [],
-   "source": [
-    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-    "image_extension = image_url[-4:]\n",
-    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-    "                                     origin=image_url)\n",
-    "\n",
-    "result, attention_plot = evaluate(image_path)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image_path, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(image_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "VJZXyJco6uLO"
-   },
-   "source": [
-    "# Next steps\n",
-    "\n",
-    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "collapsed_sections": [],
-   "default_view": {},
-   "name": "image_captioning_with_attention.ipynb",
-   "private_outputs": true,
-   "provenance": [
     {
-     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-     "timestamp": 1530222436922
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "QASbY_HGo4Lq"
+      },
+      "source": [
+        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+        "\n",
+        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+        "\n",
+        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "image_captioning_with_attention.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+          "timestamp": 1530222436922
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
     }
-   ],
-   "toc_visible": true,
-   "version": "0.3.2",
-   "views": {}
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index bda9e77085e45ae31a228142135425e22a1c6780..c945c753b3ba36d16aa6985d23a5849f8f552304 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -13,633 +13,13 @@
         "\n",
         "# Text Generation using a RNN\n",
         "\n",
+        "This example has moved.\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on Github\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "BwpJ5IffzRG6"
-      },
-      "source": [
-        "This notebook demonstrates how to generate text using an RNN using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). If you like, you can write a similar [model](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/8.1-text-generation-with-lstm.ipynb) using less code. Here, we show a lower-level impementation that's useful to understand as prework before diving in to deeper examples in a similar, like [Neural Machine Translation with Attention](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "This notebook is an end-to-end example. When you run it, it will download a dataset of Shakespeare's writing. We'll use a collection of plays, borrowed from Andrej Karpathy's excellent [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).  The notebook will train a model, and use it to generate sample output.\n",
-        "  \n",
-        "Here is the output(with start string='w') after training a single layer GRU for 30 epochs with the default settings below:\n",
-        "\n",
-        "```\n",
-        "were to the death of him\n",
-        "And nothing of the field in the view of hell,\n",
-        "When I said, banish him, I will not burn thee that would live.\n",
-        "\n",
-        "HENRY BOLINGBROKE:\n",
-        "My gracious uncle--\n",
-        "\n",
-        "DUKE OF YORK:\n",
-        "As much disgraced to the court, the gods them speak,\n",
-        "And now in peace himself excuse thee in the world.\n",
-        "\n",
-        "HORTENSIO:\n",
-        "Madam, 'tis not the cause of the counterfeit of the earth,\n",
-        "And leave me to the sun that set them on the earth\n",
-        "And leave the world and are revenged for thee.\n",
-        "\n",
-        "GLOUCESTER:\n",
-        "I would they were talking with the very name of means\n",
-        "To make a puppet of a guest, and therefore, good Grumio,\n",
-        "Nor arm'd to prison, o' the clouds, of the whole field,\n",
-        "With the admire\n",
-        "With the feeding of thy chair, and we have heard it so,\n",
-        "I thank you, sir, he is a visor friendship with your silly your bed.\n",
-        "\n",
-        "SAMPSON:\n",
-        "I do desire to live, I pray: some stand of the minds, make thee remedies\n",
-        "With the enemies of my soul.\n",
-        "\n",
-        "MENENIUS:\n",
-        "I'll keep the cause of my mistress.\n",
-        "\n",
-        "POLIXENES:\n",
-        "My brother Marcius!\n",
-        "\n",
-        "Second Servant:\n",
-        "Will't ple\n",
-        "```\n",
-        "\n",
-        "Of course, while some of the sentences are grammatical, most do not make sense. But, consider:\n",
-        "\n",
-        "* Our model is character based (when we began training, it did not yet know how to spell a valid English word, or that words were even a unit of text).\n",
-        "\n",
-        "* The structure of the output resembles a play (blocks begin with a speaker name, in all caps similar to the original text). Sentences generally end with a period. If you look at the text from a distance (or don't read the invididual words too closely, it appears as if it's an excerpt from a play).\n",
-        "\n",
-        "As a next step, you can experiment training the model on a different dataset - any large text file(ASCII) will do, and you can modify a single line of code below to make that change. Have fun!\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "R3p22DBDsaCA"
-      },
-      "source": [
-        "## Install unidecode library\n",
-        "A helpful library to convert unicode to ASCII."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "wZ6LOM12wKGH"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install unidecode"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "WGyKZj3bzf9p"
-      },
-      "source": [
-        "## Import tensorflow and enable eager execution."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "yG_n40gFzf9s"
-      },
-      "outputs": [],
-      "source": [
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "# Note: Once you enable eager execution, it cannot be disabled. \n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import re\n",
-        "import random\n",
-        "import unidecode\n",
-        "import time"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "EHDoRoc5PKWz"
-      },
-      "source": [
-        "## Download the dataset\n",
-        "\n",
-        "In this example, we will use the [shakespeare dataset](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt). You can use any other dataset that you like.\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "pD_55cOxLkAb"
-      },
-      "outputs": [],
-      "source": [
-        "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "UHjdCjDuSvX_"
-      },
-      "source": [
-        "## Read the dataset\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "-E5JvY3wzf94"
-      },
-      "outputs": [],
-      "source": [
-        "text = unidecode.unidecode(open(path_to_file).read())\n",
-        "# length of text is the number of characters in it\n",
-        "print (len(text))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Il9ww98izf-D"
-      },
-      "source": [
-        "Creating dictionaries to map from characters to their indices and vice-versa, which will be used to vectorize the inputs"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "IalZLbvOzf-F"
-      },
-      "outputs": [],
-      "source": [
-        "# unique contains all the unique characters in the file\n",
-        "unique = sorted(set(text))\n",
-        "\n",
-        "# creating a mapping from unique characters to indices\n",
-        "char2idx = {u:i for i, u in enumerate(unique)}\n",
-        "idx2char = {i:u for i, u in enumerate(unique)}"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "1v_qUYfAzf-I"
-      },
-      "outputs": [],
-      "source": [
-        "# setting the maximum length sentence we want for a single input in characters\n",
-        "max_length = 100\n",
-        "\n",
-        "# length of the vocabulary in chars\n",
-        "vocab_size = len(unique)\n",
-        "\n",
-        "# the embedding dimension \n",
-        "embedding_dim = 256\n",
-        "\n",
-        "# number of RNN (here GRU) units\n",
-        "units = 1024\n",
-        "\n",
-        "# batch size \n",
-        "BATCH_SIZE = 64\n",
-        "\n",
-        "# buffer size to shuffle our dataset\n",
-        "BUFFER_SIZE = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "LFjSVAlWzf-N"
-      },
-      "source": [
-        "## Creating the input and output tensors\n",
-        "\n",
-        "Vectorizing the input and the target text because our model cannot understand strings only numbers.\n",
-        "\n",
-        "But first, we need to create the input and output vectors.\n",
-        "Remember the max_length we set above, we will use it here. We are creating **max_length** chunks of input, where each input vector is all the characters in that chunk except the last and the target vector is all the characters in that chunk except the first.\n",
-        "\n",
-        "For example, consider that the string = 'tensorflow' and the max_length is 9\n",
-        "\n",
-        "So, the `input = 'tensorflo'` and `output = 'ensorflow'`\n",
-        "\n",
-        "After creating the vectors, we convert each character into numbers using the **char2idx** dictionary we created above."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "0UHJDA39zf-O"
-      },
-      "outputs": [],
-      "source": [
-        "input_text = []\n",
-        "target_text = []\n",
-        "\n",
-        "for f in range(0, len(text)-max_length, max_length):\n",
-        "    inps = text[f:f+max_length]\n",
-        "    targ = text[f+1:f+1+max_length]\n",
-        "\n",
-        "    input_text.append([char2idx[i] for i in inps])\n",
-        "    target_text.append([char2idx[t] for t in targ])\n",
-        "    \n",
-        "print (np.array(input_text).shape)\n",
-        "print (np.array(target_text).shape)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "MJdfPmdqzf-R"
-      },
-      "source": [
-        "## Creating batches and shuffling them using tf.data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "p2pGotuNzf-S"
-      },
-      "outputs": [],
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
-        "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "m8gPwEjRzf-Z"
-      },
-      "source": [
-        "## Creating the model\n",
-        "\n",
-        "We use the Model Subclassing API which gives us full flexibility to create the model and change it however we like. We use 3 layers to define our model.\n",
-        "\n",
-        "* Embedding layer\n",
-        "* GRU layer (you can use an LSTM layer here)\n",
-        "* Fully connected layer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "P3KTiiInzf-a"
-      },
-      "outputs": [],
-      "source": [
-        "class Model(tf.keras.Model):\n",
-        "  def __init__(self, vocab_size, embedding_dim, units, batch_size):\n",
-        "    super(Model, self).__init__()\n",
-        "    self.units = units\n",
-        "    self.batch_sz = batch_size\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "\n",
-        "    if tf.test.is_gpu_available():\n",
-        "      self.gru = tf.keras.layers.CuDNNGRU(self.units, \n",
-        "                                          return_sequences=True, \n",
-        "                                          return_state=True, \n",
-        "                                          recurrent_initializer='glorot_uniform')\n",
-        "    else:\n",
-        "      self.gru = tf.keras.layers.GRU(self.units, \n",
-        "                                     return_sequences=True, \n",
-        "                                     return_state=True, \n",
-        "                                     recurrent_activation='sigmoid', \n",
-        "                                     recurrent_initializer='glorot_uniform')\n",
-        "\n",
-        "    self.fc = tf.keras.layers.Dense(vocab_size)\n",
-        "        \n",
-        "  def call(self, x, hidden):\n",
-        "    x = self.embedding(x)\n",
-        "\n",
-        "    # output shape == (batch_size, max_length, hidden_size) \n",
-        "    # states shape == (batch_size, hidden_size)\n",
-        "\n",
-        "    # states variable to preserve the state of the model\n",
-        "    # this will be used to pass at every step to the model while training\n",
-        "    output, states = self.gru(x, initial_state=hidden)\n",
-        "\n",
-        "\n",
-        "    # reshaping the output so that we can pass it to the Dense layer\n",
-        "    # after reshaping the shape is (batch_size * max_length, hidden_size)\n",
-        "    output = tf.reshape(output, (-1, output.shape[2]))\n",
-        "\n",
-        "    # The dense layer will output predictions for every time_steps(max_length)\n",
-        "    # output shape after the dense layer == (max_length * batch_size, vocab_size)\n",
-        "    x = self.fc(output)\n",
-        "\n",
-        "    return x, states"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "trpqTWyvk0nr"
-      },
-      "source": [
-        "## Call the model and set the optimizer and the loss function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "7t2XrzEOzf-e"
-      },
-      "outputs": [],
-      "source": [
-        "model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "dkjWIATszf-h"
-      },
-      "outputs": [],
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors\n",
-        "def loss_function(real, preds):\n",
-        "    return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "3K6s6F79P7za"
-      },
-      "source": [
-        "## Checkpoints (Object-based saving)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "oAGisDdfP9rL"
-      },
-      "outputs": [],
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
-        "                                 model=model)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "lPrP0XMUzf-p"
-      },
-      "source": [
-        "## Train the model\n",
-        "\n",
-        "Here we will use a custom training loop with the help of GradientTape()\n",
-        "\n",
-        "* We initialize the hidden state of the model with zeros and shape == (batch_size, number of rnn units). We do this by calling the function defined while creating the model.\n",
-        "\n",
-        "* Next, we iterate over the dataset(batch by batch) and calculate the **predictions and the hidden states** associated with that input.\n",
-        "\n",
-        "* There are a lot of interesting things happening here.\n",
-        "  * The model gets hidden state(initialized with 0), lets call that **H0** and the first batch of input, lets call that **I0**.\n",
-        "  * The model then returns the predictions **P1** and **H1**.\n",
-        "  * For the next batch of input, the model receives **I1** and **H1**.\n",
-        "  * The interesting thing here is that we pass **H1** to the model with **I1** which is how the model learns. The context learned from batch to batch is contained in the **hidden state**.\n",
-        "  * We continue doing this until the dataset is exhausted and then we start a new epoch and repeat this.\n",
-        "\n",
-        "* After calculating the predictions, we calculate the **loss** using the loss function defined above. Then we calculate the gradients of the loss with respect to the model variables(input)\n",
-        "\n",
-        "* Finally, we take a step in that direction with the help of the optimizer using the apply_gradients function.\n",
-        "\n",
-        "Note:- If you are running this notebook in Colab which has a **Tesla K80 GPU** it takes about 23 seconds per epoch.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "d4tSNwymzf-q"
-      },
-      "outputs": [],
-      "source": [
-        "# Training step\n",
-        "\n",
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    # initializing the hidden state at the start of every epoch\n",
-        "    hidden = model.reset_states()\n",
-        "    \n",
-        "    for (batch, (inp, target)) in enumerate(dataset):\n",
-        "          with tf.GradientTape() as tape:\n",
-        "              # feeding the hidden state back into the model\n",
-        "              # This is the interesting step\n",
-        "              predictions, hidden = model(inp, hidden)\n",
-        "              \n",
-        "              # reshaping the target because that's how the \n",
-        "              # loss function expects it\n",
-        "              target = tf.reshape(target, (-1,))\n",
-        "              loss = loss_function(target, predictions)\n",
-        "              \n",
-        "          grads = tape.gradient(loss, model.variables)\n",
-        "          optimizer.apply_gradients(zip(grads, model.variables))\n",
-        "\n",
-        "          if batch % 100 == 0:\n",
-        "              print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,\n",
-        "                                                            batch,\n",
-        "                                                            loss))\n",
-        "    # saving (checkpoint) the model every 5 epochs\n",
-        "    if (epoch + 1) % 5 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "\n",
-        "    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
-        "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "01AR9vpNQMFF"
-      },
-      "source": [
-        "## Restore the latest checkpoint"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "tyvpYomYQQkF"
-      },
-      "outputs": [],
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "DjGz1tDkzf-u"
-      },
-      "source": [
-        "## Predicting using our trained model\n",
-        "\n",
-        "The below code block is used to generated the text\n",
-        "\n",
-        "* We start by choosing a start string and initializing the hidden state and setting the number of characters we want to generate.\n",
-        "\n",
-        "* We get predictions using the start_string and the hidden state\n",
-        "\n",
-        "* Then we use argmax to calculate the index of the predicted word. **We use this predicted word as our next input to the model**\n",
-        "\n",
-        "* **The hidden state returned by the model is fed back into the model so that it now has more context rather than just one word.** After we predict the next word, the modified hidden states are again fed back into the model, which is how it learns as it gets more context from the previously predicted words.\n",
-        "\n",
-        "* If you see the predictions, the model knows when to capitalize, make paragraphs and the text follows a shakespeare style of writing which is pretty awesome!"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "WvuwZBX5Ogfd"
-      },
-      "outputs": [],
-      "source": [
-        "# Evaluation step(generating text using the model learned)\n",
-        "\n",
-        "# number of characters to generate\n",
-        "num_generate = 1000\n",
-        "\n",
-        "# You can change the start string to experiment\n",
-        "start_string = 'Q'\n",
-        "# converting our start string to numbers(vectorizing!) \n",
-        "input_eval = [char2idx[s] for s in start_string]\n",
-        "input_eval = tf.expand_dims(input_eval, 0)\n",
-        "\n",
-        "# empty string to store our results\n",
-        "text_generated = ''\n",
-        "\n",
-        "# hidden state shape == (batch_size, number of rnn units); here batch size == 1\n",
-        "hidden = [tf.zeros((1, units))]\n",
-        "for i in range(num_generate):\n",
-        "    predictions, hidden = model(input_eval, hidden)\n",
-        "\n",
-        "    # using argmax to predict the word returned by the model\n",
-        "    predicted_id = tf.argmax(predictions[-1]).numpy()\n",
-        "    \n",
-        "    # We pass the predicted word as the next input to the model\n",
-        "    # along with the previous hidden state\n",
-        "    input_eval = tf.expand_dims([predicted_id], 0)\n",
-        "    \n",
-        "    text_generated += idx2char[predicted_id]\n",
-        "\n",
-        "print (start_string + text_generated)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "AM2Uma_-yVIq"
-      },
-      "source": [
-        "## Next steps\n",
-        "\n",
-        "* Change the start string to a different character, or the start of a sentence.\n",
-        "* Experiment with training on a different, or with different parameters. [Project  Gutenberg](http://www.gutenberg.org/ebooks/100), for example, contains a large collection of books.\n",
-        "* Add another RNN layer.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "gtEd86sX5cB2"
-      },
-      "outputs": [],
-      "source": [
-        ""
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     }
   ],
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
index 78548c51c90912ee5094aa88904d818916bcd688..35d509904211d98f124d2555fc48166e75cb0dd9 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 cuda_py_test(
     name = "l2hmc_test",
-    size = "large",
+    size = "medium",
     srcs = ["l2hmc_test.py"],
     additional_deps = [
         ":l2hmc",
@@ -36,6 +36,7 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
     tags = [
         "oss_serial",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index 6a178ddcecc6b271fa3f074cb6768165b1d1177d..30afef83bc5c6c164c8456ed472f4d6064068a25 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "linear_regression",
     srcs = ["linear_regression.py"],
     srcs_version = "PY2AND3",
+    deps = [":linear_regression_lib"],
+)
+
+py_library(
+    name = "linear_regression_lib",
+    srcs = ["linear_regression.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,7 +27,7 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
     tags = [
@@ -34,7 +41,7 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_graph_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 099b712fc06d1d3eb9ab4095f8db7283690bda76..206ef9409df7b1dc21de42ba919d2ba97f334a8c 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -56,7 +56,7 @@ class LinearModel(tf.keras.Model):
 
 
 def mean_square_loss(model, xs, ys):
-  return tf.reduce_mean(tf.square(tf.subtract(model(xs), ys)))
+  return tf.reduce_mean(tf.squared_difference(model(xs), ys))
 
 
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 66d52a74943d0d81fde05ce51b019558b327978d..512605a17eb77a85a5ec98197f4ed8fda6863932 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -1,11 +1,28 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "nmt_with_attention.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "private_outputs": true,
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
   "cells": [
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "AOpGoE2T-YXS"
       },
+      "cell_type": "markdown",
       "source": [
         "##### Copyright 2018 The TensorFlow Authors.\n",
         "\n",
@@ -13,19 +30,19 @@
         "\n",
         "# Neural Machine Translation with Attention\n",
         "\n",
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\n",
-        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
-        "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n",
+        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+        "</td><td>\n",
+        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "CiwtNgENbx2g"
       },
+      "cell_type": "markdown",
       "source": [
         "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n",
         "\n",
@@ -33,24 +50,22 @@
         "\n",
         "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n",
         "\n",
-        "\u003cimg src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\"\u003e\n",
+        "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n",
         "\n",
         "Note: This example takes approximately 10 mintues to run on a single P100 GPU."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "tnxXKDjq3jEL"
+        "id": "tnxXKDjq3jEL",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "from __future__ import absolute_import, division, print_function\n",
         "\n",
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
+        "# Import TensorFlow >= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "tf.enable_eager_execution()\n",
@@ -65,14 +80,16 @@
         "import time\n",
         "\n",
         "print(tf.__version__)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "wfodePkj3jEa"
       },
+      "cell_type": "markdown",
       "source": [
         "## Download and prepare the dataset\n",
         "\n",
@@ -91,14 +108,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "kRVATYOgJs1b"
+        "id": "kRVATYOgJs1b",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Download the file\n",
         "path_to_zip = tf.keras.utils.get_file(\n",
@@ -106,17 +121,17 @@
         "    extract=True)\n",
         "\n",
         "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "rd0jw-eC3jEh"
+        "id": "rd0jw-eC3jEh",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Converts the unicode file to ascii\n",
         "def unicode_to_ascii(s):\n",
@@ -128,7 +143,7 @@
         "    w = unicode_to_ascii(w.lower().strip())\n",
         "    \n",
         "    # creating a space between a word and the punctuation following it\n",
-        "    # eg: \"he is a boy.\" =\u003e \"he is a boy .\" \n",
+        "    # eg: \"he is a boy.\" => \"he is a boy .\" \n",
         "    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
         "    w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
         "    w = re.sub(r'[\" \"]+', \" \", w)\n",
@@ -140,19 +155,19 @@
         "    \n",
         "    # adding a start and an end token to the sentence\n",
         "    # so that the model know when to start and stop predicting.\n",
-        "    w = '\u003cstart\u003e ' + w + ' \u003cend\u003e'\n",
+        "    w = '<start> ' + w + ' <end>'\n",
         "    return w"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "OHn4Dct23jEm"
+        "id": "OHn4Dct23jEm",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# 1. Remove the accents\n",
         "# 2. Clean the sentences\n",
@@ -163,20 +178,20 @@
         "    word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')]  for l in lines[:num_examples]]\n",
         "    \n",
         "    return word_pairs"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "9xbqO7Iie9bb"
+        "id": "9xbqO7Iie9bb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "# This class creates a word -\u003e index mapping (e.g,. \"dad\" -\u003e 5) and vice-versa \n",
-        "# (e.g., 5 -\u003e \"dad\") for each language,\n",
+        "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n",
+        "# (e.g., 5 -> \"dad\") for each language,\n",
         "class LanguageIndex():\n",
         "  def __init__(self, lang):\n",
         "    self.lang = lang\n",
@@ -192,23 +207,23 @@
         "    \n",
         "    self.vocab = sorted(self.vocab)\n",
         "    \n",
-        "    self.word2idx['\u003cpad\u003e'] = 0\n",
+        "    self.word2idx['<pad>'] = 0\n",
         "    for index, word in enumerate(self.vocab):\n",
         "      self.word2idx[word] = index + 1\n",
         "    \n",
         "    for word, index in self.word2idx.items():\n",
         "      self.idx2word[index] = word"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "eAY9k49G3jE_"
+        "id": "eAY9k49G3jE_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def max_length(tensor):\n",
         "    return max(len(t) for t in tensor)\n",
@@ -244,71 +259,71 @@
         "                                                                  padding='post')\n",
         "    \n",
         "    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "GOi42V79Ydlr"
       },
+      "cell_type": "markdown",
       "source": [
         "### Limit the size of the dataset to experiment faster (optional)\n",
         "\n",
-        "Training on the complete dataset of \u003e100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
+        "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "cnxC7q-j3jFD"
+        "id": "cnxC7q-j3jFD",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Try experimenting with the size of that dataset\n",
         "num_examples = 30000\n",
         "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "4QILQkOs3jFG"
+        "id": "4QILQkOs3jFG",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Creating training and validation sets using an 80-20 split\n",
         "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
         "\n",
         "# Show length\n",
         "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "rgCLkfv5uO3d"
       },
+      "cell_type": "markdown",
       "source": [
         "### Create a tf.data dataset"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "TqHsArVZ3jFS"
+        "id": "TqHsArVZ3jFS",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "BUFFER_SIZE = len(input_tensor_train)\n",
         "BATCH_SIZE = 64\n",
@@ -320,27 +335,29 @@
         "\n",
         "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
         "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "TNfHIF71ulLu"
       },
+      "cell_type": "markdown",
       "source": [
         "## Write the encoder and decoder model\n",
         "\n",
-        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
+        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://github.com/tensorflow/nmt). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://github.com/tensorflow/nmt#background-on-the-attention-mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n",
         "\n",
         "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n",
         "\n",
         "Here are the equations that are implemented:\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\"\u003e\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n",
         "\n",
         "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n",
         "\n",
@@ -362,14 +379,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "avyJ_4VIUoHb"
+        "id": "avyJ_4VIUoHb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def gru(units):\n",
         "  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n",
@@ -385,17 +400,17 @@
         "                               return_state=True, \n",
         "                               recurrent_activation='sigmoid', \n",
         "                               recurrent_initializer='glorot_uniform')"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "nZ2rI24i3jFg"
+        "id": "nZ2rI24i3jFg",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Encoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
@@ -412,17 +427,17 @@
         "    \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.enc_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "yJ_B3mhW3jFk"
+        "id": "yJ_B3mhW3jFk",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Decoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n",
@@ -476,41 +491,41 @@
         "        \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.dec_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "P5UY8wko3jFp"
+        "id": "P5UY8wko3jFp",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
         "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "_ch_71VbIRfK"
       },
+      "cell_type": "markdown",
       "source": [
         "## Define the optimizer and the loss function"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WmTHr5iV3jFr"
+        "id": "WmTHr5iV3jFr",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "optimizer = tf.train.AdamOptimizer()\n",
         "\n",
@@ -519,41 +534,43 @@
         "  mask = 1 - np.equal(real, 0)\n",
         "  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
         "  return tf.reduce_mean(loss_)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "DMVWzzsfNl4e"
       },
+      "cell_type": "markdown",
       "source": [
         "## Checkpoints (Object-based saving)"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "Zj8bXQTgNwrF"
+        "id": "Zj8bXQTgNwrF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "checkpoint_dir = './training_checkpoints'\n",
         "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
         "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
         "                                 encoder=encoder,\n",
         "                                 decoder=decoder)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "hpObfY22IddU"
       },
+      "cell_type": "markdown",
       "source": [
         "## Training\n",
         "\n",
@@ -567,14 +584,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "ddefjBMa3jF0"
+        "id": "ddefjBMa3jF0",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "EPOCHS = 10\n",
         "\n",
@@ -592,7 +607,7 @@
         "            \n",
         "            dec_hidden = enc_hidden\n",
         "            \n",
-        "            dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']] * BATCH_SIZE, 1)       \n",
+        "            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       \n",
         "            \n",
         "            # Teacher forcing - feeding the target as the next input\n",
         "            for t in range(1, targ.shape[1]):\n",
@@ -625,14 +640,16 @@
         "    print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
         "                                        total_loss / N_BATCH))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "mU3Ce8M6I3rz"
       },
+      "cell_type": "markdown",
       "source": [
         "## Translate\n",
         "\n",
@@ -644,14 +661,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "EbQpyYs13jF_"
+        "id": "EbQpyYs13jF_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    attention_plot = np.zeros((max_length_targ, max_length_inp))\n",
@@ -668,12 +683,12 @@
         "    enc_out, enc_hidden = encoder(inputs, hidden)\n",
         "\n",
         "    dec_hidden = enc_hidden\n",
-        "    dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']], 0)\n",
+        "    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n",
         "\n",
         "    for t in range(max_length_targ):\n",
         "        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n",
         "        \n",
-        "        # storing the attention weigths to plot later on\n",
+        "        # storing the attention weights to plot later on\n",
         "        attention_weights = tf.reshape(attention_weights, (-1, ))\n",
         "        attention_plot[t] = attention_weights.numpy()\n",
         "\n",
@@ -681,24 +696,24 @@
         "\n",
         "        result += targ_lang.idx2word[predicted_id] + ' '\n",
         "\n",
-        "        if targ_lang.idx2word[predicted_id] == '\u003cend\u003e':\n",
+        "        if targ_lang.idx2word[predicted_id] == '<end>':\n",
         "            return result, sentence, attention_plot\n",
         "        \n",
         "        # the predicted ID is fed back into the model\n",
         "        dec_input = tf.expand_dims([predicted_id], 0)\n",
         "\n",
         "    return result, sentence, attention_plot"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "s5hQWlbN3jGF"
+        "id": "s5hQWlbN3jGF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# function for plotting the attention weights\n",
         "def plot_attention(attention, sentence, predicted_sentence):\n",
@@ -712,17 +727,17 @@
         "    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n",
         "\n",
         "    plt.show()"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "sl9zUHzg3jGI"
+        "id": "sl9zUHzg3jGI",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n",
@@ -732,91 +747,93 @@
         "    \n",
         "    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n",
         "    plot_attention(attention_plot, sentence.split(' '), result.split(' '))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "n250XbnjOaqP"
       },
+      "cell_type": "markdown",
       "source": [
         "## Restore the latest checkpoint and test"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "UJpT9D5_OgP6"
+        "id": "UJpT9D5_OgP6",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# restoring the latest checkpoint in checkpoint_dir\n",
         "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WrAM0FDomq3E"
+        "id": "WrAM0FDomq3E",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "zSx2iM36EZQZ"
+        "id": "zSx2iM36EZQZ",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "A3LLCx3ZE0Ls"
+        "id": "A3LLCx3ZE0Ls",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "DUQVLVqUE1YW"
+        "id": "DUQVLVqUE1YW",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# wrong translation\n",
         "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "RTe5P5ioMJwN"
       },
+      "cell_type": "markdown",
       "source": [
         "## Next steps\n",
         "\n",
@@ -824,31 +841,5 @@
         "* Experiment with training on a larger dataset, or using more epochs\n"
       ]
     }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "nmt_with_attention.ipynb",
-      "private_outputs": true,
-      "provenance": [
-        {
-          "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
-          "timestamp": 1527858391290
-        },
-        {
-          "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
-          "timestamp": 1527776041613
-        }
-      ],
-      "toc_visible": true,
-      "version": "0.3.2"
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  ]
 }
diff --git a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
index d60ee18586196614c9c0f73fc88dfb8b758725ea..57bd18d7529d28e0914d6c32b93881336002b9a5 100644
--- a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
+++ b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
@@ -11,777 +11,17 @@
         "\n",
         "Licensed under the Apache License, Version 2.0 (the \"License\").\n",
         "\n",
-        "# Pix2Pix: An example with tf.keras and eager\n",
-        "\n",
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\n",
-        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
-        "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "ITZuApL56Mny"
-      },
-      "source": [
-        "This notebook demonstrates image to image translation using conditional GAN's, as described in [Image-to-Image Translation with Conditional Adversarial Networks](https://arxiv.org/abs/1611.07004). Using this technique we can colorize black and white photos, convert google maps to google earth, etc. Here, we convert building facades to real buildings. We use [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) to achieve this.\n",
-        "\n",
-        "In example, we will use the [CMP Facade Database](http://cmp.felk.cvut.cz/~tylecr1/facade/), helpfully provided by the [Center for Machine Perception](http://cmp.felk.cvut.cz/) at the [Czech Technical University in Prague](https://www.cvut.cz/). To keep our example short, we will use a preprocessed [copy](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/) of this dataset, created by the authors of the [paper](https://arxiv.org/abs/1611.07004) above.\n",
-        "\n",
-        "Each epoch takes around 58 seconds on a single P100 GPU.\n",
-        "\n",
-        "Below is the output generated after training the model for 200 epochs.\n",
-        "\n",
-        "\n",
-        "![sample output_1](https://www.tensorflow.org/images/gan/pix2pix_1.png)\n",
-        "![sample output_2](https://www.tensorflow.org/images/gan/pix2pix_2.png)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "source": [
-        "## Import TensorFlow and enable eager execution"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd"
-      },
-      "outputs": [],
-      "source": [
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import os\n",
-        "import time\n",
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt\n",
-        "import PIL\n",
-        "from IPython.display import clear_output"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "source": [
-        "## Load the dataset\n",
-        "\n",
-        "You can download this dataset and similar datasets from [here](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets). As mentioned in the [paper](https://arxiv.org/abs/1611.07004) we apply random jittering and mirroring to the training dataset.\n",
-        "* In random jittering, the image is resized to `286 x 286` and then randomly cropped to `256 x 256`\n",
-        "* In random mirroring, the image is randomly flipped horizontally i.e left to right."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "Kn-k8kTXuAlv"
-      },
-      "outputs": [],
-      "source": [
-        "path_to_zip = tf.keras.utils.get_file('facades.tar.gz',\n",
-        "                                      cache_subdir=os.path.abspath('.'),\n",
-        "                                      origin='https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/facades.tar.gz', \n",
-        "                                      extract=True)\n",
-        "\n",
-        "PATH = os.path.join(os.path.dirname(path_to_zip), 'facades/')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "2CbTEt448b4R"
-      },
-      "outputs": [],
-      "source": [
-        "BUFFER_SIZE = 400\n",
-        "BATCH_SIZE = 1\n",
-        "IMG_WIDTH = 256\n",
-        "IMG_HEIGHT = 256"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "tyaP4hLJ8b4W"
-      },
-      "outputs": [],
-      "source": [
-        "def load_image(image_file, is_train):\n",
-        "  image = tf.read_file(image_file)\n",
-        "  image = tf.image.decode_jpeg(image)\n",
-        "\n",
-        "  w = tf.shape(image)[1]\n",
-        "\n",
-        "  w = w // 2\n",
-        "  real_image = image[:, :w, :]\n",
-        "  input_image = image[:, w:, :]\n",
-        "\n",
-        "  input_image = tf.cast(input_image, tf.float32)\n",
-        "  real_image = tf.cast(real_image, tf.float32)\n",
-        "\n",
-        "  if is_train:\n",
-        "    # random jittering\n",
-        "    \n",
-        "    # resizing to 286 x 286 x 3\n",
-        "    input_image = tf.image.resize_images(input_image, [286, 286], \n",
-        "                                        align_corners=True, \n",
-        "                                        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n",
-        "    real_image = tf.image.resize_images(real_image, [286, 286], \n",
-        "                                        align_corners=True, \n",
-        "                                        method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)\n",
-        "    \n",
-        "    # randomly cropping to 256 x 256 x 3\n",
-        "    stacked_image = tf.stack([input_image, real_image], axis=0)\n",
-        "    cropped_image = tf.random_crop(stacked_image, size=[2, IMG_HEIGHT, IMG_WIDTH, 3])\n",
-        "    input_image, real_image = cropped_image[0], cropped_image[1]\n",
-        "\n",
-        "    if np.random.random() \u003e 0.5:\n",
-        "      # random mirroring\n",
-        "      input_image = tf.image.flip_left_right(input_image)\n",
-        "      real_image = tf.image.flip_left_right(real_image)\n",
-        "  else:\n",
-        "    input_image = tf.image.resize_images(input_image, size=[IMG_HEIGHT, IMG_WIDTH], \n",
-        "                                         align_corners=True, method=2)\n",
-        "    real_image = tf.image.resize_images(real_image, size=[IMG_HEIGHT, IMG_WIDTH], \n",
-        "                                        align_corners=True, method=2)\n",
-        "  \n",
-        "  # normalizing the images to [-1, 1]\n",
-        "  input_image = (input_image / 127.5) - 1\n",
-        "  real_image = (real_image / 127.5) - 1\n",
-        "\n",
-        "  return input_image, real_image"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "source": [
-        "## Use tf.data to create batches, map(do preprocessing) and shuffle the dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "SQHmYSmk8b4b"
-      },
-      "outputs": [],
-      "source": [
-        "train_dataset = tf.data.Dataset.list_files(PATH+'train/*.jpg')\n",
-        "train_dataset = train_dataset.shuffle(BUFFER_SIZE)\n",
-        "train_dataset = train_dataset.map(lambda x: load_image(x, True))\n",
-        "train_dataset = train_dataset.batch(1)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "MS9J0yA58b4g"
-      },
-      "outputs": [],
-      "source": [
-        "test_dataset = tf.data.Dataset.list_files(PATH+'test/*.jpg')\n",
-        "test_dataset = test_dataset.map(lambda x: load_image(x, False))\n",
-        "test_dataset = test_dataset.batch(1)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "source": [
-        "## Write the generator and discriminator models\n",
-        "\n",
-        "* **Generator** \n",
-        "  * The architecture of generator is a modified U-Net.\n",
-        "  * Each block in the encoder is (Conv -\u003e Batchnorm -\u003e Leaky ReLU)\n",
-        "  * Each block in the decoder is (Transposed Conv -\u003e Batchnorm -\u003e Dropout(applied to the first 3 blocks) -\u003e ReLU)\n",
-        "  * There are skip connections between the encoder and decoder (as in U-Net).\n",
-        "  \n",
-        "* **Discriminator**\n",
-        "  * The Discriminator is a PatchGAN.\n",
-        "  * Each block in the discriminator is (Conv -\u003e BatchNorm -\u003e Leaky ReLU)\n",
-        "  * The shape of the output after the last layer is (batch_size, 30, 30, 1)\n",
-        "  * Each 30x30 patch of the output classifies a 70x70 portion of the input image (such an architecture is called a PatchGAN).\n",
-        "  * Discriminator receives 2 inputs.\n",
-        "    * Input image and the target image, which it should classify as real.\n",
-        "    * Input image and the generated image (output of generator), which it should classify as fake. \n",
-        "    * We concatenate these 2 inputs together in the code (`tf.concat([inp, tar], axis=-1)`)\n",
-        "\n",
-        "* Shape of the input travelling through the generator and the discriminator is in the comments in the code.\n",
-        "\n",
-        "To learn more about the architecture and the hyperparameters you can refer the [paper](https://arxiv.org/abs/1611.07004).\n",
-        "    "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "tqqvWxlw8b4l"
-      },
-      "outputs": [],
-      "source": [
-        "OUTPUT_CHANNELS = 3"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "lFPI4Nu-8b4q"
-      },
-      "outputs": [],
-      "source": [
-        "class Downsample(tf.keras.Model):\n",
-        "    \n",
-        "  def __init__(self, filters, size, apply_batchnorm=True):\n",
-        "    super(Downsample, self).__init__()\n",
-        "    self.apply_batchnorm = apply_batchnorm\n",
-        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
-        "\n",
-        "    self.conv1 = tf.keras.layers.Conv2D(filters, \n",
-        "                                        (size, size), \n",
-        "                                        strides=2, \n",
-        "                                        padding='same',\n",
-        "                                        kernel_initializer=initializer,\n",
-        "                                        use_bias=False)\n",
-        "    if self.apply_batchnorm:\n",
-        "        self.batchnorm = tf.keras.layers.BatchNormalization()\n",
-        "  \n",
-        "  def call(self, x, training):\n",
-        "    x = self.conv1(x)\n",
-        "    if self.apply_batchnorm:\n",
-        "        x = self.batchnorm(x, training=training)\n",
-        "    x = tf.nn.leaky_relu(x)\n",
-        "    return x \n",
-        "\n",
-        "\n",
-        "class Upsample(tf.keras.Model):\n",
-        "    \n",
-        "  def __init__(self, filters, size, apply_dropout=False):\n",
-        "    super(Upsample, self).__init__()\n",
-        "    self.apply_dropout = apply_dropout\n",
-        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
-        "\n",
-        "    self.up_conv = tf.keras.layers.Conv2DTranspose(filters, \n",
-        "                                                   (size, size), \n",
-        "                                                   strides=2, \n",
-        "                                                   padding='same',\n",
-        "                                                   kernel_initializer=initializer,\n",
-        "                                                   use_bias=False)\n",
-        "    self.batchnorm = tf.keras.layers.BatchNormalization()\n",
-        "    if self.apply_dropout:\n",
-        "        self.dropout = tf.keras.layers.Dropout(0.5)\n",
-        "\n",
-        "  def call(self, x1, x2, training):\n",
-        "    x = self.up_conv(x1)\n",
-        "    x = self.batchnorm(x, training=training)\n",
-        "    if self.apply_dropout:\n",
-        "        x = self.dropout(x, training=training)\n",
-        "    x = tf.nn.relu(x)\n",
-        "    x = tf.concat([x, x2], axis=-1)\n",
-        "    return x\n",
-        "\n",
-        "\n",
-        "class Generator(tf.keras.Model):\n",
-        "    \n",
-        "  def __init__(self):\n",
-        "    super(Generator, self).__init__()\n",
-        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
-        "    \n",
-        "    self.down1 = Downsample(64, 4, apply_batchnorm=False)\n",
-        "    self.down2 = Downsample(128, 4)\n",
-        "    self.down3 = Downsample(256, 4)\n",
-        "    self.down4 = Downsample(512, 4)\n",
-        "    self.down5 = Downsample(512, 4)\n",
-        "    self.down6 = Downsample(512, 4)\n",
-        "    self.down7 = Downsample(512, 4)\n",
-        "    self.down8 = Downsample(512, 4)\n",
-        "\n",
-        "    self.up1 = Upsample(512, 4, apply_dropout=True)\n",
-        "    self.up2 = Upsample(512, 4, apply_dropout=True)\n",
-        "    self.up3 = Upsample(512, 4, apply_dropout=True)\n",
-        "    self.up4 = Upsample(512, 4)\n",
-        "    self.up5 = Upsample(256, 4)\n",
-        "    self.up6 = Upsample(128, 4)\n",
-        "    self.up7 = Upsample(64, 4)\n",
-        "\n",
-        "    self.last = tf.keras.layers.Conv2DTranspose(OUTPUT_CHANNELS, \n",
-        "                                                (4, 4), \n",
-        "                                                strides=2, \n",
-        "                                                padding='same',\n",
-        "                                                kernel_initializer=initializer)\n",
-        "  \n",
-        "  @tf.contrib.eager.defun\n",
-        "  def call(self, x, training):\n",
-        "    # x shape == (bs, 256, 256, 3)    \n",
-        "    x1 = self.down1(x, training=training) # (bs, 128, 128, 64)\n",
-        "    x2 = self.down2(x1, training=training) # (bs, 64, 64, 128)\n",
-        "    x3 = self.down3(x2, training=training) # (bs, 32, 32, 256)\n",
-        "    x4 = self.down4(x3, training=training) # (bs, 16, 16, 512)\n",
-        "    x5 = self.down5(x4, training=training) # (bs, 8, 8, 512)\n",
-        "    x6 = self.down6(x5, training=training) # (bs, 4, 4, 512)\n",
-        "    x7 = self.down7(x6, training=training) # (bs, 2, 2, 512)\n",
-        "    x8 = self.down8(x7, training=training) # (bs, 1, 1, 512)\n",
-        "\n",
-        "    x9 = self.up1(x8, x7, training=training) # (bs, 2, 2, 1024)\n",
-        "    x10 = self.up2(x9, x6, training=training) # (bs, 4, 4, 1024)\n",
-        "    x11 = self.up3(x10, x5, training=training) # (bs, 8, 8, 1024)\n",
-        "    x12 = self.up4(x11, x4, training=training) # (bs, 16, 16, 1024)\n",
-        "    x13 = self.up5(x12, x3, training=training) # (bs, 32, 32, 512)\n",
-        "    x14 = self.up6(x13, x2, training=training) # (bs, 64, 64, 256)\n",
-        "    x15 = self.up7(x14, x1, training=training) # (bs, 128, 128, 128)\n",
-        "\n",
-        "    x16 = self.last(x15) # (bs, 256, 256, 3)\n",
-        "    x16 = tf.nn.tanh(x16)\n",
-        "\n",
-        "    return x16"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "ll6aNeQx8b4v"
-      },
-      "outputs": [],
-      "source": [
-        "class DiscDownsample(tf.keras.Model):\n",
-        "    \n",
-        "  def __init__(self, filters, size, apply_batchnorm=True):\n",
-        "    super(DiscDownsample, self).__init__()\n",
-        "    self.apply_batchnorm = apply_batchnorm\n",
-        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
-        "\n",
-        "    self.conv1 = tf.keras.layers.Conv2D(filters, \n",
-        "                                        (size, size), \n",
-        "                                        strides=2, \n",
-        "                                        padding='same',\n",
-        "                                        kernel_initializer=initializer,\n",
-        "                                        use_bias=False)\n",
-        "    if self.apply_batchnorm:\n",
-        "        self.batchnorm = tf.keras.layers.BatchNormalization()\n",
-        "  \n",
-        "  def call(self, x, training):\n",
-        "    x = self.conv1(x)\n",
-        "    if self.apply_batchnorm:\n",
-        "        x = self.batchnorm(x, training=training)\n",
-        "    x = tf.nn.leaky_relu(x)\n",
-        "    return x \n",
-        "\n",
-        "class Discriminator(tf.keras.Model):\n",
-        "    \n",
-        "  def __init__(self):\n",
-        "    super(Discriminator, self).__init__()\n",
-        "    initializer = tf.random_normal_initializer(0., 0.02)\n",
-        "    \n",
-        "    self.down1 = DiscDownsample(64, 4, False)\n",
-        "    self.down2 = DiscDownsample(128, 4)\n",
-        "    self.down3 = DiscDownsample(256, 4)\n",
-        "    \n",
-        "    # we are zero padding here with 1 because we need our shape to \n",
-        "    # go from (batch_size, 32, 32, 256) to (batch_size, 31, 31, 512)\n",
-        "    self.zero_pad1 = tf.keras.layers.ZeroPadding2D()\n",
-        "    self.conv = tf.keras.layers.Conv2D(512, \n",
-        "                                       (4, 4), \n",
-        "                                       strides=1, \n",
-        "                                       kernel_initializer=initializer, \n",
-        "                                       use_bias=False)\n",
-        "    self.batchnorm1 = tf.keras.layers.BatchNormalization()\n",
-        "    \n",
-        "    # shape change from (batch_size, 31, 31, 512) to (batch_size, 30, 30, 1)\n",
-        "    self.zero_pad2 = tf.keras.layers.ZeroPadding2D()\n",
-        "    self.last = tf.keras.layers.Conv2D(1, \n",
-        "                                       (4, 4), \n",
-        "                                       strides=1,\n",
-        "                                       kernel_initializer=initializer)\n",
-        "  \n",
-        "  @tf.contrib.eager.defun\n",
-        "  def call(self, inp, tar, training):\n",
-        "    # concatenating the input and the target\n",
-        "    x = tf.concat([inp, tar], axis=-1) # (bs, 256, 256, channels*2)\n",
-        "    x = self.down1(x, training=training) # (bs, 128, 128, 64)\n",
-        "    x = self.down2(x, training=training) # (bs, 64, 64, 128)\n",
-        "    x = self.down3(x, training=training) # (bs, 32, 32, 256)\n",
-        "\n",
-        "    x = self.zero_pad1(x) # (bs, 34, 34, 256)\n",
-        "    x = self.conv(x)      # (bs, 31, 31, 512)\n",
-        "    x = self.batchnorm1(x, training=training)\n",
-        "    x = tf.nn.leaky_relu(x)\n",
-        "    \n",
-        "    x = self.zero_pad2(x) # (bs, 33, 33, 512)\n",
-        "    # don't add a sigmoid activation here since\n",
-        "    # the loss function expects raw logits.\n",
-        "    x = self.last(x)      # (bs, 30, 30, 1)\n",
-        "\n",
-        "    return x"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "gDkA05NE6QMs"
-      },
-      "outputs": [],
-      "source": [
-        "# The call function of Generator and Discriminator have been decorated\n",
-        "# with tf.contrib.eager.defun()\n",
-        "# We get a performance speedup if defun is used (~25 seconds per epoch)\n",
-        "generator = Generator()\n",
-        "discriminator = Discriminator()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "source": [
-        "## Define the loss functions and the optimizer\n",
-        "\n",
-        "* **Discriminator loss**\n",
-        "  * The discriminator loss function takes 2 inputs; **real images, generated images**\n",
-        "  * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones(since these are the real images)**\n",
-        "  * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros(since these are the fake images)**\n",
-        "  * Then the total_loss is the sum of real_loss and the generated_loss\n",
-        "  \n",
-        "* **Generator loss**\n",
-        "  * It is a sigmoid cross entropy loss of the generated images and an **array of ones**.\n",
-        "  * The [paper](https://arxiv.org/abs/1611.07004) also includes L1 loss which is MAE (mean absolute error) between the generated image and the target image.\n",
-        "  * This allows the generated image to become structurally similar to the target image.\n",
-        "  * The formula to calculate the total generator loss = gan_loss + LAMBDA * l1_loss, where LAMBDA = 100. This value was decided by the authors of the [paper](https://arxiv.org/abs/1611.07004)."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "cyhxTuvJyIHV"
-      },
-      "outputs": [],
-      "source": [
-        "LAMBDA = 100"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "wkMNfBWlT-PV"
-      },
-      "outputs": [],
-      "source": [
-        "def discriminator_loss(disc_real_output, disc_generated_output):\n",
-        "  real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_real_output), \n",
-        "                                              logits = disc_real_output)\n",
-        "  generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.zeros_like(disc_generated_output), \n",
-        "                                                   logits = disc_generated_output)\n",
-        "\n",
-        "  total_disc_loss = real_loss + generated_loss\n",
-        "\n",
-        "  return total_disc_loss"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "90BIcCKcDMxz"
-      },
-      "outputs": [],
-      "source": [
-        "def generator_loss(disc_generated_output, gen_output, target):\n",
-        "  gan_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_generated_output),\n",
-        "                                             logits = disc_generated_output) \n",
-        "  # mean absolute error\n",
-        "  l1_loss = tf.reduce_mean(tf.abs(target - gen_output))\n",
-        "\n",
-        "  total_gen_loss = gan_loss + (LAMBDA * l1_loss)\n",
-        "\n",
-        "  return total_gen_loss"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7"
-      },
-      "outputs": [],
-      "source": [
-        "generator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)\n",
-        "discriminator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "aKUZnDiqQrAh"
-      },
-      "source": [
-        "## Checkpoints (Object-based saving)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "WJnftd5sQsv6"
-      },
-      "outputs": [],
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
-        "                                 discriminator_optimizer=discriminator_optimizer,\n",
-        "                                 generator=generator,\n",
-        "                                 discriminator=discriminator)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We start by iterating over the dataset\n",
-        "* The generator gets the input image and we get a generated output.\n",
-        "* The discriminator receives the input_image and the generated image as the first input. The second input is the input_image and the target_image.\n",
-        "* Next, we calculate the generator and the discriminator loss.\n",
-        "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables(inputs) and apply those to the optimizer.\n",
-        "\n",
-        "## Generate Images\n",
-        "\n",
-        "* After training, its time to generate some images!\n",
-        "* We pass images from the test dataset to the generator.\n",
-        "* The generator will then translate the input image into the output we expect.\n",
-        "* Last step is to plot the predictions and **voila!**"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo"
-      },
-      "outputs": [],
-      "source": [
-        "EPOCHS = 200"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy"
-      },
-      "outputs": [],
-      "source": [
-        "def generate_images(model, test_input, tar):\n",
-        "  # the training=True is intentional here since\n",
-        "  # we want the batch statistics while running the model\n",
-        "  # on the test dataset. If we use training=False, we will get \n",
-        "  # the accumulated statistics learned from the training dataset\n",
-        "  # (which we don't want)\n",
-        "  prediction = model(test_input, training=True)\n",
-        "  plt.figure(figsize=(15,15))\n",
-        "\n",
-        "  display_list = [test_input[0], tar[0], prediction[0]]\n",
-        "  title = ['Input Image', 'Ground Truth', 'Predicted Image']\n",
-        "\n",
-        "  for i in range(3):\n",
-        "    plt.subplot(1, 3, i+1)\n",
-        "    plt.title(title[i])\n",
-        "    # getting the pixel values between [0, 1] to plot it.\n",
-        "    plt.imshow(display_list[i] * 0.5 + 0.5)\n",
-        "    plt.axis('off')\n",
-        "  plt.show()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ"
-      },
-      "outputs": [],
-      "source": [
-        "def train(dataset, epochs):  \n",
-        "  for epoch in range(epochs):\n",
-        "    start = time.time()\n",
-        "\n",
-        "    for input_image, target in dataset:\n",
-        "\n",
-        "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
-        "        gen_output = generator(input_image, training=True)\n",
-        "\n",
-        "        disc_real_output = discriminator(input_image, target, training=True)\n",
-        "        disc_generated_output = discriminator(input_image, gen_output, training=True)\n",
-        "\n",
-        "        gen_loss = generator_loss(disc_generated_output, gen_output, target)\n",
-        "        disc_loss = discriminator_loss(disc_real_output, disc_generated_output)\n",
-        "\n",
-        "      generator_gradients = gen_tape.gradient(gen_loss, \n",
-        "                                              generator.variables)\n",
-        "      discriminator_gradients = disc_tape.gradient(disc_loss, \n",
-        "                                                   discriminator.variables)\n",
-        "\n",
-        "      generator_optimizer.apply_gradients(zip(generator_gradients, \n",
-        "                                              generator.variables))\n",
-        "      discriminator_optimizer.apply_gradients(zip(discriminator_gradients, \n",
-        "                                                  discriminator.variables))\n",
-        "\n",
-        "    if epoch % 1 == 0:\n",
-        "        clear_output(wait=True)\n",
-        "        for inp, tar in test_dataset.take(1):\n",
-        "          generate_images(generator, inp, tar)\n",
-        "          \n",
-        "    # saving (checkpoint) the model every 20 epochs\n",
-        "    if (epoch + 1) % 20 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "\n",
-        "    print ('Time taken for epoch {} is {} sec\\n'.format(epoch + 1,\n",
-        "                                                        time.time()-start))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "a1zZmKmvOH85"
-      },
-      "outputs": [],
-      "source": [
-        "train(train_dataset, EPOCHS)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "kz80bY3aQ1VZ"
-      },
-      "source": [
-        "## Restore the latest checkpoint and test"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "4t4x69adQ5xb"
-      },
-      "outputs": [],
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+        "# Pix2Pix"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
-        "id": "1RGysMU_BZhx"
-      },
-      "source": [
-        "## Testing on the entire test dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "KUgSnmy2nqSP"
-      },
-      "outputs": [],
-      "source": [
-        "# Run the trained model on the entire test dataset\n",
-        "for inp, tar in test_dataset:\n",
-        "  generate_images(generator, inp, tar)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "3AJXOByaZVOf"
+        "id": "c7W3j96p219v"
       },
-      "outputs": [],
       "source": [
-        ""
+        "This notebook has been moved to [https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/pix2pix.ipynb](https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/pix2pix.ipynb)"
       ]
     }
   ],
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index 04ac78a2d3753a07f067f6f2abe0cfc02f5245ba..f2851d97223e483da11120f1fe3f0a2f641dfb81 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -27,7 +27,7 @@ py_library(
 
 cuda_py_test(
     name = "resnet50_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_test.py"],
     additional_deps = [
         ":resnet50",
@@ -35,6 +35,7 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",  # Fix b/118130911
         "nomsan",  # Fix b/118130911
@@ -46,7 +47,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "resnet50_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_graph_test.py"],
     additional_deps = [
         ":resnet50",
@@ -54,6 +55,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",
         "nomsan",
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 0d85bf63ad28d6ad15b93c86f8236bbdc7c2f4be..cb207b8ddf3641a68a114386f6a95a26ce2b74d6 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -67,12 +67,13 @@ py_library(
 # Tests
 cuda_py_test(
     name = "ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["ops_test.py"],
     additional_deps = [
         ":ops",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "oss_serial",
     ],
@@ -80,20 +81,22 @@ cuda_py_test(
 
 cuda_py_test(
     name = "blocks_test",
-    size = "large",
+    size = "medium",
     srcs = ["blocks_test.py"],
     additional_deps = [
         ":blocks",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
+        "no_oss",  # b/123045964
         "optonly",
     ],
 )
 
 cuda_py_test(
     name = "revnet_test",
-    size = "large",
+    size = "medium",
     srcs = ["revnet_test.py"],
     additional_deps = [
         ":blocks_test",
@@ -101,6 +104,7 @@ cuda_py_test(
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",  # depends on blocks_test, which is not available in pip package
         "optonly",
@@ -131,6 +135,13 @@ py_binary(
     name = "main",
     srcs = ["main.py"],
     srcs_version = "PY2AND3",
+    deps = [":main_lib"],
+)
+
+py_library(
+    name = "main_lib",
+    srcs = ["main.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
         ":config",
@@ -145,7 +156,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -157,7 +168,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -169,7 +180,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
index 576f60396ef3de21655920f1376ac96b3fbf5c5f..f4dbe7ac16f734f7bee045bc71e9559b630adf81 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_colorbot",
     srcs = ["rnn_colorbot.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_colorbot_lib"],
+)
+
+py_library(
+    name = "rnn_colorbot_lib",
+    srcs = ["rnn_colorbot.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -21,7 +28,7 @@ cuda_py_test(
     name = "rnn_colorbot_test",
     srcs = ["rnn_colorbot_test.py"],
     additional_deps = [
-        ":rnn_colorbot",
+        ":rnn_colorbot_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 74ebb1ec77131a560b1ebfd062c690920c35e261..1c718a5ce3d8e1541656d92fd5e8dad6c6683c4c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -207,7 +207,7 @@ class RNNColorbot(tf.keras.Model):
 
 def loss(labels, predictions):
   """Computes mean squared loss."""
-  return tf.reduce_mean(tf.square(predictions - labels))
+  return tf.reduce_mean(tf.squared_difference(predictions, labels))
 
 
 def test(model, eval_data):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
index f9bf82a7d88c46e13bd12ce3ee90f0334d473f10..43a6ca526d3a0aecda2c8df865a0487ac28758ab 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_ptb",
     srcs = ["rnn_ptb.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_ptb_lib"],
+)
+
+py_library(
+    name = "rnn_ptb_lib",
+    srcs = ["rnn_ptb.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
@@ -21,17 +28,18 @@ cuda_py_test(
     name = "rnn_ptb_test",
     srcs = ["rnn_ptb_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_oss"],  # b/123045964
 )
 
 cuda_py_test(
     name = "rnn_ptb_graph_test",
     srcs = ["rnn_ptb_graph_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index d18a097063c7d25947af3e2e2959ce574edd553f..3143270ccfe4f670428c80bdc1e09fa452584207 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 # pylint: enable=g-bad-import-order
 
 
@@ -421,7 +421,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
 
     # 5. Verify that checkpoints exist and contains all the expected variables.
     self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
-    object_graph = checkpointable_utils.object_metadata(
+    object_graph = trackable_utils.object_metadata(
         checkpoint_management.latest_checkpoint(config.logdir))
     ckpt_variable_names = set()
     for node in object_graph.nodes:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index c8d9266672a8b87d32338ea7c4f74fb40d41c767..b32501c2e804838af9d4c77663be131b77bd30b4 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -32,12 +32,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
 
-class Metric(checkpointable.Checkpointable):
+class Metric(trackable.Trackable):
   """A metric holds state for aggregating statistics over an evaluation run.
 
   Example use with eager execution:
@@ -269,7 +269,7 @@ class Metric(checkpointable.Checkpointable):
       else:
         collections = [ops.GraphKeys.LOCAL_VARIABLES]
       collections += [ops.GraphKeys.METRIC_VARIABLES]
-    # Variables are Checkpointable dependencies of Metrics regardless of the
+    # Variables are Trackable dependencies of Metrics regardless of the
     # global/local distinction. Users can avoid saving variables by not adding a
     # dependency on the Metric.
     v = self._add_variable_with_custom_getter(
@@ -282,7 +282,7 @@ class Metric(checkpointable.Checkpointable):
         use_resource=True,
         getter=variable_scope.get_variable,
         # Raise duplicate variable exceptions from get_variable rather than
-        # Checkpointable.
+        # Trackable.
         overwrite=True)
     self._vars.append(v)
     if context.executing_eagerly():
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 39e5957f5d1760613f2c33607c0bdb163040efb4..c56d1956fde35b562e60496015e666efe9ebc8f6 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class MetricsTest(test.TestCase):
@@ -314,7 +314,7 @@ class MetricsTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     mean = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=mean)
+    checkpoint = trackable_utils.Checkpoint(mean=mean)
     mean.build()
     mean._built = True
     self.evaluate(mean.init_variables())
@@ -327,7 +327,7 @@ class MetricsTest(test.TestCase):
     self.assertAllEqual(200., self.evaluate(mean.value()))
 
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 240f213c602395b8589d39c3ecd90b602ffa9848..b3e8daddaf2369e9e33179fde2aab1469e97ea47 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 # pylint: disable=not-callable
@@ -65,7 +65,7 @@ class NetworkTest(test.TestCase):
 
   def test_checkpointing_not_implemented(self):
     checkpoint_directory = self.get_temp_dir()
-    checkpoint = checkpointable_utils.Checkpoint(net=MyNetwork())
+    checkpoint = trackable_utils.Checkpoint(net=MyNetwork())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(checkpoint_directory)
 
diff --git a/tensorflow/contrib/eager/python/parameter_server.py b/tensorflow/contrib/eager/python/parameter_server.py
index 7803a6799bb64441fab881bf6ca986d5cf3851a8..258f0a19309235dcd99b31b4de3d35ef8d89b15b 100644
--- a/tensorflow/contrib/eager/python/parameter_server.py
+++ b/tensorflow/contrib/eager/python/parameter_server.py
@@ -30,7 +30,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
@@ -129,8 +129,8 @@ class SharedVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index b82e1bb71bce9a28d7bbbf961cc6d5e25dd18acf..05830c9c1c3ebab82a545938d62094772ccf11b1 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -62,7 +62,6 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@Checkpoint
 @@Checkpointable
-@@CheckpointableSaver
 
 @@executing_eagerly
 @@in_eager_mode
@@ -138,9 +137,8 @@ from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Vari
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
-from tensorflow.python.training.checkpointable.tracking import AutoCheckpointable as Checkpointable
-from tensorflow.python.training.checkpointable.util import CheckpointableSaver
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import CheckpointV1 as Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 8c35dddb5a515aa09cc70c173a9f0605e8567e82..6881fabdc09e3275c29f3013283999c96e283770 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import tempfile
 
 from tensorflow.contrib.eager.python import tfe
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -40,6 +41,9 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertAllEqual([[4.]], y.numpy())
 
   def testInstantError(self):
+    if context.num_gpus():
+      # TODO(nareshmodi): make this test better
+      self.skipTest("Gather doesn't do index checking on GPUs")
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices = 7 is not in \[0, 3\)'):
       array_ops.gather([0, 1, 2], 7)
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping.py b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
index 11856ece38bf08dfdf16e8b0d9890bbfb0033216..47f568ed3d3e1b94e74c1423f774352df5c30f45 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
@@ -23,7 +23,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow_estimator.contrib.estimator.python.estimator import early_stopping
+from tensorflow_estimator.python.estimator import early_stopping
 
 # Include attrs that start with single underscore.
 _HAS_DYNAMIC_ATTRIBUTES = True
@@ -31,4 +31,4 @@ early_stopping.__all__ = [
     s for s in dir(early_stopping) if not s.startswith('__')
 ]
 
-from tensorflow_estimator.contrib.estimator.python.estimator.early_stopping import *
+from tensorflow_estimator.python.estimator.early_stopping import *
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index cb86efb8da72f168b54f04773289a6fe421282b1..da2479a0b7b029561136903c82cabed9aae622b8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -109,7 +109,7 @@ tf_gen_op_wrapper_py(
 # Ops tests
 tf_py_test(
     name = "gmm_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "python/ops/gmm_test.py",
     ],
@@ -130,6 +130,7 @@ tf_py_test(
         "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",  # b/38283730
         "notsan",  # Flaky: b/30756419
@@ -202,10 +203,7 @@ py_test(
     srcs = ["python/ops/kmeans_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "nomac",  # b/73741358
-        "notsan",  # b/67512932
-    ],
+    tags = ["notsan"],
     deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
@@ -227,7 +225,7 @@ py_test(
 
 tf_py_test(
     name = "wals_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/ops/wals_test.py"],
     additional_deps = [
         ":factorization_py",
@@ -250,8 +248,8 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    shard_count = 4,
     tags = [
-        "manual",
         "noasan",  # times out b/63678675
         "nomsan",
     ],
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index a8c5d0763c28ba2b54f217405f0da65533f26b91..68078ba8bbb07b4344c19d554012d214229f9c4f 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -19,12 +19,12 @@
 #include <numeric>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index d48b89cbacce34781819010addbcbd0ba66f9873..505d8d731fa9f3d0b004ccacd724576b7ac0ceee 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -613,7 +613,8 @@ class _InitializeClustersOpFactory(object):
       inp = nn_impl.l2_normalize(inp, dim=1)
     return gen_clustering_ops.kmeans_plus_plus_initialization(
         inp,
-        math_ops.to_int64(self._num_remaining), self._random_seed,
+        math_ops.cast(self._num_remaining, dtypes.int64),
+        self._random_seed,
         self._kmeans_plus_plus_num_retries)
 
   def _kmc2_multiple_centers(self):
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index d365ad111760247fc18b730657390f07ba6b865e..000b9832aa4d9ec645a86a6946fbf2665f9fd71d 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -53,7 +53,7 @@ def _covariance(x, diag):
     A Tensor representing the covariance of x. In the case of
   diagonal matrix just the diagonal is returned.
   """
-  num_points = math_ops.to_float(array_ops.shape(x)[0])
+  num_points = math_ops.cast(array_ops.shape(x)[0], dtypes.float32)
   x -= math_ops.reduce_mean(x, 0, keepdims=True)
   if diag:
     cov = math_ops.reduce_sum(
@@ -297,8 +297,9 @@ class GmmAlgorithm(object):
             cholesky, array_ops.transpose(
                 diff, perm=[0, 2, 1]), lower=True))
     diag_m = array_ops.transpose(math_ops.reduce_sum(x_mu_cov, 1))
-    self._probs[shard_id] = -0.5 * (diag_m + math_ops.to_float(self._dimensions)
-                                    * math_ops.log(2 * np.pi) + log_det_covs)
+    self._probs[shard_id] = (
+        -0.5 * (diag_m + math_ops.cast(self._dimensions, dtypes.float32) *
+                math_ops.log(2 * np.pi) + log_det_covs))
 
   def _define_diag_covariance_probs(self, shard_id, shard):
     """Defines the diagonal covariance probabilities per example in a class.
@@ -314,14 +315,14 @@ class GmmAlgorithm(object):
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
         math_ops.log(self._covs + 1e-3), 1, keepdims=True)
-    diff = shard - self._means
-    x2 = math_ops.square(diff)
+    x2 = math_ops.squared_difference(shard, self._means)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
     # num_classes X num_examples
     x2_cov = math_ops.matmul(x2, cov_expanded)
     x2_cov = array_ops.transpose(array_ops.squeeze(x2_cov, [2]))
     self._probs[shard_id] = -0.5 * (
-        math_ops.to_float(self._dimensions) * math_ops.log(2.0 * np.pi) +
+        math_ops.cast(self._dimensions, dtypes.float32) *
+        math_ops.log(2.0 * np.pi) +
         array_ops.transpose(det_expanded) + x2_cov)
 
   def _define_log_prob_operation(self, shard_id, shard):
@@ -401,7 +402,8 @@ class GmmAlgorithm(object):
       # Update alpha.
       if 'w' in self._params:
         final_points_in_k = points_in_k / num_batches
-        num_examples = math_ops.to_float(math_ops.reduce_sum(final_points_in_k))
+        num_examples = math_ops.cast(math_ops.reduce_sum(final_points_in_k),
+                                     dtypes.float32)
         self._alpha_op = self._alpha.assign(final_points_in_k /
                                             (num_examples + MEPS))
       else:
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 4e29e2559986012d8eeeaec807f14181226363aa..edd6f36e07c246eb4d8a5176a74943b461830cc3 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -14,7 +14,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
-        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -34,6 +33,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/feature_column:utils",
     ],
 )
 
@@ -72,44 +72,3 @@ tf_py_test(
     ],
     tags = ["no_pip"],
 )
-
-py_library(
-    name = "sequence_feature_column_v2",
-    srcs = ["python/feature_column/sequence_feature_column_v2.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-    ],
-)
-
-tf_py_test(
-    name = "sequence_feature_column_v2_test",
-    srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
-    additional_deps = [
-        ":sequence_feature_column",
-        ":sequence_feature_column_v2",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//tensorflow/python/feature_column:feature_column_v2_test",
-    ],
-    tags = ["no_pip"],
-)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 9b3a5c58aaa9498257fc971ac60b97f31d5185d8..64df44fe4360cb30de89f06e6e88d85e6dbaf182 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -23,6 +23,7 @@ import collections
 
 
 from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import utils as fc_utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -506,7 +507,7 @@ class _SequenceNumericColumn(
     # sequence length is not affected.
     num_elements = (self._variable_shape.num_elements()
                     if sp_tensor.shape.ndims == 2 else 1)
-    seq_length = fc._sequence_length_from_sparse_tensor(
+    seq_length = fc_utils.sequence_length_from_sparse_tensor(
         sp_tensor, num_elements=num_elements)
 
     return fc._SequenceDenseColumn.TensorSequenceLengthPair(
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 88a14a2a94cc683f021d032ea11358e0cfb63faa..8fd2b5f39bc88b76fe5583f8d18389e232ea9f40 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -32,7 +32,6 @@ tf_custom_op_py_library(
         "python/ops/arg_scope.py",
         "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
-        "python/ops/critical_section_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
         "python/ops/script_ops.py",
@@ -51,6 +50,7 @@ tf_custom_op_py_library(
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
         "//tensorflow_estimator:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
@@ -171,26 +171,6 @@ py_test(
     ],
 )
 
-cuda_py_test(
-    name = "critical_section_test",
-    size = "medium",
-    srcs = ["python/ops/critical_section_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 py_test(
     name = "ops_test",
     size = "small",
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index e72e50585a3861d4527b66f89e1659d76c85960a..063717f08aa88f4de9470d8392db2b7c95b3e4bf 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -94,8 +94,6 @@
 @@smart_constant_value
 @@smart_case
 
-@@CriticalSection
-
 @@BoundedTensorSpec
 @@TensorSpec
 
@@ -129,18 +127,24 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = ['nest']
 _nest_allowed_symbols = [
     'assert_same_structure',
+    'is_nested',
     'is_sequence',
+    'is_sequence_or_composite',
     'flatten',
     'flatten_dict_items',
     'pack_sequence_as',
     'map_structure',
     'map_structure_with_paths',
+    'map_structure_with_tuple_paths',
     'assert_shallow_structure',
     'flatten_up_to',
+    'flatten_with_tuple_paths_up_to',
     'map_structure_up_to',
+    'map_structure_with_tuple_paths_up_to',
     'get_traverse_shallow_structure',
     'yield_flat_paths',
     'flatten_with_joined_string_paths',
+    'flatten_with_tuple_paths',
 ]
 
 remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols)
diff --git a/tensorflow/contrib/framework/python/ops/__init__.py b/tensorflow/contrib/framework/python/ops/__init__.py
index c4976497f5fa95d82e492153b117681f693eaa13..8113bf7c095bd0817e40cfd08bdf1ef7275ba55b 100644
--- a/tensorflow/contrib/framework/python/ops/__init__.py
+++ b/tensorflow/contrib/framework/python/ops/__init__.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.ops.arg_scope import *
 from tensorflow.contrib.framework.python.ops.checkpoint_ops import *
-from tensorflow.contrib.framework.python.ops.critical_section_ops import *
 from tensorflow.contrib.framework.python.ops.ops import *
 from tensorflow.contrib.framework.python.ops.prettyprint_ops import *
 from tensorflow.contrib.framework.python.ops.script_ops import *
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 57a5bfbf43c915775c6b0ef05baac19581213a09..5c254436e630393341945cb4546c20f2b24be031 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -68,6 +68,8 @@ tf_kernel_library(
     prefix = "fused_conv2d_bias_activation_op",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:autotuning_proto_cc",
+        "//tensorflow/core:conv_autotuning_proto_cc",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
@@ -92,6 +94,8 @@ tf_custom_op_library(
         "ops/fused_conv2d_bias_activation_op.cc",
     ],
     deps = [
+        "//tensorflow/core:autotuning_proto_cc",
+        "//tensorflow/core:conv_autotuning_proto_cc",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core/kernels:bounds_check_lib",
         "//tensorflow/core/kernels:conv_2d_hdrs",
@@ -171,6 +175,7 @@ cuda_py_test(
     main = "python/ops/fused_conv2d_bias_activation_benchmark.py",
     tags = [
         "manual",  # TODO(b/117128481): re-enable after fixing OSS build
+        "nogpu",
         "requires-gpu-sm70",
     ],
 )
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index c541c71f996c7a1b36cf28ae9a1783f8dca0a72c..0b3e37107213281abca1f7abdb58f820c85ac1b0 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -34,9 +34,16 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
+#include "google/protobuf/duration.pb.h"
+#include "absl/time/time.h"
 #include "cuda/include/cudnn.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
 #include "tensorflow/core/util/activation_mode.h"
 #endif  // GOOGLE_CUDA
 
@@ -252,6 +259,131 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 #if GOOGLE_CUDA
 namespace dnn = se::dnn;
 
+// Several functions are copyed over from tensorflow/core/kernels/gpu_utils,
+// since this file may be compiled down to a tf_custom_op_library .so file,
+// which can't depend on basic dependencies like tensorflow/core:lib. Instead,
+// the code has to depend on whatever is the same in libtensorflow_framework.so.
+//
+// In theory, we can lift the dependencies of gpu_utils by turning it into a
+// template library that provides duck typing, but I think duplication is the
+// lesser of two evils.
+namespace internal {
+namespace {
+
+tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  tensorflow::CudnnVersion cudnn_version;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    se::port::StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      cudnn_version.set_major(version.major_version());
+      cudnn_version.set_minor(version.minor_version());
+      cudnn_version.set_patch(version.patch());
+    }
+  }
+  return cudnn_version;
+}
+
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
+tensorflow::ComputeCapability GetComputeCapability(
+    se::StreamExecutor* stream_executor) {
+  tensorflow::ComputeCapability cc;
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  cc.set_major(cc_major);
+  cc.set_minor(cc_minor);
+  return cc;
+}
+
+void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
+                                 const Tensor& filter, const Tensor& output,
+                                 const Tensor& bias, const Tensor* side_input,
+                                 se::StreamExecutor* stream_exec,
+                                 absl::Span<const AutotuneResult> results) {
+  AutotuningLog log;
+  ConvNodeDef instr;
+  *instr.mutable_conv() = node;
+  input.shape().AsProto(instr.mutable_input()->mutable_tensor_shape());
+  instr.mutable_input()->set_dtype(input.dtype());
+  filter.shape().AsProto(instr.mutable_filter()->mutable_tensor_shape());
+  instr.mutable_filter()->set_dtype(filter.dtype());
+  output.shape().AsProto(instr.mutable_output()->mutable_tensor_shape());
+  instr.mutable_output()->set_dtype(output.dtype());
+  bias.shape().AsProto(instr.mutable_bias()->mutable_tensor_shape());
+  instr.mutable_bias()->set_dtype(bias.dtype());
+  if (side_input) {
+    side_input->shape().AsProto(
+        instr.mutable_side_input()->mutable_tensor_shape());
+    instr.mutable_side_input()->set_dtype(side_input->dtype());
+  }
+  log.mutable_instr()->PackFrom(std::move(instr));
+  *log.mutable_cudnn_version() = internal::GetCudnnVersion(stream_exec);
+  *log.mutable_compute_capability() =
+      internal::GetComputeCapability(stream_exec);
+  for (const auto& result : results) {
+    *log.add_results() = result;
+  }
+  Logger::Singleton()->LogProto(log);
+}
+
+Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
+                              se::dnn::AlgorithmConfig* algo) {
+  // For the "!xhs.has_success()" below, this is because we want successful ones
+  // to order first, therefore they need a smaller key per "min_element".
+  const AutotuneResult* best_result = std::min_element(
+      results.begin(), results.end(),
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        return std::make_tuple(
+                   !lhs.has_success(),
+                   internal::FromDurationProto(lhs.success().run_time())) <
+               std::make_tuple(
+                   !rhs.has_success(),
+                   internal::FromDurationProto(rhs.success().run_time()));
+      });
+
+  const AutotuneResult* best_result_no_scratch = std::min_element(
+      results.begin(), results.end(),
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        return std::make_tuple(
+                   !lhs.has_success(), lhs.success().scratch_bytes(),
+                   internal::FromDurationProto(lhs.success().run_time())) <
+               std::make_tuple(
+                   !rhs.has_success(), rhs.success().scratch_bytes(),
+                   internal::FromDurationProto(rhs.success().run_time()));
+      });
+
+  if (best_result == results.end() || !best_result->has_success()) {
+    return errors::NotFound("No algorithm worked!");
+  }
+  algo->set_algorithm({best_result->conv().algorithm(),
+                       best_result->conv().tensor_ops_enabled()});
+  if (best_result_no_scratch != results.end() &&
+      best_result_no_scratch->has_success() &&
+      best_result_no_scratch->success().scratch_bytes() == 0) {
+    algo->set_algorithm_no_scratch(
+        {best_result_no_scratch->conv().algorithm(),
+         best_result_no_scratch->conv().tensor_ops_enabled()});
+  }
+  return Status::OK();
+}
+
+}  // namespace
+}  // namespace internal
+
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
   static string name() { return "ConvBiasActivation"; }
@@ -565,8 +697,21 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
             stream->parent()),
         &algorithms));
-    dnn::ProfileResult best_result;
-    dnn::ProfileResult best_result_no_scratch;
+    if (activation_mode == ActivationMode::NONE) {
+      // Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is supported for
+      // identity activation, other algs seem to quietly do Relu.
+      // See
+      // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBiasActivationForward
+      algorithms.erase(
+          std::remove_if(
+              algorithms.begin(), algorithms.end(),
+              [](dnn::AlgorithmDesc alg) {
+                return alg.algo_id() !=
+                       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+              }),
+          algorithms.end());
+    }
+    std::vector<tensorflow::AutotuneResult> results;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
@@ -583,28 +728,24 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
               .ok();
       if (cudnn_launch_status) {
         if (profile_result.is_valid()) {
-          if (profile_result.elapsed_time_in_ms() <
-              best_result.elapsed_time_in_ms()) {
-            best_result = profile_result;
-          }
-          if (scratch_allocator.TotalByteSize() == 0 &&
-              profile_result.elapsed_time_in_ms() <
-                  best_result_no_scratch.elapsed_time_in_ms()) {
-            best_result_no_scratch = profile_result;
-          }
+          results.emplace_back();
+          auto& result = results.back();
+          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+          result.mutable_conv()->set_tensor_ops_enabled(
+              profile_algorithm.tensor_ops_enabled());
+          result.mutable_success()->set_scratch_bytes(
+              scratch_allocator.TotalByteSize());
+          *result.mutable_success()->mutable_run_time() =
+              internal::ToDurationProto(
+                  absl::Milliseconds(profile_result.elapsed_time_in_ms()));
         }
       }
     }
-    OP_REQUIRES(ctx,
-                best_result.is_valid() || best_result_no_scratch.is_valid(),
-                errors::NotFound("No algorithm worked!"));
-    if (best_result.is_valid()) {
-      algorithm_config.set_algorithm(best_result.algorithm());
-    }
-    if (best_result_no_scratch.is_valid()) {
-      algorithm_config.set_algorithm_no_scratch(
-          best_result_no_scratch.algorithm());
-    }
+    internal::LogFusedConvAutotuneResults(ctx->op_kernel().def(), *conv_input,
+                                          *filter, *output, bias, side_input,
+                                          stream->parent(), results);
+    OP_REQUIRES_OK(
+        ctx, internal::BestCudnnConvAlgorithm(results, &algorithm_config));
     AutoTuneConvBiasActivation::GetInstance()->Insert(fused_conv_parameters,
                                                       algorithm_config);
   }
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index db0868fb2c43464a811b3d6dfcd96480ba2463ee..386e4cf69b7aa118a85fb25bcb809a879c5c1bd8 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -377,7 +377,10 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index db7dc51daa78ecee12ecb7f6d33df4511e068243..3c1d814e70f7fdad4083583c9d89450a60bc2e20 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -9,8 +9,9 @@ explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
-introduction.
+Goodfellow et al. See
+[tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/)
+for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an introduction.
 
 #### Usage
 ```python
@@ -57,11 +58,10 @@ These include the following main pieces (explained in detail below).
     generative models.
 
 *   [examples](https://github.com/tensorflow/models/tree/master/research/gan/)
-    and [tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN
-    to make GAN training easier, or use the more complicated examples to
-    jumpstart your own project. These include unconditional and conditional
-    GANs, InfoGANs, adversarial losses on existing networks, and image-to-image
-    translation.
+    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN to make
+    GAN training easier, or use the more complicated examples to jump-start your
+    own project. These include unconditional and conditional GANs, InfoGANs,
+    adversarial losses on existing networks, and image-to-image translation.
 
 ## Training a GAN model
 
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 5b9c54e43a16adf457d5ed0e7e73dcd168ab0d67..66af79d1e81bbc450141673dd54d865e5c7932d5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -238,10 +237,10 @@ class GANEstimatorIntegrationTest(test.TestCase):
     # Evaluate.
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # Predict.
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
index c00ff4399748a77f88d9753df7592bf3859d754e..0fcd1b7924eb02f5d617b45af16852baf2e2bb48 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -235,10 +234,10 @@ class StarGANEstimatorIntegrationTest(test.TestCase):
     # EVALUTE
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # PREDICT
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
index 9fdcc08334d50b4ddf3a0bc9bc755e55d51b0bd8..baf2c28df4b63cff525dcf3ff880730768ad000a 100644
--- a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -184,12 +183,11 @@ class TPUGANEstimatorIntegrationTest(test.TestCase, parameterized.TestCase):
     # Evaluate.
     num_steps_eval = 2
     scores = est.evaluate(eval_input_fn, steps=num_steps_eval)
-    self.assertEqual(num_steps_train + num_steps_eval,
-                     scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn(ops.GraphKeys.GLOBAL_STEP, scores)
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # Predict.
     predictions = np.array([x['generated_data'] for x in
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 31f0d34ed68a6adc25cca102236079d0f66615cb..efbdb1152d665509ae8b4444097dd5091a0f5312 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -41,9 +41,9 @@ from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
@@ -140,7 +140,7 @@ def preprocess_image(images,
   is_single = images.shape.ndims == 3
   with ops.name_scope(scope, 'preprocess', [images, height, width]):
     if not images.dtype.is_floating:
-      images = math_ops.to_float(images)
+      images = math_ops.cast(images, dtypes.float32)
     if is_single:
       images = array_ops.expand_dims(images, axis=0)
     resized = image_ops.resize_bilinear(images, [height, width])
@@ -346,7 +346,7 @@ def classifier_score(images, classifier_fn, num_batches=1):
       images, num_or_size_splits=num_batches)
 
   # Compute the classifier splits using the memory-efficient `map_fn`.
-  logits = functional_ops.map_fn(
+  logits = map_fn.map_fn(
       fn=classifier_fn,
       elems=array_ops.stack(generated_images_list),
       parallel_iterations=1,
@@ -505,12 +505,12 @@ def frechet_classifier_distance(real_images,
 
   # Compute the activations using the memory-efficient `map_fn`.
   def compute_activations(elems):
-    return functional_ops.map_fn(fn=classifier_fn,
-                                 elems=elems,
-                                 parallel_iterations=1,
-                                 back_prop=False,
-                                 swap_memory=True,
-                                 name='RunClassifier')
+    return map_fn.map_fn(fn=classifier_fn,
+                         elems=elems,
+                         parallel_iterations=1,
+                         back_prop=False,
+                         swap_memory=True,
+                         name='RunClassifier')
 
   real_a = compute_activations(real_imgs)
   gen_a = compute_activations(generated_imgs)
@@ -895,7 +895,7 @@ def kernel_classifier_distance_and_std(real_images,
 
   # Compute the activations using the memory-efficient `map_fn`.
   def compute_activations(elems):
-    return functional_ops.map_fn(
+    return map_fn.map_fn(
         fn=classifier_fn,
         elems=elems,
         parallel_iterations=1,
@@ -1057,7 +1057,8 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
   n_g = array_ops.shape(generated_activations)[0]
 
   n_bigger = math_ops.maximum(n_r, n_g)
-  n_blocks = math_ops.to_int32(math_ops.ceil(n_bigger / max_block_size))
+  n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size),
+                           dtypes.int32)
 
   v_r = n_r // n_blocks
   v_g = n_g // n_blocks
@@ -1099,7 +1100,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
             (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) +
             (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1)))
 
-  ests = functional_ops.map_fn(
+  ests = map_fn.map_fn(
       compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False)
 
   mn = math_ops.reduce_mean(ests)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index bd17571a0535a3c8e9dfee24a8da16eb2e72f165..bc7c1057b478fe2656898e68c1a14013b5a71d12 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -365,7 +365,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     unused_image = array_ops.zeros([2, 299, 299, 3])
     incscore = _run_with_mock(classifier_metrics.inception_score, unused_image)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       incscore_np = sess.run(incscore, {'concat:0': logits})
 
     self.assertAllClose(_expected_inception_score(logits), incscore_np)
@@ -473,7 +473,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_fn=lambda x: x,
         max_block_size=600)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_kid, actual_std = sess.run(kid_op)
 
     expected_kid, expected_std = _expected_kid_and_std(test_pool_real_a,
@@ -500,7 +500,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         max_block_size=max_block_size)
 
     for block_size in [50, 512, 1000]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         actual_kid, actual_std = sess.run(kid_op, {max_block_size: block_size})
 
       expected_kid, expected_std = _expected_kid_and_std(
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
index 4b1105f6bd4f21a0da02338b0fc9db87a41b145f..9657d4e3d0cb60376b3f1dd23d0138a200ce5e5c 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -28,6 +28,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -74,7 +75,7 @@ def _laplacian_pyramid(batch, num_levels):
     res = spatial_conv(res, 4)
     return res
 
-  pyramid = [math_ops.to_float(batch)]
+  pyramid = [math_ops.cast(batch, dtypes.float32)]
   for _ in range(1, num_levels):
     pyramid.append(pyr_down(pyramid[-1]))
     pyramid[-2] -= pyr_up(pyramid[-1])
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 9f448d3a1602c503093214201bdc96fc9bee85b5..3eb4f5db0c841af584a672f100509a3d455a8b75 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 
 from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.eval.python import eval_utils
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import util as loss_util
@@ -171,8 +172,10 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
         gan_model.generated_data[:num_comparisons])
     real_list = array_ops.unstack(gan_model.real_data[:num_comparisons])
     diffs = [
-        math_ops.abs(math_ops.to_float(generated) - math_ops.to_float(real)) for
-        generated, real in zip(generated_list, real_list)]
+        math_ops.abs(math_ops.cast(generated, dtypes.float32) -
+                     math_ops.cast(real, dtypes.float32))
+        for generated, real in zip(generated_list, real_list)
+    ]
     image_list.extend(diffs)
 
   # Reshape image and display.
@@ -261,7 +264,7 @@ def add_stargan_image_summaries(stargan_model,
 
   summary.image(
       'stargan_image_generation',
-      functional_ops.map_fn(
+      map_fn.map_fn(
           _build_image,
           stargan_model.input_data[:num_images],
           parallel_iterations=num_images,
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
index 0cc653f0a7907f407e66add5537d1e0a5adb6d8b..3764c43cdfc8f6515e0376cd6aa1d244b21e2e89 100644
--- a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
@@ -53,7 +53,7 @@ def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
   Args:
     w_tensor: The weight matrix whose spectral norm should be computed.
     power_iteration_rounds: The number of iterations of the power method to
-      perform. A higher number yeilds a better approximation.
+      perform. A higher number yields a better approximation.
     name: An optional scope name.
 
   Returns:
@@ -105,7 +105,7 @@ def spectral_normalize(w, power_iteration_rounds=1, name=None):
   Args:
     w: The weight matrix to be normalized.
     power_iteration_rounds: The number of iterations of the power method to
-      perform. A higher number yeilds a better approximation.
+      perform. A higher number yields a better approximation.
     name: An optional scope name.
 
   Returns:
@@ -126,7 +126,7 @@ def spectral_norm_regularizer(scale, power_iteration_rounds=1, scope=None):
   Args:
     scale: A scalar multiplier. 0.0 disables the regularizer.
     power_iteration_rounds: The number of iterations of the power method to
-      perform. A higher number yeilds a better approximation.
+      perform. A higher number yields a better approximation.
     scope: An optional scope name.
 
   Returns:
@@ -221,7 +221,7 @@ def spectral_normalization_custom_getter(name_filter=_default_name_filter,
     name_filter: Optionally, a method that takes a Variable name as input and
       returns whether this Variable should be normalized.
     power_iteration_rounds: The number of iterations of the power method to
-      perform per step. A higher number yeilds a better approximation of the
+      perform per step. A higher number yields a better approximation of the
       true spectral norm.
 
   Returns:
@@ -294,7 +294,7 @@ def keras_spectral_normalization(name_filter=_default_name_filter,
     name_filter: Optionally, a method that takes a Variable name as input and
       returns whether this Variable should be normalized.
     power_iteration_rounds: The number of iterations of the power method to
-      perform per step. A higher number yeilds a better approximation of the
+      perform per step. A higher number yields a better approximation of the
       true spectral norm.
 
   Yields:
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index f5c448db41c67adb4edd2634dd63a1840180df70..80fae0356f66f9d98969171cdfe18110bd21f7bd 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -224,7 +224,8 @@ class VBN(object):
       # statistics and the reference batch statistics.
       ref_batch_size = _static_or_dynamic_batch_size(
           self._reference_batch, self._batch_axis)
-      self._example_weight = 1. / (math_ops.to_float(ref_batch_size) + 1.)
+      self._example_weight = 1. / (
+          math_ops.cast(ref_batch_size, dtypes.float32) + 1.)
       self._ref_weight = 1. - self._example_weight
 
       # Make the variables, if necessary.
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
index ecfbb8a432e3308863edd6f1343be55c1fe5753c..9848f654badafea3f08ef48207eb84973c62cb16 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
@@ -112,7 +112,7 @@ class VirtualBatchnormTest(test.TestCase):
           batch, axis, training=True)
 
       # Get VBN's batch normalization on reference batch.
-      batch_axis = 0 if axis is not 0 else 1  # axis and batch_axis can't same
+      batch_axis = 0 if axis != 0 else 1  # axis and batch_axis can't same
       vbn = virtual_batchnorm.VBN(batch, axis, batch_axis=batch_axis)
       vbn_normalized = vbn.reference_batch_normalization()
 
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index e3c780ac1a0f0ef15ff993bd3a9bf9730dcb45b8..44ee0f52696dc1cdcd91286a80b2d4b42be93a4d 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -403,7 +403,9 @@ class _PenaltyTest(object):
   def test_all_correct(self):
     loss = self._penalty_fn(**self._kwargs)
     self.assertEqual(self._expected_dtype, loss.dtype)
-    self.assertEqual(self._expected_op_name, loss.op.name)
+    # NOTE: Op names will change, it is inappropriate to include them in tests.
+    # See go/tf-breaking-change.
+    # self.assertEqual(self._expected_op_name, loss.op.name)
     with self.cached_session():
       variables.global_variables_initializer().run()
       self.assertAlmostEqual(self._expected_loss, loss.eval(), 6)
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index f36a5d346e0f27fbbc480e876380db51ed559c09..9bff8090d93d3ad7def69726073accfb234ef301 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -757,7 +757,9 @@ def cyclegan_loss(
 
   return namedtuples.CycleGANLoss(loss_x2y, loss_y2x)
 
-
+# Begin google-internal
+# The four major parts can be found here: http://screen/tMRMBAohDYG.
+# End google-internal
 def stargan_loss(
     model,
     generator_loss_fn=tfgan_losses.stargan_generator_loss_wrapper(
@@ -776,8 +778,6 @@ def stargan_loss(
     add_summaries=True):
   """StarGAN Loss.
 
-  The four major part can be found here: http://screen/tMRMBAohDYG.
-
   Args:
     model: (StarGAN) Model output of the stargan_model() function call.
     generator_loss_fn: The loss function on the generator. Takes a
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index 704be917b3680a1b5712f4f1dc5059b354db8610..bf8b66dcfa5e44a03107cdf1ef8b04e1dbff4a9c 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -17,11 +17,6 @@ filegroup(
     ]),
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cuda_library",
-)
-
 # For platform specific build config
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
@@ -66,7 +61,6 @@ cc_library(
         ":gdr_memory_manager",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:graph_mgr",
@@ -100,15 +94,37 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gdr_collective_executor_mgr",
+    srcs = ["gdr_collective_executor_mgr.cc"],
+    hdrs = ["gdr_collective_executor_mgr.h"],
+    deps = [
+        ":gdr_memory_manager",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:cancellable_call",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:request_id",
+        "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+    ],
+)
+
 cc_library(
     name = "gdr_server_lib",
     srcs = ["gdr_server_lib.cc"],
     hdrs = ["gdr_server_lib.h"],
     linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
     deps = [
+        ":gdr_collective_executor_mgr",
         ":gdr_memory_manager",
         ":gdr_rendezvous_mgr",
         ":gdr_worker",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
index 8242d93f129904828a11b61d48f2df8fb0f88bc3..711adc865f37fc84550e4b45d9f0c7fff421a0dc 100644
--- a/tensorflow/contrib/gdr/README.md
+++ b/tensorflow/contrib/gdr/README.md
@@ -114,7 +114,16 @@ Caveats
 
 In current implementation, only tensors that reside in host memory or in GPU memory such that the GPU is adjacent to an RDMA capable NIC will use direct RDMA as its transport. When RDMA is available but not GDR, a temporary tensor copy on host memory will be used as RDMA source/destination (and copied from/to the target device). When there is no RDMA device present, it can even fallback to the original gRPC runtime. While it is theoretically possible to mix GDR enabled TF with non-GDR deployments in the same job, make sure the environment is properly setup so the GDR mode is enabled whenever possible (i.e. do not fall back to gRPC when it is not absolutely necessary).
 
-In the original design (as in the reference), tensor buffers are only registered to NIC when we could determine that the tensor will be either a source of Send or a sink of Recv across physical machine boundary. However, to implement the precise allocations, we need to change all the devices to possibly return a NIC compatible allocator. As GDR is currently in contrib, we would like to avoid the unnecessary code disruption to the TF core, so we allocate all tensors from NIC-registered buffers using a BFC allocator. This behaviour is similar to the effect of enabling the extra GPU option `force_gpu_compatible`, which allocate all host tensors in GPU-registered buffers no matter they will be transferred from/to GPUs or not.
+In the original design (as in the reference), tensor buffers are only registered
+to NIC when we could determine that the tensor will be either a source of Send
+or a sink of Recv across physical machine boundary. However, to implement the
+precise allocations, we need to change all the devices to possibly return a NIC
+compatible allocator. As GDR is currently in contrib, we would like to avoid the
+unnecessary code disruption to the TF core, so we allocate all tensors from
+NIC-registered buffers using a BFC allocator. This behavior is similar to the
+effect of enabling the extra GPU option `force_gpu_compatible`, which allocate
+all host tensors in GPU-registered buffers no matter they will be transferred
+from/to GPUs or not.
 
 Reference
 ===
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..755cbdff31cd7ca31579e0d64399d681dc24ad81
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/distributed_runtime/cancellable_call.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+class WorkerCacheInterface;
+
+namespace {
+
+class RecvBufCall : public CancellableCall {
+ public:
+  RecvBufCall(int64 step_id, const string& peer_device, const string& peer_task,
+              const string& key, Device* to_device,
+              DeviceContext* to_device_ctx,
+              const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+              const DeviceLocality& client_locality,
+              const DeviceLocality& server_locality,
+              CancellationManager* cancel_mgr, WorkerCacheInterface* wc)
+      : CancellableCall(cancel_mgr, peer_task, wc) {
+    req_.set_step_id(step_id);
+    req_.set_buf_rendezvous_key(key);
+    *req_.mutable_client_locality() = client_locality;
+    *req_.mutable_server_locality() = server_locality;
+    req_.set_num_bytes(to_tensor->TotalBytes());
+    req_.set_buf_ptr(reinterpret_cast<int64>(DMAHelper::base(to_tensor)));
+    req_.set_src_device(peer_device);
+    req_.set_dst_device(to_device->name());
+    req_.set_request_id(GetUniqueRequestId());
+  }
+
+  ~RecvBufCall() override {}
+
+  void IssueCall(const StatusCallback& done) override {
+    wi_->RecvBufAsync(&opts_, &req_, &resp_, done);
+  }
+
+  RecvBufRequest req_;
+  RecvBufResponse resp_;
+};
+
+class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
+ public:
+  CollectiveRemoteAccessDistributed(const DeviceMgr* dev_mgr,
+                                    DeviceResolverInterface* dev_resolver,
+                                    WorkerCacheInterface* worker_cache,
+                                    int64 step_id,
+                                    RemoteMemoryManager* remote_memory_manager)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        worker_cache_(worker_cache),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~CollectiveRemoteAccessDistributed() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (peer_is_local) {
+      CollectiveRemoteAccessLocal::RecvFromPeer(
+          peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+          to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+          done);
+      return;
+    }
+
+    // State that needs to be threaded through a couple of async calls
+    // in order to make this function completely non-blocking.
+    struct State {
+      DeviceLocality server_locality;
+      std::unique_ptr<RecvBufCall> call;
+    };
+    State* state = new State;
+
+    // Logic to be executed on the RecvBufAsync callback.
+    auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
+                              to_device_ctx, to_tensor, done](const Status& s) {
+      if (s.ok()) {
+        remote_memory_manager_->TensorFromTransportOptions(
+            to_tensor, state->call->resp_.transport_options(), to_device,
+            to_device_ctx, to_alloc_attr.on_host(), done);
+      }
+      if (!s.ok() && errors::IsFailedPrecondition(s)) {
+        dev_resolver_->ClearTask(peer_task);
+      }
+
+      delete state;
+    };
+
+    // Logic to execute once we have the device locality for the server-side
+    // device.
+    auto dev_locality_callback = [this, state, peer_device, peer_task, key,
+                                  to_device, to_device_ctx, to_alloc_attr,
+                                  to_tensor, client_locality,
+                                  recv_buf_callback](const Status& s) {
+      if (!s.ok()) {
+        recv_buf_callback(s);
+      } else {
+        state->call.reset(new RecvBufCall(
+            step_id_, peer_device, peer_task, key, to_device, to_device_ctx,
+            to_alloc_attr, to_tensor, client_locality, state->server_locality,
+            &cancel_mgr_, worker_cache_));
+        state->call->Start(recv_buf_callback);
+      }
+    };
+
+    dev_resolver_->GetLocalityAsync(
+        peer_device, peer_task, &state->server_locality, dev_locality_callback);
+  }
+
+  void StartAbort(const Status& s) override {
+    CollectiveRemoteAccessLocal::StartAbort(s);
+    cancel_mgr_.StartCancel();
+  }
+
+ protected:
+  WorkerCacheInterface* worker_cache_;  // Not owned
+  CancellationManager cancel_mgr_;
+  RemoteMemoryManager* remote_memory_manager_;
+};
+
+}  // namespace
+
+CollectiveExecutor* GdrCollectiveExecutorMgr::Create(int64 step_id) {
+  CollectiveRemoteAccessDistributed* rma =
+      new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(),
+                                            worker_cache_, step_id,
+                                            remote_memory_manager_);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_,
+                                    &gpu_ring_order_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..1417e51e82c31035f058e8e9b546e04fb0ad97b8
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+class WorkerCacheInterface;
+class StepSequenceRequest;
+class StepSequenceResponse;
+
+// An implementation of CollectiveExecutorMgr for a distributed environment
+// that uses WorkerInterface::RecvBufAsync to route data transfers over RDMA.
+class GdrCollectiveExecutorMgr : public RpcCollectiveExecutorMgr {
+ public:
+  GdrCollectiveExecutorMgr(
+      const ConfigProto& config, const DeviceMgr* dev_mgr,
+      std::unique_ptr<DeviceResolverDistributed> dev_resolver,
+      std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
+      WorkerCacheInterface* worker_cache, const string& task_name,
+      RemoteMemoryManager* remote_memory_manager)
+      : RpcCollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver),
+                                 std::move(param_resolver), worker_cache,
+                                 task_name),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~GdrCollectiveExecutorMgr() override {}
+
+ protected:
+  virtual CollectiveExecutor* Create(int64 step_id) override;
+
+ private:
+  RemoteMemoryManager* remote_memory_manager_;  // Not owned.
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index ce1875151597f926aeb6392e7fc8307312da123f..9b8e832fd96c898d11fe817dcf3472b92293270f 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -73,7 +73,10 @@ int TryToReadNumaNode(ibv_device* device) {
 
   std::ifstream ifs(filename.c_str());
   string content;
-  CHECK(std::getline(ifs, content));
+  const auto& ret = std::getline(ifs, content);
+  if (!ret) {
+    return port::kNUMANoAffinity;
+  }
 
   int32 value;
   if (strings::safe_strto32(content, &value)) {
@@ -247,10 +250,9 @@ Status GdrMemoryManager::Init() {
   LOG(INFO) << "Instrumenting CPU allocator(s)";
 
   for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
-    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(numa_idx,
-                                                          alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(numa_idx,
-                                                         free_visitor);
+    GPUProcessState::singleton()->AddGpuHostAllocVisitor(numa_idx,
+                                                         alloc_visitor);
+    GPUProcessState::singleton()->AddGpuHostFreeVisitor(numa_idx, free_visitor);
   }
 
   if (IsGDRAvailable()) {
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 5f8c300155770ed03ad12a9fa5ac74456edaf024..1124dff741309d8fd04954e70c5ebaaf164b940a 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -167,8 +167,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
 
     // RendezvousMgr already aborted, shouldn't send RPC call any more
     if (!call->status().ok()) {
-      done(call->status(), Args(), Args(), Tensor(), false);
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(call->status(), Args(), Args(), Tensor(), false);
       delete call;
       return;
     }
@@ -181,8 +184,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
       // If StartAbort was called prior to DeregisterCall, then the
       // current status should be bad.
       Status s = call->status();
-      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
       delete call;
       Unref();
     });
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index dc0d5d548b80d36409778ef34e63171441f10142..c39cc0f9bcecc26aedfaf9707113210acf670244 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/contrib/gdr/gdr_server_lib.h"
 
 #include "grpc/support/alloc.h"
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 #include "tensorflow/contrib/gdr/gdr_rendezvous_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_worker.h"
-
-#include "grpc/support/alloc.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 
 namespace tensorflow {
 
@@ -57,10 +59,34 @@ Status GdrServer::Init() {
     return std::unique_ptr<GdrWorker>(
         new GdrWorker(env, config, remote_memory_manager_.get()));
   };
-
+  CollectiveMgrCreationFunction collective_mgr_func =
+      [this](const ConfigProto& config, const WorkerEnv* env,
+             WorkerCacheInterface* worker_cache) {
+        string unused;
+        string default_worker_name;
+        DeviceNameUtils::SplitDeviceName(
+            env->device_mgr->ListDevices()[0]->name(), &default_worker_name,
+            &unused);
+
+        std::unique_ptr<DeviceResolverDistributed> dev_resolver(
+            new DeviceResolverDistributed(env->device_mgr, worker_cache,
+                                          default_worker_name));
+        std::unique_ptr<CollectiveParamResolverDistributed> param_resolver(
+            new CollectiveParamResolverDistributed(
+                config, env->device_mgr, dev_resolver.get(), worker_cache,
+                default_worker_name));
+        return new GdrCollectiveExecutorMgr(
+            config, env->device_mgr, std::move(dev_resolver),
+            std::move(param_resolver), worker_cache, default_worker_name,
+            remote_memory_manager_.get());
+      };
   TF_RETURN_IF_ERROR(remote_memory_manager_->Init());
 
-  return GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func);
+  GrpcServerOptions opts;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  opts.collective_mgr_func = collective_mgr_func;
+  opts.worker_func = worker_func;
+  return GrpcServer::Init(opts);
 }
 
 Status GdrServer::Start() {
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 016e5ea27b397830c69b6e1761b5994ebcfa9c3d..1204b8ca501a8f99ea6abd6c047ab2d91350bae1 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/gdr/gdr_worker.h"
 
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
@@ -40,13 +42,13 @@ GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config,
                      RemoteMemoryManager* remote_memory_manager)
     : GrpcWorker(worker_env, config),
       remote_memory_manager_(remote_memory_manager),
-      recv_tensor_recent_request_ids_(100000) {}
+      recent_request_ids_(100000) {}
 
 void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                                     const RecvTensorRequest* request,
                                     ::grpc::ByteBuffer* response,
                                     StatusCallback done) {
-  Status s = recv_tensor_recent_request_ids_.TrackUnique(
+  Status s = recent_request_ids_.TrackUnique(
       request->request_id(), "RecvTensor (GdrWorker)", *request);
   if (!s.ok()) {
     done(s);
@@ -145,4 +147,41 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GdrWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                             RecvBufResponse* response, StatusCallback done) {
+  // This is an RDMA enabled implementation augmenting grpc.
+  Status s = recent_request_ids_.TrackUnique(request->request_id(),
+                                             "RecvBuf (GdrWorker)", *request);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  CollectiveExecutor::Handle ce_handle(
+      env_->collective_executor_mgr->FindOrCreate(request->step_id()), true);
+  CollectiveRemoteAccess* rma = ce_handle.get()->remote_access();
+  rma->buf_rendezvous()->ConsumeBuf(
+      request->buf_rendezvous_key(),
+      [this, request, response, done](const Status& status,
+                                      BufRendezvous::Hook* hook) {
+        Status s = status;
+        if (s.ok()) {
+          if (!DMAHelper::CanUseDMA(hook->prod_value)) {
+            s = errors::Internal("Tensor value for key ",
+                                 request->buf_rendezvous_key(),
+                                 " is not of a type supported by RecvBuf");
+          }
+        }
+        if (s.ok()) {
+          remote_memory_manager_->TransportOptionsFromTensor(
+              response->mutable_transport_options(), *hook->prod_value,
+              hook->prod_dev, hook->prod_ctx, hook->prod_attr.on_host(),
+              [this, response, done, hook](const Status& s) {
+                response->set_send_start_micros(env_->env->NowMicros());
+                done(s);
+                BufRendezvous::DoneWithHook(hook);
+              });
+        }
+      });
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 39f11e6bde5a1ca7ae91ead02279d22d70af027b..9a85cfd4263ad86f6579eedce95969c2829ff62c 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -38,9 +38,13 @@ class GdrWorker : public GrpcWorker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done) override;
 
+  virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                            RecvBufResponse* response,
+                            StatusCallback done) override;
+
  private:
   RemoteMemoryManager* remote_memory_manager_;  // Not owned
-  RecentRequestIds recv_tensor_recent_request_ids_;
+  RecentRequestIds recent_request_ids_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index e79ccd8da1f8952758ae322d3a92dec34910a9db..5b37239665d46db38fc249e9004d2200abb3d610 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 from copy import deepcopy
 from functools import partial
 from six import iteritems
-from six import iterkeys
 from six import string_types
 from six import StringIO
 from tensorflow.contrib.graph_editor import reroute
@@ -735,9 +734,8 @@ def graph_replace(target_ts, replacement_ts, dst_scope="",
   # control dependencies.
   graph = util.get_unique_graph(flatten_target_ts, check_types=(tf_ops.Tensor))
   control_ios = util.ControlOutputs(graph)
-  ops = select.get_walks_intersection_ops(list(iterkeys(replacement_ts)),
-                                          flatten_target_ts,
-                                          control_ios=control_ios)
+  ops = select.get_walks_intersection_ops(
+      list(replacement_ts), flatten_target_ts, control_ios=control_ios)
   if not ops:
     raise ValueError("Targets and replacements are not connected!")
 
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 0081fb61770075a2c36e92f65e01126f657edeb4..92016e6a83975a9b15a39a15125e0eabc111912e 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -16,9 +16,31 @@ tf_cc_binary(
     srcs = ["hvx_ops_support_checker_main.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:io_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:logging_ops_op_lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:sparse_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:user_ops_op_lib",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/core/kernels/hexagon:graph_transferer",
         "//tensorflow/tools/graph_transforms:file_utils",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 7b30e1776ffc4e8dc552bdadcb3d7017ff77bf57..c1f6cac4942436d32f9867d4b5557c6b9e376c69 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -98,6 +98,7 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
 >>>
@@ -117,7 +118,15 @@ Using this ability we can calculate gradients on the nodes the data is stored on
 
 Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL), we can specify the number of partitions the data will be partitioned on. For example, if an Apache Ignite cluster consists of 10 machines and we create cache with 10 partitions, then every machine will maintain approximately one data partition.
 
-Ignite Dataset allows using these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that can be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach, we can assign a specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
+Ignite Dataset allows using these two aspects of distributed neural network
+training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a
+computation graph operation that can be performed on a remote worker. The remote
+worker can override Ignite Dataset parameters (such as `host`, `port` or `part`)
+by setting correspondent environment variables for worker process (such as
+`IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using
+this overriding approach, we can assign a specific partition to every worker so
+that one worker handles one partition and, at the same time, transparently work
+with single dataset.
 
 ```python
 >>> import tensorflow as tf
@@ -150,23 +159,31 @@ system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS
 delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in
 addition to its own APIs, IGFS implements Hadoop FileSystem API and can be
 transparently plugged into Hadoop or Spark deployments. This contrib package
-contains an integration between IGFS and TensorFlow. The integration is based
-on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys)
-from TensorFlow side and
+contains an integration between IGFS and TensorFlow. The integration is based on
+[custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from
+TensorFlow side and
 [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache
-Ignite side. It has numerous uses, for example: * Checkpoints of state can be
-saved to IGFS for reliability and fault-tolerance. * Training processes
-communicate with TensorBoard by writing event files to a directory, which
-TensorBoard watches. IGFS allows this communication to work even when
-TensorBoard runs in a different process or machine.
+Ignite side. It has numerous uses, for example:
+
+*   Checkpoints of state can be saved to IGFS for reliability and
+    fault-tolerance.
+*   Training processes communicate with TensorBoard by writing event files to a
+    directory, which TensorBoard watches. IGFS allows this communication to work
+    even when TensorBoard runs in a different process or machine.
 
 ### SSL Connection
 
-Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
+Apache Ignite allows to protect data transfer channels by
+[SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and
+authentication. Ignite Dataset supports both SSL connection with and without
+authentication. For more information, please refer to the
+[Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls)
+documentation.
 
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES",
                             certfile="client.pem",
@@ -187,7 +204,7 @@ Following examples will help you to easily start working with this module.
 
 The simplest way to try Ignite Dataset is to run a
 [Docker](https://www.docker.com/) container with Apache Ignite and loaded
-[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with
+[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interrupt with
 it using Ignite Dataset. Such container is available on Docker Hub:
 [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/).
 You need to start this container on your machine:
@@ -198,13 +215,13 @@ docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
 
 After that you will be able to work with it following way:
 
-![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
+![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist-2.png "Ignite Dataset Mnist")
 
 ### IGFS
 
 The simplest way to try IGFS with TensorFlow is to run
 [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS
-and then interruct with it using TensorFlow
+and then interrupt with it using TensorFlow
 [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container
 is available on Docker Hub:
 [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/).
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 66e654ca636a5a051c6f9cd35bf9001dfbcbf7f4..3ffceef8070e0fc3b3cebae2522f89fe98ce4413 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -735,8 +735,6 @@ class IgniteDataset(dataset_ops.DatasetSource):
       cert_password: Password to be used if the private key is encrypted and a
         password is necessary.
     """
-    super(IgniteDataset, self).__init__()
-
     with IgniteClient(host, port, username, password, certfile, keyfile,
                       cert_password) as client:
       client.handshake()
@@ -760,6 +758,8 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
         self.cache_type.to_output_classes())
 
+    super(IgniteDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
                                           self.local, self.part, self.page_size,
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ff5d4c458c859fd8e5e3ae65ee41a454d55d6538..89b74fbfdc38c9f42795d5c778889210baf6387f 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 import os
 
+from tensorflow import compat
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -66,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset_ops.make_one_shot_iterator(dataset)
+    it = compat.v1.data.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index bbb3a3b18fd7bfdc68e8b8532568985245154794..f97e790b56c511ffb7859b4120b7a4220b75c506 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -55,9 +55,10 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
                           &tranformation_matrix));
   // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
-  internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
-      delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
-      tranformation_matrix.flat<float>().size());
+  TF_CHECK_OK(CudaLaunchKernel(internal::compute_tranformation_matrix_cuda, 1,
+                               1, 0, cu_stream, delta_h, scale_s, scale_v,
+                               tranformation_matrix.flat<float>().data(),
+                               tranformation_matrix.flat<float>().size()));
   // Call cuBlas C = A * B directly.
   auto no_transpose = se::blas::Transpose::kNoTranspose;
   auto a_ptr =
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index b25a6f7b5742917a032946fe03a0dab20e7dc1ad..05ba9155c401b538a130958504ee919574480d75 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -518,7 +518,7 @@ def connected_components(images):
     def has_zero():
       # Insert a zero in the consecutive ids where zero appears in unique_ids.
       # id_is_zero has length 1.
-      zero_id_ind = math_ops.to_int32(id_is_zero[0])
+      zero_id_ind = math_ops.cast(id_is_zero[0], dtypes.int32)
       ids_before = nonzero_consecutive_ids[:zero_id_ind]
       ids_after = nonzero_consecutive_ids[zero_id_ind:]
       return array_ops.concat([ids_before, [0], ids_after], axis=0)
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
deleted file mode 100644
index 8cdf16103bab2b22d51c144d21a589e1e39f2f0b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/kafka/ops/kafka_ops.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-namespace tensorflow {
-
-REGISTER_OP("KafkaDataset")
-    .Input("topics: string")
-    .Input("servers: string")
-    .Input("group: string")
-    .Input("eof: bool")
-    .Input("timeout: int64")
-    .Output("handle: variant")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that emits the messages of one or more Kafka topics.
-
-topics: A `tf.string` tensor containing one or more subscriptions,
-  in the format of [topic:partition:offset:length],
-  by default length is -1 for unlimited.
-servers: A list of bootstrap servers.
-group: The consumer group id.
-eof: If True, the kafka reader will stop on EOF.
-timeout: The timeout value for the Kafka Consumer to wait
-  (in millisecond).
-)doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
index 08ebcdb544645d3585a1af25c86c6182a1589dcb..3651275f935b50ac9d21bb831fd257eb22a6b793 100644
--- a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -49,7 +50,8 @@ class KafkaDatasetTest(test.TestCase):
         topics, group="test", eof=True).repeat(num_epochs)
     batch_dataset = repeat_dataset.batch(batch_size)
 
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    iterator = iterator_ops.Iterator.from_structure(
+        dataset_ops.get_legacy_output_types(batch_dataset))
     init_op = iterator.make_initializer(repeat_dataset)
     init_batch_op = iterator.make_initializer(batch_dataset)
     get_next = iterator.get_next()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
old mode 100644
new mode 100755
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index b399e1b6c2ac47db205b5d8bbc81875ef5c08a31..5591c3b0cc8c8bf196bb4821c018cbf155cba4ce 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -52,7 +52,6 @@ class KafkaDataset(dataset_ops.DatasetSource):
       timeout: The timeout value for the Kafka Consumer to wait
                (in millisecond).
     """
-    super(KafkaDataset, self).__init__()
     self._topics = ops.convert_to_tensor(
         topics, dtype=dtypes.string, name="topics")
     self._servers = ops.convert_to_tensor(
@@ -63,6 +62,8 @@ class KafkaDataset(dataset_ops.DatasetSource):
     self._timeout = ops.convert_to_tensor(
         timeout, dtype=dtypes.int64, name="timeout")
 
+    super(KafkaDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.kafka_dataset(self._topics, self._servers,
                                          self._group, self._eof, self._timeout)
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
index c4476a7bbd5056fa898468a46031bf3d8b1e44cf..b12832d2e2a3cccb4948d9e3bf3d226030121ac2 100644
--- a/tensorflow/contrib/keras/api/keras/losses/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import categorical_hinge
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
index 7317fdb52c5b79e787a49d71be49f5261d6b1fff..095b5d798df9ac9038fa1088cdd402dff304e87e 100644
--- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -23,7 +23,7 @@ from tensorflow.python.keras.metrics import binary_accuracy
 from tensorflow.python.keras.metrics import binary_crossentropy
 from tensorflow.python.keras.metrics import categorical_accuracy
 from tensorflow.python.keras.metrics import categorical_crossentropy
-from tensorflow.python.keras.metrics import cosine_proximity
+from tensorflow.python.keras.metrics import cosine_similarity
 from tensorflow.python.keras.metrics import hinge
 from tensorflow.python.keras.metrics import kullback_leibler_divergence
 from tensorflow.python.keras.metrics import mean_absolute_error
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index 294a7d69a704b3c06ab9e30489af116929ab6c2a..0d43bc2101bc3d189bb48f35b0a801f4e77030bb 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -80,7 +80,7 @@ def sparse_multiclass_hinge_loss(
           ' {}'.format(logits_rank))
     logits_shape = array_ops.shape(logits)
     batch_size, num_classes = logits_shape[0], logits_shape[1]
-    logits = math_ops.to_float(logits)
+    logits = math_ops.cast(logits, dtypes.float32)
 
     # Check labels have valid type.
     if labels.dtype != dtypes.int32 and labels.dtype != dtypes.int64:
diff --git a/tensorflow/contrib/kinesis/python/kernel_tests/kinesis_test.py b/tensorflow/contrib/kinesis/python/kernel_tests/kinesis_test.py
index bf89922318b9b9a569e4bd1d71fe6283810cadda..af7018f8368116172511b3f78c42caf3fc215632 100644
--- a/tensorflow/contrib/kinesis/python/kernel_tests/kinesis_test.py
+++ b/tensorflow/contrib/kinesis/python/kernel_tests/kinesis_test.py
@@ -29,6 +29,7 @@ from __future__ import print_function
 import boto3
 
 from tensorflow.contrib.kinesis.python.ops import kinesis_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -59,7 +60,8 @@ class KinesisDatasetTest(test.TestCase):
         stream, read_indefinitely=False).repeat(num_epochs)
     batch_dataset = repeat_dataset.batch(batch_size)
 
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    iterator = iterator_ops.Iterator.from_structure(
+        dataset_ops.get_legacy_output_types(batch_dataset))
     init_op = iterator.make_initializer(repeat_dataset)
     init_batch_op = iterator.make_initializer(batch_dataset)
     get_next = iterator.get_next()
@@ -102,7 +104,8 @@ class KinesisDatasetTest(test.TestCase):
         stream, shard, read_indefinitely=False).repeat(num_epochs)
     batch_dataset = repeat_dataset.batch(batch_size)
 
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    iterator = iterator_ops.Iterator.from_structure(
+        dataset_ops.get_legacy_output_types(batch_dataset))
     init_op = iterator.make_initializer(repeat_dataset)
     init_batch_op = iterator.make_initializer(batch_dataset)
     get_next = iterator.get_next()
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 588f15b867c1fedbadd5a5d945d870a356549468..7e19ae7c13df421ec5bb9cb0e07dff0d00fb9548 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -155,7 +155,7 @@ py_library(
         ":core",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:numerics",
         "//tensorflow/python:random_ops",
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py
index 0c6bba758b429a8c4112bc6abb2fae542b5dfc14..8ee554ffa7ab6bbcc2d36c525ad68e03bacb594b 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py
@@ -321,8 +321,8 @@ class LabeledTensor(object):
     for (d, axis) in zip(shape, unvalidated_axes.values()):
       if d != axis.size:
         raise ValueError(
-            'Provided axis size %d does not match tensor dimension size %d' %
-            (axis.size, d))
+            'Provided axis size %d does not match tensor dimension size %d'
+            'in tensor %r' % (axis.size, d, tensor))
 
     self._axes = unvalidated_axes
 
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops.py b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
index 2ede5daee74223e812cc29e9708b1989b698fb4e..a65f045cc886f4d4f351423858d92412baa3a622 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn as map_fn_lib
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.ops import random_ops
@@ -629,7 +630,7 @@ def map_fn(fn, labeled_tensor, name=None):
 
     # TODO(ericmc): Fix this upstream.
     if labeled_tensor.dtype == dtypes.string:
-      # We must construct the full graph here, because functional_ops.map_fn
+      # We must construct the full graph here, because map_fn_lib.map_fn
       # doesn't work for string-valued tensors.
       # Constructing the full graph may be slow.
       map_lts = [fn(t) for t in unpack_lts]
@@ -652,7 +653,7 @@ def map_fn(fn, labeled_tensor, name=None):
         tensor_lt = core.LabeledTensor(tensor, original_axes)
         return fn(tensor_lt).tensor
 
-      map_op = functional_ops.map_fn(
+      map_op = map_fn_lib.map_fn(
           tf_fn, labeled_tensor.tensor, dtype=first_map_lt.dtype)
       map_lt = core.LabeledTensor(map_op, final_axes)
 
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 9ca6f8df5dbe3c236c4cd85095176ce69ad9deaa..69d5496f8aebb9b89c5d79f80a1a439f556093d7 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -81,6 +81,7 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 00d819ed0e9fe3a5644105a571beda100204631e..f52aaaf7b7983b4ee0f779a8ed8e163781643222 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -840,7 +840,7 @@ class _WeightedSparseColumn(
       # The weight tensor can be a regular Tensor. In such case, sparsify it.
       weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
     if not self.dtype.is_floating:
-      weight_tensor = math_ops.to_float(weight_tensor)
+      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
     return tuple([id_tensor, weight_tensor])
 
   def insert_transformed_feature(self, columns_to_tensors):
@@ -1731,7 +1731,7 @@ class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
     """
     # Transform the input tensor according to the normalizer function.
     input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
-    columns_to_tensors[self] = math_ops.to_float(input_tensor)
+    columns_to_tensors[self] = math_ops.cast(input_tensor, dtypes.float32)
 
   # pylint: disable=unused-argument
   def _to_dnn_input_layer(self,
@@ -1871,7 +1871,7 @@ class _RealValuedColumn(
     """
     # Transform the input tensor according to the normalizer function.
     input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
-    columns_to_tensors[self] = math_ops.to_float(input_tensor)
+    columns_to_tensors[self] = math_ops.cast(input_tensor, dtypes.float32)
 
   # pylint: disable=unused-argument
   def _to_dnn_input_layer(self,
@@ -1881,7 +1881,7 @@ class _RealValuedColumn(
                           output_rank=2):
     input_tensor = self._to_dense_tensor(input_tensor)
     if input_tensor.dtype != dtypes.float32:
-      input_tensor = math_ops.to_float(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.float32)
     return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
 
   def _to_dense_tensor(self, input_tensor):
@@ -1897,8 +1897,8 @@ class _RealValuedColumn(
     return inputs.get(self)
 
   def _transform_feature(self, inputs):
-    return math_ops.to_float(
-        self._normalized_input_tensor(inputs.get(self.name)))
+    return math_ops.cast(
+        self._normalized_input_tensor(inputs.get(self.name)), dtypes.float32)
 
   @property
   def _parse_example_spec(self):
@@ -2104,7 +2104,7 @@ class _BucketizedColumn(
       raise ValueError("BucketizedColumn currently only supports output_rank=2")
     return array_ops.reshape(
         array_ops.one_hot(
-            math_ops.to_int64(input_tensor),
+            math_ops.cast(input_tensor, dtypes.int64),
             self.length,
             1.,
             0.,
@@ -2136,8 +2136,10 @@ class _BucketizedColumn(
       i2 = array_ops.zeros([batch_size], dtype=dtypes.int32, name="zeros")
       bucket_indices = array_ops.reshape(input_tensor, [-1], name="reshape")
 
-    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
-    shape = math_ops.to_int64(array_ops.stack([batch_size, dimension]))
+    indices = math_ops.cast(array_ops.transpose(array_ops.stack((i1, i2))),
+                            dtypes.int64)
+    shape = math_ops.cast(array_ops.stack([batch_size, dimension]),
+                          dtypes.int64)
     sparse_id_values = sparse_tensor_py.SparseTensor(
         indices, bucket_indices, shape)
 
@@ -2527,7 +2529,7 @@ class DataFrameColumn(_FeatureColumn,
                           trainable=True,
                           output_rank=2):
     if input_tensor.dtype != dtypes.float32:
-      input_tensor = math_ops.to_float(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.float32)
     return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
 
   def _to_dense_tensor(self, input_tensor):
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb..00e41026d0038409ace178e6affd2c1cdc812122 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -1757,7 +1757,7 @@ class WeightedSumTest(test.TestCase):
       logits_core = fc_core.linear_model(features, [movies])
 
       with self.cached_session() as sess:
-        variables_lib.initialize_all_variables().run()
+        variables_lib.global_variables_initializer().run()
         lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 403b522ce45ac6ad98a321378626b87aaa7738aa..1d959b3c78445977b4fe74ee6c20c86aaf7f86da 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -2308,11 +2308,13 @@ def layer_norm(inputs,
           initializer=init_ops.ones_initializer(),
           collections=gamma_collections,
           trainable=trainable)
-    # Calculate the moments on the last axis (layer activations).
+    # By default, compute the moments across all the dimensions except the one with index 0.
     norm_axes = list(range(begin_norm_axis, inputs_rank))
     mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
     # Compute layer normalization using the batch_normalization function.
-    variance_epsilon = 1e-12
+    # Note that epsilon must be increased for float16 due to the limited
+    # representable range.
+    variance_epsilon = 1e-12 if dtype != dtypes.float16 else 1e-3
     outputs = nn.batch_normalization(
         inputs,
         mean,
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 1c0088186c030437454c0f764decab9e5a276adc..90fd55cf3898586dec3313c238df8b3952b8b349 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1399,9 +1399,10 @@ class DropoutTest(test.TestCase):
     with self.cached_session() as sess:
       images = random_ops.random_uniform(
           (5, height, width, 3), seed=1, name='images')
-      num_elem_initial = math_ops.reduce_mean(math_ops.to_float(images > 0))
+      num_elem_initial = math_ops.reduce_mean(
+          math_ops.cast(images > 0, dtypes.float32))
       output = _layers.dropout(images)
-      num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
+      num_elem = math_ops.reduce_mean(math_ops.cast(output > 0, dtypes.float32))
       num_elem, num_elem_initial = sess.run([num_elem, num_elem_initial])
       self.assertLess(num_elem, num_elem_initial / 2 + 0.1)
       self.assertGreater(num_elem, num_elem_initial / 2 - 0.1)
@@ -1421,9 +1422,10 @@ class DropoutTest(test.TestCase):
     with self.cached_session() as sess:
       images = random_ops.random_uniform(
           (5, height, width, 3), seed=1, name='images')
-      num_elem_initial = math_ops.reduce_mean(math_ops.to_float(images > 0))
+      num_elem_initial = math_ops.reduce_mean(
+          math_ops.cast(images > 0, dtypes.float32))
       output = _layers.dropout(images, is_training=False)
-      num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
+      num_elem = math_ops.reduce_mean(math_ops.cast(output > 0, dtypes.float32))
       num_elem, num_elem_initial = sess.run([num_elem, num_elem_initial])
       self.assertEqual(num_elem, num_elem_initial)
       outputs, inputs = sess.run([output, images])
@@ -1435,9 +1437,10 @@ class DropoutTest(test.TestCase):
       images = random_ops.random_uniform(
           (5, height, width, 3), seed=1, name='images')
       output = _layers.fully_connected(images, 50)
-      num_elem_initial = math_ops.reduce_mean(math_ops.to_float(output > 0))
+      num_elem_initial = math_ops.reduce_mean(
+          math_ops.cast(output > 0, dtypes.float32))
       output = _layers.dropout(output)
-      num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
+      num_elem = math_ops.reduce_mean(math_ops.cast(output > 0, dtypes.float32))
       sess.run(variables_lib.global_variables_initializer())
       num_elem, num_elem_initial = sess.run([num_elem, num_elem_initial])
       self.assertLess(num_elem, num_elem_initial / 2 + 0.1)
@@ -1450,7 +1453,7 @@ class DropoutTest(test.TestCase):
           (5, height, width, 3), seed=1, name='images')
       output = _layers.fully_connected(
           images, 50, normalizer_fn=_layers.dropout)
-      num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
+      num_elem = math_ops.reduce_mean(math_ops.cast(output > 0, dtypes.float32))
       sess.run(variables_lib.global_variables_initializer())
       num_elem = sess.run(num_elem)
       self.assertLess(num_elem, 0.5)
@@ -2869,10 +2872,19 @@ class LayerNormTest(test.TestCase):
                    tol=1e-5,
                    begin_norm_axis=1,
                    dtype=dtypes.float64):
+    eps = 1e-12 if dtype != dtypes.float16 else 1e-3
     expected_mean = np.zeros(input_shape[:begin_norm_axis])
-    expected_var = np.ones(input_shape[:begin_norm_axis])
-    for mu in [0.0, 1e2]:
-      for sigma in [1.0, 0.1]:
+    expected_var_uncorrected = np.ones(input_shape[:begin_norm_axis])
+    sigma_list = [1.0, 0.1]
+    if dtype == dtypes.float16:
+      # This causes the variance to underflow in float16, and requires that
+      # variance_epsilon be set appropriately to avoid NaNs in the output.
+      sigma_list.append(1e-4)
+    # Note that the mean:variance ratio must be limited to the representable
+    # range for float16.
+    for mu in [0.0, 1e2 if dtype != dtypes.float16 else 1e1]:
+      for sigma in sigma_list:
+        expected_var = expected_var_uncorrected / (1.0 + eps / sigma**2)
         input_values = np.random.randn(*input_shape) * sigma + mu
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
@@ -2893,10 +2905,13 @@ class LayerNormTest(test.TestCase):
             outputs, beta, gamma = sess.run((output_t, beta_var, gamma_var))
             # Make sure that there are no NaNs
             self.assertFalse(np.isnan(outputs).any())
+            if outputs.dtype != np.float64:
+              # Cast to float64 before computing mean/variance to avoid
+              # overflow and precision issues.
+              outputs = outputs.astype(np.float64)
             mean = np.mean(outputs, axis=moments_axis)
             var = np.var(outputs, axis=moments_axis)
             # Layer-norm implemented in numpy
-            eps = 1e-12
             expected_out = (
                 (gamma * (input_values - np.mean(
                     input_values, axis=moments_axis, keepdims=True)) /
@@ -2933,6 +2948,12 @@ class LayerNormTest(test.TestCase):
   def testOutputBigInput(self):
     self.doOutputTest((1, 100, 100, 1))
 
+  def testOutputBigInputFloat32(self):
+    self.doOutputTest((1, 100, 1000, 1), tol=1e-4, dtype=dtypes.float32)
+
+  def testOutputBigInputFloat16(self):
+    self.doOutputTest((1, 100, 1000, 1), tol=5e-2, dtype=dtypes.float16)
+
 
 class GDNTest(test.TestCase):
 
diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py
index 2fdcd849b026d52ed4aff724838f6c71e3a315d0..1ccc8f012f90e3240f1156ff1970321c7c4510f0 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import six
 
 from tensorflow.contrib import framework as contrib_framework
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -109,11 +110,12 @@ def optimize_loss(loss,
     gradient_multipliers: dict of variables or variable names to floats.
                           If present, gradients for specified
                           variables will be multiplied by given constant.
-    clip_gradients: float, callable or `None`. If float, is provided, a global
-      clipping is applied to prevent the norm of the gradient to exceed this
-      value. Alternatively, a callable can be provided e.g.: adaptive_clipping.
-      This callable takes a `list` of `(gradients, variables)` `tuple`s and
-      returns the same thing with the gradients modified.
+    clip_gradients: float, callable or `None`. If a float is provided, a global
+      clipping is applied to prevent the norm of the gradient from exceeding
+      this value. Alternatively, a callable can be provided, e.g.,
+      `adaptive_clipping_fn()`.  This callable takes a list of 
+      `(gradients, variables)` tuples and returns the same thing with the 
+      gradients modified.
     learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                             `Tensor`s, returns `Tensor`.
                             Can be used to implement any learning rate decay
@@ -324,7 +326,7 @@ def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
 
     # quicker adaptation at the beginning
     if global_step is not None:
-      n = math_ops.to_float(global_step)
+      n = math_ops.cast(global_step, dtypes.float32)
       decay = math_ops.minimum(decay, n / (n + 1.))
 
     # update averages
diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 8a6b4f68a8b33d497ddb16614a7e3cdf32f2c422..131b1e0dba28f2498cd11254dad1d5790f5b7c04 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.losses.python.losses import loss_ops
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -185,7 +186,8 @@ class _TargetColumn(object):
       return None
     else:
       return array_ops.reshape(
-          math_ops.to_float(features[self._weight_column_name]), shape=(-1,))
+          math_ops.cast(features[self._weight_column_name], dtypes.float32),
+          shape=(-1,))
 
   @property
   def problem_type(self):
@@ -252,9 +254,10 @@ class _TargetColumn(object):
     if weight_tensor is None:
       return math_ops.reduce_mean(loss_unweighted, name="loss")
     loss_weighted = self._weighted_loss(loss_unweighted, weight_tensor)
-    return math_ops.div(math_ops.reduce_sum(loss_weighted),
-                        math_ops.to_float(math_ops.reduce_sum(weight_tensor)),
-                        name="loss")
+    return math_ops.div(
+        math_ops.reduce_sum(loss_weighted),
+        math_ops.cast(math_ops.reduce_sum(weight_tensor), dtypes.float32),
+        name="loss")
 
 
 class _RegressionTargetColumn(_TargetColumn):
@@ -323,7 +326,7 @@ class _MultiClassTargetColumn(_TargetColumn):
       metrics = {("accuracy", "classes"): metric_ops.streaming_accuracy}
 
     predictions = math_ops.sigmoid(logits)
-    labels_float = math_ops.to_float(labels)
+    labels_float = math_ops.cast(labels, dtypes.float32)
 
     default_metrics = self._default_eval_metrics()
     for metric_name, metric_op in default_metrics.items():
@@ -399,7 +402,8 @@ def _mean_squared_loss(logits, target):
     target = array_ops.expand_dims(target, axis=1)
 
   logits.get_shape().assert_is_compatible_with(target.get_shape())
-  return math_ops.square(logits - math_ops.to_float(target))
+  return math_ops.squared_difference(logits,
+                                     math_ops.cast(target, dtypes.float32))
 
 
 def _log_loss_with_two_classes(logits, target):
@@ -407,7 +411,7 @@ def _log_loss_with_two_classes(logits, target):
   if len(target.get_shape()) == 1:
     target = array_ops.expand_dims(target, axis=1)
   loss_vec = nn.sigmoid_cross_entropy_with_logits(
-      labels=math_ops.to_float(target), logits=logits)
+      labels=math_ops.cast(target, dtypes.float32), logits=logits)
   return loss_vec
 
 
@@ -475,7 +479,7 @@ def get_default_binary_metrics_for_eval(thresholds):
 def _float_weights_or_none(weights):
   if weights is None:
     return None
-  return math_ops.to_float(weights)
+  return math_ops.cast(weights, dtypes.float32)
 
 
 def _labels_streaming_mean(unused_predictions, labels, weights=None):
@@ -494,8 +498,8 @@ def _streaming_auc(predictions, labels, weights=None):
 def _accuracy_at_threshold(threshold):
 
   def _accuracy_metric(predictions, labels, weights=None):
-    threshold_predictions = math_ops.to_float(
-        math_ops.greater_equal(predictions, threshold))
+    threshold_predictions = math_ops.cast(
+        math_ops.greater_equal(predictions, threshold), dtypes.float32)
     return metric_ops.streaming_accuracy(
         predictions=threshold_predictions, labels=labels, weights=weights)
 
diff --git a/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py b/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py
index 91684dc61e40efe3f2408c3d5f1f4eb2764bc558..934a7f06069cb85c74ded8237bcfefce0771aa02 100644
--- a/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py
+++ b/tensorflow/contrib/layers/python/ops/sparse_feature_cross_op.py
@@ -86,11 +86,11 @@ def sparse_feature_cross(inputs, hashed_output=False, num_buckets=0,
   internal_type = dtypes.string
   for i in range(len(values)):
     if values[i].dtype != dtypes.string:
-      values[i] = math_ops.to_int64(values[i])
+      values[i] = math_ops.cast(values[i], dtypes.int64)
       internal_type = dtypes.int64
   for i in range(len(dense_inputs)):
     if dense_inputs[i].dtype != dtypes.string:
-      dense_inputs[i] = math_ops.to_int64(dense_inputs[i])
+      dense_inputs[i] = math_ops.cast(dense_inputs[i], dtypes.int64)
       internal_type = dtypes.int64
 
   if hash_key:
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 14065fcee51c014a1af227504eaaca1fa39941e1..3f0a91ccdc91ed0d8b3e383cf167180fbeb5c8bf 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -357,9 +357,9 @@ py_test(
 
 py_test(
     name = "dnn_linear_combined_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/learn/estimators/dnn_linear_combined_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["no_oss"],  # flaky b/70524820
     deps = [
@@ -387,6 +387,13 @@ py_test(
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["noasan"],  # times out b/63678675
+    deps = [":head_test_lib"],
+)
+
+py_library(
+    name = "head_test_lib",
+    srcs = ["python/learn/estimators/head_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":learn",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/learn/README.md b/tensorflow/contrib/learn/README.md
index b0bff915a993c9a01e2e6d9ef9f71c14d2f29a73..b2d3a6273abba7e3a893f30bbdd4f8b2662bd54a 100644
--- a/tensorflow/contrib/learn/README.md
+++ b/tensorflow/contrib/learn/README.md
@@ -111,18 +111,17 @@ Some arguments are renamed, please refer to documentation. In addition:
 
 Switch to `tf.estimator.train_and_evaluate`. Some differences:
 
-* Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
-  should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
-* Remove the `experiment_fn`. Instead, create the `Estimator`,
-  `train_spec` and `eval_spec`, then call `tf.estimator.train_and_evaluate`
-  directly.
-* Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement
-  for `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
-  replacement for `tf.contrib.learn.make_export_strategy`. If you want to export
-  only at the end of training  use `tf.estimator.FinalExporter`.
-* If the `TF_CONFIG` environment variable is constructed manually, please read
-  the `train_and_evaluate` documentation for the new requirementds (in
-  particular, the chief node and evaluator node).
+*   Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
+    should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
+*   Remove the `experiment_fn`. Instead, create the `Estimator`, `train_spec`
+    and `eval_spec`, then call `tf.estimator.train_and_evaluate` directly.
+*   Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement for
+    `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
+    replacement for `tf.contrib.learn.make_export_strategy`. If you want to
+    export only at the end of training use `tf.estimator.FinalExporter`.
+*   If the `TF_CONFIG` environment variable is constructed manually, please read
+    the `train_and_evaluate` documentation for the new requirements (in
+    particular, the chief node and evaluator node).
 
 ## Others Classes and Functions
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index b968aeed1b7a11d522b531783f04f0104b37904f..ab0ce6d581a9d65f91ace0f8453911e2ddf3e8b8 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -474,7 +474,7 @@ class DebugClassifierTest(test.TestCase):
     def _my_metric_op(predictions, labels):
       # For the case of binary classification, the 2nd column of "predictions"
       # denotes the model predictions.
-      labels = math_ops.to_float(labels)
+      labels = math_ops.cast(labels, dtypes.float32)
       predictions = array_ops.strided_slice(
           predictions, [0, 1], [-1, 2], end_mask=1)
       labels = math_ops.cast(labels, predictions.dtype)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index d46a873bfaa297e7f6242aa56e9d0bf0eb551867..4f636ce69ddc7dbd3bb083e5eaead384b5875e8b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -807,7 +807,7 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
     def _my_metric_op(predictions, labels):
       # For the case of binary classification, the 2nd column of "predictions"
       # denotes the model predictions.
-      labels = math_ops.to_float(labels)
+      labels = math_ops.cast(labels, dtypes.float32)
       predictions = array_ops.strided_slice(
           predictions, [0, 1], [-1, 2], end_mask=1)
       return math_ops.reduce_sum(math_ops.multiply(predictions, labels))
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index ee25cebd484f1e831fe8b6d3aa7290da7558adee..d779495720b1c8fd4c3c7f4b631dab8b49e4f3bd 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -815,7 +815,7 @@ class DNNClassifierTest(test.TestCase):
     def _my_metric_op(predictions, labels):
       # For the case of binary classification, the 2nd column of "predictions"
       # denotes the model predictions.
-      labels = math_ops.to_float(labels)
+      labels = math_ops.cast(labels, dtypes.float32)
       predictions = array_ops.strided_slice(
           predictions, [0, 1], [-1, 2], end_mask=1)
       labels = math_ops.cast(labels, predictions.dtype)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 28c4964527bb034c8c6b1642366c6c82c1a72201..7a96f6d3ea41ce7a672c3ba8b8a818500012945e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -37,8 +37,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell
@@ -372,9 +372,10 @@ class DynamicRnnEstimatorTest(test.TestCase):
         labels = array_ops.slice(random_sequence, [0, 0],
                                  [batch_size, sequence_length])
         inputs = array_ops.expand_dims(
-            math_ops.to_float(
+            math_ops.cast(
                 array_ops.slice(random_sequence, [0, 1],
-                                [batch_size, sequence_length])), 2)
+                                [batch_size, sequence_length]),
+                dtypes.float32), 2)
         input_dict = {
             dynamic_rnn_estimator._get_state_name(i): random_ops.random_uniform(
                 [batch_size, cell_size], seed=((i + 1) * seed))
@@ -430,9 +431,10 @@ class DynamicRnnEstimatorTest(test.TestCase):
         labels = array_ops.slice(sequence, [0, 0],
                                  [batch_size, sequence_length])
         inputs = array_ops.expand_dims(
-            math_ops.to_float(
+            math_ops.cast(
                 array_ops.slice(sequence, [0, 1], [batch_size, sequence_length
-                                                  ])), 2)
+                                                  ]),
+                dtypes.float32), 2)
         input_dict = state_dict
         input_dict['inputs'] = inputs
         return input_dict, labels
@@ -524,7 +526,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
       def input_fn():
         starts = random_ops.random_uniform(
             [batch_size], maxval=(2 * np.pi), seed=seed)
-        sin_curves = functional_ops.map_fn(
+        sin_curves = map_fn.map_fn(
             _sin_fn, (starts,), dtype=dtypes.float32)
         inputs = array_ops.expand_dims(
             array_ops.slice(sin_curves, [0, 0], [batch_size, sequence_length]),
@@ -587,9 +589,11 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
         labels = array_ops.slice(random_sequence, [0, 0],
                                  [batch_size, sequence_length])
         inputs = array_ops.expand_dims(
-            math_ops.to_float(
+            math_ops.cast(
                 array_ops.slice(random_sequence, [0, 1],
-                                [batch_size, sequence_length])), 2)
+                                [batch_size, sequence_length]),
+                dtypes.float32),
+            2)
         return {'inputs': inputs}, labels
 
       return input_fn
@@ -719,11 +723,13 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
       def input_fn():
         random_sequence = random_ops.random_uniform(
             [batch_size, sequence_length], 0, 2, dtype=dtypes.int32, seed=seed)
-        inputs = array_ops.expand_dims(math_ops.to_float(random_sequence), 2)
-        labels = math_ops.to_int32(
+        inputs = array_ops.expand_dims(
+            math_ops.cast(random_sequence, dtypes.float32), 2)
+        labels = math_ops.cast(
             array_ops.squeeze(
                 math_ops.reduce_sum(inputs, axis=[1]) > (
-                    sequence_length / 2.0)))
+                    sequence_length / 2.0)),
+            dtypes.int32)
         return {'inputs': inputs}, labels
 
       return input_fn
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 8a461a0bd7ba457fcf830769f23c6ca2860a2732..153d4867961ae5115157bb5a246b5819387d91cf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -220,7 +220,7 @@ def _build_estimator_for_export_tests(tmpdir):
     hashtable = lookup.HashTable(
         lookup.TextFileStringTableInitializer(vocab_file_name), 'x')
     features['bogus_lookup'] = hashtable.lookup(
-        math_ops.to_int64(features['feature']))
+        math_ops.cast(features['feature'], dtypes.int64))
 
     return input_fn_utils.InputFnOps(features, labels, inputs)
 
@@ -1181,14 +1181,14 @@ class EstimatorTest(test.TestCase):
         ]
         self.assertItemsEqual([expected_vocab_file], assets)
         graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertTrue('linear/linear/feature/matmul' in graph_ops)
+        self.assertIn('input_example_tensor', graph_ops)
+        self.assertIn('ParseExample/ParseExample', graph_ops)
+        self.assertIn('linear/linear/feature/matmul', graph_ops)
         # Since there were no transforms, both save ops are still present.
-        self.assertTrue('save/SaveV2/tensor_names' in graph_ops)
-        self.assertTrue('save_1/SaveV2/tensor_names' in graph_ops)
+        self.assertIn('save/SaveV2/tensor_names', graph_ops)
+        self.assertIn('save_1/SaveV2/tensor_names', graph_ops)
         # Since there were no transforms, the hash table lookup is still there.
-        self.assertTrue('hash_table_Lookup' in graph_ops)
+        self.assertIn('hash_table_Lookup/LookupTableFindV2', graph_ops)
 
     # Restore, to validate that the export was well-formed.
     # tag_2, tag_3 was subjected to strip_unused_nodes.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index c1b97d8b49613ea49d9813954da3b7a63d3ba04c..2458652f8eb966d1ff5578ac5231249c572579aa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -567,7 +567,8 @@ def _mean_squared_loss(labels, logits, weights=None):
     if len(logits.get_shape()) == 1:
       logits = array_ops.expand_dims(logits, axis=1)
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
-    loss = math_ops.square(logits - math_ops.to_float(labels), name=name)
+    loss = math_ops.squared_difference(
+        logits, math_ops.cast(labels, dtypes.float32), name=name)
     return _compute_weighted_loss(loss, weights)
 
 
@@ -792,7 +793,7 @@ def _log_loss_with_two_classes(labels, logits, weights=None):
   with ops.name_scope(None, "log_loss_with_two_classes",
                       (logits, labels)) as name:
     logits = ops.convert_to_tensor(logits)
-    labels = math_ops.to_float(labels)
+    labels = math_ops.cast(labels, dtypes.float32)
     # TODO(ptucker): This will break for dynamic shapes.
     # sigmoid_cross_entropy_with_logits requires [batch_size, 1] labels.
     if len(labels.get_shape()) == 1:
@@ -1213,8 +1214,8 @@ def _sparse_labels_to_indicator(labels, num_classes):
     if num_classes < 2:
       raise ValueError("Must set num_classes >= 2 when passing labels as a "
                        "SparseTensor.")
-    return math_ops.to_int64(
-        sparse_ops.sparse_to_indicator(labels, num_classes))
+    return math_ops.cast(
+        sparse_ops.sparse_to_indicator(labels, num_classes), dtypes.int64)
   return labels
 
 
@@ -1399,8 +1400,9 @@ class _MultiLabelHead(_SingleHead):
               math_ops.sigmoid(
                   logits, name=prediction_key.PredictionKey.PROBABILITIES),
           prediction_key.PredictionKey.CLASSES:
-              math_ops.to_int64(
+              math_ops.cast(
                   math_ops.greater(logits, 0),
+                  dtypes.int64,
                   name=prediction_key.PredictionKey.CLASSES)
       }
 
@@ -1782,7 +1784,7 @@ def _weight_tensor(features, weight_column_name):
     raise ValueError("Weights {} missing from features.".format(
         weight_column_name))
   with ops.name_scope(None, "weight_tensor", tuple(six.itervalues(features))):
-    weight_tensor = math_ops.to_float(features[weight_column_name])
+    weight_tensor = math_ops.cast(features[weight_column_name], dtypes.float32)
     shape = weight_tensor.get_shape()
     rank = shape.ndims
     # We don't bother with expanding dims of non-staticly shaped tensors or
@@ -1832,7 +1834,7 @@ def _compute_weighted_loss(loss_unweighted, weight, name="loss"):
     weighted_loss_mean = math_ops.reduce_mean(weighted_loss, name=name_scope)
     weighted_loss_normalized = math_ops.div(
         math_ops.reduce_sum(weighted_loss),
-        math_ops.to_float(math_ops.reduce_sum(weight)),
+        math_ops.cast(math_ops.reduce_sum(weight), dtypes.float32),
         name="weighted_average_loss")
 
     return weighted_loss_mean, weighted_loss_normalized
@@ -1951,7 +1953,7 @@ def _sigmoid_cross_entropy_loss(labels, logits, weights=None):
                       (logits, labels)) as name:
     # sigmoid_cross_entropy_with_logits requires [batch_size, n_classes] labels.
     loss = nn.sigmoid_cross_entropy_with_logits(
-        labels=math_ops.to_float(labels), logits=logits, name=name)
+        labels=math_ops.cast(labels, dtypes.float32), logits=logits, name=name)
     return _compute_weighted_loss(loss, weights)
 
 
@@ -1959,11 +1961,11 @@ def _float_weights_or_none(weights):
   if weights is None:
     return None
   with ops.name_scope(None, "float_weights", (weights,)) as name:
-    return math_ops.to_float(weights, name=name)
+    return math_ops.cast(weights, dtypes.float32, name=name)
 
 
 def _indicator_labels_streaming_mean(labels, weights=None, class_id=None):
-  labels = math_ops.to_float(labels)
+  labels = math_ops.cast(labels, dtypes.float32)
   weights = _float_weights_or_none(weights)
   if weights is not None:
     weights = weights_broadcast_ops.broadcast_weights(weights, labels)
@@ -1977,7 +1979,7 @@ def _indicator_labels_streaming_mean(labels, weights=None, class_id=None):
 def _predictions_streaming_mean(predictions,
                                 weights=None,
                                 class_id=None):
-  predictions = math_ops.to_float(predictions)
+  predictions = math_ops.cast(predictions, dtypes.float32)
   weights = _float_weights_or_none(weights)
   if weights is not None:
     weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
@@ -2001,9 +2003,9 @@ def _class_predictions_streaming_mean(predictions, weights, class_id):
   return metrics_lib.mean(
       array_ops.where(
           math_ops.equal(
-              math_ops.to_int32(class_id), math_ops.to_int32(predictions)),
-          array_ops.ones_like(predictions),
-          array_ops.zeros_like(predictions)),
+              math_ops.cast(class_id, dtypes.int32),
+              math_ops.cast(predictions, dtypes.int32)),
+          array_ops.ones_like(predictions), array_ops.zeros_like(predictions)),
       weights=weights)
 
 
@@ -2011,15 +2013,16 @@ def _class_labels_streaming_mean(labels, weights, class_id):
   return metrics_lib.mean(
       array_ops.where(
           math_ops.equal(
-              math_ops.to_int32(class_id), math_ops.to_int32(labels)),
-          array_ops.ones_like(labels), array_ops.zeros_like(labels)),
+              math_ops.cast(class_id, dtypes.int32),
+              math_ops.cast(labels, dtypes.int32)), array_ops.ones_like(labels),
+          array_ops.zeros_like(labels)),
       weights=weights)
 
 
 def _streaming_auc(predictions, labels, weights=None, class_id=None,
                    curve="ROC"):
   # pylint: disable=missing-docstring
-  predictions = math_ops.to_float(predictions)
+  predictions = math_ops.cast(predictions, dtypes.float32)
   if labels.dtype.base_dtype != dtypes.bool:
     logging.warning("Casting %s labels to bool.", labels.dtype)
     labels = math_ops.cast(labels, dtypes.bool)
@@ -2046,8 +2049,8 @@ def _assert_class_id(class_id, num_classes=None):
 
 
 def _streaming_accuracy_at_threshold(predictions, labels, weights, threshold):
-  threshold_predictions = math_ops.to_float(
-      math_ops.greater_equal(predictions, threshold))
+  threshold_predictions = math_ops.cast(
+      math_ops.greater_equal(predictions, threshold), dtypes.float32)
   return metrics_lib.accuracy(labels, threshold_predictions, weights)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 7c2d9bb0767cb979dae9c84b5342d129225677ed..a52d25acf402bdda46771e9146a40cfb71e99d53 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -62,8 +62,8 @@ def _assert_no_variables(test_case):
 def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                     model_fn_ops):
   test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4)
-  for k in six.iterkeys(expected_eval_metrics):
-    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
+  for k in expected_eval_metrics:
+    test_case.assertIn(k, model_fn_ops.eval_metric_ops)
   variables.initialize_local_variables().run()
   for key, expected_value in six.iteritems(expected_eval_metrics):
     value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
@@ -545,19 +545,19 @@ class MultiLabelHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 0, 0], model_fn_ops.predictions["classes"].eval().tolist()[0])
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
 
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.731059, 0.5, 0.5],
              [0.5, 0.5, 0.731059,]],
@@ -850,18 +850,18 @@ class BinaryClassificationHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 1], list(model_fn_ops.predictions["classes"].eval()))
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.LOGISTIC_REGRESSION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         predicted_classes = predictions_for_serving["classes"].eval().tolist()
         self.assertListEqual(
             [b"0", b"1"], predicted_classes[0])
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
 
   def testBinaryClassificationInferMode_withWeightColumn(self):
     n_classes = 2
@@ -1349,18 +1349,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [0, 2],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
@@ -1401,18 +1401,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [b"key0", b"key2"],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"key0", b"key1", b"key2"], [b"key0", b"key1", b"key2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
index 3cbcc6e98de1c915c302617e4591c9baa33adeaf..8981432f7f27f4c773824efed48d2b3320c1b340 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@@ -31,6 +31,7 @@ from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import metric_key
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import math_ops
 
 
@@ -160,8 +161,9 @@ def _make_logistic_eval_metric_ops(labels, predictions, thresholds):
       labels=labels_tensor, predictions=predictions)
 
   for threshold in thresholds:
-    predictions_at_threshold = math_ops.to_float(
+    predictions_at_threshold = math_ops.cast(
         math_ops.greater_equal(predictions, threshold),
+        dtypes.float32,
         name='predictions_at_threshold_%f' % threshold)
     metrics[metric_key.MetricKey.ACCURACY_MEAN % threshold] = (
         metrics_lib.streaming_accuracy(labels=labels_tensor,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index dcb161180c99ce71195c820217e8bdaf79d70901..96adc8b83b5bec912460dbb54899ce5f168b8f25 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -219,7 +219,7 @@ class ModelFnOps(
         used if a Servo request does not explicitly mention which head to infer
         on. Pass the key of the output alternative here that you want to
         designate as default. A separate ExportOutpout for this default head
-        wil be added to the export_outputs dict with the special key
+        will be added to the export_outputs dict with the special key
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, unless there is
         already an enry in output_alternatives with this special key.
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
index 06c61554fa2fa9b563652e7555fbe436ee102638..0689be88c5efa3f85ad981bb630f2f897e8663ff 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
@@ -396,8 +396,9 @@ class StateSavingRnnEstimatorTest(test.TestCase):
         random_sequence = random_ops.random_uniform(
             [sequence_length + 1], 0, 2, dtype=dtypes.int32, seed=seed)
         labels = array_ops.slice(random_sequence, [0], [sequence_length])
-        inputs = math_ops.to_float(
-            array_ops.slice(random_sequence, [1], [sequence_length]))
+        inputs = math_ops.cast(
+            array_ops.slice(random_sequence, [1], [sequence_length]),
+            dtypes.float32)
         features = {'inputs': inputs}
 
         if mode == model_fn_lib.ModeKeys.INFER:
@@ -450,8 +451,9 @@ class LegacyConstructorTest(test.TestCase):
       random_sequence = random_ops.random_uniform(
           [sequence_length + 1], 0, 2, dtype=dtypes.int32, seed=seed)
       labels = array_ops.slice(random_sequence, [0], [sequence_length])
-      inputs = math_ops.to_float(
-          array_ops.slice(random_sequence, [1], [sequence_length]))
+      inputs = math_ops.cast(
+          array_ops.slice(random_sequence, [1], [sequence_length]),
+          dtypes.float32)
       return {'inputs': inputs}, labels
     return input_fn
 
@@ -537,8 +539,9 @@ class StateSavingRNNEstimatorLearningTest(test.TestCase):
         random_sequence = random_ops.random_uniform(
             [sequence_length + 1], 0, 2, dtype=dtypes.int32, seed=seed)
         labels = array_ops.slice(random_sequence, [0], [sequence_length])
-        inputs = math_ops.to_float(
-            array_ops.slice(random_sequence, [1], [sequence_length]))
+        inputs = math_ops.cast(
+            array_ops.slice(random_sequence, [1], [sequence_length]),
+            dtypes.float32)
         return {'inputs': inputs}, labels
 
       return input_fn
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index c056a12fa5307a7e9ac4cf30e1386ddfd5cd7d75..950840c6b77d771a5a051870d7986b00de3e2902 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -624,7 +624,7 @@ class SdcaModel(object):
           # Note that we need double precision to get accurate results.
           with ops.control_dependencies(shard_sums):
             shard_sums.append(
-                math_ops.reduce_sum(math_ops.to_double(values), 0))
+                math_ops.reduce_sum(math_ops.cast(values, dtypes.float64), 0))
       summed_values = math_ops.add_n(shard_sums)
 
       primal_loss = summed_values[1]
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index a28394964a12013c43d85701b5a0ab5c559afd62..8fda828e994bc2436eaba4475077020436703631 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -36,7 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
 
-# TODO(rohanj): This should subclass Checkpointable and implement
+# TODO(rohanj): This should subclass Trackable and implement
 # _gather_saveables_for_checkpoint.
 class ShardedMutableDenseHashTable(object):
   """A sharded version of MutableDenseHashTable.
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 8ebe45d8510f4b78cded997916dd9d6b96d22579..58ab3aec6648ca61db996be458178e90d6c47353 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -135,7 +135,7 @@ class SDCAOptimizer(object):
           array_ops.reshape(
               array_ops.split(
                   value=sparse_indices, num_or_size_splits=2, axis=1)[1], [-1]),
-          array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
+          array_ops.reshape(math_ops.cast(sparse_values, dtypes.float32), [-1]))
 
     def _training_examples_and_variables():
       """Returns dictionaries for training examples and variables."""
@@ -254,8 +254,8 @@ class SDCAOptimizer(object):
       examples = dict(
           sparse_features=sparse_feature_with_values,
           dense_features=dense_features,
-          example_labels=math_ops.to_float(
-              array_ops.reshape(targets, shape=[-1])),
+          example_labels=math_ops.cast(
+              array_ops.reshape(targets, shape=[-1]), dtypes.float32),
           example_weights=example_weights,
           example_ids=example_ids)
       sdca_variables = dict(
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
deleted file mode 100644
index 893ddd78231c8a0d819cbe5776e6873bdab57355..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/python/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-licenses(["notice"])
-
-# DO NOT USE THIS TARGET. TensorFlow Lite has moved to tensorflow/lite.
-py_library(
-    name = "lite",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/lite/python:lite",
-    ],
-)
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 229a72a780d5ccce8263444ffeae7700f6ac8613..20e86e56bbe911eca2bba661aff7165e53fa159e 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,11 +26,11 @@ from tensorflow.python.ops import lookup_ops
 # pylint: disable=unused-import
 from tensorflow.python.ops.lookup_ops import FastHashSpec
 from tensorflow.python.ops.lookup_ops import HasherSpec
-from tensorflow.python.ops.lookup_ops import HashTable
 from tensorflow.python.ops.lookup_ops import IdTableWithHashBuckets
 from tensorflow.python.ops.lookup_ops import index_table_from_file
 from tensorflow.python.ops.lookup_ops import index_to_string_table_from_file
 from tensorflow.python.ops.lookup_ops import InitializableLookupTableBase
+from tensorflow.python.ops.lookup_ops import InitializableLookupTableBaseV1
 from tensorflow.python.ops.lookup_ops import KeyValueTensorInitializer
 from tensorflow.python.ops.lookup_ops import LookupInterface
 from tensorflow.python.ops.lookup_ops import StrongHashSpec
@@ -42,7 +40,6 @@ from tensorflow.python.ops.lookup_ops import TextFileIndex
 from tensorflow.python.ops.lookup_ops import TextFileInitializer
 from tensorflow.python.ops.lookup_ops import TextFileStringTableInitializer
 # pylint: enable=unused-import
-from tensorflow.python.training.saver import BaseSaverBuilder
 from tensorflow.python.util.deprecation import deprecated
 
 
@@ -288,353 +285,52 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   return table.lookup(tensor)
 
 
-class MutableHashTable(LookupInterface):
-  """A generic mutable hash table implementation.
-
-  Data can be inserted by calling the insert method and removed by calling the
-  remove method. It does not support initialization via the init method.
+class HashTable(InitializableLookupTableBaseV1):
+  """A generic hash table implementation.
 
   Example usage:
 
   ```python
-  table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
-                                             value_dtype=tf.int64,
-                                             default_value=-1)
-  sess.run(table.insert(keys, values))
-  out = table.lookup(query_keys)
+  table = tf.HashTable(
+      tf.KeyValueTensorInitializer(keys, values), -1)
+  out = table.lookup(input_tensor)
+  table.init.run()
   print(out.eval())
   ```
   """
 
-  def __init__(self,
-               key_dtype,
-               value_dtype,
-               default_value,
-               shared_name=None,
-               name="MutableHashTable",
-               checkpoint=True):
-    """Creates an empty `MutableHashTable` object.
+  def __init__(self, initializer, default_value, shared_name=None, name=None):
+    """Creates a non-initialized `HashTable` object.
 
-    Creates a table, the type of its keys and values are specified by key_dtype
-    and value_dtype, respectively.
+    Creates a table, the type of its keys and values are specified by the
+    initializer.
+    Before using the table you will have to initialize it. After initialization
+    the table will be immutable.
 
     Args:
-      key_dtype: the type of the key tensors.
-      value_dtype: the type of the value tensors.
+      initializer: The table initializer to use. See `HashTable` kernel for
+        supported key and value types.
       default_value: The value to use if a key is missing in the table.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
+      shared_name: If non-empty, this table will be shared under the given name
+        across multiple sessions.
       name: A name for the operation (optional).
-      checkpoint: if True, the contents of the table are saved to and restored
-        from checkpoints. If `shared_name` is empty for a checkpointed table, it
-        is shared using the table node name.
 
     Returns:
-      A `MutableHashTable` object.
-
-    Raises:
-      ValueError: If checkpoint is True and no name was specified.
+      A `HashTable` object.
     """
-    self._default_value = ops.convert_to_tensor(default_value,
-                                                dtype=value_dtype)
-    self._value_shape = self._default_value.get_shape()
-    self._checkpoint = checkpoint
-    self._key_dtype = key_dtype
-    self._value_dtype = value_dtype
-    self._name = name
-
-    if context.executing_eagerly() and shared_name is None:
-      # TODO(allenl): This will leak memory due to kernel caching by the
-      # shared_name attribute value (but is better than the alternative of
-      # sharing everything by default when executing eagerly; hopefully creating
-      # tables in a loop is uncommon).
-      shared_name = "table_%d" % (ops.uid(),)
+    self._initializer = initializer
+    self._default_value = default_value
     self._shared_name = shared_name
-    super(MutableHashTable, self).__init__(key_dtype, value_dtype)
-
-    self._resource_handle = self.create_resource()
-    if checkpoint:
-      saveable = MutableHashTable._Saveable(self, name)
-      if not context.executing_eagerly():
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-
-  def create_resource(self):
-    # The table must be shared if checkpointing is requested for multi-worker
-    # training to work correctly. Use the node name if no shared_name has been
-    # explicitly specified.
-    use_node_name_sharing = self._checkpoint and self._shared_name is None
-    if self._default_value.get_shape().ndims == 0:
-      table_ref = gen_lookup_ops.mutable_hash_table_v2(
-          shared_name=self._shared_name,
-          use_node_name_sharing=use_node_name_sharing,
-          key_dtype=self._key_dtype,
-          value_dtype=self._value_dtype,
-          name=self._name)
-    else:
-      table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
-          shared_name=self._shared_name,
-          use_node_name_sharing=use_node_name_sharing,
-          key_dtype=self._key_dtype,
-          value_dtype=self._value_dtype,
-          value_shape=self._default_value.get_shape(),
-          name=self._name)
-
-    if context.executing_eagerly():
-      self._table_name = None
-    else:
-      self._table_name = table_ref.op.name.split("/")[-1]
-    return table_ref
-
-  @property
-  def name(self):
-    return self._table_name
-
-  def size(self, name=None):
-    """Compute the number of elements in this table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A scalar tensor containing the number of elements in this table.
-    """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        return gen_lookup_ops.lookup_table_size_v2(
-            self.resource_handle, name=name)
-
-  def remove(self, keys, name=None):
-    """Removes `keys` and its associated values from the table.
-
-    If a key is not present in the table, it is silently ignored.
-
-    Args:
-      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
-        key type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    if keys.dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
-    with ops.name_scope(
-        name, "%s_lookup_table_remove" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      # pylint: disable=protected-access
-      op = gen_lookup_ops.lookup_table_remove_v2(
-          self.resource_handle, keys, name=name)
-
-    return op
-
-  def lookup(self, keys, name=None):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    The `default_value` is used for keys not present in the table.
-
-    Args:
-      keys: Keys to look up. Can be a tensor of any shape. Must match the
-        table's key_dtype.
-      name: A name for the operation (optional).
-
-    Returns:
-      A tensor containing the values in the same shape as `keys` using the
-        table's value type.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    with ops.name_scope(
-        name, "%s_lookup_table_find" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self.resource_handle):
-        values = gen_lookup_ops.lookup_table_find_v2(
-            self.resource_handle, keys, self._default_value, name=name)
-    return values
-
-  def insert(self, keys, values, name=None):
-    """Associates `keys` with `values`.
-
-    Args:
-      keys: Keys to insert. Can be a tensor of any shape. Must match the
-        table's key type.
-      values: Values to be associated with keys. Must be a tensor of the same
-        shape as `keys` and match the table's value type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` or `values` doesn't match the table data
-        types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
-                        [self.resource_handle, keys, values]) as name:
-      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
-      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
-      with ops.colocate_with(self.resource_handle):
-        # pylint: disable=protected-access
-        op = gen_lookup_ops.lookup_table_insert_v2(
-            self.resource_handle, keys, values, name=name)
-    return op
-
-  def export(self, name=None):
-    """Returns tensors of all keys and values in the table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A pair of tensors with the first tensor containing all keys and the
-        second tensors containing all values in the table.
-    """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
-    return exported_keys, exported_values
-
-  def _gather_saveables_for_checkpoint(self):
-    """For object-based checkpointing."""
-    return {"table": functools.partial(MutableHashTable._Saveable, table=self)}
-
-  class _Saveable(BaseSaverBuilder.SaveableObject):
-    """SaveableObject implementation for MutableHashTable."""
-
-    def __init__(self, table, name):
-      tensors = table.export()
-      specs = [
-          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
-          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
-      ]
-      # pylint: disable=protected-access
-      super(MutableHashTable._Saveable, self).__init__(table, specs, name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      del restored_shapes  # unused
-      # pylint: disable=protected-access
-      with ops.colocate_with(self.op.resource_handle):
-        return gen_lookup_ops.lookup_table_import_v2(
-            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
-
-
-class MutableDenseHashTable(LookupInterface):
-  """A generic mutable hash table implementation using tensors as backing store.
-
-  Data can be inserted by calling the insert method and removed by calling the
-  remove method. It does not support initialization via the init method.
-
-  It uses "open addressing" with quadratic reprobing to resolve collisions.
-  Compared to `MutableHashTable` the insert, remove and lookup operations in a
-  `MutableDenseHashTable` are typically faster, but memory usage can be higher.
-  However, `MutableDenseHashTable` does not require additional memory for
-  temporary tensors created during checkpointing and restore operations.
-
-  Example usage:
-
-  ```python
-  table = tf.contrib.lookup.MutableDenseHashTable(key_dtype=tf.int64,
-                                                  value_dtype=tf.int64,
-                                                  default_value=-1,
-                                                  empty_key=0,
-                                                  deleted_key=-1)
-
-  sess.run(table.insert(keys, values))
-  out = table.lookup(query_keys)
-  print(out.eval())
-  ```
-  """
-
-  # TODO(andreasst): consider extracting common code with MutableHashTable into
-  # a common superclass.
-  def __init__(self,
-               key_dtype,
-               value_dtype,
-               default_value,
-               empty_key,
-               deleted_key,
-               initial_num_buckets=None,
-               shared_name=None,
-               name="MutableDenseHashTable",
-               checkpoint=True):
-    """Creates an empty `MutableDenseHashTable` object.
-
-    Creates a table, the type of its keys and values are specified by key_dtype
-    and value_dtype, respectively.
-
-    Args:
-      key_dtype: the type of the key tensors.
-      value_dtype: the type of the value tensors.
-      default_value: The value to use if a key is missing in the table.
-      empty_key: the key to use to represent empty buckets internally. Must not
-        be used in insert, remove or lookup operations.
-      initial_num_buckets: the initial number of buckets.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
-      name: A name for the operation (optional).
-      checkpoint: if True, the contents of the table are saved to and restored
-        from checkpoints. If `shared_name` is empty for a checkpointed table, it
-        is shared using the table node name.
-      deleted_key: the key to use to represent deleted buckets internally. Must
-        not be used in insert, remove or lookup operations and be different from
-        the empty_key.
-
-    Returns:
-      A `MutableDenseHashTable` object.
-
-    Raises:
-      ValueError: If checkpoint is True and no name was specified.
-    """
-    self._default_value = ops.convert_to_tensor(
-        default_value, dtype=value_dtype, name="default_value")
-    self._key_dtype = key_dtype
-    self._value_dtype = value_dtype
-    self._initial_num_buckets = initial_num_buckets
+    self._name = name or "hash_table"
+    self._table_name = None
+    super(HashTable, self).__init__(default_value, initializer)
     self._value_shape = self._default_value.get_shape()
-    self._checkpoint = checkpoint
-    self._name = name
-
-    self._empty_key = ops.convert_to_tensor(
-        empty_key, dtype=key_dtype, name="empty_key")
-    self._deleted_key = ops.convert_to_tensor(
-        deleted_key, dtype=key_dtype, name="deleted_key")
-    if context.executing_eagerly() and shared_name is None:
-      # TODO(allenl): This will leak memory due to kernel caching by the
-      # shared_name attribute value (but is better than the alternative of
-      # sharing everything by default when executing eagerly; hopefully creating
-      # tables in a loop is uncommon).
-      shared_name = "table_%d" % (ops.uid(),)
-    self._shared_name = shared_name
-    super(MutableDenseHashTable, self).__init__(key_dtype, value_dtype)
-
-    self._resource_handle = self.create_resource()
-    if checkpoint:
-      saveable = MutableDenseHashTable._Saveable(self, name)
-      if not context.executing_eagerly():
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-
-  def create_resource(self):
-    # The table must be shared if checkpointing is requested for multi-worker
-    # training to work correctly. Use the node name if no shared_name has been
-    # explicitly specified.
-    use_node_name_sharing = self._checkpoint and self._shared_name is None
-    table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
-        empty_key=self._empty_key,
-        deleted_key=self._deleted_key,
+
+  def _create_resource(self):
+    table_ref = gen_lookup_ops.hash_table_v2(
         shared_name=self._shared_name,
-        use_node_name_sharing=use_node_name_sharing,
-        value_dtype=self._value_dtype,
-        value_shape=self._value_shape,
-        initial_num_buckets=self._initial_num_buckets,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
         name=self._name)
     if context.executing_eagerly():
       self._table_name = None
@@ -642,107 +338,14 @@ class MutableDenseHashTable(LookupInterface):
       self._table_name = table_ref.op.name.split("/")[-1]
     return table_ref
 
+  @property
+  def init(self):
+    return self.initializer
+
   @property
   def name(self):
     return self._table_name
 
-  def size(self, name=None):
-    """Compute the number of elements in this table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A scalar tensor containing the number of elements in this table.
-    """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        return gen_lookup_ops.lookup_table_size_v2(
-            self.resource_handle, name=name)
-
-  def lookup(self, keys, name=None):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    The `default_value` is used for keys not present in the table.
-
-    Args:
-      keys: Keys to look up. Can be a tensor of any shape. Must match the
-        table's key_dtype.
-      name: A name for the operation (optional).
-
-    Returns:
-      A tensor containing the values in the same shape as `keys` using the
-        table's value type.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
-                        [self.resource_handle, keys]) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self.resource_handle):
-        values = gen_lookup_ops.lookup_table_find_v2(
-            self.resource_handle, keys, self._default_value, name=name)
-
-    return values
-
-  def insert(self, keys, values, name=None):
-    """Associates `keys` with `values`.
-
-    Args:
-      keys: Keys to insert. Can be a tensor of any shape. Must match the
-        table's key type.
-      values: Values to be associated with keys. Must be a tensor of the same
-        shape as `keys` and match the table's value type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` or `values` doesn't match the table data
-        types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
-                        [self.resource_handle, keys, values]) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      values = ops.convert_to_tensor(
-          values, dtype=self._value_dtype, name="values")
-      with ops.colocate_with(self.resource_handle):
-        op = gen_lookup_ops.lookup_table_insert_v2(
-            self.resource_handle, keys, values, name=name)
-      return op
-
-  def remove(self, keys, name=None):
-    """Removes `keys` and its associated values from the table.
-
-    If a key is not present in the table, it is silently ignored.
-
-    Args:
-      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
-        key type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    if keys.dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
-    with ops.name_scope(
-        name, "%s_lookup_table_remove" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      # pylint: disable=protected-access
-      op = gen_lookup_ops.lookup_table_remove_v2(
-          self.resource_handle, keys, name=name)
-
-    return op
-
   def export(self, name=None):
     """Returns tensors of all keys and values in the table.
 
@@ -753,34 +356,15 @@ class MutableDenseHashTable(LookupInterface):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+    with ops.name_scope(name, "%s_Export" % self.name,
                         [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype, name=name)
 
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
     return exported_keys, exported_values
 
-  def _gather_saveables_for_checkpoint(self):
-    """For object-based checkpointing."""
-    return {"table": functools.partial(
-        MutableDenseHashTable._Saveable, table=self)}
-
-  class _Saveable(BaseSaverBuilder.SaveableObject):
-    """SaveableObject implementation for MutableDenseHashTable."""
-
-    def __init__(self, table, name):
-      tensors = table.export()
-      specs = [
-          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
-          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
-      ]
-      # pylint: disable=protected-access
-      super(MutableDenseHashTable._Saveable, self).__init__(table, specs, name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      del restored_shapes  # unused
-      # pylint: disable=protected-access
-      with ops.colocate_with(self.op.resource_handle):
-        return gen_lookup_ops.lookup_table_import_v2(
-            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+
+MutableHashTable = lookup_ops.MutableHashTable
+MutableDenseHashTable = lookup_ops.DenseHashTable
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 9b2c2dd87cc8a92fbb6b45504939be3788b60839..9fe8dafcc8edd6b80625c61a4a0e783e65b44720 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -18,14 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 import numpy as np
-import six
 
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import counter
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -37,9 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
-from tensorflow.python.training.checkpointable import util as checkpointable
 
 
 class HashTableOpTest(test.TestCase):
@@ -299,1240 +293,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
 
 
-class MutableHashTableOpTest(test.TestCase):
-
-  def testMutableHashTable(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["tarkus", "tank"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list())
-      self.assertAllEqual([None], exported_values.get_shape().as_list())
-
-      # exported data is in the order of the internal map, i.e. undefined
-      sorted_keys = np.sort(exported_keys.eval())
-      sorted_values = np.sort(exported_values.eval())
-      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
-      self.assertAllEqual([0, 1, 2], sorted_values)
-
-  def testSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      v0 = variables.Variable(10.0, name="v0")
-      v1 = variables.Variable(20.0, name="v1")
-
-      default_val = -1
-      keys = constant_op.constant(["b", "c", "d"], dtypes.string)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(
-          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-
-      save = saver.Saver()
-      variables.global_variables_initializer().run()
-
-      # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      v0 = variables.Variable(-1.0, name="v0")
-      v1 = variables.Variable(-1.0, name="v1")
-      default_val = -1
-      table = lookup.MutableHashTable(
-          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-      table.insert(
-          constant_op.constant(["a", "c"], dtypes.string),
-          constant_op.constant([12, 24], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-      # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["a", "b", "c", "d", "e"],
-                                          dtypes.string)
-      output = table.lookup(input_string)
-      self.assertAllEqual([-1, 0, 1, 2, -1], output.eval())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testObjectSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
-
-    default_val = -1
-    keys = constant_op.constant(["b", "c", "d"], dtypes.string)
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup.MutableHashTable(
-        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
-    self.evaluate([v0.initializer, v1.initializer])
-
-    # Check that the parameter nodes have been initialized.
-    self.assertEqual(10.0, self.evaluate(v0))
-    self.assertEqual(20.0, self.evaluate(v1))
-
-    self.assertAllEqual(0, self.evaluate(table.size()))
-    self.evaluate(table.insert(keys, values))
-    self.assertAllEqual(3, self.evaluate(table.size()))
-
-    save_path = checkpoint.save(save_prefix)
-    del table, checkpoint, v0, v1
-
-    v0 = variables.Variable(-1.0, name="v0")
-    v1 = variables.Variable(-1.0, name="v1")
-    default_val = -1
-    table = lookup.MutableHashTable(
-        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-    self.evaluate(table.insert(
-        constant_op.constant(["a", "c"], dtypes.string),
-        constant_op.constant([12, 24], dtypes.int64)))
-    self.assertAllEqual(2, self.evaluate(table.size()))
-
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
-
-    # Restore the saved values in the parameter nodes.
-    checkpoint.restore(save_path).run_restore_ops()
-    # Check that the parameter nodes have been restored.
-    self.assertEqual(10.0, self.evaluate(v0))
-    self.assertEqual(20.0, self.evaluate(v1))
-
-    self.assertAllEqual(3, self.evaluate(table.size()))
-
-    input_string = constant_op.constant(["a", "b", "c", "d", "e"],
-                                        dtypes.string)
-    output = table.lookup(input_string)
-    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
-
-  def testSharing(self):
-    # Start a server to store the table state
-    server = server_lib.Server(
-        {
-            "local0": ["localhost:0"]
-        }, protocol="grpc", start=True)
-    # Create two sessions sharing the same state
-    session1 = session.Session(server.target)
-    session2 = session.Session(server.target)
-
-    table = lookup.MutableHashTable(
-        dtypes.int64, dtypes.string, "-", name="t1")
-
-    # Populate the table in the first session
-    with session1:
-      self.assertAllEqual(0, table.size().eval())
-
-      keys = constant_op.constant([11, 12], dtypes.int64)
-      values = constant_op.constant(["a", "b"])
-      table.insert(keys, values).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      output = table.lookup(constant_op.constant([11, 12, 13], dtypes.int64))
-      self.assertAllEqual([b"a", b"b", b"-"], output.eval())
-
-    # Verify that we can access the shared data from the second session
-    with session2:
-      self.assertAllEqual(2, table.size().eval())
-
-      output = table.lookup(constant_op.constant([10, 11, 12], dtypes.int64))
-      self.assertAllEqual([b"-", b"a", b"b"], output.eval())
-
-  def testMutableHashTableOfTensors(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["tarkus", "tank"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-      self.assertAllEqual([3, 2], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result)
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list(),
-                          msg="Saw shape %s" % exported_keys.shape)
-      self.assertAllEqual([None, 2], exported_values.get_shape().as_list(),
-                          msg="Saw shape %s" % exported_values.shape)
-      # exported data is in the order of the internal map, i.e. undefined
-      sorted_keys = np.sort(exported_keys.eval())
-      sorted_values = np.sort(exported_values.eval())
-      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
-      self.assertAllEqual([[4, 5], [2, 3], [0, 1]], sorted_values)
-
-  def testMutableHashTableExportInsert(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
-      table1 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      self.assertAllEqual(0, table1.size().eval())
-      table1.insert(keys, values).run()
-      self.assertAllEqual(3, table1.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      expected_output = [[0, 1], [2, 3], [-1, -1]]
-      output1 = table1.lookup(input_string)
-      self.assertAllEqual(expected_output, output1.eval())
-
-      exported_keys, exported_values = table1.export()
-      self.assertAllEqual(3, exported_keys.eval().size)
-      self.assertAllEqual(6, exported_values.eval().size)
-
-      # Populate a second table from the exported data
-      table2 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      self.assertAllEqual(0, table2.size().eval())
-      table2.insert(exported_keys, exported_values).run()
-      self.assertAllEqual(3, table2.size().eval())
-
-      # Verify lookup result is still the same
-      output2 = table2.lookup(input_string)
-      self.assertAllEqual(expected_output, output2.eval())
-
-  def testMutableHashTableOfTensorsInvalidShape(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      # Shape [6] instead of [3, 2]
-      values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [2,3] instead of [3, 2]
-      values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [2, 2] instead of [3, 2]
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [3, 1] instead of [3, 2]
-      values = constant_op.constant([[0], [2], [4]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Valid Insert
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-  def testMutableHashTableInvalidDefaultValue(self):
-    with self.cached_session():
-      default_val = constant_op.constant([[-1, -1]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      with self.assertRaisesOpError("Default value must be a vector"):
-        self.assertAllEqual(0, table.size().eval())
-
-  def testMutableHashTableDuplicateInsert(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery", "brain"])
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([3, 1, -1], result)
-
-  def testMutableHashTableFindHighRank(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([[0, 1], [-1, -1]], result)
-
-  def testMutableHashTableInsertHighRank(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, 3, -1], result)
-
-  def testMutableHashTableRemoveHighRank(self):
-    with self.test_session():
-      default_val = -1
-      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["salad", "tarkus"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, -1, 3, -1], result)
-
-  def testMutableHashTableOfTensorsFindHighRank(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2, 3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)
-
-  def testMutableHashTableOfTensorsRemoveHighRank(self):
-    with self.test_session():
-      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      remove_string = constant_op.constant([["brain", "tank"]])
-      table.remove(remove_string).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      input_string = constant_op.constant([["brain", "salad"],
-                                           ["surgery", "tank"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2, 3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result)
-
-  def testMultipleMutableHashTables(self):
-    with self.cached_session() as sess:
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-
-      table1 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table2 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table3 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table1.insert(keys, values).run()
-      table2.insert(keys, values).run()
-      table3.insert(keys, values).run()
-
-      self.assertAllEqual(3, table1.size().eval())
-      self.assertAllEqual(3, table2.size().eval())
-      self.assertAllEqual(3, table3.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output1 = table1.lookup(input_string)
-      output2 = table2.lookup(input_string)
-      output3 = table3.lookup(input_string)
-
-      out1, out2, out3 = sess.run([output1, output2, output3])
-      self.assertAllEqual([0, 1, -1], out1)
-      self.assertAllEqual([0, 1, -1], out2)
-      self.assertAllEqual([0, 1, -1], out3)
-
-  def testMutableHashTableWithTensorDefault(self):
-    with self.cached_session():
-      default_val = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testSignatureMismatch(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      # insert with keys of the wrong type
-      with self.assertRaises(ValueError):
-        table.insert(constant_op.constant([4, 5, 6]), values).run()
-
-      # insert with values of the wrong type
-      with self.assertRaises(ValueError):
-        table.insert(keys, constant_op.constant(["a", "b", "c"])).run()
-
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string_ref = variables.Variable("brain")
-      input_int64_ref = variables.Variable(-1, dtype=dtypes.int64)
-      variables.global_variables_initializer().run()
-
-      # Ref types do not produce an insert signature mismatch.
-      table.insert(input_string_ref, input_int64_ref).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      # Ref types do not produce a lookup signature mismatch.
-      self.assertEqual(-1, table.lookup(input_string_ref).eval())
-
-      # lookup with keys of the wrong type
-      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
-      with self.assertRaises(ValueError):
-        table.lookup(input_string).eval()
-
-      # default value of the wrong type
-      with self.assertRaises(TypeError):
-        lookup.MutableHashTable(dtypes.string, dtypes.int64, "UNK")
-
-  def testMutableHashTableStringFloat(self):
-    with self.cached_session():
-      default_val = -1.5
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1.1, 2.2], dtypes.float32)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.float32,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllClose([0, 1.1, default_val], result)
-
-  def testMutableHashTableIntFloat(self):
-    with self.cached_session():
-      default_val = -1.0
-      keys = constant_op.constant([3, 7, 0], dtypes.int64)
-      values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32)
-      table = lookup.MutableHashTable(dtypes.int64, dtypes.float32,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([7, 0, 11], dtypes.int64)
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllClose([-1.2, 9.9, default_val], result)
-
-  def testMutableHashTableInt64String(self):
-    with self.cached_session():
-      default_val = "n/a"
-      keys = constant_op.constant([0, 1, 2], dtypes.int64)
-      values = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup.MutableHashTable(dtypes.int64, dtypes.string,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([0, 1, 3], dtypes.int64)
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual((b"brain", b"salad", b"n/a"), result)
-
-
-class MutableDenseHashTableOpTest(test.TestCase):
-
-  def testBasic(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, -1, -1], result)
-
-  def testBasicBool(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([True, True, True, True], dtypes.bool)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.bool,
-          default_value=False,
-          empty_key=0,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant([11, 15], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([False, True, False], result)
-
-  def testSameEmptyAndDeletedKey(self):
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=42)
-        self.assertAllEqual(0, table.size().eval())
-
-  def testLookupUnknownShape(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      placeholder_keys = array_ops.placeholder(dtypes.int64)
-      output = table.lookup(placeholder_keys)
-      self.assertAllEqual(None, output.get_shape())
-      result = output.eval({placeholder_keys: [11, 12, 15]})
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testMapStringToFloat(self):
-    with self.cached_session():
-
-      keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string)
-      values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32)
-      default_value = constant_op.constant(-1.5, dtypes.float32)
-      table = lookup.MutableDenseHashTable(
-          dtypes.string,
-          dtypes.float32,
-          default_value=default_value,
-          empty_key="",
-          deleted_key="$")
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["b", "e"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4], output.get_shape())
-
-      result = output.eval()
-      self.assertAllClose([0, -1.5, 3.3, -1.5], result)
-
-  def testMapInt64ToFloat(self):
-    for float_dtype in [dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-
-        keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-        values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype)
-        default_value = constant_op.constant(-1.5, float_dtype)
-        table = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            float_dtype,
-            default_value=default_value,
-            empty_key=0,
-            deleted_key=-1)
-        self.assertAllEqual(0, table.size().eval())
-
-        table.insert(keys, values).run()
-        self.assertAllEqual(4, table.size().eval())
-
-        remove_string = constant_op.constant([12, 15], dtypes.int64)
-        table.remove(remove_string).run()
-        self.assertAllEqual(3, table.size().eval())
-
-        input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
-        output = table.lookup(input_string)
-        self.assertAllEqual([4], output.get_shape())
-
-        result = output.eval()
-        self.assertAllClose([0, -1.5, 3.3, -1.5], result)
-
-  def testVectorValues(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]],
-                                    dtypes.int64)
-      default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=4)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      table.insert(
-          constant_op.constant([14], dtypes.int64),
-          constant_op.constant([[2, 3, 4, 5]], dtypes.int64)).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      remove_string = constant_op.constant([12, 16], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4, 4],
-                          output.shape,
-                          msg="Saw shape: %s" % output.shape)
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]],
-          result)
-
-  def testVectorKeys(self):
-    with self.cached_session():
-      keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64)
-      values = constant_op.constant([10, 11, 12], dtypes.int64)
-      empty_key = constant_op.constant([0, 3], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      table.insert(
-          constant_op.constant([[0, 0]], dtypes.int64),
-          constant_op.constant([13], dtypes.int64)).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]],
-                                          dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([10, -1, 12, -1], result)
-
-  def testResize(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=4)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([12, 99], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
-      values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
-
-      table.insert(keys3, values3).run()
-      self.assertAllEqual(6, table.size().eval())
-      self.assertAllEqual(16, len(table.export()[0].eval()))
-
-      keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
-                                   dtypes.int64)
-      output = table.lookup(keys4)
-      self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], output.eval())
-
-  def testExport(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([1, 2, 3, 4], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=100,
-          deleted_key=200,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      keys2 = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list())
-      self.assertAllEqual([None], exported_values.get_shape().as_list())
-
-      np_keys = exported_keys.eval()
-      np_values = exported_values.eval()
-
-      self.assertAllEqual(8, len(np_keys))
-      self.assertAllEqual(8, len(np_values))
-
-      # pair up keys and values, drop extra added dimension
-      pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0]
-      # sort by key
-      pairs = pairs[pairs[:, 0].argsort()]
-      self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0],
-                           [100, 0], [100, 0], [200, 2]], pairs)
-
-  def testSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      default_value = -1
-      empty_key = 0
-      deleted_key = -1
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([11, 14], dtypes.int64),
-          constant_op.constant([12, 24], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([-1, 0, -1, 2, 3], output.eval())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testObjectSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    default_value = -1
-    empty_key = 0
-    deleted_key = -1
-    keys = constant_op.constant([11, 12, 13], dtypes.int64)
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    save_table = lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.int64,
-        default_value=default_value,
-        empty_key=empty_key,
-        deleted_key=deleted_key,
-        name="t1",
-        checkpoint=True,
-        initial_num_buckets=32)
-
-    save_checkpoint = checkpointable.Checkpoint(table=save_table)
-
-    self.assertAllEqual(0, self.evaluate(save_table.size()))
-    self.evaluate(save_table.insert(keys, values))
-    self.assertAllEqual(3, self.evaluate(save_table.size()))
-    self.assertAllEqual(32, len(self.evaluate(save_table.export()[0])))
-
-    save_path = save_checkpoint.save(save_prefix)
-    del save_table, save_checkpoint
-
-    load_table = lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.int64,
-        default_value=default_value,
-        empty_key=empty_key,
-        deleted_key=deleted_key,
-        name="t1",
-        checkpoint=True,
-        initial_num_buckets=64)
-    self.evaluate(load_table.insert(
-        constant_op.constant([11, 14], dtypes.int64),
-        constant_op.constant([12, 24], dtypes.int64)))
-    self.assertAllEqual(2, self.evaluate(load_table.size()))
-    self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
-
-    restore_checkpoint = checkpointable.Checkpoint(table=load_table)
-
-    # Restore the saved values in the parameter nodes.
-    restore_checkpoint.restore(save_path).run_restore_ops()
-
-    self.assertAllEqual(3, self.evaluate(load_table.size()))
-    self.assertAllEqual(32, len(self.evaluate(load_table.export()[0])))
-
-    input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
-    output = load_table.lookup(input_string)
-    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
-
-  def testVectorSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "vector_save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
-      default_value = constant_op.constant([-1, -2], dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
-                                  dtypes.int64)
-      values = constant_op.constant([[0, 1], [2, 3], [2, 4], [4, 5]],
-                                    dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([[12, 13], [16, 17]], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
-      default_value = constant_op.constant([-1, -2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
-          constant_op.constant([[21, 22], [23, 24]], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant(
-          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([[0, 1], [2, 3], [-1, -2], [4, 5], [-1, -2]],
-                          output.eval())
-
-  def testVectorScalarSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "vector_scalar_save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
-                                  dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t2",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([[12, 13], [15, 16]], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t2",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
-          constant_op.constant([3, 4], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant(
-          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([0, 1, -1, 3, -1], output.eval())
-
-  def testReprobe(self):
-    with self.cached_session():
-      # Insert 6 keys into a table with 8 buckets.
-      # The values are chosen to make sure collisions occur when using GCC STL
-      keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64)
-      values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(6, table.size().eval())
-
-      input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22],
-                                          dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([9], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result)
-
-  def testCustomEmptyKey(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 0, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=12,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 0, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testErrors(self):
-    with self.cached_session():
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-
-      # Inserting the empty key returns an error
-      keys1 = constant_op.constant([11, 0], dtypes.int64)
-      values1 = constant_op.constant([0, 1], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "empty_key"):
-        table.insert(keys1, values1).run()
-
-      # Looking up the empty key returns an error
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "empty_key"):
-        table.lookup(keys1).eval()
-
-      # Inserting the deleted key returns an error
-      keys2 = constant_op.constant([11, -1], dtypes.int64)
-      values2 = constant_op.constant([0, 1], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table.insert(keys2, values2).run()
-
-      # Looking up the empty key returns an error
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table.lookup(keys2).eval()
-
-      # Arbitrary tensors of keys are not supported
-      keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
-      values = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Expected key shape"):
-        table.lookup(keys).eval()
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Expected key shape"):
-        table.insert(keys, values).run()
-
-      table2 = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=17,
-          deleted_key=-1,
-          initial_num_buckets=12)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Number of buckets must be"):
-        self.assertAllEqual(0, table2.size().eval())
-
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Empty and deleted keys must have same shape"):
-        table3 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=[1, 2])
-        self.assertAllEqual(0, table3.size().eval())
-
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Empty and deleted keys cannot be equal"):
-        table4 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=42)
-        self.assertAllEqual(0, table4.size().eval())
-
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Empty and deleted keys cannot be equal"):
-        table5 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=[1, 2, 3],
-            deleted_key=[1, 2, 3])
-        self.assertAllEqual(0, table5.size().eval())
-
-
 class IndexTableFromFile(test.TestCase):
 
   def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
@@ -2721,64 +1481,6 @@ class IdTableWithHashBucketsTest(test.TestCase):
             hasher_spec=lookup.StrongHashSpec([None, 2]))
 
 
-class MutableHashTableBenchmark(test.Benchmark):
-
-  def _create_table(self):
-    return lookup.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
-
-  def benchmark_single_repeated_scalar_insert_scalar(self):
-    table = self._create_table()
-    value = variables.Variable(1.0)
-    insert = table.insert(0, value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
-      assert sess.run(size) == 1
-
-  def benchmark_many_repeated_scalar_insert_scalar(self):
-    table = self._create_table()
-    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
-    value = variables.Variable(1.0)
-    insert = table.insert(c, value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
-      assert sess.run(size) >= 10000
-
-  def benchmark_single_repeated_batch_32_insert_scalar(self):
-    table = self._create_table()
-    value = variables.Variable([1.0] * 32)
-    insert = table.insert(list(range(32)), value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
-      assert sess.run(size) == 32
-
-  def benchmark_many_repeated_batch_32_insert_scalar(self):
-    table = self._create_table()
-    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
-    value = variables.Variable([1.0] * 32)
-    insert = table.insert(32 * c + list(range(32)), value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
-      assert sess.run(size) >= 1000*32
-
-
-class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
-
-  def _create_table(self):
-    return lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.float32,
-        default_value=0.0,
-        empty_key=-1,
-        deleted_key=-2)
-
-
 if __name__ == "__main__":
   test.main()
+
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e..f4ebbdeee883ddeef0d47cb561901c16e2195bb2 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -82,10 +82,11 @@ py_library(
 
 py_test(
     name = "metric_loss_ops_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "python/metric_learning/metric_loss_ops_test.py",
     ],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":metric_learning_py",
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 709a042bbcefb89125f7e4cd14a0d7ecd2b53281..dea111f9a0f734a19758a59fb5838f742573560d 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -22,6 +22,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.ops import add_arg_scope
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -100,8 +101,8 @@ def compute_weighted_loss(losses, weights=1.0, scope=None):
   with ops.name_scope(scope, "weighted_loss", [losses, weights]):
     losses = ops.convert_to_tensor(losses)
     input_dtype = losses.dtype
-    losses = math_ops.to_float(losses)
-    weights = math_ops.to_float(ops.convert_to_tensor(weights))
+    losses = math_ops.cast(losses, dtypes.float32)
+    weights = math_ops.cast(ops.convert_to_tensor(weights), dtypes.float32)
 
     if losses.get_shape().ndims is None:
       raise ValueError("losses.get_shape().ndims cannot be None")
@@ -147,8 +148,8 @@ def _num_present(losses, weights, per_batch=False):
     batch_size = array_ops.reshape(
         array_ops.slice(array_ops.shape(losses), [0], [1]), [])
     num_per_batch = math_ops.div(
-        math_ops.to_float(array_ops.size(losses)),
-        math_ops.to_float(batch_size))
+        math_ops.cast(array_ops.size(losses), dtypes.float32),
+        math_ops.cast(batch_size, dtypes.float32))
     num_per_batch = array_ops.where(
         math_ops.equal(weights, 0), 0.0, num_per_batch)
     num_per_batch = math_ops.multiply(
@@ -159,12 +160,14 @@ def _num_present(losses, weights, per_batch=False):
   if weights.get_shape().ndims >= 1:
     axis = list(range(1, weights.get_shape().ndims))
     num_nonzero_per_batch = math_ops.reduce_sum(
-        math_ops.to_float(math_ops.not_equal(weights, 0)), axis=axis)
+        math_ops.cast(math_ops.not_equal(weights, 0), dtypes.float32),
+        axis=axis)
 
   # Next, determine the number of elements that weights would broadcast to:
   broadcast_dims = array_ops.slice(
       array_ops.shape(losses), [weights.get_shape().ndims], [-1])
-  num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))
+  num_to_broadcast = math_ops.cast(math_ops.reduce_prod(broadcast_dims),
+                                   dtypes.float32)
 
   num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
   return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
@@ -262,8 +265,8 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None):
   with ops.name_scope(scope, "absolute_difference",
                       [predictions, labels, weights]) as scope:
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtypes.float32)
+    labels = math_ops.cast(labels, dtypes.float32)
     losses = math_ops.abs(math_ops.subtract(predictions, labels))
     return compute_weighted_loss(losses, weights, scope=scope)
 
@@ -438,8 +441,8 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
   with ops.name_scope(scope, "log_loss",
                       [predictions, labels, weights]) as scope:
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtypes.float32)
+    labels = math_ops.cast(labels, dtypes.float32)
     losses = -math_ops.multiply(
         labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
             (1 - labels), math_ops.log(1 - predictions + epsilon))
@@ -473,7 +476,7 @@ def hinge_loss(logits, labels=None, scope=None):
   with ops.name_scope(scope, "hinge_loss", [logits, labels]) as scope:
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
     # We first need to convert binary labels to -1/1 labels (as floats).
-    labels = math_ops.to_float(labels)
+    labels = math_ops.cast(labels, dtypes.float32)
     all_ones = array_ops.ones_like(labels)
     labels = math_ops.subtract(2 * labels, all_ones)
     return nn_ops.relu(
@@ -509,9 +512,9 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
   with ops.name_scope(scope, "mean_squared_error",
                       [predictions, labels, weights]) as scope:
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
-    losses = math_ops.square(math_ops.subtract(predictions, labels))
+    predictions = math_ops.cast(predictions, dtypes.float32)
+    labels = math_ops.cast(labels, dtypes.float32)
+    losses = math_ops.squared_difference(predictions, labels)
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -563,9 +566,9 @@ def mean_pairwise_squared_error(predictions,
   with ops.name_scope(scope, "mean_pairwise_squared_error",
                       [predictions, labels, weights]) as scope:
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
-    weights = math_ops.to_float(ops.convert_to_tensor(weights))
+    predictions = math_ops.cast(predictions, dtypes.float32)
+    labels = math_ops.cast(labels, dtypes.float32)
+    weights = math_ops.cast(ops.convert_to_tensor(weights), dtypes.float32)
 
     diffs = math_ops.subtract(predictions, labels)
 
@@ -638,8 +641,8 @@ def cosine_distance(predictions,
                       [predictions, labels, weights]) as scope:
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtypes.float32)
+    labels = math_ops.cast(labels, dtypes.float32)
 
     radial_diffs = math_ops.multiply(predictions, labels)
     losses = 1 - math_ops.reduce_sum(
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index de76acb51ffe985162a66c617b266f47c5216b19..226527a49c73834cb3ccb0cc3255f981fda64e84 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -67,11 +67,13 @@ def pairwise_distance(feature, squared=False):
     pairwise_distances = pairwise_distances_squared
   else:
     pairwise_distances = math_ops.sqrt(
-        pairwise_distances_squared + math_ops.to_float(error_mask) * 1e-16)
+        pairwise_distances_squared +
+        math_ops.cast(error_mask, dtypes.float32) * 1e-16)
 
   # Undo conditionally adding 1e-16.
   pairwise_distances = math_ops.multiply(
-      pairwise_distances, math_ops.to_float(math_ops.logical_not(error_mask)))
+      pairwise_distances,
+      math_ops.cast(math_ops.logical_not(error_mask), dtypes.float32))
 
   num_data = array_ops.shape(feature)[0]
   # Explicitly set diagonals to zero.
@@ -105,13 +107,14 @@ def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
   # Get per pair distances
   distances = math_ops.sqrt(
       math_ops.reduce_sum(
-          math_ops.square(embeddings_anchor - embeddings_positive), 1))
+          math_ops.squared_difference(embeddings_anchor, embeddings_positive),
+          1))
 
   # Add contrastive loss for the siamese network.
   #   label here is {0,1} for neg, pos.
   return math_ops.reduce_mean(
-      math_ops.to_float(labels) * math_ops.square(distances) +
-      (1. - math_ops.to_float(labels)) *
+      math_ops.cast(labels, dtypes.float32) * math_ops.square(distances) +
+      (1. - math_ops.cast(labels, dtypes.float32)) *
       math_ops.square(math_ops.maximum(margin - distances, 0.)),
       name='contrastive_loss')
 
@@ -283,8 +286,8 @@ def npairs_loss(labels, embeddings_anchor, embeddings_positive,
   assert lshape.shape == 1
   labels = array_ops.reshape(labels, [lshape[0], 1])
 
-  labels_remapped = math_ops.to_float(
-      math_ops.equal(labels, array_ops.transpose(labels)))
+  labels_remapped = math_ops.cast(
+      math_ops.equal(labels, array_ops.transpose(labels)), dtypes.float32)
   labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
   # Add the softmax loss.
@@ -317,9 +320,10 @@ def _build_multilabel_adjacency(sparse_labels):
   adjacency_matrix = array_ops.zeros([num_pairs, num_pairs])
   for i in range(num_pairs):
     for j in range(num_pairs):
-      sparse_dot_product = math_ops.to_float(
+      sparse_dot_product = math_ops.cast(
           sparse_ops.sparse_reduce_sum(sparse_ops.sparse_minimum(
-              sparse_labels[i], sparse_labels[j])))
+              sparse_labels[i], sparse_labels[j])),
+          dtypes.float32)
       sparse_dot_product = array_ops.expand_dims(sparse_dot_product, 0)
       sparse_dot_product = array_ops.expand_dims(sparse_dot_product, 1)
       one_hot_matrix = array_ops.pad(sparse_dot_product,
@@ -389,7 +393,7 @@ def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
     # TODO(coreylynch): are composed only of 0's and 1's.
 
     multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
-    labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
+    labels_remapped = math_ops.cast(multilabel_adjacency_matrix, dtypes.float32)
     labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
     # Add the softmax loss.
@@ -541,7 +545,8 @@ def get_cluster_assignment(pairwise_distances, centroid_ids):
                         array_ops.constant(0, dtype=dtypes.int64),
                         axis=0,
                         dtype=dtypes.int64),
-      math_ops.to_int64(math_ops.range(array_ops.shape(centroid_ids)[0])))
+      math_ops.cast(math_ops.range(array_ops.shape(centroid_ids)[0]),
+                    dtypes.int64))
   constraint_vect = math_ops.reduce_sum(
       array_ops.transpose(constraint_one_hot), axis=0)
 
@@ -605,46 +610,51 @@ def compute_clustering_score(labels, predictions, margin_type):
 
 
 def _compute_nmi_score(labels, predictions):
-  return math_ops.to_float(
+  return math_ops.cast(
       script_ops.py_func(
           metrics.normalized_mutual_info_score, [labels, predictions],
           [dtypes.float64],
-          name='nmi'))
+          name='nmi'),
+      dtypes.float32)
 
 
 def _compute_ami_score(labels, predictions):
-  ami_score = math_ops.to_float(
+  ami_score = math_ops.cast(
       script_ops.py_func(
           metrics.adjusted_mutual_info_score, [labels, predictions],
           [dtypes.float64],
-          name='ami'))
+          name='ami'),
+      dtypes.float32)
   return math_ops.maximum(0.0, ami_score)
 
 
 def _compute_ari_score(labels, predictions):
-  ari_score = math_ops.to_float(
+  ari_score = math_ops.cast(
       script_ops.py_func(
           metrics.adjusted_rand_score, [labels, predictions], [dtypes.float64],
-          name='ari'))
+          name='ari'),
+      dtypes.float32)
   # ari score can go below 0
   # http://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-score
   return math_ops.maximum(0.0, ari_score)
 
 
 def _compute_vmeasure_score(labels, predictions):
-  vmeasure_score = math_ops.to_float(
+  vmeasure_score = math_ops.cast(
       script_ops.py_func(
           metrics.v_measure_score, [labels, predictions], [dtypes.float64],
-          name='vmeasure'))
+          name='vmeasure'),
+      dtypes.float32)
   return math_ops.maximum(0.0, vmeasure_score)
 
 
 def _compute_zeroone_score(labels, predictions):
-  zeroone_score = math_ops.to_float(
+  zeroone_score = math_ops.cast(
       math_ops.equal(
           math_ops.reduce_sum(
-              math_ops.to_int32(math_ops.equal(labels, predictions))),
-          array_ops.shape(labels)[0]))
+              math_ops.cast(math_ops.equal(labels, predictions), dtypes.int32)),
+          array_ops.shape(labels)[0]),
+      dtypes.float32)
   return zeroone_score
 
 
@@ -710,8 +720,8 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
   candidate_scores = math_ops.add(
       candidate_scores, margin_multiplier * nmi_scores)
 
-  argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, axis=0))
+  argmax_index = math_ops.cast(
+      math_ops.argmax(candidate_scores, axis=0), dtypes.int32)
 
   return candidate_ids[argmax_index]
 
@@ -786,7 +796,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
 
   def func_body(iteration, scores_margin):
     # swap the current medoid with the candidate cluster member
-    candidate_medoid = math_ops.to_int32(cluster_member_ids[iteration])
+    candidate_medoid = math_ops.cast(cluster_member_ids[iteration], dtypes.int32)
     tmp_chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, candidate_medoid)
     predictions = get_cluster_assignment(pairwise_distances, tmp_chosen_ids)
     metric_score = compute_clustering_score(labels, predictions, margin_type)
@@ -810,10 +820,10 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
                                                  [iteration, scores_margin])
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
-  argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, axis=0))
+  argmax_index = math_ops.cast(
+      math_ops.argmax(candidate_scores, axis=0), dtypes.int32)
 
-  best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
+  best_medoid = math_ops.cast(cluster_member_ids[argmax_index], dtypes.int32)
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
   return chosen_ids
 
@@ -841,7 +851,8 @@ def update_all_medoids(pairwise_distances, predictions, labels, chosen_ids,
   def func_body_augmented_pam(iteration, chosen_ids):
     """Call the update_medoid_per_cluster subroutine."""
     mask = math_ops.equal(
-        math_ops.to_int64(predictions), math_ops.to_int64(iteration))
+        math_ops.cast(predictions, dtypes.int64),
+        math_ops.cast(iteration, dtypes.int64))
     this_cluster_ids = array_ops.where(mask)
 
     pairwise_distances_subset = array_ops.transpose(
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 7ea6e34cf50ed8e292f11314550d992c3dde34c0..37cb198e22166a14d2a65704f563d1d530f7f322 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -439,7 +439,6 @@ $(MARCH_OPTION) \
 -Itensorflow/core/kernels \
 -I$(MAKEFILE_DIR)/downloads/cub \
 -I$(MAKEFILE_DIR)/downloads/cub/cub_archive/cub/device \
--Ithird_party/toolchains/gpus/cuda \
 -I$(JETPACK)/cuda/include \
 -I$(JETPACK) \
 -I$(JETPACK)/cuDNN/aarch64 \
@@ -630,6 +629,9 @@ BENCHMARK_NAME := $(BINDIR)benchmark
 
 CORE_CC_ALL_SRCS := \
 $(ABSL_CC_SRCS) \
+tensorflow/c/c_api.cc \
+tensorflow/c/kernels.cc \
+tensorflow/c/tf_status_helper.cc \
 $(wildcard tensorflow/core/*.cc) \
 $(wildcard tensorflow/core/common_runtime/*.cc) \
 $(wildcard tensorflow/core/framework/*.cc) \
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 8fa20213633414d134d6c6a50e151cce2ac8a368..d2fbf696f8f3bc0031db132b021b3da3591d5ce6 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -24,11 +24,11 @@ fi
 usage() {
   echo "Usage: $(basename "$0") [-a]"
   echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
-  echo "default arch i386, x86_64, armv7, armv7s, arm64"
+  echo "default arch x86_64, armv7, armv7s, arm64"
   exit 1
 }
 
-BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+BUILD_TARGET="x86_64 armv7 armv7s arm64"
 while getopts "a:" opt_name; do
   case "$opt_name" in
     a) BUILD_TARGET="${OPTARG}";;
@@ -115,39 +115,6 @@ package_pb_library() {
 
 build_target() {
 case "$1" in
-    i386)  make distclean
-        ./configure \
-        --host=i386-apple-${OSX_VERSION} \
-        --disable-shared \
-        --enable-cross-compile \
-        --with-protoc="${PROTOC_PATH}" \
-        --prefix=${LIBDIR}/iossim_386 \
-        --exec-prefix=${LIBDIR}/iossim_386 \
-        "CFLAGS=${CFLAGS} \
-        -mios-simulator-version-min=${MIN_SDK_VERSION} \
-        -arch i386 \
-        -fembed-bitcode \
-        -isysroot ${IPHONESIMULATOR_SYSROOT}" \
-        "CXX=${CXX}" \
-        "CXXFLAGS=${CXXFLAGS} \
-        -mios-simulator-version-min=${MIN_SDK_VERSION} \
-        -arch i386 \
-        -fembed-bitcode \
-        -isysroot \
-        ${IPHONESIMULATOR_SYSROOT}" \
-        LDFLAGS="-arch i386 \
-        -fembed-bitcode \
-        -mios-simulator-version-min=${MIN_SDK_VERSION} \
-        ${LDFLAGS} \
-        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
-        "LIBS=${LIBS}"
-        make -j"${JOB_COUNT}"
-        make install
-
-        package_pb_library "iossim_386"
-        ;;
-
     x86_64) make distclean
         ./configure \
         --host=x86_64-apple-${OSX_VERSION} \
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index ae82163e1178216fc22aad37cd07fd1734c2bedb..3822f0d7da78afbf67fa1fa0389730b0196b637b 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -46,11 +46,11 @@ fi
 usage() {
   echo "Usage: $(basename "$0") [-a]"
   echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
-  echo "default is [i386, x86_64, armv7, armv7s, arm64]"
+  echo "default is [x86_64, armv7, armv7s, arm64]"
   exit 1
 }
 
-BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+BUILD_TARGET="x86_64 armv7 armv7s arm64"
 while getopts "a:f:h:n:" opt_name; do
   case "$opt_name" in
     a) BUILD_TARGET="${OPTARG}";;
@@ -126,18 +126,6 @@ case "$1" in
         fi
         package_tf_library "arm64"
         ;;
-    i386)
-        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-        TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a \
-        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
-        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
-        if [ $? -ne 0 ]
-        then
-          echo "i386 compilation failed."
-          exit 1
-        fi
-        package_tf_library "i386"
-        ;;
     x86_64)
         make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
         TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a \
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index cb4c94d92fc630c1ce4158c618cd82be80de6741..e154b8223c64fd0b42eb4cc6ae7da1188b92ac36 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -22,7 +22,7 @@ set -e
 prog=compile_nsync.sh
 android_api_version=21
 default_android_arch=armeabi-v7a
-default_ios_arch="i386 x86_64 armv7 armv7s arm64"
+default_ios_arch="x86_64 armv7 armv7s arm64"
 
 usage="usage: $prog [-t linux|ios|android|macos|native]
         [-a architecture] [-v android_api_version]
@@ -130,7 +130,7 @@ for arch in $archs; do
 
         ios)    arch_flags=
                 case "$arch" in
-                i386|x86_64)
+                x86_64)
                         arch_flags="$arch_flags -mios-simulator-version-min=8.0"
                         arch_flags="$arch_flags -isysroot $(xcrun --sdk iphonesimulator --show-sdk-path)"
                         ;;
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 2a5232b476712a96f84be0f4725beb78bc138297..af3c541dc214c30e9e59fdcca995ffc53b028df4 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -142,5 +142,6 @@ replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DAT
 # TODO(satok): Remove this once protobuf/autogen.sh is fixed.
 replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#http://download.tensorflow.org/deps/gmock-1.7.0.zip#' \
   "${DOWNLOADS_DIR}/protobuf/autogen.sh"
+cat "third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 9ea94c74330e3e49414a6a84cd5bc0db3778114a..0a0ba36232075460b561bc54a95fc24973017571 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -40,7 +40,6 @@ tensorflow/core/lib/wav/wav_io.cc
 tensorflow/core/platform/cpu_info.cc
 tensorflow/core/platform/default/logging.cc
 tensorflow/core/platform/default/mutex.cc
-tensorflow/core/platform/default/protobuf.cc
 tensorflow/core/platform/default/tracing.cc
 tensorflow/core/platform/denormal.cc
 tensorflow/core/platform/env.cc
@@ -53,6 +52,7 @@ tensorflow/core/platform/posix/error.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/port.cc
 tensorflow/core/platform/posix/posix_file_system.cc
+tensorflow/core/platform/protobuf.cc
 tensorflow/core/platform/protobuf_util.cc
 tensorflow/core/platform/setround.cc
 tensorflow/core/platform/tensor_coding.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index 87c73ec1ca610cac6d63468887bc350bada5910b..1c1460ce77c99d29785c7e8b8a8e9f770a45b59f 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/grappler/costs/op_performance_data.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
+tensorflow/core/protobuf/trackable_object_graph.pb.cc
 tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/eager_service.pb.cc
@@ -34,8 +35,11 @@ tensorflow/core/protobuf/meta_graph.pb.cc
 tensorflow/core/protobuf/named_tensor.pb.cc
 tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
+tensorflow/core/protobuf/saved_object_graph.pb.cc
 tensorflow/core/protobuf/saver.pb.cc
+tensorflow/core/protobuf/struct.pb.cc
 tensorflow/core/protobuf/tensorflow_server.pb.cc
+tensorflow/core/protobuf/verifier_config.pb.cc
 tensorflow/core/util/event.pb.cc
 tensorflow/core/util/memmapped_file_system.pb.cc
 tensorflow/core/util/saved_tensor_slice.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 4120ea52ec5255b1efce7a6ce6890fc79c1e4831..5def632e8a7b65272a1339bdacd92c1fa23012d2 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.h
 tensorflow/core/framework/versions.pb.h
 tensorflow/core/grappler/costs/op_performance_data.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
+tensorflow/core/protobuf/trackable_object_graph.pb.h
 tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
@@ -34,9 +35,12 @@ tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
+tensorflow/core/protobuf/saved_object_graph.pb.h
 tensorflow/core/protobuf/saver.pb.h
+tensorflow/core/protobuf/struct.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/protobuf/tensorflow_server.pb.h
+tensorflow/core/protobuf/verifier_config.pb.h
 tensorflow/core/util/event.pb.h
 tensorflow/core/util/memmapped_file_system.pb.h
 tensorflow/core/util/saved_tensor_slice.pb.h
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 2cd7d6d519a55423a96526b541845392d9ec6bc2..ea5f5913c66e85ed6ea84e7127c8eb85a51d609e 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -1,3 +1,4 @@
+tensorflow/c/kernels/bitcast_op.cc
 tensorflow/contrib/boosted_trees/ops/model_ops.cc
 tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
 tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -43,7 +44,9 @@ tensorflow/core/kernels/conv_grad_input_ops.cc
 tensorflow/core/kernels/conv_grad_ops.cc
 tensorflow/core/kernels/conv_ops.cc
 tensorflow/core/kernels/conv_ops_3d.cc
-tensorflow/core/kernels/conv_ops_fused.cc
+tensorflow/core/kernels/conv_ops_fused_double.cc
+tensorflow/core/kernels/conv_ops_fused_float.cc
+tensorflow/core/kernels/conv_ops_fused_half.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
 tensorflow/core/kernels/ctc_decoder_ops.cc
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index f94d70db9046cec43073ab1406762aea1f28c8e3..13e3b6422d1989b0d499d8d20901d919554c630e 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -29,5 +29,6 @@ tensorflow/core/protobuf/debug.pb_text.cc
 tensorflow/core/protobuf/rewriter_config.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
 tensorflow/core/protobuf/tensor_bundle.pb_text.cc
+tensorflow/core/protobuf/verifier_config.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/util/saved_tensor_slice.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 2712e906d719e72dacb60e213205ad68895f905f..deb6a5b94020a02b878bdd68a33b3737a97fcf2b 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -31,6 +31,7 @@ tensorflow/core/framework/versions.proto
 tensorflow/core/grappler/costs/op_performance_data.proto
 tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/protobuf/trackable_object_graph.proto
 tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
@@ -40,9 +41,12 @@ tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/rewriter_config.proto
+tensorflow/core/protobuf/saved_object_graph.proto
 tensorflow/core/protobuf/saver.proto
+tensorflow/core/protobuf/struct.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/protobuf/tensorflow_server.proto
+tensorflow/core/protobuf/verifier_config.proto
 tensorflow/core/util/event.proto
 tensorflow/core/util/memmapped_file_system.proto
 tensorflow/core/util/saved_tensor_slice.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 63843b993c16363a80b64622af665aaa64e05830..93701249cc8bf722c8c8558e91e0b700ca1c4a04 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -10,6 +10,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -45,6 +46,28 @@ tf_gen_op_wrapper_py(
     deps = [":memory_stats_ops_op_lib"],
 )
 
+tf_gen_op_wrapper_cc(
+    name = "memory_stats_ops",
+    out_ops_file = "memory_stats_ops",
+)
+
+cc_library(
+    name = "memory_stats_cc",
+    srcs = ["memory_stats_ops.cc"],
+    hdrs = ["memory_stats_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_stats_kernels",
+        ":memory_stats_ops_op_lib",
+        "//tensorflow/cc:const_op",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "memory_stats_py",
     srcs = [
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 974fb537499c5ea4591a0a128f53d2dea67b9e57..7ae1dbeaa2d04d7846e7fada117f3941319cc1c1 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -24,13 +24,15 @@ class MemoryStatsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     Allocator* allocator =
         context->device()->GetAllocator(AllocatorAttributes());
-    AllocatorStats allocator_stats;
-    allocator->GetStats(&allocator_stats);
+    absl::optional<AllocatorStats> allocator_stats = allocator->GetStats();
+    if (!allocator_stats) {
+      *allocator_stats = AllocatorStats();
+    }
 
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(0, TensorShape({}), &output_tensor));
-    output_tensor->scalar<int64>()() = ExtractAllocatorStats(allocator_stats);
+    output_tensor->scalar<int64>()() = ExtractAllocatorStats(*allocator_stats);
   }
 
  protected:
@@ -71,7 +73,7 @@ class BytesLimitOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.bytes_limit;
+    return allocator_stats.bytes_limit ? *allocator_stats.bytes_limit : -1;
   }
 };
 
@@ -93,7 +95,7 @@ class MaxBytesInUseOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.max_bytes_in_use;
+    return allocator_stats.peak_bytes_in_use;
   }
 };
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 7b432f8bd20989c6d95310bcaca88d44ce3e0d1f..c7316229cf00d0b8db11aa6c37a1e42dd67dcfe1 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -772,7 +772,7 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
 
   if weights is not None:
     broadcast_weights = weights_broadcast_ops.broadcast_weights(
-        math_ops.to_float(weights), predictions)
+        math_ops.cast(weights, dtypes.float32), predictions)
     weights_tiled = array_ops.tile(
         array_ops.reshape(broadcast_weights, [1, -1]), [num_thresholds, 1])
     thresh_tiled.get_shape().assert_is_compatible_with(
@@ -786,8 +786,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   if 'tp' in includes:
     true_positives = metrics_impl.metric_variable(
         [num_thresholds], dtypes.float32, name='true_positives')
-    is_true_positive = math_ops.to_float(
-        math_ops.logical_and(label_is_pos, pred_is_pos))
+    is_true_positive = math_ops.cast(
+        math_ops.logical_and(label_is_pos, pred_is_pos), dtypes.float32)
     if weights_tiled is not None:
       is_true_positive *= weights_tiled
     update_ops['tp'] = state_ops.assign_add(true_positives,
@@ -798,8 +798,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   if 'fn' in includes:
     false_negatives = metrics_impl.metric_variable(
         [num_thresholds], dtypes.float32, name='false_negatives')
-    is_false_negative = math_ops.to_float(
-        math_ops.logical_and(label_is_pos, pred_is_neg))
+    is_false_negative = math_ops.cast(
+        math_ops.logical_and(label_is_pos, pred_is_neg), dtypes.float32)
     if weights_tiled is not None:
       is_false_negative *= weights_tiled
     update_ops['fn'] = state_ops.assign_add(false_negatives,
@@ -810,8 +810,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   if 'tn' in includes:
     true_negatives = metrics_impl.metric_variable(
         [num_thresholds], dtypes.float32, name='true_negatives')
-    is_true_negative = math_ops.to_float(
-        math_ops.logical_and(label_is_neg, pred_is_neg))
+    is_true_negative = math_ops.cast(
+        math_ops.logical_and(label_is_neg, pred_is_neg), dtypes.float32)
     if weights_tiled is not None:
       is_true_negative *= weights_tiled
     update_ops['tn'] = state_ops.assign_add(true_negatives,
@@ -822,8 +822,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   if 'fp' in includes:
     false_positives = metrics_impl.metric_variable(
         [num_thresholds], dtypes.float32, name='false_positives')
-    is_false_positive = math_ops.to_float(
-        math_ops.logical_and(label_is_neg, pred_is_pos))
+    is_false_positive = math_ops.cast(
+        math_ops.logical_and(label_is_neg, pred_is_pos), dtypes.float32)
     if weights_tiled is not None:
       is_false_positive *= weights_tiled
     update_ops['fp'] = state_ops.assign_add(false_positives,
@@ -1356,9 +1356,8 @@ def _compute_placement_auc(labels, predictions, weights, alpha,
           weights_0 * math_ops.square(1. - placement_values_0 - auc_0)) /
       (total_0 - 1. + _EPSILON))
   var_1 = (
-      math_ops.reduce_sum(
-          weights_1 * math_ops.square(placement_values_1 - auc_1)) /
-      (total_1 - 1. + _EPSILON))
+      math_ops.reduce_sum(weights_1 * math_ops.squared_difference(
+          placement_values_1, auc_1)) / (total_1 - 1. + _EPSILON))
   auc_std_err = math_ops.sqrt(
       (var_0 / (total_0 + _EPSILON)) + (var_1 / (total_1 + _EPSILON)))
 
@@ -2165,7 +2164,7 @@ def streaming_recall_at_k(predictions,
       either `metrics_collections` or `updates_collections` are not a list or
       tuple.
   """
-  in_top_k = math_ops.to_float(nn.in_top_k(predictions, labels, k))
+  in_top_k = math_ops.cast(nn.in_top_k(predictions, labels, k), dtypes.float32)
   return streaming_mean(in_top_k, weights, metrics_collections,
                         updates_collections, name or _at_k_name('recall', k))
 
@@ -3206,7 +3205,8 @@ def streaming_covariance(predictions,
         [], dtypes.float32, name='comoment')
 
     if weights is None:
-      batch_count = math_ops.to_float(array_ops.size(labels))  # n_B in eqn
+      batch_count = math_ops.cast(
+          array_ops.size(labels), dtypes.float32)  # n_B in eqn
       weighted_predictions = predictions
       weighted_labels = labels
     else:
@@ -3766,15 +3766,15 @@ def count(values,
     count_ = metrics_impl.metric_variable([], dtypes.float32, name='count')
 
     if weights is None:
-      num_values = math_ops.to_float(array_ops.size(values))
+      num_values = math_ops.cast(array_ops.size(values), dtypes.float32)
     else:
-      values = math_ops.to_float(values)
+      values = math_ops.cast(values, dtypes.float32)
       values, _, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
           predictions=values,
           labels=None,
           weights=weights)
       weights = weights_broadcast_ops.broadcast_weights(
-          math_ops.to_float(weights), values)
+          math_ops.cast(weights, dtypes.float32), values)
       num_values = math_ops.reduce_sum(weights)
 
     with ops.control_dependencies([values]):
@@ -3896,10 +3896,11 @@ def cohen_kappa(labels,
       total = math_ops.reduce_sum(pe_row)
       pe_sum = math_ops.reduce_sum(
           math_ops.div_no_nan(
-              math_ops.to_double(pe_row * pe_col), math_ops.to_double(total)))
-      po_sum, pe_sum, total = (math_ops.to_double(po_sum),
-                               math_ops.to_double(pe_sum),
-                               math_ops.to_double(total))
+              math_ops.cast(pe_row * pe_col, dtypes.float64),
+              math_ops.cast(total, dtypes.float64)))
+      po_sum, pe_sum, total = (math_ops.cast(po_sum, dtypes.float64),
+                               math_ops.cast(pe_sum, dtypes.float64),
+                               math_ops.cast(total, dtypes.float64))
       # kappa = (po - pe) / (N - pe)
       k = metrics_impl._safe_scalar_div(  # pylint: disable=protected-access
           po_sum - pe_sum,
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index fc64f343ab4add17f04eabeccab922e8be51a692..aec07241e7a23410870de2285ab0470b1650cc7b 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -5810,9 +5810,10 @@ class StreamingCovarianceTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])))
     _assert_metric_variables(self, (
         'covariance/comoment:0',
         'covariance/count:0',
@@ -5823,18 +5824,20 @@ class StreamingCovarianceTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     cov, _ = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [cov])
 
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
@@ -5857,8 +5860,8 @@ class StreamingCovarianceTest(test.TestCase):
 
   def testSingleUpdateIdentical(self):
     with self.cached_session() as sess:
-      predictions = math_ops.to_float(math_ops.range(10))
-      labels = math_ops.to_float(math_ops.range(10))
+      predictions = math_ops.cast(math_ops.range(10), dtypes_lib.float32)
+      labels = math_ops.cast(math_ops.range(10), dtypes_lib.float32)
 
       cov, update_op = metrics.streaming_covariance(predictions, labels)
 
@@ -5982,9 +5985,10 @@ class StreamingPearsonRTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])))
     _assert_metric_variables(self, (
         'pearson_r/covariance/comoment:0',
         'pearson_r/covariance/count:0',
@@ -6003,18 +6007,20 @@ class StreamingPearsonRTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     pearson_r, _ = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [pearson_r])
 
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) +
+        predictions=math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
         array_ops.ones([10, 10]),
-        labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
+        labels=(math_ops.cast(math_ops.range(10), dtypes_lib.float32) +
+                array_ops.ones([10, 10])),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
@@ -6038,8 +6044,8 @@ class StreamingPearsonRTest(test.TestCase):
 
   def testSingleUpdateIdentical(self):
     with self.cached_session() as sess:
-      predictions = math_ops.to_float(math_ops.range(10))
-      labels = math_ops.to_float(math_ops.range(10))
+      predictions = math_ops.cast(math_ops.range(10), dtypes_lib.float32)
+      labels = math_ops.cast(math_ops.range(10), dtypes_lib.float32)
 
       pearson_r, update_op = metrics.streaming_pearson_correlation(
           predictions, labels)
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index a31fa9ce0b3110d875689d74a41ca9f9cc85f532..e44e10af0814ba8d6d964dfc34a0470ce45c0b40 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -54,7 +54,10 @@ MPIServer::~MPIServer() {
 
 Status MPIServer::Init(ServiceInitFunction service_func,
                        RendezvousMgrCreationFunction rendezvous_mgr_func) {
-  Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
+  GrpcServerOptions opts;
+  opts.service_func = service_func;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  Status s = GrpcServer::Init(opts);
   return s;
 }
 
diff --git a/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc b/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
index b04abde4694199d827a1738850bded9bf696d56c..ca3ddfa721d45a2de3ea51c80d6adfa2371c3c94 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
@@ -96,13 +96,14 @@ __global__ void elemwise_accum(T* out, const T* in, const size_t N) {
 // Synchronously accumulate tensors on the GPU, using a different stream than
 // the default and than TensorFlow to avoid synchronizing on operations
 // unrelated to the allreduce.
-#define GENERATE_ACCUMULATE(type)                                    \
-  template <>                                                        \
-  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
-                                             size_t size) {          \
-    auto stream = CudaStreamForMPI();                                \
-    elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size);    \
-    cudaStreamSynchronize(stream);                                   \
+#define GENERATE_ACCUMULATE(type)                                          \
+  template <>                                                              \
+  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src,       \
+                                             size_t size) {                \
+    auto stream = CudaStreamForMPI();                                      \
+    TF_CHECK_OK(CudaLaunchKernel(elemwise_accum<type>, 32, 256, 0, stream, \
+                                 dst, src, size));                         \
+    cudaStreamSynchronize(stream);                                         \
   };
 GENERATE_ACCUMULATE(int);
 GENERATE_ACCUMULATE(long long);
diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/ring.cu.cc
index 2f3eef366a9a3c10e59cd5298fc1626e1094dff8..c73156d230820e8f89d88d8d4c8599fd1a5f68d8 100644
--- a/tensorflow/contrib/mpi_collectives/ring.cu.cc
+++ b/tensorflow/contrib/mpi_collectives/ring.cu.cc
@@ -96,13 +96,14 @@ __global__ void elemwise_accum(T* out, const T* in, const size_t N) {
 // Synchronously accumulate tensors on the GPU, using a different stream than
 // the default and than TensorFlow to avoid synchronizing on operations
 // unrelated to the allreduce.
-#define GENERATE_ACCUMULATE(type)                                    \
-  template <>                                                        \
-  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
-                                             size_t size) {          \
-    auto stream = CudaStreamForMPI();                                \
-    elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size);    \
-    cudaStreamSynchronize(stream);                                   \
+#define GENERATE_ACCUMULATE(type)                                          \
+  template <>                                                              \
+  void AccumulateTensorData<GPUDevice, type>(type * dst, type * src,       \
+                                             size_t size) {                \
+    auto stream = CudaStreamForMPI();                                      \
+    TF_CHECK_OK(CudaLaunchKernel(elemwise_accum<type>, 32, 256, 0, stream, \
+                                 dst, src, size));                         \
+    cudaStreamSynchronize(stream);                                         \
   };
 GENERATE_ACCUMULATE(int);
 GENERATE_ACCUMULATE(long long);
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
index cae57ce60eb09509af69f8ccab9eacedea361548..9b5d52e1b648e62af93d5420885e4f22796e3ea1 100644
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ b/tensorflow/contrib/mpi_collectives/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
  *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
  * complete.
  *
- *  Next, the allgather distributes these fully accumululated chunks across all
+ *  Next, the allgather distributes these fully accumulated chunks across all
  * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
  * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
  * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 12320d9e456ae93cbf95639a0c9e0c7f414f3518..f30643cf3059754daaeee4093938ac47b26f76ea 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -413,8 +413,9 @@ py_test(
 
 py_test(
     name = "shampoo_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/training/shampoo_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
index c5c9fc74deaf0171a33d0eb1b5c6f60b3aa5e533..0b149ed17533adff3bd7cd8fd8ff94d171f72911 100644
--- a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Adam rewrite to use global step for computing beta1 & beta2 accumulation."""
 from __future__ import absolute_import
 from __future__ import division
@@ -38,9 +37,14 @@ class AdamGSOptimizer(optimizer.Optimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-  def __init__(self, global_step=0, learning_rate=0.001,
-               beta1=0.9, beta2=0.999, epsilon=1e-8,
-               use_locking=False, name="Adam"):
+  def __init__(self,
+               global_step=0,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
     r"""Construct a new Adam optimizer.
 
     Branched from tf.train.AdamOptimizer. The only difference is to pass
@@ -112,9 +116,6 @@ class AdamGSOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Created in SparseApply if needed.
-    self._updated_lr = None
-
   def _get_beta_accumulators(self):
     return (math_ops.pow(self._beta1_t, self._global_step_on_worker),
             math_ops.pow(self._beta2_t, self._global_step_on_worker))
@@ -146,28 +147,34 @@ class AdamGSOptimizer(optimizer.Optimizer):
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power, beta2_power = self._get_beta_accumulators()
@@ -181,8 +188,7 @@ class AdamGSOptimizer(optimizer.Optimizer):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -192,23 +198,26 @@ class AdamGSOptimizer(optimizer.Optimizer):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
+        grad.values,
+        var,
+        grad.indices,
         lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking))
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
diff --git a/tensorflow/contrib/opt/python/training/ggt.py b/tensorflow/contrib/opt/python/training/ggt.py
index 6dc17fe5a5210fa1700e1382016e40fa0a792917..df0cb2b0071b932418492e83aebca9fe70027162 100644
--- a/tensorflow/contrib/opt/python/training/ggt.py
+++ b/tensorflow/contrib/opt/python/training/ggt.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import collections
 import numpy as np
 from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -224,7 +225,7 @@ class GGTOptimizer(optimizer_v2.OptimizerV2):
     window = state.get_hyper("window")
     grad_buffer = self._get_grad_buffer(state)
     next_grad_index = math_ops.floormod(
-        math_ops.to_int32(update_global_step - 1.), window)
+        math_ops.cast(update_global_step - 1., dtypes.int32), window)
     # grad_buffer[(t-1) % window] := moment1_t
     update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index,
                                                   update_moment1)
diff --git a/tensorflow/contrib/opt/python/training/matrix_functions.py b/tensorflow/contrib/opt/python/training/matrix_functions.py
index baab577638626fb39bfbd9b60d98b5848d481a1c..1c5d2fe17876cf6dda45194b445c3a12b65b1210 100644
--- a/tensorflow/contrib/opt/python/training/matrix_functions.py
+++ b/tensorflow/contrib/opt/python/training/matrix_functions.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -57,7 +58,7 @@ def matrix_square_root(mat_a, mat_a_size, iter_count=100, ridge_epsilon=1e-4):
     current_err = math_ops.sqrt(math_ops.reduce_sum(residual * residual)) / norm
     return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err
 
-  identity = linalg_ops.eye(math_ops.to_int32(mat_a_size))
+  identity = linalg_ops.eye(math_ops.cast(mat_a_size, dtypes.int32))
   mat_a = mat_a + ridge_epsilon * identity
   norm = math_ops.sqrt(math_ops.reduce_sum(mat_a * mat_a))
   mat_init_y = mat_a / norm
@@ -100,7 +101,7 @@ def matrix_inverse_pth_root(mat_g,
     mat_g^alpha
   """
 
-  identity = linalg_ops.eye(math_ops.to_int32(mat_g_size))
+  identity = linalg_ops.eye(math_ops.cast(mat_g_size, dtypes.int32))
 
   def mat_power(mat_m, p):
     """Computes mat_m^p, for p a positive integer.
diff --git a/tensorflow/contrib/opt/python/training/shampoo.py b/tensorflow/contrib/opt/python/training/shampoo.py
index e542f46892a3cea60b758a1a95ce2f20d5f29a67..efbafac662b78fe2cde9e50e6778bb787af29e31 100644
--- a/tensorflow/contrib/opt/python/training/shampoo.py
+++ b/tensorflow/contrib/opt/python/training/shampoo.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.contrib.opt.python.training import matrix_functions
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -120,7 +121,7 @@ class ShampooOptimizer(optimizer.Optimizer):
 
     super(ShampooOptimizer, self).__init__(use_locking, name)
 
-    self._global_step = math_ops.to_float(global_step)
+    self._global_step = math_ops.cast(global_step, dtypes.float32)
     self._max_matrix_size = max_matrix_size
     self._gbar_decay = gbar_decay
     self._gbar_weight = gbar_weight
@@ -246,7 +247,8 @@ class ShampooOptimizer(optimizer.Optimizer):
     if mat_g_size == 1:
       mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
     else:
-      damping = self._epsilon * linalg_ops.eye(math_ops.to_int32(mat_g_size))
+      damping = self._epsilon * linalg_ops.eye(
+          math_ops.cast(mat_g_size, dtypes.int32))
       diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True)
       mat_h = math_ops.matmul(
           mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha),
diff --git a/tensorflow/contrib/opt/python/training/sign_decay.py b/tensorflow/contrib/opt/python/training/sign_decay.py
index e8870c072110da145c0bb78e20c3584083438ea0..99cd0f6e60e1d2fda14060c571c9aab8c7d32da2 100644
--- a/tensorflow/contrib/opt/python/training/sign_decay.py
+++ b/tensorflow/contrib/opt/python/training/sign_decay.py
@@ -23,7 +23,9 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
@@ -51,10 +53,10 @@ def get_linear_decay_fn(decay_steps):
     if global_step is None:
       raise ValueError("global_step is required for linear_decay.")
     global_step = math_ops.minimum(global_step, decay_steps)
-    remaining_steps = math_ops.to_int32(decay_steps) - math_ops.to_int32(
-        global_step)
-    decayed = math_ops.to_float(remaining_steps) / math_ops.to_float(
-        decay_steps)
+    remaining_steps = math_ops.cast(
+        decay_steps, dtypes.int32) - math_ops.cast(global_step, dtypes.int32)
+    decayed = (math_ops.cast(remaining_steps, dtypes.float32) /
+               math_ops.cast(decay_steps, dtypes.float32))
     return math_ops.maximum(0.0, decayed)
   # pylint:enable=missing-docstring
   return linear_decay_fn
@@ -92,8 +94,8 @@ def get_cosine_decay_fn(decay_steps, num_periods=0.5, zero_after=None):
     if global_step is None:
       raise ValueError("global_step is required for cosine_decay.")
     global_step = math_ops.minimum(global_step, decay_steps)
-    completed_fraction = math_ops.to_float(global_step) / math_ops.to_float(
-        decay_steps)
+    completed_fraction = (math_ops.cast(global_step, dtypes.float32) /
+                          math_ops.cast(decay_steps, dtypes.float32))
     fraction = 2.0 * num_periods * completed_fraction
     decayed = 0.5 * (
         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
@@ -143,14 +145,14 @@ def get_restart_decay_fn(decay_steps, num_periods=1, zero_after=None):
     if global_step is None:
       raise ValueError("global_step is required for cosine_decay.")
     global_step = math_ops.minimum(global_step, decay_steps)
-    num = math_ops.mod(num_periods * math_ops.to_float(global_step),
+    num = math_ops.mod(num_periods * math_ops.cast(global_step, dtypes.float32),
                        decay_steps)
-    fraction = num / math_ops.to_float(decay_steps)
+    fraction = num / math_ops.cast(decay_steps, dtypes.float32)
     decayed = 0.5 * (
         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
     if zero_after is not None:
-      tmp = math_ops.to_float(
-          num_periods * global_step) / math_ops.to_float(decay_steps)
+      tmp = (math_ops.cast(num_periods * global_step, dtypes.float32) /
+             math_ops.cast(decay_steps, dtypes.float32))
       decayed = array_ops.where(
           math_ops.greater_equal(tmp, zero_after), 0.0, decayed)
     return decayed
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 0243927ce44aec626973744507e75b20a42253e9..b469ebff25fafc5d97a3e457732954d238cbb5af 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -44,14 +44,15 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
+    super(NonLayerTrackable, self).__init__()
     self.a_variable = util.add_variable(
         self, name="a_variable", shape=[])
 
@@ -64,8 +65,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -100,7 +101,7 @@ class CheckpointingTests(test.TestCase):
     other_model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     if context.executing_eagerly():
       optimizer.minimize(
@@ -116,11 +117,10 @@ class CheckpointingTests(test.TestCase):
           other_model(input_value),
           global_step=optimizer_step)
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        util._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -208,7 +208,7 @@ class CheckpointingTests(test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     if context.executing_eagerly():
@@ -217,24 +217,24 @@ class CheckpointingTests(test.TestCase):
     else:
       train_op = optimizer.minimize(model(input_value))
       # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      root_trackable.save_counter  # pylint: disable=pointless-statement
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
@@ -302,7 +302,7 @@ class CheckpointingTests(test.TestCase):
         with ops.Graph().as_default():
           model = MyModel()
           optimizer = adam.AdamOptimizer(0.001)
-          root = util.Checkpoint(
+          root = util.CheckpointV1(
               optimizer=optimizer, model=model,
               global_step=training_util.get_or_create_global_step())
           input_value = constant_op.constant([[3.]])
@@ -440,7 +440,7 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
+    root = util.Checkpoint()
     root.var = util.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
@@ -455,21 +455,17 @@ class CheckpointingTests(test.TestCase):
           util.Checkpoint(root=root, optimizer=optimizer)))
       self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
                                    14.))
-    slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = util.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = util.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = util.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
     new_root.var = util.add_variable(
@@ -508,15 +504,14 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testManyRestoresGraph(self):
@@ -526,16 +521,15 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         before_ops = graph.get_operations()
-        saver.restore(save_path)
+        obj.restore(save_path)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testMultipleGraphsNonSlotVariables(self):
@@ -548,11 +542,11 @@ class CheckpointingTests(test.TestCase):
       first_session = session_lib.Session(graph=first_graph)
       with first_graph.as_default(), first_session.as_default():
         first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = util.Checkpoint(
+        first_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=first_variable)
         train_op = optimizer.minimize(first_variable.read_value)
         self.evaluate(util.gather_initializers(
-            first_root_checkpointable))
+            first_root_trackable))
         self.evaluate(train_op)
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
@@ -564,23 +558,23 @@ class CheckpointingTests(test.TestCase):
       second_graph = ops.Graph()
       with second_graph.as_default(), session_lib.Session(graph=second_graph):
         second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = util.Checkpoint(
+        second_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=second_variable)
         train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
+        second_root_trackable.restore(None).initialize_or_restore()
         self.evaluate(train_op)
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.evaluate(beta_1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        save_path = second_root_trackable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(6., self.evaluate(beta_1_power))
-        status = second_root_checkpointable.restore(save_path)
+        status = second_root_trackable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
@@ -600,7 +594,7 @@ class CheckpointingTests(test.TestCase):
 class TemplateTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -647,13 +641,13 @@ class CheckpointCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
     self.evaluate(util.gather_initializers(
-        root_checkpointable))
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -662,24 +656,24 @@ class CheckpointCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta_1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta_1_power))
 
   def _write_name_based_checkpoint(self):
@@ -704,14 +698,13 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = util.CheckpointableSaver(root)
+      object_saver = util.TrackableSaver(graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
         self._check_sentinels(root)
       if context.executing_eagerly():
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_consumed()
+        status.assert_consumed()
       else:
         # When graph building, we haven't read any keys, so we don't know
         # whether the restore will be complete.
@@ -733,10 +726,9 @@ class CheckpointCompatibilityTests(test.TestCase):
     with context.graph_mode():
       save_graph = ops.Graph()
       with save_graph.as_default(), self.test_session(
-          graph=save_graph) as session:
+          graph=save_graph):
         root = self._initialized_model()
-        save_path = root.save(
-            session=session, file_prefix=checkpoint_prefix)
+        save_path = root.save(file_prefix=checkpoint_prefix)
     with context.eager_mode():
       root = self._initialized_model()
       self._set_sentinels(root)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 1323ed014c9e51e273491694fa44a8e36cc723d0..436ece79a79810d4688e259523a4f86a1ca7f5a5 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -39,7 +38,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
@@ -224,7 +223,7 @@ class _OptimizerV2State(object):
       }
     self._slots = {}
     self._non_slot_dict = {}
-    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # Extra state to help Optimizers implement Trackable. Holds information
     # about variables which will be restored as soon as they're created.
     self._deferred_dependencies = {}  # Non-slot variables
     self._deferred_slot_restorations = {}  # Slot variables
@@ -367,8 +366,8 @@ class _OptimizerV2State(object):
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
       optional_op_name: Name to use when scoping the Variable that needs to be
@@ -386,7 +385,7 @@ class _OptimizerV2State(object):
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.create_slot(
           var=variable,
@@ -661,7 +660,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                name=None,
                grad_loss=None,
                stop_gradients=None,
-               scale_loss_by_num_replicas=None):
+               scale_loss_by_num_replicas=False):
     """Add operations to minimize `loss` by updating `var_list`.
 
     This method simply combines calls `compute_gradients()` and
@@ -685,8 +684,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -732,7 +730,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                         aggregation_method=None,
                         grad_loss=None,
                         stop_gradients=None,
-                        scale_loss_by_num_replicas=None):
+                        scale_loss_by_num_replicas=False):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -756,8 +754,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -781,9 +778,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss for number of replicas (callable-loss case). In this case,
-        # we have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # Scale loss for number of replicas (callable-loss case).
         loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas)
 
       if var_list is None:
@@ -839,9 +834,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
   @staticmethod
   def _scale_loss(loss_value, scale_loss_by_num_replicas):
     """Scale loss for the number of replicas."""
-    if scale_loss_by_num_replicas is None:
-      scale_loss_by_num_replicas = (
-          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
       num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
@@ -973,7 +965,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
         # `update_op`.
         # TODO(josh11b): Make different state objects for each device to
         # avoid needing to set the device_policy.
-        device_policy = context.context().device_policy(
+        device_policy = context.device_policy(
             context.DEVICE_PLACEMENT_SILENT)
         with ops.name_scope("update_" + scope_name), device_policy:
           return processor.update_op(self, g, state)
@@ -989,7 +981,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       def finish():
         # TODO(josh11b): Make different state objects for each device to
         # avoid needing to set the device_policy.
-        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+        with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
           return self._finish(state)
 
       update_ops = control_flow_ops.group(update_ops)
@@ -1267,10 +1259,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
     return self._per_graph_state.get(var._graph_key, None)
 
   # --------------
-  # Overridden methods from Checkpointable.
+  # Overridden methods from Trackable.
   # --------------
 
-  def _track_checkpointable(self, *args, **kwargs):
+  def _track_trackable(self, *args, **kwargs):
     """Optimizers may not track dependencies. Raises an error."""
     raise NotImplementedError(
         "Optimizers may not have dependencies. File a feature request if this "
@@ -1278,7 +1270,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     state = self._get_per_graph_state()
     if state is not None:
@@ -1287,14 +1279,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
           # Avoid comparing variables
           key=lambda item: item[0]):
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     # Note: ignores super(); Optimizers may not have any dependencies outside of
     # state objects.
     return current_graph_non_slot_variables
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     state = self._get_per_graph_state()
     if state is None:
       return None
@@ -1303,10 +1295,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _deferred_dependencies(self):
-    """Lets Checkpointable know where non-slot variables are created.
+    """Lets Trackable know where non-slot variables are created.
 
     If necessary, creates a new state object for the current default graph.
-    Checkpointable will then add entries to that state's deferred dependency
+    Trackable will then add entries to that state's deferred dependency
     dictionary. The state object will check that dictionary when creating
     non-slot variables, restoring their value if an entry is found.
 
@@ -1319,14 +1311,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _create_or_restore_slot_variable(self, slot_variable_position, slot_name,
                                        variable):
-    """Checkpointable: Restore a slot variable's value, possibly creating it.
+    """Trackable: Restore a slot variable's value, possibly creating it.
 
     Called when a variable which has an associated slot variable is created or
     restored.
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
index dd7f2f44055a2e48e8a48d01c1da3a8e7513255d..2fc0b5ea4de2332ff3bf32f9a12a15eee566d5c4 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -71,7 +71,7 @@ class OptimizerTest(test.TestCase):
         opt_op = sgd_op.minimize(
             cost,
             global_step, [var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod.
+            aggregation_method=gradients_util.AggregationMethod.
             EXPERIMENTAL_ACCUMULATE_N)
 
         variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index 202c1e9afc0623a5837aa82480f1b406834007ee..ab47b74c65a6ddce3ace4b56d12ecd416cc74e54 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -25,10 +25,12 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.optimizer_v2 import rmsprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -448,5 +450,56 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
           ]), var1.eval())
 
 
+class SlotColocationTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters([True, False])
+  @test_util.run_in_graph_and_eager_modes
+  def testRunMinimizeOnGPUForCPUVariables(self, use_resource):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+
+    with ops.device("/device:CPU:0"):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtypes.float32)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtypes.float32)
+        global_step = resource_variable_ops.ResourceVariable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+
+    def loss():
+      return 5 * var0 + 3 * var1
+
+    opt = rmsprop.RMSPropOptimizer(
+        learning_rate=1.0, decay=0.9, momentum=0.5, epsilon=1.0)
+
+    # Fetch params to validate initial values
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 1 step through optimizer on GPU.
+    # Slot variables are created the first time optimizer is used on some
+    # variable. This tests that slot variables will be colocated with the base
+    # variable.
+    with ops.device("/device:GPU:0"):
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      opt_op = opt.minimize(loss, global_step, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+
+    # Validate updated params, All variables should have decreased.
+    self.assertTrue(all(v < 0.0 for v in self.evaluate(var0)),
+                    msg="updated variables: %s" % self.evaluate(var0))
+    self.assertTrue(all(v < 2.0 for v in self.evaluate(var1)),
+                    msg="updated variables: %s" % self.evaluate(var1))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
index 17b69c7b35dce130c45ab0aadb28be330b4bfb88..13749837e0cd2ed9dec3748ad5209088c1b3fdd9 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
@@ -84,7 +84,10 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
       values = field_dict[field.name]
       self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
 
-      fd = field.value.DESCRIPTOR.fields_by_name[field.name]
+      if 'ext_value' in field.name:
+        fd = test_example_pb2.PrimitiveValue()
+      else:
+        fd = field.value.DESCRIPTOR.fields_by_name[field.name]
 
       # Values has the same shape as the input plus an extra
       # dimension for repeats.
@@ -92,13 +95,16 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
 
       # Nested messages are represented as TF strings, requiring
       # some special handling.
-      if field.name == 'message_value':
+      if field.name == 'message_value' or 'ext_value' in field.name:
         vs = []
         for buf in values.flat:
           msg = test_example_pb2.PrimitiveValue()
           msg.ParseFromString(buf)
           vs.append(msg)
-        evs = getattr(field.value, field.name)
+        if 'ext_value' in field.name:
+          evs = field.value.Extensions[test_example_pb2.ext_value]
+        else:
+          evs = getattr(field.value, field.name)
         if len(vs) != len(evs):
           self.fail('Field %s decoded %d outputs, expected %d' %
                     (fd.name, len(vs), len(evs)))
@@ -223,7 +229,8 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         sanitize=False,
         force_disordered=True)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testPacked(self, case):
     # Now try with the packed serialization.
     #
@@ -235,8 +242,7 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         # Note: float_format='.17g' is necessary to ensure preservation of
         # doubles and floats in text format.
         text_format.Parse(
-            text_format.MessageToString(
-                value, float_format='.17g'),
+            text_format.MessageToString(value, float_format='.17g'),
             test_example_pb2.PackedTestValue()).SerializeToString()
         for value in case.values
     ]
@@ -290,14 +296,13 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     field_names = ['sizes']
     field_types = [dtypes.int32]
 
-    with self.cached_session() as sess:
-      ctensor, vtensor = self._decode_module.decode_proto(
-          batch,
-          message_type=msg_type,
-          field_names=field_names,
-          output_types=field_types,
-          sanitize=sanitize)
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   'Unable to parse binary protobuf'
-                                   '|Failed to consume entire buffer'):
-        _ = sess.run([ctensor] + vtensor)
+    with self.assertRaisesRegexp(
+        errors.DataLossError, 'Unable to parse binary protobuf'
+        '|Failed to consume entire buffer'):
+      self.evaluate(
+          self._decode_module.decode_proto(
+              batch,
+              message_type=msg_type,
+              field_names=field_names,
+              output_types=field_types,
+              sanitize=sanitize))
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
index 01b3ccc7fd3918c4ff910281289e31177e5a8097..fac2453527dde46fe9ee065f7112f02a285823ea 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
@@ -15,9 +15,6 @@
 # =============================================================================
 """Table-driven test for encode_proto op.
 
-This test is run once with each of the *.TestCase.pbtxt files
-in the test directory.
-
 It tests that encode_proto is a lossless inverse of decode_proto
 (for the specified fields).
 """
@@ -33,7 +30,9 @@ from google.protobuf import text_format
 
 from tensorflow.contrib.proto.python.kernel_tests import proto_op_test_base as test_base
 from tensorflow.contrib.proto.python.kernel_tests import test_example_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.ops import array_ops
 
 
@@ -53,56 +52,86 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     self._decode_module = decode_module
     self._encode_module = encode_module
 
+  def testBadSizesShape(self):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError,
+                        r'Invalid shape for field double_value.')
+    else:
+      expected_error = (ValueError,
+                        r'Shape must be at least rank 2 but is rank 0')
+    with self.assertRaisesRegexp(*expected_error):
+      self.evaluate(
+          self._encode_module.encode_proto(
+              sizes=1,
+              values=[np.double(1.0)],
+              message_type='tensorflow.contrib.proto.TestValue',
+              field_names=['double_value']))
+
   def testBadInputs(self):
     # Invalid field name
-    with self.cached_session():
-      with self.assertRaisesOpError('Unknown field: non_existent_field'):
-        self._encode_module.encode_proto(
-            sizes=[[1]],
-            values=[np.array([[0.0]], dtype=np.int32)],
-            message_type='tensorflow.contrib.proto.TestValue',
-            field_names=['non_existent_field']).eval()
+    with self.assertRaisesOpError('Unknown field: non_existent_field'):
+      self.evaluate(
+          self._encode_module.encode_proto(
+              sizes=[[1]],
+              values=[np.array([[0.0]], dtype=np.int32)],
+              message_type='tensorflow.contrib.proto.TestValue',
+              field_names=['non_existent_field']))
 
     # Incorrect types.
-    with self.cached_session():
-      with self.assertRaisesOpError(
-          'Incompatible type for field double_value.'):
-        self._encode_module.encode_proto(
-            sizes=[[1]],
-            values=[np.array([[0.0]], dtype=np.int32)],
-            message_type='tensorflow.contrib.proto.TestValue',
-            field_names=['double_value']).eval()
+    with self.assertRaisesOpError('Incompatible type for field double_value.'):
+      self.evaluate(
+          self._encode_module.encode_proto(
+              sizes=[[1]],
+              values=[np.array([[0.0]], dtype=np.int32)],
+              message_type='tensorflow.contrib.proto.TestValue',
+              field_names=['double_value']))
 
     # Incorrect shapes of sizes.
-    with self.cached_session():
+    for sizes_value in 1, np.array([[[0, 0]]]):
       with self.assertRaisesOpError(
           r'sizes should be batch_size \+ \[len\(field_names\)\]'):
-        sizes = array_ops.placeholder(dtypes.int32)
-        values = array_ops.placeholder(dtypes.float64)
-        self._encode_module.encode_proto(
-            sizes=sizes,
-            values=[values],
-            message_type='tensorflow.contrib.proto.TestValue',
-            field_names=['double_value']).eval(feed_dict={
-                sizes: [[[0, 0]]],
-                values: [[0.0]]
-            })
+        if context.executing_eagerly():
+          self.evaluate(
+              self._encode_module.encode_proto(
+                  sizes=sizes_value,
+                  values=[np.array([[0.0]])],
+                  message_type='tensorflow.contrib.proto.TestValue',
+                  field_names=['double_value']))
+        else:
+          with self.cached_session():
+            sizes = array_ops.placeholder(dtypes.int32)
+            values = array_ops.placeholder(dtypes.float64)
+            self._encode_module.encode_proto(
+                sizes=sizes,
+                values=[values],
+                message_type='tensorflow.contrib.proto.TestValue',
+                field_names=['double_value']).eval(feed_dict={
+                    sizes: sizes_value,
+                    values: [[0.0]]
+                })
 
     # Inconsistent shapes of values.
-    with self.cached_session():
-      with self.assertRaisesOpError(
-          'Values must match up to the last dimension'):
-        sizes = array_ops.placeholder(dtypes.int32)
-        values1 = array_ops.placeholder(dtypes.float64)
-        values2 = array_ops.placeholder(dtypes.int32)
-        (self._encode_module.encode_proto(
-            sizes=[[1, 1]],
-            values=[values1, values2],
-            message_type='tensorflow.contrib.proto.TestValue',
-            field_names=['double_value', 'int32_value']).eval(feed_dict={
-                values1: [[0.0]],
-                values2: [[0], [0]]
-            }))
+    with self.assertRaisesOpError('Values must match up to the last dimension'):
+      if context.executing_eagerly():
+        self.evaluate(
+            self._encode_module.encode_proto(
+                sizes=[[1, 1]],
+                values=[np.array([[0.0]]),
+                        np.array([[0], [0]])],
+                message_type='tensorflow.contrib.proto.TestValue',
+                field_names=['double_value', 'int32_value']))
+      else:
+        with self.cached_session():
+          values1 = array_ops.placeholder(dtypes.float64)
+          values2 = array_ops.placeholder(dtypes.int32)
+          (self._encode_module.encode_proto(
+              sizes=[[1, 1]],
+              values=[values1, values2],
+              message_type='tensorflow.contrib.proto.TestValue',
+              field_names=['double_value', 'int32_value']).eval(feed_dict={
+                  values1: [[0.0]],
+                  values2: [[0], [0]]
+              }))
 
   def _testRoundtrip(self, in_bufs, message_type, fields):
 
@@ -145,7 +174,8 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         # loss of packing in the encoding).
         self.assertEqual(in_buf, out_buf)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testRoundtrip(self, case):
     in_bufs = [value.SerializeToString() for value in case.values]
 
@@ -154,7 +184,8 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     return self._testRoundtrip(
         in_bufs, 'tensorflow.contrib.proto.TestValue', case.fields)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testRoundtripPacked(self, case):
     # Now try with the packed serialization.
     # We test the packed representations by loading the same test cases using
diff --git a/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
index 2950c7dfdc59a11ba7d2c07d8406bd4af26b5bd9..1a636486a1765ad9544b5cb5e52961cc47f92950 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
@@ -38,17 +38,18 @@ class ProtoOpTestBase(test.TestCase):
       ct.cdll.LoadLibrary(lib)
 
   @staticmethod
-  def named_parameters():
-    return (
-        ("defaults", ProtoOpTestBase.defaults_test_case()),
-        ("minmax", ProtoOpTestBase.minmax_test_case()),
-        ("nested", ProtoOpTestBase.nested_test_case()),
-        ("optional", ProtoOpTestBase.optional_test_case()),
-        ("promote", ProtoOpTestBase.promote_test_case()),
-        ("ragged", ProtoOpTestBase.ragged_test_case()),
-        ("shaped_batch", ProtoOpTestBase.shaped_batch_test_case()),
-        ("simple", ProtoOpTestBase.simple_test_case()),
-    )
+  def named_parameters(extension=True):
+    parameters = [("defaults", ProtoOpTestBase.defaults_test_case()),
+                  ("minmax", ProtoOpTestBase.minmax_test_case()),
+                  ("nested", ProtoOpTestBase.nested_test_case()),
+                  ("optional", ProtoOpTestBase.optional_test_case()),
+                  ("promote", ProtoOpTestBase.promote_test_case()),
+                  ("ragged", ProtoOpTestBase.ragged_test_case()),
+                  ("shaped_batch", ProtoOpTestBase.shaped_batch_test_case()),
+                  ("simple", ProtoOpTestBase.simple_test_case())]
+    if extension:
+      parameters.append(("extension", ProtoOpTestBase.extension_test_case()))
+    return parameters
 
   @staticmethod
   def defaults_test_case():
@@ -399,6 +400,21 @@ class ProtoOpTestBase(test.TestCase):
     field.value.bool_value.append(True)
     return test_case
 
+  @staticmethod
+  def extension_test_case():
+    test_case = test_example_pb2.TestCase()
+    value = test_case.values.add()
+    message_value = value.Extensions[test_example_pb2.ext_value].add()
+    message_value.double_value = 23.5
+    test_case.shapes.append(1)
+    test_case.sizes.append(1)
+    field = test_case.fields.add()
+    field.name = test_example_pb2.ext_value.full_name
+    field.dtype = types_pb2.DT_STRING
+    message_value = field.value.Extensions[test_example_pb2.ext_value].add()
+    message_value.double_value = 23.5
+    return test_case
+
   @staticmethod
   def simple_test_case():
     test_case = test_example_pb2.TestCase()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
index 674d881220a1113631def47c5111e3ef401b99f3..b1ce66de4feb9c6666ca9ccf39403b4e12840fcf 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -61,6 +61,8 @@ message TestValue {
   optional sfixed64 sfixed64_value_with_default = 32 [default = 11];
   optional sint32 sint32_value_with_default = 33 [default = 12];
   optional sint64 sint64_value_with_default = 34 [default = 13];
+
+  extensions 100 to 199;
 }
 
 // A PackedTestValue looks exactly the same as a TestValue in the text format,
@@ -68,7 +70,7 @@ message TestValue {
 // by loading the same test cases using this definition instead of TestValue.
 //
 // NOTE: This definition must be kept in sync with TestValue in every way except
-// the packed=true declaration.
+// the packed=true declaration and the lack of extensions.
 message PackedTestValue {
   repeated double double_value = 1 [packed = true];
   repeated float float_value = 2 [packed = true];
@@ -132,6 +134,10 @@ message ExtraFields {
   optional bool bool_value = 1777;
 }
 
+extend TestValue {
+  repeated PrimitiveValue ext_value = 100;
+}
+
 // The messages below are for yet-to-be created tests.
 
 message EnumValue {
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index b35c4fde1a2c704880e023a0c3ac1e0766493514..b67e68ea96a15f94e62050c92405eec4fe4be70f 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -202,8 +202,9 @@ py_test(
 
 py_test(
     name = "quantize_parameterized_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/quantize_parameterized_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     # TODO(b/118839526): Re-enable msan test.
     tags = [
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 5b8da92491fb747c5a37dcfe03bcb21b5b903560..b335e1af69b7b2e6020f8e745c43bb1bdc95a62d 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -8,9 +8,9 @@ for both training and inference. There are two aspects to this:
 
 For efficient inference, TensorFlow combines batch normalization with the preceding
 convolutional and fully-connected layers prior to quantization by
-[folding batch norm layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/python/fold_batch_norms.py){:.external}. 
+[folding batch norm layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/python/fold_batch_norms.py){:.external}.
 
-The quantization error is modeled using [fake quantization](../api_guides/python/array_ops.md#Fake_quantization)
+The quantization error is modeled using [fake quantization](../../api_guides/python/array_ops.md#Fake_quantization)
 nodes to simulate the effect of quantization in the forward and backward passes. The
 forward-pass models quantization, while the backward-pass models quantization as a
 straight-through estimator. Both the forward- and backward-pass simulate the quantization
@@ -105,7 +105,7 @@ toco \
   --std_value=127.5 --mean_value=127.5
 ```
 
-See the documentation for `tf.contrib.quantize` and [TensorFlow Lite](../lite/).
+See the documentation for `tf.contrib.quantize` and [TensorFlow Lite](../../lite/).
 
 
 ## Quantized accuracy results
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index e0c6da00d86fe4c5f881bcab7b444182da092b8f..a70f748fad60c6467946225ad5035caaf89c2aaf 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -454,7 +454,7 @@ def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor,
         strides=layer_op.get_attr('strides'),
         padding=layer_op.get_attr('padding'),
         use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
-        data_format=layer_op.get_attr('data_format'),
+        data_format=layer_op.get_attr('data_format').decode(),
         name=new_layer_name)
   elif layer_op.type == 'MatMul':
     return math_ops.matmul(
@@ -867,7 +867,7 @@ class _OpCloner(object):
         strides=op.get_attr('strides'),
         padding=op.get_attr('padding'),
         use_cudnn_on_gpu=op.get_attr('use_cudnn_on_gpu'),
-        data_format=op.get_attr('data_format'),
+        data_format=op.get_attr('data_format').decode(),
         name=new_name).op
 
   def _CloneDepthwiseConv2d(self, op, inputs, new_name):
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 8619708cdaecd78bcc7de0e8e0cbf2baa11bf6a2..39082cacf9770619cf5fb529ac9a0aad6e955c6d 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -224,8 +224,8 @@ def MovingAvgQuantize(inputs,
       None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope:
     scope.set_partitioner(None)
     input_shape = inputs.get_shape()
-    input_dim = len(input_shape)
     if per_channel:
+      input_dim = len(input_shape)
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
                                       ' scope: %s' % (input_shape, name_prefix))
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
index 36d2af94e059cdc75b758bbf607d26c4e1ee73e9..c636c90d23a0f5a6de9d14085c824283cb41f6ca 100644
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -63,6 +63,12 @@ class QuantOpsTest(googletest.TestCase):
     self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
     self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
 
+  def testMovingAvgQuantizeTrainingAssignNoShape(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.MovingAvgQuantize, [[-1, 1], [0, 0]], shape=None)
+    self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
+
   def testMovingAvgSymmetricQuantizeTrainingAssign(self):
     min_value, max_value = self._GetMinMaxValues(
         quant_ops.MovingAvgQuantize, [[-1, 0.5], [0, 0]], symmetric=True)
@@ -109,10 +115,10 @@ class QuantOpsTest(googletest.TestCase):
             is_training=True,
             vars_collection=_MIN_MAX_VARS)
 
-  def _GetMinMaxValues(self, quantize_fn, input_values, **kwds):
+  def _GetMinMaxValues(self, quantize_fn, input_values, shape=(2), **kwds):
     g = ops.Graph()
     with session.Session(graph=g) as sess:
-      x = array_ops.placeholder(dtypes.float32, shape=[2])
+      x = array_ops.placeholder(dtypes.float32, shape=shape)
       y = quantize_fn(
           x,
           init_min=0.0,
diff --git a/tensorflow/contrib/recurrent/python/ops/recurrent.py b/tensorflow/contrib/recurrent/python/ops/recurrent.py
index f51de755d81b74e39a26551a282a1f2a47557ebc..b8540258e63016bce6f5d7c0cd02d9ceef1255ed 100644
--- a/tensorflow/contrib/recurrent/python/ops/recurrent.py
+++ b/tensorflow/contrib/recurrent/python/ops/recurrent.py
@@ -100,7 +100,7 @@ def _Update(struct_acc, struct_x, t):
   to_skip_update = set()
   acc_lst = nest.flatten(struct_acc)
   x_lst = nest.flatten(struct_x)
-  t = math_ops.to_int32([t])  # tf.to_int32 casts on-device tensors.
+  t = math_ops.cast([t], dtypes.int32)  # tf.to_int32 casts on-device tensors.
   lst = []
   for acc, x in zip(acc_lst, x_lst):
     if acc in to_skip_update:
@@ -429,7 +429,8 @@ class _Recurrent(object):
       acc_extras = _EmptyAcc(slen_dim, extras)
 
       t = slen_dim - max_input_length if self._aligned_end else 0
-      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      dev_t = math_ops.cast(t, dtypes.int32) if use_tpu else math_ops.cast(
+          t, dtypes.int64)
       run = functional_ops.For(
           start=t,
           limit=slen_dim if self._aligned_end else max_input_length,
@@ -568,7 +569,8 @@ class _Recurrent(object):
       # Loop backwards. Note the loop's limit is open-ended, so goes through
       # t=0.
       t = slen_dim - 1 if self._aligned_end else max_input_length - 1
-      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      dev_t = math_ops.cast(t, dtypes.int32) if use_tpu else math_ops.cast(
+          t, dtypes.int64)
       limit = slen_dim - max_input_length - 1 if self._aligned_end else -1
       run = functional_ops.For(
           start=t,
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 204b83f7f5f118f418815edb6c482b1c06673845..13fbd974e9ce6a680a31507f7f49df17d121535f 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -77,10 +77,10 @@ namespace functor {
           sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
           0, 0);                                                               \
                                                                                \
-      ReduceSliceDeviceKernel##reduceop<T, Index>                              \
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(    \
-              config, indices_width, bound, beginning<T>(), indices.data(),    \
-              data.data(), output.data());                                     \
+      TF_CHECK_OK(CudaLaunchKernel(                                            \
+          ReduceSliceDeviceKernel##reduceop<T, Index>, config.block_count,     \
+          config.thread_per_block, 0, d.stream(), config, indices_width,       \
+          bound, beginning<T>(), indices.data(), data.data(), output.data())); \
     }                                                                          \
   };
 
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
index 2054367f0d1461c8868e3332d82322a8a3dd38af..7e79785d2867de586f0730373d4864602ef770ae 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
@@ -50,13 +50,13 @@ def remote_fused_graph_execute(inputs,
   if default_graph_input_tensor_type_shapes:
     for type_shape in default_graph_input_tensor_type_shapes:
       type_shape_proto = info_proto.default_graph_input_tensor_shape.add()
-      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      type_shape_proto.dtype = dtypes.as_dtype(type_shape[0]).as_datatype_enum
       for dim in type_shape[1]:
         type_shape_proto.shape.dim.add().size = dim
   if default_graph_output_tensor_type_shapes:
     for type_shape in default_graph_output_tensor_type_shapes:
       type_shape_proto = info_proto.default_graph_output_tensor_shape.add()
-      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      type_shape_proto.dtype = dtypes.as_dtype(type_shape[0]).as_datatype_enum
       for dim in type_shape[1]:
         type_shape_proto.shape.dim.add().size = dim
 
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 3c07051f685c74b6e45fb782c80871f38dffbbf4..3b2ee098b3e24287298273a04f80e41f6d9dcd86 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -119,10 +119,10 @@ struct Resampler2DFunctor<GPUDevice, T> {
         batch_size * num_sampling_points * data_channels;
     ::tensorflow::CudaLaunchConfig config =
         ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
-    Resampler2DKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            data, warp, output, batch_size, data_height, data_width,
-            data_channels, num_sampling_points);
+    TF_CHECK_OK(CudaLaunchKernel(
+        Resampler2DKernel<T>, config.block_count, config.thread_per_block, 0,
+        d.stream(), data, warp, output, batch_size, data_height, data_width,
+        data_channels, num_sampling_points));
   }
 };
 
@@ -254,22 +254,23 @@ struct ResamplerGrad2DFunctor<GPUDevice, T> {
 
     ::tensorflow::CudaLaunchConfig config =
         ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
-    ::tensorflow::
-        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            grad_warp_size, grad_warp);
+    TF_CHECK_OK(::tensorflow::CudaLaunchKernel(
+        SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(),
+        grad_warp_size, grad_warp));
 
     config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
-    ::tensorflow::
-        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            grad_data_size, grad_data);
+    TF_CHECK_OK(::tensorflow::CudaLaunchKernel(
+        SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(),
+        grad_data_size, grad_data));
 
     const int resampler_output_size =
         batch_size * num_sampling_points * data_channels;
     config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
-    ResamplerGrad2DKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            data, warp, grad_output, grad_data, grad_warp, batch_size,
-            data_height, data_width, data_channels, num_sampling_points);
+    TF_CHECK_OK(CudaLaunchKernel(ResamplerGrad2DKernel<T>, config.block_count,
+                                 config.thread_per_block, 0, d.stream(), data,
+                                 warp, grad_output, grad_data, grad_warp,
+                                 batch_size, data_height, data_width,
+                                 data_channels, num_sampling_points));
   }
 };
 
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 39b688596875ab1b208d97a5d6f9a5ee811674cb..24fa740d24502a28cb42c994715d09180ee99899 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -102,26 +102,6 @@ cuda_py_tests(
     xla_enabled = True,
 )
 
-cuda_py_tests(
-    name = "core_rnn_cell_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_cell_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
 cuda_py_tests(
     name = "rnn_test",
     size = "medium",
@@ -144,32 +124,6 @@ cuda_py_tests(
     ],
 )
 
-cuda_py_tests(
-    name = "core_rnn_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-    ],
-    shard_count = 10,
-)
-
 tf_py_test(
     name = "fused_rnn_cell_test",
     size = "medium",
@@ -388,6 +342,13 @@ py_binary(
     name = "checkpoint_convert",
     srcs = ["python/tools/checkpoint_convert.py"],
     srcs_version = "PY2AND3",
+    deps = [":checkpoint_convert_lib"],
+)
+
+py_library(
+    name = "checkpoint_convert_lib",
+    srcs = ["python/tools/checkpoint_convert.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_ops",
@@ -406,7 +367,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        ":checkpoint_convert",
+        ":checkpoint_convert_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
index 15ae95f13cffa5d1469d737b23f2a83b9e5a694f..81beb2942c183e6a831b64e946fea89c050b88db 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
@@ -242,8 +242,9 @@ void LSTMBlockCellFpropWithCUDA(
   const int block_dim = 128;
   const int grid_dim =
       Eigen::divup(batch_size * (cell_size + input_size), block_dim);
-  concat_xh<<<grid_dim, block_dim, 0, cu_stream>>>(
-      xh.data(), x.data(), h_prev.data(), batch_size, cell_size, input_size);
+  TF_CHECK_OK(CudaLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
+                               xh.data(), x.data(), h_prev.data(), batch_size,
+                               cell_size, input_size));
 
   // states1 = xh * w
   typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
@@ -261,15 +262,17 @@ void LSTMBlockCellFpropWithCUDA(
                    Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
 
   if (use_peephole) {
-    lstm_gates<T, true><<<grid_dim_2d, block_dim_2d, 0, cu_stream>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        lstm_gates<T, true>, grid_dim_2d, block_dim_2d, 0, cu_stream,
         icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
         wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
-        i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size);
+        i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
   } else {
-    lstm_gates<T, false><<<grid_dim_2d, block_dim_2d, 0, cu_stream>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        lstm_gates<T, false>, grid_dim_2d, block_dim_2d, 0, cu_stream,
         icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
         wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
-        i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size);
+        i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
   }
 }
 
@@ -374,12 +377,13 @@ void LSTMBlockCellBpropWithCUDA(
   dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
                    Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
 
-  lstm_gates_bprop<<<grid_dim_2d, block_dim_2d, 0, cu_stream>>>(
+  TF_CHECK_OK(CudaLaunchKernel(
+      lstm_gates_bprop<T>, grid_dim_2d, block_dim_2d, 0, cu_stream,
       cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(),
       wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(),
       co.data(), cs_grad.data(), h_grad.data(), do_.data(), dcs.data(),
       dci.data(), df.data(), di.data(), dicfo.data(), cs_prev_grad.data(),
-      batch_size, cell_size, use_peephole);
+      batch_size, cell_size, use_peephole));
 
   if (use_peephole) {
     Eigen::array<Eigen::DenseIndex, 2> p_shape({1, cell_size});
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
deleted file mode 100644
index 7bad4a60a149011d5b8d745f45359fd25473e54e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ /dev/null
@@ -1,1210 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for RNN cells."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib import rnn as contrib_rnn
-from tensorflow.contrib.rnn.python.ops import core_rnn_cell
-from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras import layers as keras_layers
-from tensorflow.python.layers import base as base_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-
-# pylint: enable=protected-access
-Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
-
-
-class RNNCellTest(test.TestCase):
-
-  def testLinear(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0)):
-        x = array_ops.zeros([1, 2])
-        l = Linear([x], 2, False)([x])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([l], {x.name: np.array([[1., 2.]])})
-        self.assertAllClose(res[0], [[3.0, 3.0]])
-
-        # Checks prevent you from accidentally creating a shared function.
-        with self.assertRaises(ValueError):
-          l1 = Linear([x], 2, False)([x])
-
-        # But you can create a new one in a new scope and share the variables.
-        with variable_scope.variable_scope("l1") as new_scope:
-          l1 = Linear([x], 2, False)([x])
-        with variable_scope.variable_scope(new_scope, reuse=True):
-          Linear([l1], 2, False)([l1])
-        self.assertEqual(len(variables_lib.trainable_variables()), 2)
-
-  def testBasicRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testBasicRNNCellNotTrainable(self):
-    with self.cached_session() as sess:
-
-      def not_trainable_getter(getter, *args, **kwargs):
-        kwargs["trainable"] = False
-        return getter(*args, **kwargs)
-
-      with variable_scope.variable_scope(
-          "root",
-          initializer=init_ops.constant_initializer(0.5),
-          custom_getter=not_trainable_getter):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertFalse(cell.trainable_variables)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.non_trainable_variables])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testIndRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = contrib_rnn_cell.IndRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test GRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.156736, 0.156736]])
-
-  def testIndyGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.185265, 0.17704]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test IndyGRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.155127, 0.157328]])
-
-  def testSRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.509682, 0.509682]])
-
-  def testSRUCellKerasRNN(self):
-    """Tests that SRUCell works with keras RNN layer."""
-    cell = contrib_rnn_cell.SRUCell(10)
-    seq_input = ops.convert_to_tensor(
-        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
-    rnn_layer = keras_layers.RNN(cell=cell)
-    rnn_outputs_keras = rnn_layer(seq_input)
-    with self.cached_session() as sess:
-      sess.run([variables_lib.global_variables_initializer()])
-      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
-
-  def testSRUCellBiasType(self):
-    """Tests that the bias' dtype is properly set."""
-    cell = contrib_rnn_cell.SRUCell(10)
-    cell.build((2, 3, 5))
-    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
-
-    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
-    cell.build((2, 3, 5))
-    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
-
-    cell_input = ops.convert_to_tensor(
-        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
-    cell_state = ops.convert_to_tensor(
-        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
-    cell = contrib_rnn_cell.SRUCell(10)
-    cell(cell_input, [cell_state])
-    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
-
-  def testSRUCellWithDiffSize(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
-
-  def testBasicLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          m = array_ops.zeros([1, 8], dtype=dtype)
-          cell = rnn_cell_impl.MultiRNNCell(
-              [
-                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                  for _ in range(2)
-              ],
-              state_is_tuple=False)
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, out_m = cell(x, m)
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run([g, out_m], {
-              x.name: np.array([[1., 1.]]),
-              m.name: 0.1 * np.ones([1, 8])
-          })
-          self.assertEqual(len(res), 2)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # The numbers in results were not calculated, this is just a
-          # smoke test.
-          self.assertAllClose(res[0], np.array(
-              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
-          expected_mem = np.array(
-              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
-              dtype=np_dtype)
-          self.assertAllClose(res[1], expected_mem, 1e-2)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test BasicLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
-              })
-          self.assertEqual(len(res), 2)
-
-  def testBasicLSTMCellDimension0Error(self):
-    """Tests that dimension 0 in both(x and m) shape must be equal."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size - 1, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
-              })
-
-  def testBasicLSTMCellStateSizeError(self):
-    """Tests that state_size must be num_units * 2."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 3  # state_size must be num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size, state_size])
-              })
-
-  def testBasicLSTMCellStateTupleType(self):
-    with self.cached_session():
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = (array_ops.zeros([1, 2]),) * 2
-        m1 = (array_ops.zeros([1, 2]),) * 2
-        cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
-            state_is_tuple=True)
-        self.assertTrue(isinstance(cell.state_size, tuple))
-        self.assertTrue(
-            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(
-            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in regular tuples
-        _, (out_m0, out_m1) = cell(x, (m0, m1))
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in LSTMStateTuples
-        variable_scope.get_variable_scope().reuse_variables()
-        zero_state = cell.zero_state(1, dtypes.float32)
-        self.assertTrue(isinstance(zero_state, tuple))
-        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
-        _, (out_m0, out_m1) = cell(x, zero_state)
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-  def testBasicLSTMCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = array_ops.zeros([1, 4])
-        m1 = array_ops.zeros([1, 4])
-        cell = rnn_cell_impl.MultiRNNCell(
-            [
-                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                for _ in range(2)
-            ],
-            state_is_tuple=True)
-        g, (out_m0, out_m1) = cell(x, (m0, m1))
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m0, out_m1], {
-                x.name: np.array([[1., 1.]]),
-                m0.name: 0.1 * np.ones([1, 4]),
-                m1.name: 0.1 * np.ones([1, 4])
-            })
-        self.assertEqual(len(res), 3)
-        # The numbers in results were not calculated, this is just a smoke test.
-        # Note, however, these values should match the original
-        # version having state_is_tuple=False.
-        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
-        expected_mem0 = np.array(
-            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
-        expected_mem1 = np.array(
-            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
-        self.assertAllClose(res[1], expected_mem0)
-        self.assertAllClose(res[2], expected_mem1)
-
-  def testIndyLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          cell = rnn_cell_impl.MultiRNNCell(
-              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state_0, out_state_1], {
-                  x.name: np.array([[1., 1.]]),
-                  state_0[0].name: 0.1 * np.ones([1, 2]),
-                  state_0[1].name: 0.1 * np.ones([1, 2]),
-                  state_1[0].name: 0.1 * np.ones([1, 2]),
-                  state_1[1].name: 0.1 * np.ones([1, 2]),
-              })
-          self.assertEqual(len(res), 3)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # Only check the range of outputs as this is just a smoke test.
-          self.assertAllInRange(res[0], -1.0, 1.0)
-          self.assertAllInRange(res[1], -1.0, 1.0)
-          self.assertAllInRange(res[2], -1.0, 1.0)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test IndyLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-              })
-          self.assertEqual(len(res), 2)
-
-  def testLSTMCell(self):
-    with self.cached_session() as sess:
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        output, state = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [output, state], {
-                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-                m.name: 0.1 * np.ones((batch_size, state_size))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1].shape, (batch_size, state_size))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
-
-  def testLSTMCellVariables(self):
-    with self.cached_session():
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        cell(x, m)  # Execute to create variables
-      variables = variables_lib.global_variables()
-      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
-      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
-      self.assertEquals(variables[2].op.name,
-                        "root/lstm_cell/projection/kernel")
-
-  def testLSTMCellLayerNorm(self):
-    with self.cached_session() as sess:
-      num_units = 2
-      num_proj = 3
-      batch_size = 1
-      input_size = 4
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        c = array_ops.zeros([batch_size, num_units])
-        h = array_ops.zeros([batch_size, num_proj])
-        state = rnn_cell_impl.LSTMStateTuple(c, h)
-        cell = contrib_rnn_cell.LayerNormLSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            layer_norm=True,
-            norm_gain=1.0,
-            norm_shift=0.0)
-        g, out_m = cell(x, state)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m], {
-                x.name: np.ones((batch_size, input_size)),
-                c.name: 0.1 * np.ones((batch_size, num_units)),
-                h.name: 0.1 * np.ones((batch_size, num_proj))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1][0].shape, (batch_size, num_units))
-        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperCheckpointing(self):
-    for wrapper_type in [
-        rnn_cell_impl.DropoutWrapper,
-        rnn_cell_impl.ResidualWrapper,
-        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
-      cell = rnn_cell_impl.BasicRNNCell(1)
-      wrapper = wrapper_type(cell)
-      wrapper(array_ops.ones([1, 1]),
-              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
-      self.evaluate([v.initializer for v in cell.variables])
-      checkpoint = checkpointable_utils.Checkpoint(wrapper=wrapper)
-      prefix = os.path.join(self.get_temp_dir(), "ckpt")
-      self.evaluate(cell._bias.assign([40.]))
-      save_path = checkpoint.save(prefix)
-      self.evaluate(cell._bias.assign([0.]))
-      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
-      self.assertAllEqual([40.], self.evaluate(cell._bias))
-
-  def testOutputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.OutputProjectionWrapper(rnn_cell_impl.GRUCell(3), 2)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.231907, 0.231907]])
-
-  def testInputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.InputProjectionWrapper(
-            rnn_cell_impl.GRUCell(3), num_proj=3)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
-
-  def testResidualWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 3])
-        base_cell = rnn_cell_impl.GRUCell(3)
-        g, m_new = base_cell(x, m)
-        variable_scope.get_variable_scope().reuse_variables()
-        wrapper_object = rnn_cell_impl.ResidualWrapper(base_cell)
-        (name, dep), = wrapper_object._checkpoint_dependencies
-        wrapper_object.get_config()  # Should not throw an error
-        self.assertIs(dep, base_cell)
-        self.assertEqual("cell", name)
-
-        g_res, m_new_res = wrapper_object(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, g_res, m_new, m_new_res], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.1, 0.1]])
-        })
-        # Residual connections
-        self.assertAllClose(res[1], res[0] + [1., 1., 1.])
-        # States are left untouched
-        self.assertAllClose(res[2], res[3])
-
-  def testResidualWrapperWithSlice(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 5])
-        m = array_ops.zeros([1, 3])
-        base_cell = rnn_cell_impl.GRUCell(3)
-        g, m_new = base_cell(x, m)
-        variable_scope.get_variable_scope().reuse_variables()
-
-        def residual_with_slice_fn(inp, out):
-          inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
-          return inp_sliced + out
-
-        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
-            base_cell, residual_with_slice_fn)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
-            [g, g_res, m_new, m_new_res], {
-                x: np.array([[1., 1., 1., 1., 1.]]),
-                m: np.array([[0.1, 0.1, 0.1]])
-            })
-        # Residual connections
-        self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
-        # States are left untouched
-        self.assertAllClose(res_m_new, res_m_new_res)
-
-  def testDeviceWrapper(self):
-    with variable_scope.variable_scope(
-        "root", initializer=init_ops.constant_initializer(0.5)):
-      x = array_ops.zeros([1, 3])
-      m = array_ops.zeros([1, 3])
-      wrapped = rnn_cell_impl.GRUCell(3)
-      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
-      (name, dep), = cell._checkpoint_dependencies
-      cell.get_config()  # Should not throw an error
-      self.assertIs(dep, wrapped)
-      self.assertEqual("cell", name)
-
-      outputs, _ = cell(x, m)
-      self.assertTrue("cpu:14159" in outputs.device.lower())
-
-  def _retrieve_cpu_gpu_stats(self, run_metadata):
-    cpu_stats = None
-    gpu_stats = None
-    step_stats = run_metadata.step_stats
-    for ds in step_stats.dev_stats:
-      if "cpu:0" in ds.device[-5:].lower():
-        cpu_stats = ds.node_stats
-      if "gpu:0" == ds.device[-5:].lower():
-        gpu_stats = ds.node_stats
-    return cpu_stats, gpu_stats
-
-  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
-    if not test.is_gpu_available():
-      # Can't perform this test w/o a GPU
-      return
-
-    gpu_dev = test.gpu_device_name()
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1, 3])
-        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
-        with ops.device("/cpu:0"):
-          outputs, _ = rnn.dynamic_rnn(
-              cell=cell, inputs=x, dtype=dtypes.float32)
-        run_metadata = config_pb2.RunMetadata()
-        opts = config_pb2.RunOptions(
-            trace_level=config_pb2.RunOptions.FULL_TRACE)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
-
-      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
-      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
-      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
-
-  def testEmbeddingWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
-        m = array_ops.zeros([1, 2])
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
-        self.assertEqual(embedding_cell.output_size, 2)
-        g, new_m = embedding_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 2))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.17139, 0.17139]])
-
-  def testEmbeddingWrapperWithDynamicRnn(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope("root"):
-        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
-        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
-            embedding_classes=1,
-            embedding_size=2)
-        outputs, _ = rnn.dynamic_rnn(
-            cell=embedding_cell,
-            inputs=inputs,
-            sequence_length=input_lengths,
-            dtype=dtypes.float32)
-        sess.run([variables_lib.global_variables_initializer()])
-        # This will fail if output's dtype is inferred from input's.
-        sess.run(outputs)
-
-  def testMultiRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 4])
-        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=False)
-        _, ml = multi_rnn_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1, 0.1]])
-        })
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
-        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
-        self.assertTrue(
-            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
-
-  def testMultiRNNCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m_bad = array_ops.zeros([1, 4])
-        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
-
-        # Test incorrectness of state
-        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
-          rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-              state_is_tuple=True)(x, m_bad)
-
-        _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=True)(x, m_good)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            ml, {
-                x.name: np.array([[1., 1.]]),
-                m_good[0].name: np.array([[0.1, 0.1]]),
-                m_good[1].name: np.array([[0.1, 0.1]])
-            })
-
-        # The numbers in results were not calculated, this is just a
-        # smoke test.  However, these numbers should match those of
-        # the test testMultiRNNCell.
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-        self.assertAllClose(res[1], [[0.13248, 0.13248]])
-
-
-class DropoutWrapperTest(test.TestCase, parameterized.TestCase):
-
-  def _testDropoutWrapper(self,
-                          batch_size=None,
-                          time_steps=None,
-                          parallel_iterations=None,
-                          wrapper_type=None,
-                          **kwargs):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        if batch_size is None and time_steps is None:
-          # 2 time steps, batch size 1, depth 3
-          batch_size = 1
-          time_steps = 2
-          x = constant_op.constant(
-              [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
-          m = rnn_cell_impl.LSTMStateTuple(
-              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32
-                                    )] * 2)
-        else:
-          x = constant_op.constant(
-              np.random.randn(time_steps, batch_size, 3).astype(np.float32))
-          m = rnn_cell_impl.LSTMStateTuple(*[
-              constant_op.
-              constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
-          ] * 2)
-        outputs, final_state = rnn.dynamic_rnn(
-            cell=wrapper_type(
-                rnn_cell_impl.LSTMCell(3), dtype=x.dtype, **kwargs),
-            time_major=True,
-            parallel_iterations=parallel_iterations,
-            inputs=x,
-            initial_state=m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([outputs, final_state])
-        self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
-        self.assertEqual(res[1].c.shape, (batch_size, 3))
-        self.assertEqual(res[1].h.shape, (batch_size, 3))
-        return res
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperProperties(self, wrapper_type):
-    cell = rnn_cell_impl.BasicRNNCell(10)
-    wrapper = wrapper_type(cell)
-    # Github issue 15810
-    self.assertEqual(wrapper.wrapped_cell, cell)
-    self.assertEqual(wrapper.state_size, 10)
-    self.assertEqual(wrapper.output_size, 10)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperZeroState(self, wrapper_type):
-    class _Cell(rnn_cell_impl.BasicRNNCell):
-
-      def zero_state(self, batch_size=None, dtype=None):
-        return "wrapped_cell_zero_state"
-    wrapper = wrapper_type(_Cell(10))
-    self.assertEqual(wrapper.zero_state(10, dtypes.float32),
-                     "wrapped_cell_zero_state")
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepAllConstantInput(self, wrapper_type):
-    keep = array_ops.ones([])
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepAll(self, wrapper_type):
-    keep = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperWithSeed(self, wrapper_type):
-    keep_some = 0.5
-    random_seed.set_random_seed(2)
-    ## Use parallel_iterations = 1 in both calls to
-    ## _testDropoutWrapper to ensure the (per-time step) dropout is
-    ## consistent across both calls.  Otherwise the seed may not end
-    ## up being munged consistently across both graphs.
-    res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1,
-        wrapper_type=wrapper_type)
-    # Clear away the graph and the test session (which keeps variables around)
-    ops.reset_default_graph()
-    self._ClearCachedSession()
-    random_seed.set_random_seed(2)
-    res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1,
-        wrapper_type=wrapper_type)
-    self.assertAllClose(res_standard_1[0], res_standard_2[0])
-    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
-    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoOutput(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_none,
-        state_keep_prob=keep_all,
-        wrapper_type=wrapper_type)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(np.zeros(res[0].shape), res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    # Even though we dropout state, by default DropoutWrapper never
-    # drops out the memory ("c") term of an LSTMStateTuple.
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_none,
-        wrapper_type=wrapper_type)
-    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    self.assertAllClose(true_full_output[0], res[0][0])
-    # Second output is modified by zero input state
-    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
-    # h state has been set to zero
-    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
-    # c state of an LSTMStateTuple is NEVER modified.
-    self.assertAllClose(true_c_state, res[1].c)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperKeepNoInput(self, wrapper_type):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    # All outputs are different because inputs are zeroed out
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_none,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_all,
-        wrapper_type=wrapper_type)
-    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentOutput(self, wrapper_type):
-    keep_some = 0.8
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_all,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-    # Ensure the same dropout pattern for all time steps
-    output_mask = np.abs(res[0]) > 1e-6
-    for m in output_mask[1:]:
-      self.assertAllClose(output_mask[0], m)
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentStateInputAndOutput(self, wrapper_type):
-    keep_some = 0.9
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-
-    # Smoke test for the state/input masks.
-    output_mask = np.abs(res[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res[1].c) > 1e-6
-    state_h_mask = np.abs(res[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-  @parameterized.parameters(
-      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
-  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(
-      self, wrapper_type):
-    keep_some = 0.9
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987)
-    ops.reset_default_graph()
-    self._ClearCachedSession()
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        wrapper_type=wrapper_type,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987)
-
-    output_mask = np.abs(res0[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res0[1].c) > 1e-6
-    state_h_mask = np.abs(res0[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-    # Ensure seeded calculation is identical.
-    self.assertAllClose(res0[0], res1[0])
-    self.assertAllClose(res0[1].c, res1[1].c)
-    self.assertAllClose(res0[1].h, res1[1].h)
-
-  def testDropoutWrapperKerasStyle(self):
-    """Tests if DropoutWrapperV2 cell is instantiated in keras style scope."""
-    wrapped_cell_v2 = rnn_cell_impl.DropoutWrapperV2(
-        rnn_cell_impl.BasicRNNCell(1))
-    self.assertTrue(wrapped_cell_v2._keras_style)
-
-    wrapped_cell = rnn_cell_impl.DropoutWrapper(rnn_cell_impl.BasicRNNCell(1))
-    self.assertFalse(wrapped_cell._keras_style)
-
-  def testDropoutWrapperV2VariableNames(self):
-    """Tests that variables names do not depend on wrapper in RNN layer."""
-
-    def _rnn_input(apply_wrapper):
-      """Creates a RNN layer with/without wrapper and returns built rnn cell."""
-      with base_layer.keras_style_scope():
-        base_cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
-      if apply_wrapper:
-        rnn_cell = rnn_cell_impl.DropoutWrapperV2(base_cell)
-      else:
-        rnn_cell = base_cell
-      rnn_layer = keras_layers.RNN(rnn_cell)
-      inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
-      _ = rnn_layer(inputs)
-      return base_cell._cells[0]
-
-    rnn_1 = _rnn_input(True)
-    ops.reset_default_graph()
-    rnn_2 = _rnn_input(False)
-
-    self.assertLen(rnn_1.weights, expected_len=2)
-    self.assertCountEqual([v.name for v in rnn_1.weights],
-                          [v.name for v in rnn_2.weights])
-
-  def testDropoutWrapperV2Caller(self):
-    """Tests that DropoutWrapperV2 is using the LayerRNNCell's caller."""
-
-    with base_layer.keras_style_scope():
-      base_cell = rnn_cell_impl.MultiRNNCell(
-          [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
-    rnn_cell = rnn_cell_impl.DropoutWrapperV2(base_cell)
-    inputs = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
-    state = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
-    _ = rnn_cell(inputs, [state, state])
-    weights = base_cell._cells[0].weights
-    self.assertLen(weights, expected_len=2)
-    self.assertTrue(all(["dropout_wrapper" in v.name for v in weights]))
-
-  def testDropoutWrapperV2Build(self):
-    cell = rnn_cell_impl.LSTMCell(10)
-    wrapper = rnn_cell_impl.DropoutWrapperV2(cell)
-    wrapper.build((1,))
-    self.assertTrue(cell.built)
-
-
-def basic_rnn_cell(inputs, state, num_units, scope=None):
-  if state is None:
-    if inputs is not None:
-      batch_size = inputs.get_shape()[0]
-      dtype = inputs.dtype
-    else:
-      batch_size = 0
-      dtype = dtypes.float32
-    init_output = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_state = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_output.set_shape([batch_size, num_units])
-    init_state.set_shape([batch_size, num_units])
-    return init_output, init_state
-  else:
-    with variable_scope.variable_scope(scope, "basic_rnn_cell",
-                                       [inputs, state]):
-      output = math_ops.tanh(
-          Linear([inputs, state], num_units, True)([inputs, state]))
-    return output, output
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index d7ee7fb8faacb0876218a983d68f007e1905c11e..921b4baae43d86f5ac2a86df0828c9691d9dbb2a 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -22,6 +22,7 @@ import itertools
 
 import numpy as np
 
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell as legacy_rnn_cell
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -29,6 +30,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import layers as keras_layers
@@ -53,6 +55,314 @@ from tensorflow.python.util import nest
 
 class RNNCellTest(test.TestCase):
 
+  def _assert_cell_builds(self, cell_class, dtype, batch_size, in_size,
+                          out_size):
+    cell = cell_class(out_size, dtype=dtype)
+    in_shape = tensor_shape.TensorShape((batch_size, in_size))
+    cell.build(in_shape)
+    state_output = cell.get_initial_state(
+        inputs=None, batch_size=batch_size, dtype=dtype)
+    cell_output, _ = cell(array_ops.zeros(in_shape, dtype), state_output)
+    self.assertAllEqual([batch_size, out_size], cell_output.shape.as_list())
+
+  def testCellsBuild(self):
+    f32 = dtypes.float32
+    f64 = dtypes.float64
+    self._assert_cell_builds(contrib_rnn_cell.IndRNNCell, f32, 5, 7, 3)
+    self._assert_cell_builds(contrib_rnn_cell.IndRNNCell, f64, 5, 7, 3)
+    self._assert_cell_builds(contrib_rnn_cell.IndyGRUCell, f32, 5, 7, 3)
+    self._assert_cell_builds(contrib_rnn_cell.IndyGRUCell, f64, 5, 7, 3)
+    self._assert_cell_builds(contrib_rnn_cell.IndyLSTMCell, f32, 5, 7, 3)
+    self._assert_cell_builds(contrib_rnn_cell.IndyLSTMCell, f64, 5, 7, 3)
+
+  def testIndRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.IndRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  def testIndyGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.185265, 0.17704]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test IndyGRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.155127, 0.157328]])
+
+  def testIndyLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          cell = rnn_cell_impl.MultiRNNCell(
+              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state_0, out_state_1], {
+                  x.name: np.array([[1., 1.]]),
+                  state_0[0].name: 0.1 * np.ones([1, 2]),
+                  state_0[1].name: 0.1 * np.ones([1, 2]),
+                  state_1[0].name: 0.1 * np.ones([1, 2]),
+                  state_1[1].name: 0.1 * np.ones([1, 2]),
+              })
+          self.assertEqual(len(res), 3)
+          global_variables = variables.global_variables()
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in global_variables])
+          # Only check the range of outputs as this is just a smoke test.
+          self.assertAllInRange(res[0], -1.0, 1.0)
+          self.assertAllInRange(res[1], -1.0, 1.0)
+          self.assertAllInRange(res[2], -1.0, 1.0)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test IndyLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+              })
+          self.assertEqual(len(res), 2)
+
+  def testLSTMCellLayerNorm(self):
+    with self.cached_session() as sess:
+      num_units = 2
+      num_proj = 3
+      batch_size = 1
+      input_size = 4
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        c = array_ops.zeros([batch_size, num_units])
+        h = array_ops.zeros([batch_size, num_proj])
+        state = rnn_cell_impl.LSTMStateTuple(c, h)
+        cell = contrib_rnn_cell.LayerNormLSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            layer_norm=True,
+            norm_gain=1.0,
+            norm_shift=0.0)
+        g, out_m = cell(x, state)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.ones((batch_size, input_size)),
+                c.name: 0.1 * np.ones((batch_size, num_units)),
+                h.name: 0.1 * np.ones((batch_size, num_proj))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1][0].shape, (batch_size, num_units))
+        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+
+  def testOutputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.OutputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), 2)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.231907, 0.231907]])
+
+  def testInputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.InputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), num_proj=3)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
+
+  def testEmbeddingWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
+        m = array_ops.zeros([1, 2])
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
+        self.assertEqual(embedding_cell.output_size, 2)
+        g, new_m = embedding_cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 2))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.17139, 0.17139]])
+
+  def testEmbeddingWrapperWithDynamicRnn(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope("root"):
+        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
+        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
+            embedding_classes=1,
+            embedding_size=2)
+        outputs, _ = rnn.dynamic_rnn(
+            cell=embedding_cell,
+            inputs=inputs,
+            sequence_length=input_lengths,
+            dtype=dtypes.float32)
+        sess.run([variables.global_variables_initializer()])
+        # This will fail if output's dtype is inferred from input's.
+        sess.run(outputs)
+
+  def testSRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
+
+  def testSRUCellKerasRNN(self):
+    """Tests that SRUCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs_keras = rnn_layer(seq_input)
+    with self.cached_session() as sess:
+      sess.run([variables.global_variables_initializer()])
+      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
+
+  def testSRUCellBiasType(self):
+    """Tests that the bias' dtype is properly set."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
+
+    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
+
+    cell_input = ops.convert_to_tensor(
+        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
+    cell_state = ops.convert_to_tensor(
+        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell(cell_input, [cell_state])
+    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
+
+  def testSRUCellWithDiffSize(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
+
   def testCoupledInputForgetGateLSTMCell(self):
     with self.cached_session() as sess:
       num_units = 2
@@ -1265,6 +1575,61 @@ class RNNCellTest(test.TestCase):
       self.assertEqual(len(outputs), batch)
       self.assertEqual(len(state), batch)
 
+  def testNTMCell(self):
+    expected_output = np.array(
+        [[-0.04973561, -0.00020032, -0.09586009, -0.05049511],
+         [-0.02199885, 0.02302885, -0.05558189, -0.02051288],
+         [-0.01399924, 0.02543444, -0.06975862, -0.03782758],
+         [-0.02238393, 0.0135776, -0.09102941, -0.05594013]],
+        dtype=np.float32)
+    expected_read_vector_list = np.array(
+        [[1e-6, 1e-6, 1e-6, 1e-6], [1e-6, 1e-6, 1e-6, 1e-6],
+         [1e-6, 1e-6, 1e-6, 1e-6], [1e-6, 1e-6, 1e-6, 1e-6]],
+        dtype=np.float32)
+    expected_w_list = np.array(
+        [[[0.15837428, 0.21354634, 0.22115856, 0.21117255, 0.19574821],
+          [0.15826838, 0.2150458, 0.2228198, 0.20747298, 0.19639312],
+          [0.15750293, 0.21550071, 0.22280747, 0.20737495, 0.19681393],
+          [0.15763053, 0.21473582, 0.22187267, 0.20920397, 0.19655706]],
+         [[0.21703579, 0.19425659, 0.22143759, 0.18024713, 0.18702294],
+          [0.2164267, 0.19451937, 0.22112325, 0.18051708, 0.18741359],
+          [0.21567065, 0.1947548, 0.22107735, 0.18058982, 0.18790732],
+          [0.2163743, 0.194361, 0.22131558, 0.18042919, 0.1875199]]],
+        dtype=np.float32)
+    expected_M_0 = np.array(
+        [[-0.00553495, -0.01089884, 0.00683121, -0.00273276],
+         [-0.00495392, -0.00975483, 0.00611433, -0.00244583],
+         [-0.00564722, -0.0111199, 0.00696973, -0.0027882],
+         [-0.00459658, -0.00905126, 0.00567345, -0.00226937],
+         [-0.00476941, -0.00939155, 0.00588669, -0.00235472]],
+        dtype=np.float32)
+
+    with session.Session() as sess:
+      with variable_scope.variable_scope("root"):
+        seed = 1234
+        random_seed.set_random_seed(seed)
+        batch_size = 4
+        inputs = random_ops.random_uniform((batch_size, 4),
+                                           0.0,
+                                           1.0,
+                                           seed=seed + 1)
+        cell = contrib_rnn_cell.NTMCell(
+            controller=rnn_cell_impl.LSTMCell(num_units=4),
+            memory_size=5,
+            memory_vector_dim=4,
+            read_head_num=1,
+            write_head_num=1)
+        output, state = cell(inputs, cell.zero_state(batch_size,
+                                                     dtypes.float32))
+        sess.run([variables.global_variables_initializer()])
+        res, read_vector_list, w_list, M = sess.run(
+            [output, state.read_vector_list, state.w_list, state.M])
+        # Smoke test
+        self.assertAllClose(res, expected_output)
+        self.assertAllClose(read_vector_list[0], expected_read_vector_list)
+        self.assertAllClose(w_list, expected_w_list)
+        self.assertAllClose(M[0], expected_M_0)
+
 
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index b043026bc556a8879b15b432829baf8136250c0e..ed3c7609368819295e142a2a4c5a4e5f66c2ee36 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -691,9 +691,10 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
       wci = wcf = wco = array_ops.zeros([self._num_units], dtype=dtype)
 
     if sequence_length is None:
-      max_seq_len = math_ops.to_int64(time_len)
+      max_seq_len = math_ops.cast(time_len, dtypes.int64)
     else:
-      max_seq_len = math_ops.to_int64(math_ops.reduce_max(sequence_length))
+      max_seq_len = math_ops.cast(math_ops.reduce_max(sequence_length),
+                                  dtypes.int64)
 
     _, cs, _, _, _, _, h = gen_lstm_ops.block_lstm(
         seq_len_max=max_seq_len,
diff --git a/tensorflow/contrib/rnn/python/ops/rnn.py b/tensorflow/contrib/rnn/python/ops/rnn.py
index 0266b72dcb15e4aba01a9a31b4be75c5b84d44da..41b1698321e20f4360d75fa2db79f9bd8a806cea 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn.py
@@ -131,7 +131,8 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
                                     sequence_length=None,
                                     parallel_iterations=None,
                                     time_major=False,
-                                    scope=None):
+                                    scope=None,
+                                    swap_memory=False):
   """Creates a dynamic bidirectional recurrent neural network.
 
   Stacks several bidirectional rnn layers. The combined forward and backward
@@ -171,6 +172,10 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
       data is batch-major, so by default this function accepts input and emits
       output in batch-major form.
     scope: VariableScope for the created subgraph; defaults to None.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs
+      which would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
 
   Returns:
     A tuple (outputs, output_state_fw, output_state_bw) where:
@@ -230,6 +235,7 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
             sequence_length=sequence_length,
             parallel_iterations=parallel_iterations,
             dtype=dtype,
+            swap_memory=swap_memory,
             time_major=time_major)
         # Concat the outputs to create the new input.
         prev_layer = array_ops.concat(outputs, 2)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 482e547a16be85804beec88a91fa03b053d09b27..9ada8e244ba5657c3bfc91d17aa69c3c5f49a0df 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -33,6 +33,7 @@ from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -3153,7 +3154,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
   r"""Independently Gated Recurrent Unit cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to GRUCell,
-  yet with the \(U_r\), \(U_z\), and \(U\) matrices in equations 5, 6, and
+  yet with the \\(U_r\\), \\(U_z\\), and \\(U\\) matrices in equations 5, 6, and
   8 of http://arxiv.org/abs/1406.1078 respectively replaced by diagonal
   matrices, i.e. a Hadamard product with a single vector:
 
@@ -3164,12 +3165,10 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     $$\tilde{h}^{(t)}_j = \phi\left([\mathbf W \mathbf x]_j +
       [\mathbf u \circ \mathbf r \circ \mathbf h_{(t-1)}]_j\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyGRU
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyGRU
   node sees only its own state, as opposed to seeing all states in the same
   layer.
 
-  TODO(gonnet): Write a paper describing this and add a reference here.
-
   Args:
     num_units: int, The number of units in the GRU cell.
     activation: Nonlinearity to use.  Default: `tanh`.
@@ -3254,7 +3253,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     self.built = True
 
   def call(self, inputs, state):
-    """Gated recurrent unit (GRU) with nunits cells."""
+    """Recurrently independent Gated Recurrent Unit (GRU) with nunits cells."""
 
     gate_inputs = math_ops.matmul(inputs, self._gate_kernel_w) + (
         gen_array_ops.tile(state, [1, 2]) * self._gate_kernel_u)
@@ -3278,10 +3277,9 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
   r"""Basic IndyLSTM recurrent network cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to
-  BasicLSTMCell, yet with the \(U_f\), \(U_i\), \(U_o\) and \(U_c\)
-  matrices in
-  https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
-  replaced by diagonal matrices, i.e. a Hadamard product with a single vector:
+  BasicLSTMCell, yet with the \\(U_f\\), \\(U_i\\), \\(U_o\\) and \\(U_c\\)
+  matrices in the regular LSTM equations replaced by diagonal matrices, i.e. a
+  Hadamard product with a single vector:
 
     $$f_t = \sigma_g\left(W_f x_t + u_f \circ h_{t-1} + b_f\right)$$
     $$i_t = \sigma_g\left(W_i x_t + u_i \circ h_{t-1} + b_i\right)$$
@@ -3289,8 +3287,8 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     $$c_t = f_t \circ c_{t-1} +
             i_t \circ \sigma_c\left(W_c x_t + u_c \circ h_{t-1} + b_c\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyLSTM
-  node sees only its own state \(h\) and \(c\), as opposed to seeing all
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyLSTM
+  node sees only its own state \\(h\\) and \\(c\\), as opposed to seeing all
   states in the same layer.
 
   We add forget_bias (default: 1) to the biases of the forget gate in order to
@@ -3298,11 +3296,6 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
 
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
-
-  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
-  that follows.
-
-  TODO(gonnet): Write a paper describing this and add a reference here.
   """
 
   def __init__(self,
@@ -3417,6 +3410,354 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     return new_h, new_state
 
 
+NTMControllerState = collections.namedtuple(
+    "NTMControllerState",
+    ("controller_state", "read_vector_list", "w_list", "M", "time"))
+
+
+class NTMCell(rnn_cell_impl.LayerRNNCell):
+  """Neural Turing Machine Cell with RNN controller.
+
+    Implementation based on:
+    https://arxiv.org/abs/1807.08518
+    Mark Collier, Joeran Beel
+
+    which is in turn based on the source code of:
+    https://github.com/snowkylin/ntm
+
+    and of course the original NTM paper:
+    Neural Turing Machines
+    https://arxiv.org/abs/1410.5401
+    A Graves, G Wayne, I Danihelka
+  """
+
+  def __init__(self,
+               controller,
+               memory_size,
+               memory_vector_dim,
+               read_head_num,
+               write_head_num,
+               shift_range=1,
+               output_dim=None,
+               clip_value=20,
+               dtype=dtypes.float32,
+               name=None):
+    """Initialize the NTM Cell.
+
+      Args:
+        controller: an RNNCell, the RNN controller.
+        memory_size: int, The number of memory locations in the NTM memory
+          matrix
+        memory_vector_dim: int, The dimensionality of each location in the NTM
+          memory matrix
+        read_head_num: int, The number of read heads from the controller into
+          memory
+        write_head_num: int, The number of write heads from the controller into
+          memory
+        shift_range: int, The number of places to the left/right it is possible
+          to iterate the previous address to in a single step
+        output_dim: int, The number of dimensions to make a linear projection of
+          the NTM controller outputs to. If None, no linear projection is
+          applied
+        clip_value: float, The maximum absolute value the controller parameters
+          are clipped to
+        dtype: Default dtype of the layer (default of `None` means use the type
+          of the first input). Required when `build` is called before `call`.
+        name: String, the name of the layer. Layers with the same name will
+          share weights, but to avoid mistakes we require reuse=True in such
+          cases.
+    """
+    super(NTMCell, self).__init__(dtype=dtype, name=name)
+
+    rnn_cell_impl.assert_like_rnncell("NTM RNN controller cell", controller)
+
+    self.controller = controller
+    self.memory_size = memory_size
+    self.memory_vector_dim = memory_vector_dim
+    self.read_head_num = read_head_num
+    self.write_head_num = write_head_num
+    self.clip_value = clip_value
+
+    self.output_dim = output_dim
+    self.shift_range = shift_range
+
+    self.num_parameters_per_head = (
+        self.memory_vector_dim + 2 * self.shift_range + 4)
+    self.num_heads = self.read_head_num + self.write_head_num
+    self.total_parameter_num = (
+        self.num_parameters_per_head * self.num_heads +
+        self.memory_vector_dim * 2 * self.write_head_num)
+
+  @property
+  def state_size(self):
+    return NTMControllerState(
+        controller_state=self.controller.state_size,
+        read_vector_list=[
+            self.memory_vector_dim for _ in range(self.read_head_num)
+        ],
+        w_list=[
+            self.memory_size
+            for _ in range(self.read_head_num + self.write_head_num)
+        ],
+        M=tensor_shape.TensorShape([self.memory_size * self.memory_vector_dim]),
+        time=tensor_shape.TensorShape([]))
+
+  @property
+  def output_size(self):
+    return self.output_dim
+
+  def build(self, inputs_shape):
+    if self.output_dim is None:
+      if inputs_shape[1].value is None:
+        raise ValueError(
+            "Expected inputs.shape[-1] to be known, saw shape: %s" %
+            inputs_shape)
+      else:
+        self.output_dim = inputs_shape[1].value
+
+    def _create_linear_initializer(input_size, dtype=dtypes.float32):
+      stddev = 1.0 / math.sqrt(input_size)
+      return init_ops.truncated_normal_initializer(stddev=stddev, dtype=dtype)
+
+    self._params_kernel = self.add_variable(
+        "parameters_kernel",
+        shape=[self.controller.output_size, self.total_parameter_num],
+        initializer=_create_linear_initializer(self.controller.output_size))
+
+    self._params_bias = self.add_variable(
+        "parameters_bias",
+        shape=[self.total_parameter_num],
+        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+
+    self._output_kernel = self.add_variable(
+        "output_kernel",
+        shape=[
+            self.controller.output_size +
+            self.memory_vector_dim * self.read_head_num, self.output_dim
+        ],
+        initializer=_create_linear_initializer(self.controller.output_size +
+                                               self.memory_vector_dim *
+                                               self.read_head_num))
+
+    self._output_bias = self.add_variable(
+        "output_bias",
+        shape=[self.output_dim],
+        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+
+    self._init_read_vectors = [
+        self.add_variable(
+            "initial_read_vector_%d" % i,
+            shape=[1, self.memory_vector_dim],
+            initializer=initializers.glorot_uniform())
+        for i in range(self.read_head_num)
+    ]
+
+    self._init_address_weights = [
+        self.add_variable(
+            "initial_address_weights_%d" % i,
+            shape=[1, self.memory_size],
+            initializer=initializers.glorot_uniform())
+        for i in range(self.read_head_num + self.write_head_num)
+    ]
+
+    self._M = self.add_variable(
+        "memory",
+        shape=[self.memory_size, self.memory_vector_dim],
+        initializer=init_ops.constant_initializer(1e-6, dtype=self.dtype))
+
+    self.built = True
+
+  def call(self, x, prev_state):
+    # Addressing Mechanisms (Sec 3.3)
+
+    def _prev_read_vector_list_initial_value():
+      return [
+          self._expand(
+              math_ops.tanh(
+                  array_ops.squeeze(
+                      math_ops.matmul(
+                          array_ops.ones([1, 1]), self._init_read_vectors[i]))),
+              dim=0,
+              N=x.shape[0].value or array_ops.shape(x)[0])
+          for i in range(self.read_head_num)
+      ]
+
+    prev_read_vector_list = control_flow_ops.cond(
+        math_ops.equal(prev_state.time,
+                       0), _prev_read_vector_list_initial_value, lambda:
+        prev_state.read_vector_list)
+    if self.read_head_num == 1:
+      prev_read_vector_list = [prev_read_vector_list]
+
+    controller_input = array_ops.concat([x] + prev_read_vector_list, axis=1)
+    controller_output, controller_state = self.controller(
+        controller_input, prev_state.controller_state)
+
+    parameters = math_ops.matmul(controller_output, self._params_kernel)
+    parameters = nn_ops.bias_add(parameters, self._params_bias)
+    parameters = clip_ops.clip_by_value(parameters, -self.clip_value,
+                                        self.clip_value)
+    head_parameter_list = array_ops.split(
+        parameters[:, :self.num_parameters_per_head * self.num_heads],
+        self.num_heads,
+        axis=1)
+    erase_add_list = array_ops.split(
+        parameters[:, self.num_parameters_per_head * self.num_heads:],
+        2 * self.write_head_num,
+        axis=1)
+
+    def _prev_w_list_initial_value():
+      return [
+          self._expand(
+              nn_ops.softmax(
+                  array_ops.squeeze(
+                      math_ops.matmul(
+                          array_ops.ones([1, 1]),
+                          self._init_address_weights[i]))),
+              dim=0,
+              N=x.shape[0].value or array_ops.shape(x)[0])
+          for i in range(self.read_head_num + self.write_head_num)
+      ]
+
+    prev_w_list = control_flow_ops.cond(
+        math_ops.equal(prev_state.time, 0),
+        _prev_w_list_initial_value, lambda: prev_state.w_list)
+    if (self.read_head_num + self.write_head_num) == 1:
+      prev_w_list = [prev_w_list]
+
+    prev_M = control_flow_ops.cond(
+        math_ops.equal(prev_state.time, 0), lambda: self._expand(
+            self._M, dim=0, N=x.shape[0].value or array_ops.shape(x)[0]),
+        lambda: prev_state.M)
+
+    w_list = []
+    for i, head_parameter in enumerate(head_parameter_list):
+      k = math_ops.tanh(head_parameter[:, 0:self.memory_vector_dim])
+      beta = nn_ops.softplus(head_parameter[:, self.memory_vector_dim])
+      g = math_ops.sigmoid(head_parameter[:, self.memory_vector_dim + 1])
+      s = nn_ops.softmax(head_parameter[:, self.memory_vector_dim +
+                                        2:(self.memory_vector_dim + 2 +
+                                           (self.shift_range * 2 + 1))])
+      gamma = nn_ops.softplus(head_parameter[:, -1]) + 1
+      w = self._addressing(k, beta, g, s, gamma, prev_M, prev_w_list[i])
+      w_list.append(w)
+
+    # Reading (Sec 3.1)
+
+    read_w_list = w_list[:self.read_head_num]
+    read_vector_list = []
+    for i in range(self.read_head_num):
+      read_vector = math_ops.reduce_sum(
+          array_ops.expand_dims(read_w_list[i], dim=2) * prev_M, axis=1)
+      read_vector_list.append(read_vector)
+
+    # Writing (Sec 3.2)
+
+    write_w_list = w_list[self.read_head_num:]
+    M = prev_M
+    for i in range(self.write_head_num):
+      w = array_ops.expand_dims(write_w_list[i], axis=2)
+      erase_vector = array_ops.expand_dims(
+          math_ops.sigmoid(erase_add_list[i * 2]), axis=1)
+      add_vector = array_ops.expand_dims(
+          math_ops.tanh(erase_add_list[i * 2 + 1]), axis=1)
+      erase_M = array_ops.ones_like(M) - math_ops.matmul(w, erase_vector)
+      M = M * erase_M + math_ops.matmul(w, add_vector)
+
+    output = math_ops.matmul(
+        array_ops.concat([controller_output] + read_vector_list, axis=1),
+        self._output_kernel)
+    output = nn_ops.bias_add(output, self._output_bias)
+    output = clip_ops.clip_by_value(output, -self.clip_value, self.clip_value)
+
+    return output, NTMControllerState(
+        controller_state=controller_state,
+        read_vector_list=read_vector_list,
+        w_list=w_list,
+        M=M,
+        time=prev_state.time + 1)
+
+  def _expand(self, x, dim, N):
+    return array_ops.concat([array_ops.expand_dims(x, dim) for _ in range(N)],
+                            axis=dim)
+
+  def _addressing(self, k, beta, g, s, gamma, prev_M, prev_w):
+    # Sec 3.3.1 Focusing by Content
+
+    k = array_ops.expand_dims(k, axis=2)
+    inner_product = math_ops.matmul(prev_M, k)
+    k_norm = math_ops.sqrt(
+        math_ops.reduce_sum(math_ops.square(k), axis=1, keepdims=True))
+    M_norm = math_ops.sqrt(
+        math_ops.reduce_sum(math_ops.square(prev_M), axis=2, keepdims=True))
+    norm_product = M_norm * k_norm
+
+    # eq (6)
+    K = array_ops.squeeze(inner_product / (norm_product + 1e-8))
+
+    K_amplified = math_ops.exp(array_ops.expand_dims(beta, axis=1) * K)
+
+    # eq (5)
+    w_c = K_amplified / math_ops.reduce_sum(K_amplified, axis=1, keepdims=True)
+
+    # Sec 3.3.2 Focusing by Location
+
+    g = array_ops.expand_dims(g, axis=1)
+
+    # eq (7)
+    w_g = g * w_c + (1 - g) * prev_w
+
+    s = array_ops.concat([
+        s[:, :self.shift_range + 1],
+        array_ops.zeros([
+            s.shape[0].value or array_ops.shape(s)[0], self.memory_size -
+            (self.shift_range * 2 + 1)
+        ]), s[:, -self.shift_range:]
+    ],
+                         axis=1)
+    t = array_ops.concat(
+        [array_ops.reverse(s, axis=[1]),
+         array_ops.reverse(s, axis=[1])],
+        axis=1)
+    s_matrix = array_ops.stack([
+        t[:, self.memory_size - i - 1:self.memory_size * 2 - i - 1]
+        for i in range(self.memory_size)
+    ],
+                               axis=1)
+
+    # eq (8)
+    w_ = math_ops.reduce_sum(
+        array_ops.expand_dims(w_g, axis=1) * s_matrix, axis=2)
+    w_sharpen = math_ops.pow(w_, array_ops.expand_dims(gamma, axis=1))
+
+    # eq (9)
+    w = w_sharpen / math_ops.reduce_sum(w_sharpen, axis=1, keepdims=True)
+
+    return w
+
+  def zero_state(self, batch_size, dtype):
+    read_vector_list = [
+        array_ops.zeros([batch_size, self.memory_vector_dim])
+        for _ in range(self.read_head_num)
+    ]
+
+    w_list = [
+        array_ops.zeros([batch_size, self.memory_size])
+        for _ in range(self.read_head_num + self.write_head_num)
+    ]
+
+    controller_init_state = self.controller.zero_state(batch_size, dtype)
+
+    M = array_ops.zeros([batch_size, self.memory_size, self.memory_vector_dim])
+
+    return NTMControllerState(
+        controller_state=controller_init_state,
+        read_vector_list=read_vector_list,
+        w_list=w_list,
+        M=M,
+        time=0)
+
+
 class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
   """MinimalRNN cell.
 
@@ -3429,7 +3770,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
    Propagation in Recurrent Neural Networks." ICML, 2018.
 
   A MinimalRNN cell first projects the input to the hidden space. The new
-  hidden state is then calcuated as a weighted sum of the projected input and
+  hidden state is then calculated as a weighted sum of the projected input and
   the previous hidden state, using a single update gate.
   """
 
@@ -3543,7 +3884,7 @@ class CFNCell(rnn_cell_impl.LayerRNNCell):
   "A recurrent neural network without chaos." ICLR, 2017.
 
   A CFN cell first projects the input to the hidden space. The hidden state
-  goes through a contractive mapping. The new hidden state is then calcuated
+  goes through a contractive mapping. The new hidden state is then calculated
   as a linear combination of the projected input and the contracted previous
   hidden state, using decoupled input and forget gates.
   """
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
index 3fc6bfbb4d03a39906d4441e48b2788423caa234..d8ab9eba7049e468b373a1641f92dc781aa22558 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -61,10 +61,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
     self._server = server
 
   def tearDown(self):
-    # TODO(ebrevdo): Figure out why this sometimes times out.
-    #    self._service.ExitLoop()
-    #    self._service_thread.join()
-    # self._server.stop()
+    self._server.stop(grace=None)
     super(RpcOpTest, self).tearDown()
 
 
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 0392ed9eee79391c60318faf68d8dfd6eb64a994..a61e9579b84a60d74b73e45a6100a2c772d9cff8 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -22,5 +22,5 @@ from tensorflow.python.keras import saving
 
 
 # TODO(kathywu): Remove all contrib callers, switch to tf.keras.
-save_keras_model = saving.export
+save_keras_model = saving.export_saved_model
 load_keras_model = saving.load_from_saved_model
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 89176180ae0dd963bccc34aa2d0fc52be839dd3f..f42a2953ef96a863bf6b0c33e763413da569bb41 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -74,9 +74,6 @@ tf_custom_op_library(
         "kernels/beam_search_ops_gpu.cu.cc",
         "kernels/beam_search_ops.h",
     ],
-    deps = [
-        "//tensorflow/core/kernels:eigen_helpers",
-    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -96,7 +93,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
     ],
 )
@@ -139,6 +135,27 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "basic_decoder_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/basic_decoder_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "beam_search_ops_test",
     size = "medium",
@@ -173,6 +190,27 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "decoder_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/decoder_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "beam_search_decoder_test",
     size = "medium",
@@ -213,3 +251,19 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
+
+cuda_py_test(
+    name = "attention_wrapper_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/attention_wrapper_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+    shard_count = 4,
+)
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
index 34da8c82cdab9b6f82af328c49a365ae1cb951ed..c0b3091fb8d98589f26818b93140f1b58eb73794 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc
index bc28d492fe1a25afe0d0783539aa9e759e7b703f..be2aa4782c3cbc2ecce23b57d332e9bf0cec18bc 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc
@@ -91,16 +91,11 @@ struct GatherTree<GPUDevice, T> {
     beams.device(d) = beams.constant(end_token);
 
     CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d);
-    // clang-format off
-    GatherTreeOpKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            batch_size, max_time, beam_width,
-            step_ids.data(),
-            parent_ids.data(),
-            max_sequence_length.data(),
-            end_token,
-            beams.data());
-    // clang-format on
+    TF_CHECK_OK(CudaLaunchKernel(
+        GatherTreeOpKernel<T>, config.block_count, config.thread_per_block, 0,
+        d.stream(), batch_size, max_time, beam_width, step_ids.data(),
+        parent_ids.data(), max_sequence_length.data(), end_token,
+        beams.data()));
   }
 };
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index d815f81f847ad79ddcc6c6ecf5c050598e185d8d..98e54db4584037a0cb6aea13bc6846f38007ecba 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -13,11 +13,9 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for contrib.seq2seq.python.ops.attention_wrapper."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import collections
 import functools
@@ -30,6 +28,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -66,6 +65,7 @@ def get_result_summary(x):
   return x
 
 
+@test_util.run_v1_only
 class AttentionWrapperTest(test.TestCase):
 
   def assertAllCloseOrEqual(self, x, y, **kwargs):
@@ -358,7 +358,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -387,7 +387,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -454,7 +454,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -696,7 +696,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0025896581),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.73333333))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -707,12 +707,12 @@ class AttentionWrapperTest(test.TestCase):
             shape=(5, 6), dtype=dtype('float32'), mean=-0.00069823361),
         time=3,
         alignments=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         attention_state=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
-        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.04865776002407074)
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.0465225502849)
 
     self._testWithAttention(
         create_attention_mechanism,
@@ -921,9 +921,9 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11723966),
+            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.115853324533),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.266666666666667))
+            shape=(5, 3), dtype=dtype('int32'), mean=8.6))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -931,7 +931,7 @@ class AttentionWrapperTest(test.TestCase):
             h=ResultSummary(
                 shape=(5, 9), dtype=dtype('float32'), mean=-0.0018327223)),
         attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11601614207),
+            shape=(5, 20), dtype=dtype('float32'), mean=0.11462739855),
         time=3,
         alignments=(ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a464dc2183f272215921d26f89ce282bbdf07b
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
@@ -0,0 +1,745 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.ops.attention_wrapper."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import initializers
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(AttentionMechanismTest, self).setUp()
+    self.batch = 10
+    self.timestep = 5
+    self.memory_size = 6
+    self.units = 8
+
+    self.memory = np.random.randn(self.batch, self.timestep,
+                                  self.memory_size).astype(np.float32)
+    self.query = np.random.randn(self.batch, self.units).astype(np.float32)
+    self.state = np.random.randn(self.batch, self.timestep).astype(np.float32)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_attention_shape_inference(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    attention_score = attention([self.query, self.state])
+    self.assertLen(attention_score, 2)
+    self.assertEqual(attention_score[0].shape, (self.batch, self.timestep))
+    self.assertEqual(attention_score[1].shape, (self.batch, self.timestep))
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_get_config(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    config = attention.get_config()
+
+    attention_from_config = attention_cls.from_config(config)
+    config_from_clone = attention_from_config.get_config()
+
+    self.assertDictEqual(config, config_from_clone)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_layer_output(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    score = attention([self.query, self.state])
+    self.evaluate(variables.variables_initializer(attention.variables))
+
+    score_val = self.evaluate(score)
+    self.assertLen(score_val, 2)
+    self.assertEqual(score_val[0].shape, (self.batch, self.timestep))
+    self.assertEqual(score_val[1].shape, (self.batch, self.timestep))
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_passing_memory_from_call(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    weights_before_query = attention.get_weights()
+    ref_score = attention([self.query, self.state])
+
+    self.evaluate(variables.global_variables_initializer())
+    ref_score_val = self.evaluate(ref_score)
+
+    all_weights = attention.get_weights()
+    config = attention.get_config()
+    # Simulate the twice invocation of calls here.
+    attention_from_config = attention_cls.from_config(config)
+    attention_from_config.build(self.memory.shape)
+    attention_from_config.set_weights(weights_before_query)
+    attention_from_config(self.memory, setup_memory=True)
+    attention_from_config.build([self.query.shape, self.state.shape])
+    attention_from_config.set_weights(all_weights)
+    score = attention_from_config([self.query, self.state])
+
+    score_val = self.evaluate(score)
+    self.assertAllClose(ref_score_val, score_val)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_save_load_layer(self, attention_cls):
+    vocab = 20
+    embedding_dim = 6
+    inputs = keras.layers.Input(shape=[self.timestep])
+    encoder_input = keras.layers.Embedding(
+        vocab, embedding_dim, mask_zero=True)(
+            inputs)
+    encoder_output = keras.layers.LSTM(
+        self.memory_size, return_sequences=True)(
+            encoder_input)
+
+    attention = attention_cls(self.units, encoder_output)
+    query = keras.layers.Input(shape=[self.units])
+    state = keras.layers.Input(shape=[self.timestep])
+
+    score = attention([query, state])
+
+    x = np.random.randint(vocab, size=(self.batch, self.timestep))
+    x_test = np.random.randint(vocab, size=(self.batch, self.timestep))
+    y = np.random.randn(self.batch, self.timestep)
+    model = keras.models.Model([inputs, query, state], score)
+    model.compile("rmsprop", "mse")
+    model.fit([x, self.query, self.state], (y, y))
+    y_ref = model.predict_on_batch([x_test, self.query, self.state])
+
+    config = model.get_config()
+    weights = model.get_weights()
+    loaded_model = keras.models.Model.from_config(
+        config, custom_objects={attention_cls.__name__: attention_cls})
+    loaded_model.set_weights(weights)
+
+    y = loaded_model.predict_on_batch([x_test, self.query, self.state])
+
+    self.assertAllClose(y_ref, y)
+
+  # TODO(scottzhu): Add tests for model.compile(run_eagerly=True)
+
+
+class ResultSummary(
+    collections.namedtuple("ResultSummary", ("shape", "dtype", "mean"))):
+  pass
+
+
+def get_result_summary(x):
+  if isinstance(x, np.ndarray):
+    return ResultSummary(x.shape, x.dtype, x.mean())
+  return x
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionWrapperV2Test(test.TestCase, parameterized.TestCase):
+
+  def assertAllCloseOrEqual(self, x, y, **kwargs):
+    if isinstance(x, np.ndarray) or isinstance(x, float):
+      return super(AttentionWrapperV2Test, self).assertAllClose(
+          x, y, atol=1e-3, **kwargs)
+    else:
+      self.assertAllEqual(x, y, **kwargs)
+
+  def setUp(self):
+    super(AttentionWrapperV2Test, self).setUp()
+    self.batch = 64
+    self.units = 128
+    self.encoder_timestep = 10
+    self.encoder_dim = 256
+    self.decoder_timestep = 12
+    self.encoder_outputs = np.random.randn(self.batch, self.encoder_timestep,
+                                           self.encoder_dim)
+    self.encoder_sequence_length = np.random.randint(
+        self.encoder_timestep, size=(self.batch,)).astype(np.int32)
+    self.decoder_inputs = np.random.randn(self.batch, self.decoder_timestep,
+                                          self.units)
+    self.decoder_sequence_length = np.random.randint(
+        self.decoder_timestep, size=(self.batch,)).astype(np.int32)
+
+  def _testWithAttention(self,
+                         create_attention_mechanism,
+                         expected_final_output,
+                         expected_final_state,
+                         attention_mechanism_depth=3,
+                         alignment_history=False,
+                         expected_final_alignment_history=None,
+                         attention_layer_size=6,
+                         attention_layer=None,
+                         create_query_layer=False,
+                         create_memory_layer=True,
+                         create_attention_kwargs=None):
+    attention_layer_sizes = ([attention_layer_size]
+                             if attention_layer_size is not None else None)
+    attention_layers = ([attention_layer]
+                        if attention_layer is not None else None)
+    self._testWithMaybeMultiAttention(
+        is_multi=False,
+        create_attention_mechanisms=[create_attention_mechanism],
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[attention_mechanism_depth],
+        alignment_history=alignment_history,
+        expected_final_alignment_history=expected_final_alignment_history,
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
+        create_query_layer=create_query_layer,
+        create_memory_layer=create_memory_layer,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def _testWithMaybeMultiAttention(self,
+                                   is_multi,
+                                   create_attention_mechanisms,
+                                   expected_final_output,
+                                   expected_final_state,
+                                   attention_mechanism_depths,
+                                   alignment_history=False,
+                                   expected_final_alignment_history=None,
+                                   attention_layer_sizes=None,
+                                   attention_layers=None,
+                                   create_query_layer=False,
+                                   create_memory_layer=True,
+                                   create_attention_kwargs=None):
+    # Allow is_multi to be True with a single mechanism to enable test for
+    # passing in a single mechanism in a list.
+    assert len(create_attention_mechanisms) == 1 or is_multi
+    encoder_sequence_length = [3, 2, 3, 1, 1]
+    decoder_sequence_length = [2, 0, 1, 2, 3]
+    batch_size = 5
+    encoder_max_time = 8
+    decoder_max_time = 4
+    input_depth = 7
+    encoder_output_depth = 10
+    cell_depth = 9
+    create_attention_kwargs = create_attention_kwargs or {}
+
+    if attention_layer_sizes is not None:
+      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
+      attention_depth = sum(attention_layer_size or encoder_output_depth
+                            for attention_layer_size in attention_layer_sizes)
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
+
+    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
+                                     input_depth).astype(np.float32)
+    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
+                                      encoder_output_depth).astype(np.float32)
+
+    attention_mechanisms = []
+    for creator, depth in zip(create_attention_mechanisms,
+                              attention_mechanism_depths):
+      # Create a memory layer with deterministic initializer to avoid randomness
+      # in the test between graph and eager.
+      if create_query_layer:
+        create_attention_kwargs["query_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+      if create_memory_layer:
+        create_attention_kwargs["memory_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+
+      attention_mechanisms.append(
+          creator(
+              units=depth,
+              memory=encoder_outputs,
+              memory_sequence_length=encoder_sequence_length,
+              **create_attention_kwargs))
+
+    with self.cached_session(use_gpu=True):
+      attention_layer_size = attention_layer_sizes
+      attention_layer = attention_layers
+      if not is_multi:
+        if attention_layer_size is not None:
+          attention_layer_size = attention_layer_size[0]
+        if attention_layer is not None:
+          attention_layer = attention_layer[0]
+      cell = keras.layers.LSTMCell(cell_depth,
+                                   recurrent_activation="sigmoid",
+                                   kernel_initializer="ones",
+                                   recurrent_initializer="ones")
+      cell = wrapper.AttentionWrapper(
+          cell,
+          attention_mechanisms if is_multi else attention_mechanisms[0],
+          attention_layer_size=attention_layer_size,
+          alignment_history=alignment_history,
+          attention_layer=attention_layer)
+      if cell._attention_layers is not None:
+        for layer in cell._attention_layers:
+          if getattr(layer, "kernel_initializer") is None:
+            layer.kernel_initializer = initializers.glorot_uniform(seed=1337)
+
+      sampler = sampler_py.TrainingSampler()
+      my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+      initial_state = cell.get_initial_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      final_outputs, final_state, _ = my_decoder(
+          decoder_inputs,
+          initial_state=initial_state,
+          sequence_length=decoder_sequence_length)
+
+      self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+      self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+
+      expected_time = (
+          expected_final_state.time if context.executing_eagerly() else None)
+      self.assertEqual((batch_size, expected_time, attention_depth),
+                       tuple(final_outputs.rnn_output.get_shape().as_list()))
+      self.assertEqual((batch_size, expected_time),
+                       tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      self.assertEqual((batch_size, attention_depth),
+                       tuple(final_state.attention.get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state[0].get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state[1].get_shape().as_list()))
+
+      if alignment_history:
+        if is_multi:
+          state_alignment_history = []
+          for history_array in final_state.alignment_history:
+            history = history_array.stack()
+            self.assertEqual((expected_time, batch_size, encoder_max_time),
+                             tuple(history.get_shape().as_list()))
+            state_alignment_history.append(history)
+          state_alignment_history = tuple(state_alignment_history)
+        else:
+          state_alignment_history = final_state.alignment_history.stack()
+          self.assertEqual((expected_time, batch_size, encoder_max_time),
+                           tuple(state_alignment_history.get_shape().as_list()))
+        nest.assert_same_structure(cell.state_size,
+                                   cell.zero_state(batch_size, dtypes.float32))
+        # Remove the history from final_state for purposes of the
+        # remainder of the tests.
+        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
+      else:
+        state_alignment_history = ()
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "final_outputs": final_outputs,
+          "final_state": final_state,
+          "state_alignment_history": state_alignment_history,
+      })
+
+      final_output_info = nest.map_structure(get_result_summary,
+                                             eval_result["final_outputs"])
+      final_state_info = nest.map_structure(get_result_summary,
+                                            eval_result["final_state"])
+      print("final_output_info: ", final_output_info)
+      print("final_state_info: ", final_state_info)
+
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
+                         final_output_info)
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
+                         final_state_info)
+      if alignment_history:  # by default, the wrapper emits attention as output
+        final_alignment_history_info = nest.map_structure(
+            get_result_summary, eval_result["state_alignment_history"])
+        print("final_alignment_history_info: ", final_alignment_history_info)
+        nest.map_structure(
+            self.assertAllCloseOrEqual,
+            # outputs are batch major but the stacked TensorArray is time major
+            expected_final_alignment_history,
+            final_alignment_history_info)
+
+  # TODO(b/126893309): reenable np.float16 once the bug is fixed.
+  @parameterized.parameters([np.float32, np.float64])
+  def testBahdanauNormalizedDType(self, dtype):
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.BahdanauAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        normalize=True,
+        dtype=dtype)
+    cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid")
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+
+  # TODO(b/126893309): reenable np.float16 once the bug is fixed.
+  @parameterized.parameters([np.float32, np.float64])
+  def testLuongScaledDType(self, dtype):
+    # Test case for GitHub issue 18099
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.LuongAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        scale=True,
+        dtype=dtype,
+    )
+    cell = keras.layers.LSTMCell(self.units, recurrent_activation="sigmoid")
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+
+  def testBahdanauNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype(np.float32), mean=0.051747426),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype(np.int32), mean=3.33333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=0.44189346),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=0.65429491)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype(np.float32), mean=0.073610783),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        create_query_layer=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.047594748),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.6))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.41311637),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.61683208)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.090581432),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.05481226),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.13333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.38453412),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.5785929)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.16311775),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9)
+
+  def testLuongScaled(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.05481226),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.13333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.38453412),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.5785929)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.16311775),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testNotUseAttentionLayer(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 10), dtype=np.dtype("float32"), mean=0.072406612),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.86666666))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.61177742),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.032002)],
+        attention=ResultSummary(
+            shape=(5, 10), dtype=np.dtype("float32"), mean=0.011346335),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_layer_size=None,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.041342419),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.53333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.33866978),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.46913195)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.092498459),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12079944),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12079944),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.121448785067)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones",
+                               "normalize": True}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.043294173),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.53333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.40034312),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.5925445)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.096119694),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.1211452),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.1211452),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.12258384)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.027387079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.133333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.32660431),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.52464348)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.089345723),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11831035),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11831035),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.12194442004)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history)
+
+  def testLuongMonotonicScaled(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=0.027387079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.13333333))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=[
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.32660431),
+            ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.52464348)],
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=0.089345723),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11831035),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11831035),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.12194442004)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index abcf71c61b6e6df9462bf06323b8b11d5cc0d9a8..599abf5a361fa6e2067cd18725a9a471add8ddeb 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -13,31 +13,30 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for contrib.seq2seq.python.seq2seq.basic_decoder."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import numpy as np
 
-from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-# pylint: enable=g-import-not-at-top
 
 
+@test_util.run_v1_only
 class BasicDecoderTest(test.TestCase):
 
   def _testStepWithTrainingHelper(self, use_output_layer):
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2341ebb77ab6ecad1e979bc8bed0080128a804da
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
@@ -0,0 +1,670 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.basic_decoder_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class BasicDecoderTest(keras_parameterized.TestCase):
+  """Unit test for basic_decoder.BasicDecoderV2."""
+
+  @parameterized.named_parameters(
+      ("use_output_layer", True),
+      ("without_output_layer", False))
+  def testStepWithTrainingHelperOutputLayer(self, use_output_layer):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    output_layer_depth = 3
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampler = sampler_py.TrainingSampler(time_major=False)
+      if use_output_layer:
+        output_layer = layers_core.Dense(output_layer_depth, use_bias=False)
+        expected_output_depth = output_layer_depth
+      else:
+        output_layer = None
+        expected_output_depth = cell_depth
+      initial_state = cell.zero_state(dtype=dtypes.float32,
+                                      batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler,
+          output_layer=output_layer)
+
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(input_t,
+                                            initial_state=initial_state,
+                                            sequence_length=sequence_length)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(expected_output_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, expected_output_depth),
+                       step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      if use_output_layer:
+        # The output layer was accessed
+        self.assertEqual(len(output_layer.variables), 1)
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       eval_result["step_outputs"].sample_id.dtype)
+      self.assertAllEqual(
+          np.argmax(eval_result["step_outputs"].rnn_output, -1),
+          eval_result["step_outputs"].sample_id)
+
+  def DISABLED_testStepWithGreedyEmbeddingHelper(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size  # cell's logits must match vocabulary size
+    input_depth = 10
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+    end_token = 1
+
+    with self.cached_session(use_gpu=True):
+      embeddings = np.random.randn(vocabulary_size,
+                                   input_depth).astype(np.float32)
+      embeddings_t = constant_op.constant(embeddings)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.GreedyEmbeddingSampler()
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          embeddings_t,
+          start_tokens=start_tokens,
+          end_token=end_token,
+          initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      expected_sample_ids = np.argmax(
+          eval_result["step_outputs"].rnn_output, -1)
+      expected_step_finished = (expected_sample_ids == end_token)
+      expected_step_next_inputs = embeddings[expected_sample_ids]
+      self.assertAllEqual([False, False, False, False, False],
+                          eval_result["first_finished"])
+      self.assertAllEqual(expected_step_finished, eval_result["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       eval_result["step_outputs"].sample_id.dtype)
+      self.assertAllEqual(expected_sample_ids,
+                          eval_result["step_outputs"].sample_id)
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithSampleEmbeddingHelper(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size  # cell's logits must match vocabulary size
+    input_depth = 10
+    np.random.seed(0)
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+    end_token = 1
+
+    with self.cached_session(use_gpu=True):
+      embeddings = np.random.randn(vocabulary_size,
+                                   input_depth).astype(np.float32)
+      embeddings_t = constant_op.constant(embeddings)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.SampleEmbeddingSampler(seed=0)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(embeddings_t,
+                                            start_tokens=start_tokens,
+                                            end_token=end_token,
+                                            initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = (sample_ids == end_token)
+      expected_step_next_inputs = embeddings[sample_ids]
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithScheduledEmbeddingTrainingHelper(self):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    vocabulary_size = 10
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(
+          batch_size, max_time, input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      embeddings = np.random.randn(
+          vocabulary_size, input_depth).astype(np.float32)
+      half = constant_op.constant(0.5)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.ScheduledEmbeddingTrainingSampler(
+          sampling_probability=half,
+          time_major=False)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          input_t, sequence_length=sequence_length, embedding=embeddings,
+          initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(vocabulary_size,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       first_state[0].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       first_state[1].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_state[0].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_state[1].get_shape())
+      self.assertEqual((batch_size, input_depth),
+                       step_next_inputs.get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      batch_where_not_sampling = np.where(sample_ids == -1)
+      batch_where_sampling = np.where(sample_ids > -1)
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_sampling],
+          embeddings[sample_ids[batch_where_sampling]])
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_not_sampling],
+          np.squeeze(inputs[batch_where_not_sampling, 1], axis=0))
+
+  def _testStepWithScheduledOutputTrainingHelper(
+      self, sampling_probability, use_next_inputs_fn, use_auxiliary_inputs):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = input_depth
+    if use_auxiliary_inputs:
+      auxiliary_input_depth = 4
+      auxiliary_inputs = np.random.randn(
+          batch_size, max_time, auxiliary_input_depth).astype(np.float32)
+    else:
+      auxiliary_inputs = None
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampling_probability = constant_op.constant(sampling_probability)
+
+      if use_next_inputs_fn:
+        def next_inputs_fn(outputs):
+          # Use deterministic function for test.
+          samples = math_ops.argmax(outputs, axis=1)
+          return array_ops.one_hot(samples, cell_depth, dtype=dtypes.float32)
+      else:
+        next_inputs_fn = None
+
+      sampler = sampler_py.ScheduledOutputTrainingSampler(
+          sampling_probability=sampling_probability,
+          time_major=False,
+          next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(input_t,
+                                            sequence_length=sequence_length,
+                                            initial_state=initial_state,
+                                            auxiliary_inputs=auxiliary_inputs)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+
+      if use_next_inputs_fn:
+        output_after_next_inputs_fn = next_inputs_fn(step_outputs.rnn_output)
+
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+
+      fetches = {
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      }
+      if use_next_inputs_fn:
+        fetches["output_after_next_inputs_fn"] = output_after_next_inputs_fn
+
+      eval_result = self.evaluate(fetches)
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      batch_where_not_sampling = np.where(np.logical_not(sample_ids))
+      batch_where_sampling = np.where(sample_ids)
+
+      auxiliary_inputs_to_concat = (
+          auxiliary_inputs[:, 1] if use_auxiliary_inputs else
+          np.array([]).reshape(batch_size, 0).astype(np.float32))
+
+      expected_next_sampling_inputs = np.concatenate(
+          (eval_result["output_after_next_inputs_fn"][batch_where_sampling]
+           if use_next_inputs_fn else
+           eval_result["step_outputs"].rnn_output[batch_where_sampling],
+           auxiliary_inputs_to_concat[batch_where_sampling]),
+          axis=-1)
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_sampling],
+          expected_next_sampling_inputs)
+
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_not_sampling],
+          np.concatenate(
+              (np.squeeze(inputs[batch_where_not_sampling, 1], axis=0),
+               auxiliary_inputs_to_concat[batch_where_not_sampling]),
+              axis=-1))
+
+  def testStepWithScheduledOutputTrainingHelperWithoutNextInputsFnOrAuxInputs(
+      self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=False,
+        use_auxiliary_inputs=False)
+
+  def testStepWithScheduledOutputTrainingHelperWithNextInputsFn(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=True,
+        use_auxiliary_inputs=False)
+
+  def testStepWithScheduledOutputTrainingHelperWithAuxiliaryInputs(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=False,
+        use_auxiliary_inputs=True)
+
+  def testStepWithScheduledOutputTrainingHelperWithNextInputsFnAndAuxInputs(
+      self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=True,
+        use_auxiliary_inputs=True)
+
+  def testStepWithScheduledOutputTrainingHelperWithNoSampling(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.0, use_next_inputs_fn=True,
+        use_auxiliary_inputs=True)
+
+  def testStepWithInferenceHelperCategorical(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size
+    start_token = 0
+    end_token = 6
+
+    start_inputs = array_ops.one_hot(
+        np.ones(batch_size, dtype=np.int32) * start_token,
+        vocabulary_size)
+
+    # The sample function samples categorically from the logits.
+    sample_fn = lambda x: sampler_py.categorical_sample(logits=x)
+    # The next inputs are a one-hot encoding of the sampled labels.
+    next_inputs_fn = (
+        lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
+    end_fn = lambda sample_ids: math_ops.equal(sample_ids, end_token)
+
+    with self.cached_session(use_gpu=True):
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.InferenceSampler(
+          sample_fn, sample_shape=(), sample_dtype=dtypes.int32, end_fn=end_fn,
+          next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          start_inputs, initial_state=initial_state)
+
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = (sample_ids == end_token)
+      expected_step_next_inputs = np.zeros((batch_size, vocabulary_size))
+      expected_step_next_inputs[np.arange(batch_size), sample_ids] = 1.0
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithInferenceHelperMultilabel(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size
+    start_token = 0
+    end_token = 6
+
+    start_inputs = array_ops.one_hot(
+        np.ones(batch_size, dtype=np.int32) * start_token,
+        vocabulary_size)
+
+    # The sample function samples independent bernoullis from the logits.
+    sample_fn = (
+        lambda x: sampler_py.bernoulli_sample(logits=x, dtype=dtypes.bool))
+    # The next inputs are a one-hot encoding of the sampled labels.
+    next_inputs_fn = math_ops.to_float
+    end_fn = lambda sample_ids: sample_ids[:, end_token]
+
+    with self.cached_session(use_gpu=True):
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.InferenceSampler(
+          sample_fn, sample_shape=[cell_depth], sample_dtype=dtypes.bool,
+          end_fn=end_fn, next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          start_inputs, initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth, cell_depth),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.bool),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = sample_ids[:, end_token]
+      expected_step_next_inputs = sample_ids.astype(np.float32)
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 5e28e651c666b1c448f778fc9c02d637ce817bae..8c84cd13588b624d8a50ca2dc3e4432cf6422473 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -25,10 +25,13 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
@@ -184,14 +187,23 @@ class TestArrayShapeChecks(test.TestCase):
         shape=dynamic_shape)
 
     batch_size = array_ops.constant(batch_size)
-    check_op = beam_search_decoder._check_batch_beam(t, batch_size, beam_width)  # pylint: disable=protected-access
 
-    with self.cached_session() as sess:
-      if is_valid:
-        sess.run(check_op)
+    def _test_body():
+      # pylint: disable=protected-access
+      if context.executing_eagerly():
+        beam_search_decoder._check_batch_beam(t, batch_size, beam_width)
       else:
-        with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(check_op)
+        with self.cached_session():
+          check_op = beam_search_decoder._check_batch_beam(
+              t, batch_size, beam_width)
+          self.evaluate(check_op)
+      # pylint: enable=protected-access
+
+    if is_valid:
+      _test_body()
+    else:
+      with self.assertRaises(errors.InvalidArgumentError):
+        _test_body()
 
   def test_array_shape_dynamic_checks(self):
     self._test_array_shape_dynamic_checks(
@@ -460,6 +472,7 @@ class TestLargeBeamStep(test.TestCase):
     self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
 
 
+@test_util.run_v1_only
 class BeamSearchDecoderTest(test.TestCase):
 
   def _testDynamicDecodeRNN(self, time_major, has_attention,
@@ -530,11 +543,10 @@ class BeamSearchDecoderTest(test.TestCase):
           return (shape[1], shape[0]) + shape[2:]
         return shape
 
-      self.assertTrue(
-          isinstance(final_outputs,
-                     beam_search_decoder.FinalBeamSearchDecoderOutput))
-      self.assertTrue(
-          isinstance(final_state, beam_search_decoder.BeamSearchDecoderState))
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
 
       beam_search_decoder_output = final_outputs.beam_search_decoder_output
       self.assertEqual(
@@ -574,5 +586,119 @@ class BeamSearchDecoderTest(test.TestCase):
         with_alignment_history=True)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class BeamSearchDecoderV2Test(test.TestCase):
+
+  def _testDynamicDecodeRNN(self, time_major, has_attention,
+                            with_alignment_history=False):
+    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
+    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
+    batch_size = 5
+    decoder_max_time = 4
+    input_depth = 7
+    cell_depth = 9
+    attention_depth = 6
+    vocab_size = 20
+    end_token = vocab_size - 1
+    start_token = 0
+    embedding_dim = 50
+    max_out = max(decoder_sequence_length)
+    output_layer = layers.Dense(vocab_size, use_bias=True, activation=None)
+    beam_width = 3
+
+    with self.cached_session():
+      batch_size_tensor = constant_op.constant(batch_size)
+      embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      initial_state = cell.zero_state(batch_size, dtypes.float32)
+      coverage_penalty_weight = 0.0
+      if has_attention:
+        coverage_penalty_weight = 0.2
+        inputs = array_ops.placeholder_with_default(
+            np.random.randn(batch_size, decoder_max_time, input_depth).astype(
+                np.float32),
+            shape=(None, None, input_depth))
+        tiled_inputs = beam_search_decoder.tile_batch(
+            inputs, multiplier=beam_width)
+        tiled_sequence_length = beam_search_decoder.tile_batch(
+            encoder_sequence_length, multiplier=beam_width)
+        attention_mechanism = attention_wrapper.BahdanauAttention(
+            num_units=attention_depth,
+            memory=tiled_inputs,
+            memory_sequence_length=tiled_sequence_length)
+        initial_state = beam_search_decoder.tile_batch(
+            initial_state, multiplier=beam_width)
+        cell = attention_wrapper.AttentionWrapper(
+            cell=cell,
+            attention_mechanism=attention_mechanism,
+            attention_layer_size=attention_depth,
+            alignment_history=with_alignment_history)
+      cell_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
+      if has_attention:
+        cell_state = cell_state.clone(cell_state=initial_state)
+      bsd = beam_search_decoder.BeamSearchDecoderV2(
+          cell=cell,
+          beam_width=beam_width,
+          output_layer=output_layer,
+          length_penalty_weight=0.0,
+          coverage_penalty_weight=coverage_penalty_weight,
+          output_time_major=time_major,
+          maximum_iterations=max_out)
+
+      final_outputs, final_state, final_sequence_lengths = bsd(
+          embedding,
+          start_tokens=array_ops.fill([batch_size_tensor], start_token),
+          end_token=end_token,
+          initial_state=cell_state)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
+
+      beam_search_decoder_output = final_outputs.beam_search_decoder_output
+      expected_seq_length = 3 if context.executing_eagerly() else None
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(beam_search_decoder_output.scores.get_shape().as_list()))
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(final_outputs.predicted_ids.get_shape().as_list()))
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_results = self.evaluate({
+          'final_outputs': final_outputs,
+          'final_sequence_lengths': final_sequence_lengths
+      })
+
+      max_sequence_length = np.max(eval_results['final_sequence_lengths'])
+
+      # A smoke test
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)),
+          eval_results['final_outputs'].beam_search_decoder_output.scores.shape)
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)), eval_results[
+              'final_outputs'].beam_search_decoder_output.predicted_ids.shape)
+
+  def testDynamicDecodeRNNBatchMajorNoAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=False)
+
+  def testDynamicDecodeRNNBatchMajorYesAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=True)
+
+  def testDynamicDecodeRNNBatchMajorYesAttentionWithAlignmentHistory(self):
+    self._testDynamicDecodeRNN(
+        time_major=False,
+        has_attention=True,
+        with_alignment_history=True)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index b41734d214e98cd24be0c98ee67f7cb5e58b7a61..5506aa8b8ee259fbacc80bc310cd954bdb66482b 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -49,8 +49,8 @@ class GatherTreeTest(test.TestCase):
         parent_ids=parent_ids,
         max_sequence_lengths=max_sequence_lengths,
         end_token=end_token)
-    with self.session(use_gpu=True):
-      self.assertAllEqual(expected_result, beams.eval())
+    with self.cached_session(use_gpu=True):
+      self.assertAllEqual(expected_result, self.evaluate(beams))
 
   def testBadParentValuesOnCPU(self):
     # (batch_size = 1, max_time = 4, beams = 3)
@@ -62,15 +62,14 @@ class GatherTreeTest(test.TestCase):
         [[[0, 0, 0], [0, -1, 1], [2, 1, 2], [-1, -1, -1]]])
     max_sequence_lengths = [3]
     with ops.device("/cpu:0"):
-      beams = beam_search_ops.gather_tree(
-          step_ids=step_ids,
-          parent_ids=parent_ids,
-          max_sequence_lengths=max_sequence_lengths,
-          end_token=end_token)
-    with self.cached_session():
       with self.assertRaisesOpError(
           r"parent id -1 at \(batch, time, beam\) == \(0, 0, 1\)"):
-        _ = beams.eval()
+        beams = beam_search_ops.gather_tree(
+            step_ids=step_ids,
+            parent_ids=parent_ids,
+            max_sequence_lengths=max_sequence_lengths,
+            end_token=end_token)
+        self.evaluate(beams)
 
   def testBadParentValuesOnGPU(self):
     # Only want to run this test on CUDA devices, as gather_tree is not
@@ -93,8 +92,7 @@ class GatherTreeTest(test.TestCase):
           parent_ids=parent_ids,
           max_sequence_lengths=max_sequence_lengths,
           end_token=end_token)
-    with self.session(use_gpu=True):
-      self.assertAllEqual(expected_result, beams.eval())
+      self.assertAllEqual(expected_result, self.evaluate(beams))
 
   def testGatherTreeBatch(self):
     batch_size = 10
@@ -103,7 +101,7 @@ class GatherTreeTest(test.TestCase):
     max_sequence_lengths = [0, 1, 2, 4, 7, 8, 9, 10, 11, 0]
     end_token = 5
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       step_ids = np.random.randint(
           0, high=end_token + 1, size=(max_time, batch_size, beam_width))
       parent_ids = np.random.randint(
@@ -116,7 +114,7 @@ class GatherTreeTest(test.TestCase):
           end_token=end_token)
 
       self.assertEqual((max_time, batch_size, beam_width), beams.shape)
-      beams_value = beams.eval()
+      beams_value = self.evaluate(beams)
       for b in range(batch_size):
         # Past max_sequence_lengths[b], we emit all end tokens.
         b_value = beams_value[max_sequence_lengths[b]:, b, :]
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
index 4c25489fade320f2f2218354343021a71af01baf..4a420221e27775c2844daaba6b6d2b3b2ce38828 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -13,26 +13,25 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for contrib.seq2seq.python.seq2seq.decoder."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import numpy as np
 
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
-from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
-# pylint: enable=g-import-not-at-top
 
 
+@test_util.run_v1_only
 class DynamicDecodeRNNTest(test.TestCase):
 
   def _testDynamicDecodeRNN(self, time_major, maximum_iterations=None):
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5bba2b32e940aa4d5984821ebd3845d7f272549
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.decoder."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class DecodeV2RNNTest(keras_parameterized.TestCase, test.TestCase):
+  """Tests for DecoderV2."""
+
+  def _testDecodeRNN(self, time_major, maximum_iterations=None):
+
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.cached_session(use_gpu=True):
+      if time_major:
+        inputs = np.random.randn(max_time, batch_size,
+                                 input_depth).astype(np.float32)
+      else:
+        inputs = np.random.randn(batch_size, max_time,
+                                 input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampler = sampler_py.TrainingSampler(time_major=time_major)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler,
+          output_time_major=time_major,
+          maximum_iterations=maximum_iterations)
+
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      (final_outputs, unused_final_state, final_sequence_length) = my_decoder(
+          input_t, initial_state=initial_state, sequence_length=sequence_length)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      if not context.executing_eagerly():
+        self.assertEqual((batch_size,),
+                         tuple(final_sequence_length.get_shape().as_list()))
+        self.assertEqual(
+            _t((batch_size, None, cell_depth)),
+            tuple(final_outputs.rnn_output.get_shape().as_list()))
+        self.assertEqual(
+            _t((batch_size, None)),
+            tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      self.evaluate(variables.global_variables_initializer())
+      final_outputs = self.evaluate(final_outputs)
+      final_sequence_length = self.evaluate(final_sequence_length)
+
+      # Mostly a smoke test
+      time_steps = max_out
+      expected_length = sequence_length
+      if maximum_iterations is not None:
+        time_steps = min(max_out, maximum_iterations)
+        expected_length = [min(x, maximum_iterations) for x in expected_length]
+      if context.executing_eagerly() and maximum_iterations != 0:
+        # Only check the shape of output when maximum_iterations > 0, see
+        # b/123431432 for more details.
+        self.assertEqual(
+            _t((batch_size, time_steps, cell_depth)),
+            final_outputs.rnn_output.shape)
+        self.assertEqual(
+            _t((batch_size, time_steps)), final_outputs.sample_id.shape)
+      self.assertItemsEqual(expected_length, final_sequence_length)
+
+  def testDynamicDecodeRNNBatchMajor(self):
+    self._testDecodeRNN(time_major=False)
+
+  def testDynamicDecodeRNNTimeMajor(self):
+    self._testDecodeRNN(time_major=True)
+
+  def testDynamicDecodeRNNZeroMaxIters(self):
+    self._testDecodeRNN(time_major=True, maximum_iterations=0)
+
+  def testDynamicDecodeRNNOneMaxIter(self):
+    self._testDecodeRNN(time_major=True, maximum_iterations=1)
+
+  def _testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+      self, use_sequence_length):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      inputs = constant_op.constant(inputs)
+
+      cell = rnn_cell.LSTMCell(cell_depth)
+      zero_state = cell.zero_state(dtype=dtypes.float32, batch_size=batch_size)
+      sampler = sampler_py.TrainingSampler()
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell, sampler=sampler, impute_finished=use_sequence_length)
+
+      final_decoder_outputs, final_decoder_state, _ = my_decoder(
+          inputs, initial_state=zero_state, sequence_length=sequence_length)
+
+      final_rnn_outputs, final_rnn_state = rnn.dynamic_rnn(
+          cell,
+          inputs,
+          sequence_length=sequence_length if use_sequence_length else None,
+          initial_state=zero_state)
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "final_decoder_outputs": final_decoder_outputs,
+          "final_decoder_state": final_decoder_state,
+          "final_rnn_outputs": final_rnn_outputs,
+          "final_rnn_state": final_rnn_state
+      })
+
+      # Decoder only runs out to max_out; ensure values are identical
+      # to dynamic_rnn, which also zeros out outputs and passes along state.
+      self.assertAllClose(eval_result["final_decoder_outputs"].rnn_output,
+                          eval_result["final_rnn_outputs"][:, 0:max_out, :])
+      if use_sequence_length:
+        self.assertAllClose(eval_result["final_decoder_state"],
+                            eval_result["final_rnn_state"])
+
+  def testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNNWithSeqLen(self):
+    self._testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+        use_sequence_length=True)
+
+  def testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNNNoSeqLen(self):
+    self._testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+        use_sequence_length=False)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 41b2a53ca5b178be9b04446c81d832575e5ed75b..7eb544a921c595c667083b783757f4b719be5aa0 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.platform import test
 @test_util.run_all_in_graph_and_eager_modes
 class LossTest(test.TestCase):
 
-  def setUp(self):
+  def config_default_values(self):
     self.batch_size = 2
     self.sequence_length = 3
     self.number_of_classes = 5
@@ -56,7 +56,8 @@ class LossTest(test.TestCase):
     self.expected_loss = 1.60944
 
   def testSequenceLoss(self):
-    with self.test_session(use_gpu=True):
+    self.config_default_values()
+    with self.cached_session(use_gpu=True):
       average_loss_per_example = loss.sequence_loss(
           self.logits, self.targets, self.weights,
           average_across_timesteps=True,
@@ -90,7 +91,8 @@ class LossTest(test.TestCase):
       self.assertAllClose(compare_total, res)
 
   def testSequenceLossClass(self):
-    with self.test_session(use_gpu=True):
+    self.config_default_values()
+    with self.cached_session(use_gpu=True):
       seq_loss = loss.SequenceLoss(average_across_timesteps=True,
                                    average_across_batch=True,
                                    sum_over_timesteps=False,
@@ -132,7 +134,8 @@ class LossTest(test.TestCase):
       self.assertAllClose(compare_total, res)
 
   def testSumReduction(self):
-    with self.test_session(use_gpu=True):
+    self.config_default_values()
+    with self.cached_session(use_gpu=True):
       seq_loss = loss.SequenceLoss(average_across_timesteps=False,
                                    average_across_batch=False,
                                    sum_over_timesteps=True,
@@ -174,6 +177,7 @@ class LossTest(test.TestCase):
       self.assertAllClose(compare_total, res)
 
   def testWeightedSumReduction(self):
+    self.config_default_values()
     weights = [
         constant_op.constant(1.0, shape=[self.batch_size])
         for _ in range(self.sequence_length)
@@ -181,7 +185,7 @@ class LossTest(test.TestCase):
     # Make the last element in the sequence to have zero weights.
     weights[-1] = constant_op.constant(0.0, shape=[self.batch_size])
     self.weights = array_ops.stack(weights, axis=1)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       seq_loss = loss.SequenceLoss(average_across_timesteps=False,
                                    average_across_batch=False,
                                    sum_over_timesteps=True,
@@ -225,12 +229,13 @@ class LossTest(test.TestCase):
       self.assertAllClose(compare_total, res)
 
   def testZeroWeights(self):
+    self.config_default_values()
     weights = [
         constant_op.constant(0.0, shape=[self.batch_size])
         for _ in range(self.sequence_length)
     ]
     weights = array_ops.stack(weights, axis=1)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       average_loss_per_example = loss.sequence_loss(
           self.logits, self.targets, weights,
           average_across_timesteps=True,
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 31c62d5849ac3bdb35cbd00f03b298cb5952162c..577a3efbd7da58e8931c6668af2c2f1be91e7298 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -25,10 +25,13 @@ import math
 import numpy as np
 
 from tensorflow.contrib.framework.python.framework import tensor_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import layers
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -218,25 +221,46 @@ class _BaseAttentionMechanism(AttentionMechanism):
     return self.initial_alignments(batch_size, dtype)
 
 
-class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
+class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
   """A base AttentionMechanism class providing common functionality.
 
   Common functionality includes:
     1. Storing the query and memory layers.
     2. Preprocessing and storing the memory.
 
-  Note that this layer only support Keras functional API since it takes multiple
-  input tensors, which is not available in sequential model.
+  Note that this layer takes memory as its init parameter, which is an
+  anti-pattern of Keras API, we have to keep the memory as init parameter for
+  performance and dependency reason. Under the hood, during `__init__()`, it
+  will invoke `base_layer.__call__(memory, setup_memory=True)`. This will let
+  keras to keep track of the memory tensor as the input of this layer. Once
+  the `__init__()` is done, then user can query the attention by
+  `score = att_obj([query, state])`, and use it as a normal keras layer.
+
+  Special attention is needed when adding using this class as the base layer for
+  new attention:
+    1. Build() could be invoked at least twice. So please make sure weights are
+       not duplicated.
+    2. Layer.get_weights() might return different set of weights if the instance
+       has `query_layer`. The query_layer weights is not initialized until the
+       memory is configured.
+
+  Also note that this layer does not work with Keras model when
+  `model.compile(run_eagerly=True)` due to the fact that this layer is stateful.
+  The support for that will be added in a future version.
   """
 
   def __init__(self,
+               memory,
                probability_fn,
                query_layer=None,
                memory_layer=None,
+               memory_sequence_length=None,
                **kwargs):
     """Construct base AttentionMechanism class.
 
     Args:
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
       probability_fn: A `callable`. Converts the score and previous alignments
         to probabilities. Its signature should be:
         `probabilities = probability_fn(score, state)`.
@@ -247,15 +271,18 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
         depth must match the depth of `query_layer`.
         If `memory_layer` is not provided, the shape of `memory` must match
         that of `query_layer`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       **kwargs: Dictionary that contains other common arguments for layer
         creation.
     """
     if (query_layer is not None
-        and not isinstance(query_layer, layers_base.Layer)):
+        and not isinstance(query_layer, layers.Layer)):
       raise TypeError(
           "query_layer is not a Layer: %s" % type(query_layer).__name__)
     if (memory_layer is not None
-        and not isinstance(memory_layer, layers_base.Layer)):
+        and not isinstance(memory_layer, layers.Layer)):
       raise TypeError(
           "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
     self.query_layer = query_layer
@@ -273,18 +300,127 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
     self.batch_size = None
     self._memory_initialized = False
     self._check_inner_dims_defined = True
+    self.supports_masking = True
+    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
+
+    if memory is not None:
+      # Setup the memory by self.__call__() with memory and memory_seq_length.
+      # This will make the attention follow the keras convention which takes
+      # all the tensor inputs via __call__().
+      if memory_sequence_length is None:
+        inputs = memory
+      else:
+        inputs = [memory, memory_sequence_length]
+
+      self.values = super(_BaseAttentionMechanismV2, self).__call__(
+          inputs, setup_memory=True)
 
   def build(self, input_shape):
-    if self.query_layer is not None:
-      self.query_layer.build(input_shape)
-    if self.memory_layer is not None:
-      self.memory_layer.build(input_shape)
-    # dtype of the layer is known at this moment, create the score_mask_value if
-    # needed.
-    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
-    self.built = True
+    if not self._memory_initialized:
+      # This is for setting up the memory, which contains memory and optional
+      # memory_sequence_length. Build the memory_layer with memory shape.
+      if self.memory_layer is not None and not self.memory_layer.built:
+        if isinstance(input_shape, list):
+          self.memory_layer.build(input_shape[0])
+        else:
+          self.memory_layer.build(input_shape)
+    else:
+      # The input_shape should be query.shape and state.shape. Use the query
+      # to init the query layer.
+      if self.query_layer is not None and not self.query_layer.built:
+        self.query_layer.build(input_shape[0])
+
+  def __call__(self, inputs, **kwargs):
+    """Preprocess the inputs before calling `base_layer.__call__()`.
+
+    Note that there are situation here, one for setup memory, and one with
+    actual query and state.
+    1. When the memory has not been configured, we just pass all the param to
+    base_layer.__call__(), which will then invoke self.call() with proper
+    inputs, which allows this class to setup memory.
+    2. When the memory has already been setup, the input should contain query
+    and state, and optionally processed memory. If the processed memory is
+    not included in the input, we will have to append it to the inputs and
+    give it to the base_layer.__call__(). The processed memory is the output
+    of first invocation of self.__call__(). If we don't add it here, then from
+    keras perspective, the graph is disconnected since the output from
+    previous call is never used.
 
-  def _setup_memory(self, memory, memory_mask=None):
+    Args:
+      inputs: the inputs tensors.
+      **kwargs: dict, other keyeword arguments for the `__call__()`
+    """
+    if self._memory_initialized:
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have 2 or 3 tensors, got %d" %
+                         len(inputs))
+      if len(inputs) == 2:
+        # We append the calculated memory here so that the graph will be
+        # connected.
+        inputs.append(self.values)
+    return super(_BaseAttentionMechanismV2, self).__call__(inputs, **kwargs)
+
+  def call(self, inputs, mask=None, setup_memory=False, **kwargs):
+    """Setup the memory or query the attention.
+
+    There are two case here, one for setup memory, and the second is query the
+    attention score. `setup_memory` is the flag to indicate which mode it is.
+    The input list will be treated differently based on that flag.
+
+    Args:
+      inputs: a list of tensor that could either be `query` and `state`, or
+        `memory` and `memory_sequence_length`.
+        `query` is the tensor of dtype matching `memory` and shape
+        `[batch_size, query_depth]`.
+        `state` is the tensor of dtype matching `memory` and shape
+        `[batch_size, alignments_size]`. (`alignments_size` is memory's
+        `max_time`).
+        `memory` is the memory to query; usually the output of an RNN encoder.
+        The tensor should be shaped `[batch_size, max_time, ...]`.
+        `memory_sequence_length` (optional) is the sequence lengths for the
+         batch entries in memory. If provided, the memory tensor rows are masked
+        with zeros for values past the respective sequence lengths.
+      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
+        memory. If it is not None, the corresponding item of the memory should
+        be filtered out during calculation.
+      setup_memory: boolean, whether the input is for setting up memory, or
+        query attention.
+      **kwargs: Dict, other keyword arguments for the call method.
+    Returns:
+      Either processed memory or attention score, based on `setup_memory`.
+    """
+    if setup_memory:
+      if isinstance(inputs, list):
+        if len(inputs) not in (1, 2):
+          raise ValueError("Expect inputs to have 1 or 2 tensors, got %d" %
+                           len(inputs))
+        memory = inputs[0]
+        memory_sequence_length = inputs[1] if len(inputs) == 2 else None
+        memory_mask = mask
+      else:
+        memory, memory_sequence_length = inputs, None
+        memory_mask = mask
+      self._setup_memory(memory, memory_sequence_length, memory_mask)
+      # We force the self.built to false here since only memory is initialized,
+      # but the real query/state has not been call() yet. The layer should be
+      # build and call again.
+      self.built = False
+      # Return the processed memory in order to create the Keras connectivity
+      # data for it.
+      return self.values
+    else:
+      if not self._memory_initialized:
+        raise ValueError("Cannot query the attention before the setup of "
+                         "memory")
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have query, state, and optional "
+                         "processed memory, got %d items" % len(inputs))
+      # Ignore the rest of the inputs and only care about the query and state
+      query, state = inputs[0], inputs[1]
+      return self._calculate_attention(query, state)
+
+  def _setup_memory(self, memory, memory_sequence_length=None,
+                    memory_mask=None):
     """Pre-process the memory before actually query the memory.
 
     This should only be called once at the first invocation of call().
@@ -292,17 +428,30 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
     Args:
       memory: The memory to query; usually the output of an RNN encoder. This
         tensor should be shaped `[batch_size, max_time, ...]`.
-      memory_mask: The boolean tensor with shape `[batch_size, max_time]`. For
-        any value equal to False, the corresponding value in memory should be
-        ignored.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros for
+        values past the respective sequence lengths.
+      memory_mask: (Optional) The boolean tensor with shape `[batch_size,
+        max_time]`. For any value equal to False, the corresponding value in
+        memory should be ignored.
     """
     if self._memory_initialized:
       raise ValueError("The memory for the attention has already been setup.")
+    if memory_sequence_length is not None and memory_mask is not None:
+      raise ValueError("memory_sequence_length and memory_mask cannot be "
+                       "used at same time for attention.")
     with ops.name_scope(
         self.name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self.values = _prepare_memory(
-          memory, memory_mask=memory_mask,
+          memory,
+          memory_sequence_length=memory_sequence_length,
+          memory_mask=memory_mask,
           check_inner_dims_defined=self._check_inner_dims_defined)
+      # Mark the value as check since the memory and memory mask might not
+      # passed from __call__(), which does not have proper keras metadata.
+      # TODO(omalleyt): Remove this hack once the mask the has proper keras
+      # history.
+      base_layer_utils.mark_checked(self.values)
       if self.memory_layer is not None:
         self.keys = self.memory_layer(self.values)
       else:
@@ -310,39 +459,28 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
       self.batch_size = (
           tensor_shape.dimension_value(self.keys.shape[0]) or
           array_ops.shape(self.keys)[0])
-      self.alignments_size = (tensor_shape.dimension_value(self.keys.shape[1])
-                              or array_ops.shape(self.keys)[1])
+      self._alignments_size = (tensor_shape.dimension_value(self.keys.shape[1])
+                               or array_ops.shape(self.keys)[1])
       if memory_mask is not None:
-        self.probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
-            self.probability_fn(_maybe_mask_score(
-                score, self.score_mask_value, memory_mask=memory_mask), prev))
+        unwrapped_probability_fn = self.probability_fn
+        def _mask_probability_fn(score, prev):
+          return unwrapped_probability_fn(
+              _maybe_mask_score(
+                  score,
+                  memory_mask=memory_mask,
+                  memory_sequence_length=memory_sequence_length,
+                  score_mask_value=self.score_mask_value), prev)
+        self.probability_fn = _mask_probability_fn
     self._memory_initialized = True
 
-  def call(self, inputs, mask=None, **kwargs):
-    """Base method to calculate the attention score.
-
-    Args:
-      inputs: a list of tensor that contains `query`, `state`, and `memory`.
-        `query` is the tensor of dtype matching `memory` and shape
-        `[batch_size, query_depth]`.
-        `state` is the tensor of dtype matching `memory` and shape
-        `[batch_size, alignments_size]`. (`alignments_size` is memory's
-        `max_time`).
-        `memory` is the memory to query; usually the output of an RNN encoder.
-        This tensor should be shaped `[batch_size, max_time, feature]`.
-      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
-        memory. If it is not None, the corresponding item of the memory should
-        be filtered out during calculation.
-      **kwargs: Dict, other keyword arguments for the call method.
-    """
-    query, state, memory, memory_mask = self._process_inputs(inputs, mask)
-    if not self._memory_initialized:
-      self._setup_memory(memory, memory_mask=memory_mask)
-    return self.calculate_attention(query, state)
-
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     raise NotImplementedError(
-        "calculate_attention need to be implemented by subclasses.")
+        "_calculate_attention need to be implemented by subclasses.")
+
+  def compute_mask(self, inputs, mask=None):
+    # There real input of the attention is query and state, and the memory layer
+    # mask shouldn't be pass down. Returning None for all output mask here.
+    return None, None
 
   def get_config(self):
     config = {}
@@ -359,16 +497,12 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
           "class_name": self.memory_layer.__class__.__name__,
           "config": self.memory_layer.get_config(),
       }
+    # memory is a required init parameter and its a tensor. It cannot be
+    # serialized to config, so we put a placeholder for it.
+    config["memory"] = None
     base_config = super(_BaseAttentionMechanismV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
-  def _process_inputs(self, inputs, mask):
-    if len(inputs) != 3:
-      raise ValueError(
-          "Expect to have 3 inputs for attention, got %d" % len(inputs))
-    query, state, memory = inputs
-    return query, state, memory, mask
-
   def _process_probability_fn(self, func_name):
     """Helper method to retrieve the probably function by string input."""
     valid_probability_fns = {
@@ -398,6 +532,8 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
     """
     # Reconstruct the query and memory layer for parent class.
     from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    # Instead of updating the input, create a copy and use that.
+    config = config.copy()
     query_layer_config = config.pop("query_layer", None)
     if query_layer_config:
       query_layer = deserialize_layer(query_layer_config,
@@ -410,6 +546,50 @@ class _BaseAttentionMechanismV2(AttentionMechanism, Layer):
       config["memory_layer"] = memory_layer
     return config
 
+  @property
+  def alignments_size(self):
+    return self._alignments_size
+
+  @property
+  def state_size(self):
+    return self._alignments_size
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return a tensor of all zeros.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return _zero_state_tensors(max_time, batch_size, dtype)
+
+  def initial_state(self, batch_size, dtype):
+    """Creates the initial state values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return the same output as initial_alignments.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A structure of all-zero tensors with shapes as described by `state_size`.
+    """
+    return self.initial_alignments(batch_size, dtype)
+
 
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
@@ -579,6 +759,8 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                scale=False,
                probability_fn="softmax",
                dtype=None,
@@ -588,6 +770,11 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
 
     Args:
       units: The depth of the attention mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       scale: Python boolean. Whether to scale the energy term.
       probability_fn: (optional) string, the name of function to convert the
         attention score to probabilities. The default is `softmax` which is
@@ -606,27 +793,31 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
     wrapped_probability_fn = lambda score, _: probability_fn(score)
     if dtype is None:
       dtype = dtypes.float32
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.scale = scale
+    self.scale_weight = None
     super(LuongAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
         query_layer=None,
-        memory_layer=layers_core.Dense(
-            units, name="memory_layer", use_bias=False, dtype=dtype),
+        memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.scale = scale
 
   def build(self, input_shape):
     super(LuongAttentionV2, self).build(input_shape)
-    if self.scale:
+    if self.scale and self.scale_weight is None:
       self.scale_weight = self.add_weight(
           "attention_g", initializer=init_ops.ones_initializer, shape=())
-    else:
-      self.scale_weight = None
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -640,6 +831,7 @@ class LuongAttentionV2(_BaseAttentionMechanismV2):
       alignments: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]` (`alignments_size` is memory's
         `max_time`).
+      next_state: Same as the alignments.
     """
     score = _luong_score(query, self.keys, self.scale_weight)
     alignments = self.probability_fn(score, state)
@@ -839,8 +1031,11 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                normalize=False,
                probability_fn="softmax",
+               kernel_initializer="glorot_uniform",
                dtype=None,
                name="BahdanauAttention",
                **kwargs):
@@ -848,12 +1043,19 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
       probability_fn: (optional) string, the name of function to convert the
         attention score to probabilities. The default is `softmax` which is
         `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
         this module. Any other value will result into validation error. Default
         to use `softmax`.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
       dtype: The data type for the query and memory layers of the attention
         mechanism.
       name: Name to use when creating ops.
@@ -865,35 +1067,47 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
     wrapped_probability_fn = lambda score, _: probability_fn(score)
     if dtype is None:
       dtype = dtypes.float32
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.normalize = normalize
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_g = None
+    self.attention_b = None
     super(BahdanauAttentionV2, self).__init__(
-        query_layer=layers_core.Dense(
-            units, name="query_layer", use_bias=False, dtype=dtype),
-        memory_layer=layers_core.Dense(
-            units, name="memory_layer", use_bias=False, dtype=dtype),
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=query_layer,
+        memory_layer=memory_layer,
         probability_fn=wrapped_probability_fn,
         name=name,
         dtype=dtype,
         **kwargs)
-    self.units = units
-    self.normalize = normalize
 
   def build(self, input_shape):
     super(BahdanauAttentionV2, self).build(input_shape)
-    self.attention_v = self.add_weight(
-        "attention_v", [self.units], dtype=self.dtype)
-    if self.normalize:
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units],
+          dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.normalize and self.attention_g is None and self.attention_b is None:
       self.attention_g = self.add_weight(
           "attention_g", initializer=init_ops.constant_initializer(
               math.sqrt((1. / self.units))), shape=())
       self.attention_b = self.add_weight(
           "attention_b", shape=[self.units],
           initializer=init_ops.zeros_initializer())
-    else:
-      self.attention_g = None
-      self.attention_b = None
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -907,6 +1121,7 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
       alignments: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]` (`alignments_size` is memory's
         `max_time`).
+      next_state: same as alignments.
     """
     processed_query = self.query_layer(query) if self.query_layer else query
     score = _bahdanau_score(processed_query, self.keys, self.attention_v,
@@ -921,6 +1136,7 @@ class BahdanauAttentionV2(_BaseAttentionMechanismV2):
         "units": self.units,
         "normalize": self.normalize,
         "probability_fn": self.probability_fn_name,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer)
     }
     base_config = super(BahdanauAttentionV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1280,11 +1496,14 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                normalize=False,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
                score_bias_init=0.,
                mode="parallel",
+               kernel_initializer="glorot_uniform",
                dtype=None,
                name="BahdanauMonotonicAttention",
                **kwargs):
@@ -1292,6 +1511,11 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       normalize: Python boolean. Whether to normalize the energy term.
       sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring
         for `_monotonic_probability_fn` for more information.
@@ -1302,6 +1526,8 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
       mode: How to compute the attention distribution. Must be one of
         'recursive', 'parallel', or 'hard'. See the docstring for
         `tf.contrib.seq2seq.monotonic_attention` for more information.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
       dtype: The data type for the query and memory layers of the attention
         mechanism.
       name: Name to use when creating ops.
@@ -1314,34 +1540,47 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
     wrapped_probability_fn = functools.partial(
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
-    super(BahdanauMonotonicAttentionV2, self).__init__(
-        query_layer=layers_core.Dense(
-            units, name="query_layer", use_bias=False, dtype=dtype),
-        memory_layer=layers_core.Dense(
-            units, name="memory_layer", use_bias=False, dtype=dtype),
-        probability_fn=wrapped_probability_fn,
-        name=name,
-        dtype=dtype,
-        **kwargs)
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
     self.units = units
     self.normalize = normalize
     self.sigmoid_noise = sigmoid_noise
     self.sigmoid_noise_seed = sigmoid_noise_seed
     self.score_bias_init = score_bias_init
     self.mode = mode
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_score_bias = None
+    self.attention_g = None
+    self.attention_b = None
+    super(BahdanauMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=query_layer,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
 
   def build(self, input_shape):
     super(BahdanauMonotonicAttentionV2, self).build(input_shape)
-    self.attention_v = self.add_weight(
-        "attention_v", [self.units], dtype=self.dtype)
-    self.attention_score_bias = self.add_weight(
-        "attention_score_bias", shape=(), dtype=self.dtype,
-        initializer=init_ops.constant_initializer(
-            self.score_bias_init, dtype=self.dtype))
-    if not self.normalize:
-      self.attention_g = None
-      self.attention_b = None
-    else:
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units], dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(), dtype=self.dtype,
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
+    if self.normalize and self.attention_g is None and self.attention_b is None:
       self.attention_g = self.add_weight(
           "attention_g", dtype=self.dtype,
           initializer=init_ops.constant_initializer(
@@ -1352,7 +1591,7 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
           initializer=init_ops.zeros_initializer())
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -1384,6 +1623,7 @@ class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
         "sigmoid_noise_seed": self.sigmoid_noise_seed,
         "score_bias_init": self.score_bias_init,
         "mode": self.mode,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer),
     }
     base_config = super(BahdanauMonotonicAttentionV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1517,6 +1757,8 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
   def __init__(self,
                units,
+               memory,
+               memory_sequence_length=None,
                scale=False,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
@@ -1529,6 +1771,11 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
 
     Args:
       units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
       scale: Python boolean.  Whether to scale the energy term.
       sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
         for `_monotonic_probability_fn` for more information.
@@ -1551,35 +1798,41 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
     wrapped_probability_fn = functools.partial(
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
-    super(LuongMonotonicAttentionV2, self).__init__(
-        query_layer=None,
-        memory_layer=layers_core.Dense(
-            units, name="memory_layer", use_bias=False, dtype=dtype),
-        probability_fn=wrapped_probability_fn,
-        name=name,
-        dtype=dtype,
-        **kwargs)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
     self.units = units
     self.scale = scale
     self.sigmoid_noise = sigmoid_noise
     self.sigmoid_noise_seed = sigmoid_noise_seed
     self.score_bias_init = score_bias_init
     self.mode = mode
+    self.attention_g = None
+    self.attention_score_bias = None
+    super(LuongMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=None,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
 
   def build(self, input_shape):
     super(LuongMonotonicAttentionV2, self).build(input_shape)
-    if self.scale:
+    if self.scale and self.attention_g is None:
       self.attention_g = self.add_weight(
           "attention_g", initializer=init_ops.ones_initializer, shape=())
-    else:
-      self.attention_g = None
-    self.attention_score_bias = self.add_weight(
-        "attention_score_bias", shape=(),
-        initializer=init_ops.constant_initializer(
-            self.score_bias_init, dtype=self.dtype))
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(),
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
     self.built = True
 
-  def calculate_attention(self, query, state):
+  def _calculate_attention(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
@@ -1593,6 +1846,7 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
       alignments: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]` (`alignments_size` is memory's
         `max_time`).
+      next_state: Same as alignments
     """
     score = _luong_score(query, self.keys, self.attention_g)
     score += self.attention_score_bias
@@ -1603,7 +1857,7 @@ class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
   def get_config(self):
     config = {
         "units": self.units,
-        "normalize": self.normalize,
+        "scale": self.scale,
         "sigmoid_noise": self.sigmoid_noise,
         "sigmoid_noise_seed": self.sigmoid_noise_seed,
         "score_bias_init": self.score_bias_init,
@@ -1666,7 +1920,15 @@ class AttentionWrapperState(
     def with_same_shape(old, new):
       """Check and set new tensor's shape."""
       if isinstance(old, ops.Tensor) and isinstance(new, ops.Tensor):
-        return tensor_util.with_same_shape(old, new)
+        if not context.executing_eagerly():
+          return tensor_util.with_same_shape(old, new)
+        else:
+          if old.shape.as_list() != new.shape.as_list():
+            raise ValueError("The shape of the AttentionWrapperState is "
+                             "expected to be same as the one to clone. "
+                             "self.shape: %s, input.shape: %s" %
+                             (old.shape, new.shape))
+          return new
       return new
 
     return nest.map_structure(
@@ -1710,41 +1972,26 @@ def _prepare_memory(memory, memory_sequence_length=None, memory_mask=None,
                          "but saw shape: %s" % (m.name, m.get_shape()))
     nest.map_structure(_check_dims, memory)
   if memory_sequence_length is None and memory_mask is None:
-    seq_len_mask = None
-    seq_len_batch_size = None
+    return memory
   elif memory_sequence_length is not None:
     seq_len_mask = array_ops.sequence_mask(
         memory_sequence_length,
         maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
         dtype=nest.flatten(memory)[0].dtype)
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_sequence_length.shape[0])
-        or array_ops.shape(memory_sequence_length)[0])
   else:
     # For memory_mask is not None
-    seq_len_mask = memory_mask
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_mask.shape[0])
-        or array_ops.shape(memory_mask)[0])
+    seq_len_mask = math_ops.cast(
+        memory_mask, dtype=nest.flatten(memory)[0].dtype)
   def _maybe_mask(m, seq_len_mask):
     """Mask the memory based on the memory mask."""
     rank = m.get_shape().ndims
     rank = rank if rank is not None else array_ops.rank(m)
     extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
-    m_batch_size = tensor_shape.dimension_value(
-        m.shape[0]) or array_ops.shape(m)[0]
-    if seq_len_batch_size is not None:
-      message = ("memory_sequence_length and memory tensor batch sizes do not "
-                 "match.")
-      with ops.control_dependencies([
-          check_ops.assert_equal(
-              seq_len_batch_size, m_batch_size, message=message)]):
-        seq_len_mask = array_ops.reshape(
-            seq_len_mask,
-            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
-        return m * seq_len_mask
-    else:
-      return m
+    seq_len_mask = array_ops.reshape(
+        seq_len_mask,
+        array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
+    return m * seq_len_mask
+
   return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
 
 
@@ -1790,8 +2037,14 @@ def hardmax(logits, name=None):
 def _compute_attention(attention_mechanism, cell_output, attention_state,
                        attention_layer):
   """Computes the attention and alignments for a given attention_mechanism."""
-  alignments, next_attention_state = attention_mechanism(
-      cell_output, state=attention_state)
+  if isinstance(attention_mechanism, _BaseAttentionMechanismV2):
+    alignments, next_attention_state = attention_mechanism(
+        [cell_output, attention_state])
+  else:
+    # For other class, assume they are following _BaseAttentionMechanism, which
+    # takes query and state as separate parameter.
+    alignments, next_attention_state = attention_mechanism(
+        cell_output, state=attention_state)
 
   # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
   expanded_alignments = array_ops.expand_dims(alignments, 1)
@@ -1804,13 +2057,13 @@ def _compute_attention(attention_mechanism, cell_output, attention_state,
   # the batched matmul is over memory_time, so the output shape is
   #   [batch_size, 1, memory_size].
   # we then squeeze out the singleton dim.
-  context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
-  context = array_ops.squeeze(context, [1])
+  context_ = math_ops.matmul(expanded_alignments, attention_mechanism.values)
+  context_ = array_ops.squeeze(context_, [1])
 
   if attention_layer is not None:
-    attention = attention_layer(array_ops.concat([cell_output, context], 1))
+    attention = attention_layer(array_ops.concat([cell_output, context_], 1))
   else:
-    attention = context
+    attention = context_
 
   return attention, alignments, next_attention_state
 
@@ -2094,7 +2347,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
       if self._initial_cell_state is not None:
         cell_state = self._initial_cell_state
       else:
-        cell_state = self._cell.zero_state(batch_size, dtype)
+        cell_state = self._cell.get_initial_state(batch_size=batch_size,
+                                                  dtype=dtype)
       error_message = (
           "When calling zero_state of AttentionWrapper %s: " % self._base_name +
           "Non-matching batch sizes between the memory "
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index 7eb95e5a70de985dca0d4b565ba03bdf454b6161..16dfa7ed8268d761dee49ec0146efabcaaef1393 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -23,8 +23,10 @@ import collections
 
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.util import nest
@@ -146,3 +148,102 @@ class BasicDecoder(decoder.Decoder):
           sample_ids=sample_ids)
     outputs = BasicDecoderOutput(cell_outputs, sample_ids)
     return (outputs, next_state, next_inputs, finished)
+
+
+class BasicDecoderV2(decoder.BaseDecoder):
+  """Basic sampling decoder."""
+
+  def __init__(self, cell, sampler, output_layer=None, **kwargs):
+    """Initialize BasicDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      sampler: A `Sampler` instance.
+      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
+        `tf.layers.Dense`. Optional layer to apply to the RNN output prior to
+        storing the result or sampling.
+      **kwargs: Other keyward arguments for layer creation.
+
+    Raises:
+      TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
+    """
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
+    if not isinstance(sampler, sampler_py.Sampler):
+      raise TypeError("sampler must be a Sampler, received: %s" % (sampler,))
+    if (output_layer is not None and
+        not isinstance(output_layer, layers.Layer)):
+      raise TypeError(
+          "output_layer must be a Layer, received: %s" % (output_layer,))
+    self.cell = cell
+    self.sampler = sampler
+    self.output_layer = output_layer
+    super(BasicDecoderV2, self).__init__(**kwargs)
+
+  def initialize(self, inputs, initial_state=None, **kwargs):
+    """Initialize the decoder."""
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    self._cell_dtype = nest.flatten(initial_state)[0].dtype
+    return self.sampler.initialize(inputs, **kwargs) + (initial_state,)
+
+  @property
+  def batch_size(self):
+    return self.sampler.batch_size
+
+  def _rnn_output_size(self):
+    size = tensor_shape.TensorShape(self.cell.output_size)
+    if self.output_layer is None:
+      return size
+    else:
+      # To use layer's compute_output_shape, we need to convert the
+      # RNNCell's output_size entries into shapes with an unknown
+      # batch size.  We then pass this through the layer's
+      # compute_output_shape and read off all but the first (batch)
+      # dimensions to get the output size of the rnn with the layer
+      # applied to the top.
+      output_shape_with_unknown_batch = nest.map_structure(
+          lambda s: tensor_shape.TensorShape([None]).concatenate(s), size)
+      layer_output_shape = self.output_layer.compute_output_shape(
+          output_shape_with_unknown_batch)
+      return nest.map_structure(lambda s: s[1:], layer_output_shape)
+
+  @property
+  def output_size(self):
+    # Return the cell output and the id
+    return BasicDecoderOutput(
+        rnn_output=self._rnn_output_size(),
+        sample_id=self.sampler.sample_ids_shape)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and the sample_ids_dtype from the helper.
+    dtype = self._cell_dtype
+    return BasicDecoderOutput(
+        nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        self.sampler.sample_ids_dtype)
+
+  def step(self, time, inputs, state):
+    """Perform a decoding step.
+
+    Args:
+      time: scalar `int32` tensor.
+      inputs: A (structure of) input tensors.
+      state: A (structure of) state tensors and TensorArrays.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`.
+    """
+    cell_outputs, cell_state = self.cell(inputs, state)
+    if self.output_layer is not None:
+      cell_outputs = self.output_layer(cell_outputs)
+    sample_ids = self.sampler.sample(
+        time=time, outputs=cell_outputs, state=cell_state)
+    (finished, next_inputs, next_state) = self.sampler.next_inputs(
+        time=time,
+        outputs=cell_outputs,
+        state=cell_state,
+        sample_ids=sample_ids)
+    outputs = BasicDecoderOutput(cell_outputs, sample_ids)
+    return (outputs, next_state, next_inputs, finished)
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 8f8f057702951094758b277ce060955f3dc6e99d..44b7b2c09203c860cb05889c8556ac088c18f226 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -24,11 +24,12 @@ import numpy as np
 from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.layers import base as layers_base
+from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -148,8 +149,8 @@ def gather_tree_from_array(t, parent_ids, sequence_length):
       array_ops.expand_dims(math_ops.range(beam_width), 0), 0)
   beam_ids = array_ops.tile(beam_ids, [max_time, batch_size, 1])
 
-  max_sequence_lengths = math_ops.to_int32(
-      math_ops.reduce_max(sequence_length, axis=1))
+  max_sequence_lengths = math_ops.cast(
+      math_ops.reduce_max(sequence_length, axis=1), dtypes.int32)
   sorted_beam_ids = beam_search_ops.gather_tree(
       step_ids=beam_ids,
       parent_ids=parent_ids,
@@ -182,11 +183,12 @@ def gather_tree_from_array(t, parent_ids, sequence_length):
   return ordered
 
 
-def _check_maybe(t):
+def _check_ndims(t):
   if t.shape.ndims is None:
     raise ValueError(
         "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
+
 def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
   """Raises an exception if dimensions are known statically and can not be
   reshaped to [batch_size, beam_size, -1].
@@ -205,6 +207,7 @@ def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
     return False
   return True
 
+
 def _check_batch_beam(t, batch_size, beam_width):
   """Returns an Assert operation checking that the elements of the stacked
   TensorArray can be reshaped to [batch_size, beam_size, -1]. At this point,
@@ -215,7 +218,7 @@ def _check_batch_beam(t, batch_size, beam_width):
                    "incompatible with the dynamic shape of %s elements. "
                    "Consider setting reorder_tensor_arrays to False to disable "
                    "TensorArray reordering during the beam search."
-                   % (t.name))
+                   % (t if context.executing_eagerly() else t.name))
   rank = t.shape.ndims
   shape = array_ops.shape(t)
   if rank == 2:
@@ -229,70 +232,30 @@ def _check_batch_beam(t, batch_size, beam_width):
   return control_flow_ops.Assert(condition, [error_message])
 
 
+class BeamSearchDecoderMixin(object):
+  """BeamSearchDecoderMixin contains the common methods for BeamSearchDecoder.
 
-class BeamSearchDecoder(decoder.Decoder):
-  """BeamSearch sampling decoder.
-
-    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
-    `AttentionWrapper`, then you must ensure that:
-
-    - The encoder output has been tiled to `beam_width` via
-      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
-    - The `batch_size` argument passed to the `zero_state` method of this
-      wrapper is equal to `true_batch_size * beam_width`.
-    - The initial state created with `zero_state` above contains a
-      `cell_state` value containing properly tiled final state from the
-      encoder.
-
-    An example:
-
-    ```
-    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
-        encoder_outputs, multiplier=beam_width)
-    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
-        encoder_final_state, multiplier=beam_width)
-    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
-        sequence_length, multiplier=beam_width)
-    attention_mechanism = MyFavoriteAttentionMechanism(
-        num_units=attention_depth,
-        memory=tiled_inputs,
-        memory_sequence_length=tiled_sequence_length)
-    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
-    decoder_initial_state = attention_cell.zero_state(
-        dtype, batch_size=true_batch_size * beam_width)
-    decoder_initial_state = decoder_initial_state.clone(
-        cell_state=tiled_encoder_final_state)
-    ```
-
-    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
-    when computing scores(https://arxiv.org/pdf/1609.08144.pdf). It encourages
-    the translation to cover all inputs.
+  It is expected to be used a base class for concrete BeamSearchDecoder. Since
+  this is a mixin class, it is expected to be used together with other class as
+  base.
   """
 
   def __init__(self,
                cell,
-               embedding,
-               start_tokens,
-               end_token,
-               initial_state,
                beam_width,
                output_layer=None,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
-               reorder_tensor_arrays=True):
-    """Initialize the BeamSearchDecoder.
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderMixin.
 
     Args:
       cell: An `RNNCell` instance.
-      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
-        or the `params` argument for `embedding_lookup`.
-      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
-      end_token: `int32` scalar, the token that marks end of decoding.
-      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
       beam_width:  Python integer, the number of beams.
-      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
-        `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
-        to storing the result or sampling.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
       length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
       coverage_penalty_weight: Float weight to penalize the coverage of source
         sentence. Disabled with 0.0.
@@ -302,59 +265,35 @@ class BeamSearchDecoder(decoder.Decoder):
         Otherwise, the `TensorArray` will be returned as is. Set this flag to
         `False` if the cell state contains `TensorArray`s that are not amenable
         to reordering.
+      **kwargs: Dict, other keyword arguments for parent class.
 
     Raises:
       TypeError: if `cell` is not an instance of `RNNCell`,
-        or `output_layer` is not an instance of `tf.layers.Layer`.
-      ValueError: If `start_tokens` is not a vector or
-        `end_token` is not a scalar.
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
     """
     rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
     if (output_layer is not None and
-        not isinstance(output_layer, layers_base.Layer)):
+        not isinstance(output_layer, layers.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
     self._output_layer = output_layer
     self._reorder_tensor_arrays = reorder_tensor_arrays
 
-    if callable(embedding):
-      self._embedding_fn = embedding
-    else:
-      self._embedding_fn = (
-          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
-
-    self._start_tokens = ops.convert_to_tensor(
-        start_tokens, dtype=dtypes.int32, name="start_tokens")
-    if self._start_tokens.get_shape().ndims != 1:
-      raise ValueError("start_tokens must be a vector")
-    self._end_token = ops.convert_to_tensor(
-        end_token, dtype=dtypes.int32, name="end_token")
-    if self._end_token.get_shape().ndims != 0:
-      raise ValueError("end_token must be a scalar")
-
-    self._batch_size = array_ops.size(start_tokens)
+    self._start_tokens = None
+    self._end_token = None
+    self._batch_size = None
     self._beam_width = beam_width
     self._length_penalty_weight = length_penalty_weight
     self._coverage_penalty_weight = coverage_penalty_weight
-    self._initial_cell_state = nest.map_structure(
-        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
-    self._start_tokens = array_ops.tile(
-        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
-    self._start_inputs = self._embedding_fn(self._start_tokens)
-
-    self._finished = array_ops.one_hot(
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=False,
-        off_value=True,
-        dtype=dtypes.bool)
+    super(BeamSearchDecoderMixin, self).__init__(**kwargs)
 
   @property
   def batch_size(self):
     return self._batch_size
 
   def _rnn_output_size(self):
+    """Get the output shape from the RNN layer."""
     size = self._cell.output_size
     if self._output_layer is None:
       return size
@@ -393,50 +332,6 @@ class BeamSearchDecoder(decoder.Decoder):
         predicted_ids=tensor_shape.TensorShape([self._beam_width]),
         parent_ids=tensor_shape.TensorShape([self._beam_width]))
 
-  @property
-  def output_dtype(self):
-    # Assume the dtype of the cell is the output_size structure
-    # containing the input_state's first component's dtype.
-    # Return that structure and int32 (the id)
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    return BeamSearchDecoderOutput(
-        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
-        predicted_ids=dtypes.int32,
-        parent_ids=dtypes.int32)
-
-  def initialize(self, name=None):
-    """Initialize the decoder.
-
-    Args:
-      name: Name scope for any created operations.
-
-    Returns:
-      `(finished, start_inputs, initial_state)`.
-    """
-    finished, start_inputs = self._finished, self._start_inputs
-
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
-        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
-        dtype=dtype)
-    init_attention_probs = get_attention_probs(
-        self._initial_cell_state, self._coverage_penalty_weight)
-    if init_attention_probs is None:
-      init_attention_probs = ()
-
-    initial_state = BeamSearchDecoderState(
-        cell_state=self._initial_cell_state,
-        log_probs=log_probs,
-        finished=finished,
-        lengths=array_ops.zeros(
-            [self._batch_size, self._beam_width], dtype=dtypes.int64),
-        accumulated_attention_probs=init_attention_probs)
-
-    return (finished, start_inputs, initial_state)
-
   def finalize(self, outputs, final_state, sequence_lengths):
     """Finalize and return the predicted_ids.
 
@@ -456,8 +351,8 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     del sequence_lengths
     # Get max_sequence_length across all beams for each batch.
-    max_sequence_lengths = math_ops.to_int32(
-        math_ops.reduce_max(final_state.lengths, axis=1))
+    max_sequence_lengths = math_ops.cast(
+        math_ops.reduce_max(final_state.lengths, axis=1), dtypes.int32)
     predicted_ids = beam_search_ops.gather_tree(
         outputs.predicted_ids,
         outputs.parent_ids,
@@ -562,7 +457,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
     else:
@@ -586,7 +481,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
@@ -609,11 +504,18 @@ class BeamSearchDecoder(decoder.Decoder):
     if not isinstance(t, tensor_array_ops.TensorArray):
       return t
     # pylint: disable=protected-access
-    if (not t._infer_shape or not t._element_shape
-        or t._element_shape[0].ndims is None
-        or t._element_shape[0].ndims < 1):
+    # This is a bad hack due to the implementation detail of eager/graph TA.
+    # TODO(b/124374427): Update this to use public property of TensorArray.
+    if context.executing_eagerly():
+      element_shape = t._element_shape
+    else:
+      element_shape = t._element_shape[0]
+    if (not t._infer_shape
+        or not t._element_shape
+        or element_shape.ndims is None
+        or element_shape.ndims < 1):
       shape = (
-          t._element_shape[0] if t._infer_shape and t._element_shape
+          element_shape if t._infer_shape and t._element_shape
           else tensor_shape.TensorShape(None))
       tf_logging.warn("The TensorArray %s in the cell state is not amenable to "
                       "sorting based on the beam search result. For a "
@@ -621,10 +523,10 @@ class BeamSearchDecoder(decoder.Decoder):
                       "defined and have at least a rank of 1, but saw shape: %s"
                       % (t.handle.name, shape))
       return t
-    shape = t._element_shape[0]
     # pylint: enable=protected-access
     if not _check_static_batch_beam_maybe(
-        shape, tensor_util.constant_value(self._batch_size), self._beam_width):
+        element_shape, tensor_util.constant_value(self._batch_size),
+        self._beam_width):
       return t
     t = t.stack()
     with ops.control_dependencies(
@@ -684,6 +586,359 @@ class BeamSearchDecoder(decoder.Decoder):
     return (beam_search_output, beam_search_state, next_inputs, finished)
 
 
+class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.Decoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoder to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               embedding,
+               start_tokens,
+               end_token,
+               initial_state,
+               beam_width,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True):
+    """Initialize the BeamSearchDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+        or the `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+      beam_width:  Python integer, the number of beams.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+      ValueError: If `start_tokens` is not a vector or
+        `end_token` is not a scalar.
+    """
+    super(BeamSearchDecoder, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays)
+
+    if callable(embedding):
+      self._embedding_fn = embedding
+    else:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+  def initialize(self, name=None):
+    """Initialize the decoder.
+
+    Args:
+      name: Name scope for any created operations.
+
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    """
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+
+class BeamSearchDecoderV2(BeamSearchDecoderMixin, decoder.BaseDecoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoding to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               beam_width,
+               embedding_fn=None,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderV2.
+
+    Args:
+      cell: An `RNNCell` instance.
+      beam_width:  Python integer, the number of beams.
+      embedding_fn: A callable that takes a vector tensor of `ids` (argmax ids).
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+      **kwargs: Dict, other keyword arguments for initialization.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+    """
+    super(BeamSearchDecoderV2, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays,
+        **kwargs)
+
+    if embedding_fn is None or callable(embedding_fn):
+      self._embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be a callable, got %s" %
+                       type(embedding_fn))
+
+  def initialize(self,
+                 embedding,
+                 start_tokens,
+                 end_token,
+                 initial_state):
+    """Initialize the decoder.
+
+    Args:
+      embedding: A tensor from the embedding layer output, which is the
+        `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    Raises:
+      ValueError: If `start_tokens` is not a vector or `end_token` is not a
+        scalar.
+    """
+    if embedding is not None and self._embedding_fn is not None:
+      raise ValueError(
+          "embedding and embedding_fn cannot be provided at same time")
+    elif embedding is not None:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+  def call(self, embeddning, start_tokens, end_token, initial_state, **kwargs):
+    init_kwargs = kwargs
+    init_kwargs["start_tokens"] = start_tokens
+    init_kwargs["end_token"] = end_token
+    init_kwargs["initial_state"] = initial_state
+    return decoder.dynamic_decode(self,
+                                  output_time_major=self.output_time_major,
+                                  impute_finished=self.impute_finished,
+                                  maximum_iterations=self.maximum_iterations,
+                                  parallel_iterations=self.parallel_iterations,
+                                  swap_memory=self.swap_memory,
+                                  decoder_init_input=embeddning,
+                                  decoder_init_kwargs=init_kwargs)
+
+
 def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                       beam_width, end_token, length_penalty_weight,
                       coverage_penalty_weight):
@@ -730,7 +985,7 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
       on_value=np.int64(0),
       off_value=np.int64(1),
       dtype=dtypes.int64)
-  add_mask = math_ops.to_int64(not_finished)
+  add_mask = math_ops.cast(not_finished, dtypes.int64)
   lengths_to_add *= array_ops.expand_dims(add_mask, 2)
   new_prediction_lengths = (
       lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))
@@ -741,7 +996,8 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   attention_probs = get_attention_probs(
       next_cell_state, coverage_penalty_weight)
   if attention_probs is not None:
-    attention_probs *= array_ops.expand_dims(math_ops.to_float(not_finished), 2)
+    attention_probs *= array_ops.expand_dims(
+        math_ops.cast(not_finished, dtypes.float32), 2)
     accumulated_attention_probs = (
         beam_state.accumulated_attention_probs + attention_probs)
 
@@ -775,15 +1031,17 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
       gather_shape=[-1],
       name="next_beam_probs")
   # Note: just doing the following
-  #   math_ops.to_int32(word_indices % vocab_size,
+  #   math_ops.cast(
+  #       word_indices % vocab_size,
+  #       dtypes.int32,
   #       name="next_beam_word_ids")
   # would be a lot cleaner but for reasons unclear, that hides the results of
   # the op which prevents capturing it with tfdbg debug ops.
   raw_next_word_ids = math_ops.mod(
       word_indices, vocab_size, name="next_beam_word_ids")
-  next_word_ids = math_ops.to_int32(raw_next_word_ids)
-  next_beam_ids = math_ops.to_int32(
-      word_indices / vocab_size, name="next_beam_parent_ids")
+  next_word_ids = math_ops.cast(raw_next_word_ids, dtypes.int32)
+  next_beam_ids = math_ops.cast(
+      word_indices / vocab_size, dtypes.int32, name="next_beam_parent_ids")
 
   # Append new ids to current predictions
   previously_finished = _tensor_gather_helper(
@@ -802,7 +1060,8 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   # 2. Beams that are now finished (EOS predicted) have their length
   #    increased by 1.
   # 3. Beams that are not yet finished have their length increased by 1.
-  lengths_to_add = math_ops.to_int64(math_ops.logical_not(previously_finished))
+  lengths_to_add = math_ops.cast(
+      math_ops.logical_not(previously_finished), dtypes.int64)
   next_prediction_len = _tensor_gather_helper(
       gather_indices=next_beam_ids,
       gather_from=beam_state.lengths,
@@ -949,7 +1208,7 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight,
   coverage_penalty = math_ops.reduce_sum(
       math_ops.log(math_ops.minimum(accumulated_attention_probs, 1.0)), 2)
   # Apply coverage penalty to finished predictions.
-  coverage_penalty *= math_ops.to_float(finished)
+  coverage_penalty *= math_ops.cast(finished, dtypes.float32)
   weighted_coverage_penalty = coverage_penalty * coverage_penalty_weight
   # Reshape from [batch_size, beam_width] to [batch_size, beam_width, 1]
   weighted_coverage_penalty = array_ops.expand_dims(
@@ -1002,8 +1261,9 @@ def _length_penalty(sequence_lengths, penalty_factor):
   static_penalty = tensor_util.constant_value(penalty_factor)
   if static_penalty is not None and static_penalty == 0:
     return 1.0
-  return math_ops.div((5. + math_ops.to_float(sequence_lengths))
-                      **penalty_factor, (5. + 1.)**penalty_factor)
+  return math_ops.div(
+      (5. + math_ops.cast(sequence_lengths, dtypes.float32))**penalty_factor,
+      (5. + 1.)**penalty_factor)
 
 
 def _mask_probs(probs, eos_token, finished):
@@ -1068,7 +1328,7 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
   """
   if isinstance(gather_from, tensor_array_ops.TensorArray):
     return gather_from
-  _check_maybe(gather_from)
+  _check_ndims(gather_from)
   if gather_from.shape.ndims >= len(gather_shape):
     return _tensor_gather_helper(
         gather_indices=gather_indices,
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index f58268eff525a4b592c79acb32207e1a3f62bdc7..33f7bac8159401175ce57c0463fff1398c1dd9bb 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -135,6 +136,127 @@ class Decoder(object):
     return False
 
 
+class BaseDecoder(layers.Layer):
+  """An RNN Decoder that is based on a Keras layer.
+
+  Concepts used by this interface:
+  - `inputs`: (structure of) tensors and TensorArrays that is passed as input to
+    the RNNCell composing the decoder, at each time step.
+  - `state`: (structure of) tensors and TensorArrays that is passed to the
+    RNNCell instance as the state.
+  - `memory`: (sturecute of) tensors that is usually the full output of the
+    encoder, which will be used for the attention wrapper for the RNNCell.
+  - `finished`: boolean tensor telling whether each sequence in the batch is
+    finished.
+  - `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at each
+    time step.
+  """
+
+  def __init__(self,
+               output_time_major=False,
+               impute_finished=False,
+               maximum_iterations=None,
+               parallel_iterations=32,
+               swap_memory=False,
+               **kwargs):
+    self.output_time_major = output_time_major
+    self.impute_finished = impute_finished
+    self.maximum_iterations = maximum_iterations
+    self.parallel_iterations = parallel_iterations
+    self.swap_memory = swap_memory
+    super(BaseDecoder, self).__init__(**kwargs)
+
+  def call(self, inputs, initial_state=None, **kwargs):
+    init_kwargs = kwargs
+    init_kwargs["initial_state"] = initial_state
+    return dynamic_decode(self,
+                          output_time_major=self.output_time_major,
+                          impute_finished=self.impute_finished,
+                          maximum_iterations=self.maximum_iterations,
+                          parallel_iterations=self.parallel_iterations,
+                          swap_memory=self.swap_memory,
+                          decoder_init_input=inputs,
+                          decoder_init_kwargs=init_kwargs)
+
+  @property
+  def batch_size(self):
+    """The batch size of input values."""
+    raise NotImplementedError
+
+  @property
+  def output_size(self):
+    """A (possibly nested tuple of...) integer[s] or `TensorShape` object[s]."""
+    raise NotImplementedError
+
+  @property
+  def output_dtype(self):
+    """A (possibly nested tuple of...) dtype[s]."""
+    raise NotImplementedError
+
+  def initialize(self, inputs, initial_state=None, **kwargs):
+    """Called before any decoding iterations.
+
+    This methods must compute initial input values and initial state.
+
+    Args:
+      inputs: (structure of) tensors that contains the input for the decoder. In
+        the normal case, its a tensor with shape [batch, timestep, embedding].
+      initial_state: (structure of) tensors that contains the initial state for
+        the RNNCell.
+      **kwargs: Other arguments that are passed in from layer.call() method. It
+        could contains item like input sequence_length, or masking for input.
+
+    Returns:
+      `(finished, initial_inputs, initial_state)`: initial values of
+      'finished' flags, inputs and state.
+    """
+    raise NotImplementedError
+
+  def step(self, time, inputs, state):
+    """Called per step of decoding (but only once for dynamic decoding).
+
+    Args:
+      time: Scalar `int32` tensor. Current step number.
+      inputs: RNNCell input (possibly nested tuple of) tensor[s] for this time
+        step.
+      state: RNNCell state (possibly nested tuple of) tensor[s] from previous
+        time step.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`: `outputs` is an object
+      containing the decoder output, `next_state` is a (structure of) state
+      tensors and TensorArrays, `next_inputs` is the tensor that should be used
+      as input for the next step, `finished` is a boolean tensor telling whether
+      the sequence is complete, for each sequence in the batch.
+    """
+    raise NotImplementedError
+
+  def finalize(self, outputs, final_state, sequence_lengths):
+    raise NotImplementedError
+
+  @property
+  def tracks_own_finished(self):
+    """Describes whether the Decoder keeps track of finished states.
+
+    Most decoders will emit a true/false `finished` value independently
+    at each time step.  In this case, the `dynamic_decode` function keeps track
+    of which batch entries are already finished, and performs a logical OR to
+    insert new batches to the finished set.
+
+    Some decoders, however, shuffle batches / beams between time steps and
+    `dynamic_decode` will mix up the finished state across these entries because
+    it does not track the reshuffle across time steps.  In this case, it is
+    up to the decoder to declare that it will keep track of its own finished
+    state by setting this property to `True`.
+
+    Returns:
+      Python bool.
+    """
+    return False
+
+  # TODO(scottzhu): Add build/get_config/from_config and other layer methods.
+
+
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
   def _create(s, d):
@@ -149,7 +271,8 @@ def dynamic_decode(decoder,
                    maximum_iterations=None,
                    parallel_iterations=32,
                    swap_memory=False,
-                   scope=None):
+                   scope=None,
+                   **kwargs):
   """Perform dynamic decoding with `decoder`.
 
   Calls initialize() once and step() repeatedly on the Decoder object.
@@ -171,6 +294,9 @@ def dynamic_decode(decoder,
     parallel_iterations: Argument passed to `tf.while_loop`.
     swap_memory: Argument passed to `tf.while_loop`.
     scope: Optional variable scope to use.
+    **kwargs: dict, other keyword arguments for dynamic_decode. It might contain
+      arguments for `BaseDecoder` to initialize, which takes all tensor inputs
+      during call().
 
   Returns:
     `(final_outputs, final_state, final_sequence_lengths)`.
@@ -179,7 +305,7 @@ def dynamic_decode(decoder,
     TypeError: if `decoder` is not an instance of `Decoder`.
     ValueError: if `maximum_iterations` is provided but is not a scalar.
   """
-  if not isinstance(decoder, Decoder):
+  if not isinstance(decoder, (Decoder, BaseDecoder)):
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
@@ -204,7 +330,14 @@ def dynamic_decode(decoder,
       if maximum_iterations.get_shape().ndims != 0:
         raise ValueError("maximum_iterations must be a scalar")
 
-    initial_finished, initial_inputs, initial_state = decoder.initialize()
+    if isinstance(decoder, Decoder):
+      initial_finished, initial_inputs, initial_state = decoder.initialize()
+    else:
+      # For BaseDecoder that takes tensor inputs during call.
+      decoder_init_input = kwargs.pop("decoder_init_input", None)
+      decoder_init_kwargs = kwargs.pop("decoder_init_kwargs", {})
+      initial_finished, initial_inputs, initial_state = decoder.initialize(
+          decoder_init_input, **decoder_init_kwargs)
 
     zero_outputs = _create_zero_outputs(decoder.output_size,
                                         decoder.output_dtype,
@@ -222,7 +355,7 @@ def dynamic_decode(decoder,
     def _shape(batch_size, from_shape):
       if (not isinstance(from_shape, tensor_shape.TensorShape) or
           from_shape.ndims == 0):
-        return tensor_shape.TensorShape(None)
+        return None
       else:
         batch_size = tensor_util.constant_value(
             ops.convert_to_tensor(
diff --git a/tensorflow/contrib/seq2seq/python/ops/sampler.py b/tensorflow/contrib/seq2seq/python/ops/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e3e48b3bc61c0ff94ae0a1794767c7ff6914969
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/sampler.py
@@ -0,0 +1,765 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A library of sampler for use with SamplingDecoders."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+
+__all__ = [
+    "Sampler",
+    "TrainingSampler",
+    "GreedyEmbeddingSampler",
+    "SampleEmbeddingSampler",
+    "CustomSampler",
+    "ScheduledEmbeddingTrainingSampler",
+    "ScheduledOutputTrainingSampler",
+    "InferenceSampler",
+]
+
+_transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Sampler(object):
+  """Interface for implementing sampling in seq2seq decoders.
+
+  Sampler instances are used by `BasicDecoder`. The normal usage of a sampler is
+  like below:
+  sampler = Sampler(init_args)
+  (initial_finished, initial_inputs) = sampler.initialize(input_tensors)
+  for time_step in range(time):
+    cell_output, cell_state = cell.call(cell_input, previous_state)
+    sample_ids = sampler.sample(time_step, cell_output, cell_state)
+    (finished, next_inputs, next_state) = sampler.next_inputs(
+        time_step,cell_output, cell_state)
+
+  Note that all the tensor input should not be feed to Sampler as __init__()
+  parameters, instead, they should be feed by decoders via initialize().
+  """
+
+  @abc.abstractmethod
+  def initialize(self, inputs, **kwargs):
+    """initialize the sampler with the input tensors.
+
+    This method suppose to be only invoke once before the calling other methods
+    of the Sampler.
+
+    Args:
+      inputs: A (structure of) input tensors, it could be a nested tuple or a
+        single tensor.
+      **kwargs: Other kwargs for initialization. It could contain tensors like
+        mask for inputs, or non tensor parameter.
+
+    Returns:
+      `(initial_finished, initial_inputs)`.
+    """
+    pass
+
+  @abc.abstractmethod
+  def sample(self, time, outputs, state):
+    """Returns `sample_ids`."""
+    pass
+
+  @abc.abstractmethod
+  def next_inputs(self, time, outputs, state, sample_ids):
+    """Returns `(finished, next_inputs, next_state)`."""
+    pass
+
+  @abc.abstractproperty
+  def batch_size(self):
+    """Batch size of tensor returned by `sample`.
+
+    Returns a scalar int32 tensor. The return value might not available before
+    the invocation of initialize(), in this case, ValueError is raised.
+    """
+    raise NotImplementedError("batch_size has not been implemented")
+
+  @abc.abstractproperty
+  def sample_ids_shape(self):
+    """Shape of tensor returned by `sample`, excluding the batch dimension.
+
+    Returns a `TensorShape`. The return value might not available before the
+    invocation of initialize().
+    """
+    raise NotImplementedError("sample_ids_shape has not been implemented")
+
+  @abc.abstractproperty
+  def sample_ids_dtype(self):
+    """DType of tensor returned by `sample`.
+
+    Returns a DType. The return value might not available before the
+    invocation of initialize().
+    """
+    raise NotImplementedError("sample_ids_dtype has not been implemented")
+
+
+class CustomSampler(Sampler):
+  """Base abstract class that allows the user to customize sampling."""
+
+  def __init__(self,
+               initialize_fn,
+               sample_fn,
+               next_inputs_fn,
+               sample_ids_shape=None,
+               sample_ids_dtype=None):
+    """Initializer.
+
+    Args:
+      initialize_fn: callable that returns `(finished, next_inputs)` for the
+        first iteration.
+      sample_fn: callable that takes `(time, outputs, state)` and emits tensor
+        `sample_ids`.
+      next_inputs_fn: callable that takes `(time, outputs, state, sample_ids)`
+        and emits `(finished, next_inputs, next_state)`.
+      sample_ids_shape: Either a list of integers, or a 1-D Tensor of type
+        `int32`, the shape of each value in the `sample_ids` batch. Defaults to
+        a scalar.
+      sample_ids_dtype: The dtype of the `sample_ids` tensor. Defaults to int32.
+    """
+    self._initialize_fn = initialize_fn
+    self._sample_fn = sample_fn
+    self._next_inputs_fn = next_inputs_fn
+    self._batch_size = None
+    self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or [])
+    self._sample_ids_dtype = sample_ids_dtype or dtypes.int32
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return self._sample_ids_shape
+
+  @property
+  def sample_ids_dtype(self):
+    return self._sample_ids_dtype
+
+  def initialize(self, inputs, **kwargs):
+    (finished, next_inputs) = self._initialize_fn(inputs, **kwargs)
+    if self._batch_size is None:
+      self._batch_size = array_ops.size(finished)
+    return (finished, next_inputs)
+
+  def sample(self, time, outputs, state):
+    return self._sample_fn(time=time, outputs=outputs, state=state)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    return self._next_inputs_fn(
+        time=time, outputs=outputs, state=state, sample_ids=sample_ids)
+
+
+class TrainingSampler(Sampler):
+  """A Sampler for use during training.
+
+  Only reads inputs.
+
+  Returned sample_ids are the argmax of the RNN output logits.
+  """
+
+  def __init__(self, time_major=False):
+    """Initializer.
+
+    Args:
+      time_major: Python bool.  Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+
+    Raises:
+      ValueError: if `sequence_length` is not a 1D tensor.
+    """
+    self.time_major = time_major
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return tensor_shape.TensorShape([])
+
+  @property
+  def sample_ids_dtype(self):
+    return dtypes.int32
+
+  def initialize(self, inputs, sequence_length=None):
+    """Initialize the TrainSampler.
+
+    Args:
+      inputs: A (structure of) input tensors.
+      sequence_length: An int32 vector tensor.
+
+    Returns:
+      (finished, next_inputs), a tuple of two items. The first item is a boolean
+        vector to indicate whether the item in the batch has finished. The
+        second item is the first slide of input data based on the timestep
+        dimension (usually the second dim of the input).
+    """
+    self.inputs = ops.convert_to_tensor(inputs, name="inputs")
+    if not self.time_major:
+      inputs = nest.map_structure(_transpose_batch_time, inputs)
+
+    self.input_tas = nest.map_structure(_unstack_ta, inputs)
+    if sequence_length is None:
+      raise ValueError("sequence_length is required for TrainingSampler")
+    self.sequence_length = ops.convert_to_tensor(
+        sequence_length, name="sequence_length")
+    if self.sequence_length.get_shape().ndims != 1:
+      raise ValueError(
+          "Expected sequence_length to be a vector, but received shape: %s" %
+          self._sequence_length.get_shape())
+
+    self.zero_inputs = nest.map_structure(
+        lambda inp: array_ops.zeros_like(inp[0, :]), inputs)
+
+    self._batch_size = array_ops.size(self.sequence_length)
+
+    finished = math_ops.equal(0, self.sequence_length)
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        lambda: self.zero_inputs,
+        lambda: nest.map_structure(lambda inp: inp.read(0), self.input_tas))
+    return (finished, next_inputs)
+
+  def sample(self, time, outputs, state):
+    del state
+    sample_ids = math_ops.cast(math_ops.argmax(outputs, axis=-1), dtypes.int32)
+    return sample_ids
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    del sample_ids
+    next_time = time + 1
+    finished = (next_time >= self.sequence_length)
+    all_finished = math_ops.reduce_all(finished)
+
+    def read_from_ta(inp):
+      return inp.read(next_time)
+
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        lambda: self.zero_inputs,
+        lambda: nest.map_structure(read_from_ta, self.input_tas))
+    return (finished, next_inputs, state)
+
+
+class ScheduledEmbeddingTrainingSampler(TrainingSampler):
+  """A training sampler that adds scheduled sampling.
+
+  Returns -1s for sample_ids where no sampling took place; valid sample id
+  values elsewhere.
+  """
+
+  def __init__(self,
+               sampling_probability,
+               embedding_fn=None,
+               time_major=False,
+               seed=None,
+               scheduling_seed=None):
+    """Initializer.
+
+    Args:
+      sampling_probability: A `float32` 0-D or 1-D tensor: the probability of
+        sampling categorically from the output ids instead of reading directly
+        from the inputs.
+      embedding_fn: A callable that takes a vector tensor of `ids` (argmax ids),
+         or the `params` argument for `embedding_lookup`.
+      time_major: Python bool. Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+      seed: The sampling seed.
+      scheduling_seed: The schedule decision rule sampling seed.
+
+    Raises:
+      ValueError: if `sampling_probability` is not a scalar or vector.
+    """
+    if callable(embedding_fn) or embedding_fn is None:
+      self.embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be callable, got %s"
+                       % type(embedding_fn))
+    self.sampling_probability = ops.convert_to_tensor(
+        sampling_probability, name="sampling_probability")
+    if self.sampling_probability.get_shape().ndims not in (0, 1):
+      raise ValueError(
+          "sampling_probability must be either a scalar or a vector. "
+          "saw shape: %s" % (self.sampling_probability.get_shape()))
+    self.seed = seed
+    self.scheduling_seed = scheduling_seed
+    super(ScheduledEmbeddingTrainingSampler,
+          self).__init__(time_major=time_major)
+
+  def initialize(self, inputs, sequence_length=None, embedding=None):
+    if self.embedding_fn is None:
+      if embedding is None:
+        raise ValueError("embedding is required as a keyword argument for "
+                         "ScheduledEmbeddingTrainingSampler")
+      self.embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+    return super(ScheduledEmbeddingTrainingSampler, self).initialize(
+        inputs, sequence_length=sequence_length)
+
+  def sample(self, time, outputs, state):
+    del state
+    # Return -1s where we did not sample, and sample_ids elsewhere
+    select_sample = bernoulli_sample(
+        probs=self.sampling_probability,
+        dtype=dtypes.bool,
+        sample_shape=self.batch_size,
+        seed=self.scheduling_seed)
+    return array_ops.where(select_sample,
+                           categorical_sample(logits=outputs, seed=self.seed),
+                           gen_array_ops.fill([self.batch_size], -1))
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    (finished, base_next_inputs, state) = (
+        super(ScheduledEmbeddingTrainingSampler, self).next_inputs(
+            time=time, outputs=outputs, state=state, sample_ids=sample_ids))
+
+    def maybe_sample():
+      """Perform scheduled sampling."""
+      where_sampling = math_ops.cast(
+          array_ops.where(sample_ids > -1), dtypes.int32)
+      where_not_sampling = math_ops.cast(
+          array_ops.where(sample_ids <= -1), dtypes.int32)
+      sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
+      inputs_not_sampling = array_ops.gather_nd(base_next_inputs,
+                                                where_not_sampling)
+      sampled_next_inputs = self.embedding_fn(sample_ids_sampling)
+      base_shape = array_ops.shape(base_next_inputs)
+      return (array_ops.scatter_nd(
+          indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
+              + array_ops.scatter_nd(
+                  indices=where_not_sampling,
+                  updates=inputs_not_sampling,
+                  shape=base_shape))
+
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(all_finished, lambda: base_next_inputs,
+                                        maybe_sample)
+    return (finished, next_inputs, state)
+
+
+class ScheduledOutputTrainingSampler(TrainingSampler):
+  """A training sampler that adds scheduled sampling directly to outputs.
+
+  Returns False for sample_ids where no sampling took place; True elsewhere.
+  """
+
+  def __init__(self,
+               sampling_probability,
+               time_major=False,
+               seed=None,
+               next_inputs_fn=None):
+    """Initializer.
+
+    Args:
+      sampling_probability: A `float32` scalar tensor: the probability of
+        sampling from the outputs instead of reading directly from the inputs.
+      time_major: Python bool. Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+      seed: The sampling seed.
+      next_inputs_fn: (Optional) callable to apply to the RNN outputs to create
+        the next input when sampling. If `None` (default), the RNN outputs will
+        be used as the next inputs.
+
+    Raises:
+      ValueError: if `sampling_probability` is not a scalar or vector.
+    """
+    self.sampling_probability = ops.convert_to_tensor(
+        sampling_probability, name="sampling_probability")
+    if self.sampling_probability.get_shape().ndims not in (0, 1):
+      raise ValueError(
+          "sampling_probability must be either a scalar or a vector. "
+          "saw shape: %s" % (self._sampling_probability.get_shape()))
+
+    self.seed = seed
+    self.next_inputs_fn = next_inputs_fn
+
+    super(ScheduledOutputTrainingSampler, self).__init__(time_major=time_major)
+
+  def initialize(self, inputs, sequence_length=None, auxiliary_inputs=None):
+    if auxiliary_inputs is None:
+      maybe_concatenated_inputs = inputs
+    else:
+      inputs = ops.convert_to_tensor(inputs)
+      auxiliary_inputs = ops.convert_to_tensor(auxiliary_inputs)
+      maybe_concatenated_inputs = nest.map_structure(
+          lambda x, y: array_ops.concat((x, y), -1), inputs, auxiliary_inputs)
+      if not self.time_major:
+        auxiliary_inputs = nest.map_structure(_transpose_batch_time,
+                                              auxiliary_inputs)
+    if auxiliary_inputs is not None:
+      self._auxiliary_input_tas = nest.map_structure(_unstack_ta,
+                                                     auxiliary_inputs)
+    else:
+      self._auxiliary_input_tas = None
+
+    return super(ScheduledOutputTrainingSampler, self).initialize(
+        maybe_concatenated_inputs, sequence_length=sequence_length)
+
+  def sample(self, time, outputs, state):
+    del state
+    return bernoulli_sample(
+        probs=self.sampling_probability,
+        sample_shape=self.batch_size,
+        seed=self.seed)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    (finished, base_next_inputs, state) = (
+        super(ScheduledOutputTrainingSampler, self).next_inputs(
+            time=time, outputs=outputs, state=state, sample_ids=sample_ids))
+    sample_ids = math_ops.cast(sample_ids, dtypes.bool)
+
+    def maybe_sample():
+      """Perform scheduled sampling."""
+
+      def maybe_concatenate_auxiliary_inputs(outputs_, indices=None):
+        """Concatenate outputs with auxiliary inputs, if they exist."""
+        if self._auxiliary_input_tas is None:
+          return outputs_
+
+        next_time = time + 1
+        auxiliary_inputs = nest.map_structure(lambda ta: ta.read(next_time),
+                                              self._auxiliary_input_tas)
+        if indices is not None:
+          auxiliary_inputs = array_ops.gather_nd(auxiliary_inputs, indices)
+        return nest.map_structure(lambda x, y: array_ops.concat((x, y), -1),
+                                  outputs_, auxiliary_inputs)
+
+      if self.next_inputs_fn is None:
+        return array_ops.where(sample_ids,
+                               maybe_concatenate_auxiliary_inputs(outputs),
+                               base_next_inputs)
+
+      where_sampling = math_ops.cast(array_ops.where(sample_ids), dtypes.int32)
+      where_not_sampling = math_ops.cast(
+          array_ops.where(math_ops.logical_not(sample_ids)), dtypes.int32)
+      outputs_sampling = array_ops.gather_nd(outputs, where_sampling)
+      inputs_not_sampling = array_ops.gather_nd(base_next_inputs,
+                                                where_not_sampling)
+      sampled_next_inputs = maybe_concatenate_auxiliary_inputs(
+          self.next_inputs_fn(outputs_sampling), where_sampling)
+
+      base_shape = array_ops.shape(base_next_inputs)
+      return (array_ops.scatter_nd(
+          indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
+              + array_ops.scatter_nd(
+                  indices=where_not_sampling,
+                  updates=inputs_not_sampling,
+                  shape=base_shape))
+
+    all_finished = math_ops.reduce_all(finished)
+    no_samples = math_ops.logical_not(math_ops.reduce_any(sample_ids))
+    next_inputs = control_flow_ops.cond(
+        math_ops.logical_or(all_finished, no_samples), lambda: base_next_inputs,
+        maybe_sample)
+    return (finished, next_inputs, state)
+
+
+class GreedyEmbeddingSampler(Sampler):
+  """A sampler for use during inference.
+
+  Uses the argmax of the output (treated as logits) and passes the
+  result through an embedding layer to get the next input.
+  """
+
+  def __init__(self, embedding_fn=None):
+    """Initializer.
+
+    Args:
+      embedding_fn: A optional callable that takes a vector tensor of `ids`
+        (argmax ids), or the `params` argument for `embedding_lookup`. The
+        returned tensor will be passed to the decoder input. Default to use
+        `embedding_ops.embedding_lookup`.
+    """
+    if embedding_fn is None or callable(embedding_fn):
+      self.embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be a callable, got %s" %
+                       type(embedding_fn))
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return tensor_shape.TensorShape([])
+
+  @property
+  def sample_ids_dtype(self):
+    return dtypes.int32
+
+  def initialize(self, embedding, start_tokens=None, end_token=None):
+    """Initialize the GreedyEmbeddingSampler.
+
+    Args:
+      embedding: tensor that contains embedding states matrix. It will be used
+        to generate generate outputs with start_tokens and end_tokens. The
+        embedding will be ignored if the embedding_fn has been provided at
+        __init__().
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+
+    Returns:
+      Tuple of two items: `(finished, self.start_inputs)`.
+    Raises:
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
+    """
+    if self.embedding_fn is None:
+      self.embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self.start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    self.end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self.start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._batch_size = array_ops.size(start_tokens)
+    if self.end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+    self.start_inputs = self.embedding_fn(self.start_tokens)
+
+    finished = array_ops.tile([False], [self._batch_size])
+    return (finished, self.start_inputs)
+
+  def sample(self, time, outputs, state):
+    """sample for GreedyEmbeddingHelper."""
+    del time, state  # unused by sample_fn
+    # Outputs are logits, use argmax to get the most probable id
+    if not isinstance(outputs, ops.Tensor):
+      raise TypeError(
+          "Expected outputs to be a single Tensor, got: %s" % type(outputs))
+    sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
+    return sample_ids
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    """next_inputs_fn for GreedyEmbeddingHelper."""
+    del time, outputs  # unused by next_inputs_fn
+    finished = math_ops.equal(sample_ids, self.end_token)
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        # If we're finished, the next_inputs value doesn't matter
+        lambda: self.start_inputs,
+        lambda: self.embedding_fn(sample_ids))
+    return (finished, next_inputs, state)
+
+
+class SampleEmbeddingSampler(GreedyEmbeddingSampler):
+  """A sampler for use during inference.
+
+  Uses sampling (from a distribution) instead of argmax and passes the
+  result through an embedding layer to get the next input.
+  """
+
+  def __init__(self, embedding_fn=None, softmax_temperature=None, seed=None):
+    """Initializer.
+
+    Args:
+      embedding_fn: (Optional) A callable that takes a vector tensor of `ids`
+        (argmax ids), or the `params` argument for `embedding_lookup`. The
+        returned tensor will be passed to the decoder input.
+      softmax_temperature: (Optional) `float32` scalar, value to divide the
+        logits by before computing the softmax. Larger values (above 1.0) result
+        in more random samples, while smaller values push the sampling
+        distribution towards the argmax. Must be strictly greater than 0.
+        Defaults to 1.0.
+      seed: (Optional) The sampling seed.
+
+    Raises:
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
+    """
+    super(SampleEmbeddingSampler, self).__init__(embedding_fn)
+    self.softmax_temperature = softmax_temperature
+    self.seed = seed
+
+  def sample(self, time, outputs, state):
+    """sample for SampleEmbeddingHelper."""
+    del time, state  # unused by sample_fn
+    # Outputs are logits, we sample instead of argmax (greedy).
+    if not isinstance(outputs, ops.Tensor):
+      raise TypeError(
+          "Expected outputs to be a single Tensor, got: %s" % type(outputs))
+    if self.softmax_temperature is None:
+      logits = outputs
+    else:
+      logits = outputs / self.softmax_temperature
+
+    return categorical_sample(logits=logits, seed=self.seed)
+
+
+class InferenceSampler(Sampler):
+  """A helper to use during inference with a custom sampling function."""
+
+  def __init__(self,
+               sample_fn,
+               sample_shape,
+               sample_dtype,
+               end_fn,
+               next_inputs_fn=None):
+    """Initializer.
+
+    Args:
+      sample_fn: A callable that takes `outputs` and emits tensor `sample_ids`.
+      sample_shape: Either a list of integers, or a 1-D Tensor of type `int32`,
+        the shape of the each sample in the batch returned by `sample_fn`.
+      sample_dtype: the dtype of the sample returned by `sample_fn`.
+      end_fn: A callable that takes `sample_ids` and emits a `bool` vector
+        shaped `[batch_size]` indicating whether each sample is an end token.
+      next_inputs_fn: (Optional) A callable that takes `sample_ids` and returns
+        the next batch of inputs. If not provided, `sample_ids` is used as the
+        next batch of inputs.
+    """
+    self.sample_fn = sample_fn
+    self.sample_shape = tensor_shape.TensorShape(sample_shape)
+    self.sample_dtype = sample_dtype
+    self.end_fn = end_fn
+    self.next_inputs_fn = next_inputs_fn
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return self.sample_shape
+
+  @property
+  def sample_ids_dtype(self):
+    return self.sample_dtype
+
+  def initialize(self, start_inputs):
+    self.start_inputs = ops.convert_to_tensor(start_inputs, name="start_inputs")
+    self._batch_size = array_ops.shape(start_inputs)[0]
+    finished = array_ops.tile([False], [self._batch_size])
+    return (finished, self.start_inputs)
+
+  def sample(self, time, outputs, state):
+    del time, state  # unused by sample
+    return self.sample_fn(outputs)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    del time, outputs  # unused by next_inputs
+    if self.next_inputs_fn is None:
+      next_inputs = sample_ids
+    else:
+      next_inputs = self.next_inputs_fn(sample_ids)
+    finished = self.end_fn(sample_ids)
+    return (finished, next_inputs, state)
+
+
+# The following sample functions (_call_sampler, bernoulli_sample,
+# categorical_sample) mimic TensorFlow Probability distribution semantics.
+def _call_sampler(sample_n_fn, sample_shape, name=None):
+  """Reshapes vector of samples."""
+  with ops.name_scope(name, "call_sampler", values=[sample_shape]):
+    sample_shape = ops.convert_to_tensor(
+        sample_shape, dtype=dtypes.int32, name="sample_shape")
+    # Ensure sample_shape is a vector (vs just a scalar).
+    pad = math_ops.cast(
+        math_ops.equal(array_ops.rank(sample_shape), 0), dtypes.int32)
+    sample_shape = array_ops.reshape(
+        sample_shape,
+        array_ops.pad(
+            array_ops.shape(sample_shape),
+            paddings=[[pad, 0]],
+            constant_values=1))
+    samples = sample_n_fn(math_ops.reduce_prod(sample_shape))
+    batch_event_shape = array_ops.shape(samples)[1:]
+    final_shape = array_ops.concat([sample_shape, batch_event_shape], 0)
+    return array_ops.reshape(samples, final_shape)
+
+
+def bernoulli_sample(probs=None,
+                     logits=None,
+                     dtype=dtypes.int32,
+                     sample_shape=(),
+                     seed=None):
+  """Samples from Bernoulli distribution."""
+  if probs is None:
+    probs = math_ops.sigmoid(logits, name="probs")
+  else:
+    probs = ops.convert_to_tensor(probs, name="probs")
+  batch_shape_tensor = array_ops.shape(probs)
+
+  def _sample_n(n):
+    """Sample vector of Bernoullis."""
+    new_shape = array_ops.concat([[n], batch_shape_tensor], 0)
+    uniform = random_ops.random_uniform(new_shape, seed=seed, dtype=probs.dtype)
+    return math_ops.cast(math_ops.less(uniform, probs), dtype)
+
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def categorical_sample(logits, dtype=dtypes.int32, sample_shape=(), seed=None):
+  """Samples from categorical distribution."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  event_size = array_ops.shape(logits)[-1]
+  batch_shape_tensor = array_ops.shape(logits)[:-1]
+
+  def _sample_n(n):
+    """Sample vector of categoricals."""
+    if logits.shape.ndims == 2:
+      logits_2d = logits
+    else:
+      logits_2d = array_ops.reshape(logits, [-1, event_size])
+    sample_dtype = dtypes.int64 if logits.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
+        array_ops.concat([[n], batch_shape_tensor], 0))
+    return math_ops.cast(draws, dtype)
+
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def _unstack_ta(inp):
+  return tensor_array_ops.TensorArray(
+      dtype=inp.dtype,
+      size=array_ops.shape(inp)[0],
+      element_shape=inp.get_shape()[1:]).unstack(inp)
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index f2bb458848fab5603128903868b52f29785efc92..7b54aafeb2cfb5f2a99a93b97d14fbc5bf6e8f9c 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -11,7 +11,7 @@ import tensorflow.contrib.slim as slim
 
 ## Why TF-Slim?
 
-TF-Slim is a library that makes building, training and evaluation neural
+TF-Slim is a library that makes defining, training and evaluating neural
 networks simple:
 
 * Allows the user to define models much more compactly by eliminating
@@ -78,7 +78,7 @@ provides convenience wrappers for variable creation and manipulation.
 ## Defining Models
 
 Models can be succinctly defined using TF-Slim by combining its variables,
-layers and scopes. Each of these elements are defined below.
+layers and scopes. Each of these elements is defined below.
 
 ### Variables
 
@@ -160,15 +160,15 @@ slim.add_model_variable(my_model_variable)
 
 ### Layers
 
-While the set of TensorFlow operations is quite extensive, developers of
-neural networks typically think of models in terms of higher level concepts
-like "layers", "losses", "metrics", and "networks". A layer,
-such as a Convolutional Layer, a Fully Connected Layer or a BatchNorm Layer
-are more abstract than a single TensorFlow operation and typically involve
-several operations. Furthermore, a layer usually (but not always) has
-variables (tunable parameters) associated with it, unlike more primitive
-operations. For example, a Convolutional Layer in a neural network
-is composed of several low level operations:
+While the set of TensorFlow operations is quite extensive, developers of neural
+networks typically think of models in terms of higher level concepts like
+"layers", "losses", "metrics", and "networks". A layer, such as a Convolutional
+Layer, a Fully Connected Layer or a BatchNorm Layer is more abstract than a
+single TensorFlow operation and typically involve several operations.
+Furthermore, a layer usually (but not always) has variables (tunable parameters)
+associated with it, unlike more primitive operations. For example, a
+Convolutional Layer in a neural network is composed of several low level
+operations:
 
 1. Creating the weight and bias variables
 2. Convolving the weights with the input from the previous layer
@@ -455,9 +455,8 @@ loss = slim.losses.softmax_cross_entropy(predictions, labels)
 ```
 
 In this example, we start by creating the model (using TF-Slim's VGG
-implementation), and add the standard classification loss. Now, lets turn
-to the case where we have a multi-task model that produces multiple outputs:
-
+implementation), and add the standard classification loss. Now, let's turn to
+the case where we have a multi-task model that produces multiple outputs:
 
 ```python
 # Load the images and labels.
@@ -555,8 +554,8 @@ that we'll save a model checkpoint every 10 minutes.
 
 ### Working Example: Training the VGG16 Model
 
-To illustrate this, lets
-examine the following sample of training the VGG network:
+To illustrate this, let's examine the following sample of training the VGG
+network:
 
 ```python
 import tensorflow as tf
@@ -738,7 +737,7 @@ slim.learning.train(train_op, log_dir, init_fn=init_fn)
 
 Once we've trained a model (or even while the model is busy training) we'd like
 to see how well the model performs in practice. This is accomplished by picking
-a set of evaluation metrics, which will grade the models performance, and the
+a set of evaluation metrics, which will grade the model's performance, and the
 evaluation code which actually loads the data, performs inference, compares the
 results to the ground truth and records the evaluation scores. This step may be
 performed once or repeated periodically.
diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index 99ad48763031cc2f98009449cea050fd90d01eb5..36d544d565b0cab05febcb91456e99dc95cb2307 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -252,8 +252,9 @@ def parallel_read(data_sources,
       common_queue = data_flow_ops.FIFOQueue(
           capacity=capacity, dtypes=dtypes, name='common_queue')
 
-    summary.scalar('fraction_of_%d_full' % capacity,
-                   math_ops.to_float(common_queue.size()) * (1. / capacity))
+    summary.scalar(
+        'fraction_of_%d_full' % capacity,
+        math_ops.cast(common_queue.size(), tf_dtypes.float32) * (1. / capacity))
 
     return ParallelReader(
         reader_class,
diff --git a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
index 62bd20036126b41040ca4329c7f13ea7671a8045..d3c5ab7d7c2c5ced79808ecc59e0b0218c461062 100644
--- a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
+++ b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -86,6 +87,7 @@ def prefetch_queue(tensors,
     enqueue_op = queue.enqueue(tensors)
     queue_runner.add_queue_runner(
         queue_runner.QueueRunner(queue, [enqueue_op] * num_threads))
-    summary.scalar("fraction_of_%d_full" % capacity,
-                   math_ops.to_float(queue.size()) * (1. / capacity))
+    summary.scalar(
+        "fraction_of_%d_full" % capacity,
+        math_ops.cast(queue.size(), _dtypes.float32) * (1. / capacity))
     return queue
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index 1b2b6acacca838f95cb758ae88f79263993ca69e..c3193171a0ccbdfcd1c0f563d80baf24b465e5a6 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -32,7 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
@@ -329,7 +329,7 @@ class SparseTensor(ItemHandler):
       shape = indices.dense_shape
     indices_shape = array_ops.shape(indices.indices)
     rank = indices_shape[1]
-    ids = math_ops.to_int64(indices.values)
+    ids = math_ops.cast(indices.values, dtypes.int64)
     indices_columns_to_preserve = array_ops.slice(
         indices.indices, [0, 0], array_ops.stack([-1, rank - 1]))
     new_indices = array_ops.concat(
@@ -396,8 +396,8 @@ class Image(ItemHandler):
     image_format = keys_to_tensors[self._format_key]
 
     if self._repeated:
-      return functional_ops.map_fn(lambda x: self._decode(x, image_format),
-                                   image_buffer, dtype=self._dtype)
+      return map_fn.map_fn(lambda x: self._decode(x, image_format),
+                           image_buffer, dtype=self._dtype)
     else:
       return self._decode(image_buffer, image_format)
 
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 8bbdf96384683c68648367c6433eeb89c64c22bf..e9595d1b324dbd3d570d2407a6620c5295b15548 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -115,9 +115,9 @@ py_library(
 
 py_test(
     name = "inception_v1_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v1_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v1",
@@ -135,9 +135,9 @@ py_test(
 
 py_test(
     name = "inception_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v2_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v2",
@@ -155,9 +155,9 @@ py_test(
 
 py_test(
     name = "inception_v3_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v3_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v3",
@@ -233,8 +233,9 @@ py_library(
 
 py_test(
     name = "resnet_v1_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet_v1_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":resnet_utils",
@@ -268,8 +269,9 @@ py_library(
 
 py_test(
     name = "resnet_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet_v2_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":resnet_utils",
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
index 8ff44fe4b5f21e6d174451c416b7e4107cebcde3..1cc54b15514157de1e48890feca398c906b16ec8 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
@@ -54,18 +54,19 @@ def create_test_input(batch_size, height, width, channels):
     return array_ops.placeholder(dtypes.float32,
                                  (batch_size, height, width, channels))
   else:
-    return math_ops.to_float(
+    return math_ops.cast(
         np.tile(
             np.reshape(
                 np.reshape(np.arange(height), [height, 1]) + np.reshape(
                     np.arange(width), [1, width]), [1, height, width, 1]),
-            [batch_size, 1, 1, channels]))
+            [batch_size, 1, 1, channels]), dtypes.float32)
 
 
 class ResnetUtilsTest(test.TestCase):
 
   def testSubsampleThreeByThree(self):
-    x = array_ops.reshape(math_ops.to_float(math_ops.range(9)), [1, 3, 3, 1])
+    x = array_ops.reshape(math_ops.cast(math_ops.range(9), dtypes.float32),
+                          [1, 3, 3, 1])
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 6, 8]), [1, 2, 2, 1])
@@ -73,7 +74,8 @@ class ResnetUtilsTest(test.TestCase):
       self.assertAllClose(x.eval(), expected.eval())
 
   def testSubsampleFourByFour(self):
-    x = array_ops.reshape(math_ops.to_float(math_ops.range(16)), [1, 4, 4, 1])
+    x = array_ops.reshape(math_ops.cast(math_ops.range(16), dtypes.float32),
+                          [1, 4, 4, 1])
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 8, 10]), [1, 2, 2, 1])
@@ -95,19 +97,20 @@ class ResnetUtilsTest(test.TestCase):
     variable_scope.get_variable_scope().reuse_variables()
 
     y1 = layers.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
-    y1_expected = math_ops.to_float([[14, 28, 43, 26], [28, 48, 66, 37],
-                                     [43, 66, 84, 46], [26, 37, 46, 22]])
+    y1_expected = math_ops.cast([[14, 28, 43, 26], [28, 48, 66, 37],
+                                 [43, 66, 84, 46], [26, 37, 46, 22]],
+                                dtypes.float32)
     y1_expected = array_ops.reshape(y1_expected, [1, n, n, 1])
 
     y2 = resnet_utils.subsample(y1, 2)
-    y2_expected = math_ops.to_float([[14, 43], [43, 84]])
+    y2_expected = math_ops.cast([[14, 43], [43, 84]], dtypes.float32)
     y2_expected = array_ops.reshape(y2_expected, [1, n2, n2, 1])
 
     y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
     y3_expected = y2_expected
 
     y4 = layers.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
-    y4_expected = math_ops.to_float([[48, 37], [37, 22]])
+    y4_expected = math_ops.cast([[48, 37], [37, 22]], dtypes.float32)
     y4_expected = array_ops.reshape(y4_expected, [1, n2, n2, 1])
 
     with self.cached_session() as sess:
@@ -132,14 +135,19 @@ class ResnetUtilsTest(test.TestCase):
     variable_scope.get_variable_scope().reuse_variables()
 
     y1 = layers.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
-    y1_expected = math_ops.to_float([[14, 28, 43, 58, 34], [28, 48, 66, 84, 46],
-                                     [43, 66, 84, 102, 55],
-                                     [58, 84, 102, 120, 64],
-                                     [34, 46, 55, 64, 30]])
+    y1_expected = math_ops.cast([[14, 28, 43, 58, 34],
+                                 [28, 48, 66, 84, 46],
+                                 [43, 66, 84, 102, 55],
+                                 [58, 84, 102, 120, 64],
+                                 [34, 46, 55, 64, 30]],
+                                dtypes.float32)
     y1_expected = array_ops.reshape(y1_expected, [1, n, n, 1])
 
     y2 = resnet_utils.subsample(y1, 2)
-    y2_expected = math_ops.to_float([[14, 43, 34], [43, 84, 55], [34, 55, 30]])
+    y2_expected = math_ops.cast([[14, 43, 34],
+                                 [43, 84, 55],
+                                 [34, 55, 30]],
+                                dtypes.float32)
     y2_expected = array_ops.reshape(y2_expected, [1, n2, n2, 1])
 
     y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
index 055ecff1c32f76e0788fe141f410d6e6aac86cf5..31bdea9fbcd39ac486b9e13c0b7fd24e723f7fe9 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
@@ -54,18 +54,20 @@ def create_test_input(batch_size, height, width, channels):
     return array_ops.placeholder(dtypes.float32,
                                  (batch_size, height, width, channels))
   else:
-    return math_ops.to_float(
+    return math_ops.cast(
         np.tile(
             np.reshape(
                 np.reshape(np.arange(height), [height, 1]) + np.reshape(
                     np.arange(width), [1, width]), [1, height, width, 1]),
-            [batch_size, 1, 1, channels]))
+            [batch_size, 1, 1, channels]),
+        dtypes.float32)
 
 
 class ResnetUtilsTest(test.TestCase):
 
   def testSubsampleThreeByThree(self):
-    x = array_ops.reshape(math_ops.to_float(math_ops.range(9)), [1, 3, 3, 1])
+    x = array_ops.reshape(math_ops.cast(math_ops.range(9), dtypes.float32),
+                          [1, 3, 3, 1])
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 6, 8]), [1, 2, 2, 1])
@@ -73,7 +75,8 @@ class ResnetUtilsTest(test.TestCase):
       self.assertAllClose(x.eval(), expected.eval())
 
   def testSubsampleFourByFour(self):
-    x = array_ops.reshape(math_ops.to_float(math_ops.range(16)), [1, 4, 4, 1])
+    x = array_ops.reshape(math_ops.cast(math_ops.range(16), dtypes.float32),
+                          [1, 4, 4, 1])
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 8, 10]), [1, 2, 2, 1])
@@ -95,19 +98,22 @@ class ResnetUtilsTest(test.TestCase):
     variable_scope.get_variable_scope().reuse_variables()
 
     y1 = layers.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
-    y1_expected = math_ops.to_float([[14, 28, 43, 26], [28, 48, 66, 37],
-                                     [43, 66, 84, 46], [26, 37, 46, 22]])
+    y1_expected = math_ops.cast([[14, 28, 43, 26],
+                                 [28, 48, 66, 37],
+                                 [43, 66, 84, 46],
+                                 [26, 37, 46, 22]],
+                                dtypes.float32)
     y1_expected = array_ops.reshape(y1_expected, [1, n, n, 1])
 
     y2 = resnet_utils.subsample(y1, 2)
-    y2_expected = math_ops.to_float([[14, 43], [43, 84]])
+    y2_expected = math_ops.cast([[14, 43], [43, 84]], dtypes.float32)
     y2_expected = array_ops.reshape(y2_expected, [1, n2, n2, 1])
 
     y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
     y3_expected = y2_expected
 
     y4 = layers.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
-    y4_expected = math_ops.to_float([[48, 37], [37, 22]])
+    y4_expected = math_ops.cast([[48, 37], [37, 22]], dtypes.float32)
     y4_expected = array_ops.reshape(y4_expected, [1, n2, n2, 1])
 
     with self.cached_session() as sess:
@@ -132,17 +138,19 @@ class ResnetUtilsTest(test.TestCase):
     variable_scope.get_variable_scope().reuse_variables()
 
     y1 = layers.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
-    y1_expected = math_ops.to_float([[14, 28, 43, 58, 34],
-                                     [28, 48, 66, 84, 46],
-                                     [43, 66, 84, 102, 55],
-                                     [58, 84, 102, 120, 64],
-                                     [34, 46, 55, 64, 30]])
+    y1_expected = math_ops.cast([[14, 28, 43, 58, 34],
+                                 [28, 48, 66, 84, 46],
+                                 [43, 66, 84, 102, 55],
+                                 [58, 84, 102, 120, 64],
+                                 [34, 46, 55, 64, 30]],
+                                dtypes.float32)
     y1_expected = array_ops.reshape(y1_expected, [1, n, n, 1])
 
     y2 = resnet_utils.subsample(y1, 2)
-    y2_expected = math_ops.to_float([[14, 43, 34],
-                                     [43, 84, 55],
-                                     [34, 55, 30]])
+    y2_expected = math_ops.cast([[14, 43, 34],
+                                 [43, 84, 55],
+                                 [34, 55, 30]],
+                                dtypes.float32)
     y2_expected = array_ops.reshape(y2_expected, [1, n2, n2, 1])
 
     y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index f88b03ec4c2b1f250091594ea12d7d1862029fa2..7dd52df6b68caea6111813837ba1e872acbeccdb 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -4,17 +4,14 @@ exports_files([
     "LICENSE",
 ])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "py_test",
-    "tf_gen_op_wrapper_py",
-)
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -22,7 +19,6 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,6 +31,7 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -43,7 +40,6 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 807741e05f92f6b666c175269742dc1af50c0054..8e13f7f56b23e47f046120b285b1519c6371ddab 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -22,6 +22,7 @@ import time
 
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -32,7 +33,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 10e4556dacbc17ec02c2bd698389b04d517d7076..27bfdeb3601f4fdb9897feee509b06d5e8f9b873 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -25,6 +25,7 @@ import sqlite3
 import numpy as np
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -36,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 398ac314f4b520610ec100273b37c33bc4b5b43a..583bbf97c57cf263f65bc3b0a56b32cc2dce5482 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -537,8 +537,9 @@ py_library(
 
 py_test(
     name = "random_forest_test",
-    size = "large",
+    size = "medium",
     srcs = ["client/random_forest_test.py"],
+    shard_count = 6,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index d8236a0a6fa6d0d0e383e454eb0146bb10b6f49d..0b4125f00f9261bb9cbfaf1eedadd08189ab8be0 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.contrib import losses
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
@@ -35,7 +36,7 @@ FEATURE_IMPORTANCE_NAME = 'global_feature_importance'
 
 def _top_k_generator(k):
   def _top_k(probabilities, targets):
-    targets = math_ops.to_int32(targets)
+    targets = math_ops.cast(targets, dtypes.int32)
     if targets.get_shape().ndims > 1:
       targets = array_ops.squeeze(targets, axis=[1])
     return metrics.mean(nn.in_top_k(probabilities, targets, k))
@@ -48,18 +49,19 @@ def _accuracy(predictions, targets, weights=None):
 
 
 def _r2(probabilities, targets, weights=None):
-  targets = math_ops.to_float(targets)
+  targets = math_ops.cast(targets, dtypes.float32)
   y_mean = math_ops.reduce_mean(targets, 0)
-  squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
+  squares_total = math_ops.reduce_sum(
+      math_ops.squared_difference(targets, y_mean), 0)
   squares_residuals = math_ops.reduce_sum(
-      math_ops.square(targets - probabilities), 0)
+      math_ops.squared_difference(targets, probabilities), 0)
   score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
   return metrics.mean(score, weights=weights)
 
 
 def _squeeze_and_onehot(targets, depth):
   targets = array_ops.squeeze(targets, axis=[1])
-  return array_ops.one_hot(math_ops.to_int32(targets), depth)
+  return array_ops.one_hot(math_ops.cast(targets, dtypes.int32), depth)
 
 
 def _sigmoid_entropy(probabilities, targets, weights=None):
@@ -74,7 +76,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None):
 def _softmax_entropy(probabilities, targets, weights=None):
   return metrics.mean(
       losses.sparse_softmax_cross_entropy(probabilities,
-                                          math_ops.to_int32(targets)),
+                                          math_ops.cast(targets, dtypes.int32)),
       weights=weights)
 
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py b/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py
index a427a02b7cd597b7090155b5e0d300c7d71208c8..926e4dda916e3dd30015aea2d2a7b13df53cdb52 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/hybrid_model.py
@@ -22,6 +22,7 @@ import collections
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework.python.ops import variables as framework_variables
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -110,14 +111,15 @@ class HybridModel(object):
     """The loss to minimize while training."""
 
     if self.is_regression:
-      diff = self.training_inference_graph(data) - math_ops.to_float(labels)
+      diff = self.training_inference_graph(data) - math_ops.cast(
+          labels, dtypes.float32)
       mean_squared_error = math_ops.reduce_mean(diff * diff)
       root_mean_squared_error = math_ops.sqrt(mean_squared_error, name="loss")
       loss = root_mean_squared_error
     else:
       loss = math_ops.reduce_mean(
           nn_ops.sparse_softmax_cross_entropy_with_logits(
-              labels=array_ops.squeeze(math_ops.to_int32(labels)),
+              labels=array_ops.squeeze(math_ops.cast(labels, dtypes.int32)),
               logits=self.training_inference_graph(data)),
           name="loss")
     if self.regularizer:
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
index b9aad36f3d25b9fb7b8b525be54fb7a39394b373..76b1d2b4da269cda71f5b49878f2933d7d9b5776 100644
--- a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
@@ -304,7 +304,7 @@ class TraverseTreeV4Op : public OpKernel {
     auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
     int num_threads = worker_threads->num_threads;
     const int64 costPerTraverse = 500;
-    auto traverse = [this, &set_leaf_ids, &data_set, decision_tree_resource,
+    auto traverse = [&set_leaf_ids, &data_set, decision_tree_resource,
                      num_data](int64 start, int64 end) {
       CHECK(start <= end);
       CHECK(end <= num_data);
diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
index fe2c91c1047fe56710b1a86b2fa3206caf6ff3bc..0243f106814511c1b53a5aacb830b845214a00a3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
@@ -307,7 +307,7 @@ class ProcessInputOp : public OpKernel {
     // from a digits run on local desktop.  Heuristics might be necessary
     // if it really matters that much.
     const int64 costPerUpdate = 1000;
-    auto update = [this, &target, &leaf_ids_tensor, &num_targets, &data_set,
+    auto update = [&target, &leaf_ids_tensor, &num_targets, &data_set,
                    fertile_stats_resource, &locks, &set_lock, &ready_to_split,
                    num_data](int64 start, int64 end) {
       CHECK(start <= end);
@@ -317,7 +317,7 @@ class ProcessInputOp : public OpKernel {
                   static_cast<int32>(end), &ready_to_split);
     };
 
-    auto update_collated = [this, &target, &num_targets, fertile_stats_resource,
+    auto update_collated = [&target, &num_targets, fertile_stats_resource,
                             tree_resource, &leaf_examples, &set_lock,
                             &ready_to_split, &data_set,
                             num_leaves](int64 start, int64 end) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index e04eb60f9b27cfd8b6b4e1502594d4d310ae55cc..774da472f1543f938d1b607ebdef008f7b540211 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -18,10 +18,10 @@
 #include <limits>
 
 #include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/lib/strings/strcat.h"
diff --git a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
index f878e5989cf2b43be960d34a45c4014d412f1c67..5c1fe23981d50ed067ae0bcf587b89d04c515629 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/data_ops.py
@@ -44,7 +44,7 @@ def CastToFloat(tensor):
   if tensor.dtype == dtypes.string:
     return tensor_forest_ops.reinterpret_string_to_float(tensor)
   elif tensor.dtype.is_integer:
-    return math_ops.to_float(tensor)
+    return math_ops.cast(tensor, dtypes.float32)
   else:
     return tensor
 
@@ -195,7 +195,7 @@ def ParseLabelTensorOrDict(labels):
     A 2-D tensor for labels/outputs.
   """
   if isinstance(labels, dict):
-    return math_ops.to_float(
+    return math_ops.cast(
         array_ops.concat(
             [
                 sparse_ops.sparse_tensor_to_dense(
@@ -203,10 +203,12 @@ def ParseLabelTensorOrDict(labels):
                         labels, sparse_tensor.SparseTensor) else labels[k]
                 for k in sorted(labels.keys())
             ],
-            1))
+            1),
+        dtypes.float32)
   else:
     if isinstance(labels, sparse_tensor.SparseTensor):
-      return math_ops.to_float(sparse_ops.sparse_tensor_to_dense(
-          labels, default_value=-1))
+      return math_ops.cast(
+          sparse_ops.sparse_tensor_to_dense(labels, default_value=-1),
+          dtypes.float32)
     else:
-      return math_ops.to_float(labels)
+      return math_ops.cast(labels, dtypes.float32)
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
index 290c16fe3966791ea78986539750caf938a37322..d36d0eb0c46b0d68bea4b6fc29a20dc8876ac539 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -35,7 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _model_ops = loader.load_op_library(
@@ -103,9 +103,9 @@ class TreeVariable(tracking.TrackableResource):
     self._container = container
     self._init_op = None
     super(TreeVariable, self).__init__()
-    self._resource_handle = self.create_resource()
+    self._resource_handle = self._create_resource()
 
-  def create_resource(self):
+  def _create_resource(self):
     if context.executing_eagerly():
       # TODO(allenl): This will leak memory due to kernel caching by the
       # shared_name attribute value (but is better than the alternative of
@@ -117,7 +117,7 @@ class TreeVariable(tracking.TrackableResource):
     return gen_model_ops.decision_tree_resource_handle_op(
         self._container, shared_name=shared_name, name=self._name)
 
-  def initialize(self):
+  def _initialize(self):
     return gen_model_ops.create_tree_variable(
         self.resource_handle,
         self._tree_config,
@@ -126,7 +126,7 @@ class TreeVariable(tracking.TrackableResource):
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
index 9184198cd4c8fd2a7609714d094d5ef2b6868658..7ac68fed20c3c9dfeaff05013e3fc686eea8cc2e 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -32,7 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _stats_ops = loader.load_op_library(
@@ -98,9 +98,9 @@ class FertileStatsVariable(tracking.TrackableResource):
     self._container = container
     self._init_op = None
     super(FertileStatsVariable, self).__init__()
-    self._resource_handle = self.create_resource()
+    self._resource_handle = self._create_resource()
 
-  def create_resource(self):
+  def _create_resource(self):
     if context.executing_eagerly():
       # TODO(allenl): This will leak memory due to kernel caching by the
       # shared_name attribute value (but is better than the alternative of
@@ -112,7 +112,7 @@ class FertileStatsVariable(tracking.TrackableResource):
     return gen_stats_ops.fertile_stats_resource_handle_op(
         self._container, shared_name=shared_name, name=self._name)
 
-  def initialize(self):
+  def _initialize(self):
     return gen_stats_ops.create_fertile_stats_variable(
         self.resource_handle,
         self._stats_config,
@@ -121,7 +121,7 @@ class FertileStatsVariable(tracking.TrackableResource):
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 6f62cd11a9733949c350e35b6b0c436dd097cc33..65a3574e75238bac283fb551a8f0110fa1358bf2 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.tensor_forest.python.ops import data_ops
 from tensorflow.contrib.tensor_forest.python.ops import model_ops
 from tensorflow.contrib.tensor_forest.python.ops import stats_ops
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -540,7 +541,8 @@ class RandomForestGraphs(object):
     for i in range(self.params.num_trees):
       with ops.device(self.variables.device_dummies[i].device):
         sizes.append(self.trees[i].size())
-    return math_ops.reduce_mean(math_ops.to_float(array_ops.stack(sizes)))
+    return math_ops.reduce_mean(
+        math_ops.cast(array_ops.stack(sizes), dtypes.float32))
 
   # pylint: disable=unused-argument
   def training_loss(self, features, labels, name='training_loss'):
@@ -603,7 +605,7 @@ class RandomTreeGraphs(object):
       The last op in the random tree training graph.
     """
     # TODO(gilberth): Use this.
-    unused_epoch = math_ops.to_int32(get_epoch_variable())
+    unused_epoch = math_ops.cast(get_epoch_variable(), dtypes.int32)
 
     if input_weights is None:
       input_weights = []
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 3f1090676865a5cddc61810c385284f0db0fbbbb..91b6d2614a8963c21e35c385411dc4c9956e3146 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,602 +11,54 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-    "tf_copts",
     "tf_cuda_library",
-    "tf_custom_op_library",
     "tf_custom_op_library_additional_deps",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
 )
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "@local_config_tensorrt//:build_defs.bzl",
     "if_tensorrt",
 )
 
-exports_files(glob([
-    "test/testdata/*",
-]))
-
-tf_cuda_cc_test(
-    name = "tensorrt_test_cc",
-    size = "small",
-    srcs = ["tensorrt_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        "//tensorflow/core:gpu_init",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_custom_op_library(
-    name = "python/ops/_trt_ops.so",
-    srcs = [
-        "ops/get_serialized_resource_op.cc",
-        "ops/trt_engine_op.cc",
-    ],
-    deps = [
-        ":trt_shape_function",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
 tf_cuda_library(
     name = "trt_shape_function",
     srcs = ["shape_fn/trt_shfn.cc"],
     hdrs = ["shape_fn/trt_shfn.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":trt_logging",
-        ":trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_logging",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
     ] + if_tensorrt([
         "@local_config_tensorrt//:tensorrt",
     ]) + tf_custom_op_library_additional_deps(),
 )
 
-cc_library(
-    name = "trt_op_kernels",
-    srcs = [
-        "kernels/get_serialized_resource_op.cc",
-        "kernels/trt_engine_op.cc",
-    ],
-    hdrs = [
-        "kernels/trt_engine_op.h",
-    ],
-    copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_conversion",
-        ":trt_logging",
-        ":trt_plugins",
-        ":trt_resources",
-        ":utils",
-        "@com_google_absl//absl/memory",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:stream_executor_headers_lib",
-        "//tensorflow/core/grappler/costs:graph_properties",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]) + tf_custom_op_library_additional_deps(),
-    # TODO(laigd): fix this by merging header file in cc file.
-    alwayslink = 1,  # buildozer: disable=alwayslink-with-hdrs
-)
-
-tf_cuda_cc_test(
-    name = "get_serialized_resource_op_test",
-    size = "small",
-    srcs = ["kernels/get_serialized_resource_op_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":get_serialized_resource_op_op_lib",
-        ":trt_op_kernels",
-        ":trt_resources",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "trt_engine_op",
-        "get_serialized_resource_op",
-    ],
-)
-
-tf_cuda_library(
-    name = "trt_logging",
-    srcs = ["log/trt_logger.cc"],
-    hdrs = ["log/trt_logger.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_gen_op_wrapper_py(
-    name = "trt_ops",
-    deps = [
-        ":get_serialized_resource_op_op_lib",
-        ":trt_engine_op_op_lib",
-        ":trt_logging",
-        ":trt_shape_function",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "trt_ops_loader",
-    srcs = ["python/ops/trt_ops.py"],
-    dso = [
-        ":python/ops/_trt_ops.so",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-    kernels = [
-        ":trt_op_kernels",
-        ":trt_engine_op_op_lib",
-        ":get_serialized_resource_op_op_lib",
-        ":trt_shape_function",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:resources",
-    ],
-)
-
 py_library(
     name = "init_py",
     srcs = [
         "__init__.py",
         "python/__init__.py",
+        "python/trt_convert.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_trt_integration_test_base",
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:errors",
+        "//tensorflow/python/compiler/tensorrt:init_py",
     ],
 )
 
-py_library(
-    name = "trt_ops_py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":trt_ops",
-        ":trt_ops_loader",
-    ],
-)
+# The following rules forward the libraries that were moved in order to not
+# break other internal targets.
 
-py_library(
-    name = "trt_convert_py",
-    srcs = ["python/trt_convert.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":wrap_conversion",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:session",
-        "//tensorflow/python:tf_optimizer",
-        "//tensorflow/python/saved_model:builder",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:tag_constants",
-    ],
-)
-
-# TODO(aaroey): this wrapper has been causing troubles of double linking, so
-# either get rid of it, or split to make it contain minimum dependencies.
-tf_py_wrap_cc(
-    name = "wrap_conversion",
-    srcs = ["trt_conversion.i"],
-    copts = tf_copts(),
-    swig_includes = [
-        "//tensorflow/python:platform/base.i",
-    ],
-    deps = [
-        ":test_utils",
-        ":trt_conversion",
-        ":trt_op_kernels",
-        "//third_party/python_runtime:headers",
-    ],
-)
-
-tf_cuda_library(
-    name = "trt_resources",
-    srcs = [
-        "resources/trt_int8_calibrator.cc",
-        "resources/trt_resource_manager.cc",
-        "resources/trt_resources.cc",
-    ],
-    hdrs = [
-        "resources/trt_int8_calibrator.h",
-        "resources/trt_lru_cache.h",
-        "resources/trt_resource_manager.h",
-        "resources/trt_resources.h",
-    ],
-    deps = [
-        ":trt_allocator",
-        ":trt_logging",
-        ":utils",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_cuda_library(
-    name = "trt_allocator",
-    srcs = ["resources/trt_allocator.cc"],
-    hdrs = ["resources/trt_allocator.h"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_cc_test(
-    name = "trt_allocator_test",
-    size = "small",
-    srcs = ["resources/trt_allocator_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_allocator",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-tf_cc_test(
-    name = "trt_lru_cache_test",
-    size = "small",
-    srcs = ["resources/trt_lru_cache_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_resources",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-# Library for the node-level conversion portion of TensorRT operation creation
-tf_cuda_library(
+alias(
     name = "trt_conversion",
-    srcs = [
-        "convert/convert_graph.cc",
-        "convert/convert_nodes.cc",
-        "convert/trt_optimization_pass.cc",
-    ],
-    hdrs = [
-        "convert/convert_graph.h",
-        "convert/convert_nodes.h",
-        "convert/trt_optimization_pass.h",
-    ],
-    deps = [
-        ":segment",
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_plugins",
-        ":trt_logging",
-        ":trt_resources",
-        ":utils",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:devices",
-        "//tensorflow/core/grappler/clusters:virtual_cluster",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/optimizers:meta_optimizer",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]) + tf_custom_op_library_additional_deps(),
-)
-
-tf_cuda_cc_test(
-    name = "convert_graph_test",
-    size = "medium",
-    srcs = ["convert/convert_graph_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_conversion",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "convert_nodes_test",
-    size = "medium",
-    srcs = ["convert/convert_nodes_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_logging",
-        ":trt_conversion",
-        ":trt_plugins",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-# Library for the segmenting portion of TensorRT operation creation
-cc_library(
-    name = "segment",
-    srcs = ["segment/segment.cc"],
-    hdrs = [
-        "segment/segment.h",
-        "segment/union_find.h",
-    ],
-    deps = [
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-)
-
-tf_cc_test(
-    name = "segment_test",
-    size = "small",
-    srcs = ["segment/segment_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":segment",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_conversion",
 )
 
-# Library for the plugin factory
-tf_cuda_library(
-    name = "trt_plugins",
-    srcs = [
-        "plugin/trt_plugin.cc",
-        "plugin/trt_plugin_factory.cc",
-        "plugin/trt_plugin_utils.cc",
-    ],
-    hdrs = [
-        "plugin/trt_plugin.h",
-        "plugin/trt_plugin_factory.h",
-        "plugin/trt_plugin_utils.h",
-    ],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "trt_plugin_factory_test",
-    size = "small",
-    srcs = ["plugin/trt_plugin_factory_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_plugins",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
-py_library(
-    name = "tf_trt_integration_test_base",
-    srcs = ["test/tf_trt_integration_test_base.py"],
-    deps = [
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-cuda_py_test(
-    name = "trt_convert_test",
-    srcs = ["python/trt_convert_test.py"],
-    additional_deps = [
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python/saved_model:builder",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:signature_constants",
-        "//tensorflow/python/saved_model:signature_def_utils",
-        "//tensorflow/python/saved_model:tag_constants",
-        "//tensorflow/python/saved_model:utils",
-        "//tensorflow/python/tools:freeze_graph_lib",
-        "//tensorflow/python/tools:saved_model_utils",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-)
-
-cuda_py_tests(
-    name = "tf_trt_integration_test",
-    srcs = [
-        "test/base_test.py",
-        "test/batch_matmul_test.py",
-        "test/biasadd_matmul_test.py",
-        "test/binary_tensor_weight_broadcast_test.py",
-        "test/concatenation_test.py",
-        "test/const_broadcast_test.py",
-        "test/conv2d_test.py",
-        "test/dynamic_input_shapes_test.py",
-        "test/identity_output_test.py",
-        "test/int32_test.py",
-        "test/lru_cache_test.py",
-        "test/manual_test.py",
-        "test/memory_alignment_test.py",
-        "test/multi_connection_neighbor_engine_test.py",
-        "test/neighboring_engine_test.py",
-        "test/quantization_test.py",
-        "test/rank_two_test.py",
-        "test/reshape_transpose_test.py",
-        "test/topk_test.py",
-        "test/unary_test.py",
-        "test/vgg_block_nchw_test.py",
-        "test/vgg_block_test.py",
-    ],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-)
-
-cuda_py_test(
-    name = "quantization_mnist_test",
-    srcs = ["test/quantization_mnist_test.py"],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/keras:keras",
-        "//tensorflow/python/estimator:estimator",
-    ],
-    data = [
-        "test/testdata/checkpoint",
-        "test/testdata/model.ckpt-46900.data-00000-of-00001",
-        "test/testdata/model.ckpt-46900.index",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-        "no_tap",  # It is not able to download the mnist data.
-        "no_windows",
-        "nomac",
-    ],
-)
-
-cc_library(
-    name = "utils",
-    srcs = ["convert/utils.cc"],
-    hdrs = ["convert/utils.h"],
-    copts = tf_copts(),
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
+alias(
+    name = "trt_op_kernels",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
 )
 
-cc_library(
-    name = "test_utils",
-    srcs = ["test/utils.cc"],
-    hdrs = ["test/utils.h"],
-    deps = [
-        "//tensorflow/core:lib",
-        "@com_googlesource_code_re2//:re2",
-    ],
+alias(
+    name = "trt_engine_op_op_lib",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_engine_op_op_lib",
 )
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index 140ad4828208ae4844a49bf664955b50cd9e51cd..fd551d70b4385b14b84b7b98a6d16b0c03733d38 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -18,18 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import errors
-
-# pylint: disable=unused-import,wildcard-import,g-import-not-at-top
-try:
-  from tensorflow.contrib.tensorrt.python import *
-except errors.NotFoundError as e:
-  no_trt_message = (
-      '**** Failed to initialize TensorRT. This is either because the TensorRT'
-      ' installation path is not in LD_LIBRARY_PATH, or because you do not have'
-      ' it installed. If not installed, please go to'
-      ' https://developer.nvidia.com/tensorrt to download and install'
-      ' TensorRT ****')
-  print(no_trt_message)
-  raise e
-# pylint: enable=unused-import,wildcard-import,g-import-not-at-top
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.tensorrt.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
deleted file mode 100644
index 1f39f56f6392ba33af3d74fec12c326ed4451cb6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
-
-#include <vector>
-
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/types.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-
-namespace tensorflow {
-namespace tensorrt {
-namespace convert {
-
-// Helper class for the segmenter to determine whether given TF node is
-// supported by TRT.
-class TrtCandidateSelector {
- public:
-  TrtCandidateSelector(const grappler::GraphProperties& graph_properties,
-                       int precision_mode);
-
-  // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added
-  // to TRT subgraph and later converted into TRT engine.
-  Status IsTensorRTCandidate(const tensorflow::Node* node);
-
- private:
-  // The TF-TRT node converter used to verify whether individual node is
-  // supported. It will operate in validation-only mode.
-  TrtNodeValidator validator_;
-
-  // GraphProperties of the graph whose nodes are to be validated by
-  // IsTensorRTCandidate().
-  const grappler::GraphProperties& graph_properties_;
-
-  // Quantization ops are only converted when using quantized precisions.
-  const int precision_mode_;
-};
-
-struct ConversionParams {
-  ConversionParams()
-      : input_graph_def(nullptr),
-        max_batch_size(1),
-        max_workspace_size_bytes(1 << 30),
-        output_graph_def(nullptr),
-        precision_mode(1),
-        minimum_segment_size(3),
-        graph_properties(nullptr),
-        cluster(nullptr),
-        is_dyn_op(false),
-        fixed_input_size(true),
-        use_calibration(true),
-        max_cached_engines(1) {}
-  const tensorflow::GraphDef* input_graph_def;
-  const std::vector<string>* output_names;
-  size_t max_batch_size;
-  size_t max_workspace_size_bytes;
-  tensorflow::GraphDef* output_graph_def;
-  int precision_mode;
-  int minimum_segment_size;
-  const tensorflow::grappler::GraphProperties* graph_properties;
-  const tensorflow::grappler::Cluster* cluster;
-  bool is_dyn_op;  //  Whether to create engine on conversion or execution time
-  bool fixed_input_size;   // Assume non-batch ranks of input tensors are fixed
-  int max_cached_engines;  // maximum number of cached engines
-  bool use_calibration;
-  std::vector<int> cached_engine_batches;  // list of cached engines
-};
-
-// This method extracts calibration information from the resource managers
-// and puts them in to engine nodedefs.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def,
-    bool is_dyn_op);
-
-// - max_batch_size: maximum batch size which can be used for inference for
-//   optimization targets inference run with max batch size.
-// - max_workspace_size_bytes: The upper bound of memory allowance for engine
-//   building.
-tensorflow::Status ConvertGraphDefToTensorRT(
-    const tensorflow::GraphDef& graph_def,
-    const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode = 1, int minimum_segment_size = 3,
-    bool is_dyn_op = false, int max_cached_engines = 1,
-    std::vector<int> cached_engine_batches = {}, bool use_calibration = true);
-
-// Method to call from optimization pass
-tensorflow::Status ConvertAfterShapes(ConversionParams& params);
-
-// Return compile time TensorRT library version information.
-std::vector<int> GetLinkedTensorRTVersion();
-
-// Return runtime time TensorRT library version information.
-std::vector<int> GetLoadedTensorRTVersion();
-
-// Helper method for the conversion, expose for testing.
-std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
-    const ConversionParams& params, const EngineInfo& engine);
-
-}  // namespace convert
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
index 09990fb5e3f93d20ba51929dcc6f49e37e0ff0ac..0a2cf105baf5efb62d0c535c1f2d081973ec0ea3 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -45,7 +45,7 @@ tf_custom_op_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:framework_lite",
     ] + if_tensorrt([
         "@local_config_tensorrt//:tensorrt",
@@ -64,7 +64,7 @@ tf_kernel_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:stream_executor_headers_lib",
     ] + if_tensorrt([
         "@local_config_tensorrt//:tensorrt",
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
index 11335d7da637c813b301b4d4657462f4aae0c190..b683c14c0d77ebac74ad4d9b479c5ed493a3900a 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -21,9 +21,10 @@ limitations under the License.
 #include <vector>
 
 #define EIGEN_USE_GPU
-#include "tensorflow/core/framework/op_kernel.h"
 #include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -38,8 +39,8 @@ void IncrementKernel(const float* d_input, float inc, float* d_output,
   int threads_per_block = 256;
   int blocks_per_grid = (count + threads_per_block - 1) / threads_per_block;
 
-  VecInc<<<threads_per_block, blocks_per_grid, 0, stream>>>(d_input, inc,
-                                                            d_output, count);
+  TF_CHECK_OK(CudaLaunchKernel(VecInc, threads_per_block, blocks_per_grid, 0,
+                               stream, d_input, inc, d_output, count));
 }
 
 // Note: this kernel definition is not needed in the plugin_test rule, but it is
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
index 8d4c893af56689185da72398919e2241d451594b..7c9075142a02546ddd580e861ac87cb86badd739 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
 
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
index 189e9c939b9ffd4450f7ba95fe1abdbbc049b430..fb048d7b19da0f010ed918b147013b20d37ed0dd 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cassert>
 #include <cstring>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
deleted file mode 100644
index 7f0f05aa0a07f84368d79ff033c6d4b0837812d8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_lru_cache.h"
-#include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-#include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/include/NvInfer.h"
-
-namespace tensorflow {
-namespace tensorrt {
-struct TRTInt8Calibrator;
-class TRTCalibrationResource;
-class AsyncHelper;
-//  TODO(Sami): Remove this file?
-
-//  This OP can construct TRTEngine on the fly and if construction of engine
-//  fails, executes equivalent subgraph as a TensorFlow function.
-class TRTEngineOp : public AsyncOpKernel {
- public:
-  explicit TRTEngineOp(OpKernelConstruction* context);
-
-  void ComputeAsync(OpKernelContext* context,
-                    AsyncOpKernel::DoneCallback done) override;
-
- private:
-  // TODO(samikama): context should go to a resource manager!
-
-  // Execute calibration
-  void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
-
-  // Construct a function handle for executing native funcdef graph
-  Status ConstructFunctionHandle(OpKernelContext* ctx);
-
-  // Execute replaced native segment as function Op.
-  void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
-
-  // Execute the tensorrt engine. Returns whether we need to retry by running
-  // the native segment.
-  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context);
-
-  // Allocate necessary resources for calibration
-  Status AllocateCalibrationResources(OpKernelContext* ctx,
-                                      TRTCalibrationResource** cr);
-
-  // Get engine for the input shape
-  EngineContext* GetEngine(const std::vector<TensorShape>& input_shapes,
-                           OpKernelContext* ctx);
-
-  // Return engine batch in cached_engne_batch_sizes_ which is closest to input
-  // batch.
-  bool GetCompatibleCachedEngine(
-      const std::vector<TensorShape>& actual_input_shapes,
-      std::vector<TensorShape>* engine_input_shapes);
-
-  std::vector<string> input_nodes_;
-  std::vector<string> output_nodes_;
-
-  // serialized protobuf segment or trt engine depending on static_engine_ flag.
-  string serialized_segment_;
-
-  // Name of the function for TF native execution of the segment.
-  string funcdef_name_;
-
-  // GraphDef representation of the segment.
-  GraphDef segment_graph_;
-
-  // Engine Precision mode.
-  int precision_mode_;
-
-  // Whether engine is constructed during the conversion or needs to be
-  // constructed from protobuf segment.
-  bool static_engine_;
-
-  // Whether to calibrate INT8 engine.
-  bool calibration_mode_;
-
-  // Batches of the cached engines
-  std::vector<int> cached_engine_batches_;
-
-  // Maximum number of cached engines
-  int max_cached_engines_;
-
-  int64 workspace_size_;
-  mutex engine_mutex_;
-  FunctionLibraryRuntime::Handle native_func_;
-
-  // The finalized calibrator for inference.
-  std::unique_ptr<TRTInt8Calibrator> calibrator_;
-
-  // If true, create calibration graph for INT8 mode. Otherwise, we are using
-  // user-provided quantization ranges.
-  bool use_calibration_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index e2cf253ca07244d99245e1ff83f2e5addc26f28f..c29665b9a82c2f6ec098d34bbb77d40a2d4e85f7 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -19,12 +19,5 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.tensorrt.python.ops import trt_ops
-from tensorflow.contrib.tensorrt.python.trt_convert import add_test_value
-from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph
-from tensorflow.contrib.tensorrt.python.trt_convert import clear_test_values
 from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
-from tensorflow.contrib.tensorrt.python.trt_convert import enable_test_value
-from tensorflow.contrib.tensorrt.python.trt_convert import get_test_value
-from tensorflow.contrib.tensorrt.python.trt_convert import is_tensorrt_enabled
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_ops.py b/tensorflow/contrib/tensorrt/python/ops/trt_ops.py
deleted file mode 100644
index 1fee06854ff0a8bd45249acd5a449fea45a312b6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/python/ops/trt_ops.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Exposes the Python wrapper of TRTEngineOp."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import platform
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tensorrt.ops.gen_trt_ops import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _trt_ops = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_trt_ops.so"))
-else:
-  raise RuntimeError("Windows platforms are not supported")
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 49d72232aa0cfba3f5bf533de04f4d50e65275fd..8f4f1edae0bdeba007b67f18226683b39942df24 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -18,411 +18,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six as _six
-# pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.tensorrt.wrap_conversion import add_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert
-from tensorflow.contrib.tensorrt.wrap_conversion import clear_test_values
-from tensorflow.contrib.tensorrt.wrap_conversion import enable_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version
-from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version
-from tensorflow.contrib.tensorrt.wrap_conversion import get_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import is_tensorrt_enabled
-# pylint: enable=unused-import,line-too-long
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.framework import errors_impl as _impl
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import importer
-from tensorflow.python.framework import ops
-from tensorflow.python.grappler import tf_optimizer
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import saver
-
-
-def _to_bytes(s):
-  """Encode s if it is a sequence of chars."""
-  if isinstance(s, _six.text_type):
-    return s.encode("utf-8", errors="surrogateescape")
-  return s
-
-
-def _to_string(s):
-  """Decode s if it is a sequence of bytes."""
-  if isinstance(s, _six.binary_type):
-    return s.decode("utf-8")
-  return s
-
-
-class TrtPrecisionMode(object):
-  FP32 = "FP32"
-  FP16 = "FP16"
-  INT8 = "INT8"
-
-  @staticmethod
-  def supported_precision_modes():
-    return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
-
-
-def get_tensorrt_rewriter_config(rewriter_config=None,
-                                 max_batch_size=1,
-                                 max_workspace_size_bytes=2 << 20,
-                                 precision_mode=TrtPrecisionMode.FP32,
-                                 minimum_segment_size=3,
-                                 is_dynamic_op=False,
-                                 maximum_cached_engines=1,
-                                 cached_engine_batches=None,
-                                 use_calibration=True):
-  """Returns a RewriterConfig proto for TRT transformation.
-
-  Args:
-    rewriter_config: a template RewriterConfig proto used to create a
-      TRT-enabled RewriterConfig. If None, it will use a default one.
-    max_batch_size: max size for the input batch
-    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-      engine can use at execution time. This corresponds to the 'workspaceSize'
-      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
-    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
-      network and engine at run time.
-    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
-      If the number of cached engines is already at max but none of them can
-      serve the input, the TRTEngineOp will fall back to run the TF function
-      based on which the TRTEngineOp is created.
-    cached_engine_batches: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be <= maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8. If
-      set to True, a calibration graph will be created to calibrate the missing
-      ranges. The calibration graph must be converted to an inference graph
-      using calib_graph_to_infer_graph() after running calibration. if set to
-      False, quantization nodes will be expected for every tensor in the graph
-      (exlcuding those which will be fused). If a range is missing, an error
-      will occur. Please note that accuracy may be negatively affected if there
-      is a mismatch between which tensors TRT quantizes and which tensors were
-      trained with fake quantization.
-
-  Returns:
-    A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
-
-  Raises:
-    TypeError: if any of the parameters are of unexpected type.
-    ValueError: if any of the parameters are of unexpected value.
-  """
-  if rewriter_config is not None and not isinstance(
-      rewriter_config, rewriter_config_pb2.RewriterConfig):
-    raise TypeError("rewriter_config should be a RewriterConfig proto.")
-
-  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
-  if rewriter_config is None:
-    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
-    # need to run constant folding again.
-    rewriter_config_with_trt.optimizers.extend(
-        ["constfold", "layout", "constfold"])
-    rewriter_config_with_trt.meta_optimizer_iterations = (
-        rewriter_config_pb2.RewriterConfig.ONE)
-  else:
-    rewriter_config_with_trt.CopyFrom(rewriter_config)
-
-  if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
-    raise ValueError(("precision mode '{}' is not supported."
-                      "It should be one of {}").format(
-                          precision_mode,
-                          TrtPrecisionMode.supported_precision_modes))
-
-  optimizer = rewriter_config_with_trt.custom_optimizers.add()
-  optimizer.name = "TensorRTOptimizer"
-  optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
-  optimizer.parameter_map["max_batch_size"].i = max_batch_size
-  optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
-  optimizer.parameter_map[
-      "max_workspace_size_bytes"].i = max_workspace_size_bytes
-  optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
-  optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
-  if cached_engine_batches:
-    if not isinstance(cached_engine_batches, list):
-      raise TypeError("cached_engine_batches should be a list.")
-    if len(cached_engine_batches) > maximum_cached_engines:
-      raise ValueError("cached_engine_batches should not contain more than "
-                       "maximum_cached_engines items.")
-    optimizer.parameter_map["cached_engine_batches"].list.i.extend(
-        cached_engine_batches)
-  optimizer.parameter_map["use_calibration"].b = use_calibration
-  return rewriter_config_with_trt
-
-
-def create_inference_graph(input_graph_def,
-                           outputs,
-                           max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20,
-                           precision_mode=TrtPrecisionMode.FP32,
-                           minimum_segment_size=3,
-                           is_dynamic_op=False,
-                           maximum_cached_engines=1,
-                           cached_engine_batches=None,
-                           use_calibration=True,
-                           input_saved_model_dir=None,
-                           input_saved_model_tags=None,
-                           output_saved_model_dir=None,
-                           session_config=None):
-  """Python wrapper for the TRT transformation.
-
-  Args:
-    input_graph_def: a GraphDef object containing a model to be transformed. If
-      set to None, the graph will be read from the SavedModel loaded from
-      input_saved_model_dir.
-    outputs: list of tensors or node names for the model outputs. Only used when
-      input_graph_def is not None.
-    max_batch_size: max size for the input batch.
-    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-      engine can use at execution time. This corresponds to the 'workspaceSize'
-      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
-    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
-      network and engine at run time.
-    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
-      If the number of cached engines is already at max but none of them can
-      serve the input, the TRTEngineOp will fall back to run the TF function
-      based on which the TRTEngineOp is created.
-    cached_engine_batches: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be <= maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8. If
-      set to True, a calibration graph will be created to calibrate the missing
-      ranges. The calibration graph must be converted to an inference graph
-      using calib_graph_to_infer_graph() after running calibration. if set to
-      False, quantization nodes will be expected for every tensor in the graph
-      (exlcuding those which will be fused). If a range is missing, an error
-      will occur. Please note that accuracy may be negatively affected if there
-      is a mismatch between which tensors TRT quantizes and which tensors were
-      trained with fake quantization.
-    input_saved_model_dir: the directory to load the SavedModel which contains
-      the input graph to transforms. Used only when input_graph_def is None.
-    input_saved_model_tags: list of tags to load the SavedModel.
-    output_saved_model_dir: if not None, construct a SavedModel using the
-      returned GraphDef and save it to the specified directory. This option only
-      works when the input graph is loaded from a SavedModel, i.e. when
-      input_saved_model_dir is specified and input_graph_def is None.
-    session_config: the ConfigProto used to create a Session. It's also used as
-      a template to create a TRT-enabled ConfigProto for conversion. If not
-      specified, a default ConfigProto will be used.
-
-  Returns:
-    A GraphDef transformed from input_graph_def (or the SavedModel graph def
-    loaded from input_saved_model_dir, if input_graph_def is not present), where
-    all TRT compatible subgraphs are replaced with TRTEngineOps, and a TF
-    function is added for each of the subgraphs.
-
-    If is_dynamic_op is True, each TRTEngineOp will contain a serialized
-    subgraph GraphDef, which will be converted to a TRT engine at execution time
-    and the TRT engine will be cached for future usage. A new TRT engine will be
-    created each time when none of the cached engines match the input shapes. If
-    it fails to execute the TRT engine or the number of cached engines reaches
-    maximum_cached_engines, the op will fall back to call the corresponding TF
-    function.
-
-    If is_dynamic_op is False, each TRTEngineOp will contain a serialized TRT
-    engine created from the corresponding subgraph. No more engines will be
-    created on the fly, and the op will fall back to call the corresponding TF
-    function when it fails to execute the engine.
-
-  Raises:
-    ValueError: if the combination of the parameters is invalid.
-    RuntimeError: if the TensorRT library version is incompatible.
-  """
-  compiled_version = get_linked_tensorrt_version()
-  loaded_version = get_loaded_tensorrt_version()
-  version_mismatch = False
-  if loaded_version[0] < compiled_version[0]:
-    tf_logging.error(
-        "TensorRT version mismatch. Tensorflow was compiled against " +
-        "TensorRT %s but library loaded from environment is TensorRT %s" %
-        (".".join([str(x) for x in compiled_version]),
-         ".".join([str(x) for x in loaded_version])) +
-        ". Please make sure that correct version of TensorRT " +
-        "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
-    raise RuntimeError("Incompatible TensorRT library version")
-  for i in zip(loaded_version, compiled_version):
-    if i[0] != i[1]:
-      tf_logging.warn("TensorRT mismatch. Compiled against version " +
-                      "%s, but loaded %s. Things may not work" %
-                      (".".join([str(x) for x in compiled_version]),
-                       ".".join([str(x) for x in loaded_version])))
-      version_mismatch = True
-      break
-  if not version_mismatch:
-    tf_logging.info("Running against TensorRT version %s" % ".".join(
-        [str(x) for x in loaded_version]))
-
-  if session_config is None:
-    session_config = config_pb2.ConfigProto()
-
-  if input_saved_model_tags is None:
-    input_saved_model_tags = [tag_constants.SERVING]
-  saved_model_loader = None
-  grappler_meta_graph_def = None
-
-  if input_graph_def is None:
-    # Read from SavedModel and freeze the graph if necessary.
-    if input_saved_model_dir is None:
-      raise ValueError("input_graph_def and input_saved_model_dir cannot be "
-                       "both None")
-    with ops.Graph().as_default():
-      with session.Session(config=session_config) as sess:
-        saved_model_loader = loader_impl.SavedModelLoader(input_saved_model_dir)
-        input_meta_graph_def = saved_model_loader.load(sess,
-                                                       input_saved_model_tags)
-        output_node_names = set()
-
-        def _gather_names(tensor_info):
-          """Get the node names from a TensorInfo."""
-          return set(
-              [tensor_info[key].name.split(":")[0] for key in tensor_info])
-
-        # Get input and outputs from all SignatureDef.
-        for key in input_meta_graph_def.signature_def:
-          signature_def = input_meta_graph_def.signature_def[key]
-          output_node_names.update(_gather_names(signature_def.inputs))
-          output_node_names.update(_gather_names(signature_def.outputs))
-
-        # Freeze the variables in the SavedModel graph and copy the frozen
-        # graph over.
-        frozen_graph_def = graph_util.convert_variables_to_constants(
-            sess, sess.graph.as_graph_def(add_shapes=True),
-            list(output_node_names))
-        grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
-        grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
-
-        # Copy the collections that are not variables.
-        for key in input_meta_graph_def.collection_def:
-          # TODO(laigd): currently we use the collection key to filter out
-          # collections that depend on variable ops, but this may miss some
-          # other user-defined collections. A better way would be to use
-          # CollectionDef::NodeList for the filtering.
-          if key not in [
-              "variables", "local_variables", "model_variables",
-              "trainable_variables", "train_op", "table_initializer"
-          ]:
-            grappler_meta_graph_def.collection_def[key].CopyFrom(
-                input_meta_graph_def.collection_def[key])
-
-        # Copy other information.
-        grappler_meta_graph_def.meta_info_def.CopyFrom(
-            input_meta_graph_def.meta_info_def)
-        for key in input_meta_graph_def.signature_def:
-          grappler_meta_graph_def.signature_def[key].CopyFrom(
-              input_meta_graph_def.signature_def[key])
-        # TODO(laigd): maybe add back AssetFileDef.
-  else:
-    if output_saved_model_dir is not None:
-      raise ValueError("output_saved_model_dir cannot be set when "
-                       "input_graph_def is set")
-    # Create MetaGraphDef from input graph.
-    graph = ops.Graph()
-    with graph.as_default():
-      importer.import_graph_def(input_graph_def, name="")
-    grappler_meta_graph_def = saver.export_meta_graph(
-        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
-    if outputs:
-      output_collection = meta_graph_pb2.CollectionDef()
-      output_list = output_collection.node_list.value
-      for i in outputs:
-        if isinstance(i, ops.Tensor):
-          output_list.append(_to_bytes(i.name))
-        else:
-          output_list.append(_to_bytes(i))
-      # TODO(laigd): use another key as the outputs are really not train_op.
-      grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
-          output_collection)
-
-  # Create TRT-enabled ConfigProto.
-  session_config_with_trt = config_pb2.ConfigProto()
-  session_config_with_trt.CopyFrom(session_config)
-  rewriter_config = None
-  if (session_config_with_trt.HasField("graph_options") and
-      session_config_with_trt.graph_options.HasField("rewrite_options")):
-    rewriter_config = session_config_with_trt.graph_options.rewrite_options
-  rewriter_config_with_trt = get_tensorrt_rewriter_config(
-      rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
-      minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batches, use_calibration)
-  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
-      rewriter_config_with_trt)
-
-  # Run Grappler.
-  transformed_graph_def = tf_optimizer.OptimizeGraph(
-      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
-
-  # Optionally write the transformed graphdef as SavedModel.
-  if output_saved_model_dir is not None:
-    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
-    with ops.Graph().as_default():
-      importer.import_graph_def(transformed_graph_def, name="")
-      # We don't use TRT here.
-      with session.Session(config=session_config) as sess:
-        saved_model_builder.add_meta_graph_and_variables(
-            sess,
-            input_saved_model_tags,
-            signature_def_map=grappler_meta_graph_def.signature_def)
-    # Ignore other meta graphs from the input SavedModel.
-    saved_model_builder.save()
-
-  return transformed_graph_def
-
-
-def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False):
-  """Convert an existing calibration graph to inference graph.
-
-  Args:
-    calibration_graph_def: the calibration GraphDef object with calibration data
-    is_dynamic_op: whether to create dynamic static engines from calibration
-
-  Returns:
-    New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
-  Raises:
-    RuntimeError: if the returned status message is malformed.
-  """
-
-  is_calib_graph = False
-  for n in calibration_graph_def.node:
-    if n.op == "TRTEngineOp":
-      is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s
-  if not is_calib_graph:
-    tf_logging.error(
-        "Not a calib graph. Doesn't seem to contain any calibration nodes.")
-    return None
-  graph_str = calibration_graph_def.SerializeToString()
-  out = calib_convert(graph_str, is_dynamic_op)
-  status = _to_string(out[0])
-  output_graph_def_string = out[1]
-  del graph_str  # Save some memory
-  if len(status) < 2:
-    raise _impl.UnknownError(None, None, status)
-  if status[:2] != "OK":
-    msg = status.split(";")
-    if len(msg) == 1:
-      raise RuntimeError("Status message is malformed {}".format(status))
-    # pylint: disable=protected-access
-    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
-                                         int(msg[0]))
-    # pylint: enable=protected-access
-  output_graph_def = graph_pb2.GraphDef()
-  output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string  # Save some memory
-  return output_graph_def
+from tensorflow.python.compiler.tensorrt import trt_convert
+
+
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=trt_convert.DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=trt_convert.TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    output_saved_model_dir=None,
+    session_config=None):
+  return trt_convert.create_inference_graph(
+      input_graph_def=input_graph_def,
+      outputs=outputs,
+      max_batch_size=max_batch_size,
+      max_workspace_size_bytes=max_workspace_size_bytes,
+      precision_mode=precision_mode,
+      minimum_segment_size=minimum_segment_size,
+      is_dynamic_op=is_dynamic_op,
+      maximum_cached_engines=maximum_cached_engines,
+      cached_engine_batches=cached_engine_batches,
+      input_saved_model_dir=input_saved_model_dir,
+      input_saved_model_tags=input_saved_model_tags,
+      output_saved_model_dir=output_saved_model_dir,
+      session_config=session_config)
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
deleted file mode 100644
index 3ef18e3e150e0d421baa76cbda0b0daa929a7e91..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities to test TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import importer
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import loader
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.saved_model import utils
-from tensorflow.python.tools import saved_model_utils
-
-
-class TrtConvertTest(test_util.TensorFlowTestCase):
-  """Class to test Tensorflow-TensorRT integration python API."""
-
-  def testGetTensorrtRewriterConfig(self):
-    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
-        rewriter_config=None,
-        max_batch_size=128,
-        max_workspace_size_bytes=1234,
-        precision_mode="INT8",
-        minimum_segment_size=10,
-        is_dynamic_op=True,
-        maximum_cached_engines=2,
-        cached_engine_batches=[1, 128])
-    self.assertEqual(["constfold", "layout", "constfold"],
-                     rewriter_cfg.optimizers)
-    self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
-                     rewriter_cfg.meta_optimizer_iterations)
-    trt_optimizer = None
-    for optimizer in rewriter_cfg.custom_optimizers:
-      if optimizer.name == "TensorRTOptimizer":
-        self.assertTrue(trt_optimizer is None)
-        trt_optimizer = optimizer
-    self.assertTrue(trt_optimizer is not None)
-    for key in [
-        "minimum_segment_size", "max_batch_size", "is_dynamic_op",
-        "max_workspace_size_bytes", "precision_mode", "maximum_cached_engines",
-        "cached_engine_batches"
-    ]:
-      self.assertTrue(key in trt_optimizer.parameter_map)
-    self.assertEqual(10, trt_optimizer.parameter_map["minimum_segment_size"].i)
-    self.assertEqual(128, trt_optimizer.parameter_map["max_batch_size"].i)
-    self.assertEqual(True, trt_optimizer.parameter_map["is_dynamic_op"].b)
-    self.assertEqual(1234,
-                     trt_optimizer.parameter_map["max_workspace_size_bytes"].i)
-    self.assertEqual(
-        trt_convert._to_bytes("INT8"),
-        trt_optimizer.parameter_map["precision_mode"].s)
-    self.assertEqual(2, trt_optimizer.parameter_map["maximum_cached_engines"].i)
-    self.assertEqual(
-        [1, 128], trt_optimizer.parameter_map["cached_engine_batches"].list.i)
-
-  def _GetConfigProto(self):
-    """Get ConfigProto for session creation."""
-    config = config_pb2.ConfigProto(
-        gpu_options=config_pb2.GPUOptions(allow_growth=True))
-    return config
-
-  def _GetGraph(self):
-    """Get the graph for testing."""
-    g = ops.Graph()
-    with g.as_default():
-      with g.device("/GPU:0"):
-        inp = array_ops.placeholder(
-            dtype=dtypes.float32, shape=[None, 1, 1], name="input")
-        var = variables.VariableV1([[[1.0]]], dtype=dtypes.float32, name="v1")
-        add = inp + var.value()
-        mul = inp * add
-        add = mul + add
-        out = array_ops.identity(add, name="output")
-    return g, var, inp, out
-
-  def _GetGraphDef(self):
-    """Get the graph def for testing."""
-    g, var, _, _ = self._GetGraph()
-    with self.session(graph=g, config=self._GetConfigProto()) as sess:
-      sess.run(var.initializer)
-      graph_def = graph_util.convert_variables_to_constants(
-          sess, g.as_graph_def(add_shapes=True), ["output"])
-    node_name_to_op = {node.name: node.op for node in graph_def.node}
-    self.assertEqual({
-        "v1": "Const",
-        "v1/read": "Identity",
-        "input": "Placeholder",
-        "add": "Add",
-        "mul": "Mul",
-        "add_1": "Add",
-        "output": "Identity"
-    }, node_name_to_op)
-    return graph_def
-
-  def _WriteInputSavedModel(self, input_saved_model_dir):
-    """Write the saved model as an input for testing."""
-    g, var, inp, out = self._GetGraph()
-    signature_def = signature_def_utils.build_signature_def(
-        inputs={"myinput": utils.build_tensor_info(inp)},
-        outputs={"myoutput": utils.build_tensor_info(out)},
-        method_name=signature_constants.PREDICT_METHOD_NAME)
-    saved_model_builder = builder.SavedModelBuilder(input_saved_model_dir)
-    with self.session(graph=g, config=self._GetConfigProto()) as sess:
-      sess.run(var.initializer)
-      saved_model_builder.add_meta_graph_and_variables(
-          sess, [tag_constants.SERVING],
-          signature_def_map={"mypredict": signature_def})
-    saved_model_builder.save()
-
-  def _TestCreateInferenceGraph(self,
-                                input_saved_model_dir=None,
-                                output_saved_model_dir=None):
-    """General method to test trt_convert.create_inference_graph()."""
-    input_graph_def = None if input_saved_model_dir else self._GetGraphDef()
-    output_graph_def = trt_convert.create_inference_graph(
-        input_graph_def, ["output"],
-        input_saved_model_dir=input_saved_model_dir,
-        output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
-    graph_defs_to_verify = [output_graph_def]
-    if output_saved_model_dir is not None:
-      saved_model_graph_def = saved_model_utils.get_meta_graph_def(
-          output_saved_model_dir, tag_constants.SERVING).graph_def
-      self.assertTrue(isinstance(saved_model_graph_def, graph_pb2.GraphDef))
-      graph_defs_to_verify.append(saved_model_graph_def)
-
-    for graph_def in graph_defs_to_verify:
-      node_name_to_op = {node.name: node.op for node in graph_def.node}
-      self.assertEqual({
-          "input": "Placeholder",
-          "TRTEngineOp_0": "TRTEngineOp",
-          "output": "Identity"
-      }, node_name_to_op)
-
-  def testCreateInferenceGraph_BasicConversion(self):
-    """Test case for trt_convert.create_inference_graph()."""
-    if not trt_convert.is_tensorrt_enabled():
-      return
-
-    # Use GraphDef as input.
-    self._TestCreateInferenceGraph()
-
-    # Use SavedModel as input.
-    tmp_dir = self.get_temp_dir()
-    input_saved_model_dir = os.path.join(tmp_dir, "in_dir1")
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir1")
-    self._WriteInputSavedModel(input_saved_model_dir)
-    self._TestCreateInferenceGraph(input_saved_model_dir,
-                                   output_saved_model_dir)
-
-  def _TestRun(self, sess, batch_size, expect_engine_is_run):
-    trt_convert.clear_test_values("")
-    result = sess.run("output:0", feed_dict={"input:0": [[[1.0]]] * batch_size})
-    self.assertAllEqual([[[4.0]]] * batch_size, result)
-    execute_engine_test_value = ("done" if expect_engine_is_run else "")
-    execute_native_segment_test_value = ("" if expect_engine_is_run else "done")
-    self.assertEqual(
-        execute_engine_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
-    self.assertEqual(
-        execute_native_segment_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
-
-  def testCreateInferenceGraph_MinimumSegmentSize(self):
-    if not trt_convert.is_tensorrt_enabled():
-      return
-    output_graph_def = trt_convert.create_inference_graph(
-        self._GetGraphDef(), ["output"],
-        minimum_segment_size=5,
-        is_dynamic_op=False)
-    node_name_to_op = {node.name: node.op for node in output_graph_def.node}
-    self.assertEqual({
-        "v1/read": "Const",
-        "input": "Placeholder",
-        "add": "Add",
-        "mul": "Mul",
-        "add_1": "Add",
-        "output": "Identity"
-    }, node_name_to_op)
-
-  def testCreateInferenceGraph_DynamicOp(self):
-    if not trt_convert.is_tensorrt_enabled():
-      return
-    trt_convert.enable_test_value()
-
-    tmp_dir = self.get_temp_dir()
-    input_saved_model_dir = os.path.join(tmp_dir, "in_dir2")
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir2")
-    self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        is_dynamic_op=True,
-        maximum_cached_engines=2,
-        input_saved_model_dir=input_saved_model_dir,
-        output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
-
-    # Test the output GraphDef.
-    with ops.Graph().as_default():
-      importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
-        # Run with batch size 1, a new engine is created and cached.
-        self._TestRun(sess, 1, True)
-        # Run with batch size 2, a new engine is created and cached.
-        self._TestRun(sess, 2, True)
-        # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should evict an old engine and create a new one.
-        self._TestRun(sess, 3, True)
-
-    # Test the output SavedModel
-    with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
-        loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
-        # Run with batch size 1, a new engine is created and cached.
-        self._TestRun(sess, 1, True)
-        # Run with batch size 2, a new engine is created and cached.
-        self._TestRun(sess, 2, True)
-        # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should evict an old engine and create a new one.
-        self._TestRun(sess, 3, True)
-
-  def testCreateInferenceGraph_StaticOp(self):
-    if not trt_convert.is_tensorrt_enabled():
-      return
-    trt_convert.enable_test_value()
-
-    tmp_dir = self.get_temp_dir()
-    input_saved_model_dir = os.path.join(tmp_dir, "in_dir3")
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir3")
-    self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        max_batch_size=1,
-        is_dynamic_op=False,
-        maximum_cached_engines=2,  # This is noop, added just for testing.
-        input_saved_model_dir=input_saved_model_dir,
-        output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
-
-    # Test the output GraphDef.
-    with ops.Graph().as_default():
-      importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
-        # Run with batch size 1, the default engine embedded in the graphdef
-        # will be used.
-        self._TestRun(sess, 1, True)
-        # Run with batch size 2, which exceed the max_batch_size, it should fall
-        # back to TF function.
-        self._TestRun(sess, 2, False)
-
-    # Test the output SavedModel
-    with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
-        loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
-        # Run with batch size 1, the default engine embedded in the graphdef
-        # will be used.
-        self._TestRun(sess, 1, True)
-        # Run with batch size 2, which exceed the max_batch_size, it should fall
-        # back to TF function.
-        self._TestRun(sess, 2, False)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
deleted file mode 100644
index 9c3698e5d1cc5d6d8d31a8fcaf03d103f1e1915d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-std::shared_ptr<TRTResourceManager>
-tensorflow::tensorrt::TRTResourceManager::instance() {
-  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
-  return instance_;
-}
-
-std::shared_ptr<tensorflow::ResourceMgr>
-tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
-  // mutex is held for lookup only. Most instantiations where mutex will be held
-  // longer will be during op creation and should be ok.
-  tensorflow::mutex_lock lock(map_mutex_);
-  auto s = managers_.find(op_name);
-  if (s == managers_.end()) {
-    auto it = managers_.emplace(
-        op_name, std::make_shared<tensorflow::ResourceMgr>(op_name));
-    VLOG(1) << "Returning a new manager " << op_name;
-    return it.first->second;
-  }
-  VLOG(1) << "Returning old manager " << op_name;
-  return s->second;
-}
-
-}  // namespace tensorrt
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
deleted file mode 100644
index 19f39e6d3db1571573fb290dd2c30fd43ea604ef..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
-#include <memory>
-
-#include <string>
-#include <unordered_map>
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-class TRTResourceManager {
-  TRTResourceManager() = default;
-
- public:
-  static std::shared_ptr<TRTResourceManager> instance();
-  // returns a manager for given op, if it doesn't exists it creates one
-  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
-
- private:
-  std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
-      managers_;
-  tensorflow::mutex map_mutex_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index f30dba59ad55317d7ad7730e4dc66c9aba4e6a6b..5c60d6b589ed6a16276226726d989e949bcbf9d7 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -14,14 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <string>
 #include <vector>
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorrt/include/NvInfer.h"
 
diff --git a/tensorflow/contrib/tensorrt/test/manual_test.py b/tensorflow/contrib/tensorrt/test/manual_test.py
deleted file mode 100644
index aad7b9f30728cbb3f4ec5fa730c5dbe46fe9fc3f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/manual_test.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Basic tests for TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ast
-import os
-
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
-from tensorflow.core.framework import graph_pb2
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-class ManualTest(trt_test.TfTrtIntegrationTestBase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    super(ManualTest, self).__init__(methodName)
-    self._params_map = None
-
-  def _GetEnv(self):
-    """Get an environment variable specifying the manual test parameters.
-
-    The value of the environment variable is the string representation of a dict
-    which should contain the following keys:
-    - 'graph_path': the file path to the serialized frozen graphdef
-    - 'input_names': TfTrtIntegrationTestParams.input_names
-    - 'input_dims': TfTrtIntegrationTestParams.input_dims
-    - 'expected_output_dims': TfTrtIntegrationTestParams.expected_output_dims
-    - 'output_name': the name of op to fetch
-    - 'expected_engines_to_run': ExpectedEnginesToRun() will return this
-    - 'expected_engines_to_build': ExpectedEnginesToBuild() will return this
-    - 'max_batch_size': ConversionParams.max_batch_size
-
-    Returns:
-      The value of the environment variable.
-    """
-    return os.getenv('TRT_MANUAL_TEST_PARAMS', '')
-
-  def _GetParamsMap(self):
-    """Parse the environment variable as a dict and return it."""
-    if self._params_map is None:
-      self._params_map = ast.literal_eval(self._GetEnv())
-    return self._params_map
-
-  def GetParams(self):
-    """Testing conversion of manually provided frozen graph."""
-    params_map = self._GetParamsMap()
-    gdef = graph_pb2.GraphDef()
-    with gfile.Open(params_map['graph_path'], 'rb') as f:
-      gdef.ParseFromString(f.read())
-    return trt_test.TfTrtIntegrationTestParams(
-        gdef=gdef,
-        input_names=params_map['input_names'],
-        input_dims=[params_map['input_dims']],
-        output_names=params_map['output_names'],
-        expected_output_dims=[params_map['expected_output_dims']])
-
-  def GetConversionParams(self, run_params):
-    """Return a ConversionParams for test."""
-    conversion_params = super(ManualTest, self).GetConversionParams(run_params)
-    params_map = self._GetParamsMap()
-    if 'max_batch_size' in params_map:
-      conversion_params = conversion_params._replace(
-          max_batch_size=params_map['max_batch_size'])
-    return conversion_params
-
-  def ExpectedEnginesToBuild(self, run_params):
-    """Return the expected engines to build."""
-    return self._GetParamsMap()['expected_engines_to_build']
-
-  def ExpectedEnginesToRun(self, run_params):
-    """Return the expected engines to run."""
-    params_map = self._GetParamsMap()
-    if 'expected_engines_to_run' in params_map:
-      return params_map['expected_engines_to_run']
-    return self.ExpectedEnginesToBuild(run_params)
-
-  def ExpectedAbsoluteTolerance(self, run_params):
-    """The absolute tolerance to compare floating point results."""
-    params_map = self._GetParamsMap()
-    if 'atol' in params_map:
-      return params_map['atol']
-    return 1.e-3
-
-  def ExpectedRelativeTolerance(self, run_params):
-    """The relative tolerance to compare floating point results."""
-    params_map = self._GetParamsMap()
-    if 'rtol' in params_map:
-      return params_map['rtol']
-    return 1.e-3
-
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    return len(self._GetEnv())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
deleted file mode 100644
index 090aa8bdb0487973e186631af3b4edac48096a5f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Script to test TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import numpy as np
-import six as _six
-
-# normally we should do import tensorflow as tf and then
-# tf.placeholder, tf.constant, tf.nn.conv2d etc but
-# it looks like internal builds don't like it so
-# importing every module individually
-
-from tensorflow.contrib import tensorrt as trt
-from tensorflow.core.protobuf import config_pb2 as cpb2
-from tensorflow.core.protobuf import rewriter_config_pb2 as rwpb2
-from tensorflow.python.client import session as csess
-from tensorflow.python.framework import constant_op as cop
-from tensorflow.python.framework import dtypes as dtypes
-from tensorflow.python.framework import importer as importer
-from tensorflow.python.framework import ops as ops
-from tensorflow.python.ops import array_ops as aops
-from tensorflow.python.ops import math_ops as mops
-from tensorflow.python.ops import nn as nn
-from tensorflow.python.ops import nn_ops as nn_ops
-
-
-def py2bytes(inp):
-  return inp
-
-
-def py3bytes(inp):
-  return inp.encode("utf-8", errors="surrogateescape")
-
-
-def py2string(inp):
-  return inp
-
-
-def py3string(inp):
-  return inp.decode("utf-8")
-
-
-if _six.PY2:
-  to_bytes = py2bytes
-  to_string = py2string
-else:
-  to_bytes = py3bytes
-  to_string = py3string
-
-
-def get_multi_engine_graph_def(mode="FP32"):
-  """Create a simple graph and return its graph_def."""
-  dtype = dtypes.float32
-  if mode.upper() == "FP16":
-    dtype = dtypes.float16
-  else:
-    pass
-
-  g = ops.Graph()
-  with g.as_default():
-    x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype)
-    with g.name_scope("Global_scope"):
-      with g.name_scope("first_scope"):
-        e = cop.constant(
-            np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype)
-        conv = nn.conv2d(
-            input=x,
-            filter=e,
-            data_format="NCHW",
-            strides=[1, 1, 1, 1],
-            padding="VALID",
-            name="conv")
-        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype)
-        t = conv * b
-
-        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype)
-        q = conv / b
-      edge = mops.sin(q)
-      edge1 = mops.cos(conv)
-      with g.name_scope("test_scope"):
-        de = edge + edge1
-        t -= edge1
-        q *= edge
-        t += q
-        t -= de
-    k = aops.squeeze(t, name="output")
-  print(k.dtype)
-  return g.as_graph_def()
-
-
-def get_simple_graph_def():
-  """Create a simple graph and return its graph_def."""
-  g = ops.Graph()
-  with g.as_default():
-    a = aops.placeholder(
-        dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
-    e = cop.constant(
-        [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
-        name="weights",
-        dtype=dtypes.float32)
-    conv = nn.conv2d(
-        input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
-    b = cop.constant(
-        [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
-    t = nn.bias_add(conv, b, name="biasAdd")
-    relu = nn.relu(t, "relu")
-    idty = aops.identity(relu, "ID")
-    v = nn_ops.max_pool(
-        idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-    aops.squeeze(v, name="output")
-  return g.as_graph_def()
-
-
-def execute_graph(gdef, dumm_inp):
-  """Run given graphdef once."""
-  print("executing")
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
-  ops.reset_default_graph()
-  g = ops.Graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=gdef, return_elements=["input", "output"])
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-  with csess.Session(config=sessconfig, graph=g) as sess:
-    val = sess.run(out, {inp: dumm_inp})
-  return val
-
-
-# Use real data that is representative of the inference dataset
-# for calibration. For this test script it is random data.
-def execute_calibration(gdef, dumm_inp):
-  """Run given calibration graph multiple times."""
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  ops.reset_default_graph()
-  g = ops.Graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=gdef, return_elements=["input", "output"])
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-  with csess.Session(
-      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
-    # run over real calibration data here, we are mimicking a calibration set of
-    # 30 different batches. Use as much calibration data as you want
-    for _ in range(30):
-      val = sess.run(out, {inp: dumm_inp})
-  return val
-
-
-def user(multi_engine,
-         run_graph=execute_graph,
-         run_calibration=execute_calibration):
-  """Example function that converts a graph to TFTRT graph."""
-  if multi_engine:
-    inp_dims = (2, 3, 7, 5)
-    orig_graph = get_multi_engine_graph_def()
-  else:
-    inp_dims = (100, 24, 24, 2)
-    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
-  dummy_input = np.random.random_sample(inp_dims)
-  # Get optimized graph
-  trt_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batches=[])
-  o1 = run_graph(orig_graph, dummy_input)
-  o2 = run_graph(trt_graph, dummy_input)
-  o3 = run_graph(trt_graph, dummy_input)
-  assert np.array_equal(o1, o2)
-  assert np.array_equal(o3, o2)  # sanity check
-  fp16_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batches=[])
-  int8_calib_gdef = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batches=[])
-  o4 = run_graph(fp16_graph, dummy_input)
-  _ = run_calibration(int8_calib_gdef, dummy_input)
-  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
-  o5 = run_graph(int8_graph, dummy_input)
-  print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
-  print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
-  print("Pass")
-
-
-def auto(multi_engine):
-  """Run the conversion as an optimization pass."""
-  if multi_engine:
-    inp_dims = (2, 3, 7, 5)
-    orig_graph = get_multi_engine_graph_def()
-  else:
-    inp_dims = (100, 24, 24, 2)
-    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
-  dummy_input = np.random.random_sample(inp_dims)
-  opt_config = rwpb2.RewriterConfig()
-  opt_config.meta_optimizer_iterations = opt_config.ONE
-  opt_config.optimizers.extend(["constfold", "layout"])
-  custom_op = opt_config.custom_optimizers.add()
-  custom_op.name = "TensorRTOptimizer"
-  custom_op.parameter_map["minimum_segment_size"].i = 3
-  custom_op.parameter_map["precision_mode"].s = to_bytes("FP32")
-  custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
-  custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
-  print(custom_op)
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
-  sessconfig = cpb2.ConfigProto(
-      gpu_options=gpu_options, graph_options=graph_options)
-  print(sessconfig)
-  g = ops.Graph()
-  ops.reset_default_graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=orig_graph, return_elements=["input", "output"], name="")
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-    with csess.Session(config=sessconfig, graph=g) as sess:
-      val = sess.run(out, {inp: dummy_input})
-  print(val.shape)
-
-
-if "__main__" in __name__:
-  P = argparse.ArgumentParser(
-      prog="tftrt_test",
-      description="Example utilization of TensorFlow-TensorRT integration")
-  P.add_argument(
-      "--automatic",
-      "-a",
-      action="store_true",
-      help="Do TRT conversion automatically",
-      default=False)
-  P.add_argument(
-      "--multi-engine",
-      "-m",
-      action="store_true",
-      help="Use a graph that will result in 2 engines",
-      default=False)
-  flags, unparsed = P.parse_known_args()
-  if flags.automatic:
-    auto(flags.multi_engine)
-  else:
-    user(flags.multi_engine)
diff --git a/tensorflow/contrib/tensorrt/test/utils.cc b/tensorflow/contrib/tensorrt/test/utils.cc
deleted file mode 100644
index 276308b3a0a6ce864969afb0179c6a3f00d6b70b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/utils.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/tensorrt/test/utils.h"
-
-#include <unordered_map>
-#include <vector>
-
-#include "re2/re2.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-namespace tensorrt {
-namespace test {
-
-// TODO(aaroey): make this class thread-safe.
-class TestValueManager {
- public:
-  static TestValueManager* singleton() {
-    static TestValueManager* manager = new TestValueManager();
-    return manager;
-  }
-
-  void Enable() {
-    VLOG(1) << "Enabling test value";
-    enabled_ = true;
-  }
-
-  void Add(const string& label, const string& value) {
-    if (TF_PREDICT_FALSE(enabled_)) {
-      QCHECK_NE("", value);
-      VLOG(1) << "Adding test value: " << label << " -> " << value;
-      values_.insert({label, value});
-    }
-  }
-
-  string Get(const string& label) {
-    if (TF_PREDICT_FALSE(enabled_)) {
-      VLOG(1) << "Getting test value by " << label;
-      auto itr = values_.find(label);
-      if (itr == values_.end()) return "";
-      return itr->second;
-    }
-    return "";
-  }
-
-  void Clear(const string& pattern) {
-    if (TF_PREDICT_FALSE(enabled_)) {
-      VLOG(1) << "Clearing test values";
-      if (pattern.empty()) {
-        values_.clear();
-        return;
-      }
-      std::vector<string> keys_to_clear;
-      for (const auto& kv : values_) {
-        if (RE2::FullMatch(kv.first, pattern)) {
-          keys_to_clear.push_back(kv.first);
-        }
-      }
-      for (const string& key : keys_to_clear) {
-        values_.erase(key);
-      }
-    }
-  }
-
- private:
-  TestValueManager() : enabled_(false) {}
-
-  bool enabled_;
-  std::unordered_map<string, string> values_;
-};
-
-void EnableTestValue() { TestValueManager::singleton()->Enable(); }
-
-void ClearTestValues(const string& pattern) {
-  TestValueManager::singleton()->Clear(pattern);
-}
-
-void AddTestValue(const string& label, const string& value) {
-  TestValueManager::singleton()->Add(label, value);
-}
-
-string GetTestValue(const string& label) {
-  return TestValueManager::singleton()->Get(label);
-}
-
-}  // namespace test
-}  // namespace tensorrt
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
deleted file mode 100644
index 6ea15fb8eff13663625420288a37ba002d57fa47..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/* Wrap trt_conversion */
-%{
-#define SWIG_FILE_WITH_INIT
-%}
-%include "std_pair.i"
-%include "tensorflow/python/platform/base.i"
-
-%{
-PyObject* pair_helper(std::pair<string, string>* in) {
-  PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
-  first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
-  if (!first) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
-    }
-    return NULL;
-  }
-  second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
-  if (!second) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Pair conversion second argument failed");
-    }
-    return NULL;
-  }
-  tuple = Py_BuildValue("(OO)", first, second);
-  if (!tuple) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Tuple creation from pair<string,string> failed!");
-    }
-    return NULL;
-  }
-  return tuple;
-}
-
-struct version_struct{
-  int vmajor;
-  int vminor;
-  int vpatch;
-};
-
-PyObject* version_helper(version_struct* in) {
-  PyObject *tuple(nullptr);
-  tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch);
-  if (!tuple) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Tuple creation from version structure failed!");
-    }
-    return NULL;
-  }
-  return tuple;
-}
-/* Define converters for vector<int> */
-template<>
-bool _PyObjAs(PyObject *pyobj, int* dest) {
-  *dest = PyLong_AsLong(pyobj);
-  return true;
-}
-
-template<>
-PyObject *_PyObjFrom(const int& src) {
-  return PyLong_FromLong(src);
-}
-
-%}
-
-_LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
-
-%typemap(out) std::pair<string, string> {
-  PyObject *tuple = pair_helper(&$1);
-  if (!tuple) SWIG_fail;
-  $result = tuple;
-}
-
-%typemap(out) version_struct {
-  PyObject *tuple = version_helper(&$1);
-  if (!tuple) SWIG_fail;
-  $result = tuple;
-}
-
-%{
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/util/stat_summarizer.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
-%}
-
-%ignoreall
-%unignore tensorflow;
-%unignore calib_convert;
-%unignore get_linked_tensorrt_version;
-%unignore get_loaded_tensorrt_version;
-%unignore is_tensorrt_enabled;
-%unignore enable_test_value;
-%unignore clear_test_values;
-%unignore add_test_value;
-%unignore get_test_value;
-
-%{
-
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op
-    // unfortunately we can't use TF_Status here since it
-    // is in c/c_api and brings in a lot of other libraries
-    // which in turn declare ops. These ops are included
-    // statically in our library and cause an abort when
-    // module is loaded due to double registration
-    // until Tensorflow properly exposes these headers
-    // we have to work around this by returning a string
-    // and converting it to exception on python side.
-    //,TF_Status* out_status) {
-) {
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  string out_status;
-
-  tensorflow::GraphDef graph_def;
-  if (!graph_def.ParseFromString(graph_def_string)) {
-    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  graph_def_string.resize(0);
-  tensorflow::GraphDef out_graph;
-  tensorflow::Status conversion_status =
-      tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(
-          graph_def, &out_graph, is_dyn_op);
-  if (!conversion_status.ok()) {
-    auto retCode = (int)conversion_status.code();
-    char buff[2000];
-    snprintf(buff, 2000, "%d;%s", retCode,
-             conversion_status.error_message().c_str());
-    out_status = buff;
-    return std::pair<string, string>{out_status, ""};
-  }
-  string result;
-  if (!out_graph.SerializeToString(&result)) {
-    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  out_status = "OK;All good!";
-  return std::pair<string, string>{out_status, result};
-#else
-  // Returns FAILED_PRECONDITION.
-  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-}
-
-version_struct get_linked_tensorrt_version() {
-  // Return the version at the link time.
-  version_struct s;
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  const auto &lv = tensorflow::tensorrt::convert::GetLinkedTensorRTVersion();
-  s.vmajor = lv[0];
-  s.vminor = lv[1];
-  s.vpatch = lv[2];
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-  return s;
-}
-
-version_struct get_loaded_tensorrt_version() {
-  // Return the version from the loaded library.
-  version_struct s;
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  const auto &lv = tensorflow::tensorrt::convert::GetLoadedTensorRTVersion();
-  s.vmajor = lv[0];
-  s.vminor = lv[1];
-  s.vpatch = lv[2];
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-  return s;
-}
-
-bool is_tensorrt_enabled() {
-  return tensorflow::tensorrt::IsGoogleTensorRTEnabled();
-}
-
-void enable_test_value() {
-  tensorflow::tensorrt::test::EnableTestValue();
-}
-
-#if PY_MAJOR_VERSION < 3
-#define TRT_PY_TO_CPP_STRING PyString_AsString
-#define TRT_CPP_TO_PY_STRING PyString_FromString
-#else
-#define TRT_PY_TO_CPP_STRING PyUnicode_AsUTF8
-#define TRT_CPP_TO_PY_STRING PyUnicode_FromString
-#endif
-
-void clear_test_values(PyObject* pattern) {
-  tensorflow::tensorrt::test::ClearTestValues(
-      string(TRT_PY_TO_CPP_STRING(pattern)));
-}
-
-void add_test_value(PyObject* label, PyObject* value) {
-  tensorflow::tensorrt::test::AddTestValue(
-      string(TRT_PY_TO_CPP_STRING(label)), string(TRT_PY_TO_CPP_STRING(value)));
-}
-
-PyObject* get_test_value(PyObject* label) {
-  string value = tensorflow::tensorrt::test::GetTestValue(
-      string(TRT_PY_TO_CPP_STRING(label)));
-  return TRT_CPP_TO_PY_STRING(value.c_str());
-}
-
-%}
-
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op);
-version_struct get_linked_tensorrt_version();
-version_struct get_loaded_tensorrt_version();
-bool is_tensorrt_enabled();
-void enable_test_value();
-void clear_test_values(PyObject* pattern);
-void add_test_value(PyObject* label, PyObject* value);
-PyObject* get_test_value(PyObject* label);
-
-%unignoreall
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
index f2b8786a527289fe20de86447355fbf552cd265e..18933227b3431fc56b91c6ab7376c975d3aa69a7 100644
--- a/tensorflow/contrib/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -23,10 +23,10 @@ py_library(
     name = "timeseries_pip",
     deps = [
         ":timeseries",
-        "//tensorflow/contrib/timeseries/examples:known_anomaly",
-        "//tensorflow/contrib/timeseries/examples:lstm",
-        "//tensorflow/contrib/timeseries/examples:multivariate",
-        "//tensorflow/contrib/timeseries/examples:predict",
+        "//tensorflow/contrib/timeseries/examples:known_anomaly_main_lib",
+        "//tensorflow/contrib/timeseries/examples:lstm_main_lib",
+        "//tensorflow/contrib/timeseries/examples:multivariate_main_lib",
+        "//tensorflow/contrib/timeseries/examples:predict_main_lib",
         "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",
     ],
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index e10be88ece8ebba9635af955b3c3410f29e5503c..70c3a0720eed1971a90e0498d12f876abe4906d5 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -17,6 +17,14 @@ config_setting(
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [":predict_main_lib"],
+)
+
+py_library(
+    name = "predict_main_lib",
+    srcs = ["predict.py"],
     data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -40,7 +48,7 @@ py_test(
         "notsan",  # b/67513579
     ],
     deps = [
-        ":predict",
+        ":predict_main_lib",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -48,6 +56,14 @@ py_test(
 py_binary(
     name = "known_anomaly",
     srcs = ["known_anomaly.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [":known_anomaly_main_lib"],
+)
+
+py_library(
+    name = "known_anomaly_main_lib",
+    srcs = ["known_anomaly.py"],
     data = ["data/changepoints.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -66,7 +82,7 @@ py_test(
     srcs = ["known_anomaly_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":known_anomaly",
+        ":known_anomaly_main_lib",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -74,6 +90,14 @@ py_test(
 py_binary(
     name = "multivariate",
     srcs = ["multivariate.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [":multivariate_main_lib"],
+)
+
+py_library(
+    name = "multivariate_main_lib",
+    srcs = ["multivariate.py"],
     data = ["data/multivariate_level.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -94,7 +118,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":multivariate",
+        ":multivariate_main_lib",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -102,6 +126,15 @@ py_test(
 py_binary(
     name = "lstm",
     srcs = ["lstm.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    visibility = ["//visibility:public"],
+    deps = [":lstm_main_lib"],
+)
+
+py_library(
+    name = "lstm_main_lib",
+    srcs = ["lstm.py"],
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -125,7 +158,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
-        ":lstm",
+        ":lstm_main_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/estimator:estimator_py",
     ],
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 2a22295197dc225cefbedf2736adeea5491a9fc2..449ec8b0a83d0bd6247970302630ad4e0a902a40 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -155,13 +155,16 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "head_test.py",
     ],
-    shard_count = 4,
+    shard_count = 10,
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],  # b/63391119
+    tags = [
+        "no_pip_gpu",  # b/63391119
+        "notap",  # b/124520733
+    ],
     deps = [
         ":estimators",
         ":feature_keys",
@@ -169,7 +172,7 @@ py_test(
         ":input_pipeline",
         ":model",
         ":state_management",
-        "//tensorflow/contrib/timeseries/examples:lstm",
+        "//tensorflow/contrib/timeseries/examples:lstm_main_lib",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index a8d5e1a49dd4313f58f2f515bc3f292ecce5cbd4..3626701d24163ef52564b42d8a630bd9c5a788eb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -465,7 +465,8 @@ class ARModel(model.TimeSeriesModel):
           math_utils.normal_log_prob(targets, sigma, prediction))
     else:
       assert self.loss == ARModel.SQUARED_LOSS, self.loss
-      loss_op = math_ops.reduce_sum(math_ops.square(prediction - targets))
+      loss_op = math_ops.reduce_sum(
+          math_ops.squared_difference(prediction, targets))
     loss_op /= math_ops.cast(
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index d898f05405707672ac4d6b1c11bb5931dfe475e1..ee1cd3213efb0fff3a99536bdf1abd93c0c32a6e 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -18,22 +18,18 @@ package(
         "//learning/brain:__subpackages__",
         "//learning/deepmind:__subpackages__",
         "//medical/pathology:__subpackages__",
+        "//smartass/brain:__subpackages__",
         "//tensorflow:__subpackages__",
         "//vr/perception:__subpackages__",
     ],
 )
 
-cc_library(
-    name = "all_ops",
+py_library(
+    name = "tpu_py",
+    srcs = ["python/ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
+        "//tensorflow/python/tpu:tpu_py",
     ],
 )
 
@@ -42,25 +38,14 @@ py_library(
     srcs = ["python/tpu/async_checkpoint.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/tpu:async_checkpoint",
     ],
 )
 
 py_library(
     name = "tpu_estimator",
     srcs = [
+        "python/tpu/_tpu_estimator_embedding.py",
         "python/tpu/error_handling.py",
         "python/tpu/tpu_config.py",
         "python/tpu/tpu_context.py",
@@ -70,136 +55,24 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":async_checkpoint",
+        ":feature_column",
         ":functional",
+        ":tpu_embedding",
         ":tpu_lib",
-        ":tpu_ordinal_selector_py",
         "//tensorflow/contrib/training:training_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:function",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:session",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:util",
-        "@six_archive//:six",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "cross_replica_ops",
-        "heartbeat_ops",
-        "host_compute_ops",
-        "infeed_ops",
-        "outfeed_ops",
-        "replication_ops",
-        "tpu_configuration_ops",
-        "tpu_embedding_ops",
-        "tpu_ordinal_selector_op",
-        "functional_ops",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_custom_op_library(
-    name = "python/ops/_tpu_ops.so",
-    srcs = [
-        "ops/cross_replica_ops.cc",
-        "ops/heartbeat_ops.cc",
-        "ops/host_compute_ops.cc",
-        "ops/infeed_ops.cc",
-        "ops/outfeed_ops.cc",
-        "ops/replication_ops.cc",
-        "ops/tpu_configuration_ops.cc",
-        "ops/tpu_embedding_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/python/tpu:tpu_estimator",
     ],
 )
 
-tf_gen_op_wrapper_py(
-    name = "tpu_ops",
-    hidden = [
-        "SendTPUEmbeddingGradients",
-        "EnqueueTPUEmbeddingIntegerBatch",
-        "EnqueueTPUEmbeddingSparseBatch",
-        "EnqueueTPUEmbeddingSparseTensorBatch",
-    ],
-    deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
-    ],
-)
-
-tf_custom_op_library(
-    name = "python/ops/_tpu_ordinal_selector.so",
-    srcs = ["ops/tpu_ordinal_selector_op.cc"],
-)
-
-tf_custom_op_py_library(
-    name = "tpu_ordinal_selector_py",
-    srcs = ["ops/gen_tpu_ordinal_selector_op.py"],
-    dso = [":python/ops/_tpu_ordinal_selector.so"],
-    kernels = [
-        ":tpu_ordinal_selector_op_op_lib",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":tpu_ordinal_selector_op",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "tpu_ordinal_selector_op",
-    deps = [
-        ":tpu_ordinal_selector_op_op_lib",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_functional_ops",
-    out = "python/tpu/gen_functional_ops.py",
-    hidden = [
-        "TPUPartitionedCall",
-    ],
-    deps = [":functional_ops_op_lib"],
-)
-
 py_library(
     name = "functional",
     srcs = ["python/tpu/functional.py"],
+    srcs_version = "PY2AND3",
     visibility = [
         "//visibility:public",
     ],
     deps = [
-        ":gen_functional_ops",
+        "//tensorflow/python/tpu:functional",
     ],
 )
 
@@ -208,30 +81,7 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
-        "//tensorflow/python:util",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "tpu_py",
-    srcs = glob(["python/ops/*.py"]),
-    dso = [":python/ops/_tpu_ops.so"],
-    kernels = [
-        ":all_ops",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":profiler",
-        ":tpu_ops",
-        "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
+        "//tensorflow/python/tpu/profiler",
     ],
 )
 
@@ -248,6 +98,7 @@ py_library(
         ":tpu_embedding",
         ":tpu_estimator",
         ":tpu_lib",
+        "//tensorflow/python/tpu",
     ],
 )
 
@@ -270,8 +121,8 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/distribute",
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -311,29 +162,12 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
+        ":functional",
         ":profiler",
         ":tpu_py",
-        "//tensorflow/compiler/xla/experimental/xla_sharding",
-        "//tensorflow/compiler/xla/python_api:xla_shape",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
-        "//tensorflow/contrib/tpu/proto:topology_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:control_flow_util",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu:tpu_lib",
     ],
 )
 
@@ -344,125 +178,20 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "datasets_test",
-    size = "medium",
-    srcs = ["python/tpu/datasets_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":datasets",
-    ],
-    grpc_enabled = True,
-    shard_count = 4,
-    tags = ["no_oss"],
-)
-
-tf_py_test(
-    name = "tpu_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:layers",
-    ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
-)
-
-tf_py_test(
-    name = "tpu_sharding_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_sharding_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "bfloat16_test",
-    size = "small",
-    srcs = ["python/tpu/bfloat16_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_infeed_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_infeed_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_config_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_config_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_estimator_signals_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_estimator_signals_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "topology_test",
-    size = "medium",
-    srcs = ["python/tpu/topology_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/tpu:datasets",
     ],
 )
 
 py_library(
     name = "tpu_embedding",
-    srcs = ["python/tpu/tpu_embedding.py"],
+    srcs = [
+        "python/tpu/tpu_embedding.py",
+        "python/tpu/tpu_embedding_gradient.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":tpu_lib",
-        ":tpu_ops",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "@six_archive//:six",
+        "//tensorflow/python/tpu:tpu_embedding",
     ],
 )
 
@@ -471,31 +200,6 @@ py_library(
     srcs = ["python/tpu/feature_column.py"],
     deps = [
         ":tpu_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-    ],
-)
-
-tf_py_test(
-    name = "feature_column_test",
-    srcs = [
-        "python/tpu/feature_column_test.py",
-    ],
-    additional_deps = [
-        ":feature_column",
-        "//third_party/py/numpy",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/tpu:feature_column",
     ],
-    main = "python/tpu/feature_column_test.py",
 )
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 541fbf33a302a4d850422885fdbbc438bd6b9b7b..e2ce77e118182bb07193cbac82e176d3b2057e17 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -2,35 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
-
-tf_proto_library(
-    name = "tpu_profiler_proto",
-    srcs = ["tpu_profiler.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":op_profile_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "dump_tpu_profile",
-    srcs = ["dump_tpu_profile.cc"],
-    hdrs = ["dump_tpu_profile.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":op_profile_proto_cc",
-        ":tpu_profiler_proto_cc",
-        ":trace_events_proto_cc",
-        ":trace_events_to_json",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
 
 cc_library(
     name = "version",
@@ -43,71 +14,13 @@ tf_cc_binary(
     srcs = [
         "capture_tpu_profile.cc",
     ],
+    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
-        ":dump_tpu_profile",
-        ":tpu_profiler_analysis_proto_cc",
-        ":tpu_profiler_proto_cc",
         ":version",
-        "//tensorflow:grpc++",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/platform/cloud:gcs_file_system",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
     ],
 )
-
-tf_proto_library(
-    name = "trace_events_proto",
-    srcs = ["trace_events.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "trace_events_to_json",
-    srcs = ["trace_events_to_json.cc"],
-    hdrs = ["trace_events_to_json.h"],
-    deps = [
-        ":trace_events_proto_cc",
-        "//tensorflow/core:lib",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_cc_test(
-    name = "trace_events_to_json_test",
-    srcs = ["trace_events_to_json_test.cc"],
-    deps = [
-        ":trace_events_to_json",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_proto_library(
-    name = "op_profile_proto",
-    srcs = ["op_profile.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library(
-    name = "tpu_profiler_analysis_proto",
-    srcs = ["tpu_profiler_analysis.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":tpu_profiler_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "tpu_profiler_analysis_pb2_grpc",
-    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [":tpu_profiler_analysis_proto_py"],
-)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 1c5ea2d997a58ca57ddc212ffd56aad525e961da..32858850cdb27c985ee16946fcc5d2146644ef64 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -18,235 +18,11 @@ limitations under the License.
 // Initiates a TPU profiling on the TPUProfiler service at service_addr,
 // receives and dumps the profile data to a tensorboard log directory.
 
-#include "grpcpp/grpcpp.h"
-
-#include <cstdio>
-#include <ctime>
-#include <vector>
-
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
-namespace tensorflow {
-namespace tpu {
-namespace {
-
-using ::tensorflow::TPUProfileAnalysis;
-using ::tensorflow::TPUProfiler;
-
-constexpr uint64 kMaxEvents = 1000000;
-
-string GetCurrentTimeStampAsString() {
-  char s[128];
-  std::time_t t = std::time(nullptr);
-  CHECK_NE(std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t)), 0);
-  return s;
-}
-
-Status ValidateHostPortPair(const string& host_port) {
-  uint32 port;
-  std::vector<string> parts = str_util::Split(host_port, ':');
-  // Must be host:port, port must be a number, host must not contain a '/',
-  // host also must not be empty.
-  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
-      parts[0].find("/") != string::npos || parts[0].empty()) {
-    return errors::InvalidArgument("Could not interpret \"", host_port,
-                                   "\" as a host-port pair.");
-  }
-  return Status::OK();
-}
-
-ProfileRequest PopulateProfileRequest(int duration_ms,
-                                      const string& repository_root,
-                                      const string& session_id,
-                                      const ProfileOptions& opts) {
-  ProfileRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_max_events(kMaxEvents);
-  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
-    // For backward compatibilities, only generate tracetable etc when the
-    // user provide a GCS path for model directory.
-    request.set_repository_root(repository_root);
-    request.set_session_id(session_id);
-  }
-  request.add_tools("op_profile");
-  request.add_tools("input_pipeline");
-  request.add_tools("memory_viewer");
-  request.add_tools("overview_page");
-  *request.mutable_opts() = opts;
-  return request;
-}
-
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
-             const string& repository_root, const string& session_id,
-             const ProfileOptions& opts) {
-  ProfileRequest request =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                      std::numeric_limits<int32>::max());
-  std::unique_ptr<TPUProfiler::Stub> stub =
-      TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  ProfileResponse response;
-  TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-
-  if (!response.encoded_trace().empty()) {
-    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-        logdir, session_id, "", response, &std::cout));
-    // Print this at the end so that it's not buried in irrelevant LOG messages.
-    std::cout
-        << "NOTE: using the trace duration " << duration_ms << "ms."
-        << std::endl
-        << "Set an appropriate duration (with --duration_ms) if you "
-           "don't see a full step in your trace or the captured trace is too "
-           "large."
-        << std::endl;
-  }
-
-  return response.encoded_trace().empty();
-}
-
-// Start a new profiling session that include all the hosts included in
-// hostnames, for the time interval of duration_ms. Possibly save the profiling
-// result in the directory specified by repository_root and session_id.
-bool NewSession(const string& service_addr,
-                const std::vector<tensorflow::string>& hostnames,
-                int duration_ms, const string& repository_root,
-                const string& session_id, const ProfileOptions& opts) {
-  NewProfileSessionRequest new_session_request;
-  *new_session_request.mutable_request() =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-  new_session_request.set_repository_root(repository_root);
-  new_session_request.set_session_id(session_id);
-  for (const auto& hostname : hostnames) {
-    new_session_request.add_hosts(hostname);
-  }
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
-  // TODO(jiesun): GRPC support following relevant naming scheme:
-  // 1. dns:///host:port
-  // 2. ipv4:host:port or ipv6:[host]:port
-  // We might need to change the prefix which depends on what TPU name resolver
-  // will give us.
-  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
-      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  NewProfileSessionResponse new_session_response;
-  TF_QCHECK_OK(FromGrpcStatus(
-      stub->NewSession(&context, new_session_request, &new_session_response)));
-
-  std::cout << "Profile session succeed for host(s):"
-            << str_util::Join(hostnames, ",") << std::endl;
-  return new_session_response.empty_trace();
-}
-
-// Starts tracing on a single or multiple TPU hosts and saves the result in the
-// given logdir. If no trace was collected, retries tracing for
-// num_tracing_attempts.
-void StartTracing(const tensorflow::string& service_addr,
-                  const tensorflow::string& logdir,
-                  const tensorflow::string& workers_list,
-                  bool include_dataset_ops, int duration_ms,
-                  int num_tracing_attempts) {
-  // Use the current timestamp as the run name.
-  tensorflow::string session_id = GetCurrentTimeStampAsString();
-  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
-  tensorflow::string repository_root =
-      io::JoinPath(logdir, kProfilePluginDirectory);
-  std::vector<tensorflow::string> hostnames =
-      tensorflow::str_util::Split(workers_list, ",");
-
-  bool empty_trace = false;
-  int remaining_attempts = num_tracing_attempts;
-  tensorflow::ProfileOptions opts;
-  opts.set_include_dataset_ops(include_dataset_ops);
-  while (true) {
-    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
-              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    if (hostnames.empty()) {
-      empty_trace = tensorflow::tpu::Profile(service_addr, logdir, duration_ms,
-                                             repository_root, session_id, opts);
-    } else {
-      tensorflow::string tpu_master = service_addr;
-      empty_trace =
-          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
-                                      repository_root, session_id, opts);
-    }
-    if (remaining_attempts <= 0 || !empty_trace) break;
-    std::cout << "No trace event is collected. Automatically retrying."
-              << std::endl
-              << std::endl;
-  }
-
-  if (empty_trace) {
-    std::cout << "No trace event is collected after " << num_tracing_attempts
-              << " attempt(s). "
-              << "Perhaps, you want to try again (with more attempts?)."
-              << std::endl
-              << "Tip: increase number of attempts with --num_tracing_attempts."
-              << std::endl;
-  }
-}
-
-MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
-  MonitorRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_monitoring_level(monitoring_level);
-  return request;
-}
-
-// Repeatedly collects profiles and shows user-friendly metrics for
-// 'num_queries' time(s).
-void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
-                     int monitoring_level, int num_queries) {
-  for (int query = 0; query < num_queries; ++query) {
-    MonitorRequest request =
-        PopulateMonitorRequest(duration_ms, monitoring_level);
-
-    ::grpc::ClientContext context;
-    ::grpc::ChannelArguments channel_args;
-    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                        std::numeric_limits<int32>::max());
-    std::unique_ptr<TPUProfiler::Stub> stub =
-        TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-            channel_args));
-    MonitorResponse response;
-    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
-
-    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
-              << "):\n\n"
-              << response.data() << std::flush;
-  }
-}
-
-}  // namespace
-}  // namespace tpu
-}  // namespace tensorflow
-
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
@@ -300,8 +76,9 @@ int main(int argc, char** argv) {
     std::cout << usage.c_str() << std::endl;
     return 2;
   }
-  tensorflow::Status status =
-      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  tensorflow::Status status;
+  status =
+      tensorflow::profiler::client::ValidateHostPortPair(FLAGS_service_addr);
   if (!status.ok()) {
     std::cout << status.error_message() << std::endl;
     std::cout << usage.c_str() << std::endl;
@@ -324,12 +101,17 @@ int main(int argc, char** argv) {
               << FLAGS_service_addr << " for " << duration_ms
               << "ms and show metrics for " << num_queries << " time(s)."
               << std::endl;
-    tensorflow::tpu::StartMonitoring(FLAGS_service_addr, duration_ms,
-                                     FLAGS_monitoring_level, num_queries);
+    tensorflow::profiler::client::StartMonitoring(
+        FLAGS_service_addr, duration_ms, FLAGS_monitoring_level, num_queries);
   } else {
-    tensorflow::tpu::StartTracing(FLAGS_service_addr, FLAGS_logdir,
-                                  FLAGS_workers_list, FLAGS_include_dataset_ops,
-                                  duration_ms, num_tracing_attempts);
+    status = tensorflow::profiler::client::StartTracing(
+        FLAGS_service_addr, FLAGS_logdir, FLAGS_workers_list,
+        FLAGS_include_dataset_ops, duration_ms, num_tracing_attempts);
+    if (!status.ok() && status.code() != tensorflow::error::Code::UNAVAILABLE) {
+      std::cout << status.error_message() << std::endl;
+      std::cout << usage.c_str() << std::endl;
+      return 2;
+    }
   }
   return 0;
 }
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 6a6eba282a12d68cc3cd4e46a46a1b4190fb737b..8605bae5c128513186d8c03835dcf49d3e4b6fd9 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -1,389 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Operations for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tpu.ops import gen_tpu_ops
-  from tensorflow.contrib.tpu.ops.gen_tpu_ops import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _tpu_ops = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_tpu_ops.so"))
-
-  def _create_default_group_assignment():
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "cross_replica_sum should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-    group_assignment = [list(range(num_shards))]
-    return group_assignment
-
-  def all_to_all(x,
-                 concat_dimension,
-                 split_dimension,
-                 split_count,
-                 group_assignment=None,
-                 name=None):
-    """Exchange data across TPU replicas.
-
-    Args:
-      x: The local tensor.
-      concat_dimension: The dimension number to concatenate.
-      split_dimension: The dimension number to split.
-      split_count: The number of splits, this number must equal to the sub-group
-        size(group_assignment.get_shape()[1])
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is concatenated by data from different replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-    return gen_tpu_ops.all_to_all(
-        x,
-        group_assignment,
-        concat_dimension=concat_dimension,
-        split_dimension=split_dimension,
-        split_count=split_count,
-        name=name)
-
-  @ops.RegisterGradient("AllToAll")
-  def _all_to_all_grad(op, grad):
-    # The gradient of a all-to-all is also a all-to-all but the
-    # split_dimension and concat_dimension is swapped.
-    # The graident with respect to group_assignment is None.
-    return [
-        gen_tpu_ops.all_to_all(
-            grad,
-            op.inputs[1],
-            concat_dimension=op.get_attr("split_dimension"),
-            split_dimension=op.get_attr("concat_dimension"),
-            split_count=op.get_attr("split_count")), None
-    ]
-
-  def cross_replica_sum(x, group_assignment=None, name=None):
-    """Sum the input tensor across replicas according to group_assignment.
-
-    Args:
-      x: The local tensor to the sum.
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is summed across replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-
-    return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
-
-  def collective_permute(x, source_target_pairs, name=None):
-    """Permute the input tensor across replicas given source_target_pairs.
-
-    For each source_target_pair <a, b>, we send replica a's input to replica b.
-    Each replica id must only appear once in the source column. Also it must
-    only appear once in the target column.
-    For the replica id not in the target column, this op returns a zero tensor
-    with the same shape and dtype of the input x.
-
-    For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-    source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
-    `[0, A, B, C]`.
-
-    Args:
-      x: The local tensor to be permuted.
-      source_target_pairs: 2d int lists with shape [num_pairs, 2].
-        source_target_pairs[i][0] represents the source replica id and
-        source_target_pairs[i][1] represents the target replica id.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is permuted.
-    """
-    return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
-
-  @ops.RegisterGradient("CollectivePermute")
-  def _collective_permute_grad(op, grad):
-    # The gradient of a collective permute operation is also a collective
-    # permute, but with source/target pairs reversed. The gradient with respect
-    # to input argument `source_target_pairs` is `None`.
-    source_target_pairs = op.inputs[1][:, ::-1]
-    return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
-
-  @ops.RegisterGradient("CrossReplicaSum")
-  def _cross_replica_sum_grad(op, grad):
-    # The gradient of a cross replica sum is also a cross-replica sum.
-    # The gradient with respect to group_assignment is None.
-    return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
-
-  # This extra type checking exists to give a more helpful error message in
-  # the common case that uint8 and int64 values are infed. Remove when both
-  # types are supported.
-
-  _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
-      dtypes.complex64
-  ])
-
-  def infeed_dequeue(dtype, shape, name=None):
-    """A placeholder op for a value that will be fed into the computation.
-
-    Args:
-      dtype: A `tf.DType`. The type of elements in the tensor.
-      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
-      name: A name for the operation (optional).
-
-    Returns:
-      A `Tensor` of type `dtype`.
-      A tensor that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If 'dtype` is not a supported infeed type.
-    """
-    if dtype not in _SUPPORTED_INFEED_DTYPES:
-      raise TypeError(
-          "{} is not a supported TPU infeed type. Supported types are: "
-          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-
-    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
-
-  # pylint: disable=redefined-outer-name
-  def infeed_dequeue_tuple(dtypes, shapes, name=None):
-    """A placeholder op for values fed into the TPU simultaneously as a tuple.
-
-    Args:
-      dtypes: A list of `tf.DType`s that has length `>= 1`.
-        The element types of each element in `outputs`.
-      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
-        The shapes of each tensor in `outputs`.
-      name: A name for the operation (optional).
-
-    Returns:
-      A list of `Tensor` objects of type `dtypes`.
-      A list of tensors that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If a type in 'dtypes` is not a supported infeed type.
-    """
-    for dtype in dtypes:
-      if dtype not in _SUPPORTED_INFEED_DTYPES:
-        raise TypeError(
-            "{} is not a supported TPU infeed type. Supported types are: "
-            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
-  # pylint: enable=redefined-outer-name
-
-  # pylint: disable=protected-access
-  def send_tpu_embedding_gradients(inputs,
-                                   config,
-                                   learning_rates=None,
-                                   name=None):
-    """A placeholder op for feeding per-sample gradients to the embedding layer.
-
-    Args:
-      inputs: A TensorList of gradients with which to update embedding tables.
-        Contains one tensor per embedding table in the model.
-      config: Serialized TPUEmbeddingConfiguration proto.
-      learning_rates: A TensorList of float32 scalars, one for each embedding
-        table, containing the learning rates for each table when dynamic
-        learning rate is enabled through the OptimizationParameters in
-        TPUEmbeddingConfiguration. When the learning rate is constant, the list
-        should be empty (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      A SendTPUEmbeddingGradients operation.
-    """
-    if learning_rates is None:
-      learning_rates = []
-    return gen_tpu_ops._send_tpu_embedding_gradients(
-        inputs=inputs, learning_rates=learning_rates, config=config, name=name)
-
-
-  send_tpu_embedding_gradients.__doc__ = (
-      gen_tpu_ops._send_tpu_embedding_gradients.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_integer_batch(batch,
-                                          device_ordinal,
-                                          mode_override=None,
-                                          name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      batch: A list of 1D tensors, one for each embedding table, containing the
-        indices into the tables.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingIntegerBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_integer_batch(
-        batch=batch,
-        device_ordinal=device_ordinal,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_integer_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_integer_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_batch(sample_indices,
-                                         embedding_indices,
-                                         aggregation_weights,
-                                         device_ordinal,
-                                         combiners=None,
-                                         mode_override=None,
-                                         name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        and feature to which the corresponding embedding_indices and
-        aggregation_weights values belong. sample_indices[i] must equal b * nf +
-        f, where nf is the number of features from the corresponding table, f is
-        in [0, nf), and b is in [0, batch size).
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables.
-      aggregation_weights: A list of rank 1 Tensors containing per sample --
-        i.e. per (training example, feature) -- aggregation weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
-                                                embedding_indices,
-                                                aggregation_weights,
-                                                table_ids,
-                                                device_ordinal,
-                                                combiners=None,
-                                                mode_override=None,
-                                                name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        to which the corresponding embedding_indices and aggregation_weights
-        values
-        belong. It corresponds to sp_ids.indices[:,0] in
-          embedding_lookup_sparse().
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
-      aggregation_weights: A list of rank 1 Tensors containing per training
-        example aggregation weights. It corresponds to sp_weights.values in
-        embedding_lookup_sparse().
-      table_ids: A list of integers specifying the identifier of the embedding
-        table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
-        lookup the corresponding input. The ith input is looked up using
-        table_ids[i]. The size of the table_ids list must be equal to that of
-        sample_indices, embedding_indices and aggregation_weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseTensorBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        table_ids=table_ids,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
-
-else:
-  # We have already built the appropriate libraries into the binary via CMake
-  # if we have built contrib, so we don't need this
-  pass
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ops import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/python/training/mode_keys_test.py b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
similarity index 63%
rename from tensorflow/python/training/mode_keys_test.py
rename to tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
index c4435b7d4870ac1675a3f2f4d80def111dc85ae5..788e1fe0568cf2f406c379e4d928100ea51a37a3 100644
--- a/tensorflow/python/training/mode_keys_test.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,18 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for `tf.train.ModeKeys."""
+"""Stub file to maintain backwards compatibility."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.platform import test
-from tensorflow.python.training import mode_keys
-
-
-class ModeKeysTest(test.TestCase):
-
-  def testKeyEquality(self):
-    self.assertEqual(mode_keys.ModeKeys.PREDICT, 'predict')
-    self.assertEqual(mode_keys.ModeKeys.TRAIN, 'train')
-    self.assertEqual(mode_keys.ModeKeys.TEST, 'test')
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ordinal_selector_op import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index 15ce6aceec299adacd7025f0021cf8b6f6ef765b..aeb061dbe114bc287946b50d08a86778c78c7b38 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -1,31 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Classes for TPU trace events."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
-from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
+from tensorflow.python.tpu.profiler import *
 # pylint: enable=wildcard-import,unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/python/tpu/__init__.py b/tensorflow/contrib/tpu/python/tpu/__init__.py
index 0dffd7064b19f353aed6afa3ad383564643a4a90..82d4f68c0221013706f70bcf54ae4c97cc7db1d3 100644
--- a/tensorflow/contrib/tpu/python/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/python/tpu/__init__.py
@@ -1,20 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Ops related to Tensor Processing Units."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..41aa4d267812cabe775459723df7e01efaa83c93
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index 1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf..5eb8034e47474873ccef0b6123f2becd0668738c 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -1,212 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Hook for asynchronous checkpointing.
-
-This hook dispatches checkpoint writing operations in a separate thread to
-allow execution to continue on the main thread.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import threading
-import time
-
-from tensorflow.core.util.event_pb2 import SessionLog
-from tensorflow.python.framework import meta_graph
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import training_util
-from tensorflow.python.training.session_run_hook import SessionRunArgs
-from tensorflow.python.training.summary_io import SummaryWriterCache
-
-
-class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
-  """Saves checkpoints every N steps or seconds."""
-
-  def __init__(self,
-               checkpoint_dir,
-               save_secs=None,
-               save_steps=None,
-               saver=None,
-               checkpoint_basename="model.ckpt",
-               scaffold=None,
-               listeners=None):
-    """Initializes a `CheckpointSaverHook`.
-
-    Args:
-      checkpoint_dir: `str`, base directory for the checkpoint files.
-      save_secs: `int`, save every N secs.
-      save_steps: `int`, save every N steps.
-      saver: `Saver` object, used for saving.
-      checkpoint_basename: `str`, base name for the checkpoint files.
-      scaffold: `Scaffold`, use to get saver object.
-      listeners: List of `CheckpointSaverListener` subclass instances. Used for
-        callbacks that run immediately before or after this hook saves the
-        checkpoint.
-
-    Raises:
-      ValueError: One of `save_steps` or `save_secs` should be set.
-      ValueError: At most one of `saver` or `scaffold` should be set.
-    """
-    logging.info("Create AsyncCheckpointSaverHook.")
-    if saver is not None and scaffold is not None:
-      raise ValueError("You cannot provide both saver and scaffold.")
-    self._saver = saver
-    self._save_thread = None
-    self._write_graph_thread = None
-    self._checkpoint_dir = checkpoint_dir
-    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
-    self._scaffold = scaffold
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_secs=save_secs, every_steps=save_steps)
-    self._listeners = listeners or []
-    self._steps_per_run = 1
-    self._summary_writer = None
-    self._global_step_tensor = None
-
-    self._last_checkpoint_step = None
-
-  def _set_steps_per_run(self, steps_per_run):
-    self._steps_per_run = steps_per_run
-
-  def begin(self):
-    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
-    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          "Global step should be created to use CheckpointSaverHook.")
-    for l in self._listeners:
-      l.begin()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-
-    # We do write graph and saver_def at the first call of before_run.
-    # We cannot do this in begin, since we let other hooks to change graph and
-    # add variables in begin. Graph is finalized after all begin calls.
-    def _write_graph_fn(self):
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir, "graph.pbtxt")
-    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
-                                                args=[self])
-    self._write_graph_thread.start()
-
-    saver_def = self._get_saver().saver_def if self._get_saver() else None
-    graph = ops.get_default_graph()
-    meta_graph_def = meta_graph.create_meta_graph_def(
-        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
-    self._summary_writer.add_graph(graph)
-    self._summary_writer.add_meta_graph(meta_graph_def)
-    # The checkpoint saved here is the state at step "global_step".
-    self._save(session, global_step)
-    self._timer.update_last_triggered_step(global_step)
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    global_step = run_context.session.run(self._global_step_tensor)
-    if self._timer.should_trigger_for_step(global_step):
-      self._timer.update_last_triggered_step(global_step)
-      logging.info("Triggering checkpoint. %s", global_step)
-      if self._save(run_context.session, global_step):
-        run_context.request_stop()
-
-  def end(self, session):
-    if self._save_thread:
-      logging.info("Waiting for any pending checkpoints to finish.")
-      self._save_thread.join()
-    if self._write_graph_thread:
-      logging.info("Waiting for any pending write_graph to finish.")
-      self._write_graph_thread.join()
-
-    last_step = session.run(self._global_step_tensor)
-
-    if self._last_checkpoint_step != last_step:
-      self._save(session, last_step, asynchronous=False)
-
-    for l in self._listeners:
-      l.end(session, last_step)
-
-  def _save(self, session, step, asynchronous=True):
-    """Saves the latest checkpoint, returns should_stop."""
-
-    # Skip saving on step 0
-    if step == 0:
-      return
-
-    def _save_fn():
-      """Run the saver process."""
-      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
-
-      start_time = time.time()
-      for l in self._listeners:
-        l.before_save(session, step)
-
-      self._get_saver().save(session, self._save_path, global_step=step)
-      self._summary_writer.add_session_log(
-          SessionLog(
-              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
-          step)
-
-      for l in self._listeners:
-        l.after_save(session, step)
-
-      end_time = time.time()
-      logging.info("Checkpoint actual writing time: (%.3f sec)",
-                   end_time - start_time)
-      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
-
-    if not asynchronous:
-      self._last_checkpoint_step = step
-      _save_fn()
-      return
-
-    if self._save_thread is not None:
-      self._save_thread.join(timeout=0.1)
-      if self._save_thread.is_alive():
-        logging.info("Saver thread still in progress, skipping checkpoint.")
-        return
-
-    self._last_checkpoint_step = step
-    self._save_thread = threading.Thread(target=_save_fn)
-    self._save_thread.start()
-
-  def _get_saver(self):
-    if self._saver is not None:
-      return self._saver
-    elif self._scaffold is not None:
-      return self._scaffold.saver
-
-    # Get saver from the SAVERS collection if present.
-    collection_key = ops.GraphKeys.SAVERS
-    savers = ops.get_collection(collection_key)
-    if not savers:
-      raise RuntimeError(
-          "No items in collection {}. Please add a saver to the collection "
-          "or provide a saver or scaffold.".format(collection_key))
-    elif len(savers) > 1:
-      raise RuntimeError(
-          "More than one item in collection {}. "
-          "Please indicate which one to use by passing it to the constructor."
-          .format(collection_key))
-
-    self._saver = savers[0]
-    return savers[0]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.async_checkpoint import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
index fa74f651aa63c72d14eb78c8af479263810e9b7d..f3d392a8daec2a80f974d90051324a02be002afd 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16.py
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -1,77 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper context for running models with bfloat16."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import tf_contextlib
-
-
-def _get_custom_getter():
-  """Returns a custom getter that this class's methods must be called under.
-
-  All methods of this class must be called under a variable scope that was
-  passed this custom getter. Example:
-
-  ```python
-  network = ConvNetBuilder(...)
-  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
-    network.conv(...)
-    # Call more methods of network here
-  ```
-
-  Currently, this custom getter only does anything if self.use_tf_layers is
-  True. In that case, it causes variables to be stored as dtype
-  self.variable_type, then casted to the requested dtype, instead of directly
-  storing the variable as the requested dtype.
-  """
-
-  def inner_custom_getter(getter, *args, **kwargs):
-    """Custom getter that forces variables to have type self.variable_type."""
-    cast_to_bfloat16 = False
-    requested_dtype = kwargs['dtype']
-    if requested_dtype == dtypes.bfloat16:
-      # Only change the variable dtype if doing so does not decrease variable
-      # precision.
-      kwargs['dtype'] = dtypes.float32
-      cast_to_bfloat16 = True
-    var = getter(*args, **kwargs)
-    # This if statement is needed to guard the cast, because batch norm
-    # assigns directly to the return value of this custom getter. The cast
-    # makes the return value not a variable so it cannot be assigned. Batch
-    # norm variables are always in fp32 so this if statement is never
-    # triggered for them.
-    if cast_to_bfloat16:
-      var = math_ops.cast(var, dtypes.bfloat16)
-    return var
-
-  return inner_custom_getter
-
-
-@tf_contextlib.contextmanager
-def bfloat16_scope():
-  """Scope class for bfloat16 variables so that the model uses custom getter.
-
-  This enables variables to be read as bfloat16 type when using get_variable.
-  """
-  with variable_scope.variable_scope(
-      '', custom_getter=_get_custom_getter()) as varscope:
-    yield varscope
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.bfloat16 import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index bc0cd41d210ac6f8de1b20ebf744ee1e1dd04137..c20aac7e36aa31c5a9d88ca6fe02a8703f9ed5a3 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -1,191 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of Cloud TPU helper functions for data loading."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import interleave_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import functional_ops
-
-
-def _TextLineDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-def _TFRecordDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-_FILETYPE_MAP = {
-    'tfrecord': _TFRecordDataset,
-    'textline': _TextLineDataset,
-    'text': _TextLineDataset,
-}
-
-
-def StreamingFilesDataset(files,
-                          filetype=None,
-                          file_reader_job=None,
-                          worker_job=None,
-                          num_epochs=None,
-                          filename_shuffle_buffer_size=None,
-                          num_parallel_reads=None,
-                          batch_transfer_size=None,
-                          sloppy=None):
-  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
-
-  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
-  files local to your GCE VM. In order to train using files stored on your local
-  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
-  helper to generate a dataset to feed your Cloud TPU with files from your GCE
-  VM.
-
-  The resulting dataset may return an OutOfRangeError if there are no files
-  found as a result of the fileglob expansion.
-
-  Note: StreamingFilesDataset assumes that the session is using a
-  TPUClusterResolver and has therefore a worker and a coordinator job. File
-  loading will be done on the coordinator job.
-
-  Args:
-    files: A string glob to match files, or a `tf.data.Dataset` generating file
-      names.
-    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
-      TensorFlow function that when given a filename returns a dataset.
-    file_reader_job: An optional string that corresponds to the job that should
-      perform the file reads.
-    worker_job: An optional string that corresponds to the job that should
-      process the tensors (i.e. your GPU or TPU worker).
-    num_epochs: The number of epochs through the training set that should be
-      generated. By default, it will repeat infinitely.
-    filename_shuffle_buffer_size: An optional integer whose value controls the
-      shuffling of the file names. If you would like to read from the files in
-      the same order, set to 0 or False.
-    num_parallel_reads: An optional integer controlling the number of files to
-      read from concurrently. (Set to 1 for no parallelism.)
-    batch_transfer_size: An optional integer controlling the batching used to
-      amortize the remote function invocation overhead. Set to a very large
-      number to increase throughput. Set to a very small number to reduce memory
-      consumption. Set to False to skip batching.
-    sloppy: (Optional.) If `False`, read input data while maintaining a
-      deterministic order. (This may have significant performance impacts.)
-      sloppy defaults to: True.
-  Returns:
-    A `tf.data.Dataset` with an infinite stream of elements generated by a
-    parallel interleaving of the set of files matched (or generated) by `files`
-    with a type is the output of the dataset specified by `filetype`.
-
-  Raises:
-    ValueError: if any argument is not of the expected type.
-  """
-  if filetype is None:
-    filetype = 'tfrecord'
-
-  if isinstance(filetype, str):
-    if filetype not in _FILETYPE_MAP:
-      raise ValueError('Unexpected filetype: %s' % filetype)
-    reader_fn = _FILETYPE_MAP[filetype]
-  elif callable(filetype):
-    reader_fn = filetype
-  else:
-    raise ValueError('filetype should be a string or a callable')
-
-  file_reader_job = file_reader_job or 'coordinator'
-
-  worker_job = worker_job or 'worker'
-
-  if filename_shuffle_buffer_size is None:
-    filename_shuffle_buffer_size = 4096
-
-  num_parallel_reads = num_parallel_reads or 8
-
-  if batch_transfer_size is None:
-    batch_transfer_size = 256
-
-  if sloppy is None:
-    sloppy = True
-
-  with ops.device('/job:%s' % file_reader_job):
-    if isinstance(files, str):
-      source_dataset = dataset_ops.Dataset.list_files(files)
-    elif isinstance(files, dataset_ops.DatasetV2):
-      source_dataset = files
-    else:
-      raise ValueError('files was not a string or a dataset: %s' % files)
-
-    if filename_shuffle_buffer_size:
-      source_dataset = source_dataset.shuffle(
-          buffer_size=filename_shuffle_buffer_size)
-
-    source_dataset = source_dataset.apply(
-        interleave_ops.parallel_interleave(
-            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
-
-    source_dataset = source_dataset.repeat(num_epochs)
-
-    if batch_transfer_size:
-      source_dataset = source_dataset.batch(batch_transfer_size)
-
-    source_dataset = source_dataset.prefetch(1)
-
-    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
-    source_handle = source_iterator.string_handle()
-
-  @function.Defun(dtypes.string)
-  def LoadingFunc(h):
-    remote_iterator = iterator_ops.Iterator.from_string_handle(
-        h, source_dataset.output_types, source_dataset.output_shapes)
-    return remote_iterator.get_next()
-
-  def MapFn(unused_input):
-    if isinstance(source_dataset.output_types, dtypes.DType):
-      output_types = [source_dataset.output_types]
-    elif isinstance(source_dataset.output_types, (list, tuple)):
-      output_types = source_dataset.output_types
-    else:
-      raise ValueError('source dataset has invalid output types')
-    remote_calls = functional_ops.remote_call(
-        args=[source_handle],
-        Tout=output_types,
-        f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
-    if len(remote_calls) == 1:
-      return remote_calls[0]
-    else:
-      return remote_calls
-
-  with ops.device('/job:%s' % worker_job):
-    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
-        MapFn, num_parallel_calls=4 if sloppy else None)
-    output_dataset = output_dataset.prefetch(1)
-
-    if batch_transfer_size:
-      # Undo the batching used during the transfer.
-      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
-
-  return output_dataset
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.datasets import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index 3313dc749c2c7606101b2dc96614df2d052dfed1..05dffef3a1efdae2ad7306ca5ad3bc7a9eac04cf 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -1,313 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.python.tpu.topology import Topology
-
-
-SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
-
-
-def _compute_task_and_cores_to_replicas(core_assignment, topology):
-  """Computes a nested dict which maps task and logical core to replicas."""
-  task_and_cores_to_replicas = {}
-  for replica in xrange(core_assignment.shape[0]):
-    for logical_core in xrange(core_assignment.shape[1]):
-      coordinates = core_assignment[replica, logical_core, :]
-      task_id = topology.task_ordinal_at_coordinates(coordinates)
-      if task_id not in task_and_cores_to_replicas:
-        task_and_cores_to_replicas[task_id] = {}
-      if logical_core not in task_and_cores_to_replicas[task_id]:
-        task_and_cores_to_replicas[task_id][logical_core] = set()
-
-      task_and_cores_to_replicas[task_id][logical_core].add(replica)
-
-  task_to_sorted_replica_id = {}
-
-  for task, core_to_replicas in task_and_cores_to_replicas.items():
-    core_to_sorted_replicas = {}
-    for core, replicas in core_to_replicas.items():
-      core_to_sorted_replicas[core] = sorted(replicas)
-
-    task_to_sorted_replica_id[task] = core_to_sorted_replicas
-  return task_to_sorted_replica_id
-
-
-class DeviceAssignment(object):
-  """Mapping from logical cores in a computation to the physical TPU topology.
-
-  Prefer to use the `device_assignment()` helper to construct a
-  `DeviceAssignment`; it is easier if less flexible than constructing a
-  `DeviceAssignment` directly.
-  """
-
-  def __init__(self, topology, core_assignment):
-    """Constructs a `DeviceAssignment` object.
-
-    Args:
-      topology: A `Topology` object that describes the physical TPU topology.
-      core_assignment: A logical to physical core mapping, represented as a
-        rank 3 numpy array. See the description of the `core_assignment`
-        property for more details.
-
-    Raises:
-      ValueError: If `topology` is not `Topology` object.
-      ValueError: If `core_assignment` is not a rank 3 numpy array.
-    """
-    if not isinstance(topology, Topology):
-      raise ValueError("topology must be a Topology object, got {}".format(
-          type(topology)))
-    core_assignment = np.asarray(core_assignment, dtype=np.int32)
-
-    self._topology = topology
-
-    if core_assignment.ndim != 3:
-      raise ValueError("core_assignment must be a rank 3 numpy array, "
-                       "got shape {}".format(core_assignment.shape))
-
-    self._num_replicas = core_assignment.shape[0]
-    self._num_cores_per_replica = core_assignment.shape[1]
-
-    if core_assignment.shape[-1] != topology.mesh_rank:
-      raise ValueError(
-          "minor dimension of core_assignment must have size equal to topology "
-          "rank ({}), got shape {}".format(topology.mesh_rank,
-                                           core_assignment.shape))
-
-    self._core_assignment = core_assignment
-    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
-        self._core_assignment, topology)
-
-  @property
-  def topology(self):
-    """A `Topology` that describes the TPU topology."""
-    return self._topology
-
-  @property
-  def num_cores_per_replica(self):
-    """The number of cores per replica."""
-    return self._num_cores_per_replica
-
-  @property
-  def num_replicas(self):
-    """The number of replicas of the computation."""
-    return self._num_replicas
-
-  @property
-  def core_assignment(self):
-    """The logical to physical core mapping.
-
-    Returns:
-      An integer numpy array of rank 3, with shape
-      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
-      (replica, logical core) pairs to physical topology coordinates.
-    """
-    return self._core_assignment
-
-  def _coordinates(self, replica, logical_core):
-    """Returns the physical topology coordinates of a logical core."""
-    return tuple(self.core_assignment[replica, logical_core, :])
-
-  def lookup_replicas(self, task_id, logical_core):
-    """Lookup replica ids by task number and logical core.
-
-    Args:
-      task_id: TensorFlow task number.
-      logical_core: An integer, identifying a logical core.
-    Returns:
-      A sorted list of the replicas that are attached to that task and
-      logical_core.
-    Raises:
-      ValueError: If no replica exists in the task which contains the logical
-      core.
-    """
-    try:
-      return self._task_and_cores_to_replicas[task_id][logical_core]
-    except KeyError:
-      raise ValueError(
-          "Can not find any replica in task: {} contains logical_core: {} ".
-          format(task_id, logical_core))
-
-  def tpu_ordinal(self, replica=0, logical_core=0):
-    """Returns the ordinal of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
-
-  def host_device(self, replica=0, logical_core=0, job=None):
-    """Returns the CPU device attached to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
-
-  def tpu_device(self, replica=0, logical_core=0, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
-
-
-def device_assignment(topology,
-                      computation_shape=None,
-                      computation_stride=None,
-                      num_replicas=1):
-  """Computes a device_assignment of a computation across a TPU topology.
-
-  Attempts to choose a compact grid of cores for locality.
-
-  Returns a `DeviceAssignment` that describes the cores in the topology assigned
-  to each core of each replica.
-
-  `computation_shape` and `computation_stride` values should be powers of 2 for
-  optimal packing.
-
-  Args:
-    topology: A `Topology` object that describes the TPU cluster topology.
-      To obtain a TPU topology, evaluate the `Tensor` returned by
-      `initialize_system` using `Session.run`. Either a serialized
-      `TopologyProto` or a `Topology` object may be passed. Note: you must
-      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
-    computation_shape: A rank 1 int32 numpy array with size equal to the
-      topology rank, describing the shape of the computation's block of cores.
-      If None, the `computation_shape` is `[1] * topology_rank`.
-    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
-      describing the inter-core spacing of the `computation_shape` cores in the
-      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
-    num_replicas: The number of computation replicas to run. The replicas will
-      be packed into the free spaces of the topology.
-
-  Returns:
-    A DeviceAssignment object, which describes the mapping between the logical
-    cores in each computation replica and the physical cores in the TPU
-    topology.
-
-  Raises:
-    ValueError: If `topology` is not a valid `Topology` object.
-    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
-      numpy arrays with shape [3] where all values are positive.
-    ValueError: If computation's replicas cannot fit into the TPU topology.
-  """
-  # Deserialize the Topology proto, if it is a string.
-  if isinstance(topology, bytes):
-    topology = Topology(serialized=topology)
-
-  if not isinstance(topology, Topology):
-    raise ValueError("`topology` is not a Topology object; got {}".format(
-        type(topology)))
-
-  topology_rank = len(topology.mesh_shape)
-  mesh_shape = topology.mesh_shape
-  if computation_shape is None:
-    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_shape = np.asarray(computation_shape, dtype=np.int32)
-
-  if computation_stride is None:
-    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_stride = np.asarray(computation_stride, dtype=np.int32)
-
-  if computation_shape.shape != (topology_rank,):
-    raise ValueError("computation_shape must have shape [{}]; got {}".format(
-        topology_rank, computation_shape.shape))
-  if computation_stride.shape != (topology_rank,):
-    raise ValueError("computation_stride must have shape [{}]; got {}".format(
-        topology_rank, computation_stride.shape))
-
-  if any(computation_shape < 1):
-    raise ValueError(
-        "computation_shape must be positive; got computation_shape={}".format(
-            computation_shape))
-  if any(computation_stride < 1):
-    raise ValueError(
-        "computation_stride must be positive; got computation_stride={}".format(
-            computation_stride))
-
-  # Computes the physical size of one computation instance.
-  computation_footprint = computation_shape * computation_stride
-  if any(computation_footprint > mesh_shape):
-    raise ValueError(
-        "computation footprint {} does not fit in TPU topology shape {}".format(
-            computation_footprint, mesh_shape))
-
-  # Computes how many copies of the computation footprint fit in the mesh.
-  block_counts = mesh_shape // computation_footprint
-
-  replica_counts = block_counts * computation_stride
-  max_replicas = np.prod(replica_counts)
-  if num_replicas > max_replicas:
-    raise ValueError(
-        "requested {} replicas but only {} replicas with shape {} and "
-        "computation_stride {} fit in a TPU mesh of shape {}".format(
-            num_replicas, max_replicas, computation_shape, computation_stride,
-            mesh_shape))
-
-  def ceil_of_ratio(n, m):
-    return (n + m - 1) // m
-
-  replica_shape = [0] * topology_rank
-  if num_replicas > 0:
-    remaining_replicas = num_replicas
-    remaining_dims = topology_rank
-
-    # Choose dimensions as close to an equal cube as possible, in order of
-    # increasing dimension size. By visiting dimensions in increasing size, we
-    # assign the most constrained dimension first, so we won't make infeasible
-    # choices.
-    #
-    # As a secondary sort order, visit the dimensions in reverse order. This
-    # means we try to use both cores on the same chip in preference to two cores
-    # on different chips.
-    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
-      i = -ni
-      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
-      replica_shape[i] = min(target_size, x)
-      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
-      remaining_dims -= 1
-
-    assert remaining_replicas == 1 and remaining_dims == 0
-
-  # Assigns an offset to each replica such that no two replicas overlap.
-  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
-  for replica in xrange(num_replicas):
-    # Chooses a replica number in each axis.
-    t = replica
-    pos = []
-    for dim in replica_shape[::-1]:
-      pos.append(t % dim)
-      t //= dim
-    replica_pos = np.array(pos[::-1], dtype=np.int32)
-
-    # Determines where that replica starts in each axis.
-    outer = replica_pos // computation_stride
-    inner = replica_pos % computation_stride
-    replica_offsets[replica, :] = outer * computation_footprint + inner
-
-  # Computes a complete logical core -> physical core mapping for each replica.
-  indices = [
-      np.arange(0, computation_shape[i] * computation_stride[i],
-                computation_stride[i]) for i in xrange(topology_rank)
-  ]
-  indices = np.concatenate(
-      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
-      axis=-1)
-  indices = indices.reshape((-1, topology_rank))
-  assignment = indices + replica_offsets[:, np.newaxis, :]
-  return DeviceAssignment(topology, core_assignment=assignment)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.device_assignment import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
index 52e1ea42370d653d1de7c12eee4b456ec7ce921c..1b1328b4075d9a737e40693c13e33e0b7c1fbedf 100644
--- a/tensorflow/contrib/tpu/python/tpu/error_handling.py
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -1,132 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""ErrorRendezvous handler for collecting errors from multiple threads."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import sys
-import threading
-import time
-
-import six
-
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import tf_logging as logging
-
-_UNINTERESTING_ERRORS = (errors.CancelledError,)
-
-
-class ErrorRendezvous(object):
-  """Resolve errors from multiple threads during TPU execution.
-
-  TPU errors can occur on the infeed or outfeed threads as well as the main
-  training thread.
-
-  Depending on which thread "wins" and receives the session error first, we may
-  end up showing users a confusing and non-actionable error message (session
-  cancelled) instead of a root cause (e.g. a bad filename).
-
-  The rendezvous object provides a location to capture these errors until all
-  threads terminate.  At that point we can choose the most informative error
-  to report.
-  """
-
-  def __init__(self, num_sources):
-    # string -> (message, traceback)
-    self._errors = {}
-    self._num_sources = num_sources
-    self._session_cancel_timer = None
-
-  def record_error(self, source, exc_info, session=None):
-    """Report an exception from the given source.
-
-    If a session is passed, a timer will be registered to close it after a few
-    seconds.  This is necessary to ensure the main training loop does not hang
-    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
-    interesting error from another thread to propagate.
-
-    Args:
-      source: string, source of the error
-      exc_info: Output from `sys.exc_info` (type, value, traceback)
-      session: Session to close after delay.
-    """
-    _, value, _ = exc_info
-    self._errors[source] = exc_info
-    logging.info('Error recorded from %s: %s', source, value)
-
-    if session is not None and self._session_cancel_timer is None:
-
-      def _cancel_session():
-        time.sleep(5)
-        try:
-          session.close()
-        except:  # pylint: disable=bare-except
-          pass
-
-      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
-      self._session_cancel_timer.daemon = True
-      self._session_cancel_timer.start()
-
-  def record_done(self, source):
-    """Mark execution source `source` as done.
-
-    If an error was originally reported from `source` it is left intact.
-
-    Args:
-      source: `str`, source being recorded
-    """
-    logging.info('%s marked as finished', source)
-    if source not in self._errors:
-      self._errors[source] = None
-
-  @contextlib.contextmanager
-  def catch_errors(self, source, session=None):
-    """Context manager to report any errors within a block."""
-    try:
-      yield
-    except Exception:  # pylint: disable=broad-except
-      self.record_error(source, sys.exc_info(), session)
-
-  def raise_errors(self, timeout_sec=0):
-    """Wait for up to `timeout` seconds for all error sources to finish.
-
-    Preferentially raise "interesting" errors (errors not in the
-    _UNINTERESTING_ERRORS) set.
-
-    Args:
-      timeout_sec: Seconds to wait for other error sources.
-    """
-    for _ in range(timeout_sec):
-      if len(self._errors) == self._num_sources:
-        break
-      time.sleep(1)
-
-    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
-
-    # First check for any interesting errors, then fall back on the session
-    # cancelled errors etc.
-    for k, (typ, value, traceback) in kept_errors:
-      if isinstance(value, _UNINTERESTING_ERRORS):
-        continue
-      else:
-        logging.warn('Reraising captured error')
-        six.reraise(typ, value, traceback)
-
-    for k, (typ, value, traceback) in kept_errors:
-      logging.warn('Reraising captured error')
-      six.reraise(typ, value, traceback)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.error_handling import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column.py b/tensorflow/contrib/tpu/python/tpu/feature_column.py
index 8edf131bc24fd003806263570b63ee8514c49896..ded75e975b10c4265370af260bf804687c9caebc 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column.py
+++ b/tensorflow/contrib/tpu/python/tpu/feature_column.py
@@ -1,429 +1,30 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU Feature Column Library."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.feature_column import feature_column_lib as fc_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import variable_scope
-# pylint: disable=protected-access
-
-
-_TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
-_SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
-                                  fc._VocabularyFileCategoricalColumn,
-                                  fc._VocabularyListCategoricalColumn,
-                                  fc._WeightedCategoricalColumn,
-                                  fc_lib.IdentityCategoricalColumn,
-                                  fc_lib.VocabularyFileCategoricalColumn,
-                                  fc_lib.VocabularyListCategoricalColumn,
-                                  fc_lib.WeightedCategoricalColumn)
-
-
-def embedding_column(categorical_column,
-                     dimension,
-                     combiner='mean',
-                     initializer=None):
-  """TPU embedding_column for `tf.feature_column.embedding_column`.
-
-  Note that the interface for TPU embedding_column is different from the non-TPU
-  version. The following args available for the non-TPU version are NOT
-  supported: ckpt_to_load_from, tensor_name_in_ckp, max_norm and trainable.
-
-  Args:
-    categorical_column: A categorical_column returned from
-        categorical_column_with_identity,  weighted_categorical_column,
-        categorical_column_with_vocabulary_list or
-        categorical_column_with_vocabulary_file.
-    dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. For more information, see
-      `tf.feature_column.embedding_column`.
-    initializer: A variable initializer function to be used in embedding
-      variable initialization. If not specified, defaults to
-      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-      `1/sqrt(dimension)`.
-
-  Returns:
-    A  _TPUEmbeddingColumn.
-
-  Raises:
-    ValueError: if `dimension` not > 0.
-    ValueError: if `initializer` is specified but not callable.
-  """
-  if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
-    raise TypeError(
-        'categorical_column for tpu '
-        ' embedding_column must be type %s, got %s.' % (' or '.join([
-            cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
-        ]), type(categorical_column)))
-  if (dimension is None) or (dimension < 1):
-    raise ValueError('Invalid dimension {}.'.format(dimension))
-
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified. '
-                     'Embedding of column_name: {}'.format(
-                         categorical_column.name))
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1 / math.sqrt(dimension))
-
-  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
-
-  def _creator(weight_collections, scope):
-    embedding_column_layer = fc._EmbeddingColumnLayer(
-        embedding_shape=embedding_shape,
-        initializer=initializer,
-        weight_collections=weight_collections,
-        trainable=True,
-        name='embedding_column_layer')
-    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
-
-  column = _TPUEmbeddingColumn(
-      categorical_column=categorical_column,
-      dimension=dimension,
-      combiner=combiner,
-      layer_creator=_creator,
-      ckpt_to_load_from=None,
-      tensor_name_in_ckpt=None,
-      max_norm=None,
-      trainable=True)
-  # For Embedding column, the initializer is hidden inside the creator Fn, which
-  # is not accessiable later. So, we attach it to a speicial field. Also note
-  # that non-TPU Embedding column and non-TPU shared Embedding column handle the
-  # initializer differently. See shared_embedding_columns for details.
-  column._tpu_initializer = initializer
-  return column
-
-
-def shared_embedding_columns(categorical_columns,
-                             dimension,
-                             combiner='mean',
-                             initializer=None,
-                             shared_embedding_collection_name=None):
-  """List of dense columns that convert from sparse, categorical input."""
-  for categorical_column in categorical_columns:
-    if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
-      raise TypeError(
-          'categorical_column for tpu '
-          ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
-              cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
-          ]), type(categorical_column)))
-  columns = fc_lib.shared_embedding_columns(
-      categorical_columns,
-      dimension,
-      combiner=combiner,
-      initializer=initializer,
-      shared_embedding_collection_name=shared_embedding_collection_name,
-      ckpt_to_load_from=None,
-      tensor_name_in_ckpt=None,
-      max_norm=None,
-      trainable=True)
-
-  # Use the initializer and shared_embedding_collection_name to create TPU
-  # version
-  initializer = columns[0].initializer
-  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
-  tpu_columns = []
-
-  # Create the state (_SharedEmbeddingColumnLayer) here.
-  for categorical_column in categorical_columns:
-    column = _TPUSharedEmbeddingColumn(
-        categorical_column=categorical_column,
-        dimension=dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=None,
-        tensor_name_in_ckpt=None,
-        max_norm=None,
-        trainable=True)
-    tpu_columns.append(column)
-
-  return tpu_columns
-
-
-class _TPUBaseEmbeddingColumn(object):
-  """Base class for TPU Embedding Column."""
-
-  def __init__(self, categorical_column):
-    self._tpu_categorical_column = categorical_column
-
-  def get_combiner(self):
-    """Returns the embedding combiner."""
-    raise NotImplementedError('not implemented')
-
-  def get_embedding_table_size(self):
-    """Returns the embedding table size, tuple of vocab size and dimension."""
-    raise NotImplementedError('not implemented')
-
-  def get_feature_key_name(self):
-    """Returns the feature key name in the features dict."""
-    raise NotImplementedError('not impl')
-
-  def get_weight_key_name(self):
-    """Return the key name for weights."""
-    raise NotImplementedError('not impl')
-
-  def get_embedding_var_name(self):
-    """Returns the embedding variable name.
-
-    Feature key name and embedding variable name are usually one-to-one mapping.
-    But for shared embedding columns, it is many-to-one mapping.
-    """
-    raise NotImplementedError('not impl')
-
-  def get_initializer(self):
-    """Returns the initializer."""
-    raise NotImplementedError('not impl')
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    raise NotImplementedError('not impl')
-
-
-class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
-  """Core Embedding Column."""
-
-  def __new__(cls,
-              categorical_column,
-              dimension,
-              combiner='mean',
-              layer_creator=None,
-              ckpt_to_load_from=None,
-              tensor_name_in_ckpt=None,
-              max_norm=None,
-              trainable=True):
-    # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
-    # are not supported on TPU. They are solely for matching the signature of
-    # __new__ of parent class fc._EmbeddingColumn.
-    return fc._EmbeddingColumn.__new__(
-        cls,
-        categorical_column,
-        dimension,
-        combiner=combiner,
-        layer_creator=layer_creator,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable)
-
-  def __init__(self,
-               categorical_column,
-               dimension,
-               combiner='mean',
-               layer_creator=None,
-               ckpt_to_load_from=None,
-               tensor_name_in_ckpt=None,
-               max_norm=None,
-               trainable=True):
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
-    self._key = None
-
-  def get_combiner(self):
-    return self.combiner
-
-  def get_embedding_table_size(self):
-    """Returns num_ids and width."""
-    return (self.categorical_column._num_buckets, self.dimension)
-
-  def get_feature_key_name(self):
-    """get_feature_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.categorical_column.name
-    return self.categorical_column.name
-
-  def get_weight_key_name(self):
-    """get_weight_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.weight_feature_key
-    return None
-
-  def get_embedding_var_name(self):
-    """get_embedding_var_name."""
-    return self.categorical_column.name
-
-  def get_initializer(self):
-    return self._tpu_initializer
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    if isinstance(
-        self.categorical_column,
-        (
-            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
-            fc_lib.WeightedCategoricalColumn)):
-      return True
-    return False
-
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
-    if tpu.under_tpu_inference_context():
-      def host_computation():
-        return fc._EmbeddingColumn._get_dense_tensor(
-            self, inputs, weight_collections, trainable)
-      return tpu.outside_compilation(host_computation)
-
-    if _is_running_on_cpu():
-      return fc._EmbeddingColumn._get_dense_tensor(
-          self, inputs, weight_collections, trainable)
-
-    # TPU mode
-    # Get the embeddings from the LazyBuilder.
-    tensor = inputs.get(self.get_feature_key_name())
-
-    # Add to collection for _create_tpu_embedding_variables_and_ops
-    _record_variable_scope_and_name(self.get_embedding_var_name(),
-                                    'embedding_weights')
-
-    return tensor
-
-
-class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
-                                fc._SharedEmbeddingColumn):
-  """Core Shared Embedding Column."""
-
-  def __new__(cls,
-              categorical_column,
-              dimension,
-              combiner='mean',
-              initializer=None,
-              shared_embedding_collection_name=None,
-              ckpt_to_load_from=None,
-              tensor_name_in_ckpt=None,
-              max_norm=None,
-              trainable=True):
-    return fc._SharedEmbeddingColumn.__new__(
-        cls,
-        categorical_column,
-        dimension,
-        combiner=combiner,
-        initializer=initializer,
-        shared_embedding_collection_name=shared_embedding_collection_name,
-        ckpt_to_load_from=ckpt_to_load_from,
-        tensor_name_in_ckpt=tensor_name_in_ckpt,
-        max_norm=max_norm,
-        trainable=trainable)
-
-  def __init__(self,
-               categorical_column,
-               dimension,
-               combiner='mean',
-               initializer=None,
-               shared_embedding_collection_name=None,
-               ckpt_to_load_from=None,
-               tensor_name_in_ckpt=None,
-               max_norm=None,
-               trainable=True):
-
-    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
-    self._key = None
-
-  def get_combiner(self):
-    return self.combiner
-
-  def get_embedding_table_size(self):
-    """Returns num_ids and width."""
-    return (self.categorical_column._num_buckets, self.dimension)
-
-  def get_feature_key_name(self):
-    """get_feature_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.categorical_column.name
-    return self.categorical_column.name
-
-  def get_weight_key_name(self):
-    """get_weight_key_name."""
-    if self.is_categorical_column_weighted():
-      return self.categorical_column.weight_feature_key
-    return None
-
-  def get_embedding_var_name(self):
-    """get_embedding_var_name."""
-    return self.shared_embedding_collection_name
-
-  def get_initializer(self):
-    return self.initializer
-
-  def is_categorical_column_weighted(self):
-    """Check if the categorical column of the embedding column is weighted."""
-    if isinstance(
-        self.categorical_column,
-        (
-            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
-            fc_lib.WeightedCategoricalColumn)):
-      return True
-    return False
-
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
-    if tpu.under_tpu_inference_context():
-      def host_computation():
-        return fc._SharedEmbeddingColumn._get_dense_tensor(
-            self, inputs, weight_collections, trainable)
-      return tpu.outside_compilation(host_computation)
-
-    if _is_running_on_cpu():
-      return fc._SharedEmbeddingColumn._get_dense_tensor(
-          self, inputs, weight_collections, trainable)
-
-    # TPU mode
-    # Get the embeddings from the LazyBuilder.
-    tensor = inputs.get(self.get_feature_key_name())
-
-    # Add to collection for _create_tpu_embedding_variables_and_ops
-    _record_variable_scope_and_name(
-        self.get_embedding_var_name(),
-        'embedding_weights',
-        is_shared_embedding=True)
-    return tensor
-
-
-def _record_variable_scope_and_name(embedding_var_name,
-                                    embedding_var_name_in_fc,
-                                    is_shared_embedding=False):
-  """Add embedding variable name and scope to collection."""
-  g = ops.get_default_graph()
-  collection = g.get_collection_ref(_TPU_FC_TO_SCOPE)
-  if not collection:
-    collection.append({})
-
-  var_def_dict = collection[0]
-
-  captured_scope = None
-
-  if is_shared_embedding and (embedding_var_name in var_def_dict):
-    if var_def_dict[embedding_var_name][1] != embedding_var_name_in_fc:
-      raise ValueError(
-          'For embedding var name {}, the shared embedding name is different, '
-          'got {}; expected {}'.format(embedding_var_name,
-                                       embedding_var_name_in_fc,
-                                       var_def_dict[embedding_var_name][1]))
-  else:
-    # scope contains var_scope_name.
-    captured_scope = variable_scope.get_variable_scope()
-    var_def_dict[embedding_var_name] = (captured_scope,
-                                        embedding_var_name_in_fc)
-
-
-def _is_running_on_cpu():
-  """Returns True if the current context is CPU model."""
-  return tpu_function.get_tpu_context().number_of_shards is None
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.feature_column import *
+# used by tests
+from tensorflow.python.tpu.feature_column import _is_running_on_cpu
+from tensorflow.python.tpu.feature_column import _record_variable_scope_and_name
+from tensorflow.python.tpu.feature_column import _TPU_FC_TO_SCOPE
+from tensorflow.python.tpu.feature_column import _TPUBaseEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUSharedEmbeddingColumn
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/functional.py b/tensorflow/contrib/tpu/python/tpu/functional.py
index 1ec9b5b33d007eb2eaa557438f32ea69053261c6..9a5759221ed9660200cc213df69961db56f8d490 100644
--- a/tensorflow/contrib/tpu/python/tpu/functional.py
+++ b/tensorflow/contrib/tpu/python/tpu/functional.py
@@ -1,25 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-"""Functional operations."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import gen_functional_ops
-
-
-TPUPartitionedCall = gen_functional_ops._tpu_partitioned_call  # pylint: disable=invalid-name,protected-access
-
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.functional import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 37fe9af8c4b154a2e20a957f6ca5d97df3d413be..14a484b2c46ecf4231adbfdfda3b575edb7ef4a1 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -55,8 +55,6 @@ import numpy as np
 import six
 
 from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver as tpu_cluster_resolver_lib
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import keras_tpu_variables
 from tensorflow.contrib.tpu.python.tpu import tpu
@@ -64,6 +62,8 @@ from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
+from tensorflow.python import tf2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
@@ -94,6 +94,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.deprecation import deprecated
 
 
 # TODO(b/114775106): temporary shim to optionally initialize the TPU
@@ -200,13 +201,22 @@ class TPUDistributionStrategy(object):
         removed in future once the model replication functionality is mature
         enough. If `False` (default behavior), the system automatically finds
         the best configuration, in terms of number of TPU cores, for the model
-        replication, typically using all avaiable TPU cores. If overwrites as
+        replication, typically using all available TPU cores. If overwrites as
         `True`, force the model replication using single core, i.e., no
         replication.
     Raises:
       Exception: No TPU Found on the given worker.
     """
-
+    if tf2.enabled():
+      raise RuntimeError(
+          'Keras support is now deprecated in support of TPU Strategy. '
+          'Please follow the distribution strategy guide on tensorflow.org '
+          'to migrate to the 2.0 supported version.')
+    else:
+      logging.warning(
+          'Keras support is now deprecated in support of TPU Strategy. '
+          'Please follow the distribution strategy guide on tensorflow.org '
+          'to migrate to the 2.0 supported version.')
     if tpu_cluster_resolver is None:
       tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')
 
@@ -298,10 +308,11 @@ def _cross_replica_concat(tensor, core_id, num_cores, name):
                     '{}.'.format(input_dtype, name))
 
   batch_size = tensor.shape[0]
-  mask = math_ops.to_float(
-      math_ops.equal(np.arange(num_cores, dtype=np.int32), core_id))
+  mask = math_ops.cast(
+      math_ops.equal(np.arange(num_cores, dtype=np.int32), core_id),
+      dtypes.float32)
   mask = array_ops.reshape(mask, [num_cores] + [1] * tensor.shape.ndims)
-  result = mask * math_ops.to_float(tensor)
+  result = mask * math_ops.cast(tensor, dtypes.float32)
   local_tensor_with_holes = array_ops.reshape(result,
                                               [-1] + result.shape.as_list()[2:])
   concat_tensor = tpu_ops.cross_replica_sum(local_tensor_with_holes)
@@ -725,9 +736,10 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
 
     self._dataset = dataset
     self._tpu_assignment = tpu_assignment
-    dummy_x_shape = dataset.output_shapes[0].as_list()
+    dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+    dummy_x_shape = dataset_output_shapes[0].as_list()
     dummy_x_shape[0] *= tpu_assignment.num_towers
-    dummy_y_shape = dataset.output_shapes[1].as_list()
+    dummy_y_shape = dataset_output_shapes[1].as_list()
     dummy_y_shape[0] *= tpu_assignment.num_towers
     self._iterator = dataset_ops.make_initializable_iterator(dataset)
     K.get_session().run(self._iterator.initializer)
@@ -743,23 +755,26 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
 
     # Use dummy numpy inputs for the rest of Keras' shape checking. We
     # intercept them when building the model.
+    dataset_output_types = dataset_ops.get_legacy_output_types(dataset)
     self._dummy_x = np.zeros(
-        dummy_x_shape, dtype=dataset.output_types[0].as_numpy_dtype)
+        dummy_x_shape, dtype=dataset_output_types[0].as_numpy_dtype)
     self._dummy_y = np.zeros(
-        dummy_y_shape, dtype=dataset.output_types[1].as_numpy_dtype)
+        dummy_y_shape, dtype=dataset_output_types[1].as_numpy_dtype)
 
     input_specs = []
-    if isinstance(self._iterator.output_shapes, tuple):
-      assert isinstance(self._iterator.output_types, tuple)
-      assert len(self._iterator.output_shapes) == len(
-          self._iterator.output_types)
-      for i in range(len(self._iterator.output_shapes)):
-        spec = tensor_spec.TensorSpec(self._iterator.output_shapes[i],
-                                      self._iterator.output_types[i])
+    iterator_output_shapes = dataset_ops.get_legacy_output_shapes(
+        self._iterator)
+    iterator_output_types = dataset_ops.get_legacy_output_types(self._iterator)
+    if isinstance(iterator_output_shapes, tuple):
+      assert isinstance(iterator_output_types, tuple)
+      assert len(iterator_output_shapes) == len(iterator_output_types)
+      for i in range(len(iterator_output_shapes)):
+        spec = tensor_spec.TensorSpec(iterator_output_shapes[i],
+                                      iterator_output_types[i])
         input_specs.append(spec)
-    elif isinstance(self._iterator.output_shapes, tensor_shape.TensorShape):
-      spec = tensor_spec.TensorSpec(self._iterator.output_shapes,
-                                    self._iterator.output_types)
+    elif isinstance(iterator_output_shapes, tensor_shape.TensorShape):
+      spec = tensor_spec.TensorSpec(iterator_output_shapes,
+                                    iterator_output_types)
       input_specs.append(spec)
 
     # Pre-process the inputs and get_next_ops before caching.
@@ -770,24 +785,26 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
 
   def _verify_dataset_shape(self, dataset):
     """Verifies a dataset is of an appropriate shape for TPUs."""
+    dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+    dataset_output_classes = dataset_ops.get_legacy_output_classes(dataset)
     if not isinstance(dataset, dataset_ops.DatasetV2):
       raise ValueError('The function passed as the `x` parameter did not '
                        'return a `tf.data.Dataset`.')
-    if not isinstance(dataset.output_classes, tuple):
+    if not isinstance(dataset_output_classes, tuple):
       raise ValueError('The dataset must return a tuple of tf.Tensors, '
-                       'instead it returns: %s' % dataset.output_classes)
-    if len(dataset.output_classes) != 2:
+                       'instead it returns: %s' % dataset_output_classes)
+    if len(dataset_output_classes) != 2:
       raise ValueError('The dataset must return a 2-element tuple, got '
-                       '%s output classes instead.' % (dataset.output_classes,))
-    for i, cls in enumerate(dataset.output_classes):
+                       '%s output classes instead.' % (dataset_output_classes,))
+    for i, cls in enumerate(dataset_output_classes):
       if cls != ops.Tensor:
         raise ValueError('The dataset returned a non-Tensor type (%s) at '
                          'index %d.' % (cls, i))
-    for i, shape in enumerate(dataset.output_shapes):
+    for i, shape in enumerate(dataset_output_shapes):
       if not shape:
         raise ValueError('The dataset returns a scalar tensor in '
                          'tuple index %d. Did you forget to batch? '
-                         '(Output shapes: %s).' % (i, dataset.output_shapes))
+                         '(Output shapes: %s).' % (i, dataset_output_shapes))
       for j, dim in enumerate(shape):
         if dim.value is None:
           if j == 0:
@@ -800,7 +817,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
               'currently requires static shapes. The provided '
               'dataset only has a partially defined shape. '
               '(Dimension %d of output tensor %d is not statically known '
-              'for output shapes: %s.%s)' % (j, i, dataset.output_shapes, hint))
+              'for output shapes: %s.%s)' % (j, i, dataset_output_shapes, hint))
 
   @property
   def dummy_x(self):
@@ -1028,29 +1045,29 @@ class TPUFunction(object):
           # the Momentum optimizer) when _make_train_function is invoked.
           with keras_tpu_variables.replicated_variable_for_optimizer(
               self._tpu_assignment.num_towers):
-            self._cloned_model._make_fit_function()
+            self._cloned_model._make_train_function()
         else:
-          self._cloned_model._make_fit_function()
+          self._cloned_model._make_train_function()
 
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model._fit_function.outputs
+            for tensor in self._cloned_model.train_function.outputs
         ]
         return [
-            self._cloned_model._fit_function.updates_op,
+            self._cloned_model.train_function.updates_op,
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model._fit_function.outputs,
+                self._cloned_model.train_function.outputs,
                 name='outfeed-enqueue-train')
         ]
       elif is_test:
-        self._cloned_model._make_eval_function()
+        self._cloned_model._make_test_function()
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model._eval_function.outputs
+            for tensor in self._cloned_model.test_function.outputs
         ]
         return [
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model._eval_function.outputs,
+                self._cloned_model.test_function.outputs,
                 name='outfeed-enqueue-test')
         ]
       elif is_predict:
@@ -1367,7 +1384,16 @@ class KerasTPUModel(models.Model):
         outputs=cpu_model.outputs,
         name=cpu_model.name,
     )
-
+    if tf2.enabled():
+      raise RuntimeError(
+          'Keras support is now deprecated in support of TPU Strategy. '
+          'Please follow the distribution strategy guide on tensorflow.org '
+          'to migrate to the 2.0 supported version.')
+    else:
+      logging.warning(
+          'Keras support is now deprecated in support of TPU Strategy. '
+          'Please follow the distribution strategy guide on tensorflow.org '
+          'to migrate to the 2.0 supported version.')
     # Create a mapping from numpy arrays to infeed managers.
     # Note: uses a list of tuples instead of a map because numpy arrays are
     # not hashable.
@@ -1380,8 +1406,6 @@ class KerasTPUModel(models.Model):
     self.predict_function = None
     self.test_function = None
     self.train_function = None
-    self._fit_function = None
-    self._eval_function = None
     self._stateful_metric_functions = []
 
     cluster_resolver = strategy._tpu_cluster_resolver
@@ -2022,21 +2046,6 @@ class KerasTPUModel(models.Model):
           self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
     return self.test_function
 
-  def _make_fit_function(self):
-    if not self._fit_function:
-      self._fit_function = TPUFunction(
-          self,
-          model_fn_lib.ModeKeys.TRAIN,
-          tpu_assignment=self._tpu_assignment)
-
-    return self._fit_function
-
-  def _make_eval_function(self):
-    if not self._eval_function:
-      self._eval_function = TPUFunction(
-          self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
-    return self._eval_function
-
   def _make_predict_function(self):
     if not self.predict_function:
       self.predict_function = TPUFunction(
@@ -2172,7 +2181,10 @@ Output shape: %(output_shape)s
 # pylint: enable=bad-continuation
 
 
-@experimental
+@deprecated(
+    '2019-02-20', 'Switch to tf.contrib.distribute.TPUStrategy. '
+    'https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy'
+)
 def tpu_model(model, strategy=None):
   """Copy `model` along with weights to the TPU.
 
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index f5735cecc38b7033f21fc4d4105cfead233379fa..ed8f9525c9b91208d39805654b01837abdbf3a77 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -1,437 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Operations for handling session logging and shutdown notifications."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-import time
-from google.protobuf import text_format
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.util import event_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-_WATCHDOG = None
-
-
-class CoordinatorShutdownException(Exception):
-  """Raised when the coordinator needs to shutdown."""
-  pass
-
-
-def _clone_session(session, graph=None):
-  return session_lib.Session(
-      target=session.sess_str,
-      config=session._config,  # pylint: disable=protected-access
-      graph=graph if graph else session.graph)
-
-
-def _make_heartbeat_op(session, device, request_ph):
-  """Return a heartbeat op or None if heartbeats are not supported by device."""
-  try:
-    # Test if we can connect in a isolated graph + session
-    with ops.Graph().as_default():
-      with _clone_session(session) as temp_session:
-        with ops.device(device):
-          heartbeat_op = tpu_ops.worker_heartbeat('')
-          options = config_pb2.RunOptions(timeout_in_ms=5000)
-          temp_session.run(heartbeat_op, options=options)
-  except errors.InvalidArgumentError as _:
-    logging.warning('Error running heartbeat on %s', device)
-    return None
-  except errors.DeadlineExceededError as _:
-    logging.warning('Timeout connecting to %s when testing heartbeat', device)
-    return None
-
-  # If we successfully connected and pinged the worker, go ahead and construct
-  # the operation.
-  with ops.device(device):
-    return tpu_ops.worker_heartbeat(request_ph)
-
-
-class WorkerHeartbeatManager(object):
-  """Manages the status/heartbeat monitor for a set of workers."""
-
-  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
-    """Construct a new WorkerHeartbeatManager.
-
-    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
-
-    Args:
-      session: `tf.Session`, session to use for heartbeat operations.
-      devices: `list[string]` Set of devices to connect to.
-      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
-      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
-        the WorkerHeartbeatRequest protocol buffer.
-    """
-    self._session = session
-    self._devices = devices
-    self._ops = heartbeat_ops
-    self._request_placeholder = request_placeholder
-
-  @staticmethod
-  def from_devices(session, devices):
-    """Construct a heartbeat manager for the given devices."""
-    if not devices:
-      logging.error('Trying to create heartbeat manager with no devices?')
-
-    logging.info('Creating heartbeat manager for %s', devices)
-    request_placeholder = array_ops.placeholder(
-        name='worker_heartbeat_request', dtype=dtypes.string)
-
-    heartbeat_ops = []
-    kept_devices = []
-    for device in devices:
-      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
-      if heartbeat_op is not None:
-        kept_devices.append(device)
-        heartbeat_ops.append(heartbeat_op)
-      else:
-        logging.warning('Heartbeat support not available for %s', device)
-
-    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
-                                  request_placeholder)
-
-  def num_workers(self):
-    return len(self._devices)
-
-  def configure(self, message):
-    """Configure heartbeat manager for all devices.
-
-    Args:
-      message: `event_pb2.WorkerHeartbeatRequest`
-    Returns: `None`
-    """
-    logging.info('Configuring worker heartbeat: %s',
-                 text_format.MessageToString(message))
-    self._session.run(self._ops,
-                      {self._request_placeholder: message.SerializeToString()})
-
-  def ping(self, request=None, timeout_in_ms=5000):
-    """Ping all workers, returning the parsed status results."""
-    if request is None:
-      request = event_pb2.WorkerHeartbeatRequest()
-
-    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
-    results = self._session.run(
-        self._ops,
-        feed_dict={self._request_placeholder: request.SerializeToString()},
-        options=options)
-    parsed_results = [
-        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
-        for res_pb in results
-    ]
-    logging.debug('Ping results: %s', parsed_results)
-    return parsed_results
-
-  def lame_workers(self):
-    """Ping all workers, returning manager containing lame workers (or None)."""
-    ping_results = self.ping()
-    lame_workers = []
-
-    for ping_response, device, op in zip(ping_results, self._devices,
-                                         self._ops):
-      if ping_response.health_status != event_pb2.OK:
-        lame_workers.append((device, op))
-
-    if not lame_workers:
-      return None
-
-    bad_devices, bad_ops = zip(*lame_workers)
-    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
-                                  self._request_placeholder)
-
-  def __repr__(self):
-    return 'HeartbeatManager(%s)' % ','.join(self._devices)
-
-  def shutdown(self, timeout_ms=10000):
-    """Shutdown all workers after `shutdown_timeout_secs`."""
-    logging.info('Shutting down %s.', self)
-    req = event_pb2.WorkerHeartbeatRequest(
-        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms))
-    self.configure(req)
-
-    # Wait for workers to shutdown.  This isn't strictly required
-    # but it avoids triggering multiple checkpoints with the same lame worker.
-    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
-    time.sleep(timeout_ms / 1000)
-
-
-def all_worker_devices(session):
-  """Return a list of devices for each worker in the system."""
-  devices = session.list_devices()
-  return [
-      device.name
-      for device in devices
-      if ':CPU:' in device.name and 'coordinator' not in device.name
-  ]
-
-
-class WatchdogManager(threading.Thread):
-  """Configures worker watchdog timer and handles periodic pings.
-
-  Usage:
-    # Ping workers every minute, shutting down workers if they haven't received
-    # a ping after 1 hour.
-    watchdog_manager = WatchdogManager(
-      ping_interval=60, shutdown_timeout=3600
-    )
-
-    # Use as a context manager, resetting watchdog on context exit:
-    with watchdog_manager:
-      session.run(...)
-
-    # Or setup globally; watchdog will remain active until program exit.
-    watchdog_manager.configure_and_run()
-  """
-
-  def __init__(self,
-               session,
-               devices=None,
-               ping_interval=60,
-               shutdown_timeout=3600):
-    """Initialize a watchdog manager.
-
-    Args:
-      session: Session connected to worker devices.  A cloned session and graph
-        will be created for managing worker pings.
-      devices: Set of devices to monitor.  If none, all workers will be
-        monitored.
-      ping_interval: Time, in seconds, between watchdog pings.
-      shutdown_timeout: Time, in seconds, before watchdog timeout.
-    """
-    threading.Thread.__init__(self)
-    self.ping_interval = ping_interval
-    self.shutdown_timeout = shutdown_timeout
-    self.daemon = True
-    self._config = session._config  # pylint: disable=protected-access
-    self._target = session.sess_str
-    self._running = False
-    self._devices = devices
-
-    self._graph = None
-    self._session = None
-    self._worker_manager = None
-
-  def _reset_manager(self):
-    """Reset the graph, session and worker manager."""
-    self._graph = ops.Graph()
-    self._session = session_lib.Session(
-        target=self._target,
-        graph=self._graph,
-        config=self._config,
-    )
-
-    if self._devices is None:
-      self._devices = all_worker_devices(self._session)
-
-    with self._graph.as_default():
-      self._worker_manager = WorkerHeartbeatManager.from_devices(
-          self._session, self._devices)
-
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(
-                timeout_ms=self.shutdown_timeout * 1000,),
-            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
-
-  def configure_and_run(self):
-    logging.info(
-        'Enabling watchdog timer with %d second timeout '
-        'and %d second ping interval.', self.shutdown_timeout,
-        self.ping_interval)
-    self._reset_manager()
-    self._running = True
-    self.start()
-
-  def stop(self):
-    logging.info('Stopping worker watchdog.')
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
-            shutdown_mode=event_pb2.NOT_CONFIGURED))
-    self._running = False
-    self.join()
-
-  def __enter__(self):
-    self.configure_and_run()
-
-  def __exit__(self, exc_type, exc_val, exc_tb):
-    self.stop()
-
-  def run(self):
-    # Don't fetch logs or adjust timing: just ping the watchdog.
-    #
-    # If we hit an exception, reset our session as it is likely broken.
-    while self._running:
-      try:
-        self._worker_manager.ping(request=None)
-        time.sleep(self.ping_interval)
-      except errors.OpError as e:
-        # Catch any TF errors that occur so we don't stop sending heartbeats
-        logging.debug('Caught error while sending heartbeat: %s', e)
-        self._reset_manager()
-
-
-def start_worker_watchdog(session,
-                          devices=None,
-                          ping_interval=60,
-                          shutdown_timeout=3600):
-  """Start global worker watchdog to shutdown workers on coordinator exit."""
-  global _WATCHDOG
-  if _WATCHDOG is None:
-    # Ensure we can send a few pings before we timeout!
-    ping_interval = min(shutdown_timeout / 10., ping_interval)
-    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
-                                shutdown_timeout)
-    _WATCHDOG.configure_and_run()
-
-
-class GracefulShutdownHook(session_run_hook.SessionRunHook):
-  """Session hook that watches for shutdown events.
-
-  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
-  SystemShutdown exception is raised to terminate the main session.  If `saver`
-  is None the `SAVERS` collection will be read to find a saver.
-
-  `on_shutdown_hooks` is an optional list of functions that should be called
-  after checkpointing.  The function is called with (`run_context`,
-  `all_workers`, `lame_workers`).
-
-  If `heartbeat_group` is not specified, it will default to all CPU workers
-  in the system.
-  """
-
-  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
-    self._saver = saver
-    self._checkpoint_prefix = checkpoint_prefix
-    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
-
-    # Worker heartbeats are managed independently of the main training graph.
-    self._graph = ops.Graph()
-    self._workers = None
-    self._session = None
-    self._heartbeat_supported = False
-
-  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
-    # N.B. We have to pull the global step here to avoid it being unavailable
-    # at checkpoint time; the graph has been frozen at that point.
-    if training_util.get_global_step() is None and self.saver() is not None:
-      raise ValueError(
-          'Saver defined but no global step.  Run `get_or_create_global_step()`'
-          ' in your model definition to allow checkpointing.')
-
-    with self._graph.as_default():
-      logging.info('Installing graceful shutdown hook.')
-      self._session = _clone_session(training_session, self._graph)
-      self._workers = WorkerHeartbeatManager.from_devices(
-          self._session, all_worker_devices(self._session))
-      self._heartbeat_supported = self._workers.num_workers() > 0
-      if self._heartbeat_supported:
-        self._workers.configure(
-            event_pb2.WorkerHeartbeatRequest(
-                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
-      else:
-        logging.warn(
-            'No workers support hearbeats. Failure handling will be disabled.')
-
-  def saver(self):
-    if self._saver:
-      return self._saver
-
-    savers = ops.get_collection(ops.GraphKeys.SAVERS)
-    if not savers:
-      return None
-
-    if not isinstance(savers, list):
-      return savers
-
-    if len(savers) > 1:
-      logging.error(
-          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
-          'will be disabled. Pass an explicit `saver` to the constructor to '
-          'override this behavior.')
-      return None
-
-    return savers[0]
-
-  def after_run(self, run_context, run_values):
-    del run_values
-
-    if not self._heartbeat_supported:
-      return
-
-    lame_workers = self._workers.lame_workers()
-    if lame_workers:
-      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
-
-      if self.saver():
-        logging.info('ShutdownHook: saving checkpoint to %s',
-                     self._checkpoint_prefix)
-        self.saver().save(
-            run_context.session,
-            self._checkpoint_prefix,
-            global_step=training_util.get_global_step(),
-            write_state=True,
-        )
-      else:
-        logging.info('ShutdownHook: no Saver defined.')
-
-      for fn in self._on_shutdown_hooks:
-        fn(run_context, self._workers, lame_workers)
-
-
-class RestartComputation(object):
-  """Restart the entire computation.
-
-  This hook shuts down all workers and returns control to the top-level by
-  throwing a CoordinatorShutdownException.
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    del run_context, lame_workers
-    all_workers.shutdown(timeout_ms=self.timeout_ms)
-
-    logging.info('Terminating coordinator.')
-    raise CoordinatorShutdownException()
-
-
-class ShutdownLameWorkers(object):
-  """Shutdown lamed workers.
-
-  Processing will continue normally (typically by waiting for the down
-  workers to be restarted).
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_in_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.session_support import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index bf492e78a15acc92017663a286e8c8f0b2045339..73db253fd790f26679fb05bd6e7a5da6a99da1a7 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -1,1147 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ========================================================================
-"""A utility to trace tensor values on TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import os.path
-import re
-import sys
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-
-_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
-_DEVICE_TYPE_TPU = 'tpu'
-_DEVICE_TYPE_CPU = 'cpu'
-_TRACE_MODE_NAN_INF = 'nan-inf'
-_TRACE_MODE_PART_TENSOR = 'part-tensor'
-_TRACE_MODE_PART_TENSOR_SIZE = 3
-_TRACE_MODE_FULL_TENSOR = 'full-tensor'
-_TRACE_MODE_NORM = 'norm'
-_TRACE_MODE_MAX_ABS = 'max-abs'
-_SUBMODE_BRIEF = 'brief'
-_SUBMODE_DETAILED = 'detailed'
-_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
-_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
-_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
-_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
-_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
-_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
-_REASON_SCALAR_GET_TRACED = 'traced-scalar'
-_REASON_TENSOR_GET_TRACED = 'traced-tensor'
-_REASON_USER_INCLUDED = 'traced-user-included'
-_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
-_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
-_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
-_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
-_MARKER_SECTION_END = '!!!!!!! section-end:'
-_SECTION_NAME_CONFIG = 'configuration'
-_SECTION_NAME_REASON = 'reason'
-_SECTION_NAME_OP_LIST = 'op-list'
-_SECTION_NAME_TENSOR_LIST = 'tensor-list'
-_SECTION_NAME_GRAPH = 'graph'
-_FIELD_NAME_VERSION = 'version:'
-_FIELD_NAME_DEVICE = 'device:'
-_FIELD_NAME_TRACE_MODE = 'trace-mode:'
-_FIELD_NAME_SUBMODE = 'submode:'
-_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
-_FIELD_NAME_NUM_OPS = 'number-of-ops:'
-_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
-_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
-_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
-_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
-_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
-_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
-_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
-_FLAG_NAME_ENABLE = 'enable'
-_FLAG_NAME_TRACE_MODE = 'trace_mode'
-_FLAG_NAME_SUBMODE = 'submode'
-_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
-_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
-_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
-_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
-_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
-_FLAG_NAME_TRACE_FILE = 'trace_file_path'
-_FLAG_NAME_REPORT_FILE = 'report_file_path'
-_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
-_FLAG_NAME_OP_RANGE = 'op_range'
-_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
-_OUTPUT_STREAM_ESCAPE = 'file://'
-_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
-_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
-_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
-
-
-def tensor_tracepoint(tensor, checkpoint_name):
-  """Adds a checkpoint with the given checkpoint name for the given tensor.
-
-  The tensor will be added to the list of tensors that will be traced by the
-  tensor tracer.
-
-  Args:
-     tensor: the tensor object for which the tracing is requested.
-     checkpoint_name: a string name for the checkpoint. This name has to be a
-     unique name if used within model comparison. The tensors that have the same
-     checkpoint identifier is compared in model comparison.
-  Returns:
-    The provided tensor.
-  """
-
-  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
-  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
-                                 (tensor, checkpoint_name))
-  return tensor
-
-
-def keras_layer_tracepoint(layer, checkpoint_name):
-  """An interface for adding the tensor outputs of a keras layer.
-
-  Encapsulates tensor_tracepoint.
-
-  Args:
-     layer: A keras layer.
-     checkpoint_name: a string name for the checkpoint. This name has to be a
-     unique name if used within model comparison. The tensors that have the same
-     checkpoint identifier is compared in model comparison.
-
-  Returns:
-    The provided layer.
-  """
-  try:
-    outputs = layer.output
-    if tensor_util.is_tensor(outputs):
-      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
-    else:
-      idx = 0
-      for output_tensor in outputs:
-        if tensor_util.is_tensor(outputs):
-          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
-        idx += 1
-  except AttributeError:
-    pass
-  except RuntimeError:
-    pass
-  return layer
-
-
-class TensorTracer(object):
-  """A software construct for tracing tensor values in a TF graph on TPU.
-
-  This utility is disabled by default. It can be enabled by setting
-  the TENSOR_TRACER_FLAGS env variable as:
-    export TENSOR_TRACER_FLAGS="--enable=1"
-  If it is enabled, it will trace the output tensor values of
-  selected Ops in the graph. It has two outputs: (1) the traces and (2)
-  a report. The traces are dumped to a specified local file on the TPU
-  host. The report is printed to the log.info of the TPU job.
-  By passing options via the env variable, users can change:
-     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
-         full tensor values)
-     (2) which Ops to be traced (via op.name or op.type)
-     (3) output trace file path.
-  """
-
-  @staticmethod
-  def _match_next_flag(flags, pos):
-    """Returns the match for the next TensorTracer flag.
-
-    Args:
-       flags: a string that contains the flags.
-       pos: where in flags to start the search.
-
-    Returns:
-       A pair where the first element is the regular-expression
-       match found and the second element indicates if the match
-       has a value.
-    """
-
-    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match, True
-    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
-    if match:
-      # The flag is found but is not given a value.
-      return match, False
-    # The flag is not found.
-    return None, False
-
-  @staticmethod
-  def validate_flag_names():
-    """Validates if the TensorTrace flags passed are valid."""
-    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
-                        _FLAG_NAME_SUBMODE,
-                        _FLAG_NAME_EXCLUDED_OPNAMES,
-                        _FLAG_NAME_EXCLUDED_OPTYPES,
-                        _FLAG_NAME_INCLUDED_OPNAMES,
-                        _FLAG_NAME_INCLUDED_OPTYPES,
-                        _FLAG_NAME_TRACE_FILE, _FLAG_NAME_REPORT_FILE,
-                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
-                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
-                        _FLAG_NAME_OP_RANGE]
-    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return
-    pos = 0
-    while True:
-      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
-      if not match:
-        break
-      flag_name = match.group(1)
-      if flag_name not in valid_flag_names:
-        raise ValueError(
-            'The flag name "%s" passed via the environment variable "%s" '
-            'is invalid. Valid flag names are:'
-            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
-      pos = match.end()
-
-  @staticmethod
-  def print_flag_values():
-    """Prints all TensorTracer flags passed via environment variables."""
-
-    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
-    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
-                                                   tensor_tracer_flags)
-    result += 'Individual flag value:\n'
-    pos = 0
-    while True:
-      match, has_value = TensorTracer._match_next_flag(
-          tensor_tracer_flags, pos)
-      if not match:
-        break
-      flag_name = match.group(1)
-      if has_value:
-        flag_value = match.group(2)
-      else:
-        flag_value = None
-      result += '  %s: %s\n'%(flag_name, flag_value)
-      pos = match.end()
-    result += '\n'
-    return result
-
-  @staticmethod
-  def get_flag_value(wanted_flag_name):
-    """Returns the value of a TensorTracer flags.
-
-    Args:
-      wanted_flag_name: the name the the flag we are looking for.
-
-    Returns:
-      A pair where the first element indicates if the flag is
-      found and the second element is the value of the flag.
-
-    Raises:
-      RuntimeError: If supposedly deadcode is reached.
-    """
-
-    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return False, None
-    pos = 0
-    while True:
-      match, has_value = TensorTracer._match_next_flag(
-          tensor_tracer_flags, pos)
-      if not match:
-        return False, None
-      flag_name = match.group(1)
-      if has_value:
-        flag_value = match.group(2)
-      else:
-        flag_value = None
-      if flag_name == wanted_flag_name:
-        return True, flag_value
-      pos = match.end()
-    raise RuntimeError('Should not reach here.')
-
-  @staticmethod
-  def flag_value_to_re_list(flag_name):
-    """Converts list of strings to compiled RE."""
-
-    re_list = []
-    found, flag_value = TensorTracer.get_flag_value(flag_name)
-    if not found or not flag_value:
-      return re_list
-    list_of_values = flag_value.split()
-    for v in list_of_values:
-      r = re.compile(v)
-      re_list.append(r)
-    return re_list
-
-  @staticmethod
-  def _is_flag_on(flag_name):
-    """Returns True if the given flag is on."""
-
-    found, flag_value = TensorTracer.get_flag_value(flag_name)
-    if not found:
-      return False
-    if flag_value is None:
-      return True
-    # Depends on the flag value.
-    flag_value = flag_value.lower()
-    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
-    return enabled
-
-  @staticmethod
-  def is_enabled():
-    """Returns True if TensorTracer is enabled."""
-
-    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
-
-  @staticmethod
-  def use_test_undeclared_outputs_dir():
-    """Decides the output directory of the report and trace files.
-
-    Args:
-       None.
-
-    Returns:
-       True if the output files should be written to the
-       test-undeclared-outputs-directory defined via an
-       env variable.
-    """
-
-    return TensorTracer._is_flag_on(
-        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
-
-
-  @staticmethod
-  def check_device_type(device_type):
-    """Checks if the given device type is valid."""
-
-    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
-      raise ValueError('Invalid device_type "%s"'%device_type)
-
-  @staticmethod
-  def check_trace_mode(trace_mode):
-    """Checks if the given trace mode is valid."""
-
-    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
-                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
-                         _TRACE_MODE_MAX_ABS]
-    if trace_mode not in valid_trace_modes:
-      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
-                       'Valid trace modes are: %s'%(trace_mode,
-                                                    valid_trace_modes))
-
-  @staticmethod
-  def check_submode(submode):
-    """Checks if the given submode is valid."""
-
-    if not submode:
-      return
-    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
-    if submode not in valid_submodes:
-      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
-                       'Valid submodes are: %s'%(submode,
-                                                 valid_submodes))
-
-  @staticmethod
-  def unsafe_op(op):
-    """Returns True if this op is not safe to be traced."""
-
-    if control_flow_util.IsInCond(op):
-      return True
-    # Reasons for not including following op types:
-    #    Assign: cause incorrect result with CPU tracing.
-    if op.type in ['Assign']:
-      return True
-    return False
-
-  @staticmethod
-  def device_mismatch(device_type, op):
-    if device_type == _DEVICE_TYPE_TPU:
-      # pylint: disable=protected-access
-      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
-      # pylint: enable=protected-access
-    return False
-
-  @staticmethod
-  def unsafe_scalar_trace(op):
-    """Return true if scalar output tensor from Op is not safe to be traced."""
-
-    # Tracing the following causes cycle in the graph on TPU.
-    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
-                   'Switch', 'Less', 'ReadVariableOp']:
-      return True
-    # Tracing the following will cause casting-issue
-    # with the norm tracing mode or other compilation issues on CPU.
-    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
-                   'IteratorGetNext', 'OneShotIterator',
-                   'IteratorV2', 'MakeIterator',
-                   'BatchDatasetV2', 'MapDataset',
-                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
-                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
-      return True
-    return False
-
-  @staticmethod
-  def less_interesting_op(op):
-    """Returns True if the given Op is not an interesting one to be traced."""
-
-    found, _ = TensorTracer.get_flag_value(
-        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
-    if found:
-      # users force to include all ops.
-      return False
-    # Following ops are highly unlikey to cause bugs.
-    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
-
-  @staticmethod
-  def reason(op_idx, details):
-    """Returns reason why the Op at op_idx is traced or not."""
-
-    return '%d %s'%(op_idx, details)
-
-  @staticmethod
-  def topological_sort(g):
-    """Performs topological sort on the given graph.
-
-    Args:
-       g: the graph.
-
-    Returns:
-       A pair where the first element indicates if the topological
-       sort succeeded (True if there is no cycle found; False if a
-       cycle is found) and the second element is either the sorted
-       list of nodes or the cycle of nodes found.
-    """
-
-    def visit(op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops):
-      """Recursively visits all Ops in a graph.
-
-      Args:
-         op: the current Op being visited.
-         cycle: a cycle of Ops found.
-         permanently_marked_ops: the set of Ops that were already visited.
-         temporarily_marked_ops: the set of Ops that we have visited during
-                                 the current descent.
-         sorted_ops: the list of Ops sorted in topological order.
-      """
-
-      if cycle:
-        return
-      if op in permanently_marked_ops:
-        return
-      if op in temporarily_marked_ops:
-        cycle = temporarily_marked_ops
-        return
-      temporarily_marked_ops.add(op)
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        for consumer_op in out_tensor.consumers():
-          visit(consumer_op, cycle, permanently_marked_ops,
-                temporarily_marked_ops, sorted_ops)
-      # pylint: disable=protected-access
-      for ctrl_output_op in op._control_outputs:
-        # pylint: enable=protected-access
-        visit(ctrl_output_op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops)
-      temporarily_marked_ops.remove(op)
-      permanently_marked_ops.add(op)
-      sorted_ops.insert(0, op)
-
-    graph_cycle = set([])
-    sorted_ops = []
-    permanently_marked_ops = set([])
-    temporarily_marked_ops = set([])
-    unsorted_ops = g.get_operations()
-    for op in unsorted_ops:
-      visit(op, graph_cycle, permanently_marked_ops,
-            temporarily_marked_ops, sorted_ops)
-    if graph_cycle:
-      return (False, graph_cycle)
-    else:
-      assert len(unsorted_ops) == len(sorted_ops)
-      return (True, sorted_ops)
-
-  @staticmethod
-  def _make_op_and_tensor_maps(op_list):
-    """Creates various maps and lists from op_list.
-
-    Args:
-       op_list: a list of Ops
-
-    Returns:
-       opname_idx_map: a map from Op's name to its index in op_list.
-       tensor_list: a list of output tensors of the Ops in op_list.
-       tensorname_idx_map: a map from output tensor name to its index
-                           in tensor_list.
-    """
-
-    opname_idx_map = {}
-    tensor_list = []
-    tensorname_idx_map = {}
-    for op_id, op in enumerate(op_list):
-      if op.name in opname_idx_map:
-        raise ValueError('Duplicated Op name: %s'%op.name)
-      opname_idx_map[op.name] = op_id
-      for output_tensor in op.outputs:
-        if output_tensor.name not in tensorname_idx_map:
-          tensor_list.append(output_tensor)
-          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
-    return (opname_idx_map, tensor_list, tensorname_idx_map)
-
-  def __init__(self):
-    """Initializes a TensorTracer.
-
-    Sets the various member fields from the flags (if given) or the defaults.
-    """
-    self._version = 'use-outside-compilation'
-    self._device_type = None
-    TensorTracer.validate_flag_names()
-    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
-    if not found or not self._trace_mode:
-      self._trace_mode = _TRACE_MODE_NAN_INF
-    TensorTracer.check_trace_mode(self._trace_mode)
-    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
-    if not found or not self._submode:
-      self._submode = _SUBMODE_DETAILED
-    TensorTracer.check_submode(self._submode)
-    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
-    self._instrument_records = {}
-    self._set_trace_file_path()
-    self._set_report_file()
-    self._set_op_range()
-    self._set_excluded_opnames()
-    self._set_excluded_optypes()
-    self._set_included_opnames()
-    self._set_included_optypes()
-    self._num_replicas = None
-    self._replica_id = None
-
-  def _add_replica_id_to_graph(self, num_replicas, result_tensor):
-    """Adds nodes for computing the replica ID to the graph."""
-
-    if not num_replicas:
-      self._replica_id = 'unknown'
-      return result_tensor
-
-    self._num_replicas = num_replicas
-
-    with ops.control_dependencies(None):
-      # Uses None as dependency to run outside of TPU graph rewrites.
-      self._replica_id = tpu_ops.tpu_replicated_input(
-          list(range(self._num_replicas)),
-          name='tt_replica_id')
-    use_replica_id = array_ops.identity(self._replica_id).op
-    with ops.control_dependencies([use_replica_id]):
-      # Adds a control dependency from the result_tensor to
-      # the replica_id to ensure that replica_id will be added to the graph.
-      return array_ops.identity(result_tensor)
-
-  def _set_trace_file_path(self):
-    """Sets the path of the output trace file."""
-
-    found, self._trace_file_path = TensorTracer.get_flag_value(
-        _FLAG_NAME_TRACE_FILE)
-    if found and self._trace_file_path \
-       and TensorTracer.use_test_undeclared_outputs_dir():
-      if os.path.isabs(self._trace_file_path):
-        raise ValueError('If use_test_undeclared_outputs_dir is set,'
-                         'trace_file_path cannot be an absolute path (%s)'
-                         %self._trace_file_path)
-      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
-      self._trace_file_path = os.path.join(outputs_dir,
-                                           self._trace_file_path)
-
-  def _set_report_file(self):
-    """Sets the path of the output report file."""
-
-    found, self._report_file_path = TensorTracer.get_flag_value(
-        _FLAG_NAME_REPORT_FILE)
-    if found and self._report_file_path \
-       and TensorTracer.use_test_undeclared_outputs_dir():
-      if os.path.isabs(self._report_file_path):
-        raise ValueError('If use_test_undeclared_outputs_dir is set,'
-                         'report_file_path cannot be an absolute path (%s)'
-                         %self._report_file_path)
-      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
-      self._report_file_path = os.path.join(outputs_dir,
-                                            self._report_file_path)
-    if not self._report_file_path:
-      self._report_file = None
-      return
-    try:
-      self._report_file = gfile.Open(self._report_file_path, 'w')
-    except IOError as e:
-      raise e
-
-  def _close_report_file(self):
-    if self._report_file:
-      self._report_file.close()
-
-  def _set_op_range(self):
-    """Sets the index range of the Ops that we will consider tracing."""
-
-    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
-    if not found or not op_range:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    match = _OP_RANGE_PAT.match(op_range)
-    if not match:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    self._op_range = (int(match.group(1)), int(match.group(2)))
-
-  def _inside_op_range(self, idx):
-    """Return True if the given index is inside the selected range."""
-
-    if idx < self._op_range[0]:
-      return False
-    return self._op_range[1] < 0 or idx <= self._op_range[1]
-
-  def _set_excluded_opnames(self):
-    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_EXCLUDED_OPNAMES)
-
-  def _set_excluded_optypes(self):
-    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_EXCLUDED_OPTYPES)
-
-  def _set_included_opnames(self):
-    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_INCLUDED_OPNAMES)
-
-  def _set_included_optypes(self):
-    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
-        _FLAG_NAME_INCLUDED_OPTYPES)
-
-  def _is_user_included_op(self, op):
-    for opname_re in self._included_opname_re_list:
-      if opname_re.match(op.name):
-        return True
-    for optype_re in self._included_optype_re_list:
-      if optype_re.match(op.type):
-        return True
-    return False
-
-  def _is_user_excluded_op(self, op):
-    for opname_re in self._excluded_opname_re_list:
-      if opname_re.match(op.name):
-        return True
-    for optype_re in self._excluded_optype_re_list:
-      if optype_re.match(op.type):
-        return True
-    return False
-
-  def _write_report(self, content):
-    """Writes the given content to the report."""
-
-    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
-    if self._report_file:
-      self._report_file.write(line)
-    else:
-      logging.info(line)
-
-  def _write_config_section(self):
-    """Writes the config section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
-    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
-    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
-    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
-    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
-
-  def _write_reason_section(self):
-    """Writes the reason section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
-    for key in sorted(self._instrument_records):
-      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
-
-  def _write_op_list_section(self, op_list):
-    """Writes the Op-list section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
-    for i in range(0, len(op_list)):
-      op = op_list[i]
-      line = '%d "%s" %s'%(i, op.name, op.type)
-      for out_tensor in op.outputs:
-        if out_tensor.name not in self._tensorname_idx_map:
-          raise ValueError(
-              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
-        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
-      line += '\n'
-      self._write_report(line)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
-
-  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
-    """Writes the tensor-list section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
-                                  _SECTION_NAME_TENSOR_LIST))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
-    for i in range(0, len(tensor_list)):
-      tensor = tensor_list[i]
-      line = '%d "%s"'%(i, tensor.name)
-      for consumer_op in tensor.consumers():
-        if consumer_op.name not in opname_idx_map:
-          raise ValueError(
-              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
-        line += ' %d'%opname_idx_map[consumer_op.name]
-      line += '\n'
-      self._write_report(line)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
-                                  _SECTION_NAME_TENSOR_LIST))
-
-  def _write_graph_section(self, succeed, sorted_or_cycle):
-    """Writes the graph section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
-                                  succeed))
-    l = list(sorted_or_cycle)
-    for i in range(0, len(l)):
-      self._write_report('%d "%s"\n'%(i, l[i].name))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
-
-  def _preprocess_traced_tensor(self, tensor):
-    """Computes NAN/Norm/Max on TPUs before sending to CPU.
-
-    Args:
-      tensor: The tensor to be traced.
-    Returns:
-      A tensor that should be input to the trace_function.
-    Raises:
-      RuntimeError: If the trace mode is invalid.
-    """
-
-    def _detect_nan_inf(tensor):
-      """Trace function for detecting any NaN/Inf in the tensor."""
-
-      if tensor.dtype.is_floating:
-        output_tensor = math_ops.reduce_any(
-            gen_math_ops.logical_or(
-                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
-      else:
-        output_tensor = constant_op.constant(False)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    def _show_norm(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float32)
-      output_tensor = linalg_ops.norm(tensor)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    def _show_max_abs(tensor):
-      tensor = math_ops.cast(tensor, dtypes.float32)
-      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
-      zero = constant_op.constant(0, dtypes.float32)
-      output_tensor = gen_math_ops.maximum(zero, output_tensor)
-      # The shape has to be 1. Set it if it does not have the information.
-      output_tensor = array_ops.reshape(output_tensor, [1])
-      return output_tensor
-
-    if self._trace_mode == _TRACE_MODE_NAN_INF:
-      return _detect_nan_inf(tensor)
-    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
-      return tensor
-    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
-      return tensor
-    if self._trace_mode == _TRACE_MODE_NORM:
-      return _show_norm(tensor)
-    if self._trace_mode == _TRACE_MODE_MAX_ABS:
-      return _show_max_abs(tensor)
-    raise RuntimeError(
-        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
-
-  def _make_tensor_trace_fun(self, tensor_name):
-    """Makes the tensor tracing function called by outside compilation.
-
-    Args:
-      tensor_name: name of the tensor being traced.
-
-    Returns:
-      A function to be passed as the first argument to outside compilation.
-
-    Raises:
-      RuntimeError: If the trace mode is invalid.
-    """
-
-    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
-      """Prints a tensor value to a file.
-
-      Args:
-        tensor_name: name of the tensor being traced.
-        num_elements: number of elements to print (-1 means print all).
-        tensor: the tensor needs to be returned.
-        output_tensor: the tensor needs to be printed.
-
-      Returns:
-        The same tensor passed via the "tensor" argument.
-
-      Raises:
-        ValueError: If tensor_name is not already in
-                    self._tensorname_idx_map.
-      """
-
-      if self._submode == _SUBMODE_BRIEF:
-        if tensor_name not in self._tensorname_idx_map:
-          raise ValueError(
-              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
-        msg = '%d'%self._tensorname_idx_map[tensor_name]
-      else:
-        msg = '"%s"'%tensor_name
-
-      if self._trace_file_path:
-        output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
-      else:
-        output_stream = sys.stderr
-      print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
-                                      '@', self._replica_id,
-                                      '\n', output_tensor, '\n',
-                                      summarize=num_elements,
-                                      output_stream=output_stream)
-      with ops.control_dependencies([print_op]):
-        return array_ops.identity(tensor).op
-
-
-    def _show_part_tensor(tensor):
-      """Trace function for printing part of the tensor."""
-
-      return _print_tensor(tensor_name, self._part_tensor_size,
-                           tensor, tensor)
-
-    def _show_full_tensor(tensor):
-      """Trace function for printing the entire tensor."""
-
-      return _print_tensor(tensor_name, -1, tensor, tensor)
-
-    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
-      return _show_part_tensor
-    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
-    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
-    # performed within TPUs and only their results are transferred to CPU.
-    # Simply, print the full tensor for these trace modes.
-    if self._trace_mode in [
-        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
-        _TRACE_MODE_MAX_ABS
-    ]:
-      return _show_full_tensor
-
-    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
-                       %self._trace_mode)
-
-  def _skip_op(self, op_id, op, user_included, user_excluded,
-               in_exec_path=True):
-    """Returns True if we should not trace Op."""
-
-    if user_included:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_USER_INCLUDED)
-      return False
-    if user_excluded:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_USER_EXCLUDED)
-      return True
-    if not in_exec_path:
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_NOT_EXECUTED)
-      return True
-    if not self._inside_op_range(op_id):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_OUTSIDE_OP_RANGE)
-      return True
-    if TensorTracer.unsafe_op(op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_UNSAFE_OP)
-      return True
-    if TensorTracer.device_mismatch(self._device_type, op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_DEVICE_MISMATCH)
-      return True
-    if TensorTracer.less_interesting_op(op):
-      self._instrument_records[op.name] = TensorTracer.reason(
-          op_id, _REASON_LESS_INTERESTING_OP)
-      return True
-    return False
-
-  def _skip_tensor(self, op_id, out_tensor, user_included,
-                   user_excluded):
-    """Returns True if we should not trace out_tensor."""
-
-    # Skips a tensor if the tensor has a non-numeric type.
-    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
-    #         because it also excludes tensors with dtypes, bool, and
-    #         float32_ref, which we actually want to trace.
-    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
-                                    dtypes.string])
-    if out_tensor.dtype in non_numeric_tensor_types:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_NON_NUMERIC_TENSOR)
-      return True
-
-    if user_included:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_USER_INCLUDED)
-      return False
-    if user_excluded:
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_USER_EXCLUDED)
-      return True
-    if not out_tensor.get_shape().is_fully_defined():
-      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
-      # to a scalar before the outside compilation call.
-      if self._trace_mode in [
-          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
-      ]:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_TENSOR_GET_TRACED)
-        return False
-      else:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_DYNAMIC_SHAPE)
-        return True
-    rank = len(out_tensor.shape)
-    if rank < 1:
-      # scalar
-      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_UNSAFE_SCALAR)
-        return True
-      else:
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _REASON_SCALAR_GET_TRACED)
-        return False
-    else:
-      # tensor
-      self._instrument_records[out_tensor.name] = TensorTracer.reason(
-          op_id, _REASON_TENSOR_GET_TRACED)
-      return False
-
-  def _filter_execution_path_operations(self, operations, fetches):
-    """Returns the set of ops in the execution path to compute given fetches."""
-    # If no fetch provided, then return all operations.
-    if fetches is None:
-      return set(operations)
-    # Convert to list, if a single element is provided.
-    if not isinstance(fetches, (list, tuple)):
-      fetches = [fetches]
-    # If a tensor is given as fetch, convert it to op.
-    op_fetches = []
-    for fetch in fetches:
-      if isinstance(fetch, ops.Operation):
-        op_fetches.append(fetch)
-      elif isinstance(fetch, ops.Tensor):
-        op_fetches.append(fetch.op)
-      else:
-        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
-                           %fetch)
-
-    execution_path_operations = set(op_fetches)
-    traverse_stack = list(op_fetches)
-    while True:
-      if not traverse_stack:
-        break
-      head_op = traverse_stack.pop()
-      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
-      input_ops.extend(head_op.control_inputs)
-
-      for input_op in input_ops:
-        if input_op not in execution_path_operations:
-          execution_path_operations.add(input_op)
-          traverse_stack.append(input_op)
-    return execution_path_operations
-
-  def _pre_tracing(self, graph):
-    """Work needs to be done prior to TPU or CPU tracing."""
-
-    operations = graph.get_operations()
-    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
-        TensorTracer._make_op_and_tensor_maps(operations))
-    self._write_config_section()
-    self._write_op_list_section(operations)
-    self._write_tensor_list_section(tensor_list, opname_idx_map)
-    # Does the topological sort before adding any nodes to the graph.
-    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
-    return (operations, succeed, sorted_or_cycle)
-
-  def _post_tracing(self, succeed, sorted_or_cycle):
-    """Work needs to be done after TPU or CPU tracing."""
-
-    self._write_reason_section()
-    self._write_graph_section(succeed, sorted_or_cycle)
-    self._close_report_file()
-
-  def _get_checkpoints(self, graph):
-    """Returns the list of Ops that produce the tensors traced with API.
-
-    Args:
-      graph: the graph of Ops.
-
-    Returns:
-      A set of operation names which should be traced.
-    """
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
-                                  _TENSOR_TRACER_CHECKPOINT))
-    checkpoint_operations = set()
-    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
-    for (tensor, checkpoint_name) in tensor_tracer_variables:
-      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
-      checkpoint_operations.add(tensor.op.name)
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
-                                  _TENSOR_TRACER_CHECKPOINT))
-    return checkpoint_operations
-
-  def trace_tpu(self, graph, result_tensor, num_replicas=None, fetches=None):
-    """Traces the tensors generated by TPU Ops in a TF graph.
-
-    Args:
-      graph: the graph of Ops executed on the TPU.
-      result_tensor: a result tensor of evaluating the graph.
-      num_replicas: number of replicas used on the TPU.
-      fetches: the list of fetches given to session.run, used to determine the
-      ops in execution path. If None, the whole graph will be traced.
-
-    Returns:
-      A tuple (result_tensor_copy, tracing_ops), where:
-        result_tensor_copy: an exact copy of result_tensor
-        tracing_ops: a list of tracing ops. If this list
-                     is non empty, the caller of this function
-                     should pose control dependencies upon these
-                     Ops so that they will be executed when the
-                     graph is evaluated.
-    """
-
-    def _cast_unsupported_dtypes(tensor):
-      """Casts tensor to a supported type."""
-
-      if tensor.dtype.__eq__(dtypes.int64):
-        # outside-compilation doesn't support int64 input yet.
-        return math_ops.cast(tensor, dtypes.int32)
-      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
-          dtypes.float16):
-        # Since host can't handle bf16, convert tensor to f32.
-        return math_ops.cast(tensor, dtypes.float32)
-      return tensor
-
-    self._device_type = _DEVICE_TYPE_TPU
-    TensorTracer.check_device_type(self._device_type)
-    result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
-                                                       result_tensor)
-    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
-    # Filter out the operations that won't be executed.
-    # if fetches=None, then ops_in_exec_path = set(operations)
-    ops_in_exec_path = self._filter_execution_path_operations(operations,
-                                                              fetches)
-    tracing_ops = []
-    checkpoint_operations = self._get_checkpoints(graph)
-
-    for op_id, op in enumerate(operations):
-      if checkpoint_operations and op.name not in checkpoint_operations:
-        continue
-      user_included = self._is_user_included_op(op)
-      user_excluded = self._is_user_excluded_op(op)
-      in_exec_path = op in ops_in_exec_path
-      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
-        continue
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        if self._skip_tensor(op_id, out_tensor, user_included,
-                             user_excluded):
-          continue
-        # Create the list of consumers before calling _preprocess_traced_tensor.
-        # Otherwise, adding control input below, will introduce a cycle in the
-        # graph.
-        consumers = out_tensor.consumers()
-        tensor_name = out_tensor.name
-        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
-        processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
-        trace_op = tpu.outside_compilation(
-            self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
-        if consumers:
-          for consumer_op in consumers:
-            # pylint: disable=protected-access
-            consumer_op._add_control_input(trace_op)
-            # pylint: enable=protected-access
-        else:
-          # if there is no consumer, we will add the control dependence later
-          # when we add the control dependency to the output operations.
-          tracing_ops.append(trace_op)
-    self._post_tracing(succeed, sorted_or_cycle)
-    return (result_tensor_copy, tracing_ops)
-
-  def trace_cpu(self, graph):
-    """Traces the tensors generated by CPU Ops in a TF graph.
-
-    Args:
-      graph: the graph of Ops executed on the CPU.
-
-    Returns:
-      tracing_calls: a map from keys to trace calls.
-                     A key is constructed from an Op's name.
-                     A trace call consists of a function and a tensor (
-                     the function will be invoked with the tensor).
-    """
-
-    self._device_type = _DEVICE_TYPE_CPU
-    TensorTracer.check_device_type(self._device_type)
-    self._num_replicas = 1
-    self._replica_id = 0
-    (operations, succeed, sorted_or_cycle) = self._pre_tracing(graph)
-    tracing_calls = {}
-    checkpoint_operations = self._get_checkpoints(graph)
-
-    for op_id, op in enumerate(operations):
-      if checkpoint_operations and op.name not in checkpoint_operations:
-        continue
-      user_included = self._is_user_included_op(op)
-      user_excluded = self._is_user_excluded_op(op)
-      if self._skip_op(op_id, op, user_included, user_excluded):
-        continue
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        if self._skip_tensor(op_id, out_tensor, user_included,
-                             user_excluded):
-          continue
-        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
-        trace_fun = self._make_tensor_trace_fun(out_tensor.name)
-        trace_call = (trace_fun, [processed_out_tensor])
-        trace_call_key = 'tensor_tracing_cpu-%s:%d'%(op.name, i)
-        tracing_calls[trace_call_key] = trace_call
-    self._post_tracing(succeed, sorted_or_cycle)
-    return tracing_calls
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tensor_tracer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
index 6ae718cc2c9716587849aeee8abcd0a1de82a9ae..5bf805752cf51b0a0f4b7400b18b63aae93cf831 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -1,220 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Defines the `Topology` class, that describes a TPU fabric topology."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.proto import topology_pb2
-
-
-def _tpu_device_name(job, task, device):
-  """Returns the device name for the TPU `device` on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:TPU:%d" % (task, device)
-  else:
-    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
-
-
-def _tpu_host_device_name(job, task):
-  """Returns the device name for the CPU device on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:CPU:0" % task
-  else:
-    return "/job:%s/task:%d/device:CPU:0" % (job, task)
-
-
-class Topology(object):
-  """Describes a set of TPU devices.
-
-  Represents both the shape of the physical mesh, and the mapping between
-  TensorFlow TPU devices to physical mesh coordinates.
-  """
-
-  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
-    """Builds a Topology object.
-
-    If `serialized` is not `None`, the topology is parsed from `serialized` and
-    the other arguments are ignored. Otherwise, the topology is computed from
-    `mesh_shape` and `device_coordinates`.
-
-    Args:
-      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
-        serialized proto is parsed to discover the topology.
-      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
-        the shape of the TPU topology, in number of cores. Ignored if
-        `serialized` is not `None`.
-      device_coordinates: A rank 3 numpy array that describes the mapping from
-        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
-        if `serialized is not `None`.
-
-    Raises:
-      ValueError: If `serialized` does not describe a well-formed topology.
-      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
-        of 3 positive integers.
-      ValueError: If `serialized` is `None` and `device_coordinates` is not a
-        rank 3 numpy int32 array that describes a valid coordinate mapping.
-    """
-
-    self._serialized = serialized
-
-    if serialized:
-      self._parse_topology(serialized)
-    else:
-      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
-      self._device_coordinates = np.asarray(device_coordinates, np.int32)
-      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
-                         "entries; got {}".format(self._mesh_shape))
-
-      if (len(self._device_coordinates.shape) != 3 or
-          self._device_coordinates.shape[2] != len(self._mesh_shape)):
-        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
-                         "with minor dimension equal to the mesh shape rank")
-
-    self._topology_tasks, self._topology_devices = self._invert_topology()
-
-  def _parse_topology(self, serialized):
-    """Parses a serialized `TopologyProto` into `self`."""
-    proto = topology_pb2.TopologyProto()
-    proto.ParseFromString(serialized)
-
-    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
-    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
-                       "entries; got {}".format(self._mesh_shape))
-
-    if proto.num_tasks < 0:
-      raise ValueError("`num_tasks` must be >= 0; got {}".format(
-          proto.num_tasks))
-    if proto.num_tpu_devices_per_task < 0:
-      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
-          proto.num_tpu_devices_per_task))
-
-    expected_coordinates_size = (
-        proto.num_tasks * proto.num_tpu_devices_per_task * len(
-            proto.mesh_shape))
-    if len(proto.device_coordinates) != expected_coordinates_size:
-      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
-                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
-                       "got shape {}".format(proto.num_tasks,
-                                             proto.num_tpu_devices_per_task,
-                                             proto.mesh_shape,
-                                             len(proto.device_coordinates)))
-
-    coords = np.array(proto.device_coordinates, dtype=np.int32)
-    if any(coords < 0):
-      raise ValueError("`device_coordinates` must be >= 0")
-    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
-                             len(proto.mesh_shape)))
-    self._device_coordinates = coords
-
-  def _invert_topology(self):
-    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
-    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    for task in xrange(self.device_coordinates.shape[0]):
-      for device in xrange(self.device_coordinates.shape[1]):
-        x, y, z = self.device_coordinates[task, device, :]
-        tasks[x, y, z] = task
-        devices[x, y, z] = device
-    return tasks, devices
-
-  @property
-  def mesh_shape(self):
-    """A rank 1 int32 array describing the shape of the TPU topology."""
-    return self._mesh_shape
-
-  @property
-  def mesh_rank(self):
-    """Returns the number of dimensions in the mesh."""
-    return len(self._mesh_shape)
-
-  @property
-  def device_coordinates(self):
-    """Describes the mapping from TPU devices to topology coordinates.
-
-    Returns:
-      A rank 3 int32 array with shape `[tasks, devices, axis]`.
-      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
-      of TPU devices per task, and `axis` is the number of axes in the TPU
-      cluster topology. Each entry gives the `axis`-th coordinate in the
-      topology of a task/device pair. TPU topologies are 3-dimensional, with
-      dimensions `(x, y, core number)`.
-    """
-    return self._device_coordinates
-
-  def task_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow task number attached to `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow task number that contains the TPU device with those
-      physical coordinates.
-    """
-    return self._topology_tasks[tuple(device_coordinates)]
-
-  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow device number at `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow device number within the task corresponding to
-      attached to the device with those physical coordinates.
-    """
-    return self._topology_devices[tuple(device_coordinates)]
-
-  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the CPU device attached to a logical core."""
-    return _tpu_host_device_name(
-        job, self._topology_tasks[tuple(device_coordinates)])
-
-  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    return _tpu_device_name(job,
-                            self._topology_tasks[tuple(device_coordinates)],
-                            self._topology_devices[tuple(device_coordinates)])
-
-  @property
-  def num_tasks(self):
-    """Returns the number of TensorFlow tasks in the TPU slice."""
-    return self._device_coordinates.shape[0]
-
-  @property
-  def num_tpus_per_task(self):
-    """Returns the number of TPU devices per task in the TPU slice."""
-    return self._device_coordinates.shape[1]
-
-  def serialized(self):
-    """Returns the serialized form of the topology."""
-    if self._serialized is None:
-      proto = topology_pb2.TopologyProto()
-      proto.mesh_shape[:] = list(self._mesh_shape)
-      proto.num_tasks = self._device_coordinates.shape[0]
-      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
-      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
-      self._serialized = proto.SerializeToString()
-
-    return self._serialized
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.topology import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index b04baebfe6150c7f10c16e237de884ab788f2642..5364b20f231ac7af8adf943c3d5e21921b7a06a9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1,1392 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.compat import compat as api_compat
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-from tensorflow.python.util import nest
-
-
-# Operations that indicate some error in the users graph, e.g. a placeholder
-# that's introduced outside of the infeed.
-_BLACKLISTED_OPS = set([
-    "Placeholder",
-])
-
-# XLA doesn't currently support reading of intermediate tensors, thus some ops
-# are not supported.
-_UNSUPPORTED_OPS = set([
-    "AudioSummary",
-    "AudioSummaryV2",
-    "HistogramSummary",
-    "ImageSummary",
-    "MergeSummary",
-    "Print",
-    "ScalarSummary",
-    "TensorSummary",
-    "TensorSummaryV2",
-    ])
-
-_MAX_WARNING_LINES = 5
-
-_TPU_REPLICATE_ATTR = "_tpu_replicate"
-_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
-_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
-
-
-def _tpu_system_device_name(job):
-  """Returns the device name for the TPU_SYSTEM device of `job`."""
-  if job is None:
-    return "/device:TPU_SYSTEM:0"
-  else:
-    return "/job:%s/device:TPU_SYSTEM:0" % job
-
-
-def initialize_system(embedding_config=None, job=None):
-  """Initializes a distributed TPU system for use with TensorFlow.
-
-  Args:
-    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
-      describing the desired configuration of the hardware embedding lookup
-      tables. If embedding_config is None, no hardware embeddings can be used.
-    job: The job (the XXX in TensorFlow device specification /job:XXX) that
-      contains the TPU devices that will be initialized. If job=None it is
-      assumed there is only one job in the TensorFlow flock, and an error will
-      be returned if this assumption does not hold.
-  Returns:
-    A serialized `TopologyProto` that describes the TPU system. Note:
-      the topology must be evaluated using `Session.run` before it can be used.
-  """
-  config_string = ("" if embedding_config is None else
-                   embedding_config.SerializeToString())
-  with ops.device(_tpu_system_device_name(job)):
-    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
-
-
-def shutdown_system(job=None):
-  """Shuts down a running a distributed TPU system."""
-  with ops.device(_tpu_system_device_name(job)):
-    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
-  return shutdown_distributed_tpu
-
-
-def core(num):
-  """Returns the device name for a core in a replicated TPU computation.
-
-  Args:
-    num: the virtual core number within each replica to which operators should
-    be assigned.
-  Returns:
-    A device name, suitable for passing to `tf.device()`.
-  """
-  return "device:TPU_REPLICATED_CORE:{}".format(num)
-
-
-class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU computation.
-
-  The primary role of `TPUReplicateContext` is to mark operators inside a
-  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
-  is a unique name.
-
-  We use a `ControlFlowContext` to perform the annotation since it integrates
-  with Tensorflow constructs like ResourceVariables. For example, if a
-  `ResourceVariable` is constructed inside a tpu.replicate() block, the
-  `ResourceVariable` implementation can use
-  `with ops.control_dependencies(None)` to build the variable's definition
-  outside the replicated computation.
-  """
-
-  def __init__(self, name, num_replicas, pivot):
-    """Builds a new TPUReplicateContext.
-
-    Args:
-      name: a unique name for the context, used to populate the `_tpu_replicate`
-        attribute.
-      num_replicas: an integer that gives the number of replicas for the
-        computation.
-      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
-        inputs will have a control dependency on the pivot node. This ensures
-        that nodes are correctly included in any enclosing control flow
-        contexts.
-    """
-    super(TPUReplicateContext, self).__init__()
-    self._num_replicas = num_replicas
-    self._outer_device_function_stack = None
-    self._oc_dev_fn_stack = None
-    self._outside_compilation_cluster = None
-    self._outside_compilation_counter = 0
-    self._in_gradient_colocation = None
-    self._gradient_colocation_stack = []
-    self._host_compute_core = []
-    self._name = name
-    self._name_as_bytes = compat.as_bytes(name)
-    self._unsupported_ops = []
-    self._pivot = pivot
-    self._replicated_vars = {}
-
-  def get_replicated_var_handle(self, name, vars_):
-    """Returns a variable handle for replicated TPU variable 'var'.
-
-    This is a method used by an experimental replicated variable implementation
-    and is not intended as a public API.
-
-    Args:
-      name: The common name of the variable.
-      vars_: The replicated TPU variables.
-
-    Returns:
-      The handle of the TPU replicated input node.
-    """
-    handle = self._replicated_vars.get(name)
-    if handle is not None:
-      return handle
-
-    # Builds a TPUReplicatedInput node for the variable, if one does not already
-    # exist. The TPUReplicatedInput node must belong to the enclosing
-    # control-flow scope of the TPUReplicateContext.
-    # TODO(phawkins): consider changing the contract of the TPU encapsulation
-    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
-    # instead.
-
-    # pylint: disable=protected-access
-    graph = ops.get_default_graph()
-    saved_context = graph._get_control_flow_context()
-    graph._set_control_flow_context(self.outer_context)
-    handle = tpu_ops.tpu_replicated_input(
-        [v.handle for v in vars_], name=name + "/handle")
-    graph._set_control_flow_context(saved_context)
-    # pylint: enable=protected-access
-    self._replicated_vars[name] = handle
-    return handle
-
-  def report_unsupported_operations(self):
-    if self._unsupported_ops:
-      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
-                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
-      logging.warning("%d unsupported operations found: \n%s",
-                      len(self._unsupported_ops), op_str)
-      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
-        logging.warning("... and %d more" %
-                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
-
-  def EnterGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      self._gradient_colocation_stack.append(op)
-      if not self._outside_compilation_cluster:
-        try:
-          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
-          if self._in_gradient_colocation:
-            raise NotImplementedError(
-                "Cannot nest gradient colocation operations outside compilation"
-            )
-          if gradient_uid == "__unsupported__":
-            raise NotImplementedError(
-                "No gradient_uid calling gradient within outside_compilation")
-          # When we take the gradient of an op X in an outside_compilation
-          # cluster C in a forward computation we would like to put the ops
-          # corresponding to the gradient of X into a new outside_compilation
-          # cluster C'. However, if we take the gradient of X twice, the second
-          # one should get yet another new outside_compilation cluster C''.
-          #
-          # The mechanism we adopt is to use a 'root_cluster' which is the
-          # cluster that X was in before we took gradients, and a 'gradient_uid'
-          # which is different for every invocation of gradients, and put the
-          # gradient of X in cluster 'root_cluster.gradient_uid'.
-          #
-          # When taking a gradient of a gradient, some ops will be colocated
-          # with Op in the forward pass (e.g., cluster root_cluster) and some in
-          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
-          # We need all of the grad-of-grad ops to be in the same cluster to
-          # avoid cyclic dependencies between clusters. We adopt a heuristic
-          # that puts any op clustered with root_cluster.<xxx> in
-          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
-          self._in_gradient_colocation = op
-          parts = outside_attr.split(".")
-          cluster = parts[0] + "." + gradient_uid
-          self._EnterOutsideCompilationScope(cluster=cluster)
-        except ValueError:
-          # The attr was not present: do nothing.
-          pass
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      if not self._gradient_colocation_stack:
-        raise errors.InternalError(
-            op.node_def, op,
-            "Badly nested gradient colocation: empty stack when popping Op " +
-            op.name)
-      last_op = self._gradient_colocation_stack.pop()
-      if op is last_op:
-        if op is self._in_gradient_colocation:
-          self._in_gradient_colocation = None
-          self._ExitOutsideCompilationScope()
-      else:
-        raise errors.InternalError(
-            op.node_def, op, "Badly nested gradient colocation, expected " +
-            last_op + ", got " + op.name)
-
-  def _EnterOutsideCompilationScope(self, cluster=None):
-
-    class FakeOp(object):
-      """A helper class to determine the current device.
-
-      Supports only the type and device set/get methods needed to run the
-      graph's _apply_device_function method.
-      """
-
-      def __init__(self):
-        self._device = ""
-
-      @property
-      def type(self):
-        return "FakeOp"
-
-      @property
-      def device(self):
-        return self._device
-
-      def _set_device(self, device):
-        if isinstance(device, pydev.DeviceSpec):
-          self._device = device.to_string()
-        else:
-          self._device = device
-
-    if self._outside_compilation_cluster:
-      raise NotImplementedError("Cannot nest outside_compilation clusters")
-    if cluster:
-      self._outside_compilation_cluster = cluster
-    else:
-      self._outside_compilation_cluster = str(self._outside_compilation_counter)
-      self._outside_compilation_counter += 1
-    graph = ops.get_default_graph()
-    fake_op = FakeOp()
-    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
-    device = pydev.DeviceSpec.from_string(fake_op.device)
-    if (device.device_type == "TPU_REPLICATED_CORE" and
-        device.device_index is not None):
-      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
-                                     str(device.device_index))
-    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
-    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
-
-  def _ExitOutsideCompilationScope(self):
-    if not self._outside_compilation_cluster:
-      raise NotImplementedError(
-          "Attempted to exit outside_compilation scope when not in scope")
-    self._outside_compilation_cluster = None
-    graph = ops.get_default_graph()
-    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
-
-  def Enter(self):
-    if not self._outer_device_function_stack:
-      # Capture the device function stack at the time of first entry
-      # since that is the stack that will be used outside_compilation.
-      graph = ops.get_default_graph()
-      # pylint: disable=protected-access
-      self._outer_device_function_stack = graph._device_function_stack.copy()
-      # pylint: enable=protected-access
-    super(TPUReplicateContext, self).Enter()
-
-  def HostComputeCore(self):
-    return self._host_compute_core
-
-  def AddOp(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_OPS:
-      logging.error("Operation of type %s (%s) is not supported on the TPU. "
-                    "Execution will fail if this op is used in the graph. " %
-                    (op.type, op.name))
-
-    if op.type in _UNSUPPORTED_OPS:
-      self._unsupported_ops.append(op)
-
-    if any(x.dtype._is_ref_dtype for x in op.inputs):
-      raise NotImplementedError(
-          "Non-resource Variables are not supported inside TPU computations "
-          "(operator name: %s)" % op.name)
-    if _TPU_REPLICATE_ATTR in op.node_def.attr:
-      raise ValueError("TPU computations cannot be nested")
-    op._set_attr(_TPU_REPLICATE_ATTR,
-                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
-    if self._outside_compilation_cluster:
-      op._set_attr(
-          _OUTSIDE_COMPILATION_ATTR,
-          attr_value_pb2.AttrValue(
-              s=compat.as_bytes(self._outside_compilation_cluster)))
-    if self._num_replicas > 1 or not self._outside_compilation_cluster:
-      # Prevent feeding or fetching anything that is being compiled,
-      # and any replicated outside_compilation Op.
-      op.graph.prevent_feeding(op)
-      op.graph.prevent_fetching(op)
-
-    # Remove any control edges from outer control flow contexts. These may cause
-    # mismatched frame errors.
-    (internal_control_inputs,
-     external_control_inputs) = self._RemoveExternalControlEdges(op)
-
-    if not op.inputs:
-      # Add a control edge from the control pivot to this op.
-      if not internal_control_inputs:
-        # pylint: disable=protected-access
-        op._add_control_input(self.GetControlPivot())
-        # pylint: enable=protected-access
-    else:
-      for index in xrange(len(op.inputs)):
-        x = op.inputs[index]
-        real_x = self.AddValue(x)
-        if real_x != x:
-          op._update_input(index, real_x)  # pylint: disable=protected-access
-
-    if external_control_inputs:
-      # Use an identity to pull control inputs as data inputs. Note that we
-      # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      external_control_inputs = [
-          array_ops.identity(x.outputs[0]).op
-          for x in external_control_inputs
-          if x.outputs
-      ]
-      # pylint: disable=protected-access
-      op._add_control_inputs(external_control_inputs)
-      # pylint: enable=protected-access
-
-    # Mark op's outputs as seen by this context and any outer contexts.
-    output_names = [x.name for x in op.outputs]
-    context = self
-    while context is not None:
-      # pylint: disable=protected-access
-      context._values.update(output_names)
-      context = context._outer_context
-      # pylint: enable=protected-access
-
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    """Add `val` to the current context and its outer context recursively."""
-    if val.name in self._values:
-      # Use the real value if it comes from outer context.
-      result = self._external_values.get(val.name)
-      return val if result is None else result
-
-    result = val
-    self._values.add(val.name)
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-      self._values.add(result.name)
-
-    self._external_values[val.name] = result
-
-    return result
-
-  def AddInnerOp(self, op):
-    self.AddOp(op)
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  @property
-  def grad_state(self):
-    # Define the gradient loop state associated with the TPUReplicateContext to
-    # be None as the TPUReplicateContext does not get nested nor does the
-    # grad_state outside the TPUReplicateContext affect the graph inside so the
-    # grad_state should be as if this is the top-level gradient state.
-    return None
-
-  @property
-  def back_prop(self):
-    """Forwards to the enclosing while context, if any."""
-    if self.GetWhileContext():
-      return self.GetWhileContext().back_prop
-    return False
-
-  def GetControlPivot(self):
-    return self._pivot
-
-
-def outside_compilation(computation, *args, **kwargs):
-  """Builds part of a computation outside any current TPU replicate scope.
-
-  Args:
-    computation: A Python function that builds the computation to
-      place on the host.
-    *args: the positional arguments for the computation.
-    **kwargs: the keyword arguments for the computation.
-
-  Returns:
-    The Tensors returned by computation.
-  """
-  args = [] if args is None else args
-  graph = ops.get_default_graph()
-
-  # If we are in a TPUReplicateContext, signal that we are now
-  # outside_compilation
-  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  retval = computation(*args, **kwargs)
-
-  # If we are in a TPUReplicateContext, signal that we are no longer
-  # outside_compilation
-  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  if initial_context is not final_context:
-    raise NotImplementedError(
-        "Control-flow context cannot be different at start and end of an "
-        "outside_compilation scope")
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  return retval
-
-
-def replicate(computation,
-              inputs=None,
-              infeed_queue=None,
-              device_assignment=None,
-              name=None):
-  """Builds a graph operator that runs a replicated TPU computation.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs. Each input can be a nested structure
-      containing values that are convertible to tensors. Note that passing an
-      N-dimension list of compatible values will result in a N-dimention list of
-      scalar tensors rather than a single Rank-N tensors. If you need different
-      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of outputs, indexed by `[replica_num]` each output can be a nested
-    structure same as what computation() returns with a few exceptions.
-
-    Exceptions include:
-      1) None output: a NoOp would be returned which control-depends on
-         computation.
-      2) Single value output: A tuple containing the value would be returned.
-      3) Operation-only outputs: a NoOp would be returned which
-         control-depends on computation.
-      TODO(b/121383831): Investigate into removing these special cases.
-
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-  """
-  return split_compile_and_replicate(computation, inputs, infeed_queue,
-                                     device_assignment, name)[1]
-
-
-def split_compile_and_replicate(computation,
-                                inputs=None,
-                                infeed_queue=None,
-                                device_assignment=None,
-                                name=None,
-                                use_tpu=True):
-  """Builds graph operators that runs compilation and replicated computation.
-
-  This is a lower level interface than replicate that returns a separate compile
-  and execute output tensor. In the generated graph the compile op feeds into
-  the execute op and no additional compilation is incurred when running the
-  compile op before the execute op. The compile op returns additional
-  information about the compilation but does not return the compiled program.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs. Each input can be a nested structure
-      containing values that are convertible to tensors. Note that passing an
-      N-dimension list of compatible values will result in a N-dimention list of
-      scalar tensors rather than a single Rank-N tensors. If you need different
-      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
-      backends. Currently, only supports a default placement (computation is
-      placed on GPU if one is available, and on CPU if not).
-  Returns:
-    A list of lists with the first list corresponding to the compile op and the
-    second a list of output tensors, indexed by `[replica_num][output_num]`.
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-  """
-  del name
-  inputs = [[]] if inputs is None else inputs
-
-  metadata_kwargs = {}
-  if device_assignment is not None:
-    # Turn the Numpy array into a flattened list so we can pass it as an
-    # operator attribute.
-    metadata_kwargs = {
-        "topology":
-            device_assignment.topology.serialized(),
-        "device_assignment":
-            device_assignment.core_assignment.flatten().tolist()
-    }
-    # TODO(phawkins): remove this case after the forward compatibility window
-    # expires on 2018-10-5.
-    if api_compat.forward_compatible(2018, 10, 5):
-      metadata_kwargs["num_cores_per_replica"] = (
-          device_assignment.num_cores_per_replica)
-    else:
-      metadata_kwargs["computation_shape"] = [
-          device_assignment.num_cores_per_replica
-      ]
-
-  if ((not isinstance(inputs, list)) or
-      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
-    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
-
-  num_replicas = len(inputs)
-
-  # No replicas? Nothing to do.
-  if num_replicas == 0:
-    return []
-
-  # Checks all replicas have the same structure.
-  for i in xrange(1, num_replicas):
-    nest.assert_same_structure(inputs[0], inputs[i])
-
-  # Flatten inputs.
-  flat_inputs = [
-      nest.flatten(per_replica_input) for per_replica_input in inputs
-  ]
-  # Converts inputs to Tensors.
-  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
-
-  # Verifies that all replicas have matching numbers and types of inputs
-  flat_input_types = [x.dtype for x in flat_inputs[0]]
-  input_arity = len(inputs[0])
-  flat_input_arity = len(flat_input_types)
-  for i in range(num_replicas):
-    if len(inputs[i]) != input_arity:
-      raise ValueError("Replicas must have the same number of inputs. "
-                       "Replica 0 had {} inputs, replica {} had {} "
-                       "inputs.".format(input_arity, i, len(inputs[i])))
-
-    types = [x.dtype for x in flat_inputs[i]]
-    if types != flat_input_types:
-      raise ValueError("Replicas must have matching input types. Replica 0 had "
-                       "input types {}, replica {} had input types {}".format(
-                           flat_input_types, i, types))
-
-  arg_error = xla.check_function_argument_count(
-      computation, input_arity, infeed_queue)
-  if arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s, but the computation needs %s" % (
-              input_arity, str([i.name for i in inputs[0]]), arg_error))
-    else:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s and %d additional inputs from infeed,"
-          " but the computation needs %s" % (input_arity, str(
-              [i.name
-               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
-                                             arg_error))
-
-  graph = ops.get_default_graph()
-
-  # Fan-in: Builds a TPUReplicatedInput node for each input.
-  computation_inputs = []
-  for i in range(0, flat_input_arity):
-    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
-    computation_inputs.append(
-        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
-
-  cluster_name = graph.unique_name("cluster")
-  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
-  context = TPUReplicateContext(
-      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
-  try:
-    context.Enter()
-
-    metadata = tpu_ops.tpu_replicate_metadata(
-        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
-
-    with tpu_function.tpu_shard_context(
-        num_replicas), ops.control_dependencies([metadata]):
-
-      # Add identity ops so even unused inputs are "consumed" by the
-      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
-      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
-      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
-      computation_inputs = [
-          array_ops.identity(x, name="replicated_input_{}".format(i))
-          for i, x in enumerate(computation_inputs)
-      ]
-      for i in computation_inputs:
-        # pylint: disable=protected-access
-        i.op._set_attr("_tpu_input_identity", attr_value_pb2.AttrValue(b=True))
-        # pylint: enable=protected-access
-
-      # Unflatten the computation inputs to match original input structure.
-      computation_inputs = nest.pack_sequence_as(
-          structure=inputs[0], flat_sequence=computation_inputs)
-
-      # If there is an infeed queue, adds the dequeued values to the
-      # computation's inputs.
-      if infeed_queue is not None:
-        infeed_queue.set_number_of_shards(num_replicas)
-        for t in infeed_queue.generate_dequeue_op():
-          computation_inputs.append(t)
-
-      # Only resource variables work inside a TPU computation, so turn on
-      # resource variables for the computation.
-      # TODO(phawkins): consider removing this code. It will
-      # be less confusing to clients if they knowingly choose to use resource
-      # variables.
-      # Partitioned variables is not supported (b/112311320).
-      vscope = variable_scope.get_variable_scope()
-      saved_use_resource = vscope.use_resource
-      saved_custom_getter = vscope.custom_getter
-
-      def custom_getter(getter, name, *args, **kwargs):
-        """Variables on TPU have a few restrictions."""
-        partitioner = kwargs["partitioner"]
-        if partitioner is not None:
-          kwargs["partitioner"] = None
-          logging.warning(
-              "Partitioned variables are not supported on TPU. Got "
-              "`partitioner` that is {} for variable {}. "
-              "Setting `partitioner` to `None`."
-              .format(partitioner, name))
-        if saved_custom_getter is None:
-          return getter(name, *args, **kwargs)
-        else:
-          return saved_custom_getter(getter, name, *args, **kwargs)
-
-      vscope.set_use_resource(True)
-      vscope.set_custom_getter(custom_getter)
-
-      outputs = computation(*computation_inputs)
-
-      vscope.set_use_resource(saved_use_resource)
-      vscope.set_custom_getter(saved_custom_getter)
-
-    outputs_is_flat = xla.is_flat(outputs)
-    if outputs_is_flat:
-      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
-    else:
-      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
-
-    context.ExitResult(output_tensors)
-  finally:
-    context.report_unsupported_operations()
-    context.Exit()
-    host_compute_core = context.HostComputeCore()
-
-  if host_compute_core:
-    attr_value = attr_value_pb2.AttrValue()
-    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
-    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
-
-  with ops.control_dependencies([metadata]):
-    if use_tpu:
-      compile_status = tpu_ops.tpu_compilation_result()
-      op = compile_status.op
-      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
-      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
-    else:
-      compile_status = control_flow_ops.no_op(name="compilation_status")
-
-  if not output_tensors:
-    # Returns a list of NoOps dependent on the replication Op, indexed by
-    # [replica_num].
-    return [
-        compile_status,
-        [
-            control_flow_ops.group(control_deps, name="shard_%d" % i)
-            for i in range(num_replicas)
-        ]
-    ]
-
-  # Fan-out: Builds a TPUReplicatedOutput node for each output.
-  replicated_outputs = [[] for i in xrange(num_replicas)]
-  for i, t in enumerate(output_tensors):
-    # Fan-out: Builds a TPUReplicatedOutput node for each output.
-    ys = tpu_ops.tpu_replicated_output(
-        t, num_replicas, name="output{}".format(i))
-
-    # Wraps the outputs in identity operators so the names of any possible
-    # `fetch` nodes are preserved by the replication rewrite.
-    with ops.control_dependencies(control_deps):
-      for replica in xrange(num_replicas):
-        replicated_outputs[replica].append(
-            array_ops.identity(
-                ys[replica], name="output_%d_shard_%d" % (i, replica)))
-
-  if not outputs_is_flat:
-    replicated_outputs = [
-        nest.pack_sequence_as(outputs, replica_outs)
-        for replica_outs in replicated_outputs
-    ]
-
-  return [compile_status, replicated_outputs]
-
-
-def _postprocess_flat_outputs(outputs):
-  """Validates non-flat outputs, add backs device assignments and other attrs.
-
-  Args:
-    outputs: Output from `computation` inside `tpu.rewrite`.
-
-  Returns:
-    Tensors and Operations extracted from outputs.
-  """
-  # Following code segment is to preserve legacy behavior. Previously we only
-  # supported flat outputs and thus for consistency it was nice to convert even
-  # single element into a tuple. But now that we support arbitrary output
-  # structure, this is no longer necessary.
-  # TODO(b/121383831): Migrate all legacy use cases and delete this special
-  # case.
-  # If the computation returns `None`, make it an empty tuple.
-  if outputs is None:
-    outputs = tuple()
-  # If the computation only returned one value, makes it a tuple.
-  if not isinstance(outputs, collections.Sequence):
-    outputs = (outputs,)
-
-  # Append `no_op` here so that fetching any return value of this function
-  # will trigger TPUExecute node.
-  outputs += (control_flow_ops.no_op(),)
-  try:
-    with ops.device(core(0)):
-      outputs = [
-          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-          for o in outputs
-      ]
-  except Exception as e:
-    raise ValueError(
-        "TPU function return values must all either be Operations or "
-        "convertible to Tensors. Got '%s'" % str(e))
-
-  # Separates the returned Operations and Tensors.
-  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-  if outputs != output_tensors + output_operations:
-    raise ValueError(
-        "TPU functions must return zero-or more Tensor values followed by "
-        "zero or more Operations.")
-
-  # Wraps outputs in Identity ops. Otherwise a replicated input copied
-  # straight to an output would bypass the replicate(). This would be bad
-  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-  # be rewritten away, leading to a runtime error.
-  # TODO(phawkins): extend the rewrite to elide these nodes instead.
-  new_output_tensors = []
-  for t in output_tensors:
-    with ops.device(t.device if t.device else core(0)):
-      o = array_ops.identity(t)
-      # pylint: disable=protected-access
-      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-      new_output_tensors.append(o)
-  return new_output_tensors, output_operations
-
-
-def _postprocess_non_flat_outputs(outputs):
-  """Validates non-flat outputs, add backs device assignments and other attrs.
-
-  Args:
-    outputs: Output from `computation` inside `tpu.rewrite`.
-
-  Returns:
-    Tensors extracted from outputs and an empty list because Operations are not
-    allowed in non-flat outputs..
-  """
-
-  # Flatten output items.
-  flat_outputs = nest.flatten(outputs)
-
-  # Convert all non-Operation outputs to Tensors.
-  for i, o in enumerate(flat_outputs):
-    if isinstance(o, ops.Operation):
-      raise ValueError(
-          "tpu.rewrite does not support Operation as return value in non-flat "
-          "output structure. You can set returned Operations as control "
-          "dependencies of returned Tensors so Operations are triggered when "
-          'Tensors are evaluated. Operation found: "%s"' % o.name)
-
-    try:
-      o = ops.convert_to_tensor(o)
-    except Exception as e:
-      raise ValueError(
-          "TPU function return values must all either be Operations or "
-          'convertible to Tensors. Got error: "%s"' % str(e))
-
-    # Wraps outputs in Identity ops. Otherwise a replicated input copied
-    # straight to an output would bypass the replicate(). This would be bad
-    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-    # be rewritten away, leading to a runtime error.
-    # TODO(phawkins): extend the rewrite to elide these nodes instead.
-    with ops.device(core(0)):
-      o = array_ops.identity(o)
-      # pylint: disable=protected-access
-      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-      flat_outputs[i] = array_ops.identity(o)
-
-  # All flat_outputs are Tensors, and no Operations.
-  return flat_outputs, []
-
-
-def split_compile_and_shard(computation,
-                            inputs=None,
-                            num_shards=1,
-                            input_shard_axes=None,
-                            outputs_from_all_shards=True,
-                            output_shard_axes=None,
-                            infeed_queue=None,
-                            device_assignment=None,
-                            name=None):
-  """Shards `computation` for parallel execution.
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
-  of which has a corresponding split axis (from `input_shard_axes`). Each input
-  is split into `num_shards` pieces along the corresponding axis, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  TODO(phawkins): consider adding support for broadcasting Tensors passed
-  as inputs.
-
-  If `outputs_from_all_shards` is true, the outputs from all shards of
-  `computation` are concatenated back together along their `output_shards_axes`.
-  Otherwise, each output is taken from an arbitrary shard.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). Each
-      input tensor has a corresponding shard axes, given by `input_shard_axes`,
-      which must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    input_shard_axes: A list of dimensions along which to shard `inputs`, or
-      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
-      there must be one dimension per input.
-    outputs_from_all_shards: Boolean or list of boolean. For each output, if
-      `True`, outputs from all shards are concatenated along the corresponding
-      `output_shard_axes` entry. Otherwise, each output is taken
-      from an arbitrary shard. If the argument is a boolean, the argument's
-      value is used for each output.
-    output_shard_axes: A list of dimensions along which to concatenate the
-      outputs of `computation`, or `None`. `None` means "concatenate all outputs
-      along dimension 0". If not `None`, there must be one dimension per output.
-      Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
-      of `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A tuple of (compile op, [output tensors]).
-  Raises:
-    ValueError: If num_shards <= 0
-    ValueError: If len(input_shard_axes) != len(inputs)
-    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
-  """
-
-  if num_shards <= 0:
-    raise ValueError("num_shards must be a positive integer.")
-
-  inputs = [] if inputs is None else inputs
-  if not isinstance(inputs, list):
-    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
-
-  # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-
-  if input_shard_axes is None:
-    input_shard_axes = [0] * len(inputs)
-  if len(inputs) != len(input_shard_axes):
-    raise ValueError("Length of input_shard_axes must be equal to the number "
-                     "of inputs.")
-
-  if inputs:
-    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
-    # lists with layout [input][shard]
-    split_inputs = [
-        array_ops.split(x, num_shards, axis=axis)
-        for (axis, x) in zip(input_shard_axes, inputs)]
-
-    # Transposes the input lists to have layout [shard][input]
-    transposed_inputs = [list(i) for i in zip(*split_inputs)]
-  else:
-    transposed_inputs = [[]] * num_shards
-
-  compile_op, outputs = split_compile_and_replicate(
-      computation,
-      transposed_inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-  # There must be at least one shard since num_shards > 0.
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  if isinstance(outputs[0], ops.Operation):
-    # pylint: enable=indexing-exception
-    # There were no outputs from the computation and replicate returned a list
-    # of NoOps with control dependencies on the computation. Return the first
-    # one so it can be used as a control dependency or fetch node.
-    # TODO(b/36647078) remove disable when pylint bug is fixed.
-    # pylint: disable=indexing-exception
-    return compile_op, [outputs[0]]
-    # pylint: enable=indexing-exception
-
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  num_outputs = len(outputs[0])
-  # pylint: enable=indexing-exception
-
-  if output_shard_axes is None:
-    output_shard_axes = [0] * num_outputs
-  if num_outputs != len(output_shard_axes):
-    raise ValueError("Length of output_shard_axes must be equal to the number "
-                     "of outputs.")
-
-  if isinstance(outputs_from_all_shards, bool):
-    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
-
-  if num_outputs != len(outputs_from_all_shards):
-    raise ValueError("Length of outputs_from_all_shards must be equal to the "
-                     "number of outputs.")
-
-  results = []
-  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
-                                   zip(*outputs)):
-    if all_shards:
-      # Concatenate all of the outputs together (use stack for scalars).
-      shape = x[0].shape
-      is_scalar = shape is not None and (shape.ndims == 0)
-      results.append((array_ops.stack(list(x)) if is_scalar
-                      else array_ops.concat(list(x), axis=axis)))
-    else:
-      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
-      results.append(x[0])
-
-  return compile_op, results
-
-
-def shard(computation,
-          inputs=None,
-          num_shards=1,
-          input_shard_axes=None,
-          outputs_from_all_shards=True,
-          output_shard_axes=None,
-          infeed_queue=None,
-          device_assignment=None,
-          name=None):
-  """Shards `computation` for parallel execution.
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
-  of which has a corresponding split axis (from `input_shard_axes`). Each input
-  is split into `num_shards` pieces along the corresponding axis, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  TODO(phawkins): consider adding support for broadcasting Tensors passed
-  as inputs.
-
-  If `outputs_from_all_shards` is true, the outputs from all shards of
-  `computation` are concatenated back together along their `output_shards_axes`.
-  Otherwise, each output is taken from an arbitrary shard.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). Each
-      input tensor has a corresponding shard axes, given by `input_shard_axes`,
-      which must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    input_shard_axes: A list of dimensions along which to shard `inputs`, or
-      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
-      there must be one dimension per input.
-    outputs_from_all_shards: Boolean or list of boolean. For each output, if
-      `True`, outputs from all shards are concatenated along the corresponding
-      `output_shard_axes` entry. Otherwise, each output is taken
-      from an arbitrary shard. If the argument is a boolean, the argument's
-      value is used for each output.
-    output_shard_axes: A list of dimensions along which to concatenate the
-      outputs of `computation`, or `None`. `None` means "concatenate all outputs
-      along dimension 0". If not `None`, there must be one dimension per output.
-      Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
-      of `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If num_shards <= 0
-    ValueError: If len(input_shard_axes) != len(inputs)
-    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
-  """
-  return split_compile_and_shard(
-      computation,
-      inputs=inputs,
-      num_shards=num_shards,
-      input_shard_axes=input_shard_axes,
-      outputs_from_all_shards=outputs_from_all_shards,
-      output_shard_axes=output_shard_axes,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)[1]
-
-
-def batch_parallel(computation,
-                   inputs=None,
-                   num_shards=1,
-                   infeed_queue=None,
-                   device_assignment=None,
-                   name=None):
-  """Shards `computation` along the batch dimension for parallel execution.
-
-  Convenience wrapper around shard().
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list).
-  Each input is split into `num_shards` pieces along the 0-th dimension, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  The outputs from all shards are concatenated back together along their 0-th
-  dimension.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). The
-      0-th dimension of each Tensor must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If `num_shards <= 0`
-  """
-  return shard(
-      computation,
-      inputs,
-      num_shards=num_shards,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-
-def rewrite(computation,
-            inputs=None,
-            infeed_queue=None,
-            device_assignment=None,
-            name=None):
-  """Rewrites `computation` for execution on a TPU system.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors.
-
-      `computation` may return a list of operations and tensors. Tensors must
-      come before operations in the returned list.  The return value of
-      `rewrite` is a list of tensors corresponding to the tensors from the
-      output of `computation`.
-
-      All `Operation`s constructed during `computation` will be executed when
-      evaluating any of the returned output tensors, not just the ones returned.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-      Each input can be a nested structure containing values that are
-      convertible to tensors. Note that passing an N-dimension list of
-      compatible values will result in a N-dimention list of scalar tensors
-      rather than a single Rank-N tensors. If you need different behavior,
-      convert part of inputs to tensors with `tf.convert_to_tensor`.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: (Deprecated) Does nothing.
-  Returns:
-    Same data structure as if computation(*inputs) is called directly with some
-    exceptions for correctness. Exceptions include:
-      1) None output: a NoOp would be returned which control-depends on
-         computation.
-      2) Single value output: A tuple containing the value would be returned.
-      3) Operation-only outputs: a NoOp would be returned which
-         control-depends on computation.
-      TODO(b/121383831): Investigate into removing these special cases.
-  """
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  return replicate(
-      computation,
-      None if inputs is None else [inputs],
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)[0]
-  # pylint: enable=indexing-exception
-
-  # Operations that indicate some error in the user's inference graph.
-_BLACKLISTED_INFERENCE_OPS = set([
-    "ReadVariableOp",
-    "AssignVariableOp",
-    "AssignAddVariableOp",
-    "AssignSubVariableOp",
-    "VarHandleOp",
-    "Variable",
-    "VariableV2",
-])
-
-
-def under_tpu_inference_context():
-  """Check if it is currently under `tpu.rewrite_for_inference()`."""
-  graph = ops.get_default_graph()
-
-  context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  while context:
-    if isinstance(context, _TPUInferenceContext):
-      return True
-    context = context.outer_context
-
-  return False
-
-
-class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU inference computation.
-
-  The primary role of `TPUReplicateContext` is to sanity check operators inside
-  a tpu.rewrite_for_inference() computation.
-  """
-
-  def __init__(self, name):
-    super(_TPUInferenceContext, self).__init__()
-    self._name = name
-
-  def AddOp(self, op):
-    self._AddOpInternal(op)
-
-  def _AddOpInternal(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_INFERENCE_OPS:
-      raise NotImplementedError(
-          "Operation of type %s (%s) is not supported on the TPU for inference."
-          " Execution will fail if this op is used in the graph. Make sure your"
-          " variables are using variable_scope." % (op.type, op.name))
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    result = val
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-    return result
-
-  def AddInnerOp(self, op):
-    self._AddOpInternal(op)
-
-  @property
-  def grad_state(self):
-    return None
-
-
-@experimental
-def validate_inference_rewrite_for_variables(graph):
-  """Validates whether rewrite_for_inference() 'worked' for variables.
-
-     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
-     after ReadVariableOps, but this mechanism works only if you are using
-     tf.get_variable() to create and access variables in your tpu computation.
-     This validation method can be called immediately after calling
-     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
-     to the graph.
-
-     Typical usages:
-       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
-
-       tpu.validate_inference_rewrite_for_variables(sess.graph)
-
-  Args:
-    graph: The graph which needs to be validated.
-  Raises:
-    RuntimeError: if validation failed.
-  """
-  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
-    raise RuntimeError(
-        "No GuaranteeConst ops found in the graph after running "
-        "tpu.rewrite_for_inference(...). Please check that you are using "
-        "tf.get_variable() to create and access variables in your tpu "
-        "computation.")
-
-
-@experimental
-def rewrite_for_inference(computation,
-                          inputs=None,
-                          infeed_queue=None,
-                          device_assignment=None,
-                          name=None):
-  """Rewrites `computation` for inference on a TPU system.
-
-     Other than 'rewriting' the computation to run on a TPU, if using variables
-     in your computation, it moves the ReadVariableOps outside the TPU
-     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
-     This mechanism works only if you are using tf.get_variable() to create and
-     access variables in your tpu computation. You can validate whether this
-     worked, by calling validate_inference_rewrite_for_variables() method
-     immediately after this method to check whether GuaranteeConstOps where
-     added to the graph.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors. If the function returns m outputs, rewrite will return a list of
-      m tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: The name of the operator.
-  Returns:
-    A list of output tensors.
-  """
-
-  def guarantee_const_getter(getter, name, *args, **kwargs):
-    with ops.control_dependencies(None):
-      return array_ops.guarantee_const(
-          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
-
-  def wrapped_computation(*args, **kwargs):
-    """Execute computation under `_TPUInferenceContext`."""
-    context = _TPUInferenceContext(
-        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
-    try:
-      context.Enter()
-
-      vscope = variable_scope.get_variable_scope()
-      prev_custom_getter = vscope.custom_getter
-      prev_caching_device = vscope.caching_device
-      vscope.set_custom_getter(guarantee_const_getter)
-      vscope.set_caching_device(lambda op: op.device)
-
-      result = computation(*args, **kwargs)
-
-      vscope.set_custom_getter(prev_custom_getter)
-      vscope.set_caching_device(prev_caching_device)
-    finally:
-      context.Exit()
-    return result
-
-  # pylint: disable=undefined-variable
-  return rewrite(
-      wrapped_computation,
-      inputs=inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-  # pylint: enable=undefined-variable
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu import *
+# used by tests
+from tensorflow.python.tpu.tpu import _TPU_REPLICATE_ATTR
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 9f8d14706845baa1ed45c84b2c15d372915a0eb4..c36aaa38c0e4823bfc438773e4aa5b5109794da4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -1,275 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""A RunConfig subclass with TPU support."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import json
-import os
-
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import tf_logging as logging
-
-# pylint: disable=protected-access
-_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
-_SERVICE_KEY = run_config_lib._SERVICE_KEY
-_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
-# pylint: enable=protected-access
-
-
-class InputPipelineConfig(object):
-  r"""Please see the definition of these values in TPUConfig."""
-  PER_SHARD_V1 = 1
-  PER_HOST_V1 = 2
-  PER_HOST_V2 = 3
-  BROADCAST = 4
-
-
-class TPUConfig(
-    collections.namedtuple('TPUConfig', [
-        'iterations_per_loop',
-        'num_shards',
-        'num_cores_per_replica',
-        'per_host_input_for_training',
-        'tpu_job_name',
-        'initial_infeed_sleep_secs',
-        'input_partition_dims',
-    ])):
-  r"""TPU related configuration required by `TPUEstimator`.
-
-  Args:
-    iterations_per_loop: This is the number of train steps running in TPU
-      system before returning to CPU host for each `Session.run`. This means
-      global step is increased `iterations_per_loop` times in one `Session.run`.
-      It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: (Deprecated, ignored by TPUEstimator).
-      The number of model replicas in the system. For non-model-parallelism
-      case, this number equals the total number of TPU cores. For
-      model-parallelism, the total number of TPU cores equals
-      num_cores_per_replica * num_shards.
-    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
-      An integer which describes the number of TPU cores per model replica. This
-      is required by model-parallelism which enables partitioning
-      the model to multiple cores. Currently num_cores_per_replica must be
-      1, 2, 4, or 8.
-    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
-      `input_fn` is invoked once on each host. With the per-core input pipeline
-      configuration, it is invoked once for each core.
-      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
-      the batch size for each shard is `train_batch_size` // #hosts in the
-      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
-      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
-      invoked once on host 0 and the tensors are broadcasted to all other
-      replicas. The batch size equals to train_batch_size`. With the per-core
-      input pipeline configuration, the shard batch size is also
-      `train_batch_size` // #cores.
-      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
-    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
-      within TPUEstimator, however when using ClusterSpec propagation in more
-      esoteric cluster configurations, you may need to specify the job name as a
-      string.
-    initial_infeed_sleep_secs: The number of seconds the infeed thread should
-      wait before enqueueing the first batch. This helps avoid timeouts for
-      models that require a long compilation time.
-    input_partition_dims: A nested list to describe the partition dims
-      for all the tensors from input_fn(). The structure of
-      input_partition_dims must match the structure of `features` and
-      `labels` from input_fn(). The total number of partitions must match
-      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
-      images with shape [N, H, W, C] and labels [N].
-      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
-      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
-      to all the TPU cores since the partition dims is `None`.
-      Current limitations: This feature is only supported with the PER_HOST_V2
-      input mode.
-
-    Raises:
-      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
-  """
-
-  def __new__(cls,
-              iterations_per_loop=2,
-              num_shards=None,
-              num_cores_per_replica=None,
-              per_host_input_for_training=True,
-              tpu_job_name=None,
-              initial_infeed_sleep_secs=None,
-              input_partition_dims=None):
-
-    # Check iterations_per_loop.
-    util_lib.check_positive_integer(iterations_per_loop,
-                                    'TPUConfig iterations_per_loop')
-
-    # Check num_shards.
-    if num_shards is not None:
-      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
-
-    if input_partition_dims is not None:
-      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
-        raise ValueError(
-            'input_partition_dims must be a list/tuple with one or two'
-            ' elements.')
-
-      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
-        raise ValueError(
-            'input_partition_dims is only supported in PER_HOST_V2 mode.')
-
-      if num_cores_per_replica is None:
-        raise ValueError(
-            'input_partition_dims requires setting num_cores_per_replica.')
-
-    # Check num_cores_per_replica
-    if num_cores_per_replica is not None:
-      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
-        raise ValueError(
-            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
-                str(num_cores_per_replica)))
-
-    # per_host_input_for_training may be True, False, or integer in [1..3].
-    # Map legacy values (True, False) to numeric values.
-    if per_host_input_for_training is False:
-      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
-    elif per_host_input_for_training is True:
-      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
-
-    # Check initial_infeed_sleep_secs.
-    if initial_infeed_sleep_secs:
-      util_lib.check_positive_integer(initial_infeed_sleep_secs,
-                                      'TPUConfig initial_infeed_sleep_secs')
-
-    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
-
-    return super(TPUConfig, cls).__new__(
-        cls,
-        iterations_per_loop=iterations_per_loop,
-        num_shards=num_shards,
-        num_cores_per_replica=num_cores_per_replica,
-        per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name,
-        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
-        input_partition_dims=input_partition_dims)
-
-
-class RunConfig(run_config_lib.RunConfig):
-  """RunConfig with TPU support."""
-
-  def __init__(self,
-               tpu_config=None,
-               evaluation_master=None,
-               master=None,
-               cluster=None,
-               **kwargs):
-    """Constructs a RunConfig.
-
-    Args:
-      tpu_config: the TPUConfig that specifies TPU-specific configuration.
-      evaluation_master: a string. The address of the master to use for eval.
-        Defaults to master if not set.
-      master: a string. The address of the master to use for training.
-      cluster: a ClusterResolver
-      **kwargs: keyword config parameters.
-
-    Raises:
-      ValueError: if cluster is not None and the provided session_config has a
-        cluster_def already.
-    """
-    super(RunConfig, self).__init__(**kwargs)
-    self._tpu_config = tpu_config or TPUConfig()
-    self._cluster = cluster
-
-    # If user sets master and/or evaluation_master explicitly, including empty
-    # string '', take it. Otherwise, take the values set by parent class.
-    if master is not None:
-      if cluster is not None:
-        raise ValueError('Both master and cluster are set.')
-      self._master = master
-    else:
-      if cluster:
-        self._master = cluster.master()
-
-    if evaluation_master is not None:
-      self._evaluation_master = evaluation_master
-    elif (not self._evaluation_master and
-          self.task_type != run_config_lib.TaskType.EVALUATOR):
-      # If the task type is EVALUATOR, it means some cluster manager sets the
-      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
-      #
-      # Otherwise, it means user executes the code without external cluster
-      # manager. For that, we optimize the user experience by setting
-      # evaluation_master to master, unless user overwrites it.
-      self._evaluation_master = self._master
-
-    # Set the ClusterSpec to use
-    if cluster:
-      self._cluster_spec = cluster.cluster_spec()
-
-      # Merge the cluster_def into the ConfigProto.
-      if self._session_config is None:  # pylint: disable=access-member-before-definition
-        self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-      if self._session_config.HasField('cluster_def'):
-        raise ValueError(
-            'You cannot provide a ClusterResolver and '
-            'session_config.cluster_def.')
-      if self._cluster_spec:
-        self._session_config.cluster_def.CopyFrom(
-            self._cluster_spec.as_cluster_def())
-
-  def _maybe_overwrite_session_config_for_distributed_training(self):
-    # Overrides the parent class session_config overwrite for between-graph. TPU
-    # runs with in-graph, which should not have device filter. Doing nothing
-    # ("pass") basically disables it.
-    pass
-
-  @property
-  def evaluation_master(self):
-    return self._evaluation_master
-
-  @property
-  def master(self):
-    return self._master
-
-  @property
-  def tpu_config(self):
-    return self._tpu_config
-
-  @property
-  def cluster(self):
-    return self._cluster
-
-  def replace(self, **kwargs):
-    if 'tpu_config' not in kwargs:
-      return super(RunConfig, self).replace(**kwargs)
-
-    tpu_config = kwargs.pop('tpu_config')
-    new_instance = super(RunConfig, self).replace(**kwargs)
-    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
-    return new_instance
-
-
-def _get_tpu_job_name_from_tf_config():
-  """Extracts the TPU job name from TF_CONFIG env variable."""
-  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
-  # spec propagation.
-  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
-  if tpu_job_name:
-    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
-  return tpu_job_name
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_config import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 672462447944b777375331d49727c4d5366cf295..b77b010cba6bf32c3b6d170bc522eebfb6a04f77 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -1,725 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from contextlib import contextmanager
-import copy
-
-from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.platform import tf_logging as logging
-
-
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-_NUM_CORES_TO_COMPUTATION_SHAPE = {
-    1: [1, 1, 1],
-    2: [1, 1, 2],
-    4: [1, 2, 2],
-    8: [2, 2, 2],
-    16: [4, 2, 2],
-}
-
-
-class TPUContext(object):
-  """A context that holds the current configuration of the TPU computation."""
-
-  def __init__(self,
-               internal_ctx,
-               input_device=None,
-               invocation_index=None,
-               call_from_input_fn=True):
-    self._internal_ctx = internal_ctx
-    self._input_device = input_device
-    self._invocation_index = invocation_index
-    self._call_from_input_fn = call_from_input_fn
-
-  def current_input_fn_deployment(self):
-    """The configuration of the current input_fn invocation.
-
-    The configuration depends on `TPUConfig.per_host_input_for_training`. See
-    `TPUConfig` for details.
-
-    Only set in params dict of input_fn
-
-    Returns:
-      A tuple of
-        1. Device spec string: String, is the current CPU host where the
-           input_fn is invoked.
-        2. Current invocation index: Int, 0-based index of the input_fn
-           invocation. See next item for details.
-        3. Total invocation count: Int, the total number of times to invoke the
-           input_fn on all CPU hosts. Each invocation will be passed with a new
-           `TPUContext` instance with current invocation index set properly.
-        4. Total number of replicas consumed by current_invocation: Int, the
-           number of replicas fed by the data returned by current input_fn. For
-           example, for per_core input pipeline deployment
-           and non-model-parallelism, total invocation count is equal to
-           the number of cores in the system and num replicas consumed by
-           current invocation is 1. For per-host v2 input pipeline deployment,
-           total invocation count is equal to the number of hosts in the system
-           and num replicas consumed by current invocation is equal to number of
-           cores per host.
-
-    Raises:
-      RuntimeError: If this method must not be called from input_fn.
-    """
-    if not self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' model_fn.')
-
-    if self._internal_ctx.is_input_sharded_per_core():
-      total_invocation_count = (self._internal_ctx.num_hosts
-                                * self._internal_ctx.num_of_replicas_per_host)
-      replicas_consumed = 1
-    elif self._internal_ctx.is_input_broadcast_with_iterators():
-      total_invocation_count = 1
-      replicas_consumed = self._internal_ctx.num_replicas
-    else:
-      total_invocation_count = self._internal_ctx.num_hosts
-      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
-    return (self._input_device, self._invocation_index,
-            total_invocation_count, replicas_consumed)
-
-  @property
-  def num_replicas(self):
-    """The total number of replicas.
-
-    For non-model-parallelism, num_replicas should be the total num of TPU
-    cores in the system.
-
-    Returns:
-      The number of replicas.
-    """
-    return self._internal_ctx.num_replicas
-
-  @property
-  def num_hosts(self):
-    """The number of hosts for the TPU system."""
-    return self._internal_ctx.num_hosts
-
-  @property
-  def current_host(self):
-    """The current host index for the TPU system."""
-    return self._invocation_index
-
-  @property
-  def num_of_replicas_per_host(self):
-    """The number of replicas for each host."""
-    if self._internal_ctx.model_parallelism_enabled:
-      raise ValueError(
-          'num_of_replicas_per_host is not supported for model_parallelism')
-    return self._internal_ctx.num_of_replicas_per_host
-
-  @property
-  def device_assignment(self):
-    """Returns device_assignment object."""
-    if self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' input_fn.')
-    return self._internal_ctx.device_assignment
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    # Note that: For the non-model parallelism, the mapping could be
-    # a random permutation. The order should not matter in most cases
-    # as far as model is replicated to all cores in the system.
-    return self._internal_ctx.device_for_replica(replica_id)
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function.
-
-    The place function takes host_id as the input and returns the TF device
-    for the correspoding host.
-    """
-
-    def _placement_function(host_id):
-      """Return the host device given host_id."""
-      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
-
-    return _placement_function
-
-
-class _InternalTPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, based on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  if eval_on_tpu is False, then execution of eval on TPU is disabled.
-  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
-  and TPU execution is disabled for all modes.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _InternalTPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu, eval_on_tpu=True):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._predict_batch_size = predict_batch_size
-    self._use_tpu = use_tpu
-    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
-    if not use_tpu and eval_on_tpu:
-      logging.warning('eval_on_tpu ignored because use_tpu is False.')
-
-    self._eval_on_tpu = eval_on_tpu
-    self._model_parallelism_enabled = (
-        use_tpu and config.tpu_config.num_cores_per_replica)
-    self._mode = None
-    num_cores_per_replica = config.tpu_config.num_cores_per_replica
-    if num_cores_per_replica:
-      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
-          num_cores_per_replica]
-    else:
-      self._computation_shape = None
-    self._lazy_tpu_system_metadata_dict = {}  # key by master address
-    self._lazy_device_assignment_dict = {}  # key by master address
-    self._lazy_validation_dict = {}  # key by ModeKeys
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @contextmanager
-  def with_mode(self, mode):
-    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
-    # such as _lazy_tpu_system_metadata_dict between new copy and the original
-    # one. Note that all lazy states stored in properties _lazy_foo are sort of
-    # immutable as they should be same for the process lifetime.
-    new_ctx = copy.copy(self)
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  def _get_master_address(self):
-    mode = self._assert_mode()
-    config = self._config
-    master = (
-        config.master
-        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
-    return master
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    cluster_def = None
-    if (self._config.session_config and
-        self._config.session_config.cluster_def.job):
-      cluster_def = self._config.session_config.cluster_def
-
-    # pylint: disable=protected-access
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(
-            master,
-            cluster_def=cluster_def,
-            query_topology=self.model_parallelism_enabled))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-  def _get_device_assignment(self):
-    """Gets the (maybe cached) TPU device assignment."""
-    master = self._get_master_address()
-    device_assignment = self._lazy_device_assignment_dict.get(master)
-    if device_assignment is not None:
-      return device_assignment
-
-    tpu_system_metadata = self._get_tpu_system_metadata()
-
-    device_assignment = tpu_device_assignment.device_assignment(
-        tpu_system_metadata.topology,
-        computation_shape=self._computation_shape,
-        num_replicas=self.num_replicas)
-
-    logging.info('num_cores_per_replica: %s',
-                 str(self._config.tpu_config.num_cores_per_replica))
-    logging.info('computation_shape: %s', str(self._computation_shape))
-    logging.info('num_replicas: %d', self.num_replicas)
-    logging.info('device_assignment.topology.device_coordinates: %s',
-                 str(device_assignment.topology.device_coordinates))
-    logging.info('device_assignment.core_assignment: %s',
-                 str(device_assignment.core_assignment))
-
-    self._lazy_device_assignment_dict[master] = device_assignment
-    return device_assignment
-
-  @property
-  def model_parallelism_enabled(self):
-    return self._model_parallelism_enabled
-
-  @property
-  def input_partition_dims(self):
-    return self._config.tpu_config.input_partition_dims
-
-  @property
-  def device_assignment(self):
-    return (self._get_device_assignment()
-            if self._model_parallelism_enabled else None)
-
-  @property
-  def num_of_cores_per_host(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_of_cores_per_host
-
-  @property
-  def num_cores(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_cores
-
-  @property
-  def num_of_replicas_per_host(self):
-    """Return the number of replicas per host."""
-    if self.model_parallelism_enabled:
-      return self.num_replicas // self.num_hosts
-    else:
-      return self.num_of_cores_per_host
-
-  @property
-  def num_replicas(self):
-    num_cores_in_system = self.num_cores
-
-    if self.model_parallelism_enabled:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      if num_cores_per_replica > num_cores_in_system:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the total num of '
-            'TPU cores in the system. num_cores_per_replica: {}, num cores '
-            'in the system: {}'.format(num_cores_per_replica,
-                                       num_cores_in_system))
-
-      if num_cores_in_system % num_cores_per_replica != 0:
-        raise RuntimeError(
-            'The num of cores in the system ({}) is not divisible by the num '
-            'of cores ({}) required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
-                num_cores_in_system, num_cores_per_replica))
-
-      return num_cores_in_system // num_cores_per_replica
-    else:
-      return num_cores_in_system
-
-  @property
-  def num_hosts(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_hosts
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    mode = self._assert_mode()
-    return (mode == model_fn_lib.ModeKeys.TRAIN and
-            (self._config.tpu_config.per_host_input_for_training is
-             tpu_config.InputPipelineConfig.PER_SHARD_V1))
-
-  def is_input_per_host_with_iterators(self):
-    """Return true if input_fn should be run in the per-host v2 config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.PER_HOST_V2)
-
-  def is_input_broadcast_with_iterators(self):
-    """Return true if input_fn should be run in the full_replicae config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.BROADCAST)
-
-  def is_running_on_cpu(self, is_export_mode=False):
-    """Determines whether the input_fn and model_fn should be invoked on CPU.
-
-    This API also validates user provided configuration, such as batch size,
-    according the lazy initialized TPU system metadata.
-
-    Args:
-      is_export_mode: Indicates whether the current mode is for exporting the
-        model, when mode == PREDICT. Only with this bool, we could
-        tell whether user is calling the Estimator.predict or
-        Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distinguish these two.
-
-    Returns:
-      bool, whether current input_fn or model_fn should be running on CPU.
-
-    Raises:
-      ValueError: any configuration is invalid.
-    """
-
-    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
-    if not is_running_on_cpu:
-      self._validate_tpu_configuration()
-    return is_running_on_cpu
-
-  def _is_running_on_cpu(self, is_export_mode):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
-    mode = self._assert_mode()
-
-    if not self._use_tpu:
-      return True
-
-    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
-      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
-      return True
-
-    if is_export_mode:
-      return True
-
-    return False
-
-  @property
-  def global_batch_size(self):
-    mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return self._eval_batch_size
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return self._predict_batch_size
-    else:
-      return None
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU
-    if self.is_input_sharded_per_core() or (
-        self.is_input_per_host_with_iterators()):
-      return global_batch_size // self.num_replicas
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU. always sharded per shard.
-    return global_batch_size // self.num_replicas
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (
-        run_config.evaluation_master
-        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
-    if master in _LOCAL_MASTERS:
-      return None
-
-    if (not run_config.session_config or
-        not run_config.session_config.cluster_def.job):
-      return _DEFAULT_JOB_NAME
-    cluster_def = run_config.session_config.cluster_def
-    job_names = set([job.name for job in cluster_def.job])
-    if _DEFAULT_JOB_NAME in job_names:
-      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
-      raise ValueError('Currently, tpu_worker is not an allowed job name.')
-    if len(job_names) == 1:
-      return cluster_def.job[0].name
-    if len(job_names) == 2:
-      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
-        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
-        return job_names.pop()
-      # TODO(b/67716447): Include more sophisticated heuristics.
-    raise ValueError(
-        'Could not infer TPU job name. Please specify a tpu_job_name as part '
-        'of your TPUConfig.')
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-
-    master = self.master_job
-
-    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
-      """Return the host device given replica_id or host_id."""
-      assert _sentinal is None
-      if replica_id is not None and host_id is not None:
-        raise RuntimeError(
-            'replica_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        if replica_id is not None:
-          if self.model_parallelism_enabled:
-            return self.device_assignment.host_device(
-                replica=replica_id, job=master)
-          else:
-            host_id = replica_id / self.num_of_cores_per_host
-
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    """Returns a TPU device placement Fn."""
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    def _placement_function(i):
-      if self.model_parallelism_enabled:
-        return self.device_assignment.tpu_device(replica=i, job=master)
-      else:
-        num_of_cores_per_host = self.num_of_cores_per_host
-        host_id = i / num_of_cores_per_host
-        ordinal_id = i % num_of_cores_per_host
-        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
-
-    return _placement_function
-
-  def tpu_ordinal_function(self, host_id):
-    """Returns the TPU ordinal fn."""
-
-    def _tpu_ordinal_function(shard_index_in_host):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        shard_index_in_host: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      if self.model_parallelism_enabled:
-        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
-        replica = self.device_assignment.lookup_replicas(host_id,
-                                                         0)[shard_index_in_host]
-        return self.device_assignment.tpu_ordinal(replica=replica)
-      else:
-        return shard_index_in_host % self.num_of_cores_per_host
-
-    return _tpu_ordinal_function
-
-  def _validate_tpu_configuration(self):
-    """Validates the configuration based on the TPU system metadata."""
-    mode = self._assert_mode()
-    if self._lazy_validation_dict.get(mode):
-      return
-
-    # All following information is obtained from TPU system metadata.
-    num_cores = self.num_cores
-    num_replicas = self.num_replicas
-    num_hosts = self.num_hosts
-
-    if not num_cores:
-      tpu_system_metadata = self._get_tpu_system_metadata()
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system. Please double check '
-          'Tensorflow master address and TPU worker(s). Available devices '
-          'are {}.'.format(tpu_system_metadata.devices))
-
-    if self._config.tpu_config.num_shards:
-      user_provided_num_replicas = self._config.tpu_config.num_shards
-      if user_provided_num_replicas != num_replicas:
-        message = (
-            'TPUConfig.num_shards is not set correctly. According to TPU '
-            'system metadata for Tensorflow master ({}): num_replicas should '
-            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
-            'be the total num of TPU cores in the system. For '
-            'model-parallelism, the total number of TPU cores should be '
-            'num_cores_per_replica * num_replicas. Please set it '
-            'accordingly or leave it as `None`'.format(
-                self._get_master_address(), num_replicas,
-                user_provided_num_replicas))
-
-        raise ValueError(message)
-
-    if self._config.tpu_config.num_cores_per_replica:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
-      if num_cores_per_replica > num_cores_per_host:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the '
-            'num_cores_per_host. num_cores_per_replica: {}, '
-            'num_cores_per_host: {}'.format(num_cores_per_replica,
-                                            num_cores_per_host))
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      if (self._train_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'train batch size {} must be divisible by number of replicas {}'
-            .format(self._train_batch_size, num_replicas))
-
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      if self._eval_batch_size is None:
-        raise ValueError(
-            'eval_batch_size in TPUEstimator constructor cannot be `None`'
-            'if .evaluate is running on TPU.')
-      if (self._eval_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'eval batch size {} must be divisible by number of replicas {}'
-            .format(self._eval_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.evaluate should be running on single TPU'
-            ' instead of a Pod.')
-    else:
-      assert mode == model_fn_lib.ModeKeys.PREDICT
-      if self._predict_batch_size is None:
-        raise ValueError(
-            'predict_batch_size in TPUEstimator constructor should not be '
-            '`None` if .predict is running on TPU.')
-      if (self._predict_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'predict batch size {} must be divisible by number of replicas {}'
-            .format(self._predict_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.predict should be running on single TPU worker. '
-            'got {}.'.format(num_hosts))
-
-    # Record the state "validated" into lazy dictionary.
-    self._lazy_validation_dict[mode] = True
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    master = self.master_job
-
-    if self.model_parallelism_enabled:
-      return (self.device_assignment.host_device(
-          replica=replica_id, job=master),
-              self.device_assignment.tpu_ordinal(replica=replica_id))
-
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    num_of_replicas_per_host = self.num_of_replicas_per_host
-    host_id = replica_id / num_of_replicas_per_host
-    ordinal_id = replica_id % num_of_replicas_per_host
-
-    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
-    return (host_device, ordinal_id)
-
-
-class _OneCoreTPUContext(_InternalTPUContext):
-  """Special _InternalTPUContext for one core usage."""
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
-
-    super(_OneCoreTPUContext, self).__init__(
-        config, train_batch_size, eval_batch_size,
-        predict_batch_size, use_tpu)
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
-            num_cores=1,
-            num_hosts=1,
-            num_of_cores_per_host=1,
-            topology=None,
-            devices=[]))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-
-def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu, eval_on_tpu):
-  """Returns an instance of `_InternalTPUContext`."""
-
-  if (config.tpu_config.num_shards == 1 and
-      config.tpu_config.num_cores_per_replica is None):
-    logging.warning(
-        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
-        'Please fix as soon as possible (leaving num_shards as None.)')
-    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
-                              predict_batch_size, use_tpu)
-
-  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
-                             predict_batch_size, use_tpu, eval_on_tpu)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_context import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
index 1a909a3ac6fae79070a7762b94bfa138f93a5fb5..cb38a8f1a6bee3c2adfbefc203c1d143303c3368 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -1,10 +1,10 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,1087 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TPU embedding APIs."""
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import math
-import re
-import six
-
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.ops import gen_tpu_ops
-from tensorflow.contrib.tpu.proto import tpu_embedding_configuration_pb2 as elc
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-
-TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
-INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
-
-
-class TableConfig(
-    collections.namedtuple(
-        'TableConfig',
-        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
-  """Embedding table configuration."""
-
-  @experimental
-  def __new__(cls,
-              vocabulary_size,
-              dimension,
-              initializer=None,
-              combiner='mean'):
-    """Embedding table configuration.
-
-    Args:
-      vocabulary_size: Number of vocabulary (/rows) in the table.
-      dimension: The embedding dimension.
-      initializer: A variable initializer function to be used in embedding
-        variable initialization. If not specified, defaults to
-        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-        `1/sqrt(dimension)`.
-      combiner: A string specifying how to reduce if there are multiple entries
-        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-        with bag-of-words columns. For more information, see
-        `tf.nn.embedding_lookup_sparse`.
-
-    Returns:
-      `TableConfig`.
-
-    Raises:
-      ValueError: if `vocabulary_size` is not positive integer.
-      ValueError: if `dimension` is not positive integer.
-      ValueError: if `initializer` is specified and is not callable.
-      ValueError: if `combiner` is not supported.
-    """
-    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
-      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
-
-    if not isinstance(dimension, int) or dimension < 1:
-      raise ValueError('Invalid dimension {}.'.format(dimension))
-
-    if (initializer is not None) and (not callable(initializer)):
-      raise ValueError('initializer must be callable if specified.')
-    if initializer is None:
-      initializer = init_ops.truncated_normal_initializer(
-          mean=0.0, stddev=1 / math.sqrt(dimension))
-
-    if combiner not in ('mean', 'sum', 'sqrtn'):
-      raise ValueError('Invalid combiner {}'.format(combiner))
-
-    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
-                                           initializer, combiner)
-
-
-# TODO(shizhiw): Factor `use_gradient_accumulation` and
-# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
-class _OptimizationParameters(object):
-  """Parameters common to all optimizations."""
-
-  def __init__(self, learning_rate, use_gradient_accumulation,
-               pipeline_execution_with_tensor_core):
-    self.learning_rate = learning_rate
-    self.use_gradient_accumulation = use_gradient_accumulation
-    self.pipeline_execution_with_tensor_core = (
-        pipeline_execution_with_tensor_core)
-
-
-class AdagradParameters(_OptimizationParameters):
-  """Optimization parameters for Adagrad."""
-
-  def __init__(self, learning_rate, initial_accumulator,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adagrad.
-
-    Args:
-      learning_rate: used for updating embedding table.
-      initial_accumulator: initial accumulator for Adagrad.
-      use_gradient_accumulation: setting this to `True` makes embedding
-         gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-         for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdagradParameters, self).__init__(learning_rate,
-                                            use_gradient_accumulation,
-                                            pipeline_execution_with_tensor_core)
-    self.initial_accumulator = initial_accumulator
-
-
-class AdamParameters(_OptimizationParameters):
-  """Optimization parameters for Adam."""
-
-  def __init__(self, learning_rate,
-               beta1=0.9,
-               beta2=0.999,
-               epsilon=1e-08,
-               lazy_adam=True,
-               sum_inside_sqrt=True,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adam.
-
-    Args:
-      learning_rate: a floating point value. The learning rate.
-      beta1: A float value.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value.
-        The exponential decay rate for the 2nd moment estimates.
-      epsilon: A small constant for numerical stability.
-      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
-        Please see `optimization_parameters.proto` for details.
-      sum_inside_sqrt: This improves training speed. Please see
-        `optimization_parameters.proto` for details.
-      use_gradient_accumulation: setting this to `True` makes embedding
-        gradients calculation more accurate but slower. Please see
-        `optimization_parameters.proto` for details.
-        for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdamParameters, self).__init__(learning_rate,
-                                         use_gradient_accumulation,
-                                         pipeline_execution_with_tensor_core)
-    self.beta1 = beta1
-    self.beta2 = beta2
-    self.epsilon = epsilon
-    self.lazy_adam = lazy_adam
-    self.sum_inside_sqrt = sum_inside_sqrt
-
-
-class StochasticGradientDescentParameters(_OptimizationParameters):
-  """Optimization parameters for stochastic gradient descent.
-
-  Args:
-    learning_rate: a floating point value. The learning rate.
-    use_gradient_accumulation: setting this to `True` makes embedding
-      gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-    pipeline_execution_with_tensor_core: setting this to `True` makes training
-      faster, but trained model will be different if step N and step N+1
-      involve the same set of embedding ID. Please see
-      `tpu_embedding_configuration.proto` for details.
-    """
-
-  def __init__(self, learning_rate, use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    super(StochasticGradientDescentParameters, self).__init__(
-        learning_rate, use_gradient_accumulation,
-        pipeline_execution_with_tensor_core)
-
-
-class TPUEmbedding(object):
-  """API for using TPU for embedding.
-
-    Example:
-    ```
-    table_config_user = tpu_embedding.TableConfig(
-        vocabulary_size=4, dimension=2,
-        initializer=initializer, combiner='mean')
-    table_to_config_dict = {'video': table_config_video,
-                          'user': table_config_user}
-    feature_to_table_dict = {'watched': 'video',
-                             'favorited': 'video',
-                             'friends': 'user'}
-    batch_size = 4
-    num_hosts = 1
-    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
-    mode = tpu_embedding.TRAINING
-    embedding = tpu_embedding.TPUEmbedding(
-        table_to_config_dict, feature_to_table_dict,
-        batch_size, num_hosts, mode, optimization_parameters)
-
-    batch_size_per_core = embedding.batch_size_per_core
-    sparse_features_list = []
-    for host in hosts:
-      with ops.device(host):
-        for _ in range(embedding.num_cores_per_host):
-          sparse_features = {}
-          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
-          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
-          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
-          sparse_features_list.append(sparse_features)
-
-    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
-
-    def computation():
-      activations = embedding.get_activations()
-      loss = compute_loss(activations)
-
-      base_optimizer = gradient_descent.GradientDescentOptimizer(
-          learning_rate=1)
-      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
-          base_optimizer)
-
-      train_op = cross_shard_optimizer.minimize(loss)
-      # `train_op` and `send_gradients_op` must happen in order.
-      with ops.control_dependencies([train_op]):
-        send_gradients_op = embedding.generate_send_gradients_op()
-      with ops.control_dependencies([send_gradients_op]):
-        loss = array_ops.identity(loss)
-
-    loss = tpu.shard(computation,
-                     num_shards=embedding.num_cores)
-
-    with self.test_session() as sess:
-      sess.run(tpu.initialize_system(embedding_config=
-                                     embedding.config_proto))
-      sess.run(variables.global_variables_initializer())
-      sess.run(embedding.init_ops)
-      sess.run(enqueue_ops)
-      loss_val = sess.run(loss)
-    ```
-  """
-
-  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
-  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
-  # `FeatureConfig` could have fields other than table name. For example, it
-  # could have a field to indicate that the feature should not be used to
-  # update embedding table (cr/204852758, cr/204940540). Also, this can support
-  # different combiners for different features within the same table.
-  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
-  # to `FeatureConfig`?
-
-  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
-  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
-
-  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
-  # for-loops around construction of inputs.
-
-  # `optimization_parameter` applies to all tables. If the need arises,
-  # we can add `optimization_parameters` to `TableConfig` to override this
-  # global setting.
-  @experimental
-  def __init__(self,
-               table_to_config_dict,
-               feature_to_table_dict,
-               batch_size,
-               mode,
-               master,
-               optimization_parameters=None):
-    """API for using TPU for embedding lookups.
-
-    Args:
-      table_to_config_dict: A dictionary mapping from string of table name to
-        `TableConfig`. Table refers to an embedding table, e.g. `params`
-        argument to `tf.nn.embedding_lookup_sparse()`.
-      feature_to_table_dict: A dictionary mapping from string of feature name
-        to string of table name. Feature refers to ids to lookup in embedding
-        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
-      batch_size: An `int` representing the global batch size.
-      mode: `TRAINING` or `INFERENCE`.
-      master: A `string` representing the TensorFlow master to use.
-      optimization_parameters: `AdagradParameters`, `AdamParameters`,
-        `Stochasticgradientdescentparameters`. Must be set in training and must
-        be `None` in inference.
-
-    Raises:
-      ValueError: if any input is invalid.
-    """
-    _validate_table_to_config_dict(table_to_config_dict)
-    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
-    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
-    self._combiners = _create_combiners(self._table_to_config_dict)
-
-    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
-    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
-    self._table_to_features_dict = _create_table_to_features_dict(
-        self._feature_to_table_dict)
-
-    self._batch_size = batch_size
-
-    self._master = master
-    self._tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
-    if self._tpu_system_metadata.num_cores == 0:
-      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
-                       'TPUs.'.format(self._master))
-    self._num_hosts = self._tpu_system_metadata.num_hosts
-    self._hosts = [device.name for device in self._tpu_system_metadata.devices
-                   if 'device:CPU:' in device.name]
-    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
-    self._num_cores = self._tpu_system_metadata.num_cores
-
-    _validate_batch_size(self._batch_size, self._num_cores)
-    self._batch_size_per_core = self._batch_size // self._num_cores
-
-    self._init_ops = []
-
-    # TODO(shizhiw): remove `mode`?
-    if mode == TRAINING:
-      _validate_optimization_parameters(optimization_parameters)
-      self._optimization_parameters = optimization_parameters
-    elif mode == INFERENCE:
-      if optimization_parameters is not None:
-        raise ValueError('`optimization_parameters` should be `None` '
-                         'for inference mode.')
-      self._optimization_parameters = (
-          StochasticGradientDescentParameters(1.))
-    else:
-      raise ValueError('`mode` only supports {} and {}; got {}.'
-                       .format(TRAINING, INFERENCE, mode))
-    self._mode = mode
-
-    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
-    # and create special handler for inference that inherits from
-    # StochasticGradientDescentHandler with more user-friendly error message
-    # on get_slot().
-    self._optimizer_handler = _get_optimization_handler(
-        self._optimization_parameters)
-
-    dummy_table_variables_init_op = self._create_dummy_table_variables()
-    self._init_ops.append(dummy_table_variables_init_op)
-
-    self._config_proto = self._create_config_proto()
-
-    self._create_variables_and_ops()
-    self._init_ops.extend(self._load_parameters_ops)
-
-  @property
-  def hosts(self):
-    """A list of device names for CPU hosts.
-
-    Returns:
-      A list of device names for CPU hosts.
-    """
-    return copy.copy(self._hosts)
-
-  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
-  # to be consistent with `tpu_embedding_configuration.proto`.
-  @property
-  def num_cores_per_host(self):
-    """Number of TPU cores on a CPU host.
-
-    Returns:
-      Number of TPU cores on a CPU host.
-    """
-    return self._num_cores_per_host
-
-  @property
-  def num_cores(self):
-    """Total number of TPU cores on all hosts.
-
-    Returns:
-      Total number of TPU cores on all hosts.
-    """
-    return self._num_cores
-
-  @property
-  def batch_size_per_core(self):
-    """Batch size for each TPU core.
-
-    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
-       must have batch dimension equal to this.
-
-    Returns:
-      Batch size for each TPU core.
-    """
-    return self._batch_size_per_core
-
-  @property
-  def config_proto(self):
-    """Create embedding config proto for `tpu.initialize_system()`.
-
-    Returns:
-      an `TPUEmbeddingConfiguration` proto describing the desired
-         configuration of the hardware embedding lookup tables, which
-         is passed to `tpu.initialize_system()`.
-    """
-    return self._config_proto
-
-  @property
-  def init_ops(self):
-    """Initialization ops for TPU embedding.
-
-    It must be called after all global variables have been initialized,
-    i.e. after `global_variables_initializer()`, as it loads embedding
-    tables into TPU.
-
-    Returns:
-      A list of ops.
-    """
-    return self._init_ops
-
-  # TODO(shizhiw): get table variables the same way as getting slot variables.
-  @property
-  def table_to_table_variables_dict(self):
-    return copy.copy(self._table_to_table_variables_dict)
-
-  def get_slot_names(self):
-    """Return a list of the names of slots created by `TPUEmbedding`."""
-    return self._optimizer_handler.get_slot_names()
-
-  def get_slot(self, table, name):
-    """Return a slot named `name` create for `table` by `TPUEmbedding`."""
-    return self._optimizer_handler.get_slot(table, name)
-
-  # TODO(shizhiw): expose load to user too?
-  @property
-  def retrieve_parameters_ops(self):
-    return self._retrieve_parameters_ops
-
-  def _create_config_proto(self):
-    """Create `TPUEmbeddingConfiguration`."""
-    config_proto = elc.TPUEmbeddingConfiguration()
-    for table in self._table_to_config_dict:
-      table_descriptor = config_proto.table_descriptor.add()
-      table_descriptor.name = table
-
-      table_config = self._table_to_config_dict[table]
-      table_descriptor.vocabulary_size = table_config.vocabulary_size
-      table_descriptor.dimension = table_config.dimension
-
-      features_for_table = self._table_to_features_dict[table]
-      table_descriptor.num_features = len(features_for_table)
-
-      table_descriptor.optimization_parameters.learning_rate.constant = (
-          self._optimization_parameters.learning_rate)
-      table_descriptor.optimization_parameters.use_gradient_accumulation = (
-          self._optimization_parameters.use_gradient_accumulation)
-      self._optimizer_handler.set_optimization_parameters(table_descriptor)
-
-    config_proto.mode = self._mode
-    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
-    config_proto.num_hosts = self._num_hosts
-    config_proto.num_tensor_cores = self._num_cores
-    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
-    config_proto.pipeline_execution_with_tensor_core = (
-        self._optimization_parameters.pipeline_execution_with_tensor_core)
-
-    return config_proto
-
-  def _create_variables_and_ops(self):
-    """Create embedding variables and return ops to load them into TPU."""
-    self._load_parameters_ops = []
-    self._retrieve_parameters_ops = []
-    self._table_to_table_variables_dict = {}
-    for table in self._table_to_config_dict:
-      device_fn = _create_device_fn(self._hosts)
-      with ops.device(device_fn):
-        # TODO(shizhiw): allow user to specify variable name so that
-        # they could make the name consistent with CPU etc.
-        variable_name = table
-        table_variables = _create_partitioned_variables(
-            name=variable_name,
-            num_hosts=self._num_hosts,
-            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
-            embedding_dimension=self._table_to_config_dict[table].dimension,
-            initializer=self._table_to_config_dict[table].initializer,
-            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
-        self._table_to_table_variables_dict[table] = table_variables
-
-        self._optimizer_handler.create_variables_and_ops(
-            table, variable_name, self._num_hosts,
-            self._table_to_config_dict[table], table_variables,
-            self._load_parameters_ops, self._retrieve_parameters_ops)
-
-  def _create_dummy_table_variables(self):
-    """Create dummy embedding table variables.
-
-    The sole purpose of these dummy variables are to trigger gradient
-    calcuation wrt them so that the gradients wrt activation can be captured
-    and later sent to TPU embedding.
-
-    Returns:
-      Initializer for these variables.
-
-    Raises:
-      RuntimeError: if collection to store gradients already exists and is not
-      empty.
-    """
-    self._dummy_table_variables = []
-    # TODO(shizhiw): remove table id.
-    for table_id, table in enumerate(self._table_to_features_dict):
-      self._dummy_table_variables.append(
-          variable_scope.get_variable(
-              'tpu_embedding_dummy_table_variable_%s' % table,
-              dtype=dtypes.float32,
-              shape=[1],
-              use_resource=True,
-              trainable=True,
-              # TODO(shizhiw): Remove these dummy variables as
-              # tensorflow optimizer creates slot variable for them which
-              # is undesirable.
-              # e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1}.
-              # Explicitly specifying collections prevents this variable from
-              # being added to the GLOBAL_VARIABLES collection, so that Saver()
-              # ignores it.
-              collections=['tpu_embedding_dummy_table_variables']))
-
-      g = ops.get_default_graph()
-      table_gradients = g.get_collection_ref(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if table_gradients:
-        raise RuntimeError(
-            'tpu_embedding_gradients_table_%d is not empty.' % table_id)
-      table_gradients.extend([None] * len(self._table_to_features_dict[table]))
-
-    return variables.variables_initializer(
-        self._dummy_table_variables,
-        name='tpu_embedding_dummy_table_variables_init')
-
-  def generate_enqueue_ops(self, sparse_features_list):
-    """Generate enqueue ops.
-
-    Args:
-      sparse_features_list: a list of dictionary mapping from string
-        of feature names to sparse tensor. Each dictionary is for one
-        TPU core. Dictionaries for the same core should be contiguous
-        on the list.
-
-    Returns:
-      Ops to enqueue to TPU for embedding.
-    """
-    self._validate_generate_enqueue_ops_sparse_features_list(
-        sparse_features_list)
-    return [
-        self._generate_enqueue_op(
-            sparse_features, device_ordinal=i % self._num_cores_per_host)
-        for i, sparse_features in enumerate(sparse_features_list)
-    ]
-
-  def _validate_generate_enqueue_ops_sparse_features_list(
-      self, sparse_features_list):
-    """Validate `sparse_features_list`."""
-    if len(sparse_features_list) != self._num_cores:
-      raise ValueError('Length of `sparse_features_list` should match the '
-                       'number of cores; '
-                       '`len(sparse_features_list)` is {}, '
-                       'number of cores is {}.'.format(
-                           len(sparse_features_list), self._num_cores))
-
-    feature_set = set(self._feature_to_table_dict.keys())
-    contiguous_device = None
-    for i, sparse_features in enumerate(sparse_features_list):
-      used_feature_set = set(sparse_features.keys())
-
-      # Check features are valid.
-      missing_feature_set = feature_set - used_feature_set
-      if missing_feature_set:
-        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, missing_feature_set))
-
-      extra_feature_set = used_feature_set - feature_set
-      if extra_feature_set:
-        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, extra_feature_set))
-
-      device = None
-      device_feature = None
-      for feature, tensor in six.iteritems(sparse_features):
-        if not isinstance(tensor, sparse_tensor.SparseTensor):
-          raise ValueError('`sparse_features_list[{}]` has a feature that is '
-                           'not mapped to `SparseTensor`. '
-                           '`feature`: {}, type: {}'.format(
-                               i, feature, type(tensor)))
-
-        # Check all features are on the same device.
-        if device is None:
-          device = tensor.op.device
-          device_feature = feature
-        else:
-          if device != tensor.op.device:
-            raise ValueError('Devices are different between features in '
-                             '`sparse_features_list[{}]`; '
-                             'devices: {}, {}; features: {}, {}.'.format(
-                                 i, device, tensor.op.device, feature,
-                                 device_feature))
-
-      if i % self._num_cores_per_host:
-        if device != contiguous_device:
-          raise ValueError('We expect the `sparse_features` which are on the '
-                           'same host to be contiguous in '
-                           '`sparse_features_list`, '
-                           '`sparse_features_list[{}]` is on device {}, '
-                           'but is expected to be on device {}.'.format(
-                               i, device, contiguous_device))
-      else:
-        contiguous_device = device
-
-  def _generate_enqueue_op(self, sparse_features, device_ordinal):
-    with ops.colocate_with(list(sparse_features.values())[0]):
-      sample_idcs, embedding_idcs, aggregation_weights = (
-          self._format_for_tpu_embedding_sparse_batch(sparse_features))
-      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
-          sample_idcs,
-          embedding_idcs,
-          aggregation_weights,
-          combiners=self._combiners,
-          device_ordinal=device_ordinal)
-
-  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
-    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
-
-    Args:
-      sparse_features: a `Dict` of `SparseTensor`s for embedding.
-
-    Returns:
-      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
-    """
-
-    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
-    for table in self._table_to_features_dict:
-      sample_t, indices_t, weights_t = list(), list(), list()
-
-      features = self._table_to_features_dict[table]
-      for i, feature in enumerate(features):
-        tensor = sparse_features[feature]
-        sample_indices = tensor.indices[:, 0]
-        embedding_indices = tensor.values
-        weights = array_ops.ones_like(embedding_indices)
-        sample_t.append(i * self._batch_size_per_core + sample_indices)
-        indices_t.append(embedding_indices)
-        weights_t.append(weights)
-
-      sample_idcs.append(
-          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
-      embedding_idcs.append(
-          math_ops.cast(
-              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
-      aggregation_weights.append(
-          math_ops.cast(
-              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
-
-    return sample_idcs, embedding_idcs, aggregation_weights
-
-  def get_activations(self):
-    """Get activations for features.
-
-    This should be called within `computation` that is passed to
-      `tpu.replicate` and friends.
-
-    Returns:
-      A dictionary mapping from `String` of feature name to `Tensor`
-        of activation.
-    """
-    recv_activations = tpu_ops.recv_tpu_embedding_activations(
-        num_outputs=len(self._table_to_config_dict),
-        config=self._config_proto.SerializeToString())
-
-    activations = collections.OrderedDict()
-    for table_id, table in enumerate(self._table_to_features_dict):
-      features = self._table_to_features_dict[table]
-      for lookup_id, feature in enumerate(features):
-        start_row = lookup_id * self._batch_size_per_core
-        end_row = start_row + self._batch_size_per_core
-        activations[feature] = gen_tpu_ops.tpu_embedding_activations(
-            self._dummy_table_variables[table_id],
-            recv_activations[table_id][start_row:end_row, :],
-            table_id=table_id,
-            lookup_id=lookup_id)
-    return activations
-
-  # TODO(shizhiw): Make `gradient_multiplier` per feature. Setting it to 0 would
-  # have the effect of `tf.stop_gradients()`.
-  # TODO(shizhiw): Consider alternative ways to capture gradients wrt embedding
-  # layer outputs to remove `_dummy_table_variables`,
-  # `_embedding_activation_grad` and `tpu_embedding_gradients_table_%d'.
-  def generate_send_gradients_op(self, gradient_multipliers=None):
-    """Retrieve gradients from collections and send them to TPU embedding.
-
-    Args:
-      gradient_multipliers: None, or dict mapping table names to gradient
-        multiplier Tensors.
-
-    Returns:
-      SendTPUEmbeddingGradients Op.
-
-    Raises:
-      ValueError: If required gradients have not been defined.
-      RuntimeError: If `mode` is not `TRAINING`.
-    """
-    if self._mode != TRAINING:
-      raise RuntimeError('Only in training mode gradients need to '
-                         'be sent to TPU embedding; got mode {}.'
-                         .format(self._mode))
-
-    g = ops.get_default_graph()
-    gradients = list()
-    for table_id, table in enumerate(self._table_to_config_dict):
-      table_gradients = g.get_collection(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if any(gradient is None for gradient in table_gradients):
-        raise ValueError(
-            'Table {}/{} has undefined gradients: this is probably because the '
-            'model asked TPUEmbedding to compute activations that were not '
-            'used.'.format(table_id, table))
-      concat_table_grads = array_ops.concat(table_gradients, axis=0)
-      if gradient_multipliers is not None:
-        concat_table_grads *= gradient_multipliers[table.name]
-      gradients.append(concat_table_grads)
-
-    return tpu_ops.send_tpu_embedding_gradients(
-        inputs=gradients, config=self.config_proto.SerializeToString())
-
-
-def _validate_table_to_config_dict(table_to_config_dict):
-  """Validate `table_to_config_dict`."""
-  for k, v in six.iteritems(table_to_config_dict):
-    if not isinstance(v, TableConfig):
-      raise ValueError('Value of `table_to_config_dict` must be of type '
-                       '`TableConfig`, got {} for {}.'.format(type(v), k))
-
-
-def _validate_feature_to_table_dict(table_to_config_dict,
-                                    feature_to_table_dict):
-  """Validate `feature_to_table_dict`."""
-  used_table_set = set(feature_to_table_dict.values())
-  table_set = set(table_to_config_dict.keys())
-
-  unused_table_set = table_set - used_table_set
-  if unused_table_set:
-    raise ValueError('`table_to_config_dict` specifies table that is not '
-                     'used in `feature_to_table_dict`: {}.'
-                     .format(unused_table_set))
-
-  extra_table_set = used_table_set - table_set
-  if extra_table_set:
-    raise ValueError('`feature_to_table_dict` refers to a table that is not '
-                     'specified in `table_to_config_dict`: {}.'
-                     .format(extra_table_set))
-
-
-def _validate_batch_size(batch_size, num_cores):
-  if batch_size % num_cores:
-    raise ValueError('`batch_size` is not a multiple of number of '
-                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
-                         batch_size, num_cores))
-
-
-def _validate_optimization_parameters(optimization_parameters):
-  if not isinstance(optimization_parameters, _OptimizationParameters):
-    raise ValueError('`optimization_parameters` must inherit from '
-                     '`_OptimizationPramaters`. '
-                     '`type(optimization_parameters)`={}'.format(
-                         type(optimization_parameters)))
-
-
-class _OptimizerHandler(object):
-  """Interface class for handling optimizer specific logic."""
-
-  def __init__(self, optimization_parameters):
-    self._optimization_parameters = optimization_parameters
-
-  def set_optimization_parameters(self, table_descriptor):
-    raise NotImplementedError()
-
-  def create_variables_and_ops(self, table, variable_name):
-    raise NotImplementedError()
-
-  def get_slot_names(self):
-    raise NotImplementedError()
-
-  def get_slot(self, table, name):
-    raise NotImplementedError()
-
-
-class _AdagradHandler(_OptimizerHandler):
-  """Handles Adagrad specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdagradHandler, self).__init__(optimization_parameters)
-    self._table_to_accumulator_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adagrad.SetInParent()
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    optimizer_name = 'Adagrad'
-    accumulator_initializer = init_ops.constant_initializer(
-        self._optimization_parameters.initial_accumulator)
-    accumulator_variables = _create_partitioned_variables(
-        name='%s/%s' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=accumulator_initializer)
-
-    self._table_to_accumulator_variables_dict[table] = accumulator_variables
-    for host_id, table_variable, accumulator_variable in (zip(
-        range(num_hosts), table_variables, accumulator_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adagrad_parameters(
-                parameters=table_variable,
-                accumulators=accumulator_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_accumulator = (
-            tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(accumulator_variable, retrieved_accumulator))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return ['accumulator']
-
-  def get_slot(self, table, name):
-    if name not in self.get_slot_names():
-      raise ValueError('Adagrad has {} as slot names; got {}.'
-                       .format(self.get_slot_names(), name))
-    return self._table_to_accumulator_variables_dict[table]
-
-
-class _AdamHandler(_OptimizerHandler):
-  """Handles Adam specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdamHandler, self).__init__(optimization_parameters)
-    self._table_to_m_variables_dict = {}
-    self._table_to_v_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adam.beta1 = (
-        self._optimization_parameters.beta1)
-    table_descriptor.optimization_parameters.adam.beta2 = (
-        self._optimization_parameters.beta2)
-    table_descriptor.optimization_parameters.adam.epsilon = (
-        self._optimization_parameters.epsilon)
-    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
-        not self._optimization_parameters.lazy_adam)
-    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
-        self._optimization_parameters.sum_inside_sqrt)
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    optimizer_name = 'Adam'
-    m_initializer = init_ops.zeros_initializer()
-    m_variables = _create_partitioned_variables(
-        name='%s/%s/m' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=m_initializer)
-    v_initializer = init_ops.zeros_initializer()
-    v_variables = _create_partitioned_variables(
-        name='%s/%s/v' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=v_initializer)
-
-    self._table_to_m_variables_dict[table] = m_variables
-    self._table_to_v_variables_dict[table] = v_variables
-
-    for host_id, table_variable, m_variable, v_variable in (zip(
-        range(num_hosts), table_variables,
-        m_variables, v_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adam_parameters(
-                parameters=table_variable,
-                momenta=m_variable,
-                velocities=v_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_m, retrieved_v = (
-            tpu_ops.retrieve_tpu_embedding_adam_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(m_variable, retrieved_m),
-            state_ops.assign(v_variable, retrieved_v))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return ['m', 'v']
-
-  def get_slot(self, table, name):
-    if name == 'm':
-      return self._table_to_m_variables_dict[table]
-    elif name == 'v':
-      return self._table_to_v_variables_dict[table]
-    else:
-      raise ValueError('Adam has {} as slot names; got {}.'
-                       .format(self.get_slot_names(), name))
-
-
-class _StochasticGradientDescentHandler(_OptimizerHandler):
-  """Handles stochastic gradient descent specific logic."""
-
-  def set_optimization_parameters(self, table_descriptor):
-    (table_descriptor.optimization_parameters.stochastic_gradient_descent
-     .SetInParent())
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    del table_config
-
-    for host_id, table_variable in (zip(
-        range(num_hosts), table_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops
-            .load_tpu_embedding_stochastic_gradient_descent_parameters(
-                parameters=table_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table = (
-            tpu_ops
-            .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return []
-
-  def get_slot(self, table, name):
-    raise ValueError('Stochastic gradient descent does not have slot variable.')
-
-
-def _get_optimization_handler(optimization_parameters):
-  if isinstance(optimization_parameters, AdagradParameters):
-    return _AdagradHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, AdamParameters):
-    return _AdamHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
-    return _StochasticGradientDescentHandler(optimization_parameters)
-  else:
-    return NotImplementedError()
-
-
-def _create_ordered_dict(d):
-  """Create an OrderedDict from Dict."""
-  return collections.OrderedDict((k, d[k]) for k in sorted(d))
-
-
-def _create_combiners(table_to_config_dict):
-  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
-
-
-def _create_table_to_features_dict(feature_to_table_dict):
-  """Create mapping from table to a list of its features."""
-  table_to_features_dict_tmp = {}
-  for feature, table in six.iteritems(feature_to_table_dict):
-    if table in table_to_features_dict_tmp:
-      table_to_features_dict_tmp[table].append(feature)
-    else:
-      table_to_features_dict_tmp[table] = [feature]
-
-  table_to_features_dict = collections.OrderedDict()
-  for table in sorted(table_to_features_dict_tmp):
-    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
-  return table_to_features_dict
-
-
-def _create_device_fn(hosts):
-  """Create device_fn() to use with _create_partitioned_variables()."""
-
-  def device_fn(op):
-    """Returns the `device` for `op`."""
-    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
-
-    if part_match:
-      idx = int(part_match.group(1))
-    else:
-      raise RuntimeError('Internal Error: '
-                         'Expected %s to contain /part_*.' % op.name)
-
-    device = hosts[idx]
-    return device
-
-  return device_fn
-
-
-def _create_partitioned_variables(name,
-                                  num_hosts,
-                                  vocabulary_size,
-                                  embedding_dimension,
-                                  initializer,
-                                  collections=None):  # pylint: disable=redefined-outer-name
-  """Creates ParitionedVariables based on `num_hosts` for `table`."""
-  # TODO(shizhiw): automatically place embedding lookup elsewhere?
-  if vocabulary_size < num_hosts:
-    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
-                     'As TPU embedding is not optimized for small tables, '
-                     'please consider other ways for this embedding lookup.')
-
-  return list(variable_scope.get_variable(
-      name,
-      shape=(vocabulary_size, embedding_dimension),
-      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
-      dtype=dtypes.float32,
-      initializer=initializer,
-      collections=collections,
-      trainable=False))
-
-
-@ops.RegisterGradient('TPUEmbeddingActivations')
-def _embedding_activations_grad(activations_op, grad_wrt_activations):
-  """Saves the gradient of embedding activations ops in a graph collection."""
-  g = ops.get_default_graph()
-  table_id = activations_op.get_attr('table_id')
-  lookup_id = activations_op.get_attr('lookup_id')
-  table_gradients = g.get_collection_ref(
-      'tpu_embedding_gradients_table_%d' % table_id)
-
-  if not table_gradients:
-    raise RuntimeError(
-        'Gradients for TPUEmbedding have been generated in non-training mode. '
-        'This is not expected. Consider putting your Optimizer.minimize code '
-        'behind the training mode condition check. For Estimator, you can '
-        'do \n\n'
-        '    if mode == tf.estimator.ModeKeys.TRAIN:\n'
-        '        train_op = opt.minimize(loss)\n'
-        '\n')
-
-  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
-  return [
-      # RegisterGradient requires that value be returned for all inputs. Since
-      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
-      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
-      # embedding activations (grad_wrt_activations) has the same shape as the
-      # activations returned by  embedding_activations.
-      array_ops.zeros(arg.shape, dtype=dtypes.float32)
-      for arg in activations_op.inputs
-  ]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..308adc77e9ad2d912d0461512655b55faa53da60
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding_gradient import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 27c8ccf6bd7e77743dabbd329cea01bc45697c52..893118412e1363ce50416e6ef36692bc23d04179 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1,3655 +1,33 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPUEstimator class."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import os
-import signal
-import sys
-import threading
-import time
-
-import numpy as np
-import six
-from six.moves import queue as Queue  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.ops import gen_tpu_ordinal_selector_op
-from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import error_handling
-from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional
-from tensorflow.contrib.tpu.python.tpu import session_support
-from tensorflow.contrib.tpu.python.tpu import tensor_tracer
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_context
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.contrib.training.python.training import hparam
-from tensorflow.core.framework import variable_pb2
-from tensorflow.core.framework.summary_pb2 import Summary
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest as data_nest
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as contrib_summary
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import evaluation
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-from tensorflow.python.util import function_utils
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
-
-_INITIAL_LOSS = 1e7
-_ZERO_LOSS = 0.
-_TPU_ESTIMATOR = 'tpu_estimator'
-_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
-_BATCH_SIZE_KEY = 'batch_size'
-_CTX_KEY = 'context'
-_USE_TPU_KEY = 'use_tpu'
-_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
-_ONE_GIGABYTE = 1024 * 1024 * 1024
-_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
-_TPU_TRAIN_OP = '_tpu_train_op'
-_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
-_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
-
-# Ideally _USE_TPU_KEY should be reserved as well. However there are already
-# models that make use of this key, thus it can not be reserved now to prevent
-# breakage. In the long run, we would like to mitigate this by migrating models
-# off of using _USE_TPU_KEY.
-_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
-
-# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
-# only used for per-core based deployments. For per-host based pipelines, if a
-# user returns a Dataset instance it will be automatically wrapped in a
-# tf.while_loop (This can be disabled by returning features and labels
-# explicitly).
-_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
-
-ops.register_proto_function(
-    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
-    proto_type=variable_pb2.VariableDef,
-    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
-    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
-
-
-def _is_iterable(obj):
-  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
-  try:
-    iter(obj)
-    return True
-  except TypeError:
-    return False
-
-
-class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
-
-  def AddOp(self, op):
-    if op.type in [
-        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
-        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
-    ]:
-      raise ValueError('Use tf.contrib.summary inside of host_calls.')
-
-
-def _create_global_step(graph):
-  graph = graph or ops.get_default_graph()
-  if training.get_global_step(graph) is not None:
-    raise ValueError('"global_step" already exists.')
-  # Create in proper graph and base name_scope.
-  with graph.as_default() as g, g.name_scope(None):
-    return variable_scope.get_variable(
-        ops.GraphKeys.GLOBAL_STEP,
-        shape=[],
-        dtype=dtypes.int64,
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
-
-
-def _create_or_get_iterations_per_loop():
-  """Creates or gets the iterations_per_loop variable.
-
-  In TPUEstimator, the user provided computation, the model_fn, is wrapped
-  inside a tf.while_loop for peak performance. The iterations of the loop are
-  specified by this variable, which adjusts its value on the CPU after each TPU
-  program execution and before the next TPU execution.
-
-  The purpose of using a variable, rather then a constant, is to allow
-  TPUEstimator adapt the TPU training iterations according to the final steps
-  specified by users. For example, if the user sets the iterations_per_loop as 4
-  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
-  variable will have the following value before each TPU training.
-
-      - 1-th TPU execution: iterations_per_loop = 4
-      - 2-th TPU execution: iterations_per_loop = 4
-      - 3-th TPU execution: iterations_per_loop = 2
-
-  As model_fn increases the global step once per train_op invocation, the global
-  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
-  users.
-
-  Returns:
-    A TF non-trainable resource variable.
-
-  Raises:
-    RuntimeError: If multi iterations_per_loop variables were found.
-  """
-  graph = ops.get_default_graph()
-  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
-  iter_vars = graph.get_collection(collection_name)
-  if len(iter_vars) == 1:
-    return iter_vars[0]
-  elif len(iter_vars) > 1:
-    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
-
-  with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(
-        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
-      return variable_scope.get_variable(
-          _ITERATIONS_PER_LOOP_VAR,
-          initializer=init_ops.zeros_initializer(),
-          shape=[],
-          dtype=dtypes.int32,
-          trainable=False,
-          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
-          use_resource=True)
-
-
-def _sync_variables_ops(ctx):
-  """Create varriables synchronization ops.
-
-  Gets the variables back from TPU nodes. This means the variables updated
-  by TPU will now be *synced* to host memory.
-  In BROADCAST mode, we skip this sync since the variables are ususally too
-  big to transmit via RPC.
-
-  Args:
-    ctx: A `_InternalTPUContext` instance with mode.
-
-  Returns:
-    A list of sync ops.
-  """
-
-  if not ctx.is_input_broadcast_with_iterators():
-    return [
-        array_ops.check_numerics(v.read_value(),
-                                 'Gradient for %s is NaN' % v.name).op
-        for v in variables.trainable_variables()
-    ]
-  else:
-    return [control_flow_ops.no_op()]
-
-
-def _increase_eval_step_op(iterations_per_loop):
-  """Returns an op to increase the eval step for TPU evaluation.
-
-  Args:
-    iterations_per_loop: Tensor. The number of eval steps running in TPU system
-      before returning to CPU host for each `Session.run`.
-
-  Returns:
-    An operation
-  """
-  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
-  # Estimator evaluate increases 1 by default. So, we increase the difference.
-  return state_ops.assign_add(
-      eval_step,
-      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
-      use_locking=True)
-
-
-def _extract_key_names(tensor_or_dict):
-  if isinstance(tensor_or_dict, dict):
-    return sorted(tensor_or_dict.keys())
-  return []
-
-
-class _SIGNAL(object):
-  """Signal used to control the thread of infeed/outfeed.
-
-  All preserved signals must be negative numbers. Positive numbers are used to
-  indicate the number of iterations for next training/evaluation loop.
-  """
-  NEXT_BATCH = -1
-  STOP = -2
-
-
-class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
-
-  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
-  `export_outputs`.
-
-  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
-  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
-  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
-  To be precise, TPU evaluation expects a slightly different signature from the
-  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
-  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
-  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
-  `tensors` usually specify the model logits, which are transferred back from
-  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
-  size is the first dimension. Once all tensors are available at CPU host from
-  all shards, they are concatenated (on CPU) and passed as positional arguments
-  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
-  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
-  name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
-  `eval_metrics`.
-
-  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
-  function should not capture any Tensors in `model_fn`.
-
-  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
-  to pass to that function and returns a list of Tensors. `host_call` currently
-  works for train() and evaluate(). The Tensors returned by the function is
-  executed on the CPU on every step, so there is communication overhead when
-  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
-  size of the tensors. The `tensors` are concatenated along their major (batch)
-  dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with `tf.contrib.summary.create_file_writer`.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metrics=None,
-              export_outputs=None,
-              scaffold_fn=None,
-              host_call=None,
-              training_hooks=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a validated `TPUEstimatorSpec` instance."""
-    host_calls = {}
-    if eval_metrics is not None:
-      host_calls['eval_metrics'] = eval_metrics
-    if host_call is not None:
-      host_calls['host_call'] = host_call
-    _OutfeedHostCall.validate(host_calls)
-
-    training_hooks = tuple(training_hooks or [])
-    evaluation_hooks = tuple(evaluation_hooks or [])
-    prediction_hooks = tuple(prediction_hooks or [])
-
-    for hook in training_hooks + evaluation_hooks + prediction_hooks:
-      if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
-                        .format(hook))
-
-    return super(TPUEstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metrics=eval_metrics,
-        export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn,
-        host_call=host_call,
-        training_hooks=training_hooks,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def as_estimator_spec(self):
-    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    host_calls = {}
-    if self.eval_metrics is not None:
-      host_calls['eval_metrics'] = self.eval_metrics
-    if self.host_call is not None:
-      host_calls['host_call'] = self.host_call
-    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
-    eval_metric_ops = None
-    if self.eval_metrics is not None:
-      eval_metric_ops = host_call_ret['eval_metrics']
-    hooks = None
-    if self.host_call is not None:
-      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    if tensor_tracer.TensorTracer.is_enabled():
-      tt = tensor_tracer.TensorTracer()
-      tracing_calls = tt.trace_cpu(ops.get_default_graph())
-      tracing_call_ret = _OutfeedHostCall.create_cpu_hostcall(tracing_calls)
-      tracing_functions = tracing_call_ret.values()
-      if tracing_functions:
-        if hooks:
-          hooks.extend([_OutfeedHostCallHook(tracing_functions)])
-        else:
-          hooks = [_OutfeedHostCallHook(tracing_functions)]
-    hooks = tuple(hooks or [])
-    scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(
-        mode=self.mode,
-        predictions=self.predictions,
-        loss=self.loss,
-        train_op=self.train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=self.export_outputs,
-        scaffold=scaffold,
-        training_hooks=self.training_hooks + hooks,
-        evaluation_hooks=self.evaluation_hooks + hooks,
-        prediction_hooks=self.prediction_hooks + hooks)
-
-
-class _OpQueueContext(object):
-  """Manages work queue and thread for a infeed/outfeed thread."""
-
-  def __init__(self, name, target, args):
-    self._name = name
-    self._queue = Queue.Queue()
-    args = (self,) + args
-    self._thread = threading.Thread(name=name, target=target, args=args)
-    self._thread.daemon = True
-    self._thread.start()
-
-  def stop(self):
-    self._queue.put(_SIGNAL.STOP)
-
-  def send_next_batch_signal(self, iterations):
-    self._queue.put(iterations)
-
-  def read_iteration_counts(self):
-    while True:
-      iterations = self._queue.get(block=True)
-      logging.debug('%s read iterations %s', self._name, iterations)
-      if iterations == _SIGNAL.STOP:
-        logging.info('%s received shutdown signal, stopping.', self._name)
-        return
-      yield iterations
-
-  def join(self):
-    logging.info('Shutting down %s thread.', self._name)
-    self.stop()
-    self._thread.join()
-
-
-class _OpSignalOnceQueueContext(_OpQueueContext):
-  """Manages work queue and thread for a infeed/outfeed thread.
-
-  This subclass only signals once.
-  """
-
-  def __init__(self, name, target, args):
-    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
-    self._has_signaled = False
-
-  def send_next_batch_signal(self, iterations):
-    if not self._has_signaled:
-      self._queue.put(iterations)
-      self._has_signaled = True
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
-
-  def __init__(self,
-               ctx,
-               enqueue_ops,
-               dequeue_ops,
-               tpu_compile_op,
-               run_infeed_loop_on_coordinator=True,
-               rendezvous=None,
-               master=None,
-               session_config=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._rendezvous = rendezvous
-    self._master = master
-    self._session_config = session_config
-    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
-
-    self._feed_error = None
-    self._finished = False
-    self._should_initialize_tpu = True
-    self._tpu_compile_op = tpu_compile_op
-
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_ops = []
-    if self._should_initialize_tpu:
-      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
-    else:
-      self._finalize_ops = []
-
-    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
-    self._init_ops.extend(summary_writer_init_ops)
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    for op in summary_writer_init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def _run_infeed(self, queue_ctx, session):
-    logging.info('Starting infeed thread controller.')
-    if self._initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   self._initial_infeed_sleep_secs)
-      time.sleep(self._initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-
-    with self._rendezvous.catch_errors(source='infeed', session=session):
-      if self._run_infeed_loop_on_coordinator:
-        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-          for i in xrange(steps):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(self._enqueue_ops)
-      else:
-        for _ in queue_ctx.read_iteration_counts():
-          session.run(self._enqueue_ops)
-      logging.info('Infeed thread finished, shutting down.')
-
-  def _run_outfeed(self, queue_ctx, session):
-    logging.info('Starting outfeed thread controller.')
-    with self._rendezvous.catch_errors(source='outfeed', session=session):
-      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-        for i in xrange(steps):
-          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-          session.run(self._dequeue_ops)
-      logging.info('Outfeed thread finished, shutting down.')
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpQueueContext(name=name, target=target, args=args)
-
-  def _assertCompilationSucceeded(self, result, coord):
-    proto = tpu_compilation_result.CompilationResultProto()
-    proto.ParseFromString(result)
-    if proto.status_error_message:
-      logging.error('Compilation failed: {}'.format(proto.status_error_message))
-      coord.request_stop()
-    else:
-      logging.info('Compilation succeeded')
-
-  def after_create_session(self, session, coord):
-    if self._should_initialize_tpu:
-      logging.info('Init TPU system')
-      start = time.time()
-      with ops.Graph().as_default():
-        with tf_session.Session(
-            self._master, config=self._session_config) as sess:
-          sess.run(tpu.initialize_system(job=self._master_job))
-      logging.info('Initialized TPU in %d seconds', time.time() - start)
-
-    session.run(self._init_ops,
-                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-
-    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
-      logging.info('Compiling user program: this may take a while...')
-      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
-
-    self._infeed_controller = self._create_infeed_controller(
-        name='InfeedController', target=self._run_infeed, args=(session,))
-
-    self._outfeed_controller = _OpQueueContext(
-        name='OutfeedController', target=self._run_outfeed, args=(session,))
-
-    # Enable the worker watchdog to terminate workers on coordinator exit.
-    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
-    if watchdog_timeout > 0:
-      session_support.start_worker_watchdog(session,
-                                            shutdown_timeout=watchdog_timeout)
-
-  def before_run(self, run_context):
-    self._feed_error = None
-
-    iterations = run_context.session.run(self._iterations_per_loop_var)
-
-    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
-    self._infeed_controller.send_next_batch_signal(iterations)
-
-    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
-                 iterations)
-    self._outfeed_controller.send_next_batch_signal(iterations)
-
-  def end(self, session):
-    self._finished = True
-    logging.info('Stop infeed thread controller')
-    self._infeed_controller.join()
-    self._rendezvous.record_done('infeed')
-
-    logging.info('Stop output thread controller')
-    self._outfeed_controller.join()
-    self._rendezvous.record_done('outfeed')
-
-    logging.info('Shutdown TPU system.')
-    session.run(self._finalize_ops)
-
-
-class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
-
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
-               rendezvous=None, master=None, session_config=None):
-    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx,
-        enqueue_ops,
-        dequeue_ops,
-        tpu_compile_op=tpu_compile_op,
-        run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous,
-        master=master,
-        session_config=session_config)
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
-
-
-class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step.
-
-  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
-  following differences for TPU training:
-
-  1. This hook sets the variable for iterations_per_loop, which is used by
-     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
-     As the hook execution order is not guaranteed, the variable update is
-     handled in `after_create_session` and `after_run` as
-     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
-
-  2. For each training loop (session.run), the global step could be increased
-     multiple times on TPU. The global step tensor value will be explicitly read
-     again in `after_run` to ensure the latest value is retrieved to avoid race
-     condition.
-  """
-
-  def __init__(self, iterations, num_steps=None, last_step=None):
-    """Initializes a `StopAtStepHook`.
-
-    Args:
-      iterations: The number of iterations to run optimizer per training loop.
-      num_steps: Number of steps to execute.
-      last_step: Step after which to stop.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
-    if num_steps is None and last_step is None:
-      raise ValueError('One of num_steps or last_step must be specified.')
-    if num_steps is not None and last_step is not None:
-      raise ValueError('Only one of num_steps or last_step can be specified.')
-    self._num_steps = num_steps
-    self._last_step = last_step
-    self._iterations = iterations
-
-  def _next_iterations(self, global_step, last_step):
-    gap = last_step - global_step
-    return min(gap, self._iterations)
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError('Global step should be created.')
-
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-    if self._last_step is None:
-      self._last_step = global_step + self._num_steps
-
-    iterations = self._next_iterations(global_step, self._last_step)
-
-    self._iterations_per_loop_var.load(iterations, session=session)
-
-  def after_run(self, run_context, run_values):
-    # Global step cannot be retrieved via SessionRunArgs and before_run due to
-    # race condition.
-    global_step = run_context.session.run(self._global_step_tensor)
-    if global_step >= self._last_step:
-      run_context.request_stop()
-    else:
-      iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(
-          iterations, session=run_context.session)
-
-
-class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step."""
-
-  def __init__(self, num_steps):
-    """Initializes a `_SetEvalIterationsHook`.
-
-    Args:
-      num_steps: Number of steps to execute.
-    """
-    self._num_steps = num_steps
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    self._iterations_per_loop_var.load(self._num_steps, session=session)
-
-
-class _StoppingPredictHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop according to the stopping signal in prediction."""
-
-  def __init__(self, scalar_stopping_signal):
-    self._scalar_stopping_signal = scalar_stopping_signal
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
-    # in side threads for prediction model. But it makes the
-    # TPUInfeedOutfeedSessionHook prints nice message.
-    self._iterations_per_loop_var.load(1, session=session)
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-    scalar_stopping_signal = run_values.results
-    if _StopSignals.should_stop(scalar_stopping_signal):
-      # NOTE(xiejw): In prediction, stopping signals are inserted for each
-      # batch. And we append one more batch to signal the system it should stop.
-      # The data flow might look like
-      #
-      #  batch   0: images, labels, stop = 0  (user provided)
-      #  batch   1: images, labels, stop = 0  (user provided)
-      #  ...
-      #  batch  99: images, labels, stop = 0  (user provided)
-      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
-      #
-      # where the final batch (id = 100) is appended by TPUEstimator, so we
-      # should drop it before returning the predictions to user.
-      # To achieve that, we throw the OutOfRangeError in after_run. Once
-      # Monitored Session sees this error in SessionRunHook.after_run, the
-      # "current" prediction, i.e., batch with id=100, will be discarded
-      # immediately
-      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
-
-
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
-  """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A fn returns enqueue_ops."""
-    num_cores_per_host = ctx.num_of_cores_per_host
-    per_host_sharded_inputs = []
-    for core_ordinal in range(num_cores_per_host):
-      with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        user_context = tpu_context.TPUContext(
-            internal_ctx=ctx,
-            input_device=host_device,
-            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
-        inputs = _Inputs.from_input_fn(input_fn(user_context))
-        if inputs.is_dataset:
-          raise TypeError(
-              '`input_fn` returning `Dataset`  is not yet supported in '
-              'per-Core input pipeline deployment yet. Please set '
-              'TPUConfig.per_host_input_for_training to True or return '
-              '`features` and `labels` from `input_fn`')
-        features, labels = inputs.features_and_labels()
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels))
-        per_host_sharded_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-
-    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
-    return per_host_enqueue_ops
-
-  return enqueue_ops_fn, captured_infeed_queue
-
-
-def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-      if batch_axis is not None:
-        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A Fn returning the TPU infeed enqueue ops.
-
-    By providing as a Fn, it can be invoked inside the tf.while_loop such that
-    the input pipeline for multiple iterations can be executed by one
-    Session.run call.
-
-    Returns:
-      list of dict of ops.
-    """
-    with ops.device(device):
-      num_of_replicas_per_host = ctx.num_of_replicas_per_host
-      # Convert user input to features and labels.  If the user returns a
-      # dataset, it is initialized and the features and labels extracted via
-      # `dataset.iterator.get_next()`
-      features, labels = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      inputs_structure_recorder.validate_and_record_structure(features, labels)
-      unsharded_tensor_list = (
-          inputs_structure_recorder.flatten_features_and_labels(
-              features, labels, signals))
-
-      infeed_queue = tpu_feed.InfeedQueue(
-          tuple_types=[t.dtype for t in unsharded_tensor_list],
-          tuple_shapes=[t.shape for t in unsharded_tensor_list],
-          shard_dimensions=batch_axis)
-      captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
-      per_host_enqueue_ops = (
-          infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device,
-              tpu_ordinal_function=tpu_ordinal_function_impl))
-      if signals is None:
-        return per_host_enqueue_ops
-      else:
-        return {
-            'ops': per_host_enqueue_ops,
-            'signals': signals,
-        }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_per_host_v2_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if not is_dataset:
-      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
-                      'input pipeline configuration.')
-
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True,
-          num_invocations_per_step=ctx.num_of_replicas_per_host)
-
-    dataset_initializer = inputs.dataset_initializer()
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """Generates the per_host enqueue ops."""
-    control_deps = []
-    per_host_sharded_inputs = []
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-    cached_signals = None
-    with ops.device(device):
-      if not inputs.is_dataset:
-        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
-      for _ in range(num_replicas_per_host):
-        # Use control dependencies to ensure a deterministic ordering.
-        with ops.control_dependencies(control_deps):
-          features, labels = inputs.features_and_labels()  # Calls get_next()
-          signals = inputs.signals()
-
-          # All the replicas share the replica 0's stopping singal.
-          # This avoids inconsistent state among different model replcias.
-          if cached_signals:
-            signals['stopping'] = cached_signals['stopping']
-          else:
-            cached_signals = signals
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels, signals))
-        control_deps.extend(flattened_inputs)
-        per_host_sharded_inputs.append(flattened_inputs)
-
-      if inputs_structure_recorder.flattened_input_dims:
-        input_partition_dims = inputs_structure_recorder.flattened_input_dims
-        if signals:
-          input_partition_dims += [None] * len(signals)
-        # pylint: disable=protected-access
-        infeed_queue = tpu_feed._PartitionedInfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
-            host_id=host_id,
-            input_partition_dims=input_partition_dims,
-            device_assignment=ctx.device_assignment)
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs)
-      else:
-        infeed_queue = tpu_feed.InfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs,
-            tpu_ordinal_function=tpu_ordinal_function_impl)
-      captured_infeed_queue.capture(infeed_queue)
-
-    if signals is None:
-      return per_host_enqueue_ops
-    else:
-      return {
-          'ops': per_host_enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
-                                      num_hosts):
-  """Generates infeed enqueue ops for one input_fn on all the hosts."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device_0, invocation_index=0)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-
-  def tpu_ordinal_function_impl(replica_id):
-    if ctx.device_assignment:
-      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
-    else:
-      return replica_id % num_replicas_per_host
-
-  def device_function_impl(replica_id):
-    return ctx.tpu_host_placement_function(replica_id=replica_id)
-
-  def enqueue_ops_fn():
-    """Generates enqueue ops for all the hosts."""
-    broadcasted_inputs = []
-    flattened_inputs = None  # Cache result from input_fn.
-    signals = None
-    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
-        for _ in xrange(ctx.num_of_replicas_per_host):
-          # Note: input_fn is only called once at host 0 for the first replica.
-          # The features and labels returned from that invocation are
-          # broadcasted to other replicas(including the replicas on other
-          # hosts).
-          if flattened_inputs is None:
-            features, labels = inputs.features_and_labels()  # Calls get_next()
-            signals = inputs.signals()
-
-            inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
-            flattened_inputs = (
-                inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels, signals))
-          broadcasted_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(broadcasted_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-    enqueue_ops = infeed_queue.generate_enqueue_ops(
-        broadcasted_inputs,
-        tpu_ordinal_function=tpu_ordinal_function_impl,
-        placement_function=device_function_impl)
-
-    if signals is None:
-      return enqueue_ops
-    else:
-      return {
-          'ops': enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-class _InputPipeline(object):
-  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
-
-  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
-  call site.  To be precise, based on the configuration in
-  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
-  multi-host TPU training) or for one host (usually for single-host TPU
-  evaluation), and sends all `features` and `labels` returned by `input_fn` to
-  TPU infeed. For per-core invocation, `features` and `labels` are piped to
-  infeed directly, one tuple for each core. For per-host invocation,  `features`
-  and `labels` are split at host (with respect to `batch_axis`) and piped to all
-  cores accordingly.
-
-  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
-  inputs returned by the `input_fn` can have one of the following forms:
-  1. features
-  2. (features, labels)
-  3. ((arbitrarily nested structure of features), labels)
-
-  Internally, form 1 is reformed to `(features, None)` as features and labels
-  are passed separately to underlying methods. For TPU training, TPUEstimator
-  may expect multiple `features` and `labels` tuples one for each core.
-
-  TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  Both `features` and `labels` can be any nested sturcture
-  supported by TF nest (namely, dict, tuples, namedtuples or any nested
-  structure of such of Tensors).  `labels` could be `None` as well.
-
-  These are flattened before they are passed to the infeed/outfeed library
-  as that expectes flattend lists.
-  """
-
-  class InputsStructureRecorder(object):
-    """The recorder to record inputs structure."""
-
-    def __init__(self, input_partition_dims=None):
-      # Holds the structure of inputs
-      self._feature_structure = {}
-      self._flattened_input_dims = None
-
-      if input_partition_dims:
-        # This should have been validated in TPUConfig.
-        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
-        if len(input_partition_dims) == 2:
-          self._feature_dims, self._label_dims = input_partition_dims
-        else:
-          self._feature_dims = input_partition_dims[0]
-          self._label_dims = None
-
-        assert self._feature_dims is not None, ('input_partition_dims[0] must '
-                                                'not be None')
-      else:
-        self._feature_dims = None
-        self._label_dims = None
-
-      # Internal state.
-      self._initialized = False
-
-    @property
-    def flattened_input_dims(self):
-      assert self._initialized, 'InputsStructureRecorder is not initialized.'
-      return self._flattened_input_dims
-
-    def has_labels(self):
-      return 'labels' in self._feature_structure
-
-    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
-                            label_dims_names, label_names, has_labels):
-      """Flatten input dims with the same order as flattened input tensors."""
-      flattened_input_dims = []
-      if feature_dims_names:
-        # We need a fixed ordering for matching the tensors in features.
-        flattened_input_dims.extend(
-            [feature_dims[name] for name in feature_dims_names])
-      else:
-        flattened_input_dims.append(feature_dims)
-
-      if label_dims_names:
-        # We need a fixed ordering for matching the tensors in labels.
-        flattened_input_dims.extend(
-            [label_dims[name] for name in label_dims_names])
-      else:
-        if label_names:
-          num_tensors_in_label = len(label_names)
-        else:
-          num_tensors_in_label = int(has_labels)
-        # Setting `None` in input_partition_dims[1] will apply `None` to
-        # all the tensors in labels, regardless of internal structure.
-        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
-
-      return flattened_input_dims
-
-    def validate_and_record_structure(self, features, labels):
-      """Validates and records the structure of `features` and `labels`."""
-      # Extract structure.
-      has_labels = labels is not None
-      feature_names = _extract_key_names(features)
-      label_names = _extract_key_names(labels)
-
-      if not self._initialized:
-        # Record structure.
-        self._initialized = True
-        if self._feature_dims is not None:
-          feature_dims_names = _extract_key_names(self._feature_dims)
-          if feature_dims_names != feature_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[0] mismatched feature'
-                ' keys. Expected {}, got {}'.format(feature_names,
-                                                    feature_dims_names))
-
-          label_dims_names = _extract_key_names(self._label_dims)
-          if self._label_dims is not None and label_dims_names != label_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[1] mismatched label'
-                ' keys. Expected {}, got {}'.format(label_names,
-                                                    label_dims_names))
-
-          self._flattened_input_dims = self._flatten_input_dims(
-              self._feature_dims, feature_dims_names, self._label_dims,
-              label_dims_names, label_names, has_labels)
-
-    def flatten_features_and_labels(self, features, labels, signals=None):
-      """Flattens the `features` and `labels` to a single tensor list."""
-      self._feature_structure['features'] = features
-      if labels is not None:
-        self._feature_structure['labels'] = labels
-      if signals is not None:
-        self._feature_structure['signals'] = signals
-      return data_nest.flatten(self._feature_structure)
-
-    def unflatten_features_and_labels(self, flattened_inputs):
-      """Restores the flattened inputs to original features and labels form.
-
-      Args:
-        flattened_inputs: Flattened inputs for each shard.
-
-      Returns:
-        A tuple of (`features`, `labels`), where `labels` could be None.
-        Each one, if present, should have identical structure (single tensor vs
-        dict) as the one returned by input_fn.
-
-      Raises:
-        ValueError: If the number of expected tensors from `flattened_inputs`
-          mismatches the recorded structure.
-      """
-
-      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
-                                                      flattened_inputs)
-      return _Inputs(
-          unflattened_inputs['features'],
-          unflattened_inputs.get('labels'),
-          signals=unflattened_inputs.get('signals'))
-
-  def __init__(self, input_fn, batch_axis, ctx):
-    """Constructor.
-
-    Args:
-      input_fn: input fn for train or eval.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards.
-      ctx: A `_InternalTPUContext` instance with mode.
-
-    Raises:
-      ValueError: If both `sharded_features` and `num_cores` are `None`.
-    """
-    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
-        ctx.input_partition_dims)
-
-    self._sharded_per_core = ctx.is_input_sharded_per_core()
-    self._input_fn = input_fn
-    self._infeed_queue = None
-    self._ctx = ctx
-    self._batch_axis = batch_axis
-
-  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
-    """Generates infeed enqueue ops and dequeue_fn."""
-    # While tf.while_loop is called, the body function, which invokes
-    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
-    # structure is recorded.
-    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
-        self._invoke_input_fn_and_record_structure())
-
-    self._validate_input_pipeline()
-
-    def dequeue_fn():
-      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
-      # computations must agree on the core on which infeed takes place. We
-      # choose to perform infeed on logical core 0 of each replica.
-      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
-      # The unflatten process uses the structure information recorded above.
-      return self._inputs_structure_recorder.unflatten_features_and_labels(
-          values)
-
-    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
-
-  def _invoke_input_fn_and_record_structure(self):
-    """Deploys the input pipeline and record input structure."""
-    enqueue_ops = []
-    infeed_queues = []
-    all_dataset_initializers = []
-    num_hosts = self._ctx.num_hosts
-    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
-
-    run_infeed_loop_on_coordinator = True
-
-    if self._sharded_per_core:
-      # Per-Core input pipeline deployment.
-      # Invoke input pipeline for each core and placed on the corresponding
-      # host.
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
-                generate_per_core_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    host_device, host_id))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              run_infeed_loop_on_coordinator = False
-              enqueue_ops.append(
-                  _wrap_computation_in_while_loop(
-                      device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(captured_infeed_queue.get())
-
-    elif self._ctx.is_input_broadcast_with_iterators():
-      # Only calls input_fn in host 0.
-      host_device = tpu_host_placement_fn(host_id=0)
-      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
-                                            self._inputs_structure_recorder,
-                                            num_hosts))
-      if dataset_initializer:
-        all_dataset_initializers.append(dataset_initializer)
-        run_infeed_loop_on_coordinator = False
-        wrap_fn = (
-            _wrap_computation_in_while_loop
-            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-            _wrap_computation_in_while_loop_with_stopping_signals)
-        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-      else:
-        enqueue_ops.append(enqueue_ops_fn())
-      infeed_queues.append(captured_infeed_queue.get())
-    else:
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            if self._ctx.is_input_per_host_with_iterators():
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_v2_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, host_device, host_id))
-            else:
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, self._batch_axis,
-                      host_device, host_id))
-
-            # NOTE(xiejw): We dispatch here based on the return type of the
-            # users `input_fn`.
-            #
-            # 1. If input_fn returns a Dataset instance, we initialize the
-            # iterator outside of tf.while_loop, and call the iterator.get_next
-            # inside tf.while_loop.  This should be always safe.
-            #
-            # 2. If input_fn returns (features, labels), it is too late to wrap
-            # them inside tf.while_loop, as resource initialization cannot be
-            # handled in TF control flow properly. In this case, we will use
-            # python loop to enqueue the data into TPU system.  This may be
-            # slow compared to the previous case.
-            if dataset_initializer:
-              all_dataset_initializers.append(dataset_initializer)
-              run_infeed_loop_on_coordinator = False
-              wrap_fn = (
-                  _wrap_computation_in_while_loop
-                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-                  _wrap_computation_in_while_loop_with_stopping_signals)
-              enqueue_ops.append(
-                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(captured_infeed_queue.get())
-    # infeed_queue is used to generate dequeue ops. The only thing it uses for
-    # dequeue is dtypes and types. So, any one can be used. Here, grab the
-    # first one.
-    self._infeed_queue = infeed_queues[0]
-    return enqueue_ops, [
-        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
-    ], run_infeed_loop_on_coordinator
-
-  def _validate_input_pipeline(self):
-    """Validates the input pipeline.
-
-    Perform some sanity checks to log user friendly information. We should
-    error out to give users better error message. But, if
-    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
-    user code, so, log a warning.
-
-    Raises:
-      RuntimeError: If the validation failed.
-    """
-    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
-      err_msg = ('Input pipeline contains one or more QueueRunners. '
-                 'It could be slow and not scalable. Please consider '
-                 'converting your input pipeline to use `tf.data` instead (see '
-                 'https://www.tensorflow.org/guide/datasets for '
-                 'instructions.')
-      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        raise RuntimeError(err_msg)
-      else:
-        logging.warn(err_msg)
-
-
-def call_computation(computation,
-                     experimental_exported_model_uses_all_cores=True):
-  """Call computation.
-
-  computation uses a single-core for TPU inference. If
-  `experimental_exported_model_uses_all_cores` is `True`, this function will
-  round-robin
-  computation among all TPU cores visible to the host; otherwise, it will use
-  a single core.
-
-  Args:
-    computation: A Python function that takes no inputs and builds computation
-      graph. If `computation` returns m outputs, this function will return a
-      list of m Tensors.
-    experimental_exported_model_uses_all_cores: Whether to round-robin among all
-      cores visible to the host, or to use a single core.
-
-  Returns:
-    A list of output tensors.
-  """
-  if experimental_exported_model_uses_all_cores:
-    # Using `TPUPartitionedCall` makes it possible to target a different
-    # TPU core with every `Session.run()` call. Note that the entire inference
-    # graph executes on a single core, and that invocations of this graph
-    # will round-robin among the cores attached to a host.
-    @function.Defun()
-    def tpu_subgraph():
-      return computation()
-
-    return tpu_functional.TPUPartitionedCall(
-        args=tpu_subgraph.captured_inputs,
-        device_ordinal=gen_tpu_ordinal_selector_op.tpu_ordinal_selector(),
-        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
-        f=tpu_subgraph)
-  else:
-    return computation()
-
-
-class _ModelFnWrapper(object):
-  """A `model_fn` wrapper.
-
-  This makes calling model_fn on CPU and TPU easier and more consistent and
-  performs necessary check and mutation required by TPU training and evaluation.
-
-  In addition, this wrapper manages converting the `model_fn` to a single TPU
-  train and eval step.
-  """
-
-  def __init__(self, model_fn, config, params, ctx):
-    self._model_fn = model_fn
-    self._config = config
-    self._params = params
-    self._ctx = ctx
-
-  def call_without_tpu(self, features, labels, is_export_mode):
-    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
-
-  def convert_to_single_tpu_train_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single train step on TPU.
-
-    The user provided `model_fn` takes input tuple
-    (features, labels) and produces the EstimatorSpec with train_op and loss for
-    train `mode`. This usually represents a single train computation on CPU.
-
-    For TPU training, a train (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input should be taken from TPU infeed rather
-    than input pipeline (input_fn) directly. To fit TPU loop and replicate
-    pattern, the original train computation should be reformed, which is the
-    returned `train_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
-      representing the train step for TPU.
-    """
-
-    host_call = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_training_hooks = _CapturedObject()
-
-    def train_step(loss):
-      """Training step function for use inside a while loop."""
-      del loss  # unused; required in function signature.
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-
-      estimator_spec = self._verify_estimator_spec(
-          self._call_model_fn(features, labels))
-      loss, train_op = estimator_spec.loss, estimator_spec.train_op
-
-      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
-      else:
-        captured_scaffold_fn.capture(None)
-
-      captured_training_hooks.capture(estimator_spec.training_hooks)
-
-      tracing_ops = []
-      if tensor_tracer.TensorTracer.is_enabled():
-        tt = tensor_tracer.TensorTracer()
-        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
-                                         self._ctx.num_replicas,
-                                         fetches=[loss, train_op])
-
-      # We must run train_op to update the variables prior to running the
-      # outfeed.
-      with ops.control_dependencies([train_op]+tracing_ops):
-        host_call_outfeed_ops = []
-        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
-            and estimator_spec.host_call is not None):
-          host_call.record({'host_call': estimator_spec.host_call})
-          host_call_outfeed_ops = host_call.create_enqueue_op()
-        with ops.control_dependencies(host_call_outfeed_ops):
-          return array_ops.identity(loss)
-
-    return (train_step, host_call, captured_scaffold_fn,
-            captured_training_hooks)
-
-  def convert_to_single_tpu_eval_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single eval step on TPU.
-
-    Similar to training, the user provided `model_fn` takes input tuple
-    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
-    eval `mode`. This usually represents a single evaluation computation on CPU.
-
-    For TPU evaluation, a eval (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input and output are slightly different. Input,
-    features and labels, should be taken from TPU infeed rather than input
-    pipeline (input_fn) directly. Output is managed in two stages.  First, the
-    model outputs as the result of evaluation computation, usually model logits,
-    should be transferred from TPU system to CPU. Then, all model outputs are
-    concatenated first on CPU and sent to the metric_fn for metrics computation.
-    To fit TPU evaluation pattern, the original eval computation should be
-    reformed, which is the returned `eval_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
-      representing the eval step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_eval_hooks = _CapturedObject()
-
-    def eval_step(total_loss):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-
-      tpu_estimator_spec = self._call_model_fn(features, labels)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU evaluation must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      loss = tpu_estimator_spec.loss
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
-
-      to_record = {}
-      if tpu_estimator_spec.eval_metrics:
-        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
-      if tpu_estimator_spec.host_call is not None:
-        # We assume that evaluate won't update global step, so we don't wrap
-        # this host_call.
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return math_ops.add(total_loss, loss)
-
-    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-
-  def convert_to_single_tpu_predict_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single predict step on TPU.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
-      predict_fn representing the predict step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_predict_hooks = _CapturedObject()
-
-    def predict_step(unused_scalar_stopping_signal):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      stopping_signals = inputs.signals()
-
-      assert stopping_signals is not None, (
-          'Internal Error: `signals` is missing.')
-
-      tpu_estimator_spec = self._call_model_fn(
-          features, labels, is_export_mode=False)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU prediction must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
-
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
-      to_record = {}
-      identity_fn = lambda **kwargs: kwargs
-      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
-      to_record['signals'] = [identity_fn, stopping_signals]
-      if tpu_estimator_spec.host_call is not None:
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
-
-    return (predict_step, host_calls, captured_scaffold_fn,
-            captured_predict_hooks)
-
-  def _verify_tpu_spec_predictions(self, predictions):
-    """Validates TPUEstimatorSpec.predictions dict."""
-    # TODO(xiejw): Adds validation for prediction dictionrary.
-    # TODO(xiejw): Adds support for single tensor as predictions.
-    if not isinstance(predictions, dict):
-      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
-
-    for (key, tensor) in predictions.items():
-      if tensor.shape.dims[0].value is None:
-        raise ValueError(
-            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
-            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
-    return predictions
-
-  def _validate_model_features_and_labels(self, features, labels,
-                                          is_export_mode):
-    """Validates that the features and labels for the model function are valid.
-
-    A valid features/labels object is the one with:
-    - Type: A tensor or any nested structure of tensors supported by TF nest,
-        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
-    - Static shape if is_export_mode is False.
-
-    Args:
-      features: the features that would be input to the model function.
-      labels: the labels that would be input to the model function.
-      is_export_mode: boolean value specifying if in export mode.
-
-    Raises:
-      TypeError: If features/labels are not of the correct type.
-      ValueError: If features/labels have dynamic shape.
-    """
-
-    def validate(obj, obj_name):
-      """Helper validate function."""
-      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
-        return
-      if isinstance(obj, ops.Tensor):
-        if not obj.get_shape().is_fully_defined():
-          raise ValueError(
-              'The {} to the model returned by input_fn must have static shape.'
-              ' Tensor: {}'.format(obj_name, obj))
-      else:
-        for tensor in data_nest.flatten(obj):
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                ('The {} to the model returned by input_fn must have static '
-                 'shape. Tensor: {}').format(obj_name, tensor))
-
-    validate(features, 'features')
-    if labels is not None:
-      validate(labels, 'labels')
-
-  def _call_model_fn(self, features, labels, is_export_mode=False):
-    """Calls the model_fn with required parameters."""
-    self._validate_model_features_and_labels(features, labels, is_export_mode)
-    model_fn_args = function_utils.fn_args(self._model_fn)
-    kwargs = {}
-
-    # Makes deep copy with `config` and params` in case user mutates them.
-    config = copy.deepcopy(self._config)
-    params = copy.deepcopy(self._params)
-
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    elif labels is not None:
-      raise ValueError(
-          'model_fn does not take labels, but input_fn returns labels.')
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = self._ctx.mode
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-    if 'params' in model_fn_args:
-      kwargs['params'] = params
-
-    if 'params' not in model_fn_args:
-      raise ValueError('model_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params[\'batch_size\']'.format(self._model_fn))
-
-    if is_export_mode:
-      batch_size_for_model_fn = None
-    else:
-      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
-
-    if batch_size_for_model_fn is not None:
-      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
-
-    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
-    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
-
-    if not running_on_cpu:
-      user_context = tpu_context.TPUContext(
-          internal_ctx=self._ctx, call_from_input_fn=False)
-      _add_item_to_params(params, _CTX_KEY, user_context)
-
-    estimator_spec = self._model_fn(features=features, **kwargs)
-    if (running_on_cpu and
-        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
-      # The estimator_spec will be passed to `Estimator` directly, which expects
-      # type `EstimatorSpec`.
-      return estimator_spec.as_estimator_spec()
-    else:
-      return estimator_spec
-
-  def _verify_estimator_spec(self, estimator_spec):
-    """Validates the estimator_spec."""
-    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-      return estimator_spec
-
-    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
-    if estimator_spec.training_chief_hooks:
-      raise ValueError(
-          err_msg.format('training_chief_hooks') + 'If you want' +
-          ' to pass training hooks, please pass via training_hooks.')
-
-    if estimator_spec.scaffold:
-      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
-                      'Please use TPUEstimatorSpec.')
-    return estimator_spec
-
-
-class _OutfeedHostCall(object):
-  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
-
-  def __init__(self, ctx):
-    self._ctx = ctx
-    self._names = []
-    # All of these are dictionaries of lists keyed on the name.
-    self._host_fns = {}
-    self._tensor_keys = collections.defaultdict(list)
-    self._tensors = collections.defaultdict(list)
-    self._tensor_dtypes = collections.defaultdict(list)
-    self._tensor_shapes = collections.defaultdict(list)
-
-  @staticmethod
-  def validate(host_calls):
-    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
-
-    for name, host_call in host_calls.items():
-      if not isinstance(host_call, (tuple, list)):
-        raise ValueError('{} should be tuple or list'.format(name))
-      if len(host_call) != 2:
-        raise ValueError('{} should have two elements.'.format(name))
-      if not callable(host_call[0]):
-        raise TypeError('{}[0] should be callable.'.format(name))
-      if not isinstance(host_call[1], (tuple, list, dict)):
-        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
-
-      if isinstance(host_call[1], (tuple, list)):
-        fullargspec = tf_inspect.getfullargspec(host_call[0])
-        fn_args = function_utils.fn_args(host_call[0])
-        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
-        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
-          raise RuntimeError(
-              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
-              'method args of the function, which takes {}.'.format(
-                  name, len(host_call[1]), len(fn_args)))
-
-  @staticmethod
-  def create_cpu_hostcall(host_calls):
-    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
-
-    _OutfeedHostCall.validate(host_calls)
-    ret = {}
-    for name, host_call in host_calls.items():
-      host_fn, tensors = host_call
-      if isinstance(tensors, (tuple, list)):
-        ret[name] = host_fn(*tensors)
-      else:
-        # Must be dict.
-        try:
-          ret[name] = host_fn(**tensors)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling %s: %s. It is likely the tensors '
-              '(%s[1]) do not match the '
-              'function\'s arguments', name, e, name)
-          raise
-    return ret
-
-  def record(self, host_calls):
-    """Records the host_call structure."""
-
-    for name, host_call in host_calls.items():
-      host_fn, tensor_list_or_dict = host_call
-      self._names.append(name)
-      self._host_fns[name] = host_fn
-
-      if isinstance(tensor_list_or_dict, dict):
-        for (key, tensor) in six.iteritems(tensor_list_or_dict):
-          self._tensor_keys[name].append(key)
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-      else:
-        # List or tuple.
-        self._tensor_keys[name] = None
-        for tensor in tensor_list_or_dict:
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-
-  def create_enqueue_op(self):
-    """Create the op to enqueue the recorded host_calls.
-
-    Returns:
-      A list of enqueue ops, which is empty if there are no host calls.
-    """
-    if not self._names:
-      return []
-
-    tensors = []
-    # TODO(jhseu): Consider deduping tensors.
-    for name in self._names:
-      tensors.extend(self._tensors[name])
-
-    with ops.device(tpu.core(0)):
-      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
-
-  def create_tpu_hostcall(self):
-    """Sends the tensors through outfeed and runs the host_fn on CPU.
-
-    The tensors are concatenated along dimension 0 to form a global tensor
-    across all shards. The concatenated function is passed to the host_fn and
-    executed on the first host.
-
-    Returns:
-      A dictionary mapping name to the return type of the host_call by that
-      name.
-
-    Raises:
-      RuntimeError: If outfeed tensor is scalar.
-    """
-    if not self._names:
-      return {}
-
-    ret = {}
-    # For each i, dequeue_ops[i] is a list containing the tensors from all
-    # shards. This list is concatenated later.
-    dequeue_ops = []
-    tensor_dtypes = []
-    tensor_shapes = []
-    for name in self._names:
-      for _ in self._tensors[name]:
-        dequeue_ops.append([])
-      for dtype in self._tensor_dtypes[name]:
-        tensor_dtypes.append(dtype)
-      for shape in self._tensor_shapes[name]:
-        tensor_shapes.append(shape)
-
-    # Outfeed ops execute on each replica's first logical core. Note: we must
-    # constraint it such that we have at most one outfeed dequeue and enqueue
-    # per replica.
-    for i in xrange(self._ctx.num_replicas):
-      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
-        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=tensor_dtypes,
-            shapes=tensor_shapes,
-            device_ordinal=ordinal_id)
-        for j, item in enumerate(outfeed_tensors):
-          dequeue_ops[j].append(item)
-
-    # Deconstruct dequeue ops.
-    flat_dequeue_ops = []
-    for l in dequeue_ops:
-      flat_dequeue_ops.extend(l)
-
-    dequeue_ops_by_name = {}
-    pos = 0
-    for name in self._names:
-      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
-                                              len(self._tensors[name])]
-      pos += len(self._tensors[name])
-
-    def _call_host_fn(fn, *args, **kw):
-      context = CatchInvalidHostcallFunctions()
-      context.Enter()
-      result = fn(*args, **kw)
-      context.Exit()
-      context.ExitResult(result)
-      return result
-
-    # It is assumed evaluation always happens on single host TPU system. So,
-    # place all ops on tpu host if possible.
-    #
-    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
-      for name in self._names:
-        dequeue_ops = dequeue_ops_by_name[name]
-        for i, item in enumerate(dequeue_ops):
-          if dequeue_ops[i][0].shape.ndims == 0:
-            raise RuntimeError(
-                'All tensors outfed from TPU should preserve batch size '
-                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-          # TODO(xiejw): Make the specification of the outfeed combinaton
-          # function more explicit and well-documented.  We may want to give the
-          # user the option of concatenating along any axis.
-          if (self._ctx.config.tpu_config.per_host_input_for_training is
-              tpu_config.InputPipelineConfig.BROADCAST):
-            # If the infeed is in BROADCAST mode (each core recieving the same
-            # input), then we assume that the cores also produce identical
-            # copies of the same output, and we simply take the output from
-            # the first core.  This mode is used by Mesh-TensorFlow.
-            with ops.control_dependencies(dequeue_ops[i]):
-              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
-          else:
-            # Assume that the input has been batch-split and that axis 0 of the
-            # output tensors represents the batch size.  Concatenate along
-            # the axis 0 to re-combine the batch.
-            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
-
-        if self._tensor_keys[name] is not None:
-          # The user-provided eval_metrics[1] is a dict.
-          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
-          try:
-            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
-          except TypeError as e:
-            logging.warning(
-                'Exception while calling %s: %s. It is likely the tensors '
-                '(%s[1]) do not match the '
-                'function\'s arguments', name, e, name)
-            raise
-        else:
-          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
-
-    # force all dequeue operations to be run if not consumed by the host calls
-    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
-    return ret
-
-
-class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
-  """Hook to run host calls when use_tpu=False."""
-
-  def __init__(self, tensors):
-    self._tensors = tensors
-
-  def begin(self):
-    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
-    # create a separate hook to guarantee execution order, because summaries
-    # need to be initialized before the outfeed thread starts.
-    # TODO(jhseu): Make a wrapper hook instead?
-    self._init_ops = contrib_summary.summary_writer_initializer_op()
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    self._finalize_ops = []
-    for op in self._init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def after_create_session(self, session, coord):
-    session.run(self._init_ops)
-
-  def before_run(self, run_context):
-    return basic_session_run_hooks.SessionRunArgs(self._tensors)
-
-  def end(self, session):
-    session.run(self._finalize_ops)
-
-
-class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
-  """Calculate and report global_step/sec and examples/sec during runtime."""
-
-  def __init__(self,
-               batch_size,
-               every_n_steps=100,
-               every_n_secs=None,
-               output_dir=None,
-               summary_writer=None):
-    self._batch_size = batch_size
-    super(ExamplesPerSecondHook, self).__init__(
-        every_n_steps=every_n_steps,
-        every_n_secs=every_n_secs,
-        output_dir=output_dir,
-        summary_writer=summary_writer)
-
-  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
-    global_step_per_sec = elapsed_steps / elapsed_time
-    examples_per_sec = self._batch_size * global_step_per_sec
-    if self._summary_writer is not None:
-      global_step_summary = Summary(value=[
-          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
-      ])
-      example_summary = Summary(value=[
-          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
-      ])
-      self._summary_writer.add_summary(global_step_summary, global_step)
-      self._summary_writer.add_summary(example_summary, global_step)
-    logging.info('global_step/sec: %g', global_step_per_sec)
-    logging.info('examples/sec: %g', examples_per_sec)
-
-
-class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
-  """Change SIGINT (CTRL^C) handler to force quit the process.
-
-  The default behavior often results in hanging processes.
-  The original handler is restored after training/evaluation.
-  """
-
-  def __init__(self):
-    self._signal_fn = signal.getsignal(signal.SIGINT)
-
-  def before_run(self, run_context):
-    signal.signal(signal.SIGINT, signal.SIG_DFL)
-
-  def end(self, session):
-    signal.signal(signal.SIGINT, self._signal_fn)
-
-
-class TPUEstimator(estimator_lib.Estimator):
-  """Estimator with TPU support.
-
-  TPUEstimator also supports training on CPU and GPU. You don't need to define
-  a separate `tf.estimator.Estimator`.
-
-  TPUEstimator handles many of the details of running on TPU devices, such as
-  replicating inputs and models for each core, and returning to host
-  periodically to run hooks.
-
-  TPUEstimator transforms a global batch size in params to a per-shard batch
-  size when calling the `input_fn` and `model_fn`. Users should specify
-  global batch size in constructor, and then get the batch size for each shard
-  in `input_fn` and `model_fn` by `params['batch_size']`.
-
-  - For training, `model_fn` gets per-core batch size; `input_fn` may get
-    per-core or per-host batch size depending on `per_host_input_for_training`
-    in `TPUConfig` (See docstring for TPUConfig for details).
-
-  - For evaluation and prediction, `model_fn` gets per-core batch size and
-    `input_fn` get per-host batch size.
-
-  Evaluation
-  ==========
-
-  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
-  for TPU evaluation. However, if eval_on_tpu is False, `model_fn` must return
-  `EstimatorSpec` and the evaluation will execute on CPU or GPU; in this case
-  the following discussion on TPU evaluation does not apply.
-
-  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
-  `tensors` could be a list of any nested structure of `Tensor`s (See
-  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
-  a dict from metric string name to the result of calling a metric function,
-  namely a `(metric_tensor, update_op)` tuple.
-
-  One can set `use_tpu` to `False` for testing. All training, evaluation, and
-  predict will be executed on CPU. `input_fn` and `model_fn` will receive
-  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
-
-  Current limitations:
-  --------------------
-
-  1. TPU evaluation only works on a single host (one TPU worker) except
-     BROADCAST mode.
-
-  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
-     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
-     batches should have the same size.
-
-  Example (MNIST):
-  ----------------
-
-  ```
-  # The metric Fn which runs on CPU.
-  def metric_fn(labels, logits):
-    predictions = tf.argmax(logits, 1)
-    return {
-      'accuracy': tf.metrics.precision(
-          labels=labels, predictions=predictions),
-    }
-
-  # Your model Fn which runs on TPU (eval_metrics is list in this example)
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, [labels, logits]))
-
-  # or specify the eval_metrics tensors as dict.
-  def model_fn(features, labels, mode, config, params):
-    ...
-    final_layer_output = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, {
-              'labels': labels,
-              'logits': final_layer_output,
-          }))
-  ```
-
-  Prediction
-  ==========
-
-  Prediction on TPU is an experimental feature to support large batch inference.
-  It is not designed for latency-critical system. In addition, due to some
-  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
-  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
-  convenient.
-
-  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
-  *should* raise an end-of-input exception (`OutOfRangeError` or
-  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
-  precise, the ops created by `input_fn` produce one batch of the data.
-  The `predict()` API processes one batch at a time. When reaching the end of
-  the data source, an end-of-input exception should be raised by one of these
-  operations. The user usually does not need to do this manually. As long as the
-  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
-  exception automatically after the last batch has been produced.
-
-  Note: Estimator.predict returns a Python generator. Please consume all the
-  data from the generator so that TPUEstimator can shutdown the TPU system
-  properly for user.
-
-  Current limitations:
-  --------------------
-  1. TPU prediction only works on a single host (one TPU worker).
-
-  2. `input_fn` must return a `Dataset` instance rather than `features`. In
-  fact, .train() and .evaluate() also support Dataset as return value.
-
-  Example (MNIST):
-  ----------------
-  ```
-  height = 32
-  width = 32
-  total_examples = 100
-
-  def predict_input_fn(params):
-    batch_size = params['batch_size']
-
-    images = tf.random_uniform(
-        [total_examples, height, width, 3], minval=-1, maxval=1)
-
-    dataset = tf.data.Dataset.from_tensor_slices(images)
-    dataset = dataset.map(lambda images: {'image': images})
-
-    dataset = dataset.batch(batch_size)
-    return dataset
-
-  def model_fn(features, labels, params, mode):
-     # Generate predictions, called 'output', from features['image']
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      return tf.contrib.tpu.TPUEstimatorSpec(
-          mode=mode,
-          predictions={
-              'predictions': output,
-              'is_padding': features['is_padding']
-          })
-
-  tpu_est = TPUEstimator(
-      model_fn=model_fn,
-      ...,
-      predict_batch_size=16)
-
-  # Fully consume the generator so that TPUEstimator can shutdown the TPU
-  # system.
-  for item in tpu_est.predict(input_fn=input_fn):
-    # Filter out item if the `is_padding` is 1.
-    # Process the 'predictions'
-  ```
-
-  Exporting
-  =========
-
-  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
-  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
-  At serving time, these tags are used to select metagraph to load.
-
-  Before running the graph on TPU, TPU system needs to be initialized. If
-  TensorFlow Serving model-server is used, this is done automatically. If
-  not, please call `session.run(tpu.initialize_system())`.
-
-  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
-  `model_fn`.
-
-  Example:
-  ----------------
-
-  ```
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-    export_outputs = {
-      'logits': export_output_lib.PredictOutput(
-        {'logits': logits})
-    }
-
-    def host_call(logits):
-      class_ids = math_ops.argmax(logits)
-      classes = string_ops.as_string(class_ids)
-      export_outputs['classes'] =
-        export_output_lib.ClassificationOutput(classes=classes)
-
-    tpu.outside_compilation(host_call, logits)
-
-    ...
-  ```
-
-  """
-
-  def __init__(self,
-               model_fn=None,
-               model_dir=None,
-               config=None,
-               params=None,
-               use_tpu=True,
-               train_batch_size=None,
-               eval_batch_size=None,
-               predict_batch_size=None,
-               batch_axis=None,
-               eval_on_tpu=True,
-               export_to_tpu=True,
-               export_to_cpu=True,
-               warm_start_from=None,
-               experimental_exported_model_uses_all_cores=False):
-    """Constructs an `TPUEstimator` instance.
-
-    Args:
-      model_fn: Model function as required by `Estimator` which returns
-        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
-        and `prediction_hooks` must not capure any TPU Tensor inside the
-        model_fn.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
-      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
-      params: An optional `dict` of hyper parameters that will be passed into
-        `input_fn` and `model_fn`.  Keys are names of parameters, values are
-        basic python types. There are reserved keys for `TPUEstimator`,
-        including 'batch_size'.
-      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
-        TPU training and evaluation respect this bit, but eval_on_tpu can
-        override execution of eval. See below. - Predict still happens on CPU.
-      train_batch_size: An int representing the global training batch size.
-        TPUEstimator transforms this global batch size to a per-shard batch
-        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
-        number of replicas.
-      eval_batch_size: An int representing evaluation batch size. Must be
-        divisible by total number of replicas.
-      predict_batch_size: An int representing the prediction batch size. Must be
-        divisible by total number of replicas.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards. For example, if your input_fn produced (images, labels)
-        where the images tensor is in `HWCN` format, your shard dimensions would
-        be [3, 0], where 3 corresponds to the `N` dimension of your images
-        Tensor, and 0 corresponds to the dimension along which to split the
-        labels to match up with the corresponding images. If None is supplied,
-        and per_host_input_for_training is True, batches will be sharded based
-        on the major dimension. If tpu_config.per_host_input_for_training is
-        False or `PER_HOST_V2`, batch_axis is ignored.
-      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
-        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
-      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU. Note that unsupported export modes such as EVAL will be
-        ignored. For those modes, only a CPU model will be exported.
-        Currently, export_to_tpu only supports PREDICT.
-      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on CPU.
-      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
-        configure warm-starting.  If the string filepath is provided instead of
-        a `WarmStartSettings`, then all variables are warm-started, and it is
-        assumed that vocabularies and Tensor names are unchanged.
-      experimental_exported_model_uses_all_cores: Whether to round-robin among
-        all cores visible to the host which is serving the saved model, or to
-        use a single core. This is a temporary flag to enable using all TPU
-        cores for inference with TPUPartitionedCall(). Once outside compilation
-        is supported in TPUPartitionedCall(), this flag will be enabled by
-        default.
-
-    Raises:
-      ValueError: `params` has reserved keys already.
-    """
-    if config is None or not isinstance(config, tpu_config.RunConfig):
-      raise ValueError(
-          '`config` must be provided with type `tpu_config.RunConfig`')
-
-    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError('{} are reserved keys but existed in params {}.'.format(
-          _RESERVED_PARAMS_KEYS, params))
-
-    if use_tpu:
-      # Perform some very basic validations. More validations will be found in
-      # _InternalTPUContext.
-      if train_batch_size is None:
-        raise ValueError('`train_batch_size` cannot be `None`')
-      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
-
-      if (config.tpu_config.per_host_input_for_training is
-          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
-          config.tpu_config.num_cores_per_replica):
-        raise ValueError(
-            'Model parallelism only supports per host input for training. '
-            'Please adjust TPURunconfig.per_host_input_for_training.')
-
-      if eval_batch_size is not None:
-        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
-
-      if predict_batch_size is not None:
-        util_lib.check_positive_integer(predict_batch_size,
-                                        'predict_batch_size')
-
-    # Verifies the model_fn signature according to Estimator framework.
-    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
-    # We cannot store config and params in this constructor as parent
-    # constructor might change them, such as assigning a temp dir for
-    # config.model_dir.
-    model_function = self._augment_model_fn(model_fn, batch_axis)
-
-    # Overwrite log_step_count_steps to disable TensorLoggingHook and
-    # StepCounterHook from being created in Estimator. TPUEstimator already
-    # added equivalent hooks in _augment_model_fn above.
-    self._log_every_n_steps = config.log_step_count_steps
-    config = config.replace(log_step_count_steps=None)
-
-    # Passing non-None params as wrapped model_fn has it.
-    params = params or {}
-    super(TPUEstimator, self).__init__(
-        model_fn=model_function,
-        model_dir=model_dir,
-        config=config,
-        params=params,
-        warm_start_from=warm_start_from)
-    self._iterations_per_training_loop = (
-        self._config.tpu_config.iterations_per_loop)
-
-    # All properties passed to _InternalTPUContext are immutable.
-    # pylint: disable=protected-access
-    self._ctx = tpu_context._get_tpu_context(
-        self._config, train_batch_size, eval_batch_size, predict_batch_size,
-        use_tpu, eval_on_tpu)
-
-    self._export_to_cpu = export_to_cpu
-    self._export_to_tpu = export_to_tpu
-    self._experimental_exported_model_uses_all_cores = (
-        experimental_exported_model_uses_all_cores)
-
-    self._is_input_fn_invoked = None
-    self._rendezvous = {}
-
-  def _add_meta_graph_for_mode(self,
-                               builder,
-                               input_receiver_fn_map,
-                               checkpoint_path,
-                               save_variables=True,
-                               mode=model_fn_lib.ModeKeys.PREDICT,
-                               export_tags=None,
-                               check_variables=True):
-    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
-                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
-                      'for TPU.'.format(mode))
-
-    if not self._export_to_cpu and not self._export_to_tpu:
-      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
-
-    if self._export_to_cpu:
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables))
-
-    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
-      input_receiver_fn_map = {
-          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
-      }
-      export_tags = [tag_constants.SERVING, tag_constants.TPU]
-      mode = _REWRITE_FOR_INFERENCE_MODE
-
-      # See b/110052256 for why `check_variables` is `False`.
-      if not self._export_to_cpu:
-        check_variables = save_variables = True
-      else:
-        check_variables = save_variables = False
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables=save_variables,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=check_variables))
-
-  def _call_model_fn(self, features, labels, mode, config):
-    if mode == _REWRITE_FOR_INFERENCE_MODE:
-      return self._call_model_fn_for_inference(features, labels, mode, config)
-    else:
-      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
-                                                      config)
-
-  def _call_model_fn_for_inference(self, features, labels, mode, config):
-    """Wraps `_call_model_fn` for `export_savedmodel`."""
-    if mode != _REWRITE_FOR_INFERENCE_MODE:
-      raise ValueError('mode must be {}; '
-                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
-
-    computation, capture = self._build_computation_for_inference(
-        features, labels, mode, config)
-    tensors = call_computation(
-        computation,
-        experimental_exported_model_uses_all_cores=self
-        ._experimental_exported_model_uses_all_cores)
-    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
-        capture.get())
-    predictions_list = tensors[:len(predictions_dict)]
-    export_outputs_list_without_none = tensors[len(predictions_dict):]
-
-    # Reinsert `None`s which we've taken out in
-    # `_build_computation_for_inference()`.
-    export_outputs_list = []
-    while none_indices or export_outputs_list_without_none:
-      if none_indices and none_indices[0] == len(export_outputs_list):
-        export_outputs_list.append(None)
-        none_indices.pop(0)
-      else:
-        export_outputs_list.append(export_outputs_list_without_none.pop(0))
-
-    # Reconstruct `export_outputs` with updated tensors.
-    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
-                                                    export_outputs_list)
-    export_outputs = estimator_spec.export_outputs
-    new_export_outputs = collections.OrderedDict(
-        (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_export_outputs_dict))
-    # Reconstruct `predictions` with updated tensors.
-    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
-    if (len(new_predictions) == 1 and
-        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
-      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
-
-    return estimator_spec._replace(
-        export_outputs=new_export_outputs, predictions=new_predictions)
-
-  def _build_computation_for_inference(self, features, labels, mode, config):
-    capture = _CapturedObject()
-
-    def computation():
-      """Computation to be passed to `TPUPartitionedCall()`."""
-      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
-          features, labels, mode, config)
-
-      tensors_on_cpu = tpu.rewrite_for_inference(tpu_computation)
-      (estimator_spec, export_outputs_dict, export_outputs_list,
-       predictions_dict) = (
-           tpu_capture.get())
-      predictions_list = tensors_on_cpu[:len(predictions_dict)]
-      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
-
-      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
-      # with their CPU counterpart returned from `rewrite_for_inference()`.
-      # `function.Defun()` does not like `None`s in return values, so we leave
-      # `None`s out but record their positions for later reconstruction.
-      export_outputs_list_without_none = []
-      none_indices = []
-      for i, t in enumerate(export_outputs_list):
-        if t is None:
-          none_indices.append(i)
-        else:
-          export_outputs_list_without_none.append(
-              export_outputs_tpu_on_cpu_list.pop(0))
-
-      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
-                       none_indices))
-      return predictions_list + export_outputs_list_without_none
-
-    return computation, capture
-
-  def _build_tpu_computation_for_inference(self, features, labels, mode,
-                                           config):
-    capture = _CapturedObject()
-
-    def computation():
-      """Compute tpu tensors used in export_outputs.
-
-      Passed to rewrite_for_inference so that model_fn will be called under
-      the rewriting contexts. Only tpu tensors are returned, but export_outputs
-      and scaffold are captured.
-
-      Returns:
-         A list of Tensors used in export_outputs and not marked for
-         outside_compilation.
-      """
-      # We should only call model fn once and it should be inside `computation`
-      # so that building the graph will happen under `rewrite_for_inference`.
-      mode = model_fn_lib.ModeKeys.PREDICT
-      estimator_spec = self._call_model_fn(features, labels, mode, config)
-
-      # We pick the TPU tensors out from `export_output` and later return them
-      # from `computation` for rewriting.
-      export_outputs_dict = collections.OrderedDict(
-          (k, _export_output_to_tensors(v))
-          for k, v in six.iteritems(estimator_spec.export_outputs))
-      export_outputs_list = nest.flatten(export_outputs_dict)
-      export_outputs_tpu_list = [
-          t for t in export_outputs_list if t is not None
-      ]
-
-      if isinstance(estimator_spec.predictions, dict):
-        predictions_dict = collections.OrderedDict(
-            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
-      else:
-        predictions_dict = {
-            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
-        }
-      predictions_list = nest.flatten(predictions_dict)
-
-      # We cannot return everything we want through the return values, so
-      # capture the rest here for later use.
-      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
-                       predictions_dict))
-      return predictions_list + export_outputs_tpu_list
-
-    return computation, capture
-
-  def _create_global_step(self, graph):
-    """Creates a global step suitable for TPUs.
-
-    Args:
-      graph: The graph in which to create the global step.
-
-    Returns:
-      A global step `Tensor`.
-
-    Raises:
-      ValueError: if the global step tensor is already defined.
-    """
-    return _create_global_step(graph)
-
-  def _convert_train_steps_to_hooks(self, steps, max_steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
-            steps, max_steps)
-
-    # On TPU.
-    if steps is None and max_steps is None:
-      raise ValueError(
-          'For TPU training, one of `steps` or `max_steps` must be set. '
-          'Cannot be both `None`.')
-
-    # Estimator.train has explicit positiveness check.
-    if steps is not None:
-      util_lib.check_positive_integer(steps, 'Train steps')
-    if max_steps is not None:
-      util_lib.check_positive_integer(max_steps, 'Train max_steps')
-
-    return [
-        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
-    ]
-
-  def _convert_eval_steps_to_hooks(self, steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
-
-    if steps is None:
-      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
-
-    util_lib.check_positive_integer(steps, 'Eval steps')
-
-    return [
-        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
-            num_evals=steps),
-        _SetEvalIterationsHook(steps)
-    ]
-
-  def _call_input_fn(self, input_fn, mode):
-    """Calls the input function.
-
-    Args:
-      input_fn: The input function.
-      mode: ModeKeys
-
-    Returns:
-      In TPU mode, returns an input_fn to be called later in model_fn.
-      Otherwise, calls the input_fn and returns either fatures or
-        (features, labels).
-
-    Raises:
-      ValueError: if input_fn takes invalid arguments or does not have `params`.
-    """
-    input_fn_args = function_utils.fn_args(input_fn)
-    config = self.config  # a deep copy.
-    kwargs = {}
-    if 'params' in input_fn_args:
-      kwargs['params'] = self.params  # a deep copy.
-    else:
-      raise ValueError('input_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params["batch_size"]'.format(input_fn))
-    if 'config' in input_fn_args:
-      kwargs['config'] = config
-
-    if 'mode' in input_fn_args:
-      kwargs['mode'] = mode
-
-    # Records the fact input_fn has been invoked.
-    self._is_input_fn_invoked = True
-
-    with self._ctx.with_mode(mode) as ctx:
-      # Setting the batch size in params first. This helps user to have same
-      # input_fn for use_tpu=True/False.
-      batch_size_for_input_fn = ctx.batch_size_for_input_fn
-      if batch_size_for_input_fn is not None:
-        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
-                            batch_size_for_input_fn)
-
-      # For export_savedmodel, input_fn is never passed to Estimator. So,
-      # `is_export_mode` must be False.
-      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
-          return input_fn(**kwargs)
-
-      # For TPU computation, input_fn should be invoked in a tf.while_loop for
-      # performance. While constructing the tf.while_loop, the structure of
-      # inputs returned by the `input_fn` needs to be recorded. The structure
-      # includes whether features or labels is dict or single Tensor, dict keys,
-      # tensor shapes, and dtypes. The recorded structure is used to create the
-      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
-      # inside the TPU computation, as the TPU computation is wrapped inside a
-      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
-      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
-      # `features` in `model_fn` signature.
-      def _input_fn(ctx):
-        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
-        return input_fn(**kwargs)
-
-      return _input_fn
-
-  def _validate_features_in_predict_input(self, result):
-    """Skip the validation.
-
-    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
-    has stronger check. Parent class's check generates confusing warning msg.
-
-    Args:
-      result: `features` returned by input_fn.
-    """
-    pass
-
-  def train(self,
-            input_fn,
-            hooks=None,
-            steps=None,
-            max_steps=None,
-            saving_listeners=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
-    try:
-      return super(TPUEstimator, self).train(
-          input_fn=input_fn,
-          hooks=hooks,
-          steps=steps,
-          max_steps=max_steps,
-          saving_listeners=saving_listeners)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('training_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('training_loop')
-      rendezvous.raise_errors()
-
-  def evaluate(self,
-               input_fn,
-               steps=None,
-               hooks=None,
-               checkpoint_path=None,
-               name=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
-    try:
-      return super(TPUEstimator, self).evaluate(
-          input_fn,
-          steps=steps,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          name=name)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('evaluation_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('evaluation_loop')
-      rendezvous.raise_errors()
-
-  def predict(self,
-              input_fn,
-              predict_keys=None,
-              hooks=None,
-              checkpoint_path=None,
-              yield_single_examples=True):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
-    try:
-      for result in super(TPUEstimator, self).predict(
-          input_fn=input_fn,
-          predict_keys=predict_keys,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          yield_single_examples=yield_single_examples):
-        yield result
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('prediction_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('prediction_loop')
-      rendezvous.raise_errors()
-
-    rendezvous.record_done('prediction_loop')
-    rendezvous.raise_errors()
-
-  def _augment_model_fn(self, model_fn, batch_axis):
-    """Returns a new model_fn, which wraps the TPU support."""
-
-    def _model_fn(features, labels, mode, config, params):
-      """A Estimator `model_fn` for TPUEstimator."""
-      with self._ctx.with_mode(mode) as ctx:
-        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
-
-        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
-        # but not in `export_savedmodel()`.
-        if self._is_input_fn_invoked:
-          is_export_mode = False
-        else:
-          is_export_mode = True
-
-        # Clear the bit.
-        self._is_input_fn_invoked = None
-
-        # examples_hook is added to training_hooks for both CPU and TPU
-        # execution.
-        if self._log_every_n_steps is not None:
-          examples_hook = ExamplesPerSecondHook(
-              ctx.global_batch_size,
-              # pylint:disable=g-long-ternary
-              output_dir=(self.model_dir
-                          if not config or config.save_summary_steps
-                          else None),
-              # pylint:enable=g-long-ternary
-              every_n_steps=self._log_every_n_steps)
-
-        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
-          logging.info('Running %s on CPU', mode)
-          estimator_spec = model_fn_wrapper.call_without_tpu(
-              features, labels, is_export_mode=is_export_mode)
-          if self._log_every_n_steps is not None:
-            estimator_spec = estimator_spec._replace(
-                training_hooks=estimator_spec.training_hooks + (examples_hook,))
-          return estimator_spec
-
-        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
-        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
-        assert callable(features), '`input_fn` is not callable.'
-        input_fn = features
-
-        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
-            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
-
-        graph = ops.get_default_graph()
-        for enqueue_op in enqueue_ops:
-          if isinstance(enqueue_op, list):
-            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
-          else:
-            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
-
-        if mode == model_fn_lib.ModeKeys.TRAIN:
-          compile_op, loss, host_call, scaffold, training_hooks = (
-              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          host_ops = host_call.create_tpu_hostcall()
-          if host_ops is None:
-            host_ops = []
-
-          shutdown_hooks = []
-          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
-                                         'shutdown_worker')
-          if shutdown_mode:
-            if shutdown_mode == 'shutdown_worker':
-              finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
-              ]
-            elif shutdown_mode == 'shutdown_computation':
-              finalizer_hooks = [
-                  session_support.RestartComputation(timeout_ms=60 * 1000),
-              ]
-            else:
-              raise ValueError(
-                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
-
-            shutdown_hooks.append(
-                session_support.GracefulShutdownHook(
-                    checkpoint_prefix=self.model_dir + '/model.ckpt',
-                    on_shutdown_hooks=finalizer_hooks))
-
-          with ops.control_dependencies([loss]):
-            global_step = array_ops.identity(training.get_global_step())
-          hooks = input_hooks + shutdown_hooks
-          hooks.extend([
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.master,
-                  session_config=self._session_config,
-              ),
-              InstallSignalHandlerHook()
-          ])
-          if self._log_every_n_steps is not None:
-            logging_hook_frequency = (  # Divide and round up
-                (self._log_every_n_steps +
-                 self._config.tpu_config.iterations_per_loop - 1) //
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(
-                training.LoggingTensorHook({
-                    'loss': array_ops.identity(loss),
-                    'step': global_step,
-                },
-                                           every_n_iter=logging_hook_frequency))
-            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(examples_hook)
-
-          if training_hooks:
-            hooks.extend(training_hooks)
-
-          chief_hooks = []
-          if (self._config.save_checkpoints_secs or
-              self._config.save_checkpoints_steps):
-            checkpoint_hook = training.CheckpointSaverHook(
-                self.model_dir,
-                save_secs=self._config.save_checkpoints_secs,
-                save_steps=self._config.save_checkpoints_steps,
-                scaffold=scaffold)
-            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            chief_hooks.append(checkpoint_hook)
-
-          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
-          with ops.control_dependencies([loss]):
-            update_ops = _sync_variables_ops(ctx)
-
-          # Validate the TPU training graph to catch basic errors
-          _validate_tpu_training_graph()
-
-          train_op = control_flow_ops.group(*update_ops)
-          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=loss,
-              training_chief_hooks=chief_hooks,
-              training_hooks=hooks,
-              train_op=train_op,
-              scaffold=scaffold)
-
-        if mode == model_fn_lib.ModeKeys.EVAL:
-          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
-              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          iterations_per_loop_var = _create_or_get_iterations_per_loop()
-          mean_loss = math_ops.div(
-              total_loss,
-              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
-
-          with ops.control_dependencies([mean_loss]):
-            # After TPU evaluation computation is done (the mean_loss tensor),
-            # reads all variables back from TPU and updates the eval step
-            # counter properly
-            internal_ops_to_run = _sync_variables_ops(ctx)
-            internal_ops_to_run.append(
-                _increase_eval_step_op(iterations_per_loop_var))
-
-          host_call_ret = host_calls.create_tpu_hostcall()
-          eval_metric_ops = {}
-          eval_update_ops = []
-
-          eval_metrics = host_call_ret.get('eval_metrics', {})
-          if eval_metrics:
-            # Creates a dummy metric update_op for all metrics. Estimator
-            # expects all metrics in `eval_metric_ops` have update_op and calls
-            # them one by one. The real metric update_ops are invoked in a
-            # separated thread. So, here give Estimator the dummy op for all
-            # metrics.
-            with ops.control_dependencies(internal_ops_to_run):
-              dummy_update_op = control_flow_ops.no_op()
-
-            for k, v in eval_metrics.items():
-              eval_metric_ops[k] = (v[0], dummy_update_op)
-              eval_update_ops.append(v[1])
-          else:
-            # If no eval metrics are passed, create an identity node for the
-            # loss and add `internal_ops_to_run` to its dependencies. So
-            # `internal_ops_to_run` can be executed.
-            with ops.control_dependencies(internal_ops_to_run):
-              mean_loss = array_ops.identity(mean_loss)
-
-          if 'host_call' not in host_call_ret:
-            host_ops = []
-          else:
-            host_ops = host_call_ret['host_call']
-          hooks = [
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  eval_update_ops + host_ops,
-                  tpu_compile_op=compile_op,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.evaluation_master,
-                  session_config=self._session_config,
-              )] + input_hooks
-
-          if eval_hooks:
-            hooks.extend(eval_hooks)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=mean_loss,
-              evaluation_hooks=hooks,
-              eval_metric_ops=eval_metric_ops,
-              scaffold=scaffold)
-
-        # Predict
-        assert mode == model_fn_lib.ModeKeys.PREDICT
-
-        (compile_op, dummy_predict_op, host_calls,
-         scaffold, prediction_hooks) = _predict_on_tpu_system(
-             ctx, model_fn_wrapper, dequeue_fn)
-        with ops.control_dependencies([dummy_predict_op]):
-          internal_ops_to_run = _sync_variables_ops(ctx)
-          with ops.control_dependencies(internal_ops_to_run):
-            dummy_predict_op = control_flow_ops.no_op()
-
-        # In train and evaluation, the main TPU program is passed to monitored
-        # training session to run. Infeed enqueue and outfeed dequeue are
-        # executed in side threads. This is not the configuration for
-        # prediction mode.
-        #
-        # For prediction, the Estimator executes the EstimatorSpec.predictions
-        # directly and yield the element (via generator) to call site. So, the
-        # outfeed based prediction must be passed to MonitoredSession directly.
-        # Other parts of the TPU execution are organized as follows.
-        #
-        # 1. All outfeed based Tensors must be grouped with predictions Tensors
-        #    to form a single invocation. This avoid the issue we might trigger
-        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
-        #    placed in control_dependencies of `stopping_signals`, and
-        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
-        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
-        #    all SessionRunArgs with the fetch in session.run together.
-        #
-        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
-        #    are grouped together. They will be launched once and only once in
-        #    side threads and they quit naturally according to the SAME stopping
-        #    condition.
-        enqueue_ops.append(dummy_predict_op)
-
-        host_call_ret = host_calls.create_tpu_hostcall()
-        if 'host_call' not in host_call_ret:
-          host_ops = []
-        else:
-          host_ops = host_call_ret['host_call']
-
-        predictions = host_call_ret['predictions']
-        _verify_cross_hosts_transfer_size(
-            predictions,
-            message=(
-                'The estimated size for TPUEstimatorSpec.predictions is too '
-                'large.'))
-        signals = host_call_ret['signals']
-
-        with ops.control_dependencies(host_ops):
-          host_ops = []  # Empty, we do do not need it anymore.
-          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              signals)
-          predictions = _PaddingSignals.slice_tensor_or_dict(
-              predictions, signals)
-
-        hooks = [
-            _StoppingPredictHook(scalar_stopping_signal),
-            TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
-                tpu_compile_op=compile_op,
-                master=self._config.master,
-                session_config=self._session_config),
-        ] + input_hooks
-
-        if prediction_hooks:
-          hooks.extend(prediction_hooks)
-
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            prediction_hooks=hooks,
-            predictions=predictions,
-            scaffold=scaffold)
-
-    return _model_fn
-
-
-def _export_output_to_tensors(export_output):
-  """Get a list of `Tensors` used in `export_output`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-
-  Returns:
-    a list of tensors used in export_output.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    return [export_output.scores, export_output.classes]
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    return [export_output.value]
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return list(export_output.outputs.values())
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _clone_export_output_with_tensors(export_output, tensors):
-  """Clones `export_output` but with new `tensors`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-    tensors: a list of `Tensors` used to construct a new `export_output`.
-
-  Returns:
-    A dict similar to `export_output` but with `tensors`.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    if len(tensors) != 2:
-      raise ValueError('tensors must be of length 2; '
-                       'got {}.'.format(len(tensors)))
-    return export_output_lib.ClassificationOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    if len(tensors) != 1:
-      raise ValueError('tensors must be of length 1; '
-                       'got {}'.format(len(tensors)))
-    return export_output_lib.RegressionOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output_lib.PredictOutput(
-        dict(zip(export_output.outputs.keys(), tensors)))
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
-
-  def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
-                                [_ZERO_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_eval_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
-
-
-def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_train_step, host_call, captured_scaffold_fn,
-   captured_training_hooks) = (
-       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
-
-  def multi_tpu_train_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
-                                [_INITIAL_LOSS])
-
-  (compile_op, loss,) = tpu.split_compile_and_shard(
-      multi_tpu_train_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  loss = loss[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
-
-
-def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
-   captured_predict_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
-
-  def multi_tpu_predict_steps_on_single_shard():
-
-    def cond(scalar_stopping_signal):
-      return math_ops.logical_not(
-          _StopSignals.should_stop(scalar_stopping_signal))
-
-    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
-    outputs = training_loop.while_loop(
-        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
-    return outputs
-
-  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
-      multi_tpu_predict_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  dummy_predict_op = dummy_predict_op[0]
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return (compile_op, dummy_predict_op, host_calls, scaffold,
-          captured_predict_hooks.get())
-
-
-def _wrap_computation_in_while_loop(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def computation(i):
-    with ops.control_dependencies(op_fn()):
-      return i + 1
-
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    iterations = array_ops.identity(iterations_per_loop_var)
-    return control_flow_ops.while_loop(
-        lambda i: i < iterations,
-        computation, [constant_op.constant(0)],
-        parallel_iterations=1)
-
-
-def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def cond(scalar_stopping_signal):
-    return math_ops.logical_not(
-        _StopSignals.should_stop(scalar_stopping_signal))
-
-  def computation(unused_scalar_stopping_signal):
-    return_value = op_fn()
-    execute_ops = return_value['ops']
-    signals = return_value['signals']
-    with ops.control_dependencies(execute_ops):
-      return _StopSignals.as_scalar_stopping_signal(signals)
-
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    return control_flow_ops.while_loop(
-        cond,
-        computation, [_StopSignals.NON_STOPPING_SIGNAL],
-        parallel_iterations=1)
-
-
-def _validate_tpu_training_graph():
-  """Validate graph before running distributed training.
-
-  Raises:
-    ValueError: If the graph seems invalid for running on device
-  """
-  operations = ops.get_default_graph().get_operations()
-
-  # Check if there is atleast one CrossReplicaSum operation in the graph
-  # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [
-      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
-  ]
-  if not cross_replica_sum_ops:
-    raise ValueError(
-        'CrossShardOptimizer must be used for model training on TPUs.')
-
-
-class _CapturedObject(object):
-  """A placeholder to capture an object.
-
-  This is useful when we need to capture a Python object in the Tensorflow
-  control flow body function and use it outside the control flow.
-  """
-
-  def __init__(self):
-    self._object = None
-    self._captured = False
-
-  def capture(self, o):
-    if self._captured:
-      raise RuntimeError(
-          'InternalError: Object can capture only once. Please file bug.')
-
-    self._captured = True
-    self._object = o
-
-  def get(self):
-    if not self._captured:
-      raise RuntimeError(
-          'InternalError: Object is not captured properly before `get`. '
-          'Please file bug.')
-    return self._object
-
-
-def _get_scaffold(captured_scaffold_fn):
-  """Retrieves the Scaffold from `captured_scaffold_fn`."""
-  with _CapturingContext(message='Inside scaffold_fn'):
-    scaffold_fn = captured_scaffold_fn.get()
-    if scaffold_fn:
-      scaffold = scaffold_fn()
-      if scaffold is None:
-        raise ValueError(
-            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
-    else:
-      scaffold = None
-
-  if scaffold:
-    wrapped_finalize = scaffold.finalize
-
-    def _finalize():
-      with _CapturingContext('Inside Scaffold.finalize'):
-        wrapped_finalize()
-
-    scaffold.finalize = _finalize
-  return scaffold
-
-
-class _CapturingContext(control_flow_ops.ControlFlowContext):
-  """Tracks references to Tensors defined in TPU replication."""
-
-  def __init__(self, message):
-    control_flow_ops.ControlFlowContext.__init__(self)
-    self._message = message
-
-  def to_control_flow_context_def(self, context_def, export_scope=None):
-    # pylint: disable=useless-super-delegation
-    # NOTE(slebedev): the method is required by `ControlFlowContext`.
-    super(_CapturingContext, self).to_control_flow_context_def(
-        context_def, export_scope)
-
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    for c in op.inputs:
-      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError('{}: Op {} depends on TPU computation {}, '
-                         'which is not allowed.'.format(self._message, op, c))
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._old = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    # pylint: enable=protected-access
-
-  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
-    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
-
-
-class _Inputs(object):
-  """A data structure representing the input_fn returned values.
-
-  This also supports the returned value from input_fn as `Dataset`.
-  """
-
-  def __init__(self, features=None, labels=None, dataset=None, signals=None):
-    if dataset is not None and (features is not None or labels is not None or
-                                signals is not None):
-      raise RuntimeError('Internal Error: Either (features and labels) or '
-                         'dataset should be provided, not both. Please file '
-                         'bug')
-
-    self._features = features
-    self._labels = labels
-    self._signals = signals
-
-    self._dataset = dataset
-    self._iterator = None
-
-  @staticmethod
-  def from_input_fn(return_values):
-    """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.DatasetV2):
-      dataset = return_values
-      return _Inputs(dataset=dataset)
-
-    features, labels = _Inputs._parse_inputs(return_values)
-    return _Inputs(features, labels)
-
-  @staticmethod
-  def _parse_inputs(return_values):
-    if isinstance(return_values, tuple):
-      features, labels = return_values
-    else:
-      features, labels = return_values, None
-    return features, labels
-
-  @property
-  def is_dataset(self):
-    """Returns True if the return value from input_fn is Dataset."""
-    return self._dataset is not None
-
-  def dataset_initializer(self):
-    """Returns the dataset's initializer.
-
-    The initializer must be run before calling `features_and_labels`.
-    """
-    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return self._iterator.initializer
-
-  def features_and_labels(self):
-    """Gets `features` and `labels`."""
-    if self.is_dataset:
-      if self._iterator is None:
-        raise RuntimeError('Internal error: Must run dataset_initializer '
-                           'before calling features_and_labels(). Please file '
-                           'a bug!')
-      return _Inputs._parse_inputs(self._iterator.get_next())
-
-    return (self._features, self._labels)
-
-  def signals(self):
-    return self._signals
-
-  @property
-  def dataset(self):
-    return self._dataset
-
-
-class _InputsWithStoppingSignals(_Inputs):
-  """Inputs with `_StopSignals` inserted into the dataset."""
-
-  def __init__(self,
-               dataset,
-               batch_size,
-               add_padding=False,
-               num_invocations_per_step=1):
-
-    assert dataset is not None
-    user_provided_dataset = dataset.map(
-        _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size, add_padding=add_padding))
-    if num_invocations_per_step == 1:
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-    else:
-      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
-      # user_provided_dataset and stop properly.
-      # For example, if num_invocations_per_step is 2, we append 3 additional
-      # padding batches: b1, b2, b3.
-      # If user_provided_dataset contains two batches: a1, a2
-      # Step 1: [a1, a2]
-      # Step 2: [b1, b2] -> STOP
-      # If user_provided_dataset contains three batches: a1, a2, a3.
-      # The training loops:
-      # Step 1: [a1, a2]
-      # Step 2: [a3, b1]
-      # Step 3: [b2, b3] -> STOP.
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-      final_batch_dataset = final_batch_dataset.repeat(
-          2 * num_invocations_per_step - 1)
-
-      def _set_mask(data_dict):
-        signals = data_dict['signals']
-        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
-        data_dict['signals'] = signals
-        return data_dict
-
-      # Mask out the extra batch.
-      final_batch_dataset = final_batch_dataset.map(_set_mask)
-
-    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
-
-    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
-    self._current_inputs = None
-
-  def features_and_labels(self):
-    if self._current_inputs is not None:
-      raise RuntimeError(
-          'Internal Error: The previous inputs have not been properly '
-          'consumed. First call features_and_labels, then call signals.')
-
-    inputs_with_signals = self._iterator.get_next()
-    features = inputs_with_signals['features']
-    labels = inputs_with_signals.get('labels')
-
-    self._current_inputs = inputs_with_signals
-    return features, labels
-
-  def signals(self):
-    """Returns the `Signals` from `_Inputs`."""
-    if self._current_inputs is None:
-      raise RuntimeError(
-          'Internal Error: The current inputs have not been properly '
-          'generated. First call features_and_labels, then call signals.')
-    signals = self._current_inputs['signals']
-    self._current_inputs = None
-    return signals
-
-  @staticmethod
-  def insert_stopping_signal(stop, batch_size, add_padding=False):
-    """Inserts stopping_signal into dataset via _map_fn.
-
-    Here we change the data structure in the dataset, such that the return value
-    is a dictionary now and `features`, `labels`, and `signals` are three
-    distinguished keys in that dict. This provides a better structure, which
-    eases the process to decompose the inputs (see `features_and_labels`).
-
-    Args:
-      stop: bool, state of current stopping signals.
-      batch_size: int, batch size.
-      add_padding: bool, whether to pad the tensor to full batch size.
-
-    Returns:
-      A map_fn passed to dataset.map API.
-    """
-
-    def _map_fn(*args):
-      """The map fn to insert signals."""
-      if len(args) == 1:
-        # Unpack the single Tensor/dict argument as features. This is required
-        # for the input_fn returns no labels.
-        args = args[0]
-      features, labels = _Inputs._parse_inputs(args)
-      new_input_dict = {}
-
-      if add_padding:
-        padding_mask, features, labels = (
-            _PaddingSignals.pad_features_and_labels(features, labels,
-                                                    batch_size))
-
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-
-      else:
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-        padding_mask = None
-
-      new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size,
-          padding_mask=padding_mask).as_dict()
-
-      return new_input_dict
-
-    return _map_fn
-
-
-class _StopSignals(object):
-  """Signals class holding all logic to handle TPU stopping condition."""
-
-  NON_STOPPING_SIGNAL = False
-  STOPPING_SIGNAL = True
-
-  def __init__(self, stop, batch_size, padding_mask=None):
-    self._stop = stop
-    self._batch_size = batch_size
-    self._padding_mask = padding_mask
-
-  def as_dict(self):
-    """Returns the signals as Python dict."""
-    shape = [self._batch_size, 1]
-    dtype = dtypes.bool
-
-    if self._stop:
-      stopping = array_ops.ones(shape=shape, dtype=dtype)
-    else:
-      stopping = array_ops.zeros(shape=shape, dtype=dtype)
-
-    signals = {'stopping': stopping}
-    if self._padding_mask is not None:
-      signals['padding_mask'] = self._padding_mask
-    return signals
-
-  @staticmethod
-  def as_scalar_stopping_signal(signals):
-    return array_ops.identity(signals['stopping'][0][0])
-
-  @staticmethod
-  def should_stop(scalar_stopping_signal):
-    """Detects whether scalar_stopping_signal indicates stopping."""
-    if isinstance(scalar_stopping_signal, ops.Tensor):
-      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
-      # way to express the bool check whether scalar_stopping_signal is True.
-      return math_ops.logical_and(scalar_stopping_signal,
-                                  _StopSignals.STOPPING_SIGNAL)
-    else:
-      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
-      # the graph anymore. Here, we use pure Python.
-      return bool(scalar_stopping_signal)
-
-
-class _PaddingSignals(object):
-  """Signals class holding all logic to handle padding."""
-
-  @staticmethod
-  def pad_features_and_labels(features, labels, batch_size):
-    """Pads out the batch dimension of features and labels."""
-    real_batch_size = array_ops.shape(
-        _PaddingSignals._find_any_tensor(features))[0]
-
-    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
-
-    check_greater = check_ops.assert_greater_equal(
-        batch_size_tensor,
-        real_batch_size,
-        data=(batch_size_tensor, real_batch_size),
-        message='The real batch size should not be greater than batch_size.')
-
-    with ops.control_dependencies([check_greater]):
-      missing_count = batch_size_tensor - real_batch_size
-
-    def pad_single_tensor(tensor):
-      """Pads out the batch dimension of a tensor to the complete batch_size."""
-      rank = len(tensor.shape)
-      assert rank > 0
-      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
-      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
-      padded_tensor = array_ops.pad(tensor, padding)
-      padded_tensor.set_shape(padded_shape)
-      return padded_tensor
-
-    def nest_pad(tensor_or_dict):
-      return nest.map_structure(pad_single_tensor, tensor_or_dict)
-
-    features = nest_pad(features)
-    if labels is not None:
-      labels = nest_pad(labels)
-
-    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
-                                                 batch_size)
-
-    return padding_mask, features, labels
-
-  @staticmethod
-  def slice_tensor_or_dict(tensor_or_dict, signals):
-    """Slice the real Tensors according to padding mask in signals."""
-
-    padding_mask = signals['padding_mask']
-    batch_size = array_ops.shape(padding_mask)[0]
-
-    def verify_batch_size(tensor):
-      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
-      with ops.control_dependencies([check_batch_size]):
-        return array_ops.identity(tensor)
-
-    def slice_single_tensor(tensor):
-      rank = len(tensor.shape)
-      assert rank > 0
-      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
-      return verify_batch_size(tensor)[0:real_batch_size]
-
-    # As we split the Tensors to all TPU cores and concat them back, it is
-    # important to ensure the real data is placed before padded ones, i.e.,
-    # order is preserved. By that, the sliced padding mask should have all 0's.
-    # If this assertion failed, # the slice logic here would not hold.
-    sliced_padding_mask = slice_single_tensor(padding_mask)
-    assert_padding_mask = math_ops.equal(
-        math_ops.reduce_sum(sliced_padding_mask), 0)
-
-    with ops.control_dependencies([assert_padding_mask]):
-      should_stop = _StopSignals.should_stop(
-          _StopSignals.as_scalar_stopping_signal(signals))
-
-    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
-
-    def slice_fn(tensor):
-      # If the current batch is full batch or part of stopping signals, we do
-      # not need to slice to save performance.
-      return control_flow_ops.cond(
-          math_ops.logical_or(should_stop, is_full_batch),
-          (lambda: verify_batch_size(tensor)),
-          (lambda: slice_single_tensor(tensor)))
-
-    return nest.map_structure(slice_fn, tensor_or_dict)
-
-  @staticmethod
-  def _find_any_tensor(batch_features):
-    tensors = [
-        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
-    ]
-    if not tensors:
-      raise ValueError('Cannot find any Tensor in features dict.')
-    return tensors[0]
-
-  @staticmethod
-  def _padding_mask(real_batch_size, missing_count, batch_size):
-    padding_mask = array_ops.concat([
-        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
-        array_ops.ones((missing_count,), dtype=dtypes.int32)
-    ],
-                                    axis=0)
-    padding_mask.set_shape((batch_size,))
-    return padding_mask
-
-
-def _verify_cross_hosts_transfer_size(tensor_dict, message):
-  total_size = 0
-  tensor_structure = {}
-  for key, tensor in tensor_dict.items():
-    shape = tensor.shape
-    size = np.product(shape) * tensor.dtype.size
-    tensor_structure[key] = shape
-    total_size += size
-  if total_size >= _ONE_GIGABYTE:
-    raise ValueError(
-        '{} The transfer size is larger than the protobuf limit. Please '
-        'consider to use Tensors with smaller shapes or reduce batch '
-        'size. Given:\n'
-        '{}'.format(
-            message, '\n'.join([
-                ' -- Key: {}, Shape: {}'.format(k, v)
-                for k, v in tensor_structure.items()
-            ])))
-
-
-def _add_item_to_params(params, key, value):
-  """Adds a new item into `params`."""
-  if isinstance(params, hparam.HParams):
-    # For HParams, we need to use special API.
-    if key in params:
-      params.set_hparam(key, value)
-    else:
-      params.add_hparam(key, value)
-  else:
-    # Now params is Python dict.
-    params[key] = value
-
-
-def export_estimator_savedmodel(estimator,
-                                export_dir_base,
-                                serving_input_receiver_fn,
-                                assets_extra=None,
-                                as_text=False,
-                                checkpoint_path=None,
-                                strip_default_attrs=False):
-  """Export `Estimator` trained model for TPU inference.
-
-  Args:
-    estimator: `Estimator` with which model has been trained.
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    serving_input_receiver_fn: A function that takes no argument and returns a
-      `ServingInputReceiver` or `TensorServingInputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs.
-
-  Returns:
-    The string path to the exported directory.
-  """
-  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
-  # `estimator.config`.
-  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
-  est = TPUEstimator(
-      estimator._model_fn,  # pylint: disable=protected-access
-      config=config,
-      params=estimator.params,
-      use_tpu=True,
-      train_batch_size=2048,  # Does not matter.
-      eval_batch_size=2048,  # Does not matter.
-  )
-  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                               assets_extra, as_text, checkpoint_path,
-                               strip_default_attrs)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_estimator import *
+# used by tests
+from tensorflow.python.tpu.tpu_estimator import _clone_export_output_with_tensors
+from tensorflow.python.tpu.tpu_estimator import _create_global_step
+from tensorflow.python.tpu.tpu_estimator import _export_output_to_tensors
+from tensorflow.python.tpu.tpu_estimator import _get_scaffold
+from tensorflow.python.tpu.tpu_estimator import _Inputs
+from tensorflow.python.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
+from tensorflow.python.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
+from tensorflow.python.tpu.tpu_estimator import _TPU_ESTIMATOR
+from tensorflow.python.tpu.tpu_estimator import _TPU_TRAIN_OP
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index d5957b7e8ec40b40c7af8822378cee6134ef0d0f..af2542ea85290170ce6a38223188c4f9b871f032 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -1,898 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Helper library for handling infeed between hosts and TPUs.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.util import nest
-
-
-class InfeedQueue(object):
-  """A helper object to build a device infeed queue.
-
-  The InfeedQueue builds the host-side and device-side Ops to enqueue and
-  dequeue elements, respectively, and ensures that their types and
-  shapes match.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               shard_dimensions=None,
-               name=None):
-    """Creates a new InfeedQueue with the given configuration.
-
-    The configuration need not be fully specified at creation since it
-    can be modified subsequently by methods that set the values
-    explicitly or infer them from the shapes of inputs.
-
-    Args:
-      number_of_tuple_elements: the number of Tensors fed atomically through the
-        queue, must be present unless it can be inferred from other arguments.
-      tuple_types: if not None, a list of types of the elements of the queue.
-      tuple_shapes: if not None, a list of shapes of the elements of the queue.
-      shard_dimensions: if not None, a list of dimensions on which the
-        elements of the queue should be sharded during automatic
-        parallelization.
-      name: the name of the queue.
-
-    Raises:
-      ValueError: if number_of_tuple_elements <= 0; or
-        number_of_tuple_arguments, tuple_types, tuple_shapes, and
-        shard_dimensions are all None; or the length of tuple_types,
-        tuple_shapes, or shard_dimensions is not equal to
-        number_of_tuple_elements; or any element of shard_dimensions
-        can't be converted to a Dimension.
-      TypeError: if any element of tuple_types or tuple_shapes can't
-        be converted to a dtype or TensorShape, respectively.
-    """
-    self._frozen = False
-    self._generated_enqueue_ops = False
-    self._generated_dequeue_op = False
-    self._name = "InfeedQueue" if name is None else name
-    if number_of_tuple_elements is None:
-      if tuple_types is not None:
-        number_of_tuple_elements = len(tuple_types)
-      elif tuple_shapes is not None:
-        number_of_tuple_elements = len(tuple_shapes)
-      elif shard_dimensions is not None:
-        number_of_tuple_elements = len(shard_dimensions)
-      else:
-        raise ValueError(
-            "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor")
-    if number_of_tuple_elements <= 0:
-      raise ValueError("number_of_tuple_elements %d must be > 0" %
-                       number_of_tuple_elements)
-    # Make an empty sharding policy for each tuple element.
-    self._sharding_policies = [
-        tpu_sharding.ShardingPolicy()
-        for _ in xrange(number_of_tuple_elements)
-    ]
-    if tuple_types is not None:
-      self.set_tuple_types(tuple_types)
-    else:
-      self._tuple_types = None
-    if tuple_shapes is not None:
-      self.set_tuple_shapes(tuple_shapes)
-    else:
-      self._tuple_shapes = None
-    if shard_dimensions is not None:
-      self.set_shard_dimensions(shard_dimensions)
-    self._validate()
-
-  def _validate(self):
-    """Checks that the configuration is self-consistent.
-
-    Raises:
-      ValueError: if the shapes and sharding policies don't match.
-    """
-    if self.tuple_shapes is not None:
-      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
-        # Raise an error if the policy is incompatible with the shape.
-        _ = policy.get_sharded_shape(shape)
-
-  @property
-  def number_of_tuple_elements(self):
-    """Returns the number of InfeedQueue tuple elements."""
-    return len(self._sharding_policies)
-
-  @property
-  def tuple_types(self):
-    """Returns the types of the InfeedQueue tuple elements."""
-    return self._tuple_types
-
-  def set_tuple_types(self, tuple_types):
-    """Sets the type of each element of the queue.
-
-    tuple_types must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a dtype.
-
-    Args:
-      tuple_types: the types of each queue element.
-
-    Raises:
-      ValueError: if tuple_types is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_types cannot be converted to a
-        dtype.
-    """
-    if len(tuple_types) != self.number_of_tuple_elements:
-      raise ValueError("tuple_types is %s, but must be a list of length %d" %
-                       (str(tuple_types), self.number_of_tuple_elements))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_types, tuple_types):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible type. Frozen types are %s, updated types are %s" % (
-                  str(self._tuple_types), str(tuple_types)))
-    else:
-      try:
-        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
-      except (TypeError) as e:
-        raise TypeError(
-            "tuple_types is %s, but must be a list of elements each "
-            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
-
-  @property
-  def tuple_shapes(self):
-    """Returns the shapes of the InfeedQueue tuple elements."""
-    return self._tuple_shapes
-
-  def set_tuple_shapes(self, tuple_shapes):
-    """Sets the shape of each element of the queue.
-
-    tuple_shapes must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a TensorShape.
-
-    Args:
-      tuple_shapes: the shapes of each queue element.
-
-    Raises:
-      ValueError: if tuple_shapes is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_shapes cannot be converted to
-        a TensorShape.
-    """
-    if len(tuple_shapes) != self.number_of_tuple_elements:
-      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
-                       (str(tuple_shapes), self.number_of_tuple_elements))
-    try:
-      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
-    except (ValueError, TypeError) as e:
-      raise TypeError(
-          "tuple_shapes is %s, but must be a list of elements each "
-          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
-                                                        str(e)))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
-              % (str(self._tuple_shapes), str(tuple_shapes)))
-    else:
-      self._tuple_shapes = tuple_shapes
-    self._validate()
-
-  @property
-  def sharding_policies(self):
-    """Returns the sharding policies of the InfeedQueue tuple elements."""
-    return self._sharding_policies
-
-  @property
-  def shard_dimensions(self):
-    """Gets the shard dimension of each tuple element.
-
-    Returns:
-      A list of length number_of_tuple_elements, where each list entry
-      is the shard dimension of that tuple element or None if the
-      shard dimension has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return [policy.shard_dimension for policy in self._sharding_policies]
-
-  def set_shard_dimensions(self, shard_dimensions):
-    """Sets the shard_dimension of each element of the queue.
-
-    shard_dimensions must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a Dimension compatible with self.tuple_shapes.
-
-    Args:
-      shard_dimensions: the dimensions of each queue element.
-
-    Raises:
-      ValueError: if shard_dimensions is not of length
-        self.number_of_tuple_elements; or an element of
-        shard_dimensions cannot be converted to a Dimension; or an
-        element of shard_dimensions is a Dimension that is out of
-        range for the corresponding tuple element shape.
-    """
-    if len(shard_dimensions) != self.number_of_tuple_elements:
-      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
-                       % (str(shard_dimensions),
-                          self.number_of_tuple_elements))
-    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
-      policy.set_shard_dimension(dimension)
-    self._validate()
-
-  @property
-  def number_of_shards(self):
-    """Gets the number of shards to use for the InfeedQueue.
-
-    Returns:
-      Number of shards or None if the number of shards has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return self._sharding_policies[0].number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards to use for the InfeedQueue.
-
-    Args:
-      number_of_shards: number of ways to shard the InfeedQueue.
-
-    Raises:
-      ValueError: if number_of_shards is not > 0; or the policies have
-        been frozen and number_of_shards was already set to something
-        else.
-    """
-    for policy in self._sharding_policies:
-      policy.set_number_of_shards(number_of_shards)
-    self._validate()
-
-  def set_configuration_from_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of Tensors whose types and shapes are used
-    to set the queue configuration.
-
-    Args:
-      input_tensors: list of Tensors of the same types and shapes as
-        the desired queue Tuple.
-
-    Raises:
-      ValueError: if input_tensors is not a list of length
-        self.number_of_tuple_elements
-    """
-    if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
-                       % (str(input_tensors), self.number_of_tuple_elements))
-    self.set_tuple_shapes([t.shape for t in input_tensors])
-    self.set_tuple_types([t.dtype for t in input_tensors])
-
-  def set_configuration_from_sharded_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of lists of Tensors whose types and shapes are used
-    to set the queue configuration. The length of the outer list is the number
-    of shards required, and each inner list is the tuple of Tensors to use to
-    determine the types and shapes of the corresponding shard. This method
-    depends on the shard dimension, and calling it freezes the shard policy.
-
-    Args:
-      input_tensors: list of lists of Tensors. The outer list length corresponds
-        to the desired number of shards, and each inner list is the size
-        and shape of the desired configuration of the corresponding shard.
-
-    Raises:
-      ValueError: if any inner list is not a list of length
-        self.number_of_tuple_elements; or the inner lists do not combine to
-        form a consistent unsharded shape.
-      TypeError: if the types of the Tensors in the inner lists do not match.
-    """
-    if not self._frozen:
-      # Unset the tuple shapes in case the configuration becomes
-      # transiently inconsistent.
-      self._tuple_shapes = None
-    number_of_shards = len(input_tensors)
-    self.set_number_of_shards(number_of_shards)
-    for t in input_tensors:
-      if len(t) != self.number_of_tuple_elements:
-        raise ValueError(
-            "input_tensors is %s but must be a list of lists, where each inner"
-            " list has length number_of_tuple_elements=%d" % (
-                str(input_tensors), self.number_of_tuple_elements))
-    # Transpose the inputs to make a list of shard shapes for each tuple
-    # element.
-    sharded_shapes = [[t[i].shape for t in input_tensors]
-                      for i in xrange(self.number_of_tuple_elements)]
-    # For each tuple, get the unsharded shape using that tuple's policy.
-    unsharded_shapes = [
-        policy.get_unsharded_shape(s)
-        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
-    ]
-    self.set_tuple_shapes(unsharded_shapes)
-    for i in xrange(1, self.number_of_shards):
-      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
-        if t1.dtype != t2.dtype:
-          raise TypeError(
-              "types of the tuple elements of input_tensors %s are not "
-              "consistent" % str(input_tensors))
-    self.set_tuple_types([t.dtype for t in input_tensors[0]])
-
-  def freeze(self):
-    """Freezes the InfeedQueue so it can no longer be modified.
-
-    The configuration is implicitly frozen before any host-side or
-    device-side Ops are generated. The configuration cannot be frozen
-    until the types and shapes of the tuple elements have been set.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set.
-    """
-    self._frozen = True
-    if self._tuple_types is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple types.")
-    if self._tuple_shapes is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for shape in self._tuple_shapes:
-      if shape.dims is None:
-        raise ValueError(
-            "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for policy in self._sharding_policies:
-      policy.freeze()
-    self._validate()
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generates the device-side Op to dequeue a tuple from the queue.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen, which will raise errors if the shapes and types have not
-    been fully specified.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed. If None, no explicit placement will be performed, and it is up
-        to the user to call this API from within a proper TPU device scope.
-        The XLA code will fail if the TPU dequeue instruction is not bound to
-        any device.
-
-    Returns:
-      A list of Outputs corresponding to a shard of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    if tpu_device is not None:
-      with ops.device(tpu.core(tpu_device)):
-        return tpu_ops.infeed_dequeue_tuple(
-            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    else:
-      return tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-
-  def _generate_enqueue_op(self,
-                           inputs,
-                           name_prefix,
-                           index,
-                           device=None,
-                           tpu_ordinal=-1):
-    """Generate a host-side Op to enqueue a tuple to the queue.
-
-    If device is None the inputs are all required to have the same
-    device specification, and the enqueue Op is colocated with
-    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
-
-    Args:
-      inputs: a list of Tensors with the types and shapes of the tuple elements.
-      name_prefix: the base name for the Op.
-      index: the shard index, used to uniquify the Op name.
-      device: device to place the Op on, or None if it should be
-        colocated with the inputs.
-      tpu_ordinal: ordinal of the TPU device on the host to use for
-      infeed if device is a CPU device. Should be set to -1 if device
-      is a TPU device.
-
-    Returns:
-      An Op corresponding to a shard of infeed enqueued at the host,
-      suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if device is None and inputs do not all have the
-        same device specification.
-    """
-    full_name = "%s/%d" % (name_prefix, index)
-    shapes = [t.shape for t in inputs]
-    if device is None:
-      devices = [t.device for t in inputs]
-      for i in xrange(1, self.number_of_tuple_elements):
-        if devices[0] != devices[i]:
-          raise ValueError(
-              "input devices for shard %d are %s, but should all be the same" %
-              (index, str(devices)))
-      with ops.colocate_with(inputs[0]):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-    else:
-      with ops.device(device):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-
-  def generate_enqueue_ops(self,
-                           sharded_inputs,
-                           tpu_ordinal_function=None,
-                           placement_function=None):
-    """Generates the host-side Ops to enqueue the shards of a tuple.
-
-    sharded_inputs is a list, one for each shard, of lists of
-    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
-    shard 0 if the queue. Returns the host-side Ops that must be run to
-    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
-    for shard i.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of sharded_inputs, an error
-    will be raised.
-
-    Args:
-      sharded_inputs: a list of lists of Tensors. The length of the outer list
-        determines the number of shards. Each inner list indicates the types
-        and shapes of the tuples in the corresponding shard.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. tpu_ordinal_function must be
-        set if the inputs are placed on CPU devices.
-      placement_function: if not None, a function that takes the shard index as
-        input and returns the host device where the enqueue op should be placed
-        on.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    if tpu_ordinal_function is None:
-      tpu_ordinal_function = lambda index: -1
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            tpu_ordinal=tpu_ordinal_function(index),
-            device=placement_function(index) if placement_function else None)
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-  # TODO(misard) Generalize this to the case of systems that don't
-  # have 8 devices per host, and figure out what to do with
-  # model-parallelism.
-  def _default_placement_function(self, index):
-    return "/task:%d/device:CPU:0" % (index / 8)
-
-  def _default_ordinal_function(self, index):
-    return index % 8
-
-  # TODO(b/36470756) remove this from tutorials once we have a better story
-  # for automatic placement of input pipelines.
-  def split_inputs_and_generate_enqueue_ops(self,
-                                            inputs,
-                                            device_assignment=None,
-                                            placement_function=None,
-                                            tpu_ordinal_function=None):
-    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
-
-    Generates the host-side Ops to enqueue a tuple.
-
-    This method performs poorly because it takes an entire input on a single
-    host, splits it, and distributes it to all of the cores. It is present only
-    to simplify tutorial examples.
-
-    inputs is a list of Tensors to use to feed the queue. Each input is split
-    into self.number_of_shards shards. Returns an Op for each shard to enqueue
-    the shard. The Op for shard i is placed on device placement_function(i).
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of inputs, an error
-    will be raised.
-
-    Args:
-      inputs: a list of Tensors which indicates the types and shapes of the
-        queue tuple.
-     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
-        device_assignment is not `None`, but `placement_function` and
-        `ordinal_function` are None, then `device_assignment` will be used to
-        place infeeds on the first k TPU shards, where k is the number of shards
-        in the queue. If all three are `None`, then default placement and
-        ordinal functions are used.
-      placement_function: if not None, a function that takes the shard
-        index as input and returns a device string indicating which
-        device the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of inputs are not compatible with the frozen
-        configuration.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of inputs are not compatible with the frozen
-        configuration.
-    """
-    if device_assignment is None:
-      if placement_function is None:
-        placement_function = self._default_placement_function
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = self._default_ordinal_function
-    else:
-
-      def _placement_function_from_map(index):
-        return device_assignment.host_device(replica=index)
-
-      def _ordinal_function_from_map(index):
-        return device_assignment.tpu_ordinal(replica=index)
-
-      if placement_function is None:
-        placement_function = _placement_function_from_map
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = _ordinal_function_from_map
-    self.set_configuration_from_input_tensors(inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    split_name_prefix = "%s/split" % self._name
-    if self.number_of_shards == 1:
-      transposed_sharded_inputs = [[inp] for inp in inputs]
-    else:
-
-      def split_fn(inp, num_shards, axis, name):
-        with ops.colocate_with(inp):
-          return array_ops.split(inp, num_shards, axis=axis, name=name)
-
-      transposed_sharded_inputs = [
-          split_fn(
-              inp,
-              self.number_of_shards,
-              axis=policy.shard_dimension,
-              name="%s/%d" % (split_name_prefix, index))
-          for (inp, policy, index) in zip(inputs, self._sharding_policies,
-                                          xrange(self.number_of_tuple_elements))
-      ]
-    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
-                      for i in xrange(self.number_of_shards)]
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            device=placement_function(index),
-            tpu_ordinal=tpu_ordinal_function(index))
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-
-class _PartitionedInfeedQueue(InfeedQueue):
-  """A helper object to build a device infeed queue with input partition.
-
-  Args:
-    number_of_tuple_elements: the number of Tensors fed atomically through the
-      queue, must be present unless it can be inferred from other arguments.
-    device_assignment: A TPU `DeviceAssignment` which is used to place all the
-      partitions to different TPU infeed queues.
-    host_id: The id of the host machine.
-    input_partition_dims: A nested list/tuple of integers. Each inner
-      list/tuple describes how to partition the corresponding input tensor.
-    tuple_types: If not None, a list of types of the elements of the queue.
-    tuple_shapes: If not None, a list of shapes of the elements of the queue.
-    name: The name of the queue.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements,
-               device_assignment,
-               host_id,
-               input_partition_dims=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               name=None):
-    super(_PartitionedInfeedQueue, self).__init__(
-        number_of_tuple_elements=number_of_tuple_elements,
-        tuple_types=tuple_types,
-        tuple_shapes=None,
-        shard_dimensions=None,
-        name="PartitionedInfeedQueue" if name is None else name)
-    self._input_partition_dims = input_partition_dims
-    self._host_id = host_id
-    self._device_assignment = device_assignment
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generate TPU dequeue ops.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed.
-
-    Returns:
-      A list of Outputs corresponding to a partition of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    with ops.device(tpu.core(tpu_device)):
-      values = tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    return self._tag_sharding_attribute_for_dequeued_tensors(
-        values, self._input_partition_dims)
-
-  def generate_enqueue_ops(self, per_host_sharded_inputs):
-    """Generates the host-side Ops to enqueue the partitioned inputs.
-
-    per_host_sharded_inputs is a list, one for each replica, of lists of
-    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
-    replica i.
-    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
-
-    For example, if sharded_inputs[i][j] is a 2-D Tensor:
-    [[A, B, C, D],
-     [E ,F, G, H]]
-    self._input_partition_dims[j] is [2, 4].
-
-    sharded_inputs[i][j] will be partitioned and flattened into:
-    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
-    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
-
-    Args:
-      per_host_sharded_inputs: a list of lists of Tensors. The length of the
-        outer list determines the number of shards. Each inner list indicates
-        the types and shapes of the tuples in the corresponding shard.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints; or if the partition dims are invalid.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
-    number_of_replicas_per_host = len(per_host_sharded_inputs)
-    number_of_tuple_elements = len(per_host_sharded_inputs[0])
-
-    assert len(self._input_partition_dims) == number_of_tuple_elements
-    per_host_enqueue_ops = []
-
-    for replica_index in range(number_of_replicas_per_host):
-      flattened_inputs = per_host_sharded_inputs[replica_index]
-      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
-                                                 self._input_partition_dims)
-      inputs_parted_iters = [
-          iter(self._partition_or_replicate_on_host(x, dims)) for x, dims in
-          zip(per_host_sharded_inputs[replica_index], inputs_part_dims_flat)
-      ]
-
-      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
-        # Places different partitions to different logic cores.
-        replica_id = self._device_assignment.lookup_replicas(
-            self._host_id, logical_core)[replica_index]
-        ordinal = self._device_assignment.tpu_ordinal(
-            replica=replica_id, logical_core=logical_core)
-        infeed_inputs = []
-        for it in inputs_parted_iters:
-          input_for_device = next(it, None)
-          if input_for_device is not None:
-            infeed_inputs.append(input_for_device)
-
-        if infeed_inputs:
-          per_host_enqueue_ops.append(
-              tpu_ops.infeed_enqueue_tuple(
-                  inputs=infeed_inputs,
-                  shapes=[x.shape for x in infeed_inputs],
-                  name="enqueue/replica_{0}/input_{1}".format(
-                      replica_index, logical_core),
-                  device_ordinal=ordinal))
-    return per_host_enqueue_ops
-
-  def _check_input_partition_dims(self, tensor, dims):
-    """Checks that input partition dims are valid for the `Tensor`.
-
-    Args:
-      tensor: Input tensor for partitioning.
-      dims: 1-D np.array of the list of integer describes how to partition the
-        input tensor.
-
-    Raises:
-      ValueError: If the tensor can't be partitioned by dims or the
-        num_cores_per_replica doesn't match the number of
-        partitions(dims.prod()).
-    """
-    if (dims < 1).any():
-      raise ValueError("All input partition dims must be >= 1.")
-
-    # No partitioning, so don't perform further checks.
-    if dims.prod() == 1:
-      return
-
-    if dims.prod() != self._device_assignment.num_cores_per_replica:
-      raise ValueError(
-          "The product of each input parition dim should equal to "
-          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
-          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
-    if dims.shape[0] != tensor.shape.ndims:
-      raise ValueError(
-          "Input partition dims must have the same number of dimensions "
-          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
-          "partition dims = {}).".format(tensor.shape.as_list(), dims))
-
-    tensor.shape.assert_is_fully_defined()
-
-  def _partition_or_replicate_on_host(self, tensor, dims):
-    """Partitions or replicates the input tensor.
-
-      The ops inside this function are placed on the host side.
-
-    Args:
-      tensor: The input tensor which will be partioned or replicated.
-      dims: A list of integer describes how to partition the input tensor.
-    Returns:
-      An iterator of `Tensor`s or a list of partioned tensors.
-    """
-    if dims is None:
-      return itertools.repeat(tensor)
-    dims = np.array(dims)
-    self._check_input_partition_dims(tensor, dims)
-    output = [tensor]
-    shape_list = np.array(tensor.shape.as_list())
-    quotients, remainders = np.divmod(shape_list, dims)
-    for axis, (quotient, remainder, dim, original_size) in enumerate(
-        zip(quotients, remainders, dims, shape_list)):
-      if dim <= 1:
-        continue
-      if remainder > 0:
-        # For each dimension, when it cannot be evenly partitioned, XLA assumes
-        # tensors are partitioned in a greedy manner by using
-        # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
-        # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
-        # [[(3, 4), (3, 4), (2, 4), (2, 2)],
-        # [(2, 4), (2, 4), (2, 4), (2, 2)]]
-        ceil_ratio = quotient + 1
-        num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
-        num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
-        if len(num_or_size_splits) < dim:
-          num_or_size_splits += [0] * (dim - len(num_or_size_splits))
-        new_output = []
-        for x in output:
-          new_output.append(
-              array_ops.split(
-                  x, num_or_size_splits=num_or_size_splits, axis=axis))
-        output = new_output
-      else:
-        output = [array_ops.split(x, dim, axis=axis) for x in output]
-      output = nest.flatten(output)
-    return output
-
-  def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensor.
-
-    Args:
-      tensor: The dequeued tensor on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same tensor with the xla_sharding attribute.
-    """
-    if dims is None:
-      return xla_sharding.replicate(tensor)
-    elif np.prod(dims) == 1:
-      return xla_sharding.assign_device(tensor, 0)
-    else:
-      tile_assignment = np.arange(np.prod(dims)).reshape(dims)
-      return xla_sharding.tile(
-          tensor=tensor,
-          tile_assignment=tile_assignment)
-
-  def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensors.
-
-    Args:
-      dequeues: A list of dequeued tensors on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same dequeues with appropriate xla_sharding attribute.
-    """
-    nest.assert_shallow_structure(dequeues, dims)
-    return nest.map_structure_up_to(
-        dequeues, self._tag_sharding_attribute_for_dequeued_tensor, dequeues,
-        dims)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_feed import *
+# used by tests
+from tensorflow.python.tpu.tpu_feed import _PartitionedInfeedQueue
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 84d5967ea547f0c036f7c9aa936ac0c99c141304..f2755c6979c2e49dbc19b6800462949601811496 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -1,57 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for functions used during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
-
-class TpuContext(object):
-  """A context object holding state about the TPU computation being built."""
-
-  def __init__(self):
-    """Creates a new TpuContext."""
-    self._number_of_shards = None
-
-  @property
-  def number_of_shards(self):
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    self._number_of_shards = number_of_shards
-
-
-# The Tpu context holds the number of shards when a sharded computation is
-# being built, or None if no computation is being built.
-_current_tpu_context = TpuContext()
-
-
-@contextlib.contextmanager
-def tpu_shard_context(number_of_shards):
-  if _current_tpu_context.number_of_shards is not None:
-    raise NotImplementedError("tpu_shard_context cannot be nested.")
-  try:
-    _current_tpu_context.set_number_of_shards(number_of_shards)
-    yield
-  finally:
-    _current_tpu_context.set_number_of_shards(None)
-
-
-def get_tpu_context():
-  return _current_tpu_context
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_function import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index 1e11de6421e360faf0b9ad573a84f9aecdf9c98f..ca58e78d7b342c7ca70400652d99092ccbecbbde 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -1,203 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Optimizer that implements cross-shard gradient reduction for TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer
-
-
-class CrossShardOptimizer(optimizer.Optimizer):
-  """An optimizer that averages gradients across TPU shards."""
-
-  def __init__(self,
-               opt,
-               reduction=losses.Reduction.MEAN,
-               name="CrossShardOptimizer",
-               group_assignment=None):
-    """Construct a new cross-shard optimizer.
-
-    Args:
-      opt: An existing `Optimizer` to encapsulate.
-      reduction: The reduction to apply to the shard losses.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "CrossShardOptimizer".
-      group_assignment: Optional 2d int32 lists with shape
-        [num_groups, num_replicas_per_group] which describles how to apply
-        optimizer to subgroups.
-
-    Raises:
-      ValueError: If reduction is not a valid cross-shard reduction.
-    """
-    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
-      raise ValueError("Unsupported reduction: %s." % reduction)
-
-    super(CrossShardOptimizer, self).__init__(False, name)
-    self._opt = opt
-    self._reduction = reduction
-    self._group_assignment = group_assignment
-
-  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
-    """Verify group_assignment and get the subgroup size".
-
-    Args:
-      group_assignment: list of group ids for applying the optimizer
-        to subgroups.
-      num_shards: The number of TPU shards.
-
-    Returns:
-      The size of one subgroup in group_assignment.
-
-    Raises:
-      ValueError: If group_assignment is invalid.
-    """
-    if not group_assignment:
-      return None
-    if not (isinstance(group_assignment, list) and
-            all(isinstance(i, list) for i in group_assignment)):
-      raise ValueError("group_assignment must be a list of list. Got {}".format(
-          group_assignment))
-
-    replica_ids = set()
-    for g in group_assignment:
-      for i in g:
-        replica_ids.add(i)
-
-    if set(range(num_shards)) != replica_ids:
-      raise ValueError("group_assignment must be a permutation of range({0})."
-                       " Got group_assignment={1}".format(
-                           num_shards, group_assignment))
-
-    subgroup_size_list = [len(group) for group in group_assignment]
-    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
-      return subgroup_size_list[0]
-    else:
-      raise ValueError("The size of each subgroup in group_assignment must "
-                       "be equal. Got group_assignment={}".format(
-                           self._group_assignment))
-
-  def compute_gradients(self, loss, var_list=None, **kwargs):
-    """Compute gradients of "loss" for the variables in "var_list".
-
-    This simply wraps the compute_gradients() from the real optimizer. The
-    gradients will be aggregated in the apply_gradients() so that user can
-    modify the gradients like clipping with per replica global norm if needed.
-    The global norm with aggregated gradients can be bad as one replica's huge
-    gradients can hurt the gradients from other replicas.
-
-    Args:
-      loss: A Tensor containing the value to minimize.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
-        `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKey.TRAINABLE_VARIABLES`.
-      **kwargs: Keyword arguments for compute_gradients().
-
-    Returns:
-      A list of (gradient, variable) pairs.
-
-    Raises:
-      ValueError: If not within a tpu_shard_context or group_assignment is
-        invalid.
-    """
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "CrossShardOptimizer should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-
-    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
-                                                       num_shards)
-
-    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
-      if self._group_assignment:
-        scale = 1.0 / subgroup_size
-      else:
-        scale = 1.0 / num_shards
-      loss *= scale
-
-    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients to variables.
-
-    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
-    replicas, and then applies the real optimizer.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        compute_gradients().
-      global_step: Optional Variable to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the Optimizer constructor.
-
-    Returns:
-      An `Operation` that applies the gradients. If `global_step` was not None,
-      that operation also increments `global_step`.
-
-    Raises:
-      ValueError: If the grads_and_vars is malformed.
-    """
-    summed_grads_and_vars = []
-    for (grad, var) in grads_and_vars:
-      if grad is None:
-        summed_grads_and_vars.append((grad, var))
-      else:
-        with ops.colocate_with(grad):
-          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
-              grad, self._group_assignment), var))
-    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
-
-  def get_slot(self, *args, **kwargs):
-    """Return a slot named "name" created for "var" by the Optimizer.
-
-    This simply wraps the get_slot() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      The `Variable` for the slot if it was created, `None` otherwise.
-    """
-    return self._opt.get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    """Return a list of the names of slots created by the `Optimizer`.
-
-    This simply wraps the get_slot_names() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      A list of strings.
-    """
-    return self._opt.get_slot_names(*args, **kwargs)
-
-  def variables(self):
-    """Forwarding the variables from the underlying optimizer."""
-    return self._opt.variables()
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_optimizer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
index f5af03f33ca8f13af517007672e9ce0e12be6205..93c52335a582e5fa83092f78212ca268079b7c12 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -1,253 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for sharding during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.python.framework import tensor_shape
-
-_DEFAULT_NUMBER_OF_SHARDS = 1
-_DEFAULT_SHARD_DIMENSION = 0
-
-
-# TODO(b/36777903) change other parts of tpu.py to use this class.
-class ShardingPolicy(object):
-  """An object use to hold the sharding policy for a Tensor.
-  """
-
-  def __init__(self):
-    self._number_of_shards = None
-    self._shard_dimension = None
-    self._frozen = False
-
-  def __str__(self):
-    if self.number_of_shards is None or self.shard_dimension is None:
-      return "ShardingPolicy(unset)"
-    else:
-      return ("ShardingPolicy(%d shards dimension %d)" %
-              (self.number_of_shards, self.shard_dimension))
-
-  def _fill_default_values(self):
-    if self._number_of_shards is None:
-      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
-    if self._shard_dimension is None:
-      self._shard_dimension = tensor_shape.as_dimension(
-          _DEFAULT_SHARD_DIMENSION)
-
-  def freeze(self):
-    """Prevents further modification to the sharding policy.
-
-    Any values that have not been set when freeze is called are set to
-    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
-    """
-    if not self._frozen:
-      self._fill_default_values()
-      self._frozen = True
-
-  @property
-  def number_of_shards(self):
-    """Returns the number of shards in the policy or None if unspecified."""
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards for the current policy.
-
-    If the policy has been frozen then number_of_shards must match the
-    existing setting.
-
-    Args:
-      number_of_shards: The number of shards to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and number_of_shards
-        differs from the frozen value; or number_of_shards <= 0.
-    """
-    if self._frozen:
-      if self._number_of_shards != number_of_shards:
-        raise ValueError(
-            "Can't set sharding policy to use %d shards since it has been "
-            "frozen to use %d." % (number_of_shards, self._number_of_shards))
-    else:
-      if number_of_shards > 0:
-        self._number_of_shards = number_of_shards
-      else:
-        raise ValueError(
-            "Can't set sharding policy to use %s shards; value must be >0",
-            str(number_of_shards))
-
-  @property
-  def shard_dimension(self):
-    """Returns the shard dimension of the policy or None if unspecified."""
-    return self._shard_dimension
-
-  def set_shard_dimension(self, shard_dimension):
-    """Sets the shard dimension for the current policy.
-
-    If the policy has been frozen then shard_dimension must match the
-    existing setting.
-
-    Args:
-      shard_dimension: The shard dimension to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and shard_dimension
-        differs from the frozen value, or shard_dimension can't be
-        interpreted as a Dimension.
-    """
-    if self._frozen:
-      if self._shard_dimension != shard_dimension:
-        raise ValueError(
-            "Can't set shard dimension to %d since it has been frozen to "
-            "use %d." % (shard_dimension, self._shard_dimension))
-    else:
-      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
-
-  def merge(self, other):
-    """Merges the policy of another policy into the current policy.
-
-    Args:
-      other: The policy to merge into this one.
-
-    Raises:
-      ValueError: If this policy has been frozen and the merge conflicts with
-      the frozen policy.
-    """
-    if other.number_of_shards is not None:
-      self.set_number_of_shards(other.number_of_shards)
-    if other.shard_dimension is not None:
-      self.set_shard_dimension(other.shard_dimension)
-
-  def get_sharded_shape(self, shape, shard_index=None):
-    """Returns the shape of a shard of a full Tensor.
-
-    When given the shape of a 'full-size' Tensor, returns the shape of
-    the sub-Tensor after it has been sharded. Freezes the policy if it
-    has not yet been frozen.
-
-    Args:
-      shape: The shape of the full-size Tensor to be sharded.
-      shard_index: The index of the shard whose shape should be returned.
-        shard_index can be None for sharding policies that use the same
-        shape for every shard.
-      freeze_config:
-
-    Returns:
-      The shape of the sharded version of the Tensor.
-
-    Raises:
-      ValueError: If shard_index is None when shards are of different
-        shapes; or shard_index is not None and
-        !(0<=shard_index<number_of_shards); or shape does not have at
-        least self.shard_dimension+1 dimensions; or the value of
-        shape's shard dimension is not a multiple of
-        self.number_of_shards
-    """
-    if self._shard_dimension is None or self._number_of_shards is None:
-      # Don't raise an error if the config is unset.
-      return None
-    if shard_index is not None:
-      if shard_index < 0 or shard_index >= self.number_of_shards:
-        raise ValueError("shard_index %d, but must be in [0,%d)." %
-                         (shard_index, self._number_of_shards))
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    if dims[self._shard_dimension] is None:
-      raise ValueError("shape %s must have a fixed size for dimension %d "
-                       "that is known at graph construction time." %
-                       (shape.as_list(), self._shard_dimension))
-    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
-      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
-                       (shape.as_list(), self._number_of_shards,
-                        self._shard_dimension))
-    dims[self._shard_dimension] /= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def _unshard_shape(self, shape):
-    """Return the unsharded shape that would generate a given sharded shape.
-
-    Args:
-      shape: the sharded shape to unshard
-
-    Returns:
-      The unsharded shape.
-
-    Raises:
-      ValueError: if shape is unknown or does not contain
-        self.shard_dimension
-      TypeError: if shape is not convertible to a TensorShape
-    """
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    dims[self._shard_dimension] *= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def get_unsharded_shape(self, shapes):
-    """Returns the shape of an unsharded Tensor given a list of shards.
-
-    When given a list of shapes of shards, returns the shape of the
-    unsharded Tensor that would generate the shards. Sets defaults for the
-    policy if number_of_shards or shard_dimension is None.
-
-    Args:
-      shapes: The shapes of the Tensor shards to be combined.
-
-    Returns:
-      The shape of the unsharded version of the Tensor.
-
-    Raises:
-      ValueError: if shapes is not a list of length
-        self.number_of_shards; or any element of shapes is not a valid
-        shape consistent with the sharding policy; or the list of
-        shapes is not a valid sharding of a full shape.
-      TypeError: if an element of shapes is not convertible to a
-        TensorShape
-    """
-    self._fill_default_values()
-    if len(shapes) != self.number_of_shards:
-      raise ValueError(
-          "shapes is %s but must be a list of length number_of_shards=%d" % (
-              str(shapes), self.number_of_shards))
-    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
-    for i in xrange(self.number_of_shards - 1):
-      if not unsharded_shapes[i].is_compatible_with(
-          unsharded_shapes[self.number_of_shards - 1]):
-        raise ValueError(
-            "sharded shapes %s are not consistent shards of a full shape "
-            "sharded %d ways along dimension %d" % (
-                str(shapes), self.number_of_shards, self.shard_dimension))
-    return unsharded_shapes[0]
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_sharding import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e..258d34ddaf5250e49c5a354caf018e4b64abae62 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -1,156 +1,25 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import re
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
-_RETRY_TIMES = 120
-_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
-
-_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
-
-# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
-# including num_cores and num_hosts.
-_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
-    'num_cores',
-    'num_hosts',
-    'num_of_cores_per_host',
-    'topology',
-    'devices',
-])
-
-
-def _query_tpu_system_metadata(master_address, cluster_def=None,
-                               query_topology=False):
-  """Automatically detects the TPU system metadata in the system."""
-  tpu_core_count = 0
-  devices = []
-  device_dict = collections.defaultdict(list)
-
-  # TODO(b/120564445): Replace with standard library for retries.
-  retry_count = 1
-  while True:
-    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
-                 master_address)
-    try:
-      with ops.Graph().as_default():
-        with session_lib.Session(
-            master_address,
-            config=get_session_config_with_timeout(
-                _PINGING_MASTER_TIMEOUT_IN_MS,
-                cluster_def)) as sess:
-          devices = sess.list_devices()
-          for device in devices:
-            match = _TPU_DEVICE_REG.match(device.name)
-            if match:
-              host_id = match.group(1)
-              core_id = match.group(2)
-              device_dict[host_id].append(core_id)
-              tpu_core_count += 1
-          break
-    except errors.DeadlineExceededError:
-      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
-             'not be ready (still scheduling) or the Tensorflow master address '
-             'is incorrect: got (%s).' %
-             (master_address))
-
-      # TODO(xiejw): For local or grpc master we might not need retry logic
-      # here.
-      if retry_count <= _RETRY_TIMES:
-        logging.warning('%s', msg)
-        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
-        retry_count += 1
-      else:
-        raise ValueError(msg)
-
-  num_of_cores_per_host = 0
-  if tpu_core_count:
-    num_cores_per_host_set = set(
-        [len(core_ids) for core_ids in device_dict.values()])
-    if len(num_cores_per_host_set) != 1:
-      raise RuntimeError(
-          'TPU cores on each host is not same. This should not happen!. '
-          'devices: {}'.format(devices))
-    num_of_cores_per_host = num_cores_per_host_set.pop()
-
-  topology = None
-  if query_topology:
-    if not tpu_core_count:
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system (master address {}). '
-          'This usually means the master address is incorrect or the '
-          'TPU worker has some problems. Available devices: {}'.format(
-              master_address, devices))
-
-    topology = _obtain_topology(master_address, cluster_def)
-
-  metadata = _TPUSystemMetadata(
-      num_cores=tpu_core_count,
-      num_hosts=len(device_dict),
-      num_of_cores_per_host=num_of_cores_per_host,
-      topology=topology,
-      devices=devices)
-
-  if tpu_core_count:
-    logging.info('Found TPU system:')
-    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
-    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
-    logging.info('*** Num TPU Cores Per Worker: %d',
-                 metadata.num_of_cores_per_host)
-    for device in metadata.devices:
-      logging.info('*** Available Device: %s', device)
-  else:
-    logging.info('Failed to find TPU: %s', metadata)
-  return metadata
-
-
-def _obtain_topology(master_address, cluster_def):
-  """Obtains TPU fabric topology."""
-  try:
-    logging.info('Initializing TPU system (master: %s) to fetch topology '
-                 'for model parallelism. This might take a while.',
-                 master_address)
-    with ops.Graph().as_default():
-      session_config = get_session_config_with_timeout(
-          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
-      with session_lib.Session(
-          master_address, config=session_config) as sess:
-        topology = sess.run(tpu.initialize_system())
-        return topology
-  except errors.DeadlineExceededError:
-    raise ValueError(
-        'Fail to initialize TPU system with master (%s). '
-        'Please double check the TPU system is functional.' % (
-            master_address))
-
-
-def get_session_config_with_timeout(timeout_in_secs, cluster_def):
-  """Returns a session given a timeout and a cluster configuration."""
-  config = config_pb2.ConfigProto(
-      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
-  return config
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_system_metadata import *
+# used by tests
+from tensorflow.python.tpu.tpu_system_metadata import _query_tpu_system_metadata
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
deleted file mode 100644
index 6bdaa528f9f946ae4b9813d554409da2406b1f8d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Tests for tpu_function helpers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.layers import convolutional
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import math_ops
-
-from tensorflow.python.platform import test
-
-
-class TPUContextTest(test.TestCase):
-
-  def testIsInContext(self):
-    """Test that control_flow_util can check that we're in a TPU context."""
-    z1 = array_ops.identity(1)
-    pivot = control_flow_ops.no_op()
-    context = tpu.TPUReplicateContext(b"context", 1, pivot=pivot)
-    context.Enter()
-    z2 = array_ops.identity(1)
-    context.Exit()
-    self.assertFalse(control_flow_util.IsInXLAContext(z1.op))
-    self.assertTrue(control_flow_util.IsInXLAContext(z2.op))
-
-
-class TPULayerRewriteTest(test.TestCase):
-
-  def testUsingInfeedQueueWithRegularizer(self):
-    """Test that Layer regularizers can reference data created in loops."""
-
-    def make_regularizer(scale):
-      return lambda inputs: scale * math_ops.reduce_sum(math_ops.square(inputs))
-
-    def training_step(inputs, scale):
-      outputs = convolutional.conv2d(
-          inputs,
-          filters=16,
-          kernel_size=(3, 3),
-          data_format="channels_first",
-          kernel_regularizer=make_regularizer(scale))
-      loss = math_ops.reduce_mean(math_ops.square(outputs))
-      return loss.op
-
-    inputs = array_ops.zeros(shape=(128, 32, 32, 16))
-    scale = array_ops.ones(shape=())
-    infeed = tpu_feed.InfeedQueue(
-        tuple_types=[dtypes.float32, dtypes.float32],
-        tuple_shapes=[inputs.shape, scale.shape])
-
-    def loop():
-      return training_loop.repeat(5, training_step, infeed_queue=infeed)
-
-    # This should not throw an error.
-    tpu.rewrite(loop)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 0187b4bec6ecc55943bf48b9268a74e18ea5b488..673359b232d6857d468723873c449cb3e48168c7 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -1,214 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Library for constructing a training loop, suitable for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-
-
-def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop for TPUs.
-
-  The set of loop-carried tensors corresponds to `inputs`.  Both
-  `condition` and `body` take the current value of the loop-carried
-  tensors. 'body' additionally takes a tuple of infeed from
-  infeed_queue if infeed_queue is not None. `condition` must return a
-  single boolean value that determines whether iteration
-  continues. `body` must return an updated list of values for the
-  loop-carried tensors.
-
-  Args:
-    condition: a Python function that builds the loop condition.
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop, or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-
-  Returns:
-    The final values of the loop-carried tensors.
-
-  Raises:
-    TypeError: if body or condition has the wrong signature.
-  """
-  del name
-  # Converts inputs to Tensors.
-  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
-                                      x in inputs]
-  input_types = [x.dtype for x in inputs]
-  input_arity = len(inputs)
-
-  body_arg_error = xla.check_function_argument_count(
-      body, input_arity, infeed_queue)
-  if body_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
-              input_arity, str([i.name for i in inputs]), body_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s and %d additional inputs from "
-          "infeed, but the computation needs %s" % (input_arity, str(
-              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
-                                                    body_arg_error))
-  condition_arg_error = xla.check_function_argument_count(
-      condition, input_arity, None)
-  if condition_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
-                                  condition_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s. Note that infeed is not passed to the loop "
-          "condition." % (input_arity, str([i.name for i in inputs]),
-                          condition_arg_error))
-
-  def condition_wrapper(*inputs):
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-    return condition(*inputs)
-
-  def body_wrapper(*inputs):
-    """Wrapper around `body` that handles infeed queues and control deps."""
-    inputs = list(inputs)
-
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-
-    # Runs `body` with the dequeue_ops appended.
-    if infeed_queue:
-      number_of_shards = tpu_function.get_tpu_context().number_of_shards
-      if number_of_shards is None:
-        raise ValueError("Can't build training loop with infeed when there is "
-                         "no tpu_shard_context. Are you building a loop or "
-                         "graph directly rather than from inside tpu.rewrite, "
-                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
-      infeed_queue.set_number_of_shards(number_of_shards)
-      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
-    else:
-      dequeue_ops = []
-    outputs = body(*(inputs + dequeue_ops))
-
-    # If the computation only returned one value, make it a tuple.
-    if not isinstance(outputs, (list, tuple)):
-      outputs = (outputs,)
-
-    outputs = [
-        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-        for o in outputs
-    ]
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs
-                      if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          "TPU training loop body must return zero or more Tensor values "
-          "followed by zero or more Operations.")
-
-    output_types = [op.dtype for op in output_tensors]
-    if input_types != output_types:
-      raise TypeError(
-          "Mismatch between input types and output types for training loop "
-          "body: {} vs {}".format(input_types, output_types))
-
-    # Add the dequeue operations to output_operations to ensure they are run
-    # by the loop, even if the programmer's loop body does not use them.
-    output_operations += dequeue_ops
-
-    # Add a dummy output, if needed.
-    if not output_tensors:
-      output_tensors = array_ops.constant(0)
-
-    if output_operations:
-      # TODO(phawkins): in principle this is too restrictive since it serializes
-      # the training loop steps. In practice it does not matter since this loop
-      # will be compiled by XLA.
-      return control_flow_ops.tuple(output_tensors,
-                                    control_inputs=output_operations)
-    else:
-      return output_tensors
-
-  # If the body has arity 0, add a dummy loop-carried value to which we can add
-  # control dependencies from any side-effecting operations.
-  if input_arity == 0:
-    inputs = [array_ops.constant(0)]
-  return control_flow_ops.while_loop(
-      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
-
-
-def repeat(n, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop that executes a fixed number of iterations.
-
-  The set of loop-carried tensors correspond to `inputs`.
-  `body` must be a function that takes and returns the values of the
-  loop-carried tensors.
-
-  Args:
-    n: the number of loop iterations
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-  Returns:
-    The final values of the loop-carried tensors.
-  Raises:
-    ValueError: if there is a type error.
-  """
-  def _convert_to_list(xs):
-    if not isinstance(xs, (list, tuple)):
-      return [xs]
-    else:
-      return list(xs)
-
-  def cond(i, *args):
-    del args
-    return i < n
-
-  def body_wrapper(i, *args):
-    return [i + 1] + _convert_to_list(body(*args))
-
-  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
-  outputs = while_loop(
-      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
-  outputs = _convert_to_list(outputs)
-  if len(outputs) == 1:
-    # Returns the Op rather than an empty list.
-    return outputs[0].op
-  else:
-    return outputs[1:]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.training_loop import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py
index dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b..8d9b70d46eb42c9a525eeafc51d07f0ad4241d52 100644
--- a/tensorflow/contrib/tpu/python/tpu/util.py
+++ b/tensorflow/contrib/tpu/python/tpu/util.py
@@ -1,51 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Utilities for the functionalities."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-import six
-
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import training
-
-def check_positive_integer(value, name):
-  """Checks whether `value` is a positive integer."""
-  if not isinstance(value, six.integer_types):
-    raise TypeError('{} must be int, got {}'.format(name, type(value)))
-
-  if value <= 0:
-    raise ValueError('{} must be positive, got {}'.format(name, value))
-
-
-# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
-# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
-# python/estimator/util.py.
-class MultiHostDatasetInitializerHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes all passed iterators."""
-
-  def __init__(self, dataset_initializers):
-    self._initializers = dataset_initializers
-
-  def after_create_session(self, session, coord):
-    del coord
-    start = time.time()
-    session.run(self._initializers)
-    logging.info('Initialized dataset iterators in %d seconds',
-                 time.time() - start)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.util import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index f6427ae05a20f253edf030eff0f860361616042b..5bc4c3b88efd641b6f17a54753a29b0603c2b98c 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -264,9 +264,9 @@ py_test(
 
 py_test(
     name = "training_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/training/training_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py
index e7f23edc901eacfa3a753792c2dbf738bb5a9421..10f3f88f3eb877998f3498018863b4972ee45b07 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops.py
@@ -400,7 +400,7 @@ def bucket_by_sequence_length(input_length,
         math_ops.less_equal(buckets_min, input_length),
         math_ops.less(input_length, buckets_max))
     which_bucket = math_ops.reduce_min(array_ops.where(conditions_c))
-    which_bucket = math_ops.to_int32(which_bucket)
+    which_bucket = math_ops.cast(which_bucket, dtypes.int32)
 
     if shapes is not None:
       shapes = [tensor_shape.scalar()] + shapes
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 27f0d9b2e38c433d4fb4573285ecb8c9946112e8..cb0a25f333b2bba9c4eee991180eab2a083eeb31 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -353,8 +353,10 @@ class HParams(object):
   def my_program():
     # Create a HParams object specifying the names and values of the
     # model hyperparameters:
-    hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
-                         activations=['relu', 'tanh'])
+    hparams = tf.contrib.training.HParams(
+        learning_rate=0.1,
+        num_hidden_units=100,
+        activations=['relu', 'tanh'])
 
     # Override hyperparameters values by parsing the command line
     hparams.parse(args.hparams)
@@ -387,7 +389,7 @@ class HParams(object):
     # Define 3 hyperparameters: 'learning_rate' is a float parameter,
     # 'num_hidden_units' an integer parameter, and 'activation' a string
     # parameter.
-    hparams = tf.HParams(
+    hparams = tf.contrib.training.HParams(
         learning_rate=0.1, num_hidden_units=100, activation='relu')
 
     hparams.activation ==> 'relu'
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 53e4f23a7cd940c026e462dc7fb55cf9f175bf02..ce3d5ec1de569a61f4b58a4522acae1be1fc59ee 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1597,7 +1597,7 @@ def _padding(sequences, num_unroll):
   else:  # Only have SparseTensors
     sparse_lengths = [value.dense_shape[0] for value in sequences_dict.values()
                       if isinstance(value, sparse_tensor.SparseTensor)]
-    length = math_ops.reduce_max(math_ops.to_int32(sparse_lengths))
+    length = math_ops.reduce_max(math_ops.cast(sparse_lengths, dtypes.int32))
 
   unroll = array_ops.constant(num_unroll)
   padded_length = length + ((unroll - (length % unroll)) % unroll)
@@ -1620,8 +1620,9 @@ def _padding(sequences, num_unroll):
       # 3. concat values with paddings
       padded_sequences[key] = array_ops.concat([value, paddings], 0)
     else:
-      padded_shape = array_ops.concat([[math_ops.to_int64(padded_length)],
-                                       value.dense_shape[1:]], 0)
+      padded_shape = array_ops.concat(
+          [[math_ops.cast(padded_length, dtypes.int64)], value.dense_shape[1:]],
+          0)
       padded_sequences[key] = sparse_tensor.SparseTensor(
           indices=value.indices,
           values=value.values,
@@ -1834,8 +1835,8 @@ def _reconstruct_sparse_tensor_seq(sequence,
     Returns:
       A SparseTensor with a +1 higher rank than the input.
     """
-    idx_batch = math_ops.to_int64(
-        math_ops.floor(sp_tensor.indices[:, 0] / num_unroll))
+    idx_batch = math_ops.cast(
+        math_ops.floor(sp_tensor.indices[:, 0] / num_unroll), dtypes.int64)
     idx_time = math_ops.mod(sp_tensor.indices[:, 0], num_unroll)
     indices = array_ops.concat(
         [
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index fc6e38ab4a5243cb7502f4ca42db03cbfd342a40..4ceb6e9350f5167efc8f7266d4e748cc6fa4ffd6 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -244,7 +244,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -354,11 +353,11 @@ def multiply_gradients(grads_and_vars, gradient_multipliers):
         raise ValueError('Requested multiple of `None` gradient.')
 
       if isinstance(grad, ops.IndexedSlices):
-        tmp = grad.values * constant_op.constant(
+        tmp = grad.values * ops.convert_to_tensor(
             gradient_multipliers[key], dtype=grad.dtype)
         grad = ops.IndexedSlices(tmp, grad.indices, grad.dense_shape)
       else:
-        grad *= constant_op.constant(
+        grad *= ops.convert_to_tensor(
             gradient_multipliers[key], dtype=grad.dtype)
     multiplied_grads_and_vars.append((grad, var))
   return multiplied_grads_and_vars
@@ -433,7 +432,7 @@ def create_train_op(total_loss,
   else:
     # Make sure that variables_to_train are in tf.trainable_variables()
     for v in variables_to_train:
-      assert v in tf_variables.trainable_variables()
+      assert v.trainable or v in tf_variables.trainable_variables()
 
   assert variables_to_train
 
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f..7b2bc30e3a85ed890e3c66ceeb448cbeb61e86d3 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -16,10 +16,15 @@ cc_library(
     srcs = ["convert_graphdef_memmapped_format_lib.cc"],
     hdrs = ["convert_graphdef_memmapped_format_lib.h"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/kernels:immutable_constant_op",
     ],
@@ -39,7 +44,7 @@ tf_cc_test(
     name = "convert_graphdef_memmapped_format_test",
     srcs = ["convert_graphdef_memmapped_format_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     deps = [
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h
index 444c863b942ef8bce8d54d59765563b12eb6087e..e616778665a9c95b30099b128ec5d1e181ba0618 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.h
@@ -25,12 +25,6 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
 #include "tensorflow/core/lib/core/refcount.h"
 
-namespace grpc {
-class ServerBuilder;
-class ServerCompletionQueue;
-class Alarm;
-}  // namespace grpc
-
 namespace tensorflow {
 
 class GrpcVerbsService : public AsyncServiceInterface {
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index cfb9b7ddd7d88c150e47caff66f0865fcaec662c..2432c34ae2353d5d7bca03d80a043b5875ef8cce 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -27,14 +27,6 @@ limitations under the License.
 
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
 
-namespace grpc {
-class CompletionQueue;
-class Channel;
-class RpcService;
-class ServerCompletionQueue;
-class ServerContext;
-}  // namespace grpc
-
 namespace tensorflow {
 
 namespace grpc {
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 9db80f6b5736d849d88e1e41ea467a5ff11844f5..b4b6b705f4bab74ac9579ea0354bb7306f06b312 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -1086,7 +1086,7 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
       // The tensor must be copied from GPU to CPU, because either:
       // 1. The tensor is located on a non GDR compatible GPU.
       // 2. The tensor's meta-data has changed.
-      Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
+      Allocator* alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0);
       copy = Tensor(alloc, in.dtype(), in.shape());
       CountCopies(rm_.name_, (void*)DMAHelper::base(&in),
                   (void*)DMAHelper::base(&copy), in.TotalBytes(), true);
@@ -1543,7 +1543,7 @@ bool RdmaTensorRequest::AllocateTensors() {
     if (mr_ == nullptr) {
       // Can't RDMA directly to result. Use a proxy.
       proxy_tensor_ =
-          new Tensor(GPUProcessState::singleton()->GetCUDAHostAllocator(0),
+          new Tensor(GPUProcessState::singleton()->GetGpuHostAllocator(0),
                      result_tensor_->dtype(), result_tensor_->shape());
       rdma_addr_ = DMAHelper::base(proxy_tensor_);
       mr_ =
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 2f2375427862ad1e99a0e6bfc506382d200e9b1d..5ac9f46447c518c342b565b2b32bac56ead7be5e 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -277,8 +277,8 @@ void RdmaMgr::InitAllocators() {
   ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
 
 #if GOOGLE_CUDA
-  GPUProcessState::singleton()->AddCUDAHostAllocVisitor(0, alloc_visitor);
-  GPUProcessState::singleton()->AddCUDAHostFreeVisitor(0, free_visitor);
+  GPUProcessState::singleton()->AddGpuHostAllocVisitor(0, alloc_visitor);
+  GPUProcessState::singleton()->AddGpuHostFreeVisitor(0, free_visitor);
 
   if (IsGDRAvailable()) {
     // Note we don't free allocated GPU memory so there is no free visitor
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 19ef109f671ee57ce2aceb55110c50aa44352223..d07fd5ae6e9cc0dbf67c6b6a4e8db086b4c74aa1 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -81,7 +81,10 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
-  Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
+  GrpcServerOptions opts;
+  opts.service_func = service_func;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  Status s = GrpcServer::Init(opts);
   {
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 084db7a0fd470574311501adb5bab8b5f7e19dab..9ff4b631634fe90e968cc6ff2bc4c2156deda1bb 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -63,7 +63,15 @@
 # //tensorflow/tensorflow.bzl) will include the necessary symbols in binary
 # build targets.
 
+package_group(
+    name = "dependency_whitelist",
+    packages = [
+        "//learning/freud/topic_models/tensorflow/...",
+    ],
+)
+
 package(default_visibility = [
+    ":dependency_whitelist",
     "//tensorflow:internal",
     "//tensorflow_models:__subpackages__",
 ])
@@ -77,6 +85,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
     "if_android",
+    "if_emscripten",
     "if_ios",
     "if_linux_x86_64",
     "if_mobile",
@@ -87,10 +96,12 @@ load(
     "tf_copts",
     "tf_cuda_library",
     "tf_features_nomodules_if_android",
+    "tf_features_nomodules_if_emscripten",
     "tf_gen_op_libs",
     "tf_generate_proto_text_sources",
     "tf_genrule_cmd_append_to_srcs",
     "tf_opts_nortti_if_android",
+    "tf_opts_nortti_if_emscripten",
     "transitive_hdrs",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
@@ -125,7 +136,9 @@ load(
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
-    "tf_additional_proto_compiler_hdrs",
+    "tf_additional_numa_deps",
+    "tf_additional_numa_lib_defines",
+    "tf_additional_numa_copts",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
@@ -144,6 +157,7 @@ load(
     "tf_protos_grappler",
     "tf_protos_grappler_impl",
     "tf_pyclif_proto_library",
+    "tf_grpc_service_all",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -201,10 +215,12 @@ COMMON_PROTO_SRCS = [
     "protobuf/cluster.proto",
     "protobuf/debug.proto",
     "protobuf/device_properties.proto",
+    "protobuf/graph_debug_info.proto",
     "protobuf/queue_runner.proto",
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
     "protobuf/saver.proto",
+    "protobuf/verifier_config.proto",
     "util/event.proto",
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
@@ -224,13 +240,15 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
-    "protobuf/checkpointable_object_graph.proto",
+    "protobuf/trackable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
     "protobuf/meta_graph.proto",
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
+    "protobuf/saved_object_graph.proto",
+    "protobuf/struct.proto",
     "protobuf/tensorflow_server.proto",
     "protobuf/transport_options.proto",
     "util/test_log.proto",
@@ -381,15 +399,15 @@ cc_library(
         ":platform_port_hdrs",
         ":platform_port_internal_hdrs",
     ],
-    copts = tf_copts(),
+    copts = tf_copts() + tf_additional_numa_copts(),
     visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
         ":lib_platform",
         ":platform_base",
-        "//tensorflow/core/platform/default/build_config:port",
         "@com_google_absl//absl/base",
+        "//tensorflow/core/platform/default/build_config:port",
         "@snappy",
-    ],
+    ] + tf_additional_numa_deps(),
 )
 
 filegroup(
@@ -413,9 +431,8 @@ cc_library(
     name = "platform_protobuf",
     srcs = tf_platform_hdrs([
         "protobuf.h",
-    ]) + tf_platform_srcs([
-        "protobuf.cc",
     ]) + [
+        "platform/protobuf.cc",
         "platform/protobuf_util.cc",
         "lib/core/status.h",
     ],
@@ -434,6 +451,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_services",
+    srcs = [],
+    hdrs = [
+        "platform/grpc_services.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = tf_grpc_service_all(),
+)
+
 cc_library(
     name = "human_readable_json",
     srcs = tf_platform_srcs(["human_readable_json.cc"]),
@@ -452,10 +480,7 @@ cc_library(
     hdrs = ["platform/logger.h"],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
-    deps = [
-        ":lib_proto_parsing",
-        "@protobuf_archive//:protobuf",
-    ],
+    deps = [":lib_proto_parsing"],
 )
 
 filegroup(
@@ -506,6 +531,7 @@ cc_library(
         ":platform_port",
         ":platform_protobuf",
         "//tensorflow/core/platform/default/build_config:env",
+        "//tensorflow/core/platform/default/build_config:port",
     ],
 )
 
@@ -661,7 +687,7 @@ cc_library(
     name = "lib_proto_compiler",
     hdrs = [
         "platform/protobuf_compiler.h",
-    ] + tf_additional_proto_compiler_hdrs(),
+    ],
     copts = tf_copts(),
     deps = tf_lib_proto_compiler_deps() + [
         ":lib_proto_parsing",
@@ -907,6 +933,7 @@ tf_cuda_library(
         "framework/tensor_slice.h",
         "framework/tensor_types.h",
         "framework/tensor_util.h",
+        "framework/thread_factory.h",
         "framework/tracking_allocator.h",
         "framework/type_index.h",
         "framework/type_traits.h",
@@ -953,7 +980,10 @@ tf_cuda_library(
         "util/mkl_util.h",
     ]),
     visibility = ["//visibility:public"],
-    deps = [":framework_internal"],
+    deps = [
+        ":framework_internal",
+        "@com_google_absl//absl/base",
+    ],
 )
 
 cc_library(
@@ -1046,13 +1076,13 @@ cc_library(
         "platform/default/integral_types.h",
         "platform/default/logging.h",
         "platform/default/mutex.h",
-        "platform/default/protobuf.h",
         "platform/default/thread_annotations.h",
         "platform/dynamic_annotations.h",
         "platform/macros.h",
         "platform/mutex.h",
         "platform/platform.h",
         "platform/prefetch.h",
+        "platform/protobuf.h",
         "platform/thread_annotations.h",
         "platform/types.h",
         "platform/cpu_info.h",
@@ -1138,6 +1168,13 @@ tf_gen_op_libs(
     deps = [":protos_all_cc"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "mkl_array_ops",
+    ],
+    deps = [":protos_all_cc"],
+)
+
 tf_gen_op_libs(
     op_lib_names = [
         "audio_ops",
@@ -1158,6 +1195,29 @@ tf_gen_op_libs(
     deps = [":lib"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    deps = [
+        ":lib",
+        ":lib_proto_parsing",
+        ":protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils",
+        "//tensorflow/core/tpu:tpu_embedding_output_layout_utils",
+    ],
+)
+
 # And one for all user ops
 cc_library(
     name = "user_ops_op_lib",
@@ -1274,10 +1334,23 @@ cc_library(
         ":state_ops_op_lib",
         ":stateless_random_ops_op_lib",
         ":string_ops_op_lib",
+        ":tpu_configuration_ops_op_lib",
+        ":tpu_cross_replica_ops_op_lib",
+        ":tpu_embedding_ops_op_lib",
+        ":tpu_functional_ops_op_lib",
+        ":tpu_heartbeat_ops_op_lib",
+        ":tpu_host_compute_ops_op_lib",
+        ":tpu_infeed_ops_op_lib",
+        ":tpu_outfeed_ops_op_lib",
+        ":tpu_ordinal_selector_ops_op_lib",
+        ":tpu_replication_ops_op_lib",
         ":training_ops_op_lib",
         ":user_ops_op_lib",
         ":word2vec_ops",
-    ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(),
+    ] + if_mkl([
+        ":mkl_array_ops_op_lib",
+        ":mkl_nn_ops_op_lib",
+    ]) + tf_additional_cloud_op_deps(),
     alwayslink = 1,
 )
 
@@ -1379,8 +1452,9 @@ cc_library(
 # This includes implementations of all kernels built into TensorFlow.
 cc_library(
     name = "all_kernels_impl",
-    visibility = ["//visibility:private"],
+    visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
+        "//tensorflow/c/kernels:bitcast_op",
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:audio",
         "//tensorflow/core/kernels:batch_kernels",
@@ -1449,6 +1523,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_identity_op",
         "//tensorflow/core/kernels:mkl_input_conversion_op",
         "//tensorflow/core/kernels:mkl_lrn_op",
+        "//tensorflow/core/kernels:mkl_requantize_ops",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
@@ -1522,6 +1597,7 @@ cc_library(
         "framework/function_testlib.h",
         "framework/shape_inference_testutil.h",
         "framework/tensor_testutil.h",
+        "graph/benchmark_testlib.h",
         "graph/testlib.h",
         # TODO(josh11b): Drop this once users are depending on
         # kernels:ops_testutil instead.
@@ -1537,6 +1613,7 @@ cc_library(
         ":framework_internal",
         ":lib",
         ":lib_internal",
+        ":ops",
         ":protos_all_cc",
         ":shape_inference_testutil",
         ":tensor_testutil",
@@ -1561,8 +1638,8 @@ cc_library(
     srcs = ["common_runtime/testlib_ops.cc"],
     linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
     deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
+        ":framework",
+        ":lib",
     ],
     alwayslink = 1,
 )
@@ -1637,6 +1714,7 @@ filegroup(
             "platform/**/logger.cc",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
+            "platform/rocm.h",
             "platform/google/**/*",
             "platform/hadoop/**/*",
             "platform/gif.h",
@@ -1663,6 +1741,7 @@ filegroup(
         "//tensorflow/core/kernels:android_srcs",
         "//tensorflow/core/util/ctc:android_srcs",
         "//tensorflow/core/util/tensor_bundle:android_srcs",
+        "//tensorflow/c:srcs",
     ] + glob(
         [
             "common_runtime/**/*.h",
@@ -1767,6 +1846,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    srcs = if_emscripten(["//tensorflow/core:mobile_srcs_no_runtime"]),
+    copts = ["-DSUPPORT_SELECTIVE_REGISTRATION"] + tf_opts_nortti_if_emscripten(),
+    defines = ["TENSORFLOW_LITE_PROTOS"],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":emscripten_proto_lib_no_rtti_lite_runtime",
+        ":mobile_additional_lib_deps",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@zlib_archive//:zlib",
+    ],
+    alwayslink = 1,
+)
+
 # Native library support for iOS applications.
 #
 # bazel  build --config=ios_x86_64 \
@@ -1860,6 +1962,7 @@ filegroup(
             "**/*testutil*",
             "**/*testlib*",
             "**/*main.cc",
+            "**/tpu_*",
         ],
     ),
     visibility = ["//visibility:public"],
@@ -1962,6 +2065,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "rocm",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:rocm",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Clif-related proto libraries.
 
@@ -2066,6 +2177,29 @@ tf_pyclif_proto_library(
 # -----------------------------------------------------------------------------
 # Internal targets
 
+tf_proto_library(
+    name = "autotuning_proto",
+    srcs = ["protobuf/autotuning.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    provide_cc_alias = True,
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+tf_proto_library(
+    name = "conv_autotuning_proto",
+    srcs = ["protobuf/conv_autotuning.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    protodeps = tf_additional_all_protos(),
+    provide_cc_alias = True,
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
 tf_proto_library_cc(
     name = "worker_proto",
     srcs = ["protobuf/worker.proto"],
@@ -2135,6 +2269,7 @@ LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob(
         "platform/jpeg.h",
         "platform/png.h",
         "platform/**/cuda.h",
+        "platform/**/rocm.h",
         "platform/**/stream_executor.h",
     ],
 )
@@ -2183,11 +2318,14 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
 ]
 
 # Replicated for lib_internal and lib_internal_impl.
-LIB_INTERNAL_DEFINES = (tf_additional_lib_defines() + [
-                            "TF_USE_SNAPPY",
-                        ] + tf_additional_verbs_lib_defines() +
-                        tf_additional_mpi_lib_defines() +
-                        tf_additional_gdr_lib_defines())
+LIB_INTERNAL_DEFINES = (
+    tf_additional_lib_defines() + [
+        "TF_USE_SNAPPY",
+    ] + tf_additional_verbs_lib_defines() +
+    tf_additional_mpi_lib_defines() +
+    tf_additional_gdr_lib_defines() +
+    tf_additional_numa_lib_defines()
+)
 
 cc_library(
     name = "lib_internal",
@@ -2220,7 +2358,6 @@ cc_library(
             "lib/**/*.cc",
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
-        ] + [
             "framework/resource_handle.cc",
             "util/env_var.cc",
         ],
@@ -2238,12 +2375,14 @@ cc_library(
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
+            "platform/protobuf.cc",
         ],
     ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
+            "platform/**/rocm.h",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
@@ -2260,17 +2399,20 @@ cc_library(
     copts = tf_copts(),
     defines = LIB_INTERNAL_DEFINES,
     deps = tf_additional_lib_deps() + [
-        ":lib_hash_crc32c_accelerate_internal",
-        ":lib_proto_parsing",
-        ":abi",
-        ":core_stringpiece",
-        "//third_party/eigen3",
-        "//tensorflow/core/platform/default/build_config:platformlib",
-        "@snappy",
-        "@zlib_archive//:zlib",
-        "@double_conversion//:double-conversion",
-        "@protobuf_archive//:protobuf",
-    ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
+               ":lib_hash_crc32c_accelerate_internal",
+               ":lib_proto_parsing",
+               ":abi",
+               ":core_stringpiece",
+               "@com_google_absl//absl/memory",
+               "@com_google_absl//absl/strings",
+               "//third_party/eigen3",
+               "//tensorflow/core/platform/default/build_config:platformlib",
+               "@snappy",
+               "@zlib_archive//:zlib",
+               "@double_conversion//:double-conversion",
+               "@protobuf_archive//:protobuf",
+           ] + tf_protos_all_impl() + tf_protos_grappler_impl() +
+           tf_additional_numa_deps(),
 )
 
 # File compiled with extra flags to get cpu-specific acceleration.
@@ -2360,12 +2502,6 @@ cc_library(
 
 cc_library(
     name = "tflite_portable_logging",
-    srcs = [
-    ] + if_ios([
-        "platform/default/logging.cc",
-        "platform/env_time.cc",
-        "platform/posix/env_time.cc",
-    ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
         "platform/default/integral_types.h",
@@ -2374,10 +2510,11 @@ cc_library(
         "platform/macros.h",
         "platform/platform.h",
         "platform/types.h",
-    ] + if_windows(["platform/windows/integral_types.h"]) + if_ios(["platform/env_time.h"]),
+    ],
     copts = tf_copts(),
     linkopts = ["-ldl"],
     deps = [
+        ":platform_base",
         "//tensorflow/core/platform/default/build_config:logging",
     ],
 )
@@ -2643,7 +2780,6 @@ tf_cuda_library(
             "example/**/*.cc",
             "framework/**/*.cc",
             "util/**/*.cc",
-        ] + [
             "graph/edgeset.cc",
             "graph/graph.cc",
             "graph/graph_def_builder.cc",
@@ -2695,6 +2831,7 @@ tf_cuda_library(
         ":stats_calculator_portable",
         ":version_lib",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "//tensorflow/core/platform/default/build_config:platformlib",
@@ -2741,6 +2878,7 @@ tf_cuda_library(
     srcs = ["platform/stream_executor.h"],
     hdrs = [
         "platform/cuda.h",
+        "platform/rocm.h",
         "platform/stream_executor.h",
     ],
     deps = [
@@ -2831,6 +2969,7 @@ tf_cuda_library(
         "//third_party/eigen3",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2887,6 +3026,7 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/shared_counter.h",
     "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
     "common_runtime/hierarchical_tree_broadcaster.h",
@@ -2911,6 +3051,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/lower_if_while.h",
     "common_runtime/lower_while_op.h",
     "common_runtime/memory_types.h",
+    "common_runtime/metrics.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
@@ -2922,6 +3063,8 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
     "common_runtime/ring_reducer.h",
+    "common_runtime/ring_alg.h",
+    "common_runtime/ring_gatherer.h",
     "common_runtime/session_factory.h",
     "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
@@ -2946,6 +3089,8 @@ tf_cuda_library(
         "common_runtime/collective_param_resolver_local.cc",
         "common_runtime/collective_rma_local.cc",
         "common_runtime/collective_util.cc",
+        "common_runtime/colocation_graph.cc",
+        "common_runtime/colocation_graph.h",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2966,6 +3111,7 @@ tf_cuda_library(
         "common_runtime/lower_if_while.cc",
         "common_runtime/lower_while_op.cc",
         "common_runtime/memory_types.cc",
+        "common_runtime/metrics.cc",
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
@@ -2978,11 +3124,14 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_alg.cc",
+        "common_runtime/ring_gatherer.cc",
         "common_runtime/ring_reducer.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
         "common_runtime/session_state.cc",
+        "common_runtime/single_threaded_cpu_device.cc",
         "common_runtime/stats_publisher_interface.cc",
         "common_runtime/step_stats_collector.cc",
         "common_runtime/threadpool_device.cc",
@@ -3005,6 +3154,7 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
@@ -3066,15 +3216,6 @@ cc_library(
     deps = [":lib_internal"],
 )
 
-tf_cuda_library(
-    name = "metrics",
-    srcs = ["common_runtime/metrics.cc"],
-    hdrs = ["common_runtime/metrics.h"],
-    deps = [
-        ":lib",
-    ],
-)
-
 tf_cuda_library(
     name = "direct_session_internal",
     srcs = ["common_runtime/direct_session.cc"],
@@ -3091,7 +3232,6 @@ tf_cuda_library(
         ":graph",
         ":lib",
         ":lib_internal",
-        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/debug:debug_graph_utils",
@@ -3175,7 +3315,7 @@ cc_library(
 )
 
 GPU_RUNTIME_HEADERS = [
-    "common_runtime/gpu/cuda_host_allocator.h",
+    "common_runtime/gpu/gpu_host_allocator.h",
     "common_runtime/gpu/gpu_bfc_allocator.h",
     "common_runtime/gpu/gpu_cudamalloc_allocator.h",
     "common_runtime/gpu/gpu_debug_allocator.h",
@@ -3458,6 +3598,7 @@ tf_cc_tests(
         "platform/vmodule_benchmark_test.cc",
     ],
     deps = [
+        ":core_cpu_internal",
         ":lib",
         ":lib_internal",
         ":lib_test_internal",
@@ -3502,6 +3643,7 @@ tf_cc_test(
     name = "platform_strings_test",
     size = "small",
     srcs = ["platform/platform_strings_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
     deps = [
         ":lib",
         ":platform_strings",
@@ -3667,6 +3809,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "lib_strings_proto_serialization_test",
+    srcs = ["lib/strings/proto_serialization_test.cc"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_test(
     name = "lib_random_weighted_picker_test",
     size = "medium",
@@ -3719,7 +3875,6 @@ tf_cc_tests(
     srcs = [
         "common_runtime/buf_rendezvous_test.cc",
         "common_runtime/collective_executor_mgr_test.cc",
-        "common_runtime/collective_param_resolver_local_test.cc",
         "common_runtime/collective_rma_local_test.cc",
         "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
@@ -3799,7 +3954,7 @@ tf_cc_tests(
         "util/work_sharder_test.cc",
     ],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     linkstatic = tf_kernel_tests_linkstatic(),
@@ -3835,10 +3990,11 @@ tf_cc_tests(
     name = "higher_level_tests_needing_kernels",
     size = "small",
     srcs = [
+        "common_runtime/collective_param_resolver_local_test.cc",
         "graph/graph_constructor_test.cc",
     ],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     linkstatic = tf_kernel_tests_linkstatic(),
@@ -3874,7 +4030,6 @@ tf_cc_test(
         "ops/cudnn_rnn_ops_test.cc",
     ],
     deps = [
-        ":cudnn_rnn_ops",
         "//tensorflow/core",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3934,6 +4089,35 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "ring_gatherer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_gatherer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "hierarchical_tree_broadcaster_test",
     size = "medium",
@@ -4472,7 +4656,7 @@ tf_cc_test(
         "//tensorflow/cc:scope",
         "//tensorflow/core/kernels:cwise_op",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl([":mkl_array_ops_op_lib"]),
 )
 
 tf_cc_test(
@@ -4885,6 +5069,7 @@ tf_cc_tests(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
     ],
 )
 
@@ -4908,6 +5093,7 @@ tf_cc_tests(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -5025,6 +5211,39 @@ transitive_hdrs(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
+load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library")
+
+genrule(
+    name = "emscripten_proto_config_lite_runtime",
+    outs = ["emscripten_proto_config_lite_runtime.asciipb"],
+    cmd = tf_genrule_cmd_append_to_srcs("optimize_mode:LITE_RUNTIME"),
+    visibility = ["//visibility:private"],
+)
+
+# We are keeping the "android" version of tf_android_core_proto_headers. All it does is
+# normalize CORE_PROTO_SRCS to generate valid output file names.
+tf_portable_proto_library(
+    name = "emscripten_proto_lib_no_rtti_lite_runtime",
+    config = ":emscripten_proto_config_lite_runtime",
+    copts = tf_opts_nortti_if_emscripten(),
+    features = tf_features_nomodules_if_emscripten(),
+    header_outs = tf_android_core_proto_headers(CORE_PROTO_SRCS) + ["//google/protobuf/any.proto.h"],
+    link_full_protobuf = False,
+    prefix_dir = "emscripten_proto_no_rtti",
+    proto_deps = [
+        ":protos_all_cc",
+        "@protobuf_archive//:protobuf",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# There is currently no need for a full proto version of emscripten tf lib lite.
+alias(
+    name = "emscripten_lib_lite_no_runtime",
+    actual = "//tensorflow/core:emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    visibility = ["//visibility:public"],
+)
+
 alias(
     name = "android_srcs_no_runtime",
     actual = ":mobile_srcs_no_runtime",
diff --git a/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e585bae4a373c6d5afe217b74acd37caa0262023
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "AllToAll"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The exchanged result.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  attr {
+    name: "concat_dimension"
+    description: <<END
+The dimension number to concatenate.
+END
+  }
+  attr {
+    name: "split_dimension"
+    description: <<END
+The dimension number to split.
+END
+  }
+  attr {
+    name: "split_count"
+    description: <<END
+The number of splits, this number must equal to the sub-group
+size(group_assignment.get_shape()[1])
+END
+  }
+  summary: "An Op to exchange data across TPU replicas."
+  description: <<END
+On each replica, the input is split into `split_count` blocks along
+`split_dimension` and send to the other replicas given group_assignment. After
+receiving `split_count` - 1 blocks from other replicas, we concatenate the
+blocks along `concat_dimension` as the output.
+
+For example, suppose there are 2 TPU replicas:
+replica 0 receives input: `[[A, B]]`
+replica 1 receives input: `[[C, D]]`
+
+group_assignment=`[[0, 1]]`
+concat_dimension=0
+split_dimension=1
+split_count=2
+
+replica 0's output: `[[A], [C]]`
+replica 1's output: `[[B], [D]]`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Case.pbtxt b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56fef3ae6d0d452cb2caa57c36f35a04584864ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "Case"
+  in_arg {
+    name: "branch_index"
+    description: "The branch selector, an int32 Tensor."
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors passed to the branch function."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "branches"
+    description: <<END
+      A list of functions each of which takes 'inputs' and returns a list of
+      tensors, whose types are the same as what every other branch returns.
+END
+  }
+  summary: "An n-way switch statement which calls a single branch function."
+  description: <<END
+    An n-way switch statement, implementing the following:
+    ```
+    switch (branch_index) {
+      case 0:
+        output = branches[0](input);
+        break;
+      case 1:
+        output = branches[1](input);
+        break;
+      ...
+      case [[nbranches-1]]:
+      default:
+        output = branches[nbranches-1](input);
+        break;
+    }
+    ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ChooseFastestBranchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ChooseFastestBranchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6beea104b40bc571e307db244d628b21804667a7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ChooseFastestBranchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ChooseFastestBranchDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cd833b9455458511787bec71d45531810574eb9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveGather"
+  summary: "Mutually accumulates multiple tensors of identical type and shape."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09062200471127cd5ed299e27afd0631a5d6f902
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "CollectivePermute"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to be permuted. Currently only supports float and
+bfloat16.
+END
+  }
+  in_arg {
+    name: "source_target_pairs"
+    description: <<END
+A tensor with shape [num_pairs, 2].
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The permuted input.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  summary: "An Op to permute tensors across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+`[D, A, B, C]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt b/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50e72a2446a9b4c304e23566fd1b3bbb974fb865
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt
@@ -0,0 +1,101 @@
+op {
+  graph_op_name: "CombinedNonMaxSuppression"
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then 
+same boxes are used for all classes otherwise, if `q` is equal to number of 
+classes, class-specific boxes are used.
+END
+  }
+  in_arg {
+    name: "scores"
+    description: <<END
+A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
+representing a single score corresponding to each box (each row of boxes).
+END
+  }
+  in_arg {
+    name: "max_output_size_per_class"
+    description: <<END
+A scalar integer tensor representing the maximum number of 
+boxes to be selected by non max suppression per class
+END
+  }
+  in_arg {
+    name: "max_total_size"
+    description: <<END
+A scalar representing maximum number of boxes retained over all classes.
+END
+  }
+  in_arg {
+    name: "iou_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding whether
+boxes overlap too much with respect to IOU.
+END
+  }
+  in_arg {
+    name: "score_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding when to remove
+boxes based on score.
+END
+  }
+  attr {
+    name: "pad_per_class"
+    description: <<END
+If false, the output nmsed boxes, scores and classes
+are padded/clipped to `max_total_size`. If true, the
+output nmsed boxes, scores and classes are padded to be of length
+`max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+which case it is clipped to `max_total_size`. Defaults to false.
+END
+  }
+  out_arg {
+    name: "nmsed_boxes"
+    description: <<END
+A [batch_size, max_detections, 4] float32 tensor 
+containing the non-max suppressed boxes.
+END
+  }
+  out_arg {
+    name: "nmsed_scores"
+    description: <<END
+A [batch_size, max_detections] float32 tensor 
+containing the scores for the boxes.
+END
+  }
+  out_arg {
+    name: "nmsed_classes"
+    description: <<END
+A [batch_size, max_detections] float32 tensor 
+containing the classes for the boxes.
+END
+  }
+  out_arg {
+    name: "valid_detections"
+    description: <<END
+A [batch_size] int32 tensor indicating the number of
+valid detections per batch item. Only the top num_detections[i] entries in
+nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+entries are zero paddings.
+END
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: <<END
+This operation performs non_max_suppression on the inputs per batch, across
+all classes.
+Prunes away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system. Also note that
+this algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is the final boxes, scores and classes tensor
+returned after performing non_max_suppression.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3a5e8c6eef7393f29273b524da233db8c5769ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "ConfigureDistributedTPU"
+  visibility: HIDDEN
+  out_arg {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "embedding_config"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  attr {
+    name: "tpu_embedding_config"
+    description: <<END
+Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+describes the embedding lookups of the program.
+END
+  }
+  attr {
+    name: "is_global_init"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  summary: "Sets up the centralized structures for a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ab74bb59b8d28e50f6be73673259d1d73ef8c2e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "CrossReplicaSum"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The sum of all the distributed inputs.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be summed.
+END
+  }
+  summary: "An Op to sum inputs across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+and `B, D, F, H` as group 1. Thus we get the outputs:
+`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt
index 7967ca7c5d17abd6451f0cd05c8154c3eaf4766b..03dc530fc589130e9e3d1c7ff69948e0fddaa02a 100644
--- a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt
@@ -16,9 +16,12 @@ direction: Indicates whether a bidirectional model will be used. Should be
 dropout: Dropout probability. When set to 0., dropout is disabled.
 seed: The 1st part of a seed to initialize dropout.
 seed2: The 2nd part of a seed to initialize dropout.
-input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-    num_units].
+input: If time_major is true, this is a 3-D tensor with the shape of
+    [seq_length, batch_size, input_size]. If time_major is false, the shape is
+    [batch_size, seq_length, input_size].
+input_h: If time_major is true, this is a 3-D tensor with the shape of
+    [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+    is [batch_size, num_layer * dir, num_units].
 input_c: For LSTM, a 3-D tensor with the shape of
     [num_layer * dir, batch, num_units]. For other models, it is ignored.
 params: A 1-D tensor that contains the weights and biases in an opaque layout.
@@ -26,8 +29,9 @@ params: A 1-D tensor that contains the weights and biases in an opaque layout.
     separately. Note that they might not be compatible across different
     generations. So it is a good idea to save and restore
 sequence_lengths: a vector of lengths of each input sequence.
-output: A 3-D tensor with the shape of [seq_length, batch_size,
-    dir * num_units].
+output: If time_major is true, this is a 3-D tensor with the shape of
+    [seq_length, batch_size, dir * num_units]. If time_major is false, the
+    shape is [batch_size, seq_length, dir * num_units].
 output_h: The same shape has input_h.
 output_c: The same shape as input_c for LSTM. An empty tensor for other models.
 output_backprop: A 3-D tensor with the same shape as output in the forward pass.
@@ -35,6 +39,8 @@ output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
     pass.
 output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
     pass.
+time_major: Indicates whether the input/output format is time major or batch
+    major.
 reserve_space: The same reserve_space produced in the forward operation.
 input_backprop: The backprop to input in the forward pass. Has the same shape
     as input.
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt
index 9cde53684d0350510b18c35e991a9f526c5bb212..e076d3cda28f5f99805418a098e31f0158ce8da6 100644
--- a/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt
@@ -16,9 +16,12 @@ direction: Indicates whether a bidirectional model will be used. Should be
 dropout: Dropout probability. When set to 0., dropout is disabled.
 seed: The 1st part of a seed to initialize dropout.
 seed2: The 2nd part of a seed to initialize dropout.
-input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-    num_units].
+input: If time_major is true, this is a 3-D tensor with the shape of
+    [seq_length, batch_size, input_size]. If time_major is false, the shape is
+    [batch_size, seq_length, input_size].
+input_h: If time_major is true, this is a 3-D tensor with the shape of
+    [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+    is [batch_size, num_layer * dir, num_units].
 input_c: For LSTM, a 3-D tensor with the shape of
     [num_layer * dir, batch, num_units]. For other models, it is ignored.
 params: A 1-D tensor that contains the weights and biases in an opaque layout.
@@ -26,12 +29,15 @@ params: A 1-D tensor that contains the weights and biases in an opaque layout.
     separately. Note that they might not be compatible across different
     generations. So it is a good idea to save and restore
 sequence_lengths: a vector of lengths of each input sequence.
-output: A 3-D tensor with the shape of [seq_length, batch_size,
-    dir * num_units].
+output: If time_major is true, this is a 3-D tensor with the shape of
+    [seq_length, batch_size, dir * num_units]. If time_major is false, the
+    shape is [batch_size, seq_length, dir * num_units].
 output_h: The same shape has input_h.
 output_c: The same shape as input_c for LSTM. An empty tensor for other models.
 is_training: Indicates whether this operation is used for inferenece or
   training.
+time_major: Indicates whether the input/output format is time major or batch
+    major.
 reserve_space: An opaque tensor that can be used in backprop calculation. It
   is only produced if is_training is true.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
index c8152f53c4ded035140abd24ba006bf391641cf1..22c3524360c196bfdeda1221842c3da7af7701ef 100644
--- a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -31,7 +31,8 @@ END
   attr {
     name: "field_names"
     description: <<END
-List of strings containing proto field names.
+List of strings containing proto field names. An extension field can be decoded
+by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9261fff541bd632dd93fd9bce67d40dcbdd369d7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingIntegerBatch"
+  visibility: HIDDEN
+  in_arg {
+    name: "batch"
+    description: <<END
+A list of 1D tensors, one for each embedding table, containing the
+indices into the tables.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  summary: "An op that enqueues a list of input batch tensors to TPUEmbedding."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..054df9d607138bf7383909bc2cd9c177aaa64c41
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseBatch"
+  visibility: HIDDEN
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example and
+feature to which the corresponding embedding_indices and aggregation_weights
+values belong. sample_indices[i] must equal b * nf + f, where nf is the
+number of features from the corresponding table, f is in [0, nf), and
+b is in [0, batch size).
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per sample -- i.e. per
+(training example, feature) -- aggregation weights.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  summary: "An op that enqueues TPUEmbedding input indices from a SparseTensor."
+  description: <<END
+This Op eases the porting of code that uses embedding_lookup_sparse(),
+although some Python preprocessing of the SparseTensor arguments to
+embedding_lookup_sparse() is required to produce the arguments to this Op,
+since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+step.
+
+The tensors at corresponding positions in the three input lists
+must have the same shape, i.e. rank 1 with dim_size() equal to the total
+number of lookups into the table described by the corresponding table_id.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..865fba73b219f1eb1640c7d65a4e5c55e7794e39
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
@@ -0,0 +1,75 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  visibility: HIDDEN
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example to
+which the corresponding embedding_indices and aggregation_weights values
+belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+It corresponds to sp_ids.values in embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per training example
+aggregation weights. It corresponds to sp_weights.values in
+embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  attr {
+    name: "table_ids"
+    description: <<END
+A list of integers specifying the identifier of the embedding table
+(offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+corresponding input. The ith input is looked up using table_ids[i]. The size
+of the table_ids list must be equal to that of sample_indices,
+embedding_indices and aggregation_weights.
+END
+  }
+  summary: "Eases the porting of code that uses tf.nn.embedding_lookup_sparse()."
+  description: <<END
+sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+to the ith feature. table_ids[i] indicates which embedding table to look up ith
+feature.
+
+The tensors at corresponding positions in the three input lists (sample_indices,
+embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+with dim_size() equal to the total number of lookups into the table described by
+the corresponding feature.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d815b856bfb3c97b9347aa49b22a3b2f00908b4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  endpoint {
+    name: "EuclideanNorm"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the euclidean norm of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalAutoShardDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalAutoShardDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd43502e80905083dd893e39789d2ed4b5b34d10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalAutoShardDataset.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "ExperimentalAutoShardDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+  name: "num_workers"
+  description: <<END
+A scalar representing the number of workers to distribute this dataset across.
+END
+  }
+  in_arg {
+  name: "index"
+  description: <<END
+A scalar representing the index of the current worker out of num_workers.
+END
+  }
+  summary: "Creates a dataset that shards the input dataset."
+  description: <<END
+Creates a dataset that shards the input dataset by num_workers, returning a
+sharded dataset for the index-th worker. This attempts to automatically shard
+a dataset by examining the Dataset graph and inserting a shard op before the
+inputs to a reader Dataset (e.g. CSVDataset, TFRecordDataset).
+
+This dataset will throw a NotFound error if we cannot shard the dataset
+automatically.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8455308e5c8ea178680ecdc6d443054f198ede6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "ExperimentalRebatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+  name: "num_workers"
+  description: <<END
+A scalar representing the number of workers to distribute this batch across. As
+a result of this transformation the current batch size would end up being
+divided  by this parameter.
+END
+  }
+  summary: "Creates a dataset that changes the batch size."
+  description: <<END
+Creates a dataset that changes the batch size of the dataset to current batch
+size // num_workers.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
index c10a1bb778e1d8b45b59113d255d69c55a224643..4b951659a2b46a7bb50f038b156b78153d738c6d 100644
--- a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
@@ -48,6 +48,14 @@ END
     description: <<END
 indicates if the noise should be generated using a
 uniform distribution or a Gaussian distribution.
+END
+  }
+  attr {
+    name: "noise"
+    description: <<END
+indicates if the noise should `uniform`, `gaussian`, or
+`zero`. The default is `uniform` which means the the noise type
+will be decided by `uniform_noise`.
 END
   }
   summary: "Extracts a glimpse from the input tensor."
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d89457d99f1f4a76020a130b4370eaa64f1677ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "InfeedDequeue"
+  visibility: HIDDEN
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  summary: "A placeholder op for a value that will be fed into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b87b3b947c4f8fa7d35839885bcc8b5356c4917
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "InfeedDequeueTuple"
+  visibility: HIDDEN
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  summary: "Fetches multiple values from infeed as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..144542b927e713d3d132b486847971ceb37d240c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "InfeedEnqueue"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "layout"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence.
+If a layout attribute is passed, but its values are all -1, the layout will
+be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "An op which feeds a single Tensor value into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueuePrelinearizedBuffer.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueuePrelinearizedBuffer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdda6e75687e44af5137c01348a09c2509551843
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueuePrelinearizedBuffer.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "InfeedEnqueuePrelinearizedBuffer"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+A variant tensor representing linearized output.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op is running on a TPU device
+and = 0 when the Op is running on the CPU device.
+END
+  }
+  summary: "An op which enqueues prelinearized buffer into TPU infeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..085fa8bca1f3f954a2b429f9a58e1c43192f4ba6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "InfeedEnqueueTuple"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `inputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `inputs`.
+END
+  }
+  attr {
+    name: "layouts"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence for
+all the tuple shapes, in the order the shapes appear in the "shapes" input.
+The layout elements for a sub-shape can be set to -1, in which case the
+corresponding layout will be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Feeds multiple Tensor values into the computation as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2763d3a8f64d5cd5d39b1d3dc3c58fa9a64cb7d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37524ed9f0033dc46b374d6f7e628433cb1be038
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1abc91c1313bcc4bcd7513a7ad58cf0e43c1118e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6f13f89aa83718bd7a71ee599d380af9dc04a0f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..916d3ede96cda4d4c81b64198d59ffd5947b81cb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1b137ccfa5d655885e1311204e22084314540ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b657079c2da56d401aaef8c40d60c8e386d2008f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Value of mg used in the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Load centered RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40a2079d92843ac48b3d3e35eb7cfcb573143f26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1772a1c9c9de3f9c6edbdada428480bb72a56d4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98955a3a8d126d7a87def99a2f180f458186cb52
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "weights"
+    description: <<END
+Value of weights used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "benefits"
+    description: <<END
+Value of benefits used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Load MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9742b91b6ea3e15f616511829c024262653e7574
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6ec0028bbb62147e1f220651b14526bfbe2dbec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11ec98df85380538077516c3d40609b97872660d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f13b40fbe597ba76ac725a02c855014bfc98a686
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f56e7ef89b47a0ba6c151d369e289eba0d801312
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8b365db12a94472c01e6191549887aabc029721
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b262391c6ad76d1fc9c4a48f7e4375e6ad1485a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  visibility: HIDDEN
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Load SGD embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
index d158f4b50289e56b7b4cbd9192a6d7dba260ecf6..764189b150ce3dd4a4d8dce87e6a944e7de05bb5 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapDefun.pbtxt
@@ -41,5 +41,16 @@ END
   }
   summary: <<END
   Maps a function on the list of tensors unpacked from arguments on dimension 0.
+  The function given by `f` is assumed to be stateless, and is executed
+  concurrently on all the slices; up to batch_size (i.e. the size of the 0th
+  dimension of each argument) functions will be scheduled at once.
+
+  The `max_intra_op_parallelism` attr, which defaults to 1, can be used to
+  limit the intra op parallelism. To limit inter-op parallelism, a user can
+  set a private threadpool on the dataset using `tf.data.Options`'s
+  `ThreadingOptions`.
+
+  Note that this op is not exposed to users directly, but is invoked in tf.data
+  rewrites.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
index 63c5604d60e682efc378e91862e5c18f0082bc23..11ac0b6e39f0a799cef7b00e6440b914d679ea9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
@@ -42,6 +42,12 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "include_batch_in_index"
+    description: <<END
+Whether to include batch dimension in flattened index of `argmax`.
 END
   }
   summary: "Computes second-order gradients of the maxpooling function."
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt
index 4ae503e79d3abddbc972c66a662ed2e39bb4a024..52904e507d202d42de82fa107c88f2988d4d7e84 100644
--- a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt
@@ -43,6 +43,12 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "include_batch_in_index"
+    description: <<END
+Whether to include batch dimension in flattened index of `argmax`.
 END
   }
   summary: "Computes gradients of the maxpooling function."
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt
index e717e57b50af3ee897c4ea7c309aebd72096f8a4..f888d98030c3107b9e7a294c83986f55af5efc1a 100644
--- a/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -35,13 +35,20 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "include_batch_in_index"
+    description: <<END
+Whether to include batch dimension in flattened index of `argmax`.
 END
   }
   summary: "Performs max pooling on the input and outputs both max values and indices."
   description: <<END
 The indices in `argmax` are flattened, so that a maximum value at position
-`[b, y, x, c]` becomes flattened index
-`((b * height + y) * width + x) * channels + c`.
+`[b, y, x, c]` becomes flattened index:
+`(y * width + x) * channels + c` if `include_batch_in_index` is False;
+`((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True.
 
 The indices returned are always in `[0, height) x [0, width)` before flattening,
 even if padding is involved and the mathematically correct answer is outside
diff --git a/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51a5fa95d3d90cb9ac8798492d7bc469336ccb82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "MulNoNan"
+  endpoint {
+    name: "MulNoNan"
+  }
+  summary: "Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN."
+  description: <<END
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt b/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a552670f4cd31db6962696f2f826416702eb19b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "NonDeterministicInts"
+  visibility: HIDDEN
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Non-deterministic integer values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Non-deterministically generates some integers."
+  description: <<END
+This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05b81db766588ba4190bb1434427ef2bee016f55
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "OutfeedDequeue"
+  visibility: HIDDEN
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be read from the device outfeed.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieves a single tensor from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b38ea861edf464b0a37aca0ba207894ca61dd1cf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "OutfeedDequeueTuple"
+  visibility: HIDDEN
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be read from the outfeed.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieve multiple values from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available. Output `i`
+corresponds to XLA tuple element `i`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0a401d86a501863cbd57abbadbb15616f5fcbce
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "OutfeedEnqueue"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be inserted into the outfeed queue.
+END
+  }
+  summary: "Enqueue a Tensor on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa1146a4d0af7b1f8a24a33a09fdf6b1ceccb66e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "OutfeedEnqueueTuple"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be inserted into the outfeed queue as an
+XLA tuple.
+END
+  }
+  summary: "Enqueue multiple Tensor values on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt
index 10bf370f5493cd7e0848adfefb20c861cab076cf..59974ca3aedbab298f32baf02f008240dd951f97 100644
--- a/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt
@@ -5,8 +5,9 @@ op {
 The polygamma function is defined as:
 
 
-\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+\\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\)
 
 where \\(\psi(x)\\) is the digamma function.
+The polygamma function is defined only for non-negative integer orders \\a\\.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_Prelinearize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Prelinearize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6febd7b67326689570c6f9354b60fa055bcb710
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Prelinearize.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "Prelinearize"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be linearized.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "layout"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence. If a layout
+attribute is passed but its values are all -1 the layout will be computed by
+the infeed operation.
+END
+  }
+  summary: <<END
+An op which linearizes one Tensor value to an opaque variant tensor.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PrelinearizeTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrelinearizeTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..63d2d891c98774d4e10241f2afaab86a47584ca9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PrelinearizeTuple.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "PrelinearizeTuple"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `inputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `inputs`.
+END
+  }
+  attr {
+    name: "layouts"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence for all the
+tuple shapes in the order the shapes appear in the "shapes" input. The layout
+elements for a sub-shape can be set to -1 in which case the corresponding layout
+will be computed by the infeed operation.
+END
+  }
+  summary: <<END
+An op which linearizes multiple Tensor values to an opaque variant tensor.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17ff15378c90f709ec6a2428a9c6408f23eeabe8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3ab3eba2c0bf06bf8a41eabc0020582c3ada8ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b00c2b7f650260d7d2150935ddfab1d65fac335
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f309f648cafb307569bdabe496ca44c8c200c585
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBias"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b73eaae3613238d17900a4f15a7ad6839d92a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..101f72708af5cc92155b0641a14fc89889fa7488
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..697e26841539603ce2f6d26a082378881ce214a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cf52d6c897f9dc4e1e4988259b1c74043203727
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e91a2b8dc063c60cb2d8cd104bac864d063eee3b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ec528bf4a64bca8531d6daa90af2b13cebcec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..47a5f1553b9c35108c1a8ba48ee1834b9dd00475
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "RecvTPUEmbeddingActivations"
+  visibility: HIDDEN
+  out_arg {
+    name: "outputs"
+    description: <<END
+A TensorList of embedding activations containing one Tensor per
+embedding table in the model.
+END
+  }
+  attr {
+    name: "num_outputs"
+    description: <<END
+The number of output activation tensors, equal to the number of
+embedding tables in the model.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "An op that receives embedding activations on the TPU."
+  description: <<END
+The TPU system performs the embedding lookups and aggregations specified by
+the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+results of these aggregations are visible to the Tensorflow Graph as the
+outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+one Tensor of activations per table specified in the model. There can be at
+most one RecvTPUEmbeddingActivations op in the TPU graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
index 2cc1a55676c354c9470287ccb89e39489ab18c02..5be48cd361b04191da1dad453867e80493a4efb3 100644
--- a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
@@ -6,20 +6,30 @@ op {
   }
   in_arg {
     name: "pattern"
-    description: "The regular expression to match the input."
+    description: "The regular expression to be matched in the `input` strings."
   }
   in_arg {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expression."
+    description: <<END
+The rewrite string to be substituted for the `pattern` expression where it is
+matched in the `input` strings.
+END
   }
   out_arg {
     name: "output"
-    description: "The text after applying pattern and rewrite."
+    description: "The text after applying pattern match and rewrite substitution."
   }
   attr {
     name: "replace_global"
-    description: "If True, the replacement is global, otherwise the replacement\nis done only on the first match."
+    description: <<END
+If True, the replacement is global (that is, all matches of the `pattern` regular
+expression in each input string are rewritten), otherwise the `rewrite`
+substitution is only made for the first `pattern` match.
+END
   }
-  summary: "Replaces the match of pattern in input with rewrite."
+  summary: <<END
+Replaces matches of the `pattern` regular expression in `input` with the
+replacement string provided in `rewrite`.
+END
   description: "It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)"
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
index 07bbd4ac6031765a070c5e5b4ee0726512dbb6ca..cd7d4e3ec2abf68b2c8461a5e301213e34252d1e 100644
--- a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
@@ -30,10 +30,12 @@ END
 The type of the input.
 END
   }
-  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
+  summary:
+"Computes a range that covers the actual values present in a quantized tensor."
   description: <<END
-range that covers the actual values present in that tensor.  This op is
-typically used to produce the requested_output_min and requested_output_max for
-Requantize.
+Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+range that covers the actual values present in that tensor. This op is typically
+used to produce the `requested_output_min` and `requested_output_max` for
+`Requantize`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58cf1222500e6bf58a22beb17ffccf2949dd4c81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "RequantizationRangePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "clip_value_max"
+    description: <<END
+The maximum value of the output that needs to be clipped.
+Example: set this to 6 for Relu6. 
+END
+  }
+  summary: "Computes requantization range per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
index 1b03f63b261e00c6b1dfdc0b1f11c69d71b536eb..23e1656288d6c97b8facd7eafb5ba5cd862dade1 100644
--- a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
@@ -48,13 +48,15 @@ END
 The type of the output. Should be a lower bit depth than Tinput.
 END
   }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
+  summary: 
+"Converts the quantized `input` tensor into a lower-precision `output`."
   description: <<END
-output range specified with 'requested_output_min' and 'requested_output_max'.
+Converts the quantized `input` tensor into a lower-precision `output`, using the
+output range specified with `requested_output_min` and `requested_output_max`.
 
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+`[input_min, input_max]` are scalar floats that specify the range for the float
+interpretation of the `input` data. For example, if `input_min` is -1.0f and
+`input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
 value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bce6c3dd5a0b5e6f4e7fe2aa990827a4020bed17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "RequantizePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  in_arg {
+    name: "requested_output_min"
+    description: <<END
+The minimum value of the output tensor requested.
+END
+  }
+  in_arg {
+    name: "requested_output_max"
+    description: <<END
+The maximum value of the output tensor requested.
+END
+  }  
+  out_arg {
+    name: "output"
+    description: <<END
+Output tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The quantized type of output tensor that needs to be converted.
+END
+  }
+  summary: "Requantizes input with min and max values known per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6f32f592c5f0827ced545d02bee4ad0f9c154ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8da231371267da22960834bd661da9b46d40e562
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4857f52b94294c47a4e18bd6f381bf562a7e042
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fad2f257338aec18af338ce6fd4092613a7b5797
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee8af75c8c9a6157322f6db23a00a6ff7a465e09
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..365fe8e89175ac9ea28d486a6762f5127b04391e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41690936932ea25bbb96e50bf25c868c9f2941f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mg"
+    description: <<END
+Parameter mg updated by the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve centered RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..825fee2e5c3d0dc67ff5dd19fa5e181087dcd8b8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f7d9e673fbcf8ce19b882f33b65ffb0189fae2e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c57a7e4c6427df499df5db0f79d62215bf62a0a1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "weights"
+    description: <<END
+Parameter weights updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "benefits"
+    description: <<END
+Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Retrieve MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7120a02618303ccea0a0b4eb6449991c81589ab4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5caa83ce80feefa45d4ee027a9a8aaf62454c826
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0fc8c99aee9d33e1e5e64766e089d55c78566b6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fd44934ee3cf1f6c0284aedf3feffe308bef2bf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1c5ff517f2c057a3150d75027fbe7d83093de01
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d4e2c0f7eca92093522b98f69582d3c8bece62a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11578025da1a10494caf7e72049c20f62581cbc9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  visibility: HIDDEN
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Retrieve SGD embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index d33a36ce06c37092bd25e241b36f1c564070c6e2..d5643c8a79a92fad53b1737a80172fb56004327f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the max is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_max(c, tf.constant([0, 0, 1]))
+# ==> [[4, 3, 3, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index afdc39da96df01185af600409435faca49cabc0e..b03649ab077d893ceb17704c57060cde99be1db2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the mean along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -30,5 +30,15 @@ If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_mean(c, tf.constant([0, 0, 1]))
+# ==> [[2.5, 2.5, 2.5, 2.5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 026b5b3991f9cbbfce9add2ff4cb7e370a1cc799..6796678555ef8f3bbf27c742db6f1c0c30c483bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,14 @@ If the min is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_min(c, tf.constant([0, 0, 1]))
+# ==> [[1, 2, 2, 1],
+#      [5, 6, 7, 8]]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index a168eed87f668c97141f2c8966c68866b82477de..10b368fcca4dab3bb197609e3e10189323bf9bc7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the product is empty for a given segment ID `i`, `output[i] = 1`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_prod(c, tf.constant([0, 0, 1]))
+# ==> [[4, 6, 6, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 876b8608240df108c66bceefa5f7eba82ddb7524..487a6d10746ce684c9f0e27f6336c22994fee3b4 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+# ==> [[5, 5, 5, 5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79286072870850373178155f020c63f70ed48dd6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "SendTPUEmbeddingGradients"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+A TensorList of gradients with which to update embedding tables.
+This argument has the same length and shapes as the return value of
+RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+with respect to the embedding activations. The embedding tables are updated
+from these gradients via the optimizer specified in the TPU embedding
+configuration given to tpu.initialize_system.
+END
+  }
+  in_arg {
+    name: "learning_rates"
+    description: <<END
+A TensorList of float32 scalars, one for each dynamic learning
+rate tag: see the comments in
+//third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+Multiple tables can share the same dynamic learning rate tag as specified
+in the configuration. If the learning rates for all tables are constant,
+this list should be empty.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "Performs gradient updates of embedding tables."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd537e05d7c97a2d43c14916b44eb32ae9b2efc1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "ShardDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "num_shards"
+    description: <<END
+An integer representing the number of shards operating in parallel.
+END
+  }
+  in_arg {
+    name: "index"
+    description: <<END
+An integer representing the current worker index.
+END
+  }
+  summary: "Creates a `Dataset` that includes only 1/`num_shards` of this dataset."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3392daed28010084c4a366c73a372c9d6699f938
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "ShutdownDistributedTPU"
+  visibility: HIDDEN
+  summary: "Shuts down a running distributed TPU system."
+  description: <<END
+The op returns an error if no system is running.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 138a6366c8aa1e5b9d876621b93c7d36f16f38e2..0bbc0780dfee4af8cfcb036264969dbd4ec7bbdf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,9 +21,7 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index b8073d88ac3d10cad6bc7771d3fe28bae905d8e5..65b2358830ed9eeccbc099b055b9f8b50e92afdc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 945bbdcf627c48047ffa65c4c4e5124cbd96e54b..a28bd1a646445c660183d6e35b9e6df64637c4f3 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,8 +23,7 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index ff328c8a6195f9aca515de4d8a682b50df92117e..8a5d2bb02c4c42e3d67f6e01b8c609be84575270 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -33,7 +33,7 @@ Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index a68e14607f81e999f95e85b4481fb0474e691aa4..d7494dc8deb37927dce09ce3a854339c27758286 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -22,7 +22,7 @@ END
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index aa5c1fc8d0d698008787418ef24ecb3c0c635f6a..039ca9a23ba1abf2d67327ddb72bb49b8de1ab68 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
 for an explanation of segments.
 
 For example:
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c145e05ac7d9d928dd22721a394df497f1aabab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulUniformFullInt.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulUniformFullInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d576052c0ab3d52470a2ce48ad86774e0e3be82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulUniformFullInt.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "StatefulUniformFullInt"
+  visibility: HIDDEN
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random integers from a uniform distribution."
+  description: <<END
+The generated values are uniform integers covering the whole range of `dtype`. 
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulUniformInt.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19027ce43a9ee0c1ab8b48fa33b4e50f250ebe1a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulUniformInt.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "StatefulUniformInt"
+  visibility: HIDDEN
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "minval"
+    description: <<END
+Minimum value (inclusive, scalar).
+END
+  }
+  in_arg {
+    name: "maxval"
+    description: <<END
+Maximum value (exclusive, scalar).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random integers from a uniform distribution."
+  description: <<END
+The generated values are uniform integers in the range `[minval, maxval)`.
+The lower bound `minval` is included in the range, while the upper bound
+`maxval` is excluded.
+
+The random integers are slightly biased unless `maxval - minval` is an exact
+power of two.  The bias is small for values of `maxval - minval` significantly
+smaller than the range of the output (either `2^32` or `2^64`).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78fb045160d63d64b72029b90a2b110487b77e6e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TPUCompilationResult"
+  visibility: HIDDEN
+  summary: "CompilationResultProto indicating the status of the TPU compilation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d012fbb8b73800f3b00121b315635078fde2ae05
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "TPUEmbeddingActivations"
+  visibility: HIDDEN
+  in_arg {
+    name: "embedding_variable"
+    description: <<END
+A trainable variable, enabling optimizers to find this op.
+END
+  }
+  in_arg {
+    name: "sliced_activations"
+    description: <<END
+The embedding activations Tensor to return.
+END
+  }
+  attr {
+    name: "table_id"
+    description: <<END
+The id of the table in the embedding layer configuration from which
+these activations were computed.
+END
+  }
+  attr {
+    name: "lookup_id"
+    description: <<END
+Identifier of the set of embedding indices which produced these
+activations.
+END
+  }
+  summary: "An op enabling differentiation of TPU Embeddings."
+  description: <<END
+This op simply returns its first input, which is assumed to have been sliced
+from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+this op, and its first argument being a trainable Variable, enables automatic
+differentiation of graphs containing embeddings via the TPU Embedding Python
+libraries.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f9d53fd146b7c6c80854b22e50801045a6e9982
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "TPUOrdinalSelector"
+  visibility: HIDDEN
+  out_arg {
+    name: "device_ordinals"
+    description: <<END
+A vector 1 or more TPU cores.
+END
+  }
+  summary: "A TPU core selector Op."
+  description: <<END
+This Op produces a set of TPU cores (for warm-up) or a single TPU core
+(for regular inference) to execute the TPU program on. The output is
+consumed by TPUPartitionedCall.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d27e67598a5ef43c78cfaf95974688389d4b6e29
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "TPUPartitionedCall"
+  visibility: HIDDEN
+  in_arg {
+    name: "args"
+    description: <<END
+The arguments to the function.
+END
+  }
+  in_arg {
+    name: "device_ordinal"
+    description: <<END
+The TPU device ordinal to run the function on.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The output of the function call.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+The types of the arguments to the function.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+The types of the outputs of the function.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The function to call.
+END
+  }
+  summary: "Calls a function placed on a specified TPU device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a305003b64b02d0cc6556d1db324d2eb0d21d5ef
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
@@ -0,0 +1,100 @@
+op {
+  graph_op_name: "TPUReplicate"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+the inputs to 'computation', flattened, in replica-major order.
+END
+  }
+  in_arg {
+    name: "broadcast_inputs"
+    description: <<END
+additional arguments to broadcast to all replicas. The
+broadcast inputs are appended to the per-replica inputs when calling
+computation.
+END
+  }
+  in_arg {
+    name: "guaranteed_constants"
+    description: <<END
+arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+the outputs of 'computation'.
+END
+  }
+  attr {
+    name: "computation"
+    description: <<END
+a function containing the computation to run.
+END
+  }
+  attr {
+    name: "num_replicas"
+    description: <<END
+the number of replicas of the computation to run.
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+the number of logical cores in each replica.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+a bool indicating if this computation will run on TPU or CPU/GPU.
+Currently, only supports a default placement (computation is placed on GPU
+if one is available, and on CPU if not).
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+a flattened array with shape
+[replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
+of logical cores in each replica of a computation to physical coordinates in
+the TPU topology.
+END
+  }
+  attr {
+    name: "Tinputs"
+    description: <<END
+the types of the arguments to 'computation'.
+END
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    description: <<END
+the types of the additional arguments to broadcast to all
+replicas.
+END
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    description: <<END
+the types of the arguments to 'guaranteed_constants'.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+the types of the outputs of 'computation'.
+END
+  }
+  summary: "Runs replicated computations on a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94cee9491547e190dee33af5b3bc6bfefb83bfc0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "TPUReplicateMetadata"
+  visibility: HIDDEN
+  attr {
+    name: "num_replicas"
+    description: <<END
+Number of replicas of the computation
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+Number of cores per replica. Used for model parallelism.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+TopologyProto indicating the topology of the TPU pod slice.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+Whether to place the computation on the TPU.
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+The assignment of devices for the computation.
+END
+  }
+  attr {
+    name: "computation_shape"
+    description: <<END
+DEPRECATED. Use num_cores_per_replica instead.
+END
+  }
+  attr {
+    name: "host_compute_core"
+  }
+  attr {
+    name: "padding_map"
+  }
+  summary: "Metadata indicaitng how the TPU computation should be replicated."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..acd52a735cb2612af87fd98a983bba1e7d3de647
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TPUReplicatedInput"
+  visibility: HIDDEN
+  summary: "Connects N inputs to an N-way replicated TPU computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..162da6c7c7dbb511086a69126135f008cac4792e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TPUReplicatedOutput"
+  visibility: HIDDEN
+  summary: "Connects outputs of an N-way replicated computation to N outputs."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b2af2c23b715f5cdb804dd449bf1001a444e686
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "TensorListConcatV2"
+  summary: "Concats all tensors in the list along the 0th dimension."
+  description: <<END
+Requires that all tensors have the same shape except the first dimension.
+
+input_handle: The input list.
+element_shape: The shape of the uninitialized elements in the list. If the first
+  dimension is not -1, it is assumed that all list elements have the same
+  leading dim.
+leading_dims: The list of leading dims of uninitialized list elements. Used if
+  the leading dim of input_handle.element_shape or the element_shape input arg
+  is not already set.
+tensor: The concated result.
+lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23da422971c485e8e8bba1a6b6cf1d9605d8ffbf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  summary: "Scatters tensor at indices in an input list."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+input_handle: The list to scatter into.
+tensor: The input tensor.
+indices: The indices used to index into the list.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f520900fc0ce06d3fd6bb9bff4e164260ba71f0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  summary: "Creates a TensorList by indexing into a Tensor."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+tensor: The input tensor.
+indices: The indices used to index into the list.
+element_shape: The shape of the elements in the list (can be less specified than
+  the shape of the tensor).
+num_elements: The size of the output list. Must be large enough to accommodate
+  the largest index in indices. If -1, the list is just large enough to include
+  the largest index in indices.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80f3675d551ce90435fb7b78969f70986a4e8c02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "TridiagonalSolve"
+  visibility: HIDDEN
+  in_arg {
+    name: "diagonals"
+    description: <<END
+Shape is `[..., 3, M]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+
+  summary: "Solves tridiagonal systems of equations."
+  description: <<END
+`diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+represent matrices with three rows being the superdiagonal, diagonals, and
+subdiagonals, in order. The last element of the superdiagonal and the first
+element of the subdiagonal is ignored.
+`rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+each left-hand side.
+The output is a tensor of shape `[..., M, K]` containing the solutions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index ed4a2bd5588eecb19d9d5effb386b2fe5c0c4409..f282b9fab56a2735519ec56d3292867feb84750a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -37,5 +37,15 @@ dropped, and will not be included in the result.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
 </div>
+
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  3, 3, 4],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
index 7e139ddf4d9fac5cd47fdb56927cb325be45d54d..0360cc09d064295a45032905f652e56055b92986 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -31,6 +31,15 @@ If the minimum is empty for a given segment ID `i`, it outputs the largest
 possible value for the specific numeric type,
 `output[i] = numeric_limits<T>::max()`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 1,  2, 2, 1],
+#       [5,  6, 7, 8]]
+```
+
 If the given segment ID `i` is negative, then the corresponding value is
 dropped, and will not be included in the result.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
index 9c8ea3b620832dba4f18a1cfbac953bad8bb6f56..67de4734bdedcf82acdcf993e1fb3e36d7b140d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -28,6 +28,15 @@ entries belonging to a segment such that:
 \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
 `j...` such that `segment_ids[j...] == i`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  6, 6, 4],
+#       [5,  6, 7, 8]]
+```
+
 If there is no entry for a given segment ID `i`, it outputs 1.
 
 If the given segment ID `i` is negative, then the corresponding value is
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 7e5d9265c2ead2028fa8bb80076ea40f858cff39..08139235f4a792c353dda4db667a135c823c6e5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -35,5 +35,13 @@ added to the sum of the segment.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 </div>
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 5,  5, 5, 5],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e53a8396683402df374c376b8826e8bada6db91b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "WorkerHeartbeat"
+  visibility: HIDDEN
+  in_arg {
+    name: "request"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatRequest
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatResponse
+END
+  }
+  summary: "Worker heartbeat op."
+  description: <<END
+Heartbeats may be sent periodically to indicate the coordinator is still active,
+to retrieve the current worker status and to expedite shutdown when necessary.
+END
+}
diff --git a/tensorflow/core/api_def/excluded_ops.cc b/tensorflow/core/api_def/excluded_ops.cc
index 02026e94abc5b3284578859e157279b27ba84446..65d2102ac80579b0ba6f9510cd7a95300cd10a3f 100644
--- a/tensorflow/core/api_def/excluded_ops.cc
+++ b/tensorflow/core/api_def/excluded_ops.cc
@@ -24,9 +24,9 @@ const std::unordered_set<std::string>* GetExcludedOps() {
            "GcsConfigureBlockCache", "GcsConfigureCredentials",
 #ifdef INTEL_MKL
            // QuantizedFusedOps for Intel CPU
-           "QuantizedConv2DAndRequantize", "QuantizedConv2DWithBias",
-           "QuantizedConv2DWithBiasAndRequantize", "QuantizedConv2DAndRelu",
-           "QuantizedConv2DAndReluAndRequantize",
+           "QuantizedConcatV2", "QuantizedConv2DAndRequantize",
+           "QuantizedConv2DWithBias", "QuantizedConv2DWithBiasAndRequantize",
+           "QuantizedConv2DAndRelu", "QuantizedConv2DAndReluAndRequantize",
            "QuantizedConv2DWithBiasAndRelu",
            "QuantizedConv2DWithBiasAndReluAndRequantize",
            "QuantizedConv2DWithBiasSumAndRelu",
diff --git a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
index d51defc376ff9a0961ed5bd43b848ea3f6df288d..bc8cc309f552e93e1dd6ff1fb0d74f8fda0cd1f7 100644
--- a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
@@ -2,6 +2,10 @@ op {
   graph_op_name: "AsString"
   endpoint {
     name: "dtypes.as_string"
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "strings.as_string"
   }
   endpoint {
     name: "as_string"
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
index cc16523a1567e8d7f2d0146c1c44d9ef11b6c6d5..72c281de342e553280c029d98a275395a93896d0 100644
--- a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "AvgPool3D"
-  endpoint {
-    name: "nn.avg_pool3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Case.pbtxt b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c8193a35061a93dc21f1ac02bde318095fbf7b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Case"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d..edbcba26ce3d31cc8c3d9aecb9efc5286ddd7002 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -2,6 +2,7 @@ op {
   graph_op_name: "Conv3DBackpropFilterV2"
   endpoint {
     name: "nn.conv3d_backprop_filter"
+    deprecation_version: 2
   }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
index d6fd4691f74f51e14ea43a26fdac9d3e87fa1140..28f4514bd88e116b77ecf7f4d6a6660518b85a1f 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "DecodeWav"
-  visibility: HIDDEN
+  endpoint {
+    name: "audio.decode_wav"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
index 1bd83d906152d2e5792fecd5e80e339e0c67e7a5..97af07e0012ea99a69175e6ed5628566bf8b6873 100644
--- a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
@@ -1,7 +1,4 @@
 op {
   graph_op_name: "Dilation2D"
-  endpoint {
-    name: "nn.dilation2d"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3ea8859b5426926230f81a9ec31a6083d3a11dd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
index ed8abdfcd7f3171d431adf07d47eb3bfc60d1e8f..f1fc72c4ca18afbb4ce597dc17a513634d2423d0 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ExtractGlimpse"
-  endpoint {
-    name: "image.extract_glimpse"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
index e8576c9ff2e0729235d9bca70c369536dacaa08e..47016b9d6949b3cb8558b9d6b794183e9f8e7517 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "MaxPool3D"
-  endpoint {
-    name: "nn.max_pool3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
index 13a1a0b5df4d73884d267777ccf5ad6a44fcdbd4..c57cfc7727a5ebb87f219ad4ec9576a05ac68a69 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -1,7 +1,4 @@
 op {
   graph_op_name: "MaxPoolWithArgmax"
-  endpoint {
-    name: "nn.max_pool_with_argmax"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35b2f309fb3850a4393464209422822eebd9e2a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MulNoNan"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82b28601416f142a3e7898c3c7907866b8feb4a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5988beb8ba3ccd5c045c93715cebd83433f06dc3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterMax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterMax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01c448c3b88229d463128e056afb1e4b654228e7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterMin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterMin"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9752b44deda959bf923151457c21227ba1a7f84
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1816dd734fadd538ad81f243c203e3c1ccae259
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..237774a388d4e6d4079e401b35286cf1b91ce85c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcatV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20d9a43be4168af3f79d96224eb2fbefaa5752a2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a92a529d361cd3684d4306cb82bb3648e2b7e9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4329f2cda556d101906a410ac1ef0953186dc92
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorScatterAdd.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorScatterAdd"
+  deprecation_message: "Use tensor_scatter_nd_add instead"
+  endpoint {
+    name: "tensor_scatter_nd_add"
+  }
+  endpoint {
+    name: "tensor_scatter_add"
+    deprecation_version: 2
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorScatterSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bcee62efdcb4ec09beb988c004bfaaa020ec2500
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorScatterSub.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorScatterSub"
+  deprecation_message: "Use tensor_scatter_nd_sub instead"
+  endpoint {
+    name: "tensor_scatter_nd_sub"
+  }
+  endpoint {
+    name: "tensor_scatter_sub"
+    deprecation_version: 2
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorScatterUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa4ab86c7c35873514439a46f8097a3bb80f6cec
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorScatterUpdate.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorScatterUpdate"
+  deprecation_message: "Use tensor_scatter_nd_update instead"
+  endpoint {
+    name: "tensor_scatter_nd_update"
+  }
+  endpoint {
+    name: "tensor_scatter_update"
+    deprecation_version: 2
+  }
+}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 1fc077af92c719bf2c5d87eded55275032891f5d..fb3df6ea8cff378788d769da3ad9c7c9c0532310 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -121,31 +121,23 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
                            .Attr("dtype", dtype)
                            .Attr("var_name", accumulator_name)
                            .Finalize(g, &create_accumulator));
-    if (PartialTensorShape(shape).IsFullyDefined()) {
-      // For fully defined shapes make a constant zero tensor.
-      TF_RETURN_IF_ERROR(make_node("Const")
-                             .Attr("value", make_zeros(dtype, shape))
-                             .Attr("dtype", dtype)
-                             .Finalize(g, &initial_val));
-    } else {
-      // For partial shapes make a Fill operation to make a zero tensor with the
-      // shape of the first input.
-      Node* shape_node;
-      TF_RETURN_IF_ERROR(
-          make_node("Shape")
-              .Input(data_edges[0]->src(), data_edges[0]->src_output())
-              .Finalize(g, &shape_node));
-      Node* zero;
-      TF_RETURN_IF_ERROR(
-          make_node("Const")
-              .Attr("value", make_zeros(dtype, TensorShapeProto()))
-              .Attr("dtype", dtype)
-              .Finalize(g, &zero));
-      TF_RETURN_IF_ERROR(make_node("Fill")
-                             .Input(shape_node)
-                             .Input(zero)
-                             .Finalize(g, &initial_val));
-    }
+    PartialTensorShape partial_shape(shape);
+    // Make a Fill operation to make a zero tensor with the shape of the first
+    // input.
+    Node* shape_node;
+    TF_RETURN_IF_ERROR(
+        make_node("Shape")
+            .Input(data_edges[0]->src(), data_edges[0]->src_output())
+            .Finalize(g, &shape_node));
+    Node* zero;
+    TF_RETURN_IF_ERROR(make_node("Const")
+                           .Attr("value", make_zeros(dtype, TensorShapeProto()))
+                           .Attr("dtype", dtype)
+                           .Finalize(g, &zero));
+    TF_RETURN_IF_ERROR(make_node("Fill")
+                           .Input(shape_node)
+                           .Input(zero)
+                           .Finalize(g, &initial_val));
     TF_RETURN_IF_ERROR(make_node("Assign")
                            .Attr("T", dtype)
                            .Input(create_accumulator)  // ref: Ref(T)
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 92e56df18105218fc8a5112a880b6c999f1a2649..c9e3cf40860a2b68024dc0fc61e8c65640cbc9be 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -63,7 +63,7 @@ int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
       (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
           ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
           : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
-  CHECK_EQ(0, diff % elt_bytes);
+  DCHECK_EQ(0, diff % elt_bytes);
   base_chunk_elts += (diff / elt_bytes);
   DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
       << "total_elts=" << total_elts << " num_chunks=" << num_chunks
@@ -78,17 +78,23 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
  public:
   // Takes ownership of output and prepares to properly alias its chunks.
   // Ownership is taken because the shape may temporarily change.
-  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator,
+                        bool align_chunks)
       : output_(std::move(*output)),
         dt_(output_.dtype()),
         old_shape_(output_.shape()),
         num_chunks_(num_chunks),
         allocator_(allocator),
         total_elts_(output_.NumElements()),
-        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        chunk_elts_(align_chunks
+                        ? AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)
+                        : total_elts_ / num_chunks_),
         data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
         data_end_(data_start_ + total_elts_) {
-    CHECK_GT(chunk_elts_, 0);
+    if (!align_chunks) {
+      DCHECK_EQ(total_elts_, num_chunks_ * chunk_elts_);
+    }
+    DCHECK_GT(chunk_elts_, 0);
     Flatten();
   }
 
@@ -176,19 +182,24 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
 }  // namespace
 
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator) {
+                                         Allocator* allocator,
+                                         bool align_chunks) {
   switch (output->dtype()) {
     case DT_FLOAT:
-      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_DOUBLE:
-      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator,
+                                               align_chunks);
       break;
     case DT_INT32:
-      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_INT64:
-      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     default:
       LOG(FATAL) << "Unsupported type " << output->dtype()
@@ -227,6 +238,7 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
 
   Tensor* output = ctx->mutable_output(0);
   const Tensor* input = (col_params.instance.type == REDUCTION_COLLECTIVE ||
+                         col_params.instance.type == GATHER_COLLECTIVE ||
                          (col_params.instance.type == BROADCAST_COLLECTIVE &&
                           col_params.is_source))
                             ? &ctx->input(0)
@@ -296,4 +308,42 @@ Status BaseCollectiveExecutor::CreateCollective(
   return status;
 }
 
+bool BaseCollectiveExecutor::CheckDependencies(
+    const CollectiveParams& col_params) {
+  for (int32 instance : col_params.instance.impl_details.dependencies) {
+    auto find_iter = launched_.find(instance);
+    if (find_iter == launched_.end() || find_iter->second != 0) {
+      VLOG(1) << "Collective " << col_params.ToString()
+              << " blocked by instance " << instance;
+      return false;
+    }
+  }
+  return true;
+}
+
+void BaseCollectiveExecutor::WaitForDependencies(
+    const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  while (!CheckDependencies(col_params)) {
+    launch_cv_.wait(l);
+  }
+  VLOG(1) << "Unblocking collective " << col_params.ToString();
+}
+
+void BaseCollectiveExecutor::Launched(const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  if (launched_.find(col_params.instance.instance_key) == launched_.end()) {
+    const string& task_name =
+        col_params.instance.task_names[col_params.default_rank];
+    const int32 num_devices =
+        col_params.instance.num_devices_per_task.at(task_name);
+    launched_[col_params.instance.instance_key] = num_devices;
+  }
+  if (--launched_[col_params.instance.instance_key] == 0) {
+    VLOG(1) << "Unblocking dependencies for collective instance "
+            << col_params.instance.instance_key;
+    launch_cv_.notify_all();
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 09826a8814511cb46c907b983f240fe17df70e3d..bc85b5af5f87bd6d5fc1cdc28b17248eeb33a25d 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -78,9 +78,15 @@ class CollectiveAdapter {
 };
 
 // Create a CollectiveAdaptor wrapping 'output', specialized to its
-// data-type and shape.
+// data-type and shape.  If align_chunks == true then chunk size may
+// be larger than output->NumElements() / num_chunks and one or more
+// of the suffix chunks may be empty.  Chunks will be arranged to start
+// and end on alignment boundaries.  If align_chunks == false then
+// output->NumElements() % num_chunks must be 0 and all chunks will
+// have exactly the same size, ignoring alignment issues.
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator);
+                                         Allocator* allocator,
+                                         bool align_chunks = true);
 
 // Default implementation of CollectiveExecutor.  Delegates the actual
 // work of moving data to a class specialized for the operation type,
@@ -135,15 +141,33 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
                                client_locality, done);
   }
 
+  // If we need to enforce an ordering on any portion of collective
+  // implementation, and the ordering is encoded via attribute on the collective
+  // op, this function will block until all dependencies for this collective
+  // have completed.
+  void WaitForDependencies(const CollectiveParams& col_params) override;
+  // Record that this collective has completed the portion of the implementation
+  // that needs to be ordered wrt other collectives, to unblock any of its
+  // dependent ops.
+  void Launched(const CollectiveParams& col_params) override;
+
  protected:
   const int64 step_id_;
   const DeviceMgr* dev_mgr_;  // Not owned.
   std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
   const string* gpu_ring_order_;  // Not owned.
+  mutex launch_mu_;
+  condition_variable launch_cv_;
+  // collective instance key -> number of local devices for which NCCL ops have
+  // been launched.
+  std::unordered_map<int32, int32> launched_ GUARDED_BY(launch_mu_);
 
  private:
   Status CreateCollective(const CollectiveParams& col_params,
                           CollectiveImplementationInterface** col_impl);
+  // Check if all ops on which this collective depends on have launched.
+  bool CheckDependencies(const CollectiveParams& col_params)
+      EXCLUSIVE_LOCKS_REQUIRED(launch_mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 3843ea9e60cfbac4c428174f9b2201ccafaf505e..0e4ddb102002ec2802761e05013834cf491f7980 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -152,6 +153,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
+  c->freed_count = 0;
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -180,29 +182,46 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   free_chunks_list_ = h;
 }
 
-void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
+void* BFCAllocator::AllocateRawInternalWithRetry(
+    size_t unused_alignment, size_t num_bytes,
+    const AllocationAttributes& allocation_attr) {
   // Fast path: Try once to allocate without getting the retry_helper_ involved
-  void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
+  uint64 freed_by_count = 0;
+  if (allocation_attr.freed_by_func != nullptr) {
+    freed_by_count = allocation_attr.freed_by_func();
+  }
+  void* r =
+      AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
   if (r != nullptr) {
     return r;
   } else {
     static const int64 kMaxMillisToWait = 10000;  // 10 seconds
-    return retry_helper_.AllocateRaw(
-        [this](size_t a, size_t nb, bool v) {
-          return AllocateRawInternal(a, nb, v);
+    r = retry_helper_.AllocateRaw(
+        [this, &allocation_attr](size_t a, size_t nb, bool v) {
+          uint64 freed_by_count = 0;
+          if (allocation_attr.freed_by_func != nullptr) {
+            freed_by_count = allocation_attr.freed_by_func();
+          }
+          return AllocateRawInternal(a, nb, v, freed_by_count);
         },
         kMaxMillisToWait, unused_alignment, num_bytes);
+    return r;
   }
 }
 
 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
                                 const AllocationAttributes& allocation_attr) {
+  VLOG(1) << "AllocateRaw " << Name() << "  " << num_bytes;
   if (allocation_attr.no_retry_on_failure) {
     // Return immediately upon the first failure if this is for allocating an
     // optional scratch space.
     bool dump_log_on_failure = VLOG_IS_ON(2);
-    void* result =
-        AllocateRawInternal(unused_alignment, num_bytes, dump_log_on_failure);
+    uint64 freed_by_count = 0;
+    if (allocation_attr.freed_by_func != nullptr) {
+      freed_by_count = allocation_attr.freed_by_func();
+    }
+    void* result = AllocateRawInternal(unused_alignment, num_bytes,
+                                       dump_log_on_failure, freed_by_count);
     if (result == nullptr) {
       static std::atomic<int32> log_counter{0};
       int32 counter_value = log_counter.load(std::memory_order_relaxed);
@@ -218,7 +237,8 @@ void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
     }
     return result;
   } else {
-    return AllocateRaw(unused_alignment, num_bytes);
+    return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
+                                        allocation_attr);
   }
 }
 
@@ -233,7 +253,8 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) {
 
 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
                                         size_t num_bytes,
-                                        bool dump_log_on_failure) {
+                                        bool dump_log_on_failure,
+                                        uint64 freed_before) {
   if (num_bytes == 0) {
     LOG(ERROR) << "tried to allocate 0 bytes";
     return nullptr;
@@ -247,14 +268,14 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   BinNum bin_num = BinNumForSize(rounded_bytes);
 
   mutex_lock l(lock_);
-  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
   if (ptr != nullptr) {
     return ptr;
   }
 
   // Try to extend
   if (Extend(unused_alignment, rounded_bytes)) {
-    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -274,7 +295,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
 }
 
 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
-                                 size_t num_bytes) {
+                                 size_t num_bytes, uint64 freed_before) {
   // First identify the first bin that could satisfy rounded_bytes.
   for (; bin_num < kNumBins; bin_num++) {
     // Start searching from the first bin for the smallest chunk that fits
@@ -285,6 +306,9 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
       const BFCAllocator::ChunkHandle h = (*citer);
       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
       DCHECK(!chunk->in_use());
+      if (freed_before > 0 && freed_before < chunk->freed_count) {
+        continue;
+      }
       if (chunk->size >= rounded_bytes) {
         // We found an existing chunk that fits us that wasn't in use, so remove
         // it from the free bin structure prior to using.
@@ -311,10 +335,10 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
         // Update stats.
         ++stats_.num_allocs;
         stats_.bytes_in_use += chunk->size;
-        stats_.max_bytes_in_use =
-            std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-        stats_.max_alloc_size =
-            std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
+        stats_.peak_bytes_in_use =
+            std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+        stats_.largest_alloc_size =
+            std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
 
         VLOG(4) << "Returning: " << chunk->ptr;
         if (VLOG_IS_ON(4)) {
@@ -347,6 +371,9 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
   // The new chunk is not in use.
   new_chunk->allocation_id = -1;
 
+  // It inherits the freed time.
+  new_chunk->freed_count = c->freed_count;
+
   // Maintain the pointers.
   // c <-> c_neighbor becomes
   // c <-> new_chunk <-> c_neighbor
@@ -364,6 +391,7 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
 }
 
 void BFCAllocator::DeallocateRaw(void* ptr) {
+  VLOG(1) << "DeallocateRaw " << Name() << " " << RequestedSize(ptr);
   DeallocateRawInternal(ptr);
   retry_helper_.NotifyDealloc();
 }
@@ -415,6 +443,9 @@ void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
   // Set the new size
   c1->size += c2->size;
 
+  // Pick latest free time.
+  c1->freed_count = std::max(c1->freed_count, c2->freed_count);
+
   DeleteChunk(h2);
 }
 
@@ -460,6 +491,11 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   // Mark the chunk as no longer in use.
   c->allocation_id = -1;
 
+  // Optionally record the free time.
+  if (timing_counter_) {
+    c->freed_count = timing_counter_->next();
+  }
+
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
 
@@ -630,7 +666,10 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
         in_use_by_size[c->size]++;
       }
       LOG(INFO) << (c->in_use() ? "Chunk" : "Free ") << " at " << c->ptr
-                << " of size " << c->size;
+                << " of size " << c->size
+                << (timing_counter_
+                        ? strings::StrCat(" freed_count ", c->freed_count)
+                        : "");
       h = c->next;
     }
   }
@@ -647,16 +686,16 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
   LOG(INFO) << "Stats: \n" << stats_.DebugString();
 }
 
-void BFCAllocator::GetStats(AllocatorStats* stats) {
+absl::optional<AllocatorStats> BFCAllocator::GetStats() {
   mutex_lock l(lock_);
-  *stats = stats_;
+  return stats_;
 }
 
 void BFCAllocator::ClearStats() {
   mutex_lock l(lock_);
   stats_.num_allocs = 0;
-  stats_.max_bytes_in_use = stats_.bytes_in_use;
-  stats_.max_alloc_size = 0;
+  stats_.peak_bytes_in_use = stats_.bytes_in_use;
+  stats_.largest_alloc_size = 0;
 }
 
 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 2d74bf2b286a1fac4d3f9b3921fef7a5b838fce8..b0fd0d8667da96d3590965ae3e05675968389089 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -50,9 +51,14 @@ class BFCAllocator : public Allocator {
   ~BFCAllocator() override;
 
   string Name() override { return name_; }
-  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
+  }
+
   void* AllocateRaw(size_t alignment, size_t num_bytes,
                     const AllocationAttributes& allocation_attr) override;
+
   void DeallocateRaw(void* ptr) override;
 
   bool TracksAllocationSizes() override;
@@ -63,15 +69,23 @@ class BFCAllocator : public Allocator {
 
   int64 AllocationId(const void* ptr) override;
 
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
 
   void ClearStats() override;
 
+  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }
+
  private:
   struct Bin;
 
   void* AllocateRawInternal(size_t alignment, size_t num_bytes,
-                            bool dump_log_on_failure);
+                            bool dump_log_on_failure,
+                            uint64 freed_before_count);
+
+  void* AllocateRawInternalWithRetry(
+      size_t alignment, size_t num_bytes,
+      const AllocationAttributes& allocation_attr);
+
   void DeallocateRawInternal(void* ptr);
 
   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
@@ -126,6 +140,9 @@ class BFCAllocator : public Allocator {
     // What bin are we in?
     BinNum bin_num = kInvalidBinNum;
 
+    // Optional count when this chunk was most recently made free.
+    uint64 freed_count = 0;
+
     bool in_use() const { return allocation_id != -1; }
 
     string DebugString(BFCAllocator* a,
@@ -314,8 +331,8 @@ class BFCAllocator : public Allocator {
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
-  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
-      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
+                     uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Splits the chunk specified by 'h' into two chunks, one at least
   // of size 'num_bytes'.
@@ -420,6 +437,7 @@ class BFCAllocator : public Allocator {
 
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
+  SharedCounter* timing_counter_ = nullptr;
 
   // Structures mutable after construction
   mutable mutex lock_;
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
index 0e798235bf0649428409a2fa72ac3067736c347a..7621787dec76850e346f65b3883cb2b5073c0077 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous_test.cc
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -109,7 +109,7 @@ TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
 TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
   bool prod_callback_called = false;
   br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_callback_called](const Status& s) {
+                  [&prod_callback_called](const Status& s) {
                     prod_callback_called = true;
                   });
   Status bad_status;
@@ -129,11 +129,11 @@ TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
 
 TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
   Status cons_status;
-  br_->ConsumeBuf(
-      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
-        cons_status = s;
-        EXPECT_EQ(h, nullptr);
-      });
+  br_->ConsumeBuf("key0",
+                  [&cons_status](const Status& s, BufRendezvous::Hook* h) {
+                    cons_status = s;
+                    EXPECT_EQ(h, nullptr);
+                  });
   EXPECT_TRUE(cons_status.ok());
   br_.reset();
   EXPECT_FALSE(cons_status.ok());
@@ -146,13 +146,13 @@ TEST_F(BufRendezvousTest, AbortNonEmpty) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
@@ -175,13 +175,13 @@ TEST_F(BufRendezvousTest, UseAfterAbort) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index f3d86aa633938042b862613162d1c2a94b0fe35a..3eef5ed0a0c5984474c4d75ae417c030d269290d 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -44,7 +44,7 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     std::unique_ptr<DeviceResolverInterface> drl(
         new DeviceResolverLocal(device_mgr_.get()));
     std::unique_ptr<ParamResolverInterface> prl(
-        new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+        new CollectiveParamResolverLocal(cp, device_mgr_.get(), drl.get(),
                                          task_name));
     cme_.reset(new CollectiveExecutorMgr(cp, device_mgr_.get(), std::move(drl),
                                          std::move(prl)));
@@ -73,11 +73,11 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
   Notification ss_note;
   Status ss_status;
-  cme_->RefreshStepIdSequenceAsync(
-      123, [this, &ss_status, &ss_note](const Status& s) {
-        ss_status = s;
-        ss_note.Notify();
-      });
+  cme_->RefreshStepIdSequenceAsync(123,
+                                   [&ss_status, &ss_note](const Status& s) {
+                                     ss_status = s;
+                                     ss_note.Notify();
+                                   });
   ss_note.WaitForNotification();
   EXPECT_FALSE(ss_status.ok());
   EXPECT_EQ(ss_status.error_message(),
@@ -87,7 +87,7 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   GetStepSequenceRequest* req = nullptr;
   GetStepSequenceResponse* resp = nullptr;
   cme_->GetStepSequenceAsync(req, resp,
-                             [this, &gs_status, &gs_note](const Status& s) {
+                             [&gs_status, &gs_note](const Status& s) {
                                gs_status = s;
                                gs_note.Notify();
                              });
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index a8e3f4c881afc9c37ce4b5196c32ec591be5506d..a76708385be59a2c6cec556d6ab1124b9c2bf541 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -37,9 +38,12 @@ void CollectiveParamResolverLocal::InstanceRec::WaitForOutMu(mutex_lock& lock) {
 }
 
 CollectiveParamResolverLocal::CollectiveParamResolverLocal(
-    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
-    const string& task_name)
-    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver, const string& task_name)
+    : nccl_(config.experimental().collective_nccl()),
+      dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      task_name_(task_name) {}
 
 void CollectiveParamResolverLocal::CompleteGroupAsync(
     const CompleteGroupRequest* request, CompleteGroupResponse* response,
@@ -140,7 +144,6 @@ void CollectiveParamResolverLocal::CompleteGroupLocal(
 }
 
 namespace {
-
 struct DevRec {
   string task;
   string device;
@@ -316,29 +319,28 @@ GlobalDeviceMap EstablishGlobalRank(
 // cp->same_num_devices_per_task.  Requires cp->instance.task_names
 // be sorted.
 void SetDevPerTask(CollectiveParams* cp) {
-  cp->instance.same_num_devices_per_task = false;
-  if (cp->instance.task_names.empty()) return;
-  int dev_per_task = -1;
-  int count = 0;
+  cp->instance.num_devices_per_task.clear();
   const string* last_task_name = &cp->instance.task_names[0];
+  int count = 0;
   for (const string& task_name : cp->instance.task_names) {
-    if (task_name != *last_task_name) {
-      CHECK_GT(count, 0);
-      if (dev_per_task < 0) {
-        dev_per_task = count;
-      } else {
-        CHECK_GT(dev_per_task, 0);
-        if (count != dev_per_task) return;
-      }
+    if (task_name == *last_task_name) {
+      ++count;
+    } else {
+      cp->instance.num_devices_per_task[*last_task_name] = count;
       count = 1;
       last_task_name = &task_name;
-    } else {
-      ++count;
     }
   }
-  CHECK_GT(count, 0);
-  if ((dev_per_task > 0) && (count != dev_per_task)) {
-    return;
+  cp->instance.num_devices_per_task[*last_task_name] = count;
+
+  cp->instance.same_num_devices_per_task = false;
+  int dev_per_task = -1;
+  for (const auto& task_dev : cp->instance.num_devices_per_task) {
+    if (dev_per_task == -1) {
+      dev_per_task = task_dev.second;
+    } else if (dev_per_task != task_dev.second) {
+      return;
+    }
   }
   cp->instance.same_num_devices_per_task = true;
   CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
@@ -358,7 +360,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
   for (int i = 0; i < perm.size(); ++i) {
     perm[i] = i;
   }
-  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+  std::sort(perm.begin(), perm.end(), [cp](int a, int b) {
     return cp->instance.device_names[a] < cp->instance.device_names[b];
   });
   std::vector<string> new_devs;
@@ -398,7 +400,6 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
 void CollectiveParamResolverLocal::InitInstanceSharedParams(
     const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
     const StatusCallback& done) {
-  VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
     mutex_lock gl(gr->mu);
@@ -412,8 +413,8 @@ void CollectiveParamResolverLocal::InitInstanceSharedParams(
   }
   ir->shared.default_rank = -1;
 
-  // Sort devce_names lexicographcally, keeping task_names in
-  // corresponding order.
+  // Sort device_names lexicographically, keeping task_names in corresponding
+  // order.  Also set number of devices per task.
   SortDevicesAndTasks(&ir->shared);
 
   // Get Locality data for all devices.
@@ -583,7 +584,7 @@ void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
 void CollectiveParamResolverLocal::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
-  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+  VLOG(1) << "CompleteParams local " << device << " for " << cp << ": "
           << cp->ToString();
   CompleteGroupLocal(
       device, cp,
@@ -605,6 +606,27 @@ void CollectiveParamResolverLocal::CompleteInstanceAsync(
                        "intended only for non-distributed deployment."));
 }
 
+// TODO(b/111897089): we need a better way to pick the collective
+// implementation.  The ideal way would depend upon the topology and link
+// strength before picking a particular implementation.
+void CollectiveParamResolverLocal::AssignCollectiveType(CollectiveParams* cp) {
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast";
+  } else if (cp->instance.type == REDUCTION_COLLECTIVE) {
+    if (nccl_) {
+      cp->instance.impl_details.collective_name = "NcclReduce";
+    } else {
+      cp->instance.impl_details.collective_name = "RingReduce";
+    }
+  } else if (cp->instance.type == GATHER_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "RingGather";
+  } else {
+    cp->instance.impl_details.collective_name = "undef";
+  }
+  VLOG(1) << "AssignCollectiveType "
+          << cp->instance.impl_details.collective_name;
+}
+
 void CollectiveParamResolverLocal::CompleteInstanceLocal(
     const string& device, const GroupRec* gr, CollectiveParams* cp,
     bool is_source, const StatusCallback& done) {
@@ -641,48 +663,57 @@ void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
     // custom operator= does a deep copy.
     cp->instance = ir->shared.instance;
   }
-  // Populate the fields common across task, also default_rank.
+  // Populate the fields common across task.
+  AssignCollectiveType(cp);
   SetDefaultRank(device, cp);
   CompleteTaskIsLocal(task_name_, cp);
-  // TODO(b/113171733): we need a better way to pick the collective
-  // implementation.  The ideal way would depend upon the topology and link
-  // strength before picking a particular implementation.
-  cp->instance.impl_details.collective_name =
-      (cp->instance.type == BROADCAST_COLLECTIVE) ? "HierarchicalTreeBroadcast"
-                                                  : "RingReduce";
+
   CollectiveImplementationInterface* col_impl;
-  Status lookup_status = CollectiveRegistry::LookupParamResolverInstance(
+  Status status = CollectiveRegistry::LookupParamResolverInstance(
       cp->instance.impl_details.collective_name, &col_impl);
-  if (!lookup_status.ok()) {
-    done(lookup_status);
+  if (status.ok()) {
+    status = col_impl->InitializeInstanceBeforeGroupDiscovery(cp);
+  }
+  if (!status.ok()) {
+    done(status);
     return;
   }
-  // If broadcast, may need to wait for source discovery.
-  if (cp->instance.type == BROADCAST_COLLECTIVE) {
-    CompleteInstanceSource(ir, cp, is_source,
-                           [col_impl, ir, device, cp, done](InstanceRec* irec) {
-                             CHECK_EQ(ir, irec);
-                             Status s;
-                             {
-                               mutex_lock l(irec->out_mu);
-                               irec->WaitForOutMu(l);
-                               s = irec->status;
-                               cp->source_rank = irec->source_rank;
-                             }
-                             if (s.ok()) {
-                               s = col_impl->InitializeCollectiveParams(cp);
-                             }
-                             done(s);
-                           });
+
+  //  We may need to wait for the group if:
+  //  * this is a broadcast, for source discovery;
+  //  * we are using NCCL with more than 1 worker, for the communicator key from
+  //    rank 0.
+  bool broadcast = cp->instance.type == BROADCAST_COLLECTIVE;
+  bool nccl = cp->instance.type == REDUCTION_COLLECTIVE &&
+              cp->instance.impl_details.collective_name == "NcclReduce" &&
+              cp->group.num_tasks > 1;
+  if (broadcast || nccl) {
+    WaitForGroup(ir, cp, is_source, broadcast, nccl,
+                 [col_impl, ir, device, cp, done](InstanceRec* irec) {
+                   Status s;
+                   if (ir != irec) {
+                     s = errors::Internal("Expected ir ", ir, " and irec ",
+                                          irec, " to be equal");
+                   } else {
+                     mutex_lock l(irec->out_mu);
+                     irec->WaitForOutMu(l);
+                     s = irec->status;
+                     cp->source_rank = irec->source_rank;
+                     cp->instance.communicator_key = irec->communicator_key;
+                   }
+                   if (s.ok()) {
+                     s = col_impl->InitializeCollectiveParams(cp);
+                   }
+                   done(s);
+                 });
   } else {
     done(col_impl->InitializeCollectiveParams(cp));
   }
 }
 
-void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
-                                                          CollectiveParams* cp,
-                                                          bool is_source,
-                                                          const IRConsumer& f) {
+void CollectiveParamResolverLocal::WaitForGroup(
+    InstanceRec* ir, CollectiveParams* cp, bool is_source, bool init_source,
+    bool init_nccl, const IRConsumer& f) {
   std::vector<IRConsumer> ready_waiters;
   {
     mutex_lock l(ir->out_mu);
@@ -692,7 +723,8 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
     if (!ir->known[cp->default_rank]) {
       ir->known[cp->default_rank] = true;
       ++ir->known_count;
-      if (is_source) {
+      if (init_source && is_source) {
+        // Initialize source rank.
         if (ir->source_rank >= 0) {
           ir->status = errors::Internal("Instance ", cp->instance.instance_key,
                                         " already has source ", ir->source_rank,
@@ -702,13 +734,26 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
           ir->source_rank = cp->default_rank;
         }
       }
+      if (init_nccl && cp->default_rank == 0) {
+        // Initialize communicator key.
+        if (!ir->communicator_key.empty()) {
+          ir->status =
+              errors::Internal("Instance ", cp->instance.instance_key,
+                               " already has communicator_key ",
+                               str_util::CEscape(ir->communicator_key),
+                               ", received second claim from device ",
+                               cp->instance.device_names[cp->default_rank]);
+        } else {
+          ir->communicator_key = cp->instance.communicator_key;
+        }
+      }
     }
     if (ir->known_count < ir->shared.group.group_size) {
       ir->known_waiters.push_back(f);
       return;
     }
     CHECK_EQ(ir->known_count, ir->shared.group.group_size);
-    if (ir->source_rank < 0) {
+    if (init_source && ir->source_rank < 0) {
       // NOTE(ayushd): changing the error message below would also require
       // updating CompleteParamsBroadcastForgotSend test in
       // CollectiveParamResolverLocalTest.
@@ -718,6 +763,13 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
                            "could mean that there were group_size=",
                            ir->known_count, " BcastRecvs but no BcastSend.");
     }
+    if (init_nccl && ir->communicator_key.empty()) {
+      ir->status = errors::Internal(
+          "Instance ", cp->instance.instance_key, " device ",
+          cp->instance.device_names[cp->default_rank],
+          " did not find rank 0 for setting communicator key.  This is an "
+          "internal error in collective param resolution");
+    }
     if (!ir->known_waiters.empty()) {
       ready_waiters = std::move(ir->known_waiters);
     }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 365bddc787a7ba3d97f2df29b4ebd2a3c7118ef7..08e2f338f3c642cdfa6cd2df824cb1177c4b4911 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class CompleteGroupRequest;
@@ -36,7 +37,8 @@ class DeviceMgr;
 // group leader for param resolution in a multi-task context.
 class CollectiveParamResolverLocal : public ParamResolverInterface {
  public:
-  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+  CollectiveParamResolverLocal(const ConfigProto& config,
+                               const DeviceMgr* dev_mgr,
                                DeviceResolverInterface* dev_resolver,
                                const string& task_name);
 
@@ -130,8 +132,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
     Status status GUARDED_BY(out_mu);
 
     // These fields are used to count the instances that have called
-    // in and become known while resolving broadcast source identity.
+    // in and become known while resolving broadcast source identity and
+    // communicator key.
     int source_rank GUARDED_BY(out_mu);
+    string communicator_key GUARDED_BY(out_mu);
     int known_count GUARDED_BY(out_mu);
     std::vector<bool> known GUARDED_BY(out_mu);
     std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
@@ -197,10 +201,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
                                            const StatusCallback& done)
       LOCKS_EXCLUDED(ir->out_mu);
 
-  // Complete source data for a broadcast instance.
+  // Complete source data and/or nccl communicator key.
   // Precondition: *cp has complete group data and default_rank.
-  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
-                              bool is_source, const IRConsumer& f)
+  void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, bool is_source,
+                    bool init_source, bool init_nccl, const IRConsumer& f)
       LOCKS_EXCLUDED(ir->out_mu);
 
   // If cp.device_names contains only devices local to this process
@@ -216,10 +220,15 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // current ordering of cp->instance.device_names.
   void SetDefaultRank(const string& device, CollectiveParams* cp);
 
+  // Sets cp->instance.type based on collective op type, and attempts to assign
+  // best implementation.
+  void AssignCollectiveType(CollectiveParams* cp);
+
   // Helper to grab status under lock, invoke callback out of lock.
   void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
       LOCKS_EXCLUDED(irec->out_mu);
 
+  const bool nccl_;
   const DeviceMgr* dev_mgr_;
   DeviceResolverInterface* dev_resolver_;  // Not owned.
   string task_name_;
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 94d889c40dff89204ccfc43478f8732815a4ead4..70eb9f8081aedfde33e3eb67b478c72ca2dee72f 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -41,8 +41,8 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                task_name));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), task_name));
   }
 
   void RunCompleteDefaultRanking(
@@ -175,7 +175,7 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
     Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
       prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
                                 nullptr /*CancellationManager*/,
-                                [this, &statuses, &note, i](const Status& s) {
+                                [&statuses, &note, i](const Status& s) {
                                   statuses[i] = s;
                                   note[i].Notify();
                                 });
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
index 4263f3a4add524bf59e7c08cfb5d927ac9e23e06..2e9d8cd394e36ed6dbbd5cb6e49687b633bf9186 100644
--- a/tensorflow/core/common_runtime/collective_rma_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -46,8 +46,8 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                kTaskName));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), kTaskName));
     rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
                                                kStepId));
   }
@@ -70,7 +70,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
                      "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -85,7 +85,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
   rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
                    cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
@@ -113,7 +113,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
                      "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -130,7 +130,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
   rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
                    cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
diff --git a/tensorflow/core/common_runtime/collective_util.cc b/tensorflow/core/common_runtime/collective_util.cc
index 195521a0784fd43f7bcd1b98065c7fcb641d52b4..bee4a13d1826f894b6d81539d7439a37ed1a8cfa 100644
--- a/tensorflow/core/common_runtime/collective_util.cc
+++ b/tensorflow/core/common_runtime/collective_util.cc
@@ -79,5 +79,36 @@ string SubdivPermDebugString(const CollectiveParams& col_params) {
   return buf;
 }
 
+SubContext::SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+                       OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_params_.forward_from_array = &forward_from_;
+  sub_ctx_.reset(new OpKernelContext(&sub_params_, 1));
+}
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(ayushd, tucker): Is it possible to cache and reuse these objects?
+  // They're mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(op_ctx, params, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_.get());
+  return sub_ctx->sub_ctx_->status();
+}
+
 }  // namespace collective_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_util.h b/tensorflow/core/common_runtime/collective_util.h
index ebb5731becadec3b88bea86641887c31b63ae3a5..01fb8b8c81cd2f4dc390c2b6467d7c54c7753bf0 100644
--- a/tensorflow/core/common_runtime/collective_util.h
+++ b/tensorflow/core/common_runtime/collective_util.h
@@ -32,6 +32,27 @@ Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr,
                                    DeviceLocality* device_locality);
 string SubdivPermDebugString(const CollectiveParams& col_params);
 
+// Used for executing a sub-operation, e.g. a merge_op instance, with
+// an OpKernelContext based on the one passed into this Op.
+class SubContext {
+ public:
+  OpKernelContext::Params sub_params_;
+  gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+  gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+  gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+  // Used only for Binary and Unary Ops for which we require
+  // the calculation to be in-place on the first input.
+  int forward_from_ = 0;
+  std::unique_ptr<OpKernelContext> sub_ctx_;
+  SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+             OpKernel* op, Tensor* output, Tensor* input);
+  ~SubContext() = default;
+};
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input);
+
 }  // namespace collective_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/colocation_graph.cc b/tensorflow/core/common_runtime/colocation_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fbfd0c4f76e2830d45498e59e0bd32b6ddfe41f4
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.cc
@@ -0,0 +1,996 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/colocation_graph.h"
+
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+namespace {
+
+// We hoist the conversion from C-style string literal to StringPiece here,
+// so that we can avoid the many repeated calls to strlen().
+const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
+
+// Returns a list of devices having type in supported_device_types.  The
+// returned list is sorted by preferred type (higher numeric type is preferred).
+std::vector<Device*> FilterSupportedDevices(
+    const std::vector<Device*>& devices,
+    const PrioritizedDeviceTypeVector& supported_device_types,
+    const Device* default_device) {
+  Device* filtered_default_device = nullptr;
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
+    for (Device* device : devices) {
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
+        if (device == default_device) {
+          filtered_default_device = device;
+        } else {
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
+        }
+      }
+    }
+  }
+
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
+    // First sort by prioritized device type (higher is preferred) and
+    // then by device name (lexicographically).
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
+  };
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
+  if (filtered_default_device != nullptr) {
+    filtered_devices.emplace_back(filtered_default_device);
+  }
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
+  return filtered_devices;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DevicesToString(const std::vector<Device*> devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (Device* d : devices) {
+    v.push_back(d->name());
+  }
+  return v;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DeviceTypeAndPriorityToString(
+    const PrioritizedDeviceTypeVector& devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (const std::pair<DeviceType, int32>& device_and_type : devices) {
+    v.push_back(DeviceTypeString(device_and_type.first));
+  }
+  return v;
+}
+
+// While Placer can override requested device on ops processing
+// resources, i.e. node that take (and potentially return) a resource,
+// it must not override requested device on ops generating a resource,
+// e.g. VarHandleOp, _Arg. Such ops are currently no-input, single resource/ref
+// output nodes.
+bool IsResourceGeneratorNode(const Node& node) {
+  return node.num_inputs() == 0 && node.num_outputs() == 1 &&
+         (IsRefType(node.output_type(0)) || node.output_type(0) == DT_RESOURCE);
+}
+
+bool IsExemptFromResourceInputColocation(const Node* node) {
+  // Note: Partitioned function calls, which place and partition their
+  // function bodies, are exempt from this check: they forward resource and
+  // ref inputs to operations that are appropriately placed, instead of
+  // dereferencing them.
+  const string& op_type = node->op_def().name();
+  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+}
+
+bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+  for (const auto& prioritized_device_type : device_types) {
+    if (prioritized_device_type.second != 0) return true;
+  }
+  return false;
+}
+
+bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                       const PrioritizedDeviceTypeVector& b_types) {
+  if (a_types.size() != b_types.size()) {
+    return false;
+  }
+  for (int i = 0; i < a_types.size(); ++i) {
+    if (a_types[i].first != b_types[i].first) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status Member::SetParentAndSupportedDevices(
+    const Node& node, const std::vector<DeviceType>& types) {
+  int id = node.id();
+  if (id < 0) {
+    return errors::Internal("Placer should not be creating a Member for node: ",
+                            node.DebugString());
+  }
+  parent_ = id;
+  return SupportedDeviceTypesForNode(types, node.def(),
+                                     &supported_device_types_);
+}
+
+Status Member::SetAssignedDeviceName(const string& device_name) {
+  if (DeviceNameUtils::HasSomeDetails(requested_device_name_)) {
+    return errors::Internal(
+        "Setting assigned device name when there is a requested device set "
+        "is unsupported");
+  }
+  if (!DeviceNameUtils::ParseFullName(device_name, &assigned_device_name_)) {
+    return errors::Internal("Malformed assigned device '", device_name, "'");
+  }
+  // Set requested device to assigned_device to maintain the invariant that
+  // requested is a specialization of assigned.
+  requested_device_name_ = assigned_device_name_;
+  return Status::OK();
+}
+
+Status Member::SetRequestedDeviceName(const Node& node) {
+  if (!DeviceNameUtils::ParseFullName(node.requested_device(),
+                                      &requested_device_name_)) {
+    return errors::InvalidArgument("Malformed device specification '",
+                                   node.requested_device(),
+                                   "' in node: ", node.DebugString());
+  }
+  if (DeviceNameUtils::HasSomeDetails(assigned_device_name_)) {
+    return errors::Internal(
+        "Setting requested device name when there is an assigned device set "
+        "is unsupported");
+  }
+  return Status::OK();
+}
+
+Status Member::EnsureCompatibilityAcrossResourceEdge(
+    const Node& src, const Member& src_root,
+    const Node& dst, /*dst_root is this*/
+    bool log_device_placement) {
+  if (!DeviceNameUtils::AreCompatibleDevNames(src_root.assigned_device_name_,
+                                              assigned_device_name_)) {
+    return errors::InvalidArgument(
+        "Cannot place the graph because a reference or resource edge "
+        "connects colocation groups with incompatible assigned devices: ",
+        DeviceNameUtils::ParsedNameToString(src_root.assigned_device_name_),
+        " vs ", DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+        ". The edge src node is ", src.name(), " , and the dst node is ",
+        dst.name());
+  }
+
+  if (DeviceNameUtils::AreCompatibleDevNames(src_root.requested_device_name_,
+                                             requested_device_name_)) {
+    return Status::OK();
+  }
+
+  // If we are here, assigned devices are compatible but requested ones are
+  // not. We will be overriding the requested device for destination node, but
+  // need to preserve the invariant that it will be a specialization of
+  // the assigned device.
+  if (log_device_placement) {
+    LOG(INFO) << "Ignoring device specification "
+              << DeviceNameUtils::ParsedNameToString(requested_device_name_)
+              << " for node '" << dst.name()
+              << "' because the input edge from '" << src.name()
+              << "' is a reference connection and already has a device "
+                 "field set to "
+              << DeviceNameUtils::ParsedNameToString(
+                     src_root.requested_device_name_);
+  }
+  requested_device_name_ = src_root.requested_device_name_;
+  DeviceNameUtils::EnsureSpecification(&requested_device_name_,
+                                       assigned_device_name_);
+  return Status::OK();
+}
+
+void Member::Merge(std::vector<Member>* tree, int x_root, int y_root,
+                   Member** new_root, Member** old_root, bool dry_run) {
+  Member& x_root_member = (*tree)[x_root];
+  Member& y_root_member = (*tree)[y_root];
+
+  // Merge the sets by setting the parent pointer of the smaller tree's root
+  // node to point to the root of the larger tree. Together with path
+  // compression in ColocationGraph::FindRoot, this ensures that we do not
+  // experience pathological performance on graphs such as chains.
+  int new_root_id, old_root_id;
+  if (x_root_member.rank_ < y_root_member.rank_) {
+    // The tree rooted at x_root is shallower, so connect it to
+    // y_root. The rank of y_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      x_root_member.parent_ = y_root;
+    }
+    new_root_id = y_root;
+    old_root_id = x_root;
+  } else if (x_root_member.rank_ > y_root_member.rank_) {
+    // The tree rooted at y_root is shallower, so connect it to
+    // x_root. The rank of x_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      y_root_member.parent_ = x_root;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  } else {
+    if (!dry_run) {
+      // Both trees have the same rank, so break the tie by choosing
+      // x_root as the new root.
+      y_root_member.parent_ = x_root;
+      // Increment the rank of the tree rooted at x_root, because it
+      // is now strictly deeper than before.
+      ++x_root_member.rank_;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  }
+
+  *new_root = &(*tree)[new_root_id];
+  *old_root = &(*tree)[old_root_id];
+}
+
+// tree is non-const because we can change some `parent` pointers in some
+// members for more efficient future lookups. The vector itself is not
+// changed.
+int Member::FindRoot(std::vector<Member>* tree, int node_id) {
+  Member& member = (*tree)[node_id];
+  if (member.parent_ == node_id) {
+    // member.parent is the root of this disjoint tree.  Do nothing.
+  } else {
+    member.parent_ = FindRoot(tree, member.parent_);
+  }
+  // Now it is guaranteed that member.parent is the root of this disjoint
+  // tree.
+  return member.parent_;
+}
+
+Status Member::MergeDeviceNames(const Member& other,
+                                bool allow_soft_placement) {
+  // Assuming the "requested is a specialization of assigned" invariant holds
+  // for this and `other`, it will hold after the two merges below.
+  DeviceNameUtils::ParsedName assigned_device_name_copy = assigned_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &assigned_device_name_copy, other.assigned_device_name_));
+
+  DeviceNameUtils::ParsedName requested_device_name_copy =
+      requested_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &requested_device_name_copy, other.requested_device_name_,
+      allow_soft_placement));
+
+  // We checked for all errors, now change the devices.
+  assigned_device_name_ = assigned_device_name_copy;
+  requested_device_name_ = requested_device_name_copy;
+  return Status::OK();
+}
+
+// Updates this to contain the intersection of the device types in
+// this and "other".
+bool Member::MergeSupportedDevices(const Member& other) {
+  // Generate intersection with priorities.
+  // Each vector contains the same device types but with different priorities.
+  // The priorities are taken from the corresponding source vector.
+  PrioritizedDeviceTypeVector target_intersection;
+  PrioritizedDeviceTypeVector other_intersection;
+  for (const auto& prioritized_device_type : supported_device_types_) {
+    bool found = false;
+    for (const auto& other_prioritized_device_type :
+         other.supported_device_types_) {
+      if (prioritized_device_type.first ==
+          other_prioritized_device_type.first) {
+        found = true;
+        other_intersection.push_back(other_prioritized_device_type);
+        break;
+      }
+    }
+    if (found) {
+      target_intersection.push_back(prioritized_device_type);
+    }
+  }
+
+  // Sort the devices by priority order.
+  auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                        const std::pair<DeviceType, int32>& b) {
+    // First look at set priorities.
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+    // Then fallback to default priorities.
+    auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+    auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    // Finally just look at the Device type strings.
+    return a.first.type_string() < b.first.type_string();
+  };
+
+  std::sort(target_intersection.begin(), target_intersection.end(),
+            device_sort);
+  std::sort(other_intersection.begin(), other_intersection.end(), device_sort);
+
+  PrioritizedDeviceTypeVector result;
+
+  bool is_target_prioritized = HasPriorities(target_intersection);
+  bool is_other_prioritized = HasPriorities(other_intersection);
+  if (!is_target_prioritized && !is_other_prioritized) {
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    result = target_intersection;
+  } else if (is_target_prioritized && !is_other_prioritized) {
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    result = target_intersection;
+  } else if (!is_target_prioritized && is_other_prioritized) {
+    result = other_intersection;
+  } else {
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (ArePrioritiesSame(target_intersection, other_intersection)) {
+      result = target_intersection;
+    } else {
+      for (const auto& prioritized_device : target_intersection) {
+        result.push_back(std::make_pair(prioritized_device.first, 0));
+      }
+      std::sort(result.begin(), result.end(), device_sort);
+    }
+  }
+
+  if (result.empty()) {
+    return false;
+  }
+  supported_device_types_ = result;
+  return true;
+}
+
+Status Member::AssignDevice(const Node& node, bool allow_soft_placement) {
+  if (node.assigned_device_name_index() == assigned_device_name_index_) {
+    return Status::OK();
+  }
+
+  DeviceNameUtils::ParsedName parsed;
+  DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
+  Status s = DeviceNameUtils::MergeDevNames(&assigned_device_name_, parsed,
+                                            allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's assigned device name: ",
+        DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+        " node's assigned device name \"", node.assigned_device_name(),
+        ". Error: ", s.error_message());
+  }
+  s = DeviceNameUtils::MergeDevNames(&requested_device_name_, parsed,
+                                     allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's requested device name: \"",
+        DeviceNameUtils::ParsedNameToString(requested_device_name_),
+        "\", node's assigned device name \"", node.assigned_device_name(),
+        "\". Error: ", s.error_message());
+  }
+
+  assigned_device_name_index_ = node.assigned_device_name_index();
+  // Clear cached possible_devices, if any.
+  possible_devices_.clear();
+  return Status::OK();
+}
+string Member::DebugString() {
+  return absl::StrCat(
+      "Member(assigned_device_name_index_=", assigned_device_name_index_,
+      " requested_device_name_=",
+      DeviceNameUtils::ParsedNameToString(requested_device_name_),
+      " assigned_device_name_=",
+      DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+      " supported_device_types_=[",
+      absl::StrJoin(DeviceTypeAndPriorityToString(supported_device_types_),
+                    ", "),
+      "] possible_devices_=[",
+      absl::StrJoin(DevicesToString(possible_devices_), ", "), "]");
+}
+ColocationGraph::ColocationGraph(const Graph* graph,
+                                 const DeviceSet* device_set,
+                                 const Device* default_device,
+                                 bool allow_soft_placement,
+                                 bool log_device_placement)
+    : graph_(graph),
+      device_set_(device_set),
+      device_types_(device_set->PrioritizedDeviceTypeList()),
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {
+  members_.resize(graph->num_node_ids());
+}
+
+// Adds each node of the Graph to this ColocationGraph as a singleton.
+//
+// NOTE: The implementation assumes that the ids of nodes passed to
+// this method are dense and zero-based; the memory used will be linear in
+// the largest node ID.
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateAllNodes() {
+  // This maps from a colocation group identifier to the 'root' of that
+  // colocation group.  Note that the keys in this map are StringPiece; the
+  // actual strings are stored under the NodeDef.  The lifetime of this map
+  // is limited to this ColocateAllNodes() method, and no part of the
+  // NodeDef trees are changed during the lifetime of this method, so using
+  // StringPiece as a key is safe.
+  //
+  // Also, as a further optimization, we remove the "loc:@" prefix from
+  // "class" attribute values, when they are used as keys in this table.
+  // This allows us to use StringPiece values that refer to substrings of
+  // 'string' values stored in NodeDef attribute lists, as well as StringPiece
+  // values that refer to 'string' values from NodeDef::name(), without
+  // performing any string allocations.
+  std::unordered_map<StringPiece, const Node*, StringPieceHasher>
+      colocation_group_root;
+
+  for (const Node* node : graph_->op_nodes()) {
+    // When adding the node, identify whether it is part of a colocation
+    // group.
+
+    // This code is effectively the equivalent of GetNodeAttr() for a string
+    // array, but it avoids all internal allocations (the allocation of the
+    // backing store of the std::vector<string> as well as the copies of the
+    // strings within it).  Instead, we combine the query of the colocation
+    // attribute with the calls to ColocateNodeToGroup.
+    bool found_spec = false;
+    const AttrValue* attr_value =
+        node->attrs().Find(kColocationAttrNameStringPiece);
+    if (attr_value != nullptr && attr_value->has_list()) {
+      for (const string& class_spec : attr_value->list().s()) {
+        StringPiece spec(class_spec);
+        if (str_util::ConsumePrefix(&spec, kColocationGroupPrefixStringPiece)) {
+          found_spec = true;
+          TF_RETURN_IF_ERROR(
+              ColocateNodeToGroup(&colocation_group_root, node, spec));
+        }
+      }
+    }
+
+    // TODO(iga): Even when the node has a spec, we need to colocate the
+    // node to its "name group" because other nodes can still use
+    // "loc:@<this_node_name>" in their colocation specs.
+    if (!found_spec) {
+      // If the node does not specify a colocation group, then use the
+      // name of this node as the colocation group.
+      TF_RETURN_IF_ERROR(
+          ColocateNodeToGroup(&colocation_group_root, node, node->name()));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceOrRefEdge(Node* src, Node* dst) {
+  // Colocate `src` and `dst` to maintain the invariant that nodes
+  // connected by reference edges are colocated.
+  int src_root_id = FindRoot(src->id());
+  int dst_root_id = FindRoot(dst->id());
+  auto& src_root = members_[src_root_id];
+  auto& dst_root = members_[dst_root_id];
+
+  TF_RETURN_IF_ERROR(dst_root.EnsureCompatibilityAcrossResourceEdge(
+      *src, src_root, *dst, log_device_placement_));
+  Status status = ColocateNodes(*src, src_root_id, *dst, dst_root_id);
+  if (!status.ok()) {
+    return AttachDef(
+        errors::InvalidArgument("Nodes were connected by a "
+                                "reference connection (requiring them to "
+                                "be on the same device), but the two nodes "
+                                "were assigned two different devices: ",
+                                status.error_message()),
+        *dst);
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceAndRefEdges() {
+  // Enumerate the constraint edges, and use them to update the disjoint
+  // node set.
+  // If `node` has an input edge with reference type, add an edge from the
+  // source of that edge to `node`.
+  for (const Edge* edge : graph_->edges()) {
+    if (edge->IsControlEdge()) {
+      continue;
+    }
+    Node* src = edge->src();
+    Node* dst = edge->dst();
+    DataType input_type = dst->input_type(edge->dst_input());
+    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
+        !IsExemptFromResourceInputColocation(dst)) {
+      TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
+    }
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::Initialize() {
+  TF_RETURN_IF_ERROR(InitializeMembers());
+  TF_RETURN_IF_ERROR(ColocateResourceAndRefEdges());
+  TF_RETURN_IF_ERROR(ColocateAllNodes());
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateNodeToGroup(
+    std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+        colocation_group_root,
+    const Node* node, StringPiece colocation_group) {
+  const Node*& root_node = (*colocation_group_root)[colocation_group];
+  if (root_node == nullptr) {
+    // This is the first node of the colocation group, so
+    // designate this node as the 'root' of that colocation group.
+    root_node = node;
+  } else {
+    // Try to colocate the node with the root.  If there is an
+    // error, return it.
+    Status s = ColocateNodes(*node, *root_node);
+    if (!s.ok()) {
+      if (!allow_soft_placement_) {
+        return AttachDef(s, *node);
+      }
+      if (log_device_placement_) {
+        LOG(INFO) << "Ignoring request to colocate node '" << node->name()
+                  << "' with nodes in colocation group '" << colocation_group
+                  << "' because soft placement is on and an attempt at doing "
+                     "so resulted in the following error: "
+                  << AttachDef(s, *node).ToString();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Merge the (possibly disjoint) sets containing nodes "x" and
+// "y". Returns OK if the all nodes in the union of these sets can
+// be placed on the same device type.
+//
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateNodes(const Node& x, const Node& y) {
+  int x_root = FindRoot(x.id());
+  int y_root = FindRoot(y.id());
+  return ColocateNodes(x, x_root, y, y_root);
+}
+
+// This overload of ColocateNodes() allows a caller to provide the root node
+// ids for the two nodes. For large graphs, this noticeably reduces the
+// graph load time.
+Status ColocationGraph::ColocateNodes(const Node& x, int x_root, const Node& y,
+                                      int y_root) {
+  if (x_root == y_root) {
+    return Status::OK();
+  }
+
+  Member* new_root_member;
+  Member* old_root_member;
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/true);
+
+  // Merge the partial device specifications, and ensure that they are
+  // compatible. NULL options_ is treated as allowing soft placement.
+  // If there is an error, nothing is modified.
+  // TODO(mrry): Consider enriching the error message by pointing
+  // out which nodes have the explicit partial device
+  // specifications that caused this conflict.
+  Status s = new_root_member->MergeDeviceNames(*old_root_member,
+                                               allow_soft_placement_);
+  if (!s.ok()) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()), ": ",
+        s.error_message());
+  }
+
+  // Ensure that the common root has at least one supported device
+  // type, by computing the intersection of
+  // new_root_member.supported_device_types and
+  // old_root_member.supported_device_types.
+  if (!new_root_member->MergeSupportedDevices(*old_root_member)) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()),
+        " because no device type supports both of those nodes and the "
+        "other nodes colocated with them.",
+        DebugInfo(x_root), DebugInfo(y_root));
+  }
+
+  // All error checks are done, merge the colocation graphs.
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/false);
+  return Status::OK();
+}
+
+// Limits the possible devices of `node`'s colocation group to the device
+// to which `node` is assigned. This makes sure that all nodes in this
+// colocation group will be assigned to the same device. Without this
+// explicit restriction, heuristics can choose a different possible device
+// for other nodes in the group.
+Status ColocationGraph::LimitToAssignedDevice(const Node& node) {
+  if (node.assigned_device_name_index() < 0) {
+    return errors::Internal(
+        "Expected an assigned node as argument to LimitToAssignedDevice but "
+        "got: ",
+        node.DebugString());
+  }
+  int root = FindRoot(node.id());
+  Member& root_member = members_[root];
+  return root_member.AssignDevice(node, allow_soft_placement_);
+}
+
+// For the given node, subject to the constraints previously given
+// to this ColocationGraph, set its assigned_device_name. Returns OK
+// if a satisfying device can be found, otherwise an error.
+//
+// Note: This method returns a pointer to a field within members_.
+// The caller must not use the returned pointer after there is any possibility
+// that the members_[i].possible_devices field has been modified.
+Status ColocationGraph::GetDevicesForNode(
+    Node* node, const std::vector<Device*>** possible_devices) {
+  *possible_devices = nullptr;
+  const int node_root = FindRoot(node->id());
+  if (!members_[node_root].possible_devices().empty()) {
+    *possible_devices = &members_[node_root].possible_devices();
+    return Status::OK();
+  }
+
+  // We have not yet computed the possible devices for the
+  // colocated node set containing 'node', so we do so now using the
+  // constraints on the root node.
+
+  // "devices" will contain the set of feasible placements for the
+  // colocated node set containing 'node'.
+  std::vector<Device*> devices;
+  if (DeviceNameUtils::HasSomeDetails(
+          members_[node_root].requested_device_name())) {
+    // The root node has a (possibly partial) device
+    // specification, so enumerate the physical devices that
+    // conform to it.
+    device_set_->FindMatchingDevices(
+        members_[node_root].requested_device_name(), &devices);
+
+    if (!devices.empty()) {
+      // Filter devices into those that are compatible with the root
+      // node (and its children).
+      devices = FilterSupportedDevices(
+          devices, members_[node_root].supported_device_types(),
+          default_device_);
+    }
+
+    // Perform soft placement if allow_soft_placement_ is set.
+    if (devices.empty() && allow_soft_placement_) {
+      // The soft_device_name is the same as the node's device name
+      // without specifying the device type or ID.
+      DeviceNameUtils::ParsedName soft_device_name =
+          members_[node_root].requested_device_name();
+      soft_device_name.type.clear();
+      soft_device_name.has_type = false;
+      soft_device_name.has_id = false;
+      device_set_->FindMatchingDevices(soft_device_name, &devices);
+      if (!devices.empty()) {
+        devices = FilterSupportedDevices(
+            devices, members_[node_root].supported_device_types(),
+            default_device_);
+      }
+    }
+
+    if (devices.empty()) {
+      // Return an error when a physical device that matches an explicit
+      // device specification is not found. This ensures that we don't
+      // assign a node to GPU when the user wanted to force it on CPU.
+      string debug_info = DebugInfo(node_root);
+
+      DeviceNameUtils::ParsedName specified_device_name;
+      if (DeviceNameUtils::ParseFullName(node->requested_device(),
+                                         &specified_device_name) &&
+          specified_device_name ==
+              members_[node_root].requested_device_name()) {
+        // The specified device and merged set device match, and
+        // will appear in the GraphDef (for debugging), so just
+        // print the specified device.
+        std::vector<Device*> devices_matching_nodedef;
+        device_set_->FindMatchingDevices(specified_device_name,
+                                         &devices_matching_nodedef);
+        if (devices_matching_nodedef.empty()) {
+          // Sometimes it is almost impossible to understand the problem
+          // without a list of available devices.
+          std::vector<string> device_names;
+          for (const Device* device : device_set_->devices()) {
+            device_names.push_back(device->name());
+          }
+          std::sort(device_names.begin(), device_names.end());
+
+          string gpu_msg = "";
+          if (!IsGoogleCudaEnabled() &&
+              str_util::Lowercase(specified_device_name.type) == "gpu") {
+            gpu_msg =
+                " The requested device appears to be a GPU, but CUDA is not "
+                "enabled.";
+          }
+
+          return errors::InvalidArgument(
+              errors::FormatNodeNameForError(node->name()),
+              "was explicitly assigned to ", node->requested_device(),
+              " but available devices are [ ",
+              str_util::Join(device_names, ", "), " ]. Make sure ",
+              "the device specification refers to a valid device.", gpu_msg);
+        } else if (specified_device_name.has_type) {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), "' because no supported kernel for ",
+              specified_device_name.type, " devices is available.", debug_info,
+              "\nRegistered kernels:\n",
+              KernelsRegisteredForOp(node->type_string()));
+        } else {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), debug_info);
+        }
+      } else {
+        // The specified device may be a valid device but the
+        // merged set device is different, so print both.
+        return errors::InvalidArgument(
+            "Could not satisfy explicit device specification '",
+            node->requested_device(), "' because the node ",
+            errors::FormatColocationNodeForError(node->name()),
+            " was colocated with a group of nodes that ",
+            "required incompatible device '",
+            DeviceNameUtils::ParsedNameToString(
+                members_[node_root].requested_device_name()),
+            "'", debug_info);
+      }
+    }
+  } else {
+    // The device is completely unspecified, so enumerate the devices that
+    // support all of the nodes in the set.
+    if (device_set_->devices().empty()) {
+      return errors::Internal("No devices are registered");
+    }
+    devices = FilterSupportedDevices(
+        device_set_->devices(), members_[node_root].supported_device_types(),
+        default_device_);
+
+    if (devices.empty()) {
+      return errors::InvalidArgument(
+          "Node had no OpKernel registered to support this operation: ",
+          "Operation was ", node->type_string(), " and inputs were ",
+          DataTypeVectorString(node->input_types()), DebugInfo(node_root));
+    }
+  }
+
+  // Cache the result of the possible devices for this node group.
+  members_[node_root].set_possible_devices(std::move(devices));
+  *possible_devices = &members_[node_root].possible_devices();
+  return Status::OK();
+}
+
+Status ColocationGraph::InitializeMembers() {
+  for (Node* node : graph_->op_nodes()) {
+    Status status = InitializeMember(*node, &members_[node->id()]);
+    if (!status.ok()) {
+      return AttachDef(status, *node);
+    }
+  }
+  return Status::OK();
+}
+
+string ColocationGraph::DebugString() {
+  std::unordered_set<int> roots;
+  std::vector<string> root_strings;
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int node_root = FindRoot(node->id());
+    if (roots.count(node_root) == 0) {
+      root_strings.push_back(DebugInfo(node_root));
+      roots.insert(node_root);
+    }
+  }
+  return absl::StrJoin(root_strings, "\n");
+}
+
+// Returns debugging info for the node referred to by 'node_root'.
+string ColocationGraph::DebugInfo(const int node_root) {
+  string text(
+      "\nColocation Debug Info:\n"
+      "Colocation group had the following types and devices: ");
+
+  // If this node is part of a colocation group, then we want to
+  // collect the mapping of ops to supported devices, so that
+  // the user can see why an unsatisfiable placement occurred.
+
+  std::unordered_map<string, string> type_to_devices;
+  std::vector<const Node*> colocation_nodes;
+  int num_nodes_found = 0;
+
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int id = node->id();
+    if (FindRoot(id) != node_root) {
+      continue;
+    }
+    ++num_nodes_found;
+    colocation_nodes.push_back(node);
+    const string& op_type = node->type_string();
+    string devices_registered;
+    for (const auto& device_type : members_[id].supported_device_types()) {
+      strings::StrAppend(&devices_registered,
+                         DeviceTypeString(device_type.first), " ");
+    }
+
+    type_to_devices[op_type] = std::move(devices_registered);
+  }
+
+  for (const auto& td : type_to_devices) {
+    strings::StrAppend(&text, "\n", td.first, ": ", td.second);
+  }
+  strings::StrAppend(&text,
+                     "\n\nColocation members and user-requested devices:");
+  for (const Node* node : colocation_nodes) {
+    strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
+                       ") ", node->requested_device());
+  }
+  strings::StrAppend(&text, "\n");
+
+  if (num_nodes_found <= 0) {
+    text.clear();
+  }
+  return text;
+}
+
+Status ColocationGraph::InitializeMemberWithAssignedDevice(
+    const string& assigned_device_name, const string& node_type,
+    bool must_be_full_name, Member* member) {
+  // This node has already been assigned to a device, so we
+  // respect this placement, after sanity-checking it.
+  // NOTE: Since any assignment must have been performed by
+  // the TensorFlow runtime, we consider errors in this branch to
+  // be INTERNAL.
+  TF_RETURN_IF_ERROR(member->SetAssignedDeviceName(assigned_device_name));
+  if (!must_be_full_name) {
+    return Status::OK();
+  }
+  // Since assigned device must be a full specification, do extra checks.
+  const Device* assigned_device =
+      device_set_->FindDeviceByName(assigned_device_name);
+  if (assigned_device == nullptr) {
+    return errors::Internal("Assigned device '", assigned_device_name,
+                            "' does not match any device");
+  }
+
+  for (const auto& d : member->supported_device_types()) {
+    if (DeviceType(assigned_device->attributes().device_type()) == d.first) {
+      return Status::OK();
+    }
+  }
+
+  return errors::Internal("Assigned device '", assigned_device_name,
+                          "' does not have registered OpKernel support "
+                          "for ",
+                          node_type);
+}
+
+Status ColocationGraph::InitializeMember(const Node& node, Member* member) {
+  TF_RETURN_IF_ERROR(member->SetParentAndSupportedDevices(node, device_types_));
+
+  if (node.has_assigned_device_name()) {
+    TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+        node.assigned_device_name(), node.type_string(), true, member));
+  } else {
+    // This node has not yet been assigned to a device, so we
+    // calculate any constraints due to the set of registered
+    // kernels and any (partial) user-provided device specification
+    // in the NodeDef.
+
+    // If no kernels are registered for this op type, fail with an error.
+    if (member->supported_device_types().empty()) {
+      std::set<string> registered_device_types;
+      for (Device* d : device_set_->devices()) {
+        registered_device_types.insert(d->device_type());
+      }
+      std::vector<string> attr_key_vals;
+      for (const auto& it : node.attrs()) {
+        const string& name = it.first;
+        const AttrValue& attr_value = it.second;
+        attr_key_vals.push_back(
+            strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
+      }
+      return errors::InvalidArgument(
+          "No OpKernel was registered to support Op '", node.type_string(),
+          "' used by ", errors::FormatNodeNameForError(node.name()),
+          "with these attrs: [", str_util::Join(attr_key_vals, ", "),
+          "]\n"
+          "Registered devices: [",
+          str_util::Join(registered_device_types, ", "), "]\n",
+          "Registered kernels:\n", KernelsRegisteredForOp(node.type_string()));
+    }
+
+    // If the NodeDef contains a device, then we interpret it as a
+    // (partial) device specification.
+    if (!node.requested_device().empty()) {
+      if (IsResourceGeneratorNode(node)) {
+        // Treat requested device on resource generating nodes as assigned
+        // device so that we don't override it.
+        TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+            node.requested_device(), node.type_string(), false, member));
+      } else {
+        // The user has specified a device in the NodeDef, try to find a
+        // valid device matching their specification in the set of
+        // devices.
+        // NOTE: The full name may specify a device that is not in
+        // n.supported_device_types(), but we check that in AssignDevice().
+        TF_RETURN_IF_ERROR(member->SetRequestedDeviceName(node));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/colocation_graph.h b/tensorflow/core/common_runtime/colocation_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..12611496a5f53764fa13eb839753fd4289cca2d6
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.h
@@ -0,0 +1,253 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+// Represents a node in the disjoint node forest and the
+// accumulated constraints on the device used by that node.
+class Member {
+ public:
+  Member() = default;
+
+  Status SetParentAndSupportedDevices(const Node& node,
+                                      const std::vector<DeviceType>& types);
+
+  const DeviceNameUtils::ParsedName& requested_device_name() const {
+    return requested_device_name_;
+  }
+
+  Status SetAssignedDeviceName(const string& device_name);
+
+  Status SetRequestedDeviceName(const Node& node);
+
+  Status EnsureCompatibilityAcrossResourceEdge(
+      const Node& src, const Member& src_root,
+      const Node& dst, /*dst_root is this*/
+      bool log_device_placement);
+
+  const PrioritizedDeviceTypeVector& supported_device_types() const {
+    return supported_device_types_;
+  }
+
+  // If `dry_run` is true, just sets `new_root` and `old_root` and does not
+  // actually modify anything in the `tree`.
+  static void Merge(std::vector<Member>* tree, int x_root, int y_root,
+                    Member** new_root, Member** old_root, bool dry_run);
+
+  // tree is non-const because we can change some `parent` pointers in some
+  // members for more efficient future lookups. The vector itself is not
+  // changed.
+  static int FindRoot(std::vector<Member>* tree, int node_id);
+
+  Status MergeDeviceNames(const Member& other, bool allow_soft_placement);
+
+  // Updates this to contain the intersection of the device types in
+  // this and "other". If the intersection is empty, returns false and does
+  // not update this. Else returns true and updates this.
+  bool MergeSupportedDevices(const Member& other);
+
+  Status AssignDevice(const Node& node, bool allow_soft_placement);
+
+  void set_possible_devices(std::vector<Device*>&& devices) {
+    possible_devices_ = devices;
+  }
+  const std::vector<Device*>& possible_devices() { return possible_devices_; }
+
+  string DebugString();
+
+ private:
+  // The id of the node that is the parent of this one, or its own
+  // id if it is a root. parent <= 0 indicates that this member is invalid.
+  int parent_ = -1;
+
+  // A proxy for the depth of the tree that is used to prefer
+  // connecting smaller trees to larger trees when merging disjoint
+  // sets.
+  int rank_ = 0;
+
+  // Once colocation groups have been formed, the Placer starts actually
+  // choosing devices. All nodes in a group must be assigned to the same
+  // device. Once we assigned the first device to some node in this group,
+  // we set assigned_device_name_index to this device name's index in the
+  // graph.
+  // The `*_device_name_` fields will contain the parsed name of this device
+  // and `possible_devices`, if computed, will contain just this device.
+  // `assigned_device_name_index` is an optimization to avoid parsing and
+  // comparing device names. The value of -1 signals that a single device
+  // has not been chosen yet.
+  int assigned_device_name_index_ = -1;
+
+  // The merged form of the device requested for this node, with those of all of
+  // its children. requested_device_name_ is always kept a specialization (i.e.
+  // DeviceNameUtils::IsSpecialization) of assigned_device_name_. When no device
+  // is requested, this field is set to assigned_device_name_.  As a
+  // specialization of assigned_device_name_, requested_device_name_ represents
+  // the most specific form of all assigned and requested devices of this node
+  // and its children, if this node is a root. requested_device_name_ is used
+  // to finally select devices for nodes.  We can override requested devices due
+  // to resource colocation constraints but not assigned devices (unless soft
+  // placement is on).
+  DeviceNameUtils::ParsedName requested_device_name_;
+
+  // The merged form of the device assigned for this node, with
+  // those of all of its children.
+  // This field is used to raise errors due to unsatisfiable constraints.
+  // Can be a partial specification.
+  // INVARIANT: requested_device_name_ is always a
+  // DeviceNameUtils::IsSpecialization of assigned_device_name_.
+  DeviceNameUtils::ParsedName assigned_device_name_;
+
+  // The intersection of all device types supported by this node,
+  // and those of all of its children, in priority order
+  // of the preferred device.
+  PrioritizedDeviceTypeVector supported_device_types_;
+
+  // If this node is a root, stores a list of Devices to which this node
+  // and all of its children have been assigned, or nullptr if this
+  // has not yet been computed.
+  std::vector<Device*> possible_devices_;
+};  // namespace
+
+// This class maintains the connected components of a colocation
+// constraint graph, and uses this information to assign a satisfying
+// device placement to the nodes of the graph.
+//
+// The typical usage pattern is:
+//
+//   Graph graph = ...;
+//   DeviceSet device_set = ...;
+//   ColocationGraph colocation_graph(graph, device_set);
+//
+//   // Add all the nodes of the `graph` to the `colocation_graph`.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
+//   }
+//
+//   // Add one or more colocation constraints.
+//   Node node_1 = *graph.FindNodeId(...);
+//   Node node_2 = *graph.FindNodeId(...);
+//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
+//
+//   // Assign devices based on the accumulated constraints.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
+//   }
+//
+// This implementation uses the Union-Find algorithm to efficiently maintain the
+// connected components and incrementally adds edges via
+// ColocationGraph::ColocateNodes() invocations.
+//
+// ColocationGraph does not assign any devices to graph nodes. The
+// `log_device_placement` argument is used to log messages when requested
+// device is ignored.
+class ColocationGraph {
+ public:
+  ColocationGraph(const Graph* graph, const DeviceSet* device_set,
+                  const Device* default_device, bool allow_soft_placement,
+                  bool log_device_placement);
+
+  // Adds each node of the Graph to this ColocationGraph as a singleton.
+  //
+  // NOTE: The implementation assumes that the ids of nodes passed to
+  // this method are dense and zero-based; the memory used will be linear in
+  // the largest node ID.
+  // NOTE: If this method returns an error, *this is left in an undefined
+  // state.
+  Status ColocateAllNodes();
+
+  Status ColocateResourceOrRefEdge(Node* src, Node* dst);
+
+  Status ColocateResourceAndRefEdges();
+
+  Status Initialize();
+
+  Status ColocateNodeToGroup(
+      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+          colocation_group_root,
+      const Node* node, StringPiece colocation_group);
+
+  // Merge the (possibly disjoint) sets containing nodes "x" and
+  // "y". Returns OK if the all nodes in the union of these sets can
+  // be placed on the same device type.
+  //
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, const Node& y);
+
+  // This overload of ColocateNodes() allows a caller to provide the root node
+  // ids for the two nodes. For large graphs, this noticeably reduces the
+  // graph load time.
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root);
+
+  // Limits the possible devices of `node`'s colocation group to the device
+  // to which `node` is assigned. This makes sure that all nodes in this
+  // colocation group will be assigned to the same device. Without this
+  // explicit restriction, heuristics can choose a different possible device
+  // for other nodes in the group.
+  Status LimitToAssignedDevice(const Node& node);
+
+  // For the given node, subject to the constraints previously given
+  // to this ColocationGraph, set its assigned_device_name. Returns OK
+  // if a satisfying device can be found, otherwise an error.
+  //
+  // Note: This method returns a pointer to a field within members_.
+  // The caller must not use the returned pointer after there is any possibility
+  // that the members_[i].possible_devices field has been modified.
+  Status GetDevicesForNode(Node* node,
+                           const std::vector<Device*>** possible_devices);
+
+  Status InitializeMembers();
+
+  string DebugString();
+
+  // Returns debugging info for the node referred to by 'node_root'.
+  string DebugInfo(const int node_root);
+
+  Status InitializeMemberWithAssignedDevice(const string& assigned_device_name,
+                                            const string& node_type,
+                                            bool must_be_full_name,
+                                            Member* member);
+
+  Status InitializeMember(const Node& node, Member* member);
+
+  // Returns the root node of the disjoint tree to which the node with the
+  // given id is connected.
+  int FindRoot(int node_id) { return Member::FindRoot(&members_, node_id); }
+
+  const Graph* const graph_;  // Not owned.
+  std::vector<Member> members_;
+  const DeviceSet* device_set_;  // Not owned.
+  const std::vector<DeviceType> device_types_;
+  const Device* default_device_;
+  const bool allow_soft_placement_;
+  const bool log_device_placement_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 5c226ec56e13fbb398d852ff6287910d2347785e..88b8a2dacff66de3cb22107fdc525900746af7dd 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -327,14 +327,15 @@ void FindConstantFoldableNodes(
         shape_replacement_map) {
   bool internal_node_inserted = false;
   // Walk the nodes in data flow order.
-  ReverseDFS(*graph, nullptr,
-             [nodes, constant_control_deps, shape_replacement_map,
-              &internal_node_inserted, &opts](Node* n) {
-               ConsiderConstantFoldableNode(
-                   n, opts, nodes, constant_control_deps, shape_replacement_map,
-                   &internal_node_inserted);
-             },
-             NodeComparatorName());
+  ReverseDFS(
+      *graph, nullptr,
+      [nodes, constant_control_deps, shape_replacement_map,
+       &internal_node_inserted, &opts](Node* n) {
+        ConsiderConstantFoldableNode(n, opts, nodes, constant_control_deps,
+                                     shape_replacement_map,
+                                     &internal_node_inserted);
+      },
+      NodeComparatorName());
   // If we have inserted just leaf level nodes, then there is nothing to fold.
   if (!internal_node_inserted) {
     nodes->clear();
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 5a0ef28ff22a9bf67cb4355b6d5373e957eb8df0..64119e85c79953760422a13e95c2a63f0bae6b7d 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -127,16 +127,22 @@ class Device : public DeviceBase {
   // flag settings. Override this to return false for devices that don't allow
   // such calls. Instead, these devices must use other mechanisms (such as
   // num_deferred_ops) to ensure the device has finished processing necessary
-  // work at session completion.
+  // work at session completion. In addition, for these devices, RefreshStatus
+  // must be called at session completion to retrieve execution result status.
   //
-  // Devices that override this function must also implement CurrentStatus.
+  // Devices that override this function must also implement RefreshStatus.
   virtual bool AllowsSyncOnCompletion() const { return true; }
 
   // This is used in conjunction with AllowsSyncOnCompletion to allow the
   // executor to get execution result status at session completion.
-  virtual Status CurrentStatus() {
+  //
+  // For supported devices, this call returns the underlying device stream's
+  // current status in a non-blocking way, without using blocking calls such as
+  // Stream::BlockHostUntilDone or Device::Sync. When applicable, the device
+  // status is also updated with the retrieved stream status.
+  virtual Status RefreshStatus() {
     return errors::Unimplemented(
-        "CurrentStatus is not supported on this device.");
+        "RefreshStatus is not supported on this device.");
   }
 
   // Optionally modify the device's GraphDef before execution.
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 1f7d7c4699872e55a73ebab919936435684405fe..26602a45be8b15c7c379e68b312d36fa14d67dbb 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -95,7 +95,6 @@ string DeviceMgr::DeviceMappingString() const {
 }
 
 Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
-  Status s;
   auto iter = device_map_.find(name);
   if (iter == device_map_.end()) {
     std::vector<StringPiece> device_names;
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
index 54f1119e139886096cb7c2007e584003992d86c2..b8dac8e0dd9ee5ae01dc4bf05f4c9c64eb6f2490 100644
--- a/tensorflow/core/common_runtime/device_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -31,7 +31,6 @@ namespace {
 class DeviceResolverLocalTest : public ::testing::Test {
  protected:
   DeviceResolverLocalTest() {
-    ConfigProto cp;
     SessionOptions options;
     string task_name = "/job:localhost/replica:0/task:0";
     auto* device_count = options.config.mutable_device_count();
@@ -56,7 +55,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
@@ -74,7 +73,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 80b62f273ce785c700f93ed68af1af9429276f79..40a1ffc42da5020b62932812f3939cbdb7686aff 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -501,7 +501,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       std::unique_ptr<DeviceResolverInterface> drl(
           new DeviceResolverLocal(device_mgr_.get()));
       std::unique_ptr<ParamResolverInterface> cprl(
-          new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+          new CollectiveParamResolverLocal(options_.config, device_mgr_.get(),
+                                           drl.get(),
                                            "/job:localhost/replica:0/task:0"));
       collective_executor_mgr_.reset(new CollectiveExecutorMgr(
           options_.config, device_mgr_.get(), std::move(drl), std::move(cprl)));
@@ -1194,6 +1195,8 @@ Status DirectSession::CreateExecutors(
   if (options_.config.experimental()
           .collective_deterministic_sequential_execution()) {
     options.collective_order = GraphCollectiveOrder::kEdges;
+  } else if (options_.config.experimental().collective_nccl()) {
+    options.collective_order = GraphCollectiveOrder::kAttrs;
   }
 
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 06cbe8049f25ea4ab72915d5f0630b84d5099135..aef64da79492c238713953b0958089f4abd501a2 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -106,6 +106,7 @@ tf_cuda_library(
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "@com_google_absl//absl/strings",
             "//tensorflow/core:core_cpu_lib",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
@@ -154,6 +155,7 @@ tf_cuda_library(
     deps = [
         ":attr_builder",
         "@farmhash_archive//:farmhash",
+        "@com_google_absl//absl/strings",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
@@ -180,12 +182,18 @@ tf_cc_test(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/kernels:constant_op",
@@ -208,6 +216,7 @@ cc_library(
         ":eager_operation",
         ":kernel_and_device",
         ":tensor_handle",
+        "@com_google_absl//absl/strings",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index aa64b5f59bd0cb54b1872c0328c10ebb1de622b6..d9f110e6397e1d719d0d300b372c81a574db56b3 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -54,10 +54,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
 
-// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
-                      TF_AttrType* out, unsigned char* is_list);
-
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
 // An AttrBuilder is a convenience class to help with that - providing a smaller
 // interface than NodeDefBuilder and avoiding expensive (unnecessary?) sanity
@@ -130,17 +126,11 @@ class AttrBuilder {
                          T&& value) const {
     DCHECK(!node_def_finalized_)
         << "Calling SetInAttrValueMap after BuildNodeDef.";
-    // Copied from NodeDefBuilder::Attr
-    const AttrValue* found = AttrSlice(m).Find(attr_name);
-    AttrValue attr_value;
-    if (found == nullptr) {
+    // If attribute is set more than once, its first value prevails
+    if (AttrSlice(m).Find(attr_name) == nullptr) {
+      AttrValue attr_value;
       SetAttrValue(value, &attr_value);
       m->insert(AttrValueMap::value_type(attr_name, attr_value));
-    } else {
-      // TODO(ashankar): Do what is done in
-      // NodeDefBuilder::CheckInconsistency(attr_name, *found, attr_value);
-      SetAttrValue(std::forward<T>(value), &attr_value);
-      (*m)[attr_name] = attr_value;
     }
   }
 
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 1d93d2bbe6f95250c9e49f20fb24b42933b43422..3f49c9f79befefd02049649083bf3cbf9d3b570a 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -20,9 +20,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#endif
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/util/env_var.h"
@@ -57,8 +60,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
       rendezvous_(rendezvous),
       thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
-          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
-          thread_pool_.get())),
+          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          opts.config.graph_options().optimizer_options(), thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
       num_active_steps_(0),
       async_default_(async),
@@ -66,7 +69,7 @@ EagerContext::EagerContext(const SessionOptions& opts,
       env_(opts.env),
       use_send_tensor_rpc_(false),
       pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
-          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) {
+          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", false)) {
   if (device_mgr_owned) {
     local_device_manager_.reset(device_mgr);
     local_unowned_device_manager_ = nullptr;
@@ -81,7 +84,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
   std::unique_ptr<DeviceResolverInterface> drl(
       new DeviceResolverLocal(local_device_mgr()));
   std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
-      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+      opts.config, local_device_mgr(), drl.get(),
+      "/job:localhost/replica:0/task:0"));
   collective_executor_mgr_.reset(new CollectiveExecutorMgr(
       opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
@@ -134,9 +138,15 @@ Status EagerContext::SetAsyncForThread(bool async) {
   return Status::OK();
 }
 
-void EagerContext::ClearCaches() {
+Status EagerContext::ClearCaches() {
+  // The executor stores pointers to kernels, so we need to make sure that no
+  // async eager ops are still executing. We lock the cache during this time as
+  // well.
   mutex_lock ml(cache_mu_);
+  TF_RETURN_IF_ERROR(executor_.WaitForAllPendingNodes());
   gtl::STLDeleteValues(&kernel_cache_);
+
+  return Status::OK();
 }
 
 void EagerContext::SetThreadLocalDevicePlacementPolicy(
@@ -205,8 +215,16 @@ EagerContext::~EagerContext() {
 #endif
 
   executor_.WaitForAllPendingNodes().IgnoreError();
-  ClearCaches();
+  ClearCaches().IgnoreError();
   rendezvous_->Unref();
+
+  for (auto& thread : child_threads_) {
+    thread.reset();
+  }
+}
+
+void EagerContext::AddChildThread(std::unique_ptr<Thread> thread) {
+  child_threads_.push_back(std::move(thread));
 }
 
 bool EagerContext::FindFunctionByName(const string& name) {
@@ -340,14 +358,27 @@ void EagerContext::AddKernelToCache(Fprint128 cache_key,
   gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
 }
 
-bool EagerContext::ShouldStoreMetadata() {
+bool EagerContext::ShouldStoreGraphs() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_graphs_.load() || metadata_listener_ != nullptr;
+}
+
+bool EagerContext::ShouldStoreStepStats() {
   mutex_lock ml(metadata_mu_);
-  return should_store_metadata_.load() || metadata_listener_ != nullptr;
+  return should_store_step_stats_.load() || metadata_listener_ != nullptr;
 }
 
-void EagerContext::SetShouldStoreMetadata(bool value) {
+void EagerContext::SetShouldStoreGraphs(bool value) {
   mutex_lock ml(metadata_mu_);
-  should_store_metadata_.store(value);
+  should_store_graphs_.store(value);
+  if (!value || metadata_listener_ != nullptr) {
+    run_metadata_.Clear();
+  }
+}
+
+void EagerContext::SetShouldStoreStepStats(bool value) {
+  mutex_lock ml(metadata_mu_);
+  should_store_step_stats_.store(value);
   if (!value || metadata_listener_ != nullptr) {
     run_metadata_.Clear();
   }
@@ -408,7 +439,7 @@ Status EagerContext::StoreCollectiveOpsServer(
   devices_map_.clear();
 
   InitDeviceMapAndAsync();
-  ClearCaches();
+  TF_RETURN_IF_ERROR(ClearCaches());
 
   pflr_.reset(new ProcessFunctionLibraryRuntime(
       local_unowned_device_manager_, env_, TF_GRAPH_DEF_VERSION, &func_lib_def_,
@@ -425,7 +456,7 @@ Status EagerContext::StoreCollectiveOpsServer(
   return Status::OK();
 }
 
-void EagerContext::InitializeRemote(
+Status EagerContext::InitializeRemote(
     std::unique_ptr<ServerInterface> server,
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
     std::unique_ptr<DeviceMgr> remote_device_manager,
@@ -473,7 +504,7 @@ void EagerContext::InitializeRemote(
 
   InitDeviceMapAndAsync();
 
-  ClearCaches();
+  TF_RETURN_IF_ERROR(ClearCaches());
 
   keep_alive_secs_ = keep_alive_secs;
 
@@ -522,6 +553,7 @@ void EagerContext::InitializeRemote(
           }
         }));
   }
+  return Status::OK();
 }
 #endif
 
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index d8807cf839fbfada5a5363c1b8d19958a39c204b..25bcca7cae9419947214d70eb8e9a63074204204 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/platform/env.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
@@ -62,8 +63,7 @@ enum ContextDevicePlacementPolicy {
   // Silently copy the tensor, which has a performance cost since the operation
   // will be blocked till the copy completes. This is the default policy.
   DEVICE_PLACEMENT_SILENT = 2,
-  // Default placement policy which silently copies int32 tensors but not other
-  // dtypes.
+  // Placement policy which silently copies int32 tensors but not other dtypes.
   DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 };
 
@@ -117,7 +117,7 @@ class EagerContext {
   }
 
   // Clears the kernel caches.
-  void ClearCaches();
+  Status ClearCaches();
 
   // Sets the device placement policy for the current thread.
   void SetThreadLocalDevicePlacementPolicy(ContextDevicePlacementPolicy policy);
@@ -182,8 +182,10 @@ class EagerContext {
 
   // TODO(apassos) clean up RunMetadata storage.
   mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
-  bool ShouldStoreMetadata() LOCKS_EXCLUDED(metadata_mu_);
-  void SetShouldStoreMetadata(bool value);
+  bool ShouldStoreStepStats() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreStepStats(bool value);
+  bool ShouldStoreGraphs() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreGraphs(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
   void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
 
@@ -213,7 +215,7 @@ class EagerContext {
   // - remote_device_mgr: A DeviceMgr* which contains all remote devices
   // (should contain no local devices).
   // - remote_contexts: A map containing task name to remote context ID.
-  void InitializeRemote(
+  Status InitializeRemote(
       std::unique_ptr<ServerInterface> server,
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
       std::unique_ptr<DeviceMgr> remote_device_manager,
@@ -238,6 +240,9 @@ class EagerContext {
 
   tensorflow::Env* TFEnv() const { return env_; }
 
+  // All child threads will be reset() when destructing EagerContext.
+  void AddChildThread(std::unique_ptr<Thread> thread);
+
  private:
   void InitDeviceMapAndAsync();
   Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
@@ -280,7 +285,8 @@ class EagerContext {
       GUARDED_BY(cache_mu_);
 
   // Whether we should compute RunMetadata.
-  std::atomic<bool> should_store_metadata_{false};
+  std::atomic<bool> should_store_step_stats_{false};
+  std::atomic<bool> should_store_graphs_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
   RunMetadataListener* metadata_listener_ GUARDED_BY(metadata_mu_) = nullptr;
@@ -334,6 +340,7 @@ class EagerContext {
 
   bool use_send_tensor_rpc_;
   const bool pin_small_ops_to_cpu_;
+  std::vector<std::unique_ptr<tensorflow::Thread>> child_threads_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
index 0718e6893237c8f7a71c1efc9b539ec554a977f6..b10320ca30bd4423bc755722dafb85908d922f8e 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -35,4 +35,23 @@ void EagerOperation::ConsumeInput(tensorflow::TensorHandle* h) {
   inputs_.push_back(h);
   attrs_.NumInputs(static_cast<int>(inputs_.size()));
 }
+
+string EagerOperation::DebugString() const {
+  string out;
+  VLOG(1) << "EagerOperation::DebugString() over " << this;
+
+  strings::StrAppend(&out, "Name: ", name_, "\n");
+  strings::StrAppend(
+      &out, "Device: ", Device() ? Device()->DebugString() : "[]", "\n");
+  for (const auto& input : inputs_) {
+    VLOG(1) << "Input ptr: " << input;
+    strings::StrAppend(&out, "Input: ", input->DebugString(), "\n");
+  }
+
+  NodeDef ndef;
+  Attrs().FillAttrValueMap(ndef.mutable_attr());
+  strings::StrAppend(&out, "Attrs: ", ndef.DebugString(), "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index 5a9e1f0292e799004b1b39c8d832fece8b051965..23a2d1bf986d8cd2b1670432e48ff3c6b3a1ee1c 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -64,6 +64,8 @@ class EagerOperation {
 
   void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
 
+  string DebugString() const;
+
  private:
   tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
   const tensorflow::string name_;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 9df1511bc75f38caa5dd0fb60c2b1782ae8b69c3..1eec6d4391a51a12fe27b54cf0df192901568709 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
@@ -85,10 +86,10 @@ int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
 //
 // The passed in *handle will be Unreffed if it is replaced.
 //
-// `op_device` is passed in explicitly because `op->device()` might be unset
-// and we might have selected some specific device to run this op on.
+// `op_device_name` is passed in explicitly because `op->device()` might be
+// unset and we might have selected some specific device to run this op on.
 Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
-                                      const Device* op_device, int i,
+                                      const string& op_device_name, int i,
                                       const Device* expected_input_device,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
@@ -114,7 +115,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
             " cannot compute ",
             op->Name(), " as input #", i, " was expected to be on ",
             expected_input_device->name(), " but is actually on ",
-            actual_device->name(), " (operation running on ", op_device->name(),
+            actual_device->name(), " (operation running on ", op_device_name,
             ")",
             " Tensors can be copied explicitly using .gpu() or .cpu() "
             "methods,"
@@ -127,7 +128,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
                      << " was expected to be on "
                      << expected_input_device->name() << " but is actually on "
                      << actual_device->name() << " (operation running on "
-                     << op_device->name()
+                     << op_device_name
                      << "). This triggers a copy which can be a performance "
                         "bottleneck.";
         break;
@@ -173,7 +174,11 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
   return Status::OK();
 }
 
-Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
+// `op_device_name` the name of the device on which the op will run, if any.
+// For functions running using function library runtime, the device can be
+// unspecified.
+Status ValidateInputTypeAndPlacement(EagerContext* ctx,
+                                     const string& op_device_name,
                                      EagerOperation* op,
                                      const KernelAndDevice* kernel,
                                      RunMetadata* run_metadata) {
@@ -184,7 +189,7 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
   for (int i = 0; i < op->Inputs().size(); ++i) {
     const Device* expected_device = kernel->InputDevice(i);
     TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-        op, op_device, i, expected_device, run_metadata,
+        op, op_device_name, i, expected_device, run_metadata,
         &((*op->MutableInputs())[i])));
     tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
@@ -251,6 +256,16 @@ bool OnSameTask(EagerContext* ctx, Device* first, Device* second) {
          first->parsed_name().task == second->parsed_name().task;
 }
 
+// Gets the CPU device on the task of device.
+Status CPUDeviceOnTask(EagerContext* ctx, tensorflow::Device* device,
+                       tensorflow::Device** cpu_device) {
+  string cpu_device_name;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName(
+      device->name(), &cpu_device_name));
+
+  return ctx->FindDeviceByName(cpu_device_name, cpu_device);
+}
+
 inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
                                                const tensorflow::Fprint128& b) {
   return {tensorflow::FingerprintCat64(a.low64, b.low64),
@@ -334,16 +349,31 @@ Status AddInputDevicesToCacheKey(const EagerContext* ctx,
   return Status::OK();
 }
 
+// There are a lot of references to devices in this function and around.
+// Here is what they mean:
+//  EagerOperation::Device(): The device on which the user requested the op
+//    be executed, except if we had to change the device due to resource inputs
+//    or CPU pinning. If the user did not request a device, the op does not
+//    take resources, and we did not pin it to CPU, the device can be nullptr.
+//  KernelAndDevice::Device(): The first time we see an op (combined with
+//    its attributes), we need to create a KernelAndDevice object for it.
+//    If op->Device() is a nullptr, we select a device for the op when
+//    creating the KernelAndDevice. A concrete device will always be selected
+//    here except when `op` is a function to be executed using function library
+//    runtime. In this case, we don't select a device because running
+//    a function with explicitly requested device has different behavior than
+//    running without an explicitly requested device.
 Status EagerLocalExecute(EagerOperation* op,
                          gtl::InlinedVector<TensorHandle*, 2>* retvals,
                          int* num_retvals) {
+  const string unspecified_device_name("<unspecified>");
   EagerContext* ctx = op->EagerContext();
   auto status = ctx->GetStatus();
   if (!status.ok()) return status;
   Device* device = op->Device();
 
   const string& maybe_unspecified_device_name =
-      device == nullptr ? "unspecified" : device->name();
+      device == nullptr ? unspecified_device_name : device->name();
   Fprint128 cache_key =
       op->MutableAttrs()->CacheKey(maybe_unspecified_device_name);
 
@@ -371,31 +401,38 @@ Status EagerLocalExecute(EagerOperation* op,
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
       compile_with_xla = true;
     }
+    bool run_function_with_flr = is_multi_device_function && !compile_with_xla;
 
     const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
-    if (device == nullptr) {
+    if (!run_function_with_flr && device == nullptr) {
       status = SelectDevice(ndef, ctx, &device);
       if (!status.ok()) return status;
     }
+    const string& device_name =
+        device == nullptr ? unspecified_device_name : device->name();
     if (ctx->LogDevicePlacement()) {
-      LOG(INFO) << "Executing op " << ndef.op() << " in device "
-                << device->name();
+      LOG(INFO) << "Executing op " << ndef.op() << " in device " << device_name;
+    } else {
+      VLOG(1) << "Executing op " << ndef.op() << " in device " << device_name;
     }
 
-    auto* flr = ctx->func_lib(device);
-    if (flr == nullptr) {
+    FunctionLibraryRuntime* flr =
+        device == nullptr ? nullptr : ctx->func_lib(device);
+    if (device != nullptr && flr == nullptr) {
       return errors::Unavailable(
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
+    auto runner = (flr != nullptr && flr->runner() != nullptr) ? flr->runner()
+                                                               : ctx->runner();
     GraphCollector* graph_collector = nullptr;
-    if (ctx->ShouldStoreMetadata()) {
+    if (ctx->ShouldStoreGraphs()) {
       graph_collector = ctx->GetGraphCollector();
     }
     // Treat the function as multi_device only when we are not compiling
     // it wholly with XLA. When compiling wholly with XLA, flr->CreateKernel
     // will create an XlaLaunchOp kernel to compile and run the function.
-    if (is_multi_device_function && !compile_with_xla) {
+    if (run_function_with_flr) {
       // Multi-device functions don't use the rendezvous from eager context.
       // If we use that rendezvous, multiple concurrent calls to the same
       // function will likely result in collisions. However, this also means
@@ -405,14 +442,14 @@ Status EagerLocalExecute(EagerOperation* op,
               << "compile_with_xla=" << compile_with_xla
               << ". Full node_def=" << ndef.DebugString();
       kernel = new KernelAndDeviceFunc(
-          flr, ctx->pflr(), std::move(input_dev_ptrs), ctx->runner(),
+          flr, ctx->pflr(), std::move(input_dev_ptrs), runner,
           ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
     } else {
       VLOG(2) << "Running " << ndef.op() << " using op kernel. "
               << "compile_with_xla=" << compile_with_xla
               << ". Full node_def=" << ndef.DebugString();
       kernel = new KernelAndDeviceOp(
-          ctx->GetRendezvous(), ctx->LogMemory(), flr, ctx->runner(),
+          ctx->GetRendezvous(), ctx->LogMemory(), flr, runner,
           ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
     }
 
@@ -432,20 +469,20 @@ Status EagerLocalExecute(EagerOperation* op,
                                    *num_retvals);
   }
   *num_retvals = output_dtypes_size;
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
+  const string& device_name = kernel->device() == nullptr
+                                  ? unspecified_device_name
+                                  : kernel->device()->name();
   status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel,
-      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+      ctx, device_name, op, kernel,
+      ctx->ShouldStoreStepStats() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
   StepStats* maybe_step_stats = nullptr;
   GraphCollector* graph_collector = nullptr;
-  if (ctx->ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreGraphs()) {
     graph_collector = ctx->GetGraphCollector();
+  }
+  if (ctx->ShouldStoreStepStats()) {
     maybe_step_stats = ctx->RunMetadataProto()->mutable_step_stats();
     int64 now_nanos = Env::Default()->NowNanos();
     maybe_stats.reset(new NodeExecStats);
@@ -472,16 +509,16 @@ Status EagerLocalExecute(EagerOperation* op,
           /* resource_device= */ kernel->OutputResourceDevice(i),
           output_dtypes[i], ctx);
     }
-    EagerNode* node = new ExecuteNode(
-        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
-        maybe_step_stats, graph_collector, output_dtypes, *retvals);
+    EagerNode* node = new ExecuteNode(id, ctx, op->Inputs(), kernel,
+                                      maybe_stats.release(), maybe_step_stats,
+                                      graph_collector, output_dtypes, *retvals);
     ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
-    status = EagerKernelExecute(ctx, op->Device(), op->Inputs(), kernel,
-                                maybe_stats.get(), maybe_step_stats,
-                                graph_collector, retvals->data(), *num_retvals);
+    status = EagerKernelExecute(ctx, op->Inputs(), kernel, maybe_stats.get(),
+                                maybe_step_stats, graph_collector,
+                                retvals->data(), *num_retvals);
   }
 
   return status;
@@ -627,11 +664,17 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
         // explicitly copy, and instead depend on the copy to happen locally
         // when the op is executed on the device.
         !OnSameTask(ctx, op->Device(), input_device)) {
+      tensorflow::Device* remote_cpu_device;
+      TF_RETURN_IF_ERROR(
+          CPUDeviceOnTask(ctx, op->Device(), &remote_cpu_device));
       // TODO(b/110044833): It's possible the same tensor gets copied to the
       // remote device repeatedly.
+      // Always copy to the remote CPU so that the actual device can be
+      // correctly determined after the kernel is selected/instantiated, since
+      // the op might have its inputs on host memory.
       TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-          op, op->Device(), i, op->Device(), /* run_metadata= */ nullptr,
-          &(*op->MutableInputs())[i]));
+          op, op->Device()->name(), i, remote_cpu_device,
+          /* run_metadata= */ nullptr, &(*op->MutableInputs())[i]));
     }
 
     tensorflow::TensorHandle* input = op->Inputs()[i];
@@ -744,13 +787,16 @@ bool IsPinnableOp(const string& op_type) {
   static const gtl::FlatSet<string>* unpinnable_ops = new gtl::FlatSet<string>({
       "RandomUniform",
       "RandomUniformInt",
-      "RandomNormal",
+      "RandomStandardNormal",
       "StatelessRandomUniform",
       "StatelessRandomUniformInt",
       "StatelessRandomNormal",
   });
 
-  return unpinnable_ops->find(op_type) == unpinnable_ops->end();
+  // XRT ops refer to per-device handles that are not safe to move between
+  // devices.
+  return unpinnable_ops->find(op_type) == unpinnable_ops->end() &&
+         !absl::StartsWith(op_type, "XRT");
 }
 
 // The Op device may be updated if:
@@ -762,9 +808,18 @@ bool IsPinnableOp(const string& op_type) {
 // (int32/int64). This can be disabled by setting the environment variable
 // "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
 Status MaybeUpdateOpDevice(EagerOperation* op) {
+  if (op->is_function()) {
+    // Don't update the device of direct function calls.
+    // Particularly, if the user did not explicitly request any device for this
+    // function, picking a device would result in this device being the default
+    // for nodes inside the function. This is undesirable for multi-device
+    // functions since the not-explicitly-placed nodes inside the body will all
+    // end up on this default device.
+    return Status::OK();
+  }
   EagerContext* ctx = op->EagerContext();
   bool all_inputs_eligible_for_cpu_pinning =
-      ctx->PinSmallOpsToCPU() && IsPinnableOp(op->Name());
+      ctx->PinSmallOpsToCPU() && !op->is_function() && IsPinnableOp(op->Name());
   Device* op_device = op->Device() == nullptr ? ctx->HostCPU() : op->Device();
   for (int i = 0; i < op->Inputs().size(); ++i) {
     TensorHandle* tensor_handle = op->Inputs()[i];
@@ -854,18 +909,12 @@ Status EagerExecute(EagerOperation* op,
   return EagerRemoteExecute(op, retvals->data(), num_retvals);
 }
 
-Status EagerKernelExecute(EagerContext* ctx, Device* device,
+Status EagerKernelExecute(EagerContext* ctx,
                           const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                           KernelAndDevice* kernel, NodeExecStats* maybe_stats,
                           StepStats* maybe_step_stats,
                           GraphCollector* graph_collector,
                           TensorHandle** retvals, int num_retvals) {
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-
   std::vector<Tensor> outputs(1);
 
   // If there are multiple references to a TensorHandle in 'op_inputs' we must
@@ -899,6 +948,31 @@ Status EagerKernelExecute(EagerContext* ctx, Device* device,
                                    maybe_stats, maybe_step_stats,
                                    graph_collector));
   }
+  if (graph_collector != nullptr) {
+    mutex_lock ml(*ctx->MetadataMu());
+    {
+      GraphCollector* collector = ctx->GetGraphCollector();
+      mutex_lock mll(collector->mu);
+
+      // Adding to partition graphs for backward compatibility.
+      for (const auto& graph : collector->partitioned_graphs) {
+        *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+      }
+
+      if (collector->dirty) {
+        auto* function_graphs = ctx->RunMetadataProto()->add_function_graphs();
+        *function_graphs->mutable_post_optimization_graph() =
+            collector->optimized_graph;
+        *function_graphs->mutable_pre_optimization_graph() =
+            collector->raw_graph;
+        for (const auto& graph : collector->partitioned_graphs) {
+          *function_graphs->add_partition_graphs() = graph;
+        }
+      }
+
+      collector->ClearGraphs();
+    }
+  }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
     maybe_stats->set_op_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
@@ -907,34 +981,34 @@ Status EagerKernelExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                         maybe_stats->all_start_micros());
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
-    if (ctx->ShouldStoreMetadata()) {
+    if (ctx->ShouldStoreStepStats()) {
       mutex_lock ml(*ctx->MetadataMu());
       {
-        GraphCollector* collector = ctx->GetGraphCollector();
-        mutex_lock mll(collector->mu);
-        for (const auto& graph : collector->graphs) {
-          *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+        auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+        // Lazily initialize the RunMetadata with information about all devices
+        // if this is the first call.
+        while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+          step_stats->add_dev_stats();
         }
-        collector->graphs.clear();
-      }
-      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->devices()->size(); ++i) {
-        if (ctx->devices()->at(i) == device) {
-          device_idx = i;
-          break;
+        // Find the current device's index.
+        // If device is a nullptr (we are running a function without explicitly
+        // requested device), attribute the function runtime to CPU.
+        Device* attribution_device = kernel->device();
+        if (attribution_device == nullptr) {
+          attribution_device = ctx->HostCPU();
+        }
+        int device_idx = 0;
+        for (int i = 0; i < ctx->devices()->size(); ++i) {
+          if (ctx->devices()->at(i) == attribution_device) {
+            device_idx = i;
+            break;
+          }
         }
+        // Populate the device stats for this device.
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        dev_stats->set_device(attribution_device->name());
+        *dev_stats->add_node_stats() = *maybe_stats;
       }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
     }
   }
   DCHECK_EQ(num_retvals, outputs.size());
@@ -942,12 +1016,12 @@ Status EagerKernelExecute(EagerContext* ctx, Device* device,
     if (retvals[i] == nullptr) {
       retvals[i] =
           new TensorHandle(outputs[i], /* d= */ kernel->OutputDevice(i),
-                           /* op_device= */ device, ctx);
+                           /* op_device= */ kernel->device(), ctx);
     } else {
       // In the async case, the retval is not a nullptr, and its device is
       // already set since all TensorHandles always have their device set
-      // during construction.
-      DCHECK_EQ(device, retvals[i]->op_device());
+      // (potentially to nullptr) during construction.
+      DCHECK_EQ(kernel->device(), retvals[i]->op_device());
       DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
 
       retvals[i]->SetTensor(outputs[i]);
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 80d381cecdd421fe9c580cd0a10fbc6db5953080..b05139a28c98b9f33e532abdad882039386681be 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -41,9 +41,9 @@ Status EagerExecute(
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
     int* num_retvals);
 
-// Low-level utility to execute the kernel specified by kernel on device device,
-// with the inputs op_inputs, in the context ctx.
-Status EagerKernelExecute(EagerContext* ctx, Device* device,
+// Low-level utility to execute the kernel specified by `kernel` on
+// `kernel->device()`, with the inputs op_inputs, in the context 'ctx'.
+Status EagerKernelExecute(EagerContext* ctx,
                           const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                           KernelAndDevice* kernel, NodeExecStats* maybe_stats,
                           StepStats* maybe_step_stats,
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
index 4459e3221b9f2387867e1efed4324322619e4388..723b22dfbb5702959a593a21cd13b325751e9e36 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.h
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -31,7 +31,7 @@ namespace tensorflow {
 
 class ExecuteNode : public EagerNode {
  public:
-  ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
+  ExecuteNode(uint64 id, EagerContext* ctx,
               const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
               KernelAndDevice* kernel, NodeExecStats* maybe_stats,
               StepStats* maybe_step_stats, GraphCollector* graph_collector,
@@ -39,7 +39,6 @@ class ExecuteNode : public EagerNode {
               const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
       : EagerNode(id),
         ctx_(ctx),
-        op_device_(op_device),
         inputs_(inputs),
         kernel_(kernel),
         maybe_stats_(maybe_stats),
@@ -65,8 +64,8 @@ class ExecuteNode : public EagerNode {
 
   tensorflow::Status Run() override {
     const Status status = EagerKernelExecute(
-        ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
-        maybe_step_stats_, graph_collector_, retvals_.begin(), retvals_.size());
+        ctx_, inputs_, kernel_, maybe_stats_.get(), maybe_step_stats_,
+        graph_collector_, retvals_.begin(), retvals_.size());
     if (status.ok()) {
       return status;
     } else {
@@ -79,7 +78,6 @@ class ExecuteNode : public EagerNode {
 
  private:
   tensorflow::EagerContext* ctx_;
-  tensorflow::Device* op_device_;
   tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
   tensorflow::KernelAndDevice* kernel_;
   std::unique_ptr<NodeExecStats> maybe_stats_;
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 943b183d63aab8ea6a820dacb803784e054704e6..5a61c767985b228db2636c9482f179c774639f72 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
@@ -53,9 +54,24 @@ KernelAndDeviceFunc::~KernelAndDeviceFunc() {
   }
 }
 
+KernelAndDeviceOp::~KernelAndDeviceOp() {
+  // Make sure that the device execution has finished before deleting cm_.
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    while (num_deferred_ops_ > 0) {
+      no_deferred_ops_cv_.wait(lock);
+    }
+  }
+}
+
 Status KernelAndDeviceOp::Init(const NodeDef& ndef,
                                GraphCollector* graph_collector) {
   OpKernel* k = nullptr;
+  if (flr_ == nullptr) {
+    return errors::Internal(
+        "A valid FunctionLibraryRuntime must be provided when running ops "
+        "based on OpKernel.");
+  }
   TF_RETURN_IF_ERROR(flr_->CreateKernel(ndef, &k));
   kernel_.reset(k);
   return Status::OK();
@@ -64,8 +80,18 @@ Status KernelAndDeviceOp::Init(const NodeDef& ndef,
 Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
                                  GraphCollector* graph_collector) {
   const OpDef* op_def = nullptr;
-  const FunctionDef* function_def =
-      flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
+  const FunctionDef* function_def;
+  if (flr_ == nullptr) {
+    // If function is being executed without an explicit device request,
+    // lookup the FunctionDef in the CPU's FLR. All FLRs share the same
+    // library.
+    function_def = pflr_->GetFLR(host_cpu_device_->name())
+                       ->GetFunctionLibraryDefinition()
+                       ->Find(ndef.op());
+  } else {
+    function_def = flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
+  }
+
   if (function_def != nullptr) {
     op_def = &(function_def->signature());
   } else {
@@ -75,7 +101,7 @@ Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
       InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_));
 
   FunctionLibraryRuntime::InstantiateOptions options;
-  options.target = device_->name();
+  options.target = device_ == nullptr ? "" : device_->name();
   options.is_multi_device_function = true;
   for (const Device* device : input_devices_) {
     options.input_devices.push_back(device->name());
@@ -89,16 +115,29 @@ Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
   // Android tf library does not include grappler.
   const auto& config_it = ndef.attr().find("config_proto");
   if (it != ndef.attr().end()) {
-    ConfigProto config_proto;
-    if (!config_proto.ParseFromString(config_it->second.s())) {
+    if (!options.config_proto.ParseFromString(config_it->second.s())) {
       return errors::InvalidArgument(
           "Failed to parse config_proto attribute as tensorflow::ConfigProto "
           "proto.");
     }
-    options.optimize_graph_fn =
-        std::bind(grappler::OptimizeGraph, std::placeholders::_1,
-                  std::placeholders::_2, std::placeholders::_3,
-                  std::placeholders::_4, config_proto, std::placeholders::_5);
+    grappler::GrapplerItem::OptimizationOptions optimization_options;
+
+    // Tensorflow 2.0 in eager mode with automatic control dependencies will
+    // prune all nodes that are not in the transitive fanin of the fetch nodes.
+    // However because the function will be executed via FunctionLibraryRuntime,
+    // and current function implementation does not prune stateful and dataset
+    // ops, we rely on Grappler to do the correct graph pruning.
+    optimization_options.allow_pruning_stateful_and_dataset_ops = true;
+
+    // All the nested function calls will be executed and optimized via
+    // PartitionedCallOp, there is no need to optimize functions now.
+    optimization_options.optimize_function_library = false;
+
+    options.optimize_graph_fn = std::bind(
+        grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
+        std::placeholders::_3, std::placeholders::_4, std::placeholders::_5,
+        options.config_proto, function_def->signature().name(),
+        optimization_options, std::placeholders::_6);
   }
 #endif
   options.graph_collector = graph_collector;
@@ -109,10 +148,10 @@ Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
   return Status::OK();
 }
 
-Status KernelAndDevice::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
-                            std::vector<Tensor>* outputs, NodeExecStats* stats,
-                            StepStats* step_stats,
-                            GraphCollector* graph_collector) {
+Status KernelAndDeviceOp::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+                              std::vector<Tensor>* outputs,
+                              NodeExecStats* stats, StepStats* step_stats,
+                              GraphCollector* graph_collector) {
   ScopedStepContainer step_container(0, [this](const string& name) {
     device_->resource_manager()->Cleanup(name).IgnoreError();
   });
@@ -120,6 +159,20 @@ Status KernelAndDevice::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
                    graph_collector);
 }
 
+Status KernelAndDeviceFunc::Run(
+    const gtl::InlinedVector<TensorValue, 4>& inputs,
+    std::vector<Tensor>* outputs, NodeExecStats* stats, StepStats* step_stats,
+    GraphCollector* graph_collector) {
+  const std::vector<Device*> devices = pflr_->device_mgr()->ListDevices();
+  ScopedStepContainer step_container(0, [&devices](const string& name) {
+    for (Device* device : devices) {
+      device->resource_manager()->Cleanup(name).IgnoreError();
+    }
+  });
+  return this->Run(&step_container, inputs, outputs, stats, step_stats,
+                   graph_collector);
+}
+
 namespace {
 void UpdateStats(OpKernelContext* context,
                  StepStatsCollector* step_stats_collector,
@@ -132,9 +185,11 @@ void UpdateStats(OpKernelContext* context,
     memory->set_peak_bytes(std::get<1>(sizes));
     memory->set_live_bytes(std::get<2>(sizes));
 
-    AllocatorStats allocator_stats;
-    allocator_pair.first->GetStats(&allocator_stats);
-    memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+    absl::optional<AllocatorStats> allocator_stats =
+        allocator_pair.first->GetStats();
+    if (stats) {
+      memory->set_allocator_bytes_in_use(allocator_stats->bytes_in_use);
+    }
     allocator_pair.second->GetRecordsAndUnRef();
   }
   auto* ms = stats->mutable_memory_stats();
@@ -153,6 +208,11 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
                               std::vector<Tensor>* outputs,
                               NodeExecStats* stats, StepStats* step_stats,
                               GraphCollector* graph_collector) {
+  gtl::InlinedVector<AllocatorAttributes, 4> in_attrs(kernel_->num_inputs());
+  for (size_t i = 0; i < in_attrs.size(); ++i) {
+    in_attrs[i].set_on_host(kernel_->input_memory_types()[i] ==
+                            tensorflow::HOST_MEMORY);
+  }
   std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
   for (size_t i = 0; i < out_attrs.size(); ++i) {
     out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
@@ -174,12 +234,25 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
   params.inputs = &inputs;
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
+  params.input_alloc_attrs = &in_attrs;
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
   params.cancellation_manager = &cm_;
+  cm_.Reset();
   params.log_memory = log_memory_;
+  params.inc_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_++;
+  };
+  params.dec_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_--;
+    if (num_deferred_ops_ == 0) {
+      no_deferred_ops_cv_.notify_all();
+    }
+  };
   std::unique_ptr<StepStatsCollector> step_stats_collector;
   if (stats != nullptr) {
     step_stats_collector.reset(new StepStatsCollector(step_stats));
@@ -240,9 +313,12 @@ Status KernelAndDeviceFunc::Run(
   // function library runtime to create a new for this call. We could have
   // created one here but it requires more state to be kept in
   // KernelAndDeviceFunc.
-  opts.rendezvous = nullptr;
-  opts.create_rendezvous = true;
+  Rendezvous* rendezvous = new IntraProcessRendezvous(pflr_->device_mgr());
+  opts.rendezvous = rendezvous;
+  opts.create_rendezvous = false;
+
   opts.cancellation_manager = &cm_;
+  cm_.Reset();
   // eager runtime does not yet support collective ops.
   opts.collective_executor = nullptr;
   opts.allow_dead_tensors = true;
@@ -266,13 +342,14 @@ Status KernelAndDeviceFunc::Run(
     input_vector.push_back(*tensor_value.tensor);
   }
 
-  flr_->Run(opts, handle_, input_vector, outputs,
-            [&status, &done](const Status& s) {
-              status = s;
-              done.Notify();
-            });
+  pflr_->Run(opts, handle_, input_vector, outputs,
+             [&status, &done](const Status& s) {
+               status = s;
+               done.Notify();
+             });
   done.WaitForNotification();
 
+  rendezvous->Unref();
   if (step_stats_collector != nullptr) {
     step_stats_collector->Finalize();
   }
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index c4ea99f53e14287c4852bcb1becb3c352fa9746b..e9573b014f80172e963a40292d9713e8dddda0ca 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -60,24 +60,28 @@ class KernelAndDevice {
 
   // Non-multi-device functions are run using regular CallOp and look like
   // primitive operations from KernelAndDevice perspective.
+  // `flr` can be nullptr if the operation is not run on any specific device
+  // (currently can happen only for multi-device functions).
   KernelAndDevice(
       FunctionLibraryRuntime* flr,
       std::function<void(std::function<void()>)>* runner,
       std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
       Device* host_cpu_device)
-      : device_(flr->device()),
+      : device_(flr == nullptr ? nullptr : flr->device()),
         host_cpu_device_(host_cpu_device),
         flr_(flr),
         runner_(runner),
         default_runner_([](std::function<void()> f) { f(); }),
         collective_executor_(std::move(collective_executor)) {}
 
+  // Not thread safe.
   virtual ~KernelAndDevice() {}
 
   // TODO(ashankar): Handle list-valued inputs.
-  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
-             std::vector<Tensor>* outputs, NodeExecStats* stats,
-             StepStats* step_stats, GraphCollector* graph_collector);
+  virtual Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+                     std::vector<Tensor>* outputs, NodeExecStats* stats,
+                     StepStats* step_stats,
+                     GraphCollector* graph_collector) = 0;
 
   virtual Status Run(ScopedStepContainer* step_container,
                      const gtl::InlinedVector<TensorValue, 4>& inputs,
@@ -91,7 +95,8 @@ class KernelAndDevice {
   // Else, returns nullptr.
   virtual Device* OutputResourceDevice(int idx) const = 0;
 
-  // Returns nullptr for functions.
+  // Returns the kernel that will be used to run this.
+  // Returns nullptr if this will be run using function library runtime.
   virtual const OpKernel* kernel() const = 0;
 
   // Returns the device on which this kernel will run. In the case of
@@ -113,9 +118,9 @@ class KernelAndDevice {
   // provided here only for the few kernels which can't handle one being
   // missing.
   CancellationManager cm_;
-  Device* const device_;           // non-null
-  Device* const host_cpu_device_;  // non-null
-  FunctionLibraryRuntime* const flr_;
+  Device* const device_;               // can be null
+  Device* const host_cpu_device_;      // non-null
+  FunctionLibraryRuntime* const flr_;  // can be null
   std::function<void(std::function<void()>)>* const runner_;
   std::function<void(std::function<void()>)> default_runner_;
   const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
@@ -135,9 +140,13 @@ class KernelAndDeviceOp final : public KernelAndDevice {
         rendez_(rendez),
         log_memory_(log_memory) {}
 
+  virtual ~KernelAndDeviceOp();
+
   Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
 
-  using KernelAndDevice::Run;
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
 
   Status Run(ScopedStepContainer* step_container,
              const gtl::InlinedVector<TensorValue, 4>& inputs,
@@ -151,7 +160,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   Device* OutputResourceDevice(int idx) const override;
 
   DataType input_type(int i) const override;
-  const DataTypeVector& output_dtypes() const {
+  const DataTypeVector& output_dtypes() const override {
     return kernel_->output_types();
   }
   int num_inputs() const override { return kernel_->num_inputs(); }
@@ -162,6 +171,15 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   Rendezvous* const rendez_;
   checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
   const bool log_memory_;
+
+  // For deferred ops, AsyncOpKernel::DoneCallback is called once the op is
+  // enqueued to device. The execution of the op may not finish when
+  // device_->Compute returns. We rely on no_deferred_ops_cv_ to know when the
+  // execution has finished.
+  // Available via OpKernelContext to every OpKernel invocation.
+  mutex num_deferred_ops_mu_;
+  condition_variable no_deferred_ops_cv_;
+  int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0;
 };
 
 // Represents a multi-device function. Functions can also be run using
@@ -169,6 +187,9 @@ class KernelAndDeviceOp final : public KernelAndDevice {
 // In such cases, KernelAndDeviceOp is used.
 class KernelAndDeviceFunc final : public KernelAndDevice {
  public:
+  // `flr` can be nullptr.
+  // `pflr` must not be nullptr.
+  // `host_cpu_device` must not be nullptr.
   KernelAndDeviceFunc(
       FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
       std::vector<Device*> input_devices,
@@ -185,8 +206,9 @@ class KernelAndDeviceFunc final : public KernelAndDevice {
 
   Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
 
-  using KernelAndDevice::Run;
-
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
   Status Run(ScopedStepContainer* step_container,
              const gtl::InlinedVector<TensorValue, 4>& inputs,
              std::vector<Tensor>* outputs, NodeExecStats* stats,
@@ -206,7 +228,7 @@ class KernelAndDeviceFunc final : public KernelAndDevice {
   int num_outputs() const override { return output_dtypes_.size(); }
 
  private:
-  ProcessFunctionLibraryRuntime* const pflr_;
+  ProcessFunctionLibraryRuntime* const pflr_;  // non-null
   FunctionLibraryRuntime::Handle handle_;
   // CPU devices are null. Resource handles' devices are actual backing
   // devices.
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index eaca6d7b32542abb114826f4ab8c256ee26e6126..e44a97b2655fee02b77c965dcc8d3aa04dbcd091 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -99,6 +100,19 @@ TensorHandle::TensorHandle(int64 op_id, int32 output_num,
                               : resource_device_ == nullptr);
 }
 
+TensorHandle::TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype)
+    : dtype(dtype),
+      node_id_(0),
+      device_(nullptr),
+      op_device_(nullptr),
+      resource_device_(nullptr),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(nullptr),
+      is_ready_(true),
+      symbolic_tensor(new OutputGraphNode(symbolic_tensor)) {}
+
 bool TensorHandle::IsReady() {
   if (node_id_ == 0) return true;
   mutex_lock l(ctx_mutex_);
@@ -312,4 +326,20 @@ Device* GetResourceDevice(const Tensor& t, EagerContext* ctx) {
   return it->second;
 }
 
+string TensorHandle::DebugString() const {
+  VLOG(1) << "Calling TensorHandle::DebugString() on " << this;
+
+  if (symbolic_tensor) {
+    return absl::Substitute("TF_Output($0, $1)", symbolic_tensor->oper,
+                            symbolic_tensor->index);
+  }
+
+  string out;
+  strings::StrAppend(&out, "Device: ", device_ ? device_->DebugString() : "[]");
+  // Consider supporting non-CPU tensors (when device_ is non-NULL) if needed.
+  strings::StrAppend(&out, ", Tensor: ", device_ ? "?" : tensor_.DebugString(),
+                     "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index a4727d6f762392bfecea2d5eb047bc5bd70d3f39..f530f0a6d31c1cc3b010d53debeb2d6ee02746ed 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -41,8 +41,17 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+struct TF_Operation;
+
 namespace tensorflow {
 
+// This struct is isomorphic to TF_Output, but we cannot use the latter here due
+// to layering concerns (TF_Output is defined at the C API layer).
+struct OutputGraphNode {
+  TF_Operation* oper;
+  int index;  // The index of the output within oper.
+};
+
 // Associates a Tensor and a Device, used in the eager runtime. Internal version
 // of the TFE_TensorHandle struct and the python EagerTensor class
 // (unrelated to python TensorHandle).
@@ -58,7 +67,11 @@ class TensorHandle : public core::RefCounted {
                DataType dtype, std::function<void()> call_on_destroy, Device* d,
                Device* op_device, Device* resource_device, EagerContext* ctx);
 
+  // Symbolic tensor constructor.
+  TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype);
+
   ~TensorHandle() override {
+    VLOG(1) << "Deleting internal TensorHandle " << this;
     if (call_on_destroy_) {
       call_on_destroy_();
     }
@@ -114,9 +127,13 @@ class TensorHandle : public core::RefCounted {
 
   bool IsRemote();
 
+  OutputGraphNode* getSymbolicTensor() const { return symbolic_tensor.get(); }
+
+  string DebugString() const;
+
  private:
   // If the contents of the Tensor pointed to by this handle is yet to be
-  // computed by a EagerNode, this function will block till that compuatation is
+  // computed by a EagerNode, this function will block till that computation is
   // done and the handle is "ready".
   Status WaitReady();
   Status WaitForNode(uint64 node_id, bool return_if_is_ready);
@@ -141,6 +158,9 @@ class TensorHandle : public core::RefCounted {
 
   // Device in which the op producing this tensor was executed. Equals to
   // device_ for constant tensors.
+  // Can be nullptr if the op producing this tensor was a function executed
+  // with function library runtime or if this tensor represents a symbolic
+  // tensor.
   tensorflow::Device* const op_device_;
 
   // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device
@@ -165,6 +185,11 @@ class TensorHandle : public core::RefCounted {
   // `ctx` object is not owned and should outlive this handle.
   EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
   bool is_ready_ GUARDED_BY(ctx_mutex_);
+
+  // When non-NULL, this tensor handle instance represents a symbolic tensor
+  // (corresponding to a graph node), whose concrete value is to be produced by
+  // executing that graph node.
+  std::unique_ptr<OutputGraphNode> symbolic_tensor;
 };
 
 // If tensor's dtype is DT_RESOURCE, returns the device backing the resource.
diff --git a/tensorflow/core/common_runtime/eval_const_tensor.cc b/tensorflow/core/common_runtime/eval_const_tensor.cc
index 87749da7afed9f67c469cbcd63e685c2c534a4bb..fb51e2dec3ac63f64cd70bececa5734bb5afc8a4 100644
--- a/tensorflow/core/common_runtime/eval_const_tensor.cc
+++ b/tensorflow/core/common_runtime/eval_const_tensor.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index d068bbf1e4a15245ca1c8ef9f91d24722f682be1..ad3049b67b891be025fcf0dedd5173f518eae1f9 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -55,6 +56,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/context.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -1278,7 +1280,7 @@ class ExecutorState {
 
   // Available via OpKernelContext to every OpKernel invocation.
   mutex num_deferred_ops_mu_;
-  condition_variable num_deferred_ops_cv_;
+  condition_variable no_deferred_ops_cv_;
   int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0;
 
   mutex mu_;
@@ -1358,6 +1360,9 @@ class ExecutorState {
 
   // Clean up when this executor is done.
   void Finish();
+  // Schedule Finish() on a separate thread if it needs to wait for deferred
+  // async ops to complete; otherwise run it on the current thread.
+  void ScheduleFinish();
 
   // A standalone routine for this expression so that we can express
   // that we don't want thread safety analysis on this reference (it's
@@ -1643,7 +1648,9 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.dec_num_deferred_ops_function = [this]() {
     mutex_lock lock(num_deferred_ops_mu_);
     num_deferred_ops_--;
-    num_deferred_ops_cv_.notify_all();
+    if (num_deferred_ops_ == 0) {
+      no_deferred_ops_cv_.notify_all();
+    }
   };
 
   Status s;
@@ -1778,7 +1785,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
           const bool completed =
               NodeDone(s, state->item->node, ready, stats, nullptr);
           delete state;
-          if (completed) Finish();
+          if (completed) ScheduleFinish();
         };
         nodestats::SetOpStart(stats);
         device->ComputeAsync(async, &state->ctx, done);
@@ -1865,7 +1872,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   }  // while !inline_ready.empty()
 
   // This thread of computation is done if completed = true.
-  if (completed) Finish();
+  if (completed) ScheduleFinish();
 }
 
 Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
@@ -2421,6 +2428,25 @@ void ExecutorState::DumpState() {
   }
 }
 
+void ExecutorState::ScheduleFinish() {
+  int num_deferred_ops;
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops = num_deferred_ops_;
+  }
+  if (num_deferred_ops > 0) {
+    // Finish() may be blocked waiting for deferred async ops to complete. The
+    // execution of deferred async ops may be waiting for non-enqueued ops of
+    // other executors to complete. So running Finish() on the current thread
+    // (inter-op threadpool thread) may lead to a deadlock due to threadpool
+    // exhaustion. Instead, we run it on a separate thread to unblock the
+    // threadpool thread.
+    Env::Default()->SchedClosure([this]() { Finish(); });
+  } else {
+    Finish();
+  }
+}
+
 void ExecutorState::Finish() {
   mu_.lock();
   auto status = status_;
@@ -2432,11 +2458,11 @@ void ExecutorState::Finish() {
 
   // There are several potential race conditions below. To name a few:
   // 1. Even if the device's status is OK at the precise moment when
-  // num_deferred_ops_ reaches 0, it could go bad before device->CurrentStatus()
+  // num_deferred_ops_ reaches 0, it could go bad before device->RefreshStatus()
   // is called below, caused by work enqueued onto the same device by other
   // concurrent ExecutorState objects.
-  // 2. Some implementations of Device::CurrentStatus, such as
-  // XlaDevice::CurrentStatus, may be inherently racy because it releases the
+  // 2. Some implementations of Device::RefreshStatus, such as
+  // XlaDevice::RefreshStatus, may be inherently racy because it releases the
   // device mutex after a stream pointer is acquired and before the stream is
   // queried for status.
   // 3. It's the same for some implementations of Device::Sync, such as
@@ -2454,7 +2480,7 @@ void ExecutorState::Finish() {
   {
     mutex_lock lock(num_deferred_ops_mu_);
     while (num_deferred_ops_ > 0) {
-      num_deferred_ops_cv_.wait(lock);
+      no_deferred_ops_cv_.wait(lock);
     }
   }
 
@@ -2462,7 +2488,21 @@ void ExecutorState::Finish() {
   // these devices should have used num_deferred_ops correctly to ensure the
   // device has finished all relevant work at this point.
   if (!device->AllowsSyncOnCompletion()) {
-    status.Update(device->CurrentStatus());
+    status.Update(device->RefreshStatus());
+    if (!status.ok()) {
+      // In device async execution mode, it's possible for device execution to
+      // lag behind ExecutorState scheduling so much that this is the first
+      // place a device execution error surfaces.
+      // If so, all ExecutorState::NodeDone calls have already happened with OK
+      // status. This is the last defense where StartCancel must be called to
+      // abort all computation still running on any device.
+      // TODO(b/124523000): Always call Finish in a separate thread, so even if
+      // StartCancel blocks the current thread's execution, we won't encounter
+      // deadlocks caused by inter-op thread exhaustion.
+      if (cancellation_manager_) {
+        cancellation_manager_->StartCancel();
+      }
+    }
     delete this;
     runner([=]() { done_cb(status); });
     return;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 82bbb7e68900164f86c718ec2f799a28c7125e81..488d0c71c893bedd5f4c4ee246e758e2fd9e0ea1 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/executor_factory.h"
@@ -89,9 +91,9 @@ struct EndpointEq {
 
 // The following Add* routines are used to add a few graph nodes while
 // functions are transformed.
-static Node* AddNoOp(Graph* g) {
+static Node* AddNoOp(StringPiece name, Graph* g) {
   NodeDef ndef;
-  ndef.set_name(g->NewName(kNodeLabel));
+  ndef.set_name(g->NewName(absl::StrCat(kNodeLabel, "/", name)));
   ndef.set_op("NoOp");
   Status s;
   Node* ret = g->AddNode(ndef, &s);
@@ -99,10 +101,10 @@ static Node* AddNoOp(Graph* g) {
   return ret;
 }
 
-static Node* AddIdentity(Graph* g, Endpoint input) {
+static Node* AddIdentity(StringPiece name, Graph* g, Endpoint input) {
   DCHECK_LT(0, input.dtype());
   NodeDef ndef;
-  ndef.set_name(g->NewName(kNodeLabel));
+  ndef.set_name(g->NewName(absl::StrCat(kNodeLabel, "/", name)));
   ndef.set_op("Identity");
   // NOTE(skyewm): we explicitly set the device here to address a multi-GPU
   // performance issue where this Identity would be placed alone on a GPU,
@@ -192,6 +194,7 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
 
   Env* env() override;
   Device* device() override;
+  std::function<void(std::function<void()>)>* runner() override;
   const DeviceMgr* device_mgr() const override;
 
   string DebugString(Handle handle) override;
@@ -266,6 +269,11 @@ Env* FunctionLibraryRuntimeOverlay::env() { return base_flr_->env(); }
 
 Device* FunctionLibraryRuntimeOverlay::device() { return base_flr_->device(); }
 
+std::function<void(std::function<void()>)>*
+FunctionLibraryRuntimeOverlay::runner() {
+  return base_flr_->runner();
+}
+
 const DeviceMgr* FunctionLibraryRuntimeOverlay::device_mgr() const {
   return base_flr_->device_mgr();
 }
@@ -333,6 +341,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   }
 
   Device* device() override { return device_; }
+
+  std::function<void(std::function<void()>)>* runner() override {
+    return &default_runner_;
+  }
+
   const DeviceMgr* device_mgr() const override { return device_mgr_; }
   Env* env() override { return env_; }
   int graph_def_version() override { return graph_def_version_; }
@@ -684,7 +697,6 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
     }
   }
 
-  Status s;
   const FunctionLibraryDefinition* lib_def =
       options.overlay_lib ? options.overlay_lib : base_lib_def_;
   FunctionBody* fbody = nullptr;
@@ -803,12 +815,24 @@ void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
 
 namespace {
 // Removes all stateless nodes that do not contribute to a return
-// value from the function body.  Unlike `RemoveDeadNodes()`, which is
+// value from the function body. Unlike `RemoveDeadNodes()`, which is
 // triggered by `OptimizerOptions.do_function_inlining`, this pass
 // ignores the SINK node, from which (by definition) all nodes are
-// reverse reachable.
-void PruneFunctionBody(Graph* g) {
-  VLOG(2) << "Pruning function body";
+// reverse reachable, and preserves all nodes that are reachable from
+// control output nodes.
+//
+// TODO(ezhulenev, skyewm): Function body should not have special treatment of
+// stateful ops, graph should encode nodes that must execute with `control_ret`
+// and `control_output`.
+void PruneFunctionBody(const FunctionDef& fdef, Graph* g) {
+  VLOG(2) << "Pruning function body: function_name=" << fdef.signature().name();
+
+  // `control_ret` nodes must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_nodes;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_nodes.insert(control_ret.second);
+  }
+
   std::unordered_set<const Node*> nodes;
   for (auto n : g->nodes()) {
     // NOTE(mrry): "_Retval" nodes are stateful, and so will be added
@@ -818,8 +842,9 @@ void PruneFunctionBody(Graph* g) {
     // TODO(mrry): Investigate whether the `n->IsControlFlow()` test is
     // still needed. It would be preferable to prune entire loops and/or
     // conditionals if they are not used in the graph.
-    if (n->IsControlFlow() || n->IsDataset() ||
-        (n->op_def().is_stateful() && n->type_string() != kArgOp)) {
+    if (n->IsControlFlow() ||
+        (n->op_def().is_stateful() && n->type_string() != kArgOp) ||
+        (control_ret_nodes.find(n->name()) != control_ret_nodes.end())) {
       nodes.insert(n);
     }
   }
@@ -846,7 +871,7 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) {
   std::unique_ptr<Graph> g(new Graph(lib_def));
   CopyGraph(*fbody->graph, g.get());
 
-  PruneFunctionBody(g.get());
+  PruneFunctionBody(fbody->fdef, g.get());
   optimizer_.Optimize(this, env(), device(), &g, /*shape_map=*/nullptr);
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
@@ -1305,6 +1330,14 @@ bool RemoveListArrayConverter(Graph* g) {
       }
       gtl::InlinedVector<Node*, 8> identity_nodes(n->num_inputs(), nullptr);
 
+      const auto no_op = [&](StringPiece name) {
+        return AddNoOp(absl::StrCat(n->name(), "/", name), g);
+      };
+
+      const auto identity = [&](StringPiece name, Endpoint input) {
+        return AddIdentity(absl::StrCat(n->name(), "/", name), g, input);
+      };
+
       // Process input edges first.
       Node* input_control_node = nullptr;
       for (const Edge* e : n->in_edges()) {
@@ -1314,7 +1347,7 @@ bool RemoveListArrayConverter(Graph* g) {
             // node (input_control_node) which the additional Identity
             // nodes depends on and the input_control_node depends on
             // the node "n"s control dependencies.
-            input_control_node = AddNoOp(g);
+            input_control_node = no_op("input_control_node");
           }
           g->AddControlEdge(e->src(), input_control_node);
         } else {
@@ -1326,7 +1359,7 @@ bool RemoveListArrayConverter(Graph* g) {
                 << e->dst_input();
             return removed_any;
           }
-          *id_node = AddIdentity(g, {e->src(), e->src_output()});
+          *id_node = identity("input", {e->src(), e->src_output()});
         }
       }
 
@@ -1346,7 +1379,7 @@ bool RemoveListArrayConverter(Graph* g) {
             // adds a no-op node (output_control_node) which those
             // nodes will depend on and output_control_node depends on
             // all Identity nodes.
-            output_control_node = AddNoOp(g);
+            output_control_node = no_op("output_control_node");
           }
           g->AddControlEdge(output_control_node, e->dst());
         } else {
@@ -1377,43 +1410,209 @@ bool RemoveListArrayConverter(Graph* g) {
   return removed_any;
 }
 
-// Returns true iff the function '*fbody' can be inlined at 'node'
-// based on the type signature of 'node' and 'fbody'.
-static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
-  if (static_cast<size_t>(node->num_inputs()) != fbody->arg_types.size()) {
-    return false;
+Status InstantiateFunctionCall(const NodeDef& call_def,
+                               FunctionLibraryRuntime& flr,
+                               FunctionLibraryRuntime::Handle* handle) {
+  const string* func_name;
+  AttrSlice attrs;
+
+  NameAttrList func;
+  if (call_def.op() == "PartitionedCall" ||
+      call_def.op() == "StatefulPartitionedCall") {
+    TF_RETURN_IF_ERROR(GetNodeAttr(call_def, "f", &func));
+    func_name = &func.name();
+    attrs = AttrSlice(&func.attr());
+  } else {
+    func_name = &call_def.op();
+    attrs = AttrSlice(call_def);
   }
-  if (static_cast<size_t>(node->num_inputs()) != fbody->arg_nodes.size()) {
-    return false;
+
+  return flr.Instantiate(*func_name, attrs, handle);
+}
+
+namespace {
+
+Status ValidateNoInline(const FunctionBody* fbody) {
+  const auto attr = AttrSlice(&fbody->fdef.attr());
+  bool noinline = false;
+  if (GetNodeAttr(attr, kNoInlineAttr, &noinline).ok() && noinline) {
+    return errors::InvalidArgument(
+        "Can't inline function marked with '_noinline'");
   }
-  if (static_cast<size_t>(node->num_outputs()) != fbody->ret_types.size()) {
-    return false;
+  return Status::OK();
+}
+
+using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+
+}  // namespace
+
+string InlineFunctionBodyOptions::DebugString() const {
+  return absl::StrCat("ignore_noinline=", ignore_noinline ? "true" : "false",
+                      ", override_device=", override_device ? "true" : "false",
+                      ", output_control_src=",
+                      output_control_src == OutputControlSrc::kDataOutputs
+                          ? "DataOutputs"
+                          : "ControlOutputs");
+}
+
+Status ValidateInlining(const Node* node, const FunctionBody* fbody,
+                        const InlineFunctionBodyOptions& options) {
+  // TODO(ezhulenev): Currently common_runtime function inlining can't guarantee
+  // that all side-effectful ops will be executed after inlining. See Grappler
+  // function_optimizer for details. Unify all function inlining mechanism.
+  // Do not inline if `!fbody->control_ret_nodes.empty()`.
+
+  const auto num_node_inputs = static_cast<size_t>(node->num_inputs());
+  const auto num_node_outputs = static_cast<size_t>(node->num_outputs());
+
+  if (num_node_inputs != fbody->arg_types.size() ||
+      num_node_inputs != fbody->arg_nodes.size()) {
+    return errors::InvalidArgument(
+        "Node inputs do not match function arguments: inputs=", num_node_inputs,
+        " arg_types=", fbody->arg_types.size(),
+        " arg_nodes=", fbody->arg_nodes.size());
   }
-  if (static_cast<size_t>(node->num_outputs()) != fbody->ret_nodes.size()) {
-    return false;
+
+  if (num_node_outputs != fbody->ret_types.size() ||
+      num_node_outputs != fbody->ret_nodes.size()) {
+    return errors::InvalidArgument(
+        "Node outputs do not match function returns: outputs=",
+        num_node_outputs, " ret_types=", fbody->ret_types.size(),
+        " ret_nodes=", fbody->ret_nodes.size());
   }
+
   for (int i = 0; i < node->num_inputs(); ++i) {
-    if (node->input_type(i) != fbody->arg_types[i]) return false;
+    if (node->input_type(i) != fbody->arg_types[i]) {
+      return errors::InvalidArgument(
+          "Node input type doesn't match function argument type: ",
+          node->input_type(i), " != ", fbody->arg_types[i], " @ index=", i);
+    }
   }
   for (int i = 0; i < node->num_outputs(); ++i) {
-    if (node->output_type(i) != fbody->ret_types[i]) return false;
+    if (node->output_type(i) != fbody->ret_types[i]) {
+      return errors::InvalidArgument(
+          "Node output type doesn't match function return type: ",
+          node->output_type(i), " != ", fbody->ret_types[i], " @ index=", i);
+    }
   }
-  return true;
+
+  if (!options.ignore_noinline) {
+    TF_RETURN_IF_ERROR(ValidateNoInline(fbody));
+  }
+
+  return Status::OK();
 }
 
-// Given a "caller" in graph "g", which is a function call of a function
-// to "fbody". Replaces the "caller" with fbody->graph and connects
-// edges properly. "override_device" specifies whether inlining should replace
-// explicitly specified devices inside fbody with the callee's device.
-void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
-                        Node* caller, const FunctionBody* fbody,
-                        bool override_device) {
-  if (!ValidateInlining(caller, fbody)) {
-    LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
+// Function inlining must preserve function execution semantics with regards to
+// side-effects visibility. Tensorflow in Eager mode has an automatic control
+// dependencies tracking mechanism, which enforces well-defined execution order
+// of all side-effects. Any other frontend (e.g. Swift) must produce graphs
+// following the same rules, to ensure that function inlining works correctly.
+//
+// IMPORTANT: Currently we do not have a true notion of "side-effectful" node,
+// we assume that all stateful nodes might have side-effects, though it's not
+// true in practice, e.g. `ReadVariableOp` doesn't have an observable
+// side-effect.
+//
+// Automatic control dependency rules in Tensorflow 2.0 (python in eager mode):
+//
+// 1) When a function has a resource (DT_RESOURCE data type) input argument it
+//   "captures" the mutable resource.  This is implemented by automatically
+//    adding a incoming control edge from the previous side-effectful op
+//    touching that resource, and an outgoing control edge to the next
+//    side-effectful op using the same resource. This serializes the mutations
+//    of the resource to make graph execution deterministic.
+//
+// 2) All stateful ops inside a function body are guaranteed to execute in
+//    program order, this is achieved by adding control edges between stateful
+//    ops at graph construction time. Stateful ops (or ops that must execute)
+//    should be in the function control return set. Having a data edge to the
+//    regular function output might be not enough, because after function
+//    inlining it might happen that data output is unused.
+//
+// 3) Furthermore, all ops accepting the same resource as an input are
+//    guaranteed to run in program order. This is also done by adding control
+//    edges at graph construction time. The last op touching the resource
+//    must be in a control return set, which will guarantee that all side
+//    effects to the resource will happen before function completion.
+//
+// Function inlining must preserve side-effect visibility:
+//
+// 1) All side-effects to the captured resources, that happened before function
+//    call must be visible to the function body nodes using that resources.
+//
+// 2) All side-effects to the captured resources, that happened inside function
+//    body, must be visible to every op/function using that resource after the
+//    function call completed.
+//
+// To guarantee that these properties are preserved after inlining we:
+//
+// 1) Create "input_control_node" NoOp. Function call node incoming control
+//    edges will be forwarded *to* this node. Function inputs (Identity nodes)
+//    will have a control edge *from* this node. If function body has nodes
+//    without inputs, they will have a control edge *from* this node.
+//
+// 2) Create "output_control_node" NoOp. All nodes that have incoming control
+//    edge *from* the function call node, will be forwarded to this node.
+//
+//    We have two options for choosing which nodes will have a control edge *to*
+//    the "output control node":
+//       a) control returns            (`control_ret` field in FunctionDef)
+//       b) data returns               (`ret` field in FunctionDef)
+//
+//    We do a) for multi-device function calls in Tensorflow v2 and b)
+//    for the rest for compatibility with Tensorflow v1.
+//
+//    Following the automatic control dependencies tracking rules, a node that
+//    has an incoming control edge from the function call node is dependent on
+//    the side-effects happening inside the function body. The output control
+//    node will guarantee side-effects execution order.
+//
+//    If function call node doesn't have an outgoing control edge, it means that
+//    no one is interested in observing side-effects that might have happened.
+//
+// Function inlining might leave the graph in partially-placed state. Function
+// inlining caller must call Placer to guarantee that all nodes are placed.
+//
+// Function inlining with `options.override_device=true` will leave graph in
+// fully placed state, by overriding all inlined nodes devices with the caller
+// node device, but it will make functions always single-device. These functions
+// after inlining will not be able to handle resources on multiple devices. This
+// is currently acceptable for XLA use cases (XLA cluster is always executed on
+// a single device).
+//
+// TODO(ezhulenev): Documentation above is ahead of implementation below.
+Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
+                          Node* caller, const FunctionBody* fbody,
+                          const InlineFunctionBodyOptions& options) {
+  VLOG(3) << "Inline function call: " << SummarizeNode(*caller) << " ["
+          << options.DebugString() << "]";
+  VLOG(4) << "Inlined function definition: " << DebugString(fbody->fdef);
+
+  Status validation = ValidateInlining(caller, fbody, options);
+  if (!validation.ok()) {
+    LOG(WARNING) << "Inlining mismatch: " << SummarizeNode(*caller) << " vs. "
                  << DebugString(fbody->graph);
-    return;
+    return errors::Internal("Inlining mismatch: ", validation.error_message());
   }
 
+  // ------------------------------------------------------------------------ //
+  // Helper functions to create `NoOp` and `Identity` nodes for auxiliary
+  // control nodes and inlined function inputs and outputs.
+
+  // Add a NoOp node for function control inputs/outputs.
+  const auto no_op = [&](StringPiece name) {
+    Node* node = AddNoOp(absl::StrCat(caller->name(), "/", name), g);
+    node->set_requested_device(caller->def().device());
+    return node;
+  };
+
+  // Add an Identity node for function data inputs/outputs.
+  const auto identity = [&](StringPiece name, Endpoint input) {
+    return AddIdentity(absl::StrCat(caller->name(), "/", name), g, input);
+  };
+
+  // ------------------------------------------------------------------------ //
   // Input edges. For data edges coming into "caller", we first compute the
   // <src>:<src_output> for the i-th input in "inputs".
   // If "caller" has any input control dependencies, we add a NoOp
@@ -1423,7 +1622,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
   for (const Edge* e : caller->in_edges()) {
     if (e->IsControlEdge()) {
       if (input_control_node == nullptr) {
-        input_control_node = AddNoOp(g);
+        input_control_node = no_op("input_control_node");
       }
       g->AddControlEdge(e->src(), input_control_node);
     } else {
@@ -1431,6 +1630,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     }
   }
 
+  // ------------------------------------------------------------------------ //
   // Duplicate fbody->graph into 'g'.  First, we copy the nodes of
   // fbody->graph into 'g' except the source and sink nodes.  We copy
   // edges among nodes in 'fbody->graph'.
@@ -1438,11 +1638,10 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
   // If 'x' is a node in fbody->graph and its copy in 'g' is 'y', we
   // remember 'y' in node_map[x->id()].
   std::vector<Node*> node_map(fbody->graph->num_node_ids());
-  Status s;
   for (Node* n : fbody->graph->op_nodes()) {
     NodeDef ndef = n->def();
     ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
-    if (override_device || ndef.device().empty()) {
+    if (options.override_device || ndef.device().empty()) {
       ndef.set_device(caller->def().device());
     }
     for (auto& attr : *ndef.mutable_attr()) {
@@ -1451,8 +1650,12 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
             strings::StrCat(caller->name(), "/", attr.second.s()));
       }
     }
-    Node* clone = g->AddNode(ndef, &s);
-    TF_CHECK_OK(s);
+    Status added_node;
+    Node* clone = g->AddNode(ndef, &added_node);
+    if (options.override_device && !caller->assigned_device_name().empty()) {
+      clone->set_assigned_device_name(caller->assigned_device_name());
+    }
+    TF_CHECK_OK(added_node);
     node_map[n->id()] = clone;
 
     // If there is an input control node, and one of:
@@ -1467,16 +1670,15 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     //
     // The purpose of case (b) is to ensure that instances of case (a) created
     // by further inlining steps also receive the control dependency.
+    //
+    // TODO(ezhulenev): If caller has no control inputs, should we add a control
+    // edge from one of the inputs to ensure that function body node will
+    // execute in correct frame?
     if (input_control_node) {
-      bool has_inputs = false;
-      for (const Edge* e : n->in_edges()) {
-        if (!e->src()->IsSource()) {
-          has_inputs = true;
-          break;
-        }
-      }
+      bool has_inputs = absl::c_any_of(
+          n->in_edges(), [](const Edge* e) { return !e->src()->IsSource(); });
       if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
-          clone->type_string() == "SymbolicGradient") {
+          clone->type_string() == kGradientOp) {
         g->AddControlEdge(input_control_node, clone);
       }
     }
@@ -1491,6 +1693,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     g->AddEdge(src_copy, e->src_output(), dst_copy, e->dst_input());
   }
 
+  // ------------------------------------------------------------------------ //
   // Connect input edges.
   //
   // We create one Identity node for each input. Then, we connect inputs[i] to
@@ -1501,7 +1704,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
   // The added identity nodes depend on "input_control_node".
   for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
     Node* arg = node_map[fbody->arg_nodes[i]->id()];
-    Node* n = AddIdentity(g, inputs[i]);
+    Node* n = identity("input", inputs[i]);
     if (input_control_node) {
       g->AddControlEdge(input_control_node, n);
     }
@@ -1516,18 +1719,20 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     g->RemoveNode(arg);  // 'arg' is disconnected.
   }
 
+  // ------------------------------------------------------------------------ //
   // Connect output edges.
   //
-  // For i-th return node in fbody->graph, we add in "g" an identity
-  // node (outputs[i-th]). We then reconnect every incoming edge into
-  // the i-th return node to the added identity node.
+  // For i-th return node in fbody->graph, we add in "g" an identity node
+  // (outputs[i-th]). We then reconnect every incoming edge into the i-th return
+  // node to the added identity node.
   //
-  // For every data edge coming out of "callee"s i-th output, we
-  // reconnect it to the i-th identity added above.
+  // For every data edge coming out of "callee"s i-th output, we reconnect it to
+  // the i-th identity added above.
   //
-  // If "callee" is control-depended upon by any other nodes, we add a
-  // NoOp node "output_control_node". "output_control_node" depends on
-  // all identity nodes added above. And nodes previously depend on
+  // If "callee" is control-depended upon by any other nodes, we add a NoOp node
+  // "output_control_node". "output_control_node" depends on all identity nodes
+  // added above or on all control return nodes (controlled by
+  // `options.output_control_src` value). And nodes previously depend on
   // "callee" is changed to depend on "output_control_node".
   std::vector<Node*> outputs(caller->num_outputs());
   for (std::size_t i = 0; i < fbody->ret_nodes.size(); ++i) {
@@ -1540,7 +1745,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
       }
     }
     CHECK(data.node != nullptr);
-    Node* n = AddIdentity(g, data);
+    Node* n = identity("output", data);
     outputs[i] = n;
     for (const Edge* e : ret->in_edges()) {
       if (e->IsControlEdge()) {
@@ -1553,9 +1758,17 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
   for (const Edge* e : caller->out_edges()) {
     if (e->IsControlEdge()) {
       if (output_control_node == nullptr) {
-        output_control_node = AddNoOp(g);
-        for (Node* n : outputs) {
-          g->AddControlEdge(n, output_control_node);
+        output_control_node = no_op("output_control_node");
+        if (options.output_control_src ==
+            InlineFunctionBodyOptions::OutputControlSource::kDataOutputs) {
+          for (Node* n : outputs) {
+            g->AddControlEdge(n, output_control_node);
+          }
+        } else {
+          for (Node* fbody_node : fbody->control_ret_nodes) {
+            Node* n = node_map[fbody_node->id()];
+            g->AddControlEdge(n, output_control_node);
+          }
         }
       }
       g->AddControlEdge(output_control_node, e->dst());
@@ -1564,37 +1777,62 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     }
   }
   g->RemoveNode(caller);  // 'caller' is replaced with inlined nodes.
+
+  return Status::OK();
 }
 
-bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
+bool IsFunctionCall(const FunctionLibraryDefinition& lib_def,
+                    const Node& node) {
+  return node.IsPartitionedCall() ||
+         node.type_string() == FunctionLibraryDefinition::kGradientOp ||
+         lib_def.Find(node.def().op()) != nullptr;
+}
+
+bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph,
+                           const ExpandInlineFunctionsOptions& options) {
   std::vector<std::pair<Node*, const FunctionBody*>> candidates;
+
   const FunctionLibraryDefinition* fld = lib->GetFunctionLibraryDefinition();
+
   for (Node* node : graph->nodes()) {
-    VLOG(3) << "Expanding " << node->DebugString();
+    // Skip nodes that are not function calls or SymbolicGradient calls.
+    if (!IsFunctionCall(*lib->GetFunctionLibraryDefinition(), *node)) {
+      continue;
+    }
+    // Skip function calls that marked noinline.
     bool noinline;
     if (fld->GetAttr(*node, kNoInlineAttr, &noinline).ok() && noinline) {
-      VLOG(3) << "noinline: " << node->DebugString();
+      VLOG(3) << "noinline: " << SummarizeNode(*node);
       continue;
     }
     FunctionLibraryRuntime::Handle handle;
-    Status s = lib->Instantiate(node->type_string(), node->attrs(), &handle);
+    Status s = InstantiateFunctionCall(node->def(), *lib, &handle);
     if (!s.ok()) {
-      // Either "node" is a primitive op, or the instantiation failed.
-      if (errors::IsNotFound(s)) {
-        VLOG(3) << "ExpandInlineFunctions " << s;
-      } else {
-        LOG(ERROR) << "ExpandInlineFunctions " << s;
-      }
+      LOG(ERROR) << "Failed to instantiate a function:  " << s.error_message();
       continue;
     }
     const FunctionBody* fbody = lib->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    candidates.push_back({node, fbody});
+    candidates.emplace_back(node, fbody);
   }
+
+  bool inlined_any = false;
   for (const auto& p : candidates) {
-    InlineFunctionBody(*fld, graph, p.first, p.second);
+    Status inlined = InlineFunctionBody(*fld, graph, p.first, p.second,
+                                        p.first->IsPartitionedCall()
+                                            ? options.multi_device_options
+                                            : options.native_options);
+    if (inlined.ok()) {
+      inlined_any = true;
+    } else {
+      VLOG(1) << "Failed to inline function call: node=" << p.first->name()
+              << " error=" << inlined.error_message();
+    }
   }
-  return !candidates.empty();
+
+  // TODO(ezhulenev): Release handles for inlined function calls.
+
+  return inlined_any;
 }
 
 string NewName(const Node* n, bool pretty) {
@@ -1684,6 +1922,7 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
       graph(g),
       arg_types(arg_t.begin(), arg_t.end()),
       ret_types(ret_t.begin(), ret_t.end()) {
+  // 1. Find regular Arg/Ret nodes.
   this->arg_nodes.resize(arg_types.size());
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
@@ -1701,6 +1940,17 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
     CHECK_LT(index, node_vec->size());
     (*node_vec)[index] = n;
   }
+  // 2. Find ControlRet nodes that must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_node_names;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_node_names.insert(control_ret.second);
+  }
+  this->control_ret_nodes.reserve(control_ret_node_names.size());
+  for (Node* n : this->graph->op_nodes()) {
+    if (control_ret_node_names.count(n->name()) > 0) {
+      this->control_ret_nodes.push_back(n);
+    }
+  }
 }
 
 FunctionBody::~FunctionBody() { delete this->graph; }
@@ -1771,8 +2021,8 @@ FunctionBody* SymbolicGradientHelper::Compute() {
   const int num_y = static_cast<int>(gbody_->ret_nodes.size());
 
   // Populate 'y_node_outputs_' with node function body outputs.
-  // Populate 'y_grad_nodes' with initial gradient nodes for each return node of
-  // the original function body (these will be 'arg' nodes in the function
+  // Populate 'y_grad_nodes' with initial gradient nodes for each return node
+  // of the original function body (these will be 'arg' nodes in the function
   // gradient body).
   std::vector<NodeOut> y_node_outputs;
   y_node_outputs.reserve(num_y);
@@ -1799,8 +2049,8 @@ FunctionBody* SymbolicGradientHelper::Compute() {
   }
 
   // Call AddSymbolicGradients which will add nodes to graph 'g' that
-  // compute the function gradient (adding an entry in 'x_grad_node_outputs' for
-  // each node in 'x_node_outputs').
+  // compute the function gradient (adding an entry in 'x_grad_node_outputs'
+  // for each node in 'x_node_outputs').
   std::vector<NodeOut> x_grad_node_outputs;
   TF_CHECK_OK(AddSymbolicGradients(y_node_outputs, x_node_outputs,
                                    y_grad_node_outputs, &x_grad_node_outputs,
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index df884f7577ddb768ca3dbb9fb067b0b55cd2d2a4..86b4d21cb1dec625c95db07948a089f99b316d25 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -79,6 +79,7 @@ struct FunctionBody {
   DataTypeVector ret_types;
   gtl::InlinedVector<Node*, 4> arg_nodes;
   gtl::InlinedVector<Node*, 4> ret_nodes;
+  gtl::InlinedVector<Node*, 4> control_ret_nodes;
 
   FunctionBody() {}
   FunctionBody(const FunctionDef& f, DataTypeSlice arg_types,
@@ -97,6 +98,17 @@ string DebugString(const Graph* instantiated_func_graph);
 //   1. not stateful; and
 //   2. not _Arg; and
 //   3. not reachable from _Retval.
+//
+// This function is triggered by function inlining, unlike 'PruneFunctionBody'
+// it doesn't preserve nodes that are reachable from control returns. Function
+// inlining is responsible for connecting control return nodes with the nodes
+// that have input control edges from the inlined function call node.
+//
+// Assuming that automatic control dependency tracking is correct, absence of
+// outgoing control edge from the function call node means that no one needs to
+// observe side-effect that might have been generated by the function (see
+// documentation in common_runtime/function.cc for details).
+//
 // Returns true iff any node is removed from "g".
 bool RemoveDeadNodes(Graph* g);
 
@@ -114,15 +126,6 @@ bool RemoveIdentityNodes(Graph* g);
 // Rewrites _ListToArray and _ArrayToList to a set of Identity nodes.
 bool RemoveListArrayConverter(Graph* g);
 
-// For each node in "graph", if "lib" indicates that the node is a
-// function call, inline the function body.  Returns true if at least
-// one node is inlined.
-//
-// This routine goes through "graph" nodes once and applies the
-// inlining.  The caller may decide to apply the inlining on "graph"
-// multiple times by calling ExpandInlineFunctions a few times.
-bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph);
-
 // Dump the contents of the "graph" to log files if the logging level is
 // sufficiently high.
 void DumpGraph(StringPiece label, const Graph* g);
@@ -156,13 +159,127 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty = false);
 // TODO(zhifengc): Asks math expert to say the comment again.
 FunctionBody* SymbolicGradient(const FunctionBody& f);
 
+struct InlineFunctionBodyOptions {
+  // All nodes that have incoming control edge *from* the function call node,
+  // will be forwarded to the "output control node". There are two options for
+  // choosing which nodes will have a control edge *to* the "output control
+  // node":
+  //   a) control returns            (`control_ret` field in FunctionDef)
+  //   b) data returns               (`ret` field in FunctionDef)
+  enum class OutputControlSource { kDataOutputs, kControlOutputs };
+
+  // Ignore '_noinline' function attribute.
+  bool ignore_noinline = false;
+  // If 'true' function inlining will override explicitly specified devices
+  // inside function body with the caller node device.
+  bool override_device = false;
+  // For compatibility with Tensorflow v1 by default we will use data outputs.
+  // Control returns were added to Tensorflow v2 with automatic control
+  // dependencies tracking in Eager mode.
+  OutputControlSource output_control_src = OutputControlSource::kDataOutputs;
+
+  // A human-readable debug string for this options.
+  string DebugString() const;
+};
+
+// Returns 'Status::OK()' iff the function '*fbody' can be inlined at 'node'
+// based on the type signature of 'node' and 'fbody':
+//
+// (1) Caller node has the same number of inputs and outputs as the function.
+// (2) Caller node inputs and outputs have the same data types as function
+//     inputs and returns.
+// (3) Validation rules defined in InlineFunctionBodyOptions.
+//
+// If function can't be safely inlined, returns error message with details why
+// inlining is not possible or safe.
+Status ValidateInlining(const Node* node, const FunctionBody* fbody,
+                        const InlineFunctionBodyOptions& options);
+
 // Given a "caller" in graph "g", which is a function call of a function
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly. "override_device" specifies whether inlining should replace
 // explicitly specified devices inside fbody with the callee's device.
-void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
-                        Node* caller, const FunctionBody* fbody,
-                        bool override_device = true);
+//
+// Returns 'Status::OK()' if function was successfully inlined into the graph.
+// If function inlining is not possible returns a error with a reason, and
+// leaves the graph in unmodified state.
+Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
+                          Node* caller, const FunctionBody* fbody,
+                          const InlineFunctionBodyOptions& options);
+
+// There are three types of function calls that could be invoked during
+// *Tensorflow graph execution*:
+//
+// 1) Native function call (node.type_string() is the function name). These
+//    functions are always executed on a single-device, which is the device of
+//    the function call node.
+//
+// 2) Multi-device function calls (PartitionedCall or StatefulPartitionedCall
+//    ops) can execute on multiple devices and accept DT_RESOURCE inputs that
+//    belong to different devices. This type of functions was added in
+//    Tensorflow 2.0 Eager mode, and it has control outputs to represent
+//    side-effects that must always execute (see `control_ret` in FunctionDef).
+//
+// 3) SymbolicGradient has been deprecated for a while, but we still keep it and
+//    use `native` options for inlining for compatibility.
+//
+// We need to have distinct inlining rules for compatibility with Tensorflow v1.
+//
+// There are few other places in Tensorflow that could execute functions:
+//
+// 1) common_runtime/eager/kernel_and_device.{h,cc} - executes "top level"
+//    functions directly via function library runtime, without going through
+//    the graph.
+// 2) tf.data pipelines - also execute functions directly via function library
+//    runtime with custom executors.
+struct ExpandInlineFunctionsOptions {
+  ExpandInlineFunctionsOptions() : native_options(), multi_device_options() {
+    using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+    multi_device_options.output_control_src = OutputControlSrc::kControlOutputs;
+  }
+
+  InlineFunctionBodyOptions native_options;
+  InlineFunctionBodyOptions multi_device_options;
+};
+
+// WARNING(ezhulenev): PLEASE DO NOT USE THIS FUNCTION. This is a temporary
+// workaround that will be enabled only during the function inlining unification
+// (b/126811947). Contact ezhulenev@ if you think you need it.
+// TODO(ezhulenev): Delete this function.
+bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph,
+                           const ExpandInlineFunctionsOptions& options);
+
+// For each node in "graph", if "lib" indicates that the node is a
+// function call, inline the function body. Returns true if at least
+// one node is inlined.
+//
+// This routine goes through "graph" nodes once and applies the
+// inlining. The caller may decide to apply the inlining on "graph"
+// multiple times by calling ExpandInlineFunctions a few times.
+//
+// Function calls that can't be safely inlined into the graph (ValidateInlining
+// returns error), are ignored.
+//
+// TODO(ezhulenev): We do not FunctionLibraryRuntime for this. We need just the
+// FunctionLibraryDefinition and FunctionDefToBodyHelper to implement this (see
+// lower_function_call.cc).
+inline bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
+  return ExpandInlineFunctions(lib, graph, ExpandInlineFunctionsOptions());
+}
+
+// Extracts function name and attributes from `call_def` and invokes
+// flr->Instantiate(name, attrs, handle).
+// `call_def` can be a native function call (where the op type is the function
+// name) or a call through PartitionedCall/StatefulPartitionedCall.
+Status InstantiateFunctionCall(const NodeDef& call_def,
+                               FunctionLibraryRuntime& flr,
+                               FunctionLibraryRuntime::Handle* handle);
+
+// Returns true iff `n` represents a function call. `n` can be a native
+// function call (n.type_string() is the function name),
+// a PartitionedCall/StatefulPartitionedCall, or a SymbolicGradient (which
+// has been deprecated for a while).
+bool IsFunctionCall(const FunctionLibraryDefinition& lib_def, const Node& n);
 
 // Instantiates FunctionDef into a graph. Set *fbody to point to the
 // FunctionBody that holds the instantiated FunctionDef.
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 97e46f406cf96cc284ec14718f9500767f5e9861..15910aff92f3f425cba07803be085be14bd9040f 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -708,14 +708,14 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
     Scope s = Scope::NewRootScope();
     TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
-    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func0 = ops::Identity(s.WithOpName("Func/x4/input/_0"), x);
     auto x4_x2 = test::function::Call(&s, "x4/x2", "XTimesTwo", {func0});
     auto x4_y = test::function::Call(&s, "x4/y", "XTimesTwo", {x4_x2});
-    auto func1 = ops::Identity(s.WithOpName("Func/_1"), x4_y);
-    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto func1 = ops::Identity(s.WithOpName("Func/x4/output/_1"), x4_y);
+    auto func2 = ops::Identity(s.WithOpName("Func/y/input/_2"), func1);
     auto y_x2 = test::function::Call(&s, "y/x2", "XTimesTwo", {func2});
     auto y_y = test::function::Call(&s, "y/y", "XTimesTwo", {y_x2});
-    auto func3 = ops::Identity(s.WithOpName("Func/_3"), y_y);
+    auto func3 = ops::Identity(s.WithOpName("Func/y/output/_3"), y_y);
     auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
     GraphDef expected;
     TF_ASSERT_OK(s.ToGraphDef(&expected));
@@ -739,22 +739,22 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
     auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
     auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
     auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
-    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
-    auto func4 = ops::Identity(s.WithOpName("Func/_4"), func0);
+    auto func0 = ops::Identity(s.WithOpName("Func/x4/input/_0"), x);
+    auto func4 = ops::Identity(s.WithOpName("Func/x4/x2/input/_4"), func0);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), func4, x4_x2_scale);
-    auto func5 = ops::Identity(s.WithOpName("Func/_5"), x4_x2_y);
-    auto func6 = ops::Identity(s.WithOpName("Func/_6"), func5);
+    auto func5 = ops::Identity(s.WithOpName("Func/x4/x2/output/_5"), x4_x2_y);
+    auto func6 = ops::Identity(s.WithOpName("Func/x4/y/input/_6"), func5);
     auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), func6, x4_y_scale);
-    auto func7 = ops::Identity(s.WithOpName("Func/_7"), x4_y_y);
-    auto func1 = ops::Identity(s.WithOpName("Func/_1"), func7);
-    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
-    auto func8 = ops::Identity(s.WithOpName("Func/_8"), func2);
+    auto func7 = ops::Identity(s.WithOpName("Func/x4/y/output/_7"), x4_y_y);
+    auto func1 = ops::Identity(s.WithOpName("Func/x4/output/_1"), func7);
+    auto func2 = ops::Identity(s.WithOpName("Func/y/input/_2"), func1);
+    auto func8 = ops::Identity(s.WithOpName("Func/y/x2/input/_8"), func2);
     auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), func8, y_x2_scale);
-    auto func9 = ops::Identity(s.WithOpName("Func/_9"), y_x2_y);
-    auto func10 = ops::Identity(s.WithOpName("Func/_10"), func9);
+    auto func9 = ops::Identity(s.WithOpName("Func/y/x2/output/_9"), y_x2_y);
+    auto func10 = ops::Identity(s.WithOpName("Func/y/y/input/_10"), func9);
     auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), func10, y_y_scale);
-    auto func11 = ops::Identity(s.WithOpName("Func/_11"), y_y_y);
-    auto func3 = ops::Identity(s.WithOpName("Func/_3"), func11);
+    auto func11 = ops::Identity(s.WithOpName("Func/y/y/output/_11"), y_y_y);
+    auto func3 = ops::Identity(s.WithOpName("Func/y/output/_3"), func11);
     auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
     TF_ASSERT_OK(s.ToGraphDef(&e2));
 
@@ -801,7 +801,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
 
 // Verifies that control dependencies on the caller are added as control
 // dependencies on any function calls created by inlining.
-TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
+TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithInputControlEdges) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour()});
 
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
@@ -822,15 +822,15 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
     TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
     auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
     auto c = ops::NoOp(s.WithOpName("c"));
-    auto func0 =
-        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func0 = ops::NoOp(s.WithOpName("Func/b/input_control_node/_0")
+                               .WithControlDependencies({c}));
     auto func1 = ops::Identity(
-        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+        s.WithOpName("Func/b/input/_1").WithControlDependencies({func0}), a);
     auto b_x2 = test::function::Call(&s, "b/x2", "XTimesTwo", {func1});
     s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
     auto b_y = test::function::Call(&s, "b/y", "XTimesTwo", {b_x2});
     s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
-    auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
+    auto func2 = ops::Identity(s.WithOpName("Func/b/output/_2"), b_y);
     auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
     GraphDef expected;
     TF_ASSERT_OK(s.ToGraphDef(&expected));
@@ -846,32 +846,34 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
     TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
     auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
     auto c = ops::NoOp(s.WithOpName("c"));
-    auto func0 =
-        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func0 = ops::NoOp(s.WithOpName("Func/b/input_control_node/_0")
+                               .WithControlDependencies({c}));
     auto func1 = ops::Identity(
-        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+        s.WithOpName("Func/b/input/_1").WithControlDependencies({func0}), a);
 
-    auto func3 =
-        ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
+    auto func3 = ops::NoOp(s.WithOpName("Func/b/x2/input_control_node/_3")
+                               .WithControlDependencies({func0}));
     auto func4 = ops::Identity(
-        s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
+        s.WithOpName("Func/b/x2/input/_4").WithControlDependencies({func3}),
+        func1);
     auto b_x2_two = ops::Const(
         s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
     auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
     auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
-    auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
+    auto func5 = ops::Identity(s.WithOpName("Func/b/x2/output/_5"), b_x2_y);
 
-    auto func6 =
-        ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
+    auto func6 = ops::NoOp(s.WithOpName("Func/b/y/input_control_node/_6")
+                               .WithControlDependencies({func0}));
     auto func7 = ops::Identity(
-        s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
+        s.WithOpName("Func/b/y/input/_7").WithControlDependencies({func6}),
+        func5);
     auto b_y_two = ops::Const(
         s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
     auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
     auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
-    auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
+    auto func8 = ops::Identity(s.WithOpName("Func/b/y/output/_8"), b_y_y);
 
-    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
+    auto func2 = ops::Identity(s.WithOpName("Func/b/output/_2"), func8);
     auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
 
     GraphDef expected;
@@ -883,6 +885,99 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
   }
 }
 
+TEST_F(FunctionLibraryRuntimeTest,
+       ExpandInlineFunctionsWithOutputControlEdges) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+  using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+
+  // `add` node is not required to compute regular output `o`, but it must
+  // execute because it is in `control_ret`.
+  const FunctionDef func =
+      FDH::Create("FunctionWithControlOutputs", {"i: float"}, {"o: float"}, {},
+                  {
+                      {{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}},
+                      {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}},
+                  },
+                  /*ret_def=*/{{"o", "ret:z:0"}},
+                  /*control_ret_def=*/{{"must_execute", "add"}});
+
+  Init({func});
+
+  // Construct a graph for the function call:
+  //
+  //   a = Arg[dtype=DT_FLOAT]
+  //   b = FunctionWithControlOutputs(a)
+  //   c = NoOp(^b)
+  //   ret = RetVal(b, ^c)
+  const auto init_graph = [this](std::unique_ptr<Graph>* g) -> void {
+    g->reset(new Graph(OpRegistry::Global()));
+
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto b = test::function::Call(&s, "b", "FunctionWithControlOutputs", {a});
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto ret = ops::_Retval(s.WithOpName("ret"), b, 0);
+    s.graph()->AddControlEdge(b.node(), c.operation.node());
+    s.graph()->AddControlEdge(c.operation.node(), ret.operation.node());
+    TF_ASSERT_OK(s.ToGraph(g->get()));
+  };
+
+  std::unique_ptr<Graph> g;
+  ExpandInlineFunctionsOptions opts;
+
+  const string input_node = "Func/b/input/_0";
+  const string output_node = "Func/b/output/_1";
+  const string output_control_node = "Func/b/output_control_node/_2";
+
+  // Use data outputs as output control source.
+  opts.native_options.output_control_src = OutputControlSrc::kDataOutputs;
+
+  init_graph(&g);
+  ExpandInlineFunctions(flr0_, g.get(), opts);
+  {
+    GraphDef expected = test::function::GDef(
+        {NDef("a", "_Arg", {}, {{"T", DT_FLOAT}, {"index", 0}}),
+         NDef(input_node, "Identity", {"a"}, {{"T", DT_FLOAT}}),
+         NDef("b/add", "Add", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef("b/ret", "Mul", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef(output_node, "Identity", {"b/ret"}, {{"T", DT_FLOAT}}),
+         NDef(output_control_node, "NoOp", {"^Func/b/output/_1"}, {}),
+         NDef("c", "NoOp", {"^" + output_control_node}, {}),
+         NDef("ret", "_Retval", {output_node, "^c"},
+              {{"T", DT_FLOAT}, {"index", 0}})},
+        {func});
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  // Use control outputs as output control source.
+  opts.native_options.output_control_src = OutputControlSrc::kControlOutputs;
+
+  init_graph(&g);
+  ExpandInlineFunctions(flr0_, g.get(), opts);
+  {
+    GraphDef expected = test::function::GDef(
+        {NDef("a", "_Arg", {}, {{"T", DT_FLOAT}, {"index", 0}}),
+         NDef(input_node, "Identity", {"a"}, {{"T", DT_FLOAT}}),
+         NDef("b/add", "Add", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef("b/ret", "Mul", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef(output_node, "Identity", {"b/ret"}, {{"T", DT_FLOAT}}),
+         NDef(output_control_node, "NoOp", {"^b/add"}, {}),
+         NDef("c", "NoOp", {"^" + output_control_node}, {}),
+         NDef("ret", "_Retval", {output_node, "^c"},
+              {{"T", DT_FLOAT}, {"index", 0}})},
+        {func});
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+}
+
 TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   auto T = DT_INT32;
   FunctionDef stateful_func = FDH::Define(
@@ -945,6 +1040,48 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   EXPECT_EQ(expected_node_names, executed_node_names);
 }
 
+TEST_F(FunctionLibraryRuntimeTest, DoNotPruneControlOutputsFromBody) {
+  // `add` node is not required to compute regular output `o`, but it must
+  // execute because it is in `control_ret`.
+  const FunctionDef func =
+      FDH::Create("FunctionWithControlOutputs", {"i: float"}, {"o: float"}, {},
+                  {
+                      {{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}},
+                      {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}},
+                  },
+                  /*ret_def=*/{{"o", "ret:z:0"}},
+                  /*control_ret_def=*/{{"must_execute", "add"}});
+
+  Init({func});
+
+  auto x = test::AsTensor<float>({1.25});
+  Tensor z;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(Instantiate(flr1_, "FunctionWithControlOutputs", {}, &handle));
+
+  StepStats stats;
+  StepStatsCollector stats_collector(&stats);
+  FunctionLibraryRuntime::Options opts;
+  opts.stats_collector = &stats_collector;
+  TF_CHECK_OK(Run(flr1_, handle, opts, {x}, {&z}));
+  TF_CHECK_OK(flr1_->ReleaseHandle(handle));
+
+  TF_CHECK_OK(
+      InstantiateAndRun(flr1_, "FunctionWithControlOutputs", {}, {x}, {&z}));
+  test::ExpectTensorEqual<float>(z, test::AsTensor<float>({1.25 * 1.25}));
+
+  stats_collector.FinalizeAndSwap(&stats);
+
+  std::set<string> expected_node_names(
+      {"_SOURCE", "i", "add", "ret", "o_RetVal"});
+  std::set<string> executed_node_names;
+  for (const auto& node_stats : stats.dev_stats()[0].node_stats()) {
+    executed_node_names.insert(node_stats.node_name());
+  }
+  EXPECT_EQ(expected_node_names, executed_node_names);
+}
+
 // Constant folding generates names using a global counter.
 // This function invokes constant folding and parses the counter
 // from the generated node name.
@@ -1071,13 +1208,15 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
     auto x2 = ops::Mul(s.WithOpName("x2"), x, x);
-    auto func0 = ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies(x2));
+    auto func0 = ops::NoOp(s.WithOpName("Func/a0/input_control_node/_0")
+                               .WithControlDependencies(x2));
     auto func1 = ops::Identity(
-        s.WithOpName("Func/_1").WithControlDependencies({func0}), x);
+        s.WithOpName("Func/a0/input/_1").WithControlDependencies({func0}), x);
     auto func2 = ops::Identity(
-        s.WithOpName("Func/_2").WithControlDependencies({func0}), y);
-    auto func9 = ops::NoOp(s.WithOpName("Func/_9").WithControlDependencies(
-        {func1.output.op(), func2.output.op()}));
+        s.WithOpName("Func/a0/input/_2").WithControlDependencies({func0}), y);
+    auto func9 = ops::NoOp(
+        s.WithOpName("Func/a1/output_control_node/_9")
+            .WithControlDependencies({func1.output.op(), func2.output.op()}));
     auto y2 =
         ops::Mul(s.WithOpName("y2").WithControlDependencies({func9}), y, y);
     auto o = ops::Add(s.WithOpName("o"), x2, y2);
@@ -1343,9 +1482,9 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
     auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
     auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
     auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
-    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
-    auto func1 = ops::Identity(s.WithOpName("Func/_1"), y);
-    auto func2 = ops::Identity(s.WithOpName("Func/_2"), dz);
+    auto func0 = ops::Identity(s.WithOpName("Func/grad0/input/_0"), x);
+    auto func1 = ops::Identity(s.WithOpName("Func/grad0/input/_1"), y);
+    auto func2 = ops::Identity(s.WithOpName("Func/grad0/input/_2"), dz);
     auto grad0_z = ops::Add(s.WithOpName("grad0/z"), func0, func1);
     auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
     auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
@@ -1372,8 +1511,10 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
         std::initializer_list<Input>{func0, func1, grad0_func1[0]},
         {DT_FLOAT, DT_FLOAT}, add);
 
-    auto func3 = ops::Identity(s.WithOpName("Func/_3"), grad0_func3[0]);
-    auto func4 = ops::Identity(s.WithOpName("Func/_4"), grad0_func3[1]);
+    auto func3 =
+        ops::Identity(s.WithOpName("Func/grad0/output/_3"), grad0_func3[0]);
+    auto func4 =
+        ops::Identity(s.WithOpName("Func/grad0/output/_4"), grad0_func3[1]);
     auto dx = ops::Identity(s.WithOpName("dx"), func3);
     auto dy = ops::Identity(s.WithOpName("dy"), func4);
     auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
@@ -1686,14 +1827,14 @@ TEST(OptimizationTest, RemoveListArrayConverter) {
     auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
     auto zero = ops::Const(scope.WithOpName("zero"), 0);
     auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
-    auto func_0 = ops::Identity(scope.WithOpName("Func/_0"), s[0]);
-    auto func_1 = ops::Identity(scope.WithOpName("Func/_1"), s[1]);
-    auto func_2 = ops::Identity(scope.WithOpName("Func/_2"), s[2]);
-    auto func_3 = ops::Identity(scope.WithOpName("Func/_3"), s[3]);
+    auto func_0 = ops::Identity(scope.WithOpName("Func/a/input/_0"), s[0]);
+    auto func_1 = ops::Identity(scope.WithOpName("Func/a/input/_1"), s[1]);
+    auto func_2 = ops::Identity(scope.WithOpName("Func/a/input/_2"), s[2]);
+    auto func_3 = ops::Identity(scope.WithOpName("Func/a/input/_3"), s[3]);
     auto r = ops::Mul(scope.WithOpName("r"), func_2, func_3);
     auto l = ops::Mul(scope.WithOpName("l"), func_0, func_1);
-    auto func_4 = ops::Identity(scope.WithOpName("Func/_4"), l);
-    auto func_5 = ops::Identity(scope.WithOpName("Func/_5"), r);
+    auto func_4 = ops::Identity(scope.WithOpName("Func/x/input/_4"), l);
+    auto func_5 = ops::Identity(scope.WithOpName("Func/x/input/_5"), r);
     auto o = ops::AddN(scope.WithOpName("o"),
                        std::initializer_list<Input>{func_4, func_5});
     auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
@@ -1768,14 +1909,15 @@ TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
     Scope s = Scope::NewRootScope();
     auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
     auto dummy = ops::Const(s.WithOpName("dummy"), 0);
-    auto func_2 =
-        ops::NoOp(s.WithOpName("Func/_2").WithControlDependencies(dummy));
+    auto func_2 = ops::NoOp(s.WithOpName("Func/x/input_control_node/_2")
+                                .WithControlDependencies(dummy));
     auto func_0 = ops::Identity(
-        s.WithOpName("Func/_0").WithControlDependencies({func_2}), i);
+        s.WithOpName("Func/x/input/_0").WithControlDependencies({func_2}), i);
     auto func_1 = ops::Identity(
-        s.WithOpName("Func/_1").WithControlDependencies({func_2}), i);
-    auto func_3 = ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies(
-        {func_0.output.op(), func_1.output.op()}));
+        s.WithOpName("Func/x/input/_1").WithControlDependencies({func_2}), i);
+    auto func_3 = ops::NoOp(
+        s.WithOpName("Func/x/output_control_node/_3")
+            .WithControlDependencies({func_0.output.op(), func_1.output.op()}));
     auto o = ops::AddN(s.WithOpName("o").WithControlDependencies({func_3}),
                        std::initializer_list<Input>{func_0, func_1});
     auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 60e82ed13bc1362f40dedfb93e5c001d946bf77f..2e44a37c68f83569b78bc6a5307b7719c25ddf32 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 
@@ -36,14 +36,17 @@ namespace tensorflow {
 namespace {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: " << std::endl << stats.DebugString();
-  EXPECT_EQ(stats.bytes_in_use, bytes_in_use);
-  EXPECT_EQ(stats.max_bytes_in_use, max_bytes_in_use);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: " << std::endl << stats->DebugString();
+  EXPECT_EQ(stats->bytes_in_use, bytes_in_use);
+  EXPECT_EQ(stats->peak_bytes_in_use, peak_bytes_in_use);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
@@ -291,9 +294,10 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
     a.DeallocateRaw(existing_ptrs[i]);
   }
 
-  AllocatorStats stats;
-  a.GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+  absl::optional<AllocatorStats> stats = a.GetStats();
+  if (stats) {
+    LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
+  }
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
@@ -576,4 +580,4 @@ TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 4be1bbb7df37c1aa954ea3350f82eee5b15ad1bf..d85ca8892f6d19c2c10a5f35368a476506ecc370 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "cuda/include/cuda.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -42,7 +41,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // allocate with cudaMalloc
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
-  CUresult res = tensorflow::wrap::cuMemAlloc(&rv, num_bytes);
+  CUresult res = cuMemAlloc(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
     return nullptr;
@@ -55,8 +54,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #ifdef GOOGLE_CUDA
   // free with cudaFree
-  CUresult res =
-      tensorflow::wrap::cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+  CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemFree failed to free " << ptr;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index c22bfcea2cedab93409d761686d852a5c4bbeeb9..0727196e1ceed88063a666a6a45fb139386203aa 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -132,8 +132,8 @@ int64 GPUDebugAllocator::AllocationId(const void* ptr) {
                                        MASK_BYTES);
 }
 
-void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
@@ -208,8 +208,8 @@ size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
   return base_allocator_->AllocatedSize(ptr);
 }
 
-void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 17757a106c5c20939b2c2d3525efc1ad659c2902..fa0394c19d0f6c910aeb5847a2e765f292f9de88 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -43,7 +43,7 @@ class GPUDebugAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // For testing.
@@ -71,7 +71,7 @@ class GPUNanResetAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
  private:
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index aca08a7e33d6ea3f966e87a9f8b800f4df86e9a5..a0728c017e1a9975550c70997e5bbb276342941b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 
@@ -249,4 +249,4 @@ TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 010fdff4e90624191e8cea65013c4e547a0c3398..5332ff3582d7424fd286f741d40595cda82573bd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -15,7 +15,11 @@ limitations under the License.
 
 // TODO(opensource): Use a more generic sounding preprocessor name than
 // GOOGLE_CUDA
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
+#if TENSORFLOW_USE_ROCM
+#include "rocm/include/hip/hip_runtime.h"
+#endif
 
 #define EIGEN_USE_GPU
 
@@ -55,7 +59,11 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#if GOOGLE_CUDA
 #include "tensorflow/core/platform/cuda.h"
+#elif TENSORFLOW_USE_ROCM
+#include "tensorflow/core/platform/rocm.h"
+#endif
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -67,18 +75,36 @@ limitations under the License.
 #include "tensorflow/core/util/stream_executor_util.h"
 
 #if !defined(PLATFORM_GOOGLE)
+#if GOOGLE_CUDA
 #include "cuda/cuda_config.h"
 #endif
+#endif
 
 namespace tensorflow {
 
+#if GOOGLE_CUDA
+
+typedef cudaStream_t gpuStream_t;
+typedef cudaDeviceProp gpuDeviceProp_t;
+#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
+using se::cuda::ScopedActivateExecutorContext;
+
+#elif TENSORFLOW_USE_ROCM
+
+typedef hipStream_t gpuStream_t;
+typedef hipDeviceProp_t gpuDeviceProp_t;
+#define EIGEN_GPU_SCRATCH_SIZE (Eigen::kGpuScratchSize)
+using se::rocm::ScopedActivateExecutorContext;
+
+#endif
+
 // Eigen Ops directly allocate memory only for temporary buffers used
 // during OpKernel::Compute().  The recommended way of allocating such
 // memory is via OpKernelContext::allocate_temp().  However, Eigen Ops
 // don't have access to OpKernelContext, instead they get access to
 // memory directly through the device allocator.  As an Open Source
 // project, Eigen assumes allocator semantics similar to those of the
-// CUDA memory allocator, and may not work correctly due to race
+// CUDA or ROCm memory allocator, and may not work correctly due to race
 // conditions if used with some other allocator.  For safety, we need
 // to delay deallocation calls out of Eigen until all events on the
 // corresponding stream have completed.  The following two classes
@@ -91,7 +117,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
     Eigen::initializeDeviceProp();
   }
   ~EigenGpuStreamDevice() override {}
-  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
+  void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
                     TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                     char* scratch) {
     if (LogMemory::IsEnabled()) {
@@ -102,15 +128,15 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
     scratch_ = scratch;
     semaphore_ =
         reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
-    stream_ = cuda_stream;
+    stream_ = gpu_stream;
     allocator_ = alloc;
     PlatformGpuId platform_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()];
   }
 
-  const cudaStream_t& stream() const override { return *stream_; }
-  const cudaDeviceProp& deviceProperties() const override {
+  const gpuStream_t& stream() const override { return *stream_; }
+  const gpuDeviceProp_t& deviceProperties() const override {
     return *device_prop_;
   }
 
@@ -140,8 +166,13 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
     }
     AsyncFreeData* afData =
         new AsyncFreeData(allocator_, buffer, operation_, step_id_);
+#if GOOGLE_CUDA
     cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
     CHECK_EQ(err, cudaSuccess);
+#elif TENSORFLOW_USE_ROCM
+    hipError_t err = hipStreamAddCallback(*stream_, asyncFree, afData, 0);
+    CHECK_EQ(err, hipSuccess);
+#endif
   }
 
   // Return a pointer to a per stream scratchpad of 1024 bytes residing
@@ -165,8 +196,12 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
     const int64 step_id_;
   };
 
-  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
+#if GOOGLE_CUDA
+  static void CUDART_CB asyncFree(gpuStream_t stream, cudaError_t status,
                                   void* userData) {
+#elif TENSORFLOW_USE_ROCM
+  static void asyncFree(gpuStream_t stream, hipError_t status, void* userData) {
+#endif
     AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
     if (LogMemory::IsEnabled()) {
       LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
@@ -178,8 +213,8 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
 
   string operation_;
   int64 step_id_;
-  const cudaStream_t* stream_;          // Not owned.
-  const cudaDeviceProp* device_prop_;   // Not owned.
+  const gpuStream_t* stream_;           // Not owned.
+  const gpuDeviceProp_t* device_prop_;  // Not owned.
   ::tensorflow::Allocator* allocator_;  // Not owned.
   mutable char* scratch_;
   mutable unsigned int* semaphore_;
@@ -276,6 +311,24 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   GPUProcessState::singleton()->EnableGPUDevice();
+  pending_cap_ = options.config.gpu_options().experimental().pending_cap();
+  timestamped_allocator_ =
+      options.config.gpu_options().experimental().timestamped_allocator();
+  if (timestamped_allocator_ || pending_cap_ > 0) {
+    SharedCounter* timing_counter = nullptr;
+    if (timestamped_allocator_) {
+      // In this case the SharedCounter was already created and set in the
+      // associated Allocator, with ownership by GPUProcessState.
+      // The GPUKernelTracker will use this SharedCounter, instead of
+      // owning its own.
+      timing_counter =
+          GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
+      DCHECK(timing_counter);
+    } else {
+      DCHECK_GT(pending_cap_, 0);
+    }
+    kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
+  }
 }
 
 BaseGPUDevice::~BaseGPUDevice() {
@@ -436,7 +489,7 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph,
 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   // NOTE(tucker): We need to discriminate between Eigen GPU
   // operations and all others.  If an operation is Eigen
-  // implemented (or otherwise tries to launch a cuda kernel
+  // implemented (or otherwise tries to launch a GPU kernel
   // directly), we need to establish a stacked-scoped environment
   // that directs it to execute on the proper device.  Otherwise we
   // expect the Op to use StreamExecutor directly and correctly.  The
@@ -508,7 +561,11 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
-  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  if (pending_cap_ > 0) {
+    DCHECK(kernel_tracker_);
+    kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
+  }
+  ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
   if (context->status().ok()) {
     if (sync_every_op_) {
@@ -525,6 +582,14 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       VLOG(1) << "GpuDevice::ComputeHelper scheduled "
               << ComputeOpKernelDebugString(*op_kernel, stream_id);
     }
+    if (kernel_tracker_) {
+      GPUKernelTracker* tracker = kernel_tracker_.get();
+      DCHECK(tracker);
+      uint64 queued_count = tracker->RecordQueued();
+      em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
+        tracker->RecordTerminated(queued_count);
+      });
+    }
   } else {
     if (vlog_1) {
       VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
@@ -566,7 +631,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
   // activity is simple enough that its overhead is negligible.
   tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(),
                                    op_kernel->IsExpensive());
-  se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
+  ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -685,10 +750,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
  public:
   ConcretePerOpGpuDevice() : device_(&stream_device_) {}
 
-  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
+  void Reinitialize(OpKernelContext* context, const gpuStream_t* gpu_stream,
                     TfGpuId tf_gpu_id, Allocator* base_allocator,
                     char* scratch) {
-    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
+    stream_device_.Reinitialize(context, gpu_stream, tf_gpu_id, base_allocator,
                                 scratch);
   }
 
@@ -721,8 +786,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
       if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
         return errors::InvalidArgument(
             "Could not parse entry in 'visible_device_list': '",
-            platform_gpu_id_str, "'. visible_device_list = ",
-            visible_device_list);
+            platform_gpu_id_str,
+            "'. visible_device_list = ", visible_device_list);
       }
       if (platform_gpu_id < 0 ||
           platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
@@ -868,9 +933,9 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
   ConcretePerOpGpuDevice* concrete_device =
       static_cast<ConcretePerOpGpuDevice*>(device);
   DCHECK(concrete_device);
-  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
+  const gpuStream_t* gpu_stream = reinterpret_cast<const gpuStream_t*>(
       streams_[stream_id]->compute->implementation()->GpuStreamMemberHack());
-  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
+  concrete_device->Reinitialize(context, gpu_stream, tf_gpu_id_, allocator,
                                 scratch_[stream_id]);
 }
 
@@ -947,33 +1012,65 @@ Status BaseGPUDeviceFactory::CreateDevices(
   if (!valid_platform_gpu_ids.empty()) {
     // Save the original device.
     int original_device = 0;
+#if GOOGLE_CUDA
     cudaError_t err = cudaGetDevice(&original_device);
     if (err != cudaSuccess) {
       return errors::Internal("cudaGetDevice() failed. Status: ",
                               cudaGetErrorString(err));
     }
+#elif TENSORFLOW_USE_ROCM
+    hipError_t err = hipGetDevice(&original_device);
+    if (err != hipSuccess) {
+      return errors::Internal("hipGetDevice() failed. Status: ",
+                              hipGetErrorString(err));
+    }
+#endif
+
     // Force to implicitly initialize CUDA runtime on each valid GPU before
     // CreateGPUDevice().
     for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
+#if GOOGLE_CUDA
       err = cudaSetDevice(platform_gpu_id.value());
       if (err != cudaSuccess) {
-        return errors::Internal("cudaSetDevice() on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+        return errors::Internal(
+            "cudaSetDevice() on GPU:", platform_gpu_id.value(),
+            " failed. Status: ", cudaGetErrorString(err));
       }
       err = cudaFree(nullptr);
       if (err != cudaSuccess) {
         return errors::Internal("CUDA runtime implicit initialization on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+                                platform_gpu_id.value(),
+                                " failed. Status: ", cudaGetErrorString(err));
+      }
+#elif TENSORFLOW_USE_ROCM
+      err = hipSetDevice(platform_gpu_id.value());
+      if (err != hipSuccess) {
+        return errors::Internal(
+            "hipSetDevice() on GPU:", platform_gpu_id.value(),
+            " failed. Status: ", hipGetErrorString(err));
+      }
+      err = hipFree(nullptr);
+      if (err != hipSuccess) {
+        return errors::Internal("ROCm runtime implicit initialization on GPU:",
+                                platform_gpu_id.value(),
+                                " failed. Status: ", hipGetErrorString(err));
       }
+#endif
     }
     // Reset to the original device.
+#if GOOGLE_CUDA
     err = cudaSetDevice(original_device);
     if (err != cudaSuccess) {
       return errors::Internal("cudaSetDevice() on GPU:", original_device,
                               " failed. Status: ", cudaGetErrorString(err));
     }
+#elif TENSORFLOW_USE_ROCM
+    err = hipSetDevice(original_device);
+    if (err != hipSuccess) {
+      return errors::Internal("hipSetDevice() on GPU:", original_device,
+                              " failed. Status: ", hipGetErrorString(err));
+    }
+#endif
   }
 
   std::vector<InterconnectMap> interconnect_maps;
@@ -1063,6 +1160,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
 
 static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
                                         const se::DeviceDescription& desc) {
+#if GOOGLE_CUDA
   int cc_major;
   int cc_minor;
   if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
@@ -1074,6 +1172,11 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
                          desc.name(), ", pci bus id: ", desc.pci_bus_id(),
                          ", compute capability: ", cc_major, ".", cc_minor);
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
+#elif TENSORFLOW_USE_ROCM
+  return strings::StrCat("device: ", platform_gpu_id.value(),
+                         ", name: ", desc.name(),
+                         ", pci bus id: ", desc.pci_bus_id());
+#endif
 }
 
 Status BaseGPUDeviceFactory::CreateGPUDevice(
@@ -1100,21 +1203,24 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
                             tf_gpu_id.value(), " with ", memory_limit,
                             " bytes of memory.");
   }
-  AllocatorStats stats;
-  gpu_allocator->GetStats(&stats);
+  absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
+  if (!stats) {
+    return errors::Internal("No allocator statistics");
+  }
   // 'memory_limit' is the required memory size, but if the allocator with given
   // tf_gpu_id was created before, we'll use it instead of creating a new one
   // (as TF gpu device is a shared resource), in which case the actual memory
   // limit represented by 'stats.bytes_limit' used by that allocator may be
   // different (which should be an error).
   //
-  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
+  // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
+  int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
-      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
+      options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
-            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
+            << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(std::move(gpu_device));
@@ -1296,6 +1402,7 @@ static int GetMinGPUMultiprocessorCount(
 
 namespace {
 
+#if GOOGLE_CUDA
 struct CudaVersion {
   // Initialize from version_name in the form of "3.5"
   explicit CudaVersion(const std::string& version_name) {
@@ -1347,6 +1454,15 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
 #endif
   return cuda_caps;
 }
+#endif  // GOOGLE_CUDA
+
+#if TENSORFLOW_USE_ROCM
+std::vector<int> supported_amdgpu_isa_versions = {803, 900, 906};
+
+std::vector<int> GetSupportedAMDGPUISAVersions() {
+  return supported_amdgpu_isa_versions;
+}
+#endif  // TENSORFLOW_USE_ROCM
 
 Status EnablePeerAccess(se::Platform* platform,
                         const std::vector<PlatformGpuId>& visible_gpu_order) {
@@ -1424,6 +1540,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
       total_bytes = 0;
     }
     const auto& description = stream_exec->GetDeviceDescription();
+#if GOOGLE_CUDA
     int cc_major;
     int cc_minor;
     if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
@@ -1438,6 +1555,21 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
               << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
               << strings::HumanReadableNumBytes(total_bytes)
               << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
+#elif TENSORFLOW_USE_ROCM
+    int isa_version;
+    if (!description.rocm_amdgpu_isa_version(&isa_version)) {
+      // Logs internally on failure.
+      isa_version = 0;
+    }
+    LOG(INFO) << "Found device " << i << " with properties: "
+              << "\nname: " << description.name() << "\nAMDGPU ISA: gfx"
+              << isa_version << "\nmemoryClockRate (GHz) "
+              << description.clock_rate_ghz() << "\npciBusID "
+              << description.pci_bus_id() << "\nTotal memory: "
+              << strings::HumanReadableNumBytes(total_bytes)
+              << "\nFree memory: "
+              << strings::HumanReadableNumBytes(free_bytes);
+#endif
   }
   // Checking peering and shows matrix if more than one gpu found.
   if (new_gpu_found && visible_gpu_order.size() > 1) {
@@ -1445,6 +1577,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
   }
 
+#if GOOGLE_CUDA
   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
   if (cuda_supported_capabilities.empty()) {
     return errors::FailedPrecondition(
@@ -1452,6 +1585,15 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   }
   CudaVersion min_supported_capability = *std::min_element(
       cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
+#elif TENSORFLOW_USE_ROCM
+  auto rocm_supported_isas = GetSupportedAMDGPUISAVersions();
+  if (rocm_supported_isas.empty()) {
+    return errors::FailedPrecondition(
+        "No supported rocm capabilities in binary.");
+  }
+  int min_supported_isa =
+      *std::min_element(rocm_supported_isas.begin(), rocm_supported_isas.end());
+#endif
 
   int min_gpu_core_count =
       GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
@@ -1469,6 +1611,8 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     }
     se::StreamExecutor* se = exec_status.ValueOrDie();
     const se::DeviceDescription& desc = se->GetDeviceDescription();
+
+#if GOOGLE_CUDA
     CudaVersion device_capability;
     if (!desc.cuda_compute_capability(&device_capability.major_part,
                                       &device_capability.minor_part)) {
@@ -1489,6 +1633,23 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
                 << min_supported_capability << ".";
       continue;
     }
+#elif TENSORFLOW_USE_ROCM
+    int device_isa;
+    if (!desc.rocm_amdgpu_isa_version(&device_isa)) {
+      continue;
+    }
+    // Only GPUs with no less than the minimum supported compute capability is
+    // accepted.
+    if (device_isa < min_supported_isa) {
+      LOG(INFO) << "Ignoring visible gpu device "
+                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << ") "
+                << "with AMDGPU ISA gfx" << device_isa
+                << ". The minimum required AMDGPU ISA is gfx"
+                << min_supported_isa << ".";
+      continue;
+    }
+#endif
 
     // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
     // count than the fastest GPU are filtered out, unless they have 8 or more
@@ -1498,7 +1659,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
       LOG(INFO) << "Ignoring visible gpu device "
                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                 << ") "
-                << "with Cuda multiprocessor count: " << desc.core_count()
+                << "with core count: " << desc.core_count()
                 << ". The minimum required count is " << min_gpu_core_count
                 << ". You can adjust this requirement with the env var "
                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
@@ -1517,6 +1678,115 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   return Status::OK();
 }
 
+uint64 BaseGPUDevice::SafeAllocFrontier() {
+  if (timestamped_allocator_) {
+    return kernel_tracker_->LastTerminatedCount();
+  } else {
+    return 0;
+  }
+}
+
+int BaseGPUDevice::PendingKernels() {
+  if (kernel_tracker_) {
+    return kernel_tracker_->NumPending();
+  }
+  return 0;
+}
+
+uint64 GPUKernelTracker::RecordQueued() {
+  mutex_lock l(mu_);
+  uint64 queued_count = timing_counter_->next();
+  VLOG(2) << "RecordQueued queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_;
+  pending_kernels_[first_available_].queued_count = queued_count;
+  pending_kernels_[first_available_].terminated = false;
+  ++first_available_;
+  ++num_pending_;
+  if (first_available_ >= pending_kernels_.size()) {
+    first_available_ = 0;
+  }
+  if (first_available_ == last_completed_) {
+    // Ring buffer is full: double it.  All of the same valid PendingKernel
+    // entries exist after the copy, they are just shifted to begin
+    // at index 0 in the new array.
+    std::vector<PendingKernel> new_buffer(pending_kernels_.size() * 2);
+    for (int i = 0; i < pending_kernels_.size(); ++i) {
+      int j = (i + last_completed_) % pending_kernels_.size();
+      new_buffer[i] = pending_kernels_[j];
+    }
+    last_completed_ = 0;
+    first_available_ = pending_kernels_.size();
+    pending_kernels_.swap(new_buffer);
+    VLOG(1) << "last_completed_=" << last_completed_
+            << " first_available_=" << first_available_
+            << " num_pending_=" << num_pending_;
+  }
+  DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
+  return queued_count;
+}
+
+void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
+  mutex_lock l(mu_);
+  VLOG(2) << "RecordTerminated queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_ << " LC="
+          << ((last_completed_ >= 0)
+                  ? pending_kernels_[last_completed_].queued_count
+                  : -1);
+  DCHECK_NE(first_available_, last_completed_);
+  DCHECK_GT(num_pending_, 0);
+  // Starting just past the last completed entry, find the entry with
+  // this queued_count and mark it done.
+  int index = (last_completed_ + 1) % pending_kernels_.size();
+  while (true) {
+    if (index == first_available_) {
+      // This should never happen.
+      LOG(FATAL) << "Failed to find " << queued_count  // Crash OK
+                 << " in queue";
+    }
+    if (pending_kernels_[index].queued_count == queued_count) {
+      pending_kernels_[index].terminated = true;
+      break;
+    }
+    index = (index + 1) % pending_kernels_.size();
+  }
+  // Next move last_completed_ forward past all completed kernels.  In theory
+  // kernels should always complete in queued order so we should be able to
+  // advance the completed frontier to the last queued PendingKernel.  In
+  // practice we occassionally see the termination callbacks arrive out of order
+  // probably because of thread scheduling.  Eventually we may support out-of-
+  // order completion involving multple compute streams so here we follow a
+  // conservative approach and wait for every single callback to arrive before
+  // advancing the frontier.
+  while (true) {
+    int next_index = (last_completed_ + 1) % pending_kernels_.size();
+    if (next_index == first_available_) break;
+    if (pending_kernels_[next_index].terminated) {
+      last_completed_ = next_index;
+    } else {
+      break;
+    }
+  }
+  // Last decrease num_pending before maybe waking a waiter.
+  --num_pending_;
+  pending_decreased_.notify_one();
+}
+
+uint64 GPUKernelTracker::LastTerminatedCount() {
+  mutex_lock l(mu_);
+  if (last_completed_ < 0) {
+    // This is an edge case that can be encountered only at the beginning of
+    // execution.  There's not yet a safe threshold count. We don't want to
+    // return 0 since that bypasses the count mechanism in BFCAllocator, so
+    // return the least non-zero value.
+    return 1;
+  }
+  return pending_kernels_[last_completed_].queued_count;
+}
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index d002d02c51d073ef3019fa1659d555b5d092d883..377133043f7f64e2b98b3c718206e6e11b800abd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error This file must only be included when building with Cuda or ROCm support
 #endif
 
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+class GPUKernelTracker;
 
 class BaseGPUDevice : public LocalDevice {
  public:
@@ -96,7 +98,7 @@ class BaseGPUDevice : public LocalDevice {
                                Allocator* allocator) override;
 
   // Returns the platform GPU id of this device within the native driver system;
-  // e.g., for CUDA this is the ordinal of the GPU within the system.
+  // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
   int gpu_id() const {
     PlatformGpuId platform_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id));
@@ -114,6 +116,17 @@ class BaseGPUDevice : public LocalDevice {
     return scoped_allocator_mgr_.get();
   }
 
+  // The following two functions always return 0 unless one of the
+  // related experimental config options has been specified.
+
+  // If returned value is > 0 then GPU Memory chunks freed before this count
+  // are guaranteed not to be in use by any kernel pending on this device.
+  uint64 SafeAllocFrontier() override;
+
+  // Returns the number of kernels that have been queued for execution on
+  // the compute stream and are not yet known to have completed.
+  int PendingKernels();
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
@@ -141,6 +154,9 @@ class BaseGPUDevice : public LocalDevice {
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  int pending_cap_ = 0;
+  bool timestamped_allocator_ = false;
 
   // Initialize scractch buffers used by Eigen.
   Status InitScratchBuffers();
@@ -163,6 +179,83 @@ class BaseGPUDevice : public LocalDevice {
                               StatusCallback done);
 };
 
+// A per-compute-stream utility that keeps track of kernels that have been
+// queued for execution but may not yet have terminated, and also the queued
+// time of the most recently terminated kernel.
+class GPUKernelTracker {
+ public:
+  // If we're going to share a SharedCounter with an allocator, it's owned
+  // by the allocator because allocators are initialized once per process.
+  // Devices are per-session.
+  explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
+      : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
+    if (!timing_counter_) {
+      // There's not a preexisting counter owned by GPUProcessState, i.e.
+      // pending_cap > 0 but timestamped_allocator == false.
+      owned_counter_.reset(new SharedCounter);
+      timing_counter_ = owned_counter_.get();
+    }
+  }
+
+  // Record that a GPU kernel has just been enqueued on the compute stream.
+  // Inserts a new timing counter value in a new PendingKernel record appended
+  // to the end of the ring buffer then returns that same count.
+  uint64 RecordQueued();
+
+  // Takes a count value returned by RecordQueued and finds the corresponding
+  // PendingKernel record in the ring buffer.  Marks the kernel as completed and
+  // advances the completion frontier accordingly.
+  void RecordTerminated(uint64 at_count);
+
+  // Returns the largest timing count such that all kernels queued no
+  // later than that count are known to have terminated.
+  uint64 LastTerminatedCount();
+
+  // Returns the number of kernels enqueued that are not yet known to
+  // have terminated.
+  int NumPending() {
+    mutex_lock l(mu_);
+    return num_pending_;
+  }
+
+  // Yield current thread until number of pending kernels no longer
+  // exceeds the cap.
+  void PauseWhilePendingExceeds(int cap) {
+    mutex_lock l(mu_);
+    while (num_pending_ > cap) {
+      pending_decreased_.wait(l);
+    }
+  }
+
+ private:
+  Env* env_;
+  SharedCounter* timing_counter_;
+  std::unique_ptr<SharedCounter> owned_counter_;
+
+  // Records when a kernel was queued for execution.  Kernel launches are
+  // identified by a unique count value from a per-GPU device timing counter.
+  struct PendingKernel {
+    uint64 queued_count;
+    bool terminated;
+    PendingKernel(const PendingKernel& pk)
+        : queued_count(pk.queued_count), terminated(pk.terminated) {}
+    PendingKernel() : queued_count(0), terminated(false) {}
+  };
+  mutex mu_;
+  // Ring buffer of PendingKernel records.
+  std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
+  // Next unused slot in pending_kernels_.
+  int first_available_ GUARDED_BY(mu_) = 0;
+  // Last completed PendingKernel such that all prior PendingKernels are
+  // also completed.  With out-of-order completion there may be a mixture
+  // of completed and uncompleted entries between last_completed_ and
+  // first_available_, hence num_pending_ is not guaranteed equal to
+  // their differerence.
+  int last_completed_ GUARDED_BY(mu_) = -1;
+  int num_pending_ GUARDED_BY(mu_) = 0;
+  condition_variable pending_decreased_ GUARDED_BY(mu_);
+};
+
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
@@ -218,8 +311,8 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
   // based upon 'visible_gpu_order' which was generated by parsing
-  // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU
-  // ids.
+  // GPUOptions::visible_device_list which is a comma-separated list of CUDA or
+  // ROCm GPU ids.
   Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order,
                            std::vector<PlatformGpuId>* ids);
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 962891894ad63c40036a153ebe5d4666f0e43049..99243aa28b5a769afd96418c2daff58ecac6e386 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -45,7 +45,7 @@ class GPUDevice : public BaseGPUDevice {
     if (attr.on_host()) {
       if (attr.gpu_compatible() || force_gpu_compatible_) {
         GPUProcessState* ps = GPUProcessState::singleton();
-        return ps->GetCUDAHostAllocator(0);
+        return ps->GetGpuHostAllocator(0);
       } else {
         return cpu_allocator_;
       }
@@ -94,7 +94,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     GPUProcessState* ps = GPUProcessState::singleton();
     if (attr.gpu_compatible() || force_gpu_compatible_) {
-      return ps->GetCUDAHostAllocator(numa_node_);
+      return ps->GetGpuHostAllocator(numa_node_);
     } else {
       // Call the parent's implementation.
       return ThreadPoolDevice::GetAllocator(attr);
@@ -136,4 +136,4 @@ REGISTER_LOCAL_DEVICE_FACTORY("CPU", GPUCompatibleCPUDeviceFactory, 70);
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index ae623b2adbe152de6cbad248db234ac5469f83e1..2628cd413faf63fdf9eee82e263dabc75ca01669 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -276,6 +277,70 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   allocator->DeallocateRaw(ptr);
 }
 
+class GPUKernelTrackerTest : public ::testing::Test {
+ protected:
+  void SetUp() {
+    timing_counter_.reset(new SharedCounter);
+    kernel_tracker_.reset(
+        new GPUKernelTracker(Env::Default(), timing_counter_.get()));
+  }
+
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  std::unique_ptr<SharedCounter> timing_counter_;
+};
+
+TEST_F(GPUKernelTrackerTest, basic) {
+  EXPECT_EQ(0, kernel_tracker_->NumPending());
+  // 1 is the expected value when no kernels have yet terminated.
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  std::deque<int64> queued_counts;
+  for (int i = 0; i < 32; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+  }
+  EXPECT_EQ(32, kernel_tracker_->NumPending());
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  // Mature the kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    EXPECT_EQ(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+
+  // Next inject so many kernel events that the ring buffer needs
+  // to grow a couple of times, while maturing a few in random order
+  // to introduce gaps between last_completed_ and first_available_.
+  int64 lower_bound = timing_counter_->get();
+  for (int i = 0; i < 1111; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+    int64 upper_bound = timing_counter_->get();
+    if (0 == (i % 16)) {
+      size_t index = (random::New64() % queued_counts.size());
+      kernel_tracker_->RecordTerminated(queued_counts[index]);
+      queued_counts.erase(queued_counts.begin() + index);
+      EXPECT_LE(lower_bound, kernel_tracker_->LastTerminatedCount());
+      EXPECT_GE(upper_bound, kernel_tracker_->LastTerminatedCount());
+    }
+  }
+
+  // Next mature the remaining kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    // There may be a gap here where we find a kernel that got terminated
+    // out of order, earlier, so the LastTerminatedCount can actually
+    // jump past x.
+    EXPECT_LE(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+}
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 3c1c31aa732d373e76599cdc8fe8ae8561765c9c..6531d6d367b1407d89da16f2023f72b75903daf9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -241,7 +241,9 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // events have recorded, and then retire them.  Initial observations
 // suggest that typical behavior in a TensorFlow program is to have
 // 0-3 events pending most of the time, but there are occasionally
-// spikes of up to several hundred outstanding.
+// spikes of up to several hundred outstanding.  (If GPUKernelTracker
+// is used to cap pending kernels there should never be more than
+// that many.)
 //
 // NOTE: If all events are on the same stream, no later event will
 // complete before an earlier event, except possibly if the earlier
@@ -249,13 +251,10 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // looking past the first kPending event.  However, if we're using
 // multiple streams there may be some gain in looking deeper.
 // As a compromise, PollEvent() calls that are triggered by the queueing
-// of a single event never look past the first kPending event.  Calls
-// coming from the dedicated polling thread always sweep the full queue.
-//
-// Note that allowing the queue to grow very long could cause overall
-// GPU memory use to spike needlessly.  An alternative strategy would
-// be to throttle new Op execution until the pending event queue
-// clears.
+// of a single event never look past the first kPending event.  Consequently
+// those calls do an expected constant amount of work, unaffected by the
+// length of the pending queue.  Calls coming from the dedicated
+// polling thread always sweep the full queue.
 void EventMgr::PollEvents(bool is_dedicated_poller,
                           gtl::InlinedVector<InUse, 4>* to_free) {
   VLOG(2) << "PollEvents  free_events_ " << free_events_.size()
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_host_allocator.h
similarity index 74%
rename from tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
rename to tensorflow/core/common_runtime/gpu/gpu_host_allocator.h
index 6bd29ef775fb87d29a95211719a430156e6f7bb5..3ac579112f952e41ffa4ce4c9e96e395afbbfc68 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_host_allocator.h
@@ -13,28 +13,28 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
-// Allocator for pinned CPU RAM that is made known to CUDA for the
+// Allocator for pinned CPU RAM that is made known to GPU for the
 // purpose of efficient DMA with a GPU.
-class CUDAHostAllocator : public SubAllocator {
+class GpuHostAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
-                             const std::vector<Visitor>& alloc_visitors,
-                             const std::vector<Visitor>& free_visitors)
+  explicit GpuHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
+                            const std::vector<Visitor>& alloc_visitors,
+                            const std::vector<Visitor>& free_visitors)
       : SubAllocator(alloc_visitors, free_visitors),
         stream_exec_(stream_exec),
         numa_node_(numa_node) {
     CHECK(stream_exec_ != nullptr);
   }
-  ~CUDAHostAllocator() override {}
+  ~GpuHostAllocator() override {}
 
   void* Alloc(size_t alignment, size_t num_bytes) override {
     void* ptr = nullptr;
@@ -61,8 +61,8 @@ class CUDAHostAllocator : public SubAllocator {
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
+  TF_DISALLOW_COPY_AND_ASSIGN(GpuHostAllocator);
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_CUDA_HOST_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_HOST_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index e0ec93a98e2e4f3ba566dd591a84b2c088cbeea0..b6672c35712be9b0a1ea87b4ceb4f3a25bffac57 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -29,17 +29,27 @@ limitations under the License.
 namespace tensorflow {
 
 Status ValidateGPUMachineManager() {
-  return se::MultiPlatformManager::PlatformWithName("CUDA").status();
+  return se::MultiPlatformManager::PlatformWithName(GpuPlatformName()).status();
 }
 
 se::Platform* GPUMachineManager() {
-  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
+  auto result = se::MultiPlatformManager::PlatformWithName(GpuPlatformName());
   if (!result.ok()) {
-    LOG(FATAL) << "Could not find Platform with name CUDA";
+    LOG(FATAL) << "Could not find Platform with name " << GpuPlatformName();
     return nullptr;
   }
 
   return result.ValueOrDie();
 }
 
+string GpuPlatformName() {
+#if TENSORFLOW_USE_ROCM
+  return "ROCM";
+#else
+  // This function will return "CUDA" even when building TF without GPU support
+  // This is done to preserve existing functionality
+  return "CUDA";
+#endif
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index 4e1f06ac838deca24cce0bef19208d5984155b5e..4c8f0868df0f16bd3c8b2cd8a8338f0914e1c9b7 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
 
+#include <string>
 #include "tensorflow/core/lib/core/status.h"
 
 namespace stream_executor {
@@ -24,7 +25,7 @@ class Platform;
 
 namespace tensorflow {
 
-// Initializes the CUDA platform and returns OK if the CUDA
+// Initializes the GPU platform and returns OK if the GPU
 // platform could be initialized.
 Status ValidateGPUMachineManager();
 
@@ -34,6 +35,11 @@ Status ValidateGPUMachineManager();
 // in the process (e.g., ValidateGPUMachineManager() returns OK).
 stream_executor::Platform* GPUMachineManager();
 
+// Returns the string describing the name of the GPU platform in use.
+// This value is "CUDA" by default, and
+// "ROCM" when TF is built with `--config==rocm`
+string GpuPlatformName();
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
index 613633eb910381b530b350a22c0b557bb108e968..aad42df5f1fae91e1315f14cab1931be87d9a5ed 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef GOOGLE_CUDA
+#include "cuda/include/cuda.h"
 #define EIGEN_USE_GPU
 #endif
 
@@ -24,7 +25,10 @@ namespace tensorflow {
 void* GpuManagedAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
 #ifdef GOOGLE_CUDA
-  CHECK_EQ(cudaMallocManaged(&ptr, num_bytes), cudaSuccess);
+  CUdeviceptr result = 0;
+  CHECK_EQ(cuMemAllocManaged(&result, num_bytes, CU_MEM_ATTACH_GLOBAL),
+           CUDA_SUCCESS);
+  ptr = reinterpret_cast<void*>(result);
 #endif
   CHECK(!(reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)));
   return ptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0..ea45bfaef534e62ed33b7eb3da6480159ccf8cb2 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -18,15 +18,16 @@ limitations under the License.
 #include <cstring>
 #include <vector>
 
-#include "tensorflow/core/common_runtime/gpu/cuda_host_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
@@ -80,7 +81,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                             TfGpuId tf_gpu_id,
                                             size_t total_bytes) {
   CHECK(process_state_);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   const string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
@@ -90,7 +91,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
   }
 
   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
-  if (allocator_parts.allocator.get() == nullptr) {
+  if (allocator_parts.allocator == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -110,9 +111,15 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
         (options.per_process_gpu_memory_fraction() > 1.0 ||
          options.experimental().use_unified_memory()),
         gpu_visitors_[bus_id], {});
-    Allocator* gpu_allocator =
+    GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
+    Allocator* gpu_allocator = gpu_bfc_allocator;
+    SharedCounter* timing_counter = nullptr;
+    if (options.experimental().timestamped_allocator()) {
+      timing_counter = new SharedCounter;
+      gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    }
 
     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
@@ -137,7 +144,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       recording_allocator = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
-    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
+                       std::unique_ptr<SharedCounter>(timing_counter),
+                       sub_allocator,
                        std::unique_ptr<Allocator>(recording_allocator)};
   }
   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
@@ -146,39 +155,56 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     return allocator_parts.allocator.get();
   }
 #else
-  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
+  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
+                "--config=rocm.";
+  return nullptr;
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+}
+
+SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
+  DCHECK(process_state_);
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
+  mutex_lock l(mu_);
+  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    return nullptr;
+  }
+
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  return allocator_parts.counter.get();
+#else
   return nullptr;
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
-Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
+Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
   CHECK(process_state_);
   if (!HasGPUDevice() ||
-      !process_state_->ProcessState::FLAGS_brain_mem_reg_cuda_dma) {
+      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
     return process_state_->GetCPUAllocator(numa_node);
   }
   if (numa_node == port::kNUMANoAffinity) {
     numa_node = 0;
   }
   {
-    // Here we optimize the most common use case where cuda_host_allocators_
-    // and cuda_al_ have already been populated and since we're only reading
+    // Here we optimize the most common use case where gpu_host_allocators_
+    // have already been populated and since we're only reading
     // these vectors, we can get by with a shared lock. In the slower case,
     // we take a unique lock and populate these vectors.
     tf_shared_lock lock(mu_);
 
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
-        !cuda_host_allocators_.empty() &&
-        cuda_host_allocators_[0].recording_allocator != nullptr) {
-      return cuda_host_allocators_[0].recording_allocator.get();
+        !gpu_host_allocators_.empty() &&
+        gpu_host_allocators_[0].recording_allocator != nullptr) {
+      return gpu_host_allocators_[0].recording_allocator.get();
     }
-    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
-      return cuda_host_allocators_[0].allocator.get();
+    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
+      return gpu_host_allocators_[0].allocator.get();
     }
   }
 
   mutex_lock lock(mu_);
-  // Find the first valid StreamExecutor to request CUDA host memory
+  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
   // through, since any will work.
   //
   // This search isn't super clean, and it would be nice to use a
@@ -195,38 +221,39 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
 
   CHECK_NE(nullptr, se);
 
-  while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
-    while (cuda_host_alloc_visitors_.size() <= numa_node) {
-      cuda_host_alloc_visitors_.push_back({});
+  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
+    while (gpu_host_alloc_visitors_.size() <= numa_node) {
+      gpu_host_alloc_visitors_.push_back({});
     }
-    while (cuda_host_free_visitors_.size() <= numa_node) {
-      cuda_host_free_visitors_.push_back({});
+    while (gpu_host_free_visitors_.size() <= numa_node) {
+      gpu_host_free_visitors_.push_back({});
     }
-    SubAllocator* sub_allocator = new CUDAHostAllocator(
-        se, numa_node, cuda_host_alloc_visitors_[numa_node],
-        cuda_host_free_visitors_[numa_node]);
+    SubAllocator* sub_allocator =
+        new GpuHostAllocator(se, numa_node, gpu_host_alloc_visitors_[numa_node],
+                             gpu_host_free_visitors_[numa_node]);
     // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
-    int64 cuda_host_mem_limit_in_mb = -1;
-    Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB",
+    int64 gpu_host_mem_limit_in_mb = -1;
+    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                         1LL << 16 /*64GB max by default*/,
-                                        &cuda_host_mem_limit_in_mb);
+                                        &gpu_host_mem_limit_in_mb);
     if (!status.ok()) {
-      LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message();
+      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
     }
-    int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20);
+    int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);
     Allocator* allocator =
-        new BFCAllocator(sub_allocator, cuda_host_mem_limit,
-                         true /*allow_growth*/, "cuda_host_bfc" /*name*/);
+        new BFCAllocator(sub_allocator, gpu_host_mem_limit,
+                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);
 
     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
       allocator = new TrackingAllocator(allocator, true);
     }
-    cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
-                                     sub_allocator,
-                                     std::unique_ptr<Allocator>(nullptr)});
-    AllocatorParts& allocator_parts = cuda_host_allocators_.back();
+    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
+                                    std::unique_ptr<SharedCounter>(nullptr),
+                                    sub_allocator,
+                                    std::unique_ptr<Allocator>(nullptr)});
+    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::CPU;
@@ -240,15 +267,15 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
     }
   }
   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
-    return cuda_host_allocators_[0].recording_allocator.get();
+    return gpu_host_allocators_[0].recording_allocator.get();
   } else {
-    return cuda_host_allocators_[0].allocator.get();
+    return gpu_host_allocators_[0].allocator.get();
   }
 }
 
 void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                          const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   mutex_lock lock(mu_);
   CHECK(gpu_allocators_.empty())  // Crash OK
       << "AddGPUAllocVisitor must be called before "
@@ -258,35 +285,35 @@ void GPUProcessState::AddGPUAllocVisitor(int bus_id,
     gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
   gpu_visitors_[bus_id].push_back(visitor);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
-void GPUProcessState::AddCUDAHostAllocVisitor(
+void GPUProcessState::AddGpuHostAllocVisitor(
     int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   mutex_lock lock(mu_);
-  CHECK(cuda_host_allocators_.empty())  // Crash OK
-      << "AddCUDAHostAllocVisitor must be called before "
-         "first call to GetCUDAHostAllocator.";
-  while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) {
-    cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+  CHECK(gpu_host_allocators_.empty())  // Crash OK
+      << "AddGpuHostAllocVisitor must be called before "
+         "first call to GetGpuHostAllocator.";
+  while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
+    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
-  cuda_host_alloc_visitors_[numa_node].push_back(visitor);
-#endif  // GOOGLE_CUDA
+  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
-void GPUProcessState::AddCUDAHostFreeVisitor(
+void GPUProcessState::AddGpuHostFreeVisitor(
     int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   mutex_lock lock(mu_);
-  CHECK(cuda_host_allocators_.empty())  // Crash OK
-      << "AddCUDAHostFreeVisitor must be called before "
-         "first call to GetCUDAHostAllocator.";
-  while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) {
-    cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+  CHECK(gpu_host_allocators_.empty())  // Crash OK
+      << "AddGpuHostFreeVisitor must be called before "
+         "first call to GetGpuHostAllocator.";
+  while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
+    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
-  cuda_host_free_visitors_[numa_node].push_back(visitor);
-#endif  // GOOGLE_CUDA
+  gpu_host_free_visitors_[numa_node].push_back(visitor);
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }
 
 void GPUProcessState::TestOnlyReset() {
@@ -298,9 +325,9 @@ void GPUProcessState::TestOnlyReset() {
     gpu_device_enabled_ = false;
     gpu_allocators_.clear();
     gpu_visitors_.clear();
-    cuda_host_allocators_.clear();
-    cuda_host_alloc_visitors_.clear();
-    cuda_host_free_visitors_.clear();
+    gpu_host_allocators_.clear();
+    gpu_host_alloc_visitors_.clear();
+    gpu_host_free_visitors_.clear();
   }
 }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index df51c10c8065fa94d736c8f4dfa76faebdc8bc62..09e5575f04fee6ca4e75a1de4d0ec195d1142efd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -33,6 +34,7 @@ namespace tensorflow {
 
 class Allocator;
 class PoolAllocator;
+class SharedCounter;
 
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
@@ -81,7 +83,7 @@ class GPUProcessState {
   virtual Allocator* GetGPUAllocator(const GPUOptions& options,
                                      TfGpuId tf_gpu_id, size_t total_bytes);
 
-  virtual Allocator* GetCUDAHostAllocator(int numa_node);
+  virtual Allocator* GetGpuHostAllocator(int numa_node);
 
   // Registers a Visitor to be invoked on new chunks of memory allocated by the
   // SubAllocator of every GPU proximate to the specified bus.  The AllocVisitor
@@ -96,18 +98,20 @@ class GPUProcessState {
                                   const SubAllocator::Visitor& visitor);
 
   // Registers a Visitor to be invoked on new chunks of memory allocated by
-  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
-  virtual void AddCUDAHostAllocVisitor(int numa_node,
-                                       const SubAllocator::Visitor& visitor);
+  // the SubAllocator of the GpuHostAllocator for the given numa_node.
+  virtual void AddGpuHostAllocVisitor(int numa_node,
+                                      const SubAllocator::Visitor& visitor);
 
   // Registers a Visitor to be invoked on each chunk handed back for freeing to
-  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
-  virtual void AddCUDAHostFreeVisitor(int numa_node,
-                                      const SubAllocator::Visitor& visitor);
+  // the SubAllocator of the GpuHostAllocator for the given numa_node.
+  virtual void AddGpuHostFreeVisitor(int numa_node,
+                                     const SubAllocator::Visitor& visitor);
 
   // Returns bus_id for the given GPU id.
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
+  SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
+
  protected:
   // GPUProcessState is a singleton that should not normally be deleted except
   // at process shutdown.
@@ -132,16 +136,17 @@ class GPUProcessState {
 
   struct AllocatorParts {
     std::unique_ptr<Allocator> allocator;
+    std::unique_ptr<SharedCounter> counter;
     SubAllocator* sub_allocator;  // owned by allocator
     std::unique_ptr<Allocator> recording_allocator;
   };
   std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
   std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
 
-  std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
-  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
+  std::vector<AllocatorParts> gpu_host_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> gpu_host_alloc_visitors_
       GUARDED_BY(mu_);
-  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
+  std::vector<std::vector<SubAllocator::Visitor>> gpu_host_free_visitors_
       GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 5851360cab720b078e5d21e5e2ef82d6352f4110..56f68c8a8ed52f0482df03099cf09780c720bce7 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -150,7 +150,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
   const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
   if (total_bytes > 0) {
     tracing::ScopedAnnotation annotation("SetProtoFromGPU");
-    alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
+    alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0);
     buf = alloc->Allocate<char>(total_bytes);
     if (LogMemory::IsEnabled()) {
       LogMemory::RecordRawAllocation("SetProtoFromGPU",
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 6b2f6547b070f07b09584b5f436167a98b17beba..42b5ed959c9e6ce50e3a5f56cb1b8d20c4fc2ec1 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 
-#include "tensorflow/core/common_runtime/gpu/cuda_host_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -29,7 +29,7 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
+      new GpuHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie(),
           0 /*numa_node*/, {}, {}),
@@ -48,7 +48,7 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
+      new GpuHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie(),
           0 /*numa_node*/, {}, {}),
@@ -82,7 +82,7 @@ TEST(PoolAllocatorTest, Alignment) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
+      new GpuHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie(),
           0 /*numa_node*/, {}, {}),
@@ -142,7 +142,7 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
       };
   se::Platform* platform =
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
-  CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
+  GpuHostAllocator* sub_allocator = new GpuHostAllocator(
       platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
           .ValueOrDie(),
       0 /*numa_node*/, {alloc_visitor}, {free_visitor});
@@ -247,7 +247,7 @@ TEST(PoolAllocatorTest, Name) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
+      new GpuHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie(),
           0 /*numa_node*/, {}, {}),
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 8875883f62787dcb31daf33b076e0292c0e39d76..31455e5d952e5a2818d562ceb35b815cca27df4e 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -570,8 +571,11 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
-  Placer placer(new_graph.get(), device_set_, session_options_,
-                /* default_device= */ nullptr);
+  Placer placer(new_graph.get(), device_set_, /* default_device= */ nullptr,
+                session_options_ == nullptr ||
+                    session_options_->config.allow_soft_placement(),
+                session_options_ != nullptr &&
+                    session_options_->config.log_device_placement());
   // TODO(mrry): Consider making the Placer cancelable.
   TF_RETURN_IF_ERROR(placer.Run());
 
@@ -732,6 +736,7 @@ Status GraphExecutionState::OptimizeGraph(
 Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
                                        std::unique_ptr<ClientGraph>* out) {
   VLOG(1) << "BuildGraph";
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   if (!graph_) {
     // It is only valid to call this method directly when the original graph
     // was created with the option `place_pruned_graph == false`.
@@ -835,7 +840,7 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
-
+  metrics::UpdateGraphBuildTime(Env::Default()->NowMicros() - start_time_usecs);
   *out = std::move(dense_copy);
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 7905944fb18105e38059a892d32b9509273a7742..465cddfe1ab7b96c81e282cce04c212515fffe55 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -86,9 +86,15 @@ void GraphOptimizer::Optimize(
       DumpGraph("OptimizeCSE", g);
       changed = true;
     }
-    if (opts_.do_function_inlining() && ExpandInlineFunctions(runtime, g)) {
-      DumpGraph("ExpandInlineFunctions", g);
-      changed = true;
+    if (opts_.do_function_inlining()) {
+      ExpandInlineFunctionsOptions expand_inline_opts;
+      expand_inline_opts.native_options.override_device = true;
+
+      bool was_mutated = ExpandInlineFunctions(runtime, g, expand_inline_opts);
+      if (was_mutated) {
+        DumpGraph("ExpandInlineFunctions", g);
+        changed = true;
+      }
     }
     if (!changed) break;
   }
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index f9aef3af70bc37ca1d2a679b6f310676118ab763..13f4784ee7f050d1b1c2cb88b2c147bb1a84a643 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -92,7 +92,7 @@ class SimpleRendezvous : public Rendezvous {
 }  // namespace
 
 GraphRunner::GraphRunner(Env* env)
-    : device_deleter_(new SingleThreadedCpuDevice(env)),
+    : device_deleter_(NewSingleThreadedCpuDevice(env)),
       device_(device_deleter_.get()) {}
 GraphRunner::GraphRunner(Device* device) : device_(device) {}
 
@@ -158,9 +158,10 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   params.device = device_;
   params.function_library = function_library;
   const int producer = graph_to_run->versions().producer();
-  params.create_kernel = [this, producer](const NodeDef& ndef,
-                                          OpKernel** kernel) {
-    return CreateNonCachedKernel(device_, nullptr, ndef, producer, kernel);
+  params.create_kernel = [this, function_library, producer](const NodeDef& ndef,
+                                                            OpKernel** kernel) {
+    return CreateNonCachedKernel(device_, function_library, ndef, producer,
+                                 kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) { delete kernel; };
 
@@ -180,6 +181,9 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   // so a CollectiveExecutor should never be required.
   args.collective_executor = nullptr;
 
+  CancellationManager cancellation_manager;
+  args.cancellation_manager = &cancellation_manager;
+
   // Run the graph.
   TF_RETURN_IF_ERROR(executor->Run(args));
 
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
index eae34997d9a801ab19a81868809879dfcec914cd..a7dc9466bb31364be3dd07a707bb41581e8b11cd 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
@@ -62,7 +62,7 @@ int HierarchicalTreeBroadcaster::GetDeviceTask(
     int device_rank, const std::vector<int>& dev_per_task) {
   int num_tasks = static_cast<int>(dev_per_task.size());
   int task_lo = 0;
-  int task_hi;
+  int task_hi = -1;
   for (int ti = 0; ti < num_tasks; ti++) {
     task_hi = task_lo + dev_per_task[ti];
     if (task_lo <= device_rank && device_rank < task_hi) return ti;
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
index ceb9baad30b214e5d3bec0cdbb470474d84e7227..76392b8e59e904d3bde7739f640ab92ff53aa96b 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
@@ -41,6 +41,11 @@ class HierarchicalTreeBroadcaster : public CollectiveImplementationInterface {
   // and device_locality.  Also saves the CollectiveContext in this object.
   Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
 
+  // No-op for hierarchical tree broadcaster.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
   // Begins async execution of the hierarchical tree broadcast.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index f0656ff53332d7dd4f21d9d874846c16fb669681..d17e4b08e08c939ca7fe98280dd55a25bdb9a9c7 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -504,7 +504,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
     // instance may succeed while others fail, even if a transmission
     // failure occurs early in the operation chain.  So, when an abort
     // is specified we need to verify that at least one Op fails with
-    // the expected status and any Op that succeeds yeilds the correct
+    // the expected status and any Op that succeeds yields the correct
     // value.
     if (fail_after > 0) {
       mutex_lock l(mu_);
@@ -616,7 +616,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
         auto* dev_info = device_->tensorflow_gpu_device_info();
         CHECK(dev_info);
         dev_info->default_context->CopyCPUTensorToDevice(
-            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+            &cpu_tensor, device_, &tensor_, [&notification](Status s) {
               TF_CHECK_OK(s);
               notification.Notify();
             });
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index f1fcca194e9ef56bf7b96e6c73717db7620b9812..649d83eebececa045e65cb6f1245e75432359d00 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
@@ -27,8 +28,26 @@ limitations under the License.
 #include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
+namespace {
+
+bool OverrideGlobalThreadPoolFromEnvironment() {
+  static const bool override_global_threadpool = [] {
+    bool flag;
+    auto status = ReadBoolFromEnvVar("TF_OVERRIDE_GLOBAL_THREADPOOL",
+                                     /*default_val=*/false, &flag);
+    if (!status.ok()) {
+      LOG(ERROR) << "OverrideGlobalThreadPool: " << status.error_message();
+      return false;
+    }
+    return flag;
+  }();
+  return override_global_threadpool;
+}
+
+}  // namespace
 
 /* static */
 bool LocalDevice::use_global_threadpool_ = true;
@@ -53,15 +72,22 @@ struct LocalDevice::EigenThreadPoolInfo {
 
   explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
                                Allocator* allocator) {
+    // Use session setting if specified.
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
+    // If no session setting, use environment setting.
     if (intra_op_parallelism_threads == 0) {
-      intra_op_parallelism_threads = port::NumSchedulableCPUs();
-      if (numa_node != port::kNUMANoAffinity) {
-        // Assume that CPUs are equally distributed over available NUMA nodes.
-        // This may not be true, but there isn't currently a better way of
-        // determining the number of CPUs specific to the requested node.
-        intra_op_parallelism_threads /= port::NUMANumNodes();
+      static int env_num_threads = NumIntraOpThreadsFromEnvironment();
+      intra_op_parallelism_threads = env_num_threads;
+      // If no session setting or environment, compute a reasonable default.
+      if (intra_op_parallelism_threads == 0) {
+        intra_op_parallelism_threads = port::NumSchedulableCPUs();
+        if (numa_node != port::kNUMANoAffinity) {
+          // Assume that CPUs are equally distributed over available NUMA nodes.
+          // This may not be true, but there isn't currently a better way of
+          // determining the number of CPUs specific to the requested node.
+          intra_op_parallelism_threads /= port::NUMANumNodes();
+        }
       }
     }
     ThreadOptions thread_opts;
@@ -99,6 +125,11 @@ LocalDevice::LocalDevice(const SessionOptions& options,
   // could speed up performance and are available on the current CPU.
   port::InfoAboutUnusedCPUFeatures();
   LocalDevice::EigenThreadPoolInfo* tp_info;
+
+  if (OverrideGlobalThreadPoolFromEnvironment()) {
+    set_use_global_threadpool(false);
+  }
+
   if (use_global_threadpool_) {
     mutex_lock l(global_tp_mu_);
     if (options.config.experimental().use_numa_affinity()) {
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index 241c403087c814717d873fc3d4d4c2c4f71e50ae..d288f21eded0b8e33f4fde2da70cd1a1ed47fe41 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -85,6 +85,11 @@ class CondBuilder {
   Node* pivot_t_;
   Node* then_call_node_;
   Node* else_call_node_;
+  // Merge node that has inputs from [pivot_t, pivot_f] and control edges from
+  // [^then_call_node_, ^else_call_node_]. This node will guarantee that if
+  // then/else branch functions do not have outputs, they still will be executed
+  // for the side effects.
+  Node* branch_executed_node_;
   Graph* graph_;
   const FunctionLibraryDefinition& flib_;
   string name_;
@@ -142,11 +147,21 @@ string CondBuilder::NewName(const string& infix) {
 Status CondBuilder::AddInput(Node* src, int src_output) {
   Node* input;
   NodeDebugInfo debug_info(*src);
+  // Colocate the Switch node with the `src` node.
+  //
+  // This is to avoid unnecessary Host<->Device copies between src and the
+  // Switch node. This aligns with the implementation of legacy tf.cond in
+  // control_flow_ops.py. The legacy impl colocates the Switch with the
+  // input tensor which resets the device stack and forces the Switch to have
+  // the same device as the input node (if set) and sets the colocation _class
+  // attr. It also ignores the existing colocation constraints on the input node
+  // using colocate_with(ignore_existing=True).
   TF_RETURN_IF_ERROR(NodeBuilder(NewName(src->name()), "Switch",
                                  graph_->op_registry(), &debug_info)
                          .Input(src, src_output)
                          .Input(pred_)
-                         .Device(if_op_->requested_device())
+                         .Device(src->requested_device())
+                         .Attr("_class", {src->name()})
                          .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
   else_call_builder_.Input(input, kElseBranch);
@@ -178,12 +193,12 @@ Status CondBuilder::AddOutputs() {
   TF_RETURN_IF_ERROR(else_call_builder_.Finalize(graph_, &else_call_node_));
   graph_->AddControlEdge(pivot_f_, else_call_node_);
 
-  // Merge the outputs from the two branches.
+  // Add Merge node for each data output of the If node.
   std::vector<Node*> merges(then_call_node_->num_outputs());
   outputs_.resize(merges.size());
   for (int i = 0; i < then_call_node_->num_outputs(); ++i) {
     TF_RETURN_IF_ERROR(
-        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry(),
+        NodeBuilder(graph_->NewName("output"), "Merge", graph_->op_registry(),
                     &debug_info_)
             .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
             .Device(if_op_->requested_device())
@@ -191,6 +206,20 @@ Status CondBuilder::AddOutputs() {
     outputs_[i] = NodeOut(merges[i], 0);
   }
 
+  // Add a Merge node that will be used as a control dependency source for the
+  // lowered output node. This Merge node will guarantee that lowered else/then
+  // function calls will be executed even if they do not have data outputs.
+  //
+  // Furthermore it will guarantee that all function side effects will be
+  // executed, if the function will be inlined into the graph. Having data
+  // outputs is not enough, because they might become unused after inlining.
+  TF_RETURN_IF_ERROR(NodeBuilder(graph_->NewName("branch_executed"), "Merge",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input({pivot_t_, pivot_f_})
+                         .ControlInputs({then_call_node_, else_call_node_})
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &branch_executed_node_));
+
   TF_RETURN_IF_ERROR(BuildLoweredIfOutput());
 
   // Add outputs.
@@ -203,11 +232,12 @@ Status CondBuilder::AddOutputs() {
       graph_->AddEdge(merges[e->src_output()], 0, e->dst(), e->dst_input());
     }
   }
+
   return Status::OK();
 }
 
-Status InlineCallInGraph(Node* n, const FunctionLibraryDefinition& flib,
-                         Graph* g) {
+Status InlineCallInGraph(const absl::string_view branch_name, Node* n,
+                         const FunctionLibraryDefinition& flib, Graph* g) {
   const FunctionDef* fdef = flib.Find(n->type_string());
   CHECK(fdef != nullptr);
   FunctionBody* fbody;
@@ -219,21 +249,42 @@ Status InlineCallInGraph(Node* n, const FunctionLibraryDefinition& flib,
                               &fbody));
   // TODO(jpienaar): Improve this interface to make the need to delete it
   // explicit.
-  InlineFunctionBody(g->flib_def(), g, n, fbody, false);
+  InlineFunctionBodyOptions inline_opts;
+  inline_opts.override_device = false;
+
+  Status can_inline_function_call = ValidateInlining(n, fbody, inline_opts);
+
+  if (can_inline_function_call.ok()) {
+    TF_RETURN_IF_ERROR(
+        InlineFunctionBody(g->flib_def(), g, n, fbody, inline_opts));
+  } else {
+    VLOG(4) << "Do not inline '" << branch_name << "' branch function call: "
+            << can_inline_function_call.error_message();
+  }
+
   delete fbody;
   return Status::OK();
 }
 
 Status CondBuilder::BuildLoweredIfOutput() {
-  // Build the identity node output.
-  NodeBuilder ib(name_, "IdentityN");
-  ib.Input(outputs_).Device(if_op_->requested_device());
+  // If outputs are empty, it means that we might have only output control
+  // edges. Furthermore it's illegal to have IdentityN with empty `T`.
+  // TODO(ezhulenev): `IdentityN` node will introduce redundant Send/Recv nodes
+  // if branch functions are multi-device.
+  NodeBuilder ib(name_, outputs_.empty() ? "NoOp" : "IdentityN");
+  if (!outputs_.empty()) ib.Input(outputs_);
+  ib.Device(if_op_->requested_device());
+  ib.ControlInput(branch_executed_node_);
   return ib.Finalize(graph_, &lowered_if_output_);
 }
 
 Status CondBuilder::InlineCallNodes() {
-  TF_RETURN_IF_ERROR(InlineCallInGraph(then_call_node_, flib_, graph_));
-  TF_RETURN_IF_ERROR(InlineCallInGraph(else_call_node_, flib_, graph_));
+  // TODO(ezhulenev): Function inlining of no-output produces a graph with
+  // undefined execution.
+  if (outputs_.empty()) return Status::OK();
+
+  TF_RETURN_IF_ERROR(InlineCallInGraph("then", then_call_node_, flib_, graph_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph("else", else_call_node_, flib_, graph_));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc
index 5765e3e367f3e0bb61f087e36ea84d4e9c8b4f15..3c274a21b0563a7ecdef849a8dab849da0216afc 100644
--- a/tensorflow/core/common_runtime/lower_if_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_if_op_test.cc
@@ -20,11 +20,13 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
@@ -35,6 +37,12 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+AttrValue FuncAttr(const string& name) {
+  AttrValue attr;
+  attr.mutable_func()->set_name(name);
+  return attr;
+}
+
 Status Rewrite(std::unique_ptr<Graph>* graph) {
   FunctionLibraryDefinition flib_def((*graph)->flib_def());
   GraphOptimizationPassOptions opt_options;
@@ -60,15 +68,11 @@ TEST(LowerIfOpTest, Simple) {
   auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 1);
   Node* written_if;
   std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
-  AttrValue tb;
-  tb.mutable_func()->set_name("XTimesTwo");
-  AttrValue eb;
-  eb.mutable_func()->set_name("XTimesFour");
   TF_ASSERT_OK(NodeBuilder("if", "If", &root.graph()->flib_def())
                    .Input(pred.node())
                    .Input(inputs)
-                   .Attr("then_branch", tb)
-                   .Attr("else_branch", eb)
+                   .Attr("then_branch", FuncAttr("XTimesTwo"))
+                   .Attr("else_branch", FuncAttr("XTimesFour"))
                    .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Attr("Tout", {DT_INT32})
                    .Finalize(root.graph(), &written_if));
@@ -107,10 +111,200 @@ TEST(LowerIfOpTest, Simple) {
   }
   // One switch for predicate and one for input (A).
   ASSERT_EQ(switch_count, 2);
-  // One merge for the single output values of then and else.
+  // One merge for the single output value of then and else, and one more merge
+  // to enforce then and else function call execution (`branch_executed` node).
+  ASSERT_EQ(merge_count, 2);
+  ASSERT_EQ(node_called_if_count, 1);
+
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(false));
+    feeds.emplace(Output(a.node()), Input::Initializer(10));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(written_if)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 40);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(true));
+    feeds.emplace(Output(a.node()), Input::Initializer(10));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(written_if)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 20);
+  }
+}
+
+TEST(LowerIfOpTest, BranchFunctionsWithoutOutputs) {
+  using ::tensorflow::test::function::GDef;
+  using ::tensorflow::test::function::NDef;
+  using FDH = ::tensorflow::FunctionDefHelper;
+
+  // Wrap AssignAddVariable + Const into a function.
+  const auto assign_add = [](const string& fn_name, int v) {
+    const Tensor tensor = test::AsScalar<int32>(v);
+    return FDH::Create(
+        fn_name, {"v: resource"}, {}, {},
+        {
+            {{"c"}, "Const", {}, {{"value", tensor}, {"dtype", DT_INT32}}},
+            {{"upd"},
+             "AssignAddVariableOp",
+             {"v", "c:output"},
+             {{"dtype", DT_INT32}}},
+        },
+        /*ret_def=*/{},
+        /*control_ret_def=*/{{"side_effects", "upd"}});
+  };
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  // Add test functions for then and else branch.
+  FunctionDefLibrary f_lib_proto;
+  *(f_lib_proto.add_function()) = assign_add("AddOne", 1);
+  *(f_lib_proto.add_function()) = assign_add("AddTwo", 2);
+
+  // Construct a graph to represent following program:
+  //
+  //  (pred: bool, initial_val: int32) -> (int32) {
+  //    var = Variable(initial_value)
+  //    if pred:
+  //      var += 1    # AddOne function call
+  //    else:
+  //      var += 2    # AddTwo function call
+  //    return var
+  //  }
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+
+  auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 0);
+  auto initial_val = ops::_Arg(root.WithOpName("initial_val"), DT_INT32, 1);
+
+  auto var = ops::VarHandleOp(root.WithOpName("var"), DT_INT32, {});
+  auto init = ops::AssignVariableOp(root.WithOpName("init"), var, initial_val);
+
+  Node* if_node;
+  std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(var.node())});
+  TF_ASSERT_OK(NodeBuilder("if", "If", &root.graph()->flib_def())
+                   .Input(pred.node())
+                   .Input(inputs)
+                   .ControlInput(init.operation.node())
+                   .Attr("then_branch", FuncAttr("AddOne"))
+                   .Attr("else_branch", FuncAttr("AddTwo"))
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr("Tout", DataTypeSlice{})
+                   .Finalize(root.graph(), &if_node));
+
+  auto read = ops::ReadVariableOp(
+      root.WithOpName("read").WithControlDependencies(Output(if_node)), var,
+      DT_INT32);
+
+  TF_ASSERT_OK(root.DoShapeInference(if_node));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  // Verify the resultant graph has switch, merge and function call nodes.
+  // TODO(ezhulenev): Inlining functions with empty outputs leads to undefined
+  // graph execution.
+  int switch_count = 0;
+  int merge_count = 0;
+  int node_called_if_count = 0;
+  for (const auto* op : graph->op_nodes()) {
+    if (op->IsSwitch()) ++switch_count;
+    if (op->IsMerge()) ++merge_count;
+    if (op->name() == "if") ++node_called_if_count;
+    ASSERT_NE(op->type_string(), "If");
+  }
+  // One switch for predicate and one for input (A).
+  ASSERT_EQ(switch_count, 2);
+  // One merge for the else/then branch (`branch_executed` node).
   ASSERT_EQ(merge_count, 1);
   ASSERT_EQ(node_called_if_count, 1);
 
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(true));
+    feeds.emplace(Output(initial_val.node()), Input::Initializer(10));
+
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(read)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 11);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(false));
+    feeds.emplace(Output(initial_val.node()), Input::Initializer(10));
+
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(read)}, &out_tensors));
+    EXPECT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 12);
+  }
+}
+
+TEST(LowerIfOpTest, DoNotInlineLoweredFunction) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  FunctionDef x_times_four = test::function::XTimesFour();
+
+  // If `then` and `else` nodes can't be inlined.
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  (*x_times_four.mutable_attr())["_noinline"].set_b(true);
+
+  // Add test functions for then and else branch.
+  FunctionDefLibrary f_lib_proto;
+  *(f_lib_proto.add_function()) = x_times_two;
+  *(f_lib_proto.add_function()) = x_times_four;
+
+  // Construct simple conditional that switches on `pred` and operates only on
+  // single input `A`.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+  auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+  auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 1);
+  Node* written_if;
+  std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
+  AttrValue tb;
+  tb.mutable_func()->set_name("XTimesTwo");
+  AttrValue eb;
+  eb.mutable_func()->set_name("XTimesFour");
+  TF_ASSERT_OK(NodeBuilder("if", "If", &root.graph()->flib_def())
+                   .Input(pred.node())
+                   .Input(inputs)
+                   .Attr("then_branch", tb)
+                   .Attr("else_branch", eb)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr("Tout", {DT_INT32})
+                   .Finalize(root.graph(), &written_if));
+  TF_ASSERT_OK(root.DoShapeInference(written_if));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  // Verify that If node was lowered but branch functions were not inlined.
+  int x_times_two_count = 0;
+  int x_times_four_count = 0;
+
+  for (const auto* op : graph->op_nodes()) {
+    if (op->type_string() == x_times_two.signature().name()) {
+      x_times_two_count++;
+    }
+    if (op->type_string() == x_times_four.signature().name()) {
+      x_times_four_count++;
+    }
+    ASSERT_NE(op->type_string(), "If");
+  }
+
+  // One function for 'then' branch and one for 'else' branch.
+  ASSERT_EQ(x_times_two_count, 1);
+  ASSERT_EQ(x_times_four_count, 1);
+
   // Verify execution.
   ClientSession session(root);
   {
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 7552838ca11d5c8e863bddca4398b98caeac2759..60adc73d04fee153111371a421d33d00da834647 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -200,6 +200,7 @@ Status LowerWhileHelper::CreateEnterNodes() {
                            .Input(NodeOut(edge->src(), edge->src_output()))
                            .Attr("frame_name", name_)
                            .Attr("parallel_iterations", parallel_iterations_)
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &enter_node));
     enter_nodes_[edge->dst_input()] = enter_node;
   }
@@ -216,6 +217,7 @@ Status LowerWhileHelper::CreateEnterNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopControlInputs"), "NoOp",
                                    graph_->op_registry(), &debug_info_)
                            .ControlInputs(control_inputs)
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &incoming_control_node));
     for (Node* n : enter_nodes_) {
       graph_->AddControlEdge(incoming_control_node, n);
@@ -231,6 +233,7 @@ Status LowerWhileHelper::CreateMergeNodes() {
         NodeBuilder(NewName("merge"), "Merge", graph_->op_registry(),
                     &debug_info_)
             .Input({NodeOut(enter_node, 0), NodeOut(enter_node, 0)})
+            .Device(while_op_->requested_device())
             .Finalize(graph_, &merge_node));
     merge_nodes_.emplace_back(merge_node);
   }
@@ -241,6 +244,7 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   for (Node* merge_node : merge_nodes_) {
     cond_call_builder_.Input(NodeOut(merge_node, 0));
   }
+  cond_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(cond_call_builder_.Finalize(graph_, &cond_call_node_));
   // Add a control edge to make sure the Const nodes in the cond function
   // are in the same frame as the rest of the function, otherwise
@@ -249,6 +253,7 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopCond"), "LoopCond",
                                  graph_->op_registry(), &debug_info_)
                          .Input(NodeOut(cond_call_node_, 0))
+                         .Device(while_op_->requested_device())
                          .Finalize(graph_, &loop_cond_node_));
   return Status::OK();
 }
@@ -270,6 +275,7 @@ Status LowerWhileHelper::CreateSwitchNodes() {
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(merge_nodes_[i], 0))
                            .Input(NodeOut(loop_cond_node_, 0))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &switch_node));
     switch_nodes_.emplace_back(switch_node);
   }
@@ -280,6 +286,7 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   for (Node* switch_node : switch_nodes_) {
     body_call_builder_.Input(NodeOut(switch_node, 1));
   }
+  body_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(body_call_builder_.Finalize(graph_, &body_call_node_));
   // Add a control edge to make sure the Const nodes in the body function
   // are in the same frame as the rest of the function, otherwise
@@ -296,6 +303,7 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   TF_RETURN_IF_ERROR(NodeBuilder(NewName("loop_body_control"), op_type,
                                  graph_->op_registry(), &debug_info_)
                          .Input(NodeOut(switch_nodes_[0], 1))
+                         .Device(while_op_->requested_device())
                          .Finalize(graph_, &body_control_node_));
   graph_->AddControlEdge(body_control_node_, body_call_node_);
   return Status::OK();
@@ -309,6 +317,7 @@ Status LowerWhileHelper::CreateExitNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("exit"), "Exit",
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(switch_node, 0))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &exit_node));
     exit_nodes_.emplace_back(exit_node);
     outputs.emplace_back(NodeOut(exit_node, 0));
@@ -320,6 +329,7 @@ Status LowerWhileHelper::CreateExitNodes() {
   // 2. Fetching the output of the While node by name in calls to sess.run.
   NodeBuilder ib(name_, "IdentityN", OpRegistry::Global(), &debug_info_);
   ib.Input(outputs);
+  ib.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(ib.Finalize(graph_, &lowered_while_output_));
   return Status::OK();
 }
@@ -330,6 +340,7 @@ Status LowerWhileHelper::CreateNextIterationNodes() {
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration",
                                    graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(body_call_node_, i))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &next_iteration));
     next_iterations_nodes_.emplace_back(next_iteration);
   }
@@ -362,7 +373,7 @@ string LowerWhileHelper::NewName(const string& infix) {
   return graph_->NewName(strings::StrCat(name_, "/", infix));
 }
 
-Status InlineCallInGraph(Node* n, Graph* g,
+Status InlineCallInGraph(const absl::string_view node_type, Node* n, Graph* g,
                          const FunctionLibraryDefinition& lib) {
   const FunctionDef* fdef = lib.Find(n->type_string());
   CHECK(fdef != nullptr);
@@ -375,14 +386,26 @@ Status InlineCallInGraph(Node* n, Graph* g,
                               &fbody));
   // TODO(jpienaar): Improve this interface to make the need to delete it
   // explicit.
-  InlineFunctionBody(g->flib_def(), g, n, fbody, false);
+  InlineFunctionBodyOptions inline_opts;
+  inline_opts.override_device = false;
+
+  Status can_inline_function_call = ValidateInlining(n, fbody, inline_opts);
+
+  if (can_inline_function_call.ok()) {
+    TF_RETURN_IF_ERROR(
+        InlineFunctionBody(g->flib_def(), g, n, fbody, inline_opts));
+  } else {
+    VLOG(4) << "Do not inline '" << node_type
+            << "' function call: " << can_inline_function_call.error_message();
+  }
+
   delete fbody;
   return Status::OK();
 }
 
 Status LowerWhileHelper::InlineCallNodes() {
-  TF_RETURN_IF_ERROR(InlineCallInGraph(cond_call_node_, graph_, flib_));
-  TF_RETURN_IF_ERROR(InlineCallInGraph(body_call_node_, graph_, flib_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph("cond", cond_call_node_, graph_, flib_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph("body", body_call_node_, graph_, flib_));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc
index fcb10bc75dbe574efee9c4c28ab00dcb55c194d3..02ccd23bd906d4d54d4c88feaa14343dcdabb3d3 100644
--- a/tensorflow/core/common_runtime/lower_while_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_while_op_test.cc
@@ -242,5 +242,79 @@ TEST(LowerWhileOpTest, MultipleInputs) {
   }
 }
 
+TEST(LowerWhileOpTest, DoNotInlineLoweredFunctions) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  FunctionDef less_than_or_eq = test::function::LessThanOrEqualToN(8);
+
+  // While loop `cond` and `body` nodes can't be inlined.
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  (*less_than_or_eq.mutable_attr())["_noinline"].set_b(true);
+
+  // Add test functions for cond and body.
+  FunctionDefLibrary f_lib_proto;
+  *f_lib_proto.add_function() = x_times_two;
+  *f_lib_proto.add_function() = less_than_or_eq;
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+  auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+  Node* while_node;
+  std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
+  AttrValue cond_func;
+  cond_func.mutable_func()->set_name("LessThanOrEqualToN");
+  AttrValue body_func;
+  body_func.mutable_func()->set_name("XTimesTwo");
+  TF_ASSERT_OK(NodeBuilder("while", "While", &root.graph()->flib_def())
+                   .Input(inputs)
+                   .Attr("T", {DT_INT32})
+                   .Attr("cond", cond_func)
+                   .Attr("body", body_func)
+                   .Attr("parallel_iterations", 100)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Finalize(root.graph(), &while_node));
+  TF_ASSERT_OK(root.DoShapeInference(while_node));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  // Verify that while node was lowered but functions were not inlined.
+  int x_times_two_count = 0;
+  int less_than_or_eq_count = 0;
+
+  for (const auto* op : graph->op_nodes()) {
+    if (op->type_string() == x_times_two.signature().name()) {
+      x_times_two_count++;
+    }
+    if (op->type_string() == less_than_or_eq.signature().name()) {
+      less_than_or_eq_count++;
+    }
+    ASSERT_NE(op->type_string(), "While");
+  }
+
+  ASSERT_EQ(x_times_two_count, 1);
+  ASSERT_EQ(less_than_or_eq_count, 1);
+
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(a.node()), Input::Initializer(1));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(while_node)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 16);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(a.node()), Input::Initializer(3));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(while_node)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 12);
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
index 362d3ceca80c6662b53d99a9a96ec6ec44a7a2a5..fcdab26d3dbc4dee7a825d581543f7619860f225 100644
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -32,18 +32,43 @@ auto* graph_run_time_usecs = monitoring::Counter<0>::New(
 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
     "/tensorflow/data/autotune", "tf.data autotuning", "name");
 
+auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/bytes_read",
+    "The number of bytes read by tf.data Dataset sources.", "name");
+
 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
     "/tensorflow/data/elements", "tf.data elements", "name");
 
 auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
     "/tensorflow/data/optimization", "tf.data optimization", "name");
 
+auto* build_graph_calls = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_calls",
+    "The number of times TensorFlow has created a new client graph. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
+auto* build_graph_time_usecs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_time_usecs",
+    "The amount of time TensorFlow has spent creating new client graphs in "
+    "microseconds. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
 }  // namespace
 
 void RecordTFDataAutotune(const string& name) {
   tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
 }
 
+void RecordTFDataBytesRead(const string& name, int64 num_bytes) {
+  tf_data_bytes_read_counter->GetCell(name)->IncrementBy(num_bytes);
+}
+
 void RecordTFDataElements(const string& name, int64 num_elements) {
   tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements);
 }
@@ -59,5 +84,12 @@ void UpdateGraphExecTime(const uint64 running_time_usecs) {
   }
 }
 
+void UpdateGraphBuildTime(const uint64 running_time_usecs) {
+  if (running_time_usecs > 0) {
+    build_graph_calls->GetCell()->IncrementBy(1);
+    build_graph_time_usecs->GetCell()->IncrementBy(running_time_usecs);
+  }
+}
+
 }  // namespace metrics
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
index 9641265362fad3399b7c787e794713338cabd29d..27d59573dc8fc6d0308e23e86882a3e8b18cde10 100644
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -21,24 +21,45 @@ limitations under the License.
 namespace tensorflow {
 namespace metrics {
 
-// Records that a tf.data dataset op executed by the program used autotuning.
+// Records that a tf.data.Dataset executed by the program used autotuning.
 //
-// The `name` argument identifies the dataset (e.g. "ParallelMap").
+// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
 void RecordTFDataAutotune(const string& name);
 
-// Records the number of elements produced by a tf.data dataset.
+// Records the number of bytes read from the filesystem by a tf.data.Dataset
+// source.
 //
-// The `name` argument identifies the dataset (e.g. "Batch" or "Map").
+// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
+void RecordTFDataBytesRead(const string& name, int64 num_bytes);
+
+// Records the number of elements produced by a tf.data.Dataset.
+//
+// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 void RecordTFDataElements(const string& name, int64 num_elements);
 
-// Records the number of independent graph changes resulting from the applicaton
-// of a tf.data optimization.
+// Records the number of independent graph changes resulting from the
+// application of a tf.data optimization.
 //
 // The `name` argument identifies the optimization (e.g. "noop_eliminiation").
 void RecordTFDataOptimization(const string& name, int64 num_changes);
 
 void UpdateGraphExecTime(const uint64 running_time_usecs);
 
+// Updates the metrics stored about time spent building graphs.
+//
+// By "GraphBuild", we refer to building a client graph, which is a sub-graph of
+// the full graph, induced by a set of options. In particular, these options
+// include the feeds and fetches requested.
+//
+// This includes time spent:
+//   * optimizing the graphs with Grappler
+//   * pruning the sub-graph (unless the place_pruned_graph option is set)
+//
+// When executing eagerly, this will not record any activity.
+//
+// TODO(jtkeeling): Should we record building/optimizing tf.functions?
+void UpdateGraphBuildTime(const uint64 running_time_usecs);
+
 }  // namespace metrics
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 429b19599b63740370ae49d7dbe9edcdf1e2c0ce..b467e7b311e3fe73d2eb094e5d92f124a8266a0b 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -39,6 +39,8 @@ typedef unsigned int uint;
 
 namespace tensorflow {
 
+static bool mkl_small_allocator_collect_stats = false;
+
 class MklSubAllocator : public BasicCPUAllocator {
  public:
   MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
@@ -62,15 +64,8 @@ class MklSmallSizeAllocator : public Allocator {
   inline string Name() override { return name_; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    void* ptr = sub_allocator_->Alloc(alignment, num_bytes);
-    if (ptr != nullptr) {
-      std::pair<void*, size_t> map_val(ptr, num_bytes);
-      mutex_lock l(mutex_);
-      // Check that insertion in the hash map was successful.
-      CHECK(map_.insert(map_val).second);
-      // Increment statistics for small-size allocations.
-      IncrementStats(num_bytes);
-    }
+    void* ptr = port::AlignedMalloc(num_bytes, alignment);
+    if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes);
     return ptr;
   }
 
@@ -80,50 +75,42 @@ class MklSmallSizeAllocator : public Allocator {
       return;
     }
 
-    mutex_lock l(mutex_);
-    auto map_iter = map_.find(ptr);
-    if (map_iter != map_.end()) {
-      // Call free visitors.
-      size_t dealloc_bytes = map_iter->second;
-      sub_allocator_->Free(ptr, dealloc_bytes);
-      DecrementStats(dealloc_bytes);
-      map_.erase(map_iter);
-    } else {
-      LOG(ERROR) << "tried to deallocate invalid pointer";
-      return;
+    if (mkl_small_allocator_collect_stats) {
+      const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
+      DecrementStats(alloc_size);
     }
+    port::AlignedFree(ptr);
   }
 
-  inline bool IsSmallSizeAllocation(const void* ptr) const {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mutex_);
-    return map_.find(ptr) != map_.end();
-  }
-
-  void GetStats(AllocatorStats* stats) override {
-    mutex_lock l(mutex_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mutex_);
-    stats_.Clear();
+    stats_.num_allocs = 0;
+    stats_.peak_bytes_in_use = 0;
+    stats_.largest_alloc_size = 0;
+    stats_.bytes_in_use = 0;
+    stats_.bytes_limit = 0;
   }
 
  private:
   // Increment statistics for the allocator handling small allocations.
-  inline void IncrementStats(size_t alloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void IncrementStats(size_t alloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     ++stats_.num_allocs;
     stats_.bytes_in_use += alloc_size;
-    stats_.max_bytes_in_use =
-        std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-    stats_.max_alloc_size =
-        std::max(alloc_size, static_cast<size_t>(stats_.max_alloc_size));
+    stats_.peak_bytes_in_use =
+        std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+    stats_.largest_alloc_size =
+        std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size));
   }
 
   // Decrement statistics for the allocator handling small allocations.
-  inline void DecrementStats(size_t dealloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void DecrementStats(size_t dealloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     stats_.bytes_in_use -= dealloc_size;
   }
 
@@ -135,10 +122,6 @@ class MklSmallSizeAllocator : public Allocator {
   // Allocator name
   string name_;
 
-  // Hash map to keep track of "small" allocations
-  // We do not use BFC allocator for small allocations.
-  std::unordered_map<const void*, size_t> map_ GUARDED_BY(mutex_);
-
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
 };
@@ -215,43 +198,72 @@ class MklCPUAllocator : public Allocator {
   }
 
   inline string Name() override { return kName; }
+  inline bool IsSmallSizeAllocation(const void* ptr) const
+      LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
+    return large_allocations_map_.find(ptr) == large_allocations_map_.end();
+  }
+  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
+  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    if (ptr != nullptr) {
+      std::pair<void*, size_t> map_val(ptr, num_bytes);
+      large_allocations_map_.insert(map_val);
+    }
+  }
+  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    auto map_iter = large_allocations_map_.find(ptr);
+    if (map_iter != large_allocations_map_.end()) {
+      large_allocations_map_.erase(map_iter);
+    } else {
+      LOG(ERROR) << "tried to deallocate invalid pointer";
+    }
+    return;
+  }
 
   inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     // If the allocation size is less than threshold, call small allocator,
     // otherwise call large-size allocator (BFC). We found that BFC allocator
     // does not deliver good performance for small allocations when
     // inter_op_parallelism_threads is high.
-    return (num_bytes < kSmallAllocationsThreshold)
-               ? small_size_allocator_->AllocateRaw(alignment, num_bytes)
-               : large_size_allocator_->AllocateRaw(alignment, num_bytes);
+    if (num_bytes < kSmallAllocationsThreshold) {
+      return small_size_allocator_->AllocateRaw(alignment, num_bytes);
+    } else {
+      mutex_lock l(mutex_);
+      void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
+      AddLargeAllocMap(ptr, num_bytes);
+      return ptr;
+    }
   }
 
   inline void DeallocateRaw(void* ptr) override {
     // Check if ptr is for "small" allocation. If it is, then call Free
     // directly. Otherwise, call BFC to handle free.
-    if (small_size_allocator_->IsSmallSizeAllocation(ptr)) {
+    if (IsSmallSizeAllocation(ptr)) {
       small_size_allocator_->DeallocateRaw(ptr);
     } else {
+      mutex_lock l(mutex_);
+      RemoveLargeAllocMap(ptr);
       large_size_allocator_->DeallocateRaw(ptr);
     }
   }
 
-  void GetStats(AllocatorStats* stats) override {
-    AllocatorStats l_stats, s_stats;
-    small_size_allocator_->GetStats(&s_stats);
-    large_size_allocator_->GetStats(&l_stats);
+  absl::optional<AllocatorStats> GetStats() override {
+    auto s_stats = small_size_allocator_->GetStats();
+    auto l_stats = large_size_allocator_->GetStats();
 
     // Combine statistics from small-size and large-size allocator.
-    stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
-    stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
-    stats->max_bytes_in_use =
-        l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
+    stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
+    stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
+    stats_.peak_bytes_in_use =
+        l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;
 
     // Since small-size allocations go to MklSmallSizeAllocator,
     // max_alloc_size from large_size_allocator would be the maximum
     // size allocated by MklCPUAllocator.
-    stats->max_alloc_size = l_stats.max_alloc_size;
-    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    stats_.largest_alloc_size = l_stats->largest_alloc_size;
+    stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
+    return stats_;
   }
 
   void ClearStats() override {
@@ -299,6 +311,13 @@ class MklCPUAllocator : public Allocator {
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
   SubAllocator* sub_allocator_;  // not owned by this class
+  mutable mutex mutex_;
+  AllocatorStats stats_ GUARDED_BY(mutex_);
+
+  // Hash map to keep track of "BFC" allocations
+  // We do not use BFC allocator for small allocations.
+  std::unordered_map<const void*, size_t> large_allocations_map_
+      GUARDED_BY(mutex_);
 
   // Size in bytes that defines the upper-bound for "small" allocations.
   // Any allocation below this threshold is "small" allocation.
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
index e08ab5763856956b435b7eb0451d8316af2d9337..ee1d9cd281bb5514074dd71ba2bdc2379c1ebfc1 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
@@ -24,22 +24,21 @@ limitations under the License.
 namespace tensorflow {
 
 TEST(MKLBFCAllocatorTest, TestMaxLimit) {
-  AllocatorStats stats;
   setenv(MklCPUAllocator::kMaxLimitStr, "1000", 1);
   MklCPUAllocator a;
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
-  EXPECT_EQ(stats.bytes_limit, 1000);
+  auto stats = a.GetStats();
+  EXPECT_EQ(stats->bytes_limit, 1000);
 
   unsetenv(MklCPUAllocator::kMaxLimitStr);
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
+  stats = a.GetStats();
   uint64 max_mem_bytes = MklCPUAllocator::kDefaultMaxLimit;
 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
   max_mem_bytes =
       (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
 #endif
-  EXPECT_EQ(stats.bytes_limit, max_mem_bytes);
+  EXPECT_EQ(stats->bytes_limit, max_mem_bytes);
 
   setenv(MklCPUAllocator::kMaxLimitStr, "wrong-input", 1);
   EXPECT_TRUE(errors::IsInvalidArgument(a.Initialize()));
diff --git a/tensorflow/core/common_runtime/optimization_registry.cc b/tensorflow/core/common_runtime/optimization_registry.cc
index 9be540b0192416b6dfa636b054bd174bb8376eec..7821d3d0607038faad86addd05f28dab8cc04bbc 100644
--- a/tensorflow/core/common_runtime/optimization_registry.cc
+++ b/tensorflow/core/common_runtime/optimization_registry.cc
@@ -41,15 +41,21 @@ Status OptimizationPassRegistry::RunGrouping(
         Status s = pass->Run(options);
         if (!s.ok()) return s;
         if (VLOG_IS_ON(1)) {
-          DumpGraphToFile(
-              strings::StrCat("after_phase_", phase.first, "_", pass->name()),
-              **options.graph);
+          if (options.graph) {
+            DumpGraphToFile(
+                strings::StrCat(
+                    "after_phase_", phase.first, "_", pass->name(), "_",
+                    reinterpret_cast<uintptr_t>((*options.graph).get())),
+                **options.graph, options.flib_def);
+          }
           if (options.partition_graphs) {
             for (auto& part : *options.partition_graphs) {
               DumpGraphToFile(
-                  strings::StrCat("after_phase_", phase.first, "_",
-                                  pass->name(), "_partition_", part.first),
-                  *part.second);
+                  strings::StrCat(
+                      "after_phase_", phase.first, "_", pass->name(),
+                      "_partition_", part.first, "_",
+                      reinterpret_cast<uintptr_t>(part.second.get())),
+                  *part.second, options.flib_def);
             }
           }
         }
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
index d51caaea8f1d12b472232718c973749e47146728..d700040f8af9a3c7bffd2f6bf41270827175b88a 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -82,12 +82,12 @@ Status UpdateArgAndRetvalMetadata(
   // in the original function.
   for (Node* node : subgraph->op_nodes()) {
     string node_type = node->type_string();
-    if (node_type == FunctionLibraryDefinition::kArgOp) {
+    if (node->IsArg()) {
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int index = static_cast<int>(attr_value->i());
       arg_indices->push_back(index);
       arg_nodes.push_back(std::make_pair(node, index));
-    } else if (node_type == FunctionLibraryDefinition::kRetOp) {
+    } else if (node->IsRetval()) {
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int index = static_cast<int>(attr_value->i());
       ret_indices->push_back(index);
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
index 0d4e36222ba7809dae73fb6eaaceda7fd497288a..705b52a46c6a86983816db9a9e146c5b4527b04b 100644
--- a/tensorflow/core/common_runtime/partitioning_utils_test.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -68,8 +68,7 @@ class PartitioningUtilsTest : public ::testing::Test {
     TF_ASSERT_OK(s.ToGraph(graph));
 
     if (assign_device) {
-      Placer placer(graph, &device_set_, nullptr, /* No session options */
-                    device0_);
+      Placer placer(graph, &device_set_, device0_);
       TF_ASSERT_OK(placer.Run());
     }
   }
@@ -85,8 +84,7 @@ class PartitioningUtilsTest : public ::testing::Test {
     auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
     auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
     TF_ASSERT_OK(s.ToGraph(graph));
-    Placer placer(graph, &device_set_, nullptr, /* No session options */
-                  device0_);
+    Placer placer(graph, &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
@@ -100,8 +98,7 @@ class PartitioningUtilsTest : public ::testing::Test {
     auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
     auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
     TF_ASSERT_OK(s.ToGraph(subgraph));
-    Placer placer(subgraph, &device_set_, nullptr, /* No session options */
-                  device0_);
+    Placer placer(subgraph, &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 515c1971d9d5cb179b7b9764ff3462579e742dfc..2fd8c64d843fd60fcd5d938a86d8c45f92feb977 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/colocation_graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -32,765 +34,14 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/port.h"
 
 namespace tensorflow {
 
 namespace {
 
-// We hoist the conversion from C-style string literal to StringPiece here,
-// so that we can avoid the many repeated calls to strlen().
-const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
-const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
-
-// Returns a list of devices having type in supported_device_types.  The
-// returned list is sorted by preferred type (higher numeric type is preferred).
-std::vector<Device*> FilterSupportedDevices(
-    const std::vector<Device*>& devices,
-    const PrioritizedDeviceTypeVector& supported_device_types,
-    const Device* default_device) {
-  Device* filtered_default_device = nullptr;
-  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
-  for (const auto& supported_device_type : supported_device_types) {
-    for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) ==
-          supported_device_type.first) {
-        if (device == default_device) {
-          filtered_default_device = device;
-        } else {
-          prioritized_filtered_devices.emplace_back(
-              device, supported_device_type.second);
-        }
-      }
-    }
-  }
-
-  auto device_sort = [](const std::pair<Device*, int32>& a,
-                        const std::pair<Device*, int32>& b) {
-    if (a.second != b.second) {
-      return a.second > b.second;
-    }
-
-    auto a_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
-    auto b_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
-    // First sort by prioritized device type (higher is preferred) and
-    // then by device name (lexicographically).
-    if (a_priority != b_priority) {
-      return a_priority > b_priority;
-    }
-    return StringPiece(a.first->name()) < StringPiece(b.first->name());
-  };
-  std::sort(prioritized_filtered_devices.begin(),
-            prioritized_filtered_devices.end(), device_sort);
-
-  std::vector<Device*> filtered_devices;
-  if (filtered_default_device != nullptr) {
-    filtered_devices.emplace_back(filtered_default_device);
-  }
-  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
-    filtered_devices.push_back(prioritized_filtered_device.first);
-  }
-  return filtered_devices;
-}
-
-// This class maintains the connected components of a colocation
-// constraint graph, and uses this information to assign a satisfying
-// device placement to the nodes of the graph.
-//
-// The typical usage pattern is:
-//
-//   Graph graph = ...;
-//   DeviceSet device_set = ...;
-//   ColocationGraph colocation_graph(graph, device_set);
-//
-//   // Add all the nodes of the `graph` to the `colocation_graph`.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
-//   }
-//
-//   // Add one or more colocation constraints.
-//   Node node_1 = *graph.FindNodeId(...);
-//   Node node_2 = *graph.FindNodeId(...);
-//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
-//
-//   // Assign devices based on the accumulated constraints.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
-//   }
-//
-// This implementation uses the Union-Find algorithm to efficiently maintain the
-// connected components and incrementally adds edges via
-// ColocationGraph::ColocateNodes() invocations.
-class ColocationGraph {
- public:
-  ColocationGraph(Graph* graph, const DeviceSet* device_set,
-                  bool allow_soft_placement, const Device* default_device)
-      : graph_(graph),
-        device_set_(device_set),
-        device_types_(device_set->PrioritizedDeviceTypeList()),
-        allow_soft_placement_(allow_soft_placement),
-        default_device_(default_device) {
-    members_.resize(graph->num_node_ids());
-  }
-
-  // Adds each node of the Graph to this ColocationGraph as a singleton.
-  //
-  // NOTE: The implementation assumes that the ids of nodes passed to
-  // this method are dense and zero-based; the memory used will be linear in
-  // the largest node ID.
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateAllNodes() {
-    // This maps from a colocation group identifier to the 'root' of that
-    // colocation group.  Note that the keys in this map are StringPiece; the
-    // actual strings are stored under the NodeDef.  The lifetime of this map
-    // is limited to this ColocateAllNodes() method, and no part of the
-    // NodeDef trees are changed during the lifetime of this method, so using
-    // StringPiece as a key is safe.
-    //
-    // Also, as a further optimization, we remove the "loc:@" prefix from
-    // "class" attribute values, when they are used as keys in this table.
-    // This allows us to use StringPiece values that refer to substrings of
-    // 'string' values stored in NodeDef attribute lists, as well as StringPiece
-    // values that refer to 'string' values from NodeDef::name(), without
-    // performing any string allocations.
-    std::unordered_map<StringPiece, const Node*, StringPieceHasher>
-        colocation_group_root;
-
-    for (Node* node : graph_->op_nodes()) {
-      // When adding the node, identify whether it is part of a colocation
-      // group.
-
-      // This code is effectively the equivalent of GetNodeAttr() for a string
-      // array, but it avoids all internal allocations (the allocation of the
-      // backing store of the std::vector<string> as well as the copies of the
-      // strings within it).  Instead, we combine the query of the colocation
-      // attribute with the calls to ColocateNodeToGroup.
-      bool found_spec = false;
-      const AttrValue* attr_value =
-          node->attrs().Find(kColocationAttrNameStringPiece);
-      if (attr_value != nullptr && attr_value->has_list()) {
-        for (const string& class_spec : attr_value->list().s()) {
-          StringPiece spec(class_spec);
-          if (str_util::ConsumePrefix(&spec,
-                                      kColocationGroupPrefixStringPiece)) {
-            found_spec = true;
-            TF_RETURN_IF_ERROR(
-                ColocateNodeToGroup(&colocation_group_root, node, spec));
-          }
-        }
-      }
-
-      if (!found_spec) {
-        // If the node does not specify a colocation group, then use the
-        // name of this node as the colocation group.
-        TF_RETURN_IF_ERROR(
-            ColocateNodeToGroup(&colocation_group_root, node, node->name()));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  Status ColocateNodeToGroup(
-      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
-          colocation_group_root,
-      Node* node, StringPiece colocation_group) {
-    const Node*& root_node = (*colocation_group_root)[colocation_group];
-    if (root_node == nullptr) {
-      // This is the first node of the colocation group, so
-      // designate this node as the 'root' of that colocation group.
-      root_node = node;
-    } else {
-      // Try to colocate the node with the root.  If there is an
-      // error, return it.
-      Status s = ColocateNodes(*node, *root_node);
-      if (!s.ok()) {
-        return AttachDef(s, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Merge the (possibly disjoint) sets containing nodes "x" and
-  // "y". Returns OK if the all nodes in the union of these sets can
-  // be placed on the same device type.
-  //
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateNodes(const Node& x, const Node& y) {
-    int x_root = FindRoot(x.id());
-    int y_root = FindRoot(y.id());
-    return ColocateNodes(x, x_root, y, y_root);
-  }
-
-  // This overload of ColocateNodes() allows a caller to provide the root node
-  // ids for the two nodes. For large graphs, this noticeably reduces the
-  // graph load time.
-  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root) {
-    if (x_root == y_root) {
-      return Status::OK();
-    }
-
-    DCHECK_EQ(x_root, FindRoot(x.id()));
-    DCHECK_EQ(y_root, FindRoot(y.id()));
-
-    Member& x_root_member = members_[x_root];
-    Member& y_root_member = members_[y_root];
-
-    // Merge the sets by setting the parent pointer of the smaller tree's root
-    // node to point to the root of the larger tree. Together with path
-    // compression in ColocationGraph::FindRoot, this ensures that we do not
-    // experience pathological performance on graphs such as chains.
-    int new_root, old_root;
-    if (x_root_member.rank < y_root_member.rank) {
-      // The tree rooted at x_root is shallower, so connect it to
-      // y_root. The rank of y_root is unchanged because its new
-      // child has strictly less rank.
-      x_root_member.parent = y_root;
-      new_root = y_root;
-      old_root = x_root;
-    } else if (x_root_member.rank > y_root_member.rank) {
-      // The tree rooted at y_root is shallower, so connect it to
-      // x_root. The rank of x_root is unchanged because its new
-      // child has strictly less rank.
-      y_root_member.parent = x_root;
-      new_root = x_root;
-      old_root = y_root;
-    } else {
-      // Both trees have the same rank, so break the tie by choosing
-      // x_root as the new root.
-      y_root_member.parent = x_root;
-      // Increment the rank of the tree rooted at x_root, because it
-      // is now strictly deeper than before.
-      ++x_root_member.rank;
-      new_root = x_root;
-      old_root = y_root;
-    }
-
-    Member& new_root_member = members_[new_root];
-    Member& old_root_member = members_[old_root];
-
-    // Merge the partial device specifications, and ensure that they are
-    // compatible. NULL options_ is treated as allowing soft placement.
-    // TODO(mrry): Consider enriching the error message by pointing
-    // out which nodes have the explicit partial device
-    // specifications that caused this conflict.
-    Status s = DeviceNameUtils::MergeDevNames(&new_root_member.device_name,
-                                              old_root_member.device_name,
-                                              allow_soft_placement_);
-    if (!s.ok()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()), ": ",
-          s.error_message());
-    }
-
-    // Ensure that the common root has at least one supported device
-    // type, by computing the intersection of
-    // new_root_member.supported_device_types and
-    // old_root_member.supported_device_types.
-    MergeSupportedDevices(&new_root_member.supported_device_types,
-                          old_root_member.supported_device_types);
-    if (new_root_member.supported_device_types.empty()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()),
-          " because no device type supports both of those nodes and the "
-          "other nodes colocated with them.",
-          DebugInfo(x_root), DebugInfo(y_root));
-    }
-
-    return Status::OK();
-  }
-
-  // For the given node, subject to the constraints previously given
-  // to this ColocationGraph, set its assigned_device_name. Returns OK
-  // if a satisfying device can be found, otherwise an error.
-  //
-  // Note: This method returns a pointer to a field within members_.
-  // The caller must not use the returned pointer after there is any possibility
-  // that the members_[i].possible_devices field has been modified.
-  Status GetDevicesForNode(Node* node,
-                           std::vector<Device*>** possible_devices) {
-    *possible_devices = nullptr;
-    const int node_root = FindRoot(node->id());
-    if (!members_[node_root].possible_devices.empty()) {
-      *possible_devices = &members_[node_root].possible_devices;
-      return Status::OK();
-    }
-
-    // We have not yet computed the possible devices for the
-    // colocated node set containing 'node', so we do so now using the
-    // constraints on the root node.
-
-    // "devices" will contain the set of feasible placements for the
-    // colocated node set containing 'node'.
-    std::vector<Device*> devices;
-    if (DeviceNameUtils::HasSomeDetails(members_[node_root].device_name)) {
-      // The root node has a (possibly partial) device
-      // specification, so enumerate the physical devices that
-      // conform to it.
-      device_set_->FindMatchingDevices(members_[node_root].device_name,
-                                       &devices);
-
-      if (!devices.empty()) {
-        // Filter devices into those that are compatible with the root
-        // node (and its children).
-        devices = FilterSupportedDevices(
-            devices, members_[node_root].supported_device_types,
-            default_device_);
-      }
-
-      // Perform soft placement if allow_soft_placement_ is set.
-      if (devices.empty() && allow_soft_placement_) {
-        // The soft_device_name is the same as the node's device name
-        // without specifying the device type or ID.
-        DeviceNameUtils::ParsedName soft_device_name =
-            members_[node_root].device_name;
-        soft_device_name.type.clear();
-        soft_device_name.has_type = false;
-        soft_device_name.has_id = false;
-        device_set_->FindMatchingDevices(soft_device_name, &devices);
-        if (!devices.empty()) {
-          devices = FilterSupportedDevices(
-              devices, members_[node_root].supported_device_types,
-              default_device_);
-        }
-      }
-
-      if (devices.empty()) {
-        // Return an error when a physical device that matches an explicit
-        // device specification is not found. This ensures that we don't
-        // assign a node to GPU when the user wanted to force it on CPU.
-        string debug_info = DebugInfo(node_root);
-
-        DeviceNameUtils::ParsedName specified_device_name;
-        if (DeviceNameUtils::ParseFullName(node->requested_device(),
-                                           &specified_device_name) &&
-            specified_device_name == members_[node_root].device_name) {
-          // The specified device and merged set device match, and
-          // will appear in the GraphDef (for debugging), so just
-          // print the specified device.
-          std::vector<Device*> devices_matching_nodedef;
-          device_set_->FindMatchingDevices(specified_device_name,
-                                           &devices_matching_nodedef);
-          if (devices_matching_nodedef.empty()) {
-            // Sometimes it is almost impossible to understand the problem
-            // without a list of available devices.
-            std::vector<string> device_names;
-            for (const Device* device : device_set_->devices()) {
-              device_names.push_back(device->name());
-            }
-            std::sort(device_names.begin(), device_names.end());
-
-            string gpu_msg = "";
-            if (!IsGoogleCudaEnabled() &&
-                str_util::Lowercase(specified_device_name.type) == "gpu") {
-              gpu_msg =
-                  " The requested device appears to be a GPU, but CUDA is not "
-                  "enabled.";
-            }
-
-            return errors::InvalidArgument(
-                errors::FormatNodeNameForError(node->name()),
-                "was explicitly assigned to ", node->requested_device(),
-                " but available devices are [ ",
-                str_util::Join(device_names, ", "), " ]. Make sure ",
-                "the device specification refers to a valid device.", gpu_msg);
-          } else if (specified_device_name.has_type) {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), "' because no supported kernel for ",
-                specified_device_name.type, " devices is available.",
-                debug_info, "\nRegistered kernels:\n",
-                KernelsRegisteredForOp(node->type_string()));
-          } else {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), debug_info);
-          }
-        } else {
-          // The specified device may be a valid device but the
-          // merged set device is different, so print both.
-          return errors::InvalidArgument(
-              "Could not satisfy explicit device specification '",
-              node->requested_device(), "' because the node ",
-              errors::FormatColocationNodeForError(node->name()),
-              " was colocated with a group of nodes that ",
-              "required incompatible device '",
-              DeviceNameUtils::ParsedNameToString(
-                  members_[node_root].device_name),
-              "'", debug_info);
-        }
-      }
-    } else {
-      // The device is completely unspecified, so enumerate the devices that
-      // support all of the nodes in the set.
-      if (device_set_->devices().empty()) {
-        return errors::Internal("No devices are registered");
-      }
-      devices = FilterSupportedDevices(
-          device_set_->devices(), members_[node_root].supported_device_types,
-          default_device_);
-
-      if (devices.empty()) {
-        return errors::InvalidArgument(
-            "Node had no OpKernel registered to support this operation: ",
-            "Operation was ", node->type_string(), " and inputs were ",
-            DataTypeVectorString(node->input_types()), DebugInfo(node_root));
-      }
-    }
-
-    // Cache the result of the possible devices for this node group.
-    members_[node_root].possible_devices = std::move(devices);
-    *possible_devices = &members_[node_root].possible_devices;
-    return Status::OK();
-  }
-
-  Status InitializeMembers() {
-    for (Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      Status status = InitializeMember(*node, &members_[node->id()]);
-      if (!status.ok()) {
-        return AttachDef(status, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Represents a node in the disjoint node set forest, and the
-  // accumulated constraints on the device used by that node.
-  struct Member {
-    Member() = default;
-    // The id of the node that is the parent of this one, or its own
-    // id if it is a root. parent <= 0 indicates that this member is invalid.
-    int parent = -1;
-
-    // A proxy for the depth of the tree that is used to prefer
-    // connecting smaller trees to larger trees when merging disjoint
-    // sets.
-    int rank = 0;
-
-    // The intersection of all device types supported by this node,
-    // and those of all of its children, in priority order
-    // of the preferred device.
-    PrioritizedDeviceTypeVector supported_device_types;
-
-    // The merged form of the device requested for this node, with
-    // those of all of its children.
-    DeviceNameUtils::ParsedName device_name;
-
-    // If this node is a root, stores a list of Devices to which this node
-    // and all of its children have been assigned, or nullptr if this
-    // has not yet been computed.
-    std::vector<Device*> possible_devices;
-  };
-
-  // Returns debugging info for the node referred to by 'node_root'.
-  string DebugInfo(const int node_root) {
-    string text(
-        "\nColocation Debug Info:\n"
-        "Colocation group had the following types and devices: ");
-
-    // If this node is part of a colocation group, then we want to
-    // collect the mapping of ops to supported devices, so that
-    // the user can see why an unsatisfiable placement occurred.
-
-    std::unordered_map<string, string> type_to_devices;
-    std::vector<const Node*> colocation_nodes;
-    int num_nodes_found = 0;
-
-    for (const Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      int id = node->id();
-      if (FindRoot(id) != node_root) {
-        continue;
-      }
-      ++num_nodes_found;
-      colocation_nodes.push_back(node);
-      const string& op_type = node->type_string();
-      string devices_registered;
-      for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered,
-                           DeviceTypeString(device_type.first), " ");
-      }
-
-      type_to_devices[op_type] = std::move(devices_registered);
-    }
-
-    for (const auto& td : type_to_devices) {
-      strings::StrAppend(&text, "\n", td.first, ": ", td.second);
-    }
-    strings::StrAppend(&text,
-                       "\n\nColocation members and user-requested devices:");
-    for (const Node* node : colocation_nodes) {
-      strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
-                         ") ", node->requested_device());
-    }
-    strings::StrAppend(&text, "\n");
-
-    if (num_nodes_found <= 1) {
-      text.clear();
-    }
-    return text;
-  }
-
-  Status InitializeMember(const Node& node, Member* member) {
-    const int id = node.id();
-    DCHECK_GE(id, 0);
-    member->parent = id;
-    TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
-        device_types_, node.def(), &member->supported_device_types));
-
-    if (node.has_assigned_device_name()) {
-      // This node has already been assigned to a device, so we
-      // respect this placement, after sanity-checking it.  The
-      // device_name and supported_device_types for this node reflect
-      // the assigned device, so any nodes colocated with this node
-      // will be assigned to the same device (assuming this is
-      // possible).
-      // NOTE: Since any assignment must have been performed by
-      // the TensorFlow runtime, we consider errors in this branch to
-      // be INTERNAL.
-      const string& assigned_device_name = node.assigned_device_name();
-      if (!DeviceNameUtils::ParseFullName(assigned_device_name,
-                                          &member->device_name)) {
-        return errors::Internal("Malformed assigned device '",
-                                assigned_device_name, "'");
-      }
-      const Device* assigned_device =
-          device_set_->FindDeviceByName(assigned_device_name);
-      if (assigned_device == nullptr) {
-        return errors::Internal("Assigned device '", assigned_device_name,
-                                "' does not match any device");
-      }
-
-      for (const auto& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) ==
-            d.first) {
-          return Status::OK();
-        }
-      }
-
-      return errors::Internal("Assigned device '", assigned_device_name,
-                              "' does not have registered OpKernel support "
-                              "for ",
-                              node.type_string());
-    } else {
-      // This node has not yet been assigned to a device, so we
-      // calculate any constraints due to the set of registered
-      // kernels and any (partial) user-provided device specification
-      // in the NodeDef.
-
-      // If no kernels are registered for this op type, fail with an error.
-      if (member->supported_device_types.empty()) {
-        std::set<string> registered_device_types;
-        for (Device* d : device_set_->devices()) {
-          registered_device_types.insert(d->device_type());
-        }
-        std::vector<string> attr_key_vals;
-        for (const auto& it : node.attrs()) {
-          const string& name = it.first;
-          const AttrValue& attr_value = it.second;
-          attr_key_vals.push_back(
-              strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
-        }
-        return errors::InvalidArgument(
-            "No OpKernel was registered to support Op '", node.type_string(),
-            "' used by ", errors::FormatNodeNameForError(node.name()),
-            "with these attrs: [", str_util::Join(attr_key_vals, ", "),
-            "]\n"
-            "Registered devices: [",
-            str_util::Join(registered_device_types, ", "), "]\n",
-            "Registered kernels:\n",
-            KernelsRegisteredForOp(node.type_string()));
-      }
-
-      // If the NodeDef contains a device, then we interpret it as a
-      // (partial) device specification.
-      if (!node.requested_device().empty()) {
-        // The user has specified a device in the NodeDef, try to find a
-        // valid device matching their specification in the set of
-        // devices.
-        // NOTE: The full name may specify a device that is not in
-        // n.supported_device_types(), but we check that in AssignDevice().
-        if (!DeviceNameUtils::ParseFullName(node.requested_device(),
-                                            &member->device_name)) {
-          return errors::InvalidArgument("Malformed device specification '",
-                                         node.requested_device(), "'");
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
-    for (const auto& prioritized_device_type : device_types) {
-      if (prioritized_device_type.second != 0) return true;
-    }
-    return false;
-  }
-
-  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
-                                const PrioritizedDeviceTypeVector& b_types) {
-    if (a_types.size() != b_types.size()) {
-      return false;
-    }
-    for (int i = 0; i < a_types.size(); ++i) {
-      if (a_types[i].first != b_types[i].first) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Updates target to contain the intersection of the device types in
-  // "target" and "other".
-  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
-                                    const PrioritizedDeviceTypeVector& other) {
-    PrioritizedDeviceTypeVector temp = *target;
-    target->clear();
-
-    // Generate intersection with priorities.
-    PrioritizedDeviceTypeVector target_intersection;
-    PrioritizedDeviceTypeVector other_intersection;
-    for (const auto& prioritized_device_type : temp) {
-      bool found = false;
-      for (const auto& other_prioritized_device_type : other) {
-        if (prioritized_device_type.first ==
-            other_prioritized_device_type.first) {
-          found = true;
-          other_intersection.push_back(other_prioritized_device_type);
-          break;
-        }
-      }
-      if (found) {
-        target_intersection.push_back(prioritized_device_type);
-      }
-    }
-
-    // Sort the devices by priority order.
-    auto device_sort = [](const std::pair<DeviceType, int32>& a,
-                          const std::pair<DeviceType, int32>& b) {
-      // First look at set priorities.
-      if (a.second != b.second) {
-        return a.second > b.second;
-      }
-      // Then fallback to default priorities.
-      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
-      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
-      if (a_priority != b_priority) {
-        return a_priority > b_priority;
-      }
-      // Finally just look at the Device type strings.
-      return a.first.type_string() < b.first.type_string();
-    };
-
-    std::sort(target_intersection.begin(), target_intersection.end(),
-              device_sort);
-    std::sort(other_intersection.begin(), other_intersection.end(),
-              device_sort);
-
-    bool is_target_prioritized = HasPriorities(target_intersection);
-    bool is_other_prioritized = HasPriorities(other_intersection);
-    // If neither are prioritized then we just return the original i.e. target
-    // prioritization.
-    if (!is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    // If only one is prioritized, then we respect priorities of that in the
-    // intersection.
-    if (is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    if (!is_target_prioritized && is_other_prioritized) {
-      *target = other_intersection;
-    }
-    // If both have priorities and agree then we go with that. If the
-    // prioritization order is different, then we just fallback to the default
-    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
-    // merged priorities to 0, so that downstream merges work correctly as well.
-    if (is_target_prioritized && is_other_prioritized) {
-      bool priorities_agree =
-          ArePrioritiesSame(target_intersection, other_intersection);
-      if (priorities_agree) {
-        *target = target_intersection;
-      } else {
-        for (const auto& prioritized_device : target_intersection) {
-          target->push_back(std::make_pair(prioritized_device.first, 0));
-        }
-        std::sort(target->begin(), target->end(), device_sort);
-      }
-    }
-  }
-
-  // Returns the root node of the disjoint tree to which the node with the
-  // given id is connected.
-  int FindRoot(int node_id) {
-    Member& member = members_[node_id];
-    DCHECK_GE(member.parent, 0);
-    if (member.parent == node_id) {
-      // member.parent is the root of this disjoint tree.  Do nothing.
-    } else {
-      member.parent = FindRoot(member.parent);
-    }
-    // Now it is guaranteed that member.parent is the root of this disjoint
-    // tree.
-    DCHECK_GE(member.parent, 0);
-    return member.parent;
-  }
-
-  // Ensures that the devices of 'dst's resource and reference match the device
-  // specified for 'src', which is an input of 'dst' with a partially or fully
-  // specified device.
-  Status VerifyResourceAndRefInputsCanBeColocated(
-      const Node* dst, const Node* src,
-      const DeviceNameUtils::ParsedName& src_parsed_name) {
-    std::vector<const Edge*> edges;
-    TF_RETURN_IF_ERROR(dst->input_edges(&edges));
-    for (const Edge* edge : edges) {
-      DataType input_type = dst->input_type(edge->dst_input());
-      if (input_type == DT_RESOURCE || IsRefType(input_type)) {
-        const Node* input_node = edge->src();
-        if (input_node == src) {
-          continue;
-        }
-        const auto& input_root = members_[FindRoot(input_node->id())];
-        const auto& input_parsed_name = input_root.device_name;
-        if (DeviceNameUtils::HasSomeDetails(input_parsed_name) &&
-            !DeviceNameUtils::AreCompatibleDevNames(input_parsed_name,
-                                                    src_parsed_name)) {
-          return AttachDef(
-              errors::InvalidArgument(
-                  "Could not colocate node with its "
-                  "resource and reference inputs; devices ",
-                  DeviceNameUtils::ParsedNameToString(input_parsed_name),
-                  " and ", DeviceNameUtils::ParsedNameToString(src_parsed_name),
-                  " are not compatible."),
-              *dst);
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  Graph* const graph_;  // Not owned.
-  std::vector<Member> members_;
-  const DeviceSet* device_set_;  // Not owned.
-  const std::vector<DeviceType> device_types_;
-  const bool allow_soft_placement_;
-  const Device* default_device_;
-};
-
 // Returns true if the node has no inputs and produces outputs
 // that are consumed by a single node.
 //
@@ -802,28 +53,46 @@ bool IsGeneratorNode(const Node* node) {
          !IsRefType(node->output_type(0));
 }
 
-bool IsExemptFromResourceInputColocation(const Node* node) {
-  // Note: Partitioned function calls, which place and partition their
-  // function bodies, are exempt from this check: they forward resource and
-  // ref inputs to operations that are appropriately placed, instead of
-  // dereferencing them.
-  const string& op_type = node->op_def().name();
-  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+void LogDeviceAssignment(const Node* node, bool log_device_placement) {
+  // Log placement if log_device_placement is set.
+  if (log_device_placement) {
+    printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
+           node->assigned_device_name().c_str());
+    LOG(INFO) << node->name() << ": "
+              << "(" << node->type_string() << ")"
+              << node->assigned_device_name();
+  }
+}
+
+Status AssignAndLog(int assigned_device, Node* node,
+                    ColocationGraph* colocation_graph,
+                    bool log_device_placement) {
+  node->set_assigned_device_name_index(assigned_device);
+
+  // Constraint the group of node to the assigned device.
+  TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
+
+  LogDeviceAssignment(node, log_device_placement);
+  return Status::OK();
 }
 
 }  // namespace
 
 Placer::Placer(Graph* graph, const DeviceSet* devices,
-               const SessionOptions* options, const Device* default_device)
+               const Device* default_device, bool allow_soft_placement,
+               bool log_device_placement)
     : graph_(graph),
       devices_(devices),
-      options_(options),
-      log_device_placement_(options != nullptr &&
-                            options->config.log_device_placement()),
-      default_device_(default_device) {}
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {}
+
+Placer::Placer(Graph* graph, const DeviceSet* devices,
+               const Device* default_device)
+    : Placer(graph, devices, default_device, true, false) {}
 
 Placer::Placer(Graph* graph, const DeviceSet* devices)
-    : Placer(graph, devices, nullptr, nullptr) {}
+    : Placer(graph, devices, nullptr, true, false) {}
 
 Placer::~Placer() {}
 
@@ -832,103 +101,31 @@ Status Placer::Run() {
     return errors::FailedPrecondition("No devices are registered");
   }
 
-  ColocationGraph colocation_graph(
-      graph_, devices_,
-      options_ == nullptr || options_->config.allow_soft_placement(),
-      default_device_);
-
-  TF_RETURN_IF_ERROR(colocation_graph.InitializeMembers());
-
-  // 1. First add all of the nodes. Note that steps (1) and (2)
-  // requires two passes over the nodes because the graph (and hence
-  // the constraints) may not be acyclic.
-  TF_RETURN_IF_ERROR(colocation_graph.ColocateAllNodes());
-
-  // 2. Enumerate the constraint edges, and use them to update the disjoint
-  // node set.
-
-  // If `node` has an input edge with reference type, add an edge from the
-  // source of that edge to `node`.
-  for (const Edge* edge : graph_->edges()) {
-    if (edge->IsControlEdge()) {
-      continue;
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_input", *graph_, nullptr);
+    for (const Node* node : graph_->op_nodes()) {
+      VLOG(3) << "    " << node->name() << ": requested: '"
+              << node->requested_device() << "' assigned: '"
+              << node->assigned_device_name() << "'";
     }
-    Node* src = edge->src();
-    Node* dst = edge->dst();
-    DataType input_type = dst->input_type(edge->dst_input());
-    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
-        !IsExemptFromResourceInputColocation(dst)) {
-      // Colocate `src` and `dst` to maintain the invariant that nodes connected
-      // by reference edges are colocated.
-      int src_root_id = colocation_graph.FindRoot(src->id());
-      int dst_root_id = colocation_graph.FindRoot(dst->id());
-      auto& src_root = colocation_graph.members_[src_root_id];
-      auto& dst_root = colocation_graph.members_[dst_root_id];
-      // If both the source node and this node have partially
-      // specified a device, then 'node's device should be
-      // cleared: the reference edge forces 'node' to be on the
-      // same device as the source node.
-      const auto& source_parsed_name = src_root.device_name;
-      const auto& dest_parsed_name = dst_root.device_name;
-      if (DeviceNameUtils::HasSomeDetails(source_parsed_name) &&
-          DeviceNameUtils::HasSomeDetails(dest_parsed_name)) {
-        // Ignore a specified device for 'dst' if the two names were
-        // incompatible.
-        if (!DeviceNameUtils::AreCompatibleDevNames(source_parsed_name,
-                                                    dest_parsed_name)) {
-          TF_RETURN_IF_ERROR(
-              colocation_graph.VerifyResourceAndRefInputsCanBeColocated(
-                  dst, src, source_parsed_name));
-          if (log_device_placement_) {
-            LOG(INFO) << "Ignoring device specification "
-                      << DeviceNameUtils::ParsedNameToString(dest_parsed_name)
-                      << " for node '" << dst->name()
-                      << "' because the input edge from '" << src->name()
-                      << "' is a reference connection and already has a device "
-                         "field set to "
-                      << DeviceNameUtils::ParsedNameToString(
-                             source_parsed_name);
-          }
+  }
 
-          // Make 'dst' colocated with the source
-          dst_root.device_name = source_parsed_name;
-        } else {
-          bool source_subset_of_dest = DeviceNameUtils::IsSpecification(
-              source_parsed_name, dest_parsed_name);
-          bool dest_subset_of_source = DeviceNameUtils::IsSpecification(
-              dest_parsed_name, source_parsed_name);
+  ColocationGraph colocation_graph(graph_, devices_, default_device_,
+                                   allow_soft_placement_,
+                                   log_device_placement_);
 
-          if (source_subset_of_dest && !dest_subset_of_source) {
-            src_root.device_name = dest_parsed_name;
-          } else {
-            dst_root.device_name = source_parsed_name;
-          }
-        }
-      }
+  TF_RETURN_IF_ERROR(colocation_graph.Initialize());
 
-      Status status =
-          colocation_graph.ColocateNodes(*src, src_root_id, *dst, dst_root_id);
-      if (!status.ok()) {
-        return AttachDef(
-            errors::InvalidArgument("Nodes were connected by a "
-                                    "reference connection (requiring them to "
-                                    "be on the same device), but the two nodes "
-                                    "were assigned two different devices: ",
-                                    status.error_message()),
-            *dst);
-      }
-    }
-  }
-
-  // 3. For each node, assign a device based on the constraints in the
-  // disjoint node set.
+  // For each node, assign a device based on the constraints in the disjoint
+  // node set.
   std::vector<Node*> second_pass;
   for (Node* node : graph_->op_nodes()) {
     // The graph may have come pre-populated by the framework with assigned
     // devices (e.g., for stateful placements), so the placer should not try to
     // place nodes that are already placed.
     if (node->has_assigned_device_name()) {
-      LogDeviceAssignment(node);
+      TF_RETURN_IF_ERROR(colocation_graph.LimitToAssignedDevice(*node));
+      LogDeviceAssignment(node, log_device_placement_);
       continue;
     }
 
@@ -943,7 +140,7 @@ Status Placer::Run() {
       continue;
     }
 
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -984,13 +181,14 @@ Status Placer::Run() {
       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
     }
 
-    AssignAndLog(assigned_device, node);
+    TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
+                                    log_device_placement_));
   }
 
-  // 4. Perform a second pass assignment for those nodes explicitly
+  // Perform a second pass assignment for those nodes explicitly
   // skipped during the first pass.
   for (Node* node : second_pass) {
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -1023,9 +221,13 @@ Status Placer::Run() {
       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
     }
 
-    AssignAndLog(assigned_device, node);
+    TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
+                                    log_device_placement_));
   }
 
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_output", *graph_, nullptr);
+  }
   return Status::OK();
 }
 
@@ -1046,20 +248,4 @@ bool Placer::CanAssignToDevice(const string& candidate_device_name,
   return false;
 }
 
-void Placer::AssignAndLog(int assigned_device, Node* node) const {
-  node->set_assigned_device_name_index(assigned_device);
-  LogDeviceAssignment(node);
-}
-
-void Placer::LogDeviceAssignment(const Node* node) const {
-  // Log placement if log_device_placement is set.
-  if (log_device_placement_) {
-    printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
-           node->assigned_device_name().c_str());
-    LOG(INFO) << node->name() << ": "
-              << "(" << node->type_string() << ")"
-              << node->assigned_device_name();
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index e3e8f3790c5fc1d6223a9e6ba1d3aa79eca0d3e3..3bb503399c1aac705f020b41261a2575f37d5366 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -68,8 +68,10 @@ class Placer {
   //
   // The "graph", "devices", and "default_device" pointer arguments are borrowed
   // by this Placer, and must outlive it.
-  Placer(Graph* graph, const DeviceSet* devices, const SessionOptions* options,
-         const Device* default_device);
+  Placer(Graph* graph, const DeviceSet* devices, const Device* default_device,
+         bool allow_soft_placement, bool log_device_placement);
+
+  Placer(Graph* graph, const DeviceSet* devices, const Device* default_device);
 
   Placer(Graph* graph, const DeviceSet* devices);
 
@@ -88,16 +90,11 @@ class Placer {
   bool CanAssignToDevice(const string& candidate_device_name,
                          const std::vector<Device*>& devices) const;
 
-  // Assigns 'node's devices to 'assigned_device', and logs the
-  // placement if the SessionOptions entry in 'options_' requests it.
-  void AssignAndLog(int assigned_device, Node* node) const;
-  void LogDeviceAssignment(const Node* node) const;
-
   Graph* const graph_;              // Not owned.
   const DeviceSet* const devices_;  // Not owned.
-  const SessionOptions* options_;   // Not owned.
+  const Device* default_device_;    // Not owned.
+  const bool allow_soft_placement_;
   const bool log_device_placement_;
-  const Device* default_device_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
 };
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 04e77e55f62e1bd9345c8e9113407bbf0a375774..6c056f8640604a093ca1e3698a690367c178bcba 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -24,11 +25,15 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
@@ -40,6 +45,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+using FDH = ::tensorflow::FunctionDefHelper;
+
+constexpr char kCPU[] = "/device:fakecpu:0";
+constexpr char kGPU[] = "/device:fakegpu:0";
+
+constexpr char kFullCPU[] = "/job:a/replica:0/task:0/device:fakecpu:0";
+constexpr char kFullGPU[] = "/job:a/replica:0/task:0/device:fakegpu:0";
+
 namespace {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -210,24 +225,37 @@ class PlacerTest : public ::testing::Test {
     return Status::OK();
   }
 
+  Status BuildGraph(const GraphDef& graph_def, Graph* out_graph) {
+    GraphConstructorOptions opts;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, out_graph));
+    nodes_by_name_.clear();
+    for (Node* node : out_graph->nodes()) {
+      nodes_by_name_[node->name()] = node->id();
+    }
+    return Status::OK();
+  }
+
   // Invokes the Placer on "graph". If no DeviceSet is specified, the
   // placement will use the default DeviceSet (of 10 CPU and 10 GPU devices).
   //
   // REQUIRES: "*graph" was produced by the most recent call to BuildGraph.
-  Status Place(Graph* graph, DeviceSet* devices, SessionOptions* options) {
-    Placer placer(graph, devices, options, nullptr);
+  Status Place(Graph* graph, DeviceSet* devices, bool allow_soft_placement,
+               bool log_device_placement) {
+    Placer placer(graph, devices, nullptr, allow_soft_placement,
+                  log_device_placement);
     return placer.Run();
   }
 
   Status Place(Graph* graph, DeviceSet* devices) {
-    return Place(graph, devices, nullptr);
+    return Place(graph, devices, true, false);
   }
 
-  Status Place(Graph* graph, SessionOptions* options) {
-    return Place(graph, &devices_, options);
+  Status Place(Graph* graph, bool allow_soft_placement,
+               bool log_device_placement) {
+    return Place(graph, &devices_, allow_soft_placement, log_device_placement);
   }
 
-  Status Place(Graph* graph) { return Place(graph, &devices_, nullptr); }
+  Status Place(Graph* graph) { return Place(graph, &devices_, true, false); }
 
   // Returns the node in "graph" with the given name.
   //
@@ -248,6 +276,16 @@ class PlacerTest : public ::testing::Test {
                              const DeviceType& expected_device_type);
 };
 
+// Fixture that add a parameter for allow_soft_placement.
+// Test cases that want to test behavior with and without soft placement
+// can use this fixture instead of PlacerTest.
+class SoftPlacementPlacerTest : public PlacerTest,
+                                public ::testing::WithParamInterface<bool> {};
+
+INSTANTIATE_TEST_SUITE_P(, SoftPlacementPlacerTest,
+                         ::testing::Values(false, true),
+                         ::testing::PrintToStringParamName());
+
 #define EXPECT_COLOCATED(g, name_a, name_b)                         \
   do {                                                              \
     Graph& g_ = (g);                                                \
@@ -717,9 +755,7 @@ TEST_F(PlacerTest, TestPartialSpecGpuToCpu) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
   EXPECT_DEVICE_CONTAINS(g, "in", "/device:fakecpu");
   EXPECT_DEVICE_TYPE(g, "var", "FakeGPU");
@@ -866,7 +902,7 @@ TEST_F(PlacerTest, TestResourceHandle) {
 }
 
 TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
-  auto handle_test = [this](bool allow_soft_placement) {
+  auto handle_test = [this](bool allow_soft_placement, bool set_assigned) {
     Graph g(OpRegistry::Global());
     {  // Scope for temporary variables used to construct g.
       GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -878,27 +914,38 @@ TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
                     b.opts().WithName("two_handles_in"));
       TF_EXPECT_OK(BuildGraph(b, &g));
 
-      GetNodeByName(g, "var_cpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakecpu:0");
-      GetNodeByName(g, "var_gpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakegpu:0");
+      if (set_assigned) {
+        GetNodeByName(g, "var_cpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakegpu:0");
+      } else {
+        GetNodeByName(g, "var_cpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakegpu:0");
+      }
     }
 
-    SessionOptions options;
-    options.config.set_allow_soft_placement(allow_soft_placement);
-    options.config.set_log_device_placement(true);
-    Status s = Place(&g, &options);
-    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    Status s = Place(&g, allow_soft_placement, true);
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
     EXPECT_TRUE(str_util::StrContains(
         s.error_message(),
-        "Could not colocate node with its resource and reference inputs"));
+        "Cannot place the graph because a reference or resource edge "
+        "connects "
+        "colocation groups with incompatible assigned devices: "
+        "/job:a/replica:0/task:0/device:fakegpu:0 vs "
+        "/job:a/replica:0/task:0/device:fakecpu:0"));
+
     return Status::OK();
   };
 
-  TF_EXPECT_OK(handle_test(false));
-  TF_EXPECT_OK(handle_test(true));
+  TF_EXPECT_OK(handle_test(false, false));
+  TF_EXPECT_OK(handle_test(false, true));
+  TF_EXPECT_OK(handle_test(true, false));
+  TF_EXPECT_OK(handle_test(true, true));
 }
 
 // Test that an assignment of an operator to the wrong device
@@ -924,8 +971,7 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeGPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeGPU");
@@ -956,8 +1002,7 @@ TEST_F(PlacerTest, TestReferenceConnectionMoreSpecificDestinationSourceWins) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeCPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeCPU");
@@ -982,8 +1027,7 @@ TEST_F(PlacerTest, TestReferenceConnectionNoSourceDevice) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeCPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeCPU");
@@ -999,7 +1043,7 @@ TEST_F(PlacerTest, TestColocationGroup) {
         b.opts().WithName("colocated_1").WithAttr("_class", {"loc:@in"}));
 
     // This will not be colocated with the input because TestInput is
-    // only availbale on CPU and TestRelu will default to GPU.
+    // only available on CPU and TestRelu will default to GPU.
     Node* not_colocated_with_input =
         ops::UnaryOp("TestRelu", input, b.opts().WithName("foo"));
     CHECK(colocated_with_input);
@@ -1034,7 +1078,7 @@ TEST_F(PlacerTest, TestMultipleColocationGroups) {
   EXPECT_COLOCATED(g, "in", "foo");
 }
 
-TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
+TEST_P(SoftPlacementPlacerTest, TestInvalidMultipleColocationGroups) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1051,12 +1095,21 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node foo}} and "
-      "{{colocation_node in}} because no device type supports both of those "
-      "nodes and the other nodes colocated with them"));
+  bool allow_soft_placement = GetParam();
+  Status s = Place(&g, allow_soft_placement, true);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "colocated_1", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "foo", "FakeGPU");
+  } else {
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node foo}} and "
+        "{{colocation_node in}} because no device type supports both of those "
+        "nodes and the other nodes colocated with them"))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
@@ -1086,7 +1139,8 @@ TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
   EXPECT_COLOCATED(g, "var2", "assign1");
 }
 
-TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
+TEST_P(SoftPlacementPlacerTest,
+       TestColocationGroupWithUnsatisfiableReferenceConnections) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1116,12 +1170,19 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node var3}} and {{colocation_node "
-      "assign3}} because no device type supports both of those nodes and the "
-      "other nodes colocated with them."));
+  bool allow_soft_placement = GetParam();
+  Status s = Place(&g, allow_soft_placement, true);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node assign3}} and "
+        "{{colocation_node var2}} because no device type supports both of "
+        "those nodes and the other nodes colocated with them."))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -1372,9 +1433,7 @@ TEST_F(PlacerTest, TestNonexistentGpuAllowSoftPlacement) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_CONTAINS(g, "in", "/device:fakegpu:0");
 }
 
@@ -1389,8 +1448,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
@@ -1406,8 +1464,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacementFormatTag) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(s.error_message(),
@@ -1426,8 +1483,7 @@ TEST_F(PlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
   EXPECT_TRUE(str_util::StrContains(
@@ -1446,8 +1502,7 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
@@ -1468,8 +1523,7 @@ TEST_F(PlacerTest, TestUseGpuWithNoCuda) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
@@ -1487,9 +1541,7 @@ TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
 }
 
 // Test that a graph with device type and reference constraints on
@@ -1516,9 +1568,7 @@ TEST_F(PlacerTest, TestDeviceTypeConstraintsAllowSoftPlacement) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_TYPE(g, "var_gpu", "FakeGPU");
   EXPECT_DEVICE_TYPE(g, "force_gpu", "FakeGPU");
   EXPECT_COLOCATED(g, "var_gpu", "force_gpu");
@@ -1617,5 +1667,155 @@ TEST_F(PlacerTest, TestGeneratorNodeDoesntFollowNonColocatedConsumers) {
   EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
 }
 
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeGPU"), DummyOp);
+
+TEST_P(SoftPlacementPlacerTest,
+       RequestedDeviceOnResourceGeneratorIsTreatedAsAssigned) {
+  /*
+   *    a:RES:GPU  b:RES:CPU
+   *       |         |
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+
+  bool allow_soft_placement = GetParam();
+  Status s = Place(&g, allow_soft_placement, true);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/device:fakecpu:0' and '/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
+TEST_F(PlacerTest, RequestedDeviceCanBeOverridden) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(Place(&g));
+
+  // All should be colocated
+  EXPECT_COLOCATED(g, "a", "b");
+  EXPECT_COLOCATED(g, "id_a", "id_b");
+  EXPECT_COLOCATED(g, "id1", "id2");
+  EXPECT_COLOCATED(g, "a", "id_a");
+  EXPECT_COLOCATED(g, "a", "id1");
+}
+
+TEST_P(SoftPlacementPlacerTest,
+       AssignedDevicesAreNotOverriddenDueToResourcesAndColocation) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  std::unordered_map<string, Node*> nodes = g.BuildNodeNameIndex();
+  GetNodeByName(g, "id_a")->set_assigned_device_name(kFullGPU);
+  GetNodeByName(g, "id_b")->set_assigned_device_name(kFullCPU);
+
+  bool allow_soft_placement = GetParam();
+
+  Status s = Place(&g, allow_soft_placement, false);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id_a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id_b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/job:a/replica:0/task:0/device:fakecpu:0' and "
+        "'/job:a/replica:0/task:0/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 8be9c7b678e2bbe7659c9e22e31cb595ce704307..603e28b39e171e2de911f88a12ace9f93c421add 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -99,8 +99,6 @@ class PoolAllocator : public Allocator {
     return pool_size_limit_;
   }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   struct PtrRecord {
     void* ptr;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index b236343a0f2f7e6190d6649724bdd9495e63b681..5a0679bda413c7b2571f028d5db65a68390ba666 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -61,7 +61,8 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const OptimizerOptions& optimizer_options,
     thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr),
+    : env_(env),
+      device_mgr_(device_mgr),
       lib_def_(lib_def),
       default_thread_pool_(default_thread_pool),
       next_handle_(0),
@@ -86,7 +87,8 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     CustomKernelCreator custom_kernel_creator,
     thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr),
+    : env_(env),
+      device_mgr_(device_mgr),
       lib_def_(lib_def),
       default_thread_pool_(default_thread_pool),
       next_handle_(0),
@@ -289,6 +291,13 @@ void GetColocationGroup(const Node* node, string* group) {
   }
 }
 
+const string* AssignedOrRequestedDeviceName(const Node& node) {
+  if (node.has_assigned_device_name()) {
+    return &node.assigned_device_name();
+  }
+  return &node.requested_device();
+}
+
 }  // anonymous namespace
 
 Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
@@ -301,7 +310,7 @@ Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
   // arguments. To make sure that the output producing nodes have assigned
   // devices, we assign them to arguments first.
   for (Node* node : graph->op_nodes()) {
-    if (node->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (node->IsArg()) {
       const AttrValue* attr_value;
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int64 index = attr_value->i();
@@ -310,28 +319,29 @@ Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
   }
 
   for (Node* node : graph->op_nodes()) {
-    if (node->type_string() == FunctionLibraryDefinition::kRetOp) {
+    if (node->IsRetval()) {
       if (output_devices.empty()) {
+        VLOG(3) << "Trying to determine device for node " << node->name();
         // If output_devices are empty, the node producing retval
         // must have explicitly assigned device or a colocation constraint
         // to a node with explicitly assigned device.
         for (const auto& it : node->in_edges()) {
           if (!it->IsControlEdge()) {
             Node* src_node = it->src();
-            const string* src_device = &src_node->requested_device();
+            const string* src_device = AssignedOrRequestedDeviceName(*src_node);
             string colocation_group = "";
             GetColocationGroup(src_node, &colocation_group);
+            VLOG(3) << "Considering src: " << src_node->name()
+                    << " src_device: " << *src_device
+                    << " colo group: " << colocation_group;
             while (src_device->empty() && colocation_group.empty() &&
                    src_node->IsIdentity()) {
               src_node = *src_node->in_nodes().begin();
-              src_device = &src_node->requested_device();
-              if (src_device->empty()) {
-                // Some node (e.g. _Args) can have no requested_device,
-                // but have assigned_device.
-                src_device = &src_node->assigned_device_name();
-              }
-
+              src_device = AssignedOrRequestedDeviceName(*src_node);
               GetColocationGroup(src_node, &colocation_group);
+              VLOG(3) << "Considering src: " << src_node->name()
+                      << " src_device: " << *src_device
+                      << " colo group: " << colocation_group;
             }
 
             if (!colocation_group.empty()) {
@@ -350,10 +360,13 @@ Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
               }
               std::vector<Device*> matching_devices;
               device_set.FindMatchingDevices(parsed, &matching_devices);
-              if (matching_devices.size() != 1) {
+              if (matching_devices.empty()) {
+                return errors::InvalidArgument(
+                    "Unable to find any devices for spec ", *src_device);
+              } else if (matching_devices.size() != 1) {
                 // Convert a vector of devices to a string.
                 // Using absl::StrJoin did not work in Android builds.
-                string devices = "]";
+                string devices = "[";
                 for (Device* device : matching_devices) {
                   devices.append(device->name());
                   devices.append(", ");
@@ -370,6 +383,9 @@ Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
                     "device. Matched devices are ",
                     devices);
               }
+              VLOG(3) << "Setting output device to "
+                      << matching_devices[0]->name() << " for node "
+                      << node->DebugString();
               node->set_assigned_device_name(matching_devices[0]->name());
             }
           }
@@ -380,6 +396,8 @@ Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
         int64 index = attr_value->i();
         // output_devices size is checked in InstantiateMultiDevice
         DCHECK_GT(output_devices.size(), index);
+        VLOG(3) << "Setting output device to " << output_devices[index]
+                << " for return at index " << index;
         node->set_assigned_device_name(output_devices[index]);
       }
     }
@@ -461,7 +479,8 @@ Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
                        const FunctionDef* fdef,
                        const FunctionLibraryDefinition* lib_def,
                        std::unique_ptr<Graph>* graph,
-                       std::vector<string>* ret_node_names) {
+                       std::vector<string>* ret_node_names,
+                       std::vector<string>* control_ret_node_names) {
   auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
     return lib_def->LookUpOpDef(op, sig);
   };
@@ -481,6 +500,10 @@ Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
   for (const Node* node : fbody->ret_nodes) {
     ret_node_names->push_back(node->name());
   }
+  control_ret_node_names->reserve(fbody->control_ret_nodes.size());
+  for (const Node* node : fbody->control_ret_nodes) {
+    control_ret_node_names->push_back(node->name());
+  }
   return Status::OK();
 }
 
@@ -504,7 +527,17 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
   }
 
   VLOG(1) << "Instantiating MultiDevice function \"" << function_name
-          << "\" on default device " << options.target;
+          << "\" on default device \"" << options.target << "\"";
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << "Requested input devices:";
+    for (const string& device : options.input_devices) {
+      VLOG(3) << "    " << device;
+    }
+    VLOG(3) << "Requested output devices:";
+    for (const string& device : options.output_devices) {
+      VLOG(3) << "    " << device;
+    }
+  }
 
   const FunctionLibraryDefinition* lib_def =
       options.overlay_lib == nullptr ? lib_def_ : options.overlay_lib;
@@ -519,9 +552,18 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
 
   std::unique_ptr<Graph> graph;
   std::vector<string> ret_node_names;
+  std::vector<string> control_ret_node_names;
 
   TF_RETURN_IF_ERROR(GetGraphAndRets(function_name, attrs, fdef, lib_def,
-                                     &graph, &ret_node_names));
+                                     &graph, &ret_node_names,
+                                     &control_ret_node_names));
+
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectRawGraph(def);
+  }
 
   DeviceSet device_set;
   for (auto d : device_mgr_->ListDevices()) {
@@ -531,17 +573,6 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
   TF_RETURN_IF_ERROR(PinArgsAndRets(
       options.input_devices, options.output_devices, device_set, graph.get()));
 
-  // Make the FunctionLibraryRuntime's device the default device if
-  // nothing else is hard coded. This allows the same function definition
-  // to be specialized to different devices depending on the
-  // PartitionedCallOp's device.
-  FunctionLibraryRuntime* flr = GetFLR(options.target);
-  if (flr == nullptr) {
-    return errors::InvalidArgument(
-        "Cannot instantiate multi-device function with target device ",
-        options.target);
-  }
-
   std::unique_ptr<MultiDeviceFunctionData> data =
       MakeUnique<MultiDeviceFunctionData>(function_name, function_key,
                                           ret_node_names.size(),
@@ -550,7 +581,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
   GraphOptimizationPassOptions optimization_options;
   // TODO(iga): Thread other relevant options from SessionOptions.
   SessionOptions session_options;
-  session_options.env = flr->env();
+  session_options.env = env_;
+  session_options.config = options.config_proto;
   optimization_options.session_options = &session_options;
   optimization_options.graph = &graph;
   optimization_options.flib_def = &data->overlay_lib_;
@@ -561,25 +593,40 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
   DumpGraph("Before calling Placer", graph.get());
-  Placer placer(graph.get(), &device_set, nullptr, /* No session options */
-                flr->device() /* Default device */);
+  // Make the FunctionLibraryRuntime's device the default device if
+  // nothing else is hard coded. This allows the same function definition
+  // to be specialized to different devices depending on the
+  // PartitionedCallOp's device.
+  Device* default_device = nullptr;
+  if (!options.target.empty()) {
+    FunctionLibraryRuntime* flr = GetFLR(options.target);
+    if (flr == nullptr) {
+      return errors::InvalidArgument(
+          "Cannot instantiate multi-device function with target device ",
+          options.target);
+    }
+    default_device = flr->device();
+  }
+
+  // TODO(b/124993244): Smartly merge options in nested defuns, and raise
+  // exceptions/warnings in case where nested function call options are ignored.
+  Placer placer(graph.get(), &device_set, default_device,
+                options.config_proto.allow_soft_placement(),
+                options.config_proto.log_device_placement());
   TF_RETURN_IF_ERROR(placer.Run());
 
   DumpGraph("Before running POST_PLACEMENT passes", graph.get());
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
-  DumpGraph("Before running POST_REWRITE_FOR_EXEC passes", graph.get());
-  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
-      OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
-  DumpGraph("After all optimization passes", graph.get());
 
   Device* cpu_device;
   TF_RETURN_IF_ERROR(device_mgr_->LookupDevice("CPU:0", &cpu_device));
 
   if (options.optimize_graph_fn) {
-    Status status = options.optimize_graph_fn(std::move(ret_node_names),
-                                              &data->overlay_lib_, device_set,
-                                              cpu_device, &graph);
+    DumpGraph("Before running graph optimization fn", graph.get());
+    Status status = options.optimize_graph_fn(
+        std::move(ret_node_names), std::move(control_ret_node_names),
+        &data->overlay_lib_, device_set, cpu_device, &graph);
     if (!status.ok()) {
       LOG(WARNING) << "Ignoring multi-device function optimization failure: "
                    << status.ToString();
@@ -587,6 +634,18 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
     DumpGraph("After optimization", graph.get());
   }
 
+  DumpGraph("Before running POST_REWRITE_FOR_EXEC passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
+  DumpGraph("After all optimization passes", graph.get());
+
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectOptimizedGraph(def);
+  }
+
   std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
   TF_RETURN_IF_ERROR(
       PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
@@ -595,7 +654,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
     for (const auto& pair : subgraphs) {
       GraphDef def;
       pair.second->ToGraphDef(&def);
-      options.graph_collector->CollectGraph(def);
+      *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+      options.graph_collector->CollectPartitionedGraph(def);
     }
   }
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a08e84510737190c628775f6a8002a1190056207..14f3635c5217e13bcaaf71b450ecaba01b5dddc1 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -137,6 +137,8 @@ class ProcessFunctionLibraryRuntime {
            std::vector<Tensor>* rets,
            FunctionLibraryRuntime::DoneCallback done) const;
 
+  const DeviceMgr* device_mgr() { return device_mgr_; }
+
  private:
   friend class FunctionLibraryRuntimeImpl;
 
@@ -285,6 +287,7 @@ class ProcessFunctionLibraryRuntime {
 
   mutable mutex mu_;
 
+  Env* const env_;
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index f30e440c29d7c0a9fb8848ae6c54327b4ea83c33..8f3cc5c4446f9f8cf131990e27178d048a514c72 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -83,7 +83,7 @@ class ProcessState : public ProcessStateInterface {
 
   // If these flags need to be runtime configurable consider adding
   // them to ConfigProto.
-  static const bool FLAGS_brain_mem_reg_cuda_dma = true;
+  static const bool FLAGS_brain_mem_reg_gpu_dma = true;
   static const bool FLAGS_brain_gpu_record_mem_types = false;
 
   // Helper method for unit tests to reset the ProcessState singleton by
@@ -130,7 +130,7 @@ class RecordingAllocator : public Allocator {
   bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
   size_t RequestedSize(const void* p) override { return a_->RequestedSize(p); }
   size_t AllocatedSize(const void* p) override { return a_->AllocatedSize(p); }
-  void GetStats(AllocatorStats* stats) override { a_->GetStats(stats); }
+  absl::optional<AllocatorStats> GetStats() override { return a_->GetStats(); }
   void ClearStats() override { a_->ClearStats(); }
   ProcessState::MDMap* mm_;  // not owned
   Allocator* a_;             // not owned
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index e1dc08d64545ece29a8aa2ab2612dd3cd994559e..d42b8d55e4f50606578cf249e1f245b72cd7bd24 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -34,14 +34,23 @@ namespace tensorflow {
 
 namespace {
 
+int32 DefaultNumInterOpThreads() {
+  // Use environment setting if specified (init once)
+  static int env_num_threads = NumInterOpThreadsFromEnvironment();
+  if (env_num_threads > 0) {
+    return env_num_threads;
+  }
+
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+}
+
 static thread::ThreadPool* InitComputePool(const SessionOptions& options) {
   int32 inter_op_parallelism_threads =
       options.config.inter_op_parallelism_threads();
   if (inter_op_parallelism_threads == 0) {
-    // Default to using the number of cores available in the process.
-    inter_op_parallelism_threads = port::NumSchedulableCPUs();
+    inter_op_parallelism_threads = DefaultNumInterOpThreads();
   }
-
   return new thread::ThreadPool(Env::Default(), "Compute",
                                 inter_op_parallelism_threads);
 }
@@ -53,6 +62,18 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTEROP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
+int32 NumIntraOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTRAOP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op != 0) return inter_op;
@@ -67,7 +88,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #endif  // _OPENMP
     DCHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
-        (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+        (DefaultNumInterOpThreads() + mkl_intra_op - 1) / mkl_intra_op, 2);
     VLOG(0)
         << "Creating new thread pool with default inter op setting: "
         << mkl_inter_op
@@ -75,8 +96,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
     return mkl_inter_op;
   }
 #endif  // INTEL_MKL
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
+  return DefaultNumInterOpThreads();
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index 5d9266671617320eea4cea60de1ebd7210f3b674..7ad658be9f785032c85f20224a4d592ded7e283c 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,7 +30,18 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
-// Returns number of inter op threads.
+// Returns the TF_NUM_INTEROP_THREADS environment value, or 0 if not specified.
+int32 NumInterOpThreadsFromEnvironment();
+
+// Returns the TF_NUM_INTRAOP_THREADS environment value, or 0 if not specified.
+int32 NumIntraOpThreadsFromEnvironment();
+
+// Returns the number of inter op threads specified in `options` or a default.
+// If no value is specified in the provided options, then the function returns
+// the value defined in the TF_NUM_INTEROP_THREADS environment variable.
+// If neither a value is specified in the options or in the environment,
+// this function will return a reasonable default value based on the number
+// of schedulable CPUs, and any MKL and OpenMP configurations.
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
 
 // Creates a thread pool with number of inter op threads.
diff --git a/tensorflow/core/common_runtime/ring_alg.cc b/tensorflow/core/common_runtime/ring_alg.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c20cc74bf70e3340848666a179c1bb3617a4ede6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.cc
@@ -0,0 +1,430 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_alg.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+// A ring algorithm exchanges chunks of tensor between devices.  The chunk size
+// depends on the number of subdivisions specified in the algorithm.  If the
+// user does not specify the number of subdivisions we may infer the number
+// dynamically so that the resulting chunk size does not exceed
+// kMaxChunkSizeBytes, empirically set at 4 MiB.
+constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
+// kMaxSubdivsPerDev is used to give an upper bound on the number of
+// subdivisions dynamically generated.  A reasonable value would be a small
+// multiple of the number of NICs adjacent to each device.
+constexpr int kMaxSubdivsPerDevice = 2;
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingAlg instances.  Note that the exec_key will differentiate between
+// different instances consequently we don't need to further differentiate
+// between subclasses of RingAlg.
+string RingAlgBufKey(const string& name, const string& exec_key, int pass,
+                     int section, int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat(name, "(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
+    // hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingAlg::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingAlg::RingField* RingAlg::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingAlg::RingAlg(CollectiveType type, const string& name)
+    : type_(type),
+      name_(name),
+      col_ctx_(nullptr),
+      col_params_(nullptr),
+      done_(nullptr),
+      group_size_(-1),
+      num_subdivs_(-1) {}
+
+namespace {
+Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.shape.num_elements() == 0) {
+    return errors::Internal("shape in CollectiveParams should be non-empty");
+  }
+  const int kAvgDevPerTask =
+      col_params->group.group_size / col_params->group.num_tasks;
+  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
+  if (kMaxNumSubdivs <= 0) {
+    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
+                            " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
+  // as many offsets as needed so that the size of tensor chunks <=
+  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
+  // lead to worse performance.
+  int num_subdivs = 0;
+  const size_t tensor_size = col_params->instance.shape.num_elements() *
+                             DataTypeSize(col_params->instance.data_type);
+  size_t chunk_size;
+  do {
+    ++num_subdivs;
+    int num_chunks = col_params->group.group_size * num_subdivs;
+    chunk_size = tensor_size / num_chunks;
+    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
+            << " chunk_size " << chunk_size;
+  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
+  if (num_subdivs <= 0) {
+    return errors::Internal("Unexpected num_subdivs ", num_subdivs, " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+
+  int subdiv_stride = kAvgDevPerTask / num_subdivs;
+  if (subdiv_stride == 0) subdiv_stride = 1;
+  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
+  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+    int subdiv_offset = subdiv_stride * sdi;
+    if (sdi % 2 == 1) subdiv_offset *= -1;
+    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
+  }
+
+  if (VLOG_IS_ON(2)) {
+    string subdiv_buf;
+    for (const int subdiv_offset :
+         col_params->instance.impl_details.subdiv_offsets) {
+      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
+    }
+    VLOG(2) << "Dynamically generated " << num_subdivs
+            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
+            << tensor_size << " chunk_size " << chunk_size;
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status RingAlg::InitializeCollectiveParams(CollectiveParams* col_params) {
+  const string& device_name =
+      col_params->instance.device_names[col_params->default_rank];
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(col_params->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &col_params->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < col_params->group.group_size; ++di) {
+    if (col_params->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &col_params->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  DCHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
+
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
+  }
+
+  // Generate a ring permutation for requested offset.
+  VLOG(2) << "Setting up perms for col_params " << col_params
+          << " subdiv_permutations "
+          << &col_params->instance.impl_details.subdiv_permutations;
+  col_params->instance.impl_details.subdiv_permutations.resize(
+      col_params->instance.impl_details.subdiv_offsets.size());
+  col_params->subdiv_rank.resize(
+      col_params->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0;
+       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
+    std::vector<int>& perm =
+        col_params->instance.impl_details.subdiv_permutations[sdi];
+    DCHECK_EQ(perm.size(), 0);
+    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
+    // A negative subdivision offset is interpreted as follows:
+    //  1. Reverse the local device ordering.
+    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
+    bool reverse = false;
+    if (offset < 0) {
+      offset = abs(offset);
+      reverse = true;
+    }
+    int prior_dev_count = 0;  // sum over prior worker device counts
+    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int di_offset = (di + offset) % dev_per_task[ti];
+        int offset_di =
+            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
+        // Device index in global subdivision permutation.
+        int permuted_di = prior_dev_count + offset_di;
+        int rank = static_cast<int>(perm.size());
+        perm.push_back(permuted_di);
+        if (col_params->instance.device_names[permuted_di] == device_name) {
+          DCHECK_EQ(permuted_di, col_params->default_rank);
+          col_params->subdiv_rank[sdi] = rank;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    DCHECK_EQ(col_params->group.group_size, perm.size());
+  }
+
+  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
+  return Status::OK();
+}
+
+Status RingAlg::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  DCHECK(col_ctx->dev_mgr);
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+string RingAlg::TensorDebugString(const Tensor& tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
+        [&note](const Status& s) {
+          DCHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingAlg::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting Ring" << name_ << " with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_ctx_->col_exec->StartAbort(s);
+  }
+}
+
+void RingAlg::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(col_ctx_->output);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  rfv_.clear();  // Give up Refs on output tensor.
+  done_(s);
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingAlg::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                            int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaneously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_->subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingAlg::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  DCHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingAlg::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingAlg::DispatchSend(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_send);
+  string send_buf_key = RingAlgBufKey(name_, col_ctx_->exec_key,
+                                      rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_->instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_ctx_->col_exec->PostToPeer(
+      col_params_->instance.device_names[send_to_dev_idx],
+      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
+      col_ctx_->device_locality, done);
+}
+
+void RingAlg::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_recv);
+  string recv_buf_key =
+      RingAlgBufKey(name_, col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
+                    (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_ctx_->col_exec->RecvFromPeer(
+      col_params_->instance.device_names[rf->recv_dev_idx],
+      col_params_->instance.task_names[rf->recv_dev_idx],
+      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
+      col_ctx_->device_locality, rf->subdiv_idx, done);
+}
+
+string RingAlg::FieldState() {
+  string s = strings::StrCat(
+      "Ring", name_, " ", strings::Hex(reinterpret_cast<uint64>(this)),
+      " exec ", col_ctx_->exec_key, " step_id=", col_ctx_->step_id,
+      " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_alg.h b/tensorflow/core/common_runtime/ring_alg.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc07618f8805e3a9abcaf575c3d2984aa27948b7
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.h
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Basic ring-algorithm implementation to be further specialized
+// for specific collective functions.
+class RingAlg : public CollectiveImplementationInterface {
+ public:
+  explicit RingAlg(CollectiveType type, const string& name);
+  ~RingAlg() override {}
+
+  // Establishes the requested number of subdivision permutations based on the
+  // ring order implicit in the device order.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes members of CollectiveContext not yet initialized, i.e. device
+  // and device_locality.  Also saves the CollectiveContext in this object.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // No-op for ring alg.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
+ protected:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void Finish(bool ok);
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  virtual void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                             int field_idx);
+  void AdvanceToSecondPass(RingField* rf);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(const Tensor& tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  const CollectiveType type_;
+  const string name_;
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+  StatusCallback done_;
+  int group_size_;
+  int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer.cc b/tensorflow/core/common_runtime/ring_gatherer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58251fc171459ee35820c3157c48e4222e9f1ec2
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.cc
@@ -0,0 +1,266 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+Status RingGatherer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  DCHECK_EQ(col_params->instance.type, GATHER_COLLECTIVE);
+  DCHECK_EQ(col_params->instance.impl_details.collective_name, "RingGather");
+  // TODO(tucker): Maybe add subdiv support.  It's only useful with
+  // multiple NICS, and maybe gather performance isn't important enough.
+  // For now, there must always be only a single subdiv at offset 0.
+  if (!col_params->instance.impl_details.subdiv_offsets.empty() &&
+      (col_params->instance.impl_details.subdiv_offsets.size() > 1 ||
+       col_params->instance.impl_details.subdiv_offsets[0] != 0)) {
+    return errors::InvalidArgument(
+        "RingGather cannot take any subdiv offset other than 0.");
+  }
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    col_params->instance.impl_details.subdiv_offsets.push_back(0);
+  }
+  return RingAlg::InitializeCollectiveParams(col_params);
+}
+
+void RingGatherer::Run(StatusCallback done) {
+  DCHECK(col_ctx_);
+  DCHECK(col_params_);
+  done_ = std::move(done);
+  group_size_ = col_params_->group.group_size;
+  num_subdivs_ = static_cast<int>(
+      col_params_->instance.impl_details.subdiv_permutations.size());
+  DCHECK_GT(num_subdivs_, 0);
+
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_->instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_->instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_->instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x :
+           col_params_->instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingGatherer::Run for device " << col_ctx_->device_name
+            << " default_rank " << col_params_->default_rank << "\n"
+            << buf;
+  }
+
+  // Prepare to alias fields within the output.
+  AllocatorAttributes attr = col_ctx_->op_ctx->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(col_ctx_->output, group_size_ * num_subdivs_,
+                                  col_ctx_->device->GetAllocator(attr),
+                                  false /*align_chunks*/));
+
+  // Start by copying input to the rank-specific offset of output.
+  // We are running in a blockable thread and the callback can't block so
+  // just wait here on the copy.
+  Notification note;
+  Status status;
+  Tensor alias_chunk(ca_->ChunkAlias(col_params_->subdiv_rank[0]));
+  CollectiveRemoteAccessLocal::MemCpyAsync(
+      col_ctx_->op_ctx->input_device_context(0),
+      col_ctx_->op_ctx->op_device_context(), col_ctx_->device, col_ctx_->device,
+      col_ctx_->op_ctx->input_alloc_attr(0),
+      col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input, &alias_chunk,
+      0 /*dev_to_dev_stream_index*/, [&note, &status](const Status& s) {
+        status.Update(s);
+        note.Notify();
+      });
+  note.WaitForNotification();
+  if (!status.ok()) {
+    done_(status);
+    return;
+  }
+  Finish(RunAsyncParts());
+}
+
+bool RingGatherer::RunAsyncParts() {
+  // This function orchestrates RingGatherer actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+  const DeviceBase::GpuDeviceInfo* gpu_info =
+      col_ctx_->device->tensorflow_gpu_device_info();
+  if (gpu_info) {
+    // Wait for all currently queued events on the CPU compute stream to
+    // complete before proceeding.  The previous InitRingField calls allocated
+    // temp memory buffers that are not guaranteed to be valid (e.g. for RDMA
+    // write) unless we do.
+    Notification note;
+    Status s = gpu_info->default_context->ThenExecute(
+        col_ctx_->device, gpu_info->stream, [&note]() { note.Notify(); });
+    if (s.ok()) {
+      note.WaitForNotification();
+    } else {
+      mutex_lock l(status_mu_);
+      status_ =
+          errors::Internal("Failed to dispatch ThenExecute in RingGatherer");
+      return false;
+    }
+  }
+
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          DCHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          rf->action = RF_SEND_READY;
+          break;
+        case RF_REDUCE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_FINALIZE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          DCHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        // There's only one pass.
+        ++field_done_count;
+        break;  // from do while(!dispatched)
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {
+        }  // Ignore any other actions
+      }
+    }
+  }
+
+  DCHECK_EQ(send_pending_count, 0);
+  DCHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " device=" << col_ctx_->device_name << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+REGISTER_COLLECTIVE(RingGather, RingGatherer);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_gatherer.h b/tensorflow/core/common_runtime/ring_gatherer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee9634834d2b6c9d986cfb1841ae03c51e22564b
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.h
@@ -0,0 +1,51 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Ring-algorithm implementation of collective all-gather.
+class RingGatherer : public RingAlg {
+ public:
+  RingGatherer() : RingAlg(GATHER_COLLECTIVE, "Gather") {}
+  ~RingGatherer() override {}
+
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Begins async execution of the ring gather algorithm.
+  // Must be called in a blockable thread.
+  // TODO(b/80529858): remove the previous warning when we have a dedicated
+  // collective threadpool.
+  void Run(StatusCallback done) override;
+
+ private:
+  bool RunAsyncParts();
+
+  friend class RingGathererTest;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97ff7b58fa700d72bde145c0cb789228cf163cc6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -0,0 +1,651 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+        done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+static int64 kStepId = 123;
+
+class RingGathererTest : public ::testing::Test {
+ protected:
+  RingGathererTest() : device_type_(DEVICE_CPU) {}
+
+#ifdef GOOGLE_CUDA
+  void InitGPUDevices() {
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+  }
+#endif
+
+  ~RingGathererTest() override {
+    stop_ = true;
+    for (auto i : instances_) delete i;
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+#ifdef GOOGLE_CUDA
+    InitGPUDevices();
+#endif
+    device_type_ = device_type;
+    std::vector<std::unique_ptr<Device>> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    }
+    if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, rma_, kStepId, dev_mgr_.get(), gpu_ring_order_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    col_params_.instance.impl_details.collective_name = "RingGather";
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Gather(int fail_after) {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoGather();
+        ++done;
+      });
+      if (fail_after > 0) {
+        // Stagger the op execution starts.
+        Env::Default()->SleepForMicroseconds(100);
+      }
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    int32 output_len = tensor_len * num_workers * num_devices;
+    std::vector<T> expected(output_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      int32 instance_offset = di * tensor_len;
+      instance->InitTensor(dtype, TensorShape({tensor_len}),
+                           [instance_offset, &expected, dtype, di](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               // The cast is necessary to prevent clang-tidy
+                               // from insisting that a faster non-open source
+                               // function be substituted.
+                               float value =
+                                   pow(10, static_cast<double>(di)) * i;
+                               if (dtype == DT_INT32 || dtype == DT_INT64) {
+                                 value = di * 10 + i;
+                               }
+                               t->flat<T>()(i) = static_cast<T>(value);
+                               expected[instance_offset + i] = value;
+                             }
+                           });
+    }
+    Gather(fail_after);
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device accumulated the same set of correct
+      // values.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->output_tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({output_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        auto alias = actual.template unaligned_flat<T>();
+        for (int i = 0; i < output_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveGather(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_gather_", gather_counter_++),
+        "CollectiveGather");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", params.instance.shape)
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void RunSubdivPermsTest(
+      CollectiveParams* cp,
+      const std::vector<std::vector<int>>& expected_subdiv_perms,
+      const std::vector<int>& expected_subdiv_rank) {
+    col_exec_ = nullptr;
+    cp->instance.impl_details.subdiv_permutations.clear();
+    cp->subdiv_rank.clear();
+    // Create a stub ring gatherer only for testing param initialization.
+    RingGatherer gatherer;
+    TF_CHECK_OK(gatherer.InitializeCollectiveParams(cp));
+    EXPECT_EQ(expected_subdiv_perms,
+              cp->instance.impl_details.subdiv_permutations);
+    EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingGathererTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      input_tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&input_tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &input_tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoGather() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&input_tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveGather(
+          col_params_, &input_tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TensorShape output_shape({static_cast<int64>(
+          parent_->instances_.size() * input_tensor_.shape().num_elements())});
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, output_shape,
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+      // Prepare a RingGatherer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingGatherer gatherer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &input_tensor_, output_tensor_ptr);
+      TF_CHECK_OK(gatherer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-gather.
+      gatherer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(output_tensor_.CopyFrom(*ctx.mutable_output(0),
+                                      ctx.mutable_output(0)->shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& input_tensor() { return input_tensor_; }
+    const Tensor& output_tensor() { return output_tensor_; }
+
+    RingGathererTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor input_tensor_;
+    Tensor output_tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  std::unique_ptr<string> gpu_ring_order_;
+  mutex mu_;
+  int32 gather_counter_ GUARDED_BY(mu_) = 0;
+};
+
+CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,
+                                       const int num_tasks) {
+  CollectiveParams cp;
+  const int kNumDevs = num_devs_per_task * num_tasks;
+  cp.group.group_key = 1;
+  cp.group.group_size = kNumDevs;
+  cp.group.device_type = DeviceType("GPU");
+  cp.group.num_tasks = num_tasks;
+  cp.instance.instance_key = 3;
+  cp.instance.type = GATHER_COLLECTIVE;
+  cp.instance.data_type = DataType(DT_FLOAT);
+  cp.instance.shape = TensorShape({kNumDevs * kNumDevs});
+  cp.instance.impl_details.collective_name = "RingGather";
+  cp.instance.impl_details.subdiv_offsets.push_back(0);
+  cp.is_source = false;
+  for (int i = 0; i < kNumDevs; ++i) {
+    int task_id = i / num_devs_per_task;
+    int dev_id = i % num_devs_per_task;
+    string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id);
+    string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id);
+    cp.instance.task_names.push_back(task_name);
+    cp.instance.device_names.push_back(device_name);
+  }
+  return cp;
+}
+
+TEST_F(RingGathererTest, InitializeParams) {
+  const int kNumDevsPerTask = 8;
+  const int kNumTasks = 3;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  cp.default_rank = 0;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.instance.impl_details.subdiv_offsets = {0};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.default_rank = 3;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {3});
+}
+
+// TODO(b/113171733): change to use TEST_P.
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingGathererTest,                                                    \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 1, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 1)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingGatherer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 32768, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 1, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 5)
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 092f15e49e330de21452e0f7b4d8cc51607a44ed..3328804cdfb00ecbbc473add3984b414add06b1e 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -39,212 +39,15 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
-// Set true for greater intelligibility of debug mode log messages.
-#define READABLE_KEYS false
-// RingReduce algorithm exchanges chunks of tensor between devices.  The chunk
-// size depends on the number of subdivisions specified in the algorithm.  If
-// the user does not specify the number of subdivisions, we infer the number
-// dynamically so that the resulting chunk size does not exceed
-// kMaxChunkSizeBytes, empirically set at 4 MiB.
-constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
-// kMaxSubdivsPerDev is used to give an upper bound on the number of
-// subdivisions dynamically generated.  A reasonable value would be a small
-// multiple of the number of NICs adjacent to each device.
-constexpr int kMaxSubdivsPerDevice = 2;
-
 namespace tensorflow {
-namespace {
-// Each CollectiveOp implementation is free to define its own
-// BufRendezvous key format.  This function produces the key used by
-// RingReducer.
-string RingReduceBufKey(const string& exec_key, int pass, int section,
-                        int source_rank) {
-  if (READABLE_KEYS) {
-    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
-                           section, "):srcrank(", source_rank, ")");
-  } else {
-    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
-    // hash.
-    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
-  }
-}
-
-}  // namespace
-
-void RingReducer::PCQueue::Enqueue(RingField* rf) {
-  mutex_lock l(pcq_mu_);
-  deque_.push_back(rf);
-  if (waiter_count_ > 0) {
-    cv_.notify_one();
-  }
-}
-
-RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
-  mutex_lock l(pcq_mu_);
-  if (deque_.empty()) {
-    ++waiter_count_;
-    while (deque_.empty()) {
-      cv_.wait(l);
-    }
-    --waiter_count_;
-  }
-  RingField* rf = deque_.front();
-  deque_.pop_front();
-  return rf;
-}
-
-RingReducer::RingReducer()
-    : col_ctx_(nullptr),
-      col_params_(nullptr),
-      done_(nullptr),
-      group_size_(-1),
-      num_subdivs_(-1) {}
 
 RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); }
 
-Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
-  if (col_params->instance.shape.num_elements() == 0) {
-    return errors::Internal("shape in CollectiveParams should be non-empty");
-  }
-  const int kAvgDevPerTask =
-      col_params->group.group_size / col_params->group.num_tasks;
-  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
-  if (kMaxNumSubdivs <= 0) {
-    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
-                            " in RingReducer");
-  }
-  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
-  // as many offsets as needed so that the size of tensor chunks <=
-  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
-  // lead to worse performance.
-  int num_subdivs = 0;
-  const size_t tensor_size = col_params->instance.shape.num_elements() *
-                             DataTypeSize(col_params->instance.data_type);
-  size_t chunk_size;
-  do {
-    ++num_subdivs;
-    int num_chunks = col_params->group.group_size * num_subdivs;
-    chunk_size = tensor_size / num_chunks;
-    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
-            << " chunk_size " << chunk_size;
-  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
-  if (num_subdivs <= 0) {
-    return errors::Internal("Unexpected num_subdivs ", num_subdivs,
-                            " in RingReducer");
-  }
-
-  int subdiv_stride = kAvgDevPerTask / num_subdivs;
-  if (subdiv_stride == 0) subdiv_stride = 1;
-  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
-  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
-    int subdiv_offset = subdiv_stride * sdi;
-    if (sdi % 2 == 1) subdiv_offset *= -1;
-    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
-  }
-
-  if (VLOG_IS_ON(2)) {
-    string subdiv_buf;
-    for (const int subdiv_offset :
-         col_params->instance.impl_details.subdiv_offsets) {
-      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
-    }
-    VLOG(2) << "Dynamically generated " << num_subdivs
-            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
-            << tensor_size << " chunk_size " << chunk_size;
-  }
-
-  return Status::OK();
-}
-
 Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
   // TODO(b/113171733): change CHECKs to return errors.
   CHECK_EQ(col_params->instance.type, REDUCTION_COLLECTIVE);
   CHECK_EQ(col_params->instance.impl_details.collective_name, "RingReduce");
-  const string& device_name =
-      col_params->instance.device_names[col_params->default_rank];
-  // Each subdiv permutation is a ring formed by rotating each
-  // single-task subsequence of devices by an offset.  This makes most
-  // sense when each task has the same number of devices but we can't
-  // depend on that being the case so we'll compute something that
-  // works in any case.
-
-  // Start by counting the devices in each task.
-  // Precondition: device_names must be sorted so that all devices in
-  // the same task are adjacent.
-  VLOG(2) << "Sorted task names: "
-          << str_util::Join(col_params->instance.task_names, ", ");
-  std::vector<int> dev_per_task;
-  const string* prior_task_name = &col_params->instance.task_names[0];
-  int dev_count = 1;
-  for (int di = 1; di < col_params->group.group_size; ++di) {
-    if (col_params->instance.task_names[di] != *prior_task_name) {
-      dev_per_task.push_back(dev_count);
-      dev_count = 1;
-      prior_task_name = &col_params->instance.task_names[di];
-    } else {
-      ++dev_count;
-    }
-  }
-  dev_per_task.push_back(dev_count);
-  CHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
-
-  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
-    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
-  }
-
-  // Generate a ring permutation for requested offset.
-  VLOG(2) << "Setting up perms for col_params " << col_params
-          << " subdiv_permutations "
-          << &col_params->instance.impl_details.subdiv_permutations;
-  col_params->instance.impl_details.subdiv_permutations.resize(
-      col_params->instance.impl_details.subdiv_offsets.size());
-  col_params->subdiv_rank.resize(
-      col_params->instance.impl_details.subdiv_offsets.size(), -1);
-  for (int sdi = 0;
-       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
-    std::vector<int>& perm =
-        col_params->instance.impl_details.subdiv_permutations[sdi];
-    CHECK_EQ(perm.size(), 0);
-    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
-    // A negative subdivision offset is interpreted as follows:
-    //  1. Reverse the local device ordering.
-    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
-    bool reverse = false;
-    if (offset < 0) {
-      offset = abs(offset);
-      reverse = true;
-    }
-    int prior_dev_count = 0;  // sum over prior worker device counts
-    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
-      for (int di = 0; di < dev_per_task[ti]; ++di) {
-        int di_offset = (di + offset) % dev_per_task[ti];
-        int offset_di =
-            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
-        // Device index in global subdivision permutation.
-        int permuted_di = prior_dev_count + offset_di;
-        int rank = static_cast<int>(perm.size());
-        perm.push_back(permuted_di);
-        if (col_params->instance.device_names[permuted_di] == device_name) {
-          CHECK_EQ(permuted_di, col_params->default_rank);
-          col_params->subdiv_rank[sdi] = rank;
-        }
-      }
-      prior_dev_count += dev_per_task[ti];
-    }
-    CHECK_EQ(col_params->group.group_size, perm.size());
-  }
-
-  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
-  return Status::OK();
-}
-
-Status RingReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
-  CHECK(col_ctx->dev_mgr);
-  col_ctx_ = col_ctx;
-  col_params_ = &col_ctx->col_params;
-  return collective_util::InitializeDeviceAndLocality(
-      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
-      &col_ctx->device_locality);
+  return RingAlg::InitializeCollectiveParams(col_params);
 }
 
 void RingReducer::Run(StatusCallback done) {
@@ -303,25 +106,6 @@ void RingReducer::Run(StatusCallback done) {
   ContinueAfterInputCopy();
 }
 
-string RingReducer::TensorDebugString(const Tensor& tensor) {
-  const DeviceBase::GpuDeviceInfo* gpu_device_info =
-      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
-  if (gpu_device_info) {
-    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
-    Notification note;
-    gpu_device_info->default_context->CopyDeviceTensorToCPU(
-        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
-        [&note](const Status& s) {
-          CHECK(s.ok());
-          note.Notify();
-        });
-    note.WaitForNotification();
-    return cpu_tensor.SummarizeValue(64);
-  } else {
-    return tensor.SummarizeValue(64);
-  }
-}
-
 // Note that this function is blocking and must not run in any thread
 // which cannot be blocked.
 void RingReducer::ContinueAfterInputCopy() {
@@ -358,201 +142,16 @@ void RingReducer::ContinueAfterInputCopy() {
   Finish(RunAsyncParts());
 }
 
-void RingReducer::StartAbort(const Status& s) {
-  // In abort mode we stop issuing additional ProvideBuf
-  // and ConsumeBuf calls, but we need to wait for all of the
-  // outstanding callbacks to be invoked before quitting.
-  bool abort_started = false;
-  {
-    mutex_lock l(status_mu_);
-    if (status_.ok()) {
-      LOG(ERROR) << "Aborting RingReduce with " << s;
-      abort_started = true;
-      status_.Update(s);
-    }
-  }
-  // If this is the initial entry to abort mode then invoke StartAbort
-  // on the CollectiveExecutor that invoked us.  That should start
-  // cancellation on all of the outstanding CollectiveRemoteAccess
-  // actions.
-  if (abort_started) {
-    col_ctx_->col_exec->StartAbort(s);
-  }
-}
-
-void RingReducer::Finish(bool ok) {
-  if (ok) {
-    // Recover the output from the adaptor.
-    ca_->ConsumeFinalValue(col_ctx_->output);
-  }
-  Status s;
-  {
-    mutex_lock l(status_mu_);
-    s = status_;
-  }
-  rfv_.clear();  // Give up Refs on output tensor.
-  done_(s);
-}
-
-RingReducer::SubContext::SubContext(OpKernelContext* ctx,
-                                    OpKernelContext::Params* params,
-                                    OpKernel* op, Tensor* output, Tensor* input)
-    : sub_params_(*params),
-      sub_inputs_({output, input}),
-      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
-      sub_input_dc_(
-          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
-  sub_params_.op_kernel = op;
-  sub_params_.inputs = &sub_inputs_;
-  sub_params_.input_alloc_attrs = &sub_input_attr_;
-  sub_params_.input_device_contexts = &sub_input_dc_;
-  sub_params_.eigen_gpu_device = nullptr;
-  sub_params_.ensure_eigen_gpu_device();
-  sub_params_.forward_from_array = &forward_from_;
-  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
-}
-
-Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                                 Tensor* input) {
-  // Prepare an OpKernelContext that is identical to that of the original Op
-  // (i.e. the collective), except for the input output sizes and identities and
-  // the Op itself.
-  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
-  // mostly identical inside one device execution.
-  std::unique_ptr<SubContext> sub_ctx(
-      new SubContext(col_ctx_->op_ctx, col_ctx_->op_params, op, output, input));
-  device->Compute(op, sub_ctx->sub_ctx_);
-  return sub_ctx->sub_ctx_->status();
-}
-
-// At the beginning of the algorithm initialize a RingField struct for
-// every independent field of the tensor.
 void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
                                 int field_idx) {
-  // Note on field indexing: There are group_size_ devices in the
-  // instance, implying the same number of chunks per tensor, where a
-  // chunk is the unit of data transferred in a time step.  However, if
-  // a device can simultaneously send data by 2 or more independent
-  // channels we can speed up the transfer by subdividing chunks and
-  // processing multiple subdivisions at once.  So the actual number
-  // of RingFields is group_size_ * num_subdivs_.
-  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
-  rf->chunk_idx = chunk_idx;
-  rf->subdiv_idx = subdiv_idx;
-  rf->sc_idx = field_idx;
-  rf->rank = col_params_->subdiv_rank[subdiv_idx];
-  rf->second_pass = false;
-  rf->action = RF_INIT;
-  // Recv from the device with preceding rank within the subdivision.
-  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  rf->recv_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][recv_from_rank];
-  int send_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][send_to_rank];
-  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
-  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 0 we skip Recv when rank = chunk_idx
-    rf->do_recv = (rf->chunk_idx != rf->rank);
-    // In pass 0 we skip Send when rank = chunk_idx-1
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  if (rf->do_send || rf->do_recv) {
-    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
-    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
-  }
+  RingAlg::InitRingField(rf, chunk_idx, subdiv_idx, field_idx);
   if (rf->do_recv) {
     rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
-    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
   }
-  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
-          << ca_->TBounds(rf->chunk);
-}
-
-// When a RingField transitions from first to second recompute the
-// do_send and do_recv values.
-void RingReducer::AdvanceToSecondPass(RingField* rf) {
-  VLOG(3) << "IncrRingField old value " << rf->DebugString();
-  CHECK(!rf->second_pass);
-  rf->second_pass = true;
-  rf->action = RF_INIT;
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 1 the send/no-send boundary moves down 1 place.
-    rf->do_recv =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  VLOG(3) << "IncrRingField new value " << rf->DebugString();
-}
-
-string RingReducer::RingField::DebugString() const {
-  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
-                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
-                              " action=", action);
-  strings::StrAppend(&rv, " pass=", second_pass);
-  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
-                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
-                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
-  return rv;
-}
-
-void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_send);
-  string send_buf_key = RingReduceBufKey(col_ctx_->exec_key, rf->second_pass,
-                                         rf->sc_idx, rf->rank);
-  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
-          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
-          << rf->sc_idx;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  int send_to_dev_idx = col_params_->instance.impl_details
-                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
-  col_ctx_->col_exec->PostToPeer(
-      col_params_->instance.device_names[send_to_dev_idx],
-      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
-      col_ctx_->device_locality, done);
-}
-
-void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_recv);
-  string recv_buf_key =
-      RingReduceBufKey(col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
-                       (rf->rank + (group_size_ - 1)) % group_size_);
-  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
-          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
-          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
-  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
-                           ? &rf->tmp_chunk
-                           : &rf->chunk;
-  col_ctx_->col_exec->RecvFromPeer(
-      col_params_->instance.device_names[rf->recv_dev_idx],
-      col_params_->instance.task_names[rf->recv_dev_idx],
-      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
-      col_ctx_->device_locality, rf->subdiv_idx, done);
-}
-
-string RingReducer::FieldState() {
-  string s = strings::StrCat(
-      "RingReducer ", strings::Hex(reinterpret_cast<uint64>(this)), " exec ",
-      col_ctx_->exec_key, " step_id=", col_ctx_->step_id, " state of all ",
-      rfv_.size(), " fields:");
-  for (int i = 0; i < rfv_.size(); ++i) {
-    s.append("\n");
-    s.append(rfv_[i].DebugString());
-  }
-  return s;
 }
 
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
 bool RingReducer::RunAsyncParts() {
   // This function orchestrates RingReduce actions on behalf of a
   // single device. It is entered by a blockable thread that
@@ -632,9 +231,9 @@ bool RingReducer::RunAsyncParts() {
           --recv_pending_count;
           if (!rf->second_pass) {
             rf->action = RF_REDUCE;
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->merge_op.get(),
-                             &rf->chunk, &rf->tmp_chunk);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->merge_op.get(), &rf->chunk, &rf->tmp_chunk);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
@@ -647,9 +246,9 @@ bool RingReducer::RunAsyncParts() {
           if (!rf->second_pass && col_params_->final_op.get() && rf->is_final) {
             rf->action = RF_FINALIZE;
             group_size_tensor_ready_.WaitForNotification();
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->final_op.get(),
-                             &rf->chunk, &group_size_tensor_);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->final_op.get(), &rf->chunk, &group_size_tensor_);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
index 0848e37b5225b16a82e19943a3bcc57148fd744c..a681fabd2bdf1c7e3765ede3098ebb5bf596a881 100644
--- a/tensorflow/core/common_runtime/ring_reducer.h
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -21,122 +21,36 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
 #include "tensorflow/core/framework/collective.h"
 
 namespace tensorflow {
 class Device;
 
 // Ring-algorithm implementation of collective all-reduce.
-class RingReducer : public CollectiveImplementationInterface {
+class RingReducer : public RingAlg {
  public:
-  RingReducer();
+  RingReducer() : RingAlg(REDUCTION_COLLECTIVE, "Reduce") {}
   ~RingReducer() override;
 
-  // Establishes the requested number of subdivision permutations based on the
-  // ring order implicit in the device order.
-  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
-
-  // Initializes members of CollectiveContext not yet initialized, i.e. device
-  // and device_locality.  Also saves the CollectiveContext in this object.
-  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
-
   // Begins async execution of the ring reduce algorithm.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
   // collective threadpool.
   void Run(StatusCallback done) override;
 
- private:
-  // Called when a bad status is received that implies we should terminate
-  // execution and return a bad status.
-  void StartAbort(const Status& s);
-  void ContinueAfterInputCopy();
-  void Finish(bool ok);
-  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                      Tensor* input);
-  bool RunAsyncParts();
-
-  // Used for executing a sub-operation, e.g. a merge_op instance, with
-  // an OpKernelContext based on the one passed into this Op.
-  class SubContext {
-   public:
-    OpKernelContext::Params sub_params_;
-    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
-    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
-    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
-    // Used only for Binary and Unary Ops for which we require
-    // the calculation to be in-place on the first input.
-    int forward_from_ = 0;
-    OpKernelContext* sub_ctx_;
-    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
-               OpKernel* op, Tensor* output, Tensor* input);
-    ~SubContext() { delete sub_ctx_; }
-  };
-
-  // Current status of a RingField
-  enum RingFieldAction {
-    RF_INIT = 0,    // Just initialized for a pass
-    RF_RECV,        // Recv pending
-    RF_REDUCE,      // Reduce pending
-    RF_FINALIZE,    // FinalOp pending
-    RF_SEND_READY,  // Ready to send
-    RF_SEND,        // Send pending
-    RF_DONE,        // No more work
-  };
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
 
-  // Tracks progress of actions on a single subfield of the entire tensor.
-  struct RingField {
-    int16 chunk_idx;     // major division index
-    int16 subdiv_idx;    // minor division index
-    int16 sc_idx;        // subchunk index
-    int16 rank;          // rank within subdiv permutation
-    int16 recv_dev_idx;  // dev from which value should be recv'd
-    RingFieldAction action;
-    bool second_pass;
-    bool recv_is_remote = false;
-    bool send_is_remote = false;
-    bool do_send = false;   // is the value sent in this pass?
-    bool do_recv = false;   // is the value recv'd in this pass?
-    bool is_final = false;  // is the last field in the pass for this rank
-    Tensor chunk;           // alias to field values
-    Tensor tmp_chunk;
-    Status status;
-    string DebugString() const;
-  };
-  void AdvanceToSecondPass(RingField* rf);
+ protected:
   void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
-                     int field_idx);
-  void DispatchSend(RingField* rf, const StatusCallback& done);
-  void DispatchRecv(RingField* rf, const StatusCallback& done);
-
-  // For constructing log messages for debugging.
-  string FieldState();
-  string TensorDebugString(const Tensor& tensor);
-
-  // Producer/Consumer Queue of RingField structs.
-  class PCQueue {
-   public:
-    void Enqueue(RingField* rf);
-    RingField* Dequeue();
+                     int field_idx) override;
 
-   private:
-    mutex pcq_mu_;
-    condition_variable cv_;
-    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
-    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
-  };
+ private:
+  void ContinueAfterInputCopy();
+  bool RunAsyncParts();
 
-  CollectiveContext* col_ctx_;          // Not owned
-  const CollectiveParams* col_params_;  // Not owned
-  StatusCallback done_;
-  int group_size_;
-  int num_subdivs_;
   Tensor group_size_tensor_;
   Notification group_size_tensor_ready_;
-  std::unique_ptr<CollectiveAdapter> ca_;
-  mutex status_mu_;
-  Status status_ GUARDED_BY(status_mu_);
-  std::vector<RingField> rfv_;
 
   friend class RingReducerTest;
 };
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 7feb29a6dbbb17d73967344ad07db9d234411840..7f18cdb5e2caec7690c8f96c6deb32319acb2e10 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -335,19 +335,20 @@ class RingReducerTest : public ::testing::Test {
           note.WaitForNotification();
         }
 
+        auto alias = actual.template unaligned_flat<T>();
         for (int i = 0; i < tensor_len; ++i) {
           switch (dtype) {
             case DT_FLOAT:
-              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_DOUBLE:
-              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_INT32:
             case DT_INT64:
-              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             default:
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 9488a447789e67f3a9e73af43a0f3a849457e51f..8f28d2790358456df1414ba201d58e29e80221c9 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/eval_const_tensor.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/core/common_runtime/shared_counter.h b/tensorflow/core/common_runtime/shared_counter.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e378524b203b1b1089fe6836a57d1effb961db5
--- /dev/null
+++ b/tensorflow/core/common_runtime/shared_counter.h
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
+
+namespace tensorflow {
+// A lightweight thread-safe monotone counter for establishing
+// temporal ordering.
+class SharedCounter {
+ public:
+  int64 get() { return value_; }
+  int64 next() { return ++value_; }
+
+ private:
+  std::atomic<int64> value_{0};
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.cc b/tensorflow/core/common_runtime/single_threaded_cpu_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1d716ce9f05b9036a6b9b9098ff66965dd69de2
--- /dev/null
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.cc
@@ -0,0 +1,92 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+namespace {
+
+static constexpr int kNumThreads = 1;
+
+thread::ThreadPool* GraphRunnerThreadPool() {
+  static thread::ThreadPool* thread_pool =
+      new thread::ThreadPool(Env::Default(), "graph_runner", kNumThreads);
+  return thread_pool;
+}
+
+// A simple single-threaded CPU device. This can be used to run inexpensive
+// computations. In particular, using this avoids initializing the global thread
+// pools in LocalDevice.
+class SingleThreadedCpuDevice : public Device {
+ public:
+  explicit SingleThreadedCpuDevice(Env* env)
+      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
+                                                  Bytes(256 << 20),
+                                                  DeviceLocality())) {
+    eigen_worker_threads_.num_threads = kNumThreads;
+    eigen_worker_threads_.workers = GraphRunnerThreadPool();
+    eigen_threadpool_wrapper_.reset(
+        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    eigen_device_.reset(new Eigen::ThreadPoolDevice(
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
+    set_eigen_cpu_device(eigen_device_.get());
+  }
+
+  ~SingleThreadedCpuDevice() override {
+    eigen_threadpool_wrapper_.reset();
+    eigen_device_.reset();
+  }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    Tensor parsed(tensor_proto.dtype());
+    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
+    }
+    *tensor = parsed;
+    return Status::OK();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return cpu_allocator();
+  }
+
+ private:
+  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+};
+
+}  // namespace
+
+Device* NewSingleThreadedCpuDevice(Env* env) {
+  return new SingleThreadedCpuDevice(env);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
index 22650b0d831222b077a8f66b6af4a13683dc4666..0433987e8016d0d9d25d4c06030e734a757cbceb 100644
--- a/tensorflow/core/common_runtime/single_threaded_cpu_device.h
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -16,67 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
 
-#define EIGEN_USE_THREADS
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-
 namespace tensorflow {
 
+class Device;
 class Env;
 
-// A simple single-threaded CPU device. This can be used to run inexpensive
-// computations. In particular, using this avoids initializing the global thread
-// pools in LocalDevice.
-class SingleThreadedCpuDevice : public Device {
- public:
-  SingleThreadedCpuDevice(Env* env)
-      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
-                                                  Bytes(256 << 20),
-                                                  DeviceLocality())) {
-    eigen_worker_threads_.num_threads = 1;
-    eigen_worker_threads_.workers = new thread::ThreadPool(
-        env, "graph_runner", eigen_worker_threads_.num_threads);
-    eigen_threadpool_wrapper_.reset(
-        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
-    eigen_device_.reset(new Eigen::ThreadPoolDevice(
-        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
-    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
-    set_eigen_cpu_device(eigen_device_.get());
-  }
-
-  ~SingleThreadedCpuDevice() override {
-    eigen_threadpool_wrapper_.reset();
-    eigen_device_.reset();
-    delete eigen_worker_threads_.workers;
-  }
-
-  Status Sync() override { return Status::OK(); }
-
-  Status MakeTensorFromProto(const TensorProto& tensor_proto,
-                             const AllocatorAttributes alloc_attrs,
-                             Tensor* tensor) override {
-    Tensor parsed(tensor_proto.dtype());
-    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
-      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
-    }
-    *tensor = parsed;
-    return Status::OK();
-  }
-
-  Allocator* GetAllocator(AllocatorAttributes attr) override {
-    return cpu_allocator();
-  }
-
- private:
-  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
-  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
-  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
-};
+// Returns a simple single-threaded CPU device. This can be used to run
+// inexpensive computations. In particular, using this avoids initializing the
+// global thread pools in LocalDevice.
+//
+// The returned pointer is owned by the caller.
+Device* NewSingleThreadedCpuDevice(Env* env);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index 49265445659ff1daa30b632f60c03845d4a6a7f7..318cfec21a8be19bdad362c45b11398c33438bbb 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -176,9 +176,10 @@ void NodeExecStatsWrapper::AddAllocation(
   memory->set_peak_bytes(std::get<1>(sizes));
   memory->set_live_bytes(std::get<2>(sizes));
 
-  AllocatorStats stats;
-  allocator->GetStats(&stats);
-  memory->set_allocator_bytes_in_use(stats.bytes_in_use);
+  absl::optional<AllocatorStats> stats = allocator->GetStats();
+  if (stats) {
+    memory->set_allocator_bytes_in_use(stats->bytes_in_use);
+  }
   allocations_.push_back(std::make_pair(memory, tracking_allocator));
 }
 
@@ -409,6 +410,21 @@ void StepStatsCollector::Save(const string& device,
   }
 }
 
+void StepStatsCollector::SaveThreadName(const string& device,
+                                        const uint32 thread_id,
+                                        const string& thread_name) {
+  VLOG(1) << "Save dev " << device << " thread id " << thread_id << " name "
+          << thread_name;
+  {
+    mutex_lock l(mu_);
+    if (finalized_) {
+      LOG(WARNING) << "thread_name saved after finalize will not be collected.";
+    }
+    auto& thread_names_map = thread_names_[device];
+    thread_names_map[thread_id] = thread_name;
+  }
+}
+
 NodeExecStatsInterface* StepStatsCollector::CreateNodeExecStats(
     const Node* node) {
   // Only collect statistics for non-transfer nodes.
@@ -531,5 +547,15 @@ void StepStatsCollector::FinalizeInternal() {
       stats->stats()->Swap(dss->add_node_stats());
     }
   }
+  for (const auto& device_thread : thread_names_) {
+    if (dev_stats_pb.find(device_thread.first) == dev_stats_pb.end()) {
+      // skip device without DeviceStepStats.
+      continue;
+    }
+    DeviceStepStats* dss = dev_stats_pb.at(device_thread.first);
+    for (const auto& thread_name : device_thread.second) {
+      (*dss->mutable_thread_names())[thread_name.first] = thread_name.second;
+    }
+  }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 7d34383ce8209c9f4b889410a96bce02f6702a64..dfcc51ff4c79b386c327dcf9503c7ee35b20d2c4 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -175,6 +175,10 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   void Save(const string& device, NodeExecStats* node_stats_pb);
   void Save(const string& device, NodeExecStatsWrapper* node_stats);
 
+  // Saves thread name.
+  void SaveThreadName(const string& device, const uint32 thread_id,
+                      const string& thread_name);
+
   NodeExecStatsInterface* CreateNodeExecStats(const Node* node) override;
   string ReportAllocsOnResourceExhausted(const string& err) override;
 
@@ -191,12 +195,14 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   static const uint64 kMaxCollectedNodes = 1 << 20;
 
   typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector;
+  typedef std::unordered_map<uint32, string> ThreadNamesMap;
 
   void FinalizeInternal() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   bool finalized_ GUARDED_BY(mu_);
   std::unordered_map<string, NodeStatsVector> dev_stats_ GUARDED_BY(mu_);
+  std::unordered_map<string, ThreadNamesMap> thread_names_ GUARDED_BY(mu_);
   StepStats* step_stats_ GUARDED_BY(mu_);
   uint64 collected_nodes_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 591c22b8f625554acfe25d744cb53998f551ff29..f8c07dde46caab062b86a934186c39777485f4d0 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -221,11 +221,17 @@ tf_cc_test(
     deps = [
         ":debug_grpc_testlib",
         ":debug_io_utils",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 5fc95a8f20d2b3f1b37a660e17d0efee17aacb94..b69eb1da39e68cc470d0d64c69c28ce1a3f6477c 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -299,7 +299,7 @@ Status DebugNodeInserter::CreateCopyNode(
 
   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
                      .Input(src_node_name, src_output, src_dt)
-                     .Attr("debug_ops_spec", std::move(debug_ops_spec));
+                     .Attr("debug_ops_spec", debug_ops_spec);
 
   if (!builder.Finalize(&node_def).ok()) {
     return Status(
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index f70931e926507c72287588da278a3b8d6bb19122..4927caf5a3285a3855d27b614bf597943059e2fb 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 6994dec3b59d997650e07ba9a6fd14233022b201..ebcb046003437eb9fab452c5337204cb249c510c 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -730,7 +731,7 @@ Status DebugGrpcChannel::Connect(const int64 timeout_micros) {
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   channel_ = ::grpc::CreateCustomChannel(
       server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
   if (!channel_->WaitForConnected(
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index 5390ce408aabf32e483900699826c3d496265ee6..842234d433bfe1aaa0175d2e5e27533145265d3f 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -253,6 +253,7 @@ struct hash<::tensorflow::DebugNodeKey> {
 // TODO(cais): Support grpc:// debug URLs in open source once Python grpc
 //   genrule becomes available. See b/23796275.
 #ifndef PLATFORM_WINDOWS
+#include "grpcpp/channel.h"
 #include "tensorflow/core/debug/debug_service.grpc.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 82e0ae5edb1eccd35c7c76da0a8a2ee9ea12d9fd..0926a82fade31904376fac277273b20b13367167 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_node_key.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index e388d3e6f0f5636c044c36ee03c826f1872cac9f..af744ce790bf5d39895a41fff8f77650f7adc19a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -17,7 +17,6 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 
 # For platform specific build config
@@ -298,6 +297,7 @@ cc_library(
     deps = [
         ":call_options",
         ":message_wrappers",
+        ":request_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
     ],
@@ -311,6 +311,7 @@ cc_library(
         ":call_options",
         ":master_env",
         ":master_session",
+        ":recent_request_ids",
         ":remote_device",
         ":worker_cache",
         ":worker_interface",
@@ -425,7 +426,6 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
@@ -587,13 +587,17 @@ tf_cc_test(
         ":collective_param_resolver_distributed",
         ":device_resolver_distributed",
         ":test_utils",
+        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:collective_ops",
     ],
 )
 
@@ -640,13 +644,14 @@ tf_cuda_cc_test(
         "manual",  # TODO(b/27683709): Re-enable when not flaky.
         "notap",  # TODO(b/27683709): Re-enable when not flaky.
         "noguitar",  # TODO(b/27683709): Re-enable when not flaky.
-        "nooss",  # TODO(b/27683709): Re-enable when not flaky.
+        "no_oss",  # TODO(b/27683709): Re-enable when not flaky.
     ],
     deps = [
         ":master",
         ":remote_device",
         ":worker_interface",
         "//tensorflow:grpc++",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -655,6 +660,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -712,9 +718,14 @@ tf_cuda_cc_test(
     tags = tf_cuda_tests_tags(),
     deps = [
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
@@ -754,6 +765,7 @@ cc_library(
     srcs = ["recent_request_ids.cc"],
     hdrs = ["recent_request_ids.h"],
     deps = [
+        ":message_wrappers",
         "//tensorflow/core:lib",
         "//tensorflow/core:worker_proto_cc",
     ],
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index de6e4b4a7c51379f6492314de3dc8c69f424c769..a642313275d01f2575575fcb17de8a496cf09239 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -293,8 +293,11 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
                                      const Rendezvous::Args& recv_args,
                                      DoneCallback done) {
   VLOG(1) << "RemoteRendezvous Recv " << this << " " << parsed.FullKey();
-  CHECK(is_initialized()) << "RecvAsync called when uninitialized.";
   Status s = ValidateDevices(parsed, false /*!is_src*/);
+  if (s.ok() && !is_initialized()) {
+    s.Update(errors::Internal(
+        "RecvAsync called when uninitialized (key:", parsed.FullKey(), ")."));
+  }
   if (!s.ok()) {
     done(s, Args(), recv_args, Tensor(), false);
     return;
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index 1dd10d309b5f5acad2acab660aa709a9c0e9751d..443759ab740b99860e9d50a6a112a2a054d39f1a 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -82,15 +82,21 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
     DeviceResolverDistributed* dev_resolver, WorkerCacheInterface* worker_cache,
     const string& task_name)
-    : CollectiveParamResolverLocal(dev_mgr, dev_resolver, task_name),
+    : CollectiveParamResolverLocal(config, dev_mgr, dev_resolver, task_name),
       worker_cache_(worker_cache),
       group_leader_(task_name == config.experimental().collective_group_leader()
                         ? ""
-                        : config.experimental().collective_group_leader()) {}
+                        : config.experimental().collective_group_leader()) {
+  VLOG(1) << "CompleteParamResolverDistributed ctor task={" << task_name
+          << "} config.collective_group_leader={"
+          << config.experimental().collective_group_leader() << "}";
+}
 
 void CollectiveParamResolverDistributed::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
+  VLOG(1) << "CompleteParams distributed " << device << " for " << cp << ": "
+          << cp->ToString();
   CompleteGroupDistributed(device, cp, cancel_mgr,
                            [this, device, cp, cancel_mgr, done](
                                const Status& s, const GroupRec* gr) {
@@ -181,6 +187,10 @@ void CollectiveParamResolverDistributed::CompleteInstanceAsync(
                           ir->WaitForOutMu(l);
                           response->set_instance_key(cp->instance.instance_key);
                           response->set_source_rank(ir->source_rank);
+                          if (!cp->instance.communicator_key.empty()) {
+                            response->set_communicator_key(
+                                cp->instance.communicator_key);
+                          }
                           done_and_cleanup(fi_status);
                         } else {
                           done_and_cleanup(fi_status);
@@ -283,8 +293,10 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
   using InstanceRecPointer = InstanceRec*;
   InstanceRecPointer* irp = new InstanceRecPointer(nullptr);
   int32 source_rank = resp.source_rank();
+  string communicator_key = resp.communicator_key();
 
-  auto continue_with_ir = [this, cp, irp, source_rank, done](const Status& s) {
+  auto continue_with_ir = [cp, irp, source_rank, communicator_key,
+                           done](const Status& s) {
     if (!s.ok()) {
       done(s);
       delete irp;
@@ -306,6 +318,19 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
         }
         ir->source_rank = source_rank;
       }
+      if (ir->communicator_key != communicator_key) {
+        if (!ir->communicator_key.empty()) {
+          ir->status = errors::Internal(
+              "UpdateInstanceCache: CompleteInstanceResponse for instance ",
+              cp->instance.instance_key,
+              " gives communicator_key with size =", communicator_key.size(),
+              " but cache already holds communicator_key with size=",
+              ir->communicator_key.size());
+          status = ir->status;
+          break;
+        }
+        ir->communicator_key = communicator_key;
+      }
       if (ir->known_count < cp->group.group_size) {
         ir->known_count = cp->group.group_size;
         if (ir->known.size() != cp->group.group_size) {
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 40b18d321a1cb3fafeaa4b864e737f6d86695842..5cd75a3100e5ec444e4bf96cadd010c9239e3fbd 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -107,7 +107,6 @@ class FakeCache : public TestWorkerCache {
     WorkerInterface* wi = it->second;
     GetStatusRequest req;
     GetStatusResponse resp;
-    Notification note;
     Status status = wi->GetStatus(&req, &resp);
     if (!status.ok()) {
       done(status);
@@ -268,6 +267,8 @@ class DeviceResDistTest : public ::testing::Test {
         EXPECT_EQ(cp_[idx].instance.device_names[idx], device_name);
         EXPECT_EQ(cp_[idx].instance.task_names[idx], task_name);
         if (idx > 0) {
+          EXPECT_EQ(cp_[0].instance.communicator_key,
+                    cp_[idx].instance.communicator_key);
           for (int i = 0; i < dev_count; ++i) {
             EXPECT_EQ(cp_[0].instance.device_names[i],
                       cp_[idx].instance.device_names[i]);
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
index 26f722a6bd4104b2dc264c2946bc5b5656b0fb32..cac17ab51ab705375239c9b22c4bc3ca6c8c95ee 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
@@ -147,7 +147,6 @@ class FakeCache : public TestWorkerCache {
     WorkerInterface* wi = it->second;
     GetStatusRequest req;
     GetStatusResponse resp;
-    Notification note;
     Status status = wi->GetStatus(&req, &resp);
     if (!status.ok()) {
       done(status);
@@ -271,7 +270,6 @@ TEST_F(CollRMADistTest, ProdFirstOK) {
         producer_status.Update(s);
         producer_note.Notify();
       });
-  Status status;
   Device* dst_device = nullptr;
   string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
@@ -300,7 +298,6 @@ TEST_F(CollRMADistTest, ConsFirstOK) {
   Status producer_status;
   FakeWorker* wi = workers_[1];
   const string kBufKey = "fake_buf_key";
-  Status status;
   Device* dst_device = nullptr;
   string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
@@ -333,7 +330,6 @@ TEST_F(CollRMADistTest, ConsFirstAbort) {
   Notification consumer_note;
   Status consumer_status;
   const string kBufKey = "fake_buf_key";
-  Status status;
   Device* dst_device = nullptr;
   string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
index 842a2b3b058b8c55bec0c07816c1305ed9a2f305..0bc370c93b5925a295e1e4f4f4a81e5f9fecdb1a 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
@@ -109,7 +109,6 @@ class FakeCache : public TestWorkerCache {
     WorkerInterface* wi = it->second;
     GetStatusRequest req;
     GetStatusResponse resp;
-    Notification note;
     Status status = wi->GetStatus(&req, &resp);
     if (!status.ok()) {
       done(status);
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 1065f021a1b0f97dc955e2b00ff333976575b519..144113a04309d8dde8b8eebdce4485f828732595 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -356,6 +356,12 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
   Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = RecvOutputsFromRendezvous(rendezvous, out, Rendezvous::Args());
   rendezvous->Unref();
+  if (!s.ok()) {
+    // Failing to fetch the outputs should not be possible, so rewrite the error
+    // status to an INTERNAL error.
+    s = errors::Internal("Failed to fetch outputs for step ", step_id,
+                         ". (Original error message: ", s.ToString(), ")");
+  }
   return s;
 }
 
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 269f620e42e61b67477f9d73336a6e8da63b2eff..fc8d2871ac770bcea9104a206acabcd44ebde77f 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -65,7 +65,8 @@ Master::Master(MasterEnv* env, double session_gc_seconds)
     : env_(env),
       last_1000_steps_(1000),
       step_count_(0),
-      session_gc_seconds_(session_gc_seconds) {
+      session_gc_seconds_(session_gc_seconds),
+      recent_request_ids_(10000) {
   // Right now, a master service must be co-located with a device.
   // Otherwise, fetches do not work.
   CHECK(!env->local_devices.empty());
@@ -510,6 +511,12 @@ void Master::ExtendSession(const ExtendSessionRequest* req,
 
 void Master::PartialRunSetup(const PartialRunSetupRequest* req,
                              PartialRunSetupResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "PartialRunSetup (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -525,6 +532,12 @@ void Master::PartialRunSetup(const PartialRunSetupRequest* req,
 
 void Master::RunStep(CallOptions* opts, const RunStepRequestWrapper* req,
                      MutableRunStepResponseWrapper* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunStep (Master)", req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto start_time = env_->env->NowMicros();
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
@@ -664,6 +677,12 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
 
 void Master::MakeCallable(const MakeCallableRequest* req,
                           MakeCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "MakeCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -681,6 +700,12 @@ void Master::MakeCallable(const MakeCallableRequest* req,
 
 void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
                          RunCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index dbb337fd484960fbd3bfe47d0bfe0497985de66f..0524582ac78846fe192e8de47419280c6dde6177 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/macros.h"
@@ -95,6 +96,9 @@ class Master {
   // closed automatically.
   const double session_gc_seconds_;
 
+  // Used to track ids for incoming requests so we can detect duplicates.
+  RecentRequestIds recent_request_ids_;
+
   // Call CleanupAll on all workers.
   void CleanupWorkers(const ResetRequest& reset);
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index a8ae3cba3cdd3f02aae823d893e027b2bccae2c9..cde47fb9caf55f35db481fec8ae69ad6e6fcd8ed 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/master.pb.h"
@@ -66,7 +67,9 @@ class MasterInterface {
   // The message returned from this method must only be used in a
   // `RunStep()` call on the same `MasterInterface` instance.
   virtual MutableRunStepRequestWrapper* CreateRunStepRequest() {
-    return new MutableProtoRunStepRequest;
+    MutableProtoRunStepRequest* ret = new MutableProtoRunStepRequest;
+    ret->request_.set_request_id(GetUniqueRequestId());
+    return ret;
   }
 
   // Returns a response object for use in calls to
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 48b72fb9483f632012a69a1f3f8bf3e099310fbd..2f14967656fc832550c310d4c2b7821061d11e75 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1103,6 +1103,8 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
       req.options().experimental().collective_graph_key();
   if (config.experimental().collective_deterministic_sequential_execution()) {
     opts->collective_order = GraphCollectiveOrder::kEdges;
+  } else if (config.experimental().collective_nccl()) {
+    opts->collective_order = GraphCollectiveOrder::kAttrs;
   }
 }
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 40bf564cab6fe465ed66639f42fe0daeb149f132..c9bc558964c2e7a704c5e2e9f52a1f794065a7d3 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -97,6 +97,10 @@ bool InMemoryRunStepRequest::store_errors_in_response_body() const {
   return store_errors_in_response_body_;
 }
 
+int64 InMemoryRunStepRequest::request_id() const {
+  return 0;  // no need to track request id for local version.
+}
+
 void InMemoryRunStepRequest::set_store_errors_in_response_body(
     bool store_errors) {
   store_errors_in_response_body_ = store_errors;
@@ -210,6 +214,10 @@ void MutableProtoRunStepRequest::set_store_errors_in_response_body(
   request_.set_store_errors_in_response_body(store_errors);
 }
 
+int64 MutableProtoRunStepRequest::request_id() const {
+  return request_.request_id();
+}
+
 string MutableProtoRunStepRequest::DebugString() const {
   return request_.DebugString();
 }
@@ -272,6 +280,8 @@ bool ProtoRunStepRequest::store_errors_in_response_body() const {
   return request_->store_errors_in_response_body();
 }
 
+int64 ProtoRunStepRequest::request_id() const { return request_->request_id(); }
+
 string ProtoRunStepRequest::DebugString() const {
   return request_->DebugString();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 474ac0e186a203464ff64e1cbea2b4faaf87b05b..2cdbd1bfaf1be1fb646926ae82488f88377d491d 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -87,6 +87,8 @@ class RunStepRequestWrapper {
   // truncate long metadata messages.
   virtual bool store_errors_in_response_body() const = 0;
 
+  virtual int64 request_id() const = 0;
+
   // Returns a human-readable representation of this message for debugging.
   virtual string DebugString() const = 0;
 
@@ -127,6 +129,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -177,6 +180,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -189,6 +193,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
 
  private:
   RunStepRequest request_;
+  friend class MasterInterface;
 };
 
 // Wrapper for immutable RunStep requests that use a non-owned
@@ -216,6 +221,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
  private:
   const RunStepRequest* const request_;  // Not owned.
@@ -234,7 +240,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-// Abstract interface for an immutable RunStepRequest message.
+// Abstract interface for an immutable RunGraphRequest message.
 //
 // This interface is typically used by server-side components in the
 // TensorFlow worker.
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
index 4f6866c5d154ba023b0923af67fe00a7a69b459d..2c953e12c06287e88b1a68bfab48a7234207046a 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -28,12 +28,10 @@ RecentRequestIds::RecentRequestIds(int num_tracked_request_ids)
   set_.reserve(num_tracked_request_ids);
 }
 
-Status RecentRequestIds::TrackUnique(int64 request_id,
-                                     const string& method_name,
-                                     const protobuf::Message& request) {
+bool RecentRequestIds::Insert(int64 request_id) {
   if (request_id == 0) {
     // For backwards compatibility, allow all requests with request_id 0.
-    return Status::OK();
+    return true;
   }
 
   mutex_lock l(mu_);
@@ -43,9 +41,7 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
     // request_id's age in the circular_buffer_ if it's tracked again. Strict
     // LRU is not useful here because returning this error will close the
     // current Session.
-    return errors::Aborted("The same ", method_name,
-                           " request was received twice. ",
-                           request.ShortDebugString());
+    return false;
   }
 
   // Remove the oldest request_id from the set_. circular_buffer_ is
@@ -54,7 +50,30 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
   set_.erase(circular_buffer_[next_index_]);
   circular_buffer_[next_index_] = request_id;
   next_index_ = (next_index_ + 1) % circular_buffer_.size();
-  return Status::OK();
+  return true;
+}
+
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const protobuf::Message& request) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           request.ShortDebugString());
+  }
+}
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const RunStepRequestWrapper* wrapper) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           wrapper->ToProto().ShortDebugString());
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index 11cf937c94659d85e3dc88350f20e107a27fab62..4094fcbde72ae97fcc5655a030fdf69426b093c8 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/distributed_runtime/message_wrappers.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,8 +59,13 @@ class RecentRequestIds {
   // ShortDebugString are added to returned errors.
   Status TrackUnique(int64 request_id, const string& method_name,
                      const protobuf::Message& request);
+  // Overloaded versions of the above function for wrapped protos.
+  Status TrackUnique(int64 request_id, const string& method_name,
+                     const RunStepRequestWrapper* wrapper);
 
  private:
+  bool Insert(int64 request_id);
+
   mutex mu_;
   // next_index_ indexes into circular_buffer_, and points to the next storage
   // space to use. When the buffer is full, next_index_ points at the oldest
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 273709a01fd799f7f4aa8afc80d3bdfc48d36322..57fb5e1509037afe34d337ea58e661ca26f2ca17 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/core:lib",
         # Required to be able to overload TensorResponse parsing.
         "//tensorflow/core/distributed_runtime:tensor_coding",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -159,6 +160,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_response_cache",
+    srcs = ["grpc_response_cache.cc"],
+    hdrs = ["grpc_response_cache.h"],
+    deps = [
+        ":grpc_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_cuda_library(
     name = "grpc_worker_service",
     srcs = ["grpc_worker_service.cc"],
@@ -166,6 +177,7 @@ tf_cuda_library(
     deps = [
         ":async_service_interface",
         ":grpc_call",
+        ":grpc_response_cache",
         ":grpc_tensor_coding",
         ":grpc_util",
         ":grpc_worker_service_impl",
@@ -183,6 +195,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_session",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -313,9 +326,15 @@ tf_cc_binary(
         ":grpc_server_lib",
         "//tensorflow:grpc++",
         "//tensorflow/core:core_cpu",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:data_flow",
     ],
@@ -330,10 +349,14 @@ tf_cc_binary(
     deps = [
         ":grpc_server_lib",
         "//tensorflow:grpc++",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:constant_op",
@@ -384,6 +407,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:local_master",
         "//tensorflow/core/distributed_runtime:master_interface",
         "//tensorflow/core/distributed_runtime:message_wrappers",
+        "//tensorflow/core/distributed_runtime:request_id",
     ],
     alwayslink = 1,
 )
@@ -396,7 +420,7 @@ tf_cuda_cc_tests(
         "rpc_rendezvous_mgr_test.cc",
     ],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     linkstatic = tf_kernel_tests_linkstatic(),
@@ -473,6 +497,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
index 521e0ac4fabc5a28e210ed68bfde0bda81fce737..61b02370819313aeb4c654824513e7eb49777404 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
@@ -27,14 +27,6 @@ limitations under the License.
 
 #include "tensorflow/core/protobuf/eager_service.pb.h"
 
-namespace grpc {
-class CompletionQueue;
-class Channel;
-class RpcService;
-class ServerCompletionQueue;
-class ServerContext;
-}  // namespace grpc
-
 namespace tensorflow {
 namespace eager {
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 781b7d65cdd184363d7c7650305bd62f3129c271..64c221805b072313fc9fba20fa9cdefe8cea9bfc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -62,7 +62,7 @@ Status ValidateHostPortPair(const string& host_port) {
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   if (rpc_options != nullptr) {
     if (rpc_options->compression_algorithm() == "deflate") {
       args.SetCompressionAlgorithm(GRPC_COMPRESS_DEFLATE);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 751f2633e752c26be716f9f7337ec46a17a6e265..6c8af761f25410a56c26c3d3d01054eb8c348d60 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -27,14 +27,6 @@ limitations under the License.
 
 #include "tensorflow/core/protobuf/master.pb.h"
 
-namespace grpc {
-class CompletionQueue;
-class Channel;
-class RpcService;
-class ServerCompletionQueue;
-class ServerContext;
-}  // namespace grpc
-
 namespace tensorflow {
 
 namespace grpc {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index b832a2115cb809b5561fc55ab8d9057f2274dcd8..a84559098a3ad59cd1cabe91bb8546194da105e5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -43,104 +44,139 @@ class GrpcRemoteMaster : public MasterInterface {
   Status CreateSession(CallOptions* call_options,
                        const CreateSessionRequest* request,
                        CreateSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CreateSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CreateSession);
   }
 
   Status ExtendSession(CallOptions* call_options,
                        const ExtendSessionRequest* request,
                        ExtendSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ExtendSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ExtendSession);
   }
 
   Status PartialRunSetup(CallOptions* call_options,
                          const PartialRunSetupRequest* request,
                          PartialRunSetupResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::PartialRunSetup);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::PartialRunSetup);
   }
 
   Status RunStep(CallOptions* call_options, RunStepRequestWrapper* request,
                  MutableRunStepResponseWrapper* response) override {
-    ::grpc::ClientContext ctx;
-    auto trace = TraceRpc("RunStep/Client", &ctx);
-    return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
+    return CallWithRetry(call_options, &request->ToProto(),
+                         get_proto_from_wrapper(response),
+                         &MasterServiceStub::RunStep, "RunStep/Client");
   }
 
   Status CloseSession(CallOptions* call_options,
                       const CloseSessionRequest* request,
                       CloseSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CloseSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CloseSession);
   }
 
   Status ListDevices(CallOptions* call_options,
                      const ListDevicesRequest* request,
                      ListDevicesResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ListDevices);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ListDevices);
   }
 
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::Reset);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::Reset);
   }
 
   Status MakeCallable(CallOptions* call_options,
                       const MakeCallableRequest* request,
                       MakeCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::MakeCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::MakeCallable);
   }
   Status RunCallable(CallOptions* call_options,
                      const RunCallableRequest* request,
                      RunCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::RunCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::RunCallable);
   }
   Status ReleaseCallable(CallOptions* call_options,
                          const ReleaseCallableRequest* request,
                          ReleaseCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ReleaseCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ReleaseCallable);
   }
 
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
-  tracing::ScopedActivity TraceRpc(StringPiece name,
-                                   ::grpc::ClientContext* ctx) {
+  tracing::ScopedActivity* NewTraceRpc(StringPiece name,
+                                       ::grpc::ClientContext* ctx) {
     string trace_id = strings::StrCat(tracing::GetUniqueArg());
     ctx->AddMetadata(GrpcIdKey(), trace_id);
-    return tracing::ScopedActivity(name, trace_id);
-  }
-
-  void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms) {
-    if (time_in_ms > 0) {
-      ctx->set_deadline(gpr_time_from_millis(time_in_ms, GPR_TIMESPAN));
-    }
+    return new tracing::ScopedActivity(name, trace_id);
   }
 
   template <typename Request, typename Response>
-  Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
-              const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
-                                                         const Request&,
-                                                         Response*)) {
-    ctx->set_fail_fast(false);
-    SetDeadline(ctx, call_options->GetTimeout());
-    return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
+  Status CallWithRetry(CallOptions* call_options, const Request* request,
+                       Response* response,
+                       ::grpc::Status (MasterServiceStub::*pfunc)(
+                           ::grpc::ClientContext*, const Request&, Response*),
+                       string trace_string = {}) {
+    int64 timeout_in_ms = call_options->GetTimeout();
+    int64 expired_time_micros = Env::Default()->NowMicros();
+    if (timeout_in_ms > 0) {
+      expired_time_micros += (timeout_in_ms / 1000.);
+    }
+    Status s;
+    for (int num_retries = 0;; ++num_retries) {
+      ::grpc::ClientContext ctx;
+      std::unique_ptr<tracing::ScopedActivity> trace;
+      if (!trace_string.empty()) {
+        trace.reset(NewTraceRpc(trace_string, &ctx));
+      }
+      ctx.set_fail_fast(false);
+      if (timeout_in_ms > 0) {
+        // We do not modify the timeout here to match legacy behavior. However,
+        // this could violate the contract of tensorflow::Session. If we retry
+        // an RPC just before the deadline is exceeded, we will still set the
+        // timeout to the original value. This leads to the overall timeout
+        // being double what was expected.
+        // TODO(b/117162170): investigate fixing this behavior for legacy and
+        // gRPC RPC layers.
+        ctx.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
+      }
+      s = FromGrpcStatus((stub_.get()->*pfunc)(&ctx, *request, response));
+      if (!errors::IsUnavailable(s)) {
+        return s;
+      }
+      // TODO(b/117162170): we may want to make this configurable.
+      constexpr int kMaxRetries = 10;
+      LOG(WARNING) << "RPC failed with status = \"" << s
+                   << "\" and grpc_error_string = \""
+                   << ctx.debug_error_string() << "\", maybe retrying the RPC";
+      if (num_retries >= kMaxRetries) {
+        LOG(WARNING) << "Too many retries, returning last status: " << s;
+        return s;
+      }
+      const int64 now_micros = Env::Default()->NowMicros();
+      const int64 deadline_with_backoff_micros =
+          now_micros + ComputeBackoffMicroseconds(num_retries);
+      // Wait for a short period of time before retrying the RPC.  If our
+      // backoff would put us past the RPC deadline, we truncate it to ensure
+      // our RPC starts before the deadline.
+      const auto backoff_until =
+          (timeout_in_ms <= 0 ||
+           expired_time_micros > deadline_with_backoff_micros)
+              ? deadline_with_backoff_micros
+              : expired_time_micros;
+      Env::Default()->SleepForMicroseconds(backoff_until - now_micros);
+      if (Env::Default()->NowMicros() > expired_time_micros &&
+          timeout_in_ms > 0) {
+        // If timeout_in_ms is set, exit the retry loop on timeout.
+        return errors::DeadlineExceeded(ctx.debug_error_string());
+      }
+    }
   }
 
   std::unique_ptr<MasterServiceStub> stub_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 2daefcb399c79324f80278340967b679be5c6574..2479e7368be8de810db36cb18c887977aeae2472 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -39,6 +39,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+const int kMaxWorkerRpcRetries = 10;
+
 class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
@@ -259,17 +261,19 @@ class GrpcRemoteWorker : public WorkerInterface {
   // given callback, `done`, will be called when the RPC completes.
   void IssueRequest(const protobuf::Message* request,
                     protobuf::Message* response, const ::grpc::string& method,
-                    StatusCallback done, CallOptions* call_opts = nullptr) {
+                    StatusCallback done, CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
                                     std::move(done), call_opts,
-                                    callback_threadpool_);
+                                    callback_threadpool_, max_retries);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
-                    CallOptions* call_opts = nullptr) {
+                    CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
                                  std::move(done), call_opts,
-                                 callback_threadpool_);
+                                 callback_threadpool_, max_retries);
   }
 
   // Helper function for initializing the RpcMethod objects below.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..613c290905d4e8914761b130d9353536023f5856
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
@@ -0,0 +1,183 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+struct WorkerCacheEntry {
+  enum class State {
+    PENDING = 0,
+    ACTIVE = 1,
+    FINISHED = 2,
+  };
+
+  State state = State::PENDING;
+  int64 expires_seconds;
+
+  ::grpc::ByteBuffer response_buf;
+  Status response_status;
+
+  // Additional retries may arrive while a request is still executing.  The
+  // callbacks for these calls are queued in `callbacks` and evaluated after
+  // the original request is completed.
+  std::vector<std::pair<RPCResponse, StatusCallback>> callbacks;
+};
+
+void RPCResponse::Encode(::grpc::ByteBuffer* tgt) const {
+  if (buf_ != nullptr) {
+    *tgt = *buf_;
+  } else {
+    CHECK(msg_ != nullptr);
+    ::grpc::Slice slice(msg_->ByteSizeLong());
+    msg_->SerializeWithCachedSizesToArray(
+        const_cast<uint8*>(reinterpret_cast<const uint8*>(slice.begin())));
+    ::grpc::ByteBuffer tmp(&slice, 1);
+    tgt->Swap(&tmp);
+  }
+}
+
+void RPCResponse::CopyFrom(const ::grpc::ByteBuffer& src) {
+  if (buf_ != nullptr) {
+    *buf_ = src;
+    return;
+  }
+
+  CHECK(msg_ != nullptr);
+  // We create a single slice when encoding protocol messages.
+  std::vector<::grpc::Slice> slices;
+  if (src.Dump(&slices).ok()) {
+    msg_->ParseFromArray(slices[0].begin(), slices[0].size());
+  } else {
+    LOG(ERROR) << "Failed to decode cached buffer.";
+  }
+}
+
+void GrpcResponseCache::LookupOrCompute(const string& key, RPCResponse response,
+                                        ComputeFunc compute_func,
+                                        StatusCallback done_cb) {
+  VLOG(1) << "Lookup " << key;
+  std::shared_ptr<WorkerCacheEntry> req;
+  MaybeCleanup();
+  {
+    mutex_lock m(mu_);
+
+    if (requests_.find(key) != requests_.end()) {
+      req = requests_[key];
+    } else {
+      req.reset(new WorkerCacheEntry);
+      requests_[key] = req;
+    }
+
+    if (req->state == WorkerCacheEntry::State::FINISHED) {
+      if (req->expires_seconds > Env::Default()->NowSeconds()) {
+        VLOG(1) << "Reuse cached response for " << key;
+        response.CopyFrom(req->response_buf);
+        done_cb(req->response_status);
+        return;
+      }
+      VLOG(1) << "Found expired cache entry for " << key;
+      req->state = WorkerCacheEntry::State::PENDING;
+      req->response_buf.Clear();
+    }
+
+    req->callbacks.push_back(std::make_pair(response, done_cb));
+
+    if (req->state == WorkerCacheEntry::State::ACTIVE) {
+      VLOG(1) << "Found active request for " << key
+              << ".  Adding entry to response queue.";
+      return;
+    }
+
+    VLOG(2) << "No cache entry for " << key << ", running user computation.";
+    req->state = WorkerCacheEntry::State::ACTIVE;
+    req->expires_seconds = Env::Default()->NowSeconds() + expire_time_seconds_;
+  }
+
+  compute_func([this, key, req, response](Status status) {
+    mutex_lock m(mu_);
+    response.Encode(&req->response_buf);
+    current_bytes_ += req->response_buf.Length();
+
+    req->response_status = status;
+    req->state = WorkerCacheEntry::State::FINISHED;
+
+    VLOG(1) << "Operation for " << key << " finished. "
+            << "Status: " << status << ", " << req->response_buf.Length()
+            << " response bytes, " << req->callbacks.size()
+            << " pending callbacks.";
+    for (auto& cb : req->callbacks) {
+      cb.first.CopyFrom(req->response_buf);
+      cb.second(req->response_status);
+    }
+    req->callbacks.clear();
+  });
+}
+
+// Remove all stale or expired cache entries if the cache is full.
+void GrpcResponseCache::MaybeCleanup() {
+  mutex_lock m(mu_);
+  if (current_bytes_ < max_bytes_) {
+    return;
+  }
+
+  VLOG(1) << "Cleanup: " << current_bytes_ << " -> " << max_bytes_;
+  std::vector<std::pair<string, std::shared_ptr<WorkerCacheEntry>>>
+      ordered_entries;
+  ordered_entries.reserve(requests_.size());
+  for (const auto& p : requests_) {
+    ordered_entries.push_back(std::make_pair(p.first, p.second));
+  }
+
+  std::sort(ordered_entries.begin(), ordered_entries.end(),
+            [](const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& a,
+               const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& b) {
+              return a.second->expires_seconds > b.second->expires_seconds;
+            });
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> kept;
+  int64 now = Env::Default()->NowSeconds();
+  int64 bytes_used = 0;
+
+  // Always keep active requests.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->state != WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+    }
+  }
+
+  // Keep unexpired, finished requests up to half of max_bytes_.  This reduces
+  // chances of overfilling the cache when active requests complete and
+  // amortizes cache cleanup cost.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->expires_seconds < now || bytes_used >= max_bytes_ / 2) {
+      break;
+    }
+
+    if (pair.second->state == WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+      bytes_used += pair.second->response_buf.Length();
+    }
+  }
+
+  VLOG(1) << "Cleaned cache.  Bytes used: " << current_bytes_ << " -> "
+          << bytes_used << ". Cache size: " << requests_.size() << " -> "
+          << kept.size();
+  current_bytes_ = bytes_used;
+  std::swap(requests_, kept);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..0892d9f788d165f11803c676717e63585ca808a2
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+// gRPC response caching.  Most WorkerService methods cannot be retried directly
+// as they will fail or deadlock.  To enable retrying, we can instead cache
+// responses for a short period of time and reply to duplicate requests from the
+// cache.
+namespace tensorflow {
+
+// Union type to aid caching of either raw buffers (for RecvTensor RPCs) and
+// protocol buffer messages (for all other RPCs).
+class RPCResponse {
+ public:
+  explicit RPCResponse() : buf_(nullptr), msg_(nullptr) {}
+  explicit RPCResponse(::grpc::ByteBuffer* b) : buf_(b), msg_(nullptr) {}
+  explicit RPCResponse(protobuf::Message* m) : buf_(nullptr), msg_(m) {}
+
+  // Encode this response into the target buffer.
+  void Encode(::grpc::ByteBuffer* tgt) const;
+
+  // Copy from `src`: if this is a buffer, make a shallow copy.
+  // For protocol messages, parse the response from `src`.
+  void CopyFrom(const ::grpc::ByteBuffer& src);
+
+ private:
+  ::grpc::ByteBuffer* buf_;
+  protobuf::Message* msg_;
+};
+
+typedef std::function<void(StatusCallback)> ComputeFunc;
+struct WorkerCacheEntry;
+
+// Track and cache the state of worker service RPCs.  An RPC can be in 3 states:
+//
+// * PENDING: this is the first call of the RPC, and it will transition to
+// * ACTIVE: another thread is active processing this RPC
+// * FINISHED: the worker has finished processing the method
+//
+// The response from completed RPCs are LRU cached until either `max_bytes`
+// bytes are in use by the cache or they expire (according to `expire_time`).
+class GrpcResponseCache {
+ public:
+  GrpcResponseCache(int64 max_bytes, int64 expire_time_seconds)
+      : max_bytes_(max_bytes), expire_time_seconds_(expire_time_seconds) {}
+
+  // Lookup the result for key.
+  // If it is finished, invoke `done_cb` immediately after filling `response`.
+  // If active, done_db will be invoked when the current call completes.
+  // Otherwise, invoke `compute_func` to fill the cache and invoke done_cb.
+  void LookupOrCompute(const string& key, RPCResponse response,
+                       ComputeFunc compute_func, StatusCallback done_cb);
+
+  // Remove all stale or expired cache entries if the cache is full.
+  void MaybeCleanup();
+
+ private:
+  int64 current_bytes_ GUARDED_BY(mu_) = 0;
+  const int64 max_bytes_;
+  const int64 expire_time_seconds_;
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> requests_
+      GUARDED_BY(mu_);
+  mutex mu_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index 7f63cc9344f87010ceb1225dbe4b031bd5272f2c..3635caf3d104760d9200497f6f25d3f0fdfde48c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -210,7 +210,7 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
       get_stub(index), &completion_queue_, *get_method_ptr(index),
       call->request(), call->response(),
       /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
-      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_, 0 /* max_retries */);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index ac73182190f6978d5cac11b23e4f09b23b5b4488..f087a39f019974a273b1f94fd13c7c3fad00ee29 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -112,12 +112,7 @@ GrpcServer::~GrpcServer() {
 
 void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func,
-    const StatsPublisherFactory& stats_factory) {
+Status GrpcServer::Init(const GrpcServerOptions& opts) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -165,9 +160,9 @@ Status GrpcServer::Init(
   worker_env_.device_mgr = new DeviceMgr(std::move(devices));
   master_env_.local_devices = worker_env_.device_mgr->ListDevices();
   worker_env_.local_devices = worker_env_.device_mgr->ListDevices();
-  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+  worker_env_.rendezvous_mgr = opts.rendezvous_mgr_func == nullptr
                                    ? new RpcRendezvousMgr(&worker_env_)
-                                   : rendezvous_mgr_func(&worker_env_);
+                                   : opts.rendezvous_mgr_func(&worker_env_);
   string unused;
   string default_worker_name;
   if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
@@ -200,15 +195,16 @@ Status GrpcServer::Init(
   MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
-  worker_impl_ = worker_func ? worker_func(&worker_env_, config)
-                             : NewGrpcWorker(&worker_env_, config);
-  worker_service_ =
-      NewGrpcWorkerService(worker_impl_.get(), &builder).release();
+  worker_impl_ = opts.worker_func ? opts.worker_func(&worker_env_, config)
+                                  : NewGrpcWorker(&worker_env_, config);
+  worker_service_ = NewGrpcWorkerService(worker_impl_.get(), &builder,
+                                         opts.worker_service_options)
+                        .release();
   eager_service_ = new eager::GrpcEagerServiceImpl(&worker_env_, &builder);
 
   // extra service:
-  if (service_func != nullptr) {
-    service_func(&worker_env_, &builder);
+  if (opts.service_func != nullptr) {
+    opts.service_func(&worker_env_, &builder);
   }
   server_ = builder.BuildAndStart();
 
@@ -222,9 +218,9 @@ Status GrpcServer::Init(
       WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
-  if (collective_mgr_func) {
+  if (opts.collective_mgr_func) {
     worker_env_.collective_executor_mgr =
-        collective_mgr_func(config, &worker_env_, worker_cache);
+        opts.collective_mgr_func(config, &worker_env_, worker_cache);
     if (!worker_env_.collective_executor_mgr) {
       return errors::Internal(
           "collective_mgr_func did not return CollectiveExecutorMgr");
@@ -256,6 +252,7 @@ Status GrpcServer::Init(
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
   master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
+  StatsPublisherFactory stats_factory = opts.stats_factory;
   master_env_.master_session_factory =
       [config, stats_factory](
           SessionOptions options, const MasterEnv* env,
@@ -282,31 +279,6 @@ Status GrpcServer::Init(
   return Status::OK();
 }
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              worker_func, CreateNoOpStatsPublisher);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              nullptr);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, nullptr, nullptr);
-}
-
-Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); }
-
 Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
   for (const auto& job : options.cluster_def->job()) {
@@ -457,7 +429,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
@@ -471,8 +445,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
                           std::unique_ptr<GrpcServer>* out_server) {
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
-  ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c7f543e5bfc0655a603da7436eaaca5351b2f07a..f66d7eb82e8d9bcd43868a5b65c08248f7d860da 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 
+// GrpcServer manages the lifecycle of an Eager, Worker and Master service.
+
 #include <memory>
 
 #include "grpcpp/grpcpp.h"
@@ -26,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
@@ -57,6 +60,15 @@ typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*,
                                                   const ConfigProto& config)>
     WorkerCreationFunction;
 
+struct GrpcServerOptions {
+  ServiceInitFunction service_func = nullptr;
+  RendezvousMgrCreationFunction rendezvous_mgr_func = nullptr;
+  CollectiveMgrCreationFunction collective_mgr_func = nullptr;
+  WorkerCreationFunction worker_func = nullptr;
+  StatsPublisherFactory stats_factory = CreateNoOpStatsPublisher;
+  GrpcWorkerServiceOptions worker_service_options;
+};
+
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
@@ -86,25 +98,7 @@ class GrpcServer : public ServerInterface {
   std::shared_ptr<GrpcChannelCache> channel_cache() { return channel_cache_; }
 
  protected:
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func,
-              const StatsPublisherFactory& stats_factory);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
-
-  Status Init();
+  Status Init(const GrpcServerOptions& opts = GrpcServerOptions());
 
   // A subclass can override this method to support secure credentials.
   virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 32063fecbbef4347bcdbfbdfda32f008015b5975..c14bfd2155fb4b2276642e220176a3658448f350 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/local_master.h"
 #include "tensorflow/core/distributed_runtime/master_interface.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -312,6 +313,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   for (const string& target : target_nodes) {
     req.add_target(target);
   }
+  req.set_request_id(GetUniqueRequestId());
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
   TF_RETURN_IF_ERROR(master_->PartialRunSetup(&call_options, &req, &resp));
   *handle = resp.partial_run_handle();
@@ -408,6 +410,7 @@ Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
   MakeCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   *req.mutable_options() = callable_options;
+  req.set_request_id(GetUniqueRequestId());
   MakeCallableResponse resp;
   CallOptions call_options;
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
@@ -423,6 +426,7 @@ Status GrpcSession::RunCallable(CallableHandle handle,
   RunCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   req.set_handle(handle);
+  req.set_request_id(GetUniqueRequestId());
   for (const Tensor& feed : feed_tensors) {
     feed.AsProtoTensorContent(req.mutable_feed()->Add());
   }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index d73638651f2b78fb935ab8865a776a708826c930..0ca64dc159b6680342a9937480a1d67135ad6197 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -32,6 +32,9 @@ limitations under the License.
 namespace tensorflow {
 
 // Object allocated per active RPC.
+// Manage the state of a single asynchronous RPC request.  If `max_retries`
+// is greater than 0, the request will be retried for any transient failures
+// as long as the overall deadline has not elapsed.
 template <class Response>
 class RPCState : public GrpcClientCQTag {
  public:
@@ -39,34 +42,55 @@ class RPCState : public GrpcClientCQTag {
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool)
+           thread::ThreadPool* threadpool, int32 max_retries = 0)
       : RPCState(stub, cq, method, request, response, std::move(done),
                  call_opts, threadpool, /*fail_fast=*/false,
-                 /*timeout_in_ms=*/0) {}
+                 /*timeout_in_ms=*/0, max_retries) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
-    context_.set_fail_fast(fail_fast);
-    if (timeout_in_ms > 0) {
-      context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
-    }
-
-    if (call_opts) {
-      call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
-    }
-
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms,
+           int32 max_retries)
+      : call_opts_(call_opts),
+        threadpool_(threadpool),
+        done_(std::move(done)),
+        cq_(cq),
+        stub_(stub),
+        method_(method),
+        max_retries_(max_retries),
+        timeout_in_ms_(timeout_in_ms),
+        fail_fast_(fail_fast) {
     response_ = response;
     ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
     if (!s.ok()) {
       LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
                  << s.error_message();
+      // Skip retry logic if we fail to parse our request.
+      done_(FromGrpcStatus(s));
+      delete this;
+      return;
+    }
+    StartCall();
+  }
+
+  void StartCall() {
+    context_.reset(new ::grpc::ClientContext());
+    context_->set_fail_fast(fail_fast_);
+
+    if (timeout_in_ms_ > 0) {
+      context_->set_deadline(
+          gpr_time_from_millis(timeout_in_ms_, GPR_TIMESPAN));
     }
-    call_ =
-        std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
+    if (call_opts_) {
+      call_opts_->SetCancelCallback([this]() { context_->TryCancel(); });
+    }
+
+    VLOG(2) << "Starting call: " << method_;
+
+    call_ = std::move(
+        stub_->PrepareUnaryCall(context_.get(), method_, request_buf_, cq_));
     call_->StartCall();
     call_->Finish(&response_buf_, &status_, this);
   }
@@ -89,16 +113,26 @@ class RPCState : public GrpcClientCQTag {
         threadpool_->Schedule([this]() { ParseAndCallDone(); });
       } else {
         ParseAndCallDone();
-        return;
       }
-    } else {
-      VLOG(2) << "Call returned with non-ok status: " << s;
+      return;
+    }
 
-      // Attach additional GRPC error information if any
+    VLOG(1) << method_ << " returned with non-ok status: " << s
+            << " Retries: " << num_retries_ << " Max: " << max_retries_ << "\n"
+            << context_->debug_error_string();
+    // Retry if we have any attempts left
+    if (++num_retries_ <= max_retries_ &&
+        (errors::IsUnavailable(s) || errors::IsUnknown(s))) {
+      response_buf_.Clear();
+      VLOG(1) << "Retrying call for " << method_ << "Retry: " << num_retries_
+              << " of " << max_retries_;
+      StartCall();
+    } else {
+      // Attach additional GRPC error information if any to the final status
       s = Status(s.code(),
                  strings::StrCat(s.error_message(),
                                  "\nAdditional GRPC error information:\n",
-                                 context_.debug_error_string()));
+                                 context_->debug_error_string()));
       done_(s);
       delete this;
     }
@@ -115,7 +149,7 @@ class RPCState : public GrpcClientCQTag {
 
  private:
   CallOptions* call_opts_;
-  ::grpc::ClientContext context_;
+  std::unique_ptr<::grpc::ClientContext> context_;
   thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
@@ -123,6 +157,15 @@ class RPCState : public GrpcClientCQTag {
   ::grpc::ByteBuffer response_buf_;
   ::grpc::Status status_;
   StatusCallback done_;
+  int64 timeout_in_ms_;
+
+  size_t num_retries_ = 0;
+  size_t max_retries_;
+
+  ::grpc::CompletionQueue* cq_;
+  ::grpc::GenericStub* stub_;
+  ::grpc::string method_;
+  bool fail_fast_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h
index 344e95a6724d4793e437b180c8394c6b0347b231..9399687895ec030bb4fd4adc3f3a19cc9f8fa65b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h
@@ -16,9 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TENSOR_CODING_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TENSOR_CODING_H_
 
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
+#include "grpcpp/impl/codegen/byte_buffer.h"
 
 namespace tensorflow {
 class Tensor;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index e211c33732b26777697f11178909edaf6c9b65ed..471e2c16b348e12eca094247c729008a936174f7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -15,9 +15,61 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
+namespace {
+
+double GenerateUniformRandomNumber() {
+  return random::New64() * (1.0 / std::numeric_limits<uint64>::max());
+}
+
+double GenerateUniformRandomNumberBetween(double a, double b) {
+  if (a == b) return a;
+  DCHECK_LT(a, b);
+  return a + GenerateUniformRandomNumber() * (b - a);
+}
+
+}  // namespace
+
+int64 ComputeBackoffMicroseconds(int current_retry_attempt, int64 min_delay,
+                                 int64 max_delay) {
+  DCHECK_GE(current_retry_attempt, 0);
+
+  // This function with the constants below is calculating:
+  //
+  // (0.4 * min_delay) + (random[0.6,1.0] * min_delay * 1.3^retries)
+  //
+  // Note that there is an extra truncation that occurs and is documented in
+  // comments below.
+  constexpr double kBackoffBase = 1.3;
+  constexpr double kBackoffRandMult = 0.4;
+
+  // This first term does not vary with current_retry_attempt or a random
+  // number. It exists to ensure the final term is >= min_delay
+  const double first_term = kBackoffRandMult * min_delay;
+
+  // This is calculating min_delay * 1.3^retries
+  double uncapped_second_term = min_delay;
+  while (current_retry_attempt > 0 &&
+         uncapped_second_term < max_delay - first_term) {
+    current_retry_attempt--;
+    uncapped_second_term *= kBackoffBase;
+  }
+  // Note that first_term + uncapped_second_term can exceed max_delay here
+  // because of the final multiply by kBackoffBase.  We fix that problem with
+  // the min() below.
+  double second_term = std::min(uncapped_second_term, max_delay - first_term);
+
+  // This supplies the random jitter to ensure that retried don't cause a
+  // thundering herd problem.
+  second_term *=
+      GenerateUniformRandomNumberBetween(1.0 - kBackoffRandMult, 1.0);
+
+  return std::max(static_cast<int64>(first_term + second_term), min_delay);
+}
+
 ::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
                                      grpc::ByteBuffer* dst) {
   bool own_buffer;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 45259aa2ece9698d7ffb5a850b716de442f7497f..976f3e6452a7673455d8c2d0946257ee54d762fe 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,15 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Given the total number of RPC retries attempted, return a randomized
+// amount of time to delay before retrying the request.
+//
+// The average computed backoff increases with the number of RPCs attempted.
+// See implementation for details on the calculations.
+int64 ComputeBackoffMicroseconds(int current_retry_attempt,
+                                 int64 min_delay = 1000,
+                                 int64 max_delay = 10000000);
+
 // Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
 // efficient byte reader from which to decode a RecvTensorResponse.
 class GrpcByteSource : public TensorResponse::Source {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index de80992095d13fa38172b3a30c5fdd6c177994e1..904862100e460d811dc03648ff2b8aa4f26f672c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -16,10 +16,14 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 
 #include <deque>
+#include <memory>
+#include <unordered_map>
+#include <vector>
 
 #include "grpcpp/alarm.h"
 #include "grpcpp/server_builder.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -31,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
@@ -41,7 +46,12 @@ limitations under the License.
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -50,37 +60,6 @@ namespace tensorflow {
 
 namespace {
 
-class GrpcWorkerService : public AsyncServiceInterface {
-  // TODO(ncteisen): consider adding a config var or flag for this
-  static constexpr const size_t kGrpcWorkerServiceThreadCount = 8;
-
- public:
-  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder)
-      : is_shutdown_(false) {
-    builder->RegisterService(&worker_service_);
-    for (int i = 0; i < kGrpcWorkerServiceThreadCount; i++) {
-      threads_.emplace_back(
-          new GrpcWorkerServiceThread(worker, builder, &worker_service_));
-    }
-  }
-
-  void Shutdown() override {
-    bool did_shutdown = false;
-    {
-      mutex_lock l(service_shutdown_mu_);
-      if (!is_shutdown_) {
-        LOG(INFO) << "Shutting down GrpcWorkerService.";
-        is_shutdown_ = true;
-        did_shutdown = true;
-      }
-    }
-    if (did_shutdown) {
-      for (auto& worker_thread : threads_) {
-        worker_thread->Shutdown();
-      }
-    }
-  }
-
 // This macro creates a new request for the given RPC method name
 // (e.g., `ENQUEUE_REQUEST(GetStatus, false);`), and enqueues it on
 // `this->cq_`.
@@ -105,311 +84,344 @@ class GrpcWorkerService : public AsyncServiceInterface {
     }                                                                        \
   } while (0)
 
-  // This method blocks forever handling requests from the completion queue.
-  void HandleRPCsLoop() override {
-    for (auto& worker_thread : threads_) {
-      worker_thread->Start();
-    }
-    for (auto& worker_thread : threads_) {
-      worker_thread->Join();
-    }
+#define SETUP_FOR_REQUEST(method, default_depth, supports_cancel)              \
+  for (int i = 0;                                                              \
+       i < gtl::FindWithDefault(queue_depth_,                                  \
+                                static_cast<int>(GrpcWorkerMethod::k##method), \
+                                default_depth);                                \
+       ++i) {                                                                  \
+    ENQUEUE_REQUEST(method, supports_cancel);                                  \
   }
 
- private:
-  // Thread wrapping class that drives work over a single gRPC
-  // CompletionQueue.
-  class GrpcWorkerServiceThread {
-   public:
-    explicit GrpcWorkerServiceThread(
-        GrpcWorker* worker, ::grpc::ServerBuilder* builder,
-        grpc::WorkerService::AsyncService* worker_service)
-        : worker_(worker),
-          worker_service_(worker_service),
-          is_shutdown_(false) {
-      cq_ = builder->AddCompletionQueue();
-    }
-
-    void Start() {
-      thread_.reset(worker_->env()->env->StartThread(
-          ThreadOptions(), "grpc_worker_service",
-          [this]() { HandleRPCsLoop(); }));
-    }
-
-    void Join() { thread_.reset(); }  // Blocks until thread exits
-
-    void Shutdown() {
-      {
-        mutex_lock lock(shutdown_mu_);
-        is_shutdown_ = true;
-      }
-      cq_->Shutdown();
-    }
-
-   private:
-    void HandleRPCsLoop() {
-      // TODO(ncteisen): This may require performance engineering. We can
-      // change the number of threads, the number of handlers per thread,
-      // or even decide to specialize certain threads to certain methods.
-      ENQUEUE_REQUEST(GetStatus, false);
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
-      ENQUEUE_REQUEST(CleanupAll, false);
-      ENQUEUE_REQUEST(RegisterGraph, false);
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-
-      // TODO(ncteisen): Determine a better policy for enqueuing the
-      // appropriate number of each request type.
-      for (int i = 0; i < 1000; ++i) {
-        EnqueueRecvTensorRequestRaw();
-      }
-      for (int i = 0; i < 500; ++i) {
-        ENQUEUE_REQUEST(RecvBuf, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(RunGraph, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(CleanupGraph, false);
-      }
-
-      ENQUEUE_REQUEST(Logging, false);
-      ENQUEUE_REQUEST(Tracing, false);
+// GrpcWorkerService spawns one or more GrpcWorkerServiceThreads to service
+// requests.  Each thread operates on an independent completion queue.
+class GrpcWorkerServiceThread {
+ public:
+  explicit GrpcWorkerServiceThread(
+      GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+      std::unordered_map<int, int> queue_depth, GrpcResponseCache* cache,
+      grpc::WorkerService::AsyncService* worker_service)
+      : worker_(worker),
+        queue_depth_(queue_depth),
+        cache_(cache),
+        worker_service_(worker_service),
+        is_shutdown_(false) {
+    cq_ = builder->AddCompletionQueue();
+  }
 
-      for (int i = 0; i < 10; ++i) {
-        ENQUEUE_REQUEST(CompleteGroup, true);
-        ENQUEUE_REQUEST(CompleteInstance, true);
-        ENQUEUE_REQUEST(GetStepSequence, true);
-      }
+  void Start() {
+    thread_.reset(
+        worker_->env()->env->StartThread(ThreadOptions(), "grpc_worker_service",
+                                         [this]() { HandleRPCsLoop(); }));
+  }
 
-      void* tag;
-      bool ok;
+  void Join() { thread_.reset(); }  // Blocks until thread exits
 
-      while (cq_->Next(&tag, &ok)) {
-        UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
-            static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
-        CHECK(callback_tag);
-        callback_tag->OnCompleted(this, ok);
-      }
+  void Shutdown() {
+    {
+      mutex_lock lock(shutdown_mu_);
+      is_shutdown_ = true;
     }
+    cq_->Shutdown();
+  }
 
-   private:
-    void Schedule(std::function<void()> f) {
-      worker_->env()->compute_pool->Schedule(std::move(f));
+ private:
+  // Add one or more completion queue entries for each worker method, then
+  // begin servicing requests from the completion queue.
+  void HandleRPCsLoop() {
+    // TODO(ncteisen): This may require performance engineering. We can
+    // change the number of threads, the number of handlers per thread,
+    // or even decide to specialize certain threads to certain methods.
+    SETUP_FOR_REQUEST(GetStatus, 1, false);
+    SETUP_FOR_REQUEST(CreateWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(DeleteWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(CleanupAll, 1, false);
+    SETUP_FOR_REQUEST(RegisterGraph, 1, false);
+    SETUP_FOR_REQUEST(DeregisterGraph, 1, false);
+    SETUP_FOR_REQUEST(Logging, 1, false);
+    SETUP_FOR_REQUEST(Tracing, 1, false);
+    SETUP_FOR_REQUEST(CompleteGroup, 10, true);
+    SETUP_FOR_REQUEST(CompleteInstance, 10, true);
+    SETUP_FOR_REQUEST(GetStepSequence, 10, true);
+    SETUP_FOR_REQUEST(RecvBuf, 500, true);
+    SETUP_FOR_REQUEST(RunGraph, 100, true);
+    SETUP_FOR_REQUEST(CleanupGraph, 100, false);
+
+    // TODO(ncteisen): Determine a better policy for enqueuing the
+    // appropriate number of each request type.
+    for (int i = 0;
+         i < gtl::FindWithDefault(
+                 queue_depth_, static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+                 1000);
+         ++i) {
+      EnqueueRecvTensorRequestRaw();
     }
 
-    // The following section contains one request handler method per
-    // RPC. The `FooHandler` method is called (indirectly) by
-    // `HandleRPCsLoop()` when the next Foo RPC is received. Each
-    // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
-    // and is responsible for requesting the next Foo call by calling
-    // `ENQUEUE_REQUEST(Foo)`.
-
-    template <class RequestMessage, class ResponseMessage>
-    using WorkerCall =
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RequestMessage, ResponseMessage>;
-
-    void GetStatusHandler(
-        WorkerCall<GetStatusRequest, GetStatusResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->GetStatus(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(GetStatus, false);
-    }
+    void* tag;
+    bool ok;
 
-    void CreateWorkerSessionHandler(
-        WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->CreateWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
+    while (cq_->Next(&tag, &ok)) {
+      UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
+          static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
+      CHECK(callback_tag);
+      callback_tag->OnCompleted(this, ok);
     }
+  }
 
-    void DeleteWorkerSessionHandler(
-        WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->DeleteWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
-    }
+ private:
+  void Schedule(std::function<void()> f) {
+    worker_->env()->compute_pool->Schedule(std::move(f));
+  }
 
-    void CleanupAllHandler(
-        WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupAll(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupAll, false);
-    }
+  // The following section contains one request handler method per
+  // RPC. The `FooHandler` method is called (indirectly) by
+  // `HandleRPCsLoop()` when the next Foo RPC is received. Each
+  // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
+  // and is responsible for requesting the next Foo call by calling
+  // `ENQUEUE_REQUEST(Foo)`.
+  template <class RequestMessage, class ResponseMessage>
+  using WorkerCall =
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RequestMessage, ResponseMessage>;
+
+  // Handle all non-cancellable simple methods with a standard wrapper.
+#define HANDLE_CALL(method)                                                   \
+  void method##Handler(WorkerCall<method##Request, method##Response>* call) { \
+    Schedule([this, call]() {                                                 \
+      Status s = worker_->method(&call->request, &call->response);            \
+      if (!s.ok()) {                                                          \
+        VLOG(1) << "Bad response from " << #method << ": " << s;              \
+      }                                                                       \
+      call->SendResponse(ToGrpcStatus(s));                                    \
+    });                                                                       \
+    ENQUEUE_REQUEST(method, false);                                           \
+  }
 
-    void RegisterGraphHandler(
-        WorkerCall<RegisterGraphRequest, RegisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->RegisterGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(RegisterGraph, false);
-    }
+  HANDLE_CALL(GetStatus);
+  HANDLE_CALL(CreateWorkerSession);
+  HANDLE_CALL(DeleteWorkerSession);
+  HANDLE_CALL(CleanupAll);
+  HANDLE_CALL(RegisterGraph);
+  HANDLE_CALL(DeregisterGraph);
+  HANDLE_CALL(CleanupGraph);
+  HANDLE_CALL(Logging);
+  HANDLE_CALL(Tracing);
+
+#undef HANDLE_CALL
+
+  void GetStepSequenceHandler(
+      WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
+    Schedule([this, call]() {
+      worker_->GetStepSequenceAsync(
+          &call->request, &call->response, [call](const Status& s) {
+            VLOG(1) << "Bad response from GetStepSequence:" << s;
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(GetStepSequence, true);
+  }
 
-    void DeregisterGraphHandler(
-        WorkerCall<DeregisterGraphRequest, DeregisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->DeregisterGraph(&call->request, &call->response);
+  void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      ProtoRunGraphRequest* wrapped_request =
+          new ProtoRunGraphRequest(&call->request);
+      NonOwnedProtoRunGraphResponse* wrapped_response =
+          new NonOwnedProtoRunGraphResponse(&call->response);
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      auto done_cb = [call, call_opts, wrapped_request,
+                      wrapped_response](const Status& s) {
+        VLOG(1) << "RunGraph::Done";
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RunGraph:" << s;
+        }
+        call->ClearCancelCallback();
+        delete call_opts;
+        delete wrapped_request;
+        delete wrapped_response;
         call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-    }
+      };
 
-    void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        ProtoRunGraphRequest* wrapped_request =
-            new ProtoRunGraphRequest(&call->request);
-        NonOwnedProtoRunGraphResponse* wrapped_response =
-            new NonOwnedProtoRunGraphResponse(&call->response);
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      auto compute_fn = [this, call_opts, wrapped_request,
+                         wrapped_response](StatusCallback done) {
         worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
-                               [call, call_opts, wrapped_request,
-                                wrapped_response](const Status& s) {
-                                 call->ClearCancelCallback();
-                                 delete call_opts;
-                                 delete wrapped_request;
-                                 delete wrapped_response;
-                                 call->SendResponse(ToGrpcStatus(s));
-                               });
-      });
-      ENQUEUE_REQUEST(RunGraph, true);
-    }
+                               done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
+    });
+    ENQUEUE_REQUEST(RunGraph, true);
+  }
 
-    void RecvTensorHandlerRaw(
-        WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+  void RecvTensorHandlerRaw(
+      WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+
+      auto done_cb = [call, call_opts](const Status& s) {
+        call->ClearCancelCallback();
+        delete call_opts;
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RecvTensor:" << s;
+        }
+        call->SendResponse(ToGrpcStatus(s));
+      };
+
+      auto compute_fn = [this, &call_opts, &call](StatusCallback done) {
         worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
-                                     [call, call_opts](const Status& s) {
-                                       call->ClearCancelCallback();
-                                       delete call_opts;
-                                       call->SendResponse(ToGrpcStatus(s));
-                                     });
-      });
-      EnqueueRecvTensorRequestRaw();
-    }
+                                     done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
+    });
+    EnqueueRecvTensorRequestRaw();
+  }
 
-    void CleanupGraphHandler(
-        WorkerCall<CleanupGraphRequest, CleanupGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupGraph, false);
-    }
+  void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->RecvBufAsync(call_opts, &call->request, &call->response,
+                            [call, call_opts](const Status& s) {
+                              call->ClearCancelCallback();
+                              delete call_opts;
+                              if (!s.ok()) {
+                                VLOG(1) << "Bad response from RecvBuf:" << s;
+                              }
+                              call->SendResponse(ToGrpcStatus(s));
+                            });
+    });
+    ENQUEUE_REQUEST(RecvBuf, true);
+  }
 
-    void LoggingHandler(WorkerCall<LoggingRequest, LoggingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Logging(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Logging, false);
-    }
+  void CompleteGroupHandler(
+      WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteGroupAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteGroup:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteGroup, true);
+  }
 
-    void TracingHandler(WorkerCall<TracingRequest, TracingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Tracing(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Tracing, false);
-    }
+  void CompleteInstanceHandler(
+      WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteInstanceAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteInstance:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteInstance, false);
+  }
+#undef ENQUEUE_REQUEST
 
-    void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->RecvBufAsync(call_opts, &call->request, &call->response,
-                              [call, call_opts](const Status& s) {
-                                call->ClearCancelCallback();
-                                delete call_opts;
-                                call->SendResponse(ToGrpcStatus(s));
-                              });
-      });
-      ENQUEUE_REQUEST(RecvBuf, true);
+  void EnqueueRecvTensorRequestRaw() {
+    mutex_lock l(shutdown_mu_);
+    if (!is_shutdown_) {
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RecvTensorRequest, ::grpc::ByteBuffer>::
+          EnqueueRequestForMethod(
+              worker_service_, cq_.get(),
+              static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+              &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
+              true /* supports cancel*/);
     }
+  }
 
-    void CompleteGroupHandler(
-        WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteGroupAsync(call_opts, &call->request, &call->response,
-                                    [call, call_opts](const Status& s) {
-                                      call->ClearCancelCallback();
-                                      delete call_opts;
-                                      call->SendResponse(ToGrpcStatus(s));
-                                    });
-      });
-      ENQUEUE_REQUEST(CompleteGroup, true);
-    }
+  GrpcWorker* const worker_ = nullptr;  // Not owned.
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+  std::unique_ptr<Thread> thread_;
+  std::unordered_map<int, int> queue_depth_;
+  GrpcResponseCache* cache_;
+  grpc::WorkerService::AsyncService* const worker_service_;
 
-    void CompleteInstanceHandler(
-        WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteInstanceAsync(call_opts, &call->request,
-                                       &call->response,
-                                       [call, call_opts](const Status& s) {
-                                         call->ClearCancelCallback();
-                                         delete call_opts;
-                                         call->SendResponse(ToGrpcStatus(s));
-                                       });
-      });
-      ENQUEUE_REQUEST(CompleteInstance, false);
+  mutex shutdown_mu_;
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
+};
+
+class GrpcWorkerService : public AsyncServiceInterface {
+ public:
+  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+                    GrpcWorkerServiceOptions options)
+      : is_shutdown_(false) {
+    builder->RegisterService(&worker_service_);
+    if (options.response_cache_bytes > 0) {
+      cache_.reset(
+          new GrpcResponseCache(options.response_cache_bytes,
+                                options.response_cache_expires_seconds));
     }
 
-    void GetStepSequenceHandler(
-        WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
-      Schedule([this, call]() {
-        worker_->GetStepSequenceAsync(
-            &call->request, &call->response,
-            [call](const Status& s) { call->SendResponse(ToGrpcStatus(s)); });
-      });
-      ENQUEUE_REQUEST(GetStepSequence, true);
+    for (int i = 0; i < options.num_serving_threads; i++) {
+      threads_.emplace_back(
+          new GrpcWorkerServiceThread(worker, builder, options.queue_depth,
+                                      cache_.get(), &worker_service_));
     }
-#undef ENQUEUE_REQUEST
+  }
 
-    void EnqueueRecvTensorRequestRaw() {
-      mutex_lock l(shutdown_mu_);
+  void Shutdown() override {
+    bool did_shutdown = false;
+    {
+      mutex_lock l(service_shutdown_mu_);
       if (!is_shutdown_) {
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RecvTensorRequest, ::grpc::ByteBuffer>::
-            EnqueueRequestForMethod(
-                worker_service_, cq_.get(),
-                static_cast<int>(GrpcWorkerMethod::kRecvTensor),
-                &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
-                true /* supports cancel*/);
+        LOG(INFO) << "Shutting down GrpcWorkerService.";
+        is_shutdown_ = true;
+        did_shutdown = true;
       }
     }
+    if (did_shutdown) {
+      for (auto& worker_thread : threads_) {
+        worker_thread->Shutdown();
+      }
+    }
+  }
 
-    GrpcWorker* const worker_ = nullptr;  // Not owned.
-    std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
-    std::unique_ptr<Thread> thread_;
-    grpc::WorkerService::AsyncService* const worker_service_;
-
-    mutex shutdown_mu_;
-    bool is_shutdown_ GUARDED_BY(shutdown_mu_);
-    TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
-  };  // GrpcWorkerServiceThread
+  // This method blocks forever handling requests from the completion queue.
+  void HandleRPCsLoop() override {
+    for (auto& worker_thread : threads_) {
+      worker_thread->Start();
+    }
+    for (auto& worker_thread : threads_) {
+      worker_thread->Join();
+    }
+  }
 
+ private:
   grpc::WorkerService::AsyncService worker_service_;
   std::vector<std::unique_ptr<GrpcWorkerServiceThread>> threads_;
 
+  std::unique_ptr<GrpcResponseCache> cache_;
   mutex service_shutdown_mu_;
   bool is_shutdown_ GUARDED_BY(service_shutdown_mu_);
 
@@ -454,11 +466,14 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
     return;
   }
 
-  // Request the tensor associated with the rendezvous key. Any time
-  // while waiting for the tensor to be produced, up until the start
-  // of execution of the callback lambda body below, an RPC
-  // cancellation should abort the rendezvous.
-  opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
+  // Request the tensor associated with the rendezvous key.
+  // Note that we log the cancellation here but do not abort the current step.
+  // gRPC can generate cancellations in response to transient network failures,
+  // and aborting the step eliminates the opportunity for client side retries.
+  // Repeated client failures will eventually cause the step to be aborted by
+  // the client.
+  opts->SetCancelCallback(
+      [step_id]() { LOG(WARNING) << "RecvTensor cancelled for " << step_id; });
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
       [opts, response, done, src_dev, request](
@@ -640,9 +655,10 @@ std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env,
 }
 
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder) {
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions options) {
   return std::unique_ptr<AsyncServiceInterface>(
-      new GrpcWorkerService(worker, builder));
+      new GrpcWorkerService(worker, builder, options));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 996617d385d1c0e397c30eeceb4f737690fb9490..8f2830c899b9b9854e0b6f02e23651ebd1b06491 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
+#include <memory>
+#include <unordered_map>
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
 
 namespace grpc {
@@ -57,9 +61,19 @@ class GrpcWorker : public Worker {
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env,
                                           const ConfigProto& config);
 
+struct GrpcWorkerServiceOptions {
+  // Map from GrpcWorkerMethod id to queue depth.  If set this overrides the
+  // default queue depth for a method.
+  std::unordered_map<int, int> queue_depth;
+  int num_serving_threads = 8;
+  int64 response_cache_bytes = 0;
+  int64 response_cache_expires_seconds = 0;
+};
+
 // Returns an implementation of WorkerService rpc service.
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder);
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions opts = GrpcWorkerServiceOptions());
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 7915c3aafd8a97de2830962d2851b247e7d4db4a..c475153754f759ed056652cbdeedad7b77fb1e69 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -31,14 +31,11 @@ limitations under the License.
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace grpc {
-class CompletionQueue;
-class Channel;
-class RpcService;
-class ServerCompletionQueue;
-class ServerContext;
 
 // Support parsing/unparsing of tensorflow::TensorResponse.
 // Wire-format is identical to RecvTensorResponse.
+// This is specializing an existing template, so it's okay to do this in a
+// namespace that we don't own.
 template <>
 class SerializationTraits<tensorflow::TensorResponse> {
  public:
@@ -66,6 +63,7 @@ class SerializationTraits<tensorflow::TensorResponse> {
     return result;
   }
 };
+
 }  // namespace grpc
 
 namespace tensorflow {
@@ -88,6 +86,7 @@ enum class GrpcWorkerMethod {
   kCompleteInstance,
   kGetStepSequence,
 };
+
 static const int kGrpcNumWorkerMethods =
     static_cast<int>(GrpcWorkerMethod::kGetStepSequence) + 1;
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 9fb920404f987d6b5b324cce4155da40c7e753b4..ee561e1a8a02a78256b97f5ce015f99ef148a591 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -71,9 +71,12 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     req_.set_request_id(GetUniqueRequestId());
   }
 
-  void Reset(WorkerCacheInterface* wc) {
-    wc->ReleaseWorker(src_worker_, wi_);
-    wi_ = nullptr;
+  void Reset() {
+    // The RpcRemoteRendezvous using this object is responsible for calling
+    // ReleaseWorker() before Reset().
+    DCHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "Leaking WorkerInterface in RpcRecvTensorCall::Reset().";
+
     alloc_attrs_ = AllocatorAttributes();
     dst_device_ = nullptr;
     // We don't clear opts_ and assume that Init will set up the state for
@@ -89,9 +92,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   ~RpcRecvTensorCall() override {
     // Since only the RpcRecvTensorFreeList will delete an
-    // RpcRecvTensorCall, and it always sets this->wi_ to null when
-    // a call object is released to it, we can assert that this->wi_ is
-    // always null at the point of deletion.
+    // RpcRecvTensorCall, we require that ReleaseWorker() has been called before
+    // the user releases a Call object to the free list.
     CHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
         << "Leaking WorkerInterface in RpcRecvTensorCall destructor.";
   }
@@ -113,6 +115,13 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     return status_;
   }
 
+  void ReleaseWorker(WorkerCacheInterface* worker_cache) {
+    DCHECK_NE(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "RpcRecvTensorCall::ReleaseWorker() called twice.";
+    worker_cache->ReleaseWorker(src_worker_, wi_);
+    wi_ = nullptr;
+  }
+
   const Tensor& tensor() const { return resp_.tensor(); }
 
   bool is_dead() const { return resp_.metadata().is_dead(); }
@@ -144,7 +153,7 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   string src_worker_;
   string src_rel_device_;
-  WorkerInterface* wi_;
+  WorkerInterface* wi_;  // Not owned.
   AllocatorAttributes alloc_attrs_;
   Device* dst_device_;
   CallOptions opts_;
@@ -180,8 +189,8 @@ class RpcRecvTensorFreeList {
     return new RpcRecvTensorCall;
   }
 
-  void Release(RpcRecvTensorCall* obj, WorkerCacheInterface* wc) {
-    obj->Reset(wc);
+  void Release(RpcRecvTensorCall* obj) {
+    obj->Reset();
     {
       mutex_lock l(mu_);
       if (objects_.size() < kMaxObjects) {
@@ -220,6 +229,9 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
                          " is invalid remote source device.");
   }
   WorkerSession* sess = session();
+  // The worker will be released in a subsequent call to
+  // `sess->worker_cache->ReleaseWorker()` (if the call has not yet been
+  // initialized) or `call->ReleaseWorker()` (if it has been initialized).
   WorkerInterface* rwi = sess->worker_cache->CreateWorker(call->src_worker_);
   if (s.ok() && rwi == nullptr) {
     s = errors::Internal("No worker known as ", call->src_worker_);
@@ -233,7 +245,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     if (rwi != nullptr) {
       sess->worker_cache->ReleaseWorker(call->src_worker_, rwi);
     }
-    get_call_freelist()->Release(call, sess->worker_cache.get());
+    get_call_freelist()->Release(call);
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -246,10 +258,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   // RendezvousMgr already aborted, shouldn't send RPC call any more
   if (!call->status().ok()) {
+    // NOTE: `*sess` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(sess->worker_cache.get());
     call->done()(call->status(), Args(), Args(), Tensor(), false);
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     return;
   }
 
@@ -261,10 +275,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // If StartAbort was called prior to DeregisterCall, then the
     // current status should be bad.
     Status s = call->status();
+    // NOTE: `*session()` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(session()->worker_cache.get());
     call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     Unref();
   });
 }
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
index c9581fa00f3e946b212717107809182a6a5d00f2..98eb1467700a5e3259a3635f71c5cebae094751f 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
@@ -56,7 +56,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
   void RetireStepId(int64 graph_key, int64 step_id) override;
 
  protected:
-  CollectiveExecutor* Create(int64 step_id) override;
+  virtual CollectiveExecutor* Create(int64 step_id) override;
 
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   const string task_name_;
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 1ab0d20f0b53798ea63e69d25f41c47bcaef17d4..edbd7ddbcb237ef4fca610c861360fe89db84cb8 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -140,7 +140,6 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
 }
 
 TEST_F(SessionMgrTest, LegacySession) {
-  ServerDef server_def;
   string session_handle = "";
   std::shared_ptr<WorkerSession> session;
   TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session));
@@ -150,7 +149,6 @@ TEST_F(SessionMgrTest, LegacySession) {
 }
 
 TEST_F(SessionMgrTest, UnknownSessionHandle) {
-  ServerDef server_def;
   string session_handle = "unknown_session_handle";
   std::shared_ptr<WorkerSession> session;
   Status s = mgr_.WorkerSessionForSession(session_handle, &session);
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a12934dde814344b70f52fbc972f74347e0..6d20e7cfcada3e3396611143bfcb148ee2a8f0c2 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -68,13 +68,14 @@ Status TensorResponse::InitFrom(RecvTensorResponse* response) {
   return s;
 }
 
-void TensorResponse::InitPartial(const RecvTensorResponse& response) {
+void TensorResponse::InitPartial(const RecvTensorResponse& response,
+                                 const AllocationAttributes& allocation_attr) {
   // Everything except content is present in *response.  Content will
   // arrive later; allocate a Tensor with appropriate storage for that
   // content.
   meta_ = response;
   TensorShape shape(meta_.tensor().tensor_shape());
-  Tensor t(allocator_, meta_.tensor().dtype(), shape);
+  Tensor t(allocator_, meta_.tensor().dtype(), shape, allocation_attr);
   tensor_ = std::move(t);
 }
 
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.h b/tensorflow/core/distributed_runtime/tensor_coding.h
index 4c34297990d399e4e42f5776cd23fb660c9090c5..86d95a30631493c713f24cbc2e04a09da80e00b8 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.h
+++ b/tensorflow/core/distributed_runtime/tensor_coding.h
@@ -76,7 +76,8 @@ class TensorResponse {
 
   // Initialize tensor metadata from response and allocate
   // uninitialized backing storage for actual contents.
-  void InitPartial(const RecvTensorResponse& response);
+  void InitPartial(const RecvTensorResponse& response,
+                   const AllocationAttributes& allocation_attr);
 
   // Return a reference to the parsed tensor.  The tensor will remain
   // live only until *this is destroyed or modified.
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index e942191efe96cde305acf9eb4335cfd8038cb9b1..f21f76fec53d2deac4a0f6467c8744e086c637b7 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -26,14 +26,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-void AllocatorStats::Clear() {
-  this->num_allocs = 0;
-  this->bytes_in_use = 0;
-  this->max_bytes_in_use = 0;
-  this->max_alloc_size = 0;
-  this->bytes_limit = 0;
-}
-
 string AllocatorStats::DebugString() const {
   return strings::Printf(
       "Limit:        %20lld\n"
@@ -41,8 +33,8 @@ string AllocatorStats::DebugString() const {
       "MaxInUse:     %20lld\n"
       "NumAllocs:    %20lld\n"
       "MaxAllocSize: %20lld\n",
-      this->bytes_limit, this->bytes_in_use, this->max_bytes_in_use,
-      this->num_allocs, this->max_alloc_size);
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
 }
 
 constexpr size_t Allocator::kAllocatorAlignment;
@@ -132,10 +124,10 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       ++stats_.num_allocs;
       stats_.bytes_in_use += alloc_size;
-      stats_.max_bytes_in_use =
-          std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
-      stats_.max_alloc_size =
-          std::max<int64>(stats_.max_alloc_size, alloc_size);
+      stats_.peak_bytes_in_use =
+          std::max<int64>(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+      stats_.largest_alloc_size =
+          std::max<int64>(stats_.largest_alloc_size, alloc_size);
 
       if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
           total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
@@ -158,16 +150,16 @@ class CPUAllocator : public Allocator {
     port::AlignedFree(ptr);
   }
 
-  void GetStats(AllocatorStats* stats) override {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mu_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mu_);
     stats_.num_allocs = 0;
-    stats_.max_bytes_in_use = stats_.bytes_in_use;
-    stats_.max_alloc_size = 0;
+    stats_.peak_bytes_in_use = stats_.bytes_in_use;
+    stats_.largest_alloc_size = 0;
   }
 
   size_t AllocatedSizeSlow(const void* ptr) override {
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 3ded86e8e93ea484c9b742c0ac7837e35b388bdb..4dc5eaf16d7f3eb034e44898f61dab33ba4c8d82 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include <limits>
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
@@ -46,23 +48,31 @@ struct AllocationAttributes {
   // which Op is performing the allocation, and sets this flag to
   // true.
   bool allocation_will_be_logged = false;
+  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
+  // a memory chunk whose last-freed count is at this value or earlier may be
+  // returned.
+  std::function<uint64()> freed_by_func = nullptr;
 };
 
-// Runtime statistics collected by an allocator.
+// Runtime statistics collected by an allocator. Exactly the same as
+// stream_executor::AllocatorStats, but independently defined to preserve the
+// mutual independence of StreamExecutor and TensorFlow.
 struct AllocatorStats {
-  int64 num_allocs;        // Number of allocations.
-  int64 bytes_in_use;      // Number of bytes in use.
-  int64 max_bytes_in_use;  // The maximum bytes in use.
-  int64 max_alloc_size;    // The max single allocation seen.
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
 
-  // The upper limit what the allocator can allocate, if such a limit
-  // is known. Certain allocator may return 0 to indicate the limit is
-  // unknown.
-  int64 bytes_limit;
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
 
-  AllocatorStats() { Clear(); }
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
 
-  void Clear();
   string DebugString() const;
 };
 
@@ -194,7 +204,7 @@ class Allocator {
   }
 
   // Fills in 'stats' with statistics collected by this allocator.
-  virtual void GetStats(AllocatorStats* stats) { stats->Clear(); }
+  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }
 
   // Clears the internal stats except for the `in_use` field.
   virtual void ClearStats() {}
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index 9dc74345dab8d075809d586ee1c5e86fe9acb515..d9f3280c62d7c1a4a2bb7a3de117768f836653af 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numa.h"
 
diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index a409cb2de7fbae20f435f464ca07155a36fede4a..85e8ba6a71b7760b004b9d2ebbc425ddff5fbf17 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -25,20 +25,23 @@ limitations under the License.
 namespace tensorflow {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
 #if defined(PLATFORM_GOOGLE) && defined(NDEBUG)
   // NOTE: allocator stats expectation depends on the system malloc,
   // and can vary as that changes.
   static const int64 kSlop = 5 * 1024;
-  EXPECT_GT(stats.bytes_in_use, bytes_in_use - kSlop);
-  EXPECT_LT(stats.bytes_in_use, bytes_in_use + kSlop);
-  EXPECT_GT(stats.max_bytes_in_use, max_bytes_in_use - kSlop);
-  EXPECT_LT(stats.max_bytes_in_use, max_bytes_in_use + kSlop);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+  EXPECT_GT(stats->bytes_in_use, bytes_in_use - kSlop);
+  EXPECT_LT(stats->bytes_in_use, bytes_in_use + kSlop);
+  EXPECT_GT(stats->peak_bytes_in_use, peak_bytes_in_use - kSlop);
+  EXPECT_LT(stats->peak_bytes_in_use, peak_bytes_in_use + kSlop);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 #endif
 }
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 79966f06922a62c7d04648f4a2829d05861cd76b..43b435270c49087b43ce101991686e8c9c069de2 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -54,9 +54,7 @@ uint64 TensorProtoHash(const TensorProto& tp) {
   DCHECK(success);
   TensorProto p;
   tensor.AsProtoTensorContent(&p);
-  string s;
-  SerializeToStringDeterministic(p, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(p);
 }
 
 // Do not create large tensors in memory, compute hash based on TensorProto
@@ -64,12 +62,8 @@ uint64 TensorProtoHash(const TensorProto& tp) {
 // different hash code if they are defined with different TensorProto
 // representations.
 uint64 FastTensorProtoHash(const TensorProto& tp) {
-  string s;
   if (TensorByteSize(tp) > kMaxAttrValueTensorByteSize) {
-    string s;
-    bool success = SerializeToStringDeterministic(tp, &s);
-    DCHECK(success);
-    return Hash64(s);
+    return DeterministicProtoHash64(tp);
   } else {
     return TensorProtoHash(tp);
   }
@@ -95,11 +89,7 @@ bool AreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) {
   TensorProto rhs_tp;
   rhs_t.AsProtoTensorContent(&rhs_tp);
 
-  string lhs_str, rhs_str;
-  SerializeToStringDeterministic(lhs_tp, &lhs_str);
-  SerializeToStringDeterministic(rhs_tp, &rhs_str);
-
-  return lhs_str == rhs_str;
+  return AreSerializedProtosEqual(lhs_tp, rhs_tp);
 }
 
 // Do not construct large tensors in memory, compare equality using TensorProto
@@ -139,9 +129,7 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) {
   }
 
   // If `a` is not a tensor or func, get a hash of serialized string.
-  string s;
-  SerializeToStringDeterministic(a, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(a);
 }
 
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
@@ -175,10 +163,7 @@ bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
 
   // All other fields in AttrValue have deterministic representations.
   // It is safe to compare their serialized strings.
-  string a_str, b_str;
-  SerializeToStringDeterministic(a, &a_str);
-  SerializeToStringDeterministic(b, &b_str);
-  return a_str == b_str;
+  return AreSerializedProtosEqual(a, b);
 }
 
 string SummarizeString(const string& str) {
diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc
index af59500aee32d83dadb7cf94f6d277819f6c65c4..7f639b5ca9a5fd6219b900f08965acaf2c6ee923 100644
--- a/tensorflow/core/framework/cancellation.cc
+++ b/tensorflow/core/framework/cancellation.cc
@@ -27,6 +27,12 @@ CancellationManager::CancellationManager()
       is_cancelled_(false),
       next_cancellation_token_(0) {}
 
+void CancellationManager::Reset() {
+  mutex_lock l(mu_);
+  is_cancelling_ = false;
+  is_cancelled_.store(false);
+}
+
 void CancellationManager::StartCancel() {
   gtl::FlatMap<CancellationToken, CancelCallback> callbacks_to_run;
   {
diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h
index 7a5d9424867d35a4ca07e690230c73afff0b2940..51b200423ec11fba771d233e6985c62708f901ac 100644
--- a/tensorflow/core/framework/cancellation.h
+++ b/tensorflow/core/framework/cancellation.h
@@ -56,6 +56,9 @@ class CancellationManager {
   // Returns true iff StartCancel() has been called.
   bool IsCancelled() { return is_cancelled_.load(std::memory_order_acquire); }
 
+  // Resets the cancellation manager to its original pre-cancelled state.
+  void Reset();
+
   // Returns a token that must be used in calls to RegisterCallback
   // and DeregisterCallback.
   CancellationToken get_cancellation_token();
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index 7fa58347f258acf327e112f4c9cd58c37134ceee..b83d183f14b28672f8da47ae642a386c69253a9b 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -64,7 +65,9 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
     same_num_devices_per_task = other.same_num_devices_per_task;
+    num_devices_per_task = other.num_devices_per_task;
     gpu_ring_order = other.gpu_ring_order;
+    communicator_key = other.communicator_key;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
@@ -76,6 +79,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     impl_details.subdiv_source_rank.assign(
         other.impl_details.subdiv_source_rank.begin(),
         other.impl_details.subdiv_source_rank.end());
+    impl_details.dependencies = other.impl_details.dependencies;
   }
   return *this;
 }
@@ -91,6 +95,13 @@ string CollInstanceParams::ToString() const {
   for (const auto& n : task_names) {
     strings::StrAppend(&v, n, ", ");
   }
+  strings::StrAppend(&v, "} num_devices_per_task={");
+  for (const auto dpt : num_devices_per_task) {
+    strings::StrAppend(&v, dpt.first, ": ", dpt.second, ", ");
+  }
+  strings::StrAppend(&v, "}, collective_name=", impl_details.collective_name,
+                     ", communicator_key=", str_util::CEscape(communicator_key),
+                     ", subdiv_offsets={");
   strings::StrAppend(&v, "}, subdiv_offsets={");
   for (const auto& d : impl_details.subdiv_offsets) {
     strings::StrAppend(&v, d, ",");
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 0321429702af74dfb18ca631b0314c705150ec06..e00cc17961cb89cfdad8d33cbca758d80a5ca274 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -42,6 +42,7 @@ class Tensor;
 enum CollectiveType {
   REDUCTION_COLLECTIVE = 0,
   BROADCAST_COLLECTIVE,
+  GATHER_COLLECTIVE,
   UNDEFINED_COLLECTIVE,
 };
 
@@ -70,6 +71,8 @@ struct CollImplDetails {
   std::vector<std::vector<int>> subdiv_permutations;
   std::vector<int> subdiv_offsets;
   std::vector<int> subdiv_source_rank;  // rank of source in each subdiv
+  std::vector<int32>
+      dependencies;  // collective instances on which this node depends
 };
 
 // Data common to all members of a collective instance.
@@ -85,9 +88,13 @@ struct CollInstanceParams {
   std::vector<string> task_names;
   // True if every task has the same number of devices.
   bool same_num_devices_per_task = false;
+  // Task -> number of devices on that task.
+  std::unordered_map<string, int32> num_devices_per_task;
   // If passed in to GPUOptions in ConfigProto, defines a good ring order for
   // GPUs.  Assumes same GPU configuration at each worker.
   string gpu_ring_order = "";
+  // Valid when using a communicator-based collective mechanism, e.g. NCCL.
+  string communicator_key;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -269,6 +276,21 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
 
   virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
 
+  // `WaitForDependencies` and `Launched` are used for fine-grained control of
+  // execution order between collective instances.  These functions are intended
+  // to be called in `Run` function of collective implementations, and may be
+  // used to make part, or whole, of the collective execution ordered with
+  // respect to other collective instances.
+  //
+  // `WaitForDependencies` will block until it is safe to continue the callee's
+  // execution, where safety is defined as: ordered with respect to the
+  // collective instances defined in the callee's `wait_for` attribute.
+  virtual void WaitForDependencies(const CollectiveParams& col_params) {}
+  // `Launched` unblocks the dependent collective instances by recording that
+  // this callee device has completed the critical portion of the collective
+  // execution.
+  virtual void Launched(const CollectiveParams& col_params) {}
+
   // Used to designate an invalid group or instance key.
   static int64 kInvalidId;
 
@@ -347,7 +369,8 @@ class CollectiveImplementationInterface {
 
   // Initializes the portions of `col_params` specific to this
   // implementation.  Called exactly once for every Collective instance during
-  // the CollectiveParams resolution process when the graph is first executed.
+  // the CollectiveParams resolution process when the graph is first executed,
+  // at the end of `CompleteInstanceLocal()`.
   // NOTE(ayushd): This is effectively a static function because it modifies the
   // `col_params` passed in and should not manipulate any data members.  However
   // because it is virtual and needs to be implemented by every derived class we
@@ -360,6 +383,14 @@ class CollectiveImplementationInterface {
   // object.
   virtual Status InitializeCollectiveContext(CollectiveContext* col_ctx) = 0;
 
+  // Initializes instance params at the beginning of `CompleteInstanceLocal()`,
+  // unlike `InitializeCollectiveParams` which is called at the end.  This
+  // function is called before all devices in the instance are discovered, and
+  // may be used to broadcast data via the shared `InstanceRec` object in
+  // collective param resolution to all devices.
+  virtual Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) = 0;
+
   // Processes and moves data according to the logic of this Collective
   // implementation.  Relies on appropriate initialization of op-specific
   // CollectiveParams in InitializeCollectiveParams(), as well as appropriate
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 876ac188ac765615a5dfca0fd1eba0086bea12b0..5c974a76aca76f14ef166d285733c5e2f9ad723b 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -203,6 +203,10 @@ Status GetWindowedOutputSizeFromDims(
 
 Status UnchangedShape(shape_inference::InferenceContext* c) {
   c->set_output(0, c->input(0));
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data != nullptr) {
+    c->set_output_handle_shapes_and_types(0, *handle_data);
+  }
   return Status::OK();
 }
 
@@ -1299,6 +1303,12 @@ Status ConcatV2Shape(InferenceContext* c) {
                            c->num_inputs() - 1 /* dim_index */);
 }
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat) {
+  return ConcatShapeHelper(c, 0 /* start_value_index */,
+                           num_inputs_to_concat /* end_value_index */,
+                           num_inputs_to_concat /* dim_index */);
+}
+
 Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
                                             ShapeHandle shape_x,
                                             ShapeHandle shape_y,
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 14b9688bdc5d41e8cb2e92b1f1a8640fb9687d8c..d421844ee607b18132f4657e7562dec04253c2fa 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -279,6 +279,8 @@ Status ConcatShape(shape_inference::InferenceContext* c,
 // Shape function for concat operations.
 Status ConcatV2Shape(shape_inference::InferenceContext* c);
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat);
+
 // Shape function for binary operators that broadcast their inputs
 // and with output to output_index.
 // Note: out cannot be NULL.
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 5fed06ed6e7d8f6e4808272c69dd8eb4ec7e1ea5..f5a1db35f945c9cbef5d3623bae34700c2c729cf 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -191,13 +191,13 @@ Status GraphDefBuilderWrapper::AddDataset(
     const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
     Node** output) {
-  const string& name = dataset->name();
+  const string& type_string = dataset->type_string();
   std::unique_ptr<const GraphDefBuilder::Options> opts(
       new GraphDefBuilder::Options(b_->opts()));
   // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
   // attributes defined. It will be nice to have a consistent pattern.
-  bool has_output_types_attr = HasAttr(name, "output_types");
-  bool has_output_shapes_attr = HasAttr(name, "output_shapes");
+  bool has_output_types_attr = HasAttr(type_string, "output_types");
+  bool has_output_shapes_attr = HasAttr(type_string, "output_shapes");
   if (has_output_shapes_attr) {
     opts.reset(new GraphDefBuilder::Options(
         opts->WithAttr("output_shapes", dataset->output_shapes())));
@@ -214,7 +214,8 @@ Status GraphDefBuilderWrapper::AddDataset(
     return errors::Internal("AddDataset: Failed to build Options with error ",
                             opts->StatusToString());
   }
-  NodeBuilder node_builder(opts->GetNameForOp(name), name, opts->op_registry());
+  NodeBuilder node_builder(opts->GetNameForOp(type_string), type_string,
+                           opts->op_registry());
   {
     size_t total_size = inputs.size() + list_inputs.size();
     auto inputs_iter = inputs.begin();
@@ -239,7 +240,7 @@ Status GraphDefBuilderWrapper::AddDataset(
   }
   *output = opts->FinalizeBuilder(&node_builder);
   if (*output == nullptr) {
-    return errors::Internal("AddDataset: Failed to build ", name,
+    return errors::Internal("AddDataset: Failed to build ", type_string,
                             " op with error ", opts->StatusToString());
   }
   return Status::OK();
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index cca10fa49e86c062a7d6fa8b25901c7c1fb87d95..0c38801154ecda329dda9f2714685596385fcb02 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <unordered_map>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/thread_factory.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
@@ -287,7 +289,8 @@ class IteratorContext {
           model(ctx->model()),
           runner(*(ctx->runner())),
           runner_threadpool_size(ctx->runner_threadpool_size()),
-          stats_aggregator(ctx->stats_aggregator()) {}
+          stats_aggregator(ctx->stats_aggregator()),
+          thread_factory(ctx->thread_factory()) {}
 
     explicit Params(OpKernelContext* ctx)
         : env(ctx->env()),
@@ -338,6 +341,10 @@ class IteratorContext {
 
     // The `StatsAggregator` object to record statistics about the iterator.
     std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
+
+    // A `ThreadFactory` for creating threads used by iterators to perform
+    // blocking work.
+    std::shared_ptr<ThreadFactory> thread_factory = nullptr;
   };
 
   explicit IteratorContext(IteratorContext* ctx) : params_(Params{ctx}) {}
@@ -374,6 +381,20 @@ class IteratorContext {
     return &params_.runner;
   }
 
+  const std::shared_ptr<ThreadFactory>& thread_factory() {
+    return params_.thread_factory;
+  }
+
+  std::unique_ptr<Thread> StartThread(const string& name,
+                                      std::function<void()> fn) {
+    if (params_.thread_factory) {
+      return params_.thread_factory->StartThread(name, std::move(fn));
+    } else {
+      return absl::WrapUnique(
+          Env::Default()->StartThread({}, name, std::move(fn)));
+    }
+  }
+
   int32 runner_threadpool_size() { return params_.runner_threadpool_size; }
 
   std::shared_ptr<StatsAggregator> stats_aggregator() {
@@ -524,16 +545,20 @@ class IteratorBase {
 class DatasetContext {
  public:
   struct Params {
-    string name;
+    string type_string;  // op type name of this dataset.
+    string node_name;    // graph node name of this dataset op, uniquely
+                         // identifying the dataset in the graph.
   };
 
   explicit DatasetContext(Params params) : params_(std::move(params)) {}
 
   explicit DatasetContext(OpKernelContext* ctx) {
-    params_.name = ctx->op_kernel().type_string();
+    params_.type_string = ctx->op_kernel().type_string();
+    params_.node_name = ctx->op_kernel().name();
   }
 
-  const string& name() const { return params_.name; }
+  const string& type_string() const { return params_.type_string; }
+  const string& node_name() const { return params_.node_name; }
 
  private:
   Params params_;
@@ -569,9 +594,15 @@ class DatasetBase : public core::RefCounted {
   // format.
   TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
 
-  explicit DatasetBase(DatasetContext&& ctx) : name_(ctx.name()) {}
+  explicit DatasetBase(DatasetContext&& ctx)
+      : type_string_(ctx.type_string()), node_name_(ctx.node_name()) {}
+
+  // Op type name of this dataset.
+  const string& type_string() const { return type_string_; }
 
-  const string& name() const { return name_; }
+  // Graph node name of this dataset op, uniquely identifying the dataset in
+  // the graph.
+  const string& node_name() const { return node_name_; }
 
   // Returns a new iterator for iterating over the range of elements in
   // this dataset.
@@ -650,7 +681,8 @@ class DatasetBase : public core::RefCounted {
     };
   }
 
-  const string name_;
+  const string type_string_;
+  const string node_name_;
 };
 
 // Represents an iterator that is associated with a particular dataset.
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 321947aca8e06008c3291fa43befa389b53f998c..89ba662b69b060b1b76a0a22630acd4ecb80bed6 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -246,6 +246,15 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+  // Some devices (i.e. GPUs) may free device memory prior to its actual use
+  // being completed on the assumption that subsequent allocations can only be
+  // used serially with respect to pending uses.  If this function returns a
+  // non-zero value it is the value of a device-specific counter such that any
+  // device memory tagged with an earlier freed-at count is really unencumbered
+  // by pending uses.  For this to be useful the device memory allocator must
+  // be tagging deallocated memory chunks using the same counter.
+  virtual uint64 SafeAllocFrontier() { return 0; }
+
  protected:
   // Does not take ownership.
   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index b7adfd0c947b60ff9295c867f4afdf756208b126..9923fdffb8524eef61bf97c052d6be722a449c07 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -569,6 +569,9 @@ string Print(const FunctionDef& fdef) {
   for (const auto& n : fdef.node_def()) {
     strings::StrAppend(&out, "  ", Print(n), "\n");
   }
+  for (const auto& cr : fdef.control_ret()) {
+    strings::StrAppend(&out, "  @return ", cr.first, " = ", cr.second, "\n");
+  }
   for (const auto& r : fdef.ret()) {
     strings::StrAppend(&out, "  return ", r.first, " = ", r.second, "\n");
   }
@@ -613,6 +616,8 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
         return strings::StrCat(DataTypeString(dt), "@", parsed.type, ":",
                                parsed.id);
       } else {
+        LOG(WARNING) << "Failed to parse device \"" << n.device() << "\" in "
+                     << n.op() << ":" << n.name();
         return strings::StrCat(DataTypeString(dt), "@",
                                "<FAILED_TO_PARSE_DEVICE>");
       }
@@ -677,7 +682,7 @@ Status AddDefaultAttrs(const string& op,
 Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
-  VLOG(3) << "Instantiation Function: " << Print(fdef);
+  VLOG(4) << "Instantiation Function: " << Print(fdef);
 
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
@@ -825,6 +830,12 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   std::map<string, string> ret2(f2.ret().begin(), f2.ret().end());
   if (ret1 != ret2) return false;
 
+  std::map<string, string> control_ret1(f1.control_ret().begin(),
+                                        f1.control_ret().end());
+  std::map<string, string> control_ret2(f2.control_ret().begin(),
+                                        f2.control_ret().end());
+  if (control_ret1 != control_ret2) return false;
+
   return true;
 }
 
@@ -849,6 +860,14 @@ uint64 FunctionDefHash(const FunctionDef& fdef) {
     h = Hash64(p.second.data(), p.second.size(), h);
   }
 
+  // control output names
+  std::map<string, string> control_ret(fdef.control_ret().begin(),
+                                       fdef.control_ret().end());
+  for (const auto& p : control_ret) {
+    h = Hash64(p.first.data(), p.first.size(), h);
+    h = Hash64(p.second.data(), p.second.size(), h);
+  }
+
   return h;
 }
 
@@ -900,6 +919,12 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
   if (!executor_type.empty()) {
     entries.push_back(strings::StrCat(kExecutorAttr, "=", executor_type));
   }
+  string config_proto_serialized;
+  options.config_proto.SerializeToString(&config_proto_serialized);
+  if (!config_proto_serialized.empty()) {
+    entries.push_back(strings::StrCat(
+        "_config_proto", "=", str_util::CEscape(config_proto_serialized)));
+  }
   std::sort(entries.begin(), entries.end());
   return strings::StrCat(funcname, "[", str_util::Join(entries, ","), "]");
 }
@@ -1339,7 +1364,7 @@ GET_ATTR(bool)
 
 namespace {
 
-constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+constexpr char kApiImplements[] = "api_implements";
 
 absl::flat_hash_set<string> ReachableFunctions(
     const FunctionLibraryDefinition& flib,
@@ -1347,10 +1372,10 @@ absl::flat_hash_set<string> ReachableFunctions(
   // Functions that are reachable from the graph.
   absl::flat_hash_set<string> reachable_funcs;
 
-  // For any functions, if it has attribute "experimental_api_implements" =
+  // For any functions, if it has attribute "api_implements" =
   // "some_interface" and it is reachable, then it means any other
   // function with same attribute name and value could also be potentially
-  // reachable, eg via experimental_implementation_selector swapping the
+  // reachable, eg via implementation_selector swapping the
   // nodedef.
   absl::flat_hash_set<string> reachable_api_interface;
 
@@ -1400,7 +1425,7 @@ absl::flat_hash_set<string> ReachableFunctions(
     const string& func_name = func->signature().name();
     reachable_funcs.insert(func_name);
 
-    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func->attr().find(kApiImplements);
     if (attr_it != func->attr().end()) {
       reachable_api_interface.insert(attr_it->second.s());
     }
@@ -1416,7 +1441,7 @@ absl::flat_hash_set<string> ReachableFunctions(
 
   for (const auto& func_name : flib.ListFunctionNames()) {
     const auto& func_def = flib.Find(func_name);
-    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func_def->attr().find(kApiImplements);
     if (attr_it != func_def->attr().end()) {
       if (reachable_api_interface.contains(attr_it->second.s())) {
         reachable_funcs.insert(func_name);
@@ -1512,7 +1537,8 @@ FunctionDef FunctionDefHelper::Create(
     const string& function_name, gtl::ArraySlice<string> in_def,
     gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
     gtl::ArraySlice<Node> node_def,
-    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+    gtl::ArraySlice<std::pair<string, string>> ret_def,
+    gtl::ArraySlice<std::pair<string, string>> control_ret_def) {
   FunctionDef fdef;
 
   // Signature
@@ -1520,6 +1546,7 @@ FunctionDef FunctionDefHelper::Create(
   for (const auto& i : in_def) b.Input(i);
   for (const auto& o : out_def) b.Output(o);
   for (const auto& a : attr_def) b.Attr(a);
+  for (const auto& c : control_ret_def) b.ControlOutput(c.first);
 
   OpRegistrationData op_reg_data;
   TF_CHECK_OK(b.Finalize(&op_reg_data));
@@ -1535,6 +1562,11 @@ FunctionDef FunctionDefHelper::Create(
     fdef.mutable_ret()->insert({r.first, r.second});
   }
 
+  // Control returns
+  for (const auto& cr : control_ret_def) {
+    fdef.mutable_control_ret()->insert({cr.first, cr.second});
+  }
+
   auto* op_def_registry = OpRegistry::Global();
   // Check if any op is stateful.
   for (const auto& n : node_def) {
@@ -1550,6 +1582,16 @@ FunctionDef FunctionDefHelper::Create(
   return fdef;
 }
 
+/* static */
+FunctionDef FunctionDefHelper::Create(
+    const string& function_name, gtl::ArraySlice<string> in_def,
+    gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+    gtl::ArraySlice<Node> node_def,
+    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+  return Create(function_name, in_def, out_def, attr_def, node_def, ret_def,
+                /*control_ret_def=*/{});
+}
+
 /* static */
 FunctionDef FunctionDefHelper::Define(const string& name,
                                       gtl::ArraySlice<string> arg_def,
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 79755f599cfc80fa3ccdbadc83cef65667d07250..db300588c1e7f40b44cc54453719e73ca0359b2d 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
@@ -122,9 +123,23 @@ class FunctionDefHelper {
     NodeDef ToNodeDef() const;
   };
 
-  // The Create() function uses the new NodeDef field.  `ret_def`
-  // holds a mapping from the function output names from `out_def` to
-  // the node outputs from `node_def`.
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
+  // - `control_ret_def` holds a mapping from the function control
+  //   output names to the nodes from `node_def`.
+  static FunctionDef Create(
+      const string& function_name, gtl::ArraySlice<string> in_def,
+      gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+      gtl::ArraySlice<Node> node_def,
+      gtl::ArraySlice<std::pair<string, string>> ret_def,
+      gtl::ArraySlice<std::pair<string, string>> control_ret_def);
+
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
   static FunctionDef Create(const string& function_name,
                             gtl::ArraySlice<string> in_def,
                             gtl::ArraySlice<string> out_def,
@@ -132,7 +147,6 @@ class FunctionDefHelper {
                             gtl::ArraySlice<Node> node_def,
                             gtl::ArraySlice<std::pair<string, string>> ret_def);
 
-  // The two Define() functions use the old FunctionDef::Node field.
   // TODO(josh11b): Get rid of these and transition to the one above.
   static FunctionDef Define(const string& function_name,
                             gtl::ArraySlice<string> arg_def,
@@ -551,9 +565,15 @@ class FunctionLibraryRuntime {
     // surface errors earlier.
     bool create_kernels_eagerly = false;
 
+    // This interface is EXPERIMENTAL and subject to change.
+    //
+    // Instantiates the function with the provided config_proto.
+    ConfigProto config_proto;
+
     // If provided, this optimization function will be invoked before
     // the placer for multi-device functions.
     std::function<Status(std::vector<string> /*ret_node_names*/,
+                         std::vector<string> /*keep_node_names*/,
                          FunctionLibraryDefinition*, const DeviceSet&,
                          Device* /*cpu_device*/, std::unique_ptr<Graph>*)>
         optimize_graph_fn;
@@ -641,6 +661,11 @@ class FunctionLibraryRuntime {
   // Returns the device on which the function executes.
   virtual Device* device() = 0;
 
+  // Returns the default runner in which the ops should be launched. If the
+  // device on which the function executes has a private thread pool, return
+  // runner on the device local thread pool.
+  virtual std::function<void(std::function<void()>)>* runner() = 0;
+
   // Get the DeviceMgr from which the device was obtained.
   virtual const DeviceMgr* device_mgr() const = 0;
 
diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto
index e69d3938d93d109a7cb0c940e8f981d30f464599..64f406bfd73c847e64d58553143aa91b2dc5f424 100644
--- a/tensorflow/core/framework/function.proto
+++ b/tensorflow/core/framework/function.proto
@@ -76,6 +76,10 @@ message FunctionDef {
   // A mapping from the output arg names from `signature` to the
   // outputs from `node_def` that should be returned by the function.
   map<string, string> ret = 4;
+
+  // A mapping from control output names from `signature` to node names in
+  // `node_def` which should be control outputs of this function.
+  map<string, string> control_ret = 6;
 }
 
 // GradientDef defines the gradient function of a function defined in
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 6a828e9afaaec536d4d5ef51d50dec88fdd6d391..6fbbabfc95d13d7574f578ddd05f9887435aa0d1 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -156,6 +156,48 @@ ControlDep(x:int32) -> (y:int32) {
   EXPECT_EQ(DebugString(result.nodes), e2);
 }
 
+TEST(TFunc, ControlRet) {
+  auto fdef = FDH::Create(
+      // Name
+      "ControlRet",
+      // Inputs
+      {"x: int32"},
+      // Outputs
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {
+          {{"a"}, "Identity", {"x"}, {{"T", DT_INT32}}},
+      },
+      // Returns
+      {{"y", "a:output:0"}},
+      // Control returns
+      {{"must_execute", "a"}});
+
+  const char* e = R"P(
+ControlRet(x:int32) -> (y:int32) {
+  a = Identity[T=int32](x)
+  @return must_execute = a
+  return y = a:output:0
+}
+)P";
+  EXPECT_EQ(DebugString(fdef), e);
+
+  // Instantiate one with T=float
+  InstantiationResult result;
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
+  const char* e2 = R"P(
+(x:int32) -> (a:int32) {
+  a = Identity[T=int32](x)
+}
+)P";
+  EXPECT_EQ(result.arg_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(result.ret_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(DebugString(result.nodes), e2);
+}
+
 REGISTER_OP("HasDefaultType")
     .Output("out: T")
     .Attr("T: {float, double, int32, int64} = DT_FLOAT");
@@ -1320,7 +1362,7 @@ TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
 
     if (!interface_name.empty()) {
       auto* attr = func_def.mutable_attr();
-      (*attr)["experimental_api_implements"].set_s(interface_name);
+      (*attr)["api_implements"].set_s(interface_name);
     }
     return func_def;
   };
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 47c8dbe61b6ac00f7743a331a64b2b1460929ce5..a1c87a3f4210b7fb95597bed03a4d922a81fbfdf 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -380,7 +380,12 @@ std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
     output_ = node;
   }
   if (output) {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id()
+            << ") as input for " << output->name() << "(id:" << output->id()
+            << ")";
     output->add_input(node);
+  } else {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id() << ")";
   }
   collect_resource_usage_ =
       collect_resource_usage_ || node->has_tunable_parameters();
@@ -493,10 +498,13 @@ void Model::RecordStop(const string& name, bool start_output) {
 void Model::RemoveNode(const string& name) {
   mutex_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node && (*node)->output()) {
-    (*node)->output()->remove_input(*node);
+  if (node) {
+    if ((*node)->output()) {
+      (*node)->output()->remove_input(*node);
+    }
+    VLOG(3) << "Removing " << (*node)->name() << "(id:" << (*node)->id() << ")";
+    remove_node_hook_(*node);
   }
-  remove_node_hook_(*node);
   lookup_table_.erase(name);
 }
 
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 7fac1753a6332e1db4d01c15e68242ac15b388ca..5d3946b1fe290729bf25eeaf5d26f66fa831b186 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -247,10 +247,13 @@ class Node {
       LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     std::shared_ptr<Node> result = Clone(output);
-    result->buffered_bytes_ = buffered_bytes_;
-    result->processing_time_ = processing_time_;
-    result->num_elements_ = num_elements_;
-    result->parameters_ = parameters_;
+    {
+      mutex_lock l2(result->mu_);
+      result->buffered_bytes_ = buffered_bytes_;
+      result->processing_time_ = processing_time_;
+      result->num_elements_ = num_elements_;
+      result->parameters_ = parameters_;
+    }
     for (auto& input : inputs_) {
       result->add_input(input->Snapshot(result));
     }
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index cc583df348b8d4d5416e428698fe1a49c29f3637..03b4456393f5a742dd9ed3d5ea274cb4ccd5df30 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -235,7 +235,7 @@ TEST_F(NodeDefBuilderTest, Polymorphic) {
       op: "Polymorphic" input: "a"
       attr { key: "T" value { type: DT_BOOL } } )proto");
 
-  // Conficting Attr()
+  // Conflicting Attr()
   ExpectFailure(Builder().Input(FakeInput(DT_BOOL)).Attr("T", DT_STRING),
                 "Inconsistent values for attr 'T' DT_BOOL vs. DT_STRING while");
 
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index e369e882a0961e60d20f52e0155e9738bf16415e..fee52375c139ada0e457efe1247a18d471e8aa46 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -515,10 +515,13 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
           ". (Check whether your GraphDef-interpreting binary is up to date "
           "with your GraphDef-generating binary.).");
     }
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        ValidateAttrValue(attr.second, *iter->second),
-        "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
-        SummarizeOpDef(op_def));
+    // If attr value is placeholder, do not check it.
+    if (attr.second.placeholder().empty()) {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(
+          ValidateAttrValue(attr.second, *iter->second),
+          "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
+          SummarizeOpDef(op_def));
+    }
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
   }
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index b8309eafb05251235bfaaa7b5489cac06f0024dc..b29d7ae77f031a9fff0dfa6280a43dba75f4ab71 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -60,6 +60,21 @@ void OpRegistry::Register(const OpRegistrationDataFactory& op_data_factory) {
 
 Status OpRegistry::LookUp(const string& op_type_name,
                           const OpRegistrationData** op_reg_data) const {
+  {
+    tf_shared_lock l(mu_);
+    if (initialized_) {
+      if (const OpRegistrationData* res =
+              gtl::FindWithDefault(registry_, op_type_name, nullptr)) {
+        *op_reg_data = res;
+        return Status::OK();
+      }
+    }
+  }
+  return LookUpSlow(op_type_name, op_reg_data);
+}
+
+Status OpRegistry::LookUpSlow(const string& op_type_name,
+                              const OpRegistrationData** op_reg_data) const {
   *op_reg_data = nullptr;
   const OpRegistrationData* res = nullptr;
 
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 81ed5f95f0bf020780f1d71692388885ce702b70..538ce04ef44f591c7090489f7723121ee362e54f 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -144,6 +144,9 @@ class OpRegistry : public OpRegistryInterface {
   Status RegisterAlreadyLocked(const OpRegistrationDataFactory& op_data_factory)
       const EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  Status LookUpSlow(const string& op_type_name,
+                    const OpRegistrationData** op_reg_data) const;
+
   mutable mutex mu_;
   // Functions in deferred_ may only be called with mu_ held.
   mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index aea2d2bb09a2c2c80ae02b10b1222d6882606c3c..e44ecc9f6236210b3bcb21a4914243741c632d2c 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -54,6 +54,10 @@ message OpDef {
   // Description of the output(s).
   repeated ArgDef output_arg = 3;
 
+  // Named control outputs for this operation. Useful only for composite
+  // operations (i.e. functions) which want to name different control outputs.
+  repeated string control_output = 20;
+
   // Description of the graph-construction-time configuration of this
   // Op.  That is to say, this describes the attr fields that will
   // be specified in the NodeDef.
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 8a9bb6318211ad1537727d7e60945897c4a9a63d..0a62a2e871ab1bfeea4c7cbc14e93173bbc1a3c1 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -316,6 +316,14 @@ bool ConsumeInOutTimesType(StringPiece* sp, StringPiece* out) {
       .GetResult(sp, out);
 }
 
+bool ConsumeControlOutName(StringPiece* sp, StringPiece* out) {
+  return Scanner(*sp)
+      .One(Scanner::LETTER)
+      .Any(Scanner::LETTER_DIGIT_UNDERSCORE)
+      .StopCapture()
+      .GetResult(sp, out);
+}
+
 #define VERIFY(expr, ...)                                             \
   do {                                                                \
     if (!(expr)) {                                                    \
@@ -409,6 +417,25 @@ void FinalizeInputOrOutput(StringPiece spec, bool is_output, OpDef* op_def,
 
 #undef VERIFY
 
+string ControlOutError(StringPiece orig, const string& op_name) {
+  return strings::StrCat(" from ControlOutput(\"", orig, "\") for Op ",
+                         op_name);
+}
+
+void FinalizeControlOutput(StringPiece name, OpDef* op_def,
+                           std::vector<string>* errors) {
+  StringPiece orig(name);
+
+  // Parse control output name.
+  StringPiece tmp_name;
+  if (!ConsumeControlOutName(&orig, &tmp_name)) {
+    errors->push_back(strings::StrCat("Trouble parsing 'name:'",
+                                      ControlOutError(orig, op_def->name())));
+  }
+
+  *op_def->add_control_output() = string(tmp_name.data(), tmp_name.size());
+}
+
 int num_leading_spaces(StringPiece s) {
   size_t i = 0;
   while (i < s.size() && s[i] == ' ') {
@@ -545,6 +572,11 @@ OpDefBuilder& OpDefBuilder::Output(string spec) {
   return *this;
 }
 
+OpDefBuilder& OpDefBuilder::ControlOutput(string name) {
+  control_outputs_.push_back(std::move(name));
+  return *this;
+}
+
 #ifndef TF_LEAN_BINARY
 OpDefBuilder& OpDefBuilder::Doc(string text) {
   if (!doc_.empty()) {
@@ -614,6 +646,9 @@ Status OpDefBuilder::Finalize(OpRegistrationData* op_reg_data) const {
   for (StringPiece output : outputs_) {
     FinalizeInputOrOutput(output, true, op_def, &errors);
   }
+  for (StringPiece control_output : control_outputs_) {
+    FinalizeControlOutput(control_output, op_def, &errors);
+  }
   FinalizeDoc(doc_, op_def, &errors);
 
   if (errors.empty()) return Status::OK();
diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h
index 8077b20598c210d9266c168569f3d9a3a190c097..38d3f5cfc608d19b90b56b648b2ffb6bccbdd8f3 100644
--- a/tensorflow/core/framework/op_def_builder.h
+++ b/tensorflow/core/framework/op_def_builder.h
@@ -28,6 +28,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class FunctionDefHelper;
+
 namespace shape_inference {
 class InferenceContext;
 }
@@ -150,12 +152,20 @@ class OpDefBuilder {
   Status Finalize(OpRegistrationData* op_reg_data) const;
 
  private:
+  friend class FunctionDefHelper;
+
+  // Adds control output to this OpDefBuilder (and returns *this).
+  // The <name> must be a valid node name (matches regexp
+  // [a-zA-Z][a-zA-Z0-9_]*). Named control output can only exist for functions.
+  OpDefBuilder& ControlOutput(string name);
+
   OpDef* op_def() { return &op_reg_data_.op_def; }
 
   OpRegistrationData op_reg_data_;
   std::vector<string> attrs_;
   std::vector<string> inputs_;
   std::vector<string> outputs_;
+  std::vector<string> control_outputs_;
   string doc_;
   std::vector<string> errors_;
 };
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 3597f43d51987b0d46df90ad0db964927f16adf0..9c47ac0f017779fccfa40ab521a161e83fd1e7df 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -114,6 +114,8 @@ Status ValidateAttrValue(const AttrValue& attr_value,
         length = attr_value.list().shape_size();
       } else if (attr.type() == "list(tensor)") {
         length = attr_value.list().tensor_size();
+      } else if (attr.type() == "list(func)") {
+        length = attr_value.list().func_size();
       }
       if (length < attr.minimum()) {
         return errors::InvalidArgument(
@@ -833,25 +835,37 @@ bool OpDefEqual(const OpDef& o1, const OpDef& o2) {
   // Compare it separately here instead of serializing below.
   if (!RepeatedAttrDefEqual(o1.attr(), o2.attr())) return false;
 
-  // Clear attr field, serialize, and compare serialized strings
+  // `control_output` order doesn't matter.
+  std::set<string> control_output1(o1.control_output().begin(),
+                                   o1.control_output().end());
+  std::set<string> control_output2(o2.control_output().begin(),
+                                   o2.control_output().end());
+  if (control_output1 != control_output2) return false;
+
+  // Clear `attr` and `control_output` fields, serialize, and compare serialized
+  // strings.
   OpDef o1_copy = o1;
   OpDef o2_copy = o2;
   o1_copy.clear_attr();
+  o1_copy.clear_control_output();
   o2_copy.clear_attr();
-  string s1, s2;
-  SerializeToStringDeterministic(o1_copy, &s1);
-  SerializeToStringDeterministic(o2_copy, &s2);
-  if (s1 != s2) return false;
-  return true;
+  o2_copy.clear_control_output();
+
+  return AreSerializedProtosEqual(o1_copy, o2_copy);
 }
 
 uint64 OpDefHash(const OpDef& o) {
   uint64 h = RepeatedAttrDefHash(o.attr());
+
+  // Compute deterministic order-independent control outputs hash.
+  std::set<string> control_output(o.control_output().begin(),
+                                  o.control_output().end());
+  for (const auto& co : control_output) h = Hash64Combine(h, Hash64(co));
+
   OpDef o_copy = o;
   o_copy.clear_attr();
-  string s;
-  SerializeToStringDeterministic(o_copy, &s);
-  return Hash64(s.data(), s.size(), h);
+  o_copy.clear_control_output();
+  return DeterministicProtoHash64(o_copy, h);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 92a7038a404d2bf7f5bbf1e643f727f8c3dfc74a..4a94f97c767987733954692f1ed3d2a40e3c4bb4 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -228,7 +228,6 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
     // Add every line to unescaped until we see the "END" string.
     string unescaped;
     bool first = true;
-    string suffix;
     while (!multiline_pbtxt.empty()) {
       SplitAt('\n', &multiline_pbtxt, &line);
       if (str_util::ConsumePrefix(&line, end)) break;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 789f0fda7526fadc667e51046a344062a9532670..16ca40c31c73e0cab9cab408d59ac230b95e6cde 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -20,6 +20,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <cstdlib>
+#include <cstring>
+
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -998,6 +1001,12 @@ static Status IsProbablySafeToLoad(const string& path) {
 
 void LoadDynamicKernelsInternal() {
   Env* env = Env::Default();
+
+  // Override to allow loading unsafe packages for development.
+  // DO NOT USE UNLESS YOU KNOW WHAT ABI ISSUES YOU CAN ENCOUNTER.
+  bool override_abi_check =
+      strcmp(getenv("TF_REALLY_LOAD_UNSAFE_PACKAGES"), "1") == 0;
+
   string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
                                          "tensorflow",
                                          "core",
@@ -1010,7 +1019,12 @@ void LoadDynamicKernelsInternal() {
       string fullpath = io::JoinPath(bazel_kernel_dir, file);
       if (env->MatchPath(fullpath, dll_spec)) {
         Status s = IsProbablySafeToLoad(fullpath);
-        if (s.ok()) {
+        if (!s.ok() && override_abi_check) {
+          LOG(WARNING) << "Loading UNSAFE library " << fullpath
+                       << " because ABI check override is set: "
+                       << s.error_message();
+        }
+        if (s.ok() || override_abi_check) {
           // TODO(gunan): Store the handles to the opened files.
           void* unused_filehandle;
           TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
@@ -1074,6 +1088,11 @@ void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
   delete kernel_def;
 }
 
+OpKernel* OpKernelRegistrar::PtrOpKernelFactory::Create(
+    OpKernelConstruction* context) {
+  return (*create_func_)(context);
+}
+
 }  // namespace kernel_factory
 
 namespace {
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 06b90964ad1f7e4c8047f79ec37bee097327be9a..5b8521ba707b3f8b7e32028b4b5c56de8d268650 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -525,11 +525,42 @@ struct TensorValue {
 // Used to store partitioned graphs from function-calling ops.
 struct GraphCollector {
   mutex mu;
-  std::vector<GraphDef> graphs GUARDED_BY(mu);
+  std::vector<GraphDef> partitioned_graphs GUARDED_BY(mu);
+  GraphDef raw_graph GUARDED_BY(mu);
+  GraphDef optimized_graph GUARDED_BY(mu);
 
-  void CollectGraph(const GraphDef& graph) {
+  bool dirty GUARDED_BY(mu);
+
+  GraphCollector() : dirty(false) {}
+
+  void CollectRawGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    raw_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectOptimizedGraph(const GraphDef& graph) {
     mutex_lock ml(mu);
-    graphs.push_back(graph);
+    optimized_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectPartitionedGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    partitioned_graphs.push_back(graph);
+    dirty = true;
+  }
+
+  void ClearGraphs() EXCLUSIVE_LOCKS_REQUIRED(mu) {
+    raw_graph.Clear();
+    optimized_graph.Clear();
+    partitioned_graphs.clear();
+    dirty = false;
+  }
+
+  bool HasUpdatedGraphs() {
+    mutex_lock ml(mu);
+    return dirty;
   }
 };
 
@@ -1107,7 +1138,7 @@ class OpKernelContext {
 
   // Cancellation.
   //
-  // EXPERIMENTAL. See the implementation in tensorflow::TensorQueue for an
+  // EXPERIMENTAL. See the implementation in tensorflow::FIFOQueue for an
   // example of how to use this API.
   CancellationManager* cancellation_manager() const {
     return params_->cancellation_manager;
@@ -1436,23 +1467,21 @@ class OpKernelRegistrar {
     // Perform the check in the header to allow compile-time optimization
     // to a no-op, allowing the linker to remove the kernel symbols.
     if (kernel_def != nullptr) {
-      struct PtrOpKernelFactory : public OpKernelFactory {
-        explicit PtrOpKernelFactory(
-            OpKernel* (*create_func)(OpKernelConstruction*))
-            : create_func_(create_func) {}
-
-        OpKernel* Create(OpKernelConstruction* context) override {
-          return (*create_func_)(context);
-        }
-
-        OpKernel* (*create_func_)(OpKernelConstruction*);
-      };
       InitInternal(kernel_def, kernel_class_name,
                    absl::make_unique<PtrOpKernelFactory>(create_fn));
     }
   }
 
  private:
+  struct PtrOpKernelFactory : public OpKernelFactory {
+    explicit PtrOpKernelFactory(OpKernel* (*create_func)(OpKernelConstruction*))
+        : create_func_(create_func) {}
+
+    OpKernel* Create(OpKernelConstruction* context) override;
+
+    OpKernel* (*create_func_)(OpKernelConstruction*);
+  };
+
   void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name,
                     std::unique_ptr<OpKernelFactory> factory);
 };
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index f84ef0f953cf23e3fb2af210706586f95cfbb8ad..ed4ff240393eab495e04b85d80b25377c578ac1e 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -241,7 +241,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
   num_records_produced_ = state.num_records_produced();
   work_ = state.current_work();
   if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
@@ -251,7 +251,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
         debug_string);
   }
   if (work_started_ > work_finished_) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || (__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index 7a777f064c7b517de9f9c1c14648e5ff32ca4b5e..8f16c6fd83958fad13eafd104e0a30d879b9c795 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -318,7 +318,6 @@ void BM_SendRecv(int iters) {
   Tensor val(DT_STRING, TensorShape({}));
   bool is_dead = false;
   Rendezvous::Args args;
-  Status s;
   if (iters > 0) {
     while (iters--) {
       TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
@@ -343,7 +342,6 @@ void BM_PingPong(int iters) {
     Tensor foo(DT_STRING, TensorShape({}));
     bool is_dead = false;
     Rendezvous::Args args;
-    Status s;
     for (int i = 0; i < iters; ++i) {
       TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &foo, &is_dead));
       TF_CHECK_OK(rendez->Send(KeyBar(), args, bar, is_dead));
@@ -354,7 +352,6 @@ void BM_PingPong(int iters) {
   Tensor bar(DT_STRING, TensorShape({}));
   bool is_dead = false;
   Rendezvous::Args args;
-  Status s;
   for (int i = 0; i < iters; ++i) {
     TF_CHECK_OK(rendez->Send(KeyFoo(), args, foo, is_dead));
     TF_CHECK_OK(rendez->Recv(KeyBar(), args, &bar, &is_dead));
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 9c381e7d6b4e909689591d3a75bfabbecd886a0d..da547d5829f846ae87857c410d731bcc9457cd3b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -89,9 +89,17 @@ class ScopedStepContainer {
   // step_id: the unique ID of this step. Doesn't have to be sequential, just
   // has to be unique.
   // cleanup: callback to delete a container of this name.
+  // prefix: optional string prefix to disambiguate step containers.
   ScopedStepContainer(const int64 step_id,
                       std::function<void(const string&)> cleanup)
       : name_(strings::StrCat("__per_step_", step_id)), cleanup_(cleanup) {}
+
+  ScopedStepContainer(const int64 step_id,
+                      std::function<void(const string&)> cleanup,
+                      const string& prefix)
+      : name_(strings::StrCat("__", prefix, "_per_step_", step_id)),
+        cleanup_(cleanup) {}
+
   ~ScopedStepContainer() { cleanup_(name_); }
 
   const string& name() const { return name_; }
@@ -124,14 +132,14 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status Lookup(const string& container, const string& name,
                 T** resource) const TF_MUST_USE_RESULT;
 
   // Similar to Lookup, but looks up multiple resources at once, with only a
   // single lock acquisition.  If containers_and_names[i] is uninitialized
   // then this function does not modify resources[i].
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupMany(absl::Span<std::pair<const string*, const string*> const>
                         containers_and_names,
                     std::vector<std::unique_ptr<T, core::RefCountDeleter>>*
@@ -147,7 +155,7 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupOrCreate(const string& container, const string& name,
                         T** resource,
                         std::function<Status(T**)> creator) TF_MUST_USE_RESULT;
@@ -188,7 +196,7 @@ class ResourceMgr {
   mutable mutex mu_;
   std::unordered_map<string, Container*> containers_ GUARDED_BY(mu_);
 
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupInternal(const string& container, const string& name,
                         T** resource) const
       SHARED_LOCKS_REQUIRED(mu_) TF_MUST_USE_RESULT;
@@ -259,7 +267,7 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value);
 //
 // If the lookup is successful, the caller takes the ownership of one ref on
 // `*value`, and must call its `Unref()` method when it has finished using it.
-template <typename T>
+template <typename T, bool use_dynamic_cast = false>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, T** value);
 
 // Looks up multiple resources pointed by a sequence of resource handles.  If
@@ -429,15 +437,15 @@ Status ResourceMgr::Create(const string& container, const string& name,
   return DoCreate(container, MakeTypeIndex<T>(), name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::Lookup(const string& container, const string& name,
                            T** resource) const {
   CheckDeriveFromResourceBase<T>();
   tf_shared_lock l(mu_);
-  return LookupInternal(container, name, resource);
+  return LookupInternal<T, use_dynamic_cast>(container, name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupMany(
     absl::Span<std::pair<const string*, const string*> const>
         containers_and_names,
@@ -447,8 +455,9 @@ Status ResourceMgr::LookupMany(
   resources->resize(containers_and_names.size());
   for (size_t i = 0; i < containers_and_names.size(); ++i) {
     T* resource;
-    Status s = LookupInternal(*containers_and_names[i].first,
-                              *containers_and_names[i].second, &resource);
+    Status s = LookupInternal<T, use_dynamic_cast>(
+        *containers_and_names[i].first, *containers_and_names[i].second,
+        &resource);
     if (s.ok()) {
       (*resources)[i].reset(resource);
     }
@@ -456,7 +465,18 @@ Status ResourceMgr::LookupMany(
   return Status::OK();
 }
 
+// Simple wrapper to allow conditional dynamic / static casts.
+template <typename T, bool use_dynamic_cast>
+struct TypeCastFunctor {
+  static T* Cast(ResourceBase* r) { return static_cast<T*>(r); }
+};
+
 template <typename T>
+struct TypeCastFunctor<T, true> {
+  static T* Cast(ResourceBase* r) { return dynamic_cast<T*>(r); }
+};
+
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupInternal(const string& container, const string& name,
                                    T** resource) const {
   ResourceBase* found = nullptr;
@@ -464,12 +484,12 @@ Status ResourceMgr::LookupInternal(const string& container, const string& name,
   if (s.ok()) {
     // It's safe to down cast 'found' to T* since
     // typeid(T).hash_code() is part of the map key.
-    *resource = static_cast<T*>(found);
+    *resource = TypeCastFunctor<T, use_dynamic_cast>::Cast(found);
   }
   return s;
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
                                    T** resource,
                                    std::function<Status(T**)> creator) {
@@ -478,11 +498,11 @@ Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
   Status s;
   {
     tf_shared_lock l(mu_);
-    s = LookupInternal(container, name, resource);
+    s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
     if (s.ok()) return s;
   }
   mutex_lock l(mu_);
-  s = LookupInternal(container, name, resource);
+  s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
   if (s.ok()) return s;
   TF_RETURN_IF_ERROR(creator(resource));
   s = DoCreate(container, MakeTypeIndex<T>(), name, *resource);
@@ -558,11 +578,12 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) {
   return ctx->resource_manager()->Create(p.container(), p.name(), value);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p,
                       T** value) {
   TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType<T>(ctx, p));
-  return ctx->resource_manager()->Lookup(p.container(), p.name(), value);
+  return ctx->resource_manager()->Lookup<T, use_dynamic_cast>(p.container(),
+                                                              p.name(), value);
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index 9387b6c23c77dadfd423865b23bc7dc5fdf41672..47c009a3206c59e6beeeb3a2009eb6169ce6bc28 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -95,6 +95,31 @@ class Var : public ResourceBase {
   TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
+// Does unlock and unref automatically when going out of scope, and also
+// supports early manual release.
+class ScopedUnlockUnrefVar {
+ public:
+  explicit ScopedUnlockUnrefVar(Var* var) : var_(var) {
+    if (var_) {
+      var_->mu()->lock();
+    }
+  }
+  void Release() {
+    if (var_) {
+      var_->mu()->unlock();
+      var_->Unref();
+      var_ = nullptr;
+    }
+  }
+  ~ScopedUnlockUnrefVar() { Release(); }
+
+ private:
+  Var* var_;
+
+  ScopedUnlockUnrefVar(const ScopedUnlockUnrefVar&) = delete;
+  void operator=(const ScopedUnlockUnrefVar&) = delete;
+};
+
 }  //  end namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 4dcc80680ff7c62b31fb266c0f5cd80a9325fe81..18a278f07ff4e5b07061047021a86411e04e2511 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/shape_inference.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -1259,7 +1259,6 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
     return false;
   }
   std::vector<ShapeAndType> new_values(shapes_and_types.size());
-  bool refined = false;
   for (int i = 0; i < shapes_and_types.size(); ++i) {
     const ShapeAndType& existing = (*to_update)[i];
     if (shapes_and_types[i].dtype == existing.dtype) {
@@ -1269,16 +1268,9 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
         return false;
       } else {
         new_values[i].dtype = shapes_and_types[i].dtype;
-        refined = true;
       }
     }
     Relax(existing.shape, shapes_and_types[i].shape, &new_values[i].shape);
-    if (!existing.shape.SameHandle(new_values[i].shape)) {
-      refined = true;
-    }
-  }
-  if (!refined) {
-    return false;
   }
   to_update->swap(new_values);
   return true;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e3885b7d9e8a3f746d0cc2121dad71221d4ec06b..bf8b633c0137f856932689aed18456e8946eb778 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -588,9 +588,9 @@ class InferenceContext {
   // position idx with the specified shapes and types. This requires idx to be
   // in the [0, num_inputs) range.
   //
-  // If the relax is successful and any of the new shapes differs from the old
-  // one, or any of the old dtypes was DT_INVALID, store the new shapes and
-  // return true.  Return false otherwise.
+  // If the relax is successful (sizes are the same, old dtypes match new ones
+  // or are DT_INVALID), then store the relaxed shapes and return true.
+  // Return false otherwise.
   //
   // See 'RelaxInput' function for full details and examples.
   bool RelaxInputHandleShapesAndMergeTypes(
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 67cc9e38459a00394c45bc74b5a966e6128b204a..f8cab135aba799d67183f0978ee1166aba533b99 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -77,6 +77,8 @@ message NodeExecStats {
 message DeviceStepStats {
   string device = 1;
   repeated NodeExecStats node_stats = 2;
+  // Its key is thread id.
+  map<uint32, string> thread_names = 3;
 }
 
 message StepStats {
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index ab492a21899bd3fd30dcfc152283a13dea25a777..ecbffecd66d691e3e1b1722625381665ce61ffcc 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -650,14 +651,21 @@ void Tensor::CopyFromInternal(const Tensor& other, const TensorShape& shape) {
   }
 }
 
-void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
-                                    const TensorShape& shape) {
+Status Tensor::BitcastFrom(const Tensor& other, DataType dtype,
+                           const TensorShape& shape) {
   int in_size = DataTypeSize(other.dtype());
   int out_size = DataTypeSize(dtype);
-  CHECK_NE(in_size, 0);
-  CHECK_NE(out_size, 0);
-  CHECK_EQ(shape.num_elements() * out_size,
-           other.shape().num_elements() * in_size);
+  if (in_size == 0) {
+    return errors::InvalidArgument("other tensor has zero-sized data type");
+  }
+  if (out_size == 0) {
+    return errors::InvalidArgument("specified output type is zero-sized");
+  }
+  if (shape.num_elements() * out_size !=
+      other.shape().num_elements() * in_size) {
+    return errors::InvalidArgument(
+        "input and output shapes/data type sizes are not compatible");
+  }
   shape_ = shape;
   shape_.set_data_type(dtype);
   if (buf_ != other.buf_) {
@@ -665,6 +673,7 @@ void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
     buf_ = other.buf_;
     RefIfNonNull(buf_);
   }
+  return Status::OK();
 }
 
 // Notice that buf_ either points to a regular TensorBuffer or a SubBuffer.
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 4cfad893d5da3f7f037727f433babcf13e15199d..6454cb818f2e3e237ca4bc49070399f3fff31dd7 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -554,12 +554,37 @@ class Tensor {
   /// REQUIRES: `DataTypeCanUseMemcpy(dtype())`.
   StringPiece tensor_data() const;
 
-  /// Copy the other tensor into this tensor and reshape it and reinterpret the
-  /// buffer's datatype.
+  /// Copy the other tensor into this tensor, reshape it and reinterpret the
+  /// buffer's datatype. If Status::OK() is returned, the two tensors now share
+  /// the same underlying storage.
   ///
-  /// This tensor shares other's underlying storage.
-  void UnsafeCopyFromInternal(const Tensor&, DataType dtype,
-                              const TensorShape&);
+  /// This call requires that the `other` tensor and the given type and shape
+  /// are "compatible" (i.e. they occupy the same number of bytes).
+  ///
+  /// Specifically:
+  ///
+  /// shape.num_elements() * DataTypeSize(type)
+  ///
+  /// must equal
+  ///
+  /// other.num_elements() * DataTypeSize(other.dtype())
+  ///
+  /// In addition, this function requires:
+  ///   * DataTypeSize(other.dtype()) != 0
+  ///   * DataTypeSize(type) != 0
+  ///
+  /// If any of the requirements are not met, errors::InvalidArgument is
+  /// returned.
+  Status BitcastFrom(const Tensor& other, DataType dtype,
+                     const TensorShape& shape);
+
+  /// Like BitcastFrom, but CHECK fails if any preconditions are not met.
+  ///
+  /// Deprecated. Use BitcastFrom instead and check the returned Status.
+  void UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
+                              const TensorShape& shape) {
+    TF_CHECK_OK(BitcastFrom(other, dtype, shape));
+  }
 
  private:
   // Returns true if the refcount on buf_ and any possible underlying root
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index 5e0b976e1736dff6b8a18c7b801cb6d1ef500f11..7158f1925f65483c3087a6bfc480e5647eacb5d6 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -136,6 +136,89 @@ template <class Shape>
 TensorShapeBase<Shape>::TensorShapeBase(gtl::ArraySlice<int64> dim_sizes) {
   set_tag(REP16);
   set_data_type(DT_INVALID);
+  InitDims(dim_sizes);
+}
+
+// Returns true iff partial is true and val is < 0.
+// REQUIRES: val < kMaxRep16
+// REQUIRES: partial || val >= 0
+static inline bool Set16(bool partial, uint16* dst, int dim, int64 val) {
+  if (partial) {
+    if (val < 0) {
+      dst[dim] = std::numeric_limits<uint16>::max();
+      return true;
+    }
+  } else {
+    CHECK_GE(val, 0);
+  }
+  dst[dim] = val;
+  return false;
+}
+
+template <class Shape>
+void TensorShapeBase<Shape>::InitDims(gtl::ArraySlice<int64> dim_sizes) {
+  DCHECK_EQ(tag(), REP16);
+
+  // Allow sizes that are under kint64max^0.25 so that 4-way multiplication
+  // below cannot overflow.
+  static const uint64 kMaxSmall = 0xd744;
+  static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max,
+                "bad overflow check");
+  bool large_size = false;
+  for (auto s : dim_sizes) {
+    if (s > kMaxSmall) {
+      large_size = true;
+      break;
+    }
+  }
+
+  if (!large_size) {
+    // Every size fits in 16 bits; use fast-paths for dims in {1,2,3,4}.
+    uint16* dst = as16()->dims_;
+    switch (dim_sizes.size()) {
+      case 1: {
+        set_ndims_byte(1);
+        const int64 size = dim_sizes[0];
+        const bool neg = Set16(kIsPartial, dst, 0, size);
+        set_num_elements(neg ? -1 : size);
+        return;
+      }
+      case 2: {
+        set_ndims_byte(2);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        set_num_elements(neg ? -1 : (size0 * size1));
+        return;
+      }
+      case 3: {
+        set_ndims_byte(3);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2));
+        return;
+      }
+      case 4: {
+        set_ndims_byte(4);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        const int64 size3 = dim_sizes[3];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        neg |= Set16(kIsPartial, dst, 3, size3);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2 * size3));
+        return;
+      }
+    }
+  }
+
   set_ndims_byte(0);
   set_num_elements(1);
   for (int64 s : dim_sizes) {
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index 625d88ec1bdcdd9765dd64b09a1bad51f7fa3370..3473a441f2cdcc9b6932fcc1e78071ab8b7fa1fd 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -256,6 +256,7 @@ class TensorShapeBase : public TensorShapeRep {
 
  private:
   void RecomputeNumElements();
+  void InitDims(gtl::ArraySlice<int64> dim_sizes);
 
   // True for PartialTensorShape, false for TensorShape
   static constexpr bool kIsPartial =
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 6329aa6d8edf3795ed8018b7802661749683fe41..d25652ce81815e636b8f1a188171eec4cedb9689 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -684,6 +684,15 @@ static std::vector<int64> MakeSizes(int arg) {
   return sizes;
 }
 
+static void BM_TensorShape_Init(int iters, int arg) {
+  auto sizes = MakeSizes(arg);
+  while (--iters > 0) {
+    TensorShape shape(sizes);
+    tensorflow::testing::DoNotOptimize(shape.num_elements());
+  }
+}
+BENCHMARK(BM_TensorShape_Init)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4);
+
 static void BM_TensorShape_Assign(int iters, int arg) {
   TensorShape s(MakeSizes(arg));
   while (--iters > 0) {
diff --git a/tensorflow/core/framework/tensor_slice.cc b/tensorflow/core/framework/tensor_slice.cc
index eb3a7f52c2ba5f9622242ff424abeca3457b8ec4..975e1e2e24a439a12943991190c525af0794e29e 100644
--- a/tensorflow/core/framework/tensor_slice.cc
+++ b/tensorflow/core/framework/tensor_slice.cc
@@ -128,7 +128,6 @@ string TensorSlice::DebugString() const {
     if (!first) {
       buffer.append(":");
     }
-    string s;
     if (IsFullAt(d)) {
       buffer.append("-");
     } else {
diff --git a/tensorflow/core/framework/tensor_util.cc b/tensorflow/core/framework/tensor_util.cc
index 65f6dc1c00b5123287212eae39dc607ad8f68e29..dde59b89da48f7d91a28765975e83108c6c6db9d 100644
--- a/tensorflow/core/framework/tensor_util.cc
+++ b/tensorflow/core/framework/tensor_util.cc
@@ -15,10 +15,16 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_util.h"
 
+#include <cmath>
 #include <vector>
+
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensor {
@@ -37,10 +43,10 @@ Tensor DeepCopy(const Tensor& other) {
              other_data.size());
     }
   } else if (other.dtype() == DT_STRING) {
-    tmp.flat<string>() = other.flat<string>();
+    tmp.unaligned_flat<string>() = other.unaligned_flat<string>();
   } else {
     CHECK_EQ(DT_VARIANT, other.dtype());
-    tmp.flat<Variant>() = other.flat<Variant>();
+    tmp.unaligned_flat<Variant>() = other.unaligned_flat<Variant>();
   }
   return tmp;
 }
@@ -175,7 +181,186 @@ void SetTensorProtoShape(std::vector<size_t> shape,
     shape_proto->mutable_dim()->Add()->set_size(dim);
   }
 }
+
+template <typename T>
+bool CompressTensorContent(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  const int64 num_bytes = tensor->tensor_content().size();
+  const int64 num_raw_values = num_bytes / sizeof(T);
+  if (num_raw_values != num_tensor_values) {
+    // Invalid or too small.
+    return false;
+  }
+  int64 last_offset = num_bytes - 1;
+  int64 prev_offset = last_offset - sizeof(T);
+  // Inspect individual raw bytes sizeof(T) bytes apart in adjacent elements,
+  // starting from the end, to find the last pair of elements that are not
+  // identical.
+  while (prev_offset >= 0) {
+    if (tensor->tensor_content()[prev_offset] !=
+        tensor->tensor_content()[last_offset]) {
+      break;
+    }
+    --last_offset;
+    --prev_offset;
+  }
+  // Round up to the next whole number of element of type T.
+  const int64 new_num_values = last_offset / sizeof(T) + 1;
+  if (new_num_values * (is_complex<T>::value ? 2 : 1) * sizeof(FieldType) >
+      static_cast<int64>(num_bytes / min_compression_ratio)) {
+    return false;
+  }
+  // Copy values to truncated repeated field.
+  if (sizeof(FieldType) == sizeof(T)) {
+    FieldType* dst_ptr =
+        TypeHelper::AppendUninitialized(new_num_values, tensor);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(dst_ptr));
+    tensor->clear_tensor_content();
+  } else if (sizeof(T) > 1) {
+    // Copy raw bytes to temp array first, then cast.
+    gtl::InlinedVector<T, 64> tmp(new_num_values);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(tmp.data()));
+    tensor->clear_tensor_content();
+    const T* begin = tmp.begin();
+    const T* end = tmp.end();
+    TypeHelper::AddValues(begin, end, tensor);
+  } else {
+    // Copy and cast, one byte at a time.
+    for (int64 i = 0; i < new_num_values; ++i) {
+      char c = tensor->tensor_content()[i];
+      TypeHelper::AddValue(static_cast<T>(c), tensor);
+    }
+    tensor->clear_tensor_content();
+  }
+  return true;
+}
+
+template <typename T>
+inline bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+template <>
+inline bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+template <>
+inline bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+template <typename RealType>
+inline bool PackedValuesNotEqual(const std::complex<RealType>& a,
+                                 const std::complex<RealType>& b) {
+  return PackedValuesNotEqual(a.real(), b.real()) ||
+         PackedValuesNotEqual(a.imag(), b.imag());
+}
+
+template <typename T>
+bool CompressRepeatedField(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  // Notice that for complex types the tensor is stored as an array of up to
+  // 2 * num_tensor_values real values (real and imaginary parts), possibly
+  // truncated.
+  const int64 num_proto_values = TypeHelper::NumValues(*tensor);
+  if (num_proto_values != num_tensor_values) {
+    // Already compressed or invalid.
+    return false;
+  }
+  const T last_value = TypeHelper::GetValue(num_proto_values - 1, *tensor);
+  int64 last_index = 0;
+  for (int64 i = num_proto_values - 2; i >= 0 && last_index == 0; --i) {
+    const T cur_value = TypeHelper::GetValue(i, *tensor);
+    if (PackedValuesNotEqual(cur_value, last_value)) {
+      last_index = i + 1;
+    }
+  }
+  const int64 num_truncated_proto_values = last_index + 1;
+  const int64 num_bytes_as_field =
+      num_truncated_proto_values * sizeof(FieldType);
+  const int64 num_bytes_as_tensor_content = num_tensor_values * sizeof(T);
+  const int64 num_bytes_before = num_proto_values * sizeof(FieldType);
+  if (std::min(num_bytes_as_field, num_bytes_as_tensor_content) >
+      static_cast<int64>(num_bytes_before / min_compression_ratio)) {
+    return false;
+  }
+  if (num_bytes_as_field <= num_bytes_as_tensor_content) {
+    TypeHelper::Truncate(num_truncated_proto_values, tensor);
+  } else {
+    gtl::InlinedVector<T, 64> tmp(num_tensor_values);
+    TypeHelper::CopyValues(tmp.begin(), *tensor);
+    TypeHelper::Truncate(0, tensor);
+    port::CopyFromArray(tensor->mutable_tensor_content(),
+                        reinterpret_cast<const char*>(tmp.data()),
+                        num_bytes_as_tensor_content);
+  }
+  return true;
+}
+
+template <typename T>
+bool CompressTensorProtoInPlaceImpl(int64 min_num_elements,
+                                    float min_compression_ratio,
+                                    TensorProto* tensor) {
+  const TensorShape shape(tensor->tensor_shape());
+  const int64 num_tensor_values = shape.num_elements();
+  if (num_tensor_values < min_num_elements) {
+    return false;
+  }
+  if (tensor->tensor_content().empty()) {
+    return CompressRepeatedField<T>(min_compression_ratio, shape, tensor);
+  } else {
+    return CompressTensorContent<T>(min_compression_ratio, shape, tensor);
+  }
+  return true;
+}
+
 }  // namespace internal
 
+#define HANDLE_COMPRESS_CASE(TF_TYPE)                                  \
+  case TF_TYPE:                                                        \
+    return internal::CompressTensorProtoInPlaceImpl<                   \
+        EnumToDataType<TF_TYPE>::Type>(min_num_elements,               \
+                                       min_compression_ratio, tensor); \
+    break
+
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor) {
+  switch (tensor->dtype()) {
+    HANDLE_COMPRESS_CASE(DT_FLOAT);
+    HANDLE_COMPRESS_CASE(DT_DOUBLE);
+    HANDLE_COMPRESS_CASE(DT_COMPLEX64);
+    HANDLE_COMPRESS_CASE(DT_COMPLEX128);
+    HANDLE_COMPRESS_CASE(DT_UINT8);
+    HANDLE_COMPRESS_CASE(DT_INT8);
+    HANDLE_COMPRESS_CASE(DT_UINT16);
+    HANDLE_COMPRESS_CASE(DT_INT16);
+    HANDLE_COMPRESS_CASE(DT_UINT32);
+    HANDLE_COMPRESS_CASE(DT_INT32);
+    HANDLE_COMPRESS_CASE(DT_UINT64);
+    HANDLE_COMPRESS_CASE(DT_INT64);
+    HANDLE_COMPRESS_CASE(DT_BOOL);
+    HANDLE_COMPRESS_CASE(DT_QUINT8);
+    HANDLE_COMPRESS_CASE(DT_QINT8);
+    HANDLE_COMPRESS_CASE(DT_QUINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT32);
+    HANDLE_COMPRESS_CASE(DT_HALF);
+    HANDLE_COMPRESS_CASE(DT_BFLOAT16);
+    default:
+      return false;
+  }
+}
+
+#undef HANDLE_COMPRESS_CASE
+
 }  // namespace tensor
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor_util.h b/tensorflow/core/framework/tensor_util.h
index a7cf600bab9b2d260277b682946467e9c43f745c..dbd2750163a17e8ab84d2a52c39088cbbc2901da 100644
--- a/tensorflow/core/framework/tensor_util.h
+++ b/tensorflow/core/framework/tensor_util.h
@@ -16,11 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 
+#include <algorithm>
+#include <vector>
+
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
 
-#include <vector>
 namespace tensorflow {
 namespace tensor {
 
@@ -60,84 +65,207 @@ namespace internal {
 void SetTensorProtoShape(std::vector<size_t> shape,
                          TensorShapeProto* shape_proto);
 
-// Defines value type dependent methods to manipulate `TensorProto`.
-// Class specializations has to define following methods:
-//   static DataType GetDataType()
-//   static void AddValue(Type value, TensorProto* proto)
 template <typename Type>
-class TensorProtoHelper : public std::false_type {};
+class TensorProtoFieldHelper : public std::false_type {};
 
-template <>
-class TensorProtoHelper<string> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_STRING; }
-  static void AddValue(const string& value, TensorProto* proto) {
-    *proto->mutable_string_val()->Add() = value;
+#define DEFINE_PROTO_FIELD_HELPER(TYPE, FIELDNAME)                            \
+  template <>                                                                 \
+  class TensorProtoFieldHelper<TYPE> : public std::true_type {                \
+   public:                                                                    \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val(0)) FieldType;            \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val()) RepeatedFieldType;     \
+    typedef decltype(std::declval<TensorProto>().mutable_##FIELDNAME##_val()) \
+        MutableRepeatedFieldType;                                             \
+    static MutableRepeatedFieldType GetMutableField(TensorProto* proto) {     \
+      return proto->mutable_##FIELDNAME##_val();                              \
+    }                                                                         \
+    static RepeatedFieldType& GetField(const TensorProto& proto) {            \
+      return proto.FIELDNAME##_val();                                         \
+    }                                                                         \
   }
-};
 
-template <>
-class TensorProtoHelper<int32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT32; }
-  static void AddValue(int32 value, TensorProto* proto) {
-    proto->mutable_int_val()->Add(value);
+// The argument pairs in the following macro instantiations encode the
+// mapping from C++ type ($1) to repeated field name "$2_val" used for storing
+// values in TensorProto. See tensorflow/core/framework/tensor.proto.
+DEFINE_PROTO_FIELD_HELPER(float, float);
+DEFINE_PROTO_FIELD_HELPER(double, double);
+DEFINE_PROTO_FIELD_HELPER(int8, int);
+DEFINE_PROTO_FIELD_HELPER(uint8, int);
+DEFINE_PROTO_FIELD_HELPER(int16, int);
+DEFINE_PROTO_FIELD_HELPER(uint16, int);
+DEFINE_PROTO_FIELD_HELPER(int32, int);
+DEFINE_PROTO_FIELD_HELPER(uint32, uint32);
+DEFINE_PROTO_FIELD_HELPER(int64, int64);
+DEFINE_PROTO_FIELD_HELPER(uint64, uint64);
+DEFINE_PROTO_FIELD_HELPER(bool, bool);
+DEFINE_PROTO_FIELD_HELPER(qint8, int);
+DEFINE_PROTO_FIELD_HELPER(quint8, int);
+DEFINE_PROTO_FIELD_HELPER(qint16, int);
+DEFINE_PROTO_FIELD_HELPER(quint16, int);
+DEFINE_PROTO_FIELD_HELPER(qint32, int);
+DEFINE_PROTO_FIELD_HELPER(Eigen::half, half);
+DEFINE_PROTO_FIELD_HELPER(bfloat16, half);
+DEFINE_PROTO_FIELD_HELPER(complex64, scomplex);
+DEFINE_PROTO_FIELD_HELPER(complex128, dcomplex);
+
+#undef DEFINE_PROTO_HELPER
+
+template <typename T>
+struct CopyHelper {
+  template <typename SrcIter, typename DstIter>
+  static void ToArray(SrcIter begin, SrcIter end, DstIter dst) {
+    using SrcType = typename std::iterator_traits<SrcIter>::value_type;
+    using DstType = typename std::iterator_traits<DstIter>::value_type;
+    std::transform(begin, end, dst, [](const SrcType& x) -> DstType {
+      return static_cast<DstType>(x);
+    });
+  }
+  template <typename SrcIter>
+  static void ToArray(SrcIter begin, SrcIter end, SrcIter dst) {
+    std::copy(begin, end, dst);
+  }
+  template <typename SrcIter, typename DstIter>
+  static void FromArray(SrcIter begin, SrcIter end, DstIter dst) {
+    ToArray(begin, end, dst);
   }
 };
 
+// Overloads for Eigen::half and bfloat16 that are 16 bits in size but are
+// stored in an int32 field.
 template <>
-class TensorProtoHelper<int64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT64; }
-  static void AddValue(int64 value, TensorProto* proto) {
-    proto->mutable_int64_val()->Add(value);
+struct CopyHelper<Eigen::half> {
+  template <typename SrcIter>
+  static void ToArray(SrcIter begin, SrcIter end, Eigen::half* dst) {
+    std::transform(begin, end, dst, [](int x) -> Eigen::half {
+      Eigen::half h;
+      h.x = static_cast<uint16>(x);
+      return h;
+    });
+  }
+  template <typename SrcIter, typename DstIter>
+  static void FromArray(SrcIter begin, SrcIter end, DstIter dst) {
+    std::transform(begin, end, dst,
+                   [](Eigen::half h) -> int { return static_cast<int>(h.x); });
   }
 };
 
 template <>
-class TensorProtoHelper<uint32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT32; }
-  static void AddValue(uint32 value, TensorProto* proto) {
-    proto->mutable_uint32_val()->Add(value);
+struct CopyHelper<bfloat16> {
+  template <typename SrcIter>
+  static void ToArray(SrcIter begin, SrcIter end, bfloat16* dst) {
+    std::transform(begin, end, dst, [](int x) -> bfloat16 {
+      bfloat16 bf16;
+      bf16.value = static_cast<uint16>(x);
+      return bf16;
+    });
+  }
+  template <typename SrcIter, typename DstIter>
+  static void FromArray(SrcIter begin, SrcIter end, DstIter dst) {
+    std::transform(begin, end, dst, [](bfloat16 bf16) -> int {
+      return static_cast<int>(bf16.value);
+    });
   }
 };
 
-template <>
-class TensorProtoHelper<uint64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT64; }
-  static void AddValue(uint64 value, TensorProto* proto) {
-    proto->mutable_uint64_val()->Add(value);
+// Overloads for complex types that store real and imaginary parts
+// at indices 2*i and 2*i+1 in float or double field.
+template <typename RealType>
+struct CopyHelper<std::complex<RealType>> {
+  template <typename SrcIter>
+  static void ToArray(SrcIter begin, SrcIter end, std::complex<RealType>* dst) {
+    using SrcType = typename std::iterator_traits<SrcIter>::value_type;
+    RealType* real_dst = reinterpret_cast<RealType*>(dst);
+    std::copy(begin, end, real_dst);
   }
-};
 
-template <>
-class TensorProtoHelper<float> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_FLOAT; }
-  static void AddValue(float value, TensorProto* proto) {
-    proto->mutable_float_val()->Add(value);
+  template <typename SrcIter, typename DstIter>
+  static void FromArray(SrcIter begin, SrcIter end, DstIter dst) {
+    using DstType = typename std::iterator_traits<DstIter>::value_type;
+    size_t n = std::distance(begin, end);
+    const RealType* real_begin = reinterpret_cast<const RealType*>(&(*begin));
+    std::copy_n(real_begin, 2 * n, dst);
   }
 };
 
-template <>
-class TensorProtoHelper<double> : public std::true_type {
+// Helper class to extract and insert values into TensorProto represented as
+// repeated fields.
+template <typename T>
+class TensorProtoHelper : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_DOUBLE; }
-  static void AddValue(double value, TensorProto* proto) {
-    proto->mutable_double_val()->Add(value);
+  using FieldHelper = TensorProtoFieldHelper<T>;
+  using FieldType = typename TensorProtoFieldHelper<T>::FieldType;
+
+  static DataType GetDataType() { return DataTypeToEnum<T>::value; }
+
+  // Returns the number of values of type T encoded in the proto.
+  static size_t NumValues(const TensorProto& proto) {
+    size_t raw_size = FieldHelper::GetField(proto).size();
+    return is_complex<T>::value ? raw_size / 2 : raw_size;
+  }
+
+  static void AddValue(const T& value, TensorProto* proto) {
+    const T* val_ptr = &value;
+    AddValues(val_ptr, val_ptr + 1, proto);
+  }
+
+  static T GetValue(size_t index, const TensorProto& proto) {
+    T val;
+    if (is_complex<T>::value) index *= 2;
+    CopyHelper<T>::ToArray(FieldHelper::GetField(proto).begin() + index,
+                           FieldHelper::GetField(proto).begin() + index + 1,
+                           &val);
+    return val;
+  }
+
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    size_t n = std::distance(begin, end);
+    FieldType* dst = AppendUninitialized(n, proto);
+    CopyHelper<T>::FromArray(begin, end, dst);
+  }
+
+  template <typename IterType>
+  static void CopyValues(IterType dst, const TensorProto& proto) {
+    CopyHelper<T>::ToArray(FieldHelper::GetField(proto).begin(),
+                           FieldHelper::GetField(proto).end(), dst);
+  }
+
+  static void Truncate(size_t new_size, TensorProto* proto) {
+    if (is_complex<T>::value) new_size *= 2;
+    FieldHelper::GetMutableField(proto)->Truncate(new_size);
+  }
+
+  static FieldType* AppendUninitialized(size_t n, TensorProto* proto) {
+    if (is_complex<T>::value) n *= 2;
+    auto* field = FieldHelper::GetMutableField(proto);
+    field->Reserve(field->size() + n);
+    return reinterpret_cast<FieldType*>(field->AddNAlreadyReserved(n));
   }
 };
 
+// Specialization for string.
 template <>
-class TensorProtoHelper<bool> : public std::true_type {
+class TensorProtoHelper<string> : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_BOOL; }
-  static void AddValue(bool value, TensorProto* proto) {
-    proto->mutable_bool_val()->Add(value);
+  static DataType GetDataType() { return DataType::DT_STRING; }
+  static void AddValue(const string& value, TensorProto* proto) {
+    *proto->mutable_string_val()->Add() = value;
+  }
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    for (IterType it = begin; it != end; ++it) {
+      AddValue(*it, proto);
+    }
+  }
+  template <typename IterType>
+  static void CopyToTensorContent(IterType begin, IterType end,
+                                  TensorProto* proto) {
+    AddValues(begin, end, proto);
   }
 };
+
 }  // namespace internal
 
 // Creates a 'TensorProto' with specified shape and values.
@@ -149,15 +277,52 @@ typename std::enable_if<internal::TensorProtoHelper<Type>::value,
 CreateTensorProto(const std::vector<Type>& values,
                   const std::vector<size_t>& shape) {
   TensorProto tensor;
+  TensorShapeProto tensor_shape_proto;
+  internal::SetTensorProtoShape(shape, &tensor_shape_proto);
+  if (TensorShape(tensor_shape_proto).num_elements() != values.size()) {
+    LOG(ERROR) << "Shape and number of values (" << values.size()
+               << ") are incompatible.";
+    return tensor;
+  }
   using TypeHelper = internal::TensorProtoHelper<Type>;
   tensor.set_dtype(TypeHelper::GetDataType());
-  internal::SetTensorProtoShape(shape, tensor.mutable_tensor_shape());
-  for (const auto& value : values) {
-    TypeHelper::AddValue(value, &tensor);
-  }
+  tensor.mutable_tensor_shape()->Swap(&tensor_shape_proto);
+  TypeHelper::AddValues(values.begin(), values.end(), &tensor);
   return tensor;
 }
 
+// Converts values in tensor to run-length encoded compressed form.
+//
+// The elements of a tensor can be stored in a TensorProto in one of the
+// following two forms:
+// 1. As a raw byte string in the field `tensor_content` containing the
+//    serialized in-memory representation of the tensor.
+// 2. As values of a repeated field depending on the datatype, e.g. that
+//    values of a DT_FLOAT tensor would be stored in the repeated field
+//    `float_val`.
+// Storage scheme 2 may use a simple form of run-length encoding to compress
+// data: If the values contains a tail of identical values, the repeated field
+// will be truncated such that the number of values in the repeated field is
+// less than the number of elements implied by the field`tensor_shape`. The
+// original tensor can be recovered by repeating the final value in the repeated
+// field.
+//
+// The TensorProto will be compressed if a) the tensor contains at least
+// min_num_elements elements and b) the compressed tensor proto is would be at
+// most the size of the original tensor proto divided by min_compression_ratio.
+//
+// Returns true if the tensor was compressed.
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor);
+
+inline bool CompressTensorProtoInPlace(TensorProto* tensor) {
+  static const int64 kDefaultMinNumElements = 64;
+  static const float kDefaultMinCompressionRatio = 2.0f;
+  return CompressTensorProtoInPlace(kDefaultMinNumElements,
+                                    kDefaultMinCompressionRatio, tensor);
+}
+
 }  // namespace tensor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/tensor_util_test.cc b/tensorflow/core/framework/tensor_util_test.cc
index 2b4e1cad2fa24c00f1efc703cd040a105fa68bfe..44708765bbfa091f477b1b86277327f4b65ade79 100644
--- a/tensorflow/core/framework/tensor_util_test.cc
+++ b/tensorflow/core/framework/tensor_util_test.cc
@@ -17,7 +17,11 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -145,6 +149,68 @@ TEST(TensorUtil, DeepCopySlice) {
   }
 }
 
+TEST(TensorUtil, DeepCopySliceString) {
+  Tensor x(DT_STRING, TensorShape({10}));
+  x.flat<string>().setConstant("hello");
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<string>().setConstant("goodbye");
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_STRING, x.dtype());
+  EXPECT_EQ(DT_STRING, y.dtype());
+  EXPECT_EQ(DT_STRING, z.dtype());
+
+  // x and y should now all be 'goodbye', but z should be 'hello'.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("goodbye", x.flat<string>()(i));
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("goodbye", y.unaligned_flat<string>()(i));
+    EXPECT_EQ("hello", z.flat<string>()(i));
+  }
+}
+
+TEST(TensorUtil, DeepCopySliceVariant) {
+  Tensor x(DT_VARIANT, TensorShape({10}));
+  x.flat<Variant>().setConstant(Tensor(42.0f));
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<Variant>().setConstant(Tensor("foo"));
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_VARIANT, x.dtype());
+  EXPECT_EQ(DT_VARIANT, y.dtype());
+  EXPECT_EQ(DT_VARIANT, z.dtype());
+
+  // Each element of x and y should now be a DT_STRING Tensor containing "foo",
+  // but each element of z should be a DT_FLOAT tensor containing 42.0.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("foo", x.flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("foo",
+              y.unaligned_flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+    EXPECT_EQ(42.0, z.flat<Variant>()(i).get<Tensor>()->scalar<float>()());
+  }
+}
+
 TEST(TensorUtil, Concat) {
   std::vector<int64> sizes = {1, 4, 5};
   std::vector<Tensor> to_concat;
@@ -366,5 +432,163 @@ TEST(TensorProtoUtil, CreatesBoolTensorProto) {
             "bool_val: false\n");
 }
 
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceTooSmall) {
+  const int kLength = 63;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<Eigen::half>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto = tensor::CreateTensorProto(
+      std::vector<std::complex<float>>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceAllEqual) {
+  const int kLength = 64;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<float>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<int>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<uint8>::NumValues(tensor_proto),
+            1);
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<bool>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<Eigen::half>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(
+      tensor::internal::TensorProtoHelper<Eigen::half>::NumValues(tensor_proto),
+      1);
+
+  tensor_proto = tensor::CreateTensorProto(
+      std::vector<std::complex<float>>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<std::complex<float>>::NumValues(
+                tensor_proto),
+            1);
+}
+
+template <typename T>
+std::vector<T> VectorWithConstantTail(int size, int tail_length) {
+  CHECK_LE(tail_length, size);
+  std::vector<T> v(size, T(0));
+  for (int i = 0; i < size - tail_length; ++i) {
+    v[i] = T(i + 1);
+  }
+  return v;
+}
+
+template <typename T>
+TensorProto CreateAsProtoTensorContent(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+TensorProto CreateAsProtoField(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoField(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+void CompareTensorValues(const TensorProto& x, const TensorProto& y) {
+  Tensor x_t;
+  EXPECT_TRUE(x_t.FromProto(x));
+  Tensor y_t;
+  EXPECT_TRUE(y_t.FromProto(y));
+  test::ExpectTensorEqual<T>(x_t, y_t);
+}
+
+template <typename T>
+void ConstantTailTest(int64 length, int64 tail_length, bool as_field) {
+  using TensorProtoHelper = tensor::internal::TensorProtoHelper<T>;
+  using FieldType = typename TensorProtoHelper::FieldType;
+  const float kMinCompressionRatio = 2.0;
+  const int64 kMinSize = 64;
+  TensorProto tensor_proto =
+      as_field ? CreateAsProtoField<T>(length, tail_length)
+               : CreateAsProtoTensorContent<T>(length, tail_length);
+  TensorProto original_tensor_proto = tensor_proto;
+  int64 original_size =
+      length * (as_field ? (is_complex<T>::value ? 2 : 1) * sizeof(FieldType)
+                         : sizeof(T));
+  int64 size_as_tensor_content = length * sizeof(T);
+  int64 size_as_field = std::min(length, (length - tail_length + 1)) *
+                        (is_complex<T>::value ? 2 : 1) * sizeof(FieldType);
+  bool will_compress = std::min(size_as_tensor_content, size_as_field) <=
+                       static_cast<int64>(original_size / kMinCompressionRatio);
+
+  EXPECT_EQ(tensor::CompressTensorProtoInPlace(kMinSize, kMinCompressionRatio,
+                                               &tensor_proto),
+            will_compress);
+  if (will_compress) {
+    if (size_as_tensor_content < size_as_field) {
+      EXPECT_EQ(TensorProtoHelper::NumValues(tensor_proto), 0);
+      EXPECT_FALSE(tensor_proto.tensor_content().empty());
+    } else {
+      EXPECT_LE(TensorProtoHelper::NumValues(tensor_proto),
+                (length - tail_length + 1));
+      EXPECT_TRUE(tensor_proto.tensor_content().empty());
+    }
+  }
+  CompareTensorValues<T>(tensor_proto, original_tensor_proto);
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoConstantTail) {
+  const int kLength = 64;
+  for (bool as_field : {true, false}) {
+    for (int tail_length : {0, 1, 2, 32, 33, 63, 64}) {
+      ConstantTailTest<float>(kLength, tail_length, as_field);
+      ConstantTailTest<double>(kLength, tail_length, as_field);
+      ConstantTailTest<complex64>(kLength, tail_length, as_field);
+      ConstantTailTest<complex128>(kLength, tail_length, as_field);
+      ConstantTailTest<int32>(kLength, tail_length, as_field);
+      ConstantTailTest<uint32>(kLength, tail_length, as_field);
+      ConstantTailTest<int64>(kLength, tail_length, as_field);
+      ConstantTailTest<uint64>(kLength, tail_length, as_field);
+      ConstantTailTest<int8>(kLength, tail_length, as_field);
+      ConstantTailTest<uint8>(kLength, tail_length, as_field);
+      ConstantTailTest<int16>(kLength, tail_length, as_field);
+      ConstantTailTest<uint16>(kLength, tail_length, as_field);
+      ConstantTailTest<Eigen::half>(kLength, tail_length, as_field);
+      ConstantTailTest<bfloat16>(kLength, tail_length, as_field);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/thread_factory.h b/tensorflow/core/framework/thread_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5bb6dda66ba9422fbb46bf616babe6c28c7a62b
--- /dev/null
+++ b/tensorflow/core/framework/thread_factory.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class Thread;
+
+// Virtual interface for an object that creates threads.
+class ThreadFactory {
+ public:
+  virtual ~ThreadFactory() {}
+
+  // Runs `fn` asynchronously in a different thread. `fn` may block.
+  //
+  // NOTE: The caller is responsible for ensuring that this `ThreadFactory`
+  // outlives the returned `Thread`.
+  virtual std::unique_ptr<Thread> StartThread(const string& name,
+                                              std::function<void()> fn) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_THREAD_FACTORY_H_
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index 2df402573a58ad3728e03a22d391b32766c49b00..ff454f5847563bb696afecb79eae1743241628a5 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -152,8 +152,8 @@ int64 TrackingAllocator::AllocationId(const void* ptr) {
   }
 }
 
-void TrackingAllocator::GetStats(AllocatorStats* stats) {
-  allocator_->GetStats(stats);
+absl::optional<AllocatorStats> TrackingAllocator::GetStats() {
+  return allocator_->GetStats();
 }
 
 void TrackingAllocator::ClearStats() { allocator_->ClearStats(); }
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 5eafce662ec491de2410e5bfdd6e5a69ecaea199..3b45d1cab80f3a82329d19bd9408a2909673de0b 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -66,7 +66,7 @@ class TrackingAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // If the underlying allocator tracks allocation sizes, this returns
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 2cdc7edd2d1e9f2634a96e85879dc45a53f633cc..554af609866e059bc3002a2c5097664d6b173c92 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -44,7 +44,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     EXPECT_NE(size_map_.end(), iter);
     return iter->second;
   }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 
  private:
   std::unordered_map<const void*, size_t> size_map_;
@@ -58,7 +58,7 @@ class NoMemoryAllocator : public Allocator {
   }
   void DeallocateRaw(void* ptr) override {}
   bool TracksAllocationSizes() override { return true; }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 };
 
 TEST(TrackingAllocatorTest, SimpleNoTracking) {
diff --git a/tensorflow/core/framework/types.proto b/tensorflow/core/framework/types.proto
index 03835d1b923d4fe4b242ffa13fad409c4239f51b..432fbf5bed337d031b58817939285811cc742d71 100644
--- a/tensorflow/core/framework/types.proto
+++ b/tensorflow/core/framework/types.proto
@@ -7,6 +7,7 @@ option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
 
+// (== suppress_warning documentation-presence ==)
 // LINT.IfChange
 enum DataType {
   // Not a legal value for DataType.  Used to indicate a DataType field
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index ef5b240aeaa8faef08d4c004f0f6d42e9516c48f..b5107a02a7fa2efeebbfc66a8539590727698882 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -37,57 +37,6 @@ UnaryVariantOpRegistry* UnaryVariantOpRegistry::Global() {
   return global_unary_variant_op_registry;
 }
 
-UnaryVariantOpRegistry::VariantShapeFn* UnaryVariantOpRegistry::GetShapeFn(
-    const TypeIndex& type_index) {
-  auto found = shape_fns.find(type_index);
-  if (found == shape_fns.end()) return nullptr;
-  return &found->second;
-}
-
-void UnaryVariantOpRegistry::RegisterShapeFn(const TypeIndex& type_index,
-                                             const VariantShapeFn& shape_fn) {
-  VariantShapeFn* existing = GetShapeFn(type_index);
-  CHECK_EQ(existing, nullptr)
-      << "Unary VariantShapeFn for type_index: "
-      << port::MaybeAbiDemangle(type_index.name()) << " already registered";
-  shape_fns.insert(std::pair<TypeIndex, VariantShapeFn>(type_index, shape_fn));
-}
-
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape) {
-  CHECK_EQ(variant_tensor.dtype(), DT_VARIANT);
-  CHECK_EQ(variant_tensor.dims(), 0);
-  const Variant& v = variant_tensor.scalar<Variant>()();
-  UnaryVariantOpRegistry::VariantShapeFn* shape_fn =
-      UnaryVariantOpRegistry::Global()->GetShapeFn(v.TypeId());
-  if (shape_fn == nullptr) {
-    return errors::Internal(
-        "No unary variant shape function found for Variant type_index: ",
-        port::MaybeAbiDemangle(v.TypeId().name()));
-  }
-  return (*shape_fn)(v, shape);
-}
-
-// Add some basic registrations for use by others, e.g., for testing.
-namespace {
-template <typename T>
-Status ScalarShape(const T&, TensorShape* shape) {
-  *shape = TensorShape({});
-  return Status::OK();
-}
-}  // namespace
-
-#define REGISTER_VARIANT_SHAPE_TYPE(T) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, ScalarShape<T>);
-
-// No encode/shape registered for std::complex<> and Eigen::half
-// objects yet.
-REGISTER_VARIANT_SHAPE_TYPE(int);
-REGISTER_VARIANT_SHAPE_TYPE(float);
-REGISTER_VARIANT_SHAPE_TYPE(bool);
-REGISTER_VARIANT_SHAPE_TYPE(double);
-
-#undef REGISTER_VARIANT_SHAPE_TYPE
-
 UnaryVariantOpRegistry::VariantDecodeFn* UnaryVariantOpRegistry::GetDecodeFn(
     StringPiece type_name) {
   auto found = decode_fns.find(type_name);
@@ -177,6 +126,37 @@ Status VariantDeviceCopy(
   return (*device_copy_fn)(from, to, copy_fn);
 }
 
+namespace {
+template <typename T>
+Status DeviceCopyPrimitiveType(
+    const T& in, T* out,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copier) {
+  // Dummy copy, we don't actually bother copying to the device and back for
+  // testing.
+  *out = in;
+  return Status::OK();
+}
+}  // namespace
+
+#define REGISTER_VARIANT_DEVICE_COPY_TYPE(T)            \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::HOST_TO_DEVICE,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_HOST,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_DEVICE,  \
+      DeviceCopyPrimitiveType<T>);
+
+// No zeros_like registered for std::complex<> or Eigen::half objects yet.
+REGISTER_VARIANT_DEVICE_COPY_TYPE(int);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(float);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(double);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(bool);
+
+#undef REGISTER_VARIANT_DEVICE_COPY_TYPE
+
 // Special casing UnaryOpFn per op and per device.
 UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
     VariantUnaryOp op, StringPiece device, const TypeIndex& type_index) {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 7eb37e859f51992cf74a12736f5099839db5e1fd..488a606f6ee4564abaa0113f9886166afc76dacd 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -58,7 +58,6 @@ enum VariantDeviceCopyDirection {
 
 class UnaryVariantOpRegistry {
  public:
-  typedef std::function<Status(const Variant& v, TensorShape*)> VariantShapeFn;
   typedef std::function<bool(Variant*)> VariantDecodeFn;
   typedef std::function<Status(OpKernelContext*, const Variant&, Variant*)>
       VariantUnaryOpFn;
@@ -93,13 +92,6 @@ class UnaryVariantOpRegistry {
                                AsyncTensorDeviceCopyFn copy_fn)>
       AsyncVariantDeviceCopyFn;
 
-  // Add a shape lookup function to the registry.
-  void RegisterShapeFn(const TypeIndex& type_index,
-                       const VariantShapeFn& shape_fn);
-
-  // Returns nullptr if no shape function was found for the given TypeIndex.
-  VariantShapeFn* GetShapeFn(const TypeIndex& type_index);
-
   // Add a decode function to the registry.
   void RegisterDecodeFn(const string& type_name,
                         const VariantDecodeFn& decode_fn);
@@ -154,7 +146,6 @@ class UnaryVariantOpRegistry {
     std::size_t operator()(const TypeIndex& x) const { return x.hash_code(); }
   };
 
-  gtl::FlatMap<TypeIndex, VariantShapeFn, TypeIndexHash> shape_fns;
   gtl::FlatMap<StringPiece, VariantDecodeFn, StringPieceHasher> decode_fns;
 
   // Map std::pair<Direction, type_name> to function.
@@ -235,15 +226,6 @@ inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
   return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
          (lhs.type_index_ == rhs.type_index_);
 }
-// Gets a TensorShape from a Tensor containing a scalar Variant.
-// Returns an Internal error if the Variant does not have a registered shape
-// function, or if it's a serialized Variant that cannot be decoded.
-//
-// REQUIRES:
-//   variant_tensor.dtype() == DT_VARIANT
-//   variant_tensor.dims() == 0
-//
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape);
 
 // Decodes the Variant whose data_type has a registered decode
 // function.  Returns an Internal error if the Variant does not have a
@@ -326,29 +308,6 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
 
 namespace variant_op_registry_fn_registration {
 
-template <typename T>
-class UnaryVariantShapeRegistration {
- public:
-  typedef std::function<Status(const T& t, TensorShape*)> LocalVariantShapeFn;
-
-  UnaryVariantShapeRegistration(const TypeIndex& type_index,
-                                const LocalVariantShapeFn& shape_fn) {
-    const string type_index_name = port::MaybeAbiDemangle(type_index.name());
-    UnaryVariantOpRegistry::Global()->RegisterShapeFn(
-        type_index,
-        [type_index_name, shape_fn](const Variant& v,
-                                    TensorShape* s) -> Status {
-          const T* t = v.get<T>();
-          if (t == nullptr) {
-            return errors::Internal(
-                "VariantShapeFn: Could not access object, type_index: ",
-                type_index_name);
-          }
-          return shape_fn(*t, s);
-        });
-  }
-};
-
 template <typename T>
 class UnaryVariantDecodeRegistration {
  public:
@@ -471,23 +430,6 @@ class UnaryVariantBinaryOpRegistration {
 
 };  // namespace variant_op_registry_fn_registration
 
-// Register a unary shape variant function with the signature:
-//    Status ShapeFn(const T& t, TensorShape* s);
-// to Variants having TypeIndex type_index.
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, shape_function) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(             \
-      __COUNTER__, T, MakeTypeIndex<T>(), shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(ctr, T, type_index, \
-                                                          shape_function)     \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index, shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index,         \
-                                                   shape_function)             \
-  static variant_op_registry_fn_registration::UnaryVariantShapeRegistration<T> \
-      register_unary_variant_op_shape_registration_fn_##ctr(type_index,        \
-                                                            shape_function)
-
 // Register a unary decode variant function for the given type.
 #define REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, type_name) \
   REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ_HELPER(__COUNTER__, T, type_name)
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index b2443e8676e7b986992fd130d5e162818e5fe075..e1a46ebd59d6ae8503d5ae3b31d4f31c7a6f1be1 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -39,13 +39,6 @@ namespace {
 
 struct VariantValue {
   string TypeName() const { return "TEST VariantValue"; }
-  static Status ShapeFn(const VariantValue& v, TensorShape* s) {
-    if (v.early_exit) {
-      return errors::InvalidArgument("early exit!");
-    }
-    *s = TensorShape({-0xdeadbeef});
-    return Status::OK();
-  }
   static Status CPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v,
                                VariantValue* v_out) {
     if (v.early_exit) {
@@ -89,8 +82,6 @@ struct VariantValue {
   int value;
 };
 
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, VariantValue::ShapeFn);
-
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(VariantValue, "TEST VariantValue");
 
 INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(
@@ -113,38 +104,6 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
 
 }  // namespace
 
-TEST(VariantOpShapeRegistryTest, TestBasic) {
-  class Blah {};
-  EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetShapeFn(MakeTypeIndex<Blah>()),
-            nullptr);
-
-  auto* shape_fn = UnaryVariantOpRegistry::Global()->GetShapeFn(
-      MakeTypeIndex<VariantValue>());
-  EXPECT_NE(shape_fn, nullptr);
-  TensorShape shape;
-
-  VariantValue vv_early_exit{true /* early_exit */};
-  Variant v = vv_early_exit;
-  Status s0 = (*shape_fn)(v, &shape);
-  EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
-
-  VariantValue vv_ok{false /* early_exit */};
-  v = vv_ok;
-  TF_EXPECT_OK((*shape_fn)(v, &shape));
-  EXPECT_EQ(shape, TensorShape({-0xdeadbeef}));
-}
-
-TEST(VariantOpShapeRegistryTest, TestDuplicate) {
-  UnaryVariantOpRegistry registry;
-  UnaryVariantOpRegistry::VariantShapeFn f;
-  class FjFjFj {};
-  const auto kTypeIndex = MakeTypeIndex<FjFjFj>();
-  registry.RegisterShapeFn(kTypeIndex, f);
-  EXPECT_DEATH(registry.RegisterShapeFn(kTypeIndex, f),
-               "FjFjFj already registered");
-}
-
 TEST(VariantOpDecodeRegistryTest, TestBasic) {
   EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetDecodeFn("YOU SHALL NOT PASS"),
             nullptr);
diff --git a/tensorflow/core/framework/variant_tensor_data.cc b/tensorflow/core/framework/variant_tensor_data.cc
index 3e67e4a86405819925f153400340145821cce414..993a8989b708c448653bab374dd25bc907b7bf0c 100644
--- a/tensorflow/core/framework/variant_tensor_data.cc
+++ b/tensorflow/core/framework/variant_tensor_data.cc
@@ -20,14 +20,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-VariantTensorData::VariantTensorData() {}
-
 VariantTensorData::VariantTensorData(VariantTensorDataProto proto) {
   FromProto(std::move(proto));
 }
 
-VariantTensorData::~VariantTensorData() {}
-
 int VariantTensorData::tensors_size() const { return tensors_.size(); }
 
 const Tensor& VariantTensorData::tensors(int index) const {
@@ -43,6 +39,12 @@ Tensor* VariantTensorData::add_tensors() {
   return &(tensors_[tensors_.size() - 1]);
 }
 
+template <typename... TensorConstructorArgs>
+Tensor* VariantTensorData::add_tensor(TensorConstructorArgs&&... args) {
+  tensors_.emplace_back(std::forward<TensorConstructorArgs>(args)...);
+  return &tensors_.back();
+}
+
 void VariantTensorData::ToProto(VariantTensorDataProto* proto) const {
   proto->set_type_name(type_name());
   proto->set_metadata(metadata_);
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 8c69c870345a68a2c5fc5f1f33015c7bb97c123e..d98cf6b5e1fb8c6d541aad2c2127c2ca9033792c 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -37,11 +37,11 @@ class VariantTensorDataProto;
 // separate so that kernels do not need to depend on protos.
 class VariantTensorData {
  public:
-  VariantTensorData();
+  VariantTensorData() = default;
+
   // TODO(b/118823936): This silently returns if the proto is invalid.
   // Consider calling FromProto explicitly instead.
   VariantTensorData(VariantTensorDataProto proto);
-  ~VariantTensorData();
 
   // Name of the type of objects being serialized.
   const string& type_name() const { return type_name_; }
@@ -68,6 +68,11 @@ class VariantTensorData {
   const std::vector<Tensor>& tensors() const;
   Tensor* add_tensors();
 
+  // A more general version of add_tensors. Parameters are perfectly forwarded
+  // to the constructor of the tensor added here.
+  template <typename... TensorConstructorArgs>
+  Tensor* add_tensor(TensorConstructorArgs&&... args);
+
   // Conversion to and from VariantTensorDataProto
   void ToProto(VariantTensorDataProto* proto) const;
   // This allows optimizations via std::move.
diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc
index 08d09de7b845101cd2c9604b2ea44bbe25a94171..8947f93887a78659e2e0a0bcd06cedc1ab733d99 100644
--- a/tensorflow/core/framework/variant_test.cc
+++ b/tensorflow/core/framework/variant_test.cc
@@ -186,7 +186,7 @@ TEST(VariantTest, TensorListTest) {
   x.Encode(&serialized);
 
   Variant y = TensorList();
-  y.Decode(std::move(serialized));
+  y.Decode(serialized);
 
   const TensorList& decoded_vec = *y.get<TensorList>();
   for (int i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 5ad1c19dc1a7bbbd087628a41f613d9d44377147..ff972e3ca0dac288c5b05cc3e33a0311ff56f121 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -94,6 +94,14 @@ void DFS(const Graph& g, const std::function<void(Node*)>& enter,
                 edge_filter);
 }
 
+void DFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+             const std::function<void(Node*)>& enter,
+             const std::function<void(Node*)>& leave,
+             const NodeComparator& stable_comparator,
+             const EdgeFilter& edge_filter) {
+  DFSFromHelper(g, start, enter, leave, stable_comparator, edge_filter);
+}
+
 void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
              const std::function<void(const Node*)>& enter,
              const std::function<void(const Node*)>& leave,
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 3479605df86e37dc52388651d049968d02239e19..8774a67a91ea1b1c2b4b5256a36e2221ddf7c892 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -61,6 +61,11 @@ extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
 // If stable_comparator is set, a stable ordering of visit is achieved by
 // sorting a node's neighbors first before visiting them.
 // If edge_filter is set then ignores edges for which edge_filter returns false.
+extern void DFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave,
+                    const NodeComparator& stable_comparator = {},
+                    const EdgeFilter& edge_filter = {});
 extern void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
                     const std::function<void(const Node*)>& enter,
                     const std::function<void(const Node*)>& leave,
diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc
index 60a3e66aa15798063f817ecd941c57a64d976649..a62f8f02a0cd2b283911d3185dc75f5551f8073d 100644
--- a/tensorflow/core/graph/algorithm_test.cc
+++ b/tensorflow/core/graph/algorithm_test.cc
@@ -156,7 +156,6 @@ TEST(AlgorithmTest, ReversePostOrderStable) {
 
 TEST(AlgorithmTest, PostOrderWithEdgeFilter) {
   GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
-  string error;
   Node* n0 = ops::SourceOp("TestParams", b.opts().WithName("n0"));
   Node* n1 = ops::UnaryOp("TestUnary", n0, b.opts().WithName("n1"));
   Node* n2 = ops::UnaryOp("TestUnary", n1, b.opts().WithName("n2"));
diff --git a/tensorflow/core/graph/benchmark_testlib.h b/tensorflow/core/graph/benchmark_testlib.h
new file mode 100644
index 0000000000000000000000000000000000000000..727cd07620da17a8215697005cbfe7d3476ba1e0
--- /dev/null
+++ b/tensorflow/core/graph/benchmark_testlib.h
@@ -0,0 +1,170 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_
+#define TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+
+namespace tensorflow {
+namespace test {
+
+REGISTER_OP("Input").Output("y: float");
+REGISTER_OP("Output")
+    .Input("x: N * float")
+    .Attr("N: int >= 1")
+    .Output("y: float");
+REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float");
+REGISTER_OP("In4Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Output("y: float");
+REGISTER_OP("In8Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Input("e: float")
+    .Input("f: float")
+    .Input("g: float")
+    .Input("h: float")
+    .Output("y: float");
+REGISTER_OP("In16Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Input("e: float")
+    .Input("f: float")
+    .Input("g: float")
+    .Input("h: float")
+    .Input("i: float")
+    .Input("j: float")
+    .Input("k: float")
+    .Input("l: float")
+    .Input("m: float")
+    .Input("n: float")
+    .Input("o: float")
+    .Input("p: float")
+    .Output("y: float");
+
+GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
+  const int kNumInNodes = 10 * num_edges_per_node;
+  GraphDef graph_def;
+
+  auto create_node = [](const string& name, const string& op) {
+    NodeDef node;
+    node.set_name(name);
+    node.set_op(op);
+    return node;
+  };
+
+  NodeDef node;
+  for (int in = 0; in < kNumInNodes; ++in) {
+    node = create_node(/*name=*/absl::StrFormat("in%04d", in), /*op=*/"Input");
+    *graph_def.add_node() = std::move(node);
+  }
+
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < num_nodes; ++op) {
+    node = create_node(/*name=*/absl::StrFormat("op%05d", op),
+                       /*op=*/absl::StrFormat("In%dOut1", num_edges_per_node));
+    for (int edge = 0; edge < num_edges_per_node; ++edge) {
+      node.add_input(absl::StrFormat("in%04d", rnd.Uniform(kNumInNodes)));
+    }
+    *graph_def.add_node() = std::move(node);
+  }
+
+  // Add a single sink node. Otherwise a lot of time is spent in
+  // FixupSourceAndSinkEdges().
+  node = create_node(/*name=*/"out", /*op=*/"Output");
+  for (int op = 0; op < num_nodes; ++op) {
+    node.add_input(absl::StrFormat("op%05d", op));
+  }
+  AttrValue attr;
+  attr.set_i(num_nodes);
+  node.mutable_attr()->insert({"N", std::move(attr)});
+  *graph_def.add_node() = std::move(node);
+
+  return graph_def;
+}
+
+GraphDef CreateRandomGraph(int size) {
+  random::PhiloxRandom philox(0x12345);
+  random::SimplePhilox rnd(&philox);
+
+  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
+
+  GraphDef graph;
+  for (int i = 0; i < size; ++i) {
+    const string name = absl::StrCat(prefix, i);
+    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
+
+    NodeDef node;
+    node.set_name(name);
+    for (int n = 0; n < num_inputs; ++n) {
+      const uint32 input_node = rnd.Uniform(i);
+      node.add_input(absl::StrCat(prefix, input_node));
+    }
+
+    *graph.add_node() = std::move(node);
+  }
+
+  return graph;
+}
+
+GraphDef CreateFaninFanoutNodeGraph(int num_fanins, int num_fanouts) {
+  GraphDef graph;
+
+  auto create_node = [](const string& name) {
+    NodeDef node;
+    node.set_name(name);
+    return node;
+  };
+
+  NodeDef node = create_node(/*name=*/"node");
+
+  for (int i = 0; i < num_fanins; ++i) {
+    const string input_node_name = absl::StrFormat("in%05d", i);
+    NodeDef input_node = create_node(/*name=*/input_node_name);
+    *graph.add_node() = std::move(input_node);
+    node.add_input(input_node_name);
+  }
+
+  for (int i = 0; i < num_fanouts; ++i) {
+    NodeDef output_node = create_node(/*name=*/absl::StrFormat("out%05d", i));
+    output_node.add_input(absl::StrCat(node.name(), ":", i));
+    *graph.add_node() = std::move(output_node);
+  }
+
+  *graph.add_node() = std::move(node);
+
+  return graph;
+}
+
+}  // namespace test
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPH_BENCHMARK_TESTLIB_H_
diff --git a/tensorflow/core/graph/collective_order.cc b/tensorflow/core/graph/collective_order.cc
index bbba9264ce845cdca6f71e0b0228589b01f2b481..80750319cb274951b0336158dde86b088842fffe 100644
--- a/tensorflow/core/graph/collective_order.cc
+++ b/tensorflow/core/graph/collective_order.cc
@@ -47,6 +47,9 @@ Status DiscoverDataDependencies(
       instance_keys->push_back(instance_key);
       VLOG(2) << "collective node " << node->DebugString();
     }
+    // Avoid reference invalidation of `node_deps`.
+    data_dependencies->reserve(data_dependencies->size() + 1 +
+                               node->out_edges().size());
     const auto& node_deps = (*data_dependencies)[node];
     for (const Edge* out_edge : node->out_edges()) {
       auto& child_deps = (*data_dependencies)[out_edge->dst()];
@@ -92,8 +95,8 @@ Status CreateControlDependencies(
       const auto& deps_j = (*data_dependencies)[collective_nodes[j]];
       if (deps_i.find(instance_keys[j]) == deps_i.end() &&
           deps_j.find(instance_keys[i]) == deps_j.end()) {
-        int src_idx = instance_keys[i] < instance_keys[j] ? i : j;
-        int dst_idx = instance_keys[i] < instance_keys[j] ? j : i;
+        int src_idx = instance_keys[i] > instance_keys[j] ? i : j;
+        int dst_idx = instance_keys[i] > instance_keys[j] ? j : i;
         Node* src_node = collective_nodes[src_idx];
         Node* dst_node = collective_nodes[dst_idx];
         VLOG(1) << "Adding control dependency from node " << src_node->name()
@@ -140,7 +143,7 @@ Status CreateControlDependencies(
 
 // Insert control dependencies defined by `dependency_edges` in `graph`.  If
 // `order_type` is `kEdges`, insert explicit control edges, else if `order_type`
-// is `kAttrs`, encode depdencies as an attribute on collective node.
+// is `kAttrs`, encode dependencies as an attribute on collective node.
 Status InsertControlDependencies(
     Graph* graph, GraphCollectiveOrder order_type,
     const absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>&
diff --git a/tensorflow/core/graph/collective_order_test.cc b/tensorflow/core/graph/collective_order_test.cc
index 241c98b549204259d1af4ad7bb8f221c661524c5..9a158e5c3fd040ca2242249aec51f701e785a4b6 100644
--- a/tensorflow/core/graph/collective_order_test.cc
+++ b/tensorflow/core/graph/collective_order_test.cc
@@ -137,18 +137,18 @@ std::unique_ptr<Graph> InitGraph() {
 }
 
 // Tests that in the graph created by `InitGraph`, exactly 2 control edges are
-// added after calling `OrderCollectives`: c2_0 -> c3_0 and c2_1 -> c3_1.
+// added after calling `OrderCollectives`: c3_0 -> c2_0 and c3_1 -> c2_1.
 TEST(CollectiveOrderTest, SimpleOrder) {
   std::unique_ptr<Graph> graph = InitGraph();
   TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
   VerifyGraph(*graph, {"c1_0", "c1_1", "c2_0", "c2_1", "c3_0", "c3_1"},
-              {{"c2_0", "c3_0"}, {"c2_1", "c3_1"}});
+              {{"c3_0", "c2_0"}, {"c3_1", "c2_1"}});
 }
 
 TEST(CollectiveOrderTest, SimpleOrderAttr) {
   std::unique_ptr<Graph> graph = InitGraph();
   TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
-  VerifyAttrs(*graph, {{"c3_0", {2}}, {"c3_1", {2}}});
+  VerifyAttrs(*graph, {{"c2_0", {3}}, {"c2_1", {3}}});
 }
 
 // Initialize the following graph:
@@ -185,12 +185,12 @@ std::unique_ptr<Graph> InitGraph2() {
 }
 
 // Tests that in the graph created by `InitGraph2`, we add the following control
-// edges after calling `OrderCollectives`: c2 -> c3, c3 -> c4.  c2->c4 is
+// edges after calling `OrderCollectives`: c4 -> c3, c3 -> c2.  c4->c2 is
 // pruned because it follows from the other two edges.
 TEST(CollectiveOrderTest, SimpleOrder2) {
   std::unique_ptr<Graph> graph = InitGraph2();
   TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
-  VerifyGraph(*graph, {"c1", "c2", "c3", "c4"}, {{"c2", "c3"}, {"c3", "c4"}});
+  VerifyGraph(*graph, {"c1", "c2", "c3", "c4"}, {{"c4", "c3"}, {"c3", "c2"}});
 }
 
 // Initialize the following graph:
@@ -223,12 +223,12 @@ std::unique_ptr<Graph> InitGraphForPruning() {
   return graph;
 }
 
-// Tests that in the graph created by `InitGraphForPruning`, we only add c1 ->
-// c2, c2 -> c3, c3 -> c4, and other edges are pruned away.
+// Tests that in the graph created by `InitGraphForPruning`, we only add c4 ->
+// c3, c3 -> c2, c2 -> c1, and other edges are pruned away.
 TEST(CollectiveOrderTest, Pruning) {
   std::unique_ptr<Graph> graph = InitGraphForPruning();
   TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
-  VerifyAttrs(*graph, {{"c4", {3}}, {"c3", {2}}, {"c2", {1}}});
+  VerifyAttrs(*graph, {{"c3", {4}}, {"c2", {3}}, {"c1", {2}}});
 }
 
 }  // namespace
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 00d3549312aee9669eb588ace593f347263c1a11..f6b49ca9d30020b293f1eabfe35304db8fb752d7 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -85,10 +85,15 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
         {"CollectiveBcastSend", NC_COLLECTIVE},
         {"CollectiveBcastRecv", NC_COLLECTIVE},
         {"FakeParam", NC_FAKE_PARAM},
-        {"IteratorGetNext", NC_DATASET},
-        {"IteratorGetNextSync", NC_DATASET},
-        {"DatasetToSingleElement", NC_DATASET},
-        {"ReduceDataset", NC_DATASET},
+        {"PartitionedCall", NC_PARTITIONED_CALL},
+        {"StatefulPartitionedCall", NC_PARTITIONED_CALL},
+        // Not using the constants defined in FunctionLibraryDefinition for the
+        // 4 ops below because android inference library does not link
+        // tf.function related files.
+        {"_Arg", NC_ARG},
+        {"_DeviceArg", NC_ARG},
+        {"_Retval", NC_RETVAL},
+        {"_DeviceRetval", NC_RETVAL},
     });
 
 #undef REF_CLASS
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index f65e4b921efb3298bad090198a0e1d32c31b8fd3..c463ece3347ef3d11e81ee002e42a9f2d36d9b63 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -63,8 +63,8 @@ struct OutputTensor;
 class VersionDef;
 class WhileContext;
 
-class NeighborIter;    // Declared below
-class NodeIter;        // Declared below
+class NeighborIter;     // Declared below
+class NodeIter;         // Declared below
 struct NodeProperties;  // Defined in .cc
 
 class Node {
@@ -173,8 +173,11 @@ class Node {
 
   bool IsMetadata() const { return class_ == NC_METADATA; }
   bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; }
-
-  bool IsDataset() const { return class_ == NC_DATASET; }
+  bool IsPartitionedCall() const { return class_ == NC_PARTITIONED_CALL; }
+  // Is this node a function input
+  bool IsArg() const { return class_ == NC_ARG; }
+  // Is this node a function output
+  bool IsRetval() const { return class_ == NC_RETVAL; }
 
   template <typename T>
   void AddAttr(const string& name, const T& val) {
@@ -256,7 +259,9 @@ class Node {
     NC_SCOPED_ALLOCATOR,
     NC_COLLECTIVE,
     NC_FAKE_PARAM,
-    NC_DATASET,
+    NC_PARTITIONED_CALL,
+    NC_ARG,
+    NC_RETVAL,
     NC_OTHER  // Not a special kind of node
   };
 
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 1912f2fc96a4e214a283fc4c93f0bd7bf30b9437..13b8ecc5f1efe03a9a2d474f32282409e42db3ac 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -951,7 +951,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef) {
   EXPECT_TRUE(HasControlEdge("D", sink));
   EXPECT_EQ(9, graph_.num_edges());
 
-  // Importing again should fail because of node name collissions.
+  // Importing again should fail because of node name collisions.
   s = ImportGraphDef(opts, def, &graph_, nullptr);
   EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
 
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index 602578a83a3fcc01dbb61841051da92ffc366144..5fa42d32fd926dd7921e43211c376d6af357e02e 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/graph/benchmark_testlib.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -660,80 +660,11 @@ TEST_F(GraphTest, BuildNodeNameIndex) {
   }
 }
 
-REGISTER_OP("Input").Output("y: float");
-REGISTER_OP("Output")
-    .Input("x: N * float")
-    .Attr("N: int >= 1")
-    .Output("y: float");
-REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float");
-REGISTER_OP("In4Out1")
-    .Input("a: float")
-    .Input("b: float")
-    .Input("c: float")
-    .Input("d: float")
-    .Output("y: float");
-REGISTER_OP("In8Out1")
-    .Input("a: float")
-    .Input("b: float")
-    .Input("c: float")
-    .Input("d: float")
-    .Input("e: float")
-    .Input("f: float")
-    .Input("g: float")
-    .Input("h: float")
-    .Output("y: float");
-REGISTER_OP("In16Out1")
-    .Input("a: float")
-    .Input("b: float")
-    .Input("c: float")
-    .Input("d: float")
-    .Input("e: float")
-    .Input("f: float")
-    .Input("g: float")
-    .Input("h: float")
-    .Input("i: float")
-    .Input("j: float")
-    .Input("k: float")
-    .Input("l: float")
-    .Input("m: float")
-    .Input("n: float")
-    .Input("o: float")
-    .Input("p: float")
-    .Output("y: float");
-
-GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
-  const int kNumInNodes = 10 * num_edges_per_node;
-  string s;
-  for (int in = 0; in < kNumInNodes; in++) {
-    s += strings::Printf("node { name: 'in%04d' op: 'Input' }", in);
-  }
-  random::PhiloxRandom philox(301, 17);
-  random::SimplePhilox rnd(&philox);
-  for (int op = 0; op < num_nodes; op++) {
-    s += strings::Printf("node { name: 'op%05d' op: 'In%dOut1' input: [ ", op,
-                         num_edges_per_node);
-    for (int edge = 0; edge < num_edges_per_node - 1; ++edge) {
-      s += strings::Printf("'in%04d', ", rnd.Uniform(kNumInNodes));
-    }
-    s += strings::Printf("'in%04d' ] } ", rnd.Uniform(kNumInNodes));
-  }
-  // Add a single sink node. Otherwise a lot of time is spent in
-  // FixupSourceAndSinkEdges().
-  s += strings::Printf("node { name: 'out' op: 'Output' input: [ ");
-  for (int op = 0; op < num_nodes - 1; op++) {
-    s += strings::Printf("'op%05d', ", op);
-  }
-  s += strings::Printf("'op%05d' ], attr: { key: 'N' value { i: %d } } } ",
-                       num_nodes - 1, num_nodes);
-  GraphDef graph_def;
-  CHECK(protobuf::TextFormat::ParseFromString(s, &graph_def));
-  return graph_def;
-}
-
 static void BM_InEdgeIteration(int iters, int num_nodes,
                                int num_edges_per_node) {
   testing::StopTiming();
-  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const GraphDef graph_def =
+      test::CreateGraphDef(num_nodes, num_edges_per_node);
   Graph graph(OpRegistry::Global());
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
@@ -773,7 +704,8 @@ BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 15, 16);
 
 static void BM_GraphCreation(int iters, int num_nodes, int num_edges_per_node) {
   testing::StopTiming();
-  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const GraphDef graph_def =
+      test::CreateGraphDef(num_nodes, num_edges_per_node);
   const auto registry = OpRegistry::Global();
   GraphConstructorOptions opts;
   // Warmup step.
@@ -812,7 +744,8 @@ BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 16);
 
 static void BM_ToGraphDef(int iters, int num_nodes, int num_edges_per_node) {
   testing::StopTiming();
-  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const GraphDef graph_def =
+      test::CreateGraphDef(num_nodes, num_edges_per_node);
   const auto registry = OpRegistry::Global();
   GraphConstructorOptions opts;
   // Warmup step.
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 990b2fe9b04770dc875b949ec3e17c321fe018be..f36ca8c5a843c8f2e5e2860e8416d0533dc940ed 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -96,7 +96,7 @@ static inline bool IsMklOp(const string& op_name, DataType T) {
 
   // Restrict quantized ops to QUINT8 and QINT8 for now
   if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
-    return (T == DT_QUINT8 || T == DT_QINT8);
+    return (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32);
   }
   // Restrict regular ops to FLOAT
   if (kernel.find(kMklOpLabelPattern) != string::npos) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 0f7a81110c5da336c87dd51757780cfbf38201cf..660cfc7960d75d506d4e498c39cc3224df7abdf7 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -259,6 +259,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv3d_grad_input = "Conv3DBackpropInputV2";
     csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
     csinfo_.depthwise_conv2d = "DepthwiseConv2dNative";
+    csinfo_.depthwise_conv2d_grad_input = "DepthwiseConv2dNativeBackpropInput";
+    csinfo_.depthwise_conv2d_grad_filter =
+        "DepthwiseConv2dNativeBackpropFilter";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.fused_conv2d = "_FusedConv2D";
@@ -278,14 +281,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_depthwise_conv2d_grad_input =
+        "_MklDepthwiseConv2dNativeBackpropInput";
+    csinfo_.mkl_depthwise_conv2d_grad_filter =
+        "_MklDepthwiseConv2dNativeBackpropFilter";
     csinfo_.mkl_fused_conv2d = "_MklFusedConv2D";
     csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.mkl_pad_with_fused_conv2d = "_MklPadWithFusedConv2D";
     csinfo_.pad = "Pad";
     csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
-// Temporarily don't convert quantized operators into MKL versions for now.
-// TODO(Intel-tf) Once all the relevant PRs have been merged then remove
-// the ifdef.
-#ifdef INTEL_MKL_QUANTIZED
+    csinfo_.pad_with_fused_conv2d = "__MklDummyPadWithFusedConv2D";
     csinfo_.quantized_avg_pool = "QuantizedAvgPool";
     csinfo_.quantized_concatv2 = "QuantizedConcatV2";
     csinfo_.quantized_conv2d = "QuantizedConv2D";
@@ -307,14 +312,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
         "QuantizedConv2DWithBiasSumAndReluAndRequantize";
     csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize =
         "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize";
-#endif
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.relu6 = "Relu6";
     csinfo_.relu6_grad = "Relu6Grad";
-#ifdef INTEL_MKL_QUANTIZED
     csinfo_.requantize = "Requantize";
-#endif
     csinfo_.tanh = "Tanh";
     csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
@@ -357,9 +359,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConcatV2, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
                       CopyAttrsConv, AlwaysRewrite});
@@ -371,7 +373,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv3d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d),
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv3d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_filter),
                       CopyAttrsConv, AlwaysRewrite});
@@ -380,7 +382,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.depthwise_conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d),
-                      CopyAttrsConv2DDepthwise, AlwaysRewrite});
+                      CopyAttrsConv2DDepthwiseCheckConstFilter, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_input,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_input),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_filter,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_filter),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
@@ -423,7 +433,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
                       CopyAttrsPadWithConv2D, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
+    rinfo_.push_back({csinfo_.pad_with_fused_conv2d,
+                      csinfo_.mkl_pad_with_fused_conv2d,
+                      CopyAttrsPadWithFusedConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
                       CopyAttrsQuantizedPooling, AlwaysRewrite});
@@ -479,7 +491,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
          mkl_op_registry::GetMklOpName(
              csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize),
          CopyAttrsQuantizedConv2D, AlwaysRewrite});
-#endif
     rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
@@ -491,11 +502,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.relu6_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu6_grad),
                       CopyAttrsDataType, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.requantize,
                       mkl_op_registry::GetMklOpName(csinfo_.requantize),
                       CopyAttrsRequantize, AlwaysRewrite});
-#endif
     /*
     rinfo_.push_back({csinfo_.tanh,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh),
@@ -533,10 +542,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
-    minfo_.push_back(
-        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
     // Merge Pad and Conv2d, only if the pad op is "Pad"
     // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+
+    minfo_.push_back({csinfo_.pad, csinfo_.fused_conv2d,
+                      csinfo_.pad_with_fused_conv2d, GetPadOrFusedConv2D});
 
     // The fusion patterns in "finfo_" that show up first will get applied
     // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
@@ -680,6 +692,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv3d_grad_input;
     string conv3d_grad_filter;
     string depthwise_conv2d;
+    string depthwise_conv2d_grad_input;
+    string depthwise_conv2d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
     string fused_conv2d;
@@ -699,11 +713,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_depthwise_conv2d_grad_input;
+    string mkl_depthwise_conv2d_grad_filter;
     string mkl_fused_conv2d;
     string mkl_pad_with_conv2d;
+    string mkl_pad_with_fused_conv2d;
     string mul;
     string pad;
     string pad_with_conv2d;
+    string pad_with_fused_conv2d;
     string quantized_avg_pool;
     string quantized_conv2d;
     string quantized_conv2d_with_requantize;
@@ -929,6 +947,59 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     return n;
   }
+
+  // Find Pad or _FusedConv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists _FusedConv2D node that can
+  // be merged with 'm'. If input 'm' is _FusedConv2D, then check if there
+  // exists Pad node that can be merged with 'm'.
+  static Node* GetPadOrFusedConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then _FusedConv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.fused_conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.fused_conv2d);
+      // If m is _FusedConv2D, Go over all input edges
+      // and search for Pad node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID") {
+        // Then do not merge.
+        n = nullptr;
+        VLOG(1) << "MklLayoutRewritePass: Could match Pad and _FusedConv2D "
+                << "nodes but cannot merge them. Only conv ops with padding "
+                << "type VALID can be merged with Pad op Input node: "
+                << m->DebugString();
+      }
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and _FusedConv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -1409,10 +1480,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                               bool change_format = false);
   static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb,
                                 bool change_format = false);
-  static void CopyAttrsConv2DDepthwise(const Node* orig_node, NodeBuilder* nb,
-                                       bool change_format = false);
   static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
                             bool change_format = false);
+  static void CopyAttrsConv2DDepthwise(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsConv2DDepthwiseCheckConstFilter(
+      const Node* orig_node, NodeBuilder* nb, bool change_format = false);
+  static void CopyAttrsConvCheckConstFilter(const Node* orig_node,
+                                            NodeBuilder* nb,
+                                            bool change_format = false);
   static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
                                 bool change_format = false);
   static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb,
@@ -1425,9 +1501,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                            bool change_format = false);
   static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
                                      bool change_format = false);
+  static void CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                          NodeBuilder* nb,
+                                          bool change_format = false);
   static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                         const Node* orig_node2, NodeBuilder* nb,
                                         bool change_format = false);
+  static void CopyAttrsFromPadAndFusedConv2D(const Node* orig_node1,
+                                             const Node* orig_node2,
+                                             NodeBuilder* nb,
+                                             bool change_format = false);
   static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
                                bool change_format = false);
   static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
@@ -1444,6 +1527,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                              bool change_format = false);
   static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb,
                              bool change_format = false);
+  static void CopyFormatAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                  const std::vector<int32>& strides,
+                                  const std::vector<int32>& dilations,
+                                  bool change_format = false);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
   // using node for original node 'orig_node' and return it in '*out'.
@@ -1645,6 +1732,7 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_pad_with_fused_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias ||
            e->dst()->type_string() == csinfo_.mkl_fused_conv2d) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
@@ -1947,10 +2035,10 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
-                                         bool change_format) {
+void MklLayoutRewritePass::CopyAttrsConvCheckConstFilter(const Node* orig_node,
+                                                         NodeBuilder* nb,
+                                                         bool change_format) {
   DataType T;
-  string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
@@ -1961,44 +2049,37 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
 
-  if (!change_format) {
-    nb->Attr("strides", strides);
-    nb->Attr("dilations", dilations);
+  // Add attributes related to `data_format`.
+  CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format);
+}
 
-    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-    nb->Attr("data_format", data_format);
-  } else {
-    std::vector<int32> new_strides;
-    std::vector<int32> new_dilations;
-    if (strides.size() == 5) {
-      // "strides" and "dilations" also need to be changed according to
-      // "data_format",
-      // in this case, is "NDHWC" to "NCDHW".
-      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
-                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
-                     strides[NDHWC::dim::W]};
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
+  DataType T;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
 
-      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
-                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
-                       dilations[NDHWC::dim::W]};
-    } else {
-      // "strides" and "dilations" also need to be changed according to
-      // "data_format",
-      // in this case, is "NHWC" to "NCHW".
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
-      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
-                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("padding", padding);
 
-      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
-                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
-    }
-    nb->Attr("strides", new_strides);
-    nb->Attr("dilations", new_dilations);
-  }
+  // Add attributes related to `data_format`.
+  CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format);
 }
 
 // Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D
@@ -2023,16 +2104,38 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
       GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("strides", strides);
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
   nb->Attr("data_format", data_format);
   nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                                       NodeBuilder* nb,
+                                                       bool change_format) {
+  DataType Tpaddings;
+
+  CopyAttrsFusedConv2D(orig_node, nb, change_format);
+
+  // Get attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
+  // Check if filter is a constant.
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+}
+
 // Used with MergePadWithConv2D
 void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                                      const Node* orig_node2,
@@ -2067,6 +2170,42 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsFromPadAndFusedConv2D(
+    const Node* fused_conv2d, const Node* pad, NodeBuilder* nb,
+    bool change_format) {
+  DataType T;
+  int num_args;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  float epsilon;
+  std::vector<string> fused_ops;
+  DataType Tpaddings;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "num_args", &num_args));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "fused_ops", &fused_ops));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(pad->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_args", num_args);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("dilations", dilations);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("fused_ops", fused_ops);
+}
+
 void MklLayoutRewritePass::CopyAttrsConv2DDepthwise(const Node* orig_node,
                                                     NodeBuilder* nb,
                                                     bool change_format) {
@@ -2091,6 +2230,33 @@ void MklLayoutRewritePass::CopyAttrsConv2DDepthwise(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
+void MklLayoutRewritePass::CopyAttrsConv2DDepthwiseCheckConstFilter(
+    const Node* orig_node, NodeBuilder* nb, bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+  nb->Attr("data_format", data_format);
+}
+
 void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
                                          bool change_format) {
   DataType T;
@@ -2200,7 +2366,6 @@ void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
                                                      NodeBuilder* nb,
                                                      bool change_format) {
   DataType T;
-  string data_format;
   string padding;
   std::vector<int32> ksize, strides;
 
@@ -2223,7 +2388,8 @@ void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
   DataType Tinput, Tfilter, out_type;
   string padding;
   string data_format("NHWC");
-  std::vector<int32> strides, dilations;
+  std::vector<int32> strides, dilations, padding_list;
+  bool has_padding_list = HasNodeAttr(orig_node->def(), "padding_list");
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tinput", &Tinput));
@@ -2232,17 +2398,28 @@ void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  if (has_padding_list) {
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding_list", &padding_list));
+  }
+
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
 
   // Add attributes to new node.
   nb->Attr("Tinput", Tinput);
   nb->Attr("Tfilter", Tfilter);
   nb->Attr("out_type", out_type);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
   nb->Attr("strides", strides);
   nb->Attr("dilations", dilations);
   nb->Attr("T", out_type);  // added "T" for facilitating MklToTf conversion.
   nb->Attr("data_format", data_format);
-  // Requantization attr Tbias
+  if (has_padding_list) {
+    nb->Attr("padding_list", padding_list);
+  }
+
+  // Requantization attr Tbias.
   DataType Tbias;
   Status bias_status = GetNodeAttr(orig_node->def(), "Tbias", &Tbias);
   if (bias_status.ToString() == "OK") nb->Attr("Tbias", Tbias);
@@ -2271,6 +2448,7 @@ void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("Tshape", Tshape);
@@ -2284,6 +2462,7 @@ void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Index", &Index));
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("Index", Index);
@@ -2306,6 +2485,45 @@ void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
+void MklLayoutRewritePass::CopyFormatAttrsConv(
+    const Node* orig_node, NodeBuilder* nb, const std::vector<int32>& strides,
+    const std::vector<int32>& dilations, bool change_format) {
+  string data_format;
+
+  if (!change_format) {
+    nb->Attr("strides", strides);
+    nb->Attr("dilations", dilations);
+
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+    nb->Attr("data_format", data_format);
+  } else {
+    std::vector<int32> new_strides;
+    std::vector<int32> new_dilations;
+    if (strides.size() == 5) {
+      // `strides` and `dilations` also need to be changed according to
+      // `data_format`. In this case, from `NDHWC` to `NCDHW`.
+      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
+                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
+                     strides[NDHWC::dim::W]};
+
+      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
+                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
+                       dilations[NDHWC::dim::W]};
+    } else {
+      // `strides` and `dilations` also need to be changed according to
+      // `data_format`. In this case, from `NHWC` to `NCHW`.
+
+      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
+                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+
+      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
+                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
+    }
+    nb->Attr("strides", new_strides);
+    nb->Attr("dilations", new_dilations);
+  }
+}
+
 void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
                                            NodeBuilder* nb,
                                            bool change_format) {
@@ -2382,11 +2600,15 @@ void MklLayoutRewritePass::CopyAttrsFusedConv2D(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "fused_ops", &fused_ops));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("num_args", num_args);
   nb->Attr("strides", strides);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
   nb->Attr("data_format", data_format);
   nb->Attr("dilations", dilations);
   nb->Attr("fused_ops", fused_ops);
@@ -2523,7 +2745,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
 
   // Copy attributes from Conv2D to Conv2DWithBias.
-  CopyAttrsConv(const_cast<const Node*>(pred), &nb);
+  CopyAttrsConvCheckConstFilter(const_cast<const Node*>(pred), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -2592,11 +2814,15 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
 
 Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
                                                 Node* m, Node* n) {
-  DCHECK(((m->type_string() == csinfo_.pad &&
-           n->type_string() == csinfo_.conv2d)) ||
-         ((n->type_string() == csinfo_.pad &&
-           m->type_string() == csinfo_.conv2d)));
-
+  DCHECK((m->type_string() == csinfo_.pad &&
+          (n->type_string() == csinfo_.conv2d ||
+           n->type_string() == csinfo_.fused_conv2d)) ||
+         (n->type_string() == csinfo_.pad &&
+          (m->type_string() == csinfo_.conv2d ||
+           m->type_string() == csinfo_.fused_conv2d)));
+
+  bool is_fused_conv2d = n->type_string() == csinfo_.fused_conv2d ||
+                         m->type_string() == csinfo_.fused_conv2d;
   // Conv2D is successor node, and Pad predecessor node.
   Node* pred = m->type_string() == csinfo_.pad ? m : n;
   Node* succ = m->type_string() == csinfo_.pad ? n : m;
@@ -2607,18 +2833,14 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gpu;
+
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
-  // Data format for pad is not available and not necessary, thus
-  // dont need to match data format for Pad
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
-  // Check if the data types and devices of both succ and pred are the same.
-  // Assert is not used,  because it can be too strict.
+  // Check if the devices of both succ and pred are the same.
+  // Assert is not used because it can be too strict.
   // Don't need to check for data formats because it is not available in Pad.
   if (T_pred != T_succ ||
       pred->assigned_device_name() != succ->assigned_device_name() ||
@@ -2662,29 +2884,45 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   }
   DCHECK_EQ(PadDataInputEdges, 2);
 
-  // Conv2D must have 2 data inputs: pad output and Filter
+  // Conv2D must have 2 data inputs: Pad output and Filter
+  // FusedConv2D have 3 data inputs: Pad output, Filter and Args;
   int ConvDataInputEdges = 0;
   for (const Edge* e : succ->in_edges()) {
     if (!e->IsControlEdge()) {
       ConvDataInputEdges++;
     }
   }
-  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  DCHECK_EQ(ConvDataInputEdges, is_fused_conv2d ? 3 : 2);
 
   // We will use the node name of Conv2D as the name of new node
   // Build new node. We use same name as original node, but change the op
   // name.
-  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+
+  NodeBuilder nb(succ->name(), is_fused_conv2d ? csinfo_.pad_with_fused_conv2d
+                                               : csinfo_.pad_with_conv2d);
   nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
   // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
   // In1 of Conv2D is same as output of Pad.
   // Thus, only need to add In2 of Conv2D
-  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
 
-  // Copy attributes from Pad and conv2D to PadWithConv2D.
-  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
-                            const_cast<const Node*>(pred), &nb);
+  if (is_fused_conv2d) {
+    // FusedConv2D has one additional input, args
+    std::vector<NodeBuilder::NodeOut> args;
+    args.emplace_back(succ_in[2].first, succ_in[2].second);
+    nb.Input(gtl::ArraySlice<NodeBuilder::NodeOut>{
+        args});                                     // In3 (args) of FusedConv2D
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and FusedConv2D to PadWithFusedConv2D.
+    CopyAttrsFromPadAndFusedConv2D(const_cast<const Node*>(succ),
+                                   const_cast<const Node*>(pred), &nb);
+  } else {
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and conv2D to PadWithConv2D.
+    CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                              const_cast<const Node*>(pred), &nb);
+  }
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -2882,10 +3120,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
-  if (((m->type_string() == csinfo_.pad &&
-        n->type_string() == csinfo_.conv2d)) ||
-      ((n->type_string() == csinfo_.pad &&
-        m->type_string() == csinfo_.conv2d))) {
+  if ((m->type_string() == csinfo_.pad &&
+       (n->type_string() == csinfo_.conv2d ||
+        (n->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(n)))) ||
+      (n->type_string() == csinfo_.pad &&
+       (m->type_string() == csinfo_.conv2d ||
+        (m->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(m))))) {
     return this->MergePadWithConv2D(g, m, n);
   }
 
@@ -2942,9 +3182,7 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // Set the Mkl layer label for this op.
   if (DataTypeIsQuantized(orig_node->input_type(0)) ||
       DataTypeIsQuantized(orig_node->output_type(0))) {
-#ifdef INTEL_MKL_QUANTIZED
     nb.Attr("_kernel", mkl_op_registry::kMklQuantizedOpLabel);
-#endif
   } else {
     nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
   }
@@ -2998,7 +3236,6 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
 // Current implementation reflects only QuantizedConv2D and its fused Ops.
 const MklLayoutRewritePass::RewriteInfo*
 MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
-#ifdef INTEL_MKL_QUANTIZED
   DataType Tinput, Tfilter;
   if (!(GetNodeAttr(n->def(), "Tinput", &Tinput).ok() &&
         GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok())) {
@@ -3012,7 +3249,6 @@ MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
       }
     }
   }
-#endif
   return nullptr;
 }
 
@@ -3039,6 +3275,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
       n->type_string() != csinfo_.pad_with_conv2d &&
+      n->type_string() != csinfo_.pad_with_fused_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
       n->type_string() != csinfo_.fused_conv2d &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 6e73ed1b9fe9c7d82342360cd1604379a768aaec..e2ab90de3fefedcd0669b772276fa16ba380b0a7 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -123,6 +123,21 @@ class MklLayoutPassTest : public ::testing::Test {
     return result;
   }
 
+  // Returns the attribute value only from the first node
+  template <typename T>
+  T DoMklLayoutOptimizationPassGetAttrVal(const string& attr,
+                                          const string& node_name) {
+    DoMklLayoutOptimizationPass();
+    T attr_val;
+    for (const Node* n : graph_.nodes()) {
+      if (IncludeNode(n) && n->type_string() == node_name) {
+        TF_CHECK_OK(GetNodeAttr(n->def(), attr, &attr_val));
+        return attr_val;
+      }
+    }
+    return attr_val;
+  }
+
   const string& OriginalGraph() const { return original_; }
 
   Graph graph_;
@@ -557,7 +572,7 @@ TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) {
 // Test if output control edges does not duplicate after merge.
 // If both the merging ops have output control edge to a common op,
 // then after merge, the merged op will have only one control edge
-// to that commom op.
+// to that common op.
 // padding is VALID type
 // A = input(image), B = input(paddings), C= Pad = input of conv2D,
 // D=input(filter), E = Conv2D, Z = Zeta
@@ -1243,6 +1258,338 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative2) {
             "D(_FusedConv2D);E(Zeta)|A->D;B->D:1;C->D:2;C->E:1;D->E");
 }
 
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+      "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+      "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;D->F:1;DMT/"
+      "_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E) (With relu)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;"
+            "D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with unsupported fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias),
+// F = _FusedConv2D(C, D, E) (With Unsupported), G = Zeta(F, E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'Unsupported'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);E(Input);F(_FusedConv2D);G("
+            "Zeta)|A->C;B->C:1;C->F;D->F:1;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)(With relu)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
+}
+
+// Tests that there are no duplicate input control edges after merge.
+// If both the merging ops have input control edges from a common op
+// then, the merged op will have only one control edge from that
+// common op. This test only add additional input control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), X = input, B = input(paddings),
+// C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// X:control->C:control
+// X:control->F:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+// X:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(x, c);
+  const Edge* edge_1 = graph_.AddControlEdge(x, f);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;"
+            "A:control->DMT/_3:control;B->F:3;D->F:1;DMT/_0->F:4;"
+            "DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G;"
+            "X:control->F:control");
+}
+
+// ts that there are no duplicate output control edges after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that common op. This test only add additional output control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E), X = input
+// C:control->X:control
+// F:control->X:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_2)
+// F:control->X:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(c, x);
+  const Edge* edge_1 = graph_.AddControlEdge(f, x);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;A:control->DMT/"
+            "_1:control;A:control->DMT/_2:control;A:control->DMT/"
+            "_3:control;B->F:3;D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G;F:control->X:control");
+}
+
+// Pad + _FusedConv2D with padding is VALID,
+// Input node pointing to both Pad and _FusedConv2D
+// Output of both Pad and _FusedConv2D feeds one node (G as Output2)
+// A = input(as image), B = input(as paddings), C = Pad(A, B)
+// E = input(as bias), F = _FusedConv2D(C, A, E), G = Output(C, F)
+// After layout pass - No merging, since Pad and _FusedConv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'A', 'E']}"
+      "node { name: 'G' op: 'Output2'"
+      " input: ['C', 'F']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);E(Input);F(_MklFusedConv2D);G(Output2)|A->C;A->F:1;B->C:"
+            "1;C->F;C->G;C:control->DMT/_0:control;C:control->DMT/"
+            "_1:control;C:control->DMT/_2:control;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;F->G:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -1289,6 +1636,55 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
             "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_DepthwiseConv2dNativeGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
 // Check that we never rewrite BiasAddGrad.
 TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
   InitGraph(
@@ -2322,6 +2718,29 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
 }
 
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D("
+            "DepthwiseConv2dNativeBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -2530,6 +2949,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Slice_DeviceTest) {
 
 /////////////////////////////////////////////////////////////////////
 //         Post-rewrite fixup pass test
+/////////////////////////////////////////////////////////////////////
 
 TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
   InitGraph(
@@ -2560,6 +2980,302 @@ TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
 }
 
 /////////////////////////////////////////////////////////////////////
+//         Unit tests related to filter caching.
+//
+// These tests check if the attribute `is_filter_const` is set to true
+// when filter is a constant and false otherwise for various operators
+// such as Conv2D, Conv2DWithBias, Conv3D etc.
+/////////////////////////////////////////////////////////////////////
+
+// Conv2D op where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv2D_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const' "  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklConv2D"));
+}
+
+// Conv2D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv2D_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklConv2D"));
+}
+
+// Conv2D + BiasAdd fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv2DWithBias_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklConv2DWithBias"));
+}
+
+// Conv2D + BiasAdd fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv2DWithBias_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklConv2DWithBias"));
+}
+
+// Conv3D op where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv3D_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const' "  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv3D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCDHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklConv3D"));
+}
+
+// Conv3D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv3D_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv3D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCDHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklConv3D"));
+}
+
+// Pad + Conv2D fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, PadWithConv2D_FilterCaching_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklPadWithConv2D"));
+}
+
+// Pad + Conv2D fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, PadWithConv2D_FilterCaching_Negative) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"  // Filter
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklPadWithConv2D"));
+}
+
+// _FusedConv2D + BiasAdd fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, FusedConv2DWithBias_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklFusedConv2D"));
+}
+
+// _FusedConv2D + BiasAdd fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, FusedConv2DWithBias_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklFusedConv2D"));
+}
+
+// Depthwise Conv2D op where filter is a constant.
+TEST_F(MklLayoutPassTest, DepthwiseConv2dNative_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklDepthwiseConv2dNative"));
+}
+
+// Depthwise Conv2D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, DepthwiseConv2dNative_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklDepthwiseConv2dNative"));
+}
 
 static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
   testing::StopTiming();
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index 3b6e8cc2339a42285a68c6898c99b1ec4b585917..26bb654356971c711af081c6b970491ff3b6833c 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -621,7 +621,7 @@ Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
       // 5. Reshape OP: Also depends on the first input to this op.
       // 6. Not-Listed-Above OP: If there is only 1 such op, consider it as the
       // model input. However, if there are >1 unknown ops, then returns an
-      // error for now to avoid unexpected bahavior.
+      // error for now to avoid unexpected behavior.
       // Note: The list above might not be a complete list. Please let us
       // know if you see the error so we can handle your case.
       for (const Edge* edge : node->in_edges()) {
diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc
index 5a5b85e7273cb2a63b13cae04001b01ebe6dbe50..90b65a60b1e2c303fa71981f07a2f9dbe1901cba 100644
--- a/tensorflow/core/graph/tensor_id.cc
+++ b/tensorflow/core/graph/tensor_id.cc
@@ -62,4 +62,8 @@ TensorId ParseTensorName(StringPiece name) {
   return id;
 }
 
+bool IsTensorIdControl(const TensorId& tensor_id) {
+  return tensor_id.index() == Graph::kControlSlot;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/tensor_id.h b/tensorflow/core/graph/tensor_id.h
index b0f621fa6c4abced21df6e00bf852ff9642facb0..c593f96b0b329d348b9c174f45fbe42584373427 100644
--- a/tensorflow/core/graph/tensor_id.h
+++ b/tensorflow/core/graph/tensor_id.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_TENSOR_ID_H_
-#define TENSORFLOW_GRAPH_TENSOR_ID_H_
+#ifndef TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_
+#define TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_
 
 #include <string>
 
@@ -60,6 +60,8 @@ struct TensorId : public std::pair<StringPiece, int> {
 TensorId ParseTensorName(const string& name);
 TensorId ParseTensorName(StringPiece name);
 
+bool IsTensorIdControl(const TensorId& tensor_id);
+
 // Same as TensorId, except owns the backing storage for the op name. This makes
 // the memory management simpler at the expense of a copy.
 struct SafeTensorId : public std::pair<string, int> {
@@ -89,4 +91,4 @@ struct SafeTensorId : public std::pair<string, int> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_TENSOR_ID_H_
+#endif  // TENSORFLOW_CORE_GRAPH_TENSOR_ID_H_
diff --git a/tensorflow/core/graph/tensor_id_test.cc b/tensorflow/core/graph/tensor_id_test.cc
index dd7d89dad841c3ba2bfd1a3786339505117cfc76..878afbe7d6585875ee0dcb097524ac9717346f43 100644
--- a/tensorflow/core/graph/tensor_id_test.cc
+++ b/tensorflow/core/graph/tensor_id_test.cc
@@ -90,5 +90,19 @@ void BM_ParseTensorName(int iters, int arg) {
 }
 BENCHMARK(BM_ParseTensorName)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4)->Arg(5);
 
+TEST(TensorIdTest, IsTensorIdControl) {
+  string input = "^foo";
+  TensorId tensor_id = ParseTensorName(input);
+  EXPECT_TRUE(IsTensorIdControl(tensor_id));
+
+  input = "foo";
+  tensor_id = ParseTensorName(input);
+  EXPECT_FALSE(IsTensorIdControl(tensor_id));
+
+  input = "foo:2";
+  tensor_id = ParseTensorName(input);
+  EXPECT_FALSE(IsTensorIdControl(tensor_id));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc
index e44eb91d4883f3e8a6ad34e96d8dcd9d9076298b..4487f738c8e97e803618ae483b4551b47fd14c33 100644
--- a/tensorflow/core/graph/validate.cc
+++ b/tensorflow/core/graph/validate.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/graph/validate.h"
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -113,5 +115,16 @@ Status ValidateGraphHasNoCycle(const Graph& graph) {
   return Status::OK();
 }
 
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph) {
+  absl::flat_hash_set<absl::string_view> nodes;
+  for (const auto& node : graph.node()) {
+    if (nodes.contains(node.name())) {
+      return errors::AlreadyExists("Node already exists: ", node.name());
+    }
+    nodes.insert(node.name());
+  }
+  return Status::OK();
+}
+
 }  // namespace graph
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/validate.h b/tensorflow/core/graph/validate.h
index 08879dca6037bcab21f4cbf107b3829c1b6600e8..bfb3a25ac91761449b1762fa2125d7758cc8c560 100644
--- a/tensorflow/core/graph/validate.h
+++ b/tensorflow/core/graph/validate.h
@@ -59,6 +59,9 @@ void GetOpListForValidation(
 // be less than the total node count.
 Status ValidateGraphHasNoCycle(const Graph& graph);
 
+// Returns OK if the graph has no duplicate node names.
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph);
+
 }  // namespace graph
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index d58cdc3c5baf02f89cff52ef0396816cb00b48a3..f6a0d2614acfe147eb65b75fb843bc84d0b6dbeb 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -147,5 +147,36 @@ TEST(GetOpListForValidationTest, ShouldStripDocs) {
   EXPECT_TRUE(found_has_docs);
 }
 
+TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'B' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'B'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  TF_ASSERT_OK(graph::VerifyNoDuplicateNodeNames(graph_def));
+}
+
+TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'A' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'A'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  EXPECT_EQ(graph::VerifyNoDuplicateNodeNames(graph_def).code(),
+            tensorflow::error::ALREADY_EXISTS);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index d5266247cf2f1b1ffa676912c368167210a4160a..b5f223facbd6fc63971c50307392d4222957ef00 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -1,7 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 
 cc_library(
     name = "op_types",
@@ -45,6 +44,7 @@ tf_cc_test(
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -71,7 +71,6 @@ cc_library(
     deps = [
         ":graph_view",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
@@ -118,8 +117,10 @@ tf_cc_test(
         ":graph_view",
         ":grappler_item",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -209,10 +210,12 @@ cc_library(
         ":graph_view",
         ":op_types",
         ":utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index ab8f4bebb3171055add2b8f2b807d338a8d36186..de27cf4ba2a95381b3ba9d91481ae461af0d6fd3 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -59,7 +59,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -81,6 +81,7 @@ cc_library(
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/costs:analytical_cost_estimator",
         "//tensorflow/core/grappler/costs:op_level_cost_estimator",
         "//tensorflow/core/grappler/costs:virtual_scheduler",
     ],
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index e4f6bf7c862302a217c122cff726b7ab925cc482..c7827c17dd26bfac12b5dc7552fe1295d700b35b 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -227,14 +227,14 @@ Status SingleMachine::GetPeakMemoryUsage(
 
   device_peak_memory->clear();
   for (Device* device : devices) {
-    AllocatorStats stats;
     auto* allocator = device->GetAllocator(AllocatorAttributes());
     if (!allocator->TracksAllocationSizes()) {
       return Status(error::INVALID_ARGUMENT,
                     "Tracking allocation is not enabled.");
     }
-    allocator->GetStats(&stats);
-    (*device_peak_memory)[device->name()] = stats.max_bytes_in_use;
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    (*device_peak_memory)[device->name()] =
+        (stats ? stats->peak_bytes_in_use : 0);
   }
 
   return Status::OK();
@@ -455,7 +455,6 @@ Status SingleMachine::ClearAllocatorStats() const {
   std::vector<Device*> devices = device_mgr->ListDevices();
 
   for (Device* device : devices) {
-    AllocatorStats stats;
     auto* allocator = device->GetAllocator(AllocatorAttributes());
     if (!allocator->TracksAllocationSizes()) {
       return Status(error::INVALID_ARGUMENT,
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 567e7c075e0167839434312279d77f62b9c14697..f1d3a77e3f0cf9833765b51e08de003434287e0b 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -23,6 +23,10 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#if TENSORFLOW_USE_ROCM
+#include "rocm/include/hip/hip_runtime.h"
+#endif
+
 #ifdef EIGEN_USE_LIBXSMM
 #include "include/libxsmm.h"
 #endif
@@ -109,6 +113,36 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
       strings::StrCat(properties.major, ".", properties.minor);
   (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
   (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
+
+#elif TENSORFLOW_USE_ROCM
+  hipDeviceProp_t properties;
+  hipError_t error =
+      hipGetDeviceProperties(&properties, platform_gpu_id.value());
+  if (error != hipSuccess) {
+    device.set_type("UNKNOWN");
+    LOG(ERROR) << "Failed to get device properties, error code: " << error;
+    return device;
+  }
+
+  // ROCM TODO review if numbers here are valid
+  device.set_vendor("Advanced Micro Devices, Inc");
+  device.set_model(properties.name);
+  device.set_frequency(properties.clockRate * 1e-3);
+  device.set_num_cores(properties.multiProcessorCount);
+  device.set_num_registers(properties.regsPerBlock);
+  device.set_l1_cache_size(16 * 1024);
+  device.set_l2_cache_size(properties.l2CacheSize);
+  device.set_l3_cache_size(0);
+  device.set_shared_memory_size_per_multiprocessor(
+      properties.maxSharedMemoryPerMultiProcessor);
+  device.set_memory_size(properties.totalGlobalMem);
+  // 8 is the number of bits per byte. 2 is accounted for
+  // double data rate (DDR).
+  device.set_bandwidth(properties.memoryBusWidth / 8 *
+                       properties.memoryClockRate * 2);
+
+  (*device.mutable_environment())["architecture"] =
+      strings::StrCat("gfx", properties.gcnArch);
 #endif
 
   return device;
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index 118f74e8b01171e3780317b4ea36750c66a22b98..2839d33c552e688caf8e2c86fd44e6780b9d6c98 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -14,32 +14,33 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
-#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
 namespace tensorflow {
 namespace grappler {
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices)
-    : Cluster(0),
-      node_estimator_(new OpLevelCostEstimator()),
-      node_manager_(new FirstReadyManager()) {
-  devices_ = devices;
-}
+    : VirtualCluster(devices, absl::make_unique<OpLevelCostEstimator>(),
+                     ReadyNodeManagerFactory("FirstReady")) {}
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices,
     std::unique_ptr<OpLevelCostEstimator> node_estimator,
     std::unique_ptr<ReadyNodeManager> node_manager)
-    : Cluster(0),
-      node_estimator_(std::move(node_estimator)),
-      node_manager_(std::move(node_manager)) {
+    : Cluster(0) {
   devices_ = devices;
+
+  // Note that we do not use aggressive shape inference to preserve unknown
+  // shapes from the input graph.
+  estimator_ = absl::make_unique<AnalyticalCostEstimator>(
+      this, std::move(node_estimator), std::move(node_manager),
+      /*use_static_shapes=*/true, /*use_aggressive_shape_inference=*/false);
 }
 
 VirtualCluster::VirtualCluster(const DeviceSet* device_set)
@@ -66,19 +67,13 @@ Status VirtualCluster::Run(const GraphDef& graph,
                            const std::vector<std::pair<string, Tensor>>& feed,
                            const std::vector<string>& fetch,
                            RunMetadata* metadata) {
-  // Initialize a virtual scheduler to process the graph. Make sure to use
-  // static shape inference to prevent the scheduler from calling the Run
-  // method on the cluster and creating an infinite loop.
+  // Initializes an analytical cost estimator to estimate the graph cost. Makes
+  // sure to use static shape inference to prevent the virtual scheduler from
+  // calling the Run method on the cluster and creating an infinite loop.
   GrapplerItem item;
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  // Note that we do not use aggressive shape inference to preserve unknown
-  // shapes from the input graph.
-  VirtualScheduler scheduler(/*use_static_shapes=*/true,
-                             /*use_aggressive_shape_inference=*/false, this,
-                             node_manager_.get());
-  TF_RETURN_IF_ERROR(scheduler.Init(&item));
 
   if (metadata) {
     metadata->clear_step_stats();
@@ -86,45 +81,14 @@ Status VirtualCluster::Run(const GraphDef& graph,
     metadata->clear_partition_graphs();
   }
 
-  Costs node_costs;
-  int node_id = 0;
-  do {
-    OpContext op_context = scheduler.GetCurrNode();
-    node_costs = node_estimator_->PredictCosts(op_context);
-    if (metadata) {
-      CostGraphDef::Node* cost_node =
-          metadata->mutable_cost_graph()->add_node();
-      const string& op_name = op_context.name;
-      cost_node->set_id(node_id++);
-      cost_node->set_name(op_name);
-      cost_node->set_device(op_context.device_name);
-      cost_node->set_compute_cost(
-          node_costs.execution_time.asMicroSeconds().count());
-      cost_node->set_compute_time(
-          node_costs.compute_time.asMicroSeconds().count());
-      cost_node->set_memory_time(
-          node_costs.memory_time.asMicroSeconds().count());
-      for (const auto& output : op_context.op_info.outputs()) {
-        auto output_info = cost_node->add_output_info();
-        output_info->set_dtype(output.dtype());
-        *output_info->mutable_shape() = output.shape();
-
-        int64 size = DataTypeSize(output.dtype());
-        for (const auto& dim : output.shape().dim()) {
-          size *= std::max<int64>(1, dim.size());
-        }
-        output_info->set_size(size);
-      }
-    }
-  } while (scheduler.MarkCurrNodeExecuted(node_costs));
-
-  if (metadata) {
-    scheduler.Summary(metadata);
-  }
+  TF_RETURN_IF_ERROR(estimator_->Initialize(item));
+  Costs ignored_costs;
+  TF_RETURN_IF_ERROR(
+      estimator_->PredictCosts(item.graph, metadata, &ignored_costs));
 
   const std::unordered_map<string, DeviceProperties>& device = GetDevices();
   std::unordered_map<string, int64> peak_mem_usage =
-      scheduler.GetPeakMemoryUsage();
+      estimator_->GetScheduler()->GetPeakMemoryUsage();
   for (const auto& mem_usage : peak_mem_usage) {
     const string& device_name = mem_usage.first;
     auto it = device.find(device_name);
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
index d19e39cd29204c98d1edea03756649e61c2c4129..94446a998a6748ee0043f887a61d3abb401bee1a 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.h
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -50,9 +51,8 @@ class VirtualCluster : public Cluster {
   const DeviceSet* GetDeviceSet() const override { return device_set_; }
 
  private:
-  std::unique_ptr<OpLevelCostEstimator> node_estimator_;
-  std::unique_ptr<ReadyNodeManager> node_manager_;
-  const DeviceSet* device_set_ = nullptr;  // Not owned
+  std::unique_ptr<AnalyticalCostEstimator> estimator_;
+  const DeviceSet* device_set_ = nullptr;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 35ca93d9345d30c834c753e9c3ef7b25ca5ed8d5..84d813fe771b54f45ab156640ce96db1acf3f515 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -165,6 +165,7 @@ tf_cc_test(
 
 cc_library(
     name = "cost_estimator",
+    srcs = ["cost_estimator.cc"],
     hdrs = ["cost_estimator.h"],
     visibility = ["//visibility:public"],
     deps = [
@@ -173,6 +174,16 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "cost_estimator_test",
+    srcs = ["cost_estimator_test.cc"],
+    deps = [
+        ":cost_estimator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "virtual_placer",
     srcs = ["virtual_placer.cc"],
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index 5baf306f6fe39e80fc006ed1183eb70aa5fb5180..9934d7a1951b06be35373c56fd6c8045473359fd 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -56,7 +56,7 @@ void AddCostNode(ReadyNodeManager* node_manager, const OpContext& op_context,
     (*name_to_id)[node->name()] = node->id();
   }
   // For nodes we have seen before (e.g. Merge nodes are executed twice by
-  // VirtualScheduler), the following fields will be overwritten/updated
+  // VirtualScheduler), the following fields will be overwritten/updated.
   node->set_device(op_context.device_name);
   node->set_compute_cost(node_costs.execution_time.asMicroSeconds().count());
   node->set_compute_time(node_costs.compute_time.asMicroSeconds().count());
@@ -67,16 +67,16 @@ void AddCostNode(ReadyNodeManager* node_manager, const OpContext& op_context,
     int input_port;
     string input_name = ParseNodeName(input, &input_port);
 
-    // All inputs should have been seen already unless this is a Merge node
+    // All inputs should have been seen already unless this is a Merge node.
     if (name_to_id->find(input_name) == name_to_id->end()) {
       if (!IsMerge(*node_manager->GetCurrNode()))
-        LOG(ERROR) << "input: " << input
-                   << " not found for non-Merge node: " << op_name;
+        VLOG(1) << "input: " << input
+                << " not found for non-Merge node: " << op_name;
 
       // For Merge node, some of inputs may not be seen before
       // For example, for a typical while loop in tensorflow, Merge node
       // will be executed twice by VirtualScheduler (one for Enter, the
-      // other for NextIteration), so eventually both inputs will be added
+      // other for NextIteration), so eventually both inputs will be added.
       continue;
     }
 
@@ -93,30 +93,38 @@ void AddCostNode(ReadyNodeManager* node_manager, const OpContext& op_context,
     auto output_info = node->add_output_info();
     output_info->set_alias_input_port(-1);
     output_info->set_dtype(output.dtype());
-    auto shape = output_info->mutable_shape();
-    *shape = output.shape();
+    *output_info->mutable_shape() = output.shape();
+
+    int64 size = DataTypeSize(output.dtype());
+    for (const auto& dim : output.shape().dim()) {
+      size *= std::max<int64>(1, dim.size());
+    }
+    output_info->set_size(size);
   }
 }
 
 }  // namespace
 
-AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
-                                                 bool use_static_shapes)
+AnalyticalCostEstimator::AnalyticalCostEstimator(
+    Cluster* cluster, bool use_static_shapes,
+    bool use_aggressive_shape_inference)
     : AnalyticalCostEstimator(
           cluster, absl::make_unique<OpLevelCostEstimator>(),
-          ReadyNodeManagerFactory("FirstReady"), use_static_shapes) {}
+          ReadyNodeManagerFactory("FirstReady"), use_static_shapes,
+          use_aggressive_shape_inference) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
     Cluster* cluster, std::unique_ptr<OpLevelCostEstimator> node_estimator,
-    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes)
+    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes,
+    bool use_aggressive_shape_inference)
     : cluster_(cluster),
       node_estimator_(std::move(node_estimator)),
       node_manager_(std::move(node_manager)),
-      use_static_shapes_(use_static_shapes) {
-  // Use aggressive static shape inference to minimize unknown shapes.
+      use_static_shapes_(use_static_shapes),
+      use_aggressive_shape_inference_(use_aggressive_shape_inference) {
   scheduler_ = absl::make_unique<VirtualScheduler>(
-      use_static_shapes_,
-      /*use_aggressive_shape_inference=*/true, cluster_, node_manager_.get());
+      use_static_shapes_, use_aggressive_shape_inference_, cluster_,
+      node_manager_.get());
 }
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
@@ -142,7 +150,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     cost_graph = run_metadata->mutable_cost_graph();
     // TODO(pcma): Clear nodes in cost_graph after we make sure we always pass
     // in an empty cost_graph (a non-empty but incomplete cost_graph will cause
-    // problems, e.g., no node_id in cost_graph)
+    // problems, e.g., no node_id in cost_graph).
     for (auto& node : *cost_graph->mutable_node()) {
       name_to_cost_node[node.name()] = &node;
     }
@@ -165,7 +173,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                 << node_costs.num_ops_with_unknown_shapes << " unknown shapes";
     }
 
-    // TODO(pcma): Add unit tests for generating CostGraphDef
+    // TODO(pcma): Add unit tests for generating CostGraphDef.
     if (cost_graph) {
       AddCostNode(node_manager_.get(), op_context, node_id++, node_costs,
                   &name_to_cost_node, &name_to_id, cost_graph);
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index d058ba411527f0c001d59ac4aaa8aeea3d422c77..c9028efe0db380d4aeea660057e347c41bade94b 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -35,15 +35,19 @@ struct GrapplerItem;
 
 // Estimate the cost of running a Grappler item based on the theoretical
 // performance of the hardware that will run the model. Note that this
-// internally uses aggressive shape inference with static shape inference.
+// internally uses static shape inference. An option for aggressive shape
+// inference is provided to minimize unknown shapes, and this is only applicable
+// with static shape inference.
 class AnalyticalCostEstimator : public CostEstimator {
  public:
   // Does not take ownership of cluster.
-  AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
+  AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes,
+                          bool use_aggressive_shape_inference);
   AnalyticalCostEstimator(Cluster* cluster,
                           std::unique_ptr<OpLevelCostEstimator> node_estimator,
                           std::unique_ptr<ReadyNodeManager> node_manager,
-                          bool use_static_shapes);
+                          bool use_static_shapes,
+                          bool use_aggressive_shape_inference);
   ~AnalyticalCostEstimator() override {}
 
   // Initializes the estimator for the specified grappler item.
@@ -63,8 +67,10 @@ class AnalyticalCostEstimator : public CostEstimator {
   GrapplerItem item_;
   std::unique_ptr<OpLevelCostEstimator> node_estimator_;
   std::unique_ptr<ReadyNodeManager> node_manager_;
-  bool use_static_shapes_;
   std::unique_ptr<VirtualScheduler> scheduler_;
+
+  bool use_static_shapes_;
+  bool use_aggressive_shape_inference_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index eb7ee8dc0a10147d6bfe201f21d437579850b6d9..e558558d00a21991d723e1f6ffc235e5fe03be93 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -95,14 +94,15 @@ class AnalyticalCostEstimatorTest : public ::testing::Test {
 TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   GrapplerItem item = CreateMiniGraph();
 
-  AnalyticalCostEstimator estimator(cluster_.get(), true);
+  AnalyticalCostEstimator estimator(cluster_.get(), /*use_static_shapes=*/true,
+                                    /*use_aggressive_shape_inference=*/true);
   TF_ASSERT_OK(estimator.Initialize(item));
 
   RunMetadata run_metadata;
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &run_metadata, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
+  EXPECT_EQ(Costs::NanoSeconds(9157), summary.execution_time);
   // Note there are totally 17 nodes (RandomUniform creates 2 nodes), but
   // grappler will not process "label", therefore we have 15 here instead
   EXPECT_EQ(15, summary.num_ops_total);
diff --git a/tensorflow/core/grappler/costs/cost_estimator.cc b/tensorflow/core/grappler/costs/cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fc4e99689bb8647577b96b7082177f43de2289f
--- /dev/null
+++ b/tensorflow/core/grappler/costs/cost_estimator.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
+  result.intermediate_memory_time += right.intermediate_memory_time;
+  result.intermediate_memory_read_time += right.intermediate_memory_read_time;
+  result.intermediate_memory_write_time += right.intermediate_memory_write_time;
+
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+
+  result.num_ops_total += right.num_ops_total;
+  if (right.inaccurate) {
+    result.inaccurate = true;
+  }
+  result.num_ops_with_unknown_shapes += right.num_ops_with_unknown_shapes;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+
+  return result;
+}
+
+// Multiplies Costs by a scalar.
+// Equivalent to applying CombineCosts "multiplier" times.
+// Note the field regarding num_ops are not multiplied.
+Costs MultiplyCosts(const Costs& costs, int multiplier) {
+  CHECK_GE(multiplier, 0);
+  if (multiplier == 0) {
+    return Costs::ZeroCosts();
+  }
+  if (multiplier == 1) {
+    return costs;
+  }
+
+  Costs result = costs;
+  result.execution_time *= multiplier;
+  result.compute_time *= multiplier;
+  result.memory_time *= multiplier;
+  result.intermediate_memory_time *= multiplier;
+  result.intermediate_memory_read_time *= multiplier;
+  result.intermediate_memory_write_time *= multiplier;
+  if (result.max_memory != kMemoryUnknown) {
+    result.max_memory *= multiplier;
+  }
+  return result;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index d85ae0b77f923e9c7678eb9d8dd0a9f128ac5846..2efeebb5c537614dd7922efa4094df07de6d1548 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -16,9 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 
-#include <chrono>
 #include <cmath>
-#include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
@@ -134,6 +132,8 @@ struct Costs {
 
   // Intermediate memory access cost of running the graph
   Duration intermediate_memory_time;
+  Duration intermediate_memory_read_time;   // Intermediate memory read cost.
+  Duration intermediate_memory_write_time;  // Intermediate memory write cost.
 
   // This field can be a very pessimistic estimate of the main memory
   // requirements of a graph. For example, it might assume that all activations
@@ -202,6 +202,12 @@ Costs Costs::ZeroCosts() {
   return costs;
 }
 
+Costs CombineCosts(const Costs& left, const Costs& right);
+
+// Multiplies Costs by a scalar.
+// Equivalent to applying CombineCosts "multiplier" times.
+Costs MultiplyCosts(const Costs& costs, int multiplier);
+
 // Given a GrapperItem and an optimized implementation of the corresponding
 // TensorFlow graph, the CostEstimator attempts to predicts the actual cost of
 // running the graph.
diff --git a/tensorflow/core/grappler/costs/cost_estimator_test.cc b/tensorflow/core/grappler/costs/cost_estimator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62197a43dfb864c6b84f6c82bff1ca22eff913fd
--- /dev/null
+++ b/tensorflow/core/grappler/costs/cost_estimator_test.cc
@@ -0,0 +1,88 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(CostEstimatorTest, CombineCosts) {
+  Costs c = Costs::ZeroCosts();
+  c.execution_time = Costs::NanoSeconds(1);
+  c.compute_time = Costs::NanoSeconds(2);
+  c.memory_time = Costs::NanoSeconds(3);
+  c.intermediate_memory_time = Costs::NanoSeconds(4);
+  c.intermediate_memory_read_time = Costs::NanoSeconds(5);
+  c.intermediate_memory_write_time = Costs::NanoSeconds(6);
+  c.max_memory = 1;
+  c.max_per_op_buffers = 2;
+  c.max_per_op_streaming = 3;
+  c.num_ops_total = 1;
+  c.inaccurate = false;
+  c.num_ops_with_unknown_shapes = 0;
+
+  Costs sum = CombineCosts(c, c);
+
+  EXPECT_EQ(sum.execution_time, Costs::NanoSeconds(2));
+  EXPECT_EQ(sum.compute_time, Costs::NanoSeconds(4));
+  EXPECT_EQ(sum.memory_time, Costs::NanoSeconds(6));
+  EXPECT_EQ(sum.intermediate_memory_time, Costs::NanoSeconds(8));
+  EXPECT_EQ(sum.intermediate_memory_read_time, Costs::NanoSeconds(10));
+  EXPECT_EQ(sum.intermediate_memory_write_time, Costs::NanoSeconds(12));
+  EXPECT_EQ(sum.max_memory, 2);
+  EXPECT_EQ(sum.max_per_op_buffers, 2);
+  EXPECT_EQ(sum.max_per_op_streaming, 3);
+  EXPECT_EQ(sum.num_ops_total, 2);
+  EXPECT_FALSE(sum.inaccurate);
+  EXPECT_EQ(sum.num_ops_with_unknown_shapes, 0);
+}
+
+TEST(CostEstimatorTest, MultiplyCosts) {
+  Costs c = Costs::ZeroCosts();
+  c.execution_time = Costs::NanoSeconds(1);
+  c.compute_time = Costs::NanoSeconds(2);
+  c.memory_time = Costs::NanoSeconds(3);
+  c.intermediate_memory_time = Costs::NanoSeconds(4);
+  c.intermediate_memory_read_time = Costs::NanoSeconds(5);
+  c.intermediate_memory_write_time = Costs::NanoSeconds(6);
+  c.max_memory = 1;
+  c.max_per_op_buffers = 2;
+  c.max_per_op_streaming = 3;
+  c.num_ops_total = 1;
+  c.inaccurate = false;
+  c.num_ops_with_unknown_shapes = 0;
+
+  Costs product = MultiplyCosts(c, 10);
+
+  EXPECT_EQ(product.execution_time, Costs::NanoSeconds(10));
+  EXPECT_EQ(product.compute_time, Costs::NanoSeconds(20));
+  EXPECT_EQ(product.memory_time, Costs::NanoSeconds(30));
+  EXPECT_EQ(product.intermediate_memory_time, Costs::NanoSeconds(40));
+  EXPECT_EQ(product.intermediate_memory_read_time, Costs::NanoSeconds(50));
+  EXPECT_EQ(product.intermediate_memory_write_time, Costs::NanoSeconds(60));
+  EXPECT_EQ(product.max_memory, 10);
+  EXPECT_EQ(product.max_per_op_buffers, 2);
+  EXPECT_EQ(product.max_per_op_streaming, 3);
+  EXPECT_EQ(product.num_ops_total, 1);
+  EXPECT_FALSE(product.inaccurate);
+  EXPECT_EQ(product.num_ops_with_unknown_shapes, 0);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 59696f39f8576bbd4f2e8f93b5a099263350edab..e4136273402a9810b24fc68c19e821c39cbe4aa5 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -460,6 +460,35 @@ class TopoQueue {
   std::set<NodeAndId, OrderByIdAscending> queue_;
 };
 
+bool IsNumericType(const DataType dtype) {
+  static const gtl::FlatSet<DataType>* const kRealNumberTypes =
+      CHECK_NOTNULL((new gtl::FlatSet<DataType>{
+          // Floating point.
+          DT_BFLOAT16,
+          DT_HALF,
+          DT_FLOAT,
+          DT_DOUBLE,
+          // Int / UInt.
+          DT_INT8,
+          DT_INT16,
+          DT_INT32,
+          DT_INT64,
+          DT_UINT8,
+          DT_UINT16,
+          DT_UINT32,
+          DT_UINT64,
+          // Quantized Int.
+          DT_QINT8,
+          DT_QUINT8,
+          DT_QINT16,
+          DT_QUINT16,
+          DT_QINT32,
+          // Bool.
+          DT_BOOL,
+      }));
+  return kRealNumberTypes->find(dtype) != kRealNumberTypes->end();
+}
+
 bool IsWhiteListedOpTypeForEvaluateNode(const string& op_type) {
   static const gtl::FlatSet<string>* const kOpTpeWhitelist =
       CHECK_NOTNULL((new gtl::FlatSet<string>{
@@ -504,6 +533,7 @@ bool IsWhiteListedOpTypeForEvaluateNode(const string& op_type) {
           "Split",
           "Range",
           "Fill",
+          "Cast",
       }));
   return kOpTpeWhitelist->find(op_type) != kOpTpeWhitelist->end();
 }
@@ -656,7 +686,7 @@ class SymbolicShapeRefiner {
 
     // Perform inference on function body.
     GraphProperties gp(grappler_function_item);
-    TF_RETURN_IF_ERROR(gp.InferStatically(true));
+    TF_RETURN_IF_ERROR(gp.InferStatically(true, aggressive_shape_inference_));
 
     // Add return nodes for output shapes.
     int output = 0;
@@ -949,6 +979,41 @@ class SymbolicShapeRefiner {
     return true;
   }
 
+  // Return true if the annotated shape is compatible with shape inference
+  // result. Examples:
+  // Inferred shape: ?, annotated shape: [10, 10] -> true;
+  // Inferred shape: [-1, 10], annotated shape: [10, 10] -> true;
+  // Inferred shape: [-1, 100], annotated shape: [10, 10] -> false;
+  // Inferred shape: [-1, 10, 10], annotated shape: [10, 10] -> false.
+  bool CompatibleShapes(ShapeHandle inferred_shape,
+                        ShapeHandle annotated_shape) const {
+    if (inferred_shape.SameHandle(annotated_shape)) {
+      return true;
+    }
+    if (!InferenceContext::RankKnown(inferred_shape)) {
+      return true;
+    }
+    if (InferenceContext::Rank(inferred_shape) !=
+        InferenceContext::Rank(annotated_shape)) {
+      return false;
+    }
+    const int rank = InferenceContext::Rank(inferred_shape);
+    for (int i = 0; i < rank; ++i) {
+      if (!InferenceContext::DimKnownRank(inferred_shape, i)
+               .SameHandle(
+                   InferenceContext::DimKnownRank(annotated_shape, i))) {
+        int64 val1 = InferenceContext::Value(
+            InferenceContext::DimKnownRank(inferred_shape, i));
+        int64 val2 = InferenceContext::Value(
+            InferenceContext::DimKnownRank(annotated_shape, i));
+        if (val1 >= 0 && val1 != val2) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
   bool EquivalentShapesAndTypes(const std::vector<ShapeAndType>& st1,
                                 const std::vector<ShapeAndType>& st2) const {
     if (st1.size() != st2.size()) {
@@ -1109,9 +1174,9 @@ class SymbolicShapeRefiner {
     return true;
   }
 
-  // Returns true if we want to update output values with running EvaluateNode()
-  // for this op, based on op type, data type, and size.
-  bool ShouldUpdateOutputValues(NodeContext* c, int64 max_size) {
+  // Returns true if we want to update output shapes and values with running
+  // EvaluateNode() for this op, based on op type, data type, and size.
+  bool ShouldUpdateOutputShapesAndValues(NodeContext* c, int64 max_size) {
     InferenceContext* ic = c->inference_context.get();
 
     // Due to the cost of running EvaluateNode(), we limit only to white listed
@@ -1120,16 +1185,16 @@ class SymbolicShapeRefiner {
       return false;
     }
 
-    // Check input dtypes are integer.
+    // Check input dtypes are number types.
     for (const auto& input_type : c->input_types) {
-      if (input_type != DT_INT32 && input_type != DT_INT64) {
+      if (!IsNumericType(input_type)) {
         return false;
       }
     }
 
-    // Check output dtypes are integer.
+    // Check output dtypes are number types.
     for (const auto& output_type : c->output_types) {
-      if (output_type != DT_INT32 && output_type != DT_INT64) {
+      if (!IsNumericType(output_type)) {
         return false;
       }
     }
@@ -1152,7 +1217,7 @@ class SymbolicShapeRefiner {
     // elements is larger than the given max size.
     for (int i = 0; i < ic->num_outputs(); i++) {
       const ShapeHandle& shape_handle = ic->output(i);
-      if (!ic->FullyDefined(shape_handle) &&
+      if (!ic->FullyDefined(shape_handle) ||
           ic->Value(ic->NumElements(shape_handle)) > max_size) {
         return false;
       }
@@ -1179,11 +1244,11 @@ class SymbolicShapeRefiner {
         const DataType& data_type = c->input_types[i];
         int32 rank = ic->Rank(shape_handle);
         if (rank < 1) {
-          input_tensor_vector->emplace_back(Tensor(data_type, {}));
+          input_tensor_vector->at(i) = Tensor(data_type, {});
         } else {
-          input_tensor_vector->emplace_back(Tensor(data_type, {rank}));
+          input_tensor_vector->at(i) = Tensor(data_type, {rank});
         }
-        auto* tensor = &input_tensor_vector->back();
+        auto* tensor = &input_tensor_vector->at(i);
         if (data_type == DT_INT32) {
           auto flat = tensor->flat<int32>();
           for (int j = 0; j < rank; j++) {
@@ -1202,8 +1267,9 @@ class SymbolicShapeRefiner {
     }
   }
 
-  // Run a node to infer output values, and add it to the NodeContext.
-  Status UpdateOutputValues(const NodeDef& node, NodeContext* c) {
+  // Run a node to infer output shapes and values, and add it to the
+  // NodeContext.
+  Status UpdateOutputShapesAndValues(const NodeDef& node, NodeContext* c) {
     InferenceContext* ic = c->inference_context.get();
 
     // Input to EvaluateNode()
@@ -1234,7 +1300,7 @@ class SymbolicShapeRefiner {
           ic->MakeShapeFromTensorShape(t->shape(), &output_shape));
       if (ic->FullyDefined(ic->output(k)) &&
           !EquivalentShapes(ic->output(k), output_shape)) {
-        LOG(WARNING) << "UpdateOutputValues() -- node: " << node.name()
+        LOG(WARNING) << "UpdateOutputShapesAndValues() -- node: " << node.name()
                      << ", inferred output shape "
                      << "doesn't match for k=" << k << ": "
                      << "ic->output(k): " << ic->DebugString(ic->output(k))
@@ -1254,6 +1320,54 @@ class SymbolicShapeRefiner {
     return Status::OK();
   }
 
+  // Update output shapes with annotated information.
+  // Currently only handle nodes with static shapes, i.e. shapes do not change
+  // during execution.
+  // TODO(andiryxu): Use annotated shapes in Enter/Merge etc as well.
+  Status UpdateOutputShapesUsingAnnotatedInformation(const NodeDef& node,
+                                                     NodeContext* c) const {
+    const auto& attr = node.attr();
+    if (attr.count(kOutputSame) == 0 || !attr.at(kOutputSame).b() ||
+        attr.count(kOutputShapes) == 0)
+      return Status::OK();
+
+    InferenceContext* ic = c->inference_context.get();
+    int output_size = attr.at(kOutputShapes).list().shape_size();
+
+    for (int i = 0; i < ic->num_outputs(); i++) {
+      // Annotated Switch node has only one output. Propagate the shape to all
+      // the outputs.
+      int shape_index = IsSwitch(node) ? 0 : i;
+      if (shape_index >= output_size) {
+        LOG(WARNING)
+            << "UpdateOutputShapesUsingAnnotatedInformation() -- node: "
+            << node.name() << ", inferred output shape size "
+            << ic->num_outputs() << ", annotated output shape size "
+            << output_size;
+        break;
+      }
+
+      const TensorShapeProto& shape =
+          attr.at(kOutputShapes).list().shape(shape_index);
+      ShapeHandle output_shape;
+      TF_RETURN_IF_ERROR(ic->MakeShapeFromShapeProto(shape, &output_shape));
+
+      // Only use annotated shapes if the inference shape is unknown and
+      // compatible with annotated shapes.
+      if (!ic->FullyDefined(ic->output(i)) &&
+          CompatibleShapes(ic->output(i), output_shape)) {
+        VLOG(3) << "UpdateOutputShapesUsingAnnotatedInformation() -- node: "
+                << node.name() << ", inferred output shape " << i << ": "
+                << "ic->output(i): " << ic->DebugString(ic->output(i))
+                << ", annotated output shape: " << ic->DebugString(output_shape)
+                << " -- " << node.ShortDebugString();
+        ic->set_output(i, output_shape);
+      }
+    }
+
+    return Status::OK();
+  }
+
   Status MaybeUpdateNodeContextOutput(const NodeDef& node, const bool is_fed,
                                       NodeContext* c) {
     // Propagate tensors and shape tensors unless the node is fed.
@@ -1446,16 +1560,19 @@ class SymbolicShapeRefiner {
     }
 
     if (aggressive_shape_inference_) {
+      // Update output shapes with annotated information. This is optional.
+      UpdateOutputShapesUsingAnnotatedInformation(node, c).IgnoreError();
+
       // Update output tensor values using EvaluateNode() if we can.
       // Due to the cost of EvaluateNode(), we run it only for certain op types
       // (white listed) and small integer tensors.
 
       const int max_element_size = 17;  // Max up to 4x4 matrix or similar.
       if (AllOutputValuesKnown(c) || !AllInputValuesKnown(c) ||
-          !ShouldUpdateOutputValues(c, max_element_size)) {
+          !ShouldUpdateOutputShapesAndValues(c, max_element_size)) {
         return Status::OK();
       }
-      UpdateOutputValues(node, c).IgnoreError();  // This is optional.
+      UpdateOutputShapesAndValues(node, c).IgnoreError();  // This is optional.
     }
     return Status::OK();
   }
@@ -1767,6 +1884,7 @@ Status GraphProperties::UpdateShapes(
     // UpdateNode calls UpdateFunction if a function node is detected.
     TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(n, new_shapes));
   }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 3fcad6eb1b17e0c0239c5daf17bfcf717b5e3305..bb7e6ed16a634931ab8faea447a1cdd3b0a04c2a 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -27,6 +27,45 @@ namespace tensorflow {
 
 namespace grappler {
 
+// Optional attributes that tell about node output information.
+// We use these side information, if provided, for static shape inference
+// and VirtualScheduler scheduling.
+
+// Switch op attribute as a vector of int that tells which branch the
+// Switch output is taken on every round of execution.
+// Used for scheduling ops after Switch correctly (e.g., While loop).
+ABSL_CONST_INIT const char kOutputSlots[] = "_output_slot_vector";
+
+// Example:
+// Assume a node has two outputs and iterated for three times. Then it has:
+// _execution_count = 3
+// _output_sizes_vector = [2, 2, 2]
+// _output_dtype_vector.size = 6
+// _output_shape_vector.size = 6
+
+// If all the iterations have same output shapes, then
+// _execution_count = 3
+// _same_output_for_iterations = true
+// _output_sizes_vector = [2]
+// _output_dtype_vector.size = 2
+// _output_shape_vector.size = 2
+
+// How many times this node has been executed.
+ABSL_CONST_INIT const char kExecutionCount[] = "_execution_count";
+
+// Records the output sizes for each round of execution.
+ABSL_CONST_INIT const char kOutputSizes[] = "_output_sizes_vector";
+
+// The node has been scheduled multiple times with outputs that have the same
+// shape.
+ABSL_CONST_INIT const char kOutputSame[] = "_same_output_for_iterations";
+
+// Outputs DataType vector.
+ABSL_CONST_INIT const char kOutputTypes[] = "_output_dtype_vector";
+
+// Outputs TensorShapeProto vector.
+ABSL_CONST_INIT const char kOutputShapes[] = "_output_shape_vector";
+
 class SymbolicShapeRefiner;
 class TopoQueue;
 
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 0a7697a21324d3c27b80e04d6c79f5eeb539bc20..a63abed09e18b86e94afa923497dd7e8d6b04b53 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -975,6 +975,52 @@ TEST_F(GraphPropertiesTest, IdentityPassingShape) {
   EXPECT_EQ("float: [5,5]", PropToString(out_prop0));
 }
 
+TEST_F(GraphPropertiesTest, SkippingValueInferenceForLargeTensors) {
+  // When using aggressive_shape_inference, we run EvaluateNode() for
+  // whitelisted ops and small input / output tensors. For instance, Fill op is
+  // evaluated and produces output tensor value if output tensor size is smal
+  // (currently, fewer than 17 elements); otherwise we don't run EvalauteNode().
+  // This is to avoid wasting time and memory for producing huge tensors (e.g.,
+  // initializing a large table using Fill.
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 4, {2});  // 4x4
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is small; expect output values of Fill op.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [4,4]", PropToString(out_prop0));
+    EXPECT_TRUE(out_prop0.has_value());
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 1000, {4});  // 1000x1000x1000x1000
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is huge; in that case we skip value inference.
+    // Otherwise, it'd be too much overhead.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [1000,1000,1000,1000]", PropToString(out_prop0));
+    EXPECT_FALSE(out_prop0.has_value());
+  }
+}
+
 TEST_F(GraphPropertiesTest, PackWithConstInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 1, {});
@@ -1158,6 +1204,61 @@ TEST_F(GraphPropertiesTest, FunctionReturnTensorValue) {
                      properties.GetInputProperties("MyFunc")[0].value());
 }
 
+TEST_F(GraphPropertiesTest, ArithmeticFunctionReturnTensorValue) {
+  FunctionDefLibrary library;
+  // Function that adds two input values.
+  *library.add_function() = FunctionDefHelper::Create(
+      "MyFunc",                                                   // Name
+      {"x: int32", "y: int32"},                                   // Inputs
+      {"out: int32"},                                             // Outputs
+      {},                                                         // Attrs
+      {{{"a"}, "Add", {"x", "y"}, {{"T", DataType::DT_INT32}}}},  // Nodes
+      {{"out", "a:z:0"}});                                        // Returns
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  TF_CHECK_OK(s.graph()->AddFunctionLibrary(library));
+
+  Output shape = ops::Const(s.WithOpName("shape"), {5, 7}, {2});
+  auto _shape = tensorflow::ops::AsNodeOut(s, shape);
+  auto builder =
+      tensorflow::NodeBuilder("MyFunc", "MyFunc", s.graph()->op_registry());
+  tensorflow::Node* func_op;
+  TF_CHECK_OK(
+      builder.Input(_shape).Input(_shape).Finalize(s.graph(), &func_op));
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  {
+    GraphProperties properties(item);
+    // Without aggressive_shape_inference, the internal function does not
+    // evaluate output value.
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/true,
+        /*aggressive_shape_inference=*/false));
+    const auto out_props = properties.GetOutputProperties("MyFunc");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("int32: [2]", PropToString(out_prop0));
+    EXPECT_FALSE(out_prop0.has_value());
+  }
+
+  {
+    GraphProperties properties(item);
+    // With aggressive_shape_inference, output value is evaluated.
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/true,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("MyFunc");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("int32: [2]", PropToString(out_prop0));
+    EXPECT_TRUE(out_prop0.has_value());
+
+    ExpectTensorValues({10, 14}, out_prop0.value());
+    ExpectTensorValues({5, 7},
+                       properties.GetInputProperties("MyFunc")[0].value());
+    ExpectTensorValues({5, 7},
+                       properties.GetInputProperties("MyFunc")[1].value());
+  }
+}
+
 TEST_F(GraphPropertiesTest, FunctionWithScalarInput) {
   // Create graph with a function that takes a scalar value so that we use
   // Placeholder with scalar as for input to the function shape inference.
@@ -1680,7 +1781,7 @@ TEST_F(GraphPropertiesTest, StridedSliceOfShapeWithShrinkAxisMask) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  // Without aggresive shape inference, it cannot infer output value of
+  // Without aggressive shape inference, it cannot infer output value of
   // StridedSlice with ShrinkAxisMask.
   {
     GraphProperties properties(item);
@@ -1690,7 +1791,7 @@ TEST_F(GraphPropertiesTest, StridedSliceOfShapeWithShrinkAxisMask) {
     EXPECT_FALSE(properties.GetOutputProperties("slice").at(0).has_value());
   }
 
-  // InferStatically with aggresive shape inference can infer output value of
+  // InferStatically with aggressive shape inference can infer output value of
   // StridedSlice with ShrinkAxisMask.
   {
     GraphProperties properties(item);
@@ -1747,6 +1848,103 @@ TEST_F(GraphPropertiesTest, ValuePropagationThroughArithmeticOps) {
   ExpectTensorValues({20, 24}, c_plus_b_plus_2a_prop.value());
 }
 
+TEST_F(GraphPropertiesTest, ShapeAnnotation) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Input", "Placeholder")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", PartialTensorShape({-1, -1}))
+                  .Finalize(item.graph.add_node()));
+  // Annotate shapes.
+  TF_CHECK_OK(NodeDefBuilder("Identity", "Identity")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("_same_output_for_iterations", true)
+                  .Attr("_output_shape_vector", {TensorShape({5, 7})})
+                  .Input("Input", 0, DT_FLOAT)
+                  .Finalize(item.graph.add_node()));
+  {
+    GraphProperties properties(item);
+    // Without aggressive_shape_inference, ignore annotated information.
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/false));
+    const auto props = properties.GetOutputProperties("Identity");
+    EXPECT_EQ(1, props.size());
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ(2, prop.shape().dim_size());
+    // Get unknown shapes without using annotated information.
+    EXPECT_EQ("float: [-1,-1]", PropToString(prop));
+  }
+  {
+    GraphProperties properties(item);
+    // Use annotated information.
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto props = properties.GetOutputProperties("Identity");
+    EXPECT_EQ(1, props.size());
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT, prop.dtype());
+    EXPECT_EQ(2, prop.shape().dim_size());
+    // Update output shape using annotated shapes.
+    EXPECT_EQ("float: [5,7]", PropToString(prop));
+  }
+}
+
+TEST_F(GraphPropertiesTest, ShapeAnnotationWithCompatibleShapes) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Input", "Placeholder")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", PartialTensorShape({-1, 100}))
+                  .Finalize(item.graph.add_node()));
+  // Annotate shapes.
+  TF_CHECK_OK(NodeDefBuilder("Identity", "Identity")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("_same_output_for_iterations", true)
+                  .Attr("_output_shape_vector", {TensorShape({10, 100})})
+                  .Input("Input", 0, DT_FLOAT)
+                  .Finalize(item.graph.add_node()));
+  GraphProperties properties(item);
+  // Use annotated information.
+  TF_CHECK_OK(properties.InferStatically(
+      /*assume_valid_feeds=*/false,
+      /*aggressive_shape_inference=*/true));
+  const auto props = properties.GetOutputProperties("Identity");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_EQ(2, prop.shape().dim_size());
+  // Compatible shapes. Update output shape using annotated shapes.
+  EXPECT_EQ("float: [10,100]", PropToString(prop));
+}
+
+TEST_F(GraphPropertiesTest, ShapeAnnotationWithIncompatibleShapes) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Input", "Placeholder")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", PartialTensorShape({-1, 100}))
+                  .Finalize(item.graph.add_node()));
+  // Annotate shapes.
+  TF_CHECK_OK(NodeDefBuilder("Identity", "Identity")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("_same_output_for_iterations", true)
+                  .Attr("_output_shape_vector", {TensorShape({10, 10})})
+                  .Input("Input", 0, DT_FLOAT)
+                  .Finalize(item.graph.add_node()));
+  GraphProperties properties(item);
+  // Use annotated information.
+  TF_CHECK_OK(properties.InferStatically(
+      /*assume_valid_feeds=*/false,
+      /*aggressive_shape_inference=*/true));
+  const auto props = properties.GetOutputProperties("Identity");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_EQ(2, prop.shape().dim_size());
+  // Incompatible shapes. Do not use annotated shapes.
+  EXPECT_EQ("float: [-1,100]", PropToString(prop));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 96bac8d0cb3feff65680edd1b96b46d84a838031..bf535be18d8d120dc7afd76135341aa0c459057f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -27,7 +27,6 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
-constexpr char kConst[] = "Const";
 constexpr char kGuaranteeConst[] = "GuaranteeConst";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
@@ -40,6 +39,7 @@ constexpr char kDepthwiseConv2dNativeBackpropInput[] =
     "DepthwiseConv2dNativeBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kSparseTensorDenseMatMul[] = "SparseTensorDenseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
 constexpr char kIdentityN[] = "IdentityN";
@@ -50,10 +50,9 @@ constexpr char kSqueeze[] = "Squeeze";
 constexpr char kRecv[] = "_Recv";
 constexpr char kSend[] = "_Send";
 constexpr char kBatchMatMul[] = "BatchMatMul";
-constexpr char kVariable[] = "Variable";
-constexpr char kVariableV2[] = "VariableV2";
 constexpr char kRank[] = "Rank";
 constexpr char kShape[] = "Shape";
+constexpr char kShapeN[] = "ShapeN";
 constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
@@ -66,7 +65,15 @@ constexpr char kAvgPool[] = "AvgPool";
 constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
 constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
 constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
+constexpr char kQuantizedMatMul[] = "QuantizedMatMul";
 constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
+// Persistent ops.
+constexpr char kConst[] = "Const";
+constexpr char kVariable[] = "Variable";
+constexpr char kVariableV2[] = "VariableV2";
+constexpr char kAutoReloadVariable[] = "AutoReloadVariable";
+constexpr char kVarHandleOp[] = "VarHandleOp";
+constexpr char kReadVariableOp[] = "ReadVariableOp";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -227,7 +234,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kFusedConv2dBiasActivation,
        wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
-      // reuse Conv2D for DepthwiseConv2dNative because the caculation is the
+      // reuse Conv2D for DepthwiseConv2dNative because the calculation is the
       // same although the actual meaning of the parameters are different. See
       // comments in PredictConv2D and related functions
       {kDepthwiseConv2dNative, wrap(&OpLevelCostEstimator::PredictConv2D)},
@@ -237,7 +244,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kSparseTensorDenseMatMul,
+       wrap(&OpLevelCostEstimator::PredictSparseTensorDenseMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+      {kQuantizedMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -258,12 +268,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
 
-      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
-
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kShapeN, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
       {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
@@ -274,6 +281,11 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
   };
 
+  persistent_ops_ = {
+      kConst,       kVariable,       kVariableV2, kAutoReloadVariable,
+      kVarHandleOp, kReadVariableOp,
+  };
+
 #define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
 
   // Quantize = apply min and max bounds, multiply by scale factor and round.
@@ -361,21 +373,25 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
   const auto& op_info = op_context.op_info;
   auto it = device_cost_impl_.find(op_info.op());
-  if (it == device_cost_impl_.end()) {
-    if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
-      return PredictCwiseOp(op_context);
-    }
+  if (it != device_cost_impl_.end()) {
+    std::function<Costs(const OpContext&)> estimator = it->second;
+    Costs costs = estimator(op_context);
+    VLOG(1) << "Operation " << op_info.op() << " takes "
+            << costs.execution_time.count() << " ns.";
+    return costs;
+  }
 
-    VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+  if (persistent_ops_.find(op_info.op()) != persistent_ops_.end()) {
+    return PredictVariable(op_context);
+  }
 
-    return PredictCostOfAnUnknownOp(op_context);
+  if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
+    return PredictCwiseOp(op_context);
   }
 
-  std::function<Costs(const OpContext&)> estimator = it->second;
-  Costs costs = estimator(op_context);
-  VLOG(1) << "Operation " << op_info.op() << " takes "
-          << costs.execution_time.count() << " ns.";
-  return costs;
+  VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+
+  return PredictCostOfAnUnknownOp(op_context);
 }
 
 DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
@@ -436,7 +452,7 @@ Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
   // of any input. We use the count for the largest input here to be more robust
   // in case that the shape is unknown or partially known for other input.
   int64 op_count = CalculateLargestInputCount(op_info, &found_unknown_shapes);
-  // If output shape is available, try use the element count calcuated from
+  // If output shape is available, try use the element count calculated from
   // that.
   if (op_info.outputs_size() > 0) {
     op_count = std::max(
@@ -526,8 +542,10 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
                       device_info.intermediate_write_gb_per_sec)
           : 0;
 
-  Costs::NanoSeconds intermediate_memory_cost(intermediate_read_time +
-                                              intermediate_write_time);
+  Costs::NanoSeconds intermediate_memory_cost =
+      compute_memory_overlap_
+          ? std::max(intermediate_read_time, intermediate_write_time)
+          : (intermediate_read_time + intermediate_write_time);
   VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Intermediate Memory Time (ns):"
           << intermediate_memory_cost.count();
@@ -536,6 +554,10 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
   costs.intermediate_memory_time = intermediate_memory_cost;
+  costs.intermediate_memory_read_time =
+      Costs::NanoSeconds(intermediate_read_time);
+  costs.intermediate_memory_write_time =
+      Costs::NanoSeconds(intermediate_write_time);
   CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
@@ -1209,6 +1231,49 @@ Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictSparseTensorDenseMatMul(
+    const OpContext& op_context) const {
+  const auto& op_info = op_context.op_info;
+  bool found_unknown_shapes = false;
+  // input[0]: indices in sparse matrix a
+  // input[1]: values in sparse matrix a
+  // input[2]: shape of matrix a
+  // input[3]: matrix b
+  // See
+  // https://github.com/tensorflow/tensorflow/blob/9a43dfeac5/tensorflow/core/ops/sparse_ops.cc#L85
+  int64 num_elems_in_a =
+      CalculateTensorElementCount(op_info.inputs(1), &found_unknown_shapes);
+  auto b_matrix = op_info.inputs(3);
+  auto b_matrix_shape =
+      MaybeGetMinimumShape(b_matrix.shape(), 2, &found_unknown_shapes);
+  int64 n_dim = b_matrix_shape.dim(1).size();
+
+  // Each element in A is multiplied and added with an element from each column
+  // in b.
+  const int64 op_count = kOpsPerMac * num_elems_in_a * n_dim;
+
+  int64 a_indices_input_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  int64 a_values_input_size =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  int64 a_shape_input_size =
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  int64 b_input_size =
+      num_elems_in_a * n_dim * DataTypeSize(BaseType(b_matrix.dtype()));
+  double input_size = a_indices_input_size + a_values_input_size +
+                      a_shape_input_size + b_input_size;
+
+  double output_size = CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  auto costs =
+      PredictOpCountBasedCost(op_count, input_size, output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.num_ops_with_unknown_shapes = found_unknown_shapes;
+  costs.max_memory = output_size;
+
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
   const auto& op_info = op_context.op_info;
   VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
@@ -1236,7 +1301,7 @@ Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
   result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
-  result.execution_time = result.execution_time;
+  result.execution_time = result.compute_time;
   return result;
 }
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index f8ba8c6637d9aade6610a6af8dd6c9f3e0be01af..e569320bf7e6eb17f5501f96e5f2f6b19fcc3a39 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -132,6 +132,7 @@ class OpLevelCostEstimator {
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
   Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
+  Costs PredictSparseTensorDenseMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
   Costs PredictIdentity(const OpContext& op_context) const;
   Costs PredictVariable(const OpContext& op_context) const;
@@ -193,6 +194,7 @@ class OpLevelCostEstimator {
   // If true, assume compute and memory overlap; hence, the op cost is max of
   // compute_time and memory_time, insteaf of sum of those two.
   bool compute_memory_overlap_;
+  std::set<string> persistent_ops_;
 
  private:
   friend class OpLevelCostEstimatorTest;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 6a9bf13b93b775eb44df5a8c117564a9d82648c1..f6c4f2ae5199766d1169257f4f5474197205ec6a 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -28,6 +28,31 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+// TODO(dyoon): Consider to use this Test class for all the test cases, and then
+// remove friend in the OpLevelCostEstimator class header.
+class TestOpLevelCostEstimator : public OpLevelCostEstimator {
+ public:
+  TestOpLevelCostEstimator() {
+    compute_memory_overlap_ = true;
+    device_info_ = DeviceInfo();
+  }
+  ~TestOpLevelCostEstimator() override {}
+
+  void SetDeviceInfo(const DeviceInfo& device_info) {
+    device_info_ = device_info;
+  }
+
+  void SetComputeMemoryOverlap(bool value) { compute_memory_overlap_ = value; }
+
+ protected:
+  DeviceInfo GetDeviceInfo(const DeviceProperties& device) const override {
+    return device_info_;
+  }
+
+  DeviceInfo device_info_;
+};
+
 // Wrangles the minimum number of proto fields to set up a matrix.
 void DescribeMatrix(int rows, int columns, OpInfo* op_info) {
   auto input = op_info->add_inputs();
@@ -94,6 +119,22 @@ OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
   return op_context;
 }
 
+// Returns an OpInfo for a SparseTensorDenseMatMul
+OpContext DescribeSparseTensorDenseMatMul(const int nnz_a,
+                                          const std::vector<int>& dims_b,
+                                          const std::vector<int>& dims_out) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("SparseTensorDenseMatMul");
+
+  DescribeArbitraryRankInput({nnz_a, 2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankInput({nnz_a}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({2}, DT_INT64, &op_context.op_info);
+  DescribeArbitraryRankInput(dims_b, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankOutput(dims_out, DT_FLOAT, &op_context.op_info);
+  return op_context;
+}
+
 // Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
 // estimation purposes.
 void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
@@ -474,6 +515,26 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
   OpLevelCostEstimator estimator_;
 };
 
+TEST_F(OpLevelCostEstimatorTest, TestPersistentOpCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  std::unordered_set<string> persisent_ops = {
+      "Const",       "Variable",       "VariableV2", "AutoReloadVariable",
+      "VarHandleOp", "ReadVariableOp",
+  };
+  // Minmum cost for all persistent ops.
+  for (const auto& op : persisent_ops) {
+    op_context.op_info.set_op(op);
+    auto cost = estimator_.PredictCosts(op_context);
+    EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+    EXPECT_EQ(Costs::Duration(1), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(1), cost.execution_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+}
+
 TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
@@ -809,6 +870,58 @@ TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
   EXPECT_NE(matmul_inaccurate, batch_matmul_inaccurate);
 }
 
+TEST_F(OpLevelCostEstimatorTest, SparseTensorDenseMatMul) {
+  // Unknown shape cases
+  {
+    auto cost =
+        PredictCosts(DescribeSparseTensorDenseMatMul(-1, {1, 1}, {1, 1}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost =
+        PredictCosts(DescribeSparseTensorDenseMatMul(1, {-1, 1}, {1, 1}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost =
+        PredictCosts(DescribeSparseTensorDenseMatMul(1, {1, -1}, {1, -1}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  {
+    auto cost =
+        PredictCosts(DescribeSparseTensorDenseMatMul(1, {1, 1}, {-1, 1}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_TRUE(cost.inaccurate);
+    EXPECT_EQ(1, cost.num_ops_with_unknown_shapes);
+  }
+  // Known shape cases
+  {
+    auto cost = PredictCosts(
+        DescribeSparseTensorDenseMatMul(10, {1000, 100}, {50, 100}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+    EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(2422), cost.memory_time);
+  }
+  {
+    // Same cost as above case because cost does not depend on k_dim
+    auto cost = PredictCosts(
+        DescribeSparseTensorDenseMatMul(10, {100000, 100}, {50, 100}));
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+    EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(2422), cost.memory_time);
+  }
+}
+
 void ExpectTensorShape(const std::vector<int64>& expected,
                        const TensorShapeProto& tensor_shape_proto) {
   TensorShape tensor_shape_expected(expected);
@@ -945,7 +1058,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
     EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
@@ -986,7 +1099,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
     EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
@@ -1208,5 +1321,59 @@ TEST_F(OpLevelCostEstimatorTest, MaybeGetMinimumShape) {
     ExpectTensorShape({10, 20}, y);
   }
 }
+
+TEST_F(OpLevelCostEstimatorTest, IntermediateRdWrBandwidth) {
+  TestOpLevelCostEstimator estimator;
+
+  // Compute limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/1,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  auto cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3548774400), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3551112192), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Memory limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373281), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Intermediate memory bandwidth limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/9999,
+                                     /*intermediate_read_gb_per_sec=*/1,
+                                     /*intermediate_write_gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.intermediate_memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373515), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
index 8f5f16e4904002cfb5b2e0e6df4a3103e8114a7e..146eecf5bcbbbccb5fcdfef7170cc442e96f7c4c 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -87,6 +87,7 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
       default_device_name_ = devices_.begin()->first;  // Any device.
     }
   }
+  VLOG(3) << "default device name: " << default_device_name_;
 
   // Scan the device names from the cluster, and if there is one job name used,
   // use it for canonical device name.
@@ -102,14 +103,15 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
       }
     }
   }
-  // If there is only  type of job name in all the devices in the cluster, use
-  // that one as default job name; otherwise, use localhost.
+  // If there is only one type of job name in all the devices in the cluster,
+  // use that one as default job name; otherwise, use localhost.
   // TODO(dyoon): this should be improved, especially when the cluster is
   // composed of multiple worker, PS, and other types of jobs.
   if (job_names_from_cluster.size() == 1) {
     auto it = job_names_from_cluster.begin();
     default_job_name_lowercase_ = *it;
   }
+  VLOG(3) << "default job name: " << default_job_name_lowercase_;
 }
 
 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
index fee5ce0f510014988656f418b857a73b8d68b807..e17ece7c1a840f66d335c205d3fa759965bc2b52 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.h
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
 
-#include <unordered_map>
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 0aac0348b512d2e8040a9ac1337ceb9c12a09206..52c8f6f97db88ce9785c07fb9c0db70ac20b7acb 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -34,46 +34,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
-// Optional attribute name for Switch op as a vector of int that tells
-// which branch the Switch output is taken on every round of execution.
-// We use this side information, if provided, for scheduling ops after Switch
-// correctly (e.g., While loop).
-constexpr char kOutputSlots[] = "_output_slot_vector";
-
-Costs CombineCosts(const Costs& left, const Costs& right) {
-  CHECK_NE(left.max_memory, kMemoryUnknown);
-  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
-  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
-
-  Costs result = left;
-  result.execution_time += right.execution_time;
-  result.compute_time += right.compute_time;
-  result.memory_time += right.memory_time;
-  result.intermediate_memory_time += right.intermediate_memory_time;
-
-  result.num_ops_total += right.num_ops_total;
-  if (right.inaccurate) result.inaccurate = true;
-  result.num_ops_with_unknown_shapes += right.num_ops_with_unknown_shapes;
-
-  if (right.max_memory != kMemoryUnknown) {
-    result.max_memory += right.max_memory;
-  }
-  if (right.max_per_op_buffers != kMemoryUnknown) {
-    result.max_per_op_buffers =
-        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
-  }
-  if (right.max_per_op_streaming != kMemoryUnknown) {
-    result.max_per_op_streaming =
-        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
-  }
-  VLOG(4) << "costs execution_time=" << result.execution_time.count()
-          << " max_memory=" << result.max_memory
-          << " max_per_op_buffers=" << result.max_per_op_buffers
-          << " max_per_op_streaming=" << result.max_per_op_streaming;
-  return result;
-}
+namespace {
 
 // Key to the cached _Recv ops map, and its hash and predicate structures.
 struct RecvNodeDescriptor {
@@ -294,22 +256,6 @@ std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
   return nullptr;
 }
 
-// TODO(pcma): Delete this deprecated API after power_analyzer.cc is modeified
-// to use the new factory API
-ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
-    const string& ready_node_manager) {
-  if (ready_node_manager == "FIFO") {
-    return new FIFOManager();
-  } else if (ready_node_manager == "LIFO") {
-    return new LIFOManager();
-  } else if (ready_node_manager == "FirstReady") {
-    return new FirstReadyManager();
-  } else if (ready_node_manager == "Composite") {
-    return new CompositeNodeManager();
-  }
-  LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
-}
-
 VirtualScheduler::VirtualScheduler(const bool use_static_shapes,
                                    const bool use_aggressive_shape_inference,
                                    Cluster* cluster,
@@ -387,8 +333,7 @@ Status VirtualScheduler::Init(const GrapplerItem* item) {
     name_to_node[node->name()] = node;
   }
 
-  // Traverse the graph to check if the graph is annotated with Switch outputs.
-  // Also record _Send nodes.
+  // Traverses the graph to record _Send nodes.
   // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
@@ -397,11 +342,6 @@ Status VirtualScheduler::Init(const GrapplerItem* item) {
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
-
-    if (IsSwitch(node)) {
-      const auto& attr = node.attr();
-      if (attr.count(kOutputSlots) > 0) switch_outputs_annotated_ = true;
-    }
   }
 
   // To reuse _Recv ops.
@@ -763,66 +703,29 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
   return it->second;
 }
 
-// Check Switch outputs in updated MetaGraphDef, add corresponding nodes to
-// ready queue.
-// Fallback to add all outputs if fail to find the actual output.
-bool VirtualScheduler::AddSwitchOutputsToReadyQueue(
-    const NodeDef* node, int curr_iter, const Costs::Duration& curr_time) {
-  if (node->attr().count(kOutputSlots) == 0) return false;
-
-  auto& node_state = node_map_[node];
-  const auto& slot_vector = node->attr().at(kOutputSlots);
-  if (slot_vector.list().i_size() <= curr_iter) {
-    // Sometimes we encounter infinite loop. Fall back to add all outputs.
-    return false;
-  }
-
-  int slot = slot_vector.list().i(curr_iter);
-  for (const auto& port_num_output_pair : node_state.outputs) {
-    if (port_num_output_pair.first != slot) continue;
-
-    for (auto* output_node : port_num_output_pair.second) {
-      auto& output_state = node_map_[output_node];
-      output_state.num_inputs_ready++;
-      // Execute a node as soon as all its inputs are ready. Merge nodes
-      // are special since they run as soon as one of their inputs becomes
-      // available.
-      if (output_state.num_inputs_ready == output_state.inputs.size() ||
-          IsMerge(*output_node)) {
-        // This output node is now ready.
-        output_state.time_ready = curr_time;
-        ready_nodes_->AddNode(output_node);
-        VLOG(3) << "Node " << node->name() << " iter " << curr_iter << "/"
-                << slot_vector.list().i_size() << " Add Switch output " << slot
-                << ": " << output_node->name();
-      }
-    }
-    return true;
-  }
-
-  return false;
-}
-
 void VirtualScheduler::AddOutputNodesToReadyQueue(
     const NodeDef* node, const Costs::Duration& curr_time) {
-  auto& node_state = node_map_[node];
-  int curr_iter = node_state.num_executed_times;
-  ++node_state.num_executed_times;
-
-  if (switch_outputs_annotated_) {
-    // If the graph is annotated with StepStats, reset num_inputs_ready so we
-    // can schedule the node multiple times.
-    node_state.num_inputs_ready = 0;
-
-    // For Switch node, get output branch from updated MetaGraphDef.
-    if (IsSwitch(*node) &&
-        AddSwitchOutputsToReadyQueue(node, curr_iter, curr_time))
-      return;
+  // Checks whether the Switch's output slots change over iterations.
+  int slot = -1;
+  if (IsSwitch(*node) && node->attr().count(kOutputSlots) > 0 &&
+      node->attr().at(kOutputSlots).list().i_size() > 0) {
+    slot = node->attr().at(kOutputSlots).list().i(0);
+    for (int i = 1; i < node->attr().at(kOutputSlots).list().i_size(); ++i) {
+      if (slot != node->attr().at(kOutputSlots).list().i(i)) {
+        slot = -1;
+        break;
+      }
+    }
   }
 
   // Increment num_inputs_ready of the output nodes and maybe add to ready
   // nodes.
+  auto& node_state = node_map_[node];
   for (const auto& port_num_output_pair : node_state.outputs) {
+    // If Switch is annotated and its output slots are always the same, we only
+    // schedule the slot that was executed. Otherwise, scheduler both slots.
+    if (slot >= 0 && port_num_output_pair.first != slot) continue;
+
     for (auto* output_node : port_num_output_pair.second) {
       auto& output_state = node_map_[output_node];
       output_state.num_inputs_ready++;
@@ -834,6 +737,7 @@ void VirtualScheduler::AddOutputNodesToReadyQueue(
         // This output node is now ready.
         output_state.time_ready = curr_time;
         ready_nodes_->AddNode(output_node);
+        VLOG(3) << "  Add output: " << output_node->name();
       }
     }
   }
@@ -841,12 +745,20 @@ void VirtualScheduler::AddOutputNodesToReadyQueue(
 
 bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Update graph_costs_ and per-op costs.
-  graph_costs_ = CombineCosts(graph_costs_, node_costs);
   const NodeDef* node = ready_nodes_->GetCurrNode();
+  auto& node_state = node_map_[node];
+  // If there is annotation in the graph about execution times, we use that
+  // number, otherwise, we assume the node is executed once.
+  node_state.execution_count = node->attr().count(kExecutionCount) == 0
+                                   ? 1
+                                   : node->attr().at(kExecutionCount).i();
+  Costs total_node_costs =
+      MultiplyCosts(node_costs, node_state.execution_count);
+  graph_costs_ = CombineCosts(graph_costs_, total_node_costs);
   const string& op_name = node->op();
 
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
-  op_cost = CombineCosts(op_cost, node_costs);
+  op_cost = CombineCosts(op_cost, total_node_costs);
 
   if (VLOG_IS_ON(2)) {
     // Also keep track of op counts and costs per op (with their shapes).
@@ -860,21 +772,16 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   }
 
   // Update node and device states.
-  auto& node_state = node_map_[node];
   auto& device = device_[node_state.device_name];
   device.nodes_executed.push_back(node);
   // Node is scheduled when the device is available AND all the inputs are
   // ready; hence, time_scheduled is time_ready if time_ready > device curr
   // time.
-  // TODO(andiryxu): Current node_state result only records the last execution.
-  // With annotated MetaGraph we can schedule a node for multiple times.
-  // Refine NodeState structure accordingly, e.g. record time_scheduled in a
-  // vector.
   node_state.time_scheduled =
       std::max(device.GetCurrTime(), node_state.time_ready);
   // Override device curr time with the time_scheduled.
   device.device_costs.execution_time = node_state.time_scheduled;
-  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  device.device_costs = CombineCosts(device.device_costs, total_node_costs);
   auto curr_time = device.GetCurrTime();
   node_state.time_finished = curr_time;
 
@@ -887,7 +794,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
         node_state.time_no_references[port_num] = curr_time;
       } else {
         device.memory_usage +=
-            CalculateOutputSize(node_state.output_properties, port_num);
+            CalculateOutputSize(node_state.output_properties, port_num) *
+            node_state.execution_count;
         device.nodes_in_memory.insert(std::make_pair(node, port_num));
       }
     }
@@ -895,15 +803,16 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
 
   // Update device's per-op cost.
   auto& device_op_cost = FindOrCreateZero(op_name, &device.op_to_cost);
-  device_op_cost = CombineCosts(device_op_cost, node_costs);
+  device_op_cost = CombineCosts(device_op_cost, total_node_costs);
 
   VLOG(3) << "Op scheduled -- name: " << node->name() << ", op: " << node->op()
           << ", device: " << node->device()
+          << ", execution_count: " << node_state.execution_count
           << ", ready: " << node_state.time_ready.count()
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Check outputs, add ready nodes to queue.
+  // Checks outputs, and adds ready nodes to queue.
   AddOutputNodesToReadyQueue(node, curr_time);
 
   // Increment num_outputs_executed of the input nodes and maybe update memory.
@@ -920,7 +829,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
       input_state.time_no_references[port] = curr_time;
       auto& input_device = device_[input_state.device_name];
       input_device.memory_usage -=
-          CalculateOutputSize(input_state.output_properties, port);
+          CalculateOutputSize(input_state.output_properties, port) *
+          node_state.execution_count;
 
       input_device.nodes_in_memory.erase(std::make_pair(input, port));
     }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index d96371bcab5db2d3ef730bf1eec8fe7f733bf4f6..e8e162296336e56fb6b81db3fd0238068dda781c 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -71,14 +71,14 @@ struct NodeState {
   // time_no_references.
 
   // How many times this node has been executed, e.g. in a while loop.
-  int num_executed_times;
+  int execution_count;
 
   NodeState() {
     num_inputs_ready = 0;
     time_ready = Costs::Duration::max();
     time_scheduled = Costs::Duration::max();
     time_finished = Costs::Duration::max();
-    num_executed_times = 0;
+    execution_count = 0;
     // Note that num_outputs_executed and time_no_references are not initialized
     // here, since we don't know the size (i.e., # outputs for this node).
   }
@@ -288,10 +288,6 @@ class VirtualScheduler {
   // of the virtual execution of the graph.
   void GenerateRunMetadata(RunMetadata* metadata);
 
-  // DEPRECATED
-  static ReadyNodeManager* ReadyNodeManagerFactory(
-      const string& ready_node_manager);
-
   // Return per device peak memory usage.
   const std::unordered_map<string, int64> GetPeakMemoryUsage() const;
 
@@ -327,8 +323,6 @@ class VirtualScheduler {
                           std::map<string, Costs>* op_cost);
   float Round2(const float x) const;
   bool IsPersistentNode(const NodeDef* node) const;
-  bool AddSwitchOutputsToReadyQueue(const NodeDef* node, int curr_iter,
-                                    const Costs::Duration& curr_time);
   void AddOutputNodesToReadyQueue(const NodeDef* node,
                                   const Costs::Duration& curr_time);
 
@@ -362,10 +356,6 @@ class VirtualScheduler {
   bool track_mem_usage_snapshot_;
   const bool use_aggressive_shape_inference_;
 
-  // Whether the input graph includes Switch nodes annotated with output slots
-  // information.
-  bool switch_outputs_annotated_ = false;
-
   VirtualPlacer placer_;  // owned.
 };
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 128cb986f11ba4f4bb13583cb293183194e1c744..38fd380a66045faa86c71fcb8262760a25a88366 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -873,8 +873,8 @@ versions {
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
-  // A simple while loop strengthened with Switch outputs.
-  void CreateGrapplerItemWithLoopSwitchOutputs() {
+  // A simple while loop strengthened with Switch outputs xxx.
+  void CreateGrapplerItemWithLoopAnnotated() {
     // Test graph produced in python using:
     /*
       with tf.Graph().as_default():
@@ -909,6 +909,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "ones"
@@ -936,6 +942,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Enter"
@@ -965,6 +977,12 @@ node {
       i: 10
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Enter_1"
@@ -994,6 +1012,12 @@ node {
       i: 10
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Merge"
@@ -1012,6 +1036,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Merge_1"
@@ -1030,6 +1060,12 @@ node {
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Less/y"
@@ -1052,6 +1088,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Less"
@@ -1064,11 +1106,23 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/LoopCond"
   op: "LoopCond"
   input: "while/Less"
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Switch"
@@ -1089,6 +1143,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 11
+    }
+  }
   attr {
     key: "_output_slot_vector"
     value {
@@ -1127,6 +1187,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 11
+    }
+  }
   attr {
     key: "_output_slot_vector"
     value {
@@ -1156,6 +1222,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Identity_1"
@@ -1167,6 +1239,12 @@ node {
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/add/y"
@@ -1189,6 +1267,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/add"
@@ -1201,6 +1285,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/concat/axis"
@@ -1223,6 +1313,12 @@ node {
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/concat"
@@ -1248,6 +1344,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/NextIteration"
@@ -1259,6 +1361,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/NextIteration_1"
@@ -1270,6 +1378,12 @@ node {
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Exit"
@@ -1281,6 +1395,12 @@ node {
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Exit_1"
@@ -1292,6 +1412,12 @@ node {
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 versions {
   producer: 21
@@ -1305,6 +1431,115 @@ versions {
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
+  // A simple condition graph.
+  void CreateGrapplerItemWithCondition() {
+    // Handcrafted test graph: a/Less -> Switch -> First/Second -> Merge.
+    const string gdef_ascii = R"EOF(
+node {
+  name: "a"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "Less"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        tensor_content: "\001"
+      }
+    }
+  }
+}
+node {
+  name: "Switch"
+  op: "Switch"
+  input: "a"
+  input: "Less"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "First"
+  op: "Identity"
+  input: "Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Second"
+  op: "Identity"
+  input: "Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Merge"
+  op: "Merge"
+  input: "First"
+  input: "Second"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 27
+})EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"Merge"};
+  }
+
   // Create a FusedBatchNorm op that has multiple output ports.
   void CreateGrapplerItemWithInterDeviceTransfers() {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
@@ -2361,7 +2596,7 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   // TODO(dyoon): after fixing while loop behavior correctly (run nodes in the
   // order of Enter, Merge, ...loop condition ..., ... loop body ...,
   // NextIteration, Merge, ... loop condition ..., Exit), re-enable dependency
-  // chaing test w/ Merge nodes.
+  // chaining test w/ Merge nodes.
   ValidateDependencyChain(
       start_times,
       {"Const", "while/Enter",  // "while/Merge",
@@ -2379,87 +2614,155 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
-TEST_F(VirtualSchedulerTest, WhileLoopWithSwitchOutputs) {
-  // Init.
-  CreateGrapplerItemWithLoopSwitchOutputs();
-  InitScheduler();
-
-  // Runs the scheduler.
-  RunScheduler("");
-
-  RunMetadata metadata;
-  scheduler_->Summary(&metadata);
-
-  // Nodes in topological order:
-  // * const, ones
-  // * while/Enter, while/Enter_1
-  // * while/Merge, while/Merge_1
-  // * while/Less/y
-  // * while/Less
-  // * while/LoopCond
-  // * while/Switch, while/Switch_1
-  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
-  // * while/add/y, while/concat/axis
-  // * while/add, while/concat
-  // * while/NextIteration, while/NextIteration_1
-
-  int num_next_iteration = 0;
-  int num_next_iteration_1 = 0;
-  int num_exit = 0;
-  int num_exit_1 = 0;
-  int64 next_iter_start_micro;
-  int64 next_iter_1_start_micro;
-  int64 exit_start_micro;
-  int64 exit_1_start_micro;
+TEST_F(VirtualSchedulerTest, AnnotatedWhileLoop) {
+  {
+    // Init.
+    CreateGrapplerItemWithLoop();
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    Costs c = scheduler_->Summary();
+
+    EXPECT_EQ(23, c.execution_time.asMicroSeconds().count());
+    // Both while/Merge and while/Merge_1 are scheduled twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 2, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
+
+  {
+    // Init.
+    CreateGrapplerItemWithLoopAnnotated();
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    Costs c = scheduler_->Summary();
+
+    // The costs for Merge is accumulated twice for execution_count times, but
+    // since Merge's cost is minimal, we keep this behavior here.
+    EXPECT_EQ(178, c.execution_time.asMicroSeconds().count());
+    // Both while/Merge and while/Merge_1 are scheduled twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 2, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
+}
+
+TEST_F(VirtualSchedulerTest, Condition) {
+  // Without annotation.
+  {
+    // Inits.
+    CreateGrapplerItemWithCondition();
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    RunMetadata metadata;
+    Costs c = scheduler_->Summary(&metadata);
+
+    // Nodes in topological order: a/Less, Switch, First/Second, Merge.
+    int num_a = 0;
+    int num_less = 0;
+    int num_switch = 0;
+    int num_first = 0;
+    int num_second = 0;
+    int num_merge = 0;
+
+    for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+      for (const auto& stats : device_step_stats.node_stats()) {
+        if (stats.node_name() == "a") {
+          ++num_a;
+        } else if (stats.node_name() == "Less") {
+          ++num_less;
+        } else if (stats.node_name() == "Switch") {
+          ++num_switch;
+        } else if (stats.node_name() == "First") {
+          ++num_first;
+        } else if (stats.node_name() == "Second") {
+          ++num_second;
+        } else if (stats.node_name() == "Merge") {
+          ++num_merge;
+        }
+      }
+    }
 
-  std::unordered_map<string, int64> start_times;
-  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
-    for (const auto& stats : device_step_stats.node_stats()) {
-      start_times[stats.node_name()] = stats.all_start_micros();
-      if (stats.node_name() == "while/NextIteration") {
-        ++num_next_iteration;
-        next_iter_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/NextIteration_1") {
-        ++num_next_iteration_1;
-        next_iter_1_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/Exit") {
-        ++num_exit;
-        exit_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/Exit_1") {
-        ++num_exit_1;
-        exit_1_start_micro = stats.all_start_micros();
+    EXPECT_EQ(1, num_a);
+    EXPECT_EQ(1, num_less);
+    EXPECT_EQ(1, num_switch);
+    EXPECT_EQ(1, num_first);
+    EXPECT_EQ(1, num_second);
+    EXPECT_EQ(2, num_merge);
+
+    EXPECT_EQ(7, c.execution_time.asMicroSeconds().count());
+    // Merge is executed twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 1, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
+
+  // With annotation.
+  {
+    // Inits.
+    CreateGrapplerItemWithCondition();
+
+    // Annotates the Switch node.
+    for (auto& node : *grappler_item_->graph.mutable_node()) {
+      if (node.name() == "Switch") {
+        AttrValue attr_output_info;
+        // Adds one output slot 0 so that Second shouldn't be executed.
+        (*attr_output_info.mutable_list()).add_i(0);
+        AddNodeAttr(kOutputSlots, attr_output_info, &node);
       }
     }
-  }
 
-  // Makes sure we run the loop body for ten times.
-  EXPECT_EQ(10, num_next_iteration);
-  EXPECT_EQ(10, num_next_iteration_1);
-  EXPECT_EQ(1, num_exit);
-  EXPECT_EQ(1, num_exit_1);
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    RunMetadata metadata;
+    Costs c = scheduler_->Summary(&metadata);
+
+    // Nodes in topological order: a/Less, Switch, Merge
+    int num_a = 0;
+    int num_less = 0;
+    int num_switch = 0;
+    int num_first = 0;
+    int num_second = 0;
+    int num_merge = 0;
+
+    for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+      for (const auto& stats : device_step_stats.node_stats()) {
+        if (stats.node_name() == "a") {
+          ++num_a;
+        } else if (stats.node_name() == "Less") {
+          ++num_less;
+        } else if (stats.node_name() == "Switch") {
+          ++num_switch;
+        } else if (stats.node_name() == "First") {
+          ++num_first;
+        } else if (stats.node_name() == "Second") {
+          ++num_second;
+        } else if (stats.node_name() == "Merge") {
+          ++num_merge;
+        }
+      }
+    }
 
-  // Start times of while/NextIteration and while/NextIteration_1 should be
-  // different, so should be those of while/Exit and while/Exit_1.
-  EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
-  EXPECT_NE(exit_start_micro, exit_1_start_micro);
+    EXPECT_EQ(1, num_a);
+    EXPECT_EQ(1, num_less);
+    EXPECT_EQ(1, num_switch);
+    EXPECT_EQ(1, num_first);
+    EXPECT_EQ(0, num_second);
+    EXPECT_EQ(1, num_merge);
 
-  // Checks dependency among the nodes; no matter what scheduling mechanism we
-  // use, the scheduled ops should follow these dependency chains.
-  // We have to break the loop into two parts, identified by Switch outputs.
-  ValidateDependencyChain(
-      start_times,
-      {"Const", "while/Enter", "while/Merge", "while/Less/y", "while/Less",
-       "while/LoopCond", "while/Switch", "while/Exit"});
-  ValidateDependencyChain(start_times, {"while/Identity", "while/add/y",
-                                        "while/add", "while/NextIteration"});
-  ValidateDependencyChain(
-      start_times, {"ones", "while/Enter_1", "while/Merge_1", "while/Switch_1",
-                    "while/Exit_1"});
-  ValidateDependencyChain(start_times, {"while/Identity_1", "while/concat",
-                                        "while/NextIteration_1"});
-  ValidateDependencyChain(
-      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
-  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+    EXPECT_EQ(5, c.execution_time.asMicroSeconds().count());
+    // Second is not executed.
+    EXPECT_EQ(grappler_item_->graph.node_size() - 1, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
 }
 
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
index 4c6a9ba9e052b08918317e75b66d9b446a47b092..220138411ad1291bb5513454c8d205a5e56d4c7e 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
@@ -986,7 +986,7 @@ TEST_F(SignatureTest, ComputeOneRoundLinear) {
   EXPECT_THAT(hash_size, ElementsAre(4, 5, 5, 6, 6));
 }
 
-// On a linear topology where the cental node has been already marked as unique
+// On a linear topology where the central node has been already marked as unique
 // (yeah, not a very realistic case but tests the situations when the
 // disconnected subgraphs get created).
 TEST_F(SignatureTest, ComputeOneRoundSplitLinear) {
diff --git a/tensorflow/core/grappler/graph_topology_view.cc b/tensorflow/core/grappler/graph_topology_view.cc
index 38ccfbaeb88fc9a21f83ca86482a75e9187ab382..86d86c3aa722f7b3b9d918c8f0dad57849a1c01e 100644
--- a/tensorflow/core/grappler/graph_topology_view.cc
+++ b/tensorflow/core/grappler/graph_topology_view.cc
@@ -40,7 +40,8 @@ inline void SortAndRemoveDuplicates(T* v) {
 
 Status GraphTopologyView::InitializeFromGraph(
     const GraphDef& graph,
-    const absl::Span<const GraphView::Edge> ephemeral_edges) {
+    const absl::Span<const GraphView::Edge> ephemeral_edges,
+    bool ignore_control_edges) {
   if (graph_ != nullptr) {
     return errors::InvalidArgument("GraphTopologyView is already initialized.");
   }
@@ -62,19 +63,39 @@ Status GraphTopologyView::InitializeFromGraph(
   // 1. Add ephemeral edges to the adjacency lists.
   for (const GraphView::Edge& edge : ephemeral_edges) {
     const auto src = node_name_to_index_.find(edge.src.node->name());
-    if (src == node_name_to_index_.end()) {
-      return errors::InvalidArgument("Non-existent src node: ",
-                                     edge.src.node->name());
+    const bool valid_src = src != node_name_to_index_.end();
+    if (!valid_src) {
+      const string error_message =
+          absl::StrCat("Non-existent src node: ", edge.src.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
     }
+
     const auto dst = node_name_to_index_.find(edge.dst.node->name());
-    if (dst == node_name_to_index_.end()) {
-      return errors::InvalidArgument("Non-existent dst node: ",
-                                     edge.dst.node->name());
+    const bool valid_dst = dst != node_name_to_index_.end();
+
+    if (!valid_dst) {
+      const string error_message =
+          absl::StrCat("Non-existent dst node: ", edge.dst.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
+    }
+
+    if (valid_dst && valid_src) {
+      const int src_idx = src->second;
+      const int dst_idx = dst->second;
+      if (ignore_control_edges && (src_idx < 0 || dst_idx < 0)) {
+        continue;
+      }
+      fanins_[dst_idx].push_back(src_idx);
+      fanouts_[src_idx].push_back(dst_idx);
     }
-    const int src_idx = src->second;
-    const int dst_idx = dst->second;
-    fanins_[dst_idx].push_back(src_idx);
-    fanouts_[src_idx].push_back(dst_idx);
   }
 
   // 2. Add graph edges to the adjacency lists.
@@ -84,14 +105,27 @@ Status GraphTopologyView::InitializeFromGraph(
 
     for (const string& input : node.input()) {
       TensorId tensor = ParseTensorName(input);
+      if (ignore_control_edges && IsTensorIdControl(tensor)) {
+        continue;
+      }
       const auto it = node_name_to_index_.find(tensor.node());
-      if (it == node_name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent input ", input,
-                                       " for node ", node.name());
+      const bool valid_input = it != node_name_to_index_.end();
+
+      if (!valid_input) {
+        const string error_message = absl::StrCat("Non-existent input ", input,
+                                                  " in node ", node.name());
+        if (skip_invalid_edges_) {
+          VLOG(3) << "Skip error: " << error_message;
+        } else {
+          return errors::InvalidArgument(error_message);
+        }
+      }
+
+      if (valid_input) {
+        const int input_idx = it->second;
+        fanins_[node_idx].push_back(input_idx);
+        fanouts_[input_idx].push_back(node_idx);
       }
-      const int input_idx = it->second;
-      fanins_[node_idx].push_back(input_idx);
-      fanouts_[input_idx].push_back(node_idx);
     }
 
     // Dedup the input list while it's still hot in cache.
@@ -106,8 +140,22 @@ Status GraphTopologyView::InitializeFromGraph(
   return Status::OK();
 }
 
+Status GraphTopologyView::InitializeFromGraph(
+    const GraphDef& graph,
+    const absl::Span<const GraphView::Edge> ephemeral_edges) {
+  return InitializeFromGraph(graph, ephemeral_edges,
+                             /*ignore_control_edges=*/false);
+}
+
+Status GraphTopologyView::InitializeFromGraph(const GraphDef& graph,
+                                              bool ignore_control_edges) {
+  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>(),
+                             ignore_control_edges);
+}
+
 Status GraphTopologyView::InitializeFromGraph(const GraphDef& graph) {
-  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>());
+  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>(),
+                             /*ignore_control_edges*/ false);
 }
 
 bool GraphTopologyView::HasNode(const absl::string_view node_name) const {
diff --git a/tensorflow/core/grappler/graph_topology_view.h b/tensorflow/core/grappler/graph_topology_view.h
index 1c222df4b60951eb38b8e24411c4807e4fe4885d..cdb2eeb92a9726b77cca59a5ed8debf94c53d640 100644
--- a/tensorflow/core/grappler/graph_topology_view.h
+++ b/tensorflow/core/grappler/graph_topology_view.h
@@ -38,7 +38,7 @@ namespace grappler {
 //   b = Placeholder(..)
 //   c = AddN([a, a, b])
 //
-// GraphView edges:         [a:0 -> c:0, a:0 -> c:1, b:0 -> c:3]
+// GraphView edges:         [a:0 -> c:0, a:0 -> c:1, b:0 -> c:2]
 // GraphTopologyView edges: [a -> c, b -> c]
 //
 // GraphView is used for exploring single node fanins and fanouts, and
@@ -47,14 +47,20 @@ namespace grappler {
 class GraphTopologyView {
  public:
   GraphTopologyView() = default;
+  explicit GraphTopologyView(bool skip_invalid_edges)
+      : skip_invalid_edges_(skip_invalid_edges) {}
 
   // Initialize graph topology view from the graph. It's possible to pass
   // additional edges that do not exist in a graph, but must be respected when
   // computing graph topology. Example: Tensorflow runtime allows concurrent
   // execution of dequeue/enqueue ops from the same queue resource, but we might
   // want to enforce ordering between them for the purpose of graph analysis.
+  Status InitializeFromGraph(const GraphDef& graph,
+                             absl::Span<const GraphView::Edge> ephemeral_edges,
+                             bool ignore_control_edges);
   Status InitializeFromGraph(const GraphDef& graph,
                              absl::Span<const GraphView::Edge> ephemeral_edges);
+  Status InitializeFromGraph(const GraphDef& graph, bool ignore_control_edges);
   Status InitializeFromGraph(const GraphDef& graph);
 
   bool is_initialized() const { return graph_ != nullptr; }
@@ -84,6 +90,10 @@ class GraphTopologyView {
   const absl::InlinedVector<int, 2>& GetFanout(int node_idx) const;
 
  private:
+  // If true, all invalid edges and inputs (srd, dst or input node not found in
+  // a graph) will be skipped, otherwise initialization will fail with error.
+  bool skip_invalid_edges_ = false;
+
   // WARN: `graph_` must outlive this object and graph nodes must not be
   // destructed, because node names captured with absl::string_view.
   const GraphDef* graph_ = nullptr;  // do not own
diff --git a/tensorflow/core/grappler/graph_topology_view_test.cc b/tensorflow/core/grappler/graph_topology_view_test.cc
index 36d3a2017cc5ef965a26b0bdbbbdde441fb633db..4d93eaa0b198b8c7e9a29f8aac7270b01edf2d45 100644
--- a/tensorflow/core/grappler/graph_topology_view_test.cc
+++ b/tensorflow/core/grappler/graph_topology_view_test.cc
@@ -113,5 +113,41 @@ TEST_F(GraphTopologyViewTest, GraphWithALoop) {
   EXPECT_EQ(graph_view.GetFanout(3), Fanout({2}));
 }
 
+TEST_F(GraphTopologyViewTest, GraphWithControls) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},                // idx: 0
+      {"b", {}},                // idx: 1
+      {"c", {"a", "b", "^d"}},  // idx: 2
+      {"d", {"a", "c"}},        // idx: 3
+  });
+
+  {
+    GraphTopologyView graph_view;
+    TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+    EXPECT_TRUE(graph_view.is_initialized());
+
+    using Fanin = absl::InlinedVector<int, 4>;
+    EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1, 3}));
+    EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+    using Fanout = absl::InlinedVector<int, 2>;
+    EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+    EXPECT_EQ(graph_view.GetFanout(3), Fanout({2}));
+  }
+  {
+    GraphTopologyView graph_view;
+    TF_CHECK_OK(
+        graph_view.InitializeFromGraph(graph, /*ignore_controls*/ true));
+    EXPECT_TRUE(graph_view.is_initialized());
+    using Fanin = absl::InlinedVector<int, 4>;
+    EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1}));
+    EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+    using Fanout = absl::InlinedVector<int, 2>;
+    EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+    EXPECT_EQ(graph_view.GetFanout(3), Fanout({}));
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index dc4ab93894c4d85038efa8c3052a06f9e5e55d1d..63c58a0aede059c6def5eca322ce3c491ea709b7 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -143,13 +143,20 @@ class GraphViewInternal {
 
   // Gets the output port(s) in the immediate fanin of an input port.
   absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
-    if (port.port_id >= 0) return {GetRegularFanin(port)};
+    if (port.port_id >= 0) {
+      OutputPort regular_fanin = GetRegularFanin(port);
+      if (regular_fanin.node == nullptr) {
+        return {};
+      }
+      return {regular_fanin};
+    }
 
     // Collect fanin for the control input.
     absl::flat_hash_set<OutputPort> result;
-    for (int i = port.node->input_size() - 1; i >= 0; --i) {
+    const int first_control_port =
+        gtl::FindWithDefault(max_regular_input_port_, port.node, -1) + 1;
+    for (int i = first_control_port; i < port.node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(port.node->input(i));
-      if (tensor_id.index() >= 0) break;  // we reached regular inputs
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -158,22 +165,36 @@ class GraphViewInternal {
   }
 
   // Special case: regular (i.e. non-control) input ports can only have one
-  // fanin.
+  // fanin. If port.port_id is out of range or is a control dependency, then an
+  // empty OutputPort is returned.
   const OutputPort GetRegularFanin(const InputPort& port) const {
-    DCHECK_GE(port.port_id, 0);
-    if (port.port_id < 0) return OutputPort();
+    if (port.port_id < 0 ||
+        port.port_id >
+            gtl::FindWithDefault(max_regular_input_port_, port.node, -1)) {
+      return OutputPort();
+    }
 
     TensorId tensor_id = ParseTensorName(port.node->input(port.port_id));
     return GetOutputPort(tensor_id.node(), tensor_id.index());
   }
 
   // Checks if a tensor id is a fanin of the node.
-  bool HasFanin(const NodeDef& node, const TensorId& fanin) const {
-    if (fanin.index() < -1) {
+  bool HasFanin(const NodeDefT& node, const TensorId& fanin) const {
+    int end = node.input_size();
+    if (end == 0 || fanin.index() < -1) {
       return false;
     }
-    for (const string& input : node.input()) {
-      if (ParseTensorName(input) == fanin) {
+
+    const int num_regular_fanins =
+        gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
+    int start = 0;
+    if (fanin.index() > -1) {
+      end = num_regular_fanins;
+    } else {
+      start = num_regular_fanins;
+    }
+    for (int i = start; i < end; ++i) {
+      if (ParseTensorName(node.input(i)) == fanin) {
         return true;
       }
     }
@@ -183,14 +204,14 @@ class GraphViewInternal {
   // Gets all the input ports in the immediate fanout of a node. Include the
   // controlled nodes iff include_controlled_nodes is true.
   absl::flat_hash_set<InputPort> GetFanouts(
-      const NodeDef& node, bool include_controlled_nodes) const {
+      const NodeDefT& node, bool include_controlled_nodes) const {
     absl::flat_hash_set<InputPort> result;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -205,11 +226,14 @@ class GraphViewInternal {
   // Gets all the output ports in the immediate fanin of a node. Include the
   // controlling nodes iff include_controlling_nodes is true.
   absl::flat_hash_set<OutputPort> GetFanins(
-      const NodeDef& node, bool include_controlling_nodes) const {
+      const NodeDefT& node, bool include_controlling_nodes) const {
     absl::flat_hash_set<OutputPort> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_nodes
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_nodes) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -219,27 +243,23 @@ class GraphViewInternal {
 
   // Gets the number of ports in the immediate fanin of a node. Count the
   // controlling nodes iff include_controlling_nodes is true.
-  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const {
-    int count = 0;
-    for (const string& input : node.input()) {
-      if (!include_controlling_nodes && IsControlInput(input)) {
-        break;
-      }
-      count += 1;
+  int NumFanins(const NodeDefT& node, bool include_controlling_nodes) const {
+    if (include_controlling_nodes) {
+      return node.input_size();
     }
-    return count;
+    return gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
   }
 
   // Gets the number of ports in the immediate fanout of a node. Count the
   // controlled nodes iff include_controlled_nodes is true.
-  int NumFanouts(const NodeDef& node, bool include_controlled_nodes) const {
+  int NumFanouts(const NodeDefT& node, bool include_controlled_nodes) const {
     int count = 0;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -253,7 +273,7 @@ class GraphViewInternal {
   // Gets all the edges in the immediate fanout of a node. Include the
   // controlled edges iff include_controlled_edges is true.
   absl::flat_hash_set<Edge> GetFanoutEdges(
-      const NodeDef& node, bool include_controlled_edges) const {
+      const NodeDefT& node, bool include_controlled_edges) const {
     absl::flat_hash_set<Edge> result;
 
     OutputPort port;
@@ -267,8 +287,7 @@ class GraphViewInternal {
       auto it = fanouts_.find(port);
       if (it != fanouts_.end()) {
         for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
-          result.emplace(/*src=*/OutputPort(const_cast<NodeDefT*>(&node), i),
-                         /*dst=*/*itr);
+          result.emplace(/*src=*/port, /*dst=*/*itr);
         }
       }
     }
@@ -278,11 +297,14 @@ class GraphViewInternal {
   // Gets all the edges in the immediate fanin of a node. Include the
   // controlling edges iff include_controlling_edges is true.
   absl::flat_hash_set<Edge> GetFaninEdges(
-      const NodeDef& node, bool include_controlling_edges) const {
+      const NodeDefT& node, bool include_controlling_edges) const {
     absl::flat_hash_set<Edge> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_edges
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_edges) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) {
@@ -296,14 +318,24 @@ class GraphViewInternal {
  protected:
   explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {}
 
+  Status AddUniqueNode(NodeDefT* node) {
+    auto inserted = nodes_.emplace(node->name(), node);
+    return inserted.second
+               ? Status::OK()
+               : errors::InvalidArgument("Non unique node name detected: ",
+                                         node->name());
+  }
+
+  // TODO(ezhulenev): Remove this function.
   void AddUniqueNodeOrDie(NodeDefT* node) {
-    auto result = nodes_.emplace(node->name(), node);
-    // TODO(ezhulenev): Replace CHECK with factory method returning
-    // absl::StatusOr (when available).
-    CHECK(result.second) << "Non unique node name detected: " << node->name();
+    Status st = AddUniqueNode(node);
+    CHECK(st.ok()) << st.error_message();
   }
 
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
   void AddFanouts(NodeDefT* node) {
+    int max_input_port = -1;
     for (int i = 0; i < node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(node->input(i));
       OutputPort output(nodes_[tensor_id.node()], tensor_id.index());
@@ -311,11 +343,15 @@ class GraphViewInternal {
       if (output.port_id < 0) {
         fanouts_[output].emplace(node, -1);
       } else {
+        max_input_port = i;
         max_regular_output_port_[output.node] =
             std::max(max_regular_output_port_[output.node], output.port_id);
         fanouts_[output].emplace(node, i);
       }
     }
+    if (max_input_port > -1) {
+      max_regular_input_port_[node] = max_input_port;
+    }
   }
 
   // Access to the mutable internal state for MutableGraphView.
@@ -325,7 +361,11 @@ class GraphViewInternal {
     return fanouts_;
   }
 
-  absl::flat_hash_map<const NodeDef*, int>& max_regular_output_port() {
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_input_port() {
+    return max_regular_input_port_;
+  }
+
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_output_port() {
     return max_regular_output_port_;
   }
 
@@ -338,10 +378,13 @@ class GraphViewInternal {
   // A mapping from the output port to all inputs that read from it.
   absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>> fanouts_;
 
+  // Keep a maximum index of input tensors of the node.
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_input_port_;
+
   // Keep a maximum index of tensor fetched from the node. It doesn't guarantee
   // that all tensors in the [0, max_regular_output_port] range are actually
   // fetched by other nodes.
-  absl::flat_hash_map<const NodeDef*, int> max_regular_output_port_;
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_output_port_;
 
   // If the node has no fanouts at given output port (output tensor consumers)
   // we return a reference to this set from `GetFanout` (we can't construct new
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index 404dcd30c12781f2f9581ac6a1cb5986bb75f187..0036719fc51999e87e0869717e2b08cdb6cf96a3 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -18,9 +18,12 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/cc/ops/parsing_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/graph/benchmark_testlib.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -42,26 +45,24 @@ TEST_F(GraphViewTest, OpPortIdToArgIdShapeN) {
 
   const OpDef* a_op_def = nullptr;
   const OpDef* b_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def).ok());
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
   // Const has 0 inputs, 1 output.
-  EXPECT_EQ(-1, OpInputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(a_node_def, *a_op_def, 1));
+  EXPECT_EQ(OpInputPortIdToArgId(a_node_def, *a_op_def, 0), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 1), -1);
 
   // ShapeN has N=3 inputs and outputs.
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 4));
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 }
 
 TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
@@ -76,22 +77,21 @@ TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
 
     const NodeDef& b_node_def = *graph_view.GetNode("b");
     const OpDef* b_op_def = nullptr;
-    EXPECT_TRUE(
-        OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+    TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
     // We have 4 inputs.
-    EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-    EXPECT_EQ(1, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-    EXPECT_EQ(2, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-    EXPECT_EQ(3, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-    EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 4));
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 1);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 2);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), 3);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 
     for (int port_id = 0; port_id <= num_splits * 3; ++port_id) {
       int arg_id = -1;
       if (port_id < num_splits * 3) {
         arg_id = port_id / num_splits;
       }
-      EXPECT_EQ(arg_id, OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id));
+      EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id), arg_id);
     }
   }
 }
@@ -110,18 +110,17 @@ TEST_F(GraphViewTest, ParseSingleExample) {
   const NodeDef& c_node_def = *graph_view.GetNode("c");
 
   const OpDef* c_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def).ok());
-
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 1));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 2));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 3));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 4));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 5));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 6));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 7));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 8));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def));
+
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 2), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 3), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 4), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 5), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 6), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 7), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 8), -1);
 }
 
 TEST_F(GraphViewTest, BasicGraph) {
@@ -132,26 +131,26 @@ TEST_F(GraphViewTest, BasicGraph) {
   GraphView graph(&item.graph);
 
   GraphView::InputPort input = graph.GetInputPort("AddN", 0);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 0);
   GraphView::OutputPort fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square");
+  EXPECT_EQ(fanin.port_id, 0);
 
   input = graph.GetInputPort("AddN", 1);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square_1", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square_1");
+  EXPECT_EQ(fanin.port_id, 0);
 
   GraphView::OutputPort output = graph.GetOutputPort("AddN", 0);
-  EXPECT_EQ("AddN", output.node->name());
-  EXPECT_EQ(0, output.port_id);
-  EXPECT_EQ(2, graph.GetFanout(output).size());
+  EXPECT_EQ(output.node->name(), "AddN");
+  EXPECT_EQ(output.port_id, 0);
+  EXPECT_EQ(graph.GetFanout(output).size(), 2);
   for (auto fanout : graph.GetFanout(output)) {
     if (fanout.node->name() == "AddN_2" || fanout.node->name() == "AddN_3") {
-      EXPECT_EQ(0, fanout.port_id);
+      EXPECT_EQ(fanout.port_id, 0);
     } else {
       // Invalid fanout
       EXPECT_FALSE(true);
@@ -159,7 +158,7 @@ TEST_F(GraphViewTest, BasicGraph) {
   }
 
   const NodeDef* add_node = graph.GetNode("AddN");
-  EXPECT_NE(nullptr, add_node);
+  EXPECT_NE(add_node, nullptr);
 
   absl::flat_hash_set<string> fanouts;
   absl::flat_hash_set<string> expected_fanouts = {"AddN_2:0", "AddN_3:0"};
@@ -190,44 +189,44 @@ TEST_F(GraphViewTest, ControlDependencies) {
   GraphView graph(&item.graph);
 
   GraphView::OutputPort output = graph.GetOutputPort("a", -1);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(-1, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, -1);
   auto fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("d", (*fanout.begin()).node->name());
-  EXPECT_EQ(-1, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "d");
+  EXPECT_EQ((*fanout.begin()).port_id, -1);
 
   output = graph.GetOutputPort("a", 0);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(0, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, 0);
   fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("b", (*fanout.begin()).node->name());
-  EXPECT_EQ(0, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "b");
+  EXPECT_EQ((*fanout.begin()).port_id, 0);
 
   GraphView::InputPort input = graph.GetInputPort("d", -1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(-1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, -1);
   auto fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("a", (*fanin.begin()).node->name());
-  EXPECT_EQ(-1, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "a");
+  EXPECT_EQ((*fanin.begin()).port_id, -1);
 
   input = graph.GetInputPort("d", 0);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 0);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("b", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "b");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
 
   input = graph.GetInputPort("d", 1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("c", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "c");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
 }
 
 TEST_F(GraphViewTest, HasNode) {
@@ -238,8 +237,8 @@ TEST_F(GraphViewTest, HasNode) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   GraphView graph(&item.graph);
 
-  EXPECT_EQ(true, graph.HasNode("a"));
-  EXPECT_EQ(false, graph.HasNode("b"));
+  EXPECT_EQ(graph.HasNode("a"), true);
+  EXPECT_EQ(graph.HasNode("b"), false);
 }
 
 TEST_F(GraphViewTest, HasFanin) {
@@ -254,16 +253,252 @@ TEST_F(GraphViewTest, HasFanin) {
   GraphView graph(&item.graph);
 
   const NodeDef* d_node = graph.GetNode("d");
-  EXPECT_NE(nullptr, d_node);
-
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"a", Graph::kControlSlot}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"a", 0}));
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"b", 0}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"b", Graph::kControlSlot}));
-  EXPECT_EQ(true, graph.HasFanin(*d_node, {"c", 0}));
-  EXPECT_EQ(false, graph.HasFanin(*d_node, {"c", Graph::kControlSlot}));
+  EXPECT_NE(d_node, nullptr);
+
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", Graph::kControlSlot}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", 0}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", Graph::kControlSlot}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", Graph::kControlSlot}), false);
 }
 
+TEST_F(GraphViewTest, GetRegularFaninPortOutOfBounds) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  const NodeDef* b_node = graph.GetNode("b");
+  EXPECT_NE(b_node, nullptr);
+  const NodeDef* c_node = graph.GetNode("c");
+  EXPECT_NE(c_node, nullptr);
+  const NodeDef* d_node = graph.GetNode("d");
+  EXPECT_NE(d_node, nullptr);
+
+  auto d_output_0 = graph.GetRegularFanin({d_node, 0});
+  EXPECT_EQ(d_output_0, GraphView::OutputPort(b_node, 0));
+  auto d_output_1 = graph.GetRegularFanin({d_node, 1});
+  EXPECT_EQ(d_output_1, GraphView::OutputPort(c_node, 0));
+  auto d_output_2 = graph.GetRegularFanin({d_node, 2});
+  EXPECT_EQ(d_output_2, GraphView::OutputPort());
+  auto d_output_control = graph.GetRegularFanin({d_node, Graph::kControlSlot});
+  EXPECT_EQ(d_output_control, GraphView::OutputPort());
+}
+
+static void BM_GraphViewConstruction(int iters, int num_nodes,
+                                     int num_edges_per_node) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateGraphDef(num_nodes, num_edges_per_node);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    GraphView graph_view(&graph_def);
+  }
+  testing::StopTiming();
+}
+
+BENCHMARK(BM_GraphViewConstruction)
+    ->ArgPair(10, 2)
+    ->ArgPair(100, 2)
+    ->ArgPair(1000, 2)
+    ->ArgPair(10000, 2)
+    ->ArgPair(25000, 2)
+    ->ArgPair(50000, 2)
+    ->ArgPair(100000, 2)
+    ->ArgPair(10, 4)
+    ->ArgPair(100, 4)
+    ->ArgPair(1000, 4)
+    ->ArgPair(10000, 4)
+    ->ArgPair(25000, 4)
+    ->ArgPair(50000, 4)
+    ->ArgPair(100000, 4)
+    ->ArgPair(10, 8)
+    ->ArgPair(100, 8)
+    ->ArgPair(1000, 8)
+    ->ArgPair(10000, 8)
+    ->ArgPair(25000, 8)
+    ->ArgPair(50000, 8)
+    ->ArgPair(100000, 8)
+    ->ArgPair(10, 16)
+    ->ArgPair(100, 16)
+    ->ArgPair(1000, 16)
+    ->ArgPair(10000, 16)
+    ->ArgPair(25000, 16)
+    ->ArgPair(50000, 16)
+    ->ArgPair(100000, 16);
+
+static void BM_GraphViewGetNode(int iters, int num_nodes) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateGraphDef(num_nodes, /*num_edges_per_node=*/16);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    graph_view.GetNode("out");
+  }
+  testing::StopTiming();
+}
+
+BENCHMARK(BM_GraphViewGetNode)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(25000)
+    ->Arg(50000)
+    ->Arg(100000);
+
+#define RUN_FANIN_FANOUT_BENCHMARK(name) \
+  BENCHMARK(name)                        \
+      ->ArgPair(10, 10)                  \
+      ->ArgPair(10, 100)                 \
+      ->ArgPair(10, 1000)                \
+      ->ArgPair(10, 10000)               \
+      ->ArgPair(10, 100000)              \
+      ->ArgPair(100, 10)                 \
+      ->ArgPair(100, 100)                \
+      ->ArgPair(100, 1000)               \
+      ->ArgPair(100, 10000)              \
+      ->ArgPair(100, 100000)             \
+      ->ArgPair(1000, 10)                \
+      ->ArgPair(1000, 100)               \
+      ->ArgPair(1000, 1000)              \
+      ->ArgPair(1000, 10000)             \
+      ->ArgPair(1000, 100000)            \
+      ->ArgPair(10000, 10)               \
+      ->ArgPair(10000, 100)              \
+      ->ArgPair(10000, 1000)             \
+      ->ArgPair(10000, 10000)            \
+      ->ArgPair(10000, 100000)           \
+      ->ArgPair(100000, 10)              \
+      ->ArgPair(100000, 100)             \
+      ->ArgPair(100000, 1000)            \
+      ->ArgPair(100000, 10000)           \
+      ->ArgPair(100000, 100000);
+
+static void BM_GraphViewGetFanout(int iters, int num_fanins, int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFanout({node, 0});
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFanout);
+
+static void BM_GraphViewGetFanin(int iters, int num_fanins, int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFanin({node, 0});
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFanin);
+
+static void BM_GraphViewGetRegularFanin(int iters, int num_fanins,
+                                        int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetRegularFanin({node, 0});
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetRegularFanin);
+
+static void BM_GraphViewGetFanouts(int iters, int num_fanins, int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFanouts(*node, /*include_controlled_nodes=*/false);
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFanouts);
+
+static void BM_GraphViewGetFanins(int iters, int num_fanins, int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFanins(*node, /*include_controlling_nodes=*/false);
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFanins);
+
+static void BM_GraphViewGetFanoutEdges(int iters, int num_fanins,
+                                       int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFanoutEdges(*node, /*include_controlled_edges=*/false);
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFanoutEdges);
+
+static void BM_GraphViewGetFaninEdges(int iters, int num_fanins,
+                                      int num_fanouts) {
+  testing::StopTiming();
+  const GraphDef graph_def =
+      test::CreateFaninFanoutNodeGraph(num_fanins, num_fanouts);
+  GraphView graph_view(&graph_def);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    const NodeDef* node = graph_view.GetNode("node");
+    graph_view.GetFaninEdges(*node, /*include_controlling_edges=*/false);
+  }
+  testing::StopTiming();
+}
+
+RUN_FANIN_FANOUT_BENCHMARK(BM_GraphViewGetFaninEdges);
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 1323dd9a64b1aea2b2e3a5605c30c234bd315b04..bc95c9cf72ab06ce8f3ed0126ad42f62cfe2ace7 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -115,11 +115,12 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
     }
   }
 
-  // Tensorflow functions do not prune side effects, or dataset-output ops from
+  // Tensorflow functions do not prune stateful or dataset-output ops from
   // the function body (see PruneFunctionBody in common_runtime/function.cc).
-  if (optimization_options_.is_function_instantiation) {
+  if (!optimization_options_.allow_pruning_stateful_and_dataset_ops) {
+    FunctionLibraryDefinition fn_library(OpRegistry::Global(), graph.library());
     for (const NodeDef& node : graph.node()) {
-      if (!IsFreeOfSideEffect(node) || IsDataset(node)) {
+      if (IsStateful(node, &fn_library) || IsDataset(node)) {
         result.insert(node.name());
       }
     }
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 75712e9f92cc47007caae65be9a4e265458fa619..57949b322d61273d607b50c27d995db79cbc9391 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -91,7 +91,13 @@ struct GrapplerItem {
     // by running Grappler optimizer passes. One main difference is that
     // functions do not prune ops with side-effects and dataset-output ops (see
     // PruneFunctionBody in common_runtime/function.cc).
-    bool is_function_instantiation = false;
+    bool allow_pruning_stateful_and_dataset_ops = true;
+
+    // If true Grappler will optimize the main graph, and also all functions in
+    // the graph function library (function can't be polymorphic, it can't have
+    // undefined type parameters in the function signature, or placeholder
+    // attributes in the function body).
+    bool optimize_function_library = true;
   };
 
   const std::unordered_set<string>& devices() const;
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index ffa204028cca828147810c99277fdcd9cb05f5ee..286c30cd356baf408bb227236d9369f81ab8b1ad 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -49,7 +49,11 @@ cc_library(
     deps = [
         ":input_yielder",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/kernels:aggregate_ops",
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index e31eedf1661a34efe18d91dad508ba590b258237..1200cff712717b40eafbd468d86261f81865b84a 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -18,10 +18,12 @@ limitations under the License.
 #include <algorithm>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
@@ -38,26 +40,6 @@ namespace grappler {
 
 namespace {
 
-const char kMissingMsg[] = "missing";
-const char kInvalidMsg[] = "invalid";
-const char kNoErrMsg[] = "";
-
-string FaninError(bool tensor_id_valid, bool node_missing) {
-  string s;
-  if (!tensor_id_valid && node_missing) {
-    s = absl::StrCat(" ", kInvalidMsg, "/", kMissingMsg);
-  } else if (!tensor_id_valid) {
-    s = absl::StrCat(" ", kInvalidMsg);
-  } else if (node_missing) {
-    s = absl::StrCat(" ", kMissingMsg);
-  }
-  return s;
-}
-
-string NodeError(bool node_missing) {
-  return node_missing ? absl::StrCat(" ", kMissingMsg) : kNoErrMsg;
-}
-
 bool IsTensorIdPortValid(const TensorId& tensor_id) {
   return tensor_id.index() >= Graph::kControlSlot;
 }
@@ -70,10 +52,6 @@ bool IsTensorIdControlling(const TensorId& tensor_id) {
   return tensor_id.index() == Graph::kControlSlot;
 }
 
-bool IsOutputPortRegular(const MutableGraphView::OutputPort& port) {
-  return port.port_id > Graph::kControlSlot;
-}
-
 bool IsOutputPortControlling(const MutableGraphView::OutputPort& port) {
   return port.port_id == Graph::kControlSlot;
 }
@@ -111,16 +89,258 @@ bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
 bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
                                      absl::string_view control_node_name) {
   NodeDef* control_node = graph.GetNode(control_node_name);
+  DCHECK(control_node != nullptr)
+      << "Didn't find a node for control dependency: " << control_node_name;
   return CanDedupControlWithRegularInput(graph, *control_node);
 }
 
+bool HasRegularFaninNode(const MutableGraphView& graph, const NodeDef& node,
+                         absl::string_view fanin_node_name) {
+  const int num_regular_fanins =
+      graph.NumFanins(node, /*include_controlling_nodes=*/false);
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    if (ParseTensorName(node.input(i)).node() == fanin_node_name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+using FanoutsMap =
+    absl::flat_hash_map<MutableGraphView::OutputPort,
+                        absl::flat_hash_set<MutableGraphView::InputPort>>;
+
+void SwapControlledFanoutInputs(const MutableGraphView& graph,
+                                const FanoutsMap::iterator& control_fanouts,
+                                absl::string_view to_node_name) {
+  absl::string_view from_node_name(control_fanouts->first.node->name());
+  string control = TensorIdToString({to_node_name, Graph::kControlSlot});
+  for (const auto& control_fanout : control_fanouts->second) {
+    const int start = graph.NumFanins(*control_fanout.node,
+                                      /*include_controlling_nodes=*/false);
+    for (int i = start; i < control_fanout.node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(control_fanout.node->input(i));
+      if (tensor_id.node() == from_node_name) {
+        control_fanout.node->set_input(i, control);
+        break;
+      }
+    }
+  }
+}
+
+void SwapRegularFanoutInputs(FanoutsMap* fanouts, NodeDef* from_node,
+                             absl::string_view to_node_name, int max_port) {
+  MutableGraphView::OutputPort port;
+  port.node = from_node;
+  for (int i = 0; i <= max_port; ++i) {
+    port.port_id = i;
+    auto it = fanouts->find(port);
+    if (it == fanouts->end()) {
+      continue;
+    }
+    string input = TensorIdToString({to_node_name, i});
+    for (const auto& fanout : it->second) {
+      fanout.node->set_input(fanout.port_id, input);
+    }
+  }
+}
+
+using MaxOutputPortsMap = absl::flat_hash_map<const NodeDef*, int>;
+
+void SwapFanoutInputs(const MutableGraphView& graph, FanoutsMap* fanouts,
+                      MaxOutputPortsMap* max_output_ports, NodeDef* from_node,
+                      NodeDef* to_node) {
+  auto from_control_fanouts = fanouts->find({from_node, Graph::kControlSlot});
+  if (from_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, from_control_fanouts, to_node->name());
+  }
+  auto to_control_fanouts = fanouts->find({to_node, Graph::kControlSlot});
+  if (to_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, to_control_fanouts, from_node->name());
+  }
+  auto from_max_port = max_output_ports->find(from_node);
+  if (from_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, from_node, to_node->name(),
+                            from_max_port->second);
+  }
+  auto to_max_port = max_output_ports->find(to_node);
+  if (to_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, to_node, from_node->name(),
+                            to_max_port->second);
+  }
+}
+
+void SwapFanoutsMapValues(FanoutsMap* fanouts,
+                          const MutableGraphView::OutputPort& from_port,
+                          const FanoutsMap::iterator& from_fanouts,
+                          const MutableGraphView::OutputPort& to_port,
+                          const FanoutsMap::iterator& to_fanouts) {
+  const bool from_exists = from_fanouts != fanouts->end();
+  const bool to_exists = to_fanouts != fanouts->end();
+
+  if (from_exists && to_exists) {
+    std::swap(from_fanouts->second, to_fanouts->second);
+  } else if (from_exists) {
+    fanouts->emplace(to_port, std::move(from_fanouts->second));
+    fanouts->erase(from_port);
+  } else if (to_exists) {
+    fanouts->emplace(from_port, std::move(to_fanouts->second));
+    fanouts->erase(to_port);
+  }
+}
+
+void SwapRegularFanoutsAndMaxPortValues(FanoutsMap* fanouts,
+                                        MaxOutputPortsMap* max_output_ports,
+                                        NodeDef* from_node, NodeDef* to_node) {
+  auto from_max_port = max_output_ports->find(from_node);
+  auto to_max_port = max_output_ports->find(to_node);
+  bool from_exists = from_max_port != max_output_ports->end();
+  bool to_exists = to_max_port != max_output_ports->end();
+
+  auto forward_fanouts = [fanouts](NodeDef* from, NodeDef* to, int start,
+                                   int end) {
+    for (int i = start; i <= end; ++i) {
+      MutableGraphView::OutputPort from_port(from, i);
+      auto from_fanouts = fanouts->find(from_port);
+      if (from_fanouts != fanouts->end()) {
+        MutableGraphView::OutputPort to_port(to, i);
+        fanouts->emplace(to_port, std::move(from_fanouts->second));
+        fanouts->erase(from_port);
+      }
+    }
+  };
+
+  if (from_exists && to_exists) {
+    const int from = from_max_port->second;
+    const int to = to_max_port->second;
+    const int shared = std::min(from, to);
+    for (int i = 0; i <= shared; ++i) {
+      MutableGraphView::OutputPort from_port(from_node, i);
+      auto from_fanouts = fanouts->find(from_port);
+      MutableGraphView::OutputPort to_port(to_node, i);
+      auto to_fanouts = fanouts->find(to_port);
+      SwapFanoutsMapValues(fanouts, from_port, from_fanouts, to_port,
+                           to_fanouts);
+    }
+    if (to > from) {
+      forward_fanouts(to_node, from_node, shared + 1, to);
+    } else if (from > to) {
+      forward_fanouts(from_node, to_node, shared + 1, from);
+    }
+
+    std::swap(from_max_port->second, to_max_port->second);
+  } else if (from_exists) {
+    forward_fanouts(from_node, to_node, 0, from_max_port->second);
+
+    max_output_ports->emplace(to_node, from_max_port->second);
+    max_output_ports->erase(from_node);
+  } else if (to_exists) {
+    forward_fanouts(to_node, from_node, 0, to_max_port->second);
+
+    max_output_ports->emplace(from_node, to_max_port->second);
+    max_output_ports->erase(to_node);
+  }
+}
+
+bool HasFanoutValue(const FanoutsMap& fanouts, const FanoutsMap::iterator& it) {
+  return it != fanouts.end() && !it->second.empty();
+}
+
+Status MutationError(absl::string_view function_name, absl::string_view params,
+                     absl::string_view msg) {
+  return errors::InvalidArgument(absl::Substitute(
+      "MutableGraphView::$0($1) error: $2.", function_name, params, msg));
+}
+
+using ErrorHandler = std::function<Status(absl::string_view)>;
+
+ErrorHandler UpdateFanoutsError(absl::string_view from_node_name,
+                                absl::string_view to_node_name) {
+  return [from_node_name, to_node_name](absl::string_view msg) {
+    string params = absl::Substitute("from_node_name='$0', to_node_name='$1'",
+                                     from_node_name, to_node_name);
+    return MutationError("UpdateFanouts", params, msg);
+  };
+}
+
+Status CheckFaninIsRegular(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdRegular(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a regular tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckFaninIsValid(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdPortValid(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a valid tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckAddingFaninToSelf(absl::string_view node_name,
+                              const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(
+        absl::Substitute("can't add fanin '$0' to self", fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckRemovingFaninFromSelf(absl::string_view node_name,
+                                  const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(absl::Substitute("can't remove fanin '$0' from self",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+string NodeMissingErrorMsg(absl::string_view node_name) {
+  return absl::Substitute("node '$0' was not found", node_name);
+}
+
+Status CheckNodeExists(absl::string_view node_name, NodeDef* node,
+                       ErrorHandler handler) {
+  if (node == nullptr) {
+    return handler(NodeMissingErrorMsg(node_name));
+  }
+  return Status::OK();
+}
+
+Status CheckPortRange(int port, int min, int max, ErrorHandler handler) {
+  if (port < min || port > max) {
+    if (max < min) {
+      return handler("no available ports as node has no regular fanins");
+    }
+    return handler(
+        absl::Substitute("port must be in range [$0, $1]", min, max));
+  }
+  return Status::OK();
+}
+
+string SwapNodeNamesSwitchControlErrorMsg(absl::string_view node_name) {
+  return absl::Substitute(
+      "can't swap node name '$0' as it will become a Switch control dependency",
+      node_name);
+}
+
+string GeneratedNameForIdentityConsumingSwitch(
+    const MutableGraphView::OutputPort& fanin) {
+  return AddPrefixToNodeName(
+      absl::StrCat(fanin.node->name(), "_", fanin.port_id),
+      kMutableGraphViewCtrl);
+}
+
 }  // namespace
 
 void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
-  // TODO(lyandy): Checks for self loops, Switch control dependencies and if
-  // fanins exist.
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
   absl::flat_hash_set<absl::string_view> fanins;
   absl::flat_hash_set<absl::string_view> controlling_fanins;
+  int max_input_port = -1;
   int pos = 0;
   const int last_idx = node->input_size() - 1;
   int last_pos = last_idx;
@@ -144,6 +364,7 @@ void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
       if (is_control_input) {
         fanouts()[output].emplace(node, Graph::kControlSlot);
       } else {
+        max_input_port = pos;
         max_regular_output_port()[output.node] =
             std::max(max_regular_output_port()[output.node], output.port_id);
         fanouts()[output].emplace(node, pos);
@@ -158,6 +379,10 @@ void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
   if (last_pos < last_idx) {
     node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
   }
+
+  if (max_input_port > -1) {
+    max_regular_input_port()[node] = max_input_port;
+  }
 }
 
 void MutableGraphView::UpdateMaxRegularOutputPortForRemovedFanin(
@@ -181,6 +406,13 @@ void MutableGraphView::UpdateMaxRegularOutputPortForRemovedFanin(
   }
 }
 
+void MutableGraphView::UpdateMaxRegularOutputPortForAddedFanin(
+    const OutputPort& fanin) {
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+}
+
 const absl::flat_hash_set<MutableGraphView::InputPort>&
 MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
   return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
@@ -209,29 +441,307 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
   return node_in_graph;
 }
 
-Status MutableGraphView::UpdateFanouts(absl::string_view from_node,
-                                       absl::string_view to_node) {
-  NodeDef* from_node_ptr = GetNode(from_node);
-  NodeDef* to_node_ptr = GetNode(to_node);
-  if (from_node_ptr && to_node_ptr) {
-    return UpdateFanoutsInternal(from_node_ptr, to_node_ptr);
-  } else if (!from_node_ptr) {
-    return errors::Internal(absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', from node was not found.",
-        from_node, to_node));
-  } else if (!to_node_ptr) {
-    return errors::Internal(absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', to node was not found.",
-        from_node, to_node));
-  } else {
-    return errors::Internal(
-        absl::Substitute("Can't update fanouts from '$0' to '$1', from and to "
-                         "nodes were not found.",
-                         from_node, to_node));
+Status MutableGraphView::AddSubgraph(GraphDef&& subgraph) {
+  // 1. Add all new functions and check that functions with the same name
+  // have identical definition.
+  const int function_size = subgraph.library().function_size();
+  if (function_size > 0) {
+    absl::flat_hash_map<absl::string_view, const FunctionDef*> graph_fdefs;
+    for (const FunctionDef& fdef : graph()->library().function()) {
+      graph_fdefs.emplace(fdef.signature().name(), &fdef);
+    }
+
+    for (FunctionDef& fdef : *subgraph.mutable_library()->mutable_function()) {
+      const auto graph_fdef = graph_fdefs.find(fdef.signature().name());
+
+      if (graph_fdef == graph_fdefs.end()) {
+        VLOG(3) << "Add new function definition: " << fdef.signature().name();
+        graph()->mutable_library()->add_function()->Swap(&fdef);
+      } else {
+        if (!FunctionDefsEqual(fdef, *graph_fdef->second)) {
+          return MutationError(
+              "AddSubgraph",
+              absl::Substitute("function_size=$0", function_size),
+              absl::StrCat(
+                  "Found different function definition with the same name: ",
+                  fdef.signature().name()));
+        }
+      }
+    }
+  }
+
+  // 2. Add all nodes to the underlying graph.
+  int node_size_before = graph()->node_size();
+
+  for (NodeDef& node : *subgraph.mutable_node()) {
+    auto* node_in_graph = graph()->add_node();
+    node_in_graph->Swap(&node);
+    TF_RETURN_IF_ERROR(AddUniqueNode(node_in_graph));
+  }
+
+  // TODO(ezhulenev, lyandy): Right now AddAndDedupFanouts do not check that
+  // fanins actually exists in the graph, and there is already TODO for that.
+
+  for (int i = node_size_before; i < graph()->node_size(); ++i) {
+    NodeDef* node = graph()->mutable_node(i);
+    AddAndDedupFanouts(node);
   }
+
   return Status::OK();
 }
 
+Status MutableGraphView::UpdateNode(
+    absl::string_view node_name, absl::string_view op, absl::string_view device,
+    absl::Span<const std::pair<string, AttrValue>> attrs) {
+  auto error_status = [node_name, op, device, attrs](absl::string_view msg) {
+    std::vector<string> attr_strs;
+    attr_strs.reserve(attrs.size());
+    for (const auto& attr : attrs) {
+      string attr_str = absl::Substitute("('$0', $1)", attr.first,
+                                         attr.second.ShortDebugString());
+      attr_strs.push_back(attr_str);
+    }
+    string params =
+        absl::Substitute("node_name='$0', op='$1', device='$2', attrs={$3}",
+                         node_name, op, device, absl::StrJoin(attr_strs, ", "));
+    return MutationError("UpdateNodeOp", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+
+  MutableGraphView::OutputPort control_port(node, Graph::kControlSlot);
+  auto control_fanouts = GetFanout(control_port);
+  if (op == "Switch" && !control_fanouts.empty()) {
+    return error_status(
+        "can't change node op to Switch when node drives a control dependency "
+        "(alternatively, we could add the identity node needed, but it seems "
+        "like an unlikely event and probably a mistake)");
+  }
+
+  if (node->device() != device) {
+    node->set_device(string(device));
+  }
+  node->mutable_attr()->clear();
+  for (const auto& attr : attrs) {
+    (*node->mutable_attr())[attr.first] = attr.second;
+  }
+
+  if (node->op() == op) {
+    return Status::OK();
+  }
+
+  node->set_op(string(op));
+
+  if (CanDedupControlWithRegularInput(*this, *node)) {
+    for (const auto& control_fanout : control_fanouts) {
+      if (HasRegularFaninNode(*this, *control_fanout.node, node->name())) {
+        RemoveControllingFaninInternal(control_fanout.node, node);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateNodeName(absl::string_view from_node_name,
+                                        absl::string_view to_node_name,
+                                        bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("UpdateNodeName", params, msg);
+  };
+
+  NodeDef* node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, node, error_status));
+
+  if (node->name() == to_node_name) {
+    return Status::OK();
+  }
+  if (HasNode(to_node_name)) {
+    return error_status(
+        "can't update node name because new node name is in use");
+  }
+  auto max_output_port = max_regular_output_port().find(node);
+  const bool has_max_output_port =
+      max_output_port != max_regular_output_port().end();
+  auto control_fanouts = fanouts().find({node, Graph::kControlSlot});
+
+  if (update_fanouts) {
+    SwapControlledFanoutInputs(*this, control_fanouts, to_node_name);
+    if (has_max_output_port) {
+      SwapRegularFanoutInputs(&fanouts(), node, to_node_name,
+                              max_output_port->second);
+    }
+  } else if (has_max_output_port ||
+             HasFanoutValue(fanouts(), control_fanouts)) {
+    return error_status("can't update node name because node has fanouts");
+  }
+
+  nodes().erase(node->name());
+  node->set_name(string(to_node_name));
+  nodes().emplace(node->name(), node);
+  return Status::OK();
+}
+
+Status MutableGraphView::SwapNodeNames(absl::string_view from_node_name,
+                                       absl::string_view to_node_name,
+                                       bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("SwapNodeNames", params, msg);
+  };
+
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, from_node, error_status));
+  if (from_node_name == to_node_name) {
+    return Status::OK();
+  }
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(to_node_name, to_node, error_status));
+
+  auto swap_names = [this, from_node, to_node]() {
+    nodes().erase(from_node->name());
+    nodes().erase(to_node->name());
+    std::swap(*from_node->mutable_name(), *to_node->mutable_name());
+    nodes().emplace(from_node->name(), from_node);
+    nodes().emplace(to_node->name(), to_node);
+  };
+
+  if (update_fanouts) {
+    SwapFanoutInputs(*this, &fanouts(), &max_regular_output_port(), from_node,
+                     to_node);
+    swap_names();
+    return Status::OK();
+  }
+
+  bool from_is_switch = IsSwitch(*from_node);
+  MutableGraphView::OutputPort to_control(to_node, Graph::kControlSlot);
+  auto to_control_fanouts = fanouts().find(to_control);
+  if (from_is_switch && HasFanoutValue(fanouts(), to_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(from_node_name));
+  }
+
+  bool to_is_switch = IsSwitch(*to_node);
+  MutableGraphView::OutputPort from_control(from_node, Graph::kControlSlot);
+  auto from_control_fanouts = fanouts().find(from_control);
+  if (to_is_switch && HasFanoutValue(fanouts(), from_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(to_node_name));
+  }
+
+  // Swap node names.
+  swap_names();
+
+  // Swap controlling fanouts.
+  //
+  // Note: To and from control fanout iterators are still valid as no mutations
+  // has been performed on fanouts().
+  SwapFanoutsMapValues(&fanouts(), from_control, from_control_fanouts,
+                       to_control, to_control_fanouts);
+
+  // Swap regular fanouts.
+  SwapRegularFanoutsAndMaxPortValues(&fanouts(), &max_regular_output_port(),
+                                     from_node, to_node);
+
+  // Update fanins to remove self loops.
+  auto update_fanins = [this](NodeDef* node, absl::string_view old_node_name) {
+    for (int i = 0; i < node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node->input(i));
+      if (tensor_id.node() == node->name()) {
+        const int idx = tensor_id.index();
+        const int node_idx =
+            IsTensorIdControlling(tensor_id) ? Graph::kControlSlot : i;
+
+        MutableGraphView::OutputPort from_fanin(node, idx);
+        absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin];
+        from_fanouts->erase({node, node_idx});
+        UpdateMaxRegularOutputPortForRemovedFanin(from_fanin, *from_fanouts);
+
+        MutableGraphView::OutputPort to_fanin(nodes().at(old_node_name), idx);
+        fanouts()[to_fanin].insert({node, node_idx});
+        UpdateMaxRegularOutputPortForAddedFanin(to_fanin);
+        node->set_input(i, TensorIdToString({old_node_name, idx}));
+      }
+    }
+  };
+  update_fanins(from_node, to_node->name());
+  update_fanins(to_node, from_node->name());
+
+  // Dedup control dependencies.
+  auto dedup_control_fanouts =
+      [this](NodeDef* node, const FanoutsMap::iterator& control_fanouts) {
+        if (CanDedupControlWithRegularInput(*this, *node) &&
+            control_fanouts != fanouts().end()) {
+          for (const auto& control_fanout : control_fanouts->second) {
+            if (HasRegularFaninNode(*this, *control_fanout.node,
+                                    node->name())) {
+              RemoveControllingFaninInternal(control_fanout.node, node);
+            }
+          }
+        }
+      };
+  auto dedup_switch_control = [this, dedup_control_fanouts](NodeDef* node) {
+    OutputPort port;
+    port.node = node;
+    const int max_port =
+        gtl::FindWithDefault(max_regular_output_port(), node, -1);
+    for (int i = 0; i <= max_port; ++i) {
+      port.port_id = i;
+      auto it = fanouts().find(port);
+      if (it == fanouts().end()) {
+        continue;
+      }
+      for (const auto& fanout : it->second) {
+        auto fanout_controls =
+            fanouts().find({fanout.node, Graph::kControlSlot});
+        dedup_control_fanouts(fanout.node, fanout_controls);
+      }
+    }
+  };
+
+  if (!from_is_switch) {
+    if (to_is_switch) {
+      dedup_switch_control(from_node);
+    } else {
+      // Fetch iterator again as the original iterator might have been
+      // invalidated by container rehash triggered due to mutations.
+      auto from_control_fanouts = fanouts().find(from_control);
+      dedup_control_fanouts(from_node, from_control_fanouts);
+    }
+  }
+  if (!to_is_switch) {
+    if (from_is_switch) {
+      dedup_switch_control(to_node);
+    } else {
+      // Fetch iterator again as the original iterator might have been
+      // invalidated by container rehash triggered due to mutations.
+      auto to_control_fanouts = fanouts().find(to_control);
+      dedup_control_fanouts(to_node, to_control_fanouts);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateFanouts(absl::string_view from_node_name,
+                                       absl::string_view to_node_name) {
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_node_name, from_node,
+                      UpdateFanoutsError(from_node_name, to_node_name)));
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(
+      to_node_name, to_node, UpdateFanoutsError(from_node_name, to_node_name)));
+
+  return UpdateFanoutsInternal(from_node, to_node);
+}
+
 Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
                                                NodeDef* to_node) {
   VLOG(2) << absl::Substitute("Update fanouts from '$0' to '$1'.",
@@ -267,10 +777,10 @@ Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
     if (to_node_is_switch) {
       // Trying to add a Switch as a control dependency, which if allowed will
       // make the graph invalid.
-      return errors::Internal(
-          absl::Substitute("Can't update fanouts from '$0' to '$1', to node is "
-                           "being added as a Switch control dependency.",
-                           from_node->name(), to_node->name()));
+      return UpdateFanoutsError(from_node->name(), to_node->name())(
+          absl::Substitute("can't update fanouts to node '$0' as it will "
+                           "become a Switch control dependency",
+                           to_node->name()));
     }
 
     NodeDef* node = control_port.node;
@@ -303,9 +813,7 @@ Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
     // Update input at destination node.
     input_port.node->set_input(
         input_port.port_id,
-        output_port.port_id == 0
-            ? to_node->name()
-            : absl::StrCat(to_node->name(), ":", output_port.port_id));
+        TensorIdToString({to_node->name(), output_port.port_id}));
 
     // Remove old edge between the `from_node` and the fanout node.
     remove_edge(output_port, input_port);
@@ -333,7 +841,7 @@ Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
 
 bool MutableGraphView::AddFaninInternal(NodeDef* node,
                                         const OutputPort& fanin) {
-  int num_non_controlling_fanins =
+  int num_regular_fanins =
       NumFanins(*node, /*include_controlling_nodes=*/false);
   bool input_is_control = IsOutputPortControlling(fanin);
   bool can_dedup_control_with_regular_input =
@@ -341,7 +849,7 @@ bool MutableGraphView::AddFaninInternal(NodeDef* node,
   // Don't add duplicate control dependencies.
   if (input_is_control) {
     const int start =
-        can_dedup_control_with_regular_input ? 0 : num_non_controlling_fanins;
+        can_dedup_control_with_regular_input ? 0 : num_regular_fanins;
     for (int i = start; i < node->input_size(); ++i) {
       if (ParseTensorName(node->input(i)).node() == fanin.node->name()) {
         return false;
@@ -351,17 +859,15 @@ bool MutableGraphView::AddFaninInternal(NodeDef* node,
 
   InputPort input;
   input.node = node;
-  input.port_id =
-      input_is_control ? Graph::kControlSlot : num_non_controlling_fanins;
+  input.port_id = input_is_control ? Graph::kControlSlot : num_regular_fanins;
 
   node->add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
-  if (IsOutputPortRegular(fanin)) {
-    int last_node_input = node->input_size() - 1;
+  if (!input_is_control) {
+    const int last_node_input = node->input_size() - 1;
     // If there are control dependencies in node, move newly inserted fanin to
     // be before such control dependencies.
-    if (num_non_controlling_fanins < last_node_input) {
-      node->mutable_input()->SwapElements(last_node_input,
-                                          num_non_controlling_fanins);
+    if (num_regular_fanins < last_node_input) {
+      node->mutable_input()->SwapElements(last_node_input, num_regular_fanins);
     }
   }
 
@@ -370,9 +876,12 @@ bool MutableGraphView::AddFaninInternal(NodeDef* node,
     max_regular_output_port()[fanin.node] = fanin.port_id;
   }
 
-  // Dedup control dependencies.
-  if (!input_is_control && can_dedup_control_with_regular_input) {
-    RemoveControllingFaninInternal(node, fanin.node);
+  // Update max input port and dedup control dependencies.
+  if (!input_is_control) {
+    max_regular_input_port()[node] = num_regular_fanins;
+    if (can_dedup_control_with_regular_input) {
+      RemoveControllingFaninInternal(node, fanin.node);
+    }
   }
 
   return true;
@@ -380,52 +889,80 @@ bool MutableGraphView::AddFaninInternal(NodeDef* node,
 
 Status MutableGraphView::AddRegularFanin(absl::string_view node_name,
                                          const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
   NodeDef* fanin_node = GetNode(fanin.node());
-
-  string node_err = NodeError(/*node_missing=*/node == nullptr);
-  string fanin_err = FaninError(IsTensorIdRegular(fanin),
-                                /*node_missing=*/fanin_node == nullptr);
-  if (!node_err.empty() || !fanin_err.empty()) {
-    return errors::Internal(absl::Substitute(
-        "Can't add$0 fanin '$1' as regular fanin to$2 node '$3'.", fanin_err,
-        fanin.ToString(), node_err, node_name));
-  }
-  if (node_name == fanin.node()) {
-    return errors::Internal(absl::Substitute(
-        "Can't add fanin '$0' as regular fanin to self.", fanin.ToString()));
-  }
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
 
   AddFaninInternal(node, {fanin_node, fanin.index()});
   return Status::OK();
 }
 
-Status MutableGraphView::AddControllingFanin(absl::string_view node_name,
-                                             const TensorId& fanin) {
+Status MutableGraphView::AddRegularFaninByPort(absl::string_view node_name,
+                                               int port,
+                                               const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("AddRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, num_regular_fanins, error_status));
   NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
 
-  string node_err = NodeError(/*node_missing=*/node == nullptr);
-  string fanin_err = FaninError(IsTensorIdPortValid(fanin),
-                                /*node_missing=*/fanin_node == nullptr);
-  if (!node_err.empty() || !fanin_err.empty()) {
-    return errors::Internal(
-        absl::Substitute("Can't add$0 controlling fanin '$1' to$2 node '$3'.",
-                         fanin_err, fanin.ToString(), node_err, node_name));
+  const int last_node_input = node->input_size();
+  node->add_input(TensorIdToString(fanin));
+  node->mutable_input()->SwapElements(num_regular_fanins, last_node_input);
+  for (int i = num_regular_fanins - 1; i >= port; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i + 1});
+    node->mutable_input()->SwapElements(i, i + 1);
   }
-  if (node_name == fanin.node()) {
-    return errors::Internal(absl::Substitute(
-        "Can't add controlling fanin '$0' to self.", fanin.ToString()));
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+  fanouts()[fanin_port].insert({node, port});
+  UpdateMaxRegularOutputPortForAddedFanin(fanin_port);
+
+  max_regular_input_port()[node] = num_regular_fanins;
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
   }
 
-  if (!IsSwitch(*fanin_node)) {
-    AddFaninInternal(node, {fanin_node, Graph::kControlSlot});
+  return Status::OK();
+}
+
+NodeDef* MutableGraphView::GetControllingFaninToAdd(absl::string_view node_name,
+                                                    const OutputPort& fanin,
+                                                    string* error_msg) {
+  if (!IsSwitch(*fanin.node)) {
+    return fanin.node;
   } else {
-    if (IsTensorIdControlling(fanin)) {
+    if (IsOutputPortControlling(fanin)) {
       // Can't add a Switch node control dependency.
-      return errors::Internal(absl::Substitute(
-          "Can't add Switch as controlling fanin '$0' to node '$1'.",
-          fanin.ToString(), node_name));
+      TensorId tensor_id(fanin.node->name(), fanin.port_id);
+      *error_msg = absl::Substitute(
+          "can't add fanin '$0' as it will become a Switch control dependency",
+          tensor_id.ToString());
+      return nullptr;
     }
     // We can't anchor control dependencies directly on the switch node: unlike
     // other nodes only one of the outputs of the switch node will be generated
@@ -433,45 +970,73 @@ Status MutableGraphView::AddControllingFanin(absl::string_view node_name,
     // dependency is only triggered when the corresponding output is triggered.
     // We start by looking for an identity node connected to the output of the
     // switch node, and use it to anchor the control dependency.
-    auto fanouts = GetFanouts(*fanin_node, /*include_controlled_nodes=*/false);
-    for (auto fanout : fanouts) {
+    for (const auto& fanout : GetFanout(fanin)) {
       if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
-        if (ParseTensorName(fanout.node->input(0)) == fanin) {
-          if (fanout.node->name() == node_name) {
-            return errors::Internal(absl::Substitute(
-                "Can't add found controlling fanin '$0' from fanin '$1' to "
-                "self.",
-                AsControlDependency(fanout.node->name()), fanin.ToString()));
-          }
-          AddFaninInternal(node, {fanout.node, Graph::kControlSlot});
-          return Status::OK();
+        if (fanout.node->name() == node_name) {
+          *error_msg =
+              absl::Substitute("can't add found fanin '$0' to self",
+                               AsControlDependency(fanout.node->name()));
+          return nullptr;
         }
+        return fanout.node;
       }
     }
-    // We haven't found an existing node where we can anchor the control
-    // dependency: add a new identity node.
-    string ctrl_dep_name = AddPrefixToNodeName(
-        absl::StrCat(fanin.node(), "_", fanin.index()), kMutableGraphViewCtrl);
-    if (node_name == ctrl_dep_name) {
-      return errors::Internal(absl::Substitute(
-          "Can't add generated controlling fanin '$0' from fanin '$1' to self.",
-          AsControlDependency(ctrl_dep_name), fanin.ToString()));
-    }
-
-    // Reuse a previously created node, if possible.
-    NodeDef* ctrl_dep_node = GetNode(ctrl_dep_name);
-    if (ctrl_dep_node == nullptr) {
-      NodeDef new_node;
-      new_node.set_name(ctrl_dep_name);
-      new_node.set_op("Identity");
-      new_node.set_device(fanin_node->device());
-      (*new_node.mutable_attr())["T"].set_type(
-          fanin_node->attr().at("T").type());
-      new_node.add_input(TensorIdToString(fanin));
-      ctrl_dep_node = AddNode(std::move(new_node));
-    }
-    AddFaninInternal(node, {ctrl_dep_node, Graph::kControlSlot});
+
+    // No node found, check if node to be created is itself.
+    if (GeneratedNameForIdentityConsumingSwitch(fanin) == node_name) {
+      *error_msg = absl::Substitute("can't add generated fanin '$0' to self",
+                                    AsControlDependency(string(node_name)));
+    }
+  }
+  return nullptr;
+}
+
+NodeDef* MutableGraphView::GetOrCreateIdentityConsumingSwitch(
+    const OutputPort& fanin) {
+  // We haven't found an existing node where we can anchor the control
+  // dependency: add a new identity node.
+  string identity_name = GeneratedNameForIdentityConsumingSwitch(fanin);
+  NodeDef* identity_node = GetNode(identity_name);
+  if (identity_node == nullptr) {
+    NodeDef new_node;
+    new_node.set_name(identity_name);
+    new_node.set_op("Identity");
+    new_node.set_device(fanin.node->device());
+    (*new_node.mutable_attr())["T"].set_type(fanin.node->attr().at("T").type());
+    new_node.add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
+    identity_node = AddNode(std::move(new_node));
+  }
+  return identity_node;
+}
+
+Status MutableGraphView::AddControllingFanin(absl::string_view node_name,
+                                             const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+
+  string error_msg = "";
+  NodeDef* control_node = GetControllingFaninToAdd(
+      node_name, {fanin_node, fanin.index()}, &error_msg);
+  if (!error_msg.empty()) {
+    return error_status(error_msg);
   }
+  if (control_node == nullptr) {
+    control_node = GetOrCreateIdentityConsumingSwitch(fanin_port);
+  }
+  AddFaninInternal(node, {control_node, Graph::kControlSlot});
+
   return Status::OK();
 }
 
@@ -491,14 +1056,12 @@ bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
 
   auto mutable_inputs = node->mutable_input();
   bool modified = false;
-  const int num_inputs = node->input_size();
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
   int i;
   int curr_pos = 0;
-  for (i = 0; i < num_inputs; ++i) {
+  for (i = 0; i < num_regular_fanins; ++i) {
     TensorId tensor_id = ParseTensorName(node->input(i));
-    if (IsTensorIdControlling(tensor_id)) {
-      break;
-    }
     if (tensor_id.node() == fanin.node->name() &&
         tensor_id.index() == fanin.port_id) {
       remove_input(fanin, i, /*update_max_port=*/true);
@@ -517,9 +1080,17 @@ bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
     }
   }
 
-  if (modified && curr_pos < i) {
-    // Remove fanins from node inputs.
-    mutable_inputs->DeleteSubrange(curr_pos, i - curr_pos);
+  if (modified) {
+    const int last_regular_input_port = curr_pos - 1;
+    if (last_regular_input_port < 0) {
+      max_regular_input_port().erase(node);
+    } else {
+      max_regular_input_port()[node] = last_regular_input_port;
+    }
+    if (curr_pos < i) {
+      // Remove fanins from node inputs.
+      mutable_inputs->DeleteSubrange(curr_pos, i - curr_pos);
+    }
   }
 
   return modified;
@@ -527,24 +1098,64 @@ bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
 
 Status MutableGraphView::RemoveRegularFanin(absl::string_view node_name,
                                             const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("RemoveRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(
+      CheckRemovingFaninFromSelf(node_name, fanin, error_status));
   NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
   NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
 
-  string node_err = NodeError(/*node_missing=*/node == nullptr);
-  string fanin_err = FaninError(IsTensorIdRegular(fanin),
-                                /*node_missing=*/fanin_node == nullptr);
-  if (!node_err.empty() || !fanin_err.empty()) {
-    return errors::Internal(absl::Substitute(
-        "Can't remove$0 fanin '$1' as regular fanin from$2 node '$3'.",
-        fanin_err, fanin.ToString(), node_err, node_name));
+  RemoveRegularFaninInternal(node, {fanin_node, fanin.index()});
+  return Status::OK();
+}
+
+Status MutableGraphView::RemoveRegularFaninByPort(absl::string_view node_name,
+                                                  int port) {
+  auto error_status = [node_name, port](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', port=$1", node_name, port);
+    return MutationError("RemoveRegularFaninByPort", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  fanouts()[fanin_port].erase({node, port});
+  auto mutable_inputs = node->mutable_input();
+  for (int i = port + 1; i <= last_regular_fanin_port; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i - 1});
+    mutable_inputs->SwapElements(i - 1, i);
   }
-  if (node_name == fanin.node()) {
-    return errors::Internal(
-        absl::Substitute("Can't remove fanin '$0' as regular fanin from self.",
-                         fanin.ToString()));
+  const int last_node_input = node->input_size() - 1;
+  if (last_regular_fanin_port < last_node_input) {
+    mutable_inputs->SwapElements(last_regular_fanin_port, last_node_input);
+  }
+  mutable_inputs->RemoveLast();
+
+  const int updated_last_regular_input_port = last_regular_fanin_port - 1;
+  if (updated_last_regular_input_port < 0) {
+    max_regular_input_port().erase(node);
+  } else {
+    max_regular_input_port()[node] = updated_last_regular_input_port;
   }
 
-  RemoveRegularFaninInternal(node, {fanin_node, fanin.index()});
   return Status::OK();
 }
 
@@ -568,21 +1179,19 @@ bool MutableGraphView::RemoveControllingFaninInternal(NodeDef* node,
 
 Status MutableGraphView::RemoveControllingFanin(
     absl::string_view node_name, absl::string_view fanin_node_name) {
+  auto error_status = [node_name, fanin_node_name](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin_node_name='$1'",
+                                     node_name, fanin_node_name);
+    return MutationError("RemoveControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckRemovingFaninFromSelf(
+      node_name, {fanin_node_name, Graph::kControlSlot}, error_status));
   NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
   NodeDef* fanin_node = GetNode(fanin_node_name);
-
-  string node_err = NodeError(/*node_missing=*/node == nullptr);
-  string fanin_err = NodeError(/*node_missing=*/fanin_node == nullptr);
-  if (!node_err.empty() || !fanin_err.empty()) {
-    return errors::Internal(absl::Substitute(
-        "Can't remove$0 controlling fanin '$1' from$2 node '$3'.", fanin_err,
-        AsControlDependency(string(fanin_node_name)), node_err, node_name));
-  }
-  if (node_name == fanin_node_name) {
-    return errors::Internal(
-        absl::Substitute("Can't remove controlling fanin '$0' from self.",
-                         AsControlDependency(string(fanin_node_name))));
-  }
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(fanin_node_name, fanin_node, error_status));
 
   RemoveControllingFaninInternal(node, fanin_node);
   return Status::OK();
@@ -591,24 +1200,26 @@ Status MutableGraphView::RemoveControllingFanin(
 Status MutableGraphView::RemoveAllFanins(absl::string_view node_name,
                                          bool keep_controlling_fanins) {
   NodeDef* node = GetNode(node_name);
-
   if (node == nullptr) {
-    return errors::Internal(absl::Substitute(
-        "Can't remove all fanins from missing node '$0'.", node_name));
+    string params =
+        absl::Substitute("node_name='$0', keep_controlling_fanins=$1",
+                         node_name, keep_controlling_fanins);
+    return MutationError("RemoveAllFanins", params,
+                         NodeMissingErrorMsg(node_name));
   }
 
   if (node->input().empty()) {
     return Status::OK();
   }
 
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
   RemoveFaninsInternal(node, keep_controlling_fanins);
   if (keep_controlling_fanins) {
-    int num_non_controlling_fanins =
-        NumFanins(*node, /*include_controlling_nodes=*/false);
-    if (num_non_controlling_fanins == 0) {
+    if (num_regular_fanins == 0) {
       return Status::OK();
-    } else if (num_non_controlling_fanins < node->input_size()) {
-      node->mutable_input()->DeleteSubrange(0, num_non_controlling_fanins);
+    } else if (num_regular_fanins < node->input_size()) {
+      node->mutable_input()->DeleteSubrange(0, num_regular_fanins);
     } else {
       node->clear_input();
     }
@@ -621,37 +1232,36 @@ Status MutableGraphView::RemoveAllFanins(absl::string_view node_name,
 Status MutableGraphView::UpdateFanin(absl::string_view node_name,
                                      const TensorId& from_fanin,
                                      const TensorId& to_fanin) {
+  auto error_status = [node_name, from_fanin, to_fanin](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', from_fanin='$1', to_fanin='$2'",
+                         node_name, from_fanin.ToString(), to_fanin.ToString());
+    return MutationError("UpdateFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(from_fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(to_fanin, error_status));
   NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
   NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_fanin.node(), from_fanin_node, error_status));
   NodeDef* to_fanin_node = GetNode(to_fanin.node());
-
-  string node_err = NodeError(/*node_missing=*/node == nullptr);
-  string from_fanin_err =
-      FaninError(IsTensorIdPortValid(from_fanin),
-                 /*node_missing=*/from_fanin_node == nullptr);
-  string to_fanin_err = FaninError(IsTensorIdPortValid(to_fanin),
-                                   /*node_missing=*/to_fanin_node == nullptr);
-  if (!node_err.empty() || !from_fanin_err.empty() || !to_fanin_err.empty()) {
-    return errors::Internal(absl::Substitute(
-        "Can't update$0 fanin '$1' to$2 fanin '$3' in$4 node '$5'.",
-        from_fanin_err, from_fanin.ToString(), to_fanin_err,
-        to_fanin.ToString(), node_err, node_name));
-  }
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(to_fanin.node(), to_fanin_node, error_status));
 
   // When replacing a non control dependency fanin with a control dependency, or
   // vice versa, remove and add, so ports can be updated properly in fanout(s).
   bool to_fanin_is_control = IsTensorIdControlling(to_fanin);
   if (to_fanin_is_control && IsSwitch(*to_fanin_node)) {
     // Can't add Switch node as a control dependency.
-    return errors::Internal(absl::Substitute(
-        "Can't update fanin '$0' to fanin '$1' in node '$2', to fanin is a "
-        "Switch control dependency.",
-        from_fanin.ToString(), to_fanin.ToString(), node_name));
+    return error_status(
+        absl::Substitute("can't update to fanin '$0' as it will become a "
+                         "Switch control dependency",
+                         to_fanin.ToString()));
   }
   if (node_name == from_fanin.node() || node_name == to_fanin.node()) {
-    return errors::Internal(absl::Substitute(
-        "Can't update fanin '$0' to fanin '$1' in self '$2'.",
-        from_fanin.ToString(), to_fanin.ToString(), node_name));
+    return error_status("can't update fanin to or from self");
   }
 
   if (from_fanin == to_fanin) {
@@ -673,33 +1283,27 @@ Status MutableGraphView::UpdateFanin(absl::string_view node_name,
     return Status::OK();
   }
 
-  // In place mutation, requires no shifting of ports.
+  // In place mutation of regular fanins, requires no shifting of ports.
   string to_fanin_string = TensorIdToString(to_fanin);
-  int num_inputs = node->input_size();
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
   bool modified = false;
   absl::flat_hash_set<InputPort>* from_fanin_port_fanouts = nullptr;
   absl::flat_hash_set<InputPort>* to_fanin_port_fanouts = nullptr;
-  for (int i = 0; i < num_inputs; ++i) {
+  for (int i = 0; i < num_regular_fanins; ++i) {
     if (ParseTensorName(node->input(i)) == from_fanin) {
-      InputPort old_input;
-      old_input.node = node;
-      old_input.port_id =
-          IsTensorIdControlling(from_fanin) ? Graph::kControlSlot : i;
+      InputPort input(node, i);
       if (from_fanin_port_fanouts == nullptr) {
         OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
         from_fanin_port_fanouts = &fanouts()[from_fanin_port];
       }
-      from_fanin_port_fanouts->erase(old_input);
+      from_fanin_port_fanouts->erase(input);
 
-      InputPort new_input;
-      new_input.node = node;
-      new_input.port_id =
-          IsTensorIdControlling(to_fanin) ? Graph::kControlSlot : i;
       if (to_fanin_port_fanouts == nullptr) {
         OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
         to_fanin_port_fanouts = &fanouts()[to_fanin_port];
       }
-      to_fanin_port_fanouts->insert(new_input);
+      to_fanin_port_fanouts->insert(input);
 
       node->set_input(i, to_fanin_string);
       modified = true;
@@ -721,6 +1325,165 @@ Status MutableGraphView::UpdateFanin(absl::string_view node_name,
   return Status::OK();
 }
 
+Status MutableGraphView::UpdateRegularFaninByPort(absl::string_view node_name,
+                                                  int port,
+                                                  const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("UpdateRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  if (tensor_id == fanin) {
+    return Status::OK();
+  }
+
+  InputPort input(node, port);
+  OutputPort from_fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin_port];
+  from_fanouts->erase(input);
+  UpdateMaxRegularOutputPortForRemovedFanin(from_fanin_port, *from_fanouts);
+
+  OutputPort to_fanin_port(fanin_node, fanin.index());
+  fanouts()[to_fanin_port].insert(input);
+  UpdateMaxRegularOutputPortForAddedFanin(to_fanin_port);
+
+  node->set_input(port, TensorIdToString(fanin));
+
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::SwapRegularFaninsByPorts(absl::string_view node_name,
+                                                  int from_port, int to_port) {
+  auto error_status = [node_name, from_port, to_port](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', from_port=$1, to_port=$2",
+                                     node_name, from_port, to_port);
+    return MutationError("SwapRegularFaninsByPorts", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(CheckPortRange(from_port, /*min=*/0,
+                                    last_regular_fanin_port, error_status));
+  TF_RETURN_IF_ERROR(CheckPortRange(to_port, /*min=*/0, last_regular_fanin_port,
+                                    error_status));
+
+  if (from_port == to_port) {
+    return Status::OK();
+  }
+  TensorId from_fanin = ParseTensorName(node->input(from_port));
+  TensorId to_fanin = ParseTensorName(node->input(to_port));
+  if (from_fanin == to_fanin) {
+    return Status::OK();
+  }
+
+  InputPort from_input(node, from_port);
+  InputPort to_input(node, to_port);
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  absl::flat_hash_set<InputPort>* from_fanouts =
+      &fanouts()[{from_fanin_node, from_fanin.index()}];
+  from_fanouts->erase(from_input);
+  from_fanouts->insert(to_input);
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  absl::flat_hash_set<InputPort>* to_fanouts =
+      &fanouts()[{to_fanin_node, to_fanin.index()}];
+  to_fanouts->erase(to_input);
+  to_fanouts->insert(from_input);
+
+  node->mutable_input()->SwapElements(from_port, to_port);
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateAllRegularFaninsToControlling(
+    absl::string_view node_name) {
+  auto error_status = [node_name](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0'", node_name);
+    return MutationError("UpdateAllRegularFaninsToControlling", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  std::vector<OutputPort> regular_fanins;
+  regular_fanins.reserve(num_regular_fanins);
+  std::vector<NodeDef*> controlling_fanins;
+  controlling_fanins.reserve(num_regular_fanins);
+
+  // Get all regular fanins and derive controlling fanins.
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+
+    string error_msg = "";
+    NodeDef* control_node =
+        GetControllingFaninToAdd(node_name, fanin_port, &error_msg);
+    if (!error_msg.empty()) {
+      return error_status(error_msg);
+    }
+
+    regular_fanins.push_back(fanin_port);
+    controlling_fanins.push_back(control_node);
+  }
+
+  // Replace regular fanins with controlling fanins and dedup.
+  int pos = 0;
+  InputPort input_port(node, Graph::kControlSlot);
+  absl::flat_hash_set<absl::string_view> controls;
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    OutputPort fanin_port = regular_fanins[i];
+    NodeDef* control = controlling_fanins[i];
+    if (control == nullptr) {
+      control = GetOrCreateIdentityConsumingSwitch(fanin_port);
+    }
+    fanouts()[fanin_port].erase({node, i});
+    if (controls.contains(control->name())) {
+      continue;
+    }
+    controls.insert(control->name());
+    node->set_input(pos, AsControlDependency(control->name()));
+    fanouts()[{control, Graph::kControlSlot}].insert(input_port);
+    ++pos;
+  }
+
+  // Shift existing controlling fanins and dedup.
+  for (int i = num_regular_fanins; i < node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (controls.contains(tensor_id.node())) {
+      continue;
+    }
+    controls.insert(tensor_id.node());
+    node->mutable_input()->SwapElements(pos, i);
+    ++pos;
+  }
+
+  // Remove duplicate controls and leftover regular fanins.
+  node->mutable_input()->DeleteSubrange(pos, node->input_size() - pos);
+  max_regular_input_port().erase(node);
+
+  return Status::OK();
+}
+
 Status MutableGraphView::CheckNodesCanBeDeleted(
     const absl::flat_hash_set<string>& nodes_to_delete) {
   std::vector<string> missing_nodes;
@@ -767,16 +1530,18 @@ Status MutableGraphView::CheckNodesCanBeDeleted(
   };
 
   if (!missing_nodes.empty()) {
-    VLOG(1) << absl::Substitute("Attempting to delete missing node(s) [$0]",
+    VLOG(2) << absl::Substitute("Attempting to delete missing node(s) [$0].",
                                 sort_and_sample(&missing_nodes));
   }
   if (!nodes_with_fanouts.empty()) {
     std::vector<string> input_node_names(nodes_to_delete.begin(),
                                          nodes_to_delete.end());
-    return errors::Internal(absl::Substitute(
-        "Can't delete node(s) with retained fanout(s) [$0] from node(s) [$1].",
-        sort_and_sample(&nodes_with_fanouts),
-        sort_and_sample(&input_node_names)));
+    string params = absl::Substitute("nodes_to_delete={$0}",
+                                     sort_and_sample(&input_node_names));
+    string error_msg =
+        absl::Substitute("can't delete node(s) with retained fanouts(s) [$0]",
+                         sort_and_sample(&nodes_with_fanouts));
+    return MutationError("DeleteNodes", params, error_msg);
   }
 
   return Status::OK();
@@ -824,14 +1589,15 @@ void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
                                             bool keep_controlling_fanins) {
   for (int i = 0; i < deleted_node->input_size(); ++i) {
     TensorId tensor_id = ParseTensorName(deleted_node->input(i));
-    if (keep_controlling_fanins && IsTensorIdControlling(tensor_id)) {
+    bool is_control = IsTensorIdControlling(tensor_id);
+    if (keep_controlling_fanins && is_control) {
       break;
     }
     OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
     input.node = deleted_node;
-    input.port_id = IsTensorIdControlling(tensor_id) ? Graph::kControlSlot : i;
+    input.port_id = is_control ? Graph::kControlSlot : i;
 
     auto it = fanouts().find(fanin);
     if (it != fanouts().end()) {
@@ -840,11 +1606,12 @@ void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
       UpdateMaxRegularOutputPortForRemovedFanin(fanin, *fanouts_set);
     }
   }
+  max_regular_input_port().erase(deleted_node);
 }
 
 void MutableGraphView::RemoveFanoutsInternal(NodeDef* deleted_node) {
-  const int max_port = gtl::FindWithDefault(max_regular_output_port(),
-                                            deleted_node, Graph::kControlSlot);
+  const int max_port =
+      gtl::FindWithDefault(max_regular_output_port(), deleted_node, -1);
   for (int i = Graph::kControlSlot; i <= max_port; ++i) {
     fanouts().erase({deleted_node, i});
   }
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index c62129bcadddfc92575a0070100a94a2d81664a5..a09c147be6c1a044d558c202e2047ae6a5d12916 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -63,19 +63,75 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // node in graph.
   NodeDef* AddNode(NodeDef&& node);
 
-  // Updates all fanouts (input ports fetching output tensors) from `from_node`
-  // to the `to_node`, including control dependencies.
+  // Adds all nodes from the `subgraph` to the underlying graph and updates the
+  // view. `subgraph` doesn't have to be a valid graph definition on it's own,
+  // it can have edges to the nodes that are not in it, however after adding
+  // it to the underlying graph, final graph must be valid.
+  //
+  // If subgraph function library is not empty, all new functions will be added
+  // to the graph. Functions that appear with the same name in both subgraph and
+  // the graph represented by *this, must have identical function definitions.
+  //
+  // IMPORTANT: All nodes and functions of the given subgraph moved into the
+  // underlying graph, which leaves subgraph in valid but undefined state.
+  Status AddSubgraph(GraphDef&& subgraph);
+
+  // Updates node `node_name` op, device, and attributes. This will clear any
+  // existing attributes. If it is not possible to update the node or if the
+  // node does not exist, an error will be returned and nothing will be modified
+  // in the graph.
+  Status UpdateNode(absl::string_view node_name, absl::string_view op,
+                    absl::string_view device,
+                    absl::Span<const std::pair<string, AttrValue>> attrs);
+
+  // Updates node `from_node_name` name to `to_node_name`. If `to_node_name` is
+  // in use, node `from_node_name` does not exist, or node `from_node_name` has
+  // fanouts and `update_fanouts` is set to false, an error will be returned and
+  // nothing will be modified in the graph.
+  Status UpdateNodeName(absl::string_view from_node_name,
+                        absl::string_view to_node_name, bool update_fanouts);
+
+  // Swap node names `from_node_name` and `to_node_name`. Self loops of one node
+  // are removed by updating the inputs introducing self loops to use the other
+  // node's name. Setting `update_fanouts` to false will exclude other fanouts
+  // from having their inputs updated, but inputs introducing self loops will
+  // always be updated regardless of `update_fanouts.
+  //
+  // Example:
+  //   1. foo(other:3, bar:2, ^bar)
+  //   2. bar(foo:3, other:1, foo:1, ^foo)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", false):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", true):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(bar:5, foo:6)
+  //
+  // If it is not possible to swap node names (i.e. nodes do not exist or Switch
+  // control dependency may be introduced), an error will be returned and
+  // nothing will be modified in the graph.
+  Status SwapNodeNames(absl::string_view from_node_name,
+                       absl::string_view to_node_name, bool update_fanouts);
+
+  // Updates all fanouts (input ports fetching output tensors) from
+  // `from_node_name` to the `to_node_name`, including control dependencies.
   //
   // Example: We have 3 nodes that use `bar` node output tensors as inputs:
   //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
   //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
+  // After calling UpdateFanouts(bar, new_bar):
   //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
   //   3. foo3(other:2, ^new_bar)
-  Status UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
+  Status UpdateFanouts(absl::string_view from_node_name,
+                       absl::string_view to_node_name);
 
   // Adds regular fanin `fanin` to node `node_name`. If the node or fanin do not
   // exist in the graph, nothing will be modified in the graph. Otherwise fanin
@@ -84,6 +140,17 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // AddControllingFanin.
   Status AddRegularFanin(absl::string_view node_name, const TensorId& fanin);
 
+  // Adds regular fanin `fanin` to node `node_name` at port `port`. If the node
+  // or fanin do not exist in the graph, nothing will be modified in the graph.
+  // Otherwise fanin will be inserted at port `port`. Control dependencies will
+  // be deduped. To add control dependencies, use AddControllingFanin.
+  //
+  // If the port is not a valid port (less than 0 or greater than the number of
+  // regular fanins), this will result in an error and the node will not be
+  // modified.
+  Status AddRegularFaninByPort(absl::string_view node_name, int port,
+                               const TensorId& fanin);
+
   // Adds control dependency `fanin` to the target node named `node_name`. To
   // add regular fanins, use AddRegularFanin.
   //
@@ -117,6 +184,15 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // not result in an error and the node will not be modified.
   Status RemoveRegularFanin(absl::string_view node_name, const TensorId& fanin);
 
+  // Removes regular fanin at port `port` from node `node_name`. If the node
+  // does not exist in the graph, nothing will be modified in the graph.
+  // To remove controlling fanins, use RemoveControllingFanin.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status RemoveRegularFaninByPort(absl::string_view node_name, int port);
+
   // Removes control dependency `fanin_node_name` from the target node named
   // `node_name`. If the node or fanin do not exist in the graph, nothing will
   // be modified in the graph. To remove regular fanins, use RemoveRegualrFanin.
@@ -143,6 +219,30 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   Status UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
                      const TensorId& to_fanin);
 
+  // Replaces fanin at port `port` in node `node_name` with fanin `fanin`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status UpdateRegularFaninByPort(absl::string_view node_name, int port,
+                                  const TensorId& fanin);
+
+  // Swaps fanins at ports `from_port` and `to_port` in node `node_name`. If the
+  // node does not exist, nothing will be modified in the graph.
+  //
+  // If the ports are not a valid port (less than 0 or greater than the last
+  // index of the regular fanins), this will result in an error and the node
+  // will not be modified.
+  Status SwapRegularFaninsByPorts(absl::string_view node_name, int from_port,
+                                  int to_port);
+
+  // Updates all regular fanins to equivalent controlling fanins. If it is not
+  // possible, an error will be returned and nothing will be modified in the
+  // graph.
+  Status UpdateAllRegularFaninsToControlling(absl::string_view node_name);
+
   // Deletes nodes from the graph. If a node can't be safely removed,
   // specifically if a node still has fanouts, an error will be returned. Nodes
   // that can't be found are ignored.
@@ -162,6 +262,10 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
       const OutputPort& fanin,
       const absl::flat_hash_set<InputPort>& fanin_fanouts);
 
+  // Updates max regular output port for newly added fanin by checking the
+  // current max and updating if the newly added fanin is of a larger port.
+  void UpdateMaxRegularOutputPortForAddedFanin(const OutputPort& fanin);
+
   // Updates all fanouts (input ports fetching output tensors) from `from_node`
   // to the `to_node`, including control dependencies.
   //
@@ -170,7 +274,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   //   2. foo2(bar:1, other:1)
   //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
+  // After calling UpdateFanouts(bar, new_bar):
   //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
   //   3. foo3(other:2, ^new_bar)
@@ -184,6 +288,21 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // added after existing non control dependency inputs.
   bool AddFaninInternal(NodeDef* node, const OutputPort& fanin);
 
+  // Finds control dependency node to be used based on fanin. If fanin is not a
+  // Switch node, fanin.node is simply returned. Otherwise this will try to find
+  // a candidate Identity node consuming fanin, as the control dependency. If it
+  // is not possible or will introduce a self loop, an error message will be
+  // set. If nullptr is returned with no error
+  // GetOrCreateIdentityConsumingSwitch should be called to generate the new
+  // Identity node.
+  NodeDef* GetControllingFaninToAdd(absl::string_view node_name,
+                                    const OutputPort& fanin, string* error_msg);
+
+  // Finds a generated Identity node consuming Switch node `fanin.node` at port
+  // `fanin.port_id`. If such a node does not exist, a new Identity node will be
+  // created.
+  NodeDef* GetOrCreateIdentityConsumingSwitch(const OutputPort& fanin);
+
   // Removes all instances of regular fanin `fanin` from node `node`.
   bool RemoveRegularFaninInternal(NodeDef* node, const OutputPort& fanin);
 
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index acfaba5ddd3c493387ce8bc9d2ed66723cf6663e..07818d1f526b5b7d7897fd5db2c561b5d90965c7 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -141,6 +141,614 @@ void CheckGraph(const MutableGraphView& mutable_graph) {
   }
 }
 
+TEST(MutableGraphViewTest, AddSubgraph) {
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("foo", "NotImportant", {}, {}),
+          NDef("bar", "NotImportant", {}, {}),
+          NDef("baz", "NotImportant", {"foo", "bar"}),
+      },
+      /*funcs=*/{});
+  MutableGraphView graph(&graph_def);
+
+  // `s/bar` node has inputs that are valid only if we add subgraph into the
+  // original graph.
+  GraphDef subgraph = test::function::GDef(
+      {
+          NDef("s/n0", "NotImportant", {}, {}),
+          NDef("s/n1", "NotImportant", {"bar", "s/n0"}, {}),
+      },
+      /*funcs=*/{});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+
+  // Fanins and fanouts must be updated for the nodes of the original graph, and
+  // added subgraph.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"baz:1", "s/n1"});
+  CheckNode(graph, "s/n1", "NotImportant", "", {}, {"bar", "s/n0"}, {});
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndAddFunction) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndSkipSameFunction) {
+  FunctionDef x_times_two = test::function::XTimesTwo();
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_two});
+  MutableGraphView graph(&graph_def);
+
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndFailIfFunctionDifferent) {
+  FunctionDef x_times_four = test::function::XTimesFour();
+  x_times_four.mutable_signature()->set_name("XTimesTwo");
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_four});
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  Status status = graph.AddSubgraph(std::move(subgraph));
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(status.error_message(),
+            "MutableGraphView::AddSubgraph(function_size=1) error: Found "
+            "different function definition with the same name: XTimesTwo.");
+}
+
+TEST(MutableGraphViewTest, UpdateNodeNoDedupControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  AttrValue list_value;
+  list_value.mutable_list()->add_type(DT_FLOAT);
+  TF_EXPECT_OK(
+      graph.UpdateNode("bar_2", "IdentityN", kDevice, {{"T", list_value}}));
+
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2"});
+  CheckNode(graph, "bar_2", "IdentityN", kDevice, {{"T", list_value}},
+            {"bar_1:1"}, {"foo_1", "foo_1:2", "^foo_1", "foo_2:1", "^foo_2"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"bar_2", "other", "bar_2:1", "^bar_2"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"other:1", "bar_2:2", "^bar_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeDedupControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNode("bar_2", "NotImportant", kDevice, {}));
+
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2"});
+  CheckNode(graph, "bar_2", "NotImportant", kDevice, {}, {"bar_1:1"},
+            {"foo_1", "foo_1:2", "foo_2:1"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"bar_2", "other", "bar_2:1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "bar_2:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeSwitchNoControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def =
+      test::function::GDef({NDef("foo", "NotImportant", {}, {}),
+                            NDef("bar", "NotImportant", {"foo:1"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNode("foo", "Switch", kDevice, {}));
+
+  CheckNode(graph, "foo", "Switch", kDevice, {}, {}, {"bar"});
+  CheckNode(graph, "bar", "NotImportant", "", {}, {"foo:1"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeSwitchControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def =
+      test::function::GDef({NDef("foo", "NotImportant", {}, {}),
+                            NDef("bar", "NotImportant", {"^foo"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  AttrValue attr;
+  attr.set_type(DT_FLOAT);
+  Status s = graph.UpdateNode("foo", "Switch", kDevice, {{"T", attr}});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::UpdateNodeOp(node_name='foo', op='Switch', "
+      "device='/device:foo:0', attrs={('T', type: DT_FLOAT)}) error: can't "
+      "change node op to Switch when node drives a control dependency "
+      "(alternatively, we could add the identity node needed, but it seems "
+      "like an unlikely event and probably a mistake).";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  CheckNode(graph, "foo", "NotImportant", "", {}, {}, {"^bar"});
+  CheckNode(graph, "bar", "NotImportant", "", {}, {"^foo"}, {});
+
+  CheckGraph(graph);
+}
+
+absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
+    const GraphDef& graph, absl::string_view node_to_exclude) {
+  absl::flat_hash_map<string, std::vector<string>> node_inputs;
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    node_inputs[node.name()] =
+        std::vector<string>(node.input().begin(), node.input().end());
+  }
+  return node_inputs;
+}
+
+void CheckUnmodifiedNodeFanins(
+    const GraphDef& graph, absl::string_view node_to_exclude,
+    const absl::flat_hash_map<string, std::vector<string>>&
+        unmodified_node_inputs) {
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    auto it = unmodified_node_inputs.find(node.name());
+    ASSERT_NE(it, unmodified_node_inputs.end());
+    ASSERT_EQ(it->second.size(), node.input_size());
+    for (int i = 0; i < node.input_size(); ++i) {
+      EXPECT_EQ(node.input(i), it->second[i]);
+    }
+  }
+}
+
+void TestUpdateNodeName(absl::string_view from_node_name, bool node_exists,
+                        absl::string_view to_node_name, bool update_fanouts,
+                        bool success, const string& error_msg,
+                        absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(from_node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, from_node_name);
+
+  Status s = graph.UpdateNodeName(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), success);
+  string updated_node_name;
+  if (success) {
+    updated_node_name = string(to_node_name);
+  } else {
+    updated_node_name = string(from_node_name);
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    EXPECT_EQ(node->name(), updated_node_name);
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, updated_node_name,
+                            unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeName) {
+  string error_msg;
+  // Node has no fanouts.
+  TestUpdateNodeName("b", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has fanouts and rename to self.
+  TestUpdateNodeName("b", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has no fanouts and rename to self.
+  TestUpdateNodeName("a", /*node_exists=*/true, "a", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {});
+
+  // New node name is in use.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='c', to_node_name='b', "
+      "update_fanouts=false) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("c", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='b', "
+      "update_fanouts=true) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "b", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+  // Node has fanouts.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='d', "
+      "update_fanouts=false) error: can't update node name because node has "
+      "fanouts.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  // Node does not exist.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=false) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=true) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+}
+
+TEST(MutableGraphViewTest, UpdateNodeNameWithFanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"b", "^a"}),
+       NDef("d", "NotImportant", {"^b", "^a"}),
+       NDef("e", "NotImportant", {"b:2", "c:4", "b:1", "^a"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNodeName("b", "f", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"f", "^c", "^d", "^e"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"a:2"},
+            {"c", "^d", "e", "e:2"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"f", "^a"}, {"e:1"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^f", "^a"}, {});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"f:2", "c:4", "f:1", "^a"},
+            {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleSwapNodeNamesMutationGraph() {
+  return test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("switch_1", "Switch", {"a"}),
+       NDef("identity_1", "Identity", {"switch_1:1"}),
+       NDef("b", "NotImportant", {}, {}), NDef("switch_2", "Switch", {"b"}),
+       NDef("identity_2", "Identity", {"switch_2:0"}),
+       NDef("foo_1", "NotImportant", {"identity_1", "^identity_1"}),
+       NDef("foo_2", "NotImportant", {"identity_2", "^identity_2"})},
+      /*funcs=*/{});
+}
+
+void TestSwapNodeNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("foo_1", "foo_2", update_fanouts));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNames) {
+  TestSwapNodeNames(/*update_fanouts=*/false);
+  TestSwapNodeNames(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesWithSameNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("identity_1", "identity_1", update_fanouts));
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSameName) {
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/false);
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/true);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitches) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/false));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_2"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_1"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitchesAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_2:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_1:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a", "identity_1"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"}, {"foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {}, {"identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/true));
+
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"a:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b", "identity_2"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"}, {"foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"b:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+void TestSwapNodeNamesSimpleSelfLoop(bool update_fanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {"b:7"}), NDef("b", "NotImportant", {"a:10"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "b", update_fanouts));
+
+  // No self loops.
+  CheckNode(graph, "a", "NotImportant", "", {}, {"b:10"}, {"b:0"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:7"}, {"a:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSelfLoops) {
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/false);
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesError(absl::string_view from_node_name,
+                            absl::string_view to_node_name, bool update_fanouts,
+                            const string& error_msg) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.SwapNodeNames(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), false);
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+// TODO(lyandy): add tests with update_fanouts == true.
+TEST(MutableGraphView, SwapNodeNamesError) {
+  string error_msg;
+  // Missing nodes.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=false) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=true) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=false) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=true) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=false) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=true) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/true, error_msg);
+
+  // Switch control dependencies.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='switch_2', "
+      "to_node_name='identity_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_2' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("switch_2", "identity_1", /*update_fanouts=*/false,
+                         error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='identity_2', "
+      "to_node_name='switch_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_1' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("identity_2", "switch_1", /*update_fanouts=*/false,
+                         error_msg);
+}
+
 TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
@@ -237,14 +845,16 @@ TEST(MutableGraphViewTest, UpdateFanoutsToSwitchWithControlFromSwitch) {
   Status s = graph.UpdateFanouts("a", "b");
   EXPECT_FALSE(s.ok());
   string expected_msg =
-      "Can't update fanouts from 'a' to 'b', to node is being added as a "
-      "Switch control dependency.";
+      "MutableGraphView::UpdateFanouts(from_node_name='a', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
   EXPECT_EQ(s.error_message(), expected_msg);
   s = graph.UpdateFanouts("d", "b");
   EXPECT_FALSE(s.ok());
   expected_msg =
-      "Can't update fanouts from 'd' to 'b', to node is being added as a "
-      "Switch control dependency.";
+      "MutableGraphView::UpdateFanouts(from_node_name='d', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
   EXPECT_EQ(s.error_message(), expected_msg);
 
   EXPECT_EQ(graph.graph()->node_size(), 5);
@@ -295,36 +905,6 @@ GraphDef SimpleMutateFaninGraph() {
   return graph_def;
 }
 
-absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
-    const GraphDef& graph, absl::string_view node_to_exclude) {
-  absl::flat_hash_map<string, std::vector<string>> node_inputs;
-  for (const auto& node : graph.node()) {
-    if (node.name() == node_to_exclude) {
-      continue;
-    }
-    node_inputs[node.name()] =
-        std::vector<string>(node.input().begin(), node.input().end());
-  }
-  return node_inputs;
-}
-
-void CheckUnmodifiedNodeFanins(
-    const GraphDef& graph, absl::string_view node_to_exclude,
-    const absl::flat_hash_map<string, std::vector<string>>&
-        unmodified_node_inputs) {
-  for (const auto& node : graph.node()) {
-    if (node.name() == node_to_exclude) {
-      continue;
-    }
-    auto it = unmodified_node_inputs.find(node.name());
-    ASSERT_NE(it, unmodified_node_inputs.end());
-    ASSERT_EQ(it->second.size(), node.input_size());
-    for (int i = 0; i < node.input_size(); ++i) {
-      EXPECT_EQ(node.input(i), it->second[i]);
-    }
-  }
-}
-
 void TestAddRegularFanin(absl::string_view node_name, bool node_exists,
                          const TensorId& fanin_to_add, bool success,
                          const string& error_msg,
@@ -379,66 +959,228 @@ TEST(MutableGraphViewTest, AddRegularFanin) {
                       error_msg, {"c:1", "^b", "^a"});
 
   // Add control to node with 1 input 0 controls.
-  error_msg = "Can't add invalid fanin '^b' as regular fanin to node 'foo_1'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', fanin='^b') error: "
+      "fanin '^b' must be a regular tensor id.";
   TestAddRegularFanin("foo_1", /*node_exists=*/true, {"b", Graph::kControlSlot},
                       /*success=*/false, error_msg, {"a"});
   // Add control to node with multiple inputs and 0 controls.
-  error_msg = "Can't add invalid fanin '^c' as regular fanin to node 'foo_3'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_3', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
   TestAddRegularFanin("foo_3", /*node_exists=*/true, {"c", Graph::kControlSlot},
                       /*success=*/false, error_msg, {"b", "a:1", "a:1"});
   // Add control to node with 1 input multiple controls.
-  error_msg = "Can't add invalid fanin '^d' as regular fanin to node 'foo_2'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^d') error: "
+      "fanin '^d' must be a regular tensor id.";
   TestAddRegularFanin("foo_2", /*node_exists=*/true, {"d", Graph::kControlSlot},
                       /*success=*/false, error_msg, {"b", "^a", "^c"});
   // Add control to node with multiple input multiple controls.
-  error_msg = "Can't add invalid fanin '^a' as regular fanin to node 'foo_4'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_4', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
   TestAddRegularFanin("foo_4", /*node_exists=*/true, {"a", Graph::kControlSlot},
                       /*success=*/false, error_msg,
                       {"a", "b:2", "b:2", "^c", "^d"});
   // Add control to node with 0 inputs 0 controls.
-  error_msg = "Can't add invalid fanin '^a' as regular fanin to node 'foo_5'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_5', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
   TestAddRegularFanin("foo_5", /*node_exists=*/true, {"a", Graph::kControlSlot},
                       /*success=*/false, error_msg, {});
   // Add control to node with 0 inputs multiple controls.
-  error_msg = "Can't add invalid fanin '^c' as regular fanin to node 'foo_6'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
   TestAddRegularFanin("foo_6", /*node_exists=*/true, {"c", Graph::kControlSlot},
                       /*success=*/false, error_msg, {"^a", "^b"});
   // Add control to node with control that already exists.
-  error_msg = "Can't add invalid fanin '^a' as regular fanin to node 'foo_2'.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
   TestAddRegularFanin("foo_2", /*node_exists=*/true, {"a", Graph::kControlSlot},
                       /*success=*/false, error_msg, {"b", "^a", "^c"});
 
   // Add fanin to node where node is missing.
   error_msg =
-      "Can't add fanin 'a:0' as regular fanin to missing node 'foo_missing'.";
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', fanin='a:0') "
+      "error: node 'foo_missing' was not found.";
   TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
                       /*success=*/false, error_msg, {});
   // Add fanin to node where fanin is missing.
   error_msg =
-      "Can't add missing fanin 'bar_missing:0' as regular fanin to node "
-      "'foo_1'.";
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
   TestAddRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
                       /*success=*/false, error_msg, {"a"});
   // Add fanin to node where node and fanin are missing.
   error_msg =
-      "Can't add missing fanin 'bar_missing:0' as regular fanin to missing "
-      "node 'foo_missing'.";
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
   TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"bar_missing", 0},
                       /*success=*/false, error_msg, {});
   // Add control fanin to node where node and fanin are missing.
   error_msg =
-      "Can't add invalid/missing fanin '^bar_missing' as regular fanin to "
-      "missing node 'foo_missing'.";
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
   TestAddRegularFanin("foo_missing", /*node_exists=*/false,
                       {"bar_missing", Graph::kControlSlot},
                       /*success=*/false, error_msg, {});
 
   // Add self to create cycle.
-  error_msg = "Can't add fanin 'foo_6:2' as regular fanin to self.";
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='foo_6:2') "
+      "error: can't add fanin 'foo_6:2' to self.";
   TestAddRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
                       /*success=*/false, error_msg, {"^a", "^b"});
 }
 
+void TestAddRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                               int port, const TensorId& fanin_to_add,
+                               bool success, const string& error_msg,
+                               absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.AddRegularFaninByPort(node_name, port, fanin_to_add);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddRegularFaninByPort) {
+  string error_msg;
+  // Add input at start to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "a:1", "a:1"});
+  // Add input at end to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "a:1", "d:2"});
+  // Add input in middle to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "d:2", "a:1"});
+  // Add input at start to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "^c", "^a"});
+  // Add input at end to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "d:2", "^c", "^a"});
+  // Add input in middle to node with some inputs and some controls, and dedup
+  // controls.
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"a", "b:2", "d:2", "b:2", "^c"});
+  // Add input to node with no inputs and no controls.
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2"});
+  // Add input to node with no inputs and some controls.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2", "^b", "^a"});
+  // Add fanin should dedup control.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"b", 2},
+                            /*success=*/true, error_msg, {"b:2", "^a"});
+
+  // Add controlling fanin.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=2, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Add fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=4, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/4, {"d", 2},
+                            /*success=*/false, error_msg,
+                            {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Add fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"a", 0},
+                            /*success=*/false, error_msg, {});
+  // Add fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestAddRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {"a"});
+  // Add fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {});
+
+  // Add self to create cycle.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                            {"foo_6", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+}
+
 void CheckFanoutRemoved(const MutableGraphView& graph, const TensorId& fanin,
                         absl::string_view node_name) {
   MutableGraphView::OutputPort output_port =
@@ -506,19 +1248,22 @@ TEST(MutableGraphViewTest, RemoveRegularFanin) {
 
   // Remove control from node with 1 input multiple controls.
   error_msg =
-      "Can't remove invalid fanin '^a' as regular fanin from node 'foo_2'.";
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_2', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
   TestRemoveRegularFanin("foo_2", /*node_exists=*/true,
                          {"a", Graph::kControlSlot},
                          /*success=*/false, error_msg, {"b", "^a", "^c"});
   // Remove control from node with multiple input multiple controls.
   error_msg =
-      "Can't remove invalid fanin '^d' as regular fanin from node 'foo_4'.";
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_4', fanin='^d') "
+      "error: fanin '^d' must be a regular tensor id.";
   TestRemoveRegularFanin(
       "foo_4", /*node_exists=*/true, {"d", Graph::kControlSlot},
       /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
   // Remove control from node with 0 inputs multiple controls.
   error_msg =
-      "Can't remove invalid fanin '^a' as regular fanin from node 'foo_6'.";
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
   TestRemoveRegularFanin("foo_6", /*node_exists=*/true,
                          {"a", Graph::kControlSlot},
                          /*success=*/false, error_msg, {"^a", "^b"});
@@ -533,53 +1278,162 @@ TEST(MutableGraphViewTest, RemoveRegularFanin) {
 
   // Remove control from node with 1 input 0 controls.
   error_msg =
-      "Can't remove invalid fanin '^b' as regular fanin from node 'foo_1'.";
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', fanin='^b') "
+      "error: fanin '^b' must be a regular tensor id.";
   TestRemoveRegularFanin("foo_1", /*node_exists=*/true,
                          {"b", Graph::kControlSlot},
                          /*success=*/false, error_msg, {"a"});
   // Remove control from node with multiple inputs and 0 controls.
   error_msg =
-      "Can't remove invalid fanin '^c' as regular fanin from node 'foo_3'.";
-  TestRemoveRegularFanin("foo_3", /*node_exists=*/true,
-                         {"c", Graph::kControlSlot},
-                         /*success=*/false, error_msg, {"b", "a:1", "a:1"});
-  // Remove control from node with 0 inputs 0 controls.
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_3', fanin='^c') "
+      "error: fanin '^c' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_3", /*node_exists=*/true,
+                         {"c", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  // Remove control from node with 0 inputs 0 controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_5', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_5", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
+
+  // Remove fanin from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='a:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
+                         /*success=*/false, error_msg, {});
+  // Remove fanin from node where fanin is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
+                         /*success=*/false, error_msg, {"a"});
+  // Remove fanin from node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", 0}, /*success=*/false, error_msg, {});
+  // Remove control from node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
+
+  // Remove self.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', "
+      "fanin='foo_6:2') error: can't remove fanin 'foo_6:2' from self.";
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
+                         /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestRemoveRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(nullptr, node);
+  } else {
+    EXPECT_EQ(nullptr, node);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveRegularFaninByPort(node_name, port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveRegularFaninByPort) {
+  string error_msg;
+  // Remove input at start of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg, {"a:1", "a:1"});
+  // Remove input at end of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input in middle of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input at start of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg,
+                               {"b:2", "b:2", "^d", "^c"});
+  // Remove input at end of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+  // Remove input in middle of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+
+  // Remove input from node with no inputs and no controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_5', port=0) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/false, error_msg, {});
+  // Remove input from node with no inputs and some controls.
   error_msg =
-      "Can't remove invalid fanin '^a' as regular fanin from node 'foo_5'.";
-  TestRemoveRegularFanin("foo_5", /*node_exists=*/true,
-                         {"a", Graph::kControlSlot},
-                         /*success=*/false, error_msg, {});
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_6', port=1) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/false, error_msg, {"^a", "^b"});
 
-  // Remove fanin from node where node is missing.
+  // Remove fanin at out of bounds port.
   error_msg =
-      "Can't remove fanin 'a:0' as regular fanin from missing node "
-      "'foo_missing'.";
-  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
-                         /*success=*/false, error_msg, {});
-  // Remove fanin from node where fanin is missing.
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
   error_msg =
-      "Can't remove missing fanin 'bar_missing:0' as regular fanin from node "
-      "'foo_1'.";
-  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
-                         /*success=*/false, error_msg, {"a"});
-  // Remove fanin from node where node and fanin are missing.
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
   error_msg =
-      "Can't remove missing fanin 'bar_missing:0' as regular fanin from "
-      "missing node 'foo_missing'.";
-  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
-                         {"bar_missing", 0}, /*success=*/false, error_msg, {});
-  // Remove control from node where node and fanin are missing.
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
   error_msg =
-      "Can't remove invalid/missing fanin '^bar_missing' as regular fanin from "
-      "missing node 'foo_missing'.";
-  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
-                         {"bar_missing", Graph::kControlSlot},
-                         /*success=*/false, error_msg, {});
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
 
-  // Remove self.
-  error_msg = "Can't remove fanin 'foo_6:2' as regular fanin from self.";
-  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
-                         /*success=*/false, error_msg, {"^a", "^b"});
+  // Remove fanin from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_missing', "
+      "port=0) error: node 'foo_missing' was not found.";
+  TestRemoveRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                               /*success=*/false, error_msg, {});
 }
 
 void TestRemoveAllFanins(absl::string_view node_name, bool node_exists,
@@ -666,10 +1520,15 @@ TEST(MutableGraphViewTest, RemoveAllFanins) {
                       /*success=*/true, error_msg, {"^a", "^b"});
 
   // Remove all fanins from node where node is missing.
-  error_msg = "Can't remove all fanins from missing node 'foo_missing'.";
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=false) error: node 'foo_missing' was not found.";
   TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
                       /*keep_controlling_nodes=*/false,
                       /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=true) error: node 'foo_missing' was not found.";
   TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
                       /*keep_controlling_nodes=*/true,
                       /*success=*/false, error_msg, {});
@@ -742,63 +1601,78 @@ TEST(MutableGraphViewTest, UpdateFanin) {
 
   // Update fanin of node where node is missing.
   error_msg =
-      "Can't update fanin 'a:0' to fanin 'a:1' in missing node 'foo_missing'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='a:0', to_fanin='a:1') error: node 'foo_missing' was not "
+      "found.";
   TestUpdateFanin("foo_missing", /*node_exists=*/false, {"a", 0}, {"a", 1},
                   /*success=*/false, error_msg, {});
   // Update fanin of node where from fanin is missing.
   error_msg =
-      "Can't update missing fanin 'from_bar_missing:0' to fanin 'a:1' in node "
-      "'foo_1'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_1', "
+      "from_fanin='from_bar_missing:0', to_fanin='a:1') error: node "
+      "'from_bar_missing' was not found.";
   TestUpdateFanin("foo_1", /*node_exists=*/true, {"from_bar_missing", 0},
                   {"a", 1},
                   /*success=*/false, error_msg, {"a"});
   // Update fanin of node where to fanin is missing.
   error_msg =
-      "Can't update fanin 'a:0' to missing fanin 'to_bar_missing:1' in node "
-      "'foo_1'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='to_bar_missing:1') error: node 'to_bar_missing' was not "
+      "found.";
   TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0},
                   {"to_bar_missing", 1}, /*success=*/false, error_msg, {"a"});
   // Update fanin of node where from/to fanins and node are missing.
   error_msg =
-      "Can't update missing fanin 'from_bar_missing:0' to missing fanin "
-      "'to_bar_missing:1' in missing node 'foo_missing'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:0', to_fanin='to_bar_missing:1') error: "
+      "node 'foo_missing' was not found.";
   TestUpdateFanin("foo_missing", /*node_exists=*/false, {"from_bar_missing", 0},
                   {"to_bar_missing", 1},
                   /*success=*/false, error_msg, {});
   // Update fanin of node where from fanin is invalid.
   error_msg =
-      "Can't update invalid fanin 'a:-2' to fanin 'a:0' in node 'foo_1'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:-2', "
+      "to_fanin='a:0') error: fanin 'a:-2' must be a valid tensor id.";
   TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", -2}, {"a", 0},
                   /*success=*/false, error_msg, {"a"});
   // Update fanin of node where to fanin is invalid.
   error_msg =
-      "Can't update fanin 'a:0' to invalid fanin 'a:-2' in node 'foo_1'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='a:-2') error: fanin 'a:-2' must be a valid tensor id.";
   TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0}, {"a", -2},
                   /*success=*/false, error_msg, {"a"});
   // Update fanin of node where from/to fanins are invalid and missing and node
   // is missing.
   error_msg =
-      "Can't update invalid/missing fanin 'from_bar_missing:-2' to "
-      "invalid/missing fanin 'to_bar_missing:-3' in missing node "
-      "'foo_missing'.";
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:-2', to_fanin='to_bar_missing:-3') error: "
+      "fanin 'from_bar_missing:-2' must be a valid tensor id.";
   TestUpdateFanin("foo_missing", /*node_exists=*/false,
                   {"from_bar_missing", -2}, {"to_bar_missing", -3},
                   /*success=*/false, error_msg, {});
 
   // Update to self to create cycle.
-  error_msg = "Can't update fanin 'b:2' to fanin 'foo_4:3' in self 'foo_4'.";
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='foo_4:3') error: can't update fanin to or from self.";
   TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", 3},
                   /*success=*/false, error_msg,
                   {"a", "b:2", "b:2", "^c", "^d"});
-  error_msg = "Can't update fanin 'b:2' to fanin '^foo_4' in self 'foo_4'.";
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
   TestUpdateFanin(
       "foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", Graph::kControlSlot},
       /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
-  error_msg = "Can't update fanin '^c' to fanin 'foo_4:4' in self 'foo_4'.";
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='foo_4:4') error: can't update fanin to or from self.";
   TestUpdateFanin(
       "foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot}, {"foo_4", 4},
       /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
-  error_msg = "Can't update fanin '^c' to fanin '^foo_4' in self 'foo_4'.";
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
   TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
                   {"foo_4", Graph::kControlSlot}, /*success=*/false, error_msg,
                   {"a", "b:2", "b:2", "^c", "^d"});
@@ -816,8 +1690,9 @@ void TestUpdateFaninFromFaninToNodeAsSwitchControl(const TensorId& fanin) {
   Status s = graph.UpdateFanin("c", fanin, {"b", Graph::kControlSlot});
   EXPECT_FALSE(s.ok());
   string expected_msg = absl::Substitute(
-      "Can't update fanin '$0' to fanin '^b' in node 'c', to fanin is a Switch "
-      "control dependency.",
+      "MutableGraphView::UpdateFanin(node_name='c', from_fanin='$0', "
+      "to_fanin='^b') error: can't update to fanin '^b' as it will become a "
+      "Switch control dependency.",
       fanin.ToString());
   EXPECT_EQ(s.error_message(), expected_msg);
 
@@ -837,6 +1712,394 @@ TEST(MutableGraphViewTest, UpdateFaninToNodeAsSwitchControl) {
   TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", Graph::kControlSlot});
 }
 
+void TestUpdateRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, const TensorId& fanin, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateRegularFaninByPort(node_name, port, fanin);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateRegularFaninByPort) {
+  string error_msg;
+  // Update input at start to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "a:1", "a:1"});
+  // Update input at end to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"b", "a:1", "d:2"});
+  // Update input in middle to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"b", "d:2", "a:1"});
+  // Update input at start to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "b:2", "b:2", "^c"});
+  // Update input at end to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"a", "b:2", "d:2", "^c"});
+  // Update input in middle to node with some inputs and some controls and
+  // dedup controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"a", "d:2", "b:2", "^c"});
+
+  // Update input to controlling fanin.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=1, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"a", 0},
+                               /*success=*/false, error_msg, {});
+  // Update fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not "
+      "found.";
+  TestUpdateRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                               {"bar_missing", 0},
+                               /*success=*/false, error_msg, {"a"});
+  // Update fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"bar_missing", 0},
+                               /*success=*/false, error_msg, {});
+
+  // Update self to create cycle.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"foo_6", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestSwapRegularFaninsByPorts(absl::string_view node_name, bool node_exists,
+                                  int from_port, int to_port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.SwapRegularFaninsByPorts(node_name, from_port, to_port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, SwapRegularFaninsByPorts) {
+  string error_msg;
+  // Swapping first and last regular fanins
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  // Swapping first and last regular fanins, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  // Swapping middle regular fanin.
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  // Swapping middle regular fanin, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  // Swapping same port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  // Swapping same fanin but different port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swaping fanins at out of bounds ports.
+  // Node with no regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  // Node with no regular fanins and some controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  // Node with regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  // Node with regular fanins and controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swapping fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_missing', "
+      "from_port=0, to_port=1) error: node 'foo_missing' was not found.";
+  TestSwapRegularFaninsByPorts("foo_missing", /*node_exists=*/false,
+                               /*from_port=*/0, /*to_port=*/1,
+                               /*success=*/false, error_msg, {});
+}
+
 TEST(MutableGraphViewTest, DedupControllingFaninsOnGraphInit) {
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
@@ -894,7 +2157,7 @@ TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFanin) {
   CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnAddFanin) {
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFanin) {
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
        NDef("c", "", {}, {}), NDef("d", "", {}, {})},
@@ -908,12 +2171,61 @@ TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnAddFanin) {
   CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
   TF_EXPECT_OK(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
   CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"b", 2}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "b:2", "^b"}, {});
 
   TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
   CheckNode(graph, "d", "", "", {}, {"^b"}, {});
   TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
   CheckNode(graph, "d", "", "", {}, {"^b"}, {});
 
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"},
+            {"c:0", "c:1", "^c", "^d"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def =
+      test::function::GDef({NDef("a", "NotImportant", {}, {}),
+                            NDef("b", "NotImportant", {"c", "^a"}),
+                            NDef("c", "NotImportant", {"a:1"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("b", 0, {"a", 2}));
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2", "c"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:1"}, {"b:1"});
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b:0", "c:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {"c:2"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 1, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 0, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"b:2", "c:2", "b:2", "^b"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "d:2", "^d"});
+  CheckNode(graph, "c", "", "", {}, {}, {"d:1"});
+
   CheckGraph(graph);
 }
 
@@ -935,7 +2247,7 @@ TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFanin) {
   CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnUpdateFanin) {
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFanin) {
   GraphDef graph_def = test::function::GDef(
       {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
        NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
@@ -954,6 +2266,50 @@ TEST(MutableGraphViewTest, NoDedupControlFlowControllingFaninsOnUpdateFanin) {
   TF_EXPECT_OK(graph.UpdateFanin("e", {"c", 3}, {"c", Graph::kControlSlot}));
   CheckNode(graph, "e", "NotImportant", "", {}, {"^c"}, {});
 
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"d:0", "^d", "^e"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {"a:1", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("c", 0, {"b", 2}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("d", 0, {"b", 1}));
+  CheckNode(graph, "d", "NotImportant", "", {}, {"b:1", "^b"}, {});
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("e", 0, {"c", 2}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c:2", "^c"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "^d"});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"e:0", "^e"});
+
   CheckGraph(graph);
 }
 
@@ -1038,18 +2394,23 @@ TEST(MutableGraphViewTest, AddControllingFaninMissing) {
   // Missing fanin.
   Status s = graph.AddControllingFanin("a", {"c", Graph::kControlSlot});
   EXPECT_FALSE(s.ok());
-  string expected_msg = "Can't add missing controlling fanin '^c' to node 'a'.";
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^c') error: "
+      "node 'c' was not found.";
   EXPECT_EQ(s.error_message(), expected_msg);
   // Missing node.
   s = graph.AddControllingFanin("d", {"a", Graph::kControlSlot});
   EXPECT_FALSE(s.ok());
-  expected_msg = "Can't add controlling fanin '^a' to missing node 'd'.";
+  expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='^a') error: "
+      "node 'd' was not found.";
   EXPECT_EQ(s.error_message(), expected_msg);
   // Missing node and fanin.
   s = graph.AddControllingFanin("c", {"d", Graph::kControlSlot});
   EXPECT_FALSE(s.ok());
   expected_msg =
-      "Can't add missing controlling fanin '^d' to missing node 'c'.";
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='^d') error: "
+      "node 'c' was not found.";
   EXPECT_EQ(s.error_message(), expected_msg);
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
@@ -1106,7 +2467,8 @@ TEST(MutableGraphViewTest, AddControllingFaninSwitch) {
   Status s = graph.AddControllingFanin("a", {"b", Graph::kControlSlot});
   EXPECT_FALSE(s.ok());
   string expected_msg =
-      "Can't add Switch as controlling fanin '^b' to node 'a'.";
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^b') error: "
+      "can't add fanin '^b' as it will become a Switch control dependency.";
   EXPECT_EQ(s.error_message(), expected_msg);
 
   ASSERT_EQ(graph.graph()->node_size(), 2);
@@ -1212,7 +2574,9 @@ void TestAddControllingFaninSelfLoops(absl::string_view node_name,
 }
 
 TEST(MutableGraphViewTest, AddControllingFaninSelfLoops) {
-  string error_msg = "Can't add controlling fanin '^a' to self.";
+  string error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^a') error: "
+      "can't add fanin '^a' to self.";
   TestAddControllingFaninSelfLoops("a", {"a", Graph::kControlSlot}, error_msg);
 
   // Adding Switch control dependency to Identity consumer. Node `c` is
@@ -1221,7 +2585,8 @@ TEST(MutableGraphViewTest, AddControllingFaninSelfLoops) {
   // Identity, this will introduce a self loop, so no control dependency should
   // be added.
   error_msg =
-      "Can't add found controlling fanin '^c' from fanin 'b:0' to self.";
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='b:0') "
+      "error: can't add found fanin '^c' to self.";
   TestAddControllingFaninSelfLoops("c", {"b", 0}, error_msg);
 
   // Adding Switch control dependency to Identity consumer. Node `d` is
@@ -1230,7 +2595,8 @@ TEST(MutableGraphViewTest, AddControllingFaninSelfLoops) {
   // Identity, this will introduce a self loop, so no control dependency should
   // be added.
   error_msg =
-      "Can't add found controlling fanin '^d' from fanin 'b:1' to self.";
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='b:1') "
+      "error: can't add found fanin '^d' to self.";
   TestAddControllingFaninSelfLoops("d", {"b", 1}, error_msg);
 }
 
@@ -1253,8 +2619,9 @@ TEST(MutableGraphViewTest, AddControllingFaninSelfLoopsGeneratedIdentity) {
   Status s = graph.AddControllingFanin("ConstantFoldingCtrl/b_1", {"b", 1});
   EXPECT_FALSE(s.ok());
   string expected_msg =
-      "Can't add generated controlling fanin '^ConstantFoldingCtrl/b_1' from "
-      "fanin 'b:1' to self.";
+      "MutableGraphView::AddControllingFanin(node_name='ConstantFoldingCtrl/"
+      "b_1', fanin='b:1') error: can't add generated fanin "
+      "'^ConstantFoldingCtrl/b_1' to self.";
   EXPECT_EQ(s.error_message(), expected_msg);
 
   EXPECT_EQ(graph.graph()->node_size(), 4);
@@ -1344,7 +2711,10 @@ TEST(MutableGraphViewTest, RemoveControllingFaninSelfLoop) {
 
   Status s = graph.RemoveControllingFanin("c", "c");
   EXPECT_FALSE(s.ok());
-  string expected_msg = "Can't remove controlling fanin '^c' from self.";
+  string expected_msg =
+      "MutableGraphView::RemoveControllingFanin(node_name='c', "
+      "fanin_node_name='c') error: can't remove fanin '^c' from "
+      "self.";
   EXPECT_EQ(s.error_message(), expected_msg);
 
   ASSERT_EQ(graph.graph()->node_size(), 3);
@@ -1356,6 +2726,109 @@ TEST(MutableGraphViewTest, RemoveControllingFaninSelfLoop) {
   CheckGraph(graph);
 }
 
+void TestUpdateAllRegularFaninsToControlling(
+    absl::string_view node_name, bool node_exists, bool success,
+    const string& error_msg, absl::Span<const string> expected_fanins) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("switch", "Switch", {}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("b", "NotImportant", {"switch:1"}, {}),
+       NDef("ConstantFoldingCtrl/switch_1", "Identity", {"switch:1"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("c", "NotImportant", {"a", "^b"}, {}),
+       NDef("d", "NotImportant", {"b", "c"}, {}),
+       NDef("e", "NotImportant", {"^d"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateAllRegularFaninsToControlling(node_name);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateAllRegularFaninsToControlling) {
+  string error_msg;
+  // Nodes with some regular fanins and some controls.
+  TestUpdateAllRegularFaninsToControlling("a", /*node_exists=*/true,
+                                          /*success=*/true, error_msg, {});
+  TestUpdateAllRegularFaninsToControlling("c", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^a", "^b"});
+  TestUpdateAllRegularFaninsToControlling("d", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^b", "^c"});
+  TestUpdateAllRegularFaninsToControlling("e", /*node_exists=*/true,
+                                          /*success=*/true, error_msg, {"^d"});
+
+  // Use existing Identity to pin control dependency of Switch.
+  TestUpdateAllRegularFaninsToControlling("b", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^ConstantFoldingCtrl/switch_1"});
+
+  // Missing node.
+  error_msg =
+      "MutableGraphView::UpdateAllRegularFaninsToControlling(node_name='f') "
+      "error: node 'f' was not found.";
+  TestUpdateAllRegularFaninsToControlling("f", /*node_exists=*/false,
+                                          /*success=*/false, error_msg, {});
+
+  // Error in getting controlling fanin.
+  error_msg =
+      "MutableGraphView::UpdateAllRegularFaninsToControlling(node_name='"
+      "ConstantFoldingCtrl/switch_1') error: can't add found fanin "
+      "'^ConstantFoldingCtrl/switch_1' to self.";
+  TestUpdateAllRegularFaninsToControlling("ConstantFoldingCtrl/switch_1",
+                                          /*node_exists=*/true,
+                                          /*success=*/false, error_msg,
+                                          {"switch:1"});
+}
+
+TEST(MutableGraphViewTest, UpdateAllRegularFaninsToControllingConsumingSwitch) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("switch", "Switch", {}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("b", "NotImportant", {"switch:1"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateAllRegularFaninsToControlling("b"));
+
+  EXPECT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "switch", "Switch", kDevice, {{"T", DT_FLOAT}}, {},
+            {"ConstantFoldingCtrl/switch_1"});
+  CheckNode(graph, "b", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_1"}, {});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_1", "Identity", kDevice,
+            {{"T", DT_FLOAT}}, {"switch:1"}, {"^b"});
+
+  CheckGraph(graph);
+}
+
 TEST(MutableGraphViewTest, DeleteNodes) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
@@ -1465,8 +2938,8 @@ TEST(MutableGraphViewTest, DeleteNodesWithError) {
   Status s = graph.DeleteNodes({"b", "a"});
   EXPECT_FALSE(s.ok());
   string error_msg =
-      "Can't delete node(s) with retained fanout(s) [a, b] from node(s) [a, "
-      "b].";
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b}) error: can't "
+      "delete node(s) with retained fanouts(s) [a, b].";
   EXPECT_EQ(s.error_message(), error_msg);
 
   EXPECT_EQ(graph.graph()->node_size(), 6);
@@ -1499,8 +2972,9 @@ TEST(MutableGraphViewTest, DeleteNodesWithLargeError) {
   Status s = graph.DeleteNodes({"a", "b", "c", "d", "e", "f"});
   EXPECT_FALSE(s.ok());
   string error_msg =
-      "Can't delete node(s) with retained fanout(s) [a, b, c, d, e, ...] from "
-      "node(s) [a, b, c, d, e, ...].";
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b, c, d, e, ...}) "
+      "error: can't delete node(s) with retained fanouts(s) [a, b, c, d, e, "
+      "...].";
   EXPECT_EQ(s.error_message(), error_msg);
 
   EXPECT_EQ(graph.graph()->node_size(), 13);
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index f7931c615cb2f18418ae09512cc50a2ececf1cc3..84aed8e6ab60eb0a36c9aa188ad42442f3fd4f27 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -47,10 +47,30 @@ bool IsAnyDiv(const NodeDef& node) {
          node.op() == "FloorDiv" || node.op() == "TruncateDiv";
 }
 
+bool IsAnyMax(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "Max" || op == "SegmentMax" || op == "UnsortedSegmentMax";
+}
+
+bool IsAnyMaxPool(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPool" || op == "MaxPoolV2" || op == "MaxPool3D" ||
+         op == "MaxPoolWithArgmax" || op == "FractionalMaxPool";
+}
+
+bool IsAnyMin(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "Min" || op == "SegmentMin" || op == "UnsortedSegmentMin";
+}
+
 bool IsApproximateEqual(const NodeDef& node) {
   return node.op() == "ApproximateEqual";
 }
 
+bool IsArgMax(const NodeDef& node) { return node.op() == "ArgMax"; }
+
+bool IsArgMin(const NodeDef& node) { return node.op() == "ArgMin"; }
+
 bool IsAvgPoolGrad(const NodeDef& node) { return node.op() == "AvgPoolGrad"; }
 
 bool IsAssign(const NodeDef& node) {
@@ -164,18 +184,13 @@ bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
 bool IsElementWiseMonotonic(const NodeDef& node, bool* is_non_decreasing) {
   static const gtl::FlatSet<string>* const kMonotonicNonDecreasingOps =
       CHECK_NOTNULL((new gtl::FlatSet<string>{
-          "Asinh", "Atanh",   "Ceil",  "Elu",  "Erf",  "Exp",   "Expm1",
-          "Floor", "Log",     "Log1p", "Relu", "Relu", "Relu6", "Rint",
-          "Selu",  "Sigmoid", "Sign",  "Sinh", "Sqrt", "Tanh",
+          "Acosh", "Asin", "Asinh",    "Atan",     "Atanh", "Ceil",
+          "Elu",   "Erf",  "Exp",      "Expm1",    "Floor", "Log",
+          "Log1p", "Relu", "Relu6",    "Rint",     "Selu",  "Sigmoid",
+          "Sign",  "Sinh", "Softsign", "Softplus", "Sqrt",  "Tanh",
       }));
   static const gtl::FlatSet<string>* const kMonotonicNonIncreasingOps =
-      CHECK_NOTNULL((new gtl::FlatSet<string>{
-          "Inv",
-          "Reciprocal",
-          "Erfc",
-          "Rsqrt",
-          "Neg",
-      }));
+      CHECK_NOTNULL((new gtl::FlatSet<string>{"Acos", "Erfc", "Neg", "Rsqrt"}));
   if (kMonotonicNonDecreasingOps->count(node.op()) > 0) {
     if (is_non_decreasing) {
       *is_non_decreasing = true;
@@ -228,6 +243,8 @@ bool IsGreater(const NodeDef& node) { return node.op() == "Greater"; }
 
 bool IsGreaterEqual(const NodeDef& node) { return node.op() == "GreaterEqual"; }
 
+bool IsHostConstant(const NodeDef& node) { return node.op() == "HostConst"; }
+
 bool IsHistogramSummary(const NodeDef& node) {
   return node.op() == "HistogramSummary";
 }
@@ -278,8 +295,8 @@ bool IsLogicalOr(const NodeDef& node) { return node.op() == "LogicalOr"; }
 
 bool IsMatMul(const NodeDef& node) {
   const auto& op = node.op();
-  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
-         op == "SparseMatMul";
+  return op == "MatMul" || op == "BatchMatMul" || op == "SparseMatMul" ||
+         IsQuantizedMatMul(node);
 }
 
 bool IsMax(const NodeDef& node) { return node.op() == "Max"; }
@@ -320,6 +337,8 @@ bool IsNextIteration(const NodeDef& node) {
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsOnesLike(const NodeDef& node) { return node.op() == "OnesLike"; }
+
 bool IsPack(const NodeDef& node) { return node.op() == "Pack"; }
 
 bool IsPad(const NodeDef& node) {
@@ -341,10 +360,16 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
-bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+bool IsPrint(const NodeDef& node) {
+  return node.op() == "Print" || node.op() == "PrintV2";
+}
 
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
+bool IsQuantizedMatMul(const NodeDef& node) {
+  return node.op() == "QuantizedMatMul" || node.op() == "QuantizedMatMulV2";
+}
+
 bool IsQueue(const NodeDef& node) {
   return str_util::EndsWith(node.op(), "QueueV2");
 }
@@ -355,6 +380,10 @@ bool IsRandomShuffle(const NodeDef& node) {
 
 bool IsRank(const NodeDef& node) { return node.op() == "Rank"; }
 
+bool IsReadVariableOp(const NodeDef& node) {
+  return node.op() == "ReadVariableOp";
+}
+
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -418,6 +447,8 @@ bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; }
 
 bool IsSnapshot(const NodeDef& node) { return node.op() == "Snapshot"; }
 
+bool IsSoftmax(const NodeDef& node) { return node.op() == "Softmax"; }
+
 bool IsSoftplusGrad(const NodeDef& node) { return node.op() == "SoftplusGrad"; }
 
 bool IsSoftsignGrad(const NodeDef& node) { return node.op() == "SoftsignGrad"; }
@@ -534,6 +565,8 @@ bool IsWhile(const NodeDef& node) {
   return op == "While" || op == "StatelessWhile";
 }
 
+bool IsZerosLike(const NodeDef& node) { return node.op() == "ZerosLike"; }
+
 bool IsZeta(const NodeDef& node) { return node.op() == "Zeta"; }
 
 namespace {
@@ -543,7 +576,7 @@ bool GetBoolAttr(const NodeDef& node, const string& name) {
 }  // namespace
 
 bool IsPersistent(const NodeDef& node) {
-  return IsConstant(node) || IsVariable(node);
+  return IsConstant(node) || IsVariable(node) || IsHostConstant(node);
 }
 
 bool MaybeHasRefInput(const NodeDef& node) {
@@ -568,6 +601,22 @@ bool IsDataset(const NodeDef& node) {
          op == "DatasetToSingleElement" || op == "ReduceDataset";
 }
 
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry) {
+  const OpDef* op_def = nullptr;
+  const string& op_name = node.op();
+  Status status = op_registry->LookUpOpDef(op_name, &op_def);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to lookup OpDef for " << op_name
+                 << ". Error: " << status.error_message();
+    return false;
+  }
+  return op_def->is_stateful();
+}
+
+bool IsStateful(const NodeDef node) {
+  return IsStateful(node, OpRegistry::Global());
+}
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry) {
   // Placeholders must be preserved to keep the graph feedable.
@@ -768,5 +817,114 @@ bool IsIdempotent(const NodeDef& node) {
          !ModifiesFrameInfo(node);
 }
 
+bool NeverForwardsInputs(const NodeDef& node) {
+  static const gtl::FlatSet<string>* const kNonForwardingOps = CHECK_NOTNULL(
+      (new gtl::FlatSet<string>{"ArgMax",
+                                "ArgMin",
+                                "AudioSpectrogram",
+                                "BatchMatMul",
+                                "BatchToSpace",
+                                "BatchToSpaceND",
+                                "Bincount",
+                                "BroadcastArgs",
+                                "BroadcastGradientArgs",
+                                "CTCBeamSearchDecoder",
+                                "CTCGreedyDecoder",
+                                "CTCLoss",
+                                "ComplexAbs",
+                                "Concat",
+                                "ConcatOffset",
+                                "ConcatV2",
+                                "Copy",
+                                "CopyHost",
+                                "Cross",
+                                "CudnnRNN",
+                                "CudnnRNNBackprop",
+                                "CudnnRNNBackpropV2",
+                                "CudnnRNNBackpropV3",
+                                "CudnnRNNCanonicalToParams",
+                                "CudnnRNNParamsSize",
+                                "CudnnRNNParamsToCanonical",
+                                "CudnnRNNV2",
+                                "CudnnRNNV3",
+                                "CumSum",
+                                "CumProd",
+                                "DebugNanCount",
+                                "DebugNumericSummary",
+                                "DecodeProtoV2",
+                                "DecodeWav",
+                                "DeepCopy",
+                                "DepthToSpace",
+                                "Dequantize",
+                                "Diag",
+                                "DiagPart",
+                                "EditDistance",
+                                "Empty",
+                                "EncodeProtoV2",
+                                "EncodeWav",
+                                "ExtractImagePatches",
+                                "ExtractVolumePatches",
+                                "Fill",
+                                "Gather",
+                                "GatherNd",
+                                "GatherV2",
+                                "HistogramFixedWidth",
+                                "InvertPermutation",
+                                "IsInf",
+                                "IsNan",
+                                "Isfinite",
+                                "LinSpace",
+                                "LowerBound",
+                                "MatMul",
+                                "MatrixDiag",
+                                "MatrixDiagPart",
+                                "Mfcc",
+                                "OneHot",
+                                "Pack",
+                                "PopulationCount",
+                                "Range",
+                                "Rank",
+                                "ReverseSequence",
+                                "Shape",
+                                "ShapeN",
+                                "Size",
+                                "SpaceToBatch",
+                                "SpaceToBatchND",
+                                "SpaceToDepth",
+                                "SparseMatMul",
+                                "Split",
+                                "SplitV",
+                                "Unique",
+                                "UniqueV2",
+                                "UniqueWithCounts",
+                                "UniqueWithCountsV2",
+                                "Unpack",
+                                "UnravelIndex",
+                                "UpperBound",
+                                "Where",
+                                "CompareAndBitpack",
+                                "Requantize",
+                                "RequantizationRange",
+                                "Bucketize",
+                                "AvgPool",
+                                "BatchNormWithGlobalNormalization",
+                                "FusedBatchNorm",
+                                "FusedBatchNormV2",
+                                "Conv2D",
+                                "RandomUniform",
+                                "RandomUniformInt",
+                                "RandomStandardNormal",
+                                "ParameterizedTruncatedNormal",
+                                "TruncatedNormal",
+                                "Multinomial",
+                                "RandomGamma",
+                                "RandomPoisson",
+                                "RandomPoissonV2"}));
+  const string& op_name = node.op();
+  return kNonForwardingOps->count(op_name) > 0 ||
+         str_util::StrContains(op_name, "Segment") ||
+         str_util::StartsWith(op_name, "Quantize");
+}
+
 }  // namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 092748f4ff946d960c5b78cabcaf2f00d1fc9bc7..047f9b8a5654b76ad273e92edbdf034a2c4fb2ae 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -28,7 +28,12 @@ bool IsAll(const NodeDef& node);
 bool IsAngle(const NodeDef& node);
 bool IsAny(const NodeDef& node);
 bool IsAnyDiv(const NodeDef& node);
+bool IsAnyMax(const NodeDef& node);
+bool IsAnyMaxPool(const NodeDef& node);
+bool IsAnyMin(const NodeDef& node);
 bool IsApproximateEqual(const NodeDef& node);
+bool IsArgMax(const NodeDef& node);
+bool IsArgMin(const NodeDef& node);
 bool IsAvgPoolGrad(const NodeDef& node);
 bool IsAssert(const NodeDef& node);
 bool IsAssign(const NodeDef& node);
@@ -72,6 +77,7 @@ bool IsFusedBatchNormGrad(const NodeDef& node);
 bool IsGreater(const NodeDef& node);
 bool IsGreaterEqual(const NodeDef& node);
 bool IsHistogramSummary(const NodeDef& node);
+bool IsHostConstant(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsIdentityN(const NodeDef& node);
 bool IsIdentityNSingleInput(const NodeDef& node);
@@ -100,10 +106,12 @@ bool IsMod(const NodeDef& node);
 bool IsMul(const NodeDef& node);
 bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsOnesLike(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPartitionedCall(const NodeDef& node);
+bool IsQuantizedMatMul(const NodeDef& node);
 bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
@@ -115,6 +123,7 @@ bool IsPow(const NodeDef& node);
 bool IsQueue(const NodeDef& node);
 bool IsRandomShuffle(const NodeDef& node);
 bool IsRank(const NodeDef& node);
+bool IsReadVariableOp(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu(const NodeDef& node);
@@ -139,6 +148,7 @@ bool IsShapeN(const NodeDef& node);
 bool IsShuffle(const NodeDef& node);
 bool IsSigmoidGrad(const NodeDef& node);
 bool IsSnapshot(const NodeDef& node);
+bool IsSoftmax(const NodeDef& node);
 bool IsSoftplusGrad(const NodeDef& node);
 bool IsSoftsignGrad(const NodeDef& node);
 bool IsSplit(const NodeDef& node);
@@ -169,6 +179,7 @@ bool IsTruncateMod(const NodeDef& node);
 bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsWhile(const NodeDef& node);
+bool IsZerosLike(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
 // Return true if the op is an aggregation (e.g. Add, AddN).
@@ -186,6 +197,11 @@ bool IsPersistent(const NodeDef& node);
 // Returns true if the node belongs to the NC_DATASET class (see graph/graph.h).
 bool IsDataset(const NodeDef& node);
 
+// Returns true if the node op is marked as stateful, or if it was not found in
+// op_registry.
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry);
+bool IsStateful(const NodeDef node);  // use OpRegistry::Global()
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry);
 bool IsFreeOfSideEffect(const NodeDef& node);  // use OpRegistry::Global()
@@ -229,6 +245,10 @@ bool HasOpDef(const NodeDef& node);
 // and preserves the number of elements.
 bool IsCastLike(const NodeDef& node);
 
+// Returns true if this op never forwards any of its inputs, i.e. always
+// allocates buffers for its inputs.
+bool NeverForwardsInputs(const NodeDef& node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index ee216f80f991a4c56dfc6bc3577b6b4a19c313a0..740ef22a195abff79f115a8f1bce52f0d15dcb00 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -3,7 +3,6 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 # Platform specific build config
 load(
@@ -103,6 +102,7 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -110,6 +110,9 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
+    # Running cuda on cpu will trigger tests guarded by GOOGLE_CUDA but NCHW
+    # won't be available, which result in test failures. So disable that.
+    tags = ["no_cuda_on_cpu_tap"],
     deps = [
         ":constant_folding",
         ":dependency_optimizer",
@@ -121,6 +124,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -151,6 +155,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -181,6 +186,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:grappler_test",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -208,6 +214,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -221,6 +228,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
@@ -269,13 +277,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "arithmetic_optimizer_test_utils",
+    testonly = 1,
+    hdrs = [
+        "arithmetic_optimizer_test_utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":arithmetic_optimizer",
+        ":constant_folding",
+        ":model_pruner",
+        "//tensorflow/core:test",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "arithmetic_optimizer_test",
     size = "small",
     srcs = ["arithmetic_optimizer_test.cc"],
     deps = [
         ":arithmetic_optimizer",
-        ":constant_folding",
+        ":arithmetic_optimizer_test_utils",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
@@ -290,7 +314,6 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
-        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -516,9 +539,9 @@ cc_library(
         ":custom_graph_optimizer_registry",
         ":debug_stripper",
         ":dependency_optimizer",
-        ":experimental_implementation_selector",
         ":function_optimizer",
         ":graph_optimizer",
+        ":implementation_selector",
         ":layout_optimizer",
         ":loop_optimizer",
         ":memory_optimizer",
@@ -537,6 +560,8 @@ cc_library(
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/verifiers:graph_verifier",
+        "//tensorflow/core/grappler/verifiers:structure_verifier",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -549,6 +574,7 @@ tf_cuda_cc_test(
         ":custom_graph_optimizer_registry",
         ":meta_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
@@ -628,6 +654,7 @@ cc_library(
         "//tensorflow/core/grappler/utils:frame",
         "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -861,9 +888,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "experimental_implementation_selector",
-    srcs = ["experimental_implementation_selector.cc"],
-    hdrs = ["experimental_implementation_selector.h"],
+    name = "implementation_selector",
+    srcs = ["implementation_selector.cc"],
+    hdrs = ["implementation_selector.h"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
@@ -879,14 +906,14 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "experimental_implementation_selector_test",
+    name = "implementation_selector_test",
     size = "small",
-    srcs = ["experimental_implementation_selector_test.cc"],
+    srcs = ["implementation_selector_test.cc"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
-        ":experimental_implementation_selector",
         ":function_api_info",
+        ":implementation_selector",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 2168dbd623e0ddbfe8c6b078776ca893fb34eaa5..f12a07e869039342249d2a3f464bc82228f47b0b 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/core/util/saved_tensor_slice_util.h"
 #include "tensorflow/core/util/strided_slice_op.h"
 
+using tensorflow::str_util::StringReplace;
 using tensorflow::strings::StrCat;
 
 namespace tensorflow {
@@ -544,7 +545,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
 
     // If all inputs have the same shape, rewrite whole group with a single AddN
     if (shapes.size() == 1) {
-      string node_name = OptimizedNodeName(root_scope_and_name);
+      string node_name = UniqueOptimizedNodeName(root_scope_and_name);
       AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
                                         group.inputs);
       return node_name;
@@ -560,13 +561,13 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
 
     // optimized name for leaf AddN nodes
     auto leaf_node_name = [&root_scope_and_name, this](int i) {
-      return OptimizedNodeName(root_scope_and_name,
-                               strings::StrCat("Leaf_", i));
+      return UniqueOptimizedNodeName(root_scope_and_name,
+                                     strings::StrCat("Leaf_", i));
     };
     // optimized name for internal nodes of a tree built up from AddN leaves
     auto internal_node_name = [&root_scope_and_name, this](int i) {
-      return OptimizedNodeName(root_scope_and_name,
-                               strings::StrCat("Internal_", i));
+      return UniqueOptimizedNodeName(root_scope_and_name,
+                                     strings::StrCat("Internal_", i));
     };
 
     // Add/AddN nodes that must be added to the tree
@@ -587,8 +588,9 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
       add_ops.pop_front();
       const InputAndShape rhs = add_ops.front();
       add_ops.pop_front();
-      string name = add_ops.empty() ? OptimizedNodeName(root_scope_and_name)
-                                    : internal_node_name(internal_nodes++);
+      string name = add_ops.empty()
+                        ? UniqueOptimizedNodeName(root_scope_and_name)
+                        : internal_node_name(internal_nodes++);
       InputAndShape add = AddAggregatedInputs(*group.root_node, name, lhs, rhs);
       add_ops.push_front(add);
     } while (add_ops.size() > 1);
@@ -1774,6 +1776,63 @@ class SqrtDivToRsqrtMulStage : public ArithmeticOptimizerStage {
   }
 };
 
+// Performs the conversion:
+// Square(Sub(x, y)) => Identity(SquaredDifference(x, y))
+class FuseSquaredDiffStage : public ArithmeticOptimizerStage {
+ public:
+  explicit FuseSquaredDiffStage(const GraphOptimizerContext& ctx,
+                                const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("FuseSquaredDiffStage", ctx, ctx_ext) {}
+  ~FuseSquaredDiffStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsSquare(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    NodeDef* b;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &b));
+    // Optimize only if base is a Sub whose output is not being consumed
+    // elsewhere.
+    if (IsSub(*b) && !IsInPreserveSet(*b) &&
+        (NumNonControlOutputs(*b, *ctx().node_map) == 1)) {
+      node->set_op("Identity");
+      b->set_op("SquaredDifference");
+      AddToOptimizationQueue(node);
+      AddToOptimizationQueue(b);
+    }
+    return Status::OK();
+  }
+};
+
+// Performs the conversion:
+// Log(Softmax(x)) => LogSoftmax(x)
+class LogSoftmaxStage : public ArithmeticOptimizerStage {
+ public:
+  explicit LogSoftmaxStage(const GraphOptimizerContext& ctx,
+                           const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("LogSoftmaxStage", ctx, ctx_ext) {}
+  ~LogSoftmaxStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override { return IsLog(*node); }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    NodeDef* x;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &x));
+    // Optimize only if arg is a Softmax whose output is not being consumed
+    // elsewhere.
+    if (IsSoftmax(*x) && !IsInPreserveSet(*x) &&
+        (NumNonControlOutputs(*x, *ctx().node_map) == 1)) {
+      // Log(Softmax(x)) => LogSoftmax(Identity(x))
+      node->set_op("LogSoftmax");
+      x->set_op("Identity");
+      AddToOptimizationQueue(node);
+      AddToOptimizationQueue(x);
+    }
+    return Status::OK();
+  }
+};
+
 // Bypass redundant reshape nodes:
 //
 //   Reshape                    Reshape  <-+
@@ -2721,7 +2780,8 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
   ~OptimizeMaxOrMinOfMonotonicStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsMax(*node) || IsMin(*node);
+    return IsAnyMax(*node) || IsAnyMin(*node) || IsAnyMaxPool(*node) ||
+           IsArgMax(*node) || IsArgMin(*node);
   }
 
   Status TrySimplify(NodeDef* reduction_node,
@@ -2735,10 +2795,13 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
     // 0. inner_function is not in the preserve set,
     // 1. inner_function's Op is element-wise monotonic
     // 2. inner_function's output is not being consumed elsewhere.
+    // 3. is monotonic increasing if reduction_node is a pooling operation
+    //    since we don't have MinPool operations.
     bool is_non_decreasing = false;
     if (!IsInPreserveSet(*inner_function) &&
         IsElementWiseMonotonic(*inner_function, &is_non_decreasing) &&
-        ctx().node_map->GetOutputs(inner_function->name()).size() == 1) {
+        ctx().node_map->GetOutputs(inner_function->name()).size() == 1 &&
+        (is_non_decreasing || !IsAnyMaxPool(*reduction_node))) {
       // Swap the first inputs of the inner function Op & the reduction Op.
       NodeDef* inner_input;
       TF_RETURN_IF_ERROR(GetInputNode(inner_function->input(0), &inner_input));
@@ -2752,9 +2815,15 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
       if (!is_non_decreasing) {
         // Flip Min<->Max if the function is non-increasing, e.g.
         // Max(Neg(x)) = Neg(Min(x)).
-        const string opposite = IsMax(*reduction_node) ? "Min" : "Max";
+        const string opposite = FlipMinMax(*reduction_node);
         reduction_node->set_op(opposite);
       }
+
+      if (IsArgMax(*reduction_node) || IsArgMin(*reduction_node)) {
+        // ArgMax(Sqrt(x)) = ArgMax(x)
+        inner_function->set_op("Identity");
+      }
+
       AddToOptimizationQueue(reduction_node);
       AddToOptimizationQueue(inner_function);
       AddToOptimizationQueue(inner_input);
@@ -2775,6 +2844,16 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
       AddToOptimizationQueue(consumer);
     }
   }
+
+ private:
+  string FlipMinMax(const NodeDef& node) {
+    const string& op = node.op();
+    if (IsAnyMax(node) || IsArgMax(node)) {
+      return str_util::StringReplace(op, "Max", "Min", false);
+    } else {
+      return str_util::StringReplace(op, "Min", "Max", false);
+    }
+  }
 };
 
 // Replace a chain of type&shape preserving unary ops with a
@@ -3239,13 +3318,17 @@ class UniqueNodes {
   }
 
  private:
-  uint64 ComputeSignature(const NodeDef& node) const;
+  uint64 ComputeSignature(const NodeDef& node);
   bool SameNode(const NodeDef& node1, const NodeDef& node2) const;
 
-  std::unordered_map<uint64, std::vector<NodeDef*>> rep_;
+  absl::flat_hash_map<uint64, std::vector<NodeDef*>> rep_;
+  absl::flat_hash_map<const NodeDef*, uint64> memoized_signatures_;
 };
 
-uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
+uint64 UniqueNodes::ComputeSignature(const NodeDef& node) {
+  auto it = memoized_signatures_.find(&node);
+  if (it != memoized_signatures_.end()) return it->second;
+
   uint64 h = Hash64(node.op());
   h = Hash64Combine(Hash64(node.device()), h);
 
@@ -3259,6 +3342,7 @@ uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
     h = Hash64CombineUnordered(Hash64(attr.first), h);
     h = Hash64CombineUnordered(FastAttrValueHash(attr.second), h);
   }
+  memoized_signatures_.emplace(&node, h);
   return h;
 }
 
@@ -3279,31 +3363,29 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   // Compare inputs.
   if (IsCommutative(node1)) {
     std::vector<string> inputs1(node1.input().begin(), node1.input().end());
-    std::vector<string> inputs2(node2.input().begin(), node2.input().end());
     std::sort(inputs1.begin(), inputs1.end());
+    std::vector<string> inputs2(node2.input().begin(), node2.input().end());
     std::sort(inputs2.begin(), inputs2.end());
     return inputs1 == inputs2;
   } else {
-    std::vector<string> regular_inputs1;
-    std::vector<string> regular_inputs2;
-    std::vector<string> ctrl_inputs1;
-    std::vector<string> ctrl_inputs2;
-    for (int index = 0; index < node1.input_size(); ++index) {
+    // The order or ordinary inputs matters.
+    int index = 0;
+    for (; index < node1.input_size(); ++index) {
       if (IsControlInput(node1.input(index))) {
-        ctrl_inputs1.push_back(node1.input(index));
-        ctrl_inputs2.push_back(node2.input(index));
-      } else {
-        regular_inputs1.push_back(node1.input(index));
-        regular_inputs2.push_back(node2.input(index));
+        break;
+      } else if (node1.input(index) != node2.input(index)) {
+        return false;
       }
     }
-    if (regular_inputs1 != regular_inputs2) {
-      return false;
-    }
-    std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end());
-    std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end());
-    if (ctrl_inputs1 != ctrl_inputs2) {
-      return false;
+    // The order of control inputs does not matter.
+    if (index < node1.input_size()) {
+      std::vector<string> ctrl_inputs1(node1.input().begin() + index,
+                                       node1.input().end());
+      std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end());
+      std::vector<string> ctrl_inputs2(node2.input().begin() + index,
+                                       node2.input().end());
+      std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end());
+      return ctrl_inputs1 != ctrl_inputs2;
     }
   }
 
@@ -3330,8 +3412,8 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (node.device().find("SPU") != string::npos) {
     return false;
   }
-  // Workaround for Assert mistakenly being labeled as stateful.
-  if (IsAssert(node)) {
+  // Workaround for Assert and Print mistakenly being labeled as stateful.
+  if (IsAssert(node) || IsPrint(node)) {
     return true;
   }
   return IsFreeOfSideEffect(node);
@@ -3369,9 +3451,9 @@ void ArithmeticOptimizer::DedupComputations() {
 
   bool stop = true;
   std::set<int> duplicates;
+  UniqueNodes nodes;
   do {
     stop = true;
-    UniqueNodes nodes;
     for (int i = 0; i < optimized_graph_->node_size(); ++i) {
       if (duplicates.find(i) != duplicates.end()) {
         continue;
@@ -3499,6 +3581,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   if (options_.convert_pow) pipeline.AddStage<ConvertPowStage>(ctx, ctx_ext);
   if (options_.convert_log1p)
     pipeline.AddStage<ConvertLog1pStage>(ctx, ctx_ext);
+  if (options_.convert_log_softmax)
+    pipeline.AddStage<LogSoftmaxStage>(ctx, ctx_ext);
   if (options_.optimize_max_or_min_of_monotonic)
     pipeline.AddStage<OptimizeMaxOrMinOfMonotonicStage>(ctx, ctx_ext);
   if (options_.convert_expm1)
@@ -3507,6 +3591,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<UnaryOpsComposition>(ctx, ctx_ext);
   if (options_.remove_stack_strided_slice_same_axis)
     pipeline.AddStage<RemoveStackStridedSliceSameAxis>(ctx, ctx_ext);
+  if (options_.fuse_squared_diff)
+    pipeline.AddStage<FuseSquaredDiffStage>(ctx, ctx_ext);
 
   VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
           << str_util::Join(pipeline.StageNames(), ", ");
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index e1395d75426314afe049be3bc3bd68e3126d4915..0330480db3ca3deddb0122abe291481597f0cb0f 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -61,6 +61,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool fold_conjugate_into_transpose = true;
     bool fold_multiply_into_conv = true;
     bool fold_transpose_into_matmul = true;
+    bool fuse_squared_diff = true;
     bool hoist_common_factor_out_of_aggregation = true;
     bool hoist_cwise_unary_chains = true;
     bool minimize_broadcasts = true;
@@ -78,6 +79,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool simplify_aggregation = true;
     bool convert_pow = true;
     bool convert_log1p = true;
+    bool convert_log_softmax = true;
     bool convert_expm1 = true;
     bool unary_ops_composition = true;
     bool remove_stack_strided_slice_same_axis = true;
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 94c59c68c8f1adf0ea6b234d8ebeb305c561b994..11fd91e7588d43488234d518d1705c062bb179cd 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -20,10 +20,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -72,10 +71,6 @@ string AggregationMulName(const string& name) {
   return AddPrefixToNodeName(name, kSimplifyAggregationMul, "");
 }
 
-string OptimizedName(const string& name) {
-  return AddPrefixToNodeName(name, kArithmeticOptimizer);
-}
-
 void VerifyGraphsMatch(const GraphDef& original_graph,
                        const GraphDef& optimized_graph, int line) {
   EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << line;
@@ -92,211 +87,6 @@ void VerifyGraphsMatch(const GraphDef& original_graph,
 }
 }  // namespace
 
-class ArithmeticOptimizerTest : public GrapplerTest {
- protected:
-  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
-  // longer have any output consumers.
-  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                        GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                     GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  // Optionally run a constant folding pass before pruning.
-  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                             GraphDef* output, bool const_folding = false) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    if (const_folding) {
-      item->graph.Swap(output);
-      output->Clear();
-      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
-                       .Optimize(nullptr, *item, output));
-    }
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // TODO(ezhulenev): Make private. After migration to stages each test
-  // should explicitly enable required optimization for tests isolation
-  void DisableAllStages(ArithmeticOptimizer* optimizer) {
-    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
-    options.dedup_computations = false;
-    options.combine_add_to_addn = false;
-    options.convert_sqrt_div_to_rsqrt_mul = false;
-    options.convert_pow = false;
-    options.convert_log1p = false;
-    options.optimize_max_or_min_of_monotonic = false;
-    options.fold_conjugate_into_transpose = false;
-    options.fold_multiply_into_conv = false;
-    options.fold_transpose_into_matmul = false;
-    options.hoist_common_factor_out_of_aggregation = false;
-    options.hoist_cwise_unary_chains = false;
-    options.minimize_broadcasts = false;
-    options.remove_identity_transpose = false;
-    options.remove_involution = false;
-    options.remove_idempotent = false;
-    options.remove_redundant_bitcast = false;
-    options.remove_redundant_cast = false;
-    options.remove_redundant_reshape = false;
-    options.remove_negation = false;
-    options.remove_logical_not = false;
-    options.reorder_cast_like_and_value_preserving = false;
-    options.replace_mul_with_square = false;
-    options.simplify_aggregation = false;
-    options.unary_ops_composition = false;
-    optimizer->options_ = options;
-  }
-
-  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    optimizer->options_.combine_add_to_addn = false;
-  }
-
-  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.combine_add_to_addn = true;
-  }
-
-  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_conjugate_into_transpose = true;
-  }
-
-  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_multiply_into_conv = true;
-  }
-
-  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_transpose_into_matmul = true;
-  }
-
-  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
-  }
-
-  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.minimize_broadcasts = true;
-  }
-
-  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_identity_transpose = true;
-  }
-
-  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_involution = true;
-  }
-
-  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_bitcast = true;
-  }
-
-  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_cast = true;
-  }
-
-  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_reshape = true;
-  }
-
-  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_negation = true;
-  }
-
-  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.reorder_cast_like_and_value_preserving = true;
-  }
-
-  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.replace_mul_with_square = true;
-  }
-
-  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_cwise_unary_chains = true;
-  }
-
-  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
-  }
-
-  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_pow = true;
-  }
-
-  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_idempotent = true;
-  }
-
-  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_logical_not = true;
-  }
-
-  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.simplify_aggregation = true;
-  }
-
-  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_log1p = true;
-  }
-
-  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.optimize_max_or_min_of_monotonic = true;
-  }
-
-  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_expm1 = true;
-  }
-
-  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.unary_ops_composition = true;
-  }
-
-  void EnableOnlyRemoveStackStridedSliceSameAxis(
-      ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_stack_strided_slice_same_axis = true;
-  }
-};
-
 TEST_F(ArithmeticOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
@@ -2689,6 +2479,144 @@ TEST_F(ArithmeticOptimizerTest, DoNotConvertSqrtDivToRsqrtMulDivisorFetchNode) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, FuseSquaredDiff) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  auto y = ops::Const(s.WithOpName("y"), {3.0f, 4.0f}, {1, 2});
+  Output sub_x_y = ops::Sub(s.WithOpName("sub_x_y"), x, y);
+  Output square_sub_x_y = ops::Square(s.WithOpName("output"), sub_x_y);
+
+  GrapplerItem item;
+  item.fetch = {"output"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyFuseSquaredDiff(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  const auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "output") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("sub_x_y", node.input(0));
+    } else if (node.name() == "sub_x_y") {
+      EXPECT_EQ("SquaredDifference", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, DoNotFuseSquaredDiffFetchNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  auto y = ops::Const(s.WithOpName("y"), {3.0f, 4.0f}, {1, 2});
+  Output sub_x_y = ops::Sub(s.WithOpName("sub_x_y"), x, y);
+  Output square_sub_x_y = ops::Square(s.WithOpName("output"), sub_x_y);
+
+  GrapplerItem item;
+  item.fetch = {"output", "sub_x_y"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(2, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyFuseSquaredDiff(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  const auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(2, tensors.size());
+
+  for (int i = 0; i < tensors.size(); i++) {
+    EXPECT_EQ(tensors[i].NumElements(), tensors_expected[i].NumElements());
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+  }
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "output") {
+      EXPECT_EQ("Square", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("sub_x_y", node.input(0));
+    } else if (node.name() == "sub_x_y") {
+      EXPECT_EQ("Sub", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, ConvertLogSoftmax) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output softmax = ops::Softmax(s.WithOpName("softmax"), x);
+  Output logsoftmax = ops::Log(s.WithOpName("output"), softmax);
+
+  GrapplerItem item;
+  item.fetch = {"output"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyLogSoftmax(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  const auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "output") {
+      EXPECT_EQ("LogSoftmax", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, DoNotConvertLogSoftmaxArgFetchNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output floats = ops::Const(s.WithOpName("floats"),
+                             {0.7423212f, 0.19757693f, 0.53124744f}, {1, 3});
+  Output softmax = ops::Softmax(s.WithOpName("softmax"), floats);
+  Output final_output = ops::Log(s.WithOpName("final_output"), softmax);
+
+  GrapplerItem item;
+  item.fetch = {"softmax", "final_output"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(2, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyLogSoftmax(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+  const auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(2, tensors.size());
+
+  // Should be a NoOp since we are not allowed to change the output of fetch
+  // nodes.
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  for (int i = 0; i < tensors.size(); i++) {
+    EXPECT_EQ(tensors[i].NumElements(), tensors_expected[i].NumElements());
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+  }
+}
+
 TEST_F(ArithmeticOptimizerTest, ConvertPow) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
@@ -3466,6 +3394,47 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMaxOrMinOfMonotonicElementWise) {
   EXPECT_EQ(2, required_node_count);
 }
 
+TEST_F(ArithmeticOptimizerTest, OptimizeArgMaxOrArgMinOfMonotonicElementWise) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  const auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output sqrt = ops::Sqrt(s.WithOpName("sqrt"), x);
+  Output arg_max = ops::ArgMax(s.WithOpName("arg_max"), sqrt, 1);
+  Output final_out = ops::Identity(s.WithOpName("final_out"), arg_max);
+
+  GrapplerItem item;
+  item.fetch = {"final_out"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  const auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorEqual<int64>(tensors_expected[0], tensors[0]);
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  // Check if the inputs are switched
+  int required_node_count = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "final_out") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("arg_max", node.input(0));
+      ++required_node_count;
+    } else if (node.name() == "arg_max") {
+      EXPECT_EQ("ArgMax", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      ++required_node_count;
+    }
+  }
+  EXPECT_EQ(2, required_node_count);
+}
+
 TEST_F(ArithmeticOptimizerTest,
        OptimizeMaxOrMinOfMonotonicElementWise_DoNotChangeFetchNode) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -3561,6 +3530,75 @@ TEST_F(ArithmeticOptimizerTest,
   EXPECT_EQ(2, required_node_count);
 }
 
+TEST_F(ArithmeticOptimizerTest,
+       OptimizeMaxOrMinOfMonotonicElementWiseNonIncreasingDoNotChangeMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output neg = ops::Neg(s.WithOpName("neg"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), neg, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+
+  GrapplerItem item;
+  item.fetch = {"max_pool"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+
+  // Should be a NoOp
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(ArithmeticOptimizerTest, OptimizeMaxOrMinOfMonotonicElementWiseMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output sqrt = ops::Sqrt(s.WithOpName("sqrt"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), sqrt, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+  Output final_out = ops::Identity(s.WithOpName("final_out"), max_pool);
+
+  GrapplerItem item;
+  item.fetch = {"final_out"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  // Check if the inputs are switched
+  int required_node_count = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "sqrt") {
+      EXPECT_EQ("Sqrt", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("max_pool", node.input(0));
+      ++required_node_count;
+    } else if (node.name() == "max_pool") {
+      EXPECT_EQ("MaxPool", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      ++required_node_count;
+    }
+  }
+  EXPECT_EQ(2, required_node_count);
+}
+
 TEST_F(ArithmeticOptimizerTest, UnaryOpsComposition) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..0358d7f540986518bf0b5879cd6dbec390fac5f5
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
@@ -0,0 +1,246 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ArithmeticOptimizerTest : public GrapplerTest {
+ protected:
+  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
+  // longer have any output consumers.
+  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                        GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                     GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  // Optionally run a constant folding pass before pruning.
+  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                             GraphDef* output, bool const_folding = false) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    if (const_folding) {
+      item->graph.Swap(output);
+      output->Clear();
+      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
+                       .Optimize(nullptr, *item, output));
+    }
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // TODO(ezhulenev): Make private. After migration to stages each test
+  // should explicitly enable required optimization for tests isolation
+  void DisableAllStages(ArithmeticOptimizer* optimizer) {
+    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.dedup_computations = false;
+    options.combine_add_to_addn = false;
+    options.convert_sqrt_div_to_rsqrt_mul = false;
+    options.convert_pow = false;
+    options.convert_log1p = false;
+    options.optimize_max_or_min_of_monotonic = false;
+    options.fold_conjugate_into_transpose = false;
+    options.fold_multiply_into_conv = false;
+    options.fold_transpose_into_matmul = false;
+    options.hoist_common_factor_out_of_aggregation = false;
+    options.hoist_cwise_unary_chains = false;
+    options.minimize_broadcasts = false;
+    options.remove_identity_transpose = false;
+    options.remove_involution = false;
+    options.remove_idempotent = false;
+    options.remove_redundant_bitcast = false;
+    options.remove_redundant_cast = false;
+    options.remove_redundant_reshape = false;
+    options.remove_negation = false;
+    options.remove_logical_not = false;
+    options.reorder_cast_like_and_value_preserving = false;
+    options.replace_mul_with_square = false;
+    options.simplify_aggregation = false;
+    options.unary_ops_composition = false;
+    optimizer->options_ = options;
+  }
+
+  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    optimizer->options_.combine_add_to_addn = false;
+  }
+
+  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.combine_add_to_addn = true;
+  }
+
+  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_conjugate_into_transpose = true;
+  }
+
+  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_multiply_into_conv = true;
+  }
+
+  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_transpose_into_matmul = true;
+  }
+
+  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
+  }
+
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
+  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_identity_transpose = true;
+  }
+
+  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_involution = true;
+  }
+
+  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_bitcast = true;
+  }
+
+  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_cast = true;
+  }
+
+  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_reshape = true;
+  }
+
+  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_negation = true;
+  }
+
+  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.reorder_cast_like_and_value_preserving = true;
+  }
+
+  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.replace_mul_with_square = true;
+  }
+
+  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_cwise_unary_chains = true;
+  }
+
+  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
+  }
+
+  void EnableOnlyLogSoftmax(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_log_softmax = true;
+  }
+
+  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_pow = true;
+  }
+
+  void EnableOnlyFuseSquaredDiff(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fuse_squared_diff = true;
+  }
+
+  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_idempotent = true;
+  }
+
+  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_logical_not = true;
+  }
+
+  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.simplify_aggregation = true;
+  }
+
+  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_log1p = true;
+  }
+
+  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.optimize_max_or_min_of_monotonic = true;
+  }
+
+  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_expm1 = true;
+  }
+
+  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.unary_ops_composition = true;
+  }
+
+  void EnableOnlyRemoveStackStridedSliceSameAxis(
+      ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_stack_strided_slice_same_axis = true;
+  }
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index b0c3c5b5181be4b744128fb18ac288c122c59f2a..bbd1250c9184238cc9d3573b4a02b05eced2ac7f 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 
+#include <cmath>
+
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -25,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -34,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -167,6 +173,55 @@ bool HasTPUAttributes(const NodeDef& node) {
   return false;
 }
 
+template <typename T>
+bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+
+template <>
+bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+
+template <>
+bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+
+float QuantizedTypeMinAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::lowest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::lowest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::lowest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::lowest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::lowest();
+    default:
+      return 0.0f;
+  }
+}
+
+float QuantizedTypeMaxAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::highest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::highest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::highest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::highest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::highest();
+    default:
+      return 0.0f;
+  }
+}
+
 }  // namespace
 
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
@@ -716,6 +771,71 @@ Status ConstantFolding::MaterializeReductionIndices(
   return Status::OK();
 }
 
+Status ConstantFolding::MaterializeConstantValuedNode(
+    NodeDef* node, const GraphProperties& properties) {
+  // Nodes that generate constant-valued outputs can be represented compactly in
+  // compressed format, regardless of their shape.
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) return Status::OK();
+  const auto& output_shape = output_props[0].shape();
+  if (!PartialTensorShape(output_shape).IsFullyDefined()) {
+    return Status::OK();
+  }
+  if (IsFill(*node)) {
+    const auto output_dtype = output_props[0].dtype();
+    NodeDef* input_node = nullptr;
+    for (int i = 0; i < 2; ++i) {
+      input_node = node_map_->GetNode(NodeName(node->input(i)));
+      if (input_node == nullptr || !IsReallyConstant(*input_node)) {
+        return Status::OK();
+      }
+    }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*input_node, "value"));
+
+    // Copy the input tensor to the fill node, set the output shape and data
+    // type, and change the node type to Const.
+    TensorProto* tensor = (*node->mutable_attr())["value"].mutable_tensor();
+    const TensorProto& input_tensor = input_node->attr().at("value").tensor();
+    if (!input_tensor.tensor_content().empty()) {
+      // Convert the value to repeated field format, so we can use the
+      // decompression mechanism to store only a single value in the constant
+      // node, even if the shape specified in the original Fill is large.
+      Tensor t;
+      if (!t.FromProto(input_tensor)) {
+        return errors::InvalidArgument(
+            "Could not construct Tensor form TensorProto in node: ",
+            input_node->name());
+      }
+      tensor->clear_tensor_content();
+      t.AsProtoField(tensor);
+    } else {
+      *tensor = input_tensor;
+    }
+    *(tensor->mutable_tensor_shape()) = output_shape;
+    (*node->mutable_attr())["dtype"].set_type(output_dtype);
+    node->mutable_attr()->erase("T");
+    node->mutable_attr()->erase("index_type");
+    node->set_op("Const");
+    for (int i = 0; i < 2; i++) {
+      // Change inputs to a control inputs.
+      const string ctrl_dep = AsControlDependency(node->input(i));
+      node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
+      node->set_input(i, ctrl_dep);
+    }
+    graph_modified_ = true;
+  } else {
+    double value =
+        (IsZerosLike(*node) ? 0.0 : (IsOnesLike(*node) ? 1.0 : -1.0));
+    bool success = false;
+    if (value >= 0) {
+      TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+          value, properties, output_shape, node, graph_, &success));
+    }
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::MaterializeConstants(
     const GraphProperties& properties) {
   const int node_count = graph_->node_size();
@@ -726,6 +846,8 @@ Status ConstantFolding::MaterializeConstants(
       TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
     } else if (IsReduction(node)) {
       TF_RETURN_IF_ERROR(MaterializeReductionIndices(&node, properties));
+    } else if (IsFill(node) || IsZerosLike(node) || IsOnesLike(node)) {
+      TF_RETURN_IF_ERROR(MaterializeConstantValuedNode(&node, properties));
     }
   }
   return Status::OK();
@@ -870,6 +992,11 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
       SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
       SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
@@ -897,6 +1024,42 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
   return dtype;
 }
 
+// Checks whether the shape of the const input of the Mul op is valid to perform
+// the MulConvPushDown optimization.
+bool IsValidConstShapeForMulConvPushDown(
+    const string& data_format, const TensorShapeProto& filter_shape,
+    const TensorShapeProto& mul_const_input_shape) {
+  // If the const is a scalar, or it has fewer or same number of dimensions
+  // than the filter and it only has single element, the optimization should
+  // work.
+  if (mul_const_input_shape.dim_size() <= data_format.size() &&
+      TensorShape(mul_const_input_shape).num_elements() == 1) {
+    return true;
+  }
+
+  // Otherwise, check the eligibility according to data format.
+  if (data_format == "NHWC" || data_format == "NDHWC") {
+    TensorShapeProto new_filter_shape;
+    if (!ShapeAfterBroadcast(filter_shape, mul_const_input_shape,
+                             &new_filter_shape)) {
+      return false;
+    }
+    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
+      return false;
+    }
+    // Only the last dimension could be larger than one, since broadcasting over
+    // the last dimension (the output channel) will result in invalid filter.
+    for (int i = 0; i < mul_const_input_shape.dim_size() - 1; ++i) {
+      if (mul_const_input_shape.dim(i).size() > 1) return false;
+    }
+    return true;
+  } else if (data_format == "NCHW" || data_format == "NCDHW") {
+    // TODO(laigd): support NCHW and NCDHW (b/111214513).
+    return false;
+  }
+  return false;
+}
+
 }  // namespace
 
 // static
@@ -917,29 +1080,28 @@ Status ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                  \
-  {                                                                   \
-    const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
-    TYPE last = *val_ptr;                                             \
-    int64 last_index = 0;                                             \
-    for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
-      TYPE cur = *val_ptr++;                                          \
-      if (cur != last) {                                              \
-        last = cur;                                                   \
-        last_index = i;                                               \
-      }                                                               \
-    }                                                                 \
-    if (last_index < kint32max) {                                     \
-      optimized = true;                                               \
-      encoded_size = (last_index + 1) * sizeof(NAME);                 \
-      t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
-      t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
-      val_ptr = tensor->flat<TYPE>().data();                          \
-      for (int64 i = 0; i <= last_index; ++i) {                       \
-        t->set_##NAME##_val(i, *val_ptr++);                           \
-      }                                                               \
-    }                                                                 \
-  }                                                                   \
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                      \
+  {                                                                       \
+    const auto* val_ptr = tensor->flat<TYPE>().data();                    \
+    auto last = *val_ptr;                                                 \
+    int64 last_index = 0;                                                 \
+    for (int64 i = 0; i < tensor->NumElements(); ++i) {                   \
+      TYPE cur = *val_ptr++;                                              \
+      if (PackedValuesNotEqual(cur, last)) {                              \
+        last = cur;                                                       \
+        last_index = i;                                                   \
+      }                                                                   \
+    }                                                                     \
+    if (last_index < kint32max) {                                         \
+      optimized = true;                                                   \
+      encoded_size = (last_index + 1) * sizeof(NAME);                     \
+      t->mutable_##NAME##_val()->Reserve(last_index + 1);                 \
+      const auto* src_ptr = tensor->flat<TYPE>().data();                  \
+      auto* dst_ptr =                                                     \
+          t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+      std::copy(src_ptr, src_ptr + last_index + 1, dst_ptr);              \
+    }                                                                     \
+  }                                                                       \
   break
 
     switch (tensor->dtype()) {
@@ -975,6 +1137,8 @@ Status ConstantFolding::CreateNodeDef(const string& name,
     t->set_dtype(tensor->dtype());
     tensor->shape().AsProto(t->mutable_tensor_shape());
   } else {
+    // DT_HALF, DT_BFLOAT16, DT_QINT32, DT_QINT16, DT_QUINT16, DT_QINT8,
+    // DT_QUINT8
     tensor->AsProtoTensorContent(t);
     encoded_size = t->tensor_content().size();
   }
@@ -1059,98 +1223,103 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   return Status::OK();
 }
 
-Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
-                                 bool* result_too_large) {
-  if (IsMerge(*node)) {
-    // Merge nodes are special, in the sense that they execute as soon as one of
-    // their input is ready. We can therefore fold a merge node iff it has at
-    // least one constant input without control dependency.
-    // We still need to ensure that the nodes in the fanin of the merge node are
-    // scheduled. We'll therefore add a control dependency from the merge node
-    // to the folded constant. We end up with:
-    //  * the merge node and its inputs are preserved as is
-    //  * a new constant node C1, driven by the merge node through a control
-    //  dependency, initialized to the value of the folded input
-    //  * a new constant node C2, driven by the merge node through a control
-    //  dependency, initialized to the index of the folded input
-    //  * the fanout of the merge nodes is rewired to be driven by either C1 or
-    //  C2.
-    for (int input_index = 0; input_index < node->input_size(); ++input_index) {
-      const auto& input = node->input(input_index);
-      if (IsControlInput(input)) {
-        // Try the next input.
-        continue;
-      }
-      NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsReallyConstant(*input_node)) {
-        continue;
-      }
-      bool valid_input = true;
-      for (const string& fanin_of_input : input_node->input()) {
-        if (IsControlInput(fanin_of_input)) {
-          valid_input = false;
-          break;
-        }
-      }
-      if (!valid_input) {
-        // Try the next input
-        continue;
+Status ConstantFolding::FoldMergeNode(NodeDef* node, GraphDef* output_graph) {
+  // Merge nodes are special, in the sense that they execute as soon as one of
+  // their input is ready. We can therefore fold a merge node iff it has at
+  // least one constant input without control dependency.
+  // We still need to ensure that the nodes in the fanin of the merge node are
+  // scheduled. We'll therefore add a control dependency from the merge node
+  // to the folded constant. We end up with:
+  //  * the merge node and its inputs are preserved as is
+  //  * a new constant node C1, driven by the merge node through a control
+  //  dependency, initialized to the value of the folded input
+  //  * a new constant node C2, driven by the merge node through a control
+  //  dependency, initialized to the index of the folded input
+  //  * the fanout of the merge nodes is rewired to be driven by either C1 or
+  //  C2.
+  for (int input_index = 0; input_index < node->input_size(); ++input_index) {
+    const auto& input = node->input(input_index);
+    if (IsControlInput(input)) {
+      // Try the next input.
+      continue;
+    }
+    NodeDef* input_node = node_map_->GetNode(input);
+    if (!IsReallyConstant(*input_node)) {
+      continue;
+    }
+    bool valid_input = true;
+    for (const string& fanin_of_input : input_node->input()) {
+      if (IsControlInput(fanin_of_input)) {
+        valid_input = false;
+        break;
       }
+    }
+    if (!valid_input) {
+      // Try the next input
+      continue;
+    }
 
-      string const_out_name = OptimizedNodeName(*node, "_const");
-      string const_index_name = OptimizedNodeName(*node, "_index");
-      if (node_map_->GetNode(const_out_name) ||
-          node_map_->GetNode(const_index_name)) {
-        // Intended name already exists.
-        return errors::AlreadyExists(
-            strings::StrCat(const_out_name, " or ", const_index_name,
-                            " already present in the graph"));
-      }
-
-      NodeDef* const_out = output_graph->add_node();
-      *const_out = *input_node;
-      const_out->set_name(const_out_name);
-      const_out->set_device(node->device());
-      *const_out->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_out->name(), const_out);
-      node_map_->AddOutput(node->name(), const_out->name());
-
-      NodeDef* const_index = output_graph->add_node();
-      const_index->set_op("Const");
-      Tensor index(DT_INT32, TensorShape({}));
-      index.flat<int32>()(0) = input_index;
-      (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
-      index.AsProtoTensorContent(
-          (*const_index->mutable_attr())["value"].mutable_tensor());
-      const_index->set_name(const_index_name);
-      const_index->set_device(node->device());
-      *const_index->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_index->name(), const_index);
-      node_map_->AddOutput(node->name(), const_index->name());
-
-      auto outputs = node_map_->GetOutputs(node->name());
-      for (NodeDef* output : outputs) {
-        for (int i = 0; i < output->input_size(); i++) {
-          int port;
-          string node_name = ParseNodeName(output->input(i), &port);
-          if (node_name == node->name()) {
-            if (port == 0) {
-              *output->mutable_input(i) = const_out->name();
-              node_map_->AddOutput(const_out->name(), output->name());
-            } else if (port == 1) {
-              *output->mutable_input(i) = const_index->name();
-              node_map_->AddOutput(const_index->name(), output->name());
-            } else {
-              // This is a control dependency (or an invalid edge since the
-              // merge node has only 2 inputs): preserve them.
-            }
+    string const_out_name = OptimizedNodeName(*node, "_const");
+    string const_index_name = OptimizedNodeName(*node, "_index");
+    if (node_map_->GetNode(const_out_name) ||
+        node_map_->GetNode(const_index_name)) {
+      // Intended name already exists.
+      return errors::AlreadyExists(
+          strings::StrCat(const_out_name, " or ", const_index_name,
+                          " already present in the graph"));
+    }
+
+    NodeDef* const_out = output_graph->add_node();
+    *const_out = *input_node;
+    const_out->set_name(const_out_name);
+    const_out->set_device(node->device());
+    *const_out->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_out->name(), const_out);
+    node_map_->AddOutput(node->name(), const_out->name());
+
+    NodeDef* const_index = output_graph->add_node();
+    const_index->set_op("Const");
+    Tensor index(DT_INT32, TensorShape({}));
+    index.flat<int32>()(0) = input_index;
+    (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
+    index.AsProtoTensorContent(
+        (*const_index->mutable_attr())["value"].mutable_tensor());
+    const_index->set_name(const_index_name);
+    const_index->set_device(node->device());
+    *const_index->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_index->name(), const_index);
+    node_map_->AddOutput(node->name(), const_index->name());
+
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (NodeDef* output : outputs) {
+      for (int i = 0; i < output->input_size(); i++) {
+        int port;
+        string node_name = ParseNodeName(output->input(i), &port);
+        if (node_name == node->name()) {
+          if (port == 0) {
+            *output->mutable_input(i) = const_out->name();
+            node_map_->AddOutput(const_out->name(), output->name());
+          } else if (port == 1) {
+            *output->mutable_input(i) = const_index->name();
+            node_map_->AddOutput(const_index->name(), output->name());
+          } else {
+            // This is a control dependency (or an invalid edge since the
+            // merge node has only 2 inputs): preserve them.
           }
         }
       }
-      return Status::OK();
     }
     return Status::OK();
   }
+  return Status::OK();
+}
+
+Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
+                                 bool* result_too_large) {
+  *result_too_large = false;
+  if (IsMerge(*node)) {
+    return FoldMergeNode(node, output_graph);
+  }
 
   std::vector<NodeDef> const_nodes;
   TF_RETURN_IF_ERROR(
@@ -1395,7 +1564,8 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "OnesLike") return true;
+  if (IsOnesLike(node)) return true;
+  if (IsZerosLike(node)) return false;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsOnes(*values);
@@ -1417,6 +1587,11 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
     IS_ONES_CASE(DT_INT16);
     IS_ONES_CASE(DT_INT32);
     IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_QINT32);
+    IS_ONES_CASE(DT_QINT16);
+    IS_ONES_CASE(DT_QUINT16);
+    IS_ONES_CASE(DT_QINT8);
+    IS_ONES_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1428,7 +1603,8 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "ZerosLike") return true;
+  if (IsOnesLike(node)) return false;
+  if (IsZerosLike(node)) return true;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsZeros(*values);
@@ -1450,6 +1626,11 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
     IS_ZEROS_CASE(DT_INT16);
     IS_ZEROS_CASE(DT_INT32);
     IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_QINT32);
+    IS_ZEROS_CASE(DT_QINT16);
+    IS_ZEROS_CASE(DT_QUINT16);
+    IS_ZEROS_CASE(DT_QINT8);
+    IS_ZEROS_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1562,6 +1743,7 @@ Status ConstantFolding::ReplaceOperationWithConstant(
     node->set_input(i, ctrl_dep);
   }
   *success = true;
+  graph_modified_ = true;
   return Status::OK();
 }
 
@@ -2458,6 +2640,7 @@ Status ConstantFolding::SimplifyArithmeticOperations(
   *success = false;
   const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
   const bool is_matmul = IsMatMul(*node);
+  const bool is_quantized_matmul = IsQuantizedMatMul(*node);
   const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
   const bool is_sub = IsSub(*node);
   const bool is_any_div = IsAnyDiv(*node);
@@ -2552,6 +2735,10 @@ Status ConstantFolding::SimplifyArithmeticOperations(
         if (!replace_op_status.ok()) {
           return replace_op_status;
         } else if (replace_succeed) {
+          if (is_quantized_matmul) {
+            TF_RETURN_IF_ERROR(
+                AddQuantizedMatMulMinMaxOutConstNodes(node, optimized_graph));
+          }
           *success = true;
           return Status::OK();
         }
@@ -2715,118 +2902,110 @@ bool ConstantFolding::MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
   //                 X  C1                       C1  C2
   //
   // where C1 and C2 are constants and X is non-constant.
-  if (IsMul(*node) && NumNonControlInputs(*node) == 2) {
-    NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
-    NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
-    // One child must be constant, and the second must be Conv op.
-    const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
-    const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
-    if (!left_child_is_constant && !right_child_is_constant) {
-      return false;
-    }
-    NodeDef* conv_node =
-        left_child_is_constant ? mul_right_child : mul_left_child;
-    if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
-      return false;
-    }
-    if (node->device() != mul_left_child->device() ||
-        node->device() != mul_right_child->device()) {
-      return false;
-    }
-
-    // Make sure that it is safe to change the value of the convolution
-    // output.
-    if (conv_node->input_size() < 2 ||
-        NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
-        nodes_to_preserve_.find(conv_node->name()) !=
-            nodes_to_preserve_.end()) {
-      return false;
-    }
-
-    // Identify the nodes to swap.
-    NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
-    NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
-    const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
-    const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
-    if (!conv_left_is_constant && !conv_right_is_constant) {
-      // At least one of the convolution inputs should be constant.
-      return false;
-    }
-    if (conv_left_is_constant && conv_right_is_constant) {
-      // Leverage regular constant folding to handle this.
-      return false;
-    }
-    const auto& mul_props = properties.GetOutputProperties(node->name());
-    const auto& conv_props = properties.GetOutputProperties(conv_node->name());
-    if (mul_props.empty() || conv_props.empty()) {
-      return false;
-    }
-    const auto& mul_shape = mul_props[0].shape();
-    const auto& conv_shape = conv_props[0].shape();
-    if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
-      return false;
-    }
-
-    const auto& input_props = properties.GetInputProperties(conv_node->name());
-    if (input_props.size() < 2) {
-      return false;
-    }
-    const auto& filter_shape = input_props[1].shape();
+  if (!IsMul(*node) || NumNonControlInputs(*node) != 2) return false;
+
+  NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
+  NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
+  // One child must be constant, and the second must be Conv op.
+  const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
+  const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
+  if (!left_child_is_constant && !right_child_is_constant) {
+    return false;
+  }
+  NodeDef* conv_node =
+      left_child_is_constant ? mul_right_child : mul_left_child;
+  if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
+    return false;
+  }
+  if (node->device() != mul_left_child->device() ||
+      node->device() != mul_right_child->device()) {
+    return false;
+  }
 
-    NodeDef* const_node =
-        left_child_is_constant ? mul_left_child : mul_right_child;
-    const auto& const_props =
-        properties.GetOutputProperties(const_node->name());
-    if (const_props.empty()) {
-      return false;
-    }
-    const auto& const_shape = const_props[0].shape();
+  // Make sure that it is safe to change the value of the convolution
+  // output.
+  if (conv_node->input_size() < 2 ||
+      NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
+      nodes_to_preserve_.find(conv_node->name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
 
-    TensorShapeProto new_filter_shape;
-    if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
-      return false;
-    }
-    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
-      return false;
-    }
+  // Identify the nodes to swap.
+  NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
+  NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
+  const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
+  const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
+  if (!conv_left_is_constant && !conv_right_is_constant) {
+    // At least one of the convolution inputs should be constant.
+    return false;
+  }
+  if (conv_left_is_constant && conv_right_is_constant) {
+    // Leverage regular constant folding to handle this.
+    return false;
+  }
+  const auto& mul_props = properties.GetOutputProperties(node->name());
+  const auto& conv_props = properties.GetOutputProperties(conv_node->name());
+  if (mul_props.empty() || conv_props.empty()) {
+    return false;
+  }
+  const auto& mul_shape = mul_props[0].shape();
+  const auto& conv_shape = conv_props[0].shape();
+  if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
+    return false;
+  }
 
-    string mul_new_name =
-        AddPrefixToNodeName("merged_input", conv_node->name());
-    if (node_map_->NodeExists(mul_new_name)) {
-      return false;
-    }
-    // Make sure we don't introduce loops in the graph by removing control
-    // dependencies from the conv2d node to c2.
-    string conv_const_input =
-        conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
-    if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
-                                node_map_.get())) {
-      // Add a control dep from c1 to c2 to ensure c2 is in the right frame
-      MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
-                           node_map_.get());
-    }
+  const auto& input_props = properties.GetInputProperties(conv_node->name());
+  if (input_props.size() < 2) {
+    return false;
+  }
+  const auto& filter_shape = input_props[1].shape();
 
-    conv_node->set_name(node->name());
-    node->set_name(mul_new_name);
-    if (conv_left_is_constant) {
-      node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
-      conv_node->set_input(0, mul_new_name);
-    } else {
-      node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
-      conv_node->set_input(1, mul_new_name);
-    }
-    NodeDef* conv_const_node =
-        conv_left_is_constant ? conv_left_child : conv_right_child;
-    if (left_child_is_constant) {
-      node->set_input(1, conv_const_node->name());
-    } else {
-      node->set_input(0, conv_const_node->name());
-    }
-    node_map_->AddNode(mul_new_name, node);
+  NodeDef* const_node =
+      left_child_is_constant ? mul_left_child : mul_right_child;
+  const auto& const_props = properties.GetOutputProperties(const_node->name());
+  if (const_props.empty()) {
+    return false;
+  }
+  const auto& const_shape = const_props[0].shape();
+  if (!IsValidConstShapeForMulConvPushDown(
+          conv_node->attr().at("data_format").s(), filter_shape, const_shape)) {
+    return false;
+  }
 
-    return true;
+  string mul_new_name = AddPrefixToNodeName("merged_input", conv_node->name());
+  if (node_map_->NodeExists(mul_new_name)) {
+    return false;
   }
-  return false;
+  // Make sure we don't introduce loops in the graph by removing control
+  // dependencies from the conv2d node to c2.
+  string conv_const_input =
+      conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
+  if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
+                              node_map_.get())) {
+    // Add a control dep from c1 to c2 to ensure c2 is in the right frame
+    MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
+                         node_map_.get());
+  }
+
+  conv_node->set_name(node->name());
+  node->set_name(mul_new_name);
+  if (conv_left_is_constant) {
+    node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
+    conv_node->set_input(0, mul_new_name);
+  } else {
+    node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
+    conv_node->set_input(1, mul_new_name);
+  }
+  NodeDef* conv_const_node =
+      conv_left_is_constant ? conv_left_child : conv_right_child;
+  if (left_child_is_constant) {
+    node->set_input(1, conv_const_node->name());
+  } else {
+    node->set_input(0, conv_const_node->name());
+  }
+  node_map_->AddNode(mul_new_name, node);
+
+  return true;
 }
 
 bool ConstantFolding::PartialConstPropThroughIdentityN(NodeDef* node) {
@@ -3127,6 +3306,65 @@ bool ConstantFolding::MergeConcat(const GraphProperties& properties,
   return true;
 }
 
+Status ConstantFolding::AddQuantizedMatMulMinMaxOutConstNodes(
+    NodeDef* node, GraphDef* optimized_graph) {
+  auto add_quantized_out = [this, node, optimized_graph](
+                               const string& out_const_name, int index) {
+    NodeDef* out_node = optimized_graph->add_node();
+    Tensor value(DT_FLOAT, TensorShape({}));
+    const bool is_min = index == 1;
+    const DataType type_attr = node->attr().at("dtype").type();
+
+    value.flat<float>()(0) = is_min ? QuantizedTypeMinAsFloat(type_attr)
+                                    : QuantizedTypeMaxAsFloat(type_attr);
+    TF_RETURN_IF_ERROR(
+        CreateNodeDef(out_const_name, TensorValue(&value), out_node));
+    node_map_->AddNode(out_const_name, out_node);
+    out_node->set_device(node->device());
+
+    // Copy all inputs from node.
+    out_node->mutable_input()->CopyFrom(node->input());
+    for (const string& input : out_node->input()) {
+      node_map_->AddOutput(NodeName(input), out_const_name);
+    }
+
+    // Update output nodes consuming node:index to new const node.
+    string old_input = absl::StrCat(node->name(), ":", index);
+    int old_node_count = 0;
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (const auto& output : outputs) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        if (output->input(i) == old_input) {
+          output->set_input(i, out_const_name);
+          node_map_->AddOutput(out_const_name, output->name());
+        } else if (NodeName(output->input(i)) == node->name()) {
+          ++old_node_count;
+        }
+      }
+      if (old_node_count == 0) {
+        node_map_->RemoveOutput(node->name(), output->name());
+      }
+    }
+
+    return Status::OK();
+  };
+  const string min_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_min_out");
+  const string max_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_max_out");
+  if (node_map_->GetNode(min_out_const_name) == nullptr &&
+      node_map_->GetNode(max_out_const_name) == nullptr) {
+    TF_RETURN_IF_ERROR(add_quantized_out(min_out_const_name, 1));
+    TF_RETURN_IF_ERROR(add_quantized_out(max_out_const_name, 2));
+  } else {
+    return errors::Internal(absl::Substitute(
+        "Can't create Const for QuantizedMatMul min_out/max_out of "
+        "node '$0' because of node name conflict",
+        node->name()));
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* optimized_graph) {
@@ -3167,6 +3405,20 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   return Status::OK();
 }
 
+namespace {
+Status CompressConstants(GraphDef* graph) {
+  for (int i = 0; i < graph->node_size(); ++i) {
+    NodeDef* node = graph->mutable_node(i);
+    if ((IsConstant(*node) || IsHostConstant(*node)) &&
+        HasNodeAttr(*node, "value")) {
+      AttrValue& attr_val = (*node->mutable_attr())["value"];
+      tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor());
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
   // TensorFlow flushes denormals to zero and rounds to nearest, so we do
@@ -3205,6 +3457,7 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
     TF_RETURN_IF_ERROR(
         RunOptimizationPass(cluster, item_to_optimize, optimized_graph));
   } while (graph_modified_ || optimized_graph->node_size() != node_count);
+  TF_RETURN_IF_ERROR(CompressConstants(optimized_graph));
   *optimized_graph->mutable_library() = item.graph.library();
   *optimized_graph->mutable_versions() = item.graph.versions();
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 99200925cb351478bd188361c33b88634caffa26..418176c8932639f4f8bbef8f636c33b56d36f1c2 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -67,8 +67,10 @@ class ConstantFolding : public GraphOptimizer {
                                           const GraphProperties& properties);
   Status MaterializeReductionIndices(NodeDef* node,
                                      const GraphProperties& properties);
-
+  Status MaterializeConstantValuedNode(NodeDef* node,
+                                       const GraphProperties& properties);
   Status MaterializeConstants(const GraphProperties& properties);
+
   bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
@@ -78,6 +80,7 @@ class ConstantFolding : public GraphOptimizer {
   Status EvaluateOneFoldable(const NodeDef& node, std::vector<NodeDef>* outputs,
                              bool* result_too_large);
 
+  Status FoldMergeNode(NodeDef* node, GraphDef* output_graph);
   Status FoldNode(NodeDef* node, GraphDef* output_graph,
                   bool* result_too_large);
 
@@ -233,6 +236,9 @@ class ConstantFolding : public GraphOptimizer {
   bool MergeConcat(const GraphProperties& properties, bool use_shape_info,
                    GraphDef* optimized_graph, NodeDef* node);
 
+  Status AddQuantizedMatMulMinMaxOutConstNodes(NodeDef* node,
+                                               GraphDef* optimized_graph);
+
   // Points to an externally provided device or to owned_device_;
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index d7cabf5a8b8ad6659937e868df7635292936d48c..373e3f0eb164d2d7a047dd33a3c57fec88d99279 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tensor_coding.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -119,6 +120,100 @@ class ConstantFoldingTest : public GrapplerTest {
       }
     }
   }
+
+  void MulConvPushDownTest(const TensorShape& input_shape,
+                           const TensorShape& filter_shape,
+                           const TensorShape& mul_const_input_shape,
+                           const bool use_3d_conv, const char* padding,
+                           const char* data_format, const bool expect_folded) {
+    // Tests if the following rewrite is performed:
+    //
+    //         *                       Conv2D
+    //        / \                       / \
+    //       c  Conv2D        -->      x  (c * filter)
+    //           / \
+    //          x  filter
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+    Tensor filter_values(DT_FLOAT, filter_shape);
+    for (int i = 0; i < filter_values.NumElements(); ++i) {
+      filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
+    }
+    Output filter =
+        ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
+
+    Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                    ops::Placeholder::Shape(input_shape));
+
+    Output conv;
+    if (use_3d_conv) {
+      conv = ops::Conv3D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1, 1},
+                         padding, ops::Conv3D::DataFormat(data_format));
+    } else {
+      conv = ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1},
+                         padding, ops::Conv2D::DataFormat(data_format));
+    }
+    Tensor mul_const_input(DT_FLOAT, mul_const_input_shape);
+    for (int i = 0; i < mul_const_input.NumElements(); ++i) {
+      mul_const_input.flat<float>()(i) = static_cast<float>(i + 3);
+    }
+    Output c =
+        ops::Const(s.WithOpName("c"), Input::Initializer(mul_const_input));
+    Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(5, output.node_size());
+    int found = 0;
+    if (expect_folded) {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("conv/merged_input", node.input(1));
+        } else if (node.name() == "conv/merged_input") {
+          found++;
+          EXPECT_EQ("Const", node.op());
+          EXPECT_EQ(0, node.input_size());
+        }
+      }
+    } else {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ("Mul", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("c", node.input(0));
+          EXPECT_EQ("conv", node.input(1));
+        } else if (node.name() == "conv") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("filter", node.input(1));
+        }
+      }
+    }
+    EXPECT_EQ(2, found);
+
+    // Check that const folded multiplication node has the expected value.
+    std::vector<string> fetch = {"mul"};
+    Tensor value(DT_FLOAT, input_shape);
+    for (int i = 0; i < value.NumElements(); ++i) {
+      value.flat<float>()(i) = i;
+    }
+    auto actual = EvaluateNodes(output, fetch, {{"x", value}});
+    auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
+    test::ExpectTensorEqual<float>(expected[0], actual[0]);
+  }
 };
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
@@ -242,73 +337,147 @@ TEST_F(ConstantFoldingTest, AddTree) {
   }
 }
 
-TEST_F(ConstantFoldingTest, ConvPushDownTest) {
-  // Tests if the following rewrite is performed:
-  //
-  //         *                       Conv2D
-  //        / \                       / \
-  //       c  Conv2D        -->      x  (c * filter)
-  //           / \
-  //          x  filter
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/true);
+  }
+}
 
-  int input_depth = 3;
-  int filter_count = 5;
-  int filter_size = 2;
-  TensorShape filter_shape(
-      {filter_size, filter_size, input_depth, filter_count});
-  Tensor filter_values(DT_FLOAT, filter_shape);
-  for (int i = 0; i < filter_values.NumElements(); ++i) {
-    filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
-  }
-  Output filter =
-      ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
-
-  int batch_size = 4;
-  int input_dim = 10;
-  TensorShape input_shape({batch_size, input_dim, input_dim, input_depth});
-  Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
-                                  ops::Placeholder::Shape(input_shape));
-
-  Output conv =
-      ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1}, "VALID");
-  Output c = ops::Const(s.WithOpName("c"), 3.0f, {1});
-  Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    for (auto mul_const_input_shape :
+         {TensorShape{1}, TensorShape{1, 1, 1, 1}}) {
+      MulConvPushDownTest(
+          /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                                : TensorShape{4, 3, 10, 10},
+          /*filter_shape=*/{2, 2, 3, 5}, mul_const_input_shape,
+          /*use_3d_conv=*/false,
+          /*padding=*/"VALID", data_format.c_str(),
+          /*expect_folded=*/true);
+    }
+  }
+}
 
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+TEST_F(ConstantFoldingTest,
+       MulConvPushDownTest_Conv2D_SingletonConst_ShapeMismatch) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{1, 1, 1, 1, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/false);
+  }
+}
 
-  ConstantFolding optimizer(/*cpu_device=*/nullptr);
-  GraphDef output;
-  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
-  TF_EXPECT_OK(status);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1x3Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1, 3},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
+  }
+}
 
-  EXPECT_EQ(5, output.node_size());
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "mul") {
-      found++;
-      EXPECT_EQ("Conv2D", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("conv/merged_input", node.input(1));
-    } else if (node.name() == "conv/merged_input") {
-      found++;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(0, node.input_size());
-    }
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NHWC_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{1, 3}, TensorShape{1, 1, 1, 3}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NHWC",
+        /*expect_folded=*/true);
   }
-  EXPECT_EQ(2, found);
+}
 
-  // Check that const folded multiplication node has the expected value.
-  std::vector<string> fetch = {"mul"};
-  Tensor value(DT_FLOAT, input_shape);
-  for (int i = 0; i < value.NumElements(); ++i) {
-    value.flat<float>()(i) = i;
+#if GOOGLE_CUDA
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NCHW_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{3, 1, 1}, TensorShape{1, 3, 1, 1}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NCHW",
+        // TODO(laigd): optimization should happen in this case.
+        /*expect_folded=*/false);
+  }
+}
+#endif  // GOOGLE_CUDA
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
   }
-  auto actual = EvaluateNodes(output, fetch, {{"x", value}});
-  auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
-  test::ExpectTensorEqual<float>(expected[0], actual[0]);
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NDHWC_1x1x3Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{1, 1, 3},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      /*expect_folded=*/true);
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NCDHW_3x1x1x1Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{3, 1, 1, 1},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      // TODO(laigd): optimization should happen in this case.
+      /*expect_folded=*/false);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement) {
@@ -378,7 +547,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     const string ones_name = strings::StrCat("ones", suffix);
     const string ctrl_zeros_name = strings::StrCat("^zeros", suffix);
     const string ctrl_ones_name = strings::StrCat("^ones", suffix);
-    EXPECT_EQ(27, output.node_size());
+    EXPECT_EQ(const_type == kFill ? 31 : 27, output.node_size());
     for (int i = 0; i < output.node_size(); ++i) {
       const NodeDef& node = output.node(i);
       const string& name = node.name();
@@ -3466,6 +3635,170 @@ TEST_F(ConstantFoldingCastConstTest, CastConstFolding) {
   }
 }
 
+TEST_F(ConstantFoldingTest, MaterializeConstantValuedNode) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output x =
+      ops::Placeholder(scope.WithOpName("x"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({1, 2, 3, 4})));
+  Output ones_like = ops::OnesLike(scope.WithOpName("ones_like"), x);
+  Output zeros_like = ops::ZerosLike(scope.WithOpName("zeros_like"), x);
+  Output fill = ops::Fill(scope.WithOpName("fill"), {4, 3, 2, 1}, 42);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"ones_like", "zeros_like", "fill"};
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 2, 3, 4}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.node_size(), 6);
+  for (const auto& node : output.node()) {
+    if (node.name() != "x") {
+      EXPECT_EQ(node.op(), "Const");
+    }
+    if (node.name() == "ones_like" || node.name() == "zeros_like") {
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "^x");
+    }
+    if (node.name() == "fill") {
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0)[0], '^');
+      EXPECT_EQ(node.input(1)[0], '^');
+    }
+  }
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  ASSERT_EQ(item.fetch.size(), tensors.size());
+  ASSERT_EQ(tensors_expected.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    if (item.fetch[i] == "fill") {
+      test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+    } else {
+      test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, MaterializeConstantValuedNodeHugeFill) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output value = ops::Const(scope.WithOpName("value"), 42, {});
+  Output fill_huge = ops::Fill(scope.WithOpName("fill_huge"),
+                               {1024, 1024, 1024, 1024, 1024}, value);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  // Manually convert the input value format to tensor_content to test this
+  // case.
+  NodeDef* node = item.graph.mutable_node(0);
+  ASSERT_EQ(node->name(), "value");
+  TensorProto* t = (*node->mutable_attr())["value"].mutable_tensor();
+  t->clear_int_val();
+  int val = 42;
+  port::CopyFromArray(t->mutable_tensor_content(),
+                      reinterpret_cast<const char*>(&val), sizeof(int));
+  item.fetch = {"fill_huge"};
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.node_size(), 3);
+  for (const auto& node : output.node()) {
+    EXPECT_EQ(node.op(), "Const");
+    if (node.name() == "fill_huge") {
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0)[0], '^');
+      EXPECT_EQ(node.input(1)[0], '^');
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, BitcastDenormalFloats) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Tensor x_t(DT_INT64, TensorShape({2, 2}));
+  x_t.flat<int64>()(0) = 9223372036854775807L;
+  x_t.flat<int64>()(1) = 1L;
+  x_t.flat<int64>()(2) = 9223372036854775807L;
+  x_t.flat<int64>()(3) = 1L;
+  Output x = ops::Const(scope.WithOpName("x"), x_t);
+  Output y = ops::Bitcast(scope.WithOpName("y"), x, DT_FLOAT);
+  Output z = ops::Bitcast(scope.WithOpName("z"), y, DT_INT64);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"z"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  ASSERT_EQ(output.node_size(), 1);
+  const NodeDef& node = output.node(0);
+  EXPECT_EQ(node.name(), "z");
+  EXPECT_EQ(node.op(), "Const");
+
+  auto tensors = EvaluateNodes(output, item.fetch, {});
+  ASSERT_EQ(tensors.size(), 1);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  test::ExpectTensorEqual<int64>(tensors[0], tensors_expected[0]);
+}
+
+TEST_F(ConstantFoldingTest, CompressConstants) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Tensor zeros_t(DT_FLOAT, TensorShape({64}));
+  Tensor ones_t(DT_FLOAT, TensorShape({64}));
+  for (int i = 0; i < 64; ++i) {
+    zeros_t.flat<float>()(i) = 0.0f;
+    ones_t.flat<float>()(i) = 1.0f;
+  }
+  Output zeros = ops::Const(scope.WithOpName("zeros"), zeros_t);
+  Output host_ones = ops::Const(scope.WithOpName("host_ones"), ones_t);
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  ASSERT_EQ(item.graph.node(1).name(), "host_ones");
+  // There is not C++ api for HostConst, so we manually change the node type
+  // here.
+  item.graph.mutable_node(1)->set_op("HostConst");
+  item.fetch = {"zeros", "host_ones"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(/*cluster=*/nullptr, item, &output));
+
+  {
+    ASSERT_EQ(output.node_size(), 2);
+    const NodeDef& node = output.node(0);
+    EXPECT_EQ(node.name(), "zeros");
+    EXPECT_EQ(node.op(), "Const");
+    const TensorProto& zeroes_t = node.attr().at("value").tensor();
+    EXPECT_EQ(zeroes_t.float_val_size(), 1);
+    EXPECT_EQ(zeroes_t.float_val(0), 0.0f);
+  }
+  {
+    const NodeDef& node = output.node(1);
+    EXPECT_EQ(node.name(), "host_ones");
+    EXPECT_EQ(node.op(), "HostConst");
+    const TensorProto& ones_t = node.attr().at("value").tensor();
+    EXPECT_EQ(ones_t.float_val_size(), 1);
+    EXPECT_EQ(ones_t.float_val(0), 1.0f);
+  }
+
+  auto tensors = EvaluateNodes(output, item.fetch, {});
+  ASSERT_EQ(tensors.size(), 2);
+  ASSERT_EQ(tensors_expected.size(), 2);
+  for (int i = 0; i < 2; ++i) {
+    test::ExpectTensorEqual<float>(tensors[i], tensors_expected[i]);
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index fe59dfef159c1c63a58f781a3666059bd954a4bd..49f36fc7c556d998c8390f6d7e913d25b8784363 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -6,6 +6,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 package(default_visibility = [
     "//tensorflow/core/grappler/optimizers/data:__subpackages__",
     "//tensorflow/core/kernels/data:__pkg__",
+    "//tensorflow/core/kernels/data/experimental:__pkg__",
 ])
 
 cc_library(
@@ -28,6 +29,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "auto_shard",
+    srcs = ["auto_shard.cc"],
+    hdrs = ["auto_shard.h"],
+    deps = [
+        ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler/utils:functions",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
 cc_library(
     name = "filter_fusion",
     srcs = ["filter_fusion.cc"],
@@ -84,6 +106,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/kernels:control_flow_ops",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
@@ -125,7 +148,9 @@ tf_cc_test(
     srcs = ["function_utils_test.cc"],
     deps = [
         ":function_utils",
+        ":graph_utils",
         "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -133,7 +158,7 @@ tf_cc_test(
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/tools/graph_transforms:transform_utils",
-    ],
+    ] + tf_protos_all(),
 )
 
 cc_library(
@@ -422,6 +447,7 @@ tf_cc_test(
         ":graph_test_utils",
         ":graph_utils",
         ":map_fusion",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -438,6 +464,7 @@ cc_library(
         "map_parallelization.h",
     ],
     deps = [
+        ":function_utils",
         ":graph_utils",
         ":optimizer_base",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -478,6 +505,7 @@ cc_library(
         ":optimizer_base",
         ":vectorization_utils",
         "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
@@ -494,13 +522,23 @@ tf_cc_test(
     name = "map_vectorization_test",
     srcs = ["map_vectorization_test.cc"],
     deps = [
+        ":function_utils",
         ":graph_utils",
         ":map_vectorization",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels/data",
     ],
 )
 
@@ -525,6 +563,24 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "rebatch",
+    srcs = ["rebatch.cc"],
+    hdrs = ["rebatch.h"],
+    deps = [
+        ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core:lib",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
 cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
@@ -566,7 +622,7 @@ cc_library(
         "optimizer_base.h",
     ],
     deps = [
-        "//tensorflow/core:metrics",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
     ],
 )
@@ -645,12 +701,24 @@ tf_cc_test(
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         # For ops we need registered
         "//tensorflow/core/kernels/data:dataset_ops",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
+        "//tensorflow/core:logging_ops_op_lib",
         "//tensorflow/core/kernels:math",
+        "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core/kernels:nn",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels:parsing",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/auto_shard.cc b/tensorflow/core/grappler/optimizers/data/auto_shard.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61f76316ec1e7848b2d509cae90aee5fa1a01110
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/auto_shard.cc
@@ -0,0 +1,300 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/auto_shard.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// clang-format off
+constexpr char kShardDatasetOpName[] = "ShardDataset";
+constexpr char kShuffleDatasetOpName[] = "ShuffleDataset";
+
+constexpr std::array<const char*, 4> kReaderDatasetOps = {
+    "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2",
+    "TextLineDataset",
+    "TFRecordDataset"
+};
+
+constexpr std::array<const char*, 2> kMultipleInputsDatasetOps = {
+    "ConcatenateDataset",
+    "ZipDataset"
+};
+
+constexpr std::array<const char*, 22> kPassThroughOps = {
+    "BatchDataset",
+    "BatchDatasetV2",
+    "ExperimentalMapAndBatchDataset",
+    "PaddedBatchDataset",
+    "PaddedBatchDatasetV2",
+    "CacheDataset",
+    "FilterDataset",
+    "FilterByLastComponentDataset",
+    "Identity",
+    "MapDataset",
+    "ModelDataset",
+    "OptimizeDataset",
+    "ParallelMapDataset",
+    "PrefetchDataset",
+    "ReduceDataset",
+    "RepeatDataset",
+    "ShardDataset",
+    "ShuffleAndRepeatDataset",
+    "ShuffleDataset",
+    "SkipDataset",
+    "TakeDataset",
+    "WindowDataset"
+};
+
+// TODO(frankchn): Process functions within kFuncDatasetOps as well.
+constexpr std::array<const char*, 4> kFuncDatasetOps = {
+    "ExperimentalParallelInterleaveDataset",
+    "FlatMapDataset",
+    "InterleaveDataset",
+    "ParallelInterleaveDatasetV2"
+};
+
+constexpr std::array<const char*, 5> kUnshardableSourceDatasetOps = {
+    "GeneratorDataset",
+    "RangeDataset",
+    "SparseTensorsSliceDataset",
+    "TensorDataset",
+    "TensorSliceDataset",
+};
+// clang-format on
+
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, int64 index,
+                     GraphDef* output);
+
+template <std::size_t SIZE>
+bool IsDatasetNodeOfType(const NodeDef& node,
+                         const std::array<const char*, SIZE>& arr) {
+  for (const auto& dataset_op_name : arr) {
+    if (node.op() == dataset_op_name) return true;
+  }
+  return false;
+}
+
+Status AddShardNode(MutableGraphView* graph, const NodeDef& add_before,
+                    int64 num_workers, int64 index) {
+  NodeDef new_node;
+  new_node.set_op(kShardDatasetOpName);
+  graph_utils::SetUniqueGraphNodeName(kShardDatasetOpName, graph->graph(),
+                                      &new_node);
+
+  // Construct argument nodes
+  NodeDef* num_shards_node =
+      graph_utils::AddScalarConstNode<int64>(num_workers, graph);
+  NodeDef* index_node = graph_utils::AddScalarConstNode<int64>(index, graph);
+
+  // Add inputs to new node
+  new_node.add_input(add_before.input(0));
+  new_node.add_input(num_shards_node->name());
+  new_node.add_input(index_node->name());
+
+  // Add shapes and other attributes
+  NodeDef* add_after = graph->GetNode(add_before.input(0));
+  graph_utils::CopyAttribute("output_shapes", *add_after, &new_node);
+
+  if (add_after->attr().find("Toutput_types") != add_after->attr().end()) {
+    (*(new_node.mutable_attr()))["output_types"] =
+        add_after->attr().at("Toutput_types");
+  } else {
+    graph_utils::CopyAttribute("output_types", *add_after, &new_node);
+  }
+
+  // Add new node into graph and update edges
+  NodeDef* new_node_graph = graph->AddNode(std::move(new_node));
+  TF_RETURN_IF_ERROR(
+      graph->UpdateFanouts(add_after->name(), new_node_graph->name()));
+
+  return Status::OK();
+}
+
+bool ReaderOpInFunction(const NodeDef& node,
+                        const FunctionLibraryDefinition& flib) {
+  const FunctionDef* func = flib.Find(node.attr().at("f").func().name());
+  for (int i = 0; i < func->node_def_size(); i++) {
+    NodeDef node_in_func = func->node_def(i);
+    if (IsDatasetNodeOfType(node_in_func, kReaderDatasetOps) &&
+        node_in_func.input_size() > 0 &&
+        str_util::StartsWith(node_in_func.input(0), "args_0")) {
+      return true;
+    }
+    if (IsDatasetNodeOfType(func->node_def(i), kFuncDatasetOps) &&
+        ReaderOpInFunction(func->node_def(i), flib)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Status RemoveShuffleDataset(MutableGraphView* graph, const NodeDef& node,
+                            absl::flat_hash_set<string>* nodes_to_delete) {
+  if (node.op() == kShuffleDatasetOpName) {
+    TF_RETURN_IF_ERROR(graph->UpdateFanouts(node.name(), node.input(0)));
+    nodes_to_delete->insert(node.name());
+  }
+
+  for (const auto& fanin : graph->GetFanins(node, true)) {
+    TF_RETURN_IF_ERROR(
+        RemoveShuffleDataset(graph, *fanin.node, nodes_to_delete));
+  }
+
+  // TODO(frankchn): Traverse functions too.
+  return Status::OK();
+}
+
+Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, int64 index,
+                           FunctionLibraryDefinition* flib,
+                           MutableGraphView* graph,
+                           absl::flat_hash_set<string>* nodes_to_delete) {
+  if (IsDatasetNodeOfType(node, kUnshardableSourceDatasetOps)) {
+    return errors::NotFound("Found an unshardable source dataset: ",
+                            node.DebugString());
+  }
+
+  if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
+    for (int i = 0; i < node.input_size(); ++i) {
+      const NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i);
+      TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_workers, index,
+                                             flib, graph, nodes_to_delete));
+    }
+    return Status::OK();
+  }
+
+  // This handles the case where a reader Dataset is contained within a
+  // FuncDataset (e.g. FlatMap, ParallelInterleave, etc...). For example:
+  //
+  // dataset = Dataset.list_files("/path/to/data")
+  // dataset = dataset.flat_map(core_readers.TFRecordDataset)
+  //
+  // where the list of files is passed in one-by-one as an argument to the
+  // function in flat_map.
+  if (IsDatasetNodeOfType(node, kFuncDatasetOps) &&
+      ReaderOpInFunction(node, *flib)) {
+    TF_RETURN_IF_ERROR(AddShardNode(graph, node, num_workers, index));
+    TF_RETURN_IF_ERROR(RemoveShuffleDataset(graph, node, nodes_to_delete));
+    return Status::OK();
+  }
+
+  if (IsDatasetNodeOfType(node, kReaderDatasetOps)) {
+    // We reached a reader dataset directly and we try to shard input 0.
+    TF_RETURN_IF_ERROR(AddShardNode(graph, node, num_workers, index));
+    TF_RETURN_IF_ERROR(RemoveShuffleDataset(graph, node, nodes_to_delete));
+    return Status::OK();
+  }
+
+  if (!IsDatasetNodeOfType(node, kPassThroughOps)) {
+    return errors::NotFound(
+        "Did not find a shardable source, walked to ",
+        "a node which is not a dataset: ", node.DebugString());
+  }
+
+  const NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
+  return RecursivelyHandleOp(*input_node, num_workers, index, flib, graph,
+                             nodes_to_delete);
+}
+
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, int64 index,
+                     GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+  FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
+
+  NodeDef target_node;
+  absl::flat_hash_set<string> nodes_to_delete;
+
+  // The basic approach here is to walk the graph from sink to source, and find
+  // the latest occurrence of a ReaderDataset (e.g. CSVDataset, TFRecordDataset,
+  // etc...). We then add a shard after that dataset to shard the outputs of
+  // that dataset, in effect giving a piece to each worker. Finally, we remove
+  // occurences from randomness from before that point in the graph (e.g. things
+  // like ShuffleDataset) to ensure that `shard` returns a sensible result.
+
+  NodeDef sink_node;
+  TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node));
+  TF_RETURN_IF_ERROR(RecursivelyHandleOp(sink_node, num_workers, index, &flib,
+                                         &graph, &nodes_to_delete));
+
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
+
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status AutoShard::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return errors::InvalidArgument("RewriterConfig not found.");
+
+  if ((config->parameter_map().find("num_workers") ==
+       config->parameter_map().end())) {
+    return errors::InvalidArgument("num_workers parameter missing.");
+  }
+
+  if ((config->parameter_map().find("index") ==
+       config->parameter_map().end())) {
+    return errors::InvalidArgument("index parameter missing.");
+  }
+
+  num_workers_ = config->parameter_map().at("num_workers").i();
+  index_ = config->parameter_map().at("index").i();
+
+  if (num_workers_ < 1) {
+    return errors::InvalidArgument("num_workers should be >= 1, currently ",
+                                   num_workers_);
+  }
+
+  if (index_ < 0 || index_ >= num_workers_) {
+    return errors::InvalidArgument("index should be >= 0 and < ", num_workers_,
+                                   ", currently ", index_);
+  }
+
+  return Status::OK();
+}
+
+Status AutoShard::OptimizeAndCollectStats(Cluster* /* cluster */,
+                                          const GrapplerItem& item,
+                                          GraphDef* output,
+                                          OptimizationStats* stats) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, index_, output));
+  stats->num_changes++;
+  return Status::OK();
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(AutoShard, "tf_auto_shard");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/auto_shard.h b/tensorflow/core/grappler/optimizers/data/auto_shard.h
new file mode 100644
index 0000000000000000000000000000000000000000..67692b9e8bc5e3dca8db0f0066c2cfe8fba3df15
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/auto_shard.h
@@ -0,0 +1,53 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_
+
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// AutoShard takes a Dataset graph and tries to insert a shard node
+// automatically before a ReaderDataset (e.g. a CSVDataset or a TFRecordDataset)
+// such that the dataset is sharded without any modifications to the original
+// dataset-based input pipeline.
+class AutoShard : public TFDataOptimizerBase {
+ public:
+  AutoShard() = default;
+  ~AutoShard() override = default;
+
+  string name() const override { return "tf_auto_shard"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override {}
+
+ private:
+  int64 num_workers_;
+  int64 index_;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_AUTO_SHARD_H_
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.cc b/tensorflow/core/grappler/optimizers/data/function_utils.cc
index 351209f92ea258d60a4f75e10ca3ec9015229a26..20536910db12607bcef9155d739251648696a0c7 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.cc
@@ -171,6 +171,57 @@ void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
   node->set_name(std::move(name));
 }
 
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def, bool skip_assert) {
+  if (!function_def.signature().is_stateful()) return false;
+
+  for (const NodeDef& node_def : function_def.node_def()) {
+    if (IsNodeStateful(library, node_def, skip_assert)) return true;
+  }
+  return false;
+}
+
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert) {
+  const OpDef* op_def;
+  Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+
+  if (!s.ok()) return true;
+
+  if (!op_def->is_stateful()) return false;
+
+  if (skip_assert && op_def->name() == "Assert") {
+    return false;
+  }
+
+  if (op_def->name() == "If") {
+    const FunctionDef* then_func =
+        library.Find(node.attr().at("then_branch").func().name());
+    const FunctionDef* else_func =
+        library.Find(node.attr().at("else_branch").func().name());
+    if ((then_func != nullptr &&
+         !IsFunctionStateful(library, *then_func, skip_assert)) &&
+        (else_func != nullptr &&
+         !IsFunctionStateful(library, *else_func, skip_assert))) {
+      return false;
+    }
+  }
+
+  if (op_def->name() == "While") {
+    const FunctionDef* cond_func =
+        library.Find(node.attr().at("cond").func().name());
+    const FunctionDef* body_func =
+        library.Find(node.attr().at("body").func().name());
+    if ((cond_func != nullptr &&
+         !IsFunctionStateful(library, *cond_func, skip_assert)) &&
+        (body_func != nullptr &&
+         !IsFunctionStateful(library, *body_func, skip_assert))) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace function_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.h b/tensorflow/core/grappler/optimizers/data/function_utils.h
index ec08c11dce2dc8ead9c0c469253dbed2e4d37a63..79271e8ad0c330318ed4538c46158967758e5747 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.h
@@ -101,8 +101,24 @@ int FindFunctionNodeWithOp(StringPiece op, const FunctionDef& function);
 void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
                                NodeDef* node);
 
-}  // namespace function_utils
-}  // namespace grappler
-}  // namespace tensorflow
+// Checks if the function is stateful by checking the function graph for
+// stateful ops. Because the "If" and "While" ops are conservatively marked as
+// stateful, the check recurses into their graph to determine whether they are
+// actually stateful. The `skip_assert` argument determines whether the "Assert"
+// op should be treated as stateful or not.
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def,
+                        bool skip_assert = false);
+
+// Checks if the node is stateful. Because the "If" or "While" ops are
+// conservatively marked as stateful, the check recurses into their graph to
+// determine whether they are actually stateful. The `skip_assert` argument
+// determines whether the "Assert" op  should be treated as stateful or not.
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert = false);
+
+}  // end namespace function_utils
+}  // end namespace grappler
+}  // end namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FUNCTION_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
index 3739e20eb1444fa24ec5553b8a133d8d96c5d714..8ae0cde4cd1ba20c8259ae9ac7e7a767f7b542e4 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -158,6 +160,692 @@ TEST(FunctionUtilsTest, AddNodeToFunctionDef) {
   }
 }
 
+// Graph containing function with "If" and "Assert" Op.
+/*
+  @eager_function.defun
+  def test_function():
+    pred = constant_op.constant(True)
+
+    def fn1():
+      return control_flow_ops.no_op()
+
+    def fn2():
+      return control_flow_ops.Assert(False, ["Wrong branch!!!"])
+
+    return control_flow_ops.cond(pred, fn1, fn2)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kCondGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_BOOL } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-20" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_19" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "cond_true_3"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+      }
+      node_def { name: "NoOp" op: "NoOp" }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^NoOp"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "cond_false_4"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Assert/Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/condition"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: false
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/data_0"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert"
+        op: "Assert"
+        input: "Assert/Assert/condition:output:0"
+        input: "Assert/Assert/data_0:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_STRING } }
+        }
+        attr {
+          key: "summarize"
+          value { i: 3 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_19"
+        output_arg { name: "identity" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: true
+            }
+          }
+        }
+      }
+      node_def {
+        name: "cond"
+        op: "If"
+        input: "Const:output:0"
+        input: "Const:output:0"
+        attr {
+          key: "Tcond"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "Tin"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "Tout"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "else_branch"
+          value { func { name: "cond_false_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value { list { shape {} } }
+        }
+        attr {
+          key: "then_branch"
+          value { func { name: "cond_true_3" } }
+        }
+      }
+      node_def {
+        name: "cond/Identity"
+        op: "Identity"
+        input: "cond:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "cond/Identity:output:0"
+        input: "^cond"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// Graph containing function with "While" Op in python.
+/*
+  @eager_function.defun
+  def test_function():
+    return control_flow_ops.while_loop(
+        lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kWhileGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_INT32 } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-35" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_34" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "while_body_5"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_INT32 }
+        output_arg { name: "identity_1" type: DT_INT32 }
+        output_arg { name: "identity_2" type: DT_INT32 }
+      }
+      node_def {
+        name: "add/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add"
+        op: "Add"
+        input: "const"
+        input: "add/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "add_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add_1"
+        op: "Add"
+        input: "while_loop_counter"
+        input: "add_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "add_1:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "add:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_2"
+        op: "Identity"
+        input: "maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+      ret { key: "identity_2" value: "Identity_2:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_34"
+        output_arg { name: "identity" type: DT_INT32 }
+        is_stateful: true
+      }
+      node_def {
+        name: "maximum_iterations"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/loop_counter"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while"
+        op: "While"
+        input: "while/loop_counter:output:0"
+        input: "Const:output:0"
+        input: "maximum_iterations:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_INT32 type: DT_INT32 type: DT_INT32 } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "body"
+          value { func { name: "while_body_5" } }
+        }
+        attr {
+          key: "cond"
+          value { func { name: "while_cond_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value {
+            list {
+              shape {}
+              shape {}
+              shape {}
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/Identity"
+        op: "Identity"
+        input: "while:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_1"
+        op: "Identity"
+        input: "while:output:1"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_2"
+        op: "Identity"
+        input: "while:output:2"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "while/Identity_1:output:0"
+        input: "^while"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+    function {
+      signature {
+        name: "while_cond_4"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "less_maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_BOOL }
+      }
+      node_def {
+        name: "Less"
+        op: "Less"
+        input: "while_loop_counter"
+        input: "less_maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Less_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 3
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Less_1"
+        op: "Less"
+        input: "const"
+        input: "Less_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "LogicalAnd"
+        op: "LogicalAnd"
+        input: "Less:z:0"
+        input: "Less_1:z:0"
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "LogicalAnd:z:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// TODO(shivaniagrawal): split the test into multiple tests for better
+// readability and add full coverage i.e. add/separate out the tests for all
+// branches of IsNodeStateful and IsFunctionStateful:
+// - test for IsNodeStateful for Cond that has a stateful branch
+// - test for IsNodeStateful for Cond that does not have a stateful branches
+// - test for IsNodeStateful for While that has a stateful branch
+// - test for IsNodeStateful for While that does not have a stateful branches
+// - test for IsNodeStateful for Assert
+// - test for IsNodeStateful for a stateful op
+// - test for IsNodeStateful for a stateless op
+//
+// - test for IsFunctionStateful for a function that contains a Cond
+// - test for IsFunctionStateful for a function that contains a While
+// - test for IsFunctionStateful for a function that contains an Assert (and no
+//   other stateful op)
+// - test for IsFunctionStateful for a function that contains a stateful op
+//   other than Assert
+// - test for IsFunctionStateful for a function that does not contain a stateful
+//   op
+
+TEST(FunctionUtilsTest, IsFunctionStateful) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* nodeA = graph_utils::AddNode("", "A", {}, {}, &graph);
+  FunctionDef* function = graph_def.mutable_library()->add_function();
+  *function = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition lib_def(OpRegistry::Global(),
+                                    *graph_def.mutable_library());
+
+  EXPECT_FALSE(IsFunctionStateful(lib_def, *function));
+
+  // Op "A" is not a registered Op.
+  EXPECT_TRUE(IsNodeStateful(lib_def, *nodeA));
+
+  // Get graph_def for the graph `kCondGraphProto`, graph with function
+  // containing "If" and "Assert" Op.
+
+  GraphDef graph_def_cond;
+  protobuf::TextFormat::ParseFromString(kCondGraphProto, &graph_def_cond);
+  FunctionLibraryDefinition cond_lib(OpRegistry::Global(),
+                                     graph_def_cond.library());
+
+  const FunctionDef* no_op_fnc = cond_lib.Find("cond_true_3");
+
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc, true));
+
+  const FunctionDef* assert_func = cond_lib.Find("cond_false_4");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *assert_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *assert_func, true));
+
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Const", *assert_func));
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Assert", *assert_func));
+
+  for (auto node : assert_func->node_def()) {
+    if (node.op() == "Const") {
+      EXPECT_FALSE(IsNodeStateful(lib_def, node));
+    }
+    if (node.op() == "Assert") {
+      EXPECT_TRUE(IsNodeStateful(lib_def, node));
+      EXPECT_FALSE(IsNodeStateful(lib_def, node, true));
+    }
+  }
+
+  const FunctionDef* cond_func = cond_lib.Find("__inference_test_function_19");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *cond_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *cond_func, true));
+
+  // Get graph def for the graph `kWhileGraphProto`, graph with function
+  // containing "While" Op.
+
+  GraphDef graph_def_while;
+  protobuf::TextFormat::ParseFromString(kWhileGraphProto, &graph_def_while);
+
+  FunctionLibraryDefinition while_lib(OpRegistry::Global(),
+                                      graph_def_while.library());
+  const FunctionDef* while_function =
+      while_lib.Find("__inference_test_function_34");
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function));
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function, true));
+}
 }  // namespace
 }  // namespace function_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index cbafb9dc8231509181ace4a1bb02ef5f2191728b..483b95897b5d0ef8aa0611f76db156ee644c1184 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -232,6 +232,13 @@ NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph) {
   return graph.GetRegularFanin(input_port).node;
 }
 
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph,
+                      int64 i) {
+  if (node.input_size() <= i) return nullptr;
+  MutableGraphView::InputPort input_port = graph.GetInputPort(node.name(), i);
+  return graph.GetRegularFanin(input_port).node;
+}
+
 void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph,
                             NodeDef* node) {
   string name = string(prefix);
@@ -293,6 +300,40 @@ Status EnsureNodeNamesUnique(Graph* g) {
 
   return Status::OK();
 }
+
+// Tries to find a Sink node in the graph. A sink node is defined as a node
+// that has at least one input and no outputs. If there are multiple of these,
+// this might return any one of them. This is useful to identify the final
+// Dataset op in the graph but in some cases there might be multiple Identity
+// ops added to the end and this would return the last Identity op in that case.
+
+Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node) {
+  absl::flat_hash_map<string, int> all_node_names;
+  absl::flat_hash_map<string, int> node_input_map;
+  for (int i = 0; i < graph_def.node_size(); ++i) {
+    all_node_names.insert_or_assign(graph_def.node(i).name(), i);
+    node_input_map.insert_or_assign(graph_def.node(i).name(), 0);
+  }
+  // Counts how many graph nodes for each input name. Candidate sink
+  // nodes are ones which are inputs into zero nodes.
+  for (const NodeDef& node : graph_def.node()) {
+    for (const string& input_name : node.input()) {
+      node_input_map[input_name]++;
+    }
+  }
+  for (const auto& it : node_input_map) {
+    if (it.second == 0) {
+      const NodeDef& sink_graph_node = graph_def.node(all_node_names[it.first]);
+      if (sink_graph_node.input_size() == 0) {
+        continue;
+      }
+      *sink_node = sink_graph_node;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument("Failed to find a sink node");
+}
+
 }  // namespace graph_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 8f2872c146ba6201436e83b94037bc529efba37c..0253b6d90b5c5b578761b5e47e5591f9560a1eb6 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -108,6 +108,10 @@ int FindGraphNodeWithOp(StringPiece op, const GraphDef& graph);
 // Gets the 0th input to a node in the graph.
 NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph);
 
+// Gets the ith input to a node in the graph.
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph,
+                      int64 i);
+
 // Returns the list of indices of all nodes with the given op or empty list if
 // no such node exists.
 std::vector<int> FindAllGraphNodesWithOp(const string& op,
@@ -140,6 +144,9 @@ void ConcatAttributeList(const string& attribute_name, const NodeDef& first,
 // and renaming nodes does not mutate any edges.
 Status EnsureNodeNamesUnique(Graph* g);
 
+// Returns the sink node (i.e. last node) in the graph.
+Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node);
+
 }  // namespace graph_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 3b6d223fd36b68cf187a7da2a00a47b0757c997b..8108c84fe4a0bf3b2b177fb743888c713204d468 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -228,6 +228,21 @@ TEST(GraphUtilsTest, GetInputNode) {
   EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
 }
 
+TEST(GraphUtilsTest, GetIthInputNode) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node1 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node3 = AddNode("", "A", {node1->name(), node2->name()}, {}, &graph);
+
+  EXPECT_EQ(GetInputNode(*node3, graph), node1);
+  EXPECT_EQ(GetInputNode(*node3, graph, 1), node2);
+  EXPECT_EQ(GetInputNode(*node3, graph, 0), node1);
+  EXPECT_EQ(GetInputNode(*node3, graph, 2), nullptr);
+  EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
+}
+
 TEST(GraphUtilsTest, EnsureNodeNamesUnique) {
   Graph g(OpRegistry::Global());
 
@@ -255,6 +270,40 @@ TEST(GraphUtilsTest, EnsureNodeNamesUnique) {
   EXPECT_NE(const_0->name(), const_2->name());
 }
 
+TEST(GraphUtilsTest, TestFindSinkNodeStandard) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  AddNode("node1", "Identity", {}, {}, &graph);
+  AddNode("node2", "Identity", {"node1"}, {}, &graph);
+  NodeDef* node3 = AddNode("node3", "Identity", {"node2"}, {}, &graph);
+
+  NodeDef sink_node;
+  TF_EXPECT_OK(FindSinkNode(graph_def, &sink_node));
+  EXPECT_EQ(sink_node.name(), node3->name());
+}
+
+TEST(GraphUtilsTest, TestFindSinkNodeNoSingleSink) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  AddNode("node1", "Identity", {}, {}, &graph);
+  AddNode("node2", "Identity", {}, {}, &graph);
+
+  NodeDef sink_node;
+  Status s = FindSinkNode(graph_def, &sink_node);
+  EXPECT_FALSE(s.ok());
+}
+
+TEST(GraphUtilsTest, TestFindSinkNodeGraphDefEmpty) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef sink_node;
+  Status s = FindSinkNode(graph_def, &sink_node);
+  EXPECT_FALSE(s.ok());
+}
+
 }  // namespace
 }  // namespace graph_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 496277780f166ab85e967bc71119b59b649dd86d..e29b620140236aa8852d7bd36799b99ce62c1f0d 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -174,7 +174,7 @@ const FunctionDef* MakeLessStatefulFunction(const FunctionDef& map_function,
   return stateless_function;
 }
 // This function returns true if function is stateful and has single
-// RandomUniform op and no other stateful ops except Assert.
+// RandomUniform op and no other stateful ops except Assert and If/While.
 // `is_stateful_after_hoisting` is set to true if RandomUniform is the only
 // stateful op and hoisting can be performed.
 bool CanHoistRandomUniform(const FunctionDef& map_function,
@@ -189,10 +189,10 @@ bool CanHoistRandomUniform(const FunctionDef& map_function,
   for (const auto& node : map_function.node_def()) {
     const OpDef* op_def;
     TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Skip stateless nodes and assert, as it does not actually have a state.
     if (!op_def->is_stateful()) continue;
 
-    if (op_def->name() == "Assert") {
+    if (!function_utils::IsNodeStateful(library, node, true)) {
+      // Skip ops that are marked stateful but are in fact not stateful.
       have_other_stateful_ops = true;
       continue;
     }
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 4529b89bd4aa2d6ab47884bba13922fa20c568bc..9bff0685ba061fb090309b4179fcb9f4419ddb8b 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -37,8 +37,8 @@ constexpr char kInsertOpName[] = "ExperimentalLatencyStatsDataset";
 NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
   NodeDef new_node;
   new_node.set_op(kInsertOpName);
-  graph_utils::SetUniqueGraphNodeName(
-      strings::StrCat(kInsertOpName, "_generated"), graph->graph(), &new_node);
+  graph_utils::SetUniqueGraphNodeName(strings::StrCat(kInsertOpName),
+                                      graph->graph(), &new_node);
   // Set the input of LatencyDataset node as `node`
   new_node.add_input(node.name());
 
@@ -75,8 +75,7 @@ Status LatencyAllEdges::OptimizeAndCollectStats(Cluster* cluster,
   // TODO(shivaniagrawal): Add Op to return Latency for the particular Op than
   // for the edge (e2 - e1?).
   for (const NodeDef& node : item.graph.node()) {
-    if (!str_util::EndsWith(node.op(), "Dataset") || node.attr().empty() ||
-        str_util::EndsWith(node.name(), "_generated")) {
+    if (!str_util::EndsWith(node.op(), "Dataset") || node.attr().empty()) {
       // TODO(b/111805951): Replace this with non-approximate way to check if
       // node corresponds to a `Dataset` op.
       continue;
@@ -87,15 +86,8 @@ Status LatencyAllEdges::OptimizeAndCollectStats(Cluster* cluster,
     if (fanout.size() > 1) {
       LOG(WARNING) << node.name() << " has fanout size " << fanout.size();
       continue;
-    } else {  // fanout will have size 0 for last dataset node in the pipeline.
-      if (fanout.size() == 1) {
-        NodeDef* output_node = (*(fanout.begin())).node;
-        if (str_util::EndsWith(output_node->name(), "_generated")) {
-          continue;
-        }
-      }
     }
-
+    // fanout will have size 0 for last dataset node in the pipeline.
     NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
     TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), latency_node->name()));
     stats->num_changes++;
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
index 57e541cde63cab5e087baa344550314bcfffe8e0..90dd885c7fc75954e4207876ac154bec0e9d3093 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 
@@ -33,23 +34,6 @@ constexpr char kMapDataset[] = "MapDataset";
 constexpr char kParallelMapDataset[] = "ParallelMapDataset";
 constexpr int kAutotune = -1;
 
-bool CanParallelize(const FunctionDef& function,
-                    const FunctionLibraryDefinition& library) {
-  if (!function.signature().is_stateful()) return true;
-
-  for (const auto& node : function.node_def()) {
-    const OpDef* op_def;
-    TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Assert is marked as stateful, but it does not have any state (except
-    // changing io).  Similarly to CUDA, we do not give guarantee that the
-    // assert operation that would fail would be the first one, so that we can
-    // parallelize it.
-    if (op_def->is_stateful() && op_def->name() != "Assert") return false;
-  }
-
-  return true;
-}
-
 NodeDef MakeParallelMap(const string& name, MutableGraphView* graph) {
   // The inputs of the node to be parallelized could be changed by the
   // optimization pass, so we need to look it up in the modified graph.
@@ -88,7 +72,8 @@ Status MapParallelization::OptimizeAndCollectStats(Cluster* cluster,
 
     auto* function =
         function_library.Find(map_node->attr().at("f").func().name());
-    if (!CanParallelize(*function, function_library)) continue;
+    if (function_utils::IsFunctionStateful(function_library, *function, true))
+      continue;
 
     auto* parallel_map =
         graph.AddNode(MakeParallelMap(map_node->name(), &graph));
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 4c9e1f31e7c2f23c1b2ac9ff941c4b617aa683a1..983b0436338f433bda19595b07160914c03bffe6 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -36,6 +38,22 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kCastOp[] = "Cast";
+constexpr char kRealDivOp[] = "RealDiv";
+constexpr char kSubOp[] = "Sub";
+constexpr char kMulOp[] = "Mul";
+constexpr char kAddOp[] = "Add";
+constexpr char kEqualOp[] = "Equal";
+constexpr char kCeilOp[] = "Ceil";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr char kChooseFastestOp[] = "ChooseFastestBranchDataset";
+constexpr char kPrefetchOp[] = "PrefetchDataset";
+constexpr int kAutotune = -1;
+
 // Returns a FunctionDef containing a MapDefun op that wraps the original
 // function.
 FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
@@ -101,7 +119,6 @@ FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
   const NodeDef& map_defun_node = vectorized_func->node_def(0);
   DCHECK_EQ(map_defun_node.op(), "MapDefun");
 
-  // TODO(b/116285210): Unreferenced functions should get cleaned up later
   FunctionDef* result;
   Status s = vectorization_utils::VectorizeMapDefun(
       *vectorized_func, map_defun_node, library, &result);
@@ -121,6 +138,7 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   const auto& shapes = shapes_attr->list().shape();
 
   for (const TensorShapeProto& shape : shapes) {
+    if (shape.unknown_rank()) return false;
     for (const auto& dim : shape.dim()) {
       if (dim.size() == -1) {
         return false;
@@ -130,34 +148,68 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   return true;
 }
 
-bool IsStatefulFn(const FunctionLibraryDefinition& library,
-                  const FunctionDef& function_def) {
-  for (const NodeDef& node_def : function_def.node_def()) {
-    const OpDef* op_def;
-    Status s = library.LookUpOpDef(node_def.op(), &op_def);
-    if (!s.ok() || op_def->is_stateful()) {
-      return true;
-    }
+// Returns a mapping from input names to the [start, end) indices of the input
+// in the node's input list.
+Status GetInputMap(const NodeDef& node, NameRangeMap* result) {
+  const OpRegistrationData* op_reg_data;  // Owned by global op registry
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUp(node.op(), &op_reg_data));
+
+  return NameRangesForNode(node, op_reg_data->op_def, result,
+                           /*outputs=*/nullptr);
+}
+
+Status CopyInputs(StringPiece input_name, const NameRangeMap& input_map,
+                  const NodeDef& from, NodeDef* to) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to copy inputs: did not find inputs with name: ", input_name,
+        ", in node with name: ", from.name());
+  }
+  for (int i = range->first; i < range->second; ++i) {
+    to->add_input(from.input(i));
+  }
+
+  return Status::OK();
+}
+
+Status GetInputNodeName(StringPiece input_name, const NameRangeMap& input_map,
+                        const NodeDef& node, string* result) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to get input node name: did not find input with name: ",
+        input_name, ", in node with name: ", node.name());
+  }
+  if (range->second - range->first > 1) {
+    return errors::Internal("Tried to get single input name for a list input.");
   }
-  return false;
+  *result = node.input(range->first);
+  return Status::OK();
 }
 
-NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
-                         const NodeDef& input_node,
-                         const FunctionDef& vectorized_func,
-                         MutableGraphView* graph) {
+Status AddNewBatchNode(const NodeDef& old_batch_node, const NodeDef& input_node,
+                       const FunctionDef& vectorized_func,
+                       MutableGraphView* graph, NodeDef** new_batch_node) {
   NodeDef batch_node;
-  batch_node.set_op(old_batch_node.op());
+  batch_node.set_op(old_batch_node.op() == kBatchOp ? kBatchOp : kBatchV2Op);
   graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->graph(),
                                       &batch_node);
 
   // Set the `input_dataset` input argument
   batch_node.add_input(input_node.name());
-  // Set the `batch_size` input_argument
-  batch_node.add_input(old_batch_node.input(1));
-  if (batch_node.op() == "BatchDatasetV2") {
-    // Set the `drop_remainder` input argument
-    batch_node.add_input(old_batch_node.input(2));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_batch_node, &input_map));
+
+  // Set the `batch_size` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("batch_size", input_map, old_batch_node, &batch_node));
+
+  // Set the `drop_remainder` input argument
+  if (batch_node.op() != kBatchOp) {
+    TF_RETURN_IF_ERROR(
+        CopyInputs("drop_remainder", input_map, old_batch_node, &batch_node));
   }
 
   // Set attrs
@@ -167,34 +219,61 @@ NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
   }
   (*batch_node.mutable_attr())["output_types"] = output_types;
 
+  // It is safe to assume that input_node has the "output_shapes" attr here,
+  // because earlier we checked that the input node has fully defined output
+  // shapes.
   auto& output_shapes_attr = (*batch_node.mutable_attr())["output_shapes"];
   const auto& input_shapes =
       input_node.attr().at("output_shapes").list().shape();
-  int64 batch_size =
-      old_batch_node.attr().at("output_shapes").list().shape()[0].dim(0).size();
+
+  int64 batch_size = -1;
+  for (const auto& shape :
+       old_batch_node.attr().at("output_shapes").list().shape()) {
+    if (!shape.unknown_rank()) {
+      batch_size = shape.dim(0).size();
+      break;
+    }
+  }
+
   for (size_t i = 0; i < input_shapes.size(); ++i) {
+    // Note: We already checked earlier that input shapes are all fully defined.
     TensorShapeProto* shape = output_shapes_attr.mutable_list()->add_shape();
     TensorShapeProto_Dim* dim = shape->add_dim();
     dim->set_size(batch_size);
     shape->MergeFrom(input_shapes.Get(i));
   }
-  return batch_node;
+
+  *new_batch_node = graph->AddNode(std::move(batch_node));
+  return Status::OK();
 }
 
-NodeDef MakeNewMapNode(const NodeDef& old_map_node,
-                       const NodeDef& old_batch_node,
-                       const NodeDef& new_batch_node,
-                       const FunctionDef& vectorized_func,
-                       MutableGraphView* graph) {
+Status AddNewMapNode(const NodeDef& old_map_node, const NodeDef& old_batch_node,
+                     const NodeDef& new_batch_node,
+                     const FunctionDef& vectorized_func,
+                     MutableGraphView* graph, NodeDef** new_map_node) {
   NodeDef map_node;
-  map_node.set_op(old_map_node.op());
+  map_node.set_op(old_map_node.op() == kMapOp ? kMapOp : kParallelMapOp);
   graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->graph(), &map_node);
 
   // Set the `input_dataset` input argument
   map_node.add_input(new_batch_node.name());
-  for (int i = 1; i < old_map_node.input_size(); i++) {
-    // Set the `other_arguments` and `num_parallel_calls` input arguments
-    map_node.add_input(old_map_node.input(i));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_map_node, &input_map));
+
+  // Set the `other_arguments` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("other_arguments", input_map, old_map_node, &map_node));
+
+  // Set the `num_parallel_calls` input argument
+  if (old_map_node.op() != kMapOp) {
+    // `num_parallel_calls` = kAutotune
+    // TODO(rachelim): Evaluate the performance of other potential
+    // transformations to `num_parallel_calls`,
+    // e.g. ceil(old num_parallel_calls // batch size)
+    auto autotune_val =
+        graph_utils::AddScalarConstNode(static_cast<int32>(kAutotune), graph);
+    map_node.add_input(autotune_val->name());
   }
 
   // Set attrs
@@ -207,8 +286,223 @@ NodeDef MakeNewMapNode(const NodeDef& old_map_node,
   }
 
   (*map_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
+  *new_map_node = graph->AddNode(std::move(map_node));
+  return Status::OK();
+}
 
-  return map_node;
+Status AddNewPrefetchNode(const NodeDef& old_prefetch_node,
+                          const NodeDef& old_batch_node,
+                          const NodeDef& new_map_node, MutableGraphView* graph,
+                          NodeDef** new_prefetch_node) {
+  NodeDef prefetch_node;
+  prefetch_node.set_op(kPrefetchOp);
+  graph_utils::SetUniqueGraphNodeName(kPrefetchOp, graph->graph(),
+                                      &prefetch_node);
+
+  // `input_dataset`
+  prefetch_node.add_input(new_map_node.name());
+
+  // `buffer_size` = kAutotune
+  // TODO(rachelim): Evaluate the performance of other potential transformations
+  // to `buffer_size`, e.g. ceil(old buffer size // batch size)
+  auto autotune_val =
+      graph_utils::AddScalarConstNode(static_cast<int64>(kAutotune), graph);
+  prefetch_node.add_input(autotune_val->name());
+
+  for (const auto& key : {"output_shapes", "output_types"}) {
+    graph_utils::CopyAttribute(key, new_map_node, &prefetch_node);
+  }
+
+  *new_prefetch_node = graph->AddNode(std::move(prefetch_node));
+  return Status::OK();
+}
+
+Status AddBranch(gtl::ArraySlice<const NodeDef*> branch,
+                 NodeDef* choose_fastest_node, DataTypeVector* t_arguments,
+                 std::vector<NameAttrList>* branches,
+                 std::vector<int>* other_arguments_lengths,
+                 FunctionDefLibrary* library) {
+  FunctionDef* branch_func = library->add_function();
+  auto* signature = branch_func->mutable_signature();
+  graph_utils::SetUniqueGraphFunctionName("branch", library, branch_func);
+
+  // Input dataset.
+  string prev_node_output = "args_0";
+  auto* input_arg_0 = signature->add_input_arg();
+  input_arg_0->set_name(prev_node_output);
+  input_arg_0->set_type(DT_VARIANT);
+
+  auto* output_arg = signature->add_output_arg();
+  output_arg->set_name("output");
+  output_arg->set_type(DT_VARIANT);
+
+  int32 captured_arg_lengths = 0;
+
+  // For each node in the branch, copy it to the function def. Add the
+  // corresponding non-0th inputs as captured arguments, modifying the function
+  // input signature, node input names, other_arguments_lengths, and t_arguments
+  // accordingly.
+  for (const NodeDef* node : branch) {
+    // Copy the node to the function
+    auto function_node = branch_func->add_node_def();
+    *function_node = *node;
+    function_utils::SetUniqueFunctionNodeName(node->name(), branch_func,
+                                              function_node);
+    function_node->clear_input();
+    function_node->add_input(prev_node_output);
+
+    // Every input besides the 0th (dataset) becomes a captured argument.
+    int input_size = node->input_size();
+    DataTypeVector input_types;
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node->op(), &op_def));
+    TF_RETURN_IF_ERROR(InputTypesForNode(*node, *op_def, &input_types));
+    DCHECK_EQ(input_types.size(), input_size);
+
+    for (int i = 1; i < input_size; ++i) {
+      // Capture input in `other_arguments`
+      choose_fastest_node->add_input(node->input(i));
+      // Add type to function signature
+      auto* input_arg = signature->add_input_arg();
+
+      string input_arg_name = strings::StrCat(function_node->name(), "_", i);
+      input_arg->set_name(input_arg_name);
+      input_arg->set_type(input_types[i]);
+      function_node->add_input(input_arg_name);
+    }
+    // Add to `Targuments`
+    t_arguments->reserve(t_arguments->size() + input_types.size() - 1);
+    t_arguments->insert(t_arguments->end(), input_types.begin() + 1,
+                        input_types.end());
+    captured_arg_lengths += input_size - 1;
+    prev_node_output = strings::StrCat(function_node->name(), ":handle:0");
+  }
+
+  // Add to `other_arguments_lengths`
+  other_arguments_lengths->push_back(captured_arg_lengths);
+  (*branch_func->mutable_ret())["output"] = prev_node_output;
+
+  // Add to `branches`
+  NameAttrList func_attr;
+  func_attr.set_name(branch_func->signature().name());
+  branches->push_back(std::move(func_attr));
+  return Status::OK();
+}
+
+Status AddNewChooseFastestNode(const NodeDef* input_dataset_node,
+                               const string& ratio_numerator_name,
+                               std::vector<const NodeDef*> original_branch,
+                               std::vector<const NodeDef*> vectorized_branch,
+                               MutableGraphView* graph,
+                               FunctionDefLibrary* library,
+                               NodeDef** new_choose_fastest_node) {
+  NodeDef choose_fastest_node;
+  choose_fastest_node.set_op(kChooseFastestOp);
+  graph_utils::SetUniqueGraphNodeName(choose_fastest_node.op(), graph->graph(),
+                                      &choose_fastest_node);
+
+  // input_dataset
+  choose_fastest_node.add_input(input_dataset_node->name());
+  choose_fastest_node.add_input(ratio_numerator_name);
+  // ratio_denominator == 1
+  auto ratio_denominator =
+      graph_utils::AddScalarConstNode(static_cast<int64>(1), graph);
+  choose_fastest_node.add_input(ratio_denominator->name());
+
+  DataTypeVector t_arguments;
+  std::vector<NameAttrList> branches;
+  std::vector<int32> other_arguments_lengths;
+  // Branch 0: vectorized branch
+  TF_RETURN_IF_ERROR(AddBranch(vectorized_branch, &choose_fastest_node,
+                               &t_arguments, &branches,
+                               &other_arguments_lengths, library));
+  // Branch 1: original branch
+  TF_RETURN_IF_ERROR(AddBranch(original_branch, &choose_fastest_node,
+                               &t_arguments, &branches,
+                               &other_arguments_lengths, library));
+
+  DCHECK_EQ(t_arguments.size(), choose_fastest_node.input_size() - 3);
+  DCHECK_EQ(branches.size(), other_arguments_lengths.size());
+
+  AddNodeAttr("Targuments", t_arguments, &choose_fastest_node);
+  AddNodeAttr("num_elements_per_branch", 10, &choose_fastest_node);
+  AddNodeAttr("branches", branches, &choose_fastest_node);
+  AddNodeAttr("other_arguments_lengths", other_arguments_lengths,
+              &choose_fastest_node);
+
+  for (auto key : {"output_shapes", "output_types"}) {
+    graph_utils::CopyAttribute(key,
+                               *vectorized_branch[vectorized_branch.size() - 1],
+                               &choose_fastest_node);
+  }
+
+  *new_choose_fastest_node = graph->AddNode(std::move(choose_fastest_node));
+  return Status::OK();
+}
+
+// Given an input pipeline graph and a query node, tries to match the node to
+// the 'batch' node in a input_dataset->map->(optional prefetch->)batch pattern,
+// or the 'map_and_batch' node in an input_dataset->map_and_batch pattern.
+bool FindMapAndBatchPattern(const MutableGraphView& graph, const NodeDef& node,
+                            const FunctionLibraryDefinition& function_library,
+                            const NodeDef** batch_node_output,
+                            const NodeDef** optional_prefetch_node_output,
+                            const NodeDef** map_node_output,
+                            const NodeDef** input_node_output,
+                            const FunctionDef** map_fn_output) {
+  const FunctionDef*& map_fn = *map_fn_output;
+  const NodeDef*& batch_node = *batch_node_output;
+  const NodeDef*& optional_prefetch_node = *optional_prefetch_node_output;
+  const NodeDef*& map_node = *map_node_output;
+  const NodeDef*& input_node = *input_node_output;
+
+  if (node.op() == kExperimentalMapAndBatchOp) {
+    batch_node = &node;
+    map_node = &node;
+  } else if (node.op() == kBatchOp || node.op() == kBatchV2Op) {
+    batch_node = &node;
+    auto tmp_input_node = graph_utils::GetInputNode(*batch_node, graph);
+    if (tmp_input_node->op() == kPrefetchOp) {
+      optional_prefetch_node = tmp_input_node;
+      tmp_input_node = graph_utils::GetInputNode(*tmp_input_node, graph);
+    }
+    if (tmp_input_node->op() != kMapOp &&
+        tmp_input_node->op() != kParallelMapOp) {
+      return false;
+    }
+    map_node = tmp_input_node;
+    if (!IsOutputShapesFullyDefined(*map_node)) {
+      // If any of the map func outputs have an unknown shape, don't
+      // optimize, so that batching errors surface as before.
+      VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+                 "dataset does not have fully defined output shapes.";
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  // Input to the map node
+  input_node = graph_utils::GetInputNode(*map_node, graph);
+  DCHECK_NE(input_node, nullptr);
+
+  if (!IsOutputShapesFullyDefined(*input_node)) {
+    // If any of the inputs have an unknown shape, don't optimize, since
+    // inputs might not be batchable.
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the input "
+               "dataset does not have fully defined output shapes.";
+    return false;
+  }
+
+  map_fn = function_library.Find(map_node->attr().at("f").func().name());
+
+  if (function_utils::IsFunctionStateful(function_library, *map_fn)) {
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+               "function is stateful.";
+    return false;
+  }
+
+  return true;
 }
 
 }  // namespace
@@ -221,61 +515,65 @@ Status MapVectorization::OptimizeAndCollectStats(Cluster* cluster,
   MutableGraphView graph(output);
   absl::flat_hash_set<string> nodes_to_delete;
 
-  for (const NodeDef& node : item.graph.node()) {
-    // Find Map->Batch nodes.
-    // TODO(rachelim): Optimize MapAndBatchDataset[V2] as well.
-    if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
-      continue;
-    }
-
-    const NodeDef& batch_node(node);
-    NodeDef* node2 = graph_utils::GetInputNode(batch_node, graph);
-    if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
-      continue;
-    }
-
-    // Use a more descriptive variable name now that we know the node type.
-    NodeDef* map_node = node2;
-    // Input to the map node
-    NodeDef* input_node = graph_utils::GetInputNode(*map_node, graph);
-    CHECK_NOTNULL(input_node);
-
-    FunctionDefLibrary* library = output->mutable_library();
+  FunctionDefLibrary* library = output->mutable_library();
 
+  for (const NodeDef& node : item.graph.node()) {
     FunctionLibraryDefinition function_library(OpRegistry::Global(), *library);
-    const FunctionDef* orig_func =
-        function_library.Find(map_node->attr().at("f").func().name());
-
-    // Check that this is a valid optimization.
-    if (!IsOutputShapesFullyDefined(*input_node) ||
-        !IsOutputShapesFullyDefined(*map_node) ||
-        IsStatefulFn(function_library, *orig_func)) {
-      // 1. If any of the inputs have an unknown shape, don't optimize, since
-      // inputs might not be batchable.
-      // 2. If any of the map func outputs have an unknown shape, don't
-      // optimize, so that batching errors surface as before.
-      // 3. If the function is stateful, don't vectorize it.
+    const NodeDef* map_node;
+    const NodeDef* optional_prefetch_node = nullptr;
+    const NodeDef* batch_node;
+    const NodeDef* input_node;
+    const FunctionDef* map_func;
+    if (!FindMapAndBatchPattern(graph, node, function_library, &batch_node,
+                                &optional_prefetch_node, &map_node, &input_node,
+                                &map_func)) {
       continue;
     }
 
     FunctionDef* vectorized_func =
-        AddVectorizedFunction(*map_node, *orig_func, library);
+        AddVectorizedFunction(*map_node, *map_func, library);
     CHECK_NOTNULL(vectorized_func);
 
-    auto* new_batch_node = graph.AddNode(
-        MakeNewBatchNode(batch_node, *input_node, *vectorized_func, &graph));
+    std::vector<const NodeDef*> vectorized_branch;
+    NodeDef* new_batch_node;
+    TF_RETURN_IF_ERROR(AddNewBatchNode(
+        *batch_node, *input_node, *vectorized_func, &graph, &new_batch_node));
+    vectorized_branch.push_back(new_batch_node);
+
+    NodeDef* new_map_node;
+    TF_RETURN_IF_ERROR(AddNewMapNode(*map_node, *batch_node, *new_batch_node,
+                                     *vectorized_func, &graph, &new_map_node));
+    vectorized_branch.push_back(new_map_node);
+
+    if (optional_prefetch_node) {
+      // If the original pipeline was .map().prefetch().batch(), the new
+      // pipeline is .batch().map().prefetch()
+      NodeDef* new_prefetch_node;
+      TF_RETURN_IF_ERROR(AddNewPrefetchNode(*optional_prefetch_node,
+                                            *batch_node, *new_map_node, &graph,
+                                            &new_prefetch_node));
+      vectorized_branch.push_back(new_prefetch_node);
+    }
 
-    auto* new_map_node = graph.AddNode(MakeNewMapNode(
-        *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
-    TF_RETURN_IF_ERROR(
-        graph.UpdateFanouts(batch_node.name(), new_map_node->name()));
+    std::vector<const NodeDef*> original_branch({map_node});
+    if (optional_prefetch_node) {
+      original_branch.push_back(optional_prefetch_node);
+    }
+    if (map_node->op() != kExperimentalMapAndBatchOp) {
+      original_branch.push_back(batch_node);
+    }
+
+    NodeDef* new_choose_fastest_node;
+    TF_RETURN_IF_ERROR(AddNewChooseFastestNode(
+        input_node, /*ratio_numerator_name=*/new_batch_node->input(1),
+        std::move(original_branch), std::move(vectorized_branch), &graph,
+        library, &new_choose_fastest_node));
 
-    // Mark the `Map` and `Batch` nodes for removal.
-    nodes_to_delete.insert(map_node->name());
-    nodes_to_delete.insert(batch_node.name());
+    // Make output of Batch point to ChooseFastest instead.
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(batch_node->name(),
+                                           new_choose_fastest_node->name()));
     stats->num_changes++;
   }
-  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.h b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
index cde6ef6839b93ea041f709469c6e659850b6c746..88ec9cfec627637b305a41c87ffda9a8e0b8955a 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.h
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
@@ -21,6 +21,23 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+// This optimizer rewrites dataset.map(map_fn, ...).batch(...) and
+// dataset.apply(tf.data.experimental.map_and_batch(map_fn, ...)) patterns in an
+// input pipeline. It vectorizes the map_fn, such that this segment can be
+// rewritten as dataset.batch().map(vectorized_map_fn). This is more performant
+// when the map_fn is cheap, because it amortizes the cost of running a map
+// function over a larger batch.
+//
+// From:
+//      input --> map --> batch --> output
+//              (or map_and_batch)
+//
+// To:
+//      input --> map --> batch --------+
+//        |     (or map_and_batch)      |
+//        |                             v
+//        +-----> batch --> map --> choose_fastest --> output
+//
 class MapVectorization : public TFDataOptimizerBase {
  public:
   MapVectorization() = default;
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
index f4faf415496f306cb9ced961c1a8c12e11cb167c..884bc17d98f3136193f3064a3b1e87eeb71fa383 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
@@ -17,195 +17,493 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-using test::function::GDef;
+constexpr char kConstOp[] = "Const";
+constexpr char kRangeOp[] = "RangeDataset";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr char kChooseFastestOp[] = "ChooseFastestBranchDataset";
+constexpr char kPrefetchOp[] = "PrefetchDataset";
+constexpr char kAttrNameF[] = "f";
+constexpr char kAttrNameTarguments[] = "Targuments";
+constexpr char kAttrNameOutputTypes[] = "output_types";
+constexpr char kAttrNameOutputShapes[] = "output_shapes";
+constexpr char kAttrNameInterOpParallelism[] = "use_inter_op_parallelism";
+constexpr char kAttrNamePreserveCardinality[] = "preserve_cardinality";
+constexpr char kAttrNameSloppy[] = "sloppy";
+constexpr char kAttrNameValue[] = "value";
+constexpr char kAttrNameDtype[] = "dtype";
+
 using test::function::NDef;
 
-NodeDef MakeMapNodeHelper(StringPiece name, StringPiece input_node_name,
-                          StringPiece function_name, StringPiece map_op_name,
-                          gtl::ArraySlice<PartialTensorShape> output_shapes,
-                          gtl::ArraySlice<DataType> output_types) {
-  return test::function::NDef(
-      name, map_op_name, {string(input_node_name)},
-      {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
-       {"Targuments", {}},
-       {"output_shapes", output_shapes},
-       {"output_types", output_types}});
-}
-
-NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
-                    StringPiece function_name,
-                    gtl::ArraySlice<PartialTensorShape> output_shapes,
-                    gtl::ArraySlice<DataType> output_types) {
-  return MakeMapNodeHelper(name, input_node_name, function_name, "MapDataset",
-                           output_shapes, output_types);
-}
-
-NodeDef MakeBatchNode(StringPiece name, StringPiece input_node_name,
-                      StringPiece input_batch_size_name,
-                      gtl::ArraySlice<PartialTensorShape> output_shapes,
-                      gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDataset",
-      {string(input_node_name), string(input_batch_size_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeBatchV2Node(StringPiece name, StringPiece input_node_name,
-                        StringPiece input_batch_size_name,
-                        StringPiece input_drop_remainder_name,
-                        gtl::ArraySlice<PartialTensorShape> output_shapes,
-                        gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDatasetV2",
-      {string(input_node_name), string(input_batch_size_name),
-       string(input_drop_remainder_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeRangeNode(StringPiece name, gtl::ArraySlice<string> inputs) {
-  return NDef(name, "RangeDataset", inputs,
-              {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})},
-               {"output_types", gtl::ArraySlice<DataType>({DT_INT64})}});
-}
-
-TEST(MapVectorizationTest, VectorizeMapWithBatch) {
+// Adds a simple vectorizable map function that is akin to
+// dataset.map(lambda x: tf.identity(x))
+FunctionDef* AddMapFn(MutableGraphView* graph) {
+  FunctionDef* map_fn = graph->graph()->mutable_library()->add_function();
+  *map_fn = FunctionDefHelper::Create(
+      /*function_name=*/"map_fn",
+      /*in_def=*/{"x: int64"},
+      /*out_def=*/{"res: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{{{"node"}, "Identity", {"x"}, {{"T", DT_INT64}}}},
+      /*ret_def=*/{{"res", "node:output"}});
+
+  return map_fn;
+}
+
+NodeDef* AddMapNode(MutableGraphView* graph, const string& input_dataset,
+                    const string& map_fn, int num_parallel_calls = 0) {
+  NodeDef result;
+  if (num_parallel_calls) {
+    auto num_parallel_calls_node =
+        graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+    result =
+        NDef(/*name=*/"map", /*op=*/kParallelMapOp,
+             /*inputs=*/{input_dataset, num_parallel_calls_node->name()},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNameSloppy, true},
+              {kAttrNamePreserveCardinality, true}});
+  } else {
+    result =
+        NDef(/*name=*/"map", /*op=*/kMapOp,
+             /*inputs=*/{input_dataset},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNamePreserveCardinality, true}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddPrefetchNode(MutableGraphView* graph, const string& input_dataset,
+                         int64 buffer_size) {
+  auto buffer_size_node = graph_utils::AddScalarConstNode(buffer_size, graph);
+  NodeDef result =
+      NDef(/*name=*/"prefetch", /*op=*/kPrefetchOp,
+           /*inputs=*/{input_dataset, buffer_size_node->name()},
+           /*attrs=*/
+           {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+            {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})}});
+
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddBatchNode(MutableGraphView* graph, const string& input_dataset,
+                      bool v2 = false, int64 batch_size = 10) {
+  NodeDef result;
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+
+  if (v2) {
+    // BatchDatasetV2
+    auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+    result = NDef(
+        /*name=*/"batch", /*op=*/kBatchV2Op,
+        /*inputs=*/
+        {input_dataset, batch_size_node->name(), drop_remainder->name()},
+        /*attrs=*/
+        {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+         {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{10, 1}})}});
+  } else {
+    result =
+        NDef(/*name=*/"batch", /*op=*/kBatchOp,
+             /*inputs=*/{input_dataset, batch_size_node->name()},
+             /*attrs=*/
+             {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes,
+               gtl::ArraySlice<PartialTensorShape>({{v2 ? 10 : -1, 1}})}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddRangeNode(MutableGraphView* graph) {
+  auto start = graph_utils::AddScalarConstNode(static_cast<int64>(0), graph);
+  auto stop = graph_utils::AddScalarConstNode(static_cast<int64>(10), graph);
+  auto step = graph_utils::AddScalarConstNode(static_cast<int64>(1), graph);
+
+  NodeDef result =
+      NDef(/*name=*/"range", /*op=*/kRangeOp,
+           /*inputs=*/{start->name(), stop->name(), step->name()},
+           /*attrs=*/
+           {{kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+void CheckNotVectorized(const GraphDef& output, const string& map_op,
+                        const string& batch_op, const string& map_input_name) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(map_op, output).size(), 1);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(batch_op, output).size(), 1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp(map_op, output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp(batch_op, output));
+  EXPECT_EQ(map_node.input(0), map_input_name);
+  EXPECT_EQ(batch_node.input(0), map_node.name());
+}
+
+void CheckBranch(const FunctionDef& function, gtl::ArraySlice<string> ops) {
+  for (int i = 0, size = ops.size(); i < size; ++i) {
+    EXPECT_EQ(function.node_def(i).op(), ops[i]);
+  }
+}
+
+const FunctionDef* GetFunction(const GraphDef& graph,
+                               const string& function_name) {
+  int found =
+      graph_utils::FindGraphFunctionWithName(function_name, graph.library());
+  if (found == -1) {
+    return nullptr;
+  }
+  return &graph.library().function(found);
+}
+
+// Checks that a graph has undergone the map_vectorization transformation
+// successfully, whereby the new graph has the shape:
+//
+//    input_node -------------> choose_fastest --> ...
+//                               |f0    |f1
+//                               |      |
+//                               |      +---> new batch --> new map
+//                               |
+//                               +--> old map --> old batch
+//
+void CheckVectorized(const GraphDef& output,
+                     gtl::ArraySlice<string> expected_vectorized_branch,
+                     gtl::ArraySlice<string> expected_original_branch,
+                     const string& input_name) {
+  ASSERT_EQ(
+      graph_utils::FindAllGraphNodesWithOp(kChooseFastestOp, output).size(), 1);
+  const NodeDef& choose_fastest_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kChooseFastestOp, output));
+  ASSERT_EQ(choose_fastest_node.input(0), input_name);
+
+  const auto& functions_list = choose_fastest_node.attr().at("branches").list();
+
+  // Branch 0: vectorized
+  const FunctionDef* branch_0 =
+      GetFunction(output, functions_list.func(0).name());
+  ASSERT_NE(branch_0, nullptr);
+  CheckBranch(*branch_0, expected_vectorized_branch);
+
+  // Branch 1: original
+  const FunctionDef* branch_1 =
+      GetFunction(output, functions_list.func(1).name());
+  ASSERT_NE(branch_1, nullptr);
+  CheckBranch(*branch_1, expected_original_branch);
+
+  const NodeDef& vectorized_map_node =
+      branch_0->node_def(function_utils::FindFunctionNodeWithOp(
+          expected_vectorized_branch[1], *branch_0));
+  string function_name =
+      vectorized_map_node.attr().at(kAttrNameF).func().name();
+
+  const FunctionDef* function = GetFunction(output, function_name);
+  ASSERT_NE(function, nullptr);
+  EXPECT_EQ(function->node_def(0).op(), "Identity");
+}
+
+class MapThenBatchTest
+    : public ::testing::TestWithParam<std::tuple<int, bool, int>> {};
+
+TEST_P(MapThenBatchTest, IsVectorized) {
+  int num_parallel_calls = std::get<0>(GetParam());
+  bool use_batch_v2 = std::get<1>(GetParam());
+  int prefetch = std::get<2>(GetParam());
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_dataset = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto dataset = AddMapNode(&graph, range_dataset->name(),
+                            map_fn->signature().name(), num_parallel_calls);
+
+  if (prefetch) {
+    dataset = AddPrefetchNode(&graph, dataset->name(), prefetch);
+  }
+  dataset = AddBatchNode(&graph, dataset->name(), use_batch_v2);
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+  std::vector<string> expected_original_branch;
+  expected_original_branch.push_back(num_parallel_calls > 0 ? kParallelMapOp
+                                                            : kMapOp);
+  if (prefetch) {
+    expected_original_branch.push_back(kPrefetchOp);
+  }
+  expected_original_branch.push_back(use_batch_v2 > 0 ? kBatchV2Op : kBatchOp);
+
+  std::vector<string> expected_vectorized_branch;
+  expected_vectorized_branch.push_back(use_batch_v2 > 0 ? kBatchV2Op
+                                                        : kBatchOp);
+  expected_vectorized_branch.push_back(num_parallel_calls > 0 ? kParallelMapOp
+                                                              : kMapOp);
+  if (prefetch) {
+    expected_vectorized_branch.push_back(kPrefetchOp);
+  }
+
+  CheckVectorized(output, expected_vectorized_branch, expected_original_branch,
+                  range_dataset->name());
 }
 
-TEST(MapVectorizationTest, VectorizeMapWithBatchV2) {
+INSTANTIATE_TEST_SUITE_P(MapThenBatchTest, MapThenBatchTest,
+                         ::testing::Combine(::testing::Values(0, 12),
+                                            ::testing::Bool(),
+                                            ::testing::Values(0, 20)));
+
+NodeDef* AddMapAndBatchNode(MutableGraphView* graph,
+                            const string& input_dataset, const string& map_fn,
+                            int64 batch_size = 10,
+                            int64 num_parallel_calls = 12) {
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+  auto num_parallel_calls_node =
+      graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+  auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+
+  NodeDef result =
+      NDef(/*name=*/"map_and_batch",
+           /*op=*/kExperimentalMapAndBatchOp,
+           /*inputs=*/
+           {input_dataset, batch_size_node->name(),
+            num_parallel_calls_node->name(), drop_remainder->name()},
+           /*attrs=*/
+           {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+            {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+            {kAttrNameOutputShapes,
+             gtl::ArraySlice<PartialTensorShape>({{10, 1}})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+TEST(MapVectorizationTest, VectorizeExperimentalMapAndBatch) {
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("drop_remainder", "Const", {},
-            {{"value", false}, {"dtype", DT_BOOL}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchV2Node("batch", "map", "batch_size", "drop_remainder", {{-1}},
-                       {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_and_batch_node = AddMapAndBatchNode(&graph, range_node->name(),
+                                               map_fn->signature().name());
+  ASSERT_NE(map_and_batch_node, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(
-      graph_utils::FindAllGraphNodesWithOp("BatchDatasetV2", output).size(), 1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+  CheckVectorized(output, {kBatchV2Op, kParallelMapOp},
+                  {kExperimentalMapAndBatchOp}, range_node->name());
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShape) {
+class ChainedMapAndBatchTest
+    : public ::testing::TestWithParam<std::tuple<bool, bool>> {};
+
+// Tests:
+// 1) map.batch.map.batch
+// 2) map.batch.map_and_batch
+// 3) map_and_batch.map.batch
+// 4) map_and_batch.map_and_batch
+TEST_P(ChainedMapAndBatchTest, IsVectorized) {
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_types", gtl::ArraySlice<DataType>({DT_INT32})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto input_node = AddRangeNode(&graph);
+
+  auto map_fn = AddMapFn(&graph);
+
+  auto make_map_and_batch = [&graph, map_fn](NodeDef* input, bool fuse) {
+    if (fuse) {
+      return AddMapAndBatchNode(&graph, input->name(),
+                                map_fn->signature().name());
+    }
+    auto map_node =
+        AddMapNode(&graph, input->name(), map_fn->signature().name(), true);
+    auto batch_node = AddBatchNode(&graph, map_node->name(), true);
+    return batch_node;
+  };
+
+  bool fuse_0 = std::get<0>(GetParam());
+  bool fuse_1 = std::get<1>(GetParam());
+  auto map_and_batch_0 = make_map_and_batch(input_node, fuse_0);
+  auto map_and_batch_1 = make_map_and_batch(map_and_batch_0, fuse_1);
+  ASSERT_NE(map_and_batch_1, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  TF_ASSERT_OK(TopologicalSort(&output));
+
+  std::vector<int> choose_fastest_nodes =
+      graph_utils::FindAllGraphNodesWithOp(kChooseFastestOp, output);
+  ASSERT_EQ(choose_fastest_nodes.size(), 2);
+
+  std::vector<string> fused_sequence({kExperimentalMapAndBatchOp});
+  std::vector<string> unfused_sequence({kParallelMapOp, kBatchV2Op});
+  const NodeDef& range_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kRangeOp, output));
+  const NodeDef& choose_fastest_0 = output.node(choose_fastest_nodes[0]);
+  ASSERT_EQ(choose_fastest_0.input(0), range_node.name());
+  const NodeDef& choose_fastest_1 = output.node(choose_fastest_nodes[1]);
+  ASSERT_EQ(choose_fastest_1.input(0), choose_fastest_0.name());
+
+  auto check_branches = [&output](const NodeDef& choose_fastest_node,
+                                  gtl::ArraySlice<string> original_ops) {
+    const auto& functions_list =
+        choose_fastest_node.attr().at("branches").list();
+
+    // Branch 0: vectorized
+    const FunctionDef* branch_0 =
+        GetFunction(output, functions_list.func(0).name());
+    ASSERT_NE(branch_0, nullptr);
+    CheckBranch(*branch_0, {kBatchV2Op, kParallelMapOp});
+
+    // Branch 1: original
+    const FunctionDef* branch_1 =
+        GetFunction(output, functions_list.func(1).name());
+    ASSERT_NE(branch_1, nullptr);
+    CheckBranch(*branch_1, original_ops);
+  };
+
+  check_branches(choose_fastest_0, fuse_0 ? fused_sequence : unfused_sequence);
+  check_branches(choose_fastest_1, fuse_1 ? fused_sequence : unfused_sequence);
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+INSTANTIATE_TEST_SUITE_P(ChainedMapAndBatchTest, ChainedMapAndBatchTest,
+                         ::testing::Combine(::testing::Bool(),
+                                            ::testing::Bool()));
+
+// Not all dataset types have "output_shapes" and "output_types"
+// attrs defined. Add a generic input node which may not have these attrs
+// defined.
+NodeDef* AddArbitraryInputNode(MutableGraphView* graph,
+                               std::vector<PartialTensorShape>* output_shapes,
+                               std::vector<DataType>* output_types) {
+  std::vector<std::pair<string, FunctionDefHelper::AttrValueWrapper>> attrs;
+  if (output_shapes) {
+    attrs.push_back({kAttrNameOutputShapes, *output_shapes});
+  }
+  if (output_types) {
+    attrs.push_back({kAttrNameOutputTypes, *output_types});
+  }
+
+  NodeDef result = NDef(/*name=*/"input", /*op=*/"InputDataset",
+                        /*inputs=*/{},
+                        /*attrs=*/attrs);
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShapes) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // doesn't have an output_shapes attr defined. In this case, the map and
+  // batch swap does not occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, nullptr, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
 }
 
-TEST(MapVectorizationTest, VectorizeWithFullyDefinedFunction) {
+TEST(MapVectorizationTest, VectorizeWithUnknownRank) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown rank. In this case, the optimization does not
+  // occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "Func", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {FunctionDefHelper::Create(
-          "Func", {"x: int64", "y: int64"}, {"res: int64", "res2: int64"}, {},
-          {{{"o"}, "Mul", {"x", "x"}, {{"T", DT_INT64}}}},
-          {{"res", "o:z"}, {"res2", "o:z"}})});
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+TEST(MapVectorizationTest, VectorizeWithUnknownDim) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown dimensions. In this case, the optimization does
+  // not occur.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{-1, 2}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
 }
 
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+  // Tests that the optimization doesn't break when the input doesn't have
+  // an output_types attr defined. The output_types of the input node, even
+  // if not present, can be inferred from the map function input signature.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{1}});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, nullptr);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(
+      output, /*expected_vectorized_branch=*/{batch_node->op(), map_node->op()},
+      /*expected_original_branch=*/{map_node->op(), batch_node->op()},
+      input_node->name());
+}
+
+// TODO(rachelim): Add test that has a polymorphic function.
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b10e30ed1c90426212f79fdda18e27d833865c1a
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc
@@ -0,0 +1,280 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/rebatch.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status RebatchOptimizer::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return Status::OK();
+
+  num_workers_ = config->parameter_map().at("num_workers").i();
+  return Status::OK();
+}
+
+namespace {
+
+constexpr char kCastOp[] = "Cast";
+constexpr char kRealDivOp[] = "RealDiv";
+constexpr char kConstOp[] = "Const";
+
+constexpr std::array<const char*, 5> kBatchDatasetOps = {
+    "BatchDataset",
+    "BatchDatasetV2",
+    "ExperimentalMapAndBatchDataset",
+    "PaddedBatchDataset",
+    "PaddedBatchDatasetV2"
+};
+
+constexpr std::array<const char*, 2> kMultipleInputsDatasetOps = {
+    "ConcatenateDataset",
+    "ZipDataset"
+};
+
+constexpr std::array<const char*, 17> kPassThroughOps = {
+    "CacheDataset",
+    "FilterDataset",
+    "FilterByLastComponentDataset",
+    "Identity",
+    "MapDataset",
+    "ModelDataset",
+    "OptimizeDataset",
+    "ParallelMapDataset",
+    "PrefetchDataset",
+    "ReduceDataset",
+    "RepeatDataset",
+    "ShardDataset",
+    "ShuffleAndRepeatDataset",
+    "ShuffleDataset",
+    "SkipDataset",
+    "TakeDataset",
+    "WindowDataset"
+};
+
+constexpr std::array<const char*, 3> kFuncDatasetOps = {
+    "FlatMapDataset",
+    "InterleaveDataset",
+    "ParallelInterleaveDatasetV2"
+};
+
+constexpr std::array<const char*, 9> kSourceDatasetOps = {
+    "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2",
+    "GeneratorDataset",
+    "RangeDataset",
+    "SparseTensorsSliceDataset",
+    "TensorDataset",
+    "TensorSliceDataset",
+    "TextLineDataset",
+    "TFRecordDataset"
+};
+
+NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
+                     MutableGraphView* graph) {
+  NodeDef cast_node;
+  cast_node.set_op(kCastOp);
+  cast_node.add_input(input);
+  graph_utils::SetUniqueGraphNodeName(cast_node.op(), graph->graph(),
+                                      &cast_node);
+  AddNodeAttr("SrcT", src_t, &cast_node);
+  AddNodeAttr("DstT", dst_t, &cast_node);
+
+  return graph->AddNode(std::move(cast_node));
+}
+
+NodeDef* AddBinaryNode(const string& input_x, const string& input_y,
+                       const string& op, DataType type,
+                       MutableGraphView* graph) {
+  NodeDef node;
+  node.set_op(op);
+  node.add_input(input_x);
+  node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(op, graph->graph(), &node);
+  AddNodeAttr("T", type, &node);
+
+  return graph->AddNode(std::move(node));
+}
+
+NodeDef* AddFloatDivNode(const string& input_x, const string& input_y,
+                         MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kRealDivOp, DT_FLOAT, graph);
+}
+
+template <std::size_t SIZE>
+bool IsDatasetNodeOfType(const NodeDef& node,
+                         const std::array<const char*, SIZE>& arr) {
+  for (const auto& dataset_op_name : arr) {
+    if (node.op() == dataset_op_name) return true;
+  }
+  return false;
+}
+
+// Given a "batch" dataset node, modifies the batch_size input to divide the
+// current batch size by num_workers.
+Status MutateBatchSize(const NodeDef& node, int64 num_workers,
+                       MutableGraphView* graph) {
+  // TODO(rohanj): Fix up the output_shapes attribute as well. For this Dataset
+  // as well as all the downstream datasets.
+  // For all the batching datasets the batch_size is input number 1.
+  NodeDef* batch_size_node = graph_utils::GetInputNode(node, *graph, 1);
+  // By the time this optimization is run, the batch_size is computed and
+  // is a constant.
+  if (batch_size_node->op() != kConstOp) {
+    return errors::Internal("Batch size node should be a Const. Obtained: ",
+                            batch_size_node->op(), " instead.");
+  }
+  Tensor batch_size_tensor;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(*batch_size_node, "value", &batch_size_tensor));
+  if (!TensorShapeUtils::IsScalar(batch_size_tensor.shape())) {
+    return errors::Internal("Batch size node shape should be scalar");
+  }
+  int64 batch_size = batch_size_tensor.scalar<int64>()();
+  if (batch_size % num_workers != 0) {
+    return errors::InvalidArgument(
+        "Batch size: ", batch_size,
+        " is not divisible by num_workers: ", num_workers);
+  }
+  batch_size /= num_workers;
+  NodeDef* new_batch_size_node =
+      graph_utils::AddScalarConstNode<int64>(batch_size, graph);
+  // We don't call UpdateFanouts here because CSE elimination might lead to
+  // multiple nodes sharing the same batch size constant node. This is also
+  // why we don't delete batch_size_node as well.
+  TF_RETURN_IF_ERROR(graph->UpdateRegularFaninByPort(
+      node.name(), 1, {new_batch_size_node->name(), 0}));
+  return Status::OK();
+}
+
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output);
+
+// Helper function that starts from a node in the graph and recurses into its
+// inputs trying to find a BatchDataset type operation to modify. During the
+// recursion it handles four kinds of cases.
+// 1. BatchDataset type ops: Mutates the batch_size input node and stops.
+// 2. Zip / Concatenate dataset ops: Recurses into all inputs to these ops
+//      as they are datasets themselves.
+// 3. Core dataset ops + Identity op: Recurses into first input parameter.
+// 4. FlatMap type mapping dataset ops: Recurses into the function definition.
+Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
+                           FunctionLibraryDefinition* flib,
+                           MutableGraphView* graph) {
+  if (IsDatasetNodeOfType(node, kBatchDatasetOps)) {
+    return MutateBatchSize(node, num_workers, graph);
+  } else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
+    // For all multiple input datasets, all inputs are datasets themselves.
+    for (int i = 0; i < node.input_size(); ++i) {
+      NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i);
+      TF_RETURN_IF_ERROR(
+          RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+    }
+  } else if (IsDatasetNodeOfType(node, kPassThroughOps)) {
+    // For all the dataset ops that are pass through, the input dataset is
+    // input 0.
+    NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
+    TF_RETURN_IF_ERROR(
+        RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+  } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
+    const string func_name = node.attr().at("f").func().name();
+    const FunctionDef* fdef = flib->Find(func_name);
+    GrapplerFunctionItem f_item;
+    TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+        *fdef, *flib, graph->graph()->versions().producer(), &f_item));
+    GraphDef optimized_func_graph;
+    Status s = OptimizeGraph(f_item, num_workers, &optimized_func_graph);
+    if (s.ok()) {
+      // Function body optimization might have created new specialized
+      // functions for each instantiation context. Add them to the library.
+      for (const FunctionDef& func_def :
+           optimized_func_graph.library().function()) {
+        if (flib->Find(func_def.signature().name()) == nullptr) {
+          TF_RETURN_IF_ERROR(flib->AddFunctionDef(func_def));
+        }
+      }
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      f_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(f_item, *flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func));
+    }
+  } else if (IsDatasetNodeOfType(node, kSourceDatasetOps)) {
+    return errors::InvalidArgument(
+        "Reached a source dataset: ", node.op(),
+        " without encountering a batch transformation.");
+  } else {
+    return errors::InvalidArgument("Encountered an unsupported op: ",
+                                   node.op());
+  }
+  return Status::OK();
+}
+
+// Helper function that given a GrapplerItem generates a mutated graph def
+// with the batch size changed. The GrapplerItem could be generated from the
+// main graph or could be a function graph.
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
+
+  NodeDef sink_node;
+  TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node));
+  TF_RETURN_IF_ERROR(
+      RecursivelyHandleOp(sink_node, num_workers, &flib, &graph));
+  *output->mutable_library() = flib.ToProto();
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status RebatchOptimizer::OptimizeAndCollectStats(Cluster* cluster,
+                                                 const GrapplerItem& item,
+                                                 GraphDef* output,
+                                                 OptimizationStats* stats) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, output));
+  stats->num_changes++;
+  return Status::OK();
+}
+
+void RebatchOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                const GraphDef& optimize_output,
+                                double result) {}
+
+REGISTER_GRAPH_OPTIMIZER_AS(RebatchOptimizer, "tf_data_rebatcher");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.h b/tensorflow/core/grappler/optimizers/data/rebatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a610002645b9dd88d8a278f68094b2121697ac
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.h
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimizer changes the batch size of the output dataset by dividing the
+// current batch size by parameter `num_workers`. Currently, this works only
+// for very simple pipelines with a single BatchDatasetV2 transformation.
+class RebatchOptimizer : public TFDataOptimizerBase {
+ public:
+  RebatchOptimizer() = default;
+  ~RebatchOptimizer() override = default;
+
+  string name() const override { return "tf_data_rebatcher"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  int64 num_workers_;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index ff64ff1adbcd71d916a1bd6f842b9decc4a68d96..0563460b29505f1b054a57624d470c4e642bea2f 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
 
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 0eee91f241a8e3c09b93a159c93addb43e749b02..0f34d2b7ebe59244a9b02b5209732fc830cd6729 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
-#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index c57a7b125693af2b53d52e772bb4264bfbe00b23..1969ff00e4ae5147f183e1230986b4d2b4620fc7 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -415,6 +415,10 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
 // NodeBuilder
 Status Vectorization::StackTensor(WrappedTensor* unstacked,
                                   TensorDesc* result) {
+  if (unstacked->node->output_type(unstacked->output_index) == DT_VARIANT) {
+    // TODO(b/124069171): "ExpandDims" doesn't work with Variant tensors.
+    return errors::Unimplemented("Cannot stack tensor with Variant type.");
+  }
   // Note that all these nodes are necessary as the size of the batch may not be
   // constant.
   if (unstacked->stacked) {
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 8b81cb2430ca9a34926217312f2894cf283c1dd2..3a3bb2c9b6983a463db476f9fb4a742adf42046d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -75,16 +75,8 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
   // Recv.
   if (IsVariable(*input) || IsRecv(*input)) {
     return false;
-  } else if (IsSwitch(*input)) {
-    // Don't turn Identity nodes following Switch into NoOp or remove them
-    // if it requires anchoring a control dependencies the Switch node, which
-    // is not valid.
-    if (str_util::StartsWith(node.name(), kConstantFoldingCtrl)) {
-      // TODO(rmlarsen): Try to remove this artificial contraint.
-      return false;
-    }
   }
-  for (auto consumer : node_map_->GetOutputs(node.name())) {
+  for (const auto& consumer : node_map_->GetOutputs(node.name())) {
     if (node.input_size() > 1 && IsMerge(*consumer)) {
       return false;
     }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 7b032673fb3456a724d8021a5dcebc8b4c957ba8..99021b955f2f6000777ecb2915ff0f1d56c99562 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -30,8 +30,7 @@ namespace grappler {
 class DependencyOptimizer : public GraphOptimizer {
  public:
   DependencyOptimizer() {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -73,7 +72,6 @@ class DependencyOptimizer : public GraphOptimizer {
   // single control edge.
   void GroupCrossDeviceControlEdges();
 
-  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 497ad6032ea80b22e5b5e2b23b2860b7c99fc57b..9f6352f1f2efa4b299dff163858ad5b4c88b41b8 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -29,10 +29,10 @@ FunctionApiInfo::~FunctionApiInfo() {}
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
   function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
-    if (attr.first == "experimental_api_preferred_device") {
+    if (attr.first == "api_preferred_device") {
       preferred_device_ = attr.second.s();
     }
-    if (attr.first == "experimental_api_implements") {
+    if (attr.first == "api_implements") {
       interface_name_ = attr.second.s();
     }
     if (attr.first == "forward_function_name") {
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 9a5f548951f0931e98fbe4074f7bbd9aacab0c6e..ffa53a7d8d94e29a1e3b6e214a18903e98f47cda 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -80,6 +80,8 @@ class FunctionLibraryApiInfo {
       const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
+  bool empty() const { return func_info_.empty(); }
+  std::size_t size() const { return func_info_.size(); }
 
  private:
   // Map between function name to function details.
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index b683d26b32f04759b658e9e0704f1b6b661fe178..9bb517faa31f1e347810ed8884b6a2c16b26104b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -58,9 +58,9 @@ void PopulateFunction(const string& name, const string& api_interface_name,
 
   auto* func_attr = func_def->mutable_attr();
   if (!api_interface_name.empty())
-    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+    (*func_attr)["api_implements"].set_s(api_interface_name);
   if (!preferred_device.empty())
-    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+    (*func_attr)["api_preferred_device"].set_s(preferred_device);
   if (!forward_function_name.empty())
     (*func_attr)["forward_function_name"].set_s(forward_function_name);
   if (!backward_function_name.empty())
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index bd69405b1a0d4ab56981246ba35d43b9cf61cc4b..5b2f1e58088a810558bd52317b9af4153c4baf39 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/lower_if_while.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -45,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 
@@ -68,6 +71,16 @@ constexpr char kFuncAttrName[] = "f";
 
 constexpr char kNoInlineAttr[] = "_noinline";
 
+// Name of the node that will have control edges from function input nodes, and
+// also used as a new destination for incoming control edges.
+constexpr char kInputsReadyNodeName[] = "inputs_ready";
+
+// Name of the node that will have control edges from function control output
+// nodes, and also used as a new source of outgoing control edges. This node
+// will guarantee that all side-effects inside function body will be executed
+// after function inlining.
+constexpr char kSideEffectsExecutedNodeName[] = "side_effects_executed";
+
 bool AttrIsTrue(const FunctionDef& func, const string& attr) {
   return func.attr().count(attr) != 0 && func.attr().at(attr).b();
 }
@@ -251,28 +264,26 @@ struct FunctionSpecialization {
   std::vector<std::pair<int, int>> output_mapping;
 };
 
+// Function optimizer context initialized once for each optimization pass, and
+// it uses the latest available graph (for the first iteration it will be the
+// GrapplerItem.graph, for next iterations it will be the output of previous
+// function optimizer pass).
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
-                                    const GrapplerItem& item)
-      : grappler_item_id_(item.id),
-        graph_version_(item.graph.versions().producer()),
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level,
+                                    const GraphDef& graph)
+      : item_(&item),
         opt_level_(opt_level),
-        optimization_options_(item.optimization_options()),
-        function_library_(OpRegistry::Global(), item.graph.library()),
-        available_device_names_(item.devices().begin(), item.devices().end()),
-        graph_view_(&item.graph) {
-    InitializeTrulyConstNodes(item);
-    InitializeFetchNodes(item);
-  }
+        function_library_(OpRegistry::Global(), graph.library()),
+        truly_const_nodes_(InferTrulyConstNodes(item, graph)),
+        graph_view_(&graph) {}
 
-  int graph_version() const { return graph_version_; }
+  const GrapplerItem& item() const { return *item_; }
 
-  const RewriterConfig::Toggle opt_level() const { return opt_level_; }
+  const int graph_version() const { return item_->graph.versions().producer(); }
 
-  const GrapplerItem::OptimizationOptions& optimization_options() const {
-    return optimization_options_;
-  }
+  RewriterConfig::Toggle opt_level() const { return opt_level_; }
 
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
@@ -299,16 +310,10 @@ class FunctionOptimizerContext {
 
   const GraphView& graph_view() const { return graph_view_; }
 
-  const string& grappler_item_id() const { return grappler_item_id_; }
-
-  const absl::flat_hash_set<string>& fetch_tensors() const {
-    return fetch_tensors_;
-  }
-
   const DeviceSet* devices() const {
     // Create fake devices lazily only if we need a DeviceSet.
-    if (available_devices_.empty() && !available_device_names_.empty()) {
-      for (const string& name : available_device_names_) {
+    if (available_devices_.empty() && !item_->devices().empty()) {
+      for (const string& name : item_->devices()) {
         auto device = absl::make_unique<FakeDevice>(name);
         available_device_set_.AddDevice(device.get());
         available_devices_.push_back(std::move(device));
@@ -318,7 +323,15 @@ class FunctionOptimizerContext {
   }
 
   bool IsFetchNode(const string& node_name) const {
-    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
+    return absl::c_any_of(item_->fetch, [&](const string& fetch) {
+      return ParseTensorName(fetch).node() == node_name;
+    });
+  }
+
+  bool IsKeepOp(const string& node_name) const {
+    return absl::c_any_of(item_->keep_ops, [&](const string& keep_node) {
+      return keep_node == node_name;
+    });
   }
 
   bool IsTrulyConst(const string& name) const {
@@ -340,6 +353,11 @@ class FunctionOptimizerContext {
   }
 
   void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    DCHECK(from.index() != Graph::kControlSlot)
+        << "Tensor mapping must be from regular tensor";
+    DCHECK(to.index() != Graph::kControlSlot)
+        << "Tensor mapping must be to regular tensor";
+
     auto inserted = tensor_mapping_.insert({from, to});
     DCHECK(inserted.second)
         << "Failed to insert duplicated tensor mapping: "
@@ -354,14 +372,16 @@ class FunctionOptimizerContext {
       if (from_idx != to_idx) {
         SafeTensorId from_tensor(func_node, from_idx);
         SafeTensorId to_tensor(func_node, to_idx);
-        auto inserted = tensor_mapping_.insert({from_tensor, to_tensor});
-        DCHECK(inserted.second);
+        AddTensorMapping(from_tensor, to_tensor);
       }
     }
   }
 
   void AddControlOverrides(const NodeDef& func_node,
                            const std::vector<string>& control_overrides) {
+    VLOG(4) << "Add control overrides: from=" << func_node.name() << " to: ["
+            << absl::StrJoin(control_overrides, ", ") << "]";
+
     control_overrides_[func_node.name()].reserve(control_overrides.size());
     for (const string& control_override : control_overrides) {
       control_overrides_[func_node.name()].push_back(control_override);
@@ -369,24 +389,21 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeTrulyConstNodes(const GrapplerItem& item) {
-    absl::flat_hash_set<string> feed_nodes;
+  static absl::flat_hash_map<string, const NodeDef*> InferTrulyConstNodes(
+      const GrapplerItem& item, const GraphDef& graph) {
+    absl::flat_hash_set<absl::string_view> feed_nodes;
     for (const auto& feed : item.feed) {
-      feed_nodes.insert(NodeName(feed.first));
+      feed_nodes.insert(feed.first);
     }
 
-    for (const NodeDef& node : item.graph.node()) {
-      if (IsConstant(node) && feed_nodes.count(node.name()) == 0) {
-        truly_const_nodes_[node.name()] = &node;
+    absl::flat_hash_map<string, const NodeDef*> const_nodes;
+    for (const NodeDef& node : graph.node()) {
+      if (IsConstant(node) && !feed_nodes.contains(node.name())) {
+        const_nodes[node.name()] = &node;
       }
     }
-  }
 
-  void InitializeFetchNodes(const GrapplerItem& item) {
-    for (const string& fetch : item.fetch) {
-      fetch_tensors_.insert(fetch);
-      fetch_nodes_.insert(NodeName(fetch));
-    }
+    return const_nodes;
   }
 
   void InitializeFunctionLibraryRuntime() {
@@ -398,16 +415,16 @@ class FunctionOptimizerContext {
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
       process_flr_.reset(new ProcessFunctionLibraryRuntime(
-          device_mgr_.get(), env, graph_version_, &function_library_,
-          optimizer_opts));
+          device_mgr_.get(), env, item_->graph.versions().producer(),
+          &function_library_, optimizer_opts));
       flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
     }
   }
 
-  const string grappler_item_id_;
-  const int graph_version_;
-  const RewriterConfig::Toggle opt_level_;
-  const GrapplerItem::OptimizationOptions optimization_options_;
+  const GrapplerItem* item_;  // must outlive this object
+  RewriterConfig::Toggle opt_level_;
+
+  // Function library constructed from current graph.
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -415,14 +432,11 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
-  // Fully defined names of the devices available to the GrapplerItem.
-  const absl::flat_hash_set<string> available_device_names_;
-
   // List of available `FakedDevices` (lazily initialized, see devices()).
   mutable std::vector<std::unique_ptr<Device>> available_devices_;
 
   // DeviceSet of fake devices (`FakeDevice`) constructed from
-  // available_devices_ (lazily initialized).
+  // item_.devices() (lazily initialized).
   mutable DeviceSet available_device_set_;
 
   // Nodes that are Const and not in feed.
@@ -432,10 +446,6 @@ class FunctionOptimizerContext {
                       const FunctionSpecialization>
       specialized_functions_;
 
-  // GrapplerItem.fetch is a vector of tensors.
-  absl::flat_hash_set<string> fetch_tensors_;  // format: node_name:port
-  absl::flat_hash_set<string> fetch_nodes_;    // format: node_name
-
   // After function inlining and specialization, the optimized graph might be in
   // invalid state, nodes can read from non-existing function call nodes that
   // were inlined, or they can read from output index that is no longer valid
@@ -490,9 +500,11 @@ absl::flat_hash_set<int> GetActiveOutputs(const NodeDef& node,
   }
 
   // 2. Or it can be in a fetch set.
-  for (const string& fetch_tensor : ctx.fetch_tensors()) {
-    int port = NodePositionIfSameNode(fetch_tensor, node.name());
-    if (port >= 0) active_outputs.insert(port);
+  for (const string& fetch : ctx.item().fetch) {
+    TensorId fetch_tensor = ParseTensorName(fetch);
+    if (fetch_tensor.node() == node.name()) {
+      active_outputs.insert(fetch_tensor.index());
+    }
   }
 
   return active_outputs;
@@ -750,14 +762,12 @@ Status InitializeFunctionSpecializationSignature(
 string SpecializedFunctionName(const FunctionOptimizerContext& ctx,
                                const FunctionDef& func,
                                const NodeDef& func_node) {
-  return absl::Substitute("$0_specialized_for_$1_at_$2",
-                          func.signature().name(),
-                          absl::StrReplaceAll(func_node.name(), {{"/", "_"}}),
-                          ctx.grappler_item_id());
+  return absl::Substitute(
+      "$0_specialized_for_$1_at_$2", func.signature().name(),
+      absl::StrReplaceAll(func_node.name(), {{"/", "_"}}), ctx.item().id);
 }
 
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
-                          const int graph_def_version,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
   VLOG(2) << "Specialize function call: " << SummarizeNodeDef(func_node);
@@ -796,8 +806,8 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
   // pushing all constant inputs into the function body.
   GrapplerFunctionItem item;
-  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_instantiation_attr,
-                                              flib, graph_def_version, &item));
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+      func, func_instantiation_attr, flib, ctx->graph_version(), &item));
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
@@ -999,7 +1009,6 @@ NodeDef InlinedFunctionOutputsNode(
 
 Status InlineDirectFunctionCall(const NodeDef& func_node,
                                 const FunctionDef& func,
-                                const int graph_def_version,
                                 const FunctionOptimizerContext& ctx,
                                 GraphDef* optimized_graph) {
   VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
@@ -1011,7 +1020,7 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx.function_library(),
-                                                graph_def_version, &item);
+                                                ctx.graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1088,36 +1097,8 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Move the function body node to the optimized graph.
-    const auto move_node_to_optimized_graph = [&]() {
-      // Annotate the node with the function attributes.
-      for (const auto& attr : func.attr()) {
-        func_body_node.mutable_attr()->insert(attr);
-      }
-      // Move the node to the main graph.
-      optimized_graph->add_node()->Swap(&func_body_node);
-    };
-
-    // Check if a body node is itself a function call and can be inlined.
-    const FunctionDef* func_body_node_func =
-        FindFunctionCall(ctx, func_body_node);
-
-    if (func_body_node_func != nullptr) {
-      Status inlinable = IsInlinableDirectFunctionCall(
-          ctx, *func_body_node_func, func_body_node);
-      if (inlinable.ok()) {
-        TF_RETURN_IF_ERROR(
-            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
-                                     graph_def_version, ctx, optimized_graph));
-      } else {
-        VLOG(2) << "Can't inline nested direct function call: "
-                << inlinable.error_message();
-        move_node_to_optimized_graph();
-      }
-
-    } else {
-      move_node_to_optimized_graph();
-    }
+    // Move the node to the main graph.
+    optimized_graph->add_node()->Swap(&func_body_node);
   }
 
   DCHECK(output_tensors.size() == item.output_size())
@@ -1175,12 +1156,37 @@ Status InlineSymbolicGradient(const NodeDef& node,
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
 
-  // Recursively inline the functions until there is nothing more to inline. We
-  // should at least expand one function.
-  int counter = 0;
-  while (counter < 50 && ExpandInlineFunctions(
-                             ctx->mutable_function_library_runtime(), &graph)) {
-    ++counter;
+  FunctionLibraryRuntime* flr = ctx->mutable_function_library_runtime();
+
+  // 1. Inline symbolic gradient node.
+  const bool expanded = ExpandInlineFunctions(flr, &graph);
+  if (!expanded) {
+    return errors::Internal("Failed to expand SymbolicGradient op");
+  }
+
+  // TODO(ezhulenev): InlineFunctionBody in common_runtime/function silently
+  // fails to inline function into the graph, and leaves the graph unmodified.
+  // We check that graph has our symbolic gradient inlined, otherwise we return
+  // a error.
+  const auto is_symbolic_gradient_op = [&](const Node* node) {
+    return node->name() == inlined->name() &&
+           node->type_string() == "SymbolicGradient";
+  };
+  for (Node* node : graph.nodes()) {
+    if (is_symbolic_gradient_op(node)) {
+      return errors::Internal("Failed to inline symbolic gradient node: ",
+                              SummarizeNode(*node));
+    }
+  }
+
+  // 2. Recursively inline nested function calls.
+  int iteration = 0;
+  while (ExpandInlineFunctions(flr, &graph)) {
+    if (++iteration >= 50) {
+      VLOG(2) << "Break symbolic gradient inlining loop at iteration #"
+              << iteration;
+      break;
+    }
   }
 
   GraphDef inlined_graph_def;
@@ -1237,12 +1243,26 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // dependency tracking via input/output control edges, and we relax some of the
 // constraints that we have for direct function call inlining.
 //
-// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
-// input argument it "captures" the mutable resource.  This is implemented by
-// automatically adding a incoming control edge from the previous side-effectful
-// op touching that resource, and an outgoing control edge to the next
-// side-effectful op using the same resource. This serializes the mutations of
-// the resource to make graph execution deterministic.
+// Automatic control dependency rules:
+//
+// 1) "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data
+//    type) input argument it "captures" the mutable resource.  This is
+//    implemented by automatically adding a incoming control edge from the
+//    previous side-effectful op touching that resource, and an outgoing control
+//    edge to the next side-effectful op using the same resource. This
+//    serializes the mutations of the resource to make graph execution
+//    deterministic.
+//
+// 2) All stateful ops inside a function body are guaranteed to execute in
+//    program order, this is achieved by adding control edges between stateful
+//    ops at graph construction time.
+//
+// 3) Furthermore, all ops accepting the same resource as an input are
+//    guaranteed to run in program order. This is also done by adding control
+//    edges at graph construction time. The last op touching the resource
+//    will have an outgoing control edge to all function return nodes, which
+//    will guarantee that all side effects to the resource will happen before
+//    function completion.
 //
 // Function call inlining must preserve side effect visibility:
 //
@@ -1251,17 +1271,31 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // 2) All side effects to the captured resources, that happened inside function
 //    body, must be visible to every op/function using that resource after the
 //    function call completed.
-
-// To guarantee that these properties are preserved after inlining we do:
 //
-// 1) Forward all input control dependencies from the function call node to the
-//    inlined function inputs (Identity nodes).
-// 2) Each side-effectful op inside function body adds itself as a control
-//    dependency to all the nodes in output control set of function call node.
+// To guarantee that these properties are preserved after inlining we:
+//
+// 1) Create "input_control" NoOp. Function call node incoming control edges
+//    will be forwarded *to* this node. Function inputs (Identity nodes) will
+//    have a control edge *from* this node. If function has no inputs, by
+//    construction it must have nodes without inputs in the function body, and
+//    in this case these nodes will have a control edge *from* this node.
+
+// 2) Create "output_control" NoOp. All nodes that have incoming control edge
+//    *from* the function call node, will be forwarded to this node. Function
+//    outputs (Identity nodes) will have a control edge *to* this node. This
+//    will guarantee that nodes that have control dependency on the function
+//    call, will observe all side-effects (guaranteed by graph construction with
+//    automatic control dependencies tracking).
 //
-// We do not add any other control dependencies to/from function body nodes,
-// because they are pure functions of input tensors, and can be freely
-// reordered.
+// If after function instantiation we find a stateful or a dataset op inside
+// the function body, that is not reachable from any of the function outputs (or
+// if the function has no outputs), we do not inline it, because we can't
+// guarantee that these nodes will be executed in correct order (or executed at
+// all) after inlining.
+//
+// We do not try to add any extra control edges to make sure that all
+// side-effectful nodes will be executed, that should be handled at graph
+// construction time.
 
 struct MaybeDeadOutput {
   const NodeDef* dead_tensor_src;
@@ -1274,11 +1308,14 @@ struct MaybeDeadOutput {
 Status MaybeDeadOutputs(const FunctionOptimizerContext& ctx,
                         const GrapplerFunctionItem& item,
                         std::vector<MaybeDeadOutput>* maybe_dead) {
+  VLOG(3) << "Find function outputs that might return dead tensors: item.id="
+          << item.id;
   DCHECK(maybe_dead->empty()) << "Input argument must be an empty vector";
 
   std::vector<const NodeDef*> dead_tensor_srcs;
   for (const NodeDef& node : item.graph.node()) {
     if (IsSwitch(node)) {
+      VLOG(4) << "Add dead tensors source. Switch node: " << node.name();
       dead_tensor_srcs.push_back(&node);
       continue;
     }
@@ -1295,7 +1332,11 @@ Status MaybeDeadOutputs(const FunctionOptimizerContext& ctx,
       std::vector<MaybeDeadOutput> func_dead_outputs;
       TF_RETURN_IF_ERROR(MaybeDeadOutputs(ctx, func_item, &func_dead_outputs));
 
-      if (!func_dead_outputs.empty()) dead_tensor_srcs.push_back(&node);
+      if (!func_dead_outputs.empty()) {
+        VLOG(4) << "Add dead tensors source. Function call: " << node.op()
+                << " node=" << node.name();
+        dead_tensor_srcs.push_back(&node);
+      }
     }
   }
 
@@ -1365,29 +1406,162 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
         SummarizeNodeDef(func_node));
   }
 
-  // TODO(b/120991525, b/120986912): We need to lower `If` and `While` nodes to
-  // `Switch` nodes after function inlining (one more PRE_PLACEMENT pass?), but
-  // because of the reason described above we are not sure that it's safe, for
-  // now just disable inlining functions with functional control flow.
-  const auto is_functional_ctrl_flow_op = [](const NodeDef& node) {
-    return IsIf(node) || IsWhile(node);
-  };
-  if (absl::c_any_of(func.node_def(), is_functional_ctrl_flow_op)) {
-    return errors::FailedPrecondition(
-        "Can't inline function with `If` or `While` nodes in the function "
-        "body: ",
-        SummarizeNodeDef(func_node));
+  return Status::OK();
+}
+
+// Checks that all side-effects will be executed in well defined order. We do it
+// by checking if there is a path from stateful/dataset ops to one of the
+// control output nodes.
+Status CheckThatSideEffectsWillExecute(
+    const FunctionOptimizerContext& ctx,
+    const GraphTopologyView& graph_topo_view,
+    const absl::flat_hash_set<string> control_output_nodes) {
+  // In aggressive mode we just print a warning for side-effectful nodes that
+  // might not be executed after inlining.
+  const bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+
+  for (const NodeDef& func_body_node : graph_topo_view.graph()->node()) {
+    const bool node_must_execute =
+        IsDataset(func_body_node) ||
+        IsStateful(func_body_node, &ctx.function_library());
+
+    // If op has DT_RESOURCE argument it will be marked as stateful, though if
+    // it only reads from that resource, it's allowed to prune it, because it
+    // can't produce any visible side-effects.
+    const bool read_only = IsReadVariableOp(func_body_node);
+
+    if (read_only || !node_must_execute) continue;
+
+    VLOG(3) << "Check that node " << func_body_node.name()
+            << " will execute after inlining.";
+    bool will_execute = false;
+
+    // Check if we reached one of the output nodes.
+    const auto callbacks = DfsCallbacks::PreOrder([&](const NodeDef* node) {
+      if (control_output_nodes.contains(node->name())) {
+        VLOG(4) << "Found a path to control output node: " << node->name();
+        will_execute = true;
+      }
+    });
+
+    // Stop if we already proved that node will execute.
+    const auto predicates = DfsPredicates::Enter(
+        [&](const NodeDef* node) { return !will_execute; });
+
+    DfsTraversal(graph_topo_view, {&func_body_node},
+                 TraversalDirection::kFollowOutputs, predicates, callbacks);
+
+    if (!will_execute) {
+      const string error_message = absl::StrCat(
+          "Can't guarantee execution of a side-effectful node, that is not "
+          "reachable from function outputs. Function body node: ",
+          SummarizeNodeDef(func_body_node));
+
+      if (aggressive) {
+        LOG(WARNING) << error_message;
+      } else {
+        return errors::Internal(error_message);
+      }
+    }
   }
 
   return Status::OK();
 }
 
+Status PlaceInlinedFunctionBody(
+    const NodeDef& func_node, const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, int>& input_placeholders_idx,
+    FunctionOptimizerContext* ctx, GraphDef* placed_graph_def) {
+  // Control flow lowering and Placer works with a Graph object.
+  std::unique_ptr<Graph> func_body_graph =
+      absl::make_unique<Graph>(ctx->function_library());
+
+  GraphConstructorOptions opts;
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(opts, item.graph, func_body_graph.get()));
+
+  // ------------------------------------------------------------------------ //
+  // Grappler receives the graph after PRE_PLACEMENT, Placer, and POST_PLACEMENT
+  // passes, so each node has a valid device assignment. Also V2 control
+  // flow ops (functional If and While) should have been lowered to V1 control
+  // flow (Switch and Merge nodes). To keep the graph valid for execution we
+  // must assign device to every inlined graph node, and also lower the control
+  // flow.
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = &func_body_graph;
+  opt_options.flib_def = ctx->mutable_function_library();
+
+  // TODO(ezhulenev): Should we run full PRE_PLACEMENT pass here? And
+  // POST_PLACEMENT after placer?
+  LowerIfWhilePass pass;
+  TF_RETURN_IF_ERROR(pass.Run(opt_options));
+
+  // ------------------------------------------------------------------------ //
+  // Before placing the function body nodes we pin input placeholders to the
+  // same device as their corresponding input nodes.
+
+  for (Node* func_body_node : func_body_graph->nodes()) {
+    const auto input_placeholder_idx =
+        input_placeholders_idx.find(func_body_node->name());
+
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
+      const int input_idx = input_placeholder_idx->second;
+      const GraphView::OutputPort output_port =
+          ctx->graph_view().GetRegularFanin({&func_node, input_idx});
+
+      VLOG(3) << "Pin inlined function input node '" << func_body_node->name()
+              << "' to the '" << output_port.node->device() << "' device.";
+      func_body_node->set_requested_device(output_port.node->device());
+    }
+  }
+
+  // ------------------------------------------------------------------------ //
+  // After placing nodes corresponding to the function inputs, we need to assign
+  // device placements to all other function body nodes.
+
+  const DeviceSet* devices = ctx->devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    for (Node* func_body_node : func_body_graph->nodes()) {
+      func_body_node->set_requested_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(func_body_graph.get(), devices, default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+  }
+
+  // Convert Graph back to the placed GraphDef.
+  func_body_graph->ToGraphDef(placed_graph_def);
+
+  return Status::OK();
+}
+
 Status InlineIndirectFunctionCall(const NodeDef& func_node,
                                   const FunctionDef& func,
-                                  const int graph_def_version,
                                   FunctionOptimizerContext* ctx,
                                   GraphDef* optimized_graph) {
   VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
+  VLOG(4) << "Inlined function definition: " << DebugString(func);
   TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
 
   const AttrSlice func_instantiation_attr =
@@ -1396,7 +1570,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx->function_library(),
-                                                graph_def_version, &item);
+                                                ctx->graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1449,7 +1623,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   VLOG(3) << "Happens after set (size = " << happens_after.size()
           << "): " << absl::StrJoin(happens_after, ", ");
 
-  // Regular (positional) inputs to the function call.
+  // Regular (data) inputs to the function call.
   std::vector<SafeTensorId> inputs;
   for (const string& input : func_node.input()) {
     SafeTensorId tensor_id = ParseTensorName(input);
@@ -1457,26 +1631,6 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     inputs.push_back(tensor_id);
   }
 
-  // If we have a node inside the function body without inputs (e.g. Const), we
-  // must attach a control dependency to it, to make sure that if a function
-  // call happens inside a loop, the node will be evaluated in correct frame.
-  //
-  // If the function call node has no inputs and no control dependencies, it
-  // means that it can't be a function call inside a loop, and we can safely
-  // insert that node without inputs into the main graph.
-  //
-  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
-  // the function is called inside a loop.
-  std::vector<string> empty_inputs_hook;
-  if (!item.inputs().empty()) {
-    const InputArgExpansion& arg0 = item.inputs()[0];
-    DCHECK(!arg0.placeholders.empty());
-    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
-        arg0.placeholders[0], /*prefix=*/func_node.name())));
-  } else if (!happens_before.empty()) {
-    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
-  }
-
   // Mapping from input placeholder name to function input position.
   absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
@@ -1489,71 +1643,109 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   const string prefix = strings::StrCat(func_node.name(), "/");
 
   // ------------------------------------------------------------------------ //
-  // Before placing the function body nodes we pin input placeholders to the
-  // same device as their corresponding input nodes.
-
-  for (NodeDef& func_body_node : *item.graph.mutable_node()) {
-    const auto input_placeholder_idx =
-        input_placeholders_idx.find(func_body_node.name());
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<string, string> output_tensors;
 
-    if (input_placeholder_idx != input_placeholders_idx.end()) {
-      const int input_idx = input_placeholder_idx->second;
-      const GraphView::OutputPort output_port =
-          ctx->graph_view().GetRegularFanin({&func_node, input_idx});
+  // Unique names of nodes producing tensors in `output_tensors`.
+  absl::flat_hash_set<string> output_tensors_nodes;
 
-      VLOG(3) << "Pin inlined function input node '" << func_body_node.name()
-              << "' to the '" << output_port.node->device() << "' device.";
-      func_body_node.set_device(output_port.node->device());
+  // Identity nodes added to the function body in place of function outputs.
+  absl::flat_hash_set<string> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
     }
   }
 
-  // ------------------------------------------------------------------------ //
-  // After placing nodes corresponding to the function inputs, we need to assign
-  // device placements to all other function body nodes.
-
-  GraphDef placed_graph_def;
+  for (const NodeDef& func_body_node : item.graph.node()) {
+    const string& node_name = func_body_node.name();
 
-  const DeviceSet* devices = ctx->devices();
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      const string& output_tensor = func_body_node.input(0);
+      output_tensors.emplace(node_name, output_tensor);
 
-  if (devices->devices().empty()) {
-    // If there are no devices available for placer, we just put all nodes to
-    // the same device as a function caller node. This can happen if Grappler is
-    // running "offline", without active runtime session, for example as a part
-    // of a batch job for graph analysis/optimization.
-    VLOG(3) << "Assign function call node device to all function body nodes. "
-            << "Device: " << func_node.device();
-    placed_graph_def = item.mutable_function_body();
-    for (NodeDef& node : *placed_graph_def.mutable_node()) {
-      node.set_device(func_node.device());
+      SafeTensorId tensor_id = ParseTensorName(output_tensor);
+      output_tensors_nodes.insert(tensor_id.node());
     }
-  } else {
-    // If we are running in an active runtime session, Grappler will get the
-    // graph after initial placing is done, and we should have devices for the
-    // placer.
-    VLOG(3) << "Run placer for instantiated function body. Devices: ["
-            << absl::StrJoin(
-                   devices->devices(), ", ",
-                   [](string* out, const Device* d) { out->append(d->name()); })
-            << "]";
-
-    // Construct a Graph object from the instantiated function body.
-    GraphConstructorOptions opts;
-    Graph graph(ctx->function_library());
-    TF_RETURN_IF_ERROR(
-        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+  }
 
-    // Use function caller node device as a default for placer.
-    const Device* default_device =
-        devices->FindDeviceByName(func_node.device());
+  // ------------------------------------------------------------------------ //
+  // IMPORTANT: Actual inputs will be added to the following nodes at the very
+  // last stage, because we don't want to have invalid edges in a function body
+  // graph (control edges that depend on the nodes in the "outer" optimized
+  // graph).
+
+  // If one of the function inputs is a dead tensor, we must not execute any of
+  // the function body nodes, and let the dead tensor flag propagate through the
+  // inlined function body. We add NoOp inputs_ready node, and add control edges
+  // to it from all input nodes. Inlined function arguments (Identity nodes)
+  // will have a control dependency on it.
+  //
+  // TODO(ezhulenev): We do not need to provide this guarantee for ALL nodes in
+  // the function body. We must only ensure that we do not generate observable
+  // side effects.
+  //
+  // If the function call node has incoming control edges, we will update them
+  // to use this node as destination, to ensure side-effects execution order.
+  NodeDef* inputs_ready_node = nullptr;
+  if (func_node.input_size() > 0) {
+    inputs_ready_node = item.graph.add_node();
+    inputs_ready_node->set_op("NoOp");
+    inputs_ready_node->set_name(kInputsReadyNodeName);
+  }
+
+  // All nodes that have a control edge from the function call node, will be
+  // updated to have a control edge from 'side_effects_executed_node`. This node
+  // will have control edges from all function control outputs (see
+  // `control_ret` in FunctionDef). This a "barrier" that guarantees that all
+  // ops with side effects in the function body were executed
+  //
+  // If the function call node has no outgoing control edges, it means that no
+  // one is interested in the function side-effect affecting captured resources.
+  //
+  // If node is in keep_ops set, it means that it must execute. This could
+  // happen if the graph is an instantiation of a function with control output.
+  NodeDef* side_effects_executed_node = nullptr;
+  if (!happens_after.empty() || ctx->IsKeepOp(func_node.name())) {
+    side_effects_executed_node = item.graph.add_node();
+    side_effects_executed_node->set_op("NoOp");
+    side_effects_executed_node->set_name(kSideEffectsExecutedNodeName);
+  }
 
-    Placer placer(&graph, devices, nullptr, /* No session options */
-                  default_device);
-    TF_RETURN_IF_ERROR(placer.Run());
+  // If function executed only for the regular data outputs, it's totally safe
+  // to prune side-effects. If side-effects order is important, it must be
+  // captured at graph construction time via control edges.
+  if (item.control_output_size() > 0 && happens_after.empty()) {
+    VLOG(2) << "Function has control outputs and empty happens after set.";
+  }
 
-    // Convert Graph back to the GraphDef.
-    graph.ToGraphDef(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (inputs_ready_node != nullptr) {
+    empty_inputs_hook.push_back(inputs_ready_node->name());
   }
 
+  // ------------------------------------------------------------------------ //
+  // Grappler called after PRE_PLACEMENT and PLACEMENT passes, so we have to
+  // make sure that after inlining all nodes will have valid device assignment.
+
+  GraphDef placed_graph_def;
+  TF_RETURN_IF_ERROR(PlaceInlinedFunctionBody(
+      func_node, item, input_placeholders_idx, ctx, &placed_graph_def));
+
   // ------------------------------------------------------------------------ //
   // After all nodes placed we need to prepare them for inlining into the
   // optimized graph: turn placeholders into identities, update nodes
@@ -1577,18 +1769,36 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
       const int input_idx = input_placeholder_idx->second;
       func_body_node.add_input(inputs[input_idx].ToString());
 
-      // All side effects must happen before inputs can start executing.
-      for (const string& hb_node : happens_before) {
-        func_body_node.add_input(AsControlDependency(hb_node));
+      // Add a control dependency on 'inputs_ready' node, to guarantee that all
+      // inputs are alive and all side-effects executed before function body.
+      if (inputs_ready_node) {
+        func_body_node.add_input(
+            AsControlDependency(inlined_node_name(inputs_ready_node->name())));
       }
-
     } else {
       // Update inputs of the regular function body nodes.
       for (string& input : *func_body_node.mutable_input()) {
         input = inlined_node_name(input);
       }
-      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
-        *func_body_node.add_input() = empty_inputs_hook[0];
+
+      // Check if we need to ensure node execution in correct loop frame.
+      bool node_needs_empty_inputs_hook =
+          // We have a node to hook and node has no inputs.
+          !empty_inputs_hook.empty() && func_body_node.input_size() == 0 &&
+          // Inputs ready node will always have edge from main graph. If
+          // function call has no regular and control inputs, we will not add
+          // inputs_ready node to the function body graph.
+          node_name != kInputsReadyNodeName &&
+          // The node acting as a return barrier for execution of side effects
+          // might not have any inputs (in case function has no control outputs,
+          // but we still added it because of non-empty happens-after set), so
+          // we must make sure it's executed in correct frame.
+          (node_name != kSideEffectsExecutedNodeName ||
+           item.control_output_size() == 0);
+
+      if (node_needs_empty_inputs_hook) {
+        *func_body_node.add_input() =
+            AsControlDependency(inlined_node_name(empty_inputs_hook[0]));
       }
     }
 
@@ -1606,128 +1816,236 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     AddDefaultsToNodeDef(*op_def, &func_body_node);
   }
 
-  // Construct a graph view for the preprocessed function body graph.
-  GraphView placed_graph_view(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // Check that after inlining all side-effects will be executed in well defined
+  // order. We do it by checking if there is a path from stateful/dataset ops to
+  // one of the output nodes.
+
+  // Because we rename all the nodes before inlining, we need a copy of
+  // output_nodes with a new names.
+  absl::flat_hash_set<string> inlined_output_nodes;
+  for (const string& output_node : output_nodes) {
+    inlined_output_nodes.insert(inlined_node_name(output_node));
+  }
+  const auto is_inlined_output_node = [&](const NodeDef& node) -> bool {
+    return inlined_output_nodes.find(node.name()) != inlined_output_nodes.end();
+  };
 
-  // Keep track of side-effectful ops inside function body. Each outgoing
-  // control edge from the function call node, must be replaced with control
-  // edges from inlined side-effectful ops.
-  std::vector<string> side_effectful_nodes;
+  // Names of the inlined control output nodes.
+  absl::flat_hash_set<string> inlined_control_output_nodes;
+  for (const ControlOutput& control_output : item.control_outputs()) {
+    inlined_control_output_nodes.insert(
+        inlined_node_name(control_output.node_name));
+  }
 
-  // We have to make sure that all side-effectful and dataset-output nodes
-  // inside a function body will be executed after function inlining.
-  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    const bool node_must_execute =
-        !IsFreeOfSideEffect(func_body_node, &ctx->function_library()) ||
-        IsDataset(func_body_node);
-
-    if (node_must_execute) {
-      int num_fanouts = placed_graph_view.NumFanouts(
-          func_body_node, /*include_controlled_nodes=*/true);
-
-      // If the node doesn't have any outgoing edges and we do not have any
-      // nodes in the `happens_after` set, we can't inline a function and
-      // guarantee that it will be executed. The only exception if we do
-      // function library optimization, and the GrapplerItem was instantiated
-      // for the function body, because functions do not prune these ops.
-
-      if (num_fanouts == 0 && happens_after.empty() &&
-          !ctx->optimization_options().is_function_instantiation) {
-        return errors::Internal(
-            "Can't inline a function with a side-effectful op with empty "
-            "fanouts and empty output control edge set. Function body node: ",
-            SummarizeNodeDef(func_body_node));
+  // Construct a graph topology view for DFS traversals (skip invalid edges for
+  // input nodes connected to nodes in the optimized graph).
+  GraphTopologyView placed_topo_view(/*skip_invalid_edges=*/true);
+  TF_RETURN_IF_ERROR(placed_topo_view.InitializeFromGraph(placed_graph_def));
+  TF_RETURN_IF_ERROR(CheckThatSideEffectsWillExecute(
+      *ctx, placed_topo_view, inlined_control_output_nodes));
+
+  // ------------------------------------------------------------------------ //
+  // Move all the nodes to the optimized graph after successful preprocessing.
+
+  if (inputs_ready_node != nullptr) {
+    string inlined_node = inlined_node_name(inputs_ready_node->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
+
+    absl::flat_hash_set<string> input_nodes;
+    for (const string& input : func_node.input()) {
+      SafeTensorId tensor = ParseTensorName(input);
+
+      // Input node might have been a function call that was already inlined.
+      auto it = ctx->tensor_mapping().find(tensor);
+      while (it != ctx->tensor_mapping().end()) {
+        tensor = it->second;
+        it = ctx->tensor_mapping().find(tensor);
       }
 
-      side_effectful_nodes.push_back(func_body_node.name());
+      if (input_nodes.insert(tensor.node()).second) {
+        placed_graph_def.mutable_node(*node_idx)->add_input(
+            AsControlDependency(tensor.node()));
+      }
     }
   }
 
-  // Identity nodes added to the function body in place of function outputs.
-  absl::flat_hash_set<string> output_nodes;
-  for (const OutputArgExpansion& output_arg : item.outputs()) {
-    for (const string& output_node : output_arg.output_nodes) {
-      output_nodes.insert(inlined_node_name(output_node));
+  if (side_effects_executed_node != nullptr) {
+    string inlined_node = inlined_node_name(side_effects_executed_node->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
+
+    // Add control edges from all control output nodes.
+    for (const string& node_name : inlined_control_output_nodes) {
+      placed_graph_def.mutable_node(*node_idx)->add_input(
+          AsControlDependency(node_name));
     }
-  }
 
-  // For each function output value we added an identity node that reads the
-  // tensor from one of the function body nodes. When we inline function into
-  // the main graph we want to bypass these nodes, so we keep a mapping from
-  // 'output node name' -> 'output tensor name'.
-  absl::flat_hash_map<string, string> output_tensors;
+    // Forward all control dependencies in the optimized graph to the new node.
+    ctx->AddControlOverrides(func_node, {inlined_node});
+  }
 
-  // Move all the nodes to the optimized graph after successful preprocessing.
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    const string& node_name = func_body_node.name();
-
-    // Skip output identity node, and add a mapping to the output tensor.
-    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
-      output_tensors.emplace(node_name, func_body_node.input(0));
+    // Skip output identity nodes.
+    if (IsIdentity(func_body_node) && is_inlined_output_node(func_body_node))
       continue;
-    }
 
     optimized_graph->add_node()->Swap(&func_body_node);
   }
 
-  DCHECK(output_tensors.size() == item.output_size())
-      << "Each function output must be mapped to an output tensor";
-
-  // TODO(ezhulenev): Inline nested indirect function calls.
-
   // Indirect function call is fully inlined into the optimized graph, and we do
   // not copy the original function call node, so we have to setup tensor
   // mapping from old output tensors, to the outputs of inlined nodes.
   int output_idx = 0;
   for (const OutputArgExpansion& output : item.outputs()) {
     for (const string& output_node : output.output_nodes) {
-      const string inlined_output = inlined_node_name(output_node);
-      const string& output_tensor = output_tensors.at(inlined_output);
+      const string& output_tensor = output_tensors.at(output_node);
 
       const SafeTensorId from_tensor(func_node.name(), output_idx++);
       const SafeTensorId to_tensor = ParseTensorName(output_tensor);
 
-      ctx->AddTensorMapping(from_tensor, to_tensor);
+      const SafeTensorId inlined_to_tensor =
+          SafeTensorId(absl::StrCat(func_node.name(), "/", to_tensor.node()),
+                       to_tensor.index());
+
+      ctx->AddTensorMapping(from_tensor, inlined_to_tensor);
     }
   }
 
-  // After inlining we'll have to forward all control dependencies from function
-  // call node to all side-effectful ops inside function body.
-  ctx->AddControlOverrides(func_node, side_effectful_nodes);
+  // If function call node was in keep_ops set, it means that we need to keep a
+  // node with the same name in the optimized graph. We forward all data
+  // consumers to inlined nodes, and we verify that the node is not in a fetch
+  // set, so it's safe to assume that the function call node is only required
+  // for a control edge source.
+  if (ctx->IsKeepOp(func_node.name())) {
+    VLOG(4) << "Add NoOp for inlined function in keep ops set.";
+    NodeDef* keep_func_node = optimized_graph->add_node();
+    keep_func_node->set_op("NoOp");
+    keep_func_node->set_name(func_node.name());
+    keep_func_node->set_device(func_node.device());
+    keep_func_node->add_input(
+        AsControlDependency(inlined_node_name(kSideEffectsExecutedNodeName)));
+  }
 
   VLOG(3) << "Successfully inlined indirect function call: "
           << SummarizeNodeDef(func_node);
+
   return Status::OK();
 }
 
-}  // namespace
+// Restores graph invariants after function specialization and inlining: all
+// inputs must be connected to valid nodes.
+Status RestoreGraphInvariants(const FunctionOptimizerContext& ctx,
+                              GraphDef* optimized_graph) {
+  // After function specialization and inlining graph might be in invalid
+  // state, and some nodes can read tensors that do not exists anymore in the
+  // optimized graph: function call node was fully inlined into the graph, or
+  // output index was invalidated by the output pruning.
 
-Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
-                                   GraphDef* optimized_graph) {
-  // Nothing to do here.
-  if (item.graph.library().function_size() == 0) {
-    *optimized_graph = item.graph;
-    return Status::OK();
+  if (!ctx.tensor_mapping().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        TensorId input_tensor = ParseTensorName(node.input(idx));
+        if (input_tensor.index() == Graph::kControlSlot) break;
+
+        auto mapping = ctx.tensor_mapping().find(input_tensor);
+        if (mapping != ctx.tensor_mapping().end()) {
+          node.set_input(idx, mapping->second.ToString());
+        }
+      }
+    }
+  }
+
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the control outputs
+  // node (it's also possible to rewrite singe control edge into multiple edges
+  // to inlined side-effectful nodes).
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      absl::flat_hash_set<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
+        }
+      }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
+    }
   }
 
-  FunctionOptimizerContext ctx(opt_level_, item);
+  return Status::OK();
+}
+
+}  // namespace
+
+Status FunctionOptimizer::RunFunctionOptimizerPass(
+    const GrapplerItem& item, const GraphDef& graph, const int iteration,
+    std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+    bool* graph_has_unoptimized_function_calls) const {
+  VLOG(3) << absl::Substitute(
+      "Run function optimizer pass (iteration = $0): grappler_item_id = $1",
+      iteration, item.id);
+
+  FunctionOptimizerContext ctx(item, opt_level_, graph);
 
   bool inline_gradients = options_.enable_symbolic_gradient_inlining;
   bool inline_func = options_.enable_function_inlining;
   bool specialize_func = options_.enable_function_specialization;
 
-  for (const NodeDef& node : item.graph.node()) {
+  // We will process all the nodes in topological order, to correctly handle
+  // inlining of function call chains.
+  std::vector<const NodeDef*> topo_ordered_nodes;
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(graph, &topo_ordered_nodes));
+
+  for (const NodeDef* node : topo_ordered_nodes) {
     // Each node optimization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
     const auto is_graph_modified = [&]() {
       int num_nodes = optimized_graph->node_size();
-      CHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
+      DCHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
       return num_nodes > num_nodes_before;
     };
 
-    // Add a copy of an input graph node to the optimized graph.
-    const auto add_node_copy = [&]() { *optimized_graph->add_node() = node; };
+    // Copy node from the `graph` to the `optimized_graph`.
+    const auto copy_node = [&]() { *optimized_graph->add_node() = *node; };
+
+    // If we already failed to optimize this node during one of the previous
+    // passes, we just give up, and do not try on more time.
+    if (skip_nodes->find(node->name()) != skip_nodes->end()) {
+      VLOG(3) << "Skip optimization for node: " << node->name();
+      copy_node();
+      continue;
+    }
 
 // Skip errors if optimized graph was not modified before error happened.
 #define TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(...)                     \
@@ -1737,7 +2055,8 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       return _status;                                              \
     if (TF_PREDICT_FALSE(!_status.ok() && !is_graph_modified())) { \
       VLOG(3) << "Skip error: " << _status.error_message();        \
-      add_node_copy();                                             \
+      skip_nodes->insert(node->name());                            \
+      copy_node();                                                 \
     }                                                              \
   } while (0)
 
@@ -1745,16 +2064,19 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
     // 1. Inline symbolic gradients into the optimized graph.                 //
     // ---------------------------------------------------------------------- //
 
-    if (IsSymbolicGradient(node) && inline_gradients) {
+    if (IsSymbolicGradient(*node) && inline_gradients) {
       // Inline symbolic gradients only if the corresponding function is not
       // marked as `_noinline`.
-      const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
+      const auto* f_attr = gtl::FindOrNull(node->attr(), "f");
       const string f_name = f_attr != nullptr ? f_attr->func().name() : "";
       const FunctionDef* func = ctx.function_library().Find(f_name);
       if (func && !MarkedNoInline(*func)) {
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            InlineSymbolicGradient(node, &ctx, optimized_graph));
+            InlineSymbolicGradient(*node, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip SymbolicGradient inlining: function=" << f_name;
+        skip_nodes->insert(node->name());
       }
     }
 
@@ -1763,44 +2085,45 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
     // ---------------------------------------------------------------------- //
 
     // Find if a node is a function call (direct or indirect).
-    const FunctionDef* func = FindFunctionCall(ctx, node);
+    const FunctionDef* func = FindFunctionCall(ctx, *node);
 
     if (func != nullptr) {
       const string& func_name = func->signature().name();
-      const int graph_def_version = item.graph.versions().producer();
 
-      const bool is_direct_func = IsDirectFunctionCall(*func, node);
-      const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
+      const bool is_direct_func = IsDirectFunctionCall(*func, *node);
+      const bool is_indirect_func = IsIndirectFunctionCall(*func, *node);
 
       // 2a. Inline direct function call if it's inlinable.
       if (inline_func && is_direct_func) {
-        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
+        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, *node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
-              node, *func, graph_def_version, ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineDirectFunctionCall(*node, *func, ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node->name());
         }
       }
 
       // 2b. Inline indirect function call if it's inlinable.
       if (inline_func && is_indirect_func) {
-        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
+        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, *node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
-              node, *func, graph_def_version, &ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineIndirectFunctionCall(*node, *func, &ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node->name());
         }
       }
 
       // 2c. Specialize it to its instantiation context if can't be inlined,
       // and it has something worth specializing.
       bool specialization_worthy = IsParametrized(*func) ||
-                                   HasTrulyConstInputs(node, ctx) ||
-                                   HasUnusedOutputs(node, *func, ctx);
+                                   HasTrulyConstInputs(*node, ctx) ||
+                                   HasUnusedOutputs(*node, *func, ctx);
 
       // Do not specialize if function has custom gradient.
       const string grad_func = ctx.function_library().FindGradient(func_name);
@@ -1809,95 +2132,95 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            SpecializeFunction(node, *func, item.graph.versions().producer(),
-                               &ctx, optimized_graph));
+            SpecializeFunction(*node, *func, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip function specialization: " << func->signature().name();
+        skip_nodes->insert(node->name());
       }
     }
 
     // ---------------------------------------------------------------------- //
     // If we reached this point, node was not handled by any of the stages
-    // (inline, specialize), simply add a copy to the graph.
-    add_node_copy();
+    // (inline, specialize), simply copy the node to the optimized graph.
+    copy_node();
 
 #undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
-  // After function specialization and inlining graph might be in invalid
-  // state, and some nodes can read tensors that do not exists anymore in the
-  // optimized graph: function call node was fully inlined into the graph, or
-  // output index was invalidated by the output pruning.
+  TF_RETURN_IF_ERROR(RestoreGraphInvariants(ctx, optimized_graph));
 
-  if (!ctx.tensor_mapping().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      for (int idx = 0; idx < node.input_size(); ++idx) {
-        TensorId input_tensor = ParseTensorName(node.input(idx));
-        if (input_tensor.index() == Graph::kControlSlot) break;
+  // Preserve the graph version.
+  *optimized_graph->mutable_versions() = graph.versions();
 
-        auto mapping = ctx.tensor_mapping().find(input_tensor);
-        if (mapping != ctx.tensor_mapping().end()) {
-          node.set_input(idx, mapping->second.ToString());
-        }
-      }
+  // Prune unreachable function from the library.
+  if (options_.enable_trim_function_library) {
+    *optimized_graph->mutable_library() =
+        PruneFunctionLibrary(ctx.function_library(), *optimized_graph);
+  } else {
+    *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  }
+
+  // Before returning we check if after single optimization pass we have more
+  // unoptimized function calls.
+  *graph_has_unoptimized_function_calls = false;
+  for (const NodeDef& node : optimized_graph->node()) {
+    // Check if we can inline symbolic gradient.
+    if (IsSymbolicGradient(node) && inline_gradients &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
     }
-  }
 
-  // Function inlining instantiates function body directly into the optimized
-  // graph, and we might end up with control dependencies to the nodes that no
-  // longer exist in a graph. We need to apply control overrides to all
-  // invalidated nodes, and rewire control dependencies to the inlined
-  // side-effectful function body nodes.
+    // Check if after inlining we have unoptimized function calls.
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+    if (func != nullptr && !MarkedSpecialized(*func) &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
+    }
+  }
 
-  // TODO(ezhulenev): With nested function call inlining, single pass over
-  // `control_overrides` might not bring the graph into a valid state,
-  // continue until it converges and all invalidated control dependencies
-  // removed.
+  return Status::OK();
+}
 
-  if (!ctx.control_overrides().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      // Keep track of new control inputs to the node.
-      absl::flat_hash_set<string> add_ctrl_inputs;
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
+                                   GraphDef* optimized_graph) {
+  // Nothing to do here.
+  if (item.graph.library().function_size() == 0) {
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
 
-      // Remove all invalidated control inputs.
-      for (int idx = 0; idx < node.input_size(); /* see below */) {
-        // TODO(ezhulenev): Use non-allocating TensorId after migrating
-        // `control_overrides()` to absl::flat_hash_set.
-        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+  // Do not retry failed function inlining or specialization.
+  std::unordered_set<string> skip_nodes;
+  bool graph_has_unoptimized_function_calls = false;
 
-        auto overrides = ctx.control_overrides().find(input_tensor.node());
-        if (overrides != ctx.control_overrides().end()) {
-          // If this happens it's a bug in the function inlining.
-          if (input_tensor.index() != Graph::kControlSlot) {
-            return errors::Internal(
-                "Illegal input edge from inlined function call node");
-          }
-          // Remove control dependency to the inlined function call node.
-          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
-          node.mutable_input()->RemoveLast();
+  // We'll keep running function optimizer pass until we inlined and optimized
+  // all function call nodes.
+  int iteration = 0;
+  constexpr int kMaxIterations = 50;
 
-          // Keep track of all overrides.
-          for (const string& override : overrides->second) {
-            add_ctrl_inputs.insert(AsControlDependency(override));
-          }
-        } else {
-          // Go to the next input only if the current one was not invalidated,
-          // otherwise we need to check the swapped input as well.
-          ++idx;
-        }
-      }
+  // 1. Run first optimizer pass with GrapplerItem.graph.
+  TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+      item, item.graph, 0, &skip_nodes, optimized_graph,
+      &graph_has_unoptimized_function_calls));
 
-      // Add overrides to the node inputs.
-      for (const string& ctrl_input : add_ctrl_inputs) {
-        node.add_input(ctrl_input);
-      }
+  // 2. If after function inlining we have unoptimized function calls, we have
+  // to run function optimization pass one more time.
+  while (graph_has_unoptimized_function_calls) {
+    if (iteration++ > kMaxIterations) {
+      VLOG(1) << "Break function optimizer loop at iteration #" << iteration;
+      break;
     }
-  }
 
-  *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() =
-      options_.enable_trim_function_library
-          ? PruneFunctionLibrary(ctx.function_library(), *optimized_graph)
-          : ctx.function_library().ToProto();
+    GraphDef workspace_graph;
+    workspace_graph.Swap(optimized_graph);
+
+    TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+        item, workspace_graph, iteration, &skip_nodes, optimized_graph,
+        &graph_has_unoptimized_function_calls));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 4352555064c43c8db40157ace2fca9479907df8e..ab90281509fc1f4a80a82bd6e1ab830e22200838 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -48,6 +48,16 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_trim_function_library = true;
   };
 
+  // Runs a single function optimizer pass over the `graph`. All nodes that are
+  // not function calls will be copied from the `graph` to the
+  // `optimized_graph`. Function call nodes inlined or specialized, and
+  // instantiated function body or specialized function call nodes will be added
+  // to the `optimized_graph`.
+  Status RunFunctionOptimizerPass(
+      const GrapplerItem& item, const GraphDef& graph, const int iteration,
+      std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+      bool* graph_has_unoptimized_function_calls) const;
+
   RewriterConfig::Toggle opt_level_;
   FunctionOptimizerOptions options_;
 };
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 827b0658c5b01264e4241d564402840753d3f46d..83f9468e3f392a4989a98126b03072d2db4ed185 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+
+#include "absl/algorithm/container.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -639,14 +641,20 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
   EXPECT_EQ("SymbolicGradient", output.node(3).name());
   EXPECT_EQ("SymbolicGradient/SymbolicGradient/Identity",
             output.node(4).name());
-  EXPECT_EQ("SymbolicGradient/Func/_0", output.node(5).name());
-  EXPECT_EQ("SymbolicGradient/Func/_1", output.node(6).name());
-  EXPECT_EQ("SymbolicGradient/Func/_2", output.node(7).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/input/_0",
+            output.node(5).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/input/_1",
+            output.node(6).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/output/_2",
+            output.node(7).name());
   EXPECT_EQ("SymbolicGradient/SymbolicGradient/Func/_1/dx",
             output.node(8).name());
-  EXPECT_EQ("SymbolicGradient/Func/_3", output.node(9).name());
-  EXPECT_EQ("SymbolicGradient/Func/_4", output.node(10).name());
-  EXPECT_EQ("SymbolicGradient/Func/_5", output.node(11).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/input/_3",
+            output.node(9).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/input/_4",
+            output.node(10).name());
+  EXPECT_EQ("SymbolicGradient/Func/SymbolicGradient/Func/_1/output/_5",
+            output.node(11).name());
   EXPECT_EQ("out", output.node(12).name());
   for (int i = 2; i < 4; ++i) {
     EXPECT_EQ("IdentityN", output.node(i).op());
@@ -734,8 +742,11 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionSimpleFunction) {
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Function must be inlined and all nodes placed on a valid device.
-       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
        NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
 
        NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
@@ -760,7 +771,7 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
 
   const Tensor kOne = test::AsScalar<float>(1.0);
   const Tensor kTwo = test::AsScalar<float>(2.0);
@@ -774,9 +785,11 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
         "AssignAddVariableOp",
         {"v", "one:output:0"},
         {{"dtype", DT_FLOAT}}},
-       {{"mul"}, "Mul", {"x", "y", "^add"}, {{"T", "$T"}}}},
+       {{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
       /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
+      {{"z", "mul:z:0"}},
+      /* Control output to ensure that side effects will be executed. */
+      {{"size_effects", "add"}});
 
   // Build a graph to compute:
   //   a = Placeholder
@@ -831,36 +844,49 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
             kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+       NDef("f1/inputs_ready", "NoOp", {"^a", "^b", "^v", "^init_v"}, {},
+            kDevice),
+
+       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
             kDevice),
-       NDef("f1/one", "Const", {"^f1/x"},
+       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f1/v", "Identity", {"v:0", "^f1/inputs_ready"},
+            {{"T", DT_RESOURCE}}, kDevice),
+
+       NDef("f1/one", "Const", {"^f1/inputs_ready"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("f1/side_effects_executed", "NoOp", {"^f1/add"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call.
-       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
-            kDevice),
-       NDef("f2/one", "Const", {"^f2/x"},
+       NDef("f2/inputs_ready", "NoOp",
+            {"^v", "^f1/mul", "^f1/side_effects_executed"}, {}, kDevice),
+
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f2/inputs_ready"},
+            {{"T", DT_RESOURCE}}, kDevice),
+
+       NDef("f2/one", "Const", {"^f2/inputs_ready"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
-       NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
-            kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("f2/side_effects_executed", "NoOp", {"^f2/add"}, {}, kDevice),
 
        // Return values read directly from inlined nodes.
        NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+       NDef("out_2", "ReadVariableOp",
+            {"v", "^f1/side_effects_executed", "^f2/side_effects_executed"},
             {{"dtype", DT_FLOAT}}, kDevice)},
 
       // Function library.
@@ -926,8 +952,11 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
 
        // Function must be inlined and `mul` node placed on a requested device,
        // and input `Identity` nodes must be colocated with their source nodes.
-       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu0),
-       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, cpu0),
+       NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            cpu0),
+       NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            cpu1),
        NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
 
        NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
@@ -937,7 +966,8 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
   CompareGraphs(expected, optimized_graph);
 }
 
-TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+TEST_F(FunctionOptimizerTest,
+       InlineIndirectFunctionWithControlDependencyAndNoSideEffects) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
 
@@ -995,15 +1025,26 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
        NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+       // Control input from `inputs_ready` node is added to ensure correct
+       // frame execution.
+       NDef("f1/side_effects_executed", "NoOp", {"^f1/inputs_ready"}, {},
+            kDevice),
 
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call, and control dependency edge removed.
-       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/inputs_ready", "NoOp", {"^f1/mul", "^f1/side_effects_executed"},
+            {}, kDevice),
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
        NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
 
        // Return directly from inlined node of f2.
@@ -1149,8 +1190,11 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithMergedDeadTensors) {
        NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("fn/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("fn/cond", "Identity", {"b:0"}, {{"T", DT_BOOL}}, kDevice),
+       NDef("fn/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("fn/x", "Identity", {"a:0", "^fn/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/cond", "Identity", {"b:0", "^fn/inputs_ready"},
+            {{"T", DT_BOOL}}, kDevice),
        NDef("fn/switch", "Switch", {"fn/x:0", "fn/cond:0"}, {{"T", DT_FLOAT}},
             kDevice),
        NDef("fn/if_false", "Identity", {"fn/switch:0"}, {{"T", DT_FLOAT}},
@@ -1182,6 +1226,206 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithMergedDeadTensors) {
   test::ExpectTensorEqual<float>(tensors[0], tensors_expected[0]);
 }
 
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // `Square` implemented in terms of PartitionedCall to `MyMul`.
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"output:T"}, {"T: {float, double}"},
+      {{{"square"},
+        "PartitionedCall",
+        {"x", "x"},
+        {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+         {"Tout", DataTypeSlice{DT_FLOAT}},
+         {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"output", "square:output:0"}});
+
+  // Build a graph to compute:
+  //   b = Square(a)
+  //   c = Identity(b)
+  //   return c
+  GrapplerItem item;
+  item.fetch = {"c"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "PartitionedCall", {"a"},
+            {{"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MySquare", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("c", "Identity", {"b"}, {{"T", DT_FLOAT}}, kDevice)},
+      /* Function library */
+      {mul_func, square_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Inlined inputs of `b` node.
+       NDef("b/inputs_ready", "NoOp", {"^a"}, {}, kDevice),
+       NDef("b/x", "Identity", {"a:0", "^b/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Inlined inputs of `square` node inside inlined `MySquare` function.
+       NDef("b/square/inputs_ready", "NoOp", {"^b/x"}, {}, kDevice),
+       NDef("b/square/x", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("b/square/y", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       // Inlined mul node from the `MyMul` function.
+       NDef("b/square/mul", "Mul", {"b/square/x", "b/square/y"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("c", "Identity", {"b/square/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor three = test::AsScalar<float>(3.0f);
+  item.feed.emplace_back("a", three);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithFunctionalControlFlow) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef add_func = FunctionDefHelper::Create(
+      "MyAdd", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"add"}, "Add", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "add:z:0"}});
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Compute: return cond ? a + b : a * b
+  FunctionDef add_or_mul_func = FunctionDefHelper::Create(
+      "AddOrMul", {"cond:bool", "x:float", "y:float"}, {"z:float"}, {},
+      {
+          {{"if_node"},
+           "If",
+           {"cond", "x", "y"},
+           {
+               {"Tcond", DT_BOOL},
+               {"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+               {"Tout", DataTypeSlice{DT_FLOAT}},
+               {"then_branch", FDH::FunctionRef("MyAdd", {{"T", DT_FLOAT}})},
+               {"else_branch", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})},
+               {"_lower_using_switch_merge", true},
+           }},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "if_node:output:0"}});
+
+  // Build a computation graph for:
+  //   is_add: bool
+  //   a: float
+  //   b: float
+  //   c = AddOrMul(is_add, a, b)  # is_add ? a + b : a * b
+  //   d = Identity(c)
+  //   return d
+
+  // c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("is_add", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+       NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       NDef("c", "PartitionedCall", {"is_add", "a", "b"},
+            {{"Tin", DataTypeSlice{DT_BOOL, DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("AddOrMul")}},
+            kDevice),
+
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {add_or_mul_func, add_func, mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  const auto count_nodes_with_op = [&](const string& op) {
+    return absl::c_count_if(optimized_graph.node(), [&](const NodeDef& node) {
+      return node.op() == op;
+    });
+  };
+
+  // All `PartitionedCall` nodes in the optimized graph must be inlined, and
+  // `If` node must be lowered to `Switch` and `Merge` nodes.
+  EXPECT_EQ(count_nodes_with_op("PartitionedCall"), 0);
+  EXPECT_EQ(count_nodes_with_op("If"), 0);
+  EXPECT_EQ(count_nodes_with_op("Switch"), 3);
+  EXPECT_EQ(count_nodes_with_op("Merge"), 2);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+
+  Tensor one = test::AsScalar<float>(1.0);
+  Tensor two = test::AsScalar<float>(2.0);
+  Tensor three = test::AsScalar<float>(3.0);
+
+  const auto feed_args = [&](bool is_add) {
+    std::vector<std::pair<string, Tensor>> feed;
+    feed.emplace_back("a", one);
+    feed.emplace_back("b", two);
+    feed.emplace_back("is_add", test::AsScalar<bool>(is_add));
+    return feed;
+  };
+
+  {  // Check 'is_add == true': a + b
+    item.feed = feed_args(true);
+    optimized.feed = feed_args(true);
+
+    auto tensors_expected = EvaluateFetchNodes(item);
+    ASSERT_EQ(tensors_expected.size(), 1);
+    test::ExpectTensorEqual<float>(tensors_expected[0], three);
+
+    auto tensors = EvaluateFetchNodes(optimized);
+    ASSERT_EQ(tensors.size(), tensors_expected.size());
+    test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  }
+
+  {  // Check 'is_add == false': a * b
+    item.feed = feed_args(false);
+    optimized.feed = feed_args(false);
+
+    auto tensors_expected = EvaluateFetchNodes(item);
+    ASSERT_EQ(tensors_expected.size(), 1);
+    test::ExpectTensorEqual<float>(tensors_expected[0], two);
+
+    auto tensors = EvaluateFetchNodes(optimized);
+    ASSERT_EQ(tensors.size(), tensors_expected.size());
+    test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  }
+}
+
 TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index e587a2b2af74cb417ac58f672a4cc5526335d0a8..44dfe0de7890f09feb0b2cbfc450ddb9e37fc3cd 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -39,7 +39,7 @@ class GraphOptimizer {
   // Routine called to allow an algorithm to propose a rewritten graph
   // for the graph, feeds and fetches in "item" to run more efficiently
   // on "cluster".
-  // Returns true iff it managed to generate a solution, false otherwise.
+  // Returns an error status if it failed to generate a solution.
   virtual Status Optimize(Cluster* cluster, const GrapplerItem& item,
                           GraphDef* optimized_graph) = 0;
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 19dc2c8ad95ad86b9843406468163dfba5944f88..58107fa0fe38c2cc0f31d7449a0e187aa3f96dbc 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <unordered_map>
 #include <unordered_set>
+
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -169,6 +171,16 @@ class GraphOptimizerStage {
     return MakeOptimizedNodeName(node, optimizer_name_, prefix);
   }
 
+  const string UniqueOptimizedNodeName(const NodeScopeAndName& node) {
+    const string node_name = OptimizedNodeName(node);
+    return UniqueNodeName(node_name);
+  }
+  const string UniqueOptimizedNodeName(const NodeScopeAndName& node,
+                                       const string& rewrite_rule) {
+    const string node_name = OptimizedNodeName(node, rewrite_rule);
+    return UniqueNodeName(node_name);
+  }
+
   // Get a node by input name from a node map. Return an error if node was not
   // found.
   Status GetInputNode(const string& input, NodeDef** node) const {
@@ -193,10 +205,21 @@ class GraphOptimizerStage {
  protected:
   const GraphOptimizerContext& ctx() const { return ctx_; }
 
- private:  // Data members
+ private:
+  const string UniqueNodeName(absl::string_view name) {
+    string node_name = string(name);
+    while (ctx_.node_map->NodeExists(node_name)) {
+      node_name = absl::StrCat(name, "_unique",
+                               optimized_node_name_counter_.fetch_add(1));
+    }
+
+    return node_name;
+  }
+
   const string optimizer_name_;
   const string stage_name_;
   const GraphOptimizerContext ctx_;
+  std::atomic<int64> optimized_node_name_counter_ = {0};
 };
 
 template <typename Result>
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
index 799c40c67bca0ae4cdac99b59404b2942cb481b4..7b6bd3a122791139a304667384c9c84acf90796d 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/platform/test.h"
@@ -25,6 +26,9 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+
 class GraphOptimizerStageTest : public ::testing::Test {};
 
 struct FakeResult {};
@@ -44,23 +48,23 @@ class FakeOptimizerStage : public GraphOptimizerStage<FakeResult> {
   }
 };
 
-TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InRoot) {
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScopeInRoot) {
   const auto scope_and_name = ParseNodeScopeAndName("Add");
-  EXPECT_EQ("", scope_and_name.scope);
-  EXPECT_EQ("Add", scope_and_name.name);
+  EXPECT_EQ(scope_and_name.scope, "");
+  EXPECT_EQ(scope_and_name.name, "Add");
 }
 
-TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InScope) {
+TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScopeInScope) {
   const auto scope_and_name = ParseNodeScopeAndName("a/b/c/Add");
-  EXPECT_EQ("a/b/c", scope_and_name.scope);
-  EXPECT_EQ("Add", scope_and_name.name);
+  EXPECT_EQ(scope_and_name.scope, "a/b/c");
+  EXPECT_EQ(scope_and_name.name, "Add");
 }
 
 TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ nullptr,
                             /*graph_properties*/ nullptr,
-                            /*node_name*/ nullptr,
+                            /*node_map*/ nullptr,
                             /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
@@ -68,15 +72,70 @@ TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
   const auto node = ParseNodeScopeAndName("a/b/c/Add");
 
   // Without rewrite rule
-  EXPECT_EQ("a/b/c/my_opt/my_stg_Add", stage.OptimizedNodeName(node));
-  EXPECT_EQ(
-      "a/b/c/my_opt/my_stg_Add_Mul_Sqrt",
-      stage.OptimizedNodeName(node, std::vector<string>({"Mul", "Sqrt"})));
+  EXPECT_EQ(stage.OptimizedNodeName(node), "a/b/c/my_opt/my_stg_Add");
+  EXPECT_EQ(stage.OptimizedNodeName(node, std::vector<string>({"Mul", "Sqrt"})),
+            "a/b/c/my_opt/my_stg_Add_Mul_Sqrt");
+
+  // With rewrite rule
+  const string rewrite = "my_rewrite";
+  EXPECT_EQ(stage.OptimizedNodeName(node, rewrite),
+            "a/b/c/my_opt/my_stg_my_rewrite_Add");
+}
+
+TEST_F(GraphOptimizerStageTest, UniqueOptimizedNodeName) {
+  GraphDef graph =
+      GDef({NDef("a/b/c/A", "NotImportant", {}),
+            NDef("a/b/c/my_opt/my_stg_A", "NotImportant", {}),
+            NDef("a/b/c/my_opt/my_stg_my_rewrite_A", "NotImportant", {})},
+           /*funcs=*/{});
+
+  NodeMap node_map(&graph);
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ nullptr,
+                            /*graph_properties*/ nullptr,
+                            /*node_map*/ &node_map,
+                            /*feed_nodes*/ nullptr,
+                            /*opt_level*/ RewriterConfig::ON);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  const auto node = ParseNodeScopeAndName("a/b/c/A");
+
+  EXPECT_EQ(stage.UniqueOptimizedNodeName(node),
+            "a/b/c/my_opt/my_stg_A_unique0");
+
+  // With rewrite rule
+  const string rewrite = "my_rewrite";
+  EXPECT_EQ(stage.UniqueOptimizedNodeName(node, rewrite),
+            "a/b/c/my_opt/my_stg_my_rewrite_A_unique1");
+}
+
+TEST_F(GraphOptimizerStageTest, UniqueOptimizedNodeNameWithUsedNodeNames) {
+  GraphDef graph = GDef(
+      {NDef("a/b/c/A", "NotImportant", {}),
+       NDef("a/b/c/my_opt/my_stg_A", "NotImportant", {}),
+       NDef("a/b/c/my_opt/my_stg_A_unique0", "NotImportant", {}),
+       NDef("a/b/c/my_opt/my_stg_my_rewrite_A", "NotImportant", {}),
+       NDef("a/b/c/my_opt/my_stg_my_rewrite_A_unique1", "NotImportant", {})},
+      /*funcs=*/{});
+
+  NodeMap node_map(&graph);
+  GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
+                            /*optimized_graph*/ nullptr,
+                            /*graph_properties*/ nullptr,
+                            /*node_map*/ &node_map,
+                            /*feed_nodes*/ nullptr,
+                            /*opt_level*/ RewriterConfig::ON);
+  FakeOptimizerStage stage("my_opt", "my_stg", ctx);
+
+  const auto node = ParseNodeScopeAndName("a/b/c/A");
+
+  EXPECT_EQ(stage.UniqueOptimizedNodeName(node),
+            "a/b/c/my_opt/my_stg_A_unique1");
 
   // With rewrite rule
   const string rewrite = "my_rewrite";
-  EXPECT_EQ("a/b/c/my_opt/my_stg_my_rewrite_Add",
-            stage.OptimizedNodeName(node, rewrite));
+  EXPECT_EQ(stage.UniqueOptimizedNodeName(node, rewrite),
+            "a/b/c/my_opt/my_stg_my_rewrite_A_unique2");
 }
 
 TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
@@ -97,27 +156,28 @@ TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
+                            /*node_map*/ &node_map,
                             /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   NodeDef* add_node;
   TF_CHECK_OK(stage.GetInputNode("Add", &add_node));
-  EXPECT_EQ("a", add_node->input(0));
-  EXPECT_EQ("b", add_node->input(1));
+  ASSERT_EQ(add_node->input_size(), 2);
+  EXPECT_EQ(add_node->input(0), "a");
+  EXPECT_EQ(add_node->input(1), "b");
 
   OpInfo::TensorProperties add_properties;
   TF_CHECK_OK(stage.GetTensorProperties("Add", &add_properties));
-  EXPECT_EQ(DT_FLOAT, add_properties.dtype());
+  EXPECT_EQ(add_properties.dtype(), DT_FLOAT);
 
   OpInfo::TensorProperties a_properties;
   TF_CHECK_OK(stage.GetTensorProperties("a:0", &a_properties));
-  EXPECT_EQ(DT_FLOAT_REF, a_properties.dtype());
+  EXPECT_EQ(a_properties.dtype(), DT_FLOAT_REF);
 
   OpInfo::TensorProperties b_properties;
   TF_CHECK_OK(stage.GetTensorProperties("b:0", &b_properties));
-  EXPECT_EQ(DT_FLOAT_REF, b_properties.dtype());
+  EXPECT_EQ(b_properties.dtype(), DT_FLOAT_REF);
 }
 
 TEST_F(GraphOptimizerStageTest, AddNodes) {
@@ -138,7 +198,7 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
+                            /*node_map*/ &node_map,
                             /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
@@ -148,10 +208,11 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
 
   // Add a new copy node
   NodeDef* add_node_copy = stage.AddCopyNode("Add_1", add_node);
-  EXPECT_EQ("Add_1", add_node_copy->name());
-  EXPECT_EQ("Add", add_node_copy->op());
-  EXPECT_EQ("a", add_node_copy->input(0));
-  EXPECT_EQ("b", add_node_copy->input(1));
+  EXPECT_EQ(add_node_copy->name(), "Add_1");
+  EXPECT_EQ(add_node_copy->op(), "Add");
+  ASSERT_EQ(add_node->input_size(), 2);
+  EXPECT_EQ(add_node_copy->input(0), "a");
+  EXPECT_EQ(add_node_copy->input(1), "b");
 
   // It must be available for by-name lookup
   NodeDef* add_node_copy_by_name;
@@ -160,7 +221,8 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
 
   // Add new empty node
   NodeDef* empty_node = stage.AddEmptyNode("Add_2");
-  EXPECT_EQ("Add_2", empty_node->name());
+  EXPECT_EQ(empty_node->name(), "Add_2");
+  EXPECT_EQ(empty_node->input_size(), 0);
 
   // It must be available for by-name lookup
   NodeDef* empty_node_by_name;
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc
similarity index 93%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector.cc
index 75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f..a370bf9934e8b6eb057d9ead6558b5ecf57edaef 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <string>
 
@@ -101,14 +101,14 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::LoadFunctions(
+Status ImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
   TF_RETURN_IF_ERROR(lib_info_->Init(graph.library()));
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
+Status ImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
@@ -170,12 +170,16 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::SelectImplementation(
+Status ImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
   if (!graph->has_library()) {
     VLOG(2) << "Skipping graph since it does not have function def";
     return Status::OK();
   }
+  if (lib_info_->empty()) {
+    VLOG(2) << "Skipping optimization since lib_info is empty";
+    return Status::OK();
+  }
 
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
@@ -183,9 +187,9 @@ Status ExperimentalImplementationSelector::SelectImplementation(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::Optimize(Cluster* cluster,
-                                                    const GrapplerItem& item,
-                                                    GraphDef* optimized_graph) {
+Status ImplementationSelector::Optimize(Cluster* cluster,
+                                        const GrapplerItem& item,
+                                        GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
   TF_RETURN_IF_ERROR(LoadFunctions(*optimized_graph));
   return SelectImplementation(optimized_graph);
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h
similarity index 80%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
rename to tensorflow/core/grappler/optimizers/implementation_selector.h
index 82f7473a14ec9b20492ac7acef3b72e919040ece..c206d21640b4816d2af46b0581eb410564aca175 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
 
 #include <string>
 
@@ -33,7 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// -- EXPERIMENTAL --
 // This transformation replaces function calls by the appropriate function
 // definition based on properties of the runtime system. For instance,
 // we may choose one implementation over another if we have a GPU with
@@ -45,12 +44,12 @@ namespace grappler {
 //
 // For instance, the python code might specify:
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one',
-//        experimental_api_preferred_device='GPU')
+//        api_implements='plus_one',
+//        api_preferred_device='GPU')
 // def plus_one_gpu(x): return x + 1.0
 //
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one')
+//        api_implements='plus_one')
 // def plus_one_reference_implementation(x): return x + 1.0
 // input = tf.constant(2.0, dtype=tf.float32)
 //
@@ -62,21 +61,21 @@ namespace grappler {
 // `plus_one_reference_implementation` based on the availability of the GPU.
 //
 // Available annotations:
-//  - experimental_api_implements(string): all functions mapping to the same
+//  - api_implements(string): all functions mapping to the same
 //    string can be interchanged. For now, all functions must have the same
 //    signature and overloads are not allowed. Defuns within defuns are
 //    allowed.
-//  - experimental_api_preferred_device(string): sets which device is preferred.
-class ExperimentalImplementationSelector : public CustomGraphOptimizer {
+//  - api_preferred_device(string): sets which device is preferred.
+class ImplementationSelector : public CustomGraphOptimizer {
  public:
-  ExperimentalImplementationSelector() = default;
-  ~ExperimentalImplementationSelector() override = default;
+  ImplementationSelector() = default;
+  ~ImplementationSelector() override = default;
   Status Init(
       const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
     return Status::OK();
   }
   string name() const override {
-    return "experimental_implementation_selector";
+    return "implementation_selector";
   }
 
   // This call is not thread-safe.
@@ -106,10 +105,10 @@ class ExperimentalImplementationSelector : public CustomGraphOptimizer {
 
   std::unique_ptr<FunctionLibraryApiInfo> lib_info_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExperimentalImplementationSelector);
+  TF_DISALLOW_COPY_AND_ASSIGN(ImplementationSelector);
 };
 
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
similarity index 82%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector_test.cc
index e330835e9bc4fea33928e376a3fd98ebe34a74ee..e2f58964a2a089a0cfda57449f288925ed71d858 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <algorithm>
 #include <memory>
@@ -38,15 +38,14 @@ namespace {
 constexpr char CpuDevice[] = "/device:CPU:0";
 constexpr char GpuDevice[] = "/device:GPU:0";
 
-class ExperimentalImplementationSelectorTest : public GrapplerTest {};
+class ImplementationSelectorTest : public GrapplerTest {};
 
-TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
+TEST_F(ImplementationSelectorTest, NoUpdate) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {CpuDevice});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  std::unique_ptr<CustomGraphOptimizer> optimizer(
-      new ExperimentalImplementationSelector);
+  std::unique_ptr<CustomGraphOptimizer> optimizer(new ImplementationSelector);
   ASSERT_NE(nullptr, optimizer);
   TF_ASSERT_OK(optimizer->Init());
 
@@ -58,19 +57,19 @@ TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
   EXPECT_EQ(item.graph.node_size(), output.node_size());
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
+TEST_F(ImplementationSelectorTest, SwapImplementation) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("times_two");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("times_two");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XAddX();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("times_two");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("times_two");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -96,19 +95,19 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
   }
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
+TEST_F(ImplementationSelectorTest, SwapImplementationEval) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("random_boost");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XTimesFour();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("random_boost");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -133,7 +132,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(2.0f));
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+TEST_F(ImplementationSelectorTest, SwapImplementationWithGradient) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
   // boost_1 returns the doubled input and a const as the internal state, the
@@ -146,8 +145,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s", "one:output:0"}});
   auto* boost_1_attr = boost_1.mutable_attr();
-  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
 
   FunctionDef boost_1_gradient = FDH::Create(
@@ -157,8 +156,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
-  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
 
   // boost_2 return the input * 4, and with two extra internal states.
@@ -171,8 +170,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
   auto* boost_2_attr = boost_2_func.mutable_attr();
-  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
 
   FunctionDef boost_2_gradient = FDH::Create(
@@ -182,8 +181,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
-  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
 
   // Define the forward function with f = boost2 function but with CPU device.
@@ -203,7 +202,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
             {"f", FDH::FunctionRef("Boost2Gradient")}},
            CpuDevice);
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index e9b706a58371cad72ef4b0652bc86364d7c4f5c0..e9d622afbf450fe54be12bfde01ddaae67efc535 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2048,8 +2048,10 @@ class DataLayoutOptimizer : GraphProcessor {
     // only needs to be performed if at least one node in the previous pass is
     // expanded.
     if (graph_->node_size() > node_size_original) {
-      NodeDef* n = AddNodePermNHWCToNCHW();
-      n = AddNodePermNCHWToNHWC();
+      // Create Const nodes holding the permutation used by added Transposes of
+      // nodes not in a frame.
+      AddNodePermNHWCToNCHW();
+      AddNodePermNCHWToNHWC();
       std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
       for (int i = 0; i < graph_->node_size(); i++) {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index cf5e4db29f418ac560c6a4c6381d4a7f3d88088e..c9ca9e211b92636b28d414cb21728c4f9a123a5b 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -581,8 +582,19 @@ Status EvaluateBoolOpForConstantOperands(const NodeDef& op_node,
   return Status::OK();
 }
 
+// TODO(lyandy): Consolidate with ConstantFolding implementation.
+bool IsReallyConstant(const NodeDef& node,
+                      const absl::flat_hash_set<string>& feed_nodes) {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes.find(node.name()) == feed_nodes.end();
+}
+
 Status CheckForDeadFanout(const MutableGraphView& view,
                           const NodeDef& switch_node, const NodeMap& node_map,
+                          const absl::flat_hash_set<string>& feed_nodes,
                           DeviceBase* cpu_device, ResourceMgr* resource_mgr,
                           bool* has_dead_fanout, int* dead_fanout) {
   *has_dead_fanout = false;
@@ -591,7 +603,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
       view.GetRegularFanin(switch_loopcond_port).node;
 
   // CASE 1: Control is a constant.
-  if (IsConstant(*switch_predicate)) {
+  if (IsReallyConstant(*switch_predicate, feed_nodes)) {
     Tensor selector;
     CHECK(selector.FromProto(switch_predicate->attr().at("value").tensor()));
     *has_dead_fanout = true;
@@ -630,7 +642,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsMerge(*node)) {
       merge_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_ctrl_input = node;
       constant_index = i;
     }
@@ -646,7 +658,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsEnter(*node)) {
       enter_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_init_node = node;
     }
   }
@@ -654,7 +666,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (constant_init_node != nullptr) return Status::OK();
     for (const auto& input : enter_node->input()) {
       NodeDef* node = node_map.GetNode(input);
-      if (IsConstant(*node)) {
+      if (IsReallyConstant(*node, feed_nodes)) {
         constant_init_node = node;
       }
     }
@@ -710,8 +722,12 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // TODO(srjoglekar): Figure out if we can optimize NodeMap creations across
     // optimizer passes.
     NodeMap node_map(optimized_graph);
-    TF_RETURN_IF_ERROR(
-        RemoveDeadBranches(item.NodesToPreserve(), node_map, optimized_graph));
+    absl::flat_hash_set<string> feed_nodes;
+    for (const auto& feed : item.feed) {
+      feed_nodes.insert(NodeName(feed.first));
+    }
+    TF_RETURN_IF_ERROR(RemoveDeadBranches(item.NodesToPreserve(), node_map,
+                                          feed_nodes, optimized_graph));
   }
 
   return Status::OK();
@@ -719,7 +735,8 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
 Status LoopOptimizer::RemoveDeadBranches(
     const std::unordered_set<string>& nodes_to_preserve,
-    const NodeMap& node_map, GraphDef* optimized_graph) {
+    const NodeMap& node_map, const absl::flat_hash_set<string>& feed_nodes,
+    GraphDef* optimized_graph) {
   std::unordered_set<const NodeDef*> dead_nodes;
   std::unordered_map<NodeDef*, std::set<int>> dead_merge_inputs;
   // TODO(bsteiner): also rewrite switches as identity. For now we just record
@@ -737,9 +754,9 @@ Status LoopOptimizer::RemoveDeadBranches(
 
     int dead_fanout;
     bool has_dead_fanout;
-    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, cpu_device_,
-                                          resource_mgr_.get(), &has_dead_fanout,
-                                          &dead_fanout));
+    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, feed_nodes,
+                                          cpu_device_, resource_mgr_.get(),
+                                          &has_dead_fanout, &dead_fanout));
     if (!has_dead_fanout) {
       continue;
     }
@@ -774,8 +791,8 @@ Status LoopOptimizer::RemoveDeadBranches(
       }
 
       if (IsMerge(*dead.node)) {
-        const int fanout = dead.node->attr().at("N").i();
-        if (fanout > 2) {
+        const int num_data_inputs = dead.node->attr().at("N").i();
+        if (num_data_inputs > 2) {
           // This never happens in practice, so we'll just skip these to
           // simplify the code for now.
           found_node_to_preserve = true;
@@ -793,18 +810,21 @@ Status LoopOptimizer::RemoveDeadBranches(
         }
 
         bool fully_dead = false;
-        if (dead.port_id < 0) {
-          // If the control dependency never gets triggered the merge will also
-          // never get triggered.
-          fully_dead = true;
-        } else {
+        // Merge node can become real dead only if all data inputs are dead.
+        // Merge always waits for all control edges, but they do not
+        // change the node deadness.
+        if (dead.port_id >= 0) {
           local_dead_merge_inputs[dead.node].insert(dead.port_id);
-          if (local_dead_merge_inputs[dead.node].size() ==
-              dead.node->attr().at("N").i()) {
+          if (local_dead_merge_inputs[dead.node].size() == num_data_inputs) {
             fully_dead = true;
           }
+        } else {
+          // Keep track of all Merge nodes, even if they do not have dead data
+          // inputs. We'll need to cleanup dead control edges for them later.
+          local_dead_merge_inputs.insert({dead.node, {}});
         }
         if (fully_dead) {
+          local_dead_merge_inputs.erase(dead.node);
           local_dead_nodes.insert(dead.node);
           for (const MutableGraphView::InputPort& port :
                view.GetFanouts(*dead.node, true)) {
@@ -836,21 +856,47 @@ Status LoopOptimizer::RemoveDeadBranches(
     if (dead_nodes.count(&optimized_graph->node(i)))
       nodes_idx_to_delete.push_back(i);
   }
-  EraseNodesFromGraph(std::move(nodes_idx_to_delete), optimized_graph);
 
+  // Names of the nodes that were removed from the graph.
+  absl::flat_hash_set<absl::string_view> dead_node_names;
+  dead_node_names.reserve(dead_nodes.size());
+  for (const NodeDef* dead_node : dead_nodes)
+    dead_node_names.insert(dead_node->name());
+
+  // Remove dead inputs from Merge nodes that were not pruned from the graph.
   for (const auto& itr : dead_merge_inputs) {
     NodeDef* dead_node = itr.first;
     if (dead_nodes.find(dead_node) != dead_nodes.end()) {
       // The node has been pruned since all its inputs are dead.
       continue;
     }
+    // Remove dead data input.
     const std::set<int>& dead_inputs = itr.second;
     for (int index : dead_inputs) {
       dead_node->mutable_input()->DeleteSubrange(index, 1);
     }
-    dead_node->set_op("Identity");
-    dead_node->mutable_attr()->erase("N");
+    // Turn Merge into Identity only if we deleted data inputs.
+    if (!dead_inputs.empty()) {
+      dead_node->set_op("Identity");
+      dead_node->mutable_attr()->erase("N");
+    }
+    // Remove control inputs from dead nodes.
+    int pos = 0;
+    while (pos < dead_node->input_size()) {
+      TensorId tensor = ParseTensorName(dead_node->input(pos));
+      if (tensor.index() == Graph::kControlSlot &&
+          dead_node_names.contains(tensor.node())) {
+        auto* inputs = dead_node->mutable_input();
+        inputs->SwapElements(pos, dead_node->input_size() - 1);
+        inputs->RemoveLast();
+      } else {
+        ++pos;
+      }
+    }
   }
+
+  EraseNodesFromGraph(std::move(nodes_idx_to_delete), optimized_graph);
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index d467237a9a704a81a0ecc1da71531868c7f3a49b..7fa1976f348391438d62ce51fb9b8f06f34e15a2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -60,7 +60,9 @@ class LoopOptimizer : public GraphOptimizer {
   };
 
   Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
-                            const NodeMap& node_map, GraphDef* optimized_graph);
+                            const NodeMap& node_map,
+                            const absl::flat_hash_set<string>& feed_nodes,
+                            GraphDef* optimized_graph);
 
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 587767c23c370ca1f747fc5b4e2bfa4cba3ae10d..412073c7c1e42abeba14514a672bc40a4f83c926 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -504,11 +504,11 @@ void VerifyGraphsEqual(const GraphDef& original_graph,
   for (int i = 0; i < original_graph.node_size(); ++i) {
     const NodeDef& original = original_graph.node(i);
     const NodeDef& optimized = optimized_graph.node(i);
-    EXPECT_EQ(original.name(), optimized.name()) << func;
-    EXPECT_EQ(original.op(), optimized.op()) << func;
-    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    EXPECT_EQ(optimized.name(), original.name()) << func;
+    EXPECT_EQ(optimized.op(), original.op()) << func;
+    ASSERT_EQ(optimized.input_size(), original.input_size()) << func;
     for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+      EXPECT_EQ(optimized.input(j), original.input(j)) << func;
     }
   }
 }
@@ -528,7 +528,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
+TEST_F(LoopOptimizerTest, RemovePushNoOp) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -557,7 +557,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoPopButStackLives) {
+TEST_F(LoopOptimizerTest, RemovePushNoPopButStackLives) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -609,34 +609,34 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(13, output.node_size());
+  EXPECT_EQ(output.node_size(), 13);
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     if (node.name() == "push1") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack1", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack1");
     } else if (node.name() == "push2") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("enter_c", node.input(0));
-      EXPECT_EQ("^enter_stack2", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "enter_c");
+      EXPECT_EQ(node.input(1), "^enter_stack2");
     } else if (node.name() == "push3") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack3", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack3");
     } else {
       const NodeDef& orig_node = item.graph.node(i);
-      EXPECT_EQ(orig_node.ShortDebugString(), node.ShortDebugString());
+      EXPECT_EQ(node.ShortDebugString(), orig_node.ShortDebugString());
     }
   }
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantCondition) {
   Scope scope = Scope::NewRootScope();
-  Output v_in = ops::Variable(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+  Output v_in = ops::Const<float>(scope.WithOpName("v_in"), {123.0}, {});
 
   Output ctrl1 = ops::Const(scope.WithOpName("ctrl1"), false, TensorShape({}));
   ops::Switch s1(scope.WithOpName("switch1"), v_in, ctrl1);
@@ -691,57 +691,71 @@ TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
 
   for (const NodeDef& node : output.node()) {
     // These nodes should have been pruned
-    EXPECT_NE("Square1", node.name());
-    EXPECT_NE("Sqrt2", node.name());
-    EXPECT_NE("m5", node.name());
-    EXPECT_NE("m7", node.name());
+    EXPECT_NE(node.name(), "Square1");
+    EXPECT_NE(node.name(), "Sqrt2");
+    EXPECT_NE(node.name(), "m5");
 
     if (node.name() == "m1") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "square1");
     } else if (node.name() == "m2") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
     } else if (node.name() == "m3") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "v_in");
     } else if (node.name() == "m4") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
-      EXPECT_EQ("sqrt2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "square1");
+      EXPECT_EQ(node.input(1), "sqrt2");
     } else if (node.name() == "m6") {
       // both inputs are alive and the control dependency can get triggered
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
-      EXPECT_EQ("^sqrt2", node.input(2));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
+      EXPECT_EQ(node.input(2), "^sqrt2");
+    } else if (node.name() == "m7") {
+      // removed control input from dead sqrt1
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
     } else if (node.name() == "m8") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id1", node.input(0));
-      EXPECT_EQ("id2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "id1");
+      EXPECT_EQ(node.input(1), "id2");
     } else if (node.name() == "m9") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id3", node.input(0));
-      EXPECT_EQ("id4", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ(node.input(0), "id3");
+      EXPECT_EQ(node.input(1), "id4");
     }
   }
+
+  auto tensors_expected = EvaluateNodes(item.graph, {"m7", "m8", "m9"});
+  ASSERT_EQ(tensors_expected.size(), 3);
+
+  auto tensors = EvaluateNodes(output, {"m7", "m8", "m9"});
+  ASSERT_EQ(tensors.size(), 3);
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-6);
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_FullyRemoveDeadBranches) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesFullyRemoveDeadBranches) {
   const string gdef_ascii = R"EOF(
 node {
   name: "episodicreplaybuffer_add_readvariableop_resource"
@@ -1153,7 +1167,7 @@ versions {
       << "Merge node was deleted, but it shouldn't have been.";
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ZeroIterWhile) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesZeroIterWhile) {
   const string gdef_ascii = R"EOF(
 node {
   name: "Const"
@@ -1358,15 +1372,15 @@ versions {
   CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
   item.fetch = {"while/Exit"};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
+  ASSERT_EQ(tensors_expected.size(), 1);
 
   LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_CHECK_OK(status);
   auto tensors_got = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors_got.size());
-  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_got[0]);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<int32>(tensors_got[0], tensors_expected[0]);
 
   int nodes_present = 0;
   for (const NodeDef& node : output.node()) {
@@ -1382,7 +1396,200 @@ versions {
     }
     ++nodes_present;
   }
-  EXPECT_EQ(8, nodes_present);
+  EXPECT_EQ(nodes_present, 8);
+}
+
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantFeed) {
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: "I\'m a value!"
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch_1"
+  op: "Switch"
+  input: "Const"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Const"
+      }
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch"
+  op: "Switch"
+  input: "Const_1"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/switch_t"
+  op: "Identity"
+  input: "cond/Switch:1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/Const"
+  op: "Const"
+  input: "^cond/switch_t"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "cond/Merge"
+  op: "Merge"
+  input: "cond/Switch_1"
+  input: "cond/Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+node {
+  name: "Identity"
+  op: "Identity"
+  input: "cond/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 27
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  item.fetch = {"Identity"};
+  Tensor feed_tensor(DT_BOOL, {});
+  feed_tensor.flat<bool>()(0) = false;
+  item.feed.push_back({"Const_1", feed_tensor});
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_CHECK_OK(status);
+  auto tensors_got = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<string>(tensors_got[0], tensors_expected[0]);
+
+  EXPECT_EQ(output.node_size(), 8);
+
+  // No rewrite because branch has a constant feed node.
+  bool found = false;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "cond/Merge") {
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "cond/Switch_1");
+      EXPECT_EQ(node.input(1), "cond/Const");
+      found = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index b50d50f84245a5910ccf9cde5166465f4d9e9310..ecdb7a647a3eca7bdfa3a8c0ab154e0d29279290 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -1271,7 +1271,8 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
   }
 
   GraphTopologyView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(
+      *optimized_graph, /*ignore_control_edges=*/true));
   std::unordered_set<const NodeDef*> optimized_nodes;
 
   for (int i : assign_nodes) {
@@ -1283,8 +1284,13 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
       assign_nodes_in_fanout.push_back(&assign_node);
 
       std::vector<const NodeDef*> transitive_fanout;
+      // Find the nodes in transitive fanout. If a node is known to never
+      // forward its inputs, we can skip its fanout.
       DfsTraversal(graph_view, {graph_view.GetNode(i)},
                    TraversalDirection::kFollowOutputs,
+                   DfsPredicates::Advance([&](const NodeDef* node) {
+                     return !NeverForwardsInputs(*node);
+                   }),
                    DfsCallbacks::PreOrder([&](const NodeDef* node) {
                      transitive_fanout.push_back(node);
                    }));
@@ -1293,7 +1299,6 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
       // If all nodes in the transitive fanout are on the same device as the
       // assign node, there is no need to allocate the output in pinned memory.
       for (const NodeDef* fanout_node : transitive_fanout) {
-        // const NodeDef& fanout_node = optimized_graph->node(fanout);
         if (relax_constraint &&
             (IsSend(*fanout_node) ||
              CrossesTaskOrCpuGpuBoundary(*fanout_node, assign_node))) {
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 356b23dec0de7d8648fd92b977413720654f2451..e7aea5f5c5edd26c613922620a6354abfc2d011a 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -570,16 +570,36 @@ TEST_F(RelaxAllocatorConstraintsTest, AssignNodeInFanout) {
       s.WithOpName("variable0").WithDevice("/cpu:0"), {128, 128}, DT_FLOAT);
   Output assign0 = ops::Assign(s.WithOpName("assign0").WithDevice("/cpu:0"),
                                variable0, constant0);
+  Output assign2 = ops::Assign(s.WithOpName("assign2").WithDevice("/cpu:0"),
+                               variable0, constant0);
+  Output assign3 = ops::Assign(s.WithOpName("assign3").WithDevice("/cpu:0"),
+                               variable0, constant0);
+  Output assign4 = ops::Assign(s.WithOpName("assign4").WithDevice("/cpu:0"),
+                               variable0, constant0);
+  // Rank does not forward its input buffer, so assign3 can be relaxed.
+  Output rank_cpu =
+      ops::Rank(s.WithOpName("rank_cpu").WithDevice("/cpu:0"), assign3);
+  // Exp could forward its input buffer, so we cannot relax assign4.
+  Output exp_cpu =
+      ops::Exp(s.WithOpName("exp_cpu").WithDevice("/cpu:0"), assign4);
+
   // The rest of the graph is on a second device, so we can relax the
-  // constraint for assign1, but not for assign0.
-  Output exp1 = ops::Exp(s.WithOpName("exp1").WithDevice("/gpu:0"), assign0);
-  Output variable1 = ops::Variable(
-      s.WithOpName("variable1").WithDevice("/gpu:0"), {128, 128}, DT_FLOAT);
-  Output assign1 = ops::Assign(s.WithOpName("assign1").WithDevice("/gpu:0"),
-                               variable1, exp1);
+  // constraint for assign1, but not for assign0. Assign2 only has a
+  // control dependency crossing the device boundary, so it can be relaxed too.
+  Output rank_gpu = ops::Rank(s.WithOpName("rank_gpu")
+                                  .WithDevice("/gpu:0")
+                                  .WithControlDependencies(assign2),
+                              assign0);
+  Output id_gpu = ops::Identity(s.WithOpName("id_gpu"), rank_cpu);
+  Output id_gpu2 = ops::Identity(s.WithOpName("id_gpu2"), exp_cpu);
+  Output variable_gpu = ops::Variable(
+      s.WithOpName("variable_gpu").WithDevice("/gpu:0"), {128, 128}, DT_FLOAT);
+  Output assign_gpu = ops::Assign(
+      s.WithOpName("assign_gpu").WithDevice("/gpu:0"), variable_gpu, exp_cpu);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"assign0", "assign_gpu", "rank_gpu", "id_gpu", "id_gpu2"};
 
   MemoryOptimizer optimizer(RewriterConfig::MANUAL);
   GraphDef output;
@@ -589,19 +609,36 @@ TEST_F(RelaxAllocatorConstraintsTest, AssignNodeInFanout) {
   EXPECT_EQ("assign0", node.name());
   EXPECT_EQ(0, node.attr().count("_grappler_relax_allocator_constraints"));
 
+  node = output.node(4);
+  EXPECT_EQ("assign2", node.name());
+  EXPECT_EQ(1, node.attr().count("_grappler_relax_allocator_constraints"));
+  EXPECT_EQ(true, node.attr().at("_grappler_relax_allocator_constraints").b());
+
   node = output.node(5);
-  EXPECT_EQ("assign1", node.name());
+  EXPECT_EQ("assign3", node.name());
+  EXPECT_EQ(1, node.attr().count("_grappler_relax_allocator_constraints"));
+  EXPECT_EQ(true, node.attr().at("_grappler_relax_allocator_constraints").b());
+
+  node = output.node(6);
+  EXPECT_EQ("assign4", node.name());
+  EXPECT_EQ(0, node.attr().count("_grappler_relax_allocator_constraints"));
+
+  node = output.node(12);
+  EXPECT_EQ("assign_gpu", node.name());
   EXPECT_EQ(1, node.attr().count("_grappler_relax_allocator_constraints"));
   EXPECT_EQ(true, node.attr().at("_grappler_relax_allocator_constraints").b());
 
 #if GOOGLE_CUDA
-  item.fetch = {"assign0", "assign1"};
-  item.init_ops = {"exp1", "variable1"};
+  item.init_ops = {"exp_cpu", "variable_gpu"};
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < tensors_expected.size(); ++i) {
-    test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+    if (i == 2 || i == 3) {
+      test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+    } else {
+      test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+    }
   }
 #endif
 }
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index c200b7db0726da705e282cbe4155fcd90bbe8aa5..04bced53844eb38ae03bbdc60f3456ac03f311fd 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
@@ -26,8 +27,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
@@ -39,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/dump_graph.h"
@@ -101,6 +103,18 @@ uint64 DeadlineMicroSeconds(const RewriterConfig& cfg) {
   }
 }
 
+Status CompressConstants(GraphDef* graph) {
+  for (int i = 0; i < graph->node_size(); ++i) {
+    NodeDef* node = graph->mutable_node(i);
+    if ((IsConstant(*node) || IsHostConstant(*node)) &&
+        HasNodeAttr(*node, "value")) {
+      AttrValue& attr_val = (*node->mutable_attr())["value"];
+      tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor());
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 #define MK_OPT(NAME, VALUE) \
@@ -123,7 +137,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
   MK_OPT("scoped_allocator",
          new ScopedAllocatorOptimizer(cfg_.scoped_allocator_optimization(),
                                       cfg_.scoped_allocator_opts()));
-  MK_OPT("small_op", new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
+  MK_OPT("pin_to_host",
+         new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
 
   return std::unique_ptr<GraphOptimizer>();
 }
@@ -146,6 +161,9 @@ Status MetaOptimizer::InitializeOptimizers(
   if (!cfg_.disable_model_pruning()) {
     optimizers->push_back(MakeUnique<ModelPruner>());
   }
+  if (cfg_.implementation_selector() != RewriterConfig::OFF) {
+    optimizers->push_back(MakeUnique<ImplementationSelector>());
+  }
   if (cfg_.function_optimization() != RewriterConfig::OFF) {
     optimizers->push_back(
         MakeUnique<FunctionOptimizer>(cfg_.function_optimization()));
@@ -239,18 +257,10 @@ Status MetaOptimizer::InitializeCustomGraphOptimizers(
         pre_initialized_optimizers.end()) {
       continue;
     }
-    // Initialize the ExperimentalImplementationSelector here instead of
-    // CustomizeOptimizer registry, due the static link issue in TensorRT for
-    // double registry.
-    // TODO(laigd): Remove this hack and change it back to use the registry once
-    // the duplicate static import issue is fixed.
-    std::unique_ptr<CustomGraphOptimizer> custom_optimizer;
-    if (optimizer_config.name() == "ExperimentalImplementationSelector") {
-      custom_optimizer.reset(new ExperimentalImplementationSelector());
-    } else {
-      custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
-          optimizer_config.name());
-    }
+
+    auto custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
+        optimizer_config.name());
+
     if (custom_optimizer) {
       VLOG(2) << "Registered custom configurable graph optimizer: "
               << optimizer_config.name();
@@ -284,6 +294,20 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
   return nullptr;
 }
 
+void MetaOptimizer::InitializeVerifiers(
+    std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+    std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+    const {
+  if (cfg_.inter_optimizer_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    inter_optimizer_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+  if (cfg_.post_optimization_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    post_optimization_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+}
+
 #define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer)                            \
   {                                                                            \
     const Status status = RunOptimizer(optimizer, cluster, &optimized_item,    \
@@ -314,6 +338,23 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
   }
 
+  // Initialize the configured verifiers.
+  std::vector<std::unique_ptr<GraphVerifier>> inter_optimizer_verifiers;
+  std::vector<std::unique_ptr<GraphVerifier>> post_optimization_verifiers;
+  InitializeVerifiers(&inter_optimizer_verifiers, &post_optimization_verifiers);
+  if (inter_optimizer_verifiers.empty()) {
+    VLOG(2) << "No inter optimizer verifiers have been configured";
+  } else {
+    VLOG(2) << inter_optimizer_verifiers.size()
+            << " inter optimizer verifiers have been configured";
+  }
+  if (post_optimization_verifiers.empty()) {
+    VLOG(2) << "No post optimization verifiers have been configured";
+  } else {
+    VLOG(2) << post_optimization_verifiers.size()
+            << " post optimization verifiers have been configured";
+  }
+
   VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
           << " num_optimizers=" << optimizers.size()
           << ", num nodes = " << item.graph.node_size();
@@ -344,6 +385,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     }
 
     VLOG(4) << "Starting optimization iteration " << iteration;
+    if (VLOG_IS_ON(4)) {
+      DumpGraphDefToFile(
+          strings::StrCat("before_MetaOptimizer_iteration_", iteration, "_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
+    }
     for (const auto& optimizer : optimizers) {
       GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
       // Some optimizers can run only once.
@@ -358,6 +405,28 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
         continue;
       }
       RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get());
+
+      if (VLOG_IS_ON(4)) {
+        DumpGraphDefToFile(
+            strings::StrCat("after_MetaOptimizer_iteration_", iteration, "_",
+                            optimizer->name(), "_",
+                            reinterpret_cast<uintptr_t>(optimized_graph)),
+            *optimized_graph);
+      }
+      for (const auto& verifier : inter_optimizer_verifiers) {
+        // TODO(ashwinm): Need to enforce verification_deadline.
+        TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
+      }
+    }
+    if (VLOG_IS_ON(4)) {
+      DumpGraphDefToFile(
+          strings::StrCat("after_MetaOptimizer_iteration_", iteration, "_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
+    }
+    // TODO(ashwinm): Need to enforce verification_deadline.
+    for (const auto& verifier : post_optimization_verifiers) {
+      TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
     }
   }
 
@@ -376,6 +445,9 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     RUN_OPTIMIZER_OR_RETURN_IF_ERROR(sa_optimizer);
   }
 
+  // Compress the constants in the final graph.
+  TF_RETURN_IF_ERROR(CompressConstants(optimized_graph));
+
   // Record graph optimization result.
   optimization_results_.push_back(optimization_result);
 
@@ -470,7 +542,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (IsTPUGraphDef(*optimized_graph)) {
     VLOG(2) << "Skipping optimizing funcs for TPU graphs";
     if (VLOG_IS_ON(1)) {
-      DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+      DumpGraphDefToFile(
+          strings::StrCat("after_MetaOptimizer_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
     }
     return Status::OK();
   }
@@ -489,7 +564,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   // Optimize each function only once.
   absl::flat_hash_set<string> optimized_funcs;
-  bool optimize_function_library = true;
+  bool optimize_function_library =
+      item.optimization_options().optimize_function_library;
 
   while (optimize_function_library) {
     optimize_function_library = false;
@@ -541,7 +617,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // instantiated by the function definition, because we must guarantee
       // function execution semantics wrt side effects (see
       // function_optimizer.cc).
-      func_item.optimization_options().is_function_instantiation = true;
+      func_item.optimization_options().allow_pruning_stateful_and_dataset_ops =
+          false;
 
       // Optimize function body graph.
       GraphDef optimized_func_graph;
@@ -576,7 +653,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
           << " functions: " << str_util::Join(optimized_funcs, ", ");
 
   if (VLOG_IS_ON(1)) {
-    DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+    DumpGraphDefToFile(
+        strings::StrCat("after_MetaOptimizer_",
+                        reinterpret_cast<uintptr_t>(optimized_graph)),
+        *optimized_graph);
   }
   return Status::OK();
 }
@@ -631,16 +711,20 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
   return status;
 }
 
-Status OptimizeGraph(std::vector<string> ret_node_names,
-                     FunctionLibraryDefinition* flib,
-                     const DeviceSet& device_set, Device* cpu_device,
-                     const ConfigProto& config_proto,
-                     std::unique_ptr<tensorflow::Graph>* g) {
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* flib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g) {
   if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto)) {
     return Status::OK();
   }
 
   tensorflow::grappler::GrapplerItem item;
+  item.id = grappler_item_id;
+  item.optimization_options() = optimization_options;
 
   // Add all available devices so that inlined function can be placed.
   for (const Device* d : device_set.devices()) {
@@ -651,6 +735,9 @@ Status OptimizeGraph(std::vector<string> ret_node_names,
   // Add fetches so that the graph can be pruned.
   item.fetch.swap(ret_node_names);
 
+  // Add noes that can't be removed from the graph.
+  item.keep_ops = std::move(keep_node_names);
+
   (*g)->ToGraphDef(&item.graph);
 
   if (flib) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index c972fe3202bcbb6f0e2b29fd79f10cd894ec73de..b8f0c8e6ff56e2f497144417082d88916b3362ec 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -22,9 +22,11 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/protobuf/verifier_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -63,6 +65,12 @@ class MetaOptimizer : public GraphOptimizer {
   const RewriterConfig::CustomGraphOptimizer* GetCustomGraphOptimizerConfig(
       const string& name) const;
 
+  // Initialiaze active verifiers from the RewriterConfig toggles.
+  void InitializeVerifiers(
+      std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+      std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+      const;
+
   // Run optimization pass over a single GrapplerItem. Meta optimizer might run
   // multiple such passes: 1) for the main graph 2) for the function library
   Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
@@ -112,16 +120,21 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
 // `device_set`: the set of devices that graph can refer to.
 // `cpu_device`: the CPU device.
 // `config_proto`: Grapper configuration.
+// `grappler_item_id': Grappler item id (e.g. optimized function name).
+// `optimization_options`: Grappler optimization constraints that are known only
+//    at runtime.
 //
 // **g is a graph constructed based on the runtime library 'lib'.
 // OptimizeGraph mutates **g extensively and replaces '*g' with a
 // complete copy. Therefore, the caller should not keep any references
 // to nodes *g.
-Status OptimizeGraph(std::vector<string> ret_node_names,
-                     FunctionLibraryDefinition* lib,
-                     const DeviceSet& device_set, Device* cpu_device,
-                     const ConfigProto& config_proto,
-                     std::unique_ptr<tensorflow::Graph>* g);
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* lib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index b1b72075ca4f63169bfd042176ce34408cbcdc9e..0970134ed2b88f2ddd4e25962604aba666733e85 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -15,9 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
+#include "absl/strings/match.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -27,7 +29,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -254,13 +258,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   FunctionDef mul_func = FunctionDefHelper::Create(
       "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "mul:z:0"}});
 
   FunctionDef square_func = FunctionDefHelper::Create(
       "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "my_mul:z:0"}});
   (*square_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -268,7 +272,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
        {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "quadratic:z:0"}});
   (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -290,7 +294,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
        // Forward outputs
        NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func, square_func, quadratic_func});
 
   GraphDef output;
@@ -412,19 +416,20 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneUnusedOutputs) {
       {{{"output0"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
        {{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
        {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z0", "output0:z:0"}, {"z1", "output1:z:0"}, {"z2", "output2:z:0"}});
 
   // Call MyMyl and forward all three outputs.
   FunctionDef my_fwd = FunctionDefHelper::Create(
       "Fwd", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
       {{{"output"}, "MyMul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z0", "output:z0:0"}, {"z1", "output:z1:0"}, {"z2", "output:z2:0"}});
 
   // Mark both functions as `_noinline` to trigger specialization.
   (*my_mul.mutable_attr())["_noinline"].set_b(true);
   (*my_fwd.mutable_attr())["_noinline"].set_b(true);
+  /*funcs=*/
   std::vector<FunctionDef> function_library = {my_mul, my_fwd};
 
   // Tensorflow graph:
@@ -461,14 +466,14 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneUnusedOutputs) {
   FunctionDef expected_my_mul = FunctionDefHelper::Create(
       specialized_my_mul, {"x:float", "y:float"}, {"z2:float"}, {},
       {{{"output2"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z2", "output2:z:0"}});
 
   // Specialized Fwd should also have just one output argument.
   FunctionDef expected_my_fwd = FunctionDefHelper::Create(
       specialized_my_fwd, {"x:float", "y:float"}, {"z2:float"}, {},
       {{{"output"}, specialized_my_mul, {"x", "y"}, {{"T", DT_FLOAT}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z2", "output:z2:0"}});
 
   const FunctionDef* my_mul_spec = optimized_flib.Find(specialized_my_mul);
@@ -512,7 +517,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
       "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T"}, {"T: {float, double}"},
       {{{"mul1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
        {{"mul2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z1", "mul1:z:0"}, {"z2", "mul2:z:0"}});
   (*my_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -536,7 +541,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
        // Read outputs of function call nodes
        NDef("out_fn1", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_fn2", "Identity", {"fn2:1"}, {{"T", DT_FLOAT}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {my_func});
 
   GraphDef output;
@@ -618,17 +623,17 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
-  FunctionDef mul_func_1 = FunctionDefHelper::Create(
-      "MyMul1", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
-
-  FunctionDef mul_func_2 = FunctionDefHelper::Create(
-      "MyMul2", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
 
   // Tensorflow graph:
   //
@@ -654,7 +659,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
              {"Tin", DataTypeSlice{DT_FLOAT}},
              {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
             kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func_1, mul_func_2});
   item.fetch = {"mul_1", "mul_2", "dx"};
 
@@ -747,6 +752,191 @@ TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
   EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
 }
 
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_post_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_post_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(absl::StrContains(
+      status.error_message(),
+      "NodeDef expected inputs 'float' do not match 3 inputs specified"));
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  // Call Optimize with inter optimizer verifiers.
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_inter_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_inter_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(absl::StrContains(
+      status.error_message(),
+      "NodeDef expected inputs 'float' do not match 3 inputs specified"));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 9845fb08d5060445b994b1c998ddd600a842e155..70bf4f28238dc604d25a803827f5896a558a88ce 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -46,8 +46,12 @@ bool IsBlacklisted(const NodeDef& node) {
       IsNoOp(node);
 }
 
-// Check if Tensor is integer and small size.
-bool IsTensorIntegerAndSmall(const OpInfo::TensorProperties& prop) {
+// Check if Tensor is either a string or is integer and small size
+bool IsTensorSmall(const OpInfo::TensorProperties& prop) {
+  if (prop.dtype() == DataType::DT_STRING) {
+    return true;
+  }
+
   // Check type to be int32 or int64.
   if (prop.dtype() != DataType::DT_INT32 &&
       prop.dtype() != DataType::DT_INT64) {
@@ -107,7 +111,7 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
                  << node.DebugString();
     return Status::OK();
   }
-  if (!IsTensorIntegerAndSmall(output_properties[port_id])) {
+  if (!IsTensorSmall(output_properties[port_id])) {
     return Status::OK();
   }
 
@@ -250,7 +254,7 @@ Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
         /*assume_valid_feeds=*/false));
   }
   for (const auto& prop : properties->GetOutputProperties(node.name())) {
-    if (!IsTensorIntegerAndSmall(prop)) {
+    if (!IsTensorSmall(prop)) {
       return Status::OK();
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
index d557a03463f2b9c0355def1da9bde38a1d51f27f..44f26461c0e1445bc198eace681c6c4c8493c38b 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
@@ -38,9 +38,8 @@ string TryFindHostDevice(const gtl::FlatSet<string>& devices,
 // gpu->gpu->gpu may have been better/faster. We should probably fix this.
 class PinToHostOptimizer : public GraphOptimizer {
  public:
-  PinToHostOptimizer() : opt_level_(RewriterConfig::DEFAULT) {}
-  explicit PinToHostOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  PinToHostOptimizer() {}
+  explicit PinToHostOptimizer(RewriterConfig::Toggle opt_level) {}
 
   ~PinToHostOptimizer() override {}
 
@@ -51,9 +50,6 @@ class PinToHostOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override {}
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
index 3f7ff678ed4ff5e41b3b253be6d23d07ad6dedef..7a9110e72abcb88a9e4be26a142e25bd10c0c45a 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
@@ -70,9 +70,11 @@ TEST_F(PinToHostOptimizerTest, OptimizeSmallOpsToHost) {
   Output c = ops::Shape(s.WithOpName("c"), a);
   Output d = ops::Const(s.WithOpName("d"), 0, {1});
   Output e = ops::ReduceProd(s.WithOpName("e"), c, d);
+  int num_int32 = 4;
+  Output f = ops::Const(s.WithOpName("f"), {"test"});
 
   GrapplerItem item;
-  item.fetch = {"a", "c", "d", "e"};
+  item.fetch = {"a", "c", "d", "e", "f"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
@@ -84,19 +86,23 @@ TEST_F(PinToHostOptimizerTest, OptimizeSmallOpsToHost) {
   auto tensors = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(tensors_expected.size(), tensors.size());
   for (int i = 0; i < tensors.size(); ++i) {
-    test::ExpectTensorEqual<int32>(tensors[i], tensors_expected[i]);
+    if (i < num_int32) {
+      test::ExpectTensorEqual<int32>(tensors[i], tensors_expected[i]);
+    } else {
+      test::ExpectTensorEqual<string>(tensors[i], tensors_expected[i]);
+    }
   }
 
   int found = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "a" || node.name() == "c") {
       EXPECT_TRUE(node.device().empty());
-    } else if (node.name() == "d" || node.name() == "e") {
+    } else if (node.name() == "d" || node.name() == "e" || node.name() == "f") {
       EXPECT_EQ(node.device(), "/device:CPU:0");
     }
     ++found;
   }
-  EXPECT_EQ(found, 4);
+  EXPECT_EQ(found, 5);
 }
 
 TEST_F(PinToHostOptimizerTest, TopologicalSort) {
diff --git a/tensorflow/core/grappler/optimizers/remapper.h b/tensorflow/core/grappler/optimizers/remapper.h
index c18413e4e72bb970e1e15bca25fcc6316c5ac327..804338f4d21eeb3d48f64a933386caa114640ea6 100644
--- a/tensorflow/core/grappler/optimizers/remapper.h
+++ b/tensorflow/core/grappler/optimizers/remapper.h
@@ -26,7 +26,7 @@ namespace grappler {
 // nodes to decrease the amount of operations needed to perform a computation.
 class Remapper : public GraphOptimizer {
  public:
-  explicit Remapper(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
+  explicit Remapper(RewriterConfig::Toggle opt_level) {}
 
   ~Remapper() override {}
 
@@ -37,9 +37,6 @@ class Remapper : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
index e537b3df07deea17b1a53d1abf18be7bad3a6d23..9cc99f0d34b1fc1b451fe134bbbb81eae63af2b3 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
@@ -565,9 +565,9 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter {
   //
   // There must be no non-control edges between Nodes in 'ops'.
   // Control edges among these nodes will be dropped.
-  Status Rewrite(ScopedAllocatorOptimizer* sa_opti, GraphDef* graph,
-                 const string& op_name, const std::vector<NodeDef*>& ops,
-                 bool* applied) override {
+  Status Rewrite(ScopedAllocatorOptimizer* sa_opti, int64 invocation_count,
+                 GraphDef* graph, const string& op_name,
+                 const std::vector<NodeDef*>& ops, bool* applied) override {
     if (VLOG_IS_ON(1)) {
       VLOG(1) << "Rewrite";
       string op_names;
@@ -596,7 +596,8 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter {
                                      &inputs, &sa_shape));
 
     int sa_id = sa_opti->NewScopedAllocatorId(input_shapes.size());
-    string sa_name = strings::StrCat("scoped_allocator_", sa_id);
+    string sa_name =
+        strings::StrCat("scoped_allocator_", sa_id, "_", invocation_count);
     TF_RETURN_IF_ERROR(ConstructScopedAllocatorNode(
         sa_opti, graph, node_map, ops, device_name, dtype, sa_id, sa_name,
         input_shapes, inputs, sa_shape));
@@ -622,7 +623,8 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter {
 
     // Build a ScopedAllocatorConcat below all of the input nodes.
     std::vector<NodeDefBuilder::NodeOut> sac_inputs;
-    string sac_name = strings::StrCat("scoped_allocator_concat_", sa_id);
+    string sac_name = strings::StrCat("scoped_allocator_concat_", sa_id, "_",
+                                      invocation_count);
     TF_RETURN_IF_ERROR(BuildSAConcatNode(
         graph, node_map, ops, op_instance_names, device_name, dtype, sa_id,
         sa_name, sac_name, sa_shape, &sac_inputs));
@@ -635,7 +637,8 @@ class UnaryElementwiseRewriter : public ScopedAllocatorOptimizer::Rewriter {
                                           sa_op_name));
 
     // Build a ScopedAllocatorSplit split below the new Op.
-    string sas_name = strings::StrCat("scoped_allocator_split_", sa_id);
+    string sas_name = strings::StrCat("scoped_allocator_split_", sa_id, "_",
+                                      invocation_count);
     TF_RETURN_IF_ERROR(BuildSplitNode(graph, node_map, ops, input_shapes,
                                       sac_inputs, device_name, dtype, op_name,
                                       sa_id, sas_name, sa_name, sa_op_name));
@@ -813,7 +816,14 @@ void PartitionByLoopStructure(const FrameView& frame_view,
 
 Status ScopedAllocatorOptimizer::ProcessGraphDef(
     GraphDef* graph, const GraphProperties& graph_properties) {
-  VLOG(1) << "ProcessGraphDef";
+  // Nodes created by this optimizer have the IsStateful() property
+  // which means their names must be globally unique within a process,
+  // so we include an optimizer invocation count in every generated
+  // name.
+  static std::atomic<int64> invocation_counter(1);
+  const int64 invocation_count =
+      invocation_counter.fetch_add(1, std::memory_order_seq_cst);
+  VLOG(1) << "ProcessGraphDef " << invocation_count;
   Status status;
   GraphOpOccurrences occ;
   FindOpOccurrences(graph, op_name_set_, &occ);
@@ -840,7 +850,7 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
         // in the same Tree struct.  Split those groups into subgroups that
         // share identical loop nesting.
         status = ApplyToAll(root.get(), [this, rewriter, graph, &frame_view,
-                                         &op_name](Tree* t) {
+                                         &op_name, invocation_count](Tree* t) {
           VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
                   << t->depth_ << " of size " << t->nodes_.size();
           if (t->nodes_.size() > 1) {
@@ -852,7 +862,8 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
                 Status s = OrderNodeSet(&lg);
                 TF_RETURN_IF_ERROR(s);
                 VLOG(1) << "Applying Rewriter for " << op_name;
-                s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
+                s = rewriter->Rewrite(this, invocation_count, graph, op_name,
+                                      lg, &applied);
                 LOG_WARNING_AND_RETURN_IF_ERROR(s);
               }
             }
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h
index 13589f536ca720d9bf1d1293e64aadd3b01d65ed..2265fed0e4fc643b982f357cef769f7bf9ee8ccb 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_
 
+#include <atomic>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -75,7 +76,8 @@ class ScopedAllocatorOptimizer : public GraphOptimizer {
    public:
     virtual ~Rewriter() {}
 
-    virtual Status Rewrite(ScopedAllocatorOptimizer* paopti, GraphDef* graph,
+    virtual Status Rewrite(ScopedAllocatorOptimizer* paopti,
+                           int64 invocation_count, GraphDef* graph,
                            const string& op_name,
                            const std::vector<NodeDef*>& nodes,
                            bool* applied) = 0;
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
index b033cff8e632e9148a6e6f5e9f2a45413f6f09b8..90081ec8535fffbe8718c02d582ec6d86d7da896 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
@@ -125,12 +125,12 @@ TEST_F(ScopedAllocatorOptimizerTest, UnaryRewriteOnly) {
 
   // Examine the resulting graph def.
   NodeMap node_map(&optimized_graph);
-  NodeDef* nd = node_map.GetNode("scoped_allocator_1");
+  NodeDef* nd = node_map.GetNode("scoped_allocator_1_1");
   ASSERT_TRUE(nd);
   {
     auto& nd_set = node_map.GetOutputs(nd->name());
     ASSERT_EQ(3, nd_set.size());
-    std::unordered_set<string> expected = {"scoped_allocator_concat_1", "s1",
+    std::unordered_set<string> expected = {"scoped_allocator_concat_1_1", "s1",
                                            "s2"};
     for (auto it : nd_set) {
       ASSERT_NE(expected.find(it->name()), expected.end())
@@ -138,21 +138,21 @@ TEST_F(ScopedAllocatorOptimizerTest, UnaryRewriteOnly) {
     }
   }
   {
-    auto& nd_set = node_map.GetOutputs("scoped_allocator_concat_1");
+    auto& nd_set = node_map.GetOutputs("scoped_allocator_concat_1_1");
     ASSERT_EQ(1, nd_set.size());
     for (auto it : nd_set) {
-      ASSERT_EQ("scoped_allocator_1_Abs", it->name());
+      ASSERT_EQ("scoped_allocator_1_1_Abs", it->name());
     }
   }
   {
-    auto& nd_set = node_map.GetOutputs("scoped_allocator_1_Abs");
+    auto& nd_set = node_map.GetOutputs("scoped_allocator_1_1_Abs");
     ASSERT_EQ(1, nd_set.size());
     for (auto it : nd_set) {
-      ASSERT_EQ("scoped_allocator_split_1", it->name());
+      ASSERT_EQ("scoped_allocator_split_1_1", it->name());
     }
   }
   {
-    auto& nd_set = node_map.GetOutputs("scoped_allocator_split_1");
+    auto& nd_set = node_map.GetOutputs("scoped_allocator_split_1_1");
     ASSERT_EQ(2, nd_set.size());
     std::unordered_set<string> name_set;
     for (auto it : nd_set) {
@@ -188,7 +188,7 @@ TEST_F(ScopedAllocatorOptimizerTest, UnaryExecute) {
 
   // Request two targets: one fetch output and one non-fetched output.
   std::vector<string> output_names = {"r1:0", "r2:0",
-                                      "scoped_allocator_1_Abs:0"};
+                                      "scoped_allocator_1_2_Abs:0"};
   std::vector<string> target_nodes = {};
   std::vector<Tensor> outputs;
   Status s = session->Run(inputs, output_names, target_nodes, &outputs);
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.h b/tensorflow/core/grappler/optimizers/shape_optimizer.h
index b7f84a1e5dbe7dd1e2d21e3752522b3f237e2d7c..d9c1fefb194ce0fe2be921d17c9aaa782aa4ee39 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.h
@@ -30,9 +30,8 @@ namespace grappler {
 // information.
 class ShapeOptimizer : public GraphOptimizer {
  public:
-  ShapeOptimizer() : opt_level_(RewriterConfig::ON) {}
-  explicit ShapeOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  ShapeOptimizer() {}
+  explicit ShapeOptimizer(RewriterConfig::Toggle opt_level) {}
 
   ~ShapeOptimizer() override {}
 
@@ -43,9 +42,6 @@ class ShapeOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 375c3e56c80aa65cd9e5ab0e2248b81d3e3db776..7d4dfb052071ce374f7361eaed19f2e94daf64e9 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -40,7 +40,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 template <typename T>
-bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
+bool SafeSetDoubleScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
   if (value > static_cast<double>(Eigen::NumTraits<RealType>::highest()) ||
       value < static_cast<double>(Eigen::NumTraits<RealType>::lowest())) {
@@ -50,6 +50,17 @@ bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   return true;
 }
 
+template <typename T>
+bool SafeSetIntScalarTensorValue(int value, Tensor* tensor) {
+  using RealType = typename Eigen::NumTraits<T>::Real;
+  if (value > static_cast<int>(Eigen::NumTraits<RealType>::highest()) ||
+      value < static_cast<int>(Eigen::NumTraits<RealType>::lowest())) {
+    return false;
+  }
+  tensor->flat<T>()(0) = static_cast<T>(value);
+  return true;
+}
+
 // Is 'node' an operator that consumes only the shape of its input, not the
 // data itself?
 // TODO(ezhulenev): move to op_types.h. Requires to break circular dependency.
@@ -410,35 +421,50 @@ void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
   EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
 }
 
-#define HANDLE_CASE(DTYPE)                                          \
-  case DTYPE:                                                       \
-    if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
-            static_cast<double>(value), tensor)) {                  \
-      return errors::InvalidArgument("Cannot store value ", value,  \
-                                     " in tensor of type " #DTYPE); \
-    }                                                               \
+#define HANDLE_DOUBLE_CASE(DTYPE)                                     \
+  case DTYPE:                                                         \
+    if (!SafeSetDoubleScalarTensorValue<EnumToDataType<DTYPE>::Type>( \
+            static_cast<double>(value), tensor)) {                    \
+      return errors::InvalidArgument("Cannot store value ", value,    \
+                                     " in tensor of type " #DTYPE);   \
+    }                                                                 \
+    break
+
+#define HANDLE_INT_CASE(DTYPE)                                               \
+  case DTYPE:                                                                \
+    if (!SafeSetIntScalarTensorValue<EnumToDataType<DTYPE>::Type>(value,     \
+                                                                  tensor)) { \
+      return errors::InvalidArgument("Cannot store value ", value,           \
+                                     " in tensor of type " #DTYPE);          \
+    }                                                                        \
     break
 
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
   // TODO(rmlarsen): Support more general shapes.
+  // TODO(lyandy): Change `value` to be int64 once int64 -> qint32 is supported.
   if (tensor->NumElements() != 1) {
     return errors::InvalidArgument(
         "Expected scalar tensor, got num_elements = ", tensor->NumElements());
   }
   switch (dtype) {
-    HANDLE_CASE(DT_HALF);
-    HANDLE_CASE(DT_BFLOAT16);
-    HANDLE_CASE(DT_BOOL);
-    HANDLE_CASE(DT_FLOAT);
-    HANDLE_CASE(DT_DOUBLE);
-    HANDLE_CASE(DT_UINT8);
-    HANDLE_CASE(DT_INT8);
-    HANDLE_CASE(DT_UINT16);
-    HANDLE_CASE(DT_INT16);
-    HANDLE_CASE(DT_INT32);
-    HANDLE_CASE(DT_INT64);
-    HANDLE_CASE(DT_COMPLEX64);
-    HANDLE_CASE(DT_COMPLEX128);
+    HANDLE_DOUBLE_CASE(DT_HALF);
+    HANDLE_DOUBLE_CASE(DT_BFLOAT16);
+    HANDLE_DOUBLE_CASE(DT_BOOL);
+    HANDLE_DOUBLE_CASE(DT_FLOAT);
+    HANDLE_DOUBLE_CASE(DT_DOUBLE);
+    HANDLE_DOUBLE_CASE(DT_UINT8);
+    HANDLE_DOUBLE_CASE(DT_INT8);
+    HANDLE_DOUBLE_CASE(DT_UINT16);
+    HANDLE_DOUBLE_CASE(DT_INT16);
+    HANDLE_DOUBLE_CASE(DT_INT32);
+    HANDLE_DOUBLE_CASE(DT_INT64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX128);
+    HANDLE_INT_CASE(DT_QINT8);
+    HANDLE_INT_CASE(DT_QUINT8);
+    HANDLE_INT_CASE(DT_QINT16);
+    HANDLE_INT_CASE(DT_QUINT16);
+    HANDLE_INT_CASE(DT_QINT32);
     default:
       return errors::InvalidArgument("Unsupported type ",
                                      DataTypeString(dtype));
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 1fd0a02b65e3a212780b6fdabadce98833b3ebda..aec75f6eadae06646161ae38722a7244e4ca153f 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -65,6 +65,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 357a0b3b47a233e33a1d686eab2eed7ca9b6cc28..2ec9794b68aad4b322e280eda033b26d7e592913 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -311,15 +311,14 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     string func_name, string description, AttrSlice func_attr,
     std::vector<InputArgExpansion> input_arg_expansions,
     std::vector<OutputArgExpansion> output_arg_expansions,
-    std::vector<string> keep_nodes, const int graph_def_version,
+    std::vector<ControlOutput> control_outputs, const int graph_def_version,
     const bool is_stateful, GraphDef&& function_body)
     : description_(std::move(description)),
-      func_attr_(std::move(func_attr)),
+      func_attr_(func_attr),
       input_arg_expansions_(std::move(input_arg_expansions)),
       output_arg_expansions_(std::move(output_arg_expansions)),
+      control_outputs_(std::move(control_outputs)),
       is_stateful_(is_stateful) {
-  // Move assign GrapplerItem members.
-  keep_ops = std::move(keep_nodes);
   id = std::move(func_name);
   graph = std::move(function_body);
 
@@ -336,10 +335,14 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_node);
     }
   }
+  // We must keep all control output nodes.
+  for (const ControlOutput& control_output : control_outputs_) {
+    keep_ops.push_back(control_output.node_name);
+  }
 
   // Tensorflow functions execution semantics is different from the main graph,
   // and we need to preserve it when we do graph optimizations.
-  optimization_options().is_function_instantiation = true;
+  optimization_options().allow_pruning_stateful_and_dataset_ops = false;
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -368,6 +371,15 @@ const std::size_t GrapplerFunctionItem::output_size() const {
   return output_arg_expansions_.size();
 }
 
+const std::vector<ControlOutput>& GrapplerFunctionItem::control_outputs()
+    const {
+  return control_outputs_;
+}
+
+const std::size_t GrapplerFunctionItem::control_output_size() const {
+  return control_outputs_.size();
+}
+
 const AttrSlice& GrapplerFunctionItem::func_attr() const { return func_attr_; }
 
 const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
@@ -624,15 +636,20 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     outputs.push_back(std::move(output));
   }
 
-  std::vector<string> keep_ops;
-  bool is_stateful = signature.is_stateful();
+  // Control outputs ensure that all side-effectful nodes in the function body
+  // will execute, even if they are not required to compute regular output args.
+  std::vector<ControlOutput> control_outputs;
+  control_outputs.reserve(func.control_ret_size());
+  for (const auto& control_ret : func.control_ret()) {
+    control_outputs.push_back({control_ret.first, control_ret.second});
+  }
 
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(),
       /*description=*/signature.description(),
       /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs),
-      std::move(outputs), std::move(keep_ops), graph_def_version, is_stateful,
-      std::move(function_body));
+      std::move(outputs), std::move(control_outputs), graph_def_version,
+      signature.is_stateful(), std::move(function_body));
   return Status::OK();
 }
 
@@ -844,6 +861,13 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
         &(*func->mutable_ret())[output_arg.output_name]));
   }
 
+  // Add function control outputs.
+  for (const ControlOutput& control_out : item.control_outputs()) {
+    func->mutable_control_ret()->insert(
+        {control_out.output_name, control_out.node_name});
+    *func->mutable_signature()->add_control_output() = control_out.output_name;
+  }
+
   // Copy function definition specific attributes.
   for (const auto& attr : item.func_attr()) {
     const auto& attr_name = attr.first;
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index d5a41e74739d67fc2cef0c295efe208edbd6255c..d450f6a41fcf926def615c34b4acc725fae5b3d7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -74,6 +74,12 @@ struct OutputArgExpansion {
   absl::InlinedVector<string, 1> output_nodes;
 };
 
+// A mapping from control output name to node name in function body graph.
+struct ControlOutput {
+  string output_name;
+  string node_name;
+};
+
 // FunctionDef uses different connectivity encoding for the function body nodes,
 // then a GraphDef (see function.proto for details). Input name in FunctionDef
 // can potentially represent a sequence of tensors (instead just one tensor in
@@ -161,6 +167,9 @@ class GrapplerFunctionItem : public GrapplerItem {
   const OutputArgExpansion& output(int i) const;
   const std::size_t output_size() const;
 
+  const std::vector<ControlOutput>& control_outputs() const;
+  const std::size_t control_output_size() const;
+
   const AttrSlice& func_attr() const;
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
@@ -183,8 +192,9 @@ class GrapplerFunctionItem : public GrapplerItem {
                        AttrSlice func_attr,
                        std::vector<InputArgExpansion> input_arg_expansions,
                        std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
+                       std::vector<ControlOutput> control_outputs,
+                       int graph_def_version, bool is_stateful,
+                       GraphDef&& function_body);
 
   string description_;
   AttrSlice func_attr_;  // Attributes specific to function definition that
@@ -192,6 +202,7 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
+  std::vector<ControlOutput> control_outputs_;
 
   bool is_stateful_ = false;
 };
@@ -241,7 +252,7 @@ Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
                              GrapplerFunctionItem* item,
                              std::vector<std::pair<int, int>>* output_mapping);
 
-// TODO(ezhulennev, b/120103818): Add RemoveFunctionInputs.
+// TODO(ezhulenev, b/120103818): Add RemoveFunctionInputs.
 
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 772088882835d0223f424f5d73a3587c53440469..813e6a318cf69db536bb6859f1937a3366d03d70 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -641,7 +641,41 @@ TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
   EXPECT_EQ(3, item.function_body().node_size());
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ(0, item.output_size());
-  EXPECT_EQ(true, item.optimization_options().is_function_instantiation);
+
+  const auto &opts = item.optimization_options();
+  EXPECT_FALSE(opts.allow_pruning_stateful_and_dataset_ops);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithControlOutputs) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Create(
+      "WithControlOutputs", /*in_def=*/{"x: Ref(float)"}, /*out_def=*/{}, {},
+      {
+          {{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+          {{"update"}, "AssignAdd", {"x", "one:output:0"}, {{"T", DT_FLOAT}}},
+      },
+      {}, {{"side_effects", "update"}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("WithControlOutputs", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+
+  ASSERT_EQ(1, item.keep_ops.size());
+  EXPECT_EQ("update", item.keep_ops[0]);
+
+  ASSERT_EQ(1, item.control_output_size());
+  const ControlOutput &ctrl = item.control_outputs()[0];
+  EXPECT_EQ("side_effects", ctrl.output_name);
+  EXPECT_EQ("update", ctrl.node_name);
 }
 
 TEST_F(FunctionsTest, MakeFunctionDef) {
@@ -825,17 +859,14 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
 }
 
 TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
-  FunctionDef func = FunctionDefHelper::Define(
-      // Name
-      "DoNothing",
-      // Args
-      {"i: int32"},
-      // Return values
-      {"o: int32"},
-      // Attr def
-      {},
-      // Nodes
-      {{{"o"}, "Identity", {"i"}, {{"T", DT_INT32}}}});
+  FunctionDef func = FunctionDefHelper::Create(
+      "DoNothing", /*in_def=*/{"i: int32"}, /*out_def*/ {"o: int32"},
+      /*attr_def*/ {},
+      {
+          {{"id"}, "Identity", {"i"}, {{"T", DT_INT32}}},
+      },
+      /*ret_def=*/{{"o", "id:output:0"}},
+      /*control_ret_def=*/{{"must_execute", "id"}});
 
   constexpr char description[] = "This is a helpful description.";
   func.mutable_signature()->set_description(description);
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 1b4b9f9a51af17c4472f0fc34331b75192e3d3ae..3a0eec68d1c6adc4236ab2e0e79c8cb66a19b098 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -81,6 +81,7 @@ GrapplerTest::GrapplerTest() {
   cfg->set_debug_stripper(RewriterConfig::OFF);
   cfg->set_dependency_optimization(RewriterConfig::OFF);
   cfg->set_function_optimization(RewriterConfig::OFF);
+  cfg->set_implementation_selector(RewriterConfig::OFF);
   cfg->set_layout_optimizer(RewriterConfig::OFF);
   cfg->set_loop_optimization(RewriterConfig::OFF);
   cfg->set_pin_to_host_optimization(RewriterConfig::OFF);
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index 3868183c62d0dbdb09a65996b9de79b7a6001ca3..11552622d82aa53f68d443c73ea4b1dce750193e 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -16,9 +16,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/benchmark_testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/random/philox_random.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -200,25 +199,7 @@ TEST_F(TopologicalSortTest, ExtraDependencies) {
 static void BM_ComputeTopologicalOrder(int iters, int size) {
   testing::StopTiming();
 
-  random::PhiloxRandom philox(0x12345);
-  random::SimplePhilox rnd(&philox);
-
-  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
-
-  GraphDef graph;
-  for (int i = 0; i < size; ++i) {
-    const string name = absl::StrCat(prefix, i);
-    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
-
-    NodeDef node;
-    node.set_name(name);
-    for (int n = 0; n < num_inputs; ++n) {
-      const uint32 input_node = rnd.Uniform(i);
-      node.add_input(absl::StrCat(prefix, input_node));
-    }
-
-    *graph.add_node() = std::move(node);
-  }
+  GraphDef graph = test::CreateRandomGraph(size);
 
   testing::StartTiming();
   std::vector<const NodeDef*> topo_order;
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index f5ae39867ac758efa52d9109b5f85b020c1e7ae4..e30b1c5b730a2c67101b9b6364b414ea2f7003d8 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <unistd.h>
 #include <limits>
 #include <memory>
+
+#include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -124,56 +126,56 @@ class UtilsTest : public ::testing::Test {
 };
 
 TEST_F(UtilsTest, NodeName) {
-  EXPECT_EQ("abc", NodeName("abc"));
-  EXPECT_EQ("abc", NodeName("^abc"));
-  EXPECT_EQ("abc", NodeName("abc:0"));
-  EXPECT_EQ("abc", NodeName("^abc:0"));
-
-  EXPECT_EQ("abc/def", NodeName("abc/def"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def"));
-  EXPECT_EQ("abc/def", NodeName("abc/def:1"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def:1"));
-
-  EXPECT_EQ("abc/def0", NodeName("abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("abc/def0:0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0:0"));
-
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0:3"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3"));
-
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3214"));
+  EXPECT_EQ(NodeName("abc"), "abc");
+  EXPECT_EQ(NodeName("^abc"), "abc");
+  EXPECT_EQ(NodeName("abc:0"), "abc");
+  EXPECT_EQ(NodeName("^abc:0"), "abc");
+
+  EXPECT_EQ(NodeName("abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("abc/def:1"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def:1"), "abc/def");
+
+  EXPECT_EQ(NodeName("abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("abc/def0:0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0:0"), "abc/def0");
+
+  EXPECT_EQ(NodeName("abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("abc/def_0:3"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0:3"), "abc/def_0");
+
+  EXPECT_EQ(NodeName("^abc/def_0:3214"), "abc/def_0");
 }
 
 TEST_F(UtilsTest, NodePosition) {
-  EXPECT_EQ(2, NodePosition("abc:2"));
-  EXPECT_EQ(123, NodePosition("abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc"));
-  EXPECT_EQ(0, NodePosition(""));
+  EXPECT_EQ(NodePosition("abc:2"), 2);
+  EXPECT_EQ(NodePosition("abc:123"), 123);
+  EXPECT_EQ(NodePosition("^abc:123"), -1);
+  EXPECT_EQ(NodePosition("^abc"), -1);
+  EXPECT_EQ(NodePosition(""), 0);
 }
 
 TEST_F(UtilsTest, NodePositionIfSameNode) {
-  EXPECT_EQ(-2, NodePositionIfSameNode(":123", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode(":", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode("", ""));
-  EXPECT_EQ(123, NodePositionIfSameNode("abc:123", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc:123", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "abc/xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc/xyz", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc:123", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc:123", "xyz"));
+  EXPECT_EQ(NodePositionIfSameNode(":123", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode(":", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "abc"), 123);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "abc/xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc/xyz", "abc"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "xyz"), -2);
 }
 
 TEST_F(UtilsTest, AddNodeNamePrefix) {
-  EXPECT_EQ("OPTIMIZED/abc", AddPrefixToNodeName("abc", "OPTIMIZED"));
-  EXPECT_EQ("^OPTIMIZED/abc", AddPrefixToNodeName("^abc", "OPTIMIZED"));
-  EXPECT_EQ("OPTIMIZED/", AddPrefixToNodeName("", "OPTIMIZED"));
+  EXPECT_EQ(AddPrefixToNodeName("abc", "OPTIMIZED"), "OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("^abc", "OPTIMIZED"), "^OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("", "OPTIMIZED"), "OPTIMIZED/");
 }
 
 TEST_F(UtilsTest, ExecuteWithTimeout) {
@@ -204,17 +206,17 @@ TEST_F(UtilsTest, ExecuteWithTimeout) {
 
 TEST_F(UtilsTest, NumOutputs) {
   GraphDef graph;
-  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode(), &graph));
-  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode(), &graph));
-  EXPECT_EQ(1, NumOutputs(CreateDequeueNode(), &graph));
+  EXPECT_EQ(NumOutputs(CreateConcatOffsetNode(), &graph), 2);
+  EXPECT_EQ(NumOutputs(CreateFusedBatchNormNode(), &graph), 5);
+  EXPECT_EQ(NumOutputs(CreateDequeueNode(), &graph), 1);
 }
 
 TEST_F(UtilsTest, AsControlDependency) {
   NodeDef node;
   node.set_name("foo");
-  EXPECT_EQ("^foo", AsControlDependency(node));
-  EXPECT_EQ("^foo", AsControlDependency(node.name()));
-  EXPECT_EQ("^foo", AsControlDependency("^foo"));
+  EXPECT_EQ(AsControlDependency(node), "^foo");
+  EXPECT_EQ(AsControlDependency(node.name()), "^foo");
+  EXPECT_EQ(AsControlDependency("^foo"), "^foo");
 }
 
 TEST_F(UtilsTest, GetTailOfChain) {
@@ -233,22 +235,23 @@ TEST_F(UtilsTest, GetTailOfChain) {
   GraphDef graph;
   TF_CHECK_OK(s.ToGraphDef(&graph));
 
-  ASSERT_EQ("c0", graph.node(0).name());
-  ASSERT_EQ("c1", graph.node(1).name());
-  ASSERT_EQ("neg0", graph.node(2).name());
-  ASSERT_EQ("neg1", graph.node(3).name());
-  ASSERT_EQ("neg2", graph.node(4).name());
-  ASSERT_EQ("id1", graph.node(5).name());
-  ASSERT_EQ("id2", graph.node(6).name());
-  ASSERT_EQ("noop", graph.node(7).name());
+  ASSERT_EQ(graph.node_size(), 8);
+  ASSERT_EQ(graph.node(0).name(), "c0");
+  ASSERT_EQ(graph.node(1).name(), "c1");
+  ASSERT_EQ(graph.node(2).name(), "neg0");
+  ASSERT_EQ(graph.node(3).name(), "neg1");
+  ASSERT_EQ(graph.node(4).name(), "neg2");
+  ASSERT_EQ(graph.node(5).name(), "id1");
+  ASSERT_EQ(graph.node(6).name(), "id2");
+  ASSERT_EQ(graph.node(7).name(), "noop");
 
   NodeMap node_map(&graph);
   auto is_neg = [&](const NodeDef& node) { return node.op() == "Neg"; };
   // We walk backwards, starting as "id1", so tail should be "neg1".
   NodeDef* tail = GetTailOfChain(graph.node(5), node_map,
                                  /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg1", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg1");
 
   // We stop at branching nodes, so tail should be "neg2".
   auto is_neg_and_non_branching = [&](const NodeDef& node) {
@@ -257,22 +260,22 @@ TEST_F(UtilsTest, GetTailOfChain) {
   tail =
       GetTailOfChain(graph.node(5), node_map,
                      /*follow_control_input=*/false, is_neg_and_non_branching);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg2", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg2");
 
   // We walk backwards, starting from "noop", also following control inputs,
   // so tail should be "neg0".
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/true, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg0", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg0");
 
   // We walk backwards, starting from "noop", not following control inputs,
   // so tail should be "noop" itself.
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("noop", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "noop");
 }
 
 TEST_F(UtilsTest, DedupControlInputs) {
@@ -280,40 +283,40 @@ TEST_F(UtilsTest, DedupControlInputs) {
   foo.set_name("foo");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("bar", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "bar");
 
   foo.set_input(1, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   foo.add_input("^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("gnu");
   foo.add_input("^bar");
   foo.add_input("^gnu");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("gnu", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "gnu");
 }
 
 TEST_F(UtilsTest, NumNonControlOutputs) {
@@ -347,14 +350,14 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
   NodeMap node_map(&graph);
 
   const NodeDef* add_node = node_map.GetNode("add");
-  ASSERT_TRUE(add_node != nullptr);
+  ASSERT_NE(add_node, nullptr);
 
   // [a, b] are only non-control inputs
-  EXPECT_EQ(2, NumNonControlInputs(*add_node));
+  EXPECT_EQ(NumNonControlInputs(*add_node), 2);
   // [sqrt, shape] are non control outputs
-  EXPECT_EQ(2, NumNonControlOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlOutputs(*add_node, node_map), 2);
   // sqrt is the only data output
-  EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlDataOutputs(*add_node, node_map), 1);
 }
 
 TEST(CheckAttrExists, All) {
@@ -465,10 +468,104 @@ TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
 }
 
 TEST_F(UtilsTest, TensorIdToString) {
-  EXPECT_EQ("^foo", TensorIdToString({"foo", -1}));
-  EXPECT_EQ("foo", TensorIdToString({"foo", 0}));
-  EXPECT_EQ("foo:1", TensorIdToString({"foo", 1}));
-  EXPECT_EQ("foo:2", TensorIdToString({"foo", 2}));
+  EXPECT_EQ(TensorIdToString({"foo", -1}), "^foo");
+  EXPECT_EQ(TensorIdToString({"foo", 0}), "foo");
+  EXPECT_EQ(TensorIdToString({"foo", 1}), "foo:1");
+  EXPECT_EQ(TensorIdToString({"foo", 2}), "foo:2");
+}
+
+template <typename T>
+void TestSetTensorValue(DataType type, int val, bool success,
+                        absl::string_view error_msg) {
+  Tensor t(type, TensorShape({}));
+  Status s = SetTensorValue(t.dtype(), val, &t);
+  EXPECT_EQ(s.ok(), success);
+  if (s.ok()) {
+    test::ExpectTensorEqual<T>(Tensor(static_cast<T>(val)), t);
+  } else {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+}
+
+TEST(SetTensorValueTest, Quantized) {
+  auto int_min_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value -2147483648 in tensor of type $0",
+        DataType_Name(type));
+  };
+  auto int_max_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value 2147483647 in tensor of type $0",
+        DataType_Name(type));
+  };
+  const int kMinInt = std::numeric_limits<int>::min();
+  const int kMaxInt = std::numeric_limits<int>::max();
+
+  TestSetTensorValue<qint8>(DT_QINT8, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::min(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::max(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, kMinInt, /*success=*/false,
+                            int_min_error(DT_QINT8));
+  TestSetTensorValue<qint8>(DT_QINT8, kMaxInt, /*success=*/false,
+                            int_max_error(DT_QINT8));
+
+  TestSetTensorValue<quint8>(
+      DT_QUINT8, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT8");
+  TestSetTensorValue<quint8>(DT_QUINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, kMinInt, /*success=*/false,
+                             int_min_error(DT_QUINT8));
+  TestSetTensorValue<quint8>(DT_QUINT8, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QUINT8));
+
+  TestSetTensorValue<qint16>(DT_QINT16, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, kMinInt, /*success=*/false,
+                             int_min_error(DT_QINT16));
+  TestSetTensorValue<qint16>(DT_QINT16, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QINT16));
+
+  TestSetTensorValue<quint16>(
+      DT_QUINT16, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT16");
+  TestSetTensorValue<quint16>(DT_QUINT16, 0, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, 8, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::min(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::max(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, kMinInt, /*success=*/false,
+                              int_min_error(DT_QUINT16));
+  TestSetTensorValue<quint16>(DT_QUINT16, kMaxInt, /*success=*/false,
+                              int_max_error(DT_QUINT16));
+
+  TestSetTensorValue<qint32>(DT_QINT32, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMinInt, /*success=*/true,
+                             /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMaxInt, /*success=*/true,
+                             /*error_msg=*/"");
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/verifiers/BUILD b/tensorflow/core/grappler/verifiers/BUILD
index 00b132517615d5cabe25f9aec936e92975894621..e3e1538b00c5ca446deea5859771286f45736c6d 100644
--- a/tensorflow/core/grappler/verifiers/BUILD
+++ b/tensorflow/core/grappler/verifiers/BUILD
@@ -1,5 +1,7 @@
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
 cc_library(
     name = "graph_verifier",
     hdrs = [
@@ -7,7 +9,42 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "structure_verifier",
+    srcs = ["structure_verifier.cc"],
+    hdrs = [
+        "structure_verifier.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_verifier",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/utils:topological_sort",
+    ],
+)
+
+tf_cc_test(
+    name = "structure_verifier_test",
+    srcs = ["structure_verifier_test.cc"],
+    deps = [
+        ":structure_verifier",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/core/grappler/verifiers/graph_verifier.h b/tensorflow/core/grappler/verifiers/graph_verifier.h
index f5acd59266ff5406abd6ccef0c0879aeb1938d3a..10fd201eadcfd33709c0e7d2540528ad895b3358 100644
--- a/tensorflow/core/grappler/verifiers/graph_verifier.h
+++ b/tensorflow/core/grappler/verifiers/graph_verifier.h
@@ -44,7 +44,9 @@ class GraphVerifier {
   virtual string name() const = 0;
 
   // Implement an algorithm to verify the specified graph.
-  virtual Status Verify(const GraphDef& graph, std::vector<string>* errors) = 0;
+  // The return value is a Status that represents a concatenation of Status of
+  // each verification step.
+  virtual Status Verify(const GraphDef& graph) = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.cc b/tensorflow/core/grappler/verifiers/structure_verifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b438b56c4d2063aca9c4fcaf707c617067b71ed
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.cc
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/validate.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// TODO(ashwinm): Expand this to add more structural checks.
+Status StructureVerifier::Verify(const GraphDef& graph) {
+  StatusGroup status_group;
+
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             graph.library());
+  status_group.Update(tensorflow::graph::ValidateGraphDefAgainstOpRegistry(
+      graph, function_library));
+  status_group.Update(tensorflow::graph::VerifyNoDuplicateNodeNames(graph));
+
+  std::vector<const NodeDef*> topo_order;
+  status_group.Update(ComputeTopologicalOrder(graph, &topo_order));
+  return status_group.as_status();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.h b/tensorflow/core/grappler/verifiers/structure_verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab719f1214eebb624d50a814ce437ffe3957304d
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.h
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Verifies the structure of a graph to ensure it is valid.
+class StructureVerifier : public GraphVerifier {
+ public:
+  StructureVerifier() {}
+  ~StructureVerifier() override {}
+
+  string name() const override { return "structure_verifier"; };
+
+  Status Verify(const GraphDef& graph) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3b1d0646d9b336cd8a70d5b44bf33eed9f8432c
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/strings/match.h"
+#include "tensorflow/cc/ops/parsing_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class StructureVerifierTest : public ::testing::Test {
+ protected:
+  StructureVerifierTest() { verifier_.reset(new StructureVerifier()); }
+  void SetGraph(const string& gdef_ascii) {
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &graph_));
+  }
+  GraphDef graph_;
+  std::unique_ptr<StructureVerifier> verifier_;
+};
+
+Status Scalars(shape_inference::InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("TestParams").Output("o: float").SetShapeFn(Scalars);
+REGISTER_OP("TestInput")
+    .Output("a: float")
+    .Output("b: float")
+    .SetShapeFn(Scalars);
+REGISTER_OP("TestMul")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(Scalars);
+
+TEST_F(StructureVerifierTest, ValidGraphs) {
+  // With scope, ops gets registered automatically.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  ops::ShapeN b(s.WithOpName("b"), {a, a, a});
+
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  TF_EXPECT_OK(verifier_->Verify(graph));
+
+  // With graphdef directly, relies on REGISTER_OP to register ops
+  SetGraph(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }");
+
+  TF_EXPECT_OK(verifier_->Verify(graph_));
+}
+
+TEST_F(StructureVerifierTest, OpNotRegistered) {
+  SetGraph(
+      "node { name: 'input' op: 'OpNotRegistered' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::NOT_FOUND);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Op type not registered"));
+}
+
+TEST_F(StructureVerifierTest, DuplicateNodeNames) {
+  SetGraph(
+      "node { name: 'A' op: 'TestParams' }"
+      "node { name: 'A' op: 'TestInput' }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::ALREADY_EXISTS);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Node already exists:"));
+}
+
+TEST_F(StructureVerifierTest, GraphWithInvalidCycle) {
+  SetGraph(
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(),
+                        "The graph couldn't be sorted in topological order"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 96536e69454911f058ce275fb1a5b4ef06616e8d..ab9d3e11607eb930c45ddeb4ea62a60ca8601415 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -33,6 +33,7 @@ load(
     "if_android",
     "if_not_windows",
     "tf_cc_binary",
+    "tf_cc_shared_object",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
@@ -45,6 +46,7 @@ load(
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_kernel_tests_linkstatic",
@@ -60,6 +62,7 @@ load(
     "mkl_deps",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_nccl")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -137,7 +140,11 @@ tf_kernel_library(
         "slice_op.h",
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
-        "strided_slice_op_gpu.cu.cc",
+        "strided_slice_op_gpu_impl.h",
+        "strided_slice_op_gpu_int.cu.cc",
+        "strided_slice_op_gpu_complex.cu.cc",
+        "strided_slice_op_gpu_bool.cu.cc",
+        "strided_slice_op_gpu_number_types.cu.cc",
     ],
     deps = [
         ":bounds_check",
@@ -155,7 +162,6 @@ tf_kernel_library(
     name = "clustering_ops",
     prefix = "clustering_ops",
     deps = [
-        "//tensorflow/core:clustering_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
@@ -180,12 +186,35 @@ tf_cc_test(
 
 tf_kernel_library(
     name = "collective_ops",
+    srcs = if_nccl([
+        "collective_nccl_reducer.h",
+        "collective_nccl_reducer.cc",
+    ]),
     prefix = "collective_ops",
     deps = [
-        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+    ] + if_nccl([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core/nccl:nccl_lib",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "collective_nccl_reducer_test",
+    size = "small",
+    srcs = ["collective_nccl_reducer_test.cc"],
+    tags = tf_cuda_tests_tags() + ["no_cuda_on_cpu_tap"],
+    deps = [
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -202,8 +231,9 @@ tf_kernel_library(
     gpu_srcs = [
         "concat_lib_gpu_impl.cu.cc",
         "concat_lib.h",
-        "cuda_device_array.h",
-        "cuda_device_array_gpu.h",
+        "concat_lib_gpu.h",
+        "gpu_device_array.h",
+        "gpu_device_array_gpu.h",
     ],
     deps = [
         ":bounds_check",
@@ -240,14 +270,13 @@ tf_kernel_library(
     deps = [
         ":eigen_helpers",
         ":fill_functor",
-        ":gpu_util_hdrs",
+        ":gpu_utils",
         ":image_resizer_state",
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -338,10 +367,62 @@ tf_kernel_library(
         "//tensorflow/core/nccl:nccl_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:nccl_ops_op_lib",
     ]),
 )
 
+cc_library(
+    name = "sparse_utils",
+    srcs = [
+        "sparse_utils.cc",
+    ],
+    hdrs = ["sparse_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_utils_test",
+    srcs = ["sparse_utils_test.cc"],
+    deps = [
+        ":sparse_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "tensor_flag_utils",
+    srcs = [
+        "tensor_flag_utils.cc",
+    ],
+    hdrs = ["tensor_flag_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "tensor_flag_utils_test",
+    srcs = ["tensor_flag_utils_test.cc"],
+    deps = [
+        ":tensor_flag_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
 tf_cuda_library(
     name = "ops_testutil",
     testonly = 1,
@@ -383,11 +464,31 @@ cc_library(
     hdrs = ["conv_ops_gpu.h"],
 )
 
+# We keep this target only because some contrib/ targets depend on it. The
+# reason why the contrib/ targets can't depend on gpu_utils is that, some
+# of the targets are tf_custom_op_library. tf_custom_op_library forbids the
+# dependency to tensorflow/core:lib, which gpu_utils certainly depends on.
 cc_library(
     name = "gpu_util_hdrs",
     hdrs = ["gpu_utils.h"],
 )
 
+tf_cuda_library(
+    name = "gpu_utils",
+    srcs = if_cuda_is_configured(["gpu_utils.cc"]),
+    hdrs = ["gpu_utils.h"],
+    deps = [
+        ":gpu_util_hdrs",
+        "//tensorflow/core:autotuning_proto_cc",
+        "//tensorflow/core:conv_autotuning_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:logger",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core/util/proto:proto_utils",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 tf_cc_test(
     name = "ops_util_test",
     size = "small",
@@ -487,7 +588,6 @@ cc_library(
         ":concat_lib_hdrs",
         ":ops_util_hdrs",
         ":split_lib_hdrs",
-        "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
@@ -529,9 +629,10 @@ tf_kernel_library(
     gpu_srcs = [
         "split_lib_gpu.cu.cc",
         "split_lib.h",
+        "split_lib_gpu.h",
     ],
     deps = [
-        ":cuda_device_array",
+        ":gpu_device_array",
         "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
@@ -540,9 +641,7 @@ tf_kernel_library(
 
 cc_library(
     name = "split_lib_hdrs",
-    hdrs = [
-        "split_lib.h",
-    ],
+    hdrs = ["split_lib.h"],
     deps = [
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
@@ -571,13 +670,10 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "bounds_check",
-    hdrs = ["bounds_check.h"],
+    actual = "//tensorflow/core:framework_bounds_check",
     visibility = [":friends"],
-    deps = [
-        "//tensorflow/core:framework_bounds_check",
-    ],
 )
 
 # Private support libraries ---------------------------------------------------
@@ -588,10 +684,10 @@ cc_header_only_library(
 )
 
 cc_library(
-    name = "cuda_device_array",
+    name = "gpu_device_array",
     hdrs = [
-        "cuda_device_array.h",
-        "cuda_device_array_gpu.h",
+        "gpu_device_array.h",
+        "gpu_device_array_gpu.h",
     ],
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
@@ -645,6 +741,15 @@ cc_library(
     }),
 )
 
+cc_library(
+    name = "redux_functor",
+    hdrs = ["redux_functor.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "eigen_helpers",
     hdrs = [
@@ -660,10 +765,18 @@ cc_library(
     ],
     deps = [
         ":eigen_contraction_kernel",
+        ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
 )
 
+cc_library(
+    name = "eigen_spatial_convolutions-inl",
+    hdrs = [
+        "eigen_spatial_convolutions-inl.h",
+    ],
+)
+
 cc_library(
     name = "image_resizer_state",
     hdrs = ["image_resizer_state.h"],
@@ -711,7 +824,6 @@ ARRAY_DEPS = [
     ":ops_util",
     ":transpose_functor",
     "//tensorflow/core:array_grad",
-    "//tensorflow/core:array_ops_op_lib",
     "//tensorflow/core:core_cpu",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
@@ -740,7 +852,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
-        "//tensorflow/core:set_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -759,7 +870,6 @@ cc_library(
     deps = [
         ":batch_space_ops",
         ":bcast_ops",
-        ":bitcast_op",
         ":broadcast_to_op",
         ":concat_op",
         ":constant_op",
@@ -811,8 +921,8 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "bitcast_op",
-    prefix = "bitcast_op",
-    deps = ARRAY_DEPS,
+    deprecation = "use //third_party/tensorflow/c/kernels:bitcast_op instead",
+    deps = ["//tensorflow/c/kernels:bitcast_op"],
 )
 
 tf_kernel_library(
@@ -1008,14 +1118,14 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "split_op",
-    gpu_srcs = ["cuda_device_array.h"],
+    gpu_srcs = ["gpu_device_array.h"],
     prefix = "split_op",
     deps = ARRAY_DEPS + [":split_lib"],
 )
 
 tf_kernel_library(
     name = "split_v_op",
-    gpu_srcs = ["cuda_device_array.h"],
+    gpu_srcs = ["gpu_device_array.h"],
     prefix = "split_v_op",
     deps = ARRAY_DEPS + [":split_lib"],
 )
@@ -1113,7 +1223,6 @@ tf_kernel_library(
     srcs = ["ragged_gather_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
     ],
 )
 
@@ -1125,7 +1234,6 @@ tf_cc_test(
         ":ops_testutil",
         ":ragged_gather_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1137,7 +1245,6 @@ tf_kernel_library(
     srcs = ["ragged_range_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
     ],
 )
 
@@ -1148,7 +1255,6 @@ tf_cc_test(
         ":ops_testutil",
         ":ragged_range_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1160,7 +1266,6 @@ tf_kernel_library(
     srcs = ["ragged_tensor_to_sparse_kernel.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
     ],
 )
 
@@ -1173,7 +1278,6 @@ tf_cc_test(
         ":ragged_tensor_to_sparse_kernel",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -1186,8 +1290,7 @@ tf_kernel_library(
     visibility = ["//visibility:public"],
     deps = [
         ":bounds_check_lib",
-        ":gpu_util_hdrs",
-        "//tensorflow/core:cudnn_rnn_ops_op_lib",
+        ":gpu_utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -1263,7 +1366,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -1696,7 +1798,7 @@ tf_cc_test(
     size = "small",
     srcs = ["slice_op_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     deps = [
@@ -1788,7 +1890,6 @@ tf_kernel_library(
     prefix = "candidate_sampler_ops",
     deps = [
         ":range_sampler",
-        "//tensorflow/core:candidate_sampling_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1822,7 +1923,6 @@ tf_kernel_library(
     name = "control_flow_ops",
     prefix = "control_flow_ops",
     deps = [
-        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1834,7 +1934,6 @@ tf_kernel_library(
     deps = [
         ":bounds_check",
         ":ops_util",
-        "//tensorflow/core:ctc_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/ctc:ctc_beam_search_lib",
@@ -1915,7 +2014,6 @@ DATA_FLOW_DEPS = [
     ":typed_queue",
     "//third_party/eigen3",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -1980,7 +2078,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:scoped_allocator_ops_op_lib",
     ],
 )
 
@@ -1999,7 +2096,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -2047,7 +2143,6 @@ tf_kernel_library(
 DYNAMIC_DEPS = [
     ":bounds_check",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2065,8 +2160,8 @@ tf_kernel_library(
 tf_kernel_library(
     name = "dynamic_stitch_op",
     gpu_srcs = [
-        "cuda_device_array.h",
-        "cuda_device_array_gpu.h",
+        "gpu_device_array.h",
+        "gpu_device_array_gpu.h",
     ],
     prefix = "dynamic_stitch_op",
     deps = DYNAMIC_DEPS,
@@ -2080,7 +2175,6 @@ LOOKUP_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:lookup_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -2095,6 +2189,16 @@ tf_kernel_library(
     deps = LOOKUP_DEPS,
 )
 
+cc_library(
+    name = "string_view_variant_wrapper",
+    hdrs = ["string_view_variant_wrapper.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "checkpoint_ops",
     deps = [
@@ -2109,7 +2213,6 @@ tf_kernel_library(
     deps = [
         ":lookup_table_init_op",
         ":lookup_table_op",
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -2120,7 +2223,6 @@ tf_kernel_library(
     name = "load_and_remap_matrix_op",
     srcs = ["load_and_remap_matrix_op.cc"],
     deps = [
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2265,7 +2367,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:resource_variable_ops_op_lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2283,7 +2384,6 @@ tf_kernel_library(
         ":fill_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:list_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -2294,7 +2394,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:user_ops_op_lib",
     ],
 )
 
@@ -2317,7 +2416,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
@@ -2330,7 +2428,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -2338,6 +2435,7 @@ tf_kernel_library(
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/stream_executor:stream",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2374,7 +2472,6 @@ IMAGE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:gif_internal",
-    "//tensorflow/core:image_ops_op_lib",
     "//tensorflow/core:jpeg_internal",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2628,7 +2725,7 @@ tf_cc_tests(
         "scale_and_translate_op_test.cc",
     ],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     deps = [
@@ -2715,7 +2812,6 @@ cc_library(
 IO_DEPS = [
     ":ops_util",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2759,7 +2855,6 @@ SAVE_RESTORE_DEPS = [
     ":bounds_check_lib",
     ":save_restore_tensor",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2847,6 +2942,7 @@ cc_library(
         ":self_adjoint_eig_op",
         ":self_adjoint_eig_v2_op",
         ":svd_op",
+        ":tridiagonal_solve_op",
     ],
 )
 
@@ -2859,7 +2955,7 @@ tf_kernel_library(
     # and f2c helper functions in global namespace. Tell the compiler to
     # allow multiple definitions when linking this.
     linkopts = select({
-        "//tensorflow:darwin": [],
+        "//tensorflow:macos": [],
         "//tensorflow:windows": [],
         "//conditions:default": ["-Wl,-z,muldefs"],
     }),
@@ -2878,7 +2974,6 @@ LINALG_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:linalg_ops_op_lib",
 ] + if_cuda([
     ":cuda_solvers",
     ":transpose_functor",
@@ -2965,6 +3060,12 @@ tf_kernel_library(
     ]),
 )
 
+tf_kernel_library(
+    name = "tridiagonal_solve_op",
+    srcs = ["tridiagonal_solve_op.cc"],
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "qr_op",
     prefix = "qr_op",
@@ -3022,7 +3123,6 @@ LOGGING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:logging_ops_op_lib",
     "//tensorflow/core:protos_all_cc",
 ]
 
@@ -3094,7 +3194,6 @@ tf_kernel_library(
         ":bounds_check",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:manip_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -3126,7 +3225,6 @@ MATH_DEPS = [
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:math_grad",
-    "//tensorflow/core:math_ops_op_lib",
     "//third_party/eigen3",
 ]
 
@@ -3211,7 +3309,7 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "bucketize_op",
-    gpu_srcs = ["cuda_device_array.h"],
+    gpu_srcs = ["gpu_device_array.h"],
     prefix = "bucketize_op",
     deps = ARRAY_DEPS,
 )
@@ -3237,7 +3335,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "cwise_op",
     prefix = "cwise_op",
-    deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"],
+    deps = MATH_DEPS,
 )
 
 tf_kernel_library(
@@ -3256,7 +3354,6 @@ tf_kernel_library(
     name = "fft_ops",
     prefix = "fft_ops",
     deps = MATH_DEPS + [
-        "//tensorflow/core:spectral_ops_op_lib",
     ] + if_cuda([
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
     ]),
@@ -3279,7 +3376,7 @@ tf_kernel_library(
     }),
     deps = MATH_DEPS + [
         ":eigen_contraction_kernel",
-        ":gpu_util_hdrs",
+        ":gpu_utils",
     ] + select({
         ":xsmm": ["@libxsmm_archive//:xsmm_avx"],
         "//conditions:default": [],
@@ -3463,10 +3560,7 @@ tf_cuda_cc_test(
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -3497,7 +3591,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["scan_ops_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     deps = [
@@ -3520,7 +3614,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["reduction_ops_test.cc"],
     linkopts = select({
-        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//tensorflow:macos": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
     deps = [
@@ -3579,27 +3673,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "shape_op_test",
-    srcs = ["shape_op_test.cc"],
-    deps = [
-        ":array",
-        ":ops_testutil",
-        ":ops_util",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cuda_cc_test(
     name = "sparse_matmul_op_test",
     size = "small",
@@ -3719,12 +3792,13 @@ tf_kernel_library(
         ":image_resizer_state",
         ":fill_functor",
         ":ops_util",
+        "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core/util/proto:proto_utils",
     ] + select({
         ":xsmm_convolutions": [
             "@libxsmm_archive//:xsmm_avx",
@@ -3754,7 +3828,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@cub_archive//:cub",
         "@local_config_cuda//cuda:cudnn_header",
@@ -3774,7 +3847,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@local_config_cuda//cuda:cudnn_header",
     ]),
@@ -3821,9 +3893,8 @@ NN_DEPS = [
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:nn_grad",
-    "//tensorflow/core:nn_ops_op_lib",
     "//third_party/eigen3",
-] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"])
+]
 
 tf_kernel_library(
     name = "batch_norm_op",
@@ -3840,7 +3911,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "bias_op",
     prefix = "bias_op",
-    deps = NN_DEPS + if_cuda([
+    deps = NN_DEPS + [":redux_functor"] + if_cuda([
         ":reduction_ops",
         "@cub_archive//:cub",
         "//tensorflow/core:stream_executor",
@@ -3959,7 +4030,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_grad",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -4068,7 +4138,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
     ],
@@ -4112,7 +4181,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -4194,7 +4262,6 @@ cc_library(
 PARSING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:parsing_ops_op_lib",
     "//tensorflow/core:proto_text",
     "//tensorflow/core:protos_all_cc",
 ]
@@ -4263,7 +4330,6 @@ RANDOM_OPS_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:random_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4313,7 +4379,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:stateful_random_ops_op_lib",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:variant",
     ],
@@ -4327,7 +4392,6 @@ tf_kernel_library(
         ":random_op",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stateless_random_ops_op_lib",
     ],
 )
 
@@ -4342,8 +4406,6 @@ cc_library(
 REQUIRED_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:no_op_op_lib",
-    "//tensorflow/core:sendrecv_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4404,7 +4466,6 @@ cc_library(
 SPARSE_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:sparse_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4652,7 +4713,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:sdca_ops_op_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
     ],
@@ -4692,7 +4752,6 @@ STATE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:state_ops_op_lib",
 ] + if_sycl(["//tensorflow/core:sycl_runtime"])
 
 tf_kernel_library(
@@ -4830,7 +4889,6 @@ STRING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:string_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4981,7 +5039,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:string_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/icu/data:conversion_data",
         "@icu//:common",
@@ -5003,7 +5060,6 @@ tf_kernel_library(
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:training_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -5065,7 +5121,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -5093,7 +5148,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -5361,7 +5415,6 @@ filegroup(
     srcs = [
         "avgpooling_op.h",
         "batch_util.h",
-        "bounds_check.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
         "cwise_ops_gradients.h",
@@ -5373,6 +5426,7 @@ filegroup(
         "eigen_pooling.h",
         "eigen_softmax.h",
         "eigen_spatial_convolutions.h",
+        "eigen_spatial_convolutions-inl.h",
         "eigen_volume_patch.h",
         "fifo_queue.h",
         "maxpooling_op.h",
@@ -5402,7 +5456,6 @@ filegroup(
         "assign_op.h",
         "bias_op.cc",
         "bias_op.h",
-        "bounds_check.h",
         "cast_op.cc",
         "cast_op.h",
         "cast_op_impl.h",
@@ -5469,6 +5522,7 @@ filegroup(
         "ops_util.h",
         "pack_op.cc",
         "pooling_ops_common.h",
+        "redux_functor.h",
         "reshape_op.cc",
         "reshape_op.h",
         "reverse_sequence_op.cc",
@@ -5548,6 +5602,7 @@ filegroup(
         "gemm_functors.h",
         "image_resizer_state.h",
         "initializable_lookup_table.h",
+        "logging_ops.h",
         "lookup_table_init_op.h",
         "lookup_table_op.h",
         "lookup_util.h",
@@ -5609,7 +5664,10 @@ filegroup(
         "conv_grad_ops.h",
         "conv_ops.cc",
         "conv_ops_3d.cc",
-        "conv_ops_fused.cc",
+        "conv_ops_fused_double.cc",
+        "conv_ops_fused_float.cc",
+        "conv_ops_fused_half.cc",
+        "conv_ops_fused_impl.h",
         "conv_ops_using_gemm.cc",
         "crop_and_resize_op.cc",
         "crop_and_resize_op.h",
@@ -5830,7 +5888,7 @@ ANDROID_TEXTUAL_HDRS = [
 # registration.
 filegroup(
     name = "android_all_ops",
-    srcs = glob(
+    srcs = ["//tensorflow/c/kernels:android_all_ops"] + glob(
         [
             "*.cc",
             "*.h",
@@ -6002,12 +6060,9 @@ tf_kernel_library(
         ":ops_util",
         ":pooling_ops",
         ":quantization_utils",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "@gemmlowp",
     ],
@@ -6312,6 +6367,29 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_conv_ops_test",
+    size = "small",
+    srcs = ["mkl_quantized_conv_ops_test.cc"],
+    tags = ["nomsan"],  # http://b/32242946
+    deps = [
+        ":mkl_conv_op",
+        ":mkl_input_conversion_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantize_op_test",
     size = "small",
@@ -6493,6 +6571,30 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_concat_op_test",
+    size = "small",
+    srcs = ["mkl_quantized_concat_op_test.cc"],
+    deps = [
+        ":mkl_concat_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantized_batch_norm_op_test",
     size = "small",
@@ -6577,7 +6679,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:remote_fused_graph_ops_op_lib",
     ],
 )
 
@@ -6732,15 +6833,28 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
+tf_cc_test(
+    name = "bias_op_test",
+    size = "small",
+    srcs = ["bias_op_test.cc"],
+    deps = [
+        ":bias_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_conv_ops_test",
     size = "small",
     srcs = ["mkl_conv_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
     deps = [
         ":ops_testutil",
         ":ops_util",
@@ -6767,8 +6881,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6783,8 +6895,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6803,8 +6913,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6818,8 +6926,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6834,8 +6940,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6893,6 +6997,65 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_requantize_ops",
+    srcs = [
+        "mkl_requantization_range_per_channel_op.cc",
+        "mkl_requantize_per_channel_op.cc",
+    ],
+    hdrs = [
+        "meta_support.h",
+        "no_op.h",
+        "reference_gemm.h",
+    ],
+    deps = if_mkl(
+        [
+            ":concat_lib_hdrs",
+            ":conv_ops",
+            ":cwise_op",
+            ":eigen_helpers",
+            ":image_resizer_state",
+            ":ops_util",
+            ":pooling_ops",
+            ":quantization_utils",
+            ":transpose_functor",
+            "//third_party/eigen3",
+            "@gemmlowp",
+            "@mkl_dnn",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
+tf_cc_test_mkl(
+    name = "mkl_requantize_ops_test",
+    size = "small",
+    srcs = ["mkl_requantize_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
+    deps = [
+        ":mkl_requantize_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_fused_ops_test",
     size = "small",
@@ -6976,7 +7139,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:summary_ops_op_lib",
         "//tensorflow/core/lib/db:sqlite",
         "//tensorflow/core/summary:schema",
         "//tensorflow/core/summary:summary_db_writer",
@@ -6990,13 +7152,13 @@ tf_kernel_library(
         "decode_proto_op.cc",
     ],
     deps = [
-        "//tensorflow/core:decode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:decode",
         "//tensorflow/core/util/proto:descriptors",
         "//tensorflow/core/util/proto:proto_utils",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -7004,7 +7166,6 @@ tf_kernel_library(
     name = "encode_proto_op",
     srcs = ["encode_proto_op.cc"],
     deps = [
-        "//tensorflow/core:encode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:descriptors",
@@ -7022,7 +7183,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:rpc_ops_op_lib",
         "//tensorflow/core/util/rpc:call_container",
         "//tensorflow/core/util/rpc:rpc_factory",
         "//tensorflow/core/util/rpc:rpc_factory_registry",
@@ -7035,7 +7195,6 @@ tf_kernel_library(
     srcs = ["unicode_script_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:string_ops_op_lib",
         "@icu//:common",
     ],
 )
@@ -7078,3 +7237,31 @@ cc_header_only_library(
         ":cwise_lib",
     ],
 )
+
+# Library to link with when compiling the quantize and dequantize kernels directly,
+# e.g. for selective registration.
+cc_header_only_library(
+    name = "quantize_and_dequantize_op_hdrs",
+    deps = [
+        ":quantize_and_dequantize_op",
+    ],
+)
+
+cc_library(
+    name = "kernel_platform_strings",
+    srcs = ["kernel_platform_strings.h"],
+    deps = [
+        "//tensorflow/core:platform_strings",
+    ],
+    alwayslink = 1,
+)
+
+# Shared object that links all the kernels TF needs.
+tf_cc_shared_object(
+    name = "libtfkernel_all_kernels.so",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":kernel_platform_strings",
+        "//tensorflow/core:all_kernels_impl",
+    ],
+)
diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
index c30085269c07e2bdeae70a8729261596faeb6344..985858ad9aff5759e585cdc271315a66813c45df 100644
--- a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
@@ -35,9 +35,10 @@ void AdjustHueGPU<T>::operator()(GPUDevice* device,
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<true, false, false, T>
-      <<<block_count, threads_per_block, 0, stream>>>(
-          number_of_elements, input, output, delta, nullptr, nullptr);
+  TF_CHECK_OK(CudaLaunchKernel(internal::adjust_hsv_nhwc<true, false, false, T>,
+                               block_count, threads_per_block, 0, stream,
+                               number_of_elements, input, output, delta,
+                               nullptr, nullptr));
 }
 
 template struct AdjustHueGPU<float>;
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 87d34fcfcc31c9f806754e9b1bc36430938d64c3..98264c4a1de75f7308c0b55e3d77a2dff88ebb49 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -193,8 +193,8 @@ class AdjustSaturationOp<CPUDevice, float> : public AdjustSaturationOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, scale_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, scale_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
index 6c70490d469fa8dbdc425f9e57b42acda14f5a58..ea43fc33bba988e3aa651b156b0ebed7252de4e6 100644
--- a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
@@ -36,9 +36,10 @@ void AdjustSaturationGPU<T>::operator()(GPUDevice* device,
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<false, true, false, T>
-      <<<block_count, threads_per_block, 0, stream>>>(
-          number_of_elements, input, output, nullptr, scale, nullptr);
+  TF_CHECK_OK(CudaLaunchKernel(internal::adjust_hsv_nhwc<false, true, false, T>,
+                               block_count, threads_per_block, 0, stream,
+                               number_of_elements, input, output, nullptr,
+                               scale, nullptr));
 }
 
 template struct AdjustSaturationGPU<float>;
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 150e8fe6379fd2a41778e94df793ba45ef0d309e..edf6d3e61e0dc4297ad330fbe43086fce0607088 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -179,20 +179,7 @@ class AddNOp<Device, Variant> : public OpKernel {
               i, " has shape: ", ctx->input(i).shape().DebugString(), "."));
     }
 
-    TensorShape common_shape;
-    OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape));
-    // Step 2: access all variants and ensure shapes match.
-    for (int i = 1; i < num; ++i) {
-      TensorShape check_shape;
-      OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape));
-      OP_REQUIRES(ctx, common_shape == check_shape,
-                  errors::InvalidArgument(
-                      "AddN of Variants of differing shapes; inputs[0] shape: ",
-                      common_shape.DebugString(), ", inputs[", i,
-                      "] shape: ", check_shape.DebugString()));
-    }
-
-    // Step 3: attempt to add using
+    // Step 2: attempt to add using
     //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...)
     //   For the output create a default-constructed variant object.
     // TODO(ebrevdo): Perform summation in a tree-structure.
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index c731b64993b3a6cebfb46eca9221ca28b729e845..778f818a61a54ec1aa78b93a8f5b8e61755a341f 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -25,13 +25,13 @@ limitations under the License.
 
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index ce2fce92e4ee8cbd7bdc578d92103a5bd5da0629..f555c0fd67968cbbe98ae1e27908374d41aab1ab 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -34,7 +34,31 @@ class ExtractGlimpseOp : public OpKernel {
   explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_));
     OP_REQUIRES_OK(context, context->GetAttr("centered", &centered_));
-    OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise_));
+    bool uniform_noise = false;
+    string noise;
+    OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise));
+    OP_REQUIRES_OK(context, context->GetAttr("noise", &noise));
+    OP_REQUIRES(context,
+                !(uniform_noise && (!noise.empty() && noise != "uniform")),
+                errors::InvalidArgument("The uniform_noise and noise could not "
+                                        "be specified at the same time"));
+    if (noise.empty()) {
+      noise_ = uniform_noise ? Eigen::ExtractGlimpsesNoiseMode::UNIFORM
+                             : Eigen::ExtractGlimpsesNoiseMode::GAUSSIAN;
+    } else {
+      OP_REQUIRES(context,
+                  noise == "uniform" || noise == "gaussian" || noise == "zero",
+                  errors::InvalidArgument(
+                      "The noise could only be uniform, gaussian, or zero, got",
+                      noise));
+      if (noise == "uniform") {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::UNIFORM;
+      } else if (noise == "gaussian") {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::GAUSSIAN;
+      } else {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::ZERO;
+      }
+    }
   }
 
   // Expect input tensor of rank 4 with dimensions (batch_size, height, width,
@@ -98,13 +122,13 @@ class ExtractGlimpseOp : public OpKernel {
         context->eigen_cpu_device()) =
         Eigen::ExtractGlimpses(input.tensor<float, 4>().swap_layout(),
                                output_width, output_height, offset_vec,
-                               normalized_, centered_, uniform_noise_);
+                               normalized_, centered_, noise_);
   }
 
  private:
   bool normalized_;
   bool centered_;
-  bool uniform_noise_;
+  Eigen::ExtractGlimpsesNoiseMode noise_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 35511d5c313fb4b3794d00bd685ec4249580daa3..0cf2e4d4cbb8e67835c68146cbf269062a9cb051 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -91,11 +91,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          config.virtual_thread_count, top_diff, num, height, width, channels,
-          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-          pad_t, pad_t, bottom_diff);
+  TF_CHECK_OK(CudaLaunchKernel(
+      AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0,
+      d.stream(), config.virtual_thread_count, top_diff, num, height, width,
+      channels, pooled_height, pooled_width, kernel_h, kernel_w, stride_h,
+      stride_w, pad_t, pad_t, bottom_diff));
 
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d5bd36b4ceaa62f6c2f6928bbea704a0e6d01017..89d742c2dafcfd593f0166816d28ec65cb9ac9f9 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -247,7 +247,6 @@ class Barrier : public ResourceBase {
           keys = t[1];
           values.insert(values.begin(), t.begin() + 2, t.end());
           callback(indices, keys, values);
-          return;
         });
   }
 
@@ -509,7 +508,7 @@ class BarrierOpKernel : public AsyncOpKernel {
     Barrier* barrier = nullptr;
     OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &barrier),
                          callback);
-    ComputeAsync(ctx, barrier, [this, callback, barrier]() {
+    ComputeAsync(ctx, barrier, [callback, barrier]() {
       barrier->Unref();
       callback();
     });
@@ -618,7 +617,6 @@ class TakeManyOp : public BarrierOpKernel {
             values_output.set(i, values[i]);
           }
           callback();
-          return;
         });
   }
 
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 5ba461aa9de2a647962c653fb9ca0f199e9110be..338f61ff6642cbc604bb77dfe1908fe28b9fc142 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -720,8 +720,7 @@ class BatchFunctionKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator = [this,
-                                                         c](BatchResource** r) {
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
       std::unique_ptr<BatchResource> new_resource;
       TF_RETURN_IF_ERROR(
           BatchResource::Create(num_batch_threads_, max_batch_size_,
@@ -801,16 +800,15 @@ class BatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator =
-        [this](BatchResource** r) {
-          std::unique_ptr<BatchResource> new_resource;
-          TF_RETURN_IF_ERROR(BatchResource::Create(
-              num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
-              &new_resource));
-          *r = new_resource.release();
-          return Status::OK();
-        };
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
+      std::unique_ptr<BatchResource> new_resource;
+      TF_RETURN_IF_ERROR(BatchResource::Create(
+          num_batch_threads_, max_batch_size_, batch_timeout_micros_,
+          max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
+          &new_resource));
+      *r = new_resource.release();
+      return Status::OK();
+    };
     OP_REQUIRES_OK_ASYNC(c,
                          c->resource_manager()->LookupOrCreate(
                              container_, shared_name_, &br, creator),
@@ -1066,7 +1064,7 @@ class UnbatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchResource* ubr;
-    std::function<Status(UnbatchResource * *r)> creator =
+    std::function<Status(UnbatchResource**)> creator =
         [this](UnbatchResource** r) {
           *r = new UnbatchResource(timeout_micros_);
           return Status::OK();
@@ -1252,8 +1250,8 @@ class UnbatchGradKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchGradResource* ubr;
-    std::function<Status(UnbatchGradResource * *r)> creator =
-        [this](UnbatchGradResource** r) {
+    std::function<Status(UnbatchGradResource**)> creator =
+        [](UnbatchGradResource** r) {
           *r = new UnbatchGradResource();
           return Status::OK();
         };
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 43539ac908ffdcb49d6f35ad3dc8cdc6ce28bc61..88e6e6239797f861894600b23c39c816aff866d6 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -52,20 +52,15 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 namespace {
 
+// Returns the pair of dimensions along which to perform Tensor contraction to
+// emulate matrix multiplication.
+// For matrix multiplication of 2D Tensors X and Y, X is contracted along
+// second dimension and Y is contracted along the first dimension (if neither X
+// nor Y is adjointed). The dimension to contract along is switched when any
+// operand is adjointed.
+// See http://en.wikipedia.org/wiki/Tensor_contraction
 Eigen::IndexPair<Eigen::DenseIndex> ContractionDims(bool adj_x, bool adj_y) {
-  if (!adj_x) {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
-    }
-  } else {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
-    }
-  }
+  return Eigen::IndexPair<Eigen::DenseIndex>(adj_x ? 0 : 1, adj_y ? 1 : 0);
 }
 
 // Parallel batch matmul kernel based on the multi-threaded tensor contraction
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index c34ea14bf6007f6951733990c0a01999ac838b75..609ddd68caf9484574c4a617d7a289a76ee3c3ca 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -127,8 +127,12 @@ class BatchNormGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                 {2}, 2, var.shape(), &dv));
     Tensor* db = nullptr;
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {3}, 3, mean.shape(), &db));
+    if (scale_after_normalization_) {
+      OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+    } else {
+      OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                  {3}, 3, mean.shape(), &db));
+    }
     Tensor* dg = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
 
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 656b6ced6de00933cfe8db7dadd1a56ade212758..bef73b0574fc684f6970e705a3b95ed54e41a369 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -125,6 +125,10 @@ class AdaptiveSharedBatchScheduler
     int max_batch_size = 1000;
     // Maximum number of enqueued (i.e. non-scheduled) batches.
     int max_enqueued_batches = 10;
+    // Amount of time non-full batches must wait before becoming schedulable.
+    // A non-zero value can improve performance by limiting the scheduling of
+    // nearly empty batches.
+    int64 batch_timeout_micros = 0;
   };
 
   using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
@@ -267,8 +271,11 @@ class ASBSQueue : public BatchScheduler<TaskType> {
 template <typename TaskType>
 class ASBSBatch : public Batch<TaskType> {
  public:
-  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros)
-      : queue_(queue), creation_time_micros_(creation_time_micros) {}
+  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros,
+            int64 batch_timeout_micros)
+      : queue_(queue),
+        creation_time_micros_(creation_time_micros),
+        schedulable_time_micros_(creation_time_micros + batch_timeout_micros) {}
 
   ~ASBSBatch() override {}
 
@@ -276,9 +283,12 @@ class ASBSBatch : public Batch<TaskType> {
 
   int64 creation_time_micros() const { return creation_time_micros_; }
 
+  int64 schedulable_time_micros() const { return schedulable_time_micros_; }
+
  private:
   ASBSQueue<TaskType>* queue_;
   const int64 creation_time_micros_;
+  const int64 schedulable_time_micros_;
   TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
 };
 }  // namespace internal
@@ -377,7 +387,12 @@ void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
     bool also_schedule_closed_batch) {
   mutex_lock l(mu_);
   batches_.push_back(batch);
-  MaybeScheduleNextBatch();
+  // Maybe schedule this batch once it becomes schedulable.
+  GetEnv()->SchedClosureAfter(
+      batch->schedulable_time_micros() - batch->creation_time_micros(), [this] {
+        mutex_lock l(mu_);
+        MaybeScheduleNextBatch();
+      });
   if (also_schedule_closed_batch) {
     MaybeScheduleClosedBatch();
   }
@@ -400,21 +415,22 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
           in_flight_batches_limit_ - in_flight_batches_) {
     return;
   }
-  auto best_it = batches_.begin();
-  double best_score =
-      (*best_it)->creation_time_micros() -
-      options_.full_batch_scheduling_boost_micros * (*best_it)->size() /
-          static_cast<double>((*best_it)->queue()->max_task_size());
-  for (auto it = batches_.begin() + 1; it != batches_.end(); it++) {
+  auto best_it = batches_.end();
+  double best_score;
+  int64 now_micros = GetEnv()->NowMicros();
+  for (auto it = batches_.begin(); it != batches_.end(); it++) {
+    if ((*it)->schedulable_time_micros() > now_micros) continue;
     const double score =
         (*it)->creation_time_micros() -
         options_.full_batch_scheduling_boost_micros * (*it)->size() /
             static_cast<double>((*it)->queue()->max_task_size());
-    if (score < best_score) {
+    if (best_it == batches_.end() || score < best_score) {
       best_score = score;
       best_it = it;
     }
   }
+  // No schedulable batches.
+  if (best_it == batches_.end()) return;
   const internal::ASBSBatch<TaskType>* batch = *best_it;
   batches_.erase(best_it);
   // Queue may destroy itself after ReleaseBatch is called.
@@ -552,7 +568,8 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
     if (!current_batch_) {
       num_enqueued_batches_++;
       current_batch_ = new_batch =
-          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
+          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros(),
+                                  options_.batch_timeout_micros);
     }
     current_batch_->AddTask(std::move(*task));
     num_enqueued_tasks_++;
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 9006fb46fd5bc5494935ce5f32cfb8363a08650c..4b5d04ba3ed675523e867d5f17fc294e826aefe0 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -19,11 +19,12 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/bias_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/redux_functor.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -286,19 +287,15 @@ class BiasGradOp : public OpKernel {
                 .sum(reduction_axes)
                 .template cast<T>();  // End of code by intel_tf.
       } else {
+        using AccumT = typename AccumulatorType<T>::type;
+        const functor::ReduceOuterDimensions<
+            T, AccumT, Eigen::internal::scalar_sum_op<AccumT>>
+            redux;
+
         Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width * depth,
                                                 channel);
-#ifdef EIGEN_HAS_INDEX_LIST
-        Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
-#else
-        Eigen::array<Eigen::Index, 1> reduction_axis = {0};
-#endif
-        output->template flat<T>().device(context->eigen_device<Device>()) =
-            output_backprop.flat<T>()
-                .template cast<typename AccumulatorType<T>::type>()
-                .reshape(two_dims)
-                .sum(reduction_axis)
-                .template cast<T>();
+        redux(context->eigen_device<Device>(), two_dims, output_backprop,
+              output);
       }
     }
   }
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 006fa1dc712f7c06953f70e278fedaa3504bfcce..84c889d3e896c8da5e1e3f964f1cd41ab1db130e 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -85,14 +85,15 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, input, bias, output, bias_size);
+    TF_CHECK_OK(CudaLaunchKernel(BiasNHWCKernel<T>, config.block_count,
+                                 config.thread_per_block, 0, d.stream(),
+                                 config.virtual_thread_count, input, bias,
+                                 output, bias_size));
   } else {
-    BiasNCHWKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, input, bias, output, bias_size,
-            image_size);
+    TF_CHECK_OK(CudaLaunchKernel(BiasNCHWKernel<T>, config.block_count,
+                                 config.thread_per_block, 0, d.stream(),
+                                 config.virtual_thread_count, input, bias,
+                                 output, bias_size, image_size));
   }
 }
 
@@ -225,24 +226,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              output_backprop, bias_backprop, batch, bias_size, image_size,
-              group_size);
+      TF_CHECK_OK(CudaLaunchKernel(
+          BiasGradNCHW_SharedAtomics<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), output_backprop,
+          bias_backprop, batch, bias_size, image_size, group_size));
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              total_count, output_backprop, bias_backprop, bias_size);
+      TF_CHECK_OK(CudaLaunchKernel(
+          BiasGradNHWC_Naive<T>, config.block_count, config.thread_per_block, 0,
+          d.stream(), total_count, output_backprop, bias_backprop, bias_size));
     } else {
-      BiasGradNCHW_Naive<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              total_count, output_backprop, bias_backprop, bias_size,
-              image_size);
+      TF_CHECK_OK(CudaLaunchKernel(BiasGradNCHW_Naive<T>, config.block_count,
+                                   config.thread_per_block, 0, d.stream(),
+                                   total_count, output_backprop, bias_backprop,
+                                   bias_size, image_size));
     }
   }
 }
diff --git a/tensorflow/core/kernels/bias_op_test.cc b/tensorflow/core/kernels/bias_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8118e322fc8afa451ee21840c5737f5cc85fe8f
--- /dev/null
+++ b/tensorflow/core/kernels/bias_op_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <random>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/bias_op.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* BiasAddGrad(int d0, int d1, int d2, int d3) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor out_backprop(DT_FLOAT, TensorShape({d0, d1, d2, d3}));
+  out_backprop.flat<float>().setRandom();
+  test::graph::Unary(g, "BiasAddGrad", test::graph::Constant(g, out_backprop));
+  return g;
+}
+
+#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE)                          \
+  static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE(    \
+      int iters) {                                                      \
+    testing::UseRealTime();                                             \
+    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
+    test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C)).Run(iters);       \
+  }                                                                     \
+  BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
+
+// CPU
+BM_BiasAddGradNHWC(32, 32, 32, 128, cpu);
+BM_BiasAddGradNHWC(32, 32, 32, 256, cpu);
+BM_BiasAddGradNHWC(32, 32, 32, 512, cpu);
+BM_BiasAddGradNHWC(32, 32, 32, 1024, cpu);
+
+BM_BiasAddGradNHWC(32, 64, 64, 128, cpu);
+BM_BiasAddGradNHWC(32, 64, 64, 256, cpu);
+BM_BiasAddGradNHWC(32, 64, 64, 512, cpu);
+BM_BiasAddGradNHWC(32, 64, 64, 1024, cpu);
+
+#ifdef GOOGLE_CUDA
+BM_BiasAddGradNHWC(32, 32, 32, 128, gpu);
+BM_BiasAddGradNHWC(32, 32, 32, 256, gpu);
+BM_BiasAddGradNHWC(32, 32, 32, 512, gpu);
+BM_BiasAddGradNHWC(32, 32, 32, 1024, gpu);
+
+BM_BiasAddGradNHWC(32, 64, 64, 128, gpu);
+BM_BiasAddGradNHWC(32, 64, 64, 256, gpu);
+BM_BiasAddGradNHWC(32, 64, 64, 512, gpu);
+BM_BiasAddGradNHWC(32, 64, 64, 1024, gpu);
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/bitcast_op.cc b/tensorflow/core/kernels/bitcast_op.cc
deleted file mode 100644
index f602cfa428a555970f35b4057c46641a3ba156dd..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/bitcast_op.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/array_ops.cc.
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-
-namespace tensorflow {
-
-class BitcastOp : public OpKernel {
- public:
-  explicit BitcastOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("T", &input_data_type_));
-    OP_REQUIRES_OK(context, context->GetAttr("type", &output_data_type_));
-    in_size_ = DataTypeSize(input_data_type_);
-    out_size_ = DataTypeSize(output_data_type_);
-    int check_size =
-        std::max(in_size_, out_size_) % std::min(in_size_, out_size_);
-    OP_REQUIRES(
-        context, check_size == 0,
-        errors::InvalidArgument("cannot convert between datatype ",
-                                input_data_type_, " and ", output_data_type_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_tensor = context->input(0);
-
-    TensorShape adjusted_shape = input_tensor.shape();
-    OP_REQUIRES(context,
-                in_size_ >= out_size_ ||
-                    (input_tensor.dims() > 0 &&
-                     input_tensor.dim_size(input_tensor.dims() - 1) ==
-                         out_size_ / in_size_) ||
-                    input_tensor.dim_size(input_tensor.dims()) == -1,
-                errors::InvalidArgument(
-                    "Cannot bitcast from ", DataTypeString(input_data_type_),
-                    " to ", DataTypeString(output_data_type_), ": shape ",
-                    input_tensor.shape().DebugString()));
-
-    if (out_size_ < in_size_) {
-      adjusted_shape.AddDim(in_size_ / out_size_);
-    } else if (out_size_ > in_size_) {
-      adjusted_shape.RemoveDim(input_tensor.dims() - 1);
-    }
-    Tensor output_tensor;
-
-    output_tensor.UnsafeCopyFromInternal(input_tensor, output_data_type_,
-                                         adjusted_shape);
-    context->set_output(0, output_tensor);
-  }
-
-  bool IsExpensive() override { return false; }
-
- private:
-  DataType input_data_type_;
-  DataType output_data_type_;
-  int in_size_;
-  int out_size_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("Bitcast").Device(DEVICE_CPU), BitcastOp);
-
-#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("Bitcast").Device(DEVICE_GPU), BitcastOp);
-#endif  // GOOGLE_CUDA
-
-}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 4e9bab3e21f9f240d32e78a1a489033a693caa73..3aa3bb84b9b973878127fd7db1f2d652f591a34d 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -52,6 +52,16 @@ message BucketizedSplit {
   // the rule feature <= threshold.
   int32 feature_id = 1;
   int32 threshold = 2;
+  // If feature column is multivalent, this holds the index of the dimension
+  // for the split. Defaults to 0.
+  int32 dimension_id = 5;
+  enum DefaultDirection {
+    // Left is the default direction.
+    DEFAULT_LEFT = 0;
+    DEFAULT_RIGHT = 1;
+  }
+  // default direction for missing values.
+  DefaultDirection default_direction = 6;
 
   // Node children indexing into a contiguous
   // vector of nodes starting from the root.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index 4ae26fb95b1bb47db6a9462670df08f1bb4e171e..3876bd0c8c8e866502fc61054ab52faefacd0580 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -113,8 +113,7 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
       output_tree_ids.setConstant(latest_tree);
       auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
                       &cached_node_ids, &output_partial_logits,
-                      &output_node_ids, batch_size,
-                      latest_tree](int32 start, int32 end) {
+                      &output_node_ids, latest_tree](int32 start, int32 end) {
         for (int32 i = start; i < end; ++i) {
           int32 tree_id = cached_tree_ids(i);
           int32 node_id = cached_node_ids(i);
@@ -129,7 +128,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
             // Logic in the loop adds the cached node value again if it is a
             // leaf. If it is not a leaf anymore we need to subtract the old
             // node's value. The following logic handles both of these cases.
-            partial_tree_logit -= resource->node_value(tree_id, node_id);
+            const auto& node_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(node_logits.size(), 1);
+            partial_tree_logit -= node_logits[0];
           } else {
             // No cache exists, start from the very first node.
             node_id = 0;
@@ -137,7 +138,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           float partial_all_logit = 0.0;
           while (true) {
             if (resource->is_leaf(tree_id, node_id)) {
-              partial_tree_logit += resource->node_value(tree_id, node_id);
+              const auto& leaf_logits = resource->node_value(tree_id, node_id);
+              DCHECK_EQ(leaf_logits.size(), 1);
+              partial_tree_logit += leaf_logits[0];
 
               // Tree is done
               partial_all_logit +=
@@ -187,9 +190,6 @@ class BoostedTreesPredictOp : public OpKernel {
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("logits_dimension", &logits_dimension_));
-    OP_REQUIRES(context, logits_dimension_ == 1,
-                errors::InvalidArgument(
-                    "Currently only one dimensional outputs are supported."));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -225,18 +225,20 @@ class BoostedTreesPredictOp : public OpKernel {
     }
 
     const int32 last_tree = resource->num_trees() - 1;
-
     auto do_work = [&resource, &batch_bucketized_features, &output_logits,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree, this](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
-        float tree_logit = 0.0;
+        std::vector<float> tree_logits(logits_dimension_, 0.0);
         int32 tree_id = 0;
         int32 node_id = 0;
         while (true) {
           if (resource->is_leaf(tree_id, node_id)) {
-            tree_logit += resource->GetTreeWeight(tree_id) *
-                          resource->node_value(tree_id, node_id);
-
+            const float tree_weight = resource->GetTreeWeight(tree_id);
+            const auto& leaf_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(leaf_logits.size(), logits_dimension_);
+            for (int32 j = 0; j < logits_dimension_; ++j) {
+              tree_logits[j] += tree_weight * leaf_logits[j];
+            }
             // Stop if it was the last tree.
             if (tree_id == last_tree) {
               break;
@@ -249,7 +251,9 @@ class BoostedTreesPredictOp : public OpKernel {
                                           batch_bucketized_features);
           }
         }
-        output_logits(i, 0) = tree_logit;
+        for (int32 j = 0; j < logits_dimension_; ++j) {
+          output_logits(i, j) = tree_logits[j];
+        }
       }
     };
     // 10 is the magic number. The actual number might depend on (the number of
@@ -312,7 +316,7 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     const int batch_size = batch_bucketized_features[0].size();
 
     // We need to get the feature ids used for splitting and the logits after
-    // each split. We will use these to calulate the changes in the prediction
+    // each split. We will use these to calculate the changes in the prediction
     // (contributions) for an arbitrary activation function (done in Python) and
     // attribute them to the associated feature ids. We will store these in
     // a proto below.
@@ -329,13 +333,14 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     // path. Note: feature_ids has one less value than logits_path because the
     // first value of each logit path will be the bias.
     auto do_work = [&resource, &batch_bucketized_features, &output_debug_info,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
         // Proto to store debug outputs, per example.
         boosted_trees::DebugOutput example_debug_info;
         // Initial bias prediction. E.g., prediction based off training mean.
-        float tree_logit =
-            resource->GetTreeWeight(0) * resource->node_value(0, 0);
+        const auto& tree_logits = resource->node_value(0, 0);
+        DCHECK_EQ(tree_logits.size(), 1);
+        float tree_logit = resource->GetTreeWeight(0) * tree_logits[0];
         example_debug_info.add_logits_path(tree_logit);
         int32 node_id = 0;
         int32 tree_id = 0;
@@ -358,8 +363,9 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
             // Get logit after split.
             node_id = resource->next_node(tree_id, node_id, i,
                                           batch_bucketized_features);
-            tree_logit = resource->GetTreeWeight(tree_id) *
-                         resource->node_value(tree_id, node_id);
+            const auto& tree_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(tree_logits.size(), 1);
+            tree_logit = resource->GetTreeWeight(tree_id) * tree_logits[0];
             // Output logit incorporates sum of leaf logits from prior trees.
             example_debug_info.add_logits_path(tree_logit + past_trees_logit);
           }
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 42df4848815db7a097a70b4f1713fd42484be438..2eceab3a59116d74025e6fbf4da07495349132d3 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -21,10 +21,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-constexpr float kLayerByLayerTreeWeight = 1.0;
-}  // namespace
-
 // Constructor.
 BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
     : tree_ensemble_(
@@ -82,15 +78,38 @@ int32 BoostedTreesEnsembleResource::next_node(
   return -1;
 }
 
-float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
-                                               const int32 node_id) const {
+std::vector<float> BoostedTreesEnsembleResource::node_value(
+    const int32 tree_id, const int32 node_id) const {
   DCHECK_LT(tree_id, tree_ensemble_->trees_size());
   DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
   const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
   if (node.node_case() == boosted_trees::Node::kLeaf) {
-    return node.leaf().scalar();
+    // TODO(crawles): only use vector leaf even if # logits=1.
+    if (node.leaf().has_vector()) {
+      std::vector<float> leaf_values;
+      const auto& leaf_value_vector = node.leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      leaf_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        leaf_values.push_back(leaf_value_vector.value(i));
+      }
+      return leaf_values;
+    } else {
+      return {node.leaf().scalar()};
+    }
   } else {
-    return node.metadata().original_leaf().scalar();
+    if (node.metadata().original_leaf().has_vector()) {
+      std::vector<float> node_values;
+      const auto& leaf_value_vector = node.metadata().original_leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      node_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        node_values.push_back(leaf_value_vector.value(i));
+      }
+      return node_values;
+    } else {
+      return {node.metadata().original_leaf().scalar()};
+    }
   }
 }
 
@@ -452,15 +471,18 @@ void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
 
     // Change node back into leaf.
     *node->mutable_leaf() = node_metadata.original_leaf();
-    const float parent_value = node_value(tree_id, node_id);
+    const auto& parent_values = node_value(tree_id, node_id);
+    DCHECK_EQ(parent_values.size(), 1);
+    const float parent_value = parent_values[0];
 
     // Save the old values of weights of children.
     (*nodes_meta)[left_id].first = node_id;
-    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+    (*nodes_meta)[left_id].second =
+        parent_value - node_value(tree_id, left_id)[0];
 
     (*nodes_meta)[right_id].first = node_id;
     (*nodes_meta)[right_id].second =
-        parent_value - node_value(tree_id, right_id);
+        parent_value - node_value(tree_id, right_id)[0];
 
     // Clear gain for leaf node.
     node->clear_metadata();
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index 3c7b2df9b08a2b8912c43b2439e28f34a64b38ef..34a35f173c338964632b62536f21175137e9b371 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -68,7 +68,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
       const int32 tree_id, const int32 node_id, const int32 index_in_batch,
       const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
 
-  float node_value(const int32 tree_id, const int32 node_id) const;
+  std::vector<float> node_value(const int32 tree_id, const int32 node_id) const;
 
   void set_node_value(const int32 tree_id, const int32 node_id,
                       const float logits);
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 973cdec13a368ff95ae3185695507c62c173675c..7c025b34b982f410ac3585855a6e14f3b99f5e2f 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -288,7 +288,9 @@ class BoostedTreesCenterBiasOp : public OpKernel {
       ensemble_resource->AddNewTreeWithLogits(kLayerByLayerTreeWeight, logits);
       current_bias = logits;
     } else {
-      current_bias = ensemble_resource->node_value(0, 0);
+      const auto& current_biases = ensemble_resource->node_value(0, 0);
+      DCHECK_EQ(current_biases.size(), 1);
+      current_bias = current_biases[0];
       continue_centering =
           std::abs(logits / current_bias) > kMinDeltaForCenterBias;
       current_bias += logits;
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index 2810925bbcd645f60af0e6025a74043cd45f21e7..8c4341335fbae66249259b69a9693b8bcf6073f0 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -43,12 +47,42 @@ class BroadcastToOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
 
+    // Handle copy.
+    if (output_shape == input_shape) {
+      ctx->set_output(0, input_tensor);
+      return;
+    }
+
+    OP_REQUIRES(ctx, input_shape.dims() <= output_shape.dims(),
+                errors::InvalidArgument(
+                    "Rank of input (", input_shape.dims(),
+                    ") must be no greater than rank of output shape (",
+                    output_shape.dims(), ")."));
+
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
-
-    const Device& d = ctx->eigen_device<Device>();
-    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
-                                      input_tensor, input_shape);
+    // Handle empty case.
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
+    // Handle broadcast from Scalar.
+    const Device& device = ctx->eigen_device<Device>();
+    if (input_shape.dims() == 0) {
+      functor::FillFunctor<Device, T>()(device, output_tensor->flat<T>(),
+                                        input_tensor.scalar<T>());
+      return;
+    }
+
+    BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
+                /*fewer_dims_optimization=*/true);
+    OP_REQUIRES(ctx, bcast.IsValid(),
+                errors::InvalidArgument(
+                    "Incompatible shapes: ", input_shape.DebugString(), " vs. ",
+                    output_shape.DebugString()));
+
+    functor::BroadcastTo<Device, T>()(device, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape, bcast);
   }
 };
 
@@ -65,12 +99,12 @@ TF_CALL_ALL_TYPES(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 
 namespace functor {
-#define DECLARE_GPU_TEMPLATE(Type)                              \
-  template <>                                                   \
-  void BroadcastTo<GPUDevice, Type>::operator()(                \
-      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
-      const TensorShape& output_shape, const Tensor& input,     \
-      const TensorShape& input_shape);                          \
+#define DECLARE_GPU_TEMPLATE(Type)                               \
+  template <>                                                    \
+  void BroadcastTo<GPUDevice, Type>::operator()(                 \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output,  \
+      const TensorShape& output_shape, const Tensor& input,      \
+      const TensorShape& input_shape, const BCast& bcast) const; \
   extern template struct BroadcastTo<GPUDevice, Type>;
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
index bc11c5f914bfcbcbbc4445cace7126717f3d8d2d..6ae860c2b2995f1a9bb5f47ad40b4546923801a6 100644
--- a/tensorflow/core/kernels/broadcast_to_op.h
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -23,196 +23,81 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
 namespace functor {
 
-#define BROADCAST_SHAPE(NDIMS, input_shape, output_shape)                 \
-  auto reshape = AsEigenDSizesWithPrefix<NDIMS>(input_shape);             \
-  auto broadcast = output_shape.AsEigenDSizes<NDIMS>();                   \
-  auto reshape_32bit = AsEigenDSizesWithPrefix<NDIMS, int>(input_shape);  \
-  auto broadcast_32bit = output_shape.AsEigenDSizes<NDIMS, int>();        \
-  if (input_shape.dims() > 0) {                                           \
-    for (int i = 0; i < NDIMS; i++) {                                     \
-      if (reshape[i] != broadcast[i]) {                                   \
-        OP_REQUIRES(                                                      \
-            ctx, ((reshape[i] != 0) && (broadcast[i] % reshape[i] == 0)), \
-            errors::InvalidArgument("invalid shape to broadcast from ",   \
-                                    input_shape.DebugString(), " to ",    \
-                                    output_shape.DebugString()));         \
-        broadcast[i] = broadcast[i] / reshape[i];                         \
-      } else {                                                            \
-        broadcast[i] = 1;                                                 \
-      }                                                                   \
-      if (can_use_32bit) {                                                \
-        broadcast_32bit[i] = static_cast<int>(broadcast[i]);              \
-      }                                                                   \
-    }                                                                     \
+template <typename Device, typename T>
+struct BroadcastTo {
+  template <int NDIMS>
+  void DoBCast32Bit(const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+                    typename TTypes<T, NDIMS>::ConstTensor in,
+                    const typename Eigen::array<int, NDIMS> &bcast) const {
+    To32Bit(out).device(device) = To32Bit(in).broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_FROM_SCALAR()                              \
-  if (std::is_same<Eigen::GpuDevice, Device>::value) {              \
-    FillFunctor<Device, T>()(d, output_tensor.flat<T>(),            \
-                             input_tensor.scalar<T>());             \
-  } else {                                                          \
-    output.device(d) = output.constant(input_tensor.scalar<T>()()); \
+  template <int NDIMS>
+  void DoBCast(
+      const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+      typename TTypes<T, NDIMS>::ConstTensor in,
+      const typename Eigen::array<Eigen::DenseIndex, NDIMS> &bcast) const {
+    out.device(device) = in.broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_CASE(dim_i)                                        \
-  case dim_i: {                                                             \
-    if (can_use_32bit) {                                                    \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      To32Bit(output).device(d) =                                           \
-          To32Bit(input).reshape(reshape_32bit).broadcast(broadcast_32bit); \
-    } else {                                                                \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      output.device(d) = input.reshape(reshape).broadcast(broadcast);       \
-    }                                                                       \
-  } break
-
-template <typename Device, typename T>
-struct BroadcastTo {
-  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
-                  const TensorShape &output_shape, const Tensor &input_tensor,
-                  const TensorShape &input_shape) {
-    if (output_shape.num_elements() == 0) {
-      return;
-    }
-    if (output_shape == input_shape) {
-      output_tensor.flat<T>().device(d) = input_tensor.flat<T>();
-      return;
-    }
-
+  template <int NDIMS>
+  void ReshapeAndBCast(const Device &device, Tensor &output_tensor,
+                       const Tensor &input_tensor, const BCast &bcast) const {
     const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value &&
                                output_tensor.NumElements() < kint32max &&
                                input_tensor.NumElements() < kint32max;
+    if (can_use_32bit) {
+      DoBCast32Bit<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<int, NDIMS>(bcast.x_bcast()));
+    } else {
+      DoBCast<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<Eigen::DenseIndex, NDIMS>(bcast.x_bcast()));
+    }
+  }
 
-    switch (output_shape.dims()) {
-      case 0: {
-        if (input_shape.dims() > 0) {
-          ctx->CtxFailure(errors::InvalidArgument(
-              "invalid shape to broadcast from ", input_shape.DebugString(),
-              " to ", output_shape.DebugString()));
-          break;
-        }
-        output_tensor.scalar<T>().device(d) = input_tensor.scalar<T>();
+  // PRECONDITION: rank(input_shape) > 0 &&
+  //               rank(input_shape) <= rank(output_shape)  &&
+  //               output_shape.num_elements() > 0.
+  void operator()(const Device &device, OpKernelContext *ctx,
+                  Tensor &output_tensor, const TensorShape &output_shape,
+                  const Tensor &input_tensor, const TensorShape &input_shape,
+                  const BCast &bcast) const {
+    const int ndims = bcast.y_reshape().size();
+    switch (ndims) {
+      case 1:
+        ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 2:
+        ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 3:
+        ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 4:
+        ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 5:
+        ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast);
         break;
-      }
-      case 1: {
-        BROADCAST_SHAPE(1, input_shape, output_shape);
-
-        auto output = output_tensor.tensor<T, 1>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 2: {
-        BROADCAST_SHAPE(2, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 2>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 3: {
-        BROADCAST_SHAPE(3, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 3>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 4: {
-        BROADCAST_SHAPE(4, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 4>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 5: {
-        BROADCAST_SHAPE(5, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 5>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-            HANDLE_BROADCAST_CASE(5);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
       default:
-        ctx->CtxFailure(errors::InvalidArgument(
-            "invalid shape to broadcast from ", input_shape.DebugString(),
-            " to ", output_shape.DebugString()));
+        ctx->SetStatus(errors::Unimplemented(
+            "Broadcast between ", input_shape.DebugString(), " and ",
+            output_shape.DebugString(), " is not supported yet."));
         break;
     }
   }
-
- private:
-  template <int NDIMS, typename DimType = Eigen::DenseIndex>
-  Eigen::DSizes<DimType, NDIMS> AsEigenDSizesWithPrefix(
-      const TensorShape &shape) const {
-    Eigen::DSizes<DimType, NDIMS> dsizes;
-    for (int d = 0; d < NDIMS - shape.dims(); d++) {
-      dsizes[d] = 1;
-    }
-    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
-      dsizes[d] =
-          static_cast<DimType>(shape.dim_size(d - (NDIMS - shape.dims())));
-    }
-    return dsizes;
-  }
 };
 
-#undef BROADCAST_SHAPE
-#undef HANDLE_BROADCAST_FROM_SCALAR
-#undef HANDLE_BROADCAST_CASE
-
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
index 551d77f4950d08e869c49cbc245c564a1050c047..516468c768f01381dffaf27418371677b6a9bbb2 100644
--- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bucketize_op.h"
-#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/kernels/gpu_device_array.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
@@ -36,8 +36,8 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename T, bool useSharedMem>
 __global__ void BucketizeCustomKernel(
     const int32 size_in, const T* in, const int32 size_boundaries,
-    CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
-  const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
+    GpuDeviceArrayStruct<float> boundaries_array, int32* out) {
+  const float* boundaries = GetGpuDeviceArrayOnDevice(&boundaries_array);
 
   extern __shared__ __align__(sizeof(float)) unsigned char shared_mem[];
   float* shared_mem_boundaries = reinterpret_cast<float*>(shared_mem);
@@ -85,8 +85,8 @@ struct BucketizeFunctor<GPUDevice, T> {
                         typename TTypes<int32, 1>::Tensor& output) {
     const GPUDevice& d = context->eigen_device<GPUDevice>();
 
-    CudaDeviceArrayOnHost<float> boundaries_array(context,
-                                                  boundaries_vector.size());
+    GpuDeviceArrayOnHost<float> boundaries_array(context,
+                                                 boundaries_vector.size());
     TF_RETURN_IF_ERROR(boundaries_array.Init());
     for (int i = 0; i < boundaries_vector.size(); ++i) {
       boundaries_array.Set(i, boundaries_vector[i]);
@@ -103,10 +103,10 @@ struct BucketizeFunctor<GPUDevice, T> {
              d.stream()>>>(input.size(), input.data(), boundaries_vector.size(),
                            boundaries_array.data(), output.data());
     } else {
-      BucketizeCustomKernel<T, false>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              input.size(), input.data(), boundaries_vector.size(),
-              boundaries_array.data(), output.data());
+      TF_CHECK_OK(CudaLaunchKernel(
+          BucketizeCustomKernel<T, false>, config.block_count,
+          config.thread_per_block, 0, d.stream(), input.size(), input.data(),
+          boundaries_vector.size(), boundaries_array.data(), output.data()));
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 3a72567655c09c7091bc917e0af9f20725f38287..5306c77102ebf70cdbcbae847d4386829ee3526b 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -99,9 +99,9 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
   } else {
     Tensor in;
     if (external_src_dtype_ != src_dtype_) {
-      // If the type is a quantized type we need to do an UnsafeCopyFromInternal
-      // since the src_dtype_ is different from external_src_type_.
-      in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
+      // If the type is a quantized type we need to do a bitcast since the
+      // src_dtype_ is different from external_src_type_.
+      OP_REQUIRES_OK(ctx, in.BitcastFrom(inp, src_dtype_, inp.shape()));
     } else {
       in = inp;
     }
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index c3c0c50007601c015a677705d452bd5fba63467e..b6deb8e579ebfcec6c8c5e3271c8d77b3982d0de 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -41,6 +41,10 @@ struct CheckNumericsLaunch {
   void Run(const GPUDevice& d, const T* data, int size,
            int abnormal_detected[2]);
 };
+
+extern template struct CheckNumericsLaunch<Eigen::half>;
+extern template struct CheckNumericsLaunch<float>;
+extern template struct CheckNumericsLaunch<double>;
 #endif
 
 namespace {
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
index f9f10c1b42f2ed6d2012798c8f720bbb9d211f5c..f0db3141932130f15633c21c60bcd65a01857fcb 100644
--- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 namespace tensorflow {
 
@@ -65,8 +66,8 @@ struct CheckNumericsLaunch {
         (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
 
-    CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>(
-        data, size, abnormal_detected);
+    TF_CHECK_OK(CudaLaunchKernel(CheckNumericsKernel<T>, num_blocks, block_size,
+                                 0, d.stream(), data, size, abnormal_detected));
   }
 };
 
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.cc b/tensorflow/core/kernels/collective_nccl_reducer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5e6f06c6578d1a6dc777b39e8e04aa963b5aecd
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
+
+namespace tensorflow {
+namespace {
+string NcclCollectiveKey(const string& exec_key, int step_id) {
+  return strings::StrCat(exec_key, ":", step_id);
+}
+}  // namespace
+
+NcclReducer::NcclReducer() : col_ctx_(nullptr), col_params_(nullptr) {}
+
+Status NcclReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.type != REDUCTION_COLLECTIVE ||
+      col_params->instance.impl_details.collective_name != "NcclReduce") {
+    return errors::Internal("Unexpected collective type ",
+                            col_params->instance.type, " expected ",
+                            REDUCTION_COLLECTIVE, "; or collective name ",
+                            col_params->instance.impl_details.collective_name,
+                            " expected NcclReduce");
+  } else {
+    return Status::OK();
+  }
+}
+
+Status NcclReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+Status NcclReducer::InitializeInstanceBeforeGroupDiscovery(
+    CollectiveParams* col_params) {
+  if (col_params->default_rank == 0 && col_params->group.num_tasks > 1) {
+    col_params->instance.communicator_key =
+        NcclManager::instance()->GenerateCommunicatorKey();
+  }
+  return Status::OK();
+}
+
+Status ReductionOp(const string& merge_op, ncclRedOp_t* reduction_op) {
+  if (merge_op == "Add") {
+    *reduction_op = ncclSum;
+    return Status::OK();
+  } else if (merge_op == "Mul") {
+    *reduction_op = ncclProd;
+    return Status::OK();
+  } else {
+    return errors::Internal("Expected merge_op to be either Add or Mul, found ",
+                            merge_op);
+  }
+}
+
+void NcclReducer::Run(StatusCallback done) {
+  ncclRedOp_t reduction_op;
+  Status s = ReductionOp(col_params_->merge_op->type_string(), &reduction_op);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
+  Tensor group_size;
+  Notification group_size_ready;
+  Status group_size_status;
+  if (col_params_->final_op) {
+    // Create an on-device scalar value from group_size_.
+    // TODO(ayushd, tucker): avoid this copy by either reusing across
+    // invocations or providing the scalar to the kernel in host memory.
+    Tensor group_size_val(col_ctx_->output->dtype(), TensorShape({}));
+    switch (col_ctx_->output->dtype()) {
+      case DT_FLOAT:
+        group_size_val.scalar<float>()() = col_params_->group.group_size;
+        break;
+      case DT_DOUBLE:
+        group_size_val.scalar<double>()() = col_params_->group.group_size;
+        break;
+      case DT_INT32:
+        group_size_val.scalar<int32>()() = col_params_->group.group_size;
+        break;
+      case DT_INT64:
+        group_size_val.scalar<int64>()() = col_params_->group.group_size;
+        break;
+      default:
+        done(errors::Internal("Unsupported type ", col_ctx_->output->dtype()));
+        return;
+    }
+    group_size = Tensor(
+        col_ctx_->device->GetAllocator(col_ctx_->op_ctx->input_alloc_attr(0)),
+        col_ctx_->output->dtype(), TensorShape({}));
+    DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context();
+    // Enqueue copy on gpu stream.
+    op_dev_ctx->CopyCPUTensorToDevice(
+        &group_size_val, col_ctx_->device, &group_size,
+        [&group_size_ready, &group_size_status](const Status& s) {
+          group_size_status = s;
+          group_size_ready.Notify();
+        });
+  } else {
+    group_size_ready.Notify();
+  }
+
+  Notification nccl_done;
+  Status nccl_status;
+  auto* compute_stream = col_ctx_->op_ctx->op_device_context()->stream();
+  auto* gpu_info = col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  // `AddToAllReduce` performs consistency checks for the NCCL call and enqueues
+  // the `Participant` struct locally.  When all local participants with this
+  // `nccl_collective_key` have called `AddToAllReduce` and
+  // `SignalMultiNodeReady`, all devices at this worker are ready to process
+  // this NCCL op.
+  //
+  // The `NcclManager` uses a dedicated CUDA stream for NCCL kernels.  At this
+  // point, it synchronizes the NCCL stream with the compute stream, and then
+  // enqueues the NCCL kernel on the NCCL stream.
+  const int num_global_devices = col_params_->group.group_size;
+  const int num_local_devices = col_params_->instance.num_devices_per_task.at(
+      col_params_->instance.task_names[col_params_->default_rank]);
+  const string nccl_collective_key =
+      NcclCollectiveKey(col_ctx_->exec_key, col_ctx_->step_id);
+  auto done_callback = [&nccl_done, &nccl_status](const Status& s) {
+    nccl_status = s;
+    nccl_done.Notify();
+  };
+  auto participant = absl::make_unique<NcclManager::Participant>(
+      compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+      gpu_info->gpu_id, col_ctx_->input, col_ctx_->output,
+      col_params_->default_rank, std::move(done_callback));
+  VLOG(1) << "NcclReducer calling NcclManager::AddToAllReduce num_tasks "
+          << col_params_->group.num_tasks << " current task "
+          << col_params_->instance.task_names[col_params_->default_rank]
+          << " num local devices " << num_local_devices
+          << " num global devices " << num_global_devices << " device "
+          << col_ctx_->device_name << " instance "
+          << col_params_->instance.instance_key;
+  NcclManager::instance()->AddToAllReduce(
+      std::move(participant),
+      {nccl_collective_key, num_local_devices, num_global_devices,
+       col_params_->instance.communicator_key},
+      reduction_op);
+
+  // NOTE(ayushd): We need to synchronize NCCL launches across nodes to prevent
+  // deadlocks.  In the current implementation, we define a deterministic
+  // sequential launch order between potentially concurrent collective instances
+  // by introducing control information during static graph analysis in
+  // graph/collective_order.cc.  This can be either in the form of explicit
+  // control edges or via `wait_for` attribute on the collective op.
+  //
+  // The other end of the design spectrum would have a distinguished node
+  // dynamically signal the next collective to launch to all other participants.
+  // This has higher degree of runtime coordination, but it may be able to
+  // achieve better performance if the (arbitrary) static execution order
+  // assigned in the first approach turns out to not be good from a scheduling
+  // perspective.  e.g. consider a graph in which c1, c2, and c3 are three
+  // concurrent collective instances, and the static ordering assigns c1 -> c2
+  // -> c3.  In practice, it could turn out that c3 is always ready to execute
+  // before c1 or c2.
+  //
+  // `WaitForDependencies` may block if the collective instances on which this
+  // op depends have not yet launched.  When this function returns, this op is
+  // ready to go.
+  col_ctx_->col_exec->WaitForDependencies(*col_params_);
+  NcclManager::instance()->SignalMultiNodeReady(nccl_collective_key);
+  // When all devices at this worker have called `SignalMultiNodeReady`, the
+  // `NcclManager` will enqueue the NCCL kernel on the NCCL stream.  Thus the
+  // implementation of `Launched` keeps track of the number of devices that have
+  // launched.
+  col_ctx_->col_exec->Launched(*col_params_);
+
+  // Wait for nccl op and group_size copy to succeed, then do final_op.
+  group_size_ready.WaitForNotification();
+  nccl_done.WaitForNotification();
+  Status final_status =
+      group_size_status.ok() ? nccl_status : group_size_status;
+  if (final_status.ok() && col_params_->final_op) {
+    final_status = collective_util::ComputeBinOp(
+        col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+        col_params_->final_op.get(), col_ctx_->output, &group_size);
+  }
+  done(final_status);
+}
+
+REGISTER_COLLECTIVE(NcclReduce, NcclReducer);
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.h b/tensorflow/core/kernels/collective_nccl_reducer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc70b280c5dc9eb9da72667d459ea727945d7e8a
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+#ifdef GOOGLE_CUDA
+
+class NcclReducer : public CollectiveImplementationInterface {
+ public:
+  NcclReducer();
+  ~NcclReducer() override = default;
+
+  // No-op for this collective implementation.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes the device objects and device localities.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // Initialize nccl communicator key.
+  Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) override;
+
+  // Hands off all reduce to NcclManager.
+  void Run(StatusCallback done) override;
+
+ private:
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+};
+
+#endif  // GOOGLE_CUDA
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
diff --git a/tensorflow/core/kernels/collective_nccl_reducer_test.cc b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26c92f1f7433e34cf4e3789dcd480f8822147891
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
@@ -0,0 +1,332 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+static constexpr int kStepId = 10;
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node, DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      DEVICE_GPU, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) LOG(FATAL) << status;
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+class NcclReducerTest : public ::testing::Test {
+ protected:
+  ~NcclReducerTest() override {
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void InitGPUDevices() {
+    std::vector<std::unique_ptr<Device>> all_devices;
+    SessionOptions session_options;
+    session_options.config.mutable_gpu_options()
+        ->set_per_process_gpu_memory_fraction(0.1);
+    session_options.env = Env::Default();
+    Status s = DeviceFactory::GetFactory(DEVICE_GPU)
+                   ->AddDevices(session_options, "", &all_devices);
+    TF_CHECK_OK(s);
+    for (std::unique_ptr<Device>& d : all_devices) {
+      if (d->device_type() == "GPU") {
+        gpus_.emplace_back(std::move(d));
+      }
+    }
+  }
+
+  void Init(int num_ranks) {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    InitGPUDevices();
+    std::vector<std::unique_ptr<Device>> local_devices;
+    std::vector<string> device_names;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      if (rank < gpus_.size()) {
+        local_devices.emplace_back(std::move(gpus_[rank]));
+      }
+    }
+    int num_gpus = local_devices.size();
+    for (const auto& device : local_devices) {
+      device_names.push_back(device->name());
+      VLOG(2) << device->name();
+    }
+    if (!dev_mgr_) dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, /*remote_access=*/nullptr, kStepId, dev_mgr_.get(),
+        /*gpu_ring_order=*/nullptr);
+
+    // Initialize collective params.
+    col_params_.name = "test_nccl_collective_op";
+    const int group_key = 5;
+    col_params_.group.group_key = group_key;
+    col_params_.group.device_type = DEVICE_GPU;
+    col_params_.group.group_size = num_ranks;
+    const int instance_key = 23;
+    col_params_.instance.instance_key = instance_key;
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = DT_FLOAT;
+    col_params_.instance.impl_details.collective_name = "NcclReduce";
+    const string task_name = "/job:worker/replica:0/task:0";
+    col_params_.instance.num_devices_per_task[task_name] = num_ranks;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      col_params_.instance.device_names.push_back(
+          device_names[rank % num_gpus]);
+      col_params_.instance.task_names.push_back(task_name);
+    }
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      instances_.push_back(absl::make_unique<DeviceInstance>(
+          rank, col_params_.instance.device_names[rank], this));
+    }
+  }
+
+  void Reduce() {
+    int done = 0;
+    mutex done_mu;
+    condition_variable done_cv;
+    for (const auto& instance : instances_) {
+      DeviceInstance* di = instance.get();
+      SchedClosure([di, &done, &done_mu, &done_cv] {
+        di->DoReduce();
+        mutex_lock l(done_mu);
+        ++done;
+        done_cv.notify_all();
+      });
+    }
+
+    mutex_lock l(done_mu);
+    while (done < instances_.size()) done_cv.wait(l);
+  }
+
+  void RunTest(int num_ranks, int tensor_length) {
+    Init(num_ranks);
+    std::vector<float> expected(tensor_length, 0.0);
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      DeviceInstance* instance = instances_[rank].get();
+      instance->InitTensor(DT_FLOAT, TensorShape({tensor_length}),
+                           [&expected, rank](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               float value = pow(10, rank) * i;
+                               t->flat<float>()(i) = value;
+                               expected[i] += value;
+                             }
+                           });
+    }
+    Reduce();
+    // Confirm that every rank computed the same correct value.
+    for (int i = 0; i < tensor_length; ++i) {
+      expected[i] /= num_ranks;
+    }
+    for (int rank = 0; rank < instances_.size(); ++rank) {
+      TF_ASSERT_OK(instances_[rank]->status_);
+      Tensor* dev_tensor = &instances_[rank]->tensor_;
+      Tensor actual(DT_FLOAT, TensorShape({tensor_length}));
+      Notification note;
+      Device* dev = instances_[rank]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          dev_tensor, /*tensor_name=*/"", dev, &actual,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+      for (int i = 0; i < tensor_length; ++i) {
+        EXPECT_FLOAT_EQ(expected[i], actual.template flat<float>()(i))
+            << "Mismatch at rank " << rank << " index " << i;
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Div")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& device_name, NcclReducerTest* parent)
+        : parent_(parent), device_name_(device_name), rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(device_name_, &device_))
+          << "Could not find device " << device_name_ << " existing devices "
+          << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.default_rank = rank;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      Tensor cpu_tensor(dtype, shape);
+      init_f(&cpu_tensor);
+      VLOG(2) << "cpu_tensor " << cpu_tensor.DebugString();
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      Notification note;
+      dev_info->default_context->CopyCPUTensorToDevice(
+          &cpu_tensor, device_, &tensor_,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+    }
+
+    void DoReduce() {
+      col_params_.merge_op = GetAdd(device_);
+      col_params_.final_op = GetDiv(device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          parent_->GetCollectiveReduce(col_params_, &tensor_, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a NcclReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      NcclReducer reducer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &tensor_, &tensor_);
+      TF_CHECK_OK(reducer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-reduce.
+      reducer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    NcclReducerTest* parent_;
+    string device_name_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    Status status_;
+  };
+
+  std::vector<std::unique_ptr<tensorflow::Device>> gpus_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  std::unique_ptr<DeviceMgr> dev_mgr_;
+  std::vector<std::unique_ptr<DeviceInstance>> instances_;
+  CollectiveParams col_params_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+TEST_F(NcclReducerTest, Test2Dev16Len) { RunTest(2, 16); }
+TEST_F(NcclReducerTest, Test4Dev16Len) { RunTest(4, 16); }
+TEST_F(NcclReducerTest, Test8Dev16Len) { RunTest(8, 16); }
+TEST_F(NcclReducerTest, Test8Dev128Len) { RunTest(8, 128); }
+TEST_F(NcclReducerTest, Test8Dev1045991Len) { RunTest(8, 1048576); }
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index 82e2913b64afca2e0fc8c64d1c6e366f3a2d307e..23356283bb52dc4ab7f61193211072e6f95fb1f4 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -43,16 +43,21 @@ class CollectiveOpKernel : public AsyncOpKernel {
       // Call in a blockable thread because it's not guaranteed that
       // this call cannot block.
       c->env()->SchedClosure([this, c, done, col_exec]() {
-        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
-                                      c->cancellation_manager(),
-                                      [this, c, done](const Status& s) {
-                                        if (s.ok()) {
-                                          ComputeAsync(c, done);
-                                        } else {
-                                          c->SetStatus(s);
-                                          done();
-                                        }
-                                      });
+        VLOG(1) << "CollectiveOpKernel CompleteParams for collective "
+                << col_params_.name << " device " << c->device()->name()
+                << " group " << col_params_.group.group_key << " instance "
+                << col_params_.instance.instance_key;
+        col_exec->CompleteParamsAsync(
+            c->device()->name(), &col_params_, c->cancellation_manager(),
+            [this, c, done](const Status& s) {
+              if (s.ok()) {
+                col_params_.instance.impl_details.dependencies = dependencies_;
+                ComputeAsync(c, done);
+              } else {
+                c->SetStatus(s);
+                done();
+              }
+            });
       });
       return false;
     }
@@ -60,8 +65,60 @@ class CollectiveOpKernel : public AsyncOpKernel {
   }
 
   CollectiveParams col_params_;
+  std::vector<int32> dependencies_;
 };
 
+class CollectiveGatherOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveGatherOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &col_params_.instance.shape));
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Gather");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    // Allocate output on the first pass through this function.  This must be
+    // done immediately, while we're still in the executor thread.  Otherwise
+    // the memory is not guaranteed to be unused by any concurrently executing
+    // GPU kernel.
+    if (c->mutable_output(0) == nullptr) {
+      // Allocate the output tensor.
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK_ASYNC(
+          c, c->allocate_output(0, col_params_.instance.shape, &output), done);
+    }
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    auto actual_done = [c, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveGatherOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_CPU),
+                        CollectiveGatherOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_GPU),
+                        CollectiveGatherOpKernel);
+
 class CollectiveReduceOpKernel : public CollectiveOpKernel {
  public:
   explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
@@ -87,6 +144,7 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
                     "final_op must be one of {\"Id\", \"Div\"} but got ",
                     final_op_name));
     OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("wait_for", &dependencies_));
 
     const NodeDef& real_node = c->def();
     col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
@@ -146,10 +204,18 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
       col_params_.instance.shape = c->input(0).shape();
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
-    auto actual_done = [c, col_exec, done](const Status& s) {
+
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveReduceKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveReduceKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -208,10 +274,17 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
                          " does not match shape of input"),
         done);
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -263,10 +336,17 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc
index 345405e3fe6f89c5f6bbf0721cf1d6e25b6077d1..1baa27f014f1ebca68f54196f47caa18c9f776fc 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc
@@ -114,19 +114,20 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
   }
 }
 
-#define DEFINE_GPU_SPECS(T)                                               \
-  template <>                                                             \
-  void CompareAndBitpack<GPUDevice, T>::operator()(                       \
-      OpKernelContext* c, typename TTypes<T>::ConstMatrix input,          \
-      typename TTypes<T>::ConstScalar threshold,                          \
-      TTypes<uint8>::Matrix output) {                                     \
-    const GPUDevice& d = c->eigen_device<GPUDevice>();                    \
-    int64 total_count = output.size();                                    \
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);        \
-                                                                          \
-    CompareAndBitpackKernel<T>                                            \
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \
-            total_count, threshold.data(), input.data(), output.data());  \
+#define DEFINE_GPU_SPECS(T)                                                    \
+  template <>                                                                  \
+  void CompareAndBitpack<GPUDevice, T>::operator()(                            \
+      OpKernelContext* c, typename TTypes<T>::ConstMatrix input,               \
+      typename TTypes<T>::ConstScalar threshold,                               \
+      TTypes<uint8>::Matrix output) {                                          \
+    const GPUDevice& d = c->eigen_device<GPUDevice>();                         \
+    int64 total_count = output.size();                                         \
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);             \
+                                                                               \
+    TF_CHECK_OK(CudaLaunchKernel(CompareAndBitpackKernel<T>,                   \
+                                 config.block_count, config.thread_per_block,  \
+                                 0, d.stream(), total_count, threshold.data(), \
+                                 input.data(), output.data()));                \
   }
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 8b53ecf1216429bc52abbc696171e1377e38e063..175c45285d671145030df43317cbaad617c559b6 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -54,6 +55,24 @@ void ConcatGPU(
         inputs_flat,
     Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
 
+// Explicit instantiations in concat_lib_gpu.cc.
+#define REGISTER(T)                                                           \
+  extern template void ConcatGPU<T>(                                          \
+      OpKernelContext * c,                                                    \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
+          inputs_flat,                                                        \
+      Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER);
+TF_CALL_complex64(REGISTER);
+TF_CALL_complex128(REGISTER);
+TF_CALL_int32(REGISTER);  // Needed for TensorLists.
+TF_CALL_int64(REGISTER);
+TF_CALL_int16(REGISTER);
+TF_CALL_bfloat16(REGISTER);
+TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
+#undef REGISTER
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 93e392d3032405ea848bd2f147653c9a5c7a1818..a75d464c31d489b6a2e23e864153a25d06ba961d 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -26,24 +26,10 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/kernels/concat_lib_gpu.h"
+#include "tensorflow/core/kernels/gpu_device_array.h"
 
 namespace tensorflow {
-
-template <typename T, typename IntType>
-void ConcatGPUSlice(
-    const Eigen::GpuDevice& gpu_device,
-    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
-        inputs_flat,
-    typename TTypes<T, 2>::Matrix* output);
-
-template <typename T, typename IntType>
-void ConcatGPUImpl(const Eigen::GpuDevice& d,
-                   const CudaDeviceArrayStruct<const T*>& input_ptrs,
-                   const CudaDeviceArrayStruct<IntType>& ptr_offsets,
-                   bool same_size, int slice_size,
-                   typename TTypes<T, 2>::Matrix* output);
-
 namespace {
 
 template <typename T, typename IntType>
@@ -52,14 +38,14 @@ void ConcatGPUCall(
     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
         inputs_flat,
     typename TTypes<T, 2>::Tensor* output_flat) {
-  CudaDeviceArrayOnHost<const T*> input_ptrs(c, inputs_flat.size());
+  GpuDeviceArrayOnHost<const T*> input_ptrs(c, inputs_flat.size());
   OP_REQUIRES_OK(c, input_ptrs.Init());
   for (int i = 0; i < inputs_flat.size(); ++i) {
     input_ptrs.Set(i, inputs_flat[i]->data());
   }
   OP_REQUIRES_OK(c, input_ptrs.Finalize());
 
-  CudaDeviceArrayOnHost<IntType> output_scan(c, inputs_flat.size() + 1);
+  GpuDeviceArrayOnHost<IntType> output_scan(c, inputs_flat.size() + 1);
   OP_REQUIRES_OK(c, output_scan.Init());
   IntType scan = 0;
   output_scan.Set(0, scan);
@@ -115,7 +101,9 @@ void ConcatGPU(
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
+TF_CALL_int32(REGISTER);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER);
+TF_CALL_int16(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
 TF_CALL_uint8(REGISTER);
diff --git a/tensorflow/core/kernels/concat_lib_gpu.h b/tensorflow/core/kernels/concat_lib_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..2db66a7c5a878f93d6308bf6738d3ecc43de445f
--- /dev/null
+++ b/tensorflow/core/kernels/concat_lib_gpu.h
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_
+
+#define EIGEN_USE_THREADS
+#define EIGEN_USE_GPU
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
+
+namespace tensorflow {
+
+template <typename T, typename IntType>
+void ConcatGPUSlice(
+    const Eigen::GpuDevice& gpu_device,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs_flat,
+    typename TTypes<T, 2>::Matrix* output);
+
+template <typename T, typename IntType>
+void ConcatGPUImpl(const Eigen::GpuDevice& d,
+                   const GpuDeviceArrayStruct<const T*>& input_ptrs,
+                   const GpuDeviceArrayStruct<IntType>& ptr_offsets,
+                   bool same_size, int slice_size,
+                   typename TTypes<T, 2>::Matrix* output);
+
+// Explicit instantiations in concat_lib_gpu_impl.cu.cc.
+#define REGISTER(T)                                                           \
+  extern template void ConcatGPUSlice<T, int32>(                              \
+      const Eigen::GpuDevice& gpu_device,                                     \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
+          inputs_flat,                                                        \
+      typename TTypes<T, 2>::Matrix* output);                                 \
+  extern template void ConcatGPUSlice<T, int64>(                              \
+      const Eigen::GpuDevice& gpu_device,                                     \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \
+          inputs_flat,                                                        \
+      typename TTypes<T, 2>::Matrix* output);                                 \
+  extern template void ConcatGPUImpl<T, int32>(                               \
+      const Eigen::GpuDevice& d,                                              \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs,                       \
+      const GpuDeviceArrayStruct<int32>& ptr_offsets, bool fixed_size,        \
+      int split_size, typename TTypes<T, 2>::Matrix* output);                 \
+  extern template void ConcatGPUImpl<T, int64>(                               \
+      const Eigen::GpuDevice& d,                                              \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs,                       \
+      const GpuDeviceArrayStruct<int64>& ptr_offsets, bool fixed_size,        \
+      int split_size, typename TTypes<T, 2>::Matrix* output);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER);
+TF_CALL_complex64(REGISTER);
+TF_CALL_complex128(REGISTER);
+TF_CALL_int32(REGISTER);  // Needed for TensorLists.
+TF_CALL_int64(REGISTER);
+TF_CALL_int16(REGISTER);
+TF_CALL_bfloat16(REGISTER);
+TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
+#undef REGISTER
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_GPU_H_
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index a561d918bd36f711d1b813dfb533ec6d690af8ee..e5a00c25cddbfda8e517846058daf970ea15387f 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -23,7 +23,8 @@ limitations under the License.
 #include "tensorflow/core/framework/bfloat16.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/kernels/concat_lib_gpu.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -34,9 +35,9 @@ namespace {
 
 template <typename T, typename IntType>
 __global__ void concat_fixed_kernel(
-    CudaDeviceArrayStruct<const T*> input_ptr_data, int split_size,
+    GpuDeviceArrayStruct<const T*> input_ptr_data, int split_size,
     int total_rows, int total_cols, T* output) {
-  const T** input_ptrs = GetCudaDeviceArrayOnDevice(&input_ptr_data);
+  const T** input_ptrs = GetGpuDeviceArrayOnDevice(&input_ptr_data);
   IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (; gidx < total_cols; gidx += blockDim.x * gridDim.x) {
@@ -58,11 +59,11 @@ __global__ void concat_fixed_kernel(
 // cannot be in anonymous namespace due to extern shared memory
 template <typename T, typename IntType, bool useSmem>
 __global__ void concat_variable_kernel(
-    CudaDeviceArrayStruct<const T*> input_ptr_data,
-    CudaDeviceArrayStruct<IntType> output_scan, IntType total_rows,
+    GpuDeviceArrayStruct<const T*> input_ptr_data,
+    GpuDeviceArrayStruct<IntType> output_scan, IntType total_rows,
     IntType total_cols, T* output) {
-  const T** input_ptrs = GetCudaDeviceArrayOnDevice(&input_ptr_data);
-  IntType* col_scan = GetCudaDeviceArrayOnDevice(&output_scan);
+  const T** input_ptrs = GetGpuDeviceArrayOnDevice(&input_ptr_data);
+  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);
 
   // do upper_bound on col to find which pointer we should be using
   IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,8 +136,8 @@ void ConcatGPUSlice(
 
 template <typename T, typename IntType>
 void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
-                   const CudaDeviceArrayStruct<const T*>& input_ptrs,
-                   const CudaDeviceArrayStruct<IntType>& output_scan,
+                   const GpuDeviceArrayStruct<const T*>& input_ptrs,
+                   const GpuDeviceArrayStruct<IntType>& output_scan,
                    bool fixed_size, int split_size,
                    typename TTypes<T, 2>::Matrix* output) {
   auto config = GetCuda2DLaunchConfig(output->dimension(1),
@@ -184,24 +185,26 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
           inputs_flat,                                                        \
       typename TTypes<T, 2>::Matrix* output);
 
-#define REGISTER_GPU32(T)                                               \
-  template void ConcatGPUImpl<T, int32>(                                \
-      const Eigen::GpuDevice& d,                                        \
-      const CudaDeviceArrayStruct<const T*>& input_ptrs,                \
-      const CudaDeviceArrayStruct<int32>& ptr_offsets, bool fixed_size, \
+#define REGISTER_GPU32(T)                                              \
+  template void ConcatGPUImpl<T, int32>(                               \
+      const Eigen::GpuDevice& d,                                       \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs,                \
+      const GpuDeviceArrayStruct<int32>& ptr_offsets, bool fixed_size, \
       int split_size, typename TTypes<T, 2>::Matrix* output);
 
-#define REGISTER_GPU64(T)                                               \
-  template void ConcatGPUImpl<T, int64>(                                \
-      const Eigen::GpuDevice& d,                                        \
-      const CudaDeviceArrayStruct<const T*>& input_ptrs,                \
-      const CudaDeviceArrayStruct<int64>& ptr_offsets, bool fixed_size, \
+#define REGISTER_GPU64(T)                                              \
+  template void ConcatGPUImpl<T, int64>(                               \
+      const Eigen::GpuDevice& d,                                       \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs,                \
+      const GpuDeviceArrayStruct<int64>& ptr_offsets, bool fixed_size, \
       int split_size, typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
+TF_CALL_int32(REGISTER_GPUCONCAT32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_int16(REGISTER_GPUCONCAT32);
 TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
@@ -209,7 +212,9 @@ REGISTER_GPUCONCAT32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
+TF_CALL_int32(REGISTER_GPUCONCAT64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_int16(REGISTER_GPUCONCAT64);
 TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
@@ -217,7 +222,9 @@ REGISTER_GPUCONCAT64(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
+TF_CALL_int32(REGISTER_GPU32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_int16(REGISTER_GPU32);
 TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
@@ -225,7 +232,9 @@ REGISTER_GPU32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
+TF_CALL_int32(REGISTER_GPU64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_int16(REGISTER_GPU64);
 TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ff6298351761c84bedd117e125f53b2166cd104f..72d8b45dd96b912f3d94f4c0f0495c82de53e4d4 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -18,16 +18,16 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 75ca77fad5cfca27eb4b78954ddf8b6d74f8e5e2..5ff428dd312c6935adc56a0dbcdef76b77cb287b 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/constant_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 8d117574284065ff8fcf62d913257b0ccdd497e5..1f0a63044ba813b2ccda89d0dea722e16e921479 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -437,9 +437,10 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> {
     CHECK(dst_filter_format == FORMAT_OIHW)
         << "Unsupported output layout: " << ToString(dst_filter_format);
 
-    ShuffleInTensor3Simple<T, 2, 1, 0>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in.data(), combined_dims, out.data());
+    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), config.virtual_thread_count,
+                                 in.data(), combined_dims, out.data()));
   }
 };
 
@@ -458,9 +459,10 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
       combined_dims[2] *= in.dimension(i);
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-    ShuffleInTensor3Simple<T, 2, 1, 0>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in.data(), combined_dims, out.data());
+    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), config.virtual_thread_count,
+                                 in.data(), combined_dims, out.data()));
   }
 };
 
@@ -488,15 +490,15 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
     const Dimension<NDIMS - 2> padding_left_dim(padding_left);
 
     if (format == FORMAT_NHWC) {
-      PadInputCustomKernelNHWC<T, NDIMS>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              config.virtual_thread_count, in.data(), input_dims, out.data(),
-              output_dims, padding_left_dim);
+      TF_CHECK_OK(CudaLaunchKernel(
+          PadInputCustomKernelNHWC<T, NDIMS>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          in.data(), input_dims, out.data(), output_dims, padding_left_dim));
     } else if (format == FORMAT_NCHW) {
-      PadInputCustomKernelNCHW<T, NDIMS>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              config.virtual_thread_count, in.data(), input_dims, out.data(),
-              output_dims, padding_left_dim);
+      TF_CHECK_OK(CudaLaunchKernel(
+          PadInputCustomKernelNCHW<T, NDIMS>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          in.data(), input_dims, out.data(), output_dims, padding_left_dim));
     } else {
       LOG(FATAL) << "Invalid data format: " << format;
     }
@@ -605,15 +607,17 @@ void LaunchBatchNarrowMatrixTransposeKernel(
     const T* input, const Dimension<3>& input_dims, T* output) {
   constexpr int NumThreads = TileLongSide;
   if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
-    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
-                                          TileShortSide>
-        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
-                                                           output);
+    TF_CHECK_OK(CudaLaunchKernel(
+        SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
+                                              TileShortSide>,
+        total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
+        output));
   } else {
-    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
-                                          TileLongSide>
-        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
-                                                           output);
+    TF_CHECK_OK(CudaLaunchKernel(
+        SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
+                                              TileLongSide>,
+        total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
+        output));
   }
 }
 
@@ -914,10 +918,12 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
 
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
-                                          conjugate>
-        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
-                                                            output);
+
+    TF_CHECK_OK(CudaLaunchKernel(
+        SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize,
+                                              kTileSize, conjugate>,
+        total_tiles_count, kNumThreads, 0, d.stream(), input, input_dims,
+        output));
 
   } else if (narrow_matrix) {
     SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
@@ -925,9 +931,10 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
-    ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, input, input_dims, output);
+    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), config.virtual_thread_count, input,
+                                 input_dims, output));
   }
 }
 
@@ -957,9 +964,10 @@ struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
                                static_cast<int>(combined_dims[2])};
     size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_size, d);
-    ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in, input_dims, out);
+    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), config.virtual_thread_count, in,
+                                 input_dims, out));
   }
 };
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 0df05ceb0266fba43dc23162a2d92c33b02c7fa2..efd701c7687c90efe541ad3b2372a3a94c909a01 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -51,6 +51,8 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 #endif  // GOOGLE_CUDA
 
 namespace {
@@ -181,130 +183,6 @@ struct LaunchXsmmBackwardFilter<CPUDevice, float> {
 };
 #endif
 
-template <typename Device, class T>
-class Conv2DFastBackpropFilterOp : public OpKernel {
- public:
-  explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument(
-                    "Conv2DFastBackpropFilterOp only supports NHWC."));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
-                errors::InvalidArgument(
-                    "Row and column strides should be larger than 0."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(
-        context, padding_ != Padding::EXPLICIT,
-        errors::Unimplemented("Current CPU implementation does not support "
-                              "EXPLICIT padding yet."));
-    std::vector<int64> explicit_paddings;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("explicit_paddings", &explicit_paddings));
-    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
-                                              /*num_dims=*/4, data_format_));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
-    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
-                errors::InvalidArgument(
-                    "Current Eigen and libxsmm implementations do not "
-                    "yet support dilation rates larger than 1."));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    const Tensor& filter_sizes = context->input(1);
-    const Tensor& out_backprop = context->input(2);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
-        errors::InvalidArgument(
-            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
-            filter_sizes.dims()));
-    TensorShape filter_shape;
-    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                filter_sizes.vec<int32>(), &filter_shape));
-
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(
-        context,
-        ConvBackpropComputeDimensions(
-            type_string(), /*num_spatial_dims=*/2, input.shape(), filter_shape,
-            out_backprop.shape(), strides_, padding_, data_format_, &dims));
-
-    Tensor* filter_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, filter_shape, &filter_backprop));
-
-    // If there is nothing to compute, return.
-    if (filter_shape.num_elements() == 0) {
-      return;
-    }
-
-#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
-    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
-            dims.spatial_dims[0].stride, padding_,
-            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
-            dims.spatial_dims[1].stride, padding_,
-            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-
-    if (pad_left == pad_right && pad_top == pad_bottom) {
-      if (LaunchXsmmBackwardFilter<Device, T>()(
-              context, context->eigen_device<Device>(), input.tensor<T, 4>(),
-              filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
-              dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
-              static_cast<int>(dims.spatial_dims[0].stride),
-              static_cast<int>(dims.spatial_dims[1].stride),
-              static_cast<int>(pad_top), static_cast<int>(pad_left),
-              data_format_)) {
-        return;
-      }
-    }
-#endif
-
-    LaunchConv2DBackpropFilterOp<Device, T>()(
-        context, false, false, out_backprop, input,
-        /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
-        filter_backprop, data_format_);
-  }
-
- private:
-  std::vector<int32> dilations_;
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp);
-};
-
 // Based on implementation written by Yangqing Jia (jiayq).
 template <typename Device, class T>
 class Conv2DCustomBackpropFilterOp : public OpKernel {
@@ -537,12 +415,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                             \
                               .Label("custom")                                \
                               .TypeConstraint<T>("T"),                        \
-                          Conv2DCustomBackpropFilterOp<CPUDevice, T>);        \
-  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")                        \
-                              .Device(DEVICE_CPU)                             \
-                              .Label("eigen_tensor")                          \
-                              .TypeConstraint<T>("T"),                        \
-                          Conv2DFastBackpropFilterOp<CPUDevice, T>);
+                          Conv2DCustomBackpropFilterOp<CPUDevice, T>);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
@@ -970,8 +843,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
         conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
         &algorithms));
-    ProfileResult best_result;
-    ProfileResult best_result_no_scratch;
+    std::vector<tensorflow::AutotuneResult> results;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
@@ -988,28 +860,23 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
               .ok();
       if (cudnn_launch_status) {
         if (profile_result.is_valid()) {
-          if (profile_result.elapsed_time_in_ms() <
-              best_result.elapsed_time_in_ms()) {
-            best_result = profile_result;
-          }
-          if (scratch_allocator.TotalByteSize() == 0 &&
-              profile_result.elapsed_time_in_ms() <
-                  best_result_no_scratch.elapsed_time_in_ms()) {
-            best_result_no_scratch = profile_result;
-          }
+          results.emplace_back();
+          auto& result = results.back();
+          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+          result.mutable_conv()->set_tensor_ops_enabled(
+              profile_algorithm.tensor_ops_enabled());
+          result.mutable_success()->set_scratch_bytes(
+              scratch_allocator.TotalByteSize());
+          *result.mutable_success()->mutable_run_time() =
+              proto_utils::ToDurationProto(
+                  absl::Milliseconds(profile_result.elapsed_time_in_ms()));
         }
       }
     }
-    OP_REQUIRES(ctx,
-                best_result.is_valid() || best_result_no_scratch.is_valid(),
-                errors::NotFound("No algorithm worked!"));
-    if (best_result.is_valid()) {
-      algorithm_config.set_algorithm(best_result.algorithm());
-    }
-    if (best_result_no_scratch.is_valid()) {
-      algorithm_config.set_algorithm_no_scratch(
-          best_result_no_scratch.algorithm());
-    }
+    LogConvAutotuneResults(ctx->op_kernel().def(), transformed_input,
+                           pre_transformed_filter_backprop,
+                           transformed_out_backprop, stream->parent(), results);
+    OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
     AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters,
                                                  algorithm_config);
   }
@@ -1098,6 +965,7 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                         Conv2DSlowBackpropFilterOp<GPUDevice, Eigen::half>);
 
 // To be used inside depthwise_conv_grad_op.cc.
+// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 74b97b98648dc5f2a32d4755ac08d731af5549e8..730c71e4a75ab5cf29964de529b3174dfed46011 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/base/dynamic_annotations.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -50,6 +51,8 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 #endif  // GOOGLE_CUDA
 
 namespace {
@@ -184,130 +187,79 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
 };
 #endif
 
-template <typename Device, class T>
-class Conv2DFastBackpropInputOp : public OpKernel {
- public:
-  explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument(
-                    "Eigen Conv2DFastBackpropInputOp only supports NHWC."));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
-                errors::InvalidArgument(
-                    "Row and column strides should be larger than 0."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(context, (dilations_[0] && dilations_[3]),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
-    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
-                errors::InvalidArgument(
-                    "Current Eigen and libxsmm implementations do not "
-                    "yet support dilation rates larger than 1."));
-    OP_REQUIRES(
-        context, padding_ != Padding::EXPLICIT,
-        errors::Unimplemented("Current CPU implementation does not support "
-                              "EXPLICIT padding yet."));
-    std::vector<int64> explicit_paddings;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("explicit_paddings", &explicit_paddings));
-    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
-                                              /*num_dims=*/4, data_format_));
+template <typename T>
+struct Conv2DCustomBackpropInputMatMulFunctor {
+  using MatrixMap = Eigen::Map<
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  using ConstMatrixMap = Eigen::Map<
+      const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Compute gradient into 'im2col_buf'.
+    MatrixMap C(im2col_buf, output_image_size, filter_total_size);
+
+    ConstMatrixMap A(out_data, output_image_size, dims_out_depth);
+    ConstMatrixMap B(filter_data, filter_total_size, dims_out_depth);
+
+    C.noalias() = A * B.transpose();
   }
+};
 
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_sizes = context->input(0);
-    const Tensor& filter = context->input(1);
-    const Tensor& out_backprop = context->input(2);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(input_sizes.shape()),
-        errors::InvalidArgument(
-            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-            input_sizes.dims()));
-    TensorShape input_shape;
-    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                input_sizes.vec<int32>(), &input_shape));
-
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensions(
-                       "Conv2DFastBackpropInput", /*num_spatial_dims=*/2,
-                       input_shape, filter.shape(), out_backprop.shape(),
-                       strides_, padding_, data_format_, &dims));
-
-    Tensor* in_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_shape, &in_backprop));
-
-    // If there is nothing to compute, return.
-    if (input_shape.num_elements() == 0) {
-      return;
-    }
-
-#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
-    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
-            dims.spatial_dims[0].stride, padding_,
-            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
-            dims.spatial_dims[1].stride, padding_,
-            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-
-    if (pad_left == pad_right && pad_top == pad_bottom) {
-      if (LaunchXsmmBackwardInputConvolution<Device, T>()(
-              context, context->eigen_device<Device>(),
-              in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
-              out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-              dims.spatial_dims[1].input_size,
-              static_cast<int>(dims.spatial_dims[0].stride),
-              static_cast<int>(dims.spatial_dims[1].stride),
-              static_cast<int>(pad_top), static_cast<int>(pad_left),
-              data_format_)) {
-        return;
-      }
-    }
-#endif
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+template <>
+struct Conv2DCustomBackpropInputMatMulFunctor<float> {
+  using T = float;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Inputs are in RowMajor order, we "cheat" by swapping the LHS and RHS:
+    //   RowMajor: C   = A   * B
+    //   ColMajor: C^T = B^T * A^T
+    //
+    // Dimension names:
+    //   out_image_size    -> ois
+    //   filter_total_size -> fts
+    //   dims_out_depth    -> dod
+    //
+    // RowMajor:
+    //   im2col      = out_data    * filter_data^T
+    //   [ois x fts] = [ois x dod] * [fts x dod]^T
+    //
+    // ColMajor:
+    //   im2col^T    = filter_data *  out_data^T
+    //   [fts x ois] = [fts x dod] * [dod x ois]*
+
+    const int m = filter_total_size;
+    const int n = output_image_size;
+    const int k = dims_out_depth;  // contraction dim
+
+    const char transposeA = 'T';  // sgemm(A) == filter_data
+    const char transposeB = 'N';  // sgemm(B) == out_data
+
+    const int ldA = dims_out_depth;
+    const int ldB = dims_out_depth;
+    const int ldC = filter_total_size;
+
+    const float alpha = 1.0;
+    const float beta = 0.0;
+
+    // mkldnn_sgemm code can't be instrumented with msan.
+    ANNOTATE_MEMORY_IS_INITIALIZED(
+        im2col_buf, filter_total_size * output_image_size * sizeof(T));
+
+    mkldnn_status_t st =
+        mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k, &alpha, filter_data,
+                     &ldA, out_data, &ldB, &beta, im2col_buf, &ldC);
 
-    LaunchConv2DBackpropInputOp<Device, T>()(
-        context, false, false, out_backprop, filter,
-        /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, /*explicit_paddings=*/{},
-        in_backprop, data_format_);
+    OP_REQUIRES(
+        ctx, st == 0,
+        errors::Internal("Failed to call mkldnn_sgemm. Error code: ", st));
   }
-
- private:
-  std::vector<int32> dilations_;
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp);
 };
+#endif
 
 // Based on implementation written by Yangqing Jia (jiayq).
 template <typename Device, class T>
@@ -542,21 +494,14 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         input_backprop_data += input_offset;
       }
     } else {
-      typedef Eigen::Map<
-          Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          MatrixMap;
-      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
-                                             Eigen::RowMajor>>
-          ConstMatrixMap;
-
       for (int image_id = 0; image_id < dims.batch_size;
            image_id += shard_size) {
         const int shard_limit =
             std::min(static_cast<int>(shard_size),
                      static_cast<int>(dims.batch_size) - image_id);
 
-        auto shard = [&dims, &pad_top, &pad_left, &pad_bottom, &pad_right,
-                      &output_image_size, &filter_total_size,
+        auto shard = [&context, &dims, &pad_top, &pad_left, &pad_bottom,
+                      &pad_right, &output_image_size, &filter_total_size,
                       &input_backprop_data, &col_buffer_data,
                       &out_backprop_data, &filter_data, &input_offset,
                       &output_offset, &size_C](int64 start, int64 limit) {
@@ -565,13 +510,9 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
             T* input_data = input_backprop_data + shard_id * input_offset;
             const T* out_data = out_backprop_data + shard_id * output_offset;
 
-            // Compute gradient into 'im2col_buf'.
-            MatrixMap C(im2col_buf, output_image_size, filter_total_size);
-
-            ConstMatrixMap A(out_data, output_image_size, dims.out_depth);
-            ConstMatrixMap B(filter_data, filter_total_size, dims.out_depth);
-
-            C.noalias() = A * B.transpose();
+            Conv2DCustomBackpropInputMatMulFunctor<T>()(
+                context, out_data, filter_data, filter_total_size,
+                output_image_size, dims.out_depth, im2col_buf);
 
             Col2im<T>(im2col_buf, dims.in_depth,
                       dims.spatial_dims[0].input_size,
@@ -608,12 +549,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
                               .Device(DEVICE_CPU)                            \
                               .Label("custom")                               \
                               .TypeConstraint<T>("T"),                       \
-                          Conv2DCustomBackpropInputOp<CPUDevice, T>);        \
-  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")                        \
-                              .Device(DEVICE_CPU)                            \
-                              .Label("eigen_tensor")                         \
-                              .TypeConstraint<T>("T"),                       \
-                          Conv2DFastBackpropInputOp<CPUDevice, T>);
+                          Conv2DCustomBackpropInputOp<CPUDevice, T>);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
@@ -1019,8 +955,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
         conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
         &algorithms));
-    ProfileResult best_result;
-    ProfileResult best_result_no_scratch;
+    std::vector<tensorflow::AutotuneResult> results;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
@@ -1036,28 +971,23 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
               .ok();
       if (cudnn_launch_status) {
         if (profile_result.is_valid()) {
-          if (profile_result.elapsed_time_in_ms() <
-              best_result.elapsed_time_in_ms()) {
-            best_result = profile_result;
-          }
-          if (scratch_allocator.TotalByteSize() == 0 &&
-              profile_result.elapsed_time_in_ms() <
-                  best_result_no_scratch.elapsed_time_in_ms()) {
-            best_result_no_scratch = profile_result;
-          }
+          results.emplace_back();
+          auto& result = results.back();
+          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+          result.mutable_conv()->set_tensor_ops_enabled(
+              profile_algorithm.tensor_ops_enabled());
+          result.mutable_success()->set_scratch_bytes(
+              scratch_allocator.TotalByteSize());
+          *result.mutable_success()->mutable_run_time() =
+              proto_utils::ToDurationProto(
+                  absl::Milliseconds(profile_result.elapsed_time_in_ms()));
         }
       }
     }
-    OP_REQUIRES(ctx,
-                best_result.is_valid() || best_result_no_scratch.is_valid(),
-                errors::NotFound("No algorithm worked!"));
-    if (best_result.is_valid()) {
-      algorithm_config.set_algorithm(best_result.algorithm());
-    }
-    if (best_result_no_scratch.is_valid()) {
-      algorithm_config.set_algorithm_no_scratch(
-          best_result_no_scratch.algorithm());
-    }
+    LogConvAutotuneResults(ctx->op_kernel().def(), pre_transformed_in_backprop,
+                           transformed_filter, transformed_out_backprop,
+                           stream->parent(), results);
+    OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
     AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters,
                                                algorithm_config);
   }
@@ -1178,6 +1108,7 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                         Conv2DSlowBackpropInputOp<GPUDevice, Eigen::half>);
 
 // To be used inside depthwise_conv_grad_op.cc.
+// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
 template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
 template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
 template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 0fd7550830333f749312f5db54d3ffd6ffa22a4a..9ceb51062e832a2e59455d71a0115e98896ef276 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -78,9 +78,6 @@ Status ConvBackpropExtractAndVerifyDimension(
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
 
-  // TODO(reedwm): Correctly handle explicit padding here. The rest of the
-  // fields set on 'dim' are only used in XLA. TensorFlow ops do not yet support
-  // explicit padding for XLA.
   int64 effective_filter_size = (dim->filter_size - 1) * dim->dilation + 1;
   dim->expanded_output_size = (dim->output_size - 1) * dim->stride + 1;
   const auto padded_out_size = dim->input_size + effective_filter_size - 1;
@@ -102,7 +99,7 @@ Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, const std::vector<int64>& explicit_paddings,
+    Padding padding, absl::Span<const int64> explicit_paddings,
     TensorFormat data_format, ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index c8e8cf28c55e266575738dfe9ef65d588dd0dd2f..173f92806f911edf6dca043510b1fd9b36a0a66f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -222,7 +222,7 @@ struct ConvBackpropSpatialDimension {
   int64 stride;
   int64 dilation;
 
-  // The following fields are valid only if the padding is not EXPLICIT.
+  // Output size after scaling by the stride.
   int64 expanded_output_size;
 
   // Number of padding elements to be added before/after this dimension of
@@ -270,7 +270,7 @@ Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, const std::vector<int64>& explicit_paddings,
+    Padding padding, absl::Span<const int64> explicit_paddings,
     TensorFormat data_format, ConvBackpropDimensions* dims);
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index ca46da6ba38044b50aa6299b82f9b9cacd87bb4c..48ea2a687bf27c8195e31e483e99e5c3e685a967 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1145,8 +1145,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     TensorShape input_shape;
     if (takes_shape_) {
       const Tensor& input_sizes = context->input(0);
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  input_sizes.vec<int32>(), &input_shape));
+      OP_REQUIRES_OK(context, MakeShape(input_sizes, &input_shape));
     } else {
       input_shape = context->input(0).shape();
     }
@@ -1530,8 +1529,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     TensorShape filter_shape;
     if (takes_shape_) {
       const Tensor& filter_sizes = context->input(1);
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  filter_sizes.vec<int32>(), &filter_shape));
+      OP_REQUIRES_OK(context, MakeShape(filter_sizes, &filter_shape));
     } else {
       filter_shape = context->input(1).shape();
     }
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index a8138fd0a737b40c6b7f38760cd2297a753749b4..2e6cb006a27855686f25a62a45425a2a5ca0535d 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/deep_conv2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -55,6 +55,8 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -855,8 +857,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
         errors::Unknown("Failed to get convolution algorithm. This is probably "
                         "because cuDNN failed to initialize, so try looking to "
                         "see if a warning log message was printed above."));
-    ProfileResult best_result;
-    ProfileResult best_result_no_scratch;
+    std::vector<tensorflow::AutotuneResult> results;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
@@ -871,30 +872,22 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
               .ok();
       if (cudnn_launch_status) {
         if (profile_result.is_valid()) {
-          if (profile_result.elapsed_time_in_ms() <
-              best_result.elapsed_time_in_ms()) {
-            best_result = profile_result;
-          }
-          if (scratch_allocator.TotalByteSize() == 0 &&
-              profile_result.elapsed_time_in_ms() <
-                  best_result_no_scratch.elapsed_time_in_ms()) {
-            best_result_no_scratch = profile_result;
-          }
+          results.emplace_back();
+          auto& result = results.back();
+          result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+          result.mutable_conv()->set_tensor_ops_enabled(
+              profile_algorithm.tensor_ops_enabled());
+          result.mutable_success()->set_scratch_bytes(
+              scratch_allocator.TotalByteSize());
+          *result.mutable_success()->mutable_run_time() =
+              proto_utils::ToDurationProto(
+                  absl::Milliseconds(profile_result.elapsed_time_in_ms()));
         }
       }
     }
-    // TODO(yangzihao): refactor the profile result checking code into a common
-    // utility function.
-    OP_REQUIRES(ctx,
-                best_result.is_valid() || best_result_no_scratch.is_valid(),
-                errors::NotFound("No algorithm worked!"));
-    if (best_result.is_valid()) {
-      algorithm_config.set_algorithm(best_result.algorithm());
-    }
-    if (best_result_no_scratch.is_valid()) {
-      algorithm_config.set_algorithm_no_scratch(
-          best_result_no_scratch.algorithm());
-    }
+    LogConvAutotuneResults(ctx->op_kernel().def(), input, transformed_filter,
+                           transformed_output, stream->parent(), results);
+    OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
     AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
   }
 
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 105a4b1b825e304175d62c1723aeb46154b46a96..ccd24fcdd4c5e4945f2daf6461727e6038b4dd32 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -110,7 +110,7 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
 
 // Computes and validates convolutions dimensions from Conv2D parameters. If
 // parameters are valid, dimensions will be updated with derived convolution
-// dimensions, otherwise error will be returned.
+// dimensions, otherwise an error will be returned.
 Status ComputeConv2DDimension(const Conv2DParameters& params,
                               const Tensor& input, const Tensor& filter,
                               Conv2DDimensions* dimensions);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 5a59e20cc27cb7fe7b6fc6d9fdd160f2e3c4a983..3ea4742d20626e98f20d3b7c1df808cba7ae8710 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -34,6 +34,8 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 using stream_executor::dnn::DimIndex;
 #endif
 
@@ -445,8 +447,7 @@ struct LaunchConvOp<GPUDevice, T> {
                       "because cuDNN failed to initialize, so try looking to "
                       "see if a warning log message was printed above."));
 
-      ProfileResult best_result;
-      ProfileResult best_result_no_scratch;
+      std::vector<tensorflow::AutotuneResult> results;
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
@@ -461,28 +462,22 @@ struct LaunchConvOp<GPUDevice, T> {
                 .ok();
         if (cudnn_launch_status) {
           if (profile_result.is_valid()) {
-            if (profile_result.elapsed_time_in_ms() <
-                best_result.elapsed_time_in_ms()) {
-              best_result = profile_result;
-            }
-            if (scratch_allocator.TotalByteSize() == 0 &&
-                profile_result.elapsed_time_in_ms() <
-                    best_result_no_scratch.elapsed_time_in_ms()) {
-              best_result_no_scratch = profile_result;
-            }
+            results.emplace_back();
+            auto& result = results.back();
+            result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+            result.mutable_conv()->set_tensor_ops_enabled(
+                profile_algorithm.tensor_ops_enabled());
+            result.mutable_success()->set_scratch_bytes(
+                scratch_allocator.TotalByteSize());
+            *result.mutable_success()->mutable_run_time() =
+                proto_utils::ToDurationProto(
+                    absl::Milliseconds(profile_result.elapsed_time_in_ms()));
           }
         }
       }
-      OP_REQUIRES(ctx,
-                  best_result.is_valid() || best_result_no_scratch.is_valid(),
-                  errors::NotFound("No algorithm worked!"));
-      if (best_result.is_valid()) {
-        algorithm_config.set_algorithm(best_result.algorithm());
-      }
-      if (best_result_no_scratch.is_valid()) {
-        algorithm_config.set_algorithm_no_scratch(
-            best_result_no_scratch.algorithm());
-      }
+      LogConvAutotuneResults(ctx->op_kernel().def(), input, filter, *output,
+                             stream->parent(), results);
+      OP_REQUIRES_OK(ctx, BestCudnnConvAlgorithm(results, &algorithm_config));
       AutoTuneConv3d::GetInstance()->Insert(conv_parameters, algorithm_config);
     }
 
diff --git a/tensorflow/core/kernels/conv_ops_fused_double.cc b/tensorflow/core/kernels/conv_ops_fused_double.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ff5627dc2096c69019099eb642bdaca5df5ef84
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_double.cc
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This include can't be in the conv_ops_fused_impl.h headers. See b/62899350.
+#if GOOGLE_CUDA
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#endif  // GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_fused_impl.h"
+
+namespace tensorflow {
+
+// If we're using the alternative GEMM-based implementation of Conv2D for the
+// CPU implementation, don't register this EigenTensor-based version.
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
+TF_CALL_double(REGISTER_FUSED_CPU_CONV2D);
+#endif  // !USE_GEMM_FOR_CONV
+
+#if GOOGLE_CUDA
+
+namespace functor {
+DECLARE_FUNCTOR_GPU_SPEC(double);
+}  // namespace functor
+
+TF_CALL_double(REGISTER_FUSED_GPU_CONV2D);
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_float.cc b/tensorflow/core/kernels/conv_ops_fused_float.cc
new file mode 100644
index 0000000000000000000000000000000000000000..40f2eb3bbec1fa186328b781463e2b24fd667f1e
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_float.cc
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This include can't be in the conv_ops_fused_impl.h headers. See b/62899350.
+#if GOOGLE_CUDA
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#endif  // GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_fused_impl.h"
+
+namespace tensorflow {
+
+// If we're using the alternative GEMM-based implementation of Conv2D for the
+// CPU implementation, don't register this EigenTensor-based version.
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
+TF_CALL_float(REGISTER_FUSED_CPU_CONV2D);
+#endif  // !USE_GEMM_FOR_CONV
+
+#if GOOGLE_CUDA
+
+namespace functor {
+DECLARE_FUNCTOR_GPU_SPEC(float);
+}  // namespace functor
+
+TF_CALL_float(REGISTER_FUSED_GPU_CONV2D);
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_half.cc b/tensorflow/core/kernels/conv_ops_fused_half.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5086b2b6f1b908366d92485219e70cde4dd3ac5a
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_half.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This include can't be in the conv_ops_fused_impl.h headers. See b/62899350.
+#if GOOGLE_CUDA
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#endif  // GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_fused_impl.h"
+
+namespace tensorflow {
+
+#if GOOGLE_CUDA
+
+namespace functor {
+DECLARE_FUNCTOR_GPU_SPEC(Eigen::half);
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
index 7be1de29c951dca16085e35587d02eeeec01354f..c1c3b555d64086f0d80aaa33c8c30a6e282b3a31 100644
--- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
@@ -102,7 +102,7 @@ void FusedConvParallelFor(
 // Holds the state needed for the resizing subtasks.
 template <class T1>
 struct ResizeTaskParameters {
-  ResizeTaskParameters() : st(false) {}
+  ResizeTaskParameters() : st(false, false) {}
 
   int cache_height;
   T1* resize_cache;
@@ -649,9 +649,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     OP_REQUIRES(context, (input.shape().num_elements() > 0),
                 errors::InvalidArgument("Input tensor can't be empty"));
 
-    ImageResizerState st(false);
+    ImageResizerState st(false, false);
     if (DoResize) {
-      st = ImageResizerState(align_corners_);
+      st = ImageResizerState(align_corners_, false);
       st.ValidateAndCalculateOutputSize(context, input);
       if (!context->status().ok()) return;
     } else {
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused_impl.h
similarity index 94%
rename from tensorflow/core/kernels/conv_ops_fused.cc
rename to tensorflow/core/kernels/conv_ops_fused_impl.h
index 551782c488cdd91c1cf54c35ffe61d1e12e41004..f207af655651ad020a2d14e753642603511bf86f 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -28,6 +28,9 @@ limitations under the License.
 //
 // NOTE: GPU only supports fusion of Conv2D + BiasAdd + <optional Relu>.
 
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
+
 #define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
 
@@ -41,11 +44,11 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -56,14 +59,16 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 
+class AutotuneResult;
+
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-namespace {
 // Supported Conv2D fusions. Not all of them supported on all type of devices.
 enum class FusedComputationType {
   // NOTE(ezhulenev): CuDNN `cudnnConvolutionBiasActivationForward` supports
@@ -463,12 +468,12 @@ class FusedConvParameters : public ConvParameters {
   se::dnn::ActivationMode activation_mode_;
 };
 
-bool operator==(const FusedConvParameters& lhs,
+inline bool operator==(const FusedConvParameters& lhs,
                 const FusedConvParameters& rhs) {
   return lhs.get_data_as_tuple() == rhs.get_data_as_tuple();
 }
 
-bool operator!=(const FusedConvParameters& lhs,
+inline bool operator!=(const FusedConvParameters& lhs,
                 const FusedConvParameters& rhs) {
   return !(lhs == rhs);
 }
@@ -482,7 +487,7 @@ using AutoTuneFusedConv =
     AutoTuneSingleton<FusedConvAutoTuneGroup, FusedConvParameters,
                       se::dnn::AlgorithmConfig>;
 
-int64 ConvolveScratchSize() {
+inline int64 ConvolveScratchSize() {
   static int64 convolve_scratch_size = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
@@ -494,10 +499,11 @@ int64 ConvolveScratchSize() {
 // convolution on the stream) and parameters, by running all possible
 // algorithms and measuring execution time.
 // TODO(ezhulenev): Move it to conv_ops_gpu.h and share with conv_ops.cc.
-template <typename T, typename ConvLaunch>
+template <typename T, typename ConvLaunch, typename LogFunc>
 Status FindBestConvolveAlgorithm(const FusedConvParameters& params,
                                  const ConvLaunch launch,
                                  OpKernelContext* context, se::Stream* stream,
+                                 const LogFunc& log,
                                  se::dnn::AlgorithmConfig* algorithm_config) {
   // Check if we already have an algorithm selected for the given parameters.
   if (AutoTuneFusedConv::GetInstance()->Find(params, algorithm_config)) {
@@ -515,9 +521,7 @@ Status FindBestConvolveAlgorithm(const FusedConvParameters& params,
         "see if a warning log message was printed above.");
   }
 
-  se::dnn::ProfileResult best_result;
-  se::dnn::ProfileResult best_result_no_scratch;
-
+  std::vector<tensorflow::AutotuneResult> results;
   for (auto profile_algorithm : algorithms) {
     DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
     se::dnn::ProfileResult profile_result;
@@ -527,29 +531,21 @@ Status FindBestConvolveAlgorithm(const FusedConvParameters& params,
                &profile_result);
 
     if (cudnn_launch_status && profile_result.is_valid()) {
-      if (profile_result.elapsed_time_in_ms() <
-          best_result.elapsed_time_in_ms()) {
-        best_result = profile_result;
-      }
-      if (scratch_allocator.TotalByteSize() == 0 &&
-          profile_result.elapsed_time_in_ms() <
-              best_result_no_scratch.elapsed_time_in_ms()) {
-        best_result_no_scratch = profile_result;
-      }
+      results.emplace_back();
+      auto& result = results.back();
+      result.mutable_conv()->set_algorithm(profile_algorithm.algo_id());
+      result.mutable_conv()->set_tensor_ops_enabled(
+          profile_algorithm.tensor_ops_enabled());
+      result.mutable_success()->set_scratch_bytes(
+          scratch_allocator.TotalByteSize());
+      *result.mutable_success()->mutable_run_time() =
+          proto_utils::ToDurationProto(
+              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
     }
   }
-
-  if (!best_result.is_valid() && !best_result_no_scratch.is_valid()) {
-    return errors::NotFound("No algorithm worked!");
-  }
-  if (best_result.is_valid()) {
-    algorithm_config->set_algorithm(best_result.algorithm());
-  }
-  if (best_result_no_scratch.is_valid()) {
-    algorithm_config->set_algorithm_no_scratch(
-        best_result_no_scratch.algorithm());
-  }
-
+  // Only log on an AutoTuneFusedConv cache miss.
+  log(results);
+  TF_RETURN_IF_ERROR(BestCudnnConvAlgorithm(results, algorithm_config));
   AutoTuneFusedConv::GetInstance()->Insert(params, *algorithm_config);
   return Status::OK();
 }
@@ -796,9 +792,15 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
 
     se::dnn::AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune) {
-      OP_REQUIRES_OK(context, FindBestConvolveAlgorithm<T>(
-                                  conv_parameters, launch, context, stream,
-                                  &algorithm_config));
+      auto status = FindBestConvolveAlgorithm<T>(
+          conv_parameters, launch, context, stream,
+          [&](absl::Span<const tensorflow::AutotuneResult> results) {
+            LogFusedConvAutotuneResults(
+                context->op_kernel().def(), input, transformed_filter,
+                transformed_output, bias, nullptr, stream->parent(), results);
+          },
+          &algorithm_config);
+      OP_REQUIRES_OK(context, status);
     }
 
     DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
@@ -822,8 +824,6 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
 
 #endif  // GOOGLE_CUDA
 
-}  // namespace
-
 template <typename Device, typename T>
 class FusedConv2DOp : public OpKernel {
  public:
@@ -962,22 +962,9 @@ class FusedConv2DOp : public OpKernel {
       Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       FusedConv2DOp<CPUDevice, T>);
 
-// If we're using the alternative GEMM-based implementation of Conv2D for the
-// CPU implementation, don't register this EigenTensor-based version.
-// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
-// contractions with non-default contraction output kernels.
-#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
-TF_CALL_float(REGISTER_FUSED_CPU_CONV2D);
-TF_CALL_double(REGISTER_FUSED_CPU_CONV2D);
-#endif  // !USE_GEMM_FOR_CONV
-
-#undef REGISTER_FUSED_CPU_CONV2D
-
 #if GOOGLE_CUDA
 
-// Forward declarations of the functor specializations for GPU.
-namespace functor {
-#define DECLARE_GPU_SPEC(T)                                              \
+#define DECLARE_FUNCTOR_GPU_SPEC(T)                                      \
   template <>                                                            \
   void TransformFilter<GPUDevice, T, int, 4>::operator()(                \
       const GPUDevice& d, FilterTensorFormat dst_filter_format,          \
@@ -992,23 +979,14 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
-DECLARE_GPU_SPEC(float);
-DECLARE_GPU_SPEC(Eigen::half);
-DECLARE_GPU_SPEC(double);
-#undef DECLARE_GPU_SPEC
-}  // namespace functor
-
 // Registration of the GPU implementations.
 #define REGISTER_FUSED_GPU_CONV2D(T)                                  \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("_FusedConv2D").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       FusedConv2DOp<GPUDevice, T>);
 
-TF_CALL_float(REGISTER_FUSED_GPU_CONV2D);
-TF_CALL_double(REGISTER_FUSED_GPU_CONV2D);
-
-#undef REGISTER_FUSED_GPU_CONV2D
-
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7a67658c4d88b9a5dc66635527f97719773e6f83..9aa395a0d8c8d98bf71542fe12a18eebf90ad92d 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -92,12 +92,12 @@ class ConvParameters {
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
-        in_(in),
+        in_(CheckSpatialArraySize(in)),
         data_format_(data_format),
-        filter_(filter),
-        dilation_(dilation),
-        stride_(stride),
-        padding_(padding),
+        filter_(CheckSpatialArraySize(filter)),
+        dilation_(CheckSpatialArraySize(dilation)),
+        stride_(CheckSpatialArraySize(stride)),
+        padding_(CheckSpatialArraySize(padding)),
         dtype_(dtype),
         device_id_(device_id) {
     hash_code_ = batch;
@@ -170,6 +170,11 @@ class ConvParameters {
  private:
   friend struct ConvParametersPeer;  // For testing purposes.
 
+  static const SpatialArray& CheckSpatialArraySize(const SpatialArray& array) {
+    CHECK_LE(array.size(), 3);  // Catch corruptions related to b/124313574.
+    return array;
+  }
+
   template <typename T>
   bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
     int64 total_size = 16 * std::ceil(batch_ / 16.0) *
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index af0a9fa82ee5778fa9e18cea59cf759fa468224f..05df9e0207e505bfd5b9a3bc9c5b7b2c90a0fa30 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include <string.h>
 #include <map>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -57,7 +58,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
 #include "tensorflow/core/kernels/image_resizer_state.h"
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 99d01b4db6bac68d890d93ac55bea576f43a5994..838cedd7a4aeeee4b1871bf4c64bbc0c871fdac9 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
index 8ab08fb93aeef2651f2911047d91216c85392705..427e6562d0db5dfdf35e73820155b011ac2dcdcf 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
@@ -371,12 +371,12 @@ struct CropAndResize<GPUDevice, T> {
 
     if (total_count > 0) {
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-      CropAndResizeKernel<<<config.block_count, config.thread_per_block, 0,
-                            d.stream()>>>(
-          config.virtual_thread_count, image.data(), boxes.data(),
-          box_ind.data(), num_boxes, batch, image_height, image_width,
-          crop_height, crop_width, depth, method, extrapolation_value,
-          crops.data());
+      TF_CHECK_OK(CudaLaunchKernel(
+          CropAndResizeKernel<T>, config.block_count, config.thread_per_block,
+          0, d.stream(), config.virtual_thread_count, image.data(),
+          boxes.data(), box_ind.data(), num_boxes, batch, image_height,
+          image_width, crop_height, crop_width, depth, method,
+          extrapolation_value, crops.data()));
     }
     return d.ok();
   }
@@ -406,11 +406,12 @@ struct CropAndResizeBackpropImage<GPUDevice, T> {
     total_count = batch * image_height * image_width * depth;
     if (total_count > 0) {
       config = GetCudaLaunchConfig(total_count, d);
-      SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          config.virtual_thread_count, grads_image.data());
+      TF_CHECK_OK(CudaLaunchKernel(
+          SetZero<T>, config.block_count, config.thread_per_block, 0,
+          d.stream(), config.virtual_thread_count, grads_image.data()));
     }
 
-    // Configurate interpolation method.
+    // Configure interpolation method.
     InterpolationMethod method = BILINEAR;
     if (method_name == "nearest") {
       method = NEAREST;
@@ -420,11 +421,12 @@ struct CropAndResizeBackpropImage<GPUDevice, T> {
     total_count = num_boxes * crop_height * crop_width * depth;
     if (total_count > 0) {
       config = GetCudaLaunchConfig(total_count, d);
-      CropAndResizeBackpropImageKernel<<<
-          config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          config.virtual_thread_count, grads.data(), boxes.data(),
-          box_ind.data(), num_boxes, batch, image_height, image_width,
-          crop_height, crop_width, depth, grads_image.data(), method);
+      TF_CHECK_OK(CudaLaunchKernel(
+          CropAndResizeBackpropImageKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          grads.data(), boxes.data(), box_ind.data(), num_boxes, batch,
+          image_height, image_width, crop_height, crop_width, depth,
+          grads_image.data(), method));
     }
     return d.ok();
   }
@@ -454,19 +456,21 @@ struct CropAndResizeBackpropBoxes<GPUDevice, T> {
     total_count = num_boxes * 4;
     if (total_count > 0) {
       config = GetCudaLaunchConfig(total_count, d);
-      SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          config.virtual_thread_count, grads_boxes.data());
+      TF_CHECK_OK(CudaLaunchKernel(
+          SetZero<float>, config.block_count, config.thread_per_block, 0,
+          d.stream(), config.virtual_thread_count, grads_boxes.data()));
     }
 
     // Accumulate.
     total_count = num_boxes * crop_height * crop_width * depth;
     if (total_count > 0) {
       config = GetCudaLaunchConfig(total_count, d);
-      CropAndResizeBackpropBoxesKernel<<<
-          config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          config.virtual_thread_count, grads.data(), image.data(), boxes.data(),
-          box_ind.data(), num_boxes, batch, image_height, image_width,
-          crop_height, crop_width, depth, grads_boxes.data());
+      TF_CHECK_OK(CudaLaunchKernel(
+          CropAndResizeBackpropBoxesKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          grads.data(), image.data(), boxes.data(), box_ind.data(), num_boxes,
+          batch, image_height, image_width, crop_height, crop_width, depth,
+          grads_boxes.data()));
     }
     return d.ok();
   }
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 6921020d09e94fa7b99d7ca6cb95c82274b2e4c0..0eadf4c1714f6987f0a91c153f59d56ce0254014 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -423,7 +423,7 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 999 lines)
   //  0, 1, 2, ..., 998
   AddInput<float>(TensorShape({1, kLength, kLength, 1}),
-                  [kLength](int i) -> float { return i % kLength; });
+                  [](int i) -> float { return i % kLength; });
   AddInputFromArray<float>(TensorShape({2, 4}),
                            {0, 0, 0.5, 0.5, 0.5, 0.5, 1, 1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
@@ -437,15 +437,15 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 500 lines)
   //  0, 1, 2, ..., 499
   Tensor result1(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(&result1, [kHalf](int i) -> float { return i % kHalf; });
+  test::FillFn<float>(&result1, [](int i) -> float { return i % kHalf; });
 
   // Result 2:
   //  499, 500, 501, ..., 998
   //  ... (altogether 500 lines)
   //  499, 500, 501, ..., 998
   Tensor result2(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(
-      &result2, [kHalf](int i) -> float { return i % kHalf + kHalf - 1; });
+  test::FillFn<float>(&result2,
+                      [](int i) -> float { return i % kHalf + kHalf - 1; });
 
   // Expected result is the concat of the two tensors.
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, kHalf, kHalf, 1}));
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb375ee4b351e4d15c234f9290ecc8780b096c32..aa68e105addab65cdc3ad468547e6e1273834077 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/ctc_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_loss_calculator.h"
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 39d0a998fdcfe0710af97e404e142955e57a7c2b..82d92388d401af176d6a555f4f0e51af84caef11 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -643,6 +643,50 @@ static inline Status GesvdImpl(
 
 TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
 
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status GesvdjBatchedImpl(BufSizeFnT bufsize, SolverFnT solver,
+                                       CudaSolver* cuda_solver,
+                                       OpKernelContext* context,
+                                       cusolverDnHandle_t cusolver_dn_handle,
+                                       cusolverEigMode_t jobz, int m, int n,
+                                       Scalar* A, int lda, Scalar* S, Scalar* U,
+                                       int ldu, Scalar* V, int ldv,
+                                       int* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
+  /* Get amount of workspace memory required. */
+  int lwork;
+  /* Default parameters for gesvdj and gesvdjBatched. */
+  gesvdjInfo_t svdj_info;
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnCreateGesvdjInfo(&svdj_info));
+  TF_RETURN_IF_CUSOLVER_ERROR(bufsize(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, &lwork, svdj_info, batch_size));
+  /* Allocate device memory for workspace. */
+  auto dev_workspace =
+      cuda_solver->GetScratchSpace<Scalar>(lwork, "", /* on_host */ false);
+  TF_RETURN_IF_CUSOLVER_ERROR(solver(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, CUDAComplex(dev_workspace.mutable_data()),
+      lwork, dev_lapack_info, svdj_info, batch_size));
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnDestroyGesvdjInfo(svdj_info));
+  return Status::OK();
+}
+
+#define GESVDJBATCHED_INSTANCE(Scalar, type_prefix)                            \
+  template <>                                                                  \
+  Status CudaSolver::GesvdjBatched<Scalar>(                                    \
+      cusolverEigMode_t jobz, int m, int n, Scalar* dev_A, int lda,            \
+      Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_V, int ldv,           \
+      int* dev_lapack_info, int batch_size) {                                  \
+    return GesvdjBatchedImpl(DN_BUFSIZE_FN(gesvdjBatched, type_prefix),        \
+                             DN_SOLVER_FN(gesvdjBatched, type_prefix), this,   \
+                             context_, cusolver_dn_handle_, jobz, m, n, dev_A, \
+                             lda, dev_S, dev_U, ldu, dev_V, ldv,               \
+                             dev_lapack_info, batch_size);                     \
+  }
+
+TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVDJBATCHED_INSTANCE);
+
 //=============================================================================
 // Wrappers of cuBlas computational methods begin here.
 //
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 1fc344731c28df2e2d4cb9e931accfc0ca4592ed..fa8b4e241556afef82537db118706ebd35539987 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -312,6 +312,11 @@ class CudaSolver {
   Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
                int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
                int ldvt, int* dev_lapack_info) TF_MUST_USE_RESULT;
+  template <typename Scalar>
+  Status GesvdjBatched(cusolverEigMode_t jobz, int m, int n, Scalar* dev_A,
+                       int lda, Scalar* dev_S, Scalar* dev_U, int ldu,
+                       Scalar* dev_V, int ldv, int* dev_lapack_info,
+                       int batch_size);
 
  private:
   OpKernelContext* context_;  // not owned.
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 196494cbcf8b7f4f670599241d5bdbb1c29c7cd1..d43fe74733351b6937c23edb5abaee54cdf06ba0 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -559,7 +559,7 @@ struct RnnScratchSpace {
 // Extract and checks the forward input tensors, parameters, and shapes from the
 // OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
-                           const CudnnModelTypes& model_types,
+                           const CudnnModelTypes& model_types, bool time_major,
                            const Tensor** input, const Tensor** input_h,
                            const Tensor** input_c, const Tensor** params,
                            CudnnRnnModelShapes* model_shapes) {
@@ -573,8 +573,13 @@ Status ExtractForwardInput(OpKernelContext* context,
   if ((*input)->dims() != 3) {
     return errors::InvalidArgument("RNN input must be a 3-D vector.");
   }
-  model_shapes->max_seq_length = (*input)->dim_size(0);
-  model_shapes->batch_size = (*input)->dim_size(1);
+  if (time_major) {
+    model_shapes->max_seq_length = (*input)->dim_size(0);
+    model_shapes->batch_size = (*input)->dim_size(1);
+  } else {
+    model_shapes->max_seq_length = (*input)->dim_size(1);
+    model_shapes->batch_size = (*input)->dim_size(0);
+  }
   model_shapes->input_size = (*input)->dim_size(2);
   model_shapes->input_shape = (*input)->shape();
   model_shapes->dir_count =
@@ -585,12 +590,25 @@ Status ExtractForwardInput(OpKernelContext* context,
   if ((*input_h)->dims() != 3) {
     return errors::InvalidArgument("RNN input_h must be a 3-D vector.");
   }
-  model_shapes->num_layers = (*input_h)->dim_size(0) / model_shapes->dir_count;
+  if (time_major) {
+    model_shapes->num_layers =
+        (*input_h)->dim_size(0) / model_shapes->dir_count;
+  } else {
+    model_shapes->num_layers =
+        (*input_h)->dim_size(1) / model_shapes->dir_count;
+  }
   model_shapes->num_units = (*input_h)->dim_size(2);
 
-  model_shapes->hidden_state_shape =
-      TensorShape({model_shapes->dir_count * model_shapes->num_layers,
-                   model_shapes->batch_size, model_shapes->num_units});
+  if (time_major) {
+    model_shapes->hidden_state_shape =
+        TensorShape({model_shapes->dir_count * model_shapes->num_layers,
+                     model_shapes->batch_size, model_shapes->num_units});
+  } else {
+    model_shapes->hidden_state_shape =
+        TensorShape({model_shapes->batch_size,
+                     model_shapes->dir_count * model_shapes->num_layers,
+                     model_shapes->num_units});
+  }
   if ((*input_h)->shape() != model_shapes->hidden_state_shape) {
     return errors::InvalidArgument(
         "Invalid input_h shape: ", (*input_h)->shape().DebugString(), " ",
@@ -604,23 +622,28 @@ Status ExtractForwardInput(OpKernelContext* context,
           (*input_c)->shape().DebugString());
     }
   }
-  model_shapes->output_shape =
-      TensorShape({model_shapes->max_seq_length, model_shapes->batch_size,
-                   model_shapes->dir_count * model_shapes->num_units});
+  if (time_major) {
+    model_shapes->output_shape =
+        TensorShape({model_shapes->max_seq_length, model_shapes->batch_size,
+                     model_shapes->dir_count * model_shapes->num_units});
+  } else {
+    model_shapes->output_shape =
+        TensorShape({model_shapes->batch_size, model_shapes->max_seq_length,
+                     model_shapes->dir_count * model_shapes->num_units});
+  }
   return Status::OK();
 }
 
-// Extract and checks the sequence_lengths, forward input tensors,
-// parameters, and shapes from the OpKernelContext.
+// Overloaded function to process the sequence_lengths
 Status ExtractForwardInput(OpKernelContext* context,
-                           const CudnnModelTypes& model_types,
+                           const CudnnModelTypes& model_types, bool time_major,
                            const Tensor** input, const Tensor** input_h,
                            const Tensor** input_c, const Tensor** params,
-                           CudnnRnnModelShapes* model_shapes,
-                           const Tensor** sequence_lengths) {
+                           const Tensor** sequence_lengths,
+                           CudnnRnnModelShapes* model_shapes) {
   TF_RETURN_IF_ERROR(context->input("sequence_lengths", sequence_lengths));
-  return ExtractForwardInput(context, model_types, input, input_h, input_c,
-                             params, model_shapes);
+  return ExtractForwardInput(context, model_types, time_major, input, input_h,
+                             input_c, params, model_shapes);
 }
 
 template <typename T>
@@ -629,7 +652,7 @@ Status CreateForwardAndBackwardIODescriptors(
     std::unique_ptr<RnnSequenceTensorDescriptor>* input_desc,
     std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
     std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc,
-    const absl::Span<const int>& seq_lengths) {
+    const absl::Span<const int>& seq_lengths, bool time_major) {
   StreamExecutor* executor = context->op_device_context()->stream()->parent();
   se::dnn::DataType data_type = ToDataType<T>::value;
 
@@ -639,11 +662,19 @@ Status CreateForwardAndBackwardIODescriptors(
 
   DCHECK_EQ(input_shape.dims(), 3);
   if (seq_lengths.data() != nullptr) {
-    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-        input_shape.dim_size(0), input_shape.dim_size(1),
-        input_shape.dim_size(2), seq_lengths, data_type);
-    TF_RETURN_IF_ERROR(input_desc_s.status());
-    *input_desc = input_desc_s.ConsumeValueOrDie();
+    if (time_major) {
+      auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+          input_shape.dim_size(0), input_shape.dim_size(1),
+          input_shape.dim_size(2), seq_lengths, time_major, data_type);
+      TF_RETURN_IF_ERROR(input_desc_s.status());
+      *input_desc = input_desc_s.ConsumeValueOrDie();
+    } else {
+      auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+          input_shape.dim_size(1), input_shape.dim_size(0),
+          input_shape.dim_size(2), seq_lengths, time_major, data_type);
+      TF_RETURN_IF_ERROR(input_desc_s.status());
+      *input_desc = input_desc_s.ConsumeValueOrDie();
+    }
   } else {
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -653,19 +684,35 @@ Status CreateForwardAndBackwardIODescriptors(
   }
 
   DCHECK_EQ(hidden_state_shape.dims(), 3);
-  auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
-      hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
-      hidden_state_shape.dim_size(2), data_type);
-  TF_RETURN_IF_ERROR(hidden_state_desc_s.status());
-  *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
+  if (time_major) {
+    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
+        hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1),
+        hidden_state_shape.dim_size(2), data_type);
+    TF_RETURN_IF_ERROR(hidden_state_desc_s.status());
+    *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
+  } else {
+    auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
+        hidden_state_shape.dim_size(1), hidden_state_shape.dim_size(0),
+        hidden_state_shape.dim_size(2), data_type);
+    TF_RETURN_IF_ERROR(hidden_state_desc_s.status());
+    *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
+  }
 
   DCHECK_EQ(output_shape.dims(), 3);
   if (seq_lengths.data() != nullptr) {
-    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-        output_shape.dim_size(0), output_shape.dim_size(1),
-        output_shape.dim_size(2), seq_lengths, data_type);
-    TF_RETURN_IF_ERROR(output_desc_s.status());
-    *output_desc = output_desc_s.ConsumeValueOrDie();
+    if (time_major) {
+      auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+          output_shape.dim_size(0), output_shape.dim_size(1),
+          output_shape.dim_size(2), seq_lengths, time_major, data_type);
+      TF_RETURN_IF_ERROR(output_desc_s.status());
+      *output_desc = output_desc_s.ConsumeValueOrDie();
+    } else {
+      auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+          output_shape.dim_size(1), output_shape.dim_size(0),
+          output_shape.dim_size(2), seq_lengths, time_major, data_type);
+      TF_RETURN_IF_ERROR(output_desc_s.status());
+      *output_desc = output_desc_s.ConsumeValueOrDie();
+    }
   } else {
     auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
         output_shape.dim_size(0), output_shape.dim_size(1),
@@ -687,7 +734,7 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
                  const bool is_training,
                  /* forward outputs, outputs of the function */
                  Tensor* output, Tensor* output_h, Tensor* output_c,
-                 const Tensor* sequence_lengths,
+                 const Tensor* sequence_lengths, bool time_major,
                  ScratchAllocator* reserve_space_allocator,
                  ScratchAllocator* workspace_allocator,
                  ProfileResult* output_profile_result) {
@@ -702,7 +749,7 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
   }
   TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
       context, model_shapes, &input_desc, &state_desc, &output_desc,
-      seq_lengths));
+      seq_lengths, time_major));
 
   auto input_data = AsDeviceMemory<T>(input);
   auto input_h_data = AsDeviceMemory<T>(input_h);
@@ -750,7 +797,7 @@ Status DoBackward(
     const Tensor* output_c_backprop, const Tensor* reserve_space,
     /* backprop outputs, output of the function */
     Tensor* input_backprop, Tensor* input_h_backprop, Tensor* input_c_backprop,
-    Tensor* params_backprop, const Tensor* sequence_lengths,
+    Tensor* params_backprop, const Tensor* sequence_lengths, bool time_major,
     ScratchAllocator* workspace_allocator,
     ProfileResult* output_profile_result) {
   std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
@@ -764,7 +811,7 @@ Status DoBackward(
   }
   TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
       context, model_shapes, &input_desc, &state_desc, &output_desc,
-      seq_lengths));
+      seq_lengths, time_major));
 
   auto input_data = AsDeviceMemory<T>(input);
   auto input_h_data = AsDeviceMemory<T>(input_h);
@@ -1216,13 +1263,15 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
   void Compute(OpKernelContext* context) override {
     AlgorithmConfig algo_config;
-    ComputeAndReturnAlgorithm(context, &algo_config, false);
+    ComputeAndReturnAlgorithm(context, &algo_config, /*var_seq_lengths=*/false,
+                              /*time_major=*/true);
   }
 
  protected:
   virtual void ComputeAndReturnAlgorithm(OpKernelContext* context,
                                          AlgorithmConfig* output_algo_config,
-                                         bool var_seq_lengths) {
+                                         bool var_seq_lengths,
+                                         bool time_major) {
     CHECK_NE(output_algo_config, nullptr);
 
     const Tensor* input = nullptr;
@@ -1232,14 +1281,14 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* sequence_lengths = nullptr;
     CudnnRnnModelShapes model_shapes;
     if (var_seq_lengths) {
-      OP_REQUIRES_OK(
-          context, ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes,
-                                       &sequence_lengths));
+      OP_REQUIRES_OK(context,
+                     ExtractForwardInput(context, model_types(), time_major,
+                                         &input, &input_h, &input_c, &params,
+                                         &sequence_lengths, &model_shapes));
     } else {
-      OP_REQUIRES_OK(
-          context, ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes));
+      OP_REQUIRES_OK(context, ExtractForwardInput(
+                                  context, model_types(), time_major, &input,
+                                  &input_h, &input_c, &params, &model_shapes));
     }
     RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
@@ -1278,19 +1327,11 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
                                              *output_algo_config,
                                              &rnn_state_cache_, &rnn_desc_ptr));
-      if (var_seq_lengths) {
-        launch_status = DoForward<T>(
-            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-            input_c, params, is_training_, output, output_h, output_c,
-            sequence_lengths, &reserve_space_allocator, &workspace_allocator,
-            /*output_profile_result=*/nullptr);
-      } else {
-        launch_status = DoForward<T>(
-            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-            input_c, params, is_training_, output, output_h, output_c, nullptr,
-            &reserve_space_allocator, &workspace_allocator,
-            /*output_profile_result=*/nullptr);
-      }
+      launch_status = DoForward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, is_training_, output, output_h, output_c,
+          sequence_lengths, time_major, &reserve_space_allocator,
+          &workspace_allocator, /*output_profile_result=*/nullptr);
     }
     OP_REQUIRES_OK(context, launch_status);
   }
@@ -1372,7 +1413,8 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
   void Compute(OpKernelContext* context) override {
     AlgorithmConfig best_algo_config;
     CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
-        context, &best_algo_config, false);
+        context, &best_algo_config, /*var_seq_lengths=*/false,
+        /*time_major=*/true);
     if (!context->status().ok()) {
       return;
     }
@@ -1490,10 +1532,11 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       // Again use temp scratch allocator during profiling.
       CudnnRnnAllocatorInTemp<T> reserve_space_allocator(context);
       CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
-      status = DoForward<T>(
-          context, *rnn_desc, model_types(), model_shapes, input, input_h,
-          input_c, params, is_training(), output, output_h, output_c, nullptr,
-          &reserve_space_allocator, &workspace_allocator, &fwd_profile_result);
+      status = DoForward<T>(context, *rnn_desc, model_types(), model_shapes,
+                            input, input_h, input_c, params, is_training(),
+                            output, output_h, output_c, nullptr, true,
+                            &reserve_space_allocator, &workspace_allocator,
+                            &fwd_profile_result);
       if (!status.ok()) {
         continue;
       }
@@ -1506,7 +1549,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
             input_c, params, output, output_h, output_c, &output_backprop,
             &output_h_backprop, &output_c_backprop, &reserve_space,
             &input_backprop, &input_h_backprop, &input_c_backprop,
-            &params_backprop, nullptr, &workspace_allocator,
+            &params_backprop, nullptr, true, &workspace_allocator,
             &bak_profile_result);
         if (!status.ok()) {
           continue;
@@ -1561,15 +1604,22 @@ class CudnnRNNForwardOpV3<GPUDevice, T>
   using CudnnRNNKernelCommon::dropout;
   using CudnnRNNKernelCommon::HasInputC;
   using CudnnRNNKernelCommon::model_types;
+  bool time_major_;
+
+ protected:
+  bool time_major() { return time_major_; }
 
  public:
   explicit CudnnRNNForwardOpV3(OpKernelConstruction* context)
-      : CudnnRNNForwardOp<GPUDevice, T>(context) {}
+      : CudnnRNNForwardOp<GPUDevice, T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("time_major", &time_major_));
+  }
 
   void Compute(OpKernelContext* context) override {
     AlgorithmConfig best_algo_config;
     CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
-        context, &best_algo_config, true);
+        context, &best_algo_config, /*var_seq_lengths=*/true,
+        /*time_major=*/time_major());
     if (!context->status().ok()) {
       return;
     }
@@ -1604,11 +1654,12 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       : CudnnRNNKernelCommon(context) {}
 
   void Compute(OpKernelContext* context) override {
-    ComputeImpl(context, false);
+    ComputeImpl(context, false, true);
   }
 
  protected:
-  virtual void ComputeImpl(OpKernelContext* context, bool var_seq_lengths) {
+  virtual void ComputeImpl(OpKernelContext* context, bool var_seq_lengths,
+                           bool time_major) {
     const Tensor* input = nullptr;
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
@@ -1616,14 +1667,14 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* sequence_lengths = nullptr;
     CudnnRnnModelShapes model_shapes;
     if (var_seq_lengths) {
-      OP_REQUIRES_OK(
-          context, ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes,
-                                       &sequence_lengths));
+      OP_REQUIRES_OK(context,
+                     ExtractForwardInput(context, model_types(), time_major,
+                                         &input, &input_h, &input_c, &params,
+                                         &sequence_lengths, &model_shapes));
     } else {
-      OP_REQUIRES_OK(
-          context, ExtractForwardInput(context, model_types(), &input, &input_h,
-                                       &input_c, &params, &model_shapes));
+      OP_REQUIRES_OK(context, ExtractForwardInput(
+                                  context, model_types(), time_major, &input,
+                                  &input_h, &input_c, &params, &model_shapes));
     }
     RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
@@ -1665,22 +1716,13 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
                                              algo_config, &rnn_state_cache_,
                                              &rnn_desc_ptr));
-      if (var_seq_lengths) {
-        launch_status = DoBackward<T>(
-            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-            input_c, params, output, output_h, output_c, output_backprop,
-            output_h_backprop, output_c_backprop, reserve_space, input_backprop,
-            input_h_backprop, input_c_backprop, params_backprop,
-            sequence_lengths, &workspace_allocator,
-            /*output_profile_result=*/nullptr);
-      } else {
-        launch_status = DoBackward<T>(
-            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-            input_c, params, output, output_h, output_c, output_backprop,
-            output_h_backprop, output_c_backprop, reserve_space, input_backprop,
-            input_h_backprop, input_c_backprop, params_backprop, nullptr,
-            &workspace_allocator, /*output_profile_result=*/nullptr);
-      }
+      launch_status = DoBackward<T>(
+          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+          input_c, params, output, output_h, output_c, output_backprop,
+          output_h_backprop, output_c_backprop, reserve_space, input_backprop,
+          input_h_backprop, input_c_backprop, params_backprop, sequence_lengths,
+          time_major, &workspace_allocator,
+          /*output_profile_result=*/nullptr);
     }
     OP_REQUIRES_OK(context, launch_status);
   }
@@ -1827,12 +1869,20 @@ TF_CALL_double(REGISTER_GPU);
 template <typename T>
 class CudnnRNNBackwardOpV3<GPUDevice, T>
     : public CudnnRNNBackwardOp<GPUDevice, T> {
+ private:
+  bool time_major_;
+
+ protected:
+  bool time_major() { return time_major_; }
+
  public:
   explicit CudnnRNNBackwardOpV3(OpKernelConstruction* context)
-      : CudnnRNNBackwardOp<GPUDevice, T>(context) {}
+      : CudnnRNNBackwardOp<GPUDevice, T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("time_major", &time_major_));
+  }
 
   void Compute(OpKernelContext* context) override {
-    CudnnRNNBackwardOp<GPUDevice, T>::ComputeImpl(context, true);
+    CudnnRNNBackwardOp<GPUDevice, T>::ComputeImpl(context, true, time_major());
   }
 };
 
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
index 44dea7dee90822a332abbbd39b1d07a06a02b521..0122ffc6fab26926f672c59ebfc975175654c786 100644
--- a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -62,10 +62,10 @@ struct UnaryClipOp<GPUDevice, T> {
                   typename TTypes<T>::Flat &out_flat) const {
     CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
 
-    UnaryClipCustomKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
-            out_flat.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        UnaryClipCustomKernel<T>, config.block_count, config.thread_per_block,
+        0, d.stream(), in0_flat.size(), in0_flat.data(), in1_flat.data(),
+        in2_flat.data(), out_flat.data()));
   }
 };
 
@@ -78,10 +78,10 @@ struct BinaryRightClipOp<GPUDevice, T> {
                   typename TTypes<T>::Flat &out_flat) const {
     CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
 
-    BinaryRightClipCustomKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
-            out_flat.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        BinaryRightClipCustomKernel<T>, config.block_count,
+        config.thread_per_block, 0, d.stream(), in0_flat.size(),
+        in0_flat.data(), in1_flat.data(), in2_flat.data(), out_flat.data()));
   }
 };
 
@@ -94,10 +94,10 @@ struct BinaryLeftClipOp<GPUDevice, T> {
                   typename TTypes<T>::Flat &out_flat) const {
     CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
 
-    BinaryLeftClipCustomKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
-            out_flat.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        BinaryLeftClipCustomKernel<T>, config.block_count,
+        config.thread_per_block, 0, d.stream(), in0_flat.size(),
+        in0_flat.data(), in1_flat.data(), in2_flat.data(), out_flat.data()));
   }
 };
 
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index 313d976e2c60f122c82b578ddef2d3f8184be084..08fd228cd6d8f9a6358b8a93e4130bf24cae6a68 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -24,7 +24,8 @@ REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16,
           int32, int64);
 REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
           bfloat16, complex64, complex128);
-REGISTER2(BinaryOp, CPU, "DivNoNan", functor::div_no_nan, float, double);
+REGISTER5(BinaryOp, CPU, "DivNoNan", functor::div_no_nan, Eigen::half, float,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
index 539f07b0d68321a0f9a33b76aca78bd9e38ce6e9..f4059b2b137ae16dfeed199aae26895f74d39133 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
@@ -21,6 +21,7 @@ namespace tensorflow {
 namespace functor {
 DEFINE_BINARY11(mul, Eigen::half, float, double, uint8, int8, uint16, int16,
                 int32, int64, complex64, complex128);
+DEFINE_BINARY2(mul_no_nan, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_tan.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_tan.cu.cc
index dc66676c577b3474456656403d0e04a166be6be2..e09f12d41243af5a289b1dbc4d2b4e08245369cb 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_tan.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_tan.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY2(tan, float, double);
+DEFINE_UNARY3(tan, Eigen::half, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index cff0407b83a4bafd27573325615322f92e594d46..13c89aae4ce695209bf23f765c42270d5e0a8d93 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -19,6 +19,9 @@ namespace tensorflow {
 
 REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
           int32, bfloat16);
+REGISTER5(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float,
+          double, complex64, complex128);
+
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -39,6 +42,7 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
+REGISTER2(BinaryOp, GPU, "MulNoNan", functor::mul_no_nan, float, double);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index dd4e4ea547e7738b76796c0e8d174602645b83df..3b51563ca288413b389f938c9ff9810a71c09fd5 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 #include "tensorflow/core/platform/prefetch.h"
 
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 90762fb1b0c349a538a1d56f485b46a26fc37360..4338d75219edd263763b7c9e5588c83e677c6806 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Tan", functor::tan, float, double, complex64,
-          complex128);
+REGISTER5(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, float, double,
+          complex64, complex128);
 
 #if GOOGLE_CUDA
-REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
+REGISTER3(UnaryOp, GPU, "Tan", functor::tan, Eigen::half, float, double);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index a22d76717a50e0869d38b77f0ec7f0cc46f8c7ac..f04dcce2940ea7563522f11df13ca02c0e93dac4 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -22,9 +22,9 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
 namespace internal {
@@ -51,15 +51,12 @@ struct scalar_arg_op<std::complex<double>> {
 };
 #endif
 
+#if EIGEN_HAS_CXX11_MATH == 0
 template <typename T>
 struct scalar_asinh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::asinh(a);
-#else
     return std::asinh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -71,11 +68,7 @@ template <typename T>
 struct scalar_acosh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::acosh(a);
-#else
     return std::acosh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -87,35 +80,14 @@ template <typename T>
 struct scalar_atanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::atanh(a);
-#else
     return std::atanh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
 struct functor_traits<scalar_atanh_op<T>> {
   enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
 };
-
-// TODO(rmlarsen): This is a workaround for upstream change
-// https://bitbucket.org/eigen/eigen/commits/f339468d04d0f87caeb6cab9aef568627e9f6ea9
-// that renamed scalar_binary_pow_op to scalar_pow_op and deleted the unary
-// version of the latter. Remove once we upgrade to Eigen 3.3.
-template <typename Scalar, typename Exponent>
-struct scalar_binary_pow_op_google {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op_google)
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a,
-                                             const Exponent& b) const {
-    return numext::pow(a, b);
-  }
-};
-
-template <typename Scalar, typename Exponent>
-struct functor_traits<scalar_binary_pow_op_google<Scalar, Exponent>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
+#endif
 
 template <typename Scalar, typename Exponent>
 struct safe_scalar_binary_pow_op {
@@ -175,24 +147,49 @@ struct functor_traits<safe_div_or_mod_op<T, DivOrMod>> {
   };
 };
 
-template <typename T>
-struct div_no_nan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
+template <typename T, typename Binary>
+struct no_nan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(no_nan_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
                                                            const T& b) const {
-    if (b != 0) {
-      return scalar_quotient_op<T>()(a, b);
+    if (b != T(0)) {
+      return Binary()(a, b);
     } else {
-      return 0;
+      return T(0);
     }
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& a, const Packet& b) const {
+    const Packet mask = pcmp_eq(b, pzero(b));
+    const Packet quotient = Binary().packetOp(a, b);
+    return pandnot(quotient, mask);
+  }
+};
+
+template <typename T>
+struct div_no_nan_op : public no_nan_op<T, scalar_quotient_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
 };
 
 template <typename T>
 struct functor_traits<div_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
-    PacketAccess = false,
+    PacketAccess = true,
+  };
+};
+
+template <typename T>
+struct mul_no_nan_op : public no_nan_op<T, scalar_product_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(mul_no_nan_op)
+};
+
+template <typename T>
+struct functor_traits<mul_no_nan_op<T>> {
+  enum {
+    Cost = functor_traits<scalar_product_op<T>>::Cost + NumTraits<T>::AddCost,
+    PacketAccess = true,
   };
 };
 
@@ -360,6 +357,19 @@ struct google_floor_div {
       return x / y;
     }
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x, const Packet& y) const {
+    Packet zeros = pzero(x);
+    Packet x_mask = pcmp_lt(x, zeros);
+    Packet y_mask = pcmp_lt(y, zeros);
+    Packet x_div_y = pdiv(x, y);
+    Packet abs_x = pabs(x);
+    Packet abs_y = pabs(y);
+    Packet ones = pones(x);
+    Packet ratio_rounded = pdiv(pnegate(psub(padd(abs_x, abs_y), ones)), abs_y);
+    return pselect(pxor(x_mask, y_mask), ratio_rounded, x_div_y);
+  }
 };
 
 template <typename T>
@@ -369,36 +379,48 @@ struct google_floor_div<
                                                            const T& y) const {
     return x / y;
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x, const Packet& y) const {
+    return pdiv(x, y);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<google_floor_div<Scalar>> {
   enum {
-    Cost = 2 * Eigen::internal::scalar_div_cost<Scalar, false>::value +
-           2 * NumTraits<Scalar>::AddCost,
-    PacketAccess = false
+    Cost = 2 * Eigen::internal::scalar_div_cost<
+                   Scalar, packet_traits<Scalar>::HasDiv>::value +
+           NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasDiv
   };
 };
 
-// TODO(b/32239616): This kernel should be moved into Eigen and vectorized.
 template <typename T, typename Enable = void>
 struct google_floor_div_real {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
                                                            const T& y) const {
     return Eigen::numext::floor(x / y);
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x, const Packet& y) const {
+    return pfloor(pdiv(x, y));
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<google_floor_div_real<Scalar>> {
   enum {
-    Cost = 2 * Eigen::internal::scalar_div_cost<Scalar, false>::value +
+    Cost = 2 * Eigen::internal::scalar_div_cost<
+                   Scalar, packet_traits<Scalar>::HasDiv>::value +
            2 * NumTraits<Scalar>::AddCost,
-    PacketAccess = false
+    PacketAccess =
+        packet_traits<Scalar>::HasDiv && packet_traits<Scalar>::HasFloor
   };
 };
 
-// TODO(b//32239616): This kernel should be moved into Eigen and vectorized.
+// TODO(rmlarsen): Add vectorized mod & fmod in Eigen and use it here.
 template <typename T>
 struct google_floor_fmod {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
@@ -418,7 +440,7 @@ struct functor_traits<google_floor_fmod<Scalar>> {
   };
 };
 
-// TODO(b/32239616): This kernel should be moved into Eigen and vectorized.
+// TODO(rmlarsen): Add vectorized mod & fmod in Eigen and use it here.
 template <typename T>
 struct google_floor_mod {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
@@ -448,7 +470,7 @@ struct functor_traits<google_floor_mod<Scalar>> {
 #define ENABLE_FLOAT_EQUALITY_WARNING
 #endif
 
-template <typename Scalar>
+template <typename Scalar, bool IsInteger = Eigen::NumTraits<Scalar>::IsInteger>
 struct scalar_round_op_google {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& x) const {
@@ -472,29 +494,63 @@ struct scalar_round_op_google {
 };
 
 template <typename Scalar>
-struct functor_traits<scalar_round_op_google<Scalar>> {
-  enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
+struct scalar_round_op_google<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    return x;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return x;
+  }
 };
 
 template <typename Scalar>
+struct functor_traits<scalar_round_op_google<Scalar>> {
+  enum {
+    Cost = Eigen::NumTraits<Scalar>::IsInteger ? 0
+                                               : 4 * NumTraits<Scalar>::AddCost,
+    PacketAccess = Eigen::NumTraits<Scalar>::IsInteger
+  };
+};
+
+template <typename Scalar, bool IsInteger = Eigen::NumTraits<Scalar>::IsInteger>
 struct scalar_round_up_op {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& x) const {
     EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
                         NUMERIC_TYPE_MUST_BE_REAL)
+    return Eigen::numext::floor(x + Scalar(0.5));
+  }
 
-    Scalar round_val = Eigen::numext::floor(x);
-    const Scalar fraction = x - round_val;
-    if (fraction >= Scalar(.5)) {
-      round_val += Scalar(1.0);
-    }
-    return round_val;
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return pfloor(padd(x, pset1<Packet>(0.5)));
   }
 };
 
 template <typename Scalar>
-struct functor_traits<scalar_round_up_op<Scalar>> {
-  enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
+struct scalar_round_up_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    return x;
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return x;
+  }
+};
+
+template <typename Scalar, bool IsInteger>
+struct functor_traits<scalar_round_up_op<Scalar, IsInteger>> {
+  enum {
+    Cost = IsInteger ? 0 : 4 * NumTraits<Scalar>::AddCost,
+    PacketAccess = IsInteger || packet_traits<Scalar>::HasFloor
+  };
 };
 
 #undef ENABLE_FLOAT_EQUALITY_WARNING
@@ -507,9 +563,9 @@ struct bitwise_xor_op {
   operator()(const Scalar& x, const Scalar& y) const {
     return x ^ y;
   }
-  typedef typename Eigen::internal::packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a,
-                                                        const Packet& b) const {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& a, const Packet& b) const {
     return Eigen::internal::pxor(a, b);
   }
 };
@@ -519,7 +575,6 @@ struct functor_traits<bitwise_xor_op<Scalar>> {
   enum { Cost = Eigen::NumTraits<Scalar>::AddCost, PacketAccess = true };
 };
 
-// TODO(srvasude): Add packet versions of this operation.
 template <typename Scalar>
 struct xlogy_op {
   EIGEN_EMPTY_STRUCT_CTOR(xlogy_op)
@@ -530,18 +585,28 @@ struct xlogy_op {
     }
     return x * numext::log(y);
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x, const Packet& y) const {
+    Packet zeros = pzero(x);
+    Packet mask = pcmp_eq(x, zeros);
+    scalar_log_op<Scalar> log_op;
+    Packet log_y = log_op.packetOp(y);
+    Packet x_log_y = pmul(x, log_y);
+    return pselect(mask, x, x_log_y);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<xlogy_op<Scalar>> {
   enum {
-    Cost = (sizeof(Scalar) == 4 ? 40 : 85) + Eigen::NumTraits<Scalar>::MulCost,
-    PacketAccess = false
+    Cost = functor_traits<scalar_log_op<Scalar>>::Cost +
+           Eigen::NumTraits<Scalar>::MulCost,
+    PacketAccess = functor_traits<scalar_log_op<Scalar>>::PacketAccess
   };
 };
 
 template <typename Scalar>
-// TODO(srvasude): Add packet versions of this operation.
 struct xdivy_op {
   EIGEN_EMPTY_STRUCT_CTOR(xdivy_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
@@ -551,11 +616,25 @@ struct xdivy_op {
     }
     return x / y;
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x, const Packet& y) const {
+    Packet zeros = pzero(x);
+    Packet mask = pcmp_eq(x, zeros);
+    Packet x_div_y = pdiv(x, y);
+    return pselect(mask, x, x_div_y);
+  }
 };
 
 template <typename Scalar>
 struct functor_traits<xdivy_op<Scalar>> {
-  enum { Cost = Eigen::NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost =
+        Eigen::NumTraits<Scalar>::AddCost +
+        Eigen::internal::scalar_div_cost<Scalar,
+                                         packet_traits<Scalar>::HasDiv>::value,
+    PacketAccess = packet_traits<Scalar>::HasDiv
+  };
 };
 
 }  // end namespace internal
@@ -819,6 +898,9 @@ struct mul : base<T, Eigen::internal::scalar_product_op<T>> {
   static const bool use_bcast_optimization = true;
 };
 
+template <typename T>
+struct mul_no_nan : base<T, Eigen::internal::mul_no_nan_op<T>> {};
+
 template <typename T>
 struct div : base<T, Eigen::internal::scalar_quotient_op<T>> {};
 
@@ -865,7 +947,7 @@ template <typename T>
 struct floor_div_real : base<T, Eigen::internal::google_floor_div_real<T>> {};
 
 template <typename T>
-struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
+struct pow : base<T, Eigen::internal::scalar_pow_op<T, T>> {};
 
 template <typename T>
 struct safe_pow : base<T, Eigen::internal::safe_scalar_binary_pow_op<T, T>> {
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 53b53cc277eefbdb3fa4d1c9e82b17f12018fedb..ab919738f9951110f65f9a0aea61b4d17ed1419f 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -75,14 +75,19 @@ struct scalar_inverse_gradient_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_gradient_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T
   operator()(const T& output, const T& output_gradient) const {
-    const T out_conj = numext::conj(output);
-    return -output_gradient * out_conj * out_conj;
+    if (output_gradient == T(0)) {
+      return T(0);
+    } else {
+      const T out_conj = numext::conj(output);
+      return -out_conj * out_conj * output_gradient;
+    }
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& output, const Packet& output_gradient) const {
     const Packet out_conj = pconj(output);
-    return pnegate(pmul(output_gradient, pmul(out_conj, out_conj)));
+    return mul_no_nan_op<T>().packetOp(pnegate(pmul(out_conj, out_conj)),
+                                       output_gradient);
   }
 };
 template <typename T>
@@ -99,15 +104,20 @@ struct scalar_sqrt_gradient_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_gradient_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T
   operator()(const T& output, const T& output_gradient) const {
-    const T out_conj = numext::conj(output);
-    return static_cast<T>(0.5) * output_gradient / out_conj;
+    if (output_gradient == T(0)) {
+      return T(0);
+    } else {
+      const T out_conj = numext::conj(output);
+      return (static_cast<T>(0.5) * output_gradient) / out_conj;
+    }
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& output, const Packet& output_gradient) const {
     const Packet const_half = pset1<Packet>(static_cast<T>(0.5));
     const Packet out_conj = pconj(output);
-    return pdiv(pmul(const_half, output_gradient), out_conj);
+    return mul_no_nan_op<T>().packetOp(pdiv(const_half, out_conj),
+                                       output_gradient);
   }
 };
 template <typename T>
@@ -124,17 +134,24 @@ struct scalar_rsqrt_gradient_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_gradient_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T
   operator()(const T& output, const T& output_gradient) const {
-    const T out_conj = numext::conj(output);
-    return static_cast<T>(-0.5) * (output_gradient * out_conj) *
-           (out_conj * out_conj);
+    if (output_gradient == T(0)) {
+      return T(0);
+    } else {
+      const T out_conj = numext::conj(output);
+      return static_cast<T>(-0.5) * (output_gradient * out_conj) *
+             (out_conj * out_conj);
+    }
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& output, const Packet& output_gradient) const {
     const Packet const_half = pset1<Packet>(static_cast<T>(-0.5));
     const Packet out_conj = pconj(output);
-    return pmul(const_half, pmul(pmul(output_gradient, out_conj),
-                                 pmul(out_conj, out_conj)));
+    auto safe_pmul = [](const Packet& a, const Packet& b) {
+      return mul_no_nan_op<T>().packetOp(a, b);
+    };
+    return safe_pmul(pmul(const_half, pmul(out_conj, out_conj)),
+                     safe_pmul(out_conj, output_gradient));
   }
 };
 template <typename T>
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 696d5840e8ce39c1bf210b54b9f28ae83cf232c7..acf7cc289933c2d42644faf63f58ec6af53957c9 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -45,6 +45,7 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 #define BM_UNARY(DEVICE, FUNC, T, TYPE)                              \
   void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) {           \
     const int64 tot = static_cast<int64>(iters) * num;               \
+    testing::UseRealTime();                                          \
     testing::ItemsProcessed(tot);                                    \
     testing::BytesProcessed(tot * sizeof(T));                        \
     test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
@@ -100,6 +101,7 @@ Graph* BinaryScalar(int num, const string& func) {
 #define BM_BINARY_SCALAR(DEVICE, FUNC)                             \
   void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) {         \
     const int64 tot = static_cast<int64>(iters) * num;             \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
@@ -125,6 +127,15 @@ BM_BINARY_SCALAR(gpu, Add);
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
 #endif  // TENSORFLOW_USE_SYCL
+
+BM_BINARY_SCALAR(cpu, DivNoNan);
+#if GOOGLE_CUDA
+BM_BINARY_SCALAR(gpu, DivNoNan);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, DivNoNan);
+#endif  // TENSORFLOW_USE_SYCL
+
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -146,6 +157,7 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols;                 \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
@@ -197,6 +209,7 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols * channels;      \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels,         \
@@ -259,6 +272,7 @@ Graph* BcastAdd(int rows, int cols, int dim) {
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters);  \
@@ -285,6 +299,7 @@ BM_BCAST_ADD_ROW_ALL(sycl);
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters);  \
@@ -311,6 +326,7 @@ BM_BCAST_ADD_COL_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters);      \
@@ -338,6 +354,7 @@ BM_BCAST_ADD_CROSS_RC_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters);      \
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index fab1ec8b6e712e67460819341250d324d66b6dcc..e86c3b0c1bc61c2757b04d3077d9ad519c9176bf 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -9,8 +9,8 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_kernel_library",
     "tf_cc_test",
+    "tf_kernel_library",
 )
 
 # TODO(mrry): Remove this empty forwarding library.
@@ -21,6 +21,27 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
 )
 
+cc_library(
+    name = "dataset_test_base",
+    testonly = 1,
+    srcs = ["dataset_test_base.cc"],
+    hdrs = ["dataset_test_base.h"],
+    deps = [
+        ":dataset_utils",
+        ":iterator_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 cc_library(
     name = "dataset_utils",
     srcs = ["dataset_utils.cc"],
@@ -39,17 +60,26 @@ tf_cc_test(
     srcs = ["dataset_utils_test.cc"],
     deps = [
         ":dataset_utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
 
+cc_library(
+    name = "stats_utils",
+    srcs = ["stats_utils.cc"],
+    hdrs = ["stats_utils.h"],
+    deps = ["//tensorflow/core:lib"],
+)
+
 cc_library(
     name = "captured_function",
     srcs = ["captured_function.cc"],
     hdrs = ["captured_function.h"],
     deps = [
         ":single_threaded_executor",
+        ":stats_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -75,13 +105,18 @@ tf_cc_test(
     srcs = ["single_threaded_executor_test.cc"],
     deps = [
         ":single_threaded_executor",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -94,6 +129,29 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "unbounded_thread_pool",
+    srcs = ["unbounded_thread_pool.cc"],
+    hdrs = ["unbounded_thread_pool.h"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_cc_test(
+    name = "unbounded_thread_pool_test",
+    srcs = ["unbounded_thread_pool_test.cc"],
+    deps = [
+        ":unbounded_thread_pool",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "window_dataset",
     srcs = ["window_dataset.cc"],
@@ -116,6 +174,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "shard_dataset_op",
+    srcs = ["shard_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "window_dataset_op",
     srcs = ["window_dataset_op.cc"],
@@ -145,6 +214,7 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset_utils",
+        ":stats_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -178,11 +248,35 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":map_dataset_op",
+        ":range_dataset_op",
+        ":stats_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:function_ops",
+    ],
+)
+
 cc_library(
     name = "parallel_map_iterator",
     srcs = ["parallel_map_iterator.cc"],
     hdrs = ["parallel_map_iterator.h"],
     deps = [
+        ":stats_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -203,7 +297,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -256,12 +349,12 @@ tf_kernel_library(
     deps = [
         ":captured_function",
         ":dataset_utils",
+        ":stats_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
     ],
 )
 
@@ -290,12 +383,12 @@ tf_kernel_library(
     hdrs = ["prefetch_dataset_op.h"],
     deps = [
         ":prefetch_autotuner",
+        ":stats_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -311,9 +404,30 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "repeat_dataset_op_test",
+    size = "small",
+    srcs = ["repeat_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":repeat_dataset_op",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "take_dataset_op",
     srcs = ["take_dataset_op.cc"],
+    hdrs = ["take_dataset_op.h"],
     deps = [
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -322,6 +436,26 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "take_dataset_op_test",
+    size = "small",
+    srcs = ["take_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":take_dataset_op",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "skip_dataset_op",
     srcs = ["skip_dataset_op.cc"],
@@ -344,6 +478,23 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "shuffle_dataset_op",
     srcs = ["shuffle_dataset_op.cc"],
@@ -366,26 +517,85 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "sparse_tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["sparse_tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":sparse_tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_dataset_op",
     srcs = ["tensor_dataset_op.cc"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
+tf_cc_test(
+    name = "tensor_dataset_op_test",
+    size = "small",
+    srcs = ["tensor_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":tensor_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
+tf_cc_test(
+    name = "tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -397,6 +607,26 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "zip_dataset_op_test",
+    size = "small",
+    srcs = ["zip_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        ":zip_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "concatenate_dataset_op",
     srcs = ["concatenate_dataset_op.cc"],
@@ -408,10 +638,31 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "concatenate_dataset_op_test",
+    size = "small",
+    srcs = ["concatenate_dataset_op_test.cc"],
+    deps = [
+        ":concatenate_dataset_op",
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "reader_dataset_ops",
     srcs = ["reader_dataset_ops.cc"],
     deps = [
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -426,6 +677,7 @@ tf_kernel_library(
     deps = [
         ":dataset_utils",
         ":optional_ops",
+        ":unbounded_thread_pool",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -443,6 +695,7 @@ tf_kernel_library(
     srcs = ["multi_device_iterator_ops.cc"],
     deps = [
         ":dataset_utils",
+        ":unbounded_thread_pool",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -482,17 +735,15 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "optimize_dataset_op",
-    srcs = ["optimize_dataset_op.cc"],
+cc_library(
+    name = "graph_rewrite_dataset",
+    srcs = ["graph_rewrite_dataset.cc"],
+    hdrs = ["graph_rewrite_dataset.h"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
@@ -503,6 +754,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "optimize_dataset_op",
+    srcs = ["optimize_dataset_op.cc"],
+    deps = [
+        ":graph_rewrite_dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "model_dataset_op",
     srcs = ["model_dataset_op.cc"],
@@ -513,7 +777,6 @@ tf_kernel_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -557,6 +820,7 @@ tf_kernel_library(
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
+        ":shard_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
         ":sparse_tensor_slice_dataset_op",
@@ -565,6 +829,8 @@ tf_kernel_library(
         ":tensor_slice_dataset_op",
         ":window_dataset_op",
         ":zip_dataset_op",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
@@ -573,6 +839,7 @@ tf_kernel_library(
     name = "map_defun_op",
     srcs = ["map_defun_op.cc"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:functional_ops_op_lib",
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 343157de6fea3df5fb7ada416f81f95534f76e1c..2149c7fbe527f629e15ef89d05e12d9a451a6200 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -697,8 +697,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         // Use the resource manager in the iterator context to get / create
         // a cache.
         ResourceMgr* mgr = ctx->resource_mgr();
-        const string name =
-            strings::StrCat(prefix(), "::", dataset()->name(), "::MemoryCache");
+        const string name = strings::StrCat(
+            prefix(), "::", dataset()->node_name(), "::MemoryCache");
         TF_RETURN_IF_ERROR(mgr->LookupOrCreate<MemoryCache>(
             "tf_data", name, &cache_, [](MemoryCache** cache) {
               *cache = new MemoryCache();
@@ -815,7 +815,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             LOG(WARNING)
                 << "The calling iterator did not fully read the dataset being "
                    "cached. In order to avoid unexpected truncation of the "
-                   "dataset, the partially cached contents of the dataset"
+                   "dataset, the partially cached contents of the dataset "
                    "will be discarded. This can happen if you have an input "
                    "pipeline similar to `dataset.cache().take(k).repeat()`. "
                    "You should use `dataset.take(k).cache().repeat()` instead.";
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 99b745b4c45c115b065fced39f7f206c240cf5ed..1449383890d7658c255a9abc265fe356c96c8a4a 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -101,23 +102,32 @@ class SimpleStepStatsCollector : public StepStatsCollectorInterface {
 
 /* static */
 Status CapturedFunction::Create(
-    const NameAttrList& func, OpKernelContext* ctx, const string& argument,
+    const NameAttrList& func, OpKernelContext* ctx, const string& argument_name,
     std::unique_ptr<CapturedFunction>* out_function) {
-  return CapturedFunction::Create(func, ctx, argument, true, out_function);
+  return CapturedFunction::Create(func, ctx, argument_name, true, out_function);
 }
 
 Status CapturedFunction::Create(
-    const NameAttrList& func, OpKernelContext* ctx, const string& argument,
+    const NameAttrList& func, OpKernelContext* ctx, const string& argument_name,
     bool use_inter_op_parallelism,
     std::unique_ptr<CapturedFunction>* out_function) {
   OpInputList inputs;
-  TF_RETURN_IF_ERROR(ctx->input_list(argument, &inputs));
+  TF_RETURN_IF_ERROR(ctx->input_list(argument_name, &inputs));
   std::vector<Tensor> arguments(inputs.begin(), inputs.end());
   *out_function = absl::WrapUnique(new CapturedFunction(
       func, std::move(arguments), use_inter_op_parallelism));
   return Status::OK();
 }
 
+Status CapturedFunction::Create(
+    const NameAttrList& func, OpKernelContext* ctx,
+    std::vector<Tensor>&& captured_inputs, bool use_inter_op_parallelism,
+    std::unique_ptr<CapturedFunction>* out_function) {
+  *out_function = absl::WrapUnique(new CapturedFunction(
+      func, std::move(captured_inputs), use_inter_op_parallelism));
+  return Status::OK();
+}
+
 Status CapturedFunction::Instantiate(
     IteratorContext* ctx, std::unique_ptr<InstantiatedCapturedFunction>*
                               instantiated_captured_function) {
@@ -446,12 +456,14 @@ void InstantiatedCapturedFunction::RunAsync(
           s = frame->ConsumeRetvals(rets);
         }
         delete frame;
-
+        // TODO(shivaniagrawal): add the dataset name containing this function,
+        // make it dataset()->node_name() + captured_func_->func().name().
         if (stats_aggregator) {
+          string prefix_with_func_name = strings::StrCat(
+              str_util::Split(prefix, "::", str_util::SkipEmpty()).back(),
+              "::", captured_func_->func().name());
           stats_aggregator->AddToHistogram(
-              strings::StrCat(
-                  str_util::Split(prefix, "::", str_util::SkipEmpty()).back(),
-                  "::", captured_func_->func().name(), "::execution_time"),
+              stats_utils::ExecutionTimeHistogramName(prefix_with_func_name),
               {static_cast<float>(stats_collector->processing_time())});
         }
         if (model) {
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index cffaf405ecbad4302be4e1b6022fda6db3dad359..9c00123e7de8957df321fbf848b41fa0fca3bde5 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -116,7 +116,7 @@ class CapturedFunction {
   // Creates a new instance using a list of named attributes, fetching captured
   // inputs from a context argument.
   static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument,
+                       const string& argument_name,
                        std::unique_ptr<CapturedFunction>* out_function);
 
   // Creates a new instance using a list of named attributes, fetching captured
@@ -125,7 +125,18 @@ class CapturedFunction {
   // If `use_inter_op_parallelism` is false, the runtime may use an executor
   // that is optimized for small functions.
   static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument, bool use_inter_op_parallelism,
+                       const string& argument_name,
+                       bool use_inter_op_parallelism,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Creates a new instance using a list of named attributes, using provided
+  // captured inputs.
+  //
+  // If `use_inter_op_parallelism` is false, the runtime may use an executor
+  // that is optimized for small functions.
+  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
+                       std::vector<Tensor>&& captured_inputs,
+                       bool use_inter_op_parallelism,
                        std::unique_ptr<CapturedFunction>* out_function);
 
   // Instantiates this function for use in the given context, providing an
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc b/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1885c50c298cb6929c7ff200ae1d3bf0cb2b22a2
--- /dev/null
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc
@@ -0,0 +1,353 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "concatenate_dataset";
+constexpr char kOpName[] = "ConcatenateDataset";
+
+class ConcatenateDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates `TensorSliceDataset` variant tensors from the input vector of
+  // tensor vectors.
+  Status CreateTensorSliceDatasetTensors(
+      const std::vector<std::vector<Tensor>> &tensor_vectors,
+      std::vector<Tensor> *const dataset_tensors) {
+    for (int i = 0; i < tensor_vectors.size(); ++i) {
+      std::vector<Tensor> tensors = tensor_vectors[i];
+      DatasetBase *tensor_slice_dataset;
+      TF_RETURN_IF_ERROR(
+          CreateTensorSliceDataset(strings::StrCat("tensor_slice_node_", i),
+                                   &tensors, &tensor_slice_dataset));
+      Tensor dataset_tensor(DT_VARIANT, TensorShape({}));
+      TF_RETURN_IF_ERROR(
+          StoreDatasetInVariantTensor(tensor_slice_dataset, &dataset_tensor));
+      dataset_tensors->emplace_back(std::move(dataset_tensor));
+    }
+    return Status::OK();
+  }
+
+  // Creates a new ConcatenateDataset op kernel.
+  Status CreateConcatenateDatasetKernel(
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "another_dataset"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new ConcatenateDataset op kernel context.
+  Status CreateConcatenateDatasetContext(
+      OpKernel *const op_kernel,
+      gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestParam {
+  std::vector<std::vector<Tensor>> input_tensors;
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+TestParam TestCase1() {
+  // Test case 1: same shape.
+  return {/*input_tensors*/
+          {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                    {1, 2, 3, 4}),
+            DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                    {5, 6, 7, 8})},
+           {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                    {11, 12, 13, 14}),
+            DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                    {15, 16, 17, 18})}},
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {5, 6}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {3, 4}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {7, 8}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {11, 12}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {15, 16}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {13, 14}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {17, 18})},
+          /*expected_output_dtypes*/ {DT_INT64, DT_INT64},
+          /*expected_output_shapes*/
+          {PartialTensorShape({2}), PartialTensorShape({2})},
+          /*expected_cardinality*/ 4,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+TestParam TestCase2() {
+  // Test case 2: different shape.
+  return {
+      /*input_tensors*/
+      {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 3},
+                                                {1, 2, 3, 4, 5, 6}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                {7, 8, 9, 10})},
+       {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2},
+                                                {11, 12, 13, 14}),
+        DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 1}, {15, 16})}},
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3}, {1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {7, 8}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3}, {4, 5, 6}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {9, 10}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {11, 12}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {15}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {13, 14}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {16})},
+      /*expected_output_dtypes*/ {DT_INT64, DT_INT64},
+      /*expected_output_shapes*/
+      {PartialTensorShape({-1}), PartialTensorShape({-1})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 2, 5}};
+}
+
+class ConcatenateDatasetOpTestHelper : public ConcatenateDatasetOpTest {
+ public:
+  ~ConcatenateDatasetOpTestHelper() override {
+    if (dataset_) dataset_->Unref();
+  }
+
+ protected:
+  Status CreateDatasetFromTestCase(const TestParam &test_case) {
+    std::vector<Tensor> tensor_slice_dataset_tensors;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDatasetTensors(
+        test_case.input_tensors, &tensor_slice_dataset_tensors));
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &tensor : tensor_slice_dataset_tensors) {
+      inputs.emplace_back(&tensor);
+    }
+    TF_RETURN_IF_ERROR(CreateConcatenateDatasetKernel(
+        test_case.expected_output_dtypes, test_case.expected_output_shapes,
+        &dataset_kernel_));
+    TF_RETURN_IF_ERROR(CreateConcatenateDatasetContext(
+        dataset_kernel_.get(), &inputs, &dataset_kernel_ctx_));
+    TF_RETURN_IF_ERROR(CreateDataset(dataset_kernel_.get(),
+                                     dataset_kernel_ctx_.get(), &dataset_));
+    return Status::OK();
+  }
+
+  Status CreateIteratorFromTestCase(const TestParam &test_case) {
+    TF_RETURN_IF_ERROR(CreateDatasetFromTestCase(test_case));
+    TF_RETURN_IF_ERROR(
+        CreateIteratorContext(dataset_kernel_ctx_.get(), &iterator_ctx_));
+    TF_RETURN_IF_ERROR(
+        dataset_->MakeIterator(iterator_ctx_.get(), "Iterator", &iterator_));
+    return Status::OK();
+  }
+
+  std::unique_ptr<OpKernel> dataset_kernel_;
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx_;
+  DatasetBase *dataset_ = nullptr;  // owned by this class.
+  std::unique_ptr<IteratorContext> iterator_ctx_;
+  std::unique_ptr<IteratorBase> iterator_;
+};
+
+class ParameterizedDatasetTest
+    : public ConcatenateDatasetOpTestHelper,
+      public ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(ParameterizedDatasetTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator_->GetNext(iterator_ctx_.get(), &out_tensors,
+                                    &end_of_sequence));
+    if (!end_of_sequence) {
+      for (const auto &tensor : out_tensors) {
+        EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+        TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+        expected_outputs_it++;
+      }
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_F(ConcatenateDatasetOpTestHelper, DifferentDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  TestParam test_case_with_different_dtypes = {
+      /*input_tensors*/ {
+          {CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4})},
+          {CreateTensor<double>(TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}},
+      /*expected_outputs*/ {},
+      /*expected_output_dtypes*/ {DT_INT64},
+      /*expected_output_shapes*/ {PartialTensorShape({2})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {}};
+
+  EXPECT_EQ(CreateDatasetFromTestCase(test_case_with_different_dtypes).code(),
+            tensorflow::error::INVALID_ARGUMENT);
+}
+
+TEST_F(ConcatenateDatasetOpTestHelper, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateDatasetFromTestCase(TestCase1()));
+
+  EXPECT_EQ(dataset_->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+  TF_EXPECT_OK(VerifyTypesMatch(dataset_->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset_->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+
+  EXPECT_EQ(dataset_->Cardinality(), GetParam().expected_cardinality);
+}
+
+TEST_F(ConcatenateDatasetOpTestHelper, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateDatasetFromTestCase(TestCase1()));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset_->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+  TF_EXPECT_OK(VerifyTypesMatch(iterator_->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator_->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_F(ConcatenateDatasetOpTestHelper, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateIteratorFromTestCase(TestCase1()));
+  EXPECT_EQ(iterator_->prefix(), "Iterator::Concatenate");
+}
+
+TEST_P(ParameterizedDatasetTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator_->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator_->Restore(iterator_ctx_.get(), &reader));
+
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator_->GetNext(iterator_ctx_.get(), &out_tensors,
+                                      &end_of_sequence));
+      if (!end_of_sequence) {
+        for (auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+      cur_iteration++;
+    }
+
+    if (breakpoint >= dataset_->Cardinality()) {
+      EXPECT_TRUE(end_of_sequence);
+      EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ConcatenateDatasetOpTest, ParameterizedDatasetTest,
+    ::testing::ValuesIn(std::vector<TestParam>({TestCase1(), TestCase2()})));
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3680fd1604da27ad20ab6a0faafbcba92695c130
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -0,0 +1,265 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+
+Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
+  EXPECT_EQ(a.dtype(), b.dtype());
+  switch (a.dtype()) {
+#define CASE(type)                       \
+  case DataTypeToEnum<type>::value:      \
+    test::ExpectTensorEqual<type>(a, b); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_string(CASE);
+    // TODO(feihugis): figure out how to support variant tensors.
+#undef CASE
+    default:
+      return errors::Internal("Unsupported dtype", a.dtype());
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateTensorSliceDatasetKernel(
+    StringPiece node_name, const DataTypeVector& dtypes,
+    const std::vector<PartialTensorShape>& shapes,
+    std::unique_ptr<OpKernel>* tensor_slice_dataset_kernel) {
+  std::vector<string> components;
+  components.reserve(dtypes.size());
+  for (int i = 0; i < dtypes.size(); ++i) {
+    // Create the placeholder names for the input components of
+    // `TensorSliceDataset`.
+    components.emplace_back(strings::StrCat("component_", i));
+  }
+  NodeDef node_def = test::function::NDef(
+      node_name, "TensorSliceDataset", components,
+      {{"Toutput_types", dtypes}, {"output_shapes", shapes}});
+  TF_RETURN_IF_ERROR(CreateOpKernel(node_def, tensor_slice_dataset_kernel));
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateTensorSliceDataset(
+    StringPiece node_name, std::vector<Tensor>* const components,
+    DatasetBase** tensor_slice_dataset) {
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  DataTypeVector dtypes;
+  dtypes.reserve(components->size());
+  std::vector<PartialTensorShape> shapes;
+  shapes.reserve(components->size());
+  for (const auto& t : *components) {
+    dtypes.push_back(t.dtype());
+    gtl::InlinedVector<int64, 4> partial_dim_sizes;
+    for (int i = 1; i < t.dims(); ++i) {
+      partial_dim_sizes.push_back(t.dim_size(i));
+    }
+    shapes.emplace_back(std::move(partial_dim_sizes));
+  }
+  TF_RETURN_IF_ERROR(CreateTensorSliceDatasetKernel(
+      node_name, dtypes, shapes, &tensor_slice_dataset_kernel));
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto& tensor : *components) {
+    inputs.emplace_back(&tensor);
+  }
+  TF_RETURN_IF_ERROR(CheckOpKernelInput(*tensor_slice_dataset_kernel, inputs));
+  std::unique_ptr<OpKernelContext> context;
+  TF_RETURN_IF_ERROR(CreateOpKernelContext(tensor_slice_dataset_kernel.get(),
+                                           &inputs, &context));
+  TF_RETURN_IF_ERROR(
+      RunOpKernel(tensor_slice_dataset_kernel.get(), context.get()));
+  TF_RETURN_IF_ERROR(
+      GetDatasetFromContext(context.get(), 0, tensor_slice_dataset));
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateOpKernel(
+    const NodeDef& node_def, std::unique_ptr<OpKernel>* op_kernel) {
+  OpKernel* kernel;
+  TF_RETURN_IF_ERROR(tensorflow::CreateOpKernel(device_type_, device_.get(),
+                                                allocator_, flr_, node_def,
+                                                TF_GRAPH_DEF_VERSION, &kernel));
+  op_kernel->reset(kernel);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateDataset(OpKernel* kernel,
+                                         OpKernelContext* context,
+                                         DatasetBase** const dataset) {
+  TF_RETURN_IF_ERROR(RunOpKernel(kernel, context));
+  // Assume that DatasetOp has only one output.
+  DCHECK_EQ(context->num_outputs(), 1);
+  TF_RETURN_IF_ERROR(GetDatasetFromContext(context, 0, dataset));
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateIteratorContext(
+    OpKernelContext* const op_context,
+    std::unique_ptr<IteratorContext>* iterator_context) {
+  IteratorContext::Params params(op_context);
+  function_handle_cache_ = absl::make_unique<FunctionHandleCache>(flr_);
+  params.function_handle_cache = function_handle_cache_.get();
+  *iterator_context = absl::make_unique<IteratorContext>(params);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::GetDatasetFromContext(OpKernelContext* context,
+                                                 int output_index,
+                                                 DatasetBase** const dataset) {
+  Tensor* output = context->mutable_output(output_index);
+  Status status = GetDatasetFromVariantTensor(*output, dataset);
+  (*dataset)->Ref();
+  return status;
+}
+
+Status DatasetOpsTestBase::InitThreadPool(int thread_num) {
+  if (thread_num < 1) {
+    return errors::InvalidArgument(
+        "The `thread_num` argument should be positive but got: ", thread_num);
+  }
+  thread_pool_ = absl::make_unique<thread::ThreadPool>(
+      Env::Default(), ThreadOptions(), "inter_op", thread_num);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::InitFunctionLibraryRuntime(
+    const std::vector<FunctionDef>& flib, int cpu_num) {
+  if (cpu_num < 1) {
+    return errors::InvalidArgument(
+        "The `cpu_num` argument should be positive but got: ", cpu_num);
+  }
+  SessionOptions options;
+  auto* device_count = options.config.mutable_device_count();
+  device_count->insert({"CPU", cpu_num});
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
+      options, "/job:localhost/replica:0/task:0", &devices));
+  device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+
+  FunctionDefLibrary proto;
+  for (const auto& fdef : flib) *(proto.add_function()) = fdef;
+  lib_def_ =
+      absl::make_unique<FunctionLibraryDefinition>(OpRegistry::Global(), proto);
+
+  OptimizerOptions opts;
+  pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
+      opts, thread_pool_.get(), nullptr /* cluster_flr */);
+  flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+  if (thread_pool_ == nullptr) {
+    runner_ = [](std::function<void()> fn) { fn(); };
+  } else {
+    runner_ = [this](std::function<void()> fn) {
+      thread_pool_->Schedule(std::move(fn));
+    };
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::RunOpKernel(OpKernel* op_kernel,
+                                       OpKernelContext* context) {
+  device_->Compute(op_kernel, context);
+  return context->status();
+}
+
+Status DatasetOpsTestBase::CreateOpKernelContext(
+    OpKernel* kernel, gtl::InlinedVector<TensorValue, 4>* inputs,
+    std::unique_ptr<OpKernelContext>* context) {
+  params_ = absl::make_unique<OpKernelContext::Params>();
+  params_->device = device_.get();
+  params_->resource_manager = device_->resource_manager();
+  params_->frame_iter = FrameAndIter(0, 0);
+  params_->inputs = inputs;
+  params_->op_kernel = kernel;
+  params_->function_library = flr_;
+  params_->runner = &runner_;
+  step_container_ =
+      absl::make_unique<ScopedStepContainer>(0, [](const string&) {});
+  params_->step_container = step_container_.get();
+  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+  slice_reader_cache_ =
+      absl::make_unique<checkpoint::TensorSliceReaderCacheWrapper>();
+  params_->slice_reader_cache = slice_reader_cache_.get();
+
+  // Set the allocator attributes for the outputs.
+  allocator_attrs_.clear();
+  for (int index = 0; index < params_->op_kernel->num_outputs(); index++) {
+    AllocatorAttributes attr;
+    const bool on_host =
+        (params_->op_kernel->output_memory_types()[index] == HOST_MEMORY);
+    attr.set_on_host(on_host);
+    allocator_attrs_.emplace_back(attr);
+  }
+  params_->output_attr_array = gtl::vector_as_array(&allocator_attrs_);
+
+  *context = absl::make_unique<OpKernelContext>(params_.get());
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateSerializationContext(
+    std::unique_ptr<SerializationContext>* context) {
+  SerializationContext::Params params;
+  params.flib_def = lib_def_.get();
+  *context = absl::make_unique<SerializationContext>(params);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CheckOpKernelInput(
+    const OpKernel& kernel, const gtl::InlinedVector<TensorValue, 4>& inputs) {
+  if (kernel.input_types().size() != inputs.size()) {
+    return errors::Internal("The number of input elements should be ",
+                            kernel.input_types().size(),
+                            ", but got: ", inputs.size());
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::AddDatasetInput(
+    gtl::InlinedVector<TensorValue, 4>* inputs, DataTypeVector input_types,
+    DataType dtype, const TensorShape& shape) {
+  if (input_types.size() < inputs->size()) {
+    return errors::InvalidArgument("Adding more inputs than types: ",
+                                   inputs->size(), " vs. ", input_types.size());
+  }
+  bool is_ref = IsRefType(input_types[inputs->size()]);
+  std::unique_ptr<Tensor> input =
+      absl::make_unique<Tensor>(allocator_, dtype, shape);
+
+  if (is_ref) {
+    DataType expected_dtype = RemoveRefType(input_types[inputs->size()]);
+    if (expected_dtype != dtype) {
+      return errors::InvalidArgument("The input data type is ", dtype,
+                                     " , but expected: ", expected_dtype);
+    }
+    inputs->push_back({&lock_for_refs_, input.get()});
+  } else {
+    if (input_types[inputs->size()] != dtype) {
+      return errors::InvalidArgument(
+          "The input data type is ", dtype,
+          " , but expected: ", input_types[inputs->size()]);
+    }
+    inputs->push_back({nullptr, input.get()});
+  }
+
+  // TODO(jsimsa): Figure out how to avoid using a member variable to garbage
+  // collect the inputs.
+  tensors_.push_back(std::move(input));
+
+  return Status::OK();
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_test_base.h b/tensorflow/core/kernels/data/dataset_test_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d14608e3465f7606ed8b90c83456ed444c96822
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_test_base.h
@@ -0,0 +1,204 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+
+// Helpful functions to test Dataset op kernels.
+class DatasetOpsTestBase : public ::testing::Test {
+ public:
+  DatasetOpsTestBase()
+      : device_(DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")),
+        device_type_(DEVICE_CPU) {
+    allocator_ = device_->GetAllocator(AllocatorAttributes());
+  }
+
+  ~DatasetOpsTestBase() {}
+
+  // The method validates whether the two tensors have the same shape, dtype,
+  // and value.
+  static Status ExpectEqual(const Tensor& a, const Tensor& b);
+
+  // Creates a tensor with the specified dtype, shape, and value.
+  template <typename T>
+  static Tensor CreateTensor(TensorShape input_shape,
+                             const gtl::ArraySlice<T>& input_data) {
+    Tensor tensor(DataTypeToEnum<T>::value, input_shape);
+    test::FillValues<T>(&tensor, input_data);
+    return tensor;
+  }
+
+  // Creates a new op kernel based on the node definition.
+  Status CreateOpKernel(const NodeDef& node_def,
+                        std::unique_ptr<OpKernel>* op_kernel);
+
+  // Creates a new dataset.
+  Status CreateDataset(OpKernel* kernel, OpKernelContext* context,
+                       DatasetBase** const dataset);
+
+  // Creates a new RangeDataset op kernel. `T` specifies the output dtype of the
+  // op kernel.
+  template <typename T>
+  Status CreateRangeDatasetOpKernel(
+      StringPiece node_name, std::unique_ptr<OpKernel>* range_op_kernel) {
+    DataTypeVector dtypes({tensorflow::DataTypeToEnum<T>::value});
+    std::vector<PartialTensorShape> shapes({{}});
+    NodeDef node_def = test::function::NDef(
+        node_name, "RangeDataset", {"start", "stop", "step"},
+        {{"output_types", dtypes}, {"output_shapes", shapes}});
+
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def, range_op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new RangeDataset dataset. `T` specifies the output dtype of the
+  // RangeDataset op kernel.
+  template <typename T>
+  Status CreateRangeDataset(int64 start, int64 end, int64 step,
+                            StringPiece node_name,
+                            DatasetBase** range_dataset) {
+    std::unique_ptr<OpKernel> range_kernel;
+    TF_RETURN_IF_ERROR(CreateRangeDatasetOpKernel<T>(node_name, &range_kernel));
+    gtl::InlinedVector<TensorValue, 4> range_inputs;
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {start}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {end}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {step}));
+    std::unique_ptr<OpKernelContext> range_context;
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(range_kernel.get(), &range_inputs,
+                                             &range_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*range_kernel, range_inputs));
+    TF_RETURN_IF_ERROR(RunOpKernel(range_kernel.get(), range_context.get()));
+    TF_RETURN_IF_ERROR(
+        GetDatasetFromContext(range_context.get(), 0, range_dataset));
+    return Status::OK();
+  }
+
+  // Creates a new TensorSliceDataset op kernel.
+  Status CreateTensorSliceDatasetKernel(
+      StringPiece node_name, const DataTypeVector& dtypes,
+      const std::vector<PartialTensorShape>& shapes,
+      std::unique_ptr<OpKernel>* tensor_slice_dataset_kernel);
+
+  // Creates a new TensorSliceDataset.
+  Status CreateTensorSliceDataset(StringPiece node_name,
+                                  std::vector<Tensor>* const components,
+                                  DatasetBase** tensor_slice_dataset);
+
+  // Fetches the dataset from the operation context.
+  Status GetDatasetFromContext(OpKernelContext* context, int output_index,
+                               DatasetBase** const dataset);
+
+ protected:
+  // Creates a thread pool for parallel tasks.
+  Status InitThreadPool(int thread_num);
+
+  // Initializes the runtime for computing the dataset operation and registers
+  // the input function definitions. `InitThreadPool()' needs to be called
+  // before this method if we want to run the tasks in parallel.
+  Status InitFunctionLibraryRuntime(const std::vector<FunctionDef>& flib,
+                                    int cpu_num);
+
+  // Runs an operation producing outputs.
+  Status RunOpKernel(OpKernel* op_kernel, OpKernelContext* context);
+
+  // Checks that the size of `inputs` matches the requirement of the op kernel.
+  Status CheckOpKernelInput(const OpKernel& kernel,
+                            const gtl::InlinedVector<TensorValue, 4>& inputs);
+
+  // Creates a new context for running the dataset operation.
+  Status CreateOpKernelContext(OpKernel* kernel,
+                               gtl::InlinedVector<TensorValue, 4>* inputs,
+                               std::unique_ptr<OpKernelContext>* context);
+
+  // Creates a new iterator context for iterating the dataset.
+  Status CreateIteratorContext(
+      OpKernelContext* const op_context,
+      std::unique_ptr<IteratorContext>* iterator_context);
+
+  // Creates a new serialization context for serializing the dataset and
+  // iterator.
+  Status CreateSerializationContext(
+      std::unique_ptr<SerializationContext>* context);
+
+  // Adds an arrayslice of data into the input vector. `input_types` describes
+  // the required data type for each input tensor. `shape` and `data` describes
+  // the shape and values of the current input tensor. `T` specifies the dtype
+  // of the input data.
+  template <typename T>
+  Status AddDatasetInputFromArray(gtl::InlinedVector<TensorValue, 4>* inputs,
+                                  DataTypeVector input_types,
+                                  const TensorShape& shape,
+                                  const gtl::ArraySlice<T>& data) {
+    TF_RETURN_IF_ERROR(
+        AddDatasetInput(inputs, input_types, DataTypeToEnum<T>::v(), shape));
+    test::FillValues<T>(inputs->back().tensor, data);
+    return Status::OK();
+  }
+
+ private:
+  // Adds an empty tensor with the specified dtype and shape to the input
+  // vector.
+  Status AddDatasetInput(gtl::InlinedVector<TensorValue, 4>* inputs,
+                         DataTypeVector input_types, DataType dtype,
+                         const TensorShape& shape);
+
+ protected:
+  std::unique_ptr<Device> device_;
+  DeviceType device_type_;
+  Allocator* allocator_;  // Owned by `AllocatorFactoryRegistry`.
+  std::vector<AllocatorAttributes> allocator_attrs_;
+  std::unique_ptr<ScopedStepContainer> step_container_;
+
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* flr_;  // Owned by `pflr_`.
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  std::function<void(std::function<void()>)> runner_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<OpKernelContext::Params> params_;
+  std::unique_ptr<checkpoint::TensorSliceReaderCacheWrapper>
+      slice_reader_cache_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  std::vector<std::unique_ptr<Tensor>> tensors_;  // Owns tensors.
+  mutex lock_for_refs_;  // Used as the Mutex for inputs added as refs.
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 4d92d314d3d207d12310bb744b5601ad922bc570..bdf275cd5abac6207ad1b1b2384dd8051153a68a 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -18,11 +18,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 namespace data {
 
-Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+Status ComputeShortCircuitIndices(OpKernelConstruction* ctx,
                                   const NameAttrList& func,
                                   std::vector<int>* indices) {
   FunctionLibraryRuntime::Handle fn_handle;
@@ -141,5 +142,143 @@ Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
   return Status::OK();
 }
 
+namespace {
+
+constexpr char kDelimiter[] = "@@";
+
+}  // namespace
+
+VariantTensorDataReader::VariantTensorDataReader(
+    const tensorflow::VariantTensorData* data)
+    : data_(data) {
+  string metadata;
+  data_->get_metadata(&metadata);
+  auto keys = str_util::Split(metadata, kDelimiter, str_util::SkipEmpty());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    map_[keys[i]] = i;
+  }
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, int64* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, string* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadTensor(StringPiece key, Tensor* val) {
+  return ReadTensorInternal(key, val);
+}
+
+bool VariantTensorDataReader::Contains(StringPiece key) {
+  return map_.find(string(key)) != map_.end();
+}
+
+template <typename T>
+Status VariantTensorDataReader::ReadScalarInternal(StringPiece key, T* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]).scalar<T>()();
+  return Status::OK();
+}
+
+Status VariantTensorDataReader::ReadTensorInternal(StringPiece key,
+                                                   Tensor* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]);
+  return Status::OK();
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key, const int64 val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key,
+                                            const string& val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteTensor(StringPiece key,
+                                            const Tensor& val) {
+  return WriteTensorInternal(key, val);
+}
+
+Status VariantTensorDataWriter::Flush() {
+  string metadata;
+  for (size_t i = 0; i < keys_.size(); ++i) {
+    strings::StrAppend(&metadata, kDelimiter, keys_[i]);
+  }
+  data_->set_metadata(metadata);
+  return Status::OK();
+}
+
+template <typename T>
+Status VariantTensorDataWriter::WriteScalarInternal(StringPiece key,
+                                                    const T& val) {
+  Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+  val_t.scalar<T>()() = val;
+  return WriteTensorInternal(key, val_t);
+}
+
+Status VariantTensorDataWriter::WriteTensorInternal(StringPiece key,
+                                                    const Tensor& val) {
+  DCHECK_EQ(key.find(kDelimiter), string::npos);
+  keys_.push_back(string(key));
+  *(data_->add_tensors()) = val;
+  return Status::OK();
+}
+
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add) {
+  for (const auto& fn : to_add.ListFunctionNames()) {
+    if (auto found = base->Find(fn)) {
+      if (!OpDefEqual(found->signature(), to_add.Find(fn)->signature())) {
+        return errors::InvalidArgument("Cannot add function '", fn,
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fn));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
+
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add) {
+  for (const auto& fd : to_add.function()) {
+    if (auto found = base->Find(fd.signature().name())) {
+      if (!OpDefEqual(found->signature(), fd.signature())) {
+        return errors::InvalidArgument("Cannot add function '",
+                                       fd.signature().name(),
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fd.signature().name()));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
+
+std::function<void(std::function<void()>)> RunnerWithMaxParallelism(
+    std::function<void(std::function<void()>)> runner, int max_parallelism) {
+  return std::bind(
+      [max_parallelism](
+          // Note: `runner` is a const reference to avoid copying it.
+          const std::function<void(std::function<void()>)>& runner,
+          std::function<void()> fn) {
+        std::function<void()> scoped_fn = std::bind(
+            [max_parallelism](const std::function<void()>& fn) {
+              ScopedPerThreadMaxParallelism scope(max_parallelism);
+              fn();
+            },
+            std::move(fn));
+        runner(std::move(scoped_fn));
+      },
+      std::move(runner), std::placeholders::_1);
+}
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 23a3d93ed160c95099a5c8ddb237b4c055a1845c..ad2abdb43da0da4095fbd45b5631245bc93ca9b2 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -33,7 +33,7 @@ namespace data {
 // Returns non-ok status if analysis of the function fails.
 //
 // TODO(jsimsa): Extend this to support constants as well.
-Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+Status ComputeShortCircuitIndices(OpKernelConstruction* ctx,
                                   const NameAttrList& func,
                                   std::vector<int>* indices);
 
@@ -57,6 +57,59 @@ Status VerifyTypesMatch(const DataTypeVector& expected,
 Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
                               const std::vector<PartialTensorShape>& received);
 
+// Helper class for reading data from a VariantTensorData object.
+class VariantTensorDataReader : public IteratorStateReader {
+ public:
+  explicit VariantTensorDataReader(const VariantTensorData* data);
+
+  // Returns OK iff the initialization was successful.
+  Status ReadScalar(StringPiece key, int64* val) override;
+  Status ReadScalar(StringPiece key, string* val) override;
+  Status ReadTensor(StringPiece key, Tensor* val) override;
+  bool Contains(StringPiece key) override;
+
+ private:
+  template <typename T>
+  Status ReadScalarInternal(StringPiece key, T* val);
+  Status ReadTensorInternal(StringPiece key, Tensor* val);
+
+  std::map<string, size_t> map_;
+  const VariantTensorData* data_;  // Not owned.
+};
+
+// Helper class for writing data to a VariantTensorData object.
+class VariantTensorDataWriter : public IteratorStateWriter {
+ public:
+  // Does not take ownership of data.
+  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
+  Status WriteScalar(StringPiece key, const int64 val) override;
+  Status WriteScalar(StringPiece key, const string& val) override;
+  Status WriteTensor(StringPiece key, const Tensor& val) override;
+
+  // Writes the metadata to `data_`.
+  Status Flush();
+
+ private:
+  template <typename T>
+  Status WriteScalarInternal(StringPiece key, const T& val);
+  Status WriteTensorInternal(StringPiece key, const Tensor& val);
+
+  VariantTensorData* data_;
+  std::vector<string> keys_;
+};
+
+// Adds the functions in `to_add` to `base`. If a function with a matching
+// signature already exists in `base`, replaces it with the function from
+// `to_add`.
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add);
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add);
+
+// Creates a runner that runs functions with limited parallelism.
+std::function<void(std::function<void()>)> RunnerWithMaxParallelism(
+    std::function<void(std::function<void()>)> runner, int max_parallelism);
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
index 43295b8ebb8f9df2acae8e17162f2d307dd4d9c5..23ae9d4a26eda0eaa5f741695b436a85fd8ea6c9 100644
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -14,14 +14,18 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-TEST(DatasetUtils, ComputeMoveVector) {
+TEST(DatasetUtilsTest, ComputeMoveVector) {
   struct TestCase {
     std::vector<int> indices;
     std::vector<bool> expected;
@@ -41,6 +45,132 @@ TEST(DatasetUtils, ComputeMoveVector) {
   }
 }
 
+TEST(DatasetUtilsTest, VariantTensorDataRoundtrip) {
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(writer.WriteScalar("Int64", 24));
+  Tensor input_tensor(DT_FLOAT, {1});
+  input_tensor.flat<float>()(0) = 2.0f;
+  TF_ASSERT_OK(writer.WriteTensor("Tensor", input_tensor));
+  TF_ASSERT_OK(writer.Flush());
+
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  TF_ASSERT_OK(reader.ReadScalar("Int64", &val_int64));
+  EXPECT_EQ(val_int64, 24);
+  Tensor val_tensor;
+  TF_ASSERT_OK(reader.ReadTensor("Tensor", &val_tensor));
+  EXPECT_EQ(input_tensor.NumElements(), val_tensor.NumElements());
+  EXPECT_EQ(input_tensor.flat<float>()(0), val_tensor.flat<float>()(0));
+}
+
+TEST(DatasetUtilsTest, VariantTensorDataNonExistentKey) {
+  VariantTensorData data;
+  strings::StrAppend(&data.metadata_, "key1", "@@");
+  data.tensors_.push_back(Tensor(DT_INT64, {1}));
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  string val_string;
+  Tensor val_tensor;
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_int64).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_string).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadTensor("NonExistentKey", &val_tensor).code());
+}
+
+TEST(DatasetUtilsTest, AddToFunctionLibrary) {
+  auto make_fn_a = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/{{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node:output:0"}});
+  };
+
+  auto make_fn_b = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/
+        {{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}},
+         {{"node2"}, "Identity", {"node:output:0"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node2:output:0"}});
+  };
+
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = make_fn_a("0");
+  *fdef_base.add_function() = make_fn_a("1");
+  *fdef_base.add_function() = make_fn_a("2");
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = make_fn_b("0");  // Override
+  *fdef_to_add.add_function() = make_fn_a("1");  // Do nothing
+  *fdef_to_add.add_function() = make_fn_b("3");  // Add new function
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_0, fdef_to_add));
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_1, flib_to_add));
+
+  for (const auto& flib : {flib_0, flib_1}) {
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("0"), make_fn_b("0")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("1"), make_fn_a("1")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("2"), make_fn_a("2")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("3"), make_fn_b("3")));
+  }
+}
+
+TEST(DatasetUtilsTest, AddToFunctionLibraryWithConflictingSignatures) {
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}});
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64", "ret2: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}, {"ret2", "arg"}});
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  Status s = AddToFunctionLibrary(&flib_0, fdef_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  s = AddToFunctionLibrary(&flib_1, flib_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+}
+
+TEST(DatasetUtilsTest, RunnerWithMaxParallelism) {
+  auto runner =
+      RunnerWithMaxParallelism([](const std::function<void()> fn) { fn(); }, 2);
+  auto fn = []() { ASSERT_EQ(GetPerThreadMaxParallelism(), 2); };
+  runner(fn);
+}
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 2c3ffad37509f9fe9d8e763ec510b8587f40e303..060ae6e1374c376c49b9702b77b5622c319acb4c 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -21,6 +21,20 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "choose_fastest_branch_dataset_op",
+    srcs = ["choose_fastest_branch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+        "//tensorflow/core/kernels/data:take_dataset_op",
+    ],
+)
+
 tf_kernel_library(
     name = "csv_dataset_op",
     srcs = ["csv_dataset_op.cc"],
@@ -54,6 +68,21 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "auto_shard_dataset_op",
+    srcs = ["auto_shard_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/optimizers/data:auto_shard",
+        "//tensorflow/core/kernels/data:graph_rewrite_dataset",
+    ],
+)
+
 tf_kernel_library(
     name = "group_by_reducer_dataset_op",
     srcs = ["group_by_reducer_dataset_op.cc"],
@@ -119,15 +148,17 @@ tf_kernel_library(
     name = "map_and_batch_dataset_op",
     srcs = ["map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "//tensorflow/core/kernels/data:dataset_utils",
+        "//tensorflow/core/kernels/data:stats_utils",
     ],
 )
 
@@ -169,11 +200,13 @@ tf_kernel_library(
     name = "numa_map_and_batch_dataset_op",
     srcs = ["numa_map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "@com_google_absl//absl/memory",
@@ -200,7 +233,9 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels/data:parallel_map_iterator",
+        "//tensorflow/core/kernels/data:stats_utils",
     ],
 )
 
@@ -227,6 +262,21 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "rebatch_dataset_op",
+    srcs = ["rebatch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/optimizers/data:rebatch",
+        "//tensorflow/core/kernels/data:graph_rewrite_dataset",
+    ],
+)
+
 tf_kernel_library(
     name = "scan_dataset_op",
     srcs = ["scan_dataset_op.cc"],
@@ -339,6 +389,7 @@ tf_kernel_library(
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset_utils",
         "//third_party/eigen3",
     ],
 )
@@ -369,6 +420,8 @@ tf_kernel_library(
     name = "dataset_kernels",
     deps = [
         ":assert_next_dataset_op",
+        ":auto_shard_dataset_op",
+        ":choose_fastest_branch_dataset_op",
         ":choose_fastest_dataset_op",
         ":csv_dataset_op",
         ":dense_to_sparse_batch_dataset_op",
@@ -386,6 +439,7 @@ tf_kernel_library(
         ":parse_example_dataset_op",
         ":prefetching_kernels",
         ":random_dataset_op",
+        ":rebatch_dataset_op",
         ":scan_dataset_op",
         ":set_stats_aggregator_dataset_op",
         ":sleep_dataset_op",
diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3728c64ab5df80e1612cceec606d8ddb15ab9a48
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc
@@ -0,0 +1,118 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kOptimizerName[] = "tf_auto_shard";
+
+class AutoShardDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit AutoShardDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 index;
+    int64 num_workers;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
+    OP_REQUIRES(
+        ctx, num_workers > 0,
+        errors::InvalidArgument("num_workers must be greater than zero."));
+
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "index", &index));
+    OP_REQUIRES(ctx, index >= 0 && index < num_workers,
+                errors::InvalidArgument("index must be between 0 and ",
+                                        num_workers - 1));
+
+    Dataset* dataset = new Dataset(ctx, input, num_workers, index,
+                                   output_types_, output_shapes_);
+    const Status s = dataset->Optimize(ctx);
+
+    if (s.ok()) {
+      *output = dataset;
+    } else {
+      dataset->Unref();
+      OP_REQUIRES_OK(ctx, s);
+    }
+  }
+
+ private:
+  class Dataset : public GraphRewriteDataset {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const int64 num_workers, const int64 index,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
+          num_workers_(num_workers),
+          index_(index) {}
+
+    string DebugString() const override {
+      return "AutoShardDatasetOp::Dataset";
+    }
+
+   private:
+    bool ShouldOptimizeFunctions() override {
+      // We only want to optimize functions for some particular datasets like
+      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+      // function optimization and explicitly handle function modifications
+      // for those datasets in the rewrite.
+      return false;
+    }
+
+    RewriterConfig CreateGrapplerRewriteConfig() override {
+      RewriterConfig rewriter_config;
+      rewriter_config.set_fail_on_optimizer_errors(true);
+      rewriter_config.add_optimizers(kOptimizerName);
+      rewriter_config.set_meta_optimizer_iterations(
+          RewriterConfig_NumIterationsType_ONE);
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      AttrValue num_workers_attr;
+      num_workers_attr.set_i(num_workers_);
+      (*custom_optimizer->mutable_parameter_map())["num_workers"] =
+          num_workers_attr;
+
+      AttrValue index_attr;
+      index_attr.set_i(index_);
+      (*custom_optimizer->mutable_parameter_map())["index"] = index_attr;
+
+      return rewriter_config;
+    }
+
+    const int64 num_workers_;
+    const int64 index_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalAutoShardDataset").Device(DEVICE_CPU),
+                        AutoShardDatasetOp);
+
+}  // anonymous namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb1c35411ff4e0244e4896f4b4b000750cae4b5c
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/choose_fastest_branch_dataset_op.cc
@@ -0,0 +1,557 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/take_dataset_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+static const double kPercentile = 90.0;
+
+// Each instance of this class wraps an iterator. Whenever an iterator created
+// for this dataset invokes the `GetNext` method, the call is delegated to the
+// wrapped iterator's `GetNext` method.
+class WrapperDataset : public DatasetBase {
+ public:
+  WrapperDataset(DatasetContext::Params params,
+                 const DataTypeVector* output_dtypes,
+                 const std::vector<PartialTensorShape>* output_shapes,
+                 IteratorBase* iterator)
+      : DatasetBase(DatasetContext(std::move(params))),
+        output_dtypes_(output_dtypes),
+        output_shapes_(output_shapes),
+        real_iterator_(iterator) {}
+
+  const DataTypeVector& output_dtypes() const override {
+    return *output_dtypes_;
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return *output_shapes_;
+  }
+
+  string DebugString() const override { return "WrapperDataset"; }
+
+ protected:
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** node) const override {
+    return errors::Unimplemented(DebugString(), "::AsGraphDefInternal");
+  }
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override {
+    // MakeIterator should only be called once per WrapperDataset. However,
+    // since this function expects an iterator return value, we raise the
+    // error only at iterator initialization time.
+    bool error = iterator_created_;
+    iterator_created_ = true;
+    return absl::make_unique<WrapperIterator>(
+        WrapperIterator::Params{this, strings::StrCat(prefix, "::Wrapper")},
+        error);
+  }
+
+ private:
+  class WrapperIterator : public DatasetIterator<WrapperDataset> {
+   public:
+    explicit WrapperIterator(const Params& params, bool error)
+        : DatasetIterator<WrapperDataset>(params), error_(error) {}
+
+    Status Initialize(IteratorContext* ctx) override {
+      if (error_) {
+        return errors::InvalidArgument(
+            "Cannot create more than one WrapperIterator per WrapperDataset. "
+            "Make sure the branches to ChooseFastestDataset do not expect the "
+            "input to repeat.");
+      }
+      return Status::OK();
+    }
+
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      return dataset()->real_iterator_->GetNext(ctx, out_tensors,
+                                                end_of_sequence);
+    }
+
+   protected:
+    std::shared_ptr<model::Node> CreateNode(
+        IteratorContext* ctx, model::Node::Args args) const override {
+      return model::MakeKnownRatioNode(std::move(args), /*ratio=*/1.0);
+    }
+
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      return Status::OK();
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      return Status::OK();
+    }
+
+   private:
+    const bool error_;
+  };
+
+  mutable bool iterator_created_ = false;
+  const DataTypeVector* const output_dtypes_;
+  const std::vector<PartialTensorShape>* const output_shapes_;
+  IteratorBase* const real_iterator_;  // not owned.
+};
+
+// This Dataset picks between some dataset function branches. Each function is
+// expected to input a dataset and output a dataset. The datasets in the
+// branches are expected to be stateless. For each iterator that can be produced
+// by a functions output, it is expected to call the input dataset's
+// MakeIterator method at most once; otherwise, undefined behavior may occur.
+class ChooseFastestBranchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ChooseFastestBranchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &funcs_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_elements_per_branch",
+                                     &num_elements_per_branch_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("other_arguments_lengths",
+                                     &other_arguments_lengths_));
+    OP_REQUIRES(
+        ctx, funcs_.size() == other_arguments_lengths_.size(),
+        errors::InvalidArgument(
+            "branches and other_arguments_lengths must have the same length."));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "ratio_numerator",
+                                                   &ratio_numerator_));
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "ratio_denominator",
+                                                   &ratio_denominator_));
+    OP_REQUIRES(ctx, ratio_numerator_ > 0,
+                errors::InvalidArgument(
+                    "`ratio_numerator` must be greater than zero."));
+    OP_REQUIRES(ctx, ratio_denominator_ > 0,
+                errors::InvalidArgument(
+                    "`ratio_denominator` must be greater than zero."));
+    OP_REQUIRES(ctx, num_elements_per_branch_ % ratio_denominator_ == 0,
+                errors::InvalidArgument("`num_elements_per_branch` must be "
+                                        "divisible by `ratio_denominator`."));
+
+    std::vector<std::unique_ptr<CapturedFunction>> captured_funcs(
+        funcs_.size());
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+
+    // Keeps track of starting index into other_arguments for a given function.
+    int index = 0;
+    for (int i = 0; i < funcs_.size(); ++i) {
+      std::vector<Tensor> captured_args;
+      captured_args.reserve(other_arguments_lengths_[i]);
+      int end_index = index + other_arguments_lengths_[i];
+      for (; index < end_index; ++index) {
+        captured_args.push_back(inputs[index]);
+      }
+      OP_REQUIRES_OK(
+          ctx, CapturedFunction::Create(
+                   funcs_[i], ctx, std::move(captured_args),
+                   /*use_inter_op_parallelism=*/true, &captured_funcs[i]));
+    }
+    *output =
+        new Dataset(ctx, input, funcs_, std::move(captured_funcs),
+                    output_types_, output_shapes_, num_elements_per_branch_,
+                    ratio_numerator_, ratio_denominator_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, DatasetBase* input,
+            const std::vector<NameAttrList>& funcs,
+            std::vector<std::unique_ptr<CapturedFunction>> captured_funcs,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            int64 num_elements_per_branch, int64 ratio_numerator,
+            int64 ratio_denominator)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          funcs_(funcs),
+          captured_funcs_(std::move(captured_funcs)),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          num_elements_per_branch_(num_elements_per_branch),
+          ratio_numerator_(ratio_numerator),
+          ratio_denominator_(ratio_denominator) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<ChooseFastestIterator>(
+          ChooseFastestIterator::Params{
+              this, strings::StrCat(prefix, "::ChooseFastestBranch")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ChooseFastestBranchDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      // TODO(rachelim): this might be wrong if the ratio is not fixed, for
+      // example, from a BatchDataset with drop_remainder = False
+      return static_cast<double>(n) * ratio_numerator_ / ratio_denominator_;
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+
+      Node* ratio_numerator_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(ratio_numerator_, &ratio_numerator_node));
+      Node* ratio_denominator_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(ratio_denominator_, &ratio_denominator_node));
+
+      std::vector<int32> other_arguments_lengths;
+      other_arguments_lengths.reserve(captured_funcs_.size());
+      int num_captured_inputs = 0;
+      for (const auto& func : captured_funcs_) {
+        num_captured_inputs += func->captured_inputs().size();
+        other_arguments_lengths.push_back(func->captured_inputs().size());
+      }
+      DataTypeVector other_arguments_types;
+      std::vector<Node*> other_arguments;
+      other_arguments_types.reserve(num_captured_inputs);
+      other_arguments.reserve(num_captured_inputs);
+      for (const auto& func : captured_funcs_) {
+        for (const Tensor& t : func->captured_inputs()) {
+          Node* node;
+          DatasetBase* input;
+          Status s = GetDatasetFromVariantTensor(t, &input);
+          if (s.ok()) {
+            TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+          } else {
+            TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+          }
+          other_arguments.emplace_back(node);
+          other_arguments_types.emplace_back(t.dtype());
+        }
+      }
+
+      // Targuments
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      // num_elements_per_branch
+      AttrValue num_elements_per_branch_attr;
+      b->BuildAttrValue(num_elements_per_branch_,
+                        &num_elements_per_branch_attr);
+
+      // branches
+      AttrValue branches_attr;
+      b->BuildAttrValue(funcs_, &branches_attr);
+      for (const auto& func : funcs_) {
+        TF_RETURN_IF_ERROR(b->AddFunction(ctx, func.name()));
+      }
+
+      // other_arguments_lengths
+      AttrValue other_arguments_lengths_attr;
+      b->BuildAttrValue(other_arguments_lengths, &other_arguments_lengths_attr);
+
+      return b->AddDataset(
+          this,
+          /*inputs=*/
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(1, ratio_numerator_node),
+           std::make_pair(2, ratio_denominator_node)},
+          /*list_inputs=*/{std::make_pair(3, other_arguments)},
+          /*attrs=*/
+          {std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("num_elements_per_branch",
+                          num_elements_per_branch_attr),
+           std::make_pair("branches", branches_attr),
+           std::make_pair("other_arguments_lengths",
+                          other_arguments_lengths_attr)},
+          output);
+    }
+
+   private:
+    // This iterator picks the fastest of dataset branches by running
+    // experiments for the first dataset()->num_elements_per_branch_ *
+    // num_branches iterations.
+    class ChooseFastestIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit ChooseFastestIterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            instantiated_captured_funcs_(dataset()->funcs_.size()),
+            histograms_(dataset()->funcs_.size()) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+
+        for (int i = 0; i < dataset()->funcs_.size(); ++i) {
+          TF_RETURN_IF_ERROR(dataset()->captured_funcs_[i]->Instantiate(
+              ctx, &instantiated_captured_funcs_[i]));
+        }
+
+        return Status::OK();
+      }
+
+      // The first num_elements_per_branch * num_branches iterations, we run
+      // experiments on the branches, using (branch_index_, experiment_counter_)
+      // to keep track of which experiment we're on.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        {  // Locking scope
+          mutex_lock l(mu_);
+          if (branch_index_ < dataset()->funcs_.size()) {
+            // Still running experiments
+            if (!current_iterator_) {
+              TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, branch_index_,
+                                                     /*is_experiment=*/true));
+            }
+
+            Status s = GetNextFromExperiment(ctx, out_tensors, end_of_sequence);
+            experiment_counter_++;
+
+            if (experiment_counter_ >= dataset()->num_elements_per_branch_) {
+              // Done experimenting with this branch. Increment the branch index
+              // so that on the next iteration, we will draw from the next
+              // branch.
+              experiment_counter_ = 0;
+              branch_index_++;
+              current_iterator_.reset();
+            }
+            return s;
+          }
+          if (!current_iterator_) {
+            SelectFastestInputIndex();
+            TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, fastest_index_,
+                                                   /*is_experiment=*/false));
+          }
+        }
+
+        return current_iterator_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(
+            std::move(args),
+            /*ratio=*/static_cast<double>(dataset()->ratio_numerator_) /
+                dataset()->ratio_denominator_);
+      }
+
+      // TODO(rachelim): Save and restore histogram state as well. Currently,
+      // if an iterator is saved and restored, the histograms start recording
+      // from scratch.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("experiment_counter"),
+                                               experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("branch_index"), branch_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("fastest_index"), fastest_index_));
+        if (current_iterator_) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, current_iterator_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("experiment_counter"),
+                                              &experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("branch_index"), &branch_index_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("fastest_index"), &fastest_index_));
+
+        // Restore state of `current_iterator_` if it exists.
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          if (branch_index_ < dataset()->funcs_.size()) {
+            TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, branch_index_,
+                                                   /*is_experiment=*/true));
+          } else {
+            TF_RETURN_IF_ERROR(MakeCurrentIterator(ctx, fastest_index_,
+                                                   /*is_experiment=*/false));
+          }
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, current_iterator_));
+        }
+        return Status::OK();
+      }
+
+     private:
+      Status GetNextFromExperiment(IteratorContext* ctx,
+                                   std::vector<Tensor>* out_tensors,
+                                   bool* end_of_sequence)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        DCHECK_GE(branch_index_, 0);
+        DCHECK_LT(branch_index_, histograms_.size());
+
+        int64 start = Env::Default()->NowNanos();
+        Status s =
+            current_iterator_->GetNext(ctx, out_tensors, end_of_sequence);
+
+        histograms_[branch_index_].Add(
+            static_cast<double>(Env::Default()->NowNanos() - start));
+        return s;
+      }
+
+      // Select the fastest input to use based on the histograms of timings
+      // of the completed iterations. The input with the best 90th percentile
+      // iteration time is selected.
+      void SelectFastestInputIndex() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        fastest_index_ = 0;
+
+        VLOG(2) << "90.0 percentile iteration time:";
+        double best_percentile = histograms_[0].Percentile(kPercentile);
+        VLOG(2) << "Branch 0: " << best_percentile;
+        for (size_t i = 1, num_inputs = histograms_.size(); i < num_inputs;
+             ++i) {
+          double percentile = histograms_[i].Percentile(kPercentile);
+          VLOG(2) << "Branch " << i << ": " << percentile;
+          if (percentile <= best_percentile) {
+            best_percentile = percentile;
+            fastest_index_ = i;
+          }
+        }
+        VLOG(1) << "Selecting index " << fastest_index_
+                << " as the fastest index.";
+      }
+
+      Status MakeCurrentIterator(IteratorContext* ctx, int64 branch_index,
+                                 bool is_experiment)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        DCHECK_GE(branch_index, 0);
+        DCHECK_LT(branch_index, histograms_.size());
+
+        // `StoreDatasetInVariantTensor` transfers ownership of the dataset
+        // to the tensor, so the tensor must persist between iterations.
+        wrapper_dataset_tensor_ =
+            absl::make_unique<Tensor>(DT_VARIANT, TensorShape({}));
+
+        DatasetContext::Params params;
+        params.type_string = "ChooseFastestBranch_Wrapper";
+        params.node_name = strings::StrCat(params.type_string, branch_index);
+        DatasetBase* temp_dataset =
+            new WrapperDataset(std::move(params), &dataset()->output_types_,
+                               &dataset()->output_shapes_, input_impl_.get());
+
+        if (is_experiment) {
+          // When running experiment iterations, we add a TakeDataset in between
+          // the input and the function datasets. This is so that function
+          // datasets with prefetching behavior won't consume more input
+          // elements than they actually use to produce output.
+          DatasetContext::Params take_dataset_params;
+          take_dataset_params.type_string = "ChooseFastestBranch_Take";
+          take_dataset_params.node_name =
+              strings::StrCat(take_dataset_params.type_string, branch_index);
+          int64 count = dataset()->num_elements_per_branch_ *
+                        dataset()->ratio_numerator_ /
+                        dataset()->ratio_denominator_;
+          temp_dataset = new TakeDataset(std::move(take_dataset_params), count,
+                                         temp_dataset);
+        }
+
+        TF_RETURN_IF_ERROR(StoreDatasetInVariantTensor(
+            temp_dataset, wrapper_dataset_tensor_.get()));
+
+        TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
+            ctx, {*wrapper_dataset_tensor_}, branch_index,
+            *instantiated_captured_funcs_[branch_index], prefix(),
+            &current_iterator_));
+
+        return Status::OK();
+      }
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<std::unique_ptr<InstantiatedCapturedFunction>>
+          instantiated_captured_funcs_ GUARDED_BY(mu_);
+
+      // For tracking the time taken for each input's iterations.
+      std::vector<histogram::Histogram> histograms_ GUARDED_BY(mu_);
+      int64 fastest_index_ = -1;
+      std::unique_ptr<Tensor> wrapper_dataset_tensor_;
+      std::unique_ptr<IteratorBase> current_iterator_;
+
+      // Keeps track of which (branch, experiment) the next iteration is on.
+      int64 branch_index_ GUARDED_BY(mu_) = 0;
+      int64 experiment_counter_ GUARDED_BY(mu_) = 0;
+    };  // class Iterator
+
+    const DatasetBase* const input_;
+    std::vector<NameAttrList> funcs_;
+    const std::vector<std::unique_ptr<CapturedFunction>> captured_funcs_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const int64 num_elements_per_branch_;
+    const int64 ratio_numerator_;
+    const int64 ratio_denominator_;
+  };  // class Dataset
+
+  int64 ratio_numerator_;
+  int64 ratio_denominator_;
+  int64 num_elements_per_branch_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  std::vector<NameAttrList> funcs_;
+  std::vector<int32> other_arguments_lengths_;
+};  // class ChooseFastestBranchDatasetOp
+
+// Register the kernel implementation for ChooseFastestBranchDataset.
+REGISTER_KERNEL_BUILDER(Name("ChooseFastestBranchDataset").Device(DEVICE_CPU),
+                        ChooseFastestBranchDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
index f66d5d9955a43e5d1e95730e46b437b450184e3c..1ae86c1dbfaefe9886855a116c0e7c4ca939e6e9 100644
--- a/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
@@ -31,6 +31,8 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
   explicit ChooseFastestDatasetOp(OpKernelConstruction* ctx)
       : DatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("num_experiments", &num_experiments_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
@@ -49,45 +51,43 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
       inputs.push_back(input);
     }
 
-    const DataTypeVector& output_types = inputs[0]->output_dtypes();
     for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
-      OP_REQUIRES(ctx, inputs[i]->output_dtypes() == output_types,
-                  errors::InvalidArgument(
-                      "All inputs to ChooseFastestDataset "
-                      "must have the same output types. Input ",
-                      i, " has output types: ",
-                      DataTypeVectorString(inputs[i]->output_dtypes()),
-                      ", while all prior inputs have types: ",
-                      DataTypeVectorString(output_types), "."));
+      OP_REQUIRES(
+          ctx, inputs[i]->output_dtypes() == output_types_,
+          errors::InvalidArgument(
+              "All inputs to ChooseFastestDataset "
+              "must have the same output types. Input ",
+              i, " has output types: ",
+              DataTypeVectorString(inputs[i]->output_dtypes()),
+              ". Expected: ", DataTypeVectorString(output_types_), "."));
     }
 
-    std::vector<PartialTensorShape> output_shapes = inputs[0]->output_shapes();
     // Merge the output shapes of all the input datasets, returning an
     // error if any of them are incompatible.
     for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
       OP_REQUIRES(
-          ctx, inputs[i]->output_shapes().size() == output_shapes.size(),
+          ctx, inputs[i]->output_shapes().size() == output_shapes_.size(),
           errors::InvalidArgument(
               "All inputs to ChooseFastestDataset must have compatible outputs."
               " Input ",
               i, " has ", inputs[i]->output_shapes().size(),
-              " components, while all prior inputs have ", output_shapes.size(),
+              " components. Expected to have ", output_shapes_.size(),
               " components."));
-      for (size_t j = 0, num_components = output_shapes.size();
+      for (size_t j = 0, num_components = output_shapes_.size();
            j < num_components; ++j) {
         PartialTensorShape result;
-        OP_REQUIRES(
-            ctx,
-            output_shapes[j]
-                .MergeWith(inputs[i]->output_shapes().at(j), &result)
-                .ok(),
-            errors::InvalidArgument(
-                "All inputs to ChooseFastestDataset must have "
-                "compatible output shapes. Component ",
-                j, " of input ", i, " has shape: ",
-                inputs[i]->output_shapes().at(j), ", while components ", j,
-                " of all prior inputs have shape: ", output_shapes[j], "."));
-        output_shapes[j] = std::move(result);
+        OP_REQUIRES(ctx,
+                    output_shapes_[j]
+                        .MergeWith(inputs[i]->output_shapes().at(j), &result)
+                        .ok(),
+                    errors::InvalidArgument(
+                        "All inputs to ChooseFastestDataset must have "
+                        "compatible output shapes. Component ",
+                        j, " of input ", i,
+                        " has shape: ", inputs[i]->output_shapes().at(j),
+                        ". Expected to be compatible with shape: ",
+                        output_shapes_[j], "."));
+        output_shapes_[j] = std::move(result);
       }
     }
 
@@ -108,7 +108,7 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
                 "."));
       }
     }
-    *output = new Dataset(ctx, std::move(inputs), std::move(output_shapes),
+    *output = new Dataset(ctx, std::move(inputs), output_types_, output_shapes_,
                           cardinality, num_experiments_);
   }
 
@@ -116,11 +116,13 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, std::vector<DatasetBase*> inputs,
-            std::vector<PartialTensorShape> output_shapes, int64 cardinality,
-            int64 num_experiments)
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            int64 cardinality, int64 num_experiments)
         : DatasetBase(DatasetContext(ctx)),
           inputs_(std::move(inputs)),
-          output_shapes_(std::move(output_shapes)),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
           cardinality_(cardinality),
           num_experiments_(num_experiments) {
       for (auto input : inputs_) {
@@ -142,7 +144,7 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
     }
 
     const DataTypeVector& output_dtypes() const override {
-      return inputs_[0]->output_dtypes();
+      return output_types_;
     }
 
     const std::vector<PartialTensorShape>& output_shapes() const override {
@@ -215,8 +217,7 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
           }
           return threads[0].result->status;
         }
-        return input_impls_[fastest_index_]->GetNext(ctx, out_tensors,
-                                                     end_of_sequence);
+        return fastest_input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
       }
 
      protected:
@@ -230,7 +231,14 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
       // from scratch.
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        if (input_impls_.empty()) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("experiment_counter"),
+                                               experiment_counter_));
+
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("fastest_index"), fastest_index_));
+        if (fastest_index_ != -1) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, fastest_input_impl_));
+        } else if (input_impls_.empty()) {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("input_impls_empty"), ""));
         } else {
@@ -238,17 +246,22 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
             TF_RETURN_IF_ERROR(SaveInput(writer, input_impl));
           }
         }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("experiment_counter"),
-                                               experiment_counter_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("fastest_index"), fastest_index_));
         return Status::OK();
       }
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        if (reader->Contains(full_name("input_impls_empty"))) {
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("experiment_counter"),
+                                              &experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("fastest_index"), &fastest_index_));
+        if (fastest_index_ != -1) {
+          TF_RETURN_IF_ERROR(dataset()->inputs_[fastest_index_]->MakeIterator(
+              ctx, strings::StrCat(prefix(), "_", fastest_index_),
+              &fastest_input_impl_));
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, fastest_input_impl_));
+        } else if (reader->Contains(full_name("input_impls_empty"))) {
           input_impls_.clear();
         } else {
           DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
@@ -256,10 +269,6 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
             TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl));
           }
         }
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("experiment_counter"),
-                                              &experiment_counter_));
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("fastest_index"), &fastest_index_));
         return Status::OK();
       }
 
@@ -277,6 +286,7 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
       };
 
       std::vector<std::unique_ptr<IteratorBase>> input_impls_;
+      std::unique_ptr<IteratorBase> fastest_input_impl_;
       // For tracking the time taken for each input's iterations.
       std::vector<histogram::Histogram> histograms_;
 
@@ -290,10 +300,10 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
         for (size_t i = 0, num_inputs = dataset()->inputs_.size();
              i < num_inputs; ++i) {
           threads[i].result = absl::make_unique<InvocationResult>();
-          threads[i].thread.reset(ctx->env()->StartThread(
-              {}, strings::StrCat("tf_data_merge_", i),
+          threads[i].thread = ctx->StartThread(
+              strings::StrCat("tf_data_merge_", i),
               std::bind(&ChooseFastestIterator::RunnerThread, this, ctx,
-                        threads[i].result.get(), i)));
+                        threads[i].result.get(), i));
         }
         return threads;
       }
@@ -315,25 +325,36 @@ class ChooseFastestDatasetOp : public DatasetOpKernel {
       void SelectFastestInputIndex() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         fastest_index_ = 0;
 
+        VLOG(2) << "90.0 percentile iteration time:";
         double best_percentile = histograms_[0].Percentile(kPercentile);
+        VLOG(2) << "Branch 0: " << best_percentile;
         for (size_t i = 1, num_inputs = histograms_.size(); i < num_inputs;
              ++i) {
           double percentile = histograms_[i].Percentile(kPercentile);
+          VLOG(2) << "Branch " << i << ": " << percentile;
           if (percentile <= best_percentile) {
             best_percentile = percentile;
             fastest_index_ = i;
           }
         }
+        VLOG(1) << "Selecting index " << fastest_index_
+                << " as the fastest index.";
+
+        fastest_input_impl_ = std::move(input_impls_[fastest_index_]);
+        input_impls_.clear();  // Delete the unused iterators.
       }
     };  // class Iterator
 
     const std::vector<DatasetBase*> inputs_;
+    const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
     const int64 cardinality_;
     const int64 num_experiments_;
   };  // class Dataset
 
   int64 num_experiments_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
 };  // class ChooseFastestDatasetOp
 
 // Register the kernel implementation for ChooseFastestDataset.
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index e75e6e4b80bce5dd286ed297c1d645adcdc37a4b..758eef0be5ebb18bbfedff6d0de762a4f792adc0 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -149,7 +149,7 @@ class MaterializedDatasetResource : public ResourceBase {
 
 // A wrapper class for storing an `IndexedDataset` instance in a DT_VARIANT
 // tensor. Objects of the wrapper class own a reference on an instance of an
-// `IndexedTensor` and the wrapper's copy constructor and desctructor take care
+// `IndexedTensor` and the wrapper's copy constructor and destructor take care
 // of managing the reference count.
 //
 // NOTE: This is not a feature-complete implementation of the DT_VARIANT
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index 0ef56915ebe33b0f843fa462ba981c34b6b257f7..53075ce67f851b9a7f50c86867aaee5f1bf8ee45 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -59,6 +60,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
+    OP_REQUIRES_OK(
+        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
  protected:
@@ -86,12 +89,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
     MapAndBatchIteratorFunction map_func;
     CapturedFunction* raw_captured_func = captured_func.get();
-    if (indices.empty()) {
+    if (short_circuit_indices_.empty()) {
       map_func = [](IteratorContext* ctx,
                     InstantiatedCapturedFunction* instantiated_captured_func,
                     const string& prefix, std::vector<Tensor> args,
@@ -101,7 +101,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             ctx, std::move(args), out_tensors.get(), std::move(done), prefix);
       };
     } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
+      std::vector<bool> can_move = ComputeMoveVector(short_circuit_indices_);
+      const auto& indices = short_circuit_indices_;
       map_func = [raw_captured_func, indices, can_move](
                      IteratorContext* ctx,
                      InstantiatedCapturedFunction* instantiated_captured_func,
@@ -263,9 +264,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                                         (params.dataset->num_parallel_calls_ +
                                          params.dataset->batch_size_ - 1) /
                                             params.dataset->batch_size_)) {
-        std::vector<string> components =
-            str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        key_prefix_ = components.back();
       }
 
       ~Iterator() override {
@@ -355,7 +353,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
      private:
-      // BatchResult encapsulates the output batch, as well as anciliary
+      // BatchResult encapsulates the output batch, as well as ancillary
       // metadata required to execute the fused map-and-batch operation.
       struct BatchResult {
         explicit BatchResult(int64 batch_size) {
@@ -403,7 +401,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(key_prefix_, "::thread_utilization"),
+              stats_utils::ThreadUtilizationScalarName(dataset()->node_name()),
               static_cast<float>(num_calls_) /
                   static_cast<float>(num_parallel_calls_->value));
         }
@@ -450,7 +448,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               result->UpdateStatus(allocate_status, offset);
             } else {
               for (size_t i = 0; i < return_values->size(); ++i) {
-                const Tensor& tensor = return_values->at(i);
+                Tensor& tensor = return_values->at(i);
                 Tensor* batch = &(result->output)[i];
                 if (tensor.NumElements() !=
                     (batch->NumElements() / batch->dim_size(0))) {
@@ -468,8 +466,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 // TODO(mrry): Add a version of DoParallelConcat that allows us
                 // to move `tensor` where possible, to speed up string tensor
                 // batching.
-                Status copy_status =
-                    batch_util::CopyElementToSlice(tensor, batch, offset);
+                Status copy_status = batch_util::CopyElementToSlice(
+                    std::move(tensor), batch, offset);
                 if (!copy_status.ok()) {
                   result->UpdateStatus(copy_status, offset);
                   break;
@@ -516,9 +514,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!runner_thread_) {
           auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_map_and_batch",
-              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
+          runner_thread_ = ctx->StartThread(
+              "tf_data_map_and_batch",
+              std::bind(&Iterator::RunnerThread, this, ctx_copy));
         }
       }
 
@@ -648,7 +646,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           if (stats_aggregator) {
             mutex_lock l(*mu_);
             stats_aggregator->AddScalar(
-                strings::StrCat(key_prefix_, "::thread_utilization"),
+                stats_utils::ThreadUtilizationScalarName(
+                    dataset()->node_name()),
                 static_cast<float>(num_calls_) /
                     static_cast<float>(num_parallel_calls_->value));
           }
@@ -805,7 +804,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       int64 waiting_ GUARDED_BY(*mu_) = 0;
       // Identifies the maximum number of batch results to store.
       int64 max_batch_results_ GUARDED_BY(*mu_);
-      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
@@ -826,6 +824,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
   bool preserve_cardinality_;
+  std::vector<int> short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 643b6460e8a838e5e9d6f35e789dc0a82e4f7cc5..ce8a20a783f715e7e291133114f1cfa7b2559d1c 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -807,7 +807,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // inputs. When the runner thread makes new inputs available, it
         // notifies this condition variable.
         condition_variable worker_cond_var_ GUARDED_BY(mu_);
-        // The client threads wait on this condition variable for avaiable
+        // The client threads wait on this condition variable for available
         // batched outputs. When worker threads complete a batch, they notify
         // this condition variable.
         condition_variable client_cond_var_ GUARDED_BY(mu_);
@@ -926,8 +926,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             if (!new_ctx) {
               new_ctx = std::make_shared<IteratorContext>(*ctx);
             }
-            workers_[i]->threads.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_numa_map_and_batch_", i, "_", j),
+            workers_[i]->threads.emplace_back(ctx->StartThread(
+                strings::StrCat("tf_data_numa_map_and_batch_", i, "_", j),
                 [this, new_ctx, i, j]() { WorkerThread(new_ctx, i, j); }));
             VLOG(3) << "Worker " << i << ", " << j << " successfully started.";
           }
@@ -936,9 +936,9 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           if (!new_ctx) {
             new_ctx = std::make_shared<IteratorContext>(*ctx);
           }
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_numa_map_and_batch",
-              [this, new_ctx] { RunnerThread(new_ctx); }));
+          runner_thread_ =
+              ctx->StartThread("tf_data_numa_map_and_batch",
+                               [this, new_ctx] { RunnerThread(new_ctx); });
         }
         VLOG(3) << "All workers & runner thread started.";
         return Status::OK();
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
index f6d522078dda68d52bd0722613ecdcfdd314faf1..54c1d839e6056bf515d4e98cea86a3b426bcc423 100644
--- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -493,8 +493,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           worker_threads_.reserve(dataset()->num_threads());
           for (size_t i = 0; i < dataset()->num_threads(); ++i) {
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+            worker_threads_.emplace_back(ctx->StartThread(
+                strings::StrCat("tf_data_parallel_interleave_worker_", i),
                 [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
           }
         }
@@ -592,8 +592,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             }
             workers_[i].SetInputs(s, std::move(args));
             std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+            worker_threads_.push_back(ctx->StartThread(
+                strings::StrCat("tf_data_parallel_interleave_worker_", i),
                 [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
             if (i < dataset()->cycle_length_) {
               interleave_indices_.push_back(i);
diff --git a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index 00574057344507fe158d36c210e61f15bf92845e..c207cf7ae4fd9d20b467a5b189f21a8d5e540ac2 100644
--- a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/kernels/data/parallel_map_iterator.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/util/example_proto_fast_parsing.h"
 
 namespace tensorflow {
@@ -332,23 +333,26 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
                   << ", got " << serialized_sparse.shape().DebugString()
                   << ").";
             }
-            // TODO(b/111553342): User provided tags instead of fixed tag.
+            // TODO(b/123360128): Add component name to streamz metrics without
+            // breaking TFX metrics.
             if (stats_aggregator) {
               stats_aggregator->IncrementCounter(
-                  "examples_count", "trainer",
+                  stats_utils::kExamplesCount, "trainer",
                   example_result.feature_stats.size());
               for (example::PerExampleFeatureStats feature_stats :
                    example_result.feature_stats) {
                 stats_aggregator->AddToHistogram(
-                    "features",
+                    stats_utils::FeatureHistogramName(dataset_->node_name()),
                     {static_cast<double>(feature_stats.features_count)});
                 stats_aggregator->IncrementCounter(
-                    "features_count", "trainer", feature_stats.features_count);
+                    stats_utils::kFeaturesCount, "trainer",
+                    feature_stats.features_count);
                 stats_aggregator->IncrementCounter(
-                    "feature_values_count", "trainer",
+                    stats_utils::kFeatureValuesCount, "trainer",
                     feature_stats.feature_values_count);
                 stats_aggregator->AddToHistogram(
-                    "feature-values",
+                    stats_utils::FeatureValueHistogramName(
+                        dataset_->node_name()),
                     {static_cast<double>(feature_stats.feature_values_count)});
               }
             }
diff --git a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0397ca01c4ee058bce8079c83c787a4f38f4f578
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
@@ -0,0 +1,101 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kOptimizerName[] = "tf_data_rebatcher";
+
+class RebatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit RebatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 num_workers;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
+    OP_REQUIRES(
+        ctx, num_workers > 0,
+        errors::InvalidArgument("num_workers must be greater than zero."));
+
+    Dataset* dataset =
+        new Dataset(ctx, input, num_workers, output_types_, output_shapes_);
+    Status s = dataset->Optimize(ctx);
+    if (s.ok()) {
+      *output = dataset;
+    } else {
+      dataset->Unref();
+      OP_REQUIRES_OK(ctx, s);
+    }
+  }
+
+ private:
+  class Dataset : public GraphRewriteDataset {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const int64 num_workers, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
+          num_workers_(num_workers) {}
+
+    string DebugString() const override { return "RebatchDatasetOp::Dataset"; }
+
+   private:
+    bool ShouldOptimizeFunctions() override {
+      // We only want to optimize functions for some particular datasets like
+      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+      // function optimization and explicitly handle function modifications
+      // for those datasets in the rewrite.
+      return false;
+    }
+
+    RewriterConfig CreateGrapplerRewriteConfig() override {
+      RewriterConfig rewriter_config;
+      rewriter_config.set_fail_on_optimizer_errors(true);
+      rewriter_config.add_optimizers(kOptimizerName);
+      rewriter_config.set_meta_optimizer_iterations(
+          RewriterConfig_NumIterationsType_ONE);
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      AttrValue num_workers_attr;
+      num_workers_attr.set_i(num_workers_);
+      (*custom_optimizer->mutable_parameter_map())["num_workers"] =
+          num_workers_attr;
+      return rewriter_config;
+    }
+
+    const int64 num_workers_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalRebatchDataset").Device(DEVICE_CPU),
+                        RebatchDatasetOp);
+
+}  // anonymous namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index bf96be4eb005e62d1982adf5662984fe15d59091..be5fa4c789ba842952b01ecb256ffa57629d5afa 100644
--- a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/example/example.pb.h"
-#include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
diff --git a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
index 3a6f70e504ec09007ac21808b1747e299d2b150d..4cd76b071f9a845c2aab3961e5e7516d4a3437a6 100644
--- a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
@@ -39,6 +39,11 @@ class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
   explicit TakeWhileDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
+    OP_REQUIRES_OK(
+        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
+    OP_REQUIRES(
+        ctx, short_circuit_indices_.size() <= 1,
+        errors::InvalidArgument("`predicate` has more than one return value."));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -47,14 +52,8 @@ class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-    OP_REQUIRES(
-        ctx, indices.size() <= 1,
-        errors::InvalidArgument("`predicate` has more than one return value."));
-
     LoopIteratorPredicate loop_pred;
-    if (indices.empty()) {
+    if (short_circuit_indices_.empty()) {
       loop_pred = [](IteratorContext* ctx,
                      InstantiatedCapturedFunction* inst_captured_func,
                      const std::vector<Tensor>& args, bool* end_of_sequence) {
@@ -71,11 +70,12 @@ class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       };
     } else {
-      loop_pred = [indices](IteratorContext* ctx,
-                            InstantiatedCapturedFunction* inst_captured_func,
-                            const std::vector<Tensor>& args,
-                            bool* end_of_sequence) {
-        const Tensor& predicate = args[indices[0]];
+      int predicate_index = short_circuit_indices_[0];
+      loop_pred = [predicate_index](
+                      IteratorContext* ctx,
+                      InstantiatedCapturedFunction* inst_captured_func,
+                      const std::vector<Tensor>& args, bool* end_of_sequence) {
+        const Tensor& predicate = args[predicate_index];
         if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
           return errors::InvalidArgument(
               "`predicate` must returns a scalar bool tensor.");
@@ -240,6 +240,7 @@ class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
   };
 
   NameAttrList func_;
+  std::vector<int> short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExperimentalTakeWhileDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 7a16cda0f3dc83d5c00a2006f94bdecde866bfd5..9d1649bf021080c16f4e4f089b9e5ffd9b17fc01 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -98,8 +99,9 @@ class ThreadPoolHandleOp : public OpKernel {
                                   EXCLUSIVE_LOCKS_REQUIRED(mu_) {
                                     *ret = new ThreadPoolResource(
                                         ctx->env(), {}, display_name_,
-                                        num_threads_, max_intra_op_parallelism_,
-                                        false /* low_latency_hint */);
+                                        num_threads_,
+                                        /*low_latency_hint=*/false,
+                                        max_intra_op_parallelism_);
                                     return Status::OK();
                                   }));
       initialized_ = true;
@@ -306,19 +308,8 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         IteratorContext::Params params(ctx);
         auto max_parallelism = dataset()->max_intra_op_parallelism_;
-        params.runner = std::bind(
-            [max_parallelism](
-                const std::function<void(std::function<void()>)>& runner,
-                std::function<void()> fn) {
-              std::function<void()> scoped_fn = std::bind(
-                  [max_parallelism](const std::function<void()>& fn) {
-                    ScopedPerThreadMaxParallelism scope(max_parallelism);
-                    fn();
-                  },
-                  std::move(fn));
-              (runner)(std::move(scoped_fn));
-            },
-            std::move(*ctx->runner()), std::placeholders::_1);
+        params.runner =
+            RunnerWithMaxParallelism(*ctx->runner(), max_parallelism);
         return input_impl_->GetNext(IteratorContext{std::move(params)},
                                     out_tensors, end_of_sequence);
       }
@@ -346,7 +337,7 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    int64 num_threads;
+    int64 num_threads = 0;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "num_threads", &num_threads));
     OP_REQUIRES(ctx, num_threads >= 1,
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 483d42c8092356ed9fedb70222c7dc96001874b4..2f47721d62064de8a1300f502b7ff4c19e74a226 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -39,6 +40,11 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
+    OP_REQUIRES_OK(
+        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
+    OP_REQUIRES(ctx, short_circuit_indices_.size() <= 1,
+                errors::InvalidArgument(
+                    "predicate function has more than one return value."));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -47,14 +53,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-    OP_REQUIRES(ctx, indices.size() <= 1,
-                errors::InvalidArgument(
-                    "predicate function has more than one return value."));
-
     FilterIteratorPredicate filter_pred;
-    if (indices.empty()) {
+    if (short_circuit_indices_.empty()) {
       filter_pred = [](IteratorContext* ctx,
                        InstantiatedCapturedFunction* inst_captured_func,
                        const std::vector<Tensor>& args, bool* out_matched) {
@@ -71,11 +71,12 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       };
     } else {
-      filter_pred = [indices](IteratorContext* ctx,
-                              InstantiatedCapturedFunction* inst_captured_func,
-                              const std::vector<Tensor>& args,
-                              bool* out_matched) {
-        const Tensor& predicate = args[indices[0]];
+      int predicate_index = short_circuit_indices_[0];
+      filter_pred = [predicate_index](
+                        IteratorContext* ctx,
+                        InstantiatedCapturedFunction* inst_captured_func,
+                        const std::vector<Tensor>& args, bool* out_matched) {
+        const Tensor& predicate = args[predicate_index];
         if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
           return errors::InvalidArgument(
               "Filter predicate `f` must return a scalar bool.");
@@ -167,9 +168,6 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             filtered_elements_(0),
             dropped_elements_(0),
             filter_pred_(std::move(filter_pred)) {
-        std::vector<string> components =
-            str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
       }
 
       Status Initialize(IteratorContext* ctx) override {
@@ -213,13 +211,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
               mutex_lock l(mu_);
               dropped_elements_++;
               stats_aggregator->AddScalar(
-                  strings::StrCat(prefix_end_, "::dropped_elements"),
+                  stats_utils::DroppedElementsScalarName(
+                      dataset()->node_name()),
                   static_cast<float>((dropped_elements_)));
               // TODO(shivaniagrawal): multiple pipelines would collect
               // aggregated number of dropped elements for all the pipelines,
               // exploit tagged_context here.
-              stats_aggregator->IncrementCounter(
-                  prefix_end_, "dropped_elements", static_cast<float>(1));
+              stats_aggregator->IncrementCounter(dataset()->node_name(),
+                                                 stats_utils::kDroppedElements,
+                                                 static_cast<float>(1));
             }
           }
         } while (!matched);
@@ -229,12 +229,13 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
           mutex_lock l(mu_);
           filtered_elements_++;
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::filtered_elements"),
+              stats_utils::FilterdElementsScalarName(dataset()->node_name()),
               static_cast<float>((filtered_elements_)));
           // TODO(shivaniagrawal): multiple pipelines would collect aggregated
           // number of filtered elements for all the pipelines, exploit
           // tagged_context here.
-          stats_aggregator->IncrementCounter(prefix_end_, "filtered_elements",
+          stats_aggregator->IncrementCounter(dataset()->node_name(),
+                                             stats_utils::kFilteredElements,
                                              static_cast<float>(1));
         }
         *end_of_sequence = false;
@@ -281,7 +282,6 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       int64 filtered_elements_ GUARDED_BY(mu_);
       int64 dropped_elements_ GUARDED_BY(mu_);
       const FilterIteratorPredicate filter_pred_;
-      string prefix_end_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
@@ -293,6 +293,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
  private:
   NameAttrList func_;
+  std::vector<int> short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FilterDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 5dff2be39da37e899092c6a764d548b9a4799e22..3469743af63a4d9480de3ed9160c43a650b71410 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -71,7 +71,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         : DatasetIterator<Dataset>(params) {}
 
     ~Iterator() override {
-      if (!finalized_) {
+      if (!finalized_ && initialized_) {
         std::vector<Tensor> ignored;
         Status s =
             instantiated_finalize_func_->RunInstantiated(state_, &ignored);
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd8026607e73ccf42d43ddb9a1d595778879478c
--- /dev/null
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace data {
+
+GraphRewriteDataset::~GraphRewriteDataset() {
+  input_->Unref();
+  if (optimized_input_) {
+    optimized_input_->Unref();
+  }
+}
+
+Status GraphRewriteDataset::Optimize(OpKernelContext* ctx) {
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* input_node = nullptr;
+  SerializationContext::Params params;
+  std::vector<std::pair<string, Tensor>> input_list;
+  params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  params.input_list = &input_list;
+  params.optimization_only = true;
+  SerializationContext serialization_ctx(params);
+  TF_RETURN_IF_ERROR(
+      db.AddInputDataset(&serialization_ctx, input_, &input_node));
+  string output_node = input_node->name();
+
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  VLOG(3) << "Before optimization: " << graph_def.DebugString();
+
+  TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
+  VLOG(3) << "After optimization: " << graph_def.DebugString();
+
+  // Instantiate the optimized input pipeline by running the optimized graph
+  // using the optimized function library.
+  TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
+
+  // Create a FunctionHandleCache.
+  function_handle_cache_ = absl::make_unique<FunctionHandleCache>(lib_);
+
+  // Some functions may have been modified without having their names
+  // changed (for example, nested dataset graphs from FlatMap or
+  // Interleave).
+  TF_RETURN_IF_ERROR(
+      AddToFunctionLibrary(flib_def_.get(), graph_def.library()));
+
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+  std::vector<Tensor> outputs;
+  GraphRunner graph_runner(ctx->function_library()->device());
+
+  TF_RETURN_IF_ERROR(
+      graph_runner.Run(&graph, lib_, input_list, {output_node}, &outputs));
+  TF_RETURN_IF_ERROR(
+      GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
+  optimized_input_->Ref();
+  return Status::OK();
+}
+
+Status GraphRewriteDataset::AsGraphDefInternal(SerializationContext* ctx,
+                                               DatasetGraphDefBuilder* b,
+                                               Node** output) const {
+  SerializationContext::Params params;
+  // The optimized input needs access to the newly optimized functions when
+  // it is serialized. Here, we use the optimized function library for
+  // serialization, which is the union of the function library from the
+  // OpKernelContext at dataset creation time and newly optimized functions.
+  // This includes all functions that optimized_input_ may use.
+  params.flib_def = flib_def_.get();
+  params.input_list = ctx->input_list();
+  params.optimization_only = ctx->optimization_only();
+  SerializationContext optimized_ctx(params);
+
+  // We only serialize the optimized dataset to avoid re-running
+  // optimizations when the input pipeline is restored from a checkpoint.
+  TF_RETURN_IF_ERROR(
+      b->AddInputDataset(&optimized_ctx, optimized_input_, output));
+  return Status::OK();
+}
+
+namespace {
+void AddFakeSinks(FunctionDef* function_def) {
+  int counter = 0;
+  for (const auto& output : function_def->signature().output_arg()) {
+    NodeDef* node = function_def->add_node_def();
+    tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+        strings::StrCat("FakeSink", counter++), function_def, node);
+    node->set_op("Identity");
+    node->add_input(function_def->ret().at(output.name()));
+    (*node->mutable_attr())["T"].set_type(output.type());
+
+    (*function_def->mutable_ret())[output.name()] =
+        strings::StrCat(node->name(), ":output:0");
+  }
+}
+
+void RemoveFakeSinks(FunctionDef* function_def) {
+  // Map from identity node names to their input tensor strings
+  std::map<string, string> identity_map;
+  for (const auto& node : function_def->node_def()) {
+    if (node.op() == "Identity" && node.input_size() == 1) {
+      identity_map[node.name()] = node.input(0);
+    }
+  }
+  for (const auto& output_arg : function_def->signature().output_arg()) {
+    const string& tensor = function_def->ret().at(output_arg.name());
+    const string& output_node = tensor.substr(0, tensor.find(':'));
+    if (identity_map.find(output_node) != identity_map.end()) {
+      (*function_def->mutable_ret())[output_arg.name()] =
+          identity_map.at(output_node);
+    }
+  }
+}
+}  // anonymous namespace
+
+Status GraphRewriteDataset::ApplyOptimizations(OpKernelContext* ctx,
+                                               GraphDef* graph_def,
+                                               string* output_node) {
+  // Add an identity node as the fetch node, otherwise we might get
+  // 'placeholder is both fed and fetched' errors in some cases when using
+  // input list with placeholder dataset nodes.
+  NodeDef* node = graph_def->mutable_node()->Add();
+  tensorflow::grappler::graph_utils::SetUniqueGraphNodeName("Sink", graph_def,
+                                                            node);
+  node->set_op("Identity");
+  node->add_input(*output_node);
+  (*node->mutable_attr())["T"].set_type(DT_VARIANT);
+  *output_node = node->name();
+
+  // Add fake sink node to graph and functions to allow rewriting the actual
+  // sink nodes.
+  // TODO(b/118820916): When MetaOptimizer adds provisions for function
+  // retvals to be optimizable, we will no longer need this.
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    AddFakeSinks(&function_def);
+  }
+
+  // Create metagraph.
+  MetaGraphDef meta_graph_def;
+  (*meta_graph_def.mutable_graph_def()) = *graph_def;
+
+  // Grappler determines fetch ops from collection 'train_op'.
+  CollectionDef collection_def;
+  auto node_list = collection_def.mutable_node_list();
+  node_list->add_value(*output_node);
+  (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
+
+  // Create Grappler item.
+  tensorflow::grappler::ItemConfig item_config;
+  item_config.apply_optimizations = true;
+  std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
+      tensorflow::grappler::GrapplerItemFromMetaGraphDef(
+          "graph", meta_graph_def, item_config);
+  grappler_item->optimization_options().optimize_function_library =
+      ShouldOptimizeFunctions();
+  std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+  tensorflow::grappler::VirtualCluster cluster(device_map);
+
+  // Run data optimizer using grappler's meta optimizer.
+  tensorflow::ConfigProto config;
+  *config.mutable_graph_options()->mutable_rewrite_options() =
+      CreateGrapplerRewriteConfig();
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      *grappler_item, config, ctx->device(), &cluster, graph_def));
+
+  // Remove fake sinks after optimizations are done.
+  // TODO(b/118820916): When MetaOptimizer adds provisions for function
+  // retvals to be optimizable, we will no longer need this.
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    RemoveFakeSinks(&function_def);
+  }
+
+  return Status::OK();
+}
+
+class GraphRewriteDataset::Iterator
+    : public DatasetIterator<GraphRewriteDataset> {
+ public:
+  explicit Iterator(const Params& params)
+      : DatasetIterator<GraphRewriteDataset>(params) {}
+
+  Status Initialize(IteratorContext* ctx) override {
+    IteratorContext::Params params(ctx);
+    params.lib = dataset()->lib_;
+    params.function_handle_cache = dataset()->function_handle_cache_.get();
+    return dataset()->optimized_input_->MakeIterator(
+        IteratorContext(std::move(params)), prefix(), &input_impl_);
+  }
+
+  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) override {
+    IteratorContext::Params params(ctx);
+    params.lib = dataset()->lib_;
+    params.function_handle_cache = dataset()->function_handle_cache_.get();
+    return input_impl_->GetNext(IteratorContext(std::move(params)), out_tensors,
+                                end_of_sequence);
+  }
+
+ protected:
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeKnownRatioNode(std::move(args),
+                                     /*ratio=*/1);
+  }
+
+  Status SaveInternal(IteratorStateWriter* writer) override {
+    TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+    return Status::OK();
+  }
+
+  Status RestoreInternal(IteratorContext* ctx,
+                         IteratorStateReader* reader) override {
+    TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+    return Status::OK();
+  }
+
+ private:
+  std::unique_ptr<IteratorBase> input_impl_;
+};
+
+std::unique_ptr<IteratorBase> GraphRewriteDataset::MakeIteratorInternal(
+    const string& prefix) const {
+  // We do not add a token for this dataset to the prefix. The
+  // prefix is used to identify checkpoint elements and since this
+  // dataset is excluded from the checkpoint, adding a token
+  // here would result in invalid checkpoint identifiers.
+  return absl::make_unique<Iterator>(Iterator::Params{this, prefix});
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.h b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..856fcd3ea727f1223a06783b78af0efc41935516
--- /dev/null
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
@@ -0,0 +1,95 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
+
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+
+namespace tensorflow {
+namespace data {
+
+class GraphRewriteDataset : public DatasetBase {
+ public:
+  GraphRewriteDataset(OpKernelContext* ctx, const DatasetBase* input,
+                      const DataTypeVector& output_types,
+                      const std::vector<PartialTensorShape>& output_shapes)
+      : DatasetBase(DatasetContext(ctx)),
+        optimized_input_(nullptr),
+        input_(input),
+        output_types_(output_types),
+        output_shapes_(output_shapes) {
+    input_->Ref();
+  }
+
+  ~GraphRewriteDataset() override;
+
+  // Runs Grappler to transform the input dataset into optimized_input_
+  // dataset.
+  Status Optimize(OpKernelContext* ctx);
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override;
+
+  const DataTypeVector& output_dtypes() const override { return output_types_; }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return output_shapes_;
+  }
+
+  int64 Cardinality() const override { return input_->Cardinality(); }
+
+ protected:
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override;
+
+ private:
+  class Iterator;
+
+  // Create a Grappler RewriteConfig proto that defines the list of
+  // optimizations to be run by the Grappler Meta Optimizer.
+  virtual RewriterConfig CreateGrapplerRewriteConfig() = 0;
+
+  // Option specifying whether we want to optimize the function library as well.
+  virtual bool ShouldOptimizeFunctions() { return true; }
+
+  Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
+                            string* output_node);
+
+  DatasetBase* optimized_input_;
+  FunctionLibraryRuntime* lib_ = nullptr;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
+  const DatasetBase* input_;
+  const DataTypeVector output_types_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index da497a5f72073d852d1bb84105d5837f33ef30ea..14fb6624ad7002729886eab670e4910d93a9335e 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/iterator_ops.h"
+#include <memory>
 
 #include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/optional_ops.h"
+#include "tensorflow/core/kernels/data/unbounded_thread_pool.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -51,14 +53,15 @@ const char kIteratorVariantTypeName[] = "tensorflow::Iterator";
 
 class IteratorResource : public ResourceBase {
  public:
-  IteratorResource(const DataTypeVector& output_dtypes,
+  IteratorResource(Env* env, const DataTypeVector& output_dtypes,
                    const std::vector<PartialTensorShape>& output_shapes,
                    const int /*unused: graph_def_version*/,
                    std::unique_ptr<DeviceMgr> device_mgr,
                    std::unique_ptr<FunctionLibraryDefinition> flib_def,
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib)
-      : device_mgr_(std::move(device_mgr)),
+      : unbounded_thread_pool_(env, "tf_data_iterator_resource"),
+        device_mgr_(std::move(device_mgr)),
         iterator_state_(std::make_shared<State>(
             std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */)),
         output_dtypes_(output_dtypes),
@@ -77,6 +80,7 @@ class IteratorResource : public ResourceBase {
       params.function_handle_cache =
           captured_state->function_handle_cache.get();
       params.resource_mgr = &captured_state->resource_mgr;
+      params.thread_factory = unbounded_thread_pool_.get_thread_factory();
       return captured_state->iterator->GetNext(
           IteratorContext(std::move(params)), out_tensors, end_of_sequence);
     } else {
@@ -99,7 +103,17 @@ class IteratorResource : public ResourceBase {
       captured_state = iterator_state_;
     }
     if (captured_state) {
-      return captured_state->iterator->Save(ctx, writer);
+      SerializationContext::Params params;
+      // The iterator state may contain functions that are not present
+      // in ctx's function library. Namely, an iterator may be restored from
+      // a serialized iterator with a modified function library (for example, as
+      // a result of OptimizeDataset). These modified functions are needed
+      // to serialize the iterator again.
+      params.flib_def = captured_state->flib_def.get();
+      params.input_list = ctx->input_list();
+      params.optimization_only = ctx->optimization_only();
+      SerializationContext ctx_with_functions(params);
+      return captured_state->iterator->Save(&ctx_with_functions, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -134,7 +148,14 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
-    TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
+
+    // Some function names may be duplicated (for example, if the serialized
+    // graph has an optimized function that retains its original name). We
+    // override functions in flib_def in the event of conflict. It is
+    // safe to assume that any node in the serialized graph is referring to the
+    // serialized function when there is a conflict.
+    TF_RETURN_IF_ERROR(
+        AddToFunctionLibrary(flib_def.get(), graph_def.library()));
     std::unique_ptr<State> new_state = absl::make_unique<State>(
         std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */);
 
@@ -146,6 +167,8 @@ class IteratorResource : public ResourceBase {
     params.lib = new_state->lib;
     params.function_handle_cache = new_state->function_handle_cache.get();
     params.resource_mgr = &new_state->resource_mgr;
+    params.thread_factory = unbounded_thread_pool_.get_thread_factory();
+
     TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
                                              "Iterator", &new_state->iterator));
     TF_RETURN_IF_ERROR(
@@ -162,6 +185,7 @@ class IteratorResource : public ResourceBase {
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
+      params.thread_factory = unbounded_thread_pool_.get_thread_factory();
       IteratorContext iter_ctx(std::move(params));
       TF_RETURN_IF_ERROR(new_state->iterator->Restore(&iter_ctx, reader));
     }
@@ -216,6 +240,7 @@ class IteratorResource : public ResourceBase {
     params.lib = new_state->lib;
     params.function_handle_cache = new_state->function_handle_cache.get();
     params.resource_mgr = &new_state->resource_mgr;
+    params.thread_factory = unbounded_thread_pool_.get_thread_factory();
     TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
                                              "Iterator", &iterator));
     TF_RETURN_IF_ERROR(
@@ -267,6 +292,7 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<IteratorBase> iterator;
   };
 
+  UnboundedThreadPool unbounded_thread_pool_;
   mutex mu_;
   const std::unique_ptr<DeviceMgr> device_mgr_ GUARDED_BY(mu_);
   std::shared_ptr<State> iterator_state_ GUARDED_BY(mu_);
@@ -276,110 +302,6 @@ class IteratorResource : public ResourceBase {
 
 namespace {
 
-constexpr char kDelimiter[] = "@@";
-
-// Helper class for reading data from a VariantTensorData object.
-class VariantTensorDataReader : public IteratorStateReader {
- public:
-  explicit VariantTensorDataReader(const VariantTensorData* data)
-      : data_(data) {
-    string metadata;
-    data_->get_metadata(&metadata);
-    auto keys = str_util::Split(metadata, kDelimiter, str_util::SkipEmpty());
-    for (size_t i = 0; i < keys.size(); ++i) {
-      map_[keys[i]] = i;
-    }
-  }
-
-  // Returns OK iff the initialization was successful, i.e.,
-  // pre-processing did not have errors.
-  Status status() const { return status_; }
-
-  Status ReadScalar(StringPiece key, int64* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadScalar(StringPiece key, string* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadTensor(StringPiece key, Tensor* val) override {
-    return ReadTensorInternal(key, val);
-  }
-
-  bool Contains(StringPiece key) override {
-    return map_.find(string(key)) != map_.end();
-  }
-
- private:
-  template <typename T>
-  Status ReadScalarInternal(StringPiece key, T* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]).scalar<T>()();
-    return Status::OK();
-  }
-
-  Status ReadTensorInternal(StringPiece key, Tensor* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]);
-    return Status::OK();
-  }
-
-  std::map<string, size_t> map_;
-  const VariantTensorData* data_;  // Not owned.
-  Status status_;
-};
-
-// Helper class for writing data to a VariantTensorData object.
-class VariantTensorDataWriter : public IteratorStateWriter {
- public:
-  // Does not take ownership of data.
-  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
-
-  Status WriteScalar(StringPiece key, const int64 val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteScalar(StringPiece key, const string& val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteTensor(StringPiece key, const Tensor& val) override {
-    return WriteTensorInternal(key, val);
-  }
-
-  Status Flush() {
-    string metadata;
-    for (size_t i = 0; i < keys_.size(); ++i) {
-      strings::StrAppend(&metadata, kDelimiter, keys_[i]);
-    }
-    data_->set_metadata(metadata);
-    return Status::OK();
-  }
-
- private:
-  template <typename T>
-  Status WriteScalarInternal(StringPiece key, const T& val) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    return WriteTensorInternal(key, val_t);
-  }
-
-  Status WriteTensorInternal(StringPiece key, const Tensor& val) {
-    DCHECK_EQ(key.find(kDelimiter), string::npos);
-    keys_.push_back(string(key));
-    *(data_->add_tensors()) = val;
-    return Status::OK();
-  }
-
-  VariantTensorData* data_;
-  std::vector<string> keys_;
-};
-
 // Wrapper for encoding/decoding the iterator state stored in a Variant tensor.
 // The get() method returns an IteratorStateReader which can be used
 // to restore iterator state.
@@ -436,21 +358,15 @@ class IteratorStateVariant {
     std::swap(*tensor_data, data);
     std::unique_ptr<VariantTensorDataReader> reader =
         absl::make_unique<VariantTensorDataReader>(tensor_data.get());
-    status_ = reader->status();
-    if (!status_.ok()) {
-      return false;
-    }
     data_ = std::move(tensor_data);
     reader_ = std::move(reader);
     return true;
   }
   IteratorStateReader* get() { return reader_.get(); }
-  Status status() const { return status_; }
   string DebugString() const {
     if (data_) {
-      return strings::StrCat("IteratorStateVariant<",
-                             "data: ", data_->DebugString(),
-                             " status: ", status_.ToString(), ">");
+      return strings::StrCat("IteratorStateVariant<", data_->DebugString(),
+                             ">");
     } else {
       return strings::StrCat("IteratorStateVariant<empty>");
     }
@@ -458,7 +374,6 @@ class IteratorStateVariant {
 
  private:
   std::unique_ptr<IteratorStateReader> reader_;
-  Status status_;
   std::unique_ptr<VariantTensorData> data_;
 };
 
@@ -526,14 +441,14 @@ void IteratorHandleOp::Compute(OpKernelContext* context) LOCKS_EXCLUDED(mu_) {
           context,
           mgr->LookupOrCreate<IteratorResource>(
               cinfo_.container(), cinfo_.name(), &resource,
-              [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
-                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                    *ret = new IteratorResource(
-                        output_dtypes_, output_shapes_, graph_def_version_,
-                        std::move(device_mgr), std::move(flib_def),
-                        std::move(pflr), lib);
-                    return Status::OK();
-                  }));
+              [context, lib, &device_mgr, &flib_def, &pflr,
+               this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                *ret = new IteratorResource(
+                    context->env(), output_dtypes_, output_shapes_,
+                    graph_def_version_, std::move(device_mgr),
+                    std::move(flib_def), std::move(pflr), lib);
+                return Status::OK();
+              }));
 
       Status s = VerifyResource(resource);
       if (TF_PREDICT_FALSE(!s.ok())) {
@@ -616,7 +531,7 @@ void AnonymousIteratorHandleOp::Compute(OpKernelContext* context) {
       existing_resource->Unref();
     }
     IteratorResource* new_resource = new IteratorResource(
-        output_dtypes_, output_shapes_, graph_def_version_,
+        context->env(), output_dtypes_, output_shapes_, graph_def_version_,
         std::move(device_mgr), std::move(flib_def), std::move(pflr), lib);
     // Create the resource with our chosen name under the resource lookup
     // mutex to avoid another kernel racily creating a resource with this
@@ -678,7 +593,7 @@ class ToSingleElementOp : public AsyncOpKernel {
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
       // avoid destruction races.
       IteratorBase* raw_iterator = iterator.release();
-      auto cleanup = gtl::MakeCleanup([ctx, raw_iterator, done] {
+      auto cleanup = gtl::MakeCleanup([raw_iterator, done] {
         delete raw_iterator;
         done();
       });
@@ -931,11 +846,12 @@ class OneShotIteratorOp : public AsyncOpKernel {
     TF_RETURN_IF_ERROR(
         ctx->resource_manager()->LookupOrCreate<IteratorResource>(
             cinfo->container(), cinfo->name(), iterator,
-            [lib, this, &flib_def, &pflr](IteratorResource** ret)
+            [ctx, lib, this, &flib_def, &pflr](IteratorResource** ret)
                 EXCLUSIVE_LOCKS_REQUIRED(mu_) {
                   *ret = new IteratorResource(
-                      output_dtypes_, output_shapes_, graph_def_version_,
-                      nullptr, std::move(flib_def), std::move(pflr), lib);
+                      ctx->env(), output_dtypes_, output_shapes_,
+                      graph_def_version_, nullptr, std::move(flib_def),
+                      std::move(pflr), lib);
                   return Status::OK();
                 }));
 
@@ -1078,78 +994,58 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   }
 }
 
-namespace {
+void IteratorGetNextAsOptionalOp::ComputeAsync(OpKernelContext* ctx,
+                                               DoneCallback done) {
+  IteratorResource* iterator;
+  OP_REQUIRES_OK_ASYNC(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+  // The call to `iterator->GetNext()` may block and depend on an
+  // inter-op thread pool thread, so we issue the call from the
+  // owned thread pool.
+  background_worker_.Schedule(std::bind(
+      [this, ctx, iterator](DoneCallback done) {
+        std::vector<Tensor> components;
+        bool end_of_sequence = false;
 
-class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
- public:
-  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           "tf_data_iterator_get_next_as_optional") {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
+        // NOTE(mrry): We must unref the iterator before calling `done()`, to
+        // avoid destruction races.
+        iterator->Unref();
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    IteratorResource* iterator;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    // The call to `iterator->GetNext()` may block and depend on an
-    // inter-op thread pool thread, so we issue the call from the
-    // owned thread pool.
-    background_worker_.Schedule(std::bind(
-        [this, ctx, iterator](DoneCallback done) {
-          std::vector<Tensor> components;
-          bool end_of_sequence = false;
-
-          Status s = iterator->GetNext(IteratorContext(ctx), &components,
-                                       &end_of_sequence);
-          // NOTE(mrry): We must unref the iterator before calling `done()`, to
-          // avoid destruction races.
-          iterator->Unref();
-
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-          } else if (end_of_sequence) {
-            OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
-          } else {
-            for (int i = 0; i < components.size(); ++i) {
-              OP_REQUIRES_ASYNC(
-                  ctx, components[i].dtype() == output_types_[i],
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected type for "
-                      "component ",
-                      i, ". Expected: ", DataTypeString(output_types_[i]),
-                      ". Actual: ", DataTypeString(components[i].dtype()), "."),
-                  done);
-              OP_REQUIRES_ASYNC(
-                  ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."),
-                  done);
-            }
-
-            OP_REQUIRES_OK_ASYNC(
-                ctx,
-                WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+        if (!s.ok()) {
+          ctx->SetStatus(s);
+        } else if (end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
+        } else {
+          for (int i = 0; i < components.size(); ++i) {
+            OP_REQUIRES_ASYNC(
+                ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."),
+                done);
+            OP_REQUIRES_ASYNC(
+                ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."),
                 done);
           }
-          done();
-        },
-        std::move(done)));
-  }
-
- private:
-  BackgroundWorker background_worker_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
 
-}  // namespace
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+              done);
+        }
+        done();
+      },
+      std::move(done)));
+}
 
 void IteratorToStringHandleOp::Compute(OpKernelContext* ctx) {
   const Tensor& resource_handle_t = ctx->input(0);
@@ -1263,12 +1159,10 @@ class DeserializeIteratorOp : public OpKernel {
     OP_REQUIRES(ctx, wrapper != nullptr,
                 errors::InvalidArgument(
                     "DeserializeIteratorOp: Unable to parse variant tensor."));
-    OP_REQUIRES_OK(ctx, wrapper->status());
     OP_REQUIRES_OK(ctx, iterator_resource->Restore(ctx, wrapper->get()));
   }
 };
 
-
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU).Priority(2),
                         IteratorHandleOp);
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
index cd72269859044e6efd97a10ad43bc00c90df7d7d..7d769d365e9aa8d6952a9a8cdb461bc63957d031 100644
--- a/tensorflow/core/kernels/data/iterator_ops.h
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
@@ -115,6 +117,24 @@ class IteratorGetNextOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
+class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
+ public:
+  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(ctx->env(),
+                           "tf_data_iterator_get_next_as_optional") {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  BackgroundWorker background_worker_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 class IteratorGetNextSyncOp : public OpKernel {
  public:
   explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 95f4c1c89150b81a91f191e4e53c2b81c30841c4..76383e3c3c431f162da5baefd971a43166fb19bc 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -41,6 +41,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                      &use_inter_op_parallelism_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -50,12 +51,9 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
     MapIteratorFunction map_func;
     CapturedFunction* raw_captured_func = captured_func.get();
-    if (indices.empty()) {
+    if (short_circuit_indices_.empty()) {
       map_func = [](IteratorContext* ctx,
                     InstantiatedCapturedFunction* inst_captured_func,
                     std::vector<Tensor> args,
@@ -63,7 +61,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         return inst_captured_func->Run(ctx, std::move(args), out_tensors);
       };
     } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
+      std::vector<bool> can_move = ComputeMoveVector(short_circuit_indices_);
+      const auto& indices = short_circuit_indices_;
       map_func = [raw_captured_func, indices, can_move](
                      IteratorContext* ctx,
                      InstantiatedCapturedFunction* inst_captured_func,
@@ -138,7 +137,6 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
@@ -277,6 +275,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList func_;
   bool use_inter_op_parallelism_;
   bool preserve_cardinality_;
+  std::vector<int> short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
diff --git a/tensorflow/core/kernels/data/map_dataset_op_test.cc b/tensorflow/core/kernels/data/map_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0d17ab2865c64e4ace7c8414a8ca50dcf21e82c
--- /dev/null
+++ b/tensorflow/core/kernels/data/map_dataset_op_test.cc
@@ -0,0 +1,535 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "map_dataset";
+constexpr char kOpName[] = "MapDataset";
+
+class MapDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new MapDataset op kernel. The `input_dataset` parameter should be
+  // same with the node name of the input dataset for the method
+  // `CreateMapDatasetContext()`. `T` specifies the output dtype of MapDataset.
+  template <typename T>
+  Status CreateMapDatasetOpKernel(const string& input_dataset,
+                                  const string& func_name,
+                                  std::unique_ptr<OpKernel>* map_kernel) {
+    FunctionDefHelper::AttrValueWrapper func =
+        FunctionDefHelper::FunctionRef(func_name, {{"T", DT_INT64}});
+
+    map_node_def_ = test::function::NDef(
+        kNodeName, kOpName, {input_dataset},
+        {{"f", func},
+         {"Targuments", {}},
+         {"output_shapes", gtl::ArraySlice<TensorShape>{{}}},
+         {"output_types",
+          gtl::ArraySlice<DataType>{tensorflow::DataTypeToEnum<T>::value}},
+         {"use_inter_op_parallelism", true},
+         {"preserve_cardinality", false}});
+    TF_CHECK_OK(CreateOpKernel(map_node_def_, map_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new MapDataset op kernel context.
+  Status CreateMapDatasetContext(
+      DatasetBase* const input_dataset, OpKernel* const map_kernel,
+      std::unique_ptr<OpKernelContext>* map_context) {
+    map_inputs_.clear();
+    // Save the input dataset into a variant tensor as the input of MapDataset.
+    Tensor dataset_tensor(DT_VARIANT, TensorShape({}));
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(input_dataset, &dataset_tensor));
+    Variant variant = dataset_tensor.scalar<Variant>()();
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<Variant>(
+        &map_inputs_, map_kernel->input_types(), TensorShape({}), {variant}));
+    input_dataset->Ref();
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(map_kernel, &map_inputs_, map_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*map_kernel, map_inputs_));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef map_node_def_;
+  gtl::InlinedVector<TensorValue, 4> map_inputs_;
+};
+
+struct GetNextTestParams {
+  explicit GetNextTestParams(int64 input_start, int64 input_end,
+                             int64 input_step, string input_func_name,
+                             std::vector<int64> input_expected_values,
+                             std::vector<FunctionDef> input_func_lib)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        func_name(std::move(input_func_name)),
+        expected_values(std::move(input_expected_values)),
+        func_lib(std::move(input_func_lib)) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  string func_name;
+  std::vector<int64> expected_values;
+  std::vector<FunctionDef> func_lib;
+};
+
+struct DatasetGetNextTest : MapDatasetOpTest,
+                            ::testing::WithParamInterface<GetNextTestParams> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  GetNextTestParams test_params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_params.func_lib, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), test_params.func_name, &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+
+  EXPECT_EQ(out_tensors.size(), test_params.expected_values.size());
+  for (size_t i = 0; i < out_tensors.size(); ++i) {
+    int64 actual_value = out_tensors[i].flat<int64>()(0);
+    int64 expect_value = test_params.expected_values[i];
+    EXPECT_EQ(actual_value, expect_value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MapDatasetOpTest, DatasetGetNextTest,
+    ::testing::Values(
+        GetNextTestParams(
+            0, 10, 3, "XTimesTwo", std::vector<int64>{0, 6, 12, 18},
+            std::vector<FunctionDef>{test::function::XTimesTwo()}),
+        GetNextTestParams(0, 10, 3, "XAddX", std::vector<int64>{0, 6, 12, 18},
+                          std::vector<FunctionDef>{test::function::XAddX()}),
+        GetNextTestParams(
+            10, 0, -3, "XTimesFour", std::vector<int64>{40, 28, 16, 4},
+            std::vector<FunctionDef>{test::function::XTimesTwo(),
+                                     test::function::XTimesFour()})));
+
+TEST_F(MapDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  EXPECT_EQ(map_dataset->type_string(), kOpName);
+}
+
+TEST_F(MapDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(map_dataset->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(MapDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(map_dataset->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(
+        map_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+struct CardinalityTestParams {
+  explicit CardinalityTestParams(int64 input_start, int64 input_end,
+                                 int64 input_step,
+                                 int input_expected_cardinality)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        expected_cardinality(input_expected_cardinality) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int expected_cardinality;
+};
+
+struct DatasetCardinalityTest
+    : MapDatasetOpTest,
+      ::testing::WithParamInterface<CardinalityTestParams> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  CardinalityTestParams test_params = GetParam();
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  EXPECT_EQ(map_dataset->Cardinality(), test_params.expected_cardinality);
+}
+
+INSTANTIATE_TEST_CASE_P(MapDatasetOpTest, DatasetCardinalityTest,
+                        ::testing::Values(CardinalityTestParams(0, 10, 1, 10),
+                                          CardinalityTestParams(0, 10, 3, 4),
+                                          CardinalityTestParams(10, 0, -3, 4)));
+
+TEST_F(MapDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(map_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(iterator->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputPrefix) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::Map");
+}
+
+struct RoundtripTestParams {
+  explicit RoundtripTestParams(int64 input_start, int64 input_end,
+                               int64 input_step, int input_breakpoint,
+                               int64 input_expected_value,
+                               string input_func_name,
+                               std::vector<FunctionDef> input_func_lib)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        breakpoint(input_breakpoint),
+        expected_value(input_expected_value),
+        func_name(std::move(input_func_name)),
+        func_lib(std::move(input_func_lib)) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int breakpoint;
+  int64 expected_value;
+  string func_name;
+  std::vector<FunctionDef> func_lib;
+};
+
+struct IteratorRoundtripTest
+    : MapDatasetOpTest,
+      ::testing::WithParamInterface<RoundtripTestParams> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  RoundtripTestParams test_params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_params.func_lib, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scoped_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->node_name(), test_params.func_name, &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scoped_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  std::vector<Tensor> out_tensors;
+  bool end_of_sequence = false;
+  for (int i = 0; i < test_params.breakpoint; i++) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+  VariantTensorDataReader reader(&data);
+  TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                 &end_of_sequence));
+  EXPECT_EQ(out_tensors.back().flat<int64>()(0), test_params.expected_value);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MapDatasetOpTest, IteratorRoundtripTest,
+    ::testing::Values(RoundtripTestParams(0, 10, 2, 0, 0, "XTimesTwo",
+                                          std::vector<FunctionDef>{
+                                              test::function::XTimesTwo()}),
+                      RoundtripTestParams(0, 10, 2, 4, 16, "XAddX",
+                                          std::vector<FunctionDef>{
+                                              test::function::XAddX()}),
+                      RoundtripTestParams(0, 10, 2, 6, 32, "XTimesFour",
+                                          std::vector<FunctionDef>{
+                                              test::function::XTimesTwo(),
+                                              test::function::XTimesFour()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 8122048702a6c572486ab8ac36a323f822ab9a0f..1577e770d3126fa2b1024adf3e30609921ea9eea 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -28,16 +29,18 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
-                   bool always_collect_stats) {
-  opts->step_id = ctx->step_id();
-  opts->rendezvous = ctx->rendezvous();
-  if (always_collect_stats) {
-    opts->stats_collector = ctx->stats_collector();
-  }
-  opts->runner = ctx->runner();
-}
-
+// This op runs a given defun on slices of the input arguments. The function
+// given by "f" is assumed to be stateless, and is executed concurrently
+// on all the slices; up to batch_size (i.e. the 0th dimension of each argument)
+// functions will be scheduled at once.
+//
+// The "max_intra_op_parallelism" attr, which defaults to 1, can be used to
+// limit the intra op parallelism. To limit inter-op parallelism, a user
+// can set a private threadpool on the dataset using `tf.data.Options`'s
+// `ThreadingOptions`.
+//
+// Note that this op is not exposed to users directly, but is invoked in
+// tf.data rewrites.
 class MapDefunOp : public AsyncOpKernel {
  public:
   explicit MapDefunOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
@@ -50,6 +53,8 @@ class MapDefunOp : public AsyncOpKernel {
                    func_lib->Instantiate(func->name(), AttrSlice(&func->attr()),
                                          &func_handle_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_intra_op_parallelism",
+                                     &max_intra_op_parallelism_));
 
     OP_REQUIRES(ctx, ctx->num_inputs() >= 0,
                 errors::InvalidArgument("Must have at least one input."));
@@ -72,7 +77,7 @@ class MapDefunOp : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(ctx, s, done);
 
     FunctionLibraryRuntime::Options opts;
-    SetRunOptions(ctx, &opts, false);
+    SetRunOptions(ctx, &opts, compute_opts, /*always_collect_stats=*/false);
 
     // Run loop
     StatusCallback callback = std::bind(
@@ -124,9 +129,6 @@ class MapDefunOp : public AsyncOpKernel {
   }
 
  private:
-  FunctionLibraryRuntime::Handle func_handle_;
-  std::vector<PartialTensorShape> output_shapes_;
-
   struct ComputeOptions {
     // These vary per MapDefunOp::ComputeAsync call, but must persist until
     // all calls to the function are complete. This struct also encapsulates
@@ -136,6 +138,7 @@ class MapDefunOp : public AsyncOpKernel {
     const std::vector<TensorShape> arg_shapes;
     OpInputList captured_inputs;
     const int64 batch_size;
+    std::function<void(std::function<void()>)> runner;
 
     // Output of a compute call
     std::vector<PartialTensorShape> output_shapes GUARDED_BY(mu);
@@ -144,67 +147,21 @@ class MapDefunOp : public AsyncOpKernel {
 
     // Create a copy of output_shapes because every `Compute` may expect a
     // different output shape.
-    ComputeOptions(OpInputList args, OpInputList captured_inputs,
+    ComputeOptions(OpKernelContext* ctx, OpInputList args,
+                   OpInputList captured_inputs,
                    std::vector<TensorShape> arg_shapes, int64 batch_size,
-                   const std::vector<PartialTensorShape>& output_shapes_attr)
+                   const std::vector<PartialTensorShape>& output_shapes_attr,
+                   int max_parallelism)
         : args(args),
           arg_shapes(std::move(arg_shapes)),
           captured_inputs(captured_inputs),
           batch_size(batch_size),
-          output_shapes(output_shapes_attr) {}
-  };
-
-  // Get inputs to Compute and check that they are valid.
-  Status SetupArgs(OpKernelContext* ctx, ComputeOptions** compute_opts) {
-    OpInputList arguments;
-    TF_RETURN_IF_ERROR(ctx->input_list("arguments", &arguments));
-    OpInputList captured_inputs;
-    TF_RETURN_IF_ERROR(ctx->input_list("captured_inputs", &captured_inputs));
-
-    int64 batch_size = arguments[0].dims() > 0 ? arguments[0].dim_size(0) : -1;
-
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      if (arguments[i].dims() == 0) {
-        return errors::InvalidArgument(
-            "All inputs must have rank at least 1. Input ", i,
-            " has a rank of 0.");
-      } else if (arguments[i].dim_size(0) != batch_size) {
-        return errors::InvalidArgument(
-            "All inputs must have the same dimension 0. Input ", i,
-            " has leading dimension ", ctx->input(i).dim_size(0),
-            ", while all previous inputs have leading dimension ", batch_size);
+          output_shapes(output_shapes_attr) {
+      if (max_parallelism >= 1) {
+        runner = RunnerWithMaxParallelism(*ctx->runner(), max_parallelism);
       }
     }
-
-    std::vector<TensorShape> arg_shapes;
-    arg_shapes.reserve(arguments.size());
-
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      arg_shapes.push_back(arguments[i].shape());
-      arg_shapes.at(i).RemoveDim(0);
-    }
-
-    *compute_opts =
-        new ComputeOptions(arguments, captured_inputs, std::move(arg_shapes),
-                           batch_size, output_shapes_);
-    return Status::OK();
-  }
-
-  Status SetupOutputs(OpKernelContext* ctx, ComputeOptions* opts) {
-    mutex_lock l(opts->mu);
-    TF_RETURN_IF_ERROR(ctx->output_list("output", &opts->output));
-
-    for (size_t i = 0; i < output_types().size(); ++i) {
-      if (output_shapes_.at(i).IsFullyDefined()) {
-        Tensor* out = nullptr;
-        TensorShape output_shape;
-        output_shapes_.at(i).AsTensorShape(&output_shape);
-        output_shape.InsertDim(0, opts->batch_size);
-        TF_RETURN_IF_ERROR(opts->output.allocate(i, output_shape, &out));
-      }
-    }
-    return Status::OK();
-  }
+  };
 
   class MapFunctionCallFrame : public CallFrameInterface {
    public:
@@ -258,6 +215,7 @@ class MapDefunOp : public AsyncOpKernel {
             "output: ",
             index);
       }
+      Tensor* out;
       {  // Locking scope
         mutex_lock l(compute_opts_->mu);
         if (!compute_opts_->output_shapes.at(index).IsCompatibleWith(
@@ -272,23 +230,96 @@ class MapDefunOp : public AsyncOpKernel {
           // this index. Store the shape and allocate the output accordingly.
           compute_opts_->output_shapes.at(index) = val.shape();
 
-          Tensor* out = nullptr;
           TensorShape actual_shape = val.shape();
           actual_shape.InsertDim(0, compute_opts_->batch_size);
           TF_RETURN_IF_ERROR(
               compute_opts_->output.allocate(index, actual_shape, &out));
+        } else {
+          out = (compute_opts_->output)[index];
         }
-        return batch_util::CopyElementToSlice(
-            val, (compute_opts_->output)[index], iter_);
       }
+      return batch_util::CopyElementToSlice(val, out, iter_);
     }
 
    private:
     ComputeOptions* const compute_opts_;  // Not owned
     const OpKernel* kernel_;
     const size_t iter_;
-  };
-};
+  };  // MapFunctionCallFrame
+
+  void SetRunOptions(OpKernelContext* ctx,
+                     FunctionLibraryRuntime::Options* opts,
+                     ComputeOptions* compute_opts, bool always_collect_stats) {
+    opts->step_id = ctx->step_id();
+    opts->rendezvous = ctx->rendezvous();
+    if (always_collect_stats) {
+      opts->stats_collector = ctx->stats_collector();
+    }
+    if (max_intra_op_parallelism_ >= 1) {
+      opts->runner = &compute_opts->runner;
+    } else {
+      opts->runner = ctx->runner();
+    }
+  }
+
+  // Get inputs to Compute and check that they are valid.
+  Status SetupArgs(OpKernelContext* ctx, ComputeOptions** compute_opts) {
+    OpInputList arguments;
+    TF_RETURN_IF_ERROR(ctx->input_list("arguments", &arguments));
+    OpInputList captured_inputs;
+    TF_RETURN_IF_ERROR(ctx->input_list("captured_inputs", &captured_inputs));
+
+    int64 batch_size = arguments[0].dims() > 0 ? arguments[0].dim_size(0) : -1;
+
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      if (arguments[i].dims() == 0) {
+        return errors::InvalidArgument(
+            "All inputs must have rank at least 1. Input ", i,
+            " has a rank of 0.");
+      } else if (arguments[i].dim_size(0) != batch_size) {
+        return errors::InvalidArgument(
+            "All inputs must have the same dimension 0. Input ", i,
+            " has leading dimension ", ctx->input(i).dim_size(0),
+            ", while all previous inputs have leading dimension ", batch_size);
+      }
+    }
+
+    std::vector<TensorShape> arg_shapes;
+    arg_shapes.reserve(arguments.size());
+
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      arg_shapes.push_back(arguments[i].shape());
+      arg_shapes.at(i).RemoveDim(0);
+    }
+
+    *compute_opts = new ComputeOptions(
+        ctx, arguments, captured_inputs, std::move(arg_shapes), batch_size,
+        output_shapes_, max_intra_op_parallelism_);
+    return Status::OK();
+  }
+
+  Status SetupOutputs(OpKernelContext* ctx, ComputeOptions* opts) {
+    mutex_lock l(opts->mu);
+    TF_RETURN_IF_ERROR(ctx->output_list("output", &opts->output));
+
+    for (size_t i = 0; i < output_types().size(); ++i) {
+      if (output_shapes_.at(i).IsFullyDefined()) {
+        Tensor* out = nullptr;
+        TensorShape output_shape;
+        output_shapes_.at(i).AsTensorShape(&output_shape);
+        output_shape.InsertDim(0, opts->batch_size);
+        TF_RETURN_IF_ERROR(opts->output.allocate(i, output_shape, &out));
+      }
+    }
+    return Status::OK();
+  }
+
+  FunctionLibraryRuntime::Handle func_handle_;
+  std::vector<PartialTensorShape> output_shapes_;
+  // If this value is positive, limit the max intra op parallelism when the
+  // function is run on slices of the input.
+  int max_intra_op_parallelism_;
+};  // MapDefunOp
 
 REGISTER_KERNEL_BUILDER(Name("MapDefun").Device(DEVICE_CPU), MapDefunOp);
 }  // namespace
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 20254234e9da492d5b5faad502e092e15d993a91..7c6af83cc7a28afbf46c3d920a8c36f2897ba568 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -31,18 +31,28 @@ constexpr int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
 class ModelDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ModelDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("cpu_budget", &cpu_budget_));
+    if (cpu_budget_ == 0) {
+      cpu_budget_ = port::NumSchedulableCPUs();
+    }
+    OP_REQUIRES(ctx, cpu_budget_ > 0,
+                errors::InvalidArgument("CPU budget must be positive but is ",
+                                        cpu_budget_, "."));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    *output = new Dataset(ctx, input);
+    *output = new Dataset(ctx, input, cpu_budget_);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)), input_(input) {
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 cpu_budget)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          cpu_budget_(cpu_budget) {
       input_->Ref();
     }
 
@@ -140,9 +150,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx =
               std::make_shared<IteratorContext>(*ctx);
-          optimize_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_model",
-              [this, new_ctx]() { OptimizeThread(new_ctx); }));
+          optimize_thread_ = ctx->StartThread(
+              "tf_data_model", [this, new_ctx]() { OptimizeThread(new_ctx); });
         }
         return Status::OK();
       }
@@ -163,7 +172,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
             }
             if (cancelled_) return;
           }
-          model_->Optimize(port::NumSchedulableCPUs());
+          model_->Optimize(dataset()->cpu_budget_);
           // Exponentially increase the period of running the optimization
           // until a threshold is reached.
           if (optimization_period_ms < kOptimizationPeriodThresholdMs) {
@@ -187,7 +196,10 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* input_;
+    const int64 cpu_budget_;
   };
+
+  int64 cpu_budget_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ModelDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index 167276032b4d7e55f9e777b813fa6a0f4e5becbc..6a600a72dfa8271c0ade8b926c7af169fb310de3 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/unbounded_thread_pool.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -42,14 +43,15 @@ using MultiDeviceIteratorCallback =
 class MultiDeviceIterator : public ResourceBase {
  public:
   MultiDeviceIterator(
-      const DataTypeVector& output_types,
+      Env* env, const DataTypeVector& output_types,
       const std::vector<PartialTensorShape>& output_shapes,
       const std::vector<string>& devices,
       std::unique_ptr<FunctionLibraryDefinition> flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
       FunctionLibraryRuntime* lib,
       std::unique_ptr<FunctionHandleCache> function_handle_cache)
-      : output_types_(output_types),
+      : unbounded_thread_pool_(env, "tf_data_multi_device_iterator_resource"),
+        output_types_(output_types),
         output_shapes_(output_shapes),
         devices_(devices),
         flib_def_(std::move(flib_def)),
@@ -82,27 +84,25 @@ class MultiDeviceIterator : public ResourceBase {
     *incarnation_id = incarnation_id_;
 
     multi_device_buffer_ = absl::make_unique<MultiDeviceBuffer>(
-        devices_.size(), max_buffer_size, incarnation_id_, std::move(iterator));
+        devices_.size(), max_buffer_size, incarnation_id_, std::move(iterator),
+        this);
     return Status::OK();
   }
 
-  void GetNextFromShard(IteratorContext* ctx, int shard_num,
+  void GetNextFromShard(OpKernelContext* ctx, int shard_num,
                         int64 incarnation_id,
                         MultiDeviceIteratorCallback callback) {
-    if (ctx->lib() == lib_) {
-      tf_shared_lock l(mu_);
-      multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
-                                             std::move(callback));
-    } else {
-      IteratorContext::Params params(ctx);
-      params.lib = lib_;
-      params.function_handle_cache = function_handle_cache_.get();
-      params.resource_mgr = &resource_mgr_;
-      IteratorContext iter_ctx(std::move(params));
-      tf_shared_lock l(mu_);
-      multi_device_buffer_->GetNextFromShard(
-          &iter_ctx, shard_num, incarnation_id, std::move(callback));
-    }
+    tf_shared_lock l(mu_);
+    IteratorContext::Params params(ctx);
+    params.function_library = lib_def_;
+    params.lib = lib_;
+    params.function_handle_cache = function_handle_cache_.get();
+    params.resource_mgr = &resource_mgr_;
+    params.thread_factory = unbounded_thread_pool_.get_thread_factory();
+
+    IteratorContext iter_ctx(std::move(params));
+    multi_device_buffer_->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
+                                           std::move(callback));
   }
 
   const DataTypeVector& output_types() const { return output_types_; }
@@ -133,12 +133,14 @@ class MultiDeviceIterator : public ResourceBase {
   class MultiDeviceBuffer {
    public:
     MultiDeviceBuffer(size_t size, int64 max_buffer_size, int64 incarnation_id,
-                      std::unique_ptr<IteratorBase> host_iterator)
+                      std::unique_ptr<IteratorBase> host_iterator,
+                      MultiDeviceIterator* parent)
         : buffer_(size),
           size_(size),
           max_buffer_size_(max_buffer_size),
           incarnation_id_(incarnation_id),
-          host_iterator_(std::move(host_iterator)) {}
+          host_iterator_(std::move(host_iterator)),
+          parent_(parent) {}
 
     ~MultiDeviceBuffer() {
       {
@@ -217,10 +219,12 @@ class MultiDeviceIterator : public ResourceBase {
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!background_thread_) {
         auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-        background_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
-            {}, "tf_data_multi_device_iterator",
-            std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
-                      this, std::move(ctx_copy))));
+        background_thread_ =
+            parent_->unbounded_thread_pool_.get_thread_factory()->StartThread(
+                "tf_data_multi_device_iterator",
+                std::bind(
+                    &MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
+                    this, std::move(ctx_copy)));
       }
     }
 
@@ -342,8 +346,10 @@ class MultiDeviceIterator : public ResourceBase {
     const int64 max_buffer_size_;
     const int64 incarnation_id_;
     const std::unique_ptr<IteratorBase> host_iterator_;
+    MultiDeviceIterator* const parent_;  // Not owned.
   };
 
+  UnboundedThreadPool unbounded_thread_pool_;
   mutex mu_;
   const DataTypeVector output_types_;
   const std::vector<PartialTensorShape> output_shapes_;
@@ -359,6 +365,9 @@ class MultiDeviceIterator : public ResourceBase {
   std::unique_ptr<MultiDeviceBuffer> multi_device_buffer_ GUARDED_BY(mu_);
 };
 
+// Used to generate unique names for anonymous multi device iterators.
+static std::atomic<int64> current_id_;
+
 // Just creates a MultiDeviceIterator and returns it.
 class MultiDeviceIteratorHandleOp : public OpKernel {
  public:
@@ -388,6 +397,8 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    string unique_name = cinfo_.name();
+    string container_name = cinfo_.container();
     {
       mutex_lock l(mu_);
       if (resource_ == nullptr) {
@@ -402,31 +413,47 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
         OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
 
         MultiDeviceIterator* resource;
-        OP_REQUIRES_OK(context,
-                       mgr->LookupOrCreate<MultiDeviceIterator>(
-                           cinfo_.container(), cinfo_.name(), &resource,
-                           [this, lib, &flib_def, &pflr,
-                            &function_handle_cache](MultiDeviceIterator** ret)
-                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                                 *ret = new MultiDeviceIterator(
-                                     output_types_, output_shapes_, devices_,
-                                     std::move(flib_def), std::move(pflr), lib,
-                                     std::move(function_handle_cache));
-                                 return Status::OK();
-                               }));
-
-        Status s = VerifyResource(resource);
-        if (TF_PREDICT_FALSE(!s.ok())) {
-          resource->Unref();
-          context->SetStatus(s);
-          return;
-        }
 
-        resource_ = resource;
+        if (name_ == ResourceHandle::ANONYMOUS_NAME) {
+          unique_name = strings::StrCat("_AnonymousMultiDeviceIterator",
+                                        current_id_.fetch_add(1));
+          container_name = "AnonymousMultiDeviceIterator";
+          resource = new MultiDeviceIterator(
+              context->env(), output_types_, output_shapes_, devices_,
+              std::move(flib_def), std::move(pflr), lib,
+              std::move(function_handle_cache));
+          // NOTE: `mgr->Create()` transfers the one reference on `resource` to
+          // `mgr`.
+          OP_REQUIRES_OK(context, mgr->Create<MultiDeviceIterator>(
+                                      container_name, unique_name, resource));
+        } else {
+          unique_name = cinfo_.name();
+          container_name = cinfo_.container();
+          OP_REQUIRES_OK(context,
+                         mgr->LookupOrCreate<MultiDeviceIterator>(
+                             container_name, unique_name, &resource,
+                             [this, context, lib, &flib_def, &pflr,
+                              &function_handle_cache](MultiDeviceIterator** ret)
+                                 EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                   *ret = new MultiDeviceIterator(
+                                       context->env(), output_types_,
+                                       output_shapes_, devices_,
+                                       std::move(flib_def), std::move(pflr),
+                                       lib, std::move(function_handle_cache));
+                                   return Status::OK();
+                                 }));
+          Status s = VerifyResource(resource);
+          if (TF_PREDICT_FALSE(!s.ok())) {
+            resource->Unref();
+            context->SetStatus(s);
+            return;
+          }
+          resource_ = resource;
+        }
       }
     }
     OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, cinfo_.container(), cinfo_.name(),
+                                context, 0, container_name, unique_name,
                                 MakeTypeIndex<MultiDeviceIterator>()));
   }
 
@@ -502,9 +529,7 @@ REGISTER_KERNEL_BUILDER(Name("MultiDeviceIteratorInit").Device(DEVICE_CPU),
 class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
  public:
   explicit MultiDeviceIteratorGetNextFromShardOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           "tf_data_multi_device_iterator_get_next") {}
+      : AsyncOpKernel(ctx) {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     const Tensor* tensor_shard_num;
@@ -519,37 +544,27 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
     MultiDeviceIterator* iterator;
     OP_REQUIRES_OK_ASYNC(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    background_worker_.Schedule(std::bind(
-        [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
-          MultiDeviceIteratorCallback callback = std::bind(
-              [ctx](const HostBufferElement& elem, DoneCallback done) {
-                // iterator->Unref();
-                Status s = elem.status;
-                if (!s.ok()) {
-                  ctx->SetStatus(s);
-                } else if (elem.end_of_sequence) {
-                  ctx->SetStatus(errors::OutOfRange("End of sequence"));
-                } else {
-                  for (int i = 0; i < elem.value.size(); ++i) {
-                    ctx->set_output(i, elem.value[i]);
-                  }
-                }
-                done();
-              },
-              std::placeholders::_1, std::move(done));
-
-          IteratorContext::Params params(ctx);
-          params.function_library = iterator->function_library();
-          IteratorContext iter_ctx(std::move(params));
-          iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
-                                     callback);
-          iterator->Unref();
+
+    MultiDeviceIteratorCallback callback = std::bind(
+        [ctx](const HostBufferElement& elem, DoneCallback done) {
+          // iterator->Unref();
+          Status s = elem.status;
+          if (!s.ok()) {
+            ctx->SetStatus(s);
+          } else if (elem.end_of_sequence) {
+            ctx->SetStatus(errors::OutOfRange("End of sequence"));
+          } else {
+            for (int i = 0; i < elem.value.size(); ++i) {
+              ctx->set_output(i, elem.value[i]);
+            }
+          }
+          done();
         },
-        std::move(done)));
-  }
+        std::placeholders::_1, std::move(done));
 
- private:
-  BackgroundWorker background_worker_;
+    iterator->GetNextFromShard(ctx, shard_num, incarnation_id, callback);
+    iterator->Unref();
+  }
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 6047dc5f3f46fa20878825417bac1a06aacd7c15..17094e3001738becdbc3bf4d98aaaa6a9917d054 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -14,26 +14,11 @@ limitations under the License.
 ==============================================================================*/
 #include <map>
 
-#include "tensorflow/core/common_runtime/device_mgr.h"
-#include "tensorflow/core/common_runtime/graph_runner.h"
-#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/grappler_item_builder.h"
-#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -71,235 +56,20 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphRewriteDataset {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const std::vector<string>& optimizations,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          optimized_input_(nullptr),
-          input_(input),
-          optimizations_(optimizations),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override {
-      input_->Unref();
-      if (optimized_input_) {
-        optimized_input_->Unref();
-      }
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      // We do not add a token for the optimization dataset to the prefix. The
-      // prefix is used to identify checkpoint elements and since the
-      // optimization dataset is excluded from the checkpoint, adding a token
-      // here would result in invalid checkpoint identifiers.
-      return absl::make_unique<Iterator>(Iterator::Params{this, prefix});
-    }
-
-    Status Optimize(OpKernelContext* ctx) {
-      GraphDefBuilder b;
-      DatasetGraphDefBuilder db(&b);
-      Node* input_node = nullptr;
-      SerializationContext::Params params;
-      std::vector<std::pair<string, Tensor>> input_list;
-      params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
-      params.input_list = &input_list;
-      params.optimization_only = true;
-      SerializationContext serialization_ctx(params);
-      TF_RETURN_IF_ERROR(
-          db.AddInputDataset(&serialization_ctx, input_, &input_node));
-      string output_node = input_node->name();
-
-      GraphDef graph_def;
-      TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-      VLOG(3) << "Before optimization: " << graph_def.DebugString();
-
-      TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
-      VLOG(3) << "After optimization: " << graph_def.DebugString();
-
-      // Instantiate the optimized input pipeline by running the optimized graph
-      // using the optimized function library.
-      TF_RETURN_IF_ERROR(
-          ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
-
-      // Create a FunctionHandleCache.
-      function_handle_cache_ = absl::make_unique<FunctionHandleCache>(lib_);
-
-      // Some functions may have been modified without having their names
-      // changed (for example, nested dataset graphs from FlatMap or
-      // Interleave). To avoid name conflicts, we remove these functions from
-      // flib_def_ before adding the optimized function library.
-      for (const FunctionDef& fd : graph_def.library().function()) {
-        if (flib_def_->Find(fd.signature().name()) != nullptr) {
-          TF_RETURN_IF_ERROR(flib_def_->RemoveFunction(fd.signature().name()));
-        }
-      }
-      TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library()));
-
-      Graph graph(OpRegistry::Global());
-      TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
-      std::vector<Tensor> outputs;
-      GraphRunner graph_runner(ctx->function_library()->device());
-
-      TF_RETURN_IF_ERROR(
-          graph_runner.Run(&graph, lib_, input_list, {output_node}, &outputs));
-      TF_RETURN_IF_ERROR(
-          GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
-      optimized_input_->Ref();
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
+        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
+          optimizations_(optimizations) {}
 
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
-    int64 Cardinality() const override { return input_->Cardinality(); }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      // We only serialize the optimized dataset to avoid re-running
-      // optimizations when the input pipeline is restored from a checkpoint.
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, optimized_input_, output));
-      return Status::OK();
-    }
-
    private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-
-      Status Initialize(IteratorContext* ctx) override {
-        IteratorContext::Params params(ctx);
-        params.lib = dataset()->lib_;
-        params.function_handle_cache = dataset()->function_handle_cache_.get();
-        return dataset()->optimized_input_->MakeIterator(
-            IteratorContext(std::move(params)), prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        IteratorContext::Params params(ctx);
-        params.lib = dataset()->lib_;
-        params.function_handle_cache = dataset()->function_handle_cache_.get();
-        return input_impl_->GetNext(IteratorContext(std::move(params)),
-                                    out_tensors, end_of_sequence);
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        return Status::OK();
-      }
-
-     private:
-      std::unique_ptr<IteratorBase> input_impl_;
-    };
-
-    void AddFakeSinks(FunctionDef* function_def) {
-      int counter = 0;
-      for (const auto& output : function_def->signature().output_arg()) {
-        NodeDef* node = function_def->add_node_def();
-        tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
-            strings::StrCat("FakeSink", counter++), function_def, node);
-        node->set_op("Identity");
-        node->add_input(function_def->ret().at(output.name()));
-        (*node->mutable_attr())["T"].set_type(output.type());
-
-        (*function_def->mutable_ret())[output.name()] =
-            strings::StrCat(node->name(), ":output:0");
-      }
-    }
-
-    void RemoveFakeSinks(FunctionDef* function_def) {
-      // Map from identity node names to their input tensor strings
-      std::map<string, string> identity_map;
-      for (const auto& node : function_def->node_def()) {
-        if (node.op() == "Identity" && node.input_size() == 1) {
-          identity_map[node.name()] = node.input(0);
-        }
-      }
-      for (const auto& output_arg : function_def->signature().output_arg()) {
-        const string& tensor = function_def->ret().at(output_arg.name());
-        const string& output_node = tensor.substr(0, tensor.find(':'));
-        if (identity_map.find(output_node) != identity_map.end()) {
-          (*function_def->mutable_ret())[output_arg.name()] =
-              identity_map.at(output_node);
-        }
-      }
-    }
-
-    Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
-                              string* output_node) {
-      // Add an identity node as the fetch node, otherwise we might get
-      // 'placeholder is both fed and fetched' errors in some cases when using
-      // input list with placeholder dataset nodes.
-      NodeDef* node = graph_def->mutable_node()->Add();
-      tensorflow::grappler::graph_utils::SetUniqueGraphNodeName(
-          "Sink", graph_def, node);
-      node->set_op("Identity");
-      node->add_input(*output_node);
-      (*node->mutable_attr())["T"].set_type(DT_VARIANT);
-      *output_node = node->name();
-
-      // Add fake sink node to graph and functions to allow rewriting the actual
-      // sink nodes.
-      // TODO(b/118820916): When MetaOptimizer adds provisions for function
-      // retvals to be optimizable, we will no longer need this.
-      for (auto& function_def :
-           *graph_def->mutable_library()->mutable_function()) {
-        AddFakeSinks(&function_def);
-      }
-
-      // Create metagraph.
-      MetaGraphDef meta_graph_def;
-      (*meta_graph_def.mutable_graph_def()) = *graph_def;
-
-      // Grappler determines fetch ops from collection 'train_op'.
-      CollectionDef collection_def;
-      auto node_list = collection_def.mutable_node_list();
-      node_list->add_value(*output_node);
-      (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
-
-      // Create Grappler item.
-      tensorflow::grappler::ItemConfig item_config;
-      item_config.apply_optimizations = true;
-      std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
-          tensorflow::grappler::GrapplerItemFromMetaGraphDef(
-              "graph", meta_graph_def, item_config);
-      std::unordered_map<string, tensorflow::DeviceProperties> device_map;
-      tensorflow::grappler::VirtualCluster cluster(device_map);
-
-      // Run data optimizer using grappler's meta optimizer.
-      tensorflow::ConfigProto config;
-      RewriterConfig& rewriter_config =
-          *config.mutable_graph_options()->mutable_rewrite_options();
+    RewriterConfig CreateGrapplerRewriteConfig() override {
+      RewriterConfig rewriter_config;
       rewriter_config.add_optimizers(kOptimizerName);
       rewriter_config.set_meta_optimizer_iterations(
           RewriterConfig_NumIterationsType_ONE);
@@ -311,30 +81,10 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       for (const auto& opt : optimizations_) {
         custom_optimizations_list->add_s(opt);
       }
-
-      TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-          *grappler_item, config, ctx->device(), &cluster, graph_def));
-
-      // Remove fake sinks after optimizations are done.
-      // TODO(b/118820916): When MetaOptimizer adds provisions for function
-      // retvals to be optimizable, we will no longer need this.
-      for (auto& function_def :
-           *graph_def->mutable_library()->mutable_function()) {
-        RemoveFakeSinks(&function_def);
-      }
-
-      return Status::OK();
+      return rewriter_config;
     }
 
-    DatasetBase* optimized_input_;
-    FunctionLibraryRuntime* lib_ = nullptr;
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
-    std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
-    std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
-    const DatasetBase* input_;
     const std::vector<string> optimizations_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
   };
 
   const int graph_def_version_;
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index a406f7467fe1a1d221ee1d5bd9b2e858fb0044d3..473dbebd3062486de3cd48764ed45d9a059832d9 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -23,133 +23,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-class OptionalNoneOp : public OpKernel {
- public:
-  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
-  }
-};
-
-class OptionalFromValueOp : public OpKernel {
- public:
-  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OpInputList components_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
-    std::vector<Tensor> components(components_input.begin(),
-                                   components_input.end());
-    OP_REQUIRES_OK(
-        ctx, WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
-  }
-};
-
-class OptionalHasValueOp : public OpKernel {
- public:
-  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    Tensor* result;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
-    result->scalar<bool>()() = optional->has_value();
-  }
-};
-
-class OptionalGetValueOp : public OpKernel {
- public:
-  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES(
-        ctx, output_shapes_.size() == output_types_.size(),
-        errors::InvalidArgument(
-            "output_types and output_shapes must be same length, got:\n",
-            "output_types: ", output_types_.size(), "\n",
-            "output_shapes: ", output_shapes_.size()));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    OP_REQUIRES(
-        ctx, optional->has_value(),
-        errors::InvalidArgument("The given optional does not have a value."));
-    const auto& components = optional->get_values();
-    OP_REQUIRES(ctx, components.size() == output_types_.size(),
-                errors::InvalidArgument(
-                    "The given optional has ", components.size(),
-                    " components, expected ", output_types_.size()));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx, components[i].dtype() == output_types_[i],
-          errors::InvalidArgument(
-              "The given optional does not match the expected type for "
-              "component ",
-              i, ". Expected: ", DataTypeString(output_types_[i]),
-              ". Actual: ", DataTypeString(components[i].dtype()), "."));
-      OP_REQUIRES(ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."));
-      ctx->set_output(i, components[i]);
-    }
-  }
-
- private:
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
-    OptionalFromValueOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
-    OptionalFromValueOp);
-
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("has_value")
-                            .Priority(1),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalGetValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
-                        OptionalGetValueOp);
-
 static Status OptionalDeviceCopy(
     const OptionalVariant& from, OptionalVariant* to,
     const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
@@ -190,6 +63,75 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(OptionalVariant,
 
 }  // namespace
 
+void OptionalNoneOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
+}
+
+void OptionalFromValueOp::Compute(OpKernelContext* ctx) {
+  OpInputList components_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
+  std::vector<Tensor> components(components_input.begin(),
+                                 components_input.end());
+  OP_REQUIRES_OK(ctx,
+                 WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
+}
+
+void OptionalHasValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  Tensor* result;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
+  result->scalar<bool>()() = optional->has_value();
+}
+
+void OptionalGetValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  OP_REQUIRES(
+      ctx, optional->has_value(),
+      errors::InvalidArgument("The given optional does not have a value."));
+  const auto& components = optional->get_values();
+  OP_REQUIRES(
+      ctx, components.size() == output_types_.size(),
+      errors::InvalidArgument("The given optional has ", components.size(),
+                              " components, expected ", output_types_.size()));
+  for (int i = 0; i < components.size(); ++i) {
+    OP_REQUIRES(ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."));
+    OP_REQUIRES(ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."));
+    ctx->set_output(i, components[i]);
+  }
+}
+
 Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
                                       std::vector<Tensor> value) {
   OptionalVariant v(std::move(value));
@@ -213,6 +155,33 @@ Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index) {
   return Status::OK();
 }
 
+namespace {
+
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
+    OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
+    OptionalFromValueOp);
+
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("has_value")
+                            .Priority(1),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalGetValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
+                        OptionalGetValueOp);
+
+}  // namespace
+
 REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
                                          DEVICE_CPU, OptionalVariant,
                                          OptionalZerosLike<CPUDevice>);
@@ -221,12 +190,5 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           OptionalVariant,
                                           OptionalBinaryAdd<CPUDevice>);
 
-Status OptionalShape(const OptionalVariant& x, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(OptionalVariant, OptionalShape);
-
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index 7089a423d7302decd2e13a6496307e7520e88066..24eb1b81d903b391d413cbfc9b10499c84125a40 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -152,6 +152,47 @@ Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a,
   return Status::OK();
 }
 
+class OptionalNoneOp : public OpKernel {
+ public:
+  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalFromValueOp : public OpKernel {
+ public:
+  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalHasValueOp : public OpKernel {
+ public:
+  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalGetValueOp : public OpKernel {
+ public:
+  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES(
+        ctx, output_shapes_.size() == output_types_.size(),
+        errors::InvalidArgument(
+            "output_types and output_shapes must be same length, got:\n",
+            "output_types: ", output_types_.size(), "\n",
+            "output_shapes: ", output_shapes_.size()));
+  }
+
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index ddd81d4596ee216c1abd6a17ec94d86c3d41e18c..4dd5c379c039194ce57018403738d57667ff133e 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -211,9 +212,6 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                 "data_parallel_interleave_worker_pool",
                 port::NumSchedulableCPUs() /* num_threads */,
                 false /* low_latency_hint */)) {
-        std::vector<string> components =
-            str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        key_prefix_ = components.back();
       }
 
       ~ParallelInterleaveIterator() override {
@@ -506,7 +504,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
             stats_aggregator->AddScalar(
-                strings::StrCat(key_prefix_, "::thread_utilization"),
+                stats_utils::ThreadUtilizationScalarName(
+                    dataset()->node_name()),
                 static_cast<float>(num_calls_) /
                     static_cast<float>(num_parallel_calls_->value));
           }
@@ -518,17 +517,15 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         if (!current_elements_manager_) {
           auto new_ctx = std::make_shared<IteratorContext>(*ctx);
-          current_elements_manager_ =
-              absl::WrapUnique<Thread>(ctx->env()->StartThread(
-                  {}, "tf_data_parallel_interleave_current",
-                  [this, new_ctx]() { CurrentElementsManager(new_ctx); }));
+          current_elements_manager_ = ctx->StartThread(
+              "tf_data_parallel_interleave_current",
+              [this, new_ctx]() { CurrentElementsManager(new_ctx); });
         }
         if (!future_elements_manager_) {
           auto new_ctx = std::make_shared<IteratorContext>(*ctx);
-          future_elements_manager_ =
-              absl::WrapUnique<Thread>(ctx->env()->StartThread(
-                  {}, "tf_data_parallel_interleave_future",
-                  [this, new_ctx]() { FutureElementsManager(new_ctx); }));
+          future_elements_manager_ = ctx->StartThread(
+              "tf_data_parallel_interleave_future",
+              [this, new_ctx]() { FutureElementsManager(new_ctx); });
         }
       }
 
@@ -567,7 +564,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(key_prefix_, "::thread_utilization"),
+              stats_utils::ThreadUtilizationScalarName(dataset()->node_name()),
               static_cast<float>(num_calls_) /
                   static_cast<float>(num_parallel_calls_->value));
         }
@@ -620,7 +617,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
           const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
             stats_aggregator->AddScalar(
-                strings::StrCat(key_prefix_, "::thread_utilization"),
+                stats_utils::ThreadUtilizationScalarName(
+                    dataset()->node_name()),
                 static_cast<float>(num_calls_) /
                     static_cast<float>(num_parallel_calls_->value));
           }
@@ -917,7 +915,6 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
       // Identifies whether background threads should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
-      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 34f341d1d12c02c3900cba2741a5cd38f2b73e9c..165babc9be79831a2c52e4e40b6b5c014cdfc3cf 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -46,6 +46,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
     OP_REQUIRES_OK(
         ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
+    OP_REQUIRES_OK(
+        ctx, ComputeShortCircuitIndices(ctx, func_, &short_circuit_indices_));
   }
 
  protected:
@@ -64,17 +66,14 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    std::vector<int> indices;
-    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
-
     if (num_parallel_calls == model::kAutoTune) {
       metrics::RecordTFDataAutotune(kDatasetName);
     }
 
-    *output =
-        new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
-                    output_shapes_, use_inter_op_parallelism_, sloppy_,
-                    std::move(captured_func), indices, preserve_cardinality_);
+    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
+                          output_shapes_, use_inter_op_parallelism_, sloppy_,
+                          std::move(captured_func), short_circuit_indices_,
+                          preserve_cardinality_);
   }
 
  private:
@@ -97,7 +96,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           sloppy_(sloppy),
           preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
-          indices_(indices),
+          short_circuit_indices_(indices),
           can_move_(indices.empty() ? std::vector<bool>()
                                     : ComputeMoveVector(indices)) {
       input_->Ref();
@@ -108,7 +107,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
-      if (indices_.empty()) {
+      if (short_circuit_indices_.empty()) {
         parallel_map_functor =
             absl::make_unique<ParallelMapDatasetFunctor>(this);
       } else {
@@ -215,17 +214,19 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         const std::vector<Tensor>& captured_inputs =
             dataset_->captured_func_->captured_inputs();
         size_t num_args = input_element.size();
-        for (size_t i = 0; i < dataset_->indices_.size(); ++i) {
-          if (dataset_->indices_[i] < num_args) {
+        for (size_t i = 0; i < dataset_->short_circuit_indices_.size(); ++i) {
+          if (dataset_->short_circuit_indices_[i] < num_args) {
             if (dataset_->can_move_[i]) {
-              result->push_back(
-                  std::move(input_element[dataset_->indices_[i]]));
+              result->push_back(std::move(
+                  input_element[dataset_->short_circuit_indices_[i]]));
             } else {
-              result->push_back(input_element[dataset_->indices_[i]]);
+              result->push_back(
+                  input_element[dataset_->short_circuit_indices_[i]]);
             }
           } else {
             result->push_back(
-                captured_inputs[dataset_->indices_[i] - num_args]);
+                captured_inputs[dataset_->short_circuit_indices_[i] -
+                                num_args]);
           }
         }
         done(Status::OK());
@@ -278,7 +279,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const bool sloppy_;
     const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const std::vector<int> indices_;
+    const std::vector<int> short_circuit_indices_;
     const std::vector<bool> can_move_;
   };
 
@@ -288,6 +289,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
   bool sloppy_;
   bool preserve_cardinality_;
   NameAttrList func_;
+  std::vector<int> short_circuit_indices_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelMapDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index be91de12fe74a39919fa68bd12d60d9c9ac04ac2..3b0d6d7a44962dd48be6db5727e11527d9d91f94 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
@@ -57,9 +58,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
             params.num_parallel_calls, mu_, cond_var_)),
         sloppy_(params.sloppy),
         preserve_cardinality_(params.preserve_cardinality) {
-    std::vector<string> components =
-        str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
-    key_prefix_ = components.back();
+    key_prefix_ = base_params.dataset->node_name();
   }
 
   ~ParallelMapIterator() override {
@@ -192,9 +191,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
       auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
-      runner_thread_.reset(ctx->env()->StartThread(
-          {}, "tf_data_parallel_map",
-          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
+      runner_thread_ = ctx->StartThread(
+          "tf_data_parallel_map",
+          std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy));
     }
   }
 
@@ -206,7 +205,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
     const auto& stats_aggregator = ctx->stats_aggregator();
     if (stats_aggregator) {
       stats_aggregator->AddScalar(
-          strings::StrCat(key_prefix_, "::thread_utilization"),
+          stats_utils::ThreadUtilizationScalarName(key_prefix_),
           static_cast<float>(num_calls_) /
               static_cast<float>(num_parallel_calls_->value));
     }
@@ -301,7 +300,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(key_prefix_, "::thread_utilization"),
+              stats_utils::ThreadUtilizationScalarName(key_prefix_),
               static_cast<float>(num_calls_) /
                   static_cast<float>(num_parallel_calls_->value));
         }
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index f0e835a27c9775aadad107ca1f274275cc44f622..d716ceca9942d51d648bc48b466f96bbbf5489de 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/stats_utils.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -79,11 +80,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
    public:
     explicit Iterator(const Params& params)
         : DatasetIterator<Dataset>(params),
-          auto_tuner_(params.dataset->buffer_size_) {
-      std::vector<string> components =
-          str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-      prefix_end_ = components.back();
-    }
+          auto_tuner_(params.dataset->buffer_size_) {}
 
     ~Iterator() override {
       // Signal the prefetch thread to terminate it. We will then
@@ -143,10 +140,10 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       mutex_lock l(mu_);
       if (stats_aggregator) {
         stats_aggregator->AddScalar(
-            strings::StrCat(prefix_end_, "::buffer_size"),
+            stats_utils::BufferSizeScalarName(dataset()->node_name()),
             static_cast<float>(buffer_.size()));
         stats_aggregator->AddScalar(
-            strings::StrCat(prefix_end_, "::buffer_capacity"),
+            stats_utils::BufferCapacityScalarName(dataset()->node_name()),
             static_cast<float>(auto_tuner_.buffer_limit()));
       }
       return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
@@ -236,14 +233,14 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       const auto& stats_aggregator = ctx->stats_aggregator();
       if (stats_aggregator) {
         stats_aggregator->AddToHistogram(
-            strings::StrCat(prefix_end_, "::buffer_utilization"),
+            stats_utils::BufferUtilizationHistogramName(dataset()->node_name()),
             {static_cast<float>(buffer_.size()) /
              static_cast<float>(auto_tuner_.buffer_limit())});
         stats_aggregator->AddScalar(
-            strings::StrCat(prefix_end_, "::buffer_size"),
+            stats_utils::BufferSizeScalarName(dataset()->node_name()),
             static_cast<float>(buffer_.size()));
         stats_aggregator->AddScalar(
-            strings::StrCat(prefix_end_, "::buffer_capacity"),
+            stats_utils::BufferCapacityScalarName(dataset()->node_name()),
             static_cast<float>(auto_tuner_.buffer_limit()));
       }
       // A new element is available. Forward the status from computing it, and
@@ -271,9 +268,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       if (!prefetch_thread_) {
         std::shared_ptr<IteratorContext> new_ctx =
             std::make_shared<IteratorContext>(*ctx);
-        prefetch_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
-            {}, "tf_data_prefetch",
-            [this, new_ctx]() { PrefetchThread(new_ctx); }));
+        prefetch_thread_ = ctx->StartThread(
+            "tf_data_prefetch", [this, new_ctx]() { PrefetchThread(new_ctx); });
       }
       return Status::OK();
     }
@@ -286,8 +282,6 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       RecordStart(ctx.get());
       auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
       while (true) {
-        std::vector<Tensor> value;
-
         // 1. Wait for a slot in the buffer.
         {
           mutex_lock l(mu_);
@@ -375,7 +369,6 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     mutex parent_mu_ ACQUIRED_BEFORE(mu_);
     std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
     condition_variable cond_var_;
-    string prefix_end_;
     PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
     std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
     std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index aa14d27d5c3ebec797174d5aecf89dd217fe8f3b..87390ad512fcbf0481a0f5c4241d864d0c99cee6 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -64,7 +64,7 @@ class RangeDatasetOp : public DatasetOpKernel {
 
     const std::vector<PartialTensorShape>& output_shapes() const override {
       static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
+          new std::vector<PartialTensorShape>({PartialTensorShape({})});
       return *shapes;
     }
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op_test.cc b/tensorflow/core/kernels/data/range_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bfe091fd524b76a633d1a7f89455f759d2484f94
--- /dev/null
+++ b/tensorflow/core/kernels/data/range_dataset_op_test.cc
@@ -0,0 +1,421 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kOpName[] = "RangeDataset";
+
+class RangeDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new RangeDataset op kernel context.
+  Status CreateRangeDatasetContext(
+      int64 start, int64 end, int64 step, OpKernel* const range_kernel,
+      std::unique_ptr<OpKernelContext>* range_context) {
+    inputs_.clear();
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {start}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {end}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {step}));
+
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(range_kernel, &inputs_, range_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*range_kernel, inputs_));
+    return Status::OK();
+  }
+
+ private:
+  gtl::InlinedVector<TensorValue, 4> inputs_;
+};
+
+struct GetNextTestParams {
+  explicit GetNextTestParams(int64 input_start, int64 input_end,
+                             int64 input_step)
+      : start(input_start), end(input_end), step(input_step) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+};
+
+struct DatasetGetNextTest : RangeDatasetOpTest,
+                            ::testing::WithParamInterface<GetNextTestParams> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  GetNextTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+  std::vector<int> expected_values;
+  for (int i = params.start; (params.end - i) * params.step > 0;
+       i = i + params.step) {
+    expected_values.reserve(1);
+    expected_values.emplace_back(i);
+  }
+  EXPECT_EQ(out_tensors.size(), expected_values.size());
+  for (size_t i = 0; i < out_tensors.size(); ++i) {
+    int64 actual_value = out_tensors[i].flat<int64>()(0);
+    int64 expect_value = expected_values[i];
+    EXPECT_EQ(actual_value, expect_value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(RangeDatasetOpTest, DatasetGetNextTest,
+                        ::testing::Values(GetNextTestParams(0, 10, 1),
+                                          GetNextTestParams(0, 10, 3),
+                                          GetNextTestParams(10, 0, -1),
+                                          GetNextTestParams(10, 0, -3)));
+
+TEST_F(RangeDatasetOpTest, DatasetName) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  EXPECT_EQ(range_dataset->type_string(), kOpName);
+}
+
+TEST_F(RangeDatasetOpTest, DatasetOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(range_dataset->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(RangeDatasetOpTest, DatasetOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(range_dataset->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(
+        range_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+struct CardinalityTestParams {
+  explicit CardinalityTestParams(int64 input_start, int64 input_end,
+                                 int64 input_step,
+                                 int input_expected_cardinality)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        expected_cardinality(input_expected_cardinality) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int expected_cardinality;
+};
+
+struct DatasetCardinalityTest
+    : RangeDatasetOpTest,
+      ::testing::WithParamInterface<CardinalityTestParams> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  CardinalityTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  EXPECT_EQ(range_dataset->Cardinality(), params.expected_cardinality);
+}
+
+INSTANTIATE_TEST_CASE_P(RangeDatasetOpTest, DatasetCardinalityTest,
+                        ::testing::Values(CardinalityTestParams(0, 10, 1, 10),
+                                          CardinalityTestParams(0, 10, 3, 4),
+                                          CardinalityTestParams(10, 0, -3, 4)));
+
+TEST_F(RangeDatasetOpTest, DatasetSave) {
+  int64 thread_num = 2, cpu_num = 2;
+  int start = 0, end = 10, step = 1;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(range_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(iterator->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputPrefix) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::Range");
+}
+
+struct RoundtripTestParams {
+  explicit RoundtripTestParams(int64 input_start, int64 input_end,
+                               int64 input_step, int input_breakpoint)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        breakpoint(input_breakpoint) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int breakpoint;
+};
+
+struct IteratorRoundtripTest
+    : RangeDatasetOpTest,
+      ::testing::WithParamInterface<RoundtripTestParams> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  RoundtripTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scoped_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  std::vector<Tensor> out_tensors;
+  bool end_of_sequence = false;
+  int64 cur_val = params.start - params.step;
+  for (int i = 0; i < params.breakpoint; i++) {
+    if (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_val = ((params.end - cur_val - params.step) * params.step > 0)
+                    ? cur_val + params.step
+                    : cur_val;
+    }
+  }
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+  VariantTensorDataReader reader(&data);
+  TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                 &end_of_sequence));
+  int64 expect_next = ((params.end - cur_val - params.step) * params.step > 0)
+                          ? cur_val + params.step
+                          : cur_val;
+  EXPECT_EQ(out_tensors.back().flat<int64>()(0), expect_next);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    RangeDatasetOpTest, IteratorRoundtripTest,
+    ::testing::Values(
+        RoundtripTestParams(0, 10, 2, 0),    // unused_iterator
+        RoundtripTestParams(0, 10, 2, 4),    // fully_used_iterator_increase
+        RoundtripTestParams(10, 0, -2, 4),   // fully_used_iterator_decrease
+        RoundtripTestParams(0, 10, 2, 6)));  // exhausted_iterator
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 789f9c859aab2df61d119b9bb6f6ddd88ce24681..c8e0e9ea9440987ab2a6e8edc87972e8985c0c87 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -29,6 +30,8 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following ops.
 
+constexpr char kTextLineDatasetName[] = "TextLine";
+
 class TextLineDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -91,8 +94,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::TextLine")});
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTextLineDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -142,6 +145,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
+              metrics::RecordTFDataBytesRead(kTextLineDatasetName,
+                                             line_contents.size());
               out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
                                         TensorShape({}));
               out_tensors->back().scalar<string>()() = std::move(line_contents);
@@ -268,9 +273,12 @@ class TextLineDatasetOp : public DatasetOpKernel {
 REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
                         TextLineDatasetOp);
 
+constexpr char kFixedLengthRecordDatasetName[] = "FixedLengthRecord";
+
 class FixedLengthRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
+
   explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx)
       : DatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {}
@@ -346,10 +354,12 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       if (compression_type_.empty()) {
         return absl::make_unique<UncompressedIterator>(
             UncompressedIterator::Params{
-                this, strings::StrCat(prefix, "::FixedLengthRecord")});
+                this,
+                strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       } else {
         return absl::make_unique<CompressedIterator>(CompressedIterator::Params{
-            this, strings::StrCat(prefix, "::FixedLengthRecord")});
+            this,
+            strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       }
     }
 
@@ -411,6 +421,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               string record;
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
+              metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                             dataset()->record_bytes_);
+
               // Produce the record as output.
               Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
@@ -532,6 +545,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
                 string record;
                 TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
                     dataset()->record_bytes_, &record));
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
+
                 // Produce the record as output.
                 Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
                 record_tensor.scalar<string>()() = std::move(record);
@@ -544,6 +560,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               Status s = buffered_input_stream_->ReadNBytes(
                   dataset()->record_bytes_, &record);
               if (s.ok()) {
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
                 lookahead_cache_.append(record);
                 record = lookahead_cache_.substr(0, dataset()->record_bytes_);
                 lookahead_cache_ =
@@ -717,6 +735,8 @@ REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU),
                         FixedLengthRecordDatasetOp);
 
+constexpr char kTFRecordDatasetName[] = "TFRecord";
+
 class TFRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -766,8 +786,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return absl::make_unique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::TFRecord")});
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTFRecordDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -816,11 +836,19 @@ class TFRecordDatasetOp : public DatasetOpKernel {
             Status s =
                 reader_->ReadRecord(&out_tensors->back().scalar<string>()());
             if (s.ok()) {
+              metrics::RecordTFDataBytesRead(
+                  kTFRecordDatasetName,
+                  out_tensors->back().scalar<string>()().size());
               *end_of_sequence = false;
               return Status::OK();
             }
             out_tensors->pop_back();
             if (!errors::IsOutOfRange(s)) {
+              // In case of other errors e.g., DataLoss, we still move forward
+              // the file index so that it works with ignore_errors.
+              // Otherwise the same file will repeat.
+              ResetStreamsLocked();
+              ++current_file_index_;
               return s;
             }
 
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op_test.cc b/tensorflow/core/kernels/data/repeat_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61f314c5d1c524c14b9ffa8df3bd5b53acb9e896
--- /dev/null
+++ b/tensorflow/core/kernels/data/repeat_dataset_op_test.cc
@@ -0,0 +1,560 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "repeat_dataset";
+constexpr char kOpName[] = "RepeatDataset";
+
+class RepeatDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates `TensorSliceDataset` variant tensor from the input vector of
+  // tensors.
+  Status CreateTensorSliceDatasetTensor(
+      std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) {
+    DatasetBase *tensor_slice_dataset;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDataset(
+        "tensor_slice_node", tensor_vector, &tensor_slice_dataset));
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(tensor_slice_dataset, dataset_tensor));
+    return Status::OK();
+  }
+
+  // Creates a new `RepeatDataset` op kernel.
+  Status CreateRepeatDatasetKernel(
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "count"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Create a new `RepeatDataset` op kernel context.
+  Status CreateRepeatDatasetContext(
+      OpKernel *op_kernel, gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestCase {
+  std::vector<Tensor> input_tensors;
+  int64 count;
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+TestCase FiniteRepeatTestCase() {
+  return {
+      /*input_tensors*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{2, 1}, {"a", "b"})},
+      /*count*/ 2,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"b"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"b"})},
+      /*expected_output_dtypes*/ {DT_INT64, DT_STRING},
+      /*expected_output_shapes*/
+      {PartialTensorShape({2}), PartialTensorShape({1})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 1, 3}};
+}
+
+TestCase EmptyRepeatTestCase() {
+  return {
+      /*input_tensors*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{2, 1}, {"a", "b"})},
+      /*count*/ 0,
+      /*expected_outputs*/
+      {},
+      /*expected_output_dtypes*/ {DT_INT64, DT_STRING},
+      /*expected_output_shapes*/
+      {PartialTensorShape({2}), PartialTensorShape({1})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 1, 3}};
+}
+
+TestCase ForeverRepeatTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 1}, {1, 2})},
+          /*count*/ -1,
+          /*expected_outputs*/
+          // Use the first group of the repeated tensors to represent the
+          // infinite outputs.
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {2})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({1})},
+          /*expected_cardinality*/ -1,
+          /*breakpoints*/ {0, 1, 3}};
+}
+
+class ParameterizedDatasetTest
+    : public RepeatDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParameterizedDatasetTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+
+  if (test_case.count < 0) {
+    // We test only a finite number of steps of the infinite sequence.
+    for (int i = 0; i < 100; ++i) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      for (const auto &tensor : out_tensors) {
+        TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+        expected_outputs_it++;
+        // In the forever-repeat test case, the first group of the repeated
+        // tensors is used to represent the expected outputs, so the iterator
+        // of the expected outputs needs to be reset once it reaches the end.
+        if (expected_outputs_it == test_case.expected_outputs.end()) {
+          expected_outputs_it = test_case.expected_outputs.begin();
+        }
+      }
+    }
+    EXPECT_FALSE(end_of_sequence);
+  } else {
+    while (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        for (const auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+    }
+    EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+  }
+}
+
+TEST_F(RepeatDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = FiniteRepeatTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  EXPECT_EQ(repeat_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+  TF_EXPECT_OK(VerifyTypesMatch(repeat_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+  TF_EXPECT_OK(VerifyShapesCompatible(repeat_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  EXPECT_EQ(repeat_dataset->Cardinality(), GetParam().expected_cardinality);
+}
+
+TEST_F(RepeatDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = FiniteRepeatTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(repeat_dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  if (test_case.count < 0) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::ForeverRepeat");
+  } else if (test_case.count == 0) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::EmptyRepeat");
+  } else {
+    EXPECT_EQ(iterator->prefix(), "Iterator::FiniteRepeat");
+  }
+}
+
+TEST_P(ParameterizedDatasetTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = repeat_dataset->Cardinality() == 0;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        for (auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+      cur_iteration++;
+      if (test_case.count < 0 &&
+          expected_outputs_it == test_case.expected_outputs.end()) {
+        expected_outputs_it = test_case.expected_outputs.begin();
+      }
+    }
+
+    if (breakpoint >= repeat_dataset->Cardinality()) {
+      if (test_case.count < 0) {
+        EXPECT_FALSE(end_of_sequence);
+      } else {
+        EXPECT_TRUE(end_of_sequence);
+        EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+      }
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(RepeatDatasetOpTest, ParameterizedDatasetTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {FiniteRepeatTestCase(), EmptyRepeatTestCase(),
+                              ForeverRepeatTestCase()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb64911aa802c6639229be689237db7296558f4
--- /dev/null
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -0,0 +1,195 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/util/batch_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ShardDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ShardDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 index = 0;
+    int64 num_shards = 0;
+
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "num_shards", &num_shards));
+    OP_REQUIRES(
+        ctx, num_shards > 0,
+        errors::InvalidArgument("Number of shards must be greater than zero "
+                                "(currently num_shards = ",
+                                num_shards, ")."));
+
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "index", &index));
+    OP_REQUIRES(
+        ctx, index >= 0 && index < num_shards,
+        errors::InvalidArgument("Index must be between 0 and ", num_shards - 1,
+                                " (currently index = ", index, ")."));
+
+    *output = new Dataset(ctx, num_shards, index, input);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 num_shards, int64 index,
+            const DatasetBase* input)
+        : DatasetBase(DatasetContext(ctx)),
+          num_shards_(num_shards),
+          index_(index),
+          input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Shard")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return strings::StrCat("ShardDatasetOp(", num_shards_, ", ", index_,
+                             ")::Dataset");
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / num_shards_ + (index_ < n % num_shards_ ? 1 : 0);
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* num_shards = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(num_shards_, &num_shards));
+      Node* index = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(index_, &index));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, num_shards, index}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), next_index_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        std::vector<Tensor> result;
+        do {
+          result.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &result, end_of_sequence));
+          if (*end_of_sequence) {
+            input_impl_.reset();
+            return Status::OK();
+          }
+        } while ((next_index_++ % dataset()->num_shards_) != dataset()->index_);
+
+        *out_tensors = std::move(result);
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->num_shards_);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("next_index"), next_index_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("next_index"), &next_index_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      int64 next_index_ GUARDED_BY(mu_);
+    };
+
+    const int64 num_shards_;
+    const int64 index_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShardDataset").Device(DEVICE_CPU),
+                        ShardDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index e0c435718ac46ee9af1ce404e2bdfa0ba31c3044..1a193b1d235ca65681e4e1662592cc1898499244 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -468,8 +468,8 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
         // resource_mgr.
         ResourceMgr* mgr = ctx->resource_mgr();
         RandomSeedGenerator* seed_generator;
-        const string name = strings::StrCat(prefix(), "::", dataset()->name(),
-                                            "::RandomSeedGenerator");
+        const string name = strings::StrCat(
+            prefix(), "::", dataset()->type_string(), "::RandomSeedGenerator");
 
         int64 dataset_seed, dataset_seed2;
         {
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5421aeb37a07c9d80479527bb23a557859bebb3
--- /dev/null
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
@@ -0,0 +1,510 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "sparse_tensor_slice_dataset";
+constexpr char kOpName[] = "SparseTensorSliceDataset";
+
+class SparseTensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new SparseTensorSliceDataset op kernel.
+  Status CreateSparseTensorSliceDatasetKernel(
+      DataType tvalues, std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(kNodeName, kOpName,
+                                     {"indices", "values", "dense_shape"},
+                                     {{"Tvalues", tvalues}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new SparseTensorSliceDataset op kernel context.
+  Status CreateSparseTensorSliceDatasetContext(
+      OpKernel *const op_kernel, gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct SparseTensorParam {
+  Tensor indices;
+  Tensor values;
+  Tensor dense_shape;
+};
+
+struct TestCase {
+  SparseTensorParam input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs;
+  std::vector<int> breakpoints;
+};
+
+std::vector<TestCase> TestCases() {
+  return {
+      {{{DatasetOpsTestBase::CreateTensor<int64>({2, 2}, {0, 0, 1, 1})},
+        {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+        {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+       {{{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {0})},
+         {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+         {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {1})},
+         {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+         {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}}},
+       {0, 1, 2}},  // 2-D sparse tensor
+      {{{DatasetOpsTestBase::CreateTensor<int64>({2, 3}, {0, 0, 0, 1, 1, 1})},
+        {DatasetOpsTestBase::CreateTensor<double>({2}, {888.0, 999.0})},
+        {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+       {{{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {0, 0})},
+         {DatasetOpsTestBase::CreateTensor<double>({1}, {888.0})},
+         {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {1, 1})},
+         {DatasetOpsTestBase::CreateTensor<double>({1}, {999.0})},
+         {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}}},
+       {0, 1, 2}},  // 3-D sparse tensor
+      {{{DatasetOpsTestBase::CreateTensor<int64>({2, 4},
+                                                 {0, 0, 0, 0, 1, 1, 1, 1})},
+        {DatasetOpsTestBase::CreateTensor<string>({2}, {"a", "b"})},
+        {DatasetOpsTestBase::CreateTensor<int64>({4}, {3, 2, 2, 2})}},
+       {{{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {0, 0, 0})},
+         {DatasetOpsTestBase::CreateTensor<string>({1}, {"a"})},
+         {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {1, 1, 1})},
+         {DatasetOpsTestBase::CreateTensor<string>({1}, {"b"})},
+         {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({0, 3}, {})},
+         {DatasetOpsTestBase::CreateTensor<string>({0}, {})},
+         {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}}},
+       {0, 1, 3}},  // 4-D sparse tensor
+      {{{DatasetOpsTestBase::CreateTensor<int64>(
+            {2, 5}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1})},
+        {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+        {DatasetOpsTestBase::CreateTensor<int64>({5}, {3, 2, 2, 2, 2})}},
+       {{{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {0, 0, 0, 0})},
+         {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+         {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {1, 1, 1, 1})},
+         {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+         {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+        {{DatasetOpsTestBase::CreateTensor<int64>({0, 4}, {})},
+         {DatasetOpsTestBase::CreateTensor<int32>({0}, {})},
+         {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}}},
+       {0, 1, 3}}  // 5-D sparse tensor
+  };
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(
+        CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(
+        dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+    bool end_of_sequence = false;
+    std::vector<Tensor> out_tensors;
+    int cur_slice = 0;
+    while (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        TF_EXPECT_OK(
+            ExpectEqual(out_tensors[0], expected_outputs[cur_slice].indices));
+        TF_EXPECT_OK(
+            ExpectEqual(out_tensors[1], expected_outputs[cur_slice].values));
+        TF_EXPECT_OK(ExpectEqual(out_tensors[2],
+                                 expected_outputs[cur_slice].dense_shape));
+        cur_slice++;
+      }
+    }
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->type_string(), kOpName);
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    DataTypeVector expected_output_dtypes = {
+        expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+        expected_outputs[0].dense_shape.dtype()};
+    TF_EXPECT_OK(
+        VerifyTypesMatch(dataset->output_dtypes(), expected_output_dtypes));
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::vector<PartialTensorShape> expected_output_shapes = {
+        expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+        expected_outputs[0].dense_shape.shape()};
+    TF_EXPECT_OK(VerifyShapesCompatible(dataset->output_shapes(),
+                                        expected_output_shapes));
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    EXPECT_EQ(dataset->Cardinality(), expected_outputs.size());
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(
+        CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(
+        dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+    DataTypeVector expected_output_dtypes = {
+        expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+        expected_outputs[0].dense_shape.dtype()};
+    TF_EXPECT_OK(
+        VerifyTypesMatch(iterator->output_dtypes(), expected_output_dtypes));
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(
+        CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(
+        dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+    std::vector<PartialTensorShape> expected_output_shapes = {
+        expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+        expected_outputs[0].dense_shape.shape()};
+    TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                        expected_output_shapes));
+  }
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), strings::StrCat("Iterator::SparseTensorSlice"));
+}
+
+TEST_F(SparseTensorSliceDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  for (auto &test_case : TestCases()) {
+    SparseTensorParam input_sparse_tensor = test_case.input_sparse_tensor;
+    std::vector<SparseTensorParam> expected_outputs =
+        test_case.expected_outputs;
+    std::vector<int> breakpoints = test_case.breakpoints;
+    DataType tvalues = input_sparse_tensor.values.dtype();
+    gtl::InlinedVector<TensorValue, 4> inputs = {
+        &input_sparse_tensor.indices, &input_sparse_tensor.values,
+        &input_sparse_tensor.dense_shape};
+
+    std::unique_ptr<OpKernel> dataset_kernel;
+    TF_ASSERT_OK(
+        CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+    std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+    TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+        dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+    DatasetBase *dataset;
+    TF_ASSERT_OK(CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(),
+                               &dataset));
+    core::ScopedUnref scoped_unref(dataset);
+
+    std::unique_ptr<IteratorContext> iterator_ctx;
+    TF_ASSERT_OK(
+        CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(
+        dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+    std::unique_ptr<SerializationContext> serialization_ctx;
+    TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+    int cur_iteration = 0;
+    bool end_of_sequence = false;
+    int64 num_slices = input_sparse_tensor.dense_shape.dim_size(0);
+    std::vector<Tensor> out_tensors;
+
+    for (int breakpoint : breakpoints) {
+      while (cur_iteration < breakpoint) {
+        TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                       &end_of_sequence));
+        cur_iteration++;
+      }
+
+      if (breakpoint == 0) {
+        EXPECT_FALSE(end_of_sequence);
+      } else if (breakpoint <= num_slices) {
+        for (int i = 0; i < out_tensors.size(); ++i) {
+          TF_EXPECT_OK(ExpectEqual(
+              out_tensors[0], expected_outputs[cur_iteration - 1].indices));
+          TF_EXPECT_OK(ExpectEqual(out_tensors[1],
+                                   expected_outputs[cur_iteration - 1].values));
+          TF_EXPECT_OK(ExpectEqual(
+              out_tensors[2], expected_outputs[cur_iteration - 1].dense_shape));
+        }
+      } else {
+        EXPECT_TRUE(end_of_sequence);
+      }
+
+      VariantTensorData data;
+      VariantTensorDataWriter writer(&data);
+      TF_ASSERT_OK(iterator->Save(serialization_ctx.get(), &writer));
+      TF_ASSERT_OK(writer.Flush());
+      VariantTensorDataReader reader(&data);
+      TF_ASSERT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_utils.cc b/tensorflow/core/kernels/data/stats_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eefd92bc6655e32b4edd96835a47267981f1a5a5
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_utils.cc
@@ -0,0 +1,73 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/stats_utils.h"
+
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace data {
+namespace stats_utils {
+
+ABSL_CONST_INIT const char kDelimiter[] = "::";
+ABSL_CONST_INIT const char kExecutionTime[] = "execution_time";
+ABSL_CONST_INIT const char kThreadUtilization[] = "thread_utilization";
+ABSL_CONST_INIT const char kBufferSize[] = "buffer_size";
+ABSL_CONST_INIT const char kBufferCapacity[] = "buffer_capacity";
+ABSL_CONST_INIT const char kBufferUtilization[] = "buffer_utilization";
+ABSL_CONST_INIT const char kFilteredElements[] = "filtered_elements";
+ABSL_CONST_INIT const char kDroppedElements[] = "dropped_elements";
+ABSL_CONST_INIT const char kFeaturesCount[] = "features_count";
+ABSL_CONST_INIT const char kFeatureValuesCount[] = "feature_values_count";
+ABSL_CONST_INIT const char kExamplesCount[] = "examples_count";
+
+string ExecutionTimeHistogramName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kExecutionTime);
+}
+
+string ThreadUtilizationScalarName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kThreadUtilization);
+}
+
+string BufferSizeScalarName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kBufferSize);
+}
+
+string BufferCapacityScalarName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kBufferCapacity);
+}
+
+string BufferUtilizationHistogramName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kBufferUtilization);
+}
+
+string FilterdElementsScalarName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kFilteredElements);
+}
+
+string DroppedElementsScalarName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kDroppedElements);
+}
+
+string FeatureHistogramName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kFeaturesCount);
+}
+
+string FeatureValueHistogramName(const string& prefix) {
+  return strings::StrCat(prefix, kDelimiter, kFeatureValuesCount);
+}
+
+}  // namespace stats_utils
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_utils.h b/tensorflow/core/kernels/data/stats_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7c6190bf74d3f7fc240cb59d28eb65f3f982bff
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_utils.h
@@ -0,0 +1,68 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_STATS_UTILS_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace data {
+namespace stats_utils {
+extern const char kDelimiter[];
+extern const char kExecutionTime[];
+extern const char kThreadUtilization[];
+extern const char kBufferSize[];
+extern const char kBufferCapacity[];
+extern const char kBufferUtilization[];
+extern const char kFilteredElements[];
+extern const char kDroppedElements[];
+extern const char kFeaturesCount[];
+extern const char kFeatureValuesCount[];
+extern const char kExamplesCount[];
+
+// Name for tf.data function execution time (in ns) histogram metrics.
+string ExecutionTimeHistogramName(const string& prefix);
+
+// Name for thread utilization (ratio of threads being used and maximum number
+// of threads allocated) scalar metrics.
+string ThreadUtilizationScalarName(const string& prefix);
+
+// Name for buffer size scalar metrics.
+string BufferSizeScalarName(const string& prefix);
+
+// Name for buffer capacity (maximum allocated buffer size) scalar metrics.
+string BufferCapacityScalarName(const string& prefix);
+
+// Name for buffer utilization (ratio of buffer size and maximum allocated
+// buffer size.) histogram metrics.
+string BufferUtilizationHistogramName(const string& prefix);
+
+// Name for filtered elements scalar metrics.
+string FilterdElementsScalarName(const string& prefix);
+
+// Name for dropped elements scalar mereics.
+string DroppedElementsScalarName(const string& prefix);
+
+// Name for features count histogram metrics.
+string FeatureHistogramName(const string& prefix);
+
+// Name for feature-values count histogram metrics.
+string FeatureValueHistogramName(const string& prefix);
+
+}  // namespace stats_utils
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_UTILS_H_
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 0dd0c0c80de194c60aa7d268cb40317d722956c4..2983ab51762422df3444bf242a0dc68681537daf 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/kernels/data/take_dataset_op.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -20,9 +21,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
 class TakeDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit TakeDatasetOp(OpKernelConstruction* ctx)
@@ -34,168 +32,130 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
     // Create a new TakeDatasetOp::Dataset, and return it as the output.
     int64 count;
     OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
-    *output = new Dataset(ctx, count, input);
+    *output = new TakeDataset(ctx, count, input);
   }
+};
 
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
-      input_->Ref();
-    }
+REGISTER_KERNEL_BUILDER(Name("TakeDataset").Device(DEVICE_CPU), TakeDatasetOp);
+}  // namespace
 
-    ~Dataset() override { input_->Unref(); }
+class TakeDataset::EmptyIterator : public DatasetIterator<TakeDataset> {
+ public:
+  explicit EmptyIterator(const Params& params)
+      : DatasetIterator<TakeDataset>(params) {}
+  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) override {
+    *end_of_sequence = true;
+    return Status::OK();
+  }
 
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      if (count_ == 0) {
-        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
-            this, strings::StrCat(prefix, "::EmptyTake")});
-      } else {
-        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
-            this, strings::StrCat(prefix, "::FiniteTake")});
-      }
-    }
+ protected:
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeKnownRatioNode(std::move(args),
+                                     /*ratio=*/1);
+  }
 
-    const DataTypeVector& output_dtypes() const override {
-      return input_->output_dtypes();
-    }
+  Status SaveInternal(IteratorStateWriter* writer) override {
+    return Status::OK();
+  }
 
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return input_->output_shapes();
-    }
+  Status RestoreInternal(IteratorContext* ctx,
+                         IteratorStateReader* reader) override {
+    return Status::OK();
+  }
+};
 
-    string DebugString() const override { return "TakeDatasetOp::Dataset"; }
+class TakeDataset::FiniteIterator : public DatasetIterator<TakeDataset> {
+ public:
+  explicit FiniteIterator(const Params& params)
+      : DatasetIterator<TakeDataset>(params), i_(0) {}
 
-    int64 Cardinality() const override {
-      int64 n = input_->Cardinality();
-      if (n == kUnknownCardinality) {
-        return kUnknownCardinality;
-      }
-      if (n == kInfiniteCardinality) {
-        return count_;
-      }
-      return std::min(n, count_);
-    }
+  Status Initialize(IteratorContext* ctx) override {
+    return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+  }
 
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
-      Node* count = nullptr;
-      TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
-      TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {input_graph_node, count}, output));
+  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) override {
+    mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+    if (!input_impl_) {
+      *end_of_sequence = true;
       return Status::OK();
     }
-
-   private:
-    class EmptyIterator : public DatasetIterator<Dataset> {
-     public:
-      explicit EmptyIterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        return Status::OK();
-      }
-    };
-
-    class FiniteIterator : public DatasetIterator<Dataset> {
-     public:
-      explicit FiniteIterator(const Params& params)
-          : DatasetIterator<Dataset>(params), i_(0) {}
-
-      Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
-        if (!input_impl_) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-        while (dataset()->count_ < 0 || i_ < dataset()->count_) {
-          TF_RETURN_IF_ERROR(
-              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-          if (!*end_of_sequence) {
-            ++i_;
-            return Status::OK();
-          }
-          break;
-        }
-        *end_of_sequence = true;
-        input_impl_.reset();
+    while (dataset()->count_ < 0 || i_ < dataset()->count_) {
+      TF_RETURN_IF_ERROR(
+          input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+      if (!*end_of_sequence) {
+        ++i_;
         return Status::OK();
       }
+      break;
+    }
+    *end_of_sequence = true;
+    input_impl_.reset();
+    return Status::OK();
+  }
 
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("input_impl_empty"), ""));
-        }
-        return Status::OK();
-      }
+ protected:
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeKnownRatioNode(std::move(args),
+                                     /*ratio=*/1);
+  }
 
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
-        if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        } else {
-          input_impl_.reset();
-        }
-        return Status::OK();
-      }
+  Status SaveInternal(IteratorStateWriter* writer) override {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+    if (input_impl_) {
+      TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+    } else {
+      TF_RETURN_IF_ERROR(
+          writer->WriteScalar(full_name("input_impl_empty"), ""));
+    }
+    return Status::OK();
+  }
 
-     private:
-      mutex mu_;
-      int64 i_ GUARDED_BY(mu_);
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-    };
+  Status RestoreInternal(IteratorContext* ctx,
+                         IteratorStateReader* reader) override {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+    if (!reader->Contains(full_name("input_impl_empty"))) {
+      TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+    } else {
+      input_impl_.reset();
+    }
+    return Status::OK();
+  }
 
-    const int64 count_;
-    const DatasetBase* const input_;
-  };
+ private:
+  mutex mu_;
+  int64 i_ GUARDED_BY(mu_);
+  std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
 };
 
-REGISTER_KERNEL_BUILDER(Name("TakeDataset").Device(DEVICE_CPU), TakeDatasetOp);
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+std::unique_ptr<IteratorBase> TakeDataset::MakeIteratorInternal(
+    const string& prefix) const {
+  if (count_ == 0) {
+    return absl::make_unique<EmptyIterator>(
+        EmptyIterator::Params{this, strings::StrCat(prefix, "::EmptyTake")});
+  } else {
+    return absl::make_unique<FiniteIterator>(
+        FiniteIterator::Params{this, strings::StrCat(prefix, "::FiniteTake")});
+  }
+}
+
+Status TakeDataset::AsGraphDefInternal(SerializationContext* ctx,
+                                       DatasetGraphDefBuilder* b,
+                                       Node** output) const {
+  Node* input_graph_node = nullptr;
+  TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+  Node* count = nullptr;
+  TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
+  TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node, count}, output));
+  return Status::OK();
+}
 
-}  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/take_dataset_op.h b/tensorflow/core/kernels/data/take_dataset_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e35a26bfff41dbbbbdf96521d07f32d79dfd1beb
--- /dev/null
+++ b/tensorflow/core/kernels/data/take_dataset_op.h
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace data {
+
+class TakeDataset : public DatasetBase {
+ public:
+  TakeDataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
+      : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
+    input_->Ref();
+  }
+
+  TakeDataset(DatasetContext::Params params, int64 count,
+              const DatasetBase* input)
+      : DatasetBase(DatasetContext(std::move(params))),
+        count_(count),
+        input_(input) {
+    input_->Ref();
+  }
+
+  ~TakeDataset() override { input_->Unref(); }
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override;
+
+  const DataTypeVector& output_dtypes() const override {
+    return input_->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return input_->output_shapes();
+  }
+
+  string DebugString() const override { return "TakeDatasetOp::Dataset"; }
+
+  int64 Cardinality() const override {
+    int64 n = input_->Cardinality();
+    if (n == kUnknownCardinality) {
+      return kUnknownCardinality;
+    }
+    if (n == kInfiniteCardinality) {
+      return count_;
+    }
+    return std::min(n, count_);
+  }
+
+ protected:
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override;
+
+ private:
+  class EmptyIterator;
+  class FiniteIterator;
+  const int64 count_;
+  const DatasetBase* const input_;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_TAKE_DATASET_OP_H_
diff --git a/tensorflow/core/kernels/data/take_dataset_op_test.cc b/tensorflow/core/kernels/data/take_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8c68472ec0e5c1b6801ebeeb3721fa4fd21600b
--- /dev/null
+++ b/tensorflow/core/kernels/data/take_dataset_op_test.cc
@@ -0,0 +1,560 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "take_dataset";
+constexpr char kOpName[] = "TakeDataset";
+
+class TakeDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates `TensorSliceDataset` variant tensor from the input vector of
+  // tensors.
+  Status CreateTensorSliceDatasetTensor(
+      std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) {
+    DatasetBase *tensor_slice_dataset;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDataset(
+        "tensor_slice_node", tensor_vector, &tensor_slice_dataset));
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(tensor_slice_dataset, dataset_tensor));
+    return Status::OK();
+  }
+
+  // Create a new `TakeDataset` op kernel.
+  Status CreateTakeDatasetKernel(
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "count"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Create a new `TakeDataset` op kernel context.
+  Status CreateTakeDatasetContext(
+      OpKernel *op_kernel, gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestCase {
+  std::vector<Tensor> input_tensors;
+  int64 count;
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+// Test case 1: take fewer than input size.
+TestCase TakeLessTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9})},
+          /*count*/ 4,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/
+          {PartialTensorShape({1})},
+          /*expected_cardinality*/ 4,
+          /*breakpoints*/ {0, 2, 5}};
+}
+
+// Test case 2: take more than input size.
+TestCase TakeMoreTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9})},
+          /*count*/ 25,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {4}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {5}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {6}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {7}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {8}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {9})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/
+          {PartialTensorShape({1})},
+          /*expected_cardinality*/ 10,
+          /*breakpoints*/ {0, 2, 5, 11}};
+}
+
+// Test case 3: take all of input.
+TestCase TakeAllTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9})},
+          /*count*/ -1,
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {3}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {4}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {5}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {6}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {7}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {8}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {9})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/
+          {PartialTensorShape({1})},
+          /*expected_cardinality*/ -1,
+          /*breakpoints*/ {0, 2, 5, 11}};
+}
+
+// Test case 4: take nothing.
+TestCase TakeNothingTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(
+              TensorShape{10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9})},
+          /*count*/ 0,
+          /*expected_outputs*/
+          {},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/
+          {PartialTensorShape({1})},
+          /*expected_cardinality*/ 0,
+          /*breakpoints*/ {0, 2, 5, 11}};
+}
+
+class ParametrizedTakeDatasetOpTest
+    : public TakeDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParametrizedTakeDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(take_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      take_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      for (const auto &tensor : out_tensors) {
+        EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+        TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+        expected_outputs_it++;
+      }
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_F(TakeDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = TakeLessTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  EXPECT_EQ(take_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  TF_EXPECT_OK(VerifyTypesMatch(take_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  TF_EXPECT_OK(VerifyShapesCompatible(take_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  EXPECT_EQ(take_dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_F(TakeDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = TakeLessTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(take_dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(take_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      take_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(take_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      take_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(take_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      take_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  if (test_case.count == 0) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::EmptyTake");
+  } else {
+    EXPECT_EQ(iterator->prefix(), "Iterator::FiniteTake");
+  }
+}
+
+TEST_P(ParametrizedTakeDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_take_dataset;
+  inputs_for_take_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_take_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> take_dataset_kernel;
+  TF_ASSERT_OK(CreateTakeDatasetKernel(test_case.expected_output_dtypes,
+                                       test_case.expected_output_shapes,
+                                       &take_dataset_kernel));
+  std::unique_ptr<OpKernelContext> take_dataset_context;
+  TF_ASSERT_OK(CreateTakeDatasetContext(take_dataset_kernel.get(),
+                                        &inputs_for_take_dataset,
+                                        &take_dataset_context));
+  DatasetBase *take_dataset;
+  TF_ASSERT_OK(CreateDataset(take_dataset_kernel.get(),
+                             take_dataset_context.get(), &take_dataset));
+  core::ScopedUnref scoped_unref(take_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(take_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      take_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  const std::vector<int> &breakpoints = test_case.breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+
+    while (cur_iteration <= breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        for (auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+      cur_iteration++;
+    }
+
+    if (breakpoint >= test_case.expected_outputs.size()) {
+      EXPECT_TRUE(end_of_sequence);
+      EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(TakeDatasetOpTest, ParametrizedTakeDatasetOpTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {TakeLessTestCase(), TakeMoreTestCase(),
+                              TakeAllTestCase(), TakeNothingTestCase()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index a44dbd0d4d436e3eb85adbe9db6dc39bde0419e8..04698751f80aa7452a64a4e3d15fe069dcc2f901 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 
 namespace tensorflow {
 namespace data {
@@ -26,15 +27,20 @@ namespace {
 
 class TensorDatasetOp : public DatasetOpKernel {
  public:
-  explicit TensorDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+  explicit TensorDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     OpInputList inputs;
     OP_REQUIRES_OK(ctx, ctx->input_list("components", &inputs));
-    // TODO(mrry): Validate that the shapes of the "components" tensors match
-    // the "shapes" attr.;
     std::vector<Tensor> components(inputs.begin(), inputs.end());
     *output = new Dataset(ctx, std::move(components));
+    OP_REQUIRES_OK(ctx,
+                   VerifyTypesMatch((*output)->output_dtypes(), output_types_));
+    OP_REQUIRES_OK(ctx, VerifyShapesCompatible((*output)->output_shapes(),
+                                               output_shapes_));
   }
 
  private:
@@ -137,6 +143,9 @@ class TensorDatasetOp : public DatasetOpKernel {
     DataTypeVector dtypes_;
     std::vector<PartialTensorShape> shapes_;
   };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op_test.cc b/tensorflow/core/kernels/data/tensor_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9586ded2cb0d5f796ce69c011cf3c077808dcdb3
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_dataset_op_test.cc
@@ -0,0 +1,535 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "tensor_dataset";
+constexpr char kOpName[] = "TensorDataset";
+
+class TensorDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new TensorDataset op kernel.
+  Status CreateTensorDatasetKernel(
+      DataTypeVector dtypes, std::vector<PartialTensorShape> shapes,
+      std::unique_ptr<OpKernel> *tensor_dataset_kernel) {
+    std::vector<string> components;
+    components.reserve(dtypes.size());
+    for (int i = 0; i < dtypes.size(); i++) {
+      components.emplace_back(strings::StrCat("component_", i));
+    }
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, components,
+        {{"Toutput_types", dtypes}, {"output_shapes", shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, tensor_dataset_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new TensorDataset op kernel context.
+  Status CreateTensorDatasetContext(OpKernel *const tensor_dataset_kernel,
+                                    gtl::InlinedVector<TensorValue, 4> *inputs,
+                                    std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*tensor_dataset_kernel, *inputs));
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(tensor_dataset_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestCase {
+  std::vector<Tensor> components;
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+// Test case 1: test a dataset that represents a single tuple of plain tensors.
+TestCase PlainTensorsTestCase() {
+  return {
+      /*components*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({1, 3}), {1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({}), {37.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1, 2}),
+                                                {"a", "b"})},
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({1, 3}), {1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({}), {37.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1, 2}),
+                                                {"a", "b"})},
+      /*expected_output_dtypes*/
+      {DT_INT64, DT_INT64, DT_DOUBLE, DT_STRING},
+      /*expected_output_shapes*/
+      {PartialTensorShape({}), PartialTensorShape({1, 3}),
+       PartialTensorShape({}), PartialTensorShape({1, 2})},
+      /*expected_cardinality*/ 1,
+      /*breakpoints*/ {0, 1, 2}};
+}
+
+// Test case 2: test a dataset that represents a tuple of nested tensors.
+TestCase NestedTensorsTestCase() {
+  return {
+      /*components*/
+      {DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({}), {DatasetOpsTestBase::CreateTensor<double>(
+                                TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({}), {DatasetOpsTestBase::CreateTensor<string>(
+                                TensorShape({1, 2}), {"a", "b"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({1, 3}), {1, 2, 3})},
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({}), {DatasetOpsTestBase::CreateTensor<double>(
+                                TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({}), {DatasetOpsTestBase::CreateTensor<string>(
+                                TensorShape({1, 2}), {"a", "b"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({1, 3}), {1, 2, 3})},
+      /*expected_output_dtypes*/
+      {DT_VARIANT, DT_VARIANT, DT_INT64},
+      /*expected_output_shapes*/
+      {PartialTensorShape({}), PartialTensorShape({}),
+       PartialTensorShape({1, 3})},
+      /*expected_cardinality*/ 1,
+      /*breakpoints*/ {0, 1, 2}};
+}
+
+class ParametrizedTensorDatasetOpTest
+    : public TensorDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParametrizedTensorDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = GetParam();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(
+      CreateIteratorContext(tensor_dataset_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                            &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+  EXPECT_EQ(out_tensors.size(), test_case.expected_outputs.size());
+  for (int i = 0; i < out_tensors.size(); ++i) {
+    if (out_tensors[i].dtype() == DT_VARIANT) {
+      // Currently `ExpectEqual()` does not support the variant tensor
+      // yet, so we manually cast the variant to numeric/string tensor.
+      const Tensor *output = out_tensors[i].scalar<Variant>()().get<Tensor>();
+      const Tensor *expected_output =
+          test_case.expected_outputs[i].scalar<Variant>()().get<Tensor>();
+      TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+    } else {
+      TF_EXPECT_OK(ExpectEqual(out_tensors[i], test_case.expected_outputs[i]));
+    }
+  }
+}
+
+TEST_F(TensorDatasetOpTest, DatasetTypeString) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  EXPECT_EQ(tensor_dataset->type_string(), kOpName);
+}
+
+TEST_F(TensorDatasetOpTest, DatasetNodeName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  EXPECT_EQ(tensor_dataset->node_name(), kNodeName);
+}
+
+TEST_F(TensorDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  EXPECT_EQ(tensor_dataset->output_dtypes(), test_case.expected_output_dtypes);
+}
+
+TEST_F(TensorDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  EXPECT_EQ(tensor_dataset->output_shapes().size(),
+            test_case.expected_output_shapes.size());
+  for (int i = 0; i < test_case.expected_output_shapes.size(); i++) {
+    EXPECT_TRUE(test_case.expected_output_shapes[i].IsIdenticalTo(
+        tensor_dataset->output_shapes()[i]));
+  }
+}
+
+TEST_F(TensorDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  EXPECT_EQ(tensor_dataset->Cardinality(), test_case.expected_cardinality);
+}
+
+TEST_P(ParametrizedTensorDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = GetParam();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(tensor_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParametrizedTensorDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = GetParam();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(
+      CreateIteratorContext(tensor_dataset_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                            &iterator));
+  EXPECT_EQ(iterator->output_dtypes(), test_case.expected_output_dtypes);
+}
+
+TEST_P(ParametrizedTensorDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = GetParam();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(
+      CreateIteratorContext(tensor_dataset_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                            &iterator));
+
+  EXPECT_EQ(iterator->output_shapes().size(),
+            test_case.expected_output_shapes.size());
+  for (int i = 0; i < test_case.expected_output_shapes.size(); ++i) {
+    EXPECT_TRUE(test_case.expected_output_shapes[i].IsIdenticalTo(
+        iterator->output_shapes()[i]));
+  }
+}
+
+TEST_F(TensorDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = PlainTensorsTestCase();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(
+      CreateIteratorContext(tensor_dataset_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                            &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::FromTensor");
+}
+
+TEST_P(ParametrizedTensorDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = GetParam();
+  std::vector<Tensor> components = test_case.components;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+  }
+  std::unique_ptr<OpKernel> tensor_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &tensor_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_dataset_context;
+  TF_ASSERT_OK(CreateTensorDatasetContext(tensor_dataset_kernel.get(), &inputs,
+                                          &tensor_dataset_context));
+  DatasetBase *tensor_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_dataset_kernel.get(),
+                             tensor_dataset_context.get(), &tensor_dataset));
+  core::ScopedUnref scoped_unref(tensor_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(tensor_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      tensor_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  const std::vector<int> &breakpoints = test_case.breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+
+    while (cur_iteration <= breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        EXPECT_EQ(out_tensors.size(), test_case.expected_outputs.size());
+        for (int i = 0; i < out_tensors.size(); ++i) {
+          if (out_tensors[i].dtype() == DT_VARIANT) {
+            // Currently `ExpectEqual()` does not support the variant tensor
+            // yet, so we manually cast the variant to numeric/string tensor.
+            const Tensor *output =
+                out_tensors[i].scalar<Variant>()().get<Tensor>();
+            const Tensor *expected_output =
+                test_case.expected_outputs[i].scalar<Variant>()().get<Tensor>();
+            TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+          } else {
+            TF_EXPECT_OK(
+                ExpectEqual(out_tensors[i], test_case.expected_outputs[i]));
+          }
+        }
+      }
+      cur_iteration++;
+    }
+
+    if (breakpoint >= test_case.expected_cardinality) {
+      EXPECT_TRUE(end_of_sequence);
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    TensorDatasetOpTest, ParametrizedTensorDatasetOpTest,
+    ::testing::ValuesIn(std::vector<TestCase>({PlainTensorsTestCase(),
+                                               NestedTensorsTestCase()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 54dcd7eb7d1bd97fa1c58e0d3235670482bafd93..97a1ec402f2abff8627c65e14d2af39e0693afaa 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -28,7 +29,10 @@ namespace {
 class TensorSliceDatasetOp : public DatasetOpKernel {
  public:
   explicit TensorSliceDatasetOp(OpKernelConstruction* ctx)
-      : DatasetOpKernel(ctx) {}
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     OpInputList inputs;
@@ -50,6 +54,10 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
               "All components must have the same size in the 0th dimension"));
     }
     *output = new Dataset(ctx, std::move(components));
+    OP_REQUIRES_OK(ctx,
+                   VerifyTypesMatch((*output)->output_dtypes(), output_types_));
+    OP_REQUIRES_OK(ctx, VerifyShapesCompatible((*output)->output_shapes(),
+                                               output_shapes_));
   }
 
  private:
@@ -170,6 +178,9 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
     DataTypeVector dtypes_;
     std::vector<PartialTensorShape> shapes_;
   };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorSliceDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..883440924f06f5fdb74e1a04297b627c8bf67564
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
@@ -0,0 +1,610 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "tensor_slice_dataset";
+constexpr char kOpName[] = "TensorSliceDataset";
+
+class TensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new TensorSliceDataset op kernel.
+  Status CreateTensorSliceDatasetKernel(
+      DataTypeVector dtypes, std::vector<PartialTensorShape> shapes,
+      std::unique_ptr<OpKernel> *tensor_dataset_kernel) {
+    std::vector<string> components;
+    components.reserve(dtypes.size());
+    for (int i = 0; i < dtypes.size(); i++) {
+      components.emplace_back(strings::StrCat("component_", i));
+    }
+
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, components,
+        {{"Toutput_types", dtypes}, {"output_shapes", shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, tensor_dataset_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new TensorSliceDataset op kernel context.
+  Status CreateTensorSliceDatasetContext(
+      OpKernel *const tensor_dataset_kernel,
+      gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*tensor_dataset_kernel, *inputs));
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(tensor_dataset_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestCase {
+  std::vector<Tensor> components;
+  std::vector<Tensor> expected_outputs;
+  std::vector<int> breakpoints;
+};
+
+std::vector<TestCase> TestCases() {
+  return {
+      // A single tuple of tensors.
+      {{{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+         DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2, 2}),
+                                                 {1, 2, 3, 4}),
+         DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 1}),
+                                                  {37.0, 38.0}),
+         DatasetOpsTestBase::CreateTensor<string>(TensorShape({2, 1}),
+                                                  {"a", "b"})}},  // components
+       {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+         DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+         DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {37.0}),
+         DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}), {"a"}),
+         DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+         DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {3, 4}),
+         DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {38.0}),
+         DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}),
+                                                  {"b"})}},  // expected_outputs
+       {{0, 1, 3}}},                                         //  breakpoints
+      // Nested tensors
+      {{{DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({2, 1}),
+             {DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                       {1.0, 2.0, 3.0, 4.0}),
+              DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                       {5.0, 6.0, 7.0, 8.0})}),
+         DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({2, 1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                       TensorShape({1, 2}), {"a", "b"}),
+                                   DatasetOpsTestBase::CreateTensor<string>(
+                                       TensorShape({1, 2}), {"c", "d"})}),
+         DatasetOpsTestBase::CreateTensor<int64>(
+             TensorShape({2, 3}), {1, 2, 3, 4, 5, 6})}},  // components
+       {{DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                   TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}),
+         DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                   TensorShape({1, 2}), {"a", "b"})}),
+         DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {1, 2, 3}),
+         DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                   TensorShape({2, 2}), {5.0, 6.0, 7.0, 8.0})}),
+         DatasetOpsTestBase::CreateTensor<Variant>(
+             TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                   TensorShape({1, 2}), {"c", "d"})}),
+         DatasetOpsTestBase::CreateTensor<int64>(
+             TensorShape({3}), {4, 5, 6})}},  // expected_outputs
+       {{0, 1, 2}}}                           // breakpoints
+  };
+}
+
+TEST_F(TensorSliceDatasetOpTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.push_back(&component);
+      dtypes.push_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    std::unique_ptr<IteratorContext> iterator_context;
+    TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                       &iterator_context));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                    "Iterator", &iterator));
+    bool end_of_sequence = false;
+    std::vector<Tensor> out_tensors;
+    int cur_slice = 0;
+
+    while (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                     &end_of_sequence));
+      for (int i = 0; i < out_tensors.size(); ++i) {
+        EXPECT_LT(i + num_tensors_per_slice * cur_slice,
+                  expected_outputs.size());
+        if (out_tensors[i].dtype() == DT_VARIANT) {
+          // Currently `ExpectEqual()` does not support the variant tensor
+          // yet, so we manually cast the variant to numeric/string tensor.
+          const Tensor *output =
+              out_tensors[i].scalar<Variant>()().get<Tensor>();
+          const Tensor *expected_output =
+              expected_outputs[i + num_tensors_per_slice * cur_slice]
+                  .scalar<Variant>()()
+                  .get<Tensor>();
+          TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+        } else {
+          TF_EXPECT_OK(ExpectEqual(
+              out_tensors[i],
+              expected_outputs[i + num_tensors_per_slice * cur_slice]));
+        }
+      }
+      out_tensors.clear();
+      cur_slice++;
+    }
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  EXPECT_EQ(tensor_slice_dataset->type_string(), kOpName);
+}
+
+TEST_F(TensorSliceDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    const DataTypeVector produced_output_dtypes =
+        tensor_slice_dataset->output_dtypes();
+    EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+    }
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    const std::vector<PartialTensorShape> produced_output_shapes =
+        tensor_slice_dataset->output_shapes();
+    std::vector<PartialTensorShape> expected_output_shapes;
+    EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      EXPECT_TRUE(
+          produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+    }
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    EXPECT_EQ(tensor_slice_dataset->Cardinality(),
+              inputs[0].tensor->dim_size(0));
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(
+      tensor_slice_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(TensorSliceDatasetOpTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    std::unique_ptr<IteratorContext> iterator_context;
+    TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                       &iterator_context));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                    "Iterator", &iterator));
+    const DataTypeVector produced_output_dtypes = iterator->output_dtypes();
+
+    EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+    }
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    std::unique_ptr<IteratorContext> iterator_context;
+    TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                       &iterator_context));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                    "Iterator", &iterator));
+    const std::vector<PartialTensorShape> produced_output_shapes =
+        iterator->output_shapes();
+    EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      EXPECT_TRUE(
+          produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+    }
+  }
+}
+
+TEST_F(TensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), "Iterator::TensorSlice");
+}
+
+TEST_F(TensorSliceDatasetOpTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  for (auto &test_case : TestCases()) {
+    std::vector<Tensor> components = test_case.components;
+    std::vector<Tensor> expected_outputs = test_case.expected_outputs;
+    std::vector<int> breakpoints = test_case.breakpoints;
+    size_t num_tensors_per_slice = components.size();
+
+    TF_ASSERT_OK(InitThreadPool(thread_num));
+    TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+    DataTypeVector dtypes;
+    std::vector<PartialTensorShape> shapes;
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    for (auto &component : components) {
+      inputs.emplace_back(&component);
+      dtypes.emplace_back(component.dtype());
+    }
+    for (int i = 0; i < num_tensors_per_slice; ++i) {
+      shapes.emplace_back(expected_outputs[i].shape());
+    }
+
+    std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+    TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                                &tensor_slice_dataset_kernel));
+    std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+    TF_ASSERT_OK(CreateTensorSliceDatasetContext(
+        tensor_slice_dataset_kernel.get(), &inputs,
+        &tensor_slice_dataset_context));
+    DatasetBase *tensor_slice_dataset;
+    TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                               tensor_slice_dataset_context.get(),
+                               &tensor_slice_dataset));
+    core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+    std::unique_ptr<IteratorContext> iterator_context;
+    TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                       &iterator_context));
+    std::unique_ptr<IteratorBase> iterator;
+    TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                    "Iterator", &iterator));
+    std::unique_ptr<SerializationContext> serialization_context;
+    TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+
+    int cur_iteration = 0;
+    bool end_of_sequence = false;
+    int64 num_slices = inputs[0].tensor->dim_size(0);
+    std::vector<Tensor> out_tensors;
+
+    for (int breakpoint : breakpoints) {
+      while (cur_iteration < breakpoint) {
+        TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                       &end_of_sequence));
+        cur_iteration++;
+      }
+
+      if (breakpoint == 0) {
+        EXPECT_FALSE(end_of_sequence);
+      } else if (breakpoint <= num_slices) {
+        for (int i = 0; i < out_tensors.size(); ++i) {
+          if (out_tensors[i].dtype() == DT_VARIANT) {
+            const Tensor *output =
+                out_tensors[i].scalar<Variant>()().get<Tensor>();
+            const Tensor *expected_output =
+                expected_outputs[i +
+                                 num_tensors_per_slice * (cur_iteration - 1)]
+                    .scalar<Variant>()()
+                    .get<Tensor>();
+            TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+          } else {
+            TF_EXPECT_OK(ExpectEqual(
+                out_tensors[i], expected_outputs[i + num_tensors_per_slice *
+                                                         (cur_iteration - 1)]));
+          }
+        }
+      } else {
+        EXPECT_TRUE(end_of_sequence);
+      }
+
+      VariantTensorData data;
+      VariantTensorDataWriter writer(&data);
+      TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+      TF_ASSERT_OK(writer.Flush());
+      VariantTensorDataReader reader(&data);
+      TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.cc b/tensorflow/core/kernels/data/unbounded_thread_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac12197f1b8863ab023e975acc803ed8af04d3db
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbounded_thread_pool.cc
@@ -0,0 +1,156 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/unbounded_thread_pool.h"
+
+#include "absl/memory/memory.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace data {
+
+// A lightweight wrapper for creating logical threads in a `UnboundedThreadPool`
+// that can be shared (e.g.) in an `IteratorContext`.
+class UnboundedThreadPool::LogicalThreadFactory : public ThreadFactory {
+ public:
+  explicit LogicalThreadFactory(UnboundedThreadPool* pool) : pool_(pool) {}
+
+  std::unique_ptr<Thread> StartThread(const string& name,
+                                      std::function<void()> fn) override {
+    return pool_->RunOnPooledThread(std::move(fn));
+  }
+
+ private:
+  UnboundedThreadPool* const pool_;  // Not owned.
+};
+
+// A logical implementation of the `tensorflow::Thread` interface that uses
+// physical threads in an `UnboundedThreadPool` to perform the work.
+//
+// NOTE: This object represents a logical thread of control that may be mapped
+// onto the same physical thread as other work items that are submitted to the
+// same `UnboundedThreadPool`.
+class UnboundedThreadPool::LogicalThreadWrapper : public Thread {
+ public:
+  explicit LogicalThreadWrapper(std::shared_ptr<Notification> join_notification)
+      : join_notification_(std::move(join_notification)) {}
+
+  ~LogicalThreadWrapper() override {
+    // NOTE: The `Thread` destructor is expected to "join" the created thread,
+    // but the physical thread may continue to execute after the work for this
+    // thread is complete. We simulate this by waiting on a notification that
+    // the `CachedThreadFunc` will notify when the thread's work function is
+    // complete.
+    join_notification_->WaitForNotification();
+  }
+
+ private:
+  std::shared_ptr<Notification> join_notification_;
+};
+
+UnboundedThreadPool::~UnboundedThreadPool() {
+  {
+    mutex_lock l(work_queue_mu_);
+    // Wake up all `CachedThreadFunc` threads and cause them to terminate before
+    // joining them when `threads_` is cleared.
+    cancelled_ = true;
+    work_queue_cv_.notify_all();
+    if (!work_queue_.empty()) {
+      LOG(ERROR) << "UnboundedThreadPool named \"" << thread_name_ << "\" was "
+                 << "deleted with pending work in its queue. This may indicate "
+                 << "a potential use-after-free bug.";
+    }
+  }
+
+  {
+    mutex_lock l(thread_pool_mu_);
+    // Clear the list of pooled threads, which will eventually terminate due to
+    // the previous notification.
+    //
+    // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because
+    // no subsequent calls to `this->StartThread()` should be issued after the
+    // destructor starts.
+    thread_pool_.clear();
+  }
+}
+
+std::shared_ptr<ThreadFactory> UnboundedThreadPool::get_thread_factory() {
+  return std::make_shared<LogicalThreadFactory>(this);
+}
+
+size_t UnboundedThreadPool::size() {
+  tf_shared_lock l(thread_pool_mu_);
+  return thread_pool_.size();
+}
+
+std::unique_ptr<Thread> UnboundedThreadPool::RunOnPooledThread(
+    std::function<void()> fn) {
+  auto join_notification = std::make_shared<Notification>();
+  bool all_threads_busy;
+  {
+    // Enqueue a work item for the new thread's function, and wake up a
+    // cached thread to process it.
+    mutex_lock l(work_queue_mu_);
+    work_queue_.push_back({std::move(fn), join_notification});
+    work_queue_cv_.notify_one();
+    // NOTE: The queue may be non-empty, so we must account for queued work when
+    // considering how many threads are free.
+    all_threads_busy = work_queue_.size() > num_idle_threads_;
+  }
+
+  if (all_threads_busy) {
+    // Spawn a new physical thread to process the given function.
+    // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_`
+    // at the beginning of its work loop.
+    Thread* new_thread = env_->StartThread(
+        {}, thread_name_,
+        std::bind(&UnboundedThreadPool::PooledThreadFunc, this));
+
+    mutex_lock l(thread_pool_mu_);
+    thread_pool_.emplace_back(new_thread);
+  }
+
+  return absl::make_unique<LogicalThreadWrapper>(std::move(join_notification));
+}
+
+void UnboundedThreadPool::PooledThreadFunc() {
+  while (true) {
+    WorkItem work_item;
+    {
+      mutex_lock l(work_queue_mu_);
+      ++num_idle_threads_;
+      while (!cancelled_ && work_queue_.empty()) {
+        // Wait for a new work function to be submitted, or the cache to be
+        // destroyed.
+        work_queue_cv_.wait(l);
+      }
+      if (cancelled_) {
+        return;
+      }
+      work_item = std::move(work_queue_.front());
+      work_queue_.pop_front();
+      --num_idle_threads_;
+    }
+
+    work_item.work_function();
+
+    // Notify any thread that has "joined" the cached thread for this work item.
+    work_item.done_notification->Notify();
+  }
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.h b/tensorflow/core/kernels/data/unbounded_thread_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..c84d495b296f9b5df5cb58467b2c3b4137d4cf68
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbounded_thread_pool.h
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_UNBOUNDED_THREAD_POOL_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_UNBOUNDED_THREAD_POOL_H_
+
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/thread_factory.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace data {
+
+// An `UnboundedThreadPool` provides a mechanism for temporally multiplexing a
+// potentially large number of "logical" threads onto a smaller number of
+// "physical" threads. The multiplexing is achieved by maintaining an internal
+// pool of long-running "physical" threads that are used to execute the
+// "logical" threads.  Like a regular thread, a "logical" thread may block on
+// other threads, and the size of the pool will increase to ensure that progress
+// is made. This mechanism is recommended in situations where short-lived
+// threads are created repeatedly, to avoid the overhead and memory
+// fragmentation that can result from excessive thread creation.
+class UnboundedThreadPool {
+ public:
+  UnboundedThreadPool(Env* env, const string& thread_name)
+      : env_(env), thread_name_(thread_name) {}
+  ~UnboundedThreadPool();
+
+  // Returns an implementation of `ThreadFactory` that can be used to create
+  // logical threads in this pool.
+  std::shared_ptr<ThreadFactory> get_thread_factory();
+
+  // Returns the current number of threads in this pool.
+  size_t size();
+
+ private:
+  class LogicalThreadFactory;
+  class LogicalThreadWrapper;
+  struct WorkItem {
+    std::function<void()> work_function;
+    std::shared_ptr<Notification> done_notification;
+  };
+
+  std::unique_ptr<Thread> RunOnPooledThread(std::function<void()> fn);
+  void PooledThreadFunc();
+
+  Env* const env_;  // Not owned.
+  const string thread_name_;
+  mutex work_queue_mu_;
+  condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_);
+  size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0;
+  bool cancelled_ GUARDED_BY(work_queue_mu_) = false;
+  std::deque<WorkItem> work_queue_ GUARDED_BY(work_queue_mu_);
+  mutex thread_pool_mu_;
+  std::vector<std::unique_ptr<Thread>> thread_pool_ GUARDED_BY(thread_pool_mu_);
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_UNBOUNDED_THREAD_POOL_H_
diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f996b4f931b82c2b09e3a49c9eb22fe581f0b5c8
--- /dev/null
+++ b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/unbounded_thread_pool.h"
+
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+TEST(UnboundedThreadPool, SingleThread) {
+  UnboundedThreadPool pool(Env::Default(), "test");
+  auto thread_factory = pool.get_thread_factory();
+
+  // Create a thread that updates a variable, and ensure that it runs to
+  // completion.
+  std::atomic<int> i(0);
+  auto thread = thread_factory->StartThread("", [&i]() { ++i; });
+  thread.reset();
+
+  EXPECT_GE(pool.size(), 1);
+  EXPECT_EQ(1, i);
+}
+
+TEST(UnboundedThreadPool, MultipleThreads) {
+  UnboundedThreadPool pool(Env::Default(), "test");
+  auto thread_factory = pool.get_thread_factory();
+
+  // Create ten threads that update a variable, and ensure that they all run
+  // to completion.
+  std::vector<std::unique_ptr<Thread>> threads;
+  const int kNumThreadsToCreate = 10;
+  std::atomic<int> i(0);
+  for (int j = 0; j < kNumThreadsToCreate; ++j) {
+    threads.push_back(thread_factory->StartThread("", [&i]() { ++i; }));
+  }
+  threads.clear();
+
+  EXPECT_GE(pool.size(), 1);
+  EXPECT_EQ(i, kNumThreadsToCreate);
+}
+
+TEST(UnboundedThreadPool, MultipleThreadsSleepingRandomly) {
+  UnboundedThreadPool pool(Env::Default(), "test");
+  auto thread_factory = pool.get_thread_factory();
+
+  // Create 1000 threads that sleep for a random period of time then update a
+  // variable, and ensure that they all run to completion.
+  std::vector<std::unique_ptr<Thread>> threads;
+  const int kNumThreadsToCreate = 1000;
+  std::atomic<int> i(0);
+  for (int j = 0; j < kNumThreadsToCreate; ++j) {
+    threads.push_back(thread_factory->StartThread("", [&i]() {
+      Env::Default()->SleepForMicroseconds(random::New64() % 10);
+      ++i;
+    }));
+  }
+  threads.clear();
+
+  EXPECT_GE(pool.size(), 1);
+  EXPECT_EQ(i, kNumThreadsToCreate);
+}
+
+TEST(UnboundedThreadPool, ConcurrentThreadCreation) {
+  UnboundedThreadPool pool(Env::Default(), "test");
+  auto thread_factory = pool.get_thread_factory();
+
+  // Create ten threads that each create ten threads that update a variable, and
+  // ensure that they all run to completion.
+  std::vector<std::unique_ptr<Thread>> threads;
+  const int kNumThreadsToCreate = 10;
+  std::atomic<int> i(0);
+  for (int j = 0; j < kNumThreadsToCreate; ++j) {
+    threads.push_back(thread_factory->StartThread("", [&i, thread_factory]() {
+      std::vector<std::unique_ptr<Thread>> nested_threads;
+      for (int k = 0; k < kNumThreadsToCreate; ++k) {
+        nested_threads.push_back(
+            thread_factory->StartThread("", [&i]() { ++i; }));
+      }
+      nested_threads.clear();
+    }));
+  }
+  threads.clear();
+
+  EXPECT_GE(pool.size(), 1);
+  EXPECT_EQ(i, kNumThreadsToCreate * kNumThreadsToCreate);
+}
+
+TEST(UnboundedThreadPool, MultipleBlockingThreads) {
+  UnboundedThreadPool pool(Env::Default(), "test");
+  auto thread_factory = pool.get_thread_factory();
+
+  std::vector<std::unique_ptr<Thread>> threads;
+
+  // Create multiple waves (with increasing sizes) of threads that all block
+  // before returning, and
+  // ensure that we create the appropriate number of threads and terminate
+  // correctly.
+  std::vector<int> round_sizes = {5, 10, 15, 20};
+
+  for (const int round_size : round_sizes) {
+    Notification n;
+    BlockingCounter bc(round_size);
+    for (int j = 0; j < round_size; ++j) {
+      threads.push_back(thread_factory->StartThread("", [&bc, &n]() {
+        bc.DecrementCount();
+        // Block until `n` is notified, so that all ten threads must been
+        // created before the first one completes.
+        n.WaitForNotification();
+      }));
+    }
+
+    // Wait until all threads have started. Since the number of threads in each
+    // wave is increasing, we should have at least that number of threads in the
+    // pool.
+    bc.Wait();
+    // NOTE: There is a benign race between a new round starting and the
+    // physical threads from the previous round returning to the pool, so we may
+    // create more threads than the round_size.
+    EXPECT_GE(pool.size(), round_size);
+    n.Notify();
+    threads.clear();
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/zip_dataset_op_test.cc b/tensorflow/core/kernels/data/zip_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f9e86a3d088b6040301593952cbe5d95b339534
--- /dev/null
+++ b/tensorflow/core/kernels/data/zip_dataset_op_test.cc
@@ -0,0 +1,353 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "zip_dataset";
+constexpr char kOpName[] = "ZipDataset";
+
+struct RangeDatasetParam {
+  int64 start;
+  int64 end;
+  int64 step;
+};
+
+class ZipDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates `RangeDataset` variant tensors from the input vector of
+  // `RangeDatasetParam`.
+  Status CreateRangeDatasetTensors(const std::vector<RangeDatasetParam> &params,
+                                   std::vector<Tensor> *const dataset_tensors) {
+    for (int i = 0; i < params.size(); ++i) {
+      DatasetBase *range_dataset;
+      TF_RETURN_IF_ERROR(CreateRangeDataset<int64>(
+          params[i].start, params[i].end, params[i].step,
+          strings::StrCat("range_", i), &range_dataset));
+      Tensor dataset_tensor(DT_VARIANT, TensorShape({}));
+      TF_RETURN_IF_ERROR(
+          StoreDatasetInVariantTensor(range_dataset, &dataset_tensor));
+      dataset_tensors->emplace_back(std::move(dataset_tensor));
+    }
+    return Status::OK();
+  }
+
+  // Creates a new ZipDataset op kernel.
+  Status CreateZipDatasetKernel(
+      const DataTypeVector &dtypes,
+      const std::vector<PartialTensorShape> &output_shapes, int n,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    std::vector<string> input_datasets;
+    input_datasets.reserve(n);
+    for (int i = 0; i < n; ++i) {
+      // Create the placeholder names for the input components of `ZipDataset`.
+      input_datasets.emplace_back(strings::StrCat("input_dataset_", i));
+    }
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, input_datasets,
+        {{"output_types", dtypes}, {"output_shapes", output_shapes}, {"N", n}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new ZipDataset op kernel context.
+  Status CreateZipDatasetContext(
+      OpKernel *const op_kernel,
+      gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestParam {
+  std::vector<RangeDatasetParam> input_range_dataset_params;
+  std::vector<Tensor> expected_outputs;
+  std::vector<int> breakpoints;
+};
+
+TestParam TestCase1() {
+  // Test case 1: the input datasets with same number of outputs.
+  return {/*input_range_dataset_params*/
+          {RangeDatasetParam{0, 3, 1}, RangeDatasetParam{10, 13, 1}},
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {10}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {11}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {12})},
+          /*breakpoints*/ {0, 1, 4}};
+}
+
+TestParam TestCase2() {
+  // Test case 2: the input datasets with different number of outputs.
+  return {/*input_range_dataset_params*/
+          {RangeDatasetParam{0, 3, 1}, RangeDatasetParam{10, 15, 1}},
+          /*expected_outputs*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {0}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {10}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {11}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {2}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{}, {12})},
+          /*breakpoints*/ {0, 1, 4}};
+}
+
+class ZipDatasetOpTestHelper : public ZipDatasetOpTest {
+ public:
+  ~ZipDatasetOpTestHelper() override {
+    if (dataset_) dataset_->Unref();
+  }
+
+ protected:
+  Status CreateDatasetFromTestCase(const TestParam &test_case) {
+    std::vector<Tensor> range_dataset_tensors;
+    range_dataset_tensors.reserve(test_case.input_range_dataset_params.size());
+    TF_RETURN_IF_ERROR(CreateRangeDatasetTensors(
+        test_case.input_range_dataset_params, &range_dataset_tensors));
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    inputs.reserve(range_dataset_tensors.size());
+    for (auto &tensor : range_dataset_tensors) {
+      inputs.emplace_back(&tensor);
+    }
+    int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+    TF_RETURN_IF_ERROR(CreateZipDatasetKernel({DT_INT64},
+                                              {{num_tensors_per_slice}},
+                                              inputs.size(), &dataset_kernel_));
+    TF_RETURN_IF_ERROR(CreateZipDatasetContext(dataset_kernel_.get(), &inputs,
+                                               &dataset_kernel_ctx_));
+    TF_RETURN_IF_ERROR(CreateDataset(dataset_kernel_.get(),
+                                     dataset_kernel_ctx_.get(), &dataset_));
+    return Status::OK();
+  }
+
+  Status CreateIteratorFromTestCase(const TestParam &test_case) {
+    TF_RETURN_IF_ERROR(CreateDatasetFromTestCase(test_case));
+    TF_RETURN_IF_ERROR(
+        CreateIteratorContext(dataset_kernel_ctx_.get(), &iterator_ctx_));
+    TF_RETURN_IF_ERROR(
+        dataset_->MakeIterator(iterator_ctx_.get(), "Iterator", &iterator_));
+    return Status::OK();
+  }
+
+  std::unique_ptr<OpKernel> dataset_kernel_;
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx_;
+  DatasetBase *dataset_ = nullptr;  // owned by this class.
+  std::unique_ptr<IteratorContext> iterator_ctx_;
+  std::unique_ptr<IteratorBase> iterator_;
+};
+
+class ParameterizedDatasetTest
+    : public ZipDatasetOpTestHelper,
+      public ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(ParameterizedDatasetTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator_->GetNext(iterator_ctx_.get(), &out_tensors,
+                                    &end_of_sequence));
+    if (!end_of_sequence) {
+      for (const auto &tensor : out_tensors) {
+        EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+        TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+        expected_outputs_it++;
+      }
+    }
+  }
+  EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+}
+
+TEST_F(ZipDatasetOpTestHelper, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateDatasetFromTestCase(TestCase1()));
+
+  EXPECT_EQ(dataset_->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+
+  DataTypeVector expected_output_dtypes;
+  expected_output_dtypes.reserve(num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    expected_output_dtypes.emplace_back(test_case.expected_outputs[i].dtype());
+  }
+
+  TF_EXPECT_OK(
+      VerifyTypesMatch(dataset_->output_dtypes(), expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+
+  std::vector<PartialTensorShape> expected_output_shapes;
+  expected_output_shapes.reserve(num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    expected_output_shapes.emplace_back(test_case.expected_outputs[i].shape());
+  }
+
+  TF_EXPECT_OK(VerifyShapesCompatible(dataset_->output_shapes(),
+                                      expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+  TF_ASSERT_OK(CreateDatasetFromTestCase(test_case));
+
+  EXPECT_EQ(dataset_->Cardinality(),
+            test_case.expected_outputs.size() / num_tensors_per_slice);
+}
+
+TEST_F(ZipDatasetOpTestHelper, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateDatasetFromTestCase(TestCase1()));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset_->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  DataTypeVector expected_output_dtypes;
+  expected_output_dtypes.reserve(num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    expected_output_dtypes.emplace_back(test_case.expected_outputs[i].dtype());
+  }
+
+  TF_EXPECT_OK(
+      VerifyTypesMatch(iterator_->output_dtypes(), expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  int num_tensors_per_slice = test_case.input_range_dataset_params.size();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  std::vector<PartialTensorShape> expected_output_shapes;
+  expected_output_shapes.reserve(num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    expected_output_shapes.emplace_back(test_case.expected_outputs[i].shape());
+  }
+
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator_->output_shapes(),
+                                      expected_output_shapes));
+}
+
+TEST_F(ZipDatasetOpTestHelper, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  TF_ASSERT_OK(CreateIteratorFromTestCase(TestCase1()));
+  EXPECT_EQ(iterator_->prefix(), "Iterator::Zip");
+}
+
+TEST_P(ParameterizedDatasetTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestParam &test_case = GetParam();
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  TF_ASSERT_OK(CreateIteratorFromTestCase(test_case));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  for (int breakpoint : test_case.breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator_->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator_->Restore(iterator_ctx_.get(), &reader));
+
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator_->GetNext(iterator_ctx_.get(), &out_tensors,
+                                      &end_of_sequence));
+      if (!end_of_sequence) {
+        for (auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+      cur_iteration++;
+    }
+
+    if (breakpoint >= dataset_->Cardinality()) {
+      EXPECT_TRUE(end_of_sequence);
+      EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ZipDatasetOpTest, ParameterizedDatasetTest,
+    ::testing::ValuesIn(std::vector<TestParam>({TestCase1(), TestCase2()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 273962be997dbb76cffeaf7c1d2fcd29d2345dc2..12ea7db1ea15be3cf95eeabbc0ba0657d79950dc 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -364,7 +364,7 @@ TEST_F(DebugNumericSummaryOpTest, Float_only_valid_values) {
        7.33333333333,  // variance of non-inf and non-nan elements.
        static_cast<double>(DT_FLOAT),  // dtype
        2.0,                            // Number of dimensions.
-       2.0, 3.0});                     // Dimensoin sizes.
+       2.0, 3.0});                     // Dimension sizes.
 
   test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
 }
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index ae451be7e21a119a309a74c3312eee4b24256248..8a9f7b1860120bfcf7702d3388142392cd19b69a 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -16,13 +16,13 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -142,7 +142,7 @@ class DecodeBmpOp : public OpKernel {
   }
 
   uint8* Decode(const uint8* input, const int row_size, uint8* const output,
-                const int width, const int height, const int channles,
+                const int width, const int height, const int channels,
                 bool top_down);
 
  private:
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 6bfb5bd5bc0ae50797080ca3540133b0081f0b13..ba6369533adbbde30f3661a3d8577936de1038fa 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -145,7 +145,7 @@ class DecodeCSVOp : public OpKernel {
               output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0);
             } else {
               float value;
-              OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtof(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid float: ", fields[f]));
@@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel {
                   record_defaults[f].flat<double>()(0);
             } else {
               double value;
-              OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtod(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid double: ", fields[f]));
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index b54e1ea8ac233f1ca48a65e8e1b7e547643a45a2..06dc766794caf71f3792460f5d6e4b39864d3266 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -625,8 +626,37 @@ class DecodeProtoOp : public OpKernel {
     // Gather the field descriptors and check that requested output types match.
     int field_index = 0;
     std::vector<const FieldDescriptor*> field_descs;
+    std::vector<const FieldDescriptor*> exts;
+    absl::flat_hash_map<string, const FieldDescriptor*> ext_name_to_field;
+    std::vector<const FieldDescriptor*>::iterator ext_it = exts.begin();
     for (const string& name : field_names) {
       auto fd = message_desc->FindFieldByName(name);
+      if (fd == nullptr) {
+        // If field can't be found in original message, try to find a matching
+        // extension (by its full_name). First check a hashmap for a matching
+        // extension, and if not found, then iterate through available
+        // extensions to find a match (updating the hashmap while iterating.)
+        auto lookup_result = ext_name_to_field.find(name);
+        if (lookup_result != ext_name_to_field.end()) {
+          fd = lookup_result->second;
+        } else {
+          if (ext_it == exts.begin()) {
+            desc_pool->FindAllExtensions(message_desc, &exts);
+            ext_it = exts.begin();
+          }
+          while (ext_it != exts.end()) {
+            auto ext_name = (*ext_it)->full_name();
+            auto ext_field = *ext_it;
+            ++ext_it;
+
+            ext_name_to_field.insert({ext_name, ext_field});
+            if (ext_name == name) {
+              fd = ext_field;
+              break;
+            }
+          }
+        }
+      }
       OP_REQUIRES(context, fd != nullptr,
                   errors::InvalidArgument("Unknown field: ", name,
                                           " in message type ", message_type));
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index eaef5a6097ff5a7235caba37edf6ef94d5860931..3dd019c3d203c63f055113bb992eb1f542e838ae 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -110,6 +110,8 @@ REGISTER(uint8);
 REGISTER(int16);
 REGISTER(int8);
 REGISTER(int64);
+REGISTER(complex64);
+REGISTER(complex128);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
index 0656081177e8673bdc8e603a832d96a8884bff45..768dd38a600b5dc4c4216b7feb7f5570ab0aec27 100644
--- a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
@@ -162,10 +162,11 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NHWC> {
       return;
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    D2S_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        D2S_NHWC<T>, config.block_count, config.thread_per_block, 0, d.stream(),
         config.virtual_thread_count, input.data(), block_size, batch_size,
         input_height, input_width, input_depth, output_height, output_width,
-        output_depth, output.data());
+        output_depth, output.data()));
   }
   void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                   int block_size, typename TTypes<T, 5>::Tensor output) {
@@ -197,23 +198,26 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
       switch (block_size) {
         case 2:
-          return D2S_NCHW_LOOP<T, 2>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), input_width, output_width,
-                  output_depth_by_input_area, input_depth_by_input_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              D2S_NCHW_LOOP<T, 2>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), input_width,
+              output_width, output_depth_by_input_area,
+              input_depth_by_input_area, output.data()));
+          return;
         case 3:
-          return D2S_NCHW_LOOP<T, 3>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), input_width, output_width,
-                  output_depth_by_input_area, input_depth_by_input_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              D2S_NCHW_LOOP<T, 3>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), input_width,
+              output_width, output_depth_by_input_area,
+              input_depth_by_input_area, output.data()));
+          return;
         case 4:
-          return D2S_NCHW_LOOP<T, 4>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), input_width, output_width,
-                  output_depth_by_input_area, input_depth_by_input_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              D2S_NCHW_LOOP<T, 4>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), input_width,
+              output_width, output_depth_by_input_area,
+              input_depth_by_input_area, output.data()));
+          return;
       }
     }
 
@@ -223,9 +227,10 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       return;
     }
     auto config = GetCudaLaunchConfig(total_count, d);
-    D2S_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        D2S_NCHW<T>, config.block_count, config.thread_per_block, 0, d.stream(),
         config.virtual_thread_count, input.data(), block_size, input_width,
-        output_depth * input_height, output.data());
+        output_depth * input_height, output.data()));
   }
   void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                   int block_size, typename TTypes<T, 5>::Tensor output) {
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index c152f2b7e4125687f1b670fae374e2a747cd902c..ab98cacd1a117022444386b9a718e173d68fa99d 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index dacd3cfea8e71cdacf767ae64920c393f68278a3..11c2b31633dd2186c729c725c4cda5816447954d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cmath>
 #include <type_traits>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
index 098853e68430d425143d16ff2e8edbb9877f8e23..fcbd8ffd868ddba7b5f36f41861c8cbeacb958aa 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
@@ -983,8 +983,9 @@ Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx,
   auto device = ctx->eigen_gpu_device();
   CudaLaunchConfig config =
       GetCudaLaunchConfig(num_in_backprop, device, kernel, 0, 0);
-  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-      args, out_backprop, filter, in_backprop, num_in_backprop);
+  TF_CHECK_OK(CudaLaunchKernel(
+      kernel, config.block_count, config.thread_per_block, 0, device.stream(),
+      args, out_backprop, filter, in_backprop, num_in_backprop));
   return Status::OK();
 }
 
@@ -1746,8 +1747,9 @@ Status LaunchDepthwiseConv2dBackpropFilterGPU(
   auto device = ctx->eigen_gpu_device();
   CudaLaunchConfig config =
       GetCudaLaunchConfig(num_out_backprop, device, kernel, 0, 0);
-  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-      args, out_backprop, input, filter_backprop, num_out_backprop);
+  TF_CHECK_OK(CudaLaunchKernel(
+      kernel, config.block_count, config.thread_per_block, 0, device.stream(),
+      args, out_backprop, input, filter_backprop, num_out_backprop));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/deserialize_sparse_string_op.cc b/tensorflow/core/kernels/deserialize_sparse_string_op.cc
index 2c13f24ad6b74b3b852a1813a8d000e83f977fa3..d26d8188d51e5ec1740112e32ea609c7a22d19da 100644
--- a/tensorflow/core/kernels/deserialize_sparse_string_op.cc
+++ b/tensorflow/core/kernels/deserialize_sparse_string_op.cc
@@ -204,8 +204,6 @@ class DeserializeSparseOp : public OpKernel {
       target_shape.vec<int64>()(i + ndims - 1) = output.shape().data()[i + 1];
     }
 
-    Tensor output_indices;
-    Tensor output_shape;
     Reshape(context, output.indices(), input_shape, target_shape,
             0 /* output indices index */, 2 /* output shape index */);
     context->set_output(1, output.values());
diff --git a/tensorflow/core/kernels/determinant_op_gpu.cu.cc b/tensorflow/core/kernels/determinant_op_gpu.cu.cc
index c866204c97e6acd160f1b4ec1eed989d88c52eff..681567ef2d8f428d632617d7d05744f505a296c9 100644
--- a/tensorflow/core/kernels/determinant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/determinant_op_gpu.cu.cc
@@ -129,10 +129,12 @@ struct DeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
     const int64 num_matrices = output.size();
     const int64 n = lu_factor.dimension(2);
     CudaLaunchConfig config = GetCudaLaunchConfig(num_matrices, device);
-    DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>
-        <<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-            config.virtual_thread_count, n, lu_factor.data(), pivots, nullptr,
-            output.data());
+
+    TF_CHECK_OK(CudaLaunchKernel(
+        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>,
+        config.block_count, config.thread_per_block, 0, device.stream(),
+        config.virtual_thread_count, n, lu_factor.data(), pivots, nullptr,
+        output.data()));
   }
 };
 
@@ -150,10 +152,11 @@ struct LogDeterminantFromPivotedLUFunctor<GPUDevice, Scalar> {
     const int64 num_matrices = sign.size();
     const int64 n = lu_factor.dimension(2);
     CudaLaunchConfig config = GetCudaLaunchConfig(num_matrices, device);
-    DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>
-        <<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-            config.virtual_thread_count, n, lu_factor.data(), pivots,
-            sign.data(), log_abs_det.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>,
+        config.block_count, config.thread_per_block, 0, device.stream(),
+        config.virtual_thread_count, n, lu_factor.data(), pivots, sign.data(),
+        log_abs_det.data()));
   }
 };
 
diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
index c63806a7f68c6981dd0e83373c6bfd598788e338..12408f2c416e2767f49ccd1a44d2c62cf305359f 100644
--- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
@@ -230,18 +230,20 @@ struct DilationBackpropInput<GPUDevice, T> {
     // Initialize in_backprop with all zeros.
     total_count = batch * input_rows * input_cols * depth;
     config = GetCudaLaunchConfig(total_count, d);
-    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        total_count, in_backprop.data());
+    TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
+                                 config.thread_per_block, 0, d.stream(),
+                                 total_count, in_backprop.data()));
 
     // Accumulate.
     total_count = batch * output_rows * output_cols * depth;
     config = GetCudaLaunchConfig(total_count, d);
-    DilationBackpropInputKernel<<<config.block_count, config.thread_per_block,
-                                  0, d.stream()>>>(
-        config.virtual_thread_count, input.data(), filter.data(),
-        out_backprop.data(), batch, input_rows, input_cols, depth, filter_rows,
-        filter_cols, output_rows, output_cols, stride_rows, stride_cols,
-        rate_rows, rate_cols, pad_top, pad_left, in_backprop.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        DilationBackpropInputKernel<T>, config.block_count,
+        config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+        input.data(), filter.data(), out_backprop.data(), batch, input_rows,
+        input_cols, depth, filter_rows, filter_cols, output_rows, output_cols,
+        stride_rows, stride_cols, rate_rows, rate_cols, pad_top, pad_left,
+        in_backprop.data()));
   }
 };
 
@@ -270,8 +272,9 @@ struct DilationBackpropFilter<GPUDevice, T> {
     // Initialize filter_backprop with all zeros.
     total_count = filter_rows * filter_cols * depth;
     config = GetCudaLaunchConfig(total_count, d);
-    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        total_count, filter_backprop.data());
+    TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
+                                 config.thread_per_block, 0, d.stream(),
+                                 total_count, filter_backprop.data()));
 
     // Accumulate.
     total_count = batch * output_rows * output_cols * depth;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 572d04ae2c464d493508d494ba325a33eb92d4c1..95af19c4c4818abced194f7553e8bb79c777a998 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 // See docs in ../ops/data_flow_ops.cc.
 
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/util/util.h"
 
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index e7882acc80e3c2383f3a3c208175d16dd8c092ab..f00baa932f8546eb76ae21b72049fadab3948792 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -40,11 +40,11 @@ limitations under the License.
 #include "third_party/cub/iterator/constant_input_iterator.cuh"
 #include "third_party/cub/thread/thread_operators.cuh"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
@@ -79,9 +79,9 @@ template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
   CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          start, delta, size, out.data());
+  TF_CHECK_OK(CudaLaunchKernel(RangeInitKernel<T>, config.block_count,
+                               config.thread_per_block, 0, d.stream(), start,
+                               delta, size, out.data()));
 }
 
 // Given *num_runs pairs (key, value), this function moves the value
@@ -94,8 +94,9 @@ void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs,
   // For wrong inputs, we may have out_size < *num_runs. In this case we will
   // only handle the first out_size values.
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  MoveValuesKernel<<<config.block_count, config.thread_per_block, 0,
-                     d.stream()>>>(keys, values, num_runs, out_size, out);
+  TF_CHECK_OK(CudaLaunchKernel(MoveValuesKernel, config.block_count,
+                               config.thread_per_block, 0, d.stream(), keys,
+                               values, num_runs, out_size, out));
 }
 
 template <typename T>
@@ -103,10 +104,10 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<T, int32, true>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          params, indices, out, gather_dim_size, indices_size, slice_size,
-          out_size);
+  TF_CHECK_OK(CudaLaunchKernel(
+      GatherOpKernel<T, int32, true>, config.block_count,
+      config.thread_per_block, 0, d.stream(), params, indices, out,
+      gather_dim_size, indices_size, slice_size, out_size));
 }
 
 struct IdentityOp {
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f21f2acf2622a56cc3d6f58d259f79788a314dfb..471bd7fbb1c5d2e9096b11f9b6a45449760de2e7 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 // See docs in ../ops/data_flow_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 #ifdef GOOGLE_CUDA
-#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/kernels/gpu_device_array.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -138,9 +138,21 @@ class DynamicStitchOpImplBase : public OpKernel {
 template <typename T>
 void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device,
                           const int32 slice_size, const int32 first_dim_size,
-                          const CudaDeviceArrayStruct<int>& input_indices,
-                          const CudaDeviceArrayStruct<const T*>& input_ptrs,
+                          const GpuDeviceArrayStruct<int>& input_indices,
+                          const GpuDeviceArrayStruct<const T*>& input_ptrs,
                           T* output);
+#define REGISTER_GPU(T)                                           \
+  extern template void DynamicStitchGPUImpl(                      \
+      const Eigen::GpuDevice& gpu_device, const int32 slice_size, \
+      const int32 first_dim_size,                                 \
+      const GpuDeviceArrayStruct<int32>& input_indices,           \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs, T* output);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
+TF_CALL_int32(REGISTER_GPU);
+#undef REGISTER_GPU
 
 template <class T>
 class DynamicStitchOpGPU : public DynamicStitchOpImplBase<T> {
@@ -167,14 +179,14 @@ class DynamicStitchOpGPU : public DynamicStitchOpImplBase<T> {
     // merged that aren't covered by an index in indices.  What should we do?
     if (first_dim_size > 0) {
       // because the collision requirements, we have to deal with
-      // collion first before send data to gpu kernel.
+      // collision first before send data to gpu kernel.
       // TODO(ekelsen): Instead of doing a serial scan on the CPU to pick the
       // last of duplicated indices, it could instead be done of the GPU
       // implicitly using atomics to make sure the last index is the final
       // write.
       const int slice_size = merged->flat_outer_dims<T>().dimension(1);
-      CudaDeviceArrayOnHost<int32> indices_flat(c, first_dim_size);
-      CudaDeviceArrayOnHost<const T*> data_flat(c, data_elements_size);
+      GpuDeviceArrayOnHost<int32> indices_flat(c, first_dim_size);
+      GpuDeviceArrayOnHost<const T*> data_flat(c, data_elements_size);
       OP_REQUIRES_OK(c, indices_flat.Init());
       OP_REQUIRES_OK(c, data_flat.Init());
       // initialize the indices_flat (-1 represents missing indices)
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc
index 102cdc40d428dafcb847bbea41694271478fbc3b..dd8b348961433983d9b2d8ef2b4cd230408884ad 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -31,11 +31,11 @@ namespace {
 template <typename T>
 __global__ void DynamicStitchKernel(const int32 slice_size,
                                     const int32 output_size,
-                                    CudaDeviceArrayStruct<int32> input_indices,
-                                    CudaDeviceArrayStruct<const T*> input_ptrs,
+                                    GpuDeviceArrayStruct<int32> input_indices,
+                                    GpuDeviceArrayStruct<const T*> input_ptrs,
                                     T* output) {
-  int32* data_indices = GetCudaDeviceArrayOnDevice(&input_indices);
-  const T** data_ptrs = GetCudaDeviceArrayOnDevice(&input_ptrs);
+  int32* data_indices = GetGpuDeviceArrayOnDevice(&input_indices);
+  const T** data_ptrs = GetGpuDeviceArrayOnDevice(&input_ptrs);
   CUDA_1D_KERNEL_LOOP(output_index, output_size) {
     const int32 slice_id = output_index / slice_size;
     const int32 slice_offset = output_index % slice_size;
@@ -51,23 +51,24 @@ __global__ void DynamicStitchKernel(const int32 slice_size,
 template <typename T>
 void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device,
                           const int32 slice_size, const int32 first_dim_size,
-                          const CudaDeviceArrayStruct<int>& input_indices,
-                          const CudaDeviceArrayStruct<const T*>& input_ptrs,
+                          const GpuDeviceArrayStruct<int>& input_indices,
+                          const GpuDeviceArrayStruct<const T*>& input_ptrs,
                           T* output) {
   const int32 output_size = first_dim_size * slice_size;
   auto config = GetCudaLaunchConfig(output_size, gpu_device);
 
-  DynamicStitchKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, gpu_device.stream()>>>(
-          slice_size, output_size, input_indices, input_ptrs, output);
+  TF_CHECK_OK(CudaLaunchKernel(DynamicStitchKernel<T>, config.block_count,
+                               config.thread_per_block, 0, gpu_device.stream(),
+                               slice_size, output_size, input_indices,
+                               input_ptrs, output));
 }
 
 #define REGISTER_GPU(T)                                           \
   template void DynamicStitchGPUImpl(                             \
       const Eigen::GpuDevice& gpu_device, const int32 slice_size, \
       const int32 first_dim_size,                                 \
-      const CudaDeviceArrayStruct<int32>& input_indices,          \
-      const CudaDeviceArrayStruct<const T*>& input_ptrs, T* output);
+      const GpuDeviceArrayStruct<int32>& input_indices,           \
+      const GpuDeviceArrayStruct<const T*>& input_ptrs, T* output);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index 4d86f9deb9902a64764e29ca0371bb68ad4f3370..c5158e65d8af4b9e721eb54ce5414023b06ef6a4 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -20,6 +20,13 @@ limitations under the License.
 
 namespace Eigen {
 
+// Noise mode used when padding.
+enum ExtractGlimpsesNoiseMode {
+  UNIFORM = 0,
+  GAUSSIAN = 1,
+  ZERO = 2,
+};
+
 /** ExtractGlimpses
  * \ingroup CXX11_NeuralNetworks_Module
  *
@@ -43,18 +50,19 @@ namespace Eigen {
  * for width and height which will be equal to the requested glimpse size.
  */
 namespace {
+
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
                       const bool normalized, const bool centered,
-                      const bool uniform_noise)
+                      const ExtractGlimpsesNoiseMode noise)
       : width_(width),
         height_(height),
         offsets_(offsets),
         normalized_(normalized),
         centered_(centered),
-        uniform_noise_(uniform_noise) {}
+        noise_(noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
@@ -144,64 +152,73 @@ struct GlimpseExtractionOp {
       slice_extent[2] = std::min<Index>(input_height, slice_extent[2]);
 
       if (partial_overlap) {
-        if (uniform_noise_) {
-          // Initialize the glimpse with uniform noise.
-          typedef typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type Scalar;
-          TensorFixedSize<Scalar, Sizes<> > mini;
-          mini.device(device) = input.template chip<3>(i).minimum();
-          TensorFixedSize<float, Sizes<> > range;
-          range.device(device) = (input.template chip<3>(i).maximum() - mini)
-                                     .template cast<float>();
-
-          DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
-          TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
-          output.template chip<3>(i).device(device) =
-              mini.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size) +
-              (tmp.random(unigen) *
-               range.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size))
-                  .template cast<Scalar>();
-        } else {
-          // Initialize the glimpse with white noise: compute the mean and sigma
-          // of each channel, and use them to shape the gaussian.
-          DSizes<Index, 2> glimpse_size(width_, height_);
-          DSizes<Index, 2> input_size(input_width, input_height);
-          typedef typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type Scalar;
-
-          for (int j = 0; j < num_channels; ++j) {
-            TensorFixedSize<Scalar, Sizes<> > mean;
-            mean.device(device) = input.template chip<3>(i)
-                                      .template chip<0>(j)
-                                      .template cast<float>()
-                                      .mean();
-            TensorFixedSize<float, Sizes<> > sigma;
-            sigma.device(device) =
-                (input.template chip<3>(i)
-                     .template chip<0>(j)
-                     .template cast<float>() -
-                 mean.reshape(Sizes<1, 1>()).broadcast(input_size))
-                    .square()
-                    .mean()
-                    .sqrt();
+        switch (noise_) {
+          case ZERO: {
+            // Initialize the glimpse with zero noise.
+            output.template chip<3>(i).device(device) =
+                output.template chip<3>(i).constant(0);
+          } break;
+          case UNIFORM: {
+            // Initialize the glimpse with uniform noise.
+            typedef typename internal::remove_const<
+                typename internal::traits<Input>::Scalar>::type Scalar;
             TensorFixedSize<Scalar, Sizes<> > mini;
-            mini.device(device) =
-                input.template chip<3>(i).template chip<0>(j).minimum();
-            TensorFixedSize<float, Sizes<> > maxi;
-            maxi.device(device) =
-                input.template chip<3>(i).template chip<0>(j).maximum();
-
-            TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
-            output.template chip<3>(i).template chip<0>(j).device(device) =
-                (mean.reshape(Sizes<1, 1>()).broadcast(glimpse_size) +
-                 (tmp.random(gen) *
-                  sigma.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
-                     .template cast<Scalar>())
-                    .cwiseMin(
-                        maxi.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
-                    .cwiseMax(
-                        mini.reshape(Sizes<1, 1>()).broadcast(glimpse_size));
-          }
+            mini.device(device) = input.template chip<3>(i).minimum();
+            TensorFixedSize<float, Sizes<> > range;
+            range.device(device) = (input.template chip<3>(i).maximum() - mini)
+                                       .template cast<float>();
+
+            DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
+            TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
+            output.template chip<3>(i).device(device) =
+                mini.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size) +
+                (tmp.random(unigen) *
+                 range.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size))
+                    .template cast<Scalar>();
+          } break;
+          case GAUSSIAN: {
+            // Initialize the glimpse with white noise: compute the mean and
+            // sigma
+            // of each channel, and use them to shape the gaussian.
+            DSizes<Index, 2> glimpse_size(width_, height_);
+            DSizes<Index, 2> input_size(input_width, input_height);
+            typedef typename internal::remove_const<
+                typename internal::traits<Input>::Scalar>::type Scalar;
+
+            for (int j = 0; j < num_channels; ++j) {
+              TensorFixedSize<Scalar, Sizes<> > mean;
+              mean.device(device) = input.template chip<3>(i)
+                                        .template chip<0>(j)
+                                        .template cast<float>()
+                                        .mean();
+              TensorFixedSize<float, Sizes<> > sigma;
+              sigma.device(device) =
+                  (input.template chip<3>(i)
+                       .template chip<0>(j)
+                       .template cast<float>() -
+                   mean.reshape(Sizes<1, 1>()).broadcast(input_size))
+                      .square()
+                      .mean()
+                      .sqrt();
+              TensorFixedSize<Scalar, Sizes<> > mini;
+              mini.device(device) =
+                  input.template chip<3>(i).template chip<0>(j).minimum();
+              TensorFixedSize<float, Sizes<> > maxi;
+              maxi.device(device) =
+                  input.template chip<3>(i).template chip<0>(j).maximum();
+
+              TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
+              output.template chip<3>(i).template chip<0>(j).device(device) =
+                  (mean.reshape(Sizes<1, 1>()).broadcast(glimpse_size) +
+                   (tmp.random(gen) *
+                    sigma.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
+                       .template cast<Scalar>())
+                      .cwiseMin(
+                          maxi.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
+                      .cwiseMax(
+                          mini.reshape(Sizes<1, 1>()).broadcast(glimpse_size));
+            }
+          } break;
         }
 
         // Copy the part of the glimpse that cover the input image if any.
@@ -225,7 +242,7 @@ struct GlimpseExtractionOp {
   const std::vector<IndexPair<float> > offsets_;
   const bool normalized_;
   const bool centered_;
-  const bool uniform_noise_;
+  const ExtractGlimpsesNoiseMode noise_;
 };
 }  // namespace
 
@@ -233,12 +250,12 @@ template <typename Input>
 EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
     const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
     const Input>
-ExtractGlimpses(const Input& input,
-                const typename internal::traits<Input>::Index width,
-                const typename internal::traits<Input>::Index height,
-                const std::vector<IndexPair<float> >& offsets,
-                const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true) {
+ExtractGlimpses(
+    const Input& input, const typename internal::traits<Input>::Index width,
+    const typename internal::traits<Input>::Index height,
+    const std::vector<IndexPair<float> >& offsets, const bool normalized = true,
+    const bool centered = true,
+    const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM) {
   EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
                       YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
@@ -246,7 +263,7 @@ ExtractGlimpses(const Input& input,
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
-                                      centered, uniform_noise);
+                                      centered, noise);
   return input.customOp(op);
 }
 
diff --git a/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc b/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
index ec949ddc845de5b391d2589e9ef07fd0c7e3fd0c..12fa7f3409da77b1d6f69b9052a9286bb795bb87 100644
--- a/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
+++ b/tensorflow/core/kernels/eigen_benchmark_cpu_test.cc
@@ -8,7 +8,7 @@ You may obtain a copy of the License at
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONT OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 05a3ae07a1137078806301c81727fdfeb8429918..1af263cdd5dbb2afc0b1435aacfc70a782478f30 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -45,19 +45,34 @@ namespace internal {
 // Returns `true` iff we can use custom contraction kernels. This is a runtime
 // check, that uses environment variables.
 bool UseCustomContractionKernels();
-#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL
-
-// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
-#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
 
+// Pack a 2D block of a Tensor expression into contiguous block of memory with
+// col-major storage order. We do not have access to the underlying Tensor
+// expression, we only have a DataMapper (TensorContractionInputMapper for
+// tensor contractions, or blas_data_mapper for plain tensors), that provides a
+// two-dimensional view into the Tensor expression.
+//
+// Default Eigen gemm_pack_rhs and gemm_pack_lhs pack blocks of tensor
+// expressions into the packed format described in "Anatomy of High-Performance
+// Matrix Multiplication" paper (1). Eigen::internal::gebp_kernel relies on this
+// packing format for efficient micro-panel multiplication.
+//
+// This simple packing can be used with any '?gemm' function from BLAS
+// libraries, that work with col-major matrices.
+//
+// (1) http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf
+//
+// IMPORTANT: `gemm_pack_colmajor_block` always packs the block in column major
+// order, DataMapperStorageOrder specifies the storage order of the underlying
+// Tensor expression.
 template <typename Scalar, typename IndexType, typename DataMapper,
-          int StorageOrder>
-struct mkldnn_gemm_pack;
+          int DataMapperStorageOrder>
+struct gemm_pack_colmajor_block;
 
-// mkl_gemm_pack for ColMajor storage order.
+// gemm_pack_colmajor_block for ColMajor storage order.
 template <typename Scalar, typename IndexType, typename DataMapper>
-struct mkldnn_gemm_pack<Scalar, IndexType, DataMapper,
-                        /*StorageOrder*/ ColMajor> {
+struct gemm_pack_colmajor_block<Scalar, IndexType, DataMapper,
+                                /*DataMapperStorageOrder*/ ColMajor> {
   typedef typename internal::packet_traits<Scalar>::type Packet;
   typedef typename DataMapper::LinearMapper LinearMapper;
 
@@ -66,38 +81,41 @@ struct mkldnn_gemm_pack<Scalar, IndexType, DataMapper,
   EIGEN_DONT_INLINE
   void operator()(Scalar* block, const DataMapper& data_mapper, IndexType rows,
                   IndexType cols) {
-    const IndexType unrolled_rows =
-        (rows / (4 * PacketSize)) * (4 * PacketSize);
-    const IndexType vectorized_rows = (rows / PacketSize) * PacketSize;
+    const IndexType unrolled_rows = rows - 4 * PacketSize;
+    const IndexType vectorized_rows = rows - PacketSize;
 
     for (IndexType col = 0; col < cols; ++col) {
       LinearMapper lm = data_mapper.getLinearMapper(0, col);
 
+      IndexType row = 0;
       // Give compiler a strong possibility to unroll the loop.
-      for (IndexType i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+      for (; row <= unrolled_rows; row += 4 * PacketSize) {
         for (IndexType j = 0; j < 4; ++j) {
-          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          const Packet p = lm.template loadPacket<Packet>(row + j * PacketSize);
           internal::pstoreu(block + j * PacketSize, p);
         }
         block += 4 * PacketSize;
       }
-
       // Process remaining rows with packets.
-      for (IndexType i = unrolled_rows; i < vectorized_rows; i += PacketSize) {
-        const Packet p = lm.template loadPacket<Packet>(i);
+      for (; row <= vectorized_rows; row += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(row);
         internal::pstoreu(block, p);
         block += PacketSize;
       }
-
       // Finalize with coefficients.
-      for (IndexType i = vectorized_rows; i < rows; ++i) {
-        *block = lm(i);
+      for (; row < rows; ++row) {
+        *block = lm(row);
         ++block;
       }
     }
   }
 };
 
+#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL
+
+// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
 template <typename Scalar, typename IndexType, typename OutputMapper,
           bool ConjugateLhs = false, bool ConjugateRhs = false>
 struct mkldnn_gemm_kernel;
@@ -107,6 +125,9 @@ template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
           bool ConjugateRhs>
 struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
                           ConjugateLhs, ConjugateRhs> {
+  static_assert(!ConjugateLhs, "MKL-DNN kernel doesn't support ConjugateLhs");
+  static_assert(!ConjugateRhs, "MKL-DNN kernel doesn't support ConjugateRhs");
+
   EIGEN_DONT_INLINE
   void operator()(const OutputMapper& output, const float* blockA,
                   const float* blockB, const IndexType rows,
@@ -122,11 +143,11 @@ struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
     const int n = static_cast<int>(cols);
     const int k = static_cast<int>(depth);
 
-    const char transposeA = ConjugateLhs ? 'Y' : 'N';
-    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+    const char transposeA = 'N';
+    const char transposeB = 'N';
 
-    const int ldA = ConjugateLhs ? k : m;
-    const int ldB = ConjugateRhs ? n : k;
+    const int ldA = m;
+    const int ldB = k;
     const int ldC = static_cast<int>(output.stride());
 
     const float beta = 1.0;
@@ -197,7 +218,8 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
     // We split Kth dimensions in roughly equal slices.
     StorageIndex target_k_slices =
         (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
-    StorageIndex packet_size = 8;
+    StorageIndex packet_size = internal::packet_traits<Scalar>::size;
+    if (packet_size < 8) packet_size = 8;
     StorageIndex target_bk =
         Eigen::divup(k / target_k_slices, packet_size) * packet_size;
     kc_ = (std::min)(k, target_bk);
@@ -221,10 +243,12 @@ struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
   using Scalar = float;
   using Traits = typename internal::gebp_traits<Scalar, Scalar>;
 
-  using LhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
-                                     typename LhsMapper::SubMapper, ColMajor>;
-  using RhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
-                                     typename RhsMapper::SubMapper, ColMajor>;
+  using LhsPacker =
+      gemm_pack_colmajor_block<Scalar, StorageIndex,
+                               typename LhsMapper::SubMapper, ColMajor>;
+  using RhsPacker =
+      gemm_pack_colmajor_block<Scalar, StorageIndex,
+                               typename RhsMapper::SubMapper, ColMajor>;
   using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;
 
   // Fallback on default Eigen pack and GEBP kernel if custom contraction
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index 3182307e51e5fc2912ff7e178fbeab6c73d47d03..a0b3c101ebab2879d1d9d7f12db7136ae0b3c7f1 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -1432,23 +1432,20 @@ struct gemm_pack_rhs<
   }
 };
 
-#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
-// Arrange a block of the right input matrix (in our case it's always a "virtual
-// matrix" constructed from extracted volume patches) in contiguous memory.
-//
-// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
-// this is basically the same as taking a slice of the matrix. Knowing
-// properties of the original patch op we can do it more efficient than default
-// mkldnn_gemm_pack.
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+// Pack a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted image patches) in contiguous block in
+// column-major storage order. Knowing the properties of the original patch op
+// we can do it more efficient than the default gemm_pack_colmajor_block.
 //
-// TODO(ezhulenev): mkldnn_gemm_pack for spatial convolutions supports squeezing
-// reads along the 2 innermost dimensions, add it here if needed.
+// TODO(ezhulenev): gemm_pack_colmajor_block for spatial convolutions supports
+// squeezing reads along the 2 innermost dimensions, add it here if needed.
 template <typename NewDimension, Index Planes, Index Rows, Index Cols,
           typename ArgType, typename Device, typename Scalar,
           typename StorageIndex, typename nocontract_t, typename contract_t,
           int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
           int Alignment>
-struct mkldnn_gemm_pack<
+struct gemm_pack_colmajor_block<
     Scalar, StorageIndex,
     TensorContractionSubMapper<
         Scalar, StorageIndex, Rhs,
@@ -1594,7 +1591,7 @@ struct mkldnn_gemm_pack<
     }
   }
 };
-#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 
 }  // namespace internal
 
diff --git a/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
index da4a61d1bda1ea1171fdea5c9dffaab8aabd4429..0234c7006eaf2ca73cc3cd04787d48a1f95a6d1e 100644
--- a/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
+++ b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
@@ -33,8 +33,8 @@ Eigen::array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
 using Scalar = float;
 using Index = Eigen::Index;
 
-TEST(EigenMkldnnTest, MkldnnPack) {
-  // Packing with mkldnn_gemm_pack is the same as taking a slice of 2
+TEST(EigenMkldnnTest, GemmPackColMajor) {
+  // Packing with gemm_pack_colmajor_block is the same as taking a slice of 2
   // dimensional Tensor.
 
   // Mkldnn pack and gemm are used only in Tensor contractions, and it's
@@ -42,7 +42,8 @@ TEST(EigenMkldnnTest, MkldnnPack) {
   static const int Options = ColMajor;
 
   using DataMapper = blas_data_mapper<Scalar, Index, ColMajor>;
-  using MkldnnGemmPack = mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor>;
+  using GemmPackColMajor =
+      gemm_pack_colmajor_block<Scalar, Index, DataMapper, ColMajor>;
   using Tensor2d = Tensor<Scalar, 2, Options, Index>;
 
   Eigen::array<Index, 2> dims = RandomDims<Index, 2>(1, 500);
@@ -65,9 +66,9 @@ TEST(EigenMkldnnTest, MkldnnPack) {
   Tensor2d pack_dst(slice_size[0], slice_size[1]);
   Tensor2d slice_dst(slice_size[0], slice_size[1]);
 
-  // Pack memory using mkldnn_gemm_pack.
+  // Pack memory using gemm_pack_colmajor_block.
   DataMapper data_mapper(src.data(), dims[0]);
-  MkldnnGemmPack gemm_pack;
+  GemmPackColMajor gemm_pack;
   gemm_pack(pack_dst.data(),
             data_mapper.getSubMapper(slice_start[0], slice_start[1]),
             slice_size[0], slice_size[1]);
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2afab42ec1922be65a015f872510a4e634a55d6
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -0,0 +1,1496 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+
+// Note this header is used in both TF and TFLite.
+namespace Eigen {
+
+namespace internal {
+
+// WARNING: Most of the code here implicitly assumes that the matrix is in
+// ColMajor layout. This is guaranteed by the tensor contraction (see
+// TensorContraction.h).
+//
+// Inside Eigen a tensor contraction is represented by a matrix multiplication.
+// We don't want to actually extract image patches and reshape the result into
+// a matrix (this involves allocating huge extra memory), so the patch
+// extraction and reshape operations are implicit.
+//
+// TensorContractionInputMapper takes a matrix index and returns the coefficient
+// (or the packet) of the "virtual tensor", that would be at that index if we
+// were to actually reshape the result of patch extraction.
+//
+// TensorContractionSubMapper provides a similar view into the "virtual matrix"
+// at the given vertical and horizontal offsets.
+//
+// "Virtual matrix" dimensions:
+//   *0: kernelChannels * kernelRows * kernelCols;
+//    1: out_height * out_width; * OTHERS (e.g batches, etc...)
+//
+// *) extracted patches are continuous in memory (innermost dimension assuming
+//    col major layout)
+//
+// With this dimensions:
+//   row - offset within a single patch (in code: patchId)
+//   col - index of the extracted patch (in code: patchIndex)
+//         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
+//
+// TODO(ezhulenev): Consolidate this part of the code with the image patch
+// extraction code since they are both very similar.
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar_, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper<
+    Scalar_, Index, Side,
+    TensorEvaluator<
+        const TensorReshapingOp<NewDimension,
+                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
+        Device>,
+    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+    inner_dim_reordered, Alignment> {
+ public:
+  typedef Scalar_ Scalar;
+
+  typedef TensorContractionInputMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      Self;
+
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper VectorMapper;
+  typedef SubMapper LinearMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionInputMapper(
+      const TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>& tensor,
+      const nocontract_t&, const nocontract_t&, const contract_t&,
+      const contract_t&)
+      : m_impl(tensor.impl().impl()) {
+    Index patch_rows;
+    Index patch_depth;
+    if (internal::traits<ArgType>::Layout == ColMajor) {
+      patch_depth = tensor.impl().dimensions()[0];
+      patch_rows = tensor.impl().dimensions()[1];
+      m_patch_cols = tensor.impl().dimensions()[2];
+      m_num_patches = tensor.impl().dimensions()[3];
+    } else {
+      const size_t NumDims = tensor.impl().dimensions().size();
+      patch_depth = tensor.impl().dimensions()[NumDims - 1];
+      patch_rows = tensor.impl().dimensions()[NumDims - 2];
+      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
+      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
+    }
+
+    // Strides for navigating through the single patch.
+    m_patch_row_stride = patch_depth;
+    m_patch_col_stride = patch_rows * m_patch_row_stride;
+
+    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
+    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
+
+    m_colStride = patch_rows;
+
+    m_outputRows = tensor.impl().outputRows();
+    m_row_strides = tensor.impl().userRowStride();
+    m_col_strides = tensor.impl().userColStride();
+
+    m_in_row_strides = tensor.impl().userInRowStride();
+    m_in_col_strides = tensor.impl().userInColStride();
+
+    if (internal::traits<ArgType>::Layout == ColMajor) {
+      m_inputRows = tensor.impl().impl().dimensions()[1];
+      m_inputCols = tensor.impl().impl().dimensions()[2];
+    } else {
+      const int NumDims = tensor.impl().impl().dimensions().size();
+      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
+      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
+    }
+
+    m_rowInputStride = patch_depth;
+    m_colInputStride = patch_depth * m_inputRows;
+    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
+
+    m_rowPaddingTop = tensor.impl().rowPaddingTop();
+    m_colPaddingLeft = tensor.impl().colPaddingLeft();
+
+    m_fastPatchRowStride =
+        internal::TensorIntDivisor<Index>(m_patch_row_stride);
+    m_fastPatchColStride =
+        internal::TensorIntDivisor<Index>(m_patch_col_stride);
+    m_fastInputRowStride =
+        internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
+    m_fastInputColStride =
+        internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
+    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
+  }
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper)
+      : m_impl(base_mapper.m_impl) {
+    m_patch_cols = base_mapper.m_patch_cols;
+    m_num_patches = base_mapper.m_num_patches;
+
+    m_patch_row_stride = base_mapper.m_patch_row_stride;
+    m_patch_col_stride = base_mapper.m_patch_col_stride;
+
+    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
+    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
+
+    m_colStride = base_mapper.m_colStride;
+
+    m_rowInputStride = base_mapper.m_rowInputStride;
+    m_colInputStride = base_mapper.m_colInputStride;
+    m_patchInputStride = base_mapper.m_patchInputStride;
+
+    m_inputRows = base_mapper.m_inputRows;
+    m_inputCols = base_mapper.m_inputCols;
+
+    m_outputRows = base_mapper.m_outputRows;
+    m_row_strides = base_mapper.m_row_strides;
+    m_col_strides = base_mapper.m_col_strides;
+
+    m_in_row_strides = base_mapper.m_in_row_strides;
+    m_in_col_strides = base_mapper.m_in_col_strides;
+
+    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
+    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
+
+    m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
+    m_fastPatchColStride = base_mapper.m_fastPatchColStride;
+    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
+    m_fastInputColStride = base_mapper.m_fastInputColStride;
+    m_fastNumPatches = base_mapper.m_fastNumPatches;
+    m_fastColStride = base_mapper.m_fastColStride;
+    m_fastOutputRows = base_mapper.m_fastOutputRows;
+    m_fastDimZero = base_mapper.m_fastDimZero;
+  }
+
+  // If true, turns off some optimizations for loading packets since the image
+  // patches are "non-standard" such as there are non-trivial strides or
+  // inflations in the input.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
+    return m_in_row_strides != 1 || m_in_col_strides != 1 ||
+           m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+    return loadCoeff(row, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load the coefficient at the patchIndex location instead of the usual
+  // m_rowIndex,
+  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
+  // EIGEN_DEVICE_FUNC
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+    return loadCoeff(row, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+    return loadPacket(row, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
+  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+    return loadPacket(row, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
+
+ private:
+  friend class TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>;
+
+  // Load coefficient from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex,
+                                       Index colIndex, Index otherIndex) const {
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex + colOffset * m_in_col_strides;
+    const Index origInputCol =
+        (m_patch_col_inflate_strides == 1)
+            ? inputCol
+            : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
+    const Index origInputRow =
+        (m_patch_row_inflate_strides == 1)
+            ? inputRow
+            : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+    if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
+        origInputRow >= m_inputRows ||
+        (inputCol != origInputCol * m_patch_col_inflate_strides) ||
+        (inputRow != origInputRow * m_patch_row_inflate_strides)) {
+      return Scalar(0);
+    }
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + origInputRow * m_rowInputStride +
+                             origInputCol * m_colInputStride + otherIndex;
+    return m_impl.coeff(inputIndex);
+  }
+
+  // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
+  // and `in_strides` equal to 1 (template specialization without templates).
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex,
+                                               Index colIndex,
+                                               Index otherIndex) const {
+    eigen_assert(!nonStandardPatches());
+
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 ||
+        inputRow >= m_inputRows) {
+      return Scalar(0);
+    }
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + inputRow * m_rowInputStride +
+                             inputCol * m_colInputStride + otherIndex;
+    return m_impl.coeff(inputIndex);
+  }
+
+  // Load packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex,
+                                        Index colIndex,
+                                        Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    if (nonStandardPatches()) {
+      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+    }
+    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
+                                                Index colIndex,
+                                                Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+    } else {
+      // Offsets and input calculation here are identical to
+      // loadCoeffStandard(...), but repeated twice.
+
+      const Index patchOffsets[2] = {
+          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+
+      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                   patchOffsets[1] / m_fastColStride};
+      const Index inputCols[2] = {colIndex + colOffsets[0],
+                                  colIndex + colOffsets[1]};
+      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+        // all zeros
+        return internal::pset1<Packet>(Scalar(0));
+      }
+
+      if (inputCols[0] == inputCols[1]) {
+        const Index rowOffsets[2] = {
+            patchOffsets[0] - colOffsets[0] * m_colStride,
+            patchOffsets[1] - colOffsets[1] * m_colStride};
+        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+        const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                    rowIndex + rowOffsets[1]};
+
+        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+          // all zeros
+          return internal::pset1<Packet>(Scalar(0));
+        }
+
+        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+          // no padding
+          const Index depth = patchId - patchOffsets[0] * patchDepth();
+          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
+                                   inputCols[0] * m_colInputStride + otherIndex;
+          return m_impl.template packet<Unaligned>(inputIndex);
+        }
+      }
+    }
+    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex,
+                                            Index colIndex,
+                                            Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    eigen_assert(!nonStandardPatches());
+    eigen_assert((patchDepth() % packetSize) == 0);
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+    eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
+
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols ||
+        inputRow >= m_inputRows) {
+      // all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    // no padding
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + inputRow * m_rowInputStride +
+                             inputCol * m_colInputStride + otherIndex;
+    return m_impl.template packet<Unaligned>(inputIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(
+      Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX
+    typename internal::remove_const<Scalar>::type values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
+    }
+    Packet rslt = internal::pload<Packet>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(
+      Index patchIndex, Index& rowIndex, Index& colIndex,
+      Index& otherIndex) const {
+    const size_t NumInputDims = array_size<
+        typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
+    const Index patch2DIndex = (NumInputDims == 3)
+                                   ? patchIndex
+                                   : (patchIndex - otherIndex * m_num_patches);
+    otherIndex *= m_patchInputStride;
+    colIndex = patch2DIndex / m_fastOutputRows;
+    rowIndex = patch2DIndex - colIndex * m_outputRows;
+    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
+    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
+  }
+
+  Index m_patch_cols;   // number of columns in the patch
+  Index m_num_patches;  // number of patches to extract.
+
+  // Strides for navigating through the single patch.
+  Index m_patch_row_stride;
+  Index m_patch_col_stride;
+  internal::TensorIntDivisor<Index> m_fastPatchRowStride;
+  internal::TensorIntDivisor<Index> m_fastPatchColStride;
+
+  Index m_patch_row_inflate_strides;  // the strides for row inflation in the
+                                      // image patch
+  Index m_patch_col_inflate_strides;  // the strides for col inflation in the
+                                      // image patch
+  // Fast representation of inflation strides.
+  internal::TensorIntDivisor<Index> m_fastInputRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputColStride;
+
+  Index m_otherStride;
+  Index m_colStride;
+  internal::TensorIntDivisor<Index> m_fastNumPatches;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+
+  Index m_rowInputStride;    // row stride in the input tensor
+  Index m_colInputStride;    // col stride in the input tensor
+  Index m_patchInputStride;  // patch stride in the input tensor
+
+  Index m_inputRows;  // Number of rows in the input tensor
+  Index m_inputCols;  // Number of cols in the input tensor
+
+  Index m_outputRows;  // Number of patch rows
+
+  Index m_row_strides;  // User specified row stride
+  Index m_col_strides;  // User specified col stride
+
+  Index m_in_row_strides;  // User specified input row stride
+  Index m_in_col_strides;  // User specified input col stride
+
+  Index m_rowPaddingTop;   // Row padding
+  Index m_colPaddingLeft;  // Column padding
+
+  internal::TensorIntDivisor<Index> m_fastOutputRows;
+  internal::TensorIntDivisor<Index> m_fastDimZero;
+
+  const TensorEvaluator<ArgType, Device> m_impl;
+};
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper<
+    Scalar, Index, Side,
+    TensorEvaluator<
+        const TensorReshapingOp<NewDimension,
+                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
+        Device>,
+    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+    inner_dim_reordered, Alignment> {
+ public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  typedef TensorContractionInputMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      ParentMapper;
+
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      Self;
+
+  typedef Self LinearMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
+      const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_depth_offset(vert_offset),
+        m_col_offset(horiz_offset),
+        m_base_mapper(base_mapper) {
+    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
+                                     m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
+      const Self& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+        m_col_offset(horiz_offset + base_mapper.m_col_offset),
+        m_base_mapper(base_mapper.m_base_mapper) {
+    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
+                                     m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex,
+                                   m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i,
+                                                          Index j) const {
+    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex,
+                                    m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i,
+                                                          Index j) const {
+    return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
+                                                        j + m_col_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
+  loadCoeffStandard(Index i) const {
+    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
+                                           m_colIndex, m_otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
+    return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex,
+                                        m_colIndex, m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
+  loadPacketStandard(Index i) const {
+    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
+                                            m_colIndex, m_otherIndex);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index) const {
+    return false;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
+    return m_base_mapper.nonStandardPatches();
+  }
+
+  // Max(Col|Row|Depth): compute the upper limit for the column, row and depth
+  // index respectively that fits into the peeled_k elements starting at
+  // m_depth_offset.
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
+    const Index max_col =
+        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
+        fastPatchColStride();
+    return std::min<Index>(1 + max_col, patchCols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
+                                   const Index col) const {
+    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
+                           col * patchColStride()) /
+                          fastPatchRowStride();
+    return std::min<Index>(1 + max_row, patchRows());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
+                                     Index row) const {
+    const Index max_depth = m_depth_offset + peeled_k -  //
+                            col * patchColStride() -     //
+                            row * patchRowStride();
+    return std::min<Index>(max_depth, patchDepth());
+  }
+
+  // MaxDepth uses only the remaining number of elements in the peeled_k.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
+                                     const Index start_depth) const {
+    return std::min<Index>(start_depth + num_elements, patchDepth());
+  }
+
+  // Every register matters in this code, so sometimes to prevent register
+  // spilling, instead of the variable that you would expect to see, we use
+  // another one, that is guaranteed to have the same value. E.g. patch depth is
+  // always the same as input depth, and it's also the same as input row stride.
+  // Bunch of other parameters have similar relations.
+
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchDepth() const {
+    return m_base_mapper.m_rowInputStride;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRows() const {
+    return m_base_mapper.m_colStride;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchCols() const {
+    return m_base_mapper.m_patch_cols;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRowStride() const {
+    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+                 "Patch depth must be equal to patch row stride.");
+    return patchDepth();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchColStride() const {
+    return m_base_mapper.m_patch_col_stride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const {
+    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+                 "Patch depth must be equal to patch row stride.");
+    return m_base_mapper.m_fastDimZero;  // patch_depth
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const {
+    return m_base_mapper.m_fastPatchColStride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
+                                             const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
+    const Index r = m_rowIndex + row;
+    return r < 0 || r >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
+                                     const Index last_row) const {
+    return m_rowIndex + first_row < 0 ||
+           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
+    const Index c = m_colIndex + col;
+    return c < 0 || c >= m_base_mapper.m_inputCols;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
+    const Index r = m_rowIndex + row;
+    const Index c = m_colIndex + col;
+    return r * m_base_mapper.m_rowInputStride +
+           c * m_base_mapper.m_colInputStride + m_otherIndex;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowStride() const {
+    return m_base_mapper.m_row_strides;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colStride() const {
+    return m_base_mapper.m_col_strides;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowOffset() const {
+    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+    return patchOffset - colOffset * m_base_mapper.m_colStride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colOffset() const {
+    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+    return colOffset;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index depthOffset() const {
+    return m_depth_offset % patchDepth();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper
+  getLinearMapper(Index i, Index j) const {
+    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
+  }
+
+ private:
+  Index m_depth_offset;  // First row in the input matrix
+  Index m_col_offset;    // First col in the input matrix
+
+  // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
+  // indices for the first element in a patch specified by col_offset
+  // (see computeBaseIndices(...) for details).
+  Index m_rowIndex;
+  Index m_colIndex;
+  Index m_otherIndex;
+
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
+};
+
+// Arrange a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted image patches) in contiguous memory.
+//
+// Given column major input (A0 beside A1 in memory):
+// A0 B0 C0 D0  E0 F0 G0 H0 ... Z0
+// A1 B1 C1 D1  E1 F1 G1 H1 ... Z1
+// A2 B2 C2 D2  E2 F2 G2 H2 ... Z2
+// A3 B3 C3 D3  E3 F3 G3 H3 ... Z3
+// A4 B4 C4 D4  E4 F4 G4 H4 ... Z4
+// A5 B5 C5 D5  E5 F5 G5 H5 ... Z5
+// A6 B6 C6 D6  E6 F6 G6 H6 ... Z6
+// A7 B7 C7 D7  E7 F7 G7 H7 ... Z7
+// A8 ...
+// ...
+//
+// *) A, B, C, ... - patches extracted from the original input.
+// *) A0, A1, A2 ... - values from the same patch at different offsets.
+//
+// The traversal (packed rhs memory) order (B0 besides A0 in memory):
+// A0 B0 C0 D0 A1 B1 C1 D1 ...
+// E0 F0 G0 H0 E1 F1 G1 H1 ...
+// ...
+// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
+//
+// This traversal order must be the same as in default gemm_pack_rhs defined in
+// GeneralBlockPanelKernel.h.
+//
+// *) nr - number of registers along the 'n' dimension.
+//    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
+//    Multiplication" paper.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if ((packet_size % 4) == 0 && !non_standard_patches) {
+        // FAST PATH:
+        // Iterate over patch columns and rows, if we know that a single
+        // packet do not span across multiple rows or columns.
+        if ((rhs.patchDepth() % packet_size) == 0) {
+          const Index start_col = rhs.colOffset();
+          const Index max_col = rhs.maxCol(peeled_k);
+
+          for (Index c = start_col; c < max_col; ++c) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+            const Index max_row = rhs.maxRow(peeled_k, c);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+
+            // Check if we can squeeze reads along the `row` and `depth`
+            // dimensions (two innermost dimensions).
+            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
+            for (Index r = start_row; r < max_row; ++r) {
+              eigen_assert(k <= peeled_k);
+
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index start_depth = ((c == start_col) && (r == start_row))
+                                            ? rhs.depthOffset()
+                                            : 0;
+              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          // The loop above should fill peeled_k elements.
+          eigen_assert(peeled_k == k);
+
+        } else {
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 4> kernel;
+            kernel.packet[0] = dm0.loadPacketStandard(k);
+            kernel.packet[1] = dm1.loadPacketStandard(k);
+            kernel.packet[2] = dm2.loadPacketStandard(k);
+            kernel.packet[3] = dm3.loadPacketStandard(k);
+            ptranspose(kernel);
+            pstoreu(block + 0 * packet_size, kernel.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel.packet[1]);
+            pstoreu(block + 2 * packet_size, kernel.packet[2]);
+            pstoreu(block + 3 * packet_size, kernel.packet[3]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+
+      // Copy the remaining coefficients of the column block after the peeled_k.
+      if (!rhs.nonStandardPatches()) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const int packet_size = 2;
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if (!non_standard_patches) {
+        // FAST PATH:
+        // Iterate over patch columns and rows if we know that a single
+        // packet do not span across multiple rows or columns.
+        if ((rhs.patchDepth() % packet_size) == 0) {
+          const Index start_col = rhs.colOffset();
+          const Index max_col = rhs.maxCol(peeled_k);
+
+          for (Index c = start_col; c < max_col; ++c) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+            const Index max_row = rhs.maxRow(peeled_k, c);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
+            for (Index r = start_row; r < max_row; ++r) {
+              eigen_assert(k <= peeled_k);
+
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index start_depth = ((c == start_col) && (r == start_row))
+                                            ? rhs.depthOffset()
+                                            : 0;
+              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          // The loop above should fill peeled_k elements.
+          eigen_assert(peeled_k == k);
+
+        } else {
+          // Packet can span multiple rows or columns, so we have to go
+          // though the slower "standard" path.
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketStandard(k);
+            kernel0.packet[1] = dm1.loadPacketStandard(k);
+            kernel1.packet[0] = dm2.loadPacketStandard(k);
+            kernel1.packet[1] = dm3.loadPacketStandard(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+
+      // Copy the remaining coefficients of the column block after the peeled_k.
+      if (!non_standard_patches) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // Copy the remaining columns one at a time (nr==1).
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
+// Special case for non-vectorized types such as float16.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const Index packet_cols4 = (cols / 4) * 4;
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      if (!rhs.nonStandardPatches()) {
+        for (Index k = 0; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (Index k = 0; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // Copy the remaining columns one at a time (nr==1).
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+}  // end namespace internal
+
+/** SpatialConvolution
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
+ * (channels, height, width, and optionally others)
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * kernel_height, kernel_width)
+ * The input and the kernel must both be in col-major layout. The result will
+ * also be in col-major layout.
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
+ * pixels.
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
+ * input. The dimensions of the result will be filters, height, width (and
+ * others if applicable).
+ *
+ * It is possible to swap the order of the width and height dimensions provided
+ * that the same order is used in the input, the kernel, and the output.
+ *
+ * It is also possible to add an output kernel to the contraction, output
+ * kernel is called by Eigen when it "finalizes" the block of an output tensor.
+ *
+ */
+template <typename Input, typename Kernel,
+          typename OutputKernel = const NoOpOutputKernel>
+EIGEN_DEVICE_FUNC
+    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
+        internal::traits<Input>::Layout == ColMajor,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const OutputKernel> >,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel>,
+                const OutputKernel> > >::type
+    SpatialConvolution(const Input& input, const Kernel& kernel,
+                       const Index row_stride = 1, const Index col_stride = 1,
+                       const PaddingType padding_type = PADDING_SAME,
+                       const Index row_in_stride = 1,
+                       const Index col_in_stride = 1,
+                       const OutputKernel& output_kernel = OutputKernel()) {
+  typedef typename internal::traits<Input>::Index TensorIndex;
+  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
+                   internal::traits<Input>::NumDimensions,
+                   internal::traits<Input>::Layout, TensorIndex> >
+      in(input);
+  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
+                   internal::traits<Kernel>::NumDimensions,
+                   internal::traits<Kernel>::Layout, TensorIndex> >
+      kern(kernel);
+
+  EIGEN_STATIC_ASSERT(
+      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
+      YOU_MADE_A_PROGRAMMING_MISTAKE)
+  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
+
+  const int NumDims = internal::traits<Input>::NumDimensions;
+
+  // Number of filters to apply. This is the same as the output depth of the
+  // result
+  const TensorIndex kernelFilters =
+      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
+  // Number of channels. This is the same as the input depth.
+  const TensorIndex kernelChannels =
+      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
+  const TensorIndex kernelRows =
+      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
+  const TensorIndex kernelCols =
+      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
+
+  const Index kernelRowsEff =
+      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
+  const Index kernelColsEff =
+      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
+
+  array<IndexPair<TensorIndex>, 1> contract_dims;
+  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
+
+  const TensorIndex InputRows =
+      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
+  const TensorIndex InputCols =
+      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+
+  TensorIndex out_height;
+  TensorIndex out_width;
+  switch (padding_type) {
+    case PADDING_VALID:
+      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
+                                static_cast<float>(row_stride));
+      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
+                               static_cast<float>(col_stride));
+      break;
+    case PADDING_SAME:
+      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
+      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
+      break;
+    default:
+      // Initialize unused variables to avoid a compiler warning
+      out_height = 0;
+      out_width = 0;
+      eigen_assert(false && "unexpected padding");
+  }
+
+  // Molds the output of the patch extraction code into a 2d tensor:
+  // - the first dimension (dims[0]): the patch values to be multiplied with the
+  // kernels
+  // - the second dimension (dims[1]): everything else
+  DSizes<TensorIndex, 2> pre_contract_dims;
+  if (isColMajor) {
+    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[1] = out_height * out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      pre_contract_dims[1] *= in.dimension(i);
+    }
+  } else {
+    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[0] = out_height * out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      pre_contract_dims[0] *= in.dimension(i);
+    }
+  }
+
+  // Molds the output of the contraction into the shape expected by the used
+  // (assuming this is ColMajor):
+  // - 1st dim: kernel filters
+  // - 2nd dim: output height
+  // - 3rd dim: output width
+  // - 4th dim and beyond: everything else including batch size
+  DSizes<TensorIndex, NumDims> post_contract_dims;
+  if (isColMajor) {
+    post_contract_dims[0] = kernelFilters;
+    post_contract_dims[1] = out_height;
+    post_contract_dims[2] = out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  } else {
+    post_contract_dims[NumDims - 1] = kernelFilters;
+    post_contract_dims[NumDims - 2] = out_height;
+    post_contract_dims[NumDims - 3] = out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  }
+
+  DSizes<TensorIndex, 2> kernel_dims;
+  if (isColMajor) {
+    kernel_dims[0] = kernelFilters;
+    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+  } else {
+    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+    kernel_dims[1] = kernelFilters;
+  }
+  return choose(
+      Cond<internal::traits<Input>::Layout == ColMajor>(),
+      kernel.reshape(kernel_dims)
+          .contract(input
+                        .extract_image_patches(
+                            kernelRows, kernelCols, row_stride, col_stride,
+                            row_in_stride, col_in_stride, padding_type)
+                        .reshape(pre_contract_dims),
+                    contract_dims, output_kernel)
+          .reshape(post_contract_dims),
+      input
+          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                                 row_in_stride, col_in_stride, padding_type)
+          .reshape(pre_contract_dims)
+          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+          .reshape(post_contract_dims));
+}
+
+}  // end namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 8b198139400a6d2ce2795f9ef0b5793114a78e0b..f955bc77b80bfba861ef0ab8639267872051120b 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -20,1301 +20,19 @@ limitations under the License.
 
 #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
-#endif
 
 namespace Eigen {
-
 namespace internal {
-
-// WARNING: Most of the code here implicitly assumes that the matrix is in
-// ColMajor layout. This is guaranteed by the tensor contraction (see
-// TensorContraction.h).
-//
-// Inside Eigen a tensor contraction is represented by a matrix multiplication.
-// We don't want to actually extract image patches and reshape the result into
-// a matrix (this involves allocating huge extra memory), so the patch
-// extraction and reshape operations are implicit.
-//
-// TensorContractionInputMapper takes a matrix index and returns the coefficient
-// (or the packet) of the "virtual tensor", that would be at that index if we
-// were to actually reshape the result of patch extraction.
-//
-// TensorContractionSubMapper provides a similar view into the "virtual matrix"
-// at the given vertical and horizontal offsets.
-//
-// "Virtual matrix" dimensions:
-//   *0: kernelChannels * kernelRows * kernelCols;
-//    1: out_height * out_width; * OTHERS (e.g batches, etc...)
-//
-// *) extracted patches are continuous in memory (innermost dimension assuming
-//    col major layout)
-//
-// With this dimensions:
-//   row - offset within a single patch (in code: patchId)
-//   col - index of the extracted patch (in code: patchIndex)
-//         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
-//
-// TODO(ezhulenev): Consolidate this part of the code with the image patch
-// extraction code since they are both very similar.
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar_, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper<
-    Scalar_, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef Scalar_ Scalar;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-
-  typedef SubMapper VectorMapper;
-  typedef SubMapper LinearMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(
-      const TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>& tensor,
-      const nocontract_t&, const nocontract_t&, const contract_t&,
-      const contract_t&)
-      : m_impl(tensor.impl().impl()) {
-    Index patch_rows;
-    Index patch_depth;
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      patch_depth = tensor.impl().dimensions()[0];
-      patch_rows = tensor.impl().dimensions()[1];
-      m_patch_cols = tensor.impl().dimensions()[2];
-      m_num_patches = tensor.impl().dimensions()[3];
-    } else {
-      const size_t NumDims = tensor.impl().dimensions().size();
-      patch_depth = tensor.impl().dimensions()[NumDims - 1];
-      patch_rows = tensor.impl().dimensions()[NumDims - 2];
-      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
-      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
-    }
-
-    // Strides for navigating through the single patch.
-    m_patch_row_stride = patch_depth;
-    m_patch_col_stride = patch_rows * m_patch_row_stride;
-
-    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
-    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
-
-    m_colStride = patch_rows;
-
-    m_outputRows = tensor.impl().outputRows();
-    m_row_strides = tensor.impl().userRowStride();
-    m_col_strides = tensor.impl().userColStride();
-
-    m_in_row_strides = tensor.impl().userInRowStride();
-    m_in_col_strides = tensor.impl().userInColStride();
-
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      m_inputRows = tensor.impl().impl().dimensions()[1];
-      m_inputCols = tensor.impl().impl().dimensions()[2];
-    } else {
-      const int NumDims = tensor.impl().impl().dimensions().size();
-      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
-      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
-    }
-
-    m_rowInputStride = patch_depth;
-    m_colInputStride = patch_depth * m_inputRows;
-    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
-
-    m_rowPaddingTop = tensor.impl().rowPaddingTop();
-    m_colPaddingLeft = tensor.impl().colPaddingLeft();
-
-    m_fastPatchRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_stride);
-    m_fastPatchColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_stride);
-    m_fastInputRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
-    m_fastInputColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
-    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
-    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
-    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
-    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
-  }
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper)
-      : m_impl(base_mapper.m_impl) {
-    m_patch_cols = base_mapper.m_patch_cols;
-    m_num_patches = base_mapper.m_num_patches;
-
-    m_patch_row_stride = base_mapper.m_patch_row_stride;
-    m_patch_col_stride = base_mapper.m_patch_col_stride;
-
-    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
-    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
-
-    m_colStride = base_mapper.m_colStride;
-
-    m_rowInputStride = base_mapper.m_rowInputStride;
-    m_colInputStride = base_mapper.m_colInputStride;
-    m_patchInputStride = base_mapper.m_patchInputStride;
-
-    m_inputRows = base_mapper.m_inputRows;
-    m_inputCols = base_mapper.m_inputCols;
-
-    m_outputRows = base_mapper.m_outputRows;
-    m_row_strides = base_mapper.m_row_strides;
-    m_col_strides = base_mapper.m_col_strides;
-
-    m_in_row_strides = base_mapper.m_in_row_strides;
-    m_in_col_strides = base_mapper.m_in_col_strides;
-
-    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
-    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
-
-    m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
-    m_fastPatchColStride = base_mapper.m_fastPatchColStride;
-    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
-    m_fastInputColStride = base_mapper.m_fastInputColStride;
-    m_fastNumPatches = base_mapper.m_fastNumPatches;
-    m_fastColStride = base_mapper.m_fastColStride;
-    m_fastOutputRows = base_mapper.m_fastOutputRows;
-    m_fastDimZero = base_mapper.m_fastDimZero;
-  }
-
-  // If true, turns off some optimizations for loading packets since the image
-  // patches are "non-standard" such as there are non-trivial strides or
-  // inflations in the input.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_in_row_strides != 1 || m_in_col_strides != 1 ||
-           m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
-    return SubMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the coefficient at the patchIndex location instead of the usual
-  // m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  // EIGEN_DEVICE_FUNC
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
-    return m_impl;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
-
- private:
-  friend class TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>;
-
-  // Load coefficient from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex,
-                                       Index colIndex, Index otherIndex) const {
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset * m_in_col_strides;
-    const Index origInputCol =
-        (m_patch_col_inflate_strides == 1)
-            ? inputCol
-            : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
-    const Index origInputRow =
-        (m_patch_row_inflate_strides == 1)
-            ? inputRow
-            : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
-        origInputRow >= m_inputRows ||
-        (inputCol != origInputCol * m_patch_col_inflate_strides) ||
-        (inputRow != origInputRow * m_patch_row_inflate_strides)) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + origInputRow * m_rowInputStride +
-                             origInputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
-  // and `in_strides` equal to 1 (template specialization without templates).
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex,
-                                               Index colIndex,
-                                               Index otherIndex) const {
-    eigen_assert(!nonStandardPatches());
-
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 ||
-        inputRow >= m_inputRows) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // Load packet from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex,
-                                        Index colIndex,
-                                        Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    if (nonStandardPatches()) {
-      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-    }
-    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
-                                                Index colIndex,
-                                                Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-
-    if ((patchDepth() % packetSize) == 0) {
-      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
-    } else {
-      // Offsets and input calculation here are identical to
-      // loadCoeffStandard(...), but repeated twice.
-
-      const Index patchOffsets[2] = {
-          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
-
-      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
-                                   patchOffsets[1] / m_fastColStride};
-      const Index inputCols[2] = {colIndex + colOffsets[0],
-                                  colIndex + colOffsets[1]};
-      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
-        // all zeros
-        return internal::pset1<Packet>(Scalar(0));
-      }
-
-      if (inputCols[0] == inputCols[1]) {
-        const Index rowOffsets[2] = {
-            patchOffsets[0] - colOffsets[0] * m_colStride,
-            patchOffsets[1] - colOffsets[1] * m_colStride};
-        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-        const Index inputRows[2] = {rowIndex + rowOffsets[0],
-                                    rowIndex + rowOffsets[1]};
-
-        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
-          // all zeros
-          return internal::pset1<Packet>(Scalar(0));
-        }
-
-        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
-          // no padding
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
-                                   inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
-      }
-    }
-    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex,
-                                            Index colIndex,
-                                            Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-    eigen_assert((patchDepth() % packetSize) == 0);
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols ||
-        inputRow >= m_inputRows) {
-      // all zeros
-      return internal::pset1<Packet>(Scalar(0));
-    }
-    // no padding
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(
-      Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const int packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_MAX
-    typename internal::remove_const<Scalar>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
-    }
-    Packet rslt = internal::pload<Packet>(values);
-    return rslt;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(
-      Index patchIndex, Index& rowIndex, Index& colIndex,
-      Index& otherIndex) const {
-    const size_t NumInputDims = array_size<
-        typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
-    const Index patch2DIndex = (NumInputDims == 3)
-                                   ? patchIndex
-                                   : (patchIndex - otherIndex * m_num_patches);
-    otherIndex *= m_patchInputStride;
-    colIndex = patch2DIndex / m_fastOutputRows;
-    rowIndex = patch2DIndex - colIndex * m_outputRows;
-    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
-    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
-  }
-
-  Index m_patch_cols;   // number of columns in the patch
-  Index m_num_patches;  // number of patches to extract.
-
-  // Strides for navigating through the single patch.
-  Index m_patch_row_stride;
-  Index m_patch_col_stride;
-  internal::TensorIntDivisor<Index> m_fastPatchRowStride;
-  internal::TensorIntDivisor<Index> m_fastPatchColStride;
-
-  Index m_patch_row_inflate_strides;  // the strides for row inflation in the
-                                      // image patch
-  Index m_patch_col_inflate_strides;  // the strides for col inflation in the
-                                      // image patch
-  // Fast representation of inflation strides.
-  internal::TensorIntDivisor<Index> m_fastInputRowStride;
-  internal::TensorIntDivisor<Index> m_fastInputColStride;
-
-  Index m_otherStride;
-  Index m_colStride;
-  internal::TensorIntDivisor<Index> m_fastNumPatches;
-  internal::TensorIntDivisor<Index> m_fastColStride;
-
-  Index m_rowInputStride;    // row stride in the input tensor
-  Index m_colInputStride;    // col stride in the input tensor
-  Index m_patchInputStride;  // patch stride in the input tensor
-
-  Index m_inputRows;  // Number of rows in the input tensor
-  Index m_inputCols;  // Number of cols in the input tensor
-
-  Index m_outputRows;  // Number of patch rows
-
-  Index m_row_strides;  // User specified row stride
-  Index m_col_strides;  // User specified col stride
-
-  Index m_in_row_strides;  // User specified input row stride
-  Index m_in_col_strides;  // User specified input col stride
-
-  Index m_rowPaddingTop;   // Row padding
-  Index m_colPaddingLeft;  // Column padding
-
-  internal::TensorIntDivisor<Index> m_fastOutputRows;
-  internal::TensorIntDivisor<Index> m_fastDimZero;
-
-  const TensorEvaluator<ArgType, Device> m_impl;
-};
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper<
-    Scalar, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      ParentMapper;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef Self LinearMapper;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset),
-        m_col_offset(horiz_offset),
-        m_base_mapper(base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
-        m_col_offset(horiz_offset + base_mapper.m_col_offset),
-        m_base_mapper(base_mapper.m_base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
-    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                   m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i,
-                                                          Index j) const {
-    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                    m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i,
-                                                          Index j) const {
-    return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
-                                                        j + m_col_offset);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
-  loadCoeffStandard(Index i) const {
-    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
-                                           m_colIndex, m_otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
-    return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex,
-                                        m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
-  loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
-                                            m_colIndex, m_otherIndex);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC bool aligned(Index) const {
-    return false;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_base_mapper.nonStandardPatches();
-  }
-
-  // Max(Col|Row|Depth): compute the upper limit for the column, row and depth
-  // index respectively that fits into the peeled_k elements starting at
-  // m_depth_offset.
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
-    const Index max_col =
-        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
-        fastPatchColStride();
-    return std::min<Index>(1 + max_col, patchCols());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
-                                   const Index col) const {
-    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
-                           col * patchColStride()) /
-                          fastPatchRowStride();
-    return std::min<Index>(1 + max_row, patchRows());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
-                                     Index row) const {
-    const Index max_depth = m_depth_offset + peeled_k -  //
-                            col * patchColStride() -     //
-                            row * patchRowStride();
-    return std::min<Index>(max_depth, patchDepth());
-  }
-
-  // MaxDepth uses only the remaining number of elements in the peeled_k.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
-                                     const Index start_depth) const {
-    return std::min<Index>(start_depth + num_elements, patchDepth());
-  }
-
-  // Every register matters in this code, so sometimes to prevent register
-  // spilling, instead of the variable that you would expect to see, we use
-  // another one, that is guaranteed to have the same value. E.g. patch depth is
-  // always the same as input depth, and it's also the same as input row stride.
-  // Bunch of other parameters have similar relations.
-
-  typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const {
-    return m_base_mapper.m_rowInputStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const {
-    return m_base_mapper.m_colStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const {
-    return m_base_mapper.m_patch_cols;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return patchDepth();
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchColStride() const {
-    return m_base_mapper.m_patch_col_stride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return m_base_mapper.m_fastDimZero;  // patch_depth
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const {
-    return m_base_mapper.m_fastPatchColStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
-                                            const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.coeff(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
-    const Index r = m_rowIndex + row;
-    return r < 0 || r >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
-                                     const Index last_row) const {
-    return m_rowIndex + first_row < 0 ||
-           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
-    const Index c = m_colIndex + col;
-    return c < 0 || c >= m_base_mapper.m_inputCols;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
-    const Index r = m_rowIndex + row;
-    const Index c = m_colIndex + col;
-    return r * m_base_mapper.m_rowInputStride +
-           c * m_base_mapper.m_colInputStride + m_otherIndex;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowStride() const {
-    return m_base_mapper.m_row_strides;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colStride() const {
-    return m_base_mapper.m_col_strides;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return patchOffset - colOffset * m_base_mapper.m_colStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return colOffset;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index depthOffset() const {
-    return m_depth_offset % patchDepth();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper
-  getLinearMapper(Index i, Index j) const {
-    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
-  }
-
- private:
-  Index m_depth_offset;  // First row in the input matrix
-  Index m_col_offset;    // First col in the input matrix
-
-  // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
-  // indices for the first element in a patch specified by col_offset
-  // (see computeBaseIndices(...) for details).
-  Index m_rowIndex;
-  Index m_colIndex;
-  Index m_otherIndex;
-
-  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
-                                     // performs better in benchmarks.
-};
-
-// Arrange a block of the right input matrix (in our case it's always a "virtual
-// matrix" constructed from extracted image patches) in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0  E0 F0 G0 H0 ... Z0
-// A1 B1 C1 D1  E1 F1 G1 H1 ... Z1
-// A2 B2 C2 D2  E2 F2 G2 H2 ... Z2
-// A3 B3 C3 D3  E3 F3 G3 H3 ... Z3
-// A4 B4 C4 D4  E4 F4 G4 H4 ... Z4
-// A5 B5 C5 D5  E5 F5 G5 H5 ... Z5
-// A6 B6 C6 D6  E6 F6 G6 H6 ... Z6
-// A7 B7 C7 D7  E7 F7 G7 H7 ... Z7
-// A8 ...
-// ...
-//
-// *) A, B, C, ... - patches extracted from the original input.
-// *) A0, A1, A2 ... - values from the same patch at different offsets.
-//
-// The traversal (packed rhs memory) order (B0 besides A0 in memory):
-// A0 B0 C0 D0 A1 B1 C1 D1 ...
-// E0 F0 G0 H0 E1 F1 G1 H1 ...
-// ...
-// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
-//
-// This traversal order must be the same as in default gemm_pack_rhs defined in
-// GeneralBlockPanelKernel.h.
-//
-// *) nr - number of registers along the 'n' dimension.
-//    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
-//    Multiplication" paper.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-        inner_dim_reordered, Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if ((packet_size % 4) == 0 && !non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows, if we know that a single
-        // packet do not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // Check if we can squeeze reads along the `row` and `depth`
-            // dimensions (two innermost dimensions).
-            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 4> kernel;
-            kernel.packet[0] = dm0.loadPacketStandard(k);
-            kernel.packet[1] = dm1.loadPacketStandard(k);
-            kernel.packet[2] = dm2.loadPacketStandard(k);
-            kernel.packet[3] = dm3.loadPacketStandard(k);
-            ptranspose(kernel);
-            pstoreu(block + 0 * packet_size, kernel.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel.packet[1]);
-            pstoreu(block + 2 * packet_size, kernel.packet[2]);
-            pstoreu(block + 3 * packet_size, kernel.packet[3]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!rhs.nonStandardPatches()) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // copy the remaining columns one at a time (nr==1)
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Template specialization for packet_size = 2. We must special-case packet
-// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const int packet_size = 2;
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if (!non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows if we know that a single
-        // packet do not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // We can squeeze reads along the `row` and `depth` dimensions if
-            // the row stride is `1`, which means that `row` and `depth`
-            // dimensions are contiguous (two innermost dimensions).
-            if (rhs.rowStride() == 1 &&                                //
-                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          // Packet can span multiple rows or columns, so we have to go
-          // though the slower "standard" path.
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 2> kernel0;
-            PacketBlock<Packet, 2> kernel1;
-            kernel0.packet[0] = dm0.loadPacketStandard(k);
-            kernel0.packet[1] = dm1.loadPacketStandard(k);
-            kernel1.packet[0] = dm2.loadPacketStandard(k);
-            kernel1.packet[1] = dm3.loadPacketStandard(k);
-            ptranspose(kernel0);
-            ptranspose(kernel1);
-            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!non_standard_patches) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Special case for non-vectorized types such as float16.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      if (!rhs.nonStandardPatches()) {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
-// Arrange a block of the right input matrix (in our case it's always a
+// Pack a block of the right input matrix (in our case it's always a
 // "virtual matrix" constructed from extracted image patches) in contiguous
-// memory.
-//
-// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
-// this is basically the same as taking a slice of the matrix. Knowing
-// properties of the original patch op we can do it more efficient than default
-// mkldnn_gemm_pack.
+// block in column-major storage order. Knowing the properties of the
+// original patch op we can do it more efficient than the default
+// gemm_pack_colmajor_block.
 template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
           typename Device, typename Scalar, typename StorageIndex,
           typename nocontract_t, typename contract_t, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-struct mkldnn_gemm_pack<
+struct gemm_pack_colmajor_block<
     Scalar, StorageIndex,
     TensorContractionSubMapper<
         Scalar, StorageIndex, Rhs,
@@ -1503,204 +221,12 @@ struct mkldnn_gemm_pack<
     }
   }
 };
-#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
-
 }  // end namespace internal
-
-/** SpatialConvolution
- * \ingroup CXX11_NeuralNetworks_Module
- *
- * \brief Applies a 2D convolution over a multichannel input image.
- *
- * The input parameter is expected to be a tensor with a rank of 3 or more
- * (channels, height, width, and optionally others)
- * The kernel parameter is expected to be a 4D tensor (filters, channels,
- * kernel_height, kernel_width)
- * The input and the kernel must both be in col-major layout. The result will
- * also be in col-major layout.
- *
- * If col_in_stride, row_in_stride > 1, then applies convolution with holes
- * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
- * pixels.
- *
- * The result can be assigned to a tensor of rank equal to the rank of the
- * input. The dimensions of the result will be filters, height, width (and
- * others if applicable).
- *
- * It is possible to swap the order of the width and height dimensions provided
- * that the same order is used in the input, the kernel, and the output.
- *
- * It is also possible to add an output kernel to the contraction, output
- * kernel is called by Eigen when it "finalizes" the block of an output tensor.
- *
- */
-template <typename Input, typename Kernel,
-          typename OutputKernel = const NoOpOutputKernel>
-EIGEN_DEVICE_FUNC
-    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
-        internal::traits<Input>::Layout == ColMajor,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const OutputKernel> >,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const OutputKernel> > >::type
-    SpatialConvolution(const Input& input, const Kernel& kernel,
-                       const Index row_stride = 1, const Index col_stride = 1,
-                       const PaddingType padding_type = PADDING_SAME,
-                       const Index row_in_stride = 1,
-                       const Index col_in_stride = 1,
-                       const OutputKernel& output_kernel = OutputKernel()) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
-                   internal::traits<Input>::NumDimensions,
-                   internal::traits<Input>::Layout, TensorIndex> >
-      in(input);
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
-                   internal::traits<Kernel>::NumDimensions,
-                   internal::traits<Kernel>::Layout, TensorIndex> >
-      kern(kernel);
-
-  EIGEN_STATIC_ASSERT(
-      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
-      YOU_MADE_A_PROGRAMMING_MISTAKE);
-  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  const int NumDims = internal::traits<Input>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the
-  // result
-  const TensorIndex kernelFilters =
-      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels =
-      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
-  const TensorIndex kernelRows =
-      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
-  const TensorIndex kernelCols =
-      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
-
-  const Index kernelRowsEff =
-      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
-  const Index kernelColsEff =
-      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
-
-  array<IndexPair<TensorIndex>, 1> contract_dims;
-  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-
-  const TensorIndex InputRows =
-      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex InputCols =
-      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-
-  TensorIndex out_height;
-  TensorIndex out_width;
-  switch (padding_type) {
-    case PADDING_VALID:
-      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
-                                static_cast<float>(row_stride));
-      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
-                               static_cast<float>(col_stride));
-      break;
-    case PADDING_SAME:
-      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
-      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
-      break;
-    default:
-      // Initialize unused variables to avoid a compiler warning
-      out_height = 0;
-      out_width = 0;
-      eigen_assert(false && "unexpected padding");
-  }
-
-  // Molds the output of the patch extraction code into a 2d tensor:
-  // - the first dimension (dims[0]): the patch values to be multiplied with the
-  // kernels
-  // - the second dimension (dims[1]): everything else
-  DSizes<TensorIndex, 2> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[1] = out_height * out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[1] *= in.dimension(i);
-    }
-  } else {
-    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[0] = out_height * out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= in.dimension(i);
-    }
-  }
-
-  // Molds the output of the contraction into the shape expected by the used
-  // (assuming this is ColMajor):
-  // - 1st dim: kernel filters
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelFilters;
-    post_contract_dims[1] = out_height;
-    post_contract_dims[2] = out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelFilters;
-    post_contract_dims[NumDims - 2] = out_height;
-    post_contract_dims[NumDims - 3] = out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, 2> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
-    kernel_dims[1] = kernelFilters;
-  }
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      kernel.reshape(kernel_dims)
-          .contract(input
-                        .extract_image_patches(
-                            kernelRows, kernelCols, row_stride, col_stride,
-                            row_in_stride, col_in_stride, padding_type)
-                        .reshape(pre_contract_dims),
-                    contract_dims, output_kernel)
-          .reshape(post_contract_dims),
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
-          .reshape(post_contract_dims));
-}
-
 }  // end namespace Eigen
+#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+
+// Note the following header is used in both TF and TFLite. Particularly, it's
+// used for float TFLite Conv2D.
+#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 03002adec4740090d8ea65f31f88a73e1a565310..9aba7b6327852cbcdd39d4e9af8b76f1b9f3ba72 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -1435,8 +1435,9 @@ static void PackRhsHelper(int iters,
       /*Alignment*/ 0>;
 
 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
-  using PackRhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
-                                                        SubMapper, ColMajor>;
+  using PackRhsImpl =
+      Eigen::internal::gemm_pack_colmajor_block<float, Eigen::Index, SubMapper,
+                                                ColMajor>;
 #else
   using PackRhsImpl =
       Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper,  //
@@ -1606,9 +1607,11 @@ static void PackLhsHelper(int iters,
       /*Alignment*/ 0>;
 
 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
-  using PackLhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
-                                                        SubMapper, ColMajor>;
+  using PackLhsImpl =
+      Eigen::internal::gemm_pack_colmajor_block<float, Eigen::Index, SubMapper,
+                                                ColMajor>;
 #else
+  using Traits = typename Eigen::internal::gebp_traits<float, float>;
   using PackLhsImpl =
       Eigen::internal::gemm_pack_lhs<float, Eigen::Index, SubMapper,      //
                                      Traits::mr,                          //
@@ -1697,9 +1700,9 @@ static void PackLhsHelper(int iters,
     SubMapper sub_mapper =
         input_mappers[filter_idx].getSubMapper(row_offset, col_offset);
 
-    // NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
-    // first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
-    // and accepts block rows and cols in the same order for lhs and rhs.
+// NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
+// first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
+// and accepts block rows and cols in the same order for lhs and rhs.
 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
     pack_lhs(packed.data() + packed_offset, sub_mapper, rows, cols);
 #else
@@ -1720,7 +1723,7 @@ static void PackLhsHelper(int iters,
 //    H: height
 //    W: width
 //    C: input channels
-//   FC: filter channles
+//   FC: filter channels
 //   FH: filter height
 //   FW: filter width
 //   SH: stride in height dimensions
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 1a5b0f2b675a85ba2c1dbf0356c3e42b03db22b4..e80404a437523862bfe6b8c2961b11cc00bd4426 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc
index 8fcda25e692f9aa550ddbb17a4f5cef8ba570b83..cb9a1660a7d059bebaaadea8cc309f74ab974948 100644
--- a/tensorflow/core/kernels/encode_png_op.cc
+++ b/tensorflow/core/kernels/encode_png_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/png/png_io.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
index 4a0c1943e54d11f68bef68756851750f4099caa4..213c63f41ae5ade1fc27bb4b72fa3f50d55c9a72 100644
--- a/tensorflow/core/kernels/encode_proto_op.cc
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -525,11 +525,16 @@ class EncodeProtoOp : public OpKernel {
           ctx,
           proto_utils::IsCompatibleType(field_descs_[i]->type(), v.dtype()),
           errors::InvalidArgument(
-              "Incompatible type for field " + field_names_[i] +
-                  ".  Saw dtype: ",
-              DataTypeString(v.dtype()),
+              "Incompatible type for field ", field_names_[i],
+              ".  Saw dtype: ", DataTypeString(v.dtype()),
               " but field type is: ", field_descs_[i]->type_name()));
 
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsMatrixOrHigher(v.shape()),
+          errors::InvalidArgument("Invalid shape for field ", field_names_[i],
+                                  ".  Saw shape ", v.shape().DebugString(),
+                                  " but it should be at least a matrix."));
+
       // All value tensors must have the same shape prefix (i.e. batch size).
       TensorShape shape_prefix = v.shape();
       shape_prefix.RemoveDim(shape_prefix.dims() - 1);
diff --git a/tensorflow/core/kernels/encode_wav_op.cc b/tensorflow/core/kernels/encode_wav_op.cc
index aed095076b92cdef60e217c610fa4c11eb4717ec..082f9a74ae1e36f22ed206c3049dbfd40ac55a48 100644
--- a/tensorflow/core/kernels/encode_wav_op.cc
+++ b/tensorflow/core/kernels/encode_wav_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/audio_ops.cc
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
 
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 528b3c6bf07553e9aeaddb4c00ef3b0e19a8b516..708b52a5174fdcf1ee084add8de23f8ef2f0c07f 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -591,7 +591,22 @@ class ParseSingleSequenceExampleOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->output_list("feature_list_dense_values",
                                          &feature_list_dense_values));
 
+#ifdef TENSORFLOW_LITE_PROTOS
     SequenceExample ex;
+#else
+    // Allocate the SequenceExample on an arena. Provides better memory locality
+    // and greatly speeds up destruction.
+    protobuf::ArenaOptions options;
+    // We have some hint of what the final proto size will be based on the size
+    // of the serialized bytes- use this to set a custom allocation strategy.
+    // Note that the default allocation strategy is quite conservative (min
+    // block size of 256 bytes, and a max of 8 kilobytes).
+    const size_t block_size = serialized_t().size() * 1.1;
+    options.start_block_size = std::max(options.start_block_size, block_size);
+    options.max_block_size = std::max(options.max_block_size, block_size);
+    protobuf::Arena arena(options);
+    auto& ex = *protobuf::Arena::CreateMessage<SequenceExample>(&arena);
+#endif
     OP_REQUIRES(
         ctx, ParseProtoUnlimited(&ex, serialized_t()),
         errors::InvalidArgument("Could not parse example input, value: '",
diff --git a/tensorflow/core/kernels/extract_image_patches_op.cc b/tensorflow/core/kernels/extract_image_patches_op.cc
index 68631d14dbc4af5553e02a7e3d622c3772a95eb5..9306eccf9f018f66cc22a7d88050a20814e46f15 100644
--- a/tensorflow/core/kernels/extract_image_patches_op.cc
+++ b/tensorflow/core/kernels/extract_image_patches_op.cc
@@ -20,11 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/extract_image_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_jpeg_shape_op.cc b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
index 60d798af56737c6abb322a971b31ae596ea96ec6..ab424595c1a6e5c26f26aae9dc3768cf2bf15c9b 100644
--- a/tensorflow/core/kernels/extract_jpeg_shape_op.cc
+++ b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_volume_patches_op.cc b/tensorflow/core/kernels/extract_volume_patches_op.cc
index 52cd078a3512bcfae13539f1e95ef66c4adf8a03..8107bca7d18633f45e747b5175eca1e11f2cc6fe 100644
--- a/tensorflow/core/kernels/extract_volume_patches_op.cc
+++ b/tensorflow/core/kernels/extract_volume_patches_op.cc
@@ -26,11 +26,11 @@ when rates are to be added.
 
 #include "tensorflow/core/kernels/extract_volume_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/eye_functor_gpu.cu.cc b/tensorflow/core/kernels/eye_functor_gpu.cu.cc
index a620316e27595aaa246b018a9a8afc6f678f45a4..d3ac4406313c8cffebbd569a956d3fa6c044b392 100644
--- a/tensorflow/core/kernels/eye_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/eye_functor_gpu.cu.cc
@@ -17,11 +17,10 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/eye_functor.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/kernels/eye_functor.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 050c95cf40d4b29bde66b6b6e72b1b48a7199965..d4c92586897da1ead541a98f5d721a9c18d235b9 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -88,9 +88,16 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
+template <>
+void SetZeroFunctor<GPUDevice, Variant>::operator()(
+    const GPUDevice& d, typename TTypes<Variant>::Flat out) {
+  // TODO(b/123028789): Implement this.
+}
+
 #define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
 TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
 TF_CALL_bool(DEFINE_SETZERO_GPU);
+TF_CALL_variant(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 5ecb203cbc7296d75f6a0a68a2189d7bf018c7fe..246a6ce04d97a5dec54f2d0b44da7e278d703908 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
@@ -120,6 +121,7 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
     opts->stats_collector = ctx->stats_collector();
   }
   opts->runner = ctx->runner();
+  opts->step_container = ctx->step_container();
 }
 
 class IfOp : public AsyncOpKernel {
@@ -210,6 +212,98 @@ class IfOp : public AsyncOpKernel {
   };
 };
 
+class CaseOp : public AsyncOpKernel {
+ public:
+  explicit CaseOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branch_funcs_));
+  }
+
+  ~CaseOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    auto lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library"), done);
+
+    // TODO(b/37549631): Because this op has `SetIsStateful()` in its op
+    // registration, this kernel may be shared by multiple subgraphs, which have
+    // different associated `FunctionLibraryRuntime` objects and hence different
+    // `FHandle` namespaces. So we must call Instantiate() to make sure we get
+    // the correct function handles with respect to `lib`. Note the underlying
+    // `lib->Instantiate()` caches the created function handles, so calling
+    // `Instantiate()` repeatedly on the same `lib` and function is cheap.
+    std::vector<FHandle> branch_handles(branch_funcs_.size());
+    for (int i = 0; i < branch_funcs_.size(); i++) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, Instantiate(lib, branch_funcs_[i], &branch_handles[i]), done);
+    }
+
+    const Tensor& branch_index = ctx->input(0);
+    OP_REQUIRES_ASYNC(ctx, TensorShapeUtils::IsScalar(branch_index.shape()),
+                      errors::InvalidArgument("branch_index must be scalar"),
+                      done);
+    int32 branch = branch_index.scalar<int32>()();
+    (new State(this, ctx, branch, branch_handles, done))->Start();
+  }
+
+ private:
+  std::vector<NameAttrList> branch_funcs_;
+
+  class State {
+   public:
+    State(CaseOp* kernel, OpKernelContext* ctx, int branch,
+          std::vector<FHandle> branch_handles, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          branch_(branch),
+          branch_handles_(branch_handles),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
+      for (int i = 1; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      int branch = branch_;
+      // The last branch is the default branch.
+      if (branch < 0 || branch >= branch_handles_.size()) {
+        branch = branch_handles_.size() - 1;
+      }
+      rets_.clear();
+      lib_->Run(
+          // Evaluate one of the branch.
+          opts_, branch_handles_[branch], args_, &rets_,
+          // Done callback
+          [this](Status s) {
+            if (s.ok()) {
+              s = SetOutputs(kernel_, ctx_, rets_);
+            }
+            ctx_->SetStatus(s);
+            DoneCallback captured_done(std::move(done_));
+            delete this;
+            captured_done();
+          });
+    }
+
+   private:
+    CaseOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const int branch_;
+    std::vector<FHandle> branch_handles_;
+    DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+  };
+};
+
 // TODO(drpng): remove this.
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
@@ -218,6 +312,10 @@ REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
+REGISTER_KERNEL_BUILDER(Name("Case").Device(DEVICE_CPU), CaseOp);
+REGISTER_KERNEL_BUILDER(
+    Name("Case").Device(DEVICE_GPU).HostMemory("branch_index"), CaseOp);
+
 REGISTER_KERNEL_BUILDER(Name("StatelessIf").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(
     Name("StatelessIf").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 4dbb6a71160e4c4921aec0992624f197f50963ea..b3b637bac725485dadf05ba35ce0622d50f3798e 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -34,7 +34,7 @@ class FuzzStringSplit : public FuzzSession {
     Tensor delimiter_tensor(tensorflow::DT_STRING, TensorShape({}));
 
     if (size > 0) {
-      // The spec for split is that the delimeter should be 0 or 1 characters.
+      // The spec for split is that the delimiter should be 0 or 1 characters.
       // Naturally, fuzz it with something larger.  (This omits the possibility
       // of handing it a > int32_max size string, which should be tested for in
       // an explicit test).
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 7710cf93d61eeebf25a71d99e92b6b3e9ce237c9..93bdebc00e17abb702236453c220ada1e330c5cb 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.h b/tensorflow/core/kernels/gather_functor_gpu.cu.h
index 11ea63d730aa69509edaacf127e62b4bbeb5740f..fe7850f9253f9b4ce641653439071129a7bde697 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.h
@@ -92,19 +92,15 @@ struct GatherFunctor<GPUDevice, T, Index> {
 
     CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
     if (is_axis_zero) {
-      // clang-format off
-      GatherOpKernel<T, Index, true>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              params.data(), indices.data(), out.data(), gather_dim_size,
-              indices_size, slice_size, out_size);
-      // clang-format on
+      TF_CHECK_OK(CudaLaunchKernel(
+          GatherOpKernel<T, Index, true>, config.block_count,
+          config.thread_per_block, 0, d.stream(), params.data(), indices.data(),
+          out.data(), gather_dim_size, indices_size, slice_size, out_size));
     } else {
-      // clang-format off
-      GatherOpKernel<T, Index, false>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              params.data(), indices.data(), out.data(), gather_dim_size,
-              indices_size, slice_size, out_size);
-      // clang-format on
+      TF_CHECK_OK(CudaLaunchKernel(
+          GatherOpKernel<T, Index, false>, config.block_count,
+          config.thread_per_block, 0, d.stream(), params.data(), indices.data(),
+          out.data(), gather_dim_size, indices_size, slice_size, out_size));
     }
     // TODO(fpmc): enable indices validation on GPU.
     // Right now checking for indicies out of bound in the kernel would
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index e50b7fe3bf7fb7a32820ec6f95421cb90b506c0a..58867a34bc2361daceb99edd9a6396fe22e5b856 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/gather_nd_op.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h
index 003badb74da3512124490d054cf78fad75c2404c..77c0d7717ee97c5a5a130e38c89b17d20fc8acc9 100644
--- a/tensorflow/core/kernels/gather_nd_op.h
+++ b/tensorflow/core/kernels/gather_nd_op.h
@@ -18,8 +18,8 @@ limitations under the License.
 // Functor definition for GatherOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
index 1c78de253e702f5e546467bbed0758c24dbe0443..cf9817dc3060be9e9325d04637e89e147ce143c1 100644
--- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -22,10 +22,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_nd_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index da8d2e9e3cb35235dc524545aebac80065945a11..22fb6674413abbafd6af526517bdcce57d571e36 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -86,12 +86,11 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
 
-    // clang-format off
-    GatherSliceOpKernel<T, Index, IXDIM>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            Tparams.data(), Tindices.data(), Tout.data(), batch_strides,
-            batch_indices, indices_size, s_size, out_size);
-    // clang-format on
+    TF_CHECK_OK(CudaLaunchKernel(GatherSliceOpKernel<T, Index, IXDIM>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), Tparams.data(), Tindices.data(),
+                                 Tout.data(), batch_strides, batch_indices,
+                                 indices_size, s_size, out_size));
 
     // TODO(ebrevdo): enable indices validation on GPU.
     // Right now checking for indices out of bound in the kernel would
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 5795f68889e2393451c5cfae2fd29f14e8f9adce..b26f0a7528df979041869fa327c3c4d890eb58df 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/gpu_device_array.h
similarity index 87%
rename from tensorflow/core/kernels/cuda_device_array.h
rename to tensorflow/core/kernels/gpu_device_array.h
index 74dc298c7a5dc5395af015d2a56df60de2fc2db2..3961cee043be327cdfa6d7c757b4d5ce88ffd32e 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/gpu_device_array.h
@@ -15,20 +15,20 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 #define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
 
 namespace tensorflow {
 
 // Create an array of value on the host, to be sent to kernel using
-// CudaDeviceArrayStruct.
+// GpuDeviceArrayStruct.
 //
 // Usage:
 //   int size = ...;
-//   CudaDeviceArrayOnHost ptrs(context, size);
+//   GpuDeviceArrayOnHost ptrs(context, size);
 //   OP_REQUIRES_OK(ptrs.Init());
 //   for (int i = 0; i < size; ++i) {
 //     ptrs.Set(i, ...);
@@ -38,9 +38,9 @@ namespace tensorflow {
 //
 // ValueType must be memcopyable.
 template <typename ValueType, int MaxInlineValues = 8>
-class CudaDeviceArrayOnHost {
+class GpuDeviceArrayOnHost {
  public:
-  CudaDeviceArrayOnHost(OpKernelContext* context, int32 size)
+  GpuDeviceArrayOnHost(OpKernelContext* context, int32 size)
       : context_(context),
         total_bytes_(static_cast<int64>(size) * sizeof(ValueType)) {
     data_.size = size;
@@ -93,7 +93,7 @@ class CudaDeviceArrayOnHost {
     return Status::OK();
   }
 
-  const CudaDeviceArrayStruct<ValueType, MaxInlineValues>& data() const {
+  const GpuDeviceArrayStruct<ValueType, MaxInlineValues>& data() const {
     // Ensure Finalize is called.
     DCHECK(inlined() || out_of_line_values_on_gpu_.IsInitialized());
     return data_;
@@ -105,16 +105,16 @@ class CudaDeviceArrayOnHost {
   OpKernelContext* const context_;
   const int64 total_bytes_;  // total size of all pointers.
   ValueType* values_ = nullptr;
-  CudaDeviceArrayStruct<ValueType, MaxInlineValues> data_;
+  GpuDeviceArrayStruct<ValueType, MaxInlineValues> data_;
 
   Tensor out_of_line_values_on_host_;
   Tensor out_of_line_values_on_gpu_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CudaDeviceArrayOnHost);
+  TF_DISALLOW_COPY_AND_ASSIGN(GpuDeviceArrayOnHost);
 };
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
diff --git a/tensorflow/core/kernels/cuda_device_array_gpu.h b/tensorflow/core/kernels/gpu_device_array_gpu.h
similarity index 76%
rename from tensorflow/core/kernels/cuda_device_array_gpu.h
rename to tensorflow/core/kernels/gpu_device_array_gpu.h
index 64fa3cb806bc7454bc6d9893e560201a620df43a..ca2051c70db920f48a0e205cdced07dc842e209c 100644
--- a/tensorflow/core/kernels/cuda_device_array_gpu.h
+++ b/tensorflow/core/kernels/gpu_device_array_gpu.h
@@ -18,15 +18,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 namespace tensorflow {
 
-static constexpr int kMaxInlineCudaPointers = 8;
-// To decode on the device side, use GetCudaDeviceArrayOnDevice.
-// To encode on the host side, use CudaDeviceArrayOnHost.
+static constexpr int kMaxInlineGpuPointers = 8;
+// To decode on the device side, use GetGpuDeviceArrayOnDevice.
+// To encode on the host side, use GpuDeviceArrayOnHost.
 template <typename ValueType, int MaxInlineValues = 8>
-struct CudaDeviceArrayStruct {
+struct GpuDeviceArrayStruct {
   int32 size;
   // used if size <= MaxInlineValues;
   ValueType inline_values[MaxInlineValues];
@@ -34,8 +34,8 @@ struct CudaDeviceArrayStruct {
 };
 
 template <typename ValueType, int MaxInlineValues = 8>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetCudaDeviceArrayOnDevice(
-    CudaDeviceArrayStruct<ValueType, MaxInlineValues>* data) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetGpuDeviceArrayOnDevice(
+    GpuDeviceArrayStruct<ValueType, MaxInlineValues>* data) {
   if (data->size <= MaxInlineValues) {
     return data->inline_values;
   } else {
@@ -45,6 +45,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetCudaDeviceArrayOnDevice(
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
diff --git a/tensorflow/core/kernels/gpu_utils.cc b/tensorflow/core/kernels/gpu_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..298acfba54dc49f23b97e1d85aee5cbcbe256e8a
--- /dev/null
+++ b/tensorflow/core/kernels/gpu_utils.cc
@@ -0,0 +1,153 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/gpu_utils.h"
+
+#if GOOGLE_CUDA
+
+#include "google/protobuf/any.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/platform/logger.h"
+#include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
+
+namespace tensorflow {
+namespace {
+
+tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  tensorflow::CudnnVersion cudnn_version;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    se::port::StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      cudnn_version.set_major(version.major_version());
+      cudnn_version.set_minor(version.minor_version());
+      cudnn_version.set_patch(version.patch());
+    }
+  }
+  return cudnn_version;
+}
+
+tensorflow::ComputeCapability GetComputeCapability(
+    se::StreamExecutor* stream_executor) {
+  tensorflow::ComputeCapability cc;
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  cc.set_major(cc_major);
+  cc.set_minor(cc_minor);
+  return cc;
+}
+
+}  // namespace
+
+void LogConvAutotuneResults(const NodeDef& node, const Tensor& input,
+                            const Tensor& filter, const Tensor& output,
+                            se::StreamExecutor* stream_exec,
+                            absl::Span<const AutotuneResult> results) {
+  AutotuningLog log;
+  ConvNodeDef instr;
+  *instr.mutable_conv() = node;
+  input.shape().AsProto(instr.mutable_input()->mutable_tensor_shape());
+  instr.mutable_input()->set_dtype(input.dtype());
+  filter.shape().AsProto(instr.mutable_filter()->mutable_tensor_shape());
+  instr.mutable_filter()->set_dtype(filter.dtype());
+  output.shape().AsProto(instr.mutable_output()->mutable_tensor_shape());
+  instr.mutable_output()->set_dtype(output.dtype());
+  log.mutable_instr()->PackFrom(std::move(instr));
+  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
+  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
+  for (const auto& result : results) {
+    *log.add_results() = result;
+  }
+  Logger::Singleton()->LogProto(log);
+}
+
+void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
+                                 const Tensor& filter, const Tensor& output,
+                                 const Tensor& bias, const Tensor* side_input,
+                                 se::StreamExecutor* stream_exec,
+                                 absl::Span<const AutotuneResult> results) {
+  AutotuningLog log;
+  ConvNodeDef instr;
+  *instr.mutable_conv() = node;
+  input.shape().AsProto(instr.mutable_input()->mutable_tensor_shape());
+  instr.mutable_input()->set_dtype(input.dtype());
+  filter.shape().AsProto(instr.mutable_filter()->mutable_tensor_shape());
+  instr.mutable_filter()->set_dtype(filter.dtype());
+  output.shape().AsProto(instr.mutable_output()->mutable_tensor_shape());
+  instr.mutable_output()->set_dtype(output.dtype());
+  bias.shape().AsProto(instr.mutable_bias()->mutable_tensor_shape());
+  instr.mutable_bias()->set_dtype(bias.dtype());
+  if (side_input) {
+    side_input->shape().AsProto(
+        instr.mutable_side_input()->mutable_tensor_shape());
+    instr.mutable_side_input()->set_dtype(side_input->dtype());
+  }
+  log.mutable_instr()->PackFrom(std::move(instr));
+  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
+  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
+  for (const auto& result : results) {
+    *log.add_results() = result;
+  }
+  Logger::Singleton()->LogProto(log);
+}
+
+Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
+                              se::dnn::AlgorithmConfig* algo) {
+  // For the "!xhs.has_success()" below, this is because we want successful ones
+  // to order first, therefore they need a smaller key per "min_element".
+  const AutotuneResult* best_result = std::min_element(
+      results.begin(), results.end(),
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        return std::make_tuple(
+                   !lhs.has_success(),
+                   proto_utils::FromDurationProto(lhs.success().run_time())) <
+               std::make_tuple(
+                   !rhs.has_success(),
+                   proto_utils::FromDurationProto(rhs.success().run_time()));
+      });
+
+  const AutotuneResult* best_result_no_scratch = std::min_element(
+      results.begin(), results.end(),
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        return std::make_tuple(
+                   !lhs.has_success(), lhs.success().scratch_bytes(),
+                   proto_utils::FromDurationProto(lhs.success().run_time())) <
+               std::make_tuple(
+                   !rhs.has_success(), rhs.success().scratch_bytes(),
+                   proto_utils::FromDurationProto(rhs.success().run_time()));
+      });
+
+  if (best_result == results.end() || !best_result->has_success()) {
+    return errors::NotFound("No algorithm worked!");
+  }
+  algo->set_algorithm({best_result->conv().algorithm(),
+                       best_result->conv().tensor_ops_enabled()});
+  if (best_result_no_scratch != results.end() &&
+      best_result_no_scratch->has_success() &&
+      best_result_no_scratch->success().scratch_bytes() == 0) {
+    algo->set_algorithm_no_scratch(
+        {best_result_no_scratch->conv().algorithm(),
+         best_result_no_scratch->conv().tensor_ops_enabled()});
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index 86146f75f4da277949c6269bf5aec7dce70653f1..332d07e4176609f01b5132c0827d30fd848aa257 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -20,6 +20,9 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -28,6 +31,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+class NodeDef;
+class AutotuneResult;
+
 template <typename T>
 inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
   se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
@@ -156,6 +162,25 @@ class AutoTuneSingleton {
   }
 };
 
+// Logs convolution results to customized back-storage.
+void LogConvAutotuneResults(const NodeDef& node, const Tensor& input,
+                            const Tensor& filter, const Tensor& output,
+                            se::StreamExecutor* stream_exec,
+                            absl::Span<const AutotuneResult> results);
+
+// Logs fused convolution results to customized back-storage.
+void LogFusedConvAutotuneResults(const NodeDef& node, const Tensor& input,
+                                 const Tensor& filter, const Tensor& output,
+                                 const Tensor& bias, const Tensor* side_input,
+                                 se::StreamExecutor* stream_exec,
+                                 absl::Span<const AutotuneResult> results);
+
+// Returns the best algorithms for the config, one is the fastest, the other is
+// other is fastest with 0 scracth space. Unsuccessful autotuning results are
+// allowed and ignored.
+Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
+                              se::dnn::AlgorithmConfig* algo);
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 87d36f22d719ade68d17c6f4a2e6dc2deeef9e45..a85de34ac262906aa0bbe2adc600505eb76dcedd 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -24,11 +24,18 @@ tf_cc_test(
     deps = [
         ":graph_transferer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index 477e729dcb97e20afe090ac774bf3e4efd4b5d8a..df2796f24b4a6bffc28a2b6bc4dd3fe45998618c 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -338,7 +338,6 @@ Status GraphTransferer::TransformGraphToAddAggregatedInputNode(
     shapes.emplace_back(input_node_info_list.at(i).second.shape());
   }
 
-  NodeDef input_node_def;
   auto builder =
       NodeBuilder(AGGREGATED_INPUT_NODE_NAME, "RemoteFusedGraphExecute")
           .Input(std::vector<NodeBuilder::NodeOut>{})
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 1b382996f88bc220eecb6c5f5cb07d6db987c106..9c57c1d429853cf31ac49df707dd5fe15482722d 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -76,7 +76,7 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
   // TODO(satok): Use actual data passed by FillInputNode and remove
   // std::vector<float> dummy_input_float_{};
   std::unordered_map<int, std::vector<uint8>> input_tensor_data_{};
-  // Dummy byte array for cosnt node.
+  // Dummy byte array for const node.
   // TODO(satok): Remove
   std::unordered_map<int, std::vector<uint8>> dummy_const_data_{};
 
diff --git a/tensorflow/core/kernels/host_constant_op.cc b/tensorflow/core/kernels/host_constant_op.cc
index d08a7c9bd27510656173e41d0db63de41368859d..17dad526ce34f3e1447b908fc8f318921b33e1b6 100644
--- a/tensorflow/core/kernels/host_constant_op.cc
+++ b/tensorflow/core/kernels/host_constant_op.cc
@@ -63,8 +63,6 @@ REGISTER_KERNEL_BUILDER(Name("Const")
 #endif  // TENSORFLOW_USE_SYCL
 
 // HostConst: forced to generate output on the host.
-// Only used in tests; no op is registered for this kernel
-// externally (i.e., in array_ops.cc)
 REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), _HostConstantOp);
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), _HostConstantOp);
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 1d4fa1a7db11d28268063055143ccfcbc966ec5c..591acfb444a1901fcbe6c66672a664ceb59ff5c6 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -28,12 +28,12 @@ limitations under the License.
 #include <array>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
@@ -45,9 +45,29 @@ inline float CalculateResizeScale(int64 in_size, int64 out_size,
              : in_size / static_cast<float>(out_size);
 }
 
+// Half pixel scaler scales assuming that the pixel centers are at 0.5, i.e. the
+// floating point coordinates of the top,left pixel is 0.5,0.5.
+struct HalfPixelScaler {
+  inline float operator()(const int x, const float scale) const {
+    // Note that we subtract 0.5 from the return value, as the existing bilinear
+    // sampling code etc assumes pixels are in the old coordinate system.
+    return (static_cast<float>(x) + 0.5f) * scale - 0.5f;
+  }
+};
+
+// Older incorrect scaling method that causes all resizes to have a slight
+// translation leading to inconsistent results. For example, a flip then a
+// resize gives different results then a resize then a flip.
+struct LegacyScaler {
+  inline float operator()(const int x, const float scale) const {
+    return static_cast<float>(x) * scale;
+  }
+};
+
 struct ImageResizerState {
-  explicit ImageResizerState(bool align_corners)
-      : align_corners_(align_corners) {}
+  explicit ImageResizerState(bool align_corners, bool half_pixel_centers)
+      : align_corners_(align_corners),
+        half_pixel_centers_(half_pixel_centers) {}
 
   // ValidateAndCalculateOutputSize checks the bounds on the input tensors
   // and requested size, sets up some of the resizing state such as the
@@ -56,6 +76,11 @@ struct ImageResizerState {
   // the context, which the caller must check.
   void ValidateAndCalculateOutputSize(OpKernelContext* context,
                                       const Tensor& input) {
+    OP_REQUIRES(
+        context,
+        !half_pixel_centers_ || (half_pixel_centers_ && !align_corners_),
+        errors::InvalidArgument("If half_pixel_centers is True, "
+                                "align_corners must be False."));
     OP_REQUIRES(context, input.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
                                         input.shape().DebugString()));
@@ -127,14 +152,23 @@ struct ImageResizerState {
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 struct ImageResizerGradientState {
-  explicit ImageResizerGradientState(bool align_corners)
-      : align_corners_(align_corners) {}
+  explicit ImageResizerGradientState(bool align_corners,
+                                     bool half_pixel_centers)
+      : align_corners_(align_corners),
+        half_pixel_centers_(half_pixel_centers) {}
 
   void ValidateAndCreateOutput(OpKernelContext* context, const Tensor& input,
                                const Tensor& original_image) {
+    OP_REQUIRES(
+        context,
+        !half_pixel_centers_ || (half_pixel_centers_ && !align_corners_),
+        errors::InvalidArgument("If half_pixel_centers is True, "
+                                "align_corners must be False."));
+
     OP_REQUIRES(context, input.dims() == 4,
                 errors::InvalidArgument("input_grad must be 4-dimensional",
                                         input.shape().DebugString()));
@@ -187,6 +221,7 @@ struct ImageResizerGradientState {
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index c37055239c28e0ab243ea30b05b2c8af0905766c..506091f76ec69f1f092b8fe0c67ea46deb851510 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 7f06764d526e7bf49fea318a9d20eaaea6f45133..51862854a75882c81fdc9969a156aa1436514bde 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -543,6 +543,7 @@ REGISTER_EMPTY(float, GPU);
 REGISTER_EMPTY(double, GPU);
 REGISTER_EMPTY(Eigen::half, GPU);
 REGISTER_EMPTY(int64, GPU);
+REGISTER_EMPTY(int32, GPU);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index ba9879691b408d9455de1767d943240a0bab0190..35cfe03e8e2ccaefb03067624c2611f65577d0b3 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -49,9 +49,9 @@ Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
   const int64 ncols = Toutput.dimension(1);
   const T* src = value.flat<T>().data();
   T* dst = output->flat<T>().data();
-  DoParallelConcatOpKernel<T>
-      <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-          cfg.virtual_thread_count, nrows, ncols, loc, src, dst);
+  TF_CHECK_OK(CudaLaunchKernel(
+      DoParallelConcatOpKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
+      d.stream(), cfg.virtual_thread_count, nrows, ncols, loc, src, dst));
   return Status::OK();
 }
 
@@ -117,19 +117,22 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
   T* dst = y->flat<T>().data();
   switch (op) {
     case I_UPDATE:
-      DoInplaceOpKernel<T, I_UPDATE>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      TF_CHECK_OK(CudaLaunchKernel(DoInplaceOpKernel<T, I_UPDATE>,
+                                   cfg.block_count, cfg.thread_per_block, 0,
+                                   d.stream(), cfg.virtual_thread_count, nrows,
+                                   ncols, n, src, rowids, dst));
       break;
     case I_ADD:
-      DoInplaceOpKernel<T, I_ADD>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      TF_CHECK_OK(CudaLaunchKernel(DoInplaceOpKernel<T, I_ADD>, cfg.block_count,
+                                   cfg.thread_per_block, 0, d.stream(),
+                                   cfg.virtual_thread_count, nrows, ncols, n,
+                                   src, rowids, dst));
       break;
     case I_SUB:
-      DoInplaceOpKernel<T, I_SUB>
-          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      TF_CHECK_OK(CudaLaunchKernel(DoInplaceOpKernel<T, I_SUB>, cfg.block_count,
+                                   cfg.thread_per_block, 0, d.stream(),
+                                   cfg.virtual_thread_count, nrows, ncols, n,
+                                   src, rowids, dst));
       break;
   }
 }
@@ -148,9 +151,10 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
   const int32* rowids = i.flat<int32>().data();
   bool* dst = y->flat<bool>().data();
   if (op == I_UPDATE) {
-    DoInplaceOpKernel<bool, I_UPDATE>
-        <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-            cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+    TF_CHECK_OK(CudaLaunchKernel(DoInplaceOpKernel<bool, I_UPDATE>,
+                                 cfg.block_count, cfg.thread_per_block, 0,
+                                 d.stream(), cfg.virtual_thread_count, nrows,
+                                 ncols, n, src, rowids, dst));
   }
 }
 
diff --git a/tensorflow/core/kernels/kernel_platform_strings.h b/tensorflow/core/kernels/kernel_platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bf40c30a56577ebe21d4a4ba9bf371e30803f79
--- /dev/null
+++ b/tensorflow/core/kernels/kernel_platform_strings.h
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Generate platform strings for libtfkernel-*
+
+#ifndef TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+TF_PLATFORM_STRINGS()
+
+#endif  // TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 5859c20d89e06a9684c0abe4199c5d84778fc26f..b5b7b75143b5a321fa016579dc14b8144ac6b7d6 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -79,12 +79,10 @@ static Status TensorListDeviceCopy(
   to->max_num_elements = from.max_num_elements;
   to->tensors.reserve(from.tensors.size());
   for (const Tensor& t : from.tensors) {
-    Tensor tmp(t.dtype());
-    // Do not copy uninitialized tensors.
+    to->tensors.emplace_back(t.dtype());
     if (t.dtype() != DT_INVALID) {
-      TF_RETURN_IF_ERROR(copy(t, &tmp));
+      TF_RETURN_IF_ERROR(copy(t, &to->tensors.back()));
     }
-    to->tensors.push_back(tmp);
   }
   return Status::OK();
 }
@@ -99,13 +97,6 @@ REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
 
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TensorList, TensorList::kTypeName);
 
-Status TensorListShape(const TensorList& t, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorListShape);
-
 bool TensorList::Decode(const VariantTensorData& data) {
   // TODO(srbs): Change the signature to Decode(VariantTensorData data) so
   // that we do not have to copy each tensor individually below. This would
@@ -155,6 +146,7 @@ Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
   if (t.shape() == TensorShape({})) {
     if ((t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) ||
         (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1)) {
+      *out = PartialTensorShape();
       return Status::OK();
     }
     return errors::InvalidArgument(
@@ -173,6 +165,57 @@ Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
       DataTypeString(t.dtype()));
 }
 
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape) {
+  TF_RETURN_IF_ERROR(TensorShapeFromTensor(c->input(index), element_shape));
+  // Check that `element_shape` and `tensor_list.element_shape` are
+  // compatible and store the merged shape in `element_shape`.
+  PartialTensorShape tmp = *element_shape;
+  TF_RETURN_IF_ERROR(tmp.MergeWith(tensor_list.element_shape, element_shape));
+  return Status::OK();
+}
+
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list) {
+  if (!TensorShapeUtils::IsScalar(c->input(index).shape())) {
+    return errors::InvalidArgument("Input list must be a scalar saw: ",
+                                   c->input(index).shape().DebugString());
+  }
+  const TensorList* l = c->input(index).scalar<Variant>()().get<TensorList>();
+  if (l == nullptr) {
+    return errors::InvalidArgument(
+        "Input handle is not a list. Saw: '",
+        c->input(index).scalar<Variant>()().DebugString(), "'");
+  }
+  *list = l;
+  return Status::OK();
+}
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list) {
+  // Attempt to forward the input tensor to the output if possible.
+  AllocatorAttributes attr;
+  attr.set_on_host(true);
+  std::unique_ptr<Tensor> maybe_output =
+      c->forward_input(input_index, output_index, DT_VARIANT, TensorShape{},
+                       c->input_memory_type(input_index), attr);
+  Tensor* output_tensor;
+  if (maybe_output != nullptr) {
+    // Woohoo, forwarding succeeded!
+    output_tensor = maybe_output.get();
+  } else {
+    // If forwarding is not possible allocate a new output tensor and copy
+    // the `input_list` to it.
+    TF_RETURN_IF_ERROR(
+        c->allocate_output(output_index, {}, &output_tensor, attr));
+    output_tensor->scalar<Variant>()() = input_list;
+  }
+  *output_list = output_tensor->scalar<Variant>()().get<TensorList>();
+  return Status::OK();
+}
+
 class EmptyTensorList : public OpKernel {
  public:
   explicit EmptyTensorList(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -234,11 +277,8 @@ class TensorListPushBack : public OpKernel {
                                         " but tried to append ",
                                         DataTypeString(input.dtype())));
 
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, l->element_shape.IsCompatibleWith(input.shape()),
                 errors::InvalidArgument(
                     "Tried to append a tensor with incompatible shape to a "
@@ -259,21 +299,9 @@ class TensorListPushBack : public OpKernel {
                                   " max_num_elements: ", l->max_num_elements));
     }
 
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.push_back(
-          input);
-    } else {
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      TensorList output;
-      output = *l;
-      output.tensors.push_back(input);
-      result->scalar<Variant>()() = std::move(output);
-    }
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.push_back(input);
   }
 
  private:
@@ -296,12 +324,8 @@ class TensorListLength : public OpKernel {
   ~TensorListLength() override {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(
-        c, l != nullptr,
-        errors::InvalidArgument(
-            "TensorListLength received a variant which is not a list. Saw: '",
-            c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result));
     result->scalar<int32>()() = l->tensors.size();
@@ -324,15 +348,8 @@ class TensorListElementShape : public OpKernel {
   explicit TensorListElementShape(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "TensorListElementShape received a variant which is not a "
-                    "list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     if (l->element_shape.unknown_rank()) {
       OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result));
@@ -358,69 +375,6 @@ class TensorListElementShape : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("TensorListElementShape").Device(DEVICE_CPU),
                         TensorListElementShape);
 
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListElementShape")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("element_shape"),
-                        TensorListElementShape);
-
-#endif  // GOOGLE_CUDA
-
-class TensorListPopBack : public OpKernel {
- public:
-  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
-
-  ~TensorListPopBack() override {}
-
-  void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-
-    OP_REQUIRES(c, !l->tensors.empty(),
-                errors::InvalidArgument("Trying to pop from an empty list."));
-
-    c->set_output(1, l->tensors.back());
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.pop_back();
-    } else {
-      TensorList output;
-      output = *l;
-      output.tensors.pop_back();
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      result->scalar<Variant>()() = std::move(output);
-    }
-  }
-
- private:
-  DataType element_dtype_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_CPU),
-                        TensorListPopBack);
-
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_GPU),
-                        TensorListPopBack);
-
-#endif  // GOOGLE_CUDA
-
 class TensorListReserve : public OpKernel {
  public:
   explicit TensorListReserve(OpKernelConstruction* c) : OpKernel(c) {
@@ -458,72 +412,13 @@ REGISTER_KERNEL_BUILDER(Name("TensorListReserve")
                         TensorListReserve);
 
 #endif  // GOOGLE_CUDA
-
-class TensorListGetItem : public OpKernel {
- public:
-  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
-
-  void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    int32 index = c->input(1).scalar<int32>()();
-    OP_REQUIRES(c, index < l->tensors.size(),
-                errors::InvalidArgument("Trying to access element ", index,
-                                        " in a list with ", l->tensors.size(),
-                                        " elements."));
-    c->set_output(0, l->tensors[index]);
-  }
-
- private:
-  DataType element_dtype_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
-                        TensorListGetItem);
-
-#if GOOGLE_CUDA
-
-#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("index"),               \
-                          TensorListGetItem);
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
-REGISTER_TENSOR_LIST_GET_ITEM_GPU(bool)
-#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
-
-#endif  // GOOGLE_CUDA
-
 class TensorListResize : public OpKernel {
  public:
   explicit TensorListResize(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* input_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, input_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* input_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &input_list));
     int32 size = c->input(1).scalar<int32>()();
     OP_REQUIRES(
         c, size >= 0,
@@ -579,11 +474,8 @@ class TensorListSetItem : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, element_dtype_ == l->element_dtype,
                 errors::InvalidArgument("Invalid data types; op elements ",
                                         DataTypeString(element_dtype_),
@@ -601,21 +493,9 @@ class TensorListSetItem : public OpKernel {
                     "list index. Item element shape: ",
                     value.shape().DebugString(),
                     " list shape: ", l->element_shape.DebugString()));
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    std::unique_ptr<Tensor> maybe_result = c->forward_input(
-        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
-    if (maybe_result != nullptr) {
-      maybe_result->scalar<Variant>()().get<TensorList>()->tensors[index] =
-          value;
-    } else {
-      TensorList output;
-      output = *l;
-      output.tensors[index] = value;
-      Tensor* result;
-      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-      result->scalar<Variant>()() = std::move(output);
-    }
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors[index] = value;
   }
 
  private:
@@ -637,6 +517,7 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
 #undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
@@ -736,69 +617,67 @@ REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListStack<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGather<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGetItem<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListPopBack<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListFromTensor<CPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatterIntoExistingList<CPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListSplit<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
                           TensorListPushBackBatch<CPUDevice, T>)
 
-TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
+TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_OPS_CPU);
+REGISTER_TENSOR_LIST_OPS_CPU(quint8);
+REGISTER_TENSOR_LIST_OPS_CPU(qint8);
+REGISTER_TENSOR_LIST_OPS_CPU(quint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint32);
+REGISTER_TENSOR_LIST_OPS_CPU(Variant);
 
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
+#undef REGISTER_TENSOR_LIST_OPS_CPU
 
-#define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListStack<CPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListConcat<CPUDevice, T>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
-REGISTER_TENSOR_LIST_STACK_CPU(quint8);
-REGISTER_TENSOR_LIST_STACK_CPU(qint8);
-REGISTER_TENSOR_LIST_STACK_CPU(quint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint32);
-REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_STACK_CPU
-
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(T)                   \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListFromTensor<CPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListSplit<CPUDevice, T>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint32);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_CPU
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 23f552642cac273cf53b25a6d43e1e6ca23ea0cc..9922a92dec39708bff2ef3566b9e264cd5e73f00 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -36,73 +36,90 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_TENSOR_LIST_STACK_GPU(T)                         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
-                          TensorListStack<GPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("indices"),             \
-                          TensorListGather<GPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("lengths"),             \
-                          TensorListConcat<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_STACK_GPU
-
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
-                          TensorListPushBackBatch<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
-
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape"),       \
-                          TensorListFromTensor<GPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("lengths"),             \
+#define REGISTER_TENSOR_LIST_OPS_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListStack<GPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices")                       \
+                              .HostMemory("element_shape"),                \
+                          TensorListGather<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("index")                         \
+                              .HostMemory("element_shape"),                \
+                          TensorListGetItem<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListPopBack<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("leading_dims")                  \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU),                         \
+                          TensorListPushBackBatch<GPUDevice, T>)           \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListFromTensor<GPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("num_elements")                  \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices"),                      \
+                          TensorListScatterIntoExistingList<GPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
                           TensorListSplit<GPUDevice, T>)
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_GPU
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_OPS_GPU
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")
+                            .TypeConstraint<Variant>("element_dtype")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape"),
+                        TensorListPopBack<GPUDevice, Variant>)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index fd1be80f11652745410931ed6224cecc47e6c9de..682ea15caf94ccdca1e49e7721d3c227f0f1a4fc 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -66,14 +66,16 @@ struct TensorList {
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
 
-// Allocates a Tensor of requested shape and dtype and fills it with zeros.
-template <typename Device, typename T>
-void BuildZerosTensor(OpKernelContext* c, DataType dtype,
-                      const TensorShape& shape, Tensor* zeros) {
-  OP_REQUIRES_OK(c, c->allocate_temp(dtype, shape, zeros));
-  functor::SetZeroFunctor<Device, T> f;
-  f(c->eigen_device<Device>(), zeros->flat<T>());
-}
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape);
+
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list);
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list);
 
 template <typename Device, typename T>
 class TensorListStack : public OpKernel {
@@ -85,27 +87,14 @@ class TensorListStack : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("num_elements", &num_elements_));
   }
 
-  ~TensorListStack() {}
-
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    OP_REQUIRES(
-        c,
-        !tensor_list->tensors.empty() ||
-            tensor_list->element_shape.IsFullyDefined(),
-        errors::InvalidArgument("Tried to stack elements of a empty ",
-                                "list with non-fully-defined element_shape: ",
-                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
                   errors::InvalidArgument(
@@ -113,44 +102,40 @@ class TensorListStack : public OpKernel {
                       " elements but got a list with ",
                       tensor_list->tensors.size(), " elements."));
     }
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // element tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first
-    // initialized element tensor is used and it is checked that all other
-    // initialized tensors have the same shape. An error is thrown if the list
-    // only contains DT_INVALID type tensors.
-    TensorShape resulting_element_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_element_shape)) {
-      bool resulting_element_shape_initialized = false;
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 1,
+                                               &partial_element_shape));
+    OP_REQUIRES(
+        c,
+        partial_element_shape.IsFullyDefined() || !tensor_list->tensors.empty(),
+        errors::InvalidArgument("Tried to stack elements of an empty ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
       for (int i = 0; i < tensor_list->tensors.size(); ++i) {
         const Tensor& t = tensor_list->tensors[i];
-        if (!resulting_element_shape_initialized) {
-          if (t.dtype() == DT_INVALID) {
-            continue;
-          }
-          resulting_element_shape = t.shape();
-          resulting_element_shape_initialized = true;
-          continue;
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
         }
-        OP_REQUIRES(
-            c, t.dtype() == DT_INVALID || t.shape() == resulting_element_shape,
-            errors::InvalidArgument(
-                "Tried to stack tensors with unequal shapes: ",
-                resulting_element_shape.DebugString(), " vs ",
-                t.shape().DebugString()));
       }
-      OP_REQUIRES(
-          c, resulting_element_shape_initialized,
-          errors::InvalidArgument("Tried to stack list which only contains ",
-                                  "uninitialized tensors and has a ",
-                                  "non-fully-defined element_shape: ",
-                                  tensor_list->element_shape.DebugString()));
     }
-    TensorShape output_tensor_shape = resulting_element_shape;
-    output_tensor_shape.InsertDim(0, tensor_list->tensors.size());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(c, partial_element_shape.AsTensorShape(&element_shape),
+                errors::InvalidArgument(
+                    "Tried to stack list which only contains uninitialized ",
+                    "tensors and has a non-fully-defined element_shape: ",
+                    partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
@@ -158,13 +143,21 @@ class TensorListStack : public OpKernel {
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
     Tensor zeros;
-    BuildZerosTensor<Device, T>(c, element_dtype_, resulting_element_shape,
-                                &zeros);
     for (const auto& t : tensor_list->tensors) {
       if (t.dtype() != DT_INVALID) {
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             t.shaped<T, 2>({1, t.NumElements()})));
       } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             const_cast<const Tensor&>(zeros).shaped<T, 2>(
                 {1, zeros.NumElements()})));
@@ -186,6 +179,122 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListGetItem : public OpKernel {
+ public:
+  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    int32 index = c->input(1).scalar<int32>()();
+    OP_REQUIRES(c, index < l->tensors.size(),
+                errors::InvalidArgument("Trying to access element ", index,
+                                        " in a list with ", l->tensors.size(),
+                                        " elements."));
+    if (l->tensors[index].dtype() != DT_INVALID) {
+      c->set_output(0, l->tensors[index]);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 2, &partial_element_shape));
+      TensorShape element_shape;
+      // If l->element_shape and the element_shape input are both not fully
+      // defined, try to infer the shape from other list elements. This requires
+      // that all initialized list elements have the same shape.
+      // NOTE(srbs): This might be a performance bottleneck since we are
+      // iterating over the entire list here. This is necessary for feature
+      // parity with TensorArray.read. TensorArray has a mode in which all
+      // elements are required to be of the same shape, TensorList does not.
+      // In that mode TensorArray sets the array's element_shape on the first
+      // write call. We could do something similar here if needed.
+      if (!partial_element_shape.IsFullyDefined()) {
+        for (const Tensor& t : l->tensors) {
+          if (t.dtype() != DT_INVALID) {
+            PartialTensorShape tmp = partial_element_shape;
+            OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+          }
+        }
+      }
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined: ",
+                                  partial_element_shape.DebugString(),
+                                  " and no list element is set."));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(0, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListPopBack : public OpKernel {
+ public:
+  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+
+    OP_REQUIRES(c, !l->tensors.empty(),
+                errors::InvalidArgument("Trying to pop from an empty list."));
+
+    const Tensor& t = l->tensors.back();
+    if (t.dtype() != DT_INVALID) {
+      c->set_output(1, t);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 1, &partial_element_shape));
+      TensorShape element_shape;
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined.",
+                                  partial_element_shape.DebugString()));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(1, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.pop_back();
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 template <typename Device, typename T>
 class TensorListConcat : public OpKernel {
  public:
@@ -206,76 +315,93 @@ class TensorListConcat : public OpKernel {
     }
   }
 
-  ~TensorListConcat() {}
-
   void Compute(OpKernelContext* c) override {
     // Check that the input Variant tensor is indeed a TensorList and has the
     // correct element type.
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    // If the TensorList is empty, its element_shape must be fully defined
-    // except for the first dimension.
-    if (!element_shape_except_first_dim_.IsFullyDefined()) {
-      if (!tensor_list->element_shape.unknown_rank()) {
-        OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
-                    errors::InvalidArgument(
-                        "Concat requires elements to be at least vectors, ",
-                        "found scalars instead."));
-        PartialTensorShape shape_except_first_dim(
-            gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
-                .subspan(1));
-        PartialTensorShape tmp = element_shape_except_first_dim_;
-        OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
-                                        &element_shape_except_first_dim_));
-      }
+    // The leading dimension of all list elements if they are all the same.
+    // This is used as the leading dim of uninitialized tensors in the list
+    // if leading_dims is not provided.
+    int64 first_dim = -1;
+    if (c->num_inputs() > 1) {
+      // TensorListConcatV2
+      PartialTensorShape element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *tensor_list, 1, &element_shape));
+      OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1,
+                  errors::InvalidArgument(
+                      "Concat requires elements to be at least vectors, ",
+                      "found scalars instead."));
+      // Split `element_shape` into `first_dim` and
+      // `element_shape_except_first_dim_`.
+      first_dim = element_shape.dim_size(0);
+      element_shape_except_first_dim_ = element_shape;
+      element_shape_except_first_dim_.RemoveDim(0);
     }
+    // If the TensorList is empty, element_shape_except_first_dim_ must be fully
+    // defined.
     OP_REQUIRES(c,
                 !tensor_list->tensors.empty() ||
                     element_shape_except_first_dim_.IsFullyDefined(),
                 errors::InvalidArgument(
                     "All except the first dimension must be fully defined ",
                     "when concating an empty tensor list. element_shape: ",
-                    tensor_list->element_shape.DebugString()));
-    // 1. Compute the shape of the output tensor.
-    // If `element_shape_except_first_dim_` is fully-defined we just prepend the
-    // leading dim to it. Otherwise we use the shape of the first element tensor
-    // and check to make sure shapes of all tensors are compatible.
-    TensorShape output_shape;
-    if (!element_shape_except_first_dim_.AsTensorShape(&output_shape)) {
-      const Tensor& element_tensor = tensor_list->tensors[0];
-      OP_REQUIRES(
-          c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
-          errors::InvalidArgument("Concat saw a scalar shape at index ", 0,
-                                  " but requires at least vectors."));
-      output_shape =
-          TensorShape(gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
-                          .subspan(1));
-      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
-        const Tensor& element_tensor = tensor_list->tensors[i];
-        OP_REQUIRES(
-            c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
-            errors::InvalidArgument("Concat saw a scalar shape at index ", i,
-                                    " but requires at least vectors."));
-        TensorShape actual_shape(
-            gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
-                .subspan(1));
-        OP_REQUIRES(c, actual_shape.dim_sizes() == output_shape.dim_sizes(),
-                    errors::InvalidArgument(
-                        "Tried to concat tensors with unequal shapes: ",
-                        output_shape.DebugString(), " vs ",
-                        actual_shape.DebugString()));
+                    element_shape_except_first_dim_.DebugString()));
+    // 1. Check that `element_shape_except_first_dim_` input tensor is
+    //    compatible with the shapes of element tensors.
+    // 2. Check that the elements have the same shape except the first dim.
+    // 3. If `first_dim` is known, check that it is compatible with the leading
+    //    dims of all elements.
+    // 4. If `first_dim` is unknown (-1), check whether all initialized
+    //    elements have the same leading dim and if so set `first_dim` to that
+    //    value.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      bool check_dim = (first_dim == -1);
+      int64 inferred_first_dim = first_dim;
+      for (int i = 0; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& t = tensor_list->tensors[i];
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = element_shape_except_first_dim_;
+          OP_REQUIRES(
+              c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+              errors::InvalidArgument("Concat saw a scalar shape at index ", i,
+                                      " but requires at least vectors."));
+          TensorShape shape_except_first_dim = TensorShape(
+              gtl::ArraySlice<int64>(t.shape().dim_sizes()).subspan(1));
+          OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
+                                          &element_shape_except_first_dim_));
+          OP_REQUIRES(c, first_dim == -1 || first_dim == t.shape().dim_size(0),
+                      errors::InvalidArgument(
+                          "First entry of element_shape input does not match ",
+                          "the first dim of list element at index: ", i,
+                          " Expected: ", first_dim,
+                          " Actual: ", t.shape().dim_size(0)));
+          if (check_dim) {
+            if (inferred_first_dim == -1) {
+              inferred_first_dim = t.shape().dim_size(0);
+            } else if (inferred_first_dim != t.shape().dim_size(0)) {
+              inferred_first_dim = -1;
+              check_dim = false;
+            }
+          }
+        }
       }
+      first_dim = inferred_first_dim;
     }
-    // 2. Build the lengths_tensor and leading dim of the output tensor by
+    TensorShape output_shape;
+    OP_REQUIRES(
+        c, element_shape_except_first_dim_.AsTensorShape(&output_shape),
+        errors::InvalidArgument(
+            "Trying to concat list with only uninitialized tensors ",
+            "but element_shape_except_first_dim_ is not fully defined: ",
+            element_shape_except_first_dim_.DebugString()));
+    // Build the lengths_tensor and leading dim of the output tensor by
     // iterating over all element tensors.
     Tensor* lengths_tensor = nullptr;
     OP_REQUIRES_OK(
@@ -286,13 +412,36 @@ class TensorListConcat : public OpKernel {
     auto lengths_tensor_vec = lengths_tensor->vec<int64>();
     int64 leading_dim = 0;
     for (size_t i = 0; i < tensor_list->tensors.size(); i++) {
-      int64 dim = tensor_list->tensors[i].shape().dim_size(0);
+      int64 dim;
+      if (tensor_list->tensors[i].dtype() != DT_INVALID) {
+        dim = tensor_list->tensors[i].shape().dim_size(0);
+      } else {
+        // If leading_dims is not provided or does not contain an entry for
+        // index i use the inferred `first_dim` if set.
+        if ((c->num_inputs() <= 2 || i >= c->input(2).NumElements()) &&
+            first_dim != -1) {
+          dim = first_dim;
+        } else {
+          OP_REQUIRES(c, c->num_inputs() > 2,
+                      errors::InvalidArgument(
+                          "Concating lists with uninitialized tensors is not ",
+                          "supported in this version of TensorListConcat. ",
+                          "Consider updating your GraphDef to run the newer ",
+                          "version."));
+          OP_REQUIRES(c, i < c->input(2).NumElements(),
+                      errors::InvalidArgument(
+                          "List contains uninitialized tensor at index ", i,
+                          " but leading_dims has only ",
+                          c->input(2).NumElements(), " elements."));
+          dim = c->input(2).vec<int64>()(i);
+        }
+      }
       leading_dim += dim;
       lengths_tensor_vec(i) = dim;
     }
     output_shape.InsertDim(0, leading_dim);
     Tensor* output;
-    // 3. Allocate the output tensor and fill it up with the concated element
+    // Allocate the output tensor and fill it up with the concated element
     // tensors.
     OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
@@ -301,9 +450,31 @@ class TensorListConcat : public OpKernel {
 
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
-    for (const auto& element_tensor : tensor_list->tensors) {
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+    // Store the zeros tensors in a vector to prevent them from being GC'ed till
+    // concat is complete.
+    std::vector<Tensor> zeros_vec;
+    for (int i = 0; i < tensor_list->tensors.size(); i++) {
+      const Tensor& element_tensor = tensor_list->tensors[i];
+      if (element_tensor.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+      } else {
+        AllocatorAttributes attr;
+        if (element_dtype_ == DT_VARIANT) {
+          attr.set_on_host(true);
+        }
+        TensorShape element_shape = output_shape;
+        element_shape.set_dim(0, lengths_tensor_vec(i));
+        zeros_vec.emplace_back();
+        Tensor& zeros = zeros_vec.back();
+        OP_REQUIRES_OK(
+            c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+        functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                             zeros.flat<T>());
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -406,59 +577,55 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    Tensor indices = c->input(1);
+    const Tensor& indices = c->input(1);
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 2,
+                                               &partial_element_shape));
     OP_REQUIRES(
-        c,
-        indices.NumElements() > 0 ||
-            tensor_list->element_shape.IsFullyDefined(),
+        c, partial_element_shape.IsFullyDefined() || indices.NumElements() > 0,
         errors::InvalidArgument("Tried to gather 0-elements from "
                                 "a list with non-fully-defined shape: ",
-                                tensor_list->element_shape.DebugString()));
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // requested tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first requested
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const int i = indices.flat<int32>()(0);
-      OP_REQUIRES(
-          c, i < tensor_list->tensors.size(),
-          errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  tensor_list->tensors.size(), " elements."));
-      const Tensor& t = tensor_list->tensors[i];
-      resulting_shape = t.shape();
-      for (int index = 1; index < indices.NumElements(); ++index) {
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      for (int index = 0; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to gather elements with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+        }
       }
     }
-    resulting_shape.InsertDim(0, indices.NumElements());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(
+        c, partial_element_shape.AsTensorShape(&element_shape),
+        errors::InvalidArgument("Tried to gather uninitialized tensors from a ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, indices.NumElements());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(tensor_list->tensors.size());
+    inputs_flat.reserve(indices.NumElements());
+    Tensor zeros;
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
@@ -466,8 +633,24 @@ class TensorListGather : public OpKernel {
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
                                   tensor_list->tensors.size(), " elements."));
       const Tensor& t = tensor_list->tensors[i];
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -529,6 +712,81 @@ class TensorListFromTensor : public OpKernel {
   }
 };
 
+// Scatters values in `value` into `list`. Assumes that `indices` are valid.
+template <typename Device, typename T>
+Status Scatter(OpKernelContext* c, const Tensor& value, const Tensor& indices,
+               TensorList* list) {
+  for (int index = 0; index < indices.NumElements(); ++index) {
+    const int i = indices.flat<int32>()(index);
+    Tensor tmp = value.Slice(index, index + 1);
+    TensorShape tmp_shape = tmp.shape();
+    tmp_shape.RemoveDim(0);
+    if (!tmp.CopyFrom(tmp, tmp_shape)) {
+      return errors::Unknown("Unexpected shape error.");
+    }
+    // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+    // prevent this.
+    Tensor aligned;
+    TF_RETURN_IF_ERROR(c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+    // TODO(apassos) do all slices in a single kernel invocation instead of
+    // many small ones.
+    aligned.flat<T>().device(c->eigen_device<Device>()) =
+        tmp.unaligned_flat<T>();
+    std::swap(list->tensors[i], aligned);
+  }
+  return Status::OK();
+}
+
+template <typename Device, typename T>
+class TensorListScatterIntoExistingList : public OpKernel {
+ public:
+  TensorListScatterIntoExistingList(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    const Tensor& input_tensor = c->input(1);
+    const Tensor& indices = c->input(2);
+
+    // Check that inputs are valid.
+    OP_REQUIRES(c, input_tensor.dtype() == l->element_dtype,
+                errors::InvalidArgument(
+                    "Invalid data types; input tensor type: ",
+                    DataTypeString(input_tensor.dtype()),
+                    " list element_type: ", DataTypeString(l->element_dtype)));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument(
+                    "Expected indices to be a vector, but received shape: ",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(
+        c, indices.NumElements() == input_tensor.shape().dim_size(0),
+        errors::InvalidArgument(
+            "Expected len(indices) == tensor.shape[0], but saw: ",
+            indices.NumElements(), " vs. ", input_tensor.shape().dim_size(0)));
+
+    // Resize the list if needed to accommodate all indices.
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    const auto indices_vec = indices.vec<int32>();
+    int32 max_index =
+        (indices.NumElements() == 0)
+            ? -1
+            : *std::max_element(indices_vec.data(),
+                                indices_vec.data() + indices.NumElements());
+    if (max_index + 1 > output_list->tensors.size()) {
+      output_list->tensors.resize(max_index + 1);
+    }
+
+    // Scatter the values.
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, output_list));
+  }
+};
+
 template <typename Device, typename T>
 class TensorListScatter : public OpKernel {
  public:
@@ -542,6 +800,13 @@ class TensorListScatter : public OpKernel {
     Tensor indices = c->input(1);
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
+    // TensorListScatterV2 passes the num_elements input, TensorListScatter does
+    // not.
+    int num_elements = c->num_inputs() >= 4 ? c->input(3).scalar<int>()() : -1;
+    OP_REQUIRES(c, num_elements >= -1,
+                errors::InvalidArgument(
+                    "TensorListScatter expects num_elements >= -1, found: ",
+                    num_elements));
     TensorList output_list;
     const Tensor& input_tensor = c->input(0);
     output_list.element_dtype = input_tensor.dtype();
@@ -565,36 +830,27 @@ class TensorListScatter : public OpKernel {
 
     // Validate indices and resize output_list.tensors to fit the highest index.
     {
-      size_t list_size = 0;
+      int highest_index = -1;
       for (int index = 0; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
-        OP_REQUIRES(c, i >= 0,
+        OP_REQUIRES(
+            c, i >= 0,
+            errors::InvalidArgument(
+                "Indices in TensorListScatter must all be non-negative."));
+        OP_REQUIRES(c, num_elements == -1 || i < num_elements,
                     errors::InvalidArgument(
-                        "Indices in TensorListScatter must all be positive."));
-        if (i >= list_size) {
-          list_size = i + 1;
+                        "TensorListScatter: Trying to scatter at index ", i,
+                        " in list with size ", num_elements));
+        if (i > highest_index) {
+          highest_index = i;
         }
       }
-      output_list.tensors.resize(list_size, Tensor(DT_INVALID));
+      output_list.tensors.resize(std::max(highest_index + 1, num_elements),
+                                 Tensor(DT_INVALID));
     }
 
-    for (int index = 0; index < indices.NumElements(); ++index) {
-      const int i = indices.flat<int32>()(index);
-      Tensor tmp = input_tensor.Slice(index, index + 1);
-      TensorShape tmp_shape = tmp.shape();
-      tmp_shape.RemoveDim(0);
-      OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
-                  errors::Unknown("Unexpected shape error."));
-      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
-      // prevent this.
-      Tensor aligned;
-      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-      // TODO(apassos) do all slices in a single kernel invocation instead of
-      // many small ondes.
-      aligned.flat<T>().device(c->eigen_device<Device>()) =
-          tmp.unaligned_flat<T>();
-      std::swap(output_list.tensors[i], aligned);
-    }
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, &output_list));
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
 };
@@ -657,8 +913,6 @@ class TensorListPushBackBatch : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
   }
 
-  ~TensorListPushBackBatch() override {}
-
   void Compute(OpKernelContext* c) override {
     const Tensor& input = c->input(1);
     OP_REQUIRES(c, element_dtype_ == input.dtype(),
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 2599340d78a5308cbd63338db84e569f12541a4b..e611ae28b9a21d297cac179f24d343a4e5248ec9 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/logging_ops.h"
+
 #include <iostream>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -48,6 +51,22 @@ Status AppendStringToFile(const std::string& fname, StringPiece data,
 
 }  // namespace
 
+namespace logging {
+
+typedef std::vector<void (*)(const char*)> Listeners;
+
+Listeners* GetListeners() {
+  static Listeners* listeners = new Listeners;
+  return listeners;
+}
+
+bool RegisterListener(void (*listener)(const char*)) {
+  GetListeners()->push_back(listener);
+  return true;
+}
+
+}  // end namespace logging
+
 class AssertOp : public OpKernel {
  public:
   explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -157,7 +176,12 @@ class PrintV2Op : public OpKernel {
       OP_REQUIRES_OK(ctx, AppendStringToFile(file_path_, msg, ctx->env()));
       return;
     }
-    if (output_stream_ == "stdout") {
+    auto listeners = logging::GetListeners();
+    if (!listeners->empty()) {
+      for (auto& listener : *listeners) {
+        listener(msg.c_str());
+      }
+    } else if (output_stream_ == "stdout") {
       std::cout << msg << std::endl;
     } else if (output_stream_ == "stderr") {
       std::cerr << msg << std::endl;
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/kernels/logging_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..92a8d63409478e7a0c162ae84361f7e2215aea46
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+
+namespace tensorflow {
+
+namespace logging {
+
+// Register a listener method to call on any printed messages.
+// Returns true if it is successfully registered.
+bool RegisterListener(void (*listener)(const char*));
+
+}  // namespace logging
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 9198a98e4785c31cfebd035d457d0d4b5d9b5c27..5f1b117f338f88fbf7bf06f64ae84404fe88ea7e 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -69,12 +69,12 @@ class LogisticLossUpdater : public DualLossUpdater {
     if (y_wx > 0) {
       // 0 + log(e^(0) + e^(-ywx - 0))
       // log(1 + e^(-ywx))
-      return log(1 + exp(-y_wx)) * example_weight;
+      return log1p(exp(-y_wx)) * example_weight;
     }
     // -ywx + log(e^(ywx) + e^(-ywx + ywx))
     // log(e^(ywx) + e^(0)) - ywx
     // log(1 + e^(ywx)) - ywx
-    return (log(1 + exp(y_wx)) - y_wx) * example_weight;
+    return (log1p(exp(y_wx)) - y_wx) * example_weight;
   }
 
   // Derivative of logistic loss
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 9451247f2684892f4666f77128d5721be9a2baa7..b046401c0ae397682a7e0e780e15c9c9f75a7524 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 #define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/lookup_interface.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/lookup_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/lookup_tables/BUILD b/tensorflow/core/kernels/lookup_tables/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a25660e987ab80de58cee05551a98d0f00ea4268
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/BUILD
@@ -0,0 +1,89 @@
+# Description:
+#   OpKernels and resource templates for lookup tables.
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "resource_interface_templates",
+    hdrs = ["resource_interface_templates.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "op_kernel_templates",
+    hdrs = ["op_kernel_templates.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/meta:type_traits",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_kernel_library(
+    name = "fingerprint64_map_op_kernels",
+    srcs = [
+        "fingerprint64_map_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_hash_map_op_kernels",
+    srcs = [
+        "flat_hash_map_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_kernel_library(
+    name = "generic_table_op_kernels",
+    srcs = [
+        "generic_table_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:string_view_variant_wrapper",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36274bc6b63c6efd871f360f4234133360cf8fd1
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Map x -> (Fingerprint64(x) % num_oov_buckets) + offset.
+// num_oov_buckets and offset are node attributes provided at construction
+// time.
+template <typename KeyType, typename ValueType>
+class Fingerprint64Map final
+    : public virtual LookupInterface<ValueType*, const KeyType&>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const KeyType>> {
+ public:
+  using key_type = KeyType;
+
+  Fingerprint64Map(int64 num_oov_buckets, int64 offset)
+      : num_oov_buckets_(num_oov_buckets), offset_(offset) {}
+
+  Status Lookup(const KeyType& key_to_find, ValueType* value) const override {
+    *value = LookupHelper(key_to_find);
+    return Status::OK();
+  }
+
+  Status Lookup(absl::Span<const KeyType> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i] = LookupHelper(keys[i]);
+    }
+    return Status::OK();
+  }
+
+  mutex* GetMutex() const override { return nullptr; }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const KeyType& key_to_find) const {
+    // This can cause a downcast.
+    return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                  num_oov_buckets_) +
+           offset_;
+  }
+
+  const int64 num_oov_buckets_;
+  const int64 offset_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Fingerprint64Map);
+};
+
+template <typename Fingerprint64Map>
+struct Fingerprint64MapFactory {
+  struct Functor {
+    using resource_type = Fingerprint64Map;
+
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    Fingerprint64Map** container) {
+      int64 num_oov_buckets;
+      int64 offset;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(kernel->def(), "num_oov_buckets", &num_oov_buckets));
+      TF_RETURN_IF_ERROR(GetNodeAttr(kernel->def(), "offset", &offset));
+      *container = new Fingerprint64Map(num_oov_buckets, offset);
+      return Status::OK();
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename Fingerprint64MapFactory<
+        Fingerprint64Map<KeyType, ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const KeyType&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const KeyType>>>;
+
+#define REGISTER_STRING_KERNEL(ValueType)                     \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype") \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<absl::string_view, ValueType>);              \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<string>("heterogeneous_key_dtype")  \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<string, ValueType>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c37ca87cea58d6bd72cc2b71c9fd934eae64081
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
@@ -0,0 +1,275 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <type_traits>
+#include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/fingerprint.h"
+
+namespace tensorflow {
+namespace tables {
+
+using errors::InvalidArgument;
+
+// absl::flat_hash_map<HeterogeneousKeyType, ValueType> backed table with inline
+// fallback to x -> (Fingerprint64(x) % num_oov_buckets) + offset when looked
+// up keys are not in the flat_hash_map. Inlining the fallback table turns out
+// to be quite efficient in comparison to virtual dispatch for the fallback
+// lookup.
+template <typename ValueType>
+class StaticStringFlatHashMap final
+    : public virtual LookupInterface<ValueType*, const absl::string_view&>,
+      public virtual LookupInterface<ValueType*, const string&>,
+      public virtual LookupWithPrefetchInterface<
+          absl::Span<ValueType>, absl::Span<const absl::string_view>>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const string>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const absl::string_view>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const string>>,
+      public virtual SizeInterface {
+ public:
+  using value_type = ValueType;
+
+  StaticStringFlatHashMap(bool enable_synchronization, int64 num_oov_buckets)
+      : num_oov_buckets_(num_oov_buckets) {
+    if (enable_synchronization) {
+      mutex_ = absl::make_unique<mutex>();
+    }
+  }
+
+  Status Initialize(absl::Span<const absl::string_view> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(string(keys[i]), values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Initialize(absl::Span<const string> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(keys[i], values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Lookup(const absl::string_view& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  Status Lookup(const string& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const absl::string_view> keys,
+                absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const string> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  uint64 Size() const override { return table_.size(); }
+
+  mutex* GetMutex() const override { return mutex_.get(); }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  template <typename T>
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const T& key_to_find) const {
+    auto it = table_.find(key_to_find);
+    if (it != table_.end()) {
+      return it->second;
+    } else {
+      return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                    num_oov_buckets_) +
+             StaticStringFlatHashMap::Size();
+    }
+  }
+
+  const int64 num_oov_buckets_;
+  std::unique_ptr<mutex> mutex_;
+  // The underlying table.
+  absl::flat_hash_map<string, ValueType> table_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StaticStringFlatHashMap);
+};
+
+// Used to allocate StaticStringFlatHashMap objects via the AllocateContainer
+// method.
+template <typename StaticStringFlatHashMap>
+struct StaticStringFlatHashMapFactory {
+  struct Functor {
+    using resource_type = StaticStringFlatHashMap;
+
+    template <typename StaticStringFlatHashMapBase>
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    StaticStringFlatHashMapBase** container) {
+      OpInputList table_int64_args;
+      TF_RETURN_IF_ERROR(
+          ctx->input_list("table_int64_args", &table_int64_args));
+      const size_t variadic_arg_size = table_int64_args.size();
+      if (ABSL_PREDICT_FALSE(variadic_arg_size != 2)) {
+        return errors::InvalidArgument(
+            "table_int64_args should have 2 elements (found ",
+            variadic_arg_size,
+            "). Set the first element to 1 to enable synchronized table use "
+            "and to 0 otherwise. The second element should be "
+            "num_oov_buckets.");
+      }
+
+      const bool enable_synchronization = ctx->input(0).scalar<int64>()() != 0;
+      const int64 num_oov_buckets = ctx->input(1).scalar<int64>()();
+      if (ABSL_PREDICT_FALSE(num_oov_buckets <= 0)) {
+        return errors::InvalidArgument(
+            "num_oov_buckets must be positive. Found: ", num_oov_buckets);
+      }
+      auto* non_virtual_container =
+          new StaticStringFlatHashMap(enable_synchronization, num_oov_buckets);
+      *container = non_virtual_container;
+      const Tensor& keys = ctx->input(table_int64_args.size());
+      const Tensor& values = ctx->input(table_int64_args.size() + 1);
+      if (keys.NumElements() == 0) {
+        return Status::OK();
+      } else if (keys.dtype() == DT_STRING) {
+        return Functor::Initialize(
+            keys.flat<string>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      } else if (keys.dtype() == DT_VARIANT) {
+        auto keys_flat = keys.flat<Variant>();
+        if (keys_flat(0).get<absl::string_view>() == nullptr) {
+          return errors::InvalidArgument(
+              "Variant keys tensor must have subtype absl::string_view.");
+        }
+        return Functor::Initialize(
+            keys.flat<Variant>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      }
+      return errors::InvalidArgument(
+          "keys tensor must have type DT_STRING or type DT_VARIANT with "
+          "subtype absl::string_view.");
+    }
+
+    static Status Initialize(
+        const absl::Span<const string> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      return container->Initialize(keys, values);
+    }
+
+    static Status Initialize(
+        const absl::Span<const Variant> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      std::vector<typename absl::string_view> keys_vec;
+      keys_vec.reserve(keys.size());
+      for (size_t i = 0; i < keys.size(); ++i) {
+        keys_vec.push_back(*keys[i].get<absl::string_view>());
+      }
+      return container->Initialize(keys_vec, values);
+    }
+  };
+};
+
+template <typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename StaticStringFlatHashMapFactory<
+        StaticStringFlatHashMap<ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const absl::string_view&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const absl::string_view>>,
+    LookupInterface<ValueType*, const string&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const string>>,
+    SizeInterface>;
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("StaticStringFlatHashMap")                              \
+          .Device(DEVICE_CPU)                                      \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype")      \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"), \
+      ResourceOp<table_value_dtype>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb29afd19a3fd2b03171e6a3d97555e34d3b35b
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
@@ -0,0 +1,227 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <type_traits>
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tables {
+
+template <typename KeyType, typename ValueType>
+struct TensorInsertFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = InsertOrAssignInterface<
+        absl::Span<const ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorInsert(const Tensor& keys, const Tensor& values,
+                               resource_type* table) {
+      if (keys.NumElements() != values.NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values.NumElements());
+      }
+      return TensorInsertHelper(keys, values, table);
+    }
+
+   private:
+    // keys and *values arguments to TensorInsert must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    // Non-variant KeyType which is the same as Container::key_type.
+    // No need to static_cast.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      return table->InsertOrAssign(keys.flat<KeyType>(),
+                                   values.flat<ValueType>());
+    }
+
+    // Variant KeyType; the wrapped type is convertible to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      keys_vec.reserve(keys_flat.size());
+      for (size_t i = 0; i < keys_flat.size(); ++i) {
+        keys_vec.emplace_back(
+            *keys_flat(i).get<typename VariantSubType::value_type>());
+      }
+      return table->InsertOrAssign(keys_vec, values.flat<ValueType>());
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using InsertOp = LookupTableInsertOp<
+    typename TensorInsertFactory<KeyType, ValueType>::Functor>;
+
+template <typename KeyType, typename ValueType>
+struct TensorLookupFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = LookupWithPrefetchInterface<
+        absl::Span<ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorLookup(const resource_type& table, const Tensor& keys,
+                               const int64 prefetch_lookahead,
+                               const int64 num_keys_per_thread,
+                               thread::ThreadPool* threadpool, Tensor* values) {
+      if (keys.NumElements() != values->NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values->NumElements());
+      }
+      return TensorLookupHelper(table, keys, prefetch_lookahead,
+                                num_keys_per_thread, threadpool, values);
+    }
+
+   private:
+    // keys and *values arguments to TensorLookup must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<KeyType>();
+      auto key_span = absl::MakeSpan(keys_flat);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Non-simple KeyType. We'll try an implicit conversion to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      const auto keys_size = keys_flat.size();
+      keys_vec.reserve(keys_size);
+      for (size_t i = 0; i < keys_size; ++i) {
+        keys_vec.emplace_back(*keys_flat(i).get<VariantSubType>()->get());
+      }
+      absl::Span<const typename VariantSubType::value_type> key_span(keys_vec);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Wrapper around table.BatchLookup which permits sharding across cores.
+    template <typename K, typename V>
+    static Status MultithreadedTensorLookup(const resource_type& table,
+                                            int64 prefetch_lookahead,
+                                            int64 num_keys_per_thread, K keys,
+                                            V values,
+                                            thread::ThreadPool* threadpool) {
+      mutex temp_mutex;  // Protect status.
+      Status status;
+      auto lookup_keys = [&](int64 begin, int64 end) {
+        auto temp_status = table.Lookup(keys.subspan(begin, end - begin),
+                                        values.subspan(begin, end - begin),
+                                        prefetch_lookahead);
+        if (ABSL_PREDICT_FALSE(!temp_status.ok())) {
+          mutex_lock lock(temp_mutex);
+          status.Update(temp_status);
+        }
+      };
+      threadpool->TransformRangeConcurrently(
+          num_keys_per_thread /* block_size */, keys.size(), lookup_keys);
+      return status;
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using LookupOp = LookupTableFindOp<
+    typename TensorLookupFactory<KeyType, ValueType>::Functor>;
+
+struct TableSizeFunctor {
+  using resource_type = SizeInterface;
+
+  static Status Size(const SizeInterface& table, uint64* size) {
+    *size = table.Size();
+    return Status::OK();
+  }
+};
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                     \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("insert_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("insert_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("lookup_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("lookup_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(Name("ContainerSizeOp").Device(DEVICE_CPU), \
+                          ContainerSizeOp<TableSizeFunctor>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..d767ca0661e1fad285729b6b68683395908b4096
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
@@ -0,0 +1,448 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+
+#include <cstddef>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/meta/type_traits.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Create resources of type ResourceType and AliasesToRegister using
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// ResourceType**). ResourceType = Functor::resource_type.
+// No-op for resources which have already been created.
+template <typename Functor, typename... AliasesToRegister>
+class ResourceConstructionOp : public OpKernel {
+ public:
+  explicit ResourceConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx,
+                    this](ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      ResourceType* resource = nullptr;
+      auto status = Functor::AllocateContainer(ctx, this, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* resource = nullptr;
+    core::ScopedUnref unref_me(resource);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &resource, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(resource)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~ResourceConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConstructionOp);
+};
+
+// Create resources of type ContainerBase using the static method
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// FallbackTableBaseType*, ContainerBase**)
+// If the resource has already been created it will be looked up.
+// Container must decrease the reference count of the FallbackTableBaseType*
+// constructor argument before its destructor completes.
+template <typename Functor, typename... AliasesToRegister>
+class TableWithFallbackConstructionOp : public OpKernel {
+ public:
+  explicit TableWithFallbackConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    if (ctx->num_inputs() == table_int64_args.size()) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Expected op to have a resource input after the table_int64_args "
+          "input but no such input found."));
+      return;
+    }
+
+    // Look up the fallback table.
+    FallbackTableBaseType* fallback_table = nullptr;
+    {
+      const Tensor& table_handle = ctx->input(table_int64_args.size());
+      ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+      OP_REQUIRES_OK(
+          ctx, ctx->resource_manager()->Lookup<FallbackTableBaseType, true>(
+                   handle.container(), handle.name(), &fallback_table));
+    }
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx, this, fallback_table](
+                       ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // container construction logic can't be merged with
+      // ResourceConstructionOp because Container constructor requires an
+      // input which can only be constructed if the resource manager
+      // internal lock is not already held.
+      ResourceType* resource = nullptr;
+      auto status =
+          Functor::AllocateContainer(ctx, this, fallback_table, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* table = nullptr;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &table, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(table)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~TableWithFallbackConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  using FallbackTableBaseType = typename Functor::fallback_table_type;
+
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TableWithFallbackConstructionOp);
+};
+
+// Lookup a table of type ResourceAlias and insert the passed in keys and
+// values tensors using Functor::TensorInsert(keys, values, table).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableInsertOp : public OpKernel {
+ public:
+  explicit LookupTableInsertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    const size_t tensor_index_offset = table_int64_args.size();
+    // Business logic for checking tensor shapes, etc, is delegated to the
+    // Functor.
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& values = ctx->input(tensor_index_offset + 2);
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    int memory_used_before = 0;
+    if (ctx->track_allocations()) {
+      memory_used_before = table->MemoryUsed();
+    }
+    auto* mutex = table->GetMutex();
+    if (mutex != nullptr) {
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    }
+    if (ctx->track_allocations()) {
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(LookupTableInsertOp);
+};
+
+// Lookup a table of type ResourceAlias and look up the passed in keys using
+// Functor::TensorLookup(
+//     table, keys, prefetch_lookahead, num_keys_per_thread, threadpool, out).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableFindOp : public OpKernel {
+ public:
+  explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    {
+      auto status = ctx->input_list("table_int64_args", &table_int64_args);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+    // We lookup tensors using positional indices because that's more
+    // efficient than looking up their string names.
+    const Tensor& prefetch_lookahead_t = ctx->input(0);
+    const size_t tensor_index_offset = table_int64_args.size();
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& num_threads = ctx->input(tensor_index_offset + 2);
+
+    TensorShape output_shape = keys.shape();
+    Tensor* out;
+    {
+      auto status = ctx->allocate_output(0, output_shape, &out);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+
+    int64 num_threads_scalar;
+    if (TensorShapeUtils::IsScalar(num_threads.shape())) {
+      num_threads_scalar = num_threads.template scalar<int64>()();
+    } else {
+      // Scans through rows of num_threads and returns second entry of first
+      // row whose first entry is <= the number of keys to process.
+      // This allows the user to control parallelism as a function of
+      // the number of keys to lookup.
+      num_threads_scalar = tensor_flag_utils::FindConfigValueForKey<int64, int>(
+          num_threads.template matrix<int64>(), keys.dim_size(0));
+    }
+    const int64 num_keys_per_thread =
+        num_threads_scalar > 0
+            ? std::max(1ll, keys.dim_size(0) / num_threads_scalar)
+            : keys.dim_size(0);
+
+    const int64 prefetch_lookahead = prefetch_lookahead_t.scalar<int64>()();
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    auto* mutex = table->GetMutex();
+    auto* threadpool = ctx->device()->tensorflow_cpu_worker_threads()->workers;
+    if (mutex != nullptr) {
+      // There are many subtle problems with using reader locks so we opt for a
+      // writer lock here.
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    } else {
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    }
+  }
+};
+
+// Lookup a container of type ResourceAlias and return its size using
+// Functor::Size(container, &size).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class ContainerSizeOp : public OpKernel {
+ public:
+  explicit ContainerSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& container_handle = ctx->input(0);
+    ResourceHandle handle(container_handle.scalar<ResourceHandle>()());
+    ResourceAlias* container;
+    core::ScopedUnref unref_me(container);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &container));
+
+    Tensor* out;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+
+    auto* mutex = container->GetMutex();
+    if (mutex != nullptr) {
+      tf_shared_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    }
+  }
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..7331fb400a4734db19a262503dffa38fb0f71466
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Interface for resources with mutable state.
+class SynchronizedInterface : public virtual ResourceBase {
+ public:
+  // Return value should be used to synchronize read/write access to
+  // all public methods. If null, no synchronization is needed.
+  virtual mutex* GetMutex() const = 0;
+};
+
+// Interface for containers which support batch lookups.
+template <typename ValueType, typename... KeyContext>
+class InsertOrAssignInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Stores each KV pair {keys[i], values[i]} in the underlying map, overriding
+  // pre-existing pairs which have equivalent keys.
+  // keys and values should have the same size.
+  virtual Status InsertOrAssign(KeyContext... key_context,
+                                ValueType values) = 0;
+};
+
+// Interface for containers which support lookups.
+template <typename ValueType, typename... KeyContext>
+class LookupInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values) const = 0;
+};
+
+// Interface for containers which support lookups with prefetching.
+template <typename ValueType, typename... KeyContext>
+class LookupWithPrefetchInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values,
+                        int64 prefetch_lookahead) const = 0;
+};
+
+// Interface for containers with size concepts.
+// Implementations must guarantee thread-safety when GetMutex is used to
+// synchronize method access.
+class SizeInterface : public virtual SynchronizedInterface {
+ public:
+  // Returns the number of elements in the container.
+  virtual uint64 Size() const = 0;
+};
+
+// Interface for tables which can be initialized from key and value arguments.
+template <typename ValueType, typename... KeyContext>
+class KeyValueTableInitializerInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Initialize(KeyContext... key_context, ValueType values) = 0;
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 30fe4b077a368fe7c272e3ea570100923b104c75..c3b80f04ed2e3dfe71550bfd6ccf87595343b1c1 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -163,7 +163,7 @@ class TextFileLineIterator
 
   int64 total_size() const override {
     if (vocab_size_ == -1) {
-      int64 new_size;
+      int64 new_size = -1;
       Status status = GetNumLinesInTextFile(env_, filename_, &new_size);
       if (!status.ok()) {
         LOG(WARNING) << "Unable to get line count: " << status;
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index f405ca3c58cfffc8422dcdd65e66c7fd12784519..ba30432e21a12d66c69217bec0c75660a0ae83ec 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
index 5d8c5c21ca21f097cb5030b43e288765ae384eaf..496c697ac3fbbc4c06a4c24f9521eba3c0cfeb23 100644
--- a/tensorflow/core/kernels/lrn_op_test.cc
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -102,7 +102,7 @@ TEST_F(LRNFloatTest, Depth96) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 96}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
@@ -138,7 +138,7 @@ TEST_F(LRNFloatTest, Depth16) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 16}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 27a8696e54647e14eda209c36b7b49c1d171d3bc..82b1397d243cbd2e1c2a113588e22e27621781c3 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -582,7 +582,6 @@ class MapUnstageOp : public OpKernel {
 
     const Tensor* key_tensor;
     const Tensor* indices_tensor;
-    OpInputList values_tensor;
 
     OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
     OP_REQUIRES_OK(ctx, ctx->input("indices", &indices_tensor));
@@ -644,7 +643,6 @@ class MapPeekOp : public OpKernel {
 
     const Tensor* key_tensor;
     const Tensor* indices_tensor;
-    OpInputList values_tensor;
 
     OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
     OP_REQUIRES_OK(ctx, ctx->input("indices", &indices_tensor));
diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
index 35037b8e142abe582eb464dcce62483a698848b2..4abf666fad9754d088175dfc74e4af39dafeec7c 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
@@ -73,16 +73,18 @@ struct MatrixSetDiag<GPUDevice, Scalar> {
     if (input.data() == output.data()) {
       CudaLaunchConfig config =
           GetCudaLaunchConfig(batch_size * minsize, device);
-      MatrixSetDiagKernel<Scalar>
-          <<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-              config.virtual_thread_count, m, n, minsize, diag.data(),
-              output.data());
+      TF_CHECK_OK(CudaLaunchKernel(MatrixSetDiagKernel<Scalar>,
+                                   config.block_count, config.thread_per_block,
+                                   0, device.stream(),
+                                   config.virtual_thread_count, m, n, minsize,
+                                   diag.data(), output.data()));
     } else {
       CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device);
-      MatrixCopyInputAndSetDiagKernel<Scalar>
-          <<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
-              config.virtual_thread_count, m, n, minsize, input.data(),
-              diag.data(), output.data());
+      TF_CHECK_OK(CudaLaunchKernel(MatrixCopyInputAndSetDiagKernel<Scalar>,
+                                   config.block_count, config.thread_per_block,
+                                   0, device.stream(),
+                                   config.virtual_thread_count, m, n, minsize,
+                                   input.data(), diag.data(), output.data()));
     }
   }
 };
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index ab235843f741a7f8bbfb7fa97cbe438ec5212b72..fcca2f718d8ce48364010e767bd84f739729a63b 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -22,13 +22,13 @@ limitations under the License.
 #include <vector>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -58,7 +58,15 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params) {
+    const PoolParameters& params, const bool include_batch_in_index) {
+  if (input_backprop != nullptr) {
+    OP_REQUIRES(
+        context, include_batch_in_index,
+        errors::Internal(
+            "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
+            "to be True when when input_backprop != nullptr"));
+  }
+
   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
       ConstEigenMatrixMap;
   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -90,7 +98,8 @@ static void SpatialMaxPoolWithArgMaxHelper(
   //    and updates the corresponding column(s) in output_as_matrix with the
   //    max value.
   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
-                &output_arg_max, &out_backprop](int64 start, int64 limit) {
+                &output_arg_max, &out_backprop,
+                include_batch_in_index](int64 start, int64 limit) {
     const int32 depth = params.depth;
     const int32 in_rows = params.tensor_in_rows;
     const int32 in_cols = params.tensor_in_cols;
@@ -143,8 +152,11 @@ static void SpatialMaxPoolWithArgMaxHelper(
                 if (output_ref < input_ref ||
                     out_arg_max_ref == kInvalidMaxPoolingIndex) {
                   output_ref = input_ref;
-                  int64 input_offset = in_index * depth + d;
-                  out_arg_max_ref = input_offset;
+                  if (include_batch_in_index) {
+                    out_arg_max_ref = in_index * depth + d;
+                  } else {
+                    out_arg_max_ref = (h * in_cols + w) * depth + d;
+                  }
                 }
               }
             }
@@ -295,7 +307,7 @@ class MaxPoolingGradOp : public OpKernel {
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params);
+        out_backprop, params, true);
   }
 
  private:
@@ -875,10 +887,11 @@ template <typename T>
 struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& input, Tensor* output, Tensor* argmax,
-                     bool propagate_nans) {
+                     bool propagate_nans, bool include_batch_in_index) {
     Tensor unused;
-    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
-        context, output, argmax, nullptr, input, unused, params);
+    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(context, output, argmax,
+                                                 nullptr, input, unused, params,
+                                                 include_batch_in_index);
   }
 };
 
@@ -899,7 +912,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
-
+    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
+                                             &include_batch_in_index_));
     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                    &propagate_nans_));
   }
@@ -921,7 +935,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
 
     LaunchMaxPoolingWithArgmax<Device, T>::launch(
-        context, params, tensor_in, output, argmax, propagate_nans_);
+        context, params, tensor_in, output, argmax, propagate_nans_,
+        include_batch_in_index_);
   }
 
  private:
@@ -929,6 +944,7 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   bool propagate_nans_;
+  bool include_batch_in_index_;
 };
 
 template <typename Device, typename T>
@@ -941,11 +957,12 @@ struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
 
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& grad_in, const Tensor& argmax,
-                     Tensor* grad_out) {
+                     Tensor* grad_out, const bool include_batch_in_index) {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *(context->device()->tensorflow_cpu_worker_threads());
 
-    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
+    auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index](
+                     int64 start, int64 limit) {
       const int64 batch_size =
           GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
       const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
@@ -965,7 +982,11 @@ struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
         const int input_start = start * input_size_per_batch;
         const int input_end = limit * input_size_per_batch;
         for (int64 index = input_start; index < input_end; index++) {
-          const int64 grad_out_index = argmax_flat(index);
+          int64 grad_out_index = argmax_flat(index);
+          if (!include_batch_in_index) {
+            const int64 cur_batch = index / input_size_per_batch;
+            grad_out_index += cur_batch * output_size_per_batch;
+          }
           CHECK(grad_out_index >= output_start && grad_out_index < output_end)
               << "Invalid output gradient index: " << grad_out_index << ", "
               << output_start << ", " << output_end;
@@ -1005,6 +1026,8 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
+                                             &include_batch_in_index_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1022,10 +1045,10 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
                            params.tensor_in_cols, params.depth});
     Tensor* grad_out = nullptr;
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {1}, 0, out_shape, &grad_out));
+                                {0}, 0, out_shape, &grad_out));
 
-    LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
-                                                      argmax, grad_out);
+    LaunchMaxPoolingGradWithArgmax<Device, T>::launch(
+        context, params, grad_in, argmax, grad_out, include_batch_in_index_);
   }
 
  private:
@@ -1033,6 +1056,7 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
+  bool include_batch_in_index_;
 };
 
 template <typename Device, typename T>
@@ -1055,6 +1079,8 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
+                                             &include_batch_in_index_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1073,16 +1099,17 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 
     Tensor* grad_out = nullptr;
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {1}, 0, out_shape, &grad_out));
+                                {0}, 0, out_shape, &grad_out));
 
     LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch(
-        context, params, grad_in, argmax, grad_out);
+        context, params, grad_in, argmax, grad_out, include_batch_in_index_);
   }
 
  private:
   std::vector<int32> ksize_;
   std::vector<int32> stride_;
   Padding padding_;
+  bool include_batch_in_index_;
 };
 
 #if GOOGLE_CUDA
@@ -1279,7 +1306,7 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
         params.out_width, params.window_rows, params.window_cols,
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
         output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
-        propagate_nans);
+        propagate_nans, false);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardNoMask"));
@@ -1291,7 +1318,7 @@ template <typename T>
 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& input, Tensor* output, Tensor* argmax,
-                     bool propagate_nans) {
+                     bool propagate_nans, bool include_batch_in_index) {
     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
         params.tensor_in_cols, params.depth, params.out_height,
@@ -1299,7 +1326,7 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
         output->flat<T>().data(),
         reinterpret_cast<int64*>(argmax->flat<int64>().data()),
-        context->eigen_gpu_device(), propagate_nans);
+        context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
@@ -1311,7 +1338,7 @@ template <typename T>
 struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& grad_in, const Tensor& argmax,
-                     Tensor* grad_out) {
+                     Tensor* grad_out, const bool include_batch_in_index) {
     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                            params.tensor_in_cols * params.depth;
     const int output_size = params.tensor_in_batch * params.out_height *
@@ -1322,7 +1349,8 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
     bool status = functor::MaxPoolBackwardWithArgmax<T>()(
         output_size, input_size, grad_in.flat<T>().data(),
         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
-        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
+        include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
@@ -1334,7 +1362,7 @@ template <typename T>
 struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& grad_in, const Tensor& argmax,
-                     Tensor* grad_out) {
+                     Tensor* grad_out, const bool include_batch_in_index) {
     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                            params.tensor_in_cols * params.depth;
     const int output_size = params.tensor_in_batch * params.out_height *
@@ -1346,7 +1374,8 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
     bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
         output_size, input_size, grad_in.flat<T>().data(),
         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
-        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
+        include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
@@ -1427,32 +1456,32 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
 // default Eigen implementation so we are using the custom kernel as the
 // default. However, you can explicitly invoke the eigen version using
 // kernel_label_map.
-#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                            \
-  REGISTER_KERNEL_BUILDER(Name("MaxPool")                            \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .Label("eigen_tensor"),                \
-                          MaxPoolingOp<GPUDevice, T>);               \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
-                              .Device(DEVICE_GPU)                    \
-                              .HostMemory("ksize")                   \
-                              .HostMemory("strides")                 \
-                              .TypeConstraint<T>("T")                \
-                              .Label("eigen_tensor"),                \
-                          MaxPoolingV2Op<GPUDevice, T>);             \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
-      MaxPoolingNoMaskOp<GPUDevice, T>);                             \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
-                              .Device(DEVICE_GPU)                    \
-                              .HostMemory("ksize")                   \
-                              .HostMemory("strides")                 \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
+#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
+  REGISTER_KERNEL_BUILDER(Name("MaxPool")                        \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<T>("T")            \
+                              .Label("eigen_tensor"),            \
+                          MaxPoolingOp<GPUDevice, T>);           \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
+                              .Device(DEVICE_GPU)                \
+                              .HostMemory("ksize")               \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<T>("T")            \
+                              .Label("eigen_tensor"),            \
+                          MaxPoolingV2Op<GPUDevice, T>);         \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
+                              .Device(DEVICE_GPU)                \
+                              .HostMemory("ksize")               \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<T>("T"),           \
+                          MaxPoolingNoMaskV2Op<GPUDevice, T>);   \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")      \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<T>("T")            \
+                              .TypeConstraint<int64>("Targmax"), \
                           MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f28811ffa4d740e6733b33189a0228bea2428b19..1309ce70e8946213ed5a133252fbb879ca8eb4e4 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -54,6 +54,8 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
 //         int form, keeping track of the flattened index of the input item that
 //         produces the max output. If a nullptr is passed in for mask, no mask
 //         will be produced.
+//     include_batch_in_index: whether to include batch dimension in flattened
+//         index of `argmax`.
 //
 // To call the forward and backward functions, use e.g.:
 // const int kThreadsPerBlock = 1024
@@ -61,14 +63,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
 // MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 //                      kThreadsPerBlock, 0, cuda_stream>>>(...);
 template <bool propagate_nans, typename dtype>
-__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
-                                   const int channels, const int height,
-                                   const int width, const int pooled_height,
-                                   const int pooled_width, const int kernel_h,
-                                   const int kernel_w, const int stride_h,
-                                   const int stride_w, const int pad_t,
-                                   const int pad_l, dtype* top_data,
-                                   int64* mask) {
+__global__ void MaxPoolForwardNCHW(
+    const int nthreads, const dtype* bottom_data, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    dtype* top_data, int64* mask, const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -82,12 +82,13 @@ __global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
     wstart = max(wstart, 0);
     dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
-    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
+    const int offset = n * channels * height * width;
+    const dtype* bottom_data_n = bottom_data + offset;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = c * height * width + h * width + w;
         if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
-          maxidx = idx;
+          maxidx = include_batch_in_index ? idx + offset : idx;
           maxval = bottom_data_n[idx];
         }
       }
@@ -136,14 +137,12 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
 }
 
 template <bool propagate_nans, typename dtype>
-__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
-                                   const int height, const int width,
-                                   const int channels, const int pooled_height,
-                                   const int pooled_width, const int kernel_h,
-                                   const int kernel_w, const int stride_h,
-                                   const int stride_w, const int pad_t,
-                                   const int pad_l, dtype* top_data,
-                                   int64* mask) {
+__global__ void MaxPoolForwardNHWC(
+    const int nthreads, const dtype* bottom_data, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    dtype* top_data, int64* mask, const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -158,12 +157,13 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
     wstart = max(wstart, 0);
     dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
-    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+    const int offset = n * height * width * channels;
+    const dtype* bottom_data_n = bottom_data + offset;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = (h * width + w) * channels + c;
         if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
-          maxidx = idx;
+          maxidx = include_batch_in_index ? idx + offset : idx;
           maxval = bottom_data_n[idx];
         }
       }
@@ -231,17 +231,20 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 //     bottom_offset: the pre-computed per-image offset of the maxpool input.
 //         This is equal to H*W*C.
 //     bottom_diff: the gradient with respect to the input.
+//     include_batch_in_index: whether to include batch dimension in flattened
+//         index of `argmax`.
 // This function relies on CudaAtomicAdd to avoid race conditions. Also, before
 // the kernel is run, you will need to make sure that bottom_diff is filled with
 // zero first.
 template <typename dtype>
 __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                 const int64* mask, const int top_offset,
-                                const int bottom_offset, dtype* bottom_diff) {
+                                const int bottom_offset, dtype* bottom_diff,
+                                const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int image_id = (index / top_offset);
-    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
-                  top_diff[index]);
+    const int offset =
+        include_batch_in_index ? 0 : (index / top_offset) * bottom_offset;
+    CudaAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
   }
 }
 
@@ -358,14 +361,17 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
 //     bottom_offset: the pre-computed per-image offset of the maxpool output.
 //         This is equal to Hout*Wout*C.
 //     bottom_diff: the gradient of the gradient w.r.t. output.
+//     include_batch_in_index: whether to include batch dimension in flattened
+//         index of `argmax`.
 template <typename dtype>
 __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                     const int64* mask, const int top_offset,
-                                    const int bottom_offset,
-                                    dtype* bottom_diff) {
+                                    const int bottom_offset, dtype* bottom_diff,
+                                    const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int image_id = (index / bottom_offset);
-    bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
+    const int offset =
+        include_batch_in_index ? 0 : (index / bottom_offset) * top_offset;
+    bottom_diff[index] = top_diff[offset + mask[index]];
   }
 }
 
@@ -399,7 +405,8 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
     const int channels, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_t, const int pad_l, T* top_data,
-    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
+    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans,
+    const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
   if (output_size == 0) return true;
@@ -409,14 +416,14 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
            kThreadsPerBlock, 0, d.stream()>>>(
             output_size, bottom_data, height, width, channels, pooled_height,
             pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-            top_data, mask);
+            top_data, mask, include_batch_in_index);
   } else {
     MaxPoolForwardNHWC<false>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(
             output_size, bottom_data, height, width, channels, pooled_height,
             pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-            top_data, mask);
+            top_data, mask, include_batch_in_index);
   }
   return d.ok();
 }
@@ -449,14 +456,16 @@ template <typename T>
 bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
-    T* bottom_diff, const Eigen::GpuDevice& d) {
+    T* bottom_diff, const Eigen::GpuDevice& d,
+    const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
   if (input_size == 0) return true;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff,
+      include_batch_in_index);
   return d.ok();
 }
 
@@ -492,12 +501,14 @@ template <typename T>
 bool MaxPoolGradBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
-    T* bottom_diff, const Eigen::GpuDevice& d) {
+    T* bottom_diff, const Eigen::GpuDevice& d,
+    const bool include_batch_in_index) {
   if (input_size == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
   MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                         d.stream()>>>(output_size, top_diff, mask, top_offset,
-                                      bottom_offset, bottom_diff);
+                                      bottom_offset, bottom_diff,
+                                      include_batch_in_index);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 38ebb34248012976346b5f25472a75dfe5575aa3..c18c48915079eeb93333ae4387b762d1146ea82c 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -39,7 +39,8 @@ struct MaxPoolForwardWithOptionalArgmax {
                   const int pooled_width, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_t, const int pad_l, T* top_data, int64* mask,
-                  const Eigen::GpuDevice& d, bool propagate_nans);
+                  const Eigen::GpuDevice& d, bool propagate_nans,
+                  const bool include_batch_in_index);
 };
 
 struct MaxPoolForwardNoMask_NCHW_VECT_C {
@@ -56,7 +57,7 @@ struct MaxPoolBackwardWithArgmax {
   bool operator()(const int output_size, const int input_size,
                   const T* top_diff, const int64* mask, const int top_offset,
                   const int bottom_offset, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const Eigen::GpuDevice& d, const bool include_batch_in_index);
 };
 
 template <typename T>
@@ -74,7 +75,7 @@ struct MaxPoolGradBackwardWithArgmax {
   bool operator()(const int output_size, const int input_size,
                   const T* top_diff, const int64* mask, const int top_offset,
                   const int bottom_offset, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const Eigen::GpuDevice& d, const bool include_batch_in_index);
 };
 
 template <typename T>
diff --git a/tensorflow/core/kernels/mfcc_dct_test.cc b/tensorflow/core/kernels/mfcc_dct_test.cc
index 7526278fe9e9c324025af5bbe48eb09c57b62206..70158648976fabbe2da24fab97a049e4f6449707 100644
--- a/tensorflow/core/kernels/mfcc_dct_test.cc
+++ b/tensorflow/core/kernels/mfcc_dct_test.cc
@@ -44,11 +44,8 @@ TEST(MfccDctTest, AgreesWithMatlab) {
 TEST(MfccDctTest, InitializeFailsOnInvalidInput) {
   MfccDct dct1;
   EXPECT_FALSE(dct1.Initialize(-50, 1));
-  MfccDct dct2;
   EXPECT_FALSE(dct1.Initialize(10, -4));
-  MfccDct dct3;
   EXPECT_FALSE(dct1.Initialize(-1, -1));
-  MfccDct dct4;
   EXPECT_FALSE(dct1.Initialize(20, 21));
 }
 
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
index 3db3b51e8b665f6e28ccb2bf8f3850785c7561fb..2c22fec2b114887ba6259568739d8ba67492f397 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -196,7 +196,7 @@ void MfccMelFilterbank::Compute(const std::vector<double> &input,
 }
 
 double MfccMelFilterbank::FreqToMel(double freq) const {
-  return 1127.0 * log(1.0 + (freq / 700.0));
+  return 1127.0 * log1p(freq / 700.0);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 8eb334f2b497ea2c7d2d10d3007d30ed5a8adb5e..c3127ade06bddbce602f6b5c254f16320a980205 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -101,7 +101,7 @@ class MklAddNOp : public OpKernel {
         }
       }
 
-      std::vector<double> coeff(2, 1.0);
+      const std::vector<float> coeff(2, 1.0f);
       MklDnnData<T> src1(&cpu_engine);
       MklDnnData<T> src2(&cpu_engine);
       MklDnnData<T> dst(&cpu_engine);
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index f0278caee6b95269b77185d409de67a7441c5ff3..2398269e0b1c29bfe7cc4a5d5ff6c6ef2d5b3518 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -18,19 +18,21 @@ limitations under the License.
 
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/concat_lib_cpu.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::concat;
 using mkldnn::stream;
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -226,8 +228,50 @@ class MklConcatOp : public OpKernel {
       // format and avoid calling eigen version.
       if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
 
+      OpInputList input_mins, input_maxes;
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        // MKL-DNN concat does not support input tensors that have different
+        // ranges. Check if the ranges of the all input tensors are the same.
+        // If not, forward it to Eigen implementation.
+
+        OP_REQUIRES_OK(context, context->input_list("input_mins", &input_mins));
+        OP_REQUIRES(context, (input_mins.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected mins input list length ",
+                        input_mins.size(), " to equal values length ", N));
+
+        OP_REQUIRES_OK(context,
+                       context->input_list("input_maxes", &input_maxes));
+        OP_REQUIRES(context, (input_maxes.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected maxes input list length ",
+                        input_maxes.size(), " to equal values length ", N));
+        float input_min = input_mins[0].flat<float>()(0);
+        float input_max = input_maxes[0].flat<float>()(0);
+        const float eps = 1.0e-6;
+        for (int i = 1; i < N; ++i) {
+          float min = input_mins[i].flat<float>()(0);
+          float max = input_maxes[i].flat<float>()(0);
+
+          if (fabs(input_min - min) > eps || fabs(input_max - max) > eps) {
+            invoke_eigen = true;
+            break;
+          }
+        }
+      }
+
       // Call Eigen library
       if (invoke_eigen) {
+        // MKL-DNN quantized concat does not support input tensors with
+        // different ranges.
+        // TODO (mabuzain): Add quantized version of CallEigen() to support
+        // this case.
+        OP_REQUIRES(
+            context,
+            (!std::is_same<T, qint8>::value && !std::is_same<T, quint8>::value),
+            errors::Unimplemented("MKL DNN quantized concat does not "
+                                  "support input tensors that have "
+                                  "different ranges"));
         CallEigenVersion(context, input_tensors, mkl_input_shapes);
         return;
       }
@@ -374,6 +418,23 @@ class MklConcatOp : public OpKernel {
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
+
+      // For quantized concat, min and max outputs are also computed.
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        // All input tensors should have the same range, just use the
+        // first one
+        output_min->flat<float>()(0) = input_mins[0].flat<float>()(0);
+        output_max->flat<float>()(0) = input_maxes[0].flat<float>()(0);
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -423,7 +484,7 @@ class MklConcatOp : public OpKernel {
         output_tensor->flat<uint8>().size() * sizeof(uint8));
   }
 
-  // This method finds the most commom format accross all MKL inputs
+  // This method finds the most common format across all MKL inputs
   // Inputs:
   //   1. input_shapes: shapes of input (MKL) tensors.
   //   2. concat_dim: concat dimension.
@@ -490,6 +551,20 @@ class MklConcatOp : public OpKernel {
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, quint8, NAME_IS_AXIS>)
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, qint8, NAME_IS_AXIS>)
+
 #undef REGISTER_CONCAT_MKL
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index bc9d45abe68a91a683d026450c28a4eb59c6ec3a..47b2a43ed9212f5a58cdaa07b15f8aec44ee7b0f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -357,12 +357,12 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T, bool bias_enabled>
+template <typename Device, class T, bool bias_enabled, bool is_depthwise>
 class MklConvCustomBackpropFilterOp
-    : public MklConvBackpropCommonOp<Device, T> {
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropFilterOp() {}
 
@@ -432,7 +432,7 @@ class MklConvCustomBackpropFilterOp
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
           &strides, &dilations, &fwd_dst_dims_tf_order, &fwd_dst_dims,
-          &padding_left, &padding_right, false);
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
       auto tf_fmt = is_conv2d
@@ -485,13 +485,37 @@ class MklConvCustomBackpropFilterOp
       diff_filter_mkl_shape.SetMklTensor(false);
 
       if (is_conv2d) {
-        // Conv2D: output_dims_mkl_order is in OIHW format.
-        TensorShape diff_filter_tf_shape({bwd_output_dims[MklDnnDims::Dim_H],
-                                          bwd_output_dims[MklDnnDims::Dim_W],
-                                          bwd_output_dims[MklDnnDims::Dim_I],
-                                          bwd_output_dims[MklDnnDims::Dim_O]});
-        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
-                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+        if (!is_depthwise) {
+          // Conv2D: output_dims_mkl_order is in OIHW format.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnDims::Dim_H],
+               bwd_output_dims[MklDnnDims::Dim_W],
+               bwd_output_dims[MklDnnDims::Dim_I],
+               bwd_output_dims[MklDnnDims::Dim_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        } else {
+          // Depthwise Conv2d: bwd_output_dims is GOIHW format
+          //                  | TensorFlow       | MKLDNN
+          // ----------------------------------------------------------------
+          // filter_out_depth | depth_multiplier | depth_multiplier *
+          //                  |                  | group_count
+          // ----------------------------------------------------------------
+          // filter_in_depth  | in_depth         | in_depth / group_count
+          // For depthwise convolution, we have group_count == in_depth.
+          // So here G = original I, and I = 1.
+          // And the GOIHW is mkldnn format, here we try to extract the TF
+          // format, TF format is HWIO, as G = original I, so here is HWGO.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_H],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_W],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_G],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        }
       } else {
         // Conv3D: output_dims_mkl_order is in OIDHW format.
         TensorShape diff_filter_tf_shape(
@@ -628,10 +652,12 @@ class MklConvCustomBackpropFilterOp
   }
 
   // Output layout is Tensorflow's filter layout
-  //   Conv2D: HWIO;  Conv3D: DHWIO
+  //   Conv2D: HWIO;  Conv3D: DHWIO; Depthwise Conv: HWIGO
   memory::format GetOutputFormat(const memory::format data_format) {
-    return (this->strides_.size() == 4) ? memory::format::hwio
-                                        : memory::format::dhwio;
+    return is_depthwise
+               ? memory::format::hwigo
+               : ((this->strides_.size() == 4) ? memory::format::hwio
+                                               : memory::format::dhwio);
   }
 
   // Allocate output tensor.
@@ -673,27 +699,36 @@ class MklConvCustomBackpropFilterOp
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")                     \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")             \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, true>);  \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")       \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklDummyOp<CPUDevice, T>);                           \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropFilterV2")                   \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);        \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, true, false>);         \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklDepthwiseConv2dNativeBackpropFilter")                    \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);                     \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv3DBackpropFilterV2")                                 \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index b5be87ec552c28b1d08afb7298abaca398a90885..4e955df5fe9e551ec9aadc21b466dc3810784760 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -295,11 +295,12 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T>
-class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
+template <typename Device, class T, bool is_depthwise>
+class MklConvCustomBackpropInputOp
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropInputOp() {}
 
@@ -367,7 +368,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
           &strides, &dilations, &fwd_output_dims_tf_order, &fwd_output_dims,
-          &padding_left, &padding_right, false);
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
@@ -383,9 +384,11 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       auto fwd_filter_md =
           filter_mkl_shape.IsMklTensor()
               ? filter_mkl_shape.GetMklLayout()
-              : memory::desc(
-                    fwd_filter_dims, MklDnnType<T>(),
-                    is_conv2d ? memory::format::hwio : memory::format::dhwio);
+              : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                             is_depthwise
+                                 ? memory::hwigo
+                                 : (is_conv2d ? memory::format::hwio
+                                              : memory::format::dhwio));
 
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
@@ -554,18 +557,22 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
   }
 };
 
-#define REGISTER_MKL_CPU_KERNELS(T)                                    \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")              \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")            \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>);
-
+#define REGISTER_MKL_CPU_KERNELS(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")                     \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")                   \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklDepthwiseConv2dNativeBackpropInput")      \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, true>);
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 8c585fb48ca15ea3311b5fd9b928095acab12336..0354f725f3cc50ca3fb3cd7aa34ec842a3476b48 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -21,13 +21,14 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/mkl_conv_ops.h"
 #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -91,8 +92,11 @@ struct MklConvFwdParams {
         padding_left(padding_left),
         padding_right(padding_right) {}
 };
+
+typedef mkldnn::convolution_forward::primitive_desc ConvFwdPd;
+
 // With quantization, input, filter, and output can have different types
-// so we use differnt template parameter for each type
+// so we use different template parameter for each type
 template <typename T, typename Tinput, typename Tfilter, typename Tbias,
           typename Toutput>
 class MklConvFwdPrimitive : public MklPrimitive {
@@ -100,7 +104,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
   explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
       : cpu_engine_(engine::cpu, 0) {
     context_.fwd_stream.reset(new stream(stream::kind::eager));
-    // create conv primitive
+    // Create conv primitive
     if (context_.conv_fwd == nullptr) {
       Setup(convFwdDims);
     }
@@ -125,7 +129,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
         static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
-    // after exec, set data handle back
+    // After exec, set data handle back
     context_.src_mem->set_data_handle(DummyData);
     context_.filter_mem->set_data_handle(DummyData);
     context_.bias_mem->set_data_handle(DummyData);
@@ -148,7 +152,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
         static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
-    // after execution, set data handle back
+    // After execution, set data handle back
     context_.src_mem->set_data_handle(DummyData);
     context_.filter_mem->set_data_handle(DummyData);
     context_.dst_mem->set_data_handle(DummyData);
@@ -158,15 +162,14 @@ class MklConvFwdPrimitive : public MklPrimitive {
 
   memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; }
 
-  std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
-  GetPrimitiveDesc() const {
+  std::shared_ptr<ConvFwdPd> GetPrimitiveDesc() const {
     return context_.fwd_pd;
   }
 
  private:
   // Primitive reuse context for Conv2D Fwd op
   struct ConvFwdContext {
-    // expected memory format for this primitive instance
+    // Expected memory format for this primitive instance
     memory::format src_fmt;
     memory::format filter_fmt;
 
@@ -176,17 +179,17 @@ class MklConvFwdPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::memory> bias_mem;
     std::shared_ptr<mkldnn::memory> dst_mem;
 
-    // desc & prmitive desc
+    // Desc & prmitive desc
     std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc;
 
-    // memory desc
+    // Memory desc
     std::shared_ptr<mkldnn::memory::desc> src_md;
     std::shared_ptr<mkldnn::memory::desc> filter_md;
     std::shared_ptr<mkldnn::memory::desc> bias_md;
     std::shared_ptr<mkldnn::memory::desc> dst_md;
 
-    // convolution primitive
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd;
+    // Convolution primitive
+    std::shared_ptr<ConvFwdPd> fwd_pd;
     std::shared_ptr<mkldnn::primitive> conv_fwd;
 
     std::shared_ptr<mkldnn::stream> fwd_stream;
@@ -209,7 +212,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
   };
 
   void Setup(const MklConvFwdParams& convFwdDims) {
-    // create memory descriptors for convolution data w/ no specified format
+    // Create memory descriptors for convolution data w/ no specified format
     context_.src_md.reset(new memory::desc(
         {convFwdDims.src_dims}, MklDnnType<Tinput>(), memory::format::any));
 
@@ -223,7 +226,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
       context_.bias_md.reset(new memory::desc(
           {convFwdDims.bias_dims}, MklDnnType<Tbias>(), memory::format::any));
 
-    // create a convolution
+    // Create a convolution
     if (!convFwdDims.bias_dims.empty()) {
       context_.fwd_desc.reset(new convolution_forward::desc(
           prop_kind::forward, convolution_direct, *context_.src_md,
@@ -238,8 +241,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
           convFwdDims.padding_right, padding_kind::zero));
     }
 
-    context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-        *context_.fwd_desc, cpu_engine_));
+    context_.fwd_pd.reset(new ConvFwdPd(*context_.fwd_desc, cpu_engine_));
 
     // Check if there is any fusions as post-ops
     auto const& post_op_params = convFwdDims.post_op_params;
@@ -270,21 +272,20 @@ class MklConvFwdPrimitive : public MklPrimitive {
         }
       }
       post_ops_attr.set_post_ops(post_ops);
-      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-          *context_.fwd_desc, post_ops_attr, cpu_engine_));
+      context_.fwd_pd.reset(
+          new ConvFwdPd(*context_.fwd_desc, post_ops_attr, cpu_engine_));
     } else {
-      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-          *context_.fwd_desc, cpu_engine_));
+      context_.fwd_pd.reset(new ConvFwdPd(*context_.fwd_desc, cpu_engine_));
     }
 
-    // store the expected memory format
+    // Store the expected memory format
     context_.src_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->src_primitive_desc().desc().data.format);
 
     context_.filter_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->weights_primitive_desc().desc().data.format);
 
-    // create memory primitive based on dummy data
+    // Create memory primitive based on dummy data
     context_.src_mem.reset(
         new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData));
     context_.filter_mem.reset(
@@ -292,7 +293,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
     context_.dst_mem.reset(
         new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData));
 
-    // create convolution primitive and add it to net
+    // Create convolution primitive and add it to net
     if (!convFwdDims.bias_dims.empty()) {
       context_.bias_mem.reset(new memory(
           {{{convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::x},
@@ -323,11 +324,12 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
       const MklConvFwdParams& convFwdDims, bool do_not_cache) {
     MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* conv_fwd = nullptr;
 
-    if (do_not_cache) { /* Always create new primitive */
+    if (do_not_cache) {
+      // Always create a new primitive
       conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
           convFwdDims);
     } else {
-      // try to find a suitable one in pool
+      // Try to find a suitable one in pool
       conv_fwd = dynamic_cast<
           MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>*>(
           MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
@@ -464,7 +466,7 @@ class MklConvOp : public OpKernel {
                 errors::InvalidArgument("filter must be 4-dimensional: ",
                                         filter.shape().DebugString()));
 
-    for (int i = 0; i < 3; i++) {
+    for (int i = 0; i < 3; ++i) {
       OP_REQUIRES(
           context,
           FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
@@ -858,6 +860,9 @@ class MklConvOp : public OpKernel {
 
   explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    if (context->HasAttr("padding_list")) {
+      OP_REQUIRES_OK(context, context->GetAttr("padding_list", &padding_list_));
+    }
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -874,6 +879,9 @@ class MklConvOp : public OpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    is_filter_const_ = false;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("is_filter_const", &is_filter_const_));
 
     if (strides_.size() == 4) {
       OP_REQUIRES(context, dilations_.size() == 4,
@@ -915,6 +923,10 @@ class MklConvOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
 
+      // Data from persistent (cached) filter tensor
+      const Tensor& cached_filter_data_tensor =
+          *cached_filter_data_ptensor_.AccessTensor(context);
+
       MklDnnShape src_mkl_shape, filter_mkl_shape;
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
@@ -929,9 +941,19 @@ class MklConvOp : public OpKernel {
           dilations, strides;
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
-      // If pad with conv2d fusion is enabled
-      if (pad_enabled) {
-        PadWithConvFusion(context, padding_left, padding_right);
+      // For Quantized-Conv2D and Pad fusion, we get padding from the
+      // `padding_list` attribute. Otherwise, we get it from one of the inputs.
+      bool quantized_pad_enabled = false;
+      for (auto const& padding_val : padding_list_) {
+        if (padding_val) {
+          quantized_pad_enabled = true;
+          break;
+        }
+      }
+
+      if (fuse_pad_ || quantized_pad_enabled) {
+        PadWithConvFusion(context, padding_left, padding_right,
+                          quantized_pad_enabled);
       }
 
       // Get shapes of input tensors in MKL-DNN order
@@ -942,7 +964,8 @@ class MklConvOp : public OpKernel {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
           &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
-          &padding_right, pad_enabled, is_depthwise);
+          &padding_right, (fuse_pad_ || quantized_pad_enabled), is_depthwise);
+
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -956,11 +979,9 @@ class MklConvOp : public OpKernel {
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor,
                                   src_tf_shape, dst_mkl_shape);
 
-        // MklConv2D/3D also outputs converted filter
-        // as 2nd output of Conv2D/3D.
+        // MklConv2D/3D also outputs converted filter as 2nd output.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
-        // MklConv2D also outputs converted filter as 2nd output.
         if (typeid(Tinput) == typeid(float) &&
             typeid(Tfilter) == typeid(float) &&
             typeid(Toutput) == typeid(float)) {
@@ -974,6 +995,12 @@ class MklConvOp : public OpKernel {
 
       bool is_conv2d = (strides_.size() == 4);
 
+      if (!is_conv2d) {
+        OP_REQUIRES(
+            context, !pad_enabled,
+            errors::InvalidArgument("Pad + Conv fusion only works for 2D"));
+      }
+
       // TODO 3-D support for Depthwise is not there
       if (is_depthwise) {
         OP_REQUIRES(context, is_conv2d,
@@ -984,7 +1011,7 @@ class MklConvOp : public OpKernel {
       // TODO(Intel-tf) Add check to make sure pad_enabled is true only for 2D
       if (!is_conv2d) {
         OP_REQUIRES(
-            context, !pad_enabled,
+            context, !fuse_pad_,
             errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
       }
       // Create memory for user data.
@@ -993,10 +1020,11 @@ class MklConvOp : public OpKernel {
       auto tf_fmt = is_conv2d ? TFDataFormatToMklDnnDataFormat(data_format_)
                               : TFDataFormatToMklDnn3DDataFormat(data_format_);
 
-      // If input is in MKL layout, then simply grab input layout; otherwise,
-      // construct input Tf layout. For TF layout, although input shape
-      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
-      // layout depending on data format:
+      // If input is in MKL layout, then simply grab the layout; otherwise,
+      // construct TF layout for input.
+      // For constructing TF layout for input, although input shape (src_dims)
+      // is required to be in MKL-DNN order, the input layout is actually in
+      // TF layout depending on the data format:
       //     Conv2D: NHWC or NCHW
       //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
@@ -1005,8 +1033,8 @@ class MklConvOp : public OpKernel {
       src.SetUsrMem(src_md, &src_tensor);
 
       // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO) and (HWIGO)for depthwise/group
-      // convolutions
+      // the layout is Tensorflow's layout (HWIO) and (HWIGO) for
+      // depthwise/group convolutions.
 
       auto filter_format = is_conv2d ? (is_depthwise ? memory::format::hwigo
                                                      : memory::format::hwio)
@@ -1018,54 +1046,42 @@ class MklConvOp : public OpKernel {
               ? filter_mkl_shape.GetMklLayout()
               : memory::desc(filter_dims, MklDnnType<Tfilter>(), filter_format);
       filter.SetUsrMem(filter_md, &filter_tensor);
-      // MKLDNN dilation starts from 0.
-      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      // In some cases, primitve descriptor includes potentialy large buffers,
-      // we don't cache those primitves if the env variable
-      // TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE is true. MKL DNN allocates buffers
-      // in the following cases
+      // MKLDNN dilations start from 0.
+      for (int i = 0; i < dilations.size(); ++i) --dilations[i];
+
+      // In some cases, primitive descriptor could potentially contain
+      // large buffers. As a result, we don't cache these primitives if the
+      // environment variable `TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE` is set to True.
+      // MKL-DNN allocates buffers in the following cases:
       //   1. Legacy CPU without AVX512/AVX2, or
-      //   2. 1x1 convolution with stride != 1
+      //   2. 1x1 convolution with strides != 1
       bool do_not_cache =
           MklPrimitiveFactory<Tinput>::IsPrimitiveMemOptEnabled() &&
           (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
           (MklPrimitiveFactory<Tinput>::IsLegacyPlatform() ||
            IsConv1x1StrideNot1(filter_dims, strides));
 
-      // get a conv2d fwd from primitive pool
+      // Get a conv2d fwd from primitive pool
       MklConvFwdPrimitive<float, Tinput, Tfilter, Tbias, Ttemp_output>*
           conv_fwd = nullptr;
+      memory::dims bias_dims = {};
       if (fuse_biasadd_) {
-        memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
-        MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
-                                     dst_dims_mkl_order, strides, dilations,
-                                     padding_left, padding_right);
-
-        // TODO(mdfaijul):  Extend the basic parameters for data types and
-        // fusions
-        this->ExtendConvFwdParams(context, convFwdDims);
-
-        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
-                                              Ttemp_output>::Get(convFwdDims,
-                                                                 do_not_cache);
-      } else {
-        MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
-                                     dst_dims_mkl_order, strides, dilations,
-                                     padding_left, padding_right);
+      }
+      MklConvFwdParams convFwdDims(
+          src_dims, filter_dims, fuse_biasadd_ ? bias_dims : NONE_DIMS,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
 
-        // Extend the basic parameters for data types and fusions
-        this->ExtendConvFwdParams(context, convFwdDims);
+      // TODO(mdfaijul): Extend the basic parameters for data types and fusions
+      this->ExtendConvFwdParams(context, convFwdDims);
 
-        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
-                                              Ttemp_output>::Get(convFwdDims,
-                                                                 do_not_cache);
-      }
+      conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                            Ttemp_output>::Get(convFwdDims,
+                                                               do_not_cache);
 
-      // allocate output tensors output_tensor and filter_out_tensor
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
-          conv_fwd->GetPrimitiveDesc();
+      // Allocate output tensors `output_tensor` and `filter_out_tensor`
+      std::shared_ptr<ConvFwdPd> conv_fwd_pd = conv_fwd->GetPrimitiveDesc();
       AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt,
                            &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
@@ -1079,9 +1095,10 @@ class MklConvOp : public OpKernel {
       Ttemp_output* dst_data =
           reinterpret_cast<Ttemp_output*>(dst_tensor->flat<Toutput>().data());
 
-      // check whether src/filter need reorder
+      // Check whether src and filter need to be reordered
       Tinput* src_data = nullptr;
       if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
+        // Reorder src
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
         src_data = static_cast<Tinput*>(src.GetOpMem().get_data_handle());
@@ -1089,25 +1106,43 @@ class MklConvOp : public OpKernel {
         src_data = static_cast<Tinput*>(
             const_cast<Tinput*>(src_tensor.flat<Tinput>().data()));
       }
+
       Tfilter* filter_data = nullptr;
       if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
-        filter.SetUsrMem(filter_md, &filter_tensor);
-        if (filter_out_tensor == nullptr) {
-          filter.CheckReorderToOpMem(
-              conv_fwd_pd.get()->weights_primitive_desc());
-        } else {
-          filter.CheckReorderToOpMem(
-              conv_fwd_pd.get()->weights_primitive_desc(),
-              filter.GetTensorBuffer(filter_out_tensor));
+        bool is_filter_cached = false;
+        // If filter is a constant, we can avoid the conversion of filter from
+        // Tensorflow format to MKL format by caching the filter when it is
+        // converted for the first time. This cached filter can then be reused
+        // in subsequent iterations.
+        if (is_filter_const_) {
+          if (IsFilterCacheEmpty(context)) {
+            // Cache filter if it is not already cached.
+            CacheFilter(context, conv_fwd_pd, filter_data, filter_tensor,
+                        filter, filter_md);
+          }
+          filter_data =
+              GetCachedFilter(context, conv_fwd->GetFilterMemoryFormat());
+          is_filter_cached = (filter_data != nullptr);
+        }
+        if (!is_filter_cached) {
+          filter.SetUsrMem(filter_md, &filter_tensor);
+          if (filter_out_tensor == nullptr) {
+            filter.CheckReorderToOpMem(
+                conv_fwd_pd.get()->weights_primitive_desc());
+          } else {
+            filter.CheckReorderToOpMem(
+                conv_fwd_pd.get()->weights_primitive_desc(),
+                filter.GetTensorBuffer(filter_out_tensor));
+          }
+          filter_data =
+              static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
         }
-        filter_data =
-            static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
       } else {
         filter_data = static_cast<Tfilter*>(
             const_cast<Tfilter*>(filter_tensor.flat<Tfilter>().data()));
       }
 
-      // execute convolution
+      // Execute convolution
       if (fuse_biasadd_) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
         Tbias* bias_data =
@@ -1117,7 +1152,7 @@ class MklConvOp : public OpKernel {
         conv_fwd->Execute(src_data, filter_data, dst_data);
       }
 
-      // delete primitive since it is not cached.
+      // Delete primitive since it is not cached.
       if (do_not_cache) delete conv_fwd;
     } catch (mkldnn::error& e) {
       string error_msg = tensorflow::strings::StrCat(
@@ -1130,23 +1165,31 @@ class MklConvOp : public OpKernel {
   }
 
   void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
-                         memory::dims& padding_right) {
-    const Tensor& paddings_tf = MklGetInput(context, 2);
-    OP_REQUIRES(context, paddings_tf.dims() == 2,
-                errors::InvalidArgument("paddings must be 2-dimensional: ",
-                                        paddings_tf.shape().DebugString()));
+                         memory::dims& padding_right,
+                         bool quantized_pad_enabled) {
+    const Tensor& paddings_tf = MklGetInput(context, input_index_pad_);
     Tpadding* paddings = nullptr;
-    // To get individual pad, need to flatten the tensor
-    paddings = static_cast<Tpadding*>(
-        const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
-    // For NHWC format:
-    // paddings[0], paddings[1], paddings[6], paddings[7] should be zero
-    // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ]
-    // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major
-    // then, values are: top = 1, bottom =2, left=3, right=4
-    // For NCHW format:
-    // paddings[0], paddings[1], paddings[2], paddings[3] should be zero
-    // similar explanation as NHWC format will apply.
+    if (quantized_pad_enabled) {
+      paddings = padding_list_.data();
+    } else {
+      OP_REQUIRES(context, paddings_tf.dims() == 2,
+                  errors::InvalidArgument("paddings must be 2-dimensional: ",
+                                          paddings_tf.shape().DebugString()));
+      // Flatten tensor to get individual paddings.
+      paddings = static_cast<Tpadding*>(
+          const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
+    }
+    // If the data format is NHWC, indices 0, 1, 6 and 7 of paddings(_tf)
+    // will be zero.
+    // Example:
+    // paddings_tf = [ [0, 0] [1, 2] [3, 4] [0, 0] ],
+    // flat method = row-major, then:
+    // paddings = {0, 0, 1, 2, 3, 4, 0, 0}.
+    // Hence, the values are: top = 1, bottom = 2, left = 3, right = 4.
+    //
+    // Similarly, if the data format is NCHW, indices 0, 1, 2 and 3 of
+    // paddings(_tf) will be zero.
+    // i.e. for the above example, paddings = {0, 0, 0, 0, 1, 2, 3, 4}.
     int64 pad_top, pad_left;
     int64 pad_bottom, pad_right;
     string data_format = ToString(data_format_);
@@ -1161,7 +1204,7 @@ class MklConvOp : public OpKernel {
       pad_left = paddings[6];
       pad_right = paddings[7];
     }
-    // Create padding arrays for MKL DNN convolutions.
+    // Create padding arrays for MKL-DNN convolutions.
     // MKL-DNN uses asymetric padding.
     padding_left = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
     padding_right = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
@@ -1170,6 +1213,11 @@ class MklConvOp : public OpKernel {
  protected:
   void set_fuse_biasadd(bool fuse_biasadd) { fuse_biasadd_ = fuse_biasadd; }
   void set_fuse_relu(bool fuse_relu) { fuse_relu_ = fuse_relu; }
+  void set_fuse_pad(bool fuse_pad) {
+    fuse_pad_ = fuse_pad;
+    // In PadwithFusedConv OP, pad is the fourth index.
+    input_index_pad_ = 3;
+  }
 
   // This method is for the base class MklConvOp, which handles the
   // floating point implementation of Conv. The quantized conv implementations
@@ -1183,30 +1231,26 @@ class MklConvOp : public OpKernel {
     params.dtypes.append(typeid(Toutput).name());
 
     // Add fusions as post ops
-    // Note: Fusion of BiasAdd is handled directly inside MklConvOp by
-    // checking fuse_biasadd_ flag.
+    // NOTE: Fusion of BiasAdd is handled directly inside MklConvOp by
+    // checking `fuse_biasadd_` flag.
     if (fuse_relu_) params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 
-  virtual Tbias* GetBiasHandle(
-      OpKernelContext* context,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>&
-          conv2d_fwd_pd,
-      const Tensor& bias_tensor) {
+  virtual Tbias* GetBiasHandle(OpKernelContext* context,
+                               std::shared_ptr<ConvFwdPd>& conv2d_fwd_pd,
+                               const Tensor& bias_tensor) {
     if (fuse_biasadd_) {
       return static_cast<Tbias*>(
           const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
-    } else {
-      return nullptr;
     }
+    return nullptr;
   }
 
-  // Allocate output tensor.
-  virtual void AllocateOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& output_dims_mkl_order,
-      memory::format output_tf_format, Tensor** output_tensor) {
+  virtual void AllocateOutputTensor(OpKernelContext* context,
+                                    const ConvFwdPd& conv_prim_desc,
+                                    const memory::dims& output_dims_mkl_order,
+                                    memory::format output_tf_format,
+                                    Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     auto dst_pd = conv_prim_desc.dst_primitive_desc();
 
@@ -1237,23 +1281,54 @@ class MklConvOp : public OpKernel {
  private:
   std::vector<int32> strides_;
   std::vector<int32> dilations_;
+  std::vector<Tpadding> padding_list_;
+  bool is_filter_const_;
+  mutex mu_;
   Padding padding_;
   TensorFormat data_format_;
+  PersistentTensor cached_filter_data_ptensor_ GUARDED_BY(mu_);
+  PersistentTensor cached_filter_md_ptensor_ GUARDED_BY(mu_);
 
   // Initialize to values the template is instantiated with
   bool fuse_biasadd_ = bias_enabled;
   bool fuse_relu_ = false;
+  bool fuse_pad_ = pad_enabled;
+
+  int input_index_pad_ = 2;
 
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
-  const int kInputIndex_Pad = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
 
-  // Allocate filter output tensor.
-  void AllocateFilterOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+  // Allocate persistent tensors for cached filter data and
+  // cached filter memory descriptor (data format)
+  void AllocatePersistentTensor(OpKernelContext* context,
+                                const ConvFwdPd& conv_prim_desc,
+                                Tensor** filter_tensor) {
+    DCHECK(filter_tensor);
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim(
+        (conv_prim_desc.weights_primitive_desc().get_size() / sizeof(Tfilter)));
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                DataTypeToEnum<Tfilter>::value, filter_tf_shape,
+                                &cached_filter_data_ptensor_, filter_tensor));
+
+    Tensor* second_tensor = nullptr;
+    TensorShape filter_mkl_format;
+    filter_mkl_format.AddDim(
+        sizeof(conv_prim_desc.weights_primitive_desc().desc().data.format) /
+        sizeof(DT_INT32));
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                DT_INT32, filter_mkl_format,
+                                &cached_filter_md_ptensor_, &second_tensor));
+    second_tensor->scalar<int32>()() =
+        conv_prim_desc.weights_primitive_desc().desc().data.format;
+  }
+
+  void AllocateFilterOutputTensor(OpKernelContext* context,
+                                  const ConvFwdPd& conv_prim_desc,
+                                  const memory::dims& filter_dims_tf_order,
+                                  Tensor** filter_tensor) {
     CHECK_NOTNULL(filter_tensor);
     auto filter_pd = conv_prim_desc.weights_primitive_desc();
 
@@ -1276,12 +1351,14 @@ class MklConvOp : public OpKernel {
     AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
                               filter_tf_shape, filter_mkl_shape);
   }
+
   // Prepare and execute net - checks for input and output reorders.
-  void PrepareAndExecuteNet(
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      MklDnnData<Tinput>* src, MklDnnData<Tfilter>* filter,
-      MklDnnData<Tbias>* bias, MklDnnData<Toutput>* output,
-      Tensor* filter_out_tensor) {
+  void PrepareAndExecuteNet(const ConvFwdPd& conv_prim_desc,
+                            MklDnnData<Tinput>* src,
+                            MklDnnData<Tfilter>* filter,
+                            MklDnnData<Tbias>* bias,
+                            MklDnnData<Toutput>* output,
+                            Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -1310,18 +1387,79 @@ class MklConvOp : public OpKernel {
 
     stream(stream::kind::eager).submit(net).wait();
   }
+
+  // LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot
+  // be acquired before entering the function, since it is acquired
+  // inside the function.
+  inline bool IsFilterCacheEmpty(OpKernelContext* context) LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock lock(mu_);
+    const Tensor& cached_filter_data_tensor =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+    return (cached_filter_data_tensor.NumElements() == 0);
+  }
+
+  // Cache the converted filter in a persistent tensor.
+  // Only one thread can execute this method at any given time.
+  void CacheFilter(OpKernelContext* context,
+                   const std::shared_ptr<ConvFwdPd>& conv_fwd_pd,
+                   Tfilter* filter_data, const Tensor& filter_tensor,
+                   MklDnnData<Tfilter>& filter, const memory::desc& filter_md)
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    const Tensor& cached_filter_data_tensor =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+
+    // If filter is already cached, there's nothing to do.
+    if (cached_filter_data_tensor.NumElements() > 0) {
+      return;
+    }
+
+    // Otherwise, cache filter
+    filter.SetUsrMem(filter_md, &filter_tensor);
+    filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc());
+    filter_data = static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
+
+    Tensor* filter_tensor_ptr = nullptr;
+    AllocatePersistentTensor(context, *conv_fwd_pd, &filter_tensor_ptr);
+    void* cached_filter_data = filter.GetTensorBuffer(filter_tensor_ptr);
+    size_t cached_filter_data_size =
+        filter.GetOpMem().get_primitive_desc().get_size();
+    memcpy(cached_filter_data, filter_data, cached_filter_data_size);
+  }
+
+  Tfilter* GetCachedFilter(OpKernelContext* context,
+                           const memory::format& filter_mf)
+      LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock lock(mu_);
+    const Tensor& cached_filter_data =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+    const Tensor& cached_filter_md =
+        *cached_filter_md_ptensor_.AccessTensor(context);
+
+    // Check if the memory descriptor of the cached weights is same as
+    // filter_mf. If so, we can used the cached weights; otherwise
+    // return NULL.
+    // TODO (bhavanis): Do we need to cast filter_mf before the check?
+    if (cached_filter_md.scalar<int32>().size() &&
+        cached_filter_md.scalar<int32>()() == filter_mf) {
+      return static_cast<Tfilter*>(
+          const_cast<Tfilter*>(cached_filter_data.flat<Tfilter>().data()));
+    }
+    return nullptr;
+  }
 };
 
 // Base class for fused convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
-          typename Toutput, typename Ttemp_output>
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool pad_enabled>
 class MklFusedConvOp
     : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
-                       int32, false, false, false> {
+                       Tpadding, false, false, false> {
  public:
   explicit MklFusedConvOp(OpKernelConstruction* context)
-      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output, int32,
-                  false, false, false>(context) {
+      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
+                  Tpadding, false, false, false>(context) {
     // Since we came here through the registration of _MklFusedConv2D, get
     // all information from 'fused_ops' and 'num_args'
     std::vector<string> fused_ops;
@@ -1351,12 +1489,16 @@ class MklFusedConvOp
                   errors::Unimplemented("Fusion is not implemented: [",
                                         str_util::Join(fused_ops, ","), "]"));
     }
+
+    if (pad_enabled) {
+      this->set_fuse_pad(true);
+    }
   }
 
   virtual ~MklFusedConvOp() {}
 };
 
-// We create new class for each verison of Quantized Convolution and inherit
+// We create new class for each version of Quantized Convolution and inherit
 // from the FP32 version of the base class
 template <typename Device, typename Tbias, typename Toutput,
           typename Ttemp_output, bool bias_enabled>
@@ -1378,7 +1520,13 @@ class MklQuantizedConv2DOp
 
   explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
       : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-                  bias_enabled, false, false>(context) {}
+                  bias_enabled, false, false>(context) {
+    bool is_filter_const;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("is_filter_const", &is_filter_const));
+    OP_REQUIRES(context, is_filter_const,
+                errors::InvalidArgument("Filter must be a constant"));
+  }
 
   void Compute(OpKernelContext* context) override {
     // Compute int32 output tensor
@@ -1402,9 +1550,9 @@ class MklQuantizedConv2DOp
     float max_output_value;
     if (std::is_same<Toutput, quint8>::value ||
         std::is_same<Toutput, qint8>::value) {
-      // This is the case the convolution and requantization are fused.
+      // This is the case when convolution and requantization are fused.
       // min_freezed_output and max_freezed_output are the actual range
-      // for the output
+      // of the output.
       min_output_value = context->input(6 + bias_index_offset).flat<float>()(0);
       max_output_value = context->input(7 + bias_index_offset).flat<float>()(0);
     } else {
@@ -1473,10 +1621,9 @@ class MklQuantizedConv2DOp
     }
   }
 
-  Tbias* GetBiasHandle(
-      OpKernelContext* context,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>& conv_fwd_pd,
-      const Tensor& bias_tensor) override {
+  Tbias* GetBiasHandle(OpKernelContext* context,
+                       std::shared_ptr<ConvFwdPd>& conv_fwd_pd,
+                       const Tensor& bias_tensor) override {
     int bias_index_offset;
     bias_index_offset = bias_enabled ? 1 : 0;
 
@@ -1605,12 +1752,11 @@ class MklQuantizedConv2DSumReluOp
     params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 
-  // Allocate output tensor.
-  void AllocateOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& output_dims_mkl_order,
-      memory::format output_tf_format, Tensor** output_tensor) override {
+  void AllocateOutputTensor(OpKernelContext* context,
+                            const ConvFwdPd& conv_prim_desc,
+                            const memory::dims& output_dims_mkl_order,
+                            memory::format output_tf_format,
+                            Tensor** output_tensor) override {
     int summand_idx = context->num_inputs() / 2 - 1;
     float reorder_sum_scale = 1.0;
     if (std::is_same<Toutput, quint8>::value) {
@@ -1625,7 +1771,8 @@ class MklQuantizedConv2DSumReluOp
       auto dst_md = summand_mkl_shape.GetMklLayout();
       if (summand_mkl_shape.IsMklTensor()) {
         if (summand_type == DT_QINT8) {
-          summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape());
+          OP_REQUIRES_OK(context, summand.BitcastFrom(summand, DT_QUINT8,
+                                                      summand.shape()));
           dst_md.data.data_type =
               static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
           summand_mkl_shape.SetMklLayout(&dst_md);
@@ -1696,7 +1843,7 @@ class MklQuantizedConv2DSumReluOp
 };
 
 // INT8 kernel registration
-// Register NoOp kernel for QunatizedConv2D for qint8 filter
+// Register NoOp kernel for QuantizedConv2D for qint8 filter
 REGISTER_KERNEL_BUILDER(Name("QuantizedConv2D")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<quint8>("Tinput")
@@ -1711,7 +1858,7 @@ REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRequantize")
                             .TypeConstraint<qint8>("out_type"),
                         NoOp);
 
-// Register a templatized implementation of MklQuntizedConv2D.
+// Register a templatized implementation of MklQuantizedConv2D.
 REGISTER_KERNEL_BUILDER(
     Name("_MklQuantizedConv2D")
         .Device(DEVICE_CPU)
@@ -1900,17 +2047,40 @@ REGISTER_KERNEL_BUILDER(
         .Device(DEVICE_CPU)
         .TypeConstraint<quint8>("Tinput")
         .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
         .TypeConstraint<quint8>("out_type")
         .Label(mkl_op_registry::kMklQuantizedOpLabel),
     MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, quint8, true>);
+
 REGISTER_KERNEL_BUILDER(
     Name("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
         .Device(DEVICE_CPU)
         .TypeConstraint<quint8>("Tinput")
         .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
         .TypeConstraint<quint8>("out_type")
         .Label(mkl_op_registry::kMklQuantizedOpLabel),
     MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, qint8, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, float, quint8, quint8, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, float, quint8, qint8, true>);
 #endif  // INTEL_MKL_ML
 
 // Register 2D operations
@@ -1965,13 +2135,35 @@ TF_CALL_float(REGISTER_MKL_CPU_2D);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE);
 
+// Note we are registering _MklFusedConv2D.
+// We check the fused_ops attributes to decide if bias is enabled or not.
 #define REGISTER_MKL_CPU_2D_FUSED(T)                                \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedConv2D")                   \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklFusedConv2D")                                       \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, false>);      \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<int32>("Tpaddings")                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, true>);       \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .TypeConstraint<int64>("Tpaddings")                       \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int64, true>);       \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithFusedConv2D")      \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
+                              .TypeConstraint<int32>("Tpaddings")   \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklFusedConvOp<CPUDevice, T, T, T, T, T>);
-// We check the fused_ops attributes to decide if bias is enabled or not.
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D_FUSED);
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index fd279ead9d7ad9f9475b505be49d15e2b01f63fc..c12a4ff0f0c48d5b15c03eb9ee98985930463845 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -21,13 +21,13 @@ limitations under the License.
 #include <vector>
 
 #include "mkldnn.hpp"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -58,7 +58,7 @@ class MklDnnConvUtil {
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
                  Padding pad, TensorFormat fm,
-                 const std::vector<int32>& dilations)
+                 const std::vector<int32>& dilations, bool is_depthwise = false)
       : context_(context),
         strides_(strides),
         dilations_(dilations),
@@ -392,14 +392,24 @@ class MklDnnConvUtil {
     int64 pad_D1, pad_D2;
 
     if (is_conv2d) {
+      Padding padding_type;
+      if (pad_enabled) {
+        padding_type = Padding::EXPLICIT;
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+      } else {
+        padding_type = padding_;
+      }
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_rows, filter_rows, dilation_rows, stride_rows,
-                         padding_, &out_rows, &pad_top, &pad_bottom));
+                         padding_type, &out_rows, &pad_top, &pad_bottom));
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_cols, filter_cols, dilation_cols, stride_cols,
-                         padding_, &out_cols, &pad_left, &pad_right));
+                         padding_type, &out_cols, &pad_left, &pad_right));
     } else {
       OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
                                    input_planes, filter_planes, stride_planes,
@@ -413,25 +423,11 @@ class MklDnnConvUtil {
     }
 
     if (is_conv2d) {
-      // Conv + pad fusion is enabled only for 2D
+      // Conv + pad fusion is enabled only for 2D.
       // If pad_enabled, i.e., pad and conv op are fused, then
       // all pads are already passed from pad op through
-      // *pad_l and *pad_r
-      if (pad_enabled) {
-        pad_top = static_cast<int64>((*pad_l)[0]);
-        pad_left = static_cast<int64>((*pad_l)[1]);
-        pad_bottom = static_cast<int64>((*pad_r)[0]);
-        pad_right = static_cast<int64>((*pad_r)[1]);
-        // update the out_rows and out_cols based on all
-        // sides of the pads coming from pad op.
-        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
-        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
-      }
-      // Handle padding. MKL-DNN uses asymetric padding.
-      // But, if pad_enabled, i.e., pad and conv op are fused,
-      // then, *pad_l and *pad_r are already set from pad op.
-      // In that case they need not set here.
-      else {
+      // *pad_l and *pad_r and they don't need to be set here.
+      if (!pad_enabled) {
         *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
         *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
       }
@@ -550,7 +546,7 @@ class MklDnnConvUtil {
 ///  Common class that implements ConvBackpropFilter and Input
 /////////////////////////////////////////////////////////////////////
 
-template <typename Device, class T>
+template <typename Device, class T, bool is_depthwise>
 class MklConvBackpropCommonOp : public OpKernel {
  public:
   ~MklConvBackpropCommonOp() {}
@@ -563,28 +559,38 @@ class MklConvBackpropCommonOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
 
-    if (strides_.size() == 4) {
-      // Check Conv2D dilations
-      OP_REQUIRES(context, dilations_.size() == 4,
-                  errors::InvalidArgument("Sliding window dilations field must "
-                                          "specify 4 dimensions"));
-      int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-      int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-      int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-      int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-      OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
-                  errors::InvalidArgument(
-                      "Current implementation does not yet support "
-                      "dilations in the batch and depth dimensions."));
-      OP_REQUIRES(
-          context, dilation_h > 0 && dilation_w > 0,
-          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    // Depthwise Convolution doesn't have dilation parameter
+    if (!is_depthwise) {
+      OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+      if (strides_.size() == 4) {
+        // Check Conv2D dilations
+        OP_REQUIRES(
+            context, dilations_.size() == 4,
+            errors::InvalidArgument("Sliding window dilations field must "
+                                    "specify 4 dimensions"));
+        int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+        int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+        int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+        int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+        OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                    errors::InvalidArgument(
+                        "Current implementation does not yet support "
+                        "dilations in the batch and depth dimensions."));
+        OP_REQUIRES(
+            context, dilation_h > 0 && dilation_w > 0,
+            errors::InvalidArgument("Dilated rates should be larger than 0."));
+      }
+    } else {
+      // Set dilations as 1 for depthwise conv
+      // for future support to align with Tensorflow
+      dilations_ = {1, 1, 1, 1};
     }
 
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
index 258cca9332b5b86adbf0bbcb285210552729243e..288515de0bcbb9a940cf3e0c790a308762904482 100644
--- a/tensorflow/core/kernels/mkl_fused_ops_test.cc
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -38,8 +38,12 @@ namespace tensorflow {
 static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
 static const TensorShape dummy_shape({8});
 
+using BiasAddGraphRunner =
+    std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                       const Tensor& bias_data, Tensor* out)>;
+
 template <typename T>
-class ConvMklToTF : public OpsTestBase {
+class CommonTestUtilities : public OpsTestBase {
  public:
   void PerformConversion(DataType dtype, const Tensor& tensor,
                          const Tensor& mkl_meta_tensor, Tensor* output) {
@@ -59,6 +63,23 @@ class ConvMklToTF : public OpsTestBase {
     *output = *GetOutput(0);
   }
 
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  static void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                          Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(tensorflow::SessionOptions()));
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
   void ConvertAndCompare(DataType dtype, const Tensor& tensor,
                          const Tensor& mkl_meta_tensor,
                          const Tensor& expected) {
@@ -67,6 +88,35 @@ class ConvMklToTF : public OpsTestBase {
     test::ExpectTensorNear<T>(expected, output, 1e-5);
   }
   void TestBody() {}
+
+  static void VerifyBiasAddTensorsClose(int depth, int image_width,
+                                        int image_height, int image_batch_count,
+                                        int filter_size, int filter_count,
+                                        const BiasAddGraphRunner& run_default,
+                                        const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectClose(conv_2d, fused_conv_2d);
+  }
 };
 
 // Testing MKL's fused convolution ops
@@ -79,27 +129,6 @@ class MklFusedConv2DOpTest : public OpsTestBase {
   static constexpr int kImageHeight = 32;
   static constexpr int kImageBatchCount = 8;
 
-  using BiasAddGraphRunner =
-      std::function<void(const Tensor& input_data, const Tensor& filter_data,
-                         const Tensor& bias_data, Tensor* out)>;
-
-  // Runs a Tensorflow graph defined by the root scope, and fetches the result
-  // of 'fetch' node into the output Tensor.
-  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
-                   Tensor* output) {
-    tensorflow::GraphDef graph;
-    TF_ASSERT_OK(root.ToGraphDef(&graph));
-
-    std::unique_ptr<tensorflow::Session> session(
-        tensorflow::NewSession(tensorflow::SessionOptions()));
-    TF_ASSERT_OK(session->Create(graph));
-
-    std::vector<Tensor> unfused_tensors;
-    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
-
-    *output = unfused_tensors[0];
-  }
-
   void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
                          const Tensor& bias_data, Tensor* output,
                          int stride = 1) {
@@ -115,7 +144,7 @@ class MklFusedConv2DOpTest : public OpsTestBase {
         root.WithOpName("with_bias"), conv,
         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
 
-    RunAndFetch(root, "with_bias", output);
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
   }
 
   void RunConv2DWithBiasAndRelu(const Tensor& input_data,
@@ -136,7 +165,7 @@ class MklFusedConv2DOpTest : public OpsTestBase {
 
     auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
 
-    RunAndFetch(root, "with_relu", output);
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
   }
 
   void RunMklFusedConv2DOp(const Tensor& image, const Tensor& filter,
@@ -149,12 +178,12 @@ class MklFusedConv2DOpTest : public OpsTestBase {
     TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_MklFusedConv2D")
                      .Input(FakeInput(dtype))
                      .Input(FakeInput(dtype))
-                     .Attr("num_args", num_args)
                      .Input(FakeInput(num_args, dtype))
                      .Input(FakeInput(DT_UINT8))
                      .Input(FakeInput(DT_UINT8))
                      .Input(FakeInput(num_args, DT_UINT8))
                      .Attr("T", dtype)
+                     .Attr("num_args", num_args)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", "SAME")
                      .Attr("fused_ops", fused_ops)
@@ -178,40 +207,11 @@ class MklFusedConv2DOpTest : public OpsTestBase {
     // Index 2 will need to be changed if the number of outputs produced
     // by MklConv2D change.
     const Tensor& output_meta_tensor = *GetOutput(2);
-    ConvMklToTF<T> conv_comp;
-    conv_comp.PerformConversion(dtype, output_tensor, output_meta_tensor,
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
                                 output);
   }
 
-  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
-                                int image_batch_count, int filter_size,
-                                int filter_count,
-                                const BiasAddGraphRunner& run_default,
-                                const BiasAddGraphRunner& run_fused) {
-    DataType dtype = DataTypeToEnum<T>::v();
-
-    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
-    image.flat<T>() = image.flat<T>().setRandom();
-
-    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
-    filter.flat<T>() = filter.flat<T>().setRandom();
-
-    const int bias_size = filter_count;
-    Tensor bias(dtype, {bias_size});
-    bias.flat<T>() = bias.flat<T>().setRandom();
-
-    Tensor conv_2d;
-    Tensor fused_conv_2d;
-
-    run_default(image, filter, bias, &conv_2d);
-    run_fused(image, filter, bias, &fused_conv_2d);
-
-    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
-    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
-
-    test::ExpectClose(conv_2d, fused_conv_2d);
-  }
-
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
   // FusedConv2D.
   void VerifyConv2DWithBias(int filter_size, int filter_count,
@@ -231,9 +231,9 @@ class MklFusedConv2DOpTest : public OpsTestBase {
                               out);
         };
 
-    VerifyBiasAddTensorsNear(depth, image_width, image_height,
-                             image_batch_count, filter_size, filter_count,
-                             run_default, run_fused);
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
   }
 
   // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
@@ -256,9 +256,9 @@ class MklFusedConv2DOpTest : public OpsTestBase {
                               {"BiasAdd", "Relu"}, out);
         };
 
-    VerifyBiasAddTensorsNear(depth, image_width, image_height,
-                             image_batch_count, filter_size, filter_count,
-                             run_default, run_fused);
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
   }
 };
 
@@ -341,8 +341,8 @@ class FusedPadConvOpTest : public OpsTestBase {
     // Compare output to expected results
     const Tensor& first = *GetOutput(0);
     const Tensor& second = *GetOutput(2);
-    ConvMklToTF<T> conv_comp;
-    conv_comp.ConvertAndCompare(dtype, first, second, expected);
+    CommonTestUtilities<T> test_util;
+    test_util.ConvertAndCompare(dtype, first, second, expected);
   }
 };
 
@@ -401,5 +401,295 @@ TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
 
   Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
 }
+
+class FilterCacheTest : public OpsTestBase {
+ public:
+  template <typename T>
+  void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& expected,
+           const bool is_filter_const) {
+    const int stride = 1;
+
+    TF_EXPECT_OK(NodeDefBuilder("conv2d_filter_cache", "_MklConv2D")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(dtype))     // Filter
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Attr("padding", "VALID")
+                     .Attr("data_format", "NHWC")
+                     .Attr("is_filter_const", is_filter_const)
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+
+    // Setting up inputs and execute
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare outputs to expected results
+    const Tensor& output = *GetOutput(0);
+    const Tensor& output_layout = *GetOutput(2);
+    CommonTestUtilities<T> conv_comp;
+    conv_comp.ConvertAndCompare(dtype, output, output_layout, expected);
+
+    // TODO(bhavanis): For now, we rely on internal performance tests to
+    // determine if filter data is being cached and reused.
+    // However, we still need to add a check here to determine if this is
+    // still the case by inspecting the contents of the persistent tensor.
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_new = *GetOutput(0);
+    const Tensor& output_layout_new = *GetOutput(2);
+    CommonTestUtilities<T> conv_comp_new;
+    conv_comp_new.ConvertAndCompare(dtype, output_new, output_layout_new,
+                                    expected);
+  }
+};
+
+TEST_F(FilterCacheTest, Conv2DFilterCacheTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 1, 2, 1}));
+  test::FillValues<float>(&expected, {312, 357});
+
+  Run<float>(DT_FLOAT, image, filter, expected, true);
+}
+
+// Testing fusion of pad and fusedconv2d
+template <typename T>
+class MklPadWithFusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 30;
+  static constexpr int kImageHeight = 28;
+  static constexpr int kImageBatchCount = 8;
+
+  // 0: top pad, 1: bottom pad, 2: left pad, 3: right pad
+  int padding_list_[4];
+
+  // Verifies that computing Pad+Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBias(int filter_size, int filter_count,
+                                  int depth = kDepth,
+                                  int image_width = kImageWidth,
+                                  int image_height = kImageHeight,
+                                  int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default = [this](const Tensor& input_data,
+                                                  const Tensor& filter_data,
+                                                  const Tensor& bias_data,
+                                                  Tensor* out) {
+      RunMklPadWithFusedConv2DAndBias(input_data, filter_data, bias_data, out);
+    };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Pad+Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBiasRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklPadWithFusedConv2DAndBiasRelu(input_data, filter_data,
+                                              bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd", "Relu"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  void RunMklPadWithFusedConv2DAndBias(const Tensor& input_data,
+                                       const Tensor& filter_data,
+                                       const Tensor& bias_data, Tensor* output,
+                                       int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunMklPadWithFusedConv2DAndBiasRelu(const Tensor& input_data,
+                                           const Tensor& filter_data,
+                                           const Tensor& bias_data,
+                                           Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunMklFusedConv2DWithPadOp(const Tensor& image, const Tensor& filter,
+                                  const std::vector<Tensor>& args,
+                                  const std::vector<string>& fused_ops,
+                                  Tensor* output, int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    const int num_args = static_cast<int>(args.size());
+    Tensor padding(DT_INT32, {4, 2});
+    test::FillValues<int32>(
+        &padding, {0, 0, padding_list_[0], padding_list_[1], padding_list_[2],
+                   padding_list_[3], 0, 0});
+
+    TF_EXPECT_OK(NodeDefBuilder("pad_fused_conv_op", "_MklPadWithFusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(num_args, dtype))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(num_args, DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Attr("T", dtype)
+                     .Attr("num_args", num_args)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "VALID")
+                     .Attr("fused_ops", fused_ops)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    // Add MKL meta input for input, filter, pad and agrs.
+    for (int i = 0; i < args.size() + 3; ++i)
+      AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_tensor = *GetOutput(0);
+    // Index 2 will need to be changed if the number of outputs produced
+    // by MklConv2D change.
+    const Tensor& output_meta_tensor = *GetOutput(2);
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
+                                output);
+  }
+
+ public:
+  void SetPaddingList(int top, int bottom, int left, int right) {
+    padding_list_[0] = top;
+    padding_list_[1] = bottom;
+    padding_list_[2] = left;
+    padding_list_[3] = right;
+  }
+};
+
+TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest);
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest,  //
+                           WithBiasAndRoundPad,          //
+                           WithBiasAndPartialPad,        //
+                           WithBiasReluAndRoundPad,      //
+                           WithBiasReluAndPartialPad);
+
+using MklPadWithFusedConv2DDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, MklPadWithFusedConv2DOpTest,
+                              MklPadWithFusedConv2DDataTypes);
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 4d46abb0a4dd232ef13c8b6b0547b0779af1f98f..bc52127b942375c89cea832e3013684687374cb6 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <vector>
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index dc84d3941e78a2232041b2dbcf83bf3545982dee..a8d1dffd4e52c8e9a16a0a82cf8c31be9cb628e9 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <limits>
 #include <vector>
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc68480bbe8b9ed509309a16df2b805fe02e20f1
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+using test::graph::Constant;
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMKL2TF(DataType dtype, const Tensor& first, const Tensor& second,
+                     Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // MKL second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody(){};
+};
+
+class QuantizedConcatTest : public OpsTestBase {
+ protected:
+  QuantizedConcatTest() {}
+
+  void TestSmall8Bit(float first_min, float first_max, float second_min,
+                     float second_max);
+  void TestSecondDim8Bit(float first_min, float first_max, float second_min,
+                         float second_max);
+};
+
+TEST_F(QuantizedConcatTest, Small8BitSameRange) {
+  // Range for both is the same, so impl can use memcpy.
+  TestSmall8Bit(0.0f, 255.0f, 0.0f, 255.0f);
+}
+
+void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max,
+                                        float second_min, float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_batch = first_batch + second_batch;
+  Tensor expected_float(
+      DT_FLOAT, {expected_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                           13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedConcatTest, SecondDim8BitSameRange) {
+  TestSecondDim8Bit(-10.0f, 150.0f, -10.0f, 150.0f);
+}
+
+void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max,
+                                            float second_min,
+                                            float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_height = first_height + second_height;
+  Tensor expected_float(
+      DT_FLOAT, {first_batch, expected_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1, 2, 3, 4,  5,  6,  13, 14, 15, 16, 17, 18,
+                           7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  // Using the same error tolerance as in Eigen QuantizedConcat test
+  test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
index 10825f696253cc6d38bbdee1e6b660d494c34088..fef2d837cf27a0854ffc34ad3d1b60831a776fbc 100644
--- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -24,8 +24,13 @@ limitations under the License.
 namespace tensorflow {
 template <class T>
 float MklFloatForOneQuantizedLevel(float range_min, float range_max) {
-  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
-  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+  int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+  int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+
+  // Adjusting for having a symmetric range.
+  // for example: for 8-bit [-127, 127] as opposed to [-128, 127].
+  if (lowest < -highest) ++lowest;
+
   const float float_for_one_quantized_level =
       (range_max - range_min) / (highest - lowest);
   return float_for_one_quantized_level;
@@ -48,6 +53,35 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a,
   *min_c = c_float_for_one_quant_level * c_lowest;
   *max_c = c_float_for_one_quant_level * c_highest;
 }
+
+template <class T1, class T2, class T3>
+void MklQuantizationRangeForMultiplication(float min_a, float max_a,
+                                           const Tensor& min_b_vector,
+                                           const Tensor& max_b_vector,
+                                           Tensor** min_c_vector,
+                                           Tensor** max_c_vector) {
+  DCHECK(min_b_vector.NumElements() == (*min_c_vector)->NumElements());
+  DCHECK(max_b_vector.NumElements() == (*max_c_vector)->NumElements());
+  size_t n_channel = min_b_vector.NumElements();
+  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+  const float* min_b = min_b_vector.flat<float>().data();
+  const float* max_b = max_b_vector.flat<float>().data();
+  float* min_c = (*min_c_vector)->flat<float>().data();
+  float* max_c = (*max_c_vector)->flat<float>().data();
+#pragma omp parallel for
+  for (size_t n = 0; n < n_channel; ++n) {
+    float a_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T1>(min_a, max_a);
+    float b_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]);
+    float c_float_for_one_quant_level =
+        a_float_for_one_quant_level * b_float_for_one_quant_level;
+    min_c[n] = c_float_for_one_quant_level * c_lowest;
+    max_c[n] = c_float_for_one_quant_level * c_highest;
+  }
+}
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops_test.cc b/tensorflow/core/kernels/mkl_quantized_conv_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e599d3d9f8e86e4e78297ea27b0b030444ed94a
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops_test.cc
@@ -0,0 +1,458 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+// TODO(bhavanis): Move ConvMklToTF to mkl_test_util.h as it is used by
+// most unit tests.
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMklToTF(DataType dtype, const Tensor& input,
+                      const Tensor& input_metadata_tensor, Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // MKL metadata tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(input.shape(), input.flat<T>());
+    AddInputFromArray<uint8>(input_metadata_tensor.shape(),
+                             input_metadata_tensor.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody() {}
+};
+
+class QuantizedConv2DTest : public OpsTestBase {
+ protected:
+  void ConfigureQuantizedConv2D(const int& stride = 1) {
+    TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "_MklQuantizedConv2D")
+                     .Input(FakeInput(DT_QUINT8))  // Input
+                     .Input(FakeInput(DT_QINT8))   // Filter
+                     .Input(FakeInput(DT_FLOAT))   // Min input
+                     .Input(FakeInput(DT_FLOAT))   // Max input
+                     .Input(FakeInput(DT_FLOAT))   // Min filter
+                     .Input(FakeInput(DT_FLOAT))   // Max filter
+                     //  MKL metadata tensors //
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     ///////////////////////////
+                     .Attr("Tinput", DataTypeToEnum<quint8>::v())
+                     .Attr("Tfilter", DataTypeToEnum<qint8>::v())
+                     .Attr("T", DataTypeToEnum<quint8>::v())
+                     .Attr("out_type", DataTypeToEnum<qint32>::v())
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Attr("_kernel", "QuantizedMklOp")
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+// Output -> float
+TEST_F(QuantizedConv2DTest, Small) {
+  const int stride = 1;
+  ConfigureQuantizedConv2D(stride);
+
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+
+  // Image -> uint8
+  const float image_min = 0.0f;
+  const float image_max = 255.0f;
+
+  // The image matrix is:
+  // |  1 |  2 |  3 |  4 |
+  // |  5 |  6 |  7 |  8 |
+  // |  9 | 10 | 11 | 12 |
+  Tensor image_float(DT_FLOAT,
+                     {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor image_quantized =
+      FloatTensorToQuantized<quint8>(image_float, image_min, image_max);
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+
+  // Filter -> int8 with symmetric range
+  const float filter_min = -127.0f;
+  const float filter_max = 127.0f;
+
+  // The filter matrix is:
+  // | 1 | 4 | 7 |
+  // | 2 | 5 | 8 |
+  // | 3 | 6 | 9 |
+  Tensor filter_float(DT_FLOAT,
+                      {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter_float, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+  Tensor filter_quantized =
+      FloatTensorToQuantized<qint8>(filter_float, filter_min, filter_max);
+
+  AddInputFromArray<quint8>(image_quantized.shape(),
+                            image_quantized.flat<quint8>());
+  AddInputFromArray<qint8>(filter_quantized.shape(),
+                           filter_quantized.flat<qint8>());
+  AddInputFromArray<float>(TensorShape({1}), {image_min});
+  AddInputFromArray<float>(TensorShape({1}), {image_max});
+  AddInputFromArray<float>(TensorShape({1}), {filter_min});
+  AddInputFromArray<float>(TensorShape({1}), {filter_max});
+
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+  // the input set to zero because we're using the 'SAME' padding mode.
+  // The calculations behind the expected output are:
+  // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
+  // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
+  // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
+  // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
+  // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
+  // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+  // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+  // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
+  // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
+  // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
+  // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
+  // (1*7)+(4*8)+(7*0)+(2*11)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+  // This means we should end up with this matrix:
+  // |  105  |  150  |  183  |   95  |
+  // |  235  |  312  |  357  |  178  |
+  // |  187  |  234  |  261  |  121  |
+
+  // Output -> float
+  const int expected_width = image_width;
+  const int expected_height = image_height;
+  Tensor expected_float(
+      DT_FLOAT, TensorShape({image_batch_count, expected_height, expected_width,
+                             filter_count}));
+  test::FillValues<float>(&expected_float, {105, 150, 183, 95, 235, 312, 357,
+                                            178, 187, 234, 261, 121});
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& output_mkl_metadata = *GetOutput(3);
+
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMklToTF<qint32>(DT_QINT32, output, output_mkl_metadata,
+                                   output_quantized);
+
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<qint32>(output_quantized, output_min, output_max);
+
+  test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+// Output -> qint32
+TEST_F(QuantizedConv2DTest, Small32Bit) {
+  const int stride = 1;
+  ConfigureQuantizedConv2D(stride);
+
+  // The illustrations and details regarding inputs and outputs
+  // are in TEST_F(QuantizedConv2DTest, Small)
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  AddInputFromArray<quint8>(
+      TensorShape({image_batch_count, image_height, image_width, depth}),
+      {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  AddInputFromArray<qint8>(
+      TensorShape({filter_size, filter_size, depth, filter_count}),
+      {10, 40, 70, 20, 50, 80, 30, 60, 90});
+
+  // Image -> uint8
+  AddInputFromArray<float>(TensorShape({1}), {0.0f});
+  AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+  // Filter -> int8 with symmetric range
+  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {127.0f});
+
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Output -> qint32
+  const int expected_width = image_width;
+  const int expected_height = image_height;
+  Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+                                          expected_width, filter_count}));
+  test::FillValues<qint32>(
+      &expected, {10500, 15000, 18300, 9500, 23500, 31200, 35700, 17800, 18700,
+                  23400, 26100, 12100});
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& output_mkl_metadata = *GetOutput(3);
+
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMklToTF<qint32>(DT_QINT32, output, output_mkl_metadata,
+                                   output_quantized);
+
+  test::ExpectTensorEqual<qint32>(expected, output_quantized);
+}
+
+// Output -> qint32
+TEST_F(QuantizedConv2DTest, Small32BitWithPadding) {
+  const int stride = 1;
+  TF_ASSERT_OK(NodeDefBuilder("quantized_conv_op", "_MklQuantizedConv2D")
+                   .Input(FakeInput(DT_QUINT8))  // Input
+                   .Input(FakeInput(DT_QINT8))   // Filter
+                   .Input(FakeInput(DT_FLOAT))   // Min input
+                   .Input(FakeInput(DT_FLOAT))   // Max input
+                   .Input(FakeInput(DT_FLOAT))   // Min filter
+                   .Input(FakeInput(DT_FLOAT))   // Max filter
+                   //  MKL metadata tensors //
+                   .Input(FakeInput(DT_UINT8))
+                   .Input(FakeInput(DT_UINT8))
+                   .Input(FakeInput(DT_UINT8))
+                   .Input(FakeInput(DT_UINT8))
+                   .Input(FakeInput(DT_UINT8))
+                   .Input(FakeInput(DT_UINT8))
+                   ///////////////////////////
+                   .Attr("Tinput", DataTypeToEnum<quint8>::v())
+                   .Attr("Tfilter", DataTypeToEnum<qint8>::v())
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("out_type", DataTypeToEnum<qint32>::v())
+                   .Attr("strides", {1, stride, stride, 1})
+                   .Attr("padding", "SAME")
+                   .Attr("padding_list", {0, 0, 1, 1, 1, 1, 0, 0})
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // The illustrations and details regarding inputs and outputs
+  // are in TEST_F(QuantizedConv2DTest, Small)
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  AddInputFromArray<quint8>(
+      TensorShape({image_batch_count, image_height, image_width, depth}),
+      {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  AddInputFromArray<qint8>(
+      TensorShape({filter_size, filter_size, depth, filter_count}),
+      {10, 40, 70, 20, 50, 80, 30, 60, 90});
+
+  // Image -> uint8
+  AddInputFromArray<float>(TensorShape({1}), {0.0f});
+  AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+  // Filter -> int8 with symmetric range
+  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {127.0f});
+
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Output -> qint32
+  const int expected_width = image_width;
+  const int expected_height = image_height;
+  Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+                                          expected_width, filter_count}));
+  test::FillValues<qint32>(
+      &expected, {10500, 15000, 18300, 9500, 23500, 31200, 35700, 17800, 18700,
+                  23400, 26100, 12100});
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& output_mkl_metadata = *GetOutput(3);
+
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMklToTF<qint32>(DT_QINT32, output, output_mkl_metadata,
+                                   output_quantized);
+
+  test::ExpectTensorEqual<qint32>(expected, output_quantized);
+}
+
+// Output -> qint32
+TEST_F(QuantizedConv2DTest, OddPadding) {
+  const int stride = 2;
+  ConfigureQuantizedConv2D(stride);
+
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 4;
+  const int image_batch_count = 1;
+  AddInputFromArray<quint8>(
+      TensorShape({image_batch_count, image_height, image_width, depth}),
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  AddInputFromArray<qint8>(
+      TensorShape({filter_size, filter_size, depth, filter_count}),
+      {1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  // Image -> uint8
+  AddInputFromArray<float>(TensorShape({1}), {0.0f});
+  AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+  // Filter -> int8 with symmetric range
+  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {127.0f});
+
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Output -> qint32
+  const int expected_width = image_width / stride;
+  const int expected_height = image_height / stride;
+  Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+                                          expected_width, filter_count}));
+  test::FillValues<qint32>(&expected, {348, 252, 274, 175});
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& output_mkl_metadata = *GetOutput(3);
+
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMklToTF<qint32>(DT_QINT32, output, output_mkl_metadata,
+                                   output_quantized);
+
+  test::ExpectTensorEqual<qint32>(expected, output_quantized);
+}
+
+// Output -> qint32
+TEST_F(QuantizedConv2DTest, OddPaddingBatch) {
+  const int stride = 2;
+  ConfigureQuantizedConv2D(stride);
+
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 4;
+  const int image_batch_count = 3;
+  AddInputFromArray<quint8>(
+      TensorShape({image_batch_count, image_height, image_width, depth}),
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  AddInputFromArray<qint8>(
+      TensorShape({filter_size, filter_size, depth, filter_count}),
+      {1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  // Image -> uint8
+  AddInputFromArray<float>(TensorShape({1}), {0.0f});
+  AddInputFromArray<float>(TensorShape({1}), {255.0f});
+
+  // Filter -> int8 with symmetric range
+  AddInputFromArray<float>(TensorShape({1}), {-127.0f});
+  AddInputFromArray<float>(TensorShape({1}), {127.0f});
+
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Output -> qint32
+  const int expected_width = image_width / stride;
+  const int expected_height = image_height / stride;
+  Tensor expected(DT_QINT32, TensorShape({image_batch_count, expected_height,
+                                          expected_width, filter_count}));
+  test::FillValues<qint32>(
+      &expected, {348, 252, 274, 175, 348, 252, 274, 175, 348, 252, 274, 175});
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& output_mkl_metadata = *GetOutput(3);
+
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMklToTF<qint32>(DT_QINT32, output, output_mkl_metadata,
+                                   output_quantized);
+
+  test::ExpectTensorEqual<qint32>(expected, output_quantized);
+}
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index d8ab1cd25b9e09e6b25e2b0454567caa3dcea9e0..b31e5f0cacfb3560de98da78ff4d6c69a141c500 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,36 +16,27 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::algorithm;
 using mkldnn::eltwise_bounded_relu;
 using mkldnn::eltwise_elu;
+using mkldnn::eltwise_forward;
 using mkldnn::eltwise_relu;
 using mkldnn::eltwise_tanh;
 using mkldnn::memory;
 using mkldnn::prop_kind;
-using mkldnn::relu_backward;
-using mkldnn::relu_forward;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML_ONLY
-
 template <typename T>
 class MklEltwiseFwdParams {
  public:
@@ -451,335 +442,8 @@ class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-#endif
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-struct MklReluHelpers {
-  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
-                                     const Tensor& a) {
-    OP_REQUIRES(context, a.IsSameSize(g),
-                errors::InvalidArgument("g and a must be the same size"));
-  }
-  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
-                               const Tensor& a) {
-    ValidateSameSizeHelper(context, g, a);
-    return context->status().ok();
-  }
-};
-
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklReluOp : public OpKernel {
- public:
-  ~MklReluOp() {}
-
-  explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklReluOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
-      const TensorShape& o_shape = input.shape();
-      Tensor* out_tensor = nullptr;
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &out_tensor, o_shape,
-                                mkl_context.output_shape);
-      void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
-      (static_cast<T*>(out_o))[0] =
-          std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
-      return;
-    }
-
-    // Generate size, stride for input if input is in MKL format.
-    if (input_in_mkl_format) {
-      mkl_context.in_dims = mkl_context.input_shape.GetDimension();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
-        mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
-      }
-    } else {
-      mkl_context.in_dims = input.dims();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
-      }
-      mkl_context.in_strides[0] = 1;
-      for (int i = 1; i < mkl_context.in_dims; i++) {
-        mkl_context.in_strides[i] =
-            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-      }
-    }
-
-    float negative_slope = 0.0;
-    mkl_context.MklCreateInputLayouts(context);
-    CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
-                                      mkl_context.lt_input, negative_slope),
-             E_SUCCESS);
-
-    Tensor* output = nullptr;
-
-    if (input_in_mkl_format) {
-      TensorShape tf_shape;
-      mkl_context.output_shape.SetMklTensor(true);
-      mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
-                                            dnnResourceDst);
-      mkl_context.output_shape.SetTfLayout(
-          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                          mkl_context.output_shape.GetMklLayout())) /
-                      sizeof(T));
-      AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                                mkl_context.output_shape);
-    } else {
-      const TensorShape& o_shape = input.shape();
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                                mkl_context.output_shape);
-    }
-
-    void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
-
-    mkl_context.relu_res[dnnResourceDst] = user_o;
-    mkl_context.relu_res[dnnResourceSrc] = user_i;
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
-             E_SUCCESS);
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, output_shape;
-    dnnPrimitive_t prim_relu_fwd = nullptr;
-    void* relu_res[dnnResourceNumber];
-    dnnLayout_t lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      dnnDelete_F32(prim_relu_fwd);
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-    }
-  } MklReluOpContext;
-};
-
-template <typename Device, typename T>
-class MklReluGradOp : public OpKernel {
- public:
-  ~MklReluGradOp() {}
-
-  explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override;
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, grad_shape, output_shape;
-    void* relu_res[dnnResourceNumber];
-    dnnPrimitive_t prim_relu_bwd;
-    dnnLayout_t lt_input, lt_grad;
-
-    void MklPrepareReluGradInputs(OpKernelContext* context,
-                                  Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& g = MklGetInput(context, 0);
-      const Tensor& a = MklGetInput(context, 1);
-      void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-      void* mkl_buffer_convert = nullptr;
-
-      dnnPrimitive_t cv_input_to_grad = nullptr;
-
-      // if input and grad are not in the same layout,
-      // do a conversion between them.
-      if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
-                       &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert),
-                 E_SUCCESS);
-        relu_res[dnnResourceSrc] = mkl_buffer_convert;
-        dnnDelete_F32(cv_input_to_grad);
-      } else {
-        relu_res[dnnResourceSrc] = buf_input;
-      }
-
-      void* buf_grad = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-      relu_res[dnnResourceDiffDst] = buf_grad;
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      if (!input_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-
-      if (!grad_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
-      }
-    }
-
-    void MklCleanup() {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      dnnDelete_F32(prim_relu_bwd);
-      if (!input_is_mkl) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      if (!grad_is_mkl) {
-        dnnLayoutDelete_F32(lt_grad);
-      }
-    }
-  } MklReluGradOpContext;
-};
-
-template <typename Device, typename T>
-void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
-  MklReluGradOpContext mkl_context;
-  const Tensor& g = MklGetInput(context, 0);
-  const Tensor& a = MklGetInput(context, 1);
-
-  void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-  void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-  GetMklShape(context, 0, &mkl_context.grad_shape);
-  GetMklShape(context, 1, &mkl_context.input_shape);
-
-  bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
-  bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
-  if (!input_is_mkl && !grad_is_mkl &&
-      !MklReluHelpers::ValidateSameSize(context, g, a))
-    return;
-  Tensor* output = nullptr;
-
-  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
-    // handle the scalar case
-    const TensorShape& g_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, g_shape,
-                              mkl_context.output_shape);
-
-    void* out_o = static_cast<void*>(output->flat<T>().data());
-    (static_cast<T*>(out_o))[0] =
-        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
-    return;
-  }
-
-  // generate size, stride for input if input/grad is in mkl format.
-  if (grad_is_mkl || input_is_mkl) {
-    const MklShape* tmp_mkl_shape =
-        (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
-
-    mkl_context.in_dims = tmp_mkl_shape->GetDimension();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
-      mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
-    }
-  } else {
-    mkl_context.in_dims = g.dims();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
-    }
-    mkl_context.in_strides[0] = 1;
-    for (int i = 1; i < mkl_context.in_dims; i++) {
-      mkl_context.in_strides[i] =
-          mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-    }
-  }
-
-  mkl_context.MklCreateInputLayouts(context);
-  float negative_slope = 0.0;
-  CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
-                                     mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope),
-           E_SUCCESS);
-  Tensor mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
-
-  if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
-    TensorShape tf_shape;
-    mkl_context.output_shape.SetMklTensor(true);
-    mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
-                                          dnnResourceDiffSrc);
-    mkl_context.output_shape.SetTfLayout(
-        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
-    // shape of one that is in mkl layout.
-    if (grad_is_mkl == true) {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-    }
-
-    tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                              mkl_context.output_shape);
-  } else {
-    const TensorShape& o_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                              mkl_context.output_shape);
-  }
-
-  mkl_context.relu_res[dnnResourceDiffSrc] =
-      static_cast<void*>(output->flat<T>().data());
-
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
-  mkl_context.MklCleanup();
-}
-
-#else  // INTEL_MKL_ML_ONLY
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
  public:
@@ -877,7 +541,7 @@ class MklReluOpBase : public OpKernel {
 
  private:
   engine cpu_engine = engine(engine::cpu, 0);
-  std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+  std::shared_ptr<eltwise_forward::primitive_desc> relu_fwd_pd;
 
  protected:
   float alpha_;
@@ -1045,7 +709,7 @@ class MklReluGradOpBase : public OpKernel {
 
  private:
   engine cpu_engine = engine(engine::cpu, 0);
-  std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+  std::shared_ptr<eltwise_forward::primitive_desc> relu_fwd_pd;
 
  protected:
   float alpha_;
@@ -1399,8 +1063,6 @@ class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
   }
 };
 
-#endif
-
 // register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
   REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
@@ -1415,8 +1077,6 @@ class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifndef INTEL_MKL_ML_ONLY
-
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
   REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
@@ -1470,8 +1130,6 @@ TF_CALL_float(REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES);
                           MklLeakyReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES);
 
-#endif
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..767a6f1c3976d335bfd660f3a6990c03805843ba
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include <math.h>
+#include <limits>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+class MklRequantizationRangePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizationRangePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("clip_value_max", &clip_value_max_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(kInputTensorIndex);
+    const Tensor& input_min = ctx->input(kInputMinIndex);
+    const Tensor& input_max = ctx->input(kInputMaxIndex);
+
+    const size_t depth = input_max.NumElements();
+    OP_REQUIRES(
+        ctx, input_min.dim_size(0) == depth,
+        errors::InvalidArgument("input_min has incorrect size, expected ",
+                                depth, " was ", input_min.dim_size(0)));
+    OP_REQUIRES(
+        ctx, input_max.dim_size(0) == depth,
+        errors::InvalidArgument("input_max has incorrect size, expected ",
+                                depth, " was ", input_max.dim_size(0)));
+
+    const float* input_min_data = input_min.flat<float>().data();
+    const float* input_max_data = input_max.flat<float>().data();
+    std::vector<float> ranges(depth);
+    bool is_non_negative = true;
+    Eigen::array<int, 2> shuffling({1, 0});
+    auto input_matrix = input.flat_inner_dims<qint32>();
+
+    // TODO: verify performance of not transposing and finding the min max
+    // directly from input_matrix vs the one presented below of transposing and
+    // using the transposed matrix as the transposing operation in itself might
+    // be more costly.
+    // Note that this operation is a calibration step for quantization and will
+    // cease to exist in the final inference graph(will exist as a const node).
+    auto transposed_input = input_matrix.shuffle(shuffling);
+
+    // Find the ranges of each channel in parallel.
+    float out_min_max = std::numeric_limits<float>::min();
+#pragma omp parallel for reduction(max : out_min_max)
+    for (size_t i = 0; i < depth; ++i) {
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> min =
+          transposed_input.chip<0>(i).minimum();
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> max =
+          transposed_input.chip<0>(i).maximum();
+      const int32_t min_per_channel = min();
+      const int32_t max_per_channel = max();
+      const int32_t abs_max =
+          std::max(std::abs(min_per_channel), std::abs(max_per_channel));
+      float scale =
+          std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i]));
+      ranges[i] =
+          scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31);
+      if (min_per_channel < 0) is_non_negative = false;
+
+      // Thread-local out_min_max.
+      out_min_max = std::max(out_min_max, ranges[i]);
+    }
+    // All local out_min_max gets max-reduced into one global out_min_max at
+    // the end of the loop by specifying reduction(max:out_min_max) along with
+    // omp parallel for.
+
+    // Fixing max to clip_value_max_ (example 6.0 to support relu6)
+    if (out_min_max > clip_value_max_) out_min_max = clip_value_max_;
+
+    Tensor* output_min = nullptr;
+    Tensor* output_max = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+    output_min->flat<float>()(0) = is_non_negative ? 0.0f : -out_min_max;
+    output_max->flat<float>()(0) = out_min_max;
+  }
+
+ private:
+  float clip_value_max_ = std::numeric_limits<float>::infinity();
+  const int kInputTensorIndex = 0;
+  const int kInputMinIndex = 1;
+  const int kInputMaxIndex = 2;
+  const int kOutputMinIndex = 0;
+  const int kOutputMaxIndex = 1;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizationRangePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T"),
+                        MklRequantizationRangePerChannelOp);
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_ops_test.cc b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9961462754f4c2378f7e46931d7878ca283278a5
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
@@ -0,0 +1,300 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#include <cmath>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+class MklRequantizatedOpsTest : public OpsTestBase {};
+
+class MklRequantizatedOpsTestHelper : public OpsTestBase {
+ public:
+  void Setup(Tensor &input_tensor_qint32, float &range_weights_ch1,
+             float &range_weights_ch2);
+  void TestBody() {}
+};
+
+void MklRequantizatedOpsTestHelper::Setup(Tensor &input_tensor_qint32,
+                                          float &range_weights_ch1,
+                                          float &range_weights_ch2) {
+  // Step 1: Input range assumptions
+  // -------------------------------
+  // Assume input tensor T (NHWC) in FP32 has range [0, 5.0]   size nt*ht*wt*ct
+  // Assume input filter W (NHWC) with 2 output channels of    size nw*ht*wt*2
+  // logically,   filter W has 2 channels W1 and W2 each of    size nw*ht*wt*1
+  // Assume input filter W1(NHWC) in FP32 has range [-2.0, 2.0]size nw*ht*wt*1
+  // Assume input filter W2(NHWC) in FP32 has range [-3.0, 3.0]size nw*ht*wt*1
+
+  // Step 2: Quantization details (per channel)
+  // ------------------------------------------
+  // T and W are quantized using a quantize op.
+  // The input tensor T (NHWC) is quantized to unsigned int8.
+  // Hence T's max value is mapped to ((2^8-1) = 255).
+  // The input filter W (NHWC) is quantized to signed int8.
+  // Hence W's max value is mapped to ((2^7)-1 = 127)).
+
+  // Range of quantized T  in uint8[0  , 255] maps to orig T  in FP32[0   , 5.0]
+  // Range of quantized W1 in int8[-127, 127] maps to orig W1 in FP32[-2.0, 2.0]
+  // Range of quantized W2 in int8[-127, 127] maps to orig W2 in FP32[-3.0, 3.0]
+
+  // Hence the resolution of quantized T will be 5.0/255
+  // Hence the resolution of quantized W1 will be 2.0/127
+  // Hence the resolution of quantized W2 will be 3.0/127
+
+  // Step 3: Assumption of quantizedconv on quantized input&weights(per channel)
+  // ---------------------------------------------------------------------------
+  // The input T and weights W1 (or W2) will be convolved.
+  // The output tensor T is in int32 whose range is [-2^31, 2^31).
+  // For simplicity and symmetry, we truncate the above range to (-2^31, 2^31).
+  // The range of convolved T*W1 is ((2^31)-1) * 5.0/255 * 2.0/127 = 663110.59
+  // So the range of convolved T*W1 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32[0, 5.0] * [-2.0, 2.0] is [-663110.59, 663110.59].
+
+  // The range of convolved T*W2 is (2^31-1) * 5.0/255 * 3.0/127 = 994665.88
+  // So the range of convolved T*W2 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32 [0, 5.0] * [-3.0, 3.0]  is [-994665.88, 994665.88]
+
+  // Step 4: Assumption output above is fed to requantization_range_perchannel
+  // --------------------------------------------------------------------------
+  // Here we recalculate the new range for convolved T*W so that we
+  // make good use in int8 quantization from int32 to int8.
+
+  // We assume the above operations are performed and use these values above
+  // as ranges for requantization_range_perchannel_op.
+  range_weights_ch1 = 663110.59;  // For W1 channel
+  range_weights_ch2 = 994665.88;  // For W2 Channel
+
+  // We Fill the input tensor T qint32 with arbitrary int32 values
+  test::FillValues<qint32>(
+      &input_tensor_qint32,
+      {-1000, -2000,  2000,   4000,   -3000,  -6000,  4000,   8000,
+       5000,  10000,  -6000,  -12000, 7000,   14000,  8000,   16000,
+       9000,  -18000, -10000, -20000, 11000,  22000,  -12000, -24000,
+       13000, 26000,  14000,  28000,  -15000, -30000, 16000,  32000});
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // See test RequantizationRangePerChannelTest_Basic and/or
+  // test RequantizationRangePerChannelTest_ClipMax
+}
+
+// Tests the RequantizationRangePerChannel op wherein the range
+// of the weights is calculated per channel.
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_Basic) {
+  // Let us set up the tensor and inputs before we run this op.
+  float clip_value_max = static_cast<float>((1L << 31) - 1);
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // Define the shape of T.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range Nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the Expected Outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-14.8217, output_min, 0.002);
+  EXPECT_NEAR(14.8217, output_max, 0.002);
+
+  // Output range is made use in RequantizePerChannelTest_Basic
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_ClipMax) {
+  // Let us setup the tensor and inputs before we run this op.
+  float clip_value_max = 6;  // Can be used as 6 for Relu 6 activations.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define and input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the expected outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-6.0, output_min, 0.002);  // Values are aligned with clip_value.
+  EXPECT_NEAR(6.0, output_max, 0.002);   // Values are aligned with clip_value.
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizePerChannelTest_Basic) {
+  // Let us setup the tensor and inputs before we run this op.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define an input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 7: Define and run requantize_perchannel
+  // --------------------------------------------
+  // The output of requantization_range_op_per_channel which calculated the
+  // new ranges of int8 is fed to the requantize per channel op.
+  // Here the values of convolved T*W is converted from int32 to int8.
+
+  TF_ASSERT_OK(NodeDefBuilder("requantize_per_channel", "RequantizePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("out_type", DataTypeToEnum<qint8>::v())
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input Nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Calculate the min and max from Step 6 above
+  // in RequantizationRangePerChannelTest_Basic
+  float range_op_output_min = -14.8217;
+  float range_op_output_max = 14.8217;
+
+  // Add the requested_min and requested_max stored from Step 6.
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_min});
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Verify the output with the expected output
+  Tensor output = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  EXPECT_NEAR(range_op_output_min, output_min, 0.002);
+  EXPECT_NEAR(range_op_output_max, output_max, 0.002);
+}
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c1a01f8311e81fa2f0dd0945569f0b6980b0be
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
@@ -0,0 +1,172 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+#include <math.h>
+
+#include "mkldnn.hpp"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Toutput>
+class MklRequantizePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_type_));
+    OP_REQUIRES(ctx, out_type_ == DT_QINT8 || out_type_ == DT_QUINT8,
+                errors::InvalidArgument(
+                    "out_type must be qint8 or quint8, but got: " + out_type_));
+  }
+  virtual ~MklRequantizePerChannelOp() {}
+  void Compute(OpKernelContext* ctx) override {
+    try {
+      const Tensor& input = ctx->input(kInputTensorIndex);
+      const Tensor& input_min_vec = ctx->input(kInputMinVecIndex);
+      float* input_min_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_min_vec.flat<float>().data()));
+      const Tensor& input_max_vec = ctx->input(kInputMaxVecIndex);
+      float* input_max_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_max_vec.flat<float>().data()));
+
+      const Tensor& input_requested_min = ctx->input(this->kRequestMinIndex);
+      const float input_requested_min_float =
+          input_requested_min.flat<float>()(0);
+      const Tensor& input_requested_max = ctx->input(this->kRequestMaxIndex);
+      const float input_requested_max_float =
+          input_requested_max.flat<float>()(0);
+
+      size_t depth = input_min_vec.NumElements();
+      OP_REQUIRES(
+          ctx, input_min_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_min has incorrect size, expected ",
+                                  depth, " was ", input_min_vec.dim_size(0)));
+      OP_REQUIRES(
+          ctx, input_max_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_max has incorrect size, expected ",
+                                  depth, " was ", input_max_vec.dim_size(0)));
+
+      if (out_type_ == DT_QINT8) DCHECK(input_requested_min_float < 0.0f);
+
+      const float factor = (out_type_ == DT_QINT8) ? 127.0f : 255.0f;
+      const float requested_min_max =
+          std::max(std::abs(input_requested_min_float),
+                   std::abs(input_requested_max_float));
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputTensorIndex,
+                                               input.shape(), &output));
+
+      std::vector<float> scales(depth);
+      for (int i = 0; i < depth; ++i) {
+        float min_max_from_vec = std::max(std::abs(input_min_vec_data[i]),
+                                          std::abs(input_max_vec_data[i]));
+        scales[i] = factor * (min_max_from_vec / requested_min_max /
+                              static_cast<float>(1L << 31));
+      }
+
+      mkldnn::primitive_attr reorder_attr;
+      reorder_attr.set_output_scales(2, scales);
+
+      memory::dims dims_mkl_order =
+          TFShapeToMklDnnDimsInNCHW(input.shape(), FORMAT_NHWC);
+      memory::desc input_md = memory::desc(dims_mkl_order, MklDnnType<qint32>(),
+                                           memory::format::nhwc);
+      memory::desc output_md =
+          (out_type_ == DT_QINT8)
+              ? memory::desc(dims_mkl_order, MklDnnType<qint8>(),
+                             memory::format::nhwc)
+              : memory::desc(dims_mkl_order, MklDnnType<quint8>(),
+                             memory::format::nhwc);
+
+      memory::primitive_desc input_pd =
+          memory::primitive_desc(input_md, cpu_engine_);
+      memory::primitive_desc output_pd =
+          memory::primitive_desc(output_md, cpu_engine_);
+
+      void* input_buf =
+          static_cast<void*>(const_cast<qint32*>(input.flat<qint32>().data()));
+      void* output_buf;
+      if (out_type_ == DT_QINT8) {
+        output_buf = static_cast<void*>(
+            const_cast<qint8*>(output->flat<qint8>().data()));
+      } else {
+        output_buf = static_cast<void*>(
+            const_cast<quint8*>(output->flat<quint8>().data()));
+      }
+
+      std::unique_ptr<memory> input_mem_prim_(new memory(input_pd, input_buf));
+      std::unique_ptr<memory> output_mem_prim_(
+          new memory(output_pd, output_buf));
+
+      mkldnn::reorder::primitive_desc reorder_pd =
+          mkldnn::reorder::primitive_desc(input_pd, output_pd, reorder_attr);
+      std::vector<mkldnn::primitive> net;
+      net.push_back(
+          mkldnn::reorder(reorder_pd, *input_mem_prim_, *output_mem_prim_));
+      stream(stream::kind::eager).submit(net).wait();
+
+      Tensor* output_min = nullptr;
+      Tensor* output_max = nullptr;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+
+      output_min->flat<float>()(0) = input_requested_min_float;
+      output_max->flat<float>()(0) = input_requested_max_float;
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          ctx, errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  const int kInputTensorIndex = 0;
+  const int kInputMinVecIndex = 1;
+  const int kInputMaxVecIndex = 2;
+  const int kRequestMinIndex = 3;
+  const int kRequestMaxIndex = 4;
+  const int kOutputTensorIndex = 0;
+  const int kOutputMinIndex = 1;
+  const int kOutputMaxIndex = 2;
+  DataType out_type_;
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T")
+                            .TypeConstraint<qint8>("out_type"),
+                        MklRequantizePerChannelOp<CPUDevice, qint8>);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index 82dfece4a2a5c2c79125e662bc327dc16fe22b02..46852167ae0e566f6d3141cad5dcaface7e62ee3 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -53,6 +53,20 @@ struct MultinomialFunctor {
                   typename TTypes<OutputType>::Matrix output);
 };
 
+#if GOOGLE_CUDA
+extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
+extern template struct MultinomialFunctor<GPUDevice, float, int32>;
+extern template struct MultinomialFunctor<GPUDevice, double, int32>;
+extern template struct MultinomialFunctor<GPUDevice, int32, int32>;
+extern template struct MultinomialFunctor<GPUDevice, int64, int32>;
+
+extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int64>;
+extern template struct MultinomialFunctor<GPUDevice, float, int64>;
+extern template struct MultinomialFunctor<GPUDevice, double, int64>;
+extern template struct MultinomialFunctor<GPUDevice, int32, int64>;
+extern template struct MultinomialFunctor<GPUDevice, int64, int64>;
+#endif  // GOOGLE_CUDA
+
 template <typename T, typename OutputType>
 struct MultinomialFunctor<CPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const CPUDevice& d,
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index 1603a2aa869e4959713741bfb501798193a63d42..2f4a5e9aa03a27eee5b497ebd5adfedc7f02623c 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -148,7 +148,7 @@ class Mutex : public ResourceBase {
             fn_(Status::OK(),
                 SharedLockReleaser{std::make_shared<LockReleaser>(this)});
           } else {
-            fn_(errors::Cancelled("Lock acqusition cancelled."),
+            fn_(errors::Cancelled("Lock acquisition cancelled."),
                 SharedLockReleaser{nullptr});
           }
         },
@@ -242,10 +242,24 @@ class ConsumeMutexLockOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("MutexLock").Device(DEVICE_CPU), MutexLockOp);
 
-REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("MutexLock")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("mutex_lock")
+                            .HostMemory("mutex"),
+                        MutexLockOp);
+
+REGISTER_KERNEL_BUILDER(
+    Name("MutexV2").Device(DEVICE_CPU).HostMemory("resource"),
+    ResourceHandleOp<Mutex>);
+
+REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_GPU),
                         ResourceHandleOp<Mutex>);
 
 REGISTER_KERNEL_BUILDER(Name("ConsumeMutexLock").Device(DEVICE_CPU),
                         ConsumeMutexLockOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ConsumeMutexLock").Device(DEVICE_GPU).HostMemory("mutex_lock"),
+    ConsumeMutexLockOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 0e820bbb6208ae9c13ac2fb33f67590b9e66ba7e..b218f62ddd9a02026bd654fd76dd2223152da9a8 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
 #include "public/gemmlowp.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/nextafter_op.cc b/tensorflow/core/kernels/nextafter_op.cc
index bcc9d4ee1f60af7ffdcc0798c3e63063852c2dd3..6166a1053f32c0b0b7fba4ceda69ad3126346f65 100644
--- a/tensorflow/core/kernels/nextafter_op.cc
+++ b/tensorflow/core/kernels/nextafter_op.cc
@@ -1,4 +1,3 @@
-
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,26 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/nextafter_op.h"
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
 
-namespace functor {
-
-template <typename T>
-struct nextafter_op {
-  EIGEN_EMPTY_STRUCT_CTOR(nextafter_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x1,
-                                                           const T& x2) const {
-    return std::nextafter(x1, x2);
-  }
-};
-
-template <typename T>
-struct nextafter : base<T, nextafter_op<T>> {};
-
-}  // namespace functor
-
 REGISTER2(BinaryOp, CPU, "NextAfter", functor::nextafter, float, double);
 
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                     \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("NextAfter").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::nextafter<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER2(BinaryOp, GPU, "NextAfter", functor::nextafter, float, double);
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nextafter_op.h b/tensorflow/core/kernels/nextafter_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..64374980f2d5aec7c2d5a9011f14280cd6c394ed
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+#define TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T>
+struct nextafter_op {
+  EIGEN_EMPTY_STRUCT_CTOR(nextafter_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x1,
+                                                           const T& x2) const {
+    return std::nextafter(x1, x2);
+  }
+};
+
+template <typename T>
+struct nextafter : base<T, nextafter_op<T>> {};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
diff --git a/tensorflow/core/kernels/nextafter_op_gpu.cu.cc b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2321c6a882c425f9851cb59a48e5b4c5aed9cb5
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/nextafter_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+DEFINE_BINARY2(nextafter, float, double);
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 37f615abd97044caa7703837714840b8d451d420..482b227ccdc8316cf336eb1f4761c6c866da7399 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include <vector>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -74,6 +74,34 @@ static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
               errors::InvalidArgument("boxes must have 4 columns"));
 }
 
+static inline void CheckCombinedNMSScoreSizes(OpKernelContext* context,
+                                              int num_boxes,
+                                              const Tensor& scores) {
+  // The shape of 'scores' is [batch_size, num_boxes, num_classes]
+  OP_REQUIRES(context, scores.dims() == 3,
+              errors::InvalidArgument("scores must be 3-D",
+                                      scores.shape().DebugString()));
+  OP_REQUIRES(context, scores.dim_size(1) == num_boxes,
+              errors::InvalidArgument("scores has incompatible shape"));
+}
+
+static inline void ParseAndCheckCombinedNMSBoxSizes(OpKernelContext* context,
+                                                    const Tensor& boxes,
+                                                    int* num_boxes,
+                                                    const int num_classes) {
+  // The shape of 'boxes' is [batch_size, num_boxes, q, 4]
+  OP_REQUIRES(context, boxes.dims() == 4,
+              errors::InvalidArgument("boxes must be 4-D",
+                                      boxes.shape().DebugString()));
+
+  bool box_check = boxes.dim_size(2) == 1 || boxes.dim_size(2) == num_classes;
+  OP_REQUIRES(context, box_check,
+              errors::InvalidArgument(
+                  "third dimension of boxes must be either 1 or num classes"));
+  *num_boxes = boxes.dim_size(1);
+  OP_REQUIRES(context, boxes.dim_size(3) == 4,
+              errors::InvalidArgument("boxes must have 4 columns"));
+}
 // Return intersection-over-union overlap between boxes i and j
 template <typename T>
 static inline bool IOUGreaterThanThreshold(
@@ -195,6 +223,216 @@ void DoNonMaxSuppressionOp(
   std::copy_n(selected.begin(), selected.size(), output_indices_data.data());
 }
 
+void BatchedNonMaxSuppressionOp(
+    OpKernelContext* context, const Tensor& inp_boxes, const Tensor& inp_scores,
+    int num_boxes, const int max_size_per_class, const int total_size_per_batch,
+    const float score_threshold, const float iou_threshold,
+    bool pad_per_class = false) {
+  int q = inp_boxes.dim_size(2);
+  int num_classes = inp_scores.dim_size(2);
+  const int num_batches = inp_boxes.dim_size(0);
+
+  // Default clip window of [0, 0, 1, 1] if none specified
+  std::vector<float> clip_window{0, 0, 1, 1};
+
+  // [num_batches, per_batch_size * 4]
+  std::vector<std::vector<float>> nmsed_boxes(num_batches);
+  // [num_batches, per_batch_size]
+  std::vector<std::vector<float>> nmsed_scores(num_batches);
+  // [num_batches, per_batch_size]
+  std::vector<std::vector<float>> nmsed_classes(num_batches);
+  // [num_batches]
+  std::vector<int> final_valid_detections;
+
+  int per_batch_size = total_size_per_batch;
+
+  // perform non_max_suppression operation for each batch independently
+  for (int batch = 0; batch < num_batches; ++batch) {
+    // dims of per_batch_boxes [num_boxes, q, 4]
+    Tensor per_batch_boxes = inp_boxes.Slice(batch, batch + 1);
+    // dims of per_batch_scores [num_boxes, num_classes]
+    Tensor per_batch_scores = inp_scores.Slice(batch, batch + 1);
+
+    struct ResultCandidate {
+      int box_index;
+      float score;
+      int class_idx;
+      float box_coord[4];
+    };
+
+    std::vector<ResultCandidate> result_candidate_vec;
+
+    float* scores_data = per_batch_scores.unaligned_flat<float>().data();
+    float* boxes_data = per_batch_boxes.unaligned_flat<float>().data();
+
+    // Iterate through all classes
+    for (int class_idx = 0; class_idx < num_classes; ++class_idx) {
+      std::vector<float> class_scores_data;
+      class_scores_data.reserve(num_boxes);
+      std::vector<float> class_boxes_data;
+      class_boxes_data.reserve(num_boxes * 4);
+
+      for (int box = 0; box < num_boxes; ++box) {
+        // Get the scores per class
+        // class_scores_data dim is [num_boxes].
+        class_scores_data.push_back(scores_data[box * num_classes + class_idx]);
+        for (int cid = 0; cid < 4; ++cid) {
+          if (q > 1) {
+            // Get the boxes per class. class_boxes_data dims is [num_boxes, 4]
+            class_boxes_data.push_back(
+                boxes_data[(box * q + class_idx) * 4 + cid]);
+          } else {
+            class_boxes_data.push_back(boxes_data[box * 4 + cid]);
+          }
+        }
+      }
+
+      // Copy class_boxes_data to a tensor
+      TensorShape boxesShape({num_boxes, 4});
+      Tensor boxes(per_batch_boxes.dtype(), boxesShape);
+      std::copy_n(class_boxes_data.begin(), class_boxes_data.size(),
+                  boxes.unaligned_flat<float>().data());
+
+      const int size_per_class = std::min(max_size_per_class, num_boxes);
+      // Do NMS, get the candidate indices of form vector<int>
+      // Data structure for selection candidate in NMS.
+      struct Candidate {
+        int box_index;
+        float score;
+      };
+      auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
+        return bs_i.score > bs_j.score;
+      };
+      std::vector<Candidate> candidate_vector;
+      for (int i = 0; i < class_scores_data.size(); ++i) {
+        if (class_scores_data[i] > score_threshold) {
+          candidate_vector.emplace_back(Candidate({i, class_scores_data[i]}));
+        }
+      }
+
+      std::vector<int> selected;
+      std::vector<float> selected_boxes;
+      Candidate next_candidate;
+
+      std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
+      const Tensor const_boxes = boxes;
+      typename TTypes<float, 2>::ConstTensor boxes_data =
+          const_boxes.tensor<float, 2>();
+      int candidate_idx = 0;
+      while (selected.size() < size_per_class &&
+             candidate_idx < candidate_vector.size()) {
+        next_candidate = candidate_vector[candidate_idx++];
+
+        // Overlapping boxes are likely to have similar scores,
+        // therefore we iterate through the previously selected boxes backwards
+        // in order to see if `next_candidate` should be suppressed.
+        bool should_select = true;
+        for (int j = selected.size() - 1; j >= 0; --j) {
+          if (IOUGreaterThanThreshold(boxes_data, next_candidate.box_index,
+                                      selected[j], iou_threshold)) {
+            should_select = false;
+            break;
+          }
+        }
+
+        if (should_select) {
+          selected.push_back(next_candidate.box_index);
+          // Add the selected box to the result candidate. Sorted by score
+          int id = next_candidate.box_index;
+          ResultCandidate rc = {next_candidate.box_index,
+                                next_candidate.score,
+                                class_idx,
+                                {boxes_data(id, 0), boxes_data(id, 1),
+                                 boxes_data(id, 2), boxes_data(id, 3)}};
+          result_candidate_vec.push_back(rc);
+        }
+      }
+    }
+
+    auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) {
+      return rc_i.score > rc_j.score;
+    };
+    std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp);
+
+    int max_detections = 0;
+    // If pad_per_class is false, we always pad to max_total_size
+    if (!pad_per_class) {
+      max_detections =
+          std::min((int)result_candidate_vec.size(), total_size_per_batch);
+      per_batch_size = total_size_per_batch;
+    } else {
+      per_batch_size =
+          std::min(total_size_per_batch, max_size_per_class * num_classes);
+      max_detections =
+          std::min(per_batch_size, (int)result_candidate_vec.size());
+    }
+
+    final_valid_detections.push_back(max_detections);
+
+    int curr_total_size = max_detections;
+    int result_idx = 0;
+    // Pick the top max_detections values
+    while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) {
+      ResultCandidate next_candidate = result_candidate_vec[result_idx++];
+      // Add to final output vectors
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[0], clip_window[2]),
+                   clip_window[0]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[1], clip_window[3]),
+                   clip_window[1]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[2], clip_window[2]),
+                   clip_window[0]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[3], clip_window[3]),
+                   clip_window[1]));
+      nmsed_scores[batch].push_back(next_candidate.score);
+      nmsed_classes[batch].push_back(next_candidate.class_idx);
+      curr_total_size--;
+    }
+
+    nmsed_boxes[batch].resize(per_batch_size * 4, 0);
+    nmsed_scores[batch].resize(per_batch_size, 0);
+    nmsed_classes[batch].resize(per_batch_size, 0);
+  }
+
+  Tensor* nmsed_boxes_t = nullptr;
+  TensorShape boxes_shape({num_batches, per_batch_size, 4});
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, boxes_shape, &nmsed_boxes_t));
+  auto nmsed_boxes_flat = nmsed_boxes_t->template flat<float>();
+
+  Tensor* nmsed_scores_t = nullptr;
+  TensorShape scores_shape({num_batches, per_batch_size});
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(1, scores_shape, &nmsed_scores_t));
+  auto nmsed_scores_flat = nmsed_scores_t->template flat<float>();
+
+  Tensor* nmsed_classes_t = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(2, scores_shape, &nmsed_classes_t));
+  auto nmsed_classes_flat = nmsed_classes_t->template flat<float>();
+
+  Tensor* valid_detections_t = nullptr;
+  TensorShape valid_detections_shape({num_batches});
+  OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
+                                                   &valid_detections_t));
+  auto valid_detections_flat = valid_detections_t->template flat<int>();
+
+  for (int i = 0; i < num_batches; ++i) {
+    valid_detections_flat(i) = final_valid_detections[i];
+    for (int j = 0; j < per_batch_size; ++j) {
+      nmsed_scores_flat(i * per_batch_size + j) = nmsed_scores[i][j];
+      nmsed_classes_flat(i * per_batch_size + j) = nmsed_classes[i][j];
+      for (int k = 0; k < 4; ++k) {
+        nmsed_boxes_flat(i * per_batch_size * 4 + j * 4 + k) =
+            nmsed_boxes[i][j * 4 + k];
+      }
+    }
+  }
+}
+
 }  // namespace
 
 template <typename Device>
@@ -435,6 +673,74 @@ class NonMaxSuppressionWithOverlapsOp : public OpKernel {
   }
 };
 
+template <typename Device>
+class CombinedNonMaxSuppressionOp : public OpKernel {
+ public:
+  explicit CombinedNonMaxSuppressionOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("pad_per_class", &pad_per_class_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // boxes: [batch_size, num_anchors, q, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [batch_size, num_anchors, num_classes]
+    const Tensor& scores = context->input(1);
+    OP_REQUIRES(
+        context, (boxes.dim_size(0) == scores.dim_size(0)),
+        errors::InvalidArgument("boxes and scores must have same batch size"));
+
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_size_per_class must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    const int max_size_per_class = max_output_size.scalar<int>()();
+    // max_total_size: scalar
+    const Tensor& max_total_size = context->input(3);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_total_size.shape()),
+        errors::InvalidArgument("max_total_size must be 0-D, got shape ",
+                                max_total_size.shape().DebugString()));
+    const int max_total_size_per_batch = max_total_size.scalar<int>()();
+    OP_REQUIRES(context, max_total_size_per_batch > 0,
+                errors::InvalidArgument("max_total_size must be > 0"));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(4);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+
+    // score_threshold: scalar
+    const Tensor& score_threshold = context->input(5);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
+        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
+                                score_threshold.shape().DebugString()));
+    const float score_threshold_val = score_threshold.scalar<float>()();
+
+    OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
+                errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+    int num_boxes = 0;
+    const int num_classes = scores.dim_size(2);
+    ParseAndCheckCombinedNMSBoxSizes(context, boxes, &num_boxes, num_classes);
+    CheckCombinedNMSScoreSizes(context, num_boxes, scores);
+
+    if (!context->status().ok()) {
+      return;
+    }
+    BatchedNonMaxSuppressionOp(context, boxes, scores, num_boxes,
+                               max_size_per_class, max_total_size_per_batch,
+                               score_threshold_val, iou_threshold_val,
+                               pad_per_class_);
+  }
+
+ private:
+  bool pad_per_class_;
+};
+
 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
                         NonMaxSuppressionOp<CPUDevice>);
 
@@ -466,4 +772,7 @@ REGISTER_KERNEL_BUILDER(
     Name("NonMaxSuppressionWithOverlaps").Device(DEVICE_CPU),
     NonMaxSuppressionWithOverlapsOp<CPUDevice>);
 
+REGISTER_KERNEL_BUILDER(Name("CombinedNonMaxSuppression").Device(DEVICE_CPU),
+                        CombinedNonMaxSuppressionOp<CPUDevice>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index c321849f405f5ff966f530ce6ada1c8925ccf1d4..242e41b2652f6200b7d326f4845d14a58e61f9ea 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -861,4 +861,471 @@ TEST_F(NonMaxSuppressionWithOverlapsOpTest, TestEmptyInput) {
   test::ExpectTensorEqual<int>(expected, *GetOutput(0));
 }
 
+class CombinedNonMaxSuppressionOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(bool pad_per_class = false) {
+    TF_EXPECT_OK(NodeDefBuilder("combined_non_max_suppression_op",
+                                "CombinedNonMaxSuppression")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("pad_per_class", pad_per_class)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+};
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestEmptyInput) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({0, 0, 0, 4}), {});
+  AddInputFromArray<float>(TensorShape({0, 0, 0}), {});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<int>(TensorShape({}), {10});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({0, 10, 4}));
+  test::FillValues<float>(&expected_boxes, {});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({0, 10}));
+  test::FillValues<float>(&expected_scores, {});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({0, 10}));
+  test::FillValues<float>(&expected_classes, {});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({0}));
+  test::FillValues<int>(&expected_valid_d, {});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.3, 1, 0.4});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.3});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromThreeClustersWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.4f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromThreeClustersWithScoreThresholdZeroScores) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.1f, 0, 0, .3f, .2f, -5.0f});
+  // If we ask for more boxes than we actually expect to get back;
+  // should still only get 2 boxes back.
+  AddInputFromArray<int>(TensorShape({}), {4});
+  AddInputFromArray<int>(TensorShape({}), {5});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {-3.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 5, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {
+          0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      });
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 5}));
+  test::FillValues<float>(&expected_scores, {0.3, 0.1, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 5}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectSingleBox) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 1, 1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {1});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 1, 4}));
+  test::FillValues<float>(&expected_boxes, {0, 0, 1, 1});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 1}));
+  test::FillValues<float>(&expected_scores, {0.9});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 1}));
+  test::FillValues<float>(&expected_classes, {0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {1});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1}),
+      {.9f, .75f, .6f, .95f, .5f, .3f, .9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.4f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectFromTwoBatchesTwoClasses) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.01f, 0.1, 0.11f,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.75, 0.95, 0.9, 0.75});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {3, 3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThresholdPaddedTotalSize) {
+  MakeOp(true);
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {10});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThresholdPaddedPerClass) {
+  MakeOp(true);
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {2});
+  AddInputFromArray<int>(TensorShape({}), {50});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 4, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0, 0.95, 0.9, 0, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 0, 1, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesTotalSize) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  // Total size per batch is more than size per class
+  AddInputFromArray<int>(TensorShape({}), {5});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.1f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 5, 4}));
+  test::FillValues<float>(
+      &expected_boxes, {0,   0.11,  0.1, 0.2,   0,   0,     0.1, 0.1, 0, 0.01f,
+                        0.1, 0.11f, 0,   0.12f, 0.1, 0.21f, 0,   0.3, 1, 0.4,
+                        0,   0.21,  0.2, 0.3,   0,   0,     0.2, 0.2, 0, 0.02f,
+                        0.2, 0.22f, 0,   0.22f, 0.2, 0.31f, 0,   0.4, 1, 0.5});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 5}));
+  test::FillValues<float>(
+      &expected_scores, {0.95, 0.9, 0.75, 0.5, 0.3, 0.95, 0.9, 0.75, 0.5, 0.3});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 5}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 1, 0, 0, 1, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {5, 5});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesForBoxesAndScores) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 2, 4}),
+      // batch 0, box1 of class 1 should get selected
+      {0, 0, 0.1, 0.1, 0, 0, 0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, 0.6f, 0.1, 0.7f,
+       0, -0.01, 0.1, 0.09f, 0, -0.01, 0.1, 0.09f, 0, 0.11, 0.1, 0.2, 0, 0.11,
+       0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.12f, 0.1, 0.21f, 0, 0.3, 1, 0.4, 0,
+       0.3, 1, 0.4,
+       // batch 1, box1 of class 0 should get selected
+       0, 0, 0.2, 0.2, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, 0.02f, 0.2,
+       0.22f, 0, -0.02, 0.2, 0.19f, 0, -0.02, 0.2, 0.19f, 0, 0.21, 0.2, 0.3, 0,
+       0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.22f, 0.2, 0.31f, 0, 0.4, 1,
+       0.5, 0, 0.4, 1, 0.5});
+
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.6f,  0.1, 0.7f,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.8, 0.95, 0.9, 0.75});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 1, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {3, 3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index 5d607b90446b6095619472af139e178321701640..3c3836352e8cd334f171da9244079a8878a6126e 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -140,14 +140,13 @@ class OpsTestBase : public ::testing::Test {
     CHECK_GT(input_types_.size(), inputs_.size())
         << "Adding more inputs than types; perhaps you need to call MakeOp";
     ResourceMgr* rm = device_->resource_manager();
-    EXPECT_TRUE(
-        rm->Create(container == "" ? rm->default_container() : container, name,
-                   resource)
-            .ok());
+    std::string container_name =
+        container == "" ? rm->default_container() : container;
+    EXPECT_TRUE(rm->Create(container_name, name, resource).ok());
     TypeIndex type_index = MakeTypeIndex<T>();
     ResourceHandle handle;
     handle.set_device(device_->name());
-    handle.set_container(container);
+    handle.set_container(container_name);
     handle.set_name(name);
     handle.set_hash_code(type_index.hash_code());
     handle.set_maybe_type_name(type_index.name());
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 5645275cfa98eb820b7d1e885b18894bfab17e49..18ed1ea26ac8a63c4716bfdc7197641be522ea7c 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -158,7 +158,8 @@ REGISTER_PACK(string);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
-REGISTER_GPU(bool);
+TF_CALL_int16(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index 3b9133ed7e2c210aab3488d667f0c2e543207fcf..691430ebaff5a99ccb103c5f5a80263d15f24b6a 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -322,6 +322,7 @@ namespace functor {
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_int8(DECLARE_GPU_SPECS);
+TF_CALL_uint8(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
@@ -355,6 +356,7 @@ TF_CALL_int8(DECLARE_GPU_SPECS);
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+TF_CALL_uint8(REGISTER_GPU_KERNEL);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index 00ec44adc284099b3fed644d4742af8d07ae13e1..0cd8ef17ba2be995c719dccb5b3a104f9bd09f68 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -41,6 +41,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_int8(DEFINE_GPU_SPECS);
+TF_CALL_uint8(DEFINE_GPU_SPECS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 45ae45e4f4fc33da5d4b19d79b76e489209a667c..4abbe2fe3b78bfaa35a8c36b6aef9bb7b4a83fb5 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -242,13 +242,12 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
                   typename TTypes<T>::Flat output) {
     const auto config = GetCudaLaunchConfig(num_elements, d);
 
-    TruncatedNormalKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            gen, output.data(), num_batches, samples_per_batch, num_elements,
-            means.data(), means.dimension(0) == 1, stddevs.data(),
-            stddevs.dimension(0) == 1, minvals.data(),
-            minvals.dimension(0) == 1, maxvals.data(),
-            maxvals.dimension(0) == 1, kMaxIterations);
+    TF_CHECK_OK(CudaLaunchKernel(
+        TruncatedNormalKernel<T>, config.block_count, config.thread_per_block,
+        0, d.stream(), gen, output.data(), num_batches, samples_per_batch,
+        num_elements, means.data(), means.dimension(0) == 1, stddevs.data(),
+        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
+        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations));
   }
 };
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index cadb83d8cf934dba8bbf4c3706c6e5edff381b10..c5e201d12d3429380d4fa8bb4e6fb6d513b60059 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/kernels/partitioned_function_ops.h"
+
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,220 +34,219 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-namespace {
-// A `PartitionedCallOp` asynchronously executes a function, potentially across
-// multiple devices but within a single process. The kernel places and
-// partitions a given function's underlying graph, and executes each of the
-// partitioned subgraphs as a function.
-//
-// TODO(akshayka): Support distributed execution.
-class PartitionedCallOp : public AsyncOpKernel {
- public:
-  explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    string deprecated_config_serialized;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
-    string config_proto_serialized;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
+
+PartitionedCallOp::PartitionedCallOp(OpKernelConstruction* ctx)
+    : AsyncOpKernel(ctx),
+      func_(new NameAttrList),
+      config_proto_(new ConfigProto) {
+  OP_REQUIRES_OK(
+      ctx, ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, func_.get()));
+  string deprecated_config_serialized;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
+  string config_proto_serialized;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
+  OP_REQUIRES(
+      ctx,
+      deprecated_config_serialized.empty() || config_proto_serialized.empty(),
+      errors::InvalidArgument("Provided both 'config' and 'config_proto' but "
+                              "only one should be provided.  Note the "
+                              "'config' option is deprecated."));
+  if (!deprecated_config_serialized.empty()) {
+    OP_REQUIRES(ctx,
+                config_proto_->mutable_graph_options()
+                    ->mutable_rewrite_options()
+                    ->ParseFromString(deprecated_config_serialized),
+                errors::InvalidArgument("Unable to parse config string as "
+                                        "tensorflow::RewriteOptions proto."));
+  } else {
     OP_REQUIRES(
-        ctx,
-        deprecated_config_serialized.empty() || config_proto_serialized.empty(),
-        errors::InvalidArgument("Provided both 'config' and 'config_proto' but "
-                                "only one should be provided.  Note the "
-                                "'config' option is deprecated."));
-    if (!deprecated_config_serialized.empty()) {
-      OP_REQUIRES(ctx,
-                  config_proto_.mutable_graph_options()
-                      ->mutable_rewrite_options()
-                      ->ParseFromString(deprecated_config_serialized),
-                  errors::InvalidArgument("Unable to parse config string as "
-                                          "tensorflow::RewriteOptions proto."));
-    } else {
-      OP_REQUIRES(
-          ctx, config_proto_.ParseFromString(config_proto_serialized),
-          errors::InvalidArgument("Unable to parse config_proto string as "
-                                  "tensorflow::ConfigProto proto."));
-    }
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
+        ctx, config_proto_->ParseFromString(config_proto_serialized),
+        errors::InvalidArgument("Unable to parse config_proto string as "
+                                "tensorflow::ConfigProto proto."));
   }
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
+}
 
-  ~PartitionedCallOp() override {
-    for (const auto& it : handles_) {
-      Status status = it.first->ReleaseHandle(it.second);
-      if (!status.ok()) {
-        LOG(INFO) << "Ignoring error while destructing PartitionedCallOp: "
-                  << status.ToString();
-      }
+PartitionedCallOp::~PartitionedCallOp() {
+  for (const auto& it : handles_) {
+    Status status = it.first->ReleaseHandle(it.second);
+    if (!status.ok()) {
+      LOG(INFO) << "Ignoring error while destructing PartitionedCallOp: "
+                << status.ToString();
     }
   }
+}
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    FunctionLibraryRuntime* lib = ctx->function_library();
-    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
-                      errors::Internal("No function library is provided."),
-                      done);
+void PartitionedCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  FunctionLibraryRuntime* lib = ctx->function_library();
+  OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                    errors::Internal("No function library is provided."), done);
 
-    // The function body's graph is placed and partitioned the first time
-    // `ComputeAsync` is invoked; every subsequent invocation calls each
-    // of the function shards yielded by partitioning.
-    //
-    // The partitioning step yields a set of devices on which to run the
-    // function, and exactly one function shard is created for each device
-    // Inputs and outputs are pinned to the local device, for simplicity.
-    //
-    // TODO(akshayka): Support re-sharding the function on subsequent calls,
-    // via, e.g., virtual device annotations and a list of device names
-    // supplied through an attribute.
-    //
-    // TODO(akshayka): Add a fastpath for functions that execute on a single
-    // device.
-    FunctionLibraryRuntime::Handle handle;
-    // If we are instantiating the function, we can efficiently extract the
-    // inputs while instantiating. Else, we extract them separately below.
-    std::vector<Tensor> inputs;
-    bool inputs_extracted = false;
-    {
-      mutex_lock l(mu_);
-      auto it = handles_.find(lib);
-      if (it == handles_.end()) {
-        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, ctx, &inputs, &handle),
-                             done);
-        inputs_extracted = true;
-        handles_[lib] = handle;
-      } else {
-        handle = it->second;
-      }
+  // The function body's graph is placed and partitioned the first time
+  // `ComputeAsync` is invoked; every subsequent invocation calls each
+  // of the function shards yielded by partitioning.
+  //
+  // The partitioning step yields a set of devices on which to run the
+  // function, and exactly one function shard is created for each device
+  // Inputs and outputs are pinned to the local device, for simplicity.
+  //
+  // TODO(akshayka): Support re-sharding the function on subsequent calls,
+  // via, e.g., virtual device annotations and a list of device names
+  // supplied through an attribute.
+  //
+  // TODO(akshayka): Add a fastpath for functions that execute on a single
+  // device.
+  FunctionLibraryRuntime::Handle handle;
+  // If we are instantiating the function, we can efficiently extract the
+  // inputs while instantiating. Else, we extract them separately below.
+  std::vector<Tensor> inputs;
+  bool inputs_extracted = false;
+  {
+    mutex_lock l(mu_);
+    auto it = handles_.find(lib);
+    if (it == handles_.end()) {
+      OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, ctx, &inputs, &handle), done);
+      inputs_extracted = true;
+      handles_[lib] = handle;
+    } else {
+      handle = it->second;
     }
+  }
 
-    if (!inputs_extracted) {
-      OpInputList args;
-      OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
-      inputs.reserve(args.size());
-      for (const Tensor& tensor : args) {
-        inputs.push_back(tensor);
-      }
+  if (!inputs_extracted) {
+    OpInputList args;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
+    inputs.reserve(args.size());
+    for (const Tensor& tensor : args) {
+      inputs.push_back(tensor);
     }
-
-    RunFunction(handle, inputs, lib, ctx, done);
   }
 
- private:
-  Status FillOutputDevices(const FunctionLibraryRuntime& lib,
-                           const Device& cpu_device, AttrSlice attrs,
-                           FunctionLibraryRuntime::InstantiateOptions* opts) {
-    const FunctionLibraryDefinition* flib = lib.GetFunctionLibraryDefinition();
-    const FunctionDef* fdef = flib->Find(func_.name());
-    if (fdef == nullptr) {
-      return errors::NotFound("Failed for find definiton for function \"",
-                              func_.name(), "\"");
-    }
+  RunFunction(handle, inputs, lib, ctx, done);
+}
 
-    bool is_type_list;
-    for (const OpDef::ArgDef& ret_def : fdef->signature().output_arg()) {
-      DataTypeVector dtypes;
-      TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
-      for (DataType dtype : dtypes) {
-        if (MTypeFromDType(dtype) == HOST_MEMORY) {
-          opts->output_devices.push_back(cpu_device.name());
-        } else {
-          opts->output_devices.push_back(opts->target);
-        }
+Status PartitionedCallOp::FillOutputDevices(
+    const FunctionLibraryRuntime& lib, const Device& cpu_device,
+    AttrSlice attrs, FunctionLibraryRuntime::InstantiateOptions* opts) {
+  const FunctionLibraryDefinition* flib = lib.GetFunctionLibraryDefinition();
+  const FunctionDef* fdef = flib->Find(func_->name());
+  if (fdef == nullptr) {
+    return errors::NotFound("Failed for find definition for function \"",
+                            func_->name(), "\"");
+  }
+
+  bool is_type_list;
+  for (const OpDef::ArgDef& ret_def : fdef->signature().output_arg()) {
+    DataTypeVector dtypes;
+    TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
+    for (DataType dtype : dtypes) {
+      if (MTypeFromDType(dtype) == HOST_MEMORY) {
+        opts->output_devices.push_back(cpu_device.name());
+      } else {
+        opts->output_devices.push_back(opts->target);
       }
     }
-    return Status::OK();
   }
+  return Status::OK();
+}
 
-  Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
-                     std::vector<Tensor>* inputs,
-                     FunctionLibraryRuntime::Handle* handle) {
-    FunctionLibraryRuntime::InstantiateOptions opts;
-    opts.target = lib->device()->name();
-    opts.is_multi_device_function = true;
-    opts.optimize_graph_fn =
-        std::bind(grappler::OptimizeGraph, std::placeholders::_1,
-                  std::placeholders::_2, std::placeholders::_3,
-                  std::placeholders::_4, config_proto_, std::placeholders::_5);
-    opts.graph_collector = ctx->graph_collector();
-    opts.executor_type = executor_type_;
+Status PartitionedCallOp::Instantiate(FunctionLibraryRuntime* lib,
+                                      OpKernelContext* ctx,
+                                      std::vector<Tensor>* inputs,
+                                      FunctionLibraryRuntime::Handle* handle) {
+  grappler::GrapplerItem::OptimizationOptions optimization_options;
 
-    OpInputList args;
-    TF_RETURN_IF_ERROR(ctx->input_list("args", &args));
-    Device* cpu_device;
-    TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
+  // Tensorflow 2.0 in eager mode with automatic control dependencies will
+  // prune all nodes that are not in the transitive fanin of the fetch nodes.
+  // However because the function will be executed via FunctionLibraryRuntime,
+  // and current function implementation does not prune stateful and dataset
+  // ops, we rely on Grappler to do the correct graph pruning.
+  optimization_options.allow_pruning_stateful_and_dataset_ops = true;
 
-    inputs->reserve(args.size());
-    for (const Tensor& tensor : args) {
-      inputs->push_back(tensor);
-      DataType dtype = tensor.dtype();
-      if (dtype == DT_RESOURCE) {
-        const ResourceHandle& handle = tensor.flat<ResourceHandle>()(0);
-        opts.input_devices.push_back(handle.device());
-      } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
-        opts.input_devices.push_back(cpu_device->name());
-      } else {
-        opts.input_devices.push_back(opts.target);
-      }
-    }
+  // All the nested function calls will be executed and optimized via
+  // PartitionedCallOp, there is no need to optimize functions now.
+  optimization_options.optimize_function_library = false;
 
-    TF_RETURN_IF_ERROR(
-        FillOutputDevices(*lib, *cpu_device, AttrSlice(&func_.attr()), &opts));
+  FunctionLibraryRuntime::InstantiateOptions opts;
+  // In some contexts like running the graph to evaluate constants,
+  // the FLR won't have any device.
+  opts.target = lib->device() == nullptr ? "" : lib->device()->name();
+  opts.is_multi_device_function = true;
+  opts.optimize_graph_fn =
+      std::bind(grappler::OptimizeGraph, std::placeholders::_1,
+                std::placeholders::_2, std::placeholders::_3,
+                std::placeholders::_4, std::placeholders::_5, *config_proto_,
+                func_->name(), optimization_options, std::placeholders::_6);
+  opts.graph_collector = ctx->graph_collector();
+  opts.executor_type = executor_type_;
 
-    TF_RETURN_IF_ERROR(
-        lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts, handle));
-    return Status::OK();
+  OpInputList args;
+  TF_RETURN_IF_ERROR(ctx->input_list("args", &args));
+  Device* cpu_device;
+  TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
+
+  inputs->reserve(args.size());
+  for (const Tensor& tensor : args) {
+    inputs->push_back(tensor);
+    DataType dtype = tensor.dtype();
+    if (dtype == DT_RESOURCE) {
+      const ResourceHandle& handle = tensor.flat<ResourceHandle>()(0);
+      opts.input_devices.push_back(handle.device());
+    } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
+      opts.input_devices.push_back(cpu_device->name());
+    } else {
+      opts.input_devices.push_back(opts.target);
+    }
   }
 
-  void RunFunction(FunctionLibraryRuntime::Handle handle,
-                   const std::vector<Tensor>& inputs,
-                   FunctionLibraryRuntime* lib, OpKernelContext* ctx,
-                   DoneCallback done) {
-    FunctionLibraryRuntime::Options run_opts;
-    run_opts.step_id = ctx->step_id();
-    run_opts.step_container = ctx->step_container();
-    run_opts.cancellation_manager = ctx->cancellation_manager();
-    run_opts.stats_collector = ctx->stats_collector();
-    run_opts.collective_executor = ctx->collective_executor();
-    // TODO(akshayka): Consider selecting a runner on a per-device basis,
-    // i.e., using device-specific threadpools when available.
-    run_opts.runner = ctx->runner();
-    run_opts.source_device = lib->device()->name();
-    run_opts.allow_dead_tensors = true;
-    // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
-    // constructed rendezvous to a rendezvous manager.
-    Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
-    run_opts.rendezvous = rendez;
+  TF_RETURN_IF_ERROR(
+      FillOutputDevices(*lib, *cpu_device, AttrSlice(&func_->attr()), &opts));
 
-    std::vector<Tensor>* rets = new std::vector<Tensor>;
-    const string& func_name = func_.name();
-    lib->Run(run_opts, handle, inputs, rets,
-             [rets, rendez, done, ctx, func_name](const Status& status) {
-               if (!status.ok()) {
-                 const string function_and_msg =
-                     strings::StrCat(errors::FormatFunctionForError(func_name),
-                                     " ", status.error_message());
-                 ctx->SetStatus(Status(status.code(), function_and_msg));
-               } else {
-                 for (int i = 0; i < rets->size(); ++i) {
-                   ctx->set_output(i, (*rets)[i]);
-                 }
-               }
-               delete rets;
-               rendez->Unref();
-               done();
-             });
-  }
+  TF_RETURN_IF_ERROR(
+      lib->Instantiate(func_->name(), AttrSlice(&func_->attr()), opts, handle));
+  return Status::OK();
+}
+
+void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle,
+                                    const std::vector<Tensor>& inputs,
+                                    FunctionLibraryRuntime* lib,
+                                    OpKernelContext* ctx, DoneCallback done) {
+  FunctionLibraryRuntime::Options run_opts;
+  run_opts.step_id = ctx->step_id();
+  run_opts.step_container = ctx->step_container();
+  run_opts.cancellation_manager = ctx->cancellation_manager();
+  run_opts.stats_collector = ctx->stats_collector();
+  run_opts.collective_executor = ctx->collective_executor();
+  // TODO(akshayka): Consider selecting a runner on a per-device basis,
+  // i.e., using device-specific threadpools when available.
+  run_opts.runner = ctx->runner();
+  run_opts.source_device =
+      lib->device() == nullptr ? "" : lib->device()->name();
+  run_opts.allow_dead_tensors = true;
+  // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
+  // constructed rendezvous to a rendezvous manager.
+  Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
+  run_opts.rendezvous = rendez;
 
-  NameAttrList func_;
-  ConfigProto config_proto_;
-  string executor_type_;
-  mutex mu_;
-  // Cache the handle per FLR because this kernel may be instantiated for
-  // a stateful op, different invocations of it may use different FLRs.
-  // Different device placements of PartitionedCallOp also use
-  // different FLRs.
-  gtl::FlatMap<FunctionLibraryRuntime*, FunctionLibraryRuntime::Handle> handles_
-      GUARDED_BY(mu_);
-};
+  std::vector<Tensor>* rets = new std::vector<Tensor>;
+  const string& func_name = func_->name();
+  lib->Run(run_opts, handle, inputs, rets,
+           [rets, rendez, done, ctx, func_name](const Status& status) {
+             if (!status.ok()) {
+               const string function_and_msg =
+                   strings::StrCat(errors::FormatFunctionForError(func_name),
+                                   " ", status.error_message());
+               ctx->SetStatus(Status(status.code(), function_and_msg));
+             } else {
+               for (int i = 0; i < rets->size(); ++i) {
+                 ctx->set_output(i, (*rets)[i]);
+               }
+             }
+             delete rets;
+             rendez->Unref();
+             done();
+           });
+}
 
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
                         PartitionedCallOp);
@@ -260,5 +263,4 @@ REGISTER_KERNEL_BUILDER(Name("StatefulPartitionedCall").Device(DEVICE_SYCL),
                         PartitionedCallOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/partitioned_function_ops.h b/tensorflow/core/kernels/partitioned_function_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..776ebab96958d58cfd8f03296583fe1018269a6a
--- /dev/null
+++ b/tensorflow/core/kernels/partitioned_function_ops.h
@@ -0,0 +1,72 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+class NameAttrList;
+class ConfigProto;
+
+// A `PartitionedCallOp` asynchronously executes a function, potentially across
+// multiple devices but within a single process. The kernel places and
+// partitions a given function's underlying graph, and executes each of the
+// partitioned subgraphs as a function.
+//
+// TODO(akshayka): Support distributed execution.
+class PartitionedCallOp : public AsyncOpKernel {
+ public:
+  explicit PartitionedCallOp(OpKernelConstruction* ctx);
+
+  ~PartitionedCallOp() override;
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  Status FillOutputDevices(const FunctionLibraryRuntime& lib,
+                           const Device& cpu_device, AttrSlice attrs,
+                           FunctionLibraryRuntime::InstantiateOptions* opts);
+
+  Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                     std::vector<Tensor>* inputs,
+                     FunctionLibraryRuntime::Handle* handle);
+
+  void RunFunction(FunctionLibraryRuntime::Handle handle,
+                   const std::vector<Tensor>& inputs,
+                   FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                   DoneCallback done);
+
+  // Using unique pointers to avoid including proto headers in kernel headers
+  std::unique_ptr<NameAttrList> func_;
+  std::unique_ptr<ConfigProto> config_proto_;
+  string executor_type_;
+  mutex mu_;
+  // Cache the handle per FLR because this kernel may be instantiated for
+  // a stateful op, different invocations of it may use different FLRs.
+  // Different device placements of PartitionedCallOp also use
+  // different FLRs.
+  gtl::FlatMap<FunctionLibraryRuntime*, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_PARTITIONED_FUNCTION_OPS_H_
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 69122f467c8fcf3818ab69f3f96d00b9a6b3c245..903cf9313a22cdc6937cdae53afb7063101400f8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -215,7 +215,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
       // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
       // emulated via NHWC.
       data_layout = se::dnn::DataLayout::kBatchYXDepth;
-      batch_size *= depth;
+      batch_size *= depth / 4;
       depth = 4;
       break;
     default:
diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
index 27a687ba409fcc359e7fb3c6be2b4917b40fe60e..b9a7da56872993a5909b2a090f1b101a15587332 100644
--- a/tensorflow/core/kernels/population_count_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
@@ -63,17 +63,17 @@ __global__ void PopulationCountKernel<int64>(const int size, const int64* input,
   CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }
 }
 
-#define DEFINE_GPU_SPECS(T)                                               \
-  template <>                                                             \
-  void PopulationCount<GPUDevice, T>::operator()(                         \
-      OpKernelContext* c, typename TTypes<T>::ConstFlat input,            \
-      TTypes<uint8>::Flat output) {                                       \
-    const GPUDevice& d = c->eigen_device<GPUDevice>();                    \
-    int64 total_count = input.size();                                     \
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);        \
-    PopulationCountKernel<T>                                              \
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \
-            total_count, input.data(), output.data());                    \
+#define DEFINE_GPU_SPECS(T)                                                    \
+  template <>                                                                  \
+  void PopulationCount<GPUDevice, T>::operator()(                              \
+      OpKernelContext* c, typename TTypes<T>::ConstFlat input,                 \
+      TTypes<uint8>::Flat output) {                                            \
+    const GPUDevice& d = c->eigen_device<GPUDevice>();                         \
+    int64 total_count = input.size();                                          \
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);             \
+    TF_CHECK_OK(CudaLaunchKernel(PopulationCountKernel<T>, config.block_count, \
+                                 config.thread_per_block, 0, d.stream(),       \
+                                 total_count, input.data(), output.data()));   \
   }
 
 TF_CALL_uint8(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index b03ac8e87dac8fabe0d45d8685ec4fa5fd642519..ff4e7be1622af8bfd2e19aaff5e1ff3677875f3c 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -246,4 +246,16 @@ REGISTER_QUANTIZED_CONCAT(qint32);
 
 #undef REGISTER_QUANTIZED_CONCAT
 
+#ifdef INTEL_MKL
+#define REGISTER_QUANTIZED_CONCATV2(type)                \
+  REGISTER_KERNEL_BUILDER(Name("QuantizedConcatV2")      \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
+                          QuantizedConcatOp<type>)
+
+REGISTER_QUANTIZED_CONCATV2(quint8);
+REGISTER_QUANTIZED_CONCATV2(qint32);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
index 9a1dcd0d496e45977704f49c10fba1048effc943..59e33d1ac3147728a08a2f5555a5c1b099393959 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
@@ -45,10 +45,11 @@ struct InterpolationCache {
   std::vector<T_SCALE> ilerp;
 };
 
-template <typename T_SCALE>
+template <typename T_SCALE, typename Scaler>
 inline void ComputeInterpolationWeights(
     const int64 out_size, const int64 in_size, const float scale,
     const int resolution, InterpolationCache<T_SCALE>* interpolation) {
+  const Scaler scaler;
   interpolation->lower.resize(out_size + 1);
   interpolation->upper.resize(out_size + 1);
   interpolation->lerp.resize(out_size + 1);
@@ -57,26 +58,31 @@ inline void ComputeInterpolationWeights(
   interpolation->lower[out_size] = 0;
   interpolation->upper[out_size] = 0;
   for (int64 i = out_size - 1; i >= 0; --i) {
-    const float in = i * scale;
-    interpolation->lower[i] = static_cast<int64>(in);
+    const float in = scaler(i, scale);
+    const float in_f = std::floor(in);
+    interpolation->lower[i] =
+        std::max(static_cast<int64>(in_f), static_cast<int64>(0));
     interpolation->upper[i] =
-        std::min(interpolation->lower[i] + 1, in_size - 1);
-    interpolation->lerp[i] = in - interpolation->lower[i];
-    interpolation->ilerp[i] = static_cast<T_SCALE>(
-        (in - interpolation->lower[i]) * (1 << resolution));
+        std::min(static_cast<int64>(std::ceil(in)), in_size - 1);
+    interpolation->lerp[i] = in - in_f;
+    interpolation->ilerp[i] =
+        static_cast<T_SCALE>((in - in_f) * (1 << resolution));
   }
 }
 
 template <typename T_SCALE>
-inline InterpolationCache<T_SCALE> BuildLerpCache(const int64 out_size,
-                                                  const int64 in_size,
-                                                  const float scale,
-                                                  const int index_step,
-                                                  const int resolution) {
+inline InterpolationCache<T_SCALE> BuildLerpCache(
+    const int64 out_size, const int64 in_size, const float scale,
+    const int index_step, const int resolution, const bool half_pixel_centers) {
   InterpolationCache<T_SCALE> cache;
   // Compute the cached interpolation weights on the x and y dimensions.
-  ComputeInterpolationWeights<T_SCALE>(out_size, in_size, scale, resolution,
-                                       &cache);
+  if (half_pixel_centers) {
+    ComputeInterpolationWeights<T_SCALE, HalfPixelScaler>(
+        out_size, in_size, scale, resolution, &cache);
+  } else {
+    ComputeInterpolationWeights<T_SCALE, LegacyScaler>(out_size, in_size, scale,
+                                                       resolution, &cache);
+  }
   CHECK(index_step > 0);
   if (index_step > 1) {
     for (int i = 0; i < cache.lower.size(); ++i) {
@@ -464,13 +470,14 @@ void ResizeImageReference(typename TTypes<T, 4>::ConstTensor images,
                           const int64 out_width, const int channels,
                           const float height_scale, const float width_scale,
                           const float in_min, const float in_max,
+                          const bool half_pixel_centers,
                           typename TTypes<T, 4>::Tensor* output) {
   CHECK_NOTNULL(output);
 
-  const InterpolationCache<float> xs =
-      BuildLerpCache<float>(out_width, in_width, width_scale, channels, 0);
-  const InterpolationCache<float> ys =
-      BuildLerpCache<float>(out_height, in_height, height_scale, 1, 0);
+  const InterpolationCache<float> xs = BuildLerpCache<float>(
+      out_width, in_width, width_scale, channels, 0, half_pixel_centers);
+  const InterpolationCache<float> ys = BuildLerpCache<float>(
+      out_height, in_height, height_scale, 1, 0, half_pixel_centers);
 
   const int64 in_row_size = in_width * channels;
   const int64 in_batch_num_values = in_height * in_row_size;
@@ -512,10 +519,11 @@ void ResizeImage(typename TTypes<T, 4>::ConstTensor images,
                  const int64 out_width, const int channels,
                  const float height_scale, const float width_scale,
                  const float in_min, const float in_max,
+                 const bool half_pixel_centers,
                  typename TTypes<T, 4>::Tensor* output) {
   ResizeImageReference<T>(images, batch_size, in_height, in_width, out_height,
                           out_width, channels, height_scale, width_scale,
-                          in_min, in_max, output);
+                          in_min, in_max, half_pixel_centers, output);
 }
 
 template <>
@@ -525,6 +533,7 @@ void ResizeImage<qint32>(typename TTypes<qint32, 4>::ConstTensor images,
                          const int64 out_width, const int channels,
                          const float height_scale, const float width_scale,
                          const float in_min, const float in_max,
+                         const bool half_pixel_centers,
                          typename TTypes<qint32, 4>::Tensor* output) {
   // 30 is maximum resolution for signed int.
   constexpr int RESOLUTION = 30;
@@ -532,10 +541,11 @@ void ResizeImage<qint32>(typename TTypes<qint32, 4>::ConstTensor images,
 
   CHECK_NOTNULL(output);
 
-  const InterpolationCache<int32> xs = BuildLerpCache<int32>(
-      out_width, in_width, width_scale, channels, RESOLUTION);
-  const InterpolationCache<int32> ys =
-      BuildLerpCache<int32>(out_height, in_height, height_scale, 1, RESOLUTION);
+  const InterpolationCache<int32> xs =
+      BuildLerpCache<int32>(out_width, in_width, width_scale, channels,
+                            RESOLUTION, half_pixel_centers);
+  const InterpolationCache<int32> ys = BuildLerpCache<int32>(
+      out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);
 
   const int64 in_row_size = in_width * channels;
   const int64 in_batch_num_values = in_height * in_row_size;
@@ -586,6 +596,7 @@ void ResizeImage<quint8>(typename TTypes<quint8, 4>::ConstTensor images,
                          const int64 out_width, const int channels,
                          const float height_scale, const float width_scale,
                          const float in_min, const float in_max,
+                         const bool half_pixel_centers,
                          typename TTypes<quint8, 4>::Tensor* output) {
   // 7 is maximum resolution for unsigned byte.
   constexpr int RESOLUTION = 7;
@@ -593,10 +604,11 @@ void ResizeImage<quint8>(typename TTypes<quint8, 4>::ConstTensor images,
 
   CHECK_NOTNULL(output);
 
-  const InterpolationCache<int16> xs = BuildLerpCache<int16>(
-      out_width, in_width, width_scale, channels, RESOLUTION);
-  const InterpolationCache<int16> ys =
-      BuildLerpCache<int16>(out_height, in_height, height_scale, 1, RESOLUTION);
+  const InterpolationCache<int16> xs =
+      BuildLerpCache<int16>(out_width, in_width, width_scale, channels,
+                            RESOLUTION, half_pixel_centers);
+  const InterpolationCache<int16> ys = BuildLerpCache<int16>(
+      out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);
 
   const int64 in_row_size = in_width * channels;
   const int64 in_batch_num_values = in_height * in_row_size;
@@ -646,6 +658,7 @@ template <typename T>
 void ResizeBilinear(const typename TTypes<T, 4>::ConstTensor& images,
                     const float height_scale, const float width_scale,
                     const float in_min, const float in_max,
+                    const bool half_pixel_centers,
                     typename TTypes<T, 4>::Tensor* output) {
   CHECK_NOTNULL(output);
 
@@ -666,11 +679,11 @@ void ResizeBilinear(const typename TTypes<T, 4>::ConstTensor& images,
   if (USE_REFERENCE) {
     ResizeImageReference<T>(images, batch_size, in_height, in_width, out_height,
                             out_width, channels, height_scale, width_scale,
-                            in_min, in_max, output);
+                            in_min, in_max, half_pixel_centers, output);
   } else {
     ResizeImage<T>(images, batch_size, in_height, in_width, out_height,
                    out_width, channels, height_scale, width_scale, in_min,
-                   in_max, output);
+                   in_max, half_pixel_centers, output);
   }
 }
 
@@ -682,6 +695,8 @@ class QuantizedResizeBilinearOp : public OpKernel {
   explicit QuantizedResizeBilinearOp(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -689,7 +704,7 @@ class QuantizedResizeBilinearOp : public OpKernel {
     const float in_min = context->input(2).flat<float>()(0);
     const float in_max = context->input(3).flat<float>()(0);
 
-    ImageResizerState st(align_corners_);
+    ImageResizerState st(align_corners_, false);
     st.ValidateAndCreateOutput(context, input);
 
     if (!context->status().ok()) return;
@@ -701,7 +716,7 @@ class QuantizedResizeBilinearOp : public OpKernel {
     typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
 
     ResizeBilinear<T>(image_data, st.height_scale, st.width_scale, in_min,
-                      in_max, &output_data);
+                      in_max, half_pixel_centers_, &output_data);
     Tensor* out_min = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(1, {}, &out_min));
     out_min->flat<float>()(0) = in_min;
@@ -713,6 +728,7 @@ class QuantizedResizeBilinearOp : public OpKernel {
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(QuantizedResizeBilinearOp<T>);
 };
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index 6fc489459231695a685346e3f728dd0a1e2202f2..a7931cb4510638b70a2c6e39c39ddd0639780d0e 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -65,15 +65,16 @@ float CalculateResizeScale(int64 in_size, int64 out_size, bool align_corners) {
              : in_size / static_cast<float>(out_size);
 }
 
-inline std::tuple<int64, int64, float> GetReferenceWeight(const int64 out_size,
-                                                          const int64 in_size,
-                                                          const int step,
-                                                          const int index,
-                                                          const float scale) {
-  const float in = index * scale;
-  const int64 lower = static_cast<int64>(in);
-  const int64 upper = std::min(lower + 1, in_size - 1);
-  return std::make_tuple(lower * step, upper * step, in - lower);
+inline std::tuple<int64, int64, float> GetReferenceWeight(
+    const bool half_pixel_centers, const int64 out_size, const int64 in_size,
+    const int step, const int index, const float scale) {
+  const float in = half_pixel_centers
+                       ? (static_cast<float>(index) + 0.5f) * scale - 0.5f
+                       : index * scale;
+  const float in_f = std::floor(in);
+  const int64 lower = std::max(static_cast<int64>(in_f), static_cast<int64>(0));
+  const int64 upper = std::min(static_cast<int64>(std::ceil(in)), in_size - 1);
+  return std::make_tuple(lower * step, upper * step, in - in_f);
 }
 
 template <typename T>
@@ -105,17 +106,17 @@ float ComputeLerpReference<float>(const float in_top_left,
 }
 
 template <typename T>
-T CalcReferenceResizedVal(const T* image_data, const int batch_size,
-                          const int64 in_height, const int64 in_width,
-                          const int64 out_height, const int64 out_width,
-                          const int channels, const float height_scale,
-                          const float width_scale, const float min,
-                          const float max, const int b, const int64 x,
-                          const int64 y, const int c) {
-  const std::tuple<int64, int64, float> x_weight =
-      GetReferenceWeight(out_width, in_width, channels, x, width_scale);
-  const std::tuple<int64, int64, float> y_weight =
-      GetReferenceWeight(out_height, in_height, 1, y, height_scale);
+T CalcReferenceResizedVal(const T* image_data, const bool half_pixel_centers,
+                          const int batch_size, const int64 in_height,
+                          const int64 in_width, const int64 out_height,
+                          const int64 out_width, const int channels,
+                          const float height_scale, const float width_scale,
+                          const float min, const float max, const int b,
+                          const int64 x, const int64 y, const int c) {
+  const std::tuple<int64, int64, float> x_weight = GetReferenceWeight(
+      half_pixel_centers, out_width, in_width, channels, x, width_scale);
+  const std::tuple<int64, int64, float> y_weight = GetReferenceWeight(
+      half_pixel_centers, out_height, in_height, 1, y, height_scale);
 
   const int64 in_row_size = in_width * channels;
   const int64 in_batch_num_values = in_height * in_row_size;
@@ -144,7 +145,8 @@ void CheckTensorValue(const T* in_data, const T* out_data, const int batch_size,
                       const int64 in_height, const int64 in_width,
                       const int64 out_height, const int64 out_width,
                       const int channels, const bool align_corners,
-                      const float min, const float max, const float tolerance,
+                      const bool half_pixel_centers, const float min,
+                      const float max, const float tolerance,
                       const bool relative) {
   const int64 out_row_size = out_width * channels;
   const float height_scale =
@@ -157,8 +159,9 @@ void CheckTensorValue(const T* in_data, const T* out_data, const int batch_size,
       for (int64 x = 0; x < out_width; ++x) {
         for (int c = 0; c < channels; ++c) {
           const T ref_qval = CalcReferenceResizedVal<T>(
-              in_data, batch_size, in_height, in_width, out_height, out_width,
-              channels, height_scale, width_scale, min, max, b, x, y, c);
+              in_data, half_pixel_centers, batch_size, in_height, in_width,
+              out_height, out_width, channels, height_scale, width_scale, min,
+              max, b, x, y, c);
           const T qval =
               out_data[(b * out_height + y) * out_row_size + x * channels + c];
           const float ref_val = QuantizedToFloat<T>(ref_qval, min, max);
@@ -186,6 +189,7 @@ void TestResizeBilinear(const Tensor& image_tensor, const DataType dt,
                         const Input::Initializer& new_size,
                         const bool show_time, const int64 iterations,
                         const float min, const float max,
+                        const bool half_pixel_centers,
                         std::vector<Tensor>* outputs) {
   Scope root = Scope::NewRootScope();
 
@@ -195,7 +199,8 @@ void TestResizeBilinear(const Tensor& image_tensor, const DataType dt,
   Output in_max = ops::Const<float>(root.WithOpName("max"), max);
 
   ops::QuantizedResizeBilinear qrb = ops::QuantizedResizeBilinear(
-      root.WithOpName("qrb"), placeholder, size, in_min, in_max);
+      root.WithOpName("qrb"), placeholder, size, in_min, in_max,
+      ops::QuantizedResizeBilinear::HalfPixelCenters(half_pixel_centers));
 
   TF_EXPECT_OK(root.status());
 
@@ -247,7 +252,7 @@ void TestResizeBilinearOneDim() {
 
   std::vector<Tensor> outputs;
   TestResizeBilinear(image_quantized_tensor, DT_QINT32, {1, OUT_WIDTH}, false,
-                     1, MIN, MAX, &outputs);
+                     1, MIN, MAX, false, &outputs);
   ASSERT_EQ(3, outputs.size());
   ASSERT_EQ(OUT_WIDTH, outputs.at(0).NumElements());
   ASSERT_EQ(4, outputs.at(0).shape().dims());
@@ -282,13 +287,16 @@ void TestResizeBilinearOneDim() {
                            /*out_height=*/OUT_WIDTH,
                            /*out_width=*/1,
                            /*channels=*/1,
-                           /*align_corners=*/false, MIN, MAX, TOLERANCE, true);
+                           /*align_corners=*/false,
+                           /*half_pixel_centers=*/false, MIN, MAX, TOLERANCE,
+                           true);
 }
 
 template <typename T>
 void RunTestResizeBilinearTwoDims(int batch_size, int in_height, int in_width,
                                   int out_height, int out_width, int channels,
-                                  float tolerance, bool relative) {
+                                  float tolerance, bool relative,
+                                  const bool half_pixel_centers) {
   constexpr float RATIO = 100.0f;
   const float min = 0.0f;
   const float max = batch_size * in_height * in_width * channels / RATIO;
@@ -298,18 +306,21 @@ void RunTestResizeBilinearTwoDims(int batch_size, int in_height, int in_width,
 
   std::vector<Tensor> outputs;
   TestResizeBilinear(image_quantized_tensor, DataTypeToEnum<T>::value,
-                     {out_height, out_width}, false, 1, min, max, &outputs);
-  CheckTensorValue<T>(image_quantized_tensor.flat<T>().data(),
-                      outputs.at(0).flat<T>().data(), batch_size, in_height,
-                      in_width, out_height, out_width, channels,
-                      /*align_corners=*/false, min, max, tolerance, relative);
+                     {out_height, out_width}, false, 1, min, max,
+                     half_pixel_centers, &outputs);
+  CheckTensorValue<T>(
+      image_quantized_tensor.flat<T>().data(), outputs.at(0).flat<T>().data(),
+      batch_size, in_height, in_width, out_height, out_width, channels,
+      /*align_corners=*/false,
+      /*half_pixel_centers=*/half_pixel_centers, min, max, tolerance, relative);
 }
 
 template <typename T>
 void RunBenchmarkResizeBilinearTwoDims(int batch_size, int in_height,
                                        int in_width, int out_height,
                                        int out_width, int channels,
-                                       int iteration) {
+                                       int iteration,
+                                       const bool half_pixel_centers) {
   constexpr float RATIO = 100.0f;
   const float min = 0.0f;
   const float max = batch_size * in_height * in_width * channels / RATIO;
@@ -319,47 +330,57 @@ void RunBenchmarkResizeBilinearTwoDims(int batch_size, int in_height,
 
   std::vector<Tensor> outputs;
   TestResizeBilinear(image_quantized_tensor, DataTypeToEnum<T>::value,
-                     {out_height, out_width}, true, iteration, min, max,
+                     {out_height, out_width}, true, iteration, min, max, false,
                      &outputs);
 }
 
 template <typename T>
-void TestResizeBilinearTwoDimsType(const float tolerance, const bool relative) {
-  RunTestResizeBilinearTwoDims<T>(1, 1, 1, 1, 1, 1, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 1, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 128, 1, 256, 1, 1, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 1, tolerance,
-                                  relative);
-  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 1, tolerance,
-                                  relative);
-  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 2, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 128, 1, 256, 1, 2, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 2, tolerance,
-                                  relative);
-  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 2, tolerance,
-                                  relative);
-  RunTestResizeBilinearTwoDims<T>(1, 1, 16, 1, 32, 3, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 3, tolerance, relative);
-  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 3, tolerance,
-                                  relative);
-  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 3, tolerance,
-                                  relative);
+void TestResizeBilinearTwoDimsType(const float tolerance, const bool relative,
+                                   const bool half_pixel_centers) {
+  RunTestResizeBilinearTwoDims<T>(1, 1, 1, 1, 1, 1, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 1, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 128, 1, 256, 1, 1, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 1, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 1, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 2, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 128, 1, 256, 1, 2, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 2, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 2, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 1, 16, 1, 32, 3, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 1, 128, 1, 256, 3, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 3, tolerance, relative,
+                                  half_pixel_centers);
+  RunTestResizeBilinearTwoDims<T>(1, 256, 256, 128, 128, 3, tolerance, relative,
+                                  half_pixel_centers);
 }
 
 void TestResizeBilinearTwoDims() {
-  TestResizeBilinearTwoDimsType<quint8>(1.0f, false);
-  TestResizeBilinearTwoDimsType<qint32>(1.0e-5, true);
-  TestResizeBilinearTwoDimsType<float>(1.0e-5, true);
+  for (const bool half_pixel_centers : {false, true}) {
+    TestResizeBilinearTwoDimsType<quint8>(1.0f, false, half_pixel_centers);
+    TestResizeBilinearTwoDimsType<qint32>(1.0e-5, true, half_pixel_centers);
+    TestResizeBilinearTwoDimsType<float>(1.0e-5, true, half_pixel_centers);
+  }
 }
 
 template <typename T>
 void RunBenchmarkResizeBilinearTwoDimsType() {
   constexpr int ITER = 100;
-  RunBenchmarkResizeBilinearTwoDims<T>(1, 1, 1, 2, 2, 1, ITER);
-  RunBenchmarkResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 1, ITER);
-  RunBenchmarkResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 3, ITER);
-  RunBenchmarkResizeBilinearTwoDims<T>(1, 64, 64, 128, 128, 2, ITER);
-  RunBenchmarkResizeBilinearTwoDims<T>(1, 32, 32, 64, 64, 16, ITER);
+  RunBenchmarkResizeBilinearTwoDims<T>(1, 1, 1, 2, 2, 1, ITER, false);
+  RunBenchmarkResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 1, ITER, false);
+  RunBenchmarkResizeBilinearTwoDims<T>(1, 128, 128, 256, 256, 3, ITER, false);
+  RunBenchmarkResizeBilinearTwoDims<T>(1, 64, 64, 128, 128, 2, ITER, false);
+  RunBenchmarkResizeBilinearTwoDims<T>(1, 32, 32, 64, 64, 16, ITER, false);
 }
 
 void RunBenchmarkResizeBilinearTwoDims() {
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index edb2b10e3d69b6ac93c13b875d00fa9de7ed5362..55278d0480e9852a70a4794f8a5a6f5408809f2a 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/random_op_gpu.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -36,170 +37,6 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <class Distribution, bool VariableSamplesPerOutput>
-struct FillPhiloxRandomKernel;
-
-template <typename T, int ElementCount>
-class SampleCopier {
- public:
-  inline __device__ void operator()(
-      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
-#pragma unroll
-    for (int i = 0; i < ElementCount; i++) {
-      buf[i] = array[i];
-    }
-  }
-};
-
-template <>
-class SampleCopier<float, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      float* buf, const tensorflow::random::Array<float, 4>& array) const {
-    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
-    // have 32-bit alignment vs 128-bit alignment. There seems to be no
-    // performance loss when assigning each element to a vector.
-    float4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    float4* buf_vector = reinterpret_cast<float4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int32, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
-    int4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    int4* buf_vector = reinterpret_cast<int4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<double, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      double* buf, const tensorflow::random::Array<double, 2>& array) const {
-    double2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    double2* buf_vector = reinterpret_cast<double2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int64, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
-    longlong2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a fixed number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, false> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
-                              Distribution dist) {
-    const int kGroupSize = Distribution::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int32 offset = thread_id * kGroupSize;
-    gen.Skip(thread_id);
-
-    const SampleCopier<T, kGroupSize> copier;
-    while (offset + kGroupSize <= size) {
-      const typename Distribution::ResultType samples = dist(&gen);
-      copier(&data[offset], samples);
-
-      offset += total_thread_count * kGroupSize;
-      gen.Skip(total_thread_count - 1);
-    }
-
-    typename Distribution::ResultType samples = dist(&gen);
-    for (int i = 0; i < kGroupSize; ++i) {
-      if (offset >= size) {
-        return;
-      }
-      data[offset] = samples[i];
-      ++offset;
-    }
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a variable number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, true> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
-                              int64 size, Distribution dist) {
-    using random::PhiloxRandom;
-    using random::SingleSampleAdapter;
-
-    const int kReservedSamplesPerOutput = 256;
-    const int kGroupSize = Distribution::kResultElementCount;
-    const int kGeneratorSkipPerOutputGroup = kGroupSize *
-                                             kReservedSamplesPerOutput /
-                                             PhiloxRandom::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int64 group_index = thread_id;
-    int64 offset = group_index * kGroupSize;
-
-    while (offset < size) {
-      // Since each output takes a variable number of samples, we need to
-      // realign the generator to the beginning for the current output group
-      PhiloxRandom gen = base_gen;
-      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
-      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
-
-      typename Distribution::ResultType samples = dist(&single_samples);
-
-      for (int i = 0; i < kGroupSize; ++i) {
-        if (offset >= size) {
-          return;
-        }
-        data[offset] = samples[i];
-        ++offset;
-      }
-
-      offset += (total_thread_count - 1) * kGroupSize;
-      group_index += total_thread_count;
-    }
-  }
-};
-
 // A simple launch pad to call the correct function templates to fill the data
 template <class Distribution>
 __global__ void __launch_bounds__(1024)
@@ -222,9 +59,10 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
       (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
       block_size;
 
-  FillPhiloxRandomKernelLaunch<Distribution>
-      <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
-};
+  TF_CHECK_OK(CudaLaunchKernel(FillPhiloxRandomKernelLaunch<Distribution>,
+                               num_blocks, block_size, 0, d.stream(), gen, data,
+                               size, dist));
+}
 
 // Explicit instantiation of the GPU distributions functors
 // clang-format off
diff --git a/tensorflow/core/kernels/random_op_gpu.h b/tensorflow/core/kernels/random_op_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e32c755d78259a76c0dbad16efb871e7dfc8216d
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_gpu.h
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+
+#if defined(__CUDACC__)
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomKernel;
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, false> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
+                              Distribution dist);
+};
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, true> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
+                              int64 size, Distribution dist);
+};
+
+template <typename T, int ElementCount>
+class SampleCopier {
+ public:
+  inline __device__ void operator()(
+      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
+#pragma unroll
+    for (int i = 0; i < ElementCount; i++) {
+      buf[i] = array[i];
+    }
+  }
+};
+
+template <>
+class SampleCopier<float, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      float* buf, const tensorflow::random::Array<float, 4>& array) const {
+    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
+    // have 32-bit alignment vs 128-bit alignment. There seems to be no
+    // performance loss when assigning each element to a vector.
+    float4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    float4* buf_vector = reinterpret_cast<float4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int32, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
+    int4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    int4* buf_vector = reinterpret_cast<int4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<double, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      double* buf, const tensorflow::random::Array<double, 2>& array) const {
+    double2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    double2* buf_vector = reinterpret_cast<double2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int64, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
+    longlong2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a fixed number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, false>::Run(
+    random::PhiloxRandom gen, T* data, int64 size, Distribution dist) {
+  const int kGroupSize = Distribution::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int32 offset = thread_id * kGroupSize;
+  gen.Skip(thread_id);
+
+  const SampleCopier<T, kGroupSize> copier;
+  while (offset + kGroupSize <= size) {
+    const typename Distribution::ResultType samples = dist(&gen);
+    copier(&data[offset], samples);
+
+    offset += total_thread_count * kGroupSize;
+    gen.Skip(total_thread_count - 1);
+  }
+
+  typename Distribution::ResultType samples = dist(&gen);
+  for (int i = 0; i < kGroupSize; ++i) {
+    if (offset >= size) {
+      return;
+    }
+    data[offset] = samples[i];
+    ++offset;
+  }
+}
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a variable number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, true>::Run(
+    const random::PhiloxRandom& base_gen, T* data, int64 size,
+    Distribution dist) {
+  using random::PhiloxRandom;
+  using random::SingleSampleAdapter;
+
+  const int kReservedSamplesPerOutput = 256;
+  const int kGroupSize = Distribution::kResultElementCount;
+  const int kGeneratorSkipPerOutputGroup = kGroupSize *
+                                           kReservedSamplesPerOutput /
+                                           PhiloxRandom::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int64 group_index = thread_id;
+  int64 offset = group_index * kGroupSize;
+
+  while (offset < size) {
+    // Since each output takes a variable number of samples, we need to
+    // realign the generator to the beginning for the current output group
+    PhiloxRandom gen = base_gen;
+    gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+    SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+    typename Distribution::ResultType samples = dist(&single_samples);
+
+    for (int i = 0; i < kGroupSize; ++i) {
+      if (offset >= size) {
+        return;
+      }
+      data[offset] = samples[i];
+      ++offset;
+    }
+
+    offset += (total_thread_count - 1) * kGroupSize;
+    group_index += total_thread_count;
+  }
+}
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // defined(__CUDACC__)
+
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index d682cd3b52db50575480c749b9b8e2633c4b8f07..9522b1ac44b86aaf755e7ce089899b7ac6a71910 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -154,7 +154,7 @@ int64 UniformSampler::Sample(random::SimplePhilox* rnd) const {
 float UniformSampler::Probability(int64 value) const { return inv_range_; }
 
 LogUniformSampler::LogUniformSampler(int64 range)
-    : RangeSampler(range), log_range_(log(range + 1)) {}
+    : RangeSampler(range), log_range_(log1p(range)) {}
 
 int64 LogUniformSampler::Sample(random::SimplePhilox* rnd) const {
   const int64 value =
@@ -294,7 +294,7 @@ Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file,
     // Skip entries that do not belong to this shard.
     if (word_id % num_shards_ == shard_) {
       float w = 0.0;
-      if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) {
+      if (!strings::safe_strtof(cols.at(cols.size() - 1), &w)) {
         return errors::InvalidArgument("Wrong vocabulary format at line: ",
                                        line);
       }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index e9cf36c62b966f5f91cf7764421f0c1ff6c131fc..0f08588ebac3d7d0a2dd23689fe03da121ab9929 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -40,6 +40,13 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+template <typename T>
+struct Sqrt {
+  __host__ __device__ T operator()(const T& a) const {
+    return Eigen::numext::sqrt(a);
+  }
+};
+
 template <typename T>
 struct Sum {
   __host__ __device__ T operator()(const T& a, const T& b) const {
@@ -98,6 +105,13 @@ struct Prod<std::complex<double>> {
   }
 };
 
+template <typename T>
+struct Square {
+  __host__ __device__ T operator()(const T& a) const {
+    return Prod<T>()(a, Eigen::numext::conj(a));
+  }
+};
+
 template <typename T, typename outT = T>
 struct DividesBy {
   T divisor;
@@ -497,10 +511,11 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
   if (in_size <= 4096) {
     const int num_blocks = 1;
     const int num_threads = 256;
-    BlockReduceKernel<IN_T, OUT_T, num_threads>
-        <<<num_blocks, num_threads, 0, cu_stream>>>(in, out, in_size, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(
+        BlockReduceKernel<IN_T, OUT_T, num_threads, Op>, num_blocks,
+        num_threads, 0, cu_stream, in, out, in_size, op, init));
     return;
-  } else if (in_size <= 1 << 19) {
+  } else if (in_size <= 1 << 18) {
     const int num_threads = 256;
     const int num_blocks = std::min(32, Eigen::divup(in_size, num_threads));
     // it seems like tailoring this to the GPU
@@ -517,17 +532,19 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
             DT_INT8, TensorShape({static_cast<int64>(num_blocks * sizeof(T))}),
             &temp_storage));
 
-    BlockReduceKernel<IN_T, T*, num_threads>
-        <<<num_blocks, num_threads, 0, cu_stream>>>(
-            in, (T*)temp_storage.flat<int8_t>().data(), in_size, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(BlockReduceKernel<IN_T, T*, num_threads, Op>,
+                                 num_blocks, num_threads, 0, cu_stream, in,
+                                 (T*)temp_storage.flat<int8_t>().data(),
+                                 in_size, op, init));
 
     // take care that we only reduce blocks that had some valid elements in them
     // TODO(eriche): CUB currently has a bug in HeadSegmentedReduce that
     // requires it to be used with a full warp.  Can reduce 32 -> num_blocks
     // when this is fixed.
-    CleanupSegments<<<1, 32, 0, cu_stream>>>(
-        (T*)temp_storage.flat<int8_t>().data(), out, 1, 1, num_blocks, op,
-        init);
+    TF_CHECK_OK(CudaLaunchKernel(CleanupSegments<T*, OUT_T, Op>, 1, 32, 0,
+                                 cu_stream,
+                                 (T*)temp_storage.flat<int8_t>().data(), out, 1,
+                                 1, num_blocks, op, init));
     return;
   }
 
@@ -539,7 +556,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
 
     OP_REQUIRES(
         ctx, success == 0,
-        errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+        errors::Internal("CUB reduce error ", cudaGetErrorString(success)));
   };
 
   reduce(nullptr);  // Get required amount of temp storage.
@@ -562,8 +579,9 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows,
     const int warps_per_block = threads_per_block / 32;
     int num_blocks = (num_rows + warps_per_block - 1) / warps_per_block;
 
-    RowReduceKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>(
-        in, out, num_rows, num_cols, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(RowReduceKernel<IN_T, OUT_T, Op>, num_blocks,
+                                 threads_per_block, 0, cu_stream, in, out,
+                                 num_rows, num_cols, op, init));
     return;
   }
 
@@ -614,8 +632,9 @@ void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
   }
 
   if (grid_dim.y == 1) {
-    ColumnReduceMax16ColumnsKernel<<<grid_dim, block_dim, 0, cu_stream>>>(
-        in, out, extent_x, extent_y, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(
+        ColumnReduceMax16ColumnsKernel<IN_T, OUT_T, Op>, grid_dim, block_dim, 0,
+        cu_stream, in, out, extent_x, extent_y, op, init));
   } else {
     Tensor temp_storage;
     OP_REQUIRES_OK(ctx,
@@ -623,15 +642,17 @@ void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
                                       TensorShape({static_cast<int64>(
                                           sizeof(T) * extent_y * grid_dim.y)}),
                                       &temp_storage));
-    ColumnReduceMax16ColumnsKernel<<<grid_dim, block_dim, 0, cu_stream>>>(
-        in, (T*)temp_storage.flat<int8_t>().data(), extent_x, extent_y, op,
-        init);
+    TF_CHECK_OK(CudaLaunchKernel(ColumnReduceMax16ColumnsKernel<IN_T, T*, Op>,
+                                 grid_dim, block_dim, 0, cu_stream, in,
+                                 (T*)temp_storage.flat<int8_t>().data(),
+                                 extent_x, extent_y, op, init));
 
     dim3 new_grid_dim((grid_dim.y * extent_y + 31) / 32, 1, 1);
     dim3 num_threads(128, 1, 1);
-    CleanupSegments<<<new_grid_dim, num_threads, 0, cu_stream>>>(
-        (T*)temp_storage.flat<int8_t>().data(), out, extent_x, extent_y,
-        grid_dim.y, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(CleanupSegments<T*, OUT_T, Op>, new_grid_dim,
+                                 num_threads, 0, cu_stream,
+                                 (T*)temp_storage.flat<int8_t>().data(), out,
+                                 extent_x, extent_y, grid_dim.y, op, init));
   }
 }
 
@@ -650,8 +671,9 @@ void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
   }
 
   if (grid_dim.y == 1) {
-    ColumnReduceKernel<<<grid_dim, block_dim, 0, cu_stream>>>(
-        in, out, extent_x, extent_y, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(ColumnReduceKernel<IN_T, OUT_T, Op>, grid_dim,
+                                 block_dim, 0, cu_stream, in, out, extent_x,
+                                 extent_y, op, init));
   } else {
     Tensor temp_storage;
     OP_REQUIRES_OK(ctx,
@@ -660,15 +682,16 @@ void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
                                           sizeof(T) * extent_y * grid_dim.y)}),
                                       &temp_storage));
 
-    ColumnReduceKernel<<<grid_dim, block_dim, 0, cu_stream>>>(
-        in, (T*)temp_storage.flat<int8_t>().data(), extent_x, extent_y, op,
-        init);
+    TF_CHECK_OK(CudaLaunchKernel(
+        ColumnReduceKernel<IN_T, T*, Op>, grid_dim, block_dim, 0, cu_stream, in,
+        (T*)temp_storage.flat<int8_t>().data(), extent_x, extent_y, op, init));
 
     dim3 new_grid_dim((grid_dim.y * extent_y + 31) / 32, 1, 1);
     dim3 num_threads(128, 1, 1);
-    CleanupSegments<<<new_grid_dim, block_dim, 0, cu_stream>>>(
-        (T*)temp_storage.flat<int8_t>().data(), out, extent_x, extent_y,
-        grid_dim.y, op, init);
+    TF_CHECK_OK(CudaLaunchKernel(CleanupSegments<T*, OUT_T, Op>, new_grid_dim,
+                                 block_dim, 0, cu_stream,
+                                 (T*)temp_storage.flat<int8_t>().data(), out,
+                                 extent_x, extent_y, grid_dim.y, op, init));
   }
 }
 
@@ -686,8 +709,9 @@ void LaunchColumnReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
     int threads_per_block = 128;
     int num_blocks = Eigen::divup(extent_y, threads_per_block);
 
-    ColumnReduceSimpleKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>(
-        in, out, 1, extent_x, extent_y, op);
+    TF_CHECK_OK(CudaLaunchKernel(ColumnReduceSimpleKernel<IN_T, OUT_T, Op>,
+                                 num_blocks, threads_per_block, 0, cu_stream,
+                                 in, out, 1, extent_x, extent_y, op));
   }
 }
 
@@ -701,8 +725,9 @@ void Launch3DYReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x,
 
   // TODO(eriche): this won't be very good in the case of small x
   //                small z and large y.
-  ColumnReduceSimpleKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>(
-      in, out, extent_x, extent_y, extent_z, op);
+  TF_CHECK_OK(CudaLaunchKernel(ColumnReduceSimpleKernel<IN_T, OUT_T, Op>,
+                               num_blocks, threads_per_block, 0, cu_stream, in,
+                               out, extent_x, extent_y, extent_z, op));
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -884,6 +909,31 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::SumReducer<T>> {
   }
 };
 
+// TODO(rmlarsen): Specialize for float16.
+template <typename T>
+struct ReduceFunctor<GPUDevice, functor::EuclideanNormReducer<T>> {
+  template <typename OUT_T, typename IN_T, typename ReductionAxes>
+  static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
+                     const ReductionAxes& reduction_axes,
+                     const functor::EuclideanNormReducer<T>& reducer) {
+    typedef cub::TransformInputIterator<T, Square<T>, T*> inputIterType;
+    inputIterType input_itr((T*)in.data(), Square<T>());
+    typedef TransformOutputIterator<T, T, Sqrt<T>> outputIterType;
+    outputIterType output_itr((T*)out.data(), Sqrt<T>());
+    ReduceImpl<T, Sum<T>, outputIterType, inputIterType, ReductionAxes>(
+        ctx, output_itr, input_itr, in.rank(), in.dimension(0),
+        in.rank() >= 2 ? in.dimension(1) : 1,
+        in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes,
+        Sum<T>());
+  }
+
+  template <typename OUT_T>
+  static void FillIdentity(const GPUDevice& d, OUT_T out,
+                           const functor::EuclideanNormReducer<T>& reducer) {
+    FillIdentityEigenImpl(d, To32Bit(out), reducer);
+  }
+};
+
 template <typename T>
 struct ReduceFunctor<GPUDevice, functor::MeanReducer<T>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index 2331599b72f46df7a34e9553d5bd41a7613409da..0a1568bdc2521addb954bdd472164922e4f7d0f5 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -33,6 +33,12 @@ struct MeanReducer {
   Scalar initialize() const { return Scalar(0); }
 };
 
+// Dummy class used for template specialization for l2-norm reduction.
+template <typename Scalar>
+struct EuclideanNormReducer {
+  Scalar initialize() const { return Scalar(0); }
+};
+
 template <typename Device, typename OUT_T, typename IN_T,
           typename ReductionAxes, typename Reducer>
 struct ReduceEigenImpl {
@@ -56,6 +62,39 @@ struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
   }
 };
 
+// TODO(rmlarsen): Refactor this such that taking the sqrt can be optional
+// controlled by an attribute.
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes, typename Scalar>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<Scalar>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<Scalar>& reducer) {
+    static_assert(std::is_same<Scalar, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<Scalar> sum_reducer;
+    out.device(d) =
+        (in * in.conjugate()).reduce(reduction_axes, sum_reducer).sqrt();
+  }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<bfloat16>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<bfloat16>& reducer) {
+    static_assert(std::is_same<bfloat16, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<float> sum_reducer;
+    auto in_as_float = in.template cast<float>();
+    out.device(d) = (in_as_float * in_as_float.conjugate())
+                        .reduce(reduction_axes, sum_reducer)
+                        .sqrt()
+                        .template cast<bfloat16>();
+  }
+};
+
 // For most reducers, the identity is Reducer::initialize()
 template <typename Reducer>
 struct Identity {
diff --git a/tensorflow/core/kernels/reduction_ops_euclidean.cc b/tensorflow/core/kernels/reduction_ops_euclidean.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f4bf50e7ca0ecf8506b260829cae2127305cadb
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_euclidean.cc
@@ -0,0 +1,81 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)                                          \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int32,               \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int64,               \
+                                      functor::EuclideanNormReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+#undef REGISTER_SYCL_KERNELS
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
index c44a40b3b38f5a37574d0d81b7b67adcf27451e1..662f24d9054ab2cfd312ea933f2a7769c6e3983b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::SumReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, functor::MeanReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex128, functor::EuclideanNormReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::ProdReducer<complex128>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
index 1921130ac043d9d1bfdea415c59aafcedcc31ef3..8ab2a6e13e52b0c92bfde2a2c6acf4423dc5976b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, functor::MeanReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex64, functor::EuclideanNormReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::ProdReducer<complex64>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
index 119f726b929bd9c599e26684fede9890efceb2f2..c492308a9162596235e8d07e9b376abbd89c2007 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(double);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
index 70ba4abac48bcfe10d577a120cf08fdd8650f367..b006311c125c1e8e86d499ce125aa7cd817f9d5f 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(float);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
index 82f6d7df952fcd8b0aaa3561efd4a4bca93e4dce..91a33b92cb6663310d6cfee9d20127b960e6a11b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(int32);
diff --git a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
index db050fdea38bd6db58424da72ff75e79e9151a09..f33d504e25a202c5ce229276611c0958f97f8eee 100644
--- a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
@@ -51,8 +51,9 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);
 
 DEFINE_FOR_ALL_REDUCERS(Eigen::half);
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index fe8ea59f1be521166d0e42295e79d1bb5a242750..359d7dbeca58be8643e51a1ad2248ccd57f67e79 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -164,6 +164,11 @@ static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
+  ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
+}
+BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/redux_functor.h b/tensorflow/core/kernels/redux_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c542099cc0870f890dd57a80022b28ee27b8e047
--- /dev/null
+++ b/tensorflow/core/kernels/redux_functor.h
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+namespace functor {
+
+// Compute reduction over all outer dimensions.
+// Example:
+//   input: [32, 32, 256]
+//   ->
+//   output: [256]
+template <typename T, typename AccumT, typename BinaryFunctor>
+struct ReduceOuterDimensions {
+  template <int num_dims>
+  void operator()(const CPUDevice& device,
+                  const Eigen::DSizes<Eigen::Index, num_dims>& input_dims,
+                  const Tensor& input, Tensor* output) const {
+    static_assert(num_dims >= 2, "Input dimensions must at least 2");
+
+    // Compute inner and outer dim after reshaping into 2d tensor.
+    int64 inner_dim = input_dims[num_dims - 1];
+    int64 outer_dim = 1;
+    for (int i = 0; i < num_dims - 1; ++i) outer_dim *= input_dims[i];
+
+    // Compute block size along the outer dimension for efficiency.
+    const int64 parallel_cell_size = inner_dim;
+    const int64 total_workload = outer_dim * inner_dim;
+    const int64 max_parallelism = total_workload / parallel_cell_size;
+
+    const int64 min_block_workload = 2000;
+    const int64 min_block_size =
+        Eigen::divup(min_block_workload, parallel_cell_size);
+    const int64 max_num_blocks =
+        std::min(max_parallelism, Eigen::divup(total_workload, min_block_size));
+
+    // Do not create more blocks than there are threads in a pool.
+    const int64 num_threads = device.numThreads();
+    const int64 num_blocks = std::min(max_num_blocks, num_threads);
+
+    // Block size along the outer dimension.
+    const int64 outer_block_size = Eigen::divup(outer_dim, num_blocks);
+
+    const T* input_data = input.template flat<T>().data();
+
+    // Allocate temporary buffer for partial reductions.
+    Tensor buffer(DataTypeToEnum<AccumT>::v(), {num_blocks, inner_dim});
+    buffer.template flat<AccumT>().setZero();
+    AccumT* buffer_data = buffer.template flat<AccumT>().data();
+
+    using Buffer = Eigen::TensorMap<
+        Eigen::Tensor<AccumT, 1, Eigen::RowMajor, Eigen::Index>,
+        Eigen::Unaligned>;
+
+    using Input = Eigen::TensorMap<
+        Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::Index>,
+        Eigen::Unaligned>;
+
+    const auto compute = [inner_dim, num_blocks, outer_block_size, buffer_data,
+                          input_data, outer_dim](Eigen::Index start,
+                                                 Eigen::Index limit) -> void {
+      DCHECK(start >= 0 && limit <= num_blocks);
+      int64 outer_dim_start = start * outer_block_size;
+      int64 outer_dim_limit = limit * outer_block_size;
+      outer_dim_limit = std::min(outer_dim, outer_dim_limit);
+
+      Buffer buf(buffer_data + start * inner_dim, inner_dim);
+      for (int64 i = outer_dim_start; i < outer_dim_limit; ++i) {
+        auto in = Input(input_data + i * inner_dim, inner_dim);
+        auto cast = in.template cast<AccumT>();
+        buf = Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf),
+                                         const decltype(cast)>(buf, cast);
+      }
+    };
+
+    // Compute cost of reducing a single block.
+    const int64 compute_size = outer_block_size * inner_dim;
+    const int64 compute_input_bytes = compute_size * sizeof(T);
+    const Eigen::TensorOpCost cost(
+        compute_input_bytes,
+        0,  // We'll be mostly writing to L1, assume store cost is 0
+        compute_size * Eigen::internal::functor_traits<BinaryFunctor>::Cost);
+
+    device.parallelFor(num_blocks, cost, compute);
+
+    // Aggregate partial results from temporary buffer into first block.
+    auto buf0 = Buffer(buffer_data, inner_dim);
+    // TODO(ezhulenev): Parallelize this loop for large inner dimensions?
+    for (int i = 1; i < num_blocks; ++i) {
+      auto buf = Buffer(buffer_data + i * inner_dim, inner_dim);
+      buf0 = Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf0),
+                                        const decltype(buf)>(buf0, buf);
+    }
+
+    // Write final result to the output.
+    output->template flat<T>() = buf0.template cast<T>();
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc
index c996ae60b79ee9cc54a29d7a2fd13fc7ff5465e8..38bb2a9a9696947c54e93b0596274bc6d1a97b8b 100644
--- a/tensorflow/core/kernels/resize_area_op.cc
+++ b/tensorflow/core/kernels/resize_area_op.cc
@@ -144,7 +144,11 @@ class ResizeAreaOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
+    // The op always did the correct thing with regard to pixel centers, so we
+    // always pass false here for half_pixel_centers since ImageResizerState
+    // enforces that if align_corners_ is true, half_pixel_centers must be
+    // false.
+    ImageResizerState st(align_corners_, /*unused half_pixel_centers=*/false);
     st.ValidateAndCreateOutput(context, input);
 
     if (!context->status().ok()) return;
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 8380ed6d8ff64a2959c453059b2df3242134dd94..17ee9dbd9c198c7ec73a5c6bbfcbb658cd70ec62 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -35,25 +35,34 @@ namespace {
 
 static const int64 kTableSize = (1 << 10);
 
-const float* InitCoeffsTable() {
+const float* InitCoeffsTable(const double a) {
   // Allocate and initialize coefficients table using Bicubic
   // convolution algorithm.
   // https://en.wikipedia.org/wiki/Bicubic_interpolation
   float* coeffs_table = new float[(kTableSize + 1) * 2];
-  static const double A = -0.75;
   for (int i = 0; i <= kTableSize; ++i) {
     float x = i * 1.0 / kTableSize;
-    coeffs_table[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
+    coeffs_table[i * 2] = ((a + 2) * x - (a + 3)) * x * x + 1;
     x += 1.0;
-    coeffs_table[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+    coeffs_table[i * 2 + 1] = ((a * x - 5 * a) * x + 8 * a) * x - 4 * a;
   }
+
   return coeffs_table;
 }
 
-const float* GetCoeffsTable() {
+const float* GetCoeffsTable(const bool use_keys_cubic) {
   // Static so that we initialize it on first use
-  static const float* coeffs_table = InitCoeffsTable();
-  return coeffs_table;
+  if (use_keys_cubic) {
+    // http://ieeexplore.ieee.org/document/1163711/
+    // R. G. Keys. Cubic convolution interpolation for digital image
+    // processing. IEEE Transactions on Acoustics, Speech, and Signal
+    // Processing, 29(6):1153–1160, 1981.
+    static const float* coeffs_table = InitCoeffsTable(-0.5f);
+    return coeffs_table;
+  } else {
+    static const float* coeffs_table = InitCoeffsTable(-0.75f);
+    return coeffs_table;
+  }
 }
 
 inline int64 Bound(int64 val, int64 limit) {
@@ -73,20 +82,55 @@ struct WeightsAndIndices {
   int advance;  // advance value.
 };
 
+template <typename Scaler, bool use_keys_cubic>
 inline void GetWeightsAndIndices(const float scale, const int64 out_loc,
                                  const int64 limit, WeightsAndIndices* out) {
-  const int64 in_loc = scale * out_loc;
-  const float delta = scale * out_loc - in_loc;
+  const Scaler scaler;
+  const float in_loc_f = scaler(out_loc, scale);
+  const int64 in_loc = std::floor(in_loc_f);
+  const float delta = in_loc_f - in_loc;
   const int64 offset = lrintf(delta * kTableSize);
-  const float* coeffs_table = GetCoeffsTable();
-  out->weight_0 = coeffs_table[offset * 2 + 1];
-  out->weight_1 = coeffs_table[offset * 2];
-  out->weight_2 = coeffs_table[(kTableSize - offset) * 2];
-  out->weight_3 = coeffs_table[(kTableSize - offset) * 2 + 1];
-  out->index_0 = Bound(in_loc - 1, limit);
-  out->index_1 = Bound(in_loc, limit);
-  out->index_2 = Bound(in_loc + 1, limit);
-  out->index_3 = Bound(in_loc + 2, limit);
+  const float* coeffs_table = GetCoeffsTable(use_keys_cubic);
+  if (use_keys_cubic) {
+    // The legacy code placed more weight on the edge pixels, since bounding
+    // the set of inputs to sample could cause an edge pixel to be repeated.
+    // Here we change the behavior at borders to match that used by the
+    // scale_and_translate_op, where sampling locations outside the image have
+    // their weight set to 0, and the weights are renormalized so that their sum
+    // is 1.0.
+    out->index_0 = Bound(in_loc - 1, limit);
+    out->weight_0 =
+        (out->index_0 == in_loc - 1 ? coeffs_table[offset * 2 + 1] : 0.0f);
+    out->index_1 = Bound(in_loc, limit);
+    out->weight_1 = (out->index_1 == in_loc ? coeffs_table[offset * 2] : 0.0f);
+    out->index_2 = Bound(in_loc + 1, limit);
+    out->weight_2 =
+        (out->index_2 == in_loc + 1 ? coeffs_table[(kTableSize - offset) * 2]
+                                    : 0.0f);
+    out->index_3 = Bound(in_loc + 2, limit);
+    out->weight_3 = (out->index_3 == in_loc + 2
+                         ? coeffs_table[(kTableSize - offset) * 2 + 1]
+                         : 0.0f);
+
+    const float weight_sum =
+        out->weight_0 + out->weight_1 + out->weight_2 + out->weight_3;
+    if (std::abs(weight_sum) >= 1000.0f * std::numeric_limits<float>::min()) {
+      const float one_over_weight_sum = 1.0f / weight_sum;
+      out->weight_0 *= one_over_weight_sum;
+      out->weight_1 *= one_over_weight_sum;
+      out->weight_2 *= one_over_weight_sum;
+      out->weight_3 *= one_over_weight_sum;
+    }
+  } else {
+    out->weight_0 = coeffs_table[offset * 2 + 1];
+    out->weight_1 = coeffs_table[offset * 2];
+    out->weight_2 = coeffs_table[(kTableSize - offset) * 2];
+    out->weight_3 = coeffs_table[(kTableSize - offset) * 2 + 1];
+    out->index_0 = Bound(in_loc - 1, limit);
+    out->index_1 = Bound(in_loc, limit);
+    out->index_2 = Bound(in_loc + 1, limit);
+    out->index_3 = Bound(in_loc + 2, limit);
+  }
 }
 
 template <typename T>
@@ -162,14 +206,25 @@ class CachedInterpolationCalculator {
 };
 
 static void ComputeXWeightsAndIndices(const ImageResizerState& resizer_state,
+                                      const bool half_pixel_centers,
                                       std::vector<WeightsAndIndices>* x_wais) {
   CachedInterpolationCalculator calc;
-  for (int64 x = 0; x < resizer_state.out_width; ++x) {
-    GetWeightsAndIndices(resizer_state.width_scale, x, resizer_state.in_width,
-                         &(*x_wais)[x]);
-    auto& x_wai = (*x_wais)[x];
-    x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
-                                 x_wai.index_3);
+  if (half_pixel_centers) {
+    for (int64 x = 0; x < resizer_state.out_width; ++x) {
+      GetWeightsAndIndices<HalfPixelScaler, true>(
+          resizer_state.width_scale, x, resizer_state.in_width, &(*x_wais)[x]);
+      auto& x_wai = (*x_wais)[x];
+      x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
+                                   x_wai.index_3);
+    }
+  } else {
+    for (int64 x = 0; x < resizer_state.out_width; ++x) {
+      GetWeightsAndIndices<LegacyScaler, false>(
+          resizer_state.width_scale, x, resizer_state.in_width, &(*x_wais)[x]);
+      auto& x_wai = (*x_wais)[x];
+      x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
+                                   x_wai.index_3);
+    }
   }
   // Scale the values so they can be used as offsets into buffers.
   for (int x = 0; x < resizer_state.out_width; ++x) {
@@ -182,14 +237,27 @@ static void ComputeXWeightsAndIndices(const ImageResizerState& resizer_state,
 
 static void ComputeGradientXWeightsAndIndices(
     const ImageResizerGradientState& resizer_state,
-    std::vector<WeightsAndIndices>* x_wais) {
+    const bool half_pixel_centers, std::vector<WeightsAndIndices>* x_wais) {
   CachedInterpolationCalculator calc;
-  for (int64 x = 0; x < resizer_state.resized_width; ++x) {
-    GetWeightsAndIndices(resizer_state.width_scale, x,
-                         resizer_state.original_width, &(*x_wais)[x]);
-    auto& x_wai = (*x_wais)[x];
-    x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
-                                 x_wai.index_3);
+  if (half_pixel_centers) {
+    for (int64 x = 0; x < resizer_state.resized_width; ++x) {
+      GetWeightsAndIndices<HalfPixelScaler, true>(resizer_state.width_scale, x,
+                                                  resizer_state.original_width,
+                                                  &(*x_wais)[x]);
+      auto& x_wai = (*x_wais)[x];
+      x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
+                                   x_wai.index_3);
+    }
+
+  } else {
+    for (int64 x = 0; x < resizer_state.resized_width; ++x) {
+      GetWeightsAndIndices<LegacyScaler, false>(resizer_state.width_scale, x,
+                                                resizer_state.original_width,
+                                                &(*x_wais)[x]);
+      auto& x_wai = (*x_wais)[x];
+      x_wai.advance = calc.Advance(x_wai.index_0, x_wai.index_1, x_wai.index_2,
+                                   x_wai.index_3);
+    }
   }
   // Do not scale, as we will be using these directly as tensor indices on the
   // gradient pass.
@@ -224,10 +292,10 @@ static EIGEN_ALWAYS_INLINE float ComputeYInterpolation(
 template <typename T>
 inline void interpolate_with_caching(
     const typename TTypes<T, 4>::ConstTensor& input_data,
-    const ImageResizerState& resizer_state,
+    const ImageResizerState& resizer_state, const bool half_pixel_centers,
     typename TTypes<float, 4>::Tensor output_data) {
   std::vector<WeightsAndIndices> x_wais(resizer_state.out_width);
-  ComputeXWeightsAndIndices(resizer_state, &x_wais);
+  ComputeXWeightsAndIndices(resizer_state, half_pixel_centers, &x_wais);
 
   const auto num_channels = resizer_state.channels;
   const int64 in_row_width = resizer_state.in_width * num_channels;
@@ -242,8 +310,13 @@ inline void interpolate_with_caching(
     for (int64 y = 0; y < resizer_state.out_height;
          ++y, output_y_ptr += resizer_state.out_width * num_channels) {
       WeightsAndIndices y_wai;
-      GetWeightsAndIndices(resizer_state.height_scale, y,
-                           resizer_state.in_height, &y_wai);
+      if (half_pixel_centers) {
+        GetWeightsAndIndices<HalfPixelScaler, true>(
+            resizer_state.height_scale, y, resizer_state.in_height, &y_wai);
+      } else {
+        GetWeightsAndIndices<LegacyScaler, false>(
+            resizer_state.height_scale, y, resizer_state.in_height, &y_wai);
+      }
       // Make pointers represent offsets of data in input_b_ptr.
       const T* y_ptr_0 = input_b_ptr + y_wai.index_0 * in_row_width;
       const T* y_ptr_1 = input_b_ptr + y_wai.index_1 * in_row_width;
@@ -398,6 +471,7 @@ inline void interpolate_with_caching(
 template <typename T>
 inline void ResizeBicubicGrad(typename TTypes<float, 4>::ConstTensor input_grad,
                               const ImageResizerGradientState& resizer_state,
+                              const bool half_pixel_centers,
                               typename TTypes<T, 4>::Tensor output_grad) {
   // This function computes gradients for the ResizeBicubic op by iterating over
   // the input_grad Tensor and using WeightsAndIndices to appropriately update
@@ -411,11 +485,17 @@ inline void ResizeBicubicGrad(typename TTypes<float, 4>::ConstTensor input_grad,
   output_grad.setZero();
 
   std::vector<WeightsAndIndices> x_wais(resizer_state.resized_width);
-  ComputeGradientXWeightsAndIndices(resizer_state, &x_wais);
+  ComputeGradientXWeightsAndIndices(resizer_state, half_pixel_centers, &x_wais);
   for (int64 b = 0; b < resizer_state.batch_size; ++b) {
     for (int64 y = 0; y < resized_height; ++y) {
       WeightsAndIndices y_wai;
-      GetWeightsAndIndices(height_scale, y, original_height, &y_wai);
+      if (half_pixel_centers) {
+        GetWeightsAndIndices<HalfPixelScaler, true>(height_scale, y,
+                                                    original_height, &y_wai);
+      } else {
+        GetWeightsAndIndices<LegacyScaler, false>(height_scale, y,
+                                                  original_height, &y_wai);
+      }
       for (int64 x = 0; x < resized_width; ++x) {
         const WeightsAndIndices& x_wai = x_wais[x];
         for (int64 c = 0; c < channels; ++c) {
@@ -471,11 +551,13 @@ class ResizeBicubicOp : public OpKernel {
  public:
   explicit ResizeBicubicOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
+    ImageResizerState st(align_corners_, half_pixel_centers_);
     st.ValidateAndCreateOutput(context, input);
 
     if (!context->status().ok()) return;
@@ -483,11 +565,13 @@ class ResizeBicubicOp : public OpKernel {
     typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
     TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
 
-    interpolate_with_caching<T>(input_data, st, output_data);
+    interpolate_with_caching<T>(input_data, st, half_pixel_centers_,
+                                output_data);
   }
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 template <typename Device, typename T>
@@ -496,6 +580,8 @@ class ResizeBicubicOpGrad : public OpKernel {
   explicit ResizeBicubicOpGrad(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -504,7 +590,7 @@ class ResizeBicubicOpGrad : public OpKernel {
     const Tensor& input = context->input(0);
     const Tensor& original_image = context->input(1);
 
-    ImageResizerGradientState st(align_corners_);
+    ImageResizerGradientState st(align_corners_, half_pixel_centers_);
     st.ValidateAndCreateOutput(context, input, original_image);
 
     if (!context->status().ok()) return;
@@ -512,11 +598,12 @@ class ResizeBicubicOpGrad : public OpKernel {
     TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
     typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
 
-    ResizeBicubicGrad<T>(input_grad, st, output_grad);
+    ResizeBicubicGrad<T>(input_grad, st, half_pixel_centers_, output_grad);
   }
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 #define REGISTER_KERNEL(T)                            \
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index f10c9a19a7fdfabc89d917b0418ec89f2c17ec5d..7bc40ba139a371fb3caa74f5a7e45fe5cd724659 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -39,11 +39,13 @@ class ResizeBilinearOp : public OpKernel {
  public:
   explicit ResizeBilinearOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
+    ImageResizerState st(align_corners_, half_pixel_centers_);
     st.ValidateAndCreateOutput(context, input);
 
     if (!context->status().ok()) return;
@@ -54,13 +56,14 @@ class ResizeBilinearOp : public OpKernel {
     typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
     TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
 
-    functor::ResizeBilinear<Device, T>()(context->eigen_device<Device>(),
-                                         image_data, st.height_scale,
-                                         st.width_scale, output_data);
+    functor::ResizeBilinear<Device, T>()(
+        context->eigen_device<Device>(), image_data, st.height_scale,
+        st.width_scale, half_pixel_centers_, output_data);
   }
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 namespace {
@@ -73,17 +76,22 @@ struct CachedInterpolation {
   float lerp;
 };
 
-inline void compute_interpolation_weights(const int64 out_size,
+template <typename Scaler>
+inline void compute_interpolation_weights(const Scaler scaler,
+                                          const int64 out_size,
                                           const int64 in_size,
                                           const float scale,
                                           CachedInterpolation* interpolation) {
   interpolation[out_size].lower = 0;
   interpolation[out_size].upper = 0;
   for (int64 i = out_size - 1; i >= 0; --i) {
-    const float in = i * scale;
-    interpolation[i].lower = static_cast<int64>(in);
-    interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1);
-    interpolation[i].lerp = in - interpolation[i].lower;
+    const float in = scaler(i, scale);
+    const float in_f = std::floor(in);
+    interpolation[i].lower =
+        std::max(static_cast<int64>(in_f), static_cast<int64>(0));
+    interpolation[i].upper =
+        std::min(static_cast<int64>(std::ceil(in)), in_size - 1);
+    interpolation[i].lerp = in - in_f;
   }
 }
 
@@ -203,6 +211,7 @@ template <typename T>
 struct ResizeBilinear<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor images,
                   const float height_scale, const float width_scale,
+                  bool half_pixel_centers,
                   typename TTypes<float, 4>::Tensor output) {
     const int batch_size = images.dimension(0);
     const int64 in_height = images.dimension(1);
@@ -221,11 +230,19 @@ struct ResizeBilinear<CPUDevice, T> {
     std::vector<CachedInterpolation> ys(out_height + 1);
     std::vector<CachedInterpolation> xs(out_width + 1);
 
-    // Compute the cached interpolation weights on the x and y dimensions.
-    compute_interpolation_weights(out_height, in_height, height_scale,
-                                  ys.data());
-    compute_interpolation_weights(out_width, in_width, width_scale, xs.data());
-
+    if (half_pixel_centers) {
+      compute_interpolation_weights(HalfPixelScaler(), out_height, in_height,
+                                    height_scale, ys.data());
+      compute_interpolation_weights(HalfPixelScaler(), out_width, in_width,
+                                    width_scale, xs.data());
+
+    } else {
+      // Compute the cached interpolation weights on the x and y dimensions.
+      compute_interpolation_weights(LegacyScaler(), out_height, in_height,
+                                    height_scale, ys.data());
+      compute_interpolation_weights(LegacyScaler(), out_width, in_width,
+                                    width_scale, xs.data());
+    }
     // Scale x interpolation weights to avoid a multiplication during iteration.
     for (int i = 0; i < xs.size(); ++i) {
       xs[i].lower *= channels;
@@ -244,6 +261,8 @@ class ResizeBilinearOpGrad : public OpKernel {
   explicit ResizeBilinearOpGrad(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -252,7 +271,7 @@ class ResizeBilinearOpGrad : public OpKernel {
     const Tensor& input = context->input(0);
     const Tensor& original_image = context->input(1);
 
-    ImageResizerGradientState st(align_corners_);
+    ImageResizerGradientState st(align_corners_, half_pixel_centers_);
     st.ValidateAndCreateOutput(context, input, original_image);
 
     if (!context->status().ok()) return;
@@ -260,23 +279,26 @@ class ResizeBilinearOpGrad : public OpKernel {
     TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
     typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
 
-    functor::ResizeBilinearGrad<Device, T>()(context->eigen_device<Device>(),
-                                             input_grad, st.height_scale,
-                                             st.width_scale, output_grad);
+    functor::ResizeBilinearGrad<Device, T>()(
+        context->eigen_device<Device>(), input_grad, st.height_scale,
+        st.width_scale, half_pixel_centers_, output_grad);
   }
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 // Partial specialization of ResizeBilinearGrad functor for a CPUDevice.
 namespace functor {
+
 template <typename T>
 struct ResizeBilinearGrad<CPUDevice, T> {
-  void operator()(const CPUDevice& d,
-                  typename TTypes<float, 4>::ConstTensor input_grad,
-                  const float height_scale, const float width_scale,
-                  typename TTypes<T, 4>::Tensor output_grad) {
+  template <typename Scaler>
+  void ResizeGradCore(const Scaler& scaler,
+                      typename TTypes<float, 4>::ConstTensor input_grad,
+                      const float height_scale, const float width_scale,
+                      typename TTypes<T, 4>::Tensor output_grad) {
     const Eigen::Index batch = output_grad.dimension(0);
     const Eigen::Index original_height = output_grad.dimension(1);
     const Eigen::Index original_width = output_grad.dimension(2);
@@ -296,20 +318,22 @@ struct ResizeBilinearGrad<CPUDevice, T> {
     //                       +  bottom_right * y * x
     for (Eigen::Index b = 0; b < batch; ++b) {
       for (Eigen::Index y = 0; y < resized_height; ++y) {
-        const float in_y = y * height_scale;
+        const float in_y = scaler(y, height_scale);
         const Eigen::Index top_y_index =
-            static_cast<Eigen::Index>(floorf(in_y));
+            std::max(static_cast<Eigen::Index>(floorf(in_y)),
+                     static_cast<Eigen::Index>(0));
         const Eigen::Index bottom_y_index = std::min(
             static_cast<Eigen::Index>(ceilf(in_y)), original_height - 1);
-        const float y_lerp = in_y - top_y_index;
+        const float y_lerp = in_y - floorf(in_y);
         const float inverse_y_lerp = (1.0f - y_lerp);
         for (Eigen::Index x = 0; x < resized_width; ++x) {
-          const float in_x = x * width_scale;
+          const float in_x = scaler(x, width_scale);
           const Eigen::Index left_x_index =
-              static_cast<Eigen::Index>(floorf(in_x));
+              std::max(static_cast<Eigen::Index>(floorf(in_x)),
+                       static_cast<Eigen::Index>(0));
           const Eigen::Index right_x_index = std::min(
               static_cast<Eigen::Index>(ceilf(in_x)), original_width - 1);
-          const float x_lerp = in_x - left_x_index;
+          const float x_lerp = in_x - floorf(in_x);
           const float inverse_x_lerp = (1.0f - x_lerp);
           for (Eigen::Index c = 0; c < channels; ++c) {
             output_grad(b, top_y_index, left_x_index, c) +=
@@ -325,7 +349,21 @@ struct ResizeBilinearGrad<CPUDevice, T> {
       }
     }
   }
+  void operator()(const CPUDevice& d,
+                  typename TTypes<float, 4>::ConstTensor input_grad,
+                  const float height_scale, const float width_scale,
+                  const bool half_pixel_centers,
+                  typename TTypes<T, 4>::Tensor output_grad) {
+    if (half_pixel_centers) {
+      return ResizeGradCore(HalfPixelScaler(), input_grad, height_scale,
+                            width_scale, output_grad);
+    } else {
+      return ResizeGradCore(LegacyScaler(), input_grad, height_scale,
+                            width_scale, output_grad);
+    }
+  }
 };
+
 }  // namespace functor
 
 #define REGISTER_KERNEL(T)                            \
diff --git a/tensorflow/core/kernels/resize_bilinear_op.h b/tensorflow/core/kernels/resize_bilinear_op.h
index d16ed64f14842f77341e4430a70a31759689f4c0..b4d0066d4f3bcbca26dd3d3c6b20e53bd1ef8f3a 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.h
+++ b/tensorflow/core/kernels/resize_bilinear_op.h
@@ -27,6 +27,7 @@ template <typename Device, typename T>
 struct ResizeBilinear {
   void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor images,
                   const float height_scale, const float width_scale,
+                  const bool half_pixel_centers,
                   typename TTypes<float, 4>::Tensor resized_images);
 };
 
@@ -35,6 +36,7 @@ struct ResizeBilinearGrad {
   void operator()(const Device& d,
                   typename TTypes<float, 4>::ConstTensor input_grad,
                   const float height_scale, const float width_scale,
+                  const bool half_pixel_centers,
                   typename TTypes<T, 4>::Tensor output_grad);
 };
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
index f82c3fcd9ff45e26d2f44408890fa760c64477e4..4da2b877df2a9c2b68995176d2958cafa4296e7f 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
@@ -48,6 +48,121 @@ __global__ void ResizeBilinearKernel(const int32 nthreads, const T* images,
     const int y = idx % out_height;
     const int b = idx / out_height;
 
+    const float in_y = (static_cast<float>(y) + 0.5f) * height_scale - 0.5f;
+
+    const int top_y_index = in_y > 0.0 ? floorf(in_y) : 0;
+    const int bottom_y_index =
+        (in_y < in_height - 1) ? ceilf(in_y) : in_height - 1;
+    const float y_lerp = in_y - floorf(in_y);
+
+    const float in_x = (static_cast<float>(x) + 0.5f) * width_scale - 0.5f;
+    const int left_x_index = in_x > 0.0 ? floorf(in_x) : 0;
+    const int right_x_index =
+        (in_x < in_width - 1) ? ceilf(in_x) : in_width - 1;
+    const float x_lerp = in_x - left_x_index;
+
+    const float top_left(
+        images[((b * in_height + top_y_index) * in_width + left_x_index) *
+                   channels +
+               c]);
+    const float top_right(
+        images[((b * in_height + top_y_index) * in_width + right_x_index) *
+                   channels +
+               c]);
+    const float bottom_left(
+        images[((b * in_height + bottom_y_index) * in_width + left_x_index) *
+                   channels +
+               c]);
+    const float bottom_right(
+        images[((b * in_height + bottom_y_index) * in_width + right_x_index) *
+                   channels +
+               c]);
+
+    const float top = top_left + (top_right - top_left) * x_lerp;
+    const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+    output[out_idx] = top + (bottom - top) * y_lerp;
+  }
+}
+
+template <typename T>
+__global__ void ResizeBilinearGradKernel(
+    const int32 nthreads, const float* input_grad, float height_scale,
+    float width_scale, int batch, int original_height, int original_width,
+    int channels, int resized_height, int resized_width, T* output_grad) {
+  CUDA_1D_KERNEL_LOOP(in_idx, nthreads) {
+    // in_idx = c + channels * (x + resized_width * (y + resized_height * b))
+    int idx = in_idx;
+    const int c = idx % channels;
+    idx /= channels;
+    const int x = idx % resized_width;
+    idx /= resized_width;
+    const int y = idx % resized_height;
+    const int b = idx / resized_height;
+
+    const float original_y =
+        (static_cast<float>(y) + 0.5f) * height_scale - 0.5f;
+    const int top_y_index = original_y > 0.0 ? floorf(original_y) : 0;
+    const int bottom_y_index = (original_y < original_height - 1)
+                                   ? ceilf(original_y)
+                                   : original_height - 1;
+    const float y_lerp = original_y - floorf(original_y);
+
+    const float original_x =
+        (static_cast<float>(x) + 0.5f) * width_scale - 0.5f;
+
+    const int left_x_index = original_x > 0.0 ? floorf(original_x) : 0;
+    const int right_x_index = (original_x < original_width - 1)
+                                  ? ceilf(original_x)
+                                  : original_width - 1;
+    const float x_lerp = original_x - floorf(original_x);
+
+    const float dtop = (1 - y_lerp) * input_grad[in_idx];
+    CudaAtomicAdd(output_grad +
+                      ((b * original_height + top_y_index) * original_width +
+                       left_x_index) *
+                          channels +
+                      c,
+                  static_cast<T>((1 - x_lerp) * dtop));
+    CudaAtomicAdd(output_grad +
+                      ((b * original_height + top_y_index) * original_width +
+                       right_x_index) *
+                          channels +
+                      c,
+                  static_cast<T>(x_lerp * dtop));
+
+    const float dbottom = y_lerp * input_grad[in_idx];
+    CudaAtomicAdd(output_grad +
+                      ((b * original_height + bottom_y_index) * original_width +
+                       left_x_index) *
+                          channels +
+                      c,
+                  static_cast<T>((1 - x_lerp) * dbottom));
+    CudaAtomicAdd(output_grad +
+                      ((b * original_height + bottom_y_index) * original_width +
+                       right_x_index) *
+                          channels +
+                      c,
+                  static_cast<T>(x_lerp * dbottom));
+  }
+}
+
+template <typename T>
+__global__ void LegacyResizeBilinearKernel(const int32 nthreads,
+                                           const T* images, float height_scale,
+                                           float width_scale, int batch,
+                                           int in_height, int in_width,
+                                           int channels, int out_height,
+                                           int out_width, float* output) {
+  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
+    // out_idx = c + channels * (x + out_width * (y + out_height * b))
+    int idx = out_idx;
+    const int c = idx % channels;
+    idx /= channels;
+    const int x = idx % out_width;
+    idx /= out_width;
+    const int y = idx % out_height;
+    const int b = idx / out_height;
+
     const float in_y = y * height_scale;
     const int top_y_index = floorf(in_y);
     const int bottom_y_index =
@@ -84,7 +199,7 @@ __global__ void ResizeBilinearKernel(const int32 nthreads, const T* images,
 }
 
 template <typename T>
-__global__ void ResizeBilinearGradKernel(
+__global__ void LegacyResizeBilinearGradKernel(
     const int32 nthreads, const float* input_grad, float height_scale,
     float width_scale, int batch, int original_height, int original_width,
     int channels, int resized_height, int resized_width, T* output_grad) {
@@ -151,6 +266,7 @@ template <typename T>
 struct ResizeBilinear<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor images,
                   const float height_scale, const float width_scale,
+                  const bool half_pixel_centers,
                   typename TTypes<float, 4>::Tensor output) {
     const int batch = images.dimension(0);
     const int in_height = images.dimension(1);
@@ -164,11 +280,19 @@ struct ResizeBilinear<GPUDevice, T> {
     if (total_count == 0) return;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, images.data(), height_scale,
-            width_scale, batch, in_height, in_width, channels, out_height,
-            out_width, output.data());
+    if (half_pixel_centers) {
+      TF_CHECK_OK(CudaLaunchKernel(
+          ResizeBilinearKernel<T>, config.block_count, config.thread_per_block,
+          0, d.stream(), config.virtual_thread_count, images.data(),
+          height_scale, width_scale, batch, in_height, in_width, channels,
+          out_height, out_width, output.data()));
+    } else {
+      TF_CHECK_OK(CudaLaunchKernel(
+          LegacyResizeBilinearKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          images.data(), height_scale, width_scale, batch, in_height, in_width,
+          channels, out_height, out_width, output.data()));
+    }
   }
 };
 
@@ -178,6 +302,7 @@ struct ResizeBilinearGrad<GPUDevice, T> {
   void operator()(const GPUDevice& d,
                   typename TTypes<float, 4>::ConstTensor input_grad,
                   const float height_scale, const float width_scale,
+                  const bool half_pixel_centers,
                   typename TTypes<T, 4>::Tensor output_grad) {
     const int batch = output_grad.dimension(0);
     const int original_height = output_grad.dimension(1);
@@ -194,17 +319,28 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     total_count = batch * original_height * original_width * channels;
     if (total_count == 0) return;
     config = GetCudaLaunchConfig(total_count, d);
-    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, output_grad.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(),
+        config.virtual_thread_count, output_grad.data()));
 
     // Accumulate.
     total_count = batch * resized_height * resized_width * channels;
     config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearGradKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, input_grad.data(), height_scale,
-            width_scale, batch, original_height, original_width, channels,
-            resized_height, resized_width, output_grad.data());
+    if (half_pixel_centers) {
+      TF_CHECK_OK(CudaLaunchKernel(
+          ResizeBilinearGradKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          input_grad.data(), height_scale, width_scale, batch, original_height,
+          original_width, channels, resized_height, resized_width,
+          output_grad.data()));
+    } else {
+      TF_CHECK_OK(CudaLaunchKernel(
+          LegacyResizeBilinearGradKernel<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
+          input_grad.data(), height_scale, width_scale, batch, original_height,
+          original_width, channels, resized_height, resized_width,
+          output_grad.data()));
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index f2062915b8470e8cc6f6e0897ae579639d6fee4c..dee1020d77a31896195dfce74d461f8c02be72bf 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -30,13 +30,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-class ResizeBilinearOpTest : public OpsTestBase {
+class ResizeBilinearOpTestBase : public OpsTestBase {
  protected:
-  ResizeBilinearOpTest() {
+  explicit ResizeBilinearOpTestBase(bool half_pixel_centers)
+      : half_pixel_centers_(half_pixel_centers) {
     TF_EXPECT_OK(NodeDefBuilder("resize_bilinear_op", "ResizeBilinear")
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_INT32))
                      .Attr("align_corners", false)
+                     .Attr("half_pixel_centers", half_pixel_centers_)
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
@@ -81,17 +83,25 @@ class ResizeBilinearOpTest : public OpsTestBase {
 
     for (int b = 0; b < batch; ++b) {
       for (int64 y = 0; y < out_height; ++y) {
-        const float in_y = y * height_scale;
-        const int64 top_y_index = static_cast<int64>(floorf(in_y));
+        const float in_y =
+            half_pixel_centers_
+                ? (static_cast<float>(y) + 0.5f) * height_scale - 0.5f
+                : y * height_scale;
+        const int64 top_y_index =
+            std::max(static_cast<int64>(floorf(in_y)), static_cast<int64>(0));
         const int64 bottom_y_index =
             std::min(static_cast<int64>(ceilf(in_y)), in_height - 1);
-        const float y_lerp = in_y - top_y_index;
+        const float y_lerp = in_y - std::floor(in_y);
         for (int64 x = 0; x < out_width; ++x) {
-          const float in_x = x * width_scale;
-          const int64 left_x_index = static_cast<int64>(floorf(in_x));
+          const float in_x =
+              half_pixel_centers_
+                  ? (static_cast<float>(x) + 0.5f) * width_scale - 0.5f
+                  : x * width_scale;
+          const int64 left_x_index =
+              std::max(static_cast<int64>(floorf(in_x)), static_cast<int64>(0));
           const int64 right_x_index =
               std::min(static_cast<int64>(ceilf(in_x)), in_width - 1);
-          const float x_lerp = in_x - left_x_index;
+          const float x_lerp = in_x - std::floor(in_x);
           for (int c = 0; c < channels; ++c) {
             const float top_left = images(b, top_y_index, left_x_index, c);
             const float top_right = images(b, top_y_index, right_x_index, c);
@@ -139,6 +149,17 @@ class ResizeBilinearOpTest : public OpsTestBase {
       }
     }
   }
+  bool half_pixel_centers_;
+};
+
+class ResizeBilinearOpTest : public ResizeBilinearOpTestBase {
+ public:
+  ResizeBilinearOpTest() : ResizeBilinearOpTestBase(false) {}
+};
+
+class ResizeBilinearHalfPixelCentersOpTest : public ResizeBilinearOpTestBase {
+ public:
+  ResizeBilinearHalfPixelCentersOpTest() : ResizeBilinearOpTestBase(true) {}
 };
 
 class ResizeBilinearOpAlignCornersTest : public OpsTestBase {
@@ -343,6 +364,14 @@ TEST_F(ResizeBilinearOpTest, TestBilinear4x4To3x3) {
   test::ExpectClose(expected, *GetOutput(0));
 }
 
+TEST_F(ResizeBilinearHalfPixelCentersOpTest, TestDownsamples) {
+  TestResize(4, 298, 297, 3, 61, 71);
+}
+
+TEST_F(ResizeBilinearHalfPixelCentersOpTest, TestUpsamples) {
+  TestResize(4, 61, 71, 3, 298, 297);
+}
+
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners4x4To3x3) {
   // Input:
   //  1,  2,  3,  4
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index e985d3e5a51ff2a4badec27b4137ec21272467c4..cf2d0e22742659da5ca5a9aaba902fe19c6dbe2a 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -40,11 +40,13 @@ class ResizeNearestNeighborOp : public OpKernel {
   explicit ResizeNearestNeighborOp(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
+    ImageResizerState st(align_corners_, half_pixel_centers_);
     st.ValidateAndCreateOutput(context, input);
 
     if (!context->status().ok()) return;
@@ -60,16 +62,34 @@ class ResizeNearestNeighborOp : public OpKernel {
     typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
 
     bool status;
-    if (align_corners_) {
-      status =
-          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/true>()(
-              context->eigen_device<Device>(), input_data, st.height_scale,
-              st.width_scale, output_data);
+    if (half_pixel_centers_) {
+      if (align_corners_) {
+        status = functor::ResizeNearestNeighbor<Device, T,
+                                                /*half_pixe_centers=*/true,
+                                                /*align_corners=*/true>()(
+            context->eigen_device<Device>(), input_data, st.height_scale,
+            st.width_scale, output_data);
+      } else {
+        status = functor::ResizeNearestNeighbor<Device, T,
+                                                /*half_pixe_centers=*/true,
+                                                /*align_corners=*/false>()(
+            context->eigen_device<Device>(), input_data, st.height_scale,
+            st.width_scale, output_data);
+      }
     } else {
-      status =
-          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/false>()(
-              context->eigen_device<Device>(), input_data, st.height_scale,
-              st.width_scale, output_data);
+      if (align_corners_) {
+        status = functor::ResizeNearestNeighbor<Device, T,
+                                                /*half_pixe_centers=*/false,
+                                                /*align_corners=*/true>()(
+            context->eigen_device<Device>(), input_data, st.height_scale,
+            st.width_scale, output_data);
+      } else {
+        status = functor::ResizeNearestNeighbor<Device, T,
+                                                /*half_pixe_centers=*/false,
+                                                /*align_corners=*/false>()(
+            context->eigen_device<Device>(), input_data, st.height_scale,
+            st.width_scale, output_data);
+      }
     }
     if (!status) {
       context->SetStatus(
@@ -79,15 +99,41 @@ class ResizeNearestNeighborOp : public OpKernel {
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
+};
+
+// Helper struct to convert a bool to the correct scaler type.
+template <bool half_pixel_centers>
+struct BoolToScaler {};
+
+struct HalfPixelScalerForNN {
+  inline float operator()(const int x, const float scale) const {
+    // All of the nearest neigbor code below immediately follows a call to this
+    // function with a std::floor(), so instead of subtracting the 0.5 as we
+    // do in HalfPixelScale, we leave it as is, as the std::floor does the
+    // correct thing.
+    return (static_cast<float>(x) + 0.5f) * scale;
+  }
+};
+
+template <>
+struct BoolToScaler<true> {
+  typedef HalfPixelScalerForNN Scaler;
+};
+
+template <>
+struct BoolToScaler<false> {
+  typedef LegacyScaler Scaler;
 };
 
 // Partial specialization of ResizeNearestNeighbor functor for a CPUDevice.
 namespace functor {
-template <typename T, bool align_corners>
-struct ResizeNearestNeighbor<CPUDevice, T, align_corners> {
+template <typename T, bool half_pixel_centers, bool align_corners>
+struct ResizeNearestNeighbor<CPUDevice, T, half_pixel_centers, align_corners> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
+    typename BoolToScaler<half_pixel_centers>::Scaler scaler;
     const Eigen::Index batch_size = input.dimension(0);
     const Eigen::Index in_height = input.dimension(1);
     const Eigen::Index in_width = input.dimension(2);
@@ -98,17 +144,23 @@ struct ResizeNearestNeighbor<CPUDevice, T, align_corners> {
 
     for (Eigen::Index b = 0; b < batch_size; ++b) {
       for (Eigen::Index y = 0; y < out_height; ++y) {
-        const Eigen::Index in_y =
-            std::min((align_corners)
-                         ? static_cast<Eigen::Index>(roundf(y * height_scale))
-                         : static_cast<Eigen::Index>(floorf(y * height_scale)),
-                     in_height - 1);
+        Eigen::Index in_y = std::min(
+            (align_corners)
+                ? static_cast<Eigen::Index>(roundf(scaler(y, height_scale)))
+                : static_cast<Eigen::Index>(floorf(scaler(y, height_scale))),
+            in_height - 1);
+        if (half_pixel_centers) {
+          in_y = std::max(static_cast<Eigen::Index>(0), in_y);
+        }
         for (Eigen::Index x = 0; x < out_width; ++x) {
-          const Eigen::Index in_x =
-              std::min((align_corners)
-                           ? static_cast<Eigen::Index>(roundf(x * width_scale))
-                           : static_cast<Eigen::Index>(floorf(x * width_scale)),
-                       in_width - 1);
+          Eigen::Index in_x = std::min(
+              (align_corners)
+                  ? static_cast<Eigen::Index>(roundf(scaler(x, width_scale)))
+                  : static_cast<Eigen::Index>(floorf(scaler(x, width_scale))),
+              in_width - 1);
+          if (half_pixel_centers) {
+            in_x = std::max(static_cast<Eigen::Index>(0), in_x);
+          }
           std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0));
         }
       }
@@ -124,6 +176,8 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
   explicit ResizeNearestNeighborOpGrad(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("half_pixel_centers", &half_pixel_centers_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -173,16 +227,36 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
         CalculateResizeScale(out_width, in_width, align_corners_);
 
     bool status;
-    if (align_corners_) {
-      status = functor::ResizeNearestNeighborGrad<Device, T,
-                                                  /*align_corners=*/true>()(
-          context->eigen_device<Device>(), input_data, height_scale,
-          width_scale, output_data);
+    if (half_pixel_centers_) {
+      if (align_corners_) {
+        status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                    /*half_pixel_centers=*/true,
+                                                    /*align_corners=*/true>()(
+            context->eigen_device<Device>(), input_data, height_scale,
+            width_scale, output_data);
+      } else {
+        status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                    /*half_pixel_centers=*/true,
+                                                    /*align_corners=*/false>()(
+            context->eigen_device<Device>(), input_data, height_scale,
+            width_scale, output_data);
+      }
     } else {
-      status = functor::ResizeNearestNeighborGrad<Device, T,
-                                                  /*align_corners=*/false>()(
-          context->eigen_device<Device>(), input_data, height_scale,
-          width_scale, output_data);
+      if (align_corners_) {
+        status =
+            functor::ResizeNearestNeighborGrad<Device, T,
+                                               /*half_pixel_centers=*/false,
+                                               /*align_corners=*/true>()(
+                context->eigen_device<Device>(), input_data, height_scale,
+                width_scale, output_data);
+      } else {
+        status =
+            functor::ResizeNearestNeighborGrad<Device, T,
+                                               /*half_pixel_centers=*/false,
+                                               /*align_corners=*/false>()(
+                context->eigen_device<Device>(), input_data, height_scale,
+                width_scale, output_data);
+      }
     }
     if (!status) {
       context->SetStatus(
@@ -192,15 +266,18 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
 
  private:
   bool align_corners_;
+  bool half_pixel_centers_;
 };
 
 // Partial specialization of ResizeNearestNeighborGrad functor for a CPUDevice.
 namespace functor {
-template <typename T, bool align_corners>
-struct ResizeNearestNeighborGrad<CPUDevice, T, align_corners> {
+template <typename T, bool half_pixel_centers, bool align_corners>
+struct ResizeNearestNeighborGrad<CPUDevice, T, half_pixel_centers,
+                                 align_corners> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
+    typename BoolToScaler<half_pixel_centers>::Scaler scaler;
     const Eigen::Index batch_size = input.dimension(0);
     const Eigen::Index in_height = input.dimension(1);
     const Eigen::Index in_width = input.dimension(2);
@@ -213,15 +290,16 @@ struct ResizeNearestNeighborGrad<CPUDevice, T, align_corners> {
 
     for (Eigen::Index y = 0; y < in_height; ++y) {
       const Eigen::Index out_y = std::min(
-          (align_corners) ? static_cast<Eigen::Index>(roundf(y * height_scale))
-                          : static_cast<Eigen::Index>(floorf(y * height_scale)),
+          (align_corners)
+              ? static_cast<Eigen::Index>(roundf(scaler(y, height_scale)))
+              : static_cast<Eigen::Index>(floorf(scaler(y, height_scale))),
           out_height - 1);
       for (Eigen::Index x = 0; x < in_width; ++x) {
-        const Eigen::Index out_x =
-            std::min((align_corners)
-                         ? static_cast<Eigen::Index>(roundf(x * width_scale))
-                         : static_cast<Eigen::Index>(floorf(x * width_scale)),
-                     out_width - 1);
+        const Eigen::Index out_x = std::min(
+            (align_corners)
+                ? static_cast<Eigen::Index>(roundf(scaler(x, width_scale)))
+                : static_cast<Eigen::Index>(floorf(scaler(x, width_scale))),
+            out_width - 1);
         for (Eigen::Index b = 0; b < batch_size; ++b) {
           for (Eigen::Index c = 0; c < channels; ++c) {
             output(b, out_y, out_x, c) += input(b, y, x, c);
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.h b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
index 9db331ffdcd6c1a1b11c3ab6271d0a949dec6630..d6b053180cea44f3b4df1e5ba1c5d8b2e06fc947 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.h
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
@@ -22,14 +22,16 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-template <typename Device, typename T, bool align_corners>
+template <typename Device, typename T, bool half_pixel_centers,
+          bool align_corners>
 struct ResizeNearestNeighbor {
   bool operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output);
 };
 
-template <typename Device, typename T, bool align_corners>
+template <typename Device, typename T, bool half_pixel_centers,
+          bool align_corners>
 struct ResizeNearestNeighborGrad {
   bool operator()(const Device& d,
                   typename TTypes<T, 4>::ConstTensor input_grad,
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
index d65c8fb949abe7227cbae9de36baeca4571b4ff4..d2494ea36b044b134ccd471be51ca81b2806ac2c 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
@@ -32,7 +32,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace {
 
-template <typename T, bool align_corners>
+template <typename T>
 __global__ void ResizeNearestNeighborNHWC(
     const int nthreads, const T* bottom_data, const int in_height,
     const int in_width, const int channels, const int out_height,
@@ -47,6 +47,37 @@ __global__ void ResizeNearestNeighborNHWC(
     int out_y = n % out_height;
     n /= out_height;
 
+    const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
+    const int in_y =
+        max(min(static_cast<int>(
+                    floorf((static_cast<float>(out_y) + 0.5f) * height_scale)),
+                in_height - 1),
+            0);
+    const int in_x =
+        max(min(static_cast<int>(
+                    floorf((static_cast<float>(out_x) + 0.5f) * width_scale)),
+                in_width - 1),
+            0);
+    const int idx = (in_y * in_width + in_x) * channels + c;
+    top_data[index] = ldg(bottom_data_n + idx);
+  }
+}
+
+template <typename T, bool align_corners>
+__global__ void LegacyResizeNearestNeighborNHWC(
+    const int nthreads, const T* bottom_data, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index;
+    int c = n % channels;
+    n /= channels;
+    int out_x = n % out_width;
+    n /= out_width;
+    int out_y = n % out_height;
+    n /= out_height;
+
     const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
     const int in_y =
         min((align_corners) ? static_cast<int>(roundf(out_y * height_scale))
@@ -61,7 +92,7 @@ __global__ void ResizeNearestNeighborNHWC(
   }
 }
 
-template <typename T, bool align_corners>
+template <typename T>
 __global__ void ResizeNearestNeighborBackwardNHWC(
     const int nthreads, const T* top_diff, const int in_height,
     const int in_width, const int channels, const int out_height,
@@ -76,6 +107,37 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
     int in_y = n % in_height;
     n /= in_height;
 
+    T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
+    const int out_y =
+        max(min(static_cast<int>(
+                    floorf((static_cast<float>(in_y) + 0.5f) * height_scale)),
+                out_height - 1),
+            0);
+    const int out_x =
+        max(min(static_cast<int>(
+                    floorf((static_cast<float>(in_x) + 0.5f) * width_scale)),
+                out_width - 1),
+            0);
+    const int idx = (out_y * out_width + out_x) * channels + c;
+    CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));
+  }
+}
+
+template <typename T, bool align_corners>
+__global__ void LegacyResizeNearestNeighborBackwardNHWC(
+    const int nthreads, const T* top_diff, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* bottom_diff) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index;
+    int c = n % channels;
+    n /= channels;
+    int in_x = n % in_width;
+    n /= in_width;
+    int in_y = n % in_height;
+    n /= in_height;
+
     T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
     const int out_y =
         min((align_corners) ? static_cast<int>(roundf(in_y * height_scale))
@@ -95,8 +157,8 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
 namespace functor {
 
 // Partial specialization of ResizeNearestNeighbor functor for a GPUDevice.
-template <typename T, bool align_corners>
-struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
+template <typename T, bool half_pixel_centers, bool align_corners>
+struct ResizeNearestNeighbor<GPUDevice, T, half_pixel_centers, align_corners> {
   bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
@@ -112,25 +174,38 @@ struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
     if (output_size == 0) return true;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-    ResizeNearestNeighborNHWC<T, align_corners>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            output_size, input.data(), in_height, in_width, channels,
-            out_height, out_width, height_scale, width_scale, output.data());
-    return d.ok();
+    if (half_pixel_centers) {
+      TF_CHECK_OK(CudaLaunchKernel(
+          ResizeNearestNeighborNHWC<T>, config.block_count,
+          config.thread_per_block, 0, d.stream(), output_size, input.data(),
+          in_height, in_width, channels, out_height, out_width, height_scale,
+          width_scale, output.data()));
+      return d.ok();
+    } else {
+      TF_CHECK_OK(CudaLaunchKernel(
+          LegacyResizeNearestNeighborNHWC<T, align_corners>, config.block_count,
+          config.thread_per_block, 0, d.stream(), output_size, input.data(),
+          in_height, in_width, channels, out_height, out_width, height_scale,
+          width_scale, output.data()));
+      return d.ok();
+    }
   }
 };
 
-#define DECLARE_GPU_SPEC(T)                                   \
-  template struct ResizeNearestNeighbor<GPUDevice, T, false>; \
-  template struct ResizeNearestNeighbor<GPUDevice, T, true>;
+#define DECLARE_GPU_SPEC(T)                                          \
+  template struct ResizeNearestNeighbor<GPUDevice, T, false, false>; \
+  template struct ResizeNearestNeighbor<GPUDevice, T, false, true>;  \
+  template struct ResizeNearestNeighbor<GPUDevice, T, true, false>;  \
+  template struct ResizeNearestNeighbor<GPUDevice, T, true, true>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
 // Partial specialization of ResizeNearestNeighborGrad functor for a GPUDevice.
-template <typename T, bool align_corners>
-struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
+template <typename T, bool half_pixel_centers, bool align_corners>
+struct ResizeNearestNeighborGrad<GPUDevice, T, half_pixel_centers,
+                                 align_corners> {
   bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
@@ -153,18 +228,31 @@ struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
     if (input_size == 0) return true;
 
     CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
-    ResizeNearestNeighborBackwardNHWC<T, align_corners>
-        <<<input_config.block_count, input_config.thread_per_block, 0,
-           d.stream()>>>(input_config.virtual_thread_count, input.data(),
-                         in_height, in_width, channels, out_height, out_width,
-                         height_scale, width_scale, output.data());
-    return d.ok();
+    if (half_pixel_centers) {
+      TF_CHECK_OK(CudaLaunchKernel(
+          ResizeNearestNeighborBackwardNHWC<T>, input_config.block_count,
+          input_config.thread_per_block, 0, d.stream(),
+          input_config.virtual_thread_count, input.data(), in_height, in_width,
+          channels, out_height, out_width, height_scale, width_scale,
+          output.data()));
+      return d.ok();
+    } else {
+      TF_CHECK_OK(CudaLaunchKernel(
+          LegacyResizeNearestNeighborBackwardNHWC<T, align_corners>,
+          input_config.block_count, input_config.thread_per_block, 0,
+          d.stream(), input_config.virtual_thread_count, input.data(),
+          in_height, in_width, channels, out_height, out_width, height_scale,
+          width_scale, output.data()));
+      return d.ok();
+    }
   }
 };
 
-#define DECLARE_GPU_SPEC(T)                                       \
-  template struct ResizeNearestNeighborGrad<GPUDevice, T, false>; \
-  template struct ResizeNearestNeighborGrad<GPUDevice, T, true>;
+#define DECLARE_GPU_SPEC(T)                                              \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, false, false>; \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, false, true>;  \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, true, false>;  \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, true, true>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
index bd8a064228a6ee94ca8845ad5a5110f54791a391..734ef5da69b6a67cff9be2d25591180f881723da 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
@@ -31,18 +31,33 @@ limitations under the License.
 
 namespace tensorflow {
 
-class ResizeNearestNeighborOpTest : public OpsTestBase {
+class ResizeNearestNeighborOpTestBase : public OpsTestBase {
  protected:
-  ResizeNearestNeighborOpTest() {
+  explicit ResizeNearestNeighborOpTestBase(bool half_pixel_centers) {
     TF_EXPECT_OK(NodeDefBuilder("resize_nn", "ResizeNearestNeighbor")
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_INT32))
                      .Attr("align_corners", false)
+                     .Attr("half_pixel_centers", half_pixel_centers)
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
   }
 };
 
+class ResizeNearestNeighborOpTest : public ResizeNearestNeighborOpTestBase {
+ protected:
+  ResizeNearestNeighborOpTest() : ResizeNearestNeighborOpTestBase(false) {}
+};
+
+class ResizeNearestNeighborHalfPixelCentersOpTest
+    : public ResizeNearestNeighborOpTestBase {
+ protected:
+  ResizeNearestNeighborHalfPixelCentersOpTest()
+      : ResizeNearestNeighborOpTestBase(true) {}
+};
+
+// TODO(jflynn): Add some actual tests for the half pixel centers case.
+
 class ResizeNearestNeighborOpAlignCornersTest : public OpsTestBase {
  protected:
   ResizeNearestNeighborOpAlignCornersTest() {
@@ -317,4 +332,193 @@ TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2x2x2To2x3x3x2) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest5x2To2x2) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 5, 1}),
+                           {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+  AddInputFromArray<int32>(TensorShape({2}), {2, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected, {2, 4, 2, 4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest2x2To1x1) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {1, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected, {4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest2x2To3x3) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 2, 2,
+     3, 4, 4,
+     3, 4, 4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest3x3To2x2) {
+  // Input:
+  //  1, 2, 3
+  //  4, 5, 6
+  //  7, 8, 9
+  AddInputFromArray<float>(TensorShape({1, 3, 3, 1}),
+                           {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  AddInputFromArray<int32>(TensorShape({2}), {2, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 3,
+     7, 9});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest2x2To2x5) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {2, 5});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 5, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 1, 2, 2, 2,
+     3, 3, 4, 4, 4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest,
+       TestNearestNeighbor4x4To3x3) {
+  // Input:
+  //  1,  2,  3,  4
+  //  5,  6,  7,  8
+  //  9, 10, 11, 12
+  // 13, 14, 15, 16
+  AddInputFromArray<float>(
+      TensorShape({1, 4, 4, 1}),
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1,  3,  4,
+     9, 11, 12,
+     13, 15, 16});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest2x2To5x2) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {5, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 5, 2, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 2,
+     1, 2,
+     3, 4,
+     3, 4,
+     3, 4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest, TestNearest2x2To4x4) {
+  // Input:
+  //  1, 2
+  //  3, 4
+  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({2}), {4, 4});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 1, 2, 2,
+     1, 1, 2, 2,
+     3, 3, 4, 4,
+     3, 3, 4, 4});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(ResizeNearestNeighborHalfPixelCentersOpTest,
+       TestNearest2x2x2x2To2x3x3x2) {
+  // Input:
+  //  [ [ 1, 1 ], [ 2, 2],
+  //    [ 3, 3 ], [ 4, 4] ],
+  //  [ [ 5, 5 ], [ 6, 6],
+  //    [ 7, 7 ], [ 8, 8] ]
+  AddInputFromArray<float>(TensorShape({2, 2, 2, 2}),
+                           {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8});
+  AddInputFromArray<int32>(TensorShape({2}), {3, 3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 2}));
+
+  // clang-format off
+  test::FillValues<float>(&expected,
+    {1, 1, 2, 2, 2, 2,
+     3, 3, 4, 4, 4, 4,
+     3, 3, 4, 4, 4, 4,
+     5, 5, 6, 6, 6, 6,
+     7, 7, 8, 8, 8, 8,
+     7, 7, 8, 8, 8, 8});
+
+  // clang-format on
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 4167b6005194409d780b3698fda688728a50b3cc..4291b27df5bc5445024f54a3e402f3a0aa242ec5 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -56,12 +56,12 @@ limitations under the License.
 
 #include "absl/strings/str_join.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/kernels/resource_variable_ops.h"
@@ -86,6 +86,7 @@ ReadVariableOp::ReadVariableOp(OpKernelConstruction* c) : OpKernel(c) {
 }
 
 namespace {
+
 Status CopyVariable(int output_idx, OpKernelContext* ctx, const Tensor* t) {
   Tensor* output;
   Notification n;
@@ -583,8 +584,34 @@ REGISTER_KERNEL_BUILDER(Name("VarIsInitializedOp")
 
 template <typename Device, typename T, typename Index>
 class ResourceGatherOp : public OpKernel {
+ private:
+  int32 batch_dims_ = 0;
+
+  // Add the batch offset derrived from params to each batch of indices.
+  // Example: batch_dims = 1, indices = [[0, 1, 2], [0, 1, 2]]
+  // If indexing into a params dimension of size 4, then the indices will become
+  // [0, 1, 2, 4, 5, 6]
+  void AddBatchOffsets(Tensor* indices, const Tensor& params) {
+    int64 batch_size = 1;  // The size of all batch dimensions.
+    for (int idx = 0; idx < batch_dims_; ++idx) {
+      batch_size *= params.dim_size(idx);
+    }
+
+    auto indices_flat = indices->flat<Index>();
+    int64 const index_inner_size = indices->NumElements() / batch_size;
+    int64 const batch_offset = params.dim_size(batch_dims_);
+    for (int64 batch_idx = 0, dest_idx = 0; batch_idx < batch_size;
+         ++batch_idx) {
+      for (int64 idx = 0; idx < index_inner_size; ++idx) {
+        indices_flat(dest_idx++) += batch_offset * batch_idx;
+      }
+    }
+  }
+
  public:
-  explicit ResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) {}
+  explicit ResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("batch_dims", &batch_dims_));
+  }
 
   void Compute(OpKernelContext* c) override {
     Var* v = nullptr;
@@ -612,9 +639,16 @@ class ResourceGatherOp : public OpKernel {
                                 " indexing: ", params.dim_size(0), " > ",
                                 std::numeric_limits<Index>::max()));
 
-    // The result shape is indices.shape + params.shape[1:].
-    TensorShape result_shape = indices.shape();
-    for (int i = 1; i < params.dims(); i++) {
+    // The result shape is params.shape[:batch_dims] +
+    // indices.shape[batch_dims:] + params.shape[batch_dims+1:].
+    TensorShape result_shape;
+    for (int i = 0; i < batch_dims_; ++i) {
+      result_shape.AddDim(params.dim_size(i));
+    }
+    for (int i = batch_dims_; i < indices.dims(); ++i) {
+      result_shape.AddDim(indices.dim_size(i));
+    }
+    for (int i = batch_dims_ + 1; i < params.dims(); ++i) {
       result_shape.AddDim(params.dim_size(i));
     }
 
@@ -627,14 +661,33 @@ class ResourceGatherOp : public OpKernel {
     } else {
       OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
     }
+
     if (N > 0) {
-      const int64 gather_dim_size = params.dim_size(0);
+      Tensor tmp_indices;
+
+      // Points to the original or updated (if batch_dims is set) indices.
+      const Tensor* op_indices = &indices;
+      if (batch_dims_ > 0) {
+        OP_REQUIRES_OK(c, c->allocate_temp(indices.dtype(), indices.shape(),
+                                           &tmp_indices));
+        functor::DenseUpdate<Device, Index, ASSIGN> copy_functor;
+        copy_functor(c->eigen_device<Device>(), tmp_indices.flat<Index>(),
+                     indices.flat<Index>());
+
+        AddBatchOffsets(&tmp_indices, params);
+        op_indices = &tmp_indices;
+      }
+
+      int64 gather_dim_size = 1;
+      for (int idx = 0; idx <= batch_dims_; ++idx) {
+        gather_dim_size *= params.dim_size(idx);
+      }
       int64 inner_size = 1;
-      for (int i = 1; i < params.dims(); i++) {
+      for (int i = batch_dims_ + 1; i < params.dims(); ++i) {
         inner_size *= params.dim_size(i);
       }
       auto params_flat = params.shaped<T, 3>({1, gather_dim_size, inner_size});
-      auto indices_flat = indices.flat<Index>();
+      const auto indices_flat = op_indices->flat<Index>();
       auto out_flat = out->shaped<T, 3>({1, N, out->NumElements() / N});
 
       functor::GatherFunctor<Device, T, Index> functor;
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 1c4d0bc1ae9934dbfb8718dfa05202b1d7b38edc..aa2434da03f5fd76ad409121382e6ce93a2e65df 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/reverse_op.h"
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index efa30438d922fa070747bb4269451cc54f574887..494a846ff562e505a569de19418d371ea8b4f80c 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
index c0fde8042e816c325475a36129fb71630f0ca7c6..0e68af867bdf753ec70ff9ff2c978d0b95ea5c52 100644
--- a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
+++ b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
@@ -14,11 +14,11 @@ limitations under the License.
 ==============================================================================*/
 // See docs in ../ops/image_ops.cc.
 #include <math.h>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
 
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 82546d581a9ea55d7fe0a478c4de0c9afe2ff8ed..8580891fc066828abb1c2cef6d66f71c48090f05 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 180eb3ca34b4c1fe96bf7088319455185bd06a2c..ed1195c05353389e9c4c465d402d46220a01fad4 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/core/kernels/save_v2_op_test.cc b/tensorflow/core/kernels/save_v2_op_test.cc
index 82e566d35fefa98f96b00a285af618ff98f3da69..589d9639fb4d17e6f3423b92c7d692a7abc25364 100644
--- a/tensorflow/core/kernels/save_v2_op_test.cc
+++ b/tensorflow/core/kernels/save_v2_op_test.cc
@@ -67,9 +67,8 @@ TEST_F(SaveV2OpTest, Simple) {
                    [&tensornames](int x) -> string { return tensornames[x]; });
 
   // Add the slice specs
-  AddInput<string>(TensorShape({13}), [&tensornames](int x) -> string {
-    return "" /* saves in full */;
-  });
+  AddInput<string>(TensorShape({13}),
+                   [](int x) -> string { return "" /* saves in full */; });
 
   // Add a 1-d bool tensor
   AddInput<bool>(TensorShape({2}), [](int x) -> bool { return x != 0; });
diff --git a/tensorflow/core/kernels/scale_and_translate_op.cc b/tensorflow/core/kernels/scale_and_translate_op.cc
index 149c5526ae8952a5dab69dd11c0386d0bb38835f..fff457e55c71fb6df70a859bcd19725bae7ae423 100644
--- a/tensorflow/core/kernels/scale_and_translate_op.cc
+++ b/tensorflow/core/kernels/scale_and_translate_op.cc
@@ -20,13 +20,13 @@ limitations under the License.
 
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/sampling_kernels.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -50,7 +50,7 @@ template <typename Kernel>
 Status ComputeSpansCore(OpKernelContext* context, const Kernel& kernel,
                         const int64 output_size, const int64 input_size,
                         const float scale, const float translate,
-                        Spans* spans) {
+                        const bool antialias, Spans* spans) {
   // When sampling, we need the inverse scale and translation, to map from an
   // output to an input pixel.
   const float inv_scale = 1.0 / scale;
@@ -58,7 +58,7 @@ Status ComputeSpansCore(OpKernelContext* context, const Kernel& kernel,
   // When downsampling the kernel should be scaled since we want to low pass
   // filter and interpolate, but when upsampling it should not be since we only
   // want to interpolate.
-  const float kernel_scale = std::max(inv_scale, 1.0f);
+  const float kernel_scale = antialias ? std::max(inv_scale, 1.0f) : 1.0f;
   spans->span_size = std::min(
       2 * static_cast<int>(std::ceil(kernel.Radius() * kernel_scale)) + 1,
       static_cast<int>(input_size));
@@ -82,10 +82,8 @@ Status ComputeSpansCore(OpKernelContext* context, const Kernel& kernel,
     const float col_f = x + 0.5f;
     const float sample_f = col_f * inv_scale + inv_translate;
 
-    // Don't sample when the sampling *kernel* is completely outside the
-    // source image.
-    if (sample_f < 0 - kernel.Radius() * kernel_scale ||
-        sample_f > input_size + kernel.Radius() * kernel_scale) {
+    // Don't sample when the sampling location is outside the source image.
+    if (sample_f < 0 || sample_f > input_size) {
       // Add an empty span.
       starts_vec(x) = 0;
       continue;
@@ -169,11 +167,15 @@ Status ComputeGradSpansCore(OpKernelContext* context, const Spans& spans,
   auto grad_weights_vec = grad_spans->weights.vec<float>();
   grad_weights_vec.setZero();
   for (int input_index = 0; input_index < forward_input_size; ++input_index) {
-    const int start_span = grad_components[input_index].front().index;
-    grad_starts_vec(input_index) = start_span;
-    for (const GradComponent& gc : grad_components[input_index]) {
-      grad_weights_vec(input_index * grad_spans->span_size + gc.index -
-                       start_span) += gc.weight;
+    if (!grad_components[input_index].empty()) {
+      const int start_span = grad_components[input_index].front().index;
+      grad_starts_vec(input_index) = start_span;
+      for (const GradComponent& gc : grad_components[input_index]) {
+        grad_weights_vec(input_index * grad_spans->span_size + gc.index -
+                         start_span) += gc.weight;
+      }
+    } else {
+      grad_starts_vec(input_index) = 0;
     }
   }
   return Status::OK();
@@ -186,39 +188,40 @@ Status ComputeGradSpansCore(OpKernelContext* context, const Spans& spans,
 Status ComputeSpans(OpKernelContext* context,
                     const functor::SamplingKernelType kernel_type,
                     const int64 output_size, const int64 input_size,
-                    const float scale, const float translate, Spans* spans) {
+                    const float scale, const float translate,
+                    const bool antialias, Spans* spans) {
   switch (kernel_type) {
     case functor::Lanczos1Kernel: {
       return ComputeSpansCore(context, CreateLanczos1Kernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::Lanczos3Kernel: {
       return ComputeSpansCore(context, CreateLanczos3Kernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::Lanczos5Kernel: {
       return ComputeSpansCore(context, CreateLanczos5Kernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::GaussianKernel: {
       return ComputeSpansCore(context, CreateGaussianKernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::BoxKernel: {
       return ComputeSpansCore(context, CreateBoxKernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::TriangleKernel: {
       return ComputeSpansCore(context, CreateTriangleKernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::KeysCubicKernel: {
       return ComputeSpansCore(context, CreateKeysCubicKernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     case functor::MitchellCubicKernel: {
       return ComputeSpansCore(context, CreateMitchellCubicKernel(), output_size,
-                              input_size, scale, translate, spans);
+                              input_size, scale, translate, antialias, spans);
     }
     default:
       return errors::InvalidArgument(Printf("Unrecognized kernel type: %d",
@@ -234,11 +237,12 @@ Status ComputeGradSpans(OpKernelContext* context,
                         const functor::SamplingKernelType kernel_type,
                         const int64 forward_output_size,
                         const int64 forward_input_size, const float scale,
-                        const float translate, Spans* grad_spans) {
+                        const float translate, const bool antialias,
+                        Spans* grad_spans) {
   Spans spans;
   TF_RETURN_IF_ERROR(ComputeSpans(context, kernel_type, forward_output_size,
                                   forward_input_size, scale, translate,
-                                  &spans));
+                                  antialias, &spans));
   return ComputeGradSpansCore(context, spans, forward_output_size,
                               forward_input_size, grad_spans);
 }
@@ -264,6 +268,7 @@ class ScaleAndTranslateOp : public OpKernel {
  public:
   explicit ScaleAndTranslateOp(OpKernelConstruction* context)
       : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("antialias", &antialias_));
     string kernel_type_str;
     OP_REQUIRES_OK(context, context->GetAttr("kernel_type", &kernel_type_str));
     kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
@@ -331,12 +336,14 @@ class ScaleAndTranslateOp : public OpKernel {
 
     functor::Spans col_spans;
     OP_REQUIRES_OK(
-        context, ComputeSpans(context, kernel_type_, output_width, input_width,
-                              col_scale, col_translation, &col_spans));
+        context,
+        ComputeSpans(context, kernel_type_, output_width, input_width,
+                     col_scale, col_translation, antialias_, &col_spans));
     functor::Spans row_spans;
-    OP_REQUIRES_OK(context, ComputeSpans(context, kernel_type_, output_height,
-                                         input_height, row_scale,
-                                         row_translation, &row_spans));
+    OP_REQUIRES_OK(
+        context,
+        ComputeSpans(context, kernel_type_, output_height, input_height,
+                     row_scale, row_translation, antialias_, &row_spans));
     Tensor intermediate_t;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DT_FLOAT,
@@ -363,6 +370,7 @@ class ScaleAndTranslateOp : public OpKernel {
         intermediate_data, output_data);
   }
   functor::SamplingKernelType kernel_type_;
+  bool antialias_;
 };
 
 template <typename Device, typename T>
@@ -370,6 +378,7 @@ class ScaleAndTranslateGradOp : public OpKernel {
  public:
   explicit ScaleAndTranslateGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("antialias", &antialias_));
     string kernel_type_str;
     OP_REQUIRES_OK(context, context->GetAttr("kernel_type", &kernel_type_str));
     kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
@@ -434,12 +443,12 @@ class ScaleAndTranslateGradOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    ComputeGradSpans(context, kernel_type_, forward_output_width,
                                     forward_input_width, col_scale,
-                                    col_translation, &col_spans));
+                                    col_translation, antialias_, &col_spans));
     functor::Spans row_spans;
     OP_REQUIRES_OK(
         context, ComputeGradSpans(context, kernel_type_, forward_output_height,
                                   forward_input_height, row_scale,
-                                  row_translation, &row_spans));
+                                  row_translation, antialias_, &row_spans));
     Tensor intermediate_t;
     OP_REQUIRES_OK(context, context->allocate_temp(
                                 DT_FLOAT,
@@ -467,6 +476,7 @@ class ScaleAndTranslateGradOp : public OpKernel {
   }
 
   functor::SamplingKernelType kernel_type_;
+  bool antialias_;
 };
 
 template <typename T>
diff --git a/tensorflow/core/kernels/scale_and_translate_op_test.cc b/tensorflow/core/kernels/scale_and_translate_op_test.cc
index 23176f9f2da9c597d3cf13db0ee2e9f23eb72b37..a17e3d839632b27fa1feafd07dca5d2827f31e51 100644
--- a/tensorflow/core/kernels/scale_and_translate_op_test.cc
+++ b/tensorflow/core/kernels/scale_and_translate_op_test.cc
@@ -90,11 +90,11 @@ inline const T& Clamp(const T& low, const T& high, const T& value) {
 
 // Samples from the image at the passed batch at pixel location sample_f with a
 // kernel scaled by scale.
-void Sample(const DynamicKernel& kernel, TTypes<float, 4>::Tensor images,
-            int batch, const Vector2f& scale, const Vector2f& sample_f,
-            float* dest) {
-  const Vector2f kernel_scale(std::max(scale.x(), 1.0f),
-                              std::max(scale.y(), 1.0f));
+void Sample(const DynamicKernel& kernel, const bool antialias,
+            TTypes<float, 4>::Tensor images, const int batch,
+            const Vector2f& scale, const Vector2f& sample_f, float* dest) {
+  const Vector2f kernel_scale(antialias ? std::max(scale.x(), 1.0f) : 1.0,
+                              antialias ? std::max(scale.y(), 1.0f) : 1.0);
 
   const int64 in_height = images.dimension(1);
   const int64 in_width = images.dimension(2);
@@ -120,7 +120,8 @@ void Sample(const DynamicKernel& kernel, TTypes<float, 4>::Tensor images,
       1;
 
   std::fill(dest, dest + channels, 0.0f);
-  if (y_span_end <= y_span_start || x_span_end <= x_span_start) {
+  if (sample_f.x() < 0.0f || sample_f.y() < 0.0f || sample_f.x() > in_width ||
+      sample_f.y() > in_height) {
     return;
   }
   const Vector2f one_over_kernel_scale(1.0f / kernel_scale.x(),
@@ -153,6 +154,7 @@ void Sample(const DynamicKernel& kernel, TTypes<float, 4>::Tensor images,
 // only difference will be small floating point differences, since this version
 // does not to separable passes in x and y dimensions.
 void ScaleAndTranslateBaseline(const DynamicKernel& kernel,
+                               const bool antialias,
                                TTypes<float, 4>::Tensor images,
                                const Vector2f& orig_scale,
                                const Vector2f& orig_translate,
@@ -169,6 +171,8 @@ void ScaleAndTranslateBaseline(const DynamicKernel& kernel,
 
   const int64 out_height = output.dimension(1);
   const int64 out_width = output.dimension(2);
+  const int64 in_height = images.dimension(1);
+  const int64 in_width = images.dimension(2);
 
   for (int b = 0; b < batch; ++b) {
     for (int64 y = 0; y < out_height; ++y) {
@@ -177,8 +181,13 @@ void ScaleAndTranslateBaseline(const DynamicKernel& kernel,
       for (int64 x = 0; x < out_width; ++x) {
         const float out_x_f = static_cast<float>(x) + 0.5;
         const float in_x_f = out_x_f * scale.x() + translate.x();
-        Sample(kernel, images, b, scale, Vector2f(in_x_f, in_y_f),
-               &output(b, y, x, 0));
+        if (in_x_f < 0.0f || in_y_f < 0.0f || in_x_f > in_width ||
+            in_y_f > in_height) {
+          std::fill(&output(b, y, x, 0), &output(b, y, x + 1, 0), 0.0f);
+        } else {
+          Sample(kernel, antialias, images, b, scale, Vector2f(in_x_f, in_y_f),
+                 &output(b, y, x, 0));
+        }
       }
     }
   }
@@ -186,16 +195,18 @@ void ScaleAndTranslateBaseline(const DynamicKernel& kernel,
 
 class ScaleAndTranslateOpTest : public OpsTestBase {
  protected:
-  void CreateOp(const string& kernel_type_str = "lanczos3") {
+  void CreateOp(const string& kernel_type_str, const bool antialias) {
     TF_EXPECT_OK(NodeDefBuilder("scale_and_translate_op", "ScaleAndTranslate")
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_INT32))
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_FLOAT))
                      .Attr("kernel_type", kernel_type_str)
+                     .Attr("antialias", antialias)
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
     kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
+    antialias_ = antialias;
   }
 
   void SetCheckerboardImageInput(int batch_size, int num_row_squares,
@@ -247,17 +258,19 @@ class ScaleAndTranslateOpTest : public OpsTestBase {
                                  output_image_width, channels}));
 
     std::unique_ptr<const DynamicKernel> kernel = Create(kernel_type_);
-    ScaleAndTranslateBaseline(*kernel, mutable_input(0)->tensor<float, 4>(),
-                              scale, translate, expected.tensor<float, 4>());
+    ScaleAndTranslateBaseline(*kernel, antialias_,
+                              mutable_input(0)->tensor<float, 4>(), scale,
+                              translate, expected.tensor<float, 4>());
     constexpr double kAbs = 1e-2f;
     test::ExpectTensorNear<float>(expected, *GetOutput(0), kAbs);
   }
 
   functor::SamplingKernelType kernel_type_;
+  bool antialias_;
 };
 
 TEST_F(ScaleAndTranslateOpTest, IdentityTest) {
-  CreateOp();
+  CreateOp("lanczos3", true);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 16;
   constexpr int64 kNumColSquares = 13;
@@ -273,7 +286,7 @@ TEST_F(ScaleAndTranslateOpTest, IdentityTest) {
 }
 
 TEST_F(ScaleAndTranslateOpTest, UpsampleTest) {
-  CreateOp();
+  CreateOp("lanczos3", true);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 16;
   constexpr int64 kNumColSquares = 13;
@@ -289,7 +302,7 @@ TEST_F(ScaleAndTranslateOpTest, UpsampleTest) {
 }
 
 TEST_F(ScaleAndTranslateOpTest, DownsampleTest) {
-  CreateOp();
+  CreateOp("lanczos3", true);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 16;
   constexpr int64 kNumColSquares = 13;
@@ -304,8 +317,25 @@ TEST_F(ScaleAndTranslateOpTest, DownsampleTest) {
   RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
 }
 
-TEST_F(ScaleAndTranslateOpTest, DownsampleToASinglePixelTest) {
-  CreateOp();
+TEST_F(ScaleAndTranslateOpTest, AntiAliasedDownsampleToASinglePixelTest) {
+  CreateOp("lanczos3", true);
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 16;
+  constexpr int64 kNumColSquares = 13;
+  constexpr int64 kSquareSize = 12;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 1;
+  constexpr int kOutputImageWidth = 1;
+  const Vector2f kScale(1.0f / (kNumRowSquares * kSquareSize),
+                        1.0f / (kNumColSquares * kSquareSize));
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, NonAntiAliasedDownsampleToASinglePixelTest) {
+  CreateOp("lanczos3", false);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 16;
   constexpr int64 kNumColSquares = 13;
@@ -322,7 +352,7 @@ TEST_F(ScaleAndTranslateOpTest, DownsampleToASinglePixelTest) {
 }
 
 TEST_F(ScaleAndTranslateOpTest, UsampleFromASinglePixelTest) {
-  CreateOp();
+  CreateOp("lanczos3", true);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 1;
   constexpr int64 kNumColSquares = 1;
@@ -337,8 +367,43 @@ TEST_F(ScaleAndTranslateOpTest, UsampleFromASinglePixelTest) {
   RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
 }
 
-TEST_F(ScaleAndTranslateOpTest, ScaleAndTranslationTest) {
-  CreateOp();
+TEST_F(ScaleAndTranslateOpTest, NonAntialiasedUsampleFromASinglePixelTest) {
+  CreateOp("lanczos3", false);
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 1;
+  constexpr int64 kNumColSquares = 1;
+  constexpr int64 kSquareSize = 1;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 10;
+  constexpr int kOutputImageWidth = 17;
+  const Vector2f kScale(17.0f, 10.0f);
+  const Vector2f kTranslate(0.0f, 0.0f);
+  // Anti-aliasing shouldn't have any effect here, verify by comparing with the
+  // ground truth with anti-aliasing turned on.
+  antialias_ = true;
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, AntialiasedScaleAndTranslationTest) {
+  CreateOp("lanczos3", true);
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 11;
+  constexpr int64 kNumColSquares = 7;
+  constexpr int64 kSquareSize = 5;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 49;
+  constexpr int kOutputImageWidth = 51;
+  const Vector2f kScale(1.25f, 0.6f);
+  const Vector2f kTranslate(4.1f, -3.1f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, NonAntialiasedScaleAndTranslationTest) {
+  CreateOp("lanczos3", false);
   constexpr int64 kBatchSize = 2;
   constexpr int64 kNumRowSquares = 11;
   constexpr int64 kNumColSquares = 7;
@@ -348,7 +413,7 @@ TEST_F(ScaleAndTranslateOpTest, ScaleAndTranslationTest) {
                             kSquareSize, kNumChannels);
   constexpr int kOutputImageHeight = 49;
   constexpr int kOutputImageWidth = 51;
-  const Vector2f kScale(1.1f, 0.9f);
+  const Vector2f kScale(1.25f, 0.6f);
   const Vector2f kTranslate(4.1f, -3.1f);
   RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
 }
@@ -358,7 +423,7 @@ TEST_F(ScaleAndTranslateOpTest, TestKernelTypes) {
       "lanczos1", "lanczos3",  "lanczos5",     "box",
       "triangle", "keyscubic", "mitchellcubic"};
   for (const string& kernel_type : kKernelTypes) {
-    CreateOp(kernel_type);
+    CreateOp(kernel_type, true);
     constexpr int64 kBatchSize = 2;
     constexpr int64 kNumRowSquares = 10;
     constexpr int64 kNumColSquares = 11;
diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc
index 0a6848361a05559e8d1e23318ca66a9dd3ad9a95..ea42fdefb4124b0fb638adea1f91d77f95d456fd 100644
--- a/tensorflow/core/kernels/scan_ops.cc
+++ b/tensorflow/core/kernels/scan_ops.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
diff --git a/tensorflow/core/kernels/scan_ops_gpu.h b/tensorflow/core/kernels/scan_ops_gpu.h
index 976b2215405105ece0a5d25c2684aa558b01d8a0..557b72000a72dff228178ab22cb968b9c31576f2 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.h
+++ b/tensorflow/core/kernels/scan_ops_gpu.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "cuda/include/cuComplex.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 #include "tensorflow/core/util/permutation_input_iterator.h"
 #include "tensorflow/core/util/permutation_output_iterator.h"
 
@@ -241,34 +242,40 @@ void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
   // Launch on the smallest power of 2 block size that we can.
   if (ideal_block_size >= 1024 && std::is_same<T, float>::value) {
     const int block_size = 1024;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   } else if (ideal_block_size >= 512) {
     const int block_size = 512;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   } else if (ideal_block_size >= 256) {
     const int block_size = 256;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   } else if (ideal_block_size >= 128) {
     const int block_size = 128;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   } else if (ideal_block_size >= 64) {
     const int block_size = 64;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   } else {
     const int block_size = 32;
-    scan_kernel<T, Op, block_size, items_per_thread>
-        <<<num_blocks, block_size, 0, d.stream()>>>(
-            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+    TF_CHECK_OK(
+        CudaLaunchKernel(scan_kernel<T, Op, block_size, items_per_thread>,
+                         num_blocks, block_size, 0, d.stream(), in.data(),
+                         out.data(), dimx, dimy, dimz, exclusive, reverse, op));
   }
 }
 
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 2d43bde23feadc33c7081fccd8ad2e44dfe3c2d5..755f8f8dc55ec7dfdf6c56f1ca86e14ec3e3e352 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index 057755a05c151b9c1cab3d529bb047b893020049..57344c1dd2440cbe79e66cef43a959aebe1f8d3f 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -127,10 +127,10 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     const Index indices_size = indices.size();
     const Index updates_size = updates.size();
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
-    scatter_op_gpu::ScatterOpCustomKernel<T, Index, op>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), updates.data(), indices.data(), first_dim_size,
-            updates_size, indices_size);
+    TF_CHECK_OK(CudaLaunchKernel(
+        scatter_op_gpu::ScatterOpCustomKernel<T, Index, op>, config.block_count,
+        config.thread_per_block, 0, d.stream(), params.data(), updates.data(),
+        indices.data(), first_dim_size, updates_size, indices_size));
     return -1;
   }
 };
@@ -148,10 +148,11 @@ struct ScatterScalarFunctor<GPUDevice, T, Index, op> {
     const Index indices_size = indices.size();
     const Index synthesized_updates_size = indices_size * params.dimension(1);
     CudaLaunchConfig config = GetCudaLaunchConfig(synthesized_updates_size, d);
-    scatter_op_gpu::ScatterScalarOpCustomKernel<T, Index, op>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), update.data(), indices.data(), first_dim_size,
-            indices_size, synthesized_updates_size);
+    TF_CHECK_OK(CudaLaunchKernel(
+        scatter_op_gpu::ScatterScalarOpCustomKernel<T, Index, op>,
+        config.block_count, config.thread_per_block, 0, d.stream(),
+        params.data(), update.data(), indices.data(), first_dim_size,
+        indices_size, synthesized_updates_size));
     return -1;
   }
 };
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 50e4c66b7e5e7ec5b1f6c743666b98992e338209..9c51d4e3a7d9e93f34a4c5957f9acec55ea14937 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
diff --git a/tensorflow/core/kernels/scatter_nd_op.h b/tensorflow/core/kernels/scatter_nd_op.h
index 8d04731aae6329dbfd2539ec441a2d1b140f6cd3..eec70ba69e5101068dfdcfde5152ab9ea2088efe 100644
--- a/tensorflow/core/kernels/scatter_nd_op.h
+++ b/tensorflow/core/kernels/scatter_nd_op.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index 472f5a3547aaaf0237a6d3ce51a141519c4d11a4..01e4656eab8b2b067f870253ba9f3223835a461f 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -24,11 +24,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 08b657f4c38807cd99bd6f03cacf589e9d8fd22c..9936b3f9b78dc7ea0d298825d6a1abbf12ccd7f9 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -136,12 +136,12 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
     }
 
     CudaLaunchConfig config = GetCudaLaunchConfig(Toutput.size(), d);
-    // clang-format off
-    ScatterNdOpKernel<T, Index, op, IXDIM>
-    <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      Tindices.data(), Tupdates.data(), Toutput.data(), output_shape_prefix,
-      batch_strides, batch_size, slice_size);
-    // clang-format on
+
+    TF_CHECK_OK(CudaLaunchKernel(ScatterNdOpKernel<T, Index, op, IXDIM>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), Tindices.data(), Tupdates.data(),
+                                 Toutput.data(), output_shape_prefix,
+                                 batch_strides, batch_size, slice_size));
 
     return -1;
   }
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 2bb2c0d91e94b9462af330e806745cfb8317767a..cbc754af0e9bb1f3606e9de5e31bc415b2113f3d 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -310,7 +310,10 @@ Status Examples::SampleAdaptiveProbabilities(
 
 void Examples::RandomShuffle() {
   std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
-  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+
+  std::random_device rd;
+  std::mt19937 rng(rd());
+  std::shuffle(sampled_index_.begin(), sampled_index_.end(), rng);
 }
 
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
diff --git a/tensorflow/core/kernels/searchsorted_op.cc b/tensorflow/core/kernels/searchsorted_op.cc
index dc627ac77a51d6da994309687c5694d261908524..06b2d818374fd6a102ec3966e57e3619b4d18289 100644
--- a/tensorflow/core/kernels/searchsorted_op.cc
+++ b/tensorflow/core/kernels/searchsorted_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/searchsorted_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
index 263b5bf29809b5053e40a3d22087f1a408387155..71580ff9a871b21fdf7f743d9c507a102a9811d4 100644
--- a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
@@ -68,10 +68,10 @@ struct UpperBoundFunctor<GPUDevice, T, OutType> {
     CudaLaunchConfig config =
         GetCudaLaunchConfig(values.size(), context->eigen_gpu_device());
 
-    UpperBoundKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, stream>>>(
-            sorted_inputs.data(), batch_size, num_inputs, num_values,
-            values.data(), output->data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        UpperBoundKernel<T, OutType>, config.block_count,
+        config.thread_per_block, 0, stream, sorted_inputs.data(), batch_size,
+        num_inputs, num_values, values.data(), output->data()));
 
     return Status::OK();
   }
@@ -88,10 +88,10 @@ struct LowerBoundFunctor<GPUDevice, T, OutType> {
     CudaLaunchConfig config =
         GetCudaLaunchConfig(values.size(), context->eigen_gpu_device());
 
-    LowerBoundKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, stream>>>(
-            sorted_inputs.data(), batch_size, num_inputs, num_values,
-            values.data(), output->data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        LowerBoundKernel<T, OutType>, config.block_count,
+        config.thread_per_block, 0, stream, sorted_inputs.data(), batch_size,
+        num_inputs, num_values, values.data(), output->data()));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2328fc6afd8e7b7c24351e612ea6b760a2d522c3..6e1a0d57a169b51e184330c984a5c75d332490da 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -22,15 +22,17 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
 #include <vector>
+
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/util.h"
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 3511c85f7174f8dab47ca3ba05f01d7c4f5110b8..39406dd9a22d29b453205586073e42db99b0a790 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -138,8 +138,9 @@ void SegmentSumFunctor<T, Index>::operator()(
   }
   // Set 'output' to zeros.
   CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
-  SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      output.size(), output.data());
+  TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count,
+                               config.thread_per_block, 0, d.stream(),
+                               output.size(), output.data()));
   if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
     return;
   }
@@ -162,10 +163,11 @@ void SegmentSumFunctor<T, Index>::operator()(
       input_inner_dim_size * input_outer_dim_num_stripe;
 
   config = GetCudaLaunchConfig(total_stripe_count, d);
-  SortedSegmentSumCustomKernel<T, Index, OuterDimTileSize>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          input_outer_dim_size, input_inner_dim_size, output_rows,
-          segment_ids.data(), data, output.data(), total_stripe_count);
+  TF_CHECK_OK(CudaLaunchKernel(
+      SortedSegmentSumCustomKernel<T, Index, OuterDimTileSize>,
+      config.block_count, config.thread_per_block, 0, d.stream(),
+      input_outer_dim_size, input_inner_dim_size, output_rows,
+      segment_ids.data(), data, output.data(), total_stripe_count));
 }
 
 template <typename T, typename Index, typename InitialValueF,
@@ -182,8 +184,9 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
     // Set 'output' to initial value.
     GPUDevice d = ctx->template eigen_device<GPUDevice>();
     CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
-    SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        output.size(), output.data(), InitialValueF()());
+    TF_CHECK_OK(CudaLaunchKernel(
+        SetToValue<T>, config.block_count, config.thread_per_block, 0,
+        d.stream(), output.size(), output.data(), InitialValueF()()));
     if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
       return;
     }
@@ -196,10 +199,11 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
     const Index input_inner_dim_size = data_size / input_outer_dim_size;
     config = GetCudaLaunchConfig(data_size, d);
 
-    UnsortedSegmentCustomKernel<T, Index, ReductionF>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            input_outer_dim_size, input_inner_dim_size, num_segments,
-            segment_ids.data(), data, output.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        UnsortedSegmentCustomKernel<T, Index, ReductionF>, config.block_count,
+        config.thread_per_block, 0, d.stream(), input_outer_dim_size,
+        input_inner_dim_size, num_segments, segment_ids.data(), data,
+        output.data()));
   }
 };
 
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
deleted file mode 100644
index 30cb1e0a7f80f084854073ee061500bbcf0ccade..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <functional>
-#include <memory>
-
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/ops/array_ops.h"
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/ops_testutil.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/abi.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace {
-
-class ShapeOpTest : public OpsTestBase {};
-
-struct NoKnownShape {
-  string TypeName() const { return "NO KNOWN SHAPE"; }
-};
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(NoKnownShape, "NO KNOWN SHAPE");
-
-struct KnownVecSize {
-  KnownVecSize() : shape_value(0) {}
-  explicit KnownVecSize(int value) : shape_value(value) {}
-  string TypeName() const { return "KNOWN VECTOR SIZE TYPE"; }
-  bool Decode(const VariantTensorData& d) {
-    return d.get_metadata(&shape_value);
-  }
-  void Encode(VariantTensorData* d) const { d->set_metadata(shape_value); }
-  int shape_value;
-};
-
-Status GetShapeFromKnownVecSize(const KnownVecSize& ks, TensorShape* s) {
-  *s = TensorShape({ks.shape_value});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, GetShapeFromKnownVecSize);
-
-static void ExpectHasError(const Status& s, StringPiece substr) {
-  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
-      << ">>" << s << "<<, expected substring >>" << substr << "<<";
-}
-
-TEST_F(ShapeOpTest, Simple) {
-  // Ensure the ops run on CPU, as we have no device copy registration
-  // for NoKnownShape and KnownVecSize objects.
-  Scope root = Scope::NewRootScope().WithDevice("/cpu:0");
-
-  // Use a placeholder so the graph optimizer doesn't optimize away
-  // the shape function.
-  auto input = ops::Placeholder(root, DT_VARIANT);
-  auto shape_output = ops::Shape(root, input);
-  auto rank_output = ops::Rank(root, input);
-  auto size_output = ops::Size(root, input);
-
-  TF_ASSERT_OK(root.status());
-
-  ClientSession session(root);
-
-  std::vector<Tensor> outputs;
-
-  {
-    // Test no shape registered.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = NoKnownShape();
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(
-        s, strings::StrCat(
-               "No unary variant shape function found for Variant type_index: ",
-               port::MaybeAbiDemangle(MakeTypeIndex<NoKnownShape>().name())));
-  }
-
-  {
-    // Test non-scalar variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({1}));
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(s, "Shape of non-unary Variant not supported.");
-  }
-
-  {
-    // Test registered variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    const int vec_dim_value = -0xdeadbeef;  // must be non-negative.
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = KnownVecSize(vec_dim_value);
-    TF_EXPECT_OK(session.Run({{input, variant_tensor}},
-                             {shape_output, rank_output, size_output},
-                             &outputs));
-    EXPECT_EQ(outputs[0].dims(), 1);  // shape
-    EXPECT_EQ(vec_dim_value, outputs[0].vec<int32>()(0));
-    EXPECT_EQ(outputs[1].dims(), 0);  // rank
-    EXPECT_EQ(1, outputs[1].scalar<int32>()());
-    EXPECT_EQ(outputs[2].dims(), 0);  // size
-    EXPECT_EQ(vec_dim_value, outputs[0].scalar<int32>()());
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index ab1ce0f9c83025e472c114225265ce9430be93a3..db7357ca70e8050ff5d0d858989f27673af5f49d 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -469,8 +469,7 @@ class EnsureShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
 
     if (!expected_shape_.IsCompatibleWith(shape)) {
       ctx->SetStatus(errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 7a50f158af02e698681ef513c2baa2be1e22267f..03b32b88d9b7f4441439fb382bc5f8c47643ae43 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -20,27 +20,18 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
 namespace shape_op_helpers {
-inline Status GetRegularOrVariantShape(OpKernelContext* ctx, int input_index,
-                                       TensorShape* shape) {
-  const Tensor& inp = ctx->input(input_index);
-  if (ctx->input_dtype(0) == DT_VARIANT) {
-    if (inp.dims() != 0) {
-      return errors::InvalidArgument(
-          "Shape of non-unary Variant not supported.");
-    }
-    TF_RETURN_IF_ERROR(GetUnaryVariantShape(inp, shape));
-  } else {
-    *shape = inp.shape();
-  }
+inline Status GetShape(OpKernelContext* ctx, int input_index,
+                       TensorShape* shape) {
+  *shape = ctx->input(input_index).shape();
   return Status::OK();
 }
 }  // namespace shape_op_helpers
@@ -52,8 +43,7 @@ class ShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out));
@@ -81,8 +71,7 @@ class ShapeNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       TensorShape shape;
-      OP_REQUIRES_OK(
-          ctx, shape_op_helpers::GetRegularOrVariantShape(ctx, i, &shape));
+      OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, i, &shape));
       const int dims = shape.dims();
       Tensor* out = nullptr;
       OP_REQUIRES_OK(ctx, ctx->allocate_output(i, {dims}, &out));
@@ -110,8 +99,7 @@ class RankOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
@@ -128,8 +116,7 @@ class SizeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int64 size = shape.num_elements();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index d1e677feb0d345f470bdf0f7dca5cae7e7d6d02e..9b2f3a963bd6ab6672885d628149dd9aecec6d4e 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -180,13 +180,12 @@ class SoftmaxOpGPU : public OpKernel {
           context, const_cast<acc_type*>(sum_probs.flat<acc_type>().data()),
           input_itr, rows, cols);
 
-      GenerateNormalizedProb<T, acc_type>
-          <<<numBlocks, numThreads, 0, cu_stream>>>(
-              reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
-              reinterpret_cast<const acc_type*>(
-                  sum_probs.flat<acc_type>().data()),
-              reinterpret_cast<const T*>(max_logits.flat<T>().data()),
-              const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_);
+      TF_CHECK_OK(CudaLaunchKernel(
+          GenerateNormalizedProb<T, acc_type>, numBlocks, numThreads, 0,
+          cu_stream, reinterpret_cast<const T*>(logits_in_.flat<T>().data()),
+          reinterpret_cast<const acc_type*>(sum_probs.flat<acc_type>().data()),
+          reinterpret_cast<const T*>(max_logits.flat<T>().data()),
+          const_cast<T*>(softmax_out->flat<T>().data()), rows, cols, log_));
     }
   }
 
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index d3fc0e1461b973fe2be929e86fc015468dfab452..fb00e1bb08c3e184168168b3fa9dc8e19c7e637b 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -87,7 +87,7 @@ void SoftplusGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
 TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                          \
@@ -119,6 +119,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
index 8df734588b8affce4942eac033794d10e9274de0..900df277a5ba165a46945b28971f57333fd42ab4 100644
--- a/tensorflow/core/kernels/softplus_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -37,4 +37,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index f46a84da1e951113382e4d44b44463c2a621ca10..459f20b0ae1cea1769277f4d367829d61e831ca1 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index 5687141c9eaeec11498c1d2cc954155bd9e05856..ea6e076909bd4fd2483aea409d117214da3da8b5 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -141,11 +141,11 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
     }
     CudaLaunchConfig config =
         GetCudaLaunchConfig(static_cast<int32>(total_count), d);
-    S2B<T, NUM_BLOCK_DIMS, B2S>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, const_cast<T*>(space_tensor.data()),
-            args, const_cast<T*>(batch_tensor.data()));
-    return Status::OK();
+    return CudaLaunchKernel(S2B<T, NUM_BLOCK_DIMS, B2S>, config.block_count,
+                            config.thread_per_block, 0, d.stream(),
+                            config.virtual_thread_count,
+                            const_cast<T*>(space_tensor.data()), args,
+                            const_cast<T*>(batch_tensor.data()));
   }
 };
 
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index b565927ccb8d588ea52856c0c23e62e5fa3d18ff..606ff89e742f70cefdb4d0b45f00e7f182030eac 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -158,10 +158,11 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NHWC> {
       return;
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    S2D_NHWC<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        S2D_NHWC<T>, config.block_count, config.thread_per_block, 0, d.stream(),
         config.virtual_thread_count, input.data(), block_size, batch_size,
         input_height, input_width, input_depth, output_height, output_width,
-        output_depth, output.data());
+        output_depth, output.data()));
   }
   void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                   int block_size, typename TTypes<T, 5>::Tensor output) {
@@ -193,23 +194,26 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
       switch (block_size) {
         case 2:
-          return S2D_NCHW_LOOP<T, 2>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), output_width, input_width,
-                  input_depth_by_output_area, output_depth_by_output_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              S2D_NCHW_LOOP<T, 2>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), output_width,
+              input_width, input_depth_by_output_area,
+              output_depth_by_output_area, output.data()));
+          return;
         case 3:
-          return S2D_NCHW_LOOP<T, 3>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), output_width, input_width,
-                  input_depth_by_output_area, output_depth_by_output_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              S2D_NCHW_LOOP<T, 3>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), output_width,
+              input_width, input_depth_by_output_area,
+              output_depth_by_output_area, output.data()));
+          return;
         case 4:
-          return S2D_NCHW_LOOP<T, 4>
-              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                  total_count, input.data(), output_width, input_width,
-                  input_depth_by_output_area, output_depth_by_output_area,
-                  output.data());
+          TF_CHECK_OK(CudaLaunchKernel(
+              S2D_NCHW_LOOP<T, 4>, config.block_count, config.thread_per_block,
+              0, d.stream(), total_count, input.data(), output_width,
+              input_width, input_depth_by_output_area,
+              output_depth_by_output_area, output.data()));
+          return;
       }
     }
 
@@ -219,9 +223,10 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
       return;
     }
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    S2D_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+    TF_CHECK_OK(CudaLaunchKernel(
+        S2D_NCHW<T>, config.block_count, config.thread_per_block, 0, d.stream(),
         config.virtual_thread_count, input.data(), block_size, output_width,
-        input_depth * output_height, output.data());
+        input_depth * output_height, output.data()));
   }
   void operator()(const GPUDevice& d, typename TTypes<T, 5>::ConstTensor input,
                   int block_size, typename TTypes<T, 5>::Tensor output) {
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index a4e89f439ed9f5711253924ad120f7a6751e1728..af69ae9ebd57de85c695eb003cca24b2c39331fc 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -78,9 +78,6 @@ class SparseDenseBinaryOpShared : public OpKernel {
                     "but received shapes: ",
                     values_t->shape().DebugString(), " and ",
                     shape_t->shape().DebugString()));
-    OP_REQUIRES(ctx, indices_t->dim_size(0) < std::numeric_limits<int>::max(),
-                errors::InvalidArgument(
-                    "Number of non-zero elements exceeds int32 range"));
 
     const auto indices_mat = indices_t->matrix<int64>();
     const auto shape_vec = shape_t->vec<int64>();
@@ -106,7 +103,7 @@ class SparseDenseBinaryOpShared : public OpKernel {
 
     Tensor *output_values = nullptr;
     Tensor dense_gathered;
-    const int nnz = static_cast<int>(indices_t->dim_size(0));
+    const int64 nnz = indices_t->dim_size(0);
     OP_REQUIRES_OK(ctx,
                    ctx->allocate_output(0, TensorShape({nnz}), &output_values));
     OP_REQUIRES_OK(
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 2ea7a1ed3b9c5c37e0c93edef9431ce0438d380d..9c9e7370ac44bfb704f5491e2c572e961f188e3a 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -48,11 +48,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
 
+#define ALWAYS_INLINE EIGEN_ALWAYS_INLINE
+
 namespace tensorflow {
 namespace {
 
-using Eigen::operator==;
-
 template <typename T>
 using BasicMatrix = Eigen::Tensor<T, 2, Eigen::RowMajor>;
 
@@ -161,6 +161,19 @@ struct SparseSlice {
   const int block_size;
 };
 
+template <typename T>
+bool IsZero(T v);
+
+template <>
+ALWAYS_INLINE bool IsZero(bfloat16 v) {
+  return v.IsZero();
+}
+
+template <>
+ALWAYS_INLINE bool IsZero(float v) {
+  return v == 0.0f;
+}
+
 template <typename T>
 template <bool Transpose>
 void SparseSlice<T>::Initialize(
@@ -182,9 +195,8 @@ void SparseSlice<T>::Initialize(
   index.reserve(num_blocks * num_rows * 2);
 
   Index3 idx3;
-  Index idx;
-  int data3_size = 0;
-  static const T zero(0);
+  const int stride = Transpose ? mat.dimension(1) : 1;
+
   for (int i = 0; i < num_blocks; ++i) {
     int num_block_cols = std::min(block_size, num_cols - block_size * i);
     for (int row = 0; row < num_rows; ++row) {
@@ -196,54 +208,48 @@ void SparseSlice<T>::Initialize(
       const auto* start =
           Transpose ? &mat(col_offset, row) : &mat(row, col_offset);
       const auto* curr = start;
-      const int stride = Transpose ? mat.dimension(1) : 1;
       const auto* end = start + stride * num_block_cols;
       uint8 k = 0;
 #define NEXT_ELEM \
   curr += stride; \
   ++k;
+#define EAT_ZEROS                          \
+  while (curr < end && IsZero<T>(*curr)) { \
+    NEXT_ELEM;                             \
+  }
       while (true) {
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
-        }
+        EAT_ZEROS
         if (curr >= end) break;
         idx3.k1 = k;
-        data3.push_back(*curr);
+        const T value1 = *curr;
         NEXT_ELEM;
 
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
+        EAT_ZEROS
+        if (curr >= end) {
+          data.push_back(value1);
+          index.push_back({idx3.m, idx3.k1});
+          break;
         }
-        if (curr >= end) break;
         idx3.k2 = k;
-        data3.push_back(*curr);
+        const T value2 = *curr;
         NEXT_ELEM;
 
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
+        EAT_ZEROS
+        if (curr >= end) {
+          data.push_back(value2);
+          index.push_back({idx3.m, idx3.k2});
+          data.push_back(value1);
+          index.push_back({idx3.m, idx3.k1});
+          break;
         }
-        if (curr >= end) break;
         idx3.k3 = k;
+        data3.push_back(value1);
+        data3.push_back(value2);
         data3.push_back(*curr);
         NEXT_ELEM;
         index3.push_back(idx3);
 #undef NEXT_ELEM
-      }
-      int num_inserted_mod = data3.size() % 3;
-      // Move some elements to index and data if needed.
-      data3_size = data3.size() - num_inserted_mod;
-      idx.m = idx3.m;
-      switch (num_inserted_mod) {
-        case 2:
-          idx.k = idx3.k2;
-          data.push_back(data3[data3_size + 1]);
-          index.push_back(idx);
-          TF_FALLTHROUGH_INTENDED;
-        case 1:
-          idx.k = idx3.k1;
-          data.push_back(data3[data3_size]);
-          index.push_back(idx);
-          data3.resize(data3_size);
+#undef EAT_ZEROS
       }
     }
     col_offset += block_size;
@@ -276,8 +282,6 @@ const int kNumOperands = (sizeof(Packet) / sizeof(float));
 #define STORE(x, y) Eigen::internal::pstore<float>(x, y);
 #define FMA(a, b, c, d) d = Eigen::internal::pmadd<Packet>(a, b, c);
 
-#define ALWAYS_INLINE EIGEN_ALWAYS_INLINE
-
 ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
diff --git a/tensorflow/core/kernels/sparse_reshape_op.cc b/tensorflow/core/kernels/sparse_reshape_op.cc
index 939d404aa442e6d3384d46f19cc54771cb53a27b..059519a913b7e7c9dd6fd4ee39ae0912a4e4d7cc 100644
--- a/tensorflow/core/kernels/sparse_reshape_op.cc
+++ b/tensorflow/core/kernels/sparse_reshape_op.cc
@@ -34,8 +34,6 @@ class SparseReshapeOp : public OpKernel {
   explicit SparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    Tensor output_indices;
-    Tensor output_shape;
     Reshape(context, context->input(0), context->input(1), context->input(2),
             0 /* output indices index */, 1 /* output shape index */);
   }
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
index 30c57ef287f4c645b198da6ebf6b8554dde4fd12..0a97c6b6a5424c3c75c52add13bfa8021b665e17 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
index e261e42e0d3bf43efc3a1328f07b1362f0870dfd..f85f2a48a10d2c46d9c8d63840f3a5eb1ef64e58 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -81,10 +81,11 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> {
     // out.size()?  Perhaps p * nnz ?
     CudaLaunchConfig config = GetCudaLaunchConfig(p * nnz, d);
 
-    SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            nnz, m, b_rows, b_cols, p, a_indices.data(), a_values.data(),
-            b.data(), out.data());
+    TF_CHECK_OK(CudaLaunchKernel(
+        SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>,
+        config.block_count, config.thread_per_block, 0, d.stream(), nnz, m,
+        b_rows, b_cols, p, a_indices.data(), a_values.data(), b.data(),
+        out.data()));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/sparse_utils.cc b/tensorflow/core/kernels/sparse_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..198862940d1841675f8d7a0b0ade7160d1dc0582
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <cstddef>
+
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat) {
+  // Search in the index range [begin, end) of indices_mat.
+  Tindices begin = sparse_index_begin;
+  Tindices end = indices_mat.dimension(0);
+  const Tindices orig_sparse_index_end = end;
+
+  // The first dense row we search.
+  const Tindices orig_dense_index_begin = indices_mat(begin, 0);
+  // Early exit if no next dense row index.
+  if (orig_dense_index_begin == static_cast<int64>(indices_mat(end - 1, 0))) {
+    return orig_sparse_index_end;
+  }
+
+  Tindices increment = 1;
+  while (begin + increment < end &&
+         indices_mat(begin + increment, 0) == orig_dense_index_begin) {
+    increment *= 2;
+  }
+  // Narrow the search space as an optimization.
+  if (begin + increment < end) {
+    end = begin + increment;
+  }
+  begin += increment / 2;
+
+  // Perform a binary search on the interval [begin, end) for
+  // dense_row_index_to_find.
+  const Tindices dense_row_index_to_find = orig_dense_index_begin;
+  while (begin < end) {
+    const Tindices m = begin + (end - begin) / 2;
+    const Tindices m_dense_row_index = static_cast<Tindices>(indices_mat(m, 0));
+    if (m_dense_row_index == dense_row_index_to_find &&
+        (m + 1 == orig_sparse_index_end ||
+         static_cast<Tindices>(indices_mat(m + 1, 0)) !=
+             dense_row_index_to_find)) {
+      return m + 1;
+    } else if (m_dense_row_index <= dense_row_index_to_find) {
+      begin = m + 1;
+    } else {
+      end = m;
+    }
+  }
+
+  // No next dense row index.
+  return orig_sparse_index_end;
+}
+
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows) {
+  int64 start_sparse_index_of_cur_dense_row = 0;
+  std::vector<Tindices> segment_indices;
+  const Tindices num_entries_in_sparse_tensor = indices_mat.dimension(0);
+  const Tindices num_dense_rows_in_sparse_tensor =
+      1 + indices_mat(num_entries_in_sparse_tensor - 1, 0) - indices_mat(0, 0);
+  // Reserve an extra slot for the 0 we store in the first entry by convention.
+  segment_indices.reserve(1 + num_dense_rows_in_sparse_tensor);
+  segment_indices.push_back(0);
+  *contains_empty_rows = false;
+  while (true) {
+    const Tindices start_sparse_index_of_next_dense_row =
+        FindNextDenseRowStartIndex<Tindices>(
+            start_sparse_index_of_cur_dense_row, indices_mat);
+    if (start_sparse_index_of_next_dense_row == num_entries_in_sparse_tensor) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+      break;
+    }
+    // Encode the length of the current dense row as well as the lengths of all
+    // the empty rows until the next dense row,
+    for (Tindices i = 0;
+         i < indices_mat(start_sparse_index_of_next_dense_row, 0) -
+                 indices_mat(start_sparse_index_of_cur_dense_row, 0);
+         ++i) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+    }
+    // If there is more than one row between the current and next non-empty
+    // rows then those rows are empty.
+    *contains_empty_rows |=
+        indices_mat(start_sparse_index_of_next_dense_row, 0) -
+            indices_mat(start_sparse_index_of_cur_dense_row, 0) >
+        1;
+    start_sparse_index_of_cur_dense_row = start_sparse_index_of_next_dense_row;
+  }
+  return segment_indices;
+}
+
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat) {
+  std::vector<Tindices> out;
+  auto vec = tensor.vec<Tindices>();
+  out.reserve(vec.size() + 1);
+  for (size_t i = 0; i < vec.dimension(0); ++i) {
+    out.push_back(vec(i));
+  }
+  out.push_back(num_nonzero_entries_in_sparse_mat);
+  return out;
+}
+
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices) {
+  // Skip checking the lengths of the first and last dense rows since those are
+  // always non-empty.
+  for (size_t i = 2; i < row_start_indices.size() - 1; ++i) {
+    if (row_start_indices.at(i) - row_start_indices.at(i - 1) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template TypeIndex FindNextDenseRowStartIndex<TypeIndex>(                 \
+      const TypeIndex sparse_index_begin,                                   \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat);                   \
+  template std::vector<TypeIndex> GetStartIndicesOfEachDenseRow<TypeIndex>( \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat,                    \
+      bool* contains_empty_rows);                                           \
+  template bool ContainsEmptyRows<TypeIndex>(                               \
+      const std::vector<TypeIndex>& row_start_indices);                     \
+  template std::vector<TypeIndex> ParseRowStartIndices<TypeIndex>(          \
+      const tensorflow::Tensor& tensor,                                     \
+      const TypeIndex num_nonzero_entries_in_sparse_mat);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_utils.h b/tensorflow/core/kernels/sparse_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e3c41a49642ebe722b7aeb5adeb6f41cea858b3
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.h
@@ -0,0 +1,71 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for writing OpKernels for sparse tensors.
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+// Find the index i of the first element for which
+// indices_mat(sparse_index_begin, 0) < indices_mat(i, 0).
+// The search is conducted in the open interval
+// [sparse_index_begin, indices_mat.dimension(0)) and when no such i is found,
+// indices_mat.dimension(0) is returned.
+// indices_mat(k, 0) should be non-decreasing over the interval
+// [begin, indices_mat.dimension(0)).
+// Requires 0 <= sparse_index_begin < indices_mat.dimension(0).
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat);
+
+// Returns the vector v of indices in indices_mat at which new dense matrix
+// rows begin.
+// v.front() = 0, v.back() = indices_mat.dimension(0), and for i > 0,
+// v[i] - v[i-1] is the length of the ith dense row in indices_mat.
+// *contains_empty_rows = true if and only if indices_mat contains empty rows
+// (rows without values) between its first and last row.
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows);
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns true if and only if the sparse matrix indices_mat whose row start
+// indices are represented by row_start_indices has empty dense rows
+// (between its first and last dense rows).
+// This function satisfies the identity row_start_indices ==
+// GetStartIndicesOfEachDenseRow(indices_mat, &return_value).
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
diff --git a/tensorflow/core/kernels/sparse_utils_test.cc b/tensorflow/core/kernels/sparse_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d0adff8860ded4c8b1f49b99ba6eb3a261782aa
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils_test.cc
@@ -0,0 +1,263 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::uint16;
+using tensorflow::uint32;
+using tensorflow::uint64;
+using tensorflow::sparse_utils::ContainsEmptyRows;
+using tensorflow::sparse_utils::FindNextDenseRowStartIndex;
+using tensorflow::sparse_utils::GetStartIndicesOfEachDenseRow;
+using tensorflow::sparse_utils::ParseRowStartIndices;
+
+TEST(SparseUtilsTest, GetStartIndicesOfEachDenseRow) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int32>({0, 1, 2, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                             &contains_empty_rows) ==
+        std::vector<int32>({0, 1, 3, 3, 3, 6, 6, 7, 11, 13, 13, 14, 14, 15}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int64 data[] = {3, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 1, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int64>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int64>({0, 1}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint32 data[] = {3, 0, 3, 0};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 2, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint32>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint32>({0, 2}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint16 data[] = {0, 0, 0, 0, 0, 0, 1, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 4, 2);
+    // indices_list = {0, 0, 0, 1};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint16>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint16>({0, 3, 4}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint64 data[] = {0, 0, 0, 0, 0, 0, 3, 0};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 4, 2);
+    bool contains_empty_rows;
+    // indices_list = {0, 0, 0, 3};
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint64>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint64>({0, 3, 3, 3, 4}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+}
+
+TEST(SparseUtilsTest, ParseRowStartIndices) {
+  {
+    Tensor t(DataType::DT_INT32, {1});
+    int indx = 0;
+    for (const int32 v : {0}) {
+      t.flat<int32>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int32>(t, 1) ==
+                std::vector<int32>({0, 1}));
+  }
+  {
+    Tensor t(DataType::DT_INT64, {1});
+    int indx = 0;
+    for (const int64 v : {0}) {
+      t.flat<int64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int64>(t, 2) ==
+                std::vector<int64>({0, 2}));
+  }
+  {
+    Tensor t(DataType::DT_UINT64, {2});
+    int indx = 0;
+    for (const uint64 v : {0, 3}) {
+      t.flat<uint64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint64>(t, 4) ==
+                std::vector<uint64>({0, 3, 4}));
+  }
+  {
+    Tensor t(DataType::DT_UINT16, {2});
+    int indx = 0;
+    for (const uint16 v : {0, 3}) {
+      t.flat<uint16>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint16>(t, 4) ==
+                std::vector<uint16>({0, 3, 4}));
+  }
+}
+
+TEST(SparseUtilsTest, ContainsEmptyRows) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint16 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint16>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint32 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint32>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint64 data[] = {0, 0, 0, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint64>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 1, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+}
+
+TEST(SparseUtilsTest, FindNextDenseRowStartIndex) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (int32 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<int32>(i, indices_mat));
+    }
+  }
+  {
+    uint16 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (uint16 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<uint16>(i, indices_mat));
+    }
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(1),
+                                                   indices_mat));
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(2),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(3),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(4),
+                                                   indices_mat));
+    EXPECT_EQ(14, FindNextDenseRowStartIndex<int64>(static_cast<int64>(13),
+                                                    indices_mat));
+    EXPECT_EQ(15, FindNextDenseRowStartIndex<int64>(static_cast<int64>(14),
+                                                    indices_mat));
+  }
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/sparse_xent_op.h b/tensorflow/core/kernels/sparse_xent_op.h
index 5e462424ed8a54de417933b0ecc0b08e0bbe1f02..c94597f29709ae649fc5f0fd85b931b9555cdf60 100644
--- a/tensorflow/core/kernels/sparse_xent_op.h
+++ b/tensorflow/core/kernels/sparse_xent_op.h
@@ -18,9 +18,9 @@ limitations under the License.
 // Functor definition for SparseXentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index afb0bf76267f24ba1e2142954abfdcb41356cb96..f20af4f92178729b1adf54151603cf915481414e 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -49,6 +49,7 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
 
 /// The representative tests for ptb_word on GPU
+#ifdef GOOGLE_CUDA
 BM_SparseXentDev(8, 1000000, gpu);
 
 BM_SparseXentDev(16, 10000, gpu);
@@ -62,6 +63,7 @@ BM_SparseXentDev(32, 100000, gpu);
 BM_SparseXentDev(64, 10000, gpu);
 BM_SparseXentDev(64, 30000, gpu);
 BM_SparseXentDev(64, 100000, gpu);
+#endif  // GOOGLE_CUDA
 
 // CPU
 BM_SparseXentDev(8, 1000000, cpu);
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index 872a6e9d1bcce09765d1531c5f2898b2badc66a7..bb9d18e915a5297a3561be1f3f6f2de338855d1b 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -140,9 +140,9 @@ void ReadCSVFileToComplexVectorOrDie(
       for (std::vector<string>::const_iterator j = parts.begin();
            j != parts.end(); ++j) {
         if (j->find_first_of("ij") != string::npos) {
-          strings::safe_strtod((*j).c_str(), &imaginary_part);
+          strings::safe_strtod(*j, &imaginary_part);
         } else {
-          strings::safe_strtod((*j).c_str(), &real_part);
+          strings::safe_strtod(*j, &real_part);
         }
       }
       data_line.push_back(std::complex<double>(real_part, imaginary_part));
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index a4a59dbcbc5af86e46f750bfd8b0f6decb755f5b..0f0ea7c7c73079c8261ce65ec161e7c116355c23 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -23,7 +23,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
+#include "tensorflow/core/kernels/split_lib_gpu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -56,6 +57,8 @@ TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
 TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
+TF_CALL_uint8(DEFINE_GPU_KERNELS);
+TF_CALL_bool(DEFINE_GPU_KERNELS);
 
 #undef DEFINE_GPU_KERNELS
 #define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;
@@ -74,9 +77,9 @@ namespace {
 template <typename T>
 __global__ void SplitOpKernel(const T* input, int32 prefix_dim_size,
                               int32 split_dim_size, int32 suffix_dim_size,
-                              CudaDeviceArrayStruct<T*> output_ptr_data) {
+                              GpuDeviceArrayStruct<T*> output_ptr_data) {
   const int32 num_split = output_ptr_data.size;
-  T** output_ptrs = GetCudaDeviceArrayOnDevice(&output_ptr_data);
+  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
 
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -111,11 +114,11 @@ __global__ void SplitOpKernel(const T* input, int32 prefix_dim_size,
 // is reversed
 template <typename T, typename IntType, bool useSmem>
 __global__ void split_v_kernel(const T* input_ptr,
-                               CudaDeviceArrayStruct<IntType> output_scan,
+                               GpuDeviceArrayStruct<IntType> output_scan,
                                IntType total_rows, IntType total_cols,
-                               CudaDeviceArrayStruct<T*> output_ptr_data) {
-  T** output_ptrs = GetCudaDeviceArrayOnDevice(&output_ptr_data);
-  IntType* col_scan = GetCudaDeviceArrayOnDevice(&output_scan);
+                               GpuDeviceArrayStruct<T*> output_ptr_data) {
+  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
+  IntType* col_scan = GetGpuDeviceArrayOnDevice(&output_scan);
 
   // do upper_bound on col to find which pointer we should be using
   IntType gidx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -167,11 +170,11 @@ __global__ void split_v_kernel(const T* input_ptr,
 // different from the original split implementation due to 2D vs 3D
 // dimensions.  This version is likely faster due to less integer math.
 template <typename T>
-__global__ void SplitVOpKernel_fixed(
-    const T* input, int32 prefix_dim_size, int32 suffix_dim_size,
-    CudaDeviceArrayStruct<T*> output_ptr_data) {
+__global__ void SplitVOpKernel_fixed(const T* input, int32 prefix_dim_size,
+                                     int32 suffix_dim_size,
+                                     GpuDeviceArrayStruct<T*> output_ptr_data) {
   const int32 num_split = output_ptr_data.size;
-  T** output_ptrs = GetCudaDeviceArrayOnDevice(&output_ptr_data);
+  T** output_ptrs = GetGpuDeviceArrayOnDevice(&output_ptr_data);
 
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -192,54 +195,52 @@ __global__ void SplitVOpKernel_fixed(
 }
 
 template <typename T>
-struct SplitOpGPULaunch {
-  void Run(const Eigen::GpuDevice& d, const T* input, int32 prefix_dim_size,
-           int32 split_dim_size, int32 suffix_dim_size,
-           const CudaDeviceArrayStruct<T*>& output_ptr_data) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(
-        prefix_dim_size * split_dim_size * suffix_dim_size, d);
-
-    SplitOpKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            input, prefix_dim_size, split_dim_size, suffix_dim_size,
-            output_ptr_data);
-  }
-};
+void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input,
+                              int32 prefix_dim_size, int32 split_dim_size,
+                              int32 suffix_dim_size,
+                              const GpuDeviceArrayStruct<T*>& output_ptr_data) {
+  CudaLaunchConfig config = GetCudaLaunchConfig(
+      prefix_dim_size * split_dim_size * suffix_dim_size, d);
+
+  TF_CHECK_OK(CudaLaunchKernel(SplitOpKernel<T>, config.block_count,
+                               config.thread_per_block, 0, d.stream(), input,
+                               prefix_dim_size, split_dim_size, suffix_dim_size,
+                               output_ptr_data));
+}
 
 template <typename T, typename IntType>
-struct SplitVOpGPULaunch {
-  void Run(const Eigen::GpuDevice& gpu_device, bool fixed_size,
-           const T* input_ptr, int total_rows, int total_cols,
-           const CudaDeviceArrayStruct<IntType>& output_scan,
-           const CudaDeviceArrayStruct<T*>& output_ptr_data) {
-    if (fixed_size) {
-      CudaLaunchConfig config =
-          GetCudaLaunchConfig(total_rows * total_cols, gpu_device);
-
-      SplitVOpKernel_fixed<T><<<config.block_count, config.thread_per_block, 0,
-                                gpu_device.stream()>>>(
-          input_ptr, total_rows, total_cols, output_ptr_data);
-    } else {
-      auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
-      IntType smem_max = gpu_device.sharedMemPerBlock();
-      IntType smem_usage = output_scan.size * sizeof(IntType);
-      // performance crossover is less than using maximum available shared
-      // memory on most processors possibly due to decreasing occupancy
-      // 4096 inputs is a lot, most code will take the smem path
-      const int32 kMaxSmemBytesPerformance = 16384;
-      if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
-        split_v_kernel<T, IntType, true>
-            <<<config.block_count, config.thread_per_block, smem_usage,
-               gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
-                                      total_cols, output_ptr_data);
-      else
-        split_v_kernel<T, IntType, false>
-            <<<config.block_count, config.thread_per_block, 0,
-               gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
-                                      total_cols, output_ptr_data);
-    }
+void SplitVOpGPULaunch<T, IntType>::Run(
+    const Eigen::GpuDevice& gpu_device, bool fixed_size, const T* input_ptr,
+    int total_rows, int total_cols,
+    const GpuDeviceArrayStruct<IntType>& output_scan,
+    const GpuDeviceArrayStruct<T*>& output_ptr_data) {
+  if (fixed_size) {
+    CudaLaunchConfig config =
+        GetCudaLaunchConfig(total_rows * total_cols, gpu_device);
+
+    SplitVOpKernel_fixed<T><<<config.block_count, config.thread_per_block, 0,
+                              gpu_device.stream()>>>(
+        input_ptr, total_rows, total_cols, output_ptr_data);
+  } else {
+    auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
+    IntType smem_max = gpu_device.sharedMemPerBlock();
+    IntType smem_usage = output_scan.size * sizeof(IntType);
+    // performance crossover is less than using maximum available shared
+    // memory on most processors possibly due to decreasing occupancy
+    // 4096 inputs is a lot, most code will take the smem path
+    const int32 kMaxSmemBytesPerformance = 16384;
+    if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance)
+      split_v_kernel<T, IntType, true>
+          <<<config.block_count, config.thread_per_block, smem_usage,
+             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
+                                    total_cols, output_ptr_data);
+    else
+      split_v_kernel<T, IntType, false>
+          <<<config.block_count, config.thread_per_block, 0,
+             gpu_device.stream()>>>(input_ptr, output_scan, total_rows,
+                                    total_cols, output_ptr_data);
   }
-};
+}
 
 #define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;
 
diff --git a/tensorflow/core/kernels/split_lib_gpu.h b/tensorflow/core/kernels/split_lib_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..20feb7df143722261d93745695edf66c9817b647
--- /dev/null
+++ b/tensorflow/core/kernels/split_lib_gpu.h
@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_
+
+#define EIGEN_USE_THREADS
+#define EIGEN_USE_GPU
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/gpu_device_array_gpu.h"
+#include "tensorflow/core/kernels/split_lib.h"
+
+namespace tensorflow {
+
+template <typename T>
+struct SplitOpGPULaunch {
+  void Run(const Eigen::GpuDevice& d, const T* input, int32 prefix_dim_size,
+           int32 split_dim_size, int32 suffix_dim_size,
+           const GpuDeviceArrayStruct<T*>& output_ptr_data);
+};
+
+template <typename T, typename IntType>
+struct SplitVOpGPULaunch {
+  void Run(const Eigen::GpuDevice& d, bool fixed, const T* input,
+           int total_cols, int total_rows,
+           const GpuDeviceArrayStruct<IntType>& output_scan,
+           const GpuDeviceArrayStruct<T*>& output_ptr_data);
+};
+
+// Explicit instantiations in split_lib_gpu.cu.cc.
+#define REGISTER_GPU_KERNEL(T)                        \
+  extern template struct SplitOpGPULaunch<T>;         \
+  extern template struct SplitVOpGPULaunch<T, int32>; \
+  extern template struct SplitVOpGPULaunch<T, int64>;
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
+TF_CALL_uint8(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPLIT_LIB_GPU_H_
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 11db72bfa3c66130783ad67f01c041a5d3d5085a..a419eedb39871f52bb24ab4d63b29a58a674a8fe 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,7 +29,8 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
-#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/kernels/gpu_device_array.h"
+#include "tensorflow/core/kernels/split_lib_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -267,13 +268,6 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
 #if GOOGLE_CUDA
 
-template <typename T>
-struct SplitOpGPULaunch {
-  void Run(const Eigen::GpuDevice& d, const T* input, int32 prefix_dim_size,
-           int32 split_dim_size, int32 suffix_dim_size,
-           const CudaDeviceArrayStruct<T*>& output_ptr_data);
-};
-
 // Partial specialization for GPU
 template <typename T>
 class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
@@ -308,7 +302,7 @@ class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
     TensorShape output_shape(input_shape);
     output_shape.set_dim(split_dim, split_dim_output_size);
 
-    CudaDeviceArrayOnHost<T*> ptrs(context, num_split);
+    GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
     OP_REQUIRES_OK(context, ptrs.Init());
 
     for (int i = 0; i < num_split; ++i) {
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 5c19a45fb18abdacb5f89f623f9690b43bdfa1e5..8e53089af0dacdbf30f4e6c812acf8dce166be1c 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <numeric>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -35,7 +35,8 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
-#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/kernels/gpu_device_array.h"
+#include "tensorflow/core/kernels/split_lib_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -329,14 +330,6 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
 #if GOOGLE_CUDA
 
-template <typename T, typename IntType>
-struct SplitVOpGPULaunch {
-  void Run(const Eigen::GpuDevice& d, bool fixed, const T* input,
-           int total_cols, int total_rows,
-           const CudaDeviceArrayStruct<IntType>& output_scan,
-           const CudaDeviceArrayStruct<T*>& output_ptr_data);
-};
-
 // Partial specialization for GPU
 template <typename T, typename Tlen>
 class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
@@ -373,10 +366,10 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
     // reshape to 2D
 
     if (num_split > 16) {
-      CudaDeviceArrayOnHost<T*> ptrs(context, num_split);
+      GpuDeviceArrayOnHost<T*> ptrs(context, num_split);
       OP_REQUIRES_OK(context, ptrs.Init());
 
-      CudaDeviceArrayOnHost<Tlen> offsets(context, num_split + 1);
+      GpuDeviceArrayOnHost<Tlen> offsets(context, num_split + 1);
       OP_REQUIRES_OK(context, offsets.Init());
 
       Tlen offset = 0;
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
index 2af6b4b8148807df9e1f7c0de65f664efe6acc79..033b9f34780a9fc8790d5aaa07501dd013f14750 100644
--- a/tensorflow/core/kernels/stack.cc
+++ b/tensorflow/core/kernels/stack.cc
@@ -244,9 +244,9 @@ void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
     DeviceContext* device_ctxt = ctx->op_device_context();
     auto device = static_cast<tensorflow::Device*>(ctx->device());
     Allocator* allocator = device->GetAllocator(alloc_attrs);
-    AllocatorStats stats;
-    allocator->GetStats(&stats);
-    if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    if (stats && *stats->bytes_limit &&
+        stats->bytes_in_use > (*stats->bytes_limit * kOccupancy)) {
       // Asynchronously copy the tensor from GPU to CPU memory.
       // TODO(yuanbyu): Swap the oldest tensor first.
       AllocatorAttributes host_alloc_attrs;
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 65174e163c1031d3e480159824f984e4bf83980b..925c9266395585ecfa4c215cf0a2d493f59a6086 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -216,7 +216,7 @@ class StageOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
@@ -249,7 +249,7 @@ class UnstageOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
@@ -284,7 +284,7 @@ class StagePeekOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(
     Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
@@ -314,7 +314,7 @@ class StageSizeOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
                         StageSizeOp);
 #endif
@@ -339,7 +339,7 @@ class StageClearOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_CPU), StageClearOp);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
index e5c1c258641a96db921c9c64a0797796f0727eaf..1312593d2a5de16bf1dcd3fb5afc92c24b0f1c69 100644
--- a/tensorflow/core/kernels/stateful_random_ops.cc
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -15,251 +15,230 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include <algorithm>
-#include <cmath>
-#include <memory>
-
-#include "absl/strings/str_join.h"
-#include "absl/types/variant.h"
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/dense_update_functor.h"
-#include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/kernels/random_op.h"
-#include "tensorflow/core/kernels/resource_variable_ops.h"
-#include "tensorflow/core/kernels/scatter_functor.h"
+#include "tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
-#include "tensorflow/core/kernels/variable_ops.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/refcount.h"
-#include "tensorflow/core/lib/random/random_distributions.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/util.h"
-#include "tensorflow/core/util/work_sharder.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
-using CPUDevice = Eigen::ThreadPoolDevice;
-
-using random::PhiloxRandom;
-
-namespace {
-
-// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
-// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
-// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
-using StateElementType = int64;
-static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
-
-using Algorithm = StateElementType;
-static constexpr Algorithm RNG_ALG_PHILOX = 1;
-
-using SkippableRNG = absl::variant<PhiloxRandom>;
-
-// This function is for hiding the implementation detail about the
-// `absl::variant` index of each algorithm.
-Algorithm GetAlgorithm(SkippableRNG const& rng) {
-  auto idx = rng.index();
-  if (idx == 0) {
-    return RNG_ALG_PHILOX;
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<CPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const CPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnrefVar* state_var_guard, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data) {
+    auto state_tensor_flat = state_tensor->flat<StateElementType>();
+    auto state_data = state_tensor_flat.data();
+    // Delegates to PhiloxRandom to do the actual increasing.
+    auto philox = GetPhiloxRandomFromMem(state_data + alg_tag_skip);
+    UpdateMemWithPhiloxRandom(philox, output_size, state_data + alg_tag_skip);
+    // No longer needs the lock.
+    state_var_guard->Release();
+    functor::FillPhiloxRandom<CPUDevice, Distribution>()(
+        ctx, device, philox, output_data, output_size, Distribution());
   }
-  // unreachable
-  return RNG_ALG_PHILOX;
-}
+};
 
-// Fills a buffer with random numbers sampled from a given distribution.
-template <class Device, class Distribution>
-Status FillRandom(OpKernelContext* ctx, const Device& device,
-                  SkippableRNG const& gen, int64 size, Distribution dist,
-                  typename Distribution::ResultElementType* data) {
-  auto algorithm = GetAlgorithm(gen);
-  if (algorithm == RNG_ALG_PHILOX) {
-    auto philox = absl::get<PhiloxRandom>(gen);
-    functor::FillPhiloxRandom<Device, Distribution>()(ctx, device, philox, data,
-                                                      size, dist);
-    return Status::OK();
-  } else {
-    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
-    return Status::OK();
+template <typename Device, typename Distribution>
+Status UpdateVariableAndFill(
+    OpKernelContext* ctx, int state_input_idx, bool read_alg_from_state,
+    Algorithm alg, int64 output_size,
+    typename Distribution::ResultElementType* output_data) {
+  Var* var = nullptr;
+  TF_RETURN_IF_ERROR(
+      LookupResource(ctx, HandleFromInput(ctx, state_input_idx), &var));
+  // Use `ScopedUnlockUnrefVar` here instead of `mutex_lock` and `ScopedUnref`
+  // because the former supports early releasing which is needed by
+  // `UpdateVariableAndFill_Philox<CPU>` to avoid holding the lock while
+  // filling.
+  ScopedUnlockUnrefVar state_var_guard(var);
+  Tensor* var_tensor = var->tensor();
+  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
+    return errors::InvalidArgument("dtype of RNG state variable must be ",
+                                   DataTypeString(STATE_ELEMENT_DTYPE),
+                                   ", not ",
+                                   DataTypeString(var_tensor->dtype()));
   }
-}
-
-// The following two functions use the contract "lower 32 bits for the first
-// uint32, higher 32 bits for the second". Note that this is endian-neutral,
-// unlike a direct memory copy `memcpy(output, &input, 8)`.
-void Int64ToUint32s(int64 input, uint32* output1, uint32* output2) {
-  auto u64 = static_cast<uint64>(input);
-  *output1 = static_cast<uint32>(u64);
-  *output2 = static_cast<uint32>(u64 >> 32);
-}
-
-int64 Uint32sToInt64(uint32 input1, uint32 input2) {
-  auto u64_1 = static_cast<uint64>(input1);
-  auto u64_2 = static_cast<uint64>(input2);
-  return static_cast<int64>(u64_1 | (u64_2 << 32));
-}
-
-void GetPhiloxStateFromTensor(Tensor const& tensor,
-                              PhiloxRandom::ResultType* counter,
-                              PhiloxRandom::Key* key) {
-  auto tensor_flat = tensor.flat<StateElementType>();
-  auto tensor_ptr = tensor_flat.data();
-  // tensor_ptr's index is added by 1 to skip the algorithm tag.
-  Int64ToUint32s(tensor_ptr[1], &(*counter)[0], &(*counter)[1]);
-  Int64ToUint32s(tensor_ptr[2], &(*counter)[2], &(*counter)[3]);
-  Int64ToUint32s(tensor_ptr[3], &(*key)[0], &(*key)[1]);
-}
-
-void WritePhiloxStateToTensor(PhiloxRandom::ResultType const& counter,
-                              PhiloxRandom::Key const& key, Tensor* tensor) {
-  auto tensor_flat = tensor->flat<StateElementType>();
-  auto tensor_ptr = tensor_flat.data();
-  // tensor_ptr's index is added by 1 to skip the algorithm tag.
-  tensor_ptr[1] = Uint32sToInt64(counter[0], counter[1]);
-  tensor_ptr[2] = Uint32sToInt64(counter[2], counter[3]);
-  tensor_ptr[3] = Uint32sToInt64(key[0], key[1]);
-}
-
-// A helper function that does the actual work for
-// 'MakeRNGCopyAndUpdateVariable'.
-template <typename Device>
-Status GetRNGCopyAndUpdateTensor(Tensor* tensor, int64 delta,
-                                 SkippableRNG* rng_copy);
-
-template <>
-Status GetRNGCopyAndUpdateTensor<CPUDevice>(Tensor* tensor, int64 delta,
-                                            SkippableRNG* rng_copy) {
-  // The dtype of `tensor` should be `StateElementType` and the first element
-  // is the algorithm.
-  if (tensor->dims() != 1) {
+  if (var_tensor->dims() != 1) {
     return errors::InvalidArgument(
-        "RNG state must have one and only one dimension, not ", tensor->dims());
+        "RNG state must have one and only one dimension, not ",
+        var_tensor->dims());
   }
-  auto tensor_flat = tensor->flat<StateElementType>();
-  if (tensor_flat.size() < 1) {
-    return errors::InvalidArgument("Size of tensor must be at least 1");
+  auto var_tensor_flat = var_tensor->flat<StateElementType>();
+  int64 alg_tag_skip = 0;
+  if (read_alg_from_state) {
+    alg_tag_skip = 1;
+    if (var_tensor_flat.size() < 1) {
+      return errors::InvalidArgument("Size of tensor must be at least 1");
+    }
+    alg = var_tensor_flat(0);
   }
-  auto algorithm = tensor_flat.data()[0];
-  if (algorithm == RNG_ALG_PHILOX) {
-    // Delegates to PhiloxRandom to do the actual increasing.
+  if (alg == RNG_ALG_PHILOX) {
     static_assert(std::is_same<StateElementType, int64>::value,
                   "StateElementType must be int64");
     static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
                   "PhiloxRandom::ResultElementType must be uint32");
-    auto counter_size = PhiloxRandom::ResultType::kElementCount;
-    auto key_size = PhiloxRandom::Key::kElementCount;
-    auto min_tensor_size = 1 + (counter_size + key_size) / 2;
-    if (tensor_flat.size() < min_tensor_size) {
+    if (var_tensor_flat.size() < alg_tag_skip + PHILOX_MIN_STATE_SIZE) {
       return errors::InvalidArgument(
-          "For Philox algorithm, the size of state"
+          "For the Philox algorithm, the size of state"
           " must be at least ",
-          min_tensor_size, "; got ", tensor_flat.size());
+          alg_tag_skip + PHILOX_MIN_STATE_SIZE, "; got ",
+          var_tensor_flat.size());
     }
-    PhiloxRandom::ResultType counter;
-    PhiloxRandom::Key key;
-    GetPhiloxStateFromTensor(*tensor, &counter, &key);
-    PhiloxRandom philox(counter, key);
-    auto old_philox = philox;
-    philox.Skip(delta);  // do the actual increasing
-    WritePhiloxStateToTensor(philox.counter(), philox.key(), tensor);
-    *rng_copy = SkippableRNG(old_philox);
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
+        ctx, var_tensor, var->copy_on_read_mode.load()));
+    UpdateVariableAndFill_Philox<Device, Distribution>()(
+        ctx, ctx->eigen_device<Device>(), output_size, alg_tag_skip,
+        &state_var_guard, var_tensor, output_data);
     return Status::OK();
   } else {
-    // return errors::InvalidArgument("Unsupported algorithm id: ", algorithm);
-    *rng_copy = SkippableRNG(PhiloxRandom());
-    return Status::OK();
+    return errors::InvalidArgument("Unsupported algorithm id: ", alg);
   }
 }
 
-// Gets a copy of the RNG and updates the variable. The copy can be used to
-// generate upto 'samples' random numbers, and the variable is updated as if
-// 'samples' random numbers have been generated (e.g. if the variable is a
-// counnter, the counter is increased by 'samples').
-template <class Device>
-Status MakeRNGCopyAndUpdateVariable(OpKernelContext* ctx, int input_idx,
-                                    int64 samples, SkippableRNG* rng_copy) {
-  Var* var = nullptr;
-  TF_RETURN_IF_ERROR(
-      LookupResource(ctx, HandleFromInput(ctx, input_idx), &var));
-  core::ScopedUnref s(var);
-  mutex_lock ml(*var->mu());
-  Tensor* var_tensor = var->tensor();
-  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
-    return errors::InvalidArgument("dtype of RNG state variable must be ",
-                                   DataTypeString(STATE_ELEMENT_DTYPE),
-                                   ", not ",
-                                   DataTypeString(var_tensor->dtype()));
-  }
-  TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
-      ctx, var_tensor, var->copy_on_read_mode.load()));
-  TF_RETURN_IF_ERROR(
-      GetRNGCopyAndUpdateTensor<Device>(var_tensor, samples, rng_copy));
-  return Status::OK();
+// Preconditon: input(0) is an existing resource.
+template <typename Device, class Distribution>
+void ComputeImpl(OpKernelContext* ctx, int state_input_idx, int shape_input_idx,
+                 bool read_alg_from_state, Algorithm alg) {
+  using T = typename Distribution::ResultElementType;
+  const Tensor& shape_t = ctx->input(shape_input_idx);
+  TensorShape shape;
+  OP_REQUIRES_OK(ctx, ctx->op_kernel().MakeShape(shape_t, &shape));
+  Tensor* output;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
+  auto output_flat = output->flat<T>();
+  OP_REQUIRES_OK(ctx, UpdateVariableAndFill<Device, Distribution>(
+                          ctx, state_input_idx, read_alg_from_state, alg,
+                          output_flat.size(), output_flat.data()));
 }
 
 template <typename Device, class Distribution>
 class StatefulRandomOp : public OpKernel {
  public:
-  using T = typename Distribution::ResultElementType;
   explicit StatefulRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
-  // Assumes that input(0) is an existing resource.
   void Compute(OpKernelContext* ctx) override {
-    const Tensor& shape_t = ctx->input(1);
-    Tensor* output;
+    ComputeImpl<Device, Distribution>(ctx, 0, 1, true, 0);
+  }
+};
+
+template <typename Device, class Distribution>
+class StatefulRandomOpV2 : public OpKernel {
+ public:
+  explicit StatefulRandomOpV2(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& alg_tensor = ctx->input(1);
+    OP_REQUIRES(ctx, alg_tensor.dims() == 0,
+                errors::InvalidArgument("algorithm must be of shape [], not ",
+                                        alg_tensor.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, alg_tensor.dtype() == ALGORITHM_DTYPE,
+        errors::InvalidArgument("algorithm's dtype must be ",
+                                DataTypeString(ALGORITHM_DTYPE), ", not ",
+                                DataTypeString(alg_tensor.dtype())));
+    auto alg = alg_tensor.flat<Algorithm>()(0);
+    ComputeImpl<Device, Distribution>(ctx, 0, 2, false, alg);
+  }
+};
+
+template <typename T>
+class NonDeterministicIntsOp : public OpKernel {
+ public:
+  explicit NonDeterministicIntsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape_t = ctx->input(0);
     TensorShape shape;
-    OP_REQUIRES_OK(ctx, MakeShape(shape_t, &shape));
+    OP_REQUIRES_OK(ctx, ctx->op_kernel().MakeShape(shape_t, &shape));
+    Tensor* output;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
     if (shape.num_elements() == 0) return;
 
-    auto output_flat = output->flat<T>();
-    SkippableRNG rng;
-    // Multiplier 256 is the same as in FillPhiloxRandomTask; do not change
-    // it just here.
-    OP_REQUIRES_OK(ctx, MakeRNGCopyAndUpdateVariable<Device>(
-                            ctx, 0, output_flat.size() * 256, &rng));
-    // Fill in the random numbers
-    OP_REQUIRES_OK(ctx, FillRandom(ctx, ctx->eigen_device<Device>(), rng,
-                                   output_flat.size(), Distribution(),
-                                   output_flat.data()));
+    switch (dtype_) {
+      case DT_INT32:
+      case DT_UINT32:
+      case DT_INT64:
+      case DT_UINT64: {
+        auto output_flat = output->flat<T>();
+        auto data = output_flat.data();
+        for (int64 i = 0; i < output_flat.size(); ++i) {
+          data[i] = static_cast<T>(random::New64());
+        }
+        break;
+      }
+      default:
+        OP_REQUIRES(ctx, false,
+                    errors::InvalidArgument("Unsupported dtype: ",
+                                            DataTypeString(dtype_)));
+    }
   }
-};
 
-}  // namespace
+ private:
+  DataType dtype_;
+};
 
 // So far the 'Distribution' type parameter is only used when the algorithm is
 // philox, so 'NormalDistribution<PhiloxRandom, ...>' is fine for now.
 #define REGISTER(DEVICE, TYPE)            \
   REGISTER_KERNEL_BUILDER(                \
-      Name("StatefulStandardNormal")      \
+      Name("StatefulStandardNormalV2")    \
           .Device(DEVICE_##DEVICE)        \
           .HostMemory("resource")         \
+          .HostMemory("algorithm")        \
+          .HostMemory("shape")            \
+          .TypeConstraint<TYPE>("dtype"), \
+      StatefulRandomOpV2<DEVICE##Device,  \
+                         random::NormalDistribution<PhiloxRandom, TYPE> >);
+
+// CPU also has the old 'StatefulStandardNormal' op for backward compatibility.
+#define REGISTER_CPU(TYPE)                \
+  REGISTER(CPU, TYPE)                     \
+  REGISTER_KERNEL_BUILDER(                \
+      Name("StatefulStandardNormal")      \
+          .Device(DEVICE_CPU)             \
+          .HostMemory("resource")         \
           .HostMemory("shape")            \
           .TypeConstraint<TYPE>("dtype"), \
-      StatefulRandomOp<DEVICE##Device,    \
+      StatefulRandomOp<CPUDevice,         \
                        random::NormalDistribution<PhiloxRandom, TYPE> >);
 
-#define REGISTER_CPU(TYPE) REGISTER(CPU, TYPE)
+#define REGISTER_GPU(TYPE) REGISTER(GPU, TYPE)
 
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 
+#if GOOGLE_CUDA
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+
+#endif  // GOOGLE_CUDA
+
+#undef REGISTER_GPU
 #undef REGISTER_CPU
 #undef REGISTER
 
+#define REGISTER_NonDeterministicInts(TYPE)                   \
+  REGISTER_KERNEL_BUILDER(Name("NonDeterministicInts")        \
+                              .Device(DEVICE_CPU)             \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<TYPE>("dtype"), \
+                          NonDeterministicIntsOp<TYPE>);
+
+TF_CALL_int32(REGISTER_NonDeterministicInts);
+TF_CALL_uint32(REGISTER_NonDeterministicInts);
+TF_CALL_int64(REGISTER_NonDeterministicInts);
+TF_CALL_uint64(REGISTER_NonDeterministicInts);
+
+#undef REGISTER_NonDeterministicInts
+
 // TODO(wangpeng): Add RNG ops for other distributions.
-// TODO(wangpeng): Add support for GPU and XLA.
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateful_random_ops.h b/tensorflow/core/kernels/stateful_random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..25d0ce7dfe528ea14a297c257c3d73e147a41e54
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops.h
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+
+// #include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+
+namespace tensorflow {
+
+// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
+// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
+// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
+using StateElementType = int64;
+static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
+
+using Algorithm = StateElementType;
+static constexpr DataType ALGORITHM_DTYPE = STATE_ELEMENT_DTYPE;
+static constexpr Algorithm RNG_ALG_PHILOX = 1;
+static constexpr Algorithm RNG_ALG_THREEFRY = 2;
+
+using random::PhiloxRandom;
+
+static constexpr int64 PHILOX_MIN_STATE_SIZE =
+    (PhiloxRandom::ResultType::kElementCount +
+     PhiloxRandom::Key::kElementCount) /
+    2;
+static constexpr int64 THREEFRY_MIN_STATE_SIZE = 2;
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
diff --git a/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h b/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..8dc72de36587b2b61cad2c9eb3f4f215990640d8
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_
+
+#include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+
+namespace tensorflow {
+
+// The following 5 functions are made templates to avoid duplicate symbols when
+// linking.
+
+// The following 2 functions use the contract "lower 32 bits for the first
+// uint32, higher 32 bits for the second". Note that this is endian-neutral,
+// unlike a direct memory copy `memcpy(output, &input, 8)`.
+PHILOX_DEVICE_INLINE void Int64ToUint32s(int64 input, uint32* output1,
+                                         uint32* output2) {
+  auto u64 = static_cast<uint64>(input);
+  *output1 = static_cast<uint32>(u64);
+  *output2 = static_cast<uint32>(u64 >> 32);
+}
+
+PHILOX_DEVICE_INLINE int64 Uint32sToInt64(uint32 input1, uint32 input2) {
+  auto u64_1 = static_cast<uint64>(input1);
+  auto u64_2 = static_cast<uint64>(input2);
+  return static_cast<int64>(u64_1 | (u64_2 << 32));
+}
+
+PHILOX_DEVICE_INLINE PhiloxRandom
+GetPhiloxRandomFromMem(StateElementType const* ptr) {
+  PhiloxRandom::ResultType counter;
+  PhiloxRandom::Key key;
+  Int64ToUint32s(ptr[0], &counter[0], &counter[1]);
+  Int64ToUint32s(ptr[1], &counter[2], &counter[3]);
+  Int64ToUint32s(ptr[2], &key[0], &key[1]);
+  return PhiloxRandom(counter, key);
+}
+
+PHILOX_DEVICE_INLINE void WritePhiloxRandomToMem(PhiloxRandom const& philox,
+                                                 StateElementType* ptr) {
+  PhiloxRandom::ResultType const& counter = philox.counter();
+  PhiloxRandom::Key const& key = philox.key();
+  ptr[0] = Uint32sToInt64(counter[0], counter[1]);
+  ptr[1] = Uint32sToInt64(counter[2], counter[3]);
+  ptr[2] = Uint32sToInt64(key[0], key[1]);
+}
+
+PHILOX_DEVICE_INLINE void UpdateMemWithPhiloxRandom(PhiloxRandom const& philox,
+                                                    int64 output_size,
+                                                    StateElementType* ptr) {
+  auto new_philox = philox;
+  // Multiplier 256 is the same as in `FillPhiloxRandomTask`; do not change
+  // it just here.
+  auto delta = output_size * 256;
+  new_philox.Skip(delta);  // do the actual increasing
+  WritePhiloxRandomToMem(new_philox, ptr);
+}
+
+// A per-device helper function that does the actual work for
+// `UpdateVariableAndFill`.
+// Reason to use functor: C++ doesn't allow function-template partial
+// specialization.
+template <typename Device, typename Distribution>
+struct UpdateVariableAndFill_Philox;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+#if GOOGLE_CUDA
+
+using GPUDevice = Eigen::GpuDevice;
+
+// Declares the partially GPU-specialized functor struct.
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<GPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const GPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnrefVar* not_used, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data);
+};
+
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_CPU_GPU_H_
diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99ce3e677d82563f3ba5b4c66e58c7c50a3ba3ff
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
@@ -0,0 +1,97 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/random_op_gpu.h"
+#include "tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
+
+namespace tensorflow {
+
+using random::PhiloxRandom;
+
+__device__ int thread_counter;
+
+template <typename Distribution>
+__global__ void FillKernel(
+    Distribution dist, int64 state_size, int64 output_size,
+    StateElementType* state_data,
+    typename Distribution::ResultElementType* output_data) {
+  // Threads in this block share `philox`. Thread 0 is responsible for
+  // initializing it.
+  __shared__ char philox_raw[sizeof(PhiloxRandom)];
+  auto philox = reinterpret_cast<PhiloxRandom*>(philox_raw);
+  if (threadIdx.x == 0) {
+    *philox = GetPhiloxRandomFromMem(state_data);
+  }
+  __syncthreads();
+  functor::FillPhiloxRandomKernel<Distribution,
+                                  Distribution::kVariableSamplesPerOutput>()
+      .Run(*philox, output_data, output_size, dist);
+  // The last thread updates the state.
+  auto total_thread_count = gridDim.x * blockDim.x;
+  auto old_counter_value = atomicAdd(&thread_counter, 1);
+  if (old_counter_value == total_thread_count - 1) {
+    UpdateMemWithPhiloxRandom(*philox, output_size, state_data);
+  }
+}
+
+template <typename Distribution>
+void UpdateVariableAndFill_Philox<GPUDevice, Distribution>::operator()(
+    OpKernelContext* ctx, const GPUDevice& d, int64 output_size,
+    int64 alg_tag_skip, ScopedUnlockUnrefVar* not_used, Tensor* state_tensor,
+    typename Distribution::ResultElementType* output_data) {
+  OP_REQUIRES(
+      ctx, alg_tag_skip == 0,
+      errors::InvalidArgument(
+          "GPU kernel doesn't support reading algorithm from state variable, "
+          "so alg_tag_skip must be 0; got",
+          alg_tag_skip));
+  auto state_tensor_flat = state_tensor->flat<StateElementType>();
+  auto state_size = state_tensor_flat.size();
+  auto state_data = state_tensor_flat.data();
+
+  // maximize occupancy
+  const int kGroupSize = Distribution::kResultElementCount;
+  int work_element_count = (output_size + kGroupSize - 1) / kGroupSize;
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d,
+                                             FillKernel<Distribution>, 0, 0);
+
+  int zero = 0;
+  cudaMemcpyToSymbol(thread_counter, &zero, sizeof(int));
+  TF_CHECK_OK(CudaLaunchKernel(FillKernel<Distribution>, cfg.block_count,
+                               cfg.thread_per_block, 0, d.stream(),
+                               Distribution(), state_size, output_size,
+                               state_data, output_data));
+}
+
+// Explicit instantiation of the GPU distributions functors.
+
+// clang-format off
+// NVCC cannot handle ">>" properly
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, Eigen::half> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
+// clang-format on
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 925f5291a68327c9fd939fd06fc025b58ab436ee..959334abc81d70bc854d2026d9eba99a2a01850d 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/random_op.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 6db68f937def6fb4827b7fc85bff873b651a0002..4f53b4a28d22d9114e1db05773dfabf84dd7aac5 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -27,10 +27,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op_impl.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
@@ -123,11 +123,8 @@ class StridedSliceOp : public OpKernel {
                       "Input must have rank at least 1, got: ", input.dims()));
       // Otherwise, is_identity should be true.
       VLOG(1) << "Strided slice dim 0: " << input.shape().DebugString();
-      OP_REQUIRES(
-          context, begin[0] <= end[0],
-          errors::InvalidArgument("begin[0] (", begin[0],
-                                  ") must less or equal to end[0] (", end[0]));
-      Tensor slice = input.Slice(begin[0], end[0]);
+      // To tolerate begin[0] > end[0] (a 0-output slice), we min(begin, end).
+      Tensor slice = input.Slice(std::min(begin[0], end[0]), end[0]);
       Tensor tmp;
       OP_REQUIRES(context, tmp.CopyFrom(slice, final_shape),
                   errors::Internal("Copy failed"));
diff --git a/tensorflow/core/platform/default/protobuf_compiler.h b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
similarity index 59%
rename from tensorflow/core/platform/default/protobuf_compiler.h
rename to tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
index a93d7a184b21a1111764e0a7fc0765ebe877ce32..8c3f8f2ad30a56fb4c03105a20d0a7ebc692ec25 100644
--- a/tensorflow/core/platform/default/protobuf_compiler.h
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+#if GOOGLE_CUDA
 
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf_compiler.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf_compiler.h
+#define EIGEN_USE_GPU
 
-#include "google/protobuf/compiler/importer.h"
-#include "tensorflow/core/platform/default/protobuf.h"
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
 
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
+namespace tensorflow {
+TF_CALL_bool(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6951924655a8fcd2b3c400b6e1b76f2d8e49270
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
similarity index 90%
rename from tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
rename to tensorflow/core/kernels/strided_slice_op_gpu_impl.h
index cce1d2fddde7edc0283c524269de9464c2602e25..d70f369ac07a3c605ca90c5ba1e6198525dc1206 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -50,16 +53,8 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::StridedSliceAssign<GPUDevice, T, 6>; \
   template struct functor::StridedSliceAssign<GPUDevice, T, 7>; \
   template struct functor::StridedSliceAssignScalar<GPUDevice, T>;
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
-TF_CALL_complex64(DEFINE_GPU_KERNELS);
-TF_CALL_complex128(DEFINE_GPU_KERNELS);
-TF_CALL_int64(DEFINE_GPU_KERNELS);
-TF_CALL_bool(DEFINE_GPU_KERNELS);
-TF_CALL_int8(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(int32);
-
-#undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07dd0130adc73512df10bf2e95ce580794262c68
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_int8(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..149886308cdf4ec8e9e9187db349e51c57e408b8
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index c4205159c380cb0a78085f87deb760bd4a8c9791..d9b62d4c75486d61f28c0cd9bc3b44206a0689a4 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -22,13 +22,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
index 70dbd15c46cb341d8ad6ed6013b5b9ff8a5d61da..22742dd38e5d56bf3b9970bf6b01ff734f181169 100644
--- a/tensorflow/core/kernels/string_to_number_op.cc
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -51,7 +51,7 @@ class StringToNumberOp : public OpKernel {
     for (int i = 0; i < input_flat.size(); ++i) {
       OP_REQUIRES(
           context,
-          strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
+          strings::SafeStringToNumeric<OutputType>(input_flat(i),
                                                    &output_flat(i)),
           errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
     }
diff --git a/tensorflow/core/kernels/string_view_variant_wrapper.cc b/tensorflow/core/kernels/string_view_variant_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b576eb4a3e63863d666bd325d0276039727e38c5
--- /dev/null
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
+
+namespace tensorflow {
+
+constexpr const char StringViewVariantWrapper::kTypeName[];
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_view_variant_wrapper.h b/tensorflow/core/kernels/string_view_variant_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc4a8e953489500d1967a6899ae9a003edacf0f9
--- /dev/null
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+
+namespace tensorflow {
+
+// A wrapper class for storing an `absl::string_view` instance in a DT_VARIANT
+// tensor.
+class StringViewVariantWrapper {
+ public:
+  static constexpr const char kTypeName[] =
+      "tensorflow::StringViewVariantWrapper";
+
+  using value_type = absl::string_view;
+
+  StringViewVariantWrapper() = default;
+
+  explicit StringViewVariantWrapper(absl::string_view str_view)
+      : str_view_(str_view) {}
+
+  StringViewVariantWrapper(const StringViewVariantWrapper& other)
+      : str_view_(other.str_view_) {}
+
+  const absl::string_view* get() const { return &str_view_; }
+
+  static string TypeName() { return kTypeName; }
+
+  string DebugString() const { return string(str_view_); }
+
+  void Encode(VariantTensorData* data) const {
+    data->add_tensor(string(str_view_));
+  }
+
+  // Decode assumes that the source VariantTensorData will have a longer
+  // lifetime than this StringViewVariantWrapper.
+  bool Decode(const VariantTensorData& data) {
+    if (data.tensors_size() != 1 || data.tensors(0).dtype() != DT_STRING) {
+      return false;
+    }
+    str_view_ = data.tensors(0).scalar<string>()();
+    return true;
+  }
+
+ private:
+  absl::string_view str_view_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 93c427039dd6e0a7984ee58e51479fdff48937bb..77b16b9384de1bfe8956ff7aa89e2bd8fda35d86 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 5e3465d1dd6ce24a82525704f5223b6d9f0ac39f..d33c0cdb7f01a4d11204d20fd020941d544c45ee 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -53,6 +53,7 @@ class CreateSummaryFileWriterOp : public OpKernel {
                                   max_queue, flush_millis, logdir,
                                   filename_suffix, ctx->env(), s);
                             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -89,6 +90,7 @@ class CreateSummaryDbWriterOp : public OpKernel {
                   db, experiment_name, run_name, user_name, ctx->env(), s));
               return Status::OK();
             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 8c3a58b108abe66f2b61b5153923bee192246cd1..9e308cfc0237aeb64754c81595e17ff6a06c16a5 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -93,9 +93,48 @@ class SvdOpGpu : public AsyncOpKernel {
   }
 
   void RunSVD(OpKernelContext* context, DoneCallback done, int64 m, int64 n,
-              int64 p, int64 batch_size, Scalar* input_ptr,
-              RealScalar* outputS_ptr, Scalar* outputU_ptr,
-              Scalar* outputVT_ptr, int* dev_info_ptr, CudaSolver* solver) {
+              int64 p, Tensor& M_copy, Tensor* S, Tensor* U, Tensor* V,
+              std::unique_ptr<CudaSolver> solver) {
+    // Compute U S V* = M.
+    // 1. cuSolver works in column-major rather than row-major.
+    // 2. Gesvd returns V*.
+    // 3. Hence M should be transposed before input and U (rather than V) should
+    // be transposed on output.
+
+    Tensor u_copy;
+    if (compute_uv_) {
+      TensorShape u_shape;
+      if (full_matrices_) {
+        u_shape = U->shape();
+      } else {
+        TensorShape shapeRaw = M_copy.shape();
+        shapeRaw.RemoveLastDims(2);
+        u_shape = shapeRaw;
+        u_shape.AddDim(p);
+        u_shape.AddDim(m);
+      }
+      OP_REQUIRES_OK_ASYNC(
+          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
+          done);
+    }
+
+    // get the pointers to the data
+    Scalar* input_ptr;
+    RealScalar* outputS_ptr;
+    Scalar* outputU_ptr = NULL;
+    Scalar* outputV_ptr = NULL;
+    auto input_reshaped = M_copy.template flat_inner_dims<Scalar, 3>();
+    input_ptr = input_reshaped.data();
+    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
+    if (compute_uv_) {
+      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
+      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
+    }
+    const int64 batch_size = input_reshaped.dimension(0);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
+    int* dev_info_ptr = dev_info.back().mutable_data();
+
     // Save the input matrix
     // Needed for the n=1 fix, see below, since SVD destroys the input
     Tensor input_copy;
@@ -121,12 +160,12 @@ class SvdOpGpu : public AsyncOpKernel {
       if (compute_uv_) {
         if (full_matrices_) {
           outputU = outputU_ptr + batch * m * m;
-          outputVT = outputVT_ptr + batch * n * n;
+          outputVT = outputV_ptr + batch * n * n;
           jobu = 'A';
           jobvt = 'A';
         } else {
           outputU = outputU_ptr + batch * m * p;
-          outputVT = outputVT_ptr + batch * n * p;
+          outputVT = outputV_ptr + batch * n * p;
           jobu = 'S';
           jobvt = 'S';
         }
@@ -155,17 +194,24 @@ class SvdOpGpu : public AsyncOpKernel {
     if (compute_uv_ && n == 1) {
       // 1. compute the (batched) sum
       const GPUDevice& d = context->eigen_device<GPUDevice>();
-      d.memset(outputVT_ptr, 0, batch_size * sizeof(Scalar));
+      d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
       Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
       ComputeValueOfVKernel<<<cfg2D.block_count, cfg2D.thread_per_block, 0,
                               d.stream()>>>(
           cfg2D, m, full_matrices_ ? m : p, input_copy.flat<Scalar>().data(),
-          outputU_ptr, outputS_ptr, outputVT_ptr);
+          outputU_ptr, outputS_ptr, outputV_ptr);
       // 2. clamp V to -1 or +1
       CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d);
       ExtractSignOfVKernel<<<cfg1D.block_count, cfg1D.thread_per_block, 0,
-                             d.stream()>>>(cfg1D, outputVT_ptr);
+                             d.stream()>>>(cfg1D, outputV_ptr);
     }
+
+    if (compute_uv_) {
+      auto device = context->eigen_device<GPUDevice>();
+      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
+    }
+
+    CheckResult(context, std::move(done), dev_info, std::move(solver));
   }
 
   void CheckResult(OpKernelContext* context, DoneCallback done,
@@ -192,10 +238,9 @@ class SvdOpGpu : public AsyncOpKernel {
   void PerformSVD_MgeqN(OpKernelContext* context, DoneCallback done, int64 m,
                         int64 n, int64 p, const Tensor& M, Tensor* S, Tensor* U,
                         Tensor* V) {
+    // Transpose M, because cuSolver expects it to be column-major
     TensorShape shapeRaw = M.shape();
     shapeRaw.RemoveLastDims(2);
-
-    // Transpose M, because cuSolver expects it to be column-major
     TensorShape input_shape = shapeRaw;
     input_shape.AddDim(n);
     input_shape.AddDim(m);
@@ -210,58 +255,16 @@ class SvdOpGpu : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, M, &input_copy),
                          done);
 
-    // I need to transpose U at the end
-    // Not V, because cuSolver work column-major
-    Tensor u_copy;
-    if (compute_uv_) {
-      TensorShape u_shape;
-      if (full_matrices_) {
-        u_shape = U->shape();
-      } else {
-        u_shape = shapeRaw;
-        u_shape.AddDim(p);
-        u_shape.AddDim(m);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    RunSVD(context, done, m, n, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose U
-    if (compute_uv_) {
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute U S V* = M.
+    RunSVD(context, done, m, n, p, input_copy, S, U, V, std::move(solver));
   }
 
   // The SVD if m < n
   void PerformSVD_MlessN(OpKernelContext* context, DoneCallback done, int64 m,
                          int64 n, int64 p, const Tensor& M, Tensor* S,
                          Tensor* U, Tensor* V) {
-    // Perform the SVD on M'
+    // Perform the SVD on M'. cuSolver works column major so don't need to
+    // transpose M.
 
     // Reuse the input buffer or make a copy for the SVD depending on whether
     // this op owns the input buffer exclusively. This is needed because the
@@ -281,55 +284,8 @@ class SvdOpGpu : public AsyncOpKernel {
                M.NumElements() * sizeof(Scalar));
     }
 
-    // I need to transpose V at the end
-    Tensor v_copy;
-    if (compute_uv_) {
-      TensorShape v_shape;
-      if (full_matrices_) {
-        v_shape = V->shape();
-      } else {
-        TensorShape shapeRaw = M.shape();
-        shapeRaw.RemoveLastDims(2);
-        v_shape = shapeRaw;
-        v_shape.AddDim(p);
-        v_shape.AddDim(n);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(V->dtype(), v_shape, &v_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      // Note that U and V are flipped
-      outputU_ptr = v_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = U->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    // Note that m and n are flipped
-    RunSVD(context, done, n, m, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose V
-    if (compute_uv_) {
-      auto device = context->eigen_device<GPUDevice>();
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, v_copy, V), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute V S U* = M*.
+    RunSVD(context, done, n, m, p, input_copy, S, V, U, std::move(solver));
   }
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec..129035638ab0e3d427a3fa55e1de0ded7e07a85c 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include <numeric>  // clang-format off
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/kernels/tensor_array.h"
diff --git a/tensorflow/core/kernels/tensor_flag_utils.cc b/tensorflow/core/kernels/tensor_flag_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f91927298078168a78144c361f50661c54c096
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.cc
@@ -0,0 +1,187 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+Status ValidateSparseMatrixShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 3) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 3 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (0 > config_matrix(i, 1)) {
+      return errors::InvalidArgument(
+          "Second column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 2) && config_matrix(i, 2) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 2), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key.first >= config_mat(i, 0) && key.second >= config_mat(i, 1)) {
+      return config_mat(i, 2);
+    }
+  }
+  return config_mat(last_row_index, 2);
+}
+
+Status ValidateScalarQuantityShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 2) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 2 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 1) && config_matrix(i, 1) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key >= config_mat(i, 0)) {
+      return config_mat(i, 1);
+    }
+  }
+  return config_mat(last_row_index, 1);
+}
+
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size) {
+  const Tindices next_multiple_of_bucket_size =
+      (value + bucket_size - 1) / bucket_size * bucket_size;
+  return next_multiple_of_bucket_size - (bucket_size - 1);
+}
+
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size) {
+  if (bucket_size == 1) {
+    return 1;
+  }
+  return std::pow(bucket_size, std::floor(std::log(bucket_size * (value - 1)) /
+                                          std::log(bucket_size)) -
+                                   1) +
+         1;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat,                         \
+      const std::pair<TypeIndex, TypeIndex>& key);                          \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat, const TypeIndex key);   \
+  template int64 FindConfigValueForKey<int64, TypeIndex>(                   \
+      const TTypes<int64>::ConstMatrix& config_mat, const TypeIndex key);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+template int32 GetLinearBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetLinearBucket(const int64 value, const int64 bucket_size);
+
+template int32 GetPowerBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetPowerBucket(const int64 value, const int64 bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_flag_utils.h b/tensorflow/core/kernels/tensor_flag_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab59eecc2560bc2a590471d994437133896d0e6f
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.h
@@ -0,0 +1,78 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for parsing tensors as runtime flags.
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns Status::OK() if and only if config is a float scalar or a matrix with
+// dimensions M x 3. If config is a scalar then config must be in the range
+// [0, 1.0). If config is a matrix then config must have shape M x 3, all of
+// its entries must be positive, and entries in the last column may not
+// exceed 1.0. If config is a matrix then it may not be empty.
+Status ValidateSparseMatrixShardingConfig(const Tensor& config);
+
+// Returns Status::OK() if and only if config is a float scalar or a non-empty
+// matrix with dimensions M x 2.
+Status ValidateScalarQuantityShardingConfig(const Tensor& config);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key);
+
+// Returns largest multiple of bucket_size less than value.
+// Expects 1 <= bucket_size <= value.
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size);
+
+// Returns the largest power of bucket_size less than value.
+// Expects 1 <= bucket_size <= value. If bucket_size = 1, returns 1.
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
diff --git a/tensorflow/core/kernels/tensor_flag_utils_test.cc b/tensorflow/core/kernels/tensor_flag_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23ccc7ad7a16bb9a9cdac4c53f1a3252ae29ed6c
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils_test.cc
@@ -0,0 +1,322 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::error::INVALID_ARGUMENT;
+using tensorflow::tensor_flag_utils::FindConfigValueForKey;
+using tensorflow::tensor_flag_utils::GetLinearBucket;
+using tensorflow::tensor_flag_utils::GetPowerBucket;
+using tensorflow::tensor_flag_utils::ValidateScalarQuantityShardingConfig;
+using tensorflow::tensor_flag_utils::ValidateSparseMatrixShardingConfig;
+
+TEST(SparseUtilsTest, ValidateSparseMatrixShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 50.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, -40.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtilsTest, ValidateScalarQuantityShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {-40.0, 0.41, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtils, FindConfigValueForKey) {
+  {
+    float data[] = {60.0, 50.0, 0.41, 30.0, 20.0, 0.1, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 1, 3);
+    auto val = FindConfigValueForKey<float, int64>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 50.0, 0.41, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 2, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 0.41, 50.0, 0.14, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 2);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, 70);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 60);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 55);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 50);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 20);
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 30);
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+}
+
+TEST(SparseUtils, GetLinearBucket) {
+  EXPECT_EQ(11, GetLinearBucket(11, 5));
+  EXPECT_EQ(11, GetLinearBucket(12, 5));
+  EXPECT_EQ(1, GetLinearBucket(4ll, 5ll));
+}
+
+TEST(SparseUtils, GetPowerBucket) {
+  EXPECT_EQ(6, GetPowerBucket(11, 5));
+  EXPECT_EQ(6, GetPowerBucket(12, 5));
+  EXPECT_EQ(1332, GetPowerBucket(1335, 11));
+  EXPECT_EQ(5, GetPowerBucket(5ll, 4ll));
+  EXPECT_EQ(1, GetPowerBucket(4ll, 1ll));
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/tile_functor_gpu.h b/tensorflow/core/kernels/tile_functor_gpu.h
index 0de32e730ed858ccc3dfcbacb65a7cf922aa5ce2..59bc2d3a0089f294d8c968a3bd15514060aef8ca 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.h
+++ b/tensorflow/core/kernels/tile_functor_gpu.h
@@ -75,9 +75,10 @@ void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
   const T* p = in.flat<T>().data();
   T* q = out->flat<T>().data();
   CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
-  TileKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
-      ndims, q);
+  TF_CHECK_OK(
+      CudaLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
+                       d.stream(), cfg.virtual_thread_count, p,
+                       reinterpret_cast<const int32*>(dev_buf), ndims, q));
   // Safe to deallocate immediately after the kernel launch.
   d.deallocate(dev_buf);
 }
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 7fdce6cb7190ffa5f799853e27d18b9e33f2971a..2f6fffed2fd9f2430bcadda7f8cf28e0753da055 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -82,8 +82,8 @@ class TopK : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(1, output_shape, &indices_out));
 
-    // Nothing to do for top-nothing.
-    if (k == 0) return;
+    // Nothing to do for top-nothing or over nothing.
+    if (k == 0 || num_rows == 0) return;
 
     auto values = values_out->flat_inner_dims<T>();
     auto indices = indices_out->flat_inner_dims<int32>();
diff --git a/tensorflow/core/kernels/topk_op_gpu.h b/tensorflow/core/kernels/topk_op_gpu.h
index 6f3bec20f6919e3257fd823699afd23e3ccc0653..1bcc0221b87d1d320e0a3ee606b9045f358bb215 100644
--- a/tensorflow/core/kernels/topk_op_gpu.h
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@@ -401,8 +401,9 @@ cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
   // We are limited by the amount of shared memory we have per block.
   auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
 
-  TopKKernel<<<batch_size, num_shards, shared_memory_size, stream>>>(
-      input, length, k, sorted, output, indices);
+  TF_CHECK_OK(CudaLaunchKernel(TopKKernel<T>, batch_size, num_shards,
+                               shared_memory_size, stream, input, length, k,
+                               sorted, output, indices));
   return cudaGetLastError();
 }
 
@@ -412,7 +413,7 @@ struct SegmentOffsetCreator {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
     return idx * num_cols_;
-  };
+  }
 
   int num_cols_;
 };
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index b2239ab5c39fea33fc70b6aaf170d456cd1ba3fe..5594c998dd1f69e597c31b800bde55a8b7f63e53 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
diff --git a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
index d6a237d6c183cbacf2b5bbbd5f5e9034e84c73af..ff3972f1ff28eab0b219323348206c2b9bcc5635 100644
--- a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
@@ -80,10 +80,10 @@ void TransposeSimple(const GPUDevice& d, const Tensor& in,
   const T* p = reinterpret_cast<const T*>(in.tensor_data().data());
   T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data())));
   CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
-  TransposeKernel<T, conjugate>
-      <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-          cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
-          ndims, q);
+  TF_CHECK_OK(CudaLaunchKernel(
+      TransposeKernel<T, conjugate>, cfg.block_count, cfg.thread_per_block, 0,
+      d.stream(), cfg.virtual_thread_count, p,
+      reinterpret_cast<const int32*>(dev_buf), ndims, q));
   // Safe to deallocate immediately after the kernel launch.
   d.deallocate(dev_buf);
 }
@@ -168,60 +168,29 @@ struct TransposeUsingTile<complex128, conjugate> {
 }  // namespace internal
 
 // Transpose kernel specialized for GPU Device.
+#define HANDLE_DIM(DIM)                                                      \
+  case DIM:                                                                  \
+    internal::TransposeUsingEigen<GPUDevice, T, DIM>(d, in, perm, conjugate, \
+                                                     out);                   \
+    break
+
 template <typename T, bool conjugate>
 struct Transpose<GPUDevice, T, conjugate> {
   static void run(const GPUDevice& d, const Tensor& in,
                   const gtl::ArraySlice<int32> perm, Tensor* out) {
+    if (in.dims() < 2) return;
+    if (internal::TransposeUsingTile<T, conjugate>::run(d, in, perm, out)) {
+      return;
+    }
+
     switch (in.dims()) {
-      case 2:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 2>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 3:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 3>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 4:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 4>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 5:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 5>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 6:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 6>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 7:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 7>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
-      case 8:
-        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
-                                                             out)) {
-          internal::TransposeUsingEigen<GPUDevice, T, 8>(d, in, perm, conjugate,
-                                                         out);
-        }
-        break;
+      HANDLE_DIM(2);
+      HANDLE_DIM(3);
+      HANDLE_DIM(4);
+      HANDLE_DIM(5);
+      HANDLE_DIM(6);
+      HANDLE_DIM(7);
+      HANDLE_DIM(8);
       default:
         internal::TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
@@ -229,6 +198,8 @@ struct Transpose<GPUDevice, T, conjugate> {
   }
 };
 
+#undef HANDLE_DIM
+
 template <bool conjugate>
 struct Transpose<GPUDevice, string, conjugate> {
   static void run(const GPUDevice& d, const Tensor& in,
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 48e392c07073a9adf989fc2171222e966aede0f6..1c0d70c333f8bbef08e9a37e06694ec5ff19b20d 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/transpose_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op.cc b/tensorflow/core/kernels/tridiagonal_solve_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5884ffedfbc7b25f59e3c67da4af486ef6239c48
--- /dev/null
+++ b/tensorflow/core/kernels/tridiagonal_solve_op.cc
@@ -0,0 +1,163 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+static const char kErrMsg[] = "The matrix is not invertible.";
+
+template <class Scalar>
+class TridiagonalSolveOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit TridiagonalSolveOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    auto num_inputs = input_matrix_shapes.size();
+    OP_REQUIRES(context, num_inputs == 2,
+                errors::InvalidArgument("Expected two input matrices, got ",
+                                        num_inputs, "."));
+
+    auto num_diags = input_matrix_shapes[0].dim_size(0);
+    OP_REQUIRES(
+        context, num_diags == 3,
+        errors::InvalidArgument("Expected diagonals to be provided as a "
+                                "matrix with 3 rows, got ",
+                                num_diags, " rows."));
+
+    auto num_eqs_left = input_matrix_shapes[0].dim_size(1);
+    auto num_eqs_right = input_matrix_shapes[1].dim_size(0);
+    OP_REQUIRES(
+        context, num_eqs_left == num_eqs_right,
+        errors::InvalidArgument("Expected the same number of left-hand sides "
+                                "and right-hand sides, got ",
+                                num_eqs_left, " and ", num_eqs_right, "."));
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    return TensorShapes({input_matrix_shapes[1]});
+  }
+
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    const int num_eqs = static_cast<int>(input_matrix_shapes[0].dim_size(1));
+    const int num_rhss = static_cast<int>(input_matrix_shapes[1].dim_size(0));
+
+    const double add_cost = Eigen::TensorOpCost::AddCost<Scalar>();
+    const double mult_cost = Eigen::TensorOpCost::MulCost<Scalar>();
+    const double div_cost = Eigen::TensorOpCost::DivCost<Scalar>();
+
+    // Assuming cases with and without row interchange are equiprobable.
+    const double cost =
+        num_eqs * (div_cost * (num_rhss + 1) +
+                   (add_cost + mult_cost) * (2.5 * num_rhss + 1.5));
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const auto diagonals = inputs[0];
+
+    // Subdiagonal elements, first is ignored.
+    const auto& superdiag = diagonals.row(0);
+    // Diagonal elements.
+    const auto& diag = diagonals.row(1);
+    // Superdiagonal elements, n-th is ignored.
+    const auto& subdiag = diagonals.row(2);
+    // Right-hand sides (transposed - necessary for GPU impl).
+    const auto& rhs = inputs[1];
+
+    const int n = diag.size();
+    MatrixMap& x = outputs->at(0);
+    const Scalar zero(0);
+
+    if (n == 0) {
+      return;
+    }
+    if (n == 1) {
+      OP_REQUIRES(context, diag(0) != zero, errors::InvalidArgument(kErrMsg));
+      x.row(0) = rhs.row(0) / diag(0);
+      return;
+    }
+
+    // The three columns in u are the diagonal, superdiagonal, and second
+    // superdiagonal, respectively, of the U matrix in the LU decomposition of
+    // the input matrix (subject to row exchanges due to pivoting). For pivoted
+    // tridiagonal matrix, the U matrix has at most two non-zero superdiagonals.
+    Eigen::Array<Scalar, Eigen::Dynamic, 3> u(n, 3);
+
+    // The code below roughly follows LAPACK's dgtsv routine, with main
+    // difference being not overwriting the input.
+    u(0, 0) = diag(0);
+    u(0, 1) = superdiag(0);
+    x.row(0) = rhs.row(0);
+    for (int i = 0; i < n - 1; ++i) {
+      if (std::abs(u(i)) >= std::abs(subdiag(i + 1))) {
+        // No row interchange.
+        OP_REQUIRES(context, u(i) != zero, errors::InvalidArgument(kErrMsg));
+        const Scalar factor = subdiag(i + 1) / u(i, 0);
+        u(i + 1, 0) = diag(i + 1) - factor * u(i, 1);
+        x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
+        if (i != n - 2) {
+          u(i + 1, 1) = superdiag(i + 1);
+          u(i, 2) = 0;
+        }
+      } else {
+        // Interchange rows i and i + 1.
+        const Scalar factor = u(i, 0) / subdiag(i + 1);
+        u(i, 0) = subdiag(i + 1);
+        u(i + 1, 0) = u(i, 1) - factor * diag(i + 1);
+        u(i, 1) = diag(i + 1);
+        x.row(i + 1) = x.row(i) - factor * rhs.row(i + 1);
+        x.row(i) = rhs.row(i + 1);
+        if (i != n - 2) {
+          u(i, 2) = superdiag(i + 1);
+          u(i + 1, 1) = -factor * superdiag(i + 1);
+        }
+      }
+    }
+    x.row(n - 1) /= u(n - 1, 0);
+    x.row(n - 2) = (x.row(n - 2) - u(n - 2, 1) * x.row(n - 1)) / u(n - 2, 0);
+    for (int i = n - 3; i >= 0; --i) {
+      x.row(i) = (x.row(i) - u(i, 1) * x.row(i + 1) - u(i, 2) * x.row(i + 2)) /
+                 u(i, 0);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+};
+
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<float>), float);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<double>),
+                       double);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex64>),
+                       complex64);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex128>),
+                       complex128);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index c9c2ac1e69c431957b3db60f10e598b102ba9ebe..c071db606485dbf5747c8695e299da69095c4de3 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "unicode/unistr.h"  // TF:icu
 #include "unicode/uset.h"  // TF:icu
 #include "unicode/utypes.h"  // TF:icu
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 3bdcfc90b878479572ad144bc82e9dc6763a4abf..adf84bae49cf7f70577e8b22390527c6b276a170 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 1e1647db5c1c41d6242cab87b0d8a8cf66d32a28..4690609279532e6f616b15ee80177dd2b7e5836e 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -144,6 +144,8 @@ TF_CALL_ALL_TYPES(REGISTER_UNPACK);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
+TF_CALL_uint8(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index aa65fa6b637e3077c456b3c724effc759c26c7dd..374257d1766a04feb52fcdb07bae4cfccfc537ed 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include <memory>
 #include <numeric>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 2255597651ffa17cb21650dfad28c24f15b36fc9..da9434efcb0651200d082a690aee0bb365bae829 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -25,9 +25,9 @@ limitations under the License.
 #include "third_party/cub/device/device_select.cuh"
 #include "third_party/cub/iterator/counting_input_iterator.cuh"
 #include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/where_op.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -324,9 +324,10 @@ struct Where<GPUDevice, NDIM, T, TIndex> {
         CalculateStrides<TIndex, T, NDIM>(input);
     const TIndex output_rows = output.dimension(0);
     CudaLaunchConfig config = GetCudaLaunchConfig(output_rows, d);
-    PropagateWhereIndicesKernel<NDIM, TIndex>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            output_rows, strides, output.data());
+    TF_CHECK_OK(CudaLaunchKernel(PropagateWhereIndicesKernel<NDIM, TIndex>,
+                                 config.block_count, config.thread_per_block, 0,
+                                 d.stream(), output_rows, strides,
+                                 output.data()));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 9a3612bd72cdc2bc1c3c471beed6616816072a71..58d905c3547a0d0aee918a03754b308804bb2048 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -134,7 +134,7 @@ TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<Eigen::half>("T"),
@@ -147,7 +147,7 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<double>("T"),
                         SoftmaxXentWithLogitsOp<GPUDevice, double>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc
index 2c0c0b3a027e28c9502b162aa491dac83fea5fdd..6c7a9d7ba0ad70c909dc2cc289d7e97cbc03a7fd 100644
--- a/tensorflow/core/kernels/xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -54,4 +54,4 @@ template struct functor::XentFunctor<GPUDevice, double>;
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.cc b/tensorflow/core/lib/bfloat16/bfloat16.cc
index a591717fd1abfc3d959d219d9ce2bde1272fd8ea..e6e24bc078668b9290f41ce501cea8de2d423779 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.cc
+++ b/tensorflow/core/lib/bfloat16/bfloat16.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+const uint16_t bfloat16::NAN_VALUE;
+const uint16_t bfloat16::ZERO_VALUE;
+
 B16_DEVICE_FUNC bfloat16::operator Eigen::half() const {
   return static_cast<Eigen::half>(float(*this));
 }
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 440854658094c3be0ad113ef01d4814f9f45ca06..1294ccff2676e0cf33585ba4518002457c37e93f 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -372,6 +372,14 @@ struct bfloat16 {
     return x;
   }
 
+  static bfloat16 min_positive_normal() {
+    bfloat16 x;
+    x.value = 0x0080;  // 0x1p-126
+    return x;
+  }
+
+  bool IsZero() const { return (value & 0x7FFF) == ZERO_VALUE; }
+
   uint16_t value;
 
   // A value that represents "not a number".
diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc
index 50872eef83a591718468173b5940d46079924638..4c33dfa211eaa8296057ae183ffcb03b78b57975 100644
--- a/tensorflow/core/lib/core/coding.cc
+++ b/tensorflow/core/lib/core/coding.cc
@@ -133,6 +133,17 @@ int VarintLength(uint64_t v) {
   return len;
 }
 
+const char* GetVarint32Ptr(const char* p, const char* limit, uint32* value) {
+  if (p < limit) {
+    uint32 result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
 const char* GetVarint32PtrFallback(const char* p, const char* limit,
                                    uint32* value) {
   uint32 result = 0;
diff --git a/tensorflow/core/lib/core/coding.h b/tensorflow/core/lib/core/coding.h
index 4a70ffa619071a8c074b0000456a6a2bfb99f021..bfab80dd0076355f3e93be8200e7d29f8938a6ca 100644
--- a/tensorflow/core/lib/core/coding.h
+++ b/tensorflow/core/lib/core/coding.h
@@ -55,18 +55,8 @@ extern const char* GetVarint64Ptr(const char* p, const char* limit, uint64* v);
 // Internal routine for use by fallback path of GetVarint32Ptr
 extern const char* GetVarint32PtrFallback(const char* p, const char* limit,
                                           uint32* value);
-inline const char* GetVarint32Ptr(const char* p, const char* limit,
-                                  uint32* value) {
-  if (p < limit) {
-    uint32 result = *(reinterpret_cast<const unsigned char*>(p));
-    if ((result & 128) == 0) {
-      *value = result;
-      return p + 1;
-    }
-  }
-  return GetVarint32PtrFallback(p, limit, value);
-}
-
+extern const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32* value);
 extern char* EncodeVarint32(char* dst, uint32 v);
 extern char* EncodeVarint64(char* dst, uint64 v);
 
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 24aca854eb6cf8c98f49a5a1bcc3e96ed6d8d1a1..dc5406920a4b8e624fb104f53108cd456f467d76 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -140,6 +140,10 @@ uint8* Decode(const void* srcdata, int datasize,
     ColorMapObject* color_map = this_image->ImageDesc.ColorMap
                                     ? this_image->ImageDesc.ColorMap
                                     : gif_file->SColorMap;
+    if (color_map == nullptr) {
+      *error_string = strings::StrCat("missing color map for frame ", k);
+      return nullptr;
+    }
 
     for (int i = imgTop; i < imgBottom; ++i) {
       uint8* p_dst = this_dst + i * width * channel;
@@ -147,6 +151,14 @@ uint8* Decode(const void* srcdata, int datasize,
         GifByteType color_index =
             this_image->RasterBits[(i - img_desc->Top) * (img_desc->Width) +
                                    (j - img_desc->Left)];
+
+        if (color_index >= color_map->ColorCount) {
+          *error_string = strings::StrCat("found color index ", color_index,
+                                          " outside of color map range ",
+                                          color_map->ColorCount);
+          return nullptr;
+        }
+
         const GifColorType& gif_color = color_map->Colors[color_index];
         p_dst[j * channel + 0] = gif_color.Red;
         p_dst[j * channel + 1] = gif_color.Green;
diff --git a/tensorflow/core/lib/gtl/map_util_test.cc b/tensorflow/core/lib/gtl/map_util_test.cc
index e19459c091e2a512202a6eeaae0a0f55425e0246..bcb1a4a6cf8fad7ddc36e0e8985e98625369dd37 100644
--- a/tensorflow/core/lib/gtl/map_util_test.cc
+++ b/tensorflow/core/lib/gtl/map_util_test.cc
@@ -34,7 +34,6 @@ TEST(MapUtil, Find) {
   m["foo"] = "bar";
   EXPECT_EQ("bar", gtl::FindWithDefault(m, "foo", ""));
   EXPECT_EQ("bar", *gtl::FindOrNull(m, "foo"));
-  string str;
   EXPECT_TRUE(m.count("foo") > 0);
   EXPECT_EQ(m["foo"], "bar");
 }
diff --git a/tensorflow/core/lib/io/buffered_inputstream_test.cc b/tensorflow/core/lib/io/buffered_inputstream_test.cc
index 49b2b1a861ab2d18f23f80715f12d9182f0190c8..ad4c8013bc2e80c749fb71e8b5b141e852f9760f 100644
--- a/tensorflow/core/lib/io/buffered_inputstream_test.cc
+++ b/tensorflow/core/lib/io/buffered_inputstream_test.cc
@@ -337,7 +337,6 @@ TEST(BufferedInputStream, ReadAll_Empty) {
 
   for (auto buf_size : BufferSizes()) {
     RandomAccessInputStream input_stream(file.get());
-    string read;
     BufferedInputStream in(&input_stream, buf_size);
     string contents;
     TF_ASSERT_OK(in.ReadAll(&contents));
@@ -355,7 +354,6 @@ TEST(BufferedInputStream, ReadAll_Text) {
 
   for (auto buf_size : BufferSizes()) {
     RandomAccessInputStream input_stream(file.get());
-    string read;
     BufferedInputStream in(&input_stream, buf_size);
     string contents;
     TF_ASSERT_OK(in.ReadAll(&contents));
diff --git a/tensorflow/core/lib/io/table.cc b/tensorflow/core/lib/io/table.cc
index 1ef7bb6ccda5eb9bb176aefb62fe2f6f1898dc1f..1e68493bfe94a713aa85b68a02d61c9a2ed2d94b 100644
--- a/tensorflow/core/lib/io/table.cc
+++ b/tensorflow/core/lib/io/table.cc
@@ -133,7 +133,6 @@ Status Table::InternalGet(const StringPiece& k, void* arg,
   Iterator* iiter = rep_->index_block->NewIterator();
   iiter->Seek(k);
   if (iiter->Valid()) {
-    BlockHandle handle;
     Iterator* block_iter = BlockReader(this, iiter->value());
     block_iter->Seek(k);
     if (block_iter->Valid()) {
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index 9e7d1e64108c40b6827a3fc2cd3513d148334e6d..a21b440318e9da097b20df27bd3ababf902d2927 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -81,6 +81,12 @@ bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width,
          flags.crop_x + flags.crop_width <= input_image_width;
 }
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+// If in fuzzing mode, don't print any error message as that slows down fuzzing.
+// See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+void no_print(j_common_ptr cinfo) {}
+#endif
+
 uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   // unpack the argball
   const int datasize = argball->datasize_;
@@ -112,9 +118,14 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
   cinfo.err = jpeg_std_error(&jerr);
+  jerr.error_exit = CatchError;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  jerr.output_message = no_print;
+#endif
+
   jmp_buf jpeg_jmpbuf;
   cinfo.client_data = &jpeg_jmpbuf;
-  jerr.error_exit = CatchError;
   if (setjmp(jpeg_jmpbuf)) {
     delete[] tempdata;
     return nullptr;
@@ -398,7 +409,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
       }
       break;
     default:
-      // will never happen, should be catched by the previous switch
+      // will never happen, should be caught by the previous switch
       LOG(ERROR) << "Invalid components value " << components << std::endl;
       jpeg_destroy_decompress(&cinfo);
       return nullptr;
diff --git a/tensorflow/core/lib/monitoring/collection_registry_test.cc b/tensorflow/core/lib/monitoring/collection_registry_test.cc
index ca25f508da9635f02941c99c768947927fd97493..ce87e4dcae65e5a48074e00a6f49f79c1dc76c61 100644
--- a/tensorflow/core/lib/monitoring/collection_registry_test.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry_test.cc
@@ -81,14 +81,6 @@ TEST(CollectionRegistryDeathTest, DuplicateRegistration) {
       "/tensorflow/metric");
 }
 
-TEST(CollectMetricsTest, NoMetrics) {
-  auto* collection_registry = CollectionRegistry::Default();
-  const std::unique_ptr<CollectedMetrics> collected_metrics =
-      collection_registry->CollectMetrics({});
-  EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
-  EXPECT_EQ(0, collected_metrics->point_set_map.size());
-}
-
 TEST(CollectMetricsTest, Counter) {
   auto counter_with_labels = std::unique_ptr<Counter<2>>(
       Counter<2>::New("/tensorflow/test/counter_with_labels",
@@ -111,7 +103,7 @@ TEST(CollectMetricsTest, Counter) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/counter_with_labels");
@@ -134,7 +126,7 @@ TEST(CollectMetricsTest, Counter) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/counter_with_labels");
@@ -201,7 +193,7 @@ TEST(CollectMetricsTest, Gauge) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/string_gauge_with_labels");
@@ -224,7 +216,7 @@ TEST(CollectMetricsTest, Gauge) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/string_gauge_with_labels");
@@ -307,7 +299,7 @@ TEST(CollectMetricsTest, Sampler) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/sampler_with_labels");
@@ -330,7 +322,7 @@ TEST(CollectMetricsTest, Sampler) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/sampler_with_labels");
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index ede9f4d390180501bd65c3cbfe301da86d7530a6..5334394d35243686d6253d0fc6f014556c456483 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -396,7 +396,6 @@ void BM_WriteNum(int n, T multiplier) {
 
 template <typename T>
 void BM_ReadNum(int n, T multiplier) {
-  string x;
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
   // Use enough distinct values to confuse the branch predictor
diff --git a/tensorflow/core/lib/strings/proto_serialization.cc b/tensorflow/core/lib/strings/proto_serialization.cc
index 5c1fbda2155492c00049f52ce12ae8da665cbda0..2341d3e341d72fe8c385f3abd441dc7c692d9759 100644
--- a/tensorflow/core/lib/strings/proto_serialization.cc
+++ b/tensorflow/core/lib/strings/proto_serialization.cc
@@ -14,20 +14,65 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 
+#include <cstring>
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
+namespace {
+static const int kInlinedBufferSize = 256;
+}  // namespace
 
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result) {
-  DCHECK_LE(msg.ByteSizeLong(), static_cast<size_t>(INT_MAX));
-  const int size = static_cast<int>(msg.ByteSizeLong());
+  const size_t size = msg.ByteSizeLong();
+  DCHECK_LE(size, static_cast<size_t>(INT_MAX));
   *result = string(size, '\0');
-  protobuf::io::ArrayOutputStream array_stream(&(*result)[0], size);
+  return SerializeToBufferDeterministic(msg, const_cast<char*>(result->data()),
+                                        result->size());
+}
+
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size) {
+  DCHECK(msg.ByteSizeLong() == size && size <= static_cast<size_t>(INT_MAX));
+  protobuf::io::ArrayOutputStream array_stream(buffer, size);
   protobuf::io::CodedOutputStream output_stream(&array_stream);
   output_stream.SetSerializationDeterministic(true);
   msg.SerializeWithCachedSizes(&output_stream);
   return !output_stream.HadError() && size == output_stream.ByteCount();
 }
 
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y) {
+  const size_t size = x.ByteSizeLong();
+  if (size != y.ByteSizeLong()) return false;
+  if (size == 0) return true;
+  gtl::InlinedVector<char, kInlinedBufferSize> x_serialized(size);
+  bool success_x = SerializeToBufferDeterministic(x, x_serialized.data(), size);
+  DCHECK(success_x);
+  gtl::InlinedVector<char, kInlinedBufferSize> y_serialized(size);
+  bool success_y = SerializeToBufferDeterministic(y, y_serialized.data(), size);
+  DCHECK(success_y);
+  return memcmp(x_serialized.data(), y_serialized.data(), size) == 0;
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size, seed);
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/proto_serialization.h b/tensorflow/core/lib/strings/proto_serialization.h
index 6664928e2818c747268ec1c361acce6bcf6c862e..763bd68c1bf8b8cc709d5a01a308550ffefeb743 100644
--- a/tensorflow/core/lib/strings/proto_serialization.h
+++ b/tensorflow/core/lib/strings/proto_serialization.h
@@ -28,6 +28,21 @@ namespace tensorflow {
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result);
 
+// As above, but takes a pre-allocated buffer wrapped by result.
+// PRECONDITION: size == msg.ByteSizeLong() && size <= INT_MAX.
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size);
+
+// Returns true if serializing x and y using
+// SerializeToBufferDeterministic() yields identical strings.
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y);
+
+// Computes Hash64 of the output of SerializeToBufferDeterministic().
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto);
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
diff --git a/tensorflow/core/lib/strings/proto_serialization_test.cc b/tensorflow/core/lib/strings/proto_serialization_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81a6f08ae9bf668951103c3f45d5efac527a8a94
--- /dev/null
+++ b/tensorflow/core/lib/strings/proto_serialization_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+
+#include <string>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+GraphDef MakeGraphDef(int num_nodes) {
+  GraphDef graph_def;
+  for (int i = 0; i < num_nodes; ++i) {
+    NodeDef* node = graph_def.add_node();
+    node->set_name(strings::StrCat("node", i));
+    node->set_op(strings::StrCat("op", i % 10));
+    (*node->mutable_attr())["foo"].set_f(3.14f);
+    (*node->mutable_attr())["bar"].set_s("baz");
+  }
+  return graph_def;
+}
+}  // namespace
+
+static void BM_ProtoSerializationToString(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    string serialized;
+    testing::DoNotOptimize(
+        SerializeToStringDeterministic(graph_def, &serialized));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToString)->Range(1, 10000);
+
+static void BM_ProtoSerializationToBuffer(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  const size_t size = graph_def.ByteSizeLong();
+  for (int i = 0; i < iters; ++i) {
+    gtl::InlinedVector<char, 1024> buf(size);
+    testing::DoNotOptimize(
+        SerializeToBufferDeterministic(graph_def, buf.data(), size));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToBuffer)->Range(1, 10000);
+
+static void BM_DeterministicProtoHash64(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(DeterministicProtoHash64(graph_def));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_DeterministicProtoHash64)->Range(1, 10000);
+
+static void BM_AreSerializedProtosEqual(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def_a = MakeGraphDef(num_nodes);
+  GraphDef graph_def_b = MakeGraphDef(num_nodes);
+  graph_def_b.mutable_node(0)->mutable_name()[0] = 'l';
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(AreSerializedProtosEqual(graph_def_a, graph_def_a));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_AreSerializedProtosEqual)->Range(1, 10000);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 3aba5ec80eff94970636d8e6afb8985f23ea3e3c..7584f6a239130f00c328570a29a3a4c12c5aaa72 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <ctype.h>
 #include <algorithm>
+#include <cstring>
 #include <vector>
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 4be33b2a0cf10a2525f9a93b5d4942b381d92629..a19e1af888405aa1de9e9a4ca519b895c369cfdf 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -20,9 +20,11 @@ cc_library(
     name = "nccl_lib",
     srcs = if_cuda([
         "nccl_manager.cc",
-        "nccl_manager.h",
         "nccl_rewrite.cc",
     ]),
+    hdrs = if_cuda([
+        "nccl_manager.h",
+    ]),
     copts = tf_copts(),
     deps = if_cuda([
         "@local_config_nccl//:nccl",
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index 545284331d7234554fc2fee5543dcf3552c19306..a0b602f301c976acca2c5887de0452210f15acd7 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -63,7 +63,7 @@ struct NcclManager::NcclStream {
   std::unique_ptr<Thread> thread;
   mutex mu;
   condition_variable cv;
-  // Has collective,rank pairs.
+  // Has collective,participant_idx pairs.
   std::deque<std::pair<Collective*, int>> pending_launches_ GUARDED_BY(mu);
   bool shutdown_requested GUARDED_BY(mu) = false;
 };
@@ -312,8 +312,10 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
   CUDA_RETURN_IF_ERROR(cudaGetDevice(&saved_device));
   NCCL_RETURN_IF_ERROR(ncclGroupStart());
   for (int i = 0; i < collective->num_local_devices; ++i) {
-    const int rank =
-        collective->single_node ? i : collective->participants[i]->global_rank;
+    // Set rank to `participant->global_rank` if provided, else `i`.
+    const int rank = collective->participants[i]->global_rank >= 0
+                         ? collective->participants[i]->global_rank
+                         : i;
     CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[i]));
     NCCL_RETURN_IF_ERROR(ncclCommInitRank(
         nccl_comms.data() + i, collective->num_global_devices, nccl_id, rank));
@@ -344,6 +346,12 @@ void NcclManager::AddToAllReduce(std::unique_ptr<Participant> participant,
   AddParticipant(std::move(participant), context, kAllReduce, reduction_op);
 }
 
+void NcclManager::AddToAllGather(std::unique_ptr<Participant> participant,
+                                 const Context& context) {
+  AddParticipant(std::move(participant), context, kAllGather,
+                 ncclSum /* unused */);
+}
+
 void NcclManager::AddBroadcastSend(std::unique_ptr<Participant> participant,
                                    const Context& context) {
   participant->root = true;
@@ -492,13 +500,11 @@ void NcclManager::RunCollective(Collective* collective) {
     return;
   }
 
-  for (int local_rank = 0; local_rank < collective->num_local_devices;
-       ++local_rank) {
-    Participant* p = collective->participants[local_rank].get();
-    NcclStream* nccl_stream =
-        collective->communicator->members[local_rank].nccl_stream;
+  for (int i = 0; i < collective->num_local_devices; ++i) {
+    Participant* p = collective->participants[i].get();
+    NcclStream* nccl_stream = collective->communicator->members[i].nccl_stream;
     CHECK(nccl_stream != nullptr);
-    const int rank = collective->single_node ? local_rank : p->global_rank;
+    const int rank = p->global_rank >= 0 ? p->global_rank : i;
 
     if (p->input != nullptr) {
       // Wait to ensure that the kernel that produces the data in the input
@@ -522,13 +528,11 @@ void NcclManager::RunCollective(Collective* collective) {
     // Note that it would be possible to run multiple collectives at once, if
     // they have non-intersecting sets of devices.
     mutex_lock l(collective_mu);
-    for (int local_rank = 0; local_rank < collective->num_local_devices;
-         ++local_rank) {
+    for (int i = 0; i < collective->num_local_devices; ++i) {
       NcclStream* nccl_stream =
-          collective->communicator->members[local_rank].nccl_stream;
+          collective->communicator->members[i].nccl_stream;
       mutex_lock l(nccl_stream->mu);
-      nccl_stream->pending_launches_.push_front(
-          std::make_pair(collective, local_rank));
+      nccl_stream->pending_launches_.push_front(std::make_pair(collective, i));
       nccl_stream->cv.notify_all();
     }
   }
@@ -555,23 +559,22 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
       next_launch = nccl_stream->pending_launches_.back();
       nccl_stream->pending_launches_.pop_back();
     }
-    Collective* collective = next_launch.first;
-    int local_rank = next_launch.second;
 
     // Launch the nccl kernel.
+    Collective* collective = next_launch.first;
     ncclDataType_t data_type = ToNcclType(collective->data_type);
-    Participant* p = collective->participants[local_rank].get();
-
-    auto nccl_comm = collective->communicator->members[local_rank].nccl_comm;
+    int p_idx = next_launch.second;
+    Participant* p = collective->participants[p_idx].get();
+    auto nccl_comm = collective->communicator->members[p_idx].nccl_comm;
     ncclResult_t nccl_result = ncclSuccess;
     switch (collective->type) {
       case kAllReduce: {
         const void* sendbuff = p->input->tensor_data().data();
         void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
 
-        VLOG(2) << "call NcclAllReduce participant " << local_rank
-                << " sendbuff " << sendbuff << " recvbuff " << recvbuff
-                << " nccl_comm " << nccl_comm << " comm_stream " << comm_stream
+        VLOG(2) << "call NcclAllReduce participant " << p_idx << " sendbuff "
+                << sendbuff << " recvbuff " << recvbuff << " nccl_comm "
+                << nccl_comm << " comm_stream " << comm_stream
                 << " cuda_stream " << cu_stream;
         nccl_result = ncclAllReduce(sendbuff, recvbuff, p->input->NumElements(),
                                     data_type, collective->reduction_op,
@@ -595,16 +598,30 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
                                  collective->root_rank, nccl_comm, *cu_stream);
         break;
       }
+      case kAllGather: {
+        const void* sendbuff = p->input->tensor_data().data();
+        void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
+
+        VLOG(2) << "call NcclAllGather participant " << p_idx << " sendbuff "
+                << sendbuff << " sendcount " << p->input->NumElements()
+                << " recvbuff " << recvbuff << " recvcount "
+                << p->output->NumElements() << " nccl_comm " << nccl_comm
+                << " comm_stream " << comm_stream << " cuda_stream "
+                << cu_stream;
+        nccl_result = ncclAllGather(sendbuff, recvbuff, p->input->NumElements(),
+                                    data_type, nccl_comm, *cu_stream);
+        break;
+      }
     }
 
     // Run the done_callback when the nccl kernel finishes running.
-    auto done_callback = [collective, local_rank, nccl_result]() {
+    auto done_callback = [collective, p_idx, nccl_result]() {
       if (nccl_result == ncclSuccess) {
-        collective->participants[local_rank]->done_callback(Status::OK());
+        collective->participants[p_idx]->done_callback(Status::OK());
       } else {
         // Propagate the error, but note that if other members of the collective
         // did launch their kernels, then they are hanging.
-        collective->participants[local_rank]->done_callback(errors::Unknown(
+        collective->participants[p_idx]->done_callback(errors::Unknown(
             "Error invoking NCCL: ", ncclGetErrorString(nccl_result)));
       }
 
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 6ac5deb91a4a762d7c31d8cb501efd191906790a..f2f15f8ec64c23678013e8d479fc6248f5929687 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -143,6 +143,10 @@ class NcclManager {
   void AddToAllReduce(std::unique_ptr<Participant> participant,
                       const Context& context, ncclRedOp_t reduction_op);
 
+  // Adds one participant to an all-gather.
+  void AddToAllGather(std::unique_ptr<Participant> participant,
+                      const Context& context);
+
   // AddBroadcastSend and AddBroadcastRecv combine to send data from one sender
   // to all receivers.
   void AddBroadcastSend(std::unique_ptr<Participant> participant,
@@ -170,6 +174,7 @@ class NcclManager {
     kAllReduce = 1,
     kBroadcast = 2,
     kReduce = 3,
+    kAllGather = 4,
   };
   struct Collective;
   struct Communicator;
@@ -184,7 +189,7 @@ class NcclManager {
   // the corresponding NCCL/CUDA error string.
   Status GetCommunicator(Collective* collective, Communicator** communicator);
 
-  // Adds a participant device to the local `Collective` instance correponding
+  // Adds a participant device to the local `Collective` instance corresponding
   // to `collective_key`.  Launches the `Collective` if it is ready, which it
   // checks by calling `CheckReady()`.  Also performs consistency and sanity
   // checks before launching.
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index e65af133891b3acdc74ee4316c5f6e35d236eb32..420e143c837f600016e66db6833ea8b58edde49d 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -73,9 +73,9 @@ class NcclManagerTest : public ::testing::Test {
 
   static void TearDownTestCase() { delete devices_; }
 
-  TestCase* MakeTestCase(int num_nodes, int num_ranks_per_node,
-                         ncclRedOp_t reduction_op, TensorShape shape,
-                         float value_offset) {
+  TestCase* MakeReductionTestCase(int num_nodes, int num_ranks_per_node,
+                                  ncclRedOp_t reduction_op, TensorShape shape,
+                                  float value_offset) {
     TestCase* test_case = new TestCase();
     test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
@@ -134,6 +134,47 @@ class NcclManagerTest : public ::testing::Test {
     return test_case;
   }
 
+  TestCase* MakeGatherTestCase(int num_nodes, int num_ranks_per_node,
+                               TensorShape in_shape, TensorShape out_shape) {
+    TestCase* test_case = new TestCase();
+    test_case->expected = Tensor(data_type_, out_shape);
+    test::FillFn<Scalar>(&test_case->expected,
+                         [](int) { return static_cast<Scalar>(0); });
+
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int node = 0; node < num_nodes; ++node) {
+      for (int i = 0; i < num_ranks_per_node; ++i) {
+        auto* device = GetDevice(i);
+        auto* stream = device->tensorflow_gpu_device_info()->stream;
+
+        Tensor in_cpu(data_type_, in_shape);
+        test::FillFn<Scalar>(&in_cpu, [&](int index) {
+          return static_cast<Scalar>((index + 1) * value_scale);
+        });
+        // Starting index for this rank's tensor in the all-gathered output.
+        int32 gather_idx =
+            (node * num_ranks_per_node + i) * in_shape.num_elements();
+        for (int j = 0; j < in_shape.num_elements(); ++j) {
+          auto in_val = in_cpu.flat<Scalar>()(j);
+          auto out_expr = test_case->expected.template flat<Scalar>();
+          out_expr(gather_idx + j) = in_val;
+        }
+
+        value_scale *= 10;
+        test_case->ins.emplace_back(GpuAllocator(device), data_type_, in_shape);
+        test_case->outs.emplace_back(GpuAllocator(device), data_type_,
+                                     out_shape);
+
+        const Tensor& in_gpu = test_case->ins.back();
+        auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+        stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
+                           in_cpu.TotalBytes());
+      }
+    }
+
+    return test_case;
+  }
+
   // Waits for the done callback to be called for each participant.
   void WaitForTestCompletion(TestCase* test_case) {
     test_case->mu.lock();
@@ -158,6 +199,9 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
+      VLOG(1) << "Verifying rank " << rank << " expected shape "
+              << test_case->expected.shape() << " out shape "
+              << out_cpu.shape();
       test::ExpectClose(test_case->expected, out_cpu);
     }
   }
@@ -218,8 +262,8 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
     std::unique_ptr<typename TestFixture::TestCase> test_case(
-        this->MakeTestCase(/*num_nodes=*/1, num_ranks, reduction_op,
-                           TensorShape({2, 3}), 0.0f));
+        this->MakeReductionTestCase(/*num_nodes=*/1, num_ranks, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
       VLOG(2) << "rank " << rank << " device " << device->name();
@@ -259,7 +303,7 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
     std::vector<std::pair<int, int>> case_and_rank;
     std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(this->MakeTestCase(
+      test_cases.emplace_back(this->MakeReductionTestCase(
           /*num_nodes=*/1, num_ranks, ncclSum,
           TensorShape({100, i % 5 + 1, i % 3 + 1}), 1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
@@ -324,6 +368,34 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
   }
 }
 
+// Test basic all-gather.
+TYPED_TEST(NcclManagerTest, BasicAllGather) {
+  const int num_ranks = 4;
+  for (int i = 0; i < num_ranks; ++i) {
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeGatherTestCase(/*num_nodes=*/1, num_ranks,
+                                 TensorShape({2, 3}),
+                                 TensorShape({2 * num_ranks, 3})));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
+      auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+      auto* stream = device->tensorflow_gpu_device_info()->stream;
+      auto participant = absl::make_unique<NcclManager::Participant>(
+          device->executor(), stream, event_mgr, device->gpu_id(),
+          &test_case->ins[rank], &test_case->outs[rank], rank,
+          this->CreateDoneCallback(test_case.get()));
+      NcclManager::instance()->AddToAllGather(
+          std::move(participant),
+          {"allgather", /*num_local_devices=*/num_ranks,
+           /*num_global_devices=*/num_ranks, /*communicator_key=*/""});
+    }
+
+    LOG(INFO) << "Verifying results";
+    this->VerifyResults(test_case.get());
+  }
+}
+
 // Multi-node NCCL tests.
 
 TEST(NcclManagerTest, CommunicatorKey) {
@@ -353,8 +425,8 @@ TYPED_TEST(NcclManagerTest, MultiNode) {
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
     std::unique_ptr<typename TestFixture::TestCase> test_case(
-        this->MakeTestCase(num_nodes, num_ranks_per_node, reduction_op,
-                           TensorShape({2, 3}), 0.0f));
+        this->MakeReductionTestCase(num_nodes, num_ranks_per_node, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
     for (int node = 0; node < num_nodes; ++node) {
       auto node_fn = [this, node, &nccl_managers, &communicator_key,
                       &collective_key, reduction_op, &test_case] {
@@ -393,8 +465,9 @@ TYPED_TEST(NcclManagerTest, MultiNode) {
 TYPED_TEST(NcclManagerTest, ConsistentCollectiveType) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
@@ -427,8 +500,9 @@ TYPED_TEST(NcclManagerTest, ConsistentCollectiveType) {
 TYPED_TEST(NcclManagerTest, ConsistentCommunicatorKey) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
@@ -454,8 +528,9 @@ TYPED_TEST(NcclManagerTest, ConsistentCommunicatorKey) {
 TYPED_TEST(NcclManagerTest, ConsistentNumberOfDevices) {
   const int num_ranks = 2;
 
-  std::unique_ptr<typename TestFixture::TestCase> test_case(this->MakeTestCase(
-      1 /* num_nodes */, num_ranks, ncclSum, TensorShape({2, 3}), 0.0f));
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
   for (int rank = 0; rank < num_ranks; ++rank) {
     auto* device = this->GetDevice(rank);
     auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 281e2996ed7c2b07881d5ab564fc31463f8f8607..8b6ee870799f082378033e4535b48407b6ed4a0d 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -347,6 +347,16 @@ REGISTER_OP("Pack")
       while (index < rank) dims.push_back(c->Dim(cur, index++));
 
       c->set_output(0, c->MakeShape(dims));
+      for (int i = 0; i < c->num_inputs(); ++i) {
+        auto* shape_and_type = c->input_handle_shapes_and_types(i);
+        if (shape_and_type) {
+          if (!c->RelaxOutputHandleShapesAndMergeTypes(0, *shape_and_type)) {
+            c->set_output_handle_shapes_and_types(
+                0, std::vector<shape_inference::ShapeAndType>({}));
+            break;
+          }
+        }
+      }
       return Status::OK();
     });
 
@@ -456,47 +466,37 @@ REGISTER_OP("BroadcastTo")
     .Attr("T: type")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle in = c->input(0);
+      ShapeHandle shape_in = c->input(1);
+      TF_RETURN_IF_ERROR(c->WithRank(shape_in, 1, &shape_in));
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
-
       if (!c->RankKnown(out)) {
         // We have no information about the shape of the output.
         c->set_output(0, out);
         return Status::OK();
       }
 
+      ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
         // We have no information about the shape of the input,
         // nothing to do here.
         c->set_output(0, out);
         return Status::OK();
       }
-      if (c->Rank(out) < c->Rank(in)) {
-        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
-                                       c->DebugString(in), " shape ",
-                                       c->DebugString(out));
-      }
-
-      int32 in_offset = c->Rank(out) - c->Rank(in);
-      for (int32 i = 0; i < c->Rank(out); ++i) {
-        DimensionHandle dim = c->Dim(out, i);
-        if (c->ValueKnown(dim)) {
-          // The first in_offset dimensions for input will be expanded with 1,
-          // so no check needed.
-          if (i >= in_offset) {
-            DimensionHandle in_dim = c->Dim(in, i - in_offset);
-            if (c->ValueKnown(in_dim) && c->Value(in_dim) != 0) {
-              if (c->Value(dim) % c->Value(in_dim) != 0) {
-                return errors::InvalidArgument(
-                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
-                    " shape ", c->DebugString(out));
-              }
-            }
-          }
+      int out_rank = c->Rank(out);
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(in, out_rank, &in));
+      int in_rank = c->Rank(in);
+      for (int i = 0; i < in_rank; ++i) {
+        auto in_dim = c->Dim(in, in_rank - i - 1);
+        if (c->Value(in_dim) > 1) {
+          // If the input dimension is greater than 1 then the output dimension
+          // must be equal to it, since we only broadcast "from left to right".
+          auto out_dim = c->Dim(out, out_rank - i - 1);
+          TF_RETURN_IF_ERROR(c->Merge(in_dim, out_dim, &out_dim));
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_rank - i - 1, out_dim, &out));
         }
       }
-
       c->set_output(0, out);
       return Status::OK();
     });
@@ -1034,6 +1034,12 @@ REGISTER_OP("Fill")
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
       c->set_output(0, out);
+
+      auto* shape_and_type = c->input_handle_shapes_and_types(1);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
@@ -1206,27 +1212,13 @@ REGISTER_OP("Identity")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Snapshot")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
@@ -1235,14 +1227,7 @@ REGISTER_OP("_MklIdentity")
     .Output("output: T")
     .Output("mkl_output: uint8")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"Doc( Mkl implementation of IdentityOp
 )Doc");
 #endif
@@ -1626,6 +1611,11 @@ REGISTER_OP("StridedSlice")
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(final_shape, &out));
       c->set_output(0, out);
 
+      auto* shape_and_type = c->input_handle_shapes_and_types(0);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 1c29cd2491fcd8d0e9d773e24e956df8212f2c7f..92648ce18876427b9c19b744f23ba787b4fff217 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -509,6 +509,33 @@ TEST(ArrayOpsTest, BroadcastArgs_ShapeFn) {
   INFER_ERROR("Shape must be rank 1 but is rank 0", op, "?;[]");
 }
 
+TEST(ArrayOpsTest, BroadcastTo_ShapeFn) {
+  ShapeInferenceTestOp op("BroadcastTo");
+  op.input_tensors.resize(2);
+
+  INFER_OK(op, "?;[?]", "?");
+  INFER_OK(op, "[];[1]", "[?]");
+  INFER_OK(op, "[1];[1]", "[?]");
+  INFER_OK(op, "[1];[2]", "[?,?]");
+  INFER_OK(op, "[2,2];[3]", "[?,d0_0,d0_1]");
+
+  // Rank checks
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[?,?]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[2];[]");
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op, "[2,2];[1]");
+
+  Tensor shape_t(DT_INT64, TensorShape{3});
+  test::FillValues<int64>(&shape_t, {2, 10, 3});
+  op.input_tensors[1] = &shape_t;
+  INFER_OK(op, "[1,?,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[1,1,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[10,1];[3]", "[2,d0_0,3]");
+  INFER_ERROR("Dimensions must be equal, but are 3 and 2 for", op,
+              "[3,1,1];[3]");
+  INFER_ERROR("Dimensions must be equal, but are 2 and 10 for", op,
+              "[2,2,1];[3]");
+}
+
 TEST(ArrayOpsTest, BroadcastGradientArgs_ShapeFn) {
   ShapeInferenceTestOp op("BroadcastGradientArgs");
   // Output is always two unknown vectors.
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index 1c854f661931a6ef26d69752708d7764107b49c6..852b8d326c192e0acd3789d8ae1489a0642d6a7f 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -400,7 +400,6 @@ REGISTER_OP("BoostedTreesMakeQuantileSummaries")
           c->WithRank(c->input(num_features), 1, &example_weights_shape));
       for (int i = 0; i < num_features; ++i) {
         ShapeHandle feature_shape;
-        DimensionHandle unused_dim;
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &feature_shape));
         // the columns are value, weight, min_rank, max_rank.
         c->set_output(i, c->MakeShape({c->UnknownDim(), 4}));
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
index e45a8a9b361183303d98f36aef25991566c6f267..06e5f14de76315eb54dfa3ad65f49d5393f8ada7 100644
--- a/tensorflow/core/ops/collective_ops.cc
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -32,6 +32,17 @@ REGISTER_OP("CollectiveReduce")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("CollectiveGather")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
 REGISTER_OP("CollectiveBcastSend")
     .Input("input: T")
     .Output("data: T")
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index c613ab144f8824586121200b3f89c87b25cc7522..5ffb8cf9a104ebd829c0f872f4fa9c340023f16e 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -37,6 +37,7 @@ tf_cc_test(
     data = [
         ":ops_history.v0.pbtxt",
         ":ops_history.v1.pbtxt",
+        ":ops_history.v2.pbtxt",
         "//tensorflow/core:ops/ops.pbtxt",
     ],
     deps = [
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 3b7a7b812aa9522ccaed64a47dd9e2fe764c90c6..994ac4b0e8f3aa2c9219743f7dbd92727856465b 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1546,6 +1546,96 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -12387,6 +12477,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -12624,6 +12754,55 @@ op {
     }
   }
 }
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "CholeskyGrad"
+  input_arg {
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CholeskyGrad"
   input_arg {
@@ -12643,12 +12822,71 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
 }
+op {
+  name: "ChooseFastestBranchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "ratio_numerator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "ratio_denominator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "num_elements_per_branch"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "other_arguments_lengths"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ClipByValue"
   input_arg {
@@ -12780,7 +13018,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "CollectiveReduce"
+  name: "CollectiveGather"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -12815,32 +13053,113 @@ op {
     type: "int"
   }
   attr {
-    name: "merge_op"
-    type: "string"
-    allowed_values {
-      list {
-        s: "Min"
-        s: "Max"
-        s: "Mul"
-        s: "Add"
-      }
-    }
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "final_op"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "Id"
-        s: "Div"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "subdiv_offsets"
-    type: "list(int)"
-  }
-  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
 }
 op {
   name: "CollectiveReduce"
@@ -12913,6 +13232,56 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CombinedNonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size_per_class"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "max_total_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_scores"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_classes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "valid_detections"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_per_class"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -13475,6 +13844,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -16232,124 +16630,59 @@ op {
   }
 }
 op {
-  name: "CudnnRNN"
+  name: "CrossReplicaSum"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "input_h"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_c"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "params"
-    type_attr: "T"
+    name: "group_assignment"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_h"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_c"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "reserve_space"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "rnn_mode"
-    type: "string"
-    default_value {
-      s: "lstm"
-    }
-    allowed_values {
-      list {
-        s: "rnn_relu"
-        s: "rnn_tanh"
-        s: "lstm"
-        s: "gru"
-      }
-    }
+}
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "input_mode"
-    type: "string"
-    default_value {
-      s: "linear_input"
-    }
-    allowed_values {
-      list {
-        s: "linear_input"
-        s: "skip_input"
-        s: "auto_select"
-      }
-    }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "direction"
-    type: "string"
-    default_value {
-      s: "unidirectional"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "unidirectional"
-        s: "bidirectional"
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
       }
     }
   }
-  attr {
-    name: "dropout"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "CudnnRNNBackprop"
+  name: "CudnnRNN"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -16366,48 +16699,165 @@ op {
     name: "params"
     type_attr: "T"
   }
-  input_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_h"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_c"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_backprop"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_h_backprop"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_c_backprop"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reserve_space"
-    type_attr: "T"
-  }
   output_arg {
-    name: "input_backprop"
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "input_h_backprop"
+    name: "output_h"
     type_attr: "T"
   }
   output_arg {
-    name: "input_c_backprop"
+    name: "output_c"
     type_attr: "T"
   }
   output_arg {
-    name: "params_backprop"
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackprop"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -16775,31 +17225,73 @@ op {
   is_stateful: true
 }
 op {
-  name: "CudnnRNNCanonicalToParams"
+  name: "CudnnRNNBackpropV3"
   input_arg {
-    name: "num_layers"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_units"
-    type: DT_INT32
+    name: "input_h"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_size"
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
     type: DT_INT32
   }
   input_arg {
-    name: "weights"
+    name: "output"
     type_attr: "T"
-    number_attr: "num_params"
   }
   input_arg {
-    name: "biases"
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
     type_attr: "T"
-    number_attr: "num_params"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
   }
   output_arg {
-    name: "params"
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
     type_attr: "T"
   }
   attr {
@@ -16813,12 +17305,6 @@ op {
       }
     }
   }
-  attr {
-    name: "num_params"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
     name: "rnn_mode"
     type: "string"
@@ -16882,9 +17368,17 @@ op {
       i: 0
     }
   }
+  attr {
+    name: "time_major"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "CudnnRNNParamsSize"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
     name: "num_layers"
     type: DT_INT32
@@ -16897,9 +17391,19 @@ op {
     name: "input_size"
     type: DT_INT32
   }
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  input_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
   output_arg {
-    name: "params_size"
-    type_attr: "S"
+    name: "params"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -16913,14 +17417,10 @@ op {
     }
   }
   attr {
-    name: "S"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "rnn_mode"
@@ -16987,7 +17487,7 @@ op {
   }
 }
 op {
-  name: "CudnnRNNParamsToCanonical"
+  name: "CudnnRNNParamsSize"
   input_arg {
     name: "num_layers"
     type: DT_INT32
@@ -17000,19 +17500,9 @@ op {
     name: "input_size"
     type: DT_INT32
   }
-  input_arg {
-    name: "params"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "weights"
-    type_attr: "T"
-    number_attr: "num_params"
-  }
   output_arg {
-    name: "biases"
-    type_attr: "T"
-    number_attr: "num_params"
+    name: "params_size"
+    type_attr: "S"
   }
   attr {
     name: "T"
@@ -17026,10 +17516,123 @@ op {
     }
   }
   attr {
-    name: "num_params"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsToCanonical"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "rnn_mode"
@@ -17341,6 +17944,138 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CudnnRNNV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "time_major"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cumprod"
   input_arg {
@@ -17979,6 +18714,30 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -19016,6 +19775,43 @@ op {
     }
   }
 }
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "DecodeWav"
   input_arg {
@@ -21194,6 +21990,34 @@ op {
     }
   }
 }
+op {
+  name: "DivNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DrawBoundingBoxes"
   input_arg {
@@ -21728,6 +22552,294 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -22035,6 +23147,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -22183,6 +23355,37 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalAutoShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalBytesProducedStatsDataset"
   input_arg {
@@ -22296,6 +23499,61 @@ op {
     name: "num_experiments"
     type: "int"
   }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
   name: "ExperimentalDatasetCardinality"
@@ -22323,6 +23581,22 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
   input_arg {
@@ -23406,6 +24680,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalScanDataset"
   input_arg {
@@ -23931,58 +25232,49 @@ op {
   }
 }
 op {
-  name: "ExtractImagePatches"
+  name: "ExtractGlimpse"
   input_arg {
-    name: "images"
-    type_attr: "T"
+    name: "input"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "patches"
-    type_attr: "T"
+  input_arg {
+    name: "size"
+    type: DT_INT32
   }
-  attr {
-    name: "ksizes"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
   attr {
-    name: "padding"
+    name: "noise"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: "uniform"
     }
   }
 }
@@ -24028,8 +25320,6 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -24088,7 +25378,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -24139,15 +25428,15 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24163,33 +25452,9 @@ op {
   }
 }
 op {
-  name: "ExtractJpegShape"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image_shape"
-    type_attr: "output_type"
-  }
-  attr {
-    name: "output_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "ExtractVolumePatches"
+  name: "ExtractImagePatches"
   input_arg {
-    name: "input"
+    name: "images"
     type_attr: "T"
   }
   output_arg {
@@ -24200,13 +25465,96 @@ op {
     name: "ksizes"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractJpegShape"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image_shape"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ExtractVolumePatches"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "T"
@@ -29136,6 +30484,122 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueuePrelinearizedBuffer"
+  input_arg {
+    name: "input"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -31481,2044 +32945,2122 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "Log1p"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "velocities"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log1p"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogMatrixDeterminant"
+  name: "LoadTPUEmbeddingAdadeltaParameters"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
-  }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "updates"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "accumulators"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mg"
+    type: DT_FLOAT
   }
   attr {
-    name: "range_max"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "seed2"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   is_stateful: true
 }
 op {
-  name: "LogicalAnd"
+  name: "LoadTPUEmbeddingFTRLParameters"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LogicalNot"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "linears"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogicalOr"
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExportV2"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "momenta"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableImportV2"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
+  name: "LoadTPUEmbeddingRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
   }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "LookupTableInsertV2"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "type"
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  is_stateful: true
-}
-op {
-  name: "LookupTableRemoveV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "LookupTableSizeV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LoopCond"
+  name: "Log"
   input_arg {
-    name: "input"
-    type: DT_BOOL
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "LowerBound"
-  input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
-  }
+  name: "Log"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Lu"
+  name: "Log"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "lu"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "output_idx_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MakeIterator"
+  name: "Log1p"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LogMatrixDeterminant"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sign"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "log_abs_determinant"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "LogSoftmax"
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "logits"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "logsoftmax"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "MapDataset"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: true
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
 }
 op {
-  name: "MapDataset"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
+    name: "seed2"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MapDefun"
+  name: "LogicalAnd"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "x"
+    type: DT_BOOL
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "y"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MapDefun"
+  name: "LogicalOr"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "Tcaptured"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tvalues"
+    type: "type"
   }
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tvalues"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "LookupTableFind"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
     name: "values"
-    type_list_attr: "dtypes"
+    type_attr: "Tout"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tout"
+    type: "type"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "LookupTableImport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "LookupTableImportV2"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
   }
   input_arg {
     name: "values"
-    type_list_attr: "fake_dtypes"
+    type_attr: "Tout"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tout"
+    type: "type"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "MapUnstage"
+  name: "LookupTableInsertV2"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
+  input_arg {
     name: "values"
-    type_list_attr: "dtypes"
+    type_attr: "Tout"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "Tout"
+    type: "type"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MapUnstageNoKey"
+  name: "LookupTableSize"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "key"
+    name: "size"
     type: DT_INT64
   }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "size"
+    type: DT_INT64
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
   }
-  is_stateful: true
 }
 op {
-  name: "MatMul"
+  name: "LowerBound"
   input_arg {
-    name: "a"
+    name: "sorted_inputs"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "values"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatMul"
+  name: "Lu"
   input_arg {
-    name: "a"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "b"
+  output_arg {
+    name: "lu"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "p"
+    type_attr: "output_idx_type"
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
+    name: "output_idx_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatMul"
+  name: "Lu"
   input_arg {
-    name: "a"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "b"
+  output_arg {
+    name: "lu"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "p"
+    type_attr: "output_idx_type"
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
+    name: "output_idx_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatMul"
+  name: "MakeIterator"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "b"
-    type_attr: "T"
+    name: "iterator"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+  is_stateful: true
+}
+op {
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatchingFiles"
+  name: "MapDataset"
   input_arg {
-    name: "pattern"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "filenames"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-}
-op {
-  name: "MatrixBandPart"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "num_lower"
-    type: DT_INT64
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "num_upper"
-    type: DT_INT64
+  attr {
+    name: "f"
+    type: "func"
   }
-  output_arg {
-    name: "band"
-    type_attr: "T"
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixBandPart"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "MapDataset"
   input_arg {
-    name: "num_lower"
-    type_attr: "Tindex"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "num_upper"
-    type_attr: "Tindex"
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "band"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "Tindex"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "MapDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-}
-op {
-  name: "MatrixDeterminant"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "MatrixDiag"
-  input_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixDiagPart"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
 }
 op {
-  name: "MatrixExponential"
+  name: "MapDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "f"
+    type: "func"
   }
-  deprecation {
-    version: 27
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "adjoint"
+    name: "use_inter_op_parallelism"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "MatrixInverse"
+  name: "MapDefun"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
   }
 }
 op {
-  name: "MatrixLogarithm"
+  name: "MapDefun"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixSetDiag"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "diagonal"
-    type_attr: "T"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "f"
+    type: "func"
   }
 }
 op {
-  name: "MatrixSolve"
+  name: "MapDefun"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "arguments"
+    type_list_attr: "Targuments"
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
-    }
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "fast"
-    type: "bool"
+    name: "max_intra_op_parallelism"
+    type: "int"
     default_value {
-      b: true
+      i: 1
     }
   }
 }
 op {
-  name: "MatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
-  }
+  name: "MapIncompleteSize"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "fast"
-    type: "bool"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixSquareRoot"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "dtypes"
+    type: "list(type)"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "MapPeek"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "lower"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MatrixTriangularSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
+  name: "MapSize"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "lower"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
+  name: "MapStage"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
+  name: "MapUnstageNoKey"
   input_arg {
-    name: "input"
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "b"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
     type: "bool"
     default_value {
       b: false
@@ -33529,55 +35071,39 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Max"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "b"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
     type: "bool"
     default_value {
       b: false
@@ -33588,56 +35114,84 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "transpose_b"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Max"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "b"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
     type: "bool"
     default_value {
       b: false
@@ -33648,31 +35202,79 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
+        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
   attr {
-    name: "Tidx"
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -33683,7 +35285,7 @@ op {
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33695,54 +35297,39 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33754,61 +35341,49 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "T"
+    type: "type"
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatrixExponential"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33820,63 +35395,44 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_QINT8
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
-    }
+  deprecation {
+    version: 27
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatrixExponential"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33888,64 +35444,50 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  deprecation {
+    version: 27
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: "NHWC"
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixInverse"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33955,25 +35497,10 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
@@ -33981,13 +35508,16 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixInverse"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33997,38 +35527,10 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
   attr {
@@ -34036,13 +35538,17 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixLogarithm"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -34051,95 +35557,55 @@ op {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixSetDiag"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
   attr {
@@ -34147,25 +35613,22 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
+  name: "MatrixSolve"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -34173,25 +35636,10 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
@@ -34199,164 +35647,183 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "rhs"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "fast"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: true
     }
   }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "rhs"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "fast"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: true
     }
   }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "TInput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -34364,79 +35831,79 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "lower"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
+      b: false
     }
   }
   attr {
-    name: "TInput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -34444,185 +35911,230 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "TInput"
+    name: "Tidx"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "Max"
   input_arg {
-    name: "orig_output"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "Max"
   input_arg {
-    name: "orig_output"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NDHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -34634,34 +36146,57 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34697,6 +36232,17 @@ op {
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34706,29 +36252,17 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34764,6 +36298,17 @@ op {
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34781,28 +36326,10 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_QINT8
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34835,9 +36362,21 @@ op {
       list {
         s: "NHWC"
         s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34846,6 +36385,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -34854,31 +36395,10 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_QINT8
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34911,45 +36431,57 @@ op {
       list {
         s: "NHWC"
         s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -34960,13 +36492,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34982,51 +36514,29 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -35037,13 +36547,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -35059,12 +36569,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -35073,31 +36583,16 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -35108,13 +36603,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -35130,12 +36625,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -35144,30 +36639,22 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "grad"
@@ -35181,13 +36668,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -35199,49 +36686,25 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "grad"
@@ -35255,13 +36718,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -35277,12 +36740,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -35292,47 +36755,40 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35347,59 +36803,70 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35414,61 +36881,72 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35483,38 +36961,46 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
         type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGradGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35527,18 +37013,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35553,12 +37043,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -35568,34 +37058,23 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPool3DGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35605,13 +37084,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -35624,12 +37103,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -35641,29 +37123,32 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35692,48 +37177,45 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35762,18 +37244,24 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -35785,26 +37273,23 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35833,28 +37318,33 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -35864,7 +37354,7 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35877,18 +37367,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35929,12 +37423,15 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35947,23 +37444,27 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
         s: "SAME"
         s: "VALID"
       }
@@ -35993,10 +37494,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -36006,7 +37508,7 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGradGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -36019,18 +37521,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -36057,9 +37563,6 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -36071,15 +37574,12 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGradGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -36092,18 +37592,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -36130,19 +37634,15 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -36152,18 +37652,18 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -36192,42 +37692,52 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -36256,12 +37766,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -36273,45 +37786,44 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
+  input_arg {
     name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "padding"
@@ -36324,12 +37836,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -36347,41 +37862,35 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
+  input_arg {
     name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "padding"
@@ -36394,12 +37903,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -36419,40 +37931,35 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
+  input_arg {
     name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "padding"
@@ -36465,12 +37972,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -36482,23 +37992,31 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -36513,26 +38031,6 @@ op {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
   attr {
     name: "padding"
     type: "string"
@@ -36556,150 +38054,45 @@ op {
       }
     }
   }
-}
-op {
-  name: "MaxPoolV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
-        type: DT_QINT8
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
-    }
-  }
-}
-op {
-  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36713,56 +38106,61 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36776,25 +38174,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36812,24 +38207,30 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36843,25 +38244,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36881,24 +38279,29 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36912,25 +38315,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36942,33 +38342,37 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36982,25 +38386,29 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -37026,651 +38434,694 @@ op {
   }
 }
 op {
-  name: "Maximum"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "x"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "Maximum"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Maximum"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "x"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "orig_output"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  is_commutative: true
-}
-op {
-  name: "Mean"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_FLOAT
     }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Mean"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Mean"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Mean"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_FLOAT
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Merge"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "inputs"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeSummary"
   input_arg {
-    name: "inputs"
-    type: DT_STRING
-    number_attr: "N"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "Mfcc"
-  input_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
+    minimum: 4
   }
   attr {
-    name: "upper_frequency_limit"
-    type: "float"
-    default_value {
-      f: 4000
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "lower_frequency_limit"
-    type: "float"
-    default_value {
-      f: 20
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "filterbank_channel_count"
-    type: "int"
-    default_value {
-      i: 40
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "dct_coefficient_count"
-    type: "int"
-    default_value {
-      i: 13
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "include_batch_in_index"
     type: "bool"
     default_value {
       b: false
     }
   }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -37682,146 +39133,238 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "MirrorPad"
+  name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tpaddings"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -37831,39 +39374,60 @@ op {
     }
   }
   attr {
-    name: "mode"
+    name: "padding"
     type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MirrorPadGrad"
+  name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tpaddings"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -37873,161 +39437,321 @@ op {
     }
   }
   attr {
-    name: "mode"
+    name: "padding"
     type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "T"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "T"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "T"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_HALF
-        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "ModelDataset"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
-}
-op {
-  name: "Mul"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  attr {
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
+        type: DT_INT8
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Mul"
+  name: "Maximum"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -38046,24 +39770,17 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   is_commutative: true
 }
 op {
-  name: "Mul"
+  name: "Maximum"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -38081,177 +39798,66 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   is_commutative: true
 }
 op {
-  name: "MultiDeviceIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "devices"
-    type: "list(string)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-  }
-  attr {
-    name: "container"
-    type: "string"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorFromStringHandle"
   input_arg {
-    name: "string_handle"
-    type: DT_STRING
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
-    has_minimum: true
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorGetNextFromShard"
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "shard_num"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "incarnation_id"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorInit"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "max_buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "incarnation_id"
-    type: DT_INT64
   }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorToStringHandle"
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -38261,44 +39867,54 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
-  is_stateful: true
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -38308,46 +39924,56 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  is_stateful: true
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "output_dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -38357,12 +39983,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -38371,10 +40002,10 @@ op {
     }
   }
   attr {
-    name: "output_dtype"
+    name: "Tidx"
     type: "type"
     default_value {
-      type: DT_INT64
+      type: DT_INT32
     }
     allowed_values {
       list {
@@ -38383,34 +40014,26 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "output_dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -38424,9 +40047,14 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -38434,10 +40062,10 @@ op {
     }
   }
   attr {
-    name: "output_dtype"
+    name: "Tidx"
     type: "type"
     default_value {
-      type: DT_INT64
+      type: DT_INT32
     }
     allowed_values {
       list {
@@ -38446,360 +40074,409 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
+  name: "Merge"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+}
+op {
+  name: "MergeSummary"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  output_arg {
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
   }
   attr {
-    name: "initial_num_buckets"
-    type: "int"
+    name: "delete_old_dirs"
+    type: "bool"
     default_value {
-      i: 131072
+      b: true
     }
   }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
   attr {
-    name: "max_load_factor"
-    type: "float"
+    name: "delete_old_dirs"
+    type: "bool"
     default_value {
-      f: 0.8
+      b: true
     }
   }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTableV2"
+  name: "Mfcc"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "spectrogram"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "deleted_key"
-    type_attr: "key_dtype"
+    name: "sample_rate"
+    type: DT_INT32
   }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "upper_frequency_limit"
+    type: "float"
     default_value {
-      b: false
+      f: 4000
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
+    name: "lower_frequency_limit"
+    type: "float"
     default_value {
-      shape {
-      }
+      f: 20
     }
   }
   attr {
-    name: "initial_num_buckets"
+    name: "filterbank_channel_count"
     type: "int"
     default_value {
-      i: 131072
+      i: 40
     }
   }
   attr {
-    name: "max_load_factor"
-    type: "float"
+    name: "dct_coefficient_count"
+    type: "int"
     default_value {
-      f: 0.8
+      i: 13
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTable"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensors"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensorsV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MutexLock"
+  name: "Minimum"
   input_arg {
-    name: "mutex"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
-  output_arg {
-    name: "mutex_lock"
-    type: DT_VARIANT
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MutexV2"
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "NcclAllReduce"
+  name: "Minimum"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  output_arg {
-    name: "data"
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "reduction"
-    type: "string"
-    allowed_values {
-      list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
-      }
-    }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -38807,6 +40484,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -38814,24 +40492,20 @@ op {
       }
     }
   }
-  attr {
-    name: "num_devices"
-    type: "int"
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-  }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "NcclBroadcast"
+  name: "Minimum"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -38839,6 +40513,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -38847,648 +40522,593 @@ op {
       }
     }
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "NcclReduce"
+  name: "MirrorPad"
   input_arg {
     name: "input"
     type_attr: "T"
-    number_attr: "num_devices"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "data"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "reduction"
-    type: "string"
-    allowed_values {
-      list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tpaddings"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
   attr {
-    name: "num_devices"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "NearestNeighbors"
-  input_arg {
-    name: "points"
-    type: DT_FLOAT
-  }
+  name: "MirrorPadGrad"
   input_arg {
-    name: "centers"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "k"
-    type: DT_INT64
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "nearest_center_indices"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "nearest_center_distances"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
   }
 }
 op {
-  name: "Neg"
+  name: "Mod"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Neg"
+  name: "Mod"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Neg"
+  name: "Mod"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "NegTrain"
+  name: "ModelDataset"
   input_arg {
-    name: "w_in"
-    type: DT_FLOAT
-    is_ref: true
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "w_out"
-    type: DT_FLOAT
-    is_ref: true
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "examples"
-    type: DT_INT32
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "labels"
-    type: DT_INT32
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "ModelDataset"
   input_arg {
-    name: "lr"
-    type: DT_FLOAT
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  attr {
-    name: "vocab_count"
-    type: "list(int)"
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "num_negative_samples"
+    name: "cpu_budget"
     type: "int"
+    default_value {
+      i: 0
+    }
   }
-  deprecation {
-    version: 19
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "NextAfter"
+  name: "Mul"
   input_arg {
-    name: "x1"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "x2"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_HALF
         type: DT_FLOAT
-      }
-    }
-  }
-}
-op {
-  name: "NextIteration"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "NoOp"
-}
-op {
-  name: "NonMaxSuppression"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
+  name: "Mul"
   input_arg {
-    name: "scores"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "iou_threshold"
-    type: "float"
-    default_value {
-      f: 0.5
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "NonMaxSuppressionV2"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
-}
-op {
-  name: "NonMaxSuppressionV2"
+  name: "Mul"
   input_arg {
-    name: "boxes"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "scores"
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "NonMaxSuppressionV3"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
+  name: "MulNoNan"
   input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "NonMaxSuppressionV3"
+  name: "MulNoNan"
   input_arg {
-    name: "boxes"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "scores"
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
-  }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "NonMaxSuppressionV4"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+  name: "MultiDeviceIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
   }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "devices"
+    type: "list(string)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "shared_name"
+    type: "string"
   }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  attr {
+    name: "container"
+    type: "string"
   }
-  output_arg {
-    name: "valid_outputs"
-    type: DT_INT32
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "pad_to_max_output_size"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionV4"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "scores"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
+  name: "MultiDeviceIteratorFromStringHandle"
   input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "string_handle"
+    type: DT_STRING
   }
   output_arg {
-    name: "valid_outputs"
-    type: DT_INT32
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
       }
     }
+    has_minimum: true
   }
   attr {
-    name: "pad_to_max_output_size"
-    type: "bool"
+    name: "output_shapes"
+    type: "list(shape)"
     default_value {
-      b: false
+      list {
+      }
     }
+    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionWithOverlaps"
-  input_arg {
-    name: "overlaps"
-    type: DT_FLOAT
-  }
+  name: "MultiDeviceIteratorGetNextFromShard"
   input_arg {
-    name: "scores"
-    type: DT_FLOAT
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_output_size"
+    name: "shard_num"
     type: DT_INT32
   }
   input_arg {
-    name: "overlap_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+    name: "incarnation_id"
+    type: DT_INT64
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "NotEqual"
+  name: "MultiDeviceIteratorInit"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "max_buffer_size"
+    type: DT_INT64
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
-      }
-    }
+  output_arg {
+    name: "incarnation_id"
+    type: DT_INT64
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "NotEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "MultiDeviceIteratorToStringHandle"
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
-      }
-    }
+    name: "string_handle"
+    type: DT_STRING
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "NotEqual"
+  name: "Multinomial"
   input_arg {
-    name: "x"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "num_samples"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "Multinomial"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
+    name: "num_samples"
     type: DT_INT32
   }
   output_arg {
-    name: "values"
-    type_attr: "T"
+    name: "output"
+    type: DT_INT64
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
@@ -39510,26 +41130,34 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "Multinomial"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
+    name: "num_samples"
     type: DT_INT32
   }
   output_arg {
-    name: "values"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_dtype"
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
@@ -39552,118 +41180,94 @@ op {
       }
     }
   }
-}
-op {
-  name: "NthElement"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "n"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_attr: "T"
-  }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "output_dtype"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT64
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
         type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "OneHot"
-  input_arg {
-    name: "indices"
-    type_attr: "TI"
-  }
-  input_arg {
-    name: "depth"
-    type: DT_INT32
-  }
+  name: "Multinomial"
   input_arg {
-    name: "on_value"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "off_value"
-    type_attr: "T"
+    name: "num_samples"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "output_dtype"
   }
   attr {
-    name: "axis"
+    name: "seed"
     type: "int"
     default_value {
-      i: -1
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "TI"
+    name: "output_dtype"
     type: "type"
     default_value {
       type: DT_INT64
     }
     allowed_values {
       list {
-        type: DT_UINT8
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "OneShotIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "dataset_factory"
-    type: "func"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -39679,244 +41283,118 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "key_dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_BOOL
-      }
-    }
-  }
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "value_dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_BOOL
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
       }
     }
   }
-}
-op {
-  name: "OptimizeDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "optimizations"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "OptionalFromValue"
+  name: "MutableDenseHashTableV2"
   input_arg {
-    name: "components"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "optional"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "empty_key"
+    type_attr: "key_dtype"
   }
-}
-op {
-  name: "OptionalGetValue"
   input_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "deleted_key"
+    type_attr: "key_dtype"
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "OptionalHasValue"
-  input_arg {
-    name: "optional"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "has_value"
-    type: DT_BOOL
-  }
-}
-op {
-  name: "OptionalNone"
-  output_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-}
-op {
-  name: "OrderedMapClear"
   attr {
-    name: "capacity"
-    type: "int"
+    name: "shared_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "value_shape"
+    type: "shape"
     default_value {
-      s: ""
+      shape {
+      }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
+    name: "initial_num_buckets"
     type: "int"
     default_value {
-      i: 0
+      i: 131072
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "max_load_factor"
+    type: "float"
     default_value {
-      i: 0
+      f: 0.8
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -39932,43 +41410,29 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapPeek"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -39984,33 +41448,36 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
     default_value {
-      i: 0
+      shape {
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -40026,47 +41493,36 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapStage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -40082,43 +41538,40 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstage"
+  name: "MutexLock"
   input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "mutex"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "mutex_lock"
+    type: DT_VARIANT
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -40137,706 +41590,519 @@ op {
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstageNoKey"
+  name: "NcclAllReduce"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "data"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_devices"
+    type: "int"
   }
   attr {
     name: "shared_name"
     type: "string"
-    default_value {
-      s: ""
-    }
   }
   is_stateful: true
 }
 op {
-  name: "Pack"
+  name: "NcclBroadcast"
   input_arg {
-    name: "values"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "shape"
+    type: "shape"
   }
+  is_stateful: true
 }
 op {
-  name: "Pad"
+  name: "NcclReduce"
   input_arg {
     name: "input"
     type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    number_attr: "num_devices"
   }
   output_arg {
-    name: "output"
+    name: "data"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
   }
   attr {
-    name: "Tpaddings"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
-  name: "PadV2"
+  name: "NearestNeighbors"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "points"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "centers"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "constant_values"
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
+  name: "Neg"
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
+  name: "Neg"
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "PaddedBatchDatasetV2"
+  name: "NegTrain"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+    name: "examples"
+    type: DT_INT32
   }
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "labels"
+    type: DT_INT32
   }
   input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "PaddingFIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "lr"
+    type: DT_FLOAT
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "vocab_count"
+    type: "list(int)"
   }
   attr {
-    name: "capacity"
+    name: "num_negative_samples"
     type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  deprecation {
+    version: 19
   }
   is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueueV2"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "x2"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ParallelConcat"
+  name: "NextIteration"
   input_arg {
-    name: "values"
+    name: "data"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
     name: "T"
     type: "type"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
 }
 op {
-  name: "ParallelDynamicStitch"
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
-  }
+  name: "NoOp"
+}
+op {
+  name: "NonDeterministicInts"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    number_attr: "N"
+    name: "shape"
+    type_attr: "shape_dtype"
   }
   output_arg {
-    name: "merged"
-    type_attr: "T"
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
   }
   attr {
-    name: "T"
+    name: "shape_dtype"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "NonMaxSuppression"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "max_output_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "iou_threshold"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
   }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "NonMaxSuppressionV2"
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "selected_indices"
+    type: DT_INT32
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV2"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV3"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV3"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV4"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
   }
   attr {
-    name: "sloppy"
+    name: "pad_to_max_output_size"
     type: "bool"
     default_value {
       b: false
@@ -40844,60 +42110,50 @@ op {
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV4"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
   attr {
-    name: "preserve_cardinality"
+    name: "pad_to_max_output_size"
     type: "bool"
     default_value {
       b: false
@@ -40905,110 +42161,87 @@ op {
   }
 }
 op {
-  name: "ParameterizedTruncatedNormal"
+  name: "NonMaxSuppressionWithOverlaps"
   input_arg {
-    name: "shape"
-    type_attr: "T"
+    name: "overlaps"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "means"
-    type_attr: "dtype"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "minvals"
-    type_attr: "dtype"
+    name: "overlap_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "ParameterizedTruncatedNormal"
+  name: "NotEqual"
   input_arg {
-    name: "shape"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "minvals"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -41016,185 +42249,451 @@ op {
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "ParseExample"
+  name: "NthElement"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "names"
-    type: DT_STRING
+    name: "n"
+    type: DT_INT32
   }
-  input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
+}
+op {
+  name: "NthElement"
   input_arg {
-    name: "dense_keys"
-    type: DT_STRING
-    number_attr: "Ndense"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
+    name: "n"
+    type: DT_INT32
   }
   output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "Nsparse"
+    name: "values"
+    type_attr: "T"
   }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nsparse"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
-  output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "Nsparse"
-    type: "int"
-    has_minimum: true
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
   }
   attr {
-    name: "Ndense"
-    type: "int"
-    has_minimum: true
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_INT64
-        type: DT_STRING
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "OneHot"
+  input_arg {
+    name: "indices"
+    type_attr: "TI"
+  }
+  input_arg {
+    name: "depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "on_value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "off_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
+    name: "axis"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_UINT8
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
       }
     }
   }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
   attr {
-    name: "dense_shapes"
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
     type: "list(shape)"
     has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParseSequenceExample"
+  name: "OnesLike"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "debug_name"
-    type: DT_STRING
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
-  input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
+    name: "y"
+    type_attr: "T"
   }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
   }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+    name: "y"
+    type_attr: "T"
   }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
   }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+}
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
   }
   output_arg {
-    name: "feature_list_dense_lengths"
-    type: DT_INT64
-    number_attr: "Nfeature_list_dense"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: "list(string)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "context_sparse_keys"
-    type: "list(string)"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalFromValue"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
   }
   attr {
-    name: "context_dense_keys"
-    type: "list(string)"
+    name: "Toutput_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalGetValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "feature_list_sparse_keys"
-    type: "list(string)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "feature_list_dense_keys"
-    type: "list(string)"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalHasValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "has_value"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "OptionalNone"
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
   }
+}
+op {
+  name: "OrderedMapClear"
   attr {
-    name: "Ncontext_sparse"
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -41202,7 +42701,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Ncontext_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -41210,7 +42709,33 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -41218,7 +42743,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -41226,2053 +42751,5367 @@ op {
     has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
+    name: "dtypes"
     type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "capacity"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "feature_list_dense_types"
+    name: "dtypes"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Pack"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Pad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddingFIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  output_arg {
+    name: "feature_list_dense_lengths"
+    type: DT_INT64
+    number_attr: "Nfeature_list_dense"
+  }
+  attr {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "num_sparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Prelinearize"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "PrelinearizeTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+    allowed_values {
+      list {
+        s: "stdout"
+        s: "stderr"
+        s: "log(info)"
+        s: "log(warning)"
+        s: "log(error)"
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
   attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
+    name: "full_matrices"
+    type: "bool"
     default_value {
-      list {
-      }
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
-    has_minimum: true
   }
 }
 op {
-  name: "ParseSingleExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "num_sparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "num_sparse"
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "num_sparse"
+    name: "num_bits"
     type: "int"
-    has_minimum: true
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
     }
   }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
 }
 op {
-  name: "ParseSingleSequenceExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "context_sparse_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_sparse"
-  }
-  input_arg {
-    name: "context_dense_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_dense"
-  }
-  input_arg {
-    name: "feature_list_sparse_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_sparse"
-  }
-  input_arg {
-    name: "feature_list_dense_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_dense"
-  }
-  input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
-  }
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "debug_name"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
-  }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
-  }
-  output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
-  }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
-  }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Ncontext_sparse"
-    type: "int"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      i: 0
+      b: true
     }
-    has_minimum: true
   }
   attr {
-    name: "Ncontext_dense"
+    name: "num_bits"
     type: "int"
     default_value {
-      i: 0
+      i: 8
     }
-    has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
-    type: "int"
+    name: "range_given"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "Nfeature_list_dense"
-    type: "int"
+    name: "input_min"
+    type: "float"
     default_value {
-      i: 0
+      f: 0
     }
-    has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
-    type: "list(type)"
+    name: "input_max"
+    type: "float"
     default_value {
-      list {
-      }
+      f: 0
     }
-    has_minimum: true
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
+  deprecation {
+    version: 21
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      b: true
     }
   }
   attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
+    name: "num_bits"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 8
     }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
+    name: "input_min"
+    type: "float"
     default_value {
-      list {
-      }
+      f: 0
     }
-    has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
+    name: "input_max"
+    type: "float"
     default_value {
-      list {
-      }
+      f: 0
     }
-    has_minimum: true
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  deprecation {
+    version: 22
   }
 }
 op {
-  name: "ParseTensor"
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
-}
-op {
-  name: "PartitionedCall"
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
   }
 }
 op {
-  name: "PartitionedCall"
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
+    type_attr: "T"
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "config"
-    type: "string"
+    name: "input_min"
+    type: "float"
     default_value {
-      s: ""
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
+  deprecation {
+    version: 22
+  }
 }
 op {
-  name: "PartitionedCall"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    type_attr: "T"
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "config"
-    type: "string"
+    name: "range_given"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "executor_type"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
 }
 op {
-  name: "PartitionedCall"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "config"
-    type: "string"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "config_proto"
-    type: "string"
+    name: "num_bits"
+    type: "int"
     default_value {
-      s: ""
+      i: 8
     }
   }
   attr {
-    name: "executor_type"
-    type: "string"
+    name: "range_given"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
-}
-op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-        unknown_rank: true
-      }
-    }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
-}
-op {
-  name: "PlaceholderV2"
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  deprecation {
-    version: 23
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "PlaceholderWithDefault"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
+    type_attr: "T"
   }
-}
-op {
-  name: "Polygamma"
   input_arg {
-    name: "a"
+    name: "input_min"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
+    name: "input_max"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "PopulationCount"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
       }
     }
   }
 }
 op {
-  name: "PopulationCount"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
-    name: "x"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
     type_attr: "T"
   }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
-    name: "y"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
     type_attr: "T"
   }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
     type_attr: "T"
   }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "QuantizeDownAndShrinkRange"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "PrefetchDataset"
+  name: "QuantizeDownAndShrinkRange"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "PrefetchDataset"
+  name: "QuantizeV2"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
   }
 }
 op {
-  name: "PreventGradient"
+  name: "QuantizeV2"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "message"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
 }
 op {
-  name: "Print"
+  name: "QuantizeV2"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "message"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
     }
-  }
-  attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "round_mode"
+    type: "string"
     default_value {
-      i: 3
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Print"
+  name: "QuantizeV2"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "message"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
     }
-  }
-  attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "round_mode"
+    type: "string"
     default_value {
-      i: 3
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "PrintV2"
+  name: "QuantizedAdd"
   input_arg {
-    name: "input"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_stream"
-    type: "string"
-    default_value {
-      s: "stderr"
-    }
+    name: "T1"
+    type: "type"
     allowed_values {
       list {
-        s: "stdout"
-        s: "stderr"
-        s: "log(info)"
-        s: "log(warning)"
-        s: "log(error)"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "PrintV2"
-  input_arg {
-    name: "input"
-    type: DT_STRING
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_stream"
-    type: "string"
+    name: "Toutput"
+    type: "type"
     default_value {
-      s: "stderr"
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "PriorityQueue"
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
+    name: "T1"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "Toutput"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_QINT32
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "PriorityQueueV2"
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "ksize"
+    type: "list(int)"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "shared_name"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "QuantizedAvgPool"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "Prod"
+  name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "t"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "result"
+    type_attr: "out_type"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
 }
 op {
-  name: "Prod"
+  name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "t"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "result"
+    type_attr: "out_type"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
 }
 op {
-  name: "Prod"
+  name: "QuantizedBiasAdd"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "T1"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "T"
+    name: "T2"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "PyFunc"
+  name: "QuantizedBiasAdd"
   input_arg {
     name: "input"
-    type_list_attr: "Tin"
+    type_attr: "T1"
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
   }
-  attr {
-    name: "token"
-    type: "string"
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "PyFuncStateless"
   input_arg {
-    name: "input"
-    type_list_attr: "Tin"
+    name: "max_bias"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "token"
-    type: "string"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
-  }
-}
-op {
-  name: "Qr"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "q"
-    type_attr: "T"
+    name: "min_out"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "r"
-    type_attr: "T"
-  }
-  attr {
-    name: "full_matrices"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "T1"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizedConcat"
   input_arg {
-    name: "input"
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
     type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  deprecation {
-    version: 21
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    type_attr: "Tinput"
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  deprecation {
-    version: 22
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "QuantizeAndDequantize"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "input_min"
-    type: "float"
+    name: "out_type"
+    type: "type"
     default_value {
-      f: 0
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  deprecation {
-    version: 22
-  }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "input_min"
-    type: "float"
+    name: "out_type"
+    type: "type"
     default_value {
-      f: 0
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  deprecation {
-    version: 22
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
   }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantizeV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_min"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_max"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "signed_input"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: true
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedConv2DAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedConv2DAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: "HALF_TO_EVEN"
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
-    allowed_values {
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        s: "HALF_TO_EVEN"
-        s: "HALF_UP"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedConv2DAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedConv2DAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantizeV3"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_min"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_max"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num_bits"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: true
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "QuantizedConv2DAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
@@ -43280,11 +48119,11 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -43294,38 +48133,100 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "QuantizedConv2DAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
@@ -43333,11 +48234,11 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -43353,9 +48254,25 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -43366,209 +48283,315 @@ op {
       }
     }
   }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2DWithBias"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
-}
-op {
-  name: "QuantizeV2"
+}
+op {
+  name: "QuantizedConv2DWithBias"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
   input_arg {
-    name: "input"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
+    name: "Tfilter"
+    type: "type"
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeV2"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min_range"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
-  }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: "HALF_AWAY_FROM_ZERO"
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
-    allowed_values {
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2DWithBiasAndRelu"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -43581,99 +48604,131 @@ op {
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "round_mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "HALF_AWAY_FROM_ZERO"
-    }
     allowed_values {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DWithBiasAndRelu"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "Toutput"
+    name: "out_type"
     type: "type"
     default_value {
       type: DT_QINT32
@@ -43682,54 +48737,99 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -43742,7 +48842,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -43755,10 +48855,20 @@ op {
     }
   }
   attr {
-    name: "Toutput"
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -43770,13 +48880,46 @@ op {
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -43786,9 +48929,25 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -43799,21 +48958,56 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
     name: "strides"
@@ -43829,12 +49023,40 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DWithBiasAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -43844,9 +49066,25 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -43857,7 +49095,7 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -43870,8 +49108,43 @@ op {
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
     name: "strides"
@@ -43887,79 +49160,67 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRequantize"
   input_arg {
-    name: "m"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
-    name: "v_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "gamma_min"
+    name: "min_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -43969,8 +49230,31 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
         type: DT_QINT32
       }
     }
@@ -43978,97 +49262,114 @@ op {
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
   input_arg {
-    name: "t"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "m"
-    type_attr: "Tinput"
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
-    name: "m_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "m_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "v_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
+    name: "summand"
+    type_attr: "Tsummand"
   }
   input_arg {
-    name: "gamma_min"
+    name: "min_summand"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_summand"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -44084,9 +49385,48 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -44098,23 +49438,45 @@ op {
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -44125,11 +49487,31 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
     type: DT_FLOAT
   }
   output_arg {
@@ -44137,62 +49519,126 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
         type: DT_QINT32
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tsummand"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasSumAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type: DT_FLOAT
   }
   input_arg {
     name: "min_input"
@@ -44203,11 +49649,15 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
     type: DT_FLOAT
   }
   output_arg {
@@ -44215,15 +49665,15 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -44236,7 +49686,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -44251,6 +49701,9 @@ op {
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -44261,53 +49714,35 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizedConcat"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndRelu"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -44316,6 +49751,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -44332,6 +49771,10 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -44351,9 +49794,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -44364,9 +49807,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -44380,9 +49823,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -44400,9 +49843,29 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -44411,6 +49874,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -44427,6 +49894,26 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -44446,9 +49933,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -44459,25 +49946,48 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
         type: DT_QINT32
       }
     }
   }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -44509,7 +50019,7 @@ op {
   }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -44518,6 +50028,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -44534,6 +50048,26 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -44572,11 +50106,34 @@ op {
       }
     }
   }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -44614,6 +50171,14 @@ op {
       }
     }
   }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "QuantizedInstanceNorm"
@@ -45708,6 +51273,62 @@ op {
     }
   }
 }
+op {
+  name: "QuantizedResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "QueueClose"
   input_arg {
@@ -47757,6 +53378,78 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -47809,6 +53502,7 @@ op {
       b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "ReduceJoin"
@@ -48809,6 +54503,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -48931,6 +54668,73 @@ op {
     }
   }
 }
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
 op {
   name: "Reshape"
   input_arg {
@@ -49117,6 +54921,52 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeBicubicGrad"
   input_arg {
@@ -49149,6 +54999,45 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBicubicGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeBilinear"
   input_arg {
@@ -49266,6 +55155,53 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeBilinearGrad"
   input_arg {
@@ -49333,6 +55269,47 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeNearestNeighbor"
   input_arg {
@@ -49410,6 +55387,88 @@ op {
     }
   }
 }
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighborGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeNearestNeighborGrad"
   input_arg {
@@ -49445,6 +55504,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResourceApplyAdaMax"
@@ -53286,6 +59352,50 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceGather"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "batch_dims"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterAdd"
   input_arg {
@@ -57176,341 +63286,1025 @@ op {
     }
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "use_locking"
-    type: "bool"
+    name: "table_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
+  output_arg {
     name: "ms"
-    type: DT_RESOURCE
+    type: DT_FLOAT
   }
-  input_arg {
+  output_arg {
     name: "mom"
-    type: DT_RESOURCE
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "use_locking"
-    type: "bool"
+    name: "table_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "begin_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "end_mask"
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "ellipsis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "new_axis_mask"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "shrink_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "ms"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
-}
-op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -58766,53 +65560,101 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "ScalarSummary"
+  name: "ScaleAndTranslate"
   input_arg {
-    name: "tags"
-    type: DT_STRING
+    name: "images"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "resized_images"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
+        type: DT_INT8
         type: DT_UINT8
         type: DT_INT16
-        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
 }
 op {
   name: "ScaleAndTranslate"
@@ -58861,6 +65703,13 @@ op {
       s: "lanczos3"
     }
   }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ScaleAndTranslateGrad"
@@ -58901,6 +65750,52 @@ op {
     }
   }
 }
+op {
+  name: "ScaleAndTranslateGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -62283,6 +69178,31 @@ op {
     version: 11
   }
 }
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
 op {
   name: "SelfAdjointEigV2"
   input_arg {
@@ -62349,6 +69269,41 @@ op {
     }
   }
 }
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Selu"
   input_arg {
@@ -62447,6 +69402,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -62697,6 +69684,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -62883,6 +69901,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -73469,58 +80491,170 @@ op {
   is_stateful: true
 }
 op {
-  name: "StatefulPartitionedCall"
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormal"
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
+    type_attr: "dtype"
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
   }
   attr {
-    name: "config"
-    type: "string"
+    name: "shape_dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
     }
   }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
   attr {
-    name: "config_proto"
-    type: "string"
+    name: "dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_FLOAT
     }
   }
   attr {
-    name: "executor_type"
-    type: "string"
+    name: "shape_dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
     }
   }
   is_stateful: true
 }
 op {
-  name: "StatefulStandardNormal"
+  name: "StatefulUniformFullInt"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
   input_arg {
     name: "shape"
     type_attr: "shape_dtype"
@@ -73533,15 +80667,7 @@ op {
     name: "dtype"
     type: "type"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      type: DT_UINT64
     }
   }
   attr {
@@ -73550,11 +80676,47 @@ op {
     default_value {
       type: DT_INT64
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulUniformInt"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
     }
   }
   is_stateful: true
@@ -75286,6 +82448,52 @@ op {
     }
   }
 }
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Switch"
   input_arg {
@@ -75387,66 +82595,525 @@ op {
   is_stateful: true
 }
 op {
-  name: "TFRecordReader"
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
   output_arg {
-    name: "reader_handle"
+    name: "output"
     type: DT_STRING
-    is_ref: true
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
     default_value {
-      s: ""
+      i: 1
     }
   }
   attr {
-    name: "shared_name"
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "compression_type"
-    type: "string"
+    name: "use_tpu"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
-  deprecation {
-    version: 26
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
   }
   is_stateful: true
 }
 op {
-  name: "TFRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
     type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "container"
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "compression_type"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "TakeDataset"
@@ -77244,6 +84911,43 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcatV2"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "leading_dims"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
@@ -77304,6 +85008,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -77323,6 +85031,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -77349,6 +85061,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -77478,6 +85194,66 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListSetItem"
   input_arg {
@@ -77540,6 +85316,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -78432,6 +86212,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -80632,6 +88439,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/compat/ops_history.v2.pbtxt b/tensorflow/core/ops/compat/ops_history.v2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97d1520b7adb99598535585ccd56f6cac357b5d9
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history.v2.pbtxt
@@ -0,0 +1,88212 @@
+op {
+  name: "Abort"
+  attr {
+    name: "error_msg"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "exit_without_error"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorNumAccumulated"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "num_accumulated"
+    type: DT_INT32
+  }
+}
+op {
+  name: "AccumulatorSetGlobalStep"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "new_global_step"
+    type: DT_INT64
+  }
+}
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "AddManySparseToTensorsMap"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_handles"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddSparseToTensorsMap"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_handle"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AdjustContrast"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "contrast_factor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_value"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_value"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 2
+  }
+}
+op {
+  name: "AdjustContrastv2"
+  input_arg {
+    name: "images"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "contrast_factor"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "AdjustContrastv2"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "contrast_factor"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "AdjustHue"
+  input_arg {
+    name: "images"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "delta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "AdjustHue"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "AdjustSaturation"
+  input_arg {
+    name: "images"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "AdjustSaturation"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "All"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "AllCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "AllCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
+op {
+  name: "Angle"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AnonymousIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Any"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ApplyAdaMax"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApproximateEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "ApproximateEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "ApproximateEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "ApproximateEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ArgMin"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "AsString"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+        type: DT_INT8
+      }
+    }
+  }
+  attr {
+    name: "precision"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "scientific"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "shortest"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "AsString"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "precision"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "scientific"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "shortest"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "AsString"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "precision"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "scientific"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "shortest"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Assert"
+  input_arg {
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Assign"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "validate_shape"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "AssignAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignAddVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "AssignSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignSubVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "AssignVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "AudioSpectrogram"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "window_size"
+    type: "int"
+  }
+  attr {
+    name: "stride"
+    type: "int"
+  }
+  attr {
+    name: "magnitude_squared"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AudioSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "sample_rate"
+    type: "float"
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "AudioSummaryV2"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Barrier"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BarrierClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BarrierIncompleteSize"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "BarrierInsertMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "component_index"
+    type: "int"
+  }
+}
+op {
+  name: "BarrierReadySize"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "BarrierTakeMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "allow_small_batch"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "wait_for_incomplete"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BatchCholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchCholeskyGrad"
+  input_arg {
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BatchFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchFunction"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "captured_tensors"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "out_tensors"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BatchIFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BatchMatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalizationGrad"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalizationGrad"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalizationGrad"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchNormWithGlobalNormalizationGrad"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
+}
+op {
+  name: "BatchSelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "BatchSelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchSvd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "crops"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "BatchToSpaceND"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "crops"
+    type_attr: "Tcrops"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tcrops"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "BesselI0e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "BesselI1e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Betainc"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddGrad"
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddGrad"
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddGrad"
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddGrad"
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Bincount"
+  input_arg {
+    name: "arr"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bins"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "Bitcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseXor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseXor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BoostedTreesBucketize"
+  input_arg {
+    name: "float_values"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "buckets"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "BoostedTreesCalculateBestGainsPerFeature"
+  input_arg {
+    name: "node_id_range"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "tree_complexity"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_node_weight"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesCenterBias"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mean_gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mean_hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l1"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "l2"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "continue_centering"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesCreateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesCreateQuantileStreamResource"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "epsilon"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_streams"
+    type: DT_INT64
+  }
+  attr {
+    name: "max_elements"
+    type: "int"
+    default_value {
+      i: 1099511627776
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesExampleDebugOutputs"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "examples_debug_outputs_serialized"
+    type: DT_STRING
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesGetEnsembleStates"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "last_layer_nodes_range"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesMakeQuantileSummaries"
+  input_arg {
+    name: "float_values"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "epsilon"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "summaries"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "BoostedTreesMakeStatsSummary"
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "stats_summary"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesQuantileStreamResourceAddSummaries"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "summaries"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesQuantileStreamResourceDeserialize"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_streams"
+  }
+  attr {
+    name: "num_streams"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesQuantileStreamResourceFlush"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "num_buckets"
+    type: DT_INT64
+  }
+  attr {
+    name: "generate_quantiles"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesQuantileStreamResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesUpdateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "max_depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "learning_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "BroadcastArgs"
+  input_arg {
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r0"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "BroadcastGradientArgs"
+  input_arg {
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r0"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r1"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "BroadcastTo"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Bucketize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
+}
+op {
+  name: "CTCBeamSearchDecoder"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "decoded_indices"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "beam_width"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "top_paths"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "CTCGreedyDecoder"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "decoded_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "CTCLoss"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "CTCLoss"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "CacheDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "CacheDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Cast"
+  input_arg {
+    name: "x"
+    type_attr: "SrcT"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "DstT"
+  }
+  attr {
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
+  }
+}
+op {
+  name: "Cast"
+  input_arg {
+    name: "x"
+    type_attr: "SrcT"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "DstT"
+  }
+  attr {
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
+  }
+  attr {
+    name: "Truncate"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "CholeskyGrad"
+  input_arg {
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "CholeskyGrad"
+  input_arg {
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ClipByValue"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "clip_value_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "CloseSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  attr {
+    name: "wait_for"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CombinedNonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size_per_class"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "max_total_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_scores"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_classes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "valid_detections"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_per_class"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "CompareAndBitpack"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "threshold"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Complex"
+  input_arg {
+    name: "real"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "imag"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ComplexAbs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ComputeAccidentalHits"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Concat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ConcatOffset"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  output_arg {
+    name: "offset"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "ConcatV2"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reduction_type"
+    type: "string"
+    default_value {
+      s: "MEAN"
+    }
+    allowed_values {
+      list {
+        s: "MEAN"
+        s: "SUM"
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Conj"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Conj"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "ConjugateTranspose"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tperm"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Const"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "ConsumeMutexLock"
+  input_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ControlTrigger"
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv2DBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropFilterV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type_attr: "Tshape"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Copy"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Copy"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "CopyHost"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "CopyHost"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "CountUpTo"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "limit"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "CreateSummaryDbWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "db_uri"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "experiment_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "run_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "user_name"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CreateSummaryFileWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "logdir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_queue"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flush_millis"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filename_suffix"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+        s: "nearest"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+        s: "nearest"
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
+op {
+  name: "CudnnRNN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackprop"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackpropV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNBackpropV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_c_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  output_arg {
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNCanonicalToParams"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  input_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsSize"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "params_size"
+    type_attr: "S"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsToCanonical"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "DataFormatDimMap"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
+}
+op {
+  name: "DataFormatVecPermute"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
+}
+op {
+  name: "DatasetToGraph"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "graph"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "DebugGradientIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugGradientRefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DecodeAndCropJpeg"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "crop_window"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ratio"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "fancy_upscaling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "try_recover_truncated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "acceptable_fraction"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "dct_method"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeBase64"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DecodeBmp"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "DecodeCompressed"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeGif"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+}
+op {
+  name: "DecodeJSONExample"
+  input_arg {
+    name: "json_examples"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "binary_examples"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DecodeJpeg"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ratio"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "fancy_upscaling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "try_recover_truncated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "acceptable_fraction"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "dct_method"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodePng"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeWav"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "audio"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  attr {
+    name: "desired_channels"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "desired_samples"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "DeleteSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DeleteSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "DenseToDenseSetOperation"
+  input_arg {
+    name: "set1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "DenseToSparseSetOperation"
+  input_arg {
+    name: "set1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set2_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "DeserializeIterator"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "DeserializeManySparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type_attr: "Tserialized"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tserialized"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "DestroyResourceOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "ignore_lookup_error"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "DestroyTemporaryVariable"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "var_name"
+    type: "string"
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DivNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "DrawBoundingBoxes"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "DynamicPartition"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "partitions"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_partitions"
+  }
+  attr {
+    name: "num_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "DynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "EagerPyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "EditDistance"
+  input_arg {
+    name: "hypothesis_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "hypothesis_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "hypothesis_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "truth_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "truth_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "truth_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "normalize"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EmptyTensorList"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "EncodeBase64"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "pad"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "EncodeJpeg"
+  input_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  attr {
+    name: "format"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    allowed_values {
+      list {
+        s: ""
+        s: "grayscale"
+        s: "rgb"
+      }
+    }
+  }
+  attr {
+    name: "quality"
+    type: "int"
+    default_value {
+      i: 95
+    }
+  }
+  attr {
+    name: "progressive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "optimize_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "chroma_downsampling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "density_unit"
+    type: "string"
+    default_value {
+      s: "in"
+    }
+    allowed_values {
+      list {
+        s: "in"
+        s: "cm"
+      }
+    }
+  }
+  attr {
+    name: "x_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "y_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "xmp_metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "EncodePng"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "EncodeWav"
+  input_arg {
+    name: "audio"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
+op {
+  name: "EnsureShape"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Enter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "frame_name"
+    type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+}
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Exit"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ExpandDims"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dim"
+    type_attr: "Tdim"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tdim"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ExperimentalAssertNextDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "transformations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalAutoShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalCSVDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalDirectedInterleaveDataset"
+  input_arg {
+    name: "selector_input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalGroupByReducerDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIdentityIndexedDataset"
+  input_arg {
+    name: "size"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIndexedDatasetGet"
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIndexedDatasetMaterialize"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIteratorGetDevice"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "device"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaterializedIndexDatasetHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalPrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalSleepDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "sleep_microseconds"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "thread_pool"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalThreadPoolHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "num_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_intra_op_parallelism"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "display_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalUniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ExtractGlimpse"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExtractGlimpse"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "noise"
+    type: "string"
+    default_value {
+      s: "uniform"
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractJpegShape"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image_shape"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ExtractVolumePatches"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "FFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FFT"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FFT2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FFT3D"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Fact"
+  output_arg {
+    name: "fact"
+    type: DT_STRING
+  }
+}
+op {
+  name: "FakeParam"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQueue"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  is_stateful: true
+}
+op {
+  name: "Fill"
+  input_arg {
+    name: "dims"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Fill"
+  input_arg {
+    name: "dims"
+    type_attr: "index_type"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "index_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FilterByLastComponentDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "FixedLengthRecordDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "encoding"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FlushSummaryWriter"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
+op {
+  name: "FractionalAvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  attr {
+    name: "pooling_ratio"
+    type: "list(float)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "pseudo_random"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "deterministic"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalAvgPoolGrad"
+  input_arg {
+    name: "orig_input_tensor_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalMaxPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  attr {
+    name: "pooling_ratio"
+    type: "list(float)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "pseudo_random"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "deterministic"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalMaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FusedBatchNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGrad"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGrad"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGradV2"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGradV2"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGradV2"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "FusedPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "FusedResizeAndPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "resize_align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "FusedResizeAndPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "resize_align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Gather"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GatherNd"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GatherV2"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GenerateVocabRemapping"
+  input_arg {
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
+  }
+  attr {
+    name: "new_vocab_offset"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "GenerateVocabRemapping"
+  input_arg {
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
+  }
+  attr {
+    name: "new_vocab_offset"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "old_vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+}
+op {
+  name: "GeneratorDataset"
+  input_arg {
+    name: "init_func_other_args"
+    type_list_attr: "Tinit_func_args"
+  }
+  input_arg {
+    name: "next_func_other_args"
+    type_list_attr: "Tnext_func_args"
+  }
+  input_arg {
+    name: "finalize_func_other_args"
+    type_list_attr: "Tfinalize_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "next_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tinit_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tnext_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "GetSessionHandleV2"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "GetSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GuaranteeConst"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "HashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HistogramFixedWidth"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "value_range"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "nbins"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "out"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "HostConst"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "IFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IFFT"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "IFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IFFT2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "IFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IFFT3D"
+  input_arg {
+    name: "input"
+    type_attr: "Tcomplex"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tcomplex"
+  }
+  attr {
+    name: "Tcomplex"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "IRFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "IRFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "IRFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "Identity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "IdentityN"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "IdentityReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+  is_stateful: true
+}
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Igamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IgammaGradA"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Igammac"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Imag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ImageSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
+      }
+    }
+  }
+}
+op {
+  name: "ImageSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
+      }
+    }
+  }
+}
+op {
+  name: "ImmutableConst"
+  output_arg {
+    name: "tensor"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "memory_region_name"
+    type: "string"
+  }
+}
+op {
+  name: "ImportEvent"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "event"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "InTopK"
+  input_arg {
+    name: "predictions"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "targets"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "precision"
+    type: DT_BOOL
+  }
+  attr {
+    name: "k"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "InTopKV2"
+  input_arg {
+    name: "predictions"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "targets"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "precision"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTable"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+}
+op {
+  name: "InitializeTableFromTextFile"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+}
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "InvertPermutation"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "IsBoostedTreesQuantileStreamResourceInitialized"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsVariableInitialized"
+  input_arg {
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Iterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandleV2"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNext"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNextAsOptional"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNextSync"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorToStringHandle"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "KMC2ChainInitialization"
+  input_arg {
+    name: "distances"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "KmeansPlusPlusInitialization"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_to_sample"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_retries_per_sample"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "samples"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LMDBReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "LRNGrad"
+  input_arg {
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LRNGrad"
+  input_arg {
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "LeakyRelu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyRelu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LearnedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "LearnedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LeftShift"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "stop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "stop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ListDiff"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "LoadAndRemapMatrix"
+  input_arg {
+    name: "ckpt_path"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "row_remapping"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_remapping"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "initializing_values"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_matrix"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_rows"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cols"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "max_rows_in_memory"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingADAMParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdadeltaParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingFTRLParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "benefits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingMomentumParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LogicalAnd"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LogicalOr"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableFind"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableImport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LowerBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 27
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 27
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Merge"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeSummary"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Mfcc"
+  input_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
+  }
+  attr {
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+  }
+  attr {
+    name: "filterbank_channel_count"
+    type: "int"
+    default_value {
+      i: 40
+    }
+  }
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MirrorPad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "MirrorPadGrad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ModelDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MulNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MultiDeviceIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "devices"
+    type: "list(string)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MultiDeviceIteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "MultiDeviceIteratorGetNextFromShard"
+  input_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shard_num"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "incarnation_id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MultiDeviceIteratorInit"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "max_buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "incarnation_id"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "MultiDeviceIteratorToStringHandle"
+  input_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  input_arg {
+    name: "deleted_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexLock"
+  input_arg {
+    name: "mutex"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclAllReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclBroadcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    number_attr: "num_devices"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "NearestNeighbors"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "centers"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "NegTrain"
+  input_arg {
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "labels"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "lr"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "vocab_count"
+    type: "list(int)"
+  }
+  attr {
+    name: "num_negative_samples"
+    type: "int"
+  }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
+}
+op {
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "NextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "NoOp"
+}
+op {
+  name: "NonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "iou_threshold"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scores"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV3"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
+op {
+  name: "NonMaxSuppressionV3"
+  input_arg {
+    name: "boxes"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scores"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV4"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_to_max_output_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV4"
+  input_arg {
+    name: "boxes"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scores"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "pad_to_max_output_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionWithOverlaps"
+  input_arg {
+    name: "overlaps"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "overlap_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "OneHot"
+  input_arg {
+    name: "indices"
+    type_attr: "TI"
+  }
+  input_arg {
+    name: "depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "on_value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "off_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalFromValue"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalGetValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalHasValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "has_value"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "OptionalNone"
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+}
+op {
+  name: "OrderedMapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Pack"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Pad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddingFIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  output_arg {
+    name: "feature_list_dense_lengths"
+    type: DT_INT64
+    number_attr: "Nfeature_list_dense"
+  }
+  attr {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "num_sparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+    allowed_values {
+      list {
+        s: "stdout"
+        s: "stderr"
+        s: "log(info)"
+        s: "log(warning)"
+        s: "log(error)"
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 21
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+}
+op {
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+}
+op {
+  name: "QuantizedBiasAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedBiasAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConcat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBias"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBias"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndRelu"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedInstanceNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "x_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "output_range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "given_y_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "given_y_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  attr {
+    name: "min_separation"
+    type: "float"
+    default_value {
+      f: 0.001
+    }
+  }
+}
+op {
+  name: "QuantizedInstanceNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "x_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "output_range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "given_y_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "given_y_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  attr {
+    name: "min_separation"
+    type: "float"
+    default_value {
+      f: 0.001
+    }
+  }
+}
+op {
+  name: "QuantizedMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_b"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_b"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Tactivation"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_b"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_b"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Tactivation"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMul"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedMul"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedRelu"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedRelu"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedRelu6"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedRelu6"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedReluX"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "max_value"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedReluX"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "max_value"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedReshape"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "QuantizedResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "QueueClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "QueueCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeue"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueManyV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeueUpTo"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueUpToV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeueV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueEnqueue"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueEnqueueMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueEnqueueManyV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueEnqueueV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueIsClosed"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "QueueIsClosedV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueSize"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "QueueSizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "RFFT"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RGBToHSV"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RGBToHSV"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RaggedGather"
+  input_arg {
+    name: "params_nested_splits"
+    type: DT_INT64
+    number_attr: "PARAMS_RAGGED_RANK"
+  }
+  input_arg {
+    name: "params_dense_values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output_nested_splits"
+    type: DT_INT64
+    number_attr: "OUTPUT_RAGGED_RANK"
+  }
+  output_arg {
+    name: "output_dense_values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "PARAMS_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "OUTPUT_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "RaggedRange"
+  input_arg {
+    name: "starts"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "limits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "deltas"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "RaggedTensorToSparse"
+  input_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+    number_attr: "RAGGED_RANK"
+  }
+  input_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_dense_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "RandomCrop"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  deprecation {
+    version: 8
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomGamma"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomGammaGrad"
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sample"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 25
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomPoissonV2"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "R"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "R"
+    type: "type"
+    default_value {
+      type: DT_DOUBLE
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffleQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "min_after_dequeue"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffleQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "min_after_dequeue"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomStandardNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomStandardNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "Tout"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Range"
+  input_arg {
+    name: "start"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Range"
+  input_arg {
+    name: "start"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "RangeDataset"
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stop"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Rank"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ReadFile"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReadVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderNumRecordsProduced"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "records_produced"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ReaderNumRecordsProducedV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "records_produced"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderNumWorkUnitsCompleted"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "units_completed"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ReaderNumWorkUnitsCompletedV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "units_completed"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderRead"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "key"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderReadUpTo"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_records"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderReadUpToV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "num_records"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderReadV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "key"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderReset"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+}
+op {
+  name: "ReaderResetV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderRestoreState"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "state"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderRestoreStateV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "state"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderSerializeState"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "state"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderSerializeStateV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "state"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "Real"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  attr {
+    name: "file_pattern"
+    type: "string"
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
+    default_value {
+      i: 301
+    }
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "file_buffer_size"
+    type: "int"
+    default_value {
+      i: 10000
+    }
+  }
+  attr {
+    name: "file_parallelism"
+    type: "int"
+    default_value {
+      i: 16
+    }
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+    default_value {
+      i: 32
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  attr {
+    name: "file_pattern"
+    type: "string"
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
+    default_value {
+      i: 301
+    }
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "file_buffer_size"
+    type: "int"
+    default_value {
+      i: 10000
+    }
+  }
+  attr {
+    name: "file_parallelism"
+    type: "int"
+    default_value {
+      i: 16
+    }
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+    default_value {
+      i: 32
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ReduceJoin"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "reduction_indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "separator"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "RefEnter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "frame_name"
+    type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+}
+op {
+  name: "RefExit"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "RefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "RefMerge"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RefNextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "RefSelect"
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RefSwitch"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "pred"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output_false"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output_true"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "RegexFullMatch"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "RegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "rewrite"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_QINT8
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "RemoteCall"
+  input_arg {
+    name: "target"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "RemoteCall"
+  input_arg {
+    name: "target"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  is_stateful: true
+}
+op {
+  name: "RemoteFusedGraphExecute"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "Toutputs"
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Toutputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "serialized_remote_fused_graph_execute_info"
+    type: "string"
+  }
+}
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RequantizationRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "RequantizationRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
+op {
+  name: "Requantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "Requantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "Reshape"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ResizeArea"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeArea"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubicGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubicGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighborGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighborGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResourceApplyAdaMax"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdamWithAmsgrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyPowerSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyPowerSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyPowerSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceCountUpTo"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "limit"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceGather"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterAdd"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterDiv"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMax"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMin"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMul"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdUpdate"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "Reverse"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reverse"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "Reverse"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseSequence"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seq_lengths"
+    type_attr: "Tlen"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seq_dim"
+    type: "int"
+  }
+  attr {
+    name: "batch_dim"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tlen"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "RightShift"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Roll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shift"
+    type_attr: "Tshift"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshift"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SampleDistortedBoundingBox"
+  input_arg {
+    name: "image_size"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bounding_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "size"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bboxes"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "min_object_covered"
+    type: "float"
+    default_value {
+      f: 0.1
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SampleDistortedBoundingBoxV2"
+  input_arg {
+    name: "image_size"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bounding_boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_object_covered"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "size"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "bboxes"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Save"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Save"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SaveSlices"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shapes_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SaveSlices"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shapes_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SaveV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SaveV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslate"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslate"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslateGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslateGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ScatterAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterDiv"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMax"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMin"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMul"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMul"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMul"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterMul"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNd"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ScatterNdSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterNdUpdate"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ScatterSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterUpdate"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "SdcaFprint"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+}
+op {
+  name: "SdcaOptimizer"
+  input_arg {
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "example_labels"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  attr {
+    name: "loss_type"
+    type: "string"
+    allowed_values {
+      list {
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
+      }
+    }
+  }
+  attr {
+    name: "adaptative"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SdcaOptimizer"
+  input_arg {
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "example_labels"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  attr {
+    name: "loss_type"
+    type: "string"
+    allowed_values {
+      list {
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
+        s: "poisson_loss"
+      }
+    }
+  }
+  attr {
+    name: "adaptative"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SdcaOptimizerV2"
+  input_arg {
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "example_labels"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  attr {
+    name: "loss_type"
+    type: "string"
+    allowed_values {
+      list {
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
+        s: "poisson_loss"
+      }
+    }
+  }
+  attr {
+    name: "adaptive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SdcaShrinkL1"
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
+    number_attr: "num_features"
+    is_ref: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+}
+op {
+  name: "SegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Select"
+  input_arg {
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Selu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Selu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SeluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SeluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "SerializeIterator"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "SerializeSparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SerializeSparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "SerializeTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SetSize"
+  input_arg {
+    name: "set_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "Shape"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ShapeN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+    number_attr: "N"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShardedFilename"
+  input_arg {
+    name: "basename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shard"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ShardedFilespec"
+  input_arg {
+    name: "basename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ShuffleAndRepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Size"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Skipgram"
+  output_arg {
+    name: "vocab_word"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "vocab_freq"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "words_per_epoch"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "current_epoch"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "total_words_processed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "labels"
+    type: DT_INT32
+  }
+  attr {
+    name: "filename"
+    type: "string"
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+  }
+  attr {
+    name: "window_size"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "min_count"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "subsample"
+    type: "float"
+    default_value {
+      f: 0.001
+    }
+  }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
+}
+op {
+  name: "Slice"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "size"
+    type_attr: "Index"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Snapshot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Softmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "softmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Softmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "softmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Softplus"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Softplus"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Softplus"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Softplus"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Softplus"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SoftplusGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SoftplusGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SoftplusGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SoftplusGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SoftplusGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Softsign"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Softsign"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Softsign"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Softsign"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Softsign"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SoftsignGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SoftsignGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SoftsignGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SoftsignGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SoftsignGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SpaceToBatch"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "SpaceToBatchND"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SpaceToDepth"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "SpaceToDepth"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseAdd"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
+  output_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Treal"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseAdd"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
+  output_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Treal"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseAdd"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
+  output_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Treal"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseAdd"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
+  output_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Treal"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseAddGrad"
+  input_arg {
+    name: "backprop_val_grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "b_val_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseAddGrad"
+  input_arg {
+    name: "backprop_val_grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "b_val_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseAddGrad"
+  input_arg {
+    name: "backprop_val_grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "b_val_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseAddGrad"
+  input_arg {
+    name: "backprop_val_grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "b_val_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrl"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyRMSProp"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseConcat"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "concat_dim"
+    type: "int"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reduction_type"
+    type: "string"
+    default_value {
+      s: "MEAN"
+    }
+    allowed_values {
+      list {
+        s: "MEAN"
+        s: "SUM"
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseCross"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "sparse_types"
+  }
+  input_arg {
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "dense_inputs"
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hashed_output"
+    type: "bool"
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hash_key"
+    type: "int"
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseDiv"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseDiv"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseDiv"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseDiv"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseMul"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseMul"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseMul"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseMul"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseFillEmptyRows"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dense_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "empty_row_indicator"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "reverse_index_map"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseFillEmptyRowsGrad"
+  input_arg {
+    name: "reverse_index_map"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_default_value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "Ta"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "Tb"
+  }
+  output_arg {
+    name: "product"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "a_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "b_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Ta"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tb"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSum"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSum"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSum"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSum"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseReorder"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseReshape"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "new_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+}
+op {
+  name: "SparseSegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentMeanGrad"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentMeanWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtN"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtNGrad"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtNWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSumWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSumWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSlice"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseSliceGrad"
+  input_arg {
+    name: "backprop_val_grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "val_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSoftmax"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SparseSoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "Tlabels"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tlabels"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "Tlabels"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tlabels"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSplit"
+  input_arg {
+    name: "split_dim"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+    number_attr: "num_split"
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+    number_attr: "num_split"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+    number_attr: "num_split"
+  }
+  attr {
+    name: "num_split"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseMatMul"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "adjoint_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adjoint_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseMatMul"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "adjoint_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adjoint_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseTensorSliceDataset"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "dense_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseToDense"
+  input_arg {
+    name: "sparse_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "output_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseToSparseSetOperation"
+  input_arg {
+    name: "set1_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set1_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set1_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set2_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set2_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "Split"
+  input_arg {
+    name: "split_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    number_attr: "num_split"
+  }
+  attr {
+    name: "num_split"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SplitV"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size_splits"
+    type_attr: "Tlen"
+  }
+  input_arg {
+    name: "split_dim"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    number_attr: "num_split"
+  }
+  attr {
+    name: "num_split"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tlen"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Squeeze"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "squeeze_dims"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "Stack"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  attr {
+    name: "stack_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StackClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+}
+op {
+  name: "StackCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "StackPop"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "elem"
+    type_attr: "elem_type"
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+}
+op {
+  name: "StackPopV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "elem"
+    type_attr: "elem_type"
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "StackPush"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "elem"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "swap_memory"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "StackPushV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "elem"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "swap_memory"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StackV2"
+  input_arg {
+    name: "max_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "elem_type"
+    type: "type"
+  }
+  attr {
+    name: "stack_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Stage"
+  input_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Stage"
+  input_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StageClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StagePeek"
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StageSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulUniformFullInt"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulUniformInt"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatelessIf"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
+op {
+  name: "StatelessMultinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessWhile"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
+op {
+  name: "StaticRegexFullMatch"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+  attr {
+    name: "pattern"
+    type: "string"
+  }
+}
+op {
+  name: "StaticRegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "pattern"
+    type: "string"
+  }
+  attr {
+    name: "rewrite"
+    type: "string"
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "StopGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "StridedSlice"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "StridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "StridedSliceGrad"
+  input_arg {
+    name: "shape"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "StringFormat"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "template"
+    type: "string"
+    default_value {
+      s: "%s"
+    }
+  }
+  attr {
+    name: "placeholder"
+    type: "string"
+    default_value {
+      s: "%s"
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+}
+op {
+  name: "StringJoin"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "separator"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "unit"
+    type: "string"
+    default_value {
+      s: "BYTE"
+    }
+    allowed_values {
+      list {
+        s: "BYTE"
+        s: "UTF8_CHAR"
+      }
+    }
+  }
+}
+op {
+  name: "StringSplit"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "delimiter"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+}
+op {
+  name: "StringSplit"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "delimiter"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "skip_empty"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "StringSplitV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sep"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "maxsplit"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "StringStrip"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "StringToHashBucket"
+  input_arg {
+    name: "string_tensor"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "StringToHashBucketFast"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "StringToHashBucketStrong"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "key"
+    type: "list(int)"
+  }
+}
+op {
+  name: "StringToNumber"
+  input_arg {
+    name: "string_tensor"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+      }
+    }
+  }
+}
+op {
+  name: "StringToNumber"
+  input_arg {
+    name: "string_tensor"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Substr"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pos"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "len"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Substr"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "pos"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "len"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "unit"
+    type: "string"
+    default_value {
+      s: "BYTE"
+    }
+    allowed_values {
+      list {
+        s: "BYTE"
+        s: "UTF8_CHAR"
+      }
+    }
+  }
+}
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SummaryWriter"
+  output_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Switch"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "pred"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output_false"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_true"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SymbolicGradient"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TFRecordDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TakeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "TakeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "TakeManySparseFromTensorsMap"
+  input_arg {
+    name: "sparse_handles"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TemporaryVariable"
+  output_arg {
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "var_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArray"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 16
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TensorArrayCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayCloseV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayConcat"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape_except0"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayConcatV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape_except0"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "TensorArrayConcatV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape_except0"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGather"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayGatherV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "TensorArrayGatherV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayGatherV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGrad"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  deprecation {
+    version: 16
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGradV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGradV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGradV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayGradWithShape"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "shape_to_prepend"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayPack"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayRead"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayReadV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorArrayReadV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayReadV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayScatter"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 19
+  }
+}
+op {
+  name: "TensorArrayScatterV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TensorArrayScatterV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayScatterV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArraySize"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArraySizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "TensorArraySizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArraySizeV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArraySplit"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArraySplitV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TensorArraySplitV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArraySplitV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayUnpack"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 20
+  }
+}
+op {
+  name: "TensorArrayV2"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayV2"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayV3"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayV3"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "identical_element_shapes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorArrayWrite"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayWriteV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TensorArrayWriteV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayWriteV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListConcatV2"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "leading_dims"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListElementShape"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListFromTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListGather"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListGetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListLength"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "length"
+    type: DT_INT32
+  }
+}
+op {
+  name: "TensorListPopBack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListPushBack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListReserve"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListScatter"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListSetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListStack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "num_elements"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "TensorScatterAdd"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterSub"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterUpdate"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorSliceDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorSummary"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "description"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "labels"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "display_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "TensorSummaryV2"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "serialized_summary_metadata"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TextLineDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "TextLineReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "skip_header_lines"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TextLineReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "skip_header_lines"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "TextLineReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "skip_header_lines"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ThreadUnsafeUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "ThreadUnsafeUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Tile"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "multiples"
+    type_attr: "Tmultiples"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tmultiples"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TileGrad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "multiples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 3
+  }
+}
+op {
+  name: "Timestamp"
+  output_arg {
+    name: "ts"
+    type: DT_DOUBLE
+  }
+  is_stateful: true
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Transpose"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tperm"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "TruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Unbatch"
+  input_arg {
+    name: "batched_tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "unbatched_tensor"
+    type_attr: "T"
+  }
+  attr {
+    name: "timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "UnbatchGrad"
+  input_arg {
+    name: "original_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "batched_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "UnicodeDecode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeEncode"
+  input_arg {
+    name: "input_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "ignore"
+        s: "replace"
+        s: "strict"
+      }
+    }
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+}
+op {
+  name: "UnicodeScript"
+  input_arg {
+    name: "input"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
+  name: "UnicodeTranscode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "UniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Unique"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UniqueWithCounts"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  output_arg {
+    name: "count"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UniqueWithCountsV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  output_arg {
+    name: "count"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Unpack"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    number_attr: "num"
+  }
+  attr {
+    name: "num"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "UnravelIndex"
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMin"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentProd"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Unstage"
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Unstage"
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
+op {
+  name: "UpperBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "VarHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "VarIsInitializedOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "Variable"
+  output_arg {
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "VariableShape"
+  input_arg {
+    name: "input"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "VariableV2"
+  output_arg {
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WholeFileReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WholeFileReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
+op {
+  name: "WriteAudioSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteFile"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "WriteGraphSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteHistogramSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteImageSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bad_color"
+    type: DT_UINT8
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteScalarSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "WriteSummary"
+  input_arg {
+    name: "writer"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "summary_metadata"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Xdivy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Xlogy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ZerosLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Zeta"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ZipDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ZipDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc
index cd2e5c9d340d29c4836c89e7f4ab64d6a7595ec1..9b22ccdeeec0ac6f4d558afb0a09f9098e2c4513 100644
--- a/tensorflow/core/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops.cc
@@ -167,6 +167,7 @@ REGISTER_OP("CudnnRNNV3")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("is_training: bool = true")
+    .Attr("time_major: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       auto input_shape = c->input(0);
       auto input_h_shape = c->input(1);
@@ -292,6 +293,7 @@ REGISTER_OP("CudnnRNNBackpropV3")
     .Attr("dropout: float = 0.0")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
+    .Attr("time_major: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       auto input_shape = c->input(0);
       auto input_h_shape = c->input(1);
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 1c117166de029d40b84bbd2335b9315cdc53bcba..89c1204cf0eb3597e42aef4b892b8de46a037c08 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 // to a stateful "iterator" by passing the "dataset" to the
 // "MakeIterator" op.
 //
-// TODO(b/65524810): DT_VARIANT tensors that represent "dataset" objects are
+// TODO(b/123753214): DT_VARIANT tensors that represent "dataset" objects are
 // not presently serializable. To avoid issues with constant folding, ensure
 // that any "source dataset" ops (i.e. ops that output a dataset and do not
 // take one as input) are marked "stateful".
@@ -37,7 +37,7 @@ REGISTER_OP("TensorDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
                                                 // `components` have shapes
@@ -49,7 +49,7 @@ REGISTER_OP("TensorSliceDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that the
                                                 // dim-0 slices of `components`
@@ -62,7 +62,7 @@ REGISTER_OP("SparseTensorSliceDataset")
     .Input("dense_shape: int64")
     .Output("handle: variant")
     .Attr("Tvalues: type")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -79,7 +79,7 @@ REGISTER_OP("GeneratorDataset")
     .Attr("Tfinalize_func_args: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -275,6 +275,22 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ShardDataset")
+    .Input("input_dataset: variant")
+    .Input("num_shards: int64")
+    .Input("index: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // num_shards should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // index should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -322,7 +338,7 @@ REGISTER_OP("RangeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -388,7 +404,7 @@ REGISTER_OP("TextLineDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -408,7 +424,7 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Input("footer_bytes: int64")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -431,7 +447,7 @@ REGISTER_OP("FixedLengthRecordDatasetV2")
     .Input("buffer_size: int64")
     .Input("compression_type: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -451,7 +467,7 @@ REGISTER_OP("TFRecordDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -538,13 +554,22 @@ REGISTER_OP("IteratorGetNextSync")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("DatasetToSingleElement")
     .Input("dataset: variant")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ReduceDataset")
     .Input("input_dataset: variant")
     .Input("initial_state: Tstate")
@@ -556,6 +581,7 @@ REGISTER_OP("ReduceDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("IteratorToStringHandle")
@@ -632,10 +658,13 @@ REGISTER_OP("IteratorGetNextAsOptional")
 REGISTER_OP("ModelDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
+    .Attr("cpu_budget: int = 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308749): Add a stateful version of MapDefun and use it when `f`
+// is stateful.
 REGISTER_OP("MapDefun")
     .Input("arguments: Targuments")
     .Input("captured_inputs: Tcaptured")
@@ -645,6 +674,7 @@ REGISTER_OP("MapDefun")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("f: func")
+    .Attr("max_intra_op_parallelism: int = 1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       std::vector<PartialTensorShape> output_shapes;
       TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 2735618c61ba503cd103d01e4f9d3e40b7791e10..04f40e6cc85be0c7ea88f3f7ee5cde6267a3a578 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -17,6 +17,15 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("ExperimentalAutoShardDataset")
+    .Input("input_dataset: variant")
+    .Input("num_workers: int64")
+    .Input("index: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalBytesProducedStatsDataset")
     .Input("input_dataset: variant")
     .Input("tag: string")
@@ -29,6 +38,20 @@ REGISTER_OP("ExperimentalBytesProducedStatsDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ChooseFastestBranchDataset")
+    .Input("input_dataset: variant")
+    .Input("ratio_numerator: int64")
+    .Input("ratio_denominator: int64")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("num_elements_per_branch: int >= 1")
+    .Attr("branches: list(func) >= 1")
+    .Attr("other_arguments_lengths: list(int) >= 1")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalCSVDataset")
     .Input("filenames: string")
     .Input("compression_type: string")
@@ -42,7 +65,7 @@ REGISTER_OP("ExperimentalCSVDataset")
     .Output("handle: variant")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -76,10 +99,15 @@ REGISTER_OP("ExperimentalDatasetCardinality")
     .Output("cardinality: int64")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ExperimentalDatasetToTFRecord")
     .Input("input_dataset: variant")
     .Input("filename: string")
     .Input("compression_type: string")
+    .SetIsStateful()
     .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
@@ -190,6 +218,14 @@ REGISTER_OP("ExperimentalMapAndBatchDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalRebatchDataset")
+    .Input("input_dataset: variant")
+    .Input("num_workers: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -205,7 +241,7 @@ REGISTER_OP("ExperimentalMapDataset")
 REGISTER_OP("ExperimentalMatchingFilesDataset")
     .Input("patterns: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -259,7 +295,7 @@ REGISTER_OP("ExperimentalRandomDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -330,7 +366,7 @@ REGISTER_OP("ExperimentalSqlDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -459,7 +495,7 @@ REGISTER_OP("ExperimentalLMDBDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -468,6 +504,8 @@ REGISTER_OP("ExperimentalChooseFastestDataset")
     .Output("handle: variant")
     .Attr("N: int >= 2")
     .Attr("num_experiments: int")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalIdentityIndexedDataset")
diff --git a/tensorflow/core/ops/fingerprint64_map_ops.cc b/tensorflow/core/ops/fingerprint64_map_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91b24b401787f154ce67e1c6c7aaaf2a9f65d475
--- /dev/null
+++ b/tensorflow/core/ops/fingerprint64_map_ops.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("Fingerprint64Map")
+    .Output("table_handle: resource")
+    .Attr("heterogeneous_key_dtype: type")
+    .Attr("table_value_dtype: type = DT_INT64")
+    .Attr("num_oov_buckets: int >= 1")
+    .Attr("offset: int >= 0 = 0")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index be440ed728129d6553b017fa537e0585d076c35d..4982ec6bd82e2bf221b56a9e75f00ce4f0763f15 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -132,6 +132,35 @@ REGISTER_OP("If")
       return Status::OK();
     });
 
+REGISTER_OP("Case")
+    .Input("branch_index: int32")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("branches: list(func) >= 1")
+    .Attr("output_shapes: list(shape) = []")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      // If `output_shapes` attr is set use that as the shapes of the outputs
+      // else return unknown shapes.
+      if (output_shapes.empty()) return shape_inference::UnknownShape(c);
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as num outputs (",
+            output_shapes.size(), " vs. ", c->num_outputs());
+      }
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        shape_inference::ShapeHandle output_shape_handle;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            output_shapes[i], &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    });
+
 // TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 0f1555f49cf8dabb8d5fef71d0b14737cc6bc48c..62d473847c0e16e512395368339154e4e16b2eda 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -131,6 +131,70 @@ Status NMSShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status CombinedNMSShapeFn(InferenceContext* c) {
+  // Get inputs and validate ranks
+  ShapeHandle boxes;
+  // boxes is a tensor of Dimensions [batch_size, num_anchors, q, 4]
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &boxes));
+  ShapeHandle scores;
+  // scores is a tensor of Dimensions [batch_size, num_anchors, num_classes]
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &scores));
+  ShapeHandle max_output_size_per_class;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size_per_class));
+  ShapeHandle max_total_size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &max_total_size));
+  ShapeHandle unused_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused_shape));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused_shape));
+
+  DimensionHandle unused;
+  // boxes[0] and scores[0] are both batch_size
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+  // boxes[1] and scores[1] are both num_anchors
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(boxes, 1), c->Dim(scores, 1), &unused));
+  // The boxes[3] is 4.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 3), 4, &unused));
+
+  DimensionHandle d = c->Dim(boxes, 2);
+  DimensionHandle class_dim = c->Dim(scores, 2);
+  if (c->ValueKnown(d) && c->ValueKnown(class_dim)) {
+    if (c->Value(d) != 1 && c->Value(d) != c->Value(class_dim)) {
+      return errors::InvalidArgument(
+          "third dimension of boxes must be either "
+          "1 or equal to the third dimension of scores");
+    }
+  }
+  DimensionHandle output_dim;
+  DimensionHandle batch_dim = c->Dim(boxes, 0);
+
+  TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(3, &output_dim));
+  if (c->ValueKnown(output_dim) && c->Value(output_dim) <= 0) {
+    return errors::InvalidArgument("max_total_size should be > 0 ");
+  }
+  DimensionHandle size_per_class;
+  TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(2, &size_per_class));
+
+  int64 output_size;
+  bool pad_per_class;
+  TF_RETURN_IF_ERROR(c->GetAttr("pad_per_class", &pad_per_class));
+  if (!pad_per_class) {
+    output_size = c->Value(output_dim);
+  } else {
+    if (c->ValueKnown(size_per_class) && c->Value(size_per_class) <= 0) {
+      return errors::InvalidArgument(
+          "max_output_size_per_class must be > 0 "
+          "if pad_per_class is set to true ");
+    }
+    output_size = std::min(c->Value(output_dim),
+                           c->Value(size_per_class) * c->Value(class_dim));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, output_size, 4}));
+  c->set_output(1, c->MakeShape({batch_dim, output_size}));
+  c->set_output(2, c->MakeShape({batch_dim, output_size}));
+  c->set_output(3, c->Vector(batch_dim));
+  return Status::OK();
+}
+
 }  // namespace
 
 // --------------------------------------------------------------------------
@@ -149,6 +213,7 @@ REGISTER_OP("ResizeBicubic")
     .Output("resized_images: float")
     .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
@@ -158,6 +223,7 @@ REGISTER_OP("ResizeBicubicGrad")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
@@ -172,6 +238,7 @@ REGISTER_OP("ResizeBilinear")
         "T: {int8, uint8, int16, uint16, int32, int64, bfloat16, half, "
         "float, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
@@ -185,6 +252,7 @@ REGISTER_OP("ScaleAndTranslate")
         "T: {int8, uint8, int16, uint16, int32, int64, bfloat16, half, "
         "float, double}")
     .Attr("kernel_type: string = 'lanczos3'")
+    .Attr("antialias: bool = true")
     .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
@@ -198,6 +266,7 @@ REGISTER_OP("QuantizedResizeBilinear")
     .Output("out_max: float")
     .Attr("T: {quint8, qint32, float}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(ResizeShapeFn(c));
       ShapeHandle min_shape;
@@ -216,6 +285,7 @@ REGISTER_OP("ResizeBilinearGrad")
     .Output("output: T")
     .Attr("T: {float, bfloat16, half, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
@@ -230,6 +300,7 @@ REGISTER_OP("ScaleAndTranslateGrad")
     .Output("output: T")
     .Attr("T: {float}")
     .Attr("kernel_type: string = 'lanczos3'")
+    .Attr("antialias: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
@@ -242,6 +313,7 @@ REGISTER_OP("ResizeNearestNeighbor")
     .Output("resized_images: T")
     .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
@@ -251,6 +323,7 @@ REGISTER_OP("ResizeNearestNeighborGrad")
     .Output("output: T")
     .Attr("T: {uint8, int8, int32, half, float, double}")
     .Attr("align_corners: bool = false")
+    .Attr("half_pixel_centers: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
@@ -606,6 +679,7 @@ REGISTER_OP("ExtractGlimpse")
     .Attr("centered: bool = true")
     .Attr("normalized: bool = true")
     .Attr("uniform_noise: bool = true")
+    .Attr("noise: string = 'uniform'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
@@ -618,6 +692,16 @@ REGISTER_OP("ExtractGlimpse")
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(offsets, 1), 2, &unused));
 
+      bool uniform_noise = false;
+      TF_RETURN_IF_ERROR(c->GetAttr("uniform_noise", &uniform_noise));
+      string noise;
+      TF_RETURN_IF_ERROR(c->GetAttr("noise", &noise));
+      if (uniform_noise && (!noise.empty() && noise != "uniform")) {
+        return errors::InvalidArgument(
+            "The uniform_noise and noise should not be specified at the same "
+            "time");
+      }
+
       return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */,
                                    c->Dim(input, 3));
     });
@@ -808,4 +892,18 @@ REGISTER_OP("NonMaxSuppressionWithOverlaps")
       return Status::OK();
     });
 
+REGISTER_OP("CombinedNonMaxSuppression")
+    .Input("boxes: float")
+    .Input("scores: float")
+    .Input("max_output_size_per_class: int32")
+    .Input("max_total_size: int32")
+    .Input("iou_threshold: float")
+    .Input("score_threshold: float")
+    .Output("nmsed_boxes: float")
+    .Output("nmsed_scores: float")
+    .Output("nmsed_classes: float")
+    .Output("valid_detections: int32")
+    .Attr("pad_per_class: bool = false")
+    .SetShapeFn(CombinedNMSShapeFn);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc
index 517af26b44f53d85979e1194d1f0d6d8814cb1e8..e517e750955d7fb5335d2766a09e96b5f6382c10 100644
--- a/tensorflow/core/ops/image_ops_test.cc
+++ b/tensorflow/core/ops/image_ops_test.cc
@@ -183,6 +183,13 @@ TEST(ImageOpsTest, ExtractGlimpse_ShapeFn) {
   op.input_tensors.resize(2);
 
   // Inputs are input, size, offsets.
+  TF_ASSERT_OK(NodeDefBuilder("test", "ExtractGlimpse")
+                   .Input({"input", 0, DT_FLOAT})
+                   .Input({"size", 1, DT_INT32})
+                   .Input({"offsets", 2, DT_FLOAT})
+                   .Attr("uniform_noise", true)
+                   .Attr("noise", "")
+                   .Finalize(&op.node_def));
 
   // Rank and size checks.
   INFER_ERROR("Shape must be rank 4 but is rank 5", op, "[1,2,3,4,5];?;?");
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 952ee4bee2e5a49edeea168f4184767dbebc2527..66594b3576e20a761e26e5b4835571332aaba4f7 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -208,12 +208,42 @@ Status SvdShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// The first input is [...,3,M] and second input is [...,M,K].
+// Output is [...,M,K].
+Status TridiagonalSolveShapeFn(InferenceContext* c) {
+  ShapeHandle lhs;
+  ShapeHandle rhs;
+  // Check that rank is at least 2.
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &lhs));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 2, &rhs));
+
+  // Extract batch dimensions and check they are the same.
+  ShapeHandle lhs_batch_shape;
+  ShapeHandle rhs_batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(lhs, 0, -2, &lhs_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(rhs, 0, -2, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(lhs_batch_shape, rhs_batch_shape, &lhs_batch_shape));
+
+  // Check that "M" is the same in both inputs.
+  DimensionHandle m_lhs = c->Dim(lhs, -1);
+  DimensionHandle m_rhs = c->Dim(rhs, -2);
+  TF_RETURN_IF_ERROR(c->Merge(m_lhs, m_rhs, &m_lhs));
+
+  // Check that next-to-last dimension of the first input is 3.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(lhs, -2), 3, &m_lhs));
+
+  // The output shape is the same as rhs shape.
+  c->set_output(0, rhs);
+  return Status::OK();
+}
+
 }  // namespace
 
 REGISTER_OP("MatrixDeterminant")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, double, complex64, complex128}")
+    .Attr("T: {half, float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
@@ -232,7 +262,7 @@ REGISTER_OP("LogMatrixDeterminant")
     .Input("input: T")
     .Output("sign: T")
     .Output("log_abs_determinant: T")
-    .Attr("T: {float, double, complex64, complex128}")
+    .Attr("T: {half, float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
@@ -255,7 +285,7 @@ REGISTER_OP("MatrixInverse")
     .Input("input: T")
     .Output("output: T")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixExponential")
@@ -263,7 +293,7 @@ REGISTER_OP("MatrixExponential")
         27, "Use Python implementation tf.linalg.matrix_exponential instead.")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixLogarithm")
@@ -275,20 +305,20 @@ REGISTER_OP("MatrixLogarithm")
 REGISTER_OP("Cholesky")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("CholeskyGrad")
     .Input("l: T")
     .Input("grad: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("SelfAdjointEig")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float}")
+    .Attr("T: {double, float, half}")
     .Deprecated(11, "Use SelfAdjointEigV2 instead.")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
@@ -310,14 +340,14 @@ REGISTER_OP("SelfAdjointEigV2")
     .Output("e: T")
     .Output("v: T")
     .Attr("compute_v: bool = True")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
 REGISTER_OP("Lu")
     .Input("input: T")
     .Output("lu: T")
     .Output("p: output_idx_type")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .Attr("output_idx_type: {int32, int64} = DT_INT32")
     .SetShapeFn(LuShapeFn);
 
@@ -326,7 +356,7 @@ REGISTER_OP("MatrixSolve")
     .Input("rhs: T")
     .Output("output: T")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
     });
@@ -337,7 +367,7 @@ REGISTER_OP("MatrixTriangularSolve")
     .Output("output: T")
     .Attr("lower: bool = True")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
     });
@@ -347,7 +377,7 @@ REGISTER_OP("MatrixSolveLs")
     .Input("rhs: T")
     .Input("l2_regularizer: double")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .Attr("fast: bool = True")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle l2_regularizer;
@@ -358,7 +388,7 @@ REGISTER_OP("MatrixSolveLs")
 REGISTER_OP("MatrixSquareRoot")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("Qr")
@@ -366,7 +396,7 @@ REGISTER_OP("Qr")
     .Output("q: T")
     .Output("r: T")
     .Attr("full_matrices: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(QrShapeFn);
 
 REGISTER_OP("Svd")
@@ -376,9 +406,16 @@ REGISTER_OP("Svd")
     .Output("v: T")
     .Attr("compute_uv: bool = True")
     .Attr("full_matrices: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SvdShapeFn);
 
+REGISTER_OP("TridiagonalSolve")
+    .Input("diagonals: T")
+    .Input("rhs: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(TridiagonalSolveShapeFn);
+
 // Deprecated op registrations:
 
 // Can be deleted after 3feb2017.
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index bfacee14efa41408865fecb103bc63b5f6de73ff..93732f938a9278f8da322dd8fd98a234695287d9 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -80,12 +80,18 @@ TEST(LinalgOpsTest, SelfAdjointEig_ShapeFn) {
 TEST(LinalgOpsTest, SelfAdjointEigV2_ShapeFn) {
   ShapeInferenceTestOp op("SelfAdjointEigV2");
   auto set_compute_v = [&op](bool compute_v) {
+    // Test for float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Pack")
                      .Input({{"input", 0, DT_FLOAT}})
                      .Attr("compute_v", compute_v)
                      .Finalize(&op.node_def));
-  };
 
+    // Test for float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Pack")
+                     .Input({{"input", 0, DT_HALF}})
+                     .Attr("compute_v", compute_v)
+                     .Finalize(&op.node_def));
+  };
   set_compute_v(false);
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
   INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]");
@@ -174,10 +180,17 @@ TEST(LinalgOpsTest, MatrixSolveLs_ShapeFn) {
 TEST(LinalgOpsTest, Qr_ShapeFn) {
   ShapeInferenceTestOp op("Qr");
   auto set_attrs = [&op](bool full_matrices) {
+    // Test float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Qr")
                      .Input({"input", 0, DT_FLOAT})
                      .Attr("full_matrices", full_matrices)
                      .Finalize(&op.node_def));
+
+    // Test float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Qr")
+                     .Input({"input", 0, DT_HALF})
+                     .Attr("full_matrices", full_matrices)
+                     .Finalize(&op.node_def));
   };
 
   // Defining `P` = min(`M`, `N`), if full_matrices = False, then Q should be
@@ -218,11 +231,19 @@ TEST(LinalgOpsTest, Qr_ShapeFn) {
 TEST(LinalgOpsTest, Svd_ShapeFn) {
   ShapeInferenceTestOp op("Svd");
   auto set_attrs = [&op](bool compute_uv, bool full_matrices) {
+    // Test for float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Svd")
                      .Input({"input", 0, DT_FLOAT})
                      .Attr("compute_uv", compute_uv)
                      .Attr("full_matrices", full_matrices)
                      .Finalize(&op.node_def));
+
+    // Test for float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Svd")
+                     .Input({"input", 0, DT_HALF})
+                     .Attr("compute_uv", compute_uv)
+                     .Attr("full_matrices", full_matrices)
+                     .Finalize(&op.node_def));
   };
 
   // Defining `P` = min(`M`, `N`), if full_matrices = False, then U should be
@@ -293,4 +314,40 @@ TEST(LinalgOpsTest, Lu_ShapeFn) {
            "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
 }
 
+TEST(LinalgOpsTest, TridiagonalSolve_ShapeFn) {
+  ShapeInferenceTestOp op("TridiagonalSolve");
+  INFER_OK(op, "?;?", "in1");
+  INFER_OK(op, "[3,5];[?,1]", "in1");
+  INFER_OK(op, "[?,5];[5,1]", "in1");
+  INFER_OK(op, "[?,5];[?,?]", "in1");
+  INFER_OK(op, "[?,?];[?,?]", "in1");
+  INFER_OK(op, "[3,5];[5,1]", "in1");
+  INFER_OK(op, "[3,5];[5,2]", "in1");
+
+  INFER_OK(op, "[?,?,?];[?,?,?]", "in1");
+  INFER_OK(op, "[?,3,5];[7,5,2]", "in1");
+  INFER_OK(op, "[7,3,5];[?,5,2]", "in1");
+  INFER_OK(op, "[7,?,5];[?,5,?]", "in1");
+  INFER_OK(op, "[7,3,5];[7,5,2]", "in1");
+
+  INFER_OK(op, "[7,?,3,5];[7,8,5,2]", "in1");
+  INFER_OK(op, "[7,8,3,5];[7,8,5,2]", "in1");
+
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3];[5,1]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3,5];[5]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [6,4] and [6,8].",
+      op, "[6,4,3,5];[6,8,5,2]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [?,4] and [6,8].",
+      op, "[?,4,3,5];[6,8,5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[4,5];[5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[6,4,5];[6,5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op, "[3,9];[5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op,
+              "[6,3,9];[6,5,2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index cbc9c7a2f4589924929c8ca6c16b85c04566d620..123ffc493a929600f940fd41a5645cc39e575ee5 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -20,6 +20,34 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Verifies that `shapes_and_types` is a valid list handle and has the right
+// dtype.
+Status VerifyHandleData(
+    shape_inference::InferenceContext* c,
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
+    DataType element_dtype) {
+  if (shapes_and_types.size() != 1) {
+    return errors::InvalidArgument(
+        "Invalid handle_data for input list. Expected length of "
+        "shape_and_types: ",
+        1, " Saw: ", shapes_and_types.size());
+  }
+  const shape_inference::ShapeAndType& list_shape_type = shapes_and_types[0];
+  if (list_shape_type.dtype != element_dtype) {
+    return errors::InvalidArgument("Expected list with element dtype ",
+                                   DataTypeString(element_dtype),
+                                   " but got list with element dtype ",
+                                   DataTypeString(list_shape_type.dtype));
+  }
+  return Status::OK();
+}
+
+// Assumes that the handle_data is valid.
+shape_inference::ShapeHandle GetElementShapeFromHandleData(
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types) {
+  return shapes_and_types[0].shape;
+}
+
 REGISTER_OP("EmptyTensorList")
     .Input("element_shape: shape_type")
     .Input("max_num_elements: int32")
@@ -51,11 +79,11 @@ REGISTER_OP("TensorListPushBack")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -98,11 +126,11 @@ REGISTER_OP("TensorListPushBackBatch")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -130,6 +158,7 @@ REGISTER_OP("TensorListLength")
 
 REGISTER_OP("TensorListPopBack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("output_handle: variant")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
@@ -138,11 +167,11 @@ REGISTER_OP("TensorListPopBack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with invalid variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -166,6 +195,7 @@ REGISTER_OP("TensorListPopBack")
 
 REGISTER_OP("TensorListStack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
@@ -174,11 +204,11 @@ REGISTER_OP("TensorListStack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -207,6 +237,42 @@ REGISTER_OP("TensorListStack")
       return Status::OK();
     });
 
+Status TensorListConcatShapeInference(
+    shape_inference::InferenceContext* c,
+    shape_inference::ShapeHandle element_shape) {
+  DataType element_dtype;
+  TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data != nullptr && handle_data->size() > 1) {
+    return errors::InvalidArgument(
+        "Trying to read from list with wrong variant data.");
+  }
+  if (handle_data != nullptr && handle_data->size() == 1) {
+    const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
+    if (list_shape_type.dtype != element_dtype) {
+      return errors::InvalidArgument(
+          "Trying to read from list with wrong element dtype. List has "
+          "type ",
+          DataTypeString(list_shape_type.dtype), " but expected type ",
+          DataTypeString(element_dtype));
+    }
+    shape_inference::ShapeHandle merged;
+    TF_RETURN_IF_ERROR(c->Merge(element_shape, list_shape_type.shape, &merged));
+    element_shape = merged;
+  }
+  if (c->RankKnown(element_shape)) {
+    shape_inference::ShapeHandle result;
+    TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
+    TF_RETURN_IF_ERROR(
+        c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
+    c->set_output(0, result);
+  } else {
+    c->set_output(0, c->UnknownShape());
+  }
+  c->set_output(1, c->MakeShape({c->UnknownDim()}));
+  return Status::OK();
+}
+
 REGISTER_OP("TensorListConcat")
     .Input("input_handle: variant")
     .Output("tensor: element_dtype")
@@ -214,45 +280,27 @@ REGISTER_OP("TensorListConcat")
     .Attr("element_dtype: type")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType element_dtype;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       PartialTensorShape raw_element_shape;
       TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &raw_element_shape));
       shape_inference::ShapeHandle element_shape;
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(raw_element_shape,
                                                             &element_shape));
+      return TensorListConcatShapeInference(c, element_shape);
+    });
 
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
-        return errors::InvalidArgument(
-            "Trying to read from list with wrong variant data.");
-      }
-      if (handle_data != nullptr) {
-        const shape_inference::ShapeAndType& list_shape_type =
-            (*handle_data)[0];
-        if (list_shape_type.dtype != element_dtype) {
-          return errors::InvalidArgument(
-              "Trying to read from list with wrong element dtype. List has "
-              "type ",
-              DataTypeString(list_shape_type.dtype), " but expected type ",
-              DataTypeString(element_dtype));
-        }
-        shape_inference::ShapeHandle merged;
-        TF_RETURN_IF_ERROR(
-            c->Merge(element_shape, list_shape_type.shape, &merged));
-        element_shape = merged;
-      }
-      if (c->RankKnown(element_shape)) {
-        shape_inference::ShapeHandle result;
-        TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
-        TF_RETURN_IF_ERROR(
-            c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
-        c->set_output(0, result);
-      } else {
-        c->set_output(0, c->UnknownShape());
-      }
-      c->set_output(1, c->MakeShape({c->UnknownDim()}));
-      return Status::OK();
+REGISTER_OP("TensorListConcatV2")
+    .Input("input_handle: variant")
+    .Input("element_shape: shape_type")
+    .Input("leading_dims: int64")
+    .Output("tensor: element_dtype")
+    .Output("lengths: int64")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          1, &element_shape));
+      return TensorListConcatShapeInference(c, element_shape);
     });
 
 REGISTER_OP("TensorListSplit")
@@ -351,6 +399,7 @@ REGISTER_OP("TensorListReserve")
 REGISTER_OP("TensorListGetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
+    .Input("element_shape: int32")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -418,6 +467,7 @@ REGISTER_OP("TensorListSetItem")
 REGISTER_OP("TensorListGather")
     .Input("input_handle: variant")
     .Input("indices: int32")
+    .Input("element_shape: int32")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -461,6 +511,54 @@ REGISTER_OP("TensorListScatter")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListScatterV2")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Input("element_shape: shape_type")
+    .Input("num_elements: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListScatterIntoExistingList")
+    .Input("input_handle: variant")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &ignored));
+      // Check that indices is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        TF_RETURN_IF_ERROR(VerifyHandleData(c, *handle_data, element_dtype));
+        element_shape = GetElementShapeFromHandleData(*handle_data);
+      }
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListConcatLists")
     .Input("input_a: variant")
     .Input("input_b: variant")
@@ -477,15 +575,18 @@ REGISTER_OP("TensorListConcatLists")
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
-      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+      if ((handle_data_a == nullptr || handle_data_a->empty()) &&
+          (handle_data_b == nullptr || handle_data_b->empty())) {
         c->set_output_handle_shapes_and_types(
             0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
-          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+          (handle_data_a && !handle_data_a->empty()) ? handle_data_a->at(0)
+                                                     : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
-          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+          (handle_data_b && !handle_data_b->empty()) ? handle_data_b->at(0)
+                                                     : handle_data_a->at(0);
       if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
diff --git a/tensorflow/core/ops/lookup_table_ops.cc b/tensorflow/core/ops/lookup_table_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ce08f6f2f9d7eec7cd2222de2456170e4976d6c
--- /dev/null
+++ b/tensorflow/core/ops/lookup_table_ops.cc
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("LookupTableInsertOrAssignOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: insert_key_tensor_dtype")
+    .Input("values: table_value_dtype")
+    .Attr("insert_key_tensor_dtype: type")
+    .Attr("table_value_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // Note that, by design, shape checks are implementation dependent so they
+      // must be deferred until runtime.
+      return Status::OK();
+    });
+
+REGISTER_OP("LookupTableFindOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: lookup_key_tensor_dtype")
+    .Input("num_threads: int64")
+    .Output("values: table_value_dtype")
+    .Attr("table_value_dtype: type")
+    .Attr("lookup_key_tensor_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // The output shape cannot be inferred here because the key size
+      // cannot be inferred from the key tensor in general.
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("ContainerSizeOp")
+    .Input("container_handle: resource")
+    .Output("size: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 55dcc50325f600730376c492fa3a2cdde4293ace..99070b6498ca8af701fb8621b9881cfb4f60f42c 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -469,6 +469,16 @@ Status MulGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Mul", MulGrad);
 
+Status MulNoNanGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForBinaryCwise(g, {
+      {{"gx"}, "MulNoNan", {"y", "dz"}},  // y * dz
+      {{"gy"}, "MulNoNan", {"x", "dz"}},  // x * dz
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("MulNoNan", MulGrad);
+
 Status DivGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForBinaryCwise(g, {
@@ -583,6 +593,20 @@ Status XdivyGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Xdivy", XdivyGrad);
 
+Status SquaredDifferenceGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForBinaryCwise(g, {
+      FDH::Const("c", 2LL),
+      {{"two"}, "Cast", {"c"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+      {{"x_sub_y"}, "Sub", {"x", "y"}},
+      {{"two_x_sub_y"}, "Mul", {"two", "x_sub_y"}},  // 2 * (x - y)
+      {{"gx"}, "Mul", {"two_x_sub_y", "dz"}},
+      {{"gy"}, "Neg", {"gx"}}
+    });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("SquaredDifference", SquaredDifferenceGrad);
+
 Status MaximumMinimumGradHelper(const string& comparator,
                                 const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 9fc6b3414791932ef247486424cbf8371a46dbc7..129d9243ec071971069a7d39e7e5c9d72da70669 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -949,6 +949,25 @@ TEST_F(MathGradTest, Xdivy) {
                                 TensorShape({2, 1})));
 }
 
+TEST_F(MathGradTest, SquaredDifference) {
+  auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
+                                 TensorShape({2, 3}));
+  auto y = test::AsTensor<float>({.5f, 2.f}, TensorShape({2, 1}));
+  Tensor dx;
+  Tensor dy;
+  auto g = [](float x, float y) -> float { return 2. * (x - y); };
+  auto h = [](float x, float y) -> float { return 2. * (y - x); };
+  SymGrad("SquaredDifference", x, y, &dx, &dy);
+  test::ExpectClose(
+      dx, test::AsTensor<float>({g(-3.f, .5f), g(-2.f, .5f), g(-1.f, .5f),
+                                 g(1.f, 2.f), g(2.f, 2.f), g(3.f, 2.f)},
+                                TensorShape({2, 3})));
+  test::ExpectClose(
+      dy, test::AsTensor<float>({h(-3.f, .5f) + h(-2.f, .5f) + h(-1.f, .5f),
+                                 h(1.f, 2.f) + h(2.f, 2.f) + h(3.f, 2.f)},
+                                TensorShape({2, 1})));
+}
+
 TEST_F(MathGradTest, Maximum) {
   auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
                                  TensorShape({2, 3}));
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index eb9cbd3225e54877450370db0a15e2315450457b..42bdc12fd3a1593695c87906fff7ace3eafdabce 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -39,7 +39,61 @@ REGISTER_OP("AddN")
                                         " with other shapes.");
       }
       c->set_output(0, cur);
-      return Status::OK();
+
+      DataType dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("T", &dtype));
+
+      if (dtype != DT_VARIANT) {
+        // Exit early if not DT_VARIANT.
+        return Status::OK();
+      } else {
+        // DT_VARIANT shape handle shape inference.  All sizes and dtypes must
+        // be the same; all shapes must be compatible via Merge.
+        std::vector<shape_inference::ShapeAndType> cur_shapes_and_types;
+        auto* shapes_and_types =
+            c->input_handle_shapes_and_types(c->num_inputs() - 1);
+        if (shapes_and_types) {
+          cur_shapes_and_types = *shapes_and_types;
+        }
+
+        for (int i = c->num_inputs() - 2; i >= 0; --i) {
+          auto shapes_and_types_i = c->input_handle_shapes_and_types(i);
+          if (!shapes_and_types && shapes_and_types_i) {
+            // TODO(ebrevdo): Find cases where this happens and fix their shape
+            // inference.  If we are calling AddN on variant types, they should
+            // all have consistent shape_and_type info.
+            shapes_and_types = shapes_and_types_i;
+          } else if (shapes_and_types && shapes_and_types_i) {
+            if (shapes_and_types_i->size() != shapes_and_types->size()) {
+              return errors::InvalidArgument(
+                  "shapes_and_types[", i,
+                  "].size() == ", shapes_and_types_i->size(),
+                  " != shapes_and_types[0].size() == ",
+                  shapes_and_types->size());
+            }
+            for (int j = 0; j < shapes_and_types->size(); ++j) {
+              if (shapes_and_types->at(j).dtype !=
+                  shapes_and_types_i->at(j).dtype) {
+                return errors::InvalidArgument(
+                    "shapes_and_types[", i, "][", j, "].dtype() == ",
+                    DataTypeString(shapes_and_types_i->at(j).dtype),
+                    " != shapes_and_types[0][", j, "].dtype == ",
+                    DataTypeString(shapes_and_types->at(j).dtype));
+              }
+              TF_RETURN_WITH_CONTEXT_IF_ERROR(
+                  c->Merge(shapes_and_types_i->at(j).shape,
+                           cur_shapes_and_types.at(j).shape,
+                           &cur_shapes_and_types.at(j).shape),
+                  "From merging shapes_and_types[", i, "][", j, "].shape with ",
+                  "shapes_and_types[0][", j, "].shape");
+            }
+          }
+        }
+        if (shapes_and_types) {
+          c->set_output_handle_shapes_and_types(0, cur_shapes_and_types);
+        }
+        return Status::OK();
+      }
     });
 
 // --------------------------------------------------------------------------
@@ -352,10 +406,10 @@ REGISTER_OP("_MklAdd")
         "complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
-Returns x + y element-wise.
+Returns `x` + `y` element-wise.
 
-*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+*NOTE*: `tf.math.add` supports broadcasting. `tf.math.add_n` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
 )doc");
 
 REGISTER_OP("Sub").BINARY_MORE().SetShapeFn(
@@ -377,6 +431,14 @@ Returns x - y element-wise.
 REGISTER_OP("Mul").BINARY_MORE().SetIsCommutative().SetShapeFn(
     shape_inference::BroadcastBinaryOpShapeFn);
 
+REGISTER_OP("MulNoNan")
+    .Input("x: T")
+    .Input("y: T")
+    .Output("z: T")
+    .Attr("T: {half, float, double, complex64, complex128}")
+    .SetIsCommutative()
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 REGISTER_OP("_MklMul")
     .BINARY_MORE()
     .Input("mkl_x: uint8")
@@ -398,7 +460,7 @@ REGISTER_OP("DivNoNan")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, float, double, complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorDiv")
@@ -796,6 +858,15 @@ REGISTER_OP("Sum")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn(shape_inference::ReductionShape);
 
+REGISTER_OP("EuclideanNorm")
+    .Input("input: T")
+    .Input("reduction_indices: Tidx")
+    .Output("output: T")
+    .Attr("keep_dims: bool = false")
+    .Attr("T: numbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::ReductionShape);
+
 REGISTER_OP("Mean")
     .Input("input: T")
     .Input("reduction_indices: Tidx")
@@ -1368,7 +1439,14 @@ REGISTER_OP("Conj")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {complex64, complex128, variant} = DT_COMPLEX64")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1688,6 +1766,45 @@ inputs: Must all be the same size and shape.
 
 #endif  // INTEL_MKL
 
+REGISTER_OP("RequantizePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Input("requested_output_min: float")
+    .Input("requested_output_max: float")
+    .Output("output: out_type")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+REGISTER_OP("RequantizationRangePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("clip_value_max: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("NextAfter")
     .Attr("T: {float64, float32} = DT_FLOAT")
     .Input("x1: T")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 05379a7d699629d733cacd71343fc9d912eb0893..a1abdb6aed2cee02786166c4c4cfef32981beba1 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -144,6 +144,7 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[1];[2]", "[d1_0]");
     INFER_OK(op, "[2];[1]", "[d0_0]");
     INFER_OK(op, "[2];[]", "[d0_0]");
+    INFER_OK(op, "[2];[?]", "[d0_0]");
 
     INFER_OK(op, "[0];[0]", "[d0_0|d1_0]");
     INFER_OK(op, "[];[0]", "[d1_0]");
@@ -151,6 +152,9 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[0];[1]", "[d0_0]");
     INFER_OK(op, "[0];[]", "[d0_0]");
 
+    INFER_OK(op, "[2];[?,?]", "[d1_0,d0_0]");
+    INFER_OK(op, "[2,2];[?,?,?]", "[d1_0,d0_0,d0_1]");
+
     // Multiple dimension cases (same test cases, switching x and y).
     INFER_OK(op, "[?,1,2,3,4,5];[3,1,?]",
              "[d0_0,d0_1,d0_2,d0_3|d1_0,d0_4,d0_5]");
@@ -201,7 +205,6 @@ TEST(MathOpsTest, Select_ShapeFn) {
   typedef std::vector<std::pair<TensorShapeProto, DataType>> ShapeDtypeV;
   std::vector<std::unique_ptr<ShapeDtypeV>> handle_data;
   std::unique_ptr<shape_inference::InferenceContext> c;
-  Status run_status;
   auto run_inference_for_handles = [&]() -> Status {
     CHECK(op_reg_data->shape_inference_fn != nullptr);
     c.reset(new shape_inference::InferenceContext(
diff --git a/tensorflow/core/ops/mkl_array_ops.cc b/tensorflow/core/ops/mkl_array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7ad3be611218734f769f9b108e0ff85052c2e72
--- /dev/null
+++ b/tensorflow/core/ops/mkl_array_ops.cc
@@ -0,0 +1,92 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+// This file contains the registration of MKL-DNN array ops.
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/strided_slice_op.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
+
+// Adding QuantizedConcatV2 op to be able to replace it by
+// _MklQuantizedConcatV2 in the graph rewrite.
+REGISTER_OP("QuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins: N * float32")
+    .Input("input_maxes: N * float32")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins:  N * float32")
+    .Input("input_maxes: N * float32")
+    .Input("mkl_values: N * uint8")
+    .Input("mkl_axis: uint8")
+    .Input("mkl_input_mins:  N * uint8")
+    .Input("mkl_input_maxes: N * uint8")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_output_min: uint8")
+    .Output("mkl_output_max: uint8")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() / 2 - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs() / 2; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
index 658afd99013485ce3c6c16906d3d6f9415ad48f6..0e6ad9162a54c429d3bfa62825dada6b99bfa3c7 100644
--- a/tensorflow/core/ops/mkl_nn_ops.cc
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -46,6 +46,7 @@ REGISTER_OP("_MklFusedConv2D")
     .Attr("T: {float}")
     .Attr("num_args: int >= 0")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -59,6 +60,63 @@ REGISTER_OP("_MklFusedConv2D")
  is expected to create these operators.
 )doc");
 
+REGISTER_OP("__MklDummyPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_args: num_args * uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
 REGISTER_OP("_MklQuantizedMaxPool")
     .Input("input:         T")
     .Input("min_input:     float")
@@ -145,8 +203,10 @@ REGISTER_OP("_MklQuantizedConv2D")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -189,8 +249,10 @@ REGISTER_OP("_MklQuantizedConv2DAndRequantize")
     .Attr("out_type: quantizedtype = DT_QINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -233,8 +295,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBias")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -281,8 +345,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndRequantize")
     .Attr("out_type: quantizedtype = DT_QINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -322,8 +388,10 @@ REGISTER_OP("_MklQuantizedConv2DAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -366,8 +434,10 @@ REGISTER_OP("_MklQuantizedConv2DAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -410,8 +480,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -458,8 +530,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -505,8 +579,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -560,8 +636,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -617,8 +695,10 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -634,6 +714,50 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropInput")
+    .Input("input_sizes: int32")
+    .Input("filter: T")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropFilter")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc
index 560b71a337e2ecc673ebee73b275c4da2d335672..7beaf57c10be9e973aa4c80abd44a2009d4625d8 100644
--- a/tensorflow/core/ops/nn_grad.cc
+++ b/tensorflow/core/ops/nn_grad.cc
@@ -37,18 +37,41 @@ Status SoftmaxGrad(const AttrSlice& attrs, FunctionDef* g) {
       {
         {{"softmax"}, "Softmax", {"x"}, {{"T", "$T"}}},
         {{"n0"}, "Mul", {"grad_softmax", "softmax"}, {{"T", "$T"}}},
-        FDH::Const<int32>("indices", {1}),
-        {{"n1"}, "Sum", {"n0", "indices"}, {{"T", "$T"}}},
-        FDH::Const<int32>("newshape", {-1, 1}),
-        {{"n2"}, "Reshape", {"n1", "newshape"}, {{"T", "$T"}}},
-        {{"n3"}, "Sub", {"grad_softmax", "n2"}, {{"T", "$T"}}},
-        {{"grad_x"}, "Mul", {"n3", "softmax"}, {{"T", "$T"}}}
+        FDH::Const<int32>("indices", {-1}),
+        {{"n1"}, "Sum", {"n0", "indices"}, {{"keep_dims", true}, {"T", "$T"}}},
+        {{"n2"}, "Sub", {"grad_softmax", "n1"}, {{"T", "$T"}}},
+        {{"grad_x"}, "Mul", {"n2", "softmax"}, {{"T", "$T"}}}
       });
   // clang-format on
   return Status::OK();
 }
 REGISTER_OP_GRADIENT("Softmax", SoftmaxGrad);
 
+Status LogSoftmaxGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  *g = FDH::Define(
+      "LogSoftmaxGrad",
+      // Arg defs
+      {"x: T", "grad_logsoftmax: T"},
+      // Ret val defs
+      {"grad_x: T"},
+      // Attr defs
+      {{"T: {float, double}"}},
+      // Nodes
+      // Based on _LogSoftmaxGrad in nn_grad.py.
+      {
+        {{"softmax"}, "Softmax", {"x"}, {{"T", "$T"}}},
+        FDH::Const<int32>("indices", {-1}),
+        {{"n0"}, "Sum", {"grad_logsoftmax", "indices"},
+         {{"keep_dims", true}, {"T", "$T"}}},
+        {{"n1"}, "Mul", {"n0", "softmax"}, {{"T", "$T"}}},
+        {{"grad_x"}, "Sub", {"grad_logsoftmax", "n1"}, {{"T", "$T"}}}
+      });
+  // clang-format on
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("LogSoftmax", LogSoftmaxGrad);
+
 Status ReluGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   *g = FDH::Define(
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 0f4f72593746100fc9bc82a4d7070fa361c5e86f..2b1d031be86c9f6a965b5b23a84c7bf1dd0e4f8e 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -827,6 +827,7 @@ REGISTER_OP("MaxPoolWithArgmax")
     .Attr("strides: list(int) >= 4")
     .Attr("Targmax: {int32, int64} = DT_INT64")
     .Attr(GetPaddingAttrString())
+    .Attr("include_batch_in_index: bool = false")
     .Input("input: T")
     .Output("output: T")
     .Output("argmax: Targmax")
@@ -841,6 +842,7 @@ REGISTER_OP("MaxPoolGradWithArgmax")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
+    .Attr("include_batch_in_index: bool = false")
     .Attr("Targmax: {int32, int64}")
     .Input("input: T")
     .Input("grad: T")
@@ -855,6 +857,7 @@ REGISTER_OP("MaxPoolGradGradWithArgmax")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
+    .Attr("include_batch_in_index: bool = false")
     .Attr("Targmax: {int32, int64}")
     .Input("input: T")
     .Input("grad: T")
@@ -1554,6 +1557,7 @@ REGISTER_OP("_MklDepthwiseConv2dNative")
     .Output("mkl_filter_output: uint8")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1571,6 +1575,7 @@ REGISTER_OP("_MklConv2D")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1590,6 +1595,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1617,6 +1623,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1637,6 +1644,7 @@ REGISTER_OP("__MklDummyPadWithConv2D")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1667,6 +1675,7 @@ REGISTER_OP("_MklPadWithConv2D")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("is_filter_const: bool = false")
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
     .SetShapeFn(shape_inference::Conv2DShape)
@@ -1852,6 +1861,7 @@ REGISTER_OP("_MklConv3D")
     .Output("mkl_filter_output: uint8")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int) >= 5")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
@@ -2534,6 +2544,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#endif  // INTEL_MKL
 REGISTER_OP("QuantizedConv2DAndRequantize")
     .Input("input: Tinput")
     .Input("filter: Tfilter")
@@ -2552,6 +2563,7 @@ REGISTER_OP("QuantizedConv2DAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2584,6 +2596,7 @@ REGISTER_OP("QuantizedConv2DWithBias")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2617,6 +2630,7 @@ REGISTER_OP("QuantizedConv2DWithBiasAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2649,6 +2663,7 @@ REGISTER_OP("QuantizedConv2DAndRelu")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2679,6 +2694,7 @@ REGISTER_OP("QuantizedConv2DAndReluAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2711,6 +2727,7 @@ REGISTER_OP("QuantizedConv2DWithBiasAndRelu")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2745,6 +2762,7 @@ REGISTER_OP("QuantizedConv2DWithBiasAndReluAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2779,6 +2797,7 @@ REGISTER_OP("QuantizedConv2DWithBiasSumAndRelu")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2816,6 +2835,7 @@ REGISTER_OP("QuantizedConv2DWithBiasSumAndReluAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2855,6 +2875,7 @@ REGISTER_OP("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("padding_list: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2870,6 +2891,4 @@ REGISTER_OP("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
-#endif  // INTEL_MKL
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 92a7bb9a3460db4776bb357a81f3d09c55f951e2..c72da53ff9cc692490940d73abd610692a775b6f 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -651,6 +651,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -5141,6 +5194,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -5234,6 +5327,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -5259,12 +5353,71 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
 }
+op {
+  name: "ChooseFastestBranchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "ratio_numerator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "ratio_denominator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "num_elements_per_branch"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "other_arguments_lengths"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ClipByValue"
   input_arg {
@@ -5395,6 +5548,87 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CollectiveReduce"
   input_arg {
@@ -5466,6 +5700,56 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CombinedNonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size_per_class"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "max_total_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_scores"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_classes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "valid_detections"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_per_class"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -5796,6 +6080,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -6879,6 +7192,33 @@ op {
     }
   }
 }
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
 op {
   name: "CudnnRNN"
   input_arg {
@@ -7420,6 +7760,13 @@ op {
       i: 0
     }
   }
+  attr {
+    name: "time_major"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -7987,6 +8334,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "time_major"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
@@ -8232,6 +8586,7 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
   name: "DebugGradientIdentity"
@@ -8789,6 +9144,8 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -9662,8 +10019,11 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -10106,6 +10466,202 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type_attr: "T1"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type_attr: "T2"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type_attr: "T3"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T3"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -10244,6 +10800,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -10343,6 +10959,37 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalAutoShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalBytesProducedStatsDataset"
   input_arg {
@@ -10456,6 +11103,18 @@ op {
     name: "num_experiments"
     type: "int"
   }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
   name: "ExperimentalDatasetCardinality"
@@ -10482,6 +11141,7 @@ op {
     name: "compression_type"
     type: DT_STRING
   }
+  is_stateful: true
 }
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
@@ -11262,6 +11922,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalScanDataset"
   input_arg {
@@ -11690,6 +12377,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "noise"
+    type: "string"
+    default_value {
+      s: "uniform"
+    }
+  }
 }
 op {
   name: "ExtractImagePatches"
@@ -14517,6 +15211,122 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueuePrelinearizedBuffer"
+  input_arg {
+    name: "input"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -15750,997 +16560,699 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
-  }
-}
-op {
-  name: "LogMatrixDeterminant"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "velocities"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdadeltaParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "LogicalAnd"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "LogicalNot"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LogicalOr"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "updates"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
-  }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tvalues"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableExportV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
-  }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableImportV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "mom"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mg"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableInsert"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableInsertV2"
+  name: "LoadTPUEmbeddingFTRLParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "linears"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableRemoveV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
-  }
-}
-op {
-  name: "LookupTableSizeV2"
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "LoopCond"
   input_arg {
-    name: "input"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LowerBound"
   input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
+    name: "linears"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "table_name"
+    type: "string"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      s: ""
     }
   }
-}
-op {
-  name: "Lu"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "lu"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "output_idx_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MakeIterator"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "table_name"
+    type: "string"
     default_value {
-      b: true
+      s: ""
     }
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
   }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "MapDefun"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tcaptured"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
     default_value {
-      list {
-      }
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "capacity"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "table_id"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "shared_name"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "MapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MapUnstageNoKey"
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
+  name: "Log"
   input_arg {
-    name: "b"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -16750,8 +17262,6 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -16759,60 +17269,42 @@ op {
   }
 }
 op {
-  name: "MatchingFiles"
-  input_arg {
-    name: "pattern"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "filenames"
-    type: DT_STRING
-  }
-}
-op {
-  name: "MatrixBandPart"
+  name: "Log1p"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "num_lower"
-    type_attr: "Tindex"
-  }
-  input_arg {
-    name: "num_upper"
-    type_attr: "Tindex"
-  }
   output_arg {
-    name: "band"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tindex"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
     type_attr: "T"
   }
   attr {
@@ -16820,6 +17312,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -16829,316 +17322,398 @@ op {
   }
 }
 op {
-  name: "MatrixDiag"
+  name: "LogSoftmax"
   input_arg {
-    name: "diagonal"
+    name: "logits"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "logsoftmax"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "MatrixDiagPart"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   output_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "sampled_candidates"
+    type: DT_INT64
   }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  deprecation {
-    version: 27
-    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "unique"
+    type: "bool"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixLogarithm"
+  name: "LogicalAnd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MatrixSetDiag"
+  name: "LogicalOr"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "T"
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
     type: "type"
   }
 }
 op {
-  name: "MatrixSolve"
+  name: "LookupTableExportV2"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tvalues"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixSolveLs"
+  name: "LookupTableFind"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
   attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Tout"
+    type: "type"
   }
 }
 op {
-  name: "MatrixSquareRoot"
+  name: "LookupTableFindV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "LookupTableImport"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "Max"
+  name: "LookupTableImportV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LowerBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
     default_value {
       type: DT_INT32
@@ -17152,556 +17727,521 @@ op {
   }
 }
 op {
-  name: "MaxPool"
+  name: "Lu"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "lu"
     type_attr: "T"
   }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
       }
     }
   }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapClear"
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "data_format"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPool3D"
+  name: "MapDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 5
+    minimum: 1
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "use_inter_op_parallelism"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "MapDefun"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "arguments"
+    type_list_attr: "Targuments"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type_list_attr: "output_types"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "Targuments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 5
+    minimum: 1
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
       list {
-        s: "SAME"
-        s: "VALID"
       }
     }
+    has_minimum: true
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "TInput"
-    type: "type"
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "max_intra_op_parallelism"
+    type: "int"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
+      i: 1
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
+  name: "MapIncompleteSize"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "dtypes"
+    type: "list(type)"
   }
   attr {
-    name: "data_format"
+    name: "container"
     type: "string"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "MapPeek"
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "data_format"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: ""
     }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      type: DT_FLOAT
+      i: 0
     }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "MapStage"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "fake_dtypes"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "dtypes"
+    type: "list(type)"
   }
   attr {
-    name: "data_format"
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MapUnstage"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
   input_arg {
-    name: "strides"
+    name: "indices"
     type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "data_format"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "b"
     type_attr: "T"
   }
-  input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
@@ -17709,143 +18249,162 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
         type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MatchingFiles"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "pattern"
+    type: DT_STRING
   }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
   }
+}
+op {
+  name: "MatrixBandPart"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "num_lower"
+    type_attr: "Tindex"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "num_upper"
+    type_attr: "Tindex"
   }
   output_arg {
-    name: "output"
+    name: "band"
     type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tindex"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT64
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MatrixDiag"
   input_arg {
-    name: "input"
+    name: "diagonal"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "diagonal"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 27
+    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
@@ -17853,36 +18412,21 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
+        type: DT_FLOAT
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "MatrixLogarithm"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -17890,318 +18434,175 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MatrixSetDiag"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "diagonal"
     type_attr: "T"
   }
   output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
 }
 op {
-  name: "Maximum"
+  name: "MatrixSolve"
   input_arg {
-    name: "x"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Mean"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "input"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
         type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "fast"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: true
     }
   }
 }
 op {
-  name: "Merge"
+  name: "MatrixSquareRoot"
   input_arg {
-    name: "inputs"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeSummary"
-  input_arg {
-    name: "inputs"
-    type: DT_STRING
-    number_attr: "N"
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Mfcc"
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "sample_rate"
-    type: DT_INT32
+    name: "rhs"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "upper_frequency_limit"
-    type: "float"
-    default_value {
-      f: 4000
-    }
+    type_attr: "T"
   }
   attr {
-    name: "lower_frequency_limit"
-    type: "float"
+    name: "lower"
+    type: "bool"
     default_value {
-      f: 20
+      b: true
     }
   }
   attr {
-    name: "filterbank_channel_count"
-    type: "int"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      i: 40
+      b: false
     }
   }
   attr {
-    name: "dct_coefficient_count"
-    type: "int"
-    default_value {
-      i: 13
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
 }
 op {
-  name: "Min"
+  name: "Max"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -18261,362 +18662,485 @@ op {
   }
 }
 op {
-  name: "Minimum"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "MaxPool"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "MirrorPad"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "mode"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
 }
 op {
-  name: "MirrorPadGrad"
+  name: "MaxPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "mode"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NDHWC"
+    }
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
-}
-op {
-  name: "Mod"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ModelDataset"
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 5
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "Mul"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+    minimum: 5
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "MultiDeviceIterator"
+  name: "MaxPool3DGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "devices"
-    type: "list(string)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 5
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MultiDeviceIteratorFromStringHandle"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "string_handle"
-    type: DT_STRING
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
+      s: "NHWC"
+    }
+    allowed_values {
       list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "T"
+    type: "type"
     default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
-    has_minimum: true
   }
-  is_stateful: true
 }
 op {
-  name: "MultiDeviceIteratorGetNextFromShard"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "shard_num"
-    type: DT_INT32
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "incarnation_id"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorInit"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    minimum: 4
   }
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  input_arg {
-    name: "max_buffer_size"
-    type: DT_INT64
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
-  output_arg {
-    name: "incarnation_id"
-    type: DT_INT64
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MultiDeviceIteratorToStringHandle"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
+    name: "orig_input"
+    type_attr: "T"
   }
-  output_arg {
-    name: "string_handle"
-    type: DT_STRING
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Multinomial"
   input_arg {
-    name: "logits"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "output_dtype"
+    type_attr: "T"
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 0
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
@@ -18639,405 +19163,390 @@ op {
       }
     }
   }
-  attr {
-    name: "output_dtype"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "shared_name"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "include_batch_in_index"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
+    name: "Targmax"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
-  }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableDenseHashTableV2"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "deleted_key"
-    type_attr: "key_dtype"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "initial_num_buckets"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 131072
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTable"
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "shared_name"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "include_batch_in_index"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "Targmax"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "MutableHashTableOfTensors"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensorsV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
   }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
       }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MutableHashTableV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  is_stateful: true
 }
 op {
-  name: "MutexLock"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "mutex"
-    type: DT_RESOURCE
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "mutex_lock"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MutexV2"
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-  }
-  is_stateful: true
-}
-op {
-  name: "NcclAllReduce"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "data"
-    type_attr: "T"
   }
   attr {
-    name: "reduction"
+    name: "padding"
     type: "string"
     allowed_values {
       list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
+  attr {
+    name: "include_batch_in_index"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "num_devices"
-    type: "int"
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-  }
-  is_stateful: true
 }
 op {
-  name: "NcclBroadcast"
+  name: "Maximum"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -19045,6 +19554,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -19053,33 +19563,27 @@ op {
       }
     }
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "NcclReduce"
+  name: "Mean"
   input_arg {
     name: "input"
     type_attr: "T"
-    number_attr: "num_devices"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "data"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "reduction"
-    type: "string"
-    allowed_values {
-      list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
-      }
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
@@ -19087,338 +19591,382 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "num_devices"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "NearestNeighbors"
-  input_arg {
-    name: "points"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "centers"
-    type: DT_FLOAT
-  }
+  name: "Merge"
   input_arg {
-    name: "k"
-    type: DT_INT64
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
-    name: "nearest_center_indices"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
   output_arg {
-    name: "nearest_center_distances"
-    type: DT_FLOAT
+    name: "value_index"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "Neg"
+  name: "MergeSummary"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "NegTrain"
+  name: "MergeV2Checkpoints"
   input_arg {
-    name: "w_in"
-    type: DT_FLOAT
-    is_ref: true
+    name: "checkpoint_prefixes"
+    type: DT_STRING
   }
   input_arg {
-    name: "w_out"
-    type: DT_FLOAT
-    is_ref: true
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "Mfcc"
   input_arg {
-    name: "examples"
-    type: DT_INT32
+    name: "spectrogram"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "labels"
+    name: "sample_rate"
     type: DT_INT32
   }
-  input_arg {
-    name: "lr"
+  output_arg {
+    name: "output"
     type: DT_FLOAT
   }
   attr {
-    name: "vocab_count"
-    type: "list(int)"
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
   }
   attr {
-    name: "num_negative_samples"
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+  }
+  attr {
+    name: "filterbank_channel_count"
     type: "int"
+    default_value {
+      i: 40
+    }
   }
-  deprecation {
-    version: 19
-    explanation: "Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result"
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "NextAfter"
+  name: "Min"
   input_arg {
-    name: "x1"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "x2"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "NextIteration"
+  name: "Minimum"
   input_arg {
-    name: "data"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "NoOp"
-}
-op {
-  name: "NonMaxSuppression"
+  name: "MirrorPad"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
-  attr {
-    name: "iou_threshold"
-    type: "float"
-    default_value {
-      f: 0.5
-    }
-  }
-}
-op {
-  name: "NonMaxSuppressionV2"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "scores"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
       }
     }
   }
 }
 op {
-  name: "NonMaxSuppressionV3"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
+  name: "MirrorPadGrad"
   input_arg {
-    name: "scores"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
       }
     }
   }
 }
 op {
-  name: "NonMaxSuppressionV4"
+  name: "Mod"
   input_arg {
-    name: "boxes"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "scores"
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
   output_arg {
-    name: "valid_outputs"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "pad_to_max_output_size"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "NonMaxSuppressionWithOverlaps"
-  input_arg {
-    name: "overlaps"
-    type: DT_FLOAT
-  }
+  name: "ModelDataset"
   input_arg {
-    name: "scores"
-    type: DT_FLOAT
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "overlap_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "cpu_budget"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "NotEqual"
+  name: "Mul"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -19429,7 +19977,7 @@ op {
   }
   output_arg {
     name: "z"
-    type: DT_BOOL
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19442,15 +19990,11 @@ op {
         type: DT_DOUBLE
         type: DT_UINT8
         type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
         type: DT_COMPLEX128
       }
     }
@@ -19458,104 +20002,53 @@ op {
   is_commutative: true
 }
 op {
-  name: "NthElement"
+  name: "MulNoNan"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "OneHot"
-  input_arg {
-    name: "indices"
-    type_attr: "TI"
-  }
-  input_arg {
-    name: "depth"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "on_value"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "off_value"
-    type_attr: "T"
-  }
+  name: "MultiDeviceIterator"
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "devices"
+    type: "list(string)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "TI"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "OneShotIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "shared_name"
+    type: "string"
   }
   attr {
-    name: "dataset_factory"
-    type: "func"
+    name: "container"
+    type: "string"
   }
   attr {
     name: "output_types"
@@ -19569,103 +20062,51 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_BOOL
-      }
-    }
-  }
-}
-op {
-  name: "OptimizeDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "MultiDeviceIteratorFromStringHandle"
   input_arg {
-    name: "optimizations"
+    name: "string_handle"
     type: DT_STRING
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   attr {
     name: "output_types"
     type: "list(type)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
-    minimum: 1
   }
   attr {
     name: "output_shapes"
     type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
-    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "OptionalFromValue"
+  name: "MultiDeviceIteratorGetNextFromShard"
   input_arg {
-    name: "components"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "shard_num"
+    type: DT_INT32
   }
-}
-op {
-  name: "OptionalGetValue"
   input_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "incarnation_id"
+    type: DT_INT64
   }
   output_arg {
     name: "components"
@@ -19683,46 +20124,113 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "OptionalHasValue"
+  name: "MultiDeviceIteratorInit"
   input_arg {
-    name: "optional"
+    name: "dataset"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "max_buffer_size"
+    type: DT_INT64
+  }
   output_arg {
-    name: "has_value"
-    type: DT_BOOL
+    name: "incarnation_id"
+    type: DT_INT64
   }
+  is_stateful: true
 }
 op {
-  name: "OptionalNone"
+  name: "MultiDeviceIteratorToStringHandle"
+  input_arg {
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
+  }
   output_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "string_handle"
+    type: DT_STRING
   }
+  is_stateful: true
 }
 op {
-  name: "OrderedMapClear"
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
   attr {
-    name: "capacity"
+    name: "seed"
     type: "int"
     default_value {
       i: 0
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
+    name: "seed2"
     type: "int"
     default_value {
       i: 0
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -19738,85 +20246,58 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "value_shape"
+    type: "shape"
     default_value {
-      s: ""
+      shape {
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "initial_num_buckets"
+    type: "int"
     default_value {
-      s: ""
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapPeek"
+  name: "MutableDenseHashTableV2"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "empty_key"
+    type_attr: "key_dtype"
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "deleted_key"
+    type_attr: "key_dtype"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -19832,33 +20313,51 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
     default_value {
-      i: 0
+      shape {
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
+    name: "initial_num_buckets"
     type: "int"
     default_value {
-      i: 0
+      i: 131072
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -19874,47 +20373,74 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
   is_stateful: true
 }
 op {
-  name: "OrderedMapStage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "shared_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -19930,43 +20456,36 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
     default_value {
-      i: 0
+      shape {
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -19982,43 +20501,40 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstageNoKey"
+  name: "MutexLock"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "mutex"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "key"
-    type: DT_INT64
+    name: "mutex_lock"
+    type: DT_VARIANT
   }
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -20037,44 +20553,56 @@ op {
   is_stateful: true
 }
 op {
-  name: "Pack"
+  name: "NcclAllReduce"
   input_arg {
-    name: "values"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
-    name: "output"
+    name: "data"
     type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "axis"
+    name: "num_devices"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  is_stateful: true
 }
 op {
-  name: "Pad"
+  name: "NcclBroadcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -20082,397 +20610,373 @@ op {
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
 }
 op {
-  name: "PadV2"
+  name: "NcclReduce"
   input_arg {
     name: "input"
     type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  input_arg {
-    name: "constant_values"
-    type_attr: "T"
+    number_attr: "num_devices"
   }
   output_arg {
-    name: "output"
+    name: "data"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
   }
   attr {
-    name: "Tpaddings"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
+  name: "NearestNeighbors"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "points"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "centers"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "padded_shapes"
+    name: "k"
     type: DT_INT64
-    number_attr: "N"
   }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "nearest_center_distances"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "PaddedBatchDatasetV2"
+  name: "NegTrain"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+    name: "examples"
+    type: DT_INT32
   }
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "labels"
+    type: DT_INT32
   }
   input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "lr"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "vocab_count"
+    type: "list(int)"
   }
   attr {
-    name: "N"
+    name: "num_negative_samples"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
+  deprecation {
+    version: 19
+    explanation: "Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result"
+  }
+  is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueue"
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "T"
+    type: "type"
     default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
       list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
   }
-  is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueueV2"
+  name: "NextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+}
+op {
+  name: "NoOp"
+}
+op {
+  name: "NonDeterministicInts"
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "shape_dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
     }
   }
   is_stateful: true
 }
 op {
-  name: "ParallelConcat"
+  name: "NonMaxSuppression"
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "boxes"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "iou_threshold"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
   }
 }
 op {
-  name: "ParallelDynamicStitch"
+  name: "NonMaxSuppressionV2"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "data"
+    name: "scores"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "merged"
-    type_attr: "T"
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "NonMaxSuppressionV3"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "sloppy"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: false
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV4"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
   attr {
-    name: "preserve_cardinality"
+    name: "pad_to_max_output_size"
     type: "bool"
     default_value {
       b: false
@@ -20480,233 +20984,321 @@ op {
   }
 }
 op {
-  name: "ParameterizedTruncatedNormal"
+  name: "NonMaxSuppressionWithOverlaps"
   input_arg {
-    name: "shape"
-    type_attr: "T"
+    name: "overlaps"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "means"
-    type_attr: "dtype"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "minvals"
-    type_attr: "dtype"
+    name: "overlap_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ParseExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
+  name: "OneHot"
   input_arg {
-    name: "names"
-    type: DT_STRING
+    name: "indices"
+    type_attr: "TI"
   }
   input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
+    name: "depth"
+    type: DT_INT32
   }
   input_arg {
-    name: "dense_keys"
-    type: DT_STRING
-    number_attr: "Ndense"
+    name: "on_value"
+    type_attr: "T"
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "Nsparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
-  }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nsparse"
+    name: "off_value"
+    type_attr: "T"
   }
   output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Nsparse"
+    name: "axis"
     type: "int"
-    has_minimum: true
+    default_value {
+      i: -1
+    }
   }
   attr {
-    name: "Ndense"
-    type: "int"
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_UINT8
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
       }
     }
   }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
   attr {
-    name: "Tdense"
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
     type: "list(type)"
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+    minimum: 1
   }
   attr {
-    name: "dense_shapes"
+    name: "output_shapes"
     type: "list(shape)"
     has_minimum: true
+    minimum: 1
   }
-}
-op {
-  name: "ParseSequenceExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "debug_name"
-    type: DT_STRING
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "OnesLike"
   input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
-  }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
+    name: "y"
+    type_attr: "T"
   }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
-  }
-  output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
-  }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
   }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+}
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
   }
   output_arg {
-    name: "feature_list_dense_lengths"
-    type: DT_INT64
-    number_attr: "Nfeature_list_dense"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: "list(string)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "context_sparse_keys"
-    type: "list(string)"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalFromValue"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
   }
   attr {
-    name: "context_dense_keys"
-    type: "list(string)"
+    name: "Toutput_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalGetValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "feature_list_sparse_keys"
-    type: "list(string)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "feature_list_dense_keys"
-    type: "list(string)"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalHasValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "has_value"
+    type: DT_BOOL
   }
+}
+op {
+  name: "OptionalNone"
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+}
+op {
+  name: "OrderedMapClear"
   attr {
-    name: "Ncontext_sparse"
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -20714,7 +21306,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Ncontext_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -20722,7 +21314,33 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -20730,7 +21348,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -20738,237 +21356,191 @@ op {
     has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
+    name: "dtypes"
     type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      s: ""
     }
   }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "shared_name"
+    type: "string"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      s: ""
     }
   }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
   attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
+    name: "capacity"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
   }
   attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
     has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_types"
+    name: "dtypes"
     type: "list(type)"
-    default_value {
-      list {
-      }
-    }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
+    name: "shared_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "ParseSingleExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "num_sparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
-  }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "num_sparse"
-  }
+  name: "OrderedMapSize"
   output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "num_sparse"
+    name: "capacity"
     type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "sparse_types"
+    name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
   }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParseSingleSequenceExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "context_sparse_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_sparse"
-  }
+  name: "OrderedMapStage"
   input_arg {
-    name: "context_dense_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_dense"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "feature_list_sparse_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_sparse"
+    name: "indices"
+    type: DT_INT32
   }
   input_arg {
-    name: "feature_list_dense_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_dense"
+    name: "values"
+    type_list_attr: "fake_dtypes"
   }
-  input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  input_arg {
-    name: "debug_name"
-    type: DT_STRING
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "dtypes"
+    type: "list(type)"
   }
-  output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "feature_list_sparse_indices"
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
     type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
-  }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
   }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "Ncontext_sparse"
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -20976,7 +21548,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Ncontext_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -20984,7 +21556,43 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -20992,7 +21600,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -21000,306 +21608,223 @@ op {
     has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
+    name: "dtypes"
     type: "list(type)"
-    default_value {
-      list {
-      }
-    }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+    minimum: 1
   }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "container"
+    type: "string"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      s: ""
     }
   }
   attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
+    name: "shared_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "ParseTensor"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
+  name: "OutfeedDequeue"
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "dtype"
   }
   attr {
-    name: "out_type"
+    name: "dtype"
     type: "type"
   }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "PartitionedCall"
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
+  name: "OutfeedDequeueTuple"
   output_arg {
-    name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "outputs"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "Tout"
+    name: "dtypes"
     type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "config"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "config_proto"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "shapes"
+    type: "list(shape)"
   }
   attr {
-    name: "executor_type"
-    type: "string"
+    name: "device_ordinal"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-        unknown_rank: true
-      }
-    }
-  }
+  is_stateful: true
 }
 op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  deprecation {
-    version: 23
-    explanation: "Placeholder now behaves the same as PlaceholderV2."
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "PlaceholderWithDefault"
+  name: "Pack"
   input_arg {
-    name: "input"
-    type_attr: "dtype"
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
     type: "type"
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
 }
 op {
-  name: "Polygamma"
+  name: "Pad"
   input_arg {
-    name: "a"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-}
-op {
-  name: "PopulationCount"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
   }
   attr {
-    name: "T"
+    name: "Tpaddings"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "PadV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PrefetchDataset"
+  name: "PaddedBatchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "buffer_size"
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
     type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
   attr {
-    name: "output_types"
+    name: "Toutput_types"
     type: "list(type)"
     has_minimum: true
     minimum: 1
@@ -21310,109 +21835,123 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "PreventGradient"
+  name: "PaddedBatchDatasetV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
   }
-  attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
   }
-}
-op {
-  name: "Print"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "drop_remainder"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "U"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
     type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "message"
-    type: "string"
+    name: "shapes"
+    type: "list(shape)"
     default_value {
-      s: ""
+      list {
+      }
     }
+    has_minimum: true
   }
   attr {
-    name: "first_n"
+    name: "capacity"
     type: "int"
     default_value {
       i: -1
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 3
+      s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "PrintV2"
-  input_arg {
-    name: "input"
-    type: DT_STRING
-  }
   attr {
-    name: "output_stream"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "stderr"
+      s: ""
     }
   }
   is_stateful: true
 }
 op {
-  name: "PriorityQueue"
+  name: "PaddingFIFOQueueV2"
   output_arg {
     name: "handle"
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    default_value {
-      list {
-      }
-    }
     has_minimum: true
+    minimum: 1
   }
   attr {
     name: "shapes"
     type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
   }
   attr {
@@ -21439,381 +21978,2664 @@ op {
   is_stateful: true
 }
 op {
-  name: "PriorityQueueV2"
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
   output_arg {
     name: "handle"
-    type: DT_RESOURCE
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  output_arg {
+    name: "feature_list_dense_lengths"
+    type: DT_INT64
+    number_attr: "Nfeature_list_dense"
+  }
+  attr {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "num_sparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+    explanation: "Placeholder now behaves the same as PlaceholderV2."
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Prelinearize"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "PrelinearizeTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+    explanation: "Replaced by QuantizeAndDequantizeV2"
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+}
+op {
+  name: "QuantizedBiasAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConcat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
+    name: "out_type"
+    type: "type"
     default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: ""
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "QuantizedConv2DAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "PyFunc"
-  input_arg {
-    name: "input"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "token"
+    name: "padding"
     type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "PyFuncStateless"
+  name: "QuantizedConv2DAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_list_attr: "Tin"
+    type_attr: "Tinput"
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
-  attr {
-    name: "token"
-    type: "string"
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Qr"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "q"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "r"
-    type_attr: "T"
+    name: "min_output"
+    type: DT_FLOAT
   }
-  attr {
-    name: "full_matrices"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "out_type"
+    type: "type"
     default_value {
-      i: 8
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      f: 0
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 22
-    explanation: "Replaced by QuantizeAndDequantizeV2"
-  }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedConv2DAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: "HALF_TO_EVEN"
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
-    allowed_values {
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        s: "HALF_TO_EVEN"
-        s: "HALF_UP"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedConv2DWithBias"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "bias"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      b: true
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: true
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "QuantizedConv2DWithBiasAndRelu"
   input_arg {
     name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
@@ -21821,11 +24643,11 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -21842,7 +24664,7 @@ op {
     }
   }
   attr {
-    name: "out_type"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -21854,36 +24676,12 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizeV2"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min_range"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
-  }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -21895,73 +24693,92 @@ op {
     }
   }
   attr {
-    name: "mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: "HALF_AWAY_FROM_ZERO"
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
-    allowed_values {
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
       }
     }
   }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -21974,7 +24791,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -21987,10 +24804,20 @@ op {
     }
   }
   attr {
-    name: "Toutput"
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -22002,13 +24829,54 @@ op {
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DWithBiasAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -22018,9 +24886,25 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -22031,7 +24915,7 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -22044,8 +24928,43 @@ op {
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
     name: "strides"
@@ -22061,79 +24980,87 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
   input_arg {
-    name: "t"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "m"
-    type_attr: "Tinput"
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
-    name: "m_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "m_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "v_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
+    name: "summand"
+    type_attr: "Tsummand"
   }
   input_arg {
-    name: "gamma_min"
+    name: "min_summand"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_summand"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -22149,9 +25076,48 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -22163,23 +25129,53 @@ op {
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasSumAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type: DT_FLOAT
   }
   input_arg {
     name: "min_input"
@@ -22190,11 +25186,15 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
     type: DT_FLOAT
   }
   output_arg {
@@ -22202,15 +25202,15 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -22223,7 +25223,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -22238,6 +25238,9 @@ op {
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -22248,53 +25251,43 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizedConcat"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
   }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -22303,6 +25296,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -22319,6 +25316,26 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -22357,11 +25374,34 @@ op {
       }
     }
   }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -22399,6 +25439,14 @@ op {
       }
     }
   }
+  attr {
+    name: "padding_list"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "QuantizedInstanceNorm"
@@ -22994,6 +26042,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "QueueClose"
@@ -24589,6 +27644,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -24641,6 +27715,7 @@ op {
       b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "ReduceJoin"
@@ -25135,6 +28210,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -25196,6 +28314,73 @@ op {
     }
   }
 }
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
 op {
   name: "Reshape"
   input_arg {
@@ -25305,6 +28490,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResizeBicubicGrad"
@@ -25337,6 +28529,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResizeBilinear"
@@ -25377,6 +28576,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResizeBilinearGrad"
@@ -25411,6 +28617,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResizeNearestNeighbor"
@@ -25450,6 +28663,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResizeNearestNeighborGrad"
@@ -25486,6 +28706,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "half_pixel_centers"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ResourceApplyAdaMax"
@@ -26651,6 +29878,13 @@ op {
     name: "output"
     type_attr: "dtype"
   }
+  attr {
+    name: "batch_dims"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
   attr {
     name: "validate_indices"
     type: "bool"
@@ -27769,11 +31003,81 @@ op {
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
@@ -27837,21 +31141,33 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -27907,241 +31223,843 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
+  name: "ResourceStridedSliceAssign"
   input_arg {
-    name: "var"
+    name: "ref"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "begin"
+    type_attr: "Index"
   }
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "end"
+    type_attr: "Index"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "strides"
+    type_attr: "Index"
   }
   input_arg {
-    name: "rho"
+    name: "value"
     type_attr: "T"
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "begin_mask"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "end_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "ellipsis_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "new_axis_mask"
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "shrink_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "ms"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -28831,6 +32749,13 @@ op {
       s: "lanczos3"
     }
   }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ScaleAndTranslateGrad"
@@ -28870,6 +32795,13 @@ op {
       s: "lanczos3"
     }
   }
+  attr {
+    name: "antialias"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ScatterAdd"
@@ -30076,6 +34008,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
@@ -30112,6 +34045,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -30168,6 +34102,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -30372,6 +34338,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -30487,6 +34484,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -34482,13 +38483,39 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
     }
   }
   attr {
@@ -34497,11 +38524,81 @@ op {
     default_value {
       type: DT_INT64
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulUniformFullInt"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulUniformInt"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
     }
   }
   is_stateful: true
@@ -35574,6 +39671,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -35678,40 +39776,333 @@ op {
       s: ""
     }
   }
-  deprecation {
-    version: 26
-    explanation: "Use TFRecordReaderV2"
+  deprecation {
+    version: 26
+    explanation: "Use TFRecordReaderV2"
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
   }
   is_stateful: true
 }
 op {
-  name: "TFRecordReaderV2"
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
   output_arg {
-    name: "reader_handle"
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
     type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "container"
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
     type: "string"
     default_value {
-      s: ""
+      s: "STEP_MARK_AT_ENTRY"
     }
   }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
   attr {
-    name: "compression_type"
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "TakeDataset"
@@ -37022,6 +41413,43 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcatV2"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "leading_dims"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
@@ -37082,6 +41510,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -37101,6 +41533,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -37127,6 +41563,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -37256,6 +41696,66 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListSetItem"
   input_arg {
@@ -37318,6 +41818,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -37848,6 +42352,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -39221,6 +43752,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index eff453241d47c55750b9662e13b8755e2d3a42b9..169076a6f673e4e23a874e6f369575f07fbd5168 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -26,7 +26,10 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("DecodeRaw")
     .Input("bytes: string")
     .Output("output: out_type")
-    .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}")
+    .Attr(
+        "out_type: "
+        "{half,float,double,int32,uint16,uint8,int16,int8,int64,complex64,"
+        "complex128}")
     .Attr("little_endian: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       // Note: last dimension is data dependent.
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 65bdde375bf07f8a43d682dd6ff58bc89ef80f68..0a972a66e672bb4b6df65e52ede748d6f2cf41c1 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -29,21 +29,20 @@ namespace tensorflow {
 
 namespace {
 
-Status ValidateVariableResourceHandle(InferenceContext* c,
-                                      ShapeAndType* shape_and_type) {
+Status ValidateVariableResourceHandle(
+    InferenceContext* c, std::vector<ShapeAndType>* shape_and_type) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
-    shape_and_type->shape = c->UnknownShape();
-    shape_and_type->dtype = DT_INVALID;
+    shape_and_type->emplace_back(c->UnknownShape(), DT_INVALID);
   } else {
-    *shape_and_type = (*handle_data)[0];
+    *shape_and_type = *handle_data;
     DataType value_dtype;
     TF_RETURN_IF_ERROR(c->GetAttr("dtype", &value_dtype));
-    if (shape_and_type->dtype != value_dtype) {
+    if (shape_and_type->at(0).dtype != value_dtype) {
       return errors::InvalidArgument(
           "Trying to read variable with wrong dtype. "
           "Expected ",
-          DataTypeString(shape_and_type->dtype), " got ",
+          DataTypeString(shape_and_type->at(0).dtype), " got ",
           DataTypeString(value_dtype));
     }
   }
@@ -51,9 +50,15 @@ Status ValidateVariableResourceHandle(InferenceContext* c,
 }
 
 Status ReadVariableShapeFn(InferenceContext* c) {
-  ShapeAndType shape_and_type;
+  std::vector<ShapeAndType> shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &shape_and_type));
-  c->set_output(0, shape_and_type.shape);
+  c->set_output(0, shape_and_type[0].shape);
+  if (shape_and_type[0].dtype == DT_VARIANT && shape_and_type.size() > 1) {
+    std::vector<ShapeAndType> variant_shape_and_type;
+    std::copy(shape_and_type.begin() + 1, shape_and_type.end(),
+              std::back_inserter(variant_shape_and_type));
+    c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+  }
   return Status::OK();
 }
 
@@ -180,13 +185,27 @@ REGISTER_OP("DestroyResourceOp")
     .SetShapeFn(shape_inference::NoOutputs);
 
 Status CreateAssignShapeFn(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
   ShapeHandle value_shape = c->input(1);
   ShapeHandle unused;
   TF_RETURN_IF_ERROR(
-      c->Merge(handle_shape_and_type.shape, value_shape, &unused));
+      c->Merge(handle_shape_and_type[0].shape, value_shape, &unused));
+
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(1) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(1);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
@@ -235,34 +254,68 @@ REGISTER_OP("VariableShape")
 REGISTER_OP("ResourceGather")
     .Input("resource: resource")
     .Input("indices: Tindices")
+    .Attr("batch_dims: int = 0")
     .Attr("validate_indices: bool = true")
     .Output("output: dtype")
     .Attr("dtype: type")
     .Attr("Tindices: {int32,int64}")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
+      std::vector<ShapeAndType> handle_shape_and_type;
       TF_RETURN_IF_ERROR(
           ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
+      ShapeHandle indices_shape = c->input(1);
+
       ShapeHandle unused;
+      int32 batch_dims;
+      TF_RETURN_IF_ERROR(c->GetAttr("batch_dims", &batch_dims));
+      if (batch_dims < 0)
+        return errors::InvalidArgument("batch_dims is negative (", batch_dims,
+                                       ")");
+
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(handle_shape_and_type[0].shape,
+                                            batch_dims + 1, &unused));
+
       TF_RETURN_IF_ERROR(
-          c->WithRankAtLeast(handle_shape_and_type.shape, 1, &unused));
-      ShapeHandle params_subshape;
+          c->WithRankAtLeast(indices_shape, batch_dims, &unused));
+
+      ShapeHandle params_subshape1;
+      TF_RETURN_IF_ERROR(c->Subshape(handle_shape_and_type[0].shape, 0,
+                                     batch_dims, &params_subshape1));
+
+      ShapeHandle params_subshape2;
+      TF_RETURN_IF_ERROR(c->Subshape(handle_shape_and_type[0].shape,
+                                     batch_dims + 1, &params_subshape2));
+
+      ShapeHandle indices_subshape;
       TF_RETURN_IF_ERROR(
-          c->Subshape(handle_shape_and_type.shape, 1, &params_subshape));
-      ShapeHandle indices_shape = c->input(1);
+          c->Subshape(indices_shape, batch_dims, &indices_subshape));
+
+      // The out shape is params_shape[:batch_dims] +
+      // indices_shape[batch_dims:] + params_shape[batch_dims+1:].
       ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out));
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(params_subshape1, indices_subshape, &out));
+      TF_RETURN_IF_ERROR(c->Concatenate(out, params_subshape2, &out));
+
       c->set_output(0, out);
+      if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+          !handle_shape_and_type.empty()) {
+        std::vector<ShapeAndType> variant_shape_and_type;
+        std::copy(handle_shape_and_type.begin() + 1,
+                  handle_shape_and_type.end(),
+                  std::back_inserter(variant_shape_and_type));
+        c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+      }
       return Status::OK();
     });
 
 namespace {
 
 Status ResourceScatterUpdateShape(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
-  ShapeHandle var_shape = handle_shape_and_type.shape;
+  ShapeHandle var_shape = handle_shape_and_type[0].shape;
   ShapeHandle indices_shape = c->input(1);
 
   ShapeHandle unused_updates_shape;
@@ -274,6 +327,19 @@ Status ResourceScatterUpdateShape(InferenceContext* c) {
       InferenceContext::Rank(c->input(2)) == 0
           ? Status::OK()
           : c->Merge(c->input(2), concat, &unused_updates_shape));
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(2) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(2);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/ops/sendrecv_ops.cc b/tensorflow/core/ops/sendrecv_ops.cc
index 7d0fda2f87fc14804486a0edcd35f221c1045917..e84a4796c1f14d79e539b5c16f7e6e6b89421abd 100644
--- a/tensorflow/core/ops/sendrecv_ops.cc
+++ b/tensorflow/core/ops/sendrecv_ops.cc
@@ -106,8 +106,8 @@ REGISTER_OP("_HostRecv")
     .Doc(R"doc(
 Receives the named tensor from send_device on recv_device.
 
-_HostRecv requires its input on host memory whereas _Recv requires its
-input on device memory.
+_HostRecv produces its output on host memory whereas _Recv produces its
+output on device memory.
 
 tensor: The tensor to receive.
 tensor_name: The name of the tensor to receive.
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index de08a1078458c236520924f52450fa8b4dc6f18a..85186c4a2d88503e16fa5e804a38a321052643d0 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -97,7 +97,6 @@ REGISTER_OP("SparseTensorDenseMatMul")
       ShapeHandle unused;
       ShapeHandle b;
       ShapeHandle a_shape;
-      ShapeHandle a_shape_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));  // a_indices
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));  // a_values
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &a_shape));
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
index 4c9277eda557bedf8f7e9e4368892ee46810d768..643b3e902819d4c4b802d4333f5a2a27bb7d303b 100644
--- a/tensorflow/core/ops/stateful_random_ops.cc
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -19,18 +19,83 @@ limitations under the License.
 namespace tensorflow {
 
 Status StatefulRandomShape(shape_inference::InferenceContext* c) {
-  shape_inference::ShapeHandle out;
-  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+  using shape_inference::ShapeHandle;
+  // Check algorithm shape
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+  // Set output shape
+  ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &out));
   c->set_output(0, out);
   return Status::OK();
 }
 
+#define REGISTER_STATEFUL_OP(name, default_dtype) \
+  REGISTER_OP(name)                               \
+      .Input("resource: resource")                \
+      .Input("algorithm: int64")                  \
+      .Input("shape: shape_dtype")                \
+      .Output("output: dtype")                    \
+      .Attr("dtype : type = " #default_dtype)     \
+      .Attr("shape_dtype : type = DT_INT64")      \
+      .SetShapeFn(StatefulRandomShape);
+
+REGISTER_STATEFUL_OP("StatefulUniformFullInt", DT_UINT64);
+REGISTER_STATEFUL_OP("StatefulStandardNormalV2", DT_FLOAT);
+
+REGISTER_OP("StatefulUniformInt")
+    .Input("resource: resource")
+    .Input("algorithm: int64")
+    .Input("shape: shape_dtype")
+    .Input("minval: dtype")
+    .Input("maxval: dtype")
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_INT64")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      // Check inputs
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      // Set output
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
+REGISTER_OP("NonDeterministicInts")
+    .Input("shape: shape_dtype")
+    .SetIsStateful()
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_INT64")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
+// Register the depracated 'StatefulStandardNormal' op. This op is a short-lived
+// version where the 'resource' variable also contains the algorithm tag.
+// It is deprecated in favor of 'StatefulStandardNormalV2'.
 REGISTER_OP("StatefulStandardNormal")
     .Input("resource: resource")
     .Input("shape: shape_dtype")
     .Output("output: dtype")
-    .Attr("dtype: {half,bfloat16,float,double} = DT_FLOAT")
-    .Attr("shape_dtype: {int32, int64} = DT_INT64")
-    .SetShapeFn(StatefulRandomShape);
+    .Attr("dtype : type = DT_FLOAT")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      // Set output shape
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/core/ops/tpu_configuration_ops.cc
similarity index 92%
rename from tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
rename to tensorflow/core/ops/tpu_configuration_ops.cc
index d5600eef4a9dc69fcfd931a083f86d7941ba8fb4..febb25096fdbfa006a5353c9719c1e7ce1852504 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/core/ops/tpu_configuration_ops.cc
@@ -193,25 +193,10 @@ REGISTER_OP("ConfigureDistributedTPU")
     .Attr("tpu_embedding_config: string = ''")
     .Attr("is_global_init: bool = false")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that sets up the centralized structures for a distributed TPU
-system.
-
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-tpu_embedding_config: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
-describes the embedding lookups of the program.
-embedding_config: Reserved. Do not use.
-is_global_init: Reserved. Do not use.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("ShutdownDistributedTPU")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that shuts down a running distributed TPU system. The Op returns
-an error if no system is running.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/core/ops/tpu_cross_replica_ops.cc
similarity index 53%
rename from tensorflow/contrib/tpu/ops/cross_replica_ops.cc
rename to tensorflow/core/ops/tpu_cross_replica_ops.cc
index 87e3a5946c20be8e2c7a24e198d1fb94335a6b86..c26b49eb34b116b5bab5aa1e0154724318c3dbb9 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/core/ops/tpu_cross_replica_ops.cc
@@ -26,7 +26,7 @@ REGISTER_OP("AllToAll")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {numbertype, bool}")
     .Attr("concat_dimension: int")
     .Attr("split_dimension: int")
     .Attr("split_count: int")
@@ -70,79 +70,19 @@ REGISTER_OP("AllToAll")
 
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An Op to exchange data across TPU replicas. On each replica, the input is
-split into `split_count` blocks along `split_dimension` and send to the other
-replicas given group_assignment. After receiving `split_count` - 1 blocks from
-other replicas, we concatenate the blocks along `concat_dimension` as the
-output.
-
-For example, suppose there are 2 TPU replicas:
-replica 0 receives input: `[[A, B]]`
-replica 1 receives input: `[[C, D]]`
-
-group_assignment=`[[0, 1]]`
-concat_dimension=0
-split_dimension=1
-split_count=2
-
-replica 0's output: `[[A], [C]]`
-replica 1's output: `[[B], [D]]`
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-concat_dimension: The dimension number to concatenate.
-split_dimension: The dimension number to split.
-split_count: The number of splits, this number must equal to the sub-group
-  size(group_assignment.get_shape()[1])
-output: The exchanged result.
-T: The type of elements to be exchanged.
-)doc");
+    });
 
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to sum inputs across replicated TPU instances. Each instance supplies its
-own input.
-
-For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
-Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
-and `B, D, F, H` as group 1. Thus we get the outputs:
-`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-output: The sum of all the distributed inputs.
-T: The type of elements to be summed.
-)doc");
+    .Attr("T: {bfloat16, float, int32, uint32}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("CollectivePermute")
     .Input("input: T")
     .Input("source_target_pairs: int32")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to permute tensors across replicated TPU instances. Each instance
-supplies its own input.
-
-For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
-`[D, A, B, C]`.
-
-input: The local input to be permuted. Currently only supports float and
-  bfloat16.
-source_target_pairs: A tensor with shape [num_pairs, 2].
-output: The permuted input.
-T: The type of elements to be exchanged.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/core/ops/tpu_embedding_ops.cc
similarity index 62%
rename from tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
rename to tensorflow/core/ops/tpu_embedding_ops.cc
index 676aed0b7b651494eda80ff2d7c7c31097529590..4eaab1e6c72f23f989ffbe840da20652bb4eb69f 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/core/ops/tpu_embedding_ops.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -23,6 +20,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 
 namespace tensorflow {
 
@@ -96,10 +96,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_input_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Value of ", parameter.name(), " used in the ",
-                          GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -127,7 +123,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Load embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& parameter : state_variable_specs) {
     if (parameter.has_user_defined() || is_debug_op) {
@@ -139,21 +134,6 @@ lookups using the %s optimization algorithm.)",
                        GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that loads optimization parameters into HBM for embedding. Must be
-preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-embedding table configuration. For example, this op is used to install
-parameters that are loaded from a checkpoint before a training loop is
-executed.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -233,10 +213,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_output_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Parameter ", parameter.name(), " updated by the ",
-                          tpu::GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -264,7 +240,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Retrieve embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& param : state_variable_specs) {
     if (param.has_user_defined() || is_debug_op) {
@@ -276,20 +251,6 @@ parameters from embedding updates using the %s optimization algorithm.)",
                        tpu::GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that retrieves optimization parameters from embedding to host
-memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-the correct embedding table configuration. For example, this op is
-used to retrieve updated parameters before saving a checkpoint.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -388,23 +349,7 @@ REGISTER_OP("RecvTPUEmbeddingActivations")
         c->set_output(i, output_shape);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that receives embedding activations on the TPU.
-
-The TPU system performs the embedding lookups and aggregations specified by
-the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
-results of these aggregations are visible to the Tensorflow Graph as the
-outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
-one Tensor of activations per table specified in the model. There can be at
-most one RecvTPUEmbeddingActivations op in the TPU graph.
-
-outputs: A TensorList of embedding activations containing one Tensor per
-    embedding table in the model.
-num_outputs: The number of output activation tensors, equal to the number of
-    embedding tables in the model.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("TPUEmbeddingActivations")
     .Input("embedding_variable: float32")
@@ -415,23 +360,7 @@ REGISTER_OP("TPUEmbeddingActivations")
     .SetShapeFn([](shape_inference::InferenceContext *c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op enabling differentiation of TPU Embeddings.
-
-This op simply returns its first input, which is assumed to have been sliced
-from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of this
-op, and its first argument being a trainable Variable, enables automatic
-differentiation of graphs containing embeddings via the TPU Embedding Python
-libraries.
-
-embedding_variable: A trainable variable, enabling optimizers to find this op.
-sliced_activations: The embedding activations Tensor to return.
-table_id: The id of the table in the embedding layer configuration from which
-    these activations were computed.
-lookup_id: Identifier of the set of embedding indices which produced these
-    activations.
-)doc");
+    });
 
 REGISTER_OP("SendTPUEmbeddingGradients")
     .Input("inputs: N * float32")
@@ -453,25 +382,7 @@ REGISTER_OP("SendTPUEmbeddingGradients")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that performs gradient updates of embedding tables using the specified
-learning rates.
-
-inputs: A TensorList of gradients with which to update embedding tables.
-    This argument has the same length and shapes as the return value of
-    RecvTPUEmbeddingActivations, but contains gradients of the model's loss
-    with respect to the embedding activations. The embedding tables are updated
-    from these gradients via the optimizer specified in the TPU embedding
-    configuration given to tpu.initialize_system.
-learning_rates: A TensorList of float32 scalars, one for each dynamic learning
-    rate tag: see the comments in
-    //third_party/tensorflow/contrib/tpu/proto/optimization_parameters.proto.
-    Multiple tables can share the same dynamic learning rate tag as specified
-    in the configuration. If the learning rates for all tables are constant,
-    this list should be empty.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Input("batch: N * int32")
@@ -479,25 +390,16 @@ REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that enqueues a list of input batch tensors to TPUEmbedding.
-
-batch: A list of 1D tensors, one for each embedding table, containing the
-    indices into the tables.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
-    .Input("sample_indices: N * int32")
-    .Input("embedding_indices: N * int32")
-    .Input("aggregation_weights: N * float32")
+    .Input("sample_indices: N * T1")
+    .Input("embedding_indices: N * T2")
+    .Input("aggregation_weights: N * T3")
     .Input("mode_override: string")
+    .Attr("T1: {int32,int64} = DT_INT32")
+    .Attr("T2: {int32,int64} = DT_INT32")
+    .Attr("T3: {float32,float64} = DT_FLOAT")
     .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .Attr("combiners: list(string) = []")
@@ -514,90 +416,21 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that enqueues TPUEmbedding input indices from a SparseTensor.
-
-This Op eases the porting of code that uses embedding_lookup_sparse(),
-although some Python preprocessing of the SparseTensor arguments to
-embedding_lookup_sparse() is required to produce the arguments to this Op,
-since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
-step.
-
-The tensors at corresponding positions in the three input lists
-must have the same shape, i.e. rank 1 with dim_size() equal to the total
-number of lookups into the table described by the corresponding table_id.
-
-sample_indices: A list of rank 1 Tensors specifying the training example and
-    feature to which the corresponding embedding_indices and aggregation_weights
-    values belong. sample_indices[i] must equal b * nf + f, where nf is the
-    number of features from the corresponding table, f is in [0, nf), and
-    b is in [0, batch size).
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
-    (training example, feature) -- aggregation weights.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
-    .Input("sample_indices: N * int32")
-    .Input("embedding_indices: N * int32")
-    .Input("aggregation_weights: N * float32")
+    .Input("sample_indices: N * T1")
+    .Input("embedding_indices: N * T2")
+    .Input("aggregation_weights: N * T3")
     .Input("mode_override: string")
+    .Attr("T1: {int32,int64} = DT_INT32")
+    .Attr("T2: {int32,int64} = DT_INT32")
+    .Attr("T3: {float32,float64} = DT_FLOAT")
     .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .Attr("combiners: list(string) = []")
     .Attr("table_ids: list(int)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-This Op eases the porting of code that uses tf.nn.embedding_lookup_sparse().
-
-sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
-to the ith feature. table_ids[i] indicates which embedding table to look up ith
-feature.
-
-The tensors at corresponding positions in the three input lists (sample_indices,
-embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
-with dim_size() equal to the total number of lookups into the table described by
-the corresponding feature.
-
-sample_indices: A list of rank 1 Tensors specifying the training example to
-    which the corresponding embedding_indices and aggregation_weights values
-    belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-    It corresponds to sp_ids.values in embedding_lookup_sparse().
-aggregation_weights: A list of rank 1 Tensors containing per training example
-    aggregation weights. It corresponds to sp_weights.values in
-    embedding_lookup_sparse().
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-table_ids: A list of integers specifying the identifier of the embedding table
-    (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
-    corresponding input. The ith input is looked up using table_ids[i]. The size
-    of the table_ids list must be equal to that of sample_indices,
-    embedding_indices and aggregation_weights.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/functional_ops.cc b/tensorflow/core/ops/tpu_functional_ops.cc
similarity index 100%
rename from tensorflow/contrib/tpu/ops/functional_ops.cc
rename to tensorflow/core/ops/tpu_functional_ops.cc
diff --git a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc b/tensorflow/core/ops/tpu_heartbeat_ops.cc
similarity index 72%
rename from tensorflow/contrib/tpu/ops/heartbeat_ops.cc
rename to tensorflow/core/ops/tpu_heartbeat_ops.cc
index ca0f5bc0e562cd9e27b4c456b53fb9f51f1cb1f8..660aa32c8278b27b307e229d427935f36e81e5f5 100644
--- a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc
+++ b/tensorflow/core/ops/tpu_heartbeat_ops.cc
@@ -23,15 +23,6 @@ REGISTER_OP("WorkerHeartbeat")
     .Input("request: string")
     .Output("response: string")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Worker heartbeat op.
-
-Heartbeats may be sent periodically to indicate the coordinator is still active,
-to retrieve the current worker status and to expedite shutdown when necessary.
-
-request: A string tensor containing a serialized WorkerHeartbeatRequest
-response: A string tensor containing a serialized WorkerHeartbeatResponse
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/host_compute_ops.cc b/tensorflow/core/ops/tpu_host_compute_ops.cc
similarity index 100%
rename from tensorflow/contrib/tpu/ops/host_compute_ops.cc
rename to tensorflow/core/ops/tpu_host_compute_ops.cc
diff --git a/tensorflow/contrib/tpu/ops/infeed_ops.cc b/tensorflow/core/ops/tpu_infeed_ops.cc
similarity index 51%
rename from tensorflow/contrib/tpu/ops/infeed_ops.cc
rename to tensorflow/core/ops/tpu_infeed_ops.cc
index 2ed16c2a2270a5399059d7e07f5903e11098bbf9..2cab6f7f976e99bb9b1b42f6eb8a6aa3fe6b3539 100644
--- a/tensorflow/contrib/tpu/ops/infeed_ops.cc
+++ b/tensorflow/core/ops/tpu_infeed_ops.cc
@@ -27,14 +27,7 @@ REGISTER_OP("InfeedDequeue")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-A placeholder op for a value that will be fed into the computation.
-
-output: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("InfeedEnqueue")
     .Input("input: dtype")
@@ -43,20 +36,7 @@ REGISTER_OP("InfeedEnqueue")
     .Attr("layout: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds a single Tensor value into the computation.
-
-input: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-layout: A vector holding the requested layout in minor-to-major sequence.
-If a layout attribute is passed, but its values are all -1, the layout will
-be computed by the infeed operation.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedEnqueueTuple")
     .Input("inputs: dtypes")
@@ -65,21 +45,7 @@ REGISTER_OP("InfeedEnqueueTuple")
     .Attr("layouts: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds multiple Tensor values into the computation as an XLA tuple.
-
-inputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `inputs`.
-shapes: The shapes of each tensor in `inputs`.
-layouts: A vector holding the requested layout in minor-to-major sequence for
-all the tuple shapes, in the order the shapes appear in the "shapes" input.
-The layout elements for a sub-shape can be set to -1, in which case the
-corresponding layout will be computed by the infeed operation.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -95,14 +61,27 @@ REGISTER_OP("InfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-A placeholder op for multiple values that will be fed into the computation
-simultaneously as an XLA tuple.
+    });
+
+REGISTER_OP("Prelinearize")
+    .Input("input: dtype")
+    .Attr("dtype: type")
+    .Attr("shape: shape = {}")
+    .Attr("layout: list(int) = []")
+    .Output("output: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("PrelinearizeTuple")
+    .Input("inputs: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .Attr("layouts: list(int) = []")
+    .Output("output: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
 
-outputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-)doc");
+REGISTER_OP("InfeedEnqueuePrelinearizedBuffer")
+    .Input("input: variant")
+    .Attr("device_ordinal: int = -1")
+    .SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
similarity index 80%
rename from tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
rename to tensorflow/core/ops/tpu_ordinal_selector_ops.cc
index 54e6b20f7f388b67a96ac8acfe814a4202b56a18..72f24c57dbb4be388264da3c15a1e4fa0de9eb1c 100644
--- a/tensorflow/contrib/tpu/ops/tpu_ordinal_selector_op.cc
+++ b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
@@ -25,15 +25,6 @@ REGISTER_OP("TPUOrdinalSelector")
       c->set_output(0,
                     c->Vector(shape_inference::InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-A TPU core selector Op.
-
-This Op produces a set of TPU cores (for warm-up) or a single TPU core
-(for regular inference) to execute the TPU program on. The output is
-consumed by TPUPartitionedCall.
-
-device_ordinals: A vector 1 or more TPU cores.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/core/ops/tpu_outfeed_ops.cc
similarity index 59%
rename from tensorflow/contrib/tpu/ops/outfeed_ops.cc
rename to tensorflow/core/ops/tpu_outfeed_ops.cc
index b05c76ca64fbaedc205ab06cc31616787ccc84b8..e170ed05a0cd283f086bd75ac28375f3afa15bae 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/core/ops/tpu_outfeed_ops.cc
@@ -26,24 +26,13 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits a single Tensor value from an XLA computation.
-
-input: A tensor that will be inserted into the outfeed queue.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits multiple Tensor values from an XLA computation.
-
-inputs: A list of tensors that will be inserted into the outfeed queue as an
-XLA tuple.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedDequeue")
     .Output("output: dtype")
@@ -51,18 +40,7 @@ REGISTER_OP("OutfeedDequeue")
     .Attr("shape: shape")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Retrieves a single tensor from the computation outfeed.  This operation will
-block indefinitely until data is available.
-
-output: A tensor that will be read from the device outfeed.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("OutfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -85,18 +63,6 @@ REGISTER_OP("OutfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Retrieve multiple values that will be emitted by the computation as an XLA
-tuple.  This operations will block indefinitely until data is available.
-Output `i` corresponds to XLA tuple element `i`.
-
-outputs: A list of tensors that will be read from the outfeed.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/core/ops/tpu_replication_ops.cc
similarity index 68%
rename from tensorflow/contrib/tpu/ops/replication_ops.cc
rename to tensorflow/core/ops/tpu_replication_ops.cc
index 285e11d92de7a684ed87974414ec73c274cc7aa5..7c8949260053a6ca7fd02449d9934a02d6d227ea 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/core/ops/tpu_replication_ops.cc
@@ -31,6 +31,8 @@ REGISTER_OP("TPUReplicateMetadata")
     // Deprecated. Use num_cores_per_replica instead.
     .Attr("computation_shape: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
+    .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -68,10 +70,7 @@ REGISTER_OP("TPUReplicatedInput")
         }
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects N unreplicated inputs to an N-way "
-        "replicated TPU computation.");
+    });
 
 REGISTER_OP("TPUReplicatedOutput")
     .Input("input: T")
@@ -83,10 +82,7 @@ REGISTER_OP("TPUReplicatedOutput")
         c->set_output(i, c->input(0));
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects the output of an N-way replicated TPU "
-        "computation to N separate outputs.");
+    });
 
 REGISTER_OP("TPUCompilationResult")
     .Output("output: string")
@@ -105,40 +101,13 @@ REGISTER_OP("TPUReplicate")
     .Attr("NumVariables: int >= 0")
     .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
+    .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
     .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Runs replicated computations on a distributed TPU system.
-
-computation: a function containing the computation to run.
-num_replicas: the number of replicas of the computation to run.
-num_cores_per_replica: the number of logical cores in each replica.
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-use_tpu: a bool indicating if this computation will run on TPU or CPU/GPU.
-Currently, only supports a default placement (computation is placed on GPU
-if one is available, and on CPU if not).
-device_assignment: a flattened array with shape
-  [replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
-  of logical cores in each replica of a computation to physical coordinates in
-  the TPU topology.
-Tinputs: the types of the arguments to 'computation'.
-inputs: the inputs to 'computation', flattened, in replica-major order.
-Tbroadcast_inputs: the types of the additional arguments to broadcast to all
-  replicas.
-Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
-broadcast_inputs: additional arguments to broadcast to all replicas. The
-  broadcast inputs are appended to the per-replica inputs when calling
-  computation.
-guaranteed_constants: arguments which have been guaranteed to not
-change their values during the session lifetime. These contain tensors marked as
-constant using the GuaranteeConstOp.
-output_types: the types of the outputs of 'computation'.
-outputs: the outputs of 'computation'.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 647a797b82cf30384f7f48611788a62a952d5627..40a808b661cbff48f1c4198bcfca5a2261292a25 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -151,6 +151,7 @@ cc_library(
         ":retrying_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
         "@jsoncpp_git//:jsoncpp",
     ],
 )
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index d1e5779f023d205bb9595e7dbd322eb9e7e73fe6..10b57df183d370966338a1c2e6a6ab42aed9b75c 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -64,7 +64,7 @@ constexpr int kGetChildrenDefaultPageSize = 1000;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
-ABSL_DEPRECATED("Use GCS_BLOCK_SIZE_MB instead.")
+ABSL_DEPRECATED("Use GCS_READ_CACHE_BLOCK_SIZE_MB instead.")
 constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
 // The environment variable that disables the GCS block cache for reads.
 // This is the explicit alternative to setting BLOCK_SIZE or MAX_SIZE to 0, and
@@ -73,11 +73,11 @@ constexpr char kReadCacheDisabled[] = "GCS_READ_CACHE_DISABLED";
 // The environment variable that overrides the block size for aligned reads from
 // GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes).
 constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB";
-constexpr size_t kDefaultBlockSize = 128 * 1024 * 1024;
+constexpr size_t kDefaultBlockSize = 16 * 1024 * 1024;
 // The environment variable that overrides the max size of the LRU cache of
 // blocks read from GCS. Specified in MB.
 constexpr char kMaxCacheSize[] = "GCS_READ_CACHE_MAX_SIZE_MB";
-constexpr size_t kDefaultMaxCacheSize = 2 * kDefaultBlockSize;
+constexpr size_t kDefaultMaxCacheSize = kDefaultBlockSize;
 // The environment variable that overrides the maximum staleness of cached file
 // contents. Once any block of a file reaches this staleness, all cached blocks
 // will be evicted on the next read.
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index dc9eb7796f76aa8eeb5137bd311f4a99940f1388..a998f8e3adf203ad3aad35ab0d3bf4e77fe75936 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -3155,8 +3155,8 @@ TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
 TEST(GcsFileSystemTest, OverrideCacheParameters) {
   // Verify defaults are propagated correctly.
   GcsFileSystem fs1;
-  EXPECT_EQ(128 * 1024 * 1024, fs1.block_size());
-  EXPECT_EQ(2 * fs1.block_size(), fs1.max_bytes());
+  EXPECT_EQ(16 * 1024 * 1024, fs1.block_size());
+  EXPECT_EQ(fs1.block_size(), fs1.max_bytes());
   EXPECT_EQ(0, fs1.max_staleness());
   EXPECT_EQ(120, fs1.timeouts().connect);
   EXPECT_EQ(60, fs1.timeouts().idle);
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index e15400780af0880caadd2f79b7322f39e406ca2b..e91a9f89757ae6f0009ea20120cd98ab25cd1437 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #endif
 #include <fstream>
 #include <utility>
+#include "absl/strings/match.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -44,6 +45,11 @@ constexpr char kGoogleAuthTokenForTesting[] = "GOOGLE_AUTH_TOKEN_FOR_TESTING";
 // The environment variable which can override '~/.config/gcloud' if set.
 constexpr char kCloudSdkConfig[] = "CLOUDSDK_CONFIG";
 
+// The environment variable used to skip attempting to fetch GCE credentials:
+// setting this to 'true' (case insensitive) will skip attempting to contact
+// the GCE metadata service.
+constexpr char kNoGceCheck[] = "NO_GCE_CHECK";
+
 // The default path to the gcloud config folder, relative to the home folder.
 constexpr char kGCloudConfigFolder[] = ".config/gcloud/";
 
@@ -146,10 +152,25 @@ Status GoogleAuthProvider::GetToken(string* t) {
   }
 
   auto token_from_files_status = GetTokenFromFiles();
-  auto token_from_gce_status =
-      token_from_files_status.ok() ? Status::OK() : GetTokenFromGce();
+  if (token_from_files_status.ok()) {
+    *t = current_token_;
+    return Status::OK();
+  }
+
+  char* no_gce_check_var = std::getenv(kNoGceCheck);
+  bool skip_gce_check = no_gce_check_var != nullptr &&
+                        absl::EqualsIgnoreCase(no_gce_check_var, "true");
+  Status token_from_gce_status;
+  if (skip_gce_check) {
+    token_from_gce_status =
+        Status(error::CANCELLED,
+               strings::StrCat("GCE check skipped due to presence of $",
+                               kNoGceCheck, " environment variable."));
+  } else {
+    token_from_gce_status = GetTokenFromGce();
+  }
 
-  if (token_from_files_status.ok() || token_from_gce_status.ok()) {
+  if (token_from_gce_status.ok()) {
     *t = current_token_;
     return Status::OK();
   }
@@ -165,8 +186,13 @@ Status GoogleAuthProvider::GetToken(string* t) {
   // so return an empty token instead of failing.
   *t = "";
 
-  // From now on, always return the empty token.
-  expiration_timestamp_sec_ = UINT64_MAX;
+  // We only want to keep returning our empty token if we've tried and failed
+  // the (potentially slow) task of detecting GCE.
+  if (skip_gce_check) {
+    expiration_timestamp_sec_ = 0;
+  } else {
+    expiration_timestamp_sec_ = UINT64_MAX;
+  }
   current_token_ = "";
 
   return Status::OK();
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.h b/tensorflow/core/platform/cloud/google_auth_provider.h
index 3755b124a87fd0003e5a6343b1a07130f5519dd6..4ab816d54c61e99dea1e2db59d4815f5012d5adc 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.h
+++ b/tensorflow/core/platform/cloud/google_auth_provider.h
@@ -51,7 +51,7 @@ class GoogleAuthProvider : public AuthProvider {
   /// Gets the bearer token from Google Compute Engine environment.
   Status GetTokenFromGce() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  /// Gets the bearer token from the systen env variable, for testing purposes.
+  /// Gets the bearer token from the system env variable, for testing purposes.
   Status GetTokenForTesting() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   std::unique_ptr<OAuthClient> oauth_client_;
diff --git a/tensorflow/core/platform/cloud/google_auth_provider_test.cc b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
index ec31c5ee8c11645cf9f8a5659538b46d56ce84ca..d2db59200abb4fd5db5a86d9a653729aa518ee63 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider_test.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
@@ -69,9 +69,10 @@ class GoogleAuthProviderTest : public ::testing::Test {
   void TearDown() override { ClearEnvVars(); }
 
   void ClearEnvVars() {
-    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("CLOUDSDK_CONFIG");
+    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("GOOGLE_AUTH_TOKEN_FOR_TESTING");
+    unsetenv("NO_GCE_CHECK");
   }
 };
 
@@ -238,4 +239,31 @@ TEST_F(GoogleAuthProviderTest, NothingAvailable) {
   EXPECT_EQ("", token);
 }
 
+TEST_F(GoogleAuthProviderTest, NoGceCheckEnvironmentVariable) {
+  setenv("NO_GCE_CHECK", "True", 1);
+  auto oauth_client = new FakeOAuthClient;
+
+  FakeEnv env;
+  // If the env var above isn't respected, attempting to fetch a token
+  // from GCE will segfault (as the metadata client is null).
+  GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
+                              nullptr, &env);
+
+  string token;
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We confirm that our env var is case insensitive.
+  setenv("NO_GCE_CHECK", "true", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We also want to confirm that our empty token has a short expiration set: we
+  // now set a testing token, and confirm that it's returned instead of our
+  // empty token.
+  setenv("GOOGLE_AUTH_TOKEN_FOR_TESTING", "newToken", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("newToken", token);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 2efe0c0876e871f6752bb3e7724de4c505102130..38fc453008fcc9b4d59e44591c42ad83df061e70 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -138,8 +138,16 @@ void InfoAboutUnusedCPUFeatures() {
 #endif  // __FMA__
 #endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
+#ifndef INTEL_MKL
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
+#else
+      LOG(INFO) << "This TensorFlow binary is optimized with Intel(R) MKL-DNN "
+                << "to use the following CPU instructions in performance "
+                << "critical operations: " << missing_instructions << std::endl
+                << "To enable them in non-MKL-DNN operations, rebuild "
+                << "TensorFlow with the appropriate compiler flags.";
+#endif
     }
   });
 }
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e26828c75e4476089c239fbdb3f03cf6c9fb6b11..bd35e64ef4796c177412ae5b21700d24636579b0 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl_ml",
@@ -529,19 +530,13 @@ def tf_additional_proto_hdrs():
     return [
         "platform/default/integral_types.h",
         "platform/default/logging.h",
-        "platform/default/protobuf.h",
     ] + if_windows([
         "platform/windows/integral_types.h",
     ])
 
-def tf_additional_proto_compiler_hdrs():
-    return [
-        "platform/default/protobuf_compiler.h",
-    ]
-
 def tf_additional_proto_srcs():
     return [
-        "platform/default/protobuf.cc",
+        "platform/protobuf.cc",
     ]
 
 def tf_additional_human_readable_json_deps():
@@ -551,7 +546,11 @@ def tf_additional_all_protos():
     return ["//tensorflow/core:protos_all"]
 
 def tf_protos_all_impl():
-    return ["//tensorflow/core:protos_all_cc_impl"]
+    return [
+        "//tensorflow/core:autotuning_proto_cc_impl",
+        "//tensorflow/core:conv_autotuning_proto_cc_impl",
+        "//tensorflow/core:protos_all_cc_impl",
+    ]
 
 def tf_protos_all():
     return if_static(
@@ -559,6 +558,15 @@ def tf_protos_all():
         otherwise = ["//tensorflow/core:protos_all_cc"],
     )
 
+def tf_profiler_all_protos():
+    return ["//tensorflow/core/profiler:protos_all"]
+
+def tf_grpc_service_all():
+    return [
+        "//tensorflow/core/profiler:profiler_analysis_proto_cc",
+        "//tensorflow/core/profiler:profiler_service_proto_cc",
+    ]
+
 def tf_protos_grappler_impl():
     return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
 
@@ -569,7 +577,14 @@ def tf_protos_grappler():
     )
 
 def tf_additional_cupti_wrapper_deps():
-    return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
+    return [
+        "//tensorflow/core/platform/default/gpu:cupti_wrapper",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ]
 
 def tf_additional_device_tracer_srcs():
     return ["platform/default/device_tracer.cc"]
@@ -578,7 +593,10 @@ def tf_additional_device_tracer_cuda_deps():
     return []
 
 def tf_additional_device_tracer_deps():
-    return []
+    return [
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/profiler/internal/cpu:host_tracer",
+    ]
 
 def tf_additional_device_tracer_test_flags():
     return []
@@ -718,6 +736,12 @@ def tf_additional_gdr_lib_defines():
         "//conditions:default": [],
     })
 
+def tf_additional_numa_lib_defines():
+    return select({
+        "//tensorflow:with_numa_support": ["TENSORFLOW_USE_NUMA"],
+        "//conditions:default": [],
+    })
+
 def tf_py_clif_cc(name, visibility = None, **kwargs):
     pass
 
@@ -733,7 +757,11 @@ def tf_additional_binary_deps():
     return ["@nsync//:nsync_cpp"] + if_cuda(
         [
             "//tensorflow/stream_executor:cuda_platform",
-            "//tensorflow/core/platform/default/build_config:cuda",
+        ],
+    ) + if_rocm(
+        [
+            "//tensorflow/stream_executor:rocm_platform",
+            "//tensorflow/core/platform/default/build_config:rocm",
         ],
     ) + [
         # TODO(allenl): Split these out into their own shared objects (they are
@@ -746,3 +774,26 @@ def tf_additional_binary_deps():
             "//third_party/mkl:intel_binary_blob",
         ],
     )
+
+def tf_additional_numa_deps():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:macos": [],
+        "//conditions:default": [
+            "@hwloc",
+        ],
+    })
+
+def tf_additional_numa_copts():
+    return select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:macos": [],
+        "//conditions:default": [
+            "-Ithird_party/hwloc/hwloc-master/include",
+            "-DTENSORFLOW_USE_NUMA",
+        ],
+    })
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 664218d11bb75214ba6ed472b82167c08dbef5de..3ad58794823ce6dae40b7c008467a2af245867bc 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -7,7 +7,9 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "check_deps")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_rocm")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@@ -32,7 +34,10 @@ cc_library(
 
 tf_cuda_library(
     name = "stream_executor",
-    cuda_deps = ["//tensorflow/stream_executor/cuda:cuda_activation"],
+    cuda_deps = [
+        "//tensorflow/stream_executor/cuda:cuda_activation",
+        "//tensorflow/stream_executor/rocm:rocm_activation",
+    ],
     deps = [
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:dnn",
@@ -42,6 +47,7 @@ tf_cuda_library(
         "//tensorflow/stream_executor/cuda:cuda_platform_id",
         "//tensorflow/stream_executor/host:host_platform_id",
         "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ] + select({
         "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
@@ -50,6 +56,7 @@ tf_cuda_library(
         "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
         "//tensorflow:using_cuda_clang_with_dynamic_build": [],
         "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
         "//conditions:default": [],
     }),
 )
@@ -67,6 +74,18 @@ cc_library(
     }),
 )
 
+cc_library(
+    name = "stream_executor_rocm",
+    deps = [
+        ":stream_executor_no_cuda",
+        ":rocm",
+    ] + if_static(
+        ["//tensorflow/stream_executor/rocm:all_runtime"],
+    ) + select({
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "stream_executor_no_cuda",
     deps = [
@@ -79,6 +98,7 @@ cc_library(
         "//tensorflow/stream_executor/host:host_platform",
         "//tensorflow/stream_executor/host:host_platform_id",
         "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ],
 )
 
@@ -267,6 +287,31 @@ cc_library(
     ],
 )
 
+# Check that libtensorflow_framework.so does not depend on cuda shared libraries.
+check_deps(
+    name = "libtensorflow_cuda_check_deps",
+    disallowed_deps = [
+        ":cuda",
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cuda_driver",
+        "@local_config_cuda//cuda:cudnn",
+        "@local_config_cuda//cuda:curand",
+        "@local_config_cuda//cuda:cusolver",
+    ],
+    deps = ["//tensorflow:libtensorflow_framework.so"],
+)
+
+cc_library(
+    name = "rocm",
+    data = [],
+    linkopts = select({
+        "//conditions:default": [
+            "-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
+        ],
+    }),
+    deps = [],
+)
+
 cc_library(
     name = "sycl",
     data = if_ccpp([
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index ab05b25d6822c12d82d14f6d5c4717d77c27f8e5..4e7a35b3ef02928ecef2fd41d38c978383796307 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -4,9 +4,13 @@
 
 load("@local_config_remote_execution//:remote_execution.bzl", "gpu_test_tags")
 
-def tf_cuda_tests_tags():
+def tf_gpu_tests_tags():
     return ["requires-gpu", "gpu"] + gpu_test_tags()
 
+# terminology changes: saving tf_cuda_* for compatibility
+def tf_cuda_tests_tags():
+    return tf_gpu_tests_tags()
+
 def tf_sycl_tests_tags():
     return ["requires-gpu", "gpu"] + gpu_test_tags()
 
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 8351362e05699c591b5563f2270928f4408077e8..fdd934cdcaf4db3fae75dac0b96747a75a1e820f 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -20,9 +20,15 @@ limitations under the License.
 #include <stdlib.h>
 #include <memory>
 
+#include "absl/base/casts.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/node_hash_map.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cupti_wrapper.h"
@@ -31,274 +37,296 @@ limitations under the License.
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
+namespace tensorflow {
 namespace {
-
-// Maps a MemcpyKind enum to a const string.
-const char *getMemcpyKindString(CUpti_ActivityMemcpyKind kind) {
-  switch (kind) {
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
-      return "HtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
-      return "DtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
-      return "HtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
-      return "AtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
-      return "AtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
-      return "AtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
-      return "DtoA";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
-      return "DtoD";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
-      return "HtoH";
-    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
-      return "PtoP";
-    default:
-      break;
-  }
-  return "<unknown>";
+Status ToStatus(CUptiResult result) {
+  if (result == CUPTI_SUCCESS) {
+    return Status::OK();
+  }
+  const char* str = nullptr;
+  if (auto wrapper =
+          absl::make_unique<perftools::gputools::profiler::CuptiWrapper>()) {
+    wrapper->GetResultString(result, &str);
+  }
+  return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
 }
 
-// Maps a MemoryKind enum to a const string.
-const char *getMemoryKindString(CUpti_ActivityMemoryKind kind) {
-  switch (kind) {
-    case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN:
-      return "Unknown";
-    case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE:
-      return "Pageable";
-    case CUPTI_ACTIVITY_MEMORY_KIND_PINNED:
-      return "Pinned";
-    case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE:
-      return "Device";
-    case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY:
-      return "Array";
-    default:
-      break;
-  }
-  return "<unknown>";
+Status ToStatus(CUresult result) {
+  if (result == CUDA_SUCCESS) {
+    return Status::OK();
+  }
+  const char* str = nullptr;
+  cuGetErrorName(result, &str);
+  return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
 }
 
-// Maps an OverheadKind enum to a const string.
-const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
-  switch (kind) {
-    case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
-      return "COMPILER";
-    case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
-      return "BUFFER_FLUSH";
-    case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
-      return "INSTRUMENTATION";
-    case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
-      return "RESOURCE";
-    default:
-      break;
-  }
-  return "<unknown>";
+void LogIfError(const Status& status) {
+  if (status.ok()) {
+    return;
+  }
+  LOG(ERROR) << status.error_message();
 }
 
-}  // namespace
-
-namespace tensorflow {
-namespace devicetracer {
-
-// Forward declaration.
-class CUPTIManager;
-
-// Returns a pointer to the CUPTIManager singleton.
-CUPTIManager *GetCUPTIManager();
-
-// Callback interface for consumers of CUPTI tracing.
-class CUPTIClient {
- public:
-  virtual ~CUPTIClient() {}
+struct KernelRecord {
+  const char* kernel_name;
+  // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
+  // record the stream and infer the context during collection.
+  CUcontext context;
+  CUstream stream;
+  CUevent start_event;
+  CUevent stop_event;
+  const std::string* annotation;
+};
 
-  // Invoked for each CUPTI activity reported.
-  virtual void ActivityCallback(const CUpti_Activity &activity) = 0;
+struct MemcpyRecord {
+  CUmemorytype src_type;
+  CUmemorytype dst_type;
+  size_t size_bytes;
+  CUcontext context;
+  CUstream stream;
+  CUevent start_event;
+  CUevent stop_event;
+  const std::string* annotation;
 };
 
-#define CUPTI_CALL(call)                                            \
-  do {                                                              \
-    CUptiResult _status = cupti_wrapper_->call;                     \
-    if (_status != CUPTI_SUCCESS) {                                 \
-      LOG(ERROR) << "cuda call " << #call << " failed " << _status; \
-    }                                                               \
-  } while (0)
+Status CreateAndRecordEvent(CUevent* event, CUstream stream) {
+  TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
+  return ToStatus(cuEventRecord(*event, stream));
+}
 
-// Singleton class to manage registration of CUPTI callbacks.
-class CUPTIManager {
+// Thread-local state recording the most recent annotation (if any).
+// When non-null, this points to a string in the active annotation
+// of the current thread.  The annotation is guaranteed to remain live
+// for the duration of the CUPTI API callback.
+static thread_local const char* tls_current_annotation;
+
+// Stores a series of kernel and memcpy records.
+class CudaEventRecorder {
  public:
-  CUPTIManager() {
-    cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
+  // Registers the start of a kernel launch. The returned index should be passed
+  // to StopKernel() after the kernel launch has completed.
+  size_t StartKernel(const char* kernel_name, CUcontext context,
+                     CUstream stream) {
+    KernelRecord record = {kernel_name, context, stream};
+    LogIfError(CreateAndRecordEvent(&record.start_event, stream));
+    mutex_lock lock(mutex_);
+    if (tls_current_annotation) {
+      record.annotation = &*annotations_.emplace(tls_current_annotation).first;
+    }
+    kernel_records_.push_back(record);
+    return kernel_records_.size() - 1;
+  }
+  void StopKernel(size_t index) {
+    mutex_lock lock(mutex_);
+    auto& record = kernel_records_[index];
+    LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
   }
 
-  static CUPTIManager *Create() {
-    auto manager = absl::make_unique<CUPTIManager>();
-    CUptiResult status = manager->cupti_wrapper_->ActivityRegisterCallbacks(
-        BufferRequested, BufferCompleted);
-    if (status != CUPTI_SUCCESS) {
-      LOG(ERROR) << "Failed to initialize CUPTI: " << status;
-      return nullptr;
+  // Registers the start of a copy operation. The returned index should be
+  // passed to StopMemcpy() after the kernel launch has completed.
+  size_t StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type,
+                     size_t size_bytes, CUcontext context, CUstream stream) {
+    MemcpyRecord record = {src_type, dst_type, size_bytes, context, stream};
+    LogIfError(CreateAndRecordEvent(&record.start_event, stream));
+    mutex_lock lock(mutex_);
+    if (tls_current_annotation) {
+      record.annotation = &*annotations_.emplace(tls_current_annotation).first;
     }
-    return manager.release();
+    memcpy_records_.push_back(record);
+    return memcpy_records_.size() - 1;
+  }
+  void StopMemcpy(size_t index) {
+    mutex_lock lock(mutex_);
+    auto& record = memcpy_records_[index];
+    LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
   }
 
-  // Enables tracing and delivers event callbacks to 'client'.
-  // Does not take ownership of client.  Client's lifetime must persist
-  // until tracing is disabled.
-  Status EnableTrace(CUPTIClient *client);
-
-  // Disable tracing.  No further events will be delivered to 'client'.
-  Status DisableTrace();
+  std::vector<KernelRecord> ConsumeKernelRecords() {
+    mutex_lock lock(mutex_);
+    return std::move(kernel_records_);
+  }
+  std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
+    mutex_lock lock(mutex_);
+    return std::move(memcpy_records_);
+  }
 
  private:
-  // Static functions which we can use as CUPTI callbacks.
-  static void BufferRequested(uint8_t **buffer, size_t *size,
-                              size_t *maxNumRecords) {
-    GetCUPTIManager()->InternalBufferRequested(buffer, size, maxNumRecords);
-  }
-  static void BufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
-                              size_t size, size_t validSize) {
-    GetCUPTIManager()->InternalBufferCompleted(ctx, streamId, buffer, size,
-                                               validSize);
-  }
-  // These methods are called by the static stubs above.
-  void InternalBufferRequested(uint8_t **buffer, size_t *size,
-                               size_t *maxNumRecords);
-  void InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
-                               uint8_t *buffer, size_t size, size_t validSize);
-
-  // Size of buffers used for CUPTI tracing.
-  static constexpr size_t kBufferSize = 32 * 1024;
-  // Required alignment of CUPTI buffers.
-  static constexpr size_t kBufferAlignment = 8;
+  mutex mutex_;
+  std::unordered_set<std::string> annotations_ GUARDED_BY(mutex_);
+  std::vector<KernelRecord> kernel_records_ GUARDED_BY(mutex_);
+  std::vector<MemcpyRecord> memcpy_records_ GUARDED_BY(mutex_);
+};
 
-  mutex mu_;
-  CUPTIClient *client_ GUARDED_BY(mu_);
-  std::unique_ptr<perftools::gputools::profiler::CuptiWrapper> cupti_wrapper_;
+// Instances register callbacks with CUPTI to notify the event recorder before
+// and after kernel launches and memory copies.
+class CuptiCallbackHook {
+ public:
+  CuptiCallbackHook()
+      : cupti_wrapper_(new perftools::gputools::profiler::CuptiWrapper()),
+        subscriber_(nullptr) {}
+
+  Status Enable(CudaEventRecorder* recorder) {
+    TF_RETURN_IF_ERROR(ToStatus(
+        cupti_wrapper_->Subscribe(&subscriber_, &CuptiCallback, recorder)));
+    for (auto cbid : {CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
+                      CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2}) {
+      TF_RETURN_IF_ERROR(ToStatus(cupti_wrapper_->EnableCallback(
+          /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid)));
+    }
+    return Status::OK();
+  }
 
-  TF_DISALLOW_COPY_AND_ASSIGN(CUPTIManager);
-};
+  ~CuptiCallbackHook() {
+    LogIfError(ToStatus(cupti_wrapper_->Unsubscribe(subscriber_)));
+  }
 
-Status CUPTIManager::EnableTrace(CUPTIClient *client) {
-  mutex_lock l(mu_);
-  // TODO(pbar) Work out the minimal set to trace.
-  // We can currently manage without driver/runtime tracing.
-  // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
-  // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
-  // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  // These might be useful for annotations but require NVTX API.
-  // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
-  // CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
-
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY2));
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(ActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
-  client_ = client;
-  return Status::OK();
-}
+ private:
+  static void CUPTIAPI CuptiCallback(void* userdata,
+                                     CUpti_CallbackDomain domain,
+                                     CUpti_CallbackId cbid,
+                                     const void* cbdata) {
+    auto recorder = static_cast<CudaEventRecorder*>(userdata);
+    auto data = static_cast<const CUpti_CallbackData*>(cbdata);
+    DCHECK_EQ(domain, CUPTI_CB_DOMAIN_DRIVER_API);
+
+    if (data->callbackSite == CUPTI_API_ENTER) {
+      DriverApiEnterCallback(cbid, *data, recorder);
+    } else {
+      DriverApiExitCallback(cbid, *data, recorder);
+    }
+  }
 
-Status CUPTIManager::DisableTrace() {
-  // We turn off all tracing regardless.
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2));
-  CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
-  {
-    // Don't acquire this lock until Flush returns, since Flush
-    // will potentially cause callbacks into BufferCompleted.
-    mutex_lock l(mu_);
-    client_ = nullptr;
+  static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
+    CUmemorytype mem_type;
+    auto status =
+        cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
+    if (status == CUDA_ERROR_INVALID_VALUE) {
+      // Pointer not registered with CUDA, must be host memory.
+      return CU_MEMORYTYPE_HOST;
+    }
+    LogIfError(ToStatus(status));
+    return mem_type;
   }
-  return Status::OK();
-}
 
-void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size,
-                                           size_t *maxNumRecords) {
-  VLOG(2) << "BufferRequested";
-  void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment);
-  *size = kBufferSize;
-  *buffer = reinterpret_cast<uint8_t *>(p);
-  *maxNumRecords = 0;
-}
+  template <typename T>
+  static void StartMemcpy(CUmemorytype src_type, CUmemorytype dst_type,
+                          const CUpti_CallbackData& cbdata,
+                          CudaEventRecorder* recorder) {
+    auto params = static_cast<const T*>(cbdata.functionParams);
+    *cbdata.correlationData = recorder->StartMemcpy(
+        src_type, dst_type, params->ByteCount, cbdata.context, nullptr);
+  }
+  template <typename T>
+  static void StartMemcpyAsync(CUmemorytype dst_type, CUmemorytype src_type,
+                               const CUpti_CallbackData& cbdata,
+                               CudaEventRecorder* recorder) {
+    auto params = static_cast<const T*>(cbdata.functionParams);
+    *cbdata.correlationData = recorder->StartMemcpy(
+        src_type, dst_type, params->ByteCount, cbdata.context, params->hStream);
+  }
 
-void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
-                                           uint8_t *buffer, size_t size,
-                                           size_t validSize) {
-  VLOG(2) << "BufferCompleted";
-  CUptiResult status;
-  CUpti_Activity *record = nullptr;
-  mutex_lock l(mu_);  // Hold mu_ while using client_.
-  if (client_ && validSize > 0) {
-    do {
-      status =
-          cupti_wrapper_->ActivityGetNextRecord(buffer, validSize, &record);
-      if (status == CUPTI_SUCCESS) {
-        client_->ActivityCallback(*record);
-      } else {
-        break;
+  static void DriverApiEnterCallback(CUpti_CallbackId cbid,
+                                     const CUpti_CallbackData& cbdata,
+                                     CudaEventRecorder* recorder) {
+    switch (cbid) {
+      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
+        DCHECK_NE(cbdata.symbolName, nullptr);
+        auto params =
+            static_cast<const cuLaunchKernel_params*>(cbdata.functionParams);
+        *cbdata.correlationData = recorder->StartKernel(
+            cbdata.symbolName, cbdata.context, params->hStream);
+        return;
+      }
+
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
+        auto params =
+            static_cast<const cuMemcpy_params*>(cbdata.functionParams);
+        return StartMemcpy<cuMemcpy_params>(GetMemoryType(params->src),
+                                            GetMemoryType(params->dst), cbdata,
+                                            recorder);
+      }
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
+        auto params =
+            static_cast<const cuMemcpyAsync_params*>(cbdata.functionParams);
+        return StartMemcpyAsync<cuMemcpyAsync_params>(
+            GetMemoryType(params->src), GetMemoryType(params->dst), cbdata,
+            recorder);
       }
-    } while (1);
 
-    // report any records dropped from the queue
-    size_t dropped;
-    CUPTI_CALL(ActivityGetNumDroppedRecords(ctx, streamId, &dropped));
-    if (dropped != 0) {
-      LOG(WARNING) << "Dropped " << dropped << " activity records";
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
+        return StartMemcpy<cuMemcpyHtoD_v2_params>(
+            CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
+
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
+        return StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
+            CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
+
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
+        return StartMemcpy<cuMemcpyDtoH_v2_params>(
+            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder);
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
+        return StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
+            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST, cbdata, recorder);
+
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
+        return StartMemcpy<cuMemcpyDtoD_v2_params>(
+            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
+        return StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
+            CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE, cbdata, recorder);
+
+      default:
+        LOG(ERROR) << "Unexpected callback id: " << cbid;
     }
   }
-  port::AlignedFree(buffer);
-}
-
-CUPTIManager *GetCUPTIManager() {
-  static CUPTIManager *manager = CUPTIManager::Create();
-  return manager;
-}
 
-#ifdef _MSC_VER
-#define __thread __declspec(thread)
-#endif
-
-// TODO(pbar) Move this to platform specific header file?
-// Static thread local variable for POD types.
-#define TF_STATIC_THREAD_LOCAL_POD(_Type_, _var_)                  \
-  static __thread _Type_ s_obj_##_var_;                            \
-  namespace {                                                      \
-  class ThreadLocal_##_var_ {                                      \
-   public:                                                         \
-    ThreadLocal_##_var_() {}                                       \
-    void Init() {}                                                 \
-    inline _Type_ *pointer() const { return &s_obj_##_var_; }      \
-    inline _Type_ *safe_pointer() const { return &s_obj_##_var_; } \
-    _Type_ &get() const { return s_obj_##_var_; }                  \
-    bool is_native_tls() const { return true; }                    \
-                                                                   \
-   private:                                                        \
-    TF_DISALLOW_COPY_AND_ASSIGN(ThreadLocal_##_var_);              \
-  } _var_;                                                         \
-  }  // namespace
+  static void DriverApiExitCallback(CUpti_CallbackId cbid,
+                                    const CUpti_CallbackData& cbdata,
+                                    CudaEventRecorder* recorder) {
+    switch (cbid) {
+      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
+        recorder->StopKernel(*cbdata.correlationData);
+        break;
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
+      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
+        recorder->StopMemcpy(*cbdata.correlationData);
+        break;
+      default:
+        LOG(ERROR) << "Unexpected callback id: " << cbid;
+    }
+  }
 
-// Thread-local state recording the most recent annotation (if any).
-// When non-null, this points to a string in the active annotation
-// of the current thread.  The annotation is guaranteed to remain live
-// for the duration of the CUPTI API callback.
-TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
+  std::unique_ptr<perftools::gputools::profiler::CuptiWrapper> cupti_wrapper_;
+  CUpti_SubscriberHandle subscriber_;
+};
+}  // namespace
 
 class TraceCollectorImpl : public tracing::TraceCollector {
  public:
+  class ActivityHandle : public Handle {
+   public:
+    ActivityHandle(std::string&& name, int level)
+        : trace_me_(std::move(name), level) {}
+
+   private:
+    profiler::TraceMe trace_me_;
+  };
   TraceCollectorImpl() { tracing::SetTraceCollector(this); }
 
   ~TraceCollectorImpl() override {
@@ -310,22 +338,24 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   virtual std::unique_ptr<Handle> CreateAnnotationHandle(
       StringPiece name_part1, StringPiece name_part2) const {
     struct Impl : public tracing::TraceCollector::Handle {
-      string annotation;
-      explicit Impl(string &&name_scope) : annotation(name_scope) {
+      std::string annotation;
+      explicit Impl(std::string&& name_scope) : annotation(name_scope) {
         VLOG(2) << "CreateAnnotationHandle " << annotation;
         // Remember the most recent ScopedAnnotation for each thread.
-        tls_current_annotation.get() = annotation.c_str();
+        tls_current_annotation = annotation.c_str();
       }
-      ~Impl() override { tls_current_annotation.get() = nullptr; }
+      ~Impl() override { tls_current_annotation = nullptr; }
     };
-    return std::unique_ptr<Handle>(
-        new Impl{ConcatenateNames(name_part1, name_part2)});
+    return absl::make_unique<Impl>(ConcatenateNames(name_part1, name_part2));
   }
 
-  virtual std::unique_ptr<Handle> CreateActivityHandle(StringPiece, StringPiece,
-                                                       bool) const {
-    // We don't do anything with 'Activities' yet.
-    return nullptr;
+  virtual std::unique_ptr<Handle> CreateActivityHandle(
+      StringPiece name_part1, StringPiece name_part2, bool is_expensive) const {
+    if (!IsEnabledForActivities(is_expensive)) {
+      return nullptr;
+    }
+    return absl::make_unique<ActivityHandle>(
+        ConcatenateNames(name_part1, name_part2), GetLevel(is_expensive));
   }
 
   bool IsEnabledForAnnotations() const override {
@@ -333,8 +363,7 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
   bool IsEnabledForActivities(bool is_expensive) const override {
-    // We don't do anything with 'Activities' so we are never 'enabled'.
-    return false;
+    return profiler::TraceMeRecorder::Active(GetLevel(is_expensive));
   }
 
   void Start() {
@@ -349,86 +378,40 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
  private:
+  static int GetLevel(bool is_expensive) {
+    return profiler::GetTFTraceMeLevel(is_expensive);
+  }
+
   std::atomic<bool> active_trace_session_;
 };
 
-TraceCollectorImpl *GlobalDefaultTraceCollector() {
-  static auto *instance = new TraceCollectorImpl();
+TraceCollectorImpl* GlobalDefaultTraceCollector() {
+  static auto* instance = new TraceCollectorImpl();
   return instance;
 }
 
-class DeviceTracerImpl : public DeviceTracer, public CUPTIClient {
+class DeviceTracerImpl : public DeviceTracer {
  public:
-  DeviceTracerImpl(CUPTIManager *cupti_manager);
+  DeviceTracerImpl();
   ~DeviceTracerImpl() override;
 
   // DeviceTracer interface:
   Status Start() override;
   Status Stop() override;
-  Status Collect(StepStatsCollector *collector) override;
-
- protected:
-  // This callback is used exclusively by CUPTIManager.
-  friend class CUPTIManager;
-  void ActivityCallback(const CUpti_Activity &activity) override;
+  Status Collect(StepStatsCollector* collector) override;
 
  private:
-  // Internal struct to record kernel launches.
-  struct KernelRecord {
-    uint64_t start_timestamp;
-    uint64_t end_timestamp;
-    uint32 device_id;
-    uint32 stream_id;
-    uint32 correlation_id;
-  };
-  // Internal struct to record memcpy operations.
-  struct MemcpyRecord {
-    uint64_t start_timestamp;
-    uint64_t end_timestamp;
-    uint32 device_id;
-    uint32 stream_id;
-    uint32 correlation_id;
-    uint8 copyKind;
-    uint8 srcKind;
-    uint8 dstKind;
-    uint64 bytes;
-  };
-
-  // This is the subscriber callback which is invoked directly by CUPTI.
-  // The 'userdata' argument will be a pointer to the active 'DeviceTracerImpl'.
-  static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
-                                   CUpti_CallbackId cbid, const void *cbdata);
-
-  // Records the mapping between correlation ID and kernel name.
-  void AddCorrelationId(uint32 correlation_id, const string &name);
-
-  // Returns the current system time in microseconds.
-  inline int64 NowInUsec() { return Env::Default()->NowMicros(); }
-
-  CUPTIManager *cupti_manager_;
-  std::unique_ptr<perftools::gputools::profiler::CuptiWrapper> cupti_wrapper_;
-  CUpti_SubscriberHandle subscriber_;
-
-  mutex trace_mu_;
-  static constexpr size_t kMaxRecords = 1024 * 1024;
-  std::map<uint32, string> correlations_ GUARDED_BY(trace_mu_);
-  std::vector<KernelRecord> kernel_records_ GUARDED_BY(trace_mu_);
-  std::vector<MemcpyRecord> memcpy_records_ GUARDED_BY(trace_mu_);
+  std::unique_ptr<CudaEventRecorder> recorder_;
+  std::unique_ptr<CuptiCallbackHook> cupti_hook_;
 
   mutex mu_;
   bool enabled_ GUARDED_BY(mu_);
-  int64 start_walltime_us_ GUARDED_BY(mu_);
-  int64 end_walltime_us_ GUARDED_BY(mu_);
-  uint64_t start_timestamp_ GUARDED_BY(mu_);
-  uint64_t end_timestamp_ GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
+  std::unique_ptr<profiler::cpu::HostTracer> host_tracer_ GUARDED_BY(mu_);
 };
 
-DeviceTracerImpl::DeviceTracerImpl(CUPTIManager *cupti_manager)
-    : cupti_manager_(cupti_manager) {
+DeviceTracerImpl::DeviceTracerImpl() : recorder_(new CudaEventRecorder()) {
   VLOG(1) << "DeviceTracer created.";
-  cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
+  host_tracer_ = profiler::cpu::HostTracer::Create(2);
   enabled_ = false;
 }
 
@@ -444,55 +427,13 @@ Status DeviceTracerImpl::Start() {
   if (enabled_) {
     return errors::FailedPrecondition("DeviceTracer is already enabled.");
   }
-  // There can only be one CUPTI subscriber.  If we can't create one then
-  // there is another trace in progress (possibly by external code).
-  CUptiResult ret;
-  ret = cupti_wrapper_->Subscribe(
-      &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
-  if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
-    return errors::Unavailable("CUPTI subcriber limit reached.");
-  } else if (ret != CUPTI_SUCCESS) {
-    return errors::Internal("Failed to create CUPTI subcriber.");
-  }
+  cupti_hook_.reset(new CuptiCallbackHook());
+  TF_RETURN_IF_ERROR(cupti_hook_->Enable(recorder_.get()));
 
   // Register as a TraceEngine to receive ScopedAnnotations.
   GlobalDefaultTraceCollector()->Start();
 
-  // Intercept launch and memcpy calls to capture the Op name annotation.
-  // TODO(pbar) Add callbacks for memcpy variants.
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_RUNTIME_API,
-                            CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020));
-  CUPTI_CALL(EnableCallback(
-      /*enable=*/1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API,
-      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020));
-
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2));
-  CUPTI_CALL(EnableCallback(/*enable=*/1, subscriber_,
-                            CUPTI_CB_DOMAIN_DRIVER_API,
-                            CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2));
-
-  TF_RETURN_IF_ERROR(cupti_manager_->EnableTrace(this));
-
-  CUPTI_CALL(GetTimestamp(&start_timestamp_));
-  start_walltime_us_ = NowInUsec();
+  host_tracer_->Start().IgnoreError();
   enabled_ = true;
   return Status::OK();
 }
@@ -503,196 +444,288 @@ Status DeviceTracerImpl::Stop() {
   if (!enabled_) {
     return Status::OK();
   }
-  CUPTI_CALL(Unsubscribe(subscriber_));
+  cupti_hook_.reset();
   GlobalDefaultTraceCollector()->Stop();
 
-  TF_RETURN_IF_ERROR(cupti_manager_->DisableTrace());
-  end_walltime_us_ = NowInUsec();
-  CUPTI_CALL(GetTimestamp(&end_timestamp_));
   enabled_ = false;
+  host_tracer_->Stop().IgnoreError();
   return Status::OK();
 }
 
-void DeviceTracerImpl::AddCorrelationId(uint32 correlation_id,
-                                        const string &name) {
-  VLOG(2) << correlation_id << " : " << name;
-  mutex_lock l(trace_mu_);
-  if (correlations_.size() >= kMaxRecords) return;
-  correlations_.emplace(correlation_id, name);
-}
+namespace {
+class CudaEventCollector {
+  struct DeviceInfo {
+    int ordinal;
+    std::string name;
+    int num_contexts;
+  };
 
-/*static*/ void DeviceTracerImpl::ApiCallback(void *userdata,
-                                              CUpti_CallbackDomain domain,
-                                              CUpti_CallbackId cbid,
-                                              const void *cbdata) {
-  auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-  DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
-  VLOG(2) << "ApiCallback " << domain << ":" << cbid
-          << " func: " << cbInfo->functionName;
-
-  // API callbacks are invoked synchronously on the thread making the
-  // CUDA API call.  If this pointer is non-null then the ScopedAnnotation
-  // must be valid.
-  const char *tls_annotation = tls_current_annotation.get();
-
-  if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
-      (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
-    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-      auto *params = reinterpret_cast<const cuLaunchKernel_params *>(
-          cbInfo->functionParams);
-      if (VLOG_IS_ON(2)) {
-        VLOG(2) << "LAUNCH stream " << params->hStream << " correllation "
-                << cbInfo->correlationId << " kernel " << cbInfo->symbolName;
-      }
-      const string annotation =
-          tls_annotation ? tls_annotation : cbInfo->symbolName;
-      tracer->AddCorrelationId(cbInfo->correlationId, annotation);
+  struct ContextInfo {
+    int index;
+    const DeviceInfo* dev_info;
+    int num_streams;
+    CUevent end_event;
+  };
+
+  struct StreamInfo {
+    std::string name;
+    int index;  // 0 is reserved for null stream.
+    const ContextInfo* ctx_info;
+  };
+
+  // Include context in key to distinguish null streams.
+  using StreamKey = std::pair<CUcontext, CUstream>;
+
+  CudaEventCollector(CudaEventRecorder* recorder, StepStatsCollector* collector)
+      : recorder_(recorder), collector_(collector) {
+    DCHECK(recorder != nullptr);
+    DCHECK(collector != nullptr);
+  }
+
+  // Populates device_infos_ from all devices.
+  Status InitializeDeviceInfos() {
+    int count;
+    TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetCount(&count)));
+    for (int ordinal = 0; ordinal < count; ++ordinal) {
+      CUdevice device;
+      TF_RETURN_IF_ERROR(ToStatus(cuDeviceGet(&device, ordinal)));
+      char name[100];
+      TF_RETURN_IF_ERROR(ToStatus(cuDeviceGetName(name, sizeof(name), device)));
+      device_infos_[device] = {ordinal, name};
+    }
+    return Status::OK();
+  }
+
+  // Returns element from context_infos_, adding it if not yet present.
+  Status GetContextInfo(CUcontext context, ContextInfo** ctx_info_ptr) {
+    auto it = context_infos_.find(context);
+
+    if (it == context_infos_.end()) {
+      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context)));
+      CUdevice device;
+      TF_RETURN_IF_ERROR(ToStatus(cuCtxGetDevice(&device)));
+
+      auto& dev_info = device_infos_[device];
+      ContextInfo ctx_info = {dev_info.num_contexts++, &dev_info};
+      it = context_infos_.emplace(context, ctx_info).first;
     }
-  } else if ((domain == CUPTI_CB_DOMAIN_RUNTIME_API) &&
-             (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 ||
-              cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020)) {
-    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-      if (VLOG_IS_ON(2)) {
-        auto *funcParams = reinterpret_cast<const cudaMemcpy_v3020_params *>(
-            cbInfo->functionParams);
-        size_t count = funcParams->count;
-        enum cudaMemcpyKind kind = funcParams->kind;
-        VLOG(2) << "MEMCPY count " << count << " kind " << kind;
+
+    *ctx_info_ptr = &it->second;
+    return Status::OK();
+  }
+
+  // Adds element to stream_infos_ if not yet present. If present, clear name
+  // if it doesn't match parameter.
+  Status AddStreamInfo(CUcontext context, CUstream stream,
+                       absl::string_view name) {
+    StreamKey key(context, stream);
+    auto it = stream_infos_.find(key);
+    if (it != stream_infos_.end()) {
+      if (it->second.name != name) {
+        it->second.name.clear();  // Stream with inconsistent names, clear it.
       }
-      if (tls_annotation) {
-        const string annotation = tls_annotation;
-        tracer->AddCorrelationId(cbInfo->correlationId, annotation);
+      return Status::OK();
+    }
+
+    ContextInfo* ctx_info;
+    TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
+    int index = stream ? ++ctx_info->num_streams : 0;
+    StreamInfo stream_info = {static_cast<std::string>(name), index, ctx_info};
+    stream_infos_.emplace(key, stream_info);
+    return Status::OK();
+  }
+
+  // Returns string describing source and destination memory types.
+  static std::string GetMemcpyName(const MemcpyRecord& record) {
+    auto get_memory_type = [](CUmemorytype mem_type) {
+      switch (mem_type) {
+        case CU_MEMORYTYPE_HOST:
+          return 'H';
+        case CU_MEMORYTYPE_DEVICE:
+          return 'D';
+        case CU_MEMORYTYPE_ARRAY:
+          return 'A';
+        case CU_MEMORYTYPE_UNIFIED:
+          return 'U';
+        default:
+          LOG(ERROR) << "Unknown memory type: " << mem_type;
+          return '?';
       }
+    };
+    return absl::StrFormat("Memcpy%cto%c", get_memory_type(record.src_type),
+                           get_memory_type(record.dst_type));
+  }
+
+  // Returns time in microseconds between events recorded on the GPU.
+  static uint64_t GetElasedTimeUs(CUevent start, CUevent stop) {
+    float elapsed_ms = 0.0f;
+    LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
+    return static_cast<uint64>(
+        std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
+  }
+
+  // Synchronizes all contexts.
+  Status Synchronize() const {
+    for (const auto& pair : context_infos_) {
+      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
+      TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
+    }
+    return Status::OK();
+  }
+
+  // Save stats to collector;
+  Status SaveStats(std::unique_ptr<NodeExecStats> stats,
+                   const StreamInfo& stream_info) const {
+    auto ctx_info = stream_info.ctx_info;
+    auto dev_info = ctx_info->dev_info;
+    // TODO(csigg): tfprof_node.cc, run_metadata_test.py, and timeline_test.py
+    // currently require this particular formatting.
+    collector_->Save(
+        absl::StrFormat("/device:GPU:%d/stream:all", dev_info->ordinal),
+        new NodeExecStats(*stats));
+    auto name = absl::StrFormat("/gpu:%d (%s)/context#%d/", dev_info->ordinal,
+                                dev_info->name, ctx_info->index);
+    if (stream_info.index) {
+      absl::StrAppend(&name, "stream#", std::to_string(stream_info.index));
+    } else {
+      absl::StrAppend(&name, "null stream");
     }
-  } else if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
-             (cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2 ||
-              cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2 ||
-              cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2 ||
-              cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2 ||
-              cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2 ||
-              cbid == CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2)) {
-    if (cbInfo->callbackSite == CUPTI_API_EXIT && tls_annotation) {
-      const string annotation = tls_annotation;
-      tracer->AddCorrelationId(cbInfo->correlationId, annotation);
+    if (!stream_info.name.empty()) {
+      absl::StrAppend(&name, ":", stream_info.name);
     }
-  } else {
-    VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+    collector_->Save(name, stats.release());
+    return Status::OK();
   }
-}
 
-void DeviceTracerImpl::ActivityCallback(const CUpti_Activity &record) {
-  VLOG(2) << "ActivityCallback " << record.kind;
-  mutex_lock l(trace_mu_);
-  switch (record.kind) {
-    case CUPTI_ACTIVITY_KIND_MEMCPY: {
-      if (memcpy_records_.size() >= kMaxRecords) return;
-      auto *memcpy = reinterpret_cast<const CUpti_ActivityMemcpy *>(&record);
-      memcpy_records_.push_back(MemcpyRecord{
-          memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
-          memcpy->correlationId, memcpy->copyKind, memcpy->srcKind,
-          memcpy->dstKind, memcpy->bytes});
-      break;
+  Status SaveRecord(const KernelRecord& record) const {
+    if (!record.start_event || !record.stop_event) {
+      return Status::OK();
     }
-    case CUPTI_ACTIVITY_KIND_MEMCPY2: {
-      if (memcpy_records_.size() >= kMaxRecords) return;
-      auto *memcpy = reinterpret_cast<const CUpti_ActivityMemcpy2 *>(&record);
-      memcpy_records_.push_back(MemcpyRecord{
-          memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
-          memcpy->correlationId, memcpy->copyKind, memcpy->srcKind,
-          memcpy->dstKind, memcpy->bytes});
-      break;
+    const auto& stream_info =
+        stream_infos_.at(StreamKey(record.context, record.stream));
+    auto start_us =
+        GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event);
+    auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event);
+
+    auto stats = absl::make_unique<NodeExecStats>();
+    std::string node_name = record.kernel_name;
+    if (record.annotation) {
+      node_name = absl::StrCat(*record.annotation, "::", node_name);
     }
-    case CUPTI_ACTIVITY_KIND_KERNEL:
-    case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
-      if (kernel_records_.size() >= kMaxRecords) return;
-      auto *kernel = reinterpret_cast<const CUpti_ActivityKernel3 *>(&record);
-      kernel_records_.push_back(KernelRecord{kernel->start, kernel->end,
-                                             kernel->deviceId, kernel->streamId,
-                                             kernel->correlationId});
-      break;
+    stats->set_node_name(node_name);
+    // TODO(csigg): Report grid size?
+    std::string node_label;
+    stats->set_timeline_label(node_label);
+    stats->set_all_start_micros(end_walltime_us_ - start_us);
+    stats->set_op_end_rel_micros(elapsed_us);
+    stats->set_all_end_rel_micros(elapsed_us);
+    return SaveStats(std::move(stats), stream_info);
+  }
+
+  Status SaveRecord(const MemcpyRecord& record) const {
+    if (!record.start_event || !record.stop_event) {
+      return Status::OK();
+    }
+    const auto& stream_info =
+        stream_infos_.at(StreamKey(record.context, record.stream));
+    auto start_us =
+        GetElasedTimeUs(record.start_event, stream_info.ctx_info->end_event);
+    auto elapsed_us = GetElasedTimeUs(record.start_event, record.stop_event);
+
+    auto stats = absl::make_unique<NodeExecStats>();
+    std::string node_name = GetMemcpyName(record);
+    if (record.annotation) {
+      node_name = absl::StrCat(*record.annotation, "::", node_name);
     }
-    default:
-      VLOG(1) << "ActivityCallback unhandled kind";
-      break;
+    stats->set_node_name(node_name);
+    // TODO(csigg): Show label in Chrome trace viewer.
+    std::string node_label = absl::StrFormat("%d bytes", record.size_bytes);
+    stats->set_timeline_label(node_label);
+    stats->set_all_start_micros(end_walltime_us_ - start_us);
+    stats->set_op_end_rel_micros(elapsed_us);
+    stats->set_all_end_rel_micros(elapsed_us);
+    return SaveStats(std::move(stats), stream_info);
+  }
+
+  Status Collect() {
+    TF_RETURN_IF_ERROR(InitializeDeviceInfos());
+
+    auto kernel_records = recorder_->ConsumeKernelRecords();
+    auto memcpy_records = recorder_->ConsumeMemcpyRecords();
+    LOG(INFO) << "Collecting " << kernel_records.size() << " kernel records, "
+              << memcpy_records.size() << " memcpy records.";
+
+    // Gather all profiled streams and contexts.
+    for (const auto& record : kernel_records) {
+      TF_RETURN_IF_ERROR(
+          AddStreamInfo(record.context, record.stream, "Kernel"));
+    }
+    for (const auto& record : memcpy_records) {
+      TF_RETURN_IF_ERROR(
+          AddStreamInfo(record.context, record.stream, GetMemcpyName(record)));
+    }
+
+    // Synchronize all contexts, record end events, synchronize again.
+    TF_RETURN_IF_ERROR(Synchronize());
+    for (auto& pair : context_infos_) {
+      TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
+      TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
+    }
+    TF_RETURN_IF_ERROR(Synchronize());
+    end_walltime_us_ = Env::Default()->NowMicros();
+
+    for (const auto& record : kernel_records) {
+      TF_RETURN_IF_ERROR(SaveRecord(record));
+    }
+    for (const auto& record : memcpy_records) {
+      TF_RETURN_IF_ERROR(SaveRecord(record));
+    }
+
+    return Status::OK();
+  }
+
+ public:
+  // Consumes the records in recorder and saves them to the collector.
+  static Status Collect(CudaEventRecorder* recorder,
+                        StepStatsCollector* collector) {
+    CUcontext context;
+    TF_RETURN_IF_ERROR(ToStatus(cuCtxGetCurrent(&context)));
+    auto status = CudaEventCollector(recorder, collector).Collect();
+    TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(context)));
+    return status;
   }
-}
 
-Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
+ private:
+  CudaEventRecorder* recorder_;
+  StepStatsCollector* collector_;
+
+  absl::node_hash_map<CUdevice, DeviceInfo> device_infos_;
+  absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
+  absl::flat_hash_map<StreamKey, StreamInfo, hash<StreamKey>> stream_infos_;
+  int64 end_walltime_us_;
+};
+}  // namespace
+
+Status DeviceTracerImpl::Collect(StepStatsCollector* collector) {
   mutex_lock l(mu_);
   if (enabled_) {
     return errors::FailedPrecondition("DeviceTracer is still enabled.");
   }
 
-  // TODO(pbar) Handle device IDs and prefix properly.
-  const string prefix = "";
-  const int id = 0;
-  const string stream_device =
-      strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
-  const string memcpy_device =
-      strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
-
-  mutex_lock l2(trace_mu_);
-  for (const auto &rec : kernel_records_) {
-    auto it = correlations_.find(rec.correlation_id);
-    const string name = (it != correlations_.cend()) ? it->second : "unknown";
-    NodeExecStats *ns = new NodeExecStats;
-    ns->set_all_start_micros(start_walltime_us_ +
-                             ((rec.start_timestamp - start_timestamp_) / 1000));
-    ns->set_op_start_rel_micros(0);
-    auto elapsed_us =
-        std::max<int64>((rec.end_timestamp - rec.start_timestamp) / 1000, 1);
-    ns->set_op_end_rel_micros(elapsed_us);
-    ns->set_all_end_rel_micros(elapsed_us);
-    ns->set_node_name(name);
-    // TODO(pbar) Generate details based on the kernel activity record.
-    // ns->set_timeline_label(details);
-    auto nscopy = new NodeExecStats;
-    *nscopy = *ns;
-    collector->Save(strings::StrCat(stream_device, "all"), ns);
-    collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy);
-  }
-  for (const auto &rec : memcpy_records_) {
-    auto it = correlations_.find(rec.correlation_id);
-    const string name = (it != correlations_.cend()) ? it->second : "unknown";
-    NodeExecStats *ns = new NodeExecStats;
-    ns->set_all_start_micros(start_walltime_us_ +
-                             ((rec.start_timestamp - start_timestamp_) / 1000));
-    ns->set_op_start_rel_micros(0);
-    auto elapsed_us =
-        std::max<int64>((rec.end_timestamp - rec.start_timestamp) / 1000, 1);
-    ns->set_op_end_rel_micros(elapsed_us);
-    ns->set_all_end_rel_micros(elapsed_us);
-    auto copyKind = static_cast<CUpti_ActivityMemcpyKind>(rec.copyKind);
-    auto srcKind = static_cast<CUpti_ActivityMemoryKind>(rec.srcKind);
-    auto dstKind = static_cast<CUpti_ActivityMemoryKind>(rec.dstKind);
-    const string details = strings::Printf(
-        "MEMCPY%s %llu bytes (%s to %s)", getMemcpyKindString(copyKind),
-        rec.bytes, getMemoryKindString(srcKind), getMemoryKindString(dstKind));
-    ns->set_node_name(
-        strings::StrCat(name, ":MEMCPY", getMemcpyKindString(copyKind)));
-    ns->set_timeline_label(details);
-    auto nscopy = new NodeExecStats;
-    *nscopy = *ns;
-    collector->Save(memcpy_device, ns);
-    collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy);
-  }
+  TF_RETURN_IF_ERROR(CudaEventCollector::Collect(recorder_.get(), collector));
+  host_tracer_->CollectDataToCollector(collector).IgnoreError();
   return Status::OK();
 }
 
-}  // namespace devicetracer
-
 std::unique_ptr<DeviceTracer> CreateDeviceTracer() {
-  devicetracer::CUPTIManager *cupti_manager = devicetracer::GetCUPTIManager();
-  if (cupti_manager == nullptr) {
+  auto status = cuInit(0);
+  if (status != CUDA_SUCCESS) {
+    LogIfError(ToStatus(status));
     return nullptr;
   }
-  std::unique_ptr<DeviceTracer> tracer(
-      new devicetracer::DeviceTracerImpl(cupti_manager));
-  return tracer;
+  return absl::make_unique<DeviceTracerImpl>();
 }
-
 }  // namespace tensorflow
-
 #else  // GOOGLE_CUDA
 
 namespace tensorflow {
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 7ac5e5c4450708a486be956a5806e31b8dd36fa3..671d8cf8bf96a75f9b97af887033a7a4310edbdc 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,26 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                                 \
-  struct DynLoadShim__##__name {                                              \
-    static const char* kName;                                                 \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;          \
-    static void* GetDsoHandle() {                                             \
-      static auto status =                                                    \
-          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
-      return status.ValueOrDie();                                             \
-    }                                                                         \
-    static FuncPointerT DynLoad() {                                           \
-      static void* f;                                                         \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
-          GetDsoHandle(), kName, &f))                                         \
-          << "could not find " << kName << "in libcupti DSO";                 \
-      return reinterpret_cast<FuncPointerT>(f);                               \
-    }                                                                         \
-    template <typename... Args>                                               \
-    CUptiResult operator()(Args... args) {                                    \
-      return DynLoad()(args...);                                              \
-    }                                                                         \
-  } __name;                                                                   \
+#define LIBCUPTI_WRAP(__name)                                                \
+  struct DynLoadShim__##__name {                                             \
+    static const char* kName;                                                \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    template <typename... Args>                                              \
+    CUptiResult operator()(Args... args) {                                   \
+      static auto fn = []() -> FuncPointerT {                                \
+        auto handle_or =                                                     \
+            stream_executor::internal::CachedDsoLoader::GetCuptiDsoHandle(); \
+        if (!handle_or.ok()) return nullptr;                                 \
+        void* symbol;                                                        \
+        stream_executor::port::Env::Default()                                \
+            ->GetSymbolFromLibrary(handle_or.ValueOrDie(), kName, &symbol)   \
+            .IgnoreError();                                                  \
+        return reinterpret_cast<FuncPointerT>(symbol);                       \
+      }();                                                                   \
+      if (fn == nullptr) return CUPTI_ERROR_UNKNOWN;                         \
+      return fn(args...);                                                    \
+    }                                                                        \
+  } __name;                                                                  \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
@@ -62,6 +61,7 @@ LIBCUPTI_WRAP(cuptiEnableCallback);
 LIBCUPTI_WRAP(cuptiEnableDomain);
 LIBCUPTI_WRAP(cuptiSubscribe);
 LIBCUPTI_WRAP(cuptiUnsubscribe);
+LIBCUPTI_WRAP(cuptiGetResultString);
 
 }  // namespace dynload
 
@@ -125,6 +125,11 @@ CUptiResult CuptiWrapper::Unsubscribe(CUpti_SubscriberHandle subscriber) {
   return dynload::cuptiUnsubscribe(subscriber);
 }
 
+CUptiResult CuptiWrapper::GetResultString(CUptiResult result,
+                                          const char** str) {
+  return dynload::cuptiGetResultString(result, str);
+}
+
 }  // namespace profiler
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index e3ebe6ca1d025b047abfe91d8f5ab2e1fedd5a1b..b35a5ab4c3108354702eb60ea5ae4c796a4e0638 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -68,6 +68,8 @@ class CuptiWrapper {
                         CUpti_CallbackFunc callback, void* userdata);
 
   CUptiResult Unsubscribe(CUpti_SubscriberHandle subscriber);
+
+  CUptiResult GetResultString(CUptiResult result, const char** str);
 };
 
 }  // namespace profiler
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46..977ff1272ea2a97e0b52b785b24560e02eb44207 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -46,7 +46,7 @@ Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
   return errors::Internal("Cannot parse JSON protos on Android");
 #else
   proto->Clear();
-  auto status = google::protobuf::util::JsonStringToMessage(str, proto);
+  auto status = protobuf::util::JsonStringToMessage(str, proto);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
diff --git a/tensorflow/core/platform/default/platform.bzl b/tensorflow/core/platform/default/platform.bzl
index 20ab441bf43e19277c697f98f289ba80d755af48..76bfaa896efa2f8d8f06814d6f69f7bf0b66ed33 100644
--- a/tensorflow/core/platform/default/platform.bzl
+++ b/tensorflow/core/platform/default/platform.bzl
@@ -5,55 +5,52 @@ CUDNN_VERSION = ""
 PLATFORM = ""
 
 def cuda_sdk_version():
-  return CUDA_VERSION
+    return CUDA_VERSION
 
 def cudnn_sdk_version():
-  return CUDNN_VERSION
+    return CUDNN_VERSION
 
 def cuda_library_path(name, version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/lib{}.dylib".format(name)
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/lib{}.dylib".format(name)
+        else:
+            return "lib/lib{}.{}.dylib".format(name, version)
+    elif not version:
+        return "lib64/lib{}.so".format(name)
     else:
-      return "lib/lib{}.{}.dylib".format(name, version)
-  else:
-    if not version:
-      return "lib64/lib{}.so".format(name)
-    else:
-      return "lib64/lib{}.so.{}".format(name, version)
+        return "lib64/lib{}.so.{}".format(name, version)
 
 def cuda_static_library_path(name):
-  if PLATFORM == "Darwin":
-    return "lib/lib{}_static.a".format(name)
-  else:
-    return "lib64/lib{}_static.a".format(name)
+    if PLATFORM == "Darwin":
+        return "lib/lib{}_static.a".format(name)
+    else:
+        return "lib64/lib{}_static.a".format(name)
 
 def cudnn_library_path(version = cudnn_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/libcudnn.dylib"
-    else:
-      return "lib/libcudnn.{}.dylib".format(version)
-  else:
-    if not version:
-      return "lib64/libcudnn.so"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/libcudnn.dylib"
+        else:
+            return "lib/libcudnn.{}.dylib".format(version)
+    elif not version:
+        return "lib64/libcudnn.so"
     else:
-      return "lib64/libcudnn.so.{}".format(version)
+        return "lib64/libcudnn.so.{}".format(version)
 
 def cupti_library_path(version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "extras/CUPTI/lib/libcupti.dylib"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "extras/CUPTI/lib/libcupti.dylib"
+        else:
+            return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
+    elif not version:
+        return "extras/CUPTI/lib64/libcupti.so"
     else:
-      return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
-  else:
-    if not version:
-      return "extras/CUPTI/lib64/libcupti.so"
-    else:
-      return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
+        return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
 
 def readlink_command():
-  if PLATFORM == "Darwin":
-    return "greadlink"
-  else:
-    return "readlink"
+    if PLATFORM == "Darwin":
+        return "greadlink"
+    else:
+        return "readlink"
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
deleted file mode 100644
index aeef2d9b882c0a3e2624db2dd194345a373bfe0c..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/protobuf.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
-
-#ifndef TENSORFLOW_LITE_PROTOS
-#include "google/protobuf/descriptor.h"
-#include "google/protobuf/descriptor.pb.h"
-#include "google/protobuf/dynamic_message.h"
-#include "google/protobuf/io/tokenizer.h"
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/util/json_util.h"
-#include "google/protobuf/util/type_resolver_util.h"
-#endif
-
-#include "google/protobuf/arena.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
-#include "google/protobuf/map.h"
-#include "google/protobuf/repeated_field.h"
-
-namespace tensorflow {
-namespace protobuf = ::google::protobuf;
-using protobuf_int64 = ::google::protobuf::int64;
-using protobuf_uint64 = ::google::protobuf::uint64;
-extern const char* kProtobufInt64Typename;
-extern const char* kProtobufUint64Typename;
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
diff --git a/tensorflow/core/platform/default/stacktrace.h b/tensorflow/core/platform/default/stacktrace.h
index c8e297fa8d8c1ee48b060e6e2c7ee89eb0d23b39..b64bc15971037f204a40513cbf74cc7c944e08f2 100644
--- a/tensorflow/core/platform/default/stacktrace.h
+++ b/tensorflow/core/platform/default/stacktrace.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 
 #include "tensorflow/core/platform/platform.h"
-#if !defined(IS_MOBILE_PLATFORM) && defined(PLATFORM_POSIX) && \
-    (defined(__clang__) || defined(__GNUC__))
+#if !defined(IS_MOBILE_PLATFORM) && !defined(PLATFORM_WINDOWS) && \
+    defined(PLATFORM_POSIX) && (defined(__clang__) || defined(__GNUC__))
 #define TF_GENERATE_BACKTRACE
 #endif
 
diff --git a/tensorflow/core/platform/default/string_coding.h b/tensorflow/core/platform/default/string_coding.h
deleted file mode 100644
index 70b8ab01444a6175f9c037e35fadc8196c781b19..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/string_coding.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/tensor_coding.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/tensor_coding.h
-
-#include "tensorflow/core/lib/core/coding.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace port {
-
-// Encodes sequences of strings and serialized protocol buffers into a string.
-// Normal usage consists of zero or more calls to Append() and a single call to
-// Finalize().
-class StringListEncoder {
- public:
-  explicit StringListEncoder(string* out) : out_(out) {}
-
-  // Encodes the given protocol buffer. This may not be called after Finalize().
-  void Append(const protobuf::MessageLite& m) {
-    core::PutVarint32(out_, m.ByteSize());
-    m.AppendToString(&rest_);
-  }
-
-  // Encodes the given string. This may not be called after Finalize().
-  void Append(const string& s) {
-    core::PutVarint32(out_, s.length());
-    strings::StrAppend(&rest_, s);
-  }
-
-  // Signals end of the encoding process. No other calls are allowed after this.
-  void Finalize() { strings::StrAppend(out_, rest_); }
-
- private:
-  string* out_;
-  string rest_;
-};
-
-// Decodes a string into sequences of strings (which may represent serialized
-// protocol buffers). Normal usage involves a single call to ReadSizes() in
-// order to retrieve the length of all the strings in the sequence. For each
-// size returned a call to Data() is expected and will return the actual
-// string.
-class StringListDecoder {
- public:
-  explicit StringListDecoder(const string& in) : reader_(in) {}
-
-  // Populates the given vector with the lengths of each string in the sequence
-  // being decoded. Upon returning the vector is guaranteed to contain as many
-  // elements as there are strings in the sequence.
-  bool ReadSizes(std::vector<uint32>* sizes) {
-    int64 total = 0;
-    for (auto& size : *sizes) {
-      if (!core::GetVarint32(&reader_, &size)) return false;
-      total += size;
-    }
-    if (total != static_cast<int64>(reader_.size())) {
-      return false;
-    }
-    return true;
-  }
-
-  // Returns a pointer to the next string in the sequence, then prepares for the
-  // next call by advancing 'size' characters in the sequence.
-  const char* Data(uint32 size) {
-    const char* data = reader_.data();
-    reader_.remove_prefix(size);
-    return data;
-  }
-
- private:
-  StringPiece reader_;
-};
-
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
-
-}  // namespace port
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 63394174455089c64e1e889e35f578437f7fb4fc..59768bf92ae9e854f684623ec15c83a70839312d 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -365,22 +365,10 @@ bool Env::LocalTempFilename(string* filename) {
 }
 
 bool Env::CreateUniqueFileName(string* prefix, const string& suffix) {
-#ifdef __APPLE__
-  uint64_t tid64;
-  pthread_threadid_np(nullptr, &tid64);
-  int32 tid = static_cast<int32>(tid64);
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(__FreeBSD__)
-  // Has to be casted to long first, else this error appears:
-  // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
-  // is not allowed
-  int32 tid = static_cast<int32>(static_cast<int64>(pthread_self()));
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(PLATFORM_WINDOWS)
-  int32 tid = static_cast<int32>(GetCurrentThreadId());
+  int32 tid = GetCurrentThreadId();
+#ifdef PLATFORM_WINDOWS
   int32 pid = static_cast<int32>(GetCurrentProcessId());
 #else
-  int32 tid = static_cast<int32>(pthread_self());
   int32 pid = static_cast<int32>(getpid());
 #endif
   uint64 now_microsec = NowMicros();
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 1b5382841574e6b8843079ae9cb359c5c9b475d0..280076e098d5fdd121bf095d79be5353c0e2b57f 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -271,6 +271,15 @@ class Env {
                               const string& name,
                               std::function<void()> fn) TF_MUST_USE_RESULT = 0;
 
+  // Returns the thread id of calling thread.
+  // Posix: Returns pthread id which is only guaranteed to be unique within a
+  //        process.
+  // Windows: Returns thread id which is unique.
+  virtual int32 GetCurrentThreadId() = 0;
+
+  // Copies current thread name to "name". Returns true if success.
+  virtual bool GetCurrentThreadName(string* name) = 0;
+
   // \brief Schedules the given closure on a thread-pool.
   //
   // NOTE(mrry): This closure may block.
@@ -360,6 +369,10 @@ class EnvWrapper : public Env {
                       std::function<void()> fn) override {
     return target_->StartThread(thread_options, name, fn);
   }
+  int32 GetCurrentThreadId() override { return target_->GetCurrentThreadId(); }
+  bool GetCurrentThreadName(string* name) override {
+    return target_->GetCurrentThreadName(name);
+  }
   void SchedClosure(std::function<void()> closure) override {
     target_->SchedClosure(closure);
   }
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 2e1d4a263f643da6bf9d0600ffc2cb4469ca8d70..ea1f123424728ea4bec4855dbfc7300a96103eeb 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -392,4 +392,20 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
   EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
+TEST_F(DefaultEnvTest, GetThreadInformation) {
+  Env* env = Env::Default();
+  // TODO(fishx): Turn on this test for Apple.
+#if !defined(__APPLE__)
+  EXPECT_NE(env->GetCurrentThreadId(), 0);
+#endif
+  string thread_name;
+  bool res = env->GetCurrentThreadName(&thread_name);
+#if defined(PLATFORM_WINDOWS) || defined(__ANDROID__)
+  EXPECT_FALSE(res);
+#elif !defined(__APPLE__)
+  EXPECT_TRUE(res);
+  EXPECT_GT(thread_name.size(), 0);
+#endif
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/env_time.h b/tensorflow/core/platform/env_time.h
index b4756ed209cf7f945a2cf4f1bea7271dded7518a..c12b6ba6fb86e7bda394b85fa449c8176c817054 100644
--- a/tensorflow/core/platform/env_time.h
+++ b/tensorflow/core/platform/env_time.h
@@ -25,6 +25,7 @@ namespace tensorflow {
 /// access timer related operations.
 class EnvTime {
  public:
+  static constexpr uint64 kMicrosToPicos = 1000ULL * 1000ULL;
   static constexpr uint64 kMicrosToNanos = 1000ULL;
   static constexpr uint64 kMillisToMicros = 1000ULL;
   static constexpr uint64 kMillisToNanos = 1000ULL * 1000ULL;
diff --git a/tensorflow/core/platform/grpc_services.h b/tensorflow/core/platform/grpc_services.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd918193dc52881ea396142a7b0a8e3708cb427c
--- /dev/null
+++ b/tensorflow/core/platform/grpc_services.h
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+#define TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+
+#include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h"
+#include "tensorflow/core/profiler/profiler_service.grpc.pb.h"
+
+#if !defined(PLATFORM_GOOGLE)
+
+namespace tensorflow {
+namespace grpc {
+
+// Google internal GRPC generates services under namespace "tensorflow::grpc".
+// Creating aliases here to make sure we can access services under namespace
+// "tensorflow::grpc" both in google internal and open-source.
+using ::tensorflow::ProfileAnalysis;
+using ::tensorflow::ProfilerService;
+
+}  // namespace grpc
+}  // namespace tensorflow
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 2cf1036cc898ca8afefcb01d622a41240ec7ca56..d61a04450b832da40a2e9138368b475b1ddfcf8f 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -59,8 +59,6 @@ class LibHDFS {
   std::function<hdfsBuilder*()> hdfsNewBuilder;
   std::function<void(hdfsBuilder*, const char*)> hdfsBuilderSetNameNode;
   std::function<int(const char*, char**)> hdfsConfGetStr;
-  std::function<void(hdfsBuilder*, const char* kerbTicketCachePath)>
-      hdfsBuilderSetKerbTicketCachePath;
   std::function<int(hdfsFS, hdfsFile)> hdfsCloseFile;
   std::function<tSize(hdfsFS, hdfsFile, tOffset, void*, tSize)> hdfsPread;
   std::function<tSize(hdfsFS, hdfsFile, const void*, tSize)> hdfsWrite;
@@ -88,7 +86,6 @@ class LibHDFS {
       BIND_HDFS_FUNC(hdfsNewBuilder);
       BIND_HDFS_FUNC(hdfsBuilderSetNameNode);
       BIND_HDFS_FUNC(hdfsConfGetStr);
-      BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath);
       BIND_HDFS_FUNC(hdfsCloseFile);
       BIND_HDFS_FUNC(hdfsPread);
       BIND_HDFS_FUNC(hdfsWrite);
@@ -157,7 +154,8 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
     StringPiece defaultScheme, defaultCluster, defaultPath;
     io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath);
 
-    if (scheme != defaultScheme || namenode != defaultCluster) {
+    if (scheme != defaultScheme ||
+        (namenode != "" && namenode != defaultCluster)) {
       return errors::Unimplemented(
           "viewfs is only supported as a fs.defaultFS.");
     }
@@ -166,14 +164,7 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
     // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259
     hdfs_->hdfsBuilderSetNameNode(builder, "default");
   } else {
-    hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
-  }
-  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is
-  // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH
-  // and related code are unnecessary.
-  char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH");
-  if (ticket_cache_path != nullptr) {
-    hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path);
+    hdfs_->hdfsBuilderSetNameNode(builder, nn == "" ? "default" : nn.c_str());
   }
   *fs = hdfs_->hdfsBuilderConnect(builder);
   if (*fs == nullptr) {
@@ -219,8 +210,12 @@ class HDFSRandomAccessFile : public RandomAccessFile {
       // We lock inside the loop rather than outside so we don't block other
       // concurrent readers.
       mutex_lock lock(mu_);
+      // Max read length is INT_MAX-2, for hdfsPread function take a parameter
+      // of int32. -2 offset can avoid JVM OutOfMemoryError.
+      size_t read_n =
+          std::min(n, static_cast<size_t>(std::numeric_limits<int>::max() - 2));
       tSize r = hdfs_->hdfsPread(fs_, file_, static_cast<tOffset>(offset), dst,
-                                 static_cast<tSize>(n));
+                                 static_cast<tSize>(read_n));
       if (r > 0) {
         dst += r;
         n -= r;
diff --git a/tensorflow/core/platform/logging.h b/tensorflow/core/platform/logging.h
index 17a5d5fb5b7099ad01c68d64f5528fa07cc2fa6f..7417ec8aff66de1f393d9c381bbf2b657a85307d 100644
--- a/tensorflow/core/platform/logging.h
+++ b/tensorflow/core/platform/logging.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"  // To pick up PLATFORM_define
 
 #if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) || \
-    defined(GOOGLE_LOGGING)
+    defined(GOOGLE_LOGGING) || defined(__EMSCRIPTEN__)
 #include "tensorflow/core/platform/google/build_config/logging.h"
 #else
 #include "tensorflow/core/platform/default/logging.h"
diff --git a/tensorflow/core/platform/platform.h b/tensorflow/core/platform/platform.h
index 0481b3687137c8b00fa84d33eb317a1a4f5be9df..671e5dd3c862febe5ef4be912525c7f2043857ed 100644
--- a/tensorflow/core/platform/platform.h
+++ b/tensorflow/core/platform/platform.h
@@ -40,7 +40,7 @@ limitations under the License.
 #elif defined(_WIN32)
 #define PLATFORM_WINDOWS
 
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__EMSCRIPTEN__)
 #define PLATFORM_POSIX
 
 // Require an outside macro to tell us if we're building for Raspberry Pi or
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index d87e5dcfe70cc802a8ac5865445f508ff795aa34..f2dff5a9b6441c5c39f6251c3b8c46dcd8639c74 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -86,6 +86,35 @@ class PosixEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() override {
+#ifdef __APPLE__
+    uint64_t tid64;
+    pthread_threadid_np(nullptr, &tid64);
+    return static_cast<int32>(tid64);
+#elif defined(__FreeBSD__)
+    // Has to be casted to long first, else this error appears:
+    // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
+    // is not allowed
+    return static_cast<int32>(static_cast<int64>(pthread_self()));
+#else
+    return static_cast<int32>(pthread_self());
+#endif
+  }
+
+  bool GetCurrentThreadName(string* name) override {
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
+    return false;
+#else
+    char buf[100];
+    int res = pthread_getname_np(pthread_self(), buf, static_cast<size_t>(100));
+    if (res != 0) {
+      return false;
+    }
+    *name = buf;
+    return true;
+#endif
+  }
+
   void SchedClosure(std::function<void()> closure) override {
     // TODO(b/27290852): Spawning a new thread here is wasteful, but
     // needed to deal with the fact that many `closure` functions are
diff --git a/tensorflow/core/platform/posix/net.cc b/tensorflow/core/platform/posix/net.cc
index 414ee6c50c22c2ae668c9b959b3f07c56041bbe6..c873f8fd4638e909c43f51a50458a6bef953f3b8 100644
--- a/tensorflow/core/platform/posix/net.cc
+++ b/tensorflow/core/platform/posix/net.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cerrno>
 #include <cstdlib>
+#include <cstring>
 #include <unordered_set>
 
 #include <netinet/in.h>
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index ea6066ac7bd6f89a6e07cb01b82c09f108f095ab..1561632a49af467aa7fcdd0e971458cd6d810cf2 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -45,6 +45,10 @@ limitations under the License.
 #include <thread>
 #endif
 
+#if TENSORFLOW_USE_NUMA
+#include "hwloc.h"  // TF:hwloc
+#endif
+
 namespace tensorflow {
 namespace port {
 
@@ -82,7 +86,9 @@ int NumTotalCPUs() {
 }
 
 int GetCurrentCPU() {
-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(__EMSCRIPTEN__)
+  return sched_getcpu();
+#elif defined(__linux__) && !defined(__ANDROID__)
   return sched_getcpu();
   // Attempt to use cpuid on all other platforms.  If that fails, perform a
   // syscall.
@@ -113,16 +119,94 @@ int NumHyperthreadsPerCore() {
   return (ht_per_core > 0) ? ht_per_core : 1;
 }
 
-bool NUMAEnabled() {
-  // Not yet implemented: coming soon.
-  return false;
+#ifdef TENSORFLOW_USE_NUMA
+namespace {
+static hwloc_topology_t hwloc_topology_handle;
+
+bool HaveHWLocTopology() {
+  // One time initialization
+  static bool init = []() {
+    if (hwloc_topology_init(&hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_init() failed";
+      return false;
+    }
+    if (hwloc_topology_load(hwloc_topology_handle)) {
+      LOG(ERROR) << "Call to hwloc_topology_load() failed";
+      return false;
+    }
+    return true;
+  }();
+  return init;
 }
 
-int NUMANumNodes() { return 1; }
+// Return the first hwloc object of the given type whose os_index
+// matches 'index'.
+hwloc_obj_t GetHWLocTypeIndex(hwloc_obj_type_t tp, int index) {
+  hwloc_obj_t obj = nullptr;
+  if (index >= 0) {
+    while ((obj = hwloc_get_next_obj_by_type(hwloc_topology_handle, tp, obj)) !=
+           nullptr) {
+      if (obj->os_index == index) break;
+    }
+  }
+  return obj;
+}
+}  // namespace
+#endif  // TENSORFLOW_USE_NUMA
+
+bool NUMAEnabled() { return (NUMANumNodes() > 1); }
+
+int NUMANumNodes() {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    int num_numanodes =
+        hwloc_get_nbobjs_by_type(hwloc_topology_handle, HWLOC_OBJ_NUMANODE);
+    return std::max(1, num_numanodes);
+  } else {
+    return 1;
+  }
+#else
+  return 1;
+#endif  // TENSORFLOW_USE_NUMA
+}
 
-void NUMASetThreadNodeAffinity(int node) {}
+void NUMASetThreadNodeAffinity(int node) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    // Find the corresponding NUMA node topology object.
+    hwloc_obj_t obj = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (obj) {
+      hwloc_set_cpubind(hwloc_topology_handle, obj->cpuset,
+                        HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
+    } else {
+      LOG(ERROR) << "Could not find hwloc NUMA node " << node;
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
+}
 
-int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
+int NUMAGetThreadNodeAffinity() {
+  int node_index = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_cpuset_t thread_cpuset = hwloc_bitmap_alloc();
+    hwloc_get_cpubind(hwloc_topology_handle, thread_cpuset,
+                      HWLOC_CPUBIND_THREAD);
+    hwloc_obj_t obj = nullptr;
+    // Return the first NUMA node whose cpuset is a (non-proper) superset of
+    // that of the current thread.
+    while ((obj = hwloc_get_next_obj_by_type(
+                hwloc_topology_handle, HWLOC_OBJ_NUMANODE, obj)) != nullptr) {
+      if (hwloc_bitmap_isincluded(thread_cpuset, obj->cpuset)) {
+        node_index = obj->os_index;
+        break;
+      }
+    }
+    hwloc_bitmap_free(thread_cpuset);
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node_index;
+}
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
@@ -152,12 +236,54 @@ void* Realloc(void* ptr, size_t size) { return realloc(ptr, size); }
 void Free(void* ptr) { free(ptr); }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_obj_t numa_node = GetHWLocTypeIndex(HWLOC_OBJ_NUMANODE, node);
+    if (numa_node) {
+      return hwloc_alloc_membind(hwloc_topology_handle, size,
+                                 numa_node->nodeset, HWLOC_MEMBIND_BIND,
+                                 HWLOC_MEMBIND_BYNODESET);
+    } else {
+      LOG(ERROR) << "Failed to find hwloc NUMA node " << node;
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
   return AlignedMalloc(size, minimum_alignment);
 }
 
-void NUMAFree(void* ptr, size_t size) { Free(ptr); }
+void NUMAFree(void* ptr, size_t size) {
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology()) {
+    hwloc_free(hwloc_topology_handle, ptr, size);
+    return;
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  Free(ptr);
+}
 
-int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
+int NUMAGetMemAffinity(const void* addr) {
+  int node = kNUMANoAffinity;
+#ifdef TENSORFLOW_USE_NUMA
+  if (HaveHWLocTopology() && addr) {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (!hwloc_get_area_memlocation(hwloc_topology_handle, addr, 4, nodeset,
+                                    HWLOC_MEMBIND_BYNODESET)) {
+      hwloc_obj_t obj = nullptr;
+      while ((obj = hwloc_get_next_obj_by_type(
+                  hwloc_topology_handle, HWLOC_OBJ_NUMANODE, obj)) != nullptr) {
+        if (hwloc_bitmap_isincluded(nodeset, obj->nodeset)) {
+          node = obj->os_index;
+          break;
+        }
+      }
+      hwloc_bitmap_free(nodeset);
+    } else {
+      LOG(ERROR) << "Failed call to hwloc_get_area_memlocation.";
+    }
+  }
+#endif  // TENSORFLOW_USE_NUMA
+  return node;
+}
 
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 003ab170fe8db2980bb9c7ad79bf90b523e36b76..083284c5ff99eadf08331d9f5a96a4bf0a189bd7 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/mman.h>
-#if !defined(__APPLE__)
+#if defined(__linux__)
 #include <sys/sendfile.h>
 #endif
 #include <sys/stat.h>
diff --git a/tensorflow/core/platform/default/protobuf.cc b/tensorflow/core/platform/protobuf.cc
similarity index 72%
rename from tensorflow/core/platform/default/protobuf.cc
rename to tensorflow/core/platform/protobuf.cc
index 548d5834e6f74b14a3ad16c00f5d3015f337f90a..c9e6f3bf5c6b498818001c9d6644d52af8b7f5d2 100644
--- a/tensorflow/core/platform/default/protobuf.cc
+++ b/tensorflow/core/platform/protobuf.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/default/protobuf.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
-const char* kProtobufInt64Typename = "::google::protobuf::int64";
-const char* kProtobufUint64Typename = "::google::protobuf::uint64";
+const char* kProtobufInt64Typename = "::tensorflow::protobuf_int64";
+const char* kProtobufUint64Typename = "::tensorflow::protobuf_uint64";
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/protobuf.h b/tensorflow/core/platform/protobuf.h
index fcbf1fc8c5054e110b9a0fe0217b97cecdd27088..59f4129adf40a5b6892a82ac705f73cdcf02886a 100644
--- a/tensorflow/core/platform/protobuf.h
+++ b/tensorflow/core/platform/protobuf.h
@@ -25,13 +25,31 @@ limitations under the License.
 // TensorFlow code should use the ::tensorflow::protobuf namespace to
 // refer to all protobuf APIs.
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf.h"
-#else
-#include "tensorflow/core/platform/default/protobuf.h"
+#ifndef TENSORFLOW_LITE_PROTOS
+#include "google/protobuf/io/tokenizer.h"
+#include "google/protobuf/descriptor.pb.h"
+#include "google/protobuf/descriptor.h"
+#include "google/protobuf/dynamic_message.h"
+#include "google/protobuf/text_format.h"
+#include "google/protobuf/util/json_util.h"
+#include "google/protobuf/util/type_resolver_util.h"
 #endif
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/arena.h"
+#include "google/protobuf/map.h"
+#include "google/protobuf/repeated_field.h"
+
 namespace tensorflow {
+
+namespace protobuf = ::google::protobuf;
+using protobuf_int64 = ::google::protobuf::int64;
+using protobuf_uint64 = ::google::protobuf::uint64;
+extern const char* kProtobufInt64Typename;
+extern const char* kProtobufUint64Typename;
+
 // Parses a protocol buffer contained in a string in the binary wire format.
 // Returns true on success. Note: Unlike protobuf's builtin ParseFromString,
 // this function has no size restrictions on the total size of the encoded
@@ -47,8 +65,19 @@ inline const string& ProtobufStringToString(const string& s) { return s; }
 // Set <dest> to <src>. Swapping is allowed, as <src> does not need to be
 // preserved.
 inline void SetProtobufStringSwapAllowed(string* src, string* dest) {
-  dest->swap(*src);
+  *dest = std::move(*src);
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// These versions of ProtobufStringToString and SetProtobufString get used by
+// tools/proto_text's generated code.  They have the same name as the versions
+// in core/platform/protobuf.h, so the generation code doesn't need to determine
+// if the type is Cord or string at generation time.
+inline string ProtobufStringToString(const Cord& s) { return s.ToString(); }
+inline void SetProtobufStringSwapAllowed(string* src, Cord* dest) {
+  dest->CopyFrom(*src);
 }
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/protobuf_compiler.h b/tensorflow/core/platform/protobuf_compiler.h
index 29679e00892fbd11d1e5242f62650f42ecef5577..916637d13a55044873b5309c1ea0acc9ac4eef47 100644
--- a/tensorflow/core/platform/protobuf_compiler.h
+++ b/tensorflow/core/platform/protobuf_compiler.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 #define TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf_compiler.h"
-#else
-#include "tensorflow/core/platform/default/protobuf_compiler.h"
-#endif
+#include "google/protobuf/compiler/importer.h"
 
 #endif  // TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/platform/rocm.h
similarity index 74%
rename from tensorflow/core/kernels/bounds_check.h
rename to tensorflow/core/platform/rocm.h
index ce6ec1012daacf915fee0ee7bb059306058361d5..1896cc3d84cbcc129491add1077caf01fb6dbe93 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/platform/rocm.h
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
-#define TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_ROCM_H_
+#define TENSORFLOW_CORE_PLATFORM_ROCM_H_
 
-#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/platform/platform.h"
+#include "tensorflow/stream_executor/rocm/rocm_activation.h"
 
-#endif  // TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_ROCM_H_
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 41184b6fd9ed12c0164f06e2c92816b2c99a03f7..7bc4d80db5b0ab31540f5c95d91ad29239458bce 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -14,7 +14,7 @@ load(
 )
 
 tf_cc_binary(
-    name = "s3_file_system.so",
+    name = "libs3_file_system_shared.so",
     srcs = [
         "aws_crypto.cc",
         "aws_crypto.h",
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index 44317f1a3e41831b903bd0044d53d1eba80168df..dac569088936b984f2c3167962ff4872e03decc3 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -69,12 +69,32 @@ void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
 }
 
 namespace {
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int ParseInteger(const char* str, size_t size) {
+  string integer_str(str, size);
+  std::istringstream ss(integer_str);
+  int level = 0;
+  ss >> level;
+  return level;
+}
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int64 LogLevelStrToInt(const char* tf_env_var_val) {
+  if (tf_env_var_val == nullptr) {
+    return 0;
+  }
+  return ParseInteger(tf_env_var_val, strlen(tf_env_var_val));
+}
+
 static const char* kAWSLoggingTag = "AWSLogging";
 
 Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
   Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
 
-  const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
+  const int64_t level = getenv("AWS_LOG_LEVEL")
+                            ? LogLevelStrToInt(getenv("AWS_LOG_LEVEL"))
+                            : tensorflow::internal::MinLogLevelFromEnv();
 
   switch (level) {
     case INFO:
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index 42822859f6e12372511f10809bd416b5054b7202..437e8a1c95632af71c3f2db2c4b35cfb48849b8a 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 123035cc8a69cd895ad92a505951cc3441b27988..129ee6c7a7503b680e90ccc68e39a3c838bb0e65 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/strong_hash.h b/tensorflow/core/platform/strong_hash.h
index 999fd2e4b309e3d5e59e8b016a8ee490f2fd7b55..a276780d13ba7fce31435b0b56d9b7e4e370dec9 100644
--- a/tensorflow/core/platform/strong_hash.h
+++ b/tensorflow/core/platform/strong_hash.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 // This is a strong keyed hash function interface for strings.
 // The hash function is deterministic on the content of the string within the
 // process. The key of the hash is an array of 2 uint64 elements.
-// A strong hash make it dificult, if not infeasible, to compute inputs that
+// A strong hash make it difficult, if not infeasible, to compute inputs that
 // hash to the same bucket.
 //
 // Usage:
diff --git a/tensorflow/core/platform/tensor_coding.cc b/tensorflow/core/platform/tensor_coding.cc
index 84601de39a6547ee78d190764616058b4595dc33..3280802bac42725132ef9ad22cc0439d45fca5ac 100644
--- a/tensorflow/core/platform/tensor_coding.cc
+++ b/tensorflow/core/platform/tensor_coding.cc
@@ -19,6 +19,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+#include "strings/cord_varint.h"
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 namespace tensorflow {
 namespace port {
@@ -66,5 +72,174 @@ void CopyFromArray(string* s, const char* base, size_t bytes) {
   s->assign(base, bytes);
 }
 
+class StringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit StringListEncoderImpl(string* out) : out_(out) {}
+  ~StringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    core::PutVarint32(out_, m.ByteSizeLong());
+    tensorflow::string serialized_message;
+    m.AppendToString(&serialized_message);
+    strings::StrAppend(&rest_, serialized_message);
+  }
+
+  void Append(const string& s) override {
+    core::PutVarint32(out_, s.length());
+    strings::StrAppend(&rest_, s);
+  }
+
+  void Finalize() override { strings::StrAppend(out_, rest_); }
+
+ private:
+  string* out_;
+  string rest_;
+};
+
+class StringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit StringListDecoderImpl(const string& in) : reader_(in) {}
+  ~StringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!core::GetVarint32(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.size())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    const char* data = reader_.data();
+    reader_.remove_prefix(size);
+    return data;
+  }
+
+ private:
+  StringPiece reader_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
+  return std::unique_ptr<StringListEncoder>(new StringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
+  return std::unique_ptr<StringListDecoder>(new StringListDecoderImpl(in));
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out) {
+  obj->Ref();
+  out->Clear();
+  // Defines a lambda to unref "obj" when Cord deletes this piece of
+  // memory. +[] converts the lambda to a C style function pointer.
+  auto cleanup = +[](absl::string_view donotcare, void* obj) {
+    reinterpret_cast<core::RefCounted*>(obj)->Unref();
+  };
+  out->AppendExternalMemory(absl::string_view(src.data(), src.size()), obj,
+                            cleanup);
+}
+
+void EncodeStringList(const string* strings, int64 n, Cord* out) {
+  out->Clear();
+  for (int i = 0; i < n; ++i) {
+    ::strings::CordAppendVarint(strings[i].size(), out);
+  }
+  for (int i = 0; i < n; ++i) {
+    out->Append(strings[i]);
+  }
+}
+
+bool DecodeStringList(const Cord& src, string* strings, int64 n) {
+  std::vector<uint32> sizes(n);
+  CordReader reader(src);
+  int64 tot = 0;
+  for (auto& v : sizes) {
+    if (!::strings::CordReaderReadVarint(&reader, &v)) return false;
+    tot += v;
+  }
+  if (tot != reader.Available()) {
+    return false;
+  }
+  string* data = strings;
+  for (int i = 0; i < n; ++i, ++data) {
+    auto size = sizes[i];
+    if (size > reader.Available()) {
+      return false;
+    }
+    gtl::STLStringResizeUninitialized(data, size);
+    reader.ReadN(size, gtl::string_as_array(data));
+  }
+  return true;
+}
+
+void CopyFromArray(Cord* c, const char* base, size_t bytes) {
+  c->CopyFrom(base, bytes);
+}
+
+class CordStringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit CordStringListEncoderImpl(Cord* out) : out_(out) {}
+  ~CordStringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    ::strings::CordAppendVarint(m.ByteSizeLong(), out_);
+    m.AppendToString(&rest_);
+  }
+
+  void Append(const string& s) override {
+    ::strings::CordAppendVarint(s.length(), out_);
+    rest_.append(s.data(), s.size());
+  }
+
+  void Finalize() override { out_->Append(rest_); }
+
+ private:
+  Cord* out_;
+  string rest_;
+};
+
+class CordStringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit CordStringListDecoderImpl(const Cord& in) : reader_(in) {}
+  ~CordStringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!::strings::CordReaderReadVarint(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.Available())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    tmp_.resize(size);
+    reader_.ReadN(size, tmp_.data());
+    return tmp_.data();
+  }
+
+ private:
+  CordReader reader_;
+  std::vector<char> tmp_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out) {
+  return std::unique_ptr<StringListEncoder>(new CordStringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in) {
+  return std::unique_ptr<StringListDecoder>(new CordStringListDecoderImpl(in));
+}
+
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tensor_coding.h b/tensorflow/core/platform/tensor_coding.h
index 6c6d75830de743b3e24676c1f57b6988aad11a0f..993ce537ffcd3884cfbb32d1edbdbfbe89f72658 100644
--- a/tensorflow/core/platform/tensor_coding.h
+++ b/tensorflow/core/platform/tensor_coding.h
@@ -21,14 +21,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
-#ifdef PLATFORM_GOOGLE
-#include "tensorflow/core/platform/google/cord_coding.h"
-#else
-#include "tensorflow/core/platform/default/string_coding.h"
-#endif
-
 namespace tensorflow {
 namespace port {
 
@@ -42,6 +37,15 @@ inline void CopyToArray(const string& src, char* dst) {
   memcpy(dst, src.data(), src.size());
 }
 
+// Copy subrange [pos:(pos + n)) from src to dst. If pos >= src.size() the
+// result is empty. If pos + n > src.size() the subrange [pos, size()) is
+// copied.
+inline void CopySubrangeToArray(const string& src, size_t pos, size_t n,
+                                char* dst) {
+  if (pos >= src.size()) return;
+  memcpy(dst, src.data() + pos, std::min(n, src.size() - pos));
+}
+
 // Store encoding of strings[0..n-1] in *out.
 void EncodeStringList(const string* strings, int64 n, string* out);
 
@@ -52,6 +56,75 @@ bool DecodeStringList(const string& src, string* strings, int64 n);
 // Assigns base[0..bytes-1] to *s
 void CopyFromArray(string* s, const char* base, size_t bytes);
 
+// Encodes sequences of strings and serialized protocol buffers into a string.
+// Normal usage consists of zero or more calls to Append() and a single call to
+// Finalize().
+class StringListEncoder {
+ public:
+  virtual ~StringListEncoder() = default;
+
+  // Encodes the given protocol buffer. This may not be called after Finalize().
+  virtual void Append(const protobuf::MessageLite& m) = 0;
+
+  // Encodes the given string. This may not be called after Finalize().
+  virtual void Append(const string& s) = 0;
+
+  // Signals end of the encoding process. No other calls are allowed after this.
+  virtual void Finalize() = 0;
+};
+
+// Decodes a string into sequences of strings (which may represent serialized
+// protocol buffers). Normal usage involves a single call to ReadSizes() in
+// order to retrieve the length of all the strings in the sequence. For each
+// size returned a call to Data() is expected and will return the actual
+// string.
+class StringListDecoder {
+ public:
+  virtual ~StringListDecoder() = default;
+
+  // Populates the given vector with the lengths of each string in the sequence
+  // being decoded. Upon returning the vector is guaranteed to contain as many
+  // elements as there are strings in the sequence.
+  virtual bool ReadSizes(std::vector<uint32>* sizes) = 0;
+
+  // Returns a pointer to the next string in the sequence, then prepares for the
+  // next call by advancing 'size' characters in the sequence.
+  virtual const char* Data(uint32 size) = 0;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// Store src contents in *out.  If backing memory for src is shared with *out,
+// will ref obj during the call and will arrange to unref obj when no
+// longer needed.
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out);
+
+// TODO(kmensah): Macro guard this with a check for Cord support.
+inline void CopyToArray(const Cord& src, char* dst) { src.CopyToArray(dst); }
+
+// Copy n bytes of src to dst. If pos >= src.size() the result is empty.
+// If pos + n > src.size() the subrange [pos, size()) is copied.
+inline void CopySubrangeToArray(const Cord& src, int64 pos, int64 n,
+                                char* dst) {
+  src.Subcord(pos, n).CopyToArray(dst);
+}
+
+// Store encoding of strings[0..n-1] in *out.
+void EncodeStringList(const string* strings, int64 n, Cord* out);
+
+// Decode n strings from src and store in strings[0..n-1].
+// Returns true if successful, false on parse error.
+bool DecodeStringList(const Cord& src, string* strings, int64 n);
+
+// Assigns base[0..bytes-1] to *c
+void CopyFromArray(Cord* c, const char* base, size_t bytes);
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in);
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 77ce2026d9d2cdda7ef1ea0ad6bb71050a6467af..e0e3dda7055b5cbe8f0e08be4a251232b8005fd2 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -84,6 +84,12 @@ class WindowsEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() override {
+    return static_cast<int32>(::GetCurrentThreadId());
+  }
+
+  bool GetCurrentThreadName(string* name) override { return false; }
+
   static VOID CALLBACK SchedClosureCallback(PTP_CALLBACK_INSTANCE Instance,
                                             PVOID Context, PTP_WORK Work) {
     CloseThreadpoolWork(Work);
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
index ba643a0fa8f92f58fbd88ac00fba3f663bb7e0f2..22875ac2bc4a059a26ef2a9ba44e1e51154bee6c 100644
--- a/tensorflow/core/platform/windows/error.h
+++ b/tensorflow/core/platform/windows/error.h
@@ -19,6 +19,9 @@ limitations under the License.
 #include <string>
 
 #include <Windows.h>
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
 
 namespace tensorflow {
 namespace internal {
diff --git a/tensorflow/core/platform/windows/wide_char.h b/tensorflow/core/platform/windows/wide_char.h
index 1b86abc3fa120feb331ad46a5221444c7d08effb..5aca95454f335119907b71d73afce94d8f99aeff 100644
--- a/tensorflow/core/platform/windows/wide_char.h
+++ b/tensorflow/core/platform/windows/wide_char.h
@@ -17,6 +17,9 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_WINDOWS_WIDE_CHAR_H_
 
 #include <Windows.h>
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
 #include <string>
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 2bf371276ef6013ac9f8e3c44623f9c7720cffb3..4efc15b7e5ff65085137d348e57f7311dd01db14 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_profiler_all_protos")
 
 tf_cc_binary(
     name = "profiler",
@@ -36,9 +37,35 @@ cc_library(
     ],
 )
 
+tf_proto_library(
+    name = "profiler_service_proto",
+    srcs = ["profiler_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = tf_profiler_all_protos() + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library(
+    name = "profiler_analysis_proto",
+    srcs = ["profiler_analysis.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [":profiler_service_proto"] + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "protos_all",
-    srcs = glob(["**/*.proto"]),
+    srcs = glob(
+        ["**/*.proto"],
+        exclude = [
+            "profiler_service.proto",
+            "profiler_analysis.proto",
+        ],
+    ),
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 57d76eb4cb9382790c80a0d55ee94b64e7b9dcdc..341421738e618e7406de05a126a49f4e1e336b93 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -35,10 +35,10 @@ bazel-bin/tensorflow/core/profiler/profiler \
     --profile_path=/tmp/train_dir/profile_xx
 tfprof> op -select micros,bytes,occurrence -order_by micros
 
-# To be open sourced...
-bazel-bin/tensorflow/python/profiler/profiler_ui \
-    --profile_path=/tmp/profiles/profile_1
+# Profiler ui available at: https://github.com/tensorflow/profiler-ui
+python ui.py --profile_context_path=/tmp/train_dir/profile_xx
 ```
+
 ![ProfilerUI](g3doc/profiler_ui.jpg)
 
 ```python
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 8dcfde9a2adbd3a1774bce8506a84f80ca099c34..da3039ae3ceba103882d1315c6293af5560e1862 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -6,6 +6,8 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 
 cc_library(
     name = "tfprof_stats",
@@ -365,3 +367,43 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
+
+tf_cuda_library(
+    name = "traceme_recorder",
+    srcs = ["traceme_recorder.cc"],
+    hdrs = ["traceme_recorder.h"],
+    visibility = [
+        "//learning/brain/runtime:__pkg__",  # xprof_bridge
+        "//perftools/accelerators/xprof/xprofilez:__pkg__",  # alias xprof::TraceMeRecorder
+        "//tensorflow/core/profiler/internal/cpu:__pkg__",  # host_tracer
+        "//tensorflow/core/profiler/lib:__pkg__",  # traceme
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "traceme_recorder_test",
+    srcs = ["traceme_recorder_test.cc"],
+    deps = [
+        ":traceme_recorder",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cuda_library(
+    name = "profiler_interface",
+    hdrs = [
+        "profiler_interface.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b94453c0a4be5e3c886277356b23ef0c5df5b1c9
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/BUILD
@@ -0,0 +1,44 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+
+tf_cuda_library(
+    name = "host_tracer",
+    srcs = [
+        "host_tracer.cc",
+    ],
+    hdrs = [
+        "host_tracer.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "host_tracer_test",
+    srcs = ["host_tracer_test.cc"],
+    deps = [
+        ":host_tracer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core/profiler/lib:traceme",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fb296646883cf2215d8df8240219ddce04fb7d0
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/platform/env_time.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+/* static */ std::unique_ptr<HostTracer> HostTracer::Create(
+    int host_trace_level) {
+  return absl::WrapUnique(new HostTracer(host_trace_level));
+}
+HostTracer::HostTracer(int host_trace_level)
+    : host_trace_level_(host_trace_level) {}
+
+HostTracer::~HostTracer() { Stop().IgnoreError(); }
+
+Status HostTracer::Start() {
+  if (recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder already started");
+  }
+  recording_ = TraceMeRecorder::Start(host_trace_level_);
+  if (!recording_) {
+    return Status(error::INTERNAL, "Failed to start TraceMeRecorder");
+  }
+  return Status::OK();
+}
+
+Status HostTracer::Stop() {
+  if (!recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder not started");
+  }
+  events_ = TraceMeRecorder::Stop();
+  recording_ = false;
+  return Status::OK();
+}
+
+constexpr char kUserMetadataMarker = '#';
+
+Status HostTracer::CollectData(RunMetadata* run_metadata) {
+  auto step_stats_collector =
+      absl::make_unique<StepStatsCollector>(run_metadata->mutable_step_stats());
+  return CollectDataToCollector(step_stats_collector.get());
+}
+
+Status HostTracer::CollectDataToCollector(
+    StepStatsCollector* step_stats_collector) {
+  if (events_.empty() && recording_) {
+    events_ = TraceMeRecorder::Collect();
+  }
+  // Pair up start and end events, and add complete events to trace_entries.
+  absl::flat_hash_map<uint64, uint64> end_times;
+  for (const auto& thread : events_) {
+    for (const auto& event : thread.events) {
+      if (event.end_time && !event.start_time) {
+        end_times.emplace(event.activity_id, event.end_time);
+      }
+    }
+  }
+
+  const string cpu_name = "/host:CPU";
+  for (auto& thread : events_) {
+    step_stats_collector->SaveThreadName(cpu_name, thread.thread.tid,
+                                         thread.thread.name);
+    for (auto& event : thread.events) {
+      if (!event.end_time) {
+        auto it = end_times.find(event.activity_id);
+        if (it != end_times.end()) event.end_time = it->second;
+      }
+      if (event.start_time && event.end_time) {
+        NodeExecStats* ns = new NodeExecStats;
+        if (event.name.back() != kUserMetadataMarker) {
+          ns->set_node_name(std::move(event.name));
+        } else {
+          // Expect the format will be "<name>#<metadata>#"
+          std::vector<absl::string_view> parts =
+              absl::StrSplit(event.name, kUserMetadataMarker);
+          if (parts.size() >= 2) {
+            ns->set_node_name(string(parts[0]));
+            ns->set_timeline_label(string(parts[1]));
+          } else {
+            ns->set_node_name(std::move(event.name));
+          }
+        }
+        ns->set_all_start_micros(event.start_time / EnvTime::kMicrosToNanos);
+        ns->set_all_end_rel_micros((event.end_time - event.start_time) /
+                                   EnvTime::kMicrosToNanos);
+        ns->set_thread_id(thread.thread.tid);
+        // TODO(fishx): Add thread name to RunMetadata
+        step_stats_collector->Save(cpu_name, ns);
+      }
+    }
+  }
+  events_.clear();
+  step_stats_collector->Finalize();
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.h b/tensorflow/core/profiler/internal/cpu/host_tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6340c2eddc8ee66d4ffb2ad2829e15f34cc38ec
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+// Controls TraceMeRecorder and converts TraceMeRecorder::Events into
+// RunMetadata messages.
+//
+// Thread-safety: This class is go/thread-compatible.
+class HostTracer : public ProfilerInterface {
+ public:
+  static std::unique_ptr<HostTracer> Create(int host_trace_level);
+
+  ~HostTracer();
+
+  // Starts recording TraceMes.
+  Status Start() override;
+
+  // Stops recording TraceMes.
+  Status Stop() override;
+
+  // Populates user traces and thread names in response.
+  // The user traces and thread names are in no particular order.
+  Status CollectData(RunMetadata* run_metadata) override;
+
+  Status CollectDataToCollector(StepStatsCollector* step_stats_collector);
+
+ private:
+  explicit HostTracer(int host_trace_level);
+
+  // Level of host tracing.
+  const int host_trace_level_;
+
+  // True if currently recording.
+  bool recording_ = false;
+
+  // Container of all traced events.
+  TraceMeRecorder::Events events_;
+};
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51f9c6a8ca6e52b21d0335d83f321cc4bbc331dc
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/types/optional.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+NodeExecStats MakeNodeStats(const string& name, uint64 thread_id,
+                            const string& label = "") {
+  NodeExecStats ns;
+  ns.set_node_name(name);
+  ns.set_thread_id(thread_id);
+  if (!label.empty()) {
+    ns.set_timeline_label(label);
+  }
+  return ns;
+}
+
+class NodeStatsMatcher {
+ public:
+  explicit NodeStatsMatcher(const NodeExecStats& expected)
+      : expected_(expected) {}
+
+  bool MatchAndExplain(const NodeExecStats& p,
+                       ::testing::MatchResultListener* /* listener */) const {
+    return p.node_name() == expected_.node_name() &&
+           p.thread_id() == expected_.thread_id() &&
+           p.timeline_label() == expected_.timeline_label();
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << expected_.DebugString(); }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "not equal to expected message: " << expected_.DebugString();
+  }
+
+ private:
+  const NodeExecStats expected_;
+};
+
+inline ::testing::PolymorphicMatcher<NodeStatsMatcher> EqualsNodeStats(
+    const NodeExecStats& expected) {
+  return ::testing::MakePolymorphicMatcher(NodeStatsMatcher(expected));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEvents) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  { TraceMe traceme("world"); }
+  { TraceMe traceme("contains#inside"); }
+  { TraceMe traceme("good#key1=value1#"); }
+  { TraceMe traceme("morning#key1=value1,key2=value2#"); }
+  { TraceMe traceme("incomplete#key1=value1,key2#"); }
+  TF_ASSERT_OK(tracer->Stop());
+
+  RunMetadata run_metadata;
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+
+  EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 1);
+  EXPECT_EQ(run_metadata.step_stats().dev_stats(0).node_stats_size(), 6);
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      UnorderedElementsAre(
+          EqualsNodeStats(MakeNodeStats("hello", thread_id)),
+          EqualsNodeStats(MakeNodeStats("world", thread_id)),
+          EqualsNodeStats(MakeNodeStats("contains#inside", thread_id)),
+          EqualsNodeStats(MakeNodeStats("good", thread_id, "key1=value1")),
+          EqualsNodeStats(
+              MakeNodeStats("morning", thread_id, "key1=value1,key2=value2")),
+          EqualsNodeStats(
+              MakeNodeStats("incomplete", thread_id, "key1=value1,key2"))));
+}
+
+void ValidateResult(const RunMetadata& run_metadata, const string& trace_name) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      ElementsAre(EqualsNodeStats(MakeNodeStats(trace_name, thread_id))));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEventsBetweenTracing) {
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+  RunMetadata run_metadata;
+  RunMetadata run_metadata2;
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+  { TraceMe traceme("world"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata2));
+  TF_ASSERT_OK(tracer->Stop());
+
+  ValidateResult(run_metadata, "hello");
+  ValidateResult(run_metadata2, "world");
+}
+
+}  // namespace
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..35f90e9bfc01f7bba0d0cc6d65cc23ea549469a1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@@ -0,0 +1,25 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "tracer",
+    srcs = [
+        "tracer.cc",
+    ],
+    hdrs = [
+        "tracer.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:device_tracer",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/gpu/tracer.cc b/tensorflow/core/profiler/internal/gpu/tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1cb54161c70dbab52a661065ec874497d57b61b
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/tracer.cc
@@ -0,0 +1,59 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/gpu/tracer.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace gpu {
+
+/* static */ std::unique_ptr<ProfilerInterface> Tracer::Create() {
+  return absl::WrapUnique(new Tracer());
+}
+
+Status Tracer::Start() {
+  device_tracer_ = CreateDeviceTracer();
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "Failed to create device tracer.");
+  }
+  return device_tracer_->Start();
+}
+
+Status Tracer::Stop() {
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No running device tracer.");
+  }
+  return device_tracer_->Stop();
+}
+
+Status Tracer::CollectData(RunMetadata* run_metadata) {
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No running device tracer.");
+  }
+  auto step_stats_collector =
+      absl::make_unique<StepStatsCollector>(run_metadata->mutable_step_stats());
+  Status s = device_tracer_->Collect(step_stats_collector.get());
+  step_stats_collector->Finalize();
+  return s;
+}
+
+Tracer::Tracer() {}
+
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/tracer.h b/tensorflow/core/profiler/internal/gpu/tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7765432de96b3eda20dbaef089126abec0d234f
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/tracer.h
@@ -0,0 +1,48 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
+
+#include "tensorflow/core/platform/device_tracer.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace gpu {
+
+class Tracer : public ProfilerInterface {
+ public:
+  static std::unique_ptr<ProfilerInterface> Create();
+
+  Status Start() override;
+
+  Status Stop() override;
+
+  Status CollectData(RunMetadata* run_metadata) override;
+
+ private:
+  Tracer();
+
+  // Trace is neither copyable nor movable.
+  Tracer(const Tracer&) = delete;
+  Tracer& operator=(const Tracer&) = delete;
+
+  std::unique_ptr<DeviceTracer> device_tracer_;
+};
+
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..144c4bb44d7a0c4c0e565d466cb1fd3b1506dae2
--- /dev/null
+++ b/tensorflow/core/profiler/internal/profiler_interface.h
@@ -0,0 +1,49 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Interface for tensorflow profiler plugins.
+//
+// ProfileSession calls each of these methods at most once per instance, and
+// implementations can rely on that guarantee for simplicity.
+//
+// Thread-safety: Implementations are only required to be go/thread-compatible.
+// ProfileSession is go/thread-safe and synchronizes access to ProfilerInterface
+// instances.
+class ProfilerInterface {
+ public:
+  virtual ~ProfilerInterface() = default;
+
+  // Starts profiling.
+  virtual Status Start() = 0;
+
+  // Stops profiling.
+  virtual Status Stop() = 0;
+
+  // Moves collected profile data into run_metadata.
+  virtual Status CollectData(RunMetadata* run_metadata) = 0;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
diff --git a/tensorflow/core/profiler/internal/runtime/BUILD b/tensorflow/core/profiler/internal/runtime/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2e383f1716f304bf321b2e82ad85582d643d8d8c
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/BUILD
@@ -0,0 +1,24 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "eager_profiler",
+    srcs = [
+        "eager_profiler.cc",
+    ],
+    hdrs = [
+        "eager_profiler.h",
+    ],
+    deps = [
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/runtime/eager_profiler.cc b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aad692b01f6fa09595f0035bc2530bf210cb7e4e
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
@@ -0,0 +1,61 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/runtime/eager_profiler.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace runtime {
+
+TraceCollector::TraceCollector(EagerContext* const eager_context)
+    : context_(eager_context) {}
+
+void TraceCollector::BeforeClearRunMetadata() {
+  run_metadata_.MergeFrom(*context_->RunMetadataProto());
+}
+
+Status TraceCollector::CollectData(RunMetadata* run_metadata) {
+  run_metadata->MergeFrom(run_metadata_);
+  return Status::OK();
+}
+
+/* static */ std::unique_ptr<ProfilerInterface> EagerProfiler::Create(
+    EagerContext* const eager_context) {
+  return absl::WrapUnique(new EagerProfiler(eager_context));
+}
+
+Status EagerProfiler::Start() {
+  if (context_ == nullptr) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No eager context attached.");
+  }
+  return context_->RegisterRunMetadataListener(&collector_);
+}
+
+Status EagerProfiler::Stop() {
+  collector_.BeforeClearRunMetadata();
+  context_->ClearRunMetadataListener();
+  return Status::OK();
+}
+
+Status EagerProfiler::CollectData(RunMetadata* run_metadata) {
+  return collector_.CollectData(run_metadata);
+}
+
+EagerProfiler::EagerProfiler(EagerContext* const eager_context)
+    : context_(eager_context), collector_(eager_context) {}
+
+}  // namespace runtime
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/runtime/eager_profiler.h b/tensorflow/core/profiler/internal/runtime/eager_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..7135355e6ff16a240a434c5fae2b9d6140c4a3ef
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/eager_profiler.h
@@ -0,0 +1,64 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace runtime {
+
+class TraceCollector : public RunMetadataListener {
+ public:
+  TraceCollector(EagerContext* const eager_context);
+
+  void BeforeClearRunMetadata() override;
+
+  Status CollectData(RunMetadata* run_metadata);
+
+ private:
+  RunMetadata run_metadata_;
+  EagerContext* const context_;
+};
+
+class EagerProfiler : public ProfilerInterface {
+ public:
+  static std::unique_ptr<ProfilerInterface> Create(
+      EagerContext* const eager_context);
+
+  Status Start() override;
+
+  Status Stop() override;
+
+  Status CollectData(RunMetadata* run_metadata) override;
+
+ private:
+  EagerProfiler(EagerContext* const eager_context);
+
+  // Trace is neither copyable nor movable.
+  EagerProfiler(const EagerProfiler&) = delete;
+  EagerProfiler& operator=(const EagerProfiler&) = delete;
+
+  EagerContext* const context_;
+  TraceCollector collector_;
+};
+
+}  // namespace runtime
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
index 3dce1d85db35436d162e73bf0946b320b899d5eb..6e9178c7164141db95a7470833eec2630faacd3e 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -182,7 +182,7 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     // TODO(xpan): Is it the right choice?
     root_->formatted_str = display_str;
   }
-  // Populate the chidren field.
+  // Populate the children field.
   auto* pre_pb = root_->mutable_proto();
   for (auto& show_node : show_nodes) {
     pre_pb->clear_children();
diff --git a/tensorflow/core/profiler/internal/tfprof_tensor_test.cc b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
index 7fa79d23d853229b32ebd93ddb0640d9c75b323d..b9eb1a48924b302d6d65a17a2d406c6df6c99e42 100644
--- a/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
@@ -63,7 +63,6 @@ TEST_F(TFProfTensorTest, Basics) {
                "", {});
   const GraphNodeProto& root = tf_stats_->ShowGraphNode("scope", opts);
 
-  GraphNodeProto expected;
   EXPECT_EQ(root.children(0).name(), "DW");
   EXPECT_GT(root.children(0).tensor_value().value_double_size(), 10);
   EXPECT_EQ(root.children(1).name(), "DW2");
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0369e0b96de2bb3bea19d1e9b2b280e24ecb0112
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+// To avoid unneccesary synchronization between threads, each thread has a
+// ThreadLocalRecorder that independently records its events.
+//
+// Events are stored in an EventQueue implemented as a linked-list of blocks,
+// with start and end pointers:
+//  [ events........ | next-]--> [ events......... | next ]
+//  ^start_block  ^start         ^end_block  ^end
+//
+// Record() writes at end, and then advances it, allocating a block if needed.
+// Clear() takes ownership of events in the range [start, end).
+// The end pointer is atomic so these can be concurrent.
+//
+// If a thread dies, the ThreadLocalRecorder's destructor hands its data off to
+// the orphaned_events list.
+
+#include <string>
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Default value for g_trace_level when tracing is disabled
+constexpr static int kTracingDisabled = -1;
+
+namespace internal {
+std::atomic<int> g_trace_level = ATOMIC_VAR_INIT(kTracingDisabled);
+}  // namespace internal
+
+namespace {
+
+class ThreadLocalRecorder;
+
+struct Data {
+  // Lock for only rare events - start/stop, thread death.
+  mutex global_lock;
+  // Map of the static container instances (thread_local storage) for each
+  // thread, that store the trace events.
+  absl::flat_hash_map<uint64, ThreadLocalRecorder*> threads
+      GUARDED_BY(global_lock);
+  // Events traced from threads that died during tracing.
+  TraceMeRecorder::Events orphaned_events GUARDED_BY(global_lock);
+}* g_data = nullptr;
+
+// A single-producer single-consumer queue of Events.
+// Only the owner thread can write events, writing is lock-free.
+// Consume is also lock-free in this class.
+//
+// Internally, we have a linked list of blocks containing numbered slots.
+// start is the first occupied slot, end is the first unoccupied slot.
+class EventQueue {
+ public:
+  EventQueue()
+      : start_block_(new Block{0, nullptr}), end_block_(start_block_) {}
+
+  // REQUIRES: Consume() was called since the last Push().
+  // Memory should be deallocated and trace events destroyed on destruction.
+  // This doesn't require global lock as this discards all the stored trace
+  // events and we assume of destruction of this class only after the last
+  // Push() has been called.
+  ~EventQueue() {
+    DCHECK_EQ(start_, end_.load()) << "EventQueue destroyed without Consume()";
+    delete end_block_;
+  }
+
+  // Add a new event to the back of the queue. Fast and lock-free.
+  void Push(TraceMeRecorder::Event&& event) {
+    uint64 end = end_.load(std::memory_order_relaxed);
+    new (&end_block_->events[end++ - end_block_->start].event)
+        TraceMeRecorder::Event(std::move(event));
+    if (ABSL_PREDICT_FALSE(end - end_block_->start == Block::kLength)) {
+      auto* new_block = new Block{end, nullptr};
+      end_block_->next = new_block;
+      end_block_ = new_block;
+    }
+    end_.store(end, std::memory_order_release);  // Write index after contents.
+  }
+
+  // Retrieve and remove all events in the queue.
+  std::vector<TraceMeRecorder::Event> Consume() {
+    // Read index before contents.
+    uint64 end = end_.load(std::memory_order_acquire);
+    std::vector<TraceMeRecorder::Event> result;
+    result.reserve(end - start_);
+    while (start_ != end) {
+      Shift(&result);
+    }
+    return result;
+  }
+
+ private:
+  // Shift one event off the front of the queue into *out.
+  void Shift(std::vector<TraceMeRecorder::Event>* out) {
+    // Move the next event into the output.
+    auto& event = start_block_->events[start_++ - start_block_->start].event;
+    out->push_back(std::move(event));
+    event.~Event();  // Events must be individually destroyed.
+    // If we reach the end of a block, we own it and should delete it.
+    // The next block is present: end always points to something.
+    if (start_ - start_block_->start == Block::kLength) {
+      auto* next_block = start_block_->next;
+      delete start_block_;
+      start_block_ = next_block;
+    }
+  }
+
+  // The number of slots in a block. Chosen so that the block fits in 64k.
+  struct Block {
+    static constexpr size_t kLength =
+        ((1 << 16) - (sizeof(uint64) + sizeof(std::atomic<Block*>))) /
+        sizeof(TraceMeRecorder::Event);
+
+    const uint64 start;  // The number of the first slot.
+    Block* next;
+    // Defer construction of Event until the data is available.
+    // Must also destroy manually, as the block may not fill entirely.
+    union MaybeEvent {
+      MaybeEvent() {}
+      ~MaybeEvent() {}
+      TraceMeRecorder::Event event;
+    } events[kLength];
+  };
+
+  // Head of list for reading. Only accessed by consumer thread.
+  Block* start_block_;
+  uint64 start_ = 0;
+  // Tail of list for writing. Accessed by producer thread.
+  Block* end_block_;
+  std::atomic<uint64> end_ = {0};  // Atomic: also read by consumer thread.
+};
+
+class ThreadLocalRecorder {
+ public:
+  // The recorder is created the first time Record() is called on a thread.
+  ThreadLocalRecorder() {
+    auto* env = Env::Default();
+    info_.tid = env->GetCurrentThreadId();
+    env->GetCurrentThreadName(&info_.name);
+    mutex_lock lock(g_data->global_lock);
+    g_data->threads.emplace(info_.tid, this);
+  }
+
+  // The destructor is called when the thread shuts down early.
+  // We unregister this thread, and move its events to orphaned_events.
+  ~ThreadLocalRecorder() {
+    mutex_lock lock(g_data->global_lock);
+    g_data->threads.erase(info_.tid);
+    g_data->orphaned_events.push_back(Clear());
+  }
+
+  // This is the performance-critical part!
+  void Record(TraceMeRecorder::Event&& event) { queue_.Push(std::move(event)); }
+
+  TraceMeRecorder::ThreadEvents Clear()
+      EXCLUSIVE_LOCKS_REQUIRED(g_data->global_lock) {
+    return {info_, queue_.Consume()};
+  }
+
+ private:
+  TraceMeRecorder::ThreadInfo info_;
+  EventQueue queue_;
+};
+
+// Gather events from all active threads, and clear their buffers. The global
+// lock is held, so no threads can be added/removed for the duration while we
+// consume the collected trace entries. This will block any new thread and also
+// the starting and stopping of TraceMeRecorder, hence, this is performance
+// critical and should be kept fast.
+TraceMeRecorder::Events Clear() EXCLUSIVE_LOCKS_REQUIRED(g_data->global_lock) {
+  TraceMeRecorder::Events result;
+  std::swap(g_data->orphaned_events, result);
+  for (const auto& entry : g_data->threads) {
+    auto* recorder = entry.second;
+    result.push_back(recorder->Clear());
+  }
+  return result;
+}
+
+}  // namespace
+
+bool TraceMeRecorder::Start(int level) {
+  level = std::max(0, level);
+  mutex_lock lock(g_data->global_lock);
+  int expected = kTracingDisabled;
+  if (!internal::g_trace_level.compare_exchange_strong(
+          expected, level, std::memory_order_acq_rel)) {
+    return false;
+  }
+  // We may have old events in buffers because Record() raced with Stop().
+  Clear();
+  return true;
+}
+
+
+void TraceMeRecorder::Record(Event event) {
+  static thread_local ThreadLocalRecorder thread_local_recorder;
+  thread_local_recorder.Record(std::move(event));
+}
+
+// Only one thread is expected to call Stop() as first instance of XprofSession
+// prevents another XprofSession from doing any profiling.
+TraceMeRecorder::Events TraceMeRecorder::Stop() {
+  mutex_lock lock(g_data->global_lock);
+  if (internal::g_trace_level.exchange(
+          kTracingDisabled, std::memory_order_acq_rel) == kTracingDisabled) {
+    return {};
+  }
+  return Clear();
+}
+
+TraceMeRecorder::Events TraceMeRecorder::Collect() {
+  mutex_lock lock(g_data->global_lock);
+  if (internal::g_trace_level.load(std::memory_order_acquire) ==
+      kTracingDisabled) {
+    return {};
+  }
+  return Clear();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+REGISTER_MODULE_INITIALIZER(traceme_recorder, {
+  tensorflow::profiler::g_data = new tensorflow::profiler::Data();
+
+  // Workaround for b/35097229, the first block-scoped thread_local can
+  // trigger false positives in the heap checker. Currently triggered by
+  // //perftools/accelerators/xprof/xprofilez/integration_tests:xla_hlo_trace_test
+  static thread_local tensorflow::string fix_deadlock ABSL_ATTRIBUTE_UNUSED;
+});
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e66b1e5bb3f975ca20d43a67c3eec23cd8d16c1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder.h
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
+
+#include <atomic>
+#include <vector>
+#include "absl/base/optimization.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+
+namespace internal {
+extern std::atomic<int> g_trace_level;
+}  // namespace internal
+
+// TraceMeRecorder is a singleton repository of TraceMe events.
+// It can be safely and cheaply appended to by multiple threads.
+//
+// Start() and Stop() must be called in pairs, Stop() returns the events added
+// since the previous Start().
+//
+// This is the backend for TraceMe instrumentation.
+// The profiler starts the recorder, the TraceMe constructor records begin
+// events, and the destructor records end events.
+// The profiler then stops the recorder and finds start/end pairs. (Unpaired
+// start/end events are discarded at that point).
+class TraceMeRecorder {
+ public:
+  // An Event is either the start of a TraceMe, the end of a TraceMe, or both.
+  // Times are in ns since the Unix epoch.
+  struct Event {
+    uint64 activity_id;
+    string name;
+    uint64 start_time;  // 0 = missing
+    uint64 end_time;    // 0 = missing
+  };
+  struct ThreadInfo {
+    int64 tid;
+    string name;
+  };
+  struct ThreadEvents {
+    const ThreadInfo thread;
+    std::vector<Event> events;
+  };
+  using Events = std::vector<ThreadEvents>;
+
+  // Starts recording of TraceMe().
+  // Only traces <= level will be recorded.
+  // Level must be >= 0.
+  // If level is 0, no traces will be recorded.
+  static bool Start(int level);
+
+  // Stops recording and returns events recorded since Start().
+  static Events Stop();
+
+  // Returns events recorded till now without stopping the recording. Empty
+  // container is returned if the recorder was already stopped.
+  static Events Collect();
+
+  // Returns whether we're currently recording. Racy, but cheap!
+  static inline bool Active(int level = 1) {
+    return ABSL_PREDICT_FALSE(
+        internal::g_trace_level.load(std::memory_order_acquire) >= level);
+  }
+
+  static void Record(Event);
+
+ private:
+  // No copy and assignment
+  TraceMeRecorder(const TraceMeRecorder&) = delete;
+  TraceMeRecorder& operator=(const TraceMeRecorder&) = delete;
+
+  // Implementation of g_trace_level must be lock-free for faster execution
+  // of the TraceMe() public API. This can be commented (if compilation is
+  // failing) but execution might be slow (even when host tracing is disabled).
+  static_assert(ATOMIC_INT_LOCK_FREE == 2, "Assumed atomic<int> was lock free");
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec588af1d6048fa709d85e86ea2e5e546f8300d1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
@@ -0,0 +1,211 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+#include <atomic>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/synchronization/notification.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env_time.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+MATCHER_P(Named, name, "") { return arg.name == name; }
+
+constexpr static uint64 kNanosInSec = 1000000000;
+
+TEST(RecorderTest, SingleThreaded) {
+  uint64 start_time = Env::Default()->NowNanos();
+  uint64 end_time = start_time + kNanosInSec;
+
+  TraceMeRecorder::Record({1, "before", start_time, end_time});
+  TraceMeRecorder::Start(/*level=*/1);
+  TraceMeRecorder::Record({2, "during1", start_time, end_time});
+  TraceMeRecorder::Record({3, "during2", start_time, end_time});
+  auto results = TraceMeRecorder::Stop();
+  TraceMeRecorder::Record({4, "after", start_time, end_time});
+
+  ASSERT_EQ(results.size(), 1);
+  EXPECT_THAT(results[0].events,
+              ::testing::ElementsAre(Named("during1"), Named("during2")));
+}
+
+TEST(RecorderTest, CollectionBeforeStop) {
+  uint64 start_time = Env::Default()->NowNanos();
+  uint64 end_time = start_time + kNanosInSec;
+
+  TraceMeRecorder::Record({1, "ignored", start_time, end_time});
+  TraceMeRecorder::Start(/*level=*/1);
+  TraceMeRecorder::Record({2, "during1", start_time, end_time});
+  TraceMeRecorder::Record({3, "during2", start_time, end_time});
+  auto collected_results = TraceMeRecorder::Collect();
+  TraceMeRecorder::Record({4, "after_collect", start_time, end_time});
+  auto stopped_results = TraceMeRecorder::Stop();
+  TraceMeRecorder::Record({5, "after_stop", start_time, end_time});
+  auto results_after_stop = TraceMeRecorder::Collect();
+
+  ASSERT_EQ(collected_results.size(), 1);
+  EXPECT_THAT(collected_results[0].events,
+              ::testing::ElementsAre(Named("during1"), Named("during2")));
+
+  ASSERT_EQ(stopped_results.size(), 1);
+  EXPECT_THAT(stopped_results[0].events,
+              ::testing::ElementsAre(Named("after_collect")));
+
+  ASSERT_EQ(results_after_stop.size(), 0);
+}
+
+void SpinNanos(int nanos) {
+  uint64 deadline = Env::Default()->NowNanos() + nanos;
+  while (Env::Default()->NowNanos() < deadline) {
+  }
+}
+
+// Checks the functional behavior of the recorder, when used from several
+// unsynchronized threads.
+//
+// Each thread records a stream of events.
+//   Thread 0: activity=0, activity=1, activity=2, ...
+//   Thread 1: activity=0, activity=1, activity=2, ...
+//   ...
+//
+// We turn the recorder on and off repeatedly in sessions, expecting to see:
+//   - data from every thread (eventually - maybe not every session)
+//   - unbroken sessions: a consecutive sequence of IDs from each thread
+//   - gaps between sessions: a thread's IDs should be non-consecutive overall
+TEST(RecorderTest, Multithreaded) {
+  constexpr static int kNumThreads = 4;
+
+  // Start several threads writing events.
+  absl::Notification start;
+  absl::Notification stop;
+  thread::ThreadPool pool(Env::Default(), "testpool", kNumThreads);
+  std::atomic<int> thread_count = {0};
+  for (int i = 0; i < kNumThreads; i++) {
+    pool.Schedule([&start, &stop, &thread_count, i] {
+      uint64 j = 0;
+      bool was_active = false;
+      auto record_event = [&j, i]() {
+        uint64 start_time = Env::Default()->NowNanos();
+        uint64 end_time = start_time + kNanosInSec;
+        TraceMeRecorder::Record({/*activity_id=*/j++,
+                                 /*name=*/strings::StrCat(i), start_time,
+                                 end_time});
+      };
+      thread_count.fetch_add(1, std::memory_order_relaxed);
+      start.WaitForNotification();
+      while (!stop.HasBeenNotified()) {
+        // Mimicking production usage, we guard with a racy check.
+        // In principle this isn't needed, but a feedback loop can form:
+        // 1) many events accumulate while the recorder is off
+        // 2) clearing/analyzing these events is slow
+        // 3) while clearing, more events are accumulating, causing 1
+        if (TraceMeRecorder::Active()) {
+          record_event();
+          was_active = true;
+        }
+        // Record some events after the recorder is no longer active to simulate
+        // point 1 and 3.
+        if (was_active && !TraceMeRecorder::Active()) {
+          record_event();
+          record_event();
+          was_active = false;
+        }
+        // This snowballs into OOM in some configurations, causing flakiness.
+        // Keep this big enough to prevent OOM and small enough such that
+        // each thread records at least one event.
+        SpinNanos(10);
+      }
+    });
+  }
+
+  // For each thread, keep track of which events we've seen.
+  struct {
+    bool split_session = false;
+    bool overlapping_sessions = false;
+    std::set<uint64> events;
+  } thread_state[kNumThreads];
+  // We expect each thread to eventually have multiple events, not all in a
+  // contiguous range.
+  auto done = [&thread_state] {
+    for (const auto& t : thread_state) {
+      if (t.events.size() < 2) return false;
+    }
+    return true;
+  };
+
+  // Wait while all the threads are spun up.
+  while (thread_count.load(std::memory_order_relaxed) < kNumThreads) {
+    LOG(INFO) << "Waiting for all threads to spin up...";
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
+  }
+
+  // We will probably be done after two iterations (with each thread getting
+  // some events each iteration). No guarantees as all the threads might not get
+  // scheduled in a session, so try for a while.
+  start.Notify();
+  constexpr static int kMaxIters = 100;
+  for (int iters = 0; iters < kMaxIters && !done(); ++iters) {
+    LOG(INFO) << "Looping until convergence, iteration: " << iters;
+    TraceMeRecorder::Start(/*level=*/1);
+    Env::Default()->SleepForMicroseconds(100 * EnvTime::kMillisToMicros);
+    auto results = TraceMeRecorder::Stop();
+    for (const auto& thread : results) {
+      if (thread.events.empty()) continue;
+      std::istringstream ss(thread.events.front().name);
+      int thread_index = 0;
+      ss >> thread_index;
+      auto& state = thread_state[thread_index];
+
+      std::set<uint64> session_events;
+      uint64 current = 0;
+      for (const auto& event : thread.events) {
+        session_events.emplace(event.activity_id);
+        // Session events should be contiguous.
+        if (current != 0 && event.activity_id != current + 1) {
+          state.split_session = true;
+        }
+        current = event.activity_id;
+      }
+
+      for (const auto& event : session_events) {
+        auto result = state.events.emplace(event);
+        if (!result.second) {
+          // Session events should not overlap with those from previous
+          // sessions.
+          state.overlapping_sessions = true;
+        }
+      }
+    }
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
+  }
+  stop.Notify();
+
+  for (const auto& thread : thread_state) {
+    EXPECT_FALSE(thread.split_session)
+        << "Expected contiguous events in a session";
+    EXPECT_FALSE(thread.overlapping_sessions) << "Expected disjoint sessions";
+    EXPECT_GT(thread.events.size(), 1)
+        << "Expected gaps in thread events between sessions";
+  }
+}
+
+}  // namespace
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
index 3d46606cbd5ae421e05d68c6e008fe85511eeb2f..f078099321ec7a2f38e25e5dfe006f2ab49da2ac 100644
--- a/tensorflow/core/profiler/lib/BUILD
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -13,17 +13,20 @@ load(
 )
 
 tf_cuda_library(
-    name = "eager_profiler",
+    name = "profiler_session",
     srcs = [
-        "eager_profiler.cc",
+        "profiler_session.cc",
     ],
     hdrs = [
-        "eager_profiler.h",
+        "profiler_session.h",
     ],
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core/common_runtime/eager:context",
-        "//tensorflow/cc/profiler",
+        "//tensorflow/core/profiler/internal/gpu:tracer",
+        "//tensorflow/core/profiler/internal/runtime:eager_profiler",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler:protos_all_cc",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
@@ -40,3 +43,16 @@ tf_cuda_library(
         ],
     }),
 )
+
+tf_cuda_library(
+    name = "traceme",
+    srcs = ["traceme.cc"],
+    hdrs = ["traceme.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
diff --git a/tensorflow/core/profiler/lib/eager_profiler.cc b/tensorflow/core/profiler/lib/eager_profiler.cc
deleted file mode 100644
index 7702febbc3ae4e13292ab9f77a587a17186f1642..0000000000000000000000000000000000000000
--- a/tensorflow/core/profiler/lib/eager_profiler.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/profiler/lib/eager_profiler.h"
-#include "tensorflow/cc/profiler/profiler.h"
-#include "tensorflow/core/common_runtime/eager/context.h"
-#include "tensorflow/core/common_runtime/step_stats_collector.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/platform/device_tracer.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-
-namespace tensorflow {
-
-/*static*/ std::unique_ptr<EagerProfiler> EagerProfiler::Create(
-    EagerContext* const context) {
-  return absl::WrapUnique(new EagerProfiler(context));
-}
-
-void EagerProfiler::BeforeClearRunMetadata() {
-  mutex_lock l(mutex_);
-  run_metadata_.MergeFrom(*context_->RunMetadataProto());
-}
-
-Status EagerProfiler::Status() {
-  mutex_lock l(mutex_);
-  return status_;
-}
-
-Status EagerProfiler::SerializeToString(string* content) {
-  mutex_lock l(mutex_);
-  if (!status_.ok()) return status_;
-  Stop();
-
-  // Get profiling data from device tracer
-  if (device_tracer_ != nullptr) {
-    std::unique_ptr<StepStatsCollector> step_stats_collector(
-        new StepStatsCollector(run_metadata_.mutable_step_stats()));
-    tensorflow::Status s = device_tracer_->Collect(step_stats_collector.get());
-    if (!s.ok()) {
-      device_tracer_.reset(nullptr);
-      LOG(WARNING) << "Failed to collect data from device tracer. "
-                   << s.error_message();
-    }
-    step_stats_collector->Finalize();
-  }
-
-  // TODO(fishx): update tfprof to use a lighter representation instead of
-  // GraphDef.
-  GraphDef graph;
-  std::unique_ptr<tfprof::Profiler> tfprof(new tfprof::Profiler(graph));
-  tfprof->AddStep(0, run_metadata_);
-  return tfprof->SerializeToString(content);
-}
-
-EagerProfiler::EagerProfiler(EagerContext* const context) : context_(context) {
-  LOG(INFO) << "Eager Profiler started.";
-
-  status_ = context_->RegisterRunMetadataListener(this);
-  if (!status_.ok()) {
-    context_ = nullptr;
-    LOG(WARNING)
-        << "Eager Profiler failed to start. Another profiler is running.";
-    return;
-  }
-
-  // TODO(fishx): Allow user disable device tracer.
-  device_tracer_ = CreateDeviceTracer();
-  if (!device_tracer_) {
-    LOG(WARNING) << "Continue profiling without device tracer. "
-                 << "Failed to create device tracer.";
-    return;
-  }
-  class Status s = device_tracer_->Start();
-  if (!s.ok()) {
-    device_tracer_.reset(nullptr);
-    LOG(WARNING) << "Continue profiling without device tracer. "
-                 << s.error_message();
-  }
-}
-
-EagerProfiler::~EagerProfiler() { Stop(); }
-
-void EagerProfiler::Stop() {
-  if (context_ != nullptr) {
-    context_->ClearRunMetadataListener();
-    run_metadata_.MergeFrom(*context_->RunMetadataProto());
-    context_ = nullptr;
-    if (device_tracer_ != nullptr) {
-      tensorflow::Status s = device_tracer_->Stop();
-      if (!s.ok()) {
-        device_tracer_.reset(nullptr);
-        LOG(WARNING) << "Failed to stop device tracer. " << s.error_message();
-      }
-    }
-    LOG(INFO) << "Eager Profiler ended with status:" << status_;
-  }
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
new file mode 100644
index 0000000000000000000000000000000000000000..86dd4c1e152cce3f9a4aacf68b6cfd8c31009ae7
--- /dev/null
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include <cstddef>
+#include <string>
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/internal/gpu/tracer.h"
+#include "tensorflow/core/profiler/internal/runtime/eager_profiler.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Track whether there's an active ProfilerSession.
+// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
+// use singletons that do not allow concurrent profiling request (e.g.,
+// DeviceTracer).
+std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
+
+void AssignLanes(RunMetadata* run_metadata) {
+  for (size_t device_id = 0;
+       device_id < run_metadata->step_stats().dev_stats_size(); ++device_id) {
+    auto* device_stats =
+        run_metadata->mutable_step_stats()->mutable_dev_stats(device_id);
+    if (device_stats->thread_names_size() > 0 ||
+        device_stats->node_stats_size() == 0) {
+      continue;
+    }
+    std::vector<uint64> lanes;
+    for (auto ns = device_stats->mutable_node_stats()->rbegin();
+         ns != device_stats->mutable_node_stats()->rend(); ns++) {
+      uint64 end_micros = ns->all_start_micros() + ns->all_end_rel_micros();
+      bool found_lane = false;
+      for (size_t l = 0; l < lanes.size(); l++) {
+        if (end_micros <= lanes[l]) {
+          ns->set_thread_id(l);
+          found_lane = true;
+          lanes[l] = ns->all_start_micros();
+          break;
+        }
+      }
+      if (!found_lane) {
+        ns->set_thread_id(lanes.size());
+        lanes.push_back(ns->all_start_micros());
+      }
+    }
+  }
+}
+
+void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
+                                    profiler::Trace* trace,
+                                    const uint64 profile_start_time_micros) {
+  AssignLanes(run_metadata);
+  auto trace_devices = trace->mutable_devices();
+
+  for (size_t device_id = 0;
+       device_id < run_metadata->step_stats().dev_stats_size(); ++device_id) {
+    // Create device
+    auto* device_stats =
+        run_metadata->mutable_step_stats()->mutable_dev_stats(device_id);
+    profiler::Device device;
+    device.set_name(device_stats->device());
+    device.set_device_id(device_id);
+    profiler::Resource resource;
+    resource.set_name("0");
+    resource.set_resource_id(0);
+    (*device.mutable_resources())[0] = resource;
+    for (const auto& thread_name : device_stats->thread_names()) {
+      profiler::Resource resource;
+      resource.set_resource_id(thread_name.first);
+      resource.set_name(thread_name.second);
+      (*device.mutable_resources())[thread_name.first] = resource;
+    }
+    (*trace_devices)[device_id] = device;
+
+    // Emit events.
+    for (auto node :
+         run_metadata->step_stats().dev_stats(device_id).node_stats()) {
+      if (node.all_start_micros() < profile_start_time_micros) {
+        continue;
+      }
+      auto* event = trace->add_trace_events();
+      auto* args = event->mutable_args();
+      event->set_device_id(device_id);
+      event->set_resource_id(node.thread_id());
+      event->set_name(node.node_name());
+      event->set_timestamp_ps(
+          (node.all_start_micros() - profile_start_time_micros) *
+          EnvTime::kMicrosToPicos);
+      event->set_duration_ps(node.all_end_rel_micros() *
+                             EnvTime::kMicrosToPicos);
+      (*args)["label"] = node.timeline_label();
+    }
+  }
+
+  // TODO(fishx): Convert allocation data as well.
+}
+
+}  // namespace
+
+/*static*/ std::unique_ptr<ProfilerSession> ProfilerSession::Create(
+    ProfilerContext* const context) {
+  return absl::WrapUnique(new ProfilerSession(context));
+}
+
+Status ProfilerSession::Status() {
+  mutex_lock l(mutex_);
+  return status_;
+}
+
+Status ProfilerSession::SerializeToString(string* content) {
+  mutex_lock l(mutex_);
+  if (!status_.ok()) return status_;
+  for (auto& profiler : profilers_) {
+    profiler->Stop().IgnoreError();
+  }
+  RunMetadata run_metadata;
+  for (auto& profiler : profilers_) {
+    profiler->CollectData(&run_metadata).IgnoreError();
+  }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+    active_ = false;
+  }
+
+  profiler::Trace trace;
+
+  ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
+
+  trace.SerializeToString(content);
+  return Status::OK();
+}
+
+ProfilerSession::ProfilerSession(ProfilerContext* const context)
+    : active_(!session_active.exchange(true)),
+      start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+  if (!active_) {
+    status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
+                                 "Another profiling session is active.");
+    return;
+  }
+
+  LOG(INFO) << "Profile Session started.";
+
+  if (context->eager_context != nullptr) {
+    profilers_.push_back(tensorflow::profiler::runtime::EagerProfiler::Create(
+        context->eager_context));
+  }
+  profilers_.push_back(tensorflow::profiler::gpu::Tracer::Create());
+
+  status_ = Status::OK();
+
+  for (auto& profiler : profilers_) {
+    profiler->Start().IgnoreError();
+  }
+}
+
+ProfilerSession::~ProfilerSession() {
+  for (auto& profiler : profilers_) {
+    profiler->Stop().IgnoreError();
+  }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/eager_profiler.h b/tensorflow/core/profiler/lib/profiler_session.h
similarity index 62%
rename from tensorflow/core/profiler/lib/eager_profiler.h
rename to tensorflow/core/profiler/lib/profiler_session.h
index 62df869e94d1a6901cdc44c6cb4765d6a4cafcdf..07276571244b876c8b6635a9d39347c3c1d89a55 100644
--- a/tensorflow/core/profiler/lib/eager_profiler.h
+++ b/tensorflow/core/profiler/lib/profiler_session.h
@@ -12,54 +12,58 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
 
 namespace tensorflow {
 
+struct ProfilerContext {
+  EagerContext* eager_context = nullptr;
+};
+
 // A profiler which will start profiling when creating the object and will stop
 // when either the object is destroyed or SerializedToString is called. It will
 // profile all operations run under the given EagerContext.
 // Multiple instances of it can be created, but at most one of them will profile
 // for each EagerContext. Status() will return OK only for the instance that is
 // profiling.
-// Thread-safety: TFE_Profiler is thread-safe.
-class EagerProfiler : RunMetadataListener {
+// Thread-safety: ProfilerSession is thread-safe.
+class ProfilerSession {
  public:
-  // Creates and EagerProfiler and starts profiling.
-  static std::unique_ptr<EagerProfiler> Create(EagerContext* const context);
+  // Creates and ProfilerSession and starts profiling.
+  static std::unique_ptr<ProfilerSession> Create(
+      ProfilerContext* const context);
 
   // Deletes an exsiting Profiler and enables starting a new one.
-  ~EagerProfiler() override;
+  ~ProfilerSession();
 
-  void BeforeClearRunMetadata() override LOCKS_EXCLUDED(mutex_)
-      EXCLUSIVE_LOCKS_REQUIRED(context_->MetadataMu());
   tensorflow::Status Status() LOCKS_EXCLUDED(mutex_);
 
   tensorflow::Status SerializeToString(string* content) LOCKS_EXCLUDED(mutex_);
 
  private:
   // Constructs an instance of the class and starts profiling
-  explicit EagerProfiler(EagerContext* const context);
+  explicit ProfilerSession(ProfilerContext* const context);
 
   // Profiler is neither copyable or movable.
-  EagerProfiler(const EagerProfiler&) = delete;
-  EagerProfiler& operator=(const EagerProfiler&) = delete;
+  ProfilerSession(const ProfilerSession&) = delete;
+  ProfilerSession& operator=(const ProfilerSession&) = delete;
+
+  std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
+      profilers_ GUARDED_BY(mutex_);
 
-  void Stop() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  // True if the session is active.
+  bool active_ GUARDED_BY(mutex_);
 
-  RunMetadata run_metadata_ GUARDED_BY(mutex_);
   tensorflow::Status status_ GUARDED_BY(mutex_);
-  std::unique_ptr<DeviceTracer> device_tracer_ GUARDED_BY(mutex_);
-  EagerContext* context_ GUARDED_BY(mutex_);
+  const uint64 start_time_micros_;
   mutex mutex_;
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_PROFILER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
diff --git a/tensorflow/core/profiler/lib/traceme.cc b/tensorflow/core/profiler/lib/traceme.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90272b8bf584891075de050c7468376abbaed856
--- /dev/null
+++ b/tensorflow/core/profiler/lib/traceme.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/lib/traceme.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Activity IDs: To avoid contention over a counter, the top 32 bits identify
+// the originating thread, the bottom 32 bits name the event within a thread.
+// IDs may be reused after 4 billion events on one thread, or 4 billion threads.
+static std::atomic<uint32> thread_counter(1);  // avoid kUntracedActivity
+uint64 NewActivityId() {
+  const thread_local static uint32 thread_id = thread_counter.fetch_add(1);
+  thread_local static uint32 per_thread_activity_id = 0;
+  return static_cast<uint64>(thread_id) << 32 | per_thread_activity_id++;
+}
+
+/* static */ uint64 TraceMe::ActivityStartImpl(
+    absl::string_view activity_name) {
+  uint64 activity_id = NewActivityId();
+  TraceMeRecorder::Record({activity_id, string(activity_name),
+                           /*start_time=*/Env::Default()->NowNanos(),
+                           /*end_time=*/0});
+  return activity_id;
+}
+
+/* static */ void TraceMe::ActivityEndImpl(uint64 activity_id) {
+  TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0,
+                           /*end_time=*/Env::Default()->NowNanos()});
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9fae3d37f0f2dd53a85f4d0d17aeaa626c849f1
--- /dev/null
+++ b/tensorflow/core/profiler/lib/traceme.h
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// This is specifically used in xprof_bridge for instrumenting Tensorflow ops.
+// Takes input as whether a TF op is expensive or not and returns the TraceMe
+// level to be assigned to trace that particular op. Assigns level 2 for
+// expensive ops (these are high-level details and shown by default in xprof
+// UI). Assigns level 3 for cheap ops (low-level details not shown by default).
+inline int GetTFTraceMeLevel(bool is_expensive) { return is_expensive ? 2 : 3; }
+
+// This class permits user-specified (CPU) tracing activities. A trace activity
+// is started when an object of this class is created and stopped when the
+// object is destroyed.
+//
+// CPU tracing can be useful when trying to understand what parts of GPU
+// computation (e.g., kernels and memcpy) correspond to higher level activities
+// in the overall program. For instance, a collection of kernels maybe
+// performing one "step" of a program that is better visualized together than
+// interspersed with kernels from other "steps". Therefore, a TraceMe object
+// can be created at each "step".
+//
+// Two APIs are provided:
+//   (1) Scoped object: a TraceMe object starts tracing on construction, and
+//       stops tracing when it goes out of scope.
+//          {
+//            TraceMe trace("step");
+//            ... do some work ...
+//          }
+//       TraceMe objects can be members of a class, or allocated on the heap.
+//   (2) Static methods: ActivityStart and ActivityEnd may be called in pairs.
+//          auto id = ActivityStart("step");
+//          ... do some work ...
+//          ActivityEnd(id);
+class TraceMe {
+ public:
+  // Constructor that traces a user-defined activity labeled with activity_name
+  // in the UI. Level defines the trace priority, used for filtering TraceMe
+  // events. By default, traces with TraceMe level <= 2 are recorded. Levels:
+  // - Must be a positive integer.
+  // - Level 1 is the default and used only for user instrumentation.
+  // - Level 2 is used by xprof for instrumenting high level program execution
+  //   details (expensive TF ops, XLA ops, etc).
+  // - Level 3 is also used by xprof to instrument more verbose (low-level)
+  //   program execution details (cheap TF ops, etc).
+  // Users are welcome to use level >= 2 in their code, if they wish to filter
+  // out their host traces based on verbosity.
+  explicit TraceMe(absl::string_view activity_name, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(activity_name);
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  // string&& constructor to prevent an unnecessary string copy, e.g. when a
+  // TraceMe is constructed based on the result of a StrCat operation.
+  // Note: We can't take the string by value because a) it would make the
+  // overloads ambiguous, and b) we want lvalue strings to use the string_view
+  // constructor so we avoid copying them when tracing is disabled.
+  explicit TraceMe(string &&activity_name, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(std::move(activity_name));
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  // Do not allow passing strings by reference or value since the caller
+  // may unintentionally maintain ownership of the activity_name.
+  // Explicitly std::move the activity_name or wrap it in a string_view if
+  // you really wish to maintain ownership.
+  explicit TraceMe(const string &activity_name, int level = 1) = delete;
+
+  // This overload is necessary to make TraceMe's with string literals work.
+  // Otherwise, the string&& and the string_view constructor would be equally
+  // good overload candidates.
+  explicit TraceMe(const char *raw, int level = 1)
+      : TraceMe(absl::string_view(raw), level) {}
+
+  // This overload only generates the activity name if tracing is enabled.
+  // Useful for avoiding things like string concatenation when tracing is
+  // disabled. The |name_generator| may be a lambda or functor that returns a
+  // type that the string() constructor can take.
+  // name_generator is templated, rather than a std::function to avoid
+  // allocations std::function might make even if never called.
+  // Usage: xprof::TraceMe([&]{ return StrCat(prefix, ":", postfix); });
+  template <typename NameGeneratorT>
+  explicit TraceMe(NameGeneratorT name_generator, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(name_generator());
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  ~TraceMe() {
+    // We do not need to check the trace level again here.
+    // - If tracing wasn't active to start with, we have kUntracedActivity.
+    // - If tracing was active and was stopped, we have
+    //   TraceMeRecorder::Active().
+    // - If tracing was active and was restarted at a lower level, we may
+    //   spuriously record the event. This is extremely rare, and acceptable as
+    //   event will be discarded when its start timestamp fall outside of the
+    //   start/stop session timestamp (recorded in XprofResponse).
+    if (start_time_ != kUntracedActivity) {
+      if (TraceMeRecorder::Active()) {
+        TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name),
+                                 start_time_, Env::Default()->NowNanos()});
+      }
+      no_init_.name.~string();
+    }
+  }
+
+  // TraceMe is not movable or copyable.
+  TraceMe(const TraceMe &) = delete;
+  TraceMe &operator=(const TraceMe &) = delete;
+
+  // Static API, for use when scoped objects are inconvenient.
+
+  // Record the start time of an activity.
+  // Returns the activity ID, which is used to stop the activity.
+  static uint64 ActivityStart(absl::string_view name, int level = 1) {
+    return TraceMeRecorder::Active(level) ? ActivityStartImpl(name)
+                                          : kUntracedActivity;
+  }
+
+  // Record the end time of an activity started by ActivityStart().
+  static void ActivityEnd(uint64 activity_id) {
+    // We don't check the level again (see ~TraceMe()).
+    if (activity_id != kUntracedActivity) {
+      if (TraceMeRecorder::Active()) {
+        ActivityEndImpl(activity_id);
+      }
+    }
+  }
+
+ private:
+  // Activity ID or start time used when tracing is disabled.
+  constexpr static uint64 kUntracedActivity = 0;
+  // Activity ID used as a placeholder when both start and end are present.
+  constexpr static uint64 kCompleteActivity = 1;
+
+  static uint64 ActivityStartImpl(absl::string_view activity_name);
+  static void ActivityEndImpl(uint64 activity_id);
+
+  // Wrap the name into a union so that we can avoid the cost of string
+  // initialization when tracing is disabled.
+  union NoInit {
+    NoInit() {}
+    ~NoInit() {}
+    string name;
+  } no_init_;
+
+  uint64 start_time_;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/core/profiler/op_profile.proto
similarity index 98%
rename from tensorflow/contrib/tpu/profiler/op_profile.proto
rename to tensorflow/core/profiler/op_profile.proto
index 292108f949d705762a826d0276a517f1f741fb39..0adca5544a6f579ef64bbf804ff8098e28b37da0 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/core/profiler/op_profile.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu.op_profile;
+package tensorflow.profiler.op_profile;
 
 // Profile is the top-level data that summarizes a program.
 message Profile {
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 808e3c853bec0efb9523ee413f3d5272a833358d..cdcb8dddf6d0b3b54fb29156559caed2ba216ca2 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -114,7 +114,7 @@ int Run(int argc, char** argv) {
       Flag("min_output_bytes", &FLAGS_min_output_bytes, "min_output_bytes"),
       Flag("min_micros", &FLAGS_min_micros, "min micros"),
       Flag("min_accelerator_micros", &FLAGS_min_accelerator_micros,
-           "min acclerator_micros"),
+           "min accelerator_micros"),
       Flag("min_cpu_micros", &FLAGS_min_cpu_micros, "min_cpu_micros"),
       Flag("min_params", &FLAGS_min_params, "min params"),
       Flag("min_float_ops", &FLAGS_min_float_ops, "min float ops"),
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/core/profiler/profiler_analysis.proto
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
rename to tensorflow/core/profiler/profiler_analysis.proto
index d3c34bfd490080b86cf3d8b893c550f3a87bbbed..4be75de8bb46a23d26b116f306bad6f107d786ef 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/core/profiler/profiler_analysis.proto
@@ -1,7 +1,7 @@
 syntax = "proto3";
 package tensorflow;
 
-import "tensorflow/contrib/tpu/profiler/tpu_profiler.proto";
+import "tensorflow/core/profiler/profiler_service.proto";
 
 message NewProfileSessionRequest {
   ProfileRequest request = 1;
@@ -58,10 +58,10 @@ message ProfileSessionDataResponse {
   bytes output = 3;
 }
 ////////////////////////////////////////////////////////////////////////////////
-// TPUProfileAnalysis service provide entry point for profiling TPU and for
+// ProfileAnalysis service provide entry point for profiling TPU and for
 // serving profiled data to Tensorboard through GRPC
 ////////////////////////////////////////////////////////////////////////////////
-service TPUProfileAnalysis {
+service ProfileAnalysis {
   // Starts a profiling session, blocks until it completes.
   // TPUProfileAnalysis service delegate this to TPUProfiler service.
   // Populate the profiled data in repository, then return status to caller.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/core/profiler/profiler_service.proto
similarity index 92%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler.proto
rename to tensorflow/core/profiler/profiler_service.proto
index da4a95e0450a9d0c20593ca60b69f3ad467d455d..77702c3c900e5a7391ea09ad93383b4f9c9fb2b2 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/core/profiler/profiler_service.proto
@@ -3,11 +3,11 @@ package tensorflow;
 
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/protobuf/config.proto";
-import "tensorflow/contrib/tpu/profiler/op_profile.proto";
+import "tensorflow/core/profiler/op_profile.proto";
 
-// The TPUProfiler service retrieves performance information about
-// the programs running on connected TPUs over a period of time.
-service TPUProfiler {
+// The ProfilerService service retrieves performance information about
+// the programs running on connected devices over a period of time.
+service ProfilerService {
   // Starts a profiling session, blocks until it completes, and returns data.
   rpc Profile(ProfileRequest) returns (ProfileResponse) {
   }
@@ -81,7 +81,7 @@ message ProfileToolData {
 
 message ProfileResponse {
   reserved 1;  // was uint64 placeholder for returning something meaningful.
-  // Graphs of programs executed on TPUs during the profiling period.
+  // Graphs of programs executed on devices during the profiling period.
   repeated GraphDef computation_graph = 2;
 
   // Performance profile that can be used to annotate HLO operations in the
@@ -96,7 +96,7 @@ message ProfileResponse {
   // Assembles a hierarchical performance profile based on HLOs in trace events.
   // If the trace covers multiple programs, the longest-running one is analyzed.
   // See op_profile.proto for the detailed semantics of the returned profile.
-  tpu.op_profile.Profile op_profile = 4;
+  profiler.op_profile.Profile op_profile = 4;
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3e5cdaa4984d4ddfb4d4af8e23ab81c2645814d2
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -0,0 +1,38 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+
+tf_cuda_library(
+    name = "profiler_service_impl",
+    srcs = ["profiler_service_impl.cc"],
+    hdrs = ["profiler_service_impl.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/lib:profiler_session",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_library(
+    name = "profiler_server",
+    srcs = ["profiler_server.cc"],
+    hdrs = ["profiler_server.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":profiler_service_impl",
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/lib:profiler_session",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ed0137f9b21d8ff82d116d95e71920964e09568d
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/BUILD
@@ -0,0 +1,62 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+tf_cuda_library(
+    name = "capture_profile",
+    srcs = [
+        "capture_profile.cc",
+    ],
+    hdrs = [
+        "capture_profile.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dump_tpu_profile",
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dump_tpu_profile",
+    srcs = ["dump_tpu_profile.cc"],
+    hdrs = ["dump_tpu_profile.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "trace_events_to_json",
+    srcs = ["trace_events_to_json.cc"],
+    hdrs = ["trace_events_to_json.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
+tf_cc_test(
+    name = "trace_events_to_json_test",
+    srcs = ["trace_events_to_json_test.cc"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa8e1910e7d401cfaab81a280f3a4eb70d1c1f2b
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc
@@ -0,0 +1,269 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
+
+#include "grpcpp/grpcpp.h"
+
+#include <cstdio>
+#include <ctime>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
+#include "tensorflow/core/util/events_writer.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+constexpr uint64 kMaxEvents = 1000000;
+
+string GetCurrentTimeStampAsString() {
+  char s[128];
+  std::time_t t = std::time(nullptr);
+  auto result = std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t));
+  DCHECK_NE(result, 0);
+  return s;
+}
+
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
+  ProfileRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_max_events(kMaxEvents);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
+  request.add_tools("op_profile");
+  request.add_tools("input_pipeline");
+  request.add_tools("memory_viewer");
+  request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+Status Profile(const string& service_addr, const string& logdir,
+               int duration_ms, const string& repository_root,
+               const string& session_id, const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                      std::numeric_limits<int32>::max());
+  std::unique_ptr<grpc::ProfilerService::Stub> stub =
+      grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  ProfileResponse response;
+  TF_RETURN_IF_ERROR(
+      FromGrpcStatus(stub->Profile(&context, request, &response)));
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(WriteTensorboardTPUProfile(logdir, session_id, "", response,
+                                           &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+Status NewSession(const string& service_addr,
+                  const std::vector<tensorflow::string>& hostnames,
+                  int duration_ms, const string& repository_root,
+                  const string& session_id, const ProfileOptions& opts) {
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<grpc::ProfileAnalysis::Stub> stub =
+      grpc::ProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_RETURN_IF_ERROR(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
+  if (new_session_response.empty_trace()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Creates an empty event file if not already exists, which indicates that we
+// have a plugins/profile/ directory in the current logdir.
+Status MaybeCreateEmptyEventFile(const tensorflow::string& logdir) {
+  // Suffix for an empty event file.  it should be kept in sync with
+  // _EVENT_FILE_SUFFIX in tensorflow/python/eager/profiler.py.
+  constexpr char kProfileEmptySuffix[] = ".profile-empty";
+  std::vector<string> children;
+  TF_RETURN_IF_ERROR(Env::Default()->GetChildren(logdir, &children));
+  for (const string& child : children) {
+    if (str_util::EndsWith(child, kProfileEmptySuffix)) {
+      return Status::OK();
+    }
+  }
+  EventsWriter event_writer(io::JoinPath(logdir, "events"));
+  return event_writer.InitWithSuffix(kProfileEmptySuffix);
+}
+
+// Starts tracing on a single or multiple TPU hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts) {
+  // Use the current timestamp as the run name.
+  tensorflow::string session_id = GetCurrentTimeStampAsString();
+  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+  tensorflow::string repository_root =
+      io::JoinPath(logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(workers_list, ",");
+
+  TF_RETURN_IF_ERROR(MaybeCreateEmptyEventFile(logdir));
+
+  Status status = Status::OK();
+  int remaining_attempts = num_tracing_attempts;
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(include_dataset_ops);
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    if (hostnames.empty()) {
+      status = Profile(service_addr, logdir, duration_ms, repository_root,
+                       session_id, opts);
+    } else {
+      tensorflow::string tpu_master = service_addr;
+      status = NewSession(tpu_master, hostnames, duration_ms, repository_root,
+                          session_id, opts);
+    }
+    if (remaining_attempts <= 0 || status.ok() ||
+        status.code() != tensorflow::error::Code::UNAVAILABLE)
+      break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (status.code() == tensorflow::error::Code::UNAVAILABLE) {
+    std::cout << "No trace event is collected after " << num_tracing_attempts
+              << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+  }
+  return status;
+}
+
+MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
+  MonitorRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_monitoring_level(monitoring_level);
+  return request;
+}
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries) {
+  for (int query = 0; query < num_queries; ++query) {
+    MonitorRequest request =
+        PopulateMonitorRequest(duration_ms, monitoring_level);
+
+    ::grpc::ClientContext context;
+    ::grpc::ChannelArguments channel_args;
+    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                        std::numeric_limits<int32>::max());
+    std::unique_ptr<grpc::ProfilerService::Stub> stub =
+        grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+            channel_args));
+    MonitorResponse response;
+    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
+
+    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
+              << "):\n\n"
+              << response.data() << std::flush;
+  }
+}
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.h b/tensorflow/core/profiler/rpc/client/capture_profile.h
new file mode 100644
index 0000000000000000000000000000000000000000..988036724791cd171f11a7a3666aca4267286646
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GRPC client to perform on-demand profiling
+
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+Status ValidateHostPortPair(const string& host_port);
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries);
+
+// Starts tracing on a single or multiple hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts);
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
similarity index 91%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
index b4b06a40a2c8aaa97ff82baf93c8f2d55a587e37..ed65c110c9dcc364ba24338822363425e852037d 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
@@ -13,15 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
 
 #include <cstdio>
 #include <ctime>
 #include <vector>
 
-#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/compression.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -29,10 +26,18 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
+#include "tensorflow/core/profiler/op_profile.pb.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 #include "tensorflow/core/util/events_writer.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::io::JoinPath;
@@ -88,7 +93,7 @@ Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
                                    const string& host_prefix,
-                                   const tpu::op_profile::Profile& profile,
+                                   const op_profile::Profile& profile,
                                    std::ostream* os) {
   string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
@@ -109,7 +114,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
                                   const string& host_prefix,
-                                  const tensorflow::ProfileToolData& tool,
+                                  const ProfileToolData& tool,
                                   std::ostream* os) {
   // Don't save the intermediate results for combining the per host tool data.
   if (EndsWith(tool.name(), kFlatProfilerFileName) ||
@@ -155,5 +160,6 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   return Status::OK();
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
similarity index 80%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
index ecf21b1de2219e8896d5e8b79325a193de0b0fa1..961f4e9498d91a6c0d75b82ad87860963360ddb3 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
 
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/grpc_services.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Dumps all profiling tool data in a TPU profile to a TensorBoard log directory
 // with the given run name. This writes user-facing log messages to `os`.
@@ -36,7 +38,8 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
similarity index 91%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
index 3f7e67dec88918009a2a9856d9c7a182338f748d..6adaec5546052a5d82a54e4ae1ca78eb10a4a103 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::strings::Appendf;
@@ -96,10 +99,9 @@ inline void AddTraceEvent(const TraceEvent &event, string *json) {
 
 string TraceEventsToJson(const Trace &trace) {
   string json;
-  Appendf(&json,
-          R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
-  Appendf(&json,
-          R"("traceEvents":[)");
+  Appendf(
+      &json, R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
+  Appendf(&json, R"("traceEvents":[)");
   // Convert to a std::map so that devices are sorted by the device id.
   std::map<uint32, const Device *> sorted_devices;
   for (const auto &pair : trace.devices()) {
@@ -114,5 +116,6 @@ string TraceEventsToJson(const Trace &trace) {
   return json;
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
similarity index 72%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.h
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.h
index 3bd76dd01c7d0f35bad9386c11811743e1709fca..d54cc3c619e234b82452a54f613a57cabfa7d5d3 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
 
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Converts trace events in the trace proto to a JSON string that can be
 // consumed by catapult trace viewer.
 string TraceEventsToJson(const Trace &trace);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
index e97989cc7be961b2a812e46bb07b189bd6cda897..0f883b04dc869218329fd944d45918b0836d1a44 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 string ConvertTextFormattedTraceToJson(const string& trace_str) {
@@ -109,5 +112,6 @@ TEST(TraceEventsToJson, JsonConversion) {
 }
 
 }  // namespace
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..257e4e0bf5fa320c499a40065021b0030564bc45
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_server.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
+#include <memory>
+#include <utility>
+#include "grpcpp/grpcpp.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port) {
+  Env* env = profiler_context->eager_context != nullptr
+                 ? profiler_context->eager_context->TFEnv()
+                 : Env::Default();
+  // Starting the server in the child thread may be delay and user may already
+  // delete the profiler context at that point. So we need to make a copy.
+  ProfilerContext ctx = *profiler_context;
+  return WrapUnique(env->StartThread({}, "profiler server", [ctx, port]() {
+    string server_address = strings::StrCat("0.0.0.0:", port);
+    std::unique_ptr<grpc::ProfilerService::Service> service =
+        CreateProfilerService(ctx);
+    ::grpc::ServerBuilder builder;
+    builder.AddListeningPort(server_address,
+                             ::grpc::InsecureServerCredentials());
+    builder.RegisterService(service.get());
+    std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+    LOG(INFO) << "Profiling Server listening on " << server_address;
+    server->Wait();
+  }));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/string_coding.cc b/tensorflow/core/profiler/rpc/profiler_server.h
similarity index 64%
rename from tensorflow/core/platform/default/string_coding.cc
rename to tensorflow/core/profiler/rpc/profiler_server.h
index 7410ee67820a384e4843a57386b110e40a7e0680..4e8c715ac753d57add26de28a2524d4f737567ec 100644
--- a/tensorflow/core/platform/default/string_coding.cc
+++ b/tensorflow/core/profiler/rpc/profiler_server.h
@@ -12,19 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
 
-#include "tensorflow/core/platform/default/string_coding.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
 
 namespace tensorflow {
-namespace port {
 
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
-  return std::unique_ptr<StringListEncoder>(new StringListEncoder(out));
-}
-
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
-  return std::unique_ptr<StringListDecoder>(new StringListDecoder(in));
-}
-
-}  // namespace port
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port);
 }  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f25ee66833604882309679615e02bf4b6125d9ed
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+class ProfilerServiceImpl : public grpc::ProfilerService::Service {
+ public:
+  explicit ProfilerServiceImpl(const ProfilerContext& profiler_context)
+      : profiler_context_(profiler_context) {}
+  ~ProfilerServiceImpl() override {}
+
+  ::grpc::Status Monitor(::grpc::ServerContext* ctx, const MonitorRequest* req,
+                         MonitorResponse* response) override {
+    return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "unimplemented.");
+  }
+
+  ::grpc::Status Profile(::grpc::ServerContext* ctx, const ProfileRequest* req,
+                         ProfileResponse* response) override {
+    LOG(INFO) << "Received a profile request.";
+    std::unique_ptr<ProfilerSession> profiler =
+        ProfilerSession::Create(&profiler_context_);
+    if (!profiler->Status().ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL,
+                            profiler->Status().error_message());
+    }
+
+    Env* env = profiler_context_.eager_context != nullptr
+                   ? profiler_context_.eager_context->TFEnv()
+                   : Env::Default();
+    for (size_t i = 0; i < req->duration_ms(); ++i) {
+      env->SleepForMicroseconds(1000);
+      if (ctx->IsCancelled()) {
+        return ::grpc::Status::CANCELLED;
+      }
+    }
+
+    Status s = profiler->SerializeToString(response->mutable_encoded_trace());
+    if (!s.ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL, s.error_message());
+    }
+
+    return ::grpc::Status::OK;
+  }
+
+ private:
+  ProfilerContext profiler_context_;
+};
+}  // namespace
+
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context) {
+  return MakeUnique<ProfilerServiceImpl>(profiler_context);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.h b/tensorflow/core/profiler/rpc/profiler_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..64ae01d58377c751945e05417528118026b1614e
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/server_context.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+
+namespace tensorflow {
+
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events.proto b/tensorflow/core/profiler/trace_events.proto
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/trace_events.proto
rename to tensorflow/core/profiler/trace_events.proto
index cb2b9162677a0ebe8240a98671b1cabc1cee0c9f..69ec88ca9a798a0faf1864ce9cf5c3f8bb7df0ca 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events.proto
+++ b/tensorflow/core/profiler/trace_events.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu;
+package tensorflow.profiler;
 
 // A 'Trace' contains metadata for the individual traces of a system.
 message Trace {
@@ -56,4 +56,7 @@ message TraceEvent {
   // The duration of the event in picoseconds if applicable.
   // Events without duration are called instant events.
   uint64 duration_ps = 10;
+
+  // Extra arguments that will be displayed in trace view.
+  map<string, string> args = 11;
 }
diff --git a/tensorflow/core/protobuf/autotuning.proto b/tensorflow/core/protobuf/autotuning.proto
new file mode 100644
index 0000000000000000000000000000000000000000..29e4d00a85f3ed5da9634b96a5e209ed9223c1d2
--- /dev/null
+++ b/tensorflow/core/protobuf/autotuning.proto
@@ -0,0 +1,73 @@
+// This file defines protos that store the results of autotuning various
+// operations.
+//
+// They are in proto format because we want to log them structured. They offer
+// tremendous statistical, testing, and debugging value.
+syntax = "proto3";
+
+package tensorflow;
+
+import "google/protobuf/any.proto";
+import "google/protobuf/duration.proto";
+
+message CudnnVersion {
+  int32 major = 1;
+  int32 minor = 2;
+  int32 patch = 3;
+}
+
+message ComputeCapability {
+  int32 major = 1;
+  int32 minor = 2;
+}
+
+message AutotuneResult {
+  message SuccessResult {
+    int64 scratch_bytes = 1;
+    google.protobuf.Duration run_time = 2;
+  }
+
+  message ConvKey {
+    int64 algorithm = 1;
+    bool tensor_ops_enabled = 2;
+  }
+
+  // If the conv runs successfully, success will be populated with the
+  // autotuning result. Otherwise, the error message is propagated.
+  oneof result {
+    SuccessResult success = 3;
+    string error_string = 4;
+  }
+
+  oneof key {
+    ConvKey conv = 5;
+  }
+
+  // Sometimes we run a correctness checker during autotuning. It compares the
+  // result buffer content between two algorithms, say, "reference" and "test"
+  // algorithms. The "test" algorithm is the one associated with this
+  // AutotuneResult.
+  //
+  // This field records the reference algorithm used. Notice that naming it
+  // "reference" doesn't mean it's always correct. However, empirically it's
+  // more correct, as it's "algo 0", less fancy than the compared one.
+  //
+  // Notice that the checker_failure may exist even in the success case.
+  // This is because the error string in `result` comes from the underlying
+  // implementation like cuDNN, which isn't aware that it produced an incorrect
+  // result. And even if the checker detects an incorrect result, we can still
+  // retrieve scratch_bytes and runtime_ms.
+  oneof checker_failure {
+    ConvKey reference_conv = 6;
+  }
+}
+
+message AutotuningLog {
+  google.protobuf.Any instr = 1;
+
+  // Records all auto-tuning results per algorithm.
+  repeated AutotuneResult results = 2;
+
+  CudnnVersion cudnn_version = 3;
+  ComputeCapability compute_capability = 4;
+}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a2cc1bc9353bf434438ec9d21ff3995e0806f1d0..3e24235369a6bd06d3c8cf0df66e1ee3ead2b9b2 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -156,6 +156,16 @@ message GPUOptions {
     // CollectiveReduce, and serves as an override to automatic ring order
     // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
     string collective_ring_order = 4;
+
+    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
+    // keep track of when GPU memory is freed and when kernels actually
+    // complete so that we can know when a nominally free memory chunk
+    // is really not subject to pending use.
+    bool timestamped_allocator = 5;
+
+    // If > 0 limit the number of pending kernels on any compute
+    // stream to this number.
+    int32 pending_cap = 6;
   }
 
   // Everything inside experimental is subject to change and is not subject
@@ -429,6 +439,10 @@ message ConfigProto {
     // If true, make collective op execution order sequential and deterministic
     // for potentially concurrent collective instances.
     bool collective_deterministic_sequential_execution = 6;
+
+    // If true, use NCCL for CollectiveOps.  This feature is highly
+    // experimental.
+    bool collective_nccl = 7;
   };
 
   Experimental experimental = 16;
@@ -506,6 +520,25 @@ message RunMetadata {
 
   // Graphs of the partitions executed by executors.
   repeated GraphDef partition_graphs = 3;
+
+  message FunctionGraphs {
+    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
+    repeated GraphDef partition_graphs = 1;
+
+    GraphDef pre_optimization_graph = 2;
+    GraphDef post_optimization_graph = 3;
+  }
+  // This is only populated for graphs that are run as functions in TensorFlow
+  // V2. There will be an entry below for each function that is traced.
+  // The main use cases of the post_optimization_graph and the partition_graphs
+  // is to give the caller insight into the graphs that were actually run by the
+  // runtime. Additional information (such as those in step_stats) will match
+  // these graphs.
+  // We also include the pre_optimization_graph since it is usually easier to
+  // read, and is helpful in situations where the caller wants to get a high
+  // level idea of what the built graph looks like (since the various graph
+  // optimization passes might change the structure of the graph significantly).
+  repeated FunctionGraphs function_graphs = 4;
 }
 
 // Defines a connection between two tensors in a `GraphDef`.
diff --git a/tensorflow/core/protobuf/conv_autotuning.proto b/tensorflow/core/protobuf/conv_autotuning.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c22162b6d2cfc042d04f583b7cad01feaff461eb
--- /dev/null
+++ b/tensorflow/core/protobuf/conv_autotuning.proto
@@ -0,0 +1,19 @@
+// This is used for convolution logging. Also see
+// tensorflow/core/protobuf/autotuing.h
+syntax = "proto3";
+
+package tensorflow;
+
+import "tensorflow/core/framework/node_def.proto";
+import "tensorflow/core/framework/tensor.proto";
+
+message ConvNodeDef {
+  NodeDef conv = 1;
+  TensorProto input = 2;
+  TensorProto filter = 3;
+  TensorProto output = 4;
+  TensorProto bias = 5;
+  oneof side_input_oneof {
+    TensorProto side_input = 6;
+  }
+}
diff --git a/tensorflow/core/protobuf/graph_debug_info.proto b/tensorflow/core/protobuf/graph_debug_info.proto
new file mode 100644
index 0000000000000000000000000000000000000000..a123d3cf496a8d653abe652468ce22dda47bc18f
--- /dev/null
+++ b/tensorflow/core/protobuf/graph_debug_info.proto
@@ -0,0 +1,41 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "GraphDebugInfoProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+message GraphDebugInfo {
+  // This represents a file/line location in the source code.
+  message FileLineCol {
+    // File name index, which can be used to retrive the file name string from
+    // `files`. The value should be between 0 and (len(files)-1)
+    int32 file_index = 1;
+
+    // Line number in the file.
+    int32 line = 2;
+
+    // Col number in the file line.
+    int32 col = 3;
+
+    // Name of function contains the file line.
+    string func = 4;
+
+    // Source code contained in this file line.
+    string code = 5;
+  }
+
+  // This represents a stack trace which is a ordered list of `FileLineCol`.
+  message StackTrace {
+    // Each line in the stack trace.
+    repeated FileLineCol file_line_cols = 1;
+  }
+
+  // This stores all the source code file names and can be indexed by the
+  // `file_index`.
+  repeated string files = 1;
+
+  // This maps a node name to a stack trace in the source code.
+  map<string, StackTrace> traces = 2;
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index c104463c51c7e7be02430c7750ebacee60ed50e4..4a998c5bfcd29a23df01aca6feca827afebd3258 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -16,11 +16,13 @@ limitations under the License.
 syntax = "proto3";
 
 package tensorflow;
+
 option cc_enable_arenas = true;
 option java_outer_classname = "DistributedRuntimeProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+
+// add go_package externally with copybara
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
@@ -138,6 +140,11 @@ message RunStepRequest {
   // response body. This is a workaround since the RPC subsystem may
   // truncate long metadata messages.
   bool store_errors_in_response_body = 7;
+
+  // Unique identifier for this request. Every RunStepRequest must
+  // have a unique request_id, and retried RunStepRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 8;
 }
 
 message RunStepResponse {
@@ -183,6 +190,11 @@ message PartialRunSetupRequest {
   // Target Nodes. A list of node names. The named nodes will be run in future
   // steps, but their outputs will not be fetched.
   repeated string target = 4;
+
+  // Unique identifier for this request. Every PartialRunSetupRequest must
+  // have a unique request_id, and retried PartialRunSetupRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 5;
 }
 
 message PartialRunSetupResponse {
@@ -204,8 +216,7 @@ message CloseSessionRequest {
   string session_handle = 1;
 }
 
-message CloseSessionResponse {
-}
+message CloseSessionResponse {}
 
 // Reset() allows misbehaving or slow sessions to be aborted and closed, and
 // causes their resources eventually to be released.  Reset() does not wait
@@ -237,8 +248,7 @@ message ResetRequest {
   repeated string device_filters = 2;
 }
 
-message ResetResponse {
-}
+message ResetResponse {}
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -279,6 +289,11 @@ message MakeCallableRequest {
 
   // Options that define the behavior of the created callable.
   CallableOptions options = 2;
+
+  // Unique identifier for this request. Every MakeCallableRequest must
+  // have a unique request_id, and retried MakeCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 3;
 }
 
 message MakeCallableResponse {
@@ -303,6 +318,11 @@ message RunCallableRequest {
   // Values of the tensors passed as arguments to the callable, in the order
   // defined in the CallableOptions.feed field passed to MakeCallable.
   repeated TensorProto feed = 3;
+
+  // Unique identifier for this request. Every RunCallableRequest must
+  // have a unique request_id, and retried RunCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 4;
 }
 
 message RunCallableResponse {
@@ -330,5 +350,4 @@ message ReleaseCallableRequest {
   int64 handle = 2;
 }
 
-message ReleaseCallableResponse {
-}
+message ReleaseCallableResponse {}
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index 75a2a88ed72cd909f607286b574b0c343c6268f6..fa0192cf67c500994e5dd976c414c248b3a321a2 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -12,6 +12,7 @@ import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/op_def.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/protobuf/saved_object_graph.proto";
 import "tensorflow/core/protobuf/saver.proto";
 
 // NOTE: This protocol buffer is evolving, and will go through revisions in the
@@ -84,6 +85,9 @@ message MetaGraphDef {
 
   // Asset file def to be used with the defined graph.
   repeated AssetFileDef asset_file_def = 6;
+
+  // Extra information about the structure of functions and stateful objects.
+  SavedObjectGraph object_graph_def = 7;
 }
 
 // CollectionDef should cover most collections.
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 515d673828e3792ac6f4268fd55b58e43aab509b..0978a8257bd4726dfb06b2cb2d1fdd475ee9b1ed 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -5,9 +5,10 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+// add go_package externally with copybara
 
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/verifier_config.proto";
 
 message AutoParallelOptions {
   bool enable = 1;
@@ -77,6 +78,9 @@ message RewriterConfig {
   Toggle scoped_allocator_optimization = 15;
   // Force small ops onto the CPU (default is OFF).
   Toggle pin_to_host_optimization = 18;
+  // Enable the swap of kernel implementations based on the device placement
+  // (default is ON).
+  Toggle implementation_selector = 22;
   // Disable the entire meta optimizer (off by default).
   bool disable_meta_optimizer = 19;
 
@@ -166,4 +170,11 @@ message RewriterConfig {
 
   // list of CustomGraphOptimizers to apply.
   repeated CustomGraphOptimizer custom_optimizers = 200;
+
+  // VerifierConfig specifying the verifiers to be run after every optimizer.
+  VerifierConfig inter_optimizer_verifier_config = 300;
+
+  // VerifierConfig specifying the verifiers to be run at the end, after all
+  // optimizers have run.
+  VerifierConfig post_optimization_verifier_config = 301;
 }
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/core/protobuf/saved_object_graph.proto
similarity index 62%
rename from tensorflow/python/saved_model/saved_object_graph.proto
rename to tensorflow/core/protobuf/saved_object_graph.proto
index 6f5a952083473961ec30a1a6b29e910733e1f198..48060b33dc42ca74c0464dbce20b0e88ac4a30e6 100644
--- a/tensorflow/python/saved_model/saved_object_graph.proto
+++ b/tensorflow/core/protobuf/saved_object_graph.proto
@@ -1,10 +1,10 @@
 syntax = "proto3";
 
-import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
+import "tensorflow/core/protobuf/trackable_object_graph.proto";
+import "tensorflow/core/protobuf/struct.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/versions.proto";
-import "tensorflow/python/saved_model/struct.proto";
 
 option cc_enable_arenas = true;
 
@@ -14,30 +14,32 @@ package tensorflow;
 // describes the directed graph of Python objects (or equivalent in other
 // languages) that make up a model, with nodes[0] at the root.
 
-// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
-// ObjectGraph belongs to the SavedModel and contains pointers to functions and
-// type information, while CheckpointableObjectGraph lives in the checkpoint and
-// contains pointers only to variable values.
-
-// NOTE: This protocol buffer format is experimental and subject to change.
+// SavedObjectGraph shares some structure with TrackableObjectGraph, but
+// SavedObjectGraph belongs to the MetaGraph and contains pointers to functions
+// and type information, while TrackableObjectGraph lives in the checkpoint
+// and contains pointers only to variable values.
 
 message SavedObjectGraph {
-  // List of objects in the SavedModel.
+  // Flattened list of objects in the object graph.
   //
   // The position of the object in this list indicates its id.
   // Nodes[0] is considered the root node.
   repeated SavedObject nodes = 1;
+
+  // Information about captures and output structures in concrete functions.
+  // Referenced from SavedBareConcreteFunction and SavedFunction.
+  map<string, SavedConcreteFunction> concrete_functions = 2;
 }
 
 message SavedObject {
   // Objects which this object depends on: named edges in the dependency
   // graph.
   //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.ObjectReference
       children = 1;
 
-  // Removed when forking from CheckpointableObjectGraph.
+  // Removed when forking SavedObject from TrackableObjectGraph.
   reserved "attributes";
   reserved 2;
 
@@ -45,8 +47,8 @@ message SavedObject {
   // (optimizer, variable, slot variable) relationship; none of the three
   // depend on the others directly.
   //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.SlotVariableReference
       slot_variables = 3;
 
   oneof kind {
@@ -54,7 +56,9 @@ message SavedObject {
     SavedAsset asset = 5;
     SavedFunction function = 6;
     SavedVariable variable = 7;
-    SavedConcreteFunction concrete_function = 8;
+    SavedBareConcreteFunction bare_concrete_function = 8;
+    SavedConstant constant = 9;
+    SavedResource resource = 10;
   }
 }
 
@@ -71,11 +75,11 @@ message SavedUserObject {
   VersionDef version = 2;
 }
 
-// A SavedAsset represents a file in a SavedModel.
+// A SavedAsset points to an asset in the MetaGraph.
 //
-// When bound to a function this object evaluates to a Variable from which the
-// absolute filename can be read. Users should not expect the filename to be
-// maintained.
+// When bound to a function this object evaluates to a tensor with the absolute
+// filename. Users should not depend on a particular part of the filename to
+// remain stable (e.g. basename could be changed).
 message SavedAsset {
   // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
   //
@@ -86,14 +90,13 @@ message SavedAsset {
 
 // A function with multiple signatures, possibly with non-Tensor arguments.
 message SavedFunction {
-  repeated SavedConcreteFunction concrete_function = 1;
+  repeated string concrete_functions = 1;
   FunctionSpec function_spec = 2;
 }
 
+// Stores low-level information about a concrete function. Referenced in either
+// a SavedFunction or a SavedBareConcreteFunction.
 message SavedConcreteFunction {
-  // A reference to a TensorFlow function in the MetaGraph's FunctionDefLibrary
-  string name = 1;
-
   // Bound inputs to the function. The SavedObjects identified by the node ids
   // given here are appended as extra inputs to the caller-supplied inputs.
   // The only types of SavedObjects valid here are SavedVariable, SavedResource
@@ -108,14 +111,27 @@ message SavedConcreteFunction {
   StructuredValue output_signature = 4;
 }
 
+message SavedBareConcreteFunction {
+  // Identifies a SavedConcreteFunction.
+  string concrete_function_name = 1;
+
+  // A sequence of unique strings, one per Tensor argument.
+  repeated string argument_keywords = 2;
+  // The prefix of `argument_keywords` which may be identified by position.
+  int64 allowed_positional_arguments = 3;
+}
+
+message SavedConstant {
+  // An Operation name for a ConstantOp in this SavedObjectGraph's MetaGraph.
+  string operation = 1;
+}
+
 // Represents a Variable that is initialized by loading the contents from the
-// SavedModel checkpoint.
+// checkpoint.
 message SavedVariable {
   DataType dtype = 1;
   TensorShapeProto shape = 2;
   bool trainable = 3;
-
-  // TODO(andresp): Add save_slice_info_def?
 }
 
 // Represents `FunctionSpec` used in `Function`. This represents a
@@ -134,3 +150,9 @@ message FunctionSpec {
   // The input signature, if specified.
   StructuredValue input_signature = 5;
 }
+
+// A SavedResource represents a TF object that holds state during its lifetime.
+message SavedResource {
+  // An object of this type can have a reference to a:
+  // create_resource() and an initialize() function.
+}
diff --git a/tensorflow/python/saved_model/struct.proto b/tensorflow/core/protobuf/struct.proto
similarity index 64%
rename from tensorflow/python/saved_model/struct.proto
rename to tensorflow/core/protobuf/struct.proto
index fd7db84e05d60b1b45df1960b253e1a6661aa186..55b9b520a89a41b066fa2958a4aedf5914dc247a 100644
--- a/tensorflow/python/saved_model/struct.proto
+++ b/tensorflow/core/protobuf/struct.proto
@@ -8,6 +8,27 @@ package tensorflow;
 // `StructuredValue` represents a dynamically typed value representing various
 // data structures that are inspired by Python data structures typically used in
 // TensorFlow functions as inputs and outputs.
+//
+// For example when saving a Layer there may be a `training` argument. If the
+// user passes a boolean True/False, that switches between two concrete
+// TensorFlow functions. In order to switch between them in the same way after
+// loading the SavedModel, we need to represent "True" and "False".
+//
+// A more advanced example might be a function which takes a list of
+// dictionaries mapping from strings to Tensors. In order to map from
+// user-specified arguments `[{"a": tf.constant(1.)}, {"q": tf.constant(3.)}]`
+// after load to the right saved TensorFlow function, we need to represent the
+// nested structure and the strings, recording that we have a trace for anything
+// matching `[{"a": tf.TensorSpec(None, tf.float32)}, {"q": tf.TensorSpec([],
+// tf.float64)}]` as an example.
+//
+// Likewise functions may return nested structures of Tensors, for example
+// returning a dictionary mapping from strings to Tensors. In order for the
+// loaded function to return the same structure we need to serialize it.
+//
+// This is an ergonomic aid for working with loaded SavedModels, not a promise
+// to serialize all possible function signatures. For example we do not expect
+// to pickle generic Python objects, and ideally we'd stay language-agnostic.
 message StructuredValue {
   // The kind of value.
   oneof kind {
@@ -29,11 +50,11 @@ message StructuredValue {
     // Represents a boolean value.
     bool bool_value = 14;
 
-    // Represents a tf.TensorShape.
+    // Represents a TensorShape.
     tensorflow.TensorShapeProto tensor_shape_value = 31;
-    // Represents an enum value for tf.DType.
+    // Represents an enum value for dtype.
     tensorflow.DataType tensor_dtype_value = 32;
-    // Represents a value for tf.TensorShape.
+    // Represents a value for tf.TensorSpec.
     TensorSpecProto tensor_spec_value = 33;
 
     // Represents a list of `Value`.
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/core/protobuf/tpu/BUILD
similarity index 88%
rename from tensorflow/contrib/tpu/proto/BUILD
rename to tensorflow/core/protobuf/tpu/BUILD
index c20cab844cfaf083be2702a29ac2a152c7b72c2a..ea98ee25c89e1b7bef39276bae5c98bf382dbd7f 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/core/protobuf/tpu/BUILD
@@ -49,6 +49,15 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library(
+    name = "dynamic_padding_proto",
+    srcs = [
+        "dynamic_padding.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library_py(
     name = "compilation_result_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/proto/compilation_result.proto b/tensorflow/core/protobuf/tpu/compilation_result.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/compilation_result.proto
rename to tensorflow/core/protobuf/tpu/compilation_result.proto
diff --git a/tensorflow/core/protobuf/tpu/dynamic_padding.proto b/tensorflow/core/protobuf/tpu/dynamic_padding.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c9ebf181169a583d774ef77ca0b8c243ce733615
--- /dev/null
+++ b/tensorflow/core/protobuf/tpu/dynamic_padding.proto
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.tpu;
+
+// A mapping between the dynamic shape dimension of an input and the arg that
+// represents the real shape.
+message PaddingMap {
+  // Input arg index with dynamic shapes.
+  int32 arg_index = 1;
+
+  // The dynamic shape dimension index.
+  int32 shape_index = 2;
+
+  // The arg index that dynamic dimension maps to, which represents the value
+  // of the real shape.
+  int32 padding_arg_index = 3;
+}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
similarity index 69%
rename from tensorflow/contrib/tpu/proto/optimization_parameters.proto
rename to tensorflow/core/protobuf/tpu/optimization_parameters.proto
index ddb3b15ecd5b1309deb200f04ad1b649441f54e1..7d3c105eec3bda9976071831beab8f036df3771a 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
@@ -89,11 +89,11 @@ message FtrlParameters {
 // the normal version of Adam that updates all parameters in the embedding
 // table, even for entries that are not used in the current minibatch
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
-// use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
-// order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future). If use_sum_inside_sqrt is set, the Adam
-// variable update formula will be changed from m / (sqrt(v) + epsilon) to
-// m / sqrt(v + epsilon**2); this option improves the performance of TPU
+// use_non_lazy_adam is enabled, gradient accumulation is also required to be
+// enabled in order to get correct results; a warning will be printed otherwise
+// (which may change to an error in the future). If use_sum_inside_sqrt is set,
+// the Adam variable update formula will be changed from m / (sqrt(v) + epsilon)
+// to m / sqrt(v + epsilon**2); this option improves the performance of TPU
 // training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
@@ -170,6 +170,54 @@ message ProximalAdagradParameters {
   float initial_accumulator = 3;
 }
 
+// Status of using gradient accumulation (doing two passes over the input
+// gradients: one to accumulate them into a temporary array and another to apply
+// them using the actual optimization algorithm). The extra message is to wrap
+// the enum for scoping.
+message GradientAccumulationStatus {
+  // if UNSPECIFIED (default), gradient accumulation is ENABLED.
+  enum Status {
+    UNSPECIFIED = 0;
+    ENABLED = 1;
+    DISABLED = 2;
+  }
+};
+
+// Configuration proto for hot ID optimization. This is an experimental feature
+// that is currently disabled (by default).
+message HotIdOptimizerConfiguration {
+  // Whether to enable or disable hot ID optimization.
+  // If UNSPECIFIED (default), hot ID optimization is DISABLED.
+  enum Status {
+    UNSPECIFIED = 0;
+    ENABLED = 1;
+    DISABLED = 2;
+  }
+  Status status = 1;
+
+  // The following fields are never expected to be set by the TF model. However,
+  // a TF model could set them if it chooses to. If the fields are not set,
+  // meaningful default values will be chosen by the TPU software.
+
+  // Frequency above which an embedding ID is classified as hot. The valid
+  // range for the frequency is [0.0, 1.0]. The frequency of an embedding ID is
+  // defined as the ratio of the number of lookups for that ID to the total
+  // number of lookups for the embedding table.
+  float frequency_threshold = 2;
+
+  // The maximum number of hot IDs for the embedding table. If greater than
+  // max_id_count hot IDs exist for the table, the IDs with the highest
+  // frequencies are chosen.
+  int32 max_id_count = 3;
+
+  // The maximum number of slots reserved in HBM (across the entire TPU system)
+  // for storing the replicas of hot IDs for the embedding table. In future, the
+  // number of replicas for a particular hot ID could be adjusted based on its
+  // frequency. The max_slot_count value captures the total number of replicas
+  // across all hot IDs for the table.
+  int32 max_slot_count = 4;
+}
+
 message OptimizationParameters {
   // Learning rate used for updating the embedding layer parameters.
   LearningRate learning_rate = 13;
@@ -191,12 +239,14 @@ message OptimizationParameters {
   // once per minibatch.
   float weight_decay_factor = 16;
 
-  // Whether to use gradient accumulation (do two passes over the input
+  // Status of using gradient accumulation (doing two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
-  // apply them using the actual optimization algorithm). This feature is
-  // experimental -- it has not been fully verified and may cause training
-  // crashes and/or failures.
-  bool use_gradient_accumulation = 15;
+  // apply them using the actual optimization algorithm).
+  GradientAccumulationStatus.Status gradient_accumulation_status = 17;
+
+  // Configuration proto for hot ID optimization. This is an experimental
+  // feature that is currently disabled (by default).
+  HotIdOptimizerConfiguration hot_id_optimizer_configuration = 18;
 
   // Optimization algorithm parameters; which field is selected determines which
   // algorithm to use.
@@ -212,10 +262,13 @@ message OptimizationParameters {
     AdadeltaParameters adadelta = 12;
     ProximalAdagradParameters proximal_adagrad = 14;
   }
+
+  reserved 15;  // Old use_gradient_accumulation.
 }
 
 // Specification of an optimization algorithm's state variables (both the main
-// value vector and any extra accumulators, etc.).
+// value vector and any extra accumulators, etc.). This proto is only used
+// internally by the TPU software and is not exposed directly to the TF model.
 message StateVariableSpecification {
   // Parameter name for the state variable.
   string name = 1;
@@ -223,6 +276,20 @@ message StateVariableSpecification {
   // A normal state variable that should be saved and restored in checkpoints
   // and used as an input or output to non-debug TensorFlow ops.
   message UserDefined {
+    // For padding embedding rows, this field specifies the initial value to be
+    // used. Separate initial values need to be specified for the embeddings and
+    // any extra accumulators. The initial values should be specified so as to
+    // maintain two invariants during model training:
+    // (1) The embedding vector multiplied by zero returns a vector containing
+    //     all zeros. To maintain this invariant, the embedding values should
+    //     never be NaNs or +-infinity.
+    // (2) Repeatedly applying the optimizer using a gradient vector of all
+    //     zeros does not cause the embeddings or slot variables to become NaNs
+    //     or +-infinity.
+    // The padding row is looked up when no embedding IDs are present for a
+    // feature. The semantics of embedding lookup dictate that the output must
+    // be zero under this scenario.
+    double padding_initial_value = 1;
   }
 
   // A state variable that should be filled with a constant and normally hidden
diff --git a/tensorflow/contrib/tpu/proto/topology.proto b/tensorflow/core/protobuf/tpu/topology.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/topology.proto
rename to tensorflow/core/protobuf/tpu/topology.proto
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
similarity index 94%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
index da19b135d7497d1bd5d2e212cab97db78c756cad..22be27795c78e940d439fde5ca21c66791444873 100644
--- a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
+++ b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
@@ -2,8 +2,8 @@ syntax = "proto3";
 
 package tensorflow.tpu;
 
-import "tensorflow/contrib/tpu/proto/optimization_parameters.proto";
-import "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto";
+import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";
+import "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto";
 
 message TPUEmbeddingConfiguration {
   // Description of the various embedding tables.
@@ -50,8 +50,8 @@ message TPUEmbeddingConfiguration {
   // contiguous manner. In this case, 13 ids are split across 5 hosts as:
   // [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]].
   // In both the strategies, if the id space does not evenly divide the number
-  // of hosts, each of the first "table_descriptor.num_ids % num_hosts" hosts
-  // will be assigned one more id.
+  // of hosts, each of the first "table_descriptor.vocabulary_size % num_hosts"
+  // hosts will be assigned one more id.
   // This partitioning strategy exactly follows that in the embedding_lookup
   // TensorFlow function at tensorflow/python/ops/embedding_ops.py.
   enum ShardingStrategy {
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
diff --git a/tensorflow/core/protobuf/checkpointable_object_graph.proto b/tensorflow/core/protobuf/trackable_object_graph.proto
similarity index 84%
rename from tensorflow/core/protobuf/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/trackable_object_graph.proto
index f2956404b5e0d384f8fcec391ac0ac6c8b583a5e..02d852e6f3df024fa35bf9e4d05af5f2f8d568a5 100644
--- a/tensorflow/core/protobuf/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/trackable_object_graph.proto
@@ -8,10 +8,10 @@ package tensorflow;
 // own variables, allowing for more robust checkpoint loading into modified
 // programs.
 
-message CheckpointableObjectGraph {
-  message CheckpointableObject {
+message TrackableObjectGraph {
+  message TrackableObject {
     message ObjectReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // An index into `TrackableObjectGraph.nodes`, indicating the object
       // being referenced.
       int32 node_id = 1;
       // A user-provided name for the edge.
@@ -37,12 +37,12 @@ message CheckpointableObjectGraph {
     }
 
     message SlotVariableReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // variable object this slot was created for.
       int32 original_variable_node_id = 1;
       // The name of the slot (e.g. "m"/"v").
       string slot_name = 2;
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // `Object` with the value of the slot variable.
       int32 slot_variable_node_id = 3;
     }
@@ -55,5 +55,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated CheckpointableObject nodes = 1;
+  repeated TrackableObject nodes = 1;
 }
diff --git a/tensorflow/core/protobuf/verifier_config.proto b/tensorflow/core/protobuf/verifier_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..207f0f2a974cbc58413490380edf3795c7206aba
--- /dev/null
+++ b/tensorflow/core/protobuf/verifier_config.proto
@@ -0,0 +1,26 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "VerifierConfigProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+// add go_package externally with copybara
+
+// The config for graph verifiers.
+message VerifierConfig {
+  enum Toggle {
+    DEFAULT = 0;
+    ON = 1;
+    OFF = 2;
+  }
+
+  // Deadline for completion of all verification i.e. all the Toggle ON
+  // verifiers must complete execution within this time.
+  int64 verification_timeout_in_ms = 1;
+
+  // Perform structural validation on a tensorflow graph. Default is OFF.
+  Toggle structure_verifier = 2;
+
+  // Next tag: 3
+}
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 74058c846530bc2b4577d18034d02ed002d8983f..4284dd119edf3167915942c6458827ebb7191ad5 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -535,6 +535,7 @@ message CompleteInstanceRequest {
 message CompleteInstanceResponse {
   int32 instance_key = 1;
   int32 source_rank = 2;
+  bytes communicator_key = 3;
 }
 
 // Request for next agreed-upon step_id for the specified graph_keys.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index a55fe17dd5fa6f7ba7c0eaebb345c69f9dce2a5c..40b101fb9178349c167f333760acc45f1591cead 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,8 +19,8 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 12
-#define TF_PATCH_VERSION 0
+#define TF_MINOR_VERSION 13
+#define TF_PATCH_VERSION 1
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
diff --git a/tensorflow/contrib/tpu/utils/BUILD b/tensorflow/core/tpu/BUILD
similarity index 82%
rename from tensorflow/contrib/tpu/utils/BUILD
rename to tensorflow/core/tpu/BUILD
index c27b73728702dcb1c84a82d3a07d15978ed2710f..5cbed402f75bf7ecf67ea06a2ad8d89260d7c1d1 100644
--- a/tensorflow/contrib/tpu/utils/BUILD
+++ b/tensorflow/core/tpu/BUILD
@@ -8,9 +8,9 @@ cc_library(
     hdrs = ["tpu_embedding_optimization_parameters_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_cc",
         "@com_google_absl//absl/base",
     ],
 )
@@ -21,10 +21,10 @@ cc_library(
     hdrs = ["tpu_embedding_output_layout_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_cc",
     ],
 )
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
similarity index 80%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
index 76cb5531cd0bc3a375d1434c31fa14a9d7f42476..71766f6f03767fc767cf585631a86001c61cdfa4 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -44,6 +44,7 @@ string GetOptimizationAlgorithmName(OptimizationAlgorithm alg) {
     case OptimizationAlgorithm::PARAMETERS_NOT_SET:
       return "*** Not set ***";
   }
+  return "*** Not set ***";
 }
 
 string GetOptimizationAlgorithmFriendlyName(OptimizationAlgorithm alg) {
@@ -71,6 +72,7 @@ string GetOptimizationAlgorithmFriendlyName(OptimizationAlgorithm alg) {
     case OptimizationAlgorithm::PARAMETERS_NOT_SET:
       return "unknown (not specified)";
   }
+  return "unknown (not specified)";
 }
 
 // Returns the number of optimization parameter vectors used by the optimization
@@ -111,6 +113,7 @@ Status GetBaseAuxiliaryParameterCount(OptimizationAlgorithm alg, int* count) {
     case OptimizationAlgorithm::PARAMETERS_NOT_SET:
       return errors::InvalidArgument("No optimization algorithm specified");
   }
+  return errors::InvalidArgument("No optimization algorithm specified");
 }
 
 Status GetGradientAccumulationSupport(OptimizationAlgorithm alg,
@@ -134,12 +137,16 @@ Status GetGradientAccumulationSupport(OptimizationAlgorithm alg,
   }
 }
 namespace {
-// Make a normal state variable specification.
+// Make a normal state variable specification. Please refer to
+// //tensorflow/core/protobuf/tpu/optimization_parameters.proto
+// (StateVariableSpecification message) for instructions on how to set the
+// padding_initial_value field.
 StateVariableSpecification MakeStandardStateVariableSpecification(
-    const string& name) {
+    const string& name, double padding_initial_value) {
   StateVariableSpecification result;
   result.set_name(name);
-  result.mutable_user_defined();
+  result.mutable_user_defined()->set_padding_initial_value(
+      padding_initial_value);
   return result;
 }
 }  // namespace
@@ -149,14 +156,14 @@ Status GetOptimizationAlgorithmStateVariables(
     std::vector<StateVariableSpecification>* state_variables) {
   // The first parameter set is always the weights themselves.
   state_variables->push_back(
-      MakeStandardStateVariableSpecification("parameters"));
+      MakeStandardStateVariableSpecification("parameters", 0.0));
   // The order of the returned parameters needs to match the offsets used by
   // the algorithm implementations in test_util.cc and
   // address_handler_program_creator.cc.
   switch (alg) {
     case OptimizationAlgorithm::kAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::kStochasticGradientDescent: {
@@ -165,53 +172,58 @@ Status GetOptimizationAlgorithmStateVariables(
     }
     case OptimizationAlgorithm::kFtrl: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("linears"));
+          MakeStandardStateVariableSpecification("linears", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdam: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("velocities"));
+          MakeStandardStateVariableSpecification("velocities", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMomentum: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       break;
     }
     case OptimizationAlgorithm::kRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
       break;
     }
     case OptimizationAlgorithm::kCenteredRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mg"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mg", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMdlAdagradLight: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("weights"));
+          MakeStandardStateVariableSpecification("weights", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("benefits"));
+          MakeStandardStateVariableSpecification("benefits", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdadelta: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("updates"));
+          MakeStandardStateVariableSpecification("updates", 0.0));
       break;
     }
     case OptimizationAlgorithm::kProximalAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::PARAMETERS_NOT_SET: {
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
similarity index 91%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
index 81d50264edb93e889d736c62a493b058e2f1bd56..ceb07ff35510ae3b034ad391456e5a8a21fa4240 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
 
 #include <string>
 #include "absl/base/casts.h"
-#include "tensorflow/contrib/tpu/proto/optimization_parameters.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/optimization_parameters.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -87,4 +87,4 @@ const float kGradientAccumulatorInitialValue = absl::bit_cast<float, uint32>(1);
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
similarity index 96%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
index 8480ec4b8bb98e867db3e4e4ed14d4cc529efe49..3a027757af7cb90d465e230b9934a4214888c4f1 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.pb.h"
 
 namespace tensorflow {
 namespace tpu {
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
similarity index 81%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
index c10fbeeff2b5af93a118902c0afb3b59cc1a9d60..5bff401b9d2d37f35086fb7c8a39c62d79d7daa9 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -35,4 +35,4 @@ Status ComputeOutputTensorShapes(
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index 6d73c38e3c904458e7438915d5fe35db9f4c8fc8..2d647fd8d86866c93f2a3890e3e40e7f70f670f5 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -105,15 +105,21 @@ class BCast {
   static Vec FromShape(const TensorShape& shape);
   static TensorShape ToShape(const BCast::Vec& vec);
 
-  template <int NDIMS>
-  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+  template <typename IndexType, int NDIMS>
+  static Eigen::array<IndexType, NDIMS> ToIndexArrayType(
       const BCast::Vec& vec) {
     CHECK_EQ(vec.size(), NDIMS);
-    Eigen::array<Eigen::DenseIndex, NDIMS> ret;
+    Eigen::array<IndexType, NDIMS> ret;
     for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i];
     return ret;
   }
 
+  template <int NDIMS>
+  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+      const BCast::Vec& vec) {
+    return ToIndexArrayType<Eigen::DenseIndex, NDIMS>(vec);
+  }
+
  private:
   bool valid_ = true;
   Vec x_reshape_;
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index f1196fdfec213c286a489b948aa7e17580048f95..d919adb32f43f23c76b940e641d55e1b7ece7353 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cinttypes>
+#include <cstring>
 #include <string>
 #include <vector>
 
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 2b035ab0e9c8500931665890a637ea6f3242ba22..35f8d13f754d5c2de5a37df3d77e81cdb3e7279a 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -17,8 +17,10 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include <numeric>
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 #define CUDA_EXPECT_SUCCESS                                 \
   {                                                         \
@@ -152,22 +154,24 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
 // test valid inputs
 #define TEST_LAUNCH_PARAMETER(work_element_count)                              \
   cfg = GetCudaLaunchConfig(bufsize, d);                                       \
-  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
-      cfg, outbuf);                                                            \
+  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
+                               cfg.thread_per_block, 0, d.stream(), cfg,       \
+                               outbuf));                                       \
   CUDA_ASSERT_SUCCESS                                                          \
   cfg = GetCudaLaunchConfig(work_element_count, d);                            \
-  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
-      cfg, bufsize, outbuf);                                                   \
+  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
+                               0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
                                                                                \
   cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
-  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
-      cfg, outbuf);                                                            \
+  TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count,                 \
+                               cfg.thread_per_block, 0, d.stream(), cfg,       \
+                               outbuf));                                       \
   CUDA_ASSERT_SUCCESS                                                          \
   cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
-  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
-      cfg, bufsize, outbuf);                                                   \
+  TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
+                               0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
   EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
 
@@ -201,25 +205,29 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   CudaLaunchConfig cfg1d;
 
 // test valid inputs
-#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                      \
-  cfg1d = GetCudaLaunchConfig(bufsize, d);                                     \
-  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
-      cfg1d, outbuf);                                                          \
-  CUDA_ASSERT_SUCCESS                                                          \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                  \
-  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
-      cfg, bufsize, outbuf);                                                   \
-  CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
-                                                                               \
-  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
-  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
-      cfg1d, outbuf);                                                          \
-  CUDA_ASSERT_SUCCESS                                                          \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                   \
-  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
-      cfg, bufsize, outbuf);                                                   \
-  CUDA_EXPECT_SUCCESS                                                          \
+#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
+  cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
+  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
+                                cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
+                                outbuf));                                     \
+  CUDA_ASSERT_SUCCESS                                                         \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
+  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
+                                cfg.thread_per_block, 0, d.stream(), cfg,     \
+                                bufsize, outbuf));                            \
+  CUDA_EXPECT_SUCCESS                                                         \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));        \
+                                                                              \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
+  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
+                                cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
+                                outbuf));                                     \
+  CUDA_ASSERT_SUCCESS                                                         \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
+  TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                     \
+                                cfg.thread_per_block, 0, d.stream(), cfg,     \
+                                bufsize, outbuf));                            \
+  CUDA_EXPECT_SUCCESS                                                         \
   EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128);
@@ -241,15 +249,17 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   CudaLaunchConfig cfg1d;
 
 // test valid inputs
-#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                                \
-  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
-  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
-      cfg1d, outbuf);                                                          \
-  CUDA_ASSERT_SUCCESS                                                          \
-  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);             \
-  Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
-      cfg, bufsize, outbuf);                                                   \
-  CUDA_EXPECT_SUCCESS                                                          \
+#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
+  TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count,             \
+                                cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
+                                outbuf));                                     \
+  CUDA_ASSERT_SUCCESS                                                         \
+  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
+  TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                     \
+                                cfg.thread_per_block, 0, d.stream(), cfg,     \
+                                bufsize, outbuf));                            \
+  CUDA_EXPECT_SUCCESS                                                         \
   EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128, 128);
@@ -271,7 +281,8 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
   unsigned* failure_count;
   ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
   *failure_count = 0;
-  CudaShuffleGetSrcLaneTest<<<1, 32>>>(failure_count);
+  TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
+                                failure_count));
   ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
   ASSERT_EQ(*failure_count, 0);
   cudaFree(failure_count);
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
index c0ae6349f755dcbd643493ccfe82374d12bc2baf..a46bd72c930fa9e8c3a9d73e33a9779a372629cd 100644
--- a/tensorflow/core/util/cuda_launch_config.h
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/base/casts.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/logging.h"
@@ -298,6 +299,52 @@ inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
   return *ptr;
 }
 
+namespace detail {
+template <typename... Ts, size_t... Is>
+std::array<void*, sizeof...(Ts)> GetArrayOfElementPointersImpl(
+    std::tuple<Ts...>* tuple, absl::index_sequence<Is...>) {
+  return {{&std::get<Is>(*tuple)...}};
+}
+// Returns an array of void pointers to the elements of the given tuple.
+template <typename... Ts>
+std::array<void*, sizeof...(Ts)> GetArrayOfElementPointers(
+    std::tuple<Ts...>* tuple) {
+  return GetArrayOfElementPointersImpl(tuple,
+                                       absl::index_sequence_for<Ts...>{});
+}
+
+template <bool...>
+struct BoolPack;
+template <bool... Bs>
+using NoneTrue = std::is_same<BoolPack<Bs..., false>, BoolPack<false, Bs...>>;
+// Returns whether none of the types in Ts is a reference.
+template <typename... Ts>
+constexpr bool NoneIsReference() {
+  return NoneTrue<(std::is_reference<Ts>::value)...>::value;
+}
+}  // namespace detail
+
+// Launches a CUDA kernel through cudaLaunchKernel with the given arguments.
+//
+// The kernel parameters 'Ts' must be constructible from the arguments 'Args'.
+template <typename... Ts, typename... Args>
+Status CudaLaunchKernel(void (*function)(Ts...), dim3 grid_dim, dim3 block_dim,
+                        size_t shared_memory_size_bytes, cudaStream_t stream,
+                        Args... arguments) {
+  static_assert(detail::NoneIsReference<Ts...>(),
+                "Kernels with reference arguments have undefined behaviour.");
+  // Cast arguments and forward them as an array of pointers.
+  auto args_tuple = std::tuple<Ts...>(arguments...);
+  auto arg_ptrs = detail::GetArrayOfElementPointers(&args_tuple);
+  auto func_ptr = absl::bit_cast<const void*>(function);
+  auto result = cudaLaunchKernel(func_ptr, grid_dim, block_dim, arg_ptrs.data(),
+                                 shared_memory_size_bytes, stream);
+  if (result != cudaSuccess) {
+    return errors::Internal(cudaGetErrorString(result));
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index cb088faec1ece7cffde4499df900be9d8dd16bc5..56e618872a71e190cbec9c1cd33e1b246a1c9e08 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -289,6 +289,30 @@ bool DeviceNameUtils::IsSpecification(const ParsedName& less_specific,
   return true;
 }
 
+void DeviceNameUtils::EnsureSpecification(ParsedName* more_specific,
+                                          const ParsedName& less_specific) {
+  if (less_specific.has_job) {
+    more_specific->has_job = true;
+    more_specific->job = less_specific.job;
+  }
+  if (less_specific.has_replica) {
+    more_specific->has_replica = true;
+    more_specific->replica = less_specific.replica;
+  }
+  if (less_specific.has_task) {
+    more_specific->has_task = true;
+    more_specific->task = less_specific.task;
+  }
+  if (less_specific.has_type) {
+    more_specific->has_type = true;
+    more_specific->type = less_specific.type;
+  }
+  if (less_specific.has_id) {
+    more_specific->has_id = true;
+    more_specific->id = less_specific.id;
+  }
+}
+
 /* static */
 bool DeviceNameUtils::IsCompleteSpecification(const ParsedName& pattern,
                                               const ParsedName& name) {
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index bb5e2b3f0c42b321bc7ab45cdad2ec951671be96..b047e814bd694a775af0487365c85ce02ad573fd 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -110,6 +110,11 @@ class DeviceNameUtils {
   static bool IsSpecification(const ParsedName& less_specific,
                               const ParsedName& more_specific);
 
+  // Makes minimal changes to more_specific so that it becomes a
+  // specification of less_specific.
+  static void EnsureSpecification(ParsedName* more_specific,
+                                  const ParsedName& less_specific);
+
   // Like IsSpecification, but the second argument "name" must have a
   // non-wildcard value for all of its components.
   static bool IsCompleteSpecification(const ParsedName& pattern,
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index 3cc75bbd1f353183184462ec9495c0492cf1442b..4c29bd582e6688b5128b79841b4b7479671a2cbf 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/casts.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb_text.h"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/presized_cuckoo_map.h"
@@ -161,10 +163,30 @@ class Feature {
         if (!stream.ReadVarint32(&packed_length)) return false;
         auto packed_limit = stream.PushLimit(packed_length);
 
-        while (!stream.ExpectAtEnd()) {
-          uint32 buffer32;
-          if (!stream.ReadLittleEndian32(&buffer32)) return false;
-          float_list->push_back(absl::bit_cast<float>(buffer32));
+        // If the result data type is float and we are on a little endian
+        // machine then we can simply memcpy the data from the proto into the
+        // result vector.
+        constexpr int32 kNumFloatBytes = 4;
+        if (port::kLittleEndian &&
+            sizeof(typename Result::value_type) == kNumFloatBytes) {
+          // Store the initial size to know the offset we have to start writing
+          // data from before resizing the output "vector".
+          const size_t initial_size = float_list->size();
+          float_list->resize(initial_size + packed_length / kNumFloatBytes);
+          // Calculate the length of the buffer available what can be less than
+          // what we requested in resize in case of a LimitedArraySlice.
+          const uint32 bytes_to_copy =
+              std::min(static_cast<uint32>((float_list->size() - initial_size) *
+                                           kNumFloatBytes),
+                       packed_length);
+          if (!stream.ReadRaw(float_list->data() + initial_size, bytes_to_copy))
+            return false;
+        } else {
+          while (!stream.ExpectAtEnd()) {
+            uint32 buffer32;
+            if (!stream.ReadLittleEndian32(&buffer32)) return false;
+            float_list->push_back(absl::bit_cast<float>(buffer32));
+          }
         }
 
         stream.PopLimit(packed_limit);
@@ -448,8 +470,10 @@ struct SeededHasher {
 template <typename T>
 class LimitedArraySlice {
  public:
+  using value_type = T;
+
   LimitedArraySlice(T* begin, size_t num_elements)
-      : current_(begin), end_(begin + num_elements) {}
+      : current_(begin), begin_(begin), end_(begin + num_elements) {}
 
   // May return negative if there were push_back calls after slice was filled.
   int64 EndDistance() const { return end_ - current_; }
@@ -462,8 +486,21 @@ class LimitedArraySlice {
     ++current_;
   }
 
+  // Returns the number of elements in the slice.
+  size_t size() const { return std::min(current_ - begin_, end_ - begin_); }
+
+  // Attempts to resize the vector to the given size. It does so by advancing
+  // the pointer to the current element, possibly beyond the end of the slice.
+  // As a consequence, calling `size()` after `resize(x)` was called might
+  // return a value less than `x`.
+  void resize(size_t size) { current_ = begin_ + size; }
+
+  // Returns the pointer to the underlying data buffer.
+  T* data() { return begin_; }
+
  private:
   T* current_;
+  T* begin_;
   T* end_;
 };
 
@@ -1727,9 +1764,13 @@ Status FastParseSequenceExample(
   DCHECK(context_result != nullptr);
   DCHECK(feature_list_result != nullptr);
   DCHECK(dense_feature_lengths != nullptr);
-  std::map<StringPiece, bool> context_is_sparse;
-  std::map<StringPiece, std::pair<DataType, size_t>>
+  size_t num_context_features =
+      context_config.sparse.size() + context_config.dense.size();
+  absl::flat_hash_map<StringPiece, bool> context_is_sparse;
+  context_is_sparse.reserve(num_context_features);
+  absl::flat_hash_map<StringPiece, std::pair<DataType, size_t>>
       context_feature_type_and_lengths;
+  context_feature_type_and_lengths.reserve(num_context_features);
   if (!example_names.empty() && example_names.size() != num_examples) {
     return errors::InvalidArgument(
         "example_names must be empty or have the correct number of elements");
@@ -1757,11 +1798,14 @@ Status FastParseSequenceExample(
                                        " but expected ", c.shape.DebugString());
       }
     }
-    context_is_sparse[c.feature_name] = false;
   }
-  std::map<StringPiece, bool> sequence_is_sparse;
-  std::map<StringPiece, std::pair<DataType, size_t>>
+  size_t num_sequence_features =
+      feature_list_config.sparse.size() + feature_list_config.dense.size();
+  absl::flat_hash_map<StringPiece, bool> sequence_is_sparse;
+  sequence_is_sparse.reserve(num_sequence_features);
+  absl::flat_hash_map<StringPiece, std::pair<DataType, size_t>>
       sequence_feature_type_and_lengths;
+  sequence_feature_type_and_lengths.reserve(num_sequence_features);
   for (auto& c : feature_list_config.sparse) {
     TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
     sequence_feature_type_and_lengths[c.feature_name] =
@@ -1776,13 +1820,12 @@ Status FastParseSequenceExample(
     TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
     sequence_feature_type_and_lengths[c.feature_name] =
         std::make_pair(c.dtype, 0);
-    sequence_is_sparse[c.feature_name] = false;
   }
 
-  std::vector<std::map<StringPiece, StringPiece>> all_context_features(
-      num_examples);
-  std::vector<std::map<StringPiece, StringPiece>> all_sequence_features(
-      num_examples);
+  std::vector<absl::flat_hash_map<StringPiece, StringPiece>>
+      all_context_features(num_examples);
+  std::vector<absl::flat_hash_map<StringPiece, StringPiece>>
+      all_sequence_features(num_examples);
   const string kUnknown = "<unknown>";
   for (int d = 0; d < num_examples; d++) {
     const string& example = serialized[d];
@@ -1798,9 +1841,9 @@ Status FastParseSequenceExample(
 
     // Extract pointers to all features within this serialized example.
     while (!stream.ExpectAtEnd()) {
-      std::map<StringPiece, StringPiece>* features = nullptr;
-      const std::map<StringPiece, std::pair<DataType, size_t>>* config =
-          nullptr;
+      absl::flat_hash_map<StringPiece, StringPiece>* features = nullptr;
+      const absl::flat_hash_map<StringPiece, std::pair<DataType, size_t>>*
+          config = nullptr;
       if (stream.ExpectTag(kDelimitedTag(1))) {
         // Context
         features = context_features;
diff --git a/tensorflow/core/util/exec_on_stall.h b/tensorflow/core/util/exec_on_stall.h
index 5c8f9d2324d38dc41e2d0790be59e5392feac6db..2b296ad56d8c70c9540245ca87a91121debceaf5 100644
--- a/tensorflow/core/util/exec_on_stall.h
+++ b/tensorflow/core/util/exec_on_stall.h
@@ -27,7 +27,7 @@ namespace tensorflow {
 // is not deleted within the allotted number of seconds.
 //
 // This can be useful in diagnosing deadlocks, stalls and memory leaks
-// without logging too agressively.
+// without logging too aggressively.
 class ExecuteOnStall {
  public:
   // delay_secs: If the object still exists after this many seconds,
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 125a2e6b2340e6d4572a168395c305dc0d72056d..fcd2e18944a26ede0a099be20db08180e3dc41a9 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
+#include <list>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -34,8 +35,7 @@ limitations under the License.
 #endif
 
 #ifdef INTEL_MKL_ML_ONLY
-#error \
-    "Compiling for INTEL MKL ML only is no longer supported.Please use MKL DNN (the default option for --config=mkl)"
+#error "Please use INTEL MKL DNN (the default option for --config=mkl)."
 #endif
 
 #ifdef INTEL_MKL_ML_ONLY
@@ -86,7 +86,7 @@ namespace tensorflow {
 // For use with MKL ML, has been deprecated
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
 
-// The dimensions order that MKL DNN internally uses for 2D activations
+// The dimensions order that MKL-DNN internally uses for 2D activations
 // [Batch, Channel, Height, Width] and
 // for 2D filters [Out_Channel, In_Channel, Height, Width].
 typedef enum {
@@ -98,7 +98,7 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
-// The dimensions order that MKL DNN internally uses for 3D activations
+// The dimensions order that MKL-DNN internally uses for 3D activations
 // [Batch, Channel, Depth, Height, Width] and
 // for 3D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
@@ -130,7 +130,7 @@ typedef enum {
   TF_3DFILTER_DIM_O = 4
 } TFFilterDims3d;
 
-// The dimensions order that MKL DNN requires for the filter in a grouped
+// The dimensions order that MKL-DNN requires for the filter in a grouped
 // convolution (2D only)
 typedef enum {
   MKL_GROUP_FILTER_DIM_G = 0,
@@ -837,7 +837,6 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
       return mkl_tensor;  // return input since it is already TF tensor
 
     TensorShape output_shape = mkl_shape.GetTfShape();
-    ;
 
     // Allocate output tensor.
     context->allocate_temp(DataTypeToEnum<T>::v(), output_shape,
@@ -1582,7 +1581,7 @@ inline TensorShape MklDnnDimsToTFShape(const memory::dims& dims) {
 
 /// Function to calculate strides given tensor shape in Tensorflow order
 /// E.g., if dims_tf_order is {1, 2, 3, 4}, then as per Tensorflow convention,
-/// dimesion with size 1 is outermost dimension; while dimension with size 4 is
+/// dimension with size 1 is outermost dimension; while dimension with size 4 is
 /// innermost dimension. So strides for this tensor would be {4 * 3 * 2,
 /// 4 * 3, 4, 1}, i.e., {24, 12, 4, 1}.
 ///
@@ -1766,6 +1765,7 @@ class MklDnnData {
   inline void SetUsrMem(const memory::primitive_desc& pd,
                         void* data_buffer = nullptr) {
     CHECK_NOTNULL(cpu_engine_);
+    if (user_memory_) delete user_memory_;
     // TODO(nhasabni): can we remove dynamic memory allocation?
     if (data_buffer) {
       user_memory_ = new memory(pd, data_buffer);
@@ -2060,6 +2060,111 @@ class MklPrimitive {
 
 const mkldnn::memory::dims NONE_DIMS = {};
 
+//
+// LRUCache is a class which implements LRU (Least Recently Used) cache.
+// The implementation is similar to that of
+//    tensorflow/core/platform/cloud/expiring_lru_cache.h
+// without its thread-safe part because the cache is supposed to be
+// used as thread local (for instance, MklPrimitive caching).
+//
+// The LRU list maintains objects in chronological order based on
+// creation time, with the least recently accessed object at the
+// tail of LRU list, while the most recently accessed object
+// at the head of LRU list.
+//
+// This class is used to maintain an upper bound on the total number of
+// cached items. When the cache reaches its capacity, the LRU item will
+// be removed and replaced by a new one from SetOp call.
+//
+template <typename T>
+class LRUCache {
+ public:
+  explicit LRUCache(size_t capacity) {
+    capacity_ = capacity;
+    Clear();
+  }
+
+  T* GetOp(const string& key) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return nullptr;
+    }
+
+    // Move to the front of LRU list as the most recently accessed.
+    lru_list_.erase(it->second.lru_iterator);
+    lru_list_.push_front(it->first);
+    it->second.lru_iterator = lru_list_.begin();
+    return it->second.op;
+  }
+
+  void SetOp(const string& key, T* op) {
+    if (lru_list_.size() >= capacity_) {
+      Delete();
+    }
+
+    // Insert an entry to the front of the LRU list
+    lru_list_.push_front(key);
+    Entry entry(op, lru_list_.begin());
+    cache_.emplace(std::make_pair(key, std::move(entry)));
+  }
+
+  void Clear() {
+    if (lru_list_.empty()) return;
+
+    // Clean up the cache
+    cache_.clear();
+    lru_list_.clear();
+  }
+
+ private:
+  struct Entry {
+    // The entry's value.
+    T* op;
+
+    // A list iterator pointing to the entry's position in the LRU list.
+    std::list<string>::iterator lru_iterator;
+
+    // Constructor
+    Entry(T* op, std::list<string>::iterator it) {
+      this->op = op;
+      this->lru_iterator = it;
+    }
+
+    // Move construcctor
+    Entry(Entry&& source) noexcept
+        : lru_iterator(std::move(source.lru_iterator)) {
+      op = std::move(source.op);
+      source.op = std::forward<T*>(nullptr);
+    }
+
+    // Destructor
+    ~Entry() {
+      if (op != nullptr) delete op;
+    }
+  };
+
+  // Remove the least recently accessed entry from LRU list, which
+  // is the tail of lru_list_. Update cache_ correspondingly.
+  bool Delete() {
+    if (lru_list_.empty()) return false;
+    string key = lru_list_.back();
+    lru_list_.pop_back();
+    cache_.erase(key);
+    return true;
+  }
+
+  // Cache capacity
+  size_t capacity_;
+
+  // The cache, a map from string key to a LRU entry.
+  std::unordered_map<string, Entry> cache_;
+
+  // The LRU list of entries.
+  // The front of the list contains the key of the most recently accessed
+  // entry, while the back of the list is the least recently accessed entry.
+  std::list<string> lru_list_;
+};
+
 template <typename T>
 class MklPrimitiveFactory {
  public:
@@ -2068,23 +2173,13 @@ class MklPrimitiveFactory {
   ~MklPrimitiveFactory() {}
 
   MklPrimitive* GetOp(const string& key) {
-    auto& map = MklPrimitiveFactory<T>::GetHashMap();
-    auto stream_iter = map.find(key);
-    if (stream_iter == map.end()) {
-      return nullptr;
-    } else {
-      CHECK(stream_iter->second != nullptr) << "nullptr present in map";
-      return stream_iter->second;
-    }
+    auto& lru_cache = MklPrimitiveFactory<T>::GetLRUCache();
+    return lru_cache.GetOp(key);
   }
 
   void SetOp(const string& key, MklPrimitive* op) {
-    auto& map = MklPrimitiveFactory<T>::GetHashMap();
-    auto stream_iter = map.find(key);
-
-    CHECK(stream_iter == map.end());
-
-    map[key] = op;
+    auto& lru_cache = MklPrimitiveFactory<T>::GetLRUCache();
+    lru_cache.SetOp(key, op);
   }
 
   /// Function to decide whether HW has AVX512 or AVX2
@@ -2104,9 +2199,10 @@ class MklPrimitiveFactory {
   }
 
  private:
-  static inline std::unordered_map<string, MklPrimitive*>& GetHashMap() {
-    static thread_local std::unordered_map<string, MklPrimitive*> map_;
-    return map_;
+  static inline LRUCache<MklPrimitive>& GetLRUCache() {
+    static const int kCapacity = 1024;  // cache capacity
+    static thread_local LRUCache<MklPrimitive> lru_cache_(kCapacity);
+    return lru_cache_;
   }
 };
 
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 4f837f105d2c4fc12a366f52a1db72ce376b79f6..bed6febe377b6109328254c72446eb913be330a4 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -84,6 +84,40 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
+TEST(MklUtilTest, LRUCacheTest) {
+  // The cached objects are of type int*
+  size_t capacity = 100;
+  size_t num_objects = capacity + 10;
+  LRUCache<int> lru_cache(capacity);
+
+  // Test SetOp: be able to set more ops than the capacity
+  for (int k = 0; k < num_objects; k++) {
+    lru_cache.SetOp(std::to_string(k), new int(k));
+  }
+
+  // Test GetOp and capacity:
+  // Least recently accessed objects should not be in cache any more.
+  for (int k = 0; k < num_objects - capacity; ++k) {
+    EXPECT_EQ(nullptr, lru_cache.GetOp(std::to_string(k)));
+  }
+
+  // Test GetOp and capacity:
+  // Most recently accessed objects should still be in cache.
+  for (int k = num_objects - capacity; k < num_objects; ++k) {
+    int* int_ptr = lru_cache.GetOp(std::to_string(k));
+    EXPECT_NE(nullptr, int_ptr);
+    EXPECT_EQ(*int_ptr, k);
+  }
+
+  // Clean up the cache
+  lru_cache.Clear();
+
+  // After clean up, there should be no cached object.
+  for (int k = 0; k < num_objects; ++k) {
+    EXPECT_EQ(nullptr, lru_cache.GetOp(std::to_string(k)));
+  }
+}
+
 #endif  // INTEL_MKL_ML_ONLY
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index b990f0a74918454fcdf8dff44006ef2e6a5602e1..890bd837025da1408e77b4050990c7558767b3b0 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -70,6 +70,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:platform_base",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/time",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
index 271c85efd88de0f8acbedb3d2254af3397601c6b..c3797f1a8a8c4690a8718dd7c4145827b413f627 100644
--- a/tensorflow/core/util/proto/descriptors.cc
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -25,7 +25,7 @@ namespace {
 // Build a `DescriptorPool` from the named file or URI. The file or URI
 // must be available to the current TensorFlow environment.
 //
-// The file must contiain a serialized `FileDescriptorSet`. See
+// The file must contain a serialized `FileDescriptorSet`. See
 // `GetDescriptorPool()` for more information.
 Status GetDescriptorPoolFromFile(
     tensorflow::Env* env, const string& filename,
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index 9451e317a13dec9b0c96096d9a7144263efc600f..ba45f8a5b0e7fb508504d0ae30ecdde8c8d45dc9 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
+#include "google/protobuf/duration.pb.h"
 #include "absl/strings/string_view.h"
+#include "absl/time/time.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,6 +60,20 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
   const int index_offset_;
 };
 
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
 }  // namespace proto_utils
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/sparse/dim_comparator.h b/tensorflow/core/util/sparse/dim_comparator.h
index 0782e7e1a8af19a7936bde267c0905dc5f7d00e7..498df7a021df3e65557d96dc25577e9e24e911a6 100644
--- a/tensorflow/core/util/sparse/dim_comparator.h
+++ b/tensorflow/core/util/sparse/dim_comparator.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_SPARSE_DIM_COMPARATOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 89c163aa5133fafc23b01c7153ac40d32efcaaf6..4e53c59ba364cc1daf7d8db7cd0529986a8e3094 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -63,7 +63,7 @@ class SparseTensor {
                                     ix.shape().dim_size(0), ", values = ",
                                     vals.shape().dim_size(0), ")"));
     }
-    int dims;
+    int dims = 0;
     TF_RETURN_IF_ERROR(GetDimsFromIx(ix, &dims));
     if (order.size() != dims) {
       return Status(error::INVALID_ARGUMENT,
diff --git a/tensorflow/core/util/stat_summarizer_test.cc b/tensorflow/core/util/stat_summarizer_test.cc
index 1feedf99cbeab2a059a7f876352ad96b320bc988..4553559e7a9850a2e0b4e658fc1df652ca275e32 100644
--- a/tensorflow/core/util/stat_summarizer_test.cc
+++ b/tensorflow/core/util/stat_summarizer_test.cc
@@ -70,7 +70,6 @@ versions {
   TF_ASSERT_OK(session->Run(run_options, {}, {"myconstant:0"}, {}, &outputs,
                             &run_metadata));
 
-  StatSummarizerOptions opts;
   StatSummarizer stats(graph_def);
   stats.ProcessStepStats(run_metadata.step_stats());
 
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index 55688e580848e42bdd453a270a530a5423fb3aec..0df810abd0058facd12e2e67625d80b824dc257b 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/util/strided_slice_op.h"
 
 #include <array>
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -272,7 +272,7 @@ Status ValidateStridedSliceOp(
     const std::array<int64, 2> valid_range = {
         {stride_i > 0 ? 0 : -1, stride_i > 0 ? dim_i : dim_i - 1}};
 
-    auto canonical = [stride_i, i, dim_i, masks, valid_range](int64 x, int c) {
+    auto canonical = [stride_i, dim_i, masks, valid_range](int64 x, int c) {
       if (masks[c]) {
         return stride_i > 0 ? valid_range[c] : valid_range[(c + 1) & 1];
       } else {
diff --git a/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc b/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc
index 65b50bd3ae9be960283f6cdfbe7dca296e5c489b..721da2a0bdb96dcbce83ed0b2bc5f0d5094e2b8a 100644
--- a/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc
+++ b/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 __global__ void AddOneKernel(const int* in, const int N, int* out) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
@@ -25,7 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
 }
 
 void AddOneKernelLauncher(const int* in, const int N, int* out) {
-  AddOneKernel<<<32, 256>>>(in, N, out);
+  TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
+                                             in, N, out));
 }
 
 #endif
diff --git a/tensorflow/examples/adding_an_op/cuda_op_test.py b/tensorflow/examples/adding_an_op/cuda_op_test.py
index a9aaa81e3fab46f2263bf4d292c1522cb5afe246..5c7e563cf4c3e7c7613516868d5a5150cfe83530 100644
--- a/tensorflow/examples/adding_an_op/cuda_op_test.py
+++ b/tensorflow/examples/adding_an_op/cuda_op_test.py
@@ -26,9 +26,8 @@ class AddOneTest(tf.test.TestCase):
 
   def test(self):
     if tf.test.is_built_with_cuda():
-      with self.cached_session():
-        result = cuda_op.add_one([5, 4, 3, 2, 1])
-        self.assertAllEqual(result.eval(), [6, 5, 4, 3, 2])
+      result = cuda_op.add_one([5, 4, 3, 2, 1])
+      self.assertAllEqual(result, [6, 5, 4, 3, 2])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml
index 5c47ce6b673e4c9d635b867c1ccdc679f67c6ae5..a3b53da6a35c2f54bd6cb019e949d9f4ba7f3fc9 100644
--- a/tensorflow/examples/android/AndroidManifest.xml
+++ b/tensorflow/examples/android/AndroidManifest.xml
@@ -24,10 +24,6 @@
     <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
     <uses-permission android:name="android.permission.RECORD_AUDIO" />
 
-    <uses-sdk
-        android:minSdkVersion="21"
-        android:targetSdkVersion="23" />
-
     <application android:allowBackup="true"
         android:debuggable="true"
         android:label="@string/app_name"
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index f5f0d7c3c852390ead414bf37260e531119e100b..5f99f0a9c90122ebe194b734af4950c0241156cd 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -37,8 +37,7 @@ cc_binary(
         "-lm",
         "-z defs",
         "-s",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ],
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 0767726aa9a248fb073fbd4114f47d1b4ed6901b..f771530eb9d68a7316748458fe6863c2544726fe 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -25,10 +25,11 @@ getProject().setBuildDir('gradleBuild')
 buildscript {
     repositories {
         jcenter()
+        google()
     }
 
     dependencies {
-        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'com.android.tools.build:gradle:3.3.1'
         classpath 'org.apache.httpcomponents:httpclient:4.5.4'
     }
 }
@@ -36,6 +37,7 @@ buildscript {
 allprojects {
     repositories {
         jcenter()
+        google()
     }
 }
 
@@ -73,11 +75,14 @@ def bazelLocation = '/usr/local/bin/bazel'
 project.ext.ASSET_DIR = projectDir.toString() + '/assets'
 project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 
+// Download default models; if you wish to use your own models then
+// place them in the "assets" directory and comment out this line.
+apply from: "download-models.gradle"
+
 apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
-    buildToolsVersion '26.0.2'
 
     if (nativeBuildSystem == 'cmake') {
         defaultConfig {
@@ -135,6 +140,10 @@ android {
         debug.setRoot('build-types/debug')
         release.setRoot('build-types/release')
     }
+    defaultConfig {
+        targetSdkVersion 23
+        minSdkVersion 21
+    }
 }
 
 task buildNativeBazel(type: Exec) {
@@ -182,13 +191,8 @@ tasks.whenTaskAdded { task ->
     }
 }
 
-// Download default models; if you wish to use your own models then
-// place them in the "assets" directory and comment out this line.
-apply from: "download-models.gradle"
-
-
 dependencies {
     if (nativeBuildSystem == 'cmake' || nativeBuildSystem == 'none') {
-        compile 'org.tensorflow:tensorflow-android:+'
+        implementation 'org.tensorflow:tensorflow-android:+'
     }
 }
diff --git a/tensorflow/examples/autograph/integration_tests/BUILD b/tensorflow/examples/autograph/integration_tests/BUILD
deleted file mode 100644
index 2a4a0f75e7a120d554c882025ad2a0e280913a6d..0000000000000000000000000000000000000000
--- a/tensorflow/examples/autograph/integration_tests/BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-py_test(
-    name = "keras_test",
-    srcs = [
-        "keras_test.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "list_literals_test",
-    srcs = [
-        "list_literals_test.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
deleted file mode 100644
index 3fe33df920d008845bfd1002075fd6b5dc25b31f..0000000000000000000000000000000000000000
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras integration tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python import autograph
-from tensorflow.python.framework import test_util
-
-
-class MinimalKeras(tf.keras.Model):
-
-  def call(self, x):
-    return x * 3
-
-
-class ModelWithStaticConditional(object):
-
-  def __init__(self, initial):
-    self.initial = initial
-    if self.initial:
-      self.h = 15
-
-  @autograph.convert()
-  def call(self):
-    x = 10
-    if self.initial:
-      x += self.h
-    return x
-
-
-class BasicBlock(tf.keras.Model):
-
-  def __init__(self):
-    super(BasicBlock, self).__init__()
-    self.conv1 = tf.keras.layers.Conv2D(8, 3)
-    self.pool = tf.keras.layers.GlobalAveragePooling2D()
-    self.dense = tf.keras.layers.Dense(3)
-
-  def call(self, x):
-    x = self.conv1(x)
-    x = self.pool(x)
-    x = self.dense(x)
-    return x
-
-
-class CompoundModel(tf.keras.Model):
-
-  def __init__(self):
-    super(CompoundModel, self).__init__()
-    self.block = BasicBlock()
-
-  @autograph.convert(recursive=True)
-  def call(self, x):
-    x = self.block(x)  # pylint: disable=not-callable
-    return x
-
-
-class KerasTest(tf.test.TestCase):
-
-  def test_basic(self):
-    MinimalKeras()
-
-  def test_conditional_attributes_False(self):
-    model = ModelWithStaticConditional(False)
-    self.assertEqual(model.call(), 10)
-
-  def test_conditional_attributes_True(self):
-    model = ModelWithStaticConditional(True)
-    self.assertEqual(model.call(), 25)
-
-  @test_util.run_deprecated_v1
-  def test_recursive_true(self):
-    with self.assertRaisesRegexp(NotImplementedError,
-                                 'Object conversion is not yet supported.'):
-      with tf.Graph().as_default():
-        model = CompoundModel()
-        model.build(tf.TensorShape((None, 10, 10, 1)))
-        init = tf.global_variables_initializer()
-
-        with tf.Session() as sess:
-          self.evaluate(init)
-          sample_input = tf.random_uniform((1, 10, 10, 1))
-          output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(self.evaluate(output).shape, (1, 3))
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/examples/get_started/regression/BUILD b/tensorflow/examples/get_started/regression/BUILD
index bee94d7d90fb3f70107a5dd9e9223f3013402073..938e1faafea524cb5d8449688ac2548bccd1f8e4 100644
--- a/tensorflow/examples/get_started/regression/BUILD
+++ b/tensorflow/examples/get_started/regression/BUILD
@@ -9,7 +9,6 @@ py_test(
         "custom_regression.py",
         "dnn_regression.py",
         "imports85.py",
-        "linear_regression.py",
         "linear_regression_categorical.py",
         "test.py",
     ],
diff --git a/tensorflow/examples/get_started/regression/test.py b/tensorflow/examples/get_started/regression/test.py
index bb4db6700b8c1fca36aa51f72681112d13e8ef0e..1c37e4a671bc325ced27e030ace6a98fc1bdd59e 100644
--- a/tensorflow/examples/get_started/regression/test.py
+++ b/tensorflow/examples/get_started/regression/test.py
@@ -32,7 +32,6 @@ sys.modules["imports85"] = imports85
 import tensorflow.data as data
 
 import tensorflow.examples.get_started.regression.dnn_regression as dnn_regression
-import tensorflow.examples.get_started.regression.linear_regression as linear_regression
 import tensorflow.examples.get_started.regression.linear_regression_categorical as linear_regression_categorical
 import tensorflow.examples.get_started.regression.custom_regression as custom_regression
 
@@ -46,7 +45,8 @@ FOUR_LINES = "\n".join([
     "1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.50,171.20,65.50,52.40,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500",
     "2,164,audi,gas,std,four,sedan,fwd,front,99.80,176.60,66.20,54.30,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950",
     "2,164,audi,gas,std,four,sedan,4wd,front,99.40,176.60,66.40,54.30,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450",
-    "2,?,audi,gas,std,two,sedan,fwd,front,99.80,177.30,66.30,53.10,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250",])
+    "2,?,audi,gas,std,two,sedan,fwd,front,99.80,177.30,66.30,53.10,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250",
+])
 
 # pylint: enable=line-too-long
 
@@ -54,8 +54,8 @@ FOUR_LINES = "\n".join([
 def four_lines_dataframe():
   text = StringIO(FOUR_LINES)
 
-  return pd.read_csv(text, names=imports85.types.keys(),
-                     dtype=imports85.types, na_values="?")
+  return pd.read_csv(
+      text, names=imports85.types.keys(), dtype=imports85.types, na_values="?")
 
 
 def four_lines_dataset(*args, **kwargs):
@@ -66,22 +66,13 @@ def four_lines_dataset(*args, **kwargs):
 class RegressionTest(googletest.TestCase):
   """Test the regression examples in this directory."""
 
-  @test.mock.patch.dict(data.__dict__,
-                        {"TextLineDataset": four_lines_dataset})
-  @test.mock.patch.dict(imports85.__dict__, {"_get_imports85": (lambda: None)})
-  @test.mock.patch.dict(linear_regression.__dict__, {"STEPS": 1})
-  def test_linear_regression(self):
-    linear_regression.main([""])
-
-  @test.mock.patch.dict(data.__dict__,
-                        {"TextLineDataset": four_lines_dataset})
+  @test.mock.patch.dict(data.__dict__, {"TextLineDataset": four_lines_dataset})
   @test.mock.patch.dict(imports85.__dict__, {"_get_imports85": (lambda: None)})
   @test.mock.patch.dict(linear_regression_categorical.__dict__, {"STEPS": 1})
   def test_linear_regression_categorical(self):
     linear_regression_categorical.main([""])
 
-  @test.mock.patch.dict(data.__dict__,
-                        {"TextLineDataset": four_lines_dataset})
+  @test.mock.patch.dict(data.__dict__, {"TextLineDataset": four_lines_dataset})
   @test.mock.patch.dict(imports85.__dict__, {"_get_imports85": (lambda: None)})
   @test.mock.patch.dict(dnn_regression.__dict__, {"STEPS": 1})
   def test_dnn_regression(self):
diff --git a/tensorflow/examples/ios/benchmark/ios_image_load.h b/tensorflow/examples/ios/benchmark/ios_image_load.h
index 3f94984692341b2d7ae975597ecdd1893486afb4..22ee785dc341bf117d458eef2f0e7dd5e170181b 100644
--- a/tensorflow/examples/ios/benchmark/ios_image_load.h
+++ b/tensorflow/examples/ios/benchmark/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
                                                  int* out_width,
diff --git a/tensorflow/examples/ios/camera/CameraExampleViewController.h b/tensorflow/examples/ios/camera/CameraExampleViewController.h
index 0aefbc6eedb0f140f7c162512cf60027bbec7501..277b6e272dc34b429021abba4a3e2381a2459060 100644
--- a/tensorflow/examples/ios/camera/CameraExampleViewController.h
+++ b/tensorflow/examples/ios/camera/CameraExampleViewController.h
@@ -16,8 +16,8 @@
 #import <UIKit/UIKit.h>
 
 #include <memory>
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 
 @interface CameraExampleViewController
     : UIViewController<UIGestureRecognizerDelegate,
diff --git a/tensorflow/examples/ios/camera/ios_image_load.h b/tensorflow/examples/ios/camera/ios_image_load.h
index f10b0b983a957bd52d5bd6dc0841d899a3196beb..991568751e9bb6acdaaf1da3f217438392575aa2 100644
--- a/tensorflow/examples/ios/camera/ios_image_load.h
+++ b/tensorflow/examples/ios/camera/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/ios/camera/tensorflow_utils.h b/tensorflow/examples/ios/camera/tensorflow_utils.h
index 78bdb82aae63d14835b99021ed6686b50777577b..33e95b185c74a7fb026ebf1495dca98a12d4e2ae 100644
--- a/tensorflow/examples/ios/camera/tensorflow_utils.h
+++ b/tensorflow/examples/ios/camera/tensorflow_utils.h
@@ -18,8 +18,8 @@
 #include <memory>
 #include <vector>
 
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 // Reads a serialized GraphDef protobuf file from the bundle, typically
diff --git a/tensorflow/examples/ios/simple/ios_image_load.h b/tensorflow/examples/ios/simple/ios_image_load.h
index 0e0b771118b9eb5b33dcf7b9bea1a33b4873ac6d..2d2ee78e991e42fa1e21ae697c2c76606fc7639c 100644
--- a/tensorflow/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/examples/ios/simple/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index 96ea525a4e74c68da17d0310f0ad475789314215..82552a7174072ab09143c0c23e58434d54ee5705 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <setjmp.h>
 #include <stdio.h>
 #include <string.h>
+#include <cmath>
 #include <fstream>
 #include <vector>
 
@@ -228,7 +229,9 @@ void DecodeLocation(const float* encoded_location, const float* box_priors,
   }
 }
 
-float DecodeScore(float encoded_score) { return 1 / (1 + exp(-encoded_score)); }
+float DecodeScore(float encoded_score) {
+  return 1 / (1 + std::exp(-encoded_score));
+}
 
 void DrawBox(const int image_width, const int image_height, int left, int top,
              int right, int bottom, tensorflow::TTypes<uint8>::Flat* image) {
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dc0126aa6c3c24baeb4438e7c7b13ff2ee238076
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -0,0 +1,142 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_binary(
+    name = "export_text_rnn_model",
+    srcs = ["export_text_rnn_model.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_text_rnn_model",
+    srcs = ["use_text_rnn_model.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_rnn_cell",
+    srcs = ["export_rnn_cell.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_rnn_cell",
+    srcs = ["use_rnn_cell.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_simple_text_embedding",
+    srcs = ["export_simple_text_embedding.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_model_in_sequential_keras",
+    srcs = ["use_model_in_sequential_keras.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "mnist_util",
+    srcs = ["mnist_util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_mnist_cnn",
+    srcs = ["export_mnist_cnn.py"],
+    deps = [
+        ":mnist_util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_mnist_cnn",
+    srcs = ["use_mnist_cnn.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":mnist_util",
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+# NOTE: Split SavedModelTest due to Forge input size limit.
+
+py_test(
+    name = "saved_model_part1_test",
+    srcs = [
+        "saved_model_part1_test.py",
+    ],
+    data = [
+        ":export_rnn_cell",
+        ":export_simple_text_embedding",
+        ":export_text_rnn_model",
+        ":use_model_in_sequential_keras",
+        ":use_rnn_cell",
+        ":use_text_rnn_model",
+    ],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_cuda_on_cpu_tap",  # forge input size exceeded
+        "noasan",  # forge input size exceeded
+        "nomsan",  # forge input size exceeded
+        "notsan",  # forge input size exceeded
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "saved_model_part2_test",
+    srcs = [
+        "saved_model_part2_test.py",
+    ],
+    data = [
+        ":export_mnist_cnn",
+        ":use_mnist_cnn",
+    ],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # forge input size exceeded
+        "nomsan",  # forge input size exceeded
+        "notsan",  # forge input size exceeded
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c1a356e661dd1917414cb3846db8ab994d8f843
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
@@ -0,0 +1,189 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports a convolutional feature extractor for MNIST in SavedModel format.
+
+The feature extractor is a convolutional neural network plus a hidden layer
+that gets trained as part of an MNIST classifier and then written to a
+SavedModel (without the classification layer). From there, use_mnist_cnn.py
+picks it up for transfer learning.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow.compat.v2 as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 10,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+flags.DEFINE_bool(
+    'export_print_hparams', False,
+    'If true, the exported function will print its effective hparams.')
+
+
+def make_feature_extractor(l2_strength, dropout_rate):
+  """Returns a Keras Model to compute a feature vector from MNIST images."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', name='conv1',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', name='conv2',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), name='pool1')(net)
+  net = tf.keras.layers.Dropout(dropout_rate, name='dropout1')(net)
+  net = tf.keras.layers.Flatten(name='flatten')(net)
+  net = tf.keras.layers.Dense(10, activation='relu', name='dense1',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def set_feature_extractor_hparams(model, dropout_rate):
+  model.get_layer('dropout1').rate = dropout_rate
+
+
+def make_classifier(feature_extractor, l2_strength, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def wrap_keras_model_for_export(model, batch_input_shape,
+                                set_hparams, default_hparams):
+  """Wraps `model` for saving and loading as SavedModel."""
+  if default_hparams is None: default_hparams = {}
+  hparam_keys = list(default_hparams.keys())
+  hparam_defaults = tuple(default_hparams.values())
+  # The goal is to save a function with this argspec...
+  argspec = tf_inspect.FullArgSpec(
+      args=(['inputs', 'training'] + hparam_keys),
+      defaults=((False,) + hparam_defaults),
+      varargs=None, varkw=None,
+      kwonlyargs=[], kwonlydefaults=None,
+      annotations={})
+  # ...and this behavior:
+  def call_fn(inputs, training, *args):
+    if FLAGS.export_print_hparams:
+      args = [tf.keras.backend.print_tensor(args[i], 'training=%s and %s='
+                                            % (training, hparam_keys[i]))
+              for i in range(len(args))]
+    kwargs = dict(zip(hparam_keys, args))
+    if kwargs: set_hparams(model, **kwargs)
+    return model(inputs, training=training)
+  # We cannot spell out `args` in def statement for call_fn, but since
+  # tf.function uses tf_inspect, we can use tf_decorator to wrap it with
+  # the desired argspec.
+  def wrapped(*args, **kwargs):  # TODO(arnoegw): Can we use call_fn itself?
+    return call_fn(*args, **kwargs)
+  traced_call_fn = tf.function(autograph=False)(
+      tf_decorator.make_decorator(call_fn, wrapped, decorator_argspec=argspec))
+  # Now we need to trigger traces for
+  # - training set to Python values True or False (hence two traces),
+  # - tensor inputs of the expected nesting, shape and dtype,
+  # - tensor-valued kwargs for hparams, with caller-side defaults.
+  # Tracing with partially determined shapes requires an input signature,
+  # so we initiate tracing from a helper function with only tensor inputs.
+  @tf.function(autograph=False)
+  def trigger_traces(inputs, **kwargs):
+    return tuple(traced_call_fn(inputs, training=training, **kwargs)
+                 for training in (True, False))
+  inputs_spec = tf.TensorSpec(shape=batch_input_shape, dtype=tf.float32)
+  hparams_spec = {name: tf.TensorSpec.from_tensor(tf.constant(value))
+                  for name, value in default_hparams.items()}
+  _ = trigger_traces.get_concrete_function(inputs_spec, **hparams_spec)
+
+  # Assemble the output object.
+  obj = tf.train.Checkpoint()
+  obj.__call__ = traced_call_fn
+  obj.trainable_variables = model.trainable_variables
+  obj.variables = model.trainable_variables + model.non_trainable_variables
+  obj.regularization_losses = [_get_traced_loss(model, i)
+                               for i in range(len(model.losses))]
+  return obj
+
+
+def _get_traced_loss(model, i):
+  """Returns tf.function for model.losses[i] with a trace for zero args.
+
+  The intended usage is
+    [_get_traced_loss(model, i) for i in range(len(model.losses))]
+  This is better than
+    [tf.function(lambda: model.losses[i], input_signature=[]) for i ...]
+  because it avoids capturing a loop index in a lambda, and removes any
+  chance of deferring the trace.
+
+  Args:
+    model: a Keras Model.
+    i: an integer between from 0 up to but to len(model.losses).
+  """
+  f = tf.function(lambda: model.losses[i])
+  _ = f.get_concrete_function()
+  return f
+
+
+def main(argv):
+  del argv
+
+  # Build a complete classifier model using a feature extractor.
+  default_hparams = dict(dropout_rate=0.25)
+  l2_strength = 0.01  # Not a hparam for inputs -> outputs.
+  feature_extractor = make_feature_extractor(l2_strength=l2_strength,
+                                             **default_hparams)
+  classifier = make_classifier(feature_extractor, l2_strength=l2_strength)
+
+  # Train the complete model.
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      fake_tiny_data=FLAGS.fast_test_mode)
+  classifier.compile(loss=tf.keras.losses.categorical_crossentropy,
+                     optimizer=tf.keras.optimizers.SGD(),
+                     metrics=['accuracy'])
+  classifier.fit(x_train, y_train,
+                 batch_size=128,
+                 epochs=FLAGS.epochs,
+                 verbose=1,
+                 validation_data=(x_test, y_test))
+
+  # Save the feature extractor to a framework-agnostic SavedModel for reuse.
+  # Note that the feature_extractor object has not been compiled or fitted,
+  # so it does not contain an optimizer and related state.
+  exportable = wrap_keras_model_for_export(feature_extractor,
+                                           (None,) + mnist_util.INPUT_SHAPE,
+                                           set_feature_extractor_hparams,
+                                           default_hparams)
+  tf.saved_model.save(exportable, FLAGS.export_dir)
+
+
+if __name__ == '__main__':
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..bac1d4c35a19dd9d069b00060956b7b42b5ba97a
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
@@ -0,0 +1,64 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Export an RNN cell in SavedModel format."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+
+import tensorflow.compat.v2 as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def main(argv):
+  del argv
+
+  root = tf.train.Checkpoint()
+  # Create a cell and attach to our trackable.
+  root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None)
+
+  # Wrap the rnn_cell.__call__ function and assign to next_state.
+  root.next_state = tf.function(root.rnn_cell.__call__, autograph=False)
+
+  # Wrap the rnn_cell.get_initial_function using a decorator and assign to an
+  # attribute with the same name.
+  @tf.function(input_signature=[tf.TensorSpec([None, None], tf.float32)])
+  def get_initial_state(tensor):
+    return root.rnn_cell.get_initial_state(tensor, None, None)
+
+  root.get_initial_state = get_initial_state
+
+  # Construct an initial_state, then call next_state explicitly to trigger a
+  # trace for serialization (we need an explicit call, because next_state has
+  # not been annotated with an input_signature).
+  initial_state = root.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+  root.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+  tf.saved_model.save(root, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4caf46c73e4ee151d7e09ba1b3ccfec820f1614
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
@@ -0,0 +1,106 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text embedding model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+from absl import app
+from absl import flags
+
+import tensorflow.compat.v2 as tf
+
+# TODO(vbardiovsky): remove these when symbols are public.
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.training.tracking import tracking
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def write_vocabulary_file(vocabulary):
+  """Write temporary vocab file for module construction."""
+  tmpdir = tempfile.mkdtemp()
+  vocabulary_file = os.path.join(tmpdir, "tokens.txt")
+  with tf.io.gfile.GFile(vocabulary_file, "w") as f:
+    for entry in vocabulary:
+      f.write(entry + "\n")
+  return vocabulary_file
+
+
+class TextEmbeddingModel(tf.train.Checkpoint):
+  """Text embedding model.
+
+  A text embeddings model that takes a sentences on input and outputs the
+  sentence embedding.
+  """
+
+  def __init__(self, vocabulary, emb_dim, oov_buckets):
+    super(TextEmbeddingModel, self).__init__()
+    self._oov_buckets = oov_buckets
+    self._vocabulary_file = tracking.TrackableAsset(
+        write_vocabulary_file(vocabulary))
+    self._total_size = len(vocabulary) + oov_buckets
+    self._table = lookup_ops.index_table_from_file(
+        vocabulary_file=self._vocabulary_file,
+        num_oov_buckets=self._oov_buckets,
+        hasher_spec=lookup_ops.FastHashSpec)
+    self.embeddings = tf.Variable(
+        tf.random.uniform(shape=[self._total_size, emb_dim]))
+    self.variables = [self.embeddings]
+    self.trainable_variables = self.variables
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    normalized_sentences = tf.reshape(normalized_sentences, [-1])
+    sparse_tokens = tf.strings.split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+    sparse_token_ids = self._table.lookup(sparse_tokens.values)
+
+    return (sparse_tokens.indices, sparse_token_ids, sparse_tokens.dense_shape)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def __call__(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+
+    return tf.nn.safe_embedding_lookup_sparse(
+        embedding_weights=self.embeddings,
+        sparse_ids=tf.SparseTensor(token_ids, token_values, token_dense_shape),
+        sparse_weights=None,
+        combiner="sqrtn")
+
+
+def main(argv):
+  del argv
+
+  vocabulary = ["cat", "is", "on", "the", "mat"]
+  module = TextEmbeddingModel(vocabulary=vocabulary, emb_dim=10, oov_buckets=10)
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..240f441939ef67ea1833b9379b65610558612fd3
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
@@ -0,0 +1,194 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import tensorflow.compat.v2 as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+class TextRnnModel(tf.train.Checkpoint):
+  """Text RNN model.
+
+  A full generative text RNN model that can train and decode sentences from a
+  starting word.
+  """
+
+  def __init__(self, vocab, emb_dim, buckets, state_size):
+    super(TextRnnModel, self).__init__()
+    self._buckets = buckets
+    self._lstm_cell = tf.keras.layers.LSTMCell(units=state_size)
+    self._rnn_layer = tf.keras.layers.RNN(
+        self._lstm_cell, return_sequences=True)
+    self._embeddings = tf.Variable(tf.random.uniform(shape=[buckets, emb_dim]))
+    self._logit_layer = tf.keras.layers.Dense(buckets)
+    self._set_up_vocab(vocab)
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    sparse_tokens = tf.strings.split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+
+    return (sparse_tokens.indices, sparse_tokens.values,
+            sparse_tokens.dense_shape)
+
+  def _set_up_vocab(self, vocab_tokens):
+    # TODO(vbardiovsky): Currently there is no real vocabulary, because
+    # saved_model serialization does not support trackable resources. Add a real
+    # vocabulary when it does.
+    vocab_list = ["UNK"] * self._buckets
+    for vocab_token in vocab_tokens:
+      index = self._words_to_indices(vocab_token).numpy()
+      vocab_list[index] = vocab_token
+    # This is a variable representing an inverse index.
+    self._vocab_tensor = tf.Variable(vocab_list)
+
+  def _indices_to_words(self, indices):
+    return tf.gather(self._vocab_tensor, indices)
+
+  def _words_to_indices(self, words):
+    return tf.strings.to_hash_bucket(words, self._buckets)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def train(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+    tokens_sparse = tf.sparse.SparseTensor(
+        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
+    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")
+
+    sparse_lookup_ids = tf.sparse.SparseTensor(
+        indices=tokens_sparse.indices,
+        values=self._words_to_indices(tokens_sparse.values),
+        dense_shape=tokens_sparse.dense_shape)
+    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)
+
+    # Targets are the next word for each word of the sentence.
+    tokens_ids_seq = lookup_ids[:, 0:-1]
+    tokens_ids_target = lookup_ids[:, 1:]
+
+    tokens_prefix = tokens[:, 0:-1]
+
+    # Mask determining which positions we care about for a loss: all positions
+    # that have a valid non-terminal token.
+    mask = tf.logical_and(
+        tf.logical_not(tf.equal(tokens_prefix, "")),
+        tf.logical_not(tf.equal(tokens_prefix, "<E>")))
+
+    input_mask = tf.cast(mask, tf.int32)
+
+    with tf.GradientTape() as t:
+      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
+                                                   tokens_ids_seq)
+
+      lstm_initial_state = self._lstm_cell.get_initial_state(
+          sentence_embeddings)
+
+      lstm_output = self._rnn_layer(
+          inputs=sentence_embeddings, initial_state=lstm_initial_state)
+
+      # Stack LSTM outputs into a batch instead of a 2D array.
+      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])
+
+      logits = self._logit_layer(lstm_output)
+
+      targets = tf.reshape(tokens_ids_target, [-1])
+      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)
+
+      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=targets, logits=logits)
+
+      # Final loss is the mean loss for all token losses.
+      final_loss = tf.math.divide(
+          tf.reduce_sum(tf.multiply(losses, weights)),
+          tf.reduce_sum(weights),
+          name="final_loss")
+
+    watched = t.watched_variables()
+    gradients = t.gradient(final_loss, watched)
+
+    for w, g in zip(watched, gradients):
+      w.assign_sub(g)
+
+    return final_loss
+
+  @tf.function
+  def decode_greedy(self, sequence_length, first_word):
+    initial_state = self._lstm_cell.get_initial_state(
+        dtype=tf.float32, batch_size=1)
+
+    sequence = [first_word]
+    current_word = first_word
+    current_id = tf.expand_dims(self._words_to_indices(current_word), 0)
+    current_state = initial_state
+
+    for _ in range(sequence_length):
+      token_embeddings = tf.nn.embedding_lookup(self._embeddings, current_id)
+      lstm_outputs, current_state = self._lstm_cell(token_embeddings,
+                                                    current_state)
+      lstm_outputs = tf.reshape(lstm_outputs, [-1, self._lstm_cell.output_size])
+      logits = self._logit_layer(lstm_outputs)
+      softmax = tf.nn.softmax(logits)
+
+      next_ids = tf.math.argmax(softmax, axis=1)
+      next_words = self._indices_to_words(next_ids)[0]
+
+      current_id = next_ids
+      current_word = next_words
+      sequence.append(current_word)
+
+    return sequence
+
+
+def main(argv):
+  del argv
+
+  sentences = ["<S> hello there <E>", "<S> how are you doing today <E>"]
+  vocab = [
+      "<S>", "<E>", "hello", "there", "how", "are", "you", "doing", "today"
+  ]
+
+  module = TextRnnModel(vocab=vocab, emb_dim=10, buckets=100, state_size=128)
+
+  for _ in range(100):
+    _ = module.train(tf.constant(sentences))
+
+  # We have to call this function explicitly if we want it exported, because it
+  # has no input_signature in the @tf.function decorator.
+  decoded = module.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/mnist_util.py b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94adf29355607ff50bb7dbdb47dafb234d642c6
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience wrapper around Keras' MNIST and Fashion MNIST data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+INPUT_SHAPE = (28, 28, 1)
+NUM_CLASSES = 10
+
+
+def load_reshaped_data(use_fashion_mnist=False, fake_tiny_data=False):
+  """Returns MNIST or Fashion MNIST train and test data."""
+  if fake_tiny_data:
+    num_fakes = 10
+    x_train = x_test = np.zeros((num_fakes, 28, 28), dtype=np.uint8)
+    y_train = y_test = np.zeros((num_fakes,), dtype=np.int64)
+  else:
+    mnist = (tf.keras.datasets.fashion_mnist if use_fashion_mnist else
+             tf.keras.datasets.mnist)
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+  return ((_prepare_image(x_train), _prepare_label(y_train)),
+          (_prepare_image(x_test), _prepare_label(y_test)))
+
+
+def _prepare_image(x):
+  """Converts images to [n,h,w,c] format in range [0,1]."""
+  return x[..., None].astype('float32') / 255.
+
+
+def _prepare_label(y):
+  """Conerts labels to one-hot encoding."""
+  return tf.keras.utils.to_categorical(y, NUM_CLASSES)
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f144431d6f3a881ce6903abbb96c7bcb2130b8
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py
@@ -0,0 +1,75 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+class SavedModelPart1Test(tf.test.TestCase):
+
+  def assertCommandSucceeded(self, binary, **flags):
+    command_parts = [binary]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+
+    logging.info("Running: %s" % command_parts)
+    subprocess.check_call(
+        command_parts, env=dict(os.environ, TF2_BEHAVIOR="enabled"))
+
+  @test_util.run_v2_only
+  def test_text_rnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_text_rnn_model")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_text_rnn_model")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_rnn_cell(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_rnn_cell")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_rnn_cell")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_text_embedding_in_sequential_keras(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_simple_text_embedding")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile(
+        "use_model_in_sequential_keras")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e357755414ca633e8f880a3a7e6c64e9e7047c78
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py
@@ -0,0 +1,56 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel integration test for MNIST."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+class SavedModelPart2Test(tf.test.TestCase):
+
+  def assertCommandSucceeded(self, binary, **flags):
+    command_parts = [binary]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+
+    logging.info("Running: %s", command_parts)
+    subprocess.check_call(
+        command_parts, env=dict(os.environ, TF2_BEHAVIOR="enabled"))
+
+  @test_util.run_v2_only
+  def test_mnist_cnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile("export_mnist_cnn")
+    self.assertCommandSucceeded(
+        export_binary, export_dir=export_dir, fast_test_mode="true")
+
+    use_binary = resource_loader.get_path_to_datafile("use_mnist_cnn")
+    self.assertCommandSucceeded(
+        use_binary, export_dir=export_dir, fast_test_mode="true")
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bfc160a794ff7bdc1114334fc9389af2df32109
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
@@ -0,0 +1,119 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Imports a convolutional feature extractor for MNIST in SavedModel format.
+
+This program picks up the SavedModel written by export_mnist_cnn.py and
+uses the feature extractor contained in it to classification on either
+classic MNIST (digits) or Fashion MNIST (thumbnails of apparel). Optionally,
+it trains the feature extractor further as part of the new classifier.
+As expected, that makes training slower but does not help much for the
+original training dataset but helps a lot for transfer to the other dataset.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow.compat.v2 as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.examples.saved_model.integration_tests import util
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 5,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'retrain', False,
+    'If set, the imported SavedModel is trained further.')
+flags.DEFINE_float(
+    'dropout_rate', None,
+    'If set, dropout rate passed to the SavedModel.')
+flags.DEFINE_float(
+    'regularization_loss_multiplier', None,
+    'If set, multiplier for the regularization losses in the SavedModel.')
+flags.DEFINE_bool(
+    'use_fashion_mnist', False,
+    'Use Fashion MNIST (products) instead of the real MNIST (digits). '
+    'With this, --retrain gains a lot.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+
+
+def make_classifier(feature_extractor, l2_strength=0.01, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def scale_regularization_losses(obj, multiplier):
+  """Scales obj.regularization_losses by multiplier if not None."""
+  if multiplier is None: return
+  def _scale_one_loss(l):  # Separate def avoids lambda capture of loop var.
+    f = tf.function(lambda: tf.multiply(multiplier, l()))
+    _ = f.get_concrete_function()
+    return f
+  obj.regularization_losses = [_scale_one_loss(l)
+                               for l in obj.regularization_losses]
+
+
+def main(argv):
+  del argv
+
+  # Load a pre-trained feature extractor and wrap it for use in Keras.
+  obj = tf.saved_model.load(FLAGS.export_dir)
+  scale_regularization_losses(obj, FLAGS.regularization_loss_multiplier)
+  arguments = {}
+  if FLAGS.dropout_rate is not None:
+    arguments['dropout_rate'] = FLAGS.dropout_rate
+  feature_extractor = util.CustomLayer(obj, output_shape=[10],
+                                       trainable=FLAGS.retrain,
+                                       arguments=arguments)
+
+  # Build a classifier with it.
+  model = make_classifier(feature_extractor)
+
+  # Train the classifier (possibly on a different dataset).
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      use_fashion_mnist=FLAGS.use_fashion_mnist,
+      fake_tiny_data=FLAGS.fast_test_mode)
+  model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                optimizer=tf.keras.optimizers.SGD(),
+                metrics=['accuracy'])
+  print('Training on %s with %d trainable and %d untrainable variables.' %
+        ('Fashion MNIST' if FLAGS.use_fashion_mnist else 'MNIST',
+         len(model.trainable_variables), len(model.non_trainable_variables)))
+  model.fit(x_train, y_train,
+            batch_size=128,
+            epochs=FLAGS.epochs,
+            verbose=1,
+            validation_data=(x_test, y_test))
+
+
+if __name__ == '__main__':
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b6efb76f74494add6c5bd04dcb97dc1bcd288b9
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
@@ -0,0 +1,69 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use text embedding module in sequential Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import numpy as np
+import tensorflow.compat.v2 as tf
+from tensorflow.examples.saved_model.integration_tests import util
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def train(fine_tuning):
+  """Build a Keras model and train with mock data."""
+  features = np.array(["my first sentence", "my second sentence"])
+  labels = np.array([1, 0])
+  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+
+  module = tf.saved_model.load(FLAGS.model_dir)
+
+  # Create the sequential keras model.
+  l = tf.keras.layers
+  model = tf.keras.Sequential()
+  model.add(l.Reshape((), batch_input_shape=[None, 1], dtype=tf.string))
+  model.add(util.CustomLayer(module, output_shape=[10], trainable=fine_tuning))
+  model.add(l.Dense(100, activation="relu"))
+  model.add(l.Dense(50, activation="relu"))
+  model.add(l.Dense(1, activation="sigmoid"))
+
+  model.compile(
+      optimizer="adam",
+      loss="binary_crossentropy",
+      metrics=["accuracy"],
+      # TODO(b/124446120): Remove after fixed.
+      run_eagerly=True)
+
+  model.fit_generator(generator=dataset.batch(1), epochs=5)
+
+
+def main(argv):
+  del argv
+
+  train(fine_tuning=False)
+  train(fine_tuning=True)
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..14393795832e0fadc2c81bbecfc615b7266cb689
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use an RNN cell stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+  cell = tf.saved_model.load(FLAGS.model_dir)
+
+  initial_state = cell.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+
+  cell.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3811e3606c4aa0acd21f5e8119b4c3c4a566e45b
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
@@ -0,0 +1,46 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow.compat.v2 as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+
+  sentences = [
+      "<S> sentence <E>", "<S> second sentence <E>", "<S> third sentence<E>"
+  ]
+
+  model = tf.saved_model.load(FLAGS.model_dir)
+  model.train(tf.constant(sentences))
+  decoded = model.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/util.py b/tensorflow/examples/saved_model/integration_tests/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b709fdf98cd2b98c58e7371f45429d629d08cbd
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/util.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.framework import smart_cond
+from tensorflow.python.util import tf_inspect
+
+
+# TODO(vbardiovsky): We should just reuse Keras's Lambda layer, when that
+# enables to get trainable variables.
+class CustomLayer(tf.keras.layers.Layer):
+  """Wraps callable object as a `Layer` object.
+
+  Args:
+    func: The callable object to wrap. Layer inputs are passed as the first
+      positional argument. If `func` accepts a `training` argument, a Python
+      boolean is passed for it.
+      If present, the following attributes of `func` have a special meaning:
+        * variables: a list of all tf.Variable objects that `func` depends on.
+        * trainable_variables: those elements of `variables` that are reported
+          as trainable variables of this Keras Layer.
+        * regularization_losses: a list of callables to be added as losses
+          of this Keras layer. Each one must accept zero arguments and return
+          a scalare tensor.
+    trainable: Boolean controlling whether the trainable variables of `func`
+      are reported as trainable variables of this layer.
+    arguments: optionally, a dict with additional keyword arguments passed
+      to `func`.
+    **kwargs: 'output_shape': A tuple with the (possibly partial) output
+      shape of the callable *without* leading batch size. Other arguments
+      are pass into the Layer constructor.
+  """
+
+  def __init__(self, func, trainable=False, arguments=None, **kwargs):
+    # Set self._{non,}_trainable_weights before calling Layer.__init__.
+    if hasattr(func, 'trainable_variables'):
+      self._trainable_weights = [v for v in func.trainable_variables]
+      trainable_variables_set = set(func.trainable_variables)
+    else:
+      self._trainable_weights = []
+      trainable_variables_set = set()
+    if hasattr(func, 'variables'):
+      self._non_trainable_weights = [v for v in func.variables
+                                     if v not in trainable_variables_set]
+    else:
+      self._non_trainable_weights = []  # TODO(arnoegw): Infer from `func`.
+
+    # TODO(b/124219898): We should be able to get the embedding dimension from
+    # the restored model.
+    if 'output_shape' in kwargs:
+      self._output_shape = tuple(kwargs.pop('output_shape'))
+
+    super(CustomLayer, self).__init__(trainable=trainable, **kwargs)
+    # Prepare to call `func`.
+    self._func = func
+    self._func_fullargspec = tf_inspect.getfullargspec(func.__call__)
+    self._func_wants_training = (
+        'training' in self._func_fullargspec.args or
+        'training' in self._func_fullargspec.kwonlyargs)
+    self._arguments = arguments or {}
+    # Forward the callable's regularization losses (if any).
+    if hasattr(func, 'regularization_losses'):
+      for l in func.regularization_losses:
+        if not callable(l):
+          raise ValueError(
+              'CustomLayer(func) expects func.regularization_losses to be an '
+              'iterable of callables, each returning a scalar loss term.')
+        self.add_loss(l)  # Supports callables.
+
+  def call(self, x, training=None):
+    # We basically want to call this...
+    f = functools.partial(self._func, x, **self._arguments)
+    # ...but we may also have to pass a Python boolean for `training`.
+    if not self._func_wants_training:
+      result = f()
+    else:
+      if training is None:
+        training = tf.keras.backend.learning_phase()  # Could be a tensor.
+      result = smart_cond.smart_cond(training,
+                                     lambda: f(training=True),
+                                     lambda: f(training=False))
+    # TODO(b/124219898): Polymorphic function should return shaped tensor.
+    if hasattr(self, '_output_shape'):
+      result.set_shape((x.shape[0],) + self._output_shape)
+    return result
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 7f3c764fac62ee11c6351e11229198fc726d3804..e15497a3444638bc490240838e400113725cd65c 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -45,6 +45,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/lite/experimental/microfrontend:audio_microfrontend_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -63,6 +64,13 @@ tf_py_test(
 
 py_binary(
     name = "train",
+    srcs = ["train.py"],
+    srcs_version = "PY2AND3",
+    deps = [":train_main_lib"],
+)
+
+py_library(
+    name = "train_main_lib",
     srcs = [
         "train.py",
     ],
@@ -76,8 +84,32 @@ py_binary(
     ],
 )
 
+tf_py_test(
+    name = "train_test",
+    size = "small",
+    srcs = ["train_test.py"],
+    additional_deps = [
+        ":train",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_binary(
     name = "freeze",
+    srcs = ["freeze.py"],
+    srcs_version = "PY2AND3",
+    deps = [":freeze_main_lib"],
+)
+
+py_library(
+    name = "freeze_main_lib",
+    srcs = ["freeze.py"],
+    srcs_version = "PY2AND3",
+    deps = [":freeze_lib"],
+)
+
+py_library(
+    name = "freeze_lib",
     srcs = [
         "freeze.py",
     ],
@@ -103,6 +135,20 @@ tf_py_test(
 
 py_binary(
     name = "wav_to_features",
+    srcs = ["wav_to_features.py"],
+    srcs_version = "PY2AND3",
+    deps = [":wav_to_features_main_lib"],
+)
+
+py_library(
+    name = "wav_to_features_main_lib",
+    srcs = ["wav_to_features.py"],
+    srcs_version = "PY2AND3",
+    deps = [":wav_to_features_lib"],
+)
+
+py_library(
+    name = "wav_to_features_lib",
     srcs = [
         "wav_to_features.py",
     ],
@@ -128,6 +174,20 @@ tf_py_test(
 
 py_binary(
     name = "generate_streaming_test_wav",
+    srcs = ["generate_streaming_test_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":generate_streaming_test_wav_main_lib"],
+)
+
+py_library(
+    name = "generate_streaming_test_wav_main_lib",
+    srcs = ["generate_streaming_test_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":generate_streaming_test_wav_lib"],
+)
+
+py_library(
+    name = "generate_streaming_test_wav_lib",
     srcs = [
         "generate_streaming_test_wav.py",
     ],
@@ -168,6 +228,20 @@ tf_cc_binary(
 
 py_binary(
     name = "label_wav",
+    srcs = ["label_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":label_wav_main_lib"],
+)
+
+py_library(
+    name = "label_wav_main_lib",
+    srcs = ["label_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":label_wav_lib"],
+)
+
+py_library(
+    name = "label_wav_lib",
     srcs = [
         "label_wav.py",
     ],
diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py
index 89e790d4e4436cdc49af0fb2ae53dea8485ae9c5..8a6716db464bfe456fcbf0a3cf953baeb57d8f1a 100644
--- a/tensorflow/examples/speech_commands/freeze.py
+++ b/tensorflow/examples/speech_commands/freeze.py
@@ -49,6 +49,14 @@ import input_data
 import models
 from tensorflow.python.framework import graph_util
 
+# If it's available, load the specialized feature generator. If this doesn't
+# work, try building with bazel instead of running the Python script directly.
+# bazel run tensorflow/examples/speech_commands:freeze_graph
+try:
+  from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
+except ImportError:
+  frontend_op = None
+
 FLAGS = None
 
 
@@ -70,7 +78,7 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
     feature_bin_count: Number of frequency bands to analyze.
     model_architecture: Name of the kind of model to generate.
     preprocess: How the spectrogram is processed to produce features, for
-      example 'mfcc' or 'average'.
+      example 'mfcc', 'average', or 'micro'.
 
   Raises:
     Exception: If the preprocessing mode isn't recognized.
@@ -106,9 +114,33 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
         spectrogram,
         sample_rate,
         dct_coefficient_count=model_settings['fingerprint_width'])
+  elif preprocess == 'micro':
+    if not frontend_op:
+      raise Exception(
+          'Micro frontend op is currently not available when running TensorFlow'
+          ' directly from Python, you need to build and run through Bazel, for'
+          ' example'
+          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
+      )
+    sample_rate = model_settings['sample_rate']
+    window_size_ms = (model_settings['window_size_samples'] *
+                      1000) / sample_rate
+    window_step_ms = (model_settings['window_stride_samples'] *
+                      1000) / sample_rate
+    int16_input = tf.cast(
+        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
+    micro_frontend = frontend_op.audio_microfrontend(
+        int16_input,
+        sample_rate=sample_rate,
+        window_size=window_size_ms,
+        window_step=window_step_ms,
+        num_channels=model_settings['fingerprint_width'],
+        out_scale=1,
+        out_type=tf.float32)
+    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
   else:
-    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
-                    ' "average")' % (preprocess))
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
+                    ' "average", or "micro")' % (preprocess))
 
   fingerprint_size = model_settings['fingerprint_size']
   reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
diff --git a/tensorflow/examples/speech_commands/freeze_test.py b/tensorflow/examples/speech_commands/freeze_test.py
index 9ed9050035baee7081ff7413c1c2fc41b86c607d..a242453d0e5f77fa61a3f1df5c5a579133e0487e 100644
--- a/tensorflow/examples/speech_commands/freeze_test.py
+++ b/tensorflow/examples/speech_commands/freeze_test.py
@@ -65,6 +65,24 @@ class FreezeTest(test.TestCase):
       ops = [node.op for node in sess.graph_def.node]
       self.assertEqual(0, ops.count('Mfcc'))
 
+  @test_util.run_deprecated_v1
+  def testCreateInferenceGraphWithMicro(self):
+    with self.cached_session() as sess:
+      freeze.create_inference_graph(
+          wanted_words='a,b,c,d',
+          sample_rate=16000,
+          clip_duration_ms=1000.0,
+          clip_stride_ms=30.0,
+          window_size_ms=30.0,
+          window_stride_ms=10.0,
+          feature_bin_count=40,
+          model_architecture='conv',
+          preprocess='micro')
+      self.assertIsNotNone(sess.graph.get_tensor_by_name('wav_data:0'))
+      self.assertIsNotNone(
+          sess.graph.get_tensor_by_name('decoded_sample_data:0'))
+      self.assertIsNotNone(sess.graph.get_tensor_by_name('labels_softmax:0'))
+
   @test_util.run_deprecated_v1
   def testFeatureBinCount(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 1079a302fa47bea7f5dadd35165bae2b090bb2bc..60e1b8c37a04eef06a44e8a887ae2eea3f7720e1 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -37,6 +37,13 @@ from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import compat
 
+# If it's available, load the specialized feature generator. If this doesn't
+# work, try building with bazel instead of running the Python script directly.
+try:
+  from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
+except ImportError:
+  frontend_op = None
+
 MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
 SILENCE_LABEL = '_silence_'
 SILENCE_INDEX = 0
@@ -169,9 +176,12 @@ def get_features_range(model_settings):
   elif model_settings['preprocess'] == 'mfcc':
     features_min = -247.0
     features_max = 30.0
+  elif model_settings['preprocess'] == 'micro':
+    features_min = 0.0
+    features_max = 26.0
   else:
-    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
-                    ' "average")' % (model_settings['preprocess']))
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
+                    ' "average", or "micro")' % (model_settings['preprocess']))
   return features_min, features_max
 
 
@@ -377,6 +387,7 @@ class AudioProcessor(object):
 
     Raises:
       ValueError: If the preprocessing mode isn't recognized.
+      Exception: If the preprocessor wasn't compiled in.
     """
     with tf.get_default_graph().name_scope('data'):
       desired_samples = model_settings['desired_samples']
@@ -442,9 +453,36 @@ class AudioProcessor(object):
             dct_coefficient_count=model_settings['fingerprint_width'])
         tf.summary.image(
             'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
+      elif model_settings['preprocess'] == 'micro':
+        if not frontend_op:
+          raise Exception(
+              'Micro frontend op is currently not available when running'
+              ' TensorFlow directly from Python, you need to build and run'
+              ' through Bazel'
+          )
+        sample_rate = model_settings['sample_rate']
+        window_size_ms = (model_settings['window_size_samples'] *
+                          1000) / sample_rate
+        window_step_ms = (model_settings['window_stride_samples'] *
+                          1000) / sample_rate
+        int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
+        micro_frontend = frontend_op.audio_microfrontend(
+            int16_input,
+            sample_rate=sample_rate,
+            window_size=window_size_ms,
+            window_step=window_step_ms,
+            num_channels=model_settings['fingerprint_width'],
+            out_scale=1,
+            out_type=tf.float32)
+        self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
+        tf.summary.image(
+            'micro',
+            tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
+            max_outputs=1)
       else:
-        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
-                         ' "average")' % (model_settings['preprocess']))
+        raise ValueError(
+            'Unknown preprocess mode "%s" (should be "mfcc", '
+            ' "average", or "micro")' % (model_settings['preprocess']))
 
       # Merge all the summaries and write them out to /tmp/retrain_logs (by
       # default)
diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py
index 9269bb6c0bc780e06ee0c42617478e3a1486100e..031aa92484382cd3d5a7c70e14d9655b424d337e 100644
--- a/tensorflow/examples/speech_commands/input_data_test.py
+++ b/tensorflow/examples/speech_commands/input_data_test.py
@@ -202,6 +202,10 @@ class InputDataTest(test.TestCase):
   def testGetDataMfcc(self):
     self._runGetDataTest("mfcc", 30)
 
+  @test_util.run_deprecated_v1
+  def testGetDataMicro(self):
+    self._runGetDataTest("micro", 20)
+
   @test_util.run_deprecated_v1
   def testGetUnprocessedData(self):
     tmp_dir = self.get_temp_dir()
diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py
index c63d4c3c7d1a337840f1ce6d61344ad274036f71..d368fec9019d468cb09ee65f9704ff7b43aac9f9 100644
--- a/tensorflow/examples/speech_commands/models.py
+++ b/tensorflow/examples/speech_commands/models.py
@@ -71,9 +71,12 @@ def prepare_model_settings(label_count, sample_rate, clip_duration_ms,
   elif preprocess == 'mfcc':
     average_window_width = -1
     fingerprint_width = feature_bin_count
+  elif preprocess == 'micro':
+    average_window_width = -1
+    fingerprint_width = feature_bin_count
   else:
-    raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
-                     ' "average")' % (preprocess))
+    raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
+                     ' "average", or "micro")' % (preprocess))
   fingerprint_size = fingerprint_width * spectrogram_length
   return {
       'desired_samples': desired_samples,
@@ -527,6 +530,10 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings,
       shape=[num_filters, batch, input_time_size],
       trainable=False,
       name='runtime-memory')
+  first_time_flag = tf.get_variable(
+      name="first_time_flag",
+      dtype=tf.int32,
+      initializer=1)
   # Determine the number of new frames in the input, such that we only operate
   # on those. For training we do not use the memory, and thus use all frames
   # provided in the input.
@@ -537,9 +544,10 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings,
     window_stride_ms = int(model_settings['window_stride_samples'] * 1000 /
                            model_settings['sample_rate'])
     num_new_frames = tf.cond(
-        tf.equal(tf.count_nonzero(memory), 0),
+        tf.equal(first_time_flag, 1),
         lambda: input_time_size,
         lambda: int(runtime_settings['clip_stride_ms'] / window_stride_ms))
+  first_time_flag = 0
   new_fingerprint_input = fingerprint_input[
       :, -num_new_frames*input_frequency_size:]
   # Expand to add input channels dimension.
diff --git a/tensorflow/examples/speech_commands/test_streaming_accuracy.cc b/tensorflow/examples/speech_commands/test_streaming_accuracy.cc
index 2972ab778b15233b054568fa5a83b2d2d6798800..6a32744276d89911abac80bbd9503849f414ea74 100644
--- a/tensorflow/examples/speech_commands/test_streaming_accuracy.cc
+++ b/tensorflow/examples/speech_commands/test_streaming_accuracy.cc
@@ -244,7 +244,7 @@ int main(int argc, char* argv[]) {
   std::vector<std::pair<string, int64>> all_found_words;
   tensorflow::StreamingAccuracyStats previous_stats;
 
-  const int64 audio_data_end = (sample_count - clip_duration_ms);
+  const int64 audio_data_end = (sample_count - clip_duration_samples);
   for (int64 audio_data_offset = 0; audio_data_offset < audio_data_end;
        audio_data_offset += clip_stride_samples) {
     const float* input_start = &(audio_data[audio_data_offset]);
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index f6e39b0b5519cad8f9c90500c960c613f6c8cf4c..43a399b912ec9bc3a1f69be343d864bca0274136 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -446,7 +446,7 @@ if __name__ == '__main__':
       '--preprocess',
       type=str,
       default='mfcc',
-      help='Spectrogram processing mode. Can be "mfcc" or "average"')
+      help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
 
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/train_test.py b/tensorflow/examples/speech_commands/train_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db195760e98812d224cac5b9dfe5c66d4d6a7088
--- /dev/null
+++ b/tensorflow/examples/speech_commands/train_test.py
@@ -0,0 +1,144 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data input for speech commands."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+
+from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
+from tensorflow.examples.speech_commands import train
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+# Used to convert a dictionary into an object, for mocking parsed flags.
+class DictStruct(object):
+
+  def __init__(self, **entries):
+    self.__dict__.update(entries)
+
+
+class TrainTest(test.TestCase):
+
+  def _getWavData(self):
+    with self.cached_session():
+      sample_data = tf.zeros([32000, 2])
+      wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
+      wav_data = self.evaluate(wav_encoder)
+    return wav_data
+
+  def _saveTestWavFile(self, filename, wav_data):
+    with open(filename, 'wb') as f:
+      f.write(wav_data)
+
+  def _saveWavFolders(self, root_dir, labels, how_many):
+    wav_data = self._getWavData()
+    for label in labels:
+      dir_name = os.path.join(root_dir, label)
+      os.mkdir(dir_name)
+      for i in range(how_many):
+        file_path = os.path.join(dir_name, 'some_audio_%d.wav' % i)
+        self._saveTestWavFile(file_path, wav_data)
+
+  def _prepareDummyTrainingData(self):
+    tmp_dir = self.get_temp_dir()
+    wav_dir = os.path.join(tmp_dir, 'wavs')
+    os.mkdir(wav_dir)
+    self._saveWavFolders(wav_dir, ['a', 'b', 'c'], 100)
+    background_dir = os.path.join(wav_dir, '_background_noise_')
+    os.mkdir(background_dir)
+    wav_data = self._getWavData()
+    for i in range(10):
+      file_path = os.path.join(background_dir, 'background_audio_%d.wav' % i)
+      self._saveTestWavFile(file_path, wav_data)
+    return wav_dir
+
+  def _getDefaultFlags(self):
+    flags = {
+        'data_url': '',
+        'data_dir': self._prepareDummyTrainingData(),
+        'wanted_words': 'a,b,c',
+        'sample_rate': 16000,
+        'clip_duration_ms': 1000,
+        'window_size_ms': 30,
+        'window_stride_ms': 20,
+        'feature_bin_count': 40,
+        'preprocess': 'mfcc',
+        'silence_percentage': 25,
+        'unknown_percentage': 25,
+        'validation_percentage': 10,
+        'testing_percentage': 10,
+        'summaries_dir': os.path.join(self.get_temp_dir(), 'summaries'),
+        'train_dir': os.path.join(self.get_temp_dir(), 'train'),
+        'time_shift_ms': 100,
+        'how_many_training_steps': '2',
+        'learning_rate': '0.01',
+        'quantize': False,
+        'model_architecture': 'conv',
+        'check_nans': False,
+        'start_checkpoint': '',
+        'batch_size': 1,
+        'background_volume': 0.25,
+        'background_frequency': 0.8,
+        'eval_step_interval': 1,
+        'save_step_interval': 1,
+    }
+    return DictStruct(**flags)
+
+  @test_util.run_deprecated_v1
+  def testTrain(self):
+    train.FLAGS = self._getDefaultFlags()
+    train.main('')
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.pbtxt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '_labels.txt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.ckpt-1.meta')))
+
+  @test_util.run_deprecated_v1
+  def testQuantizedTrain(self):
+    train.FLAGS = self._getDefaultFlags()
+    train.FLAGS.quantize = True
+    train.FLAGS.model_architecture = 'tiny_conv'
+    train.main('')
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.pbtxt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '_labels.txt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.ckpt-1.meta')))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/examples/speech_commands/wav_to_features.py b/tensorflow/examples/speech_commands/wav_to_features.py
index e6c8f45c5bfaf8cdb669c3f024a27624be8d76ba..d7f2446d355dd8ee98c37a6ff8179c19e2e721df 100644
--- a/tensorflow/examples/speech_commands/wav_to_features.py
+++ b/tensorflow/examples/speech_commands/wav_to_features.py
@@ -56,7 +56,7 @@ def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
     window_stride_ms: How far to move in time between spectogram timeslices.
     feature_bin_count: How many bins to use for the feature fingerprint.
     quantize: Whether to train the model for eight-bit deployment.
-    preprocess: Spectrogram processing mode. Can be "mfcc" or "average".
+    preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
     input_wav: Path to the audio WAV file to read.
     output_c_file: Where to save the generated C source file.
   """
@@ -86,14 +86,15 @@ def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
     f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
     f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
     if quantize:
-      f.write(' * --quantize \\\n')
+      f.write(' * --quantize=1 \\\n')
     f.write(' * --preprocess="%s" \\\n' % preprocess)
     f.write(' * --input_wav="%s" \\\n' % input_wav)
     f.write(' * --output_c_file="%s" \\\n' % output_c_file)
     f.write(' */\n\n')
-    f.write('const int g_%s_width = %d;\n' % (variable_base, features.shape[2]))
-    f.write(
-        'const int g_%s_height = %d;\n' % (variable_base, features.shape[1]))
+    f.write('const int g_%s_width = %d;\n' %
+            (variable_base, model_settings['fingerprint_width']))
+    f.write('const int g_%s_height = %d;\n' %
+            (variable_base, model_settings['spectrogram_length']))
     if quantize:
       features_min, features_max = input_data.get_features_range(model_settings)
       f.write('const unsigned char g_%s_data[] = {' % variable_base)
@@ -108,7 +109,7 @@ def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
           quantized_value = 255
         if i == 0:
           f.write('\n  ')
-        f.write('%d, ' % quantized_value)
+        f.write('%d, ' % (quantized_value))
         i = (i + 1) % 10
     else:
       f.write('const float g_%s_data[] = {\n' % variable_base)
@@ -168,7 +169,7 @@ if __name__ == '__main__':
       '--preprocess',
       type=str,
       default='mfcc',
-      help='Spectrogram processing mode. Can be "mfcc" or "average"')
+      help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
   parser.add_argument(
       '--input_wav',
       type=str,
diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py
index 6234490b26760c99e3184cfc9a51b56169ec63bb..18e0e63fc6c78289c4e4960e979aa18fa83a5cd0 100644
--- a/tensorflow/examples/speech_commands/wav_to_features_test.py
+++ b/tensorflow/examples/speech_commands/wav_to_features_test.py
@@ -66,6 +66,22 @@ class WavToFeaturesTest(test.TestCase):
       content = f.read()
       self.assertTrue(b"const unsigned char g_input_data" in content)
 
+  @test_util.run_deprecated_v1
+  def testWavToFeaturesMicro(self):
+    tmp_dir = self.get_temp_dir()
+    wav_dir = os.path.join(tmp_dir, "wavs")
+    os.mkdir(wav_dir)
+    self._saveWavFolders(wav_dir, ["a", "b", "c"], 100)
+    input_file_path = os.path.join(tmp_dir, "input.wav")
+    output_file_path = os.path.join(tmp_dir, "output.c")
+    wav_data = self._getWavData()
+    self._saveTestWavFile(input_file_path, wav_data)
+    wav_to_features.wav_to_features(16000, 1000, 10, 10, 40, True, "micro",
+                                    input_file_path, output_file_path)
+    with open(output_file_path, "rb") as f:
+      content = f.read()
+      self.assertIn(b"const unsigned char g_input_data", content)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index c8ab24871c4168eb69363a2cc99492e542ca5bec..b3bd73a08b28c10dc66a3b0019411b82709a4264 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -121,7 +121,7 @@ History
 * 0.1.0: Initial release.
 * 0.2.0: Many fixes, including lower memory footprint and support for Python 3.
 * 0.3.0: Use 0.7.1 release.
-* 0.4.0: Move notMMNIST data for Google Cloud.
+* 0.4.0: Move notMNIST data for Google Cloud.
 * 0.5.0: Actually use 0.7.1 release.
 * 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow).
 * 1.0.0: Update to TF 1.0.0 release.
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 31bba1ffbfae1d6ae2ae2b106b262486ff3b56a7..f7100fb6f52e4f3419e68c1923c70c82ed77f551 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -49,12 +49,14 @@ from source.
 
     This can take a while (tens of minutes, more if also building for GPU).
 
-3.  Make `libtensorflow.so` available to the linker. This can be done by either:
+3.  Make `libtensorflow.so` and `libtensorflow_framework.so` available to the
+    linker. This can be done by either:
 
     a. Copying it to a system location, e.g.,
 
     ```sh
     sudo cp ${GOPATH}/src/github.com/tensorflow/tensorflow/bazel-bin/tensorflow/libtensorflow.so /usr/local/lib
+    sudo cp ${GOPATH}/src/github.com/tensorflow/tensorflow/bazel-bin/tensorflow/libtensorflow_framework.so /usr/local/lib
     ```
 
     OR
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 666ebe33df804308fabcb9c6a856ad558928fc5e..7a3c03bd60e5d828691db8925b9b57bcf548eed9 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -619,139 +619,6 @@ func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Outp
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
-//
-// value: Whether the quantization is signed or unsigned. (actually this parameter should
-// have been called <b>`signed_output`</b>)
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
-//
-// value: Whether the range is given or should be determined from the `input` tensor.
-// If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
-//
-// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
-// used when rounding float values to their quantized equivalents. The following
-// rounding modes are currently supported:
-//
-// *   HALF_TO_EVEN: this is the default round_mode.
-// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
-//     rounds up to -7.
-//
-// If not specified, defaults to "HALF_TO_EVEN"
-func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
-//
-// This op simulates the precision loss from the quantized forward pass by:
-//
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
-//
-// There are different ways to quantize. This version uses only scaling, so 0.0
-// maps to 0.
-//
-// From the specified 'num_bits' in the quantized output type, it determines
-// minimum and maximum representable quantized values.
-//
-// e.g.
-//
-// *   [-128, 127] for signed, num_bits = 8, or
-// *   [0, 255] for unsigned, num_bits = 8.
-//
-// If range_given == False, the initial input_min, input_max will be determined
-// automatically as the minimum and maximum values in the input tensor, otherwise
-// the specified values of input_min, input_max are used.
-//
-// Note: If the input_min, input_max are specified, they do not need to equal the
-// actual minimum and maximum values in the tensor. e.g. in some cases it may be
-// beneficial to specify these values such that the low probability extremes of the
-// input distribution are clipped.
-//
-// This op determines the maximum scale_factor that would map the initial
-// [input_min, input_max] range to a range that lies within the representable
-// quantized range.
-//
-// It determines the scale from one of input_min and input_max, then updates the
-// other one to maximize the respresentable range.
-//
-// e.g.
-//
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
-//     would update input_max to be 127 / 12.8 = 9.921875
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
-//     would update input_min to be 128.0 / 12.7 = -10.07874
-// *   if the output is unsigned, input_min is forced to be 0, and only the
-//     specified input_max is used.
-//
-// After determining the scale_factor and updating the input range, it applies the
-// following to each value in the 'input' tensor.
-//
-// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
-//
-// The above round function rounds the value based on the given round_mode.
-//
-//
-// Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
-// be represented, otherwise it is determined from the min value of the `input`
-// tensor.
-//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
-// be represented, otherwise it is determined from the max value of the `input`
-// tensor.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Bitcasts a tensor from one type to another without copying data.
 //
 // Given a tensor `input`, this operation returns a tensor that has the same buffer
@@ -782,49 +649,6 @@ func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output
 	return op.Output(0)
 }
 
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
-//
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
-		Input: []tf.Input{
-			images,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SpaceToDepthAttr is an optional argument to SpaceToDepth.
 type SpaceToDepthAttr func(optionalAttr)
 
@@ -1195,65 +1019,6 @@ func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddin
 	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ListDiff",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Inserts a dimension of 1 into a tensor's shape.
 //
 // Given a tensor `input`, this operation inserts a dimension of 1 at the
@@ -1603,78 +1368,6 @@ func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Returns the gradient of `StridedSlice`.
-//
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
-//
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
-		Input: []tf.Input{
-			shape, begin, end, strides, dy,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StridedSliceAttr is an optional argument to StridedSlice.
 type StridedSliceAttr func(optionalAttr)
 
@@ -1868,37 +1561,6 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 	return op.Output(0)
 }
 
-// Return a slice from 'input'.
-//
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
-//
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-//
-// Arguments:
-//
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Slice",
-		Input: []tf.Input{
-			input, begin, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SizeAttr is an optional argument to Size.
 type SizeAttr func(optionalAttr)
 
@@ -3290,30 +2952,6 @@ func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Outpu
 	return op.Output(0)
 }
 
-//     Updates specified rows with values in `v`.
-//
-//     Computes `x[i, :] = v; return x`.
-//
-// Arguments:
-//	x: A tensor of type `T`.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceUpdate",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Makes a copy of `x`.
 //
 // Arguments:
@@ -4434,89 +4072,31 @@ func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int6
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
-
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
-	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
-	}
-}
-
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Broadcasts a tensor value to one or more other devices.
-func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Mutually accumulates multiple tensors of identical type and shape.
+func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastSend",
+		Type: "CollectiveGather",
 		Input: []tf.Input{
 			input,
 		},
@@ -5158,6 +4738,14 @@ func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
 	}
 }
 
+// CudnnRNNBackpropV3TimeMajor sets the optional time_major attribute to value.
+// If not specified, defaults to true
+func CudnnRNNBackpropV3TimeMajor(value bool) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["time_major"] = value
+	}
+}
+
 // Backprop step of CudnnRNNV3.
 //
 // Compute the backprop of both data and weights in a RNN. Takes an extra
@@ -5173,9 +4761,12 @@ func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
 // dropout: Dropout probability. When set to 0., dropout is disabled.
 // seed: The 1st part of a seed to initialize dropout.
 // seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
+// input: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
+//     [batch_size, seq_length, input_size].
+// input_h: If time_major is true, this is a 3-D tensor with the shape of
+//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+//     is [batch_size, num_layer * dir, num_units].
 // input_c: For LSTM, a 3-D tensor with the shape of
 //     [num_layer * dir, batch, num_units]. For other models, it is ignored.
 // params: A 1-D tensor that contains the weights and biases in an opaque layout.
@@ -5183,8 +4774,9 @@ func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
 //     separately. Note that they might not be compatible across different
 //     generations. So it is a good idea to save and restore
 // sequence_lengths: a vector of lengths of each input sequence.
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
+// output: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
+//     shape is [batch_size, seq_length, dir * num_units].
 // output_h: The same shape has input_h.
 // output_c: The same shape as input_c for LSTM. An empty tensor for other models.
 // output_backprop: A 3-D tensor with the same shape as output in the forward pass.
@@ -5192,6 +4784,8 @@ func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
 //     pass.
 // output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
 //     pass.
+// time_major: Indicates whether the input/output format is time major or batch
+//     major.
 // reserve_space: The same reserve_space produced in the forward operation.
 // input_backprop: The backprop to input in the forward pass. Has the same shape
 //     as input.
@@ -6036,77 +5630,6 @@ func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size
 	return op.Output(0)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
-
 // MapPeekAttr is an optional argument to MapPeek.
 type MapPeekAttr func(optionalAttr)
 
@@ -6627,150 +6150,83 @@ func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
+// Deprecated. Use TensorArraySizeV3
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			a, b, x,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
+
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Identity",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			input,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Deprecated. Use TensorArrayGradV3
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			y, x,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
+// Deprecated. Use TensorArrayGradV3
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -6778,84 +6234,56 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
-func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Xlogy",
-		Input: []tf.Input{
-			x, y,
-		},
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// Deprecated. Use TensorArrayV3
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			size,
 		},
 		Attrs: attrs,
 	}
@@ -6863,382 +6291,443 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
+// Split the data from the input value into TensorArray elements.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Assuming that `lengths` takes on values
+//
+//   ```(n0, n1, ..., n(T-1))```
+//
+// and that `value` has shape
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+//
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "TensorArraySplitV3",
 		Input: []tf.Input{
-			x, y,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Selects the k nearest centers for each point.
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
 //
-// Rows of points are assumed to be input points. Rows of centers are assumed to be
-// the list of candidate centers. For each point, the k centers that have least L2
-// distance to it are computed.
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
+	return func(m optionalAttr) {
+		m["init"] = value
+	}
+}
+
+// Creates a tensor with the given shape.
+//
+// This operation creates a tensor of `shape` and `dtype`.
 //
 // Arguments:
-//	points: Matrix of shape (n, d). Rows are assumed to be input points.
-//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
-//	k: Number of nearest centers to return for each point. If k is larger than m, then
-// only m centers are returned.
+//	shape: 1-D. Represents the shape of the output tensor.
 //
-// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
-// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
-// corresponding center in nearest_center_indices.
-func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NearestNeighbors",
+		Type: "Empty",
 		Input: []tf.Input{
-			points, centers, k,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns x * y element-wise.
-//
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
 
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Adds `bias` to `value`.
+// Concat the elements from the TensorArray into value `value`.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// Takes `T` elements of shapes
+//
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
+//
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			value, bias,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
-
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Scatter the data from the input value into specific TensorArray elements.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// `indices` must be a vector, its length must match the first dim of `value`.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "TensorArrayScatterV3",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			handle, indices, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
-
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			true_classes,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
+// Creates a TensorArray for storing multiple gradients of values in the given handle.
 //
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Similar to TensorArrayGradV3. However it creates an accumulator with an
+// expanded shape compared to the input TensorArray whose gradient is being
+// computed. This enables multiple gradients for the same TensorArray to be
+// calculated using the same accumulator.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
+// have shape which is this shape_to_prepend value concatenated with shape of the
+// elements in the TensorArray corresponding to the input handle.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "TensorArrayGradWithShape",
 		Input: []tf.Input{
-			x,
+			handle, flow_in, shape_to_prepend,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// Arguments:
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
+// **A note about the input flow_in:**
 //
-func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalSlidingWindowDataset",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns which elements of x are finite.
+// Pop the element at the top of the stack.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["axis"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Returns a one-hot tensor.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7247,163 +6736,148 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "OneHot",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			indices, depth, on_value, off_value,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Computes the number of elements in the given queue.
 //
 // Arguments:
+//	handle: The handle to a queue.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			input, dimension,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: any tensor.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			input,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
 // If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7412,9 +6886,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -7422,195 +6896,254 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Returns locations of nonzero / true values in a tensor.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
 // ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// Arguments:
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "Where",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			condition,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues a tuple of one or more tensors from the given queue.
+//
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			input,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
+
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues a tuple of one or more tensors in the given queue.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
+//
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			x,
+			handle, tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
 //
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["dct_coefficient_count"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -7618,612 +7151,604 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
+
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalIgnoreErrorsDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// A queue that produces elements in first-in first-out order.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+//	component_types: The type of each component in a value.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
+		Type: "PaddingFIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the index of a data point that should be added to the seed set.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Entries in distances are assumed to be squared distances of candidate points to
-// the already sampled centers in the seed set. The op constructs one Markov chain
-// of the k-MC^2 algorithm and returns the index of one candidate point to be added
-// as an additional cluster center.
+// Builds a merged tensor such that
 //
-// Arguments:
-//	distances: Vector with squared distances to the closest previously sampled cluster center
-// for each candidate point.
-//	seed: Scalar. Seed for initializing the random number generator.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// Returns Scalar with the index of the sampled point.
-func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "KMC2ChainInitialization",
-		Input: []tf.Input{
-			distances, seed,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+//
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
 //
 // For example:
 //
 // ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
 //
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
 //
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
+// See `dynamic_stitch` for an example on how to merge partitions back.
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			data, partitions,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes natural logarithm of x element-wise.
-//
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// Produces a string handle for the given MultiDeviceIterator.
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//
+// Returns A string representing the resource.
+func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "MultiDeviceIteratorToStringHandle",
 		Input: []tf.Input{
-			x,
+			multi_device_iterator,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Checks whether a tree has been initialized.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns Whether the tree is initialized.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "TensorForestTreeIsInitializedOp",
 		Input: []tf.Input{
-			x,
+			tree_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// Gets next element for the provided shard number.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//	shard_num: Integer representing which shard to fetch data for.
+//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns Result of the get_next on the dataset.
+func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "MultiDeviceIteratorGetNextFromShard",
 		Input: []tf.Input{
-			handle,
+			multi_device_iterator, shard_num, incarnation_id,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
+		return
+	}
+	return components
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// Initializes the multi device iterator with the given dataset.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
+//
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "MultiDeviceIteratorInit",
 		Input: []tf.Input{
-			y, dy,
+			dataset, multi_device_iterator, max_buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
-
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Computes the inverse of one or more square invertible matrices or their
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// adjoints (conjugate transposes).
+// to zero.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// The indicator function
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
-// Returns Shape is `[..., M, M]`.
+// For example:
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			input,
+			input, num_lower, num_upper,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Gets the next output from the given iterator as an Optional variant.
+func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "IteratorGetNextAsOptional",
 		Input: []tf.Input{
-			x, y,
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+// Returns the value stored in an Optional variant or raises an error if none exists.
+func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
+		Type: "OptionalGetValue",
 		Input: []tf.Input{
-			alpha, sample,
+			optional,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes square of x element-wise.
-//
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Square",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("OptionalGetValue", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns true if and only if the given Optional variant has a value.
+func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "OptionalHasValue",
 		Input: []tf.Input{
-			features,
+			optional,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Deserializes a proto into the tree handle
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
+//
+// Returns the created operation.
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "TensorForestTreeDeserialize",
 		Input: []tf.Input{
-			x,
+			tree_handle, tree_config,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-//
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-//
-// Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+// Constructs an Optional variant from a tuple of tensors.
+func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "OptionalFromValue",
 		Input: []tf.Input{
-			input, diagonal,
+			tf.OutputList(components),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise max of two SparseTensors.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+//
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "OptimizeDataset",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input_dataset, optimizations,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
+//
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "DeserializeIterator",
 		Input: []tf.Input{
-			x,
+			resource_handle, serialized,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
+// Gather slices from `params` axis `axis` according to `indices`.
 //
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
 //
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
 //
-func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalDenseToSparseBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of x AND y element-wise.
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// See also `tf.batch_gather` and `tf.gather_nd`.
+//
+// Arguments:
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "GatherV2",
 		Input: []tf.Input{
-			x, y,
+			params, indices, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
-
-// CastTruncate sets the optional Truncate attribute to value.
-// If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
-	return func(m optionalAttr) {
-		m["Truncate"] = value
-	}
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
+//
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "SerializeIterator",
 		Input: []tf.Input{
-			x,
+			resource_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -8264,8 +7789,9 @@ type RegexReplaceAttr func(optionalAttr)
 
 // RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
+// value: If True, the replacement is global (that is, all matches of the `pattern` regular
+// expression in each input string are rewritten), otherwise the `rewrite`
+// substitution is only made for the first `pattern` match.
 // If not specified, defaults to true
 func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 	return func(m optionalAttr) {
@@ -8273,16 +7799,18 @@ func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Replaces matches of the `pattern` regular expression in `input` with the
+// replacement string provided in `rewrite`.
 //
 // It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
 //	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expression.
+//	pattern: The regular expression to be matched in the `input` strings.
+//	rewrite: The rewrite string to be substituted for the `pattern` expression where it is
+// matched in the `input` strings.
 //
-// Returns The text after applying pattern and rewrite.
+// Returns The text after applying pattern match and rewrite substitution.
 func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -8740,24 +8268,6 @@ func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LessEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes softmax activations.
 //
 // For each batch `i` and class `j` we have
@@ -8827,85 +8337,33 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
-
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
-	}
-}
-
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "Elu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Computes square of x element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Square",
 		Input: []tf.Input{
 			x,
 		},
@@ -8914,125 +8372,6 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Checks whether a tree has been initialized.
-//
-// Arguments:
-//	tree_handle: Handle to the tree.
-//
-// Returns Whether the tree is initialized.
-func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeIsInitializedOp",
-		Input: []tf.Input{
-			tree_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets next element for the provided shard number.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//	shard_num: Integer representing which shard to fetch data for.
-//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Result of the get_next on the dataset.
-func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorGetNextFromShard",
-		Input: []tf.Input{
-			multi_device_iterator, shard_num, incarnation_id,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
-		return
-	}
-	return components
-}
-
 // LeakyReluGradAttr is an optional argument to LeakyReluGrad.
 type LeakyReluGradAttr func(optionalAttr)
 
@@ -9071,54 +8410,6 @@ func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, option
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LeakyReluAttr is an optional argument to LeakyRelu.
-type LeakyReluAttr func(optionalAttr)
-
-// LeakyReluAlpha sets the optional alpha attribute to value.
-// If not specified, defaults to 0.2
-func LeakyReluAlpha(value float32) LeakyReluAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// Computes rectified linear: `max(features, features * alpha)`.
-func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "LeakyRelu",
-		Input: []tf.Input{
-			features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes rectified linear 6: `min(max(features, 0), 6)`.
 func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -9232,7 +8523,7 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // Computes the minimum along segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
 // This operator is similar to the unsorted segment sum operator found
@@ -9246,6 +8537,15 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // possible value for the specific numeric type,
 // `output[i] = numeric_limits<T>::max()`.
 //
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 1,  2, 2, 1],
+// #       [5,  6, 7, 8]]
+// ```
+//
 // If the given segment ID `i` is negative, then the corresponding value is
 // dropped, and will not be included in the result.
 //
@@ -9293,28 +8593,32 @@ func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
+
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayGatherV3
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -9322,52 +8626,42 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// Returns the truth value of (x == y) element-wise.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "Equal",
 		Input: []tf.Input{
-			a, x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// The polygamma function is defined as:
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+//
+// \\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+// The polygamma function is defined only for non-negative integer orders \\a\\.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			input, grad, argmax,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -9421,6 +8715,19 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
+// MaxPoolGradWithArgmaxAttr is an optional argument to MaxPoolGradWithArgmax.
+type MaxPoolGradWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
+//
+// value: Whether to include batch dimension in flattened index of `argmax`.
+// If not specified, defaults to false
+func MaxPoolGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["include_batch_in_index"] = value
+	}
+}
+
 // Computes gradients of the maxpooling function.
 //
 // Arguments:
@@ -9434,11 +8741,14 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 //	padding: The type of padding algorithm to use.
 //
 // Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradWithArgmaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
@@ -9495,6 +8805,21 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	return op.Output(0)
 }
 
+// Connects N inputs to an N-way replicated TPU computation.
+func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUReplicatedInput",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -9542,98 +8867,6 @@ func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
-//
-// This is typically used by gradient computations for a concat operation.
-//
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
-}
-
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -9822,37 +9055,20 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
-//
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
 
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
 //	input: Shape `[batch, depth, rows, cols, in_channels]`.
@@ -9863,7 +9079,7 @@ func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9872,7 +9088,7 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
 			input, filter, out_backprop,
 		},
@@ -9882,24 +9098,6 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
 type DepthwiseConv2dNativeAttr func(optionalAttr)
 
@@ -9977,78 +9175,6 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
-//
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // MaxPoolGradAttr is an optional argument to MaxPoolGrad.
 type MaxPoolGradAttr func(optionalAttr)
 
@@ -10180,6 +9306,236 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+//
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Digamma",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
+//
+// In detail, with the default NHWC format,
+//
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
 // The input `SparseTensor` is represented via the tuple of inputs
@@ -10243,36 +9599,66 @@ func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dens
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug.
+type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters with debug support.
 //
-// The graph should be constructed so that all inputs have a valid device
-// assignment, and the op itself is assigned one of these devices.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// input: The input to the reduction.
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"reduction": reduction}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NcclReduce",
+		Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			parameters, momenta, velocities, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the bias tensor will be added to the last dimension
@@ -10282,23 +9668,23 @@ type BiasAddGradAttr func(optionalAttr)
 // The tensor will be added to "in_channels", the third-to-the-last
 //     dimension.
 // If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
+// Adds `bias` to `value`.
 //
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10307,9 +9693,9 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			out_backprop,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -10317,13 +9703,151 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x / y otherwise, elementwise.
-func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Xdivy",
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters.
+type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load SGD embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the stochastic gradient descent optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingStochasticGradientDescentParameters",
+		Input: []tf.Input{
+			parameters,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Selects the k nearest centers for each point.
+//
+// Rows of points are assumed to be input points. Rows of centers are assumed to be
+// the list of candidate centers. For each point, the k centers that have least L2
+// distance to it are computed.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
+//	k: Number of nearest centers to return for each point. If k is larger than m, then
+// only m centers are returned.
+//
+// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+// corresponding center in nearest_center_indices.
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NearestNeighbors",
+		Input: []tf.Input{
+			points, centers, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns x * y element-wise.
+//
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -10530,298 +10054,28 @@ func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// Transforms a Tensor into a serialized TensorProto proto.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// Arguments:
+//	tensor: A Tensor of type `T`.
+//
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "SerializeTensor",
 		Input: []tf.Input{
-			x, perm,
+			tensor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Min",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Bessel i1e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI1e",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a Tensor into a serialized TensorProto proto.
-//
-// Arguments:
-//	tensor: A Tensor of type `T`.
-//
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
-		Input: []tf.Input{
-			tensor,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnbatchGradAttr is an optional argument to UnbatchGrad.
-type UnbatchGradAttr func(optionalAttr)
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
 
 // UnbatchGradContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
@@ -11086,53 +10340,100 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
 // If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+// Converts a sparse representation into a dense tensor.
+//
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseToDense",
+		Input: []tf.Input{
+			sparse_indices, output_shape, sparse_values, default_value,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["message"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// An identity op that triggers an error if a gradient is requested.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11141,7 +10442,7 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "PreventGradient",
 		Input: []tf.Input{
 			input,
 		},
@@ -11151,498 +10452,425 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor.
 //
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
+// for an explanation of segments.
 //
 // For example:
 //
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
 // ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Arguments:
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			condition,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			handle,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
-type ParseSequenceExampleAttr func(optionalAttr)
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
 
-// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
-// If not specified, defaults to 0
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["Ncontext_sparse"] = value
+		m["little_endian"] = value
 	}
 }
 
-// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
-// If not specified, defaults to 0
+// Reinterpret the bytes of a string as a vector of numbers.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_dense"] = value
+// Arguments:
+//	bytes: All the elements must have the same length.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeRaw",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
-// If not specified, defaults to 0
+// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters.
+type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["Nfeature_list_sparse"] = value
+		m["table_id"] = value
 	}
 }
 
-// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["Nfeature_list_dense"] = value
+		m["table_name"] = value
 	}
 }
 
-// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Retrieve ADAM embedding parameters.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingADAMParameters",
 
-// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
+		m["epsilon"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["data_format"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["is_training"] = value
 	}
 }
 
-// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	serialized: A vector containing binary serialized SequenceExample protos.
-//	debug_name: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no name is available.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExamples.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExamples.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSequenceExample",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			serialized, debug_name, tf.OutputList(context_dense_defaults),
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that shards the input dataset.
+//
+// Creates a dataset that shards the input dataset by num_workers, returning a
+// sharded dataset for the index-th worker. This attempts to automatically shard
+// a dataset by examining the Dataset graph and inserting a shard op before the
+// inputs to a reader Dataset (e.g. CSVDataset, TFRecordDataset).
+//
+// This dataset will throw a NotFound error if we cannot shard the dataset
+// automatically.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this dataset across.
+//	index: A scalar representing the index of the current worker out of num_workers.
+//
+//
+func ExperimentalAutoShardDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "ExperimentalAutoShardDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, num_workers, index,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			x,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
-//
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-// ```
-// output =
-//   [5.0 0.0 0.0]  // one_hot(0)
-//   [0.0 0.0 5.0]  // one_hot(2)
-//   [0.0 0.0 0.0]  // one_hot(-1)
-//   [0.0 5.0 0.0]  // one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-// ```
-// output =
-//   [0.0 3.0 3.0 3.0]
-//   [3.0 3.0 3.0 0.0]
-//   [3.0 3.0 3.0 3.0]
-//   [3.0 0.0 3.0 3.0]
-// //  ^                one_hot(0)
-// //      ^            one_hot(2)
-// //          ^        one_hot(-1)
-// //              ^    one_hot(1)
-// ```
-//
-// Suppose that
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// Then output is `[2 x 2 x 3]`:
-// ```
-// output =
-//   [
-//     [1.0, 0.0, 0.0]  // one_hot(0)
-//     [0.0, 0.0, 1.0]  // one_hot(2)
-//   ][
-//     [0.0, 1.0, 0.0]  // one_hot(1)
-//     [0.0, 0.0, 0.0]  // one_hot(-1)
-//   ]
-// ```
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -11650,98 +10878,5594 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 	return op.Output(0)
 }
 
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
-
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// CudnnRNNDropout sets the optional dropout attribute to value.
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["seed"] = value
 	}
 }
 
-// CudnnRNNSeed sets the optional seed attribute to value.
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
+func RandomUniformSeed2(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["seed2"] = value
 	}
 }
 
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// Outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniform",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseExample",
+		Input: []tf.Input{
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LeakyReluAttr is an optional argument to LeakyRelu.
+type LeakyReluAttr func(optionalAttr)
+
+// LeakyReluAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluAlpha(value float32) LeakyReluAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// Computes rectified linear: `max(features, features * alpha)`.
+func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LeakyRelu",
+		Input: []tf.Input{
+			features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatefulUniformInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulUniformInt",
+		Input: []tf.Input{
+			resource, algorithm, shape, minval, maxval,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
 // If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["channels"] = value
 	}
 }
 
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
 // If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["fancy_upscaling"] = value
 	}
 }
 
-// A RNN backed by cuDNN.
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeAndCropJpeg",
+		Input: []tf.Input{
+			contents, crop_window,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2.
+type StatefulStandardNormalV2Attr func(optionalAttr)
+
+// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulStandardNormalV2",
+		Input: []tf.Input{
+			resource, algorithm, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatefulUniformFullIntAttr is an optional argument to StatefulUniformFullInt.
+type StatefulUniformFullIntAttr func(optionalAttr)
+
+// StatefulUniformFullIntDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_UINT64
+func StatefulUniformFullIntDtype(value tf.DataType) StatefulUniformFullIntAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers covering the whole range of `dtype`.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
+//
+// Returns Random values with specified shape.
+func StatefulUniformFullInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformFullIntAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulUniformFullInt",
+		Input: []tf.Input{
+			resource, algorithm, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
+//
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+//
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// MaxPoolWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
+//
+// value: Whether to include batch dimension in flattened index of `argmax`.
+// If not specified, defaults to false
+func MaxPoolWithArgmaxIncludeBatchInIndex(value bool) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["include_batch_in_index"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
+//
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index:
+// `(y * width + x) * channels + c` if `include_batch_in_index` is False;
+// `((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolWithArgmax",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ModelDatasetAttr is an optional argument to ModelDataset.
+type ModelDatasetAttr func(optionalAttr)
+
+// ModelDatasetCpuBudget sets the optional cpu_budget attribute to value.
+// If not specified, defaults to 0
+func ModelDatasetCpuBudget(value int64) ModelDatasetAttr {
+	return func(m optionalAttr) {
+		m["cpu_budget"] = value
+	}
+}
+
+// Identity transformation that models performance.
+//
+// Identity transformation that models performance.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//
+//
+func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ModelDatasetAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ModelDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Fast Fourier transform.
+//
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
+
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "ReadVariableOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJpeg",
+		Input: []tf.Input{
+			contents,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
+
+// EditDistanceNormalize sets the optional normalize attribute to value.
+//
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
+	return func(m optionalAttr) {
+		m["normalize"] = value
+	}
+}
+
+// Computes the (possibly normalized) Levenshtein Edit Distance.
+//
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
+//
+// Arguments:
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EditDistance",
+		Input: []tf.Input{
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xlogy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
+// Concats all tensors in the list along the 0th dimension.
+//
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// element_shape: The shape of the uninitialized elements in the list. If the first
+//   dimension is not -1, it is assumed that all list elements have the same
+//   leading dim.
+// leading_dims: The list of leading dims of uninitialized list elements. Used if
+//   the leading dim of input_handle.element_shape or the element_shape input arg
+//   is not already set.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcatV2",
+		Input: []tf.Input{
+			input_handle, element_shape, leading_dims,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by
+//
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixTriangularSolve",
+		Input: []tf.Input{
+			matrix, rhs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Concatenates quantized tensors along one dimension.
+//
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConcat",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Slice a `SparseTensor` based on the `start` and `size`.
+//
+// For example, if the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSlice",
+		Input: []tf.Input{
+			indices, values, shape, start, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
+//
+// Arguments:
+//
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Pads a tensor with zeros.
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+//
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Checks whether a resource handle-based variable has been initialized.
+//
+// Arguments:
+//	resource: the input resource handle.
+//
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "VarIsInitializedOp",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+//
+// if < 0, `scale * features` otherwise.
+//
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Selu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Number of unique elements along last dimension of input `set`.
+//
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
+//
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SetSize",
+		Input: []tf.Input{
+			set_indices, set_values, set_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds sparse `updates` to an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sign and the log of the absolute value of the determinant of
+//
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
+//
+// Arguments:
+//	input: Shape is `[N, M, M]`.
+//
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogMatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InTopKV2",
+		Input: []tf.Input{
+			predictions, targets, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Check if the input matches the regex pattern.
+//
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
+//
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
+//
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RegexFullMatch",
+		Input: []tf.Input{
+			input, pattern,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+//
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
+//
+// Arguments:
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
+//
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RaggedTensorToSparse",
+		Input: []tf.Input{
+			tf.OutputList(rt_nested_splits), rt_dense_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
+
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNormGradV2",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Component-wise multiplies a SparseTensor by a dense Tensor.
+//
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseMul",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of max pooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalIteratorGetDevice",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSum",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalLatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
+//
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SparseSplit",
+		Input: []tf.Input{
+			split_dim, indices, values, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
+}
+
+// Applies sparse addition to `input` using individual values or slices
+//
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
+//
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNdNonAliasingAdd",
+		Input: []tf.Input{
+			input, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a MultiDeviceIterator resource.
+//
+// Arguments:
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "MultiDeviceIterator",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	opspec := tf.OpSpec{
+		Type: "SparseCross",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `SparseTensor` along the specified dimension.
+//
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	opspec := tf.OpSpec{
+		Type: "SparseConcat",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeManySparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilterV2",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Execute a sub graph on a remote processor.
+//
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
+//
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
+//
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+//
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// ResizeBicubicHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBicubicHalfPixelCenters(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Resize `images` to `size` using bicubic interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch.
+type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchCombiners sets the optional combiners attribute to value.
+//
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// Eases the porting of code that uses tf.nn.embedding_lookup_sparse().
+//
+// sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+// to the ith feature. table_ids[i] indicates which embedding table to look up ith
+// feature.
+//
+// The tensors at corresponding positions in the three input lists (sample_indices,
+// embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+// with dim_size() equal to the total number of lookups into the table described by
+// the corresponding feature.
+//
+// Arguments:
+//	sample_indices: A list of rank 1 Tensors specifying the training example to
+// which the corresponding embedding_indices and aggregation_weights values
+// belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+// It corresponds to sp_ids.values in embedding_lookup_sparse().
+//	aggregation_weights: A list of rank 1 Tensors containing per training example
+// aggregation weights. It corresponds to sp_weights.values in
+// embedding_lookup_sparse().
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//	table_ids: A list of integers specifying the identifier of the embedding table
+// (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+// corresponding input. The ith input is looked up using table_ids[i]. The size
+// of the table_ids list must be equal to that of sample_indices,
+// embedding_indices and aggregation_weights.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseTensorBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, table_ids []int64, optional ...EnqueueTPUEmbeddingSparseTensorBatchAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"table_ids": table_ids}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EnqueueTPUEmbeddingSparseTensorBatch",
+		Input: []tf.Input{
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
+//
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DenseToSparseSetOperation",
+		Input: []tf.Input{
+			set1, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// L2 Loss.
+//
+// Computes half the L2 norm of a tensor without the `sqrt`:
+//
+//     output = sum(t ** 2) / 2
+//
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
+//
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "L2Loss",
+		Input: []tf.Input{
+			t,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+//
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
+//
+// Our Conv3D implements a form of cross-correlation.
+//
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
+//
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
+
+// UnicodeDecodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
+//
+// Arguments:
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UnicodeDecode",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
+
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
+//
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
+// ```
+//
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+//
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+//
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+//
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+//
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+//
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeV2",
+		Input: []tf.Input{
+			input, min_range, max_range,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
+
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Return histogram of values.
+//
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
+//
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+//
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
+//
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramFixedWidth",
+		Input: []tf.Input{
+			values, value_range, nbins,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialied proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
+
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
+//
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+//
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
+//
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalThreadPoolHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, accumulators, gradient_accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters.
+type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingProximalAdagradParameters",
+		Input: []tf.Input{
+			parameters, accumulators,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Get the current size of the TensorArray.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySizeV3",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for the scaled exponential linear (Selu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
+//
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SeluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
+
+// SumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Sum",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToSparseSetOperation",
+		Input: []tf.Input{
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
+//
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
+//
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StridedSliceGrad",
+		Input: []tf.Input{
+			shape, begin, end, strides, dy,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters.
+type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingRMSPropParameters",
+		Input: []tf.Input{
+			parameters, ms, mom,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EuclideanNormAttr is an optional argument to EuclideanNorm.
+type EuclideanNormAttr func(optionalAttr)
+
+// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the euclidean norm of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EuclideanNorm",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise min of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMinimum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeJpeg",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Multinomial",
+		Input: []tf.Input{
+			logits, num_samples,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters.
+type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingRMSPropParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu6",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchMatMul",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
+type ParseSequenceExampleAttr func(optionalAttr)
+
+// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_sparse"] = value
+	}
+}
+
+// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_dense"] = value
+	}
+}
+
+// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_sparse"] = value
+	}
+}
+
+// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_dense"] = value
+	}
+}
+
+// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing binary serialized SequenceExample protos.
+//	debug_name: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no name is available.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExamples.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExamples.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSequenceExample",
+		Input: []tf.Input{
+			serialized, debug_name, tf.OutputList(context_dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
+}
+
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear: `max(features, 0)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Reorders a SparseTensor into the canonical, row-major ordering.
+//
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
+//
+// Reordering does not affect the shape of the SparseTensor.
+//
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReorder",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// PrelinearizeTupleAttr is an optional argument to PrelinearizeTuple.
+type PrelinearizeTupleAttr func(optionalAttr)
+
+// PrelinearizeTupleLayouts sets the optional layouts attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence for all the
+// tuple shapes in the order the shapes appear in the "shapes" input. The layout
+// elements for a sub-shape can be set to -1 in which case the corresponding layout
+// will be computed by the infeed operation.
+// If not specified, defaults to <>
+func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr {
+	return func(m optionalAttr) {
+		m["layouts"] = value
+	}
+}
+
+// An op which linearizes multiple Tensor values to an opaque variant tensor.
+//
+// Arguments:
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
+func PrelinearizeTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...PrelinearizeTupleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "PrelinearizeTuple",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
+//
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComplexAbs",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of the variable pointed to by `resource`.
+//
+// This operation returns a 1-D integer tensor representing the shape of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "VariableShape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CompilationResultProto indicating the status of the TPU compilation.
+func TPUCompilationResult(scope *Scope) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUCompilationResult",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesGetEnsembleStates",
+		Input: []tf.Input{
+			tree_ensemble_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+//
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) {
+		m["adaptative"] = value
+	}
+}
+
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+//
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SdcaOptimizer",
+		Input: []tf.Input{
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+}
+
+// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
+type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+
+// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
+// If not specified, defaults to false
+func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
+	return func(m optionalAttr) {
+		m["sloppy"] = value
+	}
+}
+
+// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
+//
+// Arguments:
+//
+//
+//	dense_defaults: A dict mapping string keys to `Tensor`s.
+// The keys of the dict must match the dense_keys of the feature.
+//	sparse_keys: A list of string keys in the examples features.
+// The results for these keys will be returned as `SparseTensor` objects.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples features associated with dense values.
+//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+// and `tf.string` (`BytesList`) are supported.
+//	dense_shapes: List of tuples with the same length as `dense_keys`.
+// The shape of the data for each dense feature referenced by `dense_keys`.
+// Required for any input tensors identified by `dense_keys`.  Must be
+// either fully defined, or may contain an unknown first dimension.
+// An unknown first dimension means the feature is treated as having
+// a variable number of blocks, and the output shape along this dimension
+// is considered unknown at graph build time.  Padding is applied for
+// minibatch elements smaller than the maximum number of blocks for the
+// given feature along this dimension.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalParseExampleDataset",
+		Input: []tf.Input{
+			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// PrelinearizeAttr is an optional argument to Prelinearize.
+type PrelinearizeAttr func(optionalAttr)
+
+// PrelinearizeShape sets the optional shape attribute to value.
+//
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func PrelinearizeShape(value tf.Shape) PrelinearizeAttr {
+	return func(m optionalAttr) {
+		m["shape"] = value
+	}
+}
+
+// PrelinearizeLayout sets the optional layout attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence. If a layout
+// attribute is passed but its values are all -1 the layout will be computed by
+// the infeed operation.
+// If not specified, defaults to <>
+func PrelinearizeLayout(value []int64) PrelinearizeAttr {
+	return func(m optionalAttr) {
+		m["layout"] = value
+	}
+}
+
+// An op which linearizes one Tensor value to an opaque variant tensor.
+//
+// Arguments:
+//	input: A tensor that will be linearized.
+func Prelinearize(scope *Scope, input tf.Output, optional ...PrelinearizeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Prelinearize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
+
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+//
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
+//
+// Arguments:
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11750,115 +16474,122 @@ func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
-
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
-}
-
-// Decompress strings.
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			bytes,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple.
+type InfeedEnqueueTupleAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: A vector holding the requested layout in minor-to-major sequence for
+// all the tuple shapes, in the order the shapes appear in the "shapes" input.
+// The layout elements for a sub-shape can be set to -1, in which case the
+// corresponding layout will be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["layouts"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// Arguments:
-//	bytes: All the elements must have the same length.
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Feeds multiple Tensor values into the computation as an XLA tuple.
 //
+// Arguments:
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns the created operation.
+func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "InfeedEnqueueTuple",
 		Input: []tf.Input{
-			bytes,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Returns which elements of x are finite.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "IsFinite",
 		Input: []tf.Input{
 			x,
 		},
@@ -11867,55 +16598,60 @@ func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
-//
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["new_axis_mask"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// Input images can be of different types but output images are always float.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11924,350 +16660,250 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			images, size,
+			ref, begin, end, strides, value,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Gather ragged slices from `params` axis `0` according to `indices`.
-//
-// Outputs a `RaggedTensor` output composed from `output_dense_values` and
-// `output_nested_splits`, such that:
-//
-// ```python
-// output.shape = indices.shape + params.shape[1:]
-// output.ragged_rank = indices.shape.ndims + params.ragged_rank
-// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-// ```
-//
-// where
-//
-// * `params =
-//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
-//    provides the values that should be gathered.
-// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
-//    values should be gathered.
-// * `output =
-//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
-//    is the output tensor.
-//
-// (Note: This c++ op is used to implement the higher-level python
-// `tf.ragged.gather` op, which also supports ragged indices.)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
+	}
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
 //
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
-// `params` RaggedTensor input.
-//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to flat_values, so dense_values is the
-// deprecated name.
-//	indices: Indices in the outermost dimension of `params` of the values that should be
-// gathered.
-//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
-// this number of `row_splits` tensors. This value should equal
-// `indices.shape.ndims + params.ragged_rank - 1`.
 //
-// Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
-func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RaggedGather",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			tf.OutputList(params_nested_splits), params_dense_values, indices,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
-		scope.UpdateErr("RaggedGather", err)
-		return
-	}
-	output_dense_values = op.Output(idx)
-	return output_nested_splits, output_dense_values
+	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
+// Extract `patches` from `images` and put them in the "depth" output dimension.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Arguments:
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
 //
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// We specify the size-related attributes as:
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+// ```python
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
+// ```
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "ExtractImagePatches",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			images,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+// Computes the mean along sparse segments of a tensor.
 //
-// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// output=SparseTensor(indices=sparse_indices, values=sparse_values,
-//                     dense_shape=sparse_dense_shape)
+// See `tf.sparse.segment_sum` for usage examples.
+//
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
 //
-// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
-func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RaggedTensorToSparse",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			tf.OutputList(rt_nested_splits), rt_dense_values,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// Deserializes a serialized tree ensemble config and replaces current tree
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// ensemble.
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
 //
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
+		Type: "BoostedTreesDeserializeEnsemble",
 		Input: []tf.Input{
-			input, pattern,
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
-//
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Transforms a tf.Example proto (as a string) into typed tensors.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "ParseSingleExample",
 		Input: []tf.Input{
-			predictions, targets, k,
+			serialized, tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
-
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
-//
-// Arguments:
-//	value: The tensor to be shuffled.
-//
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sigmoid",
-		Input: []tf.Input{
-			x,
-		},
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// A Reader that outputs the entire contents of a file as a value.
 //
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12276,216 +16912,193 @@ func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
+		Type: "WholeFileReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
+// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters.
+type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr)
 
-// RandomStandardNormalSeed sets the optional seed attribute to value.
+// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["table_id"] = value
 	}
 }
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["table_name"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
+// Load ADAM embedding parameters.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "LoadTPUEmbeddingADAMParameters",
 		Input: []tf.Input{
-			shape,
+			parameters, momenta, velocities,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// InfeedEnqueuePrelinearizedBufferAttr is an optional argument to InfeedEnqueuePrelinearizedBuffer.
+type InfeedEnqueuePrelinearizedBufferAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// InfeedEnqueuePrelinearizedBufferDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: The TPU device to use. This should be -1 when the Op is running on a TPU device
+// and = 0 when the Op is running on the CPU device.
+// If not specified, defaults to -1
+func InfeedEnqueuePrelinearizedBufferDeviceOrdinal(value int64) InfeedEnqueuePrelinearizedBufferAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// An op which enqueues prelinearized buffer into TPU infeed.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	input: A variant tensor representing linearized output.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Returns the created operation.
+func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional ...InfeedEnqueuePrelinearizedBufferAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "InfeedEnqueuePrelinearizedBuffer",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
+// Fetches multiple values from infeed as an XLA tuple.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
+//
+// Returns A list of tensors that will be provided using the infeed mechanism.
+func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
+	opspec := tf.OpSpec{
+		Type: "InfeedDequeueTuple",
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("InfeedDequeueTuple", err)
+		return
 	}
+	return outputs
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Enqueue multiple Tensor values on the computation outfeed.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	inputs: A list of tensors that will be inserted into the outfeed queue as an
+// XLA tuple.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns the created operation.
+func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "OutfeedEnqueueTuple",
 		Input: []tf.Input{
-			shape,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12494,252 +17107,122 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["input_mode"] = value
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-//
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NthElement",
-		Input: []tf.Input{
-			input, n,
-		},
-		Attrs: attrs,
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseExample",
-		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
-type StatefulStandardNormalAttr func(optionalAttr)
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
 
-// StatefulStandardNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
+// CudnnRNNV3TimeMajor sets the optional time_major attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3TimeMajor(value bool) CudnnRNNV3Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["time_major"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// A RNN backed by cuDNN.
 //
-// Arguments:
-//	resource: The handle of the resource variable that stores the state of the RNG.
-//	shape: The shape of the output tensor.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, input_size]. If time_major is false, the shape is
+//     [batch_size, seq_length, input_size].
+// input_h: If time_major is true, this is a 3-D tensor with the shape of
+//     [num_layer * dir, batch_size, num_units]. If time_major is false, the shape
+//     is [batch_size, num_layer * dir, num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: If time_major is true, this is a 3-D tensor with the shape of
+//     [seq_length, batch_size, dir * num_units]. If time_major is false, the
+//     shape is [batch_size, seq_length, dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// time_major: Indicates whether the input/output format is time major or batch
+//     major.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12748,94 +17231,72 @@ func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatefulStandardNormal",
+		Type: "CudnnRNNV3",
 		Input: []tf.Input{
-			resource, shape,
+			input, input_h, input_c, params, sequence_lengths,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
 //
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	mutex: The mutex resource to lock.
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MutexLock",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			mutex,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Creates a Tensor by indexing into the TensorList.
 //
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+// Each row in the produced Tensor corresponds to the element in the TensorList
+// specified by the given index (see `tf.gather`).
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// input_handle: The input tensor list.
+// indices: The indices used to index into the list.
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "TensorListGather",
 		Input: []tf.Input{
-			serialized,
+			input_handle, indices, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -12843,477 +17304,473 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp
 	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+//
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
-//
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Creates a TensorList which, when stacked, has the value of `tensor`.
-//
-// Each tensor in the result list corresponds to one row of the input tensor.
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
-		Input: []tf.Input{
-			tensor, element_shape,
-		},
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
 
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["encoding"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	record_bytes: Number of bytes in the record.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			resource, indices, updates,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Connects outputs of an N-way replicated computation to N outputs.
+func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
+		Type: "TPUReplicatedOutput",
 		Input: []tf.Input{
-			resource, indices, updates,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("TPUReplicatedOutput", err)
+		return
 	}
-	return scope.AddOperation(opspec)
+	return outputs
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters.
+type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//
 //
-//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "LoadTPUEmbeddingFTRLParameters",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			parameters, accumulators, linears,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Calculates gains for each feature and returns the best possible split information for the feature.
-//
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
-//
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
-//
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
-//
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
-//
-// Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+// Returns (x - y)(x - y) element-wise.
 //
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Push an element onto the tensor_array.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV3",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
 	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters.
+type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
-//
-// value: Compression level.
+// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["table_id"] = value
 	}
 }
 
-// PNG-encode an image.
+// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters.
 //
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "CompareAndBitpack",
 		Input: []tf.Input{
-			image,
+			input, threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
 
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+// value: Whether the quantization is signed or unsigned. (actually this parameter should
+// have been called <b>`signed_output`</b>)
+// If not specified, defaults to true
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["signed_input"] = value
 	}
 }
 
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
 //
-// one in the source data format.
+// value: Whether the range is given or should be determined from the `input` tensor.
+// If not specified, defaults to false
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
 //
-// Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+// used when rounding float values to their quantized equivalents. The following
+// rounding modes are currently supported:
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// *   HALF_TO_EVEN: this is the default round_mode.
+// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+//     rounds up to -7.
+//
+// If not specified, defaults to "HALF_TO_EVEN"
+func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
+//
+// This op simulates the precision loss from the quantized forward pass by:
+//
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
+//
+// There are different ways to quantize. This version uses only scaling, so 0.0
+// maps to 0.
+//
+// From the specified 'num_bits' in the quantized output type, it determines
+// minimum and maximum representable quantized values.
+//
+// e.g.
+//
+// *   [-128, 127] for signed, num_bits = 8, or
+// *   [0, 255] for unsigned, num_bits = 8.
+//
+// If range_given == False, the initial input_min, input_max will be determined
+// automatically as the minimum and maximum values in the input tensor, otherwise
+// the specified values of input_min, input_max are used.
+//
+// Note: If the input_min, input_max are specified, they do not need to equal the
+// actual minimum and maximum values in the tensor. e.g. in some cases it may be
+// beneficial to specify these values such that the low probability extremes of the
+// input distribution are clipped.
+//
+// This op determines the maximum scale_factor that would map the initial
+// [input_min, input_max] range to a range that lies within the representable
+// quantized range.
+//
+// It determines the scale from one of input_min and input_max, then updates the
+// other one to maximize the respresentable range.
+//
+// e.g.
+//
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
+//     would update input_max to be 127 / 12.8 = 9.921875
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
+//     would update input_min to be 128.0 / 12.7 = -10.07874
+// *   if the output is unsigned, input_min is forced to be 0, and only the
+//     specified input_max is used.
+//
+// After determining the scale_factor and updating the input range, it applies the
+// following to each value in the 'input' tensor.
+//
+// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
+//
+// The above round function rounds the value based on the given round_mode.
+//
+//
+// Arguments:
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
+// be represented, otherwise it is determined from the min value of the `input`
+// tensor.
+//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
+// be represented, otherwise it is determined from the max value of the `input`
+// tensor.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13322,9 +17779,9 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "QuantizeAndDequantizeV2",
 		Input: []tf.Input{
-			x,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
@@ -13332,235 +17789,327 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 	return op.Output(0)
 }
 
-// Initializes the multi device iterator with the given dataset.
+// A TPU core selector Op.
 //
-// Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
+// This Op produces a set of TPU cores (for warm-up) or a single TPU core
+// (for regular inference) to execute the TPU program on. The output is
+// consumed by TPUPartitionedCall.
 //
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
-		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns A vector 1 or more TPU cores.
+func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
-		Input: []tf.Input{
-			a, x,
-		},
+		Type: "TPUOrdinalSelector",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// The hash function is deterministic on the content of the string within the
-// process.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-//	num_buckets: The number of buckets.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			string_tensor,
+			table_handle, keys, default_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["replace_global"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Update '*var' according to the RMSProp algorithm.
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expression.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// Returns the truth value of (x > y) element-wise.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "Greater",
 		Input: []tf.Input{
-			gradients, outputs,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// Creates a TensorList by indexing into a Tensor.
 //
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// num_elements: The size of the output list. Must be large enough to accommodate
+//   the largest index in indices. If -1, the list is just large enough to include
+//   the largest index in indices.
+// output_handle: The TensorList.
+func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "TensorListScatterV2",
 		Input: []tf.Input{
-			input_dataset, count,
+			tensor, indices, element_shape, num_elements,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// The tensor returned by this operation is immutable.
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
 //
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
 //
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			mutex_lock,
+			image_size, bounding_boxes,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
 // value: An optional bool. Defaults to True. If True, the assignment will
 // be protected by a lock; otherwise the behavior is undefined,
 // but may exhibit less contention.
 // If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Applies sparse addition to individual values or slices in a Variable.
+// Applies sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
 //
 // `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
@@ -13574,24 +18123,24 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 // `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
 // ```
 //
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that addition would look like this:
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
 //
 // ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// add = tf.scatter_nd_add(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(add)
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
 // ```
 //
 // The resulting update to ref would look like this:
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+//     [1, 11, 3, 10, 9, 6, 7, 12]
 //
 // See `tf.scatter_nd` for more details about how to make updates to
 // slices.
@@ -13600,11 +18149,11 @@ func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 //	ref: A resource handle. Must be from a VarHandleOp.
 //	indices: A Tensor. Must be one of the following types: int32, int64.
 // A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
 // values to add to ref.
 //
 // Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13613,7 +18162,7 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
 			ref, indices, updates,
 		},
@@ -13622,90 +18171,183 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 	return scope.AddOperation(opspec)
 }
 
-// Updates the tree ensemble by either adding a layer to the last tree being grown
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
 //
-// or by starting a new tree.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
+// character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
+		Type: "UnicodeDecodeWithOffsets",
 		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "Sub",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
 //
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
 //
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//
+// Arguments:
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "LRN",
 		Input: []tf.Input{
 			input,
 		},
@@ -13715,33 +18357,90 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["table_id"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve proximal Adagrad embedding parameters with debug support.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13750,89 +18449,81 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			shape, seed,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters.
+type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
+// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["table_id"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Momentum embedding parameters.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "LoadTPUEmbeddingMomentumParameters",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			parameters, momenta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Divides sparse updates into the variable referenced by `resource`.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] /= updates[...]
+//     ref[indices, ...] = updates[...]
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] /= updates[i, ...]
+//     ref[indices[i], ...] = updates[i, ...]
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
 //
 // Arguments:
 //	resource: Should be from a `Variable` node.
@@ -13840,12 +18531,12 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterDiv",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -13853,253 +18544,123 @@ func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "RightShift",
 		Input: []tf.Input{
-			shape, seed,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
-type UnicodeDecodeAttr func(optionalAttr)
-
-// UnicodeDecodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
-
-// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
-	}
-}
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
 
-// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
 	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
-	}
-}
-
-// Decodes each string in `input` into a sequence of Unicode code points.
-//
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
-//
-// The `row_splits` tensor indicates where the codepoints for
-// each input string begin and end within the `char_values` tensor.
-// In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
-//
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
-//
-// Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
-func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeDecode",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+		m["num_elements"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+// Stacks all tensors in the list.
 //
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// Requires that all tensors have the same shape.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "TensorListStack",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input_handle, element_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+// A placeholder op for a value that will be fed into the computation.
+//
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
+//
+// Returns A tensor that will be provided using the infeed mechanism.
+func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "InfeedDequeue",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
-type UnicodeEncodeAttr func(optionalAttr)
-
-// UnicodeEncodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
-	}
-}
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD (U+65533).
-// If not specified, defaults to 65533
-func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["replacement_char"] = value
+		m["dtype"] = value
 	}
 }
 
-// Encode a tensor of ints into unicode strings.
-//
-// Returns a vector of strings, where `output[i]` is constructed by encoding the
-// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
-// using `output_encoding`.
-//
-// ---
-//
-// Example:
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// ```
-// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
-// input_splits = [0, 5, 10]
-// output_encoding = 'UTF-8'
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// output = ['Hello', 'World']
-// ```
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
-//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
-// In particular, `output[i]` is constructed by encoding the codepoints in the
-// slice `input_values[input_splits[i]:input_splits[i+1]]`.
-//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
-// "UTF-16-BE", and "UTF-32-BE"`.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
-func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeEncode",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			input_values, input_splits,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -14107,343 +18668,303 @@ func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of tensors in the input tensor list.
+// Makes its input available to the next iteration.
 //
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListLength",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			input_handle,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Determine the script codes of a given tensor of Unicode integer code points.
-//
-// This operation converts Unicode code points to script codes corresponding to
-// each code point. Script codes correspond to International Components for
-// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
-// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
-// match input shape.
-//
-// Arguments:
-//	input: A Tensor of int32 Unicode code points.
-//
-// Returns A Tensor of int32 script codes corresponding to each input code point.
-func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeScript",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
-//
-// For example:
-//
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
-//
-// Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+// Returns the truth value of x AND y element-wise.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			start, limit, delta,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["old_vocab_size"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
+//
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
+//
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
+//
+// Arguments:
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
+//
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			indices,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0), op.Output(1)
 }
 
-// Returns element-wise integer closest to x.
+// Says whether the targets are in the top `K` predictions.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "InTopK",
 		Input: []tf.Input{
-			x,
+			predictions, targets,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["table_name"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Retrieve Adagrad embedding parameters with debug support.
 //
-// want to use Nesterov momentum.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	mutex_lock: A tensor returned by `MutexLock`.
 //
 // Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "ConsumeMutexLock",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			mutex_lock,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
 
-// SubstrUnit sets the optional unit attribute to value.
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
 //
-// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
-// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
-// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
-// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
-// UTF-8.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// A negative `pos` indicates distance within the string backwards from the end.
-//
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+// Applies sparse addition to individual values or slices in a Variable.
 //
-// ---
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// Examples
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// Using scalar `pos` and `len`:
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
 //
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// output = [b'ell', b'orl']
 // ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 // ```
 //
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that addition would look like this:
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
 // ```
 //
-// Broadcasting `input` onto `pos` and `len`:
+// The resulting update to ref would look like this:
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+//     [1, 13, 3, 14, 14, 6, 7, 20]
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14452,240 +18973,293 @@ func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optiona
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "ResourceScatterNdAdd",
 		Input: []tf.Input{
-			input, pos, len,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// Replaces the contents of the table with the specified keys and values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
 }
 
-// Exits the current frame to its parent frame.
+// Worker heartbeat op.
 //
-// Exit makes its input `data` available to the parent frame.
+// Heartbeats may be sent periodically to indicate the coordinator is still active,
+// to retrieve the current worker status and to expedite shutdown when necessary.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	request: A string tensor containing a serialized WorkerHeartbeatRequest
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns A string tensor containing a serialized WorkerHeartbeatResponse
+func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "WorkerHeartbeat",
 		Input: []tf.Input{
-			data,
+			request,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
+// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch.
+type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op that enqueues a list of input batch tensors to TPUEmbedding.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+//	batch: A list of 1D tensors, one for each embedding table, containing the
+// indices into the tables.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "EnqueueTPUEmbeddingIntegerBatch",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			tf.OutputList(batch), mode_override,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// An op that receives embedding activations on the TPU.
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// The TPU system performs the embedding lookups and aggregations specified by
+// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+// results of these aggregations are visible to the Tensorflow Graph as the
+// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+// one Tensor of activations per table specified in the model. There can be at
+// most one RecvTPUEmbeddingActivations op in the TPU graph.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	num_outputs: The number of output activation tensors, equal to the number of
+// embedding tables in the model.
+//	config: Serialized TPUEmbeddingConfiguration proto.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A TensorList of embedding activations containing one Tensor per
+// embedding table in the model.
+func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
-		Input: []tf.Input{
-			indices, values, shape, start, size,
-		},
+		Type: "RecvTPUEmbeddingActivations",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RecvTPUEmbeddingActivations", err)
+		return
+	}
+	return outputs
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// This operation computes
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
 //
-//     # Scalar indices
-//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// For example:
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
+		Type: "Select",
 		Input: []tf.Input{
-			resource, indices, updates,
+			condition, x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// Returns the set of files matching one or more glob patterns.
 //
-// ```
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
 // Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["squeeze_dims"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
 // For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	input: The `input` to squeeze.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14694,62 +19268,48 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input, delimiter,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Update '*var' according to the adadelta scheme.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14758,137 +19318,170 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the complex conjugate of a complex number.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
-// For example:
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input,
+			boxes, scores, max_output_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
-type CudnnRNNBackpropAttr func(optionalAttr)
-
-// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// 2D real-valued fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+	opspec := tf.OpSpec{
+		Type: "RFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Backprop step of CudnnRNN.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// Compute the backprop of both data and weights in a RNN.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in for forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14897,135 +19490,222 @@ func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackprop",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
+	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// Gather ragged slices from `params` axis `0` according to `indices`.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
+//
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
+//
+// where
+//
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "RaggedGather",
 		Input: []tf.Input{
-			audio, sample_rate,
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
+		return
+	}
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			x,
+			boxes, scores, max_output_size, iou_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
 
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the AdaMax algorithm.
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+			shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["out_type"] = value
 	}
 }
 
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15034,245 +19714,244 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
+	return scope.AddOperation(opspec)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
 	}
-	return output_indices, output_values, output_shape
 }
 
-// Computes numerical negative value element-wise.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
+// Parses a text file and creates a batch of examples.
 //
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+		Type: "Skipgram",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -15280,49 +19959,71 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
 	}
+	return scope.AddOperation(opspec)
 }
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
+// Deprecated. Use TensorArrayReadV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV2",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Batch normalization.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			inputs, min, max,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
@@ -15330,68 +20031,65 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 	return op.Output(0)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["container"] = value
 	}
 }
 
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// The resulting update to ref would look like this:
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15400,360 +20098,343 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			ref, indices, updates,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Produces a string handle for the given MultiDeviceIterator.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
+// TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata.
+type TPUReplicateMetadataAttr func(optionalAttr)
+
+// TPUReplicateMetadataNumCoresPerReplica sets the optional num_cores_per_replica attribute to value.
 //
-// Returns A string representing the resource.
-func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorToStringHandle",
-		Input: []tf.Input{
-			multi_device_iterator,
-		},
+// value: Number of cores per replica. Used for model parallelism.
+// If not specified, defaults to 1
+func TPUReplicateMetadataNumCoresPerReplica(value int64) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["num_cores_per_replica"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
-type CudnnRNNV3Attr func(optionalAttr)
-
-// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+// TPUReplicateMetadataTopology sets the optional topology attribute to value.
+//
+// value: TopologyProto indicating the topology of the TPU pod slice.
+// If not specified, defaults to ""
+func TPUReplicateMetadataTopology(value string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+		m["topology"] = value
 	}
 }
 
-// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+// TPUReplicateMetadataUseTpu sets the optional use_tpu attribute to value.
+//
+// value: Whether to place the computation on the TPU.
+// If not specified, defaults to true
+func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["input_mode"] = value
+		m["use_tpu"] = value
 	}
 }
 
-// CudnnRNNV3Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+// TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value.
+//
+// value: The assignment of devices for the computation.
+// If not specified, defaults to <>
+func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["direction"] = value
+		m["device_assignment"] = value
 	}
 }
 
-// CudnnRNNV3Dropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+// TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value.
+//
+// value: DEPRECATED. Use num_cores_per_replica instead.
+// If not specified, defaults to <>
+func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["computation_shape"] = value
 	}
 }
 
-// CudnnRNNV3Seed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+// TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["host_compute_core"] = value
 	}
 }
 
-// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+// TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["padding_map"] = value
 	}
 }
 
-// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+// TPUReplicateMetadataStepMarkerLocation sets the optional step_marker_location attribute to value.
+// If not specified, defaults to "STEP_MARK_AT_ENTRY"
+func TPUReplicateMetadataStepMarkerLocation(value string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["step_marker_location"] = value
 	}
 }
 
-// A RNN backed by cuDNN.
+// Metadata indicaitng how the TPU computation should be replicated.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+// Arguments:
+//	num_replicas: Number of replicas of the computation
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// sequence_lengths: a vector of lengths of each input sequence.
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is true.
-func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+// Returns the created operation.
+func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPUReplicateMetadataAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNV3",
-		Input: []tf.Input{
-			input, input_h, input_c, params, sequence_lengths,
-		},
+		Type: "TPUReplicateMetadata",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug.
+type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters with debug support.
 //
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// Arguments:
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
 //
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			parameters, accumulators, linears, gradient_accumulators,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
+// Concatenates tensors along one dimension.
 //
 // Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			data, partitions,
+			tf.OutputList(values), axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
 	}
-	return outputs
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
-//
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
+		Type: "NonMaxSuppressionWithOverlaps",
 		Input: []tf.Input{
-			tree_ensemble_handle,
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
 // If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Computes gradient of the FractionalAvgPool function.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15762,68 +20443,52 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// StringFormatAttr is an optional argument to StringFormat.
-type StringFormatAttr func(optionalAttr)
-
-// StringFormatTemplate sets the optional template attribute to value.
-//
-// value: A string, the template to format tensor summaries into.
-// If not specified, defaults to "%s"
-func StringFormatTemplate(value string) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["template"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StringFormatPlaceholder sets the optional placeholder attribute to value.
-//
-// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
-// If not specified, defaults to "%s"
-func StringFormatPlaceholder(value string) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["placeholder"] = value
-	}
-}
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
 
-// StringFormatSummarize sets the optional summarize attribute to value.
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
-// If not specified, defaults to 3
-func StringFormatSummarize(value int64) StringFormatAttr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["replace_global"] = value
 	}
 }
 
-// Formats a string template using a list of tensors.
+// Replaces the match of pattern in input with rewrite.
 //
-// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	inputs: The list of tensors to format into the placeholder string.
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
-// Returns = The resulting string scalar.
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringFormat",
+		Type: "StaticRegexReplace",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15831,284 +20496,155 @@ func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
-
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the shape of a tensor.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Computes gradients for the exponential linear (Elu) operation.
 //
-// For example:
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			input,
+			gradients, outputs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// The hash function is deterministic on the content of the string within the
+// process.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+//
+// Arguments:
+//
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			x, y,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+//
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "BatchDatasetV2",
 		Input: []tf.Input{
-			input,
+			input_dataset, batch_size, drop_remainder,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+//
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["num_bits"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
 // If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16117,149 +20653,115 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
+	return op.Output(0)
 }
 
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
+// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters.
+type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr)
 
-// LRNAlpha sets the optional alpha attribute to value.
+// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["table_id"] = value
 	}
 }
 
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["table_name"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// Retrieve Momentum embedding parameters.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "RetrieveTPUEmbeddingMomentumParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Forwards the value of an available tensor from `inputs` to `output`.
+//
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "Merge",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+			tf.OutputList(inputs),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+//
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Closes the given queue.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	handle: The handle to a queue.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16268,67 +20770,78 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "QueueCloseV2",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Writes the given dataset to the given file using the TFRecord format.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "ExperimentalDatasetToTFRecord",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, filename, compression_type,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["num_elements"] = value
+		m["data_format"] = value
 	}
 }
 
-// Stacks all tensors in the list.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// Requires that all tensors have the same shape.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
+// Arguments:
+//	out_backprop: Any number of dimensions.
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListStack",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			input_handle,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -16336,43 +20849,25 @@ func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.Data
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
-
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"reduction": reduction}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "NcclReduce",
 		Input: []tf.Input{
-			shape, seed,
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
@@ -16380,256 +20875,275 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			data,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
+// An Op to sum inputs across replicated TPU instances.
+//
+// Each instance supplies its own input.
+//
+// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+// and `B, D, F, H` as group 1. Thus we get the outputs:
+// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
+//
+// Arguments:
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//
+// Returns The sum of all the distributed inputs.
+func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fact",
+		Type: "CrossReplicaSum",
+		Input: []tf.Input{
+			input, group_assignment,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// and
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// then the final deserialized `SparseTensor` will be:
+// That is for rows we have grad for, we update var and accum as follows:
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			serialized_sparse,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeAxis sets the optional axis attribute to value.
+// An Op to permute tensors across replicated TPU instances.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// Each instance supplies its own input.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
-	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+// `[D, A, B, C]`.
+//
+// Arguments:
+//	input: The local input to be permuted. Currently only supports float and
+// bfloat16.
+//	source_target_pairs: A tensor with shape [num_pairs, 2].
+//
+// Returns The permuted input.
+func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CollectivePermute",
+		Input: []tf.Input{
+			input, source_target_pairs,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
+// Returns the complex conjugate of a complex number.
 //
-// For example:
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
-// Or, to remove specific size 1 dimensions:
+// For example:
 //
 // ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
 // ```
-//
-// Arguments:
-//	input: The `input` to squeeze.
-//
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "Conj",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters.
+type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve centered RMSProp embedding parameters.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
-		},
+		Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16638,201 +21152,211 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			input, delimiter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters.
+type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+}
+
+// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Retrieve FTRL embedding parameters.
 //
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+		Type: "RetrieveTPUEmbeddingFTRLParameters",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// 2D real-valued fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
+
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// For example:
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
 //
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "DecodeJSONExample",
 		Input: []tf.Input{
-			input, paddings,
+			json_examples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
+//
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
+//
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	resource: the input resource handle.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
-			resource,
+			handle, tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// PrintV2OutputStream sets the optional output_stream attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["output_stream"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Prints a string scalar.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Prints a string scalar to the desired output_stream.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: The string scalar to print.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16841,171 +21365,187 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "PrintV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			input,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns which elements of x are Inf.
+// The gradient operator for the SparseSlice op.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
+//
+// Arguments:
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. tensor represents the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
+//
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "SparseSliceGrad",
 		Input: []tf.Input{
-			x,
+			backprop_val_grad, input_indices, input_start, output_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs random values from a truncated normal distribution.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			shape,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
 
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["container"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["shared_name"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
+		Type: "MutableHashTableOfTensorsV2",
 
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17014,200 +21554,228 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			string_tensor,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// Returns 0 if the denominator is zero.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+//
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DivNoNan",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Subtracts a value from the current value of a variable.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
 // Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			resource, value,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
 
-// EncodeJpegFormat sets the optional format attribute to value.
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
 	return func(m optionalAttr) {
-		m["format"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Restores a tensor from checkpoint files.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
+//
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Restore",
+		Input: []tf.Input{
+			file_pattern, tensor_name,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
+
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["progressive"] = value
+		m["align_corners"] = value
 	}
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
+// QuantizedResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value.
 // If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+func QuantizedResizeBilinearHalfPixelCenters(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["optimize_size"] = value
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
+//
+// Input images and output images must be quantized types.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedResizeBilinear",
+		Input: []tf.Input{
+			images, size, min, max,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// Arguments:
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
-	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+//	num_threads: Identifies the number of threads to use for the private threadpool.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
-	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalPrivateThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, num_threads,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
 // If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["stack_name"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
-//
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// A stack that produces elements in first-in last-out order.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "StackV2",
 		Input: []tf.Input{
-			image,
+			max_size,
 		},
 		Attrs: attrs,
 	}
@@ -17215,48 +21783,99 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
 
-// MultinomialSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["dropout"] = value
 	}
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["seed"] = value
 	}
 }
 
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["output_dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// Backprop step of CudnnRNN.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+// Compute the backprop of both data and weights in a RNN.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17265,45 +21884,60 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "CudnnRNNBackprop",
 		Input: []tf.Input{
-			logits, num_samples,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// InfeedEnqueueAttr is an optional argument to InfeedEnqueue.
+type InfeedEnqueueAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// InfeedEnqueueShape sets the optional shape attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["shape"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// InfeedEnqueueLayout sets the optional layout attribute to value.
+//
+// value: A vector holding the requested layout in minor-to-major sequence.
+// If a layout attribute is passed, but its values are all -1, the layout will
+// be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr {
+	return func(m optionalAttr) {
+		m["layout"] = value
+	}
+}
+
+// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op which feeds a single Tensor value into the computation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	input: A tensor that will be provided using the infeed mechanism.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17312,221 +21946,354 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "InfeedEnqueue",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			input,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the last element of the input list as well as a list with all but that element.
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
+
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// Fails if the list is empty.
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// input_handle: the input list
-// tensor: the withdrawn last element of the list
-// element_dtype: the type of elements in the list
-// element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins a string Tensor across the given dimensions.
+//
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
+//
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
+//
+// Arguments:
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPopBack",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			input_handle,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// TopKSorted sets the optional sorted attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["sorted"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "TopK",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
-
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// BatchToSpace for N-D tensors of type T.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// Gather specific elements from the TensorArray into output `value`.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
+//
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
+//
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
 //
-// All elements selected by `indices` must have the same shape.
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			input, block_shape, crops,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for integer types.
-//
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
+
+// UnpackAxis sets the optional axis attribute to value.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"num": num}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "Unpack",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -17536,200 +22303,141 @@ func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and
 	}
 	var idx int
 	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
 		return
 	}
-	return tensors
+	return output
 }
 
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Delete the stack from its resource container.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
-
-		Attrs: attrs,
+		Type: "StackCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// Provides the time since epoch in seconds.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of max pooling function.
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
-		Attrs: attrs,
+		Type: "Timestamp",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the name of the device on which `resource` has been placed.
-func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalIteratorGetDevice",
-		Input: []tf.Input{
-			resource,
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["separator"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17738,9 +22446,9 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -17748,16 +22456,23 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates and returns an empty tensor list.
+//
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
+//
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalLatencyStatsDataset",
+		Type: "EmptyTensorList",
 		Input: []tf.Input{
-			input_dataset, tag,
+			element_shape, max_num_elements,
 		},
 		Attrs: attrs,
 	}
@@ -17765,97 +22480,85 @@ func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// tensors.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			tf.OutputList(input),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
 // and mom will not update in iterations during which the grad is zero.
 //
 // mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
+// mg <- rho * mg_{t-1} + (1-rho) * grad
 // ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
 // var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
 //	ms: Should be from a Variable().
 //	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
@@ -17865,7 +22568,7 @@ func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17874,82 +22577,63 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
-		Input: []tf.Input{
-			value,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the Adam algorithm.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17958,127 +22642,160 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters.
+type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_id"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta embedding parameters.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "LoadTPUEmbeddingAdadeltaParameters",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			parameters, accumulators, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "Acosh",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["Tout"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Returns the real part of a complex number.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "Real",
 		Input: []tf.Input{
-			size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -18086,389 +22803,410 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			true_classes,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
 
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "Qr",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["element_shape"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
 //
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
 //
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Write data via Write and read via Read or Pack.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			filename,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
+// 3D real-valued fast Fourier transform.
 //
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// with the given separator (default is an empty separator).
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates and returns an empty tensor list.
-//
-// All list elements must be tensors of dtype element_dtype and shape compatible
-// with element_shape.
-//
-// handle: an empty tensor list.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "EmptyTensorList",
+		Type: "Relu",
 		Input: []tf.Input{
-			element_shape, max_num_elements,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			gradients, features,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Provides the time since epoch in seconds.
+// Divides sparse updates into the variable referenced by `resource`.
 //
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+// This operation computes
 //
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Timestamp",
+		Type: "ResourceScatterDiv",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// VariableShapeOutType sets the optional out_type attribute to value.
+// ListDiffOutIdx sets the optional out_idx attribute to value.
 // If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Computes the difference between two lists of numbers or strings.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// For example:
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
+//
+// For example, given this input:
 //
 // ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
 // ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+//
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
+//
+// Arguments:
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
+//
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18477,495 +23215,391 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			parameters, accumulators, updates, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Returns a serialized GraphDef representing `input_dataset`.
 //
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Returns a graph representation for `input_dataset`.
 //
 // Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns The graph representation of the dataset (as serialized GraphDef).
+func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "DatasetToGraph",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			input_dataset,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Solves systems of linear equations.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			input, fft_length,
+			matrix, rhs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
 
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
-//
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["use_locking"] = value
 	}
 }
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
 // If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// An array of Tensors of given size.
+// want to use Nesterov momentum.
 //
-// Write data via Write and read via Read or Pack.
+// accum = accum * momentum - lr * grad
+// var += accum
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "ResourceApplyKerasMomentum",
 		Input: []tf.Input{
-			size,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
-//
-// Arguments:
-//
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
+		Type: "Identity",
 		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "Atan2",
 		Input: []tf.Input{
-			x, y,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
+//     Updates specified rows with values in `v`.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+//     Computes `x[i, :] = v; return x`.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "InplaceUpdate",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			x, i, v,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple.
+type OutfeedDequeueTupleAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Retrieve multiple values from the computation outfeed.
+//
+// This operation will block indefinitely until data is available. Output `i`
+// corresponds to XLA tuple element `i`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns A list of tensors that will be read from the outfeed.
+func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "OutfeedDequeueTuple",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("OutfeedDequeueTuple", err)
+		return
+	}
+	return outputs
+}
+
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// var: Should be from a Variable().
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are NaN.
+//
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsNan",
+		Input: []tf.Input{
+			x,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
@@ -19040,80 +23674,206 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "MapUnstage",
 		Input: []tf.Input{
-			input,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
+	}
+	return values
 }
 
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
-
-// StringLengthUnit sets the optional unit attribute to value.
+// An op enabling differentiation of TPU Embeddings.
 //
-// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
-// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
-// encoded Unicode code points in each string).  Results are undefined
-// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
-// valid UTF-8.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
+// This op simply returns its first input, which is assumed to have been sliced
+// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+// this op, and its first argument being a trainable Variable, enables automatic
+// differentiation of graphs containing embeddings via the TPU Embedding Python
+// libraries.
+//
+// Arguments:
+//	embedding_variable: A trainable variable, enabling optimizers to find this op.
+//	sliced_activations: The embedding activations Tensor to return.
+//	table_id: The id of the table in the embedding layer configuration from which
+// these activations were computed.
+//	lookup_id: Identifier of the set of embedding indices which produced these
+// activations.
+func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id}
+	opspec := tf.OpSpec{
+		Type: "TPUEmbeddingActivations",
+		Input: []tf.Input{
+			embedding_variable, sliced_activations,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// String lengths of `input`.
+// BatchToSpace for 4-D tensors of type T.
 //
-// Computes the length of each string given in the input tensor.
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
 // Arguments:
-//	input: The string for which to compute the length.
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "StringLength",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			input,
+			input, crops,
 		},
 		Attrs: attrs,
 	}
@@ -19121,212 +23881,197 @@ func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (
 	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
 //
 // Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "MakeIterator",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			dataset, iterator,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns 0 if the denominator is zero.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DivNoNan",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			x, y,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
-type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
-
-// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "PaddedBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
+
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-// If not specified, defaults to 65533
-func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["replacement_char"] = value
+		m["use_locking"] = value
 	}
 }
 
-// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
 // If not specified, defaults to false
-func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Decodes each string in `input` into a sequence of Unicode code points.
-//
-// The character codepoints for all strings are returned using a single vector
-// `char_values`, with strings expanded to characters in row-major order.
-// Similarly, the character start byte offsets are returned using a single vector
-// `char_to_byte_starts`, with strings expanded in row-major order.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// The `row_splits` tensor indicates where the codepoints and start offsets for
-// each input string begin and end within the `char_values` and
-// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
-// string (in row-major order) are stored in the slice
-// `[row_splits[i]:row_splits[i+1]]`. Thus:
+// want to use Nesterov momentum.
 //
-// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
-//   character in the `i`th string (in row-major order).
-// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
-//   string (in row-major order).
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	input: The text to be decoded. Can have any shape. Note that the output is flattened
-// to a vector of char values.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
-// character in `char_values` starts.
-func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeDecodeWithOffsets",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x - y element-wise.
-//
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			x, y,
+			var_, accum, lr, grad, momentum,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// Computes second-order gradients of the maxpooling function.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			shape, rate,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -19334,479 +24079,397 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Returns the last element of the input list as well as a list with all but that element.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Fails if the list is empty.
+//
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "TensorListPopBack",
 		Input: []tf.Input{
-			x, y,
+			input_handle, element_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Determine the script codes of a given tensor of Unicode integer code points.
 //
-// Inputs are the logits, not probabilities.
+// This operation converts Unicode code points to script codes corresponding to
+// each code point. Script codes correspond to International Components for
+// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+// match input shape.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	input: A Tensor of int32 Unicode code points.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns A Tensor of int32 script codes corresponding to each input code point.
+func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "UnicodeScript",
 		Input: []tf.Input{
-			features, labels,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
+	return op.Output(0)
 }
 
-// Joins a string Tensor across the given dimensions.
+// Creates a sequence of numbers.
 //
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
 // For example:
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
 // ```
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "Range",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			start, limit, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// MaxPoolGradGradWithArgmaxAttr is an optional argument to MaxPoolGradGradWithArgmax.
+type MaxPoolGradGradWithArgmaxAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+// MaxPoolGradGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value.
+//
+// value: Whether to include batch dimension in flattened index of `argmax`.
+// If not specified, defaults to false
+func MaxPoolGradGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradGradWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["include_batch_in_index"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradWithArgmaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// L2 Loss.
+// Return a slice from 'input'.
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-//     output = sum(t ** 2) / 2
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "Slice",
 		Input: []tf.Input{
-			t,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "Zeta",
 		Input: []tf.Input{
-			x,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// Returns the cardinality of `input_dataset`.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Returns the cardinality of `input_dataset`.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ExperimentalDatasetCardinality",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			input_dataset,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["container"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// If two elements are equal, the lower-index element appears first.
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts.
+type NonDeterministicIntsAttr func(optionalAttr)
+
+// NonDeterministicIntsDtype sets the optional dtype attribute to value.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
+// value: The type of the output.
+// If not specified, defaults to DT_INT64
+func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
+// Non-deterministically generates some integers.
 //
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
+// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
 //
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// Arguments:
+//	shape: The shape of the output tensor.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// Returns Non-deterministic integer values with specified shape.
+func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "NonDeterministicInts",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// accum = accum * momentum - lr * grad
+// var += accum
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19815,63 +24478,53 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "ResourceSparseApplyKerasMomentum",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// Update '*var' according to the Adam algorithm.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := max{vhat_{t-1}, v_t}$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19880,221 +24533,336 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "ResourceApplyAdamWithAmsgrad",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// RandomPoissonV2Seed sets the optional seed attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Creates a non-initialized hash table.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
-		Input: []tf.Input{
-			shape, rate,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
+type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
 
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["table_id"] = value
 	}
 }
 
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["ratio"] = value
+		m["table_name"] = value
 	}
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Retrieve Momentum embedding parameters with debug support.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug",
+
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Enqueue a Tensor on the computation outfeed.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Arguments:
+//	input: A tensor that will be inserted into the outfeed queue.
+//
+// Returns the created operation.
+func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OutfeedEnqueue",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramSummary",
+		Input: []tf.Input{
+			tag, values,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["container"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// Accepted values are:
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
+	}
+}
+
+// Creates an empty hash table that uses tensors as the backing store.
 //
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			contents, crop_window,
+			empty_key, deleted_key,
 		},
 		Attrs: attrs,
 	}
@@ -20102,265 +24870,297 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+// Deprecated. Use TensorArraySplitV3
 //
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_prod(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 6, 6, 4],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug.
+type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
 
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve ADAM embedding parameters with debug support.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
+		Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["footer_bytes"] = value
+		m["input_mode"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
-//
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+		m["direction"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["dropout"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
-//
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
-//
-// Arguments:
-//	record_bytes: Number of bytes in the record.
-//
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["is_training"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// A RNN backed by cuDNN.
 //
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "CudnnRNN",
 		Input: []tf.Input{
-			tag, tensor,
+			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["compression_type"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
+// Decompress strings.
 //
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	bytes: A Tensor of string which is compressed.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20369,331 +25169,224 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			input,
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
-//
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
+// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters.
+type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pattern": pattern}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_name"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Retrieve MDL Adagrad Light embedding parameters.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
+		Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters",
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
-
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_name"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Retrieve Adadelta embedding parameters with debug support.
 //
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
+		Type: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReshape",
-		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
-		},
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape,
-		},
+		Type: "MapClear",
+
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu",
-		Input: []tf.Input{
-			features,
-		},
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["na_value"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20702,82 +25395,97 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// `value  20 5  16 3  7`
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Randomly shuffles a tensor along its first dimension.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	value: The tensor to be shuffled.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20786,9 +25494,9 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -20796,107 +25504,116 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch.
+type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["combiners"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// An op that enqueues TPUEmbedding input indices from a SparseTensor.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// This Op eases the porting of code that uses embedding_lookup_sparse(),
+// although some Python preprocessing of the SparseTensor arguments to
+// embedding_lookup_sparse() is required to produce the arguments to this Op,
+// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+// step.
 //
-// Arguments:
+// The tensors at corresponding positions in the three input lists
+// must have the same shape, i.e. rank 1 with dim_size() equal to the total
+// number of lookups into the table described by the corresponding table_id.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// Arguments:
+//	sample_indices: A list of rank 1 Tensors specifying the training example and
+// feature to which the corresponding embedding_indices and aggregation_weights
+// values belong. sample_indices[i] must equal b * nf + f, where nf is the
+// number of features from the corresponding table, f is in [0, nf), and
+// b is in [0, batch size).
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+//	aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
+// (training example, feature) -- aggregation weights.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "EnqueueTPUEmbeddingSparseBatch",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["dtype"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// The generated values will have mean 0 and standard deviation 1.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// The outputs are a deterministic function of `shape` and `seed`.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			resource, indices,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -20904,61 +25621,45 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
+// An Op to exchange data across TPU replicas.
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// On each replica, the input is split into `split_count` blocks along
+// `split_dimension` and send to the other replicas given group_assignment. After
+// receiving `split_count` - 1 blocks from other replicas, we concatenate the
+// blocks along `concat_dimension` as the output.
 //
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+// For example, suppose there are 2 TPU replicas:
+// replica 0 receives input: `[[A, B]]`
+// replica 1 receives input: `[[C, D]]`
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
-
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
+// group_assignment=`[[0, 1]]`
+// concat_dimension=0
+// split_dimension=1
+// split_count=2
+//
+// replica 0's output: `[[A], [C]]`
+// replica 1's output: `[[B], [D]]`
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//	concat_dimension: The dimension number to concatenate.
+//	split_dimension: The dimension number to split.
+//	split_count: The number of splits, this number must equal to the sub-group
+// size(group_assignment.get_shape()[1])
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+// Returns The exchanged result.
+func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count}
 	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
+		Type: "AllToAll",
 		Input: []tf.Input{
-			logits, num_samples, seed,
+			input, group_assignment,
 		},
 		Attrs: attrs,
 	}
@@ -20966,203 +25667,111 @@ func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output,
 	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// Adds a value to the current value of a variable.
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Real-valued fast Fourier transform.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "RFFT",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters.
+type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_id"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
+// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
-//
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
-//
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
-	}
-}
-
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["table_name"] = value
 	}
-}
-
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
+}
+
+// Retrieve Adadelta embedding parameters.
 //
-//	value_dtype: Type of the table values.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
-		Input: []tf.Input{
-			empty_key, deleted_key,
-		},
+		Type: "RetrieveTPUEmbeddingAdadeltaParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // UpperBoundAttr is an optional argument to UpperBound.
@@ -21276,51 +25885,6 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the proximal adagrad scheme.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
-		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
 type SparseReduceMaxSparseAttr func(optionalAttr)
 
@@ -21374,6 +25938,78 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Convert one or more images from HSV to RGB.
+//
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HSVToRGB",
+		Input: []tf.Input{
+			images,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient of the sigmoid of `x` wrt its input.
+//
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that changes the batch size.
+//
+// Creates a dataset that changes the batch size of the dataset to current batch
+// size // num_workers.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this batch across. As
+// a result of this transformation the current batch size would end up being
+// divided  by this parameter.
+//
+//
+func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalRebatchDataset",
+		Input: []tf.Input{
+			input_dataset, num_workers,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
@@ -21398,236 +26034,415 @@ func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, outpu
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			y, dy,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Creates a TensorList which, when stacked, has the value of `tensor`.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Each tensor in the result list corresponds to one row of the input tensor.
+//
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "TensorListFromTensor",
 		Input: []tf.Input{
-			x, y,
+			tensor, element_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU.
+type ConfigureDistributedTPUAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: Reserved. Do not use.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["embedding_config"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+// describes the embedding lookups of the program.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["tpu_embedding_config"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: Reserved. Do not use.
+// If not specified, defaults to false
+func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["is_global_init"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// Sets up the centralized structures for a distributed TPU system.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU
+// topology.
+func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ConfigureDistributedTPU",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// ```
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			tensor, shape, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the element-wise sum of a list of tensors.
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
+
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
 //
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// value: The type of each component in a value.
+// If not specified, defaults to <>
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["component_types"] = value
+	}
+}
+
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PriorityQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements sorted by the first component value.
+//
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+//
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+		Type: "PriorityQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			shape, seed, minval, maxval,
+			var_, alpha, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// Check if the input matches the regex pattern.
 //
-// The Hurwitz zeta function is defined as:
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Arguments:
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
+//
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			x, q,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the cardinality of `input_dataset`.
+// OutfeedDequeueAttr is an optional argument to OutfeedDequeue.
+type OutfeedDequeueAttr func(optionalAttr)
+
+// OutfeedDequeueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// Returns the cardinality of `input_dataset`.
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueDeviceOrdinal(value int64) OutfeedDequeueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Retrieves a single tensor from the computation outfeed.
+//
+// This operation will block indefinitely until data is available.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to return cardinality for.
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
 //
-// Returns The cardinality of `input_dataset`. Named constants are used to represent
-// infinite and unknown cardinality.
-func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
+// Returns A tensor that will be read from the device outfeed.
+func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...OutfeedDequeueAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetCardinality",
-		Input: []tf.Input{
-			input_dataset,
-		},
+		Type: "OutfeedDequeue",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
+
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalSqlDataset",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			driver_name, data_source_name, query,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -21635,586 +26450,531 @@ func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_nam
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.
+type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Arguments:
-//	input: A complex tensor.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters with debug support.
 //
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
-//
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			input,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
+// Encode audio data using the WAV file format.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// Arguments:
-//	input: A complex tensor.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
-// Returns A complex tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// Arguments:
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			input,
+			audio, sample_rate,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
+
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AdaMax algorithm.
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "ResourceApplyAdaMax",
 		Input: []tf.Input{
-			x, y,
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "Atan",
 		Input: []tf.Input{
-			input, fft_length,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["summarize"] = value
 	}
 }
 
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// Asserts that the given condition is true.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Assert",
+		Input: []tf.Input{
+			condition, tf.OutputList(data),
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["table_id"] = value
 	}
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["table_name"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Load Adagrad embedding parameters with debug support.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
+// Arguments:
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adagrad optimization algorithm.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
 //
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "LoadTPUEmbeddingAdagradParametersGradAccumDebug",
 		Input: []tf.Input{
-			contents,
+			parameters, accumulators, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.
+type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve FTRL embedding parameters with debug support.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Returns the truth value of (x != y) element-wise.
-//
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "ExperimentalUnbatchDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// StringFormatAttr is an optional argument to StringFormat.
+type StringFormatAttr func(optionalAttr)
+
+// StringFormatTemplate sets the optional template attribute to value.
 //
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// value: A string, the template to format tensor summaries into.
+// If not specified, defaults to "%s"
+func StringFormatTemplate(value string) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["template"] = value
+	}
+}
+
+// StringFormatPlaceholder sets the optional placeholder attribute to value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
+// If not specified, defaults to "%s"
+func StringFormatPlaceholder(value string) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["placeholder"] = value
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
+}
+
+// StringFormatSummarize sets the optional summarize attribute to value.
+//
+// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
+// If not specified, defaults to 3
+func StringFormatSummarize(value int64) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Formats a string template using a list of tensors.
+//
+// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+//
+// Arguments:
+//	inputs: The list of tensors to format into the placeholder string.
+//
+// Returns = The resulting string scalar.
+func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "StringFormat",
 		Input: []tf.Input{
-			features,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Returns true if queue is closed.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation returns true if the queue is closed and false if the queue
+// is open.
+//
+// Arguments:
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "Atanh",
 		Input: []tf.Input{
-			reader_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorListConcatAttr is an optional argument to TensorListConcat.
-type TensorListConcatAttr func(optionalAttr)
-
-// TensorListConcatElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// Concats all tensors in the list along the 0th dimension.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// Requires that all tensors have the same shape except the first dimension.
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
-// input_handle: The input list.
-// tensor: The concated result.
-// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorListConcat",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			input_handle,
+			l, grad,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// Assigns a new value to a variable.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			pattern,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-//
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
-//
-// Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+// Returns a tensor of ones with the same shape and type as x.
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+// The gradient of SparseFillEmptyRows.
 //
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
 //
-// Our Conv3D implements a form of cross-correlation.
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
+//
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "SparseFillEmptyRowsGrad",
+		Input: []tf.Input{
+			reverse_index_map, grad_values,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			input, filter,
+			tf.OutputList(input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -22222,191 +26982,143 @@ func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, pa
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters.
+type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["table_id"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["table_name"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8: out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-//
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-//
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-//
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-//
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-//
-// ```c++
-// result = round(input * s)
-// ```
+// Load Adagrad embedding parameters.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
 //
 //
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "LoadTPUEmbeddingAdagradParameters",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			parameters, accumulators,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Strip leading and trailing whitespaces from the Tensor.
+//
+// Arguments:
+//	input: A string `Tensor` of any shape.
+//
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StringStrip",
+		Input: []tf.Input{
+			input,
+		},
+	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
+//
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
 
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+// StringLengthUnit sets the optional unit attribute to value.
+//
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["unit"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
+// String lengths of `input`.
 //
+// Computes the length of each string given in the input tensor.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	input: The string for which to compute the length.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22415,111 +27127,217 @@ func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_f
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "StringLength",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Performs gradient updates of embedding tables.
 //
 // Arguments:
+//	inputs: A TensorList of gradients with which to update embedding tables.
+// This argument has the same length and shapes as the return value of
+// RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+// with respect to the embedding activations. The embedding tables are updated
+// from these gradients via the optimizer specified in the TPU embedding
+// configuration given to tpu.initialize_system.
+//	learning_rates: A TensorList of float32 scalars, one for each dynamic learning
+// rate tag: see the comments in
+// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+// Multiple tables can share the same dynamic learning rate tag as specified
+// in the configuration. If the learning rates for all tables are constant,
+// this list should be empty.
+//	config: Serialized TPUEmbeddingConfiguration proto.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
-//
-//
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"config": config}
 	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
+		Type: "SendTPUEmbeddingGradients",
 		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
+			tf.OutputList(inputs), tf.OutputList(learning_rates),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+// Computes numerical negative value element-wise.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "Neg",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode web-safe base64-encoded strings.
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
+
+// SubstrUnit sets the optional unit attribute to value.
+//
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
+	return func(m optionalAttr) {
+		m["unit"] = value
+	}
+}
+
+// Return substrings from `Tensor` of strings.
+//
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// A negative `pos` indicates distance within the string backwards from the end.
+//
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "Substr",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, pos, len,
 		},
 		Attrs: attrs,
 	}
@@ -22527,386 +27345,294 @@ func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, o
 	return op.Output(0)
 }
 
-// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+// Exits the current frame to its parent frame.
 //
-// tensor: The tensor to put on the list.
-// input_handle: The old list.
-// output_handle: A list with the elements of the old list followed by tensor.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPushBack",
+		Type: "Exit",
 		Input: []tf.Input{
-			input_handle, tensor,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
+// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters.
+type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
 
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_id"] = value
 	}
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
+// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["table_name"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
-//
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
-//
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Retrieve proximal Adagrad embedding parameters.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
+		Type: "RetrieveTPUEmbeddingProximalAdagradParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// Produce a string tensor that encodes the state of a Reader.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			serialized_sparse,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// Returns the number of tensors in the input tensor list.
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
+		Type: "TensorListLength",
 		Input: []tf.Input{
-			x, y,
+			input_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			input, fft_length,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-// Graphically this is equivalent to doing
-//
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
-//
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "Asinh",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
+
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
 //
-// For example, if the inputs are
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
 //
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
 //
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
 //
-//     inputs[2]: Tensor [["f"], ["g"]]
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Transcode the input text from a source encoding to a destination encoding.
 //
-// then the output will be
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
 //
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
 //
-// if hashed_output=true then the output will be
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
-//
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "UnicodeTranscode",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// Update '*var' according to the RMSProp algorithm.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22915,247 +27641,279 @@ func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["dtype"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// Creates an empty hash table.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient operator for the SparseSlice op.
-//
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
-//
-// Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
-//
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
+
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// Restores a tensor from checkpoint files.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			images,
+			file_pattern, tensor_name, shape_and_slice,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset by applying optimizations to `input_dataset`.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Creates a dataset by applying optimizations to `input_dataset`.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
 //
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			input_dataset, optimizations,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
+//
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			shape, alpha,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
-type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
 
-// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// Applies sparse subtraction to individual values or slices in a Variable.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$vhat_t := max{vhat_{t-1}, v_t}$$
-// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
+//
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
+//
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, -9, 3, -6, -4, 6, 7, -4]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	vhat: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
 // Returns the created operation.
-func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23164,461 +27922,474 @@ func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdamWithAmsgrad",
+		Type: "ResourceScatterNdSub",
 		Input: []tf.Input{
-			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomUniformInt",
+		Input: []tf.Input{
+			shape, seed, minval, maxval,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["out_type"] = value
 	}
 }
 
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dilations"] = value
 	}
 }
 
-// Op removes and returns a random (key, value)
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
+//
+// Arguments:
+//
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			indices,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
 
-// HashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
+// ResourceGatherBatchDims sets the optional batch_dims attribute to value.
+// If not specified, defaults to 0
+func ResourceGatherBatchDims(value int64) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["batch_dims"] = value
 	}
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
+		Type: "ResourceGather",
+		Input: []tf.Input{
+			resource, indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// Draws samples from a multinomial distribution.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// The output is computed as follows:
 //
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// and
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
+// Arguments:
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// then the final `SparseTensor` will be:
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixSetDiag",
+		Input: []tf.Input{
+			input, diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise max of two SparseTensors.
 //
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			sparse_handles,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
-type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+// LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters.
+type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
 
-// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
+// LoadTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMDLAdagradLightParametersTableId(value int64) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+// LoadTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMDLAdagradLightParametersTableName(value string) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["table_name"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// Load MDL Adagrad Light embedding parameters.
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// That is for rows we have grad for, we update var and accum as follows:
+// Arguments:
+//	parameters: Value of parameters used in the MDL Adagrad Light optimization algorithm.
+//	accumulators: Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+//	weights: Value of weights used in the MDL Adagrad Light optimization algorithm.
+//	benefits: Value of benefits used in the MDL Adagrad Light optimization algorithm.
 //
-// accum = accum * momentum - lr * grad
-// var += accum
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
+func LoadTPUEmbeddingMDLAdagradLightParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMDLAdagradLightParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyKerasMomentum",
+		Type: "LoadTPUEmbeddingMDLAdagradLightParameters",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			parameters, accumulators, weights, benefits,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Assigns a new value to a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
-//
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Strip leading and trailing whitespaces from the Tensor.
-//
-// Arguments:
-//	input: A string `Tensor` of any shape.
+// List of the given size with empty elements.
 //
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			input,
+			element_shape, num_elements,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			x,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
 //
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
+// This operation computes
 //
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "ResourceScatterMin",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
-// if < 0, `scale * features` otherwise.
-//
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// MatrixSolveLsFast sets the optional fast attribute to value.
 // If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["fast"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
+// Solves one or more linear least-squares problems.
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23627,9 +28398,9 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -23637,264 +28408,117 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// Adds sparse `updates` to an existing tensor according to `indices`.
-//
-// This operation creates a new tensor by adding sparse `updates` to the passed
-// in `tensor`.
-// This operation is very similar to `tf.scatter_nd_add`, except that the updates
-// are added onto an existing tensor (as opposed to a variable). If the memory
-// for the existing tensor cannot be re-used, a copy is made and updated.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of tensor_scatter_add is to add individual elements to a
-// tensor by index. For example, say we want to add 4 elements in a rank-1
-// tensor with 8 elements.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// In Python, this scatter add operation would look like this:
+// Builds a merged tensor such that
 //
 // ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     tensor = tf.ones([8], dtype=tf.int32)
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
 // ```
 //
-// The resulting tensor would look like this:
-//
-//     [1, 12, 1, 11, 10, 1, 1, 13]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// In Python, this scatter add operation would look like this:
+// For example, if each `indices[m]` is scalar or vector, we have
 //
 // ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     tensor = tf.ones([4, 4, 4])
-//     updated = tf.tensor_scatter_add(tensor, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
-//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
-//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, the index is ignored.
-//
-// Arguments:
-//	tensor: Tensor to copy/update.
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//
-// Returns A new tensor copied from tensor and updates added according to the indices.
-func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorScatterAdd",
-		Input: []tf.Input{
-			tensor, indices, updates,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
-//
-// Arguments:
-//	input: Shape is `[N, M, M]`.
-//
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// The indicator function
+//     merged.shape = [max(indices)] + constant
 //
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
 //
 // For example:
 //
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
 // ```
 //
-// Useful special cases:
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
 // ```
 //
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
-//
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
-
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
-}
-
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// Performs a padding as a preprocess during a convolution.
 //
-// See also `RestoreSlice`.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			input, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -23902,68 +28526,110 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
 //
-// Input images and output images must be quantized types.
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
 //
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the input.
 //
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			images, size, min, max,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
-//
-//	num_threads: Identifies the number of threads to use for the private threadpool.
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
 //
 //
-func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalPrivateThreadPoolDataset",
+		Type: "ExperimentalSqlDataset",
 		Input: []tf.Input{
-			input_dataset, num_threads,
+			driver_name, data_source_name, query,
 		},
 		Attrs: attrs,
 	}
@@ -23971,209 +28637,213 @@ func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output,
 	return op.Output(0)
 }
 
-// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
-type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters.
+type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
 
-// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
-// If not specified, defaults to false
-func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
+// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
 	return func(m optionalAttr) {
-		m["sloppy"] = value
+		m["table_id"] = value
 	}
 }
 
-// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
+// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load centered RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
+//	parameters: Value of parameters used in the centered RMSProp optimization algorithm.
+//	ms: Value of ms used in the centered RMSProp optimization algorithm.
+//	mom: Value of mom used in the centered RMSProp optimization algorithm.
+//	mg: Value of mg used in the centered RMSProp optimization algorithm.
 //
 //
-//	dense_defaults: A dict mapping string keys to `Tensor`s.
-// The keys of the dict must match the dense_keys of the feature.
-//	sparse_keys: A list of string keys in the examples features.
-// The results for these keys will be returned as `SparseTensor` objects.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples features associated with dense values.
-//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
-// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-// and `tf.string` (`BytesList`) are supported.
-//	dense_shapes: List of tuples with the same length as `dense_keys`.
-// The shape of the data for each dense feature referenced by `dense_keys`.
-// Required for any input tensors identified by `dense_keys`.  Must be
-// either fully defined, or may contain an unknown first dimension.
-// An unknown first dimension means the feature is treated as having
-// a variable number of blocks, and the output shape along this dimension
-// is considered unknown at graph build time.  Padding is applied for
-// minibatch elements smaller than the maximum number of blocks for the
-// given feature along this dimension.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
+//
+// Returns the created operation.
+func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalParseExampleDataset",
+		Type: "LoadTPUEmbeddingCenteredRMSPropParameters",
 		Input: []tf.Input{
-			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+			parameters, ms, mom, mg,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["src_format"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// one in the source data format.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
+
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["direction"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
-//
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["dropout"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
 //
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24182,9 +28852,9 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			matrix, rhs,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
 		Attrs: attrs,
 	}
@@ -24192,129 +28862,83 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
-//
-// Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
+func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "FilterByLastComponentDataset",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
-type UnicodeTranscodeAttr func(optionalAttr)
-
-// UnicodeTranscodeErrors sets the optional errors attribute to value.
+// Computes the absolute value of a tensor.
 //
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["errors"] = value
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
-//
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
-//
-// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
-// as ' ', will preserve string alignment to the source since invalid bytes will be
-// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
-// replacement character will preserve byte alignment to the source.
-// If not specified, defaults to 65533
-func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
+	opspec := tf.OpSpec{
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+		m["data_format"] = value
 	}
 }
 
-// Transcode the input text from a source encoding to a destination encoding.
-//
-// The input is a string tensor of any shape. The output is a string tensor of
-// the same shape containing the transcoded strings. Output strings are always
-// valid unicode. If the input contains invalid encoding positions, the
-// `errors` attribute sets the policy for how to deal with them. If the default
-// error-handling policy is used, invalid formatting will be substituted in the
-// output by the `replacement_char`. If the errors policy is to `ignore`, any
-// invalid encoding positions in the input are skipped and not included in the
-// output. If it set to `strict` then any invalid formatting will result in an
-// InvalidArgument error.
-//
-// This operation can be used with `output_encoding = input_encoding` to enforce
-// correct formatting for inputs even if they are already in the desired encoding.
-//
-// If the input is prefixed by a Byte Order Mark needed to determine encoding
-// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
-// BOM will be consumed and not emitted into the output. If the input encoding
-// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
-// interpreted as a non-breaking-space and is preserved in the output (including
-// always for UTF-8).
-//
-// The end result is that if the input is marked as an explicit endianness the
-// transcoding is faithful to all codepoints in the source. If it is not marked
-// with an explicit endianness, the BOM is not considered part of the string itself
-// but as metadata, and so is not preserved in the output.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	input: The text to be processed. Can have any shape.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//	output_encoding: The unicode encoding to use in the output. Must be one of
-// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A string tensor containing unicode text encoded using `output_encoding`.
-func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeTranscode",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			input,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -24322,72 +28946,78 @@ func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, outp
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Restore a reader to a previously saved state.
+//
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
+//
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			x,
+			reader_handle, state,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+//	input: A complex tensor.
 //
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "IFFT",
 		Input: []tf.Input{
-			start, stop, step,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Stops gradient computation.
+// 2D fast Fourier transform.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// Arguments:
+//	input: A complex tensor.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "FFT2D",
 		Input: []tf.Input{
 			input,
 		},
@@ -24396,84 +29026,86 @@ func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Inverse 2D fast Fourier transform.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// More formally, let
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			predictions, targets,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Returns the truth value of (x != y) element-wise.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "NotEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -24482,144 +29114,84 @@ func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug.
+type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["table_id"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["table_name"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Load Momentum embedding parameters with debug support.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug",
 		Input: []tf.Input{
-			shape, alpha,
+			parameters, momenta, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
-//
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
+// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
+type StatefulStandardNormalAttr func(optionalAttr)
 
-// Returns the truth value of (x >= y) element-wise.
+// StatefulStandardNormalDtype sets the optional dtype attribute to value.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["dtype"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	shape: The shape of the output tensor.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24628,9 +29200,9 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "StatefulStandardNormal",
 		Input: []tf.Input{
-			x, y,
+			resource, shape,
 		},
 		Attrs: attrs,
 	}
@@ -24638,374 +29210,187 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
-//
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "Erf",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
-//
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
-//
-// ```
-//
-// Arguments:
-//
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "Floor",
 		Input: []tf.Input{
-			condition, x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
-
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// MatMulTransposeB sets the optional transpose_b attribute to value.
-//
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// Returns the number of records this Reader has produced.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatMul",
-		Input: []tf.Input{
-			a, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Serializes the tree handle to a proto
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	tree_handle: Handle to the tree resource to be serialized.
-//
-// Returns Serialied proto string of the tree resource.
-func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeSerialize",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			tree_handle,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
-	}
-}
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
+// Concats all tensors in the list along the 0th dimension.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
+// Requires that all tensors have the same shape except the first dimension.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// input_handle: The input list.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "TensorListConcat",
 		Input: []tf.Input{
-			a, b,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
-type ExperimentalThreadPoolHandleAttr func(optionalAttr)
-
-// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+// Elementwise computes the bitwise AND of `x` and `y`.
 //
-// value: The maximum degree of parallelism to use within operations that execute on this
-// threadpool.
-// If not specified, defaults to 1
-func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["max_intra_op_parallelism"] = value
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Resize `images` to `size` using area interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	num_threads: The number of threads in the thread pool.
-//	display_name: A human-readable name for the threads that may be visible in some
-// visualizations.
-// threadpool.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
-// ops.
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolHandle",
-
+		Type: "ResizeArea",
+		Input: []tf.Input{
+			images, size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
-type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
-
-// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Converts CudnnRNN params from canonical form to usable form.
+// Sends `input` to all devices that are connected to the output.
 //
-// Writes a set of weights into the opaque params buffer so they can be used in
-// upcoming training or inferences.
+// Sends `input` to all devices that are connected to the output.
 //
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
+//
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNCanonicalToParams",
+		Type: "NcclBroadcast",
 		Input: []tf.Input{
-			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -25013,16 +29398,28 @@ func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.
 	return op.Output(0)
 }
 
-// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
-func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			input_dataset,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -25030,33 +29427,55 @@ func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_
 	return op.Output(0)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+//
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25065,9 +29484,9 @@ func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			input, axis,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -25075,6 +29494,27 @@ func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (ou
 	return op.Output(0)
 }
 
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+//
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListPushBack",
+		Input: []tf.Input{
+			input_handle, tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // EnterAttr is an optional argument to Enter.
 type EnterAttr func(optionalAttr)
 
@@ -25130,24 +29570,6 @@ func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAtt
 	return op.Output(0)
 }
 
-// Add all input tensors element wise.
-//
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddN",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // TryRpcAttr is an optional argument to TryRpc.
 type TryRpcAttr func(optionalAttr)
 
@@ -25270,259 +29692,47 @@ func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
-
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["vocab_size"] = value
-	}
-}
-
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
-//
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["delimiter"] = value
-	}
-}
-
-// Initializes a table from a text file.
-//
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
-//
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
-//
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
-		Input: []tf.Input{
-			table_handle, filename,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Mean",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the product of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Add all input tensors element wise.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "AddN",
 		Input: []tf.Input{
-			input, axis,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
-
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using bilinear interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// Returns the element-wise sum of a list of tensors.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
-		Input: []tf.Input{
-			images, size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Max",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalUniqueDataset",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			input_dataset,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -25530,476 +29740,323 @@ func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_typ
 	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
-
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Arguments:
+// Computes the reciprocal of x element-wise.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "Inv",
 		Input: []tf.Input{
-			input, dimension,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
 // Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "ExperimentalDenseToSparseBatchDataset",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			input_dataset, batch_size, row_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// Computes the reciprocal of x element-wise.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			data, segment_ids,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Computes square root of x element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "Sqrt",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			data, segment_ids,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Removes keys and its associated values from a table.
-//
-// The tensor `keys` must of the same type as the keys of the table. Keys not
-// already in the table are silently ignored.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys of the elements to remove.
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// Returns the created operation.
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableRemoveV2",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			table_handle, keys,
+			y, dy,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
+
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// adjoints (conjugate transposes).
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
+// Returns Shape is `[..., M, M]`.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
-//
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
-//
-// If there is no entry for a given segment ID `i`, it outputs 1.
-//
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
+// Computes reciprocal of square root of x element-wise.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "Round",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserializes a serialized tree ensemble config and replaces current tree
+// Delete the TensorArray from its resource container.
 //
-// ensemble.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
 // Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+			handle,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
+		Type: "Exp",
 		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return op.Output(0)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["reverse"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the entire contents of a file as a value.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+//     values.shape = input.shape[:-1]
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26008,44 +30065,96 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
+		Type: "NthElement",
+		Input: []tf.Input{
+			input, n,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  3, 3, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentMax",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "Softplus",
 		Input: []tf.Input{
-			handle,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "Expm1",
 		Input: []tf.Input{
 			x,
 		},
@@ -26054,162 +30163,97 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Computes natural logarithm of x element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "Log",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes size of weights that can be used by a Cudnn RNN model.
+// Returns the index of a data point that should be added to the seed set.
 //
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
+// Entries in distances are assumed to be squared distances of candidate points to
+// the already sampled centers in the seed set. The op constructs one Markov chain
+// of the k-MC^2 algorithm and returns the index of one candidate point to be added
+// as an additional cluster center.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// Arguments:
+//	distances: Vector with squared distances to the closest previously sampled cluster center
+// for each candidate point.
+//	seed: Scalar. Seed for initializing the random number generator.
+//
+// Returns Scalar with the index of the sampled point.
+func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "KMC2ChainInitialization",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			distances, seed,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Sinh",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
+// Computes the sum along sparse segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
@@ -26217,12 +30261,12 @@ func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segm
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
 			data, indices, segment_ids,
 		},
@@ -26231,154 +30275,123 @@ func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-//
-// where
-//
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-//
-// is the upper incomplete Gama function.
-//
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igammac",
-		Input: []tf.Input{
-			a, x,
-		},
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
+
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
+	return func(m optionalAttr) {
+		m["Truncate"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "Cast",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// UnicodeEncodeErrors sets the optional errors attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["errors"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD (U+65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["replacement_char"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// Encode a tensor of ints into unicode strings.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// ---
+//
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "UnicodeEncode",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			input_values, input_splits,
 		},
 		Attrs: attrs,
 	}
@@ -26386,195 +30399,110 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical or" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "Erfc",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
+// Computes sigmoid of `x` element-wise.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			resource,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "Sin",
 		Input: []tf.Input{
-			start, stop, num,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["epsilon"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Complex",
-		Input: []tf.Input{
-			real, imag,
-		},
-		Attrs: attrs,
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["is_training"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
+// Gradient for batch normalization.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// For example:
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26583,23 +30511,23 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			input,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Cos",
 		Input: []tf.Input{
 			x,
 		},
@@ -26608,103 +30536,95 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// Computes the determinant of one or more square matrices.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			data, segment_ids,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// or by starting a new tree.
 //
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			input_dataset, count,
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
-
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the real part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -26712,541 +30632,384 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Sends `input` to all devices that are connected to the output.
-//
-// Sends `input` to all devices that are connected to the output.
-//
-// The graph should be constructed so that all ops connected to the output have a
-// valid device assignment, and the op itself is assigned one of these devices.
-//
-// input: The input to the broadcast.
-// output: The same as input.
-// shape: The shape of the input tensor.
-//
-func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "NcclBroadcast",
+		Type: "Acos",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Computes the Bessel i0e function of `x` element-wise.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
+//
+// This function is faster and numerically stabler than `bessel_i0(x)`.
+func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BesselI0e",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// Shuffle dimensions of x according to a permutation.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "Transpose",
 		Input: []tf.Input{
-			images, size,
+			x, perm,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "Min",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
-//
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
-//
-// For example:
+// Computes the Bessel i1e function of `x` element-wise.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// This function is faster and numerically stabler than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "BesselI1e",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
+// Returns an element-wise indication of the sign of a number.
 //
-// Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ClipByValue",
+		Type: "Sign",
 		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Creates a dataset that passes a sliding window over `input_dataset`.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// Arguments:
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
 //
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "ExperimentalSlidingWindowDataset",
 		Input: []tf.Input{
-			arr, size, weights,
+			input_dataset, window_size, window_shift, window_stride,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["capacity"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the (key, value) element with the smallest
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			x, axis,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
+// Returns element-wise integer closest to x.
 //
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "Rint",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGammaGrad",
+		Input: []tf.Input{
+			alpha, sample,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CumprodReverse sets the optional reverse attribute to value.
+// Returns x + y element-wise.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Add",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// Returns x + y element-wise.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "AddV2",
 		Input: []tf.Input{
-			x, axis,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["seed"] = value
 	}
 }
 
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["seed2"] = value
 	}
 }
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -27254,148 +31017,119 @@ func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, ma
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Only useful as a placeholder for control edges.
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
+		Type: "FloorMod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
+// Saves the input tensors to disk.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// See also `SaveSlices`.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "Save",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			filename, tensor_names, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayReadV3
+// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "MulNoNan",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
-//
-// Arguments:
-//
-//
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Returns x / y element-wise for integer types.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
+// RequantizePerChannelAttr is an optional argument to RequantizePerChannel.
+type RequantizePerChannelAttr func(optionalAttr)
 
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+// RequantizePerChannelOutType sets the optional out_type attribute to value.
+//
+// value: The quantized type of output tensor that needs to be converted.
+// If not specified, defaults to DT_QUINT8
+func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["out_type"] = value
 	}
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Requantizes input with min and max values known per channel.
 //
 // Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	requested_output_min: The minimum value of the output tensor requested.
+//	requested_output_max: The maximum value of the output tensor requested.
 //
-//
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-//
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27404,9 +31138,9 @@ func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "RequantizePerChannel",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
 		Attrs: attrs,
 	}
@@ -27414,501 +31148,620 @@ func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
+// Restores tensors from a V2 checkpoint.
+//
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// Arguments:
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+// Arguments:
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			prefix, tensor_names, shape_and_slices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// Rolls the elements of a tensor along an axis.
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
+
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// For example:
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
+//	component_types: The type of each component in a value.
 //
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
-//
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Roll",
+		Type: "FIFOQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalIgnoreErrorsDataset",
 		Input: []tf.Input{
-			input, shift, axis,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//
-//
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "Xdivy",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// Arguments:
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
 //
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+//
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+//
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Ceil",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// Computes the number of elements in the given table.
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
+//
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
+	}
+}
+
+// PNG-encode an image.
+//
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
+//
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
+//
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	table_handle: Handle to the table.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			table_handle,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Dequeues `n` tuples of one or more tensors from the given queue.
+//
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
+//
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
+//
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			grads, original_image,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-//
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "Maximum",
 		Input: []tf.Input{
-			table_handle,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "Mod",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
-type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
-
-// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// value: The type list for the return values.
-// If not specified, defaults to <>
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+// Computes offsets of concat inputs within its output.
 //
-// value: The list of shapes being produced.
-// If not specified, defaults to <>
+// For example:
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
-	}
-}
-
-// Generates a MultiDeviceIterator resource from its provided string handle.
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	string_handle: String representing the resource.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns A MultiDeviceIterator resource.
-func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorFromStringHandle",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			string_handle,
+			concat_dim, tf.OutputList(shape),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug.
+type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
 
-// MutableHashTableV2Container sets the optional container attribute to value.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_id"] = value
 	}
 }
 
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["table_name"] = value
 	}
 }
 
-// Creates an empty hash table.
+// Load RMSProp embedding parameters with debug support.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug",
+		Input: []tf.Input{
+			parameters, ms, mom, gradient_accumulators,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
-
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
 //
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
+// The lower regularized incomplete Gamma function is defined as:
 //
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// ```
-// if T == qint8: in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
-// *MIN_COMBINED Mode Example*
+// where
 //
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
 //
-// If the mode is 'MIN_FIRST', then this approach is used:
+// is the lower incomplete Gamma function.
 //
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// *SCALED mode Example*
+// The regularized incomplete beta integral is defined as:
 //
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
 //
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
 //
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
+// where
 //
-// Our input tensor range is then `[-m, m]`.
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
 //
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Betainc",
+		Input: []tf.Input{
+			a, b, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
+// For example:
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27917,9 +31770,9 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "Shape",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -27927,155 +31780,131 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0)
 }
 
-// Flips all bits elementwise.
+// Computes fingerprints of the input strings.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
-//
-// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
+// Computes the power of one value to another.
 //
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceDeserialize",
+		Type: "Pow",
 		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			input,
+			features, max_value, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Returns the truth value of (x < y) element-wise.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "Less",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
-//
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["first_n"] = value
+		m["seed"] = value
 	}
 }
 
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["seed2"] = value
 	}
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
-//
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// Use RandomPoissonV2 instead.
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28084,9 +31913,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -28094,93 +31923,68 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+// Gets the next output from the given iterator.
 //
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
-//
-// Arguments:
-//
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
+// Returns the truth value of (x >= y) element-wise.
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
-
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
-//
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28189,9 +31993,9 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			tensor,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -28199,229 +32003,263 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// Returns the truth value of x OR y element-wise.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "MatMul",
 		Input: []tf.Input{
-			resource, indices, updates,
+			a, b,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Initializes a table from a text file.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			tags, values,
+			table_handle, filename,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
+
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the mean of elements across dimensions of a tensor.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Mean",
 		Input: []tf.Input{
-			tag, values,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "Prod",
 		Input: []tf.Input{
-			handle,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
 
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["max_images"] = value
+		m["align_corners"] = value
 	}
 }
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
-//
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// ResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBilinearHalfPixelCenters(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["half_pixel_centers"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Resize `images` to `size` using bilinear interpolation.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28430,9 +32268,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			tag, tensor,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -28440,42 +32278,33 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28484,9 +32313,9 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "Max",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -28494,70 +32323,55 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
-// Splits a tensor into a list.
-//
-// list[i] corresponds to lengths[i] tensors from the input tensor.
-// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
-//
-// tensor: The input tensor.
-// element_shape: A shape compatible with that of elements in the tensor.
-// lengths: Vector of sizes of the 0th dimension of tensors in the list.
-// output_handle: The list.
-func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorListSplit",
+		Type: "ExperimentalUniqueDataset",
 		Input: []tf.Input{
-			tensor, element_shape, lengths,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["output_type"] = value
 	}
 }
 
-// Performs average pooling on the input.
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			value,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -28565,574 +32379,634 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Merges summaries.
+// Converts the quantized `input` tensor into a lower-precision `output`.
 //
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// Converts the quantized `input` tensor into a lower-precision `output`, using the
+// output range specified with `requested_output_min` and `requested_output_max`.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// `[input_min, input_max]` are scalar floats that specify the range for the float
+// interpretation of the `input` data. For example, if `input_min` is -1.0f and
+// `input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "Requantize",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// The shape of the elements of the given list, as a tensor.
+// Creates a dataset that emits the lines of one or more text files.
 //
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			input_handle,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the item in the list with the given index.
+// Computes the sum along segments of a tensor.
 //
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
 //
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
-		Input: []tf.Input{
-			input_handle, index,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Resizes the list.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
+// For example:
 //
-// input_handle: the input list
-// size: size of the output list
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// # ==> [[5, 5, 5, 5],
+// #      [5, 6, 7, 8]]
+// ```
 //
-func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListResize",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			input_handle, size,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
+// Computes the mean along segments of a tensor.
 //
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
 //
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
 // For example:
 //
 // ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
+// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_mean(c, tf.constant([0, 0, 1]))
+// # ==> [[2.5, 2.5, 2.5, 2.5],
+// #      [5, 6, 7, 8]]
 // ```
 //
+//
 // Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			diagonal,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// Computes the minimum along segments of a tensor.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution. The parameters may each be a
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_min(c, tf.constant([0, 0, 1]))
+// # ==> [[1, 2, 2, 1],
+// #      [5, 6, 7, 8]]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
+// Computes the sum along segments of a tensor.
 //
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 5,  5, 5, 5],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			input_handle, index, item,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a Tensor by indexing into the TensorList.
+// Computes the product along segments of a tensor.
 //
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  6, 6, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGather",
+		Type: "UnsortedSegmentProd",
 		Input: []tf.Input{
-			input_handle, indices,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a TensorList by indexing into a Tensor.
-//
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
-//
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
+		Type: "Cosh",
 		Input: []tf.Input{
-			tensor, indices, element_shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a `RaggedTensor` containing the specified sequences of numbers.
-//
-//
-// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
-// `rt_nested_splits`, such that
-// `result[i] = range(starts[i], limits[i], deltas[i])`.
+// Computes the mean along sparse segments of a tensor.
 //
-// ```python
-// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
-// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
-// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// >>> print result.eval().tolist()
-// [[2],               # result[0] = range(2, 3)
-//  [],                # result[1] = range(5, 5)
-//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
-// ```
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
-// The vector inputs must all have the same size.  Scalar inputs are broadcast
-// to match the size of the vector inputs.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
 // Arguments:
-//	starts: The starts of each range.
-//	limits: The limits of each range.
-//	deltas: The deltas of each range.
 //
-// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
-func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RaggedRange",
-		Input: []tf.Input{
-			starts, limits, deltas,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
 
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
-//
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"T": T, "S": S}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			handle, n,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
-//
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// Computes gradients for SparseSegmentMean.
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			input,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// See `tf.sparse.segment_sum` for usage examples.
 //
-// creates directory if not existing.
 //
 // Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			filename, contents,
+			data, indices, segment_ids,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
-
-// AllKeepDims sets the optional keep_dims attribute to value.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical and" of elements across dimensions of a tensor.
+// The upper regularized incomplete Gamma function is defined as:
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// where
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input, axis,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// N is the size of the segment being reduced.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			gradients, features,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29141,87 +33015,85 @@ func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			input,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "Any",
 		Input: []tf.Input{
-			images, scale,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
-type ResourceApplyKerasMomentumAttr func(optionalAttr)
-
-// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var + momentum * accum, so in the end, the var you get is actually
-// var + momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
+// Deletes the resource specified by the handle.
 //
-// accum = accum * momentum - lr * grad
-// var += accum
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	resource: handle to the resource to delete.
 //
 // Returns the created operation.
-func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29230,130 +33102,118 @@ func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyKerasMomentum",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// Generates values in an interval.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations.
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			matrix, rhs,
+			start, stop, num,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a serialized GraphDef representing `input_dataset`.
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
 //
-// Returns a graph representation for `input_dataset`.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+// The input tensors `real` and `imag` must have the same shape.
 //
-// Returns The graph representation of the dataset (as serialized GraphDef).
-func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToGraph",
+		Type: "Complex",
 		Input: []tf.Input{
-			input_dataset,
+			real, imag,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LuAttr is an optional argument to Lu.
-type LuAttr func(optionalAttr)
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// LuOutputIdxType sets the optional output_idx_type attribute to value.
-// If not specified, defaults to DT_INT32
-func LuOutputIdxType(value tf.DataType) LuAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["output_idx_type"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes the LU decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be invertible.
-//
-// The output consists of two tensors LU and P containing the LU decomposition
-// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
-// upper triangular factors.
-//
-// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
-// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
-// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
-// entries correspond to the upper triangular part, including the diagonal, of LU.
+// Returns the imaginary part of a complex number.
 //
-// P represents a permutation matrix encoded as a list of indices each between `0`
-// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
-// P, then the L, U and P satisfies P_mat * input = L * U.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
-// size `[M, M]`.
+// For example:
 //
-// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
-// lower triangular factor `L` with unit diagonal, and whose upper triangular part
-// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
-// `[..., M]`.
-// @compatibility(scipy)
-// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
-// packed into a single tensor, the permutation is applied to `input` instead of
-// the right hand side and the permutation `P` is returned as a list of indices
-// instead of a permutation matrix.
-// @end_compatibility
-func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29362,198 +33222,179 @@ func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Lu",
+		Type: "Imag",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the matrix square root of one or more square matrices:
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
 //
-// matmul(sqrtm(A), sqrtm(A)) = A
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// The input matrix should be invertible. If the input matrix is real, it should
-// have no eigenvalues which are real and negative (pairs of complex conjugate
-// eigenvalues are allowed).
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// The matrix square root is computed by first reducing the matrix to
-// quasi-triangular form with the real Schur decomposition. The square root
-// of the quasi-triangular matrix is then computed directly. Details of
-// the algorithm can be found in: Nicholas J. Higham, "Computing real
-// square roots of a real matrix", Linear Algebra Appl., 1987.
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_max(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 3, 3, 4],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M, M]`.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.sqrtm
-// @end_compatibility
-func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSquareRoot",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			input,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Arguments:
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
 
-// PrintV2OutputStream sets the optional output_stream attribute to value.
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
-		m["output_stream"] = value
+		m["container"] = value
 	}
 }
 
-// Prints a string scalar.
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// Prints a string scalar to the desired output_stream.
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	input: The string scalar to print.
-//
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PrintV2",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "VarHandleOp",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["Tout"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// Returns the argument of a complex number.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// For example:
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29562,124 +33403,139 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "Angle",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// Clips tensor values to a specified min and max.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
 //
 // Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			data, segment_ids,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
+// Counts the number of occurrences of each value in an integer array.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "Bincount",
 		Input: []tf.Input{
-			images,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["reverse"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29688,524 +33544,471 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			s0, s1,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["src_format"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// the source data format.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			set1, set2,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
 // ```
 //
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
 // ```
 //
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
 //
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
 // ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// This is more efficient than using separate `tf.reverse` ops.
 //
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
+// The `reverse` and `exclusive` kwargs can also be combined:
 //
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
 // ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters.
+type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["table_id"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve SGD embedding parameters.
 //
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
+// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Arguments:
 //
-// This is the opposite of `pack`.
 //
-// Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			value,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
 	}
-	return output
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
+// Returns x + y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
 //
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			resource,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Delete the stack from its resource container.
+// Scatters tensor at indices in an input list.
 //
-// Arguments:
-//	handle: The handle to a stack.
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// input_handle: The list to scatter into.
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// output_handle: The TensorList.
+func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
+		Type: "TensorListScatterIntoExistingList",
 		Input: []tf.Input{
-			handle,
+			input_handle, tensor, indices,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Computes a range that covers the actual values present in a quantized tensor.
+//
+// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+// range that covers the actual values present in that tensor. This op is typically
+// used to produce the `requested_output_min` and `requested_output_max` for
+// `Requantize`.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			basename, num_shards,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// Rolls the elements of a tensor along an axis.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
-	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// For example:
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
+//
+// Arguments:
+//
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
-		Attrs: attrs,
+		Type: "Roll",
+		Input: []tf.Input{
+			input, shift, axis,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
-
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// Updates the table to associates keys with values.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
-	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+// Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// Arguments:
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//	num_shards: An integer representing the number of shards operating in parallel.
+//	index: An integer representing the current worker index.
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ShardDataset",
+		Input: []tf.Input{
+			input_dataset, num_shards, index,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -30213,43 +34016,74 @@ func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Ou
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// Computes the number of elements in the given table.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableSizeV2",
+		Input: []tf.Input{
+			table_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
+
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["align_corners"] = value
 	}
 }
 
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+// ResizeBilinearGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBilinearGradHalfPixelCenters(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["half_pixel_centers"] = value
 	}
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Computes the gradient of bilinear interpolation.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30258,38 +34092,74 @@ func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "ResizeBilinearGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
+// Outputs all keys and values in the table.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//
+//
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	opspec := tf.OpSpec{
+		Type: "LookupTableExportV2",
+		Input: []tf.Input{
+			table_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
 
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
+type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
+
+// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+//
+// value: The type list for the return values.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["output_types"] = value
 	}
 }
 
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+//
+// value: The list of shapes being produced.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
+// Generates a MultiDeviceIterator resource from its provided string handle.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Arguments:
+//	string_handle: String representing the resource.
+//
+// Returns A MultiDeviceIterator resource.
+func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30298,9 +34168,9 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "MultiDeviceIteratorFromStringHandle",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
@@ -30308,47 +34178,63 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
+// value: If non-empty, this table is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
+		Type: "MutableHashTableV2",
 
 		Attrs: attrs,
 	}
@@ -30356,29 +34242,98 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["mode"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Dequantize the 'input' tensor into a float Tensor.
+//
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30387,274 +34342,285 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// Flips all bits elementwise.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "Invert",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-//     Adds v into specified rows of x.
+// Inverse 3D fast Fourier transform.
 //
-//     Computes y = x; y[i, :] += v; return y.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//	input: A complex64 tensor.
 //
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			x, i, v,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// Shuts down a running distributed TPU system.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// The op returns an error if no system is running.
 //
 // Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+		Type: "ShutdownDistributedTPU",
 	}
 	return scope.AddOperation(opspec)
 }
 
-// BatchAttr is an optional argument to Batch.
-type BatchAttr func(optionalAttr)
-
-// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
-// If not specified, defaults to 10
-func BatchMaxEnqueuedBatches(value int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["max_enqueued_batches"] = value
+// Deprecated. Disallowed in GraphDef version >= 2.
+//
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
-// If not specified, defaults to <>
-func BatchAllowedBatchSizes(value []int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["allowed_batch_sizes"] = value
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BatchContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BatchContainer(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Table initializer that takes two tensors for keys and values respectively.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
+//
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// BatchSharedName sets the optional shared_name attribute to value.
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
+//
+// value: A string, prefix of the error message.
 // If not specified, defaults to ""
-func BatchSharedName(value string) BatchAttr {
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["message"] = value
 	}
 }
 
-// BatchBatchingQueue sets the optional batching_queue attribute to value.
-// If not specified, defaults to ""
-func BatchBatchingQueue(value string) BatchAttr {
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["batching_queue"] = value
+		m["first_n"] = value
 	}
 }
 
-// Batches all input tensors nondeterministically.
-//
-// When many instances of this Op are being run concurrently with the same
-// container/shared_name in the same device, some will output zero-shaped Tensors
-// and others will output Tensors of size up to max_batch_size.
-//
-// All Tensors in in_tensors are batched together (so, for example, labels and
-// features should be batched with a single instance of this operation.
-//
-// Each invocation of batch emits an `id` scalar which will be used to identify
-// this particular invocation when doing unbatch or its gradient.
-//
-// Each op which emits a non-empty batch will also emit a non-empty batch_index
-// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-// start, and length of elements of each set of Tensors present in batched_tensors.
-//
-// Batched tensors are concatenated along the first dimension, and all tensors in
-// in_tensors must have the first dimension of the same size.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// in_tensors: The tensors to be batched.
-// num_batch_threads: Number of scheduling threads for processing batches of work.
-//  Determines the number of batches processed in parallel.
-// max_batch_size: Batch sizes will never be bigger than this.
-// batch_timeout_micros: Maximum number of microseconds to wait before outputting
-//  an incomplete batch.
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
-//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
-//  batches up to one of those sizes. The entries must increase monotonically, and
-//  the final entry must equal max_batch_size.
-// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-// batch_index: If out_tensors is non-empty, has information to invert it.
-// container: Controls the scope of sharing of this batch.
-// id: always contains a scalar with a unique ID for this invocation of Batch.
-// shared_name: Concurrently running instances of batch in the same device with the
-//  same container and shared_name will batch their elements together. If left
-//  empty, the op name will be used as the shared name.
-// T: the types of tensors to be batched.
-func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
+//
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
+//
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Batch",
+		Type: "Print",
 		Input: []tf.Input{
-			tf.OutputList(in_tensors),
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+//
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
-		scope.UpdateErr("Batch", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TensorSummaryV2",
+		Input: []tf.Input{
+			tag, tensor, serialized_summary_metadata,
+		},
 	}
-	batch_index = op.Output(idx)
-	id = op.Output(idx)
-	return batched_tensors, batch_index, id
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Arguments:
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			images, delta,
+			input_dataset, buffer_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["description"] = value
 	}
 }
 
-// Computes the gradient of bicubic interpolation.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
+	}
+}
+
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
+//
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
+//
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30663,9 +34629,9 @@ func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			grads, original_image,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -30673,41 +34639,24 @@ func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
-
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Read an element from the TensorArray into output `value`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	handle: The handle to a TensorArray.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			images, size,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -30715,215 +34664,211 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of nearest neighbor interpolation.
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "ResourceScatterMax",
 		Input: []tf.Input{
-			grads, size,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Extract the shape information of a JPEG-encoded image.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			contents,
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["max_images"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["bad_color"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// Outputs a `Summary` protocol buffer with images.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements in first-in first-out order.
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// DecodePngChannels sets the optional channels attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Outputs a `Summary` protocol buffer with audio.
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30932,9 +34877,9 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			contents,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -30942,139 +34887,119 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
-//
-//     convert $src.gif -coalesce $dst.gif
-//
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// Splits a tensor into a list.
 //
-// Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list.
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "TensorListSplit",
 		Input: []tf.Input{
-			contents,
+			tensor, element_shape, lengths,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Performs average pooling on the input.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			true_classes,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
-
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// Merges summaries.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// The shape of the elements of the given list, as a tensor.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "TensorListElementShape",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
@@ -31082,305 +35007,268 @@ func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Ou
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+// Returns the item in the list with the given index.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
 //
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGetItem",
+		Input: []tf.Input{
+			input_handle, index, element_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// Resizes the list.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+//
+// input_handle: the input list
+// size: size of the output list
+//
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "TensorListResize",
+		Input: []tf.Input{
+			input_handle, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+//
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
+//
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Diag",
+		Input: []tf.Input{
+			diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// Outputs random values from a normal distribution. The parameters may each be a
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "ParameterizedTruncatedNormal",
+		Input: []tf.Input{
+			shape, means, stdevs, minvals, maxvals,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
-//
-// Parts of the bounding box may fall outside the image.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "TensorListSetItem",
 		Input: []tf.Input{
-			images, boxes,
+			input_handle, index, item,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
+// Creates a TensorList by indexing into a Tensor.
 //
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// output_handle: The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
+		Type: "TensorListScatter",
 		Input: []tf.Input{
-			iterator,
+			tensor, indices, element_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArrayScatterV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV2",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
-
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["precision"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["scientific"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["shortest"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// AsStringWidth sets the optional width attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["width"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// AsStringFill sets the optional fill attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["fill"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
-//
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31389,258 +35277,168 @@ func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "AsString",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
-	}
-}
-
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
-//
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
-	}
+	return op.Output(0)
 }
 
-// Extracts a glimpse from the input tensor.
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
 //
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
 //
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// ```python
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
+// ```
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "RaggedRange",
 		Input: []tf.Input{
-			input, size, offsets,
+			starts, limits, deltas,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// A container for an iterator resource.
+// Deprecated, use python implementation tf.linalg.matrix_exponential.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
+		Type: "MatrixExponential",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
-type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
-
-// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a TensorForestTreeResource
-func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorForestTreeResourceHandleOp",
-
-		Attrs: attrs,
+		Type: "Cholesky",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
-
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// creates directory if not existing.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
-//
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			filename, contents,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
-//
-// Arguments:
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "All",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -31648,26 +35446,28 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
 			input,
 		},
@@ -31676,155 +35476,89 @@ func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
-
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Solves tridiagonal systems of equations.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// `diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+// represent matrices with three rows being the superdiagonal, diagonals, and
+// subdiagonals, in order. The last element of the superdiagonal and the first
+// element of the subdiagonal is ignored.
+// `rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+// each left-hand side.
+// The output is a tensor of shape `[..., M, K]` containing the solutions.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	diagonals: Shape is `[..., 3, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
+		Type: "TridiagonalSolve",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			diagonals, rhs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
-// If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31833,9 +35567,9 @@ func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -31843,127 +35577,183 @@ func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0), op.Output(1)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
-//
-//
-// \\(log(exp(A)) = A\\)
-//
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
+// Adjust the saturation of one or more images.
 //
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			input,
+			images, scale,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
+
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
+	return func(m optionalAttr) {
+		m["output_idx_type"] = value
+	}
+}
+
+// Computes the LU decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be invertible.
+//
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
+//
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
+//
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then the L, U and P satisfies P_mat * input = L * U.
 //
 // Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
 //
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FakeParam",
-
+		Type: "Lu",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
-//
-// This operation returns the same result as the C++ std::nextafter function.
+// Deprecated. Use TensorArrayCloseV3
 //
-// It can also return a subnormal number.
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-// @compatibility(cpp)
-// Equivalent to C++ std::nextafter function.
-// @end_compatibility
-func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextAfter",
+		Type: "TensorArrayCloseV2",
 		Input: []tf.Input{
-			x1, x2,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
+	}
+}
+
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InvGrad",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			y, dy,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// List of the given size with empty elements.
+// A dataset that creates window datasets from the input dataset.
 //
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+// Arguments:
+//
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
+//
+//
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
+		Type: "WindowDataset",
 		Input: []tf.Input{
-			element_shape, num_elements,
+			input_dataset, size, shift, stride, drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -31971,73 +35761,97 @@ func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Ou
 	return op.Output(0)
 }
 
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+// Computes the matrix square root of one or more square matrices:
+//
+// matmul(sqrtm(A), sqrtm(A)) = A
+//
+// The input matrix should be invertible. If the input matrix is real, it should
+// have no eigenvalues which are real and negative (pairs of complex conjugate
+// eigenvalues are allowed).
+//
+// The matrix square root is computed by first reducing the matrix to
+// quasi-triangular form with the real Schur decomposition. The square root
+// of the quasi-triangular matrix is then computed directly. Details of
+// the algorithm can be found in: Nicholas J. Higham, "Computing real
+// square roots of a real matrix", Linear Algebra Appl., 1987.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
+//	input: Shape is `[..., M, M]`.
 //
+// Returns Shape is `[..., M, M]`.
 //
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.sqrtm
+// @end_compatibility
+func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
+		Type: "MatrixSquareRoot",
 		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
+// Computes the singular value decompositions of one or more matrices.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32046,160 +35860,194 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "Svd",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Add the quantile summaries to each quantile stream resource.
+// Converts one or more images from RGB to HSV.
 //
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
+			images,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
-//
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
+// Does nothing. Only useful as a placeholder for control edges.
 //
-func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalRandomDataset",
-		Input: []tf.Input{
-			seed, seed2,
-		},
-		Attrs: attrs,
+		Type: "NoOp",
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// A dataset that splits the elements of its input into multiple elements.
-func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+//
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+	return func(m optionalAttr) {
+		m["delete_old_dirs"] = value
+	}
+}
+
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
+//
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalUnbatchDataset",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			input_dataset,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that overrides the maximum intra-op parallelism.
+// Saves input tensors slices to disk.
 //
-// Arguments:
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
 //
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
 //
-func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalMaxIntraOpParallelismDataset",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			input_dataset, max_intra_op_parallelism,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
-//
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["maxsplit"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Split elements of `source` based on `sep` into a `SparseTensor`.
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Note that the above mentioned behavior matches python's str.split.
 //
-// Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			input, sep,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
@@ -32207,278 +36055,271 @@ func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...Str
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
-//
-// Arguments:
-//
-//	thread_pool: A resource produced by the ThreadPoolHandle op.
-//
+// Generate a sharded filename. The filename is printf formatted as
 //
-func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolDataset",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			input_dataset, thread_pool,
+			basename, shard, num_shards,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			features,
+			basename, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+//
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// There are a few special cases in the value mapping:
-//
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
-//
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
-		Input: []tf.Input{
-			sizes, tf.OutputList(values),
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// If the given TensorArray gradient already exists, returns a reference to it.
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
+}
+
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
 //
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
 //
-// **A note about the input flow_in:**
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
 //
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
+// The remappings are 1-D tensors with the following properties:
 //
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
 //
-// **A note about the source attribute:**
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
 //
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
 //
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			handle, flow_in,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// A Reader that outputs the records from a TensorFlow Records file.
+//
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
-		Input: []tf.Input{
-			input_dataset, another_dataset,
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
-//
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
-//
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
-//
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			input, filter,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -32486,87 +36327,38 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64
 	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
-		Input: []tf.Input{
-			resource_handle, serialized,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
-type ResourceScatterNdSubAttr func(optionalAttr)
-
-// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Applies sparse subtraction to individual values or slices in a Variable.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
-// ```
-//
-// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-// with 8 elements. In Python, that subtraction would look like this:
-//
-// ```python
-// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-// indices = tf.constant([[4], [3], [1], [7]])
-// updates = tf.constant([9, 10, 11, 12])
-// sub = tf.scatter_nd_sub(ref, indices, updates)
-// with tf.Session() as sess:
-//   print sess.run(sub)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, -9, 3, -6, -4, 6, 7, -4]
-//
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// A Reader that outputs the queued work as both the key and value.
 //
-// Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// Returns the created operation.
-func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32575,245 +36367,321 @@ func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdSub",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			handle, flow_in,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// Arguments:
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+			reader_handle, queue_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
 //
-// pseudorandomly.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
-//
-//
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
+			reader_handle, queue_handle, num_records,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
+//     Adds v into specified rows of x.
 //
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
+//     Computes y = x; y[i, :] += v; return y.
 //
 // Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
-//
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "CacheDataset",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			input_dataset, filename,
+			x, i, v,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more binary files.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
+			reader_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Gradients for batch normalization.
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
+
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["max_enqueued_batches"] = value
+	}
+}
+
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
 //
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "Batch",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			tf.OutputList(in_tensors),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// Adjust the hue of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			images, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
-type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
 
-// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["align_corners"] = value
 	}
 }
 
-// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
+// ResizeBicubicGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBicubicGradHalfPixelCenters(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["half_pixel_centers"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
+// Computes the gradient of bicubic interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32822,132 +36690,59 @@ func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalSta
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorHandle",
-
+		Type: "ResizeBicubicGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator" or
-// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
-// resource sharing by name, and does not keep a reference to the resource
-// container.
-func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "AnonymousIterator",
+}
 
-		Attrs: attrs,
+// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			input, crops,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -32955,186 +36750,141 @@ func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int
 	return op.Output(0)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalStatsAggregatorSummary",
-		Input: []tf.Input{
-			iterator,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
-//
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MakeIterator",
-		Input: []tf.Input{
-			dataset, iterator,
-		},
+}
+
+// ResizeNearestNeighborGradHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradHalfPixelCenters(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
-//
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
-		Input: []tf.Input{
-			images, contrast_factor,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gets the next output from the given iterator .
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			iterator,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
-//
-//
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
+
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	return components
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// Extract the shape information of a JPEG-encoded image.
+//
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			resource_handle,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["output_types"] = value
+		m["channels"] = value
 	}
 }
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["output_shapes"] = value
+		m["dtype"] = value
 	}
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -33143,9 +36893,9 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			string_handle,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -33153,84 +36903,111 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
 //
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
+//     convert $src.gif -coalesce $dst.gif
 //
-// See also `tf.batch_gather` and `tf.gather_nd`.
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
+//	contents: 0-D.  The GIF-encoded image.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GatherV2",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			params, indices, axis,
+			contents,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			resource_handle,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
 // value: The shape of each component in a value. The length of this attr must
 // be either 0 or the same as the length of component_types. If the length of
@@ -33239,52 +37016,85 @@ type FIFOQueueV2Attr func(optionalAttr)
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shapes"] = value
 	}
 }
 
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
 // value: The upper bound on the number of elements in this queue.
 // Negative numbers mean no limit.
 // If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// FIFOQueueV2Container sets the optional container attribute to value.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this queue is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this queue will be shared under the given name
 // across multiple sessions.
 // If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// A queue that randomizes the order of elements.
 //
 // Arguments:
 //	component_types: The type of each component in a value.
 //
 // Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -33293,7 +37103,7 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
+		Type: "RandomShuffleQueueV2",
 
 		Attrs: attrs,
 	}
@@ -33301,230 +37111,38 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
-// Deserializes a proto into the tree handle
-//
-// Arguments:
-//	tree_handle: Handle to the tree resource to be restored.
-//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
-//
-// Returns the created operation.
-func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorForestTreeDeserialize",
-		Input: []tf.Input{
-			tree_handle, tree_config,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Constructs an Optional variant from a tuple of tensors.
-func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalFromValue",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
-type DecodeProtoV2Attr func(optionalAttr)
-
-// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
-//
-// value: Either the special value `local://` or a path to a file containing
-// a serialized `FileDescriptorSet`.
-// If not specified, defaults to "local://"
-func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
-//
-// value: Either `binary` or `text`.
-// If not specified, defaults to "binary"
-func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["message_format"] = value
-	}
-}
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
 
-// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// value: Whether to sanitize the result or not.
-// If not specified, defaults to false
-func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
 	return func(m optionalAttr) {
-		m["sanitize"] = value
+		m["out_type"] = value
 	}
 }
 
-// The op extracts fields from a serialized protocol buffers message into tensors.
-//
-// The `decode_proto` op extracts fields from a serialized protocol buffers
-// message into tensors.  The fields in `field_names` are decoded and converted
-// to the corresponding `output_types` if possible.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// Each output tensor is a dense tensor. This means that it is padded to
-// hold the largest number of repeated elements seen in the input
-// minibatch. (The shape is also padded by one to prevent zero-sized
-// dimensions). The actual repeat counts for each example in the
-// minibatch can be found in the `sizes` output. In many cases the output
-// of `decode_proto` is fed immediately into tf.squeeze if missing values
-// are not a concern. When using tf.squeeze, always pass the squeeze
-// dimension explicitly to avoid surprises.
-//
-// For the most part, the mapping between Proto field types and
-// TensorFlow dtypes is straightforward. However, there are a few
-// special cases:
-//
-// - A proto field that contains a submessage or group can only be converted
-// to `DT_STRING` (the serialized submessage). This is to reduce the
-// complexity of the API. The resulting string can be used as input
-// to another instance of the decode_proto op.
-//
-// - TensorFlow lacks support for unsigned integers. The ops represent uint64
-// types as a `DT_INT64` with the same twos-complement bit pattern
-// (the obvious way). Unsigned int32 values can be represented exactly by
-// specifying type `DT_INT64`, or using twos-complement if the caller
-// specifies `DT_INT32` in the `output_types` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// Both binary and text proto serializations are supported, and can be
-// chosen using the `format` attribute.
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
-//	bytes: Tensor of serialized protos with shape `batch_shape`.
-//	message_type: Name of the proto message type to decode.
-//	field_names: List of strings containing proto field names.
-//	output_types: List of TF types to use for the respective field in field_names.
-//
-// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-// Each entry is the number of values found for the corresponding field.
-// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
-// `values[i]` has datatype `output_types[i]`
-// and shape `[batch_shape, max(sizes[...,i])]`.
-func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeProtoV2",
-		Input: []tf.Input{
-			bytes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	sizes = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("DecodeProtoV2", err)
-		return
-	}
-	return sizes, values
-}
-
-// Creates an Optional variant with no value.
-func OptionalNone(scope *Scope) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalNone",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns true if and only if the given Optional variant has a value.
-func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalHasValue",
-		Input: []tf.Input{
-			optional,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the value stored in an Optional variant or raises an error if none exists.
-func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "OptionalGetValue",
-		Input: []tf.Input{
-			optional,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("OptionalGetValue", err)
-		return
-	}
-	return components
-}
-
-// Gets the next output from the given iterator as an Optional variant.
-func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IteratorGetNextAsOptional",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			iterator,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -33532,482 +37150,361 @@ func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []
 	return op.Output(0)
 }
 
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
-//
-// Arguments:
-//	input: A complex tensor.
+// Draw bounding boxes on a batch of images.
 //
-// Returns A complex tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Identity transformation that models performance.
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
 //
-// Identity transformation that models performance.
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ModelDataset",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			input_dataset,
+			images, boxes,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
-		Input: []tf.Input{
-			input, paddings, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
-// If not specified, defaults to <>
-func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
+		m["seed2"] = value
 	}
 }
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["aspect_ratio_range"] = value
 	}
 }
 
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["area_range"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// For example, if each `indices[m]` is scalar or vector, we have
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
 //
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-//     merged.shape = [max(indices)] + constant
+// For example,
 //
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// For example:
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			image_size, bounding_boxes, min_object_covered,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// Computes requantization range per channel.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	clip_value_max: The maximum value of the output that needs to be clipped.
+// Example: set this to 6 for Relu6.
+//
+// Returns The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"clip_value_max": clip_value_max}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "RequantizationRangePerChannel",
 		Input: []tf.Input{
-			x, y,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
 
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+// ExtractGlimpseCentered sets the optional centered attribute to value.
+//
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["centered"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayGatherV3
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
-		Input: []tf.Input{
-			handle, indices, flow_in,
-		},
-		Attrs: attrs,
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
+	}
+}
+
+// ExtractGlimpseNoise sets the optional noise attribute to value.
 //
-//     merged.shape = [max(indices)] + constant
+// value: indicates if the noise should `uniform`, `gaussian`, or
+// `zero`. The default is `uniform` which means the the noise type
+// will be decided by `uniform_noise`.
+// If not specified, defaults to "uniform"
+func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["noise"] = value
+	}
+}
+
+// Extracts a glimpse from the input tensor.
 //
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
 //
-// For example:
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// The argument `normalized` and `centered` controls how the windows are built:
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Arguments:
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input, size, offsets,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
-//
-// value: The type of each component in a value.
-// If not specified, defaults to <>
+// A container for an iterator resource.
 //
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
 
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A queue that produces elements sorted by the first component value.
-//
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
-//
-// Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-//
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
+		Type: "TensorForestTreeResourceHandleOp",
 
 		Attrs: attrs,
 	}
@@ -34015,164 +37512,173 @@ func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV
 	return op.Output(0)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["method"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the Bessel i0e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i0(x)`.
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI0e",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
 
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Arguments:
 //
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			handle, n,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "FFT3D",
+		Input: []tf.Input{
+			input,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["method"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
-//
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
-//
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -34181,9 +37687,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			input,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -34191,128 +37697,198 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// A dataset that creates window datasets from the input dataset.
-//
-// Arguments:
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "WindowDataset",
+		Type: "NonMaxSuppressionV3",
 		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayCloseV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
+
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
 //
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
+// If not specified, defaults to false
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+	return func(m optionalAttr) {
+		m["pad_to_max_output_size"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "NonMaxSuppressionV4",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Writes the given dataset to the given file using the TFRecord format.
+// Removes keys and its associated values from a table.
+//
+// The tensor `keys` must of the same type as the keys of the table. Keys not
+// already in the table are silently ignored.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
 //
 // Returns the created operation.
-func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDatasetToTFRecord",
+		Type: "LookupTableRemoveV2",
 		Input: []tf.Input{
-			input_dataset, filename, compression_type,
+			table_handle, keys,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression.
+type CombinedNonMaxSuppressionAttr func(optionalAttr)
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// CombinedNonMaxSuppressionPadPerClass sets the optional pad_per_class attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
+// value: If false, the output nmsed boxes, scores and classes
+// are padded/clipped to `max_total_size`. If true, the
+// output nmsed boxes, scores and classes are padded to be of length
+// `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+// which case it is clipped to `max_total_size`. Defaults to false.
 // If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+func CombinedNonMaxSuppressionPadPerClass(value bool) CombinedNonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["pad_per_class"] = value
 	}
 }
 
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Arguments:
-//	handle: The handle to a queue.
+// This operation performs non_max_suppression on the inputs per batch, across
+// all classes.
+// Prunes away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system. Also note that
+// this algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is the final boxes, scores and classes tensor
+// returned after performing non_max_suppression.
+//
+// Arguments:
+//	boxes: A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then
+// same boxes are used for all classes otherwise, if `q` is equal to number of
+// classes, class-specific boxes are used.
+//	scores: A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
+// representing a single score corresponding to each box (each row of boxes).
+//	max_output_size_per_class: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression per class
+//	max_total_size: A scalar representing maximum number of boxes retained over all classes.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+// Returns A [batch_size, max_detections, 4] float32 tensor
+// containing the non-max suppressed boxes.A [batch_size, max_detections] float32 tensor
+// containing the scores for the boxes.A [batch_size, max_detections] float32 tensor
+// containing the classes for the boxes.A [batch_size] int32 tensor indicating the number of
+// valid detections per batch item. Only the top num_detections[i] entries in
+// nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+// entries are zero paddings.
+func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size_per_class tf.Output, max_total_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...CombinedNonMaxSuppressionAttr) (nmsed_boxes tf.Output, nmsed_scores tf.Output, nmsed_classes tf.Output, valid_detections tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -34321,106 +37897,99 @@ func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "CombinedNonMaxSuppression",
 		Input: []tf.Input{
-			handle,
+			boxes, scores, max_output_size_per_class, max_total_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Returns true if queue is closed.
+// Computes the matrix logarithm of one or more square matrices:
 //
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+//
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
+//
+// Returns     \"Fake\" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "FakeParam",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
+// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
-}
-
-// A stack that produces elements in first-in last-out order.
+// This operation returns the same result as the C++ std::nextafter function.
 //
-// Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+// It can also return a subnormal number.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// @compatibility(cpp)
+// Equivalent to C++ std::nextafter function.
+// @end_compatibility
+func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "NextAfter",
 		Input: []tf.Input{
-			max_size,
+			x1, x2,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -34503,6 +38072,45 @@ func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf
 	return scope.AddOperation(opspec)
 }
 
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // RpcAttr is an optional argument to Rpc.
 type RpcAttr func(optionalAttr)
 
@@ -34639,38 +38247,24 @@ func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output
 	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			handle, elem,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -34678,239 +38272,308 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["seed"] = value
 	}
 }
 
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// Outputs random integers from a uniform distribution.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniformInt",
+		Input: []tf.Input{
+			shape, minval, maxval,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradient for batch normalization.
+// Add the quantile summaries to each quantile stream resource.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Input: []tf.Input{
+			quantile_stream_resource_handle, tf.OutputList(summaries),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a Dataset that returns pseudorandom numbers.
+//
+// Arguments:
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "ExperimentalRandomDataset",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a TensorArray for storing multiple gradients of values in the given handle.
-//
-// Similar to TensorArrayGradV3. However it creates an accumulator with an
-// expanded shape compared to the input TensorArray whose gradient is being
-// computed. This enables multiple gradients for the same TensorArray to be
-// calculated using the same accumulator.
+// Creates a dataset that overrides the maximum intra-op parallelism.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
-// have shape which is this shape_to_prepend value concatenated with shape of the
-// elements in the TensorArray corresponding to the input handle.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradWithShape",
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
 		Input: []tf.Input{
-			handle, flow_in, shape_to_prepend,
+			input_dataset, max_intra_op_parallelism,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
+
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
 //
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+	return func(m optionalAttr) {
+		m["maxsplit"] = value
+	}
+}
+
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
 //
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
 // ```
 //
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
 //
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// Note that the above mentioned behavior matches python's str.split.
 //
 // Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
-//
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "StringSplitV2",
 		Input: []tf.Input{
-			input, threshold,
+			input, sep,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Push an element onto the tensor_array.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
+//
+//
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "ExperimentalThreadPoolDataset",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			input_dataset, thread_pool,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
-//
-// `indices` must be a vector, its length must match the first dim of `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "Softsign",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EmptyAttr is an optional argument to Empty.
-type EmptyAttr func(optionalAttr)
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
 
-// EmptyInit sets the optional init attribute to value.
-//
-// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
-// If not specified, defaults to false
-func EmptyInit(value bool) EmptyAttr {
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
 	return func(m optionalAttr) {
-		m["init"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Creates a tensor with the given shape.
+// The op serializes protobuf messages provided in the input tensors.
 //
-// This operation creates a tensor of `shape` and `dtype`.
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
 //
-// Arguments:
-//	shape: 1-D. Represents the shape of the output tensor.
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
 //
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
 //
-// Returns A `Tensor` of type `T`.
-func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
+//
+// Arguments:
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
+//
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Empty",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			shape,
+			sizes, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -34918,285 +38581,419 @@ func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAt
 	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
+// Creates an Optional variant with no value.
+func OptionalNone(scope *Scope) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalNone",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
+
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
 //
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Concat the elements from the TensorArray into value `value`.
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
 //
-// Takes `T` elements of shapes
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
+	}
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
 //
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
 //
-// and concatenates them into a Tensor of shape:
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
 //
-// All elements must have the same shape (excepting the first dimension).
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names. An extension field can be decoded
+// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
+//	output_types: List of TF types to use for the respective field in field_names.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "DecodeProtoV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
+		return
+	}
+	return sizes, values
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
-//
-//   ```(n0, n1, ..., n(T-1))```
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorSliceDataset",
+		Input: []tf.Input{
+			indices, values, dense_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise for real types.
 //
-// and that `value` has shape
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ConcatenateDataset",
+		Input: []tf.Input{
+			input_dataset, another_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// this splits values into a TensorArray with T tensors.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-// TensorArray index t will be the subtensor of values with starting position
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
 //
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
 //
-// and having size
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
 //
-//   ```nt x d0 x d1 x ...```
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
+//
+// pseudorandomly.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SeluGrad",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			gradients, outputs,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
+// Creates a dataset that caches elements from `input_dataset`.
+//
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			handle, flow_in,
+			input_dataset, filename,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Creates a dataset that emits the records from one or more binary files.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			handle, flow_in,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// Gradients for batch normalization.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			t, m, v, gamma, backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
+// Arguments:
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
+	opspec := tf.OpSpec{
+		Type: "TFRecordDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AsStringShortest sets the optional shortest attribute to value.
-//
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
 
-// AsStringWidth sets the optional width attribute to value.
-//
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
-		m["width"] = value
+		m["container"] = value
 	}
 }
 
-// AsStringFill sets the optional fill attribute to value.
-//
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
-		m["fill"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Converts each entry in the given tensor to strings.  Supports many numeric
-//
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -35205,363 +39002,191 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "ExperimentalStatsAggregatorHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayScatterV3
+// A container for an iterator resource.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator" or
+// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
+// resource sharing by name, and does not keep a reference to the resource
+// container.
+func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
-		Input: []tf.Input{
-			handle, indices, value, flow_in,
-		},
+		Type: "AnonymousIterator",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
-//
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// Adjust the contrast of one or more images.
 //
-// The resulting value `output` would look like this:
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// Contrast is adjusted independently for each channel of each image.
 //
-// See `tf.scatter_nd` for more details about how to make updates to slices.
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			input, indices, updates,
+			images, contrast_factor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Gets the next output from the given iterator .
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
 	}
-}
-
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
 	}
+	return components
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
+// Outputs the single element from the given dataset.
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
 //
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "DatasetToSingleElement",
 		Input: []tf.Input{
-			value,
+			dataset,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a MultiDeviceIterator resource.
-//
-// Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
-		Attrs: attrs,
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Deprecated. Use TensorArraySizeV3
+// Converts the given `resource_handle` representing an iterator to a string.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			handle, flow_in,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
-
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
-// dimension, the amount of padding inserted before and after the dimension is
-// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
-// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
 // If not specified, defaults to <>
-func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["explicit_paddings"] = value
-	}
-}
-
-// Conv2DDataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["output_types"] = value
 	}
 }
 
-// Conv2DDilations sets the optional dilations attribute to value.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
-//
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+//	string_handle: A string representation of the given handle.
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "IteratorFromStringHandle",
 		Input: []tf.Input{
-			input, filter,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index af5503f2ad308fffb03d2ebd5964eec273896c72..d70e0d6c0ab2cfefb55ef04ad67b2a74f83ac48d 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -362,7 +362,8 @@ tf_cc_test(
 filegroup(
     name = "libtensorflow_jni",
     srcs = select({
-        "//tensorflow:darwin": [":libtensorflow_jni.dylib"],
+        "//tensorflow:windows": [":tensorflow_jni.dll"],
+        "//tensorflow:macos": [":libtensorflow_jni.dylib"],
         "//conditions:default": [":libtensorflow_jni.so"],
     }),
     visibility = ["//visibility:public"],
@@ -373,26 +374,25 @@ LINKER_VERSION_SCRIPT = ":config/version_script.lds"
 LINKER_EXPORTED_SYMBOLS = ":config/exported_symbols.lds"
 
 tf_cc_binary(
-    name = "libtensorflow_jni.so",
+    name = "tensorflow_jni",
     # Set linker options to strip out anything except the JNI
     # symbols from the library. This reduces the size of the library
     # considerably (~50% as of January 2017).
     linkopts = select({
         "//tensorflow:debug": [],  # Disable all custom linker options in debug mode
-        "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by LINKER_EXPORTED_SYMBOLS
-            "$(location {})".format(LINKER_EXPORTED_SYMBOLS),
+        "//tensorflow:macos": [
+            "-Wl,-exported_symbols_list,$(location {})".format(LINKER_EXPORTED_SYMBOLS),
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
             "-s",
-            "-Wl,--version-script",  #  This line must be directly followed by LINKER_VERSION_SCRIPT
-            "$(location {})".format(LINKER_VERSION_SCRIPT),
+            "-Wl,--version-script,$(location {})".format(LINKER_VERSION_SCRIPT),
         ],
     }),
     linkshared = 1,
     linkstatic = 1,
+    per_os_targets = True,
     deps = [
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/java/src/main/native",
@@ -414,14 +414,3 @@ tf_cc_binary(
     srcs = ["generate_pom.cc"],
     deps = ["//tensorflow/c:c_api"],
 )
-
-# System.loadLibrary() on OS X looks for ".dylib" or ".jnilib"
-# and no ".so". If and when https://github.com/bazelbuild/bazel/issues/914
-# is resolved, perhaps this workaround rule can be removed.
-genrule(
-    name = "darwin-compat",
-    srcs = [":libtensorflow_jni.so"],
-    outs = ["libtensorflow_jni.dylib"],
-    cmd = "cp $< $@",
-    output_to_bindir = 1,
-)
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 951e8bdd0dd8aae46a361a8ffcff276579433641..4206f6f9fc8ed029d1a7d9b044dd079ec523de31 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -20,13 +20,13 @@
 Releases built from release branches are available on Maven Central.
 Additionally, every day binaries are built from the `master` branch on GitHub:
 
-- [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
-- [Sourc JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
-- JNI:
-  - [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
-  - [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
-  - [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
-  - Windows: (No nightly builds available yet)
+-   [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
+-   [Source JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
+-   JNI:
+    -   [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
+    -   [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
+    -   [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
+    -   Windows: (No nightly builds available yet)
 
 ## Building from source
 
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index e1916ca4d9d6aa179e1a69451a5e981783560026..f423cc4d8277509d45aa8344e322f71b7f1306a8 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -18,7 +18,7 @@ XLINT_OPTS = [
     "-Xlint:-processing",
     "-Xlint:-serial",
     "-Xlint:-try",
-    "-Xlint:-classfile", # see b/32750402, go/javac-warnings#classfile
+    "-Xlint:-classfile",  # see b/32750402, go/javac-warnings#classfile
 ]
 
 # The bazel errorprone plugin currently only enables default errorChecks
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index db3a3609f1ac4fda18ff5a1248e61c675a8bf9f9..7e04af42be0086bbc510bf07096f65d0c3a9ddda 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 53f7a2d63ef5bc8cfe4fbe372cf2fd3f58a0fe33..8a4d471fa401cf031c2fe5b494aa55744fc882a4 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index a17724c805e38239c61dd27a5cc9ec918bbb2e0f..2e2625641dd8f7eb61d17f52396d7d957ecce2d6 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 30831f90b9f7b4beb5ae3f2ceebadcb6e1f8771e..2cd600541d09994437438a86877f1ec98922fcb2 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.12.0</version>
+  <version>1.13.0-rc2</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index dd6b52be62487ba6cb989b4917a15df7f473a848..b79ee472c378e36ed795adb86cf24e19635f1c58 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index f47c11809d58464953028c388d491b91f67c3510..70091ad1571631ffd200550e35dba39025dbbca2 100644
--- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index 11aaba983f6ded9a6e757703fd9a2411db82ceb6..94d6801cda89670d784f6e2c4c6779d9c3eb39c3 100644
--- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 07fcfa5144600f7d9cbf6edbfbecbecc7c115631..5f78f89b93b984e561dfc98b6cb4e8a8a3314b72 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc2</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index 5d6387e88e96802e9226774abd391ac2dd673143..db6116bd5c843c2846d6b9f67e253e87db6daffc 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -516,7 +516,7 @@ bool CanGenerateOp(const OpDef& op_def, const ApiDef& api_def) {
     return false;
   }
   for (const auto& attr : op_def.attr()) {
-    if (attr.type() == "func") {
+    if (attr.type() == "func" || attr.type() == "list(func)") {
       return false;  // TODO(karllessard) add support for function attributes
     }
   }
diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl
index f4ff34ea0361fba5528126b93f3f6e45289d8df2..b46721a93dcbd105dea7c52e8ea615cbd00af1c8 100644
--- a/tensorflow/java/src/gen/gen_ops.bzl
+++ b/tensorflow/java/src/gen/gen_ops.bzl
@@ -17,46 +17,48 @@ load(
 # and then archive those source files into
 #     ops/gen_sources.srcjar
 #
-def tf_java_op_gen_srcjar(name,
-                          gen_tool,
-                          base_package,
-                          api_def_srcs=[],
-                          out_dir="ops/",
-                          out_src_dir="src/main/java/",
-                          visibility=["//tensorflow/java:__pkg__"]):
+def tf_java_op_gen_srcjar(
+        name,
+        gen_tool,
+        base_package,
+        api_def_srcs = [],
+        out_dir = "ops/",
+        out_src_dir = "src/main/java/",
+        visibility = ["//tensorflow/java:__pkg__"]):
+    gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
+    srcs = api_def_srcs[:]
 
-  gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
-  srcs = api_def_srcs[:]
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                "$$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
 
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
+    gen_cmds += ["$(location " + gen_tool + ")" +
+                 " --output_dir=$(@D)/" + out_src_dir +
+                 " --base_package=" + base_package +
+                 " --api_dirs=" + api_def_args_str]
 
-  gen_cmds += ["$(location " + gen_tool + ")" +
-               " --output_dir=$(@D)/" + out_src_dir +
-               " --base_package=" + base_package +
-               " --api_dirs=" + api_def_args_str]
+    # Generate a source archive containing generated code for these ops.
+    gen_srcjar = out_dir + name + ".srcjar"
+    gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
 
-  # Generate a source archive containing generated code for these ops.
-  gen_srcjar = out_dir + name + ".srcjar"
-  gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
-
-  native.genrule(
-      name=name,
-      srcs=srcs,
-      outs=[gen_srcjar],
-      tools=[
-          "@local_jdk//:jar",
-          "@local_jdk//:jdk",
-          gen_tool
-      ] + tf_binary_additional_srcs(),
-      cmd=" && ".join(gen_cmds))
+    native.genrule(
+        name = name,
+        srcs = srcs,
+        outs = [gen_srcjar],
+        tools = [
+            "@local_jdk//:jar",
+            "@local_jdk//:jdk",
+            gen_tool,
+        ] + tf_binary_additional_srcs(),
+        cmd = " && ".join(gen_cmds),
+    )
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index df1426ad75143d720f1d5bd3cf4ce44d30cb226e..c668d4dda8cf729b2fd70644eb2a2cd38ae0923b 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -357,10 +357,10 @@ public final class OperatorProcessor extends AbstractProcessor {
                     + "  // Optional attributes\n"
                     + "  ops.math().matMul(a, b, MatMul.transposeA(true));\n"
                     + "  // Naming operators\n"
-                    + "  ops.withName(“foo”).constant(5); // name “foo”\n"
+                    + "  ops.withName(\"foo\").constant(5); // name \"foo\"\n"
                     + "  // Names can exist in a hierarchy\n"
-                    + "  Ops sub = ops.withSubScope(“sub”);\n"
-                    + "  sub.withName(“bar”).constant(4); // “sub/bar”\n"
+                    + "  Ops sub = ops.withSubScope(\"sub\");\n"
+                    + "  sub.withName(\"bar\").constant(4); // \"sub/bar\"\n"
                     + "}\n"
                     + "}</pre>\n",
                 T_GRAPH,
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Graph.java b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
index 752b49af040268d7e3355b12e4ae6aae310789bd..d5dae187197347a5a77b3c9d819321f7f58841e7 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Graph.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
@@ -235,7 +235,116 @@ public final class Graph implements AutoCloseable {
   public Output<?>[] addGradients(Output<?> y, Output<?>[] x) {
     return addGradients(null, new Output<?>[] {y}, x, null);
   }
-  
+
+  /**
+   * Used to instantiate an abstract class which overrides the buildSubgraph method to build a
+   * conditional or body subgraph for a while loop. After Java 8, this can alternatively be used to
+   * create a lambda for the same purpose.
+   *
+   * <p>To be used when calling {@link #whileLoop(Output[],
+   * org.tensorflow.Graph.WhileSubgraphBuilder, org.tensorflow.Graph.WhileSubgraphBuilder, String)}
+   *
+   * <p>Example usage (prior to Java 8):
+   *
+   * <p>{@code WhileSubgraphBuilder bodyGraphBuilder = new WhileSubgraphBuilder() { @Override public
+   * void buildSubgraph(Graph bodyGraph, Output<?>[] bodyInputs, Output<?>[] bodyOutputs) { // build
+   * body subgraph } }; }
+   *
+   * <p>Example usage (after Java 8):
+   *
+   * <p>{@code WhileSubgraphBuilder bodyGraphBuilder = (bodyGraph, bodyInputs, bodyOutputs) -> { //
+   * build body subgraph };}
+   */
+  public interface WhileSubgraphBuilder {
+    /**
+     * To be overridden by user with code to build conditional or body subgraph for a while loop
+     *
+     * @param g the subgraph
+     * @param inputs subgraph inputs
+     * @param outputs subgraph outputs
+     */
+    public void buildSubgraph(Graph g, Output<?>[] inputs, Output<?>[] outputs);
+  }
+
+  // called by while loop code in graph_jni.cc to construct conditional/body subgraphs
+  private static long[] buildSubgraph(
+      WhileSubgraphBuilder subgraphBuilder,
+      long subgraphHandle,
+      long[] inputHandles,
+      int[] inputIndices,
+      long[] outputHandles,
+      int[] outputIndices) {
+    Graph subgraph = new Graph(subgraphHandle);
+
+    int ninputs = inputHandles.length;
+    int noutputs = outputHandles.length;
+    Output<?>[] inputs = new Output<?>[ninputs];
+    Output<?>[] outputs = new Output<?>[noutputs];
+    long[] outputHandlesAndIndices = new long[noutputs * 2];
+
+    synchronized (subgraph.nativeHandleLock) {
+      try (Reference ref = subgraph.ref()) {
+
+        for (int i = 0; i < ninputs; i++) {
+          Operation op = new Operation(subgraph, inputHandles[i]);
+          inputs[i] = new Output<>(op, inputIndices[i]);
+        }
+
+        for (int i = 0; i < noutputs; i++) {
+          Operation op = new Operation(subgraph, outputHandles[i]);
+          outputs[i] = new Output<>(op, outputIndices[i]);
+        }
+
+        subgraphBuilder.buildSubgraph(subgraph, inputs, outputs);
+
+        for (int i = 0, j = noutputs; i < noutputs; i++, j++) {
+          outputHandlesAndIndices[i] = outputs[i].op().getUnsafeNativeHandle();
+          outputHandlesAndIndices[j] = (long) outputs[i].index();
+        }
+      }
+      return outputHandlesAndIndices;
+    }
+  }
+
+  /**
+   * Builds a while loop.
+   *
+   * @param inputs the loop inputs
+   * @param cgBuilder WhileSubgraphBuilder to build the conditional subgraph
+   * @param bgBuilder WhileSubgraphBuilder to build the body subgraph
+   * @param name name for the loop
+   * @return list of loop outputs, of the same length as {@code inputs}
+   */
+  public Output<?>[] whileLoop(
+      Output<?>[] inputs,
+      WhileSubgraphBuilder cgBuilder,
+      WhileSubgraphBuilder bgBuilder,
+      String name) {
+    int ninputs = inputs.length;
+    long[] inputHandles = new long[ninputs];
+    int[] inputIndices = new int[ninputs];
+    Output<?>[] outputs = new Output<?>[ninputs];
+
+    synchronized (nativeHandleLock) {
+      try (Reference ref = ref()) {
+
+        for (int i = 0; i < ninputs; i++) {
+          inputHandles[i] = inputs[i].op().getUnsafeNativeHandle();
+          inputIndices[i] = inputs[i].index();
+        }
+
+        long[] outputHandlesAndIndices =
+            whileLoop(nativeHandle, inputHandles, inputIndices, name, cgBuilder, bgBuilder);
+
+        for (int i = 0, j = ninputs; i < ninputs; ++i, ++j) {
+          Operation op = new Operation(this, outputHandlesAndIndices[i]);
+          outputs[i] = new Output<>(op, (int) outputHandlesAndIndices[j]);
+        }
+      }
+      return outputs;
+    }
+  }
+
   private final Object nativeHandleLock = new Object();
   private long nativeHandle;
   private int refcount = 0;
@@ -357,6 +466,14 @@ public final class Graph implements AutoCloseable {
       long[] gradInputHandles,
       int[] gradInputIndices);
 
+  private static native long[] whileLoop(
+      long handle,
+      long[] inputHandles,
+      int[] inputIndices,
+      String name,
+      WhileSubgraphBuilder condGraphBuilder,
+      WhileSubgraphBuilder bodyGraphBuilder);
+
   static {
     TensorFlow.init();
   }
diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD
index 49348daa94ed04990a657922a0fbb515b7721d82..ff3b4102013aff3c63cc0b6ac9d17411194a3783 100644
--- a/tensorflow/java/src/main/native/BUILD
+++ b/tensorflow/java/src/main/native/BUILD
@@ -33,13 +33,12 @@ tf_cuda_library(
         "//tensorflow:android": [],
         "//conditions:default": ["."],
     }),
-    deps = [
-        "//tensorflow/c:c_api",
-    ] + select({
+    deps = select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib",
         ],
         "//conditions:default": [
+            "//tensorflow/c:c_api",
             "//tensorflow/core:all_kernels",
             "//tensorflow/core:direct_session",
             "//tensorflow/core:ops",
@@ -68,7 +67,7 @@ genrule(
     name = "copy_jni_md_h",
     srcs = select({
         "//tensorflow:windows": ["@bazel_tools//tools/jdk:jni_md_header-windows"],
-        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//tensorflow:macos": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
         "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
     }),
     outs = ["jni_md.h"],
diff --git a/tensorflow/java/src/main/native/graph_jni.cc b/tensorflow/java/src/main/native/graph_jni.cc
index f1744d87693ae8f43c032b24622aaecb41a30cb2..570ba8ac1074ace63f722a1af385a72e2d320b8d 100644
--- a/tensorflow/java/src/main/native/graph_jni.cc
+++ b/tensorflow/java/src/main/native/graph_jni.cc
@@ -18,19 +18,28 @@ limitations under the License.
 #include <limits>
 #include <memory>
 #include "tensorflow/c/c_api.h"
-#include "tensorflow/java/src/main/native/utils_jni.h"
 #include "tensorflow/java/src/main/native/exception_jni.h"
+#include "tensorflow/java/src/main/native/utils_jni.h"
 
 namespace {
-TF_Graph* requireHandle(JNIEnv* env, jlong handle) {
-  static_assert(sizeof(jlong) >= sizeof(TF_Graph*),
+template <class T>
+T* requireHandleImpl(JNIEnv* env, jlong handle) {
+  static_assert(sizeof(jlong) >= sizeof(T*),
                 "Cannot package C object pointers as a Java long");
   if (handle == 0) {
     throwException(env, kIllegalStateException,
                    "close() has been called on the Graph");
     return nullptr;
   }
-  return reinterpret_cast<TF_Graph*>(handle);
+  return reinterpret_cast<T*>(handle);
+}
+
+TF_Graph* requireHandle(JNIEnv* env, jlong handle) {
+  return requireHandleImpl<TF_Graph>(env, handle);
+}
+
+TF_Operation* requireOperationHandle(JNIEnv* env, jlong handle) {
+  return requireHandleImpl<TF_Operation>(env, handle);
 }
 }  // namespace
 
@@ -56,10 +65,8 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_operation(JNIEnv* env,
   return reinterpret_cast<jlong>(op);
 }
 
-JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_nextOperation(JNIEnv* env,
-                                                                     jclass clazz,
-                                                                     jlong handle,
-                                                                     jint position) {
+JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_nextOperation(
+    JNIEnv* env, jclass clazz, jlong handle, jint position) {
   TF_Graph* g = requireHandle(env, handle);
   if (g == nullptr) return nullptr;
 
@@ -189,3 +196,140 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_addGradients(
 
   return dy_handles_and_indices;
 }
+
+// helper function for while loop -- constructs conditional or body subgraph
+jlongArray buildSubgraph(JNIEnv* env, jclass clazz, jobject subgraph_builder,
+                         TF_Graph* const subgraph,
+                         const TF_Output* const inputs,
+                         const TF_Output* const outputs, const int ninputs,
+                         const int noutputs) {
+  jmethodID build_subgraph_method_id = env->GetStaticMethodID(
+      clazz, "buildSubgraph",
+      "(Lorg/tensorflow/Graph$WhileSubgraphBuilder;J[J[I[J[I)[J");
+  if (build_subgraph_method_id == 0) return nullptr;
+
+  jlong subgraph_handle = reinterpret_cast<jlong>(subgraph);
+
+  jlongArray input_handles = env->NewLongArray(ninputs);
+  jintArray input_indices = env->NewIntArray(ninputs);
+  jlongArray output_handles = env->NewLongArray(noutputs);
+  jintArray output_indices = env->NewIntArray(noutputs);
+
+  jlong* input_handles_elems =
+      env->GetLongArrayElements(input_handles, nullptr);
+  jint* input_indices_elems = env->GetIntArrayElements(input_indices, nullptr);
+  jlong* output_handles_elems =
+      env->GetLongArrayElements(output_handles, nullptr);
+  jint* output_indices_elems =
+      env->GetIntArrayElements(output_indices, nullptr);
+
+  for (int i = 0; i < ninputs; ++i) {
+    input_handles_elems[i] = reinterpret_cast<jlong>((inputs[i]).oper);
+    input_indices_elems[i] = static_cast<jint>((inputs[i]).index);
+  }
+
+  for (int i = 0; i < noutputs; ++i) {
+    output_handles_elems[i] = reinterpret_cast<jlong>((outputs[i]).oper);
+    output_indices_elems[i] = static_cast<jint>((outputs[i]).index);
+  }
+
+  env->ReleaseLongArrayElements(input_handles, input_handles_elems, 0);
+  env->ReleaseIntArrayElements(input_indices, input_indices_elems, 0);
+  env->ReleaseLongArrayElements(output_handles, output_handles_elems, 0);
+  env->ReleaseIntArrayElements(output_indices, output_indices_elems, 0);
+
+  // call Java code to construct the subgraph
+  jlongArray output_handles_and_indices =
+      (jlongArray)env->CallStaticObjectMethod(
+          clazz, build_subgraph_method_id, subgraph_builder, subgraph_handle,
+          input_handles, input_indices, output_handles, output_indices);
+
+  if (env->ExceptionOccurred()) {
+    env->ExceptionDescribe();
+    return nullptr;
+  }
+
+  // returned array contains both op handles and output indices, in pair
+  return output_handles_and_indices;
+}
+
+JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_whileLoop(
+    JNIEnv* env, jclass clazz, jlong handle, jlongArray input_handles,
+    jintArray input_indices, jstring name, jobject cond_graph_builder,
+    jobject body_graph_builder) {
+  TF_Graph* g = requireHandle(env, handle);
+  TF_Status* status = TF_NewStatus();
+  if (g == nullptr) return nullptr;
+
+  int ninputs = env->GetArrayLength(input_handles);
+
+  std::unique_ptr<TF_Output[]> inputs(new TF_Output[ninputs]);
+  resolveOutputs(env, "inputs", input_handles, input_indices, inputs.get(),
+                 ninputs);
+  if (env->ExceptionCheck()) return nullptr;
+
+  // initialize while params
+  TF_WhileParams params = TF_NewWhile(g, inputs.get(), ninputs, status);
+  throwExceptionIfNotOK(env, status);
+
+  // build conditional subgraph
+  jlongArray cond_output_handles_and_indices =
+      buildSubgraph(env, clazz, cond_graph_builder, params.cond_graph,
+                    params.cond_inputs, &params.cond_output, params.ninputs, 1);
+
+  // build body subgraph
+  jlongArray body_output_handles_and_indices = buildSubgraph(
+      env, clazz, body_graph_builder, params.body_graph, params.body_inputs,
+      params.body_outputs, params.ninputs, params.ninputs);
+
+  if (cond_output_handles_and_indices == nullptr ||
+      body_output_handles_and_indices == nullptr)
+    return nullptr;
+
+  // set cond_output param to output of the conditional subgraph
+  jlong* cond_output_elems =
+      env->GetLongArrayElements(cond_output_handles_and_indices, nullptr);
+  TF_Operation* cond_output_op =
+      requireOperationHandle(env, cond_output_elems[0]);
+  params.cond_output = {cond_output_op,
+                        static_cast<jint>(cond_output_elems[1])};
+  env->ReleaseLongArrayElements(cond_output_handles_and_indices,
+                                cond_output_elems, 0);
+
+  // set body_outputs param to outputs of the body subgraph
+  jlong* body_output_elems =
+      env->GetLongArrayElements(body_output_handles_and_indices, nullptr);
+  for (int i = 0, j = ninputs; i < ninputs; ++i, ++j) {
+    TF_Operation* body_output_op =
+        requireOperationHandle(env, body_output_elems[i]);
+    params.body_outputs[i] = {body_output_op,
+                              static_cast<jint>(body_output_elems[j])};
+  }
+  env->ReleaseLongArrayElements(body_output_handles_and_indices,
+                                body_output_elems, 0);
+
+  // set loop name param
+  params.name = env->GetStringUTFChars(name, 0);
+
+  // build the while loop, storing loop outputs in `outputs`
+  std::unique_ptr<TF_Output[]> outputs(new TF_Output[ninputs]);
+  TF_FinishWhile(&params, status, outputs.get());
+
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+
+  env->ReleaseStringUTFChars(name, params.name);
+
+  // returned array contains both op handles and output indices, in pair
+  jlongArray output_handles_and_indices = env->NewLongArray(ninputs * 2);
+  jlong* output_elems =
+      env->GetLongArrayElements(output_handles_and_indices, nullptr);
+  for (int i = 0, j = ninputs; i < ninputs; ++i, ++j) {
+    TF_Output output = outputs.get()[i];
+    output_elems[i] = reinterpret_cast<jlong>(output.oper);
+    output_elems[j] = static_cast<jlong>(output.index);
+  }
+  env->ReleaseLongArrayElements(output_handles_and_indices, output_elems, 0);
+
+  return output_handles_and_indices;
+}
diff --git a/tensorflow/java/src/main/native/graph_jni.h b/tensorflow/java/src/main/native/graph_jni.h
index efed23f83b6265e4df37cd8b35ce45576c415c43..4281297dca250604e3baf04c35af254fb126b888 100644
--- a/tensorflow/java/src/main/native/graph_jni.h
+++ b/tensorflow/java/src/main/native/graph_jni.h
@@ -51,8 +51,9 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Graph_operation(JNIEnv *, jclass,
  * Signature: (JI)[J
  */
 JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_nextOperation(JNIEnv *,
-								     jclass, jlong,
-								     jint);
+                                                                     jclass,
+                                                                     jlong,
+                                                                     jint);
 
 /*
  * Class:     org_tensorflow_Graph
@@ -82,6 +83,15 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_addGradients(
     JNIEnv *, jclass, jlong, jstring, jlongArray, jintArray, jlongArray,
     jintArray, jlongArray, jintArray);
 
+/*
+ * Class:     org_tensorflow_Graph
+ * Method:    whileLoop
+ * Signature:
+ * (J[J[IILjava/lang/String;Lorg/tensorflow/Graph/WhileSubgraphBuilder;Lorg/tensorflow/Graph/WhileSubgraphBuilder;)[J
+ */
+JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_whileLoop(
+    JNIEnv *, jclass, jlong, jlongArray, jintArray, jstring, jobject, jobject);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java
index 7c05c1deafeea5d0b482a70f528d997a3394b365..f0428a1ae6c23a132cd2c349ac8bc7a354f3ecf6 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/GraphTest.java
@@ -254,7 +254,115 @@ public class GraphTest {
       }
     }
   }
-  
+
+  @Test
+  public void buildWhileLoopSingleInput() {
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+
+      Output<?> input = TestUtil.placeholder(g, "input1", Integer.class);
+
+      // could write this using lambda after Java 8
+      Graph.WhileSubgraphBuilder condGraphBuilder =
+          new Graph.WhileSubgraphBuilder() {
+            @Override
+            public void buildSubgraph(
+                Graph condGraph, Output<?>[] condInputs, Output<?>[] condOutputs) {
+              Output<Integer> sixteen = TestUtil.constant(condGraph, "sixteen", 16);
+              // condInputs[0] < 16
+              Output<?> condOutput =
+                  condGraph
+                      .opBuilder("Less", "cond")
+                      .addInput(condInputs[0])
+                      .addInput(sixteen)
+                      .build()
+                      .output(0);
+
+              condOutputs[0] = condOutput;
+            }
+          };
+
+      // could write this using lambda after Java 8
+      Graph.WhileSubgraphBuilder bodyGraphBuilder =
+          new Graph.WhileSubgraphBuilder() {
+            @Override
+            public void buildSubgraph(
+                Graph bodyGraph, Output<?>[] bodyInputs, Output<?>[] bodyOutputs) {
+              bodyOutputs[0] = TestUtil.square(bodyGraph, "square", bodyInputs[0]);
+            }
+          };
+
+      Output<?>[] loopOutputs =
+          g.whileLoop(toArray(input), condGraphBuilder, bodyGraphBuilder, "test_loop");
+
+      try (Tensor<Integer> c = Tensors.create(2);
+          Tensor<?> output = s.runner().feed(input, c).fetch(loopOutputs[0]).run().get(0)) {
+
+        assertEquals(16, output.intValue()); // ((2^2)^2)
+      }
+    }
+  }
+
+  @Test
+  public void buildWhileLoopMultipleInputs() {
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+
+      Output<?> input1 = TestUtil.placeholder(g, "input1", Integer.class);
+      Output<?> input2 = TestUtil.placeholder(g, "input2", Integer.class);
+      Output<?>[] inputs = toArray(input1, input2);
+
+      // could write this using lambda after Java 8
+      Graph.WhileSubgraphBuilder condGraphBuilder =
+          new Graph.WhileSubgraphBuilder() {
+            @Override
+            public void buildSubgraph(
+                Graph condGraph, Output<?>[] condInputs, Output<?>[] condOutputs) {
+              Output<Integer> sixteen = TestUtil.constant(condGraph, "sixteen", 16);
+              Output<?> condOutput =
+                  condGraph
+                      .opBuilder("Less", "cond")
+                      .addInput(condInputs[0])
+                      .addInput(sixteen)
+                      .build()
+                      .output(0); // condInputs[0] < 16
+
+              condOutputs[0] = condOutput;
+            }
+          };
+
+      // could write this using lambda after Java 8
+      Graph.WhileSubgraphBuilder bodyGraphBuilder =
+          new Graph.WhileSubgraphBuilder() {
+            @Override
+            public void buildSubgraph(
+                Graph bodyGraph, Output<?>[] bodyInputs, Output<?>[] bodyOutputs) {
+              bodyOutputs[0] = TestUtil.square(bodyGraph, "square1", bodyInputs[0]);
+              bodyOutputs[1] = TestUtil.square(bodyGraph, "square2", bodyInputs[1]);
+            }
+          };
+
+      Output<?>[] loopOutputs =
+          g.whileLoop(inputs, condGraphBuilder, bodyGraphBuilder, "test_loop");
+
+      try (Tensor<Integer> c1 = Tensors.create(2);
+          Tensor<Integer> c2 = Tensors.create(5);
+          TestUtil.AutoCloseableList<Tensor<?>> outputs =
+              new TestUtil.AutoCloseableList<>(
+                  s.runner()
+                      .feed(input1, c1)
+                      .feed(input2, c2)
+                      .fetch(loopOutputs[0])
+                      .fetch(loopOutputs[1])
+                      .run())) {
+
+        assertEquals(2, outputs.size());
+        assertEquals(16, outputs.get(0).intValue()); // ((2^2)^2)
+        assertEquals(625, outputs.get(1).intValue()); // ((5^2)^2)
+      }
+    }
+  }
+
   private static Output<?>[] toArray(Output<?>... outputs) {
     return outputs;
   }
diff --git a/tensorflow/lite/Android.bp b/tensorflow/lite/Android.bp
index a059c43d1a95183ebb7f20b6b5ec87fe726eb73a..4e54edc49f037da90e305f8a845c7918143ab64b 100644
--- a/tensorflow/lite/Android.bp
+++ b/tensorflow/lite/Android.bp
@@ -52,10 +52,13 @@ cc_library_static {
         "core/subgraph.cc",
         "graph_info.cc",
         "interpreter.cc",
+        "minimal_logging.cc",
+        "minimal_logging_android.cc",
         "mmap_allocation.cc",
         "model.cc",
         "mutable_op_resolver.cc",
         "nnapi_delegate.cc",
+        "nnapi/nnapi_implementation.cc",
         "optional_debug_tools.cc",
         "simple_memory_arena.cc",
         "stderr_reporter.cc",
@@ -72,6 +75,7 @@ cc_library_static {
     cflags: [
         "-Wno-deprecated-declarations",
         "-Wno-extern-c-compat",
+        "-Wno-ignored-attributes",
         "-Wno-invalid-partial-specialization",
         "-Wno-mismatched-tags",
         "-Wno-sign-compare",
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index 18fd1c10126820527ad844e94a490d9af514d6d4..e9eb7af8ea4145f2fc6783821a7c75b8ef399cf5 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -4,13 +4,15 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow:tensorflow.bzl", "if_not_windows")
+load("//tensorflow:tensorflow.bzl", "if_not_windows", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 exports_files(glob([
     "testdata/*.bin",
     "testdata/*.pb",
+    "testdata/*.tflite",
+    "testdata/*.csv",
     "models/testdata/*",
 ]))
 
@@ -39,6 +41,7 @@ config_setting(
 TFLITE_DEFAULT_COPTS = if_not_windows([
     "-Wall",
     "-Wno-comment",
+    "-Wno-extern-c-compat",
 ])
 
 cc_library(
@@ -173,27 +176,18 @@ cc_library(
         "stderr_reporter.h",
     ],
     copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
-    linkopts = [
-    ] + select({
-        "//tensorflow:android": [
-            "-llog",
-        ],
-        "//conditions:default": [
-        ],
-    }),
     deps = [
         ":arena_planner",
         ":graph_info",
         ":memory_planner",
+        ":minimal_logging",
         ":schema_fbs_version",
         ":simple_memory_arena",
         ":string",
         ":util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api:api",
-        "//tensorflow/lite/kernels:eigen_support",
-        "//tensorflow/lite/kernels:gemm_support",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
         "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
     ] + select({
@@ -219,6 +213,10 @@ cc_test(
     name = "string_util_test",
     size = "small",
     srcs = ["string_util_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         ":string_util",
@@ -233,10 +231,13 @@ cc_test(
     name = "interpreter_test",
     size = "small",
     srcs = ["interpreter_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         ":string_util",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:kernel_util",
@@ -252,6 +253,10 @@ cc_test(
     name = "graph_info_test",
     size = "small",
     srcs = ["graph_info_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -264,6 +269,10 @@ cc_test(
     name = "simple_memory_arena_test",
     size = "small",
     srcs = ["simple_memory_arena_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":simple_memory_arena",
         "//tensorflow/lite/testing:util",
@@ -284,9 +293,11 @@ cc_test(
         "testdata/test_model.bin",
         "testdata/test_model_broken.bin",
     ],
+    tags = [
+        "tflite_not_portable",
+    ],
     deps = [
         ":framework",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/testing:util",
@@ -323,6 +334,10 @@ cc_test(
     name = "mutable_op_resolver_test",
     size = "small",
     srcs = ["mutable_op_resolver_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -344,9 +359,76 @@ cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
+    features = ["-dynamic_link_test_srcs"],  # see go/dynamic_link_test_srcs
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":util",
         "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_library(
+    name = "minimal_logging",
+    srcs = [
+        "minimal_logging.cc",
+    ] + select({
+        "//tensorflow:android": [
+            "minimal_logging_android.cc",
+        ],
+        "//tensorflow:ios": [
+            "minimal_logging_ios.cc",
+        ],
+        "//conditions:default": [
+            "minimal_logging_default.cc",
+        ],
+    }),
+    hdrs = ["minimal_logging.h"],
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
+    linkopts = select({
+        "//tensorflow:android": ["-llog"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:private"],
+)
+
+cc_test(
+    name = "minimal_logging_test",
+    size = "small",
+    srcs = ["minimal_logging_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":minimal_logging",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Shared lib target for convenience, pulls in the core runtime and builtin ops.
+# Note: This target is not yet finalized, and the exact set of exported (C/C++)
+# APIs is subject to change.
+tflite_cc_shared_object(
+    name = "libtensorflowlite.so",
+    linkopts = select({
+        "//tensorflow:macos": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite:tflite_exported_symbols.lds)",
+            "-Wl,-install_name,@rpath/libtensorflowlite.so",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-Wl,--version-script,$(location //tensorflow/lite:tflite_version_script.lds)",
+        ],
+    }),
+    deps = [
+        ":framework",
+        ":tflite_exported_symbols.lds",
+        ":tflite_version_script.lds",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index 5e6b13a46ae2ac8d4a1cfc2f802648ef27742e22..2b30309d83a544889fe80c9fd3eac293ef557d6b 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -2,6 +2,7 @@
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_binary_additional_srcs",
     "tf_cc_shared_object",
     "tf_cc_test",
 )
@@ -85,24 +86,27 @@ def tflite_jni_linkopts_unstripped():
         "//conditions:default": [],
     })
 
-def tflite_linkopts():
-    """Defines linker flags to reduce size of TFLite binary."""
-    return tflite_linkopts_unstripped() + select({
+def tflite_symbol_opts():
+    """Defines linker flags whether to include symbols or not."""
+    return select({
         "//tensorflow:android": [
-            "-s",  # Omit symbol table.
+            "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
         ],
         "//conditions:default": [],
+    }) + select({
+        "//tensorflow:debug": [],
+        "//conditions:default": [
+            "-s",  # Omit symbol table, for all non debug builds
+        ],
     })
 
+def tflite_linkopts():
+    """Defines linker flags to reduce size of TFLite binary."""
+    return tflite_linkopts_unstripped() + tflite_symbol_opts()
+
 def tflite_jni_linkopts():
     """Defines linker flags to reduce size of TFLite binary with JNI."""
-    return tflite_jni_linkopts_unstripped() + select({
-        "//tensorflow:android": [
-            "-s",  # Omit symbol table.
-            "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
-        ],
-        "//conditions:default": [],
-    })
+    return tflite_jni_linkopts_unstripped() + tflite_symbol_opts()
 
 def tflite_jni_binary(
         name,
@@ -157,7 +161,7 @@ def tf_to_tflite(name, src, options, out):
     """
 
     toco_cmdline = " ".join([
-        "//tensorflow/lite/toco:toco",
+        "$(location //tensorflow/lite/toco:toco)",
         "--input_format=TENSORFLOW_GRAPHDEF",
         "--output_format=TFLITE",
         ("--input_file=$(location %s)" % src),
@@ -168,7 +172,7 @@ def tf_to_tflite(name, src, options, out):
         srcs = [src],
         outs = [out],
         cmd = toco_cmdline,
-        tools = ["//tensorflow/lite/toco:toco"],
+        tools = ["//tensorflow/lite/toco:toco"] + tf_binary_additional_srcs(),
     )
 
 def tflite_to_json(name, src, out):
@@ -225,9 +229,11 @@ def generated_test_models():
     return [
         "abs",
         "add",
+        "add_n",
         "arg_min_max",
         "avg_pool",
         "batch_to_space_nd",
+        "ceil",
         "concat",
         "constant",
         "control_dep",
@@ -235,8 +241,10 @@ def generated_test_models():
         "conv2d_transpose",
         "conv_with_shared_weights",
         "conv_to_depthwiseconv_with_shared_weights",
+        "cos",
         "depthwiseconv",
         "div",
+        "elu",
         "equal",
         "exp",
         "expand_dims",
@@ -247,6 +255,7 @@ def generated_test_models():
         "fully_connected",
         "fused_batch_norm",
         "gather",
+        "gather_nd",
         "gather_with_constant",
         "global_batch_norm",
         "greater",
@@ -264,7 +273,7 @@ def generated_test_models():
         "logical_and",
         "logical_or",
         "logical_xor",
-        #"lstm", TODO(b/122889684): Resolve toco structured line parsing in oss.
+        "lstm",
         "max_pool",
         "maximum",
         "mean",
@@ -281,6 +290,7 @@ def generated_test_models():
         "prelu",
         "pow",
         "range",
+        "rank",
         "reduce_any",
         "reduce_max",
         "reduce_min",
@@ -290,6 +300,9 @@ def generated_test_models():
         "relu6",
         "reshape",
         "resize_bilinear",
+        "resolve_constant_strided_slice",
+        "reverse_sequence",
+        "reverse_v2",
         "rsqrt",
         "shape",
         "sigmoid",
@@ -307,12 +320,13 @@ def generated_test_models():
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
-        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
         "transpose",
         "transpose_conv",
+        "unidirectional_sequence_lstm",
+        "unidirectional_sequence_rnn",
         "unique",
         "unpack",
         "unroll_batch_matmul",
@@ -328,6 +342,8 @@ def generated_test_models_failing(conversion_mode):
         return [
             "lstm",  # TODO(b/117510976): Restore when lstm flex conversion works.
             "unroll_batch_matmul",  # TODO(b/123030774): Fails in 1.13 tests.
+            "unidirectional_sequence_lstm",
+            "unidirectional_sequence_rnn",
         ]
 
     return []
@@ -442,10 +458,11 @@ def flex_dep(target_op_sets):
     else:
         return []
 
-def gen_model_coverage_test(model_name, data, failure_type, tags):
+def gen_model_coverage_test(src, model_name, data, failure_type, tags):
     """Generates Python test targets for testing TFLite models.
 
     Args:
+      src: Main source file.
       model_name: Name of the model to test (must be also listed in the 'data'
         dependencies)
       data: List of BUILD targets linking the data.
@@ -462,9 +479,9 @@ def gen_model_coverage_test(model_name, data, failure_type, tags):
         i = i + 1
         native.py_test(
             name = "model_coverage_test_%s_%s" % (model_name, target_op_sets.lower().replace(",", "_")),
-            srcs = ["model_coverage_test.py"],
+            srcs = [src],
+            main = src,
             size = "large",
-            main = "model_coverage_test.py",
             args = [
                 "--model_name=%s" % model_name,
                 "--target_ops=%s" % target_op_sets,
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index ce73aa0f9cddbe53021dc2e6fd515fc1606b9469..7b4efdf4a36c8bed835725d0277ffacb001c17de 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -129,6 +129,15 @@ typedef enum {
   kTfLiteBuiltinAbs = 101,
   kTfLiteBuiltinSplitV = 102,
   kTfLiteBuiltinUnique = 103,
+  kTfLiteBuiltinCeil = 104,
+  kTfLiteBuiltinReverseV2 = 105,
+  kTfLiteBuiltinAddN = 106,
+  kTfLiteBuiltinGatherNd = 107,
+  kTfLiteBuiltinCos = 108,
+  kTfLiteBuiltinWhere = 109,
+  kTfLiteBuiltinRank = 110,
+  kTfLiteBuiltinElu = 111,
+  kTfLiteBuiltinReverseSequence = 112,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 91c04a5f1fb5bb1a15bd1da074a1276a3d8e7793..661b648550c9a3fc64b8bc2fb97a2f3b16e7aac1 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -12,7 +12,7 @@ cc_library(
         "c_api_internal.h",
     ],
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
+        "//learning/brain/mobile/kernel_test:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
 )
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 332c2db14511af18a8e3d99fc93891ce92d1792a..5d1c92d36f5d73ba4de79be217daade4f1866b84 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -333,6 +333,9 @@ typedef struct {
   TfLiteType out_type;
 } TfLiteShapeParams;
 
+typedef struct {
+} TfLiteRankParams;
+
 typedef struct {
   // Parameters supported by version 1:
   float min;
@@ -365,6 +368,11 @@ typedef struct {
   TfLiteType index_out_type;
 } TfLiteUniqueParams;
 
+typedef struct {
+  int seq_dim;
+  int batch_dim;
+} TfLiteReverseSequenceParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data_test.cc b/tensorflow/lite/c/builtin_op_data_test.cc
index 4ce7c481e1c26e6fcfdaa680e9ca666b82968d53..4967183dd56df64b75c719869d16d052ae976081 100644
--- a/tensorflow/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/lite/c/builtin_op_data_test.cc
@@ -71,6 +71,7 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteTransposeConvParams transpose_conv_params;
   TfLiteSparseToDenseParams sparse_to_dense_params;
   TfLiteShapeParams shape_params;
+  TfLiteRankParams rank_params;
   TfLiteFakeQuantParams fake_quant_params;
   TfLitePackParams pack_params;
   TfLiteOneHotParams one_hot_params;
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index 29dba15c63cb9f2b87484cc0d777db471084d8ce..f20ee23bd81eb87c25a1a7f61cce59df7ae6678e 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -91,10 +91,10 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
   t->data.raw = NULL;
 }
 
-void TfLiteQuantizationFree(TfLiteTensor* t) {
-  if (t->quantization.type == kTfLiteAffineQuantization) {
+void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
+  if (quantization->type == kTfLiteAffineQuantization) {
     TfLiteAffineQuantization* q_params =
-        (TfLiteAffineQuantization*)(t->quantization.params);
+        (TfLiteAffineQuantization*)(quantization->params);
     if (q_params->scale) {
       TfLiteFloatArrayFree(q_params->scale);
       q_params->scale = NULL;
@@ -105,8 +105,8 @@ void TfLiteQuantizationFree(TfLiteTensor* t) {
     }
     free(q_params);
   }
-  t->quantization.params = NULL;
-  t->quantization.type = kTfLiteNoQuantization;
+  quantization->params = NULL;
+  quantization->type = kTfLiteNoQuantization;
 }
 
 void TfLiteTensorFree(TfLiteTensor* t) {
@@ -114,7 +114,7 @@ void TfLiteTensorFree(TfLiteTensor* t) {
   if (t->dims) TfLiteIntArrayFree(t->dims);
   t->dims = NULL;
 
-  TfLiteQuantizationFree(t);
+  TfLiteQuantizationFree(&t->quantization);
 }
 
 void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 31f483370cf7797e5d071d637adc377eefd25352..83e2be690762be3e2cacf02ea8311b76dc1731c4 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -340,6 +340,9 @@ typedef struct {
 // Free data memory of tensor `t`.
 void TfLiteTensorDataFree(TfLiteTensor* t);
 
+// Free quantization data.
+void TfLiteQuantizationFree(TfLiteQuantization* quantization);
+
 // Free memory of tensor `t`.
 void TfLiteTensorFree(TfLiteTensor* t);
 
diff --git a/tensorflow/lite/context_util.h b/tensorflow/lite/context_util.h
index 68b91ea0b93e602c20d1db3284a523e9f55dfd5b..2f846cc259e34b1f750ba0787dffa93db597cbe0 100644
--- a/tensorflow/lite/context_util.h
+++ b/tensorflow/lite/context_util.h
@@ -38,6 +38,7 @@ class TfLiteIntArrayView {
   const_iterator begin() const { return int_array_->data; }
   const_iterator end() const { return &int_array_->data[int_array_->size]; }
   size_t size() const { return end() - begin(); }
+  int operator[](size_t pos) const { return int_array_->data[pos]; }
 
  private:
   const TfLiteIntArray* int_array_;
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
index 6a43b0322d17041a5ae4a0527376d1465a539b1d..db6b4a2d18ecd894fa3b8a0bf646ca9f8c6b6511 100644
--- a/tensorflow/lite/core/api/BUILD
+++ b/tensorflow/lite/core/api/BUILD
@@ -51,6 +51,7 @@ cc_test(
     srcs = ["flatbuffer_conversions_test.cc"],
     deps = [
         ":api",
+        "//tensorflow/lite:string",
         "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 970e45bbdcd5c5d582c0cab29ea89c657987c70d..2ba64f51d9aa13f6ff863f1a305a7ad36ae4c67a 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -28,22 +28,27 @@ namespace {
 // Copies the contents from the flatbuffer int vector `flatbuffer` into the
 // int array `buffer`. `flat_vector` and `buffer` represent the same
 // configuration operation for a given operation.
-void FlatBufferIntVectorToArray(int max_size_of_buffer,
-                                const flatbuffers::Vector<int32_t>* flat_vector,
-                                int* buffer, ErrorReporter* error_reporter) {
+TfLiteStatus FlatBufferIntVectorToArray(
+    int max_size_of_buffer, const flatbuffers::Vector<int32_t>* flat_vector,
+    int* buffer, ErrorReporter* error_reporter, const char* op_name) {
   if (!flat_vector) {
-    error_reporter->Report("Input array not provided for operation.\n");
+    error_reporter->Report("Input array not provided for operation '%s'.\n",
+                           op_name);
+    return kTfLiteError;
   } else {
     int num_dimensions = flat_vector->Length();
     if (num_dimensions > max_size_of_buffer / sizeof(int)) {
       error_reporter->Report(
-          "Found too many dimensions in the operation's input array.\n");
+          "Found too many dimensions in the input array of operation '%s'.\n",
+          op_name);
+      return kTfLiteError;
     } else {
       for (int i = 0; i < num_dimensions; ++i) {
         buffer[i] = flat_vector->Get(i);
       }
     }
   }
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -452,8 +457,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
         auto* new_shape = schema_params->new_shape();
-        FlatBufferIntVectorToArray(sizeof(params->shape), new_shape,
-                                   params->shape, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->shape), new_shape, params->shape, error_reporter,
+            "reshape"));
         params->num_dimensions = new_shape->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -521,8 +527,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteSqueezeParams>();
       if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
         const auto& squeeze_dims = schema_params->squeeze_dims();
-        FlatBufferIntVectorToArray(sizeof(params->squeeze_dims), squeeze_dims,
-                                   params->squeeze_dims, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->squeeze_dims), squeeze_dims, params->squeeze_dims,
+            error_reporter, "squeeze"));
         params->num_squeeze_dims = squeeze_dims->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -665,6 +672,17 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_REVERSE_SEQUENCE: {
+      TfLiteReverseSequenceParams* params =
+          allocator->AllocatePOD<TfLiteReverseSequenceParams>();
+      if (auto* reverse_seq_params =
+              op->builtin_options_as_ReverseSequenceOptions()) {
+        params->seq_dim = reverse_seq_params->seq_dim();
+        params->batch_dim = reverse_seq_params->batch_dim();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_ABS:
@@ -673,12 +691,15 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     // ok for now, since there is no call implementation either.
     case BuiltinOperator_CALL:
     case BuiltinOperator_CONCAT_EMBEDDINGS:
+    case BuiltinOperator_COS:
     case BuiltinOperator_CUSTOM:
     case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_ELU:
     case BuiltinOperator_EMBEDDING_LOOKUP:
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_EXP:
     case BuiltinOperator_EXPAND_DIMS:
+    case BuiltinOperator_CEIL:
     case BuiltinOperator_FLOOR:
     case BuiltinOperator_GREATER:
     case BuiltinOperator_GREATER_EQUAL:
@@ -718,6 +739,11 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FLOOR_MOD:
     case BuiltinOperator_RANGE:
     case BuiltinOperator_SQUARED_DIFFERENCE:
+    case BuiltinOperator_REVERSE_V2:
+    case BuiltinOperator_ADD_N:
+    case BuiltinOperator_GATHER_ND:
+    case BuiltinOperator_WHERE:
+    case BuiltinOperator_RANK:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 4d1d1b21fda106b3196ff43421996f45ab83af4f..4a5de48302c1e840c524335ee549c74a162e107e 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <cstring>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace {
@@ -33,6 +35,8 @@ class MockErrorReporter : public ErrorReporter {
   char* GetBuffer() { return buffer_; }
   int GetBufferSize() { return buffer_size_; }
 
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
  private:
   static constexpr int kBufferSize = 256;
   char buffer_[kBufferSize];
@@ -60,25 +64,56 @@ class MockDataAllocator : public BuiltinDataAllocator {
 
 }  // namespace
 
-TEST(FlatbufferConversions, TestParseOpDataConv) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> conv_options =
-      CreateConv2DOptions(builder, Padding_SAME, 1, 2,
-                          ActivationFunctionType_RELU, 3, 4)
-          .Union();
-  flatbuffers::Offset<Operator> conv_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_Conv2DOptions, conv_options,
-      nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(conv_offset);
-  void* conv_pointer = builder.GetBufferPointer();
-  const Operator* conv_op = flatbuffers::GetRoot<Operator>(conv_pointer);
+class FlatbufferConversionsTest : public ::testing::Test {
+ public:
+  const Operator* BuildTestOperator(BuiltinOptions op_type,
+                                    flatbuffers::Offset<void> options) {
+    flatbuffers::Offset<Operator> offset =
+        CreateOperatorDirect(builder_, 0, nullptr, nullptr, op_type, options,
+                             nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
+    builder_.Finish(offset);
+    void* pointer = builder_.GetBufferPointer();
+    return flatbuffers::GetRoot<Operator>(pointer);
+  }
+
+ protected:
+  MockErrorReporter mock_reporter_;
+  MockDataAllocator mock_allocator_;
+  flatbuffers::FlatBufferBuilder builder_;
+};
+
+TEST_F(FlatbufferConversionsTest, ParseBadSqueeze) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_SqueezeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_SQUEEZE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'squeeze'"));
+}
+
+TEST_F(FlatbufferConversionsTest, ParseBadReshape) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_ReshapeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_RESHAPE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'reshape'"));
+}
+
+TEST_F(FlatbufferConversionsTest, TestParseOpDataConv) {
+  const Operator* conv_op =
+      BuildTestOperator(BuiltinOptions_Conv2DOptions,
+                        CreateConv2DOptions(builder_, Padding_SAME, 1, 2,
+                                            ActivationFunctionType_RELU, 3, 4)
+                            .Union());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(conv_op, BuiltinOperator_CONV_2D, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(conv_op, BuiltinOperator_CONV_2D, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_NE(nullptr, output_data);
   TfLiteConvParams* params = reinterpret_cast<TfLiteConvParams*>(output_data);
   EXPECT_EQ(kTfLitePaddingSame, params->padding);
@@ -89,30 +124,20 @@ TEST(FlatbufferConversions, TestParseOpDataConv) {
   EXPECT_EQ(4, params->dilation_height_factor);
 }
 
-TEST(FlatbufferConversions, TestParseOpDataCustom) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> null_options;
-  flatbuffers::Offset<Operator> custom_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_NONE, null_options, nullptr,
-      CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(custom_offset);
-  void* custom_pointer = builder.GetBufferPointer();
-  const Operator* custom_op = flatbuffers::GetRoot<Operator>(custom_pointer);
+TEST_F(FlatbufferConversionsTest, TestParseOpDataCustom) {
+  const Operator* custom_op =
+      BuildTestOperator(BuiltinOptions_NONE, flatbuffers::Offset<void>());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(custom_op, BuiltinOperator_CUSTOM, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(custom_op, BuiltinOperator_CUSTOM, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_EQ(nullptr, output_data);
 }
 
-TEST(FlatbufferConversions, TestConvertTensorType) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
+TEST_F(FlatbufferConversionsTest, TestConvertTensorType) {
   TfLiteType type;
-  EXPECT_EQ(kTfLiteOk, ConvertTensorType(TensorType_FLOAT32, &type, reporter));
+  EXPECT_EQ(kTfLiteOk,
+            ConvertTensorType(TensorType_FLOAT32, &type, &mock_reporter_));
   EXPECT_EQ(kTfLiteFloat32, type);
 }
 
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 4be80d143e7d34b45be45e06e42519afe4d32827..ec6762b16c95e86fb65ec187d5e92f91eff1cbc5 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -72,6 +72,34 @@ bool HasDynamicTensor(const TfLiteContext& context,
   return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
 }
 
+// Gets the legacy TfLiteQuantizationParams from the current TfLiteQuantization.
+TfLiteQuantizationParams GetLegacyQuantization(
+    const TfLiteQuantization& quantization) {
+  TfLiteQuantizationParams legacy_quantization;
+  legacy_quantization.scale = 0;
+  legacy_quantization.zero_point = 0;
+
+  // If the quantization type isn't affine, return the empty
+  // legacy_quantization.
+  if (quantization.type != kTfLiteAffineQuantization) {
+    return legacy_quantization;
+  }
+
+  auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(quantization.params);
+  if (!affine_quantization || !affine_quantization->scale ||
+      !affine_quantization->zero_point ||
+      affine_quantization->scale->size != 1 ||
+      affine_quantization->zero_point->size != 1) {
+    return legacy_quantization;
+  }
+
+  // We know its per-layer quantization now.
+  legacy_quantization.scale = affine_quantization->scale->data[0];
+  legacy_quantization.zero_point = affine_quantization->zero_point->data[0];
+  return legacy_quantization;
+}
+
 }  // namespace
 
 // A trivial implementation of GraphInfo around the Interpreter.
@@ -369,6 +397,10 @@ void Subgraph::SetCancellationFunction(void* data,
   check_cancelled_func_ = check_cancelled_func;
 }
 
+void Subgraph::ReserveNodes(int count) {
+  nodes_and_registration_.reserve(count);
+}
+
 TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
                                           int length) {
   // Making sure kOptionalTensor is not re-defined to something other than -1.
@@ -382,7 +414,9 @@ TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
       continue;
     }
     if (index < 0 || static_cast<size_t>(index) >= context_->tensors_size) {
-      ReportError("Invalid tensor index %d in %s\n", index, label);
+      ReportError(
+          "Invalid tensor index %d in %s. The subgraph has %d tensors\n", index,
+          label, context_->tensors_size);
       consistent_ = false;
       return kTfLiteError;
     }
@@ -558,7 +592,12 @@ TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
 
   // Short-circuit the state change if the dimensions don't change, avoiding
   // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+  //
+  // Note that it's required to check `tensor->data.raw != nullptr`. Otherwise
+  // the subgraph won't allocate memory for a dynamic tensor when its size
+  // is equal to the original tensor size.
+  if (tensor->data.raw != nullptr &&
+      EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
     return kTfLiteOk;
   }
 
@@ -779,7 +818,7 @@ TfLiteStatus Subgraph::GetNodeAndRegistration(
 
 TfLiteStatus Subgraph::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    const int* dims, TfLiteQuantization quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
@@ -804,16 +843,22 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
       EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
     // Fast path which does not invalidate the invokable property.
     TfLiteTensorDataFree(&tensor);
+    TfLiteQuantizationFree(&tensor.quantization);
     tensor.data.raw = const_cast<char*>(buffer);
     if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
+    tensor.params = GetLegacyQuantization(quantization);
+    tensor.quantization = quantization;
     tensor.allocation_type = kTfLiteMmapRo;
     tensor.allocation = allocation;
   } else {
     state_ = kStateUninvokable;
     TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
+                      GetLegacyQuantization(quantization),
+                      const_cast<char*>(buffer), bytes, kTfLiteMmapRo,
+                      allocation, false, &tensor);
+    // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+    // if there are other required callers.
+    tensor.quantization = quantization;
   }
   return kTfLiteOk;
 }
@@ -824,7 +869,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
 // to Interpreter.
 TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+    const int* dims, TfLiteQuantization quantization, bool is_variable) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
         "SetTensorParametersReadWrite is disallowed when graph is immutable.");
@@ -854,10 +899,14 @@ TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     allocation_type = kTfLiteArenaRwPersistent;
   }
 
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
   TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
+                    GetLegacyQuantization(quantization),
                     /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_->tensors[tensor_index]);
+                    nullptr, is_variable, &tensor);
+  // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+  // if there are other required callers.
+  tensor.quantization = quantization;
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index 0bec218d898873459ce80eb0d191041851cda90d..5db15a177ef9fe8fcb54e0bf92f0193238440941 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -59,6 +59,11 @@ class Subgraph {
   // interpreter.
   TfLiteStatus SetVariables(std::vector<int> variables);
 
+  // Ensure the internal node storage memory allocates at least `count`
+  // spots for node. NOTE, this doesn't actually add operators. This is an
+  // efficiency optimization that is subject to change.
+  void ReserveNodes(int count);
+
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
   // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
@@ -68,29 +73,48 @@ class Subgraph {
                                      const char* init_data,
                                      size_t init_data_size, void* builtin_data,
                                      const TfLiteRegistration* registration,
-                                     int* node_index);
+                                     int* node_index = nullptr);
 
   // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
   // The value pointed to by `first_new_tensor_index` will be set to the
   // index of the first new tensor if `first_new_tensor_index` is non-null.
-  TfLiteStatus AddTensors(int tensors_to_add, int* first_new_tensor_index);
+  TfLiteStatus AddTensors(int tensors_to_add,
+                          int* first_new_tensor_index = nullptr);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  inline TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      const char* buffer, size_t bytes,
+      const Allocation* allocation = nullptr) {
+    return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
+                                       dims.data(), quantization, buffer, bytes,
+                                       allocation);
+  }
   TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization,
-      const char* buffer, size_t bytes, const Allocation* allocation);
+      const int* dims, TfLiteQuantization quantization, const char* buffer,
+      size_t bytes, const Allocation* allocation = nullptr);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+  inline TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      bool is_variable = false) {
+    return SetTensorParametersReadWrite(tensor_index, type, name, dims.size(),
+                                        dims.data(), quantization, is_variable);
+  }
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name, const size_t rank,
+                                            const int* dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable = false);
 
   // WARNING: Experimental interface, subject to change
   // Overrides execution plan. This bounds checks indices sent in.
diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h
index 45bc571f9284abe95cd3550e64dd098157da14a8..b73ed88d3789d5df8dadaee19d468596ccd4c782 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.h
+++ b/tensorflow/lite/delegates/flex/buffer_map.h
@@ -56,25 +56,7 @@ class BufferMap {
   // given TfLiteTensor's data.
   void SetFromTfLite(int tensor_index, const TfLiteTensor* tensor);
 
-  // Sets a bit indicating that the tensor associated with 'tensor_index' can
-  // be use by TF's forwarding optimizations.
-  void SetForwardable(int tensor_index) { forwardable_.insert(tensor_index); }
-
-  // Removes all information about which tensors are forwardable.
-  void ClearForwardable() { forwardable_.clear(); }
-
-  // Returns true if this tensor has been explicitly marks as forwardable by
-  // a call to SetForwardable().
-  bool IsForwardable(int tensor_index) const {
-    return forwardable_.count(tensor_index) > 0;
-  }
-
  private:
-  // List of tensors that can be used by TF in its forwarding optimization.
-  // Doing so allows an input tensor to be modified and used as the output
-  // tensor. The delegate takes care of not holding any references to tensors
-  // in this list while Eager is executing the corresponding op.
-  std::set<int> forwardable_;
   // Mapping from TL Lite tensor ID to TensorFlow's Tensor. All tensors that
   // are inputs or outputs of a subgraph will be added here, irrespective of
   // whether their data are managed by TF Lite or TensorFlow.
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
index 8edeb280ebdc7953a1031e30b77bb58a760e20e7..accaf3045246b35705085bd5324e5b33ec8ea12a 100644
--- a/tensorflow/lite/delegates/flex/buffer_map_test.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -44,6 +44,7 @@ UniqueTfLiteTensor MakeLiteTensor(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(T), tensor.get());
   memcpy(tensor->data.raw, data.data(), data.size() * sizeof(T));
   return tensor;
@@ -62,6 +63,7 @@ UniqueTfLiteTensor MakeLiteTensor<string>(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(string), tensor.get());
 
   DynamicBuffer b;
@@ -222,15 +224,6 @@ TEST(BufferMapTest, TensorFlowOverwritesTfLite) {
               ElementsAre(0, 0, 0, 0.123f, 0, 0));
 }
 
-TEST(BufferMapTest, Forwardable) {
-  BufferMap buffer_map;
-  EXPECT_FALSE(buffer_map.IsForwardable(0));
-  buffer_map.SetForwardable(0);
-  EXPECT_TRUE(buffer_map.IsForwardable(0));
-  buffer_map.ClearForwardable();
-  EXPECT_FALSE(buffer_map.IsForwardable(0));
-}
-
 }  // namespace
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 2e0fc22ad6872884d04da4c2d2f8a4dce0246de9..ceb9918f6fa7ccfbb4d27a0bf921987faecc1c12 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -67,6 +67,7 @@ class OpInputs {
     for (int index : TfLiteIntArrayView(indexes)) {
       inputs_.push_back(index);
     }
+    forwardable_.resize(inputs_.size());
   }
   ~OpInputs() {}
 
@@ -89,11 +90,21 @@ class OpInputs {
     }
   }
 
+  void SetForwardable(int i, bool v) { forwardable_[i] = v; }
+
+  bool IsForwardable(int i) const { return forwardable_[i]; }
+
   TensorSource GetTensorSource(int i) const { return sources_[i]; }
 
  private:
   std::vector<int> inputs_;
   std::vector<TensorSource> sources_;
+
+  // List of tensors that can be used by TF in its forwarding optimization.
+  // Doing so allows an input tensor to be modified and used as the output
+  // tensor. The delegate takes care of not holding any references to tensors
+  // in this list while Eager is executing the corresponding op.
+  std::vector<int> forwardable_;
 };
 
 // A list of outputs of a given node of the TensorFlow/Eager graph, along with
@@ -279,7 +290,7 @@ class OpNode {
       } else {
         // If this is a forwardable tensor, we will remove it from the previous
         // op's list, giving TF the opportunity to reuse its buffer.
-        bool unref_handle = buffer_map->IsForwardable(input_index);
+        bool unref_handle = inputs_.IsForwardable(i);
         auto* handle =
             s.node->outputs_.GetHandle(s.node_output_index, unref_handle);
         op_->MutableInputs()->push_back(handle);
@@ -485,13 +496,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     }
   }
 
-  buffer_map->ClearForwardable();
-  for (const auto& x : tensor_ref_count) {
-    if (x.second == 1) {
-      // This tensor is referenced once by a single op. We can allow the TF
-      // kernel to "forward" it to the output, meaning its buffer will be
-      // reused and overwritten.
-      buffer_map->SetForwardable(x.first);
+  // All tensors that are referenced exactly once are marked as "forwardable",
+  // meaning that we will allow TensorFlow to reuse its buffer as the output of
+  // an op.
+  for (auto& node_data : op_data->nodes) {
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      bool f = (tensor_ref_count[node_data->inputs().TfLiteIndex(i)] == 1);
+      node_data->mutable_inputs()->SetForwardable(i, f);
     }
   }
 
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index 63f6da1fa6d38f39abed2bbe2b382841c7d5bce4..ec0d78e59d5e726f146f1d143b5fdc63f3e0d8aa 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -2,7 +2,6 @@ package(default_visibility = [
     "//visibility:public",
 ])
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 licenses(["notice"])  # Apache 2.0
@@ -16,11 +15,11 @@ cc_library(
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:kernel_util",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 94fcc8740b42c8f0beaf8801b5a061bfbf3c6433..efbb7d7d6704cebe48e23a4f35a484956227366a 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -25,11 +25,13 @@ limitations under the License.
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
-#include <sys/mman.h>
 #include <sys/system_properties.h>
+#endif
+#if defined __ANDROID__ || defined __unix__
+#include <sys/mman.h>
 #include <unistd.h>
 #endif
 
@@ -49,65 +51,93 @@ namespace {
   } while (0)
 
 namespace {
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return std::numeric_limits<int32_t>::max();
-      }
+
+bool IsFloat(TfLiteType type) {
+  switch (type) {
+    case kTfLiteFloat32:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsQuantized(TfLiteType type) {
+  switch (type) {
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt16:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsHybridOperator(const TfLiteContext* context, int builtin_code,
+                      const TfLiteNode* node) {
+  switch (builtin_code) {
+    case kTfLiteBuiltinConv2d:
+    case kTfLiteBuiltinFullyConnected: {
+      const int input_id = node->inputs->data[0];
+      const int filter_id = node->inputs->data[1];
+      const TfLiteType input_type = context->tensors[input_id].type;
+      const TfLiteType filter_type = context->tensors[filter_id].type;
+      return IsFloat(input_type) && IsQuantized(filter_type);
     }
-    return atoi(sdkVersion);
+    default:
+      return false;
   }
-#endif  // __ANDROID__
-  return 0;
 }
 
 constexpr int32_t kMinSdkVersionForNNAPI = 27;
 constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+constexpr int32_t kMinSdkVersionForNNAPI12 = 29;
 
 }  // namespace
 
 // RAII NN API Model Destructor for use with std::unique_ptr
 struct NNFreeModel {
   void operator()(ANeuralNetworksModel* model) {
-    ANeuralNetworksModel_free(model);
+    NnApiImplementation()->ANeuralNetworksModel_free(model);
   }
 };
 // RAII NN API Compilation Destructor for use with std::unique_ptr
 struct NNFreeCompilation {
   void operator()(ANeuralNetworksCompilation* model) {
-    ANeuralNetworksCompilation_free(model);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(model);
+  }
+};
+
+// RAII NN API Execution Destructor for use with std::unique_ptr
+struct NNFreeExecution {
+  void operator()(ANeuralNetworksExecution* execution) {
+    NnApiImplementation()->ANeuralNetworksExecution_free(execution);
   }
 };
 
 // Manage NNAPI shared memory handle
 class NNMemory {
  public:
-  NNMemory(const char* name, size_t size) {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  NNMemory(const NnApi* nnapi, const char* name, size_t size) {
+    nnapi_ = nnapi;
     byte_size_ = size;
-    fd_ = ASharedMemory_create(name, size);
+    fd_ = nnapi_->ASharedMemory_create(name, size);
     data_ptr_ = reinterpret_cast<uint8_t*>(
         mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
-    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
-                                       &nn_memory_handle_);
-#endif
+    nnapi_->ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE,
+                                               fd_, 0, &nn_memory_handle_);
   }
+#else
+  NNMemory(const NnApi* /*nnapi*/, const char* /*name*/, size_t /*size*/) {}
+#endif
 
   ~NNMemory() {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
     if (data_ptr_) {
       munmap(data_ptr_, byte_size_);
     }
     if (nn_memory_handle_) {
-      ANeuralNetworksMemory_free(nn_memory_handle_);
+      nnapi_->ANeuralNetworksMemory_free(nn_memory_handle_);
     }
     if (fd_ > 0) close(fd_);
 #endif
@@ -117,7 +147,8 @@ class NNMemory {
   uint8_t* get_data_ptr() { return data_ptr_; }
 
  private:
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  const NnApi* nnapi_;
   int fd_ = 0;
   size_t byte_size_ = 0;
 #endif
@@ -162,15 +193,42 @@ class OperandMapping {
   std::vector<int> lite_tensor_to_ann_tensor_;
 };
 
+class DequantizeMapping {
+ public:
+  int DequantizedAnnIndex(int ann_index, TfLiteType type) const {
+    for (const auto& element : mapping_) {
+      if (ann_index == std::get<0>(element) && type == std::get<1>(element)) {
+        return std::get<2>(element);
+      }
+    }
+    return -1;
+  }
+
+  void Add(int ann_index, TfLiteType type, int dequantized_ann_index) {
+    // This assumes it is not already mapped.
+    mapping_.emplace_back(ann_index, type, dequantized_ann_index);
+  }
+
+ private:
+  // Each tuple specifies the ANN (quantized) tensor index, the desired
+  // floating-point type and the matching ANN (dequantized) tensor index. This
+  // could use a map but instead std::vector is used to keep code size lower.
+  std::vector<std::tuple<int, TfLiteType, int>> mapping_;
+};
+
 // Abstract builder for building an op in the NN API graph. This handles
 // the disparity between TFLite and NN API operand types. NN API has singular
 // operands for both tensors and parameters, and TFLite separates the two.
 class NNAPIOpBuilder {
  public:
-  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
+  NNAPIOpBuilder(const NnApi* nnapi, TfLiteContext* context,
+                 OperandMapping* tensor_mapping,
+                 DequantizeMapping* dequantize_mapping,
                  ANeuralNetworksModel* nn_model)
-      : context_(context),
+      : nnapi_(nnapi),
+        context_(context),
         operand_mapping_(tensor_mapping),
+        dequantize_mapping_(dequantize_mapping),
         nn_model_(nn_model) {}
 
   TfLiteStatus AddScalarInt32Operand(int32_t value) {
@@ -204,48 +262,129 @@ class NNAPIOpBuilder {
     return kTfLiteOk;
   }
 
-  TfLiteStatus AddTensorInput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_inputs_.push_back(ann_index);
-    return kTfLiteOk;
+  TfLiteStatus AddTensorInput(int tensor_index, bool hybrid_op) {
+    return AddTensor(tensor_index, hybrid_op, &augmented_inputs_);
   }
 
   TfLiteStatus AddTensorOutput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_outputs_.push_back(ann_index);
-    return kTfLiteOk;
+    return AddTensor(tensor_index, /*hybrid_op=*/false, &augmented_outputs_);
   }
 
   TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) {
     std::vector<uint32_t> dims(dimension_count, 0);
-    ANeuralNetworksOperandType operand_type{
-        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
-        .dimensionCount = dimension_count,
-        .dimensions = dims.data()};
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    augmented_outputs_.push_back(ann_operand);
-    return kTfLiteOk;
+    return AddFloat32OutputTensor(dimension_count, dims.data(), nullptr);
   }
 
   TfLiteStatus AddStateFloat32Tensor(int tensor_index,
                                      int* ann_tensor_index_out) {
     TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    return AddFloat32OutputTensor(
+        tensor->dims->size, reinterpret_cast<uint32_t*>(tensor->dims->data),
+        ann_tensor_index_out);
+  }
 
+  // Adds a Dequantize operator and replaces the input tensor index with the
+  // dequantized version. If the dequantized version of the operator already
+  // exists then it is not added again.
+  TfLiteStatus AddDequantize(int nn_input_index, int lite_index,
+                             TfLiteType dequantized_type) {
+    const int ann_index = operand_mapping_->lite_index_to_ann(lite_index);
+    int dequantized_ann_index =
+        dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type);
+
+    if (dequantized_ann_index == -1) {
+      // The dequantized version does not exist yet, it has to be added: a new
+      // Dequantize operation is added, yielding a new tensor.
+      const TfLiteTensor& tensor = context_->tensors[lite_index];
+      ANeuralNetworksOperandType operand_type{
+          dequantized_type, static_cast<uint32_t>(tensor.dims->size),
+          reinterpret_cast<uint32_t*>(tensor.dims->data), 0.f, 0};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+      dequantized_ann_index = operand_mapping_->add_new_non_tensor_operand();
+
+      // Add Dequantize operation.
+      const uint32_t dequantize_input[1] = {static_cast<uint32_t>(ann_index)};
+      const uint32_t dequantize_output[1] = {
+          static_cast<uint32_t>(dequantized_ann_index)};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_, nnapi_->ANeuralNetworksModel_addOperation(
+                        nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1,
+                        dequantize_input, 1, dequantize_output));
+      dequantize_mapping_->Add(ann_index, dequantized_type,
+                               dequantized_ann_index);
+    }
+
+    // The input for the original operation is modified so that the operation
+    // now uses the dequantized tensor as input.
+    augmented_inputs_[nn_input_index] = dequantized_ann_index;
+
+    return kTfLiteOk;
+  }
+
+  // Finish emitting the op (of type `type`) into the NN API.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperation(
+            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
+            augmented_inputs_.data(),
+            static_cast<uint32_t>(augmented_outputs_.size()),
+            augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
+
+ private:
+  template <typename T>
+  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{.type = nn_type};
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, &value, sizeof(T)));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  template <typename T>
+  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
+                                int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{
-        ANEURALNETWORKS_TENSOR_FLOAT32,
-        static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-        tensor->params.zero_point};
+        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
+
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    augmented_outputs_.push_back(ann_index);
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, values, sizeof(T) * num_values));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
 
-    *ann_tensor_index_out = ann_index;
+  TfLiteStatus AddFloat32OutputTensor(uint32_t dimension_count,
+                                      const uint32_t* dimension_data,
+                                      int* ann_index_out) {
+    ANeuralNetworksOperandType operand_type{
+        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
+        .dimensionCount = dimension_count,
+        .dimensions = dimension_data,
+    };
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    augmented_outputs_.push_back(ann_index);
+    if (ann_index_out) *ann_index_out = ann_index;
     return kTfLiteOk;
   }
 
@@ -253,10 +392,11 @@ class NNAPIOpBuilder {
   // This returns the NN API tensor index corresponding to the created tensor.
   // If another caller previously created a NN API tensor for `tensor_index`
   // then the existing one is returned.
-  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+  TfLiteStatus AddTensor(int tensor_index, bool hybrid_op,
+                         std::vector<uint32_t>* indices) {
     int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
     if (ann_tensor_index != -1) {
-      *ann_tensor_index_out = ann_tensor_index;
+      indices->push_back(ann_tensor_index);
       return kTfLiteOk;
     }
     // Allocate a new tensor index
@@ -267,11 +407,17 @@ class NNAPIOpBuilder {
     float scale = 0.0f;
     int32_t zeroPoint = 0;
     TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    switch (tensor->type) {
+    TfLiteType tensor_type = tensor->type;
+    if (hybrid_op && (tensor_type == kTfLiteUInt8)) {
+      // For legacy reason, UINT8 weights in hybrid operators are actually INT8
+      // values and should be interpreted as such.
+      tensor_type = kTfLiteInt8;
+    }
+    switch (tensor_type) {
       case kTfLiteNoType:
         // Tensors added during initialization of Ops don't have a type yet and
         // should not be registered with the NNAPI.
-        *ann_tensor_index_out = -1;
+        indices->push_back(-1);
         return kTfLiteOk;
       case kTfLiteFloat32:
         nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
@@ -285,6 +431,10 @@ class NNAPIOpBuilder {
           scale = 1;
         }
         break;
+      case kTfLiteInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_SYMM;
+        scale = tensor->params.scale;
+        break;
       case kTfLiteInt32:
         nn_type = ANEURALNETWORKS_TENSOR_INT32;
         scale = tensor->params.scale;
@@ -299,69 +449,35 @@ class NNAPIOpBuilder {
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
 
     if (tensor->allocation_type == kTfLiteMmapRo) {
       // TODO(b/80630405): Use NNAPIAllocation.
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
           context_,
-          ANeuralNetworksModel_setOperandValue(
+          nnapi_->ANeuralNetworksModel_setOperandValue(
               nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes));
     }
 
-    *ann_tensor_index_out = ann_tensor_index;
-    return kTfLiteOk;
-  }
-
-  // Finish emitting the op (of type `type`) into the NN API.
-  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
-    // Actually add a NN API operation
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_,
-        ANeuralNetworksModel_addOperation(
-            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
-            augmented_inputs_.data(),
-            static_cast<uint32_t>(augmented_outputs_.size()),
-            augmented_outputs_.data()));
-    augmented_inputs_.clear();
-    augmented_outputs_.clear();
-    return kTfLiteOk;
-  }
-
- private:
-  template <typename T>
-  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{.type = nn_type};
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_setOperandValue(nn_model_, ann_operand,
-                                                       &value, sizeof(T)));
-    augmented_inputs_.push_back(ann_operand);
+    indices->push_back(ann_tensor_index);
     return kTfLiteOk;
   }
 
-  template <typename T>
-  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
-                                int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{
-        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context_, ANeuralNetworksModel_setOperandValue(
-                      nn_model_, ann_operand, values, sizeof(T) * num_values));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
+  // Access to NNAPI.
+  const NnApi* const nnapi_;
 
   // TfLiteContext for error handling.
   TfLiteContext* const context_;
 
   // Tracks relationship between indices.
-  OperandMapping* operand_mapping_;
+  OperandMapping* const operand_mapping_;
+
+  // Keeps mapping of ANN quantized tensor and float data type to equivalent
+  // dequantized ANN tensor. For example, tensor #4 (UINT8) + FLOAT32 could map
+  // to tensor #10 (FLOAT32) because a DEQUANTIZE operator was added to convert
+  // tensor #4 to a FLOAT32 tensor.
+  DequantizeMapping* const dequantize_mapping_;
 
   // The NNAPI model.
   ANeuralNetworksModel* const nn_model_;
@@ -392,7 +508,7 @@ ANeuralNetworksOperationType BasicMappingFn(
 // The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
-  NNAPIDelegateKernel() = default;
+  NNAPIDelegateKernel() { nnapi_ = NnApiImplementation(); }
 
   typedef ANeuralNetworksOperationType (*MappingFn)(
       const NNAPIOpMappingArgs& mapping_args);
@@ -400,8 +516,9 @@ class NNAPIDelegateKernel {
   // Return a function that knows how to translate a node into its operands
   // when called. You can use this function to see if a node is supported
   // (i.e. that MappingFn is not nullptr).
-  static MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                       TfLiteNode* node) {
+  static MappingFn Map(const TfLiteContext* context, int builtin_code,
+                       int version, int android_sdk_version,
+                       const TfLiteNode* node) {
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
@@ -457,6 +574,11 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinConv2d:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           auto builtin =
               reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
           if (builtin->dilation_width_factor != 1 ||
@@ -494,6 +616,11 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinFullyConnected:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
@@ -520,7 +647,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSqueeze:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
@@ -528,7 +655,7 @@ class NNAPIDelegateKernel {
             // Note that we add the squeeze dimensions even if the dimensions
             // were unspecified (empty), as NNAPI requires the operand.
             mapping_args.builder->AddVectorInt32Operand(
-                builtin->squeeze_dims,
+                builtin->num_squeeze_dims ? builtin->squeeze_dims : nullptr,
                 static_cast<uint32_t>(builtin->num_squeeze_dims));
             return ANEURALNETWORKS_SQUEEZE;
           };
@@ -576,9 +703,10 @@ class NNAPIDelegateKernel {
         if (version == 1 &&
             reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data)
                     ->activation == kTfLiteActNone) {
-          if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8) {
-            // NNAPI only support concatenating quantized tensor of the same
-            // scale and offset.
+          if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8 &&
+              android_sdk_version < kMinSdkVersionForNNAPI12) {
+            // NNAPI 1.0-1 only supported concatenating quantized tensor of the
+            // same scale and offset.
             auto first_param = context->tensors[node->inputs->data[0]].params;
             for (int i = 1; i < node->inputs->size; i++) {
               auto curr_param = context->tensors[node->inputs->data[i]].params;
@@ -598,7 +726,16 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinDequantize:
-        if (version == 1) {
+        if (version == 1 || version == 2) {
+          const auto& input = context->tensors[node->inputs->data[0]];
+          const auto zero_point = input.params.zero_point;
+          // NN API supports int8 type since version 1.2 but only for symmetric
+          // quantization.
+          if (input.type == kTfLiteInt8 &&
+              (zero_point != 0 ||
+               android_sdk_version < kMinSdkVersionForNNAPI12)) {
+            return nullptr;
+          }
           return BasicMappingFn<ANEURALNETWORKS_DEQUANTIZE>;
         }
         break;
@@ -636,7 +773,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSub:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float sub.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -649,7 +786,7 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinDiv:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float div.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -662,22 +799,24 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinPad:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
-            node->inputs->size == 2 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+        if (version == 1 && node->inputs->size == 2 &&
+            (android_sdk_version >= kMinSdkVersionForNNAPI11) &&
+            (context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 ||
+             android_sdk_version >= kMinSdkVersionForNNAPI12)) {
           // NNAPI does not support specifying the padding value.
-          // NNAPI pads physical zero for quantized tensors, so only delegate
-          // float pad to NNAPI.
+          // Before 1.2, NNAPI pads physical zero for quantized tensors, so only
+          // delegate float pad to NNAPI. NNAPI 1.2 onwards pads with
+          // zero-point, so delegate quantized pad as well.
           return BasicMappingFn<ANEURALNETWORKS_PAD>;
         }
         break;
       case kTfLiteBuiltinSpaceToBatchNd:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return BasicMappingFn<ANEURALNETWORKS_SPACE_TO_BATCH_ND>;
         }
         break;
       case kTfLiteBuiltinStridedSlice:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
@@ -695,7 +834,7 @@ class NNAPIDelegateKernel {
         // dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((version == 1) &&
-            (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
+            (android_sdk_version >= kMinSdkVersionForNNAPI11) &&
             (node->inputs->size > 1) &&
             (context->tensors[node->inputs->data[1]].allocation_type ==
              kTfLiteMmapRo)) {
@@ -726,7 +865,9 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinSvdf:
         // NNAPI only support float32 weights.
+        // Only delegate to NNAPI 1.1, as SVDF does not support rank > 1 on 1.0.
         if (version == 1 && node->inputs->size == 5 &&
+            android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]]
                     .type == kTfLiteFloat32) {
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -752,8 +893,11 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinLstm:
         // NNAPI only support float32 weights.
+        // Only delegate to NNAPI 1.1,  as 1.0 has a bug for optional tensors
+        // which would affect LSTM.
         // TODO(miaowang): add loggings to indicate why the op is rejected.
         if (version == 1 && node->inputs->size == 20 &&
+            android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs
                                  ->data[/*kInputToOutputWeightsTensor*/ 4]]
                     .type == kTfLiteFloat32) {
@@ -793,7 +937,7 @@ class NNAPIDelegateKernel {
         break;
       case kTfLiteBuiltinMean:
         // NNAPI does not support generating a scalar as output for MEAN.
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
             context->tensors[node->outputs->data[0]].dims->size > 0) {
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -836,9 +980,9 @@ class NNAPIDelegateKernel {
     }
 
     if (!nn_model_) {
-      ANeuralNetworksModel* model;
-      RETURN_TFLITE_ERROR_IF_NN_ERROR(context,
-                                      ANeuralNetworksModel_create(&model));
+      ANeuralNetworksModel* model = nullptr;
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksModel_create(&model));
       nn_model_.reset(model);
 
       TF_LITE_ENSURE_STATUS(
@@ -846,12 +990,17 @@ class NNAPIDelegateKernel {
     }
 
     if (!nn_compilation_) {
-      ANeuralNetworksCompilation* compilation;
+      ANeuralNetworksCompilation* compilation = nullptr;
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
-          context,
-          ANeuralNetworksCompilation_create(nn_model_.get(), &compilation));
-      RETURN_TFLITE_ERROR_IF_NN_ERROR(
-          context, ANeuralNetworksCompilation_finish(compilation));
+          context, nnapi_->ANeuralNetworksCompilation_create(nn_model_.get(),
+                                                             &compilation));
+      const int finish_result =
+          nnapi_->ANeuralNetworksCompilation_finish(compilation);
+      if (finish_result != ANEURALNETWORKS_NO_ERROR) {
+        nnapi_->ANeuralNetworksCompilation_free(compilation);
+        compilation = nullptr;
+      }
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(context, finish_result);
       nn_compilation_.reset(compilation);
     }
     return kTfLiteOk;
@@ -860,8 +1009,10 @@ class NNAPIDelegateKernel {
   TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
     ANeuralNetworksExecution* execution = nullptr;
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context,
-        ANeuralNetworksExecution_create(nn_compilation_.get(), &execution));
+        context, nnapi_->ANeuralNetworksExecution_create(nn_compilation_.get(),
+                                                         &execution));
+    std::unique_ptr<ANeuralNetworksExecution, NNFreeExecution>
+        execution_unique_ptr(execution);
 
     // Set the input tensor buffers. Note: we access tflite tensors using
     // absolute indices but NN api indices inputs by relative indices.
@@ -881,7 +1032,7 @@ class NNAPIDelegateKernel {
                tensor->data.raw, tensor->bytes);
         RETURN_TFLITE_ERROR_IF_NN_ERROR(
             context,
-            ANeuralNetworksExecution_setInputFromMemory(
+            nnapi_->ANeuralNetworksExecution_setInputFromMemory(
                 execution, relative_input_index, nullptr,
                 nn_input_memory_->get_handle(), input_offset, tensor->bytes));
         input_offset += tensor->bytes;
@@ -896,7 +1047,7 @@ class NNAPIDelegateKernel {
       TfLiteTensor* tensor = &context->tensors[output_index];
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
           context,
-          ANeuralNetworksExecution_setOutputFromMemory(
+          nnapi_->ANeuralNetworksExecution_setOutputFromMemory(
               execution, relative_output_index, nullptr,
               nn_output_memory_->get_handle(), output_offset, tensor->bytes));
       output_offset += tensor->bytes;
@@ -912,18 +1063,25 @@ class NNAPIDelegateKernel {
       // reading and writing into the same buffer during a invocation.
       // TODO(110369471): using double shared buffer to minimize the copies.
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
-          context, ANeuralNetworksExecution_setOutput(
+          context, nnapi_->ANeuralNetworksExecution_setOutput(
                        execution, relative_output_index, nullptr,
                        tensor->data.raw, tensor->bytes));
       relative_output_index++;
     }
     // Invoke ANN in blocking fashion.
-    ANeuralNetworksEvent* event = nullptr;
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context, ANeuralNetworksExecution_startCompute(execution, &event));
-    RETURN_TFLITE_ERROR_IF_NN_ERROR(context, ANeuralNetworksEvent_wait(event));
-    ANeuralNetworksEvent_free(event);
-    ANeuralNetworksExecution_free(execution);
+    if (nnapi_->android_sdk_version < kMinSdkVersionForNNAPI12) {
+      ANeuralNetworksEvent* event = nullptr;
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          nnapi_->ANeuralNetworksExecution_startCompute(execution, &event));
+      const int wait_result = nnapi_->ANeuralNetworksEvent_wait(event);
+      nnapi_->ANeuralNetworksEvent_free(event);
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(context, wait_result);
+    } else {
+      // Use synchronous execution for NNAPI 1.2+.
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksExecution_compute(execution));
+    }
 
     // copy results from shared memory to the destination.
     output_offset = 0;
@@ -938,6 +1096,8 @@ class NNAPIDelegateKernel {
   }
 
  private:
+  // Access to NNApi.
+  const NnApi* nnapi_;
   // ANN API state.
   std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
   std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
@@ -954,18 +1114,69 @@ class NNAPIDelegateKernel {
   std::unique_ptr<NNMemory> nn_input_memory_;
   std::unique_ptr<NNMemory> nn_output_memory_;
 
+  void AddDequantizeOperatorsWhereNeeded(const TfLiteContext* context,
+                                         int builtin_code,
+                                         const TfLiteNode* node,
+                                         NNAPIOpBuilder* builder) {
+    // Depending on the operator and the input data format, Dequantize
+    // operators may need to be added. For example when the input is
+    // floating-point but weights are quantized then the weights will first be
+    // dequantized to the same format as the input before being passed to the
+    // operator.
+
+    // The tensor determining whether the inputs should be floating-point.
+    int input_tensor_index = -1;
+    std::vector<int> inputs_to_potentially_dequantize;
+
+    switch (builtin_code) {
+      case kTfLiteBuiltinConv2d:
+      case kTfLiteBuiltinFullyConnected: {
+        input_tensor_index = 0;
+        // Weights and bias are inputs #1 and #2 respectively and may require
+        // dequantization.
+        inputs_to_potentially_dequantize = {1, 2};
+        break;
+      }
+      default:
+        return;
+    }
+
+    int tensor_id = node->inputs->data[input_tensor_index];
+    if (tensor_id < 0) return;
+
+    // Nothing to do if the input is not floating-point.
+    if (!IsFloat(context->tensors[tensor_id].type)) return;
+
+    for (int i : inputs_to_potentially_dequantize) {
+      tensor_id = node->inputs->data[i];
+      if (tensor_id < 0) continue;  // Ignore optional input.
+
+      const TfLiteType type = context->tensors[tensor_id].type;
+      // Nothing to do for this tensor if it's not quantized.
+      if (type != kTfLiteUInt8) continue;
+
+      // Insert Dequantize operator if it hasn't been done already and change
+      // the node's input accordingly.
+      builder->AddDequantize(i, node->inputs->data[i], type);
+    }
+  }
+
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
-    // The operand builder allows creating a single op. We create it at this
-    // reduced power position rather than in the for loop to avoid reallocating
-    // the vectors.
-    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
-    // Add Tensors
-    // allocate outside to avoid realloc
+    DequantizeMapping dequantize_mapping;
+    // The operand builder allows creating a single op. It is created outside
+    // the for loop to avoid reallocating the vectors.
+    NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_,
+                           &dequantize_mapping, nn_model_.get());
+    // Add Tensors.
     for (auto node_index : nodes_) {
       // Obtain the op and registration.
       TfLiteNode* node;
       TfLiteRegistration* reg;
-      context->GetNodeAndRegistration(context, node_index, &node, &reg);
+      TF_LITE_ENSURE_STATUS(
+          context->GetNodeAndRegistration(context, node_index, &node, &reg));
+
+      const bool hybrid_op = IsHybridOperator(context, reg->builtin_code, node);
+
       // Map inputs to NN API tensor indices.
       for (auto input_index : TfLiteIntArrayView(node->inputs)) {
         if (input_index == kOptionalTensor &&
@@ -977,18 +1188,24 @@ class NNAPIDelegateKernel {
           // tensor when supported by NNAPI.
           TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0));
         } else {
-          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
+          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op));
         }
       }
       // Get op type and operands
-      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
-          {context, &builder, node, &model_state_outputs_,
-           &model_state_tfl_inputs_});
+      int nn_op_type = Map(
+          context, reg->builtin_code, reg->version, nnapi_->android_sdk_version,
+          node)({context, &builder, node, &model_state_outputs_,
+                 &model_state_tfl_inputs_});
       // Map outputs to NN API tensor indices.
       for (auto output_index : TfLiteIntArrayView(node->outputs)) {
         TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
       }
 
+      // Dequantize operators may have to be added in case inputs are to be
+      // floating-point.
+      AddDequantizeOperatorsWhereNeeded(context, reg->builtin_code, node,
+                                        &builder);
+
       builder.FinalizeAddOperation(nn_op_type);
     }
     return kTfLiteOk;
@@ -1022,32 +1239,34 @@ class NNAPIDelegateKernel {
       total_output_byte_size += context->tensors[i].bytes;
     }
 
-    // Add state output tensors as model inputs
+    // Add state output tensors as model outputs.
     for (int i : model_state_outputs_) {
       outputs.push_back(i);
     }
 
     // Tell ANN to declare inputs/outputs
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context, ANeuralNetworksModel_identifyInputsAndOutputs(
+        context, nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs(
                      nn_model_.get(), inputs.size(), inputs.data(),
                      outputs.size(), outputs.data()));
 
     // Set relaxed computation mode for fp32 if possible.
-    if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+    if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) {
       RETURN_TFLITE_ERROR_IF_NN_ERROR(
-          context, ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-                       nn_model_.get(), context->allow_fp32_relax_to_fp16));
+          context,
+          nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+              nn_model_.get(), context->allow_fp32_relax_to_fp16));
     }
 
     // Finalize the model
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
-        context, ANeuralNetworksModel_finish(nn_model_.get()));
+        context, nnapi_->ANeuralNetworksModel_finish(nn_model_.get()));
 
     // Create shared memory pool for inputs and outputs.
-    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_input_memory_.reset(
+        new NNMemory(nnapi_, "input_pool", total_input_byte_size));
     nn_output_memory_.reset(
-        new NNMemory("output_pool", total_output_byte_size));
+        new NNMemory(nnapi_, "output_pool", total_output_byte_size));
 
     return kTfLiteOk;
   }
@@ -1059,14 +1278,26 @@ class NNAPIDelegateKernel {
 TfLiteDelegate* NnApiDelegate() {
   static TfLiteDelegate delegate = {
       .data_ = nullptr,
-      .flags = kTfLiteDelegateFlagsNone,
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         // Do not check nodes_ if NN API is unavailable.
-        if (kAndroidSdkVersion < kMinSdkVersionForNNAPI || !NNAPIExists()) {
+        const NnApi* nnapi = NnApiImplementation();
+        if (nnapi->android_sdk_version < kMinSdkVersionForNNAPI ||
+            !nnapi->nnapi_exists) {
           return kTfLiteOk;
         }
-
+        // For NNAPI 1.2+, check if there is any accelerator available.
+        // If not, don't delegate to NNAPI's CPU reference implementation.
+        if (nnapi->android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          uint32_t device_count = 0;
+          RETURN_TFLITE_ERROR_IF_NN_ERROR(
+              context, nnapi->ANeuralNetworks_getDeviceCount(&device_count));
+          // Any available accelerator will make the device_count larger than 1.
+          // More sophisticated check and whitelisting can be added later.
+          if (device_count <= 1) {
+            return kTfLiteOk;
+          }
+        }
         // Allocate one element in vector already since TensorFlow Lite uses
         // the first value as the number of nodes. The actual value will be set
         // later, after the vector has been filled.
@@ -1076,6 +1307,7 @@ TfLiteDelegate* NnApiDelegate() {
         TfLiteIntArray* plan;
         TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
 
+        int android_sdk_version = NnApiImplementation()->android_sdk_version;
         // Check for every node if it is supported
         // TODO(b/80625235): Fix this to do more careful checking of versioning.
         for (int node_index : TfLiteIntArrayView(plan)) {
@@ -1084,7 +1316,8 @@ TfLiteDelegate* NnApiDelegate() {
           TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
               context, node_index, &node, &registration));
           if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
-                                       registration->version, node)) {
+                                       registration->version,
+                                       android_sdk_version, node)) {
             supported_nodes.push_back(node_index);
           }
         }
@@ -1121,6 +1354,7 @@ TfLiteDelegate* NnApiDelegate() {
               return state->Invoke(context, node);
             },
 
+            .profiling_string = nullptr,
             .builtin_code = kTfLiteBuiltinDelegate,
         };
 
@@ -1130,7 +1364,13 @@ TfLiteDelegate* NnApiDelegate() {
             context, nnapi_delegate_kernel,
             reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
             delegate);
-      }};
+      },
+
+      .CopyFromBufferHandle = nullptr,
+      .CopyToBufferHandle = nullptr,
+      .FreeBufferHandle = nullptr,
+      .flags = kTfLiteDelegateFlagsNone,
+  };
 
   return &delegate;
 }
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 5da052eb42275d684bfbf83e7b52227ccbb97a06..69284578625dc6086a7549707186a824aff29137 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -49,6 +49,27 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
                                  const std::vector<int>& dims) {
     return interpreter_->ResizeInputTensor(tensor_index, dims);
   }
+
+ protected:
+  void SetData(int index, TensorType type, const std::vector<float>& data) {
+    switch (type) {
+      case TensorType_FLOAT32:
+        PopulateTensor(index, data);
+        break;
+      case TensorType_INT32:
+        QuantizeAndPopulate<int32_t>(index, data);
+        break;
+      case TensorType_UINT8:
+        QuantizeAndPopulate<uint8_t>(index, data);
+        break;
+      case TensorType_INT8:
+        QuantizeAndPopulate<int8_t>(index, data);
+        break;
+      default:
+        FAIL() << "Type not supported: " << type;
+        break;
+    }
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -225,14 +246,15 @@ TEST(NNAPIDelegate, L2PoolWithNoActivation) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
 }
 
-class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
+class ConvolutionOpModel : public SingleOpModelWithNNAPI {
  public:
-  BaseConvolutionOpModel(
+  ConvolutionOpModel(
       const TensorData& input, const TensorData& filter,
       const TensorData& output, int stride_width = 2, int stride_height = 2,
       enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE,
-      int dilation_width_factor = 1, int dilation_height_factor = 1) {
+      int dilation_width_factor = 1, int dilation_height_factor = 1)
+      : input_type_(input.type), filter_type_(filter.type) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -249,7 +271,8 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     }
 
     output_ = AddOutput(output);
-    if (input.type != TensorType_FLOAT32) {
+
+    if (input_type_ != TensorType_FLOAT32) {
       // The following is required by quantized inference. It is the unittest's
       // responsibility to make sure the output scale falls into the correct
       // range.
@@ -265,56 +288,53 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
- protected:
-  int input_;
-  int filter_;
-  int bias_;
-  int output_;
-};
-
-class ConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
-  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
   void SetInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    SetData(input_, input_type_, data);
   }
 
   void SetFilter(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(filter_, data);
+    SetData(filter_, filter_type_, data);
   }
 
   void SetBias(std::initializer_list<float> data) {
-    QuantizeAndPopulate<int32_t>(bias_, data);
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
-  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+  std::vector<float> GetOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return ExtractVector<float>(output_);
+    } else {
+      return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                                 GetScale(output_), GetZeroPoint(output_));
+    }
+  }
+
+  std::vector<uint8_t> GetQuantizedOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return {};  // Not supported.
+    } else {
+      return ExtractVector<uint8_t>(output_);
+    }
   }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+
+  const TensorType input_type_;
+  const TensorType filter_type_;
 };
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(NNAPIDelegate, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
-                                {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
-                                {TensorType_UINT8, {}, -127, 128});
+TEST(ConvolutionOpTest, SimpleTestQuantized) {
+  ConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+                       {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
+                       {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
       // First batch
       1, 1, 1, 1,  // row = 1
@@ -332,25 +352,55 @@ TEST(NNAPIDelegate, SimpleTestQuantized) {
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {
-                      18, 2, 5,  // first batch, left
-                      18, 2, 5,  // first batch, right
-                      17, 4, 3,  // second batch, left
-                      37, 4, 3,  // second batch, right
-                  },
-                  1e-5)));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 2, 5,  // first batch, left
+                                     18, 2, 5,  // first batch, right
+                                     17, 4, 3,  // second batch, left
+                                     37, 4, 3,  // second batch, right
+                                 },
+                                 1e-5)));
   // For good  measure, let's also verify the quantized values:
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 145, 129, 132,  //
-                                 145, 129, 132,  //
-                                 144, 131, 130,  //
-                                 164, 131, 130,  //
-                             }));
+  EXPECT_THAT(m.GetQuantizedOutput(), ElementsAreArray({
+                                          145, 129, 132,  //
+                                          145, 129, 132,  //
+                                          144, 131, 130,  //
+                                          164, 131, 130,  //
+                                      }));
 }
 
-TEST(NNAPIDelegate, Conv2DWithNoActivation) {
+TEST(ConvolutionOpTest, FloatInputQuantizedWeights) {
+  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+                       {TensorType_UINT8, {3, 2, 2, 1}, 0, 64},
+                       {TensorType_FLOAT32, {}});
+  m.SetInput({
+      // First batch
+      1, 1, 1, 2,  // row = 1
+      2, 2, 2, 1,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetFilter({
+      1, 2, 3, 4,  // first 2x2 filter
+      0, 1, 0, 1,  // second 2x2 filter
+      0, 0, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 5, 7,    // first batch, left
+                                     16, 5, 6,    // first batch, right
+                                     17, 6, 6,    // second batch, left
+                                     37, 10, 10,  // second batch, right
+                                 },
+                                 0.2)));
+}
+
+TEST(ConvolutionOpTest, NoActivation) {
   ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
@@ -458,56 +508,48 @@ TEST(NNAPIDelegate, DepthwiseConv2DWithNoActivation) {
                              }));
 }
 
-class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
+class FullyConnectedOpModel : public SingleOpModelWithNNAPI {
  public:
-  FloatFullyConnectedOpModel(int units, int batches, const TensorData& input,
-                             const TensorData& output = {TensorType_FLOAT32})
-      : batches_(batches), units_(units) {
-    int total_input_size = 1;
-    for (int i = 0; i < input.shape.size(); ++i) {
-      total_input_size *= input.shape[i];
-    }
-    input_size_ = total_input_size / batches_;
-
+  FullyConnectedOpModel(
+      const TensorData& input, const TensorData& weights,
+      const TensorData& output,
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE)
+      : input_type_(input.type), weights_type_(weights.type) {
     input_ = AddInput(input);
-    weights_ =
-        AddInput({input.type, {units_, input_size_}, input.min, input.max});
+    weights_ = AddInput(weights);
 
+    const int units = weights.shape[0];
     if (input.type == TensorType_FLOAT32) {
-      bias_ = AddInput({TensorType_FLOAT32, {units_}});
+      bias_ = AddInput({TensorType_FLOAT32, {units}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
       auto bias_scale = GetScale(input_) * GetScale(weights_);
-      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+      TensorData bias{TensorType_INT32, {units}, 0, 0, bias_scale};
       bias_ = AddInput(bias);
     }
 
     output_ = AddOutput(output);
 
-    SetBuiltinOp(
-        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
-        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
-            .Union());
+    SetBuiltinOp(BuiltinOperator_FULLY_CONNECTED,
+                 BuiltinOptions_FullyConnectedOptions,
+                 CreateFullyConnectedOptions(builder_, activation).Union());
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
-  int input_size() { return input_size_; }
-  int num_units() { return units_; }
-  int num_batches() { return batches_; }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetWeights(std::initializer_list<float> f) {
-    PopulateTensor(weights_, f);
+  void SetInput(std::initializer_list<float> data) {
+    SetData(input_, input_type_, data);
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  void SetWeights(std::initializer_list<float> data) {
+    SetData(weights_, weights_type_, data);
   }
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
+
+  void SetBias(std::initializer_list<float> data) {
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
@@ -518,14 +560,14 @@ class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
   int bias_;
   int output_;
 
-  int batches_;
-  int units_;
-  int input_size_;
+  const TensorType input_type_;
+  const TensorType weights_type_;
 };
 
-TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
-  FloatFullyConnectedOpModel m(/*units=*/3, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 10}});
+TEST(FullyConnectedOpTest, SimpleTest) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_FLOAT32, {3, 10}},
+                          /*output=*/{TensorType_FLOAT32});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -543,6 +585,28 @@ TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
+TEST(FullyConnectedOpTest, FloatInputQuantizedWeights) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_UINT8, {3, 10}, 0, 64},
+                          /*output=*/{TensorType_FLOAT32});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60}, 1.3)));
+}
+
 class SoftmaxOpModel : public SingleOpModelWithNNAPI {
  public:
   SoftmaxOpModel(int batches, int size, float beta)
@@ -955,8 +1019,9 @@ TEST(NNAPIDelegate, ConcatenationFourInputsQuantizedMixedRange) {
 
 class DequantizeOpModel : public SingleOpModelWithNNAPI {
  public:
-  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
-    input_ = AddInput({TensorType_UINT8, shape, min, max});
+  DequantizeOpModel(TensorType inputType, std::initializer_list<int> shape,
+                    float min, float max) {
+    input_ = AddInput({inputType, shape, min, max});
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
                  CreateDequantizeOptions(builder_).Union());
@@ -964,7 +1029,8 @@ class DequantizeOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter({GetShape(input_)});
   }
 
-  void SetInput(std::initializer_list<uint8_t> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
 
@@ -975,16 +1041,27 @@ class DequantizeOpModel : public SingleOpModelWithNNAPI {
   int output_;
 };
 
-TEST(NNAPIDelegate, DequantizeFourDimensional) {
-  DequantizeOpModel m({2, 5}, -63.5, 64);
+TEST(NNAPIDelegate, DequantizeFourDimensionalUint8) {
+  DequantizeOpModel m(TensorType_UINT8, {2, 5}, -63.5, 64);
 
-  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8_t>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
                   {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
 }
 
+TEST(NNAPIDelegate, DequantizeFourDimensionalInt8Symm) {
+  // [-64, 63.5] -> scale=0.5, zero_point=0 for INT8
+  DequantizeOpModel m(TensorType_INT8, {2, 5}, -64, 63.5);
+
+  m.SetInput<int8_t>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-64, -63.5, -63, -62.5, -62, 61.5, 62, 62.5, 63, 63.5})));
+}
+
 class FloorOpModel : public SingleOpModelWithNNAPI {
  public:
   FloorOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
@@ -1810,8 +1887,8 @@ static std::initializer_list<float> rnn_bias = {
 class RNNOpModel : public SingleOpModelWithNNAPI {
  public:
   RNNOpModel(int batches, int units, int size,
-             const TensorType& weights = TensorType_FLOAT32,
-             const TensorType& recurrent_weights = TensorType_FLOAT32)
+             const TensorType weights = TensorType_FLOAT32,
+             const TensorType recurrent_weights = TensorType_FLOAT32)
       : batches_(batches), units_(units), input_size_(size) {
     input_ = AddInput(TensorType_FLOAT32);
     weights_ = AddInput(weights);
@@ -2169,11 +2246,12 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
               bool use_peephole, bool use_projection_weights,
               bool use_projection_bias, float cell_clip, float proj_clip,
               const std::vector<std::vector<int>>& input_shapes,
-              const TensorType& weight_type = TensorType_FLOAT32)
+              const TensorType weight_type)
       : n_batch_(n_batch),
         n_input_(n_input),
         n_cell_(n_cell),
-        n_output_(n_output) {
+        n_output_(n_output),
+        weight_type_(weight_type) {
     input_ = AddInput(TensorType_FLOAT32);
 
     if (use_cifg) {
@@ -2246,69 +2324,71 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
-    PopulateTensor(input_to_input_weights_, f);
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    SetData(input_to_input_weights_, weight_type_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
-    PopulateTensor(input_to_forget_weights_, f);
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    SetData(input_to_forget_weights_, weight_type_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
-    PopulateTensor(input_to_cell_weights_, f);
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    SetData(input_to_cell_weights_, weight_type_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
-    PopulateTensor(input_to_output_weights_, f);
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    SetData(input_to_output_weights_, weight_type_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_input_weights_, f);
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    SetData(recurrent_to_input_weights_, weight_type_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_forget_weights_, f);
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    SetData(recurrent_to_forget_weights_, weight_type_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_cell_weights_, f);
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    SetData(recurrent_to_cell_weights_, weight_type_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_output_weights_, f);
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    SetData(recurrent_to_output_weights_, weight_type_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_input_weights_, f);
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    SetData(cell_to_input_weights_, weight_type_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_forget_weights_, f);
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    SetData(cell_to_forget_weights_, weight_type_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_output_weights_, f);
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    SetData(cell_to_output_weights_, weight_type_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
-    PopulateTensor(projection_weights_, f);
+  void SetProjectionWeights(const std::vector<float>& f) {
+    SetData(projection_weights_, weight_type_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -2358,6 +2438,9 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
   int n_input_;
   int n_cell_;
   int n_output_;
+
+ private:
+  const TensorType weight_type_;
 };
 
 class BaseLstmTest : public ::testing::Test {
@@ -2503,7 +2586,8 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
                        {0, 0},  // projection_weight tensor
                        {0},     // projection_bias tensor
-                   });
+                   },
+                   /*weight_type=*/TensorType_FLOAT32);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -2606,7 +2690,8 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
                        {0, 0},  // projection_weight tensor
                        {0},     // projection_bias tensor
-                   });
+                   },
+                   /*weight_type=*/TensorType_FLOAT32);
 
   lstm.SetInputToCellWeights(input_to_cell_weights_);
   lstm.SetInputToForgetWeights(input_to_forget_weights_);
@@ -3260,7 +3345,8 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
 
                        {n_output, n_cell},  // projection_weight tensor
                        {0},                 // projection_bias tensor
-                   });
+                   },
+                   /*weight_type=*/TensorType_FLOAT32);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -3291,7 +3377,7 @@ class BaseReduceOpModel : public SingleOpModelWithNNAPI {
   void SetAxis(const std::vector<int>& data) { PopulateTensor(axis_, data); }
 
   template <class T>
-  void SetInput(std::vector<T> data) {
+  void SetInput(const std::vector<T>& data) {
     PopulateTensor(input_, data);
   }
 
diff --git a/tensorflow/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
index b372afae190ded84947b45655018a78633715c16..d2bc9846af571af71d8d7cbdf1c985e3a24474f7 100644
--- a/tensorflow/lite/examples/android/app/build.gradle
+++ b/tensorflow/lite/examples/android/app/build.gradle
@@ -1,8 +1,16 @@
 apply plugin: 'com.android.application'
 
+// import DownloadModels task
+project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
+project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
+
+// Download default models; if you wish to use your own models then
+// place them in the "assets" directory and comment out this line.
+apply from: "download-models.gradle"
+
 android {
     compileSdkVersion 26
-    buildToolsVersion '27.0.3'
+    buildToolsVersion '28.0.3'
     defaultConfig {
         applicationId "org.tensorflow.lite.demo"
         minSdkVersion 15
@@ -36,14 +44,6 @@ repositories {
     }
 }
 
-// import DownloadModels task
-project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
-project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
-
-// Download default models; if you wish to use your own models then
-// place them in the "assets" directory and comment out this line.
-apply from: "download-models.gradle"
-
 dependencies {
     implementation fileTree(dir: 'libs', include: ['*.jar'])
     implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
diff --git a/tensorflow/lite/examples/android/app/download-models.gradle b/tensorflow/lite/examples/android/app/download-models.gradle
index 36bd177a1fd6bb21a27edd6d2b6e82fa7aa5d57b..514eeb013501a45ac78778e95df88c946bcc737b 100644
--- a/tensorflow/lite/examples/android/app/download-models.gradle
+++ b/tensorflow/lite/examples/android/app/download-models.gradle
@@ -67,9 +67,6 @@ task extractModels(type: Copy) {
     }
 }
 
-
-
-
 tasks.whenTaskAdded { task ->
     if (task.name == 'assembleDebug') {
         task.dependsOn 'extractModels'
diff --git a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
index bc9574d646b7661de8ac9b745bd53cbba1eb9f31..d4c98c61cca7ee1cb9f803bffb1966991b0368c2 100644
--- a/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
+++ b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
@@ -24,12 +24,7 @@
     <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
     <uses-permission android:name="android.permission.RECORD_AUDIO" />
 
-    <uses-sdk
-        android:minSdkVersion="21"
-        android:targetSdkVersion="23" />
-
     <application android:allowBackup="true"
-        android:debuggable="true"
         android:label="@string/app_name"
         android:icon="@drawable/ic_launcher"
         android:theme="@style/MaterialTheme">
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
index 7c038ddd46418b6498251068a284e8ffcdeda96f..90b8f0f3badf1be86588b2236bb37e83676602b8 100644
--- a/tensorflow/lite/examples/android/build.gradle
+++ b/tensorflow/lite/examples/android/build.gradle
@@ -6,10 +6,7 @@ buildscript {
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:3.1.4'
-
-        // NOTE: Do not place your application dependencies here; they belong
-        // in the individual module build.gradle files
+        classpath 'com.android.tools.build:gradle:3.2.1'
     }
 }
 
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 4f6fcaa96c4b917b79dacc5180594c1458ef18ff..e5c230e65e9e53a8496c4a4f0de49a7677acff81 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -23,12 +23,6 @@
 #include <iostream>
 #include <queue>
 
-#if TFLITE_USE_CONTRIB_LITE
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#else
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/op_resolver.h"
@@ -36,7 +30,6 @@
 #if TFLITE_USE_GPU_DELEGATE
 #include "tensorflow/lite/delegates/gpu/metal_delegate.h"
 #endif
-#endif
 
 #define LOG(x) std::cerr
 
diff --git a/tensorflow/lite/examples/ios/camera/Podfile b/tensorflow/lite/examples/ios/camera/Podfile
index 2e15cc63decb30eb2b8c9bffab3b5d1bff10e9b3..0bff676159c22d8cf9419d3d79437b63103b7d06 100644
--- a/tensorflow/lite/examples/ios/camera/Podfile
+++ b/tensorflow/lite/examples/ios/camera/Podfile
@@ -9,5 +9,5 @@ target 'tflite_camera_example'
   # Note: TFLite GPU Delegate binary isn't releast yet, and we're working
   # on it.
 
-  pod 'TensorFlowLite', '1.12.0'
+  pod 'TensorFlowLite', '1.13.1'
   # pod 'TensorFlowLiteGpuExperimental', '0.0.1'
diff --git a/tensorflow/lite/examples/ios/simple/Podfile b/tensorflow/lite/examples/ios/simple/Podfile
index 931b72c1f5e946e8be61ac6dec3c6106a75b9685..d9e6a6586bf80f332192a25a74a91abdf79046a5 100644
--- a/tensorflow/lite/examples/ios/simple/Podfile
+++ b/tensorflow/lite/examples/ios/simple/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_simple_example'
-       pod 'TensorFlowLite', '1.12.0'
+       pod 'TensorFlowLite', '1.13.1'
diff --git a/tensorflow/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
index 32da7f7e4fce5cafc3c4746e5847315172542fc9..b90f2d3cfd05a57460149a1f7868c01f32b28cd1 100644
--- a/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
@@ -22,10 +22,10 @@
 #include <sstream>
 #include <string>
 
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
 
 #include "ios_image_load.h"
 
diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD
index 4fc8648d46c4bdefe3865381a23f4d73c87c284b..75e1add60198425694356f326862f132e8e6e70e 100644
--- a/tensorflow/lite/examples/label_image/BUILD
+++ b/tensorflow/lite/examples/label_image/BUILD
@@ -5,14 +5,13 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
 
 exports_files(glob([
     "testdata/*.bmp",
 ]))
 
-tf_cc_binary(
+cc_binary(
     name = "label_image",
     srcs = [
         "get_top_n.h",
diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc
index b8dc2840dfb49f8c067fbd2bf09432f7b06d6265..340fbab5c6fcc960685bc4581b285b0610fa22df 100644
--- a/tensorflow/lite/examples/label_image/label_image.cc
+++ b/tensorflow/lite/examples/label_image/label_image.cc
@@ -113,6 +113,7 @@ void RunInference(Settings* s) {
   }
 
   interpreter->UseNNAPI(s->accel);
+  interpreter->SetAllowFp16PrecisionForFp32(s->allow_fp16);
 
   if (s->verbose) {
     LOG(INFO) << "tensors size: " << interpreter->tensors_size() << "\n";
@@ -251,19 +252,21 @@ void RunInference(Settings* s) {
 }
 
 void display_usage() {
-  LOG(INFO) << "label_image\n"
-            << "--accelerated, -a: [0|1], use Android NNAPI or not\n"
-            << "--count, -c: loop interpreter->Invoke() for certain times\n"
-            << "--input_mean, -b: input mean\n"
-            << "--input_std, -s: input standard deviation\n"
-            << "--image, -i: image_name.bmp\n"
-            << "--labels, -l: labels for the model\n"
-            << "--tflite_model, -m: model_name.tflite\n"
-            << "--profiling, -p: [0|1], profiling or not\n"
-            << "--num_results, -r: number of results to show\n"
-            << "--threads, -t: number of threads\n"
-            << "--verbose, -v: [0|1] print more information\n"
-            << "\n";
+  LOG(INFO)
+      << "label_image\n"
+      << "--accelerated, -a: [0|1], use Android NNAPI or not\n"
+      << "--allow_fp16, -f: [0|1], allow running fp32 models with fp16 not\n"
+      << "--count, -c: loop interpreter->Invoke() for certain times\n"
+      << "--input_mean, -b: input mean\n"
+      << "--input_std, -s: input standard deviation\n"
+      << "--image, -i: image_name.bmp\n"
+      << "--labels, -l: labels for the model\n"
+      << "--tflite_model, -m: model_name.tflite\n"
+      << "--profiling, -p: [0|1], profiling or not\n"
+      << "--num_results, -r: number of results to show\n"
+      << "--threads, -t: number of threads\n"
+      << "--verbose, -v: [0|1] print more information\n"
+      << "\n";
 }
 
 int Main(int argc, char** argv) {
@@ -273,6 +276,7 @@ int Main(int argc, char** argv) {
   while (1) {
     static struct option long_options[] = {
         {"accelerated", required_argument, nullptr, 'a'},
+        {"allow_fp16", required_argument, nullptr, 'f'},
         {"count", required_argument, nullptr, 'c'},
         {"verbose", required_argument, nullptr, 'v'},
         {"image", required_argument, nullptr, 'i'},
@@ -305,6 +309,10 @@ int Main(int argc, char** argv) {
         s.loop_count =
             strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
         break;
+      case 'f':
+        s.allow_fp16 =
+            strtol(optarg, nullptr, 10);  // NOLINT(runtime/deprecated_fn)
+        break;
       case 'i':
         s.input_bmp_name = optarg;
         break;
diff --git a/tensorflow/lite/examples/label_image/label_image.h b/tensorflow/lite/examples/label_image/label_image.h
index 88b047fecc4b3efd10ef025193a79516516c03f1..cc46e56b64a9dc848bba1dafed373375a97dcfe7 100644
--- a/tensorflow/lite/examples/label_image/label_image.h
+++ b/tensorflow/lite/examples/label_image/label_image.h
@@ -26,6 +26,7 @@ struct Settings {
   bool accel = false;
   bool input_floating = false;
   bool profiling = false;
+  bool allow_fp16 = false;
   int loop_count = 1;
   float input_mean = 127.5f;
   float input_std = 127.5f;
diff --git a/tensorflow/lite/examples/minimal/BUILD b/tensorflow/lite/examples/minimal/BUILD
index cdd67af1e93661c1f65cc46d9b687acc1fa56fe8..498dbb9693e74545546f849a246630dc16ab7322 100644
--- a/tensorflow/lite/examples/minimal/BUILD
+++ b/tensorflow/lite/examples/minimal/BUILD
@@ -5,10 +5,9 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
 
-tf_cc_binary(
+cc_binary(
     name = "minimal",
     srcs = [
         "minimal.cc",
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
index cde53e283830aca9c7990e3d8c4901f997621bc2..aef39db02140389c0d0bcbde5ae4d2d89059c2a6 100644
--- a/tensorflow/lite/experimental/c/BUILD
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -19,16 +19,14 @@ load(
 tflite_cc_shared_object(
     name = "libtensorflowlite_c.so",
     linkopts = select({
-        "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
+        "//tensorflow:macos": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/lite/experimental/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/lite/experimental/c:version_script.lds)",
         ],
     }),
     deps = [
@@ -66,7 +64,6 @@ cc_library(
         ":c_api_internal",
         "//tensorflow/lite:context",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/kernels:builtin_ops",
     ],
 )
@@ -94,7 +91,6 @@ cc_test(
     deps = [
         ":c_api",
         "//tensorflow/lite:context",
-        "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
index f39673c028d5924359c5058cbc4a72e2f28394b0..27efa442348c997197becb6a8db603718a158f66 100644
--- a/tensorflow/lite/experimental/examples/lstm/BUILD
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -5,27 +5,29 @@ package(default_visibility = ["//tensorflow:internal"])
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
-    name = "tflite_lstm",
-    srcs = ["tflite_lstm.py"],
+    name = "rnn",
+    srcs = ["rnn.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/python:lite",
+        "//tensorflow/lite/python:op_hint",
         "//tensorflow/python:framework",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
         "@six_archive//:six",
     ],
 )
 
 py_library(
-    name = "tflite_rnn",
-    srcs = ["tflite_rnn.py"],
+    name = "rnn_cell",
+    srcs = ["rnn_cell.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/python:lite",
+        "//tensorflow/lite/python:op_hint",
         "//tensorflow/python:framework",
+        "//tensorflow/python:rnn_cell",
         "@six_archive//:six",
     ],
 )
@@ -40,13 +42,14 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_lstm",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform",
-        "//tensorflow/python/tools:optimize_for_inference",
+        "//tensorflow/python/tools:optimize_for_inference_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -62,14 +65,71 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_rnn",
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python/tools:optimize_for_inference",
+        "//tensorflow/python/tools:optimize_for_inference_main_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
+
+py_test(
+    name = "bidirectional_sequence_lstm_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_lstm_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference_main_lib",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference_main_lib",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "tflite_lstm_ops",
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+    ],
+)
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b71d8c22b23952021e7328c275a16e48987147ad
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    tf.reset_default_graph()
+    # Import MNIST dataset
+    self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
+
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Lstm Units.
+    self.num_units = 16
+
+  def buildLstmLayer(self):
+    return tf.keras.layers.StackedRNNCells([
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units // 2,
+            use_peepholes=True,
+            num_proj=8,
+            forget_bias=0,
+            name="rnn3"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=0, name="rnn4")
+    ])
+
+  def buildModel(self, fw_lstm_layer, bw_lstm_layer, is_dynamic_rnn):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    if is_dynamic_rnn:
+      lstm_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_lstm_layer,
+          bw_lstm_layer,
+          lstm_inputs,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      lstm_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_lstm_layer, bw_lstm_layer, lstm_input, dtype="float32")
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, fw_lstm_layer, bw_lstm_layer, sess, saver,
+                          is_dynamic_rnn):
+    model_dir = tempfile.mkdtemp()
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(fw_lstm_layer, bw_lstm_layer,
+                                                  is_dynamic_rnn)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    try:
+      interpreter.allocate_tensors()
+    except ValueError:
+      assert False
+
+    input_index = (interpreter.get_input_details()[0]["index"])
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = (interpreter.get_output_details()[0]["index"])
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer(), False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), self.buildLstmLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer(), True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(),
+        self.buildLstmLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a0a86b731ad29e890638e9668d4654571aa3487
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
@@ -0,0 +1,305 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow import flags
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+FLAGS = flags.FLAGS
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(BidirectionalSequenceRnnTest, self).__init__(*args, **kwargs)
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Rnn Units.
+    self.num_units = 16
+
+  def setUp(self):
+    super(BidirectionalSequenceRnnTest, self).setUp()
+    # Import MNIST dataset
+    data_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
+
+  def buildRnnLayer(self):
+    return tf.keras.layers.StackedRNNCells([
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
+    ])
+
+  def buildModel(self,
+                 fw_rnn_layer,
+                 bw_rnn_layer,
+                 is_dynamic_rnn,
+                 is_inference,
+                 use_sequence_length=False):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    batch_size = self.batch_size
+    if is_inference:
+      batch_size = 1
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [batch_size, self.time_steps, self.n_input],
+        name="INPUT_IMAGE")
+
+    sequence_length = None
+    if use_sequence_length:
+      sequence_length = [self.time_steps] * batch_size
+    if is_dynamic_rnn:
+      rnn_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_rnn_layer,
+          bw_rnn_layer,
+          rnn_inputs,
+          sequence_length,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      rnn_inputs = tf.unstack(x, self.time_steps, 1)
+      # Sequence length is not supported for static since we don't have a
+      # wrapper for it. At training phase, we can still have sequence_length,
+      # but inference phase, we change it to None.
+      if is_inference:
+        sequence_length = None
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_rnn_layer,
+          bw_rnn_layer,
+          rnn_inputs,
+          dtype="float32",
+          sequence_length=sequence_length)
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self,
+                          fw_rnn_layer,
+                          bw_rnn_layer,
+                          sess,
+                          saver,
+                          is_dynamic_rnn,
+                          use_sequence_length=False):
+    model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(
+        fw_rnn_layer, bw_rnn_layer, is_dynamic_rnn, True, use_sequence_length)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    interpreter.allocate_tensors()
+
+    input_index = interpreter.get_input_details()[0]["index"]
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = interpreter.get_output_details()[0]["index"]
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), False, is_inference=False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  def testStaticRnnMultiRnnCellWithSequenceLength(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        False,
+        is_inference=False,
+        use_sequence_length=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        sess,
+        saver,
+        False,
+        use_sequence_length=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), True, is_inference=False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCellWithSequenceLength(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        True,
+        is_inference=False,
+        use_sequence_length=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True,
+        use_sequence_length=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/g3doc/README.md b/tensorflow/lite/experimental/examples/lstm/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfe2d0d153d6042680857c1ad7efe4892a2a5a73
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/g3doc/README.md
@@ -0,0 +1,394 @@
+# TensorFlow Lite LSTM ops API
+
+TensorFlow Lite LSTM ops help developers deploy LSTM models to TensorFlow Lite.
+This is currently an experimental API, it's likely to change in future.
+
+## Introduction
+
+LSTM ops in TensorFlow Lite realm are expressed as "fused ops" (e.g.,
+UnidirectionalSequenceRNN, BidirectionalSequenceLSTM, etc.). However, in
+TensorFlow, LSTM ops are expressed as a "cell" (e.g., `tf.nn.rnn_cell.LSTMCell`,
+`tf.nn.rnn_cell.BasicRNNCell`, etc., and they all contain multiple TensorFlow
+ops) and a "rnn" ( e.g., `tf.nn.static_rnn`,
+`tf.nn.bidirectional_dynamic_rnn`).
+
+The ops breakdown in TensorFlow gives us flexibility while the "fused op" in
+TensorFlow Lite gives us performance boost.
+
+See the difference between TensorFlow LSTM and TensorFlow Lite LSTM.
+
+##### TensorFlow LSTM op ("cell")
+
+![TensorFlow LSTM op](./images/tf_lstm.png)
+
+##### TensorFlow Lite LSTM op ("fused ops")
+
+![TensorFlow Lite LSTM op](./images/tflite_lstm.png)
+
+The TensorFlow LSTM figure is credited to this
+[blog](https://colah.github.io/posts/2015-08-Understanding-LSTMs/).
+
+## How to use
+
+To use TensorFlow Lite LSTM ops is actually pretty simple.
+
+### 1) Training & Evaluation.
+
+First step is replacing `tf.nn.rnn_cell.LSTMCell` with
+`tf.lite.experimental.nn.TFLiteLSTMCell` in training phase, and replacing
+`tf.nn.rnn.dynamic_rnn` with `tf.lite.experimental.nn.dynamic_rnn`, if you are
+using dynamic_rnn. Note you don't need to change if you're using static_rnn.
+
+Both `tf.lite.experimental.nn.TFLiteLSTMCell` &
+`tf.lite.experimental.nn.dynamic_rnn` are just normal `tf.nn.rnn_cell.LSTMCell`
+and `tf.nn.rnn.dynamic_rnn` with OpHinted nodes in it to help the graph
+transformation.
+
+Then you can train and export the model as usual.
+
+### 2) Export for TensorFlow Lite inference.
+
+When you want to convert to TensorFlow Lite model, here's one simple step you
+need to do for your frozen graph:
+
+```python
+with tf.Session() as sess:
+  ophinted_graph = tf.lite.experimental.convert_op_hints_to_stubs(session=sess)
+```
+
+Then you can convert the model to TensorFlow Lite model as usual.
+
+```python
+converter = tf.lite.TFLiteConverter(ophinted_graph, [INPUTS], [OUTPUTS])
+converter.post_training_quantize = True  # If post training quantize is desired.
+tflite_model = converter.convert()  # You got a tflite model!
+```
+
+#### Simple example diff for using original TF code VS. TensorFlow Lite code:
+
+```python
+@@ -56,7 +56,7 @@ class MnistLstmModel(object):
+     for _ in range(self.num_lstm_layer):
+       lstm_layers.append(
+           # Note here, we use `tf.lite.experimental.nn.TFLiteLSTMCell`.
+-          tf.nn.rnn_cell.LSTMCell(
++          tf.lite.experimental.nn.TFLiteLSTMCell(
+               self.num_lstm_units, forget_bias=0))
+     # Weights and biases for output softmax layer.
+     out_weights = tf.Variable(tf.random_normal([self.units, self.num_class]))
+@@ -67,7 +67,7 @@ class MnistLstmModel(object):
+     lstm_cells = tf.nn.rnn_cell.MultiRNNCell(lstm_layers)
+     # Note here, we use `tf.lite.experimental.nn.dynamic_rnn` and `time_major`
+     # is set to True.
+-    outputs, _ = tf.nn.dynamic_rnn(
++    outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+         lstm_cells, lstm_inputs, dtype='float32', time_major=True)
+ 
+     # Transpose the outputs back to [batch, time, output]
+@@ -154,7 +154,9 @@ def export(model, model_dir, tflite_model_file,
+       sess, sess.graph_def, [output_class.op.name])
+ 
+   # Convert ophinted lstm ops to tflite UnidirectionalSequenceLstm ops.
+-  converted_graph = tf.graph_util.remove_training_nodes(frozen_graph)
++  converted_graph = tf.lite.experimental.convert_op_hints_to_stubs(
++      graph_def=frozen_graph)
++  converted_graph = tf.graph_util.remove_training_nodes(converted_graph)
+   converter = tf.lite.TFLiteConverter(converted_graph, [x], [output_class])
+   converter.post_training_quantize = use_post_training_quantize
+   tflite = converter.convert()
+```
+
+## Why introduce another set of LSTM APIs?
+
+Bridging TensorFlow LSTM and TensorFlow Lite is not easy, and the use of
+`dynamic_rnn` adds additional complexity (as the while loop is introduced).
+With the help of
+[OpHint](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/op_hint.py)
+(also see the next section), we create special wrappers around `rnn_cell` and
+`rnn` to help us identify the inputs and outputs of the LSTM ops, and these
+ops are converted to a single fused LSTM op when converting TensorFlow models
+to TensorFlow Lite format.
+
+### What's OpHint
+
+`OpHint` is essentially `Identity` op that is inserted after input tensors and
+output tensors to "hint" the customized op boundary, see the following figure.
+
+##### Ophinted Customized Graph
+
+Let's say we have a "customized conv" op which is a normal conv2d op with a bias
+add op followed by an activation op (graph on the letf), we use `OpHint` to
+track down all the inputs and output, during the graph transformation phase
+(done by `tf.lite.experimental.convert_op_hints_to_stubs`), the conv2d op, bias
+add op and the activation op will become a "my customized conv" op (see the
+graph on the right), and all the "OpHinted" tensors will become the
+inputs/outputs of the "my customized conv" op.
+
+![Ophinted Customized Graph](./images/op_hint.png)
+
+
+## Simple Tutorial
+
+The following tutorial uses MNIST dataset to build a simple two-layer LSTM model
+and convert to quantized TensorFlow Lite model.
+
+Note since we will be using dynamic_rnn, we need to turn on `control_flow_v2`.
+
+### 0. Turn on `control_flow_v2`.
+
+```python
+# Note this needs to happen before import tensorflow.
+import os
+os.environ['TF_ENABLE_CONTROL_FLOW_V2'] = '1'
+```
+
+### 1. Build the model.
+
+```python
+class MnistLstmModel(object):
+  """Build a simple LSTM based MNIST model.
+
+  Attributes:
+    time_steps: The maximum length of the time_steps, but since we're just using
+      the 'width' dimension as time_steps, it's actually a fixed number.
+    input_size: The LSTM layer input size.
+    num_lstm_layer: Number of LSTM layers for the stacked LSTM cell case.
+    num_lstm_units: Number of units in the LSTM cell.
+    units: The units for the last layer.
+    num_class: Number of classes to predict.
+  """
+
+  def __init__(self, time_steps, input_size, num_lstm_layer, num_lstm_units,
+               units, num_class):
+    self.time_steps = time_steps
+    self.input_size = input_size
+    self.num_lstm_layer = num_lstm_layer
+    self.num_lstm_units = num_lstm_units
+    self.units = units
+    self.num_class = num_class
+
+  def build_model(self):
+    """Build the model using the given configs.
+
+    Returns:
+      x: The input placehoder tensor.
+      logits: The logits of the output.
+      output_class: The prediction.
+    """
+    x = tf.placeholder(
+        'float32', [None, self.time_steps, self.input_size], name='INPUT')
+    lstm_layers = []
+    for _ in range(self.num_lstm_layer):
+      lstm_layers.append(
+          # Important:
+          #
+          # Note here, we use `tf.lite.experimental.nn.TFLiteLSTMCell`
+          # (OpHinted LSTMCell).
+          tf.lite.experimental.nn.TFLiteLSTMCell(
+              self.num_lstm_units, forget_bias=0))
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(tf.random_normal([self.units, self.num_class]))
+    out_bias = tf.Variable(tf.zeros([self.num_class]))
+
+    # Transpose input x to make it time major.
+    lstm_inputs = tf.transpose(x, perm=[1, 0, 2])
+    lstm_cells = tf.keras.layers.StackedRNNCells(lstm_layers)
+    # Important:
+    #
+    # Note here, we use `tf.lite.experimental.nn.dynamic_rnn` and `time_major`
+    # is set to True.
+    outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+        lstm_cells, lstm_inputs, dtype='float32', time_major=True)
+
+    # Transpose the outputs back to [batch, time, output]
+    outputs = tf.transpose(outputs, perm=[1, 0, 2])
+    outputs = tf.unstack(outputs, axis=1)
+    logits = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(logits, name='OUTPUT_CLASS')
+
+    return x, logits, output_class
+```
+
+### 2. Let's define the train & eval function.
+
+```python
+def train(model,
+          model_dir,
+          batch_size=20,
+          learning_rate=0.001,
+          train_steps=2000,
+          eval_steps=500,
+          save_every_n_steps=1000):
+  """Train & save the MNIST recognition model."""
+  # Train & test dataset.
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  train_iterator = train_dataset.shuffle(
+      buffer_size=1000).batch(batch_size).repeat().make_one_shot_iterator()
+  x, logits, output_class = model.build_model()
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  test_iterator = test_dataset.batch(
+      batch_size).repeat().make_one_shot_iterator()
+  # input label placeholder
+  y = tf.placeholder(tf.int32, [
+      None,
+  ])
+  one_hot_labels = tf.one_hot(y, depth=model.num_class)
+  # Loss function
+  loss = tf.reduce_mean(
+      tf.nn.softmax_cross_entropy_with_logits(
+          logits=logits, labels=one_hot_labels))
+  correct = tf.nn.in_top_k(output_class, y, 1)
+  accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+  # Optimization
+  opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
+
+  # Initialize variables
+  init = tf.global_variables_initializer()
+  saver = tf.train.Saver()
+  batch_x, batch_y = train_iterator.get_next()
+  batch_test_x, batch_test_y = test_iterator.get_next()
+  with tf.Session() as sess:
+    sess.run([init])
+    for i in range(train_steps):
+      batch_x_value, batch_y_value = sess.run([batch_x, batch_y])
+      _, loss_value = sess.run([opt, loss],
+                               feed_dict={
+                                   x: batch_x_value,
+                                   y: batch_y_value
+                               })
+      if i % 100 == 0:
+        tf.logging.info('Training step %d, loss is %f' % (i, loss_value))
+      if i > 0 and i % save_every_n_steps == 0:
+        accuracy_sum = 0.0
+        for _ in range(eval_steps):
+          test_x_value, test_y_value = sess.run([batch_test_x, batch_test_y])
+          accuracy_value = sess.run(
+              accuracy, feed_dict={
+                  x: test_x_value,
+                  y: test_y_value
+              })
+          accuracy_sum += accuracy_value
+        tf.logging.info('Training step %d, accuracy is %f' %
+                        (i, accuracy_sum / (eval_steps * 1.0)))
+        saver.save(sess, model_dir)
+```
+
+### 3. Let's define the export to TensorFlow Lite model function.
+
+```python
+def export(model, model_dir, tflite_model_file,
+           use_post_training_quantize=True):
+  """Export trained model to tflite model."""
+  tf.reset_default_graph()
+  x, _, output_class = model.build_model()
+  saver = tf.train.Saver()
+  sess = tf.Session()
+  saver.restore(sess, model_dir)
+  # Freeze the graph.
+  frozen_graph = tf.graph_util.convert_variables_to_constants(
+      sess, sess.graph_def, [output_class.op.name])
+  
+  # Important:
+  #
+  # Convert ophinted lstm ops to tflite UnidirectionalSequenceLstm ops.
+  converted_graph =
+      tf.lite.experimental.convert_op_hints_to_stubs(graph_def=frozen_graph)
+  converted_graph = tf.graph_util.remove_training_nodes(converted_graph)
+  converter = tf.lite.TFLiteConverter(converted_graph, [x], [output_class])
+  converter.post_training_quantize = use_post_training_quantize
+  tflite = converter.convert()
+  with open(tflite_model_file, 'w') as f:
+    f.write(tflite)
+```
+
+### 4. Hook everything together.
+
+```python
+def train_and_export(parsed_flags):
+  """Train the MNIST LSTM model and export to TfLite."""
+  model = MnistLstmModel(
+      time_steps=28,
+      input_size=28,
+      num_lstm_layer=2,
+      num_lstm_units=64,
+      units=64,
+      num_class=10)
+  tf.logging.info('Starts training...')
+  train(model, parsed_flags.model_dir)
+  tf.logging.info('Finished training, starts exporting to tflite to %s ...' %
+                  parsed_flags.tflite_model_file)
+  export(model, parsed_flags.model_dir, parsed_flags.tflite_model_file,
+         parsed_flags.use_post_training_quantize)
+  tf.logging.info(
+      'Finished exporting, model is %s' % parsed_flags.tflite_model_file)
+
+
+def run_main(_):
+  """Main in the TfLite LSTM tutorial."""
+  parser = argparse.ArgumentParser(
+      description=('Train a MNIST recognition model then export to TfLite.'))
+  parser.add_argument(
+      '--model_dir',
+      type=str,
+      help='Directory where the models will store.',
+      required=True)
+  parser.add_argument(
+      '--tflite_model_file',
+      type=str,
+      help='Full filepath to the exported tflite model file.',
+      required=True)
+  parser.add_argument(
+      '--use_post_training_quantize',
+      action='store_true',
+      default=True,
+      help='Whether or not to use post_training_quatize.')
+  parsed_flags, _ = parser.parse_known_args()
+  train_and_export(parsed_flags)
+
+
+def main():
+  app.run(main=run_main, argv=sys.argv[:1])
+
+
+if __name__ == '__main__':
+  main()
+
+```
+
+### 5. Visualize the exported TensorFlow Lite model.
+
+Let's go to where the TensorFlow Lite model is exported and use
+[Netron](https://github.com/lutzroeder/netron) to visualize the graph.
+
+See below.
+
+##### Exported TensorFlow Lite Model.
+
+![Exported TensorFlow Lite Model](./images/exported_tflite_model.png)
+
+## Caveat
+
+*   Currently, `tf.lite.experimental.nn.dynamic_rnn` &
+    `tf.lite.experimental.nn.bidirectional_dynamic_rnn` only supports
+    `control_flow_v2`, you can this on by setting the environment variable
+    `TF_ENABLE_CONTROL_FLOW_V2=1`, see in the tutorial.
+*   Currently, `sequence_length` is not supported, prefer to set it to None.
+*   `num_unit_shards` & `num_proj_shards` in LSTMCell are not supported as
+    well.
+*   Currently, `tf.lite.experimental.nn.dynamic_rnn` &
+    `tf.lite.experimental.nn.bidirectional_dynamic_rnn` only takes
+    `time_major=True`.
+*   The behavior of `tf.lite.experimental.nn.bidirectional_dynamic_rnn` is a
+    wrapper around `tf.nn.bidirectional_dynamic_rnn`, not
+    `tf.contrib.rnn.stack_bidirectional_dynamic_rnn`.
+*   For bidirectional_rnn cases, make sure you include all the op_hinted nodes
+    before freeze the graph. See below:
+
+```python
+all_output_nodes = [OUTPUT_NODES]
+with tf.Session() as sess
+  all_output_nodes += tf.lite.find_all_hinted_output_nodes(sess)
+  frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, all_output_nodes)
+```
diff --git a/tensorflow/lite/experimental/examples/lstm/g3doc/images/exported_tflite_model.png b/tensorflow/lite/experimental/examples/lstm/g3doc/images/exported_tflite_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..0d489d7602e6c579555734e252269c16b4a4e4da
Binary files /dev/null and b/tensorflow/lite/experimental/examples/lstm/g3doc/images/exported_tflite_model.png differ
diff --git a/tensorflow/lite/experimental/examples/lstm/g3doc/images/op_hint.png b/tensorflow/lite/experimental/examples/lstm/g3doc/images/op_hint.png
new file mode 100644
index 0000000000000000000000000000000000000000..583d4869b6d63e576085b601d071f7686a868d54
Binary files /dev/null and b/tensorflow/lite/experimental/examples/lstm/g3doc/images/op_hint.png differ
diff --git a/tensorflow/lite/experimental/examples/lstm/g3doc/images/tf_lstm.png b/tensorflow/lite/experimental/examples/lstm/g3doc/images/tf_lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..e962a3c720781e37949a0d654e11dffff1b6803a
Binary files /dev/null and b/tensorflow/lite/experimental/examples/lstm/g3doc/images/tf_lstm.png differ
diff --git a/tensorflow/lite/experimental/examples/lstm/g3doc/images/tflite_lstm.png b/tensorflow/lite/experimental/examples/lstm/g3doc/images/tflite_lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f6befbb581cc85f99ca676ab65e3c0294e2e8dd
Binary files /dev/null and b/tensorflow/lite/experimental/examples/lstm/g3doc/images/tflite_lstm.png differ
diff --git a/tensorflow/lite/experimental/examples/lstm/rnn.py b/tensorflow/lite/experimental/examples/lstm/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e005d81e4051537e8bed2589005f1637251b301d
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/rnn.py
@@ -0,0 +1,429 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TfLite LSTMCell wrapper.
+
+TODO(renjieliu): Find a better home for this one.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.lite.python.op_hint as op_hint
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.rnn import _best_effort_input_batch_size
+from tensorflow.python.ops.rnn import _dynamic_rnn_loop
+from tensorflow.python.ops.rnn import _should_cache
+from tensorflow.python.ops.rnn import _transpose_batch_time
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("lite.experimental.nn.dynamic_rnn")
+def dynamic_rnn(cell,
+                inputs,
+                sequence_length=None,
+                initial_state=None,
+                dtype=None,
+                parallel_iterations=None,
+                swap_memory=False,
+                time_major=True,
+                scope=None):
+  """Creates a recurrent neural network specified by RNNCell `cell`.
+
+  Performs fully dynamic unrolling of `inputs`.
+
+  Example:
+
+  ```python
+  # create a BasicRNNCell
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
+
+  # defining initial state
+  initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
+
+  # 'state' is a tensor of shape [batch_size, cell_state_size]
+  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
+                                     initial_state=initial_state,
+                                     dtype=tf.float32)
+  ```
+
+  ```python
+  # create 2 LSTMCells
+  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
+
+  # create a RNN cell composed sequentially of a number of RNNCells
+  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, 256]
+  # 'state' is a N-tuple where N is the number of LSTMCells containing a
+  # tf.contrib.rnn.LSTMStateTuple for each cell
+  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
+                                     inputs=data,
+                                     dtype=tf.float32)
+  ```
+
+
+  Args:
+    cell: An instance of RNNCell.
+    inputs: The RNN inputs.
+      If `time_major == False` (default), this must be a `Tensor` of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If `time_major == True`, this must be a `Tensor` of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements. This may also be
+        a (possibly nested) tuple of Tensors satisfying this property.  The
+        first two dimensions must match across all the inputs, but otherwise the
+        ranks and other shape components may differ. In this case, input to
+        `cell` at each time-step will replicate the structure of these tuples,
+        except for the time dimension (from which the time is taken). The input
+        to `cell` at each time step will be a `Tensor` or (possibly nested)
+        tuple of Tensors each with dimensions `[batch_size, ...]`.
+    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. Used
+      to copy-through state and zero-out outputs when past a batch element's
+      sequence length.  So it's more for performance than correctness.
+    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
+      is an integer, this must be a `Tensor` of appropriate type and shape
+      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
+      should be a tuple of tensors having shapes `[batch_size, s] for s in
+      cell.state_size`.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_state is not provided or RNN state has a heterogeneous
+      dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to "rnn".
+
+  Returns:
+    A pair (outputs, state) where:
+
+    outputs: The RNN output `Tensor`.
+
+      If time_major == False (default), this will be a `Tensor` shaped:
+        `[batch_size, max_time, cell.output_size]`.
+
+      If time_major == True, this will be a `Tensor` shaped:
+        `[max_time, batch_size, cell.output_size]`.
+
+      Note, if `cell.output_size` is a (possibly nested) tuple of integers
+      or `TensorShape` objects, then `outputs` will be a tuple having the
+      same structure as `cell.output_size`, containing Tensors having shapes
+      corresponding to the shape data in `cell.output_size`.
+
+    state: The final state.  If `cell.state_size` is an int, this
+      will be shaped `[batch_size, cell.state_size]`.  If it is a
+      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
+      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
+      be a tuple having the corresponding shapes. If cells are `LSTMCells`
+      `state` will be a tuple containing a `LSTMStateTuple` for each cell.
+
+  Raises:
+    TypeError: If `cell` is not an instance of RNNCell.
+    ValueError: If inputs is None or an empty list.
+    RuntimeError: If not using control flow v2.
+  """
+
+  # Currently only support time_major == True case.
+  assert time_major
+
+  # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
+  # TfLiteRNNCells.
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
+
+  if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+    raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")
+
+  parent_first_child_input = [{
+      "parent_ophint_input_index": 0,
+      "first_child_ophint_input_index": 0
+  }]
+  parent_last_child_output = [{
+      "parent_output_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  internal_children_input_output = [{
+      "child_input_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  inputs_outputs_mappings = {
+      "parent_first_child_input": parent_first_child_input,
+      "parent_last_child_output": parent_last_child_output,
+      "internal_children_input_output": internal_children_input_output
+  }
+  tflite_wrapper = op_hint.OpHint(
+      "TfLiteDynamicRnn",
+      level=2,
+      children_inputs_mappings=inputs_outputs_mappings)
+  with vs.variable_scope(scope or "rnn") as varscope:
+    # Create a new scope in which the caching device is either
+    # determined by the parent scope, or is set to place the cached
+    # Variable using the same placement as for the rest of the RNN.
+    if _should_cache():
+      if varscope.caching_device is None:
+        varscope.set_caching_device(lambda op: op.device)
+
+    inputs = tflite_wrapper.add_input(inputs, name="input", index_override=0)
+
+    # By default, time_major==False and inputs are batch-major: shaped
+    #   [batch, time, depth]
+    # For internal calculations, we transpose to [time, batch, depth]
+    flat_input = nest.flatten(inputs)
+
+    if not time_major:
+      # (batch, time, depth) => (time, batch, depth)
+      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+    parallel_iterations = parallel_iterations or 32
+    if sequence_length is not None:
+      sequence_length = math_ops.cast(sequence_length, dtypes.int32)
+      if sequence_length.shape.rank not in (None, 1):
+        raise ValueError(
+            "sequence_length must be a vector of length batch_size, "
+            "but saw shape: %s" % sequence_length.shape)
+      sequence_length = array_ops.identity(  # Just to find it in the graph.
+          sequence_length,
+          name="sequence_length")
+
+    batch_size = _best_effort_input_batch_size(flat_input)
+
+    if initial_state is not None:
+      state = initial_state
+    else:
+      if not dtype:
+        raise ValueError("If there is no initial_state, you must give a dtype.")
+      if getattr(cell, "get_initial_state", None) is not None:
+        state = cell.get_initial_state(
+            inputs=None, batch_size=batch_size, dtype=dtype)
+      else:
+        state = cell.zero_state(batch_size, dtype)
+
+    def _assert_has_shape(x, shape):
+      x_shape = array_ops.shape(x)
+      packed_shape = array_ops.stack(shape)
+      return control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
+              "Expected shape for Tensor %s is " % x.name, packed_shape,
+              " but saw shape: ", x_shape
+          ])
+
+    if not context.executing_eagerly() and sequence_length is not None:
+      # Perform some shape validation
+      with ops.control_dependencies(
+          [_assert_has_shape(sequence_length, [batch_size])]):
+        sequence_length = array_ops.identity(
+            sequence_length, name="CheckSeqLen")
+
+    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
+
+    outputs, final_state = _dynamic_rnn_loop(
+        cell,
+        inputs,
+        state,
+        parallel_iterations=parallel_iterations,
+        swap_memory=swap_memory,
+        sequence_length=sequence_length,
+        dtype=dtype)
+
+    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
+    # If we are performing batch-major calculations, transpose output back
+    # to shape [batch, time, depth]
+    if not time_major:
+      # (time, batch, depth) => (batch, time, depth)
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
+    outputs = tflite_wrapper.add_output(outputs, name="outputs")
+
+    return outputs, final_state
+
+
+def bidirectional_dynamic_rnn(cell_fw,
+                              cell_bw,
+                              inputs,
+                              sequence_length=None,
+                              initial_state_fw=None,
+                              initial_state_bw=None,
+                              dtype=None,
+                              parallel_iterations=None,
+                              swap_memory=False,
+                              time_major=False,
+                              scope=None):
+  """Creates a dynamic version of bidirectional recurrent neural network.
+
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
+  length(s) of the sequence(s) or completely unrolled if length(s) is not
+  given.
+
+  Args:
+    cell_fw: An instance of RNNCell, to be used for forward direction.
+    cell_bw: An instance of RNNCell, to be used for backward direction.
+    inputs: The RNN inputs.
+      If time_major == False (default), this must be a tensor of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If time_major == True, this must be a tensor of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements.
+    sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
+      containing the actual lengths for each of the sequences in the batch. If
+      not provided, all batch entries are assumed to be full sequences; and time
+      reversal is applied from time `0` to `max_time` for each sequence.
+    initial_state_fw: (optional) An initial state for the forward RNN. This must
+      be a tensor of appropriate type and shape `[batch_size,
+      cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a
+      tuple of tensors having shapes `[batch_size, s] for s in
+      cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using the
+      corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial states and expected output.
+      Required if initial_states are not provided or RNN states have a
+      heterogeneous dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to
+      "bidirectional_rnn"
+
+  Returns:
+    A tuple (outputs, output_states) where:
+      outputs: A tuple (output_fw, output_bw) containing the forward and
+        the backward rnn output `Tensor`.
+        If time_major == False (default),
+          output_fw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_bw.output_size]`.
+        If time_major == True,
+          output_fw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_bw.output_size]`.
+        It returns a tuple instead of a single concatenated `Tensor`, unlike
+        in the `bidirectional_rnn`. If the concatenated one is preferred,
+        the forward and backward outputs can be concatenated as
+        `tf.concat(outputs, 2)`.
+      output_states: A tuple (output_state_fw, output_state_bw) containing
+        the forward and the backward final states of bidirectional rnn.
+
+  Raises:
+    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
+  """
+  rnn_cell_impl.assert_like_rnncell("cell_fw", cell_fw)
+  rnn_cell_impl.assert_like_rnncell("cell_bw", cell_bw)
+
+  with vs.variable_scope(scope or "bidirectional_rnn"):
+    # Forward direction
+    with vs.variable_scope("fw") as fw_scope:
+      output_fw, output_state_fw = dynamic_rnn(
+          cell=cell_fw,
+          inputs=inputs,
+          sequence_length=sequence_length,
+          initial_state=initial_state_fw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=fw_scope)
+
+    # Backward direction
+    if not time_major:
+      time_axis = 1
+      batch_axis = 0
+    else:
+      time_axis = 0
+      batch_axis = 1
+
+    def _reverse(input_, seq_lengths, seq_axis, batch_axis):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_,
+            seq_lengths=seq_lengths,
+            seq_axis=seq_axis,
+            batch_axis=batch_axis)
+      else:
+        return array_ops.reverse(input_, axis=[seq_axis])
+
+    with vs.variable_scope("bw") as bw_scope:
+
+      def _map_reverse(inp):
+        return _reverse(
+            inp,
+            seq_lengths=sequence_length,
+            seq_axis=time_axis,
+            batch_axis=batch_axis)
+
+      inputs_reverse = nest.map_structure(_map_reverse, inputs)
+      tmp, output_state_bw = dynamic_rnn(
+          cell=cell_bw,
+          inputs=inputs_reverse,
+          sequence_length=sequence_length,
+          initial_state=initial_state_bw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=bw_scope)
+
+  output_bw = _reverse(
+      tmp,
+      seq_lengths=sequence_length,
+      seq_axis=time_axis,
+      batch_axis=batch_axis)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
similarity index 71%
rename from tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
rename to tensorflow/lite/experimental/examples/lstm/rnn_cell.py
index 461345060badbad0fc65b37466436b1a1eb424a4..ba2f7875563663bbc78bcf1716280e9cf05c7277 100644
--- a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
+++ b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TfLite LSTMCell wrapper.
+"""TfLite BasicRnnCell wrapper.
 
 TODO(renjieliu): Find a better home for this one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import tensorflow as tf
+import itertools
 
-from tensorflow.lite.python import lite
+import tensorflow.lite.python.op_hint as op_hint
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
 from tensorflow.python.layers import base as base_layer
@@ -33,8 +33,130 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("lite.experimental.nn.TfLiteRNNCell")
+class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
+  """The most basic RNN cell.
+
+  This is used only for TfLite, it provides hints and it also makes the
+  variables in the desired for the tflite ops.
+  """
+
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initializes the parameters for an RNN cell.
+
+    Args:
+      num_units: int, The number of units in the RNN cell.
+      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
+        that is within Keras activation function names.
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope. Raises an error if not `True` and the existing scope
+        already has the given variables.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable` etc when constructing the cell from configs of get_config().
+
+    Raises:
+      ValueError: If the existing scope already has the given variables.
+    """
+    super(TfLiteRNNCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be Rank-2.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceRnn")
+    self._num_units = num_units
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    """Builds the RNN cell.
+
+    Args:
+      inputs_shape: Rnn input tensor shape.
+
+    Raises:
+      ValueError: If last dimension of the input shape is not known.
+    """
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       (inputs_shape,))
+
+    input_depth = inputs_shape[-1]
+
+    def add_variable_wrapped(name, shape, initializer, index):
+      var = self.add_weight(name, shape=shape, initializer=initializer)
+      return self._tflite_wrapper.add_input(
+          var, name=name, index_override=index)
+
+    self._input_weights = add_variable_wrapped(
+        "input_weights", [self._num_units, input_depth], None, 1)
+    self._recurrent_weights = add_variable_wrapped(
+        "recurrent_weights", [self._num_units, self._num_units], None, 2)
+    self._bias = add_variable_wrapped(
+        "bias",
+        shape=[self._num_units],
+        initializer=init_ops.zeros_initializer(dtype=self.dtype),
+        index=3)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
+    inputs = self._tflite_wrapper.add_input(
+        inputs, tag="input", name="input", aggregate="stack", index_override=0)
+    state = self._tflite_wrapper.add_input(
+        state,
+        tag="hidden_state",
+        name="hidden_state",
+        aggregate="first",
+        index_override=4)
+    weights = array_ops.transpose(
+        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
+    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+    output = self._activation(gate_inputs)
+    output = self._tflite_wrapper.add_output(
+        output,
+        tag="output",
+        name="output",
+        index_override=1,
+        aggregate="stack")
+    return output, output
+
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(TfLiteRNNCell, self).get_config()
+    return dict(itertools.chain(base_config.items(), config.items()))
+
+
+@tf_export("lite.experimental.nn.TFLiteLSTMCell")
 class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -132,7 +254,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # TODO(raziel): layers stuff -- chop if un-layerizing Op.
     self.input_spec = base_layer.InputSpec(ndim=2)
 
-    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceLstm")
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceLstm")
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -148,7 +270,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
 
     self._output_size = num_proj if num_proj else num_units
     self._state_size = (
-        tf.nn.rnn_cell.LSTMStateTuple(num_units, self._output_size)
+        rnn_cell_impl.LSTMStateTuple(num_units, self._output_size)
         if state_is_tuple else num_units + self._output_size)
 
   @property
@@ -169,10 +291,15 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     Raises:
       ValueError: if the inputs_shape is invalid.
     """
-    if len(inputs_shape) != 2 or inputs_shape[1].value is None:
+    if len(inputs_shape) != 2:
+      raise ValueError(
+          "inputs_shape must be 2-dimensional, saw shape: %s" % inputs_shape)
+    input_depth = (
+        inputs_shape[1]
+        if isinstance(inputs_shape[1], int) else inputs_shape[1].value)
+    if input_depth is None:
       raise ValueError("Invalid inputs_shape, saw shape: %s" % inputs_shape)
 
-    input_depth = inputs_shape[1].value
     maybe_partitioner = (
         partitioned_variables.fixed_size_partitioner(self._num_unit_shards)
         if self._num_unit_shards is not None else None)
@@ -181,7 +308,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     bias_shape = [self._num_units]
 
     def add_variable_wrapped(name, shape, initializer, index, partitioner):
-      var = self.add_variable(
+      var = self.add_weight(
           name, shape=shape, initializer=initializer, partitioner=partitioner)
       return self._tflite_wrapper.add_input(
           var, name=name, index_override=index)
@@ -192,6 +319,8 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     else:
       bias_initializer = init_ops.zeros_initializer(dtype=self.dtype)
 
+    forget_bias_initializer = init_ops.constant_initializer(self._forget_bias)
+
     self.input_to_input_w = add_variable_wrapped(
         "input_to_input_w", input_weight_shape, weight_initializer, 1,
         maybe_partitioner)
@@ -219,8 +348,9 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
 
     self.input_bias = add_variable_wrapped(
         "input_bias", bias_shape, bias_initializer, 12, maybe_partitioner)
-    self.forget_bias = add_variable_wrapped(
-        "forget_bias", bias_shape, bias_initializer, 13, maybe_partitioner)
+    self.forget_bias = add_variable_wrapped("forget_bias", bias_shape,
+                                            forget_bias_initializer, 13,
+                                            maybe_partitioner)
     self.cell_bias = add_variable_wrapped(
         "cell_bias", bias_shape, bias_initializer, 14, maybe_partitioner)
     self.output_bias = add_variable_wrapped(
@@ -230,10 +360,10 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # f stands for forget, i stands for input and o stands for output.
     if self._use_peepholes:
       self._w_f_diag = add_variable_wrapped("w_f_diag", [self._num_units],
-                                            self._initializer, 9,
+                                            self._initializer, 10,
                                             maybe_partitioner)
       self._w_i_diag = add_variable_wrapped("w_i_diag", [self._num_units],
-                                            self._initializer, 10,
+                                            self._initializer, 9,
                                             maybe_partitioner)
       self._w_o_diag = add_variable_wrapped("w_o_diag", [self._num_units],
                                             self._initializer, 11,
@@ -306,9 +436,9 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
         aggregate="first",
         index_override=18)
 
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.shape.with_rank(2)[1]
     if input_size.value is None:
-      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+      raise ValueError("Could not infer input size from inputs.shape[-1]")
 
     inputs_and_m_prev = array_ops.concat([inputs, m_prev], axis=1)
 
@@ -319,35 +449,37 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # c is the final state.
     # m is the output.
     i = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_input_w, self.cell_to_input_w], axis=1),
+            array_ops.concat([self.input_to_input_w, self.cell_to_input_w],
+                             axis=1),
             transpose_b=True), self.input_bias)
     f = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_forget_w, self.cell_to_forget_w], axis=1),
+            array_ops.concat([self.input_to_forget_w, self.cell_to_forget_w],
+                             axis=1),
             transpose_b=True), self.forget_bias)
     o = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_output_w, self.cell_to_output_w], axis=1),
+            array_ops.concat([self.input_to_output_w, self.cell_to_output_w],
+                             axis=1),
             transpose_b=True), self.output_bias)
     j = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_cell_w, self.cell_to_cell_w], axis=1),
+            array_ops.concat([self.input_to_cell_w, self.cell_to_cell_w],
+                             axis=1),
             transpose_b=True), self.cell_bias)
 
     # Diagonal connections
     if self._use_peepholes:
       c = (
-          sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
+          sigmoid(f + self._w_f_diag * c_prev) * c_prev +
           sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
     else:
-      c = (
-          sigmoid(f + self._forget_bias) * c_prev +
-          sigmoid(i) * self._activation(j))
+      c = (sigmoid(f) * c_prev + sigmoid(i) * self._activation(j))
 
     if self._cell_clip is not None:
       # pylint: disable=invalid-unary-operand-type
@@ -359,7 +491,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
       m = sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
-      transposed_proj_kernel = tf.transpose(self._proj_kernel)
+      transposed_proj_kernel = array_ops.transpose(self._proj_kernel)
       m = math_ops.matmul(m, transposed_proj_kernel)
 
       if self._proj_clip is not None:
@@ -373,7 +505,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
         m, tag="m", name="m", index_override=2, aggregate="stack")
 
     new_state = (
-        tf.nn.rnn_cell.LSTMStateTuple(c, m)
+        rnn_cell_impl.LSTMStateTuple(c, m)
         if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py b/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
deleted file mode 100644
index e4aad18367e6c8bf9669e928dff8d7c9376043b7..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/examples/lstm/tflite_rnn.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TfLite BasicRnnCell wrapper.
-
-TODO(renjieliu): Find a better home for this one.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import itertools
-
-from tensorflow.lite.python import lite
-from tensorflow.python.keras import activations
-from tensorflow.python.layers import base as base_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import rnn_cell_impl
-
-
-class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
-  """The most basic RNN cell.
-
-  This is used only for TfLite, it provides hints and it also makes the
-  variables in the desired for the tflite ops.
-  """
-
-  def __init__(self,
-               num_units,
-               activation=None,
-               reuse=None,
-               name=None,
-               dtype=None,
-               **kwargs):
-    """Initializes the parameters for an RNN cell.
-
-    Args:
-      num_units: int, The number of units in the RNN cell.
-      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
-        that is within Keras activation function names.
-      reuse: (optional) Python boolean describing whether to reuse variables in
-        an existing scope. Raises an error if not `True` and the existing scope
-        already has the given variables.
-      name: String, the name of the layer. Layers with the same name will share
-        weights, but to avoid mistakes we require reuse=True in such cases.
-      dtype: Default dtype of the layer (default of `None` means use the type of
-        the first input). Required when `build` is called before `call`.
-      **kwargs: Dict, keyword named properties for common layer attributes, like
-        `trainable` etc when constructing the cell from configs of get_config().
-
-    Raises:
-      ValueError: If the existing scope already has the given variables.
-    """
-    super(TfLiteRNNCell, self).__init__(
-        _reuse=reuse, name=name, dtype=dtype, **kwargs)
-
-    # Inputs must be Rank-2.
-    self.input_spec = base_layer.InputSpec(ndim=2)
-
-    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceRnn")
-    self._num_units = num_units
-    if activation:
-      self._activation = activations.get(activation)
-    else:
-      self._activation = math_ops.tanh
-
-  @property
-  def state_size(self):
-    return self._num_units
-
-  @property
-  def output_size(self):
-    return self._num_units
-
-  def build(self, inputs_shape):
-    """Builds the RNN cell.
-
-    Args:
-      inputs_shape: Rnn input tensor shape.
-
-    Raises:
-      ValueError: If last dimension of the input shape is not known.
-    """
-    if inputs_shape[-1] is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
-                       (inputs_shape,))
-
-    input_depth = inputs_shape[-1]
-
-    def add_variable_wrapped(name, shape, initializer, index):
-      var = self.add_variable(name, shape=shape, initializer=initializer)
-      return self._tflite_wrapper.add_input(
-          var, name=name, index_override=index)
-
-    self._input_weights = add_variable_wrapped(
-        "input_weights", [self._num_units, input_depth], None, 1)
-    self._recurrent_weights = add_variable_wrapped(
-        "recurrent_weights", [self._num_units, self._num_units], None, 2)
-    self._bias = add_variable_wrapped(
-        "bias",
-        shape=[self._num_units],
-        initializer=init_ops.zeros_initializer(dtype=self.dtype),
-        index=3)
-
-    self.built = True
-
-  def call(self, inputs, state):
-    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
-    inputs = self._tflite_wrapper.add_input(
-        inputs, tag="input", name="input", aggregate="stack", index_override=0)
-    state = self._tflite_wrapper.add_input(
-        state,
-        tag="hidden_state",
-        name="hidden_state",
-        aggregate="first",
-        index_override=4)
-    weights = array_ops.transpose(
-        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
-    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
-    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
-    output = self._activation(gate_inputs)
-    output = self._tflite_wrapper.add_output(
-        output,
-        tag="output",
-        name="output",
-        index_override=1,
-        aggregate="stack")
-    return output, output
-
-  def get_config(self):
-    config = {
-        "num_units": self._num_units,
-        "activation": activations.serialize(self._activation),
-        "reuse": self._reuse,
-    }
-    base_config = super(TfLiteRNNCell, self).get_config()
-    return dict(itertools.chain(base_config.items(), config.items()))
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
index eeb48d123113c5924a74286ad1e0851eb484cdb8..e29c75100344c95d9e76e1d6e27a8b70fe2add4d 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -20,12 +20,12 @@ import numpy as np
 import tensorflow as tf
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
+
 # Number of steps to train model.
 TRAIN_STEPS = 1
 
@@ -54,20 +54,22 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     self.num_units = 16
 
   def buildLstmLayer(self):
-    return tf.nn.rnn_cell.MultiRNNCell([
-        TFLiteLSTMCell(
-            self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
-        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
-        TFLiteLSTMCell(
+    return tf.keras.layers.StackedRNNCells([
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, use_peepholes=True, forget_bias=1.0, name="rnn1"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=1.0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units // 2,
             use_peepholes=True,
             num_proj=8,
             forget_bias=0,
             name="rnn3"),
-        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=1.0, name="rnn4")
     ])
 
-  def buildModel(self, lstm_layer, is_dynamic_rnn, is_train):
+  def buildModel(self, lstm_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units, self.n_classes]))
@@ -77,16 +79,12 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     x = tf.placeholder(
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
-    # For dynamic_rnn, train with dynamic_rnn and inference with static_rnn.
     # x is shaped [batch_size,time_steps,num_inputs]
     if is_dynamic_rnn:
-      if is_train:
-        lstm_input = x
-        outputs, _ = tf.nn.dynamic_rnn(lstm_layer, lstm_input, dtype="float32")
-        outputs = tf.unstack(outputs, axis=1)
-      else:
-        lstm_input = tf.unstack(x, self.time_steps, 1)
-        outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
+      lstm_input = tf.transpose(x, perm=[1, 0, 2])
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          lstm_layer, lstm_input, dtype="float32")
+      outputs = tf.unstack(outputs, axis=0)
     else:
       lstm_input = tf.unstack(x, self.time_steps, 1)
       outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
@@ -126,8 +124,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(
-        lstm_layer, is_dynamic_rnn, is_train=False)
+    x, prediction, output_class = self.buildModel(lstm_layer, is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -157,8 +154,8 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
         curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
         [tf.float32.as_datatype_enum])
 
-    tflite = tf.lite.toco_convert(
-        curr, [tflite_input], [outputs], allow_custom_ops=False)
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
     interpreter = tf.lite.Interpreter(model_content=tflite)
 
     try:
@@ -179,7 +176,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(
-        self.buildLstmLayer(), is_dynamic_rnn=False, is_train=True)
+        self.buildLstmLayer(), is_dynamic_rnn=False)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
@@ -192,26 +189,15 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
     self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
 
+  @test_util.enable_control_flow_v2
   def testDynamicRnnMultiRnnCell(self):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(
-        self.buildLstmLayer(), is_dynamic_rnn=True, is_train=True)
+        self.buildLstmLayer(), is_dynamic_rnn=True)
     self.trainModel(x, prediction, output_class, sess)
 
-    # Since we don't yet support OpHints for dynamic, we will load the model
-    # back in as a static model. This requires the variables to have the same
-    # names as if they were trained as a static. Thus, we get rid of while/rnn
-    # names.
-    variables_to_save = {}
-    for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
-      op_name = i.name
-      if op_name.startswith("while/rnn/"):
-        op_name = op_name.split("while/rnn/")[1]
-      if op_name.endswith(":0"):
-        op_name = op_name.split(":0")[0]
-      variables_to_save[op_name] = i
-    saver = tf.train.Saver(variables_to_save)
+    saver = tf.train.Saver()
 
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
         self.buildLstmLayer(), sess, saver, is_dynamic_rnn=True)
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
index 6f9e2dd9498f03665b52e423db43ce38d5401eb1..aa7c16536bbd1faf1122e50d67ef3df7216150ca 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
@@ -22,7 +22,6 @@ import tensorflow as tf
 from tensorflow import flags
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_rnn import TfLiteRNNCell
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -61,12 +60,12 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
     self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
 
   def buildRnnLayer(self):
-    return tf.nn.rnn_cell.MultiRNNCell([
-        TfLiteRNNCell(self.num_units, name="rnn1"),
-        TfLiteRNNCell(self.num_units, name="rnn2")
+    return tf.keras.layers.StackedRNNCells([
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
     ])
 
-  def buildModel(self, rnn_layer):
+  def buildModel(self, rnn_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units, self.n_classes]))
@@ -77,8 +76,14 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
     # x is shaped [batch_size,time_steps,num_inputs]
-    rnn_input = tf.unstack(x, self.time_steps, 1)
-    outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
+    if is_dynamic_rnn:
+      rnn_input = tf.transpose(x, perm=[1, 0, 2])
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          rnn_layer, rnn_input, dtype="float32")
+      outputs = tf.unstack(outputs, axis=0)
+    else:
+      rnn_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
 
     # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
     # by the softmax layer's out_weight of shape [num_units,n_classes]
@@ -108,13 +113,14 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
                                  self.n_input))
       sess.run(opt, feed_dict={x: batch_x, y: batch_y})
 
-  def saveAndRestoreModel(self, rnn_layer, sess, saver):
+  def saveAndRestoreModel(self, rnn_layer, sess, saver, is_dynamic_rnn):
     """Saves and restores the model to mimic the most common use case.
 
     Args:
       rnn_layer: The rnn layer either a single rnn cell or a multi rnn cell.
       sess: Old session.
       saver: saver created by tf.train.Saver()
+      is_dynamic_rnn: use dynamic_rnn or not.
 
     Returns:
       A tuple containing:
@@ -130,7 +136,7 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(rnn_layer)
+    x, prediction, output_class = self.buildModel(rnn_layer, is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -160,8 +166,8 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
         curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
         [tf.float32.as_datatype_enum])
 
-    tflite = tf.lite.toco_convert(
-        curr, [tflite_input], [outputs], allow_custom_ops=False)
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
     interpreter = tf.lite.Interpreter(model_content=tflite)
     interpreter.allocate_tensors()
 
@@ -177,12 +183,32 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
   def testStaticRnnMultiRnnCell(self):
     sess = tf.Session(config=CONFIG)
 
-    x, prediction, output_class = self.buildModel(self.buildRnnLayer())
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=False)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
-        self.buildRnnLayer(), sess, saver)
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=True)
 
     test_inputs, expected_output, frozen_graph = self.getInferenceResult(
         x, output_class, new_sess)
diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD
index dd314545cb6488ea2a76494df39b4b69e92eca33..78af889cf1ef4c90e3e096e3cc447ecc557f8b51 100644
--- a/tensorflow/lite/experimental/kernels/BUILD
+++ b/tensorflow/lite/experimental/kernels/BUILD
@@ -5,7 +5,6 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 # ctc support classes imported directly from TensorFlow.
 cc_library(
@@ -50,26 +49,18 @@ cc_library(
     }),
     deps = [
         ":ctc_utils",
-        "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/kernels:gemm_support",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
-        "//tensorflow/lite/kernels/internal:kernel_utils",
         "//tensorflow/lite/kernels/internal:optimized",
         "//tensorflow/lite/kernels/internal:optimized_base",
-        "//tensorflow/lite/kernels/internal:quantization_util",
-        "//tensorflow/lite/kernels/internal:reference_base",
         "//tensorflow/lite/kernels/internal:tensor",
-        "//tensorflow/lite/kernels/internal:tensor_utils",
         "@flatbuffers",
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "ctc_beam_search_decoder_test",
     size = "small",
     srcs = ["ctc_beam_search_decoder_test.cc"],
diff --git a/tensorflow/lite/experimental/micro/BUILD b/tensorflow/lite/experimental/micro/BUILD
index 2d00ef76f4a3e5360e45f31ee486e0b8a7c74cc3..b16b8b49f8a46a43475cc08807570e84a160aed4 100644
--- a/tensorflow/lite/experimental/micro/BUILD
+++ b/tensorflow/lite/experimental/micro/BUILD
@@ -29,7 +29,6 @@ cc_library(
         "simple_tensor_allocator.h",
     ],
     deps = [
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index b47e0feb79de2129a375476565f9b61128bbf974..9eeae442ee9f5db3e1e608f57a73e2faab4d2807 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -7,118 +7,73 @@ so it's designed to be portable even to 'bare metal' systems. The core runtime
 fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword
 detection model, takes up a total of 22KB.
 
-The design goals are for the framework to be:
-
--   **Readable**: We want embedded software engineers to be able to understand
-    what's required to run ML inference without having to study research papers.
-    We've tried to keep the code base small, modular, and have reference
-    implementations of all operations to help with this.
-
--   **Easy to modify**: We know that there are a lot of different platforms and
-    requirements in the embedded world, and we don't expect to cover all of them
-    in one framework. Instead, we're hoping that it can be a good starting point
-    for developers to build on top of to meet their own needs. For example, we
-    tried to make it easy to replace the implementations of key computational
-    operators that are often crucial for performance, without having to touch
-    the data flow and other runtime code. We want it to make more sense to use
-    our workflow to handle things like model import and less-important
-    operations, and customize the parts that matter, rather than having to
-    reimplement everything in your own engine.
-
--   **Well-tested**: If you're modifying code, you need to know if your changes
-    are correct. Having an easy way to test lets you develop much faster. To
-    help there, we've written tests for all the components, and we've made sure
-    that the tests can be run on almost any platform, with no dependencies apart
-    from the ability to log text to a debug console somewhere. We also provide
-    an easy way to run all the tests on-device as part of an automated test
-    framework, and we use qemu/Renode emulation so that tests can be run even
-    without physical devices present.
-
--   **Easy to integrate**: We want to be as open a system as possible, and use
-    the best code available for each platform. To do that, we're going to rely
-    on projects like
-    [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html),
-    [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to
-    handle as much performance-critical code as possible. We know that there are
-    an increasing number of options to accelerate neural networks on
-    microcontrollers, so we're aiming to be a good host for deploying those
-    hardware technologies too.
-
--   **Compatible**: We're using the same file schema, interpreter API, and
-    kernel interface as regular TensorFlow Lite, so we leverage the large
-    existing set of tools, documentation, and examples for the project. The
-    biggest barrier to deploying ML models is getting them from a training
-    environment into a form that's easy to run inference on, so we see reusing
-    this rich ecosystem as being crucial to being easily usable. We also hope to
-    integrate this experimental work back into the main codebase in the future.
-
-To meet those goals, we've made some tradeoffs:
-
--   **Simple C++**: To help with readability, our code is written in a modern
-    version of C++, but we generally treat it as a "better C", rather relying on
-    more complex features such as template meta-programming. As mentioned
-    earlier, we avoid any use of dynamic memory allocation (new/delete) or the
-    standard C/C++ libraries, so we believe this should still be fairly
-    portable. It does mean that some older devices with C-only toolchains won't
-    be supported, but we're hoping that the reference operator implementations
-    (which are simple C-like functions) can still be useful in those cases. The
-    interfaces are also designed to be C-only, so it should be possible to
-    integrate the resulting library with pure C projects.
-
--   **Interpreted**: Code generation is a popular pattern for embedded code,
-    because it gives standalone code that's easy to modify and step through, but
-    we've chosen to go with an interpreted approach. In our internal
-    microcontroller work we've found that using an extremely stripped-down
-    interpreter with almost no dependencies gives us a lot of the same
-    advantages, but is easier to maintain. For example, when new updates come
-    out for the underlying library, you can just merge your local modifications
-    in a single step, rather than having to regenerate new code and then patch
-    in any changes you subsequently made. The coarse granularity of the
-    interpreted primitives means that each operation call typically takes
-    hundreds of thousands of instruction cycles at least, so we don't see
-    noticeable performance gains from avoiding what's essentially a single
-    switch statement at the interpreter level to call each operation. We're
-    still working on improving the packaging though, for example we're
-    considering having the ability to snapshot all the source files and headers
-    used for a particular model, being able to compile the code and data
-    together as a library, and then access it through a minimal set of C
-    interface calls which hide the underlying complexity.
-
--   **Flatbuffers**: We represent our models using
-    [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs),
-    with the difference that we always keep it in read-only program memory
-    (typically flash) rather than relying on having a file system to read it
-    from. This is a good fit because flatbuffer's serialized format is designed
-    to be mapped into memory without requiring any extra memory allocations or
-    modifications to access it. All of the functions to read model values work
-    directly on the serialized bytes, and large sections of data like weights
-    are directly accessible as sequential C-style arrays of their data type,
-    with no strides or unpacking needed. We do get a lot of value from using
-    flatbuffers, but there is a cost in complexity. The flat buffer library code
-    is all inline
-    [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h),
-    but it isn't straightforward to inspect their implementations, and the model
-    data structures aren't easy to comprehend from the debugger. The header for
-    the schema itself also has to be periodically updated when new information
-    is added to the file format, though we try to handle that transparently for
-    most developers by checking in a pre-generated version.
-
--   **Code Duplication**: Some of the code in this prototype largely duplicates
-    the logic in other parts of the TensorFlow Lite code base, for example the
-    operator wrappers. We've tried to keep share as much as we can between the
-    two interpreters, but there are some assumptions built into the original
-    runtime that make this difficult. We'll be working on modularizing the main
-    interpreter so that we can move to an entirely shared system.
-
-This initial preview release is designed to get early feedback, and is not
-intended to be a final product. It only includes enough operations to run a
-simple keyword recognition model, and the implementations are not optimized.
-We're hoping this will be a good way to get feedback and collaborate to improve
-the framework.
-
-## Getting Started with Make
-
-Building requires a Linux or OS X machine.
+## Table of Contents
+
+-   [Getting Started](#getting-started)
+
+    *   [Getting Started with Portable Reference Code](#getting-started-with-portable-reference-code)
+    *   [Building Portable Reference Code using Make](#building-portable-reference-code-using-make)
+    *   [Building for the "Blue Pill" STM32F103 using Make](#building-for-the-blue-pill-stm32f103-using-make)
+    *   [Building for "Hifive1" SiFive FE310 development board using Make](#building-for-hifive1-sifive-fe310-development-board-using-make)
+    *   [Building for Ambiq Micro Apollo3Blue EVB using Make](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+        *   [Additional Apollo3 Instructions](#additional-apollo3-instructions)
+    *   [Building for the Eta Compute ECM3531 EVB using Make](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
+-   [Goals](#goals)
+
+-   [Generating Project Files](#generating-project-#files)
+
+-   [How to Port TensorFlow Lite Micro to a New Platform](#how-to-port-tensorflow-lite-micro-to-a-new-platform)
+
+    *   [Requirements](#requirements)
+    *   [Getting Started](getting-started)
+    *   [Troubleshooting](#troubleshooting)
+    *   [Optimizing for your Platform](#optimizing-for-your-platform)
+    *   [Code Module Organization](#code-module-organization)
+    *   [Working with Generated Projects](#working-with-generated-projects)
+    *   [Supporting a Platform with Makefiles](#supporting-a-platform-with-makefiles)
+    *   [Supporting a Platform with Emulation Testing](#supporting-a-platform-with-emulation-testing)
+    *   [Implementing More Optimizations](#implementing-more-optimizations)
+
+# Getting Started
+
+One of the challenges of embedded software development is that there are a lot
+of different architectures, devices, operating systems, and build systems. We
+aim to support as many of the popular combinations as we can, and make it as
+easy as possible to add support for others.
+
+If you're a product developer, we have build instructions or pre-generated
+project files that you can download for the following platforms:
+
+Device                                                                                         | Mbed                                                                           | Keil                                                                           | Make/GCC
+---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ | --------
+[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html)     | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | -                                                                              | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl)
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | -                                                                              | -                                                                              | [Instructions](#building-for-the-blue-pill-stm32f103-using-make)
+[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/)  | -                                                                              | -                                                                              | [Instructions](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/)                            | -                                                                              | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+[Eta Compute ECM3531 EVB](https://etacompute.com/)                                             | -                                                                              | -                                                                              | [Instructions](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
+If your device is not yet supported, it may not be too hard to add support. You
+can learn about that process
+[here](#how-to-port-tensorflow-lite-micro-to-a-new-platform). We're looking
+forward to getting your help expanding this table!
+
+## Getting Started with Portable Reference Code
+
+If you don't have a particular microcontroller platform in mind yet, or just
+want to try out the code before beginning porting, the easiest way to begin is
+by
+[downloading the platform-agnostic reference code](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+You'll see a series of folders inside the archive, with each one containing just
+the source files you need to build one binary. There is a simple Makefile for
+each folder, but you should be able to load the files into almost any IDE and
+build them. There's also a [Visual Studio Code](https://code.visualstudio.com/) project file already set up, so
+you can easily explore the code in a cross-platform IDE.
+
+## Building Portable Reference Code using Make
+
+It's easy to build portable reference code directly from GitHub using make if
+you're on a Linux or OS X machine.
 
 -   Open a terminal
 -   Download the TensorFlow source with `git clone
@@ -177,7 +132,7 @@ building binaries that run locally on the Mac OS or Linux machine you're
 building on, but this approach becomes important when we're targeting simple
 micro controller devices.
 
-## Building for the "Blue Pill" STM32F103
+## Building for the "Blue Pill" STM32F103 using Make
 
 The goal of this library is to enable machine learning on resource-constrained
 micro controllers and DSPs, and as part of that we've targeted the
@@ -283,7 +238,7 @@ Successfully tagged riscv_build:latest
 
 Building micro_speech_test binary
 
--   Lauch the Docker that we just created using: `docker run -it-v
+-   Launch the Docker that we just created using: `docker run -it-v
     /tmp/copybara_out:/workspace riscv_build:latest bash`
 -   Enter the source root directory by running `cd /workspace`
 -   Download the dependencies by running
@@ -294,7 +249,7 @@ Building micro_speech_test binary
 -   Build the binary: `make -f
     tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
 
-Lauching Renode to test the binary, currently this set up is not automated.
+Launching Renode to test the binary, currently this set up is not automated.
 
 -   Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's
     @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
@@ -313,16 +268,15 @@ You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
 02:25:22.4253 [DEBUG] uart0: [+0.16ms host +0s virt 0.28s virt from start]   Progam has exited with code:0x00000000
 ```
 
-## Building for Ambiq Micro Apollo3Blue EVB
+## Building for Ambiq Micro Apollo3Blue EVB using Make
 
 Follow these steps to get the pushbutton yes/no example working on Apollo 3:
 
-1.  Make sure to run the "Getting Started" section before performing the
-    following steps
-2.  Download Apollo3-SDK-2018.08.13 and place in
-    `tensorflow/lite/experimental/micro/tools/make/downloads`. This is not yet
-    publicly released, but you can contact ashah@ambiqmicro.com to request a
-    copy.
+1.  Make sure to run the "Building Portable Reference Code using Make" section
+    before performing the following steps
+2.  The Ambiq Micro SDK is downloaded into
+    `tensorflow/lite/experimental/micro/tools/make/downloads` by
+    'download_dependencies.sh'.
 3.  Compile the project with the following command: make -f
     tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
     pushbutton_cmsis_speech_test_bin
@@ -349,7 +303,10 @@ Follow these steps to get the pushbutton yes/no example working on Apollo 3:
     4.  Press BTN2. An LED will flash for 1 second. Speak your utterance during
         this one second
     5.  The debugger will print out four numbers. They are the probabilites for
-        1) no speech, 2) unknown speech, 3) yes, 4) no
+        1.  no speech
+        2.  unknown speech
+        3.  yes
+        4.  no
     6.  The EVB LEDs will indicate detection.
         1.  LED0 (rightmost LED) - ON when capturing 1sec of audio
         2.  LED1 - ON when detecting silence
@@ -359,12 +316,172 @@ Follow these steps to get the pushbutton yes/no example working on Apollo 3:
 
 ### Additional Apollo3 Instructions
 
-To flash a part with JFlash Lite, do the following: 1. At the command line:
-JFlashLiteExe 2. Device = AMA3B1KK-KBR 3. Interface = SWD at 1000 kHz 4. Data
-file =
-tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin
+To flash a part with JFlash Lite, do the following: 
+
+1. At the command line: JFlashLiteExe 
+2. Device = AMA3B1KK-KBR 
+3. Interface = SWD at 1000 kHz 
+4. Data file = `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
 5. Prog Addr = 0x0000C000
 
+## Building for the Eta Compute ECM3531 EVB using Make
+
+1.  Follow the instructions at
+    [Tensorflow Micro Speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech#getting-started)
+    to down load the Tensorflow source code and the support libraries \(but do
+    not run the make command shown there.\)
+2.  Download the Eta Compute SDK, version 0.0.17. Contact info@etacompute.com
+3.  You will need the the Arm compiler arm-none-eabi-gcc, version 7.3.1
+    20180622, release ARM/embedded-7-branch revision 261907, 7-2018-q2-update.
+    This compiler is downloaded when you run the
+    tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+    script.
+4.  Edit the file
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
+    so that the variables ETA_SDK and GCC_ARM point to the correct directories.
+5.  Compile the code with the command \
+    &nbsp;&nbsp;&nbsp;&nbsp;make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test \
+    This will produce a set of executables in the
+    tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin
+    directory.
+6.  To load an executable into SRAM \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start ocd \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./load_program name_of_executable, for e.g.,
+    ./load_program audio_provider_test \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start PuTTY \(Connection type = Serial, Speed =
+    11520, Data bits = 8, Stop bits = 1, Parity = None\) \
+    The following output should appear: \
+    Testing TestAudioProvider \
+    Testing TestTimer \
+    2/2 tests passed \
+    \~\~\~ALL TESTS PASSED\~\~\~ \
+    Execution time \(msec\) = 7
+7.  To load into flash \
+    &nbsp;&nbsp;&nbsp;&nbsp;Edit the variable ETA_LDS_FILE in
+    tensorflow/lite/experimental/micro/tools/&nbsp;&nbsp;make/targets/ecm3531_makefile.inc
+    to point to the ecm3531_flash.lds file \
+    &nbsp;&nbsp;&nbsp;&nbsp;Recompile \( make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test\) \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./flash_program executable_name to load into flash.
+
+## Goals
+
+The design goals are for the framework to be:
+
+-   **Readable**: We want embedded software engineers to be able to understand
+    what's required to run ML inference without having to study research papers.
+    We've tried to keep the code base small, modular, and have reference
+    implementations of all operations to help with this.
+
+-   **Easy to modify**: We know that there are a lot of different platforms and
+    requirements in the embedded world, and we don't expect to cover all of them
+    in one framework. Instead, we're hoping that it can be a good starting point
+    for developers to build on top of to meet their own needs. For example, we
+    tried to make it easy to replace the implementations of key computational
+    operators that are often crucial for performance, without having to touch
+    the data flow and other runtime code. We want it to make more sense to use
+    our workflow to handle things like model import and less-important
+    operations, and customize the parts that matter, rather than having to
+    reimplement everything in your own engine.
+
+-   **Well-tested**: If you're modifying code, you need to know if your changes
+    are correct. Having an easy way to test lets you develop much faster. To
+    help there, we've written tests for all the components, and we've made sure
+    that the tests can be run on almost any platform, with no dependencies apart
+    from the ability to log text to a debug console somewhere. We also provide
+    an easy way to run all the tests on-device as part of an automated test
+    framework, and we use qemu/Renode emulation so that tests can be run even
+    without physical devices present.
+
+-   **Easy to integrate**: We want to be as open a system as possible, and use
+    the best code available for each platform. To do that, we're going to rely
+    on projects like
+    [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html),
+    [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to
+    handle as much performance-critical code as possible. We know that there are
+    an increasing number of options to accelerate neural networks on
+    microcontrollers, so we're aiming to be a good host for deploying those
+    hardware technologies too.
+
+-   **Compatible**: We're using the same file schema, interpreter API, and
+    kernel interface as regular TensorFlow Lite, so we leverage the large
+    existing set of tools, documentation, and examples for the project. The
+    biggest barrier to deploying ML models is getting them from a training
+    environment into a form that's easy to run inference on, so we see reusing
+    this rich ecosystem as being crucial to being easily usable. We also hope to
+    integrate this experimental work back into the main codebase in the future.
+
+To meet those goals, we've made some tradeoffs:
+
+-   **Simple C++**: To help with readability, our code is written in a modern
+    version of C++, but we generally treat it as a "better C", rather relying on
+    more complex features such as template meta-programming. As mentioned
+    earlier, we avoid any use of dynamic memory allocation (new/delete) or the
+    standard C/C++ libraries, so we believe this should still be fairly
+    portable. It does mean that some older devices with C-only toolchains won't
+    be supported, but we're hoping that the reference operator implementations
+    (which are simple C-like functions) can still be useful in those cases. The
+    interfaces are also designed to be C-only, so it should be possible to
+    integrate the resulting library with pure C projects.
+
+-   **Interpreted**: Code generation is a popular pattern for embedded code,
+    because it gives standalone code that's easy to modify and step through, but
+    we've chosen to go with an interpreted approach. In our internal
+    microcontroller work we've found that using an extremely stripped-down
+    interpreter with almost no dependencies gives us a lot of the same
+    advantages, but is easier to maintain. For example, when new updates come
+    out for the underlying library, you can just merge your local modifications
+    in a single step, rather than having to regenerate new code and then patch
+    in any changes you subsequently made. The coarse granularity of the
+    interpreted primitives means that each operation call typically takes
+    hundreds of thousands of instruction cycles at least, so we don't see
+    noticeable performance gains from avoiding what's essentially a single
+    switch statement at the interpreter level to call each operation. We're
+    still working on improving the packaging though, for example we're
+    considering having the ability to snapshot all the source files and headers
+    used for a particular model, being able to compile the code and data
+    together as a library, and then access it through a minimal set of C
+    interface calls which hide the underlying complexity.
+
+-   **Flatbuffers**: We represent our models using
+    [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs),
+    with the difference that we always keep it in read-only program memory
+    (typically flash) rather than relying on having a file system to read it
+    from. This is a good fit because flatbuffer's serialized format is designed
+    to be mapped into memory without requiring any extra memory allocations or
+    modifications to access it. All of the functions to read model values work
+    directly on the serialized bytes, and large sections of data like weights
+    are directly accessible as sequential C-style arrays of their data type,
+    with no strides or unpacking needed. We do get a lot of value from using
+    flatbuffers, but there is a cost in complexity. The flat buffer library code
+    is all inline
+    [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h),
+    but it isn't straightforward to inspect their implementations, and the model
+    data structures aren't easy to comprehend from the debugger. The header for
+    the schema itself also has to be periodically updated when new information
+    is added to the file format, though we try to handle that transparently for
+    most developers by checking in a pre-generated version.
+
+-   **Code Duplication**: Some of the code in this prototype largely duplicates
+    the logic in other parts of the TensorFlow Lite code base, for example the
+    operator wrappers. We've tried to keep share as much as we can between the
+    two interpreters, but there are some assumptions built into the original
+    runtime that make this difficult. We'll be working on modularizing the main
+    interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not
+intended to be a final product. It only includes enough operations to run a
+simple keyword recognition model, and the implementations are not optimized.
+We're hoping this will be a good way to get feedback and collaborate to improve
+the framework.
+
 ## Generating Project Files
 
 It's not always easy or convenient to use a makefile-based build process,
@@ -384,7 +501,8 @@ This will create a folder in
 `tensorflow/lite/experimental/micro/tools/make/gen/mbed_cortex-m4/prj/micro_speech_main_test/mbed`
 that contains the source and header files, some Mbed configuration files, and a
 README. You should then be able to copy this directory to another machine, and
-use it just like any other Mbed project.
+use it just like any other Mbed project. There's more information about project
+files [below](#working-with-generated-projects).
 
 ## How to Port TensorFlow Lite Micro to a New Platform
 
@@ -537,7 +655,9 @@ critical parts of the code with versions specifically tailored to your
 architecture. The framework has been designed with this in mind, and we hope the
 combination of small modules and many tests makes it as straightforward as
 possible to swap in your own code a piece at a time, ensuring you have a working
-version at every step.
+version at every step. To write specialized implementations for a platform, it's
+useful to understand how optional components are handled inside the build
+system.
 
 ### Code Module Organization
 
@@ -628,7 +748,7 @@ kernel implementations, but with some specific conventions:
     latest in the ordered list will be chosen. This allows us to express “I’d
     like generically-optimized fixed point if it’s available, but I’d prefer
     something using the CMSIS library” using the list 'fixed_point cmsis'. These
-    tags are passed in as `TAGS="<foo"` on the command line when you use the
+    tags are passed in as `TAGS="<foo>"` on the command line when you use the
     main Makefile to build.
 -   There is an implicit “reference” tag at the start of every list, so that
     it’s possible to support directory structures like the current
@@ -640,3 +760,172 @@ kernel implementations, but with some specific conventions:
     top level.
 -   Tests should be at the parent level, with no platform-specific code.
 -   No platform-specific macros or #ifdef’s should be used in any portable code.
+
+The implementation of these rules is handled inside the Makefile, with a
+[`specialize` function](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc#L42)
+that takes a list of reference source file paths as an input, and returns the
+equivalent list with specialized versions of those files swapped in if they
+exist.
+
+### Working with Generated Projects
+
+So far, I've recommended that you use the standalone generated projects for your
+system. You might be wondering why you're not just checking out the full
+[TensorFlow codebase from GitHub](https://github.com/tensorflow/tensorflow/)?
+The main reason is that there is a lot more diversity of architectures, IDEs,
+support libraries, and operating systems in the embedded world. Many of the
+toolchains require their own copy of source files, or a list of sources to be
+written to a project file. When a developer working on TensorFlow adds a new
+source file or changes its location, we can't expect her to update multiple
+different project files, many of which she may not have the right software to
+verify the change was correct. That means we have to rely on a central listing
+of source files (which in our case is held in the makefile), and then call a
+tool to generate other project files from those. We could ask embedded
+developers to do this process themselves after downloading the main source, but
+running the makefile requires a Linux system which may not be available, takes
+time, and involves downloading a lot of dependencies. That is why we've opted to
+make regular snapshots of the results of generating these projects for popular
+IDEs and platforms, so that embedded developers have a fast and friendly way to
+start using TensorFlow Lite for Microcontrollers.
+
+This does have the disadvantage that you're no longer working directly on the
+main repository, instead you have a copy that's outside of source control. We've
+tried to make the copy as similar to the main repo as possible, for example by
+keeping the paths of all source files the same, and ensuring that there are no
+changes between the copied files and the originals, but it still makes it
+tougher to sync as the main repository is updated. There are also multiple
+copies of the source tree, one for each target, so any change you make to one
+copy has to be manually propagated across all the other projects you care about.
+This doesn't matter so much if you're just using the projects as they are to
+build products, but if you want to support a new platform and have the changes
+reflected in the main code base, you'll have to do some extra work.
+
+As an example, think about the `DebugLog()` implementation we discussed adding
+for a new platform earlier. At this point, you have a new version of
+`debug_log.cc` that does what's required, but how can you share that with the
+wider community? The first step is to pick a tag name for your platform. This
+can either be the operating system (for example 'mbed'), the name of a device
+('bluepill'), or some other text that describes it. This should be a short
+string with no spaces or special characters. Log in or create an account on
+GitHub, fork the full
+[TensorFlow codebase](https://github.com/tensorflow/tensorflow/) using the
+'Fork' button on the top left, and then grab your fork by using a command like
+`git clone https://github.com/<your user name>/tensorflow`.
+
+You'll either need Linux, MacOS, or Windows with something like CygWin installed
+to run the next steps, since they involve building a makefile. Run the following
+commands from a terminal, inside the root of the source folder:
+
+```
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile generate_projects
+```
+
+This will take a few minutes, since it has to download some large toolchains for
+the dependencies. Once it has finished, you should see some folders created
+inside a path like
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/`. The exact
+path depends on your host operating system, but you should be able to figure it
+out from all the copy commands. These folders contain the generated project and
+source files, with
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/keil`
+containing the Keil uVision targets,
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/mbed` with
+the Mbed versions, and so on.
+
+If you've got this far, you've successfully set up the project generation flow.
+Now you need to add your specialized implementation of `DebugLog()`. Start by
+creating a folder inside `tensorflow/lite/experimental/micro/` named after the
+tag you picked earlier. Put your `debug_log.cc` file inside this folder, and
+then run this command, with '<your tag>' replaced by the actual folder name:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS="<your tag>" generate_projects
+```
+
+If your tag name actually refers to a whole target architecture, then you'll use
+TARGET or TARGET_ARCH instead. For example, here's how a simple RISC-V set of
+projects is generated:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET="riscv32_mcu" generate_projects
+```
+
+The way it works is the same as TAGS though, it just looks for specialized
+implementations with the same containing folder name.
+
+If you look inside the projects that have been created, you should see that the
+default `DebugLog()` implementation is no longer present at
+`tensorflow/lite/experimental/micro/debug_log.cc`, and instead
+`tensorflow/lite/experimental/micro/<your tag>/debug_log.cc` is being used. Copy
+over the generated project files and try building them in your own IDE. If
+everything works, then you're ready to submit your change.
+
+To do this, run something like:
+
+```
+git add tensorflow/lite/experimental/micro/<your tag>/debug_log.cc
+git commit -a -m "Added DebugLog() support for <your platform>"
+git push origin master
+```
+
+Then go back to https://github.com/<your account>/tensorflow, and choose "New
+Pull Request" near the top. You should then be able to go through the standard
+TensorFlow PR process to get your change added to the main repository, and
+available to the rest of the community!
+
+### Supporting a Platform with Makefiles
+
+The changes you've made so far will enable other developers using the generated
+projects to use your platform, but TensorFlow's continuous integration process
+uses makefiles to build frequently and ensure changes haven't broken the build
+process for different systems. If you are able to convert your build procedure
+into something that can be expressed by a makefile, then we can integrate your
+platform into our CI builds and make sure it continues to work.
+
+Fully describing how to do this is beyond the scope of this documentation, but
+the biggest needs are:
+
+-   A command-line compiler that can be called for every source file.
+-   A list of the arguments to pass into the compiler to build and link all
+    files.
+-   The correct linker map files and startup assembler to ensure `main()` gets
+    called.
+
+### Supporting a Platform with Emulation Testing
+
+Integrating your platform into the makefile process should help us make sure
+that it continues to build, but it doesn't guarantee that the results of the
+build process will run correctly. Running tests is something we require to be
+able to say that TensorFlow officially supports a platform, since otherwise we
+can't guarantee that users will have a good experience when they try using it.
+Since physically maintaining a full set of all supported hardware devices isn't
+feasible, we rely on software emulation to run these tests. A good example is
+our
+[STM32F4 'Bluepill' support](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh),
+which uses [Docker](https://www.docker.com/) and [Renode](https://renode.io/) to
+run built binaries in an emulator. You can use whatever technologies you want,
+the only requirements are that they capture the debug log output of the tests
+being run in the emulator, and parse them for the string that indicates the test
+was successful. These scripts need to run on Ubuntu 18.04, in a bash
+environment, though Docker is available if you need to install extra software or
+have other dependencies.
+
+### Implementing More Optimizations
+
+Clearly, getting debug logging support is only the beginning of the work you'll
+need to do on a particular platform. It's very likely that you'll want to
+optimize the core deep learning operations that take up the most time when
+running models you care about. The good news is that the process for providing
+optimized implementations is the same as the one you just went through to
+provide your own logging. You'll need to identify parts of the code that are
+bottlenecks, and then add specialized implementations in their own folders.
+These don't need to be platform specific, they can also be broken out by which
+library they rely on for example. [Here's where we do that for the CMSIS
+implementation of integer fast-fourier
+transforms](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc).
+This more complex case shows that you can also add helper source files alongside
+the main implementation, as long as you
+[mention them in the platform-specific makefile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc).
+You can also do things like update the list of libraries that need to be linked
+in, or add include paths to required headers.
diff --git a/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc b/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d961963969039c75232b91bba12b54870225605
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc
@@ -0,0 +1,20 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include "eta_csp_io.h"
+
+extern "C" void DebugLog(const char* s) { EtaCspIoPrintf("%s", s); }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 171d07cf569a366aad8edf0715c6c1c8ea4a6d87..218b5de86a742f974520f82c8628fa9e3d08e210 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -11,34 +11,34 @@ load(
 )
 
 cc_library(
-    name = "model_settings",
+    name = "simple_model_settings",
     srcs = [
-        "model_settings.cc",
+        "simple_features/simple_model_settings.cc",
     ],
     hdrs = [
-        "model_settings.h",
+        "simple_features/simple_model_settings.h",
     ],
 )
 
 cc_library(
-    name = "tiny_conv_model_data",
+    name = "tiny_conv_simple_features_model_data",
     srcs = [
-        "tiny_conv_model_data.cc",
+        "simple_features/tiny_conv_simple_features_model_data.cc",
     ],
     hdrs = [
-        "tiny_conv_model_data.h",
+        "simple_features/tiny_conv_simple_features_model_data.h",
     ],
 )
 
 cc_library(
-    name = "features_test_data",
+    name = "simple_features_test_data",
     srcs = [
-        "no_features_data.cc",
-        "yes_features_data.cc",
+        "simple_features/no_simple_features_data.cc",
+        "simple_features/yes_simple_features_data.cc",
     ],
     hdrs = [
-        "no_features_data.h",
-        "yes_features_data.h",
+        "simple_features/no_simple_features_data.h",
+        "simple_features/yes_simple_features_data.h",
     ],
 )
 
@@ -48,10 +48,10 @@ tflite_micro_cc_test(
         "micro_speech_test.cc",
     ],
     deps = [
-        ":features_test_data",
-        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
         "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -60,45 +60,66 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_test_data",
+    name = "audio_sample_test_data",
     srcs = [
         "no_30ms_sample_data.cc",
-        "no_power_spectrum_data.cc",
         "yes_30ms_sample_data.cc",
-        "yes_power_spectrum_data.cc",
     ],
     hdrs = [
         "no_30ms_sample_data.h",
-        "no_power_spectrum_data.h",
         "yes_30ms_sample_data.h",
-        "yes_power_spectrum_data.h",
     ],
 )
 
 cc_library(
-    name = "preprocessor_reference",
+    name = "audio_large_sample_test_data",
     srcs = [
-        "preprocessor.cc",
+        "no_1000ms_sample_data.cc",
+        "yes_1000ms_sample_data.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "no_1000ms_sample_data.h",
+        "yes_1000ms_sample_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_test_data",
+    srcs = [
+        "simple_features/no_power_spectrum_data.cc",
+        "simple_features/yes_power_spectrum_data.cc",
+    ],
+    hdrs = [
+        "simple_features/no_power_spectrum_data.h",
+        "simple_features/yes_power_spectrum_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_reference",
+    srcs = [
+        "simple_features/simple_features_generator.cc",
+    ],
+    hdrs = [
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_reference_test",
+    name = "simple_features_generator_reference_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_reference",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_reference",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -106,29 +127,30 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_fixed",
+    name = "simple_features_generator_fixed",
     srcs = [
-        "fixed_point/preprocessor.cc",
+        "simple_features/fixed_point/simple_features_generator.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_fixed_test",
+    name = "simple_features_generator_fixed_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_fixed",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_fixed",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -144,9 +166,25 @@ cc_library(
         "audio_provider.h",
     ],
     deps = [
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+cc_library(
+    name = "audio_provider_mock",
+    srcs = [
+        "audio_provider_mock.cc",
+    ],
+    hdrs = [
+        "audio_provider.h",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -157,9 +195,24 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "audio_provider_mock_test",
+    srcs = [
+        "audio_provider_mock_test.cc",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
@@ -174,10 +227,10 @@ cc_library(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
-        ":preprocessor_reference",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -189,9 +242,41 @@ tflite_micro_cc_test(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "feature_provider_mock",
+    srcs = [
+        "feature_provider.cc",
+    ],
+    hdrs = [
+        "feature_provider.h",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "feature_provider_mock_test",
+    srcs = [
+        "feature_provider_mock_test.cc",
+    ],
+    deps = [
+        ":feature_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
@@ -205,9 +290,9 @@ cc_library(
         "recognize_commands.h",
     ],
     deps = [
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -227,6 +312,33 @@ tflite_micro_cc_test(
     ],
 )
 
+cc_library(
+    name = "command_responder",
+    srcs = [
+        "command_responder.cc",
+    ],
+    hdrs = [
+        "command_responder.h",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "command_responder_test",
+    srcs = [
+        "command_responder_test.cc",
+    ],
+    deps = [
+        ":command_responder",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
 cc_binary(
     name = "micro_speech",
     srcs = [
@@ -234,16 +346,33 @@ cc_binary(
     ],
     deps = [
         ":audio_provider",
+        ":command_responder",
         ":feature_provider",
-        ":features_test_data",
-        ":model_settings",
-        ":preprocessor_reference",
         ":recognize_commands",
-        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_binary(
+    name = "micro_speech_mock",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        ":command_responder",
+        ":feature_provider",
+        ":recognize_commands",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
-        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/schema:schema_fbs",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
index 3d560510ad140ff0bba84ebcf790a0fda90e72fa..22134152afb27b558b14589a0f9a22ae117a93e4 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
@@ -5,6 +5,11 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
     -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/
 
+  GENERATED_PROJECT_INCLUDES += \
+    -isystemthird_party/cmsis/CMSIS/Core/Include/ \
+    -isystemthird_party/cmsis/CMSIS/DSP/Include/ \
+    -Ithird_party/CMSIS_ext/
+
   CMSIS_PREPROCESSOR_SRCS := \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc \
@@ -12,6 +17,7 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
   CMSIS_PREPROCESSOR_HDRS := \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h \
     tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h \
+    third_party/CMSIS_ext/README.md \
     third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h
 
   PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
@@ -20,24 +26,34 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),)
   FEATURE_PROVIDER_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
   FEATURE_PROVIDER_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
 
+  SIMPLE_FEATURES_GENERATOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  SIMPLE_FEATURES_GENERATOR_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
   MICRO_SPEECH_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
   MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
 
   THIRD_PARTY_CC_SRCS += \
-    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
-    third_party/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
-    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
-    third_party/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
-    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
-    third_party/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+    $(MAKEFILE_DIR)/downloads/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
 
   THIRD_PARTY_CC_HDRS += \
+    third_party/cmsis/LICENSE.txt \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_compiler.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_gcc.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_version.h \
+    third_party/cmsis/CMSIS/Core/Include/core_cm3.h \
     third_party/cmsis/CMSIS/DSP/Include/arm_common_tables.h \
-    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h
+    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_math.h
 
 endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
index 49aace3d7d05ba1d7010d3d834c66dc13e488c96..6015d0d63614d9deeb42d78e3f954c4403c457c2 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -1,106 +1,373 @@
 
+INCLUDES += \
+ -I$(MAKEFILE_DIR)/downloads/kissfft
+
+GENERATED_PROJECT_INCLUDES += \
+-I./third_party/kissfft
+
+PROJECT_INCLUDES += \
+third_party/kissfft
+
+KISSFFT_LIB_SRCS := \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.c \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.c
+
+KISSFFT_LIB_HDRS := \
+$(MAKEFILE_DIR)/downloads/kissfft/COPYING \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.h \
+$(MAKEFILE_DIR)/downloads/kissfft/_kiss_fft_guts.h \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.h
+
+THIRD_PARTY_CC_HDRS += \
+third_party/kissfft/COPYING \
+third_party/kissfft/kiss_fft.h \
+third_party/kissfft/_kiss_fft_guts.h \
+third_party/kissfft/tools/kiss_fftr.h
+
 MICRO_SPEECH_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
 
 MICRO_SPEECH_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
 
-PREPROCESSOR_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
+SIMPLE_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
 
-PREPROCESSOR_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+SIMPLE_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
+
+MICRO_FEATURES_LIB_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_LIB_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FFT_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_FFT_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FILTERBANK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
+
+MICRO_FEATURES_FILTERBANK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_FRONTEND_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_FRONTEND_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_LOG_SCALE_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
+
+MICRO_FEATURES_LOG_SCALE_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
+
+MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_WINDOW_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
+
+MICRO_FEATURES_WINDOW_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_GENERATOR_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_GENERATOR_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 AUDIO_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
 
 AUDIO_PROVIDER_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
+AUDIO_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
+
+AUDIO_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
 
 FEATURE_PROVIDER_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
 
 FEATURE_PROVIDER_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+FEATURE_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+FEATURE_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 RECOGNIZE_COMMANDS_TEST_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
 
 RECOGNIZE_COMMANDS_TEST_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
 
+COMMAND_RESPONDER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc
+
+COMMAND_RESPONDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h
+
 MICRO_SPEECH_SRCS := \
 tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
 
 MICRO_SPEECH_HDRS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
 tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h \
-tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+MICRO_SPEECH_MOCK_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_SPEECH_MOCK_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
 
 # Find any platform-specific rules for this example.
 include $(wildcard tensorflow/lite/experimental/micro/examples/micro_speech/*/Makefile.inc)
 
+$(eval $(call microlite_test,micro_features_fft_test,\
+$(MICRO_FEATURES_FFT_TEST_SRCS),$(MICRO_FEATURES_FFT_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_filterbank_test,\
+$(MICRO_FEATURES_FILTERBANK_TEST_SRCS),$(MICRO_FEATURES_FILTERBANK_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_frontend_test,\
+$(MICRO_FEATURES_FRONTEND_TEST_SRCS),$(MICRO_FEATURES_FRONTEND_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_log_scale_test,\
+$(MICRO_FEATURES_LOG_SCALE_TEST_SRCS),$(MICRO_FEATURES_LOG_SCALE_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_noise_reduction_test,\
+$(MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS),$(MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_pcan_gain_control_test,\
+$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS),$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_window_test,\
+$(MICRO_FEATURES_WINDOW_TEST_SRCS),$(MICRO_FEATURES_WINDOW_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,micro_features_generator_test,\
+$(MICRO_FEATURES_GENERATOR_TEST_SRCS), $(MICRO_FEATURES_GENERATOR_TEST_HDRS)))
+
 # Tests loading and running a speech model.
 $(eval $(call microlite_test,micro_speech_test,\
 $(MICRO_SPEECH_TEST_SRCS),$(MICRO_SPEECH_TEST_HDRS)))
 
 # Test the code for feature generation.
-$(eval $(call microlite_test,preprocessor_test,\
-$(PREPROCESSOR_TEST_SRCS), $(PREPROCESSOR_TEST_HDRS)))
+$(eval $(call microlite_test,simple_features_generator_test,\
+$(SIMPLE_FEATURES_GENERATOR_TEST_SRCS), $(SIMPLE_FEATURES_GENERATOR_TEST_HDRS)))
 
 # Tests the audio provider module.
 $(eval $(call microlite_test,audio_provider_test,\
 $(AUDIO_PROVIDER_TEST_SRCS),$(AUDIO_PROVIDER_TEST_HDRS)))
 
+# Tests the audio provider mock module.
+$(eval $(call microlite_test,audio_provider_mock_test,\
+$(AUDIO_PROVIDER_MOCK_TEST_SRCS),$(AUDIO_PROVIDER_MOCK_TEST_HDRS)))
+
 # Tests the feature provider module.
 $(eval $(call microlite_test,feature_provider_test,\
 $(FEATURE_PROVIDER_TEST_SRCS),$(FEATURE_PROVIDER_TEST_HDRS)))
 
-# Tests the feature provider module.
+# Tests the feature provider module using the mock audio provider.
+$(eval $(call microlite_test,feature_provider_mock_test,\
+$(FEATURE_PROVIDER_MOCK_TEST_SRCS),$(FEATURE_PROVIDER_MOCK_TEST_HDRS)))
+
+# Tests the command recognizer module.
 $(eval $(call microlite_test,recognize_commands_test,\
 $(RECOGNIZE_COMMANDS_TEST_SRCS),$(RECOGNIZE_COMMANDS_TEST_HDRS)))
 
+# Tests responding to a command.
+$(eval $(call microlite_test,command_responder_test,\
+$(COMMAND_RESPONDER_TEST_SRCS),$(COMMAND_RESPONDER_TEST_HDRS)))
+
 # Builds a standalone speech command recognizer binary.
 $(eval $(call microlite_test,micro_speech,\
 $(MICRO_SPEECH_SRCS),$(MICRO_SPEECH_HDRS)))
+
+# Builds a standalone speech command recognizer binary using fake audio input.
+$(eval $(call microlite_test,micro_speech_mock,\
+$(MICRO_SPEECH_MOCK_SRCS),$(MICRO_SPEECH_MOCK_HDRS)))
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
index 500eed33bab0187f9b2cf9647c046f4a541b9e2c..4a88de1b5becb2037a7f1454ad22b39a6388e7c3 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
@@ -31,6 +31,9 @@ To understand how TensorFlow Lite does this, you can look at the `TestInvoke()`
 
 Once you have downloaded the dependencies and got the x86/Linux build working, you can try building a version for the STM32F103 'bluepill' device. The following command will build the test and then run it on an emulator, assuming you have Docker installed:
 
+*On Mac OS you need to have ARM compiler installed, one way of doing so is with
+brew: brew install caskroom/cask/gcc-arm-embedded*
+
 ```
 make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test_micro_speech
 ```
@@ -83,7 +86,8 @@ If you see a compiling error on older machines, try leaving out the `--copt` arg
 ```
 bazel run tensorflow/examples/speech_commands:freeze -- \
 --model_architecture=tiny_conv --window_stride=20 --preprocess=average \
---wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb
+--wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb \
+--start_checkpoint=/tmp/speech_commands_train/tiny_conv.ckpt-18000
 ```
 
 The next step is to create a TensorFlow Lite file from the frozen graph:
@@ -99,5 +103,59 @@ bazel run tensorflow/lite/toco:toco -- \
 Finally, convert the file into a C source file that can be compiled into an embedded system:
 
 ```
-xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_model_data.cc
+xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_simple_features_model_data.cc
+```
+
+### Creating Your Own Model With Google Cloud
+
+If want to train your model in Google Cloud you can do so by using
+pre-configured Deep Learning images.
+
+First create the VM:
+
+```
+export IMAGE_FAMILY="tf-latest-cpu"
+export ZONE="us-west1-b" # Or any other required region
+export INSTANCE_NAME="model-trainer"
+export INSTANCE_TYPE="n1-standard-8" # or any other instance type
+gcloud compute instances create $INSTANCE_NAME \
+        --zone=$ZONE \
+        --image-family=$IMAGE_FAMILY \
+        --image-project=deeplearning-platform-release \
+        --machine-type=$INSTANCE_TYPE \
+        --boot-disk-size=120GB \
+        --min-cpu-platform=Intel\ Skylake
+```
+
+As soon as instance has been created you can SSH to it(as a jupyter user!):
+
+```
+gcloud compute ssh "jupyter@${INSTANCE_NAME}"
+```
+
+now install Bazel:
+
+```
+wget https://github.com/bazelbuild/bazel/releases/download/0.15.0/bazel-0.15.0-installer-linux-x86_64.sh
+sudo bash ./bazel-0.15.0-installer-linux-x86_64.sh
+source /usr/local/lib/bazel/bin/bazel-complete.bash
+sudo ln /usr/local/bin/bazel /usr/bin/bazel
+```
+
+and finally run the build:
+
+```
+# TensorFlow already pre-baked on the image
+cd src/tensorflow
+bazel run -c opt --copt=-mavx2 --copt=-mfma \
+tensorflow/examples/speech_commands:train -- \
+--model_architecture=tiny_conv --window_stride=20 --preprocess=average \
+--wanted_words="yes,no" --silence_percentage=25 --unknown_percentage=25 --quantize=1
+```
+
+After build is over follow the rest of the instrucitons from this tutorial. And
+finally do not forget to remove the instance when training is done:
+
+```
+gcloud compute instances delete "${INSTANCE_NAME}" --zone="${ZONE}"
 ```
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
index 0aa362be0038f8757387a6311021e183dc19dabd..c83090344ba0d82e9f774897577b1eb924e92329 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
@@ -5,7 +5,7 @@ ifeq ($(TARGET), apollo3evb)
     $(AP3_MICRO_DIR)/../preprocessor.cc \
     $(AP3_MICRO_DIR)/pushbutton_main.c \
     $(AP3_MICRO_DIR)/pushbutton_test.cc \
-    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
     $(APOLLO3_SDK)/devices/am_devices_led.c
   ALL_SRCS += $(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS)
   PUSHBUTTON_MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
@@ -24,8 +24,8 @@ ifeq ($(TARGET), apollo3evb)
   PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS := \
     $(AP3_MICRO_DIR)/pushbutton_main.c \
     $(AP3_MICRO_DIR)/pushbutton_test.cc \
-    $(AP3_MICRO_DIR)/../tiny_conv_model_data.cc \
-    $(CMSIS_DIR)/preprocessor.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
+    $(CMSIS_DIR)/simple_features_generator.cc \
     $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
     $(CMSIS_DIR)/hanning.c \
     $(APOLLO3_SDK)/devices/am_devices_led.c \
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
index 10a05b6dcf1bbd5c779f7ee7bdf4d01ebde76017..52604f5f2feaaf535a7ed9dc24020e48f4982308 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
@@ -28,7 +28,7 @@ def new_data_to_array(fn):
   vals = []
   with open(fn) as f:
     for n, line in enumerate(f):
-      if n is not 0:
+      if n != 0:
         vals.extend([int(v, 16) for v in line.split()])
   b = ''.join(map(chr, vals))
   y = struct.unpack('<' + 'h' * int(len(b) / 2), b)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
index 52352bad94a1e5627a9ca35d07a5082b6d79e6a6..fab178b3176cb680d739b53b0f6e3171e32ef721 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
@@ -30,7 +30,7 @@ def new_data_to_array(fn, datatype='int16'):
   vals = []
   with open(fn) as f:
     for n, line in enumerate(f):
-      if n is not 0:
+      if n != 0:
         vals.extend([int(v, 16) for v in line.split()])
   b = ''.join(map(chr, vals))
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
index afee38343b3fac81de945dcd01b53ad35e8be270..4f70d47c3ea9b6f7df884ceabeca245a2a5e55ce 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
@@ -157,7 +157,7 @@ void pdm_data_get(void) {
 // PDM interrupt handler.
 //
 //*****************************************************************************
-void am_pdm_isr(void) {
+void am_pdm0_isr(void) {
   uint32_t ui32Status;
   //
   // Read the interrupt status.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
index 95043f857b34b953c91a762bc1a54e9489431bff..d4583dbf4a6dcb083e4d9cd2818e63a116debd7f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
  * micro_speech_test.cc */
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -32,20 +32,36 @@ uint8_t g_unknown_score = 0;
 uint8_t g_yes_score = 0;
 uint8_t g_no_score = 0;
 
+namespace {
+
+TfLiteStatus GenerateSimpleFeatures_1sec(tflite::ErrorReporter* error_reporter,
+                                         const int16_t* input,
+                                         uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    GenerateSimpleFeatures(error_reporter, input + i * 320, 480, 43,
+                           output + i * 43);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
 TF_LITE_MICRO_TESTS_BEGIN
 
-TF_LITE_MICRO_TEST(TestPreprocessor) {
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
   tflite::MicroErrorReporter micro_error_reporter;
   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
 
   uint8_t preprocessed_data[43 * 49];
-  TfLiteStatus preprocess_1sec_status =
-      Preprocess_1sec(error_reporter, captured_data, preprocessed_data);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, preprocess_1sec_status);
+  TfLiteStatus generate_1sec_status = GenerateSimpleFeatures_1sec(
+      error_reporter, captured_data, preprocessed_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, generate_1sec_status);
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_simple_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
index 52db18e6868371afc0b7cd39f6f41d0d60b91689..08811c83b437e66bf1e77a1a1f32d1cb5be02c43 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 namespace {
 int16_t g_dummy_audio_data[kMaxAudioSampleSize];
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c9792510b055e243ab4f6e804717647afa0b418
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+namespace {
+int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  const int yes_start = (0 * kAudioSampleFrequency) / 1000;
+  const int yes_end = (1000 * kAudioSampleFrequency) / 1000;
+  const int no_start = (4000 * kAudioSampleFrequency) / 1000;
+  const int no_end = (5000 * kAudioSampleFrequency) / 1000;
+  const int wraparound = (8000 * kAudioSampleFrequency) / 1000;
+  const int start_sample = (start_ms * kAudioSampleFrequency) / 1000;
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    const int sample_index = (start_sample + i) % wraparound;
+    int16_t sample;
+    if ((sample_index >= yes_start) && (sample_index < yes_end)) {
+      sample = g_yes_1000ms_sample_data[sample_index - yes_start];
+    } else if ((sample_index >= no_start) && (sample_index < no_end)) {
+      sample = g_no_1000ms_sample_data[sample_index - no_start];
+    } else {
+      sample = 0;
+    }
+    g_dummy_audio_data[i] = sample;
+  }
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_dummy_audio_data;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b73d436ad638277d3f052715c506668e7f163f17
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAudioProviderMock) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  int audio_samples_size = 0;
+  int16_t* audio_samples = nullptr;
+  TfLiteStatus get_status =
+      GetAudioSamples(error_reporter, 0, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i], audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i + 8000],
+                            audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 1500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(0, audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 12250, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_1000ms_sample_data[i + 4000],
+                            audio_samples[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
index 85fbbb80a6c5b330230c1d1d0186de795edc4754..f9212aa3491e99104c2a3f1f5e315e9e96481345 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afff5109d9d1252481304a895045b450b9be85bb
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.cc
@@ -0,0 +1,28 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h"
+
+// The default implementation writes out the name of the recognized command
+// to the error console. Real applications will want to take some custom
+// action instead, and should implement their own versions of this function.
+void RespondToCommand(tflite::ErrorReporter* error_reporter,
+                      int32_t current_time, const char* found_command,
+                      uint8_t score, bool is_new_command) {
+  if (is_new_command) {
+    error_reporter->Report("Heard %s (%d) @%dms", found_command, score,
+                           current_time);
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d91209de0aa035738cee8b52afe46f348c796bd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Provides an interface to take an action based on an audio command.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Called every time the results of an audio recognition run are available. The
+// human-readable name of any recognized command is in the `found_command`
+// argument, `score` has the numerical confidence, and `is_new_command` is set
+// if the previous command was different to this one.
+void RespondToCommand(tflite::ErrorReporter* error_reporter,
+                      int32_t current_time, const char* found_command,
+                      uint8_t score, bool is_new_command);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_COMMAND_RESPONDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8acf4552f59b0b6dffa3023cc86b44d422a28fd5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/command_responder_test.cc
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestCallability) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  // This will have external side-effects (like printing to the debug console
+  // or lighting an LED) that are hard to observe, so the most we can do is
+  // make sure the call doesn't crash.
+  RespondToCommand(error_reporter, 0, "foo", 0, true);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
index 06647d0c536564c26d72cb73396ca36efb3aeb25..49fea826759956d479e9171e2ba7a41331e31023 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 #include "AUDIO_DISCO_F746NG.h"
 #include "SDRAM_DISCO_F746NG.h"
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
index 7f9ece41dd3f013ae328ffd1bdc98f197855a131..b5dfa3d944076a21cde2dfafc6ce1ed39f15164d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
     : feature_size_(feature_size),
@@ -48,6 +48,10 @@ TfLiteStatus FeatureProvider::PopulateFeatureData(
   int slices_needed = current_step - last_step;
   // If this is the first call, make sure we don't use any cached information.
   if (is_first_run_) {
+    TfLiteStatus init_status = InitializeMicroFeatures(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
     is_first_run_ = false;
     slices_needed = kFeatureSliceCount;
   }
@@ -94,16 +98,17 @@ TfLiteStatus FeatureProvider::PopulateFeatureData(
       GetAudioSamples(error_reporter, slice_start_ms, kFeatureSliceDurationMs,
                       &audio_samples_size, &audio_samples);
       if (audio_samples_size < kMaxAudioSampleSize) {
-        error_reporter->Report("Audio data size %d  too small, want %d",
+        error_reporter->Report("Audio data size %d too small, want %d",
                                audio_samples_size, kMaxAudioSampleSize);
         return kTfLiteError;
       }
       uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize);
-      TfLiteStatus preprocess_status =
-          Preprocess(error_reporter, audio_samples, audio_samples_size,
-                     kFeatureSliceSize, new_slice_data);
-      if (preprocess_status != kTfLiteOk) {
-        return preprocess_status;
+      size_t num_samples_read;
+      TfLiteStatus generate_status = GenerateMicroFeatures(
+          error_reporter, audio_samples, audio_samples_size, kFeatureSliceSize,
+          new_slice_data, &num_samples_read);
+      if (generate_status != kTfLiteOk) {
+        return generate_status;
       }
     }
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05912e26f8a7f5e89b9f45766adf4270c033ed5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_micro_f2e59fea_nohash_1_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 4000, /* time_in_ms= */ 4970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_micro_f9643d42_nohash_4_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
index 556cbfe799bd9adf2df8f584a4f10b4a1c834bd4..e7655a3be53ae6a032195dd4ca991f740bb19537 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
index 3a9a5a4df1bf8239950dd2c79a1048706004e1f5..1bf96bc5a7e04bd05b85dd286bf2e9d06774e224 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -14,10 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -31,7 +32,8 @@ int main(int argc, char* argv[]) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -122,9 +124,11 @@ int main(int argc, char* argv[]) {
           "RecognizeCommands::ProcessLatestResults() failed");
       return 1;
     }
-    if (is_new_command) {
-      error_reporter->Report("Heard %s (%d)", found_command, score);
-    }
+    // Do something based on the recognized command. The default implementation
+    // just prints to the error console, but you should replace this with your
+    // own function for a real application.
+    RespondToCommand(error_reporter, current_time, found_command, score,
+                     is_new_command);
   }
 
   return 0;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1e684e1efd0bfbc676635e8c3233ef6284e6954d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
@@ -0,0 +1,300 @@
+# Library for generating feature vectors from audio data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_model_settings",
+    srcs = [
+        "micro_model_settings.cc",
+    ],
+    hdrs = [
+        "micro_model_settings.h",
+    ],
+)
+
+cc_library(
+    name = "tiny_conv_micro_features_model_data",
+    srcs = [
+        "tiny_conv_micro_features_model_data.cc",
+    ],
+    hdrs = [
+        "tiny_conv_micro_features_model_data.h",
+    ],
+)
+
+cc_library(
+    name = "micro_features_test_data",
+    srcs = [
+        "no_micro_features_data.cc",
+        "yes_micro_features_data.cc",
+    ],
+    hdrs = [
+        "no_micro_features_data.h",
+        "yes_micro_features_data.h",
+    ],
+)
+
+cc_library(
+    name = "bits",
+    hdrs = ["bits.h"],
+)
+
+cc_library(
+    name = "static_alloc",
+    hdrs = ["static_alloc.h"],
+)
+
+cc_library(
+    name = "fft",
+    srcs = [
+        "fft.cc",
+        "fft_util.cc",
+    ],
+    hdrs = [
+        "fft.h",
+        "fft_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "@kissfft//:kiss_fftr_16",
+    ],
+)
+
+cc_library(
+    name = "filterbank",
+    srcs = [
+        "filterbank.cc",
+        "filterbank_util.cc",
+    ],
+    hdrs = [
+        "filterbank.h",
+        "filterbank_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "frontend",
+    srcs = [
+        "frontend.cc",
+        "frontend_util.cc",
+    ],
+    hdrs = [
+        "frontend.h",
+        "frontend_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":filterbank",
+        ":log_scale",
+        ":micro_model_settings",
+        ":noise_reduction",
+        ":pcan_gain_control",
+        ":window",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "log_scale",
+    srcs = [
+        "log_lut.cc",
+        "log_scale.cc",
+        "log_scale_util.cc",
+    ],
+    hdrs = [
+        "log_lut.h",
+        "log_scale.h",
+        "log_scale_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "noise_reduction",
+    srcs = [
+        "noise_reduction.cc",
+        "noise_reduction_util.cc",
+    ],
+    hdrs = [
+        "noise_reduction.h",
+        "noise_reduction_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "pcan_gain_control",
+    srcs = [
+        "pcan_gain_control.cc",
+        "pcan_gain_control_util.cc",
+    ],
+    hdrs = [
+        "pcan_gain_control.h",
+        "pcan_gain_control_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "window",
+    srcs = [
+        "window.cc",
+        "window_util.cc",
+    ],
+    hdrs = [
+        "window.h",
+        "window_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator",
+    srcs = [
+        "micro_features_generator.cc",
+    ],
+    hdrs = [
+        "micro_features_generator.h",
+    ],
+    deps = [
+        ":frontend",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator_test_data",
+    srcs = [
+        "no_feature_data_slice.cc",
+        "yes_feature_data_slice.cc",
+    ],
+    hdrs = [
+        "no_feature_data_slice.h",
+        "yes_feature_data_slice.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "fft_test",
+    srcs = ["fft_test.cc"],
+    deps = [
+        ":fft",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "filterbank_test",
+    srcs = ["filterbank_test.cc"],
+    deps = [
+        ":filterbank",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "frontend_test",
+    srcs = ["frontend_test.cc"],
+    deps = [
+        ":frontend",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "log_scale_test",
+    srcs = ["log_scale_test.cc"],
+    deps = [
+        ":log_scale",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "noise_reduction_test",
+    srcs = ["noise_reduction_test.cc"],
+    deps = [
+        ":noise_reduction",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "pcan_gain_control_test",
+    srcs = ["pcan_gain_control_test.cc"],
+    deps = [
+        ":pcan_gain_control",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "window_test",
+    srcs = ["window_test.cc"],
+    deps = [
+        ":window",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_features_generator_test",
+    srcs = [
+        "micro_features_generator_test.cc",
+    ],
+    deps = [
+        ":micro_features_generator",
+        ":micro_features_generator_test_data",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech:audio_sample_test_data",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b19ee6f030ae9fa8a931c6693cfe490747e336a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+
+#include <cstdint>
+
+static inline int CountLeadingZeros32Slow(uint64_t n) {
+  int zeroes = 28;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros32(uint32_t n) {
+#if defined(_MSC_VER)
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse(&result, n)) {
+    return 31 - result;
+  }
+  return 32;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clz(0) is undefined.
+  if (n == 0) {
+    return 32;
+  }
+  return __builtin_clz(n);
+#else
+  return CountLeadingZeros32Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit32(uint32_t n) {
+  return 32 - CountLeadingZeros32(n);
+}
+
+static inline int CountLeadingZeros64Slow(uint64_t n) {
+  int zeroes = 60;
+  if (n >> 32) zeroes -= 32, n >>= 32;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros64(uint64_t n) {
+#if defined(_MSC_VER) && defined(_M_X64)
+  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse64(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(_MSC_VER)
+  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
+    return 31 - result;
+  }
+  if (_BitScanReverse(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
+  if (n == 0) {
+    return 64;
+  }
+  return __builtin_clzll(n);
+#else
+  return CountLeadingZeros64Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit64(uint64_t n) {
+  return 64 - CountLeadingZeros64(n);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde4e38740e65cf56cd179d577528263177a649e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+
+#include <string.h>
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+// Internal test dependency placeholder1
+// Internal test dependency placeholder2
+#include "tools/kiss_fftr.h"
+// Internal test dependency placeholder3
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift) {
+  const size_t input_size = state->input_size;
+  const size_t fft_size = state->fft_size;
+
+  int16_t* fft_input = state->input;
+  // First, scale the input by the given shift.
+  int i;
+  for (i = 0; i < input_size; ++i) {
+    *fft_input++ = (*input++) << input_scale_shift;
+  }
+  // Zero out whatever else remains in the top part of the input.
+  for (; i < fft_size; ++i) {
+    *fft_input++ = 0;
+  }
+
+  // Apply the FFT.
+  kiss_fftr(reinterpret_cast<const kiss_fftr_cfg>(state->scratch), state->input,
+            reinterpret_cast<kiss_fft_cpx*>(state->output));
+}
+
+void FftInit(struct FftState* state) {
+  // All the initialization is done in FftPopulateState()
+}
+
+void FftReset(struct FftState* state) {
+  memset(state->input, 0, state->fft_size * sizeof(*state->input));
+  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5d29f68a2cc5688f9644a2b556abb4787e3bb93
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct complex_int16_t {
+  int16_t real;
+  int16_t imag;
+};
+
+struct FftState {
+  int16_t input[kMaxAudioSampleSize];
+  struct complex_int16_t output[kMaxAudioSampleSize + 2];
+  size_t fft_size;
+  size_t input_size;
+  // This magic number was derived from KissFFT's estimate of how much space it
+  // will need to process the particular lengths and datatypes we need to for
+  // these model settings. This size will need to be recalculated for different
+  // models, but you will see a runtime error if it's not large enough.
+  char scratch[2848];
+  size_t scratch_size;
+};
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift);
+
+void FftInit(struct FftState* state);
+
+void FftReset(struct FftState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b89b01445a641c8152aaff8165495688ab6861b2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int16_t kFakeWindow[] = {
+    0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+    0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+const int kScaleShift = 0;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  struct FftState state;
+  TF_LITE_MICRO_EXPECT(FftPopulateState(
+      error_reporter, &state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
+
+  FftInit(&state);
+  FftCompute(&state, kFakeWindow, kScaleShift);
+
+  const struct complex_int16_t expected[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.fft_size / 2; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].real, expected[i].real, 2);
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].imag, expected[i].imag, 2);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ab742893197e6fda9ec2266e85997b555e0a4fc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+#include "tools/kiss_fftr.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size) {
+  state->input_size = input_size;
+  state->fft_size = 1;
+  while (state->fft_size < state->input_size) {
+    state->fft_size <<= 1;
+  }
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->fft_size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->output, ((state->fft_size / 2 + 1) * sizeof(*state->output) * 2));
+
+  // Ask kissfft how much memory it wants.
+  size_t scratch_size = 0;
+  kiss_fftr_cfg kfft_cfg =
+      kiss_fftr_alloc(state->fft_size, 0, nullptr, &scratch_size);
+  if (kfft_cfg != nullptr) {
+    error_reporter->Report("Kiss memory sizing failed.");
+    return 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->scratch, scratch_size);
+  state->scratch_size = scratch_size;
+  // Let kissfft configure the scratch space we just allocated
+  kfft_cfg = kiss_fftr_alloc(state->fft_size, 0, state->scratch, &scratch_size);
+  if (reinterpret_cast<char*>(kfft_cfg) != state->scratch) {
+    error_reporter->Report("Kiss memory preallocation strategy failed.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dea097bc996e194cef7987431c67be3c976ed2b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Prepares and FFT for the given input size.
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67f69dd67581ff6c15063b2467810f6c212ed1e5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy) {
+  const int end_index = state->end_index;
+  int i;
+  energy += state->start_index;
+  fft_output += state->start_index;
+  for (i = state->start_index; i < end_index; ++i) {
+    const int32_t real = fft_output->real;
+    const int32_t imag = fft_output->imag;
+    fft_output++;
+    const uint32_t mag_squared = (real * real) + (imag * imag);
+    *energy++ = mag_squared;
+  }
+}
+
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy) {
+  uint64_t* work = state->work;
+  uint64_t weight_accumulator = 0;
+  uint64_t unweight_accumulator = 0;
+
+  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
+  const int16_t* channel_weight_starts = state->channel_weight_starts;
+  const int16_t* channel_widths = state->channel_widths;
+
+  int num_channels_plus_1 = state->num_channels + 1;
+  int i;
+  for (i = 0; i < num_channels_plus_1; ++i) {
+    const int32_t* magnitudes = energy + *channel_frequency_starts++;
+    const int16_t* weights = state->weights + *channel_weight_starts;
+    const int16_t* unweights = state->unweights + *channel_weight_starts++;
+    const int width = *channel_widths++;
+    int j;
+    for (j = 0; j < width; ++j) {
+      weight_accumulator += *weights++ * (static_cast<uint64_t>(*magnitudes));
+      unweight_accumulator +=
+          *unweights++ * (static_cast<uint64_t>(*magnitudes));
+      ++magnitudes;
+    }
+    *work++ = weight_accumulator;
+    weight_accumulator = unweight_accumulator;
+    unweight_accumulator = 0;
+  }
+}
+
+static uint16_t Sqrt32(uint32_t num) {
+  if (num == 0) {
+    return 0;
+  }
+  uint32_t res = 0;
+  int max_bit_number = 32 - MostSignificantBit32(num);
+  max_bit_number |= 1;
+  uint32_t bit = 1U << (31 - max_bit_number);
+  int iterations = (31 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFF) {
+    ++res;
+  }
+  return res;
+}
+
+static uint32_t Sqrt64(uint64_t num) {
+  // Take a shortcut and just use 32 bit operations if the upper word is all
+  // clear. This will cause a slight off by one issue for numbers close to 2^32,
+  // but it probably isn't going to matter (and gives us a big performance win).
+  if ((num >> 32) == 0) {
+    return Sqrt32(static_cast<uint32_t>(num));
+  }
+  uint64_t res = 0;
+  int max_bit_number = 64 - MostSignificantBit64(num);
+  max_bit_number |= 1;
+  uint64_t bit = 1ULL << (63 - max_bit_number);
+  int iterations = (63 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFFFFFFLL) {
+    ++res;
+  }
+  return res;
+}
+
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
+  const int num_channels = state->num_channels;
+  const int64_t* work = reinterpret_cast<int64_t*>(state->work + 1);
+  // Reuse the work buffer since we're fine clobbering it at this point to hold
+  // the output.
+  uint32_t* output = reinterpret_cast<uint32_t*>(state->work);
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    *output++ = Sqrt64(*work++) >> scale_down_shift;
+  }
+  return reinterpret_cast<uint32_t*>(state->work);
+}
+
+void FilterbankReset(struct FilterbankState* state) {
+  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7b479d4899a985482710dfcb12b908f50ec1690
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFilterbankBits 12
+
+struct FilterbankState {
+  int num_channels;
+  int start_index;
+  int end_index;
+  int16_t channel_frequency_starts[kFeatureSliceSize + 1];
+  int16_t channel_weight_starts[kFeatureSliceSize + 1];
+  int16_t channel_widths[kFeatureSliceSize + 1];
+  int16_t weights[316];
+  int16_t unweights[316];
+  uint64_t work[kFeatureSliceSize + 1];
+};
+
+// Converts the relevant complex values of an FFT output into energy (the
+// square magnitude).
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy);
+
+// Computes the mel-scale filterbank on the given energy array. Output is cached
+// internally - to fetch it, you need to call FilterbankSqrt.
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy);
+
+// Applies an integer square root to the 64 bit intermediate values of the
+// filterbank, and returns a pointer to them. Memory will be invalidated the
+// next time FilterbankAccumulateChannels is called.
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
+
+void FilterbankReset(struct FilterbankState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..682b216ace37fa0a809db3c06386b5b4b3ca94c8
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
@@ -0,0 +1,228 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kSpectrumSize = 17;
+const int kStartIndex = 1;
+const int kEndIndex = 15;
+const int32_t kEnergy[] = {-1,     181,      400,      181,      625,    28322,
+                           786769, 18000000, 40972801, 18000000, 784996, 28085,
+                           625,    181,      361,      -1,       -1};
+const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
+const int kScaleShift = 0;
+
+// Test filterbank generation using scaled-down defaults.
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
+    config_.num_channels = 2;
+    config_.lower_band_limit = 8.0;
+    config_.upper_band_limit = 450.0;
+  }
+
+  struct FilterbankConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 4, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 8, 16};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {8, 8, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
+                              0, 3376, 2468, 1591, 744,  0,   0,   0,
+                              0, 4020, 3226, 2456, 1708, 983, 277, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
+                              0, 720, 1628, 2505, 3352, 0,    0,    0,
+                              0, 76,  870,  1640, 2388, 3113, 3819, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
+  struct FilterbankState state;
+  state.start_index = kStartIndex;
+  state.end_index = kEndIndex;
+
+  struct complex_int16_t fake_fft[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
+  FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
+
+  int i;
+  for (i = state.start_index; i < state.end_index; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  FilterbankAccumulateChannels(&state, kEnergy);
+
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+  std::memcpy(state.work, kWork, sizeof(kWork));
+
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
+
+  const uint32_t expected[] = {247311, 508620};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea4aa5179bc6e1d875127152b596f220a076191e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kFilterbankIndexAlignment 4
+#define kFilterbankChannelBlockSize 4
+
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
+  config->num_channels = 32;
+  config->lower_band_limit = 125.0f;
+  config->upper_band_limit = 7500.0f;
+  config->output_scale_shift = 7;
+}
+
+static float FreqToMel(float freq) {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+static void CalculateCenterFrequencies(const int num_channels,
+                                       const float lower_frequency_limit,
+                                       const float upper_frequency_limit,
+                                       float* center_frequencies) {
+  assert(lower_frequency_limit >= 0.0f);
+  assert(upper_frequency_limit > lower_frequency_limit);
+
+  const float mel_low = FreqToMel(lower_frequency_limit);
+  const float mel_hi = FreqToMel(upper_frequency_limit);
+  const float mel_span = mel_hi - mel_low;
+  const float mel_spacing = mel_span / (static_cast<float>(num_channels));
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
+  }
+}
+
+static void QuantizeFilterbankWeights(const float float_weight, int16_t* weight,
+                                      int16_t* unweight) {
+  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
+  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
+}
+
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size) {
+  state->num_channels = config->num_channels;
+  const int num_channels_plus_1 = config->num_channels + 1;
+
+  // How should we align things to index counts given the byte alignment?
+  const int index_alignment =
+      (kFilterbankIndexAlignment < sizeof(int16_t)
+           ? 1
+           : kFilterbankIndexAlignment / sizeof(int16_t));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_frequency_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_frequency_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_weight_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_weight_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_widths,
+      (num_channels_plus_1 * sizeof(*state->channel_widths)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->work,
+                                 (num_channels_plus_1 * sizeof(*state->work)));
+
+  float center_mel_freqs[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      center_mel_freqs, (num_channels_plus_1 * sizeof(*center_mel_freqs)));
+
+  int16_t actual_channel_starts[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_starts,
+      (num_channels_plus_1 * sizeof(*actual_channel_starts)));
+
+  int16_t actual_channel_widths[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_widths,
+      (num_channels_plus_1 * sizeof(*actual_channel_widths)));
+
+  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
+                             config->upper_band_limit, center_mel_freqs);
+
+  // Always exclude DC.
+  const float hz_per_sbin =
+      0.5 * sample_rate / (static_cast<float>(spectrum_size) - 1);
+  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
+  state->end_index = 0;  // Initialized to zero here, but actually set below.
+
+  // For each channel, we need to figure out what frequencies belong to it, and
+  // how much padding we need to add so that we can efficiently multiply the
+  // weights and unweights for accumulation. To simplify the multiplication
+  // logic, all channels will have some multiplication to do (even if there are
+  // no frequencies that accumulate to that channel) - they will be directed to
+  // a set of zero weights.
+  int chan_freq_index_start = state->start_index;
+  int weight_index_start = 0;
+  int needs_zeros = 0;
+
+  int chan;
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    // Keep jumping frequencies until we overshoot the bound on this channel.
+    int freq_index = chan_freq_index_start;
+    while (FreqToMel((freq_index)*hz_per_sbin) <= center_mel_freqs[chan]) {
+      ++freq_index;
+    }
+
+    const int width = freq_index - chan_freq_index_start;
+    actual_channel_starts[chan] = chan_freq_index_start;
+    actual_channel_widths[chan] = width;
+
+    if (width == 0) {
+      // This channel doesn't actually get anything from the frequencies, it's
+      // always zero. We need then to insert some 'zero' weights into the
+      // output, and just redirect this channel to do a single multiplication at
+      // this point. For simplicity, the zeros are placed at the beginning of
+      // the weights arrays, so we have to go and update all the other
+      // weight_starts to reflect this shift (but only once).
+      state->channel_frequency_starts[chan] = 0;
+      state->channel_weight_starts[chan] = 0;
+      state->channel_widths[chan] = kFilterbankChannelBlockSize;
+      if (!needs_zeros) {
+        needs_zeros = 1;
+        int j;
+        for (j = 0; j < chan; ++j) {
+          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
+        }
+        weight_index_start += kFilterbankChannelBlockSize;
+      }
+    } else {
+      // How far back do we need to go to ensure that we have the proper
+      // alignment?
+      const int aligned_start =
+          (chan_freq_index_start / index_alignment) * index_alignment;
+      const int aligned_width = (chan_freq_index_start - aligned_start + width);
+      const int padded_width =
+          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
+          kFilterbankChannelBlockSize;
+
+      state->channel_frequency_starts[chan] = aligned_start;
+      state->channel_weight_starts[chan] = weight_index_start;
+      state->channel_widths[chan] = padded_width;
+      weight_index_start += padded_width;
+    }
+    chan_freq_index_start = freq_index;
+  }
+
+  // Allocate the two arrays to store the weights - weight_index_start contains
+  // the index of what would be the next set of weights that we would need to
+  // add, so that's how many weights we need to allocate.
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->weights, (weight_index_start * sizeof(*state->weights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->weights[i] = 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->unweights, (weight_index_start * sizeof(*state->unweights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->unweights[i] = 0;
+  }
+
+  // Next pass, compute all the weights. Since everything has been memset to
+  // zero, we only need to fill in the weights that correspond to some frequency
+  // for a channel.
+  const float mel_low = FreqToMel(config->lower_band_limit);
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    int frequency = actual_channel_starts[chan];
+    const int num_frequencies = actual_channel_widths[chan];
+    const int frequency_offset =
+        frequency - state->channel_frequency_starts[chan];
+    const int weight_start = state->channel_weight_starts[chan];
+    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
+
+    int j;
+    for (j = 0; j < num_frequencies; ++j, ++frequency) {
+      const float weight =
+          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
+          (center_mel_freqs[chan] - denom_val);
+
+      // Make the float into an integer for the weights (and unweights).
+      const int weight_index = weight_start + frequency_offset + j;
+      QuantizeFilterbankWeights(weight, state->weights + weight_index,
+                                state->unweights + weight_index);
+    }
+    if (frequency > state->end_index) {
+      state->end_index = frequency;
+    }
+  }
+
+  if (state->end_index >= spectrum_size) {
+    error_reporter->Report("Filterbank end_index is above spectrum size.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bf0c8494ce11753c2f2c2185e6c1141d0adbc74
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FilterbankConfig {
+  // number of frequency channel buckets for filterbank
+  int num_channels;
+  // maximum frequency to include
+  float upper_band_limit;
+  // minimum frequency to include
+  float lower_band_limit;
+  // unused
+  int output_scale_shift;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
+
+// Allocates any buffers.
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c609190e4db90adb3ec79e2a794b61923cea0978
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read) {
+  struct FrontendOutput output;
+  output.values = nullptr;
+  output.size = 0;
+
+  // Try to apply the window - if it fails, return and wait for more data.
+  if (!WindowProcessSamples(&state->window, samples, num_samples,
+                            num_samples_read)) {
+    return output;
+  }
+
+  // Apply the FFT to the window's output (and scale it so that the fixed point
+  // FFT can have as much resolution as possible).
+  int input_shift =
+      15 - MostSignificantBit32(state->window.max_abs_output_value);
+  FftCompute(&state->fft, state->window.output, input_shift);
+
+  // We can re-ruse the fft's output buffer to hold the energy.
+  int32_t* energy = reinterpret_cast<int32_t*>(state->fft.output);
+  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
+                                      energy);
+  FilterbankAccumulateChannels(&state->filterbank, energy);
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
+
+  // Apply noise reduction.
+  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
+
+  if (state->pcan_gain_control.enable_pcan) {
+    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
+  }
+
+  // Apply the log and scale.
+  int correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  uint16_t* logged_filterbank =
+      LogScaleApply(&state->log_scale, scaled_filterbank,
+                    state->filterbank.num_channels, correction_bits);
+
+  output.size = state->filterbank.num_channels;
+  output.values = logged_filterbank;
+  return output;
+}
+
+void FrontendReset(struct FrontendState* state) {
+  WindowReset(&state->window);
+  FftReset(&state->fft);
+  FilterbankReset(&state->filterbank);
+  NoiseReductionReset(&state->noise_reduction);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
new file mode 100644
index 0000000000000000000000000000000000000000..3221d283e8740cd95fa39ece4ad9533d059018d7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+struct FrontendState {
+  struct WindowState window;
+  struct FftState fft;
+  struct FilterbankState filterbank;
+  struct NoiseReductionState noise_reduction;
+  struct PcanGainControlState pcan_gain_control;
+  struct LogScaleState log_scale;
+};
+
+struct FrontendOutput {
+  const uint16_t* values;
+  size_t size;
+};
+
+// Main entry point to processing frontend samples. Updates num_samples_read to
+// contain the number of samples that have been consumed from the input array.
+// Returns a struct containing the generated output. If not enough samples were
+// added to generate a feature vector, the returned size will be 0 and the
+// values pointer will be NULL. Note that the output pointer will be invalidated
+// as soon as FrontendProcessSamples is called again, so copy the contents
+// elsewhere if you need to use them later.
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read);
+
+void FrontendReset(struct FrontendState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d9f86d48d86b1795fa45b9bf24db4dd75fb0a20
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test end-to-end frontend behaviors.
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
+    config_.window.size_ms = 25;
+    config_.window.step_size_ms = 10;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.filterbank.num_channels = 2;
+    config_.filterbank.lower_band_limit = 8.0;
+    config_.filterbank.upper_band_limit = 450.0;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.noise_reduction.even_smoothing = 0.025;
+    config_.noise_reduction.odd_smoothing = 0.06;
+    config_.noise_reduction.min_signal_remaining = 0.05;
+    config_.pcan_gain_control.enable_pcan = true;
+    config_.pcan_gain_control.strength = 0.95;
+    config_.pcan_gain_control.offset = 80.0;
+    config_.pcan_gain_control.gain_bits = 21;
+    config_.log_scale.enable_log = true;
+    config_.log_scale.scale_shift = 6;
+  }
+
+  struct FrontendConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
+
+  const uint16_t expected[] = {479, 425};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+
+  const int16_t expected[] = {436, 378};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples + kStepSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
+          kStepSamples,
+      &num_samples_read);
+
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..220bc130fb9332e4afbe02a4432b61c8a4bcd544
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
+  WindowFillConfigWithDefaults(&config->window);
+  FilterbankFillConfigWithDefaults(&config->filterbank);
+  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
+  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
+  LogScaleFillConfigWithDefaults(&config->log_scale);
+}
+
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate) {
+  memset(state, 0, sizeof(*state));
+
+  if (!WindowPopulateState(error_reporter, &config->window, &state->window,
+                           sample_rate)) {
+    error_reporter->Report("Failed to populate window state");
+    return 0;
+  }
+
+  if (!FftPopulateState(error_reporter, &state->fft, state->window.size)) {
+    error_reporter->Report("Failed to populate fft state");
+    return 0;
+  }
+  FftInit(&state->fft);
+
+  if (!FilterbankPopulateState(error_reporter, &config->filterbank,
+                               &state->filterbank, sample_rate,
+                               state->fft.fft_size / 2 + 1)) {
+    error_reporter->Report("Failed to populate filterbank state");
+    return 0;
+  }
+
+  if (!NoiseReductionPopulateState(error_reporter, &config->noise_reduction,
+                                   &state->noise_reduction,
+                                   state->filterbank.num_channels)) {
+    error_reporter->Report("Failed to populate noise reduction state");
+    return 0;
+  }
+
+  int input_correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  if (!PcanGainControlPopulateState(
+          error_reporter, &config->pcan_gain_control, &state->pcan_gain_control,
+          state->noise_reduction.estimate, state->filterbank.num_channels,
+          state->noise_reduction.smoothing_bits, input_correction_bits)) {
+    error_reporter->Report("Failed to populate pcan gain control state");
+    return 0;
+  }
+
+  if (!LogScalePopulateState(error_reporter, &config->log_scale,
+                             &state->log_scale)) {
+    error_reporter->Report("Failed to populate log scale state");
+    return 0;
+  }
+
+  FrontendReset(state);
+
+  // All good, return a true value.
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7267644ae053e1f816cb22bb5e0ecd04e4de0ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FrontendConfig {
+  struct WindowConfig window;
+  struct FilterbankConfig filterbank;
+  struct NoiseReductionConfig noise_reduction;
+  struct PcanGainControlConfig pcan_gain_control;
+  struct LogScaleConfig log_scale;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
+
+// Prepares any buffers.
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c651caad8c67773f20a485eea5519f286b1b0253
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+const uint16_t kLogLut[]
+#ifndef _MSC_VER
+    __attribute__((aligned(4)))
+#endif  // _MSV_VER
+    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
+       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
+       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
+       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
+       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
+       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
+       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
+       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
+       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
+       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
+       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5ed9339bd02e23cc134992badce5cdb72a74771
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
+
+#include <stdint.h>
+
+// Number of segments in the log lookup table. The table will be kLogSegments+1
+// in length (with some padding).
+#define kLogSegments 128
+#define kLogSegmentsLog2 7
+
+// Scale used by lookup table.
+#define kLogScale 65536
+#define kLogScaleLog2 16
+#define kLogCoeff 45426
+
+extern const uint16_t kLogLut[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f85e9c1a2f5a0056deaffad7b99b774c772ce562
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+
+#define kuint16max 0x0000FFFF
+
+// The following functions implement integer logarithms of various sizes. The
+// approximation is calculated according to method described in
+//       www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/
+//       publicaciones/SPL2007/Log10-spl07.pdf
+// It first calculates log2 of the input and then converts it to natural
+// logarithm.
+
+static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) {
+  // Part 1
+  int32_t frac = x - (1LL << log2x);
+  if (log2x < kLogScaleLog2) {
+    frac <<= kLogScaleLog2 - log2x;
+  } else {
+    frac >>= log2x - kLogScaleLog2;
+  }
+  // Part 2
+  const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2);
+  const uint32_t seg_unit =
+      ((static_cast<uint32_t>(1)) << kLogScaleLog2) >> kLogSegmentsLog2;
+
+  const int32_t c0 = kLogLut[base_seg];
+  const int32_t c1 = kLogLut[base_seg + 1];
+  const int32_t seg_base = seg_unit * base_seg;
+  const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2;
+  return frac + c0 + rel_pos;
+}
+
+static uint32_t Log(const uint32_t x, const uint32_t scale_shift) {
+  const uint32_t integer = MostSignificantBit32(x) - 1;
+  const uint32_t fraction = Log2FractionPart(x, integer);
+  const uint32_t log2 = (integer << kLogScaleLog2) + fraction;
+  const uint32_t round = kLogScale / 2;
+  const uint32_t loge =
+      ((static_cast<uint64_t>(kLogCoeff)) * log2 + round) >> kLogScaleLog2;
+  // Finally scale to our output scale
+  const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2;
+  return loge_scaled;
+}
+
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits) {
+  const int scale_shift = state->scale_shift;
+  uint16_t* output = reinterpret_cast<uint16_t*>(signal);
+  uint16_t* ret = output;
+  int i;
+  for (i = 0; i < signal_size; ++i) {
+    uint32_t value = *signal++;
+    if (state->enable_log) {
+      if (correction_bits < 0) {
+        value >>= -correction_bits;
+      } else {
+        value <<= correction_bits;
+      }
+      if (value > 1) {
+        value = Log(value, scale_shift);
+      } else {
+        value = 0;
+      }
+    }
+    *output++ = (value < kuint16max) ? value : kuint16max;
+  }
+  return ret;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..d90b87fb6d6fe181158b209a87a42f6d075ba457
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+struct LogScaleState {
+  int enable_log;
+  int scale_shift;
+};
+
+// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note
+// that the signal array will be modified.
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b23323a273314a347f25e691d538781558980a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kScaleShift = 6;
+const int kCorrectionBits = -1;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
+  struct LogScaleState state;
+  state.enable_log = true;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {3578, 1533};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {479, 425};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
+  struct LogScaleState state;
+  state.enable_log = false;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {85964, 45998};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {65535, 45998};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09adc09c3511fdedeb7246b6717fa4bfb4c83ba1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) {
+  config->enable_log = 1;
+  config->scale_shift = 6;
+}
+
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state) {
+  state->enable_log = config->enable_log;
+  state->scale_shift = config->scale_shift;
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3caf207f2693756783b6c1dc64246d2522388d3b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct LogScaleConfig {
+  // set to false (0) to disable this module
+  int enable_log;
+  // scale results by 2^(scale_shift)
+  int scale_shift;
+};
+
+// Populates the LogScaleConfig with "sane" default values.
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
+
+// Allocates any buffers.
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6657c6f3205903c178b2aa4314551f5b4fee1101
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+
+#include <cmath>
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+namespace {
+
+FrontendState g_micro_features_state;
+bool g_is_first_time = true;
+
+}  // namespace
+
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter) {
+  FrontendConfig config;
+  config.window.size_ms = kFeatureSliceDurationMs;
+  config.window.step_size_ms = kFeatureSliceStrideMs;
+  config.noise_reduction.smoothing_bits = 10;
+  config.filterbank.num_channels = kFeatureSliceSize;
+  config.filterbank.lower_band_limit = 125.0;
+  config.filterbank.upper_band_limit = 7500.0;
+  config.noise_reduction.smoothing_bits = 10;
+  config.noise_reduction.even_smoothing = 0.025;
+  config.noise_reduction.odd_smoothing = 0.06;
+  config.noise_reduction.min_signal_remaining = 0.05;
+  config.pcan_gain_control.enable_pcan = 1;
+  config.pcan_gain_control.strength = 0.95;
+  config.pcan_gain_control.offset = 80.0;
+  config.pcan_gain_control.gain_bits = 21;
+  config.log_scale.enable_log = 1;
+  config.log_scale.scale_shift = 6;
+  if (!FrontendPopulateState(error_reporter, &config, &g_micro_features_state,
+                             kAudioSampleFrequency)) {
+    error_reporter->Report("FrontendPopulateState() failed");
+    return kTfLiteError;
+  }
+  g_is_first_time = true;
+  return kTfLiteOk;
+}
+
+// This is not exposed in any header, and is only used for testing, to ensure
+// that the state is correctly set up before generating results.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
+  for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
+    g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
+  }
+}
+
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read) {
+  const int16_t* frontend_input;
+  if (g_is_first_time) {
+    frontend_input = input;
+    g_is_first_time = false;
+  } else {
+    frontend_input = input + 160;
+  }
+  FrontendOutput frontend_output = FrontendProcessSamples(
+      &g_micro_features_state, frontend_input, input_size, num_samples_read);
+
+  for (int i = 0; i < frontend_output.size; ++i) {
+    // These scaling values are derived from those used in input_data.py in the
+    // training pipeline.
+    constexpr int32_t value_scale = (10 * 255);
+    constexpr int32_t value_div = (256 * 26);
+    int32_t value =
+        ((frontend_output.values[i] * value_scale) + (value_div / 2)) /
+        value_div;
+    if (value < 0) {
+      value = 0;
+    }
+    if (value > 255) {
+      value = 255;
+    }
+    output[i] = value;
+  }
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..46fa55d62ff7a8032cb94e512d4e856fb5960276
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Sets up any resources needed for the feature generation pipeline.
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter);
+
+// Converts audio sample data into a more compact form that's appropriate for
+// feeding into a neural network.
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2da7a799ce398ba7faf31d577d79bb96b2072a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+// This is a test-only API, not exposed in any public headers, so declare it.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets);
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+
+  // The micro features pipeline retains state from previous calls to help
+  // estimate the background noise. Unfortunately this makes it harder to
+  // exactly reproduce results in a test environment, so use a known snapshot
+  // of the parameters at the point that the golden feature values were
+  // created.
+  const uint32_t yes_estimate_presets[] = {
+      1062898, 2644477, 1257642, 1864718, 412722, 725703, 395721, 474082,
+      173046,  255856,  158966,  153736,  69181,  199100, 144493, 227740,
+      110573,  164330,  79666,   144650,  122947, 476799, 398553, 497493,
+      322152,  1140005, 566716,  690605,  308902, 347481, 109891, 170457,
+      73901,   100975,  42963,   72325,   34183,  20207,  6640,   9468,
+  };
+  SetMicroFeaturesNoiseEstimates(yes_estimate_presets);
+
+  uint8_t yes_calculated_data[g_yes_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus yes_status = GenerateMicroFeatures(
+      error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
+      g_yes_feature_data_slice_size, yes_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+
+  for (int i = 0; i < g_yes_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_feature_data_slice[i],
+                            yes_calculated_data[i]);
+    if (g_yes_feature_data_slice[i] != yes_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_yes_feature_data_slice[i],
+                             yes_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+  // As we did for the previous features, set known good noise state
+  // parameters.
+  const uint32_t no_estimate_presets[] = {
+      2563964, 1909393, 559801, 538670, 203643, 175959, 75088, 139491,
+      59691,   95307,   43865,  129263, 52517,  80058,  51330, 100731,
+      76674,   76262,   15497,  22598,  13778,  21460,  8946,  17806,
+      10023,   18810,   8002,   10842,  7578,   9983,   6267,  10759,
+      8946,    18488,   9691,   39785,  9939,   17835,  9671,  18512,
+  };
+  SetMicroFeaturesNoiseEstimates(no_estimate_presets);
+
+  uint8_t no_calculated_data[g_no_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus no_status = GenerateMicroFeatures(
+      error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
+      g_no_feature_data_slice_size, no_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
+
+  for (int i = 0; i < g_no_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_feature_data_slice[i], no_calculated_data[i]);
+    if (g_no_feature_data_slice[i] != no_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_no_feature_data_slice[i], no_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
index b9b8fb37b19d384fe92edf8ce2292aee19b99b7f..09f65ca24b3cd03485a5a79599dc0143ca83329c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 const char* kCategoryLabels[kCategoryCount] = {
     "silence",
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..b74a4d01ca49d37d62daf3710c878cfc6d9940f0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+
+// Keeping these as constant expressions allow us to allocate fixed-sized arrays
+// on the stack for our working memory.
+
+// The size of the input time series data we pass to the FFT to produce the
+// frequency information. This has to be a power of two, and since we're dealing
+// with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
+constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
+
+// All of these values are derived from the values used during model training,
+// if you change your model you'll need to update these constants.
+constexpr int kFeatureSliceSize = 40;
+constexpr int kFeatureSliceCount = 49;
+constexpr int kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount);
+constexpr int kFeatureSliceStrideMs = 20;
+constexpr int kFeatureSliceDurationMs = 30;
+
+constexpr int kCategoryCount = 4;
+constexpr int kSilenceIndex = 0;
+constexpr int kUnknownIndex = 1;
+extern const char* kCategoryLabels[kCategoryCount];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1dbb606e184e70e0fa97d417bcbab6010b8a88a5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
@@ -0,0 +1,24 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
+
+const uint8_t g_no_feature_data_slice[g_no_feature_data_slice_size] = {
+    216, 195, 223, 211, 238, 223, 243, 215, 226, 204, 232, 211, 232, 213,
+    240, 218, 235, 214, 238, 205, 207, 173, 149, 201, 215, 200, 230, 213,
+    208, 195, 175, 151, 195, 175, 182, 163, 235, 217, 218, 190,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..72ea2bf6a23e83bff5dea771931e585d74c757ec
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_features_data.cc and consists of the 29th spectrogram slice of 43 values.
+// This is the expected result of running the sample data in
+// no_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_no_feature_data_slice_size = 40;
+extern const uint8_t g_no_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..865209b01df7e8f77139bcd5b6a37537a6f674f4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
+ * --output_c_file="/tmp/no_micro_features_data.cc" \
+ */
+
+const int g_no_micro_f9643d42_nohash_4_width = 40;
+const int g_no_micro_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_micro_f9643d42_nohash_4_data[] = {
+    230, 205, 191, 203, 202, 181, 180, 194, 205, 187, 183, 197, 203, 198, 196,
+    186, 202, 159, 151, 126, 110, 138, 141, 142, 137, 148, 133, 120, 110, 126,
+    117, 110, 117, 116, 137, 134, 95,  116, 123, 110, 184, 144, 183, 189, 197,
+    172, 188, 164, 194, 179, 175, 174, 182, 173, 184, 174, 200, 145, 154, 148,
+    147, 135, 143, 122, 127, 138, 116, 99,  122, 105, 110, 125, 127, 133, 131,
+    123, 116, 119, 127, 114, 193, 176, 185, 170, 175, 146, 166, 167, 185, 185,
+    185, 183, 195, 185, 176, 178, 197, 155, 137, 144, 164, 132, 153, 132, 138,
+    137, 134, 95,  120, 116, 131, 122, 99,  120, 120, 110, 116, 110, 126, 127,
+    128, 159, 187, 119, 178, 187, 197, 167, 199, 184, 180, 165, 194, 176, 144,
+    134, 187, 136, 142, 134, 145, 132, 145, 105, 119, 123, 125, 116, 125, 102,
+    129, 138, 130, 99,  99,  90,  120, 123, 134, 95,  194, 172, 187, 123, 191,
+    179, 195, 182, 201, 137, 167, 142, 185, 161, 187, 146, 167, 152, 154, 107,
+    152, 112, 134, 144, 117, 116, 105, 85,  105, 105, 99,  90,  123, 112, 112,
+    68,  107, 105, 117, 99,  116, 143, 139, 90,  154, 142, 188, 172, 178, 135,
+    175, 149, 177, 110, 173, 160, 169, 162, 173, 119, 132, 110, 85,  85,  117,
+    129, 117, 112, 117, 51,  112, 95,  139, 102, 105, 90,  128, 119, 112, 99,
+    170, 168, 195, 152, 174, 173, 180, 0,   157, 130, 169, 149, 149, 123, 170,
+    130, 170, 133, 159, 102, 134, 90,  85,  105, 126, 119, 130, 90,  78,  68,
+    127, 120, 95,  51,  122, 110, 112, 78,  116, 95,  180, 135, 179, 146, 179,
+    162, 197, 153, 172, 135, 154, 0,   149, 95,  145, 114, 166, 0,   114, 110,
+    145, 107, 114, 90,  136, 68,  95,  95,  95,  85,  116, 99,  116, 0,   95,
+    68,  102, 51,  102, 78,  185, 157, 138, 158, 180, 117, 173, 142, 145, 117,
+    169, 130, 159, 99,  138, 123, 169, 90,  78,  0,   123, 85,  107, 51,  114,
+    102, 95,  0,   116, 85,  119, 95,  95,  68,  85,  51,  116, 68,  102, 78,
+    167, 105, 164, 163, 178, 126, 164, 154, 154, 51,  177, 120, 156, 85,  134,
+    139, 168, 90,  161, 102, 114, 116, 122, 95,  112, 102, 107, 51,  114, 85,
+    119, 78,  114, 90,  102, 51,  102, 51,  114, 99,  177, 68,  152, 102, 184,
+    166, 179, 129, 177, 129, 180, 110, 158, 105, 139, 0,   145, 85,  148, 102,
+    117, 102, 116, 0,   78,  68,  90,  51,  107, 85,  78,  0,   51,  0,   51,
+    0,   95,  51,  107, 68,  180, 117, 90,  0,   138, 0,   187, 146, 119, 140,
+    164, 90,  136, 0,   131, 51,  159, 99,  141, 138, 116, 51,  90,  51,  90,
+    68,  105, 0,   85,  78,  112, 51,  122, 95,  128, 68,  85,  0,   112, 68,
+    147, 126, 178, 146, 171, 130, 190, 147, 188, 123, 170, 78,  132, 0,   130,
+    125, 159, 95,  102, 0,   110, 0,   95,  85,  120, 68,  78,  51,  99,  51,
+    105, 0,   112, 102, 105, 68,  90,  51,  90,  0,   127, 95,  166, 175, 187,
+    133, 135, 0,   171, 139, 132, 128, 140, 51,  126, 107, 161, 0,   95,  51,
+    119, 0,   114, 0,   95,  110, 116, 51,  112, 0,   90,  0,   116, 51,  68,
+    0,   105, 68,  105, 0,   164, 78,  173, 0,   194, 166, 145, 114, 116, 51,
+    107, 122, 151, 0,   156, 102, 148, 51,  122, 95,  129, 0,   85,  0,   127,
+    78,  90,  0,   78,  0,   95,  0,   110, 0,   68,  119, 120, 68,  68,  0,
+    122, 99,  147, 127, 200, 167, 85,  114, 161, 85,  161, 125, 143, 99,  156,
+    85,  147, 68,  99,  0,   107, 102, 132, 51,  112, 68,  95,  78,  99,  0,
+    68,  0,   51,  0,   90,  78,  128, 51,  95,  0,   166, 136, 174, 138, 189,
+    144, 130, 129, 138, 134, 132, 120, 134, 0,   51,  78,  147, 51,  51,  0,
+    51,  0,   78,  0,   68,  68,  95,  78,  90,  0,   0,   0,   68,  0,   90,
+    68,  110, 0,   95,  51,  165, 151, 157, 0,   0,   0,   112, 0,   112, 95,
+    149, 107, 119, 68,  126, 68,  138, 0,   78,  0,   78,  0,   99,  51,  112,
+    0,   102, 0,   78,  51,  85,  0,   0,   0,   78,  0,   95,  0,   95,  78,
+    105, 0,   152, 0,   0,   51,  132, 105, 159, 0,   129, 102, 114, 0,   138,
+    51,  123, 0,   129, 78,  119, 51,  51,  51,  105, 0,   78,  85,  95,  0,
+    85,  0,   0,   0,   85,  0,   78,  0,   0,   0,   172, 142, 141, 0,   137,
+    0,   148, 128, 157, 120, 146, 120, 120, 0,   95,  78,  141, 68,  68,  0,
+    68,  0,   90,  0,   85,  0,   107, 0,   78,  0,   85,  51,  102, 0,   68,
+    78,  68,  0,   51,  0,   125, 0,   141, 51,  102, 138, 175, 51,  120, 51,
+    173, 85,  116, 141, 164, 68,  150, 123, 133, 51,  114, 0,   117, 68,  150,
+    51,  116, 68,  78,  0,   68,  0,   68,  0,   85,  0,   78,  0,   51,  78,
+    155, 90,  161, 0,   132, 99,  123, 78,  107, 0,   134, 90,  95,  0,   78,
+    0,   162, 143, 85,  0,   107, 78,  125, 90,  90,  51,  51,  0,   85,  0,
+    0,   0,   132, 102, 102, 154, 128, 0,   99,  68,  162, 102, 151, 0,   99,
+    51,  147, 141, 156, 0,   112, 120, 158, 127, 145, 139, 187, 171, 135, 138,
+    146, 0,   95,  68,  127, 0,   85,  0,   105, 0,   0,   0,   187, 170, 162,
+    188, 165, 51,  51,  78,  243, 215, 225, 196, 205, 181, 205, 168, 176, 134,
+    157, 110, 126, 114, 133, 139, 193, 163, 159, 116, 160, 126, 122, 127, 171,
+    99,  114, 68,  123, 85,  90,  0,   157, 146, 166, 179, 136, 0,   116, 90,
+    242, 219, 240, 204, 216, 164, 188, 171, 176, 164, 154, 158, 190, 157, 190,
+    141, 182, 177, 169, 128, 172, 145, 105, 129, 157, 90,  78,  51,  119, 68,
+    137, 68,  116, 78,  141, 132, 151, 122, 156, 140, 234, 206, 229, 201, 216,
+    174, 191, 144, 162, 85,  122, 157, 194, 167, 204, 149, 180, 166, 166, 139,
+    122, 133, 156, 126, 145, 85,  128, 0,   99,  51,  145, 0,   126, 51,  166,
+    162, 166, 162, 177, 157, 228, 198, 221, 197, 214, 177, 173, 166, 173, 139,
+    185, 191, 202, 163, 205, 172, 206, 189, 135, 68,  166, 134, 149, 134, 135,
+    90,  127, 107, 175, 90,  136, 117, 135, 140, 172, 167, 166, 149, 177, 152,
+    221, 191, 215, 194, 211, 0,   156, 147, 182, 178, 208, 163, 190, 157, 208,
+    200, 195, 164, 179, 154, 181, 150, 143, 99,  132, 137, 185, 143, 163, 85,
+    51,  107, 132, 134, 164, 127, 167, 159, 175, 141, 216, 195, 223, 211, 238,
+    223, 243, 215, 226, 204, 232, 211, 232, 213, 240, 218, 235, 214, 238, 205,
+    207, 173, 149, 201, 215, 200, 230, 213, 208, 195, 175, 151, 195, 175, 182,
+    163, 235, 217, 218, 190, 211, 191, 215, 191, 217, 220, 241, 215, 229, 206,
+    236, 210, 227, 216, 236, 188, 183, 149, 202, 189, 208, 172, 191, 201, 220,
+    193, 221, 207, 216, 208, 201, 131, 170, 187, 229, 197, 211, 194, 226, 201,
+    205, 184, 206, 177, 221, 210, 226, 184, 204, 197, 218, 198, 212, 209, 213,
+    141, 172, 110, 175, 167, 180, 156, 213, 188, 192, 179, 213, 205, 204, 174,
+    200, 147, 162, 181, 203, 167, 198, 187, 210, 164, 196, 169, 189, 168, 224,
+    198, 213, 204, 198, 195, 230, 211, 221, 197, 208, 0,   0,   0,   85,  90,
+    167, 130, 175, 173, 203, 164, 193, 144, 170, 145, 185, 148, 154, 139, 198,
+    159, 180, 171, 216, 174, 178, 161, 166, 136, 216, 184, 215, 197, 199, 190,
+    228, 195, 208, 51,  117, 0,   0,   0,   0,   0,   140, 51,  135, 154, 188,
+    155, 168, 0,   90,  0,   156, 85,  110, 0,   174, 90,  172, 154, 179, 99,
+    142, 166, 179, 157, 177, 95,  192, 142, 204, 198, 217, 147, 173, 0,   112,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   107, 0,   160, 0,   148, 95,
+    172, 0,   0,   0,   116, 0,   122, 114, 170, 0,   0,   0,   0,   0,   179,
+    110, 196, 85,  205, 183, 169, 0,   99,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   141, 0,   112, 0,   0,   0,   134, 0,   0,   0,   0,
+    0,   0,   0,   139, 0,   0,   0,   0,   112, 186, 78,  163, 0,   169, 128,
+    174, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   95,
+    0,   105, 0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   95,  0,
+    0,   0,   0,   0,   0,   0,   119, 0,   164, 78,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   68,
+    117, 0,   0,   0,   0,   0,   0,   0,   148, 0,   0,   0,   0,   0,   0,
+    0,   0,   0,   116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..178323eeba6669d247edfe9cb675b37fe5c7d526
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+
+extern const int g_no_micro_f9643d42_nohash_4_width;
+extern const int g_no_micro_f9643d42_nohash_4_height;
+extern const unsigned char g_no_micro_f9643d42_nohash_4_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b3aa19cda5d1c8151de5e8bf5aad45df09259a0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+
+#include <string.h>
+
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t smoothing =
+        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
+    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
+
+    // Update the estimate of the noise.
+    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
+    uint32_t estimate =
+        ((static_cast<uint64_t>(signal_scaled_up) * smoothing) +
+         (static_cast<uint64_t>(state->estimate[i]) * one_minus_smoothing)) >>
+        kNoiseReductionBits;
+    state->estimate[i] = estimate;
+
+    // Make sure that we can't get a negative value for the signal - estimate.
+    if (estimate > signal_scaled_up) {
+      estimate = signal_scaled_up;
+    }
+
+    const uint32_t floor =
+        (static_cast<uint64_t>(signal[i]) * state->min_signal_remaining) >>
+        kNoiseReductionBits;
+    const uint32_t subtracted =
+        (signal_scaled_up - estimate) >> state->smoothing_bits;
+    const uint32_t output = subtracted > floor ? subtracted : floor;
+    signal[i] = output;
+  }
+}
+
+void NoiseReductionReset(struct NoiseReductionState* state) {
+  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..699144345d5751f27f7adcafec551180f82725d1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+
+#define kNoiseReductionBits 14
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct NoiseReductionState {
+  int smoothing_bits;
+  uint16_t even_smoothing;
+  uint16_t odd_smoothing;
+  uint16_t min_signal_remaining;
+  int num_channels;
+  uint32_t estimate[kFeatureSliceSize];
+};
+
+// Removes stationary noise from each channel of the signal using a low pass
+// filter.
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal);
+
+void NoiseReductionReset(struct NoiseReductionState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de7181d710bc00938e411869bf071b91e22f2044
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+
+// Test noise reduction using default config values.
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
+    config_.smoothing_bits = 10;
+    config_.even_smoothing = 0.025;
+    config_.odd_smoothing = 0.06;
+    config_.min_signal_remaining = 0.05;
+  }
+
+  struct NoiseReductionConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {6321887, 31248341};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {241137, 478104};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42a5c2136f2a85b0ddd7e3a620bb879d13eeb258
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
+  config->smoothing_bits = 10;
+  config->even_smoothing = 0.025;
+  config->odd_smoothing = 0.06;
+  config->min_signal_remaining = 0.05;
+}
+
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels) {
+  state->smoothing_bits = config->smoothing_bits;
+  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
+  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
+  state->min_signal_remaining =
+      config->min_signal_remaining * (1 << kNoiseReductionBits);
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->estimate, (state->num_channels * sizeof(*state->estimate)));
+  for (int i = 0; i < state->num_channels; ++i) {
+    state->estimate[i] = 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..60f9de5067c606158bc0e29771d1e83a495cd4c1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct NoiseReductionConfig {
+  // scale the signal up by 2^(smoothing_bits) before reduction
+  int smoothing_bits;
+  // smoothing coefficient for even-numbered channels
+  float even_smoothing;
+  // smoothing coefficient for odd-numbered channels
+  float odd_smoothing;
+  // fraction of signal to preserve (1.0 disables this module)
+  float min_signal_remaining;
+};
+
+// Populates the NoiseReductionConfig with "sane" default values.
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
+
+// Prepares any buffers.
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50656758d722844b8aeb6a32c04d3df36f0e5242
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) {
+  if (x <= 2) {
+    return lut[x];
+  }
+
+  const int16_t interval = MostSignificantBit32(x);
+  lut += 4 * interval - 6;
+
+  const int16_t frac =
+      ((interval < 11) ? (x << (11 - interval)) : (x >> (interval - 11))) &
+      0x3FF;
+
+  int32_t result = (static_cast<int32_t>(lut[2]) * frac) >> 5;
+  result += (static_cast<int32_t>(lut[1])) << 5;
+  result *= frac;
+  result = (result + (1 << 14)) >> 15;
+  result += lut[0];
+  return static_cast<int16_t>(result);
+}
+
+uint32_t PcanShrink(const uint32_t x) {
+  if (x < (2 << kPcanSnrBits)) {
+    return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits);
+  } else {
+    return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits);
+  }
+}
+
+void PcanGainControlApply(struct PcanGainControlState* state,
+                          uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t gain =
+        WideDynamicFunction(state->noise_estimate[i], state->gain_lut);
+    const uint32_t snr =
+        (static_cast<uint64_t>(signal[i]) * gain) >> state->snr_shift;
+    signal[i] = PcanShrink(snr);
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d6fc990385cd74ccba8510765eb7ad8da4eeca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define kPcanSnrBits 12
+#define kPcanOutputBits 6
+
+#define kWideDynamicFunctionBits 32
+#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
+
+struct PcanGainControlState {
+  int enable_pcan;
+  uint32_t* noise_estimate;
+  int num_channels;
+  int16_t gain_lut[kWideDynamicFunctionLUTSize];
+  int32_t snr_shift;
+};
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut);
+
+uint32_t PcanShrink(const uint32_t x);
+
+void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7dee66746f381ea50127e416fe90f063353eca89
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+const int kSmoothingBits = 10;
+const int kCorrectionBits = -1;
+
+// Test pcan auto gain control using default config values.
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
+    config_.enable_pcan = 1;
+    config_.strength = 0.95;
+    config_.offset = 80.0;
+    config_.gain_bits = 21;
+  }
+
+  struct PcanGainControlConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
+  struct PcanGainControlState state;
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      error_reporter, &config.config_, &state, estimate, kNumChannels,
+      kSmoothingBits, kCorrectionBits));
+
+  uint32_t signal[] = {241137, 478104};
+  PcanGainControlApply(&state, signal);
+
+  const uint32_t expected[] = {3578, 1533};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7867ac6284d519ba6dd35f601bc3cb40e2f95fe
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kint16max 0x00007FFF
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config) {
+  config->enable_pcan = 0;
+  config->strength = 0.95;
+  config->offset = 80.0;
+  config->gain_bits = 21;
+}
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x) {
+  const float x_as_float =
+      (static_cast<float>(x)) / (static_cast<uint32_t>(1) << input_bits);
+  const float gain_as_float =
+      (static_cast<uint32_t>(1) << config->gain_bits) *
+      powf(x_as_float + config->offset, -config->strength);
+
+  if (gain_as_float > kint16max) {
+    return kint16max;
+  }
+  return static_cast<int16_t>(gain_as_float + 0.5f);
+}
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits) {
+  state->enable_pcan = config->enable_pcan;
+  if (!state->enable_pcan) {
+    return 1;
+  }
+  state->noise_estimate = noise_estimate;
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->gain_lut, (kWideDynamicFunctionLUTSize * sizeof(int16_t)));
+  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
+
+  const int32_t input_bits = smoothing_bits - input_correction_bits;
+  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
+  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
+  int16_t* temp_gain_lut = state->gain_lut - 6;
+  int interval;
+  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+    const uint32_t x0 = static_cast<uint32_t>(1) << (interval - 1);
+    const uint32_t x1 = x0 + (x0 >> 1);
+    const uint32_t x2 =
+        (interval == kWideDynamicFunctionBits) ? x0 + (x0 - 1) : 2 * x0;
+
+    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
+    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
+    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
+
+    const int32_t diff1 = static_cast<int32_t>(y1) - y0;
+    const int32_t diff2 = static_cast<int32_t>(y2) - y0;
+    const int32_t a1 = 4 * diff1 - diff2;
+    const int32_t a2 = diff2 - a1;
+
+    temp_gain_lut[4 * interval] = y0;
+    temp_gain_lut[4 * interval + 1] = static_cast<int16_t>(a1);
+    temp_gain_lut[4 * interval + 2] = static_cast<int16_t>(a2);
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc1de7bb25db509f8271d12f053e61554d07680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct PcanGainControlConfig {
+  // set to false (0) to disable this module
+  int enable_pcan;
+  // gain normalization exponent (0.0 disables, 1.0 full strength)
+  float strength;
+  // positive value added in the normalization denominator
+  float offset;
+  // number of fractional bits in the gain
+  int gain_bits;
+};
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config);
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x);
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2af862de7590323819c99de3a6702d1bd046681
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+
+// Checks to ensure that the C-style array passed in has a compile-time size of
+// at least the number of bytes requested. This doesn't work with raw pointers
+// since sizeof() doesn't know their actual length, so only use this to check
+// statically-allocated arrays with known sizes.
+#define STATIC_ALLOC_ENSURE_ARRAY_SIZE(A, N)                                 \
+  do {                                                                       \
+    if (sizeof(A) < (N)) {                                                   \
+      error_reporter->Report(#A " too small (%d bytes, wanted %d) at %s:%d", \
+                             sizeof(A), (N), __FILE__, __LINE__);            \
+      return 0;                                                              \
+    }                                                                        \
+  } while (0)
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..57a32c3595da7ae17c2328bb4c98fb005fd253ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
@@ -0,0 +1,1541 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Automatically created from a TensorFlow Lite flatbuffer using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+// See the README for a full description of the creation process.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+
+const unsigned char g_tiny_conv_micro_features_model_data[] = {
+    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xd0, 0x46, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
+    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0x94, 0x41, 0x00, 0x00,
+    0x74, 0x41, 0x00, 0x00, 0x44, 0x41, 0x00, 0x00, 0xb4, 0x3e, 0x00, 0x00,
+    0xac, 0x3e, 0x00, 0x00, 0xa4, 0x3e, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf0, 0xb9, 0xff, 0xff,
+    0xf4, 0xb9, 0xff, 0xff, 0x52, 0xba, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x3e, 0x00, 0x00, 0x68, 0x95, 0x91, 0x7d, 0x9b, 0x85, 0x85, 0x81,
+    0x77, 0x85, 0x99, 0x89, 0x7e, 0x8a, 0x85, 0x92, 0xa5, 0x7e, 0x93, 0x97,
+    0x97, 0x91, 0xa3, 0x97, 0x88, 0x8b, 0xa6, 0x71, 0x77, 0x85, 0x95, 0x86,
+    0x6b, 0x93, 0xcb, 0x96, 0x7a, 0x9a, 0x7f, 0x85, 0x7a, 0x8e, 0xac, 0x98,
+    0x6d, 0x9d, 0x9b, 0x70, 0x9a, 0x90, 0xba, 0x99, 0x7b, 0x93, 0x6e, 0x68,
+    0x75, 0x86, 0xc4, 0x8b, 0x66, 0x5d, 0x96, 0x7f, 0x92, 0x91, 0xb6, 0x7b,
+    0x96, 0x95, 0x9a, 0x77, 0x9a, 0x96, 0xce, 0x80, 0x88, 0x65, 0x8e, 0x80,
+    0x88, 0x85, 0xb7, 0x9c, 0x7b, 0x93, 0x9d, 0x95, 0x83, 0x92, 0xd0, 0x7e,
+    0x68, 0x88, 0x6c, 0x78, 0x98, 0x81, 0xac, 0x95, 0x9e, 0x98, 0xa2, 0x99,
+    0x8d, 0x7d, 0xb8, 0x81, 0x6e, 0x68, 0xa1, 0x81, 0x9d, 0x99, 0xb4, 0x7d,
+    0x92, 0x86, 0x9d, 0x93, 0xa3, 0xb0, 0xd6, 0x79, 0x93, 0x76, 0x8d, 0x84,
+    0x91, 0x9d, 0xbe, 0x94, 0xb0, 0x70, 0x84, 0x80, 0x85, 0x99, 0x9e, 0xa2,
+    0x86, 0x8a, 0x7a, 0x76, 0x91, 0x8d, 0xa6, 0x76, 0x8d, 0x82, 0x98, 0x8c,
+    0x92, 0x8f, 0x8c, 0xb3, 0x78, 0x75, 0xa5, 0x88, 0x73, 0x8c, 0x91, 0x7c,
+    0x82, 0x7d, 0x93, 0x9e, 0x8b, 0x97, 0x7c, 0x90, 0x84, 0x95, 0x7e, 0x9e,
+    0xa4, 0x52, 0x8a, 0xb4, 0x97, 0x65, 0x7d, 0xb6, 0x83, 0x7d, 0x99, 0x80,
+    0x97, 0x85, 0x96, 0x5f, 0x8e, 0x87, 0x95, 0x6d, 0x76, 0x84, 0x97, 0x8c,
+    0x66, 0x97, 0xae, 0x6b, 0x93, 0xb3, 0xa8, 0x8b, 0xa1, 0x79, 0xa3, 0x94,
+    0x7e, 0xa8, 0x8d, 0xad, 0x78, 0x82, 0xa2, 0x7b, 0x90, 0xa4, 0x7d, 0xb3,
+    0xa0, 0x7b, 0x94, 0x85, 0x9a, 0x8d, 0x76, 0x82, 0x65, 0x73, 0xab, 0xa4,
+    0xaa, 0x74, 0x93, 0x9c, 0x83, 0x66, 0xbf, 0x7a, 0xaa, 0x81, 0x92, 0x89,
+    0x7e, 0x88, 0xa6, 0x66, 0xaf, 0x92, 0x9f, 0x97, 0x6c, 0x89, 0x9c, 0x74,
+    0x7e, 0x82, 0x8e, 0x88, 0xb2, 0x85, 0xba, 0x96, 0x90, 0x78, 0x8d, 0xa7,
+    0x9e, 0x87, 0xbc, 0x7f, 0xb2, 0x8b, 0x77, 0x9b, 0xab, 0x8f, 0xa4, 0x7d,
+    0x6f, 0x77, 0x8c, 0x98, 0x6f, 0x89, 0xb1, 0x9f, 0xa7, 0x94, 0x7d, 0xae,
+    0x88, 0x8a, 0xa9, 0x75, 0x7d, 0x7c, 0x88, 0x99, 0x90, 0x9d, 0x97, 0xa7,
+    0x8d, 0x7f, 0x73, 0xa1, 0xa3, 0x87, 0xa9, 0x92, 0x98, 0x7e, 0x9c, 0x88,
+    0x73, 0x6b, 0x78, 0x8e, 0x7d, 0x86, 0x6c, 0x7c, 0x92, 0x40, 0x86, 0xa7,
+    0x65, 0x93, 0x67, 0x91, 0x67, 0x71, 0x6c, 0xa8, 0x81, 0x70, 0x8e, 0xa8,
+    0x7b, 0x63, 0x89, 0x76, 0x69, 0x90, 0x73, 0x5e, 0x92, 0x78, 0x7e, 0x9d,
+    0x87, 0x86, 0x89, 0x64, 0x66, 0xa9, 0x92, 0x8d, 0x72, 0x7c, 0x63, 0x7f,
+    0x94, 0x5c, 0x92, 0x89, 0x87, 0x9d, 0x8b, 0x75, 0x93, 0x8c, 0x94, 0x68,
+    0x97, 0x87, 0x78, 0x7d, 0x7f, 0x84, 0x84, 0x77, 0x6b, 0x8e, 0x83, 0xab,
+    0x7e, 0x62, 0x90, 0x83, 0x8e, 0x71, 0x7e, 0x9b, 0x96, 0x6d, 0x83, 0x6a,
+    0x76, 0x68, 0x71, 0x90, 0x98, 0x90, 0x9b, 0x68, 0x89, 0x89, 0x95, 0x85,
+    0x6e, 0x75, 0x8e, 0x95, 0x83, 0x7a, 0x91, 0x7f, 0x8b, 0x71, 0x90, 0x7d,
+    0xad, 0x91, 0x6f, 0x74, 0x98, 0x8a, 0xb0, 0xa8, 0x80, 0xa3, 0x8e, 0x7c,
+    0xa5, 0x67, 0xa4, 0x66, 0xa9, 0x7b, 0x85, 0x9d, 0x88, 0xab, 0x7d, 0x81,
+    0x6e, 0x7f, 0x8f, 0x97, 0x97, 0x84, 0x89, 0x74, 0x9d, 0x5f, 0x9c, 0x88,
+    0x6f, 0x74, 0x96, 0x9e, 0x7e, 0x7e, 0xa4, 0x85, 0x94, 0x91, 0xaf, 0x99,
+    0x7a, 0xaa, 0x8c, 0x92, 0x85, 0x9d, 0x6c, 0x79, 0x57, 0x7a, 0x80, 0x84,
+    0x79, 0x79, 0x74, 0xa5, 0x55, 0xab, 0x73, 0x8c, 0x72, 0x9d, 0x72, 0xa9,
+    0x90, 0x73, 0x8f, 0xa0, 0x89, 0x6d, 0x68, 0x66, 0x61, 0x6f, 0x57, 0x7d,
+    0x66, 0x8c, 0x65, 0x87, 0x62, 0x76, 0x83, 0x77, 0x89, 0xa4, 0x73, 0x89,
+    0x7f, 0x70, 0x79, 0x6b, 0x86, 0x6f, 0x8d, 0x96, 0x65, 0x89, 0x66, 0x53,
+    0x73, 0xae, 0x6a, 0x72, 0x88, 0x97, 0x7a, 0x7f, 0x5d, 0xa1, 0x86, 0x88,
+    0x5f, 0x9f, 0x9b, 0x8a, 0x74, 0x9a, 0x7a, 0x7e, 0x8b, 0x71, 0x58, 0x74,
+    0x8f, 0x9b, 0x9b, 0x8d, 0x6b, 0x83, 0x60, 0x7f, 0x75, 0x91, 0x79, 0x93,
+    0x7a, 0x92, 0x8c, 0x7e, 0x7a, 0x95, 0x84, 0x69, 0x8f, 0x8c, 0x7c, 0x6e,
+    0x8b, 0x87, 0x82, 0x62, 0xa6, 0x97, 0x91, 0x65, 0xa2, 0xa4, 0x9b, 0x8b,
+    0x85, 0xa4, 0x84, 0x7b, 0x67, 0x93, 0x96, 0x84, 0x85, 0x75, 0x6d, 0x9e,
+    0x80, 0x80, 0x73, 0x8c, 0x81, 0x70, 0x8a, 0x68, 0x9c, 0x8e, 0x63, 0x91,
+    0x89, 0x79, 0x8d, 0x79, 0xa4, 0x9a, 0x96, 0xa0, 0x83, 0x63, 0x88, 0x8f,
+    0x76, 0xb4, 0xa8, 0x8e, 0x68, 0x8d, 0x8e, 0x95, 0x78, 0xae, 0x5d, 0x89,
+    0x66, 0x7e, 0x7b, 0x8a, 0x75, 0x86, 0x71, 0x97, 0x6d, 0xb3, 0x67, 0x76,
+    0x82, 0x7d, 0x70, 0x79, 0x8a, 0x9c, 0x82, 0xa7, 0x82, 0xab, 0x58, 0x86,
+    0x5c, 0x70, 0x8c, 0x71, 0x61, 0xa6, 0x74, 0xa8, 0x65, 0x78, 0x72, 0x9d,
+    0x6c, 0x92, 0x70, 0x88, 0x88, 0x79, 0x96, 0x6f, 0x68, 0xa4, 0x7a, 0x7b,
+    0x96, 0xac, 0x6d, 0x76, 0x6a, 0xab, 0x82, 0x7d, 0x71, 0x8d, 0x6b, 0x81,
+    0x6c, 0x9d, 0x71, 0x59, 0x5c, 0x71, 0x77, 0x6d, 0x6a, 0x96, 0x76, 0x69,
+    0x80, 0x83, 0x88, 0x70, 0x97, 0xb4, 0x8a, 0x6c, 0xa5, 0x6e, 0x64, 0x75,
+    0x73, 0xa2, 0x7f, 0x97, 0x9e, 0x75, 0x8f, 0x86, 0x68, 0xbb, 0x6b, 0x86,
+    0x8d, 0x80, 0x8e, 0x58, 0x6d, 0xb2, 0x76, 0x99, 0x8f, 0x70, 0x6c, 0x86,
+    0x78, 0x9e, 0x91, 0x90, 0xa2, 0x7c, 0x8c, 0x81, 0x80, 0xb4, 0x77, 0x7a,
+    0x8c, 0x5f, 0x85, 0x56, 0x7a, 0x93, 0x6b, 0x5c, 0x74, 0x59, 0x7e, 0x86,
+    0x8c, 0xae, 0x76, 0x7d, 0x76, 0x7e, 0x81, 0x5f, 0x81, 0x8e, 0x7b, 0x90,
+    0xaa, 0x99, 0x79, 0x89, 0x93, 0xbc, 0x86, 0x91, 0xa2, 0x88, 0x79, 0x82,
+    0x80, 0xb6, 0x4a, 0x93, 0x7b, 0x89, 0x75, 0x8d, 0x7a, 0x8d, 0x66, 0x7c,
+    0x81, 0x9f, 0x6e, 0x86, 0x4d, 0x82, 0x66, 0x88, 0x73, 0x89, 0x7d, 0xac,
+    0x89, 0x9f, 0x58, 0x7f, 0x6b, 0x8c, 0x6a, 0x82, 0x59, 0xb8, 0x83, 0x67,
+    0x8b, 0x8a, 0x84, 0x7b, 0x7f, 0xb5, 0x44, 0x57, 0x5a, 0x73, 0x8b, 0x6d,
+    0x7c, 0x9e, 0x71, 0x72, 0x8d, 0x93, 0x80, 0x60, 0x7f, 0xc5, 0x69, 0x5c,
+    0x67, 0x92, 0x6c, 0x75, 0x66, 0x8f, 0x91, 0x5a, 0x6c, 0x70, 0x90, 0x84,
+    0x88, 0xab, 0x90, 0x66, 0x9c, 0x64, 0x6e, 0x68, 0x92, 0x9e, 0x89, 0x8d,
+    0x82, 0x97, 0x77, 0x75, 0x7f, 0xa7, 0x91, 0x75, 0x8c, 0x89, 0xa4, 0x6b,
+    0x98, 0x99, 0x80, 0x7d, 0x6b, 0x7f, 0x7d, 0x88, 0x79, 0xa1, 0x87, 0x90,
+    0x81, 0x8e, 0x94, 0x96, 0x7d, 0xa8, 0x86, 0x84, 0x86, 0x79, 0x97, 0x6e,
+    0xaa, 0x95, 0x8a, 0x9f, 0x8c, 0x72, 0x99, 0x77, 0x81, 0x94, 0x91, 0x9f,
+    0x6e, 0x67, 0x87, 0x70, 0x7d, 0xad, 0x58, 0x7f, 0x6d, 0x96, 0x8e, 0x82,
+    0x7d, 0xa6, 0x77, 0x99, 0x87, 0x95, 0x89, 0x7e, 0xa6, 0x9e, 0x86, 0xac,
+    0x78, 0x9f, 0x9b, 0x85, 0x76, 0x99, 0x6a, 0x92, 0x66, 0x7b, 0x9a, 0x99,
+    0x83, 0x8b, 0x57, 0x65, 0x75, 0x9f, 0xa6, 0x8a, 0x8d, 0x96, 0x6f, 0x80,
+    0x65, 0x8f, 0x80, 0x9f, 0x82, 0x85, 0x55, 0x75, 0x5c, 0x84, 0x91, 0x86,
+    0x76, 0x96, 0x5a, 0x6c, 0x62, 0x7b, 0x92, 0x88, 0x61, 0xca, 0x75, 0x66,
+    0x70, 0x70, 0x8e, 0x7a, 0x75, 0xb2, 0x66, 0x81, 0x5b, 0x79, 0x92, 0x97,
+    0x94, 0xaf, 0x72, 0x8a, 0x9b, 0x5f, 0x65, 0x96, 0x81, 0xb6, 0x8a, 0x6f,
+    0x94, 0x7a, 0x96, 0x92, 0x79, 0x94, 0x8e, 0x53, 0x9a, 0x73, 0x6a, 0x9d,
+    0xa1, 0xa3, 0xa4, 0x8f, 0x6b, 0xa4, 0x8b, 0x82, 0x96, 0xb1, 0x8c, 0x92,
+    0x7f, 0x91, 0x5f, 0x98, 0x8a, 0xa4, 0x7e, 0x80, 0x97, 0x86, 0x86, 0x86,
+    0x8f, 0xa6, 0x77, 0x9a, 0x82, 0x80, 0x6e, 0x73, 0x83, 0xaf, 0x87, 0x6d,
+    0x77, 0x9a, 0x83, 0x9f, 0x7c, 0xa4, 0x71, 0x6f, 0x7d, 0x75, 0x9d, 0x82,
+    0x83, 0xaf, 0x85, 0x80, 0x8d, 0x7f, 0xa4, 0xa2, 0x88, 0xba, 0x76, 0x76,
+    0x94, 0x6b, 0x76, 0x83, 0x77, 0x96, 0x78, 0x8c, 0xb0, 0x8e, 0x83, 0x87,
+    0xa0, 0xcc, 0x7f, 0xa4, 0x8c, 0x77, 0x84, 0x8c, 0x80, 0xa0, 0x57, 0x76,
+    0x76, 0x71, 0x86, 0x9c, 0x7f, 0x88, 0x57, 0x95, 0x4d, 0x8c, 0x7f, 0x80,
+    0x66, 0x9e, 0x42, 0x8d, 0x6a, 0x8e, 0x8c, 0x80, 0x89, 0x9d, 0x4f, 0x83,
+    0x54, 0x8a, 0x5e, 0x64, 0x70, 0x94, 0x78, 0x90, 0x7d, 0x78, 0x8d, 0x71,
+    0x56, 0x9a, 0x8c, 0x65, 0x8b, 0x62, 0x88, 0x9a, 0x6c, 0x8e, 0x7b, 0x78,
+    0x68, 0x86, 0x64, 0x6b, 0x67, 0xaa, 0x8c, 0x7b, 0x67, 0x75, 0x58, 0x7e,
+    0x6b, 0x97, 0x92, 0x87, 0x9c, 0x79, 0x71, 0x76, 0x7d, 0xbb, 0x89, 0x75,
+    0x83, 0x57, 0x74, 0x98, 0xa1, 0x8f, 0xb0, 0x89, 0x76, 0x88, 0x69, 0x9c,
+    0x74, 0xb0, 0x86, 0x9c, 0x79, 0x6f, 0x84, 0x70, 0x94, 0xa1, 0x6e, 0x7a,
+    0xa3, 0x88, 0xa0, 0x7a, 0x94, 0xa1, 0x82, 0x93, 0x99, 0x95, 0x7f, 0xab,
+    0x97, 0x9d, 0x6e, 0x68, 0x79, 0x73, 0x76, 0x83, 0x76, 0xbd, 0x87, 0x87,
+    0x86, 0x74, 0x8f, 0x6e, 0x65, 0xba, 0x6a, 0x78, 0x91, 0x62, 0x72, 0x67,
+    0x75, 0xbd, 0x8c, 0x5e, 0x85, 0x6d, 0x72, 0x85, 0x7d, 0x96, 0x8f, 0xb9,
+    0x9f, 0x97, 0xa2, 0x8a, 0xa1, 0xc1, 0x8d, 0xbc, 0x85, 0x78, 0x93, 0x97,
+    0x99, 0x9f, 0x3a, 0x98, 0x65, 0x8d, 0x6a, 0x6c, 0x92, 0x85, 0x49, 0x7e,
+    0x6a, 0xaa, 0x8a, 0x94, 0x6b, 0x93, 0x40, 0x8a, 0x8c, 0x9c, 0x6f, 0xad,
+    0x72, 0xb0, 0x58, 0x88, 0x60, 0x8c, 0x86, 0x84, 0x74, 0x96, 0x8f, 0x97,
+    0x5e, 0x6c, 0x79, 0x92, 0x51, 0xa8, 0x92, 0x58, 0x62, 0x6f, 0x6c, 0x76,
+    0x5f, 0x9e, 0x86, 0x71, 0x9c, 0x69, 0x7e, 0x80, 0x8a, 0x97, 0x6f, 0x79,
+    0x8b, 0x6f, 0x6c, 0x88, 0x73, 0x9c, 0x6d, 0x91, 0x77, 0x73, 0x7f, 0x97,
+    0x86, 0xa9, 0xac, 0x71, 0x82, 0x90, 0x83, 0x8a, 0x80, 0x9d, 0xa8, 0x85,
+    0x78, 0x7f, 0x94, 0x99, 0x8e, 0xa3, 0x89, 0x70, 0x87, 0x62, 0x82, 0x87,
+    0x8c, 0x98, 0x7a, 0x88, 0x72, 0x7e, 0x78, 0xa0, 0x78, 0x95, 0x97, 0x8f,
+    0x7b, 0x7c, 0x83, 0x94, 0x93, 0xa7, 0x77, 0x97, 0x90, 0x5e, 0x76, 0x7c,
+    0x68, 0xaa, 0x69, 0x67, 0x76, 0x84, 0x7e, 0x64, 0xa3, 0xbe, 0x7e, 0x8b,
+    0x82, 0x50, 0x8a, 0x82, 0x89, 0xc0, 0x79, 0x78, 0x68, 0x7c, 0x6b, 0x77,
+    0x82, 0x99, 0x7b, 0x83, 0x80, 0x90, 0x96, 0x96, 0x87, 0xb7, 0xa5, 0x94,
+    0x82, 0x99, 0x95, 0x91, 0x7e, 0xa2, 0x49, 0x95, 0x6d, 0x8e, 0xa9, 0x89,
+    0x8e, 0x8f, 0x3d, 0x95, 0x6a, 0x8c, 0x8b, 0x8c, 0x7e, 0x88, 0x63, 0x94,
+    0x69, 0x94, 0x88, 0x92, 0x79, 0xa7, 0x68, 0x60, 0x76, 0x85, 0xa1, 0x6f,
+    0x54, 0x96, 0x63, 0x7a, 0x5c, 0x73, 0x74, 0x6e, 0x53, 0x99, 0x69, 0x76,
+    0x69, 0x57, 0x6a, 0x82, 0x55, 0x93, 0x82, 0x80, 0x65, 0x7f, 0x7b, 0x76,
+    0x72, 0x87, 0x8d, 0x97, 0x98, 0x78, 0x7e, 0x6d, 0x7a, 0x95, 0x78, 0x70,
+    0x90, 0x83, 0x89, 0x80, 0x7f, 0x9d, 0x73, 0x73, 0x84, 0x77, 0x8e, 0x77,
+    0x8e, 0x75, 0x9e, 0xa5, 0x86, 0x68, 0x89, 0x7d, 0x8d, 0x99, 0x79, 0x8f,
+    0x8e, 0x87, 0x87, 0x97, 0x8c, 0x91, 0xa1, 0x96, 0x83, 0x73, 0x87, 0xa9,
+    0x8c, 0xa6, 0x85, 0x8c, 0x96, 0x7d, 0x7f, 0x8e, 0x7e, 0xb0, 0x85, 0x8f,
+    0x7f, 0x7d, 0x95, 0x7d, 0x9c, 0xb3, 0x71, 0x86, 0x81, 0x69, 0x7b, 0x69,
+    0x76, 0xb6, 0x5d, 0x67, 0x8a, 0x68, 0x9c, 0xa6, 0x70, 0xbf, 0x79, 0x60,
+    0x8b, 0x7f, 0x7a, 0x7b, 0x8b, 0xaf, 0x8c, 0xa1, 0x86, 0x92, 0x76, 0x8d,
+    0x89, 0xa2, 0xa8, 0xa3, 0xa0, 0xa2, 0x96, 0x9d, 0x7c, 0x92, 0x3f, 0x9b,
+    0x6d, 0x8a, 0x80, 0x81, 0xa0, 0x92, 0x50, 0x7c, 0x82, 0x99, 0x80, 0xa6,
+    0x8e, 0x8d, 0x4f, 0x8d, 0x65, 0x71, 0x77, 0x81, 0x51, 0xa6, 0x3f, 0x5c,
+    0x63, 0x6f, 0x61, 0x93, 0x5c, 0xaa, 0x77, 0x8f, 0x5d, 0x53, 0x79, 0x74,
+    0x6b, 0x94, 0x86, 0x81, 0x85, 0x48, 0x81, 0x80, 0x6b, 0x85, 0x6c, 0x91,
+    0x92, 0x6a, 0x74, 0x78, 0x72, 0x87, 0x6c, 0x82, 0x88, 0x7b, 0x93, 0x71,
+    0x91, 0x8d, 0x67, 0x83, 0x86, 0x5b, 0x86, 0x79, 0x81, 0x9f, 0x95, 0x8a,
+    0x70, 0x66, 0x9e, 0x6b, 0x72, 0x98, 0x97, 0x95, 0x72, 0x93, 0x84, 0x92,
+    0x8c, 0x96, 0xa2, 0x65, 0x80, 0x75, 0xa2, 0xa7, 0x7d, 0x97, 0x71, 0x8f,
+    0x69, 0x65, 0x8f, 0xae, 0x9c, 0x97, 0x5d, 0xb3, 0x98, 0x83, 0x98, 0xa0,
+    0x5f, 0x7e, 0x7a, 0x7a, 0x87, 0x7c, 0x92, 0xa0, 0x81, 0xa6, 0x71, 0x8e,
+    0x88, 0x52, 0xa3, 0x88, 0x6a, 0x9d, 0x84, 0x82, 0x7c, 0x78, 0x9f, 0x92,
+    0x66, 0xa4, 0x53, 0x6a, 0x7e, 0x84, 0x60, 0x84, 0x92, 0xb0, 0x93, 0x9d,
+    0xa0, 0x5f, 0x95, 0x8c, 0x77, 0xa1, 0x8c, 0x90, 0xa0, 0x9c, 0x9a, 0x95,
+    0x85, 0xa1, 0x22, 0x8f, 0x57, 0x80, 0x96, 0x7d, 0x92, 0x8b, 0x41, 0xa6,
+    0x61, 0xa2, 0x6f, 0x80, 0x5d, 0x91, 0x66, 0xab, 0x6d, 0x7e, 0x88, 0x93,
+    0x5c, 0xa5, 0x75, 0x6e, 0x6c, 0x86, 0x69, 0x73, 0x4e, 0x8e, 0x77, 0x6b,
+    0x6c, 0x60, 0x67, 0x91, 0x75, 0x91, 0x6c, 0x7c, 0x53, 0x6e, 0x75, 0x8e,
+    0x79, 0x8c, 0x8b, 0x74, 0x6b, 0x57, 0x71, 0xa1, 0x7f, 0x83, 0x6c, 0x6b,
+    0x93, 0x99, 0x7a, 0x78, 0x71, 0x8c, 0x78, 0x88, 0x9f, 0x85, 0x77, 0x7b,
+    0x86, 0x85, 0xa1, 0x61, 0x78, 0x65, 0x61, 0x75, 0x82, 0x7d, 0xa9, 0xa2,
+    0x84, 0x82, 0x94, 0x95, 0x90, 0x9f, 0x83, 0x97, 0x76, 0x95, 0x8a, 0x83,
+    0x9b, 0x87, 0x8b, 0x7a, 0x6c, 0x6e, 0x75, 0x95, 0x85, 0x95, 0x84, 0x9e,
+    0x96, 0x74, 0x7d, 0xa5, 0x85, 0x8e, 0x7e, 0x73, 0x85, 0x8d, 0x87, 0x80,
+    0x8a, 0x96, 0x65, 0x87, 0x7c, 0x73, 0x80, 0x96, 0x73, 0x8d, 0x5e, 0x79,
+    0x7e, 0x8d, 0x79, 0x85, 0x63, 0xa0, 0x62, 0x89, 0x9d, 0x8c, 0x74, 0x7b,
+    0x9c, 0xa5, 0x71, 0x8c, 0x83, 0x91, 0x8e, 0x8d, 0x89, 0x8b, 0x8b, 0xa4,
+    0x78, 0x88, 0x9e, 0x85, 0x8b, 0x94, 0x38, 0x84, 0x7b, 0x86, 0x7d, 0xa2,
+    0x73, 0x8f, 0x47, 0x7b, 0x69, 0xb4, 0x85, 0x71, 0x61, 0x9d, 0x59, 0x95,
+    0x74, 0x93, 0x6a, 0x88, 0x62, 0xa2, 0x56, 0x93, 0x8d, 0x68, 0x7e, 0x80,
+    0x6b, 0xb7, 0x63, 0x90, 0x5d, 0x54, 0x6c, 0x90, 0x5a, 0x8e, 0x7e, 0x7d,
+    0x82, 0x73, 0x7f, 0x89, 0x94, 0x8e, 0x7a, 0x70, 0x6c, 0x79, 0x88, 0x88,
+    0x9b, 0x8b, 0x70, 0x81, 0x83, 0x83, 0x8b, 0x86, 0x64, 0x93, 0x82, 0x66,
+    0x66, 0x79, 0x74, 0x91, 0x92, 0x94, 0x7c, 0x87, 0x72, 0x79, 0x8d, 0xaa,
+    0xa2, 0x9e, 0xaf, 0x95, 0xb1, 0x8a, 0x95, 0x8b, 0x94, 0x7e, 0x79, 0x8e,
+    0x99, 0x98, 0x97, 0x9e, 0x94, 0x87, 0x74, 0x72, 0x63, 0x92, 0x92, 0x95,
+    0xb0, 0x94, 0x86, 0x91, 0x77, 0x8f, 0x91, 0x7e, 0x83, 0x88, 0x90, 0xa5,
+    0x79, 0x70, 0x85, 0x8f, 0x67, 0x90, 0x98, 0x8d, 0x8a, 0x5d, 0x8c, 0x9c,
+    0x94, 0x91, 0x80, 0x95, 0x6e, 0x95, 0x73, 0x8d, 0x63, 0x8e, 0x53, 0x8a,
+    0x77, 0x88, 0x8f, 0x6f, 0x87, 0x9e, 0x8b, 0xb7, 0x99, 0xb2, 0x85, 0x82,
+    0xa1, 0x89, 0x9b, 0xa7, 0x80, 0x81, 0xa0, 0x8e, 0x84, 0xa9, 0x27, 0x73,
+    0x5e, 0x85, 0x5f, 0x92, 0x8c, 0xa2, 0x34, 0x8e, 0x6e, 0xb2, 0x7b, 0x8c,
+    0x69, 0x93, 0x47, 0x9e, 0x58, 0x7e, 0x94, 0x86, 0x47, 0xa3, 0x53, 0x6b,
+    0x6e, 0x6a, 0x7f, 0x73, 0x5b, 0x8c, 0x7a, 0x99, 0x6c, 0x5d, 0x82, 0x82,
+    0x62, 0x8a, 0x7a, 0x8e, 0x88, 0x62, 0xa0, 0x8e, 0x5c, 0x9a, 0x72, 0x79,
+    0x66, 0x6b, 0x75, 0x78, 0x82, 0x8a, 0x59, 0x91, 0x93, 0x68, 0x78, 0xb4,
+    0x86, 0x7e, 0x8c, 0x6e, 0x88, 0x7f, 0x96, 0x8e, 0x6e, 0x8b, 0x8c, 0x73,
+    0xab, 0x79, 0x88, 0xa6, 0x86, 0x81, 0x9a, 0x80, 0x9a, 0x9e, 0x8b, 0x6d,
+    0x9a, 0x70, 0x8e, 0x8a, 0x84, 0x7a, 0xaf, 0xb8, 0x9e, 0x90, 0x89, 0xb3,
+    0x9b, 0x85, 0x94, 0xb6, 0x87, 0x8c, 0x6e, 0xa3, 0xac, 0x9e, 0x8c, 0x7c,
+    0x81, 0x83, 0x70, 0x8d, 0x7c, 0x81, 0x77, 0x82, 0x69, 0x8e, 0x5e, 0x80,
+    0x8a, 0x8e, 0x7c, 0x8a, 0x89, 0x90, 0x58, 0x59, 0x85, 0x88, 0x7a, 0x86,
+    0x73, 0x9c, 0x4a, 0x81, 0x8d, 0x89, 0x91, 0x95, 0x72, 0x83, 0x9d, 0x99,
+    0x8d, 0x6b, 0x95, 0x7e, 0x70, 0x94, 0x8c, 0x9f, 0x8a, 0x8f, 0xa7, 0x84,
+    0x87, 0xb6, 0x42, 0x81, 0x63, 0x8a, 0x79, 0x77, 0x74, 0x90, 0x23, 0x85,
+    0x74, 0x8f, 0x87, 0x80, 0x50, 0xa1, 0x4d, 0x9b, 0x55, 0x82, 0x74, 0x8e,
+    0x4a, 0xa7, 0x52, 0x4d, 0x77, 0x67, 0x77, 0x9e, 0x62, 0xa5, 0x7d, 0x96,
+    0x6f, 0x45, 0x80, 0x8c, 0x6c, 0x92, 0x99, 0x6f, 0x5d, 0x56, 0x93, 0xac,
+    0x94, 0x9c, 0x95, 0x92, 0x6e, 0x71, 0x87, 0x8c, 0x7b, 0xa9, 0x7f, 0x7a,
+    0x69, 0x6b, 0x7d, 0x90, 0x6f, 0x81, 0x9f, 0x80, 0x83, 0x67, 0x78, 0x85,
+    0x85, 0x91, 0x8a, 0x80, 0xaa, 0x86, 0x8c, 0x88, 0x8c, 0x8f, 0x9b, 0x85,
+    0x8b, 0x7e, 0x83, 0x82, 0x95, 0x75, 0x6b, 0x8f, 0x85, 0x8b, 0xb0, 0x9f,
+    0xa7, 0x8e, 0x61, 0x9d, 0x72, 0xac, 0x92, 0x87, 0x94, 0x96, 0x68, 0x8f,
+    0x63, 0x85, 0x9c, 0xa8, 0x82, 0x9b, 0x85, 0x9b, 0x6b, 0x72, 0x83, 0x85,
+    0x90, 0x87, 0x74, 0xa4, 0x88, 0x57, 0x63, 0x90, 0x8e, 0x7b, 0x80, 0x81,
+    0x94, 0x74, 0x68, 0x8a, 0x7f, 0x86, 0x78, 0x72, 0x75, 0x67, 0x7a, 0x8a,
+    0x7a, 0x74, 0x8c, 0xad, 0x75, 0xa2, 0x7d, 0x9a, 0x9e, 0x83, 0x92, 0xa2,
+    0xa3, 0x98, 0xa5, 0x91, 0x84, 0xb0, 0x21, 0x9a, 0x5f, 0x8c, 0x7e, 0x86,
+    0x80, 0xa0, 0x16, 0x9b, 0x5b, 0x9c, 0x76, 0x8d, 0x77, 0x9f, 0x62, 0x86,
+    0x6a, 0x6c, 0x6e, 0x8f, 0x4e, 0xc1, 0x61, 0x6f, 0x74, 0x79, 0x80, 0x5f,
+    0x59, 0x9e, 0x7c, 0x87, 0x7f, 0x4b, 0x6c, 0x8b, 0x5a, 0x8f, 0x65, 0x8a,
+    0x62, 0x58, 0x66, 0x8d, 0x83, 0x97, 0x8a, 0x7a, 0x77, 0x79, 0x6c, 0x83,
+    0x8c, 0x93, 0x82, 0x5e, 0x61, 0x8c, 0x82, 0x80, 0x88, 0x88, 0x85, 0x87,
+    0x77, 0x70, 0x8d, 0x7f, 0x7a, 0x89, 0x72, 0x7e, 0xa3, 0x99, 0x6b, 0xaa,
+    0x81, 0x87, 0x90, 0x6f, 0x7f, 0x77, 0x96, 0x83, 0x89, 0x89, 0x6a, 0x77,
+    0xa4, 0x6c, 0x97, 0x7e, 0x95, 0xa4, 0x63, 0x8d, 0x71, 0x96, 0x8a, 0xa4,
+    0x9f, 0x7c, 0x54, 0x94, 0x7a, 0x89, 0x8a, 0x90, 0x7e, 0x9d, 0x53, 0x7c,
+    0x9d, 0x83, 0x90, 0x84, 0xa1, 0x8e, 0x80, 0x74, 0x69, 0x7a, 0x69, 0x93,
+    0x8a, 0x90, 0x83, 0x76, 0x8b, 0x6f, 0x8e, 0x93, 0x82, 0x84, 0x7d, 0x94,
+    0xa1, 0x78, 0x7d, 0x68, 0x79, 0x83, 0x85, 0x9d, 0x89, 0xa0, 0x8a, 0x93,
+    0x90, 0x8c, 0x82, 0x86, 0x80, 0x71, 0xb3, 0xa1, 0x90, 0xb2, 0x27, 0xa3,
+    0x5e, 0xa3, 0xa6, 0x64, 0x75, 0xa0, 0x23, 0x8c, 0x7c, 0xc4, 0x7a, 0x8c,
+    0x4d, 0xa3, 0x4c, 0x93, 0x71, 0x7b, 0x71, 0x8b, 0x34, 0xa5, 0x47, 0x7f,
+    0x4e, 0x73, 0x51, 0x8a, 0x67, 0xa0, 0x9d, 0x7f, 0x65, 0x38, 0x61, 0x70,
+    0x71, 0x8d, 0x6a, 0x7e, 0x7e, 0x4c, 0x7d, 0x8d, 0x81, 0x80, 0xa5, 0x84,
+    0x6f, 0x57, 0x70, 0x91, 0x8b, 0x99, 0x9d, 0x84, 0x77, 0x7f, 0x6b, 0x7f,
+    0x76, 0x8f, 0x90, 0x72, 0x6c, 0x58, 0x6b, 0x85, 0xa6, 0x8a, 0xa2, 0x6d,
+    0x8a, 0x71, 0x71, 0x95, 0x92, 0x7c, 0x88, 0x67, 0x86, 0x6d, 0x8d, 0x95,
+    0x79, 0x8e, 0x65, 0x71, 0x71, 0x91, 0x85, 0x99, 0xa9, 0x87, 0x80, 0x88,
+    0x74, 0x86, 0x75, 0x83, 0x8b, 0x7f, 0x78, 0xb1, 0x90, 0xa8, 0x7b, 0x98,
+    0x8a, 0x7b, 0x5b, 0x99, 0x6f, 0x7f, 0xa0, 0x79, 0xa5, 0x93, 0x8b, 0x7b,
+    0x7e, 0x7a, 0x61, 0x9d, 0x98, 0x8b, 0x82, 0x7c, 0x76, 0x73, 0x81, 0x8a,
+    0x7e, 0x8d, 0x6e, 0x71, 0xa0, 0x65, 0x80, 0x62, 0x7d, 0x8d, 0x5e, 0x9b,
+    0x8f, 0x85, 0x89, 0xad, 0x71, 0x73, 0x7f, 0x89, 0x8d, 0x89, 0xb3, 0xa1,
+    0x7c, 0xaf, 0x43, 0x82, 0x49, 0x92, 0x62, 0x7f, 0x79, 0xa6, 0x23, 0x99,
+    0x6c, 0x9a, 0x8a, 0x90, 0x6c, 0xb9, 0x6f, 0x8a, 0x61, 0x7f, 0x8f, 0x8a,
+    0x57, 0xb9, 0x55, 0x65, 0x4b, 0x51, 0x66, 0x6e, 0x4a, 0xa1, 0x83, 0x8a,
+    0x73, 0x23, 0x8a, 0x6d, 0x46, 0xa7, 0x87, 0x64, 0x84, 0x5f, 0x6f, 0x6f,
+    0x9b, 0x9d, 0x76, 0x83, 0x60, 0x6e, 0x76, 0x8a, 0x9a, 0xa6, 0x75, 0x73,
+    0x86, 0x5b, 0x97, 0x88, 0x7b, 0x8e, 0x82, 0x5c, 0x97, 0x71, 0x74, 0x85,
+    0x83, 0x91, 0x89, 0x6f, 0x93, 0x94, 0x8b, 0xa9, 0x7d, 0x84, 0x80, 0x89,
+    0x97, 0x80, 0x65, 0x92, 0x9a, 0x85, 0x5a, 0x6a, 0x6b, 0x58, 0x6f, 0x8c,
+    0x9a, 0x8b, 0x6e, 0x81, 0x9d, 0xae, 0x8c, 0x86, 0x8d, 0x90, 0x6c, 0xb8,
+    0x91, 0x89, 0x98, 0xbd, 0x8b, 0x78, 0x7d, 0x87, 0x9c, 0x72, 0x73, 0x80,
+    0x9e, 0x92, 0x5d, 0x77, 0x78, 0x4f, 0x87, 0x7b, 0x7a, 0x9e, 0x74, 0x67,
+    0x6a, 0x58, 0x95, 0x80, 0x75, 0x97, 0x81, 0x75, 0x94, 0x75, 0x73, 0x92,
+    0x83, 0x7b, 0x6b, 0x8e, 0x82, 0x6e, 0x7d, 0x9b, 0x91, 0x7f, 0x9e, 0xaa,
+    0x8c, 0xa3, 0xa8, 0x8c, 0x9a, 0xc1, 0x28, 0xac, 0x49, 0x9b, 0x59, 0x8a,
+    0x60, 0xa7, 0x39, 0xa7, 0x75, 0x9b, 0x95, 0x94, 0x76, 0xb3, 0x4a, 0x6b,
+    0x60, 0x6c, 0xa5, 0x71, 0x40, 0xc4, 0x4c, 0x7c, 0x76, 0x7b, 0x67, 0x76,
+    0x76, 0xa4, 0x7b, 0x83, 0x67, 0x4d, 0x87, 0x87, 0x6e, 0x93, 0x84, 0x70,
+    0x78, 0x41, 0x87, 0x9f, 0x7a, 0x8c, 0x87, 0x69, 0x73, 0x6c, 0x93, 0x73,
+    0x77, 0xa2, 0x52, 0x72, 0x5c, 0x75, 0x6c, 0x8f, 0x65, 0x92, 0x87, 0x52,
+    0x67, 0x54, 0x54, 0x75, 0x90, 0x9c, 0x91, 0x6f, 0xa3, 0x86, 0x87, 0x9c,
+    0x99, 0x86, 0x9f, 0x71, 0x8a, 0x7a, 0x7a, 0x97, 0x7a, 0x86, 0x6c, 0x99,
+    0x89, 0x7e, 0x9c, 0x83, 0x98, 0x78, 0x73, 0x7f, 0x91, 0x96, 0x9a, 0x8d,
+    0xb0, 0x9e, 0x6a, 0x80, 0x92, 0x86, 0x95, 0x83, 0x94, 0x92, 0x6f, 0x86,
+    0x8a, 0x52, 0x6e, 0x82, 0x84, 0x8b, 0x77, 0x88, 0x70, 0x54, 0x8f, 0x7f,
+    0x7d, 0x7e, 0x57, 0x89, 0x6d, 0x6f, 0x9c, 0x93, 0x90, 0x93, 0x52, 0x70,
+    0x75, 0x92, 0x73, 0x88, 0x93, 0x77, 0x77, 0x91, 0x89, 0xa2, 0x9d, 0xa6,
+    0xae, 0x84, 0x7d, 0xab, 0x92, 0x7e, 0x9c, 0x98, 0x7b, 0xc3, 0x38, 0x98,
+    0x4f, 0x97, 0x8f, 0x93, 0x62, 0xb8, 0x23, 0xa4, 0x6d, 0x9c, 0x81, 0x8e,
+    0x6f, 0x9d, 0x56, 0x89, 0x50, 0x94, 0x70, 0x77, 0x5d, 0xb7, 0x60, 0x5b,
+    0x72, 0x45, 0x81, 0x8c, 0x66, 0xbc, 0x8f, 0x7f, 0x57, 0x43, 0x85, 0x96,
+    0x5a, 0xb2, 0x91, 0x7d, 0x6c, 0x3a, 0x73, 0x92, 0x63, 0x93, 0x89, 0x90,
+    0x7f, 0x52, 0x7f, 0x7b, 0xa1, 0xa6, 0x8f, 0x60, 0x78, 0x51, 0x5f, 0xac,
+    0x7b, 0x89, 0x88, 0x97, 0x7e, 0x64, 0x57, 0x72, 0x6c, 0x96, 0x74, 0x78,
+    0xab, 0x66, 0x62, 0x8d, 0x6f, 0x86, 0x91, 0x93, 0x7d, 0x74, 0x82, 0x80,
+    0x73, 0x84, 0x9c, 0x8e, 0x68, 0x69, 0x9e, 0xa1, 0x8a, 0x83, 0x7a, 0x87,
+    0x94, 0x8c, 0x83, 0x7e, 0x91, 0x92, 0x82, 0x7b, 0xa0, 0x8e, 0x73, 0x86,
+    0xa9, 0x95, 0x7c, 0xa5, 0x6c, 0x6f, 0x8c, 0x87, 0xa6, 0x8a, 0x77, 0x86,
+    0x7d, 0x79, 0x89, 0x75, 0x8f, 0x82, 0x54, 0x61, 0x82, 0x8e, 0x80, 0x84,
+    0x7b, 0x8e, 0x61, 0x82, 0x86, 0x77, 0x7d, 0x7c, 0x7e, 0x6c, 0x7b, 0xad,
+    0x7b, 0x90, 0x88, 0x80, 0x64, 0x83, 0x7e, 0xa7, 0x83, 0x7e, 0xb5, 0xbb,
+    0x88, 0xd9, 0x21, 0x9a, 0x4d, 0x9f, 0x91, 0x97, 0x64, 0xb5, 0x1c, 0x8a,
+    0x5f, 0xaf, 0x7e, 0x7b, 0x67, 0xad, 0x48, 0x7f, 0x4e, 0x87, 0x8f, 0x7c,
+    0x46, 0xab, 0x70, 0x7f, 0x4b, 0x4e, 0x48, 0x8c, 0x63, 0xc5, 0xa2, 0x7f,
+    0x68, 0x3b, 0x59, 0x7f, 0x53, 0xa1, 0x8e, 0x6e, 0x7a, 0x4a, 0x5f, 0x62,
+    0x5b, 0xa1, 0x62, 0x78, 0x74, 0x57, 0x78, 0x91, 0x7b, 0x9b, 0x75, 0x73,
+    0x73, 0x72, 0x94, 0x92, 0x79, 0xaa, 0x94, 0x75, 0x86, 0x58, 0x8c, 0x71,
+    0x77, 0x91, 0xa5, 0x74, 0x8f, 0x73, 0x89, 0x77, 0x68, 0x8e, 0x90, 0x96,
+    0x9f, 0x79, 0x77, 0x7d, 0x89, 0x9b, 0x8c, 0x94, 0x81, 0x88, 0x91, 0x8f,
+    0x9b, 0x91, 0x78, 0x87, 0x82, 0x72, 0xa7, 0xa2, 0x85, 0x98, 0xa3, 0x91,
+    0x83, 0x75, 0x72, 0x93, 0x80, 0x8f, 0x85, 0x70, 0x97, 0x58, 0x9f, 0x72,
+    0x91, 0x8e, 0x93, 0x74, 0x97, 0x73, 0x74, 0x91, 0x80, 0x84, 0x96, 0x94,
+    0x76, 0x69, 0x66, 0x9e, 0x81, 0x8a, 0x8b, 0x63, 0x65, 0x7c, 0xa1, 0x9a,
+    0x72, 0x84, 0x9e, 0x89, 0x9a, 0x86, 0x98, 0x7f, 0x77, 0x85, 0x82, 0xaa,
+    0xa3, 0x88, 0xac, 0x9e, 0x76, 0xca, 0x2b, 0xa0, 0x40, 0xad, 0x6f, 0x6c,
+    0x66, 0xc8, 0x07, 0x9e, 0x3e, 0x9f, 0x85, 0x9f, 0x5e, 0xb7, 0x53, 0x91,
+    0x56, 0x6d, 0x62, 0x95, 0x4c, 0xc7, 0x46, 0x56, 0x4b, 0x5d, 0x6f, 0x52,
+    0x4d, 0xa3, 0x8c, 0x90, 0x78, 0x4d, 0x58, 0x8d, 0x53, 0x93, 0x8e, 0x68,
+    0x6f, 0x3b, 0x49, 0x86, 0x6e, 0x9d, 0x76, 0x74, 0x5b, 0x44, 0x7b, 0x8c,
+    0x89, 0xb0, 0x64, 0x62, 0x6a, 0x6d, 0x7a, 0xae, 0x84, 0x95, 0x8c, 0x71,
+    0x8b, 0x60, 0x82, 0x9e, 0x8c, 0xa8, 0x90, 0x66, 0xa1, 0x7b, 0x65, 0x82,
+    0x8f, 0x7d, 0x8d, 0x78, 0x8e, 0x5f, 0x75, 0x88, 0x5d, 0x93, 0xa1, 0x93,
+    0x6b, 0x67, 0x7a, 0xa7, 0x92, 0x8c, 0x65, 0x88, 0x95, 0x93, 0x87, 0x81,
+    0x9c, 0x97, 0x62, 0x9d, 0x90, 0x62, 0xa1, 0x9f, 0x87, 0x94, 0x94, 0x99,
+    0x92, 0x8f, 0x71, 0x80, 0x77, 0x82, 0x92, 0x78, 0x67, 0x69, 0x7e, 0x81,
+    0x93, 0x89, 0x80, 0x9b, 0x71, 0x57, 0x63, 0x83, 0x7b, 0x9f, 0x5d, 0x92,
+    0x85, 0x96, 0x7e, 0x92, 0x84, 0x7f, 0x81, 0xa3, 0xa8, 0x96, 0x91, 0x8e,
+    0x8c, 0x8e, 0x7d, 0xb0, 0x86, 0x72, 0x9d, 0x8e, 0x8e, 0xd0, 0x05, 0x77,
+    0x45, 0xad, 0x91, 0x95, 0x71, 0xb8, 0x01, 0x9a, 0x41, 0xb8, 0x94, 0x6e,
+    0x63, 0xd3, 0x58, 0x8c, 0x5a, 0x89, 0x85, 0x83, 0x52, 0xc1, 0x7b, 0x6a,
+    0x65, 0x6e, 0x73, 0x63, 0x68, 0xba, 0x67, 0x78, 0x79, 0x4a, 0x73, 0x8f,
+    0x51, 0xc9, 0x85, 0x8a, 0x6b, 0x45, 0x6a, 0x8f, 0x6c, 0xad, 0x8a, 0x8d,
+    0x6a, 0x6e, 0x6b, 0x7f, 0x86, 0xb4, 0x88, 0x7d, 0xaa, 0x71, 0x5c, 0x69,
+    0x5d, 0xa8, 0x62, 0x7d, 0x6c, 0x6e, 0x6f, 0x6a, 0x7c, 0x9d, 0x7a, 0x83,
+    0x7d, 0x79, 0x7b, 0x9c, 0x73, 0x93, 0x7f, 0x9d, 0x8c, 0x75, 0x78, 0x83,
+    0x85, 0x88, 0x81, 0x81, 0x98, 0x79, 0xa3, 0xae, 0x5b, 0x90, 0x89, 0x9d,
+    0x6d, 0x90, 0xa3, 0x8e, 0x87, 0x96, 0x60, 0xa7, 0x76, 0x82, 0x81, 0x84,
+    0x84, 0x9c, 0x73, 0x8a, 0x6c, 0x58, 0x64, 0x96, 0x89, 0x8b, 0x76, 0x60,
+    0x91, 0x72, 0x7f, 0x86, 0x9a, 0x89, 0x67, 0x7d, 0x77, 0x84, 0x73, 0x5c,
+    0x67, 0x8a, 0x82, 0x8c, 0x8c, 0x94, 0x8a, 0xa2, 0xaa, 0x7e, 0x5f, 0x7f,
+    0x86, 0x90, 0x96, 0xab, 0x8d, 0x91, 0x7c, 0xb6, 0x82, 0x8d, 0xb8, 0xa9,
+    0x92, 0xea, 0x1b, 0x74, 0x25, 0xab, 0x8d, 0x61, 0x81, 0xd8, 0x2c, 0x86,
+    0x2f, 0xcf, 0xa2, 0x84, 0x7f, 0xa4, 0x36, 0x86, 0x47, 0x8d, 0x60, 0x8a,
+    0x62, 0xb1, 0x4a, 0x54, 0x48, 0x73, 0x64, 0x9d, 0x72, 0xb2, 0x76, 0x4c,
+    0x8e, 0x4e, 0x76, 0x94, 0x7c, 0xad, 0x74, 0x6c, 0x6c, 0x54, 0x7f, 0x63,
+    0x97, 0xb3, 0x74, 0x6c, 0x99, 0x5f, 0x86, 0x6a, 0xa3, 0x94, 0x7c, 0x83,
+    0x8d, 0x81, 0x79, 0xac, 0x61, 0x9b, 0x65, 0x7b, 0x66, 0x89, 0x60, 0x76,
+    0x8d, 0x93, 0x8d, 0x84, 0x71, 0x65, 0x82, 0x8c, 0x94, 0xa7, 0x59, 0xa1,
+    0x8b, 0x72, 0x84, 0x65, 0x75, 0x95, 0x62, 0x71, 0x71, 0x7e, 0x7b, 0x97,
+    0x9b, 0x9a, 0x80, 0xb1, 0x77, 0x7a, 0x73, 0x8e, 0x9c, 0x8c, 0x7d, 0x96,
+    0x89, 0x7d, 0x7e, 0x80, 0x8e, 0x93, 0x63, 0x72, 0x6b, 0x57, 0x78, 0x8f,
+    0x90, 0x86, 0x62, 0x75, 0x7e, 0x54, 0x7d, 0x95, 0x85, 0x84, 0x73, 0x7b,
+    0x8f, 0x9e, 0x72, 0x8c, 0x90, 0x96, 0x8e, 0x6c, 0x80, 0x8b, 0x9e, 0x8c,
+    0x87, 0x8e, 0x9b, 0x97, 0x8f, 0x94, 0xa3, 0x6b, 0xad, 0x93, 0x8a, 0x96,
+    0x8d, 0x91, 0xa6, 0x8a, 0x9e, 0xce, 0x6b, 0x98, 0x6d, 0xa9, 0x92, 0x92,
+    0x7c, 0xe2, 0x63, 0x97, 0x42, 0xc8, 0xa3, 0xa0, 0x88, 0xdc, 0x75, 0x9b,
+    0x51, 0x7d, 0x5c, 0x80, 0x89, 0xc0, 0x83, 0x5e, 0x5e, 0xa4, 0x3e, 0x74,
+    0x9b, 0xb6, 0x7f, 0x63, 0x78, 0x7d, 0x74, 0x57, 0x93, 0xa2, 0x83, 0x70,
+    0x5e, 0x7d, 0x60, 0x69, 0x93, 0x9e, 0x79, 0x86, 0x91, 0x67, 0x86, 0x95,
+    0xa2, 0xad, 0x62, 0x74, 0x68, 0x7e, 0x7e, 0x82, 0x8c, 0xb0, 0xa0, 0x63,
+    0x8b, 0x82, 0x8f, 0x8c, 0xa4, 0xa3, 0x76, 0x6c, 0x8e, 0x87, 0x72, 0x85,
+    0xaa, 0xa4, 0x7f, 0x7b, 0x8e, 0x9a, 0x69, 0x91, 0x9d, 0xa0, 0x81, 0x92,
+    0x90, 0x85, 0x66, 0x82, 0xa3, 0xa9, 0x7f, 0x8f, 0x83, 0x9d, 0x8b, 0x8d,
+    0x96, 0xa3, 0x8f, 0x7a, 0x6d, 0x89, 0x74, 0x8a, 0xa9, 0xa9, 0x7b, 0x77,
+    0x93, 0x8b, 0x63, 0x92, 0x99, 0x8b, 0x88, 0x4f, 0x87, 0x7c, 0x67, 0x78,
+    0x83, 0xa5, 0xa5, 0x58, 0x8d, 0x70, 0x86, 0x82, 0x9e, 0xa7, 0xa5, 0x96,
+    0x8d, 0x7b, 0x96, 0x8c, 0x95, 0xa3, 0x8d, 0x9c, 0x92, 0x95, 0x98, 0x94,
+    0x87, 0x90, 0x92, 0x92, 0x95, 0x96, 0xad, 0x6e, 0x97, 0x8c, 0x92, 0x7f,
+    0x95, 0x8b, 0x8a, 0x90, 0x9b, 0x87, 0x9e, 0x86, 0x91, 0xa0, 0x68, 0x82,
+    0x85, 0x8e, 0x82, 0xa8, 0x9f, 0x68, 0x87, 0x75, 0x9b, 0x70, 0x95, 0x91,
+    0x6c, 0x77, 0x8b, 0x7b, 0x95, 0x80, 0x99, 0x65, 0x95, 0x82, 0x92, 0x9a,
+    0x8a, 0x65, 0x70, 0x8c, 0x98, 0x9e, 0x80, 0x7b, 0xa5, 0x9b, 0x93, 0x94,
+    0x84, 0x6a, 0x69, 0x82, 0x80, 0x7a, 0x75, 0x72, 0x94, 0x79, 0xad, 0xb2,
+    0x81, 0x8b, 0x85, 0x6c, 0x86, 0x88, 0x9e, 0x79, 0x86, 0x9e, 0x7e, 0x91,
+    0x7b, 0x6d, 0x93, 0x91, 0x82, 0x97, 0x6b, 0xa6, 0xaa, 0x9f, 0xa8, 0x74,
+    0x94, 0x7f, 0x63, 0x98, 0x90, 0xa1, 0x8c, 0x7f, 0x71, 0x86, 0x89, 0x95,
+    0x88, 0x80, 0x77, 0x67, 0x85, 0x7d, 0x89, 0x6d, 0x9c, 0x76, 0x72, 0x8d,
+    0x96, 0x94, 0x88, 0x98, 0x9f, 0x94, 0x8e, 0x84, 0x7a, 0x88, 0x79, 0x9f,
+    0x81, 0xa1, 0x7c, 0x8b, 0x71, 0x79, 0x7d, 0x9d, 0x7b, 0x6a, 0x8c, 0x66,
+    0x9e, 0x7b, 0x77, 0x7a, 0xb0, 0x74, 0x7f, 0x8d, 0x8d, 0x71, 0x72, 0x84,
+    0x90, 0x98, 0x7b, 0x89, 0x9b, 0x8e, 0x85, 0x7a, 0x67, 0x8a, 0x72, 0x84,
+    0x82, 0x91, 0x91, 0x7a, 0x85, 0x8a, 0xae, 0x8a, 0x9a, 0x9a, 0x7f, 0x85,
+    0x8a, 0x90, 0x69, 0x7b, 0x76, 0x78, 0x98, 0x54, 0x94, 0x7e, 0x6c, 0x72,
+    0x89, 0x88, 0x82, 0x96, 0x59, 0x95, 0x76, 0x91, 0x94, 0x96, 0x83, 0x84,
+    0x72, 0x8d, 0x97, 0x71, 0x68, 0x8e, 0x88, 0x8b, 0x7c, 0xa9, 0x73, 0x8a,
+    0x95, 0x86, 0x87, 0x96, 0x91, 0x77, 0xb1, 0x88, 0x6e, 0x7d, 0x7c, 0x9f,
+    0x8f, 0x82, 0x79, 0x83, 0xa6, 0x81, 0x89, 0x83, 0x85, 0x9b, 0x7c, 0x68,
+    0x6f, 0x84, 0x7c, 0xa1, 0x8e, 0x80, 0x78, 0x8f, 0x96, 0x77, 0x7e, 0x7b,
+    0x8f, 0x81, 0xa5, 0x84, 0x86, 0x91, 0x7b, 0x73, 0x92, 0x85, 0xa3, 0x7e,
+    0x80, 0x95, 0x7d, 0x5f, 0x8c, 0x94, 0x95, 0x73, 0x95, 0x78, 0x87, 0xa1,
+    0x94, 0x6c, 0xac, 0x6c, 0x77, 0x89, 0x86, 0x9c, 0x82, 0x76, 0x99, 0x93,
+    0x92, 0x88, 0x80, 0x80, 0x85, 0x8a, 0xa8, 0x8f, 0x7a, 0x89, 0x9a, 0x7a,
+    0x8f, 0x91, 0x86, 0x82, 0x7f, 0x82, 0x91, 0x95, 0x85, 0x71, 0x7d, 0x8f,
+    0x83, 0x8c, 0x79, 0x97, 0x7a, 0x9b, 0x91, 0x88, 0xa2, 0x86, 0x8a, 0x80,
+    0xa0, 0x96, 0x8b, 0x7d, 0x76, 0x96, 0x9f, 0x8d, 0x95, 0x8a, 0x94, 0xa0,
+    0x80, 0x95, 0x9b, 0x96, 0x81, 0xa8, 0x59, 0x89, 0x92, 0xb2, 0x83, 0x89,
+    0x85, 0x81, 0x7e, 0x64, 0x77, 0x82, 0x90, 0x96, 0x7e, 0x9f, 0xab, 0x8a,
+    0x6e, 0x9b, 0x90, 0x89, 0x6e, 0x7d, 0x81, 0x65, 0x81, 0x86, 0xa1, 0x93,
+    0x8b, 0x83, 0x81, 0x89, 0x8b, 0x90, 0x7e, 0x97, 0x8e, 0x75, 0x7e, 0x7e,
+    0x7b, 0x81, 0x9a, 0x64, 0x90, 0xab, 0x90, 0x82, 0x8a, 0x82, 0x8d, 0xad,
+    0x90, 0x74, 0x7f, 0x9a, 0x88, 0x92, 0x83, 0x97, 0xa6, 0x6e, 0x9d, 0x81,
+    0xa2, 0x98, 0x74, 0x84, 0x93, 0x85, 0x84, 0x7d, 0xa2, 0x92, 0x92, 0x87,
+    0x73, 0x8b, 0x92, 0x74, 0x96, 0x70, 0x83, 0x86, 0x8a, 0x89, 0x86, 0x88,
+    0x87, 0x7c, 0x7d, 0x81, 0x8d, 0x71, 0x8c, 0x89, 0x70, 0x94, 0x8f, 0x9a,
+    0x83, 0x9d, 0x99, 0x78, 0x74, 0x88, 0x84, 0x9a, 0x95, 0x8b, 0x8e, 0x7f,
+    0xa2, 0xa0, 0x76, 0x93, 0x9b, 0x7c, 0x97, 0x81, 0x83, 0x8c, 0xa1, 0x99,
+    0x9d, 0x7f, 0x87, 0x75, 0xa7, 0x75, 0x89, 0x7e, 0x88, 0x80, 0x8f, 0x84,
+    0x9a, 0x77, 0x8d, 0x90, 0x9d, 0x6c, 0x88, 0x8d, 0x8e, 0x81, 0x97, 0x6d,
+    0x81, 0x88, 0x64, 0x8c, 0x77, 0x8e, 0x91, 0x8a, 0x7f, 0x8a, 0x94, 0x7a,
+    0x89, 0x93, 0x8c, 0x69, 0x85, 0x8c, 0x93, 0x61, 0x7e, 0x89, 0x7e, 0x8a,
+    0x65, 0x8a, 0xa9, 0x7f, 0x80, 0x86, 0x82, 0x90, 0x66, 0x7a, 0x99, 0x71,
+    0x7f, 0x73, 0x8d, 0x94, 0x7d, 0x73, 0x7a, 0x7d, 0x87, 0x7a, 0x97, 0x70,
+    0x81, 0x60, 0x61, 0x7a, 0x91, 0x88, 0x93, 0x7a, 0x9e, 0xa6, 0x92, 0x9d,
+    0x92, 0x67, 0x99, 0x9a, 0xae, 0x71, 0x89, 0xa5, 0x9f, 0xa6, 0x98, 0x89,
+    0x97, 0x90, 0x9b, 0x9a, 0xc0, 0x95, 0x8f, 0x9c, 0x95, 0x93, 0x88, 0x95,
+    0x95, 0xa0, 0x8e, 0x8c, 0xa8, 0x94, 0x6e, 0x9e, 0x6f, 0x7b, 0xa5, 0x96,
+    0x98, 0x90, 0x91, 0x89, 0x93, 0x8f, 0x84, 0xb2, 0x7f, 0x5e, 0xc2, 0x75,
+    0x8f, 0x90, 0x9c, 0xbf, 0x8a, 0x84, 0xa6, 0x85, 0x7d, 0x84, 0x8a, 0xad,
+    0x6f, 0x88, 0xac, 0x77, 0x91, 0x8d, 0x94, 0xac, 0x8f, 0x7f, 0xa1, 0xa5,
+    0x8e, 0x6d, 0x8a, 0x82, 0x85, 0x80, 0x9b, 0x7a, 0x9f, 0x60, 0x95, 0x97,
+    0x90, 0x67, 0x8f, 0x91, 0x86, 0x89, 0x88, 0x89, 0x96, 0x6c, 0x8b, 0x94,
+    0x8a, 0x75, 0x84, 0x96, 0x8a, 0x86, 0x7c, 0x91, 0x74, 0x8f, 0x97, 0x89,
+    0x8f, 0x8e, 0x6b, 0x97, 0x93, 0x89, 0x6b, 0x7e, 0x65, 0xa4, 0xa5, 0x63,
+    0x85, 0x88, 0x81, 0xa3, 0x70, 0x9b, 0x9e, 0x8c, 0x62, 0x73, 0x85, 0xb4,
+    0x88, 0x6e, 0x92, 0x6f, 0x91, 0x88, 0x79, 0x91, 0x7f, 0x7d, 0x9a, 0x6b,
+    0x78, 0x93, 0x7e, 0x79, 0x93, 0x7a, 0x74, 0x91, 0x8d, 0x92, 0xb3, 0x61,
+    0xa3, 0x76, 0x81, 0x99, 0x96, 0x8b, 0x93, 0x8f, 0xa7, 0x6f, 0x8f, 0xa6,
+    0xb2, 0x76, 0xa1, 0x83, 0xa8, 0x8b, 0xae, 0x99, 0x90, 0x6a, 0x97, 0x97,
+    0xaa, 0x95, 0x85, 0x7d, 0x97, 0x94, 0x86, 0x94, 0x89, 0xa4, 0xa9, 0x81,
+    0x89, 0x7c, 0x96, 0xb3, 0x92, 0x7d, 0xa4, 0x6f, 0x6d, 0x92, 0x83, 0xb4,
+    0x7b, 0x94, 0x8c, 0x79, 0x61, 0x6f, 0x8f, 0xb7, 0x88, 0x66, 0xaa, 0x7d,
+    0x89, 0x7f, 0x90, 0xbd, 0x99, 0xac, 0xb1, 0x96, 0x9c, 0x7c, 0x92, 0xb7,
+    0x73, 0x94, 0xad, 0x9d, 0x7c, 0x80, 0x87, 0x96, 0x73, 0x8d, 0xa8, 0x88,
+    0xa9, 0x83, 0x7b, 0x84, 0x9d, 0x99, 0x83, 0x89, 0x9d, 0x7f, 0x7e, 0x86,
+    0x75, 0x83, 0x77, 0x7d, 0x8b, 0x7d, 0x80, 0x9d, 0xa2, 0x94, 0x72, 0x92,
+    0x75, 0x95, 0x99, 0xa0, 0x7b, 0x83, 0x99, 0x89, 0x82, 0x92, 0x5b, 0x9e,
+    0x7c, 0x91, 0x95, 0x79, 0x61, 0x86, 0x60, 0xc7, 0x72, 0x91, 0xb5, 0x88,
+    0x71, 0x8d, 0x85, 0x91, 0x83, 0x74, 0xa8, 0x67, 0x79, 0x77, 0x7f, 0x79,
+    0x68, 0x84, 0x95, 0x69, 0x98, 0x88, 0x74, 0x72, 0x9c, 0x86, 0x87, 0x95,
+    0x90, 0x95, 0x9b, 0x8b, 0xc5, 0x7d, 0x81, 0x8f, 0x88, 0x8c, 0xb0, 0x95,
+    0xa8, 0x8c, 0x84, 0xa0, 0xb0, 0x89, 0x9a, 0x90, 0xaa, 0x88, 0x96, 0x9b,
+    0x88, 0xa9, 0x89, 0x99, 0xb7, 0x82, 0x99, 0xa0, 0x85, 0x70, 0x9c, 0x9a,
+    0x94, 0x74, 0x91, 0x81, 0x76, 0x70, 0x8f, 0xc2, 0x8c, 0x91, 0x8f, 0x69,
+    0x74, 0x7e, 0x6d, 0x9a, 0x80, 0x77, 0xa5, 0x94, 0x8b, 0x6d, 0x82, 0xcf,
+    0x8e, 0x74, 0xc4, 0x86, 0x7f, 0x78, 0x72, 0xb3, 0x78, 0x7a, 0xac, 0x9c,
+    0x7d, 0x77, 0x8d, 0xca, 0x67, 0x8c, 0xd5, 0x8f, 0x7f, 0x71, 0x70, 0x82,
+    0x7e, 0x9f, 0xb0, 0x7f, 0x75, 0x90, 0x79, 0x7b, 0x8d, 0x7b, 0xa6, 0x87,
+    0x98, 0x76, 0x84, 0x96, 0x81, 0x6a, 0x96, 0x86, 0x8e, 0x77, 0xa3, 0x83,
+    0x91, 0x83, 0x8a, 0x6c, 0x74, 0x83, 0x99, 0x7d, 0x7c, 0x8a, 0x88, 0x9a,
+    0x6b, 0x86, 0x59, 0xa3, 0x8a, 0x8e, 0xbb, 0x8a, 0x75, 0x78, 0x68, 0xb5,
+    0x9b, 0x7b, 0xa7, 0x93, 0x5b, 0x6c, 0x6b, 0xa0, 0x74, 0x99, 0xc0, 0x73,
+    0x8b, 0x7e, 0x8e, 0x83, 0x64, 0x7c, 0x7d, 0x7a, 0x98, 0x7d, 0x82, 0x7c,
+    0x8f, 0x7e, 0x74, 0x86, 0xa9, 0x84, 0xba, 0x8f, 0xc7, 0x6f, 0x87, 0xae,
+    0x97, 0x91, 0xad, 0x82, 0xb2, 0x70, 0x8a, 0xa0, 0xb0, 0x7d, 0x95, 0x8d,
+    0xc2, 0x85, 0x80, 0xad, 0x9f, 0x85, 0x8b, 0x76, 0xaa, 0xab, 0x8f, 0xa0,
+    0x89, 0x9b, 0x8a, 0xb3, 0xa0, 0x72, 0xbe, 0x8c, 0x93, 0x7a, 0xa0, 0xad,
+    0x99, 0x6f, 0xa2, 0x79, 0x78, 0x8b, 0x6d, 0xae, 0x75, 0x6f, 0xa1, 0x8d,
+    0x68, 0x81, 0x74, 0xb3, 0x8f, 0x81, 0xc6, 0x96, 0x77, 0x68, 0x85, 0xaf,
+    0x86, 0x9f, 0xbb, 0x8a, 0x7e, 0x8a, 0x86, 0xab, 0x8b, 0x87, 0x94, 0x96,
+    0x99, 0x82, 0x6a, 0xaa, 0x7b, 0x81, 0xa6, 0x9b, 0xb6, 0x73, 0x78, 0x9a,
+    0x8f, 0xaa, 0x93, 0x81, 0x97, 0x7a, 0x72, 0x82, 0x79, 0x81, 0x7c, 0x88,
+    0x8e, 0x79, 0x9d, 0x81, 0x9a, 0x75, 0x9b, 0x89, 0x73, 0x6a, 0xa6, 0x84,
+    0x5c, 0x6f, 0xa0, 0x9d, 0x81, 0x84, 0x3e, 0xaf, 0x94, 0xa1, 0xb8, 0x93,
+    0x81, 0x89, 0x68, 0xd4, 0x87, 0x99, 0x99, 0x95, 0x79, 0x72, 0x81, 0xa1,
+    0x78, 0x7d, 0x8f, 0x7e, 0x87, 0x78, 0x8e, 0x97, 0x7e, 0x96, 0x86, 0x86,
+    0x97, 0x74, 0x6f, 0x7d, 0xa5, 0x81, 0x6f, 0x8e, 0x9e, 0x8b, 0xad, 0xac,
+    0xbd, 0x75, 0x84, 0xa2, 0x93, 0x76, 0xc7, 0x9e, 0xb0, 0x75, 0x89, 0xa4,
+    0x95, 0x92, 0xb5, 0xaa, 0xb9, 0x7d, 0x79, 0xa5, 0x88, 0x70, 0x84, 0x70,
+    0xa3, 0x81, 0xa1, 0xa6, 0x8f, 0x96, 0x96, 0x8d, 0xa5, 0x83, 0xb2, 0x8f,
+    0x88, 0x74, 0x96, 0xbc, 0x8b, 0x81, 0xa4, 0x85, 0x7c, 0x87, 0x64, 0xb4,
+    0x80, 0x88, 0x92, 0x90, 0x78, 0x79, 0x77, 0xa5, 0x79, 0x8b, 0xbd, 0x7d,
+    0x84, 0x8c, 0x96, 0xd4, 0x78, 0x81, 0xa4, 0x8c, 0x97, 0x89, 0x78, 0xc4,
+    0x9f, 0x94, 0xb9, 0x83, 0x76, 0x78, 0x89, 0x86, 0x81, 0x8f, 0xbd, 0xa7,
+    0x88, 0x79, 0x8e, 0x92, 0x86, 0x88, 0xad, 0x8a, 0x7b, 0x7f, 0x80, 0xad,
+    0x7a, 0xaf, 0x8a, 0x93, 0xa6, 0x84, 0x92, 0x8e, 0x84, 0x99, 0x80, 0xae,
+    0x74, 0x7c, 0x95, 0x9c, 0x7b, 0x84, 0x84, 0x84, 0xa4, 0x82, 0x57, 0xb5,
+    0x95, 0xc1, 0xb7, 0xa0, 0x85, 0x7b, 0x69, 0xc3, 0xb1, 0x8e, 0xa0, 0x8e,
+    0x81, 0x88, 0x78, 0x9e, 0x81, 0x97, 0xb2, 0x74, 0x81, 0x84, 0x91, 0x87,
+    0x6f, 0x6f, 0x75, 0x78, 0x92, 0x7a, 0x6d, 0x80, 0x9a, 0x7e, 0x81, 0xa1,
+    0xa8, 0x6d, 0xb5, 0x98, 0xb4, 0x7f, 0x9a, 0xa4, 0x9d, 0x7b, 0xba, 0xaa,
+    0xce, 0x93, 0x79, 0xa5, 0x81, 0x95, 0xa6, 0x7f, 0x8c, 0x8b, 0x96, 0xa4,
+    0xa1, 0x8d, 0x91, 0x97, 0xce, 0x8e, 0x8e, 0x9d, 0x86, 0x7f, 0x97, 0xa3,
+    0x99, 0x75, 0xa3, 0xa0, 0x69, 0x6a, 0x87, 0xa0, 0x9a, 0x80, 0xa2, 0x72,
+    0x6d, 0x85, 0x6b, 0x94, 0x8d, 0x77, 0x9f, 0x84, 0x7f, 0x92, 0x64, 0xaa,
+    0x78, 0x82, 0xa7, 0x8f, 0x84, 0x79, 0x84, 0xb9, 0x92, 0x7c, 0xb6, 0x96,
+    0x9c, 0x99, 0x8f, 0xab, 0xab, 0x8a, 0xa2, 0xab, 0x6d, 0x97, 0x7b, 0xb1,
+    0x9e, 0x6c, 0x9a, 0x99, 0xaa, 0xa3, 0x70, 0x80, 0x81, 0x6f, 0xb6, 0x95,
+    0x93, 0x93, 0x8e, 0x80, 0x86, 0xb0, 0x87, 0x91, 0x8f, 0x8c, 0xa4, 0x86,
+    0x89, 0x8f, 0x93, 0x83, 0x75, 0x7d, 0x9b, 0x86, 0x7d, 0x5a, 0x9d, 0x67,
+    0x9f, 0x78, 0x5c, 0xa5, 0x8e, 0xa2, 0xc1, 0x95, 0x89, 0x84, 0x53, 0xd1,
+    0x7d, 0x9b, 0xc0, 0x8f, 0x73, 0x7f, 0x85, 0x9e, 0x8a, 0x7b, 0xa6, 0x84,
+    0x6c, 0x74, 0x95, 0x93, 0x7a, 0x7a, 0x81, 0x7d, 0x89, 0x86, 0x76, 0x8a,
+    0xad, 0x66, 0x90, 0x90, 0x9d, 0x77, 0xb4, 0xad, 0xac, 0x8e, 0xb3, 0xa5,
+    0x9d, 0x91, 0xd7, 0x94, 0xba, 0x8b, 0x72, 0xa4, 0x93, 0x7e, 0xa7, 0x86,
+    0xae, 0x83, 0x63, 0xa6, 0xa0, 0x78, 0x81, 0x8b, 0xc4, 0x82, 0x8f, 0x98,
+    0xa1, 0x8f, 0x79, 0x9a, 0x92, 0x85, 0x9d, 0x91, 0x92, 0x84, 0x8f, 0x84,
+    0x91, 0x6d, 0x7b, 0x69, 0x75, 0x87, 0x5d, 0x99, 0x92, 0x83, 0xab, 0x8f,
+    0x53, 0x90, 0x7b, 0xa0, 0x71, 0x89, 0xc2, 0x7f, 0x6a, 0x7c, 0x86, 0xb2,
+    0x8d, 0x89, 0xaf, 0x9c, 0x81, 0x8c, 0x84, 0xbe, 0x93, 0x9c, 0xa8, 0x97,
+    0x68, 0x9b, 0x84, 0xa3, 0x8a, 0x77, 0xa5, 0x79, 0x7b, 0x87, 0x86, 0xa5,
+    0x80, 0x83, 0x9e, 0x8d, 0xb1, 0x94, 0x7a, 0x8b, 0xa6, 0xa8, 0x80, 0x98,
+    0x8c, 0x73, 0xa9, 0x7b, 0x91, 0x8f, 0x71, 0x82, 0x68, 0x84, 0xa5, 0x96,
+    0x67, 0x63, 0xa6, 0x71, 0xa7, 0x85, 0x57, 0x9f, 0x91, 0xb2, 0xa6, 0x87,
+    0x80, 0x8f, 0x6a, 0xba, 0x9d, 0xb7, 0xb9, 0x8b, 0x75, 0x7c, 0x6f, 0x9f,
+    0x74, 0x8d, 0xaf, 0x6e, 0x7c, 0x65, 0x6c, 0x8a, 0x7c, 0x81, 0x89, 0x77,
+    0x8b, 0x74, 0x65, 0x9b, 0xa5, 0x6b, 0x92, 0x71, 0xbb, 0x70, 0x99, 0xbf,
+    0xb0, 0x7b, 0x92, 0xb4, 0xa4, 0x84, 0xc4, 0x92, 0xa8, 0x94, 0x7e, 0xcd,
+    0x83, 0x87, 0xaf, 0xa0, 0xa5, 0x94, 0x72, 0xb9, 0x90, 0xa6, 0x9e, 0x9e,
+    0x9b, 0x7a, 0x68, 0xc0, 0x8f, 0x89, 0x72, 0x94, 0x9b, 0x81, 0x81, 0x91,
+    0x88, 0x90, 0xa8, 0x8d, 0x90, 0x78, 0x7c, 0x67, 0x64, 0x8e, 0x55, 0xa1,
+    0x6d, 0x86, 0xa3, 0x6f, 0x5c, 0x7d, 0x79, 0xa3, 0x64, 0x71, 0xd4, 0x87,
+    0x73, 0x85, 0x76, 0xc7, 0x72, 0x86, 0xb2, 0x8c, 0x7b, 0x8d, 0x96, 0xc3,
+    0xad, 0x87, 0xac, 0xa8, 0x84, 0x94, 0x7b, 0xbf, 0x83, 0x74, 0x8e, 0x8c,
+    0x9c, 0x99, 0x88, 0x8e, 0x86, 0x88, 0xae, 0x7f, 0x70, 0x96, 0x6f, 0x74,
+    0x8f, 0x85, 0x7c, 0x86, 0x97, 0x83, 0xa0, 0x6a, 0x8b, 0x82, 0x88, 0x90,
+    0x72, 0x84, 0x9b, 0xa1, 0x6f, 0x72, 0xa4, 0x95, 0xa6, 0x7d, 0x65, 0xbd,
+    0x90, 0xb6, 0x9e, 0x98, 0xa1, 0x94, 0x66, 0xb3, 0x9c, 0xb3, 0xa7, 0x7f,
+    0x91, 0x69, 0x6e, 0xb1, 0x68, 0x7a, 0xaa, 0x91, 0x7c, 0x71, 0x9f, 0x95,
+    0x83, 0x86, 0x76, 0x69, 0x9b, 0x7f, 0x8c, 0x94, 0x9c, 0x89, 0x86, 0x93,
+    0xc1, 0x79, 0x98, 0x9e, 0xb1, 0x90, 0x9b, 0xb7, 0xab, 0x86, 0xc6, 0xa1,
+    0xa9, 0xaa, 0x86, 0xb0, 0x8b, 0x79, 0xb9, 0x85, 0xbe, 0x92, 0x60, 0xc0,
+    0x9f, 0x9a, 0x90, 0x8d, 0xb5, 0x77, 0x95, 0xad, 0x8b, 0x93, 0x8a, 0x93,
+    0x93, 0x7e, 0x86, 0xa6, 0x7d, 0x89, 0x6b, 0x81, 0x93, 0x75, 0x7f, 0x86,
+    0x66, 0x8f, 0x56, 0x8f, 0x84, 0x75, 0x9e, 0x77, 0x78, 0x89, 0x62, 0xb3,
+    0x78, 0x76, 0xb5, 0x92, 0x7f, 0x80, 0x7a, 0xb9, 0x7d, 0x80, 0xc2, 0xb9,
+    0x7d, 0x8f, 0x8f, 0x8c, 0xa0, 0x78, 0xa2, 0xaf, 0x68, 0x98, 0x77, 0xac,
+    0x96, 0x77, 0x96, 0x99, 0x84, 0xb1, 0x72, 0x8e, 0x96, 0xa4, 0xa9, 0x8e,
+    0x84, 0x7b, 0x85, 0x8d, 0x8f, 0x83, 0x83, 0x7f, 0x85, 0x6e, 0xa4, 0x98,
+    0xab, 0x83, 0x90, 0x8e, 0x77, 0x8e, 0xab, 0x9c, 0x73, 0x79, 0x8d, 0x6e,
+    0xa0, 0x97, 0x68, 0xa7, 0x8a, 0xbd, 0x95, 0x96, 0x96, 0x8b, 0x72, 0xc7,
+    0x8d, 0x8c, 0xa5, 0x83, 0x9b, 0x8b, 0x6c, 0xac, 0x62, 0x78, 0xae, 0x78,
+    0x71, 0x7a, 0x8d, 0xae, 0x91, 0x87, 0x90, 0x82, 0x9b, 0x83, 0x90, 0x97,
+    0xb0, 0x96, 0x82, 0xa5, 0xa9, 0x76, 0xa5, 0xa0, 0xac, 0xa1, 0x93, 0x94,
+    0xb7, 0x91, 0xbb, 0x9b, 0xa4, 0xa5, 0x8c, 0xb5, 0x95, 0x7b, 0x92, 0x91,
+    0xb0, 0x97, 0x73, 0xb9, 0x86, 0xa7, 0x92, 0x98, 0x9e, 0x70, 0x77, 0xba,
+    0x96, 0x7b, 0xa6, 0x86, 0x97, 0x85, 0x8e, 0xaa, 0x93, 0x97, 0x8f, 0x8b,
+    0x8d, 0x79, 0x84, 0x7e, 0x70, 0x95, 0x52, 0x8f, 0x62, 0x75, 0x8b, 0x8b,
+    0x7b, 0x8b, 0x79, 0xaf, 0x90, 0x6d, 0xc8, 0x8d, 0x84, 0x8c, 0x72, 0xaf,
+    0x70, 0x8d, 0xa5, 0x8a, 0x76, 0x97, 0x87, 0x8e, 0xa9, 0x83, 0xb2, 0x8d,
+    0x7e, 0x9b, 0x76, 0xc2, 0xa2, 0x72, 0xc5, 0x87, 0x75, 0xb7, 0x92, 0x95,
+    0x9e, 0xa0, 0xc3, 0x82, 0x8d, 0x8f, 0x7d, 0x85, 0x90, 0x99, 0x7b, 0x82,
+    0x87, 0x87, 0xa0, 0x87, 0x9a, 0x8b, 0xa2, 0xa4, 0x67, 0x93, 0xa5, 0xbb,
+    0x73, 0x5f, 0x8c, 0x60, 0xa5, 0x7d, 0x6c, 0xb3, 0xb2, 0xb3, 0xa9, 0xa9,
+    0x8d, 0x8d, 0x67, 0xd7, 0x63, 0x99, 0xaa, 0x83, 0x88, 0x6a, 0x6f, 0x9e,
+    0x5e, 0x9e, 0x9d, 0x81, 0x84, 0x6e, 0x98, 0x90, 0x89, 0x7c, 0x95, 0x7d,
+    0x81, 0x8a, 0xa2, 0x8c, 0x92, 0x85, 0x80, 0x92, 0xac, 0x80, 0x9b, 0x9b,
+    0xc3, 0x8c, 0x95, 0xbc, 0xaa, 0x7c, 0xb5, 0x8d, 0xa1, 0xb8, 0x70, 0xb6,
+    0x8c, 0x92, 0xa8, 0x8e, 0xa3, 0x76, 0x6c, 0xbe, 0xa0, 0x8c, 0x92, 0x8e,
+    0xa1, 0x83, 0x76, 0xb2, 0x91, 0x7b, 0x8e, 0x87, 0x7f, 0x89, 0x8a, 0xa1,
+    0x91, 0xa0, 0x7a, 0x95, 0x7b, 0x86, 0x99, 0x92, 0x78, 0x8a, 0x62, 0x9e,
+    0x7b, 0x7b, 0x89, 0x79, 0x78, 0x87, 0x82, 0x94, 0x7d, 0x91, 0x96, 0x79,
+    0x7b, 0x8d, 0x80, 0xa7, 0x88, 0x95, 0xa6, 0x8f, 0x7d, 0x95, 0x79, 0xa2,
+    0x91, 0x9b, 0x9d, 0x90, 0x79, 0xa4, 0x88, 0x98, 0x9b, 0x7a, 0xa5, 0x7f,
+    0x71, 0x9c, 0x87, 0x96, 0x8c, 0x8f, 0xbc, 0x74, 0x95, 0x99, 0x7f, 0x78,
+    0x8c, 0x63, 0x7c, 0x7a, 0x92, 0x8c, 0xa8, 0x78, 0xa8, 0x89, 0x9a, 0x86,
+    0x69, 0x7e, 0xa1, 0xc3, 0x57, 0x68, 0x84, 0x89, 0xa9, 0x8d, 0x6f, 0xa9,
+    0x8a, 0xab, 0xa5, 0xad, 0x94, 0x83, 0x6b, 0xa7, 0x7e, 0x95, 0x9b, 0x7f,
+    0x8b, 0x78, 0x73, 0x90, 0x65, 0x8d, 0xb1, 0x91, 0x84, 0x65, 0x90, 0xb4,
+    0x8c, 0x89, 0x94, 0x7c, 0x99, 0x8b, 0x98, 0xb7, 0xb0, 0x91, 0x9e, 0x88,
+    0xbd, 0xa0, 0xa4, 0xb9, 0xad, 0x96, 0x97, 0xa3, 0xb6, 0x81, 0xba, 0x9b,
+    0xbc, 0xa9, 0x94, 0xb9, 0xa0, 0x85, 0x8e, 0xa1, 0xac, 0x87, 0x65, 0xa6,
+    0x98, 0x8e, 0xaa, 0xa3, 0xa3, 0x7f, 0x79, 0xb4, 0x93, 0x76, 0x90, 0x99,
+    0x8b, 0x90, 0x84, 0xa6, 0x90, 0x8f, 0x88, 0xa6, 0x89, 0x83, 0x86, 0x7a,
+    0x5d, 0x96, 0x71, 0xa5, 0x64, 0x94, 0x9a, 0x85, 0x7c, 0xa1, 0x96, 0x9d,
+    0x76, 0x8f, 0x95, 0xa0, 0x7f, 0x8c, 0x80, 0xc7, 0x6c, 0x7d, 0xb7, 0xb2,
+    0x82, 0x8e, 0x82, 0xbd, 0xb3, 0x82, 0x99, 0x9b, 0x80, 0x94, 0x8c, 0x94,
+    0x94, 0x6b, 0xc6, 0xa9, 0x81, 0x9f, 0x8c, 0x7e, 0x87, 0x88, 0xb3, 0x7d,
+    0x88, 0x8c, 0x81, 0x81, 0x7e, 0x7e, 0x86, 0x87, 0x96, 0x85, 0xb4, 0x87,
+    0xab, 0x91, 0x8f, 0xa1, 0x72, 0x83, 0xa4, 0x89, 0x6b, 0x75, 0x85, 0x7c,
+    0x94, 0x85, 0x6f, 0xad, 0x91, 0xae, 0xa4, 0xa5, 0xa7, 0x8e, 0x6c, 0xb2,
+    0x73, 0x99, 0x96, 0x92, 0x89, 0x81, 0x7d, 0x88, 0x60, 0x8d, 0x94, 0x83,
+    0x99, 0x68, 0x86, 0xa2, 0x94, 0x8e, 0x82, 0x76, 0x89, 0x8d, 0x98, 0x86,
+    0x94, 0x90, 0x83, 0x7d, 0xad, 0x94, 0xa6, 0x90, 0xcb, 0x96, 0xa2, 0xb2,
+    0xb6, 0x89, 0xc4, 0x9d, 0xc7, 0xa5, 0x75, 0xc3, 0x92, 0x8c, 0x8e, 0xad,
+    0x96, 0x94, 0x8e, 0xab, 0x94, 0x90, 0xa8, 0x84, 0xb5, 0x84, 0x66, 0xce,
+    0x74, 0x8c, 0x93, 0x8d, 0x8f, 0x95, 0x8b, 0xa1, 0x7b, 0xa1, 0x79, 0x9e,
+    0x81, 0xa4, 0xa0, 0x98, 0x5f, 0x78, 0x8e, 0x97, 0x6f, 0x81, 0x96, 0x8d,
+    0x70, 0x93, 0x72, 0x9c, 0x7b, 0x98, 0x8b, 0x8a, 0x8f, 0x8b, 0x6c, 0xa9,
+    0x81, 0x99, 0xb3, 0xa3, 0x71, 0x9c, 0x8b, 0x94, 0xa6, 0x8a, 0xb8, 0xa0,
+    0x7b, 0x98, 0x74, 0x9f, 0x92, 0x92, 0xb2, 0x89, 0x81, 0xa8, 0x87, 0x97,
+    0x96, 0x86, 0xa4, 0x7b, 0x63, 0x8e, 0x86, 0x7d, 0x76, 0x81, 0x93, 0x94,
+    0x98, 0x8b, 0xaf, 0x6d, 0xab, 0x9b, 0x85, 0x9b, 0x91, 0x86, 0x95, 0x95,
+    0x65, 0x89, 0x9e, 0x6b, 0xa4, 0x82, 0x68, 0xb5, 0x8b, 0xd1, 0x9d, 0x93,
+    0x7d, 0x67, 0x5e, 0xba, 0x9b, 0x94, 0x93, 0x8d, 0x88, 0x73, 0x7c, 0x8e,
+    0x7d, 0x83, 0x9a, 0x82, 0xa4, 0x62, 0x9a, 0x8d, 0x86, 0xa0, 0x7b, 0x72,
+    0xa9, 0x84, 0xa7, 0x94, 0xb2, 0x98, 0x8f, 0x81, 0xbe, 0x84, 0x9d, 0x94,
+    0x9c, 0x9a, 0x94, 0x8f, 0xb1, 0x82, 0xb1, 0x82, 0xb1, 0xb2, 0x78, 0xa7,
+    0x95, 0x99, 0x8b, 0x8c, 0xb1, 0x81, 0x5b, 0xbb, 0x88, 0x7a, 0x90, 0xa3,
+    0x8d, 0x78, 0x6f, 0xbf, 0x8c, 0x93, 0xa1, 0x8e, 0x9f, 0x98, 0x88, 0xb3,
+    0x7e, 0x82, 0x8a, 0x8e, 0x7d, 0x8a, 0x96, 0x6a, 0x6c, 0x7b, 0x91, 0x94,
+    0x6f, 0x89, 0x9a, 0x84, 0x73, 0x8b, 0x8c, 0x91, 0x7d, 0x8e, 0x9e, 0x80,
+    0x88, 0x81, 0x78, 0xaf, 0x86, 0xa5, 0xa2, 0x8d, 0x6a, 0x8a, 0x75, 0xa1,
+    0x83, 0x87, 0xaf, 0x7d, 0x6c, 0xa3, 0x65, 0x77, 0x89, 0x91, 0x9a, 0xa1,
+    0xa1, 0xaf, 0x78, 0x94, 0x93, 0xb2, 0xaf, 0x92, 0x74, 0x7a, 0xa7, 0x7b,
+    0x8f, 0x9c, 0x86, 0x8d, 0x8f, 0x79, 0xb0, 0xb3, 0x97, 0x82, 0x8e, 0x92,
+    0x92, 0x81, 0xa7, 0xbc, 0x6e, 0x6e, 0x89, 0xa5, 0x9a, 0x8d, 0x84, 0xb6,
+    0x83, 0xae, 0xa5, 0xa7, 0xae, 0x86, 0x6b, 0xb9, 0x89, 0xb0, 0x8f, 0x82,
+    0x8f, 0x6f, 0x83, 0x98, 0x6a, 0x98, 0x9a, 0x85, 0x9f, 0x78, 0x93, 0x8d,
+    0x83, 0x88, 0x88, 0x7e, 0x97, 0x99, 0x8a, 0x9b, 0xb0, 0x90, 0x86, 0x88,
+    0xb5, 0x90, 0xb3, 0xaa, 0xad, 0x96, 0x93, 0xa3, 0x9d, 0x81, 0xa3, 0x9a,
+    0x9f, 0x99, 0x90, 0x9c, 0x9e, 0x8e, 0x88, 0x93, 0xa8, 0x94, 0x62, 0xa6,
+    0x94, 0x92, 0xa1, 0x86, 0xb7, 0x8a, 0x6a, 0xa6, 0x81, 0x7e, 0x7b, 0x80,
+    0x89, 0x8f, 0x74, 0xa6, 0x72, 0x91, 0xa6, 0x9b, 0x73, 0x97, 0x7e, 0x6f,
+    0x70, 0x8d, 0x73, 0x98, 0x80, 0x90, 0x8f, 0x7e, 0x83, 0x77, 0x84, 0x92,
+    0x7f, 0x8c, 0x91, 0xa6, 0x99, 0x90, 0x9d, 0xb1, 0x88, 0x85, 0x89, 0x85,
+    0x7c, 0x9f, 0x7e, 0xb0, 0xaa, 0x84, 0xa0, 0x8e, 0x74, 0x93, 0x78, 0x90,
+    0x9a, 0x8b, 0x8e, 0x97, 0x8f, 0x9f, 0x7c, 0x83, 0x8a, 0x88, 0xa5, 0x8f,
+    0x8b, 0x74, 0x84, 0x9a, 0x7f, 0x91, 0x88, 0x77, 0x9c, 0x91, 0xbc, 0x93,
+    0x9c, 0x82, 0x89, 0x9b, 0x8a, 0x7d, 0xb7, 0xb8, 0x6f, 0x68, 0xb5, 0x8e,
+    0xb4, 0x86, 0x8c, 0xb3, 0x94, 0xb6, 0xa4, 0x93, 0x98, 0x8b, 0x70, 0xb3,
+    0x96, 0xaa, 0x87, 0x89, 0x99, 0x68, 0x74, 0xa4, 0x69, 0x9e, 0x8e, 0x6b,
+    0x9f, 0x6b, 0x95, 0x9c, 0x88, 0x89, 0x8a, 0x86, 0x8d, 0x75, 0x94, 0x88,
+    0xa0, 0x94, 0x77, 0x8c, 0x9c, 0x8d, 0x8e, 0xa4, 0xac, 0xa7, 0x8a, 0x9b,
+    0xa9, 0x81, 0xab, 0xac, 0xaf, 0xaf, 0x87, 0xbb, 0x9b, 0x95, 0x8e, 0x9e,
+    0x9f, 0xa1, 0x6c, 0xb4, 0x98, 0x8f, 0x81, 0x8d, 0x98, 0x8f, 0x78, 0x96,
+    0x89, 0x86, 0x6c, 0x91, 0x8d, 0x9f, 0x95, 0x9f, 0x6b, 0x7f, 0x93, 0x7c,
+    0x96, 0x8e, 0x8a, 0x58, 0x80, 0x8e, 0x7a, 0x93, 0x8b, 0x78, 0x99, 0x92,
+    0x62, 0x8e, 0x83, 0x8e, 0x87, 0x83, 0x86, 0x99, 0x93, 0x92, 0x80, 0x95,
+    0xa2, 0x72, 0xa2, 0x97, 0x78, 0x87, 0x7b, 0xa3, 0x99, 0x78, 0x98, 0x9c,
+    0x80, 0x9b, 0x5e, 0x8a, 0x9c, 0x99, 0xa6, 0x7a, 0x8e, 0x99, 0x7a, 0x8e,
+    0x8b, 0x76, 0x9b, 0x89, 0x80, 0x8e, 0x83, 0x8a, 0x80, 0x7c, 0x80, 0x74,
+    0x95, 0x8c, 0xbf, 0x7e, 0xa8, 0x7a, 0x99, 0x7d, 0x7d, 0x73, 0xb4, 0xae,
+    0x88, 0x76, 0xae, 0x78, 0xaa, 0x65, 0x94, 0xbe, 0x97, 0xaf, 0xa4, 0x91,
+    0x9c, 0x95, 0x6c, 0xbe, 0x82, 0xb1, 0x9b, 0x91, 0x85, 0x7d, 0x66, 0x9c,
+    0x99, 0xbd, 0xa3, 0x88, 0xa8, 0x73, 0x81, 0x94, 0x92, 0x8e, 0x90, 0x8d,
+    0xaf, 0x75, 0x86, 0x9b, 0x8b, 0x8b, 0x8d, 0x74, 0xbd, 0x85, 0x97, 0x8b,
+    0x9d, 0xba, 0x90, 0xa8, 0x9d, 0x72, 0xa5, 0xa8, 0xbf, 0xbb, 0x7b, 0xb6,
+    0xad, 0x94, 0x6f, 0x9a, 0xa7, 0x97, 0x78, 0x9c, 0x98, 0x8d, 0x8c, 0x93,
+    0xb8, 0xa8, 0x7f, 0x9d, 0x98, 0x7f, 0x8f, 0x8a, 0x8d, 0xa8, 0x86, 0x7b,
+    0x5d, 0x89, 0x8a, 0x83, 0x8c, 0x8b, 0x81, 0x56, 0x7c, 0x87, 0x89, 0xa6,
+    0x75, 0x7c, 0x92, 0x74, 0x96, 0x92, 0x78, 0x8d, 0x8d, 0x98, 0xae, 0x7a,
+    0x95, 0x8f, 0x8b, 0x9c, 0x95, 0x9f, 0xae, 0x93, 0x7b, 0x93, 0x8c, 0x9a,
+    0x79, 0x74, 0x94, 0x6e, 0x7e, 0x8f, 0x64, 0x9f, 0x9c, 0x88, 0x8f, 0x8e,
+    0x84, 0x8d, 0x89, 0x95, 0x96, 0x8f, 0x9d, 0x60, 0x85, 0x86, 0x7c, 0x93,
+    0x8d, 0x68, 0x83, 0x7c, 0x94, 0x87, 0xb8, 0xa2, 0x9d, 0x82, 0x8e, 0x84,
+    0x6c, 0x73, 0xa8, 0xbc, 0x84, 0x85, 0xa2, 0x79, 0x92, 0x64, 0x69, 0xa9,
+    0x82, 0xa7, 0x9d, 0x95, 0x8e, 0x6f, 0x9f, 0xa7, 0x97, 0xb1, 0x9d, 0x8e,
+    0xa1, 0x70, 0x80, 0x9e, 0x8e, 0x91, 0xa0, 0xaa, 0x81, 0x5b, 0x98, 0x8f,
+    0xa0, 0xaa, 0x83, 0x7a, 0x91, 0x7a, 0x73, 0x80, 0xa6, 0x9a, 0x80, 0x7d,
+    0x9e, 0x75, 0x7b, 0xa3, 0xad, 0x92, 0x98, 0xc0, 0xa1, 0x80, 0x88, 0xa2,
+    0xa5, 0xa4, 0x7e, 0x9b, 0xa0, 0x80, 0x6e, 0xa0, 0x9f, 0xa3, 0x8a, 0x8f,
+    0xa2, 0x93, 0x86, 0x8d, 0x8f, 0x93, 0x7e, 0x90, 0x98, 0x83, 0x7d, 0x9b,
+    0x9f, 0x9a, 0x97, 0x83, 0x6e, 0x8d, 0x94, 0x6c, 0x7b, 0x7f, 0x73, 0x65,
+    0x6a, 0x93, 0x8a, 0x94, 0x83, 0x89, 0x7d, 0x7b, 0x77, 0x8a, 0x7a, 0x9b,
+    0x8e, 0x8d, 0x94, 0x89, 0x86, 0x83, 0x7c, 0x8e, 0x8b, 0x90, 0xab, 0x99,
+    0x81, 0x8e, 0x77, 0x9c, 0x8c, 0x82, 0x97, 0x8f, 0x78, 0x91, 0x5f, 0xa1,
+    0x8b, 0x83, 0xa9, 0x8d, 0x7b, 0x97, 0x77, 0x80, 0x84, 0x7e, 0x9e, 0x75,
+    0xa3, 0x86, 0x67, 0x7c, 0x80, 0x6d, 0x77, 0x75, 0x88, 0x75, 0xad, 0x7a,
+    0x93, 0x89, 0x8c, 0x87, 0x7a, 0x79, 0xb2, 0xa1, 0x69, 0x80, 0xb5, 0x7a,
+    0xa6, 0x7b, 0x95, 0xac, 0x95, 0xa9, 0x98, 0xa4, 0xad, 0x83, 0x8d, 0xbe,
+    0xa4, 0x98, 0xad, 0x7d, 0x8b, 0x65, 0x65, 0xad, 0x6a, 0xae, 0xa3, 0xa8,
+    0x9c, 0x63, 0x90, 0x91, 0x6d, 0x9a, 0x81, 0x98, 0x86, 0x6a, 0x83, 0x84,
+    0x94, 0x9c, 0x77, 0x86, 0xc2, 0x7f, 0x9b, 0xa9, 0xad, 0xae, 0xa7, 0xa6,
+    0xd4, 0x70, 0x9d, 0xb5, 0xaa, 0xdb, 0x8f, 0xa3, 0xa5, 0x87, 0x88, 0x9e,
+    0xa9, 0x9f, 0x62, 0xa7, 0xa2, 0x8e, 0x7d, 0x8a, 0x9d, 0xa2, 0x6b, 0xa7,
+    0x96, 0x6d, 0x76, 0x8c, 0x9b, 0x8c, 0x86, 0x86, 0x93, 0x7c, 0x9d, 0x7c,
+    0x7e, 0x93, 0x5c, 0x79, 0x76, 0x8c, 0x8a, 0x87, 0x79, 0x97, 0x9a, 0x7a,
+    0x85, 0x8c, 0x7f, 0x85, 0x7a, 0xa1, 0xa7, 0x72, 0x87, 0x7f, 0x96, 0x9e,
+    0x92, 0x92, 0x9e, 0xa0, 0x72, 0x99, 0x7a, 0xb0, 0x8c, 0x8d, 0xa3, 0x9b,
+    0x91, 0xa6, 0x63, 0x94, 0x8b, 0x81, 0xbb, 0x94, 0x79, 0x95, 0x99, 0x9a,
+    0xa0, 0x7a, 0x96, 0x72, 0x82, 0x9a, 0x83, 0x7f, 0x72, 0x7f, 0x6d, 0x75,
+    0x91, 0x7f, 0xbc, 0x84, 0x9a, 0x81, 0x95, 0x69, 0x7d, 0x6d, 0xa2, 0xa8,
+    0x7e, 0x64, 0xac, 0x86, 0x85, 0x6d, 0x99, 0xaa, 0x7e, 0x79, 0x9c, 0xa0,
+    0xa4, 0x77, 0x99, 0xac, 0xa8, 0x8d, 0xb7, 0xa2, 0xa3, 0x61, 0x82, 0x98,
+    0x84, 0x8e, 0xa1, 0x8c, 0x88, 0x82, 0x6f, 0x7d, 0x88, 0x80, 0x7a, 0x8a,
+    0x8c, 0x6d, 0x87, 0x6f, 0xab, 0x8f, 0x8b, 0x76, 0xa0, 0x7d, 0x9f, 0xab,
+    0xb0, 0xb8, 0x9c, 0x8d, 0xb8, 0x81, 0x89, 0x94, 0xa8, 0xc8, 0x92, 0x9b,
+    0x8d, 0x83, 0x7b, 0xaf, 0x97, 0x94, 0x6e, 0xa5, 0x9b, 0x97, 0x89, 0x8d,
+    0xaa, 0x8a, 0x66, 0x88, 0x93, 0x84, 0xa1, 0x88, 0xa0, 0x99, 0x85, 0x89,
+    0x7d, 0x84, 0x8b, 0x6a, 0x92, 0xa1, 0x74, 0x76, 0x73, 0x87, 0x7a, 0x9a,
+    0x77, 0x86, 0x89, 0x5f, 0x7f, 0x8b, 0x7f, 0x8d, 0x7e, 0x81, 0x95, 0x8a,
+    0x7d, 0x85, 0x74, 0x9a, 0x87, 0x8c, 0x9e, 0xae, 0x80, 0x88, 0x7d, 0x8b,
+    0xaa, 0x79, 0x7c, 0x97, 0x79, 0x90, 0x7b, 0x97, 0x97, 0x9f, 0xa1, 0xa2,
+    0xab, 0x97, 0x69, 0x7a, 0x8d, 0x9f, 0x9f, 0x89, 0x90, 0x8c, 0x66, 0x98,
+    0x6e, 0x86, 0x7b, 0x6e, 0x86, 0x8a, 0xb2, 0xa6, 0x93, 0x7d, 0x8c, 0x81,
+    0x7e, 0x84, 0xa6, 0xb6, 0x83, 0x92, 0xa0, 0x88, 0x90, 0x5f, 0x7c, 0x92,
+    0x98, 0x94, 0x92, 0x98, 0xa7, 0x65, 0x90, 0xa2, 0xa2, 0x9b, 0xa6, 0x7d,
+    0x8b, 0x5a, 0x94, 0x95, 0x9b, 0xa5, 0x99, 0xa5, 0x7e, 0x61, 0x9a, 0x7a,
+    0x8b, 0x77, 0x87, 0x76, 0x9d, 0x72, 0x9a, 0x84, 0x98, 0x94, 0x92, 0x73,
+    0xae, 0x78, 0x8e, 0xaa, 0xa0, 0xc3, 0x7a, 0xa4, 0xa0, 0x75, 0xa9, 0xae,
+    0x8c, 0xd6, 0x87, 0x8f, 0x9f, 0x8c, 0x9b, 0x90, 0x99, 0x97, 0x73, 0x8f,
+    0x9b, 0x9c, 0x8c, 0x89, 0xa5, 0x84, 0x8f, 0x7b, 0x8b, 0x7f, 0x97, 0x98,
+    0x8d, 0x7b, 0x94, 0x9d, 0x9c, 0x8e, 0x92, 0x89, 0x88, 0x8d, 0x6c, 0x63,
+    0x73, 0x81, 0x72, 0x8a, 0x88, 0x8a, 0x9f, 0x79, 0x81, 0x82, 0x9a, 0xa9,
+    0x7a, 0x92, 0x7d, 0x76, 0x7b, 0x7a, 0x6a, 0xbe, 0x91, 0x7d, 0x86, 0xad,
+    0x84, 0x86, 0x6c, 0x91, 0x91, 0x9f, 0x92, 0x6b, 0x95, 0x98, 0x84, 0xa0,
+    0x8f, 0x8b, 0x9e, 0x7f, 0x9f, 0x97, 0x7e, 0x87, 0x80, 0x9e, 0x79, 0x8d,
+    0x68, 0x87, 0x88, 0x7d, 0x89, 0x81, 0x6d, 0x85, 0x80, 0x82, 0xa0, 0x97,
+    0xa3, 0x72, 0x94, 0x74, 0x8e, 0x56, 0x96, 0x98, 0x91, 0x6f, 0xa0, 0xae,
+    0x7c, 0x6e, 0x8e, 0xa9, 0x7c, 0x80, 0x87, 0xa3, 0x9e, 0x57, 0x8e, 0xb5,
+    0x87, 0xa6, 0x87, 0x79, 0x8f, 0x55, 0x8a, 0x81, 0x97, 0x6c, 0x9b, 0x99,
+    0x78, 0x5c, 0x82, 0x80, 0x91, 0x76, 0x80, 0x91, 0x8b, 0x65, 0x89, 0x7d,
+    0xa9, 0x95, 0x89, 0x97, 0x96, 0x6a, 0x89, 0xad, 0x92, 0x9f, 0xb6, 0x82,
+    0x88, 0x79, 0x9d, 0xa5, 0x9c, 0xae, 0x9a, 0x93, 0x77, 0x8e, 0x8a, 0xb5,
+    0x84, 0xb0, 0x76, 0xa2, 0x89, 0xa0, 0x96, 0x7a, 0xa5, 0x8e, 0x7e, 0x74,
+    0x8d, 0x89, 0x89, 0x9e, 0x93, 0x95, 0x90, 0x78, 0x93, 0x8f, 0xa5, 0x7c,
+    0x9d, 0x7c, 0x77, 0x85, 0x81, 0x92, 0x7c, 0x87, 0x92, 0x82, 0x98, 0xa3,
+    0x63, 0x76, 0x9b, 0x91, 0x7b, 0x8e, 0x97, 0x7e, 0x66, 0x90, 0x63, 0xb4,
+    0x71, 0x88, 0x86, 0x8e, 0x6f, 0x89, 0x7a, 0x88, 0x93, 0x7f, 0x96, 0xa8,
+    0x7d, 0x88, 0x88, 0x86, 0x7b, 0x91, 0x88, 0x6b, 0xa6, 0x8b, 0x69, 0x78,
+    0x82, 0x80, 0x83, 0x6b, 0xaf, 0x81, 0x7b, 0x64, 0x8f, 0x78, 0x6e, 0x7f,
+    0x86, 0x91, 0x92, 0xa3, 0xa0, 0x97, 0x82, 0x88, 0x92, 0x90, 0x9e, 0x89,
+    0x9d, 0x7b, 0x96, 0x82, 0xa3, 0x8c, 0x7f, 0x84, 0x7a, 0x6c, 0x60, 0x85,
+    0xa9, 0x74, 0x83, 0xa2, 0x89, 0x87, 0x9b, 0x77, 0x9b, 0x9a, 0x99, 0x84,
+    0x7c, 0x9c, 0x8d, 0x90, 0x8d, 0x7b, 0x74, 0x77, 0x93, 0x8c, 0x6c, 0x8b,
+    0x85, 0x78, 0x7f, 0x7d, 0x75, 0x7f, 0x7e, 0x85, 0x8f, 0x7d, 0x62, 0x8c,
+    0x7c, 0xad, 0x7f, 0x83, 0xa1, 0xa1, 0x97, 0x7b, 0x72, 0x82, 0x9d, 0x81,
+    0x94, 0x81, 0x8d, 0x9f, 0x6f, 0x8f, 0x9d, 0x89, 0x6a, 0x7e, 0x7f, 0x7f,
+    0x8d, 0x7e, 0x91, 0x86, 0x7d, 0x8a, 0x7e, 0x70, 0x7b, 0x9b, 0x6e, 0x5f,
+    0xa8, 0x7a, 0x73, 0x8a, 0x7a, 0x71, 0x90, 0x95, 0x8d, 0x78, 0x7b, 0x72,
+    0x5e, 0x89, 0x62, 0xa1, 0x87, 0x7f, 0x83, 0x75, 0x98, 0x7f, 0x76, 0x72,
+    0x8f, 0x9b, 0x7a, 0x8b, 0xa1, 0x7f, 0x60, 0x99, 0x96, 0x6e, 0x67, 0x76,
+    0x88, 0x98, 0x6c, 0x7b, 0x9b, 0x8d, 0x5f, 0x89, 0x7c, 0x81, 0x79, 0x86,
+    0x69, 0x9e, 0x83, 0x65, 0x8e, 0x82, 0x83, 0x89, 0x85, 0x7f, 0x90, 0x80,
+    0xa2, 0x81, 0x85, 0x83, 0x8e, 0x94, 0x94, 0x75, 0x86, 0x87, 0x9a, 0xb2,
+    0x82, 0x99, 0x85, 0x7f, 0x8c, 0x7e, 0x81, 0x9a, 0x81, 0x7d, 0x87, 0x81,
+    0xa3, 0x8c, 0x8d, 0x85, 0x8d, 0x96, 0x86, 0x7c, 0xa7, 0x87, 0x7e, 0x9d,
+    0x63, 0xa8, 0x7c, 0x97, 0xa2, 0xa4, 0x7e, 0x87, 0x93, 0x9e, 0x89, 0x8d,
+    0x6b, 0x6d, 0x9d, 0x9b, 0x78, 0x8a, 0x8e, 0x7f, 0x7b, 0xa5, 0x6e, 0x8c,
+    0x89, 0x88, 0x73, 0x7e, 0x77, 0x9d, 0xa6, 0xa7, 0x77, 0x87, 0x7e, 0x7e,
+    0x97, 0x84, 0x6b, 0x59, 0x60, 0x90, 0x85, 0x76, 0x8f, 0x61, 0x7f, 0x94,
+    0x8f, 0x84, 0x8b, 0x7f, 0x73, 0x77, 0x73, 0x71, 0x8a, 0x9b, 0x7b, 0x89,
+    0x97, 0x8f, 0x76, 0x63, 0xa3, 0xa1, 0x6b, 0x7c, 0x62, 0x95, 0x8e, 0xa3,
+    0x9f, 0x89, 0x8f, 0x7f, 0x92, 0x7c, 0xa2, 0xa4, 0xa6, 0x92, 0x89, 0x93,
+    0x74, 0x73, 0x73, 0x96, 0xad, 0x9b, 0x87, 0xac, 0x91, 0x8a, 0xa0, 0x70,
+    0x70, 0x7e, 0x8f, 0x74, 0x75, 0xaf, 0x8d, 0x82, 0x8e, 0x82, 0x96, 0x7d,
+    0x69, 0x9c, 0x64, 0xa2, 0x82, 0x89, 0x83, 0x9d, 0x83, 0x88, 0x62, 0x92,
+    0x72, 0x89, 0x6d, 0x7f, 0x92, 0x70, 0x8e, 0x80, 0x7e, 0x8d, 0x91, 0x85,
+    0x8d, 0x89, 0x83, 0x96, 0x90, 0x96, 0x9c, 0xa6, 0x8a, 0x73, 0x89, 0x79,
+    0xa9, 0x70, 0x80, 0x78, 0x96, 0x80, 0x7b, 0x85, 0xa5, 0x80, 0x93, 0x95,
+    0xc5, 0x74, 0x81, 0x88, 0xa2, 0x93, 0x86, 0x9c, 0xa3, 0x6d, 0x92, 0x8a,
+    0x92, 0x99, 0x98, 0x65, 0xad, 0x63, 0x9d, 0x95, 0x99, 0x89, 0x7f, 0x7a,
+    0x99, 0x91, 0x7f, 0x78, 0x90, 0x8f, 0x80, 0x85, 0xa1, 0x68, 0x9d, 0x6c,
+    0x83, 0x8f, 0x7c, 0x5e, 0x99, 0x7b, 0x80, 0x91, 0x66, 0x8a, 0x92, 0xb3,
+    0x7a, 0x99, 0x91, 0x7e, 0x7d, 0x96, 0x69, 0x9e, 0x7c, 0x89, 0xad, 0x8f,
+    0x9d, 0x90, 0x85, 0x8e, 0x72, 0xa9, 0x89, 0x83, 0x7c, 0x82, 0x70, 0x82,
+    0x6b, 0x79, 0x75, 0x8d, 0x77, 0x9b, 0x7c, 0x8f, 0x8a, 0x95, 0x87, 0x9f,
+    0x7c, 0x90, 0x87, 0x70, 0x83, 0x83, 0x98, 0x9f, 0x85, 0x86, 0x8d, 0x81,
+    0x87, 0x87, 0x87, 0x9d, 0x8f, 0x9d, 0x7c, 0x98, 0xa2, 0xac, 0x88, 0x93,
+    0x88, 0x7d, 0x9b, 0x76, 0x82, 0x67, 0x69, 0x7f, 0x8c, 0x8d, 0x94, 0x7d,
+    0x7b, 0xae, 0x8c, 0x85, 0x8b, 0xa7, 0x8c, 0x87, 0x96, 0x7d, 0x8b, 0x90,
+    0x90, 0x7c, 0x92, 0xa8, 0x81, 0x87, 0xa4, 0xa4, 0x82, 0x8b, 0x8d, 0x89,
+    0x8f, 0x70, 0x9d, 0x7f, 0xa0, 0x84, 0x99, 0x65, 0x99, 0x78, 0x94, 0x8b,
+    0xc5, 0x8d, 0x8d, 0x55, 0xb3, 0x8d, 0x78, 0x93, 0xb4, 0x6d, 0x84, 0x90,
+    0xd5, 0x76, 0x7a, 0x9e, 0xc8, 0x8f, 0x86, 0x8a, 0xaa, 0x8b, 0x7f, 0x90,
+    0xaa, 0x95, 0x9c, 0x81, 0xb4, 0x6b, 0x64, 0x8a, 0x99, 0x84, 0x74, 0x6e,
+    0x95, 0x75, 0x98, 0x92, 0x9a, 0x91, 0x8c, 0x7d, 0x88, 0x6e, 0x89, 0x7d,
+    0x87, 0x80, 0x8e, 0x86, 0x78, 0x9f, 0x96, 0x75, 0x76, 0x82, 0x84, 0xaf,
+    0x8a, 0xb3, 0x93, 0x97, 0x86, 0x7c, 0x7e, 0x96, 0x7c, 0x6d, 0x90, 0x8e,
+    0x85, 0x88, 0x8a, 0x9f, 0x70, 0x89, 0x9f, 0x99, 0x95, 0x87, 0x91, 0x9d,
+    0x80, 0x74, 0x88, 0x7c, 0x7f, 0xa8, 0x93, 0x77, 0x66, 0xa6, 0x80, 0xa2,
+    0x88, 0xa0, 0xaf, 0x6f, 0x76, 0x70, 0x82, 0x9a, 0x73, 0x89, 0x9a, 0x75,
+    0x75, 0x8e, 0x5f, 0x85, 0x6a, 0x76, 0x98, 0x66, 0x87, 0xa3, 0x7a, 0x73,
+    0x9d, 0xa1, 0x98, 0x8e, 0x78, 0x91, 0x83, 0x8c, 0x82, 0x9e, 0x90, 0x87,
+    0x8f, 0x9b, 0x8b, 0x8f, 0x89, 0x62, 0x74, 0x82, 0x7b, 0x7f, 0x8a, 0x9d,
+    0x89, 0x93, 0x8c, 0x7a, 0x99, 0x77, 0xac, 0x75, 0x9b, 0x7f, 0x7f, 0x56,
+    0x8c, 0x96, 0x70, 0x79, 0xc2, 0x7d, 0x90, 0x64, 0xe9, 0x79, 0x68, 0xb2,
+    0xc2, 0xa6, 0xa7, 0x7e, 0xd9, 0x98, 0x79, 0x87, 0xc0, 0x97, 0x87, 0x66,
+    0xd0, 0x9f, 0x92, 0x82, 0xa4, 0xa8, 0x8d, 0x78, 0xa6, 0xa1, 0x76, 0x7d,
+    0xa4, 0x87, 0x89, 0x51, 0xae, 0x88, 0x5b, 0x76, 0x7d, 0x70, 0x74, 0x93,
+    0x89, 0x74, 0x9e, 0x7a, 0x79, 0x64, 0x9a, 0x94, 0x65, 0x93, 0xb0, 0x8d,
+    0x88, 0x7e, 0x8e, 0xa5, 0x63, 0x94, 0x94, 0x7d, 0x91, 0x87, 0x84, 0x95,
+    0x75, 0x9e, 0x81, 0x99, 0x65, 0x76, 0x82, 0x9c, 0x6a, 0xab, 0x84, 0x85,
+    0x88, 0x72, 0x92, 0x83, 0x82, 0xaf, 0x6d, 0x9d, 0x9e, 0x73, 0x98, 0x7f,
+    0x91, 0xb4, 0x62, 0x8d, 0x74, 0x6e, 0xb4, 0x94, 0x97, 0x9e, 0x6f, 0x9a,
+    0x83, 0x7b, 0xa9, 0x7d, 0x87, 0x97, 0x60, 0xa9, 0x7a, 0x75, 0xad, 0x6c,
+    0x77, 0xa4, 0x88, 0x82, 0x6f, 0x8a, 0x83, 0x74, 0x9a, 0xa7, 0x83, 0x91,
+    0x7c, 0x7c, 0x78, 0x77, 0x83, 0x92, 0x7a, 0x83, 0x90, 0x6f, 0x79, 0x6b,
+    0x9b, 0x8d, 0x99, 0x95, 0x7b, 0x89, 0x8e, 0x6c, 0x8e, 0x6c, 0x9b, 0x91,
+    0x97, 0x80, 0x83, 0x6f, 0xaa, 0x91, 0x66, 0x76, 0xc9, 0x77, 0x82, 0x4d,
+    0xd7, 0x5f, 0x58, 0x9a, 0xb1, 0x7a, 0xb1, 0x6b, 0xe5, 0x9d, 0x76, 0x89,
+    0xb6, 0x94, 0x90, 0x5b, 0xb8, 0x92, 0x7d, 0x90, 0xbd, 0x9a, 0x85, 0x4e,
+    0xb4, 0x84, 0x61, 0x82, 0x94, 0x8e, 0x70, 0x57, 0x90, 0x89, 0x6f, 0x60,
+    0x78, 0x90, 0x78, 0x85, 0x8e, 0x7c, 0x76, 0x74, 0x71, 0x5d, 0x94, 0x93,
+    0x71, 0x8f, 0xc2, 0x80, 0x75, 0x7d, 0x77, 0xa8, 0x70, 0x8f, 0xa6, 0x83,
+    0x74, 0x6b, 0x79, 0x97, 0x76, 0xa2, 0xad, 0x93, 0x5b, 0x8c, 0x7c, 0x7e,
+    0x82, 0x9b, 0xa0, 0x76, 0x71, 0x7a, 0xa3, 0x80, 0x87, 0x90, 0x92, 0xa6,
+    0x85, 0x71, 0x99, 0x91, 0x91, 0x8c, 0x99, 0x9b, 0x92, 0x74, 0xb2, 0x79,
+    0x9c, 0x7c, 0x7b, 0xa8, 0x8c, 0x6f, 0xb5, 0x69, 0x7a, 0x8a, 0x68, 0x9f,
+    0x82, 0x7d, 0xbd, 0x5f, 0xa1, 0x92, 0x83, 0x9f, 0x6f, 0xa1, 0x88, 0x61,
+    0x7b, 0x94, 0x89, 0x83, 0x6f, 0x6e, 0x92, 0x9d, 0x65, 0x7f, 0x97, 0x83,
+    0x87, 0x75, 0x92, 0x8a, 0x82, 0x82, 0x79, 0x92, 0x78, 0x89, 0x92, 0x7a,
+    0x91, 0x64, 0x8a, 0x93, 0x9d, 0x74, 0x78, 0x64, 0xab, 0x57, 0x7a, 0x84,
+    0xcf, 0x7d, 0x95, 0x4f, 0xde, 0x63, 0x78, 0x9a, 0xb7, 0x7a, 0x8b, 0x5b,
+    0xda, 0xa3, 0x94, 0x99, 0xbd, 0x88, 0xa4, 0x53, 0xad, 0x8b, 0x81, 0x96,
+    0xca, 0x8f, 0x76, 0x5e, 0xbd, 0x9d, 0x70, 0x81, 0x9b, 0x7d, 0x8a, 0x44,
+    0xa0, 0x77, 0x52, 0x6e, 0x82, 0x62, 0x6a, 0x6b, 0x9d, 0xaa, 0x81, 0x85,
+    0x7d, 0x5f, 0x7f, 0x9c, 0x65, 0x99, 0x97, 0x81, 0x7f, 0x65, 0x65, 0xa4,
+    0x84, 0x8c, 0xa1, 0x6d, 0x7a, 0x70, 0x79, 0x90, 0x98, 0xaa, 0x76, 0x95,
+    0x7f, 0x91, 0x95, 0x96, 0x6e, 0xa5, 0x95, 0xa2, 0x7d, 0x7e, 0x93, 0x87,
+    0x7d, 0x9b, 0x85, 0x9b, 0x85, 0x79, 0x96, 0x6b, 0x9d, 0x9d, 0x61, 0x99,
+    0x9c, 0x74, 0xcc, 0x7e, 0x9a, 0x83, 0x83, 0x98, 0x6f, 0x6d, 0xc5, 0x69,
+    0xb0, 0xa5, 0x5c, 0x91, 0x6c, 0x7b, 0xcc, 0x72, 0x9a, 0x9d, 0x7e, 0xa3,
+    0x8a, 0x96, 0x8e, 0x74, 0x7b, 0x80, 0x6b, 0x85, 0x84, 0x56, 0x92, 0x83,
+    0x64, 0x90, 0x86, 0x86, 0x88, 0x79, 0x8b, 0xa0, 0x86, 0x72, 0xab, 0x95,
+    0x80, 0x81, 0x96, 0x8f, 0x75, 0x7f, 0x71, 0x92, 0x9e, 0x75, 0x62, 0x5e,
+    0xc3, 0x7a, 0x6c, 0x84, 0xba, 0x81, 0x8f, 0x49, 0xc9, 0x76, 0x54, 0x89,
+    0xc2, 0x8c, 0xa2, 0x54, 0xd8, 0xa4, 0x72, 0x90, 0xb1, 0x91, 0xa0, 0x7a,
+    0xbf, 0x9a, 0x6f, 0x82, 0xbb, 0x81, 0x6a, 0x52, 0xc2, 0x82, 0x52, 0x65,
+    0x8d, 0x8a, 0x84, 0x46, 0xa2, 0x90, 0x45, 0x52, 0x82, 0x61, 0x8c, 0x77,
+    0x92, 0x6d, 0x87, 0x5b, 0x5e, 0x72, 0x76, 0x97, 0x73, 0x8d, 0x8d, 0x70,
+    0x7a, 0x66, 0x76, 0x89, 0x72, 0xbf, 0xb0, 0x84, 0x7d, 0x80, 0x71, 0x8f,
+    0x85, 0xa9, 0xa3, 0x7d, 0x7b, 0x84, 0x83, 0xa1, 0x97, 0xa7, 0xaf, 0x84,
+    0x86, 0x7d, 0x94, 0x78, 0x80, 0x98, 0x71, 0x84, 0x94, 0x73, 0xb0, 0x74,
+    0x99, 0xa2, 0x68, 0xa7, 0x8b, 0x86, 0xe0, 0x75, 0x9e, 0x93, 0x5c, 0xb2,
+    0xa2, 0x68, 0xb8, 0x61, 0x92, 0xa3, 0x68, 0xa4, 0x89, 0x59, 0xd0, 0x77,
+    0x97, 0xa9, 0x6a, 0x9b, 0x7d, 0x69, 0x9b, 0x79, 0x8c, 0x7c, 0x68, 0x8b,
+    0x7a, 0x53, 0x99, 0x9c, 0x7e, 0x8d, 0x89, 0x96, 0x9e, 0x83, 0x89, 0x74,
+    0x7f, 0x94, 0x92, 0x8f, 0x85, 0x8a, 0x8a, 0x80, 0x99, 0x87, 0x7a, 0x7d,
+    0xac, 0x93, 0x74, 0x68, 0xba, 0x87, 0x6a, 0x98, 0xc7, 0x79, 0x91, 0x54,
+    0xeb, 0x80, 0x45, 0x80, 0xc4, 0xb4, 0x94, 0x61, 0xd2, 0xa6, 0x7b, 0x95,
+    0xa4, 0xaa, 0x93, 0x7b, 0xb1, 0x74, 0x53, 0x7c, 0xaa, 0x91, 0x64, 0x51,
+    0xa9, 0x6e, 0x5e, 0x7c, 0x79, 0x82, 0x8b, 0x2e, 0x9d, 0x66, 0x61, 0x5e,
+    0x72, 0x7f, 0x6e, 0x6d, 0x8c, 0x79, 0x7d, 0x60, 0x76, 0x79, 0x68, 0x84,
+    0x4d, 0x8e, 0xa8, 0x8f, 0x78, 0x74, 0x69, 0xa4, 0x6e, 0xa9, 0xb9, 0x59,
+    0x83, 0x7f, 0x7a, 0x93, 0x90, 0x9b, 0x8d, 0x93, 0x78, 0x80, 0x77, 0x8b,
+    0x72, 0xa3, 0x97, 0x73, 0x91, 0x6c, 0x9a, 0x97, 0xa3, 0xad, 0x89, 0x96,
+    0x9e, 0x6d, 0xb5, 0x7c, 0xa4, 0x98, 0x61, 0x8a, 0x93, 0x5f, 0xdc, 0x63,
+    0xba, 0x92, 0x84, 0x94, 0xab, 0x6f, 0xbf, 0x66, 0x98, 0x93, 0x74, 0x85,
+    0x96, 0x63, 0xb8, 0x60, 0x94, 0xbb, 0x79, 0x94, 0x7b, 0x67, 0x8a, 0x64,
+    0x99, 0xac, 0x60, 0x98, 0xb0, 0x65, 0xa2, 0x73, 0x8f, 0x94, 0x8c, 0x92,
+    0x84, 0x84, 0x9b, 0x8f, 0x84, 0x8d, 0x9f, 0x90, 0x91, 0x85, 0x93, 0x74,
+    0x97, 0x66, 0x7f, 0x78, 0xa2, 0x95, 0x73, 0x6b, 0xc5, 0x6f, 0x62, 0x79,
+    0xbd, 0x81, 0x89, 0x4a, 0xbd, 0x93, 0x57, 0x81, 0xba, 0xb0, 0x9b, 0x4c,
+    0xe8, 0xa2, 0x85, 0xa2, 0x96, 0x92, 0x93, 0x62, 0xbe, 0x7a, 0x71, 0x8b,
+    0x8d, 0x97, 0x53, 0x56, 0xb1, 0x5f, 0x67, 0x60, 0x7a, 0x8e, 0x8a, 0x3a,
+    0x86, 0x67, 0x6d, 0x53, 0x6e, 0x91, 0x7b, 0x60, 0x99, 0x6d, 0x71, 0x5d,
+    0x67, 0x65, 0x63, 0x87, 0x71, 0x8a, 0x92, 0x6d, 0x8f, 0x6f, 0x6f, 0xae,
+    0x6c, 0xa2, 0x87, 0x6f, 0x99, 0x88, 0x78, 0x94, 0x8a, 0xb2, 0x93, 0x89,
+    0x90, 0x8d, 0x8c, 0x98, 0x81, 0x86, 0x90, 0x6d, 0xa2, 0x82, 0xa2, 0xa3,
+    0x9d, 0x8f, 0x7a, 0x9f, 0x87, 0x70, 0xbd, 0x8e, 0xa5, 0x99, 0x5d, 0x70,
+    0x8c, 0x60, 0xc7, 0x78, 0x97, 0xb0, 0x6f, 0x94, 0x92, 0x5a, 0xc3, 0x6e,
+    0x8b, 0x9f, 0x79, 0xa3, 0x8c, 0x5e, 0xbf, 0x79, 0x8e, 0x98, 0x76, 0x8e,
+    0x67, 0x31, 0x9b, 0x85, 0x8e, 0x85, 0x71, 0x99, 0x72, 0x77, 0x84, 0x81,
+    0x91, 0x95, 0x80, 0x98, 0x82, 0x6f, 0x90, 0xa0, 0x91, 0x91, 0x8e, 0x75,
+    0x8a, 0x89, 0x93, 0x69, 0x95, 0x7f, 0x9a, 0xa0, 0x9e, 0x9b, 0x88, 0x4e,
+    0xc3, 0x8d, 0x65, 0x74, 0xba, 0x8d, 0x97, 0x4d, 0xd6, 0x94, 0x73, 0xa0,
+    0xb1, 0xb3, 0x8c, 0x67, 0xdd, 0x9f, 0x7f, 0xaa, 0xaf, 0x9a, 0x88, 0x67,
+    0xc2, 0x8f, 0x71, 0x7b, 0x8f, 0x9f, 0x47, 0x52, 0x93, 0x72, 0x5a, 0x52,
+    0x97, 0x9d, 0x67, 0x3c, 0xa9, 0x59, 0x59, 0x5b, 0x88, 0x92, 0x82, 0x57,
+    0x83, 0x67, 0x94, 0x77, 0x52, 0x74, 0x60, 0x9e, 0x52, 0x84, 0xa2, 0x69,
+    0x71, 0x96, 0x73, 0xb0, 0x5e, 0xb0, 0x89, 0x71, 0x94, 0x8a, 0x66, 0xa0,
+    0x75, 0xc1, 0x99, 0x8e, 0x83, 0x8a, 0x91, 0x89, 0x6b, 0xa5, 0x79, 0x82,
+    0x8b, 0x73, 0x95, 0xb0, 0x77, 0x9b, 0x82, 0x7d, 0x8f, 0x60, 0xb9, 0x78,
+    0x8b, 0x8f, 0x7b, 0x74, 0x84, 0x6d, 0xbf, 0x76, 0x8f, 0xa3, 0x91, 0xa1,
+    0x81, 0x59, 0xcb, 0x69, 0xac, 0x90, 0x98, 0x92, 0xa7, 0x5d, 0xb4, 0x8b,
+    0xaa, 0xb1, 0x98, 0x8c, 0xa2, 0x4d, 0xa1, 0x69, 0x7f, 0xa0, 0x7d, 0x8a,
+    0x9b, 0x77, 0x8e, 0x71, 0x82, 0x8a, 0x78, 0x8d, 0x98, 0x78, 0x90, 0x91,
+    0x7e, 0x7f, 0x78, 0x85, 0x97, 0x8a, 0x97, 0x6d, 0xb3, 0x94, 0x89, 0xa3,
+    0xa5, 0x9a, 0x76, 0x6b, 0xbd, 0x79, 0x71, 0x95, 0xce, 0xab, 0x93, 0x1f,
+    0xe9, 0x97, 0x4c, 0x84, 0xd5, 0x9f, 0x98, 0x6e, 0xdd, 0x8d, 0x80, 0x9c,
+    0xa8, 0x9e, 0x8d, 0x75, 0xbc, 0x8c, 0x80, 0x89, 0xa1, 0x89, 0x74, 0x58,
+    0x92, 0x86, 0x55, 0x87, 0x91, 0x8d, 0x70, 0x33, 0xb8, 0x50, 0x63, 0x6b,
+    0x79, 0x99, 0x76, 0x71, 0x75, 0x59, 0x73, 0x6b, 0x62, 0x62, 0x74, 0x85,
+    0x73, 0xa3, 0xac, 0x78, 0x77, 0x88, 0x64, 0xa0, 0x73, 0xa1, 0xa8, 0x73,
+    0x91, 0x8e, 0x5f, 0x9a, 0x68, 0xc9, 0xa1, 0x92, 0x7a, 0x7c, 0x69, 0x77,
+    0x7d, 0x9e, 0x8f, 0x76, 0x88, 0x80, 0x92, 0x93, 0x91, 0x99, 0x8c, 0x85,
+    0x9f, 0x69, 0xa8, 0x9b, 0x9f, 0x9a, 0x64, 0x7a, 0x99, 0x70, 0xc4, 0x6d,
+    0x9a, 0x99, 0x82, 0xa0, 0x8b, 0x59, 0xc8, 0x61, 0x8f, 0x95, 0x72, 0x8c,
+    0x90, 0x63, 0xa9, 0x7e, 0x88, 0x8c, 0x85, 0x78, 0x76, 0x58, 0x8e, 0x72,
+    0xa3, 0x9a, 0x7c, 0xa0, 0x7f, 0x6d, 0xa6, 0x83, 0x7e, 0x8d, 0x83, 0x88,
+    0x86, 0x68, 0x8d, 0x96, 0xaa, 0x78, 0x90, 0xa5, 0x9c, 0x9d, 0x99, 0x88,
+    0xb0, 0x82, 0x6f, 0x7e, 0xad, 0xa9, 0x7b, 0x6a, 0xba, 0x6c, 0x6d, 0x89,
+    0xc1, 0x9e, 0x8e, 0x2f, 0xf2, 0x77, 0x50, 0x73, 0xdb, 0xc4, 0x9c, 0x6c,
+    0xd0, 0x90, 0x88, 0xbe, 0x97, 0xb9, 0x9e, 0x6e, 0xbe, 0x8e, 0x83, 0x8e,
+    0x96, 0x98, 0x4c, 0x4e, 0xa7, 0x8d, 0x43, 0x92, 0x8f, 0x92, 0x6d, 0x27,
+    0x94, 0x73, 0x5f, 0x42, 0x7c, 0xa7, 0x8a, 0x5a, 0x81, 0x60, 0x85, 0x66,
+    0x73, 0x72, 0x74, 0x9d, 0x5a, 0x9e, 0xa3, 0x71, 0x75, 0x91, 0x4f, 0xa2,
+    0x67, 0xa6, 0x91, 0x64, 0x92, 0x7e, 0x95, 0x8d, 0x6e, 0xbe, 0x9b, 0x57,
+    0x9b, 0x82, 0x89, 0x70, 0x6f, 0x9e, 0x7e, 0x86, 0x97, 0x81, 0x85, 0x8e,
+    0x70, 0x96, 0x6c, 0x72, 0xab, 0x6d, 0x9c, 0x91, 0xa0, 0x8a, 0x8d, 0x88,
+    0x9e, 0x75, 0xc6, 0x76, 0x7c, 0xa7, 0x6b, 0xa8, 0x94, 0x72, 0xb6, 0x78,
+    0x8d, 0x90, 0x7b, 0x8c, 0xa6, 0x65, 0xad, 0x9b, 0xaa, 0x94, 0x89, 0x7d,
+    0x90, 0x69, 0xaa, 0x7e, 0x9e, 0xad, 0x7f, 0x94, 0x81, 0x7d, 0xa1, 0x7b,
+    0x6c, 0x65, 0x83, 0x95, 0x89, 0x75, 0x93, 0x87, 0x94, 0x87, 0xa8, 0x92,
+    0x8d, 0xa6, 0x9f, 0x78, 0xaa, 0x72, 0x95, 0x94, 0xac, 0xa6, 0x91, 0x5a,
+    0xdb, 0x82, 0x55, 0xb6, 0xc1, 0xa3, 0x84, 0x4f, 0xc9, 0x88, 0x53, 0x8f,
+    0xbb, 0xae, 0x9b, 0x8a, 0xd8, 0xa9, 0x68, 0xc2, 0xa0, 0xa9, 0x87, 0x6b,
+    0xbd, 0x99, 0x7e, 0x86, 0x88, 0xa7, 0x5e, 0x53, 0xa4, 0x84, 0x6b, 0x6e,
+    0x89, 0x95, 0x84, 0x2d, 0xb5, 0x43, 0x3e, 0x50, 0x71, 0x96, 0x9a, 0x5b,
+    0xa1, 0x60, 0x80, 0x70, 0x6a, 0x73, 0x8f, 0x95, 0x52, 0x9b, 0xae, 0x71,
+    0x76, 0x7d, 0x61, 0x99, 0x5b, 0xc3, 0xa8, 0x76, 0x98, 0x72, 0x7f, 0x8a,
+    0x66, 0xc7, 0xa3, 0x7b, 0x8e, 0x8f, 0x70, 0x74, 0x6a, 0xae, 0x85, 0x83,
+    0x96, 0x7d, 0x98, 0xa7, 0x8f, 0x94, 0x7e, 0x84, 0x96, 0x7a, 0xab, 0x7d,
+    0x83, 0xb1, 0x6f, 0x7d, 0x9f, 0x80, 0xca, 0x8f, 0x9b, 0xa9, 0x69, 0x7a,
+    0x92, 0x73, 0xaa, 0x74, 0x88, 0x98, 0x87, 0x8f, 0xa7, 0x68, 0xa0, 0x74,
+    0x97, 0x95, 0x6e, 0x6f, 0x83, 0x53, 0x9b, 0x79, 0x71, 0x87, 0x7d, 0x8b,
+    0x79, 0x87, 0xa3, 0x75, 0x68, 0x73, 0x7e, 0x89, 0x8f, 0x81, 0x98, 0x7a,
+    0x9a, 0x83, 0x9d, 0x95, 0x90, 0x98, 0x97, 0x57, 0x93, 0x7e, 0xa2, 0x9a,
+    0xa8, 0x8a, 0x85, 0x53, 0xbd, 0x7a, 0x61, 0x8b, 0xca, 0xac, 0x9b, 0x2e,
+    0xe8, 0xa5, 0x66, 0x86, 0xca, 0xa7, 0xa0, 0x85, 0xcf, 0xa4, 0x6a, 0xc2,
+    0xb0, 0xaa, 0x76, 0x76, 0xb6, 0xa2, 0x72, 0xa9, 0xa1, 0xa1, 0x67, 0x67,
+    0xac, 0x90, 0x70, 0x6d, 0x8f, 0xb5, 0x6d, 0x3b, 0x85, 0x64, 0x4a, 0x6e,
+    0x72, 0x9f, 0x98, 0x5b, 0x97, 0x3e, 0x8a, 0x6a, 0x6c, 0x7d, 0x77, 0x98,
+    0x5a, 0x92, 0xa3, 0x81, 0x6f, 0x91, 0x7b, 0xa6, 0x6e, 0x9c, 0x9b, 0x5f,
+    0x9e, 0x7e, 0x77, 0x9d, 0x88, 0xc6, 0x81, 0x5a, 0x93, 0x8b, 0x6c, 0x71,
+    0x63, 0x9e, 0x78, 0x79, 0x70, 0x90, 0x95, 0x9f, 0x71, 0xa9, 0x90, 0x73,
+    0x98, 0x8a, 0xa5, 0x8e, 0x87, 0xb0, 0x79, 0x79, 0x92, 0x7d, 0xcc, 0xa8,
+    0x7a, 0x92, 0x82, 0x91, 0x90, 0x69, 0xa4, 0x9b, 0x97, 0x8f, 0x75, 0x7c,
+    0xa3, 0x69, 0xb5, 0x87, 0x8d, 0x88, 0x7b, 0x94, 0x8b, 0x55, 0xa2, 0x6d,
+    0x89, 0x8e, 0x81, 0x8a, 0x9e, 0x87, 0x86, 0x83, 0x8b, 0x84, 0x87, 0xa7,
+    0x8e, 0x79, 0xa4, 0x9c, 0x99, 0x82, 0xa3, 0x8f, 0x91, 0x9a, 0x95, 0x5b,
+    0x9f, 0x6e, 0x85, 0x93, 0xa6, 0x9a, 0x91, 0x4c, 0xd8, 0x6b, 0x6d, 0x85,
+    0xde, 0xaa, 0x97, 0x51, 0xcf, 0x8c, 0x5f, 0x9a, 0xc2, 0x9d, 0x9a, 0x7c,
+    0xc6, 0xb1, 0x84, 0xac, 0xba, 0xa5, 0x7c, 0x76, 0xbd, 0x93, 0x7f, 0xa0,
+    0x86, 0xae, 0x47, 0x41, 0x88, 0x82, 0x62, 0x62, 0x73, 0xad, 0x6b, 0x23,
+    0xa0, 0x48, 0x5a, 0x5a, 0x8f, 0x98, 0xbd, 0x5c, 0x9c, 0x72, 0x7c, 0x68,
+    0x50, 0x78, 0x91, 0xab, 0x5c, 0xc1, 0xc6, 0x66, 0x87, 0x86, 0x60, 0x99,
+    0x65, 0xac, 0x94, 0x91, 0x7e, 0x8c, 0x7d, 0x9b, 0x70, 0xb2, 0x9a, 0x7d,
+    0x82, 0x91, 0x6b, 0x86, 0x6f, 0xbb, 0x7f, 0x66, 0x7a, 0x79, 0x94, 0x96,
+    0x71, 0xa5, 0x75, 0x73, 0x95, 0x81, 0xa4, 0x8b, 0x87, 0xaa, 0x8e, 0x92,
+    0xa9, 0x82, 0xb0, 0x92, 0x89, 0xa7, 0x83, 0x81, 0x8c, 0x6d, 0xc4, 0x7a,
+    0x89, 0xa5, 0xa1, 0xa2, 0xa4, 0x6b, 0xa4, 0x82, 0x90, 0xb2, 0x8d, 0x72,
+    0x83, 0x60, 0xa7, 0x7a, 0x80, 0x97, 0x65, 0x90, 0x87, 0x85, 0xae, 0x71,
+    0x7d, 0x71, 0x98, 0xa8, 0x90, 0x75, 0xa9, 0x96, 0xa2, 0x91, 0x7b, 0x6b,
+    0xa0, 0x9d, 0x8d, 0x5d, 0xa4, 0x79, 0x8c, 0xa4, 0xad, 0x94, 0x7e, 0x77,
+    0xb6, 0x92, 0x74, 0xaf, 0xb5, 0x9b, 0x99, 0x67, 0xe7, 0x8e, 0x6a, 0x87,
+    0xc1, 0x98, 0x9b, 0x7e, 0xd7, 0x9b, 0x5b, 0xae, 0xc9, 0x94, 0x7a, 0x6d,
+    0x9e, 0xb4, 0x86, 0x8e, 0xa3, 0xa1, 0x5e, 0x5d, 0x8e, 0x8f, 0x6b, 0x59,
+    0xa5, 0xa9, 0x69, 0x20, 0xa4, 0x64, 0x35, 0x61, 0x83, 0x9d, 0x8a, 0x4e,
+    0x8b, 0x6c, 0x5e, 0x5b, 0x68, 0x76, 0x89, 0x94, 0x5f, 0x87, 0x98, 0x7a,
+    0x5d, 0x81, 0x89, 0xa6, 0x54, 0xa3, 0xb4, 0x7b, 0x83, 0x8a, 0x90, 0x8b,
+    0x86, 0xbc, 0x86, 0x59, 0x91, 0x79, 0x71, 0x6b, 0x7c, 0x94, 0x98, 0x7f,
+    0x81, 0x76, 0x85, 0xad, 0x69, 0xa8, 0x83, 0x8c, 0x8f, 0x70, 0x9a, 0x91,
+    0x78, 0xb3, 0x8f, 0x6d, 0x90, 0x86, 0xbd, 0x97, 0x7f, 0xaf, 0x7e, 0x90,
+    0x8f, 0x63, 0xa2, 0x93, 0x6e, 0xab, 0x75, 0x72, 0x8d, 0x74, 0xa1, 0x72,
+    0x82, 0xaa, 0x70, 0x82, 0x8d, 0x67, 0x94, 0x91, 0x92, 0xa5, 0x7f, 0xa5,
+    0x6f, 0x6d, 0xaf, 0x80, 0x89, 0x7d, 0x92, 0x99, 0x92, 0x72, 0x9d, 0x7d,
+    0x92, 0x78, 0xa9, 0x89, 0xa9, 0x9b, 0xa3, 0x73, 0x98, 0x71, 0x98, 0x86,
+    0x9e, 0x97, 0x9e, 0x6a, 0xb9, 0x6a, 0x6e, 0x90, 0xde, 0x94, 0x9a, 0x52,
+    0xdd, 0xa9, 0x6a, 0x79, 0xb9, 0xa3, 0xaa, 0x95, 0xba, 0xa2, 0x75, 0xc2,
+    0xbf, 0xb5, 0x6d, 0x8d, 0xae, 0x9b, 0x8d, 0x9a, 0x92, 0xb4, 0x5e, 0x4b,
+    0x8b, 0x99, 0x4f, 0x65, 0x94, 0xb6, 0x5d, 0x3a, 0xa3, 0x77, 0x51, 0x4e,
+    0x6d, 0xa3, 0x94, 0x59, 0x80, 0x56, 0x8c, 0x67, 0x67, 0x74, 0x99, 0x85,
+    0x57, 0x7b, 0x9e, 0x7e, 0x84, 0x85, 0x94, 0x96, 0x71, 0xbf, 0x97, 0x5f,
+    0x7d, 0x80, 0x93, 0x87, 0x6b, 0xb9, 0x7d, 0x8b, 0x84, 0x84, 0x6b, 0x8c,
+    0x6c, 0xc4, 0x85, 0x82, 0x87, 0x8d, 0x64, 0x90, 0x80, 0xb6, 0x9a, 0x70,
+    0x9c, 0x68, 0xa0, 0x88, 0x81, 0x9d, 0x83, 0x75, 0x9d, 0x84, 0xbf, 0x8f,
+    0x83, 0x9b, 0x75, 0x82, 0x9c, 0x76, 0xa4, 0x9d, 0x8a, 0xa7, 0x8e, 0x96,
+    0x9c, 0x64, 0xc0, 0x95, 0x88, 0xa5, 0x6f, 0x74, 0x7e, 0x5d, 0x9f, 0x7d,
+    0x89, 0x81, 0x71, 0xa8, 0x82, 0x6e, 0x9b, 0x9a, 0x6f, 0xa5, 0x88, 0x89,
+    0xa4, 0x7e, 0xa4, 0x90, 0xa1, 0x83, 0x8b, 0x9c, 0x9a, 0x89, 0xa2, 0x89,
+    0x9d, 0x5d, 0x86, 0xa5, 0xc4, 0x96, 0x9c, 0x85, 0xd6, 0x7c, 0x69, 0x88,
+    0xc9, 0xa5, 0x9b, 0x60, 0xea, 0xab, 0x62, 0x9f, 0xd1, 0xa5, 0x86, 0x7e,
+    0xb3, 0xbd, 0x7a, 0xa1, 0xbd, 0xa0, 0x7c, 0x92, 0xa6, 0xa3, 0x7d, 0xa9,
+    0x98, 0xa6, 0x71, 0x5c, 0x9b, 0x9b, 0x58, 0x6f, 0x8f, 0xaa, 0x5e, 0x3b,
+    0xa6, 0x5f, 0x3a, 0x79, 0x94, 0xa5, 0x84, 0x6f, 0x83, 0x5d, 0x75, 0x65,
+    0x6c, 0x77, 0x86, 0xad, 0x4a, 0x92, 0x8e, 0x8a, 0x8f, 0x7b, 0x72, 0x96,
+    0x79, 0xa6, 0xa8, 0x6d, 0x7b, 0x7b, 0x98, 0xa9, 0x79, 0xb9, 0x9e, 0x8f,
+    0x90, 0x6d, 0x76, 0x82, 0x81, 0xc1, 0x95, 0x7c, 0x97, 0x8d, 0x95, 0xa2,
+    0x7c, 0xa4, 0x7b, 0x9b, 0x7f, 0x6f, 0xac, 0x83, 0x7e, 0xa1, 0x7c, 0x7c,
+    0xa1, 0x7a, 0xa1, 0x6d, 0x95, 0x86, 0x77, 0x98, 0x8e, 0x58, 0xa2, 0x76,
+    0x8e, 0xa8, 0x94, 0x90, 0xa7, 0x62, 0xb8, 0x8a, 0x9f, 0xac, 0x87, 0x91,
+    0x88, 0x50, 0xa7, 0x83, 0x88, 0x65, 0x7a, 0x92, 0x9d, 0x70, 0xa9, 0x99,
+    0x7c, 0x87, 0x8c, 0x96, 0x8e, 0x73, 0xa4, 0xa7, 0x9b, 0x70, 0x99, 0x96,
+    0x8f, 0x88, 0xb4, 0x85, 0xa8, 0x6a, 0x9e, 0x78, 0xb0, 0x82, 0x9f, 0x89,
+    0xc9, 0x8d, 0x71, 0x7f, 0xc0, 0x98, 0xa0, 0x6d, 0xd2, 0x8e, 0x64, 0x9e,
+    0xb2, 0xa9, 0x93, 0x6e, 0xcc, 0xbb, 0x89, 0xb1, 0xc1, 0x9b, 0x86, 0x94,
+    0xb5, 0xb5, 0x95, 0xa0, 0x9c, 0x9b, 0x62, 0x5f, 0x7b, 0x91, 0x69, 0x74,
+    0x9e, 0xa3, 0x81, 0x30, 0x85, 0x59, 0x49, 0x5e, 0x83, 0x85, 0x7d, 0x6a,
+    0x90, 0x51, 0x80, 0x5e, 0x64, 0x6f, 0x99, 0x93, 0x75, 0x9a, 0xa7, 0x72,
+    0x6c, 0x5d, 0xa3, 0x93, 0x87, 0xa7, 0xbd, 0x6f, 0x92, 0x6d, 0x85, 0x98,
+    0x6f, 0xc7, 0xb6, 0x7c, 0x80, 0x71, 0x8a, 0x9f, 0x71, 0xb5, 0x8c, 0x6d,
+    0xac, 0x7b, 0x72, 0xb7, 0x69, 0xa6, 0x9d, 0x66, 0xab, 0x7a, 0x8b, 0x70,
+    0x8c, 0x9e, 0x86, 0x75, 0x96, 0x7b, 0xa3, 0x93, 0x8f, 0xb7, 0x84, 0x8c,
+    0x87, 0x56, 0xae, 0x82, 0x71, 0xa3, 0x8d, 0x93, 0xaf, 0x59, 0xb3, 0x8a,
+    0x97, 0x99, 0x75, 0x73, 0x8e, 0x51, 0xae, 0x84, 0x8b, 0x7a, 0x76, 0x77,
+    0x6e, 0x75, 0xa4, 0x8a, 0x75, 0x8e, 0x8f, 0xa2, 0x96, 0x76, 0x9a, 0x80,
+    0x96, 0x7d, 0x94, 0x71, 0x8a, 0x90, 0xac, 0x82, 0xa5, 0x61, 0xa3, 0x84,
+    0xac, 0x8f, 0x74, 0x5c, 0xb6, 0x77, 0x8b, 0x9b, 0xb5, 0x8b, 0xb6, 0x52,
+    0xd7, 0xaa, 0x4b, 0x8c, 0xbf, 0xb8, 0x9f, 0x6d, 0xcb, 0xa3, 0x6e, 0x97,
+    0xaa, 0x8d, 0x7c, 0x99, 0xc0, 0xd0, 0x9e, 0xb7, 0x93, 0xaa, 0x5a, 0x6a,
+    0x7d, 0x9a, 0x63, 0x71, 0x78, 0x8c, 0x67, 0x43, 0x87, 0x52, 0x64, 0x68,
+    0x68, 0x9c, 0x65, 0x60, 0x7a, 0x35, 0x68, 0x66, 0x63, 0x69, 0x8d, 0x8f,
+    0x72, 0x9b, 0x99, 0x5b, 0x80, 0x67, 0x93, 0xa2, 0x97, 0x9d, 0x8c, 0x68,
+    0x80, 0x86, 0x96, 0x91, 0x64, 0xbf, 0x98, 0x63, 0x83, 0x85, 0x61, 0x97,
+    0x6a, 0xac, 0xb4, 0x99, 0x8d, 0x7b, 0x7b, 0xad, 0x8b, 0xb2, 0x9e, 0x7f,
+    0x9a, 0x73, 0x91, 0x84, 0x89, 0x9f, 0x8a, 0x87, 0x8b, 0x72, 0x8e, 0x79,
+    0x86, 0xa7, 0x77, 0x84, 0x90, 0x58, 0xb2, 0x90, 0x93, 0xa0, 0x7f, 0x8a,
+    0x91, 0x5a, 0xb1, 0x80, 0x99, 0xc1, 0x80, 0x7d, 0x97, 0x5c, 0x9a, 0x8c,
+    0x71, 0x96, 0x7e, 0x7f, 0xad, 0x7b, 0xb9, 0x8a, 0x84, 0x84, 0x81, 0x97,
+    0x94, 0x64, 0x9f, 0x7e, 0x9b, 0x8d, 0x7d, 0x8d, 0x9a, 0x9e, 0xac, 0x72,
+    0xb2, 0x73, 0x81, 0x84, 0xc8, 0x81, 0x88, 0x72, 0xbe, 0x85, 0x86, 0x97,
+    0xd3, 0x8a, 0xc7, 0x75, 0xce, 0x9c, 0x69, 0xa6, 0xb0, 0xa1, 0x8e, 0x64,
+    0xb1, 0xa6, 0x67, 0xaa, 0xcd, 0x95, 0x97, 0xa2, 0xb2, 0xb2, 0x85, 0x9a,
+    0x9d, 0xa3, 0x5e, 0x73, 0x6e, 0xae, 0x50, 0x83, 0x8c, 0xab, 0x92, 0x43,
+    0x6b, 0x66, 0x43, 0x5c, 0x8f, 0x8a, 0x9a, 0x6c, 0x84, 0x48, 0x80, 0x6b,
+    0x8d, 0x82, 0xaf, 0x89, 0x71, 0x9f, 0xa4, 0x9a, 0x7b, 0x68, 0x91, 0xaa,
+    0x6b, 0xa3, 0x9c, 0x62, 0x8d, 0x6d, 0x87, 0x87, 0x81, 0x9a, 0x97, 0x6c,
+    0x9c, 0x76, 0x63, 0xbc, 0x62, 0xbc, 0xb0, 0x97, 0xa7, 0x81, 0x70, 0x8f,
+    0x7d, 0xb2, 0xa6, 0x98, 0xa1, 0x7b, 0x8e, 0x83, 0x8c, 0xa2, 0x7e, 0x73,
+    0x99, 0x65, 0xc1, 0x77, 0x8e, 0xbc, 0x72, 0xa6, 0x8c, 0x55, 0xab, 0x8e,
+    0x7d, 0xa3, 0x79, 0x80, 0x9e, 0x6b, 0xa9, 0x6c, 0x80, 0xb6, 0x81, 0xa6,
+    0x92, 0x5b, 0xb7, 0x99, 0x81, 0x7e, 0x8e, 0x89, 0x97, 0x86, 0x93, 0x86,
+    0x7b, 0x9a, 0x7f, 0x9a, 0x8e, 0x69, 0xa3, 0xa4, 0x9f, 0x8b, 0x96, 0x6f,
+    0x8b, 0x97, 0xb4, 0x74, 0x96, 0x53, 0x99, 0x91, 0xa7, 0xa8, 0x69, 0x72,
+    0xc9, 0x85, 0x99, 0x93, 0xc0, 0x90, 0xaa, 0x7f, 0xc7, 0x71, 0x74, 0x8d,
+    0xb7, 0xab, 0x91, 0x69, 0xb4, 0x9b, 0x7d, 0x95, 0xc3, 0xb0, 0x9b, 0xa9,
+    0xb3, 0x9f, 0x79, 0xa5, 0x9f, 0xad, 0x6b, 0x85, 0x90, 0xad, 0x69, 0x62,
+    0x7e, 0xa6, 0x69, 0x4e, 0x80, 0x7e, 0x52, 0x57, 0x5f, 0x95, 0x72, 0x4c,
+    0x87, 0x4e, 0x5a, 0x62, 0x7d, 0x70, 0x92, 0x98, 0x76, 0x8e, 0x99, 0x7d,
+    0x73, 0x6d, 0x86, 0x8e, 0x6b, 0x80, 0xa7, 0x9d, 0x91, 0x73, 0x95, 0x70,
+    0x80, 0xc3, 0x9f, 0x8b, 0x72, 0x86, 0x6b, 0xad, 0x76, 0xbe, 0xad, 0x8e,
+    0x9c, 0x78, 0x6a, 0xbf, 0x7d, 0xa8, 0x88, 0x8a, 0x8b, 0x8c, 0x9c, 0x8c,
+    0x8a, 0x85, 0x73, 0x92, 0xa2, 0x7b, 0xa5, 0x96, 0x9b, 0xa3, 0x6c, 0x80,
+    0xa6, 0x63, 0xac, 0x98, 0xa3, 0x9a, 0x83, 0x8a, 0x8c, 0x63, 0xb9, 0x8c,
+    0x99, 0xa1, 0x7a, 0x6c, 0x9e, 0x59, 0x90, 0x84, 0x8a, 0x93, 0x8f, 0x87,
+    0x98, 0x84, 0x99, 0xa4, 0x72, 0x6d, 0x95, 0xa2, 0x95, 0x72, 0xc3, 0x88,
+    0x8f, 0x6a, 0x77, 0x7d, 0x8b, 0xae, 0xa3, 0x7c, 0xa8, 0x5d, 0x7c, 0xa8,
+    0xa1, 0x85, 0x7e, 0x8c, 0xac, 0x8d, 0x73, 0x88, 0xc1, 0x89, 0xaa, 0x89,
+    0xb2, 0x92, 0x75, 0x9a, 0x9c, 0x8e, 0xb9, 0xaa, 0xaa, 0xac, 0x78, 0x85,
+    0xbc, 0x9f, 0x6d, 0xb7, 0x89, 0xa6, 0xb3, 0x8e, 0xa5, 0xbb, 0x6b, 0x9d,
+    0x8f, 0x8b, 0x69, 0x7a, 0x82, 0x99, 0x8c, 0x49, 0x87, 0x74, 0x37, 0x63,
+    0x5d, 0x92, 0x77, 0x66, 0x63, 0x56, 0x77, 0x5d, 0x7f, 0x68, 0x97, 0x74,
+    0x84, 0x94, 0x7d, 0x7d, 0x91, 0x78, 0x87, 0x96, 0x7f, 0x97, 0x94, 0x6f,
+    0x89, 0x6c, 0x96, 0x71, 0x83, 0x8f, 0x8a, 0x89, 0x7d, 0x84, 0x8a, 0xa6,
+    0x7b, 0x95, 0x89, 0x77, 0x94, 0x80, 0x7f, 0x93, 0x5e, 0xbb, 0x9c, 0xa8,
+    0xa2, 0x7e, 0xa6, 0x86, 0x7d, 0x8b, 0x92, 0x73, 0xac, 0x78, 0xaa, 0x98,
+    0xb1, 0x94, 0x79, 0x8b, 0x8f, 0x70, 0xa7, 0xae, 0x92, 0xad, 0xb1, 0x8b,
+    0xb0, 0x78, 0xbc, 0xa9, 0xa4, 0xa3, 0x9e, 0x76, 0x89, 0x67, 0xab, 0x98,
+    0x75, 0x8c, 0x86, 0x95, 0x9e, 0x77, 0x96, 0x85, 0x8c, 0x8e, 0x8b, 0x8a,
+    0x8a, 0x4b, 0x71, 0x8a, 0x9b, 0x6d, 0x6e, 0x89, 0x81, 0x82, 0xa7, 0x98,
+    0xa5, 0x66, 0x72, 0x8b, 0x99, 0x9a, 0x8b, 0x8b, 0x9f, 0x87, 0x79, 0x84,
+    0x99, 0x6d, 0x90, 0x7d, 0x9d, 0xa7, 0x81, 0xa3, 0x9d, 0x96, 0x82, 0x86,
+    0xa2, 0x8e, 0x8d, 0x7f, 0x84, 0x8c, 0x98, 0xbc, 0x83, 0xb4, 0xb5, 0x78,
+    0x7d, 0xab, 0x8d, 0x87, 0x71, 0x8d, 0x6e, 0x8f, 0x89, 0xaa, 0x7c, 0x6f,
+    0x71, 0x69, 0x65, 0x60, 0x81, 0x91, 0x94, 0x6d, 0x76, 0x66, 0x74, 0x5e,
+    0x77, 0x7c, 0xa2, 0xa6, 0x70, 0x90, 0xa3, 0x68, 0x83, 0x69, 0x71, 0x72,
+    0x6c, 0xa9, 0x85, 0x71, 0x88, 0x60, 0x90, 0x84, 0x8a, 0xba, 0x8b, 0x8c,
+    0x72, 0x8f, 0x98, 0x84, 0x8b, 0x8a, 0xb1, 0xa2, 0x93, 0x8d, 0x86, 0x99,
+    0xa2, 0x99, 0xb0, 0xa6, 0x92, 0x78, 0x86, 0x87, 0x9c, 0x9d, 0x6f, 0x92,
+    0x9a, 0x8a, 0xbf, 0xaa, 0xa3, 0xa2, 0x71, 0x8d, 0x93, 0x70, 0xb5, 0x9c,
+    0xa8, 0x97, 0xb4, 0x93, 0xa6, 0x75, 0xbb, 0xa3, 0x92, 0x95, 0x95, 0x94,
+    0x90, 0x5b, 0xbf, 0x92, 0x8a, 0x95, 0xa0, 0xa1, 0x68, 0x7e, 0x9a, 0x7f,
+    0x88, 0xa7, 0x93, 0xa1, 0x7a, 0x93, 0x95, 0x8b, 0x96, 0x94, 0x70, 0xa0,
+    0x70, 0x8f, 0x9d, 0x96, 0x8e, 0x9c, 0x90, 0x9f, 0x7e, 0x83, 0x84, 0x9e,
+    0x7f, 0x65, 0x72, 0x84, 0x64, 0x94, 0x75, 0xa7, 0x62, 0xa3, 0x8a, 0x9b,
+    0x82, 0x99, 0x87, 0x70, 0x81, 0x6d, 0xac, 0x7b, 0x74, 0x68, 0x5d, 0x95,
+    0xa0, 0x6e, 0x84, 0xab, 0x79, 0x8e, 0x8b, 0x79, 0x7b, 0x83, 0xa0, 0x7b,
+    0x96, 0x71, 0x5d, 0xad, 0xa4, 0x82, 0x79, 0x96, 0x73, 0x84, 0x7d, 0x98,
+    0x87, 0x93, 0x86, 0xa6, 0x7f, 0x7c, 0x71, 0x9d, 0xa4, 0x9b, 0x8a, 0x7c,
+    0x87, 0x6a, 0x7f, 0x8d, 0x97, 0x92, 0xa0, 0x88, 0x77, 0x7d, 0x70, 0x9c,
+    0x9f, 0xa0, 0x71, 0xa3, 0x73, 0x95, 0x76, 0x79, 0x94, 0x95, 0x83, 0x8b,
+    0x8d, 0x82, 0x7a, 0x77, 0xa6, 0x88, 0x72, 0x7a, 0x90, 0x76, 0x7f, 0x95,
+    0x83, 0x90, 0x9e, 0x7c, 0x8e, 0x9a, 0x6b, 0xa4, 0x98, 0x9f, 0x86, 0x8c,
+    0x76, 0x70, 0x74, 0x97, 0x7e, 0xa4, 0x5f, 0xa3, 0xa7, 0x7f, 0x67, 0x8d,
+    0x82, 0x95, 0x93, 0x99, 0x82, 0x70, 0x75, 0xa8, 0xa1, 0xaf, 0x8a, 0x8a,
+    0xb0, 0x89, 0x88, 0x6b, 0x98, 0xaf, 0x75, 0x7f, 0x86, 0x90, 0x8f, 0x8c,
+    0x84, 0x8d, 0x7f, 0x8b, 0x94, 0x9f, 0x80, 0x8b, 0x93, 0xa2, 0x98, 0xa5,
+    0x83, 0x81, 0x8a, 0xaa, 0x86, 0xa3, 0xb0, 0xac, 0x64, 0x9c, 0x7c, 0x93,
+    0xac, 0x85, 0x7f, 0x88, 0x7a, 0xa5, 0x75, 0x69, 0x94, 0xa8, 0x95, 0xa9,
+    0x6f, 0x9f, 0x85, 0x8a, 0xa5, 0x97, 0x98, 0xa9, 0x76, 0x80, 0x7e, 0x95,
+    0x89, 0xaf, 0x68, 0x7b, 0xb4, 0x8a, 0x6b, 0xa4, 0x7b, 0x90, 0x79, 0xba,
+    0x9f, 0x82, 0x7d, 0x89, 0x85, 0x82, 0x94, 0xa5, 0x78, 0x8f, 0x6f, 0x71,
+    0x62, 0x66, 0x73, 0x98, 0x8c, 0x7d, 0x81, 0xa2, 0x69, 0x7c, 0x76, 0xa4,
+    0x94, 0x8f, 0x6f, 0x8a, 0x94, 0x8e, 0x8a, 0x88, 0x8c, 0xa3, 0x6f, 0xa2,
+    0x7d, 0x90, 0x8f, 0x96, 0x6c, 0x76, 0x6e, 0x8e, 0x82, 0x85, 0x7f, 0x93,
+    0x81, 0x83, 0x7b, 0x9f, 0x91, 0x89, 0x75, 0x9c, 0x9f, 0x86, 0x7a, 0x8c,
+    0x7a, 0x7b, 0x82, 0xae, 0x6a, 0x7d, 0x82, 0x82, 0xa0, 0x85, 0x99, 0x9f,
+    0x88, 0x8b, 0x8c, 0x8f, 0x90, 0x96, 0x8e, 0x98, 0xa3, 0x87, 0x7f, 0x9b,
+    0x94, 0x73, 0x96, 0x86, 0x72, 0x7c, 0x75, 0x7c, 0x90, 0x79, 0x83, 0x80,
+    0x79, 0x9e, 0x9c, 0x8e, 0x99, 0x8c, 0x7a, 0x9c, 0x8d, 0x99, 0x9d, 0x84,
+    0xa5, 0x93, 0x85, 0x96, 0x88, 0x94, 0x80, 0x90, 0x73, 0xa3, 0x7c, 0xa1,
+    0x88, 0xa4, 0x98, 0x9f, 0x9e, 0x92, 0x6c, 0xa0, 0x84, 0x87, 0x8a, 0x83,
+    0x7b, 0x91, 0x8c, 0x9e, 0x73, 0xa6, 0x93, 0xa0, 0x8d, 0x98, 0x74, 0xa1,
+    0x83, 0x9a, 0x80, 0xbc, 0x62, 0x70, 0x9e, 0xad, 0x9e, 0x8f, 0x8f, 0x9e,
+    0x7e, 0xac, 0xb0, 0xa9, 0x79, 0x6f, 0x79, 0x8f, 0x7e, 0x71, 0x8d, 0xab,
+    0x97, 0x76, 0x86, 0xa2, 0x98, 0x95, 0x8b, 0x9b, 0x75, 0x7a, 0x71, 0x85,
+    0x7f, 0x61, 0x76, 0x8e, 0x99, 0x91, 0x88, 0x73, 0x71, 0x65, 0x82, 0xa0,
+    0x9b, 0x8f, 0x79, 0x70, 0x78, 0x66, 0x85, 0x94, 0x8b, 0x91, 0x75, 0x80,
+    0x9c, 0x94, 0x7f, 0xa5, 0x82, 0x91, 0x7d, 0x76, 0x80, 0x78, 0x83, 0x82,
+    0x79, 0x98, 0x83, 0x87, 0x94, 0x71, 0x73, 0x77, 0x71, 0x94, 0x6a, 0xa8,
+    0x9e, 0x8d, 0x90, 0x78, 0x7a, 0x81, 0x9c, 0x91, 0x96, 0x80, 0x79, 0x83,
+    0x92, 0x9f, 0x8a, 0x84, 0x8e, 0x97, 0x8c, 0x81, 0x87, 0x74, 0x8b, 0x8e,
+    0xa7, 0x86, 0x8b, 0x8a, 0x8e, 0x8f, 0x9b, 0x6b, 0x82, 0x8a, 0x9f, 0x7a,
+    0x96, 0x80, 0x91, 0x94, 0xa6, 0x8e, 0x7a, 0x97, 0x8a, 0x6c, 0xad, 0xa1,
+    0x78, 0x95, 0x9d, 0x9d, 0x88, 0x94, 0x99, 0x86, 0x80, 0x9b, 0x7c, 0x9c,
+    0x87, 0x7a, 0xa0, 0xa8, 0x83, 0x74, 0x8e, 0x9b, 0x65, 0x95, 0x83, 0xc2,
+    0x69, 0x88, 0x87, 0xa7, 0x86, 0x98, 0x9f, 0xc6, 0x5c, 0x7f, 0xb9, 0x9c,
+    0x8b, 0x6e, 0x95, 0xbd, 0x72, 0x83, 0xbf, 0xb1, 0x89, 0x6d, 0x89, 0x8e,
+    0x9d, 0x87, 0x95, 0x92, 0x76, 0x8d, 0x7f, 0x7f, 0x6d, 0x9d, 0x7b, 0x95,
+    0x86, 0x69, 0x90, 0xa0, 0x62, 0x7c, 0x56, 0xa0, 0x9c, 0x8b, 0x81, 0x79,
+    0xa6, 0x73, 0x69, 0xaa, 0x7b, 0x87, 0x8b, 0x7e, 0xa1, 0x9f, 0x6d, 0xa6,
+    0x7e, 0x7e, 0x87, 0x7c, 0xa5, 0x84, 0x7b, 0xa2, 0xae, 0x92, 0x8e, 0x67,
+    0x93, 0x88, 0x8b, 0xa2, 0x8d, 0x96, 0x92, 0x8e, 0x71, 0x7a, 0x82, 0x80,
+    0x9e, 0x8b, 0x7b, 0x87, 0x96, 0xa0, 0xa4, 0x92, 0x88, 0x7e, 0x77, 0x8e,
+    0x91, 0x7e, 0x81, 0x77, 0x79, 0x93, 0x8d, 0x9d, 0x8a, 0x71, 0x8d, 0x88,
+    0x9d, 0x89, 0x85, 0x94, 0x99, 0x80, 0x89, 0x8f, 0x87, 0x81, 0x83, 0x74,
+    0x8a, 0x89, 0x68, 0x7e, 0x99, 0x82, 0x8c, 0x76, 0xc6, 0x8f, 0x90, 0x7d,
+    0x6c, 0x68, 0xbd, 0x90, 0x78, 0x9d, 0x7b, 0xa3, 0x99, 0x76, 0xaf, 0x8d,
+    0x7d, 0x84, 0x7f, 0x9f, 0x8b, 0x7a, 0xaa, 0xa8, 0x79, 0x89, 0x8f, 0x8f,
+    0x71, 0x80, 0x7f, 0xaa, 0x85, 0x70, 0xa8, 0x96, 0x6c, 0x8c, 0xaf, 0xeb,
+    0x57, 0x7e, 0xcf, 0x8d, 0x93, 0x72, 0xa6, 0xd2, 0x52, 0xab, 0xbb, 0xa8,
+    0x8d, 0x82, 0x7a, 0xbc, 0x72, 0x95, 0xa3, 0xa7, 0x8b, 0x74, 0x84, 0x85,
+    0x6a, 0x85, 0x92, 0x9f, 0x91, 0x6b, 0x9b, 0x73, 0x77, 0xa2, 0x7f, 0x81,
+    0x8e, 0x8b, 0x71, 0x8c, 0x7f, 0x60, 0x86, 0x81, 0x9c, 0x86, 0x93, 0x65,
+    0x84, 0x84, 0x89, 0xa2, 0x98, 0x67, 0x88, 0x71, 0x92, 0x80, 0x65, 0xa2,
+    0xa5, 0x99, 0x85, 0x95, 0x8f, 0x85, 0x8f, 0x82, 0x7e, 0x9a, 0x8a, 0x74,
+    0x9d, 0x75, 0x88, 0x7e, 0xa2, 0x77, 0x82, 0x9e, 0x78, 0xa1, 0x74, 0x79,
+    0x7f, 0x87, 0x91, 0x8d, 0x7a, 0x73, 0x96, 0xa2, 0xa3, 0x81, 0x7d, 0x8a,
+    0x85, 0x75, 0x84, 0x81, 0x8b, 0x7f, 0x6c, 0x86, 0x8d, 0x7b, 0x79, 0x78,
+    0x89, 0x85, 0x8c, 0x9a, 0xa6, 0x96, 0x7a, 0x78, 0xa2, 0x85, 0x9b, 0x89,
+    0xc8, 0x97, 0xa3, 0x82, 0x8b, 0x7f, 0xe7, 0x8f, 0x8f, 0x74, 0x75, 0x83,
+    0x87, 0x79, 0xb3, 0xab, 0x70, 0x9a, 0x9a, 0xa6, 0x81, 0x7e, 0xb8, 0x91,
+    0x8b, 0x8d, 0x93, 0xa1, 0x79, 0x7d, 0x81, 0xb4, 0x79, 0x94, 0xa5, 0x89,
+    0x8e, 0x7c, 0x9b, 0xe2, 0x50, 0x94, 0xdf, 0xa0, 0x53, 0x5d, 0x90, 0xde,
+    0x67, 0x90, 0xaf, 0x8a, 0x8f, 0x73, 0x7b, 0xcb, 0x64, 0x9f, 0x91, 0x86,
+    0x95, 0x84, 0x83, 0x88, 0x76, 0x8b, 0x8a, 0x8f, 0x9c, 0x9a, 0x92, 0x96,
+    0x7f, 0x8e, 0x79, 0x80, 0x91, 0x6d, 0x86, 0x59, 0x74, 0x8a, 0x53, 0x88,
+    0xae, 0x7b, 0x80, 0x70, 0x87, 0x74, 0x75, 0x91, 0xa4, 0x74, 0x8d, 0x5a,
+    0x83, 0x95, 0x65, 0xa1, 0xb3, 0x74, 0x87, 0x7d, 0xaa, 0x82, 0x79, 0x78,
+    0x9b, 0x7c, 0x78, 0x74, 0x9e, 0x74, 0x92, 0x92, 0xa3, 0x6e, 0x75, 0x92,
+    0x6a, 0x6f, 0xa3, 0x7c, 0x9e, 0x7f, 0x92, 0x6b, 0x96, 0x79, 0x9a, 0x87,
+    0x83, 0x8c, 0x72, 0x79, 0x6a, 0xa3, 0x79, 0x7d, 0x6d, 0x6c, 0x81, 0x96,
+    0x98, 0x7f, 0x94, 0x81, 0x8a, 0x8a, 0xa7, 0x8c, 0x9a, 0x84, 0xa7, 0x89,
+    0x9d, 0x85, 0xa6, 0xa8, 0xd0, 0x92, 0x97, 0x9f, 0x76, 0x86, 0xe6, 0x6f,
+    0x7c, 0x84, 0x98, 0x8d, 0x80, 0x75, 0xc5, 0x86, 0x6b, 0x8d, 0x9e, 0x9e,
+    0x7f, 0x71, 0x97, 0xa1, 0x75, 0x92, 0xa9, 0x9e, 0x91, 0x5e, 0xa2, 0xa2,
+    0x68, 0xad, 0xa5, 0xa0, 0x7e, 0x68, 0xac, 0xdc, 0x50, 0xa2, 0xc1, 0x8a,
+    0x63, 0x74, 0x7e, 0xd9, 0x3f, 0xbb, 0xba, 0x9d, 0x7f, 0x76, 0x5f, 0xb0,
+    0x74, 0x8e, 0xb1, 0x95, 0x9a, 0x81, 0x63, 0x9f, 0x98, 0x74, 0x80, 0x89,
+    0x95, 0x8e, 0x9e, 0x78, 0x87, 0x82, 0x57, 0x87, 0x8d, 0x90, 0x79, 0x80,
+    0x76, 0x7c, 0x7d, 0x8a, 0xa6, 0x82, 0x98, 0x7a, 0x96, 0x97, 0x84, 0x87,
+    0xab, 0x7f, 0x87, 0x57, 0x83, 0x6a, 0x6a, 0x84, 0x9c, 0x8d, 0x74, 0x68,
+    0xa2, 0x92, 0x90, 0x98, 0x98, 0x8b, 0x6d, 0x72, 0x90, 0x8c, 0x7c, 0x7d,
+    0x9b, 0x6e, 0x71, 0x76, 0x6b, 0x7b, 0x63, 0x81, 0xad, 0x71, 0x78, 0x8e,
+    0x74, 0x87, 0x8e, 0x8a, 0xab, 0x8e, 0x83, 0x85, 0x7d, 0xa0, 0x67, 0x7f,
+    0x9c, 0x74, 0x6b, 0x88, 0x66, 0x92, 0x7f, 0x83, 0x94, 0x92, 0xa5, 0x82,
+    0xa1, 0x7b, 0x6f, 0x70, 0xab, 0x72, 0xb5, 0x91, 0xb7, 0x89, 0x91, 0x77,
+    0x77, 0x8a, 0xdb, 0x88, 0x8a, 0x8d, 0x89, 0x6c, 0x7b, 0x83, 0xc8, 0xb5,
+    0x4b, 0x96, 0x8b, 0x92, 0x91, 0x76, 0xa9, 0xae, 0x70, 0xa8, 0x74, 0x9d,
+    0x96, 0x6d, 0xa1, 0xba, 0x86, 0xbc, 0xbc, 0xa2, 0x8d, 0x6c, 0x96, 0xd8,
+    0x71, 0xb1, 0xae, 0xb0, 0x79, 0x7b, 0x71, 0xd8, 0x32, 0xaa, 0xae, 0xa7,
+    0x7c, 0x6b, 0x77, 0xc0, 0x7c, 0x9e, 0x9f, 0x89, 0x92, 0x8a, 0x76, 0xae,
+    0x97, 0x75, 0x87, 0x8c, 0x7f, 0x86, 0x8b, 0x73, 0x6b, 0x64, 0x87, 0x6d,
+    0x99, 0x8f, 0x8d, 0x66, 0x76, 0x87, 0x6d, 0x6e, 0x98, 0x7a, 0x91, 0x92,
+    0x8c, 0x7c, 0x89, 0x9b, 0x9e, 0x83, 0x86, 0x62, 0x90, 0x6e, 0x62, 0x82,
+    0xa3, 0x7e, 0x86, 0x6a, 0x93, 0x9b, 0x73, 0x6c, 0xa8, 0x99, 0x73, 0x99,
+    0x8c, 0x89, 0x85, 0x67, 0x98, 0x78, 0x63, 0x98, 0x77, 0xa6, 0x6e, 0x81,
+    0xa4, 0x64, 0x8f, 0x8a, 0x7f, 0x9b, 0x91, 0x91, 0x94, 0x82, 0x8b, 0x8b,
+    0x76, 0x66, 0x83, 0x81, 0x94, 0x71, 0x82, 0x9e, 0x93, 0x85, 0x80, 0x8c,
+    0xae, 0x94, 0x96, 0x74, 0x91, 0x9a, 0x6f, 0x9e, 0xa9, 0x76, 0xab, 0x8e,
+    0xd6, 0x9c, 0x7d, 0x98, 0x83, 0x6e, 0xfe, 0x83, 0x71, 0x82, 0x9f, 0x93,
+    0x7b, 0x67, 0xcb, 0xb9, 0x66, 0x89, 0x99, 0x8a, 0xac, 0x8c, 0xa0, 0x9c,
+    0x70, 0xaf, 0x81, 0x88, 0x9c, 0x7e, 0xa8, 0xa5, 0x65, 0x8c, 0xa1, 0x8c,
+    0x83, 0x85, 0x9d, 0xcb, 0x4b, 0xc1, 0xb5, 0xa2, 0x75, 0x63, 0x75, 0xbd,
+    0x34, 0xae, 0xca, 0xa2, 0x89, 0x7a, 0x69, 0xb0, 0x70, 0xae, 0x94, 0x76,
+    0x85, 0x93, 0x6a, 0x90, 0x6a, 0x8a, 0xac, 0x71, 0x7e, 0x81, 0xa2, 0x71,
+    0x98, 0x86, 0x99, 0x76, 0x8f, 0x6f, 0x90, 0x93, 0x7c, 0x72, 0x81, 0x8c,
+    0x78, 0x77, 0x97, 0x84, 0x98, 0x70, 0x96, 0x9a, 0x9b, 0x93, 0x92, 0x5f,
+    0xaa, 0x88, 0x5b, 0x74, 0xaa, 0x96, 0x6a, 0x73, 0x87, 0x83, 0x72, 0x89,
+    0xab, 0x8a, 0x5f, 0x71, 0xa4, 0x94, 0x92, 0x60, 0x96, 0x7b, 0x53, 0x88,
+    0x69, 0x8b, 0x5e, 0x7b, 0xa0, 0x83, 0x70, 0x95, 0x6d, 0x9b, 0x6d, 0x98,
+    0x99, 0x86, 0x6e, 0x7a, 0x87, 0x86, 0x68, 0x8a, 0x7e, 0x87, 0x90, 0x7d,
+    0x76, 0x93, 0x80, 0x8a, 0x8f, 0x97, 0xac, 0x71, 0xa2, 0x96, 0x7f, 0x8e,
+    0xc2, 0x71, 0xab, 0xa9, 0xd1, 0x85, 0x8c, 0x74, 0x70, 0x72, 0xff, 0x77,
+    0x6d, 0x77, 0x91, 0x5d, 0x71, 0x5d, 0xb2, 0xb1, 0x38, 0x76, 0xa6, 0x80,
+    0x91, 0x86, 0xa3, 0x9c, 0x85, 0x95, 0x99, 0xab, 0x8a, 0x6e, 0x9f, 0xa6,
+    0x75, 0xa9, 0xb3, 0x97, 0x69, 0x85, 0xa4, 0xc9, 0x59, 0xb4, 0xca, 0x8d,
+    0x5c, 0x67, 0x7d, 0xcd, 0x29, 0xca, 0xdb, 0x8c, 0x86, 0x8c, 0x70, 0xaa,
+    0x5c, 0x9e, 0x98, 0x86, 0x92, 0x7e, 0x6b, 0x8e, 0x8f, 0x6a, 0x84, 0x71,
+    0x9a, 0x76, 0x87, 0x84, 0x8b, 0x7f, 0x7f, 0x6e, 0xa3, 0x83, 0x85, 0x78,
+    0x6f, 0x7c, 0x6f, 0x96, 0x95, 0x8c, 0xa3, 0x72, 0x92, 0x66, 0x7b, 0x99,
+    0x9c, 0x9c, 0x9a, 0x63, 0xaa, 0x81, 0x7f, 0x90, 0x8c, 0xa0, 0x7e, 0x67,
+    0x94, 0x96, 0x7f, 0x8a, 0x95, 0x91, 0x5c, 0x73, 0x88, 0x9b, 0x85, 0x70,
+    0x87, 0x79, 0x56, 0x92, 0x69, 0x95, 0x62, 0x78, 0x93, 0x83, 0x63, 0x98,
+    0x7a, 0xa4, 0x95, 0x7c, 0x8e, 0x69, 0x86, 0x92, 0x7d, 0x6b, 0x69, 0x85,
+    0xa8, 0x90, 0x7c, 0x7b, 0x9e, 0x87, 0x7b, 0x90, 0x98, 0x7a, 0xa4, 0x92,
+    0xad, 0x97, 0xa0, 0x6d, 0xa6, 0x74, 0xb7, 0x7f, 0xb9, 0x94, 0x6c, 0x77,
+    0x65, 0x6f, 0xfc, 0x7d, 0x68, 0x74, 0xa1, 0x6c, 0x71, 0x61, 0xc3, 0xb5,
+    0x60, 0x86, 0x8b, 0x7d, 0x89, 0x8b, 0x93, 0xa4, 0x68, 0xa0, 0x8f, 0x73,
+    0x96, 0x6e, 0x81, 0x99, 0x81, 0x9d, 0xae, 0x93, 0x6a, 0x8b, 0x9a, 0xcb,
+    0x68, 0xaf, 0xca, 0x81, 0x73, 0x6e, 0x70, 0xd7, 0x49, 0xb9, 0xc5, 0x9d,
+    0x87, 0x8d, 0x61, 0xa8, 0x5e, 0xa4, 0xb7, 0xab, 0x96, 0x84, 0x76, 0x98,
+    0x84, 0x99, 0x8f, 0x70, 0x79, 0x94, 0xa5, 0x87, 0x6e, 0x73, 0x63, 0x7e,
+    0x83, 0x8c, 0x88, 0x71, 0x7a, 0x81, 0x7d, 0x94, 0x92, 0x89, 0xab, 0x7a,
+    0x96, 0x66, 0x7b, 0x8b, 0x8f, 0x8e, 0x94, 0x5b, 0xa0, 0x7f, 0x82, 0x84,
+    0x84, 0x80, 0x7d, 0x81, 0x89, 0x7b, 0x97, 0x78, 0x83, 0x93, 0x4c, 0x95,
+    0x7f, 0x93, 0x8e, 0x70, 0x89, 0x81, 0x69, 0x87, 0x76, 0x73, 0x9a, 0x74,
+    0xa2, 0x88, 0x5e, 0xac, 0x74, 0x8e, 0x74, 0x8e, 0x94, 0x85, 0x7b, 0x7a,
+    0x72, 0x82, 0x68, 0x77, 0x96, 0x8a, 0x7b, 0x6c, 0x88, 0x8b, 0x6b, 0x86,
+    0xa4, 0x88, 0xac, 0xa1, 0x90, 0x8e, 0x85, 0x6d, 0xb1, 0x69, 0xb1, 0xa2,
+    0xbe, 0x9a, 0x7c, 0xb4, 0x63, 0x56, 0xf2, 0x90, 0x5e, 0x71, 0xa3, 0x6a,
+    0x8b, 0x67, 0xbe, 0xa8, 0x6e, 0x8b, 0x90, 0x83, 0xa0, 0x78, 0x9f, 0xa5,
+    0x65, 0xa3, 0x8b, 0x94, 0x84, 0x6c, 0xa5, 0x97, 0x7d, 0xa7, 0x9f, 0x9c,
+    0x62, 0x7d, 0xb5, 0xb1, 0x58, 0x98, 0xba, 0x8d, 0x7f, 0x57, 0x86, 0xc5,
+    0x39, 0xb3, 0xc9, 0xa9, 0x89, 0x8e, 0x55, 0xaf, 0x54, 0xb4, 0xb0, 0x8f,
+    0x8b, 0x7c, 0x6e, 0x8e, 0x96, 0x90, 0x8a, 0x83, 0x84, 0x8c, 0x96, 0x7f,
+    0x89, 0x67, 0x99, 0x60, 0x74, 0x8d, 0x9b, 0x82, 0x6f, 0x61, 0x84, 0x9a,
+    0x7c, 0x85, 0x86, 0x7c, 0x9b, 0x5f, 0x81, 0x96, 0x90, 0x9b, 0xa0, 0x58,
+    0xaf, 0x78, 0x81, 0x8f, 0x96, 0x81, 0x77, 0x7d, 0xa2, 0x85, 0x74, 0x84,
+    0x99, 0x8d, 0x5f, 0x77, 0x8a, 0x8c, 0x85, 0x78, 0x8f, 0x80, 0x5c, 0x6f,
+    0x77, 0x73, 0x80, 0x99, 0x83, 0x89, 0x6f, 0x8e, 0x85, 0x7e, 0x6c, 0x81,
+    0x99, 0x89, 0x69, 0x70, 0x8c, 0x8f, 0x6b, 0x89, 0x80, 0x7a, 0x83, 0x7a,
+    0x96, 0x99, 0x73, 0x76, 0x9c, 0x67, 0xab, 0xab, 0xbd, 0x8b, 0x85, 0x90,
+    0xb0, 0x6b, 0xbd, 0x9c, 0xb9, 0xa0, 0x7c, 0x7d, 0x66, 0x78, 0xdb, 0x97,
+    0x55, 0x67, 0x96, 0x69, 0x80, 0x49, 0xc1, 0xbb, 0x6c, 0x91, 0x8a, 0x92,
+    0x9a, 0x98, 0xa5, 0x98, 0x51, 0xa6, 0x99, 0x8e, 0x73, 0x73, 0x9d, 0x9f,
+    0x77, 0xa6, 0xa4, 0x92, 0x64, 0x75, 0xac, 0xb2, 0x5d, 0xa1, 0xab, 0xa4,
+    0x5a, 0x5b, 0xb3, 0xb7, 0x2d, 0xca, 0xc8, 0x76, 0x94, 0x8e, 0x59, 0xb0,
+    0x52, 0x9d, 0xbd, 0x89, 0x97, 0x84, 0x5d, 0x9a, 0x87, 0x9b, 0x94, 0x6c,
+    0x7b, 0xaa, 0x8a, 0x8b, 0x79, 0x5d, 0x90, 0x5c, 0x8b, 0x7b, 0xbe, 0x68,
+    0x84, 0x6f, 0x75, 0x72, 0x98, 0x82, 0x92, 0x7a, 0xa2, 0x6e, 0x7b, 0x7d,
+    0x9c, 0x99, 0x97, 0x5d, 0x9b, 0x69, 0x80, 0xa3, 0x96, 0x8d, 0x7c, 0x82,
+    0xa3, 0x76, 0x95, 0x67, 0x93, 0x8e, 0x62, 0x7b, 0x78, 0x96, 0x69, 0x67,
+    0x84, 0x8f, 0x62, 0x80, 0x88, 0x7e, 0x6c, 0x94, 0xab, 0x8b, 0x82, 0x9e,
+    0x7e, 0x8c, 0x70, 0x83, 0x9c, 0x9c, 0x80, 0x87, 0x8f, 0xa1, 0x7f, 0x81,
+    0x95, 0x83, 0x6d, 0x7a, 0xa0, 0x77, 0x6d, 0x76, 0x91, 0x7e, 0xa3, 0x62,
+    0xa0, 0x93, 0x7e, 0x97, 0xb6, 0x6c, 0xad, 0x72, 0xb2, 0x95, 0x73, 0x83,
+    0x62, 0x56, 0xe2, 0x99, 0x6e, 0x66, 0xb0, 0x6c, 0x75, 0x4e, 0xb2, 0xc7,
+    0x51, 0x98, 0x90, 0x8c, 0x82, 0x63, 0xa8, 0x99, 0x54, 0xc1, 0x87, 0x80,
+    0x79, 0x62, 0xad, 0x81, 0x76, 0x99, 0xa9, 0x9b, 0x4e, 0x8c, 0xaf, 0xb6,
+    0x5d, 0x9b, 0xb4, 0x9f, 0x6d, 0x60, 0xa5, 0xb5, 0x3e, 0xb2, 0xc4, 0x96,
+    0x86, 0x6d, 0x48, 0x99, 0x50, 0xc1, 0xa8, 0x93, 0x8a, 0x92, 0x7d, 0x8f,
+    0x74, 0x87, 0x91, 0x71, 0x8c, 0x87, 0x90, 0x80, 0x80, 0x82, 0x7b, 0x85,
+    0x81, 0x7f, 0xa7, 0x6a, 0x78, 0x4e, 0x90, 0x85, 0x9f, 0x93, 0x91, 0x91,
+    0xa5, 0x6e, 0x9d, 0xa7, 0x9e, 0x7f, 0x9a, 0x66, 0xbe, 0x6f, 0x82, 0x81,
+    0x85, 0x86, 0x89, 0x6c, 0x88, 0x92, 0x6d, 0x6a, 0x8c, 0x95, 0x68, 0x70,
+    0x91, 0x9b, 0x76, 0x59, 0x87, 0x93, 0x6f, 0x79, 0x7a, 0x99, 0x7d, 0x76,
+    0xa3, 0x9c, 0x69, 0x75, 0x8f, 0x8e, 0x7e, 0x7a, 0x80, 0x8b, 0x76, 0x82,
+    0x70, 0x71, 0x77, 0x7a, 0x88, 0xa1, 0x79, 0x75, 0x9e, 0x7e, 0x6d, 0x6f,
+    0xa5, 0x84, 0xb1, 0x77, 0xad, 0x94, 0x98, 0x90, 0xa7, 0x5c, 0xb6, 0x84,
+    0x99, 0x91, 0x71, 0x7b, 0x6d, 0x54, 0xd2, 0x84, 0x5d, 0x75, 0xb4, 0x7e,
+    0x7d, 0x53, 0xc5, 0x98, 0x70, 0xaa, 0x9e, 0x81, 0x7d, 0x68, 0xa7, 0x8d,
+    0x63, 0xab, 0x9b, 0x96, 0x7e, 0x6b, 0xa3, 0x9e, 0x6d, 0x98, 0xaf, 0x9b,
+    0x78, 0x74, 0xae, 0xc7, 0x70, 0x98, 0xd4, 0x9a, 0x6e, 0x75, 0xa2, 0xcd,
+    0x42, 0xb0, 0xc9, 0x89, 0x88, 0x77, 0x6a, 0xa4, 0x66, 0xb5, 0xbc, 0x8a,
+    0x96, 0x87, 0x5e, 0xa5, 0x87, 0x95, 0x91, 0x5d, 0x85, 0x91, 0xaa, 0x8f,
+    0x99, 0x78, 0x79, 0x74, 0x7f, 0x81, 0xa1, 0x74, 0x77, 0x64, 0x6c, 0x94,
+    0xa0, 0x8b, 0x9b, 0x8e, 0xac, 0x6a, 0x98, 0x9c, 0x7a, 0x9f, 0xab, 0x7e,
+    0xa3, 0x8b, 0x68, 0x7f, 0x84, 0x9f, 0x93, 0x77, 0x90, 0x98, 0x8f, 0x87,
+    0x81, 0x8e, 0x76, 0x95, 0x66, 0x78, 0x85, 0x79, 0x95, 0x89, 0x64, 0x8e,
+    0x8a, 0x87, 0x6f, 0x65, 0xa4, 0x98, 0x7a, 0x83, 0x85, 0x7e, 0x6b, 0xaa,
+    0x81, 0x94, 0x7c, 0x6e, 0x78, 0x85, 0x87, 0x6d, 0x7a, 0x92, 0x67, 0x7a,
+    0x8d, 0x95, 0x77, 0x7f, 0x9f, 0x71, 0xb1, 0xa1, 0xb2, 0x91, 0x7f, 0xb0,
+    0xac, 0x5c, 0xaf, 0x6a, 0xae, 0x98, 0x63, 0x7e, 0x67, 0x6f, 0xc4, 0x8a,
+    0x75, 0x61, 0xac, 0x73, 0x86, 0x54, 0xc3, 0xa8, 0x5d, 0xa9, 0xb4, 0x9b,
+    0x80, 0x6d, 0xa1, 0x8d, 0x64, 0xaa, 0x86, 0x96, 0x86, 0x6c, 0x9b, 0x8b,
+    0x73, 0x9f, 0x9a, 0x87, 0x64, 0x6c, 0xad, 0xa6, 0x64, 0x8a, 0xbe, 0x88,
+    0x67, 0x67, 0xaf, 0xb0, 0x71, 0xae, 0xde, 0x95, 0x9f, 0x7c, 0x7d, 0xa1,
+    0x79, 0xb8, 0xaa, 0x9c, 0x84, 0x91, 0x6b, 0xac, 0x74, 0xa1, 0xad, 0x74,
+    0x88, 0x93, 0x94, 0x72, 0x97, 0x7a, 0x78, 0x86, 0x76, 0x93, 0xb1, 0x6f,
+    0x91, 0x44, 0x96, 0x8e, 0x8e, 0xa5, 0x9a, 0x70, 0x99, 0x79, 0x84, 0x82,
+    0x7f, 0x78, 0xac, 0x6f, 0x9c, 0x80, 0x7d, 0x87, 0x7f, 0x9d, 0x6a, 0x71,
+    0x7c, 0x92, 0x78, 0x7a, 0x93, 0x90, 0x55, 0x83, 0x7a, 0x8a, 0x9a, 0x65,
+    0x86, 0x9b, 0x7c, 0x6b, 0xa3, 0x85, 0x86, 0x71, 0xab, 0x9a, 0x86, 0x90,
+    0x86, 0x88, 0x88, 0x88, 0x99, 0x98, 0x77, 0x86, 0x88, 0x90, 0x79, 0x7c,
+    0x6e, 0x9f, 0x76, 0x70, 0x84, 0x67, 0x7e, 0x8b, 0xa5, 0x68, 0xa7, 0x9d,
+    0xb5, 0x9b, 0x8b, 0x8a, 0xc0, 0x60, 0x9e, 0x83, 0xb0, 0xb7, 0x65, 0x7f,
+    0x7a, 0x7e, 0xc3, 0x7b, 0x74, 0x8f, 0xa4, 0x68, 0x5f, 0x47, 0xbb, 0xa4,
+    0x74, 0x95, 0xab, 0x80, 0x70, 0x5c, 0x9a, 0x8a, 0x7d, 0xa5, 0x90, 0x7d,
+    0x86, 0x68, 0xb1, 0x73, 0x6d, 0xad, 0x93, 0x8d, 0x7b, 0x64, 0xbd, 0xae,
+    0x7a, 0x98, 0xcb, 0x97, 0x83, 0x67, 0xab, 0xb0, 0x61, 0xa7, 0xcd, 0x7e,
+    0x87, 0x78, 0x76, 0x95, 0x6a, 0xba, 0xa9, 0x84, 0x8f, 0x95, 0x7c, 0x8b,
+    0x90, 0x89, 0x8b, 0x81, 0x87, 0x8b, 0x76, 0x73, 0x6f, 0x61, 0x94, 0x73,
+    0x83, 0x97, 0xb3, 0x6b, 0x9c, 0x55, 0x7f, 0x96, 0x9a, 0x92, 0x85, 0x52,
+    0xc6, 0x73, 0x88, 0x9c, 0x7c, 0x86, 0x98, 0x6d, 0x99, 0x87, 0x80, 0x7c,
+    0x7d, 0x98, 0x74, 0x7c, 0x89, 0x8a, 0x7d, 0x7b, 0x83, 0x90, 0x7d, 0x81,
+    0x7a, 0xa0, 0x86, 0x5f, 0x74, 0x8e, 0x68, 0x7b, 0x6c, 0x86, 0x90, 0x84,
+    0x7e, 0xae, 0x73, 0x6f, 0x8d, 0x81, 0x7c, 0x93, 0xa0, 0xb3, 0x6b, 0x9a,
+    0x88, 0xab, 0x8a, 0x94, 0x9c, 0x87, 0x9c, 0x75, 0x7d, 0x8f, 0x7c, 0x7f,
+    0x9b, 0x69, 0xa8, 0x99, 0x9d, 0x89, 0x8f, 0x72, 0xba, 0x61, 0xac, 0x91,
+    0xb5, 0xa7, 0x84, 0x99, 0x71, 0x7e, 0xd0, 0x7c, 0x6d, 0x66, 0xb6, 0x72,
+    0x79, 0x61, 0xb6, 0xab, 0x69, 0xa0, 0xaa, 0x7d, 0x74, 0x61, 0x95, 0xa5,
+    0x71, 0xb0, 0x93, 0x95, 0x86, 0x7d, 0x9f, 0x7e, 0x6c, 0x97, 0x85, 0x87,
+    0x72, 0x7b, 0xb4, 0xad, 0x84, 0x7b, 0xcd, 0xa9, 0x7e, 0x6d, 0xc8, 0xc7,
+    0x7e, 0xb7, 0xcf, 0x98, 0x7b, 0x7c, 0x69, 0xaf, 0x64, 0xa6, 0xc1, 0x8e,
+    0x8f, 0x9c, 0x7d, 0x93, 0x7a, 0x96, 0x8a, 0x65, 0x92, 0x95, 0x8d, 0x6f,
+    0x9f, 0x7f, 0x65, 0x69, 0x7a, 0x92, 0x9f, 0x5c, 0x90, 0x4e, 0x69, 0x89,
+    0x8f, 0x9c, 0xa8, 0x7a, 0xb6, 0x7d, 0x84, 0x97, 0x7f, 0x91, 0x8d, 0x71,
+    0xae, 0x86, 0x80, 0x78, 0x81, 0x87, 0x6e, 0x88, 0x87, 0x7f, 0x8f, 0x9d,
+    0x78, 0x91, 0x74, 0x91, 0x7f, 0x7a, 0x80, 0x63, 0x93, 0xa0, 0x7f, 0x6f,
+    0xa3, 0x88, 0x76, 0x5c, 0x6e, 0xa1, 0x6e, 0x7f, 0x84, 0x8b, 0x87, 0x6d,
+    0x87, 0x9f, 0x79, 0x7c, 0x83, 0x89, 0x7e, 0x86, 0xa0, 0x82, 0x80, 0x8e,
+    0x8b, 0x6c, 0x6e, 0x69, 0x9f, 0x79, 0xaa, 0x6e, 0xa2, 0x8f, 0x9d, 0x87,
+    0xb4, 0x5d, 0xba, 0x6c, 0xaf, 0xa0, 0x84, 0x87, 0x8c, 0x89, 0xcb, 0x6f,
+    0x8e, 0x71, 0xae, 0x5d, 0x6c, 0x61, 0xb3, 0xaf, 0x7a, 0x94, 0xb1, 0x8a,
+    0x80, 0x65, 0x8a, 0x9d, 0x61, 0xb6, 0x8b, 0x97, 0x8a, 0x73, 0xa8, 0x82,
+    0x74, 0x8a, 0x9c, 0x73, 0x61, 0x69, 0xb8, 0x9f, 0x76, 0x90, 0xc5, 0xaa,
+    0x6b, 0x5f, 0xb7, 0xce, 0x6d, 0xb7, 0xcc, 0x97, 0x7a, 0x81, 0x95, 0xbe,
+    0x78, 0xb1, 0xb4, 0x97, 0x8e, 0x99, 0x70, 0xa2, 0x72, 0x8d, 0x8e, 0x7d,
+    0x90, 0x9f, 0x7b, 0x63, 0x87, 0x89, 0x7a, 0x5f, 0x81, 0x97, 0x8d, 0x78,
+    0x94, 0x64, 0x95, 0x9d, 0x90, 0x87, 0xb3, 0x6e, 0xc2, 0x80, 0x94, 0x86,
+    0x87, 0x93, 0xb3, 0x57, 0xb8, 0x73, 0x8a, 0x81, 0x6f, 0x95, 0x89, 0x82,
+    0x94, 0x7a, 0x8e, 0x97, 0x8a, 0x91, 0x7f, 0x77, 0x98, 0x72, 0x67, 0x5f,
+    0x7b, 0x8d, 0x78, 0x74, 0x91, 0x82, 0x86, 0x5c, 0x88, 0xa3, 0x73, 0x6f,
+    0x92, 0x78, 0x9c, 0x95, 0x99, 0x9d, 0x70, 0x89, 0x8f, 0xa7, 0x74, 0x89,
+    0x77, 0x90, 0x72, 0x8d, 0x9c, 0x6f, 0x7a, 0x6c, 0x9f, 0x72, 0xad, 0x6c,
+    0xa5, 0x7a, 0x9d, 0x78, 0xa4, 0x52, 0xbd, 0x94, 0xb5, 0x97, 0x75, 0x78,
+    0x86, 0x72, 0xdf, 0x6f, 0x98, 0x81, 0xab, 0x5d, 0x62, 0x65, 0x9d, 0xbc,
+    0x68, 0x8a, 0xc1, 0x7e, 0x67, 0x7f, 0x88, 0x95, 0x7f, 0xbd, 0x9c, 0x77,
+    0x7d, 0x7e, 0x96, 0x7c, 0x7f, 0xa1, 0xa4, 0x90, 0x7c, 0x74, 0xc0, 0xac,
+    0x7d, 0xa1, 0xdb, 0x85, 0x85, 0x51, 0xbc, 0xb1, 0x6c, 0xcb, 0xd1, 0xa7,
+    0x76, 0x70, 0x7d, 0xba, 0x88, 0xb6, 0xaf, 0xa2, 0x9d, 0x9b, 0x71, 0x96,
+    0x80, 0x89, 0xa3, 0x86, 0x89, 0x8f, 0x76, 0x77, 0xa9, 0x82, 0x8f, 0x69,
+    0x7f, 0x9d, 0xac, 0x80, 0x98, 0x6c, 0x70, 0x72, 0x81, 0x8b, 0xaf, 0x80,
+    0xb1, 0x6f, 0x7c, 0x90, 0x91, 0x82, 0xa5, 0x67, 0x9c, 0x76, 0x8c, 0x6b,
+    0x9c, 0x9b, 0x87, 0x8c, 0x8e, 0x8b, 0xb0, 0x9d, 0x89, 0x8f, 0x76, 0x87,
+    0x9b, 0x90, 0x8e, 0x74, 0x73, 0x91, 0x85, 0x80, 0x81, 0x72, 0x99, 0x84,
+    0x87, 0x95, 0x84, 0x8c, 0x8a, 0x6e, 0x8c, 0x82, 0xad, 0x9d, 0x80, 0x7f,
+    0x96, 0x9c, 0x7f, 0x67, 0xb0, 0x98, 0x69, 0x84, 0x94, 0xa9, 0x7e, 0x83,
+    0x9d, 0x62, 0x92, 0x6e, 0x95, 0x88, 0xa4, 0x90, 0x97, 0x4d, 0xae, 0x89,
+    0xb6, 0xa1, 0x88, 0x9f, 0x7a, 0x70, 0xc2, 0x71, 0x7f, 0x83, 0x90, 0x83,
+    0x5e, 0x50, 0xa9, 0x9f, 0x73, 0x8c, 0xb2, 0x80, 0x79, 0x65, 0x7c, 0x90,
+    0x6d, 0x9a, 0x91, 0x8d, 0x6f, 0x65, 0x97, 0x87, 0x82, 0xa0, 0xa4, 0x8c,
+    0x68, 0x76, 0xa8, 0xa2, 0x7f, 0xa4, 0xcd, 0x91, 0x70, 0x54, 0x95, 0xc6,
+    0x6e, 0x9c, 0xe2, 0xa1, 0x86, 0x82, 0x73, 0xbc, 0x89, 0xaa, 0xb2, 0x7d,
+    0x82, 0x84, 0x8b, 0x9e, 0x84, 0x94, 0xa0, 0x7a, 0x98, 0x9d, 0x99, 0x7b,
+    0x7b, 0x89, 0x8f, 0x66, 0x89, 0x9b, 0xa7, 0x8b, 0x9b, 0x62, 0x9b, 0x78,
+    0x8b, 0x95, 0xbd, 0x7a, 0x9e, 0x61, 0x80, 0x84, 0x89, 0x8e, 0xb4, 0x7b,
+    0xb8, 0x70, 0x75, 0x8e, 0x7b, 0x9c, 0x9e, 0x9f, 0x89, 0x86, 0x9b, 0x7a,
+    0x7b, 0x95, 0x83, 0x95, 0x80, 0x94, 0x85, 0x65, 0x8c, 0x81, 0x67, 0x77,
+    0x94, 0x8a, 0x92, 0x74, 0x72, 0x90, 0x6b, 0x74, 0x7e, 0x75, 0x71, 0x84,
+    0x9e, 0xa6, 0x64, 0x80, 0x8d, 0x7a, 0x8c, 0x82, 0x98, 0x96, 0x64, 0x7d,
+    0x8b, 0x82, 0x6a, 0x7f, 0x97, 0x4e, 0x91, 0x74, 0x94, 0x99, 0x6d, 0x6a,
+    0xb3, 0x5a, 0xb8, 0x64, 0xa3, 0x95, 0x5d, 0x95, 0x90, 0x87, 0xcc, 0x72,
+    0x85, 0x85, 0x8f, 0x55, 0x6f, 0x65, 0x84, 0xb6, 0x7b, 0x77, 0xce, 0x79,
+    0x82, 0x59, 0x8a, 0xa2, 0x68, 0x9b, 0xa3, 0x81, 0x9c, 0x7a, 0x97, 0x87,
+    0x6b, 0x8c, 0x9c, 0xaa, 0x5c, 0x69, 0xb8, 0xb7, 0x7c, 0xa0, 0xb5, 0x92,
+    0x8d, 0x67, 0x96, 0xd2, 0x77, 0xa6, 0xd9, 0xad, 0xaa, 0x79, 0x90, 0xc9,
+    0x81, 0xbf, 0xd0, 0x8d, 0x9d, 0x88, 0x9c, 0x91, 0x90, 0x94, 0x89, 0x8a,
+    0x91, 0x9b, 0x89, 0x79, 0x92, 0x80, 0x8f, 0x7b, 0x7e, 0x8b, 0xb1, 0x85,
+    0xa4, 0x5a, 0xb4, 0x7a, 0xa7, 0x8c, 0xa4, 0x75, 0xb9, 0x66, 0x93, 0x86,
+    0x8a, 0x87, 0xad, 0x64, 0xa2, 0x7e, 0x99, 0x9f, 0x81, 0xa2, 0x9b, 0x88,
+    0x9e, 0xa2, 0xb9, 0x8a, 0x78, 0x84, 0x91, 0x8e, 0x8b, 0x90, 0x83, 0x80,
+    0x64, 0x93, 0x77, 0x89, 0x81, 0x86, 0x96, 0x7a, 0x81, 0xab, 0x6d, 0x73,
+    0x7d, 0x7e, 0xaa, 0x85, 0x95, 0xac, 0x8b, 0x89, 0x8b, 0x77, 0xa3, 0x8b,
+    0xa3, 0xa0, 0x87, 0x86, 0x7a, 0x74, 0x6f, 0x7c, 0x90, 0x58, 0xa2, 0x64,
+    0x94, 0x8b, 0xa0, 0x88, 0xab, 0x53, 0xce, 0x67, 0xb7, 0x7f, 0x8d, 0x69,
+    0x84, 0x74, 0xaf, 0x72, 0xab, 0x70, 0x8f, 0x6e, 0x5d, 0x61, 0x96, 0xa1,
+    0x7b, 0x6f, 0xa2, 0x75, 0x8f, 0x5d, 0x93, 0x72, 0x82, 0x97, 0x76, 0x65,
+    0x7e, 0x96, 0xb3, 0x8b, 0x8d, 0x89, 0x8f, 0x7b, 0x6f, 0x71, 0xa1, 0x9e,
+    0x91, 0x7c, 0xc9, 0x9f, 0x7c, 0x71, 0xa1, 0xba, 0x77, 0xa5, 0xd4, 0xa6,
+    0xa0, 0x82, 0x7b, 0x95, 0x9d, 0xb7, 0xaa, 0x8d, 0x71, 0x87, 0x94, 0x7e,
+    0x88, 0x7f, 0x8b, 0x6e, 0x93, 0x9f, 0x82, 0x88, 0x94, 0x8a, 0x97, 0x7f,
+    0x7d, 0x8c, 0xa0, 0x84, 0xb4, 0x7c, 0x8c, 0x7f, 0x71, 0x8c, 0x8e, 0x7f,
+    0xc6, 0x64, 0x81, 0x8d, 0x89, 0x8d, 0xc4, 0x77, 0xaf, 0x75, 0x92, 0x7f,
+    0x84, 0xa1, 0x99, 0x94, 0x9e, 0x82, 0x7a, 0x98, 0x7e, 0x8e, 0x93, 0x8c,
+    0x6b, 0x93, 0x84, 0xaa, 0x7f, 0x8f, 0x6b, 0x94, 0xa3, 0x8a, 0x78, 0x82,
+    0x60, 0x92, 0x8b, 0x8d, 0x75, 0x8c, 0x8e, 0x6e, 0x7e, 0x9d, 0x6d, 0x8e,
+    0x79, 0x8d, 0x80, 0x89, 0xaa, 0x99, 0x7e, 0xa3, 0x83, 0x95, 0x83, 0x85,
+    0x9c, 0x60, 0x99, 0x78, 0x93, 0x8b, 0x80, 0x82, 0x9d, 0x6b, 0xc2, 0x54,
+    0xb9, 0x7a, 0x83, 0x98, 0x88, 0x65, 0xcb, 0x52, 0xa7, 0x8d, 0x7f, 0x81,
+    0x6b, 0x6d, 0x9e, 0x92, 0x85, 0x82, 0x9f, 0x67, 0x6f, 0x74, 0xaa, 0x75,
+    0x99, 0x9f, 0x8a, 0x8b, 0x88, 0x82, 0xb8, 0x6b, 0x85, 0x99, 0x93, 0x90,
+    0x8d, 0x7a, 0xaa, 0x9d, 0x86, 0x7f, 0xbd, 0x91, 0x67, 0x65, 0x8c, 0xb3,
+    0x87, 0x94, 0xa3, 0x9a, 0x7e, 0x73, 0x83, 0xaa, 0x7a, 0xba, 0xaa, 0x9e,
+    0x9e, 0x86, 0x9a, 0x63, 0x9c, 0x98, 0x5e, 0xa0, 0x9c, 0x9e, 0x8b, 0x85,
+    0xa2, 0x74, 0x80, 0x8d, 0x7e, 0x89, 0xc0, 0x75, 0xa5, 0x3f, 0x97, 0xa2,
+    0x8c, 0x8c, 0x9d, 0x88, 0xa4, 0x5e, 0x75, 0x5f, 0x87, 0x82, 0xbc, 0x72,
+    0xa3, 0x77, 0x83, 0x79, 0x82, 0x95, 0x8d, 0x77, 0x73, 0x81, 0x9d, 0x9b,
+    0x6c, 0x87, 0x93, 0x96, 0x83, 0x86, 0x8b, 0x89, 0x72, 0x7d, 0x96, 0x78,
+    0x67, 0xa2, 0x8d, 0x81, 0x6a, 0x98, 0x75, 0x80, 0x8a, 0x80, 0x9e, 0x82,
+    0x76, 0x9b, 0x6c, 0x94, 0x7a, 0x96, 0x74, 0x92, 0x78, 0x91, 0x7a, 0x7c,
+    0x9a, 0x98, 0x70, 0x5d, 0x9c, 0x4b, 0x70, 0x7d, 0xa9, 0x9b, 0x70, 0x96,
+    0xad, 0x59, 0xc4, 0x63, 0xbc, 0x8f, 0x5c, 0x86, 0x8e, 0x97, 0xa0, 0x7c,
+    0xa6, 0x77, 0xaa, 0x93, 0x68, 0x66, 0x93, 0x91, 0x7b, 0x7e, 0xa2, 0x7a,
+    0x98, 0x77, 0x97, 0x59, 0x84, 0x76, 0x9c, 0x7b, 0x8b, 0x76, 0x88, 0x7a,
+    0x8c, 0x7b, 0xa4, 0xae, 0x6e, 0x7d, 0xb3, 0x99, 0x8d, 0x68, 0x9e, 0x7e,
+    0x77, 0x59, 0x80, 0xbe, 0x80, 0x83, 0xd9, 0x9f, 0x7d, 0x60, 0x8b, 0x98,
+    0x7f, 0x9e, 0xa3, 0x8d, 0x7d, 0x81, 0x9e, 0x78, 0x99, 0x94, 0x70, 0x80,
+    0x9b, 0x89, 0x8c, 0x6d, 0x9c, 0x95, 0x76, 0x7c, 0x83, 0x87, 0x97, 0x93,
+    0x89, 0x6d, 0x77, 0x7e, 0x7e, 0x87, 0x8e, 0x7e, 0x94, 0x61, 0x94, 0xa2,
+    0x94, 0x91, 0xa1, 0x64, 0xc1, 0x78, 0x79, 0xaf, 0x67, 0x7a, 0x9b, 0xa1,
+    0x95, 0x8e, 0x97, 0x84, 0x7b, 0x85, 0x80, 0xa1, 0x6f, 0x87, 0x79, 0x83,
+    0x73, 0x9d, 0x81, 0x64, 0x7a, 0x7f, 0x8f, 0x91, 0x73, 0x97, 0x74, 0x8b,
+    0x7e, 0x88, 0x7f, 0x7e, 0x6e, 0xa1, 0x85, 0x8f, 0x77, 0x93, 0x7a, 0x6f,
+    0x7b, 0x91, 0x67, 0x73, 0x8b, 0x97, 0x6d, 0x87, 0x84, 0xf8, 0xff, 0xff,
+    0x88, 0xf8, 0xff, 0xff, 0xe6, 0xf8, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x02, 0x00, 0x00, 0x73, 0x84, 0xbb, 0xa4, 0xa5, 0x44, 0x5c, 0xb1,
+    0x8e, 0x50, 0x82, 0x8b, 0x81, 0x86, 0x48, 0x80, 0xa9, 0x61, 0xa3, 0xa8,
+    0xca, 0x5a, 0x9d, 0x8a, 0x89, 0x7c, 0x65, 0x91, 0x5e, 0x70, 0x84, 0x71,
+    0xbc, 0x36, 0x8e, 0x8b, 0xa6, 0x63, 0xb7, 0x75, 0x92, 0x59, 0x60, 0x7e,
+    0x33, 0x8f, 0x90, 0x7a, 0xa9, 0x27, 0x72, 0x80, 0x62, 0x95, 0x93, 0x7b,
+    0x60, 0x46, 0x40, 0x55, 0x01, 0x9e, 0x8a, 0x6b, 0x58, 0x8a, 0xa6, 0xb7,
+    0x91, 0x39, 0x72, 0xb4, 0x6e, 0x67, 0x83, 0x91, 0x82, 0x7b, 0x64, 0x7a,
+    0x87, 0x6e, 0xb0, 0xa0, 0xd3, 0x53, 0xb7, 0x93, 0x76, 0xa6, 0x68, 0x8a,
+    0x74, 0x6a, 0x96, 0x6e, 0xb3, 0x53, 0xaa, 0x89, 0xf1, 0x76, 0xb8, 0x75,
+    0x8b, 0x66, 0x5f, 0x6e, 0x52, 0x92, 0x6f, 0x82, 0xbe, 0x45, 0x8d, 0x69,
+    0x98, 0x98, 0x80, 0x87, 0x73, 0x7d, 0x4d, 0x42, 0x1f, 0xa5, 0x6a, 0x73,
+    0x47, 0x87, 0x8a, 0xd1, 0x75, 0x30, 0x91, 0xae, 0x60, 0x82, 0x7a, 0x94,
+    0x75, 0x71, 0x6a, 0x7c, 0x74, 0x7a, 0xac, 0xa2, 0xb6, 0x51, 0xc6, 0x97,
+    0x63, 0xa0, 0x67, 0x7f, 0x80, 0x69, 0x88, 0x6b, 0xa5, 0x5e, 0xc2, 0x72,
+    0xf4, 0x6e, 0xaf, 0x76, 0x7f, 0x7c, 0x55, 0x68, 0x67, 0x97, 0x61, 0x7b,
+    0xbe, 0x5e, 0xab, 0x58, 0xca, 0xa2, 0x77, 0x7a, 0x8f, 0x6e, 0x54, 0x33,
+    0x4d, 0xa7, 0x5d, 0x66, 0x47, 0x92, 0x6f, 0xd6, 0x5c, 0x25, 0xa9, 0xbc,
+    0x5c, 0xb8, 0x64, 0x9b, 0x58, 0x6e, 0x77, 0x76, 0x6a, 0x94, 0xb2, 0xac,
+    0x9a, 0x51, 0xd0, 0x94, 0x62, 0xcc, 0x5a, 0x7f, 0x74, 0x6e, 0x7d, 0x71,
+    0x9b, 0x69, 0xd3, 0x64, 0xef, 0x76, 0xaa, 0x75, 0x89, 0x84, 0x50, 0x76,
+    0x72, 0x97, 0x5f, 0x77, 0xc5, 0x66, 0xce, 0x3a, 0xe5, 0xad, 0x5a, 0x81,
+    0x9e, 0x8e, 0x60, 0x3d, 0x6d, 0xa9, 0x46, 0x6b, 0x44, 0x89, 0x4d, 0xd8,
+    0x4c, 0x28, 0xb1, 0xb7, 0x60, 0xc7, 0x57, 0xb5, 0x50, 0x68, 0x88, 0x7c,
+    0x60, 0x98, 0xac, 0x9a, 0x7f, 0x51, 0xce, 0x8a, 0x5e, 0xd8, 0x51, 0x7d,
+    0x68, 0x6e, 0x7f, 0x6e, 0x90, 0x7b, 0xdf, 0x60, 0xda, 0x77, 0x91, 0x6f,
+    0x85, 0xa0, 0x58, 0x73, 0x70, 0x93, 0x51, 0x7d, 0xb9, 0x70, 0xf5, 0x31,
+    0xe9, 0xa3, 0x47, 0x76, 0xa7, 0x9b, 0x72, 0x3d, 0x90, 0xb2, 0x57, 0x64,
+    0x5b, 0x6f, 0x2b, 0xcf, 0x52, 0x28, 0xc1, 0xa7, 0x6a, 0x78, 0x51, 0xad,
+    0x49, 0x70, 0x90, 0x81, 0x5c, 0x7e, 0x9e, 0x99, 0x77, 0x50, 0xc0, 0x94,
+    0x63, 0xb7, 0x4d, 0x71, 0x58, 0x66, 0x76, 0x6d, 0x78, 0x6a, 0xe1, 0x40,
+    0xc7, 0x73, 0x7f, 0x65, 0x7c, 0x7f, 0x4d, 0x80, 0x64, 0x95, 0x57, 0x81,
+    0xb1, 0x5e, 0xff, 0x26, 0xd6, 0xa2, 0x3a, 0x73, 0xa7, 0x81, 0x76, 0x5d,
+    0x92, 0xb1, 0x58, 0x48, 0x4e, 0x5e, 0x1a, 0xc8, 0x58, 0x2c, 0xb6, 0xa7,
+    0x67, 0x89, 0x5e, 0xa0, 0x4f, 0x78, 0x93, 0x8b, 0x57, 0x7b, 0x95, 0x78,
+    0x6e, 0x46, 0xb2, 0x98, 0x55, 0xd3, 0x5e, 0x66, 0x56, 0x68, 0x74, 0x7e,
+    0x72, 0x74, 0xdd, 0x36, 0xa6, 0x64, 0x65, 0x6b, 0x81, 0x98, 0x56, 0x76,
+    0x65, 0x93, 0x58, 0x7d, 0x9b, 0x82, 0xef, 0x44, 0xbf, 0xa4, 0x3d, 0x57,
+    0xa0, 0xa7, 0x7a, 0x74, 0x9f, 0xa8, 0x70, 0x52, 0x55, 0x5f, 0x1a, 0x94,
+    0x64, 0x37, 0xa7, 0xa6, 0x80, 0x7d, 0x6e, 0x99, 0x5d, 0x81, 0x8a, 0x99,
+    0x5c, 0x76, 0x8f, 0x44, 0x68, 0x50, 0x94, 0x97, 0x63, 0xb6, 0x73, 0x56,
+    0x5b, 0x70, 0x66, 0x8b, 0x72, 0x78, 0xcc, 0x31, 0x8b, 0x68, 0x4a, 0x74,
+    0x7d, 0x99, 0x54, 0x91, 0x6a, 0x90, 0x5d, 0x80, 0x8c, 0x82, 0xcd, 0x4f,
+    0xb0, 0x96, 0x63, 0x56, 0x97, 0xb3, 0x7e, 0x97, 0xa4, 0x9d, 0x7a, 0x5d,
+    0x49, 0x36, 0x18, 0x64, 0x60, 0x43, 0x89, 0xa2, 0x6a, 0x49, 0x7f, 0x58,
+    0x6a, 0x83, 0x77, 0x9d, 0x70, 0x3b, 0x83, 0x21, 0x59, 0x52, 0x6d, 0x95,
+    0x48, 0xa8, 0x8a, 0x42, 0x50, 0x6d, 0x44, 0x95, 0x69, 0x50, 0xc1, 0x4b,
+    0x7c, 0x59, 0x42, 0x78, 0x77, 0x7f, 0x5b, 0x98, 0x67, 0x89, 0x55, 0x8b,
+    0x82, 0x47, 0xb7, 0x64, 0x9d, 0x83, 0x5c, 0x53, 0x89, 0x90, 0x79, 0xb2,
+    0x90, 0x98, 0x85, 0x5a, 0x4d, 0x2b, 0x19, 0x1e, 0x52, 0x50, 0x57, 0x8b,
+    0x73, 0x3a, 0x88, 0x1e, 0x65, 0x80, 0x4d, 0x9b, 0x6c, 0x3c, 0x86, 0x26,
+    0x5b, 0x56, 0x36, 0x98, 0x49, 0x87, 0x9f, 0x2a, 0x40, 0x61, 0x27, 0x9d,
+    0x63, 0x40, 0xa8, 0x46, 0x6b, 0x52, 0x52, 0x7f, 0x67, 0x6a, 0x58, 0xa1,
+    0x5d, 0x6d, 0x5f, 0x9a, 0x72, 0x3a, 0x99, 0x63, 0x8c, 0x80, 0x68, 0x58,
+    0x72, 0x6a, 0x7c, 0xbb, 0x7e, 0x78, 0x94, 0x60, 0x72, 0xfb, 0xff, 0xff,
+    0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff,
+    0x8f, 0x00, 0x00, 0x00, 0x8f, 0xfc, 0xff, 0xff, 0xb4, 0xfe, 0xff, 0xff,
+    0xc1, 0xfd, 0xff, 0xff, 0x59, 0xff, 0xff, 0xff, 0xbc, 0xfe, 0xff, 0xff,
+    0x09, 0xff, 0xff, 0xff, 0x9e, 0xfb, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0xe9, 0x03, 0x00, 0x00, 0x2b, 0xfd, 0xff, 0xff,
+    0x3b, 0xfd, 0xff, 0xff, 0x91, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,
+    0x04, 0xfd, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf0, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x18, 0x03, 0x00, 0x00, 0x78, 0x03, 0x00, 0x00,
+    0x88, 0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x64, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,
+    0xb2, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,
+    0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x84, 0xfd, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x22, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x1c, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,
+    0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xec, 0xfd, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x17, 0xb1, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x84, 0xdb, 0x33, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x9d, 0xf0, 0x2c, 0xc1, 0x8e, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x48, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x5f, 0x31, 0x2f,
+    0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74,
+    0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x2f,
+    0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x00, 0x00,
+    0x84, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xac, 0x5f, 0xf6, 0x39, 0x01, 0x00, 0x00, 0x00, 0x1d, 0xaf, 0x62, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x5e, 0x1b, 0x83, 0xbd, 0x22, 0xfe, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x0f, 0x72, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x38, 0x1d, 0x71, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,
+    0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x6c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xc6, 0xd0, 0xd0, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0xf5, 0xff, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x0e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x3c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x50, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x2f, 0x46, 0x61,
+    0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d,
+    0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00,
+    0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf7, 0x5e, 0x6c, 0x3a,
+    0x01, 0x00, 0x00, 0x00, 0x30, 0x42, 0xec, 0x3d, 0x01, 0x00, 0x00, 0x00,
+    0x42, 0xca, 0xe8, 0xbd, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x43, 0x6f, 0x6e, 0x76,
+    0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xec, 0xcd, 0xc0, 0x38, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,
+    0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+    0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x25, 0xf5, 0xe8, 0x37, 0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00,
+    0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x09, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+    0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f,
+    0x14, 0x00, 0x1c, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x30, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff,
+    0x00, 0x19, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x09, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x04};
+const int g_tiny_conv_micro_features_model_data_len = 18208;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..22c0a970b774299aea629ce034b9dd2e4c04e1ca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
@@ -0,0 +1,27 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is a standard TensorFlow Lite model file that has been converted into a
+// C data array, so it can be easily compiled into a binary for devices that
+// don't have a file system. It was created using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+
+extern const unsigned char g_tiny_conv_micro_features_model_data[];
+extern const int g_tiny_conv_micro_features_model_data_len;
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51b7d8b35bcbce892fbd891415e3f09725c62faa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+#include <string.h>
+
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read) {
+  const int size = state->size;
+
+  // Copy samples from the samples buffer over to our local input.
+  size_t max_samples_to_copy = state->size - state->input_used;
+  if (max_samples_to_copy > num_samples) {
+    max_samples_to_copy = num_samples;
+  }
+  memcpy(state->input + state->input_used, samples,
+         max_samples_to_copy * sizeof(*samples));
+  *num_samples_read = max_samples_to_copy;
+  state->input_used += max_samples_to_copy;
+
+  if (state->input_used < state->size) {
+    // We don't have enough samples to compute a window.
+    return 0;
+  }
+
+  // Apply the window to the input.
+  const int16_t* coefficients = state->coefficients;
+  const int16_t* input = state->input;
+  int16_t* output = state->output;
+  int i;
+  int16_t max_abs_output_value = 0;
+  for (i = 0; i < size; ++i) {
+    int16_t new_value = ((static_cast<int32_t>(*input++)) * *coefficients++) >>
+                        kFrontendWindowBits;
+    *output++ = new_value;
+    if (new_value < 0) {
+      new_value = -new_value;
+    }
+    if (new_value > max_abs_output_value) {
+      max_abs_output_value = new_value;
+    }
+  }
+  // Shuffle the input down by the step size, and update how much we have used.
+  memmove(state->input, state->input + state->step,
+          sizeof(*state->input) * (state->size - state->step));
+  state->input_used -= state->step;
+  state->max_abs_output_value = max_abs_output_value;
+
+  // Indicate that the output buffer is valid for the next stage.
+  return 1;
+}
+
+void WindowReset(struct WindowState* state) {
+  memset(state->input, 0, state->size * sizeof(*state->input));
+  memset(state->output, 0, state->size * sizeof(*state->output));
+  state->input_used = 0;
+  state->max_abs_output_value = 0;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
new file mode 100644
index 0000000000000000000000000000000000000000..b32c059d81a8efe68c8a87a250fd733e1849479b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFrontendWindowBits 12
+
+struct WindowState {
+  size_t size;
+  int16_t coefficients[kMaxAudioSampleSize];
+  size_t step;
+
+  int16_t input[kMaxAudioSampleSize];
+  size_t input_used;
+  int16_t output[kMaxAudioSampleSize];
+  int16_t max_abs_output_value;
+};
+
+// Applies a window to the samples coming in, stepping forward at the given
+// rate.
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read);
+
+void WindowReset(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..310f84fc60b32e37f7e7d9d79bc2425ce7cddf8a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test window function behaviors using default config values.
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
+    config_.size_ms = 25;
+    config_.step_size_ms = 10;
+  }
+
+  struct WindowConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+
+  const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
+                              2681, 3145, 3541, 3843, 4032, 4096, 4032,
+                              3843, 3541, 3145, 2681, 2177, 1664, 1176,
+                              743,  391,  144,  16};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  int i;
+  for (i = kStepSamples; i < kWindowSamples; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  const int16_t expected[] = {
+      0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+      0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+
+  const int16_t expected[] = {
+      0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
+      0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
+      state.input_used,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..618973b39b2ebd2088b4c3756ea6ca1c1f7e8181
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+// Needed because some platforms don't have M_PI defined.
+#define WINDOW_PI (3.14159265358979323846f)
+
+void WindowFillConfigWithDefaults(struct WindowConfig* config) {
+  config->size_ms = 25;
+  config->step_size_ms = 10;
+}
+
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate) {
+  state->size = config->size_ms * sample_rate / 1000;
+  state->step = config->step_size_ms * sample_rate / 1000;
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->coefficients,
+                                 (state->size * sizeof(*state->coefficients)));
+
+  // Populate the window values.
+  const float arg = WINDOW_PI * 2.0 / (static_cast<float>(state->size));
+  int i;
+  for (i = 0; i < state->size; ++i) {
+    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
+    // Scale it to fixed point and round it.
+    state->coefficients[i] =
+        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
+  }
+
+  state->input_used = 0;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->output,
+                                 (state->size * sizeof(*state->output)));
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0c61c29dc9cd2a91f37ea89ace5e031235dd337
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct WindowConfig {
+  // length of window frame in milliseconds
+  size_t size_ms;
+  // length of step for next frame in milliseconds
+  size_t step_size_ms;
+};
+
+// Populates the WindowConfig with "sane" default values.
+void WindowFillConfigWithDefaults(struct WindowConfig* config);
+
+// Allocates any buffers.
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate);
+
+// Frees any allocated buffers.
+void WindowFreeStateContents(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
diff --git a/tensorflow/core/kernels/bitcast_op.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
similarity index 53%
rename from tensorflow/core/kernels/bitcast_op.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
index 1f3659f303338efc69da56da0a67144e9400844b..48535d12d5db850cf0a497645f9e77d98fbcb8a1 100644
--- a/tensorflow/core/kernels/bitcast_op.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,17 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// See docs in ../ops/array_ops.cc.
+// See the header for documentation on the meaning of this data.
 
-#ifndef TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
-#define TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
 
-#include <string.h>  // for memcpy
-
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-
-#endif  // TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+const uint8_t g_yes_feature_data_slice[g_yes_feature_data_slice_size] = {
+    214, 215, 236, 202, 235, 203, 225, 191, 203, 188, 199, 194, 212, 127,
+    51,  0,   174, 188, 219, 196, 228, 221, 240, 207, 235, 220, 241, 219,
+    237, 207, 212, 142, 95,  0,   139, 78,  162, 177, 197, 183,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..e73a13153b65be78a2a57edce0d09f48a8cb444f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_micro_features_data.cc and consists of the 26th spectrogram slice of 40
+// values. This is the expected result of running the sample data in
+// yes_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_yes_feature_data_slice_size = 40;
+extern const uint8_t g_yes_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2ee0995c00ee0da1337c86cf9aa18ba726bfe1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
+ * --output_c_file="yes_micro_features_data.cc" \
+ */
+
+const int g_yes_micro_f2e59fea_nohash_1_width = 40;
+const int g_yes_micro_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_micro_f2e59fea_nohash_1_data[] = {
+    244, 226, 245, 223, 234, 213, 228, 208, 194, 110, 95,  116, 102, 0,   137,
+    161, 183, 173, 137, 116, 133, 157, 151, 156, 128, 110, 128, 0,   68,  78,
+    78,  90,  68,  68,  78,  102, 95,  78,  95,  78,  210, 188, 209, 183, 204,
+    188, 201, 191, 166, 119, 90,  107, 110, 107, 175, 157, 179, 168, 182, 145,
+    152, 164, 171, 165, 136, 143, 122, 68,  0,   78,  90,  90,  110, 90,  102,
+    99,  90,  68,  78,  68,  223, 186, 179, 123, 182, 110, 196, 171, 159, 110,
+    102, 95,  90,  99,  160, 134, 125, 136, 153, 152, 164, 134, 164, 151, 141,
+    136, 99,  90,  90,  90,  78,  78,  102, 119, 102, 90,  110, 90,  68,  51,
+    177, 175, 211, 172, 183, 0,   95,  68,  129, 102, 68,  85,  114, 105, 110,
+    85,  102, 95,  140, 51,  85,  51,  95,  90,  143, 116, 90,  78,  78,  51,
+    107, 85,  68,  0,   68,  51,  90,  51,  68,  0,   164, 117, 193, 120, 156,
+    0,   138, 51,  90,  0,   51,  0,   51,  85,  0,   0,   51,  0,   0,   0,
+    0,   0,   114, 0,   85,  78,  90,  51,  0,   0,   51,  85,  99,  85,  107,
+    68,  90,  85,  78,  0,   51,  0,   110, 0,   68,  0,   0,   0,   51,  0,
+    51,  0,   0,   0,   68,  90,  107, 0,   68,  0,   0,   0,   68,  0,   51,
+    68,  0,   78,  68,  0,   51,  0,   78,  68,  90,  68,  78,  51,  51,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   0,   0,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  68,
+    0,   0,   78,  0,   78,  0,   78,  0,   51,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   51,  0,   51,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   51,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   0,   51,  78,  0,   0,   51,  51,  0,   0,   0,   78,  0,
+    213, 170, 192, 180, 196, 188, 173, 131, 173, 116, 137, 105, 159, 127, 0,
+    0,   0,   0,   127, 164, 165, 161, 170, 164, 185, 197, 195, 167, 134, 138,
+    159, 134, 136, 105, 51,  0,   99,  0,   51,  0,   228, 215, 229, 218, 237,
+    215, 228, 210, 237, 222, 239, 211, 208, 211, 234, 218, 220, 209, 225, 219,
+    235, 222, 245, 225, 245, 224, 243, 223, 241, 218, 237, 224, 234, 213, 221,
+    193, 197, 164, 157, 128, 227, 188, 232, 196, 220, 220, 240, 219, 234, 213,
+    234, 211, 231, 218, 233, 213, 239, 215, 228, 207, 229, 206, 224, 208, 226,
+    207, 232, 210, 225, 208, 230, 199, 227, 206, 210, 205, 218, 174, 178, 141,
+    235, 208, 220, 206, 225, 203, 233, 203, 225, 167, 205, 199, 208, 190, 221,
+    204, 223, 207, 225, 188, 225, 197, 215, 188, 199, 183, 225, 195, 224, 200,
+    216, 178, 208, 188, 215, 202, 214, 183, 176, 140, 198, 150, 211, 194, 203,
+    120, 175, 188, 204, 189, 219, 192, 223, 202, 216, 186, 203, 185, 210, 182,
+    214, 183, 204, 170, 204, 125, 184, 187, 206, 185, 198, 182, 210, 161, 202,
+    198, 218, 173, 145, 120, 188, 183, 205, 168, 200, 170, 210, 177, 187, 190,
+    209, 193, 193, 166, 210, 162, 175, 119, 174, 147, 182, 161, 181, 134, 176,
+    143, 187, 165, 186, 149, 185, 141, 192, 181, 202, 123, 170, 143, 144, 78,
+    149, 0,   208, 182, 170, 78,  170, 0,   117, 51,  156, 99,  195, 170, 200,
+    130, 152, 68,  175, 141, 173, 134, 194, 132, 189, 164, 198, 134, 173, 117,
+    171, 149, 183, 181, 185, 99,  153, 117, 125, 0,   166, 0,   173, 117, 144,
+    0,   117, 102, 188, 120, 193, 166, 197, 68,  163, 119, 169, 99,  134, 0,
+    162, 0,   164, 68,  171, 116, 126, 0,   120, 68,  68,  0,   105, 0,   159,
+    95,  150, 51,  90,  85,  0,   0,   131, 0,   105, 0,   145, 51,  170, 51,
+    120, 0,   107, 0,   145, 85,  160, 0,   85,  0,   0,   51,  149, 0,   78,
+    0,   0,   0,   0,   0,   0,   0,   90,  0,   112, 0,   78,  102, 122, 0,
+    0,   0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   0,   0,   112,
+    0,   164, 120, 143, 0,   0,   0,   0,   0,   51,  0,   90,  0,   78,  0,
+    0,   0,   0,   0,   110, 0,   139, 0,   112, 51,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   102, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   107,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,   51,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   127, 110, 133, 0,   167, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   132, 0,   190,
+    194, 202, 0,   197, 187, 161, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    214, 213, 223, 203, 218, 189, 200, 122, 78,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   191, 210, 231, 197, 226, 217, 238, 216, 236, 207,
+    199, 0,   0,   0,   0,   0,   107, 122, 155, 160, 214, 215, 236, 202, 235,
+    203, 225, 191, 203, 188, 199, 194, 212, 127, 51,  0,   174, 188, 219, 196,
+    228, 221, 240, 207, 235, 220, 241, 219, 237, 207, 212, 142, 95,  0,   139,
+    78,  162, 177, 197, 183, 211, 199, 235, 208, 238, 215, 227, 207, 211, 201,
+    224, 213, 226, 192, 213, 170, 223, 205, 234, 221, 245, 225, 242, 220, 245,
+    221, 239, 221, 238, 213, 226, 180, 159, 112, 176, 159, 208, 202, 213, 191,
+    205, 191, 225, 197, 238, 219, 224, 201, 227, 200, 221, 201, 225, 203, 212,
+    195, 229, 210, 228, 210, 239, 216, 226, 212, 233, 205, 225, 200, 229, 207,
+    222, 151, 147, 119, 179, 185, 230, 218, 223, 192, 202, 136, 205, 177, 223,
+    204, 228, 215, 232, 209, 221, 189, 221, 205, 209, 200, 226, 209, 229, 205,
+    235, 192, 209, 198, 228, 190, 206, 185, 207, 187, 214, 175, 177, 184, 220,
+    195, 214, 207, 230, 184, 205, 159, 208, 184, 189, 169, 224, 213, 219, 199,
+    229, 203, 216, 205, 222, 204, 224, 206, 231, 208, 231, 176, 197, 184, 216,
+    193, 211, 139, 212, 195, 231, 164, 166, 195, 217, 182, 208, 190, 217, 179,
+    205, 68,  182, 119, 195, 168, 182, 136, 204, 179, 193, 158, 182, 140, 188,
+    154, 197, 169, 190, 99,  184, 0,   125, 0,   131, 0,   99,  68,  179, 85,
+    190, 184, 213, 203, 223, 202, 212, 190, 209, 138, 178, 0,   159, 51,  128,
+    51,  105, 0,   139, 51,  179, 125, 185, 114, 171, 128, 175, 132, 181, 174,
+    155, 0,   0,   0,   90,  0,   125, 0,   176, 188, 227, 217, 244, 215, 234,
+    221, 239, 192, 224, 210, 0,   0,   134, 0,   51,  0,   105, 0,   105, 0,
+    143, 90,  192, 119, 175, 147, 141, 51,  184, 110, 85,  0,   0,   0,   0,
+    0,   0,   0,   151, 139, 201, 203, 232, 203, 226, 208, 236, 206, 230, 212,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   169, 0,   119,
+    0,   78,  0,   0,   0,   0,   0,   0,   0,   0,   0,   68,  0,   0,   133,
+    200, 180, 220, 197, 228, 201, 221, 184, 213, 193, 110, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   78,  0,   164, 0,   0,   0,   0,   0,   107, 0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   150, 164, 202, 182, 224,
+    197, 211, 179, 212, 193, 134, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   150, 0,   85,  0,   95,  0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   102, 90,  193, 160, 203, 164, 200, 178, 205, 174,
+    116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   120, 114, 123, 0,   114,
+    0,   145, 68,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    102, 68,  199, 170, 195, 180, 208, 176, 200, 164, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   142, 102, 172, 110, 186,
+    167, 185, 147, 189, 154, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   177, 0,   158, 136, 197, 155, 189, 166,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   155, 90,  175, 117, 175, 138, 202, 165, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   139,
+    0,   120, 68,  51,  123, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   119, 0,   78,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..d19bf8f067d7329dcda0b866d0d323b92f175e61
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+
+extern const int g_yes_micro_f2e59fea_nohash_1_width;
+extern const int g_yes_micro_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_micro_f2e59fea_nohash_1_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
index 4e54ff670eb9badd648aee99cf154c0d3b988bff..6f0c2581771e87e69481726adaea4fab3108640b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -32,7 +32,8 @@ TF_LITE_MICRO_TEST(TestInvoke) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -61,12 +62,12 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
   TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
   TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
-  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(40, input->dims->data[2]);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
 
   // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
   // into the memory area used for the input.
-  const uint8_t* yes_features_data = g_yes_f2e59fea_nohash_1_data;
+  const uint8_t* yes_features_data = g_yes_micro_f2e59fea_nohash_1_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = yes_features_data[i];
   }
@@ -102,7 +103,7 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_GT(yes_score, no_score);
 
   // Now test with a different input, from a recording of "No".
-  const uint8_t* no_features_data = g_no_f9643d42_nohash_4_data;
+  const uint8_t* no_features_data = g_no_micro_f9643d42_nohash_4_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = no_features_data[i];
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85113a90dcf610a38f21e17f0b303befd6c1e071
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
@@ -0,0 +1,1477 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+
+const int g_no_1000ms_sample_data_size = 16000;
+const int16_t g_no_1000ms_sample_data[16000] = {
+    5,     1,     -10,   -16,   -14,   -10,   -4,    -5,    -10,   -15,   -13,
+    -17,   -22,   -21,   -23,   -25,   -22,   -26,   -28,   -31,   -28,   -25,
+    -20,   -24,   -21,   -13,   -7,    -1,    -1,    3,     3,     4,     -4,
+    -6,    -8,    -10,   -13,   -4,    -2,    5,     8,     11,    26,    28,
+    34,    32,    34,    30,    21,    18,    15,    13,    8,     5,     14,
+    13,    7,     8,     4,     -5,    -7,    -4,    -9,    -13,   -17,   -21,
+    -16,   -14,   -12,   -12,   -14,   -11,   -9,    -2,    5,     -1,    2,
+    0,     2,     1,     -3,    -13,   -14,   -16,   -11,   -10,   -9,    -13,
+    -17,   -19,   -25,   -21,   -21,   -20,   -13,   -5,    -3,    0,     3,
+    6,     5,     1,     0,     -1,    -7,    -10,   -11,   -9,    -6,    -7,
+    -11,   -10,   -5,    -14,   -20,   -23,   -22,   -22,   -19,   -15,   -12,
+    -6,    -5,    3,     13,    16,    17,    25,    26,    28,    34,    34,
+    33,    34,    30,    21,    22,    18,    13,    20,    22,    24,    27,
+    26,    23,    21,    18,    9,     5,     -2,    -7,    -8,    -10,   -8,
+    -8,    -4,    2,     2,     -1,    -7,    -10,   -8,    -12,   -13,   -15,
+    -9,    -5,    -4,    -3,    -6,    -11,   -11,   -18,   -16,   -13,   -10,
+    -12,   -6,    0,     -2,    0,     -3,    -4,    -8,    -12,   -19,   -16,
+    -17,   -19,   -23,   -30,   -33,   -36,   -38,   -39,   -40,   -36,   -37,
+    -32,   -27,   -25,   -31,   -38,   -41,   -47,   -52,   -50,   -42,   -32,
+    -16,   -7,    -3,    0,     -1,    -1,    -5,    -16,   -23,   -29,   -34,
+    -33,   -27,   -17,   -11,   1,     4,     10,    18,    21,    24,    24,
+    25,    30,    34,    30,    29,    26,    23,    20,    15,    14,    13,
+    14,    16,    23,    28,    21,    23,    21,    13,    12,    12,    14,
+    17,    21,    26,    27,    30,    30,    26,    20,    15,    15,    9,
+    8,     9,     10,    7,     8,     7,     1,     -2,    -6,    -10,   -10,
+    -12,   -15,   -10,   -7,    -6,    -5,    0,     -3,    -3,    -12,   -25,
+    -35,   -49,   -53,   -49,   -51,   -48,   -46,   -48,   -39,   -33,   -31,
+    -37,   -42,   -47,   -49,   -46,   -47,   -47,   -46,   -42,   -39,   -33,
+    -26,   -23,   -14,   -8,    -9,    -7,    -10,   -11,   -13,   -13,   -19,
+    -20,   -16,   -11,   -9,    7,     16,    21,    29,    27,    29,    28,
+    21,    14,    13,    17,    19,    20,    18,    13,    17,    16,    18,
+    20,    17,    13,    16,    23,    26,    26,    25,    27,    31,    30,
+    31,    34,    32,    35,    32,    36,    31,    26,    23,    27,    27,
+    29,    27,    26,    32,    31,    28,    26,    23,    14,    6,     0,
+    -4,    -7,    -9,    -10,   -8,    -3,    4,     12,    11,    15,    11,
+    8,     2,     -3,    -3,    -4,    -6,    -11,   -14,   -20,   -28,   -32,
+    -38,   -46,   -42,   -44,   -40,   -34,   -26,   -29,   -25,   -23,   -24,
+    -17,   -21,   -26,   -23,   -25,   -19,   -10,   -11,   -10,   -10,   -12,
+    -9,    -3,    0,     -3,    -7,    -10,   -13,   -10,   -14,   -13,   -17,
+    -22,   -22,   -30,   -28,   -29,   -26,   -18,   -6,    -1,    -3,    -4,
+    -6,    -10,   -13,   -10,   -14,   -16,   -11,   -15,   -9,    -3,    -6,
+    -1,    2,     3,     4,     6,     6,     3,     4,     12,    14,    17,
+    21,    19,    20,    16,    17,    15,    21,    21,    22,    20,    17,
+    16,    16,    20,    17,    15,    9,     5,     11,    18,    24,    28,
+    26,    23,    23,    26,    22,    18,    21,    23,    26,    27,    25,
+    27,    29,    26,    20,    10,    7,     11,    8,     16,    25,    33,
+    37,    38,    39,    35,    30,    20,    13,    9,     6,     5,     13,
+    13,    14,    15,    12,    8,     3,     3,     3,     2,     9,     11,
+    10,    5,     5,     0,     -7,    -11,   -12,   -15,   -17,   -12,   -13,
+    -18,   -19,   -21,   -24,   -22,   -27,   -34,   -36,   -36,   -32,   -20,
+    -16,   -15,   -5,    -5,    -9,    -10,   -9,    -17,   -19,   -20,   -14,
+    -13,   -10,   -4,    -7,    -7,    -14,   -19,   -28,   -31,   -30,   -31,
+    -23,   -19,   -20,   -12,   -11,   -14,   -16,   -20,   -18,   -20,   -21,
+    -24,   -29,   -30,   -30,   -34,   -31,   -25,   -21,   -18,   -11,   -4,
+    2,     2,     3,     3,     2,     4,     -1,    -4,    -8,    -3,    -1,
+    7,     15,    18,    22,    20,    20,    16,    16,    14,    13,    21,
+    25,    26,    35,    28,    28,    28,    25,    21,    19,    18,    21,
+    24,    20,    25,    28,    19,    16,    15,    8,     3,     -1,    3,
+    5,     13,    18,    25,    31,    33,    39,    36,    36,    32,    36,
+    37,    39,    42,    36,    32,    27,    30,    24,    18,    15,    10,
+    7,     5,     6,     -1,    -4,    -10,   -17,   -15,   -19,   -15,   -7,
+    -4,    3,     0,     3,     4,     -2,    -7,    -13,   -21,   -23,   -28,
+    -27,   -26,   -25,   -15,   -10,   -4,    -6,    -5,    -9,    -5,    -3,
+    1,     2,     -1,    1,     -4,    -7,    -8,    -17,   -17,   -15,   -14,
+    -9,    -5,    -7,    -6,    -9,    -16,   -15,   -15,   -16,   -16,   -11,
+    -15,   -15,   -6,    -6,    -5,    -2,    0,     -9,    -10,   -12,   -13,
+    -10,   -4,    0,     8,     5,     4,     2,     0,     -5,    -8,    -16,
+    -15,   -12,   -3,    9,     17,    24,    26,    30,    28,    22,    17,
+    14,    9,     8,     9,     8,     11,    12,    12,    15,    14,    18,
+    20,    17,    19,    22,    21,    12,    5,     0,     3,     -3,    -4,
+    -6,    -7,    1,     8,     8,     8,     10,    2,     -3,    -8,    -15,
+    -20,   -24,   -22,   -23,   -13,   -6,    -7,    -5,    -10,   -8,    -15,
+    -19,   -22,   -20,   -17,   -18,   -13,   -10,   -1,    6,     5,     3,
+    1,     -5,    -11,   -10,   -14,   -19,   -15,   -13,   -8,    -2,    -3,
+    -4,    -3,    -4,    -1,    1,     0,     -3,    -4,    -8,    -18,   -21,
+    -25,   -24,   -16,   -9,    -2,    1,     5,     1,     3,     -2,    -7,
+    -10,   -23,   -30,   -29,   -23,   -9,    -3,    4,     11,    11,    6,
+    2,     0,     -12,   -20,   -28,   -24,   -22,   -17,   -22,   -19,   -14,
+    -21,   -17,   -17,   -12,   -8,    -3,    2,     0,     -6,    -5,    -8,
+    -12,   -17,   -27,   -34,   -31,   -30,   -27,   -19,   -14,   -14,   -14,
+    -14,   -19,   -22,   -21,   -19,   -14,   -1,    5,     9,     8,     6,
+    5,     -4,    -2,    -3,    -3,    -1,    -2,    -3,    2,     7,     8,
+    7,     6,     6,     3,     2,     1,     -2,    0,     6,     11,    18,
+    18,    19,    17,    14,    9,     4,     3,     3,     0,     -1,    3,
+    -1,    -5,    0,     -2,    0,     1,     7,     7,     8,     20,    29,
+    33,    31,    24,    14,    5,     -6,    -11,   -8,    -11,   -2,    6,
+    10,    12,    16,    26,    26,    24,    18,    12,    10,    4,     7,
+    6,     -2,    -12,   -17,   -17,   -20,   -23,   -23,   -18,   -8,    1,
+    3,     5,     6,     3,     0,     -6,    -12,   -12,   -15,   -12,   -7,
+    3,     3,     8,     7,     7,     7,     1,     -1,    -1,    4,     11,
+    17,    25,    32,    35,    42,    50,    52,    56,    50,    55,    53,
+    52,    47,    40,    38,    30,    26,    27,    28,    29,    25,    23,
+    23,    28,    30,    25,    26,    21,    19,    14,    9,     16,    22,
+    25,    33,    39,    45,    49,    48,    55,    51,    43,    35,    20,
+    14,    13,    23,    25,    24,    20,    22,    28,    22,    22,    17,
+    16,    13,    10,    10,    10,    9,     9,     14,    11,    10,    10,
+    4,     0,     0,     -2,    -3,    -5,    -7,    -3,    1,     -8,    -8,
+    -9,    -4,    4,     9,     11,    14,    11,    6,     8,     3,     -6,
+    -10,   -19,   -22,   -24,   -27,   -22,   -16,   -21,   -25,   -33,   -33,
+    -32,   -30,   -21,   -13,   -6,    -5,    2,     1,     4,     9,     7,
+    5,     1,     1,     8,     6,     7,     6,     0,     -6,    -15,   -18,
+    -23,   -22,   -23,   -25,   -22,   -21,   -19,   -17,   -13,   -10,   -10,
+    -16,   -17,   -15,   -13,   -8,    -9,    -14,   -13,   -17,   -20,   -26,
+    -28,   -31,   -29,   -26,   -23,   -13,   -10,   -6,    -1,    5,     7,
+    2,     -3,    -7,    -20,   -18,   -16,   -21,   -27,   -33,   -25,   -27,
+    -22,   -22,   -21,   -16,   -11,   -7,    -2,    2,     11,    18,    11,
+    9,     4,     1,     -1,    -6,    -4,    -5,    -9,    -12,   -16,   -25,
+    -29,   -37,   -37,   -38,   -37,   -33,   -23,   -16,   -14,   -7,    -1,
+    -4,    -3,    -4,    -5,    -11,   -14,   -8,    -8,    -8,    -8,    -9,
+    -4,    -14,   -21,   -22,   -21,   -18,   -15,   -2,    3,     -3,    0,
+    -2,    0,     -4,    -7,    -1,    -2,    3,     3,     -3,    -10,   -13,
+    -10,   -16,   -19,   -17,   -17,   -14,   -7,    5,     5,     7,     8,
+    12,    7,     0,     -5,    -13,   -17,   -18,   -14,   -7,    -4,    3,
+    11,    11,    12,    11,    8,     4,     -5,    -5,    -11,   -15,   -17,
+    -23,   -22,   -18,   -14,   -14,   -12,   -6,    -4,    -1,    3,     1,
+    -4,    -10,   -22,   -29,   -30,   -26,   -15,   -2,    6,     16,    21,
+    28,    32,    25,    24,    20,    9,     5,     0,     3,     7,     10,
+    11,    13,    17,    15,    16,    13,    11,    11,    8,     7,     1,
+    1,     -5,    -2,    -2,    -1,    4,     8,     17,    22,    24,    24,
+    26,    23,    20,    17,    16,    9,     4,     6,     5,     8,     2,
+    -1,    -5,    -4,    -10,   -14,   -14,   -17,   -19,   -18,   -16,   -14,
+    -6,    -3,    1,     3,     0,     -4,    -6,    -4,    -1,    -1,    2,
+    5,     3,     8,     7,     7,     14,    13,    20,    24,    29,    24,
+    12,    7,     -1,    -6,    -15,   -22,   -20,   -27,   -22,   -14,   -6,
+    2,     7,     9,     9,     2,     -3,    -7,    -8,    -10,   -9,    -3,
+    -6,    -11,   -12,   -8,    -5,    -4,    -5,    -3,    0,     3,     6,
+    6,     7,     5,     -7,    -10,   -14,   -13,   -14,   -17,   -11,   -7,
+    -4,    1,     1,     4,     -4,    -8,    -18,   -23,   -23,   -25,   -19,
+    -16,   -15,   -9,    3,     10,    19,    25,    30,    31,    26,    27,
+    23,    19,    16,    8,     7,     2,     0,     -1,    -1,    1,     5,
+    6,     6,     1,     3,     -1,    -7,    -11,   -17,   -19,   -19,   -7,
+    0,     3,     11,    12,    18,    20,    16,    9,     -2,    -7,    -14,
+    -19,   -22,   -30,   -33,   -34,   -36,   -26,   -14,   -11,   -9,    -3,
+    0,     -2,    1,     -2,    -3,    -5,    -12,   -15,   -19,   -14,   -9,
+    -8,    -2,    -6,    -13,   -15,   -19,   -22,   -25,   -26,   -21,   -20,
+    -11,   -1,    1,     5,     9,     13,    15,    12,    11,    3,     1,
+    -1,    0,     8,     13,    16,    16,    15,    16,    15,    12,    9,
+    7,     8,     4,     6,     4,     3,     3,     7,     0,     -4,    -8,
+    -11,   -18,   -18,   -15,   -20,   -23,   -21,   -22,   -21,   -27,   -25,
+    -15,   -7,    -2,    8,     9,     8,     8,     3,     3,     7,     8,
+    8,     8,     12,    11,    12,    4,     -1,    -7,    -11,   -15,   -18,
+    -17,   -17,   -20,   -19,   -13,   -11,   -3,    -3,    -1,    1,     -3,
+    1,     1,     8,     10,    15,    24,    26,    29,    34,    36,    26,
+    20,    12,    -2,    -6,    -9,    -7,    -6,    1,     10,    13,    19,
+    22,    22,    18,    21,    24,    28,    35,    37,    34,    33,    34,
+    34,    30,    19,    15,    10,    19,    21,    23,    24,    21,    19,
+    18,    21,    22,    22,    27,    30,    31,    32,    33,    32,    32,
+    24,    18,    10,    8,     10,    10,    6,     2,     -7,    -14,   -22,
+    -29,   -27,   -29,   -32,   -30,   -28,   -23,   -22,   -11,   -11,   -13,
+    -3,    2,     -1,    1,     1,     -3,    -7,    -5,    -7,    -11,   -17,
+    -23,   -25,   -26,   -27,   -26,   -23,   -14,   -5,    -3,    -1,    -2,
+    -2,    -1,    1,     -2,    -7,    -4,    2,     4,     10,    13,    6,
+    3,     -2,    -6,    -7,    -11,   -17,   -21,   -15,   -7,    -2,    11,
+    16,    22,    25,    25,    23,    24,    23,    21,    22,    25,    23,
+    17,    17,    12,    8,     -2,    -4,    1,     0,     4,     9,     8,
+    10,    9,     9,     15,    13,    10,    8,     1,     1,     -3,    1,
+    4,     11,    10,    9,     5,     5,     4,     1,     -1,    -4,    0,
+    8,     7,     4,     3,     3,     0,     -9,    -16,   -19,   -20,   -21,
+    -18,   -16,   -11,   -10,   -9,    -13,   -12,   -19,   -25,   -21,   -15,
+    -5,    8,     14,    21,    24,    18,    20,    17,    6,     1,     -2,
+    -2,    1,     1,     4,     1,     -3,    2,     0,     -3,    -3,    -4,
+    1,     0,     -5,    -11,   -17,   -21,   -20,   -20,   -20,   -14,   -9,
+    -3,    3,     7,     5,     3,     1,     -1,    -3,    -4,    -1,    1,
+    -5,    -1,    -1,    -7,    -11,   -14,   -12,   -14,   -17,   -18,   -23,
+    -29,   -24,   -27,   -19,   -12,   -13,   -2,    -3,    4,     4,     0,
+    -3,    -5,    -2,    -1,    -5,    -6,    -7,    -7,    -7,    -9,    -13,
+    -9,    -4,    1,     1,     1,     -4,    -11,   -8,    -15,   -19,   -19,
+    -12,   -5,    1,     7,     12,    8,     10,    10,    10,    11,    11,
+    19,    12,    9,     9,     2,     -4,    -13,   -22,   -24,   -25,   -24,
+    -26,   -19,   -14,   -10,   -1,    5,     4,     -1,    -4,    -5,    -10,
+    -14,   -11,   -8,    -10,   -8,    -9,    -7,    -8,    -6,    -1,    -5,
+    -10,   -18,   -27,   -29,   -24,   -19,   -11,   -7,    1,     10,    8,
+    8,     5,     2,     -5,    -1,    -1,    0,     2,     2,     -2,    -8,
+    -8,    -14,   -26,   -25,   -23,   -18,   -9,    2,     2,     7,     13,
+    6,     7,     5,     4,     3,     2,     1,     7,     2,     -1,    1,
+    -2,    2,     0,     -2,    -6,    -3,    5,     7,     9,     6,     5,
+    4,     2,     0,     -1,    -3,    3,     7,     6,     14,    18,    22,
+    20,    22,    19,    13,    9,     2,     -8,    -11,   -6,    -2,    -3,
+    -3,    0,     0,     0,     1,     -1,    -2,    1,     7,     11,    10,
+    11,    17,    17,    11,    11,    4,     6,     6,     13,    19,    22,
+    23,    27,    25,    24,    22,    14,    11,    13,    7,     0,     -3,
+    -9,    -11,   -7,    -7,    -6,    -4,    1,     7,     9,     15,    18,
+    18,    10,    5,     3,     -3,    -6,    -5,    -8,    -5,    4,     8,
+    8,     11,    10,    9,     4,     4,     1,     -3,    -10,   -11,   -8,
+    -16,   -20,   -22,   -19,   -12,   -7,    -10,   -10,   -13,   -14,   -11,
+    -11,   -13,   -18,   -21,   -19,   -17,   -22,   -18,   -22,   -22,   -16,
+    -9,    -3,    0,     3,     6,     3,     3,     -3,    -6,    -9,    -14,
+    -1,    14,    21,    30,    37,    33,    27,    26,    19,    15,    14,
+    11,    20,    12,    9,     10,    19,    20,    19,    22,    20,    22,
+    17,    13,    14,    10,    8,     12,    15,    13,    12,    12,    12,
+    9,     10,    11,    11,    9,     6,     4,     5,     -2,    1,     1,
+    -1,    5,     1,     8,     6,     3,     -1,    -4,    -15,   -24,   -27,
+    -26,   -23,   -19,   -9,    -3,    -4,    -9,    -9,    -10,   -16,   -22,
+    -19,   -18,   -15,   -2,    3,     5,     6,     7,     8,     11,    3,
+    1,     2,     1,     1,     0,     -4,    -13,   -18,   -19,   -19,   -20,
+    -23,   -15,   -10,   -5,    -3,    -1,    -1,    -1,    3,     -1,    0,
+    -8,    -11,   -13,   -14,   -13,   -8,    -6,    -3,    1,     1,     0,
+    0,     5,     4,     5,     5,     5,     4,     0,     -1,    -4,    -13,
+    -22,   -21,   -28,   -26,   -22,   -28,   -23,   -23,   -14,   -11,   -10,
+    -7,    -8,    -5,    -4,    1,     9,     10,    15,    19,    21,    17,
+    18,    19,    16,    13,    16,    21,    27,    29,    22,    22,    13,
+    4,     1,     0,     -5,    -6,    -2,    3,     5,     8,     6,     9,
+    10,    2,     -3,    -9,    -8,    -4,    -2,    -7,    -6,    -4,    -8,
+    -6,    -8,    -11,   -8,    -8,    -6,    2,     -2,    -2,    -1,    2,
+    4,     8,     5,     -1,    -8,    -10,   -7,    -6,    -5,    -6,    -5,
+    6,     13,    22,    28,    33,    31,    38,    35,    28,    27,    22,
+    22,    23,    26,    23,    21,    28,    28,    23,    23,    22,    21,
+    20,    14,    6,     -1,    -5,    -8,    -5,    -1,    2,     5,     5,
+    7,     8,     5,     4,     0,     3,     6,     10,    13,    13,    6,
+    4,     4,     0,     -2,    -3,    0,     3,     5,     7,     9,     7,
+    6,     10,    8,     3,     4,     -1,    -4,    -2,    0,     -2,    -2,
+    -2,    -3,    5,     8,     6,     4,     -1,    -7,    -6,    -7,    -12,
+    -18,   -11,   -2,    -1,    -1,    -1,    -2,    -7,    -7,    -3,    -3,
+    -5,    -6,    -6,    -6,    -6,    -6,    -9,    -12,   -9,    -5,    1,
+    3,     5,     5,     8,     7,     3,     -5,    -3,    -2,    2,     3,
+    5,     5,     -1,    -2,    -4,    -8,    -9,    -9,    -7,    -12,   -13,
+    -17,   -19,   -16,   -19,   -21,   -21,   -19,   -11,   -6,    -3,    7,
+    8,     6,     2,     0,     1,     1,     -2,    -5,    0,     -2,    2,
+    1,     2,     0,     -2,    -1,    -10,   -21,   -25,   -24,   -21,   -19,
+    -14,   -8,    -3,    -5,    0,     0,     -5,    -6,    -3,    -6,    -9,
+    -13,   -19,   -20,   -21,   -21,   -24,   -25,   -27,   -27,   -29,   -26,
+    -19,   -14,   -14,   -13,   -8,    -5,    -10,   -10,   -6,    1,     4,
+    14,    22,    23,    24,    20,    20,    18,    14,    11,    9,     6,
+    8,     12,    15,    18,    18,    12,    8,     9,     9,     9,     7,
+    4,     9,     5,     6,     5,     3,     3,     -1,    -1,    -6,    -10,
+    -6,    -8,    -3,    0,     -2,    -3,    -2,    -6,    -6,    -7,    -3,
+    -3,    -3,    -2,    1,     -1,    -10,   -7,    -13,   -21,   -23,   -20,
+    -19,   -18,   -18,   -19,   -15,   -16,   -7,    -6,    -9,    -13,   -12,
+    -6,    -1,    3,     6,     7,     5,     3,     -3,    -11,   -18,   -20,
+    -26,   -29,   -27,   -27,   -24,   -30,   -29,   -28,   -23,   -18,   -21,
+    -18,   -15,   -9,    1,     9,     17,    21,    23,    18,    14,    5,
+    -1,    -2,    -1,    0,     3,     6,     5,     4,     4,     0,     -1,
+    1,     -4,    -9,    -13,   -11,   -20,   -21,   -19,   -14,   -9,    -4,
+    1,     6,     10,    16,    24,    30,    35,    31,    38,    37,    35,
+    39,    36,    36,    32,    30,    33,    31,    24,    19,    12,    4,
+    -1,    -7,    -11,   -7,    -5,    -3,    2,     6,     10,    16,    19,
+    21,    21,    16,    10,    14,    12,    14,    13,    12,    12,    5,
+    6,     2,     0,     1,     3,     4,     6,     9,     6,     2,     -1,
+    -3,    -10,   -15,   -13,   -17,   -19,   -15,   -16,   -15,   -13,   -8,
+    -8,    -7,    -10,   -5,    -2,    1,     5,     5,     11,    10,    12,
+    10,    9,     9,     15,    23,    33,    35,    33,    34,    34,    35,
+    34,    24,    30,    26,    23,    21,    20,    15,    10,    3,     4,
+    0,     -7,    -8,    -9,    -9,    -8,    -4,    0,     5,     5,     2,
+    3,     -2,    0,     0,     -1,    0,     -1,    1,     2,     6,     3,
+    1,     -9,    -5,    -6,    -2,    -8,    -12,   -9,    -10,   -7,    -8,
+    -8,    -6,    -2,    -2,    -1,    0,     -2,    -1,    -8,    -18,   -19,
+    -27,   -37,   -42,   -40,   -39,   -33,   -30,   -23,   -16,   -16,   -9,
+    -13,   -11,   -10,   -10,   -8,    -3,    -1,    2,     0,     -1,    2,
+    6,     4,     8,     10,    17,    21,    28,    31,    33,    28,    20,
+    12,    8,     -3,    -5,    -4,    -3,    2,     6,     9,     8,     2,
+    7,     4,     -6,    -9,    -15,   -13,   -15,   -17,   -14,   -11,   -12,
+    -5,    -6,    -4,    -6,    -11,   -11,   -7,    -4,    -6,    -8,    -13,
+    -10,   -7,    -12,   -11,   -12,   -13,   -12,   -9,    -9,    -10,   -10,
+    -6,    -8,    -8,    -7,    -9,    -9,    -7,    2,     5,     5,     6,
+    3,     4,     6,     3,     -1,    -2,    -2,    -2,    1,     5,     3,
+    4,     2,     -2,    -7,    -9,    -13,   -11,   -8,    2,     12,    23,
+    31,    37,    41,    40,    37,    36,    31,    31,    27,    28,    24,
+    13,    16,    14,    15,    9,     4,     4,     5,     4,     7,     12,
+    16,    14,    11,    13,    6,     -2,    -4,    -1,    -3,    3,     6,
+    6,     9,     7,     9,     7,     5,     0,     1,     -1,    -2,    -4,
+    -1,    0,     0,     -4,    0,     -4,    -9,    -15,   -16,   -18,   -15,
+    -10,   -6,    -8,    -5,    -2,    -2,    0,     4,     7,     0,     -2,
+    -3,    4,     3,     2,     -1,    -3,    -8,    -19,   -19,   -19,   -16,
+    -8,    -5,    0,     1,     2,     1,     -1,    -2,    -10,   -12,   -10,
+    -4,    3,     4,     2,     7,     8,     4,     1,     -5,    -5,    -4,
+    -1,    9,     10,    12,    15,    15,    14,    11,    20,    16,    19,
+    18,    26,    29,    21,    23,    16,    16,    3,     -3,    -4,    -10,
+    -12,   -10,   -6,    -7,    -12,   -17,   -14,   -16,   -19,   -13,   -10,
+    -13,   -13,   -2,    2,     3,     7,     13,    22,    21,    21,    21,
+    24,    27,    23,    22,    20,    17,    17,    16,    13,    11,    5,
+    1,     1,     5,     5,     3,     2,     -1,    2,     -5,    -6,    -3,
+    -11,   -9,    -6,    -5,    -10,   -4,    -1,    1,     2,     -1,    -4,
+    -4,    -9,    -9,    -7,    -3,    3,     -2,    1,     1,     4,     -4,
+    -8,    -8,    -17,   -17,   -13,   -13,   -18,   -18,   -25,   -27,   -21,
+    -22,   -18,   -7,    -1,    5,     9,     11,    11,    11,    15,    11,
+    4,     1,     6,     8,     17,    12,    10,    5,     -2,    -3,    -14,
+    -17,   -25,   -26,   -22,   -20,   -13,   -12,   -12,   -13,   -10,   -4,
+    -6,    -6,    -4,    -6,    -4,    0,     -3,    -7,    -7,    -10,   -17,
+    -14,   -9,    -3,    4,     4,     6,     1,     0,     0,     -6,    -3,
+    -4,    -3,    -6,    -9,    -9,    -5,    0,     1,     2,     -2,    3,
+    -1,    -4,    -5,    -11,   -14,   -17,   -14,   -12,   -14,   -19,   -21,
+    -25,   -35,   -40,   -39,   -31,   -24,   -13,   -4,    -1,    0,     0,
+    2,     -2,    -5,    -8,    -8,    -9,    -6,    -2,    0,     -5,    -6,
+    2,     5,     4,     1,     6,     8,     9,     14,    13,    19,    15,
+    19,    13,    14,    20,    16,    16,    14,    14,    17,    13,    12,
+    11,    6,     -1,    -7,    -9,    -10,   -11,   -2,    8,     12,    12,
+    12,    8,     4,     1,     -3,    -4,    -4,    -3,    1,     9,     14,
+    16,    10,    12,    9,     6,     4,     -1,    8,     6,     3,     6,
+    1,     -11,   -10,   -10,   -13,   -9,    -6,    -2,    -2,    9,     13,
+    17,    17,    19,    17,    16,    9,     -2,    -5,    -5,    -3,    -9,
+    -8,    -8,    -12,   -17,   -16,   -18,   -15,   -9,    -7,    1,     10,
+    17,    18,    23,    25,    23,    20,    15,    17,    18,    23,    33,
+    40,    43,    45,    51,    53,    47,    36,    27,    10,    5,     1,
+    4,     5,     4,     0,     0,     6,     7,     8,     9,     3,     2,
+    1,     0,     -1,    3,     5,     5,     13,    7,     4,     4,     3,
+    11,    17,    21,    31,    31,    31,    31,    28,    26,    23,    19,
+    16,    17,    16,    10,    10,    12,    9,     7,     -1,    -7,    -12,
+    -15,   -15,   -15,   -13,   -13,   -16,   -19,   -19,   -23,   -31,   -34,
+    -38,   -39,   -31,   -30,   -21,   -21,   -18,   -11,   -16,   -20,   -25,
+    -22,   -18,   -14,   -7,    -8,    -3,    2,     10,    13,    12,    10,
+    6,     2,     0,     0,     0,     -6,    -4,    -1,    0,     0,     -1,
+    -2,    1,     3,     8,     9,     3,     6,     2,     -4,    -2,    -3,
+    -7,    -4,    -3,    2,     6,     8,     10,    12,    15,    11,    15,
+    12,    13,    14,    15,    18,    14,    8,     4,     4,     3,     -4,
+    -5,    -4,    -2,    -3,    -2,    4,     9,     13,    18,    21,    20,
+    18,    15,    11,    6,     7,     10,    8,     6,     3,     -3,    -7,
+    -14,   -21,   -29,   -33,   -32,   -26,   -17,   -12,   -11,   -9,    -3,
+    -10,   -13,   -18,   -23,   -21,   -26,   -26,   -24,   -28,   -25,   -29,
+    -30,   -30,   -27,   -17,   -7,    2,     10,    13,    16,    16,    17,
+    18,    17,    19,    19,    20,    15,    14,    16,    14,    10,    5,
+    0,     -4,    -18,   -21,   -25,   -20,   -16,   -13,   -8,    -5,    2,
+    6,     11,    12,    18,    16,    18,    15,    13,    17,    18,    22,
+    21,    25,    26,    25,    26,    28,    31,    27,    20,    10,    3,
+    -6,    -10,   -16,   -19,   -18,   -15,   -13,   -10,   -2,    0,     2,
+    4,     3,     5,     -1,    0,     1,     2,     0,     -2,    -1,    -6,
+    -5,    -7,    -12,   -10,   -9,    -4,    -1,    3,     4,     2,     4,
+    4,     3,     -3,    -6,    -11,   -14,   -15,   -23,   -25,   -29,   -30,
+    -28,   -25,   -22,   -19,   -21,   -19,   -11,   -7,    -7,    -3,    -3,
+    -6,    -8,    -13,   -10,   -10,   -5,    1,     4,     9,     7,     6,
+    6,     4,     -5,    -11,   -8,    -6,    -3,    0,     3,     7,     11,
+    7,     3,     5,     6,     10,    12,    14,    16,    8,     5,     -1,
+    -1,    4,     0,     0,     -3,    -5,    -5,    -4,    -2,    -2,    1,
+    4,     7,     5,     10,    9,     6,     9,     12,    19,    28,    32,
+    32,    33,    31,    29,    20,    17,    16,    14,    15,    6,     -2,
+    -5,    -7,    -10,   -10,   -11,   -9,    -6,    -3,    8,     10,    10,
+    10,    12,    12,    7,     7,     5,     3,     2,     2,     -2,    -5,
+    -4,    -7,    -2,    -6,    -5,    -6,    -11,   -14,   -13,   -10,   -11,
+    -15,   -16,   -11,   -11,   -11,   -10,   -16,   -15,   -15,   -16,   -10,
+    -11,   -11,   -5,    -1,    2,     1,     2,     0,     1,     4,     8,
+    5,     -4,    -2,    -4,    -12,   -18,   -24,   -20,   -25,   -14,   -3,
+    4,     11,    13,    13,    7,     4,     -4,    -9,    -13,   -17,   -10,
+    -6,    -1,    0,     2,     2,     -1,    1,     -8,    -18,   -22,   -19,
+    -19,   -22,   -20,   -22,   -20,   -17,   -12,   -9,    -4,    3,     9,
+    9,     9,     7,     6,     13,    10,    11,    8,     4,     -1,    5,
+    7,     7,     8,     4,     2,     2,     -2,    -8,    -11,   -16,   -18,
+    -12,   -12,   -9,    -2,    3,     3,     5,     5,     6,     9,     11,
+    20,    22,    26,    30,    28,    22,    15,    15,    10,    11,    9,
+    6,     9,     9,     11,    10,    12,    10,    8,     8,     7,     9,
+    4,     3,     9,     5,     1,     2,     0,     -3,    -3,    0,     3,
+    0,     -2,    1,     4,     6,     4,     0,     1,     -4,    -13,   -13,
+    -11,   -20,   -21,   -15,   -17,   -23,   -22,   -24,   -29,   -24,   -29,
+    -32,   -21,   -13,   -11,   -9,    -9,    -8,    -13,   -11,   -11,   -11,
+    -11,   -17,   -17,   -21,   -23,   -27,   -32,   -33,   -32,   -31,   -35,
+    -31,   -26,   -24,   -18,   -10,   -1,    5,     13,    17,    15,    13,
+    8,     4,     6,     9,     10,    13,    11,    12,    13,    9,     5,
+    6,     8,     12,    21,    25,    24,    23,    16,    8,     7,     0,
+    -3,    -8,    -9,    -2,    1,     11,    18,    25,    30,    31,    27,
+    21,    19,    19,    18,    18,    22,    24,    16,    14,    8,     2,
+    -4,    -9,    -7,    -10,   -6,    -8,    -8,    -13,   -14,   -11,   -13,
+    -8,    -7,    6,     9,     10,    15,    17,    11,    11,    9,     2,
+    2,     -2,    2,     -6,    -6,    -7,    -14,   -11,   -12,   -13,   -17,
+    -22,   -25,   -30,   -24,   -16,   -4,    5,     2,     7,     5,     2,
+    -1,    1,     -4,    -4,    4,     8,     8,     5,     6,     6,     2,
+    1,     -2,    -9,    -14,   -17,   -16,   -15,   -14,   -12,   -11,   -6,
+    -6,    -2,    -3,    -3,    6,     13,    18,    27,    27,    26,    24,
+    22,    19,    18,    19,    12,    8,     7,     -2,    0,     -6,    -8,
+    -6,    -4,    -6,    -14,   -16,   -16,   -15,   -12,   -2,    6,     12,
+    16,    18,    14,    16,    13,    12,    17,    16,    17,    17,    12,
+    13,    10,    14,    14,    10,    2,     -1,    -3,    -5,    -10,   -15,
+    -13,   -20,   -21,   -21,   -21,   -19,   -20,   -18,   -8,    -4,    -1,
+    -1,    4,     2,     -3,    0,     -5,    -5,    -3,    -1,    0,     6,
+    5,     6,     7,     7,     3,     2,     1,     -5,    -3,    0,     3,
+    5,     7,     4,     10,    15,    15,    11,    6,     8,     9,     14,
+    19,    18,    14,    12,    16,    15,    11,    9,     9,     5,     4,
+    0,     -7,    -12,   -18,   -22,   -29,   -32,   -36,   -37,   -38,   -39,
+    -32,   -24,   -20,   -14,   -10,   -2,    0,     1,     9,     13,    21,
+    26,    31,    35,    40,    38,    32,    33,    25,    14,    11,    7,
+    1,     -1,    -6,    -5,    -11,   -20,   -22,   -19,   -16,   -9,    2,
+    9,     14,    14,    13,    13,    12,    10,    3,     2,     1,     0,
+    6,     5,     -1,    -4,    -13,   -17,   -21,   -25,   -29,   -30,   -23,
+    -14,   -4,    4,     11,    11,    12,    13,    13,    5,     6,     6,
+    7,     5,     5,     9,     -2,    3,     0,     -2,    -3,    -5,    -1,
+    3,     9,     16,    18,    17,    17,    11,    5,     1,     -4,    -13,
+    -12,   -7,    -7,    1,     6,     4,     2,     3,     1,     1,     0,
+    -1,    -5,    -5,    -3,    -5,    -1,    8,     9,     7,     12,    7,
+    6,     4,     3,     -1,    -1,    -4,    -14,   -16,   -18,   -24,   -34,
+    -44,   -37,   -37,   -36,   -28,   -19,   -15,   -6,    -2,    -3,    2,
+    5,     6,     3,     6,     6,     9,     7,     3,     -4,    -15,   -25,
+    -34,   -37,   -41,   -41,   -38,   -33,   -27,   -22,   -14,   -15,   -18,
+    -18,   -15,   -8,    -7,    -2,    2,     0,     4,     12,    13,    10,
+    17,    20,    16,    17,    23,    24,    22,    24,    22,    28,    26,
+    24,    22,    26,    28,    27,    23,    17,    10,    4,     4,     1,
+    -1,    0,     4,     9,     15,    14,    15,    14,    14,    13,    8,
+    0,     -1,    -11,   -13,   -4,    -3,    -5,    -3,    -1,    -6,    -5,
+    -7,    -4,    -2,    2,     7,     15,    20,    14,    13,    8,     2,
+    -6,    -15,   -23,   -25,   -20,   -22,   -20,   -14,   -10,   -4,    -2,
+    1,     -10,   -15,   -12,   -8,    -8,    -7,    -5,    -10,   -12,   -20,
+    -28,   -26,   -24,   -16,   -8,    -5,    3,     8,     9,     12,    12,
+    12,    14,    13,    12,    10,    13,    23,    29,    28,    33,    36,
+    32,    28,    23,    25,    26,    30,    34,    27,    22,    16,    12,
+    3,     -6,    -13,   -13,   -15,   -14,   -9,    -11,   -13,   -13,   -16,
+    -15,   -20,   -22,   -20,   -32,   -30,   -29,   -24,   -18,   -18,   -18,
+    -13,   -15,   -15,   -16,   -17,   -10,   -11,   -12,   -15,   -17,   -17,
+    -19,   -21,   -22,   -26,   -28,   -21,   -18,   -14,   -5,    2,     6,
+    7,     5,     3,     -2,    0,     -4,    -2,    -3,    -6,    -9,    -12,
+    -11,   -11,   -19,   -23,   -20,   -21,   -16,   -19,   -23,   -22,   -24,
+    -21,   -22,   -17,   -15,   -8,    -1,    4,     14,    18,    23,    24,
+    25,    25,    18,    15,    7,     2,     14,    19,    22,    20,    23,
+    22,    20,    19,    20,    17,    16,    21,    22,    21,    18,    9,
+    3,     -6,    -14,   -19,   -30,   -36,   -40,   -32,   -22,   -21,   -16,
+    -7,    -1,    3,     2,     3,     6,     9,     16,    20,    22,    26,
+    27,    29,    32,    30,    23,    19,    20,    21,    18,    22,    24,
+    15,    14,    9,     9,     7,     6,     9,     9,     16,    22,    20,
+    18,    18,    9,     -1,    -10,   -16,   -19,   -22,   -22,   -20,   -16,
+    -11,   -5,    0,     1,     4,     2,     0,     3,     5,     10,    8,
+    12,    10,    11,    9,     8,     7,     -3,    -4,    -10,   -11,   -5,
+    2,     8,     12,    12,    13,    14,    15,    14,    12,    10,    14,
+    13,    8,     0,     -2,    -3,    -9,    -6,    -13,   -21,   -12,   -12,
+    -8,    -9,    -14,   -16,   -19,   -23,   -22,   -23,   -30,   -26,   -17,
+    -14,   -9,    -2,    3,     11,    16,    17,    17,    11,    12,    13,
+    12,    9,     8,     7,     10,    17,    14,    13,    9,     7,     6,
+    5,     10,    10,    6,     10,    9,     1,     -5,    -10,   -12,   -17,
+    -16,   -14,   -13,   -10,   -6,    -2,    0,     -1,    2,     2,     -1,
+    2,     6,     12,    18,    23,    22,    23,    24,    20,    16,    10,
+    6,     9,     16,    15,    15,    16,    14,    8,     4,     0,     -3,
+    -7,    -4,    -5,    -5,    0,     -4,    1,     1,     1,     -4,    -10,
+    -17,   -25,   -25,   -28,   -28,   -27,   -25,   -20,   -20,   -20,   -22,
+    -14,   -11,   -4,    4,     6,     11,    10,    12,    9,     6,     2,
+    -6,    -10,   -12,   -7,    -1,    -6,    0,     1,     2,     5,     1,
+    -1,    1,     -3,    -6,    -4,    -5,    -4,    -6,    -5,    -7,    -10,
+    -10,   -8,    -11,   -9,    -2,    9,     15,    14,    20,    19,    19,
+    16,    16,    11,    3,     2,     2,     5,     4,     5,     3,     -1,
+    -1,    -6,    -11,   -16,   -18,   -18,   -12,   -17,   -18,   -13,   -15,
+    -5,    -4,    -3,    -1,    2,     6,     7,     11,    14,    17,    17,
+    18,    21,    18,    19,    18,    23,    27,    36,    32,    35,    30,
+    24,    25,    18,    10,    3,     -1,    -4,    -11,   -16,   -21,   -33,
+    -37,   -35,   -36,   -35,   -30,   -26,   -26,   -21,   -10,   -7,    -3,
+    -4,    -3,    -3,    -9,    -12,   -16,   -25,   -22,   -11,   -6,    2,
+    5,     7,     4,     -2,    -8,    -16,   -23,   -30,   -28,   -23,   -20,
+    -11,   -11,   -8,    5,     2,     -3,    -1,    -11,   -15,   -10,   -13,
+    -8,    -8,    -12,   -9,    -10,   -15,   -8,    -4,    -3,    7,     6,
+    13,    20,    25,    24,    25,    27,    28,    25,    23,    22,    27,
+    28,    27,    30,    28,    26,    20,    16,    13,    7,     2,     1,
+    6,     3,     -4,    -6,    -13,   -18,   -19,   -21,   -15,   -3,    -1,
+    10,    16,    17,    20,    24,    28,    28,    26,    26,    28,    27,
+    24,    23,    20,    20,    24,    20,    17,    14,    6,     0,     2,
+    1,     0,     -3,    -7,    -12,   -18,   -29,   -28,   -30,   -32,   -23,
+    -27,   -25,   -20,   -17,   -13,   -11,   -14,   -17,   -21,   -22,   -18,
+    -11,   -12,   -6,    -8,    -9,    -5,    -6,    -10,   -18,   -19,   -16,
+    -13,   -9,    -6,    -7,    -13,   -10,   -14,   -22,   -30,   -37,   -35,
+    -37,   -35,   -34,   -36,   -30,   -23,   -17,   -16,   -16,   -11,   -6,
+    -2,    3,     7,     7,     6,     7,     7,     13,    21,    20,    22,
+    23,    22,    24,    17,    5,     -1,    -2,    -8,    -13,   -14,   -17,
+    -24,   -28,   -23,   -22,   -19,   -12,   -14,   -10,   -14,   -21,   -20,
+    -21,   -22,   -13,   -6,    -1,    6,     4,     10,    11,    8,     10,
+    10,    17,    20,    27,    34,    32,    26,    26,    24,    17,    13,
+    6,     9,     12,    15,    17,    12,    11,    9,     3,     -3,    -3,
+    -8,    -9,    -4,    -2,    -2,    2,     1,     -1,    -3,    -7,    -8,
+    -11,   -15,   -8,    -5,    1,     9,     7,     10,    13,    17,    14,
+    12,    8,     6,     3,     6,     9,     8,     5,     0,     -2,    1,
+    1,     -3,    -6,    -12,   -17,   -17,   -23,   -28,   -33,   -31,   -29,
+    -30,   -35,   -28,   -25,   -17,   -5,    0,     6,     10,    14,    27,
+    31,    26,    31,    30,    32,    41,    42,    42,    43,    34,    32,
+    21,    12,    2,     1,     -3,    -1,    8,     13,    20,    19,    18,
+    19,    13,    8,     5,     7,     6,     7,     6,     4,     3,     -2,
+    0,     2,     -4,    -1,    -3,    2,     12,    22,    33,    32,    31,
+    35,    35,    34,    32,    26,    27,    26,    21,    17,    10,    1,
+    -3,    -14,   -21,   -19,   -21,   -19,   -24,   -24,   -19,   -16,   -13,
+    -16,   -13,   -15,   -17,   -12,   -9,    -4,    7,     19,    27,    33,
+    37,    34,    35,    30,    24,    23,    25,    21,    20,    18,    15,
+    12,    13,    8,     2,     -4,    -12,   -18,   -17,   -14,   -10,   -14,
+    -8,    -14,   -14,   -12,   -14,   -19,   -23,   -31,   -32,   -28,   -30,
+    -22,   -20,   -13,   1,     0,     6,     14,    15,    20,    22,    20,
+    16,    9,     2,     1,     3,     6,     7,     9,     10,    14,    17,
+    16,    14,    4,     -7,    -16,   -31,   -40,   -41,   -40,   -38,   -34,
+    -40,   -37,   -33,   -28,   -22,   -17,   -11,   -10,   -12,   -5,    -5,
+    -8,    -4,    0,     -1,    1,     1,     6,     11,    14,    22,    25,
+    28,    31,    32,    32,    31,    31,    20,    13,    12,    5,     4,
+    4,     2,     0,     -3,    -6,    -8,    -4,    -4,    -4,    -1,    7,
+    9,     10,    13,    13,    16,    10,    7,     3,     6,     8,     8,
+    15,    20,    23,    18,    15,    12,    4,     1,     0,     -4,    -4,
+    -1,    8,     11,    13,    21,    24,    19,    12,    2,     -5,    -11,
+    -15,   -17,   -17,   -19,   -23,   -28,   -34,   -33,   -37,   -29,   -27,
+    -24,   -17,   -13,   -8,    -6,    -2,    5,     3,     4,     -2,    -5,
+    -4,    0,     2,     3,     1,     -5,    -5,    -6,    -11,   -11,   -15,
+    -15,   -19,   -17,   -17,   -21,   -23,   -21,   -22,   -24,   -28,   -27,
+    -25,   -15,   -8,    -1,    2,     2,     3,     3,     2,     -2,    0,
+    1,     -1,    2,     5,     7,     2,     0,     2,     -6,    -9,    -8,
+    -6,    -3,    -3,    3,     0,     5,     0,     0,     -5,    -12,   -13,
+    -20,   -14,   -14,   -6,    -5,    -2,    0,     6,     11,    9,     9,
+    11,    10,    13,    19,    26,    29,    36,    37,    40,    35,    27,
+    20,    13,    6,     3,     -1,    -1,    -1,    -3,    -6,    -8,    -14,
+    -16,   -25,   -28,   -23,   -21,   -24,   -22,   -22,   -22,   -24,   -28,
+    -35,   -43,   -42,   -37,   -29,   -20,   -5,    2,     10,    23,    28,
+    30,    31,    30,    39,    43,    40,    41,    43,    43,    38,    29,
+    18,    14,    12,    3,     6,     3,     3,     0,     -1,    -3,    -5,
+    -5,    -8,    -8,    -10,   -6,    -1,    1,     5,     1,     2,     6,
+    0,     -3,    -7,    -13,   -10,   -7,    -8,    -7,    -3,    -5,    -4,
+    -4,    -4,    -5,    -2,    2,     3,     6,     4,     3,     -1,    -2,
+    -5,    -16,   -22,   -31,   -39,   -38,   -42,   -47,   -42,   -42,   -35,
+    -27,   -30,   -28,   -25,   -26,   -24,   -20,   -19,   -19,   -19,   -19,
+    -14,   -16,   -13,   -9,    -10,   -1,    8,     17,    21,    28,    26,
+    28,    24,    14,    8,     2,     0,     -4,    -4,    -13,   -16,   -16,
+    -13,   -12,   -7,    -5,    0,     -4,    -1,    2,     4,     8,     8,
+    10,    10,    10,    14,    16,    17,    23,    20,    27,    27,    27,
+    21,    14,    11,    0,     -4,    -8,    -8,    -1,    -1,    1,     6,
+    8,     23,    22,    23,    23,    25,    26,    26,    22,    21,    20,
+    22,    17,    12,    8,     3,     -2,    -2,    -4,    -5,    -3,    1,
+    7,     6,     8,     9,     12,    6,     1,     -4,    -8,    -6,    -3,
+    -4,    -5,    -3,    -7,    -6,    -6,    -11,   -11,   -19,   -23,   -26,
+    -28,   -34,   -41,   -41,   -44,   -45,   -47,   -40,   -39,   -33,   -29,
+    -21,   -14,   -16,   -6,    -7,    -3,    1,     6,     8,     11,    14,
+    14,    15,    15,    18,    18,    16,    17,    12,    15,    20,    21,
+    19,    21,    23,    22,    21,    16,    12,    8,     7,     7,     10,
+    13,    13,    16,    16,    16,    16,    15,    15,    12,    14,    14,
+    15,    12,    11,    17,    19,    19,    14,    13,    15,    17,    18,
+    20,    24,    27,    24,    19,    11,    10,    1,     0,     0,     -1,
+    3,     8,     16,    18,    17,    22,    22,    21,    19,    7,     0,
+    1,     -1,    -2,    -1,    -6,    -8,    -12,   -14,   -20,   -21,   -24,
+    -19,   -9,    -4,    -3,    2,     2,     3,     0,     -10,   -19,   -23,
+    -29,   -31,   -35,   -29,   -33,   -28,   -25,   -25,   -19,   -22,   -23,
+    -24,   -21,   -17,   -15,   -17,   -13,   -15,   -12,   -15,   -14,   -14,
+    -12,   -9,    -5,    1,     9,     13,    13,    17,    17,    15,    11,
+    12,    8,     13,    20,    24,    30,    29,    33,    30,    26,    23,
+    13,    9,     4,     3,     3,     5,     3,     2,     5,     3,     2,
+    1,     3,     6,     10,    14,    19,    23,    21,    20,    21,    17,
+    11,    5,     -3,    -7,    -12,   -15,   -16,   -13,   -15,   -13,   -7,
+    -4,    -5,    -5,    -1,    5,     11,    8,     7,     -2,    -2,    -5,
+    -6,    -1,    -2,    0,     2,     8,     13,    15,    17,    15,    16,
+    10,    13,    3,     -1,    -4,    -4,    -4,    0,     8,     13,    15,
+    9,     11,    9,     12,    9,     10,    10,    5,     11,    16,    21,
+    20,    15,    13,    5,     3,     -3,    1,     1,     0,     -4,    -7,
+    -9,    -7,    -9,    -10,   -7,    -6,    -3,    -2,    -3,    -3,    -6,
+    -12,   -16,   -22,   -21,   -26,   -28,   -25,   -24,   -23,   -23,   -28,
+    -32,   -29,   -26,   -26,   -23,   -29,   -23,   -16,   -11,   -7,    -9,
+    -10,   -12,   -18,   -20,   -20,   -26,   -23,   -16,   -17,   -10,   -7,
+    0,     3,     -2,    0,     -4,    -7,    -8,    -6,    -3,    -7,    -5,
+    -5,    1,     0,     -3,    -2,    -3,    5,     7,     10,    19,    17,
+    22,    21,    20,    16,    8,     9,     10,    12,    20,    28,    31,
+    28,    28,    26,    21,    14,    8,     5,     4,     5,     8,     9,
+    9,     13,    17,    16,    14,    20,    17,    13,    16,    17,    18,
+    18,    15,    11,    5,     -2,    -8,    -15,   -17,   -17,   -24,   -24,
+    -23,   -18,   -13,   -13,   -9,    -7,    -4,    0,     3,     6,     2,
+    2,     -4,    -5,    -5,    -4,    -4,    -2,    2,     6,     10,    7,
+    4,     2,     -2,    -3,    -8,    -10,   -14,   -27,   -29,   -37,   -36,
+    -29,   -27,   -19,   -7,    -3,    0,     -2,    2,     8,     13,    18,
+    15,    10,    10,    6,     1,     -5,    -12,   -17,   -20,   -23,   -23,
+    -22,   -19,   -17,   -10,   -6,    -3,    2,     0,     4,     11,    14,
+    19,    16,    6,     7,     3,     3,     4,     1,     7,     8,     7,
+    3,     -2,    0,     0,     0,     -1,    -2,    0,     4,     3,     5,
+    9,     9,     12,    7,     5,     0,     0,     1,     0,     2,     -6,
+    -10,   -9,    -13,   -15,   -19,   -15,   -18,   -16,   -17,   -9,    -5,
+    -2,    2,     2,     3,     7,     2,     -3,    -8,    -13,   -8,    1,
+    8,     12,    15,    17,    17,    11,    7,     0,     -4,    -8,    -8,
+    -3,    -1,    -4,    -6,    -6,    -13,   -12,   -12,   -13,   -12,   -8,
+    -9,    -5,    -4,    -2,    0,     -1,    -6,    -7,    -6,    -10,   -10,
+    -8,    -6,    1,     5,     6,     15,    18,    16,    12,    12,    12,
+    10,    13,    7,     0,     -9,    -10,   -11,   -6,    -8,    -8,    -4,
+    0,     6,     10,    11,    15,    15,    15,    12,    10,    6,     6,
+    11,    12,    20,    25,    23,    25,    18,    12,    6,     -1,    -4,
+    -10,   -12,   -9,    -13,   -16,   -15,   -18,   -18,   -22,   -22,   -17,
+    -14,   -12,   -8,    -3,    1,     4,     11,    13,    7,     0,     -8,
+    -11,   -11,   -13,   -14,   -12,   -11,   -9,    -6,    -5,    -2,    1,
+    5,     6,     10,    18,    17,    15,    13,    11,    12,    13,    10,
+    9,     13,    16,    16,    13,    11,    6,     5,     0,     -5,    -4,
+    -3,    2,     6,     5,     6,     11,    14,    20,    23,    28,    27,
+    22,    24,    23,    22,    16,    17,    12,    7,     -1,    -9,    -10,
+    -9,    -9,    -13,   -11,   -9,    -2,    -2,    -7,    -8,    -6,    -7,
+    -12,   -12,   -10,   0,     5,     11,    13,    11,    10,    7,     3,
+    0,     0,     3,     10,    14,    16,    18,    19,    21,    14,    15,
+    12,    7,     6,     7,     9,     7,     11,    6,     4,     4,     -1,
+    -9,    -12,   -12,   -14,   -9,    -9,    -6,    -5,    -4,    -6,    -7,
+    -12,   -15,   -17,   -27,   -23,   -20,   -19,   -19,   -18,   -24,   -20,
+    -25,   -28,   -33,   -31,   -29,   -27,   -15,   -12,   -7,    -3,    1,
+    -3,    -3,    -5,    -8,    -6,    0,     13,    17,    24,    25,    23,
+    24,    18,    8,     -3,    -4,    -4,    -7,    -3,    1,     4,     7,
+    9,     10,    14,    14,    20,    28,    35,    38,    42,    43,    43,
+    39,    30,    27,    19,    15,    8,     10,    12,    19,    25,    26,
+    27,    23,    22,    15,    10,    6,     8,     4,     6,     6,     3,
+    7,     7,     15,    11,    7,     6,     5,     9,     6,     0,     -3,
+    -14,   -21,   -21,   -30,   -39,   -42,   -40,   -37,   -37,   -36,   -32,
+    -30,   -24,   -21,   -22,   -23,   -24,   -28,   -31,   -31,   -29,   -27,
+    -30,   -31,   -31,   -31,   -34,   -33,   -34,   -26,   -21,   -15,   -10,
+    -5,    -3,    -2,    -3,    -6,    -5,    -11,   -14,   -10,   -5,    0,
+    9,     10,    18,    21,    19,    21,    11,    7,     4,     6,     6,
+    7,     3,     -6,    -9,    -16,   -23,   -24,   -23,   -26,   -18,   -16,
+    -11,   -8,    0,     6,     5,     6,     10,    8,     8,     16,    24,
+    24,    23,    24,    24,    24,    18,    9,     4,     -3,    -11,   -16,
+    -15,   -18,   -14,   -12,   -9,    -3,    -4,    -1,    8,     11,    10,
+    19,    21,    21,    23,    20,    22,    15,    9,     7,     5,     3,
+    1,     12,    13,    10,    18,    23,    31,    37,    40,    36,    38,
+    40,    40,    38,    27,    24,    21,    14,    12,    12,    7,     7,
+    15,    18,    19,    18,    17,    18,    14,    12,    11,    7,     5,
+    7,     9,     9,     15,    14,    15,    18,    16,    7,     0,     -5,
+    -6,    -6,    -6,    -1,    7,     9,     12,    6,     4,     4,     2,
+    -1,    2,     3,     3,     5,     4,     -1,    -13,   -19,   -29,   -34,
+    -39,   -43,   -49,   -54,   -53,   -55,   -55,   -56,   -59,   -58,   -49,
+    -41,   -32,   -19,   -10,   -2,    -4,    -1,    -6,    -19,   -27,   -26,
+    -27,   -27,   -21,   -22,   -20,   -26,   -26,   -20,   -20,   -20,   -21,
+    -17,   -18,   -7,    -6,    -6,    -5,    -1,    7,     18,    10,    16,
+    25,    24,    31,    30,    32,    30,    26,    24,    22,    23,    21,
+    23,    21,    24,    19,    17,    13,    12,    15,    6,     2,     -5,
+    -9,    -13,   -10,   -5,    1,     10,    13,    17,    13,    8,     5,
+    5,     6,     5,     13,    19,    16,    14,    12,    7,     15,    18,
+    19,    16,    4,     -1,    0,     -1,    -2,    -9,    -15,   -19,   -21,
+    -13,   -13,   -10,   -7,    -7,    -7,    -6,    -11,   -22,   -18,   -19,
+    -22,   -22,   -19,   -18,   -10,   -7,    -9,    -7,    -12,   -16,   -20,
+    -27,   -35,   -37,   -37,   -33,   -24,   -14,   -4,    8,     14,    19,
+    19,    16,    12,    6,     2,     -5,    -6,    -11,   -17,   -16,   -14,
+    -13,   -12,   -17,   -21,   -22,   -24,   -18,   -14,   -12,   -1,    4,
+    9,     17,    14,    9,     13,    14,    13,    14,    14,    12,    11,
+    15,    11,    16,    21,    20,    20,    22,    31,    30,    26,    15,
+    13,    6,     8,     5,     1,     -5,    -3,    2,     9,     14,    13,
+    16,    17,    18,    13,    10,    8,     7,     9,     12,    21,    23,
+    23,    21,    19,    16,    14,    5,     -4,    -12,   -15,   -16,   -12,
+    -9,    -12,   -14,   -17,   -16,   -15,   -14,   -15,   -28,   -27,   -24,
+    -12,   -8,    -3,    3,     9,     15,    18,    25,    25,    31,    32,
+    35,    36,    33,    36,    24,    13,    2,     -11,   -19,   -18,   -18,
+    -10,   -6,    -4,    0,     -3,    -3,    -15,   -18,   -17,   -9,    -7,
+    2,     5,     7,     6,     2,     -2,    -12,   -16,   -16,   -9,    -3,
+    6,     8,     15,    17,    16,    18,    11,    5,     -4,    -8,    -17,
+    -16,   -22,   -24,   -25,   -28,   -23,   -19,   -11,   -3,    5,     11,
+    22,    26,    29,    24,    14,    12,    7,     6,     -2,    -1,    2,
+    10,    23,    33,    36,    32,    31,    16,    3,     -4,    -3,    -3,
+    1,     8,     11,    13,    12,    8,     3,     5,     3,     1,     -1,
+    4,     2,     3,     8,     5,     5,     1,     -2,    -1,    -3,    -1,
+    5,     8,     10,    17,    17,    15,    19,    27,    18,    21,    23,
+    19,    20,    15,    1,     -7,    -18,   -24,   -24,   -33,   -28,   -32,
+    -30,   -30,   -30,   -30,   -29,   -30,   -41,   -43,   -50,   -51,   -49,
+    -42,   -32,   -19,   -10,   0,     4,     -2,    5,     9,     8,     12,
+    19,    17,    10,    9,     3,     1,     -4,    -8,    -4,    0,     5,
+    7,     10,    9,     12,    0,     -6,    -7,    -13,   -16,   -10,   -10,
+    -9,    -1,    -1,    -2,    -6,    -11,   -14,   -17,   -18,   -10,   -3,
+    -3,    0,     6,     1,     6,     4,     3,     3,     9,     16,    22,
+    28,    27,    32,    18,    21,    25,    20,    21,    18,    18,    22,
+    23,    15,    8,     -3,    -9,    -10,   -13,   -8,    3,     7,     18,
+    26,    23,    26,    30,    17,    11,    9,     -1,    0,     2,     2,
+    12,    15,    6,     1,     0,     -5,    2,     1,     -3,    -1,    -6,
+    -2,    -4,    -11,   -18,   -30,   -38,   -36,   -33,   -32,   -27,   -19,
+    -18,   -14,   -13,   -16,   -11,   -12,   -12,   -4,    0,     7,     13,
+    13,    10,    11,    6,     3,     3,     3,     4,     10,    4,     -1,
+    -3,    -11,   -21,   -27,   -34,   -33,   -31,   -33,   -28,   -22,   -21,
+    -14,   -8,    -13,   -10,   -8,    -12,   -7,    -11,   -3,    3,     5,
+    7,     7,     -1,    -12,   -13,   -17,   -21,   -8,    -2,    4,     7,
+    13,    18,    18,    16,    15,    13,    11,    15,    13,    12,    17,
+    18,    15,    15,    11,    -3,    -1,    2,     11,    15,    10,    18,
+    13,    10,    12,    9,     2,     2,     4,     -1,    6,     9,     11,
+    5,     7,     13,    8,     9,     10,    11,    9,     7,     11,    5,
+    3,     1,     -9,    -19,   -31,   -40,   -42,   -33,   -27,   -24,   -22,
+    -20,   -25,   -20,   -12,   -17,   -23,   -23,   -25,   -25,   -20,   -18,
+    -17,   -19,   -15,   -22,   -20,   -19,   -13,   -8,    -12,   0,     2,
+    -6,    -1,    -5,    -15,   -10,   -12,   -19,   -8,    -6,    -3,    9,
+    5,     12,    22,    10,    9,     12,    5,     8,     28,    13,    20,
+    25,    11,    16,    19,    10,    15,    14,    6,     23,    19,    18,
+    32,    17,    12,    19,    -1,    -8,    11,    -4,    -8,    9,     -4,
+    -6,    0,     -10,   -7,    -3,    -8,    -11,   -11,   -23,   -7,    -4,
+    -4,    14,    6,     4,     9,     3,     -4,    4,     2,     9,     26,
+    19,    26,    33,    22,    22,    24,    13,    20,    18,    18,    28,
+    28,    19,    24,    16,    -1,    1,     -12,   -34,   -28,   -25,   -27,
+    -13,   6,     8,     21,    25,    22,    19,    3,     4,     0,     -5,
+    6,     8,     1,     6,     8,     -4,    -3,    -10,   -23,   -17,   -9,
+    -10,   3,     6,     -1,    3,     -10,   -22,   -28,   -49,   -49,   -36,
+    -29,   -10,   8,     -1,    4,     14,    -3,    -14,   -5,    -16,   -10,
+    8,     7,     21,    24,    17,    25,    15,    -4,    13,    -7,    -23,
+    0,     -7,    -14,   12,    1,     -18,   -10,   -27,   -43,   -31,   -34,
+    -19,   -3,    -10,   15,    20,    -7,    10,    9,     -20,   7,     28,
+    14,    42,    54,    32,    34,    24,    5,     10,    -11,   -13,   11,
+    -6,    -4,    31,    7,     0,     34,    3,     -9,    5,     -24,   -33,
+    -14,   -11,   -1,    8,     0,     10,    7,     -7,    11,    10,    -6,
+    17,    16,    0,     10,    3,     -26,   -23,   -33,   -39,   -26,   -29,
+    -18,   -6,    -9,    -1,    5,     -11,   -6,    7,     -6,    1,     13,
+    8,     1,     3,     -13,   -23,   -25,   -33,   -28,   -21,   -9,    2,
+    4,     1,     8,     4,     -13,   -5,    -12,   -14,   3,     14,    18,
+    26,    30,    21,    20,    15,    15,    10,    5,     13,    11,    20,
+    25,    29,    18,    19,    9,     -10,   -15,   -13,   -12,   1,     16,
+    20,    30,    39,    37,    21,    15,    3,     -7,    -9,    -1,    2,
+    -6,    -7,    -10,   -20,   -19,   -19,   -31,   -25,   -12,   -15,   -13,
+    -17,   -18,   -14,   -24,   -24,   -18,   -28,   -24,   -3,    1,     17,
+    46,    48,    43,    46,    34,    12,    6,     -14,   -19,   -10,   -14,
+    3,     15,    3,     7,     7,     -13,   4,     9,     -2,    3,     22,
+    19,    25,    41,    48,    46,    36,    42,    40,    24,    33,    50,
+    29,    30,    57,    35,    13,    29,    17,    -9,    5,     15,    7,
+    13,    38,    47,    40,    56,    72,    42,    29,    40,    18,    14,
+    36,    52,    50,    58,    55,    42,    22,    20,    13,    -8,    8,
+    32,    26,    41,    70,    48,    51,    65,    36,    27,    23,    4,
+    5,     1,     -3,    2,     -8,    -23,   -6,    -30,   -46,   -24,   -40,
+    -45,   -22,   -32,   -35,   -24,   -50,   -41,   -35,   -56,   -38,   -29,
+    -55,   -25,   -7,    -40,   -26,   -25,   -63,   -51,   -40,   -61,   -47,
+    -38,   -38,   -5,    2,     3,     26,    -1,    -7,    8,     -20,   -17,
+    10,    -14,   -6,    41,    24,    27,    52,    26,    13,    25,    5,
+    -6,    2,     -7,    -2,    10,    4,     29,    36,    30,    74,    93,
+    91,    131,   150,   132,   167,   177,   158,   189,   188,   178,   200,
+    199,   187,   212,   202,   188,   210,   188,   173,   187,   175,   183,
+    215,   218,   236,   264,   253,   279,   296,   275,   290,   288,   261,
+    261,   261,   230,   216,   199,   157,   160,   147,   115,   108,   84,
+    50,    32,    7,     -30,   -56,   -96,   -130,  -146,  -179,  -199,  -223,
+    -255,  -280,  -293,  -326,  -341,  -352,  -391,  -410,  -429,  -464,  -489,
+    -507,  -538,  -559,  -577,  -602,  -634,  -656,  -679,  -696,  -702,  -700,
+    -699,  -700,  -687,  -666,  -665,  -656,  -634,  -626,  -609,  -572,  -539,
+    -518,  -484,  -462,  -444,  -418,  -390,  -364,  -336,  -295,  -245,  -210,
+    -175,  -127,  -97,   -63,   -28,   10,    45,    83,    121,   167,   222,
+    272,   324,   369,   396,   439,   485,   502,   536,   571,   585,   618,
+    656,   676,   705,   729,   744,   767,   776,   786,   798,   796,   813,
+    849,   855,   865,   883,   862,   843,   834,   794,   781,   778,   767,
+    746,   744,   721,   702,   681,   638,   607,   562,   521,   490,   447,
+    398,   361,   313,   255,   204,   123,   20,    -59,   -143,  -217,  -270,
+    -328,  -400,  -462,  -529,  -607,  -666,  -737,  -797,  -854,  -906,  -936,
+    -944,  -955,  -965,  -976,  -993,  -1003, -1007, -1032, -1040, -1045, -1055,
+    -1039, -1016, -1003, -990,  -995,  -1026, -1046, -1070, -1079, -1058, -1060,
+    -1062, -1028, -1010, -1006, -991,  -1000, -1004, -987,  -981,  -958,  -921,
+    -890,  -852,  -798,  -754,  -713,  -681,  -682,  -658,  -617,  -585,  -524,
+    -452,  -404,  -332,  -258,  -224,  -183,  -144,  -132,  -94,   -64,   -31,
+    37,    99,    147,   219,   280,   329,   389,   439,   483,   563,   632,
+    702,   799,   884,   965,   1050,  1107,  1150,  1209,  1260,  1308,  1383,
+    1446,  1514,  1582,  1632,  1679,  1727,  1770,  1804,  1837,  1872,  1916,
+    1961,  1999,  2038,  2071,  2089,  2097,  2107,  2091,  2084,  2072,  2051,
+    2021,  1998,  1940,  1868,  1814,  1734,  1641,  1559,  1480,  1395,  1305,
+    1213,  1115,  1015,  901,   785,   667,   520,   381,   256,   110,   -26,
+    -141,  -284,  -417,  -528,  -670,  -805,  -935,  -1080, -1206, -1324, -1438,
+    -1527, -1622, -1725, -1798, -1879, -1956, -2006, -2063, -2128, -2166, -2201,
+    -2238, -2257, -2292, -2316, -2337, -2357, -2356, -2362, -2382, -2375, -2368,
+    -2367, -2358, -2337, -2329, -2318, -2296, -2273, -2240, -2195, -2140, -2095,
+    -2044, -1990, -1932, -1872, -1803, -1737, -1673, -1602, -1520, -1428, -1325,
+    -1219, -1112, -1006, -896,  -780,  -681,  -591,  -481,  -388,  -294,  -189,
+    -85,   30,    148,   252,   348,   466,   579,   692,   811,   918,   1041,
+    1162,  1271,  1389,  1507,  1611,  1735,  1864,  1965,  2085,  2203,  2312,
+    2436,  2536,  2614,  2697,  2760,  2812,  2886,  2956,  3010,  3066,  3088,
+    3098,  3120,  3110,  3101,  3106,  3108,  3130,  3149,  3139,  3122,  3085,
+    3016,  2951,  2874,  2770,  2671,  2559,  2435,  2315,  2198,  2059,  1915,
+    1761,  1570,  1387,  1185,  984,   787,   601,   413,   224,   40,    -158,
+    -348,  -560,  -760,  -960,  -1147, -1312, -1471, -1621, -1779, -1925, -2069,
+    -2206, -2333, -2463, -2570, -2664, -2743, -2811, -2860, -2886, -2934, -2976,
+    -3015, -3057, -3074, -3076, -3079, -3060, -3032, -2998, -2950, -2920, -2893,
+    -2863, -2837, -2806, -2761, -2715, -2662, -2607, -2554, -2486, -2402, -2325,
+    -2264, -2190, -2127, -2063, -1989, -1932, -1862, -1788, -1724, -1640, -1545,
+    -1455, -1346, -1234, -1112, -984,  -859,  -735,  -610,  -494,  -384,  -280,
+    -176,  -68,   40,    140,   244,   363,   478,   596,   739,   876,   1001,
+    1128,  1240,  1352,  1474,  1595,  1717,  1853,  1972,  2093,  2215,  2328,
+    2432,  2533,  2641,  2744,  2855,  2949,  3055,  3157,  3242,  3329,  3415,
+    3479,  3528,  3569,  3588,  3617,  3649,  3676,  3708,  3747,  3751,  3753,
+    3744,  3693,  3640,  3576,  3470,  3369,  3248,  3098,  2976,  2838,  2690,
+    2557,  2395,  2222,  2055,  1872,  1675,  1488,  1279,  1057,  851,   623,
+    393,   180,   -74,   -315,  -537,  -771,  -979,  -1161, -1373, -1558, -1729,
+    -1932, -2110, -2294, -2478, -2636, -2785, -2917, -3007, -3094, -3183, -3247,
+    -3319, -3402, -3450, -3510, -3564, -3595, -3622, -3635, -3627, -3635, -3639,
+    -3620, -3620, -3610, -3596, -3581, -3535, -3495, -3455, -3410, -3361, -3323,
+    -3265, -3202, -3141, -3078, -3001, -2919, -2830, -2739, -2640, -2540, -2430,
+    -2320, -2192, -2057, -1909, -1761, -1603, -1422, -1244, -1059, -887,  -726,
+    -570,  -425,  -256,  -92,   69,    238,   411,   557,   728,   910,   1066,
+    1229,  1403,  1561,  1727,  1895,  2050,  2208,  2352,  2492,  2638,  2765,
+    2893,  3025,  3145,  3263,  3387,  3496,  3595,  3707,  3804,  3884,  3975,
+    4046,  4105,  4167,  4204,  4220,  4237,  4243,  4247,  4260,  4255,  4251,
+    4246,  4201,  4143,  4092,  3996,  3885,  3772,  3604,  3435,  3283,  3086,
+    2923,  2742,  2535,  2341,  2130,  1887,  1649,  1411,  1137,  915,   659,
+    398,   163,   -81,   -351,  -580,  -814,  -1069, -1262, -1476, -1689, -1850,
+    -2043, -2237, -2395, -2591, -2763, -2918, -3095, -3224, -3319, -3435, -3508,
+    -3582, -3698, -3772, -3858, -3950, -4008, -4047, -4088, -4093, -4085, -4098,
+    -4064, -4052, -4057, -4033, -4028, -4018, -3991, -3971, -3933, -3865, -3802,
+    -3727, -3633, -3562, -3477, -3392, -3300, -3210, -3115, -3018, -2924, -2819,
+    -2721, -2606, -2490, -2381, -2246, -2111, -1963, -1810, -1638, -1460, -1293,
+    -1132, -980,  -828,  -666,  -496,  -322,  -125,  72,    264,   470,   676,
+    879,   1087,  1280,  1457,  1633,  1799,  1970,  2152,  2327,  2501,  2678,
+    2840,  3007,  3165,  3301,  3434,  3558,  3667,  3791,  3912,  4023,  4140,
+    4257,  4359,  4475,  4554,  4614,  4656,  4682,  4697,  4726,  4749,  4775,
+    4810,  4812,  4812,  4810,  4768,  4697,  4620,  4502,  4368,  4210,  4031,
+    3860,  3663,  3472,  3291,  3076,  2849,  2642,  2392,  2140,  1890,  1610,
+    1325,  1064,  782,   494,   231,   -50,   -329,  -593,  -861,  -1112, -1345,
+    -1588, -1812, -2022, -2257, -2467, -2682, -2924, -3126, -3317, -3495, -3630,
+    -3737, -3855, -3941, -4031, -4128, -4200, -4281, -4348, -4388, -4427, -4449,
+    -4444, -4450, -4458, -4452, -4464, -4460, -4451, -4444, -4425, -4384, -4344,
+    -4289, -4234, -4160, -4076, -4000, -3917, -3837, -3753, -3669, -3558, -3460,
+    -3354, -3230, -3111, -2966, -2824, -2665, -2495, -2333, -2151, -1951, -1752,
+    -1554, -1367, -1222, -1053, -882,  -716,  -520,  -331,  -141,  62,    270,
+    476,   707,   923,   1133,  1349,  1534,  1735,  1943,  2124,  2317,  2511,
+    2668,  2839,  3002,  3140,  3317,  3481,  3615,  3771,  3920,  4050,  4196,
+    4319,  4430,  4556,  4657,  4765,  4868,  4945,  4999,  5057,  5075,  5100,
+    5123,  5133,  5134,  5127,  5104,  5084,  5058,  4968,  4896,  4750,  4575,
+    4381,  4179,  3971,  3776,  3590,  3394,  3209,  2991,  2800,  2535,  2269,
+    1972,  1654,  1319,  998,   697,   384,   105,   -187,  -476,  -759,  -1047,
+    -1316, -1579, -1841, -2085, -2317, -2550, -2745, -2938, -3145, -3326, -3523,
+    -3706, -3859, -3998, -4124, -4218, -4288, -4346, -4386, -4437, -4495, -4550,
+    -4619, -4680, -4732, -4779, -4813, -4820, -4842, -4825, -4791, -4773, -4742,
+    -4715, -4709, -4683, -4652, -4605, -4527, -4428, -4315, -4194, -4086, -3978,
+    -3872, -3779, -3685, -3569, -3458, -3313, -3121, -2921, -2693, -2454, -2230,
+    -1998, -1783, -1588, -1414, -1240, -1069, -886,  -690,  -473,  -256,  -36,
+    170,   384,   594,   797,   1015,  1235,  1449,  1664,  1882,  2098,  2311,
+    2504,  2681,  2843,  3019,  3171,  3337,  3534,  3709,  3885,  4072,  4235,
+    4380,  4524,  4641,  4746,  4864,  4979,  5087,  5213,  5308,  5393,  5450,
+    5468,  5475,  5472,  5452,  5462,  5467,  5453,  5451,  5425,  5342,  5255,
+    5113,  4914,  4725,  4512,  4273,  4053,  3866,  3632,  3436,  3205,  2955,
+    2705,  2420,  2095,  1794,  1503,  1195,  941,   639,   342,   56,    -269,
+    -601,  -894,  -1208, -1499, -1736, -1994, -2239, -2426, -2652, -2891, -3099,
+    -3361, -3588, -3793, -4013, -4183, -4302, -4439, -4523, -4613, -4734, -4809,
+    -4891, -4999, -5056, -5090, -5131, -5092, -5061, -5044, -4987, -4954, -4955,
+    -4924, -4911, -4873, -4809, -4755, -4673, -4555, -4440, -4316, -4187, -4088,
+    -3986, -3881, -3802, -3717, -3605, -3495, -3359, -3207, -3063, -2889, -2698,
+    -2504, -2306, -2088, -1861, -1627, -1415, -1201, -1000, -799,  -593,  -410,
+    -220,  -7,    203,   412,   634,   865,   1126,  1367,  1602,  1838,  2052,
+    2257,  2474,  2659,  2863,  3076,  3255,  3429,  3617,  3773,  3939,  4102,
+    4222,  4358,  4501,  4611,  4733,  4846,  4939,  5056,  5147,  5217,  5301,
+    5357,  5388,  5428,  5417,  5400,  5430,  5422,  5406,  5442,  5446,  5431,
+    5437,  5381,  5304,  5212,  5057,  4874,  4683,  4465,  4249,  4026,  3767,
+    3545,  3304,  3021,  2741,  2450,  2113,  1807,  1490,  1151,  841,   544,
+    212,   -102,  -439,  -788,  -1091, -1413, -1730, -2033, -2336, -2627, -2854,
+    -3118, -3350, -3560, -3781, -4008, -4194, -4376, -4524, -4640, -4757, -4865,
+    -4945, -5016, -5083, -5131, -5170, -5184, -5198, -5208, -5211, -5210, -5209,
+    -5192, -5174, -5154, -5108, -5052, -5002, -4932, -4854, -4780, -4704, -4604,
+    -4514, -4421, -4309, -4208, -4111, -4004, -3880, -3751, -3622, -3496, -3367,
+    -3210, -3047, -2867, -2654, -2430, -2177, -1897, -1651, -1417, -1182, -983,
+    -793,  -593,  -406,  -211,  17,    232,   461,   716,   958,   1197,  1441,
+    1674,  1899,  2130,  2355,  2573,  2788,  3004,  3220,  3419,  3612,  3809,
+    3973,  4120,  4277,  4433,  4573,  4742,  4902,  5037,  5165,  5282,  5377,
+    5460,  5539,  5596,  5654,  5716,  5741,  5759,  5770,  5776,  5762,  5751,
+    5737,  5706,  5675,  5644,  5550,  5446,  5324,  5169,  4974,  4767,  4530,
+    4289,  4067,  3823,  3621,  3391,  3145,  2878,  2575,  2228,  1890,  1525,
+    1149,  807,   473,   145,   -152,  -454,  -769,  -1057, -1374, -1703, -2033,
+    -2372, -2701, -2977, -3258, -3495, -3694, -3897, -4089, -4270, -4483, -4668,
+    -4840, -5015, -5140, -5225, -5304, -5334, -5350, -5390, -5398, -5403, -5428,
+    -5438, -5449, -5472, -5463, -5441, -5401, -5333, -5252, -5151, -5051, -4974,
+    -4880, -4805, -4729, -4626, -4526, -4403, -4248, -4088, -3939, -3778, -3617,
+    -3464, -3308, -3173, -3027, -2852, -2669, -2461, -2233, -1979, -1713, -1455,
+    -1216, -996,  -796,  -610,  -397,  -198,  21,    272,   517,   775,   1037,
+    1295,  1544,  1790,  2007,  2211,  2423,  2634,  2848,  3081,  3319,  3551,
+    3792,  4000,  4171,  4303,  4418,  4518,  4596,  4679,  4807,  4913,  5044,
+    5172,  5288,  5405,  5518,  5609,  5664,  5713,  5735,  5735,  5737,  5701,
+    5691,  5656,  5633,  5611,  5552,  5475,  5394,  5293,  5177,  5064,  4924,
+    4737,  4599,  4420,  4237,  4048,  3828,  3623,  3413,  3183,  2915,  2622,
+    2308,  1980,  1657,  1261,  901,   549,   205,   -85,   -383,  -688,  -969,
+    -1246, -1530, -1850, -2206, -2561, -2915, -3224, -3482, -3713, -3921, -4107,
+    -4287, -4470, -4660, -4850, -5057, -5239, -5395, -5540, -5619, -5697, -5724,
+    -5697, -5675, -5633, -5590, -5579, -5530, -5486, -5442, -5426, -5391, -5348,
+    -5276, -5197, -5124, -5039, -4925, -4808, -4677, -4581, -4479, -4343, -4218,
+    -4087, -3970, -3858, -3729, -3570, -3384, -3206, -3020, -2839, -2636, -2453,
+    -2287, -2185, -2154, -1926, -1562, -1223, -758,  -473,  -64,   395,   599,
+    880,   814,   938,   1172,  1498,  1928,  2127,  2422,  2608,  2841,  2937,
+    2886,  2815,  2985,  3324,  3757,  4152,  4481,  4652,  4917,  4965,  4766,
+    4583,  4328,  4503,  4815,  5118,  5408,  5682,  5956,  6082,  6055,  5744,
+    5426,  5341,  5427,  5606,  5882,  6065,  6226,  6428,  6477,  6385,  6009,
+    5728,  5552,  5439,  5339,  5200,  5008,  4947,  4835,  4614,  4330,  3887,
+    3521,  3111,  2460,  1983,  1297,  650,   279,   -353,  -720,  -1044, -1518,
+    -1668, -2117, -2496, -2743, -3266, -3607, -3790, -4149, -4075, -4042, -4096,
+    -3981, -4138, -4226, -4214, -4503, -4455, -4577, -4642, -4346, -4351, -4270,
+    -4263, -4522, -4521, -4673, -4814, -4731, -4950, -5011, -5004, -5288, -5341,
+    -5566, -5833, -5783, -5929, -5847, -5765, -5828, -5644, -5613, -5615, -5428,
+    -5291, -5014, -4554, -4277, -3964, -3854, -3829, -3612, -3603, -3438, -3137,
+    -2831, -2164, -1438, -939,  -330,  -156,  46,    242,   73,    242,   220,
+    239,   542,   565,   739,   872,   801,   857,   676,   543,   586,   567,
+    828,   1142,  1490,  1985,  2508,  2982,  3438,  3699,  3939,  4069,  4178,
+    4420,  4622,  4917,  5338,  5801,  6285,  6658,  6963,  7213,  7233,  7328,
+    7176,  7038,  7031,  6860,  6957,  6767,  6599,  6523,  6212,  6147,  6063,
+    5860,  6020,  6015,  6033,  6184,  5722,  5607,  5016,  4337,  4063,  3229,
+    3080,  3006,  2804,  3035,  2541,  2136,  1879,  1012,  401,   -575,  -1584,
+    -1930, -2278, -2485, -2477, -2712, -2747, -2766, -3320, -3592, -4188, -4669,
+    -4672, -4939, -4789, -4426, -4203, -3674, -3563, -3656, -3759, -4067, -4257,
+    -4522, -4970, -5204, -5237, -5139, -4907, -4911, -4917, -4921, -5007, -5230,
+    -5654, -6122, -6464, -6733, -6948, -7067, -6972, -6800, -6520, -6132, -5830,
+    -5382, -5091, -4797, -4546, -4472, -4362, -4350, -4235, -3851, -3454, -3144,
+    -2735, -2341, -1845, -1262, -958,  -549,  -166,  66,    382,   366,   352,
+    341,   85,    -13,   -176,  -303,  -235,  -341,  -309,  -227,  -249,  -50,
+    143,   384,   874,   1149,  1552,  2155,  2767,  3499,  3994,  4460,  4920,
+    5288,  5569,  5704,  5881,  6094,  6461,  6653,  6803,  7115,  7311,  7521,
+    7612,  7443,  7380,  7124,  6742,  6495,  5964,  5656,  5415,  5167,  5656,
+    5813,  6027,  6401,  6351,  6787,  7019,  6581,  6512,  5965,  5308,  5140,
+    4336,  4147,  3899,  3398,  3360,  2830,  2624,  1968,  1026,  395,   -699,
+    -1424, -2327, -3006, -3192, -3435, -3337, -3686, -3513, -3350, -3502, -3261,
+    -3878, -4005, -4063, -4187, -3767, -3598, -3384, -3300, -3094, -2857, -3023,
+    -3274, -3851, -4352, -4523, -4943, -5477, -5612, -5682, -5733, -5714, -5965,
+    -6110, -5950, -6158, -6548, -6897, -7165, -7281, -7352, -7258, -7185, -6659,
+    -5946, -5470, -4738, -4046, -3707, -3210, -3108, -3270, -3227, -3222, -3218,
+    -3017, -2943, -2668, -2296, -1593, -1061, -811,  -403,  -513,  -361,  -128,
+    -595,  -633,  -991,  -1205, -1159, -1284, -1330, -1164, -999,  -729,  -538,
+    -336,  27,    350,   794,   1245,  1646,  2446,  3210,  4017,  4835,  5271,
+    5739,  6028,  6140,  6212,  6161,  6066,  5984,  6081,  5995,  6152,  6301,
+    6278,  6424,  6377,  6396,  6362,  6152,  5788,  5309,  5071,  4860,  4704,
+    4804,  4919,  5258,  5869,  6121,  6365,  6694,  6692,  6694,  6532,  6187,
+    5808,  5704,  5302,  4816,  4611,  4043,  3775,  3249,  2600,  1933,  982,
+    336,   -848,  -1538, -2242, -3103, -3374, -3756, -3975, -4017, -4061, -3972,
+    -3749, -3609, -3853, -3850, -3714, -3760, -3736, -3914, -3923, -3830, -3541,
+    -3649, -3757, -3661, -3913, -4038, -4231, -4594, -4769, -5009, -5273, -5588,
+    -5676, -5937, -5997, -6060, -6164, -6414, -6623, -6765, -6857, -6771, -6921,
+    -6914, -6535, -6187, -5626, -5206, -4742, -4189, -3618, -3120, -2823, -2606,
+    -2550, -2703, -2736, -2626, -2498, -2406, -2133, -1852, -1348, -753,  -318,
+    162,   330,   524,   375,   9,     -204,  -866,  -1249, -1532, -1669, -1455,
+    -1235, -723,  -283,  262,   535,   862,   1340,  1712,  2316,  2625,  3171,
+    4015,  4698,  5516,  6006,  6452,  6838,  6921,  7003,  6735,  6339,  6138,
+    5768,  5575,  5593,  5568,  5728,  6041,  6233,  6260,  6175,  6048,  5728,
+    5366,  4931,  4340,  4194,  4174,  4330,  4743,  5028,  5754,  6250,  6598,
+    7120,  7114,  6962,  6675,  6157,  5373,  4797,  4081,  3237,  3153,  2588,
+    2143,  1639,  1021,  681,   -149,  -816,  -1987, -3003, -3493, -4138, -4420,
+    -4607, -4841, -4725, -4254, -4033, -3845, -3842, -4063, -4035, -4099, -4582,
+    -4718, -4779, -4689, -4437, -4327, -4352, -4119, -3881, -4061, -4345, -4768,
+    -5248, -5610, -5920, -6383, -6779, -6731, -6673, -6677, -6597, -6659, -6619,
+    -6417, -6516, -6862, -7017, -7069, -6944, -6715, -6376, -6000, -5162, -4333,
+    -3577, -2884, -2355, -1807, -1366, -1380, -1590, -1869, -1962, -1945, -2006,
+    -2141, -1960, -1516, -1025, -471,  -135,  85,    348,   239,   -8,    -475,
+    -951,  -1245, -1520, -1569, -1448, -1188, -517,  134,   827,   1585,  2114,
+    2792,  3214,  3651,  4230,  4546,  4894,  5321,  5588,  6105,  6583,  6877,
+    7014,  7087,  7068,  6876,  6695,  6280,  5684,  5385,  5205,  5064,  5033,
+    5028,  5080,  5322,  5510,  5461,  5390,  5541,  5494,  5443,  5306,  5065,
+    5193,  5338,  5513,  5818,  5911,  6345,  6506,  6514,  6543,  5981,  5703,
+    5082,  4228,  3517,  2424,  1880,  1245,  562,   -130,  -864,  -1156, -1561,
+    -1970, -2597, -3357, -3707, -4189, -4521, -4975, -5477, -5478, -5585, -5445,
+    -5353, -5327, -4971, -4580, -4431, -4469, -4432, -4422, -4275, -4227, -4507,
+    -4745, -4758, -4752, -4845, -4933, -5118, -5117, -5124, -5324, -5673, -5971,
+    -6152, -6366, -6702, -6970, -7159, -7136, -6929, -6917, -6703, -6520, -6302,
+    -5794, -5484, -5123, -4694, -4254, -3722, -3334, -2917, -2410, -1721, -1010,
+    -584,  -312,  27,    321,   327,   214,   -17,   -363,  -402,  -550,  -638,
+    -469,  -315,  -86,   142,   242,   387,   448,   458,   423,   321,   194,
+    285,   417,   717,   1176,  1673,  2402,  3144,  3985,  4764,  5406,  6056,
+    6507,  6783,  6891,  6868,  6850,  6717,  6532,  6359,  6248,  6303,  6279,
+    6140,  6071,  5927,  5687,  5480,  5146,  4835,  4572,  4447,  4481,  4578,
+    4840,  4936,  5246,  5659,  5732,  5856,  5658,  5403,  5282,  5004,  4949,
+    4843,  4681,  4884,  4886,  4967,  5108,  4781,  4647,  4240,  3443,  2768,
+    1830,  983,   309,   -769,  -1382, -1987, -2553, -2750, -3346, -3555, -4052,
+    -4400, -4599, -5196, -5437, -5945, -6340, -6343, -6554, -6611, -6381, -6184,
+    -5681, -5398, -5098, -4751, -4529, -4138, -4100, -4088, -4044, -4186, -4189,
+    -4263, -4453, -4465, -4598, -4651, -4726, -4919, -4926, -5142, -5286, -5490,
+    -5831, -6002, -6341, -6492, -6562, -6710, -6553, -6506, -6219, -5766, -5521,
+    -5008, -4556, -4002, -3293, -2769, -2069, -1467, -824,  -34,   509,   1034,
+    1385,  1560,  1650,  1664,  1419,  1016,  834,   511,   353,   381,   299,
+    523,   833,   956,   1280,  1492,  1425,  1547,  1350,  1143,  1114,  931,
+    1054,  1217,  1583,  2217,  2917,  4017,  4965,  5827,  6816,  7393,  7875,
+    8197,  8175,  7924,  7578,  7040,  6566,  6242,  5746,  5530,  5334,  5222,
+    5237,  5074,  5146,  5011,  4902,  4753,  4442,  4482,  4254,  4247,  4319,
+    4187,  4516,  4690,  4935,  5193,  5229,  5350,  5332,  5486,  5386,  5143,
+    4999,  4494,  4304,  3961,  3421,  2781,  2032,  1404,  614,   -88,   -956,
+    -1714, -2155, -2684, -3038, -3237, -3368, -3423, -3569, -3809, -4213, -4533,
+    -4973, -5514, -6011, -6663, -7084, -7258, -7158, -6947, -6639, -6111, -5548,
+    -4887, -4362, -4043, -3895, -3940, -4107, -4452, -4836, -5143, -5500, -5532,
+    -5510, -5485, -5096, -4739, -4375, -4065, -4063, -4094, -4252, -4576, -4904,
+    -5431, -5837, -6190, -6402, -6310, -6292, -5992, -5516, -5025, -4342, -3899,
+    -3386, -2697, -2077, -1493, -994,  -392,  232,   931,   1608,  1988,  2360,
+    2589,  2639,  2623,  2471,  2121,  1708,  1478,  1181,  1167,  1296,  1279,
+    1648,  1859,  2107,  2368,  2359,  2390,  2122,  1904,  1629,  1418,  1502,
+    1524,  1859,  2357,  3041,  3909,  4810,  5751,  6449,  7128,  7534,  7767,
+    7908,  7699,  7460,  7032,  6647,  6301,  5876,  5556,  5190,  4948,  4762,
+    4576,  4464,  4370,  4338,  4275,  4287,  4265,  4320,  4221,  4066,  3947,
+    3514,  3379,  3003,  2635,  2534,  2078,  2040,  1950,  1958,  2152,  2085,
+    2390,  2321,  2319,  2359,  1851,  1643,  877,   168,   -527,  -1245, -1704,
+    -2519, -2739, -3251, -3382, -3236, -3527, -3294, -3523, -3732, -3916, -4434,
+    -4888, -5615, -6161, -6729, -7283, -7543, -7920, -7865, -7660, -7430, -7034,
+    -6758, -6224, -5866, -5441, -5076, -4998, -4760, -4673, -4539, -4410, -4308,
+    -4131, -3992, -3791, -3611, -3448, -3213, -3070, -3046, -3048, -3168, -3244,
+    -3354, -3607, -3834, -4170, -4439, -4648, -4864, -4892, -4928, -4821, -4524,
+    -4211, -3576, -2819, -1968, -929,  -19,   1029,  2064,  2949,  3716,  4159,
+    4450,  4536,  4503,  4301,  3968,  3655,  3242,  2979,  2856,  2744,  2750,
+    2771,  2749,  2859,  2850,  2793,  2702,  2402,  2179,  1877,  1672,  1581,
+    1543,  1769,  1967,  2485,  3089,  3783,  4662,  5406,  6246,  6950,  7542,
+    8016,  8200,  8245,  8027,  7584,  6958,  6241,  5494,  4710,  3974,  3255,
+    2653,  2274,  2038,  1986,  1964,  2141,  2321,  2513,  2772,  2756,  2743,
+    2636,  2406,  2125,  1836,  1456,  1247,  1145,  995,   1077,  1140,  1290,
+    1561,  1685,  1762,  1609,  1391,  1147,  544,   84,    -754,  -1546, -2107,
+    -2806, -3137, -3522, -3732, -3826, -3834, -3609, -3493, -3340, -3254, -3499,
+    -3621, -3981, -4455, -4859, -5513, -6080, -6626, -7061, -7372, -7556, -7573,
+    -7515, -7366, -7091, -6799, -6366, -5887, -5484, -5098, -4746, -4334, -3941,
+    -3558, -3269, -3053, -2844, -2663, -2497, -2314, -2227, -2185, -2141, -2139,
+    -2070, -2037, -2031, -2062, -2205, -2348, -2544, -2774, -2979, -3298, -3520,
+    -3647, -3622, -3395, -3054, -2513, -1829, -948,  64,    1090,  2169,  3127,
+    3987,  4712,  5229,  5560,  5754,  5741,  5619,  5401,  5005,  4666,  4287,
+    3967,  3734,  3476,  3322,  3203,  3147,  3144,  3116,  3080,  3011,  2871,
+    2735,  2544,  2363,  2245,  2075,  2032,  2118,  2263,  2688,  3066,  3605,
+    4244,  4746,  5384,  5819,  6151,  6319,  6194,  5938,  5495,  4929,  4305,
+    3581,  2924,  2279,  1713,  1372,  1086,  1006,  983,   1006,  1146,  1249,
+    1349,  1360,  1231,  1084,  794,   502,   264,   -85,   -238,  -411,  -504,
+    -394,  -322,  -51,   188,   420,   589,   624,   666,   573,   338,   -86,
+    -564,  -1056, -1560, -1925, -2434, -2806, -3017, -3341, -3320, -3375, -3480,
+    -3410, -3567, -3553, -3595, -3805, -3919, -4284, -4482, -4754, -5190, -5354,
+    -5806, -6050, -6136, -6387, -6343, -6330, -6206, -5851, -5468, -4960, -4549,
+    -4080, -3542, -3150, -2698, -2440, -2318, -2132, -2067, -2081, -2017, -2099,
+    -2151, -2060, -2067, -1916, -1823, -1718, -1523, -1386, -1221, -1189, -1141,
+    -1014, -1008, -966,  -996,  -1015, -916,  -809,  -648,  -467,  -128,  237,
+    735,   1358,  1969,  2697,  3399,  4060,  4732,  5295,  5720,  6077,  6169,
+    6139,  5928,  5614,  5292,  4766,  4247,  3705,  3262,  3030,  2827,  2702,
+    2684,  2728,  2887,  3092,  3216,  3310,  3313,  3214,  3098,  2873,  2620,
+    2343,  2031,  1799,  1589,  1491,  1537,  1645,  1913,  2210,  2548,  2922,
+    3295,  3650,  3951,  4100,  4099,  3972,  3740,  3421,  2948,  2427,  1762,
+    1136,  574,   44,    -330,  -642,  -846,  -852,  -751,  -520,  -229,  44,
+    272,   446,   502,   443,   329,   66,    -191,  -492,  -841,  -1002, -1240,
+    -1237, -1199, -1177, -936,  -867,  -660,  -456,  -508,  -464,  -706,  -997,
+    -1265, -1780, -2178, -2724, -3270, -3735, -4142, -4378, -4609, -4666, -4749,
+    -4575, -4355, -4137, -3767, -3563, -3218, -2970, -2834, -2630, -2716, -2776,
+    -2920, -3210, -3363, -3764, -4023, -4125, -4268, -4194, -4223, -4005, -3639,
+    -3258, -2891, -2644, -2297, -1987, -1751, -1587, -1570, -1485, -1415, -1342,
+    -1194, -1100, -889,  -613,  -267,  161,   482,   865,   1269,  1639,  2005,
+    2202,  2381,  2549,  2628,  2700,  2625,  2559,  2481,  2357,  2319,  2192,
+    2142,  2199,  2283,  2514,  2670,  2919,  3214,  3510,  3830,  3971,  4080,
+    4073,  3911,  3700,  3359,  2954,  2549,  2094,  1766,  1556,  1442,  1462,
+    1560,  1808,  2070,  2357,  2606,  2730,  2831,  2737,  2582,  2309,  1931,
+    1585,  1178,  834,   529,   288,   214,   218,   302,   470,   679,   944,
+    1211,  1420,  1562,  1674,  1631,  1548,  1355,  1072,  776,   375,   25,
+    -320,  -614,  -818,  -992,  -991,  -906,  -755,  -525,  -291,  -17,   225,
+    447,   528,   546,   466,   270,   96,    -205,  -536,  -861,  -1148, -1383,
+    -1586, -1688, -1814, -1783, -1772, -1745, -1630, -1611, -1505, -1488, -1462,
+    -1409, -1519, -1489, -1609, -1723, -1755, -1977, -2042, -2132, -2215, -2184,
+    -2268, -2205, -2170, -2107, -1978, -1990, -1909, -1886, -1943, -1997, -2152,
+    -2326, -2500, -2762, -2987, -3227, -3392, -3522, -3630, -3579, -3469, -3262,
+    -2916, -2555, -2103, -1581, -1090, -531,  -20,   457,   873,   1228,  1561,
+    1809,  1999,  2105,  2139,  2196,  2201,  2149,  2113,  2038,  1990,  1913,
+    1787,  1705,  1595,  1490,  1372,  1201,  1113,  998,   917,   917,   894,
+    961,   1007,  1098,  1321,  1470,  1681,  1882,  2067,  2317,  2465,  2626,
+    2750,  2777,  2783,  2694,  2569,  2431,  2142,  1843,  1597,  1306,  1069,
+    824,   622,   532,   430,   388,   357,   377,   438,   414,   481,   468,
+    431,   454,   383,   374,   305,   207,   187,   133,   157,   115,   113,
+    206,   244,   382,   475,   591,   753,   821,   916,   908,   855,   754,
+    577,   399,   123,   -159,  -399,  -647,  -784,  -923,  -1010, -965,  -918,
+    -806,  -647,  -504,  -355,  -253,  -179,  -130,  -138,  -156,  -262,  -339,
+    -401,  -552,  -600,  -671,  -697,  -662,  -673,  -616,  -597,  -522,  -495,
+    -513,  -490,  -624,  -701,  -804,  -961,  -1073, -1328, -1503, -1656, -1798,
+    -1801, -1913, -1863, -1785, -1720, -1453, -1309, -1051, -846,  -715,  -487,
+    -457,  -357,  -331,  -400,  -427,  -627,  -765,  -873,  -1021, -1105, -1255,
+    -1312, -1357, -1370, -1288, -1261, -1165, -1139, -1062, -917,  -808,  -680,
+    -597,  -452,  -277,  -104,  122,   312,   558,   771,   919,   1110,  1205,
+    1312,  1355,  1302,  1280,  1151,  1049,  946,   818,   733,   569,   451,
+    429,   388,   408,   387,   376,   426,   463,   542,   576,   632,   666,
+    673,   740,   766,   791,   845,   829,   857,   841,   822,   835,   796,
+    773,   671,   600,   560,   484,   460,   371,   311,   284,   242,   277,
+    261,   261,   277,   273,   358,   380,   410,   433,   435,   471,   432,
+    414,   386,   330,   294,   194,   149,   108,   69,    84,    69,    92,
+    83,    75,    88,    53,    12,    -96,   -194,  -269,  -369,  -438,  -523,
+    -553,  -528,  -500,  -392,  -277,  -136,  53,    240,   466,   678,   870,
+    1050,  1178,  1294,  1336,  1310,  1247,  1080,  916,   677,   387,   120,
+    -182,  -471,  -740,  -972,  -1148, -1273, -1343, -1402, -1363, -1263, -1129,
+    -922,  -724,  -518,  -288,  -79,   111,   250,   364,   405,   405,   395,
+    284,   199,   83,    -43,   -126,  -244,  -313,  -400,  -451,  -497,  -610,
+    -672,  -807,  -951,  -1087, -1325, -1517, -1736, -1929, -2086, -2260, -2318,
+    -2356, -2271, -2125, -1967, -1685, -1379, -1000, -598,  -238,  149,   481,
+    790,   1042,  1185,  1287,  1274,  1195,  1068,  868,   654,   386,   138,
+    -65,   -273,  -450,  -598,  -665,  -670,  -669,  -620,  -553,  -425,  -288,
+    -179,  -72,   15,    122,   205,   263,   324,   357,   435,   518,   603,
+    709,   779,   892,   1006,  1107,  1170,  1183,  1190,  1173,  1116,  1016,
+    890,   750,   628,   488,   331,   197,   95,    43,    25,    1,     22,
+    97,    209,   363,   495,   615,   724,   833,   937,   984,   990,   933,
+    884,   851,   747,   678,   573,   497,   469,   401,   391,   352,   339,
+    352,   337,   354,   361,   370,   402,   411,   418,   440,   468,   526,
+    576,   619,   683,   766,   857,   965,   1038,  1114,  1159,  1172,  1167,
+    1106,  1006,  840,   644,   426,   177,   -110,  -390,  -665,  -929,  -1160,
+    -1375, -1497, -1550, -1592, -1553, -1507, -1394, -1201, -1084, -863,  -685,
+    -540,  -322,  -234,  -68,   29,    59,    160,   141,   170,   140,   79,
+    77,    -11,   -53,   -179,  -274,  -327,  -480,  -564,  -736,  -884,  -995,
+    -1185, -1300, -1461, -1617, -1711, -1832, -1831, -1863, -1865, -1776, -1691,
+    -1516, -1353, -1168, -954,  -729,  -490,  -305,  -93,   81,    211,   322,
+    364,   392,   384,   332,   264,   146,   29,    -101,  -230,  -357,  -486,
+    -616,  -705,  -752,  -801,  -809,  -788,  -750,  -654,  -546,  -456,  -328,
+    -200,  -78,   45,    137,   232,   316,   388,   447,   485,   528,   578,
+    630,   697,   760,   835,   910,   988,   1068,  1124,  1154,  1157,  1166,
+    1163,  1116,  1070,  1024,  994,   986,   988,   1030,  1110,  1212,  1303,
+    1411,  1498,  1551,  1599,  1587,  1565,  1481,  1336,  1212,  1028,  847,
+    669,   466,   330,   187,   61,    -9,    -54,   -55,   -20,   11,    69,
+    133,   195,   244,   253,   225,   182,   133,   62,    -11,   -96,   -168,
+    -199,  -214,  -213,  -197,  -167,  -127,  -105,  -86,   -83,   -109,  -140,
+    -217,  -323,  -448,  -588,  -717,  -854,  -971,  -1086, -1185, -1211, -1227,
+    -1180, -1135, -1099, -992,  -918,  -788,  -704,  -651,  -562,  -542,  -470,
+    -421,  -431,  -391,  -429,  -386,  -344,  -336,  -260,  -257,  -162,  -61,
+    -6,    100,   120,   178,   215,   179,   132,   15,    -106,  -238,  -416,
+    -595,  -765,  -929,  -1066, -1170, -1252, -1278, -1290, -1258, -1173, -1114,
+    -1012, -945,  -868,  -741,  -695,  -612,  -547,  -494,  -388,  -332,  -225,
+    -110,  22,    182,   318,   496,   677,   835,   992,   1104,  1162,  1166,
+    1133,  1054,  916,   709,   430,   164,   -90,   -340,  -600,  -853,  -1033,
+    -1135, -1177, -1146, -1079, -946,  -746,  -500,  -208,  83,    377,   673,
+    950,   1183,  1356,  1503,  1627,  1707,  1735,  1708,  1678,  1668,  1645,
+    1588,  1494,  1419,  1354,  1291,  1194,  1052,  900,   718,   524,   325,
+    110,   -114,  -330,  -500,  -630,  -729,  -803,  -834,  -795,  -727,  -627,
+    -492,  -325,  -125,  54,    238,   393,   528,   642,   691,   706,   661,
+    585,   504,   380,   245,   87,    -61,   -195,  -320,  -435,  -556,  -663,
+    -742,  -814,  -883,  -952,  -1009, -1038, -1047, -1067, -1063, -1050, -1020,
+    -949,  -888,  -795,  -698,  -574,  -405,  -257,  -70,   68,    203,   381,
+    479,   580,   619,   623,   645,   565,   492,   364,   206,   106,   -71,
+    -191,  -331,  -460,  -469,  -527,  -471,  -441,  -386,  -222,  -123,  60,
+    168,   245,   404,   470,   596,   605,   581,   633,   548,   562,   468,
+    355,   334,   192,   161,   62,    -36,   -39,   -146,  -121,  -167,  -243,
+    -229,  -302,  -276,  -327,  -415,  -419,  -444,  -396,  -433,  -455,  -407,
+    -357,  -244,  -221,  -158,  -63,   36,    172,   210,   296,   326,   351,
+    424,   367,   369,   300,   224,   235,   124,   54,    -39,   -122,  -118,
+    -239,  -304,  -360,  -403,  -361,  -418,  -427,  -394,  -342,  -259,  -232,
+    -176,  -110,  -48,   27,    48,    78,    90,    86,    91,    76,    57,
+    -1,    -34,   -53,   -103,  -151,  -209,  -239,  -261,  -319,  -354,  -372,
+    -382,  -385,  -411,  -432,  -428,  -431,  -446,  -471,  -496,  -512,  -532,
+    -562,  -570,  -567,  -543,  -499,  -457,  -379,  -290,  -204,  -94,   -11,
+    78,    155,   196,   234,   222,   198,   160,   113,   64,    5,     -57,
+    -108,  -136,  -175,  -186,  -196,  -184,  -125,  -90,   -25,   58,    146,
+    271,   372,   472,   562,   636,   709,   741,   760,   752,   730,   710,
+    688,   655,   608,   595,   570,   556,   540,   517,   513,   511,   497,
+    481,   449,   417,   401,   347,   325,   295,   248,   261,   238,   250,
+    294,   295,   367,   380,   416,   454,   430,   479,   443,   431,   430,
+    386,   397,   333,   292,   238,   176,   153,   54,    24,    -37,   -84,
+    -109,  -172,  -155,  -199,  -220,  -219,  -261,  -227,  -255,  -280,  -266,
+    -293,  -277,  -273,  -243,  -214,  -221,  -179,  -153,  -130,  -109,  -154,
+    -149,  -151,  -155,  -186,  -243,  -253,  -311,  -326,  -358,  -434,  -427,
+    -491,  -533,  -554,  -598,  -596,  -655,  -668,  -679,  -714,  -671,  -694,
+    -643,  -607,  -602,  -532,  -496,  -409,  -408,  -377,  -309,  -289,  -211,
+    -223,  -196,  -145,  -147,  -104,  -157,  -123,  -125,  -177,  -152,  -229,
+    -192,  -204,  -243,  -213,  -259,  -194,  -190,  -172,  -98,   -123,  -43,
+    -12,   41,    103,   87,    148,   150,   166,   154,   113,   118,   80,
+    54,    8,     4,     25,    12,    59,    70,    162,   260,   305,   387,
+    427,   501,   549,   564,   571,   517,   488,   423,   355,   294,   206,
+    165,   113,   92,    77,    62,    115,   116,   154,   162,   171,   218,
+    210,   221,   208,   192,   215,   176,   169,   114,   89,    89,    52,
+    62,    29,    35,    73,    98,    167,   195,   261,   325,   349,   401,
+    382,   393,   368,   302,   254,   174,   104,   6,     -78,   -136,  -203,
+    -229,  -291,  -303,  -284,  -294,  -241,  -235,  -222,  -186,  -187,  -156,
+    -160,  -149,  -122,  -114,  -71,   -44,   -28,   6,     20,    47,    57,
+    54,    52,    55,    53,    23,    9,     -16,   -59,   -86,   -158,  -223,
+    -292,  -372,  -421,  -498,  -532,  -561,  -570,  -531,  -512,  -456,  -367,
+    -297,  -206,  -125,  -37,   26,    88,    147,   157,   188,   169,   152,
+    152,   131,   99,    62,    44,    46,    53,    61,    61,    79,    110,
+    159,   175,   185,   237,   220,   278,   276,   239,   264,   203,   190,
+    138,   70,    34,    -9,    18,    1,     10,    71,    115,   191,   220,
+    255,   265,   296,   319,   270,   266,   214,   189,   187,   155,   145,
+    123,   149,   166,   172,   186,   179,   195,   213,   201,   182,   161,
+    150,   116,   76,    41,    -29,   -58,   -101,  -183,  -209,  -269,  -314,
+    -342,  -385,  -379,  -380,  -348,  -304,  -273,  -197,  -144,  -88,   -28,
+    -5,    11,    20,    27,    -5,    -24,   -22,   -61,   -73,   -87,   -124,
+    -118,  -133,  -150,  -160,  -198,  -196,  -219,  -228,  -239,  -281,  -276,
+    -275,  -288,  -277,  -305,  -324,  -302,  -294,  -292,  -266,  -261,  -224,
+    -203,  -210,  -190,  -198,  -176,  -180,  -201,  -196,  -198,  -175,  -166,
+    -151,  -127,  -114,  -59,   -48,   -8,    39,    75,    126,   131,   168,
+    160,   152,   142,   82,    36,    -13,   -49,   -81,   -105,  -105,  -103,
+    -65,   -38,   -16,   19,    33,    67,    82,    95,    110,   98,    111,
+    98,    87,    67,    54,    66,    52,    49,    53,    71,    106,   139,
+    186,   224,   270,   320,   361,   413,   433,   462,   473,   478,   480,
+    459,   441,   391,   339,   298,   239,   206,   159,   149,   120,   114,
+    117,   95,    106,   81,    67,    61,    30,    11,    -29,   -42,   -76,
+    -97,   -98,   -124,  -107,  -107,  -103,  -69,   -71,   -36,   -12,   23,
+    69,    86,    129,   152,   158,   162,   152,   127,   81,    48,    -9,
+    -80,   -120,  -172,  -201,  -225,  -276,  -297,  -311,  -330,  -339,  -361,
+    -375,  -389,  -376,  -365,  -374,  -378,  -375,  -370,  -358,  -347,  -355,
+    -338,  -314,  -289,  -244,  -212,  -168,  -129,  -80,   -26,   -12,   47,
+    79,    92,    105,   105,   113,   99,    85,    29,    -18,   -53,   -110,
+    -133,  -167,  -186,  -196,  -199,  -176,  -177,  -150,  -122,  -106,  -73,
+    -61,   -30,   -34,   -29,   -40,   -68,   -63,   -85,   -84,   -71,   -65,
+    -40,   -16,   23,    56,    87,    144,   167,   196,   206,   221,   243,
+    226,   233,   210,   192,   190,   150,   140,   110,   91,    77,    43,
+    27,    -10,   -5,    -5,    -22,   -9,    -7,    27,    48,    59,    64,
+    70,    87,    104,   139,   151,   188,   239,   270,   317,   311,   336,
+    349,   341,   330,   274,   254,   223,   195,   163,   102,   81,    43,
+    20,    8,     -37,   -28,   -31,   -29,   -21,   -39,   -16,   -22,   -11,
+    -21,   -41,   -32,   -47,   -39,   -60,   -75,   -71,   -94,   -98,   -131,
+    -147,  -139,  -145,  -146,  -165,  -150,  -136,  -112,  -90,   -106,  -86,
+    -91,   -87,   -98,   -136,  -121,  -135,  -124,  -132,  -144,  -114,  -108,
+    -87,   -74,   -75,   -50,   -30,   -5,    -18,   -24,   -3,    -3,    -6,
+    -41,   -76,   -98,   -127,  -159,  -215,  -257,  -263,  -268,  -266,  -262,
+    -237,  -194,  -144,  -113,  -99,   -61,   -28,   12,    21,    46,    76,
+    92,    130,   115,   123,   132,   135,   149,   134,   133,   132,   135,
+    138,   94,    76,    51,    19,    -15,   -72,   -98,   -125,  -135,  -154,
+    -174,  -171,  -164,  -139,  -130,  -99,   -74,   -40,   9,     34,    86,
+    129,   176,   214,   226,   245,   250,   280,   271,   256,   250,   226,
+    234,   212,   187,   178,   148,   144,   104,   79,    64,    37,    36,
+    9,     -10,   -23,   -38,   -35,   -62,   -67,   -67,   -82,   -70,   -80,
+    -75,   -59,   -34,   -3,    9,     48,    76,    101,   120,   120,   123,
+    126,   131,   112,   92,    77,    61,    54,    32,    3,     -18,   -28,
+    -39,   -56,   -71,   -91,   -92,   -100,  -124,  -134,  -142,  -144,  -155,
+    -177,  -178,  -175,  -171,  -168,  -160,  -141,  -123,  -89,   -73,   -64,
+    -46,   -39,   -18,   -19,   -34,   -32,   -46,   -51,   -63,   -74,   -73,
+    -81,   -70,   -83,   -71,   -49,   -39,   -12,   -1,    30,    48,    65,
+    94,    100,   125,   136,   148,   156,   138,   140,   124,   115,   86,
+    58,    57,    32,    43,    40,    44,    63,    60,    83,    90,    99,
+    115,   113,   135,   140,   148,   164,   172,   187,   182,   190,   183,
+    171,   171,   146,   139,   121,   105,   94,    61,    46,    17,    -6,
+    -34,   -70,   -89,   -121,  -138,  -158,  -178,  -190,  -206,  -206,  -210,
+    -214,  -204,  -196,  -173,  -154,  -128,  -97,   -81,   -58,   -51,   -46,
+    -38,   -47,   -49,   -57,   -58,   -57,   -59,   -49,   -58,   -58,   -54,
+    -60,   -48,   -65,   -72,   -72,   -78,   -70,   -77,   -73,   -76,   -79,
+    -76,   -90,   -90,   -91,   -88,   -76,   -67,   -43,   -16,   6,     27,
+    39,    55,    69,    71,    74,    65,    56,    60,    47,    37,    27,
+    8,     -5,    -29,   -50,   -71,   -89,   -96,   -114,  -111,  -113,  -115,
+    -105,  -112,  -90,   -78,   -68,   -49,   -46,   -26,   -14,   5,     18,
+    10,    14,    3,     5,     -9,    -20,   -15,   -30,   -26,   -33,   -31,
+    -23,   -23,   -12,   -21,   -20,   -16,   -23,   -20,   -13,   -7,    6,
+    28,    47,    69,    96,    115,   134,   147,   154,   166,   174,   186,
+    196,   202,   204,   198,   193,   181,   164,   144,   125,   113,   102,
+    96,    90,    92,    91,    96,    99,    99,    100,   99,    99,    93,
+    94,    86,    68,    55,    44,    36,    22,    13,    15,    13,    15,
+    21,    16,    11,    3,     -15,   -31,   -50,   -75,   -105,  -125,  -145,
+    -154,  -155,  -164,  -178,  -189,  -186,  -177,  -174,  -169,  -152,  -134,
+    -114,  -93,   -65,   -42,   -23,   -4,    -1,    6,     6,     2,     -4,
+    -18,   -26,   -25,   -25,   -23,   -32,   -31,   -33,   -39,   -50,   -68,
+    -69,   -74,   -79,   -78,   -83,   -85,   -85,   -77,   -71,   -61,   -42,
+    -27,   -3,    28,    59,    95,    123,   146,   155,   160,   162,   144,
+    130,   112,   94,    82,    67,    60,    46,    35,    35,    22,    4,
+    -14,   -27,   -35,   -45,   -52,   -61,   -62,   -65,   -68,   -55,   -52,
+    -43,   -38,   -34,   -20,   -8,    8,     18,    24,    34,    36,    37,
+    42,    46,    51,    50,    58,    76,    75,    70,    67,    58,    53,
+    48,    36,    23,    18,    10,    3,     9,     14,    24,    39,    43,
+    53,    62,    63,    66,    62,    66,    64,    59,    51,    25,    19,
+    6,     -10,   -19,   -26,   -35,   -43,   -44,   -37,   -47,   -43,   -50,
+    -54,   -60,   -69,   -75,   -84,   -91,   -93,   -98,   -96,   -99,   -91,
+    -87,   -91,   -88,   -84,   -80,   -75,   -61,   -48,   -44,   -40,   -37,
+    -34,   -45,   -52,   -58,   -72,   -82,   -84,   -78,   -68,   -65,   -63,
+    -51,   -42,   -27,   -22,   -13,   -3,    8,     20,    26,    31,    31,
+    37,    33,    29,    33,    31,    32,    31,    34,    44,    55,    68,
+    74,    69,    75,    73,    72,    65,    63,    67,    70,    83,    81,
+    81,    85,    84,    80,    75,    69,    53,    44,    36,    27,    20,
+    11,    1,     -4,    -19,   -26,   -27,   -25,   -21,   -14,   -12,   -12,
+    -14,   -9,    -21,   -29,   -40,   -50,   -50,   -54,   -46,   -35,   -17,
+    -4,    -1,    7,     20,    28,    26,    22,    23,    21,    23,    18,
+    13,    12,    7,     6,     3,     2,     -1,    -1,    4,     6,     17,
+    29,    35,    34,    34,    32,    28,    33,    26,    22,    16,    16,
+    22,    20,    13,    -1,    -1,    -7,    -15,   -20,   -30,   -32,   -38,
+    -39,   -45,   -45,   -53,   -63,   -70,   -83,   -96,   -107,  -113,  -122,
+    -122,  -118,  -114,  -114,  -113,  -112,  -111,  -110,  -107,  -103,  -102,
+    -94,   -80,   -71,   -58,   -52,   -47,   -40,   -43,   -47,   -48,   -50,
+    -39,   -46,   -44,   -44,   -44,   -43,   -45,   -41,   -40,   -34,   -32,
+    -23,   -12,   -6,    -1,    -1,    6,     12,    18,    20,    22,    32,
+    48,    65,    80,    93,    109,   122,   128,   131,   135,   135,   129,
+    126,   130,   127,   124,   125,   121,   122,   115,   118,   122,   128,
+    137,   143,   143,   141,   142,   134,   131,   121,   109,   105,   97,
+    93,    99,    96,    96,    94,    83,    84,    80,    77,    66,    59,
+    46,    42,    44,    32,    28,    20,    12,    8,     4,     4,     5,
+    3,     -4,    -7,    -6,    -14,   -19,   -24,   -34,   -40,   -45,   -52,
+    -61,   -62,   -60,   -57,   -57,   -61,   -63,   -61,   -65,   -73,   -81,
+    -89,   -94,   -93,   -89,   -87,   -82,   -82,   -84,   -81,   -86,   -82,
+    -84,   -86,   -90,   -86,   -83,   -82,   -81,   -80,   -80,   -76,   -75,
+    -76,   -70,   -69,   -68,   -61,   -53,   -50,   -43,   -38,   -42,   -43,
+    -41,   -41,   -39,   -34,   -27,   -21,   -16,   -20,   -22,   -27,   -36,
+    -39,   -38,   -40,   -37,   -35,   -28,   -14,   -6,    -3,    -2,    2,
+    4,     5,     15,    18,    25,    35,    36,    41,    45,    48,    52,
+    54,    52,    50,    60,    67,    76,    85,    85,    90,    86,    83,
+    84,    77,    77,    72,    77,    81,    89,    91,    93,    99,    101,
+    102,   98,    94,    87,    77,    70,    69,    63,    62,    55,    59,
+    58,    54,    51,    53,    57,    62,    65,    60,    54,    48,    45,
+    40,    29,    17,    8,     -3,    -14,   -17,   -18,   -20,   -25,   -34,
+    -40,   -44,   -53,   -56,   -63,   -71,   -71,   -69,   -66,   -62,   -66,
+    -67,   -68,   -71,   -75,   -79,   -79,   -73,   -67,   -60,   -49,   -46,
+    -45,   -45,   -46,   -55,   -64,   -67,   -72,   -74,   -70,   -68,   -67,
+    -69,   -70,   -64,   -56,   -55,   -54,   -51,   -41,   -30,   -26,   -28,
+    -29,   -30,   -28,   -25,   -27,   -20,   -12,   -5,    -2,    2,     3,
+    -3,    0,     -7,    -8,    -14,   -15,   -9,    -7,    4,     12,    24,
+    36,    41,    52,    58,    59,    51,    45,    48,    44,    46,    43,
+    40,    42,    47,    53,    52,    52,    63,    69,    74,    75,    80,
+    78,    69,    68,    59,    60,    54,    54,    54,    58,    66,    71,
+    78,    78,    75,    78,    72,    71,    61,    55,    53,    42,    36,
+    31,    28,    29,    23,    19,    25,    27,    27,    23,    29,    29,
+    20,    11,    5,     -4,    -10,   -31,   -38,   -39,   -36,   -33,   -27,
+    -17,   -15,   -14,   -17,   -13,   -14,   -25,   -33,   -44,   -51,   -61,
+    -63,   -63,   -65,   -67,   -66,   -63,   -59,   -52,   -48,   -45,   -44,
+    -50,   -62,   -74,   -84,   -89,   -100,  -101,  -102,  -96,   -95,   -85,
+    -76,   -78,   -72,   -71,   -66,   -61,   -63,   -60,   -62,   -72,   -69,
+    -69,   -58,   -56,   -50,   -37,   -28,   -17,   -17,   -16,   -17,   -18,
+    -18,   -13,   -7,    -4,    6,     17,    23,    25,    28,    24,    21,
+    17,    21,    27,    30,    33,    35,    46,    49,    48,    54,    56,
+    57,    58,    60,    64,    62,    64,    66,    67,    64,    70,    77,
+    83,    82,    84,    88,    89,    95,    86,    75,    64,    51,    36,
+    29,    26,    21,    26,    31,    38,    40,    55,    63,    65,    65,
+    64,    60,    54,    54,    49,    41,    34,    26,    21,    9,     6,
+    6,     5,     -1,    3,     5,     3,     2,     -4,    -13,   -13,   -24,
+    -32,   -33,   -36,   -33,   -24,   -18,   -15,   -9,    -5,    -5,    -14,
+    -17,   -24,   -34,   -36,   -42,   -43,   -36,   -42,   -43,   -43,   -38,
+    -36,   -27,   -20,   -23,   -21,   -28,   -25,   -22,   -24,   -25,   -23,
+    -22,   -30,   -31,   -26,   -25,   -20,   -15,   -8,    -10,   -11,   -13,
+    -18,   -22,   -30,   -36,   -35,   -39,   -35,   -34,   -27,   -24,   -19,
+    -15,   -7,    -6,    -7,    -2,    0,     7,     12,    14,    19,    20,
+    26,    26,    24,    16,    10,    4,     1,     3,     2,     9,     11,
+    17,    19,    27,    31,    31,    32,    30,    27,    25,    28,    27,
+    25,    22,    23,    23,    20,    21,    25,    36,    38,    40,    43,
+    40,    32,    27,    20,    9,     4,     1,     12,    27,    37,    49,
+    63,    73,    72,    73,    70,    67,    53,    39,    33,    26,    23,
+    13,    9,     6,     0,     -2,    -3,    0,     -1,    0,     -1,    -4,
+    -9,    -16,   -22,   -21,   -24,   -21,   -19,   -12,   -3,    0,     12,
+    14,    13,    3,     -6,    -13,   -27,   -34,   -42,   -41,   -44,   -42,
+    -43,   -46,   -42,   -40,   -39,   -36,   -31,   -29,   -30,   -22,   -19,
+    -21,   -20,   -17,   -17,   -22,   -31,   -41,   -45,   -54,   -65,   -64,
+    -68,   -70,   -74,   -70,   -64,   -62,   -61,   -60,   -58,   -52,   -46,
+    -43,   -37,   -35,   -40,   -41,   -47,   -52,   -58,   -62,   -61,   -53,
+    -54,   -46,   -41,   -40,   -34,   -29,   -20,   -15,   -8,    2,     12,
+    28,    35,    41,    42,    42,    43,    41,    43,    39,    45,    44,
+    46,    55,    54,    55,    55,    51,    48,    42,    43,    39,    40,
+    46,    54,    65,    70,    76,    81,    86,    89,    79,    73,    70,
+    62,    56,    52,    39,    32,    28,    17,    18,    19,    18,    15,
+    19,    20,    15,    13,    13,    10,    6,     5,     12,    10,    15,
+    20,    24,    30,    31,    28,    22,    17,    2,     -15,   -24,   -39,
+    -52,   -53,   -55,   -46,   -40,   -34,   -26,   -21,   -22,   -31,   -32,
+    -38,   -36,   -35,   -32,   -33,   -34,   -30,   -28,   -27,   -35,   -40,
+    -42,   -45,   -44,   -45,   -44,   -52,   -54,   -57,   -57,   -53,   -60,
+    -63,   -63,   -65,   -51,   -45,   -40,   -40,   -39,   -39,   -43,   -44,
+    -46,   -52,   -46,   -51,   -49,   -45,   -45,   -47,   -47,   -45,   -50,
+    -47,   -40,   -35,   -32,   -24,   -17,   -19,   -14,   -13,   -9,    -7,
+    -7,    -7,    -9,    0,     3,     7,     13,    12,    14,    15,    13,
+    6,     -1,    -3,    -9,    -10,   -5,    -2,    6,     9,     11,    12,
+    15,    19,    24,    37,    47,    47,    56,    53,    51,    52,    52,
+    47,    39,    38,    40,    41,    43,    44,    42,    43,    42,    41,
+    43,    40,    41,    35,    37,    39,    40,    41,    38,    30,    21,
+    14,    5,     2,     -1,    -2,    1,     -2,    6,     2,     4,     2,
+    -1,    -11,   -16,   -23,   -25,   -20,   -18,   -25,   -27,   -32,   -27,
+    -24,   -16,   -15,   -11,   -9,    -3,    -4,    -2,    -9,    -10,   -18,
+    -28,   -33,   -38,   -37,   -41,   -41,   -33,   -24,   -22,   -25,   -25,
+    -25,   -24,   -33,   -38,   -42,   -52,   -57,   -55,   -50,   -51,   -53,
+    -52,   -48,   -49,   -49,   -53,   -55,   -58,   -51,   -34,   -19,   -12,
+    -12,   -5,    1,     1,     0,     -6,    -2,    -10,   -11,   -11,   -6,
+    0,     -6,    2,     -2,    -6,    2,     5,     16,    18,    18,    21,
+    16,    18,    18,    20,    20,    13,    18,    9,     7,     12,    7,
+    8,     10,    16,    17,    18,    23,    26,    36,    44,    51,    55,
+    60,    64,    69,    68,    71,    70,    62,    58,    52,    44,    35,
+    31,    34,    32,    33,    36,    37,    38,    41,    47,    55,    56,
+    58,    60,    60,    57,    48,    41,    29,    19,    7,     4,     8,
+    9,     10,    8,     13,    15,    13,    8,     8,     6,     4,     10,
+    8,     -4,    -6,    -9,    -20,   -28,   -39,   -38,   -27,   -24,   -22,
+    -19,   -23,   -32,   -35,   -36,   -41,   -48,   -51,   -50,   -52,   -55,
+    -60,   -67,   -72,   -76,   -84,   -82,   -80,   -81,   -75,   -64,   -50,
+    -36,   -28,   -18,   -14,   -12,   -15,   -12,   -18,   -24,   -21,   -22,
+    -19,   -21,   -19,   -22,   -20,   -18,   -16,   -17,   -19,   -15,   -7,
+    1,     0,     0,     9,     14,    20,    24,    20,    16,    17,    20,
+    20,    25,    27,    26,    32,    33,    35,    38,    42,    38,    37,
+    39,    46,    44,    43,    45,    45,    42,    37,    34,    25,    21,
+    22,    33,    44,    49,    54,    53,    58,    54,    51,    46,    40,
+    37,    37,    39,    34,    37,    39,    31,    39,    38,    36,    35,
+    32,    33,    33,    32,    28,    23,    18,    22,    28,    31,    27,
+    18,    3,     4,     0,     -4,    -7,    -15,   -18,   -24,   -32,   -34,
+    -39,   -42,   -36,   -31,   -24,   -12,   -10,   -10,   -13,   -20,   -28,
+    -34,   -44,   -49,   -50,   -53,   -56,   -54,   -52,   -53,   -47,   -43,
+    -41,   -45,   -41,   -38,   -38,   -33,   -32,   -34,   -35,   -33,   -40,
+    -45,   -53,   -62,   -61,   -67,   -72,   -70,   -67,   -68,   -59,   -51,
+    -47,   -38,   -31,   -20,   -13,   -13,   -13,   -14,   -17,   -21,   -22,
+    -29,   -31,   -27,   -23,   -13,   -6,    4,     12,    17,    25,    23,
+    23,    25,    30,    30,    32,    31,    28,    27,    18,    14,    13,
+    3,     5,     7,     19,    35,    47,    61,    70,    84,    90,    95,
+    92,    94,    89,    77,    71,    66,    59,    50,    51,    50,    51,
+    53,    56,    65,    67,    69,    75,    74,    69,    67,    56,    51,
+    44,    34,    25,    17,    10,    6,     7,     7,     4,     6,     -1,
+    -1,    -2,    -9,    -9,    -9,    -7,    -5,    1,     -2,    -5,    -11,
+    -19,   -27,   -39,   -38,   -44,   -45,   -48,   -48,   -54,   -59,   -53,
+    -51,   -49,   -52,   -50,   -50,   -47,   -42,   -32,   -28,   -28,   -26,
+    -27,   -34,   -40,   -40,   -36,   -37,   -37,   -34,   -37,   -36,   -41,
+    -36,   -40,   -46,   -48,   -52,   -47,   -44,   -40,   -40,   -38,   -43,
+    -43,   -47,   -59,   -62,   -59,   -59,   -51,   -41,   -29,   -19,   -8,
+    -2,    1,     1,     -4,    -9,    -19,   -23,   -29,   -29,   -25,   -23,
+    -15,   -7,    -2,    6,     8,     15,    27,    35,    43,    40,    36,
+    35,    32,    25,    22,    19,    17,    13,    13,    21,    25,    28,
+    36,    44,    50,    57,    56,    58,    59,    62,    66,    70,    73,
+    69,    66,    66,    66,    62,    53,    48,    44,    38,    39,    44,
+    52,    51,    55,    57,    52,    49,    44,    36,    26,    16,    13,
+    13,    14,    14,    17,    14,    10,    6,     -5,    -14,   -23,   -24,
+    -21,   -28,   -25,   -27,   -29,   -29,   -33,   -33,   -39,   -42,   -43,
+    -41,   -40,   -43,   -46,   -45,   -43,   -42,   -41,   -41,   -46,   -46,
+    -52,   -52,   -52,   -59,   -63,   -70,   -68,   -73,   -77,   -73,   -68,
+    -66,   -62,   -64,   -66,   -58,   -54,   -51,   -52,   -48,   -47,   -43,
+    -40,   -39,   -33,   -26,   -19,   -17,   -16,   -17,   -14,   -9,    -10,
+    -3,    5,     5,     9,     5,     9,     8,     4,     3,     0,     -5,
+    -10,   -3,    2,     8,     14,    16,    20,    27,    39,    40,    44,
+    48,    43,    39,    34,    29,    22,    12,    8,     5,     0,     -2,
+    -3,    5,     12,    16,    19,    22,    25,    28,    35,    28,    30,
+    31,    30,    39,    43,    47,    43,    42,    41,    41,    41,    37,
+    37,    39,    37,    38,    43,    44,    41,    43,    34,    28,    25,
+    23,    30,    34,    32,    33,    29,    21,    18,    13,    14,    11,
+    3,     2,     1,     3,     1,     -1,    0,     -3,    -1,    -3,    -8,
+    -9,    -7,    -9,    -2,    0,     -3,    0,     1,     5,     0,     -1,
+    -9,    -13,   -8,    -11,   -18,   -23,   -25,   -29,   -29,   -26,   -27,
+    -29,   -25,   -24,   -23,   -18,   -19,   -18,   -17,   -21,   -22,   -30,
+    -38,   -42,   -42,   -42,   -40,   -41,   -43,   -39,   -38,   -37,   -36,
+    -33,   -31,   -28,   -27,   -18,   -15,   -7,    -8,    -8,    -1,    1,
+    3,     -5,    0,     -4,    -5,    -4,    -8,    -10,   -14,   -21,   -24,
+    -25,   -20,   -11,   -4,    3,     6,     13,    15,    12,    17,    16,
+    17,    17,    15,    21,    28,    33,    36,    35,    35,    29,    31,
+    29,    28,    23,    21,    14,    15,    27,    36,    40,    40,    43,
+    51,    56,    62,    69,    77,    80,    88,    88,    88,    82,    76,
+    63,    52,    44,    36,    26,    23,    25,    24,    27,    26,    31,
+    21,    13,    8,     -8,    -8,    -11,   -14,   -18,   -28,   -28,   -30,
+    -32,   -29,   -26,   -26,   -27,   -24,   -20,   -14,   -8,    -6,    -8,
+    -5,    -10,   -14,   -18,   -26,   -34,   -36,   -38,   -44,   -51,   -57,
+    -66,   -64,   -68,   -72,   -75,   -75,   -70,   -68,   -65,   -64,   -62,
+    -68,   -63,   -60,   -65,   -65,   -69,   -68,   -67,   -57,   -46,   -41,
+    -38,   -34,   -31,   -39,   -40,   -45,   -45,   -48,   -47,   -40,   -39,
+    -32,   -26,   -24,   -14,   -9,    -7,    -3,    -2,    3,     4,     0,
+    -2,    -2,    -2,    1,     3,     2,     3,     8,     13,    20,    25,
+    29,    31,    26,    17,    11,    3,     -5,    2,     6,     9,     11,
+    19,    26,    40,    51,    61,    60,    58,    61,    55,    55,    57,
+    60,    54,    40,    42,    38,    34,    38,    37,    34,    32,    35,
+    36,    35,    41,    36,    32,    29,    23,    22,    23,    22,    14,
+    13,    19,    19,    20,    22,    22,    17,    13,    6,     9,     13,
+    15,    17,    19,    11,    15,    8,     4,     6,     -1,    -3,    3,
+    7,     11,    8,     10,    7,     6,     4,     -4,    -5,    -11,   -9,
+    -16,   -14,   -14,   -16,   -16,   -22,   -19,   -19,   -13,   -9,    -4,
+    1,     1,     2,     -6,    -14,   -25,   -32,   -41,   -46,   -50,   -49,
+    -42,   -39,   -34,   -24,   -14,   -18,   -15,   -17,   -21,   -23,   -21,
+    -19,   -21,   -20,   -19,   -20,   -19,   -16,   -17,   -19,   -20,   -20,
+    -20,   -20,   -22,   -22,   -23,   -22,   -22,   -14,   -5,    5,     8,
+    13,    16,    19,    23,    19,    21,    16,    16,    18,    13,    18,
+    13,    15,    18,    12,    12,    6,     11,    8,     5,     5,     9,
+    17,    14,    15,    14,    16,    14,    14,    12,    9,     7,     9,
+    11,    13,    15,    15,    19,    17,    14,    8,     7,     4,     0,
+    3,     8,     10,    7,     8,     19,    15,    19,    18,    19,    17,
+    9,     14,    10,    4,     -3,    -11,   -19,   -25,   -31,   -35,   -36,
+    -28,   -21,   -8,    5,     8,     11,    13,    7,     4,     1,     -7,
+    -15,   -17,   -17,   -21,   -28,   -33,   -37,   -40,   -39,   -41,   -45,
+    -46,   -44,   -40,   -41,   -36,   -31,   -41,   -40,   -42,   -44,   -47,
+    -50,   -49,   -55,   -52,   -52,   -52,   -45,   -50,   -52,   -56,   -58,
+    -60,   -69,   -75,   -82,   -86,   -91,   -87,   -80,   -80,   -72,   -58,
+    -52,   -45,   -33,   -21,   -13,   -12,   -10,   -6,    -1,    -2,    -7,
+    -7,    -5,    -6,    -3,    9,     15,    25,    36,    35,    39,    28,
+    16,    11,    8,     11,    17,    27,    34,    36,    47,    49,    52,
+    52,    42,    46,    49,    55,    65,    66,    67,    62,    56,    53,
+    49,    50,    55,    53,    62,    69,    72,    73,    68,    61,    54,
+    46,    43,    38,    34,    39,    43,    42,    39,    36,    31,    26,
+    24,    17,    13,    14,    14,    21,    26,    29,    28,    26,    24,
+    18,    19,    16,    11,    6,     2,     -2,    1,     3,     2,     -4,
+    -3,    -1,    -3,    -2,    -2,    -5,    -3,    0,     3,     -3,    -6,
+    -6,    -15,   -19,   -25,   -30,   -35,   -39,   -34,   -34,   -34,   -31,
+    -17,   -17,   -8,    -2,    -2,    8,     14,    25,    24,    26,    22,
+    16,    10,    2,     -3,    -5,    -12,   -15,   -11,   -14,   -16,   -17,
+    -17,   -16,   -21,   -18,   -18,   -21,   -23,   -21,   -15,   -11,   -4,
+    -2,    3,     8,     10,    17,    18,    25,    24,    24,    24,    21,
+    24,    23,    24,    22,    23,    31,    39,    49,    58,    64,    67,
+    63,    57,    53,    52,    44,    45,    43,    40,    45,    42,    49,
+    50,    49,    52,    51,    48,    46,    38,    37,    35,    36,    37,
+    37,    37,    44,    45,    47,    42,    42,    36,    35,    44,    40,
+    40,    28,    24,    23,    18,    12,    9,     8,     10,    17,    17,
+    18,    12,    5,     -2,    -12,   -16,   -20,   -27,   -29,   -29,   -26,
+    -22,   -17,   -16,   -15,   -14,   -15,   -11,   -11,   -15,   -19,   -15,
+    -20,   -22,   -24,   -37,   -52,   -62,   -63,   -68,   -64,   -59,   -51,
+    -43,   -42,   -36,   -32,   -33,   -33,   -33,   -41,   -48,   -51,   -49,
+    -48,   -47,   -42,   -45,   -42,   -41,   -40,   -39,   -33,   -29,   -25,
+    -14,   -1,    -4,    -6,    -11,   -16,   -19,   -26,   -29,   -28,   -25,
+    -17,   -10,   -1,    -1,    3,     7,     -1,    -3,    -8,    -18,   -20,
+    -20,   -16,   -13,   -11,   -8,    0,     6,     8,     11,    14,    15,
+    20,    26,    26,    26,    24,    23,    24,    30,    34,    41,    52,
+    61,    70,    80,    85,    86,    89,    84,    87,    79,    67,    60,
+    57,    59,    63,    68,    74,    78,    84,    89,    91,    87,    81,
+    74,    69,    63,    59,    59,    56,    58,    60,    60,    59,    54,
+    49,    41,    40,    34,    25,    19,    11,    1,     0,     -1,    -4,
+    -8,    -12,   -12,   -17,   -22,   -31,   -44,   -54,   -58,   -68,   -74,
+    -80,   -80,   -73,   -65,   -61,   -61,   -55,   -50,   -50,   -59,   -65,
+    -69,   -73,   -73,   -78,   -79,   -83,   -87,   -87,   -88,   -94,   -103,
+    -107,  -107,  -109,  -106,  -113,  -115,  -110,  -105,  -100,  -100,  -92,
+    -78,   -62,   -49,   -39,   -35,   -27,   -26,   -25,   -24,   -22,   -23,
+    -28,   -26,   -22,   -15,   -11,   -4,    4,     13,    21,    32,    31,
+    28,    30,    30,    28,    23,    25,    23,    21,    25,    21,    26,
+    27,    32,    40,    48,    53,    55,    54,    55,    55,    54,    48,
+    44,    47,    48,    54,    60,    71,    79,    79,    74,    72,    59,
+    48,    42,    32,    26,    22,    21,    23,    22,    31,    42,    44,
+    41,    36,    30,    30,    33,    38,    35,    30,    28,    20,    15,
+    8,     4,     6,     9,     16,    26,    27,    23,    19,    16,    10,
+    4,     -4,    -12,   -12,   -16,   -16,   -19,   -24,   -23,   -23,   -31,
+    -34,   -38,   -40,   -41,   -39,   -39,   -36,   -36,   -40,   -45,   -48,
+    -53,   -66,   -73,   -76,   -76,   -78,   -75,   -71,   -65,   -59,   -58,
+    -59,   -56,   -60,   -62,   -62,   -62,   -64,   -68,   -73,   -79,   -80,
+    -85,   -87,   -85,   -78,   -72,   -66,   -56,   -48,   -42,   -37,   -35,
+    -32,   -33,   -31,   -25,   -26,   -27,   -16,   -18,   -18,   -13,   -14,
+    -17,   -22,   -24,   -25,   -23,   -19,   -14,   -12,   -11,   -7,    -4,
+    -1,    2,     5,     8,     10,    10,    18,    28,    29,    25,    22,
+    29,    21,    20,    21,    22,    30,    32,    41,    41,    45,    46,
+    49,    52,    57,    59,    58,    52,    46,    47,    56,    58,    49,
+    49,    46,    40,    33,    23,    14,    11,    16,    29,    34,    37,
+    41,    42,    48,    54,    60,    61,    62,    62,    69,    79,    76,
+    71,    72,    71,    64,    59,    54,    49,    40,    42,    34,    23,
+    27,    18,    13,    9,     3,     -4,    -8,    -16,   -18,   -20,   -26,
+    -28,   -30,   -32,   -29,   -32,   -35,   -39,   -41,   -38,   -34,   -31,
+    -26,   -18,   -21,   -20,   -22,   -28,   -35,   -34,   -31,   -33,   -31,
+    -31,   -40,   -43,   -45,   -53,   -64,   -67,   -74,   -75,   -74,   -75,
+    -70,   -61,   -56,   -45,   -37,   -30,   -33,   -35,   -32,   -31,   -27,
+    -25,   -19,   -17,   -14,   -9,    -4,    -1,    -3,    -4,    1,     8,
+    14,    20,    24,    25,    18,    11,    7,     -3,    -9,    -3,    4,
+    15,    30,    29,    33,    33,    36,    35,    31,    33,    34,    42,
+    43,    42,    47,    49,    53,    61,    69,    73,    74,    79,    81,
+    84,    76,    69,    62,    47,    39,    31,    19,    8,     2,     -6,
+    -5,    -3,    -3,    -1,    1,     -2,    -3,    -3,    -6,    -12,   -13,
+    -15,   -11,   -5,    -4,    -8,    -14,   -9,    -3,    0,     -3,    -4,
+    0,     3,     0,     -6,    -14,   -23,   -33,   -38,   -41,   -38,   -38,
+    -34,   -30,   -29,   -29,   -26,   -31,   -33,   -41,   -49,   -50,   -56,
+    -57,   -58,   -54,   -46,   -39,   -39,   -34,   -31,   -28,   -30,   -30,
+    -31,   -29,   -27,   -16,   -18,   -17,   -15,   -13,   -15,   -12,   -7,
+    -11,   -9,    -9,    -4,    -11,   -7,    -7,    -8,    -9,    -10,   -7,
+    -9,    1,     9,     15,    12,    19,    19,    18,    17,    13,    11,
+    8,     6,     10,    17,    20,    26,    28,    33,    39,    30,    25,
+    25,    18,    16,    21,    26,    30,    33,    32,    36,    42,    49,
+    46,    39,    44,    44,    37,    35,    30,    24,    22,    23,    26,
+    23,    25,    21,    24,    24,    22,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc8030cdac7c4e8364e0fcd7dcc5fff63617908
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_no_1000ms_sample_data_size;
+extern const int16_t g_no_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
index 892757e799f3832db725424163e613bea35ab9e7..6468c1a95a9cd3f844595bf2c6e88c1e2833823b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <AudioToolbox/AudioToolbox.h>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
index 9366dc71e0d76d087a3dad9b9c4c206a0749e235..8187962c3e780a76413134771dc63ba30910f3b6 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -28,8 +28,8 @@ RecognizeCommands::RecognizeCommands(tflite::ErrorReporter* error_reporter,
       suppression_ms_(suppression_ms),
       minimum_count_(minimum_count),
       previous_results_(error_reporter) {
-  previous_top_label_ = "_silence_";
-  previous_top_label_time_ = 0;
+  previous_top_label_ = "silence";
+  previous_top_label_time_ = std::numeric_limits<int32_t>::min();
 }
 
 TfLiteStatus RecognizeCommands::ProcessLatestResults(
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
index adefffe850076821dd1e0bf683fdd2180d6999ea..292cd3e88dcd63f925cb16995b5e8a16554a8547 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cstdint>
 
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
 // Partial implementation of std::dequeue, just providing the functionality
@@ -129,8 +129,8 @@ class RecognizeCommands {
   // help reduce spurious recognitions.
   explicit RecognizeCommands(tflite::ErrorReporter* error_reporter,
                              int32_t average_window_duration_ms = 1000,
-                             uint8_t detection_threshold = 51,
-                             int32_t suppression_ms = 500,
+                             uint8_t detection_threshold = 200,
+                             int32_t suppression_ms = 1500,
                              int32_t minimum_count = 3);
 
   // Call this with the results of running a model on sample data.
@@ -149,8 +149,6 @@ class RecognizeCommands {
 
   // Working variables
   PreviousResultsQueue previous_results_;
-  int previous_results_head_;
-  int previous_results_tail_;
   const char* previous_top_label_;
   int32_t previous_top_label_time_;
 };
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
index f0cc73f10b3dadfdf06cb0f2935140b792635add..6582c948d16f9493a4b1e5bdf43bdc1f30e6dc31 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
@@ -118,7 +118,9 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
     }
   }
   TF_LITE_MICRO_EXPECT(has_found_new_command);
-  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+  }
 
   TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor(
       {0, 0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
@@ -141,8 +143,10 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
     }
   }
   TF_LITE_MICRO_EXPECT(has_found_new_command);
-  TF_LITE_MICRO_EXPECT_EQ(231, score);
-  TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(231, score);
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+  }
 }
 
 TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) {
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
similarity index 85%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
index 5c6978b5edef635af58873bf537a251fa4510ef4..403976e222fe549f6f8c755bf7460d245d9370e8 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+
 extern "C" {
-#define ARM_MATH_CM4
 #define IFFT_FLAG_R 0
 #define BIT_REVERSE_FLAG 1
 #define FFT_SIZE 512
@@ -24,8 +25,6 @@ extern "C" {
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
 }
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-
 void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output);
 
 q15_t bufA[FFT_SIZE];
@@ -42,9 +41,9 @@ constexpr int kOutputSize =
     ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   if (input_size > kInputSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
                            kInputSize);
@@ -94,12 +93,3 @@ void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output) {
     output[i] = (uint8_t)(bufA[i] >> 5);
   }
 }
-
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output) {
-  int i;
-  for (i = 0; i < 49; i++) {
-    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
-  }
-  return kTfLiteOk;
-}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
similarity index 96%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
index b623d8d11b75d59600cc6a029527d3957084a328..ad11684b0a94e630580aa9a95d4b1db92f914d6f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
@@ -27,11 +27,11 @@ limitations under the License.
 // instead of floating point, to help show how this can work on platforms that
 // don't have good float support.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
@@ -118,9 +118,9 @@ void CalculatePeriodicHann(int window_length, int16_t* window_function) {
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
index c4fc5c33bb329cba4e1abcf6d36b01f14e9e2b27..0b20f2f86fb6455d4251cb81d3e70c3c15de7c6b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
 
 const uint8_t g_no_power_spectrum_data[g_no_power_spectrum_data_size] = {
     255, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
index fa39d3c70d78ce261db81cf8ad7c416efd2c468c..9693950fb5ee1d56242b83c6265e9e2315ec8971 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // no_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_no_power_spectrum_data_size = 43;
 extern const uint8_t g_no_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
index e98c84f7ed2e678eb91580a2b6fb69514cee4740..3d3a9538fb527888e3bdf0e1aa9ca00d4d5f1544 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
- * --output_c_file="no_features_data.cc" \
+ * --output_c_file="no_simple_features_data.cc" \
  */
 
-const int g_no_f9643d42_nohash_4_width = 43;
-const int g_no_f9643d42_nohash_4_height = 49;
-const unsigned char g_no_f9643d42_nohash_4_data[] = {
+const int g_no_simple_f9643d42_nohash_4_width = 43;
+const int g_no_simple_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_simple_f9643d42_nohash_4_data[] = {
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
index 39a3bb914cc1986aa851ace0e39ce63ed1a93282..30332b30c5c8325edb53713d572fcf987446844a 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
 
-extern const int g_yes_f2e59fea_nohash_1_width;
-extern const int g_yes_f2e59fea_nohash_1_height;
-extern const unsigned char g_yes_f2e59fea_nohash_1_data[];
+extern const int g_no_simple_f9643d42_nohash_4_width;
+extern const int g_no_simple_f9643d42_nohash_4_height;
+extern const unsigned char g_no_simple_f9643d42_nohash_4_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
similarity index 92%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
index f8858aad72f3c141d20077ffa927e30bd9492987..3aa05b7bf1d5d1762c9c6744ac8a5fe99f922332 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
@@ -24,11 +24,11 @@ limitations under the License.
 // functions used here, for example replacing the DFT with an FFT, so this
 // version shouldn't be used where performance is critical.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
@@ -72,9 +72,9 @@ void CalculatePeriodicHann(int window_length, float* window_function) {
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
@@ -146,12 +146,3 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   }
   return kTfLiteOk;
 }
-
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output) {
-  int i;
-  for (i = 0; i < 49; i++) {
-    Preprocess(error_reporter, input + i * 320, 480, 43, output + i * 43);
-  }
-  return kTfLiteOk;
-}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
similarity index 77%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
index d710beeceea6a7b6fb7fca748e5795f602276e32..f4e86b18a4c3d1c0a5beb32eb6806faaf1c11c14 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -24,11 +24,8 @@ limitations under the License.
 // both floating point and fixed point available, but because the calculations
 // involved can be time-consuming, it's recommended that you use or write
 // specialized versions for your platform.
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output);
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output);
 
-TfLiteStatus Preprocess_1sec(tflite::ErrorReporter* error_reporter,
-                             const int16_t* input, uint8_t* output);
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
index e8b49f67e3d72faa4700c4bdec7f94a5b79cd72e..65e526327c77c727ec88cee421a466f0df34ee76 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
@@ -13,23 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-TF_LITE_MICRO_TEST(TestPreprocessor) {
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
   tflite::MicroErrorReporter micro_error_reporter;
   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
 
   uint8_t yes_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus yes_status = Preprocess(
+  TfLiteStatus yes_status = GenerateSimpleFeatures(
       error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
       g_yes_power_spectrum_data_size, yes_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
@@ -45,7 +45,7 @@ TF_LITE_MICRO_TEST(TestPreprocessor) {
   }
 
   uint8_t no_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus no_status = Preprocess(
+  TfLiteStatus no_status = GenerateSimpleFeatures(
       error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
       g_no_power_spectrum_data_size, no_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4842f8dbd907dbbd73aab14c7767a8d64476b52d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
+
+const char* kCategoryLabels[kCategoryCount] = {
+    "silence",
+    "unknown",
+    "yes",
+    "no",
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
similarity index 93%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
index f48252d14d251673f0070e63dfa4169ca3a89025..d31d6b33622b3a15c90fab4c52d7452960a54930 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
 
 // Keeping these as constant expressions allow us to allocate fixed-sized arrays
 // on the stack for our working memory.
@@ -40,4 +40,4 @@ constexpr int kSilenceIndex = 0;
 constexpr int kUnknownIndex = 1;
 extern const char* kCategoryLabels[kCategoryCount];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
similarity index 99%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
index 62e4359859a422c96ec368b6f91cba99e3c4a4eb..a14412edc941e8a7df0aef9dd66b79b1d9a1d7a6 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 // Automatically created from a TensorFlow Lite flatbuffer using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 // See the README for a full description of the creation process.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
 
-const unsigned char g_tiny_conv_model_data[] = {
+const unsigned char g_tiny_conv_simple_features_model_data[] = {
     0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
     0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
     0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
@@ -1670,4 +1670,4 @@ const unsigned char g_tiny_conv_model_data[] = {
     0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
     0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
     0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
-const int g_tiny_conv_model_data_len = 19800;
+const int g_tiny_conv_simple_features_model_data_len = 19800;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
similarity index 74%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
index a465dbfabf7cbba44473ae7e2ff94b1de2092b20..cadf7d0de754e032ae9ff77cdd8deec43bc03847 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
@@ -16,12 +16,12 @@ limitations under the License.
 // This is a standard TensorFlow Lite model file that has been converted into a
 // C data array, so it can be easily compiled into a binary for devices that
 // don't have a file system. It was created using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
 
-extern const unsigned char g_tiny_conv_model_data[];
-extern const int g_tiny_conv_model_data_len;
+extern const unsigned char g_tiny_conv_simple_features_model_data[];
+extern const int g_tiny_conv_simple_features_model_data_len;
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
index 9a34a2045a221e2eee8c51f23000e819b1638499..cd46408c0fb5c2c5dad12ae67c5456c8cb178b2d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 
 const uint8_t g_yes_power_spectrum_data[g_yes_power_spectrum_data_size] = {
     8, 89, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 13, 1, 6, 23, 20, 6, 4, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
index 5c8c00ac1116dcbd7ad4aeda1828603e962c2001..77e52d58b54763ec8df46729ab6f8dd84086d59b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // yes_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_yes_power_spectrum_data_size = 43;
 extern const uint8_t g_yes_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
index 2eb737fb8e1204a02f7ea4852016e85d03980bfd..2d660bb8b5c5b825eb48490699c89e5ba241369f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
- * --output_c_file="yes_features_data.cc" \
+ * --output_c_file="yes_simple_features_data.cc" \
  */
 
-const int g_yes_f2e59fea_nohash_1_width = 43;
-const int g_yes_f2e59fea_nohash_1_height = 49;
-const unsigned char g_yes_f2e59fea_nohash_1_data[] = {
+const int g_yes_simple_f2e59fea_nohash_1_width = 43;
+const int g_yes_simple_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_simple_f2e59fea_nohash_1_data[] = {
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
index e2ee0c46cf13b00b310bd22b7ca1cb5a9751c6e6..87ea4a4aea89d02189bca9c37872e27b95672190 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
 
-extern const int g_no_f9643d42_nohash_4_width;
-extern const int g_no_f9643d42_nohash_4_height;
-extern const unsigned char g_no_f9643d42_nohash_4_data[];
+extern const int g_yes_simple_f2e59fea_nohash_1_width;
+extern const int g_yes_simple_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_simple_f2e59fea_nohash_1_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..520a46ef59811263bcae4cca739ddc26c215b202
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/audio_provider.cc
@@ -0,0 +1,358 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
+// These are headers from Ambiq's Apollo3 SDK.
+#include "am_bsp.h"         // NOLINT
+#include "am_mcu_apollo.h"  // NOLINT
+#include "am_util.h"        // NOLINT
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+namespace {
+
+// These are the raw buffers that are filled by the ADC during DMA
+constexpr int kAdcNumSlots = 2;
+constexpr int kAdcSamplesPerSlot = 1024;
+constexpr int kAdcSampleBufferSize = (kAdcNumSlots * kAdcSamplesPerSlot);
+uint32_t g_ui32ADCSampleBuffer0[kAdcSampleBufferSize];
+uint32_t g_ui32ADCSampleBuffer1[kAdcSampleBufferSize];
+// Controls the double buffering between the two DMA buffers.
+int g_dma_destination_index = 0;
+// ADC Device Handle.
+static void* g_adc_handle;
+// ADC DMA error flag.
+volatile bool g_adc_dma_error;
+// So the interrupt can use the passed-in error handler to report issues.
+tflite::ErrorReporter* g_adc_dma_error_reporter = nullptr;
+
+// Holds a longer history of audio samples in a ring buffer.
+constexpr int kAudioCaptureBufferSize = 16000;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize] = {};
+int g_audio_capture_buffer_start = 0;
+int64_t g_total_samples_captured = 0;
+int32_t g_latest_audio_timestamp = 0;
+
+// Copy of audio samples returned to the caller.
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+bool g_is_audio_initialized = false;
+
+// Start the DMA fetch of ADC samples.
+void adc_start_dma(tflite::ErrorReporter* error_reporter) {
+  am_hal_adc_dma_config_t ADCDMAConfig;
+
+  // Configure the ADC to use DMA for the sample transfer.
+  ADCDMAConfig.bDynamicPriority = true;
+  ADCDMAConfig.ePriority = AM_HAL_ADC_PRIOR_SERVICE_IMMED;
+  ADCDMAConfig.bDMAEnable = true;
+  ADCDMAConfig.ui32SampleCount = kAdcSampleBufferSize;
+  if (g_dma_destination_index == 0) {
+    ADCDMAConfig.ui32TargetAddress = (uint32_t)g_ui32ADCSampleBuffer0;
+  } else {
+    ADCDMAConfig.ui32TargetAddress = (uint32_t)g_ui32ADCSampleBuffer1;
+  }
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_configure_dma(g_adc_handle, &ADCDMAConfig)) {
+    error_reporter->Report("Error - configuring ADC DMA failed.");
+  }
+
+  // Reset the ADC DMA flags.
+  g_adc_dma_error = false;
+  g_adc_dma_error_reporter = error_reporter;
+}
+
+// Configure the ADC.
+void adc_config0(tflite::ErrorReporter* error_reporter) {
+  am_hal_adc_config_t ADCConfig;
+  am_hal_adc_slot_config_t ADCSlotConfig;
+
+  // Initialize the ADC and get the handle.
+  if (AM_HAL_STATUS_SUCCESS != am_hal_adc_initialize(0, &g_adc_handle)) {
+    error_reporter->Report("Error - reservation of the ADC0 instance failed.");
+  }
+
+  // Power on the ADC.
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_power_control(g_adc_handle, AM_HAL_SYSCTRL_WAKE, false)) {
+    error_reporter->Report("Error - ADC0 power on failed.");
+  }
+
+  // Set up the ADC configuration parameters. These settings are reasonable
+  // for accurate measurements at a low sample rate.
+  ADCConfig.eClock = AM_HAL_ADC_CLKSEL_HFRC_DIV2;
+  ADCConfig.ePolarity = AM_HAL_ADC_TRIGPOL_RISING;
+  ADCConfig.eTrigger = AM_HAL_ADC_TRIGSEL_SOFTWARE;
+  ADCConfig.eReference =
+      AM_HAL_ADC_REFSEL_INT_2P0;  // AM_HAL_ADC_REFSEL_INT_1P5;
+  ADCConfig.eClockMode = AM_HAL_ADC_CLKMODE_LOW_LATENCY;
+  ADCConfig.ePowerMode = AM_HAL_ADC_LPMODE0;
+  ADCConfig.eRepeat = AM_HAL_ADC_REPEATING_SCAN;
+  if (AM_HAL_STATUS_SUCCESS != am_hal_adc_configure(g_adc_handle, &ADCConfig)) {
+    error_reporter->Report("Error - configuring ADC0 failed.");
+  }
+
+  // Set up an ADC slot (2)
+  ADCSlotConfig.eMeasToAvg = AM_HAL_ADC_SLOT_AVG_1;
+  ADCSlotConfig.ePrecisionMode = AM_HAL_ADC_SLOT_14BIT;
+  ADCSlotConfig.eChannel = AM_HAL_ADC_SLOT_CHSEL_SE2;
+  ADCSlotConfig.bWindowCompare = false;
+  ADCSlotConfig.bEnabled = true;
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_configure_slot(g_adc_handle, 2, &ADCSlotConfig)) {
+    error_reporter->Report("Error - configuring ADC Slot 2 failed.");
+  }
+
+  // Set up an ADC slot (1)
+  ADCSlotConfig.eMeasToAvg = AM_HAL_ADC_SLOT_AVG_1;
+  ADCSlotConfig.ePrecisionMode = AM_HAL_ADC_SLOT_14BIT;
+  ADCSlotConfig.eChannel = AM_HAL_ADC_SLOT_CHSEL_SE1;
+  ADCSlotConfig.bWindowCompare = false;
+  ADCSlotConfig.bEnabled = true;
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_configure_slot(g_adc_handle, 1, &ADCSlotConfig)) {
+    error_reporter->Report("Error - configuring ADC Slot 1 failed.");
+  }
+
+  // Configure the ADC to use DMA for the sample transfer.
+  adc_start_dma(error_reporter);
+
+  // For this example, the samples will be coming in slowly. This means we
+  // can afford to wake up for every conversion.
+  am_hal_adc_interrupt_enable(g_adc_handle,
+                              AM_HAL_ADC_INT_DERR | AM_HAL_ADC_INT_DCMP);
+
+  // Enable the ADC.
+  if (AM_HAL_STATUS_SUCCESS != am_hal_adc_enable(g_adc_handle)) {
+    error_reporter->Report("Error - enabling ADC0 failed.");
+  }
+}
+
+// Initialize the ADC repetitive sample timer A3.
+void init_timerA3_for_ADC() {
+  // Start a timer to trigger the ADC periodically (1 second).
+  am_hal_ctimer_config_single(3, AM_HAL_CTIMER_TIMERA,
+                              AM_HAL_CTIMER_HFRC_12MHZ |
+                                  AM_HAL_CTIMER_FN_REPEAT |
+                                  AM_HAL_CTIMER_INT_ENABLE);
+
+  am_hal_ctimer_int_enable(AM_HAL_CTIMER_INT_TIMERA3);
+
+  // 750 = 12,000,000 (clock rate) / 16,000 (desired sample rate).
+  am_hal_ctimer_period_set(3, AM_HAL_CTIMER_TIMERA, 750, 374);
+
+  // Enable the timer A3 to trigger the ADC directly
+  am_hal_ctimer_adc_trigger_enable();
+
+  // Start the timer.
+  am_hal_ctimer_start(3, AM_HAL_CTIMER_TIMERA);
+}
+
+// Make sure the CPU is running as fast as possible.
+void enable_burst_mode(tflite::ErrorReporter* error_reporter) {
+  am_hal_burst_avail_e eBurstModeAvailable;
+  am_hal_burst_mode_e eBurstMode;
+
+  // Check that the Burst Feature is available.
+  if (AM_HAL_STATUS_SUCCESS ==
+      am_hal_burst_mode_initialize(&eBurstModeAvailable)) {
+    if (AM_HAL_BURST_AVAIL == eBurstModeAvailable) {
+      error_reporter->Report("Apollo3 Burst Mode is Available\n");
+    } else {
+      error_reporter->Report("Apollo3 Burst Mode is Not Available\n");
+    }
+  } else {
+    error_reporter->Report("Failed to Initialize for Burst Mode operation\n");
+  }
+
+  // Put the MCU into "Burst" mode.
+  if (AM_HAL_STATUS_SUCCESS == am_hal_burst_mode_enable(&eBurstMode)) {
+    if (AM_HAL_BURST_MODE == eBurstMode) {
+      error_reporter->Report("Apollo3 operating in Burst Mode (96MHz)\n");
+    }
+  } else {
+    error_reporter->Report("Failed to Enable Burst Mode operation\n");
+  }
+}
+
+}  // namespace
+
+// Interrupt handler for the ADC.
+extern "C" void am_adc_isr(void) {
+  uint32_t ui32IntMask;
+
+  // Read the interrupt status.
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_interrupt_status(g_adc_handle, &ui32IntMask, false)) {
+    g_adc_dma_error_reporter->Report("Error reading ADC0 interrupt status.");
+  }
+
+  // Clear the ADC interrupt.
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_adc_interrupt_clear(g_adc_handle, ui32IntMask)) {
+    g_adc_dma_error_reporter->Report("Error clearing ADC0 interrupt status.");
+  }
+
+  // If we got a DMA complete, set the flag.
+  if (ui32IntMask & AM_HAL_ADC_INT_DCMP) {
+    uint32_t* source_buffer;
+    if (g_dma_destination_index == 0) {
+      source_buffer = g_ui32ADCSampleBuffer0;
+      g_dma_destination_index = 1;
+    } else {
+      source_buffer = g_ui32ADCSampleBuffer1;
+      g_dma_destination_index = 0;
+    }
+    adc_start_dma(g_adc_dma_error_reporter);
+
+    // For slot 1:
+    uint32_t slotCount = 0;
+    for (uint32_t indi = 0; indi < kAdcSampleBufferSize; indi++) {
+      am_hal_adc_sample_t temp;
+
+      temp.ui32Slot = AM_HAL_ADC_FIFO_SLOT(source_buffer[indi]);
+      temp.ui32Sample = AM_HAL_ADC_FIFO_SAMPLE(source_buffer[indi]);
+
+      if (temp.ui32Slot == 1) {
+        g_audio_capture_buffer[g_audio_capture_buffer_start] = temp.ui32Sample;
+        g_audio_capture_buffer_start =
+            (g_audio_capture_buffer_start + 1) % kAudioCaptureBufferSize;
+        slotCount++;
+      }
+    }
+
+    g_total_samples_captured += slotCount;
+    g_latest_audio_timestamp =
+        (g_total_samples_captured / (kAudioSampleFrequency / 1000));
+  }
+
+  // If we got a DMA error, set the flag.
+  if (ui32IntMask & AM_HAL_ADC_INT_DERR) {
+    g_adc_dma_error = true;
+  }
+}
+
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  // Set the clock frequency.
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0)) {
+    error_reporter->Report("Error - configuring the system clock failed.");
+    return kTfLiteError;
+  }
+
+  // Set the default cache configuration and enable it.
+  if (AM_HAL_STATUS_SUCCESS !=
+      am_hal_cachectrl_config(&am_hal_cachectrl_defaults)) {
+    error_reporter->Report("Error - configuring the system cache failed.");
+    return kTfLiteError;
+  }
+  if (AM_HAL_STATUS_SUCCESS != am_hal_cachectrl_enable()) {
+    error_reporter->Report("Error - enabling the system cache failed.");
+    return kTfLiteError;
+  }
+
+  // Ensure the CPU is running as fast as possible.
+  enable_burst_mode(error_reporter);
+
+  // Start the CTIMER A3 for timer-based ADC measurements.
+  init_timerA3_for_ADC();
+
+  // Enable interrupts.
+  NVIC_EnableIRQ(ADC_IRQn);
+  am_hal_interrupt_master_enable();
+
+  // Edge Board Pin Definitions
+  constexpr int kSfEdgePinMic0 = 11;
+  const am_hal_gpio_pincfg_t g_sf_edge_pin_mic0 = {
+      .uFuncSel = AM_HAL_PIN_11_ADCSE2,
+  };
+  constexpr int kSfEdgePinMic1 = 29;
+  const am_hal_gpio_pincfg_t g_sf_edge_pin_mic1 = {
+      .uFuncSel = AM_HAL_PIN_29_ADCSE1,
+  };
+
+  // Set pins to act as our ADC input
+  am_hal_gpio_pinconfig(kSfEdgePinMic0, g_sf_edge_pin_mic0);
+  am_hal_gpio_pinconfig(kSfEdgePinMic1, g_sf_edge_pin_mic1);
+
+  // Configure the ADC
+  adc_config0(error_reporter);
+
+  // Trigger the ADC sampling for the first time manually.
+  if (AM_HAL_STATUS_SUCCESS != am_hal_adc_sw_trigger(g_adc_handle)) {
+    error_reporter->Report("Error - triggering the ADC0 failed.");
+    return kTfLiteError;
+  }
+
+  // Enable the LED outputs.
+  am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_RED, g_AM_HAL_GPIO_OUTPUT_12);
+  am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_YELLOW, g_AM_HAL_GPIO_OUTPUT_12);
+
+  am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED);
+  am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    g_is_audio_initialized = true;
+  }
+
+  // This is the 'zero' level of the microphone when no audio is present, and
+  // should be recalibrated if the hardware configuration ever changes. It was
+  // generated experimentally by averaging some samples captured on a board.
+  const int16_t kAdcSampleDC = 6003;
+
+  // Temporary gain emulation to deal with too-quiet audio on prototype boards.
+  const int16_t kAdcSampleGain = 10;
+
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    const int32_t capture_value = g_audio_capture_buffer[capture_index];
+    int32_t output_value = capture_value - kAdcSampleDC;
+    output_value *= kAdcSampleGain;
+    if (output_value < std::numeric_limits<int16_t>::min()) {
+      output_value = std::numeric_limits<int16_t>::min();
+    }
+    if (output_value > std::numeric_limits<int16_t>::max()) {
+      output_value = std::numeric_limits<int16_t>::max();
+    }
+    g_audio_output_buffer[i] = output_value;
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78469f2b7d78d70caaf0f890970d7ff666a3c452
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/command_responder.h"
+
+#include "am_bsp.h"  // NOLINT
+
+// This implementation will light up the LEDs on the board in response to
+// different commands.
+void RespondToCommand(tflite::ErrorReporter* error_reporter,
+                      int32_t current_time, const char* found_command,
+                      uint8_t score, bool is_new_command) {
+  static bool is_initialized = false;
+  if (!is_initialized) {
+    // Setup LED's as outputs
+    am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_RED, g_AM_HAL_GPIO_OUTPUT_12);
+    am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_BLUE, g_AM_HAL_GPIO_OUTPUT_12);
+    am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_GREEN, g_AM_HAL_GPIO_OUTPUT_12);
+    am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_YELLOW, g_AM_HAL_GPIO_OUTPUT_12);
+    is_initialized = true;
+  }
+  static int count = 0;
+
+  // Toggle the blue LED every time an inference is performed.
+  ++count;
+  if (count & 1) {
+    am_hal_gpio_output_set(AM_BSP_GPIO_LED_BLUE);
+  } else {
+    am_hal_gpio_output_clear(AM_BSP_GPIO_LED_BLUE);
+  }
+
+  // Turn on the yellow LED if 'yes' was heard.
+  am_hal_gpio_output_clear(AM_BSP_GPIO_LED_RED);
+  am_hal_gpio_output_clear(AM_BSP_GPIO_LED_YELLOW);
+  am_hal_gpio_output_clear(AM_BSP_GPIO_LED_GREEN);
+  if (is_new_command) {
+    error_reporter->Report("Heard %s (%d) @%dms", found_command, score,
+                           current_time);
+    if (found_command[0] == 'y') {
+      am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW);
+    }
+    if (found_command[0] == 'n') {
+      am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED);
+    }
+    if (found_command[0] == 'u') {
+      am_hal_gpio_output_set(AM_BSP_GPIO_LED_GREEN);
+    }
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5f6ceb3f0b3935d084fa9463c72e98d4e0cad83
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
@@ -0,0 +1,1800 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+const int g_yes_1000ms_sample_data_size = 16000;
+const int16_t g_yes_1000ms_sample_data[16000] = {
+    -7,     -12,    -18,    -20,    -20,    -21,    -21,    -25,    -29,
+    -31,    -31,    -30,    -30,    -29,    -30,    -30,    -29,    -28,
+    -24,    -22,    -17,    -12,    -8,     -7,     -6,     -1,     2,
+    5,      7,      8,      11,     15,     18,     19,     23,     24,
+    24,     27,     27,     26,     25,     28,     30,     32,     33,
+    31,     29,     27,     28,     30,     28,     26,     26,     24,
+    22,     17,     16,     15,     13,     10,     5,      0,      -4,
+    -4,     -7,     -9,     -12,    -14,    -14,    -13,    -11,    -10,
+    -8,     -6,     -3,     3,      7,      8,      12,     15,     18,
+    21,     19,     19,     21,     23,     24,     23,     22,     19,
+    17,     11,     5,      -3,     -12,    -22,    -28,    -35,    -45,
+    -54,    -62,    -69,    -76,    -84,    -92,    -100,   -109,   -116,
+    -117,   -120,   -120,   -120,   -122,   -124,   -126,   -123,   -121,
+    -116,   -113,   -107,   -97,    -88,    -75,    -61,    -50,    -41,
+    -27,    -12,    4,      21,     37,     58,     76,     93,     108,
+    121,    137,    156,    172,    184,    196,    205,    215,    224,
+    235,    242,    245,    242,    240,    238,    231,    223,    214,
+    205,    195,    178,    158,    135,    112,    90,     69,     46,
+    19,     -11,    -45,    -76,    -105,   -133,   -159,   -186,   -211,
+    -236,   -260,   -280,   -294,   -308,   -320,   -331,   -336,   -338,
+    -335,   -326,   -316,   -301,   -286,   -267,   -246,   -225,   -203,
+    -180,   -154,   -124,   -91,    -59,    -34,    -8,     19,     42,
+    64,     87,     103,    119,    134,    148,    162,    174,    182,
+    188,    190,    189,    187,    184,    180,    177,    171,    162,
+    154,    144,    137,    129,    118,    106,    95,     81,     69,
+    58,     48,     37,     26,     14,     3,      -7,     -22,    -31,
+    -42,    -52,    -62,    -69,    -75,    -79,    -82,    -87,    -88,
+    -92,    -94,    -91,    -87,    -85,    -81,    -74,    -70,    -64,
+    -55,    -47,    -40,    -33,    -25,    -19,    -12,    -6,     -4,
+    -1,     1,      1,      -2,     -9,     -15,    -17,    -18,    -20,
+    -22,    -22,    -26,    -31,    -33,    -35,    -31,    -26,    -17,
+    -4,     8,      19,     31,     44,     54,     64,     71,     79,
+    86,     92,     102,    109,    111,    109,    104,    96,     84,
+    70,     60,     51,     38,     27,     13,     4,      -3,     -9,
+    -13,    -18,    -26,    -33,    -32,    -27,    -20,    -10,    -4,
+    2,      6,      10,     14,     16,     21,     25,     29,     31,
+    33,     35,     37,     33,     22,     15,     13,     11,     12,
+    9,      5,      2,      1,      -3,     -9,     -17,    -27,    -32,
+    -35,    -36,    -36,    -42,    -50,    -56,    -66,    -77,    -85,
+    -96,    -100,   -106,   -113,   -118,   -121,   -119,   -117,   -119,
+    -122,   -124,   -123,   -112,   -94,    -77,    -64,    -51,    -37,
+    -22,    -3,     17,     37,     54,     68,     86,     100,    114,
+    134,    154,    167,    174,    178,    182,    189,    189,    187,
+    185,    179,    177,    174,    171,    157,    138,    123,    108,
+    94,     76,     50,     25,     6,      -8,     -20,    -37,    -59,
+    -86,    -110,   -132,   -147,   -159,   -169,   -178,   -191,   -203,
+    -213,   -217,   -215,   -208,   -199,   -194,   -195,   -190,   -178,
+    -165,   -155,   -144,   -134,   -123,   -103,   -80,    -56,    -35,
+    -18,    -4,     11,     23,     36,     50,     65,     78,     93,
+    111,    122,    129,    132,    131,    127,    125,    126,    126,
+    128,    127,    125,    122,    118,    111,    108,    104,    99,
+    93,     89,     90,     87,     82,     78,     75,     68,     65,
+    67,     69,     66,     61,     54,     39,     28,     15,     3,
+    -7,     -18,    -25,    -29,    -35,    -42,    -52,    -66,    -78,
+    -83,    -85,    -86,    -86,    -82,    -83,    -84,    -83,    -81,
+    -75,    -62,    -57,    -53,    -49,    -46,    -41,    -34,    -26,
+    -16,    -10,    -7,     -2,     2,      6,      12,     15,     19,
+    18,     15,     17,     21,     24,     30,     33,     27,     22,
+    21,     20,     23,     24,     21,     15,     13,     8,      3,
+    1,      -1,     -3,     -4,     -6,     -9,     -11,    -11,    -8,
+    -10,    -13,    -15,    -19,    -17,    -11,    -2,     1,      2,
+    6,      9,      10,     12,     13,     9,      8,      10,     13,
+    20,     18,     13,     10,     4,      1,      -2,     -6,     -11,
+    -13,    -16,    -18,    -15,    -18,    -21,    -21,    -22,    -23,
+    -25,    -23,    -22,    -20,    -19,    -16,    -12,    -10,    -9,
+    -11,    -15,    -19,    -22,    -19,    -14,    -11,    -9,     -11,
+    -17,    -20,    -18,    -19,    -15,    -11,    -8,     -2,     8,
+    19,     30,     36,     37,     36,     38,     45,     57,     69,
+    77,     81,     79,     75,     76,     74,     69,     66,     60,
+    53,     45,     36,     28,     22,     17,     10,     0,      -5,
+    -11,    -15,    -18,    -26,    -31,    -33,    -34,    -34,    -35,
+    -37,    -37,    -35,    -28,    -24,    -29,    -37,    -45,    -46,
+    -41,    -36,    -31,    -32,    -33,    -37,    -37,    -36,    -36,
+    -34,    -27,    -19,    -14,    -11,    -8,     -1,     6,      14,
+    19,     21,     25,     30,     34,     38,     38,     33,     26,
+    22,     19,     20,     18,     17,     15,     10,     2,      -3,
+    -5,     -10,    -13,    -13,    -13,    -16,    -16,    -16,    -15,
+    -13,    -14,    -13,    -16,    -19,    -20,    -18,    -17,    -18,
+    -16,    -16,    -24,    -28,    -28,    -28,    -23,    -21,    -21,
+    -20,    -24,    -27,    -23,    -18,    -14,    -7,     4,      11,
+    15,     19,     21,     25,     33,     39,     41,     45,     47,
+    50,     56,     58,     57,     59,     59,     55,     50,     47,
+    39,     34,     30,     24,     18,     11,     8,      3,      0,
+    -3,     -8,     -14,    -15,    -13,    -13,    -12,    -14,    -17,
+    -17,    -12,    -10,    -4,     -7,     -12,    -10,    -14,    -17,
+    -17,    -19,    -25,    -28,    -27,    -29,    -30,    -31,    -35,
+    -38,    -43,    -47,    -51,    -52,    -50,    -49,    -48,    -47,
+    -45,    -39,    -32,    -30,    -31,    -35,    -35,    -31,    -24,
+    -17,    -12,    -11,    -14,    -15,    -17,    -16,    -9,     -5,
+    -3,     -1,     0,      1,      0,      3,      12,     21,     26,
+    33,     35,     38,     45,     50,     53,     53,     54,     58,
+    61,     64,     69,     67,     66,     64,     58,     54,     51,
+    46,     44,     45,     41,     35,     31,     27,     25,     27,
+    25,     20,     13,     12,     16,     17,     17,     12,     7,
+    3,      2,      -2,     -4,     -8,     -14,    -19,    -25,    -29,
+    -38,    -49,    -60,    -69,    -73,    -71,    -74,    -82,    -89,
+    -98,    -103,   -104,   -103,   -99,    -98,    -98,    -98,    -99,
+    -97,    -94,    -91,    -85,    -82,    -78,    -74,    -74,    -71,
+    -68,    -61,    -54,    -52,    -47,    -41,    -36,    -32,    -21,
+    -12,    -3,     11,     26,     36,     44,     48,     55,     64,
+    77,     92,     100,    108,    117,    120,    122,    128,    130,
+    129,    130,    127,    124,    122,    121,    118,    114,    110,
+    102,    92,     85,     80,     77,     68,     55,     46,     39,
+    36,     34,     31,     27,     15,     5,      -1,     -5,     -11,
+    -20,    -29,    -37,    -43,    -46,    -47,    -54,    -61,    -65,
+    -74,    -82,    -84,    -91,    -94,    -96,    -104,   -109,   -111,
+    -111,   -112,   -113,   -111,   -112,   -110,   -104,   -99,    -96,
+    -93,    -89,    -87,    -81,    -71,    -63,    -54,    -45,    -43,
+    -37,    -30,    -24,    -17,    -12,    -8,     -2,     2,      15,
+    23,     28,     35,     41,     42,     44,     52,     58,     66,
+    74,     78,     80,     82,     85,     88,     90,     92,     92,
+    88,     87,     87,     79,     73,     69,     64,     62,     55,
+    50,     45,     41,     36,     29,     24,     20,     16,     12,
+    8,      5,      2,      1,      1,      0,      1,      -4,     -4,
+    -4,     -4,     -1,     1,      2,      1,      -3,     -6,     -1,
+    5,      6,      7,      8,      4,      2,      0,      -2,     -3,
+    0,      -3,     -4,     -3,     -4,     -5,     -8,     -15,    -20,
+    -25,    -28,    -32,    -37,    -38,    -39,    -43,    -48,    -55,
+    -62,    -69,    -75,    -75,    -78,    -81,    -83,    -89,    -89,
+    -92,    -91,    -91,    -89,    -83,    -81,    -74,    -66,    -63,
+    -54,    -45,    -39,    -31,    -23,    -15,    -4,     6,      14,
+    23,     29,     35,     41,     45,     49,     55,     61,     69,
+    75,     75,     76,     75,     74,     74,     73,     74,     72,
+    69,     69,     65,     62,     57,     52,     44,     35,     33,
+    29,     24,     14,     7,      3,      -4,     -12,    -17,    -20,
+    -22,    -27,    -32,    -34,    -39,    -42,    -43,    -42,    -43,
+    -40,    -38,    -36,    -36,    -37,    -36,    -33,    -31,    -27,
+    -24,    -23,    -22,    -17,    -11,    -7,     -7,     -7,     -3,
+    5,      13,     19,     25,     27,     25,     27,     35,     40,
+    40,     41,     45,     47,     50,     54,     52,     50,     45,
+    43,     44,     40,     34,     28,     24,     18,     11,     6,
+    -2,     -9,     -14,    -21,    -27,    -35,    -39,    -43,    -50,
+    -57,    -62,    -66,    -68,    -71,    -72,    -73,    -74,    -76,
+    -76,    -77,    -75,    -75,    -74,    -67,    -61,    -55,    -49,
+    -45,    -40,    -30,    -21,    -11,    -4,     4,      13,     23,
+    34,     44,     52,     59,     65,     70,     77,     84,     87,
+    88,     90,     91,     90,     89,     85,     80,     75,     72,
+    71,     64,     56,     48,     41,     34,     27,     21,     12,
+    1,      -11,    -19,    -28,    -33,    -39,    -46,    -50,    -53,
+    -58,    -63,    -66,    -71,    -73,    -76,    -76,    -74,    -73,
+    -71,    -67,    -65,    -62,    -60,    -55,    -51,    -45,    -39,
+    -35,    -31,    -27,    -20,    -13,    -6,     -3,     1,      8,
+    12,     18,     24,     26,     30,     35,     38,     44,     47,
+    47,     51,     53,     52,     53,     52,     50,     51,     49,
+    50,     51,     50,     48,     48,     45,     43,     42,     37,
+    34,     31,     31,     30,     26,     24,     21,     15,     12,
+    11,     7,      4,      1,      -3,     -5,     -7,     -9,     -15,
+    -21,    -26,    -28,    -31,    -35,    -39,    -46,    -48,    -49,
+    -53,    -58,    -63,    -67,    -69,    -71,    -72,    -74,    -75,
+    -77,    -77,    -73,    -72,    -69,    -65,    -60,    -55,    -50,
+    -47,    -43,    -38,    -30,    -25,    -20,    -12,    -4,     4,
+    9,      16,     20,     24,     28,     35,     43,     50,     58,
+    61,     65,     72,     74,     74,     76,     79,     78,     76,
+    78,     76,     76,     74,     70,     64,     59,     52,     46,
+    41,     33,     26,     19,     12,     5,      -2,     -8,     -15,
+    -20,    -26,    -31,    -37,    -39,    -41,    -44,    -44,    -47,
+    -51,    -52,    -52,    -48,    -45,    -46,    -48,    -45,    -42,
+    -40,    -36,    -32,    -27,    -24,    -22,    -18,    -16,    -11,
+    -10,    -5,     0,      3,      8,      11,     16,     18,     21,
+    23,     25,     26,     27,     28,     30,     31,     31,     30,
+    29,     27,     26,     23,     19,     17,     13,     10,     6,
+    0,      -2,     -5,     -10,    -12,    -15,    -19,    -23,    -26,
+    -29,    -30,    -30,    -32,    -33,    -34,    -35,    -34,    -31,
+    -29,    -29,    -28,    -28,    -23,    -19,    -17,    -12,    -12,
+    -10,    -5,     -2,     3,      7,      10,     13,     14,     19,
+    22,     26,     31,     34,     34,     35,     36,     39,     43,
+    45,     47,     47,     48,     49,     51,     48,     47,     50,
+    45,     41,     41,     38,     34,     34,     30,     23,     17,
+    11,     7,      4,      -4,     -9,     -15,    -23,    -28,    -32,
+    -35,    -39,    -45,    -46,    -49,    -53,    -52,    -53,    -55,
+    -56,    -56,    -55,    -54,    -53,    -53,    -51,    -47,    -44,
+    -42,    -40,    -37,    -33,    -28,    -25,    -23,    -18,    -15,
+    -8,     -6,     -2,     3,      8,      15,     18,     23,     26,
+    27,     32,     36,     36,     36,     39,     38,     38,     40,
+    39,     35,     31,     29,     25,     23,     19,     15,     11,
+    7,      5,      3,      1,      -1,     -6,     -8,     -7,     -10,
+    -9,     -10,    -11,    -10,    -7,     -6,     -8,     -6,     -5,
+    -4,     1,      2,      4,      7,      7,      9,      11,     11,
+    9,      9,      10,     11,     13,     17,     15,     15,     15,
+    17,     19,     17,     17,     17,     15,     15,     13,     11,
+    12,     8,      7,      5,      3,      0,      -4,     -4,     -6,
+    -9,     -12,    -14,    -15,    -15,    -16,    -20,    -19,    -20,
+    -20,    -20,    -18,    -18,    -21,    -22,    -21,    -21,    -23,
+    -20,    -20,    -23,    -24,    -23,    -25,    -25,    -25,    -25,
+    -26,    -24,    -23,    -23,    -23,    -23,    -22,    -19,    -18,
+    -15,    -14,    -10,    -8,     -4,     -1,     1,      3,      6,
+    8,      9,      14,     19,     22,     24,     26,     29,     32,
+    31,     34,     39,     42,     42,     46,     49,     50,     50,
+    52,     53,     52,     49,     49,     48,     48,     46,     45,
+    40,     34,     30,     25,     21,     17,     13,     10,     6,
+    2,      -4,     -9,     -12,    -15,    -18,    -21,    -26,    -28,
+    -31,    -32,    -33,    -35,    -35,    -38,    -37,    -36,    -34,
+    -35,    -35,    -33,    -33,    -34,    -30,    -26,    -27,    -25,
+    -23,    -22,    -18,    -15,    -16,    -12,    -9,     -9,     -6,
+    -1,     2,      3,      5,      8,      7,      9,      12,     15,
+    17,     18,     18,     19,     18,     20,     19,     18,     21,
+    20,     19,     18,     16,     15,     15,     15,     14,     12,
+    9,      9,      10,     8,      6,      4,      2,      1,      -1,
+    -3,     -1,     -3,     -2,     -4,     -5,     -5,     -8,     -8,
+    -10,    -10,    -8,     -8,     -8,     -7,     -8,     -8,     -8,
+    -9,     -11,    -12,    -11,    -9,     -7,     -8,     -8,     -8,
+    -10,    -8,     -7,     -8,     -7,     -6,     -7,     -5,     -3,
+    -3,     -3,     -3,     -2,     0,      3,      3,      5,      7,
+    10,     11,     10,     10,     12,     13,     16,     16,     16,
+    17,     15,     16,     17,     16,     14,     16,     13,     11,
+    11,     9,      9,      6,      4,      4,      3,      0,      -2,
+    -4,     -7,     -7,     -7,     -13,    -15,    -13,    -14,    -16,
+    -15,    -15,    -17,    -16,    -16,    -18,    -19,    -19,    -20,
+    -19,    -16,    -15,    -13,    -12,    -10,    -7,     -6,     -4,
+    -4,     -2,     0,      2,      6,      8,      10,     12,     14,
+    15,     14,     13,     13,     13,     15,     15,     17,     17,
+    17,     18,     17,     16,     15,     15,     14,     11,     9,
+    8,      8,      9,      8,      5,      5,      3,      -1,     -1,
+    -4,     -5,     -7,     -8,     -8,     -8,     -9,     -10,    -8,
+    -11,    -12,    -12,    -12,    -12,    -13,    -11,    -11,    -9,
+    -8,     -7,     -8,     -7,     -6,     -7,     -6,     -5,     -4,
+    -4,     -2,     -2,     -3,     -2,     -2,     -3,     0,      -1,
+    -3,     1,      1,      2,      4,      3,      5,      6,      3,
+    3,      4,      3,      3,      4,      5,      4,      6,      7,
+    7,      7,      6,      3,      3,      5,      3,      3,      6,
+    6,      7,      6,      4,      5,      2,      1,      1,      0,
+    0,      2,      1,      1,      1,      -1,     -2,     -3,     -5,
+    -4,     -5,     -4,     -4,     -6,     -4,     -4,     -4,     -5,
+    -6,     -5,     -6,     -5,     -4,     -5,     -4,     -3,     -4,
+    0,      2,      2,      2,      2,      2,      2,      3,      3,
+    5,      6,      6,      5,      6,      7,      6,      8,      6,
+    5,      5,      5,      6,      6,      6,      5,      5,      2,
+    2,      1,      2,      0,      -1,     -1,     -1,     -1,     0,
+    -1,     -4,     -6,     -8,     -8,     -9,     -8,     -7,     -6,
+    -5,     -5,     -6,     -3,     -4,     -5,     -4,     -7,     -6,
+    -4,     -2,     -1,     -1,     1,      1,      1,      1,      1,
+    2,      2,      1,      3,      4,      4,      6,      6,      6,
+    6,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      1,      1,      1,      0,      1,      1,      0,      -2,
+    -2,     -3,     -3,     -3,     -3,     -5,     -4,     -3,     -5,
+    -5,     -3,     -5,     -4,     -4,     -2,     -2,     -2,     -1,
+    -3,     -2,     -2,     -1,     -3,     -2,     -1,     -2,     -2,
+    -2,     0,      0,      0,      0,      0,      1,      0,      0,
+    1,      2,      3,      3,      3,      4,      5,      4,      3,
+    4,      5,      5,      7,      7,      6,      9,      8,      6,
+    7,      8,      6,      5,      7,      8,      8,      8,      7,
+    6,      5,      4,      4,      4,      5,      4,      2,      1,
+    2,      1,      0,      -2,     -3,     -2,     -4,     -6,     -6,
+    -7,     -7,     -8,     -9,     -9,     -9,     -9,     -9,     -9,
+    -9,     -10,    -10,    -10,    -8,     -7,     -8,     -6,     -5,
+    -4,     -3,     -5,     -2,     -2,     -2,     -1,     -1,     0,
+    1,      1,      2,      3,      2,      4,      3,      3,      5,
+    3,      3,      5,      4,      5,      6,      5,      4,      5,
+    3,      2,      2,      3,      4,      4,      4,      4,      4,
+    3,      4,      4,      4,      3,      2,      2,      2,      2,
+    2,      2,      2,      2,      1,      1,      1,      2,      1,
+    1,      2,      1,      1,      2,      1,      1,      1,      -1,
+    0,      1,      0,      -1,     1,      -1,     -1,     -1,     -2,
+    -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -2,
+    -1,     0,      -1,     -1,     1,      1,      2,      0,      -1,
+    0,      -1,     -1,     0,      0,      1,      2,      2,      2,
+    1,      1,      0,      0,      0,      0,      1,      1,      0,
+    0,      0,      0,      0,      -1,     -2,     -1,     -3,     -4,
+    -4,     -4,     -4,     -4,     -4,     -4,     -3,     -3,     -5,
+    -6,     -4,     -2,     -2,     -1,     -1,     -1,     -2,     1,
+    -1,     1,      0,      0,      1,      1,      1,      1,      2,
+    1,      2,      2,      3,      3,      3,      3,      4,      5,
+    5,      5,      5,      5,      5,      5,      5,      6,      6,
+    5,      5,      5,      6,      6,      5,      3,      6,      5,
+    4,      5,      3,      2,      2,      2,      2,      1,      1,
+    2,      0,      -1,     0,      -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -3,     -4,     -4,     -5,
+    -6,     -6,     -6,     -6,     -6,     -6,     -5,     -5,     -6,
+    -5,     -4,     -4,     -4,     -4,     -2,     -2,     -2,     -1,
+    -2,     0,      1,      0,      1,      3,      4,      4,      4,
+    4,      4,      4,      5,      4,      4,      4,      5,      7,
+    5,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      2,      0,      1,      1,      0,      1,      1,      -1,
+    0,      -1,     -2,     -1,     -3,     -4,     -4,     -3,     -5,
+    -5,     -5,     -5,     -5,     -5,     -4,     -3,     -3,     -2,
+    -3,     -2,     -2,     -5,     -3,     -3,     -3,     -2,     0,
+    1,      1,      1,      1,      1,      1,      1,      1,      3,
+    3,      4,      4,      4,      4,      5,      5,      2,      3,
+    4,      3,      5,      4,      3,      4,      3,      3,      5,
+    5,      3,      4,      2,      1,      1,      3,      4,      3,
+    1,      3,      2,      1,      2,      1,      0,      1,      0,
+    1,      0,      1,      1,      1,      1,      0,      -1,     0,
+    0,      -1,     -1,     -2,     -1,     -1,     -2,     0,      -1,
+    -2,     -1,     -1,     -2,     -2,     -1,     -3,     -3,     -3,
+    -3,     -3,     -4,     -3,     -5,     -6,     -4,     -4,     -5,
+    -4,     -3,     -5,     -6,     -4,     -5,     -6,     -4,     -3,
+    -5,     -4,     -3,     -4,     -3,     -2,     -2,     -2,     0,
+    0,      1,      1,      0,      0,      0,      1,      1,      3,
+    3,      3,      4,      3,      3,      3,      3,      3,      3,
+    3,      3,      3,      3,      3,      3,      3,      3,      3,
+    1,      1,      1,      1,      1,      1,      1,      0,      0,
+    0,      1,      -2,     -1,     1,      0,      -1,     -2,     -2,
+    0,      1,      0,      1,      1,      1,      1,      0,      0,
+    1,      0,      0,      2,      1,      0,      1,      1,      1,
+    1,      3,      3,      3,      4,      3,      3,      4,      2,
+    2,      2,      2,      2,      2,      2,      1,      2,      2,
+    2,      2,      -1,     -1,     -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -5,     -4,     -5,     -5,
+    -5,     -5,     -7,     -7,     -7,     -8,     -7,     -8,     -7,
+    -8,     -8,     -7,     -8,     -8,     -8,     -8,     -7,     -6,
+    -6,     -6,     -7,     -6,     -6,     -5,     -5,     -3,     -2,
+    -2,     -1,     0,      -1,     0,      1,      2,      2,      3,
+    3,      3,      6,      7,      7,      7,      8,      9,      8,
+    10,     10,     9,      10,     11,     9,      10,     12,     11,
+    10,     9,      9,      9,      9,      10,     9,      6,      6,
+    5,      5,      6,      3,      1,      1,      0,      1,      0,
+    0,      1,      -1,     -2,     -2,     -1,     -3,     -3,     -2,
+    -4,     -4,     -3,     -2,     -4,     -4,     -4,     -5,     -3,
+    -3,     -5,     -3,     -3,     -5,     -4,     -2,     -2,     -3,
+    -3,     -1,     0,      -1,     0,      0,      0,      -2,     -1,
+    0,      -1,     -2,     -2,     -2,     -2,     -1,     -3,     -2,
+    -3,     -4,     -3,     -3,     -3,     -3,     -3,     -3,     -3,
+    -2,     -4,     -6,     -5,     -3,     -2,     -4,     -3,     -2,
+    -4,     -4,     -4,     -3,     -4,     -5,     -4,     -5,     -3,
+    -2,     -5,     -2,     -4,     -4,     -3,     -2,     -1,     -1,
+    -1,     0,      2,      2,      1,      1,      3,      3,      3,
+    3,      4,      4,      5,      6,      5,      5,      6,      7,
+    7,      7,      8,      8,      7,      9,      9,      9,      9,
+    10,     9,      9,      9,      9,      9,      9,      8,      7,
+    9,      9,      6,      7,      5,      2,      3,      2,      1,
+    1,      0,      -2,     -2,     -2,     -3,     -3,     -2,     -2,
+    -4,     -5,     -4,     -4,     -4,     -4,     -5,     -4,     -4,
+    -5,     -4,     -5,     -4,     -5,     -6,     -4,     -4,     -5,
+    -5,     -5,     -5,     -6,     -4,     -4,     -4,     -3,     -2,
+    -3,     -3,     -2,     -2,     -1,     -2,     -3,     -1,     0,
+    -1,     0,      0,      0,      0,      1,      0,      0,      0,
+    0,      -1,     1,      1,      1,      0,      -2,     -2,     -3,
+    -3,     -4,     -4,     -6,     -7,     -5,     -4,     -5,     -5,
+    -4,     -6,     -8,     -7,     -6,     -5,     -5,     -5,     -4,
+    -4,     -5,     -4,     -3,     -3,     0,      0,      -2,     -1,
+    0,      0,      1,      1,      2,      2,      2,      2,      2,
+    4,      5,      5,      5,      6,      7,      7,      9,      10,
+    10,     10,     12,     12,     13,     14,     14,     14,     15,
+    15,     15,     15,     15,     15,     14,     15,     15,     12,
+    13,     13,     12,     10,     11,     11,     11,     10,     8,
+    6,      5,      7,      6,      6,      4,      3,      4,      5,
+    3,      2,      2,      1,      1,      2,      3,      1,      0,
+    0,      1,      0,      -2,     -1,     -2,     -3,     -3,     -3,
+    -3,     -4,     -6,     -8,     -9,     -9,     -10,    -12,    -14,
+    -15,    -18,    -21,    -21,    -21,    -21,    -22,    -24,    -26,
+    -26,    -27,    -27,    -28,    -26,    -25,    -26,    -28,    -27,
+    -24,    -23,    -23,    -24,    -21,    -17,    -17,    -15,    -12,
+    -12,    -12,    -12,    -9,     -7,     -6,     -5,     -3,     -3,
+    -2,     0,      0,      1,      3,      7,      6,      4,      6,
+    7,      8,      11,     10,     10,     13,     15,     14,     13,
+    18,     20,     18,     19,     21,     23,     24,     23,     22,
+    24,     26,     26,     26,     27,     25,     23,     25,     27,
+    28,     28,     28,     23,     19,     23,     24,     20,     20,
+    21,     15,     13,     15,     16,     14,     11,     8,      7,
+    8,      11,     11,     6,      4,      8,      7,      6,      7,
+    6,      4,      7,      13,     12,     7,      8,      8,      4,
+    1,      1,      1,      2,      -4,     -12,    -18,    -24,    -25,
+    -25,    -32,    -41,    -55,    -59,    -61,    -75,    -87,    -96,
+    -109,   -122,   -133,   -141,   -148,   -157,   -168,   -180,   -191,
+    -198,   -202,   -207,   -206,   -207,   -211,   -211,   -208,   -203,
+    -189,   -171,   -153,   -132,   -114,   -96,    -75,    -54,    -30,
+    -5,     19,     43,     61,     77,     93,     106,    123,    143,
+    161,    182,    198,    202,    201,    209,    229,    242,    240,
+    235,    239,    249,    258,    255,    242,    233,    245,    268,
+    278,    256,    223,    223,    253,    263,    235,    198,    178,
+    188,    215,    230,    200,    143,    113,    128,    158,    158,
+    128,    99,     90,     82,     70,     56,     32,     7,      14,
+    46,     36,     -23,    -71,    -76,    -54,    -36,    -39,    -74,
+    -118,   -134,   -122,   -101,   -104,   -129,   -164,   -174,   -129,
+    -86,    -109,   -184,   -219,   -191,   -147,   -141,   -183,   -249,
+    -290,   -269,   -236,   -266,   -346,   -394,   -366,   -325,   -353,
+    -431,   -472,   -406,   -313,   -316,   -398,   -449,   -401,   -287,
+    -194,   -164,   -193,   -245,   -212,   -55,    75,     67,     26,
+    67,     165,    237,    269,    293,    319,    333,    368,    414,
+    432,    463,    488,    448,    404,    391,    377,    361,    365,
+    376,    308,    197,    150,    129,    73,     53,     91,     43,
+    -107,   -165,   -54,    1,      -148,   -312,   -273,   -125,   -62,
+    -128,   -258,   -294,   -141,   70,     57,     -217,   -378,   -145,
+    198,    289,    169,    -47,    -219,   -101,   264,    458,    217,
+    -163,   -199,   13,     121,    101,    -51,    -293,   -319,   -62,
+    24,     -274,   -474,   -296,   -170,   -336,   -422,   -285,   -248,
+    -302,   -130,   98,     -11,    -257,   -146,   184,    278,    264,
+    331,    192,    -35,    235,    805,    830,    315,    82,     322,
+    503,    522,    619,    557,    242,    163,    399,    507,    489,
+    618,    602,    156,    -164,   112,    476,    406,    94,     -154,
+    -242,   -132,   56,     5,      -325,   -566,   -527,   -478,   -624,
+    -692,   -561,   -551,   -744,   -836,   -671,   -520,   -626,   -736,
+    -647,   -581,   -639,   -687,   -702,   -739,   -665,   -383,   -236,
+    -414,   -513,   -321,   -114,   -43,    32,     65,     -98,    -236,
+    34,     608,    924,    680,    218,    56,     329,    847,    1214,
+    1006,   341,    11,     340,    667,    553,    353,    355,    415,
+    416,    364,    257,    108,    6,      113,    293,    233,    46,
+    4,      25,     -10,    -12,    55,     40,     -65,    -56,    -26,
+    -101,   -61,    143,    229,    78,     -161,   -210,   103,    424,
+    377,    86,     -274,   -491,   -328,   -37,    60,     128,    188,
+    -105,   -625,   -823,   -464,   138,    389,    111,    -343,   -526,
+    -306,   13,     205,    250,    -35,    -554,   -764,   -498,   -42,
+    167,    -210,   -639,   -448,   -101,   -110,   -171,   -74,    -39,
+    47,     424,    616,    324,    98,     367,    853,    942,    416,
+    -184,   -130,   339,    472,    369,    239,    -165,   -418,   101,
+    742,    659,    325,    365,    476,    233,    -14,    270,    785,
+    719,    -29,    -533,   -220,   237,    305,    179,    -190,   -644,
+    -610,   -380,   -526,   -601,   -237,   48,     -36,    -124,   -49,
+    -6,     23,     117,    55,     -199,   -428,   -512,   -338,   -238,
+    -424,   -323,   -135,   -464,   -657,   -189,   100,    -379,   -964,
+    -893,   -346,   -64,    -322,   -650,   -480,   32,     238,    201,
+    386,    616,    611,    400,    195,    357,    842,    1051,   832,
+    712,    829,    1070,   1307,   1081,   551,    363,    544,    623,
+    239,    -374,   -609,   -230,   375,    486,    -52,    -446,   -270,
+    181,    645,    601,    -135,   -654,   -256,   567,    840,    380,
+    -54,    18,     334,    386,    21,     -214,   83,     243,    -316,
+    -937,   -1074,  -1006,  -896,   -674,   -424,   -331,   -354,   -380,
+    -481,   -392,   80,     358,    171,    -170,   -624,   -796,   -130,
+    706,    803,    381,    152,    367,    620,    685,    655,    347,
+    36,     180,    417,    412,    358,    288,    189,    150,    16,
+    -240,   -428,   -428,   -266,   -335,   -819,   -1150,  -946,   -587,
+    -437,   -580,   -961,   -1218,  -1065,  -704,   -431,   -350,   -315,
+    -214,   -162,   -81,    26,     -8,     -52,    -117,   -226,   -40,
+    285,    241,    -2,     -69,    57,     207,    81,     -144,   -69,
+    65,     84,     49,     -168,   -248,   126,    502,    472,    192,
+    120,    442,    667,    551,    512,    634,    814,    1014,   1098,
+    1156,   1112,   974,    1144,   1330,   1099,   825,    847,    877,
+    555,    2,      -243,   -102,   -196,   -471,   -377,   -235,   -439,
+    -622,   -547,   -470,   -495,   -431,   -197,   -21,    21,     -9,
+    -246,   -438,   -238,   -31,    0,      96,     137,    -25,    -211,
+    -181,   -149,   -350,   -368,   -33,    21,     -308,   -323,   32,
+    379,    605,    531,    85,     -374,   -367,   9,      277,    147,
+    -356,   -698,   -494,   -140,   -126,   -354,   -549,   -673,   -642,
+    -428,   -269,   -273,   -246,   -216,   -349,   -323,   -16,    32,
+    -387,   -742,   -662,   -434,   -223,   41,     140,    -58,    -227,
+    -80,    93,     20,     -166,   -360,   -536,   -555,   -305,   -33,
+    -23,    -86,    -75,    -9,     82,     -1,     -156,   24,     532,
+    916,    956,    835,    901,    1127,   1279,   1417,   1435,   1144,
+    822,    862,    1214,   1352,   1001,   611,    539,    532,    369,
+    189,    170,    308,    465,    430,    232,    64,     14,     51,
+    -37,    -244,   -321,   -276,   -144,   57,     77,     -215,   -467,
+    -335,   -186,   -245,   -133,   -81,    -588,   -1130,  -959,   -520,
+    -631,   -1122,  -1270,  -971,   -873,   -1118,  -1157,  -1078,  -1296,
+    -1365,  -1010,  -873,   -1138,  -1061,  -379,   89,     51,     177,
+    372,    185,    -14,    63,     197,    125,    -123,   -60,    243,
+    195,    88,     201,    115,    -63,    -12,    -79,    -492,   -751,
+    -489,   49,     163,    -293,   -424,   -52,    229,    302,    212,
+    217,    315,    70,     -207,   -210,   -173,   129,    619,    556,
+    213,    181,    170,    112,    167,    322,    451,    206,    -136,
+    58,     426,    526,    524,    394,    387,    568,    481,    297,
+    164,    8,      263,    664,    777,    943,    989,    934,    1283,
+    1495,   1153,   861,    738,    582,    614,    692,    655,    629,
+    432,    127,    -119,   -338,   -313,   -138,   -204,   -561,   -994,
+    -1168,  -948,   -700,   -658,   -788,   -1053,  -1027,  -684,   -566,
+    -528,   -355,   -335,   -323,   -28,    206,    87,     56,     387,
+    585,    296,    24,     261,    492,    248,    -132,   -469,   -674,
+    -502,   -235,   -255,   -517,   -847,   -1038,  -965,   -707,   -630,
+    -767,   -639,   -298,   -193,   -290,   -310,   -118,   74,     -77,
+    -337,   -324,   -120,   187,    323,    -72,    -552,   -454,   -14,
+    29,     -427,   -803,   -735,   -586,   -762,   -918,   -783,   -649,
+    -723,   -857,   -786,   -626,   -591,   -417,   -83,    167,    262,
+    49,     -161,   157,    842,    1298,   1356,   1206,   1041,   1194,
+    1461,   1323,   1070,   1221,   1687,   2051,   2002,   1673,   1464,
+    1550,   1851,   1907,   1531,   1327,   1399,   1342,   1287,   1264,
+    1152,   1030,   878,    716,    601,    454,    264,    264,    352,
+    151,    -193,   -296,   -161,   -93,    -215,   -423,   -617,   -668,
+    -547,   -416,   -464,   -807,   -1175,  -1174,  -1045,  -1076,  -1023,
+    -829,   -710,   -745,   -1069,  -1443,  -1417,  -1099,  -939,   -1165,
+    -1307,  -1056,  -843,   -638,   -304,   -190,   -334,   -578,   -770,
+    -705,   -675,   -947,   -957,   -565,   -437,   -617,   -843,   -1015,
+    -813,   -489,   -584,   -904,   -1054,  -797,   -229,   -26,    -208,
+    -66,    398,    710,    644,    390,    413,    726,    992,    1204,
+    1337,   1234,   1104,   1038,   1001,   1043,   982,    847,    885,
+    1024,   1098,   1138,   1108,   1038,   966,    885,    882,    878,
+    929,    1005,   944,    1008,   1284,   1415,   1289,   1007,   760,
+    812,    947,    806,    455,    111,    -72,    -290,   -611,   -626,
+    -559,   -765,   -1034,  -1375,  -1632,  -1565,  -1588,  -1728,  -1585,
+    -1477,  -1547,  -1533,  -1371,  -1103,  -995,   -1090,  -1102,  -947,
+    -686,   -403,   -295,   -250,   -107,   -86,    -171,   -150,   12,
+    234,    283,    185,    300,    461,    393,    382,    434,    378,
+    306,    202,    195,    253,    -8,     -307,   -105,   264,    342,
+    212,    34,     -57,    78,     435,    571,    180,    -165,   -51,
+    339,    705,    683,    464,    658,    958,    825,    579,    465,
+    390,    241,    61,     202,    429,    128,    -122,   241,    406,
+    39,     -167,   -60,    15,     -31,    -68,    146,    402,    344,
+    227,    208,    87,     -25,    -31,    -66,    -169,   -249,   -87,
+    75,     -181,   -438,   -249,   49,     87,     -40,    -16,    53,
+    -86,    -74,    98,     78,     110,    169,    -84,    -323,   -251,
+    -102,   -172,   -513,   -750,   -675,   -568,   -587,   -583,   -523,
+    -450,   -302,   -245,   -356,   -480,   -590,   -495,   -183,   -105,
+    -191,   -215,   -308,   -206,   39,     4,      -77,    -21,    74,
+    186,    218,    356,    611,    489,    83,     13,     246,    371,
+    348,    240,    61,     -66,    -107,   -170,   -205,   -74,    200,
+    277,    45,     -11,    180,    263,    100,    -74,    102,    246,
+    6,      -154,   -162,   -197,   -128,   -189,   -227,   -49,    -238,
+    -490,   -333,   -188,   1,      215,    150,    144,    128,    -33,
+    187,    532,    676,    911,    773,    283,    351,    673,    620,
+    349,    105,    205,    425,    325,    295,    372,    340,    511,
+    628,    394,    224,    187,    91,     -174,   -556,   -482,   -37,
+    -9,     -226,   -382,   -568,   -466,   -208,   -241,   -426,   -656,
+    -814,   -788,   -902,   -1065,  -946,   -860,   -896,   -831,   -744,
+    -672,   -685,   -743,   -723,   -783,   -813,   -570,   -341,   -239,
+    -57,    137,    348,    576,    593,    454,    429,    503,    449,
+    238,    173,    350,    423,    419,    530,    501,    272,    156,
+    207,    295,    404,    568,    676,    419,    30,     113,    463,
+    550,    473,    349,    126,    33,     144,    207,    193,    267,
+    304,    81,     -252,   -401,   -368,   -347,   -404,   -452,   -408,
+    -272,   -40,    234,    281,    48,     -72,    -18,    54,     208,
+    309,    285,    245,    164,    38,     -20,    148,    430,    563,
+    655,    679,    453,    300,    319,    219,    25,     -15,    54,
+    -117,   -444,   -431,   -135,   -147,   -468,   -667,   -722,   -593,
+    -301,   -217,   -428,   -642,   -598,   -400,   -422,   -602,   -628,
+    -554,   -509,   -501,   -541,   -488,   -250,   -129,   -284,   -441,
+    -358,   -161,   -82,    4,      134,    157,    290,    516,    582,
+    702,    859,    871,    858,    759,    615,    616,    754,    839,
+    725,    464,    259,    187,    127,    150,    280,    238,    92,
+    78,     5,      -86,    6,      67,     -14,    -92,    -143,   -211,
+    -89,    213,    300,    107,    -91,    -154,   -153,   -238,   -355,
+    -314,   -227,   -168,   -92,    -142,   -219,   -156,   -47,    53,
+    -15,    -195,   -161,   -186,   -382,   -395,   -297,   -238,   -240,
+    -390,   -502,   -336,   -97,    -29,    -116,   -290,   -289,   -67,
+    74,     112,    119,    182,    358,    382,    315,    341,    290,
+    218,    190,    101,    -51,    -168,   -132,   -41,    -39,    -15,
+    104,    186,    151,    68,     89,     154,    67,     10,     143,
+    120,    -185,   -382,   -365,   -263,   -145,   -111,   -159,   -190,
+    -53,    151,    177,    179,    384,    553,    502,    490,    572,
+    600,    573,    442,    119,    -212,   -260,   -166,   -318,   -506,
+    -413,   -279,   -285,   -354,   -390,   -278,   -142,   -85,    -18,
+    -19,    -121,   -143,   -32,    88,     118,    42,     -96,    -187,
+    -167,   -113,   -172,   -270,   -256,   -178,   -192,   -249,   -128,
+    103,    132,    -47,    -147,   -104,   -56,    -9,     45,     35,
+    109,    315,    381,    326,    336,    457,    667,    786,    675,
+    489,    460,    569,    595,    470,    303,    272,    448,    620,
+    545,    226,    -92,    -128,   91,     172,    -98,    -385,   -378,
+    -264,   -284,   -362,   -314,   -148,   -72,    -198,   -350,   -353,
+    -344,   -389,   -353,   -292,   -327,   -413,   -473,   -519,   -588,
+    -577,   -546,   -737,   -989,   -1030,  -997,   -1010,  -861,   -683,
+    -731,   -690,   -419,   -197,   -47,    112,    167,    74,     41,
+    176,    309,    438,    671,    781,    793,    868,    904,    991,
+    1099,   987,    812,    816,    869,    766,    605,    633,    728,
+    592,    424,    460,    405,    170,    75,     30,     -105,   -58,
+    63,     -58,    -242,   -359,   -415,   -255,   -44,    -127,   -266,
+    -191,   -187,   -296,   -273,   -260,   -341,   -345,   -324,   -384,
+    -467,   -421,   -233,   -125,   -227,   -341,   -256,   -168,   -217,
+    -249,   -302,   -447,   -425,   -274,   -289,   -299,   -229,   -275,
+    -272,   -103,   -57,    -117,   -106,   -162,   -256,   -184,   -31,
+    51,     69,     31,     -19,    72,     256,    318,    331,    254,
+    28,     -7,     121,    48,     -64,    58,     183,    152,    161,
+    201,    167,    190,    287,    278,    157,    56,     103,    332,
+    460,    299,    166,    238,    308,    374,    508,    509,    373,
+    275,    270,    298,    229,    185,    192,    23,     -160,   -80,
+    67,     31,     -170,   -378,   -384,   -330,   -500,   -648,   -615,
+    -686,   -716,   -510,   -510,   -771,   -752,   -475,   -434,   -556,
+    -480,   -403,   -515,   -464,   -255,   -177,   -105,   29,     95,
+    152,    210,    190,    180,    279,    408,    325,    225,    462,
+    607,    537,    759,    1022,   973,    945,    964,    846,    818,
+    952,    907,    584,    313,    302,    428,    533,    479,    260,
+    178,    262,    185,    18,     -77,    -263,   -370,   -208,   -240,
+    -589,   -739,   -572,   -444,   -405,   -357,   -475,   -738,   -771,
+    -542,   -441,   -529,   -651,   -803,   -823,   -556,   -285,   -227,
+    -233,   -202,   -168,   -110,   -78,    -220,   -302,   -56,    129,
+    -60,    -149,   54,     130,    169,    324,    231,    24,     89,
+    269,    320,    262,    231,    225,    138,    67,     153,    310,
+    399,    269,    -21,    -197,   -183,   -59,    144,    234,    -13,
+    -274,   -168,   32,     -37,    -277,   -417,   -441,   -416,   -324,
+    -312,   -467,   -540,   -373,   -166,   -161,   -297,   -365,   -341,
+    -246,   -69,    81,     99,     -3,     11,     305,    540,    449,
+    394,    586,    667,    606,    685,    665,    425,    410,    585,
+    509,    360,    424,    538,    583,    482,    250,    159,    310,
+    423,    217,    -131,   -280,   -204,   -51,    -12,    -204,   -338,
+    -232,   -143,   -201,   -306,   -374,   -336,   -229,   -257,   -453,
+    -576,   -497,   -379,   -326,   -302,   -372,   -504,   -453,   -229,
+    -133,   -226,   -328,   -326,   -261,   -151,   -6,     97,     143,
+    164,    143,    138,    267,    433,    500,    470,    297,    143,
+    279,    504,    556,    475,    333,    233,    225,    228,    198,
+    128,    24,     -17,    4,      -55,    -187,   -251,   -213,   -119,
+    -94,    -214,   -357,   -349,   -246,   -195,   -183,   -261,   -440,
+    -533,   -476,   -341,   -213,   -170,   -220,   -299,   -220,   -8,
+    51,     -11,    19,     172,    292,    189,    9,      -6,     102,
+    238,    384,    477,    448,    353,    304,    354,    473,    543,
+    400,    229,    275,    380,    425,    415,    371,    398,    460,
+    377,    202,    154,    199,    110,    -123,   -365,   -524,   -524,
+    -360,   -134,   -47,    -182,   -348,   -453,   -542,   -503,   -376,
+    -398,   -521,   -595,   -621,   -560,   -439,   -284,   -115,   -80,
+    -123,   -57,    28,     -15,    -60,    -9,     47,     119,    203,
+    288,    435,    571,    635,    706,    750,    627,    436,    345,
+    330,    398,    460,    368,    213,    127,    140,    215,    202,
+    58,     -99,    -244,   -387,   -470,   -527,   -637,   -754,   -791,
+    -768,   -742,   -739,   -735,   -704,   -649,   -552,   -479,   -491,
+    -494,   -454,   -433,   -422,   -398,   -315,   -115,   75,     175,
+    244,    307,    360,    398,    460,    532,    529,    446,    422,
+    497,    541,    504,    541,    702,    803,    744,    645,    621,
+    727,    877,    873,    734,    593,    513,    523,    516,    412,
+    336,    334,    274,    199,    163,    123,    125,    117,    107,
+    140,    72,     -73,    -114,   -68,    -15,    13,     -122,   -338,
+    -367,   -325,   -386,   -497,   -608,   -634,   -546,   -477,   -427,
+    -377,   -412,   -464,   -436,   -343,   -276,   -327,   -390,   -313,
+    -149,   -17,    2,      -93,    -146,   -104,   -76,    -87,    -131,
+    -224,   -280,   -194,   -46,    12,     -76,    -189,   -151,   18,
+    160,    200,    99,     -81,    -149,   -95,    -31,    -6,     -45,
+    -93,    -97,    -71,    0,      73,     34,     -82,    -129,   -102,
+    -84,    -96,    -107,   -69,    -5,     6,      18,     48,     35,
+    27,     32,     -4,     -71,    -30,    119,    205,    266,    352,
+    325,    237,    282,    352,    358,    342,    265,    203,    200,
+    159,    120,    159,    195,    185,    133,    37,     20,     152,
+    312,    363,    316,    255,    251,    259,    211,    160,    86,
+    -4,     -30,    -79,    -154,   -213,   -271,   -243,   -146,   -147,
+    -211,   -283,   -319,   -219,   -157,   -207,   -237,   -252,   -245,
+    -136,   0,      42,     -22,    -108,   -82,    34,     130,    179,
+    152,    98,     105,    110,    116,    180,    175,    66,     -9,
+    -9,     36,     82,     75,     12,     -39,    -14,    23,     1,
+    12,     31,     -61,    -155,   -184,   -158,   -86,    -60,    -67,
+    -63,    -84,    -100,   -81,    -115,   -171,   -157,   -150,   -179,
+    -191,   -209,   -245,   -217,   -128,   -54,    -42,    -73,    -100,
+    -88,    -10,    104,    199,    249,    227,    201,    204,    151,
+    83,     75,     87,     84,     67,     34,     18,     44,     110,
+    218,    275,    232,    190,    209,    263,    294,    256,    174,
+    108,    37,     -54,    -110,   -129,   -179,   -293,   -360,   -339,
+    -282,   -190,   -135,   -188,   -239,   -234,   -227,   -182,   -127,
+    -89,    -51,    -73,    -136,   -151,   -85,    0,      72,     129,
+    122,    65,     44,     103,    202,    272,    252,    170,    148,
+    167,    152,    130,    127,    79,     14,     70,     157,    142,
+    109,    70,     -25,    -57,    -6,     46,     98,     135,    135,
+    82,     16,     10,     68,     87,     -20,    -120,   -116,   -98,
+    -102,   -129,   -204,   -271,   -282,   -252,   -216,   -215,   -221,
+    -156,   -70,    -66,    -120,   -156,   -146,   -126,   -84,    -15,
+    -21,    -76,    -8,     131,    146,    86,     42,     12,     44,
+    110,    169,    171,    91,     68,     173,    262,    248,    160,
+    36,     -90,    -109,   -24,    -12,    -57,    -64,    -78,    -89,
+    -75,    -87,    -101,   -82,    -72,    -76,    -81,    -63,    -34,
+    -4,     61,     87,     46,     23,     -1,     -8,     40,     63,
+    46,     45,     39,     14,     -11,    -25,    -16,    36,     78,
+    85,     110,    120,    132,    189,    228,    217,    154,    89,
+    57,     14,     -14,    -6,     0,      13,     8,      -50,    -68,
+    -60,    -107,   -140,   -126,   -122,   -151,   -147,   -118,   -105,
+    -85,    -83,    -100,   -139,   -195,   -194,   -168,   -183,   -173,
+    -148,   -166,   -168,   -123,   -59,    -11,    20,     64,     98,
+    80,     58,     83,     111,    143,    176,    171,    152,    146,
+    165,    174,    143,    93,     30,     5,      21,     42,     35,
+    -37,    -94,    -61,    -12,    -5,     -27,    -58,    -85,    -81,
+    -11,    79,     65,     -14,    -17,    15,     -4,     -2,     39,
+    20,     -29,    -19,    3,      -11,    -39,    -62,    -43,    -34,
+    -60,    -77,    -119,   -163,   -128,   -5,     87,     73,     51,
+    116,    189,    217,    240,    234,    177,    192,    295,    344,
+    313,    263,    236,    240,    230,    179,    99,     19,     -25,
+    -16,    -9,     -35,    -66,    -53,    -16,    -40,    -70,    -81,
+    -102,   -86,    -87,    -156,   -225,   -228,   -145,   -52,    -22,
+    -57,    -171,   -255,   -247,   -208,   -165,   -187,   -242,   -275,
+    -261,   -168,   -75,    -13,    8,      -62,    -125,   -136,   -133,
+    -81,    -11,    -17,    -80,    -115,   -103,   -27,    71,     134,
+    137,    44,     -48,    -24,    69,     156,    194,    175,    112,
+    55,     54,     101,    148,    157,    142,    100,    44,     27,
+    63,     106,    107,    89,     67,     37,     17,     30,     63,
+    69,     61,     21,     -37,    -55,    -72,    -53,    -26,    -53,
+    -77,    -87,    -109,   -119,   -80,    -36,    -29,    -38,    -48,
+    -57,    -65,    -16,    52,     83,     83,     24,     -27,    -14,
+    9,      27,     52,     50,     45,     90,     132,    117,    75,
+    16,     -1,     60,     95,     55,     25,     26,     20,     61,
+    119,    89,     1,      -61,    -68,    -46,    -36,    -40,    -39,
+    -49,    -58,    -16,    30,     13,     -12,    18,     35,     6,
+    3,      30,     22,     25,     52,     32,     12,     9,      -5,
+    -16,    -25,    -33,    -38,    -44,    -76,    -118,   -118,   -96,
+    -54,    -3,     9,      -31,    -82,    -84,    -35,    18,     25,
+    -26,    -72,    -48,    8,      25,     8,      -20,    -66,    -105,
+    -102,   -80,    -73,    -79,    -80,    -70,    -59,    -55,    -82,
+    -113,   -85,    -51,    -59,    -57,    -38,    -13,    -7,     -18,
+    -6,     20,     51,     55,     18,     -8,     -7,     24,     78,
+    119,    137,    135,    139,    153,    144,    155,    179,    166,
+    128,    56,     8,      38,     85,     94,     72,     20,     -32,
+    -9,     25,     17,     -15,    -84,    -123,   -106,   -82,    -62,
+    -60,    -43,    -4,     -12,    -45,    -68,    -108,   -100,   -47,
+    -49,    -64,    -50,    -9,     37,     59,     68,     62,     53,
+    49,     25,     13,     32,     40,     60,     109,    82,     18,
+    10,     -1,     21,     102,    111,    40,     -10,    -9,     20,
+    31,     0,      -51,    -108,   -135,   -89,    -21,    1,      -54,
+    -125,   -129,   -113,   -144,   -205,   -227,   -167,   -118,   -114,
+    -100,   -71,    5,      34,     -51,    -119,   -120,   -72,    10,
+    56,     51,     58,     65,     98,     135,    84,     20,     -3,
+    -1,     57,     135,    137,    90,     88,     107,    102,    45,
+    -4,     9,      48,     95,     99,     65,     42,     44,     78,
+    80,     29,     11,     39,     27,     0,      7,      19,     10,
+    -45,    -99,    -86,    -77,    -74,    -57,    -74,    -84,    -92,
+    -134,   -114,   -65,    -73,    -76,    -96,    -105,   -50,    -31,
+    -17,    17,     9,      18,     62,     75,     55,     63,     76,
+    61,     61,     80,     103,    107,    110,    131,    134,    120,
+    94,     66,     70,     78,     59,     52,     57,     53,     72,
+    76,     31,     -18,    -53,    -57,    -35,    -17,    -9,     -27,
+    -34,    -7,     -17,    -26,    -13,    -60,    -86,    -53,    -42,
+    -36,    -36,    -46,    -13,    19,     -16,    -47,    -15,    11,
+    -9,     -18,    -26,    -24,    14,     8,      -53,    -54,    15,
+    43,     15,     -9,     -5,     5,      -12,    -40,    -57,    -74,
+    -94,    -105,   -91,    -20,    30,     -10,    -50,    -58,    -52,
+    -42,    -47,    -54,    -61,    -83,    -64,    -30,    -3,     31,
+    9,      -35,    -43,    -31,    6,      50,     54,     55,     67,
+    53,     43,     30,     27,     62,     37,     -26,    -52,    -54,
+    -29,    3,      -12,    -23,    11,     26,     23,     31,     57,
+    66,     46,     32,     35,     83,     124,    111,    124,    157,
+    143,    101,    80,     60,     27,     11,     21,     22,     9,
+    -4,     -26,    -41,    -35,    -50,    -103,   -138,   -116,   -90,
+    -89,    -90,    -79,    -74,    -58,    -18,    -12,    -29,    -36,
+    -17,    22,     30,     -1,     -8,     8,      10,     19,     31,
+    36,     38,     41,     28,     -7,     -14,    -6,     -20,    -30,
+    -11,    -2,     -9,     0,      25,     56,     78,     68,     40,
+    34,     47,     50,     40,     37,     26,     28,     53,     61,
+    57,     25,     -35,    -75,    -65,    -48,    -65,    -81,    -67,
+    -53,    -41,    3,      19,     -3,     -9,     -2,     -1,     -24,
+    -36,    -23,    -26,    -29,    -9,     0,      -15,    -17,    -9,
+    12,     50,     45,     14,     19,     37,     24,     9,      16,
+    13,     -16,    -19,    3,      -3,     -12,    -10,    -23,    -43,
+    -47,    -38,    -46,    -44,    -7,     3,      -19,    -13,    -26,
+    -52,    -29,    -19,    -32,    0,      11,     -26,    -24,    -20,
+    -41,    -30,    -24,    -53,    -67,    -26,    23,     20,     9,
+    6,      -8,     3,      16,     7,      3,      -5,     2,      33,
+    53,     72,     94,     86,     69,     96,     118,    95,     91,
+    78,     32,     26,     48,     48,     37,     21,     7,      -6,
+    -8,     8,      1,      -17,    -2,     18,     1,      -28,    -51,
+    -84,    -93,    -74,    -46,    -18,    -19,    -31,    -10,    10,
+    10,     7,      -5,     -30,    -39,    -28,    -9,     10,     17,
+    11,     14,     20,     -1,     2,      18,     7,      15,     40,
+    40,     32,     27,     23,     31,     43,     33,     7,      -3,
+    18,     51,     53,     31,     21,     14,     16,     14,     4,
+    11,     16,     1,      -24,    -38,    -33,    -27,    -50,    -74,
+    -70,    -60,    -54,    -44,    -22,    -22,    -43,    -33,    -16,
+    -35,    -36,    -18,    -27,    -42,    -46,    -36,    -17,    -15,
+    -22,    -21,    -20,    -2,     15,     12,     22,     27,     22,
+    41,     57,     60,     63,     54,     56,     65,     62,     68,
+    58,     34,     53,     70,     58,     60,     51,     33,     41,
+    39,     16,     -3,     -16,    -18,    -15,    -18,    -32,    -76,
+    -85,    -62,    -82,    -87,    -68,    -84,    -75,    -40,    -48,
+    -55,    -45,    -42,    -24,    -14,    -1,     27,     23,     -1,
+    -2,     12,     15,     32,     55,     52,     55,     82,     81,
+    58,     62,     59,     37,     24,     20,     17,     18,     19,
+    15,     14,     5,      -18,    -27,    -20,    -19,    -34,    -39,
+    -29,    -30,    -27,    -27,    -48,    -52,    -54,    -77,    -48,
+    -18,    -36,    -34,    -13,    -21,    -38,    -28,    -15,    -7,
+    -6,     -20,    -18,    2,      4,      -11,    -5,     7,      1,
+    1,      12,     -2,     -17,    7,      15,     2,      15,     34,
+    48,     78,     94,     82,     66,     66,     64,     47,     44,
+    57,     64,     74,     65,     34,     26,     31,     32,     33,
+    18,     5,      -1,     -18,    -22,    -31,    -54,    -37,    -32,
+    -74,    -89,    -77,    -73,    -65,    -72,    -75,    -39,    -21,
+    -31,    -31,    -24,    -19,    -8,     -4,     7,      26,     22,
+    15,     13,     11,     28,     47,     42,     35,     28,     5,
+    18,     55,     55,     45,     44,     18,     9,      18,     -2,
+    -5,     6,      -15,    -16,    -12,    -20,    -4,     4,      -15,
+    -18,    -10,    -5,     -2,     -16,    -24,    -14,    -7,     -14,
+    -33,    -33,    -20,    -17,    -17,    -18,    -30,    -37,    -35,
+    -34,    -13,    -3,     -28,    -28,    -10,    -21,    -17,    -4,
+    -12,    -16,    -20,    -27,    -16,    -8,     -4,     14,     24,
+    11,     17,     30,     27,     14,     7,      28,     30,     22,
+    45,     47,     23,     31,     23,     -5,     10,     17,     -5,
+    2,      15,     9,      20,     29,     11,     -9,     -8,     8,
+    10,     -1,     -14,    -30,    -30,    -8,     -9,     -20,    -17,
+    -17,    -12,    1,      6,      -7,     -18,    -6,     10,     -6,
+    -7,     29,     35,     21,     16,     9,      25,     44,     26,
+    21,     34,     28,     40,     41,     9,      -2,     1,      12,
+    34,     18,     -12,    -10,    -16,    -29,    -24,    -25,    -20,
+    -17,    -35,    -29,    -12,    -29,    -39,    -32,    -30,    -17,
+    -12,    -28,    -20,    -5,     -4,     7,      14,     10,     3,
+    -3,     0,      19,     27,     4,      -21,    -18,    -7,     -4,
+    0,      1,      -6,     -17,    -30,    -24,    -11,    -9,     0,
+    -1,     0,      -3,     -12,    1,      15,     -2,     3,      16,
+    -3,     -8,     7,      3,      13,     32,     23,     10,     -6,
+    -11,    8,      4,      -12,    -9,     3,      12,     -2,     -31,
+    -36,    -33,    -37,    -17,    -5,     -20,    -14,    4,      5,
+    4,      6,      17,     31,     27,     23,     16,     -1,     -4,
+    15,     24,     21,     18,     7,      -7,     -14,    18,     41,
+    25,     14,     13,     2,      5,      12,     8,      15,     10,
+    2,      13,     10,     3,      5,      -1,     0,      11,     10,
+    6,      2,      7,      10,     -4,     -3,     2,      -13,    -4,
+    14,     -4,     -17,    -11,    -4,     8,      3,      -8,     -1,
+    -7,     -20,    -4,     23,     23,     8,      5,      24,     21,
+    -5,     -2,     7,      -9,     -15,    -8,     -6,     6,      2,
+    -26,    -19,    1,      -19,    -31,    -27,    -34,    -41,    -47,
+    -39,    -12,    -12,    -29,    -32,    -41,    -36,    -26,    -36,
+    -35,    -33,    -29,    -1,     5,      -13,    -21,    -21,    -3,
+    12,     1,      -7,     -1,     2,      12,     9,      -1,     15,
+    21,     18,     25,     4,      -13,    5,      12,     16,     33,
+    33,     19,     21,     26,     30,     30,     24,     23,     19,
+    22,     34,     39,     28,     15,     14,     24,     24,     18,
+    12,     10,     4,      8,      28,     29,     2,      -7,     6,
+    8,      10,     2,      -13,    -8,     -2,     0,      12,     13,
+    -1,     3,      21,     26,     24,     17,     11,     15,     19,
+    19,     19,     11,     1,      3,      3,      0,      -5,     -11,
+    -16,    -26,    -18,    3,      -5,     -17,    2,      10,     6,
+    6,      -8,     -11,    4,      -3,     -17,    -10,    -17,    -37,
+    -31,    -17,    -26,    -37,    -42,    -53,    -49,    -34,    -40,
+    -39,    -21,    -17,    -23,    -23,    -25,    -30,    -24,    -13,
+    -10,    -10,    1,      1,      -7,     7,      19,     11,     4,
+    -3,     -8,     1,      6,      7,      25,     22,     -5,     3,
+    20,     7,      -1,     14,     17,     18,     20,     12,     25,
+    41,     23,     19,     37,     39,     21,     17,     23,     17,
+    6,      9,      15,     4,      -15,    -8,     8,      7,      1,
+    -12,    -18,    -14,    -15,    -10,    0,      -3,     3,      13,
+    -8,     -21,    -8,     -26,    -29,    -1,     -9,     -24,    -19,
+    -22,    -24,    -18,    -25,    -27,    -28,    -34,    -26,    -9,
+    -14,    -14,    -8,     -8,     -5,     4,      4,      -10,    -12,
+    -7,     -8,     -10,    -15,    -19,    -10,    -5,     -9,     -9,
+    -19,    -33,    -27,    -14,    -15,    -14,    -16,    -25,    -10,
+    5,      -7,     -11,    2,      3,      7,      17,     28,     33,
+    32,     33,     39,     49,     57,     63,     62,     64,     67,
+    59,     55,     67,     71,     58,     53,     53,     44,     38,
+    44,     51,     51,     45,     35,     34,     46,     55,     48,
+    36,     21,     3,      -5,     2,      7,      0,      -17,    -30,
+    -34,    -48,    -62,    -64,    -66,    -66,    -62,    -79,    -90,
+    -85,    -88,    -88,    -85,    -88,    -103,   -112,   -112,   -102,
+    -99,    -102,   -103,   -110,   -100,   -80,    -60,    -57,    -68,
+    -59,    -45,    -35,    -6,     9,      -3,     2,      32,     45,
+    48,     51,     40,     51,     78,     85,     83,     87,     94,
+    101,    104,    105,    100,    86,     82,     96,     102,    96,
+    85,     68,     63,     65,     55,     50,     46,     28,     32,
+    43,     33,     30,     27,     8,      18,     36,     27,     20,
+    13,     -14,    -19,    8,      12,     0,      -1,     -12,    -24,
+    -20,    -27,    -39,    -39,    -39,    -44,    -38,    -32,    -42,
+    -38,    -33,    -43,    -55,    -57,    -60,    -61,    -56,    -57,
+    -55,    -43,    -46,    -58,    -55,    -50,    -50,    -51,    -48,
+    -46,    -44,    -36,    -26,    -20,    -13,    -11,    -8,     1,
+    5,      0,      8,      21,     31,     42,     39,     43,     56,
+    48,     37,     45,     45,     47,     52,     46,     40,     26,
+    18,     28,     30,     22,     14,     0,      -3,     8,      0,
+    -7,     0,      -10,    -13,    -9,     -13,    -13,    -18,    -33,
+    -32,    -26,    -37,    -41,    -32,    -26,    -30,    -34,    -31,
+    -38,    -40,    -24,    -25,    -29,    -15,    -18,    -23,    -4,
+    2,      -7,     0,      5,      10,     22,     23,     25,     31,
+    33,     37,     38,     39,     43,     46,     41,     44,     46,
+    37,     35,     46,     63,     67,     52,     38,     30,     35,
+    41,     41,     41,     29,     15,     16,     4,      -4,     3,
+    -12,    -18,    -13,    -27,    -39,    -47,    -55,    -44,    -43,
+    -53,    -45,    -36,    -37,    -37,    -38,    -40,    -49,    -57,
+    -41,    -24,    -28,    -31,    -26,    -20,    -15,    -21,    -23,
+    -18,    -19,    -14,    -10,    -11,    1,      -6,     -26,    -14,
+    -1,     -7,     -10,    -11,    -9,     0,      -4,     -9,     3,
+    8,      0,      -2,     1,      16,     20,     7,      9,      10,
+    8,      18,     12,     11,     17,     -6,     -19,    0,      0,
+    -10,    -6,     -12,    -14,    -11,    -9,     -2,     -10,    -19,
+    -9,     -11,    -4,     18,     7,      -3,     9,      17,     23,
+    28,     25,     19,     19,     24,     33,     37,     30,     28,
+    35,     44,     43,     33,     31,     30,     26,     33,     39,
+    35,     31,     27,     19,     23,     24,     19,     13,     0,
+    0,      2,      -7,     -9,     -10,    -13,    -6,     -6,     -23,
+    -28,    -15,    -9,     -20,    -34,    -30,    -15,    -12,    -11,
+    -3,     -4,     -4,     6,      15,     9,      -11,    -20,    3,
+    26,     23,     1,      -16,    -3,     12,     2,      -22,    -36,
+    -35,    -28,    -20,    -13,    -19,    -38,    -43,    -29,    -11,
+    -5,     -15,    -37,    -40,    -9,     12,     -1,     -23,    -30,
+    -16,    12,     21,     -1,     -25,    -21,    4,      34,     55,
+    34,     -12,    -11,    47,     99,     107,    58,     0,      8,
+    78,     148,    151,    56,     -40,    -2,     142,    215,    99,
+    -67,    -64,    76,     153,    99,     -21,    -107,   -92,    -1,
+    106,    107,    -123,   -395,   -334,   60,     274,    -69,    -597,
+    -626,   -126,   238,    18,     -447,   -577,   -312,   -34,    20,
+    -89,    -242,   -332,   -222,   74,     262,    64,     -285,   -232,
+    259,    563,    294,    -138,   -130,   312,    642,    515,    189,
+    57,     187,    415,    538,    467,    277,    109,    134,    334,
+    441,    299,    59,     -7,     128,    228,    146,    -20,    -99,
+    -34,    60,     24,     -108,   -188,   -147,   -57,    -48,    -142,
+    -224,   -210,   -144,   -122,   -175,   -212,   -176,   -150,   -199,
+    -256,   -210,   -100,   -79,    -195,   -298,   -248,   -107,   -48,
+    -110,   -192,   -224,   -189,   -112,   -40,    -31,    -124,   -238,
+    -193,   -3,     87,     -53,    -221,   -165,   48,     132,    -2,
+    -150,   -109,   61,     147,    83,     -20,    -60,    -13,    85,
+    157,    130,    17,     -68,    -10,    147,    217,    116,    -20,
+    -21,    103,    200,    158,    52,     35,     105,    155,    132,
+    81,     74,     110,    114,    74,     48,     68,     100,    77,
+    27,     30,     48,     19,     -15,    7,      63,     53,     -56,
+    -123,   -41,    81,     75,     -61,    -154,   -84,    45,     68,
+    -24,    -105,   -76,    22,     53,     -13,    -63,    -21,    54,
+    59,     -1,     -34,    16,     80,     81,     48,     37,     61,
+    89,     88,     101,    134,    132,    100,    83,     125,    188,
+    173,    101,    95,     172,    214,    149,    68,     94,     181,
+    177,    103,    83,     132,    165,    122,    83,     140,    191,
+    153,    92,     106,    198,    226,    138,    85,     146,    215,
+    187,    110,    77,     115,    146,    115,    91,     96,     78,
+    27,     -3,     42,     102,    71,     -23,    -46,    30,     95,
+    63,     -18,    -25,    77,     174,    138,    13,     -25,    96,
+    218,    181,    34,     -70,    -45,    17,     2,      -67,    -174,
+    -346,   -516,   -553,   -446,   -455,   -789,   -1213,  -1308,  -1046,
+    -878,   -1179,  -1691,  -1839,  -1528,  -1219,  -1292,  -1623,  -1772,
+    -1538,  -1147,  -921,   -951,   -1038,  -929,   -549,   -95,    155,
+    127,    97,     387,    931,    1339,   1380,   1234,   1276,   1661,
+    2102,   2223,   2027,   1848,   1942,   2198,   2295,   2119,   1856,
+    1725,   1745,   1752,   1601,   1335,   1102,   993,    952,    830,
+    570,    286,    139,    133,    85,     -135,   -436,   -638,   -645,
+    -571,   -620,   -835,   -1064,  -1151,  -1069,  -951,   -964,   -1109,
+    -1209,  -1162,  -1044,  -961,   -944,   -977,   -1001,  -912,   -687,
+    -517,   -623,   -887,   -897,   -469,   10,     -35,    -590,   -934,
+    -545,   184,    427,    -53,    -619,   -563,   40,     489,    339,
+    -128,   -306,   -6,     403,    497,    232,    -55,    0,      388,
+    704,    584,    145,    -76,    260,    816,    942,    485,    2,
+    65,     575,    923,    744,    290,    76,     276,    596,    662,
+    419,    134,    92,     280,    434,    344,    88,     -66,    8,
+    151,    126,    -81,    -239,   -176,   -29,    -74,    -351,   -574,
+    -487,   -208,   -132,   -426,   -780,   -797,   -577,   -595,   -978,
+    -1169,  -667,   -36,    -548,   -2285,  -3281,  -1756,  927,    1236,
+    -1911,  -5006,  -4073,  -66,    2017,   -295,   -3701,  -3797,  -892,
+    975,    -165,   -1978,  -1636,  374,    1482,   679,    -567,   -591,
+    706,    2337,   3224,   2743,   1269,   287,    1221,   3597,   5083,
+    4106,   1858,   972,    2334,   4096,   4167,   2806,   1916,   2383,
+    3045,   2508,   1220,   820,    1784,   2669,   1981,   204,    -876,
+    -470,   510,    803,    170,    -787,   -1568,  -1893,  -1598,  -1027,
+    -992,   -1803,  -2610,  -2484,  -1905,  -2113,  -3113,  -3399,  -2267,
+    -1261,  -2007,  -3637,  -3909,  -2340,  -893,   -1158,  -2272,  -2486,
+    -1639,  -915,   -777,   -596,   -91,    196,    85,     210,    875,
+    1373,   1247,   1219,   1958,   2718,   2328,   1196,   1008,   2350,
+    3677,   3269,   1503,   366,    922,    2264,   2810,   1996,   608,
+    -168,   75,     680,    811,    395,    -56,    -318,   -607,   -966,
+    -1108,  -925,   -613,   -368,   -369,   -919,   -1926,  -2460,  -1685,
+    -300,   155,    -611,   -1524,  -2204,  -3227,  -3859,  -2037,  1622,
+    2382,   -2583,  -8448,  -7544,  -84,    4814,   915,    -6423,  -7558,
+    -1746,  2515,   -59,    -4587,  -3858,  1260,   3625,   187,    -4148,
+    -3500,  1542,   5467,   4780,   1256,   -1127,  -403,   2481,   5332,
+    6346,   5014,   2536,   1216,   2467,   5039,   6238,   5070,   3381,
+    3269,   4173,   3905,   2248,   1586,   3299,   5240,   4362,   1004,
+    -1382,  -489,   2113,   3168,   1620,   -742,   -1824,  -1435,  -897,
+    -1058,  -1500,  -1545,  -1398,  -1965,  -3266,  -4136,  -3756,  -2609,
+    -1804,  -1986,  -3087,  -4599,  -5296,  -4051,  -1731,  -781,   -2228,
+    -4092,  -3977,  -2325,  -1353,  -1568,  -1490,  -428,   178,    -672,
+    -1650,  -1058,  749,    2039,   2079,   1540,   897,    310,    572,
+    2266,   4265,   4265,   1869,   -231,   559,    3332,   4752,   3229,
+    768,    101,    1364,   2463,   1984,   819,    411,    723,    675,
+    -162,   -923,   -743,   -32,    185,    -516,   -1653,  -2359,  -2103,
+    -986,   42,     -205,   -1702,  -2870,  -2337,  -809,   -221,   -982,
+    -1544,  -946,   -598,   -2117,  -4291,  -4100,  -857,   1948,   338,
+    -4799,  -7972,  -5403,  173,    2371,   -1063,  -5533,  -5578,  -1777,
+    605,    -985,   -3249,  -2213,  1184,   2691,   560,    -2356,  -2288,
+    1233,   5244,   6441,   4004,   370,    -663,   2555,   7404,   9282,
+    6573,   2612,   1836,   4662,   7467,   7393,   5421,   4262,   4741,
+    5362,   4705,   3163,   2397,   3337,   4887,   4810,   2254,   -749,
+    -1316,  772,    2706,   2016,   -573,   -2552,  -2746,  -2012,  -1647,
+    -1978,  -2579,  -3105,  -3473,  -3911,  -4484,  -4891,  -4795,  -4163,
+    -3543,  -3538,  -4275,  -5356,  -5743,  -4637,  -2614,  -1301,  -1825,
+    -3341,  -4011,  -2937,  -751,   1007,   1245,   235,    -639,   -61,
+    1626,   2864,   2967,   2734,   3013,   3329,   2914,   2312,   2666,
+    3839,   4308,   3162,   1453,   768,    1255,   1887,   2006,   1715,
+    1031,   -297,   -1660,  -1690,  -277,   813,    -30,    -2137,  -3370,
+    -2854,  -1553,  -593,   -413,   -1146,  -2567,  -3440,  -2369,  -205,
+    379,    -1258,  -2315,  -812,   262,    -3205,  -8576,  -7894,  738,
+    7492,   1951,   -11595, -17098, -6934,  7139,   8065,   -4575,  -14199,
+    -8946,  3606,   7504,   -547,   -8242,  -5113,  4406,   8113,   2134,
+    -5040,  -4089,  4157,   10934,  10158,  4167,   -565,   -192,   4428,
+    9765,   12201,  9861,   4512,   1225,   3451,   8483,   10133,  6497,
+    2574,   3333,   6806,   6986,   2487,   -1214,  623,    5416,   6647,
+    2204,   -3289,  -4556,  -1565,  1544,   1525,   -1236,  -4293,  -5695,
+    -5174,  -3995,  -3403,  -3449,  -3750,  -4505,  -6014,  -7296,  -6523,
+    -3849,  -2096,  -3288,  -5722,  -6004,  -3581,  -1497,  -1960,  -3330,
+    -2800,  -434,   964,    -111,   -1739,  -1136,  1736,   4151,   3736,
+    1274,   -451,   469,    3386,   5833,   5898,   3646,   1085,   272,
+    1743,   4061,   5108,   3837,   1490,   246,    967,    1866,   859,
+    -1069,  -974,   1542,   2835,   47,     -4285,  -5068,  -1567,  1781,
+    1223,   -1997,  -4227,  -3747,  -1720,  41,     245,    -1228,  -2972,
+    -2673,  22,     1980,   -930,   -7721,  -11271, -5725,  4974,   8484,
+    -2007,  -16979, -19255, -4670,  11057,  9690,   -6417,  -17537, -10841,
+    4262,   9292,   206,    -9128,  -6224,  4828,   10018,  3699,   -5183,
+    -5121,  4702,   14279,  14466,  5778,   -2633,  -2185,  7036,   16118,
+    16305,  8081,   390,    499,    6580,   11150,  10036,  5704,   2902,
+    3378,   4664,   3786,   863,    -796,   1216,   4609,   4493,   -338,
+    -5670,  -6486,  -2751,  884,    571,    -3095,  -6446,  -6997,  -5770,
+    -5041,  -5016,  -4216,  -2579,  -2468,  -5088,  -8129,  -7964,  -4228,
+    -323,   497,    -1556,  -3653,  -3615,  -1718,  464,    1808,   2386,
+    2832,   3085,   2905,   2676,   3473,   5501,   7094,   6442,   3929,
+    1663,   1436,   3254,   5807,   7100,   5044,   -34,    -4091,  -2992,
+    2149,   5333,   2562,   -3067,  -5877,  -4480,  -2080,  -1793,  -3026,
+    -3838,  -3735,  -3663,  -4472,  -5756,  -5753,  -3576,  -640,   -274,
+    -3965,  -7787,  -6757,  -717,   4380,   3595,   -1553,  -5936,  -8603,
+    -10223, -8952,  -922,   9700,   9355,   -7788,  -25795, -22413, 2268,
+    20887,  12133,  -11291, -20129, -5899,  10236,  8585,   -3645,  -6300,
+    4667,   14216,  9346,   -3593,  -8558,  715,    15085,  21179,  14887,
+    3733,   -2703,  -675,   7170,   15131,  18360,  13959,  4205,   -2825,
+    -656,   7594,   11845,  7182,   319,    -439,   3255,   3213,   -3299,
+    -8972,  -6318,  2300,   7190,   2254,   -9247,  -17334, -15064, -4452,
+    5160,   5127,   -4268,  -14501, -17256, -11145, -1830,  3786,   2984,
+    -2498,  -8101,  -9587,  -5703,  622,    4570,   4035,   1442,   729,
+    2493,   3534,   2433,   2239,   5944,   11438,  12371,  6496,   -211,
+    -156,   7092,   13566,  11979,  3928,   -2545,  -2226,  2713,   6150,
+    5117,   1270,   -1851,  -2859,  -2376,  -1909,  -2364,  -3401,  -4183,
+    -3897,  -2875,  -3205,  -5503,  -7822,  -7501,  -3934,  -942,   -1572,
+    -4262,  -5939,  -4671,  -2353,  -1387,  -1159,  -1270,  -1328,  -606,
+    474,    1044,   -2647,  -11603, -17081, -10374, 5922,   14849,  2056,
+    -22033, -31238, -14612, 11094,  17910,  1778,   -15538, -15417, -2045,
+    6690,   2855,   -2559,  473,    8823,   11423,  3782,   -4649,  -2775,
+    9111,   20847,  21610,  11572,  962,    -1465,  5731,   15559,  20008,
+    16950,  9230,   2204,   114,    3088,   8130,   10523,  7643,   2045,
+    -2107,  -2945,  -2538,  -3593,  -5210,  -4403,  -857,   1328,   -2497,
+    -11667, -18881, -16866, -6286,  3400,   2835,   -7811,  -18322, -19279,
+    -10025, 1525,   6930,   3766,   -4647,  -11401, -9904,  -322,   10100,
+    12428,  5874,   -274,   926,    6762,   9360,   6778,   5904,   10509,
+    15077,  12681,  3846,   -1653,  2460,   11036,  14737,  8967,   -1021,
+    -6168,  -3899,  2328,   6041,   3404,   -2878,  -7672,  -6869,  -1918,
+    801,    -2188,  -7419,  -8083,  -2687,  1898,   -692,   -8121,  -11198,
+    -5642,  2830,   5915,   1120,   -5666,  -8314,  -5770,  118,    4614,
+    4713,   1482,   -2544,  -3331,  -3779,  -8931,  -13840, -10273, 3355,
+    13432,  2906,   -20058, -30890, -17080, 7759,   16047,  2886,   -12525,
+    -15117, -5998,  1614,   2294,   2684,   4610,   6236,   5486,   2514,
+    1346,   1962,   4564,   11022,  17438,  18182,  10179,  -796,   -3019,
+    5456,   15942,  18468,  11176,  2796,   -143,   1670,   3922,   3836,
+    3337,   3330,   1623,   -2609,  -7177,  -7654,  -4250,  -2210,  -3491,
+    -5312,  -4380,  -3103,  -6738,  -13209, -14278, -6529,  3346,   4931,
+    -2861,  -11176, -12097, -5552,  2679,   7102,   6050,   1301,   -3350,
+    -3378,  1785,   7413,   9059,   7013,   5043,   5331,   5197,   3143,
+    1862,   3790,   8037,   10159,  7236,   1450,   -3393,  -3980,  598,
+    6251,   7410,   1502,   -7144,  -10260, -5116,  2386,   4197,   -894,
+    -6255,  -6026,  -1493,  873,    -1639,  -4426,  -2720,  2252,   4206,
+    158,    -4631,  -4466,  537,    4709,   4528,   1691,   -828,   -1394,
+    -455,   756,    2662,   3101,   1730,   -3579,  -12987, -18531, -12998,
+    1944,   11963,  1503,   -19826, -29919, -18138, 2254,   7644,   -1829,
+    -9260,  -6516,  134,    -793,   -5234,  -2336,  6264,   12828,  11829,
+    6589,   3429,   2592,   4795,   11433,  19490,  21681,  13136,  379,
+    -4138,  3585,   14812,  17633,  10124,  623,    -2287,  696,    2273,
+    -926,   -5000,  -4391,  -386,   139,    -4657,  -11003, -13946, -11930,
+    -7460,  -1932,  1277,   -2311,  -10543, -16920, -14512, -4039,  4987,
+    7518,   3175,   -4213,  -7535,  -4747,  3590,   12231,  13419,  8429,
+    2377,   1080,   5563,   8497,   7304,   5331,   5656,   8235,   6997,
+    998,    -3131,  -1857,  3017,   5883,   3744,   -408,   -4503,  -6489,
+    -4796,  -374,   3254,   1651,   -2830,  -5206,  -3690,  -681,   -969,
+    -2819,  -2616,  19,     3379,   2359,   -2476,  -6413,  -6111,  -463,
+    4664,   4106,   -565,   -4801,  -4960,  -1242,  2479,   3706,   2168,
+    -1104,  -3048,  -1563,  1217,   2013,   -5714,  -17921, -21743, -10839,
+    7751,   13091,  -4648,  -26509, -29653, -9872,  10100,  9523,   -4335,
+    -12121, -5509,  4923,   6380,   1839,   -508,   3312,   10704,  14545,
+    12317,  5508,   -243,   2421,   11485,  19096,  18306,  8626,   -1357,
+    -5542,  -1695,  7815,   13549,  10229,  -23,    -8373,  -7496,  -2775,
+    -1016,  -2900,  -4868,  -4103,  -4535,  -6851,  -8099,  -8137,  -6414,
+    -4023,  -1790,  -45,    -1513,  -4791,  -6160,  -4105,  1060,   5970,
+    7099,   3934,   -996,   -2213,  1973,   6975,   7927,   4726,   2474,
+    3951,   5221,   2642,   -2359,  -3579,  1362,   6614,   6282,   116,
+    -5643,  -5733,  -1884,  2107,   3418,   2566,   684,    -2319,  -3803,
+    -2133,  1512,   2943,   475,    -1004,  753,    3095,   1652,   -3074,
+    -4562,  -932,   3815,   4486,   -22,    -4199,  -4666,  -2201,  284,
+    316,    -914,   -2297,  -2441,  -1538,  -435,   909,    626,    -1222,
+    -1534,  -429,   1711,   2386,   -1786,  -10676, -18200, -16272, -3805,
+    9505,   8238,   -9397,  -24577, -22256, -4907,  8659,   5940,   -3701,
+    -6764,  40,     6190,   4239,   208,    238,    7081,   14458,  15143,
+    10726,  3479,   -706,   1700,   9131,   17577,  17708,  7959,   -5009,
+    -11508, -5347,  5635,   10789,  6499,   -3121,  -9303,  -9814,  -6625,
+    -3333,  -3193,  -4349,  -5615,  -6188,  -5123,  -4441,  -4550,  -4074,
+    -2769,  -61,    2441,   2881,   1395,   -578,   -341,   2509,   6034,
+    8202,   6377,   2696,   1272,   2589,   4787,   4611,   2378,   2124,
+    3911,   4872,   2049,   -3374,  -5770,  -2705,  3179,   5905,   2589,
+    -2792,  -5419,  -3176,  1056,   2875,   2483,   1205,   605,    856,
+    1012,   892,    105,    -411,   707,    2924,   4184,   1755,   -2553,
+    -4857,  -3556,  401,    2466,   945,    -2315,  -5556,  -5549,  -2241,
+    534,    601,    -1774,  -3034,  -1962,  -886,   -448,   -720,   -467,
+    864,    760,    -22,    -2546,  -10211, -17121, -15877, -4803,  7993,
+    7254,   -6563,  -18374, -17755, -6143,  3291,   4322,   1822,   416,
+    2788,   5190,   4256,   2627,   2590,   6398,   12709,  15757,  12829,
+    5542,   -667,   167,    7241,   14346,  14826,  6392,   -3516,  -7434,
+    -4607,  1054,   2988,   847,    -1549,  -2641,  -3046,  -5363,  -8256,
+    -9130,  -6906,  -1460,  2260,   1568,   -2911,  -8580,  -9418,  -3675,
+    5021,   10127,  7909,   1478,   -4015,  -3331,  2450,   7291,   7632,
+    2567,   -2022,  -899,   3418,   5544,   1349,   -4117,  -3409,  1758,
+    6000,   3526,   -3975,  -7331,  -3931,  2747,   7037,   4962,   -21,
+    -2902,  -2008,  1306,   4461,   6364,   5956,   3623,   1734,   793,
+    44,     -893,   -1041,  1633,   5264,   4870,   -943,   -7404,  -8611,
+    -4974,  -1192,  185,    -1334,  -3672,  -4910,  -5132,  -4387,  -3532,
+    -3233,  -2430,  -469,   1245,   892,    -969,   -2441,  -2140,  320,
+    4999,   5954,   -4638,  -20056, -24424, -8954,  13558,  16089,  -3145,
+    -20665, -19447, -4802,  4488,   3733,   943,    683,    3109,   6219,
+    9247,   7736,   782,    -1410,  8024,   20877,  20174,  4723,   -7148,
+    -2758,  11240,  17896,  11462,  414,    -6134,  -4913,  113,    2818,
+    98,     -5900,  -8369,  -4446,  924,    1657,   -3389,  -10569, -13223,
+    -7690,  2339,   7741,   1634,   -9014,  -10982, -1172,  9642,   9098,
+    1310,   -2795,  -1040,  2790,   3808,   3559,   3064,   -527,   -3160,
+    -1391,  3120,   5224,   -144,   -6714,  -6416,  -719,   5630,   7253,
+    2735,   -2973,  -4325,  679,    7146,   8220,   4055,   -42,    814,
+    5288,   7658,   6592,   3051,   -746,   -541,   3401,   6030,   1953,
+    -6340,  -8619,  -2689,  4076,   3217,   -4875,  -9612,  -7826,  -4293,
+    -2441,  -4080,  -5740,  -5529,  -3656,  -506,   -1035,  -5787,  -9518,
+    -7034,  2323,   9287,   6495,   -1853,  -6110,  -3281,  -1708,  -8958,
+    -19544, -18870, -2771,  13029,  10762,  -7491,  -21837, -18923, -4183,
+    8733,   12580,  9779,   4597,   738,    1460,   6302,   9711,   8375,
+    8143,   12512,  15808,  11272,  389,    -5554,  161,    11080,  15851,
+    10426,  692,    -6372,  -6808,  -2525,  652,    827,    -219,   -349,
+    -622,   -3328,  -7883,  -11020, -8961,  -3240,  1884,   4155,   1995,
+    -3530,  -7816,  -6444,  -218,   6086,   9279,   7901,   3113,   -2352,
+    -5757,  -3836,  2022,   4572,   894,    -3519,  -3311,  -534,   -618,
+    -3716,  -5515,  -3290,  1495,   4374,   4455,   2961,   -645,   -3247,
+    -656,   5273,   9838,   9751,   5755,   1863,   158,    1457,   4585,
+    6390,   5379,   2894,   2284,   1867,   -2279,  -7051,  -6578,  70,
+    4745,   1660,   -4524,  -8007,  -7088,  -5690,  -5467,  -4178,  -2679,
+    -2218,  -3422,  -4167,  -4313,  -6105,  -6633,  -4202,  864,    5119,
+    4084,   -163,   -5331,  -8699,  -8710,  -7313,  -4649,  -2471,  -1419,
+    -1136,  -3199,  -6428,  -8048,  -4902,  1089,   4681,   5723,   5535,
+    5146,   4006,   2052,   2314,   5274,   8680,   9907,   8776,   6722,
+    2548,   -2403,  -3303,  1224,   7406,   9468,   5089,   -1197,  -4384,
+    -3570,  -298,   1776,   2005,   2041,   1326,   971,    -180,   -2334,
+    -1170,  1913,   4281,   4732,   2874,   1174,   -1341,  -3384,  -2503,
+    368,    4031,   3270,   -986,   -3519,  -5360,  -6004,  -5576,  -3603,
+    208,    708,    -2137,  -4940,  -5349,  -3588,  -2796,  -1399,  1017,
+    3144,   4196,   2483,   828,    338,    919,    3842,   6202,   7189,
+    7499,   6330,   4847,   3252,   2136,   3698,   5845,   5566,   3019,
+    267,    -55,    -1091,  -4220,  -5041,  -3430,  -280,   171,    -4649,
+    -8723,  -9280,  -5975,  -3192,  -3974,  -3912,  -4053,  -3748,  -3570,
+    -5871,  -5499,  -3552,  -1691,  320,    341,    748,    -313,   -3436,
+    -4687,  -3681,  21,     2550,   643,    -2123,  -3254,  -2226,  -1044,
+    -1617,  -1510,  183,    1250,   726,    -1662,  -3388,  -1759,  933,
+    3817,   5242,   3025,   248,    -1339,  -514,   2022,   3410,   3970,
+    3324,   2632,   2603,   2240,   2166,   1271,   487,    1076,   2039,
+    3296,   3836,   3610,   2913,   2718,   4213,   5555,   6023,   4769,
+    2442,   2067,   2173,   1623,   1201,   348,    52,     -124,   -1528,
+    -2834,  -3604,  -3463,  -2357,  -2564,  -3775,  -3801,  -1929,  -465,
+    -2109,  -3743,  -2657,  200,    2580,   954,    -1304,  -95,    1549,
+    2303,   1795,   1633,   3356,   3699,   2361,   792,    1148,   4045,
+    4820,   3851,   3197,   2449,   2704,   1722,   -652,   -1154,  -393,
+    113,    -1010,  -3328,  -4342,  -3939,  -3345,  -3697,  -5115,  -5610,
+    -4202,  -3639,  -5088,  -5351,  -3216,  -862,   -414,   -1839,  -3996,
+    -4831,  -2467,  147,    1055,   1288,   -247,   -2225,  -2233,  -1562,
+    -1278,  -936,   -961,   -935,   -367,   -323,   -459,   -1940,  -3974,
+    -2262,  -13,    2,      -401,   -1825,  -2308,  -1124,  448,    2154,
+    2434,   1300,   -812,   -1337,  1325,   3374,   3466,   2500,   2156,
+    3439,   3549,   2068,   1392,   1986,   3025,   3944,   3898,   3259,
+    4467,   6347,   5356,   2893,   1690,   2072,   4136,   5313,   2776,
+    -236,   -1063,  -794,   524,    802,    -1377,  -2879,  -2167,  -1439,
+    -1595,  -1539,  -1666,  -2495,  -2375,  -1253,  -515,   -187,   -1409,
+    -2847,  -511,   2411,   1761,   492,    -18,    607,    2350,   3288,
+    3505,   2741,   1099,   699,    2017,   3214,   3333,   1567,   33,
+    1260,   1925,   808,    -377,   -2558,  -3781,  -1677,  164,    -580,
+    -1727,  -2619,  -3421,  -3586,  -3957,  -4562,  -3646,  -2285,  -3437,
+    -5293,  -4792,  -4128,  -4012,  -2920,  -2249,  -2439,  -3737,  -5607,
+    -4427,  -1259,  71,     609,    555,    -1039,  -3354,  -5388,  -3760,
+    415,    2513,   2513,   819,    -1436,  -2780,  -2740,  -501,   2727,
+    3936,   1491,   -965,   -766,   -484,   -223,   361,    695,    1771,
+    1130,   -1839,  -1764,  797,    -31,    -2549,  -1790,  2108,   4043,
+    887,    -154,   2411,   2605,   2012,   1977,   3923,   6630,   4176,
+    107,    -311,   1731,   1910,   1011,   3119,   3219,   998,    -1282,
+    -2832,  -1645,  -685,   945,    2574,   2543,   -267,   -5015,  -3819,
+    -342,   1228,   2055,   -619,   -1233,  2069,   2896,   1095,   62,
+    1365,   3366,   4584,   4956,   3323,   -19,    -50,    4024,   5222,
+    3695,   3118,   1933,   1256,   1443,   128,    -119,   2043,   2477,
+    1823,   1324,   30,     -1363,  -3023,  -3074,  -188,   621,    -1775,
+    -2806,  -2961,  -2753,  -4359,  -5350,  -1220,  -116,   -4157,  -4811,
+    -2793,  -1040,  -1957,  -2862,  -1901,  -3192,  -3720,  -2357,  -1727,
+    -387,   -2131,  -5011,  -3650,  -454,   596,    -1298,  -3716,  -3122,
+    496,    136,    -2415,  -1675,  -811,   -837,   140,    -1243,  -187,
+    -1431,  -5320,  -2121,  100,    -467,   2465,   681,    -2093,  1224,
+    1632,   1428,   1776,   648,    2480,   3622,   876,    259,    1403,
+    2139,   3117,   497,    -763,   -170,   279,    1769,   342,    -871,
+    -25,    -1549,  -2290,  290,    1042,   -796,   -4291,  -3895,  159,
+    1264,   -540,   -2328,  -702,   1972,   852,    -2274,  -798,   1126,
+    -579,   -480,   3481,   3833,   1004,   901,    1536,   1809,   3103,
+    2521,   3183,   5220,   1800,   -266,   4663,   4230,   -790,   159,
+    2274,   5114,   4304,   -1998,  344,    4921,   -343,   -2048,  1180,
+    2112,   3109,   -10,    -1818,  552,    -1360,  -2889,  -1302,  -1918,
+    -37,    1406,   -1762,  -3054,  -1446,  -2073,  -4292,  -3214,  1163,
+    2333,   -712,   -2583,  -2058,  -1034,  -600,   -3796,  -2395,  2137,
+    -1122,  -1927,  702,    -2196,  -4374,  -3257,  -1558,  -256,   -728,
+    -395,   -176,   -1529,  -2772,  -1121,  -340,   -1147,  -250,   -4079,
+    -473,   4241,   -2818,  -3523,  3255,   2355,   -2550,  -1082,  1197,
+    2213,   -94,    -237,   3123,   1314,   -1075,  977,    1081,   2045,
+    2966,   -1328,  -1069,  -741,   -524,   -380,   -2766,  -986,   926,
+    -3281,  -1554,  2554,   -3620,  -6394,  -1680,  -321,   2889,   243,
+    -1567,  2276,   -1294,  -525,   2010,   -4883,  -1495,  6778,   2085,
+    -873,   2496,   418,    -1156,  -1179,  1604,   6173,   1190,   -2381,
+    5788,   2431,   -4941,  -242,   1248,   1023,   4426,   3399,   2726,
+    1388,   -922,   595,    392,    1414,   6260,   2673,   -973,   2237,
+    1776,   -2393,  -757,   4158,   2842,   -2327,  505,    1230,   -3623,
+    -917,   336,    -1400,  -1018,  1771,   2696,   -570,   -2435,  886,
+    2309,   -2865,  -1328,  2077,   -1967,  -3486,  -411,   961,    -1661,
+    -1979,  1179,   -493,   -2597,  1995,   284,    -3300,  -2213,  184,
+    312,    -1665,  -641,   -1325,  -1276,  90,     69,     476,    -778,
+    -1099,  853,    1515,   1630,   1188,   -877,   -1751,  702,    2983,
+    -201,   664,    4018,   -352,   -1864,  875,    2367,   813,    -2463,
+    -702,   886,    -2204,  -2216,  399,    -1729,  -2408,  1412,   -2757,
+    -3530,  449,    -2554,  -3910,  906,    697,    -1696,  566,    -1360,
+    -1991,  81,     -1756,  -159,   1180,   -667,   -584,   -359,   183,
+    1943,   -412,   -1747,  1659,   1961,   280,    294,    222,    2000,
+    2076,   829,    -43,    -880,   3353,   3615,   1279,   1746,   -1031,
+    1301,   3477,   -777,   2567,   1215,   -2344,  3556,   561,    -2166,
+    1119,   2377,   -391,   -1825,  -2359,  49,     1764,   391,    -291,
+    325,    1223,   1443,   -624,   -2828,  1381,   2438,   28,     -652,
+    -166,   581,    -2039,  -374,   -20,    -2459,  -1149,  1505,   2008,
+    -1798,  -3848,  -1796,  -2208,  -2224,  -878,   728,    -154,   -534,
+    1061,   538,    -1465,  73,     1147,   82,     -119,   3800,   4797,
+    -873,   784,    1458,   -148,   3180,   1319,   908,    4951,   584,
+    -57,    2394,   -967,   586,    405,    -1601,  3566,   -285,   -3949,
+    -1301,  -1953,  -1223,  -1831,  -3477,  -779,   -389,   -3169,  -1828,
+    -1496,  -1451,  -556,   -3327,  -209,   534,    -4908,  131,    -386,
+    -5232,  1373,   2129,   -1740,  -1957,  -1102,  76,     396,    -1426,
+    -179,   1357,   -3276,  -1420,  3819,   -44,    56,     2777,   -1202,
+    1908,   1410,   2031,   3495,   -2197,  -163,   1565,   239,    2803,
+    480,    -1636,  1180,   616,    1206,   1166,   -1579,  1572,   814,
+    -774,   2310,   740,    -2606,  1234,   -603,   -362,   1562,   -2134,
+    652,    -777,   -2353,  5464,   377,    -2490,  1012,   157,    680,
+    -1389,  -1898,  1135,   -1,     -1730,  1800,   -1466,  -1687,  -1469,
+    -3250,  -1081,  1381,   -81,    -204,   -26,    353,    1941,   174,
+    104,    2009,   1032,   -871,   3280,   3398,   -651,   -154,   3309,
+    1964,   448,    812,    -17,    887,    2405,   3295,   -54,    -2396,
+    1410,   1380,   -1156,  296,    -1706,  -1729,  401,    -970,   -878,
+    -723,   -2285,  1259,   1320,   -1960,  -1039,  -211,   -661,   -763,
+    -1599,  -43,    308,    -1841,  72,     -2075,  -3010,  -497,   506,
+    -377,   247,    1932,   -1788,  -2419,  257,    208,    -2176,  488,
+    2827,   -1720,  -1649,  -619,   520,    1103,   -1231,  -1327,  2162,
+    1535,   -383,   315,    -1488,  -235,   1761,   -27,    -232,   515,
+    127,    -2239,  654,    2871,   -379,   -1274,  2445,   874,    -2444,
+    514,    -206,   -1289,  1314,   1869,   1316,   1878,   -1454,  -982,
+    476,    359,    2084,   -708,   405,    -246,   -1071,  1757,   -866,
+    -2331,  783,    501,    -853,   896,    36,     -2468,  -1138,  1445,
+    -613,   -687,   1999,   -449,   -731,   1478,   384,    -45,    96,
+    1530,   1919,   186,    -94,    1347,   -329,   -348,   1631,   574,
+    1062,   735,    -1652,  675,    244,    1241,   1137,   -2469,  621,
+    45,     -612,   1308,   -2015,  -208,   2392,   -1646,  -67,    77,
+    -1558,  113,    1263,   -236,   -971,   -333,   -733,   -555,   2024,
+    -135,   -3817,  -398,   1696,   -1179,  -1473,  1175,   -166,   618,
+    1132,   -2504,  -575,   146,    -688,   1323,   150,    -2021,  15,
+    1673,   347,    -1535,  -106,   235,    -32,    1167,   -471,   -503,
+    -1260,  416,    -13,    -1082,  1036,   -790,   -1676,  487,    985,
+    77,     57,     -1175,  1146,   2023,   -1706,  -404,   3249,   -739,
+    -979,   3044,   -514,   -168,   2201,   -2863,  1009,   1833,   -2309,
+    1565,   476,    -1698,  1667,   -496,   -2193,  1686,   532,    336,
+    -1095,  -1655,  578,    -909,   -1263,  2569,   -2833,  -1808,  2860,
+    -822,   27,     1098,   -1371,  1585,   -284,   -1074,  2944,   -764,
+    -2871,  2484,   1179,   -1213,  -670,   -1226,  1112,   1837,   -299,
+    -388,   -51,    1,      992,    -723,   -361,   1723,   -1115,  -2012,
+    1261,   -9,     -127,   -510,   -1550,  1448,   957,    -1930,  171,
+    776,    -2104,  14,     764,    -599,   -745,   -438,   -371,   -659,
+    1075,   282,    -3116,  684,    3747,   22,     -2139,  816,    1413,
+    -333,   458,    906,    483,    -1084,  797,    1039,   -467,   -377,
+    1386,   -1182,  610,    1787,   -1354,  -2800,  2638,   424,    -2372,
+    1153,   -51,    -689,   290,    -2199,  818,    3755,   -2674,  -1689,
+    3497,   -507,   -1978,  1729,   1413,   215,    -76,    53,     759,
+    371,    -1529,  1005,   -770,   -685,   1754,   -908,   -653,   1047,
+    -1066,  -784,   -199,   -526,   86,     -1750,  -916,   1839,   580,
+    -1884,  319,    226,    -977,   212,    202,    -741,   -1013,  2057,
+    69,     -2961,  974,    1964,   -512,   -224,   1554,   -79,    -1142,
+    1853,   -71,    1009,   1174,   -718,   2040,   -158,   -1508,  1042,
+    0,      -1219,  1212,   448,    -208,   -47,    -779,   -867,   1924,
+    -254,   -1085,  -221,   -1283,  1543,   -584,   -951,   225,    -1089,
+    -464,   -853,   -615,   1576,   -2313,  -1214,  950,    -2548,  -314,
+    1201,   -1527,  952,    764,    -1915,  528,    169,    -1676,  1742,
+    425,    -2346,  932,    290,    109,    492,    -379,   932,    70,
+    582,    135,    769,    1665,   -1751,  576,    1013,   366,    2339,
+    71,     637,    1500,   576,    111,    494,    765,    1170,   1421,
+    -5,     -892,   2054,   -640,   160,    1426,   -651,   348,    -841,
+    -558,   1563,   277,    -408,   -1468,  482,    -1538,  -2255,  968,
+    -1307,  -454,   1306,   -3085,  -1680,  2624,   -2191,  -1719,  1891,
+    -3826,  -1441,  2736,   -3694,  -266,   1897,   -4468,  841,    2828,
+    -4060,  -318,   2305,   -1662,  528,    3056,   -2429,  -156,   2045,
+    -753,   475,    419,    -597,   1100,   1845,   504,    1067,   -402,
+    -824,   1807,   1192,   459,    200,    1728,   50,     -497,   678,
+    -355,   938,    1239,   -1223,  360,    1251,   -95,    981,    1029,
+    -1940,  260,    1627,   -2387,  3426,   519,    -3141,  1822,   -506,
+    -1471,  1101,   -2137,  1069,   885,    -2618,  1673,   -463,   -1558,
+    1439,   -386,   -1923,  1538,   -1313,  -1735,  540,    -1433,  -915,
+    494,    -839,   -1527,  -1143,  480,    -1081,  27,     1732,   -1285,
+    -1833,  1952,   -667,   -1626,  1819,   -1293,  -1323,  2139,   -376,
+    -1392,  1277,   -1172,  -240,   2907,   -1875,  -238,   2573,   -1068,
+    -471,   2065,   -686,   -1315,  2575,   233,    -1005,  1135,   706,
+    534,    278,    -182,   1091,   -21,    -222,   1413,   -371,   -54,
+    1108,   -103,   382,    -70,    787,    894,    -108,   1308,   1113,
+    -1412,  574,    1140,   -2032,  500,    569,    -1251,  951,    -50,
+    -1398,  772,    -474,   -1536,  1297,   251,    -2321,  109,    -703,
+    -425,   40,     -1354,  -773,   -225,   -1743,  -1839,  1244,   261,
+    -3082,  -424,   1162,   -937,   123,    -322,   -407,   -561,   -331,
+    1369,   -1142,  -1050,  1024,   1116,   -213,   -752,   1521,   -383,
+    -415,   1011,   947,    -713,   743,    1945,   -237,   881,    600,
+    -757,   885,    -835,   756,    2454,   -1985,  699,    1572,   -1652,
+    673,    232,    -42,    1975,   -736,   -270,   1660,   -704,   -96,
+    1264,   -428,   278,    774,    -954,   -1325,  756,    1275,   -594,
+    -353,   204,    -1130,  -782,   -432,   -979,   268,    378,    20,
+    -870,   405,    -357,   -1661,  637,    473,    293,    -314,   -895,
+    3,      -175,   -1016,  -643,   204,    -588,   -1007,  -131,   401,
+    -849,   -476,   271,    320,    -198,   533,    -25,    -1994,  1421,
+    525,    -1611,  1261,   507,    -488,   1093,   361,    -1814,  2230,
+    312,    -196,   3242,   -803,   -962,   1714,   -1479,  1426,   1612,
+    -1953,  1376,   -581,   -669,   1370,   -1251,  426,    1274,   -470,
+    1757,   807,    -589,   1275,   126,    -871,   1025,   -1331,  287,
+    1258,   -1813,  146,    -839,   -1471,  828,    -402,   -281,   1704,
+    -1341,  -231,   939,    -1035,  -472,   -197,   -764,   -380,   -816,
+    -266,   382,    -497,   -1708,  -591,   1119,   -1941,  178,    969,
+    -1656,  685,    1004,   -1114,  -127,   -1473,  -678,   1610,   -1253,
+    277,    1807,   -1642,  -461,   2033,   -1449,  392,    98,     -157,
+    1525,   -860,   2455,   413,    -2159,  2457,   475,    -374,   1532,
+    -981,   843,    973,    324,    1168,   225,    -407,   1487,   681,
+    -680,   1098,   117,    245,    1238,   -223,   1076,   -428,   -466,
+    2593,   -663,   -1225,  1303,   -933,   -561,   1190,   -1071,  -1229,
+    406,    -284,   -13,    198,    -1494,  -637,   352,    -1960,  420,
+    49,     -1472,  -761,   -234,   -2213,  -1750,  -521,   -1554,  -813,
+    662,    -633,   -1388,  -15,    -947,   -391,   -152,   -894,   631,
+    -461,   -885,   633,    -51,    -1063,  218,    1149,   -61,    -274,
+    988,    -140,   7,      1774,   1558,   -623,   755,    1352,   -511,
+    1106,   744,    17,     2640,   -91,    697,    1547,   -1757,  1832,
+    1859,   -206,   1505,   575,    -444,   556,    250,    1786,   792,
+    -125,   -266,   407,    501,    798,    -536,   -1214,  58,     6,
+    354,    -685,   613,    99,     -2022,  -116,   -236,   -182,   263,
+    -824,   -1187,  -142,   -138,   -1228,  -1008,  786,    -1421,  -1127,
+    -269,   -2278,  841,    222,    -2423,  678,    -1153,  -2082,  574,
+    -570,   -729,   180,    -777,   212,    270,    -274,   1077,   -493,
+    118,    804,    -1260,  349,    799,    545,    481,    971,    1099,
+    1146,   -273,   34,     1728,   1128,   411,    758,    308,    -808,
+    950,    1490,   209,    -265,   1154,   -11,    -460,   2644,   -122,
+    -728,   2033,   -1100,  -305,   1774,   -208,   -1567,  -57,    -140,
+    -670,   -454,   -1390,  -80,    978,    -438,   -731,   -684,   344,
+    -458,   -199,   -126,   -1663,  -883,   642,    -1517,  -1144,  -375,
+    -422,   -452,   -1815,  -791,   763,    -1502,  -205,   684,    -1641,
+    448,    1399,   -2160,  804,    1088,   -2214,  1030,   1585,   -1093,
+    -11,    1718,   -360,   -81,    1294,   398,    218,    1225,   644,
+    505,    2090,   -385,   526,    2111,   -303,   -316,   1550,   1323,
+    -459,   881,    1874,   -1256,  1429,   2485,   -1003,  -552,   14,
+    432,    952,    471,    -633,   408,    -358,   140,    554,    -1260,
+    -404,   245,    -2572,  954,    1005,   -1621,  -82,    -175,   -957,
+    112,    106,    -1117,  -819,   -62,    -785,   71,     93,     -1296,
+    -1680,  242,    -956,   -2696,  302,    -204,   -1404,  254,    -558,
+    -201,   -630,   16,     -436,   -1647,  1649,   -1096,  -1267,  2273,
+    -1270,  20,     1749,   -2509,  780,    942,    -1859,  2762,   304,
+    -300,   2617,   -947,   861,    2601,   -1153,  754,    1629,   -681,
+    686,    1443,   -235,   1900,   5,      -565,   1559,   285,    -170,
+    757,    480,    547,    752,    -427,   50,     839,    -95,    -791,
+    -1698,  -291,   -62,    -1730,  524,    1008,   -2176,  -369,   165,
+    -749,   -972,   -287,   889,    -1218,  -1712,  833,    -855,   -995,
+    -14,    -793,   -1815,  605,    -607,   -1890,  769,    -781,   230,
+    1155,   -2000,  876,    1835,   -1617,  9,      1058,   -1232,  859,
+    1486,   -1301,  1595,   501,    -951,   2935,   -921,   -634,   2826,
+    -793,   655,    2660,   -232,   235,    1879,   481,    -51,    804,
+    987,    -360,   -331,   2099,   -302,   -149,   1966,   -1233,  -12,
+    1330,   -2265,  1256,   -116,   -1394,  2937,   -995,   -1572,  2964,
+    -2257,  -2587,  1820,   -2132,  -1609,  778,    -1596,  -486,   560,
+    -1749,  274,    -706,   -1714,  1304,   -360,   -2657,  1833,   -750,
+    -1729,  433,    -1461,  -794,   -1545,  -892,   385,    -891,   -374,
+    1261,   -589,   235,    815,    -773,   -669,   636,    -471,   136,
+    871,    -392,   782,    677,    -472,   1130,   1029,   -1262,  1070,
+    2171,   575,    675,    600,    2104,   1077,   -182,   2621,   -604,
+    -30,    3302,   -1331,  599,    742,    291,    1329,   -551,   1043,
+    1729,   -1754,  1220,   1113,   -2174,  1281,   743,    -2027,  851,
+    -205,   -1576,  214,    -1629,  -605,   -394,   -1508,  -254,   -63,
+    -489,   -847,   -26,    -997,   -1065,  -120,   -376,   -1283,  -1393,
+    83,     -212,   -1610,  419,    -1120,  -590,   395,    -1210,  -21,
+    -273,   -622,   899,    -196,   -1059,  1130,   616,    -529,   -166,
+    794,    22,     -216,   862,    664,    -390,   980,    228,    789,
+    182,    402,    2149,   -1133,  799,    2637,   -799,   176,    1306,
+    905,    -93,    677,    338,    121,    483,    297,    339,    347,
+    249,    731,    40,     66,     112,    -889,   -128,   582,    -1191,
+    -67,    -1364,  -233,   488,    -1734,  -634,   1517,   -1657,  -1015,
+    594,    -1422,  1396,   -1357,  -1617,  1254,   -1596,  -941,   789,
+    -1860,  -77,    245,    -327,   569,    -723,   104,    905,    -543,
+    -918,   1387,   -42,    -440,   619,    68,     45,     1364,   -880,
+    19,     1491,   -561,   1174,   1403,   -1411,  1351,   1222,   -612,
+    864,    877,    -658,   382,    864,    -552,   1286,   309,    -105,
+    1083,   -170,   -289,   1049,   -248,   -537,   625,    -48,    337,
+    -385,   532,    -315,   -1398,  588,    -628,   -1192,  649,    -806,
+    -170,   541,    -2267,  1052,   274,    -1970,  833,    253,    -1345,
+    -290,   -120,   -959,   -94,    -189,   -1397,  -136,   -155,   -654,
+    207,    -706,   617,    415,    -1962,  1169,   670,    -1132,  319,
+    297,    -589,   100,    510,    -620,   610,    -153,   -15,    1327,
+    -99,    229,    281,    169,    1015,   -106,   1197,   577,    -698,
+    577,    931,    -964,   1605,   505,    -1713,  2369,   115,    -1585,
+    1839,   664,    -1411,  867,    620,    329,    491,    -1119,  420,
+    266,    -1708,  499,    -69,    -1037,  795,    -321,   -959,   32,
+    235,    -1748,  295,    -249,   -230,   485,    -1185,  -97,    489,
+    -2036,  711,    405,    -2800,  593,    434,    -1038,  536,    347,
+    -570,   705,    -806,   -290,   818,    -999,   53,     1585,   -756,
+    -657,   1180,   115,    -364,   217,    -226,   1033,   347,    -20,
+    611,    658,    590,    -128,   -451,   1676,   -660,   -21,    805,
+    -880,   1481,   412,    -1534,  1522,   221,    -132,   662,    -407,
+    613,    1132,   -551,   -187,   1184,   -577,   -444,   953,    -1034,
+    -472,   461,    -865,   -99,    637,    -572,   300,    450,    -591,
+    137,    404,    -972,   306,    -524,   -1167,  433,    124,    -1326,
+    -368,   -305,   -917,   452,    -626,   -695,   656,    258,    -1401,
+    270,    446,    -1045,  636,    -357,   -1072,  913,    512,    -1732,
+    489,    952,    -747,   58,     673,    -453,   1125,   -488,   46,
+    1723,   -1244,  417,    1803,   -1215,  623,    659,    -560,   676,
+    -9,     92,     701,    1100,   -623,   142,    283,    -512,   547,
+    576,    -525,   -155,   1143,   -1286,  -329,   1959,   -1302,  -459,
+    1188,   -1199,  1020,   -118,   -1303,  956,    -905,   -647,   595,
+    -356,   -1354,  -74,    750,    -791,   -335,   56,     -862,   -36,
+    276,    -279,   46,     -485,   -181,   196,    -584,   -238,   259,
+    -314,   -77,    383,    509,    -386,   -180,   859,    -542,   955,
+    372,    -362,   1458,   113,    -106,   1495,   -534,   63,     1295,
+    -505,   846,    983,    -1097,  1764,   320,    -185,   1061,   -525,
+    115,    217,    -328,   326,    312,    374,    179,    -683,   485,
+    -1286,  147,    -583,   -979,   888,    -504,   -1235,  715,    -1050,
+    -1111,  848,    -828,   -1043,  -115,   -327,   22,     -451,   -1008,
+    98,     -262,   -545,   -363,   -48,    -257,   -731,   878,    96,
+    -1186,  426,    359,    -1101,  1074,   -267,   521,    -375,   -166,
+    1398,   -994,   780,    550,    124,    -298,   581,    236,    305,
+    -111,   396,    741,    -10,    662,    155,    271,    563,    65,
+    -318,   812,    -483,   843,    75,     -714,   1152,   -26,    -190,
+    -97,    533,    -111,   -564,   724,    -24,    -820,   835,    -473,
+    -632,   154,    -104,   -932,   919,    -606,   -619,   496,    -310,
+    -271,   -360,   120,    -630,   126,    65,     -931,   548,    -207,
+    -455,   410,    -282,   -931,   944,    -354,   69,     412,    -661,
+    1068,   -969,   -443,   1894,   -1281,  -442,   2003,   -1640,  713,
+    852,    -1344,  1338,   -457,   243,    498,    -697,   -129,   993,
+    -388,   -76,    1039,   -768,   492,    -104,   -58,    951,    -854,
+    181,    1093,   -1111,  491,    544,    -1061,  118,    586,    -477,
+    -411,   392,    233,    91,     -908,   532,    218,    -1176,  670,
+    -74,    -674,   696,    -801,   194,    592,    -1790,  762,    -564,
+    -791,   595,    -145,   -727,   228,    434,    -246,   -232,   -169,
+    281,    -324,   289,    -120,   -270,   -49,    282,    250,    -56,
+    -405,   507,    27,     -1060,  1329,   -203,   -204,   1677,   -767,
+    -313,   1272,   -968,   717,    183,    -1652,  2157,   -75,    -1906,
+    2590,   -428,   -1614,  2564,   -1511,  -240,   1421,   -1911,  1420,
+    396,    -1397,  1691,   -694,   -1500,  1942,   -823,   -784,   841,
+    -635,   759,    -447,   351,    44,     -946,   227,    441,    -564,
+    155,    -719,   182,    509,    -320,   -300,   205,    -662,   726,
+    469,    -1240,  191,    664,    -269,   -152,   -18,    214,    -149,
+    -257,   347,    76,     -79,    -384,   874,    -387,   -269,   892,
+    -783,   537,    46,     27,     251,    -332,   133,    377,    -522,
+    232,    626,    -362,   -499,   1112,   -342,   -522,   362,    -187,
+    547,    -384,   -155,   517,    -551,   227,    651,    -825,   -88,
+    579,    -758,   -40,    456,    -774,   542,    -164,   -482,   968,
+    -1000,  -394,   1094,   -885,   431,    74,     -348,   403,    -959,
+    831,    -465,   -330,   762,    -717,   -645,   1342,   -499,   -416,
+    944,    -417,   -438,   737,    -368,   -42,    740,    -1234,  689,
+    29,     -106,   619,    -824,   -10,    1047,   -824,   146,    -59,
+    210,    163,    -43,    522,    -352,   213,    460,    -1049,  599,
+    308,    -843,   632,    223,    -504,   296,    530,    -931,   751,
+    -176,   -524,   379,    236,    -626,   66,     662,    -575,   191,
+    -175,   -619,   660,    -424,   -217,   704,    -498,   200,    62,
+    -543,   280,    91,     -378,   54,     168,    -554,   670,    -215,
+    -1097,  1805,   -1015,  -617,   1642,   -1560,  727,    61,     7,
+    -48,    -659,   1308,   -752,   -613,   914,    160,    -469,   164,
+    -167,   274,    326,    -667,   497,    333,    -757,   1252,   -481,
+    -1257,  2019,   -949,   -719,   1676,   -1078,  250,    323,    -1100,
+    1550,   145,    -1697,  972,    522,    -966,   374,    -365,   846,
+    -276,   -756,   629,    -278,   302,    -151,   -243,   -363,   841,
+    -7,     -1092,  476,    45,     201,    -378,   -456,   1113,   -926,
+    97,     178,    -240,   326,    -597,   472,    -10,    -190,   394,
+    -501,   -259,   307,    133,    240,    -433,   -192,   472,    -190,
+    12,     398,    -191,   -605,   1295,   -576,   -154,   474,    -661,
+    866,    -968,   172,    887,    -736,   36,     259,    -201,   265,
+    460,    -859,   622,    102,    -690,   776,    -80,    -745,   919,
+    140,    -750,   224,    134,    -236,   -196,   456,    409,    -1069,
+    600,    239,    -306,   -383,   541,    -213,   -323,   -121,   700,
+    -735,   179,    222,    -613,   653,    -711,   -81,    592,    -694,
+    117,    703,    -772,   -264,   644,    -117,   -422,   276,    64,
+    -355,   -430,   800,    -74,    -619,   1207,   -1057,  4,      960,
+    -1219,  977,    -78,    -1186,  1536,   267,    -1388,  1144,   -90,
+    -1052,  1889,   -1255,  -387,   1815,   -1763,  1037,   421,    -1003,
+    767,    -24,    -277,   -54,    759,    -285,   -1015,  1422,   -581,
+    -121,   547,    -687,   288,    440,    -626,   -623,   1261,   -248,
+    -1133,  1204,   -714,   382,    219,    -851,   240,    -161,   672,
+    -261,   -855,   1043,   -599,   111,    -362,   225,    641,    -913,
+    -122,   1075,   -1165,  432,    131,    -803,   978,    33,     -1291,
+    992,    224,    -1054,  789,    -121,   -215,   262,    -11,    89,
+    -174,   365,    -240,   114,    406,    -813,   291,    233,    158,
+    -377,   194,    216,    -477,   635,    -228,   -512,   599,    23,
+    -273,   71,     258,    10,     -155,   -198,   354,    61,     -749,
+    768,    -19,    -709,   596,    97,     -276,   164,    69,     -144,
+    -20,    529,    -897,   188,    480,    -703,   836,    -874,   259,
+    917,    -1044,  -7,     566,    -97,    -439,   256,    -466,   998,
+    -360,   -1134,  1619,   -762,   -752,   1446,   -707,   -177,   652,
+    -899,   579,    253,    -410,   146,    -262,   275,    353,    -610,
+    52,     671,    -862,   419,    -140,   273,    247,    -1062,  1005,
+    -175,   -497,   772,    -431,   -101,   450,    -598,   266,    428,
+    -842,   477,    -11,    -554,   642,    17,     -787,   544,    445,
+    -625,   -205,   796,    -222,   -733,   764,    -572,   423,    166,
+    -994,   931,    -228,   -303,   362,    -214,   104,    448,    -1091,
+    722,    570,    -1311,  773,    259,    -648,   477,    193,    -682,
+    302,    459,    -464,   -383,   1120,   -561,   -564,   1083,   -372,
+    -354,   864,    -586,   -200,   502,    -331,   27,     446,    -657,
+    281,    571,    -888,   502,    251,    -423,   116,    277,    -263,
+    118,    -170,   168,    367,    -723,   202,    438,    -793,   451,
+    -30,    -292,   202,    38,     -188,   -66,    221,    -90,    -105,
+    7,      346,    -578,   337,    247,    -371,   -14,    22,     36,
+    151,    -322,   -244,   692,    -556,   -5,     550,    -560,   200,
+    161,    -347,   191,    258,    -520,   441,    -212,   -215,   584,
+    -428,   -251,   213,    90,     -187,   109,    138,    -211,   -17,
+    191,    111,    -259,   161,    -141,   232,    -175,   0,      154,
+    -369,   539,    -171,   -438,   484,    43,     -375,   -37,    249,
+    196,    -328,   -106,   541,    -531,   103,    240,    -191,   186,
+    -363,   40,     585,    -573,   258,    170,    -593,   515,    -261,
+    -86,    407,    -339,   164,    -214,   -34,    464,    -377,   -206,
+    336,    -230,   239,    -85,    -69,    322,    -503,   322,    142,
+    -748,   867,    -160,   -753,   836,    -249,   -362,   750,    -374,
+    -222,   448,    -82,    -246,   399,    13,     -429,   441,    -47,
+    -127,   -29,    337,    -502,   318,    132,    -457,   498,    -145,
+    -91,    98,     208,    -179,   54,     62,     -260,   237,    96,
+    -161,   32,     -150,   93,     21,     -31,    74,     75,     -322,
+    164,    168,    -191,   119,    -121,   -66,    -195,   296,    -128,
+    -251,   381,    -56,    -338,   281,    -29,    -472,   664,    -301,
+    -275,   423,    -285,   -77,    258,    -82,    -139,   160,    -54,
+    -26,    27,     75,     -49,    -196,   305,    -131,   -187,   262,
+    -37,    -206,   65,     269,    -240,   -144,   261,    54,     -338,
+    355,    3,      -503,   535,    -253,   -210,   433,    -290,   -33,
+    381,    -546,   173,    252,    -364,   271,    -329,   166,    266,
+    -564,   507,    -32,    -648,   861,    -400,   -357,   819,    -519,
+    -74,    392,    -423,   426,    -306,   -93,    691,    -991,   537,
+    467,    -992,   614,    426,    -823,   491,    182,    -371,   174,
+    84,     -64,    98,     -96,    23,     182,    -69,    -211,   226,
+    18,     -134,   334,    -514,   352,    378,    -623,   363,    266,
+    -592,   493,    -46,    -369,   594,    -440,   -10,    295,    -368,
+    326,    -192,   -140,   306,    -305,   140,    198,    -396,   202,
+    154,    -341,   208,    -8,     -169,   -76,    106,    20,     -347,
+    233,    30,     -193,   117,    -9,     -165,   182,    -4,     -195,
+    96,     131,    -188,   -106,   166,    -71,    -99,    57,     4,
+    -31,    -131,   101,    63,     -199,   225,    -25,    -281,   342,
+    -247,   -170,   516,    -289,   -263,   422,    -158,   -148,   363,
+    -192,   -138,   122,    62,     -105,   7,      194,    -53,    -224,
+    83,     173,    -182,   20,     178,    -274,   182,    74,     -109,
+    -5,     319,    -303,   -72,    428,    -371,   50,     271,    -204,
+    17,     161,    -256,   169,    93,     -169,   94,     -89,    139,
+    80,     -199,   325,    -67,    -83,    202,    -154,   16,     202,
+    -325,   162,    61,     -93,    201,    -278,   236,    108,    -477,
+    594,    -145,   -370,   647,    -261,   -356,   669,    -369,   -181,
+    420,    -266,   -154,   159,    -25,    53,     -40,    -22,    68,
+    -203,   144,    -2,     -173,   88,     -3,     -62,    2,      75,
+    55,     -95,    -130,   219,    -142,   -191,   164,    -170,   44,
+    0,      -246,   249,    -27,    -413,   461,    27,     -490,   292,
+    19,     -145,   13,     99,     91,     -466,   209,    295,    -773,
+    465,    210,    -680,   410,    163,    -358,   399,    -201,   87,
+    23,     -212,   270,    -230,   86,     159,    -353,   381,    -73,
+    -456,   726,    -353,   -357,   754,    -367,   -344,   657,    -59,
+    -417,   432,    35,     -309,   153,    97,     -69,    89,     -101,
+    63,     107,    -127,   106,    112,    -26,    -236,   376,    43,
+    -479,   544,    -57,    -407,   447,    -148,   -103,   195,    -198,
+    80,     156,    -228,   35,     145,    -77,    -55,    130,    -33,
+    -190,   123,    41,     -170,   74,     114,    -241,   67,     192,
+    -195,   -76,    186,    -136,   -133,   213,    -105,   -110,   144,
+    -51,    -126,   154,    -59,    -124,   147,    -49,    -132,   82,
+    26,     -130,   63,     68,     -211,   97,     131,    -224,   59,
+    184,    -250,   59,     205,    -225,   -67,    163,    -135,   -24,
+    74,     -22,    -4,     -81,    21,     71,     -137,   71,     47,
+    -120,   71,     34,     -65,    138,    -6,     -116,   112,    -47,
+    -39,    20,     -75,    64,     -7,     2,      35,     52,     -61,
+    -29,    81,     -61,    -30,    195,    -91,    -136,   261,    -11,
+    -186,   162,    -86,    -35,    152,    -106,   -32,    126,    -4,
+    49,     33,     -9,     -11,    46,     111,    -132,   -3,     204,
+    -175,   -10,    281,    -146,   -94,    226,    -126,   -36,    58,
+    -14,    61,     -172,   48,     193,    -221,   83,     149,    -279,
+    195,    130,    -357,   226,    102,    -260,   191,    16,     -223,
+    124,    14,     -144,   90,     -31,    -81,    -66,    54,     103,
+    -181,   29,     174,    -281,   92,     81,     -226,   139,    -133,
+    -41,    167,    -147,   44,     27,     -132,   107,    -34,    -122,
+    105,    -54,    17,     52,     -131,   138,    33,     -206,   158,
+    43,     -80,    24,     10,     -27,    33,     43,     -71,    15,
+    71,     -42,    14,     18,     0,      -3,     -14,    -14,    58,
+    46,     -99,    122,    105,    -202,   125,    119,    -238,   112,
+    133,    -242,   113,    129,    -301,   52,     161,    -177,   82,
+    73,     -139,   46,     122,    -119,   22,     155,    -230,   23,
+    242,    -211,   -12,    182,    -184,   -57,    190,    -34,    -101,
+    58,     -20,    6,      103,    -61,    -78,    12,     18,     12,
+    86,     -71,    -27,    43,     -24,    8,      39,     -109,   21,
+    -4,     -44,    66,     13,     -59,    61,     -39,    35,     113,
+    -179,   19,     171,    -158,   14,     112,    -133,   26,     9,
+    -43,    -9,     6,      41,     -77,    22,     80,     -61,    -63,
+    65,     -32,    -32,    125,    -105,   -11,    114,    -120,   42,
+    42,     -92,    45,     -56,    -25,    131,    -83,    -24,    97,
+    -51,    -5,     67,     -69,    7,      41,     -27,    8,      3,
+    -10,    8,      -3,     -87,    -28,    122,    -33,    -58,    124,
+    -53,    -50,    67,     -115,   -17,    111,    -112,   -30,    101,
+    -24,    -13,    41,     3,      45,     -13,    -34,    23,     23,
+    -19,    13,     -49,    -49,    68,     -68,    -32,    91,     -58,
+    -18,    73,     -19,    -27,    17,     -33,    -35,    99,     -38,
+    -99,    78,     -31,    -62,    95,     -71,    -124,   184,    -15,
+    -146,   160,    -27,    -109,   140,    -25,    -63,    84,     -34,
+    -18,    58,     -68,    -16,    22,     -87,    86,     23,     -130,
+    61,     62,     -132,   51,     168,    -139,   35,     133,    -121,
+    50,     102,    -120,   40,     126,    -87,    -40,    119,    -14,
+    -59,    78,     11,     -68,    41,     24,     -25,    55,     -2,
+    15,     21,     -73,    56,     88,     -74,    -41,    4,      -10,
+    -4,     5,      7,      -39,    -3,     -4,     -39,    94,     52,
+    -135,   42,     90,     -86,    12,     21,     -55,    -70,    -37,
+    55,     -63,    -35,    50,     -100,   21,     84,     -151,   24,
+    87,     -94,    51,     2,      -58,    104,    -61,    -70,    60,
+    -25,    -42,    -31,    55,     35,     -129,   47,     69,     -65,
+    77,     2,      -60,    110,    -32,    -69,    84,     -54,    -26,
+    98,     -28,    -7,     49,     -49,    -19,    119,    -11,    -157,
+    20,     106,    29,     -8,     -38,    -30,    72,     30,     -3,
+    1,      -32,    -11,    -9,     52,     46,     -144,   -38,    86,
+    -31,    -9,     -42,    -75,    142,    34,     -64,    79,     -109,
+    -55,    195,    -69,    -80,    48,     -49,    62,     25,     -111,
+    -42,    52,     19,     -41,    1,      -16,    -33,    44,     30,
+    -21,    17,     -2,     -30,    111,    34,     -111,   83,     55,
+    -119,   66,     62,     -89,    63,     -39,    -143,   168,    21,
+    -158,   158,    32,     -132,   134,    -3,     -77,    88,     -45,
+    -18,    117,    -51,    -71,    10,     30,     35,     -27,    -63,
+    13,     34,     23,     -23,    19,     -4,     -92,    34,     74,
+    -69,    -15,    20,     -36,    56,     -36,    -96,    69,     -34,
+    -122,   32,     31,     -51,    -3,     -21,    4,      43,     -44,
+    6,      81,     -39,    -35,    26,     -38,    -24,    29,     -16,
+    -47,    -6,     19,     -7,     -9,     41,     32,     13,     -2,
+    -21,    3,      24,     49,     -3,     -66,    14,     95,     -7,
+    -52,    80,     68,     -72,    -14,    39,     2,      24,     -6,
+    -53,    86,     21,     -78,    67,     28,     -34,    16,     -23,
+    -1,     70,     -3,     -58,    45,     33,     -94,    -34,    62,
+    41,     -11,    -27,    27,     46,     14,     -33,    -12,    44,
+    -16,    -59,    6,      45,     -3,     -42,    2,      13,     19,
+    -1,     -71,    3,      42,     -36,    6,      17,     26,     5,
+    -46,    6,      -68,    -75,    86,     -20,    -90,    80,     4,
+    -86,    5,      2,      -33,    -15,    -2,     -8,     -18,    15,
+    -7,     -25,    27,     -28,    -88,    39,     -2,     -85,    58,
+    40,     -45,    3,      17,     0,      11,     -4,     -3,     84,
+    22,     -113,   8,      94,     10,     9,      28,     6,      -3,
+    5,      -2,     23,     23,     -1,     -40,    20,     48,     -40,
+    -21,    72,     7,      -40,    -1,     27,     16,     30,     31,
+    -16,    11,     9,      -71,    -7,     62,     21,     -61,    -19,
+    78,     -2,     -22,    67,     -42,    -12,    75,     -79,    47,
+    86,     -124,   -42,    21,     4,      23,     -32,    -7,     19,
+    1,      -13,    -46,    2,      32,     -43,    -7,     86,     -16,
+    -22,    46,     -61,    -35,    11,     -64,    -38,    17,     -12,
+    -27,    20,     41,     6,      -58,    -61,    58,     -51,    -77,
+    36,     -25,    19,     93,     -76,    1,      72,     -92,    15,
+    40,     -56,    65,     13,     -29,    82,     -9,     -21,    24,
+    -83,    -5,     4,      -63,    77,     80,     -58,    -6,     -19,
+    -43,    100,    5,      -36,    63,     33,     -26,    -48,    26,
+    -18,    -75,    34,     24,     -45,    -1,     6,      -35,    -24,
+    -23,    -22,    47,     -15,    -46,    31,     -40,    -41,    74,
+    -32,    -73,    59,     -51,    -26,    143,    -29,    -42,    93,
+    -44,    -21,    56,     -7,     55,     51,     -61,    74,     111,
+    -71,    35,     124,    -123,   -3,     62,     -79,    100,    49,
+    -122,   143,    79,     -137,   72,     30,     -82,    75,     -10,
+    -48,    35,     -23,    -25,    34,     0,      -54,    -6,     34,
+    -46,    -59,    -7,     -72,    -6,     70,     -41,    -39,    23,
+    -33,    11,     104,    -44,    -30,    54,     -69,    -20,    62,
+    -75,    1,      45,     -69,    1,      40,     -59,    -15,    18,
+    -16,    38,     -1,     -52,    8,      14,     -32,    11,     -15,
+    -58,    18,     -22,    -44,    69,     40,     -50,    -21,    1,
+    -35,    -3,     -5,     -20,    40,     36,     -41,    -36,    -43,
+    -11,    48,     -34,    -40,    51,     -10,    -9,     30,     10,
+    12,     51,     51,     -8,     -16,    32,     -6,     31,     24,
+    -38,    43,     18,     -15,    53,     -10,    -55,    9,      8,
+    -28,    21,     10,     -26,    21,     10,     -9,     5,      -29,
+    -13,    38,     -1,     -11,    49,     0,      -41,    10,     23,
+    -25,    -35,    -2,     -32,    -10,    58,     -6,     -18,    16,
+    -9,     4,      11,     17,     21,     21,     12,     -2,     49,
+    -16,    -128,   21,     75,     -32,    22,     34,     -59,    48,
+    75,     -69,    -11,    -2,     -65,    39,     57,     -54,    -79,
+    -11,    -20,    -13,    38,     4,      -9,     -22,    -22,    33,
+    -7,     -52,    10,     -10,    -19,    54,     47,     -21,    -35,
+    -6,     -4,     11,     8,      -28,    1,      8,      -4,     30,
+    1,      -22,    26,     -7,     -24,    56,     25,     -45,    13,
+    24,     -32,    13,     22,     -46,    -2,     15,     -39,    28,
+    32,     -69,    0,      27,     -69,    0,      39,     -40,    28,
+    55,     -27,    -13,    0,      -14,    37,     25,     -25,    34,
+    -3,     -69,    26,     39,     -41,    -6,     29,     -7,     5,
+    66,     41,     -27,    -17,    6,      -14,    -21,    0,      29,
+    -9,     -26,    32,     -5,     -34,    60,     15,     -60,    20,
+    13,     11,     43,     -48,    -15,    88,     -13,    -55,    26,
+    -32,    -46,    35,     14,     -37,    -11,    12,     -20,    11,
+    9,      -64,    -16,    17,     5,      38,     7,      -30,    -9,
+    -49,    -11,    52,     -15,    -38,    -27,    -12,    36,     53,
+    1,      -37,    -17,    -12,    0,      31,     1,      13,     40,
+    -15,    2,      47,     -15,    -17,    28,     -2,     -4,     25,
+    -6,     -12,    2,      -17,    -9,     5,      -15,    17,     21,
+    -28,    0,      15,     -43,    -63,    -6,     -14,    -8,     37,
+    -34,    -40,    30,     -12,    -14,    37,     -13,    -16,    26,
+    -15,    -2,     13,     -37,    -13,    32,     13,     -8,     -2,
+    -12,    -8,     9,      9,      -3,     4,      13,     34,     -2,
+    -22,    40,     19,     29,     25,     -48,    -17,    23,     17,
+    7,      3,      0,      12,     37,     -1,     -25,    30,     41,
+    -7,     7,      29,     -31,    -31,    -23,    -27,    5,      2,
+    -18,    -2,     22,     9,      -6,     5,      -7,     -24,    9,
+    0,      -28,    19,     61,     -11,    -45,    21,     -28,    -65,
+    28,     33,     -44,    -27,    -6,     -26,    -8,     4,      5,
+    9,      -10,    -46,    -20,    20,     -7,     -7,     -33,    -26,
+    50,     9,      -65,    -22,    -3,     -20,    15,     21,     20,
+    24,     -16,    -27,    -13,    14,     21,     -38,    -48,    9,
+    35,     28,     21,     3,      -31,    -8,     57,     32,     -35,
+    -22,    20,     14,     12,     28,     39,     0,      -18,    44,
+    -2,     -17,    53,     0,      -27,    33,     43,     5,      -10,
+    25,     47,     -3,     -4,     36,     15,     -12,    -3,     29,
+    41,     23,     23,     -8,     -32,    15,     37,     0,      3,
+    22,     31,     1,      -20,    27,     2,      -50,    0,      33,
+    16,     -16,    -17,    18,     -26,    -34,    31,     -27,    -84,
+    -33,    4,      -5,     -22,    -17,    -28,    -66,    -24,    8,
+    -16,    -25,    -51,    -13,    45,     -11,    -49,    -26,    -49,
+    -38,    21,     10,     -52,    -58,    -19,    -4,     9,      -31,
+    -29,    55,     2,      -45,    29,     10,     -22,    49,     33,
+    -27,    -19,    -5,     30,     47,     11,     -11,    -2,     8,
+    5,      17,     8,      3,      57,     63,     28,     24,     11,
+    2,      14,     22,     7,      7,      2,      23,     33,     -2,
+    -8,     14,     7,      20,     57,     32,     -5,     12,     23,
+    10,     17,     26,     -18,    -72,    -6,     74,     61,     13,
+    -17,    -21,    -7,     29,     45,     5,      -52,    -49,    1,
+    10,     35,     40,     -46,    -66,    7,      31,     -27,    -44,
+    -12,    -41,    -22,    32,     -12,    -32,    -3,     -17,    -22,
+    -22,    -31,    -30,    -23,    -13,    3,      0,      -21,    -19,
+    -7,     -17,    -9,     18,     -40,    -64,    1,      4,      -4,
+    8,      -17,    -28,    -1,     9,      -7,     -9,     27,     6,
+    -63,    -32,    52,     25,     -46,    -23,    -6,     -11,    35,
+    29,     -50,    -44,    17,     -6,     -12,    53,     28,     -17,
+    -9,     28,     34,     -20,    -18,    22,     43,     28,     -6,
+    8,      14,     19,     28,     14,     27,     26,     12,     76,
+    66,     -18,    -2,     18,     -12,    -1,     -2,     -1,     51,
+    30,     -18,    5,      14,     -12,    2,      13,     -25,    -9,
+    32,     7,      -5,     15,     -12,    -33,    -18,    -13,    6,
+    0,      -25,    -12,    1,      -17,    0,      13,     -24,    -27,
+    4,      35,     14,     -22,    5,      13,     -18,    -30,    -10,
+    -7,     -7,     31,     23,     -27,    -26,    9,      47,     6,
+    -50,    -11,    19,     1,      11,     12,     -19,    -43,    -18,
+    10,     -6,     -3,     12,     2,      -12,    -16,    10,     9,
+    -25,    -21,    -10,    -13,    0,      8,      -1,     -9,     10,
+    4,      -34,    14,     46,     5,      18,     24,     -15,    -7,
+    20,     -1,     -13,    7,      11,     14,     11,     -2,     8,
+    27,     10,     -1,     13,     -2,     -7,     48,     44,     -15,
+    -16,    -6,     3,      7,      -35,    -25,    8,      -31,    -16,
+    30,     36,     22,     -13,    -21,    -10,    8,      2,      -58,
+    -37,    32,     25,     -1,     -25,    -21,    3,      3,      -6,
+    -11,    -3,     2,      4,      34,     22,     -25,    -19,    0,
+    -6,     -10,    -8,     -35,    -32,    8,      -3,     -20,    -11,
+    -6,     3,      8,      -8,     3,      25,     23,     -7,     -35,
+    -15,    8,      -20,    -6,     15,     -44,    -29,    19,     -5,
+    -1,     18,     28,     6,      -21,    9,      11,     -20,    -10,
+    18,     22,     6,      -2,     12,     6,      23,     34,     -20,
+    -19,    1,      -10,    34,     41,     13,     6,      3,      22,
+    11,     -4,     4,      -12,    -8,     17,     18,     12,     -1,
+    5,      9,      -6,     -2,     4,      1,      3,      2,      -6,
+    -32,    -25,    9,      18,     27,     -4,     -54,    -29,    2,
+    -3,     -18,    -38,    -28,    -10,    9,      20,     5,      -9,
+    -15,    -3,     2,      -14,    -15,    -6,     5,      10,     6,
+    3,      -11,    -9,     -5,     -20,    -13,    8,      3,      -14,
+    6,      20,     -15,    -21,    9,      19,     21,     12,     -4,
+    -21,    -17,    16,     27,     -4,     -28,    -2,     26,     9,
+    -12,    -16,    -28,    -28,    -4,     4,      -15,    -9,     3,
+    -10,    -16,    2,      17,     -10,    -26,    3,      16,     26,
+    17,     -12,    -9,     2,      -2,     -5,     -11,    5,      28,
+    1,      -14,    13,     14,     5,      18,     6,      -17,    -5,
+    7,      2,      -3,     11,     10,     -1,     50,     36,     -28,
+    21,     39,     -9,     -6,     2,      10,     36,     20,     -2,
+    -3,     -11,    -10,    -6,     -5,     -4,     -8,     2,      17,
+    1,      -13,    11,     -13,    -36,    11,     14,     -19,    -6,
+    3,      0,      20,     -5,     -24,    12,     7,      -11,    2,
+    -15,    -28,    -1,     6,      -14,    -31,    -39,    -19,    19,
+    37,     3,      -32,    -27,    -6,     13,     31,     15,     -41,
+    -41,    25,     35,     -3,     -16,    -25,    -19,    -10,    -3,
+    19,     10,     -4,     7,      -4,     -19,    -12,    -13,    -9,
+    6,      2,      -12,    -6,     12,     6,      -1,     -5,     -19,
+    -7,     7,      40,     56,     -3,     -13,    21,     24,     7,
+    -11,    -9,     -3,     24,     28,     -10,    1,      12,     21,
+    24,     -16,    -15,    4,      -7,     -2,     19,     13,     -11,
+    -7,     -8,     15,     41,     5,      -16,    -18,    -11,    26,
+    26,     -5,     -12,    -14,    -6,     10,     8,      -8,     -16,
+    -16,    -3,     10,     1,      -3,     -3,     -2,     -15,    -18,
+    6,      -4,     -4,     21,     4,      -2,     15,     13,     0,
+    -2,     12,     7,      -15,    -9,     1,      -2,     2,      -1,
+    -9,     -15,    -17,    -14,    -10,    1,      -4,     -16,    -17,
+    -1,     18,     8,      1,      22,     11,     -19,    -10,    4,
+    -23,    -29,    0,      -2,     -14,    -6,     13,     7,      -23,
+    -13,    10,     9,      11,     10,     4,      -4,     -4,     1,
+    6,      14,     9,      2,      0,      2,      6,      4,      -9,
+    -18,    -8,     8,      18,     8,      13,     9,      -27,    -22,
+    -10,    -24,    -9,     17,     11,     2,      9,      3,      -13,
+    -10,    -1,     -7,     -1,     10,     -4,     1,      16,     12,
+    -6,     -14,    -2,     -5,     -1,     0,      -1,     6,      -9,
+    -3,     12,     4,      1,      -2,     2,      17,     24,     22,
+    9,      8,      21,     14,     -2,     -2,     4,      -1,     -7,
+    -7,     -6,     -1,     -6,     17,     30,     -7,     -10,    -3,
+    -19,    -18,    2,      21,     4,      -20,    -6,     -1,     -18,
+    -14,    -6,     -7,     -1,     6,      10,     8,      -5,     0,
+    10,     -22,    -40,    -22,    4,      34,     16,     -19,    -16,
+    -12,    -17,    -16,    -17,    -29,    -28,    -4,     10,     16,
+    22,     13,     4,      -1,     -5,     16,     15,     -11,    -6,
+    9,      3,      -14,    -22,    -19,    -12,    5,      -5,     -15,
+    3,      9,      27,     17,     -4,     8,      -2,     1,      16,
+    11,     9,      9,      8,      -14,    -16,    7,      -5,     -15,
+    -11,    -5,     19,     25,     25,     43,     21,     -9,     -9,
+    -19,    -10,    14,     -11,    -19,    8,      3,      1,      11,
+    -1,     -24,    -20,    -1,     2,      7,      24,     22,     11,
+    8,      6,      -2,     -11,    -3,     -2,     -4,     0,      -7,
+    0,      6,      -1,     -16,    -35,    -8,     8,      -11,    -6,
+    6,      18,     16,     7,      12,     5,      -2,     -3,     -10,
+    -21,    -27,    -10,    -3,     -3,     8,      0,      -9,     -10,
+    -3,     0,      -5,     6,      9,      19,     23,     8,      -5,
+    -19,    -16,    -5,     -6,     -27,    -22,    1,      6,      8,
+    2,      -9,     -13,    -15,    -18,    -13,    4,      25,     29,
+    26,     -2,     -22,    1,      8,      1,      -6,     -6,     -7,
+    -20,    0,      13,     -14,    -24,    -24,    -21,    2,      14,
+    16,     23,     15,     10,     10,     5,      0,      -26,    -32,
+    3,      19,     5,      -8,     -7,     -8,     -3,     17,     27,
+    -7,     -28,    10,     32,     10,     1,      10,     3,      -4,
+    22,     24,     -31,    -40,    0,      6,      5,      17,     17,
+    1,      10,     30,     8,      -12,    -6,     9,      6,      -12,
+    -5,     1,      -4,     6,      11,     0,      -9,     -4,     -3,
+    -4,     -3,     2,      0,      -2,     -9,     -27,    -23,    2,
+    13,     -6,     -9,     -3,     -12,    -2,     10,     6,      -7,
+    -19,    -31,    -13,    16,     11,     -3,     -13,    -15,    0,
+    7,      -3,     -7,     -1,     -4,     7,      15,     0,      -12,
+    -8,     -1,     -7,     -12,    -21,    -17,    5,      30,     25,
+    -6,     -6,     0,      -12,    -8,     2,      13,     11,     1,
+    5,      4,      4,      10,     -1,     -20,    -12,    -4,     3,
+    15,     11,     -7,     -24,    -4,     8,      -2,     -14,    -25,
+    -17,    7,      21,     14,     1,      0,      12,     17,     13,
+    6,      1,      6,      14,     11,     -10,    -21,    -12,    -4,
+    3,      -2,     -21,    -24,    -2,     12,     14,     17,     4,
+    -2,     11,     11,     11,     1,      -34,    -32,    -5,     10,
+    7,      -11,    -12,    6,      7,      -4,     -10,    -15,    -5,
+    17,     21,     0,      -15,    -15,    -1,     5,      -18,    -18,
+    -10,    -9,     24,     27,     -9,     -14,    0,      9,      25,
+    22,     1,      -7,     -2,     16,     13,     -14,    -10,    7,
+    0,      2,      15,     2,      -9,     5,      10,     -5,     -3,
+    10,     3,      0,      15,     15,     -1,     -3,     8,      6,
+    -7,     -7,     2,      0,      -4,     5,      -8,     -37,    -28,
+    -1,     8,      6,      10,     -1,     -12,    12,     28,     8,
+    -17,    -16,    -15,    -17,    1,      6,      -4,     -8,     -4,
+    -15,    -15,    6,      -9,     -15,    10,     9,      -13,    -8,
+    5,      -2,     -10,    5,      12,     -27,    -33,    9,      8,
+    -16,    -3,     16,     -3,     -7,     22,     22,     10,     5,
+    -11,    -16,    -4,     9,      12,     6,      -3,     2,      2,
+    -1,     4,      -7,     -8,     1,      8,      19,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..33aeea516fb8c7fcb080b3b971bf5d69b81b9c4c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_yes_1000ms_sample_data_size;
+extern const int16_t g_yes_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
index 47ac85c605488bdaa30515325122019a2d88678f..451eed28528fa58c56af879bf556c395aaf7b6cf 100644
--- a/tensorflow/lite/experimental/micro/kernels/BUILD
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -22,7 +22,6 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
         "//tensorflow/lite/kernels:padding",
@@ -43,7 +42,42 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         ":micro_ops",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "portable_optimized_micro_ops",
+    srcs = [
+        "fully_connected.cc",
+        "portable_optimized/depthwise_conv.cc",
+        "softmax.cc",
+    ],
+    hdrs = [
+    ],
+    copts = tflite_copts(),
+    deps = [
         "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels:padding",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "//tensorflow/lite/kernels/internal:tensor",
+    ],
+)
+
+cc_library(
+    name = "portable_optimized_ops_resolver",
+    srcs = [
+        "all_ops_resolver.cc",
+    ],
+    hdrs = [
+        "all_ops_resolver.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":portable_optimized_micro_ops",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
@@ -61,6 +95,19 @@ tflite_micro_cc_test(
     ],
 )
 
+tflite_micro_cc_test(
+    name = "portable_optimized_depthwise_conv_test",
+    srcs = [
+        "depthwise_conv_test.cc",
+    ],
+    deps = [
+        ":portable_optimized_ops_resolver",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
 tflite_micro_cc_test(
     name = "fully_connected_test",
     srcs = [
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
index 05ba8798c0dc34eab5c563489cf9fc928325d00f..ff952b39c001be0d0f757d2de130f9c0df27c543 100644
--- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -50,6 +50,11 @@ void TestDepthwiseConvFloat(std::initializer_list<int> input_dims_data,
       CreateFloatTensor(output_data, output_dims, "output_tensor"),
   };
 
+  // Place a unique value in the uninitialized output buffer.
+  for (int i = 0; i < output_dims_count; ++i) {
+    output_data[i] = 23;
+  }
+
   TfLiteContext context;
   PopulateContext(tensors, tensors_size, &context);
 
@@ -403,4 +408,84 @@ TF_LITE_MICRO_TEST(SimpleTestReluQuantized) {
       kTfLiteActRelu, output_data);
 }
 
+TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = 0;
+  const float input_max = 255.0f;
+  const float filter_min = -63.5f;
+  const float filter_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 128.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 9;
+  uint8_t output_data[output_dims_count];
+
+  tflite::testing::TestDepthwiseConvQuantized(  //
+      {4, 1, 1, 9, 1},                          // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),
+          F2Q(2, input_min, input_max),
+          F2Q(7, input_min, input_max),
+          F2Q(8, input_min, input_max),
+          F2Q(3, input_min, input_max),
+          F2Q(4, input_min, input_max),
+          F2Q(9, input_min, input_max),
+          F2Q(10, input_min, input_max),
+          F2Q(5, input_min, input_max),
+          F2Q(6, input_min, input_max),
+          F2Q(11, input_min, input_max),
+          F2Q(12, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {4, 2, 1, 8, 1},       // Filter shape.
+      {
+          // Filter values.
+          F2Q(1, filter_min, filter_max),
+          F2Q(2, filter_min, filter_max),
+          F2Q(3, filter_min, filter_max),
+          F2Q(4, filter_min, filter_max),
+          F2Q(-9, filter_min, filter_max),
+          F2Q(10, filter_min, filter_max),
+          F2Q(-11, filter_min, filter_max),
+          F2Q(12, filter_min, filter_max),
+          F2Q(5, filter_min, filter_max),
+          F2Q(6, filter_min, filter_max),
+          F2Q(7, filter_min, filter_max),
+          F2Q(8, filter_min, filter_max),
+          F2Q(13, filter_min, filter_max),
+          F2Q(-14, filter_min, filter_max),
+          F2Q(15, filter_min, filter_max),
+          F2Q(-16, filter_min, filter_max),
+      },
+      filter_min, filter_max,  // Filter quantization range.
+      {1, 1},                  // Bias shape.
+      {
+          // Bias values.
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+          F2Q32(4, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          220,
+          184,
+          140,
+          150,
+          161,
+          200,
+          172,
+          148,
+          133,
+      },
+      {4, 1, 1, 9, 1},         // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1ddf6b0733b07fa57c505e95e79d03c9ca0f1ce
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc
@@ -0,0 +1,439 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace depthwise_conv {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// Size of the cached buffer we'll be using to hold reordered weights.
+constexpr int kReshapedFilterDataSize = 1 * 1024;
+
+struct OpData {
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
+  data->padding.width =
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+    const TfLiteTensor* bias =
+        GetOptionalInputTensor(context, node, kBiasTensor);
+    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
+// Specialized implementation of the depthwise convolution operation designed to
+// work with the particular filter width of eight used by the default micro
+// speech sample code. It uses 1KB of RAM to hold reordered weight parameters,
+// converted from TFLite's NHWC format to NCHW format, and expressed as signed
+// eight bit integers, rather than unsigned. Care must be taken when calling
+// this not to use it for more than one node since there's only a single static
+// buffer holding the weights. You should use this implementation if depthwise
+// convolutions are a performance bottleneck, you have a layer that meets the
+// parameter requirements, and the extra RAM usage and additional code size are
+// not an issue.
+static inline void DepthwiseConvOptimizedForFilterWidthEight(
+    TfLiteContext* context, const DepthwiseParams& params,
+    const RuntimeShape& input_shape, const uint8* input_data,
+    const RuntimeShape& filter_shape, const uint8* filter_data,
+    const RuntimeShape& bias_shape, const int32* bias_data,
+    const RuntimeShape& output_shape, uint8* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  static int8_t reshaped_filter_data[kReshapedFilterDataSize];
+  const int needed_size =
+      output_depth * filter_width * filter_height * input_depth;
+  if (needed_size > kReshapedFilterDataSize) {
+    context->ReportError(
+        context,
+        "Size too large for reshaped weight buffer (%d needed, %d available)",
+        needed_size, kReshapedFilterDataSize);
+    return;
+  }
+
+  RuntimeShape reshaped_filter_shape;
+  reshaped_filter_shape.BuildFrom(
+      {1, output_depth, filter_height, filter_width});
+
+  // If this is the first time through, repack the weights into a cached buffer
+  // so that they can be accessed sequentially.
+  static bool is_reshaped_filter_initialized = false;
+  if (!is_reshaped_filter_initialized) {
+    for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+      for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+        for (int oc = 0; oc < output_depth; ++oc) {
+          const uint8* current_filter =
+              filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc);
+          int8* reshaped_filter =
+              reshaped_filter_data +
+              Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x);
+          *reshaped_filter = (int32_t)(*current_filter) + filter_offset;
+        }
+      }
+    }
+    is_reshaped_filter_initialized = true;
+  }
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32 acc = 0;
+            int in_y_start = in_y_origin;
+            int filter_y_start = 0;
+            if (in_y_origin < 0) {
+              in_y_start = 0;
+              filter_y_start = 0 - in_y_origin;
+            }
+            int filter_y_end = filter_height;
+            if ((in_y_origin + filter_height) >= input_height) {
+              filter_y_end -= (in_y_origin + filter_height) - input_height;
+            }
+            int in_y = in_y_start;
+            int in_x_start = in_x_origin;
+            int filter_x_start = 0;
+            bool is_out_of_x_bounds = false;
+            if (in_x_origin < 0) {
+              in_x_start = 0;
+              filter_x_start = 0 - in_x_origin;
+              is_out_of_x_bounds = true;
+            }
+            int filter_x_end = filter_width;
+            if ((in_x_origin + filter_width) >= input_width) {
+              filter_x_end -= (in_x_origin + filter_width) - input_width;
+              is_out_of_x_bounds = true;
+            }
+            for (int filter_y = filter_y_start; filter_y < filter_y_end;
+                 ++filter_y, ++in_y) {
+              const uint8* current_input =
+                  input_data + Offset(input_shape, b, in_y, in_x_start, ic);
+              if ((filter_width == 8) && !is_out_of_x_bounds) {
+                int8* current_filter =
+                    reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc,
+                                                  filter_y, filter_x_start);
+                const uint32_t input_vals0 =
+                    *reinterpret_cast<const uint32_t*>(current_input);
+                current_input += 4;
+                const int32_t filter_vals0 =
+                    *reinterpret_cast<const int32_t*>(current_filter);
+                current_filter += 4;
+                const uint8 input_val0 = input_vals0 & 0xff;
+                const int8 filter_val0 = filter_vals0 & 0xff;
+                acc += filter_val0 * input_val0;
+                const uint8 input_val1 = (input_vals0 >> 8) & 0xff;
+                const int8 filter_val1 = (filter_vals0 >> 8) & 0xff;
+                acc += filter_val1 * input_val1;
+                const uint8 input_val2 = (input_vals0 >> 16) & 0xff;
+                const int8 filter_val2 = (filter_vals0 >> 16) & 0xff;
+                acc += filter_val2 * input_val2;
+                const uint8 input_val3 = (input_vals0 >> 24) & 0xff;
+                const int8 filter_val3 = (filter_vals0 >> 24) & 0xff;
+                acc += filter_val3 * input_val3;
+
+                const uint32_t input_vals1 =
+                    *reinterpret_cast<const uint32_t*>(current_input);
+                const int32_t filter_vals1 =
+                    *reinterpret_cast<const int32_t*>(current_filter);
+                const uint8 input_val4 = input_vals1 & 0xff;
+                const int8 filter_val4 = filter_vals1 & 0xff;
+                acc += filter_val4 * input_val4;
+                const uint8 input_val5 = (input_vals1 >> 8) & 0xff;
+                const int8 filter_val5 = (filter_vals1 >> 8) & 0xff;
+                acc += filter_val5 * input_val5;
+                const uint8 input_val6 = (input_vals1 >> 16) & 0xff;
+                const int8 filter_val6 = (filter_vals1 >> 16) & 0xff;
+                acc += filter_val6 * input_val6;
+                const uint8 input_val7 = (input_vals1 >> 24) & 0xff;
+                const int8 filter_val7 = (filter_vals1 >> 24) & 0xff;
+                acc += filter_val7 * input_val7;
+              } else {
+                const uint8* current_filter =
+                    filter_data +
+                    Offset(filter_shape, 0, filter_y, filter_x_start, oc);
+                for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                     ++filter_x) {
+                  int32 input_val = *current_input;
+                  current_input += input_depth;
+                  int32 filter_val = *current_filter;
+                  current_filter += output_depth;
+                  acc +=
+                      (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[oc];
+            }
+            acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                DepthwiseConvOutputRounding::kAwayFromZero>(
+                acc, output_multiplier, output_shift);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                static_cast<uint8>(acc);
+          }
+        }
+      }
+    }
+  }
+}  // namespace
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
+  const int32_t input_offset = -input->params.zero_point;
+  const int32_t filter_offset = -filter->params.zero_point;
+  const int32_t output_offset = output->params.zero_point;
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data->output_shift;
+
+  // Figure out if we can use the optimized path for this set of parameters.
+  const int filter_width = GetTensorShape(filter).Dims(2);
+  const int input_depth = GetTensorShape(input).Dims(3);
+  const int output_depth = GetTensorShape(filter).Dims(3);
+  const int filter_height = GetTensorShape(filter).Dims(1);
+  const int needed_size =
+      output_depth * filter_width * filter_height * input_depth;
+  bool use_optimized_path = false;
+  if ((filter_width == 8) && (input_offset == 0) && (filter_offset == -127) &&
+      (input_depth == 1) && (needed_size <= kReshapedFilterDataSize)) {
+    // FIXME(petewarden) - We need a more robust way of handling this, ideally
+    // with an allocation mechanism available through the context API.
+    // Use the address of the node as a proxy for its identity, since we need
+    // to ensure the weight values are consistent between calls, and there's
+    // no easy way to do that quickly other than relying on the identity of
+    // the owning node.
+    static TfLiteNode* initialized_node_address = node;
+    if (initialized_node_address == node) {
+      use_optimized_path = true;
+    } else {
+      static bool has_warned = false;
+      if (!has_warned) {
+        context->ReportError(
+            context,
+            "Multiple depthwise conv ops match optimization parameters, but "
+            "only the first will use the fast path, because there's only one "
+            "RAM cache available");
+        has_warned = true;
+      }
+    }
+  }
+  if (use_optimized_path) {
+    DepthwiseConvOptimizedForFilterWidthEight(
+        context, op_params, GetTensorShape(input),
+        GetTensorData<uint8_t>(input), GetTensorShape(filter),
+        GetTensorData<uint8_t>(filter), GetTensorShape(bias),
+        GetTensorData<int32_t>(bias), GetTensorShape(output),
+        GetTensorData<uint8_t>(output));
+  } else {
+    tflite::reference_ops::DepthwiseConv(
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),
+        GetTensorShape(output), GetTensorData<uint8_t>(output));
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+  const TfLiteType data_type = input->type;
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int out_width = ComputeOutSize(params->padding, width, filter_width,
+                                 params->stride_width);
+  int out_height = ComputeOutSize(params->padding, height, filter_height,
+                                  params->stride_height);
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
+                                        filter_width, filter_height, out_width,
+                                        out_height, data_type, data));
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat(context, node, params, data, input, filter, bias, output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized(context, node, params, data, input, filter, bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
+                                 depthwise_conv::Prepare, depthwise_conv::Eval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/micro/testing/micro_test.h b/tensorflow/lite/experimental/micro/testing/micro_test.h
index 2f20dd5ac77dfd3f304c7cc93be0b865a0c2f0cb..32e9a57f76ecc055c67c0ede8d1c83550c602aab 100644
--- a/tensorflow/lite/experimental/micro/testing/micro_test.h
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.h
@@ -107,13 +107,13 @@ extern tflite::ErrorReporter* reporter;
     }                                                                          \
   } while (false)
 
-#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
-  do {                                                                        \
-    if ((x) != (y)) {                                                         \
-      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
-                                   __LINE__);                                 \
-      micro_test::did_test_fail = true;                                       \
-    }                                                                         \
+#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                          \
+  do {                                                                         \
+    if ((x) != (y)) {                                                          \
+      micro_test::reporter->Report(#x " == " #y " failed at %s:%d (%d vs %d)", \
+                                   __FILE__, __LINE__, (x), (y));              \
+      micro_test::did_test_fail = true;                                        \
+    }                                                                          \
   } while (false)
 
 #define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
diff --git a/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1647cf82a276d7c1725c7c3334693e0e1b7e057c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
diff --git a/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dcec7269bdc95ab57204f3b4cbc17f9d3cacadc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Creates the project file distributions for the TensorFlow Lite Micro test and
+# example targets aimed at embedded platforms.
+#
+# Usage: ci_build_micro_projects.sh <TARGET OS> <TAGS>
+#
+# For example:
+# ci_build_micro_projects.sh mbed "CMSIS disco_f746ng"
+
+set -e
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../../..
+cd ${ROOT_DIR}
+pwd
+
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile \
+  TARGET=${1} \
+  TAGS="${2}" \
+  generate_projects
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index fde195118b18ca308940292c7bd5706ecace8563..e11e8a8cf092e0244e2d5b04dd48ae51560f4ca8 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -30,18 +30,26 @@ TARGET_ARCH := $(HOST_ARCH)
 # STM32F746NG board, using the CMSIS library's implementations where possible.
 ALL_TAGS := $(TAGS) $(TARGET)
 
+# This is obviously horrible.  We need to generate these 3 versions of the
+# include directories from one source.
 INCLUDES := \
 -I. \
--I$(MAKEFILE_DIR)/../../../../../ \
--I$(MAKEFILE_DIR)/../../../../../../ \
--I$(MAKEFILE_DIR)/../../../../../../../ \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
--I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
--I$(OBJDIR)
-# This is at the end so any globally-installed frameworks like protobuf don't
-# override local versions in the source tree.
-INCLUDES += -I/usr/local/include
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include
+
+# Same list of paths, but now relative to the generated project files.
+GENERATED_PROJECT_INCLUDES := \
+-I. \
+-I./third_party/gemmlowp \
+-I./third_party/flatbuffers/include
+
+# Same list of paths, but now in the format the generate_keil_project.py
+# script expects them.
+PROJECT_INCLUDES := \
+. \
+third_party/gemmlowp \
+third_party/flatbuffers/include
 
 TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_linux_binary.sh
 
@@ -79,7 +87,6 @@ tensorflow/lite/core/api/op_resolver.cc \
 tensorflow/lite/kernels/kernel_util.cc \
 tensorflow/lite/kernels/internal/quantization_util.cc
 MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
-MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS))
 
 MICROLITE_CC_HDRS := \
 $(wildcard tensorflow/lite/experimental/micro/*.h) \
@@ -118,12 +125,24 @@ third_party/flatbuffers/LICENSE.txt
 
 MAKE_PROJECT_FILES := \
   README_MAKE.md \
-  Makefile
+  Makefile \
+  .vscode/tasks.json
 
 MBED_PROJECT_FILES := \
   README_MBED.md \
   mbed-os.lib \
-  mbed_app.json
+  mbed_app.json \
+  .vscode/tasks.json
+
+KEIL_PROJECT_FILES := \
+  README_KEIL.md \
+  keil_project.uvprojx
+
+ALL_PROJECT_TARGETS :=
+
+KEIL_PROJECT_FILES := \
+  README_KEIL.md \
+  keil_project.uvprojx
 
 # These target-specific makefiles should modify or replace options like
 # CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
@@ -131,6 +150,9 @@ MBED_PROJECT_FILES := \
 # keep this main makefile focused on the sources and dependencies.
 include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
+# Call specialize here so that platform-specific tags can be taken into account.
+MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS))
+
 ALL_TAGS += $(TARGET_ARCH)
 
 ALL_SRCS := \
@@ -156,8 +178,8 @@ include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
 
-MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
-$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
+MICROLITE_LIB_OBJS += $(addprefix $(OBJDIR), \
+$(patsubst %.S,%.o,$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(THIRD_PARTY_CC_SRCS)))))
 
 # For normal manually-created TensorFlow C++ source files.
 $(OBJDIR)%.o: %.cc
@@ -197,11 +219,21 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
+# snease: Add %.bin rule here since BINDIR is now defined
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+$(BINDIR)%.bin: $(BINDIR)%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
 # Generate standalone makefile projects for all of the test targets.
 $(foreach TEST_TARGET,$(MICROLITE_TEST_SRCS),\
 $(eval $(call microlite_test,$(notdir $(basename $(TEST_TARGET))),$(TEST_TARGET))))
 
-test: test_micro_speech $(MICROLITE_TEST_TARGETS)
+test: $(MICROLITE_TEST_TARGETS)
+
+generate_projects: $(ALL_PROJECT_TARGETS)
 
 # Gets rid of all generated files.
 clean:
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
index 82c15e32f6572f36588945431918cf75299d3a64..7a434e5ca81596b651992219d433cd856c94e7b1 100755
--- a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -34,10 +34,19 @@ FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff68
 CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
 STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
 SIFIVE_FE310_LIB_URL="https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip"
-RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+AM_SDK_URL="http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.0.0.zip"
 AP3_URL="https://github.com/AmbiqMicro/TFLiteMicro_Apollo3/archive/dfbcef9a57276c087d95aab7cb234f1d4c9eaaba.zip"
 CUST_CMSIS_URL="https://github.com/AmbiqMicro/TFLiteMicro_CustCMSIS/archive/8f63966c5692e6a3a83956efd2e4aed77c4c9949.zip"
-GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
+KISSFFT_URL="https://github.com/mborgerding/kissfft/archive/v130.zip"
+SPARKFUN_EDGE_BSP_URL="https://github.com/sparkfun/SparkFun_Edge_BSP/archive/620f5f7a69fc69e38cda8132b69302d9c28ba0dd.zip"
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-8.1.0-2019.01.0-x86_64-apple-darwin.tar.gz"
+  GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-mac.tar.bz2"
+else
+  RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+  GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
+fi
 
 download_and_extract() {
   local usage="Usage: download_and_extract URL DIR"
@@ -72,37 +81,58 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
-patch_apollo3_sdk() {
-  local ap3_dir="${1}"
-  if [ ! -f ${ap3_dir}/VERSION.txt ]; then
-    echo "Could not find ${ap3_dir}, skipping Apollo3 SDK";
+patch_am_sdk() {
+  local am_dir="${1}"
+  if [ ! -f ${am_dir}/VERSION.txt ]; then
+    echo "Could not find ${am_dir}, skipping AmbiqMicro SDK patch";
     return;
   fi
-  local src_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc
-  local dest_dir=${ap3_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+
+  local src_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc
+  local dest_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+
   rm -rf ${dest_dir}
   mkdir ${dest_dir}
+
   cp "${src_dir}/startup_gcc.c" "${dest_dir}/startup_gcc.c"
   cp "${src_dir}/hello_world.ld" "${dest_dir}/apollo3evb.ld"
-  sed -i -e '131s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
-  sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+
+  sed -i -e '114s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
+  #sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+
   sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "${dest_dir}/apollo3evb.ld"
   sed -i -e '3s/startup_gnu/startup_gcc/g' "${dest_dir}/apollo3evb.ld"
-  sed -i -e '6s/am_reset_isr/Reset_Handler/g' "${dest_dir}/apollo3evb.ld"
-  sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "${dest_dir}/apollo3evb.ld"
-  sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "${dest_dir}/apollo3evb.ld"
+  sed -i -e $'22s/\*(.text\*)/\*(.text\*)\\\n\\\n\\\t\/\* These are the C++ global constructors.  Stick them all here and\\\n\\\t \* then walk through the array in main() calling them all.\\\n\\\t \*\/\\\n\\\t_init_array_start = .;\\\n\\\tKEEP (\*(SORT(.init_array\*)))\\\n\\\t_init_array_end = .;\\\n\\\n\\\t\/\* XXX Currently not doing anything for global destructors. \*\/\\\n/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e $'70s/} > SRAM/} > SRAM\\\n    \/\* Add this to satisfy reference to symbol "end" from libnosys.a(sbrk.o)\\\n     \* to denote the HEAP start.\\\n     \*\/\\\n   end = .;/g' "${dest_dir}/apollo3evb.ld"
+
+  # Workaround for bug in 2.0.0 SDK, remove once that's fixed.
+  sed -i -e $'s/#ifndef AM_HAL_GPIO_H/#ifdef __cplusplus\\\nextern "C" {\\\n#endif\\\n#ifndef AM_HAL_GPIO_H/g' ${am_dir}/mcu/apollo3/hal/am_hal_gpio.h
+
   echo "Finished preparing Apollo3 files"
 }
 
+patch_kissfft() {
+  sed -i -E $'s@#ifdef FIXED_POINT@// Patched automatically by download_dependencies.sh so default is 16 bit.\\\n#ifndef FIXED_POINT\\\n#define FIXED_POINT (16)\\\n#endif\\\n// End patch.\\\n\\\n#ifdef FIXED_POINT@g' tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_MALLOC malloc@#define KISS_FFT_MALLOC(X) (void*)(0) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_FREE free@#define KISS_FFT_FREE(X) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -ir -E "s@(fprintf.*\);)@/* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  sed -ir -E "s@(exit.*\);)@return; /* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  echo "Finished patching kissfft"
+}
+
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
 download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
 download_and_extract "${SIFIVE_FE310_LIB_URL}" "${DOWNLOADS_DIR}/sifive_fe310_lib"
 download_and_extract "${RISCV_TOOLCHAIN_URL}" "${DOWNLOADS_DIR}/riscv_toolchain"
+download_and_extract "${AM_SDK_URL}" "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
+patch_am_sdk "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
 download_and_extract "${AP3_URL}" "${DOWNLOADS_DIR}/apollo3_ext"
-patch_apollo3_sdk "${DOWNLOADS_DIR}/Apollo3-SDK-2018.08.13"
 download_and_extract "${CUST_CMSIS_URL}" "${DOWNLOADS_DIR}/CMSIS_ext"
 download_and_extract "${GCC_EMBEDDED_URL}" "${DOWNLOADS_DIR}/gcc_embedded"
+download_and_extract "${KISSFFT_URL}" "${DOWNLOADS_DIR}/kissfft"
+patch_kissfft "${DOWNLOADS_DIR}/kissfft"
+download_and_extract "${SPARKFUN_EDGE_BSP_URL}" "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0/boards/SparkFun_TensorFlow_Apollo3_BSP"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb0c632e24b139a5a3e27fadbfb850a53fff531d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py
@@ -0,0 +1,117 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generates a Keil uVision project file from a template."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os.path
+import re
+
+
+def sanitize_xml(unsanitized):
+  """Uses a whitelist to avoid generating bad XML."""
+  return re.sub(r'[^a-zA-Z0-9+_\-/\\.]', '', unsanitized)
+
+
+def main(unused_args, flags):
+  """Generates a Keil project file from a template source."""
+  with open(flags.input_template, 'r') as input_template_file:
+    template_file_text = input_template_file.read()
+
+  template_file_text = re.sub(r'%{EXECUTABLE}%', flags.executable,
+                              template_file_text)
+
+  srcs_list = flags.srcs.split(' ')
+  hdrs_list = flags.hdrs.split(' ')
+  all_srcs_list = srcs_list + hdrs_list
+  all_srcs_list.sort()
+
+  replace_srcs = ''
+  for src in all_srcs_list:
+    if not src:
+      continue
+    ext = os.path.splitext(src)[1]
+    # These extension indexes are used by uVision to keep track of the type
+    # of files. I determined them by experimentation, since the file format
+    # isn't documented.
+    if ext == '.h':
+      ext_index = '5'
+    elif ext == '.c':
+      ext_index = '1'
+    elif ext == '.cc' or ext == '.cpp':
+      ext_index = '8'
+    else:
+      ext_index = '5'
+    basename = sanitize_xml(os.path.basename(src))
+    clean_src = sanitize_xml(src)
+    replace_srcs += '            <File>\n'
+    replace_srcs += '              <FileName>' + basename + '</FileName>\n'
+    replace_srcs += '              <FileType>' + ext_index + '</FileType>\n'
+    replace_srcs += '              <FilePath>' + clean_src + '</FilePath>\n'
+    replace_srcs += '            </File>\n'
+  template_file_text = re.sub(r'%{SRCS}%', replace_srcs, template_file_text)
+
+  include_paths = re.sub(' ', ';', flags.include_paths)
+  template_file_text = re.sub(r'%{INCLUDE_PATHS}%', include_paths,
+                              template_file_text)
+
+  with open(flags.output_file, 'w') as output_file:
+    output_file.write(template_file_text)
+
+
+def parse_args():
+  """Converts the raw arguments into accessible flags."""
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument(
+      '--input_template',
+      type=str,
+      default='',
+      help='Path to template project file to build from.')
+  parser.add_argument(
+      '--output_file',
+      type=str,
+      default='',
+      help='Path to write the completed project file to.')
+  parser.add_argument(
+      '--executable',
+      type=str,
+      default='',
+      help='Name of the executable the project will build.')
+  parser.add_argument(
+      '--hdrs',
+      type=str,
+      default='',
+      help='Space-separated list of C or C++ source files to compile.')
+  parser.add_argument(
+      '--srcs',
+      type=str,
+      default='',
+      help='Space-separated list of C or C++ header files to include.')
+  parser.add_argument(
+      '--include_paths',
+      type=str,
+      default='',
+      help='Space-separated list of paths to look for header files on.')
+  flags, unparsed = parser.parse_known_args()
+
+  main(unparsed, flags)
+
+
+if __name__ == '__main__':
+  parse_args()
diff --git a/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..22b68e4f68360cc28d6dd7e751381709a72892e7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Bash unit tests for the TensorFlow Lite Micro project generator.
+
+set -e
+
+INPUT_TEMPLATE=${TEST_SRCDIR}/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
+OUTPUT_FILE=${TEST_TMPDIR}/keil_project.uvprojx
+EXECUTABLE=test_executable
+
+${TEST_SRCDIR}/tensorflow/lite/experimental/micro/tools/make/generate_keil_project \
+  --input_template=${INPUT_TEMPLATE} \
+  --output_file=${OUTPUT_FILE} \
+  --executable=${EXECUTABLE} \
+  --hdrs="foo.h bar.h" \
+  --srcs="foo.c bar.cc some/bad<xml.cc" \
+  --include_paths=". include"
+
+if ! grep -q "${EXECUTABLE}" ${OUTPUT_FILE}; then
+  echo "ERROR: No executable name '${EXECUTABLE}' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "foo\.h" ${OUTPUT_FILE}; then
+  echo "ERROR: No header 'foo.h' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "bar\.h" ${OUTPUT_FILE}; then
+  echo "ERROR: No header 'bar.h' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "foo\.c" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'foo.c' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "bar\.cc" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'bar.cc' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "some/badxml\.cc" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'some/badxml.cc' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "\.;include" ${OUTPUT_FILE}; then
+  echo "ERROR: No include paths '.;include' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+echo
+echo "SUCCESS: generate_keil_project test PASSED"
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
index 87c002635f55be334bbb22a892a3013e92087cc2..89f473b3109816d0326c28b5965f00f167f64476 100644
--- a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -52,6 +52,7 @@ specialize = $(call specialize_on_tags,$(1),$(strip $(call reverse,$(ALL_TAGS)))
 # 5 - List of C/C++ header files needed to build the target.
 # 6 - Linker flags required.
 # 7 - C++ compilation flags needed.
+# 8 - C compilation flags needed.
 # Calling eval on the output will create a <Name>_makefile target that you
 # can invoke to create the standalone project.
 define generate_project
@@ -68,9 +69,22 @@ $(PRJDIR)$(3)/$(1)/%: tensorflow/lite/experimental/micro/tools/make/templates/%.
 	sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
 	sed -E 's#\%\{EXECUTABLE\}\%#$(3)#g' | \
 	sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
-	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' > $$@
+	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \
+	sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@
+
+$(PRJDIR)$(3)/$(1)/keil_project.uvprojx: tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
+	@mkdir -p $$(dir $$@)
+	python tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py \
+        --input_template=$$< --output_file=$$@ --executable=$(3) \
+        --srcs="$(4)" --hdrs="$(5)" --include_paths="$$(PROJECT_INCLUDES)"
+
+$(PRJDIR)$(3)/$(1)/.vscode/tasks.json : tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.$(1).tpl
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
 
 generate_$(3)_$(1)_project: $(addprefix $(PRJDIR)$(3)/$(1)/, $(4) $(5) $(2))
+
+ALL_PROJECT_TARGETS += generate_$(3)_$(1)_project
 endef
 
 # Specialized version of generate_project for TF Lite Micro test targets that
@@ -83,8 +97,9 @@ endef
 # Calling eval on the output will create targets that you can invoke to
 # generate the standalone project.
 define generate_microlite_projects
-$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
-$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS))
+$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES))
+$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
+$(call generate_project,keil,$(KEIL_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
 endef
 
 
@@ -103,7 +118,6 @@ $(1)_LOCAL_HDRS := $(3)
 $(1)_LOCAL_OBJS := $$(addprefix $$(OBJDIR), \
 $$(patsubst %.cc,%.o,$$(patsubst %.c,%.o,$$($(1)_LOCAL_SRCS))))
 $(1)_BINARY := $$(BINDIR)$(1)
-ALL_BINARIES += $$($(1)_BINARY)
 $$($(1)_BINARY): $$($(1)_LOCAL_OBJS) $$(MICROLITE_LIB_PATH)
 	@mkdir -p $$(dir $$@)
 	$$(CXX) $$(CXXFLAGS) $$(INCLUDES) \
@@ -113,5 +127,8 @@ $(1): $$($(1)_BINARY)
 $(1)_bin: $$($(1)_BINARY).bin
 test_$(1): $$($(1)_BINARY)
 	$$(TEST_SCRIPT) $$($(1)_BINARY) '~~~ALL TESTS PASSED~~~'
+ifneq (,$(findstring _test,$(1)))
+  MICROLITE_TEST_TARGETS += test_$(1)
+endif
 $(eval $(call generate_microlite_projects,$(1),$(call specialize,$(2)),$(3)))
 endef
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
index 6ed402a623188a7c39a007a1cfd7dbc67b775103..4df26a7bf704eeee0acb8fa3386ab42589204595 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -1,15 +1,18 @@
-# Settings for apollo3 evb platforms.
-ifeq ($(TARGET), apollo3evb)
+# Settings for apollo3 evb and SparkFun Edge platforms.
+ifeq ($(TARGET),$(filter $(TARGET),apollo3evb sparkfun_edge))
   export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH)
   TARGET_ARCH := cortex-m4
   TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
   # Download the Ambiq Apollo3 SDK and set this variable to find the header
   # files:
-  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/Apollo3-SDK-2018.08.13
+  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/AmbiqSuite-Rel2.0.0
   # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
-  # with the softfp interfaces.
+  # with the hard interfaces.
   GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/
 
+  # Use the faster depthwise conv implementation.
+  ALL_TAGS += portable_optimized
+
   PLATFORM_FLAGS = \
     -DPART_apollo3 \
     -DAM_PACKAGE_BGA \
@@ -31,7 +34,7 @@ ifeq ($(TARGET), apollo3evb)
     -mcpu=cortex-m4 \
     -mthumb \
     -mfpu=fpv4-sp-d16 \
-    -mfloat-abi=softfp \
+    -mfloat-abi=hard \
     -std=gnu++11 \
     -Wvla \
     -Wall \
@@ -49,7 +52,7 @@ ifeq ($(TARGET), apollo3evb)
   CXXFLAGS += $(PLATFORM_FLAGS)
   CCFLAGS += $(PLATFORM_FLAGS)
   LDFLAGS += \
-    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=softfp \
+    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard \
     -nostartfiles -static \
     -Wl,--gc-sections -Wl,--entry,Reset_Handler \
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
@@ -58,10 +61,16 @@ ifeq ($(TARGET), apollo3evb)
     -Wl,-T,$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \
     -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
   BUILD_TYPE := micro
+  ifeq ($(TARGET), apollo3evb)
+    BOARD_BSP_PATH := $(APOLLO3_SDK)/boards/apollo3_evb/bsp
+  endif
+  ifeq ($(TARGET), sparkfun_edge)
+    BOARD_BSP_PATH := $(APOLLO3_SDK)/boards/SparkFun_TensorFlow_Apollo3_BSP/bsp
+  endif
   MICROLITE_LIBS := \
-    $(APOLLO3_SDK)/boards/apollo3_evb/bsp/gcc/bin/libam_bsp.a \
+    $(BOARD_BSP_PATH)/gcc/bin/libam_bsp.a \
     $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \
-    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/hard/crtbegin.o \
     -lm
   INCLUDES += \
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
@@ -70,7 +79,7 @@ ifeq ($(TARGET), apollo3evb)
     -I$(GCC_ARM)/arm-none-eabi/ \
     -I$(APOLLO3_SDK)/mcu/apollo3/ \
     -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \
-    -I$(APOLLO3_SDK)/boards/apollo3_evb/bsp/ \
+    -I$(BOARD_BSP_PATH) \
     -I$(APOLLO3_SDK)/devices/ \
     -I$(APOLLO3_SDK)/utils/
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3e339fe635d4af2e9e884d0c3bdb56d9d210e9ad
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
@@ -0,0 +1,9 @@
+Compiling instructions here
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro
+
+CONTACT INFORMATION:
+
+Contact info@etacompute.com for more information on obtaining the Eta Compute
+SDK and evalution board.
+
+www.etacompute.com
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
deleted file mode 100644
index 3cb74a72437be8017527c0ea05a1b82eb1a4ac9e..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Running The Micro Speech Example On Eta Compute's ECM3531EVB
-
-This code will enable you to compile and execute the Tensorflow Lite Micro Speech Example on Eta Computes's low power ECM3531 chip.
-
-
-GETTING STARTED:
-
-1. Download the Tensorflow code from Github and follow instructions there to download other dependencies.  
-
-2. Download the Eta Compute SDK, version 0.0.17.
-
-3. Install the Arm compiler arm-none-eabi-gcc, version = arm-none-eabi-gcc (GNU Tools for Arm Embedded Processors 7-2018-q2-update) 7.3.1 20180622 (release) [ARM/embedded-7-branch revision 261907]
-
-4. Edit the file   tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc  so that the variable ETA_SDK points to the location where the Eta Compute SDK is installed, and the variable GCC_ARM points to the Arm compiler.
-
-5. Compile the code with the command   "make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531 test".  This will create the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test.
-
-6. Connect the board to the host computer, start PuTTY (Connection type = Serial, Speed = 11520, Data bits = 8, Stop bits = 1,  Parity = None), and load the executable with ocd.  A sample script for loading the image is provided in tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program.  
-
-The following  will be printed on the Uart:
-
-Testing TestInvoke
-Ran successfully
-
-/ tests passed
-~~~ALL TESTS PASSED~~~
-
-
-
-CONTACT INFORMATION:
-
-Contact info@etacompute.com  for more information on obtaining the Eta Compute SDK and evalution board.
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
index 2764f3ba50de699fa72717585114369cf833d76e..25d3e7c169d5f7419a892d35bd30aa2d9a128160 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
@@ -51,12 +51,6 @@ void EtaPrintExecutionTime(uint64_t);
 //*****************************************************************************
 extern int main(int argc, char** argv);
 
-void DebugLog(const char* s) { EtaCspIoPrintf("%s", s); }
-void DebugLogInt32(int32_t i) { EtaCspIoPrintf("%d", i); }
-void DebugLogUInt32(uint32_t i) { EtaCspIoPrintf("%d", i); }
-void DebugLogHex(uint32_t i) { EtaCspIoPrintf("0x%8x", i); }
-void DebugLogFloat(float i) { EtaCspIoPrintf("%f", i); }
-
 int _main(void) {
   uint64_t time_ms;
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
index af34f988f2d04a0c1c87f20d6058df560db7e2c5..383b7f924408b484c8ee2ada5c4d6ec66edb059a 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
@@ -55,7 +55,7 @@ SECTIONS
     {
         _datax = .;
         KEEP(*(.mainStack))
-        . += 12288;
+        . += 16384;
         _edatax = .;
         _stack_top = .;
         . += 4;
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
new file mode 100644
index 0000000000000000000000000000000000000000..9cbbea3569ba05b8fc9269ff6c5500fb386c03a3
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531 chip.
+ * .text  and .ro map to FLASH all else to SRAM.
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   FLASH is at 0x01000000 of length 0x00080000  512KB
+ *   SRAM  is at 0x10000000 of length 0x00020000  128KB
+ */
+MEMORY
+{
+    FLASH (RX) : ORIGIN = 0x01000000, LENGTH = 0x00080000
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > FLASH= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > FLASH
+/* put the stack at the bottom of SRAM*/
+    .datax (NOLOAD) :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . = ALIGN(0x4);
+        . += 16384;
+        _edatax = .;
+        _stack_top = .;
+    } > SRAM
+    .data :
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM AT > FLASH
+
+    .bss (NOLOAD) :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+
+
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
new file mode 100755
index 0000000000000000000000000000000000000000..5395b3d9965e98572fb12d61d7b862f4ce926a0f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_erase to erase the flash.
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+#ocd comand
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "mww 0x1001fff8 0\n",
+                "mdw 0x01000000 16\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
new file mode 100755
index 0000000000000000000000000000000000000000..bc3fe5cb21aa2a89d8dda41a68185f03e43c674e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_program executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/ into flash
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import sys, getopt
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "flash write_image {}\n".format(elf_file),
+                "mww 0x1001fff8 0\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands:
+    print(x)
+    send_ocd_cmd(x)
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
index ac1f49962a61756ccbde02300c612bd7b4f48e84..781231480aa2f1dec18cc468e1ea0129604c71e7 100755
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram to load the executable tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/micro_speech_test into SRAM
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/
 #
 #
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
@@ -18,6 +18,7 @@
 #==============================================================================
 
 
+import sys, getopt
 import os
 import telnetlib
 
@@ -34,7 +35,8 @@ get_ocd_response() # clean it out
 
 # git path to project elf file
 cur_dir = os.getcwd()
-elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech_test'
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'preprocessor_test'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
 print("elf_file = ",elf_file)
 
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
index baae58f87e1761c978a87256fda8b7e90edb79e5..897a2b66d21668c4a28573e9d068b865c8f008a8 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
@@ -5,6 +5,10 @@ ifeq ($(TARGET), ecm3531)
   ETA_SDK := /home/hari/TensaiSDK-v0.0.17/soc/
   GCC_ARM := /home/hari/Downloads/gcc-arm-none-eabi-7-2018-q2-update/
 
+#Pick the appropriate lds file depending whether you are running frof SRAM of flash
+  ETA_LDS_FILE := ecm3531.lds
+#  ETA_LDS_FILE := ecm3531_flash.lds
+
   ifeq ($(wildcard $(ETA_SDK)),)
     $(error Path to ETA SDK is not set (ETA_SDK))
   endif
@@ -14,6 +18,7 @@ ifeq ($(TARGET), ecm3531)
   endif
 
   PLATFORM_FLAGS = \
+    -DARM_MATH_CM3 \
     -DFIRMWARE_BUILD \
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
@@ -58,14 +63,13 @@ ifeq ($(TARGET), ecm3531)
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
     -fno-exceptions \
     -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
-    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.lds \
-    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
+    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE) \
+    -Wl,-Map=$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.map,--cref
   BUILD_TYPE := micro
   MICROLITE_LIBS := \
     $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
     -lm
-  INCLUDES += \
-    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+  ECM3531_INCLUDES := \
     -I$(GCC_ARM)/arm-none-eabi/include/ \
     -I$(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/inc/ \
     -I$(ETA_SDK)/ecm3531/m3/reg/inc/ \
@@ -75,6 +79,9 @@ ifeq ($(TARGET), ecm3531)
     -I$(ETA_SDK)/../utils/inc/  \
     -I$(ETA_SDK)/ecm3531/boards/eta_evb/eta_bsp/inc
 
+  INCLUDES += $(ECM3531_INCLUDES)
+  GENERATED_PROJECT_INCLUDES += $(ECM3531_INCLUDES)
+
   # _main.c contains application and target specific initialization, like
   # setting clock speed, default uart setups, etc. and an implementation
   # of the DebugLog interfaces.
@@ -83,7 +90,13 @@ ifeq ($(TARGET), ecm3531)
     $(MAKEFILE_DIR)/targets/ecm3531/_main.c \
     $(wildcard $(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/src/*.c) \
     $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.c) \
-    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s)
+
+  # The linker script isn't a header, but it needs to get copied to the gen/
+  # directory for generated projects.  This is similar to the behavior needed
+  # for headers.
+  MICROLITE_CC_HDRS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE)
 
   TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
   # These are tests that don't currently work on the blue pill.
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
index 3b91eeff9fd5f2df06caa9a5f73b221815f9bbdf..090b4fa101d765adb6a7eed181752021f55ebd1b 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
@@ -1,6 +1,9 @@
 # Settings for Mac OS platforms.
 ifeq ($(TARGET), osx)
 
+  # Make sure we can find the embedded GCC compiler.
+  export PATH := ${PATH}:tensorflow/lite/experimental/micro/tools/make/downloads/gcc_embedded/bin/
+
   PLATFORM_FLAGS = \
     -DTF_LITE_DISABLE_X86_NEON
 
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_x86_64_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_x86_64_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..78febaf5dddda19f082a21b7a4c5b9409677f769
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_x86_64_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for x86 on Mac
+ifeq ($(TARGET), osx)
+  ifeq ($(TARGET_ARCH), x86_64)
+    PLATFORM_FLAGS = \
+      -DTF_LITE_DISABLE_X86_NEON
+    CXXFLAGS += $(PLATFORM_FLAGS)
+    CCFLAGS += $(PLATFORM_FLAGS)
+  endif
+endif
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
index 74d54f1ebee12d7773edfd1b073ddf17dd3791d6..ca6519c1390b5b783e4b6f26cac40a6b7ef32f46 100644
--- a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
@@ -4,12 +4,8 @@ SRCS := \
 OBJS := \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
 
-INCLUDES := \
--I. \
--I./third_party/gemmlowp \
--I./third_party/flatbuffers/include
-
 CXXFLAGS += %{CXX_FLAGS}%
+CCFLAGS += %{CC_FLAGS}%
 
 LDFLAGS += %{LINKER_FLAGS}%
 
@@ -20,7 +16,6 @@ LDFLAGS += %{LINKER_FLAGS}%
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
 %{EXECUTABLE}% : $(OBJS)
-	$(CXX) $(LDFLAGS) $(OBJS) \
-	-o $@
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
 
 all: %{EXECUTABLE}%
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..945b9f9c1ae4c5761afb80febe57803d1e7fcab2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl
@@ -0,0 +1,5 @@
+# TensorFlow Lite Micro Mbed Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the Keil uVision IDE.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..440d4b6b3e8a7894bc2b0c6afbd5ff78b54f198b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
@@ -0,0 +1,418 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<Project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="project_projx.xsd">
+
+  <SchemaVersion>2.1</SchemaVersion>
+
+  <Targets>
+    <Target>
+      <TargetName>%{EXECUTABLE}%</TargetName>
+      <ToolsetNumber>0x4</ToolsetNumber>
+      <ToolsetName>ARM-ADS</ToolsetName>
+      <pCCUsed>6100001::V6.10.1::.\ARMCLANG</pCCUsed>
+      <uAC6>1</uAC6>
+      <TargetOption>
+        <TargetCommonOption>
+          <Device>STM32F746NGHx</Device>
+          <Vendor>STMicroelectronics</Vendor>
+          <PackID>Keil.STM32F7xx_DFP.2.11.0</PackID>
+          <PackURL>http://www.keil.com/pack</PackURL>
+          <Cpu>IRAM(0x20010000,0x40000) IRAM2(0x20000000,0x10000) IROM(0x08000000,0x100000) IROM2(0x00200000,0x100000) CPUTYPE("Cortex-M7") FPU3(SFPU) CLOCK(12000000) ELITTLE</Cpu>
+          <FlashUtilSpec></FlashUtilSpec>
+          <StartupFile></StartupFile>
+          <FlashDriverDll>UL2CM3(-S0 -C0 -P0 -FD20010000 -FC1000 -FN2 -FF0STM32F7x_1024 -FS08000000 -FL0100000 -FF1STM32F7xTCM_1024 -FS1200000 -FL1100000 -FP0($$Device:STM32F746NGHx$CMSIS\Flash\STM32F7x_1024.FLM) -FP1($$Device:STM32F746NGHx$CMSIS\Flash\STM32F7xTCM_1024.FLM))</FlashDriverDll>
+          <DeviceId>0</DeviceId>
+          <RegisterFile>$$Device:STM32F746NGHx$Drivers\CMSIS\Device\ST\STM32F7xx\Include\stm32f7xx.h</RegisterFile>
+          <MemoryEnv></MemoryEnv>
+          <Cmp></Cmp>
+          <Asm></Asm>
+          <Linker></Linker>
+          <OHString></OHString>
+          <InfinionOptionDll></InfinionOptionDll>
+          <SLE66CMisc></SLE66CMisc>
+          <SLE66AMisc></SLE66AMisc>
+          <SLE66LinkerMisc></SLE66LinkerMisc>
+          <SFDFile>$$Device:STM32F746NGHx$CMSIS\SVD\STM32F7x6_v1r1.svd</SFDFile>
+          <bCustSvd>0</bCustSvd>
+          <UseEnv>0</UseEnv>
+          <BinPath></BinPath>
+          <IncludePath></IncludePath>
+          <LibPath></LibPath>
+          <RegisterFilePath></RegisterFilePath>
+          <DBRegisterFilePath></DBRegisterFilePath>
+          <TargetStatus>
+            <Error>0</Error>
+            <ExitCodeStop>0</ExitCodeStop>
+            <ButtonStop>0</ButtonStop>
+            <NotGenerated>0</NotGenerated>
+            <InvalidFlash>1</InvalidFlash>
+          </TargetStatus>
+          <OutputDirectory>.\Objects\</OutputDirectory>
+          <OutputName>%{EXECUTABLE}%</OutputName>
+          <CreateExecutable>1</CreateExecutable>
+          <CreateLib>0</CreateLib>
+          <CreateHexFile>0</CreateHexFile>
+          <DebugInformation>1</DebugInformation>
+          <BrowseInformation>1</BrowseInformation>
+          <ListingPath>.\Listings\</ListingPath>
+          <HexFormatSelection>1</HexFormatSelection>
+          <Merge32K>0</Merge32K>
+          <CreateBatchFile>0</CreateBatchFile>
+          <BeforeCompile>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopU1X>0</nStopU1X>
+            <nStopU2X>0</nStopU2X>
+          </BeforeCompile>
+          <BeforeMake>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopB1X>0</nStopB1X>
+            <nStopB2X>0</nStopB2X>
+          </BeforeMake>
+          <AfterMake>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopA1X>0</nStopA1X>
+            <nStopA2X>0</nStopA2X>
+          </AfterMake>
+          <SelectedForBatchBuild>0</SelectedForBatchBuild>
+          <SVCSIdString></SVCSIdString>
+        </TargetCommonOption>
+        <CommonProperty>
+          <UseCPPCompiler>0</UseCPPCompiler>
+          <RVCTCodeConst>0</RVCTCodeConst>
+          <RVCTZI>0</RVCTZI>
+          <RVCTOtherData>0</RVCTOtherData>
+          <ModuleSelection>0</ModuleSelection>
+          <IncludeInBuild>1</IncludeInBuild>
+          <AlwaysBuild>0</AlwaysBuild>
+          <GenerateAssemblyFile>0</GenerateAssemblyFile>
+          <AssembleAssemblyFile>0</AssembleAssemblyFile>
+          <PublicsOnly>0</PublicsOnly>
+          <StopOnExitCode>3</StopOnExitCode>
+          <CustomArgument></CustomArgument>
+          <IncludeLibraryModules></IncludeLibraryModules>
+          <ComprImg>1</ComprImg>
+        </CommonProperty>
+        <DllOption>
+          <SimDllName>SARMCM3.DLL</SimDllName>
+          <SimDllArguments> -REMAP -MPU</SimDllArguments>
+          <SimDlgDll>DCM.DLL</SimDlgDll>
+          <SimDlgDllArguments>-pCM7</SimDlgDllArguments>
+          <TargetDllName>SARMCM3.DLL</TargetDllName>
+          <TargetDllArguments> -MPU</TargetDllArguments>
+          <TargetDlgDll>TCM.DLL</TargetDlgDll>
+          <TargetDlgDllArguments>-pCM7</TargetDlgDllArguments>
+        </DllOption>
+        <DebugOption>
+          <OPTHX>
+            <HexSelection>1</HexSelection>
+            <HexRangeLowAddress>0</HexRangeLowAddress>
+            <HexRangeHighAddress>0</HexRangeHighAddress>
+            <HexOffset>0</HexOffset>
+            <Oh166RecLen>16</Oh166RecLen>
+          </OPTHX>
+        </DebugOption>
+        <Utilities>
+          <Flash1>
+            <UseTargetDll>1</UseTargetDll>
+            <UseExternalTool>0</UseExternalTool>
+            <RunIndependent>0</RunIndependent>
+            <UpdateFlashBeforeDebugging>1</UpdateFlashBeforeDebugging>
+            <Capability>1</Capability>
+            <DriverSelection>-1</DriverSelection>
+          </Flash1>
+          <bUseTDR>1</bUseTDR>
+          <Flash2>BIN\UL2CM3.DLL</Flash2>
+          <Flash3></Flash3>
+          <Flash4></Flash4>
+          <pFcarmOut></pFcarmOut>
+          <pFcarmGrp></pFcarmGrp>
+          <pFcArmRoot></pFcArmRoot>
+          <FcArmLst>0</FcArmLst>
+        </Utilities>
+        <TargetArmAds>
+          <ArmAdsMisc>
+            <GenerateListings>0</GenerateListings>
+            <asHll>1</asHll>
+            <asAsm>1</asAsm>
+            <asMacX>1</asMacX>
+            <asSyms>1</asSyms>
+            <asFals>1</asFals>
+            <asDbgD>1</asDbgD>
+            <asForm>1</asForm>
+            <ldLst>0</ldLst>
+            <ldmm>1</ldmm>
+            <ldXref>1</ldXref>
+            <BigEnd>0</BigEnd>
+            <AdsALst>1</AdsALst>
+            <AdsACrf>1</AdsACrf>
+            <AdsANop>0</AdsANop>
+            <AdsANot>0</AdsANot>
+            <AdsLLst>1</AdsLLst>
+            <AdsLmap>1</AdsLmap>
+            <AdsLcgr>1</AdsLcgr>
+            <AdsLsym>1</AdsLsym>
+            <AdsLszi>1</AdsLszi>
+            <AdsLtoi>1</AdsLtoi>
+            <AdsLsun>1</AdsLsun>
+            <AdsLven>1</AdsLven>
+            <AdsLsxf>1</AdsLsxf>
+            <RvctClst>0</RvctClst>
+            <GenPPlst>0</GenPPlst>
+            <AdsCpuType>"Cortex-M7"</AdsCpuType>
+            <RvctDeviceName></RvctDeviceName>
+            <mOS>0</mOS>
+            <uocRom>0</uocRom>
+            <uocRam>0</uocRam>
+            <hadIROM>1</hadIROM>
+            <hadIRAM>1</hadIRAM>
+            <hadXRAM>0</hadXRAM>
+            <uocXRam>0</uocXRam>
+            <RvdsVP>2</RvdsVP>
+            <RvdsMve>0</RvdsMve>
+            <hadIRAM2>1</hadIRAM2>
+            <hadIROM2>1</hadIROM2>
+            <StupSel>8</StupSel>
+            <useUlib>0</useUlib>
+            <EndSel>0</EndSel>
+            <uLtcg>0</uLtcg>
+            <nSecure>0</nSecure>
+            <RoSelD>4</RoSelD>
+            <RwSelD>4</RwSelD>
+            <CodeSel>0</CodeSel>
+            <OptFeed>0</OptFeed>
+            <NoZi1>0</NoZi1>
+            <NoZi2>0</NoZi2>
+            <NoZi3>0</NoZi3>
+            <NoZi4>0</NoZi4>
+            <NoZi5>0</NoZi5>
+            <Ro1Chk>0</Ro1Chk>
+            <Ro2Chk>0</Ro2Chk>
+            <Ro3Chk>0</Ro3Chk>
+            <Ir1Chk>1</Ir1Chk>
+            <Ir2Chk>0</Ir2Chk>
+            <Ra1Chk>0</Ra1Chk>
+            <Ra2Chk>0</Ra2Chk>
+            <Ra3Chk>0</Ra3Chk>
+            <Im1Chk>1</Im1Chk>
+            <Im2Chk>1</Im2Chk>
+            <OnChipMemories>
+              <Ocm1>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm1>
+              <Ocm2>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm2>
+              <Ocm3>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm3>
+              <Ocm4>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm4>
+              <Ocm5>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm5>
+              <Ocm6>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm6>
+              <IRAM>
+                <Type>0</Type>
+                <StartAddress>0x20010000</StartAddress>
+                <Size>0x40000</Size>
+              </IRAM>
+              <IROM>
+                <Type>1</Type>
+                <StartAddress>0x8000000</StartAddress>
+                <Size>0x100000</Size>
+              </IROM>
+              <XRAM>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </XRAM>
+              <OCR_RVCT1>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT1>
+              <OCR_RVCT2>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT2>
+              <OCR_RVCT3>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT3>
+              <OCR_RVCT4>
+                <Type>1</Type>
+                <StartAddress>0x8000000</StartAddress>
+                <Size>0x100000</Size>
+              </OCR_RVCT4>
+              <OCR_RVCT5>
+                <Type>1</Type>
+                <StartAddress>0x200000</StartAddress>
+                <Size>0x100000</Size>
+              </OCR_RVCT5>
+              <OCR_RVCT6>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT6>
+              <OCR_RVCT7>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT7>
+              <OCR_RVCT8>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT8>
+              <OCR_RVCT9>
+                <Type>0</Type>
+                <StartAddress>0x20010000</StartAddress>
+                <Size>0x40000</Size>
+              </OCR_RVCT9>
+              <OCR_RVCT10>
+                <Type>0</Type>
+                <StartAddress>0x20000000</StartAddress>
+                <Size>0x10000</Size>
+              </OCR_RVCT10>
+            </OnChipMemories>
+            <RvctStartVector></RvctStartVector>
+          </ArmAdsMisc>
+          <Cads>
+            <interw>1</interw>
+            <Optim>7</Optim>
+            <oTime>0</oTime>
+            <SplitLS>0</SplitLS>
+            <OneElfS>1</OneElfS>
+            <Strict>0</Strict>
+            <EnumInt>0</EnumInt>
+            <PlainCh>0</PlainCh>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <wLevel>3</wLevel>
+            <uThumb>0</uThumb>
+            <uSurpInc>0</uSurpInc>
+            <uC99>0</uC99>
+            <uGnu>1</uGnu>
+            <useXO>0</useXO>
+            <v6Lang>3</v6Lang>
+            <v6LangP>3</v6LangP>
+            <vShortEn>1</vShortEn>
+            <vShortWch>1</vShortWch>
+            <v6Lto>0</v6Lto>
+            <v6WtE>0</v6WtE>
+            <v6Rtti>0</v6Rtti>
+            <VariousControls>
+              <MiscControls></MiscControls>
+              <Define></Define>
+              <Undefine></Undefine>
+              <IncludePath>%{INCLUDE_PATHS}%</IncludePath>
+            </VariousControls>
+          </Cads>
+          <Aads>
+            <interw>1</interw>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <thumb>0</thumb>
+            <SplitLS>0</SplitLS>
+            <SwStkChk>0</SwStkChk>
+            <NoWarn>0</NoWarn>
+            <uSurpInc>0</uSurpInc>
+            <useXO>0</useXO>
+            <uClangAs>0</uClangAs>
+            <VariousControls>
+              <MiscControls></MiscControls>
+              <Define></Define>
+              <Undefine></Undefine>
+              <IncludePath></IncludePath>
+            </VariousControls>
+          </Aads>
+          <LDads>
+            <umfTarg>0</umfTarg>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <noStLib>0</noStLib>
+            <RepFail>1</RepFail>
+            <useFile>0</useFile>
+            <TextAddressRange>0x08000000</TextAddressRange>
+            <DataAddressRange>0x20010000</DataAddressRange>
+            <pXoBase></pXoBase>
+            <ScatterFile></ScatterFile>
+            <IncludeLibs></IncludeLibs>
+            <IncludeLibsPath></IncludeLibsPath>
+            <Misc></Misc>
+            <LinkerInputFile></LinkerInputFile>
+            <DisabledWarnings></DisabledWarnings>
+          </LDads>
+        </TargetArmAds>
+      </TargetOption>
+      <Groups>
+        <Group>
+          <GroupName>Source</GroupName>
+          <Files>
+%{SRCS}%
+          </Files>
+        </Group>
+        <Group>
+          <GroupName>::Compiler</GroupName>
+        </Group>
+      </Groups>
+    </Target>
+  </Targets>
+
+  <RTE>
+    <apis/>
+    <components>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDERR" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDIN" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDOUT" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+    </components>
+    <files/>
+  </RTE>
+
+</Project>
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..141994d854565dc2ad2152e440c1d29526acb3dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
@@ -0,0 +1,16 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Make Build",
+            "type": "shell",
+            "command": "make",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..616f3b23188df4af934433772c86c5c1a9452539
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
@@ -0,0 +1,39 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Mbed Config Root",
+            "type": "shell",
+            "command": "mbed config root .",
+        },
+        {
+            "label": "Mbed Deploy",
+            "type": "shell",
+            "command": "mbed deploy",
+        },
+        {
+            "label": "Mbed Patch C++11",
+            "type": "shell",
+            "command": "python",
+            "args": [
+                "-c",
+                "import fileinput, glob;\nfor filename in glob.glob(\"mbed-os/tools/profiles/*.json\"):\n  for line in fileinput.input(filename, inplace=True):\n    print line.replace(\"\\\"-std=gnu++98\\\"\",\"\\\"-std=c++11\\\", \\\"-fpermissive\\\"\")"
+            ]
+        },
+        {
+            "label": "Mbed Init",
+            "dependsOn": ["Mbed Config Root", "Mbed Deploy", "Mbed Patch C++11"]
+        },
+        {
+            "label": "Mbed build",
+            "type": "shell",
+            "command": "mbed compile -m auto -t GCC_ARM",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/microfrontend/lib/BUILD b/tensorflow/lite/experimental/microfrontend/lib/BUILD
index a055e52f71001295cf95dfcbe790bc4118140fed..8dd42fc38290dbf5be8f9f1a850ad88cbf326ace 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/BUILD
+++ b/tensorflow/lite/experimental/microfrontend/lib/BUILD
@@ -6,6 +6,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
 cc_library(
     name = "bits",
     hdrs = ["bits.h"],
@@ -117,72 +122,65 @@ cc_library(
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "fft_test",
-    size = "small",
     srcs = ["fft_test.cc"],
     deps = [
         ":fft",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "filterbank_test",
-    size = "small",
     srcs = ["filterbank_test.cc"],
     deps = [
         ":filterbank",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "frontend_test",
-    size = "small",
     srcs = ["frontend_test.cc"],
     deps = [
         ":frontend",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "log_scale_test",
-    size = "small",
     srcs = ["log_scale_test.cc"],
     deps = [
         ":log_scale",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "noise_reduction_test",
-    size = "small",
     srcs = ["noise_reduction_test.cc"],
     deps = [
         ":noise_reduction",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "pcan_gain_control_test",
-    size = "small",
     srcs = ["pcan_gain_control_test.cc"],
     deps = [
         ":pcan_gain_control",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "window_test",
-    size = "small",
     srcs = ["window_test.cc"],
     deps = [
         ":window",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
diff --git a/tensorflow/lite/experimental/microfrontend/lib/bits.h b/tensorflow/lite/experimental/microfrontend/lib/bits.h
index bf15466a3d6484c3059a1ded1bb51e4d4287b1bf..04b3ba6f055f956720b58720c78083b1529fb065 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/bits.h
+++ b/tensorflow/lite/experimental/microfrontend/lib/bits.h
@@ -63,14 +63,14 @@ static inline int CountLeadingZeros64Slow(uint64_t n) {
 
 static inline int CountLeadingZeros64(uint64_t n) {
 #if defined(_MSC_VER) && defined(_M_X64)
-  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
   unsigned long result = 0;  // NOLINT(runtime/int)
   if (_BitScanReverse64(&result, n)) {
     return 63 - result;
   }
   return 64;
 #elif defined(_MSC_VER)
-  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse
   unsigned long result = 0;  // NOLINT(runtime/int)
   if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
     return 31 - result;
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
index 1b754c1b4c244edf1b091a581e5ae9399c2ac2e3..ec1f247ba24ad27917330708d6f9c754515a686b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,13 @@ const int16_t kFakeWindow[] = {
     0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
 const int kScaleShift = 0;
 
-TEST(FftTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
   struct FftState state;
-  ASSERT_TRUE(
+  TF_LITE_MICRO_EXPECT(
       FftPopulateState(&state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
 
   FftInit(&state);
@@ -37,14 +40,15 @@ TEST(FftTest, CheckOutputValues) {
       {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
       {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
       {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
-  ASSERT_EQ(state.fft_size / 2 + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.fft_size / 2; ++i) {
-    EXPECT_EQ(state.output[i].real, expected[i].real);
-    EXPECT_EQ(state.output[i].imag, expected[i].imag);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].real, expected[i].real);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].imag, expected[i].imag);
   }
 
   FftFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
index 41f0064d4f1674471fa731e72464b1d40fce4216..16257aa11a5ca3e82aeff60f8cc0176de3c519ab 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <cstring>
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -33,9 +32,9 @@ const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
 const int kScaleShift = 0;
 
 // Test filterbank generation using scaled-down defaults.
-class FilterbankTest : public ::testing::Test {
- protected:
-  FilterbankTest() {
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
     config_.num_channels = 2;
     config_.lower_band_limit = 8.0;
     config_.upper_band_limit = 450.0;
@@ -44,110 +43,124 @@ class FilterbankTest : public ::testing::Test {
   struct FilterbankConfig config_;
 };
 
-TEST_F(FilterbankTest, CheckStartIndex) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.start_index, kStartIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckEndIndex) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.end_index, kEndIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelFrequencyStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 4, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWeightStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 8, 16};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWidths) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {8, 8, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_widths[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckWeights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
                               0, 3376, 2468, 1591, 744,  0,   0,   0,
                               0, 4020, 3226, 2456, 1708, 983, 277, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.weights[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckUnweights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
                               0, 720, 1628, 2505, 3352, 0,    0,    0,
                               0, 76,  870,  1640, 2388, 3113, 3819, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.unweights[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
   struct FilterbankState state;
   state.start_index = kStartIndex;
   state.end_index = kEndIndex;
@@ -161,42 +174,46 @@ TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
 
   int i;
   for (i = state.start_index; i < state.end_index; ++i) {
-    EXPECT_EQ(energy[i], kEnergy[i]);
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
   }
 }
 
-TEST_F(FilterbankTest, CheckAccumulateChannels) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   FilterbankAccumulateChannels(&state, kEnergy);
 
-  ASSERT_EQ(state.num_channels + 1, sizeof(kWork) / sizeof(kWork[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
   int i;
   for (i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.work[i], kWork[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckSqrt) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
   std::memcpy(state.work, kWork, sizeof(kWork));
 
   uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
 
   const uint32_t expected[] = {247311, 508620};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(scaled_filterbank[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
index ce8b4acc0f696f9c4123bab9daeb1e8802c3e828..5a668fa92f17f97cb97ed8c3f640d65b1821e3de 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
@@ -28,9 +28,7 @@ void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
   config->output_scale_shift = 7;
 }
 
-static float FreqToMel(float freq) {
-  return 1127.0 * log(1.0 + (freq / 700.0));
-}
+static float FreqToMel(float freq) { return 1127.0 * log1p(freq / 700.0); }
 
 static void CalculateCenterFrequencies(const int num_channels,
                                        const float lower_frequency_limit,
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
index a6faa1fc1f51360e295253fb2b3cfdf01ada74ad..568484f14dde6b958d5c9e144ab8dfd7a68a0fb0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test end-to-end frontend behaviors.
-class FrontendTest : public ::testing::Test {
- protected:
-  FrontendTest() {
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
     config_.window.size_ms = 25;
     config_.window.step_size_ms = 10;
     config_.noise_reduction.smoothing_bits = 10;
@@ -53,9 +52,15 @@ class FrontendTest : public ::testing::Test {
   struct FrontendConfig config_;
 };
 
-TEST_F(FrontendTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   struct FrontendOutput output = FrontendProcessSamples(
@@ -63,18 +68,20 @@ TEST_F(FrontendTest, CheckOutputValues) {
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
 
   const uint16_t expected[] = {479, 425};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -86,18 +93,20 @@ TEST_F(FrontendTest, CheckConsecutiveWindow) {
       &num_samples_read);
 
   const int16_t expected[] = {436, 378};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -113,10 +122,10 @@ TEST_F(FrontendTest, CheckNotEnoughSamples) {
           kStepSamples,
       &num_samples_read);
 
-  EXPECT_EQ(output.size, 0);
-  EXPECT_EQ(output.values, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
 
   FrontendFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
index 1ea0842ec2ad1065782198b635bf8b4858d6bf3a..be52fd426a23a389aac84e4b2dac832924716f83 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
@@ -15,15 +15,18 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kScaleShift = 6;
 const int kCorrectionBits = -1;
 
-TEST(LogScaleTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
   struct LogScaleState state;
   state.enable_log = true;
   state.scale_shift = kScaleShift;
@@ -36,11 +39,11 @@ TEST(LogScaleTest, CheckOutputValues) {
   const uint16_t expected[] = {479, 425};
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-TEST(LogScaleTest, CheckOutputValuesNoLog) {
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
   struct LogScaleState state;
   state.enable_log = false;
   state.scale_shift = kScaleShift;
@@ -53,8 +56,8 @@ TEST(LogScaleTest, CheckOutputValuesNoLog) {
   const uint16_t expected[] = {65535, 45998};
   int i;
   for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
index 13d58b2476762d89ee79be554be12a9b7a897ad5..ba864c427ced36748167c9412fe2966d72d3cb0e 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
@@ -15,17 +15,16 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kNumChannels = 2;
 
 // Test noise reduction using default config values.
-class NoiseReductionTest : public ::testing::Test {
- protected:
-  NoiseReductionTest() {
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
     config_.smoothing_bits = 10;
     config_.even_smoothing = 0.025;
     config_.odd_smoothing = 0.06;
@@ -35,38 +34,48 @@ class NoiseReductionTest : public ::testing::Test {
   struct NoiseReductionConfig config_;
 };
 
-TEST_F(NoiseReductionTest, TestNoiseReductionEstimate) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {6321887, 31248341};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(state.estimate[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-TEST_F(NoiseReductionTest, TestNoiseReduction) {
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {241137, 478104};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
index 7c92d2d29d0e41d5e378a596c5a06e8418edfa8d..93d7a8bcb94d5e0145b9ee701b413194f2946a7b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,9 @@ const int kSmoothingBits = 10;
 const int kCorrectionBits = -1;
 
 // Test pcan auto gain control using default config values.
-class PcanGainControlTest : public ::testing::Test {
- protected:
-  PcanGainControlTest() {
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
     config_.enable_pcan = 1;
     config_.strength = 0.95;
     config_.offset = 80.0;
@@ -37,24 +36,30 @@ class PcanGainControlTest : public ::testing::Test {
   struct PcanGainControlConfig config_;
 };
 
-TEST_F(PcanGainControlTest, TestPcanGainControl) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
   uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
   struct PcanGainControlState state;
-  ASSERT_TRUE(PcanGainControlPopulateState(&config_, &state, estimate,
-                                           kNumChannels, kSmoothingBits,
-                                           kCorrectionBits));
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      &config.config_, &state, estimate, kNumChannels, kSmoothingBits,
+      kCorrectionBits));
 
   uint32_t signal[] = {241137, 478104};
   PcanGainControlApply(&state, signal);
 
   const uint32_t expected[] = {3578, 1533};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   PcanGainControlFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
index 60f11440f56ea39a25a5aa2beb23eb25a83048b3..cf9df523b8f5e540d47c8e6d3d42e37540f62ce0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/window.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test window function behaviors using default config values.
-class WindowTest : public ::testing::Test {
- protected:
-  WindowTest() {
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
     config_.size_ms = 25;
     config_.step_size_ms = 10;
   }
@@ -39,84 +38,98 @@ class WindowTest : public ::testing::Test {
   struct WindowConfig config_;
 };
 
-TEST_F(WindowTest, CheckCoefficients) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
 
   const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
                               2681, 3145, 3541, 3843, 4032, 4096, 4032,
                               3843, 3541, 3145, 2681, 2177, 1664, 1176,
                               743,  391,  144,  16};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.coefficients[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckResidualInput) {
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
   int i;
   for (i = kStepSamples; i < kWindowSamples; ++i) {
-    EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckOutputValues) {
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
   const int16_t expected[] = {
       0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
       0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckMaxAbsValue) {
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
-  EXPECT_EQ(state.max_abs_output_value, 32256);
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
@@ -124,38 +137,41 @@ TEST_F(WindowTest, CheckConsecutiveWindow) {
   const int16_t expected[] = {
       0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
       0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
   int i;
   for (i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
-  ASSERT_FALSE(WindowProcessSamples(
-      &state, kFakeAudioData + kWindowSamples + kStepSamples,
-      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
-          kStepSamples,
-      &num_samples_read));
-
-  EXPECT_EQ(
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
       state.input_used,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
 
   WindowFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py b/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
index 3d49482f4ecd34ec47df1d3baa3e6dccf8ae4bef..52bad311436db5374b2c85da8cda30e962cf0dba 100644
--- a/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
+++ b/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
@@ -96,7 +96,7 @@ def audio_microfrontend(audio,
   Raises:
     ValueError: If the audio tensor is not explicitly a vector.
   """
-  audio_shape = audio.get_shape()
+  audio_shape = audio.shape
   if audio_shape.ndims is None:
     raise ValueError("Input to `AudioMicrofrontend` should have known rank.")
   if len(audio_shape) > 1:
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..eef9e322311dd96738580a4af9ec61f6e2e745db
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -0,0 +1,110 @@
+# TensorFlow Lite for Objective-C
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test")
+
+SOURCES = glob([
+    "sources/*.h",
+    "sources/*.m",
+    "sources/*.mm",
+])
+
+API_HEADERS = glob([
+    "apis/*.h",
+])
+
+MINIMUM_OS_VERSION = "9.0"
+
+# Compiler flags for building regular non-test libraries.
+RELEASE_COPTS = [
+    # Enables language-specific warnings for Objective-C, Objective-C++, C, and C++.
+    "-Wall",
+    # Warns if functions, variables, and types marked with the deprecated attribute are being used.
+    "-Wdeprecated-declarations",
+    # Warns for errors in documentation.
+    "-Wdocumentation",
+    # Turns all warnings into errors.
+    "-Werror",
+    # Enables extra warning flags that are not enabled by -Wall.
+    "-Wextra",
+    # Warns if a global function is defined without a previous prototype declaration.
+    "-Wmissing-prototypes",
+    # From -Wextra. Disables warning when signed value is converted to unsigned value during comparison.
+    "-Wno-sign-compare",
+    # From -Wextra. Disables warning for unused parameters, which are common in delegate methods and block callbacks.
+    "-Wno-unused-parameter",
+    # Warns if a global or local variable or type declaration shadows another variable, parameter, type, class member, or instance variable.
+    "-Wshadow",
+    # Warns if a function is declared or defined without specifying the argument types. For a block with no args, use (void) instead of ().
+    "-Wstrict-prototypes",
+    # Warns if an @selector() expression is encountered with a method name that hasn't been defined yet.
+    "-Wundeclared-selector",
+    # Turn off warnings for headers not part of TensorFlow Lite Objective-C API.
+    "--system-header-prefix=tensorflow/lite/experimental/c/",
+]
+
+# Compiler flags for building test libraries.
+TEST_COPTS = RELEASE_COPTS + [
+    # From -Wall. Disables warning when passing nil to a callee that requires a non-null argument.
+    "-Wno-nonnull",
+    # Disables warning when a global or local variable or type declaration shadows another.
+    "-Wno-shadow",
+]
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+]
+
+objc_library(
+    name = "TensorFlowLite",
+    srcs = SOURCES,
+    hdrs = API_HEADERS,
+    copts = RELEASE_COPTS,
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+    alwayslink = 1,
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # These sanitizer tests are not supported by iOS build toolchain (b/74292221).
+        # Disabled these for iOS test targets.
+        "noasan",
+        "notsan",
+        "nomsan",
+    ],
+    deps = [
+        ":TestsLib",
+    ],
+)
+
+objc_library(
+    name = "TestsLib",
+    testonly = 1,
+    srcs = glob([
+        "tests/*.m",
+    ]),
+    hdrs = glob([
+        "apis/*.h",
+        "sources/*.h",
+        "tests/*.h",
+    ]),
+    copts = TEST_COPTS,
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+    ],
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+    ],
+)
diff --git a/tensorflow/lite/experimental/objc/README.md b/tensorflow/lite/experimental/objc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9dd5aa697392d2867fb9d57159a5f9b79ba3847b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/README.md
@@ -0,0 +1,54 @@
+# TensorFlow Lite for Objective-C
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Objective-C developers. It enables low-latency inference of
+on-device machine learning models with a small binary size and fast performance
+supporting hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+objc_library(
+  deps = [
+      "//tensorflow/lite/experimental/objc:TensorFlowLite",
+  ],
+)
+```
+
+If you would like to build the Objective-C TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Objective-C library target:
+
+```shell
+bazel build tensorflow/lite/experimental/objc:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/objc:TensorFlowLiteTests
+```
+
+### Tulsi
+
+Open the `TensorFlowLite.tulsiproj` using the
+[TulsiApp](https://github.com/bazelbuild/tulsi) or by running the
+[`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
+script from the root `tensorflow` directory:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
+```
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen b/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..04a2a2c19cdb0bd259705e98b43ea6980305524e
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
@@ -0,0 +1,60 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite",
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/objc",
+    "tensorflow/lite/experimental/objc/apis",
+    "tensorflow/lite/experimental/objc/sources",
+    "tensorflow/lite/experimental/objc/tests",
+    "tensorflow/lite/kernels",
+    "tensorflow/lite/kernels/internal",
+    "tensorflow/lite/nnapi",
+    "tensorflow/lite/schema",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/objc:TensorFlowLite",
+    "//tensorflow/lite/experimental/objc:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLite",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/objc/BUILD",
+  ]
+}
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..e92cb481386d88a8210b14f19ed92e7e47d9033f
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj/project.tulsiconf
@@ -0,0 +1,17 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "BazelBuildOptionsDebug" : {
+
+      },
+      "BazelBuildOptionsRelease" : {
+
+      },
+    }
+  },
+  "projectName" : "TensorFlowLite",
+  "packages" : [
+    "tensorflow/lite/experimental/objc"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c06a4bc82f752baabdb6db100ee96e9ce29d29f
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
@@ -0,0 +1,179 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLInterpreterOptions;
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLInterpreterErrorCode
+ * This enum specifies various error codes related to `TFLInterpreter`.
+ */
+typedef NS_ENUM(NSUInteger, TFLInterpreterErrorCode) {
+  /** Provided tensor index is invalid. */
+  TFLInterpreterErrorCodeInvalidTensorIndex,
+
+  /** Input data has invalid byte size. */
+  TFLInterpreterErrorCodeInvalidInputByteSize,
+
+  /** Provided shape is invalid. It must be a non-empty array of positive unsigned integers. */
+  TFLInterpreterErrorCodeInvalidShape,
+
+  /** Provided model cannot be loaded. */
+  TFLInterpreterErrorCodeFailedToLoadModel,
+
+  /** Failed to create `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToCreateInterpreter,
+
+  /** Failed to invoke `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToInvoke,
+
+  /** Failed to retrieve a tensor. */
+  TFLInterpreterErrorCodeFailedToGetTensor,
+
+  /** Invalid tensor. */
+  TFLInterpreterErrorCodeInvalidTensor,
+
+  /** Failed to resize an input tensor. */
+  TFLInterpreterErrorCodeFailedToResizeInputTensor,
+
+  /** Failed to copy data into an input tensor. */
+  TFLInterpreterErrorCodeFailedToCopyDataToInputTensor,
+
+  /** Copying data into an output tensor not allowed. */
+  TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed,
+
+  /** Failed to get data from a tensor. */
+  TFLInterpreterErrorCodeFailedToGetDataFromTensor,
+
+  /** Failed to allocate memory for tensors. */
+  TFLInterpreterErrorCodeFailedToAllocateTensors,
+
+  /** Operaton not allowed without allocating memory for tensors first. */
+  TFLInterpreterErrorCodeAllocateTensorsRequired,
+
+  /** Operaton not allowed without invoking the interpreter first. */
+  TFLInterpreterErrorCodeInvokeInterpreterRequired,
+};
+
+/**
+ * A TensorFlow Lite model interpreter.
+ */
+@interface TFLInterpreter : NSObject
+
+/** The total number of input tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger inputTensorCount;
+
+/** The total number of output tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger outputTensorCount;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and the
+ * default interpreter options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and the default interpreter
+ *     options. `nil` if there is an error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and
+ * options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param options Options to use for configuring the TensorFlow Lite interpreter.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and options. `nil` if there is an
+ *     error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error NS_DESIGNATED_INITIALIZER;
+
+/**
+ * Invokes the interpreter to run inference.
+ *
+ * @param error An optional error parameter populated when there is an error in invoking the
+ *     interpreter.
+ *
+ * @return Whether the invocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)invokeWithError:(NSError **)error;
+
+/**
+ * Returns the input tensor at the given index.
+ *
+ * @param index The index of an input tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the input
+ *     tensor.
+ *
+ * @return The input tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Returns the output tensor at the given index.
+ *
+ * @param index The index of an output tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the
+ *     output tensor.
+ *
+ * @return The output tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Resizes the input tensor at the given index to the specified shape (an array of positive unsigned
+ * integers).
+ *
+ * @param index The index of an input tensor.
+ * @param shape Shape that the given input tensor should be resized to. It should be an array of
+ *     positive unsigned integer(s) containing the size of each dimension.
+ * @param error An optional error parameter populated when there is an error in resizing the input
+ *     tensor.
+ *
+ * @return Whether the input tensor was resized successfully. Returns NO if an error occurred.
+ */
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error;
+
+/**
+ * Allocates memory for tensors.
+ *
+ * @param error An optional error parameter populated when there is an error in allocating memory.
+ *
+ * @return Whether memory allocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)allocateTensorsWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6461fbf0178b1e72afb81e91d58109a2d7b0226b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
@@ -0,0 +1,37 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Custom configuration options for a TensorFlow Lite interpreter. */
+@interface TFLInterpreterOptions : NSObject
+
+/**
+ * Maximum number of threads that the interpreter should run on. Defaults to 0 (unspecified, letting
+ * TensorFlow Lite to optimize the threading decision).
+ */
+@property(nonatomic) NSUInteger numberOfThreads;
+
+/**
+ * Initializes a new instance of `TFLInterpreterOptions`.
+ *
+ * @return A new instance of `TFLInterpreterOptions`.
+ */
+- (instancetype)init NS_DESIGNATED_INITIALIZER;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5cf793c5bed984debe3a36fdec4f0945cd7c64
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Parameters for asymmetric quantization. Quantized values can be converted to float values using:
+ * `realValue = scale * (quantizedValue - zeroPoint)`.
+ */
+@interface TFLQuantizationParameters : NSObject
+
+/** Scale of asymmetric quantization. */
+@property(nonatomic, readonly) float scale;
+
+/** Zero point of asymmetric quantization. */
+@property(nonatomic, readonly) int32_t zeroPoint;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLTensor.h b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc710abf4e2ea99126be2fb359412287f3c37a33
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLQuantizationParameters;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorDataType
+ * This enum specifies supported TensorFlow Lite tensor data types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorDataType) {
+  /** Tensor data type not available. This indicates an error with the model. */
+  TFLTensorDataTypeNoType,
+
+  /** 32-bit single precision floating point. */
+  TFLTensorDataTypeFloat32,
+
+  /** 32-bit signed integer. */
+  TFLTensorDataTypeInt32,
+
+  /** 8-bit unsigned integer. */
+  TFLTensorDataTypeUInt8,
+
+  /** 64-bit signed integer. */
+  TFLTensorDataTypeInt64,
+
+  /** Boolean. */
+  TFLTensorDataTypeBool,
+
+  /** 16-bit signed integer. */
+  TFLTensorDataTypeInt16,
+
+  /** 8-bit signed integer. */
+  TFLTensorDataTypeInt8,
+};
+
+/**
+ * An input or output tensor in a TensorFlow Lite model.
+ *
+ * @warning Each `TFLTensor` instance is associated with a `TFLInterpreter` instance. Multiple
+ *     `TFLTensor` instances of the same TensorFlow Lite model are associated with the same
+ *     `TFLInterpreter` instance. As long as a `TFLTensor` instance is still in use, its associated
+ *     `TFLInterpreter` instance will not be deallocated.
+ */
+@interface TFLTensor : NSObject
+
+/** Name of the tensor. */
+@property(nonatomic, readonly, copy) NSString *name;
+
+/** Data type of the tensor. */
+@property(nonatomic, readonly) TFLTensorDataType dataType;
+
+/** Parameters for asymmetric quantization. `nil` if the tensor does not use quantization. */
+@property(nonatomic, readonly, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Copies the given data into an input tensor. This is allowed only for an input tensor and only
+ * before the interpreter is invoked; otherwise an error will be returned.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor.
+ * @param error An optional error parameter populated when there is an error in copying the data.
+ *
+ * @return Whether the data was copied into the input tensor successfully. Returns NO if an error
+ *     occurred.
+ */
+- (BOOL)copyData:(NSData *)data error:(NSError **)error;
+
+/**
+ * Retrieves a copy of data in the tensor. For an output tensor, the data is only available after
+ * the interpreter invocation has successfully completed; otherwise an error will be returned.
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the data.
+ *
+ * @return A copy of data in the tensor. `nil` if there is an error in retrieving the data or the
+ *     data is not available.
+ */
+- (nullable NSData *)dataWithError:(NSError **)error;
+
+/**
+ * Retrieves the shape of the tensor, an array of positive unsigned integers containing the size
+ * of each dimension. For example: the shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3] (i.e. an array of 2 arrays of 2 arrays of 3 numbers).
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d50c896e6d5716308c7dc8818258fa38dbd72
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Helper utility for error reporting. */
+@interface TFLErrorUtil : NSObject
+
+/**
+ * Creates and saves an interpreter error with the given error code and description.
+ *
+ * @param code Error code.
+ * @param description Error description.
+ * @param error Pointer to where to save the created error. If `nil`, no error will be saved.
+ */
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
new file mode 100644
index 0000000000000000000000000000000000000000..aa973c780060f4fa67573ff1e224ab0aed2bc92b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "TFLErrorUtil.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Error domain of TensorFlow Lite interpreter related errors. */
+static NSString *const TFLInterpreterErrorDomain = @"org.tensorflow.lite.interpreter";
+
+@implementation TFLErrorUtil
+
+#pragma mark - Public
+
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error {
+  if (error) {
+    *error = [NSError errorWithDomain:TFLInterpreterErrorDomain
+                                 code:code
+                             userInfo:@{NSLocalizedDescriptionKey : description}];
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b900c4f050451061a5d1a02b8be4dc51cade175
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
@@ -0,0 +1,63 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLInterpreter (Internal)
+
+/**
+ * Copies the given data into the input tensor at the given index. This is allowed only before the
+ * interpreter is invoked.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor at the given index.
+ * @param index An input tensor index.
+ * @param error An optional error parameter populated when there is an error in setting the data.
+ *
+ * @return Whether the data was copied into the input tensor at the given index successfully.
+ *     Returns NO if an error occurred.
+ */
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Retrieves a copy of the data from the given tensor. For an output tensor, the interpreter
+ * invocation has to complete before the data can be retrieved.
+ *
+ * @param tensor A tensor.
+ * @param error An optional error parameter populated when there is an error in getting the data.
+ *
+ * @return The data of the given tensor. `nil` if there is an error or data is not available.
+ */
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+/**
+ * Retrieves the shape of the given tensor, an array of positive unsigned integer(s) containing the
+ * size of each dimension. For example: shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3].
+ *
+ * @param tensor An input or output tensor.
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
new file mode 100644
index 0000000000000000000000000000000000000000..a8ca982f6dd619f9a01bd67cc028ee6fb583a75d
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -0,0 +1,407 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLQuantizationParameters+Internal.h"
+#import "TFLTensor+Internal.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#include "tensorflow/lite/experimental/c/c_api.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Error reporter for TFLInterpreter.
+ *
+ * @param user_data User data. Not used.
+ * @param format Error message which may contain argument formatting specifiers.
+ * @param args Values of the arguments in the error message.
+ */
+static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_list args) {
+  NSLog(@"%@", [[NSString alloc] initWithFormat:@(format) arguments:args]);
+}
+
+@interface TFLInterpreter ()
+
+/** TFL_Interpreter backed by C API. */
+@property(nonatomic, nullable) TFL_Interpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreter
+
+#pragma mark - NSObject
+
+- (void)dealloc {
+  TFL_DeleteInterpreter(_interpreter);
+}
+
+#pragma mark - Public
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error {
+  return [self initWithModelPath:modelPath
+                         options:[[TFLInterpreterOptions alloc] init]
+                           error:error];
+}
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error {
+  self = [super init];
+
+  if (self != nil) {
+    TFL_Model *model = nullptr;
+    TFL_InterpreterOptions *cOptions = nullptr;
+
+    @try {
+      const char *modelPathCString = modelPath.UTF8String;
+      NSString *pathErrorString =
+          [NSString stringWithFormat:@"Cannot load model from path (%@).", modelPath];
+      if (modelPathCString == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      model = TFL_NewModelFromFile(modelPathCString);
+      if (model == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      cOptions = TFL_NewInterpreterOptions();
+      if (cOptions == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      if (options.numberOfThreads > 0) {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, (int32_t)options.numberOfThreads);
+      }
+      TFL_InterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
+
+      _interpreter = TFL_NewInterpreter(model, cOptions);
+      if (_interpreter == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      _inputTensorCount = (NSUInteger)TFL_InterpreterGetInputTensorCount(_interpreter);
+      _outputTensorCount = (NSUInteger)TFL_InterpreterGetOutputTensorCount(_interpreter);
+      if (_inputTensorCount <= 0 || _outputTensorCount <= 0) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+    } @finally {
+      TFL_DeleteInterpreterOptions(cOptions);
+      TFL_DeleteModel(model);
+    }
+  }
+
+  return self;
+}
+
+- (BOOL)invokeWithError:(NSError **)error {
+  if (TFL_InterpreterInvoke(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToInvoke
+                                   description:@"Failed to invoke the interpreter."
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeInput atIndex:index error:error];
+}
+
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.outputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeOutput atIndex:index error:error];
+}
+
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return NO;
+  }
+
+  if (shape.count == 0) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                   description:@"Invalid shape. Must not be empty."
+                                         error:error];
+    return NO;
+  }
+
+  int cDimensions[self.inputTensorCount];
+  for (int dimIndex = 0; dimIndex < shape.count; ++dimIndex) {
+    int dimension = shape[dimIndex].intValue;
+    if (dimension <= 0) {
+      NSString *errorDescription = @"Invalid shape. Dimensions must be positive integers.";
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                     description:errorDescription
+                                           error:error];
+      return NO;
+    }
+    cDimensions[dimIndex] = dimension;
+  }
+
+  if (TFL_InterpreterResizeInputTensor(self.interpreter, (int32_t)index, cDimensions,
+                                       (int32_t)shape.count) != kTfLiteOk) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Failed to resize input tensor at index (%lu).", (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToResizeInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (BOOL)allocateTensorsWithError:(NSError **)error {
+  if (TFL_InterpreterAllocateTensors(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToAllocateTensors
+                                   description:@"Failed to allocate memory for tensors."
+                                         error:error];
+    return NO;
+  }
+  return YES;
+}
+
+#pragma mark - TFLInterpreter (Internal)
+
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:TFLTensorTypeInput atIndex:index error:error];
+  if (cTensor == nullptr) {
+    return NO;
+  }
+
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (data.length != byteSize) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Input tensor at index (%lu) expects data size (%lu), but got (%lu).",
+                         (unsigned long)index, byteSize, (unsigned long)data.length];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidInputByteSize
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  if (TFL_TensorCopyFromBuffer((TFL_Tensor *)cTensor, data.bytes, data.length) != kTfLiteOk) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to copy data into input tensor at index (%lu).",
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCopyDataToInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  void *bytes = TFL_TensorData(cTensor);
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (bytes == nullptr || byteSize == 0) {
+    NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get data from %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetDataFromTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  return [NSData dataWithBytes:bytes length:byteSize];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+  int32_t rank = TFL_TensorNumDims(cTensor);
+  if (rank <= 0) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid rank (%d).", tensorType,
+                                   (unsigned long)index, rank];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:rank];
+  for (int32_t dimIndex = 0; dimIndex < rank; dimIndex++) {
+    int32_t dimension = TFL_TensorDim(cTensor, dimIndex);
+    if (dimension <= 0) {
+      NSString *errorDescription =
+          [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid %d-th dimension (%d).",
+                                     tensorType, (unsigned long)index, dimIndex, dimension];
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                     description:errorDescription
+                                           error:error];
+      return nil;
+    }
+    shape[dimIndex] = @((NSUInteger)dimension);
+  }
+
+  return shape;
+}
+
+#pragma mark - Private
+
+- (const TFL_Tensor *)cTensorOfType:(TFLTensorType)type
+                            atIndex:(NSUInteger)index
+                              error:(NSError **)error {
+  const TFL_Tensor *tensor = nullptr;
+
+  switch (type) {
+    case TFLTensorTypeInput:
+      tensor = TFL_InterpreterGetInputTensor(self.interpreter, (int32_t)index);
+      break;
+    case TFLTensorTypeOutput:
+      tensor = TFL_InterpreterGetOutputTensor(self.interpreter, (int32_t)index);
+      break;
+  }
+
+  if (tensor == nullptr) {
+    NSString *tensorType = [TFLTensor stringForTensorType:type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetTensor
+                                   description:errorDescription
+                                         error:error];
+  }
+
+  return tensor;
+}
+
+- (nullable TFLTensor *)tensorOfType:(TFLTensorType)type
+                             atIndex:(NSUInteger)index
+                               error:(NSError **)error {
+  const TFL_Tensor *tensor = [self cTensorOfType:type atIndex:index error:error];
+
+  if (tensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:type];
+  const char *cName = TFL_TensorName(tensor);
+  if (cName == nullptr) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get name of %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+  NSString *name = [NSString stringWithUTF8String:cName];
+
+  TFLTensorDataType dataType = [self tensorDataTypeFromCTensorType:TFL_TensorType(tensor)];
+
+  TFL_QuantizationParams cParams = TFL_TensorQuantizationParams(tensor);
+  TFLQuantizationParameters *quantizationParams;
+
+  // TODO(b/119735362): Update this check once the TFL_QuantizationParams struct has a mode.
+  if (cParams.scale != 0.0) {
+    quantizationParams = [[TFLQuantizationParameters alloc] initWithScale:cParams.scale
+                                                                zeroPoint:cParams.zero_point];
+  }
+
+  // TODO: Set quantization parameters when C API supports it.
+  return [[TFLTensor alloc] initWithInterpreter:self
+                                           type:type
+                                          index:index
+                                           name:name
+                                       dataType:dataType
+                         quantizationParameters:quantizationParams];
+}
+
+- (TFLTensorDataType)tensorDataTypeFromCTensorType:(TFL_Type)cTensorType {
+  switch (cTensorType) {
+    case kTfLiteFloat32:
+      return TFLTensorDataTypeFloat32;
+    case kTfLiteInt32:
+      return TFLTensorDataTypeInt32;
+    case kTfLiteUInt8:
+      return TFLTensorDataTypeUInt8;
+    case kTfLiteInt8:
+      return TFLTensorDataTypeInt8;
+    case kTfLiteInt64:
+      return TFLTensorDataTypeInt64;
+    case kTfLiteBool:
+      return TFLTensorDataTypeBool;
+    case kTfLiteInt16:
+      return TFLTensorDataTypeInt16;
+    case kTfLiteNoType:
+    case kTfLiteString:
+    case kTfLiteComplex64:
+      // kTfLiteString and kTfLiteComplex64 are not supported in TensorFlow Lite Objc API.
+      return TFLTensorDataTypeNoType;
+  }
+}
+
+- (BOOL)isValidTensorIndex:(NSUInteger)index
+                belowLimit:(NSUInteger)totalTensorCount
+                     error:(NSError **)error {
+  if (index >= totalTensorCount) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Invalid tensor index (%lu) exceeds max (%lu).",
+                                   (unsigned long)index, (unsigned long)(totalTensorCount - 1)];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensorIndex
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
new file mode 100644
index 0000000000000000000000000000000000000000..d129befecabc5af752ccff70e84a4a66c7ee4bca
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLInterpreterOptions
+
+#pragma mark - Public
+
+- (instancetype)init {
+  self = [super init];
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..37d9ef0bb4761c9ff93111ba3158d4c4d68a9ec2
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLQuantizationParameters (Internal)
+
+/**
+ * Initializes a `TFLQuantizationParameters` instance with the given scale and zero point.
+ *
+ * @param scale Scale of asymmetric quantization.
+ * @param zeroPoint Zero point of asymmetric quantization.
+ *
+ * @return A new instance of `TFLQuantizationParameters` with the given scale and zero point.
+ */
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
new file mode 100644
index 0000000000000000000000000000000000000000..44cb90d3323a73c1f79a27f319ac263c84e94408
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import "TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLQuantizationParameters
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint {
+  self = [super init];
+  if (self != nil) {
+    _scale = scale;
+    _zeroPoint = zeroPoint;
+  }
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5c51caabd8e44ab2b30a7b44259f6878865586
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+@class TFLInterpreter;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorType
+ * This enum specifies input or output tensor types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorType) {
+  /** Input tensor type. */
+  TFLTensorTypeInput,
+
+  /** Output tensor type. */
+  TFLTensorTypeOutput,
+};
+
+@interface TFLTensor (Internal)
+
+/** Input or output tensor type. */
+@property(nonatomic, readonly) TFLTensorType type;
+
+/** Index of the tensor. */
+@property(nonatomic, readonly) NSUInteger index;
+
+/**
+ * Initializes a `TFLTensor` with the given interpreter, name, data type, and quantization
+ * parameters.
+ *
+ * @param interpreter Interpreter backing the tensor.
+ * @param type Input or output tensor type.
+ * @param index Index of the tensor.
+ * @param name Name of the tensor.
+ * @param dataType Data type of the tensor.
+ * @param quantizationParameters Quantization parameters of the tensor. `nil` if the tensor does not
+ *     use quantization.
+ *
+ * @return A new instance of `TFLTensor` with the given name, data type, shape, and quantization
+ *     parameters.
+ */
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters;
+
+/**
+ * Returns the string name of the given input or output tensor type.
+ *
+ * @param type Input or output tensor type.
+ *
+ * @return The string name of the given input or output tensor type.
+ */
++ (NSString *)stringForTensorType:(TFLTensorType)type;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor.m b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
new file mode 100644
index 0000000000000000000000000000000000000000..2eaebfd6bec0483817bd4c1c3e540113cca75f5e
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
@@ -0,0 +1,103 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLInterpreter+Internal.h"
+#import "TFLTensor+Internal.h"
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+// String names of input or output tensor types.
+static NSString *const kTFLInputTensorTypeString = @"input";
+static NSString *const kTFLOutputTensorTypeString = @"output";
+
+@interface TFLTensor ()
+
+// Redefines readonly properties.
+@property(nonatomic) TFLTensorType type;
+@property(nonatomic) NSUInteger index;
+@property(nonatomic, copy) NSString *name;
+@property(nonatomic) TFLTensorDataType dataType;
+@property(nonatomic, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/**
+ * The backing interpreter. It's a strong reference to ensure that the interpreter is never released
+ * before this tensor is released.
+ *
+ * @warning Never let the interpreter hold a strong reference to the tensor to avoid retain cycles.
+ */
+@property(nonatomic) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLTensor
+
+#pragma mark - Public
+
+- (BOOL)copyData:(NSData *)data error:(NSError **)error {
+  if (self.type == TFLTensorTypeOutput) {
+    [TFLErrorUtil
+        saveInterpreterErrorWithCode:TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed
+                         description:@"Cannot copy data into an output tensor."
+                               error:error];
+    return NO;
+  }
+
+  return [self.interpreter copyData:data toInputTensorAtIndex:self.index error:error];
+}
+
+- (nullable NSData *)dataWithError:(NSError **)error {
+  return [self.interpreter dataFromTensor:self error:error];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error {
+  return [self.interpreter shapeOfTensor:self error:error];
+}
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters {
+  self = [super init];
+  if (self != nil) {
+    _interpreter = interpreter;
+    _type = type;
+    _index = index;
+    _name = [name copy];
+    _dataType = dataType;
+    _quantizationParameters = quantizationParameters;
+  }
+  return self;
+}
+
++ (NSString *)stringForTensorType:(TFLTensorType)type {
+  switch (type) {
+    case TFLTensorTypeInput:
+      return kTFLInputTensorTypeString;
+    case TFLTensorTypeOutput:
+      return kTFLOutputTensorTypeString;
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..00b800d6af96636054f2a79f3d4c8d007dd89ea3
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -0,0 +1,49 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+#import <XCTest/XCTest.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Unit tests for TFLInterpreterOptions.
+ */
+@interface TFLInterpreterOptionsTests : XCTestCase
+@end
+
+@implementation TFLInterpreterOptionsTests
+
+#pragma mark - Tests
+
+- (void)testInit {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  XCTAssertEqual(options.numberOfThreads, 0);
+}
+
+- (void)testSetNumberOfThread {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.numberOfThreads = 2;
+  XCTAssertEqual(options.numberOfThreads, 2);
+  options.numberOfThreads = 0;
+  XCTAssertEqual(options.numberOfThreads, 0);
+  options.numberOfThreads = 3;
+  XCTAssertEqual(options.numberOfThreads, 3);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..eefa9b9f05826a0782c0b236a2d7e145428b1ca1
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
@@ -0,0 +1,358 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Float model resource name. */
+static NSString *const kAddFloatModelResourceName = @"add";
+
+/** Quantized model resource name. */
+static NSString *const kAddQuantizedModelResourceName = @"add_quantized";
+
+/** Model resource type. */
+static NSString *const kAddModelResourceType = @"bin";
+
+/** Rank of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorRank = 1U;
+
+/** Size of the first (and only) dimension of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorFirstDimensionSize = 2U;
+
+/** Quantization scale of the quantized model. */
+static const float kAddQuantizedModelScale = 0.003922F;
+
+/** Quantization zero point of the quantized model. */
+static const int32_t kAddQuantizedModelZeroPoint = 0;
+
+/** Invalid input tensor index. */
+static const NSUInteger kInvalidInputTensorIndex = 1U;
+
+/** Invalid output tensor index. */
+static const NSUInteger kInvalidOutputTensorIndex = 1U;
+
+/** Accurary used in comparing floating numbers. */
+static const float kTestAccuracy = 1E-5F;
+
+/**
+ * Unit tests for TFLInterpreter.
+ */
+@interface TFLInterpreterTests : XCTestCase
+
+/** Absolute path of the Add float model resource. */
+@property(nonatomic, nullable) NSString *floatModelPath;
+
+/** Default interpreter using the Add model. */
+@property(nonatomic, nullable) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreterTests
+
+#pragma mark - XCTestCase
+
+- (void)setUp {
+  [super setUp];
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  self.floatModelPath = [bundle pathForResource:kAddFloatModelResourceName
+                                         ofType:kAddModelResourceType];
+  NSError *error;
+  self.interpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(self.interpreter);
+  XCTAssertTrue([self.interpreter allocateTensorsWithError:nil]);
+}
+
+- (void)tearDown {
+  self.floatModelPath = nil;
+  self.interpreter = nil;
+
+  [super tearDown];
+}
+
+#pragma mark - Tests
+
+- (void)testSuccessfulFullRunAddFloatModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath
+                                                                        options:options
+                                                                          error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:sizeof(float)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  float output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(float) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqualWithAccuracy(output[0], 3.f, kTestAccuracy);
+  XCTAssertEqualWithAccuracy(output[1], 9.f, kTestAccuracy);
+}
+
+- (void)testSuccessfulFullRunQuantizedModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  NSString *quantizedModelPath = [bundle pathForResource:kAddQuantizedModelResourceName
+                                                  ofType:kAddModelResourceType];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter =
+      [[TFLInterpreter alloc] initWithModelPath:quantizedModelPath options:options error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(inputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(inputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  uint8_t one = 1;
+  uint8_t three = 3;
+  [inputData appendBytes:&one length:sizeof(uint8_t)];
+  [inputData appendBytes:&three length:sizeof(uint8_t)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(outputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(outputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  uint8_t output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(uint8_t) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqual(output[0], 3);
+  XCTAssertEqual(output[1], 9);
+}
+
+- (void)testInitWithModelPath_invalidPath {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *brokenInterpreter = [[TFLInterpreter alloc] initWithModelPath:@"InvalidPath"
+                                                                          error:&error];
+  XCTAssertNil(brokenInterpreter);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToLoadModel);
+}
+
+- (void)testInvoke_beforeAllocation {
+  NSError *error;
+  TFLInterpreter *interpreterWithoutAllocation =
+      [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNotNil(interpreterWithoutAllocation);
+  XCTAssertNil(error);
+
+  XCTAssertFalse([interpreterWithoutAllocation invokeWithError:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToInvoke);
+}
+
+- (void)testInputTensorAtIndex_invalidIndex {
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:kInvalidInputTensorIndex
+                                                          error:&error];
+  XCTAssertNil(inputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_invalidIndex {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:kInvalidInputTensorIndex
+                                                    toShape:shape
+                                                      error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_emptyShape {
+  NSMutableArray *emptyShape = [NSMutableArray arrayWithCapacity:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:emptyShape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testResizeInputTensorAtIndex_zeroDimensionSize {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testCopyDataToInputTensorAtIndex_invalidInputDataByteSize {
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([inputTensor copyData:inputData error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidInputByteSize);
+}
+
+- (void)testCopyDataToOutputTensorAtIndex_notAllowed {
+  NSMutableData *data = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [data appendBytes:&one length:sizeof(float)];
+  [data appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *outputTensor = [self.interpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([outputTensor copyData:data error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..239e0bcb0dee8b6d2258be6f7e1ae2591611f501
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
@@ -0,0 +1,48 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Test scale of quantization parameters. */
+static const float kTestScale = 2.0;
+
+/** Test zero point of quantization parameters. */
+static const int32_t kTestZeroPoint = 128;
+
+/**
+ * Unit tests for TFLQuantizationParameters.
+ */
+@interface TFLQuantizationParametersTests : XCTestCase
+@end
+
+@implementation TFLQuantizationParametersTests
+
+#pragma mark - Tests
+
+- (void)testInitWithScaleAndZeroPoint {
+  TFLQuantizationParameters *params =
+      [[TFLQuantizationParameters alloc] initWithScale:kTestScale zeroPoint:kTestZeroPoint];
+  XCTAssertEqual(params.scale, kTestScale);
+  XCTAssertEqual(params.zeroPoint, kTestZeroPoint);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..8f44546a4b8d98a023cda9eae33fba2ce87258b9
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/BUILD.apple
@@ -0,0 +1,105 @@
+# TensorFlow Lite for Swift
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
+load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")
+
+MINIMUM_OS_VERSION = "9.0"
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+]
+
+swift_library(
+    name = "TensorFlowLite",
+    srcs = glob(["Sources/*.swift"]),
+    module_name = "TensorFlowLite",
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # DISABLED: Following sanitizer tests are not supported by iOS test targets.
+        "noasan",
+        "nomsan",
+        "notsan",
+    ],
+    deps = [
+        ":TestsLib",
+    ],
+)
+
+ios_application(
+    name = "TensorFlowLiteApp",
+    app_icons = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/**"]),
+    bundle_id = "com.tensorflow.lite.swift.TensorFlowLite",
+    families = [
+        "ipad",
+        "iphone",
+    ],
+    infoplists = ["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist"],
+    launch_storyboard = "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    sdk_frameworks = [
+        "CoreGraphics",
+    ],
+    tags = DEFAULT_TAGS + ["manual"],
+    deps = [
+        ":AppLib",
+    ],
+)
+
+swift_library(
+    name = "TestsLib",
+    testonly = 1,
+    srcs = glob(["Tests/*.swift"]),
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":Resources",
+        ":TensorFlowLite",
+    ],
+)
+
+swift_library(
+    name = "AppLib",
+    srcs = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/*.swift"]),
+    module_name = "AppLib",
+    tags = DEFAULT_TAGS + ["manual"],
+    deps = [
+        ":AppResources",
+        ":TensorFlowLite",
+    ],
+)
+
+objc_library(
+    name = "Resources",
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = DEFAULT_TAGS,
+)
+
+objc_library(
+    name = "AppResources",
+    data = glob([
+        "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/*.storyboard",
+    ]),
+    tags = DEFAULT_TAGS + ["manual"],
+    deps = [
+        ":Resources",
+    ],
+)
diff --git a/tensorflow/lite/experimental/swift/LICENSE b/tensorflow/lite/experimental/swift/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/tensorflow/lite/experimental/swift/README.md b/tensorflow/lite/experimental/swift/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..716ab33f8d6acaa4a4896f79c1b8e5662698c4c3
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/README.md
@@ -0,0 +1,63 @@
+# TensorFlow Lite for Swift
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Swift developers. It enables low-latency inference of on-device
+machine learning models with a small binary size and fast performance supporting
+hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+swift_library(
+  deps = [
+      "//tensorflow/lite/experimental/swift:TensorFlowLite",
+  ],
+)
+```
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
+
+If you would like to build the Swift TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Swift library target:
+
+```shell
+bazel build tensorflow/lite/experimental/swift:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/swift:TensorFlowLiteTests --swiftcopt=-enable-testing
+```
+
+Note that `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`).
+
+### Tulsi
+
+Open the `TensorFlowLite.tulsiproj` using the
+[TulsiApp](https://github.com/bazelbuild/tulsi)
+or by running the
+[`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
+script from the root `tensorflow` directory:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
+```
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
new file mode 100644
index 0000000000000000000000000000000000000000..a14b5966b1a24946137fddae0ddea16ed43ba46c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -0,0 +1,265 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite interpreter that performs inference from a given model.
+public final class Interpreter {
+
+  /// The `TFL_Interpreter` C pointer type represented as an `UnsafePointer<TFL_Interpreter>`.
+  private typealias CInterpreter = OpaquePointer
+
+  /// Total number of input tensors associated with the model.
+  public var inputTensorCount: Int {
+    return Int(TFL_InterpreterGetInputTensorCount(cInterpreter))
+  }
+
+  /// Total number of output tensors associated with the model.
+  public var outputTensorCount: Int {
+    return Int(TFL_InterpreterGetOutputTensorCount(cInterpreter))
+  }
+
+  /// The underlying `TFL_Interpreter` C pointer.
+  private var cInterpreter: CInterpreter?
+
+  /// Creates a new model interpreter instance.
+  ///
+  /// - Parameters:
+  ///   - modelPath: Local file path to a TensorFlow Lite model.
+  ///   - options: Custom configurations for the interpreter. The default is `nil` indicating that
+  ///       interpreter will determine the configuration options.
+  /// - Throws: An error if the model could not be loaded or the interpreter could not be created.
+  public init(modelPath: String, options: InterpreterOptions? = nil) throws {
+    guard let model = Model(filePath: modelPath) else { throw InterpreterError.failedToLoadModel }
+
+    let cInterpreterOptions: OpaquePointer? = try options.map { options in
+      guard let cOptions = TFL_NewInterpreterOptions() else {
+        throw InterpreterError.failedToCreateInterpreter
+      }
+      if let threadCount = options.threadCount, threadCount > 0 {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, Int32(threadCount))
+      }
+      if options.isErrorLoggingEnabled {
+        TFL_InterpreterOptionsSetErrorReporter(
+          cOptions,
+          { (_, format, arguments) in
+            guard let cFormat = format,
+                  let message = String(cFormat: cFormat, arguments: arguments)
+            else {
+              return
+            }
+            print(String(describing: InterpreterError.tensorFlowLiteError(message)))
+          },
+          nil
+        )
+      }
+      return cOptions
+    }
+    defer { TFL_DeleteInterpreterOptions(cInterpreterOptions) }
+
+    guard let cInterpreter = TFL_NewInterpreter(model.cModel, cInterpreterOptions) else {
+      throw InterpreterError.failedToCreateInterpreter
+    }
+    self.cInterpreter = cInterpreter
+  }
+
+  deinit {
+    TFL_DeleteInterpreter(cInterpreter)
+  }
+
+  /// Invokes the interpreter to perform inference from the loaded graph.
+  ///
+  /// - Throws: An error if the model was not ready because tensors were not allocated.
+  public func invoke() throws {
+    guard TFL_InterpreterInvoke(cInterpreter) == kTfLiteOk else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.allocateTensorsRequired
+    }
+  }
+
+  /// Returns the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the index is invalid or the tensors have not been allocated.
+  /// - Returns: The input tensor at the given index.
+  public func input(at index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Returns the output tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the output tensor.
+  /// - Throws: An error if the index is invalid, tensors haven't been allocated, or interpreter
+  ///     hasn't been invoked for models that dynamically compute output tensors based on the values
+  ///     of its input tensors.
+  /// - Returns: The output tensor at the given index.
+  public func output(at index: Int) throws -> Tensor {
+    let maxIndex = outputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetOutputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.invokeInterpreterRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Resizes the input tensor at the given index to the specified tensor shape.
+  ///
+  /// - Note: After resizing an input tensor, the client **must** explicitly call
+  ///     `allocateTensors()` before attempting to access the resized tensor data or invoking the
+  ///     interpreter to perform inference.
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  ///   - shape: The shape that the input tensor should be resized to.
+  /// - Throws: An error if the input tensor at the given index could not be resized.
+  public func resizeInput(at index: Int, to shape: TensorShape) throws {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard TFL_InterpreterResizeInputTensor(
+            cInterpreter,
+            Int32(index),
+            shape.int32Dimensions,
+            Int32(shape.rank)
+          ) == kTfLiteOk
+    else {
+      throw InterpreterError.failedToResizeInputTensor(index: index)
+    }
+  }
+
+  /// Copies the given data to the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - data: The data to be copied to the input tensor's data buffer.
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the `data.count` does not match the input tensor's `data.count` or if
+  ///     the given index is invalid.
+  /// - Returns: The input tensor with the copied data.
+  @discardableResult
+  public func copy(_ data: Data, toInputAt index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)) else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+
+    let byteCount = TFL_TensorByteSize(cTensor)
+    guard data.count == byteCount else {
+      throw InterpreterError.invalidTensorDataCount(provided: data.count, required: byteCount)
+    }
+
+    let status = data.withUnsafeBytes { TFL_TensorCopyFromBuffer(cTensor, $0, data.count) }
+    guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
+    return try input(at: index)
+  }
+
+  /// Allocates memory for all input tensors based on their `TensorShape`s.
+  ///
+  /// - Note: This is a relatively expensive operation and should only be called after creating the
+  ///     interpreter and/or resizing any input tensors.
+  /// - Throws: An error if memory could not be allocated for the input tensors.
+  public func allocateTensors() throws {
+    guard TFL_InterpreterAllocateTensors(cInterpreter) == kTfLiteOk else {
+      throw InterpreterError.failedToAllocateTensors
+    }
+  }
+}
+
+// MARK: - Extensions
+
+extension String {
+  /// Returns a new `String` initialized by using the given format C array as a template into which
+  /// the remaining argument values are substituted according to the user’s default locale.
+  ///
+  /// - Note: Returns `nil` if a new `String` could not be constructed from the given values.
+  /// - Parameters:
+  ///   - cFormat: The format C array as a template for substituting values.
+  ///   - arguments: A C pointer to a `va_list` of arguments to substitute into `cFormat`.
+  init?(cFormat: UnsafePointer<CChar>, arguments: CVaListPointer) {
+    var buffer: UnsafeMutablePointer<CChar>?
+    guard vasprintf(&buffer, cFormat, arguments) != 0, let cString = buffer else { return nil }
+    self.init(validatingUTF8: cString)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
new file mode 100644
index 0000000000000000000000000000000000000000..5de58b997a76b6bf9493525694bc9f9e4e6b6c1c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
@@ -0,0 +1,99 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// TensorFlow Lite interpreter errors.
+public enum InterpreterError: Error {
+  case invalidTensorIndex(index: Int, maxIndex: Int)
+  case invalidTensorDataCount(provided: Int, required: Int)
+  case invalidTensorDataType
+  case failedToLoadModel
+  case failedToCreateInterpreter
+  case failedToResizeInputTensor(index: Int)
+  case failedToCopyDataToInputTensor
+  case failedToAllocateTensors
+  case allocateTensorsRequired
+  case invokeInterpreterRequired
+  case tensorFlowLiteError(String)
+}
+
+// MARK: - Extensions
+
+extension InterpreterError: LocalizedError {
+  /// Localized description of the interpreter error.
+  public var errorDescription: String? {
+    switch self {
+    case .invalidTensorIndex(let index, let maxIndex):
+      return "Invalid tensor index \(index), max index is \(maxIndex)."
+    case .invalidTensorDataCount(let providedCount, let requiredCount):
+      return "Provided data count \(providedCount) must match the required count \(requiredCount)."
+    case .invalidTensorDataType:
+      return "Tensor data type is unsupported or could not be determined because of a model error."
+    case .failedToLoadModel:
+      return "Failed to load the given model."
+    case .failedToCreateInterpreter:
+      return "Failed to create the interpreter."
+    case .failedToResizeInputTensor(let index):
+      return "Failed to resize input tesnor at index \(index)."
+    case .failedToCopyDataToInputTensor:
+      return "Failed to copy data to input tensor."
+    case .failedToAllocateTensors:
+      return "Failed to allocate memory for input tensors."
+    case .allocateTensorsRequired:
+      return "Must call allocateTensors()."
+    case .invokeInterpreterRequired:
+      return "Must call invoke()."
+    case .tensorFlowLiteError(let message):
+      return "TensorFlow Lite Error: \(message)"
+    }
+  }
+}
+
+extension InterpreterError: CustomStringConvertible {
+  /// Textual representation of the TensorFlow Lite interpreter error.
+  public var description: String {
+    return errorDescription ?? "Unknown error."
+  }
+}
+
+#if swift(>=4.2)
+extension InterpreterError: Equatable {}
+#else
+extension InterpreterError: Equatable {
+  public static func == (lhs: InterpreterError, rhs: InterpreterError) -> Bool {
+    switch (lhs, rhs) {
+    case (.invalidTensorDataType, .invalidTensorDataType),
+         (.failedToLoadModel, .failedToLoadModel),
+         (.failedToCreateInterpreter, .failedToCreateInterpreter),
+         (.failedToAllocateTensors, .failedToAllocateTensors),
+         (.allocateTensorsRequired, .allocateTensorsRequired),
+         (.invokeInterpreterRequired, .invokeInterpreterRequired):
+      return true
+    case (.invalidTensorIndex(let lhsIndex, let lhsMaxIndex),
+          .invalidTensorIndex(let rhsIndex, let rhsMaxIndex)):
+      return lhsIndex == rhsIndex && lhsMaxIndex == rhsMaxIndex
+    case (.invalidTensorDataCount(let lhsProvidedCount, let lhsRequiredCount),
+          .invalidTensorDataCount(let rhsProvidedCount, let rhsRequiredCount)):
+      return lhsProvidedCount == rhsProvidedCount && lhsRequiredCount == rhsRequiredCount
+    case (.failedToResizeInputTensor(let lhsIndex), .failedToResizeInputTensor(let rhsIndex)):
+      return lhsIndex == rhsIndex
+    case (.tensorFlowLiteError(let lhsMessage), .tensorFlowLiteError(let rhsMessage)):
+      return lhsMessage == rhsMessage
+    default:
+      return false
+    }
+  }
+}
+#endif  // swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
new file mode 100644
index 0000000000000000000000000000000000000000..2365fd7ade0f9562250b239308f6a13b16c35784
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Custom configuration options for a TensorFlow Lite interpreter.
+public struct InterpreterOptions: Equatable {
+
+  /// Maximum number of CPU threads that the interpreter should run on. Default is `nil` which
+  /// indicates that the `Interpreter` will decide the number of threads to use.
+  public var threadCount: Int? = nil
+
+  /// Whether error logging to the console is enabled. The default is `false`.
+  public var isErrorLoggingEnabled = false
+
+  /// Creates a new instance of interpreter options.
+  public init() {}
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Model.swift b/tensorflow/lite/experimental/swift/Sources/Model.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e8c49ff1ae10cc20d1c50b8e8340950cb1491722
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Model.swift
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite model used by the 'Interpreter` to perform inference.
+final class Model {
+
+  /// The `TFL_Model` C pointer type represented as an `UnsafePointer<TFL_Model>`.
+  typealias CModel = OpaquePointer
+
+  /// The underlying `TFL_Model` C pointer.
+  let cModel: CModel?
+
+  /// Creates a new model instance.
+  ///
+  /// - Precondition: Initialization can fail if the given `filePath` is invalid.
+  /// - Parameters:
+  ///   - filePath: Local file path to a TensorFlow Lite model.
+  init?(filePath: String) {
+    guard !filePath.isEmpty, let cModel = TFL_NewModelFromFile(filePath) else { return nil }
+    self.cModel = cModel
+  }
+
+  deinit {
+    TFL_DeleteModel(cModel)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
new file mode 100644
index 0000000000000000000000000000000000000000..f36787564478115e19584b933a10fb0458e06c71
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Parameters that determine the mapping of quantized values to real values. Quantized values can
+/// be mapped to float values using the following conversion:
+/// `realValue = scale * (quantizedValue - zeroPoint)`.
+public struct QuantizationParameters {
+
+  /// Difference between real values corresponding to consecutive quantized values differing by 1.
+  /// For example, the range of quantized values for `UInt8` data type is [0, 255].
+  public let scale: Float
+
+  /// Quantized value that corresponds to the real 0 value.
+  public let zeroPoint: Int
+
+  /// Creates a new quantization parameters instance.
+  ///
+  /// - Parameters:
+  ///   - scale: Scale value for asymmetric quantization.
+  ///   - zeroPoint: Zero point for asymmetric quantization.
+  init(scale: Float, zeroPoint: Int) {
+    self.scale = scale
+    self.zeroPoint = zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Tensor.swift b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
new file mode 100644
index 0000000000000000000000000000000000000000..b738d8754914e20ac4c1cb991c92b029828f66d2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
@@ -0,0 +1,138 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// An input or output tensor in a TensorFlow Lite graph.
+public struct Tensor {
+
+  /// Name of the tensor.
+  public let name: String
+
+  /// Data type of the tensor.
+  public let dataType: TensorDataType
+
+  /// Shape of the tensor.
+  public let shape: TensorShape
+
+  /// Data in the input or output tensor.
+  public let data: Data
+
+  /// Quantization parameters for the tensor if using a quantized model.
+  public let quantizationParameters: QuantizationParameters?
+
+  /// Creates a new input or output tensor instance.
+  ///
+  /// - Parameters:
+  ///   - name: Name of the tensor.
+  ///   - dataType: Data type of the tensor.
+  ///   - data: Data in the input tensor.
+  ///   - quantizationParameters Quantization parameters for the tensor if using a quantized model.
+  ///       The default is `nil`.
+  init(
+    name: String,
+    dataType: TensorDataType,
+    shape: TensorShape,
+    data: Data,
+    quantizationParameters: QuantizationParameters? = nil
+  ) {
+    self.name = name
+    self.dataType = dataType
+    self.shape = shape
+    self.data = data
+    self.quantizationParameters = quantizationParameters
+  }
+}
+
+/// Supported TensorFlow Lite tensor data types.
+public enum TensorDataType: Equatable {
+  /// 32-bit single precision floating point tensor data type.
+  case float32
+  /// 8-bit unsigned integer tensor data type.
+  case uInt8
+  /// 16-bit signed integer tensor data type.
+  case int16
+  /// 32-bit signed integer tensor data type.
+  case int32
+  /// 64-bit signed integer tensor data type.
+  case int64
+  /// Boolean tensor data type.
+  case bool
+
+  /// Creates a new tensor data type from the given `TFL_Type` or `nil` if the data type is
+  /// unsupported or could not be determined because there was an error.
+  ///
+  /// - Parameter type: A data type supported by a tensor.
+  init?(type: TFL_Type) {
+    switch type {
+    case kTfLiteFloat32:
+      self = .float32
+    case kTfLiteUInt8:
+      self = .uInt8
+    case kTfLiteInt16:
+      self = .int16
+    case kTfLiteInt32:
+      self = .int32
+    case kTfLiteInt64:
+      self = .int64
+    case kTfLiteBool:
+      self = .bool
+    case kTfLiteNoType:
+      fallthrough
+    default:
+      return nil
+    }
+  }
+}
+
+/// The shape of a TensorFlow Lite tensor.
+public struct TensorShape {
+
+  /// The number of dimensions of the tensor.
+  public let rank: Int
+
+  /// Array of dimensions for the tensor.
+  public let dimensions: [Int]
+
+  /// Array of `Int32` dimensions for the tensor.
+  var int32Dimensions: [Int32] { return dimensions.map(Int32.init) }
+
+  /// Creates a new tensor shape instance with the given array of dimensions.
+  ///
+  /// - Parameters:
+  ///   - dimensions: Dimensions for the tensor.
+  public init(_ dimensions: [Int]) {
+    self.rank = dimensions.count
+    self.dimensions = dimensions
+  }
+
+  /// Creates a new tensor shape instance with the given elements representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - elements: Dimensions for the tensor.
+  public init(_ elements: Int...) {
+    self.init(elements)
+  }
+}
+
+extension TensorShape: ExpressibleByArrayLiteral {
+  /// Creates a new tensor shape instance with the given array literal representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - arrayLiteral: Dimensions for the tensor.
+  public init(arrayLiteral: Int...) {
+    self.init(arrayLiteral)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..16bc6cbfe8f554caad2cba3cae11b364b34ed64d
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
@@ -0,0 +1,57 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/swift",
+    "tensorflow/lite/experimental/swift/Sources",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj",
+    "tensorflow/lite/experimental/swift/Tests",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/swift:TensorFlowLite",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteApp",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLite",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/swift/BUILD"
+  ]
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..82ac8aa38126021c176773e4093352bcbecd8603
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
@@ -0,0 +1,14 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "ProjectPrioritizesSwift" : {
+        "p" : "YES"
+      }
+    }
+  },
+  "projectName" : "TensorFlowLite",
+  "packages" : [
+    "tensorflow/lite/experimental/swift"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..fbbf9a1de2c8e82ab486b99b9e9b8c6dfe80868e
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
@@ -0,0 +1,345 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */; };
+		4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B722146ED64006C3AEF /* AppDelegate.swift */; };
+		4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B742146ED64006C3AEF /* ViewController.swift */; };
+		4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B762146ED64006C3AEF /* Main.storyboard */; };
+		4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B792146ED66006C3AEF /* Assets.xcassets */; };
+		4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */; };
+		4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+TensorFlowLite.swift"; sourceTree = "<group>"; };
+		4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TensorFlowLiteApp.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		4AA72B722146ED64006C3AEF /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
+		4AA72B742146ED64006C3AEF /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; };
+		4AA72B772146ED64006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		4AA72B792146ED66006C3AEF /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		4AA72B7C2146ED66006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
+		4AA72B7E2146ED66006C3AEF /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Array+TensorFlowLite.swift"; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		4AA72B6C2146ED64006C3AEF /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		4AA72B662146ED64006C3AEF = {
+			isa = PBXGroup;
+			children = (
+				4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */,
+				4AA72B702146ED64006C3AEF /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		4AA72B702146ED64006C3AEF /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B722146ED64006C3AEF /* AppDelegate.swift */,
+				4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */,
+				4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */,
+				4AA72B742146ED64006C3AEF /* ViewController.swift */,
+				4AA72B762146ED64006C3AEF /* Main.storyboard */,
+				4AA72B792146ED66006C3AEF /* Assets.xcassets */,
+				4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */,
+				4AA72B7E2146ED66006C3AEF /* Info.plist */,
+			);
+			path = TensorFlowLiteApp;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */;
+			buildPhases = (
+				4AA72B6B2146ED64006C3AEF /* Sources */,
+				4AA72B6C2146ED64006C3AEF /* Frameworks */,
+				4AA72B6D2146ED64006C3AEF /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = TensorFlowLiteApp;
+			productName = TensorFlowLiteApp;
+			productReference = 4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		4AA72B672146ED64006C3AEF /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0940;
+				LastUpgradeCheck = 0940;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					4AA72B6E2146ED64006C3AEF = {
+						CreatedOnToolsVersion = 9.4.1;
+					};
+				};
+			};
+			buildConfigurationList = 4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 4AA72B662146ED64006C3AEF;
+			productRefGroup = 4AA72B702146ED64006C3AEF /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		4AA72B6D2146ED64006C3AEF /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */,
+				4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */,
+				4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		4AA72B6B2146ED64006C3AEF /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */,
+				4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */,
+				4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */,
+				4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin PBXVariantGroup section */
+		4AA72B762146ED64006C3AEF /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B772146ED64006C3AEF /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+		4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B7C2146ED66006C3AEF /* Base */,
+			);
+			name = LaunchScreen.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+
+/* Begin XCBuildConfiguration section */
+		4AA72B7F2146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+			};
+			name = Debug;
+		};
+		4AA72B802146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_OPTIMIZATION_LEVEL = "-O";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		4AA72B822146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		4AA72B832146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B7F2146ED66006C3AEF /* Debug */,
+				4AA72B802146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B822146ED66006C3AEF /* Debug */,
+				4AA72B832146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 4AA72B672146ED64006C3AEF /* Project object */;
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
new file mode 100644
index 0000000000000000000000000000000000000000..ffa90a06adb0b9f93575c8390cd30bd589e43ac7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
@@ -0,0 +1,24 @@
+import UIKit
+
+@UIApplicationMain
+
+final class AppDelegate: UIResponder, UIApplicationDelegate {
+
+  /// The main window of the app.
+  var window: UIWindow?
+
+  func application(
+    _ application: UIApplication,
+    didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]? = nil
+  ) -> Bool {
+    return true
+  }
+}
+
+// MARK: - Extensions
+
+#if !swift(>=4.2)
+extension UIApplication {
+  typealias LaunchOptionsKey = UIApplicationLaunchOptionsKey
+}
+#endif  // !swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..56df1ce6597aacf307f7a89a084527ea93c303c2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
@@ -0,0 +1,22 @@
+import Foundation
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Warning: The array's `Element` type must be trivial in that it can be copied bit for bit
+  ///     with no indirection or reference-counting operations; otherwise, copying the raw bytes in
+  ///     the `unsafeData`'s buffer to a new array returns an unsafe copy.
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8db8d65fd79fd541b2b7eba75c7378af3448f9c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,98 @@
+{
+  "images" : [
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "83.5x83.5",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ios-marketing",
+      "size" : "1024x1024",
+      "scale" : "1x"
+    }
+  ],
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..da4a164c918651cdd1e11dca5cc62c333f097601
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..a07a1321be2e65323fadeca51487671c88f462c8
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14109" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14088"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="EHf-IW-A2E">
+            <objects>
+                <viewController id="01J-lp-oVM" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="Llm-lL-Icb"/>
+                        <viewControllerLayoutGuide type="bottom" id="xb3-aO-Qok"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="TensorFlowLite" textAlignment="center" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="3Gq-PV-hia">
+                                <rect key="frame" x="16" y="315" width="343" height="38.5"/>
+                                <fontDescription key="fontDescription" type="boldSystem" pointSize="32"/>
+                                <nil key="textColor"/>
+                                <nil key="highlightedColor"/>
+                            </label>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="leading" secondItem="Ze5-6b-2t3" secondAttribute="leading" constant="16" id="aXL-9T-5Pf"/>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="centerY" secondItem="Ze5-6b-2t3" secondAttribute="centerY" id="cDf-Go-1FR"/>
+                            <constraint firstAttribute="trailing" secondItem="3Gq-PV-hia" secondAttribute="trailing" constant="16" id="fB9-BX-A3B"/>
+                        </constraints>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="52" y="374.66266866566718"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..b9e8bfb822f90ea1e1db31a21d482d6e522e374f
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="tne-QT-ifu">
+            <objects>
+                <viewController storyboardIdentifier="viewController" useStoryboardIdentifierAsRestorationIdentifier="YES" id="BYZ-38-t0r" customClass="ViewController" customModule="AppLib" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="y3c-jy-aDJ"/>
+                        <viewControllerLayoutGuide type="bottom" id="wfy-db-euE"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="8bC-Xf-vdC">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" editable="NO" selectable="NO" translatesAutoresizingMaskIntoConstraints="NO" id="7Mj-sL-hrd">
+                                <rect key="frame" x="0.0" y="367" width="375" height="300"/>
+                                <color key="backgroundColor" red="0.0" green="0.47843137250000001" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="300" id="YUb-MC-D5w"/>
+                                </constraints>
+                                <color key="textColor" cocoaTouchSystemColor="tableCellGroupedBackgroundColor"/>
+                                <fontDescription key="fontDescription" type="system" pointSize="14"/>
+                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
+                            </textView>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Qwg-EP-bd6" userLabel="Bottom Toolbar">
+                                <rect key="frame" x="0.0" y="323" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="jhT-Q0-E9N"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="P3q-uA-YUa"/>
+                                    <barButtonItem title="Invoke Interpreter" id="A4J-Mg-nmd" userLabel="Invoke Button">
+                                        <connections>
+                                            <action selector="invokeInterpreter:" destination="BYZ-38-t0r" id="lZU-x7-PsJ"/>
+                                        </connections>
+                                    </barButtonItem>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="Qad-Pa-ySg"/>
+                                </items>
+                            </toolbar>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Gkb-TR-PCB" userLabel="Top Toolbar">
+                                <rect key="frame" x="0.0" y="28" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="hSD-2q-fUE"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" id="LKw-TX-bbH">
+                                        <segmentedControl key="customView" opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="left" contentVerticalAlignment="top" segmentControlStyle="bar" selectedSegmentIndex="0" id="rhA-nW-xzT">
+                                            <rect key="frame" x="16" y="7" width="343" height="30"/>
+                                            <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                            <segments>
+                                                <segment title="Add"/>
+                                                <segment title="AddQuantized"/>
+                                                <segment title="MultiAdd"/>
+                                            </segments>
+                                            <connections>
+                                                <action selector="modelChanged:" destination="BYZ-38-t0r" eventType="valueChanged" id="YnG-Ov-B5D"/>
+                                            </connections>
+                                        </segmentedControl>
+                                    </barButtonItem>
+                                </items>
+                            </toolbar>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstAttribute="trailing" secondItem="Gkb-TR-PCB" secondAttribute="trailing" id="4Cr-Sf-I7n"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="bottom" secondItem="wfy-db-euE" secondAttribute="top" id="6ot-zD-sze"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="top" secondItem="Qwg-EP-bd6" secondAttribute="bottom" id="ELA-C6-NiG"/>
+                            <constraint firstAttribute="trailing" secondItem="7Mj-sL-hrd" secondAttribute="trailing" id="HDO-xr-mBl"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="Kmo-6K-gS4"/>
+                            <constraint firstItem="Qwg-EP-bd6" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="hGu-lm-fMG"/>
+                            <constraint firstAttribute="trailing" secondItem="Qwg-EP-bd6" secondAttribute="trailing" id="iXR-LK-nTO"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="nr7-jW-ZYf"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="top" secondItem="y3c-jy-aDJ" secondAttribute="bottom" constant="8" id="uCF-VW-rR0"/>
+                        </constraints>
+                    </view>
+                    <connections>
+                        <outlet property="invokeButton" destination="A4J-Mg-nmd" id="UxZ-Ft-E45"/>
+                        <outlet property="modelControl" destination="rhA-nW-xzT" id="KKf-TT-BQ2"/>
+                        <outlet property="resultsTextView" destination="7Mj-sL-hrd" id="T4I-z4-tYA"/>
+                    </connections>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="125.59999999999999" y="133.5832083958021"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..bc8a70c848390ad7ba584629563d7d75a9e32341
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
@@ -0,0 +1,13 @@
+import Foundation
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..3ca3875f04e5789da9cfb34a44151cd06226a8f3
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>0.0.1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>NSCameraUsageDescription</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Select a photo to detect objects in.</string>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+</dict>
+</plist>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
new file mode 100644
index 0000000000000000000000000000000000000000..73c74fd19c996653d988977d551fcef683f18697
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
@@ -0,0 +1,299 @@
+import TensorFlowLite
+import UIKit
+
+class ViewController: UIViewController {
+
+  // MARK: - Properties
+
+  /// TensorFlowLite interpreter object for performing inference from a given model.
+  private var interpreter: Interpreter?
+
+  /// Serial dispatch queue for managing `Interpreter` calls.
+  private let interpreterQueue = DispatchQueue(
+    label: Constant.dispatchQueueLabel,
+    qos: .userInitiated
+  )
+
+  /// The currently selected model.
+  private var currentModel: Model {
+    guard let currentModel = Model(rawValue: modelControl.selectedSegmentIndex) else {
+      preconditionFailure("Invalid model for selected segment index.")
+    }
+    return currentModel
+  }
+
+  /// A description of the current model.
+  private var modelDescription: String {
+    guard let interpreter = interpreter else { return "" }
+    let inputCount = interpreter.inputTensorCount
+    let outputCount = interpreter.outputTensorCount
+    let inputTensors = (0..<inputCount).map { index in
+      var tensorInfo = "  Input \(index + 1): "
+      do {
+        let tensor = try interpreter.input(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    let outputTensors = (0..<outputCount).map { index in
+      var tensorInfo = "  Output \(index + 1): "
+      do {
+        let tensor = try interpreter.output(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    return "Model Description:\n" +
+             "  Input Tensor Count = \(inputCount)\n\(inputTensors)\n\n" +
+             "  Output Tensor Count = \(outputCount)\n\(outputTensors)"
+  }
+
+  // MARK: - IBOutlets
+
+  /// A segmented control for changing models. See the `Model` enum for available models.
+  @IBOutlet private var modelControl: UISegmentedControl!
+
+  @IBOutlet private var resultsTextView: UITextView!
+  @IBOutlet private var invokeButton: UIBarButtonItem!
+
+  // MARK: - UIViewController
+
+  override func viewDidLoad() {
+    super.viewDidLoad()
+
+    invokeButton.isEnabled = false
+    loadModel()
+  }
+
+  // MARK: - IBActions
+
+  @IBAction func modelChanged(_ sender: Any) {
+    invokeButton.isEnabled = false
+    updateResultsText("Switched to the \(currentModel.description) model.")
+    loadModel()
+  }
+
+  @IBAction func invokeInterpreter(_ sender: Any) {
+    switch currentModel {
+    case .add:
+      invokeAdd()
+    case .addQuantized:
+      invokeAddQuantized()
+    case .multiAdd:
+      invokeMultiAdd()
+    }
+  }
+
+  // MARK: - Private
+
+  private func loadModel() {
+    let fileInfo = currentModel.fileInfo
+    guard let modelPath = Bundle.main.path(forResource: fileInfo.name, ofType: fileInfo.extension)
+    else {
+      updateResultsText("Failed to load the \(currentModel.description) model.")
+      return
+    }
+    setUpInterpreter(withModelPath: modelPath)
+  }
+
+  private func setUpInterpreter(withModelPath modelPath: String) {
+    interpreterQueue.async {
+      do {
+        var options = InterpreterOptions()
+        options.isErrorLoggingEnabled = true
+        self.interpreter = try Interpreter(modelPath: modelPath, options: options)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to create the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+      safeDispatchOnMain { self.invokeButton.isEnabled = true }
+    }
+  }
+
+  private func invokeAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [Float32] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(copyingBufferOf: input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let results = [Float32](unsafeData: outputTensor.data) else { return "No results." }
+          return resultsText + results.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeAddQuantized() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [UInt8] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on quantized input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let quantizationParameters = outputTensor.quantizationParameters else {
+            return "No results."
+          }
+          let quantizedResults = [UInt8](outputTensor.data)
+          let dequantizedResults = quantizedResults.map {
+            quantizationParameters.scale * Float(Int($0) - quantizationParameters.zeroPoint)
+          }
+          return resultsText + quantizedResults.description +
+                   ", dequantized results: " + dequantizedResults.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeMultiAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        let shape = TensorShape(2)
+        try (0..<interpreter.inputTensorCount).forEach { index in
+          try interpreter.resizeInput(at: index, to: shape)
+        }
+        try interpreter.allocateTensors()
+        let inputs = try (0..<interpreter.inputTensorCount).map { index -> [Float32] in
+          let input = [Float32(index + 1), Float32(index + 2)]
+          let data = Data(copyingBufferOf: input)
+          try interpreter.copy(data, toInputAt: index)
+          return input
+        }
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 3 add operations on inputs \(inputs.description) equals: "
+        self.updateResultsText(resultsText)
+        try interpreter.invoke()
+        let results = try (0..<interpreter.outputTensorCount).map { index -> [Float32] in
+          let tensor = try interpreter.output(at: index)
+          return [Float32](unsafeData: tensor.data) ?? []
+        }
+        self.updateResultsText(resultsText + results.description)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func updateResultsText(_ text: String? = nil) {
+    safeDispatchOnMain { self.resultsTextView.text = text }
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let dispatchQueueLabel = "TensorFlowLiteInterpreterQueue"
+  static let nilInterpreterErrorMessage =
+    "Failed to invoke the interpreter because the interpreter was nil."
+}
+
+/// Models that can be loaded by the TensorFlow Lite `Interpreter`.
+private enum Model: Int, CustomStringConvertible {
+  /// A float model that performs two add operations on one input tensor and returns the result in
+  /// one output tensor.
+  case add = 0
+  /// A quantized model that performs two add operations on one input tensor and returns the result
+  /// in one output tensor.
+  case addQuantized = 1
+  /// A float model that performs three add operations on four input tensors and returns the results
+  /// in 2 output tensors.
+  case multiAdd = 2
+
+  var fileInfo: (name: String, extension: String) {
+    switch self {
+    case .add:
+      return Add.fileInfo
+    case .addQuantized:
+      return AddQuantized.fileInfo
+    case .multiAdd:
+      return MultiAdd.fileInfo
+    }
+  }
+
+  // MARK: - CustomStringConvertible
+
+  var description: String {
+    switch self {
+    case .add:
+      return Add.name
+    case .addQuantized:
+      return AddQuantized.name
+    case .multiAdd:
+      return MultiAdd.name
+    }
+  }
+}
+
+/// Values for the `Add` model.
+private enum Add {
+  static let name = "Add"
+  static let fileInfo = (name: "add", extension: "bin")
+}
+
+/// Values for the `AddQuantized` model.
+private enum AddQuantized {
+  static let name = "AddQuantized"
+  static let fileInfo = (name: "add_quantized", extension: "bin")
+}
+
+/// Values for the `MultiAdd` model.
+private enum MultiAdd {
+  static let name = "MultiAdd"
+  static let fileInfo = (name: "multi_add", extension: "bin")
+}
+
+// MARK: - Fileprivate
+
+/// Safely dispatches the given block on the main queue. If the current thread is `main`, the block
+/// is executed synchronously; otherwise, the block is executed asynchronously on the main thread.
+fileprivate func safeDispatchOnMain(_ block: @escaping () -> Void) {
+  if Thread.isMainThread { block(); return }
+  DispatchQueue.main.async { block() }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..54b4f59b28942fe2398aba1a19443857e9617458
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
@@ -0,0 +1,54 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterOptionsTests: XCTestCase {
+
+  func testInterpreterOptions_InitWithDefaultValues() {
+    let options = InterpreterOptions()
+    XCTAssertNil(options.threadCount)
+    XCTAssertFalse(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_InitWithCustomValues() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertEqual(options.threadCount, 2)
+    options.isErrorLoggingEnabled = true
+    XCTAssertTrue(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_Equatable() {
+    var options1 = InterpreterOptions()
+    var options2 = InterpreterOptions()
+    XCTAssertEqual(options1, options2)
+
+    options1.threadCount = 2
+    options2.threadCount = 2
+    XCTAssertEqual(options1, options2)
+
+    options2.threadCount = 3
+    XCTAssertNotEqual(options1, options2)
+    options2.threadCount = 2
+
+    options1.isErrorLoggingEnabled = true
+    options2.isErrorLoggingEnabled = true
+    XCTAssertEqual(options1, options2)
+
+    options2.isErrorLoggingEnabled = false
+    XCTAssertNotEqual(options1, options2)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e98da5f951e9bc6bfebaf6a1bd76b3c8c8bb9e83
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -0,0 +1,315 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterTests: XCTestCase {
+
+  var interpreter: Interpreter!
+
+  override func setUp() {
+    super.setUp()
+
+    interpreter = try! Interpreter(modelPath: AddModel.path)
+  }
+
+  override func tearDown() {
+    interpreter = nil
+
+    super.tearDown()
+  }
+
+  func testInterpreter_InitWithModelPath() {
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path))
+  }
+
+  func testInterpreter_Init_ThrowsFailedToLoadModel() {
+    XCTAssertThrowsError(try Interpreter(modelPath: "/invalid/path")) { error in
+      self.assertEqualErrors(actual: error, expected: .failedToLoadModel)
+    }
+  }
+
+  func testInterpreter_InitWithModelPathAndOptions() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path, options: options))
+  }
+
+  func testInterpreter_InputTensorCount() {
+    XCTAssertEqual(interpreter.inputTensorCount, AddModel.inputTensorCount)
+  }
+
+  func testInterpreter_OutputTensorCount() {
+    XCTAssertEqual(interpreter.outputTensorCount, AddModel.outputTensorCount)
+  }
+
+  func testInterpreter_Invoke() throws {
+    try interpreter.allocateTensors()
+    XCTAssertNoThrow(try interpreter.invoke())
+  }
+
+  func testInterpreter_Invoke_ThrowsAllocateTensorsRequired_ModelNotReady() {
+    XCTAssertThrowsError(try interpreter.invoke()) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddModel.validIndex)
+    XCTAssertEqual(inputTensor, AddModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(inputTensor, AddQuantizedModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsAllocateTensorsRequired() {
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddModel.validIndex)
+    XCTAssertEqual(outputTensor, AddModel.outputTensor)
+    let expectedResults = [Float32](unsafeData: outputTensor.data)
+    XCTAssertEqual(expectedResults, AddModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(outputTensor, AddQuantizedModel.outputTensor)
+    let expectedResults = [UInt8](outputTensor.data)
+    XCTAssertEqual(expectedResults, AddQuantizedModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    try interpreter.invoke()
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.outputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvokeInterpreterRequired() {
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .invokeInterpreterRequired)
+    }
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape() {
+    XCTAssertNoThrow(try interpreter.resizeInput(at: AddModel.validIndex, to: [2, 2, 3]))
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.resizeInput(
+      at: AddModel.invalidIndex,
+      to: [2, 2, 3]
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let inputTensor = try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+    XCTAssertEqual(inputTensor.data, AddModel.inputData)
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.copy(
+      AddModel.inputData,
+      toInputAt: AddModel.invalidIndex
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidDataCount() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let invalidData = Data(count: AddModel.dataCount - 1)
+    XCTAssertThrowsError(try interpreter.copy(
+      invalidData,
+      toInputAt: AddModel.validIndex
+    )) { error in
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
+      )
+    }
+  }
+
+  func testInterpreter_AllocateTensors() {
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  // MARK: - Private
+
+  private func setUpAddModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+  }
+
+  private func setUpAddQuantizedModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddQuantizedModel.inputOutputIndex, to: AddQuantizedModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddQuantizedModel.inputData, toInputAt: AddQuantizedModel.inputOutputIndex)
+  }
+
+  private func assertEqualErrors(actual: Error, expected: InterpreterError) {
+    guard let actual = actual as? InterpreterError else {
+      XCTFail("Actual error should be of type InterpreterError.")
+      return
+    }
+    XCTAssertEqual(actual, expected)
+  }
+}
+
+// MARK: - Constants
+
+/// Values for the `add.bin` model.
+private enum AddModel {
+  static let info = (name: "add", extension: "bin")
+  static let inputTensorCount = 1
+  static let outputTensorCount = 1
+  static let invalidIndex = 1
+  static let validIndex = 0
+  static let shape: TensorShape = [2]
+  static let dataCount = inputData.count
+  static let inputData = Data(copyingBufferOf: [Float32(1.0), Float32(3.0)])
+  static let outputData = Data(copyingBufferOf: [Float32(3.0), Float32(9.0)])
+  static let results = [Float32(3.0), Float32(9.0)]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .float32,
+    shape: shape,
+    data: inputData
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .float32,
+    shape: shape,
+    data: outputData
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+/// Values for the `add_quantized.bin` model.
+private enum AddQuantizedModel {
+  static let info = (name: "add_quantized", extension: "bin")
+  static let inputOutputIndex = 0
+  static let shape: TensorShape = [2]
+  static let inputData = Data([1, 3])
+  static let outputData = Data([3, 9])
+  static let quantizationParameters = QuantizationParameters(scale: 0.003922, zeroPoint: 0)
+  static let results: [UInt8] = [3, 9]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .uInt8,
+    shape: shape,
+    data: inputData,
+    quantizationParameters: quantizationParameters
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .uInt8,
+    shape: shape,
+    data: outputData,
+    quantizationParameters: quantizationParameters
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+// MARK: - Extensions
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/ModelTests.swift b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..025db1890607641d49304ae22da1fc33fed084ef
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
@@ -0,0 +1,59 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class ModelTests: XCTestCase {
+
+  var modelPath: String!
+
+  override func setUp() {
+    super.setUp()
+
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(
+            forResource: Constant.modelInfo.name,
+            ofType: Constant.modelInfo.extension)
+    else {
+      XCTFail("Failed to get the model file path.")
+      return
+    }
+    self.modelPath = modelPath
+  }
+
+  override func tearDown() {
+    modelPath = nil
+
+    super.tearDown()
+  }
+
+  func testModel_InitWithFilePath() {
+    XCTAssertNotNil(Model(filePath: modelPath))
+  }
+
+  func testModel_InitWithEmptyFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: ""))
+  }
+
+  func testModel_InitWithInvalidFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: "invalid/path"))
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let modelInfo = (name: "add", extension: "bin")
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..65648c26982daa0cab2a40d111d72e10563373cf
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
@@ -0,0 +1,43 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class QuantizationParametersTests: XCTestCase {
+
+  func testQuantizationParameters_InitWithCustomValues() {
+    let parameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters.scale, 0.5)
+    XCTAssertEqual(parameters.zeroPoint, 1)
+  }
+
+  func testQuantizationParameters_Equatable() {
+    let parameters1 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let parameters2 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters1, parameters2)
+
+    let parameters3 = QuantizationParameters(scale: 0.4, zeroPoint: 1)
+    XCTAssertNotEqual(parameters1, parameters3)
+    XCTAssertNotEqual(parameters2, parameters3)
+  }
+}
+
+// MARK: - Extensions
+
+extension QuantizationParameters: Equatable {
+  public static func == (lhs: QuantizationParameters, rhs: QuantizationParameters) -> Bool {
+    return lhs.scale == rhs.scale && lhs.zeroPoint == rhs.zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/TensorTests.swift b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..4540043a1636f43834ec496ffef1e78444ba312b
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
@@ -0,0 +1,83 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class TensorTests: XCTestCase {
+
+  // MARK: - Tensor
+
+  func testTensor_Init() {
+    let name = "InputTensor"
+    let dataType: TensorDataType = .uInt8
+    let shape = TensorShape(Constant.dimensions)
+    guard let data = name.data(using: .utf8) else { XCTFail("Data should not be nil."); return }
+    let quantizationParameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let inputTensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    XCTAssertEqual(inputTensor.name, name)
+    XCTAssertEqual(inputTensor.dataType, dataType)
+    XCTAssertEqual(inputTensor.shape, shape)
+    XCTAssertEqual(inputTensor.data, data)
+    XCTAssertEqual(inputTensor.quantizationParameters, quantizationParameters)
+  }
+
+  // MARK: - TensorShape
+
+  func testTensorShape_InitWithArray() {
+    let shape = TensorShape(Constant.dimensions)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithElements() {
+    let shape = TensorShape(2, 2, 3)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithArrayLiteral() {
+    let shape: TensorShape = [2, 2, 3]
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  /// Array of 2 arrays of 2 arrays of 3 numbers: [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]].
+  static let dimensions = [2, 2, 3]
+}
+
+// MARK: - Extensions
+
+extension TensorShape: Equatable {
+  public static func == (lhs: TensorShape, rhs: TensorShape) -> Bool {
+    return lhs.rank == rhs.rank && lhs.dimensions == rhs.dimensions
+  }
+}
+
+extension Tensor: Equatable {
+  public static func == (lhs: Tensor, rhs: Tensor) -> Bool {
+    return lhs.name == rhs.name && lhs.dataType == rhs.dataType && lhs.shape == rhs.shape &&
+           lhs.data == rhs.data && lhs.quantizationParameters == rhs.quantizationParameters
+  }
+}
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
index 57ce63636714aa616cb50e04fe2c15210cc2eb1c..9ba74d0e9114a5e47bef6c6b146f121d254e0f92 100644
--- a/tensorflow/lite/experimental/writer/BUILD
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -33,7 +33,6 @@ cc_library(
         "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/schema:schema_fbs_with_reflection",
     ],
 )
@@ -48,6 +47,16 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "writer_test",
+    srcs = ["writer_test.cc"],
+    deps = [
+        ":writer_lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
 cc_test(
     name = "writer_lib_test",
     size = "small",
diff --git a/tensorflow/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h
index cb6ec3e0d7e0f1b53cc8b84e10cb1be4b1f023c0..949a255abaf63cbc1cc0b3f718f6aaca4f38fd8d 100644
--- a/tensorflow/lite/experimental/writer/enum_mapping.h
+++ b/tensorflow/lite/experimental/writer/enum_mapping.h
@@ -112,5 +112,29 @@ inline LSHProjectionType LSHProjectionTypeToSchema(
   }
 }
 
+inline MirrorPadMode MirrorPaddingModeToSchema(TfLiteMirrorPaddingMode mode) {
+  switch (mode) {
+    case kTfLiteMirrorPaddingUnknown:
+      return MirrorPadMode_REFLECT;  // TODO(aselle): consider an error
+    case kTfLiteMirrorPaddingReflect:
+      return MirrorPadMode_REFLECT;
+    case kTfLiteMirrorPaddingSymmetric:
+      return MirrorPadMode_SYMMETRIC;
+  }
+}
+
+inline CombinerType CombinerTypeToSchema(TfLiteCombinerType type) {
+  switch (type) {
+    case kTfLiteCombinerTypeSum:
+      return CombinerType_SUM;
+    case kTfLiteCombinerTypeMean:
+      return CombinerType_MEAN;
+    case kTfLiteCombinerTypeSqrtn:
+      return CombinerType_SQRTN;
+  }
+}
+
+// int
+
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index 73742494762b8af9a9a08cd24c6eae1ac25fd426..a5c5dc8709969eccb03250ff194127f47592896a 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -22,54 +22,60 @@ limitations under the License.
 namespace tflite {
 namespace {
 // This is generated by grepping
-//  cat  third_party/tensorflow/lite/builtin_op_data.h
-//| grep "^} TfLite" | sed 's/^} TfLite\(.*\)Params;/\1Params/g' | grep -v "^}"
-static const char* param_structs[] = {"TfLiteConvParams",
-                                      "TfLitePoolParams",
-                                      "TfLiteDepthwiseConvParams",
-                                      "TfLiteSVDFParams",
-                                      "TfLiteRNNParams",
-                                      "TfLiteSequenceRNNParams",
-                                      "TfLiteFullyConnectedParams",
-                                      "TfLiteLSHProjectionParams",
-                                      "TfLiteSoftmaxParams",
-                                      "TfLiteConcatenationParams",
-                                      "TfLiteAddParams",
-                                      "TfLiteSpaceToBatchNDParams",
+//  cat  third_party/tensorflow/lite/c/builtin_op_data.h | grep "^} TfLite" |
+//  sed 's/^} \(TfLite.*\)Params;/\1Params/g' | grep -v "^}" | sed
+//  's/\(.*\)/"\1",/g' | sort
+static const char* param_structs[] = {"TfLiteAddParams",
+                                      "TfLiteArgMaxParams",
+                                      "TfLiteArgMinParams",
                                       "TfLiteBatchToSpaceNDParams",
-                                      "TfLiteMulParams",
-                                      "TfLiteSubParams",
+                                      "TfLiteBidirectionalSequenceLSTMParams",
+                                      "TfLiteBidirectionalSequenceRNNParams",
+                                      "TfLiteCastParams",
+                                      "TfLiteConcatenationParams",
+                                      "TfLiteConvParams",
+                                      "TfLiteDepthwiseConvParams",
                                       "TfLiteDivParams",
+                                      "TfLiteEmbeddingLookupSparseParams",
+                                      "TfLiteFakeQuantParams",
+                                      "TfLiteFullyConnectedParams",
+                                      "TfLiteGatherParams",
                                       "TfLiteL2NormParams",
+                                      "TfLiteLeakyReluParams",
                                       "TfLiteLocalResponseNormParams",
+                                      "TfLiteLSHProjectionParams",
                                       "TfLiteLSTMParams",
-                                      "TfLiteResizeBilinearParams",
-                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteMulParams",
+                                      "TfLiteOneHotParams",
+                                      "TfLitePackParams",
                                       "TfLitePadParams",
                                       "TfLitePadV2Params",
+                                      "TfLitePoolParams",
+                                      "TfLiteReducerParams",
                                       "TfLiteReshapeParams",
+                                      "TfLiteResizeBilinearParams",
+                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteRNNParams",
+                                      "TfLiteSequenceRNNParams",
+                                      "TfLiteShapeParams",
                                       "TfLiteSkipGramParams",
+                                      "TfLiteSoftmaxParams",
+                                      "TfLiteSpaceToBatchNDParams",
                                       "TfLiteSpaceToDepthParams",
-                                      "TfLiteCastParams",
-                                      "TfLiteEmbeddingLookupSparseParams",
-                                      "TfLiteGatherParams",
-                                      "TfLiteTransposeParams",
-                                      "TfLiteReducerParams",
+                                      "TfLiteSparseToDenseParams",
                                       "TfLiteSplitParams",
                                       "TfLiteSplitVParams",
                                       "TfLiteSqueezeParams",
                                       "TfLiteStridedSliceParams",
-                                      "TfLiteArgMaxParams",
-                                      "TfLiteArgMinParams",
+                                      "TfLiteSubParams",
+                                      "TfLiteSVDFParams",
                                       "TfLiteTransposeConvParams",
-                                      "TfLiteSparseToDenseParams",
-                                      "TfLiteShapeParams",
-                                      "TfLiteFakeQuantParams",
-                                      "TfLitePackParams",
-                                      "TfLiteOneHotParams",
-                                      "TfLiteLeakyReluParams",
-                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteTransposeParams",
+                                      "TfLiteUnidirectionalSequenceLSTMParams",
                                       "TfLiteUniqueParams",
+                                      "TfLiteUnpackParams",
+                                      "TfLiteReverseSequenceParams",
                                       nullptr};
 }  // namespace
 
@@ -142,7 +148,6 @@ class OpOptionData {
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_MIN"] = "ReducerOptions";
     op_to_option_["REDUCE_ANY"] = "ReducerOptions";
-    op_to_option_["UNPACK"] = "";
     op_to_option_["SUM"] = "ReducerOptions";
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_PROD"] = "ReducerOptions";
@@ -151,33 +156,32 @@ class OpOptionData {
     op_to_option_["AVERAGE_POOL_2D"] = "Pool2DOptions";
     op_to_option_["MAX_POOL_2D"] = "Pool2DOptions";
     op_to_option_["L2_NORMALIZATION"] = "L2NormOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
-    op_to_option_["UNIQUE"] = "";      // TODO(karimnosseir): UniqueOptions.
-    // Manually specified mappings between ops and options (none)
-    op_to_option_["EMBEDDING_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["MAXIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["MINIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
+    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
+
+    // Manually specified mappings between ops to "none" options -- these are
+    // ops without a corresponding Options message in schema as yet. If these
+    // options do get assigned an Options message in future, they need to be
+    // updated here as well.
+    op_to_option_["EMBEDDING_LOOKUP"] = "";
     op_to_option_["FLOOR"] = "";
-    op_to_option_["HASHTABLE_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["CEIL"] = "";
+    op_to_option_["HASHTABLE_LOOKUP"] = "";
     op_to_option_["LOGISTIC"] = "";
     op_to_option_["RELU"] = "";
     op_to_option_["RELU_N1_TO_1"] = "";
     op_to_option_["RELU6"] = "";
     op_to_option_["TANH"] = "";
-    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
-    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
     op_to_option_["PRELU"] = "";
-    op_to_option_["MAXIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
-    op_to_option_["MINIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
     op_to_option_["SIN"] = "";
     op_to_option_["LOG"] = "";
     op_to_option_["SQRT"] = "";
     op_to_option_["RSQRT"] = "";
+    op_to_option_["ELU"] = "";
+    op_to_option_["REVERSE_SEQUENCE"] = "";
 
     // TODO(aselle): These are undesirable hacks. Consider changing C structs
     option_to_struct_["Pool2DOptions"] = "TfLitePoolParams";
@@ -185,6 +189,7 @@ class OpOptionData {
     option_to_struct_["DepthwiseConv2DOptions"] = "TfLiteDepthwiseConvParams";
     option_to_struct_["LocalResponseNormalizationOptions"] =
         "TfLiteLocalResponseNormParams";
+    option_to_struct_["MirrorPadOptions"] = "TfLiteMirrorPaddingParams";
     // Now for every op, try to find an option.
     bool fatal = false;
     for (auto op_name : ops_) {
@@ -224,13 +229,15 @@ class OpOptionData {
           if (!param_struct_found) {
             std::cerr << "Failed to get param struct for option " << option_name
                       << std::endl;
-            fatal = true;
           } else {
             option_to_struct_.insert(std::make_pair(option_name, params_guess));
           }
         }
       }
     }
+    if (fatal) {
+      exit(1);
+    }
   }
 
  private:
@@ -241,16 +248,28 @@ class OpOptionData {
       option_to_type_function_;
 };
 
+void GenerateImportForResizeBilinearOp(FILE* fp) {
+  fprintf(fp,
+          "  case BuiltinOperator_RESIZE_BILINEAR:  {\n"
+          "    const auto* params = reinterpret_cast<const "
+          "TfLiteResizeBilinearParams*>(builtin_op_data);\n"
+          "    auto union_type = CreateResizeBilinearOptions(*fbb, "
+          "params->align_corners).Union();\n"
+          "    return std::make_pair(BuiltinOptions_ResizeBilinearOptions, "
+          "union_type);\n"
+          "  }\n  break;\n");
+}
+
 void GenerateImportForOp(FILE* fp, const std::string& op_name,
                          const std::string& option_name,
                          const std::string& option_type,
                          const flatbuffers::TypeTable* options,
                          const std::string& struct_name) {
-  // Skip tricky ones for now
-  if (struct_name == "TfLiteResizeBilinearParams") return;
-  if (struct_name == "TfLiteSqueezeParams") return;
-  if (struct_name == "TfLiteEmbeddingLookupSparseParams") return;
-  if (struct_name == "TfLiteReshapeParams") return;
+  // Special-case ResizeBilinear which has some deprecated fields.
+  if (struct_name == "TfLiteResizeBilinearParams") {
+    GenerateImportForResizeBilinearOp(fp);
+    return;
+  }
 
   fprintf(fp, "  case BuiltinOperator_%s:  {\n", op_name.c_str());
   fprintf(fp,
@@ -260,6 +279,9 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
 
   for (size_t i = 0; i < options->num_elems; i++) {
     std::string elem_name = options->names[i];
+    bool is_int_vector = false;
+    std::string vector_name = elem_name;
+    std::string vector_size;
     // TODO(aselle): Irregular naming in builtins
     if (elem_name == "fused_activation_function")
       elem_name = "activation";
@@ -271,8 +293,26 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       elem_name = "dilation_height_factor";
     else if (elem_name == "dilation_w_factor")
       elem_name = "dilation_width_factor";
-    else if (elem_name == "new_shape")
-      elem_name = "shape";
+    else if (elem_name == "idx_out_type")
+      elem_name = "index_out_type";
+
+    // Vector fields treated specially.
+    if (elem_name == "new_shape") {
+      is_int_vector = true;
+      vector_name = "shape";
+      vector_size = "num_dimensions";
+    } else if (elem_name == "squeeze_dims") {
+      is_int_vector = true;
+      vector_size = "num_squeeze_dims";
+    }
+
+    if (is_int_vector) {
+      fprintf(fp,
+              "    auto val%zu = fbb->CreateVector("
+              "std::vector<int>(params->%s, params->%s + params->%s));\n",
+              i, vector_name.c_str(), vector_name.c_str(), vector_size.c_str());
+      continue;
+    }
 
     flatbuffers::TypeCode code = options->type_codes[i];
     auto contained_type = code.sequence_ref != -1
@@ -291,6 +331,10 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       mapper = "LSTMKernelTypeToSchema";
     } else if (contained_type == LSHProjectionTypeTypeTable) {
       mapper = "LSHProjectionTypeToSchema";
+    } else if (contained_type == MirrorPadModeTypeTable) {
+      mapper = "MirrorPaddingModeToSchema";
+    } else if (contained_type == CombinerTypeTypeTable) {
+      mapper = "CombinerTypeToSchema";
     }
 
     fprintf(fp,
diff --git a/tensorflow/lite/experimental/writer/writer_lib.cc b/tensorflow/lite/experimental/writer/writer_lib.cc
index a0ce4b716d62c5a24342f5a3863e58eb203f7441..2bdc41bae84341949631f77a1be8631b007f2985 100644
--- a/tensorflow/lite/experimental/writer/writer_lib.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib.cc
@@ -219,6 +219,11 @@ std::vector<int> InterpreterWriter::RemapTensorIndicesToWritten(
   std::vector<int> output;
   output.reserve(input.size());
   for (int x : input) {
+    // Special value representing an optional tensor which is not present.
+    if (x == -1) {
+      output.push_back(x);
+      continue;
+    }
     if (tensor_to_written_tensor_[x] != -1) {
       output.push_back(tensor_to_written_tensor_[x]);
     }
diff --git a/tensorflow/lite/experimental/writer/writer_test.cc b/tensorflow/lite/experimental/writer/writer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc9b25db33094ed06e6ab464b8b9dcf5209f1488
--- /dev/null
+++ b/tensorflow/lite/experimental/writer/writer_test.cc
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Loads the input tflite file into interpreter, serializes it back to a tflite
+// buffer, and then verifies that the generated output can be loaded back into
+// an interpreter and the model prepared (i.e., AllocateTensors returns ok).
+//
+// Usage:
+//   writer_test <input tflite>
+
+#include <iostream>
+
+#include "tensorflow/lite/experimental/writer/writer_lib.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+
+int main(int argc, char* argv[]) {
+  if (argc != 2) {
+    fprintf(stderr, "Usage: %s input_file\n", argv[0]);
+    return 1;
+  }
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromFile(argv[1]);
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  tflite::ops::builtin::BuiltinOpResolver builtin_op_resolver;
+  tflite::InterpreterBuilder(*model, builtin_op_resolver)(&interpreter);
+  tflite::InterpreterWriter writer(interpreter.get());
+  std::unique_ptr<uint8_t[]> output_buffer;
+  size_t output_buffer_size;
+  writer.GetBuffer(&output_buffer, &output_buffer_size);
+
+  // Verify the generated model.
+  model = tflite::FlatBufferModel::BuildFromBuffer(
+      reinterpret_cast<char*>(output_buffer.get()), output_buffer_size);
+  tflite::InterpreterBuilder(*model, builtin_op_resolver)(&interpreter);
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    fprintf(stderr, "AllocateTensors failed on the round-tripped model.\n");
+    return 1;
+  }
+  return 0;
+}
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index 9c48e1e54d153b9ff043e43f75f25cc36398bc60..82e9de3d26847494e343e4a892a4159c612d0957 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -1,116 +1,124 @@
 upper_tabs:
-# Tabs left of dropdown menu
-- include: /_upper_tabs_left.yaml
-- include: /api_docs/_upper_tabs_api.yaml
-# Dropdown menu
-- name: Resources
-  path: /resources
+- name: "Install"
+  lower_tabs:
+    guides:
+    - include: /install/_toc.yaml
+
+- name: "Learn"
+  path: /learn/
   is_default: true
   menu:
-  - include: /resources/_menu_toc.yaml
+  - include: /learn/_menu_toc.yaml
   lower_tabs:
     # Subsite tabs
     other:
-    - name: Guide
+    - name: "Guide"
       contents:
-      - title: Overview
-        path: /lite/overview
-      - title: Developer guide
-        path: /lite/devguide
-      - title: Android demo app
-        path: /lite/demo_android
-      - title: iOS demo app
-        path: /lite/demo_ios
-      - break: true
-      - title: TensorFlow Lite APIs
-        path: /lite/apis
-      - title: Custom operators
-        path: /lite/custom_operators
-      - title: TensorFlow Lite ops versioning
-        path: /lite/ops_versioning
-      - title: TensorFlow Lite compatibility guide
-        path: /lite/tf_ops_compatibility
-      - title: List of hosted models
-        path: /lite/models
-      - title: TensorFlow Lite for iOS
-        path: /lite/ios
-      - title: TensorFlow Lite for Raspberry Pi
-        path: /lite/rpi
+      - title: "TensorFlow Lite guide"
+        path: /lite/guide
+
+      - heading: "Get started"
+      - title: "Overview"
+        path: /lite/guide/get_started
+      - title: "Android quickstart"
+        path: /lite/guide/android
+      - title: "iOS quickstart"
+        path: /lite/guide/ios
+      - title: "FAQ"
+        path: /lite/guide/faq
+      - title: "Roadmap"
+        path: /lite/guide/roadmap
 
-      - heading: TF Lite converter
-      - title: Overview
+      - heading: "Convert a model"
+      - title: "TensorFlow Lite converter"
         path: /lite/convert/
-      - title: Python API guide
-        path: /lite/convert/python_api
-      - title: Command line examples
+      - title: "Command line examples"
         path: /lite/convert/cmdline_examples
-      - title: Command line reference
+      - title: "Command line reference"
         path: /lite/convert/cmdline_reference
+      - title: "Python API"
+        path: /lite/convert/python_api
+
+      - heading: "Inference"
+      - title: "Overview"
+        path: /lite/guide/inference
+      - title: "Custom operators"
+        path: /lite/guide/ops_custom
+      - title: "Operator versions"
+        path: /lite/guide/ops_version
+      - title: "Operator compatibility"
+        path: /lite/guide/ops_compatibility
+      - title: "Select operators from TensorFlow"
+        path: /lite/guide/ops_select
+      - title: "List of hosted models"
+        path: /lite/guide/hosted_models
 
-      - heading: Performance
-      - title: Best practices
+      - heading: "Performance"
+      - title: "Best practices"
         path: /lite/performance/best_practices
-      - title: Benchmarks
+      - title: "Benchmarks"
         path: /lite/performance/benchmarks
-      - title: Model optimization
+      - title: "Model optimization"
         path: /lite/performance/model_optimization
-      - title: Post-training quantization
+      - title: "Post-training quantization"
         path: /lite/performance/post_training_quantization
-      - title: Post-training quantization example
+      - title: "Post-training quantization example"
         path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb
         status: external
-      - title: GPU delegate
+      - title: "Delegates"
+        path: /lite/performance/delegates
+      - title: "GPU delegate"
         path: /lite/performance/gpu
-      - title: Advanced GPU
+      - title: "Advanced GPU"
         path: /lite/performance/gpu_advanced
 
-      - title: TF Mobile
-        style: accordion
-        status: deprecated
-        section:
-        - title: Overview
-          path: /lite/tfmobile/
-        - title: Building TensorFlow on Android
-          path: /lite/tfmobile/android_build
-        - title: Building TensorFlow on IOS
-          path: /lite/tfmobile/ios_build
-        - title: Integrating TensorFlow libraries
-          path: /lite/tfmobile/linking_libs
-        - title: Preparing models for mobile deployment
-          path: /lite/tfmobile/prepare_models
-        - title: Optimizing for mobile
-          path: /lite/tfmobile/optimizing
+      - heading: "Build TensorFlow Lite"
+      - title: "Build for iOS"
+        path: /lite/guide/build_ios
+      - title: "Build for ARM64"
+        path: /lite/guide/build_arm64
+      - title: "Build for Raspberry Pi"
+        path: /lite/guide/build_rpi
 
-    # - name: Models
-    #   contents:
-    #   - title: Overview
-    #     path: /lite/models/
-    #   - heading: Beginner
-    #     style: divider
-    #   - title: Image labeling
-    #     section:
-    #     - title: Overview
-    #       path: /lite/models/image/label/overview
-    #     - title: Android
-    #       path: /lite/models/image/label/android
-    #     - title: iOS
-    #       path: /lite/models/image/label/ios
-    #   - heading: Advanced
-    #     style: divider
-    #   - heading: Image
-    #   - title: Image classification
-    #     path: /lite/models/image/classification/
-    #   - heading: Audio
-    #   - title: Hot word detection
-    #     path: /lite/models/audio/hot_word/
-    #   - heading: Text
-    #   - title: Text classification
-    #     path: /lite/models/text/classification/
+      - heading: "Microcontroller"
+      - title: "Overview"
+        path: /lite/guide/microcontroller
+      - title: "TensorFlow Codelab"
+        path: https://g.co/codelabs/sparkfunTF
+        status: external
+
+    - name: "Examples"
+      contents:
+      - title: "Examples"
+        path: /lite/examples
 
-    - name: API
+    - name: "Models"
+      contents:
+      - title: "Overview"
+        path: /lite/models/
+      - heading: "Image classification"
+      - title: "Overview"
+        path: /lite/models/image_classification/overview
+      - title: "Android"
+        path: /lite/models/image_classification/android
+      - title: "iOS"
+        path: /lite/models/image_classification/ios
+      - heading: "Other techniques"
+      - title: "Object detection"
+        path: /lite/models/object_detection/overview
+      - title: "Pose estimation"
+        path: /lite/models/pose_estimation/overview
+      - title: "Segmentation"
+        path: /lite/models/segmentation/overview
+      - title: "Smart reply"
+        path: /lite/models/smart_reply/overview
+
+    - name: "API"
       skip_translation: true
       contents:
-      - title: API
+      - title: "API"
         path: /api_docs/python/tf/lite
 
+- include: /api_docs/_upper_tabs_api.yaml
+- include: /resources/_upper_tabs_resources.yaml
 - include: /_upper_tabs_right.yaml
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
deleted file mode 100644
index 7153b7c6f670375df8183a9269bb7eaf096ac0c2..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/_index.yaml
+++ /dev/null
@@ -1,202 +0,0 @@
-project_path: /lite/_project.yaml
-book_path: /lite/_book.yaml
-description: <!--no description-->
-landing_page:
-  custom_css_path: /site-assets/css/style.css
-  rows:
-  - heading: TensorFlow Lite is for mobile and embedded devices
-    description: >
-      <p style="max-width: 75%;">
-        TensorFlow Lite is the official solution for running machine learning
-        models on mobile and embedded devices. It enables on&#8209;device machine
-        learning inference with low latency and a small binary size on Android,
-        iOS, and other operating systems.
-      </p>
-      <style>
-      .tfo-landing-row-heading h2 {
-        margin-top: 0 !important;
-      }
-      .tfo-landing-row-heading-list ol, .tfo-landing-row-heading-list ul {
-        margin-top: 0;
-      }
-      </style>
-
-  - classname: tfo-landing-row-heading tfo-landing-row-heading-list
-    heading: Many benefits
-    description: >
-      On-device ML inference is difficult because of the many constraints—TensorFlow Lite can solve these:
-    items:
-    - list:
-      - heading: Performance
-        description: >
-          TF Lite is fast with no noticeable accuracy loss—see the <a href="./performance">metrics</a>.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Portability
-        description: >
-          <a href="https://developer.android.com/ndk/guides/neuralnetworks/" class="external">Android</a>,
-          iOS, and more specialized IoT devices.
-        icon:
-          icon_name: lens
-          foreground: theme
-    - list:
-      - heading: Low latency
-        description: >
-          Optimized float- and fixed-point CPU kernels, op&#8209;fusing, and more.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Acceleration
-        description: >
-          Integration with GPU and internal/external accelerators.
-        icon:
-          icon_name: lens
-          foreground: theme
-    - list:
-      - heading: Small model size
-        description: >
-          Controlled dependencies, <a href="https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3" class="external">quantization</a>,
-          and op&nbsp;registration.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Tooling
-        description: >
-          Conversion, compression, benchmarking, power-consumption, and more.
-        icon:
-          icon_name: lens
-          foreground: theme
-
-  - heading: How it works
-    items:
-    - heading: Pick a model
-      icon:
-        icon_name: build
-      description: >
-        Pick a new model or retrain an existing one.
-      buttons:
-      - label: Pick
-        path: /lite/devguide#1_choose_a_model
-        classname: button button-primary tfo-button-primary
-    - heading: Convert
-      icon:
-        icon_name: autorenew
-      description: >
-        Convert a TensorFlow model into a compressed flat buffer with the
-        TensorFlow Lite Converter.
-      buttons:
-      - label: Convert
-        path: /lite/devguide#2_convert_the_model_format
-        classname: button button-primary tfo-button-primary
-    - heading: Deploy
-      icon:
-        icon_name: settings_cell
-      description: >
-        Take the compressed <code>.tflite</code> file and load it into a mobile or embedded device.
-      buttons:
-      - label: Deploy
-        path: /lite/devguide#3_use_the_tensorflow_lite_model_for_inference_in_a_mobile_app
-        classname: button button-primary tfo-button-primary
-    - heading: Optimize
-      icon:
-        icon_name: bolt
-      description: >
-        [optional] Quantize by converting 32-bit floats to more efficient 8-bit integers or run on GPU.
-      buttons:
-      - label: Optimize
-        path: /lite/devguide#4_optimize_your_model_optional
-        classname: button button-primary tfo-button-primary
-
-  - heading: Build your first TensorFlow Lite app with Codelabs
-    background: grey
-    items:
-    - classname: tfo-landing-row-item-inset-white
-      heading: Get started
-      description: >
-        <ul>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/" class="external">TensorFlow for Poets</a></li>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/" class="external">TensorFlow for Poets 2: Android</a></li>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-ios/" class="external">TensorFlow for Poets 2: iOS </a></li>
-          <li>Intermediate: <a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193" class="external">Object detection tutorial</a>
-        </ul>
-    - classname: tfo-landing-row-item-inset-white
-      heading: Share your TensorFlow Lite story
-      description: >
-        We love to hear what you're working on—it may even get highlighted on
-        our social media! <a href="https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss" class="external">Tell us</a>.
-
-  - classname: devsite-landing-row-logos tfo-landing-row-heading
-    heading: TensorFlow Lite users
-    items:
-    - custom_image:
-        path: ./images/landing-page/photos_logo.png
-    - custom_image:
-        path: ./images/landing-page/gboard_logo.png
-    - custom_image:
-        path: ./images/landing-page/gmail_logo.png
-    - custom_image:
-        path: ./images/landing-page/assistant_logo.png
-
-  - classname: devsite-landing-row-logos
-    items:
-    - custom_image:
-        path: ./images/landing-page/vsco_logo.png
-    - custom_image:
-        path: ./images/landing-page/shazam_logo.png
-    - custom_image:
-        path: ./images/landing-page/nest_logo.png
-    - custom_image:
-        path: ./images/landing-page/loseit_logo.png
-
-
-  - classname: devsite-landing-row-cards
-    background: grey
-    heading: Updates
-    items:
-    - heading: "TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)"
-      image_path: ./images/landing-page/facial_contour_detection.png
-      path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
-      buttons:
-      - label: Read more
-        path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
-    - heading: "AI in motion: react in the real world"
-      image_path: ./images/landing-page/ai_in_motion.png
-      path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
-      buttons:
-      - label: Read more
-        path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
-    - heading: "Introducing the Model Optimization Toolkit"
-      image_path: /resources/images/tf-logo-card-16x9.png
-      path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
-      buttons:
-      - label: Read on TensorFlow blog
-        path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
-    - heading: "East Africa Cassava App"
-      image_path: ./images/landing-page/detect_crop_disease_in_africa.png
-      path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
-      buttons:
-      - label: Read more
-        path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
-
-  - classname: devsite-landing-row-cards
-    background: grey
-    items:
-    - heading: "Using TensorFlow Lite on Android"
-      image_path: /resources/images/tf-logo-card-16x9.png
-      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
-      buttons:
-      - label: Read on TensorFlow blog
-        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
-    - heading: "TensorFlow Lite at the Dev Summit"
-      youtube_id: FAMfy7izB6A
-      buttons:
-      - label: Watch the video
-        path: https://www.youtube.com/watch?v=FAMfy7izB6A
-    - heading: "TensorFlow Lite on GitHub"
-      image_path: /resources/images/github-card-16x9.png
-      path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
-      buttons:
-      - label: View on GitHub
-        path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
-    - classname: devsite-landing-row-item-hidden
diff --git a/tensorflow/lite/g3doc/_project.yaml b/tensorflow/lite/g3doc/_project.yaml
index 3ce698639647d9e105b6748512314aeca148b0a0..768edeba0a12ab100da82110cd96301e8cd76b98 100644
--- a/tensorflow/lite/g3doc/_project.yaml
+++ b/tensorflow/lite/g3doc/_project.yaml
@@ -1,5 +1,5 @@
 name: TensorFlow Lite
-breadcrumb_name: TensorFlow Lite
+breadcrumb_name: For Mobile & IoT
 home_url: /lite/
 parent_project_metadata_path: /_project.yaml
 description: >
@@ -8,3 +8,4 @@ use_site_branding: true
 hide_from_products_list: true
 content_license: cc3-apache2
 buganizer_id: 316308
+include: /_project_included.yaml
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
index 169f2d91d8a72278ff61f170f0b450885e4c2c93..139a6c9b521c060d058c5243776dcd03896ce1de 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_examples.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -1,4 +1,4 @@
-# Converter command-line examples
+# Converter command line examples
 
 This page shows how to use the TensorFlow Lite Converter in the command line.
 
@@ -68,9 +68,9 @@ tflite_convert \
 has fewer required flags than frozen graphs due to access to additional data
 contained within the SavedModel. The values for `--input_arrays` and
 `--output_arrays` are an aggregated, alphabetized list of the inputs and outputs
-in the [SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within
+in the [SignatureDefs](../../serving/signature_defs.md) within
 the
-[MetaGraphDef](https://www.tensorflow.org/guide/saved_model#apis_to_build_and_load_a_savedmodel)
+[MetaGraphDef](https://www.tensorflow.org/saved_model#apis_to_build_and_load_a_savedmodel)
 specified by `--saved_model_tag_set`. As with the GraphDef, the value for
 `input_shapes` is automatically determined whenever possible.
 
diff --git a/tensorflow/lite/g3doc/convert/cmdline_reference.md b/tensorflow/lite/g3doc/convert/cmdline_reference.md
index d72a46760d48dae46d63f1e914d8afda3f527e27..609ab3fdedeb92979e6fc93da60f9ed461819f7a 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_reference.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_reference.md
@@ -1,4 +1,4 @@
-# Converter command-line reference
+# Converter command line reference
 
 This page is complete reference of command-line flags used by the TensorFlow
 Lite Converter's command line starting from TensorFlow 1.9 up until the most
@@ -38,7 +38,7 @@ The following flags specify optional parameters when using SavedModels.
     Specifies a comma-separated set of tags identifying the MetaGraphDef within
     the SavedModel to analyze. All tags in the tag set must be specified.
 *   `--saved_model_signature_key`. Type: string. Default:
-    [DEFAULT_SERVING_SIGNATURE_DEF_KEY](https://www.tensorflow.org/api_docs/python/tf/saved_model/signature_constants).
+    `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`.
     Specifies the key identifying the SignatureDef containing inputs and
     outputs.
 
diff --git a/tensorflow/lite/g3doc/convert/index.md b/tensorflow/lite/g3doc/convert/index.md
index 60fa265c295174453b1a910f5279807dd0be32cb..fac658aa6c4d9f469063cacb8ed843df9766679c 100644
--- a/tensorflow/lite/g3doc/convert/index.md
+++ b/tensorflow/lite/g3doc/convert/index.md
@@ -1,25 +1,37 @@
-# TensorFlow Lite Converter
+# TensorFlow Lite converter
 
-The TensorFlow Lite Converter takes a TensorFlow graph file and creates a graph
-file used by the TensorFlow Lite interpreter.
+TensorFlow Lite uses the optimized
+[FlatBuffer](https://google.github.io/flatbuffers/) format to represent graphs.
+Therefore, a TensorFlow model
+([protocol buffer](https://developers.google.com/protocol-buffers/)) needs to be
+converted into a `FlatBuffer` file before deploying to clients.
 
 ## From model training to device deployment
 
-After a TensorFlow model is trained, the TensorFlow Lite converter uses that
-model to generate a TensorFlow Lite
-[FlatBuffer](https://google.github.io/flatbuffers/) file (`.tflite`). The
-converter supports as input:
-[SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators),
-frozen graphs (models generated by
-[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)),
-and `tf.keras` HDF5 models. The TensorFlow Lite `FlatBuffer` file is deployed to
-a client device (generally a mobile or embedded device), and the TensorFlow Lite
-interpreter uses the compressed model for on-device inference. This conversion
-process is shown in the diagram below:
+The TensorFlow Lite converter generates a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file (`.tflite`) from a
+TensorFlow model.
+
+The converter supports the following input formats:
+
+*   [SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators)
+*   Frozen `GraphDef`: Models generated by
+    [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py).
+*   `tf.keras` HDF5 models.
+*   Any model taken from a `tf.Session` (Python API only).
+
+The TensorFlow Lite `FlatBuffer` file is then deployed to a client device
+(generally a mobile or embedded device), and the TensorFlow Lite interpreter
+uses the compressed model for on-device inference. This conversion process is
+shown in the diagram below:
 
 ![TFLite converter workflow](../images/convert/workflow.svg)
 
-The TensorFlow Lite Converter can be used either from [Python](python_api.md) or
-from the [command line](cmdline_examples.md). This allows you to integrate the
-conversion step into the model design workflow, ensuring the model is easy to
-convert to a mobile inference graph.
+## Options
+
+The TensorFlow Lite Converter can be used from either of these two options:
+
+*   [Python](python_api.md) (**Preferred**): Using the Python API makes it
+    easier to convert models as part of a model development pipeline, and helps
+    mitigate [compatibility](../tf_ops_compatibility.md) issues early on.
+*   [Command line](cmdline_examples.md)
diff --git a/tensorflow/lite/g3doc/convert/quantization.md b/tensorflow/lite/g3doc/convert/quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..895f3e637e7527d35e656d9ab137dacc2a3f2709
--- /dev/null
+++ b/tensorflow/lite/g3doc/convert/quantization.md
@@ -0,0 +1,47 @@
+# Converting Quantized Models
+
+This page provides information for how to convert quantized TensorFlow Lite
+models. For more details, please see the
+[model optimization](../performance/model_optimization.md).
+
+# Post-training: Quantizing models for CPU model size
+
+The simplest way to create a small model is to quantize the weights to 8 bits
+and quantize the inputs/activations "on-the-fly", during inference. This
+has latency benefits, but prioritizes size reduction.
+
+During conversion, set the `optimizations` flag to optimize for size:
+
+```
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
+tflite_quant_model = converter.convert()
+```
+
+# During training: Quantizing models for integer-only execution.
+
+Quantizing models for integer-only execution gets a model with even faster
+latency, smaller size, and integer-only accelerators compatible model.
+Currently, this requires training a model with
+["fake-quantization" nodes](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize).
+
+Convert the graph:
+
+```
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
+input_arrays = converter.get_input_arrays()
+converter.quantized_input_stats = {input_arrays[0] : (0., 1.)}  # mean, std_dev
+tflite_model = converter.convert()
+```
+
+For fully integer models, the inputs are uint8. The `mean` and `std_dev values`
+specify how those uint8 values map to the float input values used while training
+the model.
+
+`mean` is the integer value from 0 to 255 that maps to floating point 0.0f.
+`std_dev` is 255 / (float_max - float_min)
+
+For most users, we recommend using post-training quantization. We are working on
+new tools for post-training and during training quantization that we hope will
+simplify generating quantized models.
diff --git a/tensorflow/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
deleted file mode 100644
index f4b481dc6192db703dea4161ed28e2fd63812ebf..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/demo_ios.md
+++ /dev/null
@@ -1,76 +0,0 @@
-
-# iOS Demo App
-
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model. These
-instructions walk you through building and running the demo on an iOS device.
-
-## Prerequisites
-
-*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
-    a valid Apple Developer ID, and have an iOS device set up and linked to your
-    developer account with all of the appropriate certificates. For these
-    instructions, we assume that you have already been able to build and deploy
-    an app to an iOS device with your current developer environment.
-
-*   The demo app requires a camera and must be executed on a real iOS device.
-    You can build it and run with the iPhone Simulator but it won't have any
-    camera information to classify.
-
-*   You don't need to build the entire TensorFlow library to run the demo, but
-    you will need to clone the TensorFlow repository if you haven't already:
-
-        git clone https://github.com/tensorflow/tensorflow
-        cd tensorflow
-
-*   You'll also need the Xcode command-line tools:
-
-        xcode-select --install
-
-    If this is a new install, you will need to run the Xcode application once to
-    agree to the license before continuing.
-
-## Building the iOS Demo App
-
-1.  Install CocoaPods if you don't have it:
-
-        sudo gem install cocoapods
-
-2.  Download the model files used by the demo app (this is done from inside the
-    cloned directory):
-
-        sh tensorflow/lite/examples/ios/download_models.sh
-
-3.  Install the pod to generate the workspace file:
-
-        cd tensorflow/lite/examples/ios/camera
-        pod install
-
-    If you have installed this pod before and that command doesn't work, try
-
-        pod repo update
-
-    At the end of this step you should have a file called
-    `tflite_camera_example.xcworkspace`.
-
-4.  Open the project in Xcode by typing this on the command line:
-
-        open tflite_camera_example.xcworkspace
-
-    This launches Xcode if it isn't open already and opens the
-    `tflite_camera_example` project.
-
-5.  Under `Project navigator -> tflite_camera_example -> Targets ->
-    tflite_camera_example -> General` change the bundle identifier by
-    pre-pending your name:
-
-    ![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
-
-6.  Build and run the app in Xcode.
-
-    Note that as mentioned earlier, you must already have a device set up and
-    linked to your Apple Developer account in order to deploy the app on a
-    device.
-
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
diff --git a/tensorflow/lite/g3doc/demo_android.md b/tensorflow/lite/g3doc/guide/android.md
similarity index 99%
rename from tensorflow/lite/g3doc/demo_android.md
rename to tensorflow/lite/g3doc/guide/android.md
index 772598d5cfd36a388d253becd7fc3026f31375c9..4b2f38a5d32ffe30996d2149c81c74be70b7798a 100644
--- a/tensorflow/lite/g3doc/demo_android.md
+++ b/tensorflow/lite/g3doc/guide/android.md
@@ -1,5 +1,4 @@
-
-# Android Demo App
+# Android quickstart
 
 An example Android application using TensorFLow Lite is available
 [on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo).
diff --git a/tensorflow/lite/g3doc/guide/build_arm64.md b/tensorflow/lite/g3doc/guide/build_arm64.md
new file mode 100644
index 0000000000000000000000000000000000000000..0daa45abd02ea0e5f3a122a818c80ca16e03d796
--- /dev/null
+++ b/tensorflow/lite/g3doc/guide/build_arm64.md
@@ -0,0 +1,62 @@
+# Build TensorFlow Lite for ARM64 boards
+
+## Cross compiling
+
+### Installing the toolchain
+
+```bash
+sudo apt-get update
+sudo apt-get install crossbuild-essential-arm64
+```
+
+> If you are using Docker, you may not use `sudo`.
+
+### Building
+
+Clone this Tensorflow repository. Run this script at the root of the repository
+to download all the dependencies:
+
+> The Tensorflow repository is in `/tensorflow` if you are using
+> `tensorflow/tensorflow:nightly-devel` docker image, just try it.
+
+```bash
+./tensorflow/lite/tools/make/download_dependencies.sh
+```
+
+Note that you only need to do this once.
+
+Compile:
+
+```bash
+./tensorflow/lite/tools/make/build_aarch64_lib.sh
+```
+
+This should compile a static library in:
+`tensorflow/lite/gen/gen/aarch64_armv8-a/lib/libtensorflow-lite.a`.
+
+## Native compiling
+
+These steps were tested on HardKernel Odroid C2, gcc version 5.4.0.
+
+Log in to your board, install the toolchain.
+
+```bash
+sudo apt-get install build-essential
+```
+
+First, clone the TensorFlow repository. Run this at the root of the repository:
+
+```bash
+./tensorflow/lite/tools/make/download_dependencies.sh
+```
+
+Note that you only need to do this once.
+
+Compile:
+
+```bash
+./tensorflow/lite/tools/make/build_aarch64_lib.sh
+```
+
+This should compile a static library in:
+`tensorflow/lite/gen/gen/aarch64_armv8-a/lib/libtensorflow-lite.a`.
diff --git a/tensorflow/lite/g3doc/ios.md b/tensorflow/lite/g3doc/guide/build_ios.md
similarity index 98%
rename from tensorflow/lite/g3doc/ios.md
rename to tensorflow/lite/g3doc/guide/build_ios.md
index c195b6abf4f76f88d1f60b192cd19165aefe9a11..40f2ac2fdfdac4ef7c9fb958bb125afca51ab148 100644
--- a/tensorflow/lite/g3doc/ios.md
+++ b/tensorflow/lite/g3doc/guide/build_ios.md
@@ -3,7 +3,7 @@
 
 This document describes how to build TensorFlow Lite iOS library. If you just
 want to use it, the easiest way is using the TensorFlow Lite CocoaPod releases.
-See [TensorFlow Lite iOS Demo](demo_ios.md) for examples.
+See [TensorFlow Lite iOS Demo](ios.md) for examples.
 
 
 ## Building
diff --git a/tensorflow/lite/g3doc/rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md
similarity index 97%
rename from tensorflow/lite/g3doc/rpi.md
rename to tensorflow/lite/g3doc/guide/build_rpi.md
index 708d9e328cbdfffb491d487e4592d789b4fd06af..cb0cabc2d3e5d7ef3100a74e8dcb82db214f7287 100644
--- a/tensorflow/lite/g3doc/rpi.md
+++ b/tensorflow/lite/g3doc/guide/build_rpi.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite for Raspberry Pi
+# Build TensorFlow Lite for Raspberry Pi
 
 ## Cross compiling
 
diff --git a/tensorflow/lite/g3doc/guide/faq.md b/tensorflow/lite/g3doc/guide/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..a0e4d09ef1e6f3bb1214d6a7130f672e53f32396
--- /dev/null
+++ b/tensorflow/lite/g3doc/guide/faq.md
@@ -0,0 +1,135 @@
+# Frequently Asked Questions
+
+If you don't find an answer to your question here, please look through our
+detailed documentation for the topic or file a
+[GitHub issue](https://github.com/tensorflow/tensorflow/issues).
+
+## Model Conversion
+
+#### What formats are supported for conversion from TensorFlow to TensorFlow Lite?
+
+The TensorFlow Lite converter supports the following formats:
+
+*   SavedModels:
+    [TFLiteConverter.from_saved_model](../convert/python_api.md#exporting_a_savedmodel_)
+*   Frozen GraphDefs generated by
+    [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py):
+    [TFLiteConverter.from_frozen_graph](../convert/python_api.md#exporting_a_graphdef_from_file_)
+*   tf.keras HDF5 models:
+    [TFLiteConverter.from_keras_model_file](../convert/python_api.md#exporting_a_tfkeras_file_)
+*   tf.Session:
+    [TFLiteConverter.from_session](../convert/python_api.md#exporting_a_graphdef_from_tfsession_)
+
+The recommended approach is to integrate the
+[Python converter](../convert/python_api.md) into your model pipeline in order to
+detect compatibility issues early on.
+
+#### Why doesn't my model convert?
+
+Since the number of TensorFlow Lite operations is smaller than TensorFlow's,
+some inference models may not be able to convert. For unimplemented operations,
+take a look at the question on
+[missing operators](faq.md#why-are-some-operations-not-implemented-in-tensorflow-lite).
+Unsupported operators include embeddings and LSTM/RNNs. For conversion issues
+not related to missing operations, search our
+[GitHub issues](https://github.com/tensorflow/tensorflow/issues?q=label%3Acomp%3Alite+)
+or file a [new one](https://github.com/tensorflow/tensorflow/issues).
+
+#### How do I determine the inputs/outputs for GraphDef protocol buffer?
+
+The easiest way to inspect a graph from a `.pb` file is to use the
+[summarize_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md#inspecting-graphs)
+tool.
+
+If that approach yields an error, you can visualize the GraphDef with
+[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) and
+look for the inputs and outputs in the graph. To visualize a `.pb` file, use the
+[`import_pb_to_tensorboard.py`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/import_pb_to_tensorboard.py)
+script like below:
+
+```
+python import_pb_to_tensorboard.py --model_dir <model path> --log_dir <log dir path>
+```
+
+#### How do I inspect a `.tflite` file?
+
+TensorFlow Lite models can be visualized using the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py)
+script in our repository.
+
+*   [Clone the TensorFlow repository](https://www.tensorflow.org/install/source)
+*   Run the `visualize.py` script with bazel:
+
+```
+bazel run //tensorflow/lite/tools:visualize model.tflite visualized_model.html
+```
+
+## Models & Operations
+
+#### Why are some operations not implemented in TensorFlow Lite?
+
+In order to keep TensorFlow Lite lightweight, only certain operations were used
+in the converter. The [Compatibility Guide](ops_compatibility.md) provides a
+list of operations currently supported by TensorFlow Lite.
+
+If you don’t see a specific operation (or an equivalent) listed, it's likely
+that it has not been prioritized. The team tracks requests for new operations on
+GitHub [issue #21526](https://github.com/tensorflow/tensorflow/issues/21526).
+Leave a comment if your request hasn’t already been mentioned.
+
+In the meanwhile, you could try implementing a
+[custom operator](ops_custom.md) or using a different model that only
+contains supported operators. If binary size is not a constraint, try using
+TensorFlow Lite with [select TensorFlow ops](ops_select.md).
+
+#### How do I test that a TensorFlow Lite model behaves the same as the original TensorFlow model?
+
+The best way to test the behavior of a TensorFlow Lite model is to use our API
+with test data and compare the outputs to TensorFlow for the same inputs. Take a
+look at our [Python Interpreter example](../convert/python_api.md) that generates
+random data to feed to the interpreter.
+
+## Optimization
+
+#### How do I reduce the size of my converted TensorFlow Lite model?
+
+[Post-training quantization](../performance/post_training_quantization.md) can be
+used during conversion to TensorFlow Lite to reduce the size of the model.
+Post-training quantization quantizes weights to 8-bits of precision from
+floating-point and dequantizes them during runtime to perform floating point
+computations. However, note that this could have some accuracy implications.
+
+If retraining the model is an option, consider
+[Quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize).
+However, note that quantization-aware training is only available for a subset of
+convolutional neural network architectures.
+
+For a deeper understanding of different optimization methods, look at
+[Model optimization](../performance/model_optimization.md).
+
+#### How do I optimize TensorFlow Lite performance for my machine learning task?
+
+The high-level process to optimize TensorFlow Lite performance looks something
+like this:
+
+*   *Make sure that you have the right model for the task.* For image
+    classification, check out our [list of hosted models](hosted_models.md).
+*   *Tweak the number of threads.* Many TensorFlow Lite operators support
+    multi-threaded kernels. You can use `SetNumThreads()` in the
+    [C++ API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L345)
+    to do this. However, increasing threads results in performance variability
+    depending on the environment.
+*   *Use Hardware Accelerators.* TensorFlow Lite supports model acceleration for
+    specific hardware using delegates. For example, to use Android’s Neural
+    Networks API, call
+    [`UseNNAPI`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L343)
+    on the interpreter. Or take a look at our
+    [GPU delegate tutorial](../performance/gpu.md).
+*   *(Advanced) Profile Model.* The Tensorflow Lite
+    [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark)
+    has a built-in profiler that can show per-operator statistics. If you know
+    how you can optimize an operator’s performance for your specific platform,
+    you can implement a [custom operator](ops_custom.md).
+
+For a more in-depth discussion on how to optimize performance, take a look at
+[Best Practices](../performance/best_practices.md).
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/guide/get_started.md
similarity index 85%
rename from tensorflow/lite/g3doc/devguide.md
rename to tensorflow/lite/g3doc/guide/get_started.md
index cbad036407fabea9d49910e22b4c968470566211..daa551ff7d65a14010eea74076f93a8ecccb78f5 100644
--- a/tensorflow/lite/g3doc/devguide.md
+++ b/tensorflow/lite/g3doc/guide/get_started.md
@@ -1,4 +1,4 @@
-# TF Lite Developer Guide
+# Get started with TensorFlow Lite
 
 Using a TensorFlow Lite model in your mobile app requires multiple
 considerations: you must choose a pre-trained or custom model, convert the model
@@ -35,7 +35,7 @@ by suggesting contextually relevant messages. The model is built specifically fo
 memory constrained devices, such as watches and phones, and has been successfully
 used in Smart Replies on Android Wear. Currently, this model is Android-specific.
 
-These pre-trained models are [available for download](models.md).
+These pre-trained models are [available for download](hosted_models.md).
 
 ### Re-train Inception-V3 or MobileNet for a custom data set
 
@@ -54,7 +54,7 @@ both floating point and quantized inference.
 ### Train a custom model
 
 A developer may choose to train a custom model using Tensorflow (see the
-[TensorFlow tutorials](../tutorials/) for examples of building and training
+[TensorFlow tutorials](https://www.tensorflow.org/tutorials/) for examples of building and training
 models). If you have already written a model, the first step is to export this
 to a `tf.GraphDef` file. This is required because some formats do not store the
 model structure outside the code, and we must communicate with other parts of
@@ -63,24 +63,24 @@ the framework. See
 to create file for the custom model.
 
 TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to
-the [TensorFlow Lite & TensorFlow Compatibility Guide](tf_ops_compatibility.md)
+the [TensorFlow Lite & TensorFlow Compatibility Guide](ops_compatibility.md)
 for supported operators and their usage. This set of operators will continue to
 grow in future Tensorflow Lite releases.
 
 ## 2. Convert the model format
 
-The [TensorFlow Lite Converter](convert/index.md) accepts the following file
+The [TensorFlow Lite Converter](../convert/index.md) accepts the following file
 formats:
 
 *   `SavedModel` — A `GraphDef` and checkpoint with a signature that labels
     input and output arguments to a model. See the documentation for converting
-    SavedModels using [Python](convert/python_api.md#basic_savedmodel) or using
-    the [command line](convert/cmdline_examples.md#savedmodel).
+    SavedModels using [Python](../convert/python_api.md#basic_savedmodel) or using
+    the [command line](../convert/cmdline_examples.md#savedmodel).
 *   `tf.keras` - A HDF5 file containing a model with weights and input and
     output arguments generated by `tf.Keras`. See the documentation for
     converting HDF5 models using
-    [Python](convert/python_api.md#basic_keras_file) or using the
-    [command line](convert/cmdline_examples.md#keras).
+    [Python](../convert/python_api.md#basic_keras_file) or using the
+    [command line](../convert/cmdline_examples.md#keras).
 *   `frozen tf.GraphDef` — A subclass of `tf.GraphDef` that does not contain
     variables. A `GraphDef` can be converted to a `frozen GraphDef` by taking a
     checkpoint and a `GraphDef`, and converting each variable into a constant
@@ -92,7 +92,7 @@ formats:
 
 TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
 to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
-frozen. This process invovles several file formats including the `frozen
+frozen. This process involves several file formats including the `frozen
 GraphDef`:
 
 *   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
@@ -154,19 +154,19 @@ the arguments for specifying the output nodes for inference in the
 
 ### Full converter reference
 
-The [TensorFlow Lite Converter](convert/index.md) can be
-[Python](convert/python_api.md) or from the
-[command line](convert/cmdline_examples.md). This allows you to integrate the
+The [TensorFlow Lite Converter](../convert/index.md) can be
+[Python](../convert/python_api.md) or from the
+[command line](../convert/cmdline_examples.md). This allows you to integrate the
 conversion step into the model design workflow, ensuring the model is easy to
 convert to a mobile inference graph.
 
 ### Ops compatibility
 
-Refer to the [ops compatibility guide](tf_ops_compatibility.md) for
+Refer to the [ops compatibility guide](ops_compatibility.md) for
 troubleshooting help, and if that doesn't help, please
 [file an issue](https://github.com/tensorflow/tensorflow/issues).
 
-### Graph vizualization tool
+### Graph Visualization tool
 
 The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
 to visualize TensorFlow Lite models after conversion. To build the
@@ -195,15 +195,15 @@ The open source Android demo app uses the JNI interface and is available
 [on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/app).
 You can also download a
 [prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-See the <a href="./demo_android.md">Android demo</a> guide for details.
+See the <a href="./android.md">Android demo</a> guide for details.
 
-The <a href="./android_build.md">Android mobile</a> guide has instructions for
+The <a href="./android.md">Android mobile</a> guide has instructions for
 installing TensorFlow on Android and setting up `bazel` and Android Studio.
 
 ### iOS
 
 To integrate a TensorFlow model in an iOS app, see the
-[TensorFlow Lite for iOS](ios.md) guide and <a href="./demo_ios.md">iOS demo</a>
+[TensorFlow Lite for iOS](ios.md) guide and <a href="./ios.md">iOS demo</a>
 guide.
 
 #### Core ML support
@@ -215,11 +215,14 @@ trained Tensorflow models to the
 devices. To use the converter, refer to the
 [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
 
-### Raspberry Pi
+### ARM32 and ARM64 Linux
 
 Compile Tensorflow Lite for a Raspberry Pi by following the
-[RPi build instructions](rpi.md) This compiles a static library file (`.a`) used
-to build your app. There are plans for Python bindings and a demo app.
+[RPi build instructions](build_rpi.md) Compile Tensorflow Lite for a generic aarch64
+board such as Odroid C2, Pine64, NanoPi, and others by following the
+[ARM64 Linux build instructions](build_arm64.md) This compiles a static
+library file (`.a`) used to build your app. There are plans for Python bindings
+and a demo app.
 
 ## 4. Optimize your model (optional)
 
@@ -250,7 +253,9 @@ tflite_quantized_model=converter.convert()
 open(“quantized_model.tflite”, “wb”).write(tflite_quantized_model)
 ```
 
-Read the full documentation [here](performance/post_training_quantization) and see a tutorial [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
+Read the full documentation [here](../performance/post_training_quantization.md)
+and see a tutorial
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
 
 ### GPU
 Run on GPU GPUs are designed to have high throughput for massively
@@ -263,4 +268,4 @@ Another benefit with GPU inference is its power efficiency. GPUs carry out the
 computations in a very efficient and optimized manner, so that they consume less
 power and generate less heat than when the same task is run on CPUs.
 
-Read the tutorial [here](performance/gpu) and full documentation [here](performance/gpu_advanced).
+Read the tutorial [here](../performance/gpu.md) and full documentation [here](../performance/gpu_advanced.md).
diff --git a/tensorflow/lite/g3doc/models.md b/tensorflow/lite/g3doc/guide/hosted_models.md
similarity index 64%
rename from tensorflow/lite/g3doc/models.md
rename to tensorflow/lite/g3doc/guide/hosted_models.md
index 62b3f17c79aa3688011a1452da18e098008f414e..69f196782eac1de7eff1c326693f93fbcb601b94 100644
--- a/tensorflow/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/guide/hosted_models.md
@@ -1,64 +1,27 @@
+# Hosted models
 
-# List of Hosted Models
-
-# AutoML mobile image classification models (Float Models)
-
-Model Name          | Paper_Model_Files | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance^
-------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ---------------------:
-MnasNet_0.50_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.5_224_09_07_2018.tgz) | 8.5 Mb    | 68.03%          | 87.79%          | 37 ms
-MnasNet_0.75_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.75_224_09_07_2018.tgz) | 12 Mb     | 71.72%          | 90.17%          | 61 ms
-MnasNet_1.0_96| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_96_09_07_2018.tgz) | 17 Mb    | 62.33%          | 83.98%          | 23 ms
-MnasNet_1.0_128| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_128_09_07_2018.tgz) | 17 Mb    | 67.32%          | 87.70%          | 34 ms
-MnasNet_1.0_160| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_160_09_07_2018.tgz) | 17 Mb    | 70.63%          | 89.58%          | 51 ms
-MnasNet_1.0_192| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_192_09_07_2018.tgz) | 17 Mb    | 72.56%          | 90.76%          | 70 ms
-MnasNet_1.0_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_224_09_07_2018.tgz) | 17 Mb    | 74.08%          | 91.75%          | 93 ms
-MnasNet_1.3_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.3_224_09_07_2018.tgz) | 24 Mb     | 75.24%          | 92.55%          | 152 ms
-
-
-^ Performance numbers are generated on Pixel-1 using single thread large BIG core.
-
-
-## Image classification (Float Models)
-
-Model Name            | Paper_Model_Files^                                                                                                                                                                        | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance^^ | Tensorflow Performance
---------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | --------------------: | ---------------------:
-DenseNet              | [paper](https://arxiv.org/abs/1608.06993), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/densenet_2018_04_27.tgz)            | 43.6 Mb    | 64.2%          | 85.6%          | 894 ms                | 1262 ms
-SqueezeNet            | [paper](https://arxiv.org/abs/1602.07360), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz)          | 5.0 Mb     | 49.0%          | 72.9%          | 224 ms                | 255 ms
-NASNet mobile         | [paper](https://arxiv.org/abs/1707.07012), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz)       | 21.4 Mb    | 73.9%          | 91.5%          | 261 ms                | 389 ms
-NASNet large          | [paper](https://arxiv.org/abs/1707.07012), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_large_2018_04_27.tgz)        | 355.3 Mb   | 82.6%          | 96.1%          | 6697 ms               | 7940 ms
-ResNet_V2_101         | [paper](https://arxiv.org/abs/1603.05027), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/resnet_v2_101.tgz)                                   | 178.3 Mb   | 76.8%          | 93.6%          | 1880 ms               | 1970 ms
-Inception_V3          | [paper](http://arxiv.org/abs/1512.00567), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz)         | 95.3 Mb    | 77.9%          | 93.8%          | 1433 ms               | 1522 ms
-Inception_V4          | [paper](http://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz)         | 170.7 Mb   | 80.1%          | 95.1%          | 2986 ms               | 3139 ms
-Inception_ResNet_V2   | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz) | 121.0 Mb   | 77.5%          | 94.0%          | 2731 ms               | 2926 ms
-Mobilenet_V1_0.25_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz)                                       | 1.9 Mb     | 41.4%          | 66.2%          | 6.2 ms                | 13.0 ms
-Mobilenet_V1_0.25_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160.tgz)                                       | 1.9 Mb     | 45.4%          | 70.2%          | 8.6 ms                | 19.5 ms
-Mobilenet_V1_0.25_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192.tgz)                                       | 1.9 Mb     | 47.1%          | 72.0%          | 12.1 ms               | 27.8 ms
-Mobilenet_V1_0.25_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224.tgz)                                       | 1.9 Mb     | 49.7%          | 74.1%          | 16.2 ms               | 37.3 ms
-Mobilenet_V1_0.50_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128.tgz)                                        | 5.3 Mb     | 56.2%          | 79.3%          | 18.1 ms               | 29.9 ms
-Mobilenet_V1_0.50_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160.tgz)                                        | 5.3 Mb     | 59.0%          | 81.8%          | 26.8 ms               | 45.9 ms
-Mobilenet_V1_0.50_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192.tgz)                                        | 5.3 Mb     | 61.7%          | 83.5%          | 35.6 ms               | 65.3 ms
-Mobilenet_V1_0.50_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224.tgz)                                        | 5.3 Mb     | 63.2%          | 84.9%          | 47.6 ms               | 164.2 ms
-Mobilenet_V1_0.75_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128.tgz)                                       | 10.3 Mb    | 62.0%          | 83.8%          | 34.6 ms               | 48.7 ms
-Mobilenet_V1_0.75_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160.tgz)                                       | 10.3 Mb    | 65.2%          | 85.9%          | 51.3 ms               | 75.2 ms
-Mobilenet_V1_0.75_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192.tgz)                                       | 10.3 Mb    | 67.1%          | 87.2%          | 71.7 ms               | 107.0 ms
-Mobilenet_V1_0.75_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224.tgz)                                       | 10.3 Mb    | 68.3%          | 88.1%          | 95.7 ms               | 143.4 ms
-Mobilenet_V1_1.0_128  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128.tgz)                                        | 16.9 Mb    | 65.2%          | 85.7%          | 57.4 ms               | 76.8 ms
-Mobilenet_V1_1.0_160  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160.tgz)                                        | 16.9 Mb    | 68.0%          | 87.7%          | 86.0 ms               | 117.7 ms
-Mobilenet_V1_1.0_192  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192.tgz)                                        | 16.9 Mb    | 69.9%          | 89.1%          | 118.6 ms              | 167.3 ms
-Mobilenet_V1_1.0_224  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz)                                        | 16.9 Mb    | 71.0%          | 89.9%          | 160.1 ms              | 224.3 ms
-Mobilenet_V2_1.0_224  | [paper](https://arxiv.org/pdf/1801.04381.pdf), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz)                                                | 14.0 Mb    | 71.8%          | 90.6%          | 117 ms                |
-
-^ The model files include both TF Lite FlatBuffer and Tensorflow frozen Graph.
-
-^^ The performance numbers are generated in the benchmark on Pixel-2 using
-single thread large core.
-
-^^ Accuracy numbers were computed using the
-[TFLite accuracy tool](../tools/accuracy/ilsvrc) .
-
-## Image classification (Quantized Models)
-
-Model Name                  | Paper_Model_Files                                                                                                                                         | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance
+The following is an incomplete list of pre-trained models optimized to work with
+TensorFlow Lite.
+
+To get started choosing a model, visit <a href="../models">Models</a>.
+
+Note: The best model for a given application depends on your requirements. For
+example, some applications might benefit from higher accuracy, while others
+require a small model size. You should test your application with a variety of
+models to find the optimal balance between size, performance, and accuracy.
+
+## Image classification
+
+For more information about image classification, see
+<a href="../image_classification/overview.md">Image classification</a>.
+
+### Quantized models
+
+<a href="../performance/post_training_quantization.md">Quantized</a> image
+classification models offer the smallest model size and fastest performance, at
+the expense of accuracy.
+
+Model name                  | Paper and model                                                                                                                                           | Model size | Top-1 accuracy | Top-5 accuracy | TF Lite performance
 --------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------:
 Mobilenet_V1_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb     | 39.5%          | 64.4%          | 3.7 ms
 Mobilenet_V1_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb     | 42.8%          | 68.1%          | 5.5 ms
@@ -82,9 +45,104 @@ Inception_V2_quant          | [paper](https://arxiv.org/abs/1512.00567), [tflite
 Inception_V3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
 Inception_V4_quant          | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](http://download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz)                         | 41 Mb      | 79.5%          | 93.9%          | 1250.8 ms
 
-## Other models
+Note: The model files include both TF Lite FlatBuffer and Tensorflow frozen
+Graph.
+
+Note: Performance numbers were benchmarked on Pixel-2 using single thread large
+core. Accuracy numbers were computed using the
+[TFLite accuracy tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/accuracy/ilsvrc).
+
+### Floating point models
+
+Floating point models offer the best accuracy, at the expense of model size and
+performance. <a href="../performance/gpu.md">GPU acceleration</a> requires the
+use of floating point models.
+
+Model name            | Paper and model                                                                                                                                                                           | Model size | Top-1 accuracy | Top-5 accuracy | TF Lite performance | Tensorflow performance
+--------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------: | ---------------------:
+DenseNet              | [paper](https://arxiv.org/abs/1608.06993), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/densenet_2018_04_27.tgz)            | 43.6 Mb    | 64.2%          | 85.6%          | 894 ms              | 1262 ms
+SqueezeNet            | [paper](https://arxiv.org/abs/1602.07360), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz)          | 5.0 Mb     | 49.0%          | 72.9%          | 224 ms              | 255 ms
+NASNet mobile         | [paper](https://arxiv.org/abs/1707.07012), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz)       | 21.4 Mb    | 73.9%          | 91.5%          | 261 ms              | 389 ms
+NASNet large          | [paper](https://arxiv.org/abs/1707.07012), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_large_2018_04_27.tgz)        | 355.3 Mb   | 82.6%          | 96.1%          | 6697 ms             | 7940 ms
+ResNet_V2_101         | [paper](https://arxiv.org/abs/1603.05027), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/resnet_v2_101.tgz)                                   | 178.3 Mb   | 76.8%          | 93.6%          | 1880 ms             | 1970 ms
+Inception_V3          | [paper](http://arxiv.org/abs/1512.00567), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz)         | 95.3 Mb    | 77.9%          | 93.8%          | 1433 ms             | 1522 ms
+Inception_V4          | [paper](http://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz)         | 170.7 Mb   | 80.1%          | 95.1%          | 2986 ms             | 3139 ms
+Inception_ResNet_V2   | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz) | 121.0 Mb   | 77.5%          | 94.0%          | 2731 ms             | 2926 ms
+Mobilenet_V1_0.25_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz)                                       | 1.9 Mb     | 41.4%          | 66.2%          | 6.2 ms              | 13.0 ms
+Mobilenet_V1_0.25_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160.tgz)                                       | 1.9 Mb     | 45.4%          | 70.2%          | 8.6 ms              | 19.5 ms
+Mobilenet_V1_0.25_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192.tgz)                                       | 1.9 Mb     | 47.1%          | 72.0%          | 12.1 ms             | 27.8 ms
+Mobilenet_V1_0.25_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224.tgz)                                       | 1.9 Mb     | 49.7%          | 74.1%          | 16.2 ms             | 37.3 ms
+Mobilenet_V1_0.50_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128.tgz)                                        | 5.3 Mb     | 56.2%          | 79.3%          | 18.1 ms             | 29.9 ms
+Mobilenet_V1_0.50_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160.tgz)                                        | 5.3 Mb     | 59.0%          | 81.8%          | 26.8 ms             | 45.9 ms
+Mobilenet_V1_0.50_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192.tgz)                                        | 5.3 Mb     | 61.7%          | 83.5%          | 35.6 ms             | 65.3 ms
+Mobilenet_V1_0.50_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224.tgz)                                        | 5.3 Mb     | 63.2%          | 84.9%          | 47.6 ms             | 164.2 ms
+Mobilenet_V1_0.75_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128.tgz)                                       | 10.3 Mb    | 62.0%          | 83.8%          | 34.6 ms             | 48.7 ms
+Mobilenet_V1_0.75_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160.tgz)                                       | 10.3 Mb    | 65.2%          | 85.9%          | 51.3 ms             | 75.2 ms
+Mobilenet_V1_0.75_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192.tgz)                                       | 10.3 Mb    | 67.1%          | 87.2%          | 71.7 ms             | 107.0 ms
+Mobilenet_V1_0.75_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224.tgz)                                       | 10.3 Mb    | 68.3%          | 88.1%          | 95.7 ms             | 143.4 ms
+Mobilenet_V1_1.0_128  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128.tgz)                                        | 16.9 Mb    | 65.2%          | 85.7%          | 57.4 ms             | 76.8 ms
+Mobilenet_V1_1.0_160  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160.tgz)                                        | 16.9 Mb    | 68.0%          | 87.7%          | 86.0 ms             | 117.7 ms
+Mobilenet_V1_1.0_192  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192.tgz)                                        | 16.9 Mb    | 69.9%          | 89.1%          | 118.6 ms            | 167.3 ms
+Mobilenet_V1_1.0_224  | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz)                                        | 16.9 Mb    | 71.0%          | 89.9%          | 160.1 ms            | 224.3 ms
+Mobilenet_V2_1.0_224  | [paper](https://arxiv.org/pdf/1801.04381.pdf), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz)                                                | 14.0 Mb    | 71.8%          | 90.6%          | 117 ms              |
+
+### AutoML mobile models
+
+The following image classification models were created using
+<a href="https://cloud.google.com/automl/">Cloud AutoML</a>.
+
+Model Name       | Paper and model                                                                                                                                                | Model size | Top-1 accuracy | Top-5 accuracy | TF Lite performance
+---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------:
+MnasNet_0.50_224 | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.5_224_09_07_2018.tgz)  | 8.5 Mb     | 68.03%         | 87.79%         | 37 ms
+MnasNet_0.75_224 | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.75_224_09_07_2018.tgz) | 12 Mb      | 71.72%         | 90.17%         | 61 ms
+MnasNet_1.0_96   | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_96_09_07_2018.tgz)   | 17 Mb      | 62.33%         | 83.98%         | 23 ms
+MnasNet_1.0_128  | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_128_09_07_2018.tgz)  | 17 Mb      | 67.32%         | 87.70%         | 34 ms
+MnasNet_1.0_160  | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_160_09_07_2018.tgz)  | 17 Mb      | 70.63%         | 89.58%         | 51 ms
+MnasNet_1.0_192  | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_192_09_07_2018.tgz)  | 17 Mb      | 72.56%         | 90.76%         | 70 ms
+MnasNet_1.0_224  | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_224_09_07_2018.tgz)  | 17 Mb      | 74.08%         | 91.75%         | 93 ms
+MnasNet_1.3_224  | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.3_224_09_07_2018.tgz)  | 24 Mb      | 75.24%         | 92.55%         | 152 ms
+
+Note: Performance numbers were benchmarked on Pixel-1 using single thread large
+BIG core.
+
+## Object detection
+
+For more information about object detection, see
+<a href="../models/object_detection/overview.md">Object detection</a>.
+
+The object detection model we currently host is
+**coco_ssd_mobilenet_v1_1.0_quant_2018_06_29**.
+
+<a class="button button-primary" href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download
+model and labels</a>
+
+## Pose estimation
+
+For more information about pose estimation, see
+<a href="../models/pose_estimation/overview.md">Pose estimation</a>.
+
+The pose estimation model we currently host is
+**multi_person_mobilenet_v1_075_float**.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite">Download
+model</a>
+
+## Image segmentation
+
+For more information about image segmentation, see
+<a href="../models/segmentation/overview.md">Segmentation</a>.
+
+The image segmentation model we currently host is **deeplabv3_257_mv_gpu**.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite">Download
+model</a>
+
+## Smart reply
+
+For more information about smart reply, see
+<a href="../models/smart_reply/overview.md">Smart reply</a>.
+
+The smart reply model we currently host is **smartreply_1.0_2017_11_01**.
 
-Model                   | TF Lite FlatBuffer
------------------------ | :----------------:
-[reference](https://research.googleblog.com/2017/11/on-device-conversational-modeling-with.html),
-[tflite](https://storage.googleapis.com/download.tensorflow.org/models/smartreply_1.0_2017_11_01.zip)
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/smartreply_1.0_2017_11_01.zip">Download
+model</a>
diff --git a/tensorflow/lite/g3doc/overview.md b/tensorflow/lite/g3doc/guide/index.md
similarity index 98%
rename from tensorflow/lite/g3doc/overview.md
rename to tensorflow/lite/g3doc/guide/index.md
index 2d747a9b59f734a007ef54d13223aed22f38cb1d..288f7a07576165959f79b6796ec52286fdc60123 100644
--- a/tensorflow/lite/g3doc/overview.md
+++ b/tensorflow/lite/g3doc/guide/index.md
@@ -1,5 +1,5 @@
 
-# Introduction to TensorFlow Lite
+# TensorFlow Lite guide
 
 TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
 devices. It enables on-device machine learning inference with low latency and a
@@ -118,7 +118,7 @@ TensorFlow Lite provides:
       to all first-party and third-party apps.
 
     Also see the complete list of
-    [TensorFlow Lite's supported models](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md),
+    [TensorFlow Lite's supported models](hosted_models.md),
     including the model sizes, performance numbers, and downloadable model files.
 
 - Quantized versions of the MobileNet model, which runs faster than the
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/guide/inference.md
similarity index 74%
rename from tensorflow/lite/g3doc/apis.md
rename to tensorflow/lite/g3doc/guide/inference.md
index 1a05142bc44b824e090fd6eb513360837eac2c69..b0107ece0b1d137ec75dc871bff4284d55cbf2cc 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/guide/inference.md
@@ -1,77 +1,82 @@
-# TensorFlow Lite APIs
+# TensorFlow Lite inference
 
-TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
-the API design reflects a preference for performance over ease of use.
-TensorFlow Lite is designed for fast inference on small devices so it should be
-no surprise that the APIs try to avoid unnecessary copies at the expense of
-convenience.  Similarly, consistency with TensorFlow APIs was not an explicit
-goal and some variance is to be expected.
+[TOC]
 
-There is also a [Python API for TensorFlow Lite](g3doc/convert/python_api.md).
+## Overview
 
-## C++
+TensorFlow Lite inference is the process of executing a TensorFlow Lite
+model on-device and extracting meaningful results from it. Inference is the
+final step in using the model on-device in the
+[architecture](index.md#tensorflow_lite_architecture).
 
-In order to run the inference model in TensorFlow Lite, one has to load the
-model into a `FlatBufferModel` object which then can be executed by an
-`Interpreter`.  The `FlatBufferModel` needs to remain valid for the whole
-lifetime of the `Interpreter`, and a single `FlatBufferModel` can be
-simultaneously used by more than one `Interpreter`. In concrete terms, the
-`FlatBufferModel` object must be created before any `Interpreter` objects that
-use it, and must be kept around until they have all been destroyed.
+Inference for TensorFlow Lite models is run through an interpreter. This
+document outlines the various APIs for the interpreter along with the
+[supported platforms](#supported-platforms).
 
-The simplest usage of TensorFlow Lite will look like this:
+### Important Concepts
 
-```c++
-tflite::FlatBufferModel model(path_to_model);
-tflite::ops::builtin::BuiltinOpResolver resolver;
-std::unique_ptr<tflite::Interpreter> interpreter;
-tflite::InterpreterBuilder(*model, resolver)(&interpreter);
-// Resize input tensors, if desired.
-interpreter->AllocateTensors();
-float* input = interpreter->typed_input_tensor<float>(0);
-// Fill `input`.
-interpreter->Invoke();
-float* output = interpreter->typed_output_tensor<float>(0);
-```
-### Data Alignment
+TensorFlow Lite inference on device typically follows the following steps.
 
-TensorFlow Lite data is usually aligned to 16-byte boundaries. It is recommended
-that all data provided to TensorFlow Lite be aligned that way.
+1. **Loading a Model**
 
-### Error Reporting
+   The user loads the `.tflite` model into memory which contains the model's
+   execution graph.
 
-In many places TensorFlow Lite returns status information through
-`TfLiteStatus` objects:
+1. **Transforming Data**
+   Input data acquired by the user generally may not match the input data format
+   expected by the model. For eg., a user may need to resize an image or change
+   the image format to be used by the model.
 
-```c++
-typedef enum {
-  kTfLiteOk = 0,
-  kTfLiteError = 1
-} TfLiteStatus;
+1. **Running Inference**
 
-```
+   This step involves using the API to execute the model. It involves a few
+   steps such as building the interpreter, and allocating tensors as explained
+   in detail in [Running a Model](#running_a_model).
 
-Failures can be easily verified with:
+1. **Interpreting Output**
 
-```c++
-if (status != kTfLiteOk) {
-  // ... error handling here ...
-}
-```
+   The user retrieves results from model inference and interprets the tensors in
+   a meaningful way to be used in the application.
 
-In order to obtain detailed error information an ErrorReporter must be
-provided:
+   For example, a model may only return a list of probabilities. It is up to the
+   application developer to meaningully map them to relevant categories and
+   present it to their user.
 
-```c++
-class ErrorReporter {
-  virtual int Report(const char* format, va_list args) = 0;
-};
-```
+### Supported Platforms
+TensorFlow inference APIs are provided for most common mobile/embedded platforms
+such as Android, iOS and Linux.
+
+#### Android
+On Android, TensorFlow Lite inference can be performed using either Java or C++
+APIs. The Java APIs provide convenience and can be used directly within your
+Android Activity classes. The C++ APIs on the other hand may offer more
+flexibility and speed, but may require writing JNI wrappers to move data between
+Java and C++ layers. You can find an example [here](android.md).
+
+#### iOS
+TensorFlow Lite provides Swift/Objective C++ APIs for inference on iOS. An
+example can be found [here](ios.md).
+
+#### Linux
+On Linux platforms such as [Raspberry Pi](build_rpi.md), TensorFlow Lite C++
+and Python APIs can be used to run inference.
 
-The `DefaultErrorReporter` takes care of reporting to `stderr`.
+
+## API Guides
+
+TensorFlow Lite provides programming APIs in C++, Java and Python, with
+experimental bindings for several other languages (C, Swift, Objective-C). In
+most cases, the API design reflects a preference for performance over ease of
+use. TensorFlow Lite is designed for fast inference on small devices so it
+should be no surprise that the APIs try to avoid unnecessary copies at the
+expense of convenience. Similarly, consistency with TensorFlow APIs was not an
+explicit goal and some variance is to be expected.
+
+There is also a [Python API for TensorFlow Lite](../convert/python_api.md).
 
 ### Loading a Model
 
+#### C++
 The `FlatBufferModel` class encapsulates a model and can be built in a couple of
 slightly different ways depending on where the model is stored:
 
@@ -92,11 +97,42 @@ class FlatBufferModel {
 };
 ```
 
+```c++
+tflite::FlatBufferModel model(path_to_model);
+```
+
 Note that if TensorFlow Lite detects the presence of Android's NNAPI it will
 automatically try to use shared memory to store the FlatBufferModel.
 
-### Running a Model
+#### Java
+
+TensorFlow Lite's Java API supports on-device inference and is provided as an
+Android Studio Library that allows loading models, feeding inputs, and
+retrieving inference outputs.
+
+The `Interpreter` class drives model inference with TensorFlow Lite. In
+most of the cases, this is the only class an app developer will need.
+
+The `Interpreter` can be initialized with a model file using the constructor:
 
+```java
+public Interpreter(@NotNull File modelFile);
+```
+
+or with a `MappedByteBuffer`:
+
+```java
+public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer);
+```
+
+In both cases a valid TensorFlow Lite model must be provided or an
+`IllegalArgumentException` with be thrown. If a `MappedByteBuffer` is used to
+initialize an Interpreter, it should remain unchanged for the whole lifetime of
+the `Interpreter`.
+
+### Running a Model {#running_a_model}
+
+#### C++
 Running a model involves a few simple steps:
 
   * Build an `Interpreter` based on an existing `FlatBufferModel`
@@ -106,7 +142,7 @@ Running a model involves a few simple steps:
   * Read output tensor values
 
 The important parts of public interface of the `Interpreter` are provided
-below.  It should be noted that:
+below. It should be noted that:
 
   * Tensors are represented by integers, in order to avoid string comparisons
     (and any fixed dependency on string libraries).
@@ -114,41 +150,129 @@ below.  It should be noted that:
   * Memory allocation for input and output tensors must be triggered
     by calling AllocateTensors() right after resizing tensors.
 
+In order to run the inference model in TensorFlow Lite, one has to load the
+model into a `FlatBufferModel` object which then can be executed by an
+`Interpreter`.  The `FlatBufferModel` needs to remain valid for the whole
+lifetime of the `Interpreter`, and a single `FlatBufferModel` can be
+simultaneously used by more than one `Interpreter`. In concrete terms, the
+`FlatBufferModel` object must be created before any `Interpreter` objects that
+use it, and must be kept around until they have all been destroyed.
+
+The simplest usage of TensorFlow Lite will look like this:
+
 ```c++
-class Interpreter {
-  Interpreter(ErrorReporter* error_reporter);
+tflite::FlatBufferModel model(path_to_model);
 
-  // Read only access to list of inputs.
-  const std::vector<int>& inputs() const;
+tflite::ops::builtin::BuiltinOpResolver resolver;
+std::unique_ptr<tflite::Interpreter> interpreter;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
 
-  // Read only access to list of outputs.
-  const std::vector<int>& outputs() const;
+// Resize input tensors, if desired.
+interpreter->AllocateTensors();
+
+float* input = interpreter->typed_input_tensor<float>(0);
+// Fill `input`.
 
-  // Change the dimensionality of a given tensor.
-  TfLiteStatus ResizeInputTensor(int tensor_index,
-                                 const std::vector<int>& dims);
+interpreter->Invoke();
 
-  // Returns status of success or failure.
-  TfLiteStatus AllocateTensors();
+float* output = interpreter->typed_output_tensor<float>(0);
+```
 
-  // Return a pointer into the data of a given input tensor.
-  template <class T>
-  T* typed_input_tensor(int index) {
-    return typed_tensor<T>(inputs_[index]);
-  }
+#### Java
 
-  // Return a pointer into the data of a given output tensor.
-  template <class T>
-  T* typed_output_tensor(int index) {
-    return typed_tensor<T>(outputs_[index]);
-  }
+The simplest usage of Tensorflow Lite Java API looks like this:
 
-  // Execute the model, populating output tensors.
-  TfLiteStatus Invoke();
-};
+```java
+try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+  interpreter.run(input, output);
+}
 ```
 
-### Writing Custom Operators
+If a model takes only one input and returns only one output, the following will
+trigger an inference run:
+
+```java
+interpreter.run(input, output);
+```
+
+For models with multiple inputs, or multiple outputs, use:
+
+```java
+interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
+```
+
+where each entry in `inputs` corresponds to an input tensor and
+`map_of_indices_to_outputs` maps indices of output tensors to the corresponding
+output data. In both cases the tensor indices should correspond to the values
+given to the
+[TensorFlow Lite Optimized Converter](../convert/cmdline_examples.md) when the
+model was created. Be aware that the order of tensors in `input` must match the
+order given to the `TensorFlow Lite Optimized Converter`.
+
+The Java API also provides convenient functions for app developers to get the
+index of any model input or output using a tensor name:
+
+```java
+public int getInputIndex(String tensorName);
+public int getOutputIndex(String tensorName);
+```
+
+If tensorName is not a valid name in model, an `IllegalArgumentException` will
+be thrown.
+
+##### Releasing Resources After Use
+
+An `Interpreter` owns resources. To avoid memory leak, the resources must be
+released after use by:
+
+```java
+interpreter.close();
+```
+
+##### Supported Data Types
+
+To use TensorFlow Lite, the data types of the input and output tensors must be
+one of the following primitive types:
+
+*   `float`
+*   `int`
+*   `long`
+*   `byte`
+
+`String` types are also supported, but they are encoded differently than the
+primitive types. In particular, the shape of a string Tensor dictates the number
+and arrangement of strings in the Tensor, with each element itself being a
+variable length string. In this sense, the (byte) size of the Tensor cannot be
+computed from the shape and type alone, and consequently strings cannot be
+provided as a single, flat `ByteBuffer` argument.
+
+If other data types, including boxed types like `Integer` and `Float`, are used,
+an `IllegalArgumentException` will be thrown.
+
+##### Inputs
+
+Each input should be an array or multi-dimensional array of the supported
+primitive types, or a raw `ByteBuffer` of the appropriate size. If the input is
+an array or multi-dimensional array, the associated input tensor will be
+implicitly resized to the array's dimensions at inference time. If the input is
+a ByteBuffer, the caller should first manually resize the associated input
+tensor (via `Interpreter.resizeInput()`) before running inference.
+
+When using 'ByteBuffer', prefer using direct byte buffers, as this allows the
+`Interpreter` to avoid unnecessary copies. If the `ByteBuffer` is a direct byte
+buffer, its order must be `ByteOrder.nativeOrder()`. After it is used for a
+model inference, it must remain unchanged until the model inference is finished.
+
+##### Outputs
+
+Each output should be an array or multi-dimensional array of the supported
+primitive types, or a ByteBuffer of the appropriate size. Note that some models
+have dynamic outputs, where the shape of output tensors can vary depending on
+the input. There's no straightforward way of handling this with the existing
+Java inference API, but planned extensions will make this possible.
+
+
+## Writing Custom Operators
 
 All TensorFlow Lite operators (both custom and builtin) are defined using a
 simple pure-C interface that consists of four functions:
@@ -251,127 +375,3 @@ be code-generated  based on a given subset of ops, possibly only the ones
 contained in a given model. This is the equivalent of TensorFlow's selective
 registration (and a simple version of it is available in the `tools`
 directory).
-
-## Java
-
-TensorFlow Lite's Java API supports on-device inference and is provided as an
-Android Studio Library that allows loading models, feeding inputs, and
-retrieving inference outputs.
-
-The simplest usage of Tensorflow Lite Java API looks like this:
-
-```java
-try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
-  interpreter.run(input, output);
-}
-```
-
-### Loading a Model
-
-The `Interpreter.java` class drives model inference with TensorFlow Lite. In
-most of the cases, this is the only class an app developer will need.
-
-#### Initializing an `Interpreter` With a Model File
-
-The `Interpreter` can be initialized with a model file using the constructor:
-
-```java
-public Interpreter(@NotNull File modelFile);
-```
-
-or with a `MappedByteBuffer`:
-
-```java
-public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer);
-```
-
-In both cases a valid TensorFlow Lite must be provided or an
-`IllegalArgumentException` with be thrown. If a `MappedByteBuffer` is used to
-initialize an Interpreter, it should remain unchanged for the whole lifetime of
-the `Interpreter`.
-
-### Running a Model
-
-#### Supported Data Types
-
-To use TensorFlow Lite, the data types of the input and output tensors must be
-one of the following primitive types:
-
-*   `float`
-*   `int`
-*   `long`
-*   `byte`
-
-`String` types are also supported, but they are encoded differently than the
-primitive types. In particular, the shape of a string Tensor dictates the number
-and arrangement of strings in the Tensor, with each element itself being a
-variable length string. In this sense, the (byte) size of the Tensor cannot be
-computed from the shape and type alone, and consequently strings cannot be
-provided as a single, flat `ByteBuffer` argument.
-
-If other data types, including boxed types like `Integer` and `Float`, are used,
-an `IllegalArgumentException` will be thrown.
-
-#### Inputs
-
-Each input should be an array or multi-dimensional array of the supported
-primitive types, or a raw `ByteBuffer` of the appropriate size. If the input is
-an array or multi-dimensional array, the associated input tensor will be
-implicitly resized to the array's dimensions at inference time. If the input is
-a ByteBuffer, the caller should first manually resize the associated input
-tensor (via `Interpreter.resizeInput()`) before running inference.
-
-When using 'ByteBuffer', prefer using direct byte buffers, as this allows the
-`Interpreter` to avoid unnecessary copies. If the `ByteBuffer` is a direct byte
-buffer, its order must be `ByteOrder.nativeOrder()`. After it is used for a
-model inference, it must remain unchanged until the model inference is finished.
-
-#### Outputs
-
-Each output should be an array or multi-dimensional array of the supported
-primitive types, or a ByteBuffer of the appropriate size. Note that some models
-have dynamic outputs, where the shape of output tensors can vary depending on
-the input. There's no straightforward way of handling this with the existing
-Java inference API, but planned extensions will make this possible.
-
-#### Running Model Inference
-
-If a model takes only one input and returns only one output, the following will
-trigger an inference run:
-
-```java
-interpreter.run(input, output);
-```
-
-For models with multiple inputs, or multiple outputs, use:
-
-```java
-interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
-```
-
-where each entry in `inputs` corresponds to an input tensor and
-`map_of_indices_to_outputs` maps indices of output tensors to the corresponding
-output data. In both cases the tensor indices should correspond to the values
-given to the [TensorFlow Lite Optimized Converter](convert/cmdline_examples.md)
-when the model was created. Be aware that the order of tensors in `input` must
-match the order given to the `TensorFlow Lite Optimized Converter`.
-
-The Java API also provides convenient functions for app developers to get the
-index of any model input or output using a tensor name:
-
-```java
-public int getInputIndex(String tensorName);
-public int getOutputIndex(String tensorName);
-```
-
-If tensorName is not a valid name in model, an `IllegalArgumentException` will
-be thrown.
-
-### Releasing Resources After Use
-
-An `Interpreter` owns resources. To avoid memory leak, the resources must be
-released after use by:
-
-```java
-interpreter.close();
-```
diff --git a/tensorflow/lite/g3doc/guide/ios.md b/tensorflow/lite/g3doc/guide/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..3565ce71df3095d24352f564bef2714a3582adae
--- /dev/null
+++ b/tensorflow/lite/g3doc/guide/ios.md
@@ -0,0 +1,229 @@
+# iOS quickstart
+
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your requirements.
+
+## Prerequisites
+
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
+
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
+
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
+
+*   You'll also need the Xcode command-line tools:
+
+        xcode-select --install
+
+    If this is a new install, you will need to run the Xcode application once to
+    agree to the license before continuing.
+
+*   Install CocoaPods if you don't have it:
+
+        sudo gem install cocoapods
+
+### Step 1. Clone the TensorFlow source code
+
+First, we clone the GitHub repository on the computer in a folder to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
+
+Run the following command to install TensorFlow Lite pod:
+
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
+
+If you have installed this pod before and that command doesn't work, try
+
+```
+    pod repo update
+```
+
+### Step 3. Build the XCode project
+
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
+
+```
+    open tflite_camera_example.xcworkspace
+```
+
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
+
+![pre-pend your name to the bundle identifier](../images/ios/bundle_identifier.png)
+
+Plug in your iOS device. Note the app must be executed with a real device with
+camera. Select the iOS device from the drop-down menu.
+
+![Device selection](../images/ios/device_selection.png)
+
+Click the "Run" button to build and run the app
+
+![Build and execute](../images/ios/build_and_execute.png)
+
+Note that as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app on a device.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFlow Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed a quantized model is used.
+
+### Pre-process of bitmap image
+
+The MobileNet model we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger, and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model, and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling, and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note the code is preprocessing and preparing the model input from the camera
+data. Therefore the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After preprocessing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is 0-255.
+We need to convert the value to float to get the probabilities with value range
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/guide/microcontroller.md b/tensorflow/lite/g3doc/guide/microcontroller.md
new file mode 100644
index 0000000000000000000000000000000000000000..635132004ee45fd9f566a970641bc624b85351bb
--- /dev/null
+++ b/tensorflow/lite/g3doc/guide/microcontroller.md
@@ -0,0 +1,21 @@
+# Microcontrollers
+
+## Overview
+
+Microcontrollers are compact integrated circuits with very limited resources. Currently, they only perform simple functions.
+
+With the onset of TensorFlow Lite, hence smaller binary sizes, these devices will be able to support machine learning applications, opening the industry up to a myriad of use cases.
+
+## Getting started
+
+Note: This is an experimental release aimed at microcontrollers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems.
+
+One of the challenges of embedded software development is that there are a lot of different architectures, devices, operating systems, and build systems. We aim to support as many of the popular combinations as we can and make it as easy as possible to add support for others.
+
+Read more about [how to get started](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#getting-started).
+
+## Goals
+
+The design goals are to make the framework readable, easy to modify, well-tested, easy to integrate, and compatible (e.g. consistent file schema, interpreter, API, kernel interface).
+
+Read more about [goals and tradeoffs](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro#goals).
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/guide/ops_compatibility.md
similarity index 90%
rename from tensorflow/lite/g3doc/tf_ops_compatibility.md
rename to tensorflow/lite/g3doc/guide/ops_compatibility.md
index 8b85c08a32588e210889b8fe2c8e4e2a11223295..000cca5873c5b299ce144aa95b5067ed8f7df64e 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/guide/ops_compatibility.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite & TensorFlow Compatibility Guide
+# TensorFlow Lite and TensorFlow operator compatibility
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
 inference models. As they are processed by the TensorFlow Lite Optimizing
@@ -8,7 +8,9 @@ operations are mapped to their TensorFlow Lite counterparts.
 Since the set of TensorFlow Lite operations is smaller than TensorFlow's, not
 every model is convertible. Even for supported operations, very specific usage
 patterns are sometimes expected, for performance reasons. We expect to expand
-the set of supported operations in future TensorFlow Lite releases.
+the set of supported operations in future TensorFlow Lite releases. Additional
+ops can be included by [using select TensorFlow ops](ops_select.md), at the cost
+of binary size.
 
 The best way to understand how to build a TensorFlow model that can be used with
 TensorFlow Lite is to carefully consider how operations are converted and
@@ -17,15 +19,15 @@ optimized, along with the limitations imposed by this process.
 ## Supported Types
 
 Most TensorFlow Lite operations target both floating-point (float32) and
-quantized (uint8) inference, but usually there is little or no support for other
-types like tf.float16 and strings.
+quantized (uint8, int8) inference, but many ops do not yet for other types like
+tf.float16 and strings.
 
 Apart from using different version of the operations, the other difference
 between floating-point and quantized models lies in the way they are converted.
-Quantized conversion expect the models to be annotated with "fake quantization"
-nodes that record the dynamic range of the tensors. Without that information TF
-Lite is not able to accurately quantize a model, which means that proper
-quantized training is necessary before conversion.
+Quantized conversion requires dynamic range information for tensors. This
+requires "fake-quantization" during model training, getting range information
+via a calibration data set, or doing "on-the-fly" range estimation. See
+[quantization](../performance/model_optimization.md).
 
 ## Data Format and Broadcasting
 
@@ -89,7 +91,8 @@ be simply removed from the graph (tf.identity), replaced by tensors
 some supported operations may sometimes be removed through one of these
 processes.
 
-Here is a list of TensorFlow operations that are usually removed from the graph:
+Here is a non-exhaustive list of TensorFlow operations that are usually removed
+from the graph:
 
 *   [tf.add](https://www.tensorflow.org/api_docs/python/tf/add)
 *   [tf.check_numerics](https://www.tensorflow.org/api_docs/python/tf/check_numerics)
@@ -165,6 +168,17 @@ Options {
 }
 ```
 
+**ADD_N**
+
+```
+Inputs {
+  0-N: any number of tensors (must have same size and shape)
+}
+Outputs {
+  0: elementwise sum of the input tensors
+}
+```
+
 **ARG_MAX**
 
 ```
@@ -291,6 +305,17 @@ Options {
 }
 ```
 
+**ELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to exp(features) - 1 if < 0, features otherwise.
+}
+```
+
 **EQUAL**
 
 ```
@@ -362,6 +387,17 @@ Outputs {
 }
 ```
 
+**CEIL**
+
+```
+inputs {
+  0: tensor
+}
+outputs: {
+  0: result of computing element-wise ceil of the input tensor
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -392,6 +428,18 @@ Outputs {
 }
 ```
 
+**GATHER_ND**
+
+```
+Inputs {
+  0: params tensor
+  1: indices tensor
+}
+Outputs {
+  0: a tensor with same type as the params tensor.
+}
+```
+
 **GREATER**
 
 ```
@@ -463,20 +511,6 @@ Options {
 }
 ```
 
-**LEAKY_RELU**
-
-```
-Inputs {
-  0: a tensor
-}
-Outputs {
-  0: a tensor equivalent to max(input, input * alpha)
-}
-Options {
-  alpha
-}
-```
-
 **LESS**
 
 ```
@@ -691,6 +725,17 @@ Options {
 }
 ```
 
+**RANK**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a 0-D int32 Tensor representing the rank of input
+}
+```
+
 **RELU**
 
 ```
@@ -747,7 +792,7 @@ Inputs {
   1: a 1D tensor with 2 elements
 }
 Outputs {
-  0: A tensor of type `tensor 0` resized according to `tensor 1` heigh/width values
+  0: A tensor of type `tensor 0` resized according to `tensor 1` height/width values
   using nearest neighbors interpolation.
 }
 Options {
@@ -766,6 +811,25 @@ Outputs {
 }
 ```
 
+**REVERSE_SEQUENCE**
+
+```
+Inputs {
+  0: a tensor
+  1: a 1-D tensor which specifies the length of sequence to be reversed in each
+  dim
+}
+Outputs {
+  0: a tensor with the same shape as the input tensor
+}
+Options {
+  seq_dim: a 0-D int tensor (scalar). The dimension which is partially
+  reversed.
+  batch_dim: a 0-D int tensor (scalar). Defaults to 0. The dimension along
+  which reversal is performed.
+}
+```
+
 **SHAPE**
 
 ```
@@ -978,6 +1042,22 @@ Outputs {
 }
 ```
 
+**WHERE**
+
+```
+Inputs {
+  0: A tensor of type bool.
+  1: A tensor which may have the same shape as condition. If condition is rank
+     1, x may have higher rank, but its first dimension must match the size of
+     condition.
+  2: A tensor with the same shape and type as x.
+}
+Outputs {
+  0: A tensor with the same type and shape as x, y if they are non-None, or
+     a tensor with shape (num_true, dim_size(condition)).
+}
+```
+
 **ZEROS_LIKE**
 
 ```
diff --git a/tensorflow/lite/g3doc/custom_operators.md b/tensorflow/lite/g3doc/guide/ops_custom.md
similarity index 57%
rename from tensorflow/lite/g3doc/custom_operators.md
rename to tensorflow/lite/g3doc/guide/ops_custom.md
index 2d80668f37d645054596e1150f1eee6249122f75..1365f449b9c1ca9b86de8051c8866c4cd6c467d2 100644
--- a/tensorflow/lite/g3doc/custom_operators.md
+++ b/tensorflow/lite/g3doc/guide/ops_custom.md
@@ -1,13 +1,37 @@
+# Custom operators
 
-# How to use custom operators
+TensorFlow Lite currently supports a subset of TensorFlow operators. It supports
+the use of user-provided implementations (as known as custom implementations) if
+the model contains an operator that is not supported. Providing custom kernels
+is also a way of evaluating a series of TensorFlow operations as a single fused
+TensorFlow Lite operations.
 
-TensorFlow Lite currently supports a subset of TensorFlow operators. However, it
-does support the use of user-provided implementations (as known as custom
-implementations) if the model contains an operator that is not supported.
+Using custom operators consists of three steps.
 
-Let’s walk through this via an example. Assume we are using the `Sin` operator
-and that we are building a very simple model for a function `y = sin(x +
-offset)`, where `offset` is trainable.
+*   Making sure the TensorFlow Graph Def or SavedModel refers to the correctly
+    named TensorFlow Lite operator.
+
+*   Registering a custom kernel with TensorFlow Lite so that the runtime knows
+    how to map your operator and parameters in your graph to executable C/C++
+    code.
+
+*   Testing and profiling your operator correctness and performance,
+    respectively. If you wish to test just your custom operator it is best to
+    create a model with just your custom operator and using the
+    [benchmark_model](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/benchmark/benchmark_model_test.cc)
+    proggram
+
+Below we describe a complete example of defining Sin and some links to existing
+conversion process involving custom operators.
+
+## Making a custom operator for Sin
+
+Let’s walk through this an example of supporting a TensorFlow operator that
+TensorFlow Lite does not have. Assume we are using the `Sin` operator and that
+we are building a very simple model for a function `y = sin(x + offset)`, where
+`offset` is trainable.
+
+### Generating the model from TensorFlow
 
 The code to train the TensorFlow model will be something like:
 
@@ -30,6 +54,8 @@ Didn't find custom op for name 'Sin'
 Registration failed.
 ```
 
+### Defining the kernel in the TensorFlow Lite runtime
+
 All we need to do to use the op in TensorFlow Lite is define two functions
 (`Prepare` and `Eval`), and construct a `TfLiteRegistration`. This code would
 look something like this:
@@ -79,19 +105,34 @@ TfLiteRegistration* Register_SIN() {
 }
 ```
 
-When initializing the OpResolver, add the custom op into the resolver, this will
-register the operator with Tensorflow Lite so that TensorFlow Lite can use the
-new implementation.
+When initializing the `OpResolver`, add the custom op into the resolver, this
+will register the operator with Tensorflow Lite so that TensorFlow Lite can use
+the new implementation. Note that the last two arguments in TfLiteRegistration
+correspond to the `SinPrepare` and `SinEval()` functions you defined for the
+custom op. If you used two functions to initialize variables used in the op and
+free up space: `Init()` and `Free()`, then they would be added to the first two
+arguments of TfLiteRegistration; they are set to nullptr in this example.
 
 ```cpp
 tflite::ops::builtin::BuiltinOpResolver builtins;
 builtins.AddCustom("Sin", Register_SIN());
 ```
 
+If you want to make your custom operators in Java, you would currently need to
+build your own custom JNI layer and compile your own AAR
+[in this jni code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/src/main/native/builtin_ops_jni.cc).
+Similarly, if you wish to make these operators available in Python you can place
+your registrations in the
+[Python wrapper code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc).
+
 Note that a similar process as above can be followed for supporting for a set of
-operations instead of a single operator.
+operations instead of a single operator. Just add as many `AddCustom` operators
+as you need. In addition, `BuiltinOpResolver` also allows you to override
+implementations of builtins by using the `AddBuiltin`.
+
+## Best Practices
 
-## Best Practices for writing custom operators
+### Writing TensorFlow Lite kernels best practices
 
 1.  Optimize memory allocations and de-allocations cautiously. It is more
     efficient to allocate memory in Prepare() instead of Invoke(), and allocate
@@ -135,13 +176,35 @@ operations instead of a single operator.
     Your code must not leave memory hanging when TF_LITE_ENSURE is done, i.e.,
     these should be done before any resources are allocated that will leak.
 
-## Special TF Graph Attributes
+### Conversion best practices
+
+The example above was easy to convert since it was a builtin operator in
+TensorFlow. If you are defining a new operator that fuses many operators or you
+have complicated shapes or types, you might need to provide more information and
+use graph transformations to rewrite an existing graph to use your operator
+instead of the builtin TensorFlow one.
+
+#### Converting TensorFlow models to convert graphs
+
+In TensorFlow you can use the `tf.lite.OpHint` class to encapsulate groups of
+operators when you create a TensorFlow graph. This allows you then to extract a
+graph def that has references to those operators. This is currently experimental
+and should only be used by advanced users. There is a full example of how to use
+this in the
+[OpHint code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/op_hint.py).
+
+In addition, you can also use a manual graph substitution approach to rewrite
+Tensorflow graphs. There is an example of how this is done in single shot object
+based detection models
+[export script](https://github.com/tensorflow/models/blob/master/research/object_detection/export_tflite_ssd_graph.py).
+
+### TF Graph Attributes
 
 When `tflite_convert` converts a TensorFlow graph into TFLite format, it makes
-some assumption about custom operations that might be not correct. In this case,
+some assumption about custom operations that might not be correct. In this case,
 the generated graph may not execute.
 
-It is possible to add aditional information about your custom op output to TF
+It is possible to add additional information about your custom op output to TF
 graph before it is converted. The following attributes are supported:
 
 -   **_output_quantized** a boolean attribute, true if the operation outputs are
@@ -149,7 +212,7 @@ graph before it is converted. The following attributes are supported:
 -   **_output_types** a list of types for output tensors
 -   **_output_shapes** a list of shapes for output tensors
 
-### Setting the Attributes
+#### Setting the Attributes
 
 This is an example how the attributes can be set:
 
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/guide/ops_select.md
similarity index 92%
rename from tensorflow/lite/g3doc/using_select_tf_ops.md
rename to tensorflow/lite/g3doc/guide/ops_select.md
index 269774a4b10648f92aab5ee6bf5ae3687c263f75..51320140e1c085f3b44238a3586efe92819dc170 100644
--- a/tensorflow/lite/g3doc/using_select_tf_ops.md
+++ b/tensorflow/lite/g3doc/guide/ops_select.md
@@ -1,4 +1,6 @@
-# [Experimental] Using TensorFlow Lite with select TensorFlow ops
+# Select TensorFlow operators to use in TensorFlow Lite
+
+Caution: This feature is experimental.
 
 The TensorFlow Lite builtin op library has grown rapidly, and will continue to
 grow, but there remains a long tail of TensorFlow ops that are not yet natively
@@ -13,7 +15,7 @@ please send feedback about models that work and issues you are facing to
 tflite@tensorflow.org.
 
 TensorFlow Lite will continue to have
-[TensorFlow Lite builtin ops](tf_ops_compatibility.md) optimized for mobile and
+[TensorFlow Lite builtin ops](ops_compatibility.md) optimized for mobile and
 embedded devices. However, TensorFlow Lite models can now use a subset of
 TensorFlow ops when TFLite builtin ops are not sufficient.
 
@@ -32,7 +34,7 @@ choice. It also discusses some [known limitations](#known-limitations), the
 
 To convert a TensorFlow model to a TensorFlow Lite model with TensorFlow ops,
 use the `target_ops` argument in the
-[TensorFlow Lite converter](https://www.tensorflow.org/lite/convert/). The
+[TensorFlow Lite converter](../convert/). The
 following values are valid options for `target_ops`:
 
 *   `TFLITE_BUILTINS` - Converts models using TensorFlow Lite builtin ops.
@@ -49,8 +51,7 @@ partially supported by TensorFlow Lite, and one would like to avoid those
 limitations.
 
 The following example shows how to use `target_ops` in the
-[`TFLiteConverter`](https://www.tensorflow.org/lite/convert/python_api) Python
-API.
+[`TFLiteConverter`](./convert/python_api.md) Python API.
 
 ```
 import tensorflow as tf
@@ -63,7 +64,7 @@ open("converted_model.tflite", "wb").write(tflite_model)
 ```
 
 The following example shows how to use `target_ops` in the
-[`tflite_convert`](https://www.tensorflow.org/lite/convert/cmdline_examples)
+[`tflite_convert`](../convert/cmdline_examples.md)
 command line tool.
 
 ```
@@ -96,7 +97,7 @@ includes the necessary library of TensorFlow ops.
 ### Android AAR
 
 A new Android AAR target with select TensorFlow ops has been added for
-convenience. Assuming a <a href="./demo_android.md">working TensorFlow Lite
+convenience. Assuming a <a href="android.md">working TensorFlow Lite
 build environment</a>, build the Android AAR with select TensorFlow ops as
 follows:
 
@@ -151,8 +152,8 @@ TensorFlow Lite XCode project with support for select TensorFlow ops has been
 added to
 `tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
 
-To use this feature in a your own project, either clone the example project or
-set the project settings for a new or existing project to the following:
+To use this feature in your own project, either clone the example project or set
+the project settings for a new or existing project to the following:
 
 *   In Build Phases -> Link Binary With Libraries, add the static libraries
     under `tensorflow/contrib/makefile/gen/lib/` directory:
@@ -197,9 +198,7 @@ Python support is actively under development.
 
 When using a mixture of both builtin and select TensorFlow ops, all of the same
 TensorFlow Lite optimizations and optimized builtin kernels will be be available
-and usable with the converted model. For the TensorFlow ops, performance should
-generally be comparable to that of
-[TensorFlow Mobile](https://www.tensorflow.org/lite/tfmobile/).
+and usable with the converted model.
 
 The following table describes the average time taken to run inference on
 MobileNet on a Pixel 2. The listed times are an average of 100 runs. These
diff --git a/tensorflow/lite/g3doc/ops_versioning.md b/tensorflow/lite/g3doc/guide/ops_version.md
similarity index 99%
rename from tensorflow/lite/g3doc/ops_versioning.md
rename to tensorflow/lite/g3doc/guide/ops_version.md
index 0d571ce54779547a5e3457b089b791abca858930..9418ce4e92af3f43378181eaa836461edfed987a 100644
--- a/tensorflow/lite/g3doc/ops_versioning.md
+++ b/tensorflow/lite/g3doc/guide/ops_version.md
@@ -1,5 +1,4 @@
-
-# TensorFlow Lite Ops Versioning
+# TensorFlow Lite operator versions
 
 This document describes TensorFlow Lite's op versioning schema. Op
 versioning enables developers to add new functionalities and parameters into
diff --git a/tensorflow/lite/g3doc/guide/roadmap.md b/tensorflow/lite/g3doc/guide/roadmap.md
new file mode 100644
index 0000000000000000000000000000000000000000..a93591813c3e5e58b7d65b27f2d3f40b0e9e723a
--- /dev/null
+++ b/tensorflow/lite/g3doc/guide/roadmap.md
@@ -0,0 +1,77 @@
+# TensorFlow Lite 2019 Roadmap
+
+**Updated: March 6th, 2019**
+
+The following represents a high level overview of our 2019 plan. You should be
+conscious that this roadmap may change at anytime relative to a range of factors
+and the order below does not reflect any type of priority. As a matter of
+principle, we typically prioritize issues that the majority of our users are
+asking for and so this list fundamentally reflects that.
+
+We break our roadmap into four key segments: usability, performance,
+optimization and portability. We strongly encourage you to comment on our
+roadmap and provide us feedback in the TF Lite discussion groups and forums.
+
+## Usability
+
+*   **More ops coverage**
+    *   Prioritize many more ops based on user feedback
+*   **Op versioning & signatures**
+    *   Op kernels will get version numbers
+    *   Op kernels will be identifiable by signature
+*   **New Convertor**
+    *   Implementing a new TensorFlow Lite convertor that will better handle
+        graph conversion (i.e. control flow, conditionals etc) and replace TOCO
+*   **Continue to improve TF Select Ops**
+    *   Support more types of conversion utilizing TF Selects such as hash
+        tables, strings etc.
+    *   Support smaller binary size when using select TF ops via op stripping
+*   **LSTM / RNN support**
+    *   Add full support of conversion for LSTMs and RNNs
+*   **Graph Visualization Tooling**
+    *   Provide enhanced graph visualization tooling
+*   **Pre-and-post processing support**
+    *   Add more support for pre-and-post processing of inference
+*   **Control Flow & Training on-device**
+    *   Add support for control flow related ops
+    *   Add support for training on-device
+*   **New APIs**
+    *   New C API as core for language bindings and most clients
+    *   Objective-C API for iOS
+    *   SWIFT API for iOS
+    *   Updated Java API for Android
+    *   C# Unity language bindings
+*   **Add more Models**
+    *   Add more models to the support section of the site
+
+## Performance
+
+*   **More hardware delegates**
+    *   Add support for more hardware delegates
+*   **Support NN API**
+    *   Continually support and improve support for NN API
+*   **Framework Extensibility**
+    *   Enable simplistic overwriting of CPU kernels with customized optimized
+        versions
+*   **GPU Delegate**
+    *   Continue to extend the total support ops for OpenGL and Metal ops
+    *   Open-source
+*   **Improve TFLite CPU performance**
+    *   Optimizations for float and quantized models
+
+## Optimization
+
+*   **Model Optimization Toolkit**
+    *   Post training quantization + hybrid kernels
+    *   Post Training quantization + fixed-point kernels
+    *   Training with quantization
+*   **More support for more techniques**
+    *   RNN Support
+    *   Sparsity/Pruning
+    *   Lower bit-width support
+
+## Portability
+
+*   **Microcontroller Support**
+    *   Add support for a range of 8-bit, 16-bit and 32-bit MCU architecture use
+        cases for Speech and Image Classification
diff --git a/tensorflow/lite/g3doc/images/ios/build_and_execute.png b/tensorflow/lite/g3doc/images/ios/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/device_selection.png b/tensorflow/lite/g3doc/images/ios/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_1.png b/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..29b883a40f3a18f3db183887dda253a5a86d7c13
Binary files /dev/null and b/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_1.png differ
diff --git a/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_2.png b/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..197fa216e29a7671b35dc57fbd517c4c2b543784
Binary files /dev/null and b/tensorflow/lite/g3doc/images/performance/tflite_delegate_graph_2.png differ
diff --git a/tensorflow/lite/g3doc/models/_index.yaml b/tensorflow/lite/g3doc/models/_index.yaml
deleted file mode 100644
index f4d8bc40a9325b12734022e005996e13dba0a0d6..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/models/_index.yaml
+++ /dev/null
@@ -1,125 +0,0 @@
-project_path: /lite/_project.yaml
-book_path: /lite/_book.yaml
-description: <!--no description-->
-landing_page:
-  body_class: tfo-hide-page-nav
-  custom_css_path: /site-assets/css/style.css
-  show_side_navs: true
-  rows:
-
-  # Hero
-  - classname: >
-      devsite-landing-row-50
-      devsite-landing-row-large-headings
-      devsite-landing-row-no-image-background
-    foreground: theme
-    items:
-    - heading: Models marketplace
-      description: >
-        The TensorFlow Lite models marketplace, your neighborhood model shoppe.
-      image_path: /resources/images/tflite-card-16x9.png
-
-  # Features
-  - background: grey
-    items:
-    - heading: Optimized for mobile
-      description: >
-        Machine learning can make your apps more engaging, personalized, and
-        helpful, and provides solutions that are optimized to run on-device.
-    - heading: Built with Google expertise
-      description: >
-        Models offer the technologies that have long powered Google's own
-        experiences on mobile.
-    - heading: Approachable and comprehensive
-      description: >
-        Use out-of-the-box solutions (base APIs) or custom models, running
-        on-device or in the Cloud, depending on your specific needs.
-
-  # Beginner models
-  - classname: devsite-landing-row-100
-    heading: "Build machine learning into your apps"
-    items:
-    - heading: >
-        Image labeling
-      description: >
-        Identify objects, locations, activities, animal species, products, and
-        more
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Text recognition (OCR)
-      description: >
-        Recognize and extract text from images
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Face detection
-      description: >
-        Detect faces and facial landmarks
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-
-  - items:
-    - heading: >
-        Barcode scanning
-      description: >
-        Scan and process barcodes
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Landmark detection
-      description: >
-        Identify popular landmarks in an image
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-    - heading: >
-        Smart reply
-      description: >
-        Provide suggested text snippet that fits context
-      icon:
-        path: ../images/landing-page/assistant_logo.png
-      path: /lite/image/labeling/
-
-  # Custom models
-  - classname: >
-      devsite-landing-row-no-image-background
-      devsite-landing-row-50
-      devsite-landing-row-large-headings
-    foreground: theme
-    background: grey
-    items:
-    - heading: Custom models
-      description: >
-        <p>If models don’t cover your use cases, you can always
-        bring your own existing TensorFlow Lite models. Just upload your model,
-        and we’ll take care of hosting and serving it to your app.</p>
-
-        <p>Models acts as an API layer to your custom model, making it easy to
-        run and use. In addition to deploying your models, we are releasing an
-        experimental model compression flow that aims to reduce model size (up
-        to orders of magnitudes) while maintaining similar accuracy. Sign up at
-        <a href="https://g.co/firebase/signup">g.co/firebase/signup</a></p>
-
-        <p>And if you’re new to machine learning and want more information on
-        custom models for mobile, you can <a
-        href="//www.tensorflow.org/lite/">learn more about TensorFlow
-        Lite.</a></p>
-      image_path: /resources/images/tflite-card-16x9.png
-      image_left: true
-  - classname: devsite-landing-row-large-headings
-    foreground: theme
-    items:
-    - heading: Just the beginning
-      description: >
-        Our ultimate goal is to reduce idea–to–implementation cycles and make AI
-        an essential and intuitive part of a developer's toolkit. We will do so
-        by continuing to add new Base APIs that leverage Google’s machine
-        learning expertise. Base APIs will ultimately cover significantly more
-        use cases in the vision, speech, and text fields. We will also continue
-        to simplify use of custom models, adding tools to deploy, compress, and
-        create them.
diff --git a/tensorflow/lite/g3doc/models/image/label/ios.md b/tensorflow/lite/g3doc/models/image/label/ios.md
deleted file mode 100644
index 904c6450ac7272e67c1982b56099b608b91e2237..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/models/image/label/ios.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# iOS
-
-lorem
diff --git a/tensorflow/lite/g3doc/models/image/label/overview.md b/tensorflow/lite/g3doc/models/image/label/overview.md
deleted file mode 100644
index b3d9133bb2123012f2ddd2db768347305d224744..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/models/image/label/overview.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Overview
-
-Image labeling gives you insight into the content of images. When you use the
-API, you get a list of the entities that were recognized: people, things,
-places, activities, and so on. Each label found comes with a score that
-indicates the confidence the ML model has in its relevance. With this
-information, you can perform tasks such as automatic metadata generation
-and content moderation.
diff --git a/tensorflow/lite/g3doc/models/image/label/android.md b/tensorflow/lite/g3doc/models/image_classification/android.md
similarity index 85%
rename from tensorflow/lite/g3doc/models/image/label/android.md
rename to tensorflow/lite/g3doc/models/image_classification/android.md
index c755328ac059013d2d45bbeb3c67516dafbb0ff1..61606096f77ce810c2b1a686cd05599b35200f57 100644
--- a/tensorflow/lite/g3doc/models/image/label/android.md
+++ b/tensorflow/lite/g3doc/models/image_classification/android.md
@@ -3,21 +3,20 @@
 This tutorial provides a simple Android mobile application to classify images
 using the Android device camera. In this tutorial, you will download the demo
 application from the Tensorflow repository, build it on your computer, and
-install it on your Android Device. You will also learn how to customize the
+install it on your Android device. You will also learn how to customize the
 application to suit your requirements.
 
 ### Prerequisites
 
 *   Android Studio 3.2 (installed on a Linux, Mac or Windows machine)
 
-*   Android Device
+*   Android device
 
 *   USB cable (to connect Android device to your computer)
 
 ### Step 1. Clone the TensorFlow source code
 
-First, we clone the GitHub repository on the computer in a folder to get the
-demo application.
+Clone the GitHub repository to your computer to get the demo application.
 
 ```
 
@@ -29,21 +28,21 @@ Open the TensorFlow source code in Android Studio. To do this, open Android
 Studio and select `Open an existing project` setting the folder to
 `tensorflow/lite/examples/android`
 
-![Step 1](images/classifydemo_img1.png)
+<img src="images/classifydemo_img1.png" />
 
 This folder contains the demo application for image classification, object
 detection, and speech hotword detection.
 
 ### Step 2. Build the Android Studio project
 
-In this step, Select `Build -> Make Project` and check that the project builds
+Select `Build -> Make Project` and check that the project builds
 successfully. You will need Android SDK configured in the settings. You'll need
 at least SDK version 23. The gradle file will prompt you to download any missing
 libraries.
 
-![Step 2](images/classifydemo_img4.png)
+<img src="images/classifydemo_img4.png" style="width: 40%" />
 
-![Step 2a](images/classifydemo_img2.png)
+<img src="images/classifydemo_img2.png" style="width: 60%" />
 
 #### TensorFlow Lite AAR from JCenter:
 
@@ -53,25 +52,25 @@ build.
 If you see a build error related to compatibility with Tensorflow Lite's Java
 API (example: method X is undefined for type Interpreter), there has likely been
 a backwards compatible change to the API. You will need to pull new app code
-that's compatible with the nightly build by running git pull.
+that's compatible with the nightly build by running `git pull`.
 
-### Step 3. Install and Run the app
+### Step 3. Install and run the app
 
-Connect the Android device to the computer, and be sure to approve any ADB
+Connect the Android device to the computer and be sure to approve any ADB
 permission prompts that appear on your phone. Select `Run -> Run app.` Select
-the deployment target in the connected devices to the device on which app will
+the deployment target in the connected devices to the device on which the app will
 be installed. This will install the app on the device.
 
-![Step 3](images/classifydemo_img5.png)
+<img src="images/classifydemo_img5.png" style="width: 60%" />
 
-![Step 3a](images/classifydemo_img6.png)
+<img src="images/classifydemo_img6.png" style="width: 70%" />
 
-![Step 3b](images/classifydemo_img7.png)
+<img src="images/classifydemo_img7.png" style="width: 40%" />
 
-![Step 3c](images/classifydemo_img8.png)
+<img src="images/classifydemo_img8.png" style="width: 80%" />
 
-To test the app, open the app named `TFL Classify` on the device. When you run
-the app first time, the app will request permission to access the camera.
+To test the app, open the app called `TFL Classify` on your device. When you run
+the app the first time, the app will request permission to access the camera.
 Re-installing the app may require you to uninstall the previous installations.
 
 ## Understanding Android App Code
@@ -85,7 +84,7 @@ This file depends on `AndroidManifest.xml` in the folder
 `tensorflow/tensorflow/lite/examples/android/app/src/main` to set the camera
 orientation.
 
-### Pre-process of bitmap image
+### Pre-process bitmap image
 
 The mobile application code that pre-processes the images and runs inference is
 in
@@ -102,7 +101,7 @@ DIM_PIXEL_SIZE);
 c.imgData.order(ByteOrder.nativeOrder());
 ```
 
-While running the application, we preprocess the incoming bitmap images from the
+While running the application, we pre-process the incoming bitmap images from the
 camera to a Bytebuffer. Since this model is quantized 8-bit, we will put a
 single byte for each channel. `imgData` will contain an encoded `Color` for each
 pixel in ARGB format, so we need to mask the least significant 8 bits to get
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/android_banana.png b/tensorflow/lite/g3doc/models/image_classification/images/android_banana.png
new file mode 100644
index 0000000000000000000000000000000000000000..a25dffe3a070a8af509efab823193de3d2f80f49
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/android_banana.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img1.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img2.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img4.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img5.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img6.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img7.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
diff --git a/tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
similarity index 100%
rename from tensorflow/lite/g3doc/models/image/label/images/classifydemo_img8.png
rename to tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/dog.png b/tensorflow/lite/g3doc/models/image_classification/images/dog.png
new file mode 100644
index 0000000000000000000000000000000000000000..65c6eb551468be3b53dc805009688c7b5808c660
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/dog.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/ios.md b/tensorflow/lite/g3doc/models/image_classification/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..63e3abd779355b842964ae8836f24a1cd7a8832f
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/ios.md
@@ -0,0 +1,229 @@
+# TensorFlow Lite iOS Image Classifier App Example
+
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your needs.
+
+## Prerequisites
+
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
+
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
+
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
+
+*   You'll also need the Xcode command-line tools:
+
+        xcode-select --install
+
+    If this is a new install, you will need to run the Xcode application once to
+    agree to the license before continuing.
+
+*   Install CocoaPods if you don't have it:
+
+        sudo gem install cocoapods
+
+### Step 1. Clone the TensorFlow source code
+
+lone the GitHub repository onto your computer to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
+
+Run the following command to install TensorFlow Lite pod:
+
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
+
+If you have installed this pod before and that command doesn't work, try
+
+```
+    pod repo update
+```
+
+### Step 3. Build the XCode project
+
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
+
+```
+    open tflite_camera_example.xcworkspace
+```
+
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
+
+![pre-pend your name to the bundle identifier](images/bundle_identifier.png)
+
+Plug in your iOS device. Note that the app must be executed with a real device with
+a camera. Select the iOS device from the drop-down menu.
+
+![Device selection](images/device_selection.png)
+
+Click the "Run" button to build and run the app
+
+![Build and execute](images/build_and_execute.png)
+
+Note that, as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app onto a device.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses a quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed that a quantized model is used.
+
+### Pre-process bitmap image
+
+The MobileNet model that we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note that the code pre-processes and prepares the model input from the camera
+data. Therefore, the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After pre-processing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is from 0-255.
+We need to convert the value to float to get the probabilities with a value range from
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/models/image_classification/overview.md b/tensorflow/lite/g3doc/models/image_classification/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ddbaf43ef092456b23d65684e1fdd7609c58472
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/overview.md
@@ -0,0 +1,285 @@
+# Image classification
+
+<img src="../images/image.png" class="attempt-right">
+
+Use a pre-trained and optimized model to identify hundreds of classes of
+objects, including people, activities, animals, plants, and places.
+
+## Get started
+
+If you are unfamiliar with the concept of image classification, you should start
+by reading <a href="#what_is_image_classification">What is image
+classification?</a>
+
+If you understand image classification, you’re new to TensorFlow Lite, and
+you’re working with Android or iOS, we recommend following the corresponding
+tutorial that will walk you through our sample code.
+
+<a class="button button-primary" href="android.md">Android</a>
+<a class="button button-primary" href="ios.md">iOS</a>
+
+We also provide <a href="example_applications">example applications</a> you can
+use to get started.
+
+If you are using a platform other than Android or iOS, or you are already
+familiar with the <a href="https://www.tensorflow.org/api_docs/python/tf/lite">TensorFlow Lite APIs</a>, you can
+download our starter image classification model and the accompanying labels.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_quant_and_labels.zip">Download
+starter model and labels</a>
+
+Once you have the starter model running on your target device, you can
+experiment with different models to find the optimal balance between
+performance, accuracy, and model size. For guidance, see
+<a href="#choose_a_different_model">Choose a different model</a>.
+
+If you are using a platform other than Android or iOS, or you are already
+familiar with the <a href="https://www.tensorflow.org/api_docs/python/tf/lite">TensorFlow Lite APIs</a>, you can
+download our starter image classification model and the accompanying labels.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_quant_and_labels.zip">Download
+starter model and labels</a>
+
+### Example applications
+
+We have example applications for image classification for both Android and iOS.
+
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">Android
+example</a>
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios.md">iOS
+example</a>
+
+The following screenshot shows the Android image classification example:
+
+<img src="images/android_banana.png" alt="Screenshot of Android example" width="30%">
+
+## What is image classification?
+
+A common use of machine learning is to identify what an image represents. For
+example, we might want to know what type of animal appears in the following
+photograph.
+
+<img src="images/dog.png" alt="dog" width="50%">
+
+The task of predicting what an image represents is called _image
+classification_. An image classification model is trained to recognize various
+classes of images. For example, a model might be trained to recognize photos
+representing three different types of animals: rabbits, hamsters, and dogs.
+
+When we subsequently provide a new image as input to the model, it will output
+the probabilities of the image representing each of the types of animal it was
+trained on. An example output might be as follows:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Animal type</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>Hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">Dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+Based on the output, we can see that the classification model has predicted that
+the image has a high probability of representing a dog.
+
+Note: Image classification can only tell you the probability that an image
+represents one or more of the classes that the model was trained on. It cannot
+tell you the position or identity of objects within the image. If you need to
+identify objects and their positions within images, you should use an
+<a href="../object_detection/overview.md">object detection</a> model.
+
+### Training, labels, and inference
+
+During training, an image classification model is fed images and their
+associated _labels_. Each label is the name of a distinct concept, or class,
+that the model will learn to recognize.
+
+Given sufficient training data (often hundreds or thousands of images per
+label), an image classification model can learn to predict whether new images
+belong to any of the classes it has been trained on. This process of prediction
+is called _inference_.
+
+To perform inference, an image is passed as input to a model. The model will
+then output an array of probabilities between 0 and 1. With our example model,
+this process might look like the following:
+
+<table style="width: 60%">
+  <tr style="border-top: 0px;">
+    <td style="width: 40%"><img src="images/dog.png" alt="dog"></td>
+    <td style="width: 20%; font-size: 2em; vertical-align: middle; text-align: center;">→</td>
+    <td style="width: 40%; vertical-align: middle; text-align: center;">[0.07, 0.02, 0.91]</td>
+</table>
+
+Each number in the output corresponds to a label in our training data.
+Associating our output with the three labels the model was trained on, we can
+see the model has predicted a high probability that the image represents a dog.
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+You might notice that the sum of all the probabilities (for rabbit, hamster, and
+dog) is equal to 1. This is a common type of output for models with multiple
+classes (see
+<a href="https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax">Softmax</a>
+for more information).
+
+### Ambiguous results
+
+Since the probabilities will always sum to 1, if the image is not confidently
+recognized as belonging to any of the classes the model was trained on you may
+see the probability distributed throughout the labels without any one value
+being significantly larger.
+
+For example, the following might indicate an ambiguous result:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.31</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.35</td>
+    </tr>
+    <tr>
+      <td>dog</td>
+      <td>0.34</td>
+    </tr>
+  </tbody>
+</table>
+
+### Uses and limitations
+
+The image classification models that we provide are useful for single-label
+classification, which means predicting which single label the image is most
+likely to represent. They are trained to recognize 1000 classes of image. For a
+full list of classes, see the labels file in the
+<a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_quant_and_labels.zip">model
+zip</a>.
+
+If you want to train a model to recognize new classes, see
+<a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting the type and position of one or more objects within an image (see <a href="../object_detection/overview.md">object detection</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="../segmentation/overview.md">segmentation</a>)</li>
+</ul>
+
+Once you have the starter model running on your target device, you can
+experiment with different models to find the optimal balance between
+performance, accuracy, and model size. For guidance, see
+<a href="#choose_a_different_model">Choose a different model</a>.
+
+## Choose a different model
+
+There are a large number of image classification models available on our
+<a href="../../guide/hosted_models.md">List of hosted models</a>. You should aim to choose the
+optimal model for your application based on performance, accuracy and model
+size. There are trade-offs between each of them.
+
+### Performance
+
+We measure performance in terms of the amount of time it takes for a model to
+run inference on a given piece of hardware. The less time, the faster the model.
+
+The performance you require depends on your application. Performance can be
+important for applications like real-time video, where it may be important to
+analyze each frame in the time before the next frame is drawn (e.g. inference
+must be faster than 33ms to perform real-time inference on a 30fps video
+stream).
+
+Our quantized Mobilenet models’ performance ranges from 3.7ms to 80.3 ms.
+
+### Accuracy
+
+We measure accuracy in terms of how often the model correctly classifies an
+image. For example, a model with a stated accuracy of 60% can be expected to
+classify an image correctly an average of 60% of the time.
+
+Our <a href="../../guide/hosted_models.md">list of hosted models</a> provides Top-1 and Top-5
+accuracy statistics. Top-1 refers to how often the correct label appears as the
+label with the highest probability in the model’s output. Top-5 refers to how
+often the correct label appears in the top 5 highest probabilities in the
+model’s output.
+
+Our quantized Mobilenet models’ Top-5 accuracy ranges from 64.4 to 89.9%.
+
+### Size
+
+The size of a model on-disk varies with its performance and accuracy. Size may
+be important for mobile development (where it might impact app download sizes)
+or when working with hardware (where available storage might be limited).
+
+Our quantized Mobilenet models’ size ranges from 0.5 to 3.4 Mb.
+
+### Architecture
+
+There are several different architectures of models available on
+<a href="../../guide/hosted_models.md">List of hosted models</a>, indicated by the model’s name.
+For example, you can choose between Mobilenet, Inception, and others.
+
+The architecture of a model impacts its performance, accuracy, and size. All of
+our hosted models are trained on the same data, meaning you can use the provided
+statistics to compare them and choose which is optimal for your application.
+
+Note: The image classification models we provide accept varying sizes of input. For some models, this is indicated in the filename. For example, the Mobilenet_V1_1.0_224 model accepts an input of 224x224 pixels. <br /><br />All of the models require three color channels per pixel (red, green, and blue). Quantized models require 1 byte per channel, and float models require 4 bytes per channel.<br /><br />Our <a href="android.md">Android</a> and <a href="ios.md">iOS</a> code samples demonstrate how to process full-sized camera images into the required format for each model.
+
+## Customize model
+
+The pre-trained models we provide are trained to recognize 1000 classes of
+image. For a full list of classes, see the labels file in the
+<a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_quant_and_labels.zip">model
+zip</a>.
+
+You can use a technique known as _transfer learning_ to re-train a model to
+recognize classes not in the original set. For example, you could re-train the
+model to distinguish between different species of tree, despite there being no
+trees in the original training data. To do this, you will need a set of training
+images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in the
+<a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/">TensorFlow
+for Poets</a> codelab.
diff --git a/tensorflow/lite/g3doc/models/images/audio.png b/tensorflow/lite/g3doc/models/images/audio.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce6b25c442016a21600eb8249eafa55bacbba4e9
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/audio.png differ
diff --git a/tensorflow/lite/g3doc/models/images/blank.png b/tensorflow/lite/g3doc/models/images/blank.png
new file mode 100644
index 0000000000000000000000000000000000000000..d099da5da07271410883554e07e37765ca048590
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/blank.png differ
diff --git a/tensorflow/lite/g3doc/models/images/camera.png b/tensorflow/lite/g3doc/models/images/camera.png
new file mode 100644
index 0000000000000000000000000000000000000000..95a9218d47864aba12255bd32b67bb74b0d6704e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/camera.png differ
diff --git a/tensorflow/lite/g3doc/models/images/detection.png b/tensorflow/lite/g3doc/models/images/detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..30e10f59cd53af21fe9b6a86aa5b45ca07131b1b
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/detection.png differ
diff --git a/tensorflow/lite/g3doc/models/images/image.png b/tensorflow/lite/g3doc/models/images/image.png
new file mode 100644
index 0000000000000000000000000000000000000000..e72aac9b25eec69e8c0252f441d125340b88cab5
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/image.png differ
diff --git a/tensorflow/lite/g3doc/models/images/object.png b/tensorflow/lite/g3doc/models/images/object.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa8ed428ed15e7b166bdde560669563a224e6f6c
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/object.png differ
diff --git a/tensorflow/lite/g3doc/models/images/output_stride.png b/tensorflow/lite/g3doc/models/images/output_stride.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d4663f8675eef733e18b2a5cb05670cd40d8293
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/output_stride.png differ
diff --git a/tensorflow/lite/g3doc/models/images/pose.png b/tensorflow/lite/g3doc/models/images/pose.png
new file mode 100644
index 0000000000000000000000000000000000000000..f071d789963d0f48efb5ba20633391403f75ddf8
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/pose.png differ
diff --git a/tensorflow/lite/g3doc/models/images/segmentation.png b/tensorflow/lite/g3doc/models/images/segmentation.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c61330687cc9a388a443bc6b771027d15b66d98
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/segmentation.png differ
diff --git a/tensorflow/lite/g3doc/models/images/sentiment.png b/tensorflow/lite/g3doc/models/images/sentiment.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ba494fcb6e62a90015d2aead4779fcacab70529
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/sentiment.png differ
diff --git a/tensorflow/lite/g3doc/models/images/smart_reply.png b/tensorflow/lite/g3doc/models/images/smart_reply.png
new file mode 100644
index 0000000000000000000000000000000000000000..802cc80feebe2a46b059b23d52ccf794701e4d99
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/smart_reply.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tabular.png b/tensorflow/lite/g3doc/models/images/tabular.png
new file mode 100644
index 0000000000000000000000000000000000000000..2eac8f4c4ac74029c755a207b1f8a25592f468ac
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tabular.png differ
diff --git a/tensorflow/lite/g3doc/models/images/text.png b/tensorflow/lite/g3doc/models/images/text.png
new file mode 100644
index 0000000000000000000000000000000000000000..227594f07e3d38fd4110249eb2c4c6541fb89baa
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/text.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tflite_models.png b/tensorflow/lite/g3doc/models/images/tflite_models.png
new file mode 100644
index 0000000000000000000000000000000000000000..f60cd26a3177f95e40875ed92aa4a30c59a7623f
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tflite_models.png differ
diff --git a/tensorflow/lite/g3doc/models/images/video.png b/tensorflow/lite/g3doc/models/images/video.png
new file mode 100644
index 0000000000000000000000000000000000000000..88b3b7d3c76840625abec821220413a03d384a45
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/video.png differ
diff --git a/tensorflow/lite/g3doc/models/object_detection/images/android_apple_banana.png b/tensorflow/lite/g3doc/models/object_detection/images/android_apple_banana.png
new file mode 100644
index 0000000000000000000000000000000000000000..f7a9fe5af89f89772248e8ea8d89904719d310dd
Binary files /dev/null and b/tensorflow/lite/g3doc/models/object_detection/images/android_apple_banana.png differ
diff --git a/tensorflow/lite/g3doc/models/object_detection/images/false_positive.png b/tensorflow/lite/g3doc/models/object_detection/images/false_positive.png
new file mode 100644
index 0000000000000000000000000000000000000000..39d2103a3a8749233ebfab9fff3fea533f0f00fd
Binary files /dev/null and b/tensorflow/lite/g3doc/models/object_detection/images/false_positive.png differ
diff --git a/tensorflow/lite/g3doc/models/object_detection/overview.md b/tensorflow/lite/g3doc/models/object_detection/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..ffa6381ef3df28d7b035d585a2813496ea2ea7e0
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/object_detection/overview.md
@@ -0,0 +1,270 @@
+# Object detection
+
+<img src="../images/detection.png" class="attempt-right">
+
+Detect multiple objects within an image, with bounding boxes. Recognize 80
+different classes of objects.
+
+## Get started
+
+If you are new to TensorFlow Lite and are working with Android or iOS, we
+recommend exploring the following example applications that can help you get
+started.
+
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android">Android
+example</a>
+<a class="button button-primary" href="https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios">iOS
+example</a>
+
+If you are using a platform other than Android or iOS, or you are already
+familiar with the <a href="https://www.tensorflow.org/api_docs/python/tf/lite">TensorFlow Lite APIs</a>, you can
+download our starter object detection model and the accompanying labels.
+
+<a class="button button-primary" href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download
+starter model and labels</a>
+
+For more information about the starter model, see
+<a href="#starter_model">Starter model</a>.
+
+## What is object detection?
+
+Given an image or a video stream, an object detection model can identify which
+of a known set of objects might be present and provide information about their
+positions within the image.
+
+For example, this screenshot of our <a href="#get_started">example
+application</a> shows how two objects have been recognized and their positions
+annotated:
+
+<img src="images/android_apple_banana.png" alt="Screenshot of Android example" width="30%">
+
+An object detection model is trained to detect the presence and location of
+multiple classes of objects. For example, a model might be trained with images
+that contain various pieces of fruit, along with a _label_ that specifies the
+class of fruit they represent (e.g. an apple, a banana, or a strawberry), and
+data specifying where each object appears in the image.
+
+When we subsequently provide an image to the model, it will output a list of the
+objects it detects, the location of a bounding box that contains each object,
+and a score that indicates the confidence that detection was correct.
+
+### Model output
+
+Imagine a model has been trained to detect apples, bananas, and strawberries.
+When we pass it an image, it will output a set number of detection results - in
+this example, 5.
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Apple</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Banana</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Strawberry</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td>Banana</td>
+      <td>0.23</td>
+      <td>[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td>Apple</td>
+      <td>0.11</td>
+      <td>[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+### Confidence score
+
+To interpret these results, we can look at the score and the location for each
+detected object. The score is a number between 0 and 1 that indicates confidence
+that the object was genuinely detected. The closer the number is to 1, the more
+confident the model is.
+
+Depending on your application, you can decide a cut-off threshold below which
+you will discard detection results. For our example, we might decide a sensible
+cut-off is a score of 0.5 (meaning a 50% probability that the detection is
+valid). In that case, we would ignore the last two objects in the array, because
+those confidence scores are below 0.5:
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Apple</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Banana</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Strawberry</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Banana</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.23</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Apple</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.11</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+The cut-off you use should be based on whether you are more comfortable with
+false positives (objects that are wrongly identified, or areas of the image that
+are erroneously identified as objects when they are not), or false negatives
+(genuine objects that are missed because their confidence was low).
+
+For example, in the following image, a pear (which is not an object that the
+model was trained to detect) was misidentified as a "person". This is an example
+of a false positive that could be ignored by selecting an appropriate cut-off.
+In this case, a cut-off of 0.6 (or 60%) would comfortably exclude the false
+positive.
+
+<img src="images/false_positive.png" alt="Screenshot of Android example showing a false positive" width="30%">
+
+### Location
+
+For each detected object, the model will return an array of four numbers
+representing a bounding rectangle that surrounds its position. For the starter
+model we provide, the numbers are ordered as follows:
+
+<table style="width: 50%; margin: 0 auto;">
+  <tbody>
+    <tr style="border-top: none;">
+      <td>[</td>
+      <td>top,</td>
+      <td>left,</td>
+      <td>bottom,</td>
+      <td>right</td>
+      <td>]</td>
+    </tr>
+  </tbody>
+</table>
+
+The top value represents the distance of the rectangle’s top edge from the top
+of the image, in pixels. The left value represents the left edge’s distance from
+the left of the input image. The other values represent the bottom and right
+edges in a similar manner.
+
+Note: Object detection models accept input images of a specific size. This is likely to be different from the size of the raw image captured by your device’s camera, and you will have to write code to crop and scale your raw image to fit the model’s input size (there are examples of this in our <a href="#get_started">example applications</a>).<br /><br />The pixel values output by the model refer to the position in the cropped and scaled image, so you must scale them to fit the raw image in order to interpret them correctly.
+
+## Starter model
+
+We recommend starting with this pre-trained quantized COCO SSD MobileNet v1
+model.
+
+<a class="button button-primary" href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download
+starter model and labels</a>
+
+### Uses and limitations
+
+The object detection model we provide can identify and locate up to 10 objects
+in an image. It is trained to recognize 80 classes of object. For a full list of
+classes, see the labels file in the
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">model
+zip</a>.
+
+If you want to train a model to recognize new classes, see
+<a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting which single label the image most likely represents (see <a href="../image_classification/overview.md">image classification</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="../segmentation/overview.md">segmentation</a>)</li>
+</ul>
+
+### Input
+
+The model takes an image as input. The expected image is 300x300 pixels, with
+three channels (red, blue, and green) per pixel. This should be fed to the model
+as a flattened buffer of 270,000 byte values (300x300x3). Since the model is
+<a href="../../performance/post_training_quantization.md">quantized</a>, each
+value should be a single byte representing a value between 0 and 255.
+
+### Output
+
+The model outputs four arrays, mapped to the indices 0-4. Arrays 0, 1, and 2
+describe 10 detected objects, with one element in each array corresponding to
+each object. There will always be 10 objects detected.
+
+<table>
+  <thead>
+    <tr>
+      <th>Index</th>
+      <th>Name</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>Locations</td>
+      <td>Multidimensional array of [10][4] floating point values between 0 and 1, the inner arrays representing bounding boxes in the form [top, left, bottom, right]</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>Classes</td>
+      <td>Array of 10 integers (output as floating point values) each indicating the index of a class label from the labels file</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>Scores</td>
+      <td>Array of 10 floating point values between 0 and 1 representing probability that a class was detected</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>Number and detections</td>
+      <td>Array of length 1 containing a floating point value expressing the total number of detection results</td>
+    </tr>
+  </tbody>
+</table>
+
+## Customize model
+
+The pre-trained models we provide are trained to detect 80 classes of object.
+For a full list of classes, see the labels file in the
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">model
+zip</a>.
+
+You can use a technique known as transfer learning to re-train a model to
+recognize classes not in the original set. For example, you could re-train the
+model to detect multiple types of vegetable, despite there only being one
+vegetable in the original training data. To do this, you will need a set of
+training images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in
+<a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193">Training
+and serving a real-time mobile object detector in 30 minutes</a>.
diff --git a/tensorflow/lite/g3doc/models/pose_estimation/overview.md b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..981a2553f701ac0302e880dd5445defa14bece37
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
@@ -0,0 +1,153 @@
+# Pose estimation
+
+<img src="../images/pose.png" class="attempt-right" />
+
+## Get started
+
+_PoseNet_ is a vision model that can be used to estimate the pose of a person in
+an image or video by estimating where key body joints are.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite">Download
+starter model</a>
+
+Android and iOS end-to-end tutorials are coming soon. In the meantime, if you
+want to experiment this on a web browser, check out the
+<a href="https://github.com/tensorflow/tfjs-models/tree/master/posenet">TensorFlow.js
+GitHub repository</a>.
+
+## How it works
+
+Pose estimation refers to computer vision techniques that detect human figures
+in images and videos, so that one could determine, for example, where someone’s
+elbow shows up in an image.
+
+To be clear, this technology is not recognizing who is in an image. The
+algorithm is simply estimating where key body joints are.
+
+The key points detected are indexed by "Part ID", with a confidence score
+between 0.0 and 1.0, 1.0 being the highest.
+
+<table style="width: 30%;">
+  <thead>
+    <tr>
+      <th>Id</th>
+      <th>Part</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>nose</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>leftEye</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>rightEye</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>leftEar</td>
+    </tr>
+    <tr>
+      <td>4</td>
+      <td>rightEar</td>
+    </tr>
+    <tr>
+      <td>5</td>
+      <td>leftShoulder</td>
+    </tr>
+    <tr>
+      <td>6</td>
+      <td>rightShoulder</td>
+    </tr>
+    <tr>
+      <td>7</td>
+      <td>leftElbow</td>
+    </tr>
+    <tr>
+      <td>8</td>
+      <td>rightElbow</td>
+    </tr>
+    <tr>
+      <td>9</td>
+      <td>leftWrist</td>
+    </tr>
+    <tr>
+      <td>10</td>
+      <td>rightWrist</td>
+    </tr>
+    <tr>
+      <td>11</td>
+      <td>leftHip</td>
+    </tr>
+    <tr>
+      <td>12</td>
+      <td>rightHip</td>
+    </tr>
+    <tr>
+      <td>13</td>
+      <td>leftKnee</td>
+    </tr>
+    <tr>
+      <td>14</td>
+      <td>rightKnee</td>
+    </tr>
+    <tr>
+      <td>15</td>
+      <td>leftAnkle</td>
+    </tr>
+    <tr>
+      <td>16</td>
+      <td>rightAnkle</td>
+    </tr>
+  </tbody>
+</table>
+
+## Example output
+
+<img alt="Animation showing pose estimation" src="https://www.tensorflow.org/images/models/pose_estimation.gif" />
+
+## How it performs
+
+Performance varies based on your device and output stride (heatmaps and offset
+vectors). The PoseNet model is image size invariant, which means it can predict
+pose positions in the same scale as the original image regardless of whether the
+image is downscaled. This means PoseNet can be configured to have a higher
+accuracy at the expense of performance.
+
+The output stride determines how much we’re scaling down the output relative to
+the input image size. It affects the size of the layers and the model outputs.
+The higher the output stride, the smaller the resolution of layers in the
+network and the outputs, and correspondingly their accuracy. In this
+implementation, the output stride can have values of 8, 16, or 32. In other
+words, an output stride of 32 will result in the fastest performance but lowest
+accuracy, while 8 will result in the highest accuracy but slowest performance.
+We recommend starting with 16.
+
+The following image shows how the output stride determines how much we’re
+scaling down the output relative to the input image size. A higher output stride
+is faster but results in lower accuracy.
+
+<img alt="Output stride and heatmap resolution" src="../images/output_stride.png" >
+
+## Read more about pose estimation
+
+<ul>
+  <li><a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">Blog post: Real-time Human Pose Estimation in the Browser with TensorFlow.js</a></li>
+  <li><a href="https://github.com/tensorflow/tfjs-models/tree/master/posenet">TF.js GitHub: Pose Detection in the Browser: PoseNet Model</a></li>
+</ul>
+
+### Use cases
+
+<ul>
+  <li><a href="https://vimeo.com/128375543">‘PomPom Mirror’</a></li>
+  <li><a href="https://youtu.be/I5__9hq-yas">Amazing Art Installation Turns You Into A Bird | Chris Milk "The Treachery of Sanctuary"</a></li>
+  <li><a href="https://vimeo.com/34824490">Puppet Parade - Interactive Kinect Puppets</a></li>
+  <li><a href="https://vimeo.com/2892576">Messa di Voce (Performance), Excerpts</a></li>
+  <li><a href="https://www.instagram.com/p/BbkKLiegrTR/">Augmented reality</a></li>
+  <li><a href="https://www.instagram.com/p/Bg1EgOihgyh/">Interactive animation</a></li>
+  <li><a href="https://www.runnersneed.com/expert-advice/gear-guides/gait-analysis.html">Gait analysis</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e664adf700b396b9cd06d48378b782efbbca4282
Binary files /dev/null and b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif differ
diff --git a/tensorflow/lite/g3doc/models/segmentation/overview.md b/tensorflow/lite/g3doc/models/segmentation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bd268ada1fcefbdad39c9951c0d471e32b16b03
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/segmentation/overview.md
@@ -0,0 +1,43 @@
+# Segmentation
+
+<img src="../images/segmentation.png" class="attempt-right" />
+
+## Get started
+
+_DeepLab_ is a state-of-art deep learning model for semantic image segmentation,
+where the goal is to assign semantic labels (e.g. person, dog, cat) to every
+pixel in the input image.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite">Download
+starter model</a>
+
+## How it works
+
+Semantic image segmentation predicts whether each pixel of an image is
+associated with a certain class. This is in contrast to
+<a href="../object_detection/overview.md">object detection</a>, which detects
+objects in rectangular regions, and
+<a href="../image_classification/overview.md">image classification</a>, which
+classifies the overall image.
+
+The current implementation includes the following features:
+<ol>
+  <li>DeepLabv1: We use atrous convolution to explicitly control the resolution at which feature responses are computed within Deep Convolutional Neural Networks.</li>
+  <li>DeepLabv2: We use atrous spatial pyramid pooling (ASPP) to robustly segment objects at multiple scales with filters at multiple sampling rates and effective fields-of-views.</li>
+  <li>DeepLabv3: We augment the ASPP module with image-level feature [5, 6] to capture longer range information. We also include batch normalization [7] parameters to facilitate the training. In particular, we applying atrous convolution to extract output features at different output strides during training and evaluation, which efficiently enables training BN at output stride = 16 and attains a high performance at output stride = 8 during evaluation.</li>
+  <li>DeepLabv3+: We extend DeepLabv3 to include a simple yet effective decoder module to refine the segmentation results especially along object boundaries. Furthermore, in this encoder-decoder structure one can arbitrarily control the resolution of extracted encoder features by atrous convolution to trade-off precision and runtime.</li>
+</ol>
+
+## Example output
+
+The model will create a mask over the target objects with high accuracy.
+
+<img alt="Animation showing image segmentation" src="images/segmentation.gif" />
+
+## Read more about segmentation
+
+<ul>
+  <li><a href="https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html">Semantic Image Segmentation with DeepLab in TensorFlow</a></li>
+  <li><a href="https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7">TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)</a></li>
+  <li><a href="https://github.com/tensorflow/models/tree/master/research/deeplab">DeepLab: Deep Labelling for Semantic Image Segmentation</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a61691fd8714102409d290e7f6d6e361d9cbf13
Binary files /dev/null and b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif differ
diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..20c359ec9ff9c79d48df8f2af381d98e27a5cc84
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md
@@ -0,0 +1,52 @@
+# Smart reply
+
+<img src="../images/smart_reply.png" class="attempt-right" />
+
+## Get started
+
+Our smart reply model generates reply suggestions based on chat messages. The
+suggestions are intended to be contextually relevant, one-touch responses that
+help the user to easily reply to an incoming message.
+
+<a class="button button-primary" href="http://download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip">Download
+starter model and labels</a>
+
+### Sample application
+
+We have provided a pre-built APK that demonstrates the smart reply model on
+Android.
+
+Go to the
+<a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc">GitHub
+page</a> for instructions and list of supported ops and functionalities.
+
+## How it works
+
+The model generates reply suggestions to conversational chat messages.
+
+The on-device model comes with several benefits. It is:
+<ul>
+  <li>Fast: The model resides on the device and does not require internet connectivity. Thus, inference is very fast and has an average latency of only a few milliseconds.</li>
+  <li>Resource efficient: The model has a small memory footprint on the device.</li>
+  <li>Privacy-friendly: User data never leaves the device.</li>
+</ul>
+
+## Example output
+
+<img alt="Animation showing smart reply" src="images/smart_reply.gif" />
+
+## Read more about this
+
+<ul>
+  <li><a href="https://arxiv.org/pdf/1708.00630.pdf">Research paper</a></li>
+  <li><a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/">Source code</a></li>
+</ul>
+
+## Users
+
+<ul>
+  <li><a href="https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/">Gmail</a></li>
+  <li><a href="https://www.blog.google/products/gmail/computer-respond-to-this-email/">Inbox</a></li>
+  <li><a href="https://blog.google/products/allo/google-allo-smarter-messaging-app/">Allo</a></li>
+  <li><a href="https://research.googleblog.com/2017/02/on-device-machine-intelligence.html">Smart Replies on Android Wear</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/performance/benchmarks.md b/tensorflow/lite/g3doc/performance/benchmarks.md
index 5a1e5586beecad4876c9d0390a0fa31e78705195..a51fdb4080718d7d9dac9a01ee4935a1631933d6 100644
--- a/tensorflow/lite/g3doc/performance/benchmarks.md
+++ b/tensorflow/lite/g3doc/performance/benchmarks.md
@@ -1,5 +1,4 @@
-
-# Performance
+# Performance benchmarks
 
 This document lists TensorFlow Lite performance benchmarks when running well
 known models on some Android and iOS devices.
@@ -42,7 +41,7 @@ Pixel xl | 0c |
     <tr>
       <th>Model Name</th>
       <th>Device </th>
-      <th>Mean inference time (std dev)</th>
+      <th>Mean inference time</th>
     </tr>
   </thead>
   <tr>
@@ -50,66 +49,66 @@ Pixel xl | 0c |
       <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
     </td>
     <td>Pixel 2 </td>
-    <td>166.5 ms (2.6 ms)</td>
+    <td>123.3 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>122.9 ms (1.8 ms)  </td>
+     <td>Pixel XL </td>
+     <td>113.3 ms</td>
   </tr>
   <tr>
     <td rowspan = 2>
       <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz">Mobilenet_1.0_224 (quant)</a>
     </td>
     <td>Pixel 2 </td>
-    <td>69.5 ms (0.9 ms)</td>
+    <td>65.4 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>78.9 ms (2.2 ms)  </td>
+     <td>Pixel XL </td>
+     <td>74.6 ms</td>
   </tr>
   <tr>
     <td rowspan = 2>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
     </td>
     <td>Pixel 2 </td>
-    <td>273.8 ms (3.5 ms)</td>
+    <td>273.8 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>210.8 ms (4.2 ms)</td>
+     <td>Pixel XL </td>
+     <td>210.8 ms</td>
   </tr>
   <tr>
     <td rowspan = 2>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
     </td>
     <td>Pixel 2 </td>
-    <td>234.0 ms (2.1 ms)</td>
+    <td>234.0 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>158.0 ms (2.1 ms)</td>
+     <td>Pixel XL </td>
+     <td>158.0 ms</td>
   </tr>
   <tr>
     <td rowspan = 2>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
     </td>
     <td>Pixel 2 </td>
-    <td>2846.0 ms (15.0 ms)</td>
+    <td>2846.0 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>1973.0 ms (15.0 ms)  </td>
+     <td>Pixel XL </td>
+     <td>1973.0 ms </td>
   </tr>
   <tr>
     <td rowspan = 2>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
     </td>
     <td>Pixel 2 </td>
-    <td>3180.0 ms (11.7 ms)</td>
+    <td>3180.0 ms</td>
   </tr>
    <tr>
-     <td>Pixel xl </td>
-     <td>2262.0 ms (21.0 ms)  </td>
+     <td>Pixel XL </td>
+     <td>2262.0 ms</td>
   </tr>
 
  </table>
@@ -126,7 +125,7 @@ modified  to set `num_threads` to 1.
     <tr>
       <th>Model Name</th>
       <th>Device </th>
-      <th>Mean inference time (std dev)</th>
+      <th>Mean inference time</th>
     </tr>
   </thead>
   <tr>
@@ -134,41 +133,41 @@ modified  to set `num_threads` to 1.
       <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
     </td>
     <td>iPhone 8 </td>
-    <td>32.2 ms (0.8 ms)</td>
+    <td>32.2 ms</td>
   </tr>
   <tr>
     <td>
       <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)">Mobilenet_1.0_224 (quant)</a>
     </td>
     <td>iPhone 8 </td>
-    <td>24.4 ms (0.8 ms)</td>
+    <td>24.4 ms</td>
   </tr>
   <tr>
     <td>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
     </td>
     <td>iPhone 8 </td>
-    <td>60.3 ms (0.6 ms)</td>
+    <td>60.3 ms</td>
   </tr>
   <tr>
     <td>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
     </td>
     <td>iPhone 8 </td>
-    <td>44.3 (0.7 ms)</td>
+    <td>44.3</td>
   </tr>
   <tr>
     <td>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
     </td>
     <td>iPhone 8</td>
-    <td>562.4 ms (18.2 ms)</td>
+    <td>562.4 ms</td>
   </tr>
   <tr>
     <td>
       <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
     </td>
     <td>iPhone 8 </td>
-    <td>661.0 ms (29.2 ms)</td>
+    <td>661.0 ms</td>
   </tr>
  </table>
diff --git a/tensorflow/lite/g3doc/performance/best_practices.md b/tensorflow/lite/g3doc/performance/best_practices.md
index 5f41a7027538f571601c85a0a367208200155dd6..45aa17f07a9dad1cd4eb140f45abbf21e3d30fe7 100644
--- a/tensorflow/lite/g3doc/performance/best_practices.md
+++ b/tensorflow/lite/g3doc/performance/best_practices.md
@@ -1,19 +1,27 @@
 # Performance best practices
 
-Mobile and embedded devices have limited computational resources and it is
+Mobile and embedded devices have limited computational resources, so it is
 important to keep your application resource efficient. We have compiled a list
 of best practices and strategies that you can use to optimize your model and
 application when using TensorFlow Lite.
 
 ## Choose the best model for the task
-Depending on the task you will need to make a tradeoff between model complexity and size. If your task requires high accuracy then you may need a large and complex model. Some tasks may work with a less precise model, for these tasks it is better to use a smaller but less precise model. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. For example, graphs below show accuracy and latency tradeoff for some common image classification models.
 
-![accuracy vs model size](../images/performance/model_size_vs_accuracy.png "Accuracy vs Model size")
+Depending on the task, you will need to make a tradeoff between model complexity
+and size. If your task requires high accuracy, then you may need a large and
+complex model. For tasks that require less precision, it is better to use a
+smaller model because they not only use less disk space and memory, but they are
+also generally faster and more energy efficient. For example, graphs below show
+accuracy and latency tradeoffs for some common image classification models.
 
+![Graph of model size vs accuracy](../images/performance/model_size_vs_accuracy.png "Model Size vs Accuracy")
 
-![latency vs model size](../images/performance/model_size_vs_latency.png "Latency vs Model size")
+![Graph of model size vs latency](../images/performance/model_size_vs_latency.png "Model Size vs Latency")
 
-One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. TensorFlow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
+One example of models optimized for mobile devices are
+[MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile
+vision applications. [Hosted models](../models/hosted.md) lists several other
+models that have been optimized specifically for mobile and embedded devices.
 
 You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
 [image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
@@ -24,33 +32,58 @@ You can retrain the listed models on your own dataset by using transfer learning
 Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. TensorFlow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
 
 ## Profile and optimize operators in the graph
-If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
- This scenario should be rare as TensorFlow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
 
-## Quantize your model
-If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](model_optimization.md) for details about optimizing your model. 
+If a particular operator appears frequently in the model and, based on
+profiling, you find that the operator consumes the most amount of time, you can
+look into optimizing that operator. This scenario should be rare as TensorFlow
+Lite has optimized versions for most operators. However, you may be able to
+write a faster version of a custom op if you know the constraints in which the
+operator is executed. Check out our
+[custom operator documentation](../custom_operators.md).
+
+## Optimize your model
+
+Model compression aims to create smaller models that are generally faster and
+more energy efficient, so that they can be deployed on mobile devices.
+
+### Quantization
+
+If your model uses floating-point weights or activations, then it may be
+possible to reduce the size of model up to ~4x by using quantization, which
+effectively turns the float weights to 8-bit. There are two flavors of
+quantization: [post-training quantization](post_training_quantization.md) and
+[quantized training](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md){:.external}.
+The former does not require model re-training, but, in rare cases, may have
+accuracy loss. When accuracy loss is beyond acceptable thresholds, quantized
+training should be used instead.
+
+We strongly recommend running benchmarks to make sure that the accuracy is not
+impacted during model compression. Check out our
+[model optimization toolkit](model_optimization.md) for details.
 
 ## Tweak the number of threads
 
 TensorFlow Lite supports multi-threaded kernels for many operators. You can
 increase the number of threads and speed up execution of operators. Increasing
-the number of threads will however make your model use more resources and power.
-For some applications latency may be more important than energy efficiency. You
-can increase the number of threads by setting the number of
-[interpreter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L333)
-threads. Multi-threaded execution however comes at the cost of increased
-performance variability depending on what else is been executed concurrently.
-This is particularly the case for mobile apps. For example, isolated tests may
-show 2x speed up vs single-threaded but if another app is executing at the same
-time may result in worst performance than single-threaded.
+the number of threads will, however, make your model use more resources and
+power.
+
+For some applications, latency may be more important than energy efficiency. You
+can increase the number of threads by setting the number of interpreter
+[threads](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L346).
+Multi-threaded execution, however, comes at the cost of increased performance
+variability depending on what else is executed concurrently. This is
+particularly the case for mobile apps. For example, isolated tests may show 2x
+speed-up vs single-threaded, but, if another app is executing at the same time,
+it may result in worse performance than single-threaded.
 
 ## Eliminate redundant copies
 
-If your application is not careful, there can be redundant copies when feeding
-the input to the model and reading output from the model. Make sure to eliminate
-redundant copies. If you are using higher level APIs like Java API, make sure to
-carefully check the documentation for performance caveats. For example, the Java
-API is a lot faster if ByteBuffers are used as
+If your application is not carefully designed, there can be redundant copies
+when feeding the input to and reading the output from the model. Make sure to
+eliminate redundant copies. If you are using higher level APIs, like Java, make
+sure to carefully check the documentation for performance caveats. For example,
+the Java API is a lot faster if ByteBuffers are used as
 [inputs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L175).
 
 ## Profile your application with platform specific tools
@@ -58,23 +91,23 @@ Platform specific tools like [Android profiler](https://developer.android.com/st
 
 ## Evaluate whether your model benefits from using hardware accelerators available on the device
 
-TensorFlow Lite has added been new ways to accelerate models with faster
-hardware like GPUs, DSPs, and neural accelerators. Typically, these accelerators
-are exposed through *delegate* submodules that take over parts of the
+TensorFlow Lite has added new ways to accelerate models with faster hardware
+like GPUs, DSPs, and neural accelerators. Typically, these accelerators are
+exposed through [delegate](delegates.md) submodules that take over parts of the
 interpreter execution. TensorFlow Lite can use delegates by:
 
 *   Using Android's
     [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/).
     You can utilize these hardware accelerator backends to improve the speed and
     efficiency of your model. To enable the Neural Networks API, call
-    [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L330)
+    [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L343)
     on the interpreter instance.
-*   A binary-only GPU delegate has been released for Android and iOS—using
+*   A binary-only GPU delegate has been released for Android and iOS, using
     OpenGL and Metal, respectively. To try them out, see the
     [GPU delegate tutorial](gpu.md) and [documentation](gpu_advanced.md).
 *   It is possible to create your own delegate if you have access to
-    non-standard hardware. View the NN API delegate in the source code as an
-    example.
+    non-standard hardware. See [TensorFlow Lite delegates](delegates.md) for
+    more information.
 
 Be aware that some accelerators work better for different types of models. It is
 important to benchmark each delegate to see if it is a good choice for your
diff --git a/tensorflow/lite/g3doc/performance/delegates.md b/tensorflow/lite/g3doc/performance/delegates.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5abfb034386435a4f2ab9c4f4ba816417d7d7b8
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/delegates.md
@@ -0,0 +1,206 @@
+## TensorFlow Lite delegates
+
+_Note: Delegate API is still experimental and is subject to change._
+
+
+## What is a TensorFlow Lite delegate?
+
+A TensorFlow Lite delegate is a way to delegate part or all of graph execution to another executor.
+
+
+## Why should I use delegates?
+
+Running inference on compute-heavy machine learning models on mobile devices is resource demanding due to the devices' limited processing and power.
+
+Instead of relying on the CPU, some devices have hardware accelerators, such as GPU or DSP, that allows for better performance and higher energy efficiency.
+
+
+## Using the experimental GPU delegate
+
+TensorFlow Lite provides an experimental GPU delegate that can be used to accelerate models on devices that have a GPU available.
+
+For an overview of the experimental GPU delegate, see [TensorFlow Lite on GPU](https://www.tensorflow.org/lite/performance/gpu_advanced). For step-by-step tutorials on using the GPU delegate with Android and iOS, see [TensorFlow Lite GPU Delegate Tutorial](https://www.tensorflow.org/lite/performance/gpu).
+
+
+## How do delegates work?
+
+Let's say we have a simple model graph such as the following:
+
+![Original graph](../images/performance/tflite_delegate_graph_1.png "Original Graph")
+
+If a delegate was provided for specific operations, then TensorFlow Lite will split the graph into multiple subgraphs where each subgraph will be handled by a delegate.
+
+Let's assume that there is a delegate "MyDelegate," which has a faster implementation for Conv2D and Mean operations. The resulting main graph will be updated to look like below.
+
+![Graph with delegate](../images/performance/tflite_delegate_graph_2.png "Graph with delegate")
+
+Each subgraph that is handled by a delegate will be replaced with a node that evaluates the subgraph on its invoked call.
+
+Depending on the model, the final graph can end up with one node, which means that all of the graphs were delegated or multiple nodes handled the subgraphs. In general, you don't want to have multiple subgraphs handled by the delegate, since each time you switch from delegate to the main graph, there is an overhead for passing the results from the subgraph to the main graph. It's not always safe to share memory.
+
+
+## How to add a delegate
+
+_Note that the API used below is experimental and is subject to change._
+
+Based on the previous section, to add a delegate, we need to do the following:
+
+
+
+1.  Define a kernel node that is responsible for evaluating the delegate subgraph
+1.  Create an instance of [TfLiteDelegate](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/c/c_api_internal.h#L545), which is responsible for registering the kernel node and claiming the nodes that the delegate can execute
+
+To see it in code, let's define a delegate and call it "MyDelegate," which can execute Conv2D and Mean operations faster.
+
+```
+// This is where the execution of the operations or whole graph happens.
+// The class below has an empty implementation just as a guideline
+// on the structure.
+class MyDelegate {
+ public:
+  // Returns true if my delegate can handle this type of op.
+  static bool SupportedOp(const TfLiteRegistration* registration) {
+    switch (registration->builtin_code) {
+      case kTfLiteBuiltinConv2d:
+      case kTfLiteBuiltinMean:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  // Any initialization code needed
+  bool Init() {}
+  // Any preparation work needed (e.g. allocate buffers)
+  bool Prepare(TfLiteContext* context, TfLiteNode* node) {}
+  // Actual running of the delegate subgraph.
+  bool Invoke(TfLiteContext* context, TfLiteNode* node) {}
+  // ... Add any other methods needed.
+};
+
+// Create the TfLiteRegistration for the Kernel node which will replace
+// the subrgaph in the main TfLite graph.
+TfLiteRegistration GetMyDelegateNodeRegistration() {
+  // This is the registration for the Delegate Node that gets added to
+  // the TFLite graph instead of the subGraph it replaces.
+  // It is treated as a an OP node. But in our case
+  // Init will initialize the delegate
+  // Invoke will run the delegate graph.
+  // Prepare for preparing the delegate.
+  // Free for any cleaning needed by the delegate.
+  TfLiteRegistration kernel_registration;
+  kernel_registration.builtin_code = kTfLiteBuiltinDelegate;
+  kernel_registration.custom_name = "MyDelegate";
+  kernel_registration.free = [](TfLiteContext* context, void* buffer) -> void {
+    delete reinterpret_cast<MyDelegate*>(buffer);
+  };
+  kernel_registration.init = [](TfLiteContext* context, const char* buffer,
+                                   size_t) -> void* {
+    // In the node init phase, initialize MyDelegate instance
+    const TfLiteDelegateParams* delegate_params =
+        reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+    MyDelegate* my_delegate = new MyDelegate;
+    if (!my_delegate->Init(context, params)) {
+      return nullptr;
+    }
+    return my_delegate;
+  };
+  kernel_registration.invoke = [](TfLiteContext* context,
+                                   TfLiteNode* node) -> TfLiteStatus {
+    MyDelegate* kernel = reinterpret_cast<MyDelegate*>(node->user_data);
+    return kernel->Invoke(context, node);
+  };
+  kernel_registration.prepare = [](TfLiteContext* context,
+                                    TfLiteNode* node) -> TfLiteStatus {
+    MyDelegate* kernel = reinterpret_cast<MyDelegate*>(node->user_data);
+    return kernel->Prepare(context, node);
+  };
+
+  return kernel_registration;
+}
+
+// TfLiteDelegate methods
+
+TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
+  // Claim all nodes that can be evaluated by the delegate and ask the
+  // framework to update the graph with delegate kernel instead.
+  // Reserve 1 element, since we need first element to be size.
+  std::vector<int> supported_nodes(1);
+  TfLiteIntArray* plan;
+  TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
+  TfLiteNode* node;
+  TfLiteRegistration* registration;
+  for (int node_index : TfLiteIntArrayView(plan)) {
+    TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
+        context, node_index, &node, &registration));
+    if (MyDelegate::SupportedOp(registration)) {
+      supported_nodes.push_back(node_index);
+    }
+  }
+  // Set first element to the number of nodes to replace.
+  supported_nodes[0] = supported_nodes.size() - 1;
+  TfLiteRegistration my_delegate_kernel_registration =
+      GetMyDelegateNodeRegistration();
+
+  // This call split the graphs into subgraphs, for subgraphs that can be
+  // handled by the delegate, it will replace it with a
+  // 'my_delegate_kernel_registration'
+  return context->ReplaceNodeSubsetsWithDelegateKernels(
+      context, my_delegate_kernel_registration,
+      reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()), delegate);
+}
+
+void FreeBufferHandle(TfLiteContext* context, TfLiteDelegate* delegate,
+                      TfLiteBufferHandle* handle) {
+  // Do any cleanups.
+}
+
+TfLiteStatus CopyToBufferHandle(TfLiteContext* context,
+                                TfLiteDelegate* delegate,
+                                TfLiteBufferHandle buffer_handle,
+                                TfLiteTensor* tensor) {
+  // Copies data from tensor to delegate buffer if needed.
+  return kTfLiteOk;
+}
+
+TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
+                                  TfLiteDelegate* delegate,
+                                  TfLiteBufferHandle buffer_handle,
+                                  TfLiteTensor* tensor) {
+  // Copies the data from delegate buffer into the tensor raw memory.
+  return kTfLiteOk;
+}
+
+// Caller takes ownership of the returned pointer.
+TfLiteDelegate* CreateMyDelegate() {
+  TfLiteDelegate* delegate = new TfLiteDelegate;
+
+  delegate->data_ = nullptr;
+  delegate->flags = kTfLiteDelegateFlagsNone;
+  delegate->Prepare = &DelegatePrepare;
+  // This cannot be null.
+  delegate->CopyFromBufferHandle = &CopyFromBufferHandle;
+  // This can be null.
+  delegate->CopyToBufferHandle = &CopyToBufferHandle;
+  // This can be null.
+  delegate->FreeBufferHandle = &FreeBufferHandle;
+
+  return delegate;
+}
+
+
+// To add the delegate you need to call
+
+auto* my_delegate = CreateMyDelegate();
+if (interpreter->ModifyGraphWithDelegate(my_delegate) !=
+        kTfLiteOk) {
+  // Handle error
+} else {
+  interpreter->Invoke();
+}
+...
+// Don't forget to delete your delegate
+delete my_delegate;
+
+
+```
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
index c7389226123746180c8c5e6020431ffe579112a7..3b2cca9cf3211abfd4deb6bc59fb57f0225677d3 100644
--- a/tensorflow/lite/g3doc/performance/gpu.md
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite GPU Delegate Tutorial
+# TensorFlow Lite GPU delegate
 
 [TensorFlow Lite](https://www.tensorflow.org/lite) supports several hardware
 accelerators. This document describes how to preview the experimental GPU backend using the
diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md
index 2eb432c008168794c3722fcd4f9ab6df0771e48f..dcfc2bed3a0e55ee0d7a1af643eb500faf3960ba 100644
--- a/tensorflow/lite/g3doc/performance/model_optimization.md
+++ b/tensorflow/lite/g3doc/performance/model_optimization.md
@@ -1,12 +1,14 @@
 # Model optimization
 
-Inference efficiency is a critical issue when deploying machine learning
-models to mobile devices. Where the computational demand for *training*
-grows with the number of models trained on different architectures, the
-computational demand for *inference* grows in proportion to the number of
-users. The *Tensorflow Model Optimization Toolkit* minimizes the complexity
-of inference—the model size, the latency and power consumption.
+The *Tensorflow Model Optimization Toolkit* minimizes the complexity
+of optimizing inference. Inference efficiency
+is a critical issue when deploying machine learning
+models to mobile devices because of the model size, latency, and power consumption.
 
+Computational demand for *training*
+grows with the number of models trained on different architectures, whereas the
+computational demand for *inference* grows in proportion to the number of
+users.
 
 ## Use cases
 
@@ -23,9 +25,11 @@ Model optimization is useful for:
 
 Model optimization uses multiple techniques:
 
-* Reduced parameter count, for example, pruning and structured pruning.
-* Reduced representational precision, for example, quantization.
-* Update the original model topology to a more efficient one, with reduced parameters or faster execution, for example, tensor decomposition methods and distillation.
+* Reduce parameter count with pruning and structured pruning.
+* Reduce representational precision with quantization.
+* Update the original model topology to a more efficient one with reduced parameters or faster execution. For example, tensor decomposition methods and distillation.
+
+We support quantization, and are working to add support for other techniques.
 
 ## Model quantization
 
@@ -34,19 +38,17 @@ representations of weights and, optionally, activations for both storage and
 computation. Quantization provides several benefits:
 
 * Support on existing CPU platforms.
-* Quantizing activations reduces memory access costs for reading and storing intermediate activations.
+* Quantization of activations reduces memory access costs for reading and storing intermediate activations.
 * Many CPU and hardware accelerator implementations provide SIMD instruction capabilities, which are especially beneficial for quantization.
 
 TensorFlow Lite provides several levels of support for quantization.
 
-[Post-training quantization](post_training_quantization.md) quantizes weights and activations post training and is very easy to use.
-[Quantization-aware training](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md){:.external} allows for training networks that can be quantized with minimal accuracy drop and is only available
-for a subset of convolutional neural network architectures.
-
+* [Post-training quantization](post_training_quantization.md) quantizes weights and activations post training and is very easy to use.
+* [Quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize){:.external} allows for training networks that can be quantized with minimal accuracy drop and is only available for a subset of convolutional neural network architectures.
 
 ### Latency and accuracy results
 
-Below are the results of the latency and accuracy of post-training quantization and
+Below are the latency and accuracy results for post-training quantization and
 quantization-aware training on a few models. All latency numbers are measured on
 Pixel&nbsp;2 devices using a single big core. As the toolkit improves, so will the numbers here:
 
@@ -62,13 +64,12 @@ Pixel&nbsp;2 devices using a single big core. As the toolkit improves, so will t
       <th>Latency (Quantization Aware Training) (ms) </th>
       <th> Size (Original) (MB)</th>
       <th> Size (Optimized) (MB)</th>
-    </tr>
-    <tr><td>Mobilenet-v1-1-224</td><td>0.709</td><td>0.657</td><td>0.70</td>
-      <td>180</td><td>145</td><td>80.2</td><td>16.9</td><td>4.3</td></tr>
+    </tr> <tr><td>Mobilenet-v1-1-224</td><td>0.709</td><td>0.657</td><td>0.70</td>
+      <td>124</td><td>112</td><td>64</td><td>16.9</td><td>4.3</td></tr>
     <tr><td>Mobilenet-v2-1-224</td><td>0.719</td><td>0.637</td><td>0.709</td>
-      <td>117</td><td>121</td><td>80.3</td><td>14</td><td>3.6</td></tr>
+      <td>89</td><td>98</td><td>54</td><td>14</td><td>3.6</td></tr>
    <tr><td>Inception_v3</td><td>0.78</td><td>0.772</td><td>0.775</td>
-      <td>1585</td><td>1187</td><td>637</td><td>95.7</td><td>23.9</td></tr>
+      <td>1130</td><td>845</td><td>543</td><td>95.7</td><td>23.9</td></tr>
    <tr><td>Resnet_v2_101</td><td>0.770</td><td>0.768</td><td>N/A</td>
       <td>3973</td><td>2868</td><td>N/A</td><td>178.3</td><td>44.9</td></tr>
  </table>
@@ -79,10 +80,10 @@ Pixel&nbsp;2 devices using a single big core. As the toolkit improves, so will t
 
 ## Choice of quantization tool
 
-As a starting point, check if the models in the TensorFlow Lite model repository can work for
-your application. If not, we recommend that users start with the post-training quantization tool
+As a starting point, check if the models in [hosted models](../guide/hosted_models.md) can work for
+your application. If not, we recommend that users start with the [post-training quantization tool](post_training_quantization.md)
 since this is broadly applicable and does not require training data. For cases where the accuracy
-and latency targets are not met, or hardware accelerator support is important, quantization-aware
-training is the better option.
+and latency targets are not met, or hardware accelerator support is important, [quantization-aware
+training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize){:.external} is the better option.
 
 Note: Quantization-aware training supports a subset of convolutional neural network architectures.
diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md
index cf4d70b2deb3370d0acdde1fcaa8d7fce0cf3bf2..5fdf77b40112d2ba67db0b6f319f07279291da8f 100644
--- a/tensorflow/lite/g3doc/performance/post_training_quantization.md
+++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md
@@ -1,26 +1,24 @@
 # Post-training quantization
 
-Post-training quantization is a general technique to reduce the model size while also
+Post-training quantization is a general technique to reduce model size while also
 providing up to 3x lower latency with little degradation in model accuracy. Post-training
-quantization quantizes weights to 8-bits of precision from floating-point. This technique
-is enabled as an option in [TensorFlow Lite model converter](../convert):
+quantization quantizes weights from floating point to 8-bits of precision. This technique
+is enabled as an option in the [TensorFlow Lite converter](../convert/):
 
 ```
 import tensorflow as tf
-converter = tf.lite.TocoConverter.from_saved_model(saved_model_dir)
-converter.post_training_quantize = True
-tflite_quantized_model = converter.convert()
-open("quantized_model.tflite", "wb").write(tflite_quantized_model)
-
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
+tflite_quant_model = converter.convert()
 ```
 
-At inference, weights are converted from 8-bits of precision to floating-point and
-computed using floating point kernels. This conversion is done once and cached to reduce latency.
+At inference, weights are converted from 8-bits of precision to floating point and
+computed using floating-point kernels. This conversion is done once and cached to reduce latency.
 
 To further improve latency, hybrid operators dynamically quantize activations to 8-bits and
 perform computations with 8-bit weights and activations. This optimization provides latencies
 close to fully fixed-point inference. However, the outputs are still stored using
-floating-point, so the speedup with hybrid ops is less than a full fixed-point computation.
+floating point, so that the speedup with hybrid ops is less than a full fixed-point computation.
 Hybrid ops are available for the most compute-intensive operators in a network:
 
 *  [tf.contrib.layers.fully_connected](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected)
@@ -31,46 +29,55 @@ Hybrid ops are available for the most compute-intensive operators in a network:
 *  [tf.nn.dynamic_rnn for LSTM and BasicRNN Cell types](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn)
 
 
-Since weights are quantized post-training, there could be an accuracy loss, particularly for
+Since weights are quantized post training, there could be an accuracy loss, particularly for
 smaller networks. Pre-trained fully quantized models are provided for specific networks in
-the [TensorFlow Lite model repository](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md#image-classification-quantized-models){:.external}. It is important to check the accuracy of the quantized model to verify that any degradation
+the [TensorFlow Lite model repository](../models/). It is important to check the accuracy of the quantized model to verify that any degradation
 in accuracy is within acceptable limits. There is a tool to evaluate [TensorFlow Lite model accuracy](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/accuracy/README.md){:.external}.
 
-If the accuracy drop is too high, consider using [quantization aware training](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md){:.external}.
+If the accuracy drop is too high, consider using [quantization aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize){:.external}.
 
 ### Representation for quantized tensors
 
 TensorFlow approaches the conversion of floating-point arrays of numbers into
 8-bit representations as a compression problem. Since the weights and activation
 tensors in trained neural network models tend to have values that are distributed
-across comparatively small ranges (for example, -15 to +15 for weights or -500 to
-1000 for image model activations). And since neural nets tend to be robust
-handling noise, the error introduced by quantizing to a small set of values
-maintains the precision of the overall results within an acceptable threshold. A
-chosen representation must perform fast calculations, especially the large matrix
-multiplications that comprise the bulk of the computations while running a model.
+across comparatively small ranges (e.g. -15 to +15 for weights or -500 to
+1000 for image model activations).
+
+Since neural networks tend to be robust at handling noise, the error introduced
+by quantizing to a small set of values maintains the precision of the overall
+results within an acceptable threshold. A chosen representation must perform
+fast calculations, especially with large matrix multiplications that comprise
+the bulk of the computations while running a model.
 
 This is represented with two floats that store the overall minimum and maximum
 values corresponding to the lowest and highest quantized value. Each entry in the
 quantized array represents a float value in that range, distributed linearly
-between the minimum and maximum. For example, with a minimum of -10.0 and maximum
-of 30.0f, and an 8-bit array, the quantized values represent the following:
+between the minimum and maximum.
+
+With our post-training quantization tooling, we use symmetric quantization for
+our weights, meaning we expand the represented range and force the min and max
+to be the negative of each other.
+
+For example, with an overall minimum of -10.0 and a maximum
+of 30.0f, we instead represent a minimum of -30.0 and maximum of 30.0f. In an
+8-bit array, the quantized values would be represented as follows:
 
 <figure>
   <table>
     <tr><th>Quantized</th><th>Float</th></tr>
-    <tr><td>0</td><td>-10.0</td></tr>
-    <tr><td>128</td><td>10.0</td></tr>
-    <tr><td>255</td><td>30.0</td></tr>
+    <tr><td>-42</td><td>-10.0</td></tr>
+    <tr><td>0</td><td>0</td></tr>
+    <tr><td>127</td><td>30.0</td></tr>
+    <tr><td>-127</td><td>30.0 (this value does not ever show up)</td></tr>
   </table>
   <figcaption>
-    <b>Table 2</b>: Example quantized value range
+    <b>Table 2</b>: Quantized value range example
   </figcaption>
 </figure>
 
 The advantages of this representation format are:
 
 * It efficiently represents an arbitrary magnitude of ranges.
-* The values don't have to be symmetrical.
-* The format represents both signed and unsigned values.
 * The linear spread makes multiplications straightforward.
+* A symmetric range for weights enables downstream hardware optimizations.
diff --git a/tensorflow/lite/g3doc/r2/convert/concrete_function.md b/tensorflow/lite/g3doc/r2/convert/concrete_function.md
new file mode 100644
index 0000000000000000000000000000000000000000..c17981353d97e24fb215ba024d203efaaf909eb9
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/concrete_function.md
@@ -0,0 +1,208 @@
+# Generating a concrete function
+
+In order to convert TensorFlow 2.0 models to TensorFlow Lite, the model needs to
+be exported as a concrete function. This document outlines what a concrete
+function is and how to generate one for an existing model.
+
+[TOC]
+
+## Background
+
+In TensorFlow 2.0, eager execution is on by default. TensorFlow's eager
+execution is an imperative programming environment that evaluates operations
+immediately, without building graphs. Operations return concrete values instead
+of constructing a computational graph to run later. A detailed guide on eager
+execution is available
+[here](https://github.com/tensorflow/docs/blob/master/site/en/r2/guide/eager.ipynb).
+
+While running imperatively with eager execution makes development and debugging
+more interactive, it doesn't allow for deploying on-device. The `tf.function`
+API makes it possible to save models as graphs, which is required to run
+TensorFlow Lite in 2.0. All operations wrapped in the `tf.function` decorator
+can be exported as a graph which can then be converted to the TensorFlow Lite
+FlatBuffer format.
+
+## Terminology
+
+The following terminology is used in this document:
+
+*   **Signature** - The inputs and outputs for a set of operations.
+*   **Concrete function** - Graph with a single signature.
+*   **Polymorphic function** - Python callable that encapsulates several
+    concrete function graphs behind one API.
+
+## Methodology
+
+This section describes how to export a concrete function.
+
+### Annotate functions with `tf.function`
+
+Annotating a function with `tf.function` generates a *polymorphic function*
+containing those operations. All operations that are not annotated with
+`tf.function` will be evaluated with eager execution. The examples below show
+how to use `tf.function`.
+
+```python
+@tf.function
+def pow(x):
+  return x ** 2
+```
+
+```python
+tf.function(lambda x : x ** 2)
+```
+
+### Create an object to save
+
+The `tf.function` can be optionally stored as part of a `tf.Module` object.
+Variables should only be defined once within the `tf.Module`. The examples below
+show two different approaches for creating a class that derives `Checkpoint`.
+
+```python
+class BasicModel(tf.Module):
+
+  def __init__(self):
+    self.const = None
+
+  @tf.function
+  def pow(self, x):
+    if self.const is None:
+      self.const = tf.Variable(2.)
+    return x ** self.const
+
+root = BasicModel()
+```
+
+```python
+root = tf.Module()
+root.const = tf.Variable(2.)
+root.pow = tf.function(lambda x : x ** root.const)
+```
+
+### Exporting the concrete function
+
+The concrete function defines a graph that can be converted to TensorFlow Lite
+model or be exported to a SavedModel. In order to export a concrete function
+from the polymorphic function, the signature needs to be defined. The signature
+can be defined the following ways:
+
+*   Define `input_signature` parameter in `tf.function`.
+*   Pass in `tf.TensorSpec` into `get_concrete_function`: e.g.
+    `tf.TensorSpec(shape=[1], dtype=tf.float32)`.
+*   Pass in a sample input tensor into `get_concrete_function`: e.g.
+    `tf.constant(1., shape=[1])`.
+
+The follow example shows how to define the `input_signature` parameter for
+`tf.function`.
+
+```python
+class BasicModel(tf.Module):
+
+  def __init__(self):
+    self.const = None
+
+  @tf.function(input_signature=[tf.TensorSpec(shape=[1], dtype=tf.float32)])
+  def pow(self, x):
+    if self.const is None:
+      self.const = tf.Variable(2.)
+    return x ** self.const
+
+# Create the tf.Module object.
+root = BasicModel()
+
+# Get the concrete function.
+concrete_func = root.pow.get_concrete_function()
+```
+
+The example below passes in a sample input tensor into `get_concrete_function`.
+
+```python
+# Create the tf.Module object.
+root = tf.Module()
+root.const = tf.Variable(2.)
+root.pow = tf.function(lambda x : x ** root.const)
+
+# Get the concrete function.
+input_data = tf.constant(1., shape=[1])
+concrete_func = root.pow.get_concrete_function(input_data)
+```
+
+## Example program
+
+```python
+import tensorflow as tf
+
+# Initialize the tf.Module object.
+root = tf.Module()
+
+# Instantiate the variable once.
+root.var = None
+
+# Define a function so that the operations aren't computed in advance.
+@tf.function
+def exported_function(x):
+  # Each variable can only be defined once. The variable can be defined within
+  # the function but needs to contain a reference outside of the function.
+  if root.var is None:
+    root.var = tf.Variable(tf.random.uniform([2, 2]))
+  root.const = tf.constant([[37.0, -23.0], [1.0, 4.0]])
+  root.mult = tf.matmul(root.const, root.var)
+  return root.mult * x
+
+# Save the function as part of the tf.Module object.
+root.func = exported_function
+
+# Get the concrete function.
+concrete_func = root.func.get_concrete_function(
+  tf.TensorSpec([1, 1], tf.float32))
+```
+
+## Common Questions
+
+### How do I save a concrete function as a SavedModel?
+
+Users who want to save their TensorFlow model before converting it to TensorFlow
+Lite should save it as a SavedModel. After getting the concrete function, call
+`tf.saved_model.save` to save the model. The example above can be saved using
+the following instruction.
+
+```python
+tf.saved_model.save(root, export_dir, concrete_func)
+```
+
+Reference the
+[SavedModel guide](https://github.com/tensorflow/docs/blob/master/site/en/r2/guide/saved_model.ipynb)
+for detailed instructions on using SavedModels.
+
+### How do I get a concrete function from the SavedModel?
+
+Each concrete function within a SavedModel can be identified by a signature key.
+The default signature key is `tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY`.
+The example below shows how to get the concrete function from a model.
+
+```python
+model = tf.saved_model.load(export_dir)
+concrete_func = model.signatures[
+  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+```
+
+### How do I get a concrete function for a `tf.Keras` model?
+
+There are two approaches that you can use:
+
+1.  Save the model as a SavedModel. A concrete function will be generated during
+    the saving process, which can be accessed upon loading the model.
+2.  Annotate the model with `tf.function` as seen below.
+
+```python
+model = tf.keras.Sequential([tf.keras.layers.Dense(units=1, input_shape=[1])])
+model.compile(optimizer='sgd', loss='mean_squared_error')
+model.fit(x=[-1, 0, 1, 2, 3, 4], y=[-3, -1, 1, 3, 5, 7], epochs=50)
+
+# Get the concrete function from the Keras model.
+run_model = tf.function(lambda x : model(x))
+
+# Save the concrete function.
+concrete_func = run_model.get_concrete_function(
+    tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype))
+```
diff --git a/tensorflow/lite/g3doc/r2/convert/index.md b/tensorflow/lite/g3doc/r2/convert/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1e763e027c8caf7c34c01e991ff4d93076aff32
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/index.md
@@ -0,0 +1,24 @@
+# TensorFlow Lite converter
+
+The TensorFlow Lite converter takes a TensorFlow model represented as a
+[concrete function](concrete_function.md), and generates a TensorFlow Lite
+[`FlatBuffer`](https://google.github.io/flatbuffers/) file (`.tflite`).
+
+Note: This page contains documentation on the converter API for TensorFlow 2.0.
+The API for TensorFlow 1.X is available
+[here](https://www.tensorflow.org/lite/convert/).
+
+## Device deployment
+
+The TensorFlow Lite `FlatBuffer` file is then deployed to a client device (e.g.
+mobile, embedded) and run locally using the TensorFlow Lite interpreter. This
+conversion process is shown in the diagram below:
+
+![TFLite converter workflow](../images/convert/workflow.svg)
+
+## Converting models
+
+The TensorFlow Lite converter can be used from the [Python API](python_api.md).
+Using the Python API makes it easier to convert models as part of a model
+development pipeline and helps mitigate
+[compatibility](../guide/ops_compatibility.md) issues early on.
diff --git a/tensorflow/lite/g3doc/r2/convert/python_api.md b/tensorflow/lite/g3doc/r2/convert/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..882346fb8cf7fc0db7bbd44e1d8d6a2c95ac5f59
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/python_api.md
@@ -0,0 +1,252 @@
+# Converter Python API guide
+
+This page provides examples on how to use the
+[TensorFlow Lite converter](index.md) using the Python API in TensorFlow 2.0.
+
+[TOC]
+
+## Python API
+
+The Python API for converting TensorFlow models to TensorFlow Lite in TensorFlow
+2.0 is
+[`tf.lite.TFLiteConverter.from_concrete_function()`](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/lite/TFLiteConverter).
+Documentation on concrete functions is available [here](concrete_function.md).
+
+This document contains [example usages](#examples) of the API, a detailed list
+of [changes in the API between 1.X and 2.0](#differences), and
+[instructions](#versioning) on running the different versions of TensorFlow.
+
+## Examples <a name="examples"></a>
+
+### Exporting a concrete function <a name="concrete_function"></a>
+
+The following example shows how to convert a TensorFlow concrete function into a
+TensorFlow Lite `FlatBuffer`.
+
+```python
+import tensorflow as tf
+
+# Construct a basic model.
+root = tf.train.Checkpoint()
+root.v1 = tf.Variable(3.)
+root.v2 = tf.Variable(2.)
+root.f = tf.function(lambda x: root.v1 * root.v2 * x)
+
+# Create the concrete function.
+input_data = tf.constant(1., shape=[1, 1])
+concrete_func = root.f.get_concrete_function(input_data)
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### Exporting a SavedModel <a name="saved_model"></a>
+
+The following example shows how to convert a SavedModel into a TensorFlow Lite
+`FlatBuffer`.
+
+Note: Due to a known issue with preserving input shapes with SavedModels,
+`set_shape` needs to be called for all input tensors.
+
+```python
+import tensorflow as tf
+
+# Construct a basic model.
+root = tf.train.Checkpoint()
+root.v1 = tf.Variable(3.)
+root.v2 = tf.Variable(2.)
+root.f = tf.function(lambda x: root.v1 * root.v2 * x)
+
+# Save the model.
+export_dir = "/tmp/test_saved_model"
+input_data = tf.constant(1., shape=[1, 1])
+to_save = root.f.get_concrete_function(input_data)
+tf.saved_model.save(root, export_dir, to_save)
+
+# Load model and get the concrete function.
+model = tf.saved_model.load(export_dir)
+concrete_func = model.signatures[
+  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+
+# Set the shape manually.
+concrete_func.inputs[0].set_shape(input_data.shape)
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### Exporting a Keras model <a name="keras"></a>
+
+The following example shows how to convert a `tf.keras` model into a TensorFlow
+Lite `FlatBuffer`.
+
+```python
+import tensorflow as tf
+
+# Create a simple Keras model.
+x = [-1, 0, 1, 2, 3, 4]
+y = [-3, -1, 1, 3, 5, 7]
+
+model = tf.keras.models.Sequential(
+    [tf.keras.layers.Dense(units=1, input_shape=[1])])
+model.compile(optimizer='sgd', loss='mean_squared_error')
+model.fit(x, y, epochs=50)
+
+# Get the concrete function from the Keras model.
+run_model = tf.function(lambda x : model(x))
+concrete_func = run_model.get_concrete_function(
+    tf.TensorSpec([None, 1], tf.float32))
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### End-to-end MobileNet conversion <a name="mobilenet"></a>
+
+The following example shows how to convert and run inference on a pre-trained
+`tf.Keras` MobileNet model to TensorFlow Lite. In order to load the model from
+file, use `model_path` instead of `model_content`.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Load the MobileNet tf.keras model.
+model = tf.keras.applications.MobileNetV2(
+    weights="imagenet", input_shape=(224, 224, 3))
+
+# Save and load the model to generate the concrete function to export.
+export_dir = "/tmp/test_model/mobilenet"
+tf.saved_model.save(model, export_dir)
+model = tf.saved_model.load(export_dir)
+concrete_func = model.signatures[
+  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+
+# Set the shape manually.
+concrete_func.inputs[0].set_shape([1, 224, 224, 3])
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+
+# Load TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_content=tflite_model)
+interpreter.allocate_tensors()
+
+# Get input and output tensors.
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+# Test model on random input data.
+input_shape = input_details[0]['shape']
+input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
+interpreter.set_tensor(input_details[0]['index'], input_data)
+
+interpreter.invoke()
+output_data = interpreter.get_tensor(output_details[0]['index'])
+print(output_data)
+```
+
+## Summary of changes in `TFLiteConverter` between 1.X and 2.0 <a name="differences"></a>
+
+The following section summarizes the changes in `TFLiteConverter` from 1.X to
+2.0. If any of the changes raise concerns, please file a
+[GitHub issue](https://github.com/tensorflow/tensorflow/issues).
+
+### Supported formats
+
+`TFLiteConverter` in 2.0 supports SavedModels and Keras model files generated in
+both 1.X and 2.0. However, the conversion process no longer supports frozen
+`GraphDefs` generated in 1.X. Users who want to convert frozen `GraphDefs` to
+TensorFlow Lite should use `tensorflow.compat.v1`.
+
+### Quantization-aware training
+
+The following attributes and methods associated with
+[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize)
+have been removed from `TFLiteConverter` in TensorFlow 2.0:
+
+*   `inference_type`
+*   `inference_input_type`
+*   `quantized_input_stats`
+*   `default_ranges_stats`
+*   `reorder_across_fake_quant`
+*   `change_concat_input_ranges`
+*   `post_training_quantize` - Deprecated in the 1.X API
+*   `get_input_arrays()`
+
+The rewriter function that supports quantization-aware training does not support
+models generated by TensorFlow 2.0. Additionally, TensorFlow Lite’s quantization
+API is being reworked and streamlined in a direction that supports
+quantization-aware training through the Keras API. These attributes will be
+removed in the 2.0 API until the new quantization API is launched. Users who
+want to convert models generated by the rewriter function can use
+`tensorflow.compat.v1`.
+
+### Changes to attributes
+
+The `target_ops` attribute has become an attribute of `TargetSpec` and renamed
+to `supported_ops` in line with future additions to the optimization framework.
+
+Additionally, the following attributes have been removed:
+
+*   `drop_control_dependency` (default: `True`) - Control flow is currently not
+    supported by TFLite so it is always `True`.
+*   _Graph visualization_ - The recommended approach for visualizing a
+    TensorFlow Lite graph in TensorFlow 2.0 will be to use
+    [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py).
+    Unlike GraphViz, it enables users to visualize the graph after post training
+    quantization has occurred. The following attributes related to graph
+    visualization will be removed:
+    *   `output_format`
+    *   `dump_graphviz_dir`
+    *   `dump_graphviz_video`
+
+### Deprecated APIs
+
+The following methods that were previously deprecated in 1.X will no longer be
+exported in 2.0:
+
+*   `lite.toco_convert`
+*   `lite.TocoConverter`
+
+## Installing TensorFlow <a name="versioning"></a>
+
+### Installing the TensorFlow 2.0 nightly <a name="2.0-nightly"></a>
+
+The TensorFlow 2.0 nightly can be installed using the following command:
+
+```
+pip install tf-nightly-2.0-preview
+```
+
+### Using TensorFlow 2.0 from a 1.X installation <a name="use-2.0-from-1.X"></a>
+
+TensorFlow 2.0 can be enabled from recent 1.X installations using the following
+code snippet.
+
+```python
+import tensorflow.compat.v2 as tf
+
+tf.enable_v2_behavior()
+```
+
+### Using TensorFlow 1.X from a 2.0 installation <a name="use-1.X-from-2.0"></a>
+
+TensorFlow 1.X can be enabled from 2.0 installation. This can be useful if you
+are using features that are no longer supported in 2.0.
+
+```python
+import tensorflow.compat.v1 as tf
+```
+
+### Build from source code <a name="latest_package"></a>
+
+In order to run the latest version of the TensorFlow Lite Converter Python API,
+either install the nightly build with
+[pip](https://www.tensorflow.org/install/pip) (recommended) or
+[Docker](https://www.tensorflow.org/install/docker), or
+[build the pip package from source](https://www.tensorflow.org/install/source).
diff --git a/tensorflow/lite/g3doc/r2/images/convert/workflow.svg b/tensorflow/lite/g3doc/r2/images/convert/workflow.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2d8339f35f3292d964367a1f788187e81178e44d
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/images/convert/workflow.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 620.0 380.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m12.700788 11.509187l317.00787 0l0 353.70078l-317.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.700788 11.509187l317.00787 0l0 353.70078l-317.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m287.5871 351.05682q0 1.0 -0.75 1.546875q-0.734375 0.53125 -2.078125 0.53125q-1.421875 0 -2.21875 -0.4375l0 -1.015625q0.515625 0.265625 1.109375 0.421875q0.59375 0.140625 1.140625 0.140625q0.84375 0 1.296875 -0.265625q0.453125 -0.265625 0.453125 -0.828125q0 -0.40625 -0.359375 -0.703125q-0.359375 -0.296875 -1.40625 -0.703125q-1.0 -0.375 -1.421875 -0.640625q-0.421875 -0.28125 -0.625 -0.625q-0.203125 -0.359375 -0.203125 -0.84375q0 -0.875 0.703125 -1.375q0.71875 -0.515625 1.953125 -0.515625q1.15625 0 2.25 0.46875l-0.375 0.875q-1.078125 -0.4375 -1.953125 -0.4375q-0.765625 0 -1.15625 0.25q-0.390625 0.234375 -0.390625 0.65625q0 0.28125 0.140625 0.484375q0.15625 0.203125 0.46875 0.390625q0.328125 0.171875 1.25 0.53125q1.28125 0.453125 1.71875 0.921875q0.453125 0.46875 0.453125 1.171875zm4.764435 2.078125q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm8.024445 -0.90625q0.46875 0 0.84375 0.078125l-0.140625 1.0q-0.453125 -0.09375 -0.78125 -0.09375q-0.875 0 -1.5 0.703125q-0.609375 0.703125 -0.609375 1.75l0 3.828125l-1.078125 0l0 -7.125l0.890625 0l0.125 1.3125l0.0625 0q0.390625 -0.703125 0.953125 -1.078125q0.5625 -0.375 1.234375 -0.375zm3.7374573 7.265625l-2.703125 -7.125l1.15625 0l1.53125 4.21875q0.53125 1.484375 0.625 1.9375l0.046875 0q0.0625 -0.359375 0.4375 -1.4375q0.390625 -1.078125 1.71875 -4.71875l1.15625 0l-2.703125 7.125l-1.265625 0zm8.130188 0.125q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm8.024445 -0.90625q0.46875 0 0.84375 0.078125l-0.140625 1.0q-0.453125 -0.09375 -0.78125 -0.09375q-0.875 0 -1.5 0.703125q-0.609375 0.703125 -0.609375 1.75l0 3.828125l-1.078125 0l0 -7.125l0.890625 0l0.125 1.3125l0.0625 0q0.390625 -0.703125 0.953125 -1.078125q0.5625 -0.375 1.234375 -0.375z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m21.160105 31.61155l201.6063 0l0 69.98425l-201.6063 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m21.160105 31.61155l201.6063 0l0 69.98425l-201.6063 0z" fill-rule="evenodd"/><path fill="#434343" d="m80.063835 52.13155l-1.0 0l0 -7.6875l-2.703125 0l0 -0.875l6.421875 0l0 0.875l-2.71875 0l0 7.6875zm6.576172 0.125q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375zm8.669922 5.71875l0 -4.15625q0 -0.78125 -0.359375 -1.171875q-0.34375 -0.390625 -1.109375 -0.390625q-1.015625 0 -1.484375 0.546875q-0.46875 0.546875 -0.46875 1.796875l0 3.375l-0.96875 0l0 -6.421875l0.796875 0l0.15625 0.875l0.046875 0q0.296875 -0.46875 0.828125 -0.734375q0.546875 -0.265625 1.203125 -0.265625q1.171875 0 1.75 0.5625q0.59375 0.5625 0.59375 1.796875l0 4.1875l-0.984375 0zm7.1152344 -1.75q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625zm7.1308594 -1.46875q0 1.578125 -0.796875 2.46875q-0.78125 0.875 -2.1875 0.875q-0.859375 0 -1.53125 -0.40625q-0.65625 -0.40625 -1.03125 -1.15625q-0.359375 -0.765625 -0.359375 -1.78125q0 -1.5625 0.78125 -2.4375q0.796875 -0.890625 2.1875 -0.890625q1.34375 0 2.140625 0.90625q0.796875 0.890625 0.796875 2.421875zm-4.890625 0q0 1.234375 0.484375 1.875q0.5 0.640625 1.453125 0.640625q0.953125 0 1.4375 -0.640625q0.5 -0.640625 0.5 -1.875q0 -1.21875 -0.5 -1.859375q-0.484375 -0.640625 -1.453125 -0.640625q-0.953125 0 -1.4375 0.640625q-0.484375 0.625 -0.484375 1.859375zm9.529297 -3.328125q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm3.1015625 6.546875l-1.0 0l0 -8.5625l4.78125 0l0 0.875l-3.78125 0l0 3.140625l3.546875 0l0 0.890625l-3.546875 0l0 3.65625zm6.0214844 0l-0.96875 0l0 -9.125l0.96875 0l0 9.125zm7.613289 -3.21875q0 1.578125 -0.796875 2.46875q-0.78125 0.875 -2.1875076 0.875q-0.859375 0 -1.53125 -0.40625q-0.65625 -0.40625 -1.03125 -1.15625q-0.359375 -0.765625 -0.359375 -1.78125q0 -1.5625 0.78125 -2.4375q0.796875 -0.890625 2.1875 -0.890625q1.3437576 0 2.1406326 0.90625q0.796875 0.890625 0.796875 2.421875zm-4.8906326 0q0 1.234375 0.484375 1.875q0.5 0.640625 1.453125 0.640625q0.9531326 0 1.4375076 -0.640625q0.5 -0.640625 0.5 -1.875q0 -1.21875 -0.5 -1.859375q-0.484375 -0.640625 -1.4531326 -0.640625q-0.953125 0 -1.4375 0.640625q-0.484375 0.625 -0.484375 1.859375zm11.8418045 3.21875l-1.1875 -3.765625q-0.109375 -0.34375 -0.40625 -1.578125l-0.046875 0q-0.234375 1.03125 -0.421875 1.59375l-1.203125 3.75l-1.125 0l-1.75 -6.421875l1.015625 0q0.625 2.421875 0.9375 3.6875q0.328125 1.265625 0.375 1.703125l0.046875 0q0.0625 -0.328125 0.203125 -0.859375q0.15625 -0.53125 0.265625 -0.84375l1.171875 -3.6875l1.046875 0l1.15625 3.6875q0.328125 1.0 0.4375 1.6875l0.046875 0q0.03125 -0.203125 0.125 -0.640625q0.109375 -0.453125 1.234375 -4.734375l1.0 0l-1.765625 6.421875l-1.15625 0zm12.732422 0l-1.0625 -2.71875l-3.4375 0l-1.046875 2.71875l-1.015625 0l3.390625 -8.609375l0.828125 0l3.375 8.609375l-1.03125 0zm-1.375 -3.625l-1.0 -2.65625q-0.1875 -0.5 -0.390625 -1.234375q-0.140625 0.5625 -0.375 1.234375l-1.0 2.65625l2.765625 0zm9.015625 -2.453125q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm6.2246094 4.21875l0 -8.5625l1.0 0l0 8.5625l-1.0 0zm7.345703 -1.75q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m343.54068 162.17848l249.0079 0l0 203.0236l-249.0079 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m343.54068 162.17848l249.0079 0l0 203.0236l-249.0079 0z" fill-rule="evenodd"/><path fill="#434343" d="m356.54068 353.1271q-1.546875 0 -2.40625 -0.953125q-0.84375 -0.953125 -0.84375 -2.6875q0 -1.796875 0.859375 -2.765625q0.859375 -0.984375 2.453125 -0.984375q0.515625 0 1.03125 0.109375q0.515625 0.109375 0.8125 0.265625l-0.328125 0.921875q-0.359375 -0.15625 -0.796875 -0.25q-0.421875 -0.09375 -0.734375 -0.09375q-2.171875 0 -2.171875 2.78125q0 1.3125 0.515625 2.015625q0.53125 0.703125 1.578125 0.703125q0.890625 0 1.828125 -0.390625l0 0.96875q-0.71875 0.359375 -1.796875 0.359375zm4.5639343 -0.125l-1.078125 0l0 -10.125l1.078125 0l0 10.125zm3.3710938 0l-1.078125 0l0 -7.125l1.078125 0l0 7.125zm-1.171875 -9.0625q0 -0.375 0.1875 -0.546875q0.1875 -0.171875 0.453125 -0.171875q0.265625 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.546875q0 0.359375 -0.1875 0.546875q-0.1875 0.171875 -0.453125 0.171875q-0.265625 0 -0.453125 -0.171875q-0.1875 -0.1875 -0.1875 -0.546875zm6.480438 9.1875q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm9.649445 6.359375l0 -4.609375q0 -0.875 -0.40625 -1.296875q-0.390625 -0.4375 -1.234375 -0.4375q-1.125 0 -1.65625 0.609375q-0.515625 0.59375 -0.515625 2.0l0 3.734375l-1.078125 0l0 -7.125l0.890625 0l0.171875 0.96875l0.046875 0q0.328125 -0.53125 0.921875 -0.8125q0.609375 -0.296875 1.34375 -0.296875q1.296875 0 1.9375 0.625q0.65625 0.625 0.65625 1.984375l0 4.65625l-1.078125 0zm5.6022644 -0.765625q0.28125 0 0.546875 -0.03125q0.265625 -0.046875 0.421875 -0.09375l0 0.828125q-0.171875 0.078125 -0.515625 0.125q-0.34375 0.0625 -0.609375 0.0625q-2.078125 0 -2.078125 -2.171875l0 -4.25l-1.015625 0l0 -0.515625l1.015625 -0.453125l0.453125 -1.515625l0.625 0l0 1.65625l2.078125 0l0 0.828125l-2.078125 0l0 4.203125q0 0.640625 0.3125 0.984375q0.3125 0.34375 0.84375 0.34375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m78.215225 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.215225 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m106.46312 310.84216l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.437546 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.859421 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.406296 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.343796 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375458 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m98.353676 324.24844q0.4375 0 0.84375 -0.25q0.40625 -0.25 0.65625 -0.671875l0.625 0.40625q-0.375 0.625 -0.875 0.9375q-0.5 0.296875 -1.21875 0.296875q-0.84375 0 -1.5 -0.40625q-0.65625 -0.421875 -1.046875 -1.265625q-0.390625 -0.859375 -0.390625 -2.15625q0 -1.375 0.421875 -2.234375q0.421875 -0.859375 1.0625 -1.21875q0.65625 -0.375 1.40625 -0.375q0.78125 0 1.359375 0.390625q0.59375 0.390625 0.890625 1.078125l-0.71875 0.34375q-0.015625 0 -0.015625 0q0 -0.015625 0 -0.015625q-0.3125 -0.625 -0.703125 -0.875q-0.375 -0.25 -0.84375 -0.25q-0.9375 0 -1.484375 0.828125q-0.546875 0.8125 -0.546875 2.28125q0 0.921875 0.265625 1.640625q0.28125 0.71875 0.75 1.125q0.484375 0.390625 1.0625 0.390625zm1.375 -5.171875q0.015625 -0.015625 0.015625 -0.015625q0.03125 0 0.109375 0.0625l-0.09375 0.046875l-0.03125 -0.09375zm0.140625 0.046875q0.046875 0.109375 -0.015625 0l0.015625 0zm4.093796 5.8125q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm3.8594208 -4.859375l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.593796 0q-0.1875 0.96875 -0.796875 2.40625l-1.328125 3.078125l-0.671875 0l-2.171875 -5.484375l0.859375 0l1.6875 4.296875l0.890625 -2.03125q0.546875 -1.25 0.71875 -2.265625l0.8125 0zm3.8125458 5.609375q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640663 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.37499237 0.21875 -0.6249924 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.48436737 -0.296875 1.0468674 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm6.343796 3.796875q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375458 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m230.91296 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m230.91296 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m259.16086 310.84216l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.437561 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.8594055 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.406311 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.3437805 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.937561 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m245.59827 324.8422l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm6.015671 0l0 -0.703125l1.609375 0l0 -6.578125l-1.546875 0l0 -0.703125l2.421875 0l0 7.28125l1.609375 0l0 0.703125l-4.09375 0zm7.968796 -5.609375q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.718811 0.03125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.8594055 0.625q-0.484375 0 -0.90625 -0.21875q-0.421875 -0.21875 -0.703125 -0.625l-0.3125 0.71875l-0.546875 0l0 -7.984375l0.984375 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.8125q0.265625 -0.453125 0.71875 -0.703125q0.453125 -0.265625 0.921875 -0.265625q1.03125 0 1.640625 0.71875q0.609375 0.71875 0.609375 2.09375q0 0.9375 -0.328125 1.609375q-0.3125 0.65625 -0.84375 0.984375q-0.53125 0.328125 -1.125 0.328125zm-0.109375 -0.765625q0.65625 0 1.078125 -0.5q0.4375 -0.515625 0.4375 -1.609375q0 -1.046875 -0.40625 -1.578125q-0.390625 -0.546875 -1.078125 -0.546875q-0.671875 0 -1.09375 0.609375q-0.421875 0.59375 -0.421875 1.546875q0 2.078125 1.484375 2.078125zm5.578186 0.765625q-0.875 0 -1.390625 -0.640625q-0.515625 -0.640625 -0.5 -1.90625l0.015625 -3.0625l0.84375 0l0 3.0625q0 0.984375 0.328125 1.421875q0.34375 0.4375 0.921875 0.4375q0.609375 0 1.03125 -0.484375q0.4375 -0.484375 0.4375 -1.40625l0 -3.03125l0.84375 0l0 4.625q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.09375 0.375l-0.828125 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.46875q-0.265625 0.453125 -0.71875 0.71875q-0.453125 0.25 -0.984375 0.25zm8.5625305 -6.78125l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm6.015686 -0.015625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm4.0625305 6.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m359.81628 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.81628 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m388.06418 310.84216l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.4375305 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.859436 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.4062805 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.343811 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m371.76718 324.8422l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm3.8750305 1.15625l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.468811 4.984375q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm1.7344055 -1.1875l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm8.015686 -3.71875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.2812805 4.421875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.859436 2.78125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m318.40903 313.32217l41.417328 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m318.40903 313.32217l37.990265 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m356.39926 313.32217l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m495.33072 255.95735l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 255.95735l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m531.6832 271.48904l2.90625 0l0 4.15625q-0.6875 0.21875 -1.390625 0.328125q-0.703125 0.125 -1.625 0.125q-1.9375 0 -3.03125 -1.15625q-1.078125 -1.171875 -1.078125 -3.25q0 -1.34375 0.53125 -2.34375q0.546875 -1.0 1.546875 -1.53125q1.015625 -0.53125 2.359375 -0.53125q1.375 0 2.5625 0.5l-0.390625 0.875q-1.15625 -0.484375 -2.234375 -0.484375q-1.5625 0 -2.453125 0.9375q-0.875 0.921875 -0.875 2.578125q0 1.734375 0.84375 2.640625q0.859375 0.890625 2.5 0.890625q0.890625 0 1.734375 -0.21875l0 -2.625l-1.90625 0l0 -0.890625zm10.392578 -1.59375q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m532.15686 310.04532q-1.40625 0 -2.234375 0.9375q-0.8125 0.9375 -0.8125 2.578125q0 1.671875 0.78125 2.59375q0.796875 0.921875 2.25 0.921875q0.90625 0 2.046875 -0.328125l0 0.875q-0.890625 0.34375 -2.1875 0.34375q-1.890625 0 -2.921875 -1.15625q-1.03125 -1.15625 -1.03125 -3.265625q0 -1.328125 0.484375 -2.3125q0.5 -1.0 1.4375 -1.53125q0.9375 -0.546875 2.203125 -0.546875q1.34375 0 2.359375 0.484375l-0.421875 0.859375q-0.984375 -0.453125 -1.953125 -0.453125zm9.3359375 1.71875q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m165.71129 313.32217l65.19685 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m165.71129 313.32217l61.76976 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m227.48105 313.32217l-1.124588 1.1245728l3.0897675 -1.1245728l-3.0897675 -1.1245728z" fill-rule="evenodd"/><path fill="#d9ead3" d="m359.81628 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.81628 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m384.31497 114.11024l77.480316 0l0 31.748024l-77.480316 0z" fill-rule="evenodd"/><path fill="#000000" d="m401.3306 130.27086q0 2.109375 -1.15625 3.234375q-1.140625 1.125 -3.3125 1.125l-2.375 0l0 -8.5625l2.625 0q2.0 0 3.109375 1.109375q1.109375 1.09375 1.109375 3.09375zm-1.046875 0.03125q0 -1.671875 -0.84375 -2.515625q-0.84375 -0.859375 -2.5 -0.859375l-1.453125 0l0 6.84375l1.21875 0q1.78125 0 2.671875 -0.875q0.90625 -0.875 0.90625 -2.59375zm6.763672 4.328125l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm7.001953 0q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm6.111328 0.6875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm10.822266 0.6875l-1.0 0l0 -7.6875l-2.703125 0l0 -0.875l6.421875 0l0 0.875l-2.71875 0l0 7.6875zm2.8417969 -6.421875l1.046875 0l1.40625 3.65625q0.453125 1.265625 0.5625 1.8125l0.046875 0q0.078125 -0.296875 0.3125 -1.015625q0.25 -0.734375 1.609375 -4.453125l1.03125 0l-2.75 7.3125q-0.421875 1.078125 -0.96875 1.53125q-0.546875 0.46875 -1.34375 0.46875q-0.4375 0 -0.875 -0.109375l0 -0.78125q0.328125 0.078125 0.71875 0.078125q1.0 0 1.4375 -1.125l0.359375 -0.921875l-2.59375 -6.453125zm10.046875 6.546875q-0.625 0 -1.140625 -0.234375q-0.515625 -0.234375 -0.875 -0.71875l-0.0625 0q0.0625 0.5625 0.0625 1.0625l0 2.65625l-0.96875 0l0 -9.3125l0.796875 0l0.125 0.875l0.046875 0q0.375 -0.53125 0.875 -0.765625q0.5 -0.234375 1.140625 -0.234375q1.28125 0 1.96875 0.875q0.703125 0.875 0.703125 2.453125q0 1.578125 -0.703125 2.46875q-0.703125 0.875 -1.96875 0.875zm-0.140625 -5.84375q-0.984375 0 -1.421875 0.546875q-0.4375 0.546875 -0.453125 1.734375l0 0.21875q0 1.359375 0.453125 1.9375q0.453125 0.578125 1.453125 0.578125q0.828125 0 1.296875 -0.671875q0.46875 -0.671875 0.46875 -1.859375q0 -1.203125 -0.46875 -1.84375q-0.46875 -0.640625 -1.328125 -0.640625zm7.2285156 5.84375q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m466.6929 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m466.6929 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m491.1916 114.11024l100.0 0l0 31.748024l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m501.36346 134.63023l0 -8.5625l1.0 0l0 8.5625l-1.0 0zm7.595703 0l0 -4.15625q0 -0.78125 -0.359375 -1.171875q-0.34375 -0.390625 -1.109375 -0.390625q-1.015625 0 -1.484375 0.546875q-0.46875 0.546875 -0.46875 1.796875l0 3.375l-0.96875 0l0 -6.421875l0.796875 0l0.15625 0.875l0.046875 0q0.296875 -0.46875 0.828125 -0.734375q0.546875 -0.265625 1.203125 -0.265625q1.171875 0 1.75 0.5625q0.59375 0.5625 0.59375 1.796875l0 4.1875l-0.984375 0zm5.8652344 -5.671875l-1.625 0l0 5.671875l-0.984375 0l0 -5.671875l-1.140625 0l0 -0.4375l1.140625 -0.34375l0 -0.359375q0 -2.375 2.078125 -2.375q0.5 0 1.1875 0.203125l-0.25 0.78125q-0.5625 -0.171875 -0.953125 -0.171875q-0.5625 0 -0.828125 0.375q-0.25 0.359375 -0.25 1.15625l0 0.421875l1.625 0l0 0.75zm4.1132812 -0.875q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm5.9140625 6.546875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm9.064453 -1.0625q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625zm3.6621094 1.0625q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm5.095703 -5.859375q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm2.8828125 0.125l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.005859 6.546875q-1.390625 0 -2.15625 -0.859375q-0.765625 -0.859375 -0.765625 -2.4375q0 -1.609375 0.78125 -2.484375q0.78125 -0.890625 2.203125 -0.890625q0.46875 0 0.921875 0.109375q0.46875 0.09375 0.734375 0.234375l-0.296875 0.828125q-0.328125 -0.140625 -0.703125 -0.21875q-0.375 -0.078125 -0.671875 -0.078125q-1.953125 0 -1.953125 2.484375q0 1.1875 0.46875 1.828125q0.484375 0.625 1.421875 0.625q0.796875 0 1.640625 -0.34375l0 0.859375q-0.640625 0.34375 -1.625 0.34375zm5.2285156 -0.8125q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm3.0800781 -5.734375l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.380859 -0.125q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm4.6796875 6.671875q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m27.741055 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m27.741055 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m60.082745 162.6372l0.984375 0l0 0.078125q-0.0625 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.671875l2.9375 0l0 -3.1875l0.9843712 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.953125l-0.8906212 0l0 -3.5625l-2.921875 0l0 3.5625l-0.875 0l0 -7.46875zm6.046917 0l1.75 0q0.921875 0 1.453125 0.25q0.546875 0.234375 0.9375 0.765625q0.734375 0.984375 0.734375 2.75q-0.0625 1.8125 -0.84375 2.78125q-0.765625 0.953125 -2.421875 0.9375l-1.609375 0l0 -7.484375zm1.5625 6.828125q2.484375 0 2.484375 -3.0q-0.015625 -1.53125 -0.578125 -2.328125q-0.546875 -0.796875 -1.765625 -0.796875l-0.90625 0l0 6.125l0.765625 0zm4.734421 0.640625l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm8.140671 -4.859375q0.65625 0 1.15625 0.3125q0.515625 0.296875 0.8125 0.875q0.296875 0.5625 0.296875 1.3125q0 0.765625 -0.3125 1.328125q-0.296875 0.5625 -0.84375 0.859375q-0.53125 0.296875 -1.203125 0.296875q-0.6875 0 -1.265625 -0.296875q-0.578125 -0.296875 -0.953125 -0.84375l0.671875 -0.515625l0.015625 0q0.015625 0 0.015625 0q0 0 0 0q0.3125 0.484375 0.65625 0.71875q0.34375 0.21875 0.90625 0.21875q0.390625 0 0.71875 -0.21875q0.34375 -0.234375 0.53125 -0.625q0.203125 -0.40625 0.203125 -0.953125q0 -0.8125 -0.4375 -1.28125q-0.4375 -0.484375 -1.09375 -0.484375q-0.390625 0 -0.765625 0.1875q-0.359375 0.171875 -0.640625 0.515625l-0.53125 -0.21875l0.25 -3.796875l3.796875 0l0 0.75l-3.078125 0l-0.125 2.140625q0.59375 -0.28125 1.21875 -0.28125zm-1.625 3.328125q-0.0625 -0.09375 0.015625 -0.015625l-0.015625 0.015625zm0.125 0q0 0.0625 -0.109375 -0.015625l0.0625 -0.046875l0.046875 0.0625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m26.839895 63.718502l87.49606 0l0 30.99213l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m26.839895 63.718502l87.49606 0l0 30.99213l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m51.915867 83.234566q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm5.890671 -6.15625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm3.6562958 6.765625q-0.296875 0 -0.5 -0.203125q-0.203125 -0.203125 -0.203125 -0.46875q0 -0.28125 0.203125 -0.484375q0.203125 -0.203125 0.5 -0.203125q0.265625 0 0.46875 0.203125q0.21875 0.203125 0.21875 0.484375q0 0.265625 -0.21875 0.46875q-0.203125 0.203125 -0.46875 0.203125zm5.1250496 -3.875l-0.578125 0.65625l0 3.09375l-0.90625 0l0 -7.46875l1.015625 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.921875l3.125 -3.5q0.296875 0.0625 0.609375 0.0625l0.3125 0l-2.828125 3.21875l3.03125 4.25l-1.078125 0.046875l-2.59375 -3.796875zm7.281296 3.875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.9375458 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm6.781296 -2.703125q1.03125 0.3125 1.453125 0.6875q0.4375 0.359375 0.4375 0.953125q0 0.734375 -0.59375 1.234375q-0.578125 0.484375 -1.671875 0.484375q-1.390625 0 -2.328125 -0.875l0.46875 -0.8125l0.015625 -0.015625l0.015625 0.015625q0.375 0.484375 0.765625 0.734375q0.40625 0.234375 1.078125 0.234375q0.65625 0 1.015625 -0.234375q0.375 -0.234375 0.375 -0.640625q0 -0.359375 -0.296875 -0.578125q-0.296875 -0.234375 -1.078125 -0.484375q-2.0625 -0.59375 -2.0625 -1.703125q0 -0.640625 0.515625 -1.0q0.53125 -0.375 1.5 -0.375q0.75 0 1.25 0.203125q0.515625 0.203125 0.9375 0.65625l-0.5 0.59375l0 0.015625q-0.265625 -0.390625 -0.734375 -0.609375q-0.453125 -0.21875 -0.921875 -0.21875q-0.515625 0 -0.859375 0.1875q-0.328125 0.171875 -0.328125 0.5q0 0.296875 0.328125 0.546875q0.34375 0.25 1.21875 0.5zm1.15625 -0.875q0 -0.0625 0.09375 0l-0.03125 0.046875l-0.0625 -0.046875zm0.140625 -0.03125q0.03125 0.046875 0.015625 0.0625q0 0.015625 -0.03125 0q-0.015625 -0.015625 -0.03125 -0.03125l0.046875 -0.03125zm-3.375 2.53125q0 0.046875 -0.109375 0l0.03125 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.03125 -0.046875 -0.03125 -0.046875q0.015625 0 0.0625 0.015625l-0.03125 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m127.456696 63.71982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.456696 63.71982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m137.97523 83.73588l1.8125 -8.5625l1.0 0l-1.625 7.65625l3.3125 0l-0.1875 0.90625l-4.3125 0zm8.955078 -6.5q1.109375 0 1.75 0.65625q0.65625 0.65625 0.65625 1.8125q0 1.09375 -0.421875 2.078125q-0.421875 0.984375 -1.140625 1.515625q-0.71875 0.53125 -1.625 0.53125q-1.125 0 -1.765625 -0.65625q-0.640625 -0.671875 -0.640625 -1.8125q0 -1.125 0.421875 -2.09375q0.4375 -0.984375 1.15625 -1.5q0.734375 -0.53125 1.609375 -0.53125zm1.390625 2.328125q0 -0.671875 -0.375 -1.078125q-0.359375 -0.421875 -0.984375 -0.421875q-0.640625 0 -1.15625 0.4375q-0.5 0.421875 -0.78125 1.203125q-0.28125 0.765625 -0.28125 1.703125q0 0.765625 0.375 1.1875q0.375 0.421875 1.078125 0.421875q0.609375 0 1.09375 -0.421875q0.484375 -0.4375 0.75 -1.21875q0.28125 -0.796875 0.28125 -1.8125zm6.451172 4.171875l-0.1875 -3.53125q-0.03125 -0.546875 -0.03125 -1.0l0 -0.921875l-0.046875 0l-0.296875 0.6875l-0.484375 1.109375l-1.703125 3.65625l-1.1875 0l-0.25 -6.421875l0.953125 0l0.109375 3.484375l0 0.515625q0 0.859375 -0.046875 1.578125l0.03125 0q0.28125 -0.734375 0.8125 -1.890625l1.71875 -3.6875l1.078125 0l0.21875 3.484375q0.03125 0.984375 0.03125 1.53125l0 0.3125l-0.015625 0.25l0.03125 0q0.171875 -0.515625 0.484375 -1.28125q0.328125 -0.78125 1.90625 -4.296875l1.03125 0l-2.953125 6.421875l-1.203125 0zm8.15625 0l-0.984375 0l1.953125 -9.125l0.984375 0l-1.953125 9.125zm4.625 0.125q-1.078125 0 -1.703125 -0.640625q-0.609375 -0.640625 -0.609375 -1.78125q0 -1.09375 0.4375 -2.109375q0.4375 -1.015625 1.15625 -1.578125q0.71875 -0.5625 1.578125 -0.5625q0.90625 0 1.359375 0.390625q0.453125 0.390625 0.453125 1.09375q0 1.046875 -0.984375 1.65625q-0.96875 0.59375 -2.78125 0.59375l-0.1875 0l-0.03125 0.46875q0 0.765625 0.359375 1.203125q0.359375 0.4375 1.125 0.4375q0.359375 0 0.75 -0.109375q0.390625 -0.109375 0.96875 -0.390625l0 0.859375q-0.546875 0.25 -0.96875 0.359375q-0.421875 0.109375 -0.921875 0.109375zm0.8125 -5.828125q-0.609375 0 -1.140625 0.5625q-0.53125 0.546875 -0.8125 1.515625l0.078125 0q1.328125 0 2.03125 -0.34375q0.71875 -0.34375 0.71875 -1.015625q0 -0.3125 -0.21875 -0.515625q-0.203125 -0.203125 -0.65625 -0.203125zm3.5273438 5.703125l-0.734375 -6.421875l0.984375 0l0.375 3.59375q0.140625 1.515625 0.140625 2.125l0.03125 0q0.75 -1.625 1.046875 -2.1875l1.90625 -3.53125l1.046875 0l-3.46875 6.421875l-1.328125 0zm7.1210938 0.125q-1.078125 0 -1.703125 -0.640625q-0.609375 -0.640625 -0.609375 -1.78125q0 -1.09375 0.4375 -2.109375q0.4375 -1.015625 1.15625 -1.578125q0.71875 -0.5625 1.578125 -0.5625q0.90625 0 1.359375 0.390625q0.453125 0.390625 0.453125 1.09375q0 1.046875 -0.984375 1.65625q-0.96875 0.59375 -2.78125 0.59375l-0.1875 0l-0.03125 0.46875q0 0.765625 0.359375 1.203125q0.359375 0.4375 1.125 0.4375q0.359375 0 0.75 -0.109375q0.390625 -0.109375 0.96875 -0.390625l0 0.859375q-0.546875 0.25 -0.96875 0.359375q-0.421875 0.109375 -0.921875 0.109375zm0.8125 -5.828125q-0.609375 0 -1.140625 0.5625q-0.53125 0.546875 -0.8125 1.515625l0.078125 0q1.328125 0 2.03125 -0.34375q0.71875 -0.34375 0.71875 -1.015625q0 -0.3125 -0.21875 -0.515625q-0.203125 -0.203125 -0.65625 -0.203125zm3.5273438 5.703125l-0.984375 0l1.953125 -9.125l0.984375 0l-1.953125 9.125zm9.6171875 -2.71875l-2.90625 0l-1.4375 2.71875l-1.109375 0l4.6875 -8.5625l1.015625 0l1.078125 8.5625l-1.0 0l-0.328125 -2.71875zm-0.109375 -0.921875l-0.203125 -1.75q-0.140625 -1.046875 -0.171875 -2.046875q-0.21875 0.515625 -0.46875 1.03125q-0.25 0.5 -1.46875 2.765625l2.3125 0zm8.802734 -2.71875q0 1.453125 -0.96875 2.21875q-0.953125 0.765625 -2.8125 0.765625l-0.796875 0l-0.71875 3.375l-1.0 0l1.8125 -8.5625l1.9375 0q1.25 0 1.890625 0.5625q0.65625 0.546875 0.65625 1.640625zm-4.390625 2.140625l0.78125 0q1.265625 0 1.921875 -0.53125q0.65625 -0.546875 0.65625 -1.578125q0 -0.734375 -0.40625 -1.046875q-0.40625 -0.328125 -1.25 -0.328125l-0.96875 0l-0.734375 3.484375zm4.8847656 4.21875l1.828125 -8.5625l0.984375 0l-1.828125 8.5625l-0.984375 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 214.08858l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 214.08858l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m527.26624 234.10464l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm9.046875 0l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm10.8515625 0l-1.0625 -2.71875l-3.4375 0l-1.046875 2.71875l-1.015625 0l3.390625 -8.609375l0.828125 0l3.375 8.609375l-1.03125 0zm-1.375 -3.625l-1.0 -2.65625q-0.1875 -0.5 -0.390625 -1.234375q-0.140625 0.5625 -0.375 1.234375l-1.0 2.65625l2.765625 0zm9.015625 -2.453125q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm6.2246094 4.21875l0 -8.5625l1.0 0l0 8.5625l-1.0 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 172.21982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 172.21982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m535.1793 191.61089q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m71.48908 181.08202l0 27.68892l50.58268 0l0 27.681168" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m71.48908 181.08202l0 27.68892l50.582672 0l0 24.254074" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.071754 233.02501l-1.1245804 -1.1245728l1.1245804 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 0.06298828l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 313.38516l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -41.858276l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -41.858276l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 271.4639l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -83.74803l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -83.74803l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 229.57414l-1.1245728 1.124588l3.0897522 -1.124588l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -125.60629l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 187.71588l-1.1245728 1.124588l3.0897522 -1.124588l-3.0897522 -1.124588z" fill-rule="evenodd"/><path fill="#d9ead3" d="m128.68855 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m128.68855 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m146.01448 165.98096q1.0625 0.4375 1.46875 0.90625q0.40625 0.46875 0.40625 1.171875q0 0.5625 -0.265625 1.0625q-0.265625 0.484375 -0.828125 0.796875q-0.5625 0.3125 -1.390625 0.3125q-1.453125 0 -2.34375 -0.953125l0.4375 -0.75l0 -0.015625q0 0 0 0.015625q0 0 0 0q0.328125 0.421875 0.828125 0.6875q0.515625 0.25 1.1875 0.25q0.671875 0 1.109375 -0.359375q0.4375 -0.375 0.4375 -0.921875q0 -0.34375 -0.140625 -0.578125q-0.125 -0.234375 -0.484375 -0.453125q-0.359375 -0.234375 -1.078125 -0.546875q-1.109375 -0.4375 -1.59375 -0.984375q-0.46875 -0.5625 -0.46875 -1.234375q0 -0.84375 0.609375 -1.34375q0.625 -0.515625 1.671875 -0.515625q0.609375 0 1.140625 0.25q0.546875 0.25 0.9375 0.6875l-0.46875 0.625l-0.015625 0.015625q-0.359375 -0.484375 -0.75 -0.671875q-0.390625 -0.203125 -0.96875 -0.203125q-0.578125 0 -0.9375 0.328125q-0.34375 0.3125 -0.34375 0.765625q0 0.34375 0.140625 0.609375q0.15625 0.25 0.546875 0.5q0.390625 0.25 1.15625 0.546875zm1.03125 -1.84375q0 -0.046875 0.046875 -0.015625q0.046875 0.015625 0.0625 0.015625l-0.03125 0.046875l-0.078125 -0.046875l0 0zm0.125 -0.03125q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm-3.546875 4.375q0 0.03125 -0.046875 0.015625q-0.046875 -0.03125 -0.0625 -0.03125l0.03125 -0.046875l0.078125 0.046875l0 0.015625zm-0.125 0.03125q-0.078125 -0.09375 0.015625 -0.046875l-0.015625 0.046875zm7.859421 -4.015625q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.843796 -4.953125q-0.1875 0.96875 -0.796875 2.40625l-1.328125 3.078125l-0.671875 0l-2.171875 -5.484375l0.859375 0l1.6875 4.296875l0.890625 -2.03125q0.546875 -1.25 0.71875 -2.265625l0.8125 0zm3.8125458 5.609375q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm4.343796 3.40625q-0.59375 0 -1.109375 -0.328125q-0.515625 -0.34375 -0.84375 -1.0q-0.3125 -0.65625 -0.3125 -1.59375q0 -0.953125 0.328125 -1.578125q0.34375 -0.640625 0.859375 -0.9375q0.53125 -0.3125 1.125 -0.3125q0.546875 0 0.953125 0.25q0.421875 0.25 0.640625 0.6875l0 -3.296875l0.90625 0l0 0.09375q-0.0625 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0.015625 6.59375q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.109375 0.375l-0.859375 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.484375q-0.265625 0.46875 -0.6875 0.734375q-0.40625 0.25 -0.921875 0.25zm0.125 -0.765625q0.75 0 1.09375 -0.578125q0.34375 -0.59375 0.34375 -1.546875q0 -0.984375 -0.375 -1.5625q-0.375 -0.59375 -1.125 -0.59375q-0.734375 0 -1.125 0.53125q-0.375 0.53125 -0.375 1.46875q0 1.046875 0.40625 1.671875q0.40625 0.609375 1.15625 0.609375zm3.5469208 0.640625l0 -7.46875l0.671875 0l1.84375 3.65625l1.890625 -3.671875l0.625 0l0 7.484375l-0.78125 0l0 -5.640625l-1.625 3.015625l-0.328125 0l-1.515625 -2.984375l0 5.609375l-0.78125 0zm8.515671 0.09375q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm5.812546 0.75q-0.59375 0 -1.109375 -0.328125q-0.515625 -0.34375 -0.84375 -1.0q-0.3125 -0.65625 -0.3125 -1.59375q0 -0.953125 0.328125 -1.578125q0.34375 -0.640625 0.859375 -0.9375q0.53125 -0.3125 1.125 -0.3125q0.546875 0 0.953125 0.25q0.421875 0.25 0.640625 0.6875l0 -3.296875l0.90625 0l0 0.09375q-0.0625 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0.015625 6.59375q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.109375 0.375l-0.859375 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.484375q-0.265625 0.46875 -0.6875 0.734375q-0.40625 0.25 -0.921875 0.25zm0.125 -0.765625q0.75 0 1.09375 -0.578125q0.34375 -0.59375 0.34375 -1.546875q0 -0.984375 -0.375 -1.5625q-0.375 -0.59375 -1.125 -0.59375q-0.734375 0 -1.125 0.53125q-0.375 0.53125 -0.375 1.46875q0 1.046875 0.40625 1.671875q0.40625 0.609375 1.15625 0.609375zm6.328171 0.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm2.4844208 3.28125l0 -0.703125l1.609375 0l0 -6.578125l-1.546875 0l0 -0.703125l2.421875 0l0 7.28125l1.609375 0l0 0.703125l-4.09375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m171.20473 94.711945l1.2283478 55.37007" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m171.20473 94.711945l1.1523285 51.943832" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m172.35706 146.65578l-1.1492462 -1.0993652l1.1928406 3.0640717l1.0557709 -3.1139526z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m172.43658 181.08202l0 27.68892l-50.362198 0l0 27.681168" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m172.4366 181.08202l0 27.68892l-50.362213 0l0 24.254074" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.07439 233.02501l-1.124588 -1.1245728l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m70.58793 94.71063l0.9133835 55.37007" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m70.58793 94.71063l0.8568573 51.94345" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m71.444786 146.65408l-1.1429749 -1.1058807l1.1753922 3.0708008l1.0734634 -3.107895z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m95.068245 94.66404l0 27.763779l54.01574 0l0 27.763786" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m95.068245 94.66404l0 27.763779l54.01574 0l0 24.336693" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m149.08398 146.76451l-1.1245728 -1.1245728l1.1245728 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/><path fill="#d9ead3" d="m78.32767 236.45998l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.32767 236.45998l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m101.46614 248.8823q0.4375 0 0.84375 -0.25q0.40625 -0.25 0.65625 -0.671875l0.625 0.40625q-0.375 0.625 -0.875 0.9375q-0.5 0.296875 -1.21875 0.296875q-0.84375 0 -1.5 -0.40625q-0.65625 -0.421875 -1.046875 -1.265625q-0.390625 -0.859375 -0.390625 -2.15625q0 -1.375 0.421875 -2.234375q0.421875 -0.859375 1.0625 -1.21875q0.65625 -0.375 1.40625 -0.375q0.78125 0 1.359375 0.390625q0.59375 0.390625 0.890625 1.078125l-0.71875 0.34375q-0.015625 0 -0.015625 0q0 -0.015625 0 -0.015625q-0.3125 -0.625 -0.703125 -0.875q-0.375 -0.25 -0.84375 -0.25q-0.9375 0 -1.484375 0.828125q-0.546875 0.8125 -0.546875 2.28125q0 0.921875 0.265625 1.640625q0.28125 0.71875 0.75 1.125q0.484375 0.390625 1.0625 0.390625zm1.375 -5.171875q0.015625 -0.015625 0.015625 -0.015625q0.03125 0 0.109375 0.0625l-0.09375 0.046875l-0.03125 -0.09375zm0.140625 0.046875q0.046875 0.109375 -0.015625 0l0.015625 0zm4.093796 5.8125q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm3.8594208 -4.859375l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm8.609421 4.84375q0.828125 0 1.421875 -0.671875l0.515625 0.578125q-0.8125 0.859375 -1.984375 0.859375q-0.796875 0 -1.421875 -0.359375q-0.625 -0.375 -0.984375 -1.03125q-0.34375 -0.65625 -0.34375 -1.46875q0 -0.8125 0.34375 -1.453125q0.359375 -0.65625 0.984375 -1.03125q0.625 -0.375 1.40625 -0.375q0.65625 0 1.1875 0.28125q0.546875 0.265625 0.890625 0.734375l-0.546875 0.53125l0 0.015625q-0.359375 -0.453125 -0.71875 -0.640625q-0.359375 -0.1875 -0.90625 -0.1875q-0.46875 0 -0.875 0.265625q-0.390625 0.25 -0.640625 0.71875q-0.234375 0.46875 -0.234375 1.078125q0 0.609375 0.234375 1.109375q0.25 0.484375 0.6875 0.765625q0.4375 0.28125 0.984375 0.28125zm1.328125 -3.375q0 -0.078125 0.109375 0l-0.046875 0.0625l-0.0625 -0.0625zm0.140625 -0.015625q0.046875 0.078125 0.015625 0.0625q-0.015625 -0.03125 -0.046875 -0.046875l0.03125 -0.015625zm6.171921 -0.3125l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.281296 4.421875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.859421 2.78125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375458 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m99.013016 263.47604l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm7.718796 0.125q-0.875 0 -1.390625 -0.640625q-0.515625 -0.640625 -0.5 -1.90625l0.015625 -3.0625l0.84375 0l0 3.0625q0 0.984375 0.328125 1.421875q0.34375 0.4375 0.921875 0.4375q0.609375 0 1.03125 -0.484375q0.4375 -0.484375 0.4375 -1.40625l0 -3.03125l0.84375 0l0 4.625q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.09375 0.375l-0.828125 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.46875q-0.265625 0.453125 -0.71875 0.71875q-0.453125 0.25 -0.984375 0.25zm4.203171 -5.609375l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm8.609421 4.84375q0.828125 0 1.421875 -0.671875l0.515625 0.578125q-0.8125 0.859375 -1.984375 0.859375q-0.796875 0 -1.421875 -0.359375q-0.625 -0.375 -0.984375 -1.03125q-0.34375 -0.65625 -0.34375 -1.46875q0 -0.8125 0.34375 -1.453125q0.359375 -0.65625 0.984375 -1.03125q0.625 -0.375 1.40625 -0.375q0.65625 0 1.1875 0.28125q0.546875 0.265625 0.890625 0.734375l-0.546875 0.53125l0 0.015625q-0.359375 -0.453125 -0.71875 -0.640625q-0.359375 -0.1875 -0.90625 -0.1875q-0.46875 0 -0.875 0.265625q-0.390625 0.25 -0.640625 0.71875q-0.234375 0.46875 -0.234375 1.078125q0 0.609375 0.234375 1.109375q0.25 0.484375 0.6875 0.765625q0.4375 0.28125 0.984375 0.28125zm1.328125 -3.375q0 -0.078125 0.109375 0l-0.046875 0.0625l-0.0625 -0.0625zm0.140625 -0.015625q0.046875 0.078125 0.015625 0.0625q-0.015625 -0.03125 -0.046875 -0.046875l0.03125 -0.015625zm6.390671 3.53125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm1.8750458 0.5l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm6.015671 6.734375q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm3.8594208 -4.859375l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m122.0757 267.4521l-0.12598419 30.362213" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m122.0757 267.4521l-0.11177063 26.93515" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m121.96393 294.38724l-1.1199036 -1.129242l1.1117554 3.0944214l1.1373901 -3.085083z" fill-rule="evenodd"/></g></svg>
diff --git a/tensorflow/lite/g3doc/tfmobile/android_build.md b/tensorflow/lite/g3doc/tfmobile/android_build.md
deleted file mode 100644
index f8c0243298e435382a7514e72ada89880fb00c1c..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/android_build.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Building TensorFlow on Android
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-To get you started working with TensorFlow on Android, we'll walk through two
-ways to build our TensorFlow mobile demos and deploying them on an Android
-device. The first is Android Studio, which lets you build and deploy in an
-IDE. The second is building with Bazel and deploying with ADB on the command
-line.
-
-Why choose one or the other of these methods?
-
-The simplest way to use TensorFlow on Android is to use Android Studio. If you
-aren't planning to customize your TensorFlow build at all, or if you want to use
-Android Studio's editor and other features to build an app and just want to add
-TensorFlow to it, we recommend using Android Studio.
-
-If you are using custom ops, or have some other reason to build TensorFlow from
-scratch, scroll down and see our instructions
-for [building the demo with Bazel](#build_the_demo_using_bazel).
-
-## Build the demo using Android Studio
-
-**Prerequisites**
-
-If you haven't already, do the following two things:
-
-- Install [Android Studio](https://developer.android.com/studio/index.html),
-  following the instructions on their website.
-
-- Clone the TensorFlow repository from GitHub:
-
-        git clone https://github.com/tensorflow/tensorflow
-
-**Building**
-
-1. Open Android Studio, and from the Welcome screen, select **Open an existing
-   Android Studio project**.
-
-2. From the **Open File or Project** window that appears, navigate to and select
-    the `tensorflow/examples/android` directory from wherever you cloned the
-    TensorFlow GitHub repo.  Click OK.
-
-    If it asks you to do a Gradle Sync, click OK.
-
-    You may also need to install various platforms and tools, if you get
-    errors like "Failed to find target with hash string 'android-23' and similar.
-
-3. Open the `build.gradle` file (you can go to **1:Project** in the side panel
-    and find it under the **Gradle Scripts** zippy under **Android**). Look for
-    the `nativeBuildSystem` variable and set it to `none` if it isn't already:
-
-        // set to 'bazel', 'cmake', 'makefile', 'none'
-        def nativeBuildSystem = 'none'
-
-4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
-    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
-
-    If it asks you to use Instant Run, click **Proceed Without Instant Run**.
-
-    Also, you need to have an Android device plugged in with developer options
-    enabled at this
-    point. See [here](https://developer.android.com/studio/run/device.html) for
-    more details on setting up developer devices.
-
-This installs three apps on your phone that are all part of the TensorFlow
-Demo. See [Android Sample Apps](#android_sample_apps) for more information about
-them.
-
-## Adding TensorFlow to your apps using Android Studio
-
-To add TensorFlow to your own apps on Android, the simplest way is to add the
-following lines to your Gradle build file:
-
-    allprojects {
-        repositories {
-            jcenter()
-        }
-    }
-
-    dependencies {
-        implementation 'org.tensorflow:tensorflow-android:+'
-    }
-
-This automatically downloads the latest stable version of TensorFlow as an AAR
-and installs it in your project.
-
-## Build the demo using Bazel
-
-Another way to use TensorFlow on Android is to build an APK
-using [Bazel](https://bazel.build/) and load it onto your device
-using [ADB](https://developer.android.com/studio/command-line/adb.html). This
-requires some knowledge of build systems and Android developer tools, but we'll
-guide you through the basics here.
-
-- First, follow our instructions for
-  <a href="http://www.tensorflow.org/install/source">installing from sources</a>.
-  This will also guide you through installing Bazel and cloning the
-  TensorFlow code.
-
-- Download the Android [SDK](https://developer.android.com/studio/index.html)
-  and [NDK](https://developer.android.com/ndk/downloads/index.html) if you do
-  not already have them. You need at least version 12b of the NDK, and 23 of the
-  SDK.
-
-- In your copy of the TensorFlow source, update the
-  [WORKSPACE](https://github.com/tensorflow/tensorflow/blob/master/WORKSPACE)
-  file with the location of your SDK and NDK, where it says &lt;PATH_TO_NDK&gt;
-  and &lt;PATH_TO_SDK&gt;.
-
-- Run Bazel to build the demo APK:
-
-        bazel build -c opt //tensorflow/examples/android:tensorflow_demo
-
-- Use [ADB](https://developer.android.com/studio/command-line/adb.html#move) to
-  install the APK onto your device:
-
-        adb install -r bazel-bin/tensorflow/examples/android/tensorflow_demo.apk
-
-Note: In general when compiling for Android with Bazel you need
-`--config=android` on the Bazel command line, though in this case this
-particular example is Android-only, so you don't need it here.
-
-This installs three apps on your phone that are all part of the TensorFlow
-Demo. See [Android Sample Apps](#android_sample_apps) for more information about
-them.
-
-## Android Sample Apps
-
-The
-[Android example code](https://www.tensorflow.org/code/tensorflow/examples/android/) is
-a single project that builds and installs three sample apps which all use the
-same underlying code. The sample apps all take video input from a phone's
-camera:
-
-- **TF Classify** uses the Inception v3 model to label the objects it’s pointed
-  at with classes from Imagenet. There are only 1,000 categories in Imagenet,
-  which misses most everyday objects and includes many things you’re unlikely to
-  encounter often in real life, so the results can often be quite amusing. For
-  example there’s no ‘person’ category, so instead it will often guess things it
-  does know that are often associated with pictures of people, like a seat belt
-  or an oxygen mask. If you do want to customize this example to recognize
-  objects you care about, you can use
-  the
-  [TensorFlow for Poets codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) as
-  an example for how to train a model based on your own data.
-
-- **TF Detect** uses a multibox model to try to draw bounding boxes around the
-  locations of people in the camera. These boxes are annotated with the
-  confidence for each detection result. Results will not be perfect, as this
-  kind of object detection is still an active research topic.  The demo also
-  includes optical tracking for when objects move between frames, which runs
-  more frequently than the TensorFlow inference. This improves the user
-  experience since the apparent frame rate is faster, but it also gives the
-  ability to estimate which boxes refer to the same object between frames, which
-  is important for counting objects over time.
-
-- **TF Stylize** implements a real-time style transfer algorithm on the camera
-  feed. You can select which styles to use and mix between them using the
-  palette at the bottom of the screen, and also switch out the resolution of the
-  processing to go higher or lower rez.
-
-When you build and install the demo, you'll see three app icons on your phone,
-one for each of the demos. Tapping on them should open up the app and let you
-explore what they do. You can enable profiling statistics on-screen by tapping
-the volume up button while they’re running.
-
-### Android Inference Library
-
-Because Android apps need to be written in Java, and core TensorFlow is in C++,
-TensorFlow has a JNI library to interface between the two. Its interface is aimed
-only at inference, so it provides the ability to load a graph, set up inputs,
-and run the model to calculate particular outputs. You can see the full
-documentation for the minimal set of methods in
-[TensorFlowInferenceInterface.java](https://www.tensorflow.org/code/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java)
-
-The demos applications use this interface, so they’re a good place to look for
-example usage. You can download prebuilt binary jars
-at
-[ci.tensorflow.org](https://ci.tensorflow.org/view/Nightly/job/nightly-android/).
diff --git a/tensorflow/lite/g3doc/tfmobile/index.md b/tensorflow/lite/g3doc/tfmobile/index.md
deleted file mode 100644
index 15f0fd396134e40e89266182cb308080d9d250cb..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/index.md
+++ /dev/null
@@ -1,298 +0,0 @@
-# Overview
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-TensorFlow was designed to be a good deep learning solution for mobile
-platforms. Currently we have two solutions for deploying machine learning
-applications on mobile and embedded devices: TensorFlow for Mobile and
-<a href="../../lite">TensorFlow Lite</a>.
-
-## TensorFlow Lite versus TensorFlow Mobile
-
-Here are a few of the differences between the two:
-
-- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
-  developed with TensorFlow Lite will have a smaller binary size, fewer
-  dependencies, and better performance.
-
-- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
-  We expect you to use TensorFlow Mobile to cover production cases.
-
-- TensorFlow Lite supports only a limited set of operators, so not all models
-  will work on it by default. TensorFlow for Mobile has a fuller set of
-  supported functionality.
-
-TensorFlow Lite provides better performance and a small binary size on mobile
-platforms as well as the ability to leverage hardware acceleration if available
-on their platforms. In addition, it has many fewer dependencies so it can be
-built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
-also allows targeting accelerators through the [Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
-
-TensorFlow Lite currently has coverage for a limited set of operators. While
-TensorFlow for Mobile supports only a constrained set of ops by default, in
-principle if you use an arbitrary operator in TensorFlow, it can be customized
-to build that kernel. Thus use cases which are not currently supported by
-TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
-evolves, it will gain additional operators, and the decision will be easier to
-make.
-
-
-## Introduction to TensorFlow Mobile
-
-TensorFlow was designed from the ground up to be a good deep learning solution
-for mobile platforms like Android and iOS. This mobile guide should help you
-understand how machine learning can work on mobile platforms and how to
-integrate TensorFlow into your mobile apps effectively and efficiently.
-
-## About this Guide
-
-This guide is aimed at developers who have a TensorFlow model that’s
-successfully working in a desktop environment, who want to integrate it into
-a mobile application, and cannot use TensorFlow Lite. Here are the
-main challenges you’ll face during that process:
-
-- Understanding how to use Tensorflow for mobile.
-- Building TensorFlow for your platform.
-- Integrating the TensorFlow library into your application.
-- Preparing your model file for mobile deployment.
-- Optimizing for latency, RAM usage, model file size, and binary size.
-
-## Common use cases for mobile machine learning
-
-**Why run TensorFlow on mobile?**
-
-Traditionally, deep learning has been associated with data centers and giant
-clusters of high-powered GPU machines. However, it can be very expensive and
-time-consuming to send all of the data a device has access to across a network
-connection. Running on mobile makes it possible to deliver very interactive
-applications in a way that’s not possible when you have to wait for a network
-round trip.
-
-Here are some common use cases for on-device deep learning:
-
-### Speech Recognition
-
-There are a lot of interesting applications that can be built with a
-speech-driven interface, and many of these require on-device processing. Most of
-the time a user isn’t giving commands, and so streaming audio continuously to a
-remote server would be a waste of bandwidth, since it would mostly be silence or
-background noises. To solve this problem it’s common to have a small neural
-network running on-device
-[listening out for a particular keyword](../tutorials/sequences/audio_recognition).
-Once that keyword has been spotted, the rest of the
-conversation can be transmitted over to the server for further processing if
-more computing power is needed.
-
-### Image Recognition
-
-It can be very useful for a mobile app to be able to make sense of a camera
-image. If your users are taking photos, recognizing what’s in them can help your
-camera apps apply appropriate filters, or label the photos so they’re easily
-findable. It’s important for embedded applications too, since you can use image
-sensors to detect all sorts of interesting conditions, whether it’s spotting
-endangered animals in the wild
-or
-[reporting how late your train is running](https://svds.com/tensorflow-image-recognition-raspberry-pi/).
-
-TensorFlow comes with several examples of recognizing the types of objects
-inside images along with a variety of different pre-trained models, and they can
-all be run on mobile devices. You can try out
-our
-[Tensorflow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) and
-[Tensorflow for Poets 2: Optimize for Mobile](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/index.html#0) codelabs to
-see how to take a pretrained model and run some very fast and lightweight
-training to teach it to recognize specific objects, and then optimize it to
-run on mobile.
-
-### Object Localization
-
-Sometimes it’s important to know where objects are in an image as well as what
-they are. There are lots of augmented reality use cases that could benefit a
-mobile app, such as guiding users to the right component when offering them
-help fixing their wireless network or providing informative overlays on top of
-landscape features. Embedded applications often need to count objects that are
-passing by them, whether it’s pests in a field of crops, or people, cars and
-bikes going past a street lamp.
-
-TensorFlow offers a pretrained model for drawing bounding boxes around people
-detected in images, together with tracking code to follow them over time. The
-tracking is especially important for applications where you’re trying to count
-how many objects are present over time, since it gives you a good idea when a
-new object enters or leaves the scene. We have some sample code for this
-available for Android [on
-GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
-and also a [more general object detection
-model](https://github.com/tensorflow/models/tree/master/research/object_detection/README.md)
-available as well.
-
-### Gesture Recognition
-
-It can be useful to be able to control applications with hand or other
-gestures, either recognized from images or through analyzing accelerometer
-sensor data. Creating those models is beyond the scope of this guide, but
-TensorFlow is an effective way of deploying them.
-
-### Optical Character Recognition
-
-Google Translate’s live camera view is a great example of how effective
-interactive on-device detection of text can be.
-
-<div class="video-wrapper">
-  <iframe class="devsite-embedded-youtube-video" data-video-id="06olHmcJjS0"
-            data-autohide="1" data-showinfo="0" frameborder="0" allowfullscreen>
-  </iframe>
-</div>
-
-There are multiple steps involved in recognizing text in images. You first have
-to identify the areas where the text is present, which is a variation on the
-object localization problem, and can be solved with similar techniques. Once you
-have an area of text, you then need to interpret it as letters, and then use a
-language model to help guess what words they represent. The simplest way to
-estimate what letters are present is to segment the line of text into individual
-letters, and then apply a simple neural network to the bounding box of each. You
-can get good results with the kind of models used for MNIST, which you can find
-in TensorFlow’s tutorials, though you may want a higher-resolution input.  A
-more advanced alternative is to use an LSTM model to process a whole line of
-text at once, with the model itself handling the segmentation into different
-characters.
-
-### Translation
-
-Translating from one language to another quickly and accurately, even if you
-don’t have a network connection, is an important use case. Deep networks are
-very effective at this sort of task, and you can find descriptions of a lot of
-different models in the literature. Often these are sequence-to-sequence
-recurrent models where you’re able to run a single graph to do the whole
-translation, without needing to run separate parsing stages.
-
-### Text Classification
-
-If you want to suggest relevant prompts to users based on what they’re typing or
-reading, it can be very useful to understand the meaning of the text. This is
-where text classification comes in. Text classification is an umbrella term
-that covers everything from sentiment analysis to topic discovery. You’re likely
-to have your own categories or labels that you want to apply, so the best place
-to start is with an example
-like
-[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts/),
-and then train on your own examples.
-
-### Voice Synthesis
-
-A synthesized voice can be a great way of giving users feedback or aiding
-accessibility, and recent advances such as
-[WaveNet](https://deepmind.com/blog/wavenet-generative-model-raw-audio/) show
-that deep learning can offer very natural-sounding speech.
-
-## Mobile machine learning and the cloud
-
-These examples of use cases give an idea of how on-device networks can
-complement cloud services. Cloud has a great deal of computing power in a
-controlled environment, but running on devices can offer higher interactivity.
-In situations where the cloud is unavailable, or your cloud capacity is limited,
-you can provide an offline experience, or reduce cloud workload by processing
-easy cases on device.
-
-Doing on-device computation can also signal when it's time to switch to working
-on the cloud. A good example of this is hotword detection in speech. Since
-devices are able to constantly listen out for the keywords, this then triggers a
-lot of traffic to cloud-based speech recognition once one is recognized. Without
-the on-device component, the whole application wouldn’t be feasible, and this
-pattern exists across several other applications as well. Recognizing that some
-sensor input is interesting enough for further processing makes a lot of
-interesting products possible.
-
-## What hardware and software should you have?
-
-TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
-supported operating systems and instructions to install TensorFlow, see
-<a href="https://www.tensorflow.org/install">Installing Tensorflow</a>.
-
-Note that some of the sample code we provide for mobile TensorFlow requires you
-to compile TensorFlow from source, so you’ll need more than just `pip install`
-to work through all the sample code.
-
-To try out the mobile examples, you’ll need a device set up for development,
-using
-either [Android Studio](https://developer.android.com/studio/install.html),
-or [XCode](https://developer.apple.com/xcode/) if you're developing for iOS.
-
-## What should you do before you get started?
-
-Before thinking about how to get your solution on mobile:
-
-1. Determine whether your problem is solvable by mobile machine learning
-2. Create a labelled dataset to define your problem
-3. Pick an effective model for the problem
-
-We'll discuss these in more detail below.
-
-### Is your problem solvable by mobile machine learning?
-
-Once you have an idea of the problem you want to solve, you need to make a plan
-of how to build your solution. The most important first step is making sure that
-your problem is actually solvable, and the best way to do that is to mock it up
-using humans in the loop.
-
-For example, if you want to drive a robot toy car using voice commands, try
-recording some audio from the device and listen back to it to see if you can
-make sense of what’s being said. Often you’ll find there are problems in the
-capture process, such as the motor drowning out speech or not being able to hear
-at a distance, and you should tackle these problems before investing in the
-modeling process.
-
-Another example would be giving photos taken from your app to people see if they
-can classify what’s in them, in the way you’re looking for. If they can’t do
-that (for example, trying to estimate calories in food from photos may be
-impossible because all white soups look the same), then you’ll need to redesign
-your experience to cope with that. A good rule of thumb is that if a human can’t
-handle the task then it will be difficult to train a computer to do better.
-
-### Create a labelled dataset
-
-After you’ve solved any fundamental issues with your use case, you need to
-create a labeled dataset to define what problem you’re trying to solve. This
-step is extremely important, more than picking which model to use. You want it
-to be as representative as possible of your actual use case, since the model
-will only be effective at the task you teach it. It’s also worth investing in
-tools to make labeling the data as efficient and accurate as possible. For
-example, if you’re able to switch from having to click a button on a web
-interface to simple keyboard shortcuts, you may be able to speed up the
-generation process a lot. You should also start by doing the initial labeling
-yourself, so you can learn about the difficulties and likely errors, and
-possibly change your labeling or data capture process to avoid them. Once you
-and your team are able to consistently label examples (that is once you
-generally agree on the same labels for most examples), you can then try and
-capture your knowledge in a manual and teach external raters how to run the same
-process.
-
-### Pick an effective model
-
-The next step is to pick an effective model to use. You might be able to avoid
-training a model from scratch if someone else has already implemented a model
-similar to what you need; we have a repository of models implemented in
-TensorFlow [on GitHub](https://github.com/tensorflow/models) that you can look
-through. Lean towards the simplest model you can find, and try to get started as
-soon as you have even a small amount of labelled data, since you’ll get the best
-results when you’re able to iterate quickly. The shorter the time it takes to
-try training a model and running it in its real application, the better overall
-results you’ll see. It’s common for an algorithm to get great training accuracy
-numbers but then fail to be useful within a real application because there’s a
-mismatch between the dataset and real usage. Prototype end-to-end usage as soon
-as possible to create a consistent user experience.
diff --git a/tensorflow/lite/g3doc/tfmobile/ios_build.md b/tensorflow/lite/g3doc/tfmobile/ios_build.md
deleted file mode 100644
index d922907cdc5fe5ccec8864b456586fce0293a0af..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/ios_build.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Building TensorFlow on iOS
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-## Using CocoaPods
-
-The simplest way to get started with TensorFlow on iOS is using the CocoaPods
-package management system. You can add the `TensorFlow-experimental` pod to your
-Podfile, which installs a universal binary framework. This makes it easy to get
-started but has the disadvantage of being hard to customize, which is important
-in case you want to shrink your binary size. If you do need the ability to
-customize your libraries, see later sections on how to do that.
-
-## Creating your own app
-
-If you'd like to add TensorFlow capabilities to your own app, do the following:
-
-- Create your own app or load your already-created app in XCode.
-
-- Add a file named Podfile at the project root directory with the following content:
-
-        target 'YourProjectName'
-        pod 'TensorFlow-experimental'
-
-- Run `pod install` to download and install the `TensorFlow-experimental` pod.
-
-- Open `YourProjectName.xcworkspace` and add your code.
-
-- In your app's **Build Settings**, make sure to add `$(inherited)` to the
-  **Other Linker Flags**, and **Header Search Paths** sections.
-
-## Running the Samples
-
-You'll need Xcode 7.3 or later to run our iOS samples.
-
-There are currently three examples: simple, benchmark, and camera. For now, you
-can download the sample code by cloning the main tensorflow repository (we are
-planning to make the samples available as a separate repository later).
-
-From the root of the tensorflow folder, download [Inception
-v1](https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip),
-and extract the label and graph files into the data folders inside both the
-simple and camera examples using these steps:
-
-    mkdir -p ~/graphs
-    curl -o ~/graphs/inception5h.zip \
-     https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip \
-     && unzip ~/graphs/inception5h.zip -d ~/graphs/inception5h
-    cp ~/graphs/inception5h/* tensorflow/examples/ios/benchmark/data/
-    cp ~/graphs/inception5h/* tensorflow/examples/ios/camera/data/
-    cp ~/graphs/inception5h/* tensorflow/examples/ios/simple/data/
-
-Change into one of the sample directories, download the
-[Tensorflow-experimental](https://cocoapods.org/pods/TensorFlow-experimental)
-pod, and open the Xcode workspace. Note that installing the pod can take a long
-time since it is big (~450MB). If you want to run the simple example, then:
-
-    cd tensorflow/examples/ios/simple
-    pod install
-    open tf_simple_example.xcworkspace   # note .xcworkspace, not .xcodeproj
-                                         # this is created by pod install
-
-Run the simple app in the XCode simulator. You should see a single-screen app
-with a **Run Model** button. Tap that, and you should see some debug output
-appear below indicating that the example Grace Hopper image in directory data
-has been analyzed, with a military uniform recognized.
-
-Run the other samples using the same process. The camera example requires a real
-device connected. Once you build and run that, you should get a live camera view
-that you can point at objects to get real-time recognition results.
-
-### iOS Example details
-
-There are three demo applications for iOS, all defined in Xcode projects inside
-[tensorflow/examples/ios](https://www.tensorflow.org/code/tensorflow/examples/ios/).
-
-- **Simple**: This is a minimal example showing how to load and run a TensorFlow
-  model in as few lines as possible. It just consists of a single view with a
-  button that executes the model loading and inference when its pressed.
-
-- **Camera**: This is very similar to the Android TF Classify demo. It loads
-  Inception v3 and outputs its best label estimate for what’s in the live camera
-  view. As with the Android version, you can train your own custom model using
-  TensorFlow for Poets and drop it into this example with minimal code changes.
-
-- **Benchmark**: is quite close to Simple, but it runs the graph repeatedly and
-  outputs similar statistics to the benchmark tool on Android.
-
-
-### Troubleshooting
-
-- Make sure you use the TensorFlow-experimental pod (and not TensorFlow).
-
-- The TensorFlow-experimental pod is current about ~450MB. The reason it is so
-  big is because we are bundling multiple platforms, and the pod includes all
-  TensorFlow functionality (e.g. operations). The final app size after build is
-  substantially smaller though (~25MB). Working with the complete pod is
-  convenient during development, but see below section on how you can build your
-  own custom TensorFlow library to reduce the size.
-
-## Building the TensorFlow iOS libraries from source
-
-While Cocoapods is the quickest and easiest way of getting started, you sometimes
-need more flexibility to determine which parts of TensorFlow your app should be
-shipped with. For such cases, you can build the iOS libraries from the
-sources. [This
-guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/ios#building-the-tensorflow-ios-libraries-from-source)
-contains detailed instructions on how to do that.
-
diff --git a/tensorflow/lite/g3doc/tfmobile/linking_libs.md b/tensorflow/lite/g3doc/tfmobile/linking_libs.md
deleted file mode 100644
index fd0e322c93493ed835ae7ec9766a708885c6ac88..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/linking_libs.md
+++ /dev/null
@@ -1,270 +0,0 @@
-# Integrating TensorFlow libraries
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-Once you have made some progress on a model that addresses the problem you’re
-trying to solve, it’s important to test it out inside your application
-immediately. There are often unexpected differences between your training data
-and what users actually encounter in the real world, and getting a clear picture
-of the gap as soon as possible improves the product experience.
-
-This page talks about how to integrate the TensorFlow libraries into your own
-mobile applications, once you have already successfully built and deployed the
-TensorFlow mobile demo apps.
-
-## Linking the library
-
-After you've managed to build the examples, you'll probably want to call
-TensorFlow from one of your existing applications. The very easiest way to do
-this is to use the Pod installation steps described in
-<a href="./ios_build.md">Building TensorFlow on iOS</a>, but if you want to build
-TensorFlow from source (for example to customize which operators are included)
-you'll need to break out TensorFlow as a framework, include the right header
-files, and link against the built libraries and dependencies.
-
-### Android
-
-For Android, you just need to link in a Java library contained in a JAR file
-called `libandroid_tensorflow_inference_java.jar`. There are three ways to
-include this functionality in your program:
-
-1. Include the jcenter AAR which contains it, as in this
- [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65)
-
-2. Download the nightly precompiled version from
-[ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/).
-
-3. Build the JAR file yourself using the instructions [in our Android GitHub repo](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/android)
-
-### iOS
-
-Pulling in the TensorFlow libraries on iOS is a little more complicated. Here is
-a checklist of what you’ll need to do to your iOS app:
-
-- Link against tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a, usually
-  by adding `-L/your/path/tensorflow/contrib/makefile/gen/lib/` and
-  `-ltensorflow-core` to your linker flags.
-
-- Link against the generated protobuf libraries by adding
-  `-L/your/path/tensorflow/contrib/makefile/gen/protobuf_ios/lib` and
-  `-lprotobuf` and `-lprotobuf-lite` to your command line.
-
-- For the include paths, you need the root of your TensorFlow source folder as
-  the first entry, followed by
-  `tensorflow/contrib/makefile/downloads/protobuf/src`,
-  `tensorflow/contrib/makefile/downloads`,
-  `tensorflow/contrib/makefile/downloads/eigen`, and
-  `tensorflow/contrib/makefile/gen/proto`.
-
-- Make sure your binary is built with `-force_load` (or the equivalent on your
-  platform), aimed at the TensorFlow library to ensure that it’s linked
-  correctly. More detail on why this is necessary can be found in the next
-  section, [Global constructor magic](#global_constructor_magic). On Linux-like
-  platforms, you’ll need different flags, more like
-  `-Wl,--allow-multiple-definition -Wl,--whole-archive`.
-
-You’ll also need to link in the Accelerator framework, since this is used to
-speed up some of the operations.
-
-## Global constructor magic
-
-One of the subtlest problems you may run up against is the “No session factory
-registered for the given session options” error when trying to call TensorFlow
-from your own application. To understand why this is happening and how to fix
-it, you need to know a bit about the architecture of TensorFlow.
-
-The framework is designed to be very modular, with a thin core and a large
-number of specific objects that are independent and can be mixed and matched as
-needed. To enable this, the coding pattern in C++ had to let modules easily
-notify the framework about the services they offer, without requiring a central
-list that has to be updated separately from each implementation. It also had to
-allow separate libraries to add their own implementations without needing a
-recompile of the core.
-
-To achieve this capability, TensorFlow uses a registration pattern in a lot of
-places. In the code, it looks like this:
-
-```
-class MulKernel : OpKernel {
-	Status Compute(OpKernelContext* context) { … }
-};
-REGISTER_KERNEL(MulKernel, “Mul”);
-```
-
-This would be in a standalone `.cc` file linked into your application, either
-as part of the main set of kernels or as a separate custom library. The magic
-part is that the `REGISTER_KERNEL()` macro is able to inform the core of
-TensorFlow that it has an implementation of the Mul operation, so that it can be
-called in any graphs that require it.
-
-From a programming point of view, this setup is very convenient. The
-implementation and registration code live in the same file, and adding new
-implementations is as simple as compiling and linking it in. The difficult part
-comes from the way that the `REGISTER_KERNEL()` macro is implemented. C++
-doesn’t offer a good mechanism for doing this sort of registration, so we have
-to resort to some tricky code. Under the hood, the macro is implemented so that
-it produces something like this:
-
-```
-class RegisterMul {
-	public:
-		RegisterMul() {
-			global_kernel_registry()->Register(“Mul”, [](){
-				return new MulKernel()
-			});
-	}
-};
-RegisterMul g_register_mul;
-```
-
-This sets up a class `RegisterMul` with a constructor that tells the global
-kernel registry what function to call when somebody asks it how to create a
-“Mul” kernel. Then there’s a global object of that class, and so the constructor
-should be called at the start of any program.
-
-While this may sound sensible, the unfortunate part is that the global object
-that’s defined is not used by any other code, so linkers not designed with this
-in mind will decide that it can be deleted. As a result, the constructor is
-never called, and the class is never registered. All sorts of modules use this
-pattern in TensorFlow, and it happens that `Session` implementations are the
-first to be looked for when the code is run, which is why it shows up as the
-characteristic error when this problem occurs.
-
-The solution is to force the linker to not strip any code from the library, even
-if it believes it’s unused. On iOS, this step can be accomplished with the
-`-force_load` flag, specifying a library path, and on Linux you need
-`--whole-archive`. These persuade the linker to not be as aggressive about
-stripping, and should retain the globals.
-
-The actual implementation of the various `REGISTER_*` macros is a bit more
-complicated in practice, but they all suffer the same underlying problem. If
-you’re interested in how they work, [op_kernel.h](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_kernel.h#L1091)
-is a good place to start investigating.
-
-## Protobuf problems
-
-TensorFlow relies on
-the [Protocol Buffer](https://developers.google.com/protocol-buffers/) library,
-commonly known as protobuf. This library takes definitions of data structures
-and produces serialization and access code for them in a variety of
-languages. The tricky part is that this generated code needs to be linked
-against shared libraries for the exact same version of the framework that was
-used for the generator. This can be an issue when `protoc`, the tool used to
-generate the code, is from a different version of protobuf than the libraries in
-the standard linking and include paths. For example, you might be using a copy
-of `protoc` that was built locally in `~/projects/protobuf-3.0.1.a`, but you have
-libraries installed at `/usr/local/lib` and `/usr/local/include` that are from
-3.0.0.
-
-The symptoms of this issue are errors during the compilation or linking phases
-with protobufs. Usually, the build tools take care of this, but if you’re using
-the makefile, make sure you’re building the protobuf library locally and using
-it, as shown in [this Makefile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/makefile/Makefile#L18).
-
-Another situation that can cause problems is when protobuf headers and source
-files need to be generated as part of the build process. This process makes
-building more complex, since the first phase has to be a pass over the protobuf
-definitions to create all the needed code files, and only after that can you go
-ahead and do a build of the library code.
-
-### Multiple versions of protobufs in the same app
-
-Protobufs generate headers that are needed as part of the C++ interface to the
-overall TensorFlow library. This complicates using the library as a standalone
-framework.
-
-If your application is already using version 1 of the protocol buffers library,
-you may have trouble integrating TensorFlow because it requires version 2. If
-you just try to link both versions into the same binary, you’ll see linking
-errors because some of the symbols clash. To solve this particular problem, we
-have an experimental script at [rename_protobuf.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/makefile/rename_protobuf.sh).
-
-You need to run this as part of the makefile build, after you’ve downloaded all
-the dependencies:
-
-```
-tensorflow/contrib/makefile/download_dependencies.sh
-tensorflow/contrib/makefile/rename_protobuf.sh
-```
-
-## Calling the TensorFlow API
-
-Once you have the framework available, you then need to call into it. The usual
-pattern is that you first load your model, which represents a preset set of
-numeric computations, and then you run inputs through that model (for example,
-images from a camera) and receive outputs (for example, predicted labels).
-
-On Android, we provide the Java Inference Library that is focused on just this
-use case, while on iOS and Raspberry Pi you call directly into the C++ API.
-
-### Android
-
-Here’s what a typical Inference Library sequence looks like on Android:
-
-```
-// Load the model from disk.
-TensorFlowInferenceInterface inferenceInterface =
-new TensorFlowInferenceInterface(assetManager, modelFilename);
-
-// Copy the input data into TensorFlow.
-inferenceInterface.feed(inputName, floatValues, 1, inputSize, inputSize, 3);
-
-// Run the inference call.
-inferenceInterface.run(outputNames, logStats);
-
-// Copy the output Tensor back into the output array.
-inferenceInterface.fetch(outputName, outputs);
-```
-
-You can find the source of this code in the [Android examples](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java#L107).
-
-### iOS and Raspberry Pi
-
-Here’s the equivalent code for iOS and Raspberry Pi:
-
-```
-// Load the model.
-PortableReadFileToProto(file_path, &tensorflow_graph);
-
-// Create a session from the model.
-tensorflow::Status s = session->Create(tensorflow_graph);
-if (!s.ok()) {
-    LOG(FATAL) << "Could not create TensorFlow Graph: " << s;
-}
-
-// Run the model.
-std::string input_layer = "input";
-std::string output_layer = "output";
-std::vector<tensorflow::Tensor> outputs;
-tensorflow::Status run_status = session->Run({\{input_layer, image_tensor}},
-                               {output_layer}, {}, &outputs);
-if (!run_status.ok()) {
-    LOG(FATAL) << "Running model failed: " << run_status;
-}
-
-// Access the output data.
-tensorflow::Tensor* output = &outputs[0];
-```
-
-This is all based on the
-[iOS sample code](https://www.tensorflow.org/code/tensorflow/examples/ios/simple/RunModelViewController.mm),
-but there’s nothing iOS-specific; the same code should be usable on any platform
-that supports C++.
-
-You can also find specific examples for Raspberry Pi
-[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/pi_examples/label_image/label_image.cc).
diff --git a/tensorflow/lite/g3doc/tfmobile/optimizing.md b/tensorflow/lite/g3doc/tfmobile/optimizing.md
deleted file mode 100644
index 59ff8e774c6c63a01668aee7d6caeea01171468d..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/optimizing.md
+++ /dev/null
@@ -1,518 +0,0 @@
-# Optimizing for mobile
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-There are some special issues that you have to deal with when you’re trying to
-ship on mobile or embedded devices, and you’ll need to think about these as
-you’re developing your model.
-
-These issues are:
-
-- Model and Binary Size
-- App speed and model loading speed
-- Performance and threading
-
-We'll discuss a few of these below.
-
-## What are the minimum device requirements for TensorFlow?
-
-You need at least one megabyte of program memory and several megabytes of RAM to
-run the base TensorFlow runtime, so it’s not suitable for DSPs or
-microcontrollers. Other than those, the biggest constraint is usually the
-calculation speed of the device, and whether you can run the model you need for
-your application with a low enough latency. You can use the benchmarking tools
-in [How to Profile your Model](#how_to_profile_your_model) to get an idea of how
-many FLOPs are required for a model, and then use that to make rule-of-thumb
-estimates of how fast they will run on different devices. For example, a modern
-smartphone might be able to run 10 GFLOPs per second, so the best you could hope
-for from a 5 GFLOP model is two frames per second, though you may do worse
-depending on what the exact computation patterns are.
-
-This model dependence means that it’s possible to run TensorFlow even on very
-old or constrained phones, as long as you optimize your network to fit within
-the latency budget and possibly within limited RAM too. For memory usage, you
-mostly need to make sure that the intermediate buffers that TensorFlow creates
-aren’t too large, which you can examine in the benchmark output too.
-
-## Speed
-
-One of the highest priorities of most model deployments is figuring out how to
-run the inference fast enough to give a good user experience. The first place to
-start is by looking at the total number of floating point operations that are
-required to execute the graph. You can get a very rough estimate of this by
-using the `benchmark_model` tool:
-
-    bazel build -c opt tensorflow/tools/benchmark:benchmark_model && \
-    bazel-bin/tensorflow/tools/benchmark/benchmark_model \
-    --graph=/tmp/inception_graph.pb --input_layer="Mul:0" \
-    --input_layer_shape="1,299,299,3" --input_layer_type="float" \
-    --output_layer="softmax:0" --show_run_order=false --show_time=false \
-    --show_memory=false --show_summary=true --show_flops=true --logtostderr
-
-This should show you an estimate of how many operations are needed to run the
-graph. You can then use that information to figure out how feasible your model
-is to run on the devices you’re targeting. For an example, a high-end phone from
-2016 might be able to do 20 billion FLOPs per second, so the best speed you
-could hope for from a model that requires 10 billion FLOPs is around 500ms. On a
-device like the Raspberry Pi 3 that can do about 5 billion FLOPs, you may only
-get one inference every two seconds.
-
-Having this estimate helps you plan for what you’ll be able to realistically
-achieve on a device. If the model is using too many ops, then there are a lot of
-opportunities to optimize the architecture to reduce that number.
-
-Advanced techniques include [SqueezeNet](https://arxiv.org/abs/1602.07360)
-and [MobileNet](https://arxiv.org/abs/1704.04861), which are architectures
-designed to produce models for mobile -- lean and fast but with a small accuracy
-cost.  You can also just look at alternative models, even older ones, which may
-be smaller. For example, Inception v1 only has around 7 million parameters,
-compared to Inception v3’s 24 million, and requires only 3 billion FLOPs rather
-than 9 billion for v3.
-
-## Model Size
-
-Models that run on a device need to be stored somewhere on the device, and very
-large neural networks can be hundreds of megabytes. Most users are reluctant to
-download very large app bundles from app stores, so you want to make your model
-as small as possible. Furthermore, smaller neural networks can persist in and
-out of a mobile device's memory faster.
-
-To understand how large your network will be on disk, start by looking at the
-size on disk of your `GraphDef` file after you’ve run `freeze_graph` and
-`strip_unused_nodes` on it (see <a href="./prepare_models.md">Preparing models</a> for
-more details on these tools), since then it should only contain
-inference-related nodes. To double-check that your results are as expected, run
-the `summarize_graph` tool to see how many parameters are in constants:
-
-    bazel build tensorflow/tools/graph_transforms:summarize_graph && \
-    bazel-bin/tensorflow/tools/graph_transforms/summarize_graph \
-    --in_graph=/tmp/tensorflow_inception_graph.pb
-
-That command should give you output that looks something like this:
-
-    No inputs spotted.
-    Found 1 possible outputs: (name=softmax, op=Softmax)
-    Found 23885411 (23.89M) const parameters, 0 (0) variable parameters,
-    and 99 control_edges
-    Op types used: 489 Const, 99 CheckNumerics, 99 Identity, 94
-    BatchNormWithGlobalNormalization, 94 Conv2D, 94 Relu, 11 Concat, 9 AvgPool,
-    5 MaxPool, 1 Sub, 1 Softmax, 1 ResizeBilinear, 1 Reshape, 1 Mul, 1 MatMul,
-    1 ExpandDims, 1 DecodeJpeg, 1 Cast, 1 BiasAdd
-
-The important part for our current purposes is the number of const
-parameters. In most models these will be stored as 32-bit floats to start, so if
-you multiply the number of const parameters by four, you should get something
-that’s close to the size of the file on disk. You can often get away with only
-eight-bits per parameter with very little loss of accuracy in the final result,
-so if your file size is too large you can try using
-<a href="https://www.tensorflow.org/performance/quantization">quantize_weights</a>
-to transform the parameters down.
-
-    bazel build tensorflow/tools/graph_transforms:transform_graph && \
-    bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
-    --in_graph=/tmp/tensorflow_inception_optimized.pb \
-    --out_graph=/tmp/tensorflow_inception_quantized.pb \
-    --inputs='Mul:0' --outputs='softmax:0' --transforms='quantize_weights'
-
-If you look at the resulting file size, you should see that it’s about a quarter
-of the original at 23MB.
-
-Another transform is `round_weights`, which doesn't make the file smaller, but it
-makes the file compressible to about the same size as when `quantize_weights` is
-used. This is particularly useful for mobile development, taking advantage of
-the fact that app bundles are compressed before they’re downloaded by consumers.
-
-The original file does not compress well with standard algorithms, because the
-bit patterns of even very similar numbers can be very different. The
-`round_weights` transform keeps the weight parameters stored as floats, but
-rounds them to a set number of step values. This means there are a lot more
-repeated byte patterns in the stored model, and so compression can often bring
-the size down dramatically, in many cases to near the size it would be if they
-were stored as eight bit.
-
-Another advantage of `round_weights` is that the framework doesn’t have to
-allocate a temporary buffer to unpack the parameters into, as we have to when
-we just use `quantize_weights`. This saves a little bit of latency (though the
-results should be cached so it’s only costly on the first run) and makes it
-possible to use memory mapping, as described later.
-
-## Binary Size
-
-One of the biggest differences between mobile and server development is the
-importance of binary size. On desktop machines it’s not unusual to have
-executables that are hundreds of megabytes on disk, but for mobile and embedded
-apps it’s vital to keep the binary as small as possible so that user downloads
-are easy. As mentioned above, TensorFlow only includes a subset of op
-implementations by default, but this still results in a 12 MB final
-executable. To reduce this, you can set up the library to only include the
-implementations of the ops that you actually need, based on automatically
-analyzing your model. To use it:
-
-- Run `tools/print_required_ops/print_selective_registration_header.py` on your
-  model to produce a header file that only enables the ops it uses.
-
-- Place the `ops_to_register.h` file somewhere that the compiler can find
-  it. This can be in the root of your TensorFlow source folder.
-
-- Build TensorFlow with `SELECTIVE_REGISTRATION` defined, for example by passing
-  in `--copts=”-DSELECTIVE_REGISTRATION”` to your Bazel build command.
-
-This process recompiles the library so that only the needed ops and types are
-included, which can dramatically reduce the executable size. For example, with
-Inception v3, the new size is only 1.5MB.
-
-## How to Profile your Model
-
-Once you have an idea of what your device's peak performance range is, it’s
-worth looking at its actual current performance. Using a standalone TensorFlow
-benchmark, rather than running it inside a larger app, helps isolate just the
-Tensorflow contribution to the
-latency. The
-[tensorflow/tools/benchmark](https://www.tensorflow.org/code/tensorflow/tools/benchmark/) tool
-is designed to help you do this. To run it on Inception v3 on your desktop
-machine, build this benchmark model:
-
-    bazel build -c opt tensorflow/tools/benchmark:benchmark_model && \
-    bazel-bin/tensorflow/tools/benchmark/benchmark_model \
-    --graph=/tmp/tensorflow_inception_graph.pb --input_layer="Mul" \
-    --input_layer_shape="1,299,299,3" --input_layer_type="float" \
-    --output_layer="softmax:0" --show_run_order=false --show_time=false \
-    --show_memory=false --show_summary=true --show_flops=true --logtostderr
-
-You should see output that looks something like this:
-
-<pre>
-============================== Top by Computation Time ==============================
-[node
- type]  [start]  [first] [avg ms]     [%]  [cdf%]  [mem KB]  [Name]
-Conv2D   22.859   14.212   13.700  4.972%  4.972%  3871.488  conv_4/Conv2D
-Conv2D    8.116    8.964   11.315  4.106%  9.078%  5531.904  conv_2/Conv2D
-Conv2D   62.066   16.504    7.274  2.640% 11.717%   443.904  mixed_3/conv/Conv2D
-Conv2D    2.530    6.226    4.939  1.792% 13.510%  2765.952  conv_1/Conv2D
-Conv2D   55.585    4.605    4.665  1.693% 15.203%   313.600  mixed_2/tower/conv_1/Conv2D
-Conv2D  127.114    5.469    4.630  1.680% 16.883%    81.920  mixed_10/conv/Conv2D
-Conv2D   47.391    6.994    4.588  1.665% 18.548%   313.600  mixed_1/tower/conv_1/Conv2D
-Conv2D   39.463    7.878    4.336  1.574% 20.122%   313.600  mixed/tower/conv_1/Conv2D
-Conv2D  127.113    4.192    3.894  1.413% 21.535%   114.688  mixed_10/tower_1/conv/Conv2D
-Conv2D   70.188    5.205    3.626  1.316% 22.850%   221.952  mixed_4/conv/Conv2D
-
-============================== Summary by node type ==============================
-[Node type]  [count]  [avg ms]    [avg %]    [cdf %]  [mem KB]
-Conv2D            94   244.899    88.952%    88.952% 35869.953
-BiasAdd           95     9.664     3.510%    92.462% 35873.984
-AvgPool            9     7.990     2.902%    95.364%  7493.504
-Relu              94     5.727     2.080%    97.444% 35869.953
-MaxPool            5     3.485     1.266%    98.710%  3358.848
-Const            192     1.727     0.627%    99.337%     0.000
-Concat            11     1.081     0.393%    99.730%  9892.096
-MatMul             1     0.665     0.242%    99.971%     4.032
-Softmax            1     0.040     0.015%    99.986%     4.032
-<>                 1     0.032     0.012%    99.997%     0.000
-Reshape            1     0.007     0.003%   100.000%     0.000
-
-Timings (microseconds): count=50 first=330849 curr=274803 min=232354 max=415352 avg=275563 std=44193
-Memory (bytes): count=50 curr=128366400(all same)
-514 nodes defined 504 nodes observed
-</pre>
-
-This is the summary view, which is enabled by the show_summary flag. To
-interpret it, the first table is a list of the nodes that took the most time, in
-order by how long they took. From left to right, the columns are:
-
-- Node type, what kind of operation this was.
-
-- Start time of the op, showing where it falls in the sequence of operations.
-
-- First time in milliseconds. This is how long the operation took on the first
-  run of the benchmark, since by default 20 runs are executed to get more
-  reliable statistics. The first time is useful to spot which ops are doing
-  expensive calculations on the first run, and then caching the results.
-
-- Average time for the operation across all runs, in milliseconds.
-
-- What percentage of the total time for one run the op took. This is useful to
-  understand where the hotspots are.
-
-- The cumulative total time of this and the previous ops in the table. This is
-  handy for understanding what the distribution of work is across the layers, to
-  see if just a few of the nodes are taking up most of the time.
-  
-- The amount of memory consumed by outputs of this type of op.
-
-- Name of the node.
-
-The second table is similar, but instead of breaking down the timings by
-particular named nodes, it groups them by the kind of op. This is very useful to
-understand which op implementations you might want to optimize or eliminate from
-your graph. The table is arranged with the most costly operations at the start,
-and only shows the top ten entries, with a placeholder for other nodes. The
-columns from left to right are:
-
-- Type of the nodes being analyzed.
-
-- Accumulated average time taken by all nodes of this type, in milliseconds.
-
-- What percentage of the total time was taken by this type of operation.
-
-- Cumulative time taken by this and op types higher in the table, so you can
-  understand the distribution of the workload.
-
--  How much memory the outputs of this op type took up.
-
-Both of these tables are set up so that you can easily copy and paste their
-results into spreadsheet documents, since they are output with tabs as
-separators between the columns. The summary by node type can be the most useful
-when looking for optimization opportunities, since it’s a pointer to the code
-that’s taking the most time. In this case, you can see that the Conv2D ops are
-almost 90% of the execution time. This is a sign that the graph is pretty
-optimal, since convolutions and matrix multiplies are expected to be the bulk of
-a neural network’s computing workload.
-
-As a rule of thumb, it’s more worrying if you see a lot of other operations
-taking up more than a small fraction of the time. For neural networks, the ops
-that don’t involve large matrix multiplications should usually be dwarfed by the
-ones that do, so if you see a lot of time going into those it’s a sign that
-either your network is non-optimally constructed, or the code implementing those
-ops is not as optimized as it could
-be. [Performance bugs](https://github.com/tensorflow/tensorflow/issues) or
-patches are always welcome if you do encounter this situation, especially if
-they include an attached model exhibiting this behavior and the command line
-used to run the benchmark tool on it.
-
-The run above was on your desktop, but the tool also works on Android, which is
-where it’s most useful for mobile development. Here’s an example command line to
-run it on a 64-bit ARM device:
-
-    bazel build -c opt --config=android_arm64 \
-    tensorflow/tools/benchmark:benchmark_model
-    adb push bazel-bin/tensorflow/tools/benchmark/benchmark_model /data/local/tmp
-    adb push /tmp/tensorflow_inception_graph.pb /data/local/tmp/
-    adb shell '/data/local/tmp/benchmark_model \
-    --graph=/data/local/tmp/tensorflow_inception_graph.pb --input_layer="Mul" \
-    --input_layer_shape="1,299,299,3" --input_layer_type="float" \
-    --output_layer="softmax:0" --show_run_order=false --show_time=false \
-    --show_memory=false --show_summary=true'
-
-You can interpret the results in exactly the same way as the desktop version
-above. If you have any trouble figuring out what the right input and output
-names and types are, take a look at the
-<a href="./prepare_models">Preparing models</a>
-page for details about detecting these for your model, and look at the
-`summarize_graph` tool which may give you
-helpful information.
-
-There isn’t good support for command line tools on iOS, so instead there’s a
-separate example
-at
-[tensorflow/examples/ios/benchmark](https://www.tensorflow.org/code/tensorflow/examples/ios/benchmark) that
-packages the same functionality inside a standalone app. This outputs the
-statistics to both the screen of the device and the debug log. If you want
-on-screen statistics for the Android example apps, you can turn them on by
-pressing the volume-up button.
-
-## Profiling within your own app
-
-The output you see from the benchmark tool is generated from modules that are
-included as part of the standard TensorFlow runtime, which means you have access
-to them within your own applications too. You can see an example of how to do
-that [here](https://www.tensorflow.org/code/tensorflow/examples/ios/benchmark/BenchmarkViewController.mm?l=139).
-
-The basic steps are:
-
-1. Create a StatSummarizer object:
-
-        tensorflow::StatSummarizer stat_summarizer(tensorflow_graph);
-
-2. Set up the options:
-
-        tensorflow::RunOptions run_options;
-        run_options.set_trace_level(tensorflow::RunOptions::FULL_TRACE);
-        tensorflow::RunMetadata run_metadata;
-
-3. Run the graph:
-
-        run_status = session->Run(run_options, inputs, output_layer_names, {},
-                                  output_layers, &run_metadata);
-
-4. Calculate the results and print them out:
-
-        assert(run_metadata.has_step_stats());
-        const tensorflow::StepStats& step_stats = run_metadata.step_stats();
-        stat_summarizer->ProcessStepStats(step_stats);
-        stat_summarizer->PrintStepStats();
-
-## Visualizing Models
-
-The most effective way to speed up your code is by altering your model so it
-does less work. To do that, you need to understand what your model is doing, and
-visualizing it is a good first step. To get a high-level overview of your graph,
-use [TensorBoard](https://github.com/tensorflow/tensorboard).
-
-## Threading
-
-The desktop version of TensorFlow has a sophisticated threading model, and will
-try to run multiple operations in parallel if it can. In our terminology this is
-called “inter-op parallelism” (though to avoid confusion with “intra-op”, you
-could think of it as “between-op” instead), and can be set by specifying
-`inter_op_parallelism_threads` in the session options.
-
-By default, mobile devices run operations serially; that is,
-`inter_op_parallelism_threads` is set to 1. Mobile processors usually have few
-cores and a small cache, so running multiple operations accessing disjoint parts
-of memory usually doesn’t help performance. “Intra-op parallelism” (or
-“within-op”) can be very helpful though, especially for computation-bound
-operations like convolutions where different threads can feed off the same small
-set of memory.
-
-On mobile, how many threads an op will use is set to the number of cores by
-default, or 2 when the number of cores can't be determined. You can override the
-default number of threads that ops are using by setting
-`intra_op_parallelism_threads` in the session options.  It’s a good idea to
-reduce the default if your app has its own threads doing heavy processing, so
-that they don’t interfere with each other.
-
-To see more details on session options, look at [ConfigProto](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto).
-
-## Retrain with mobile data
-
-The biggest cause of accuracy problems when running models on mobile apps is
-unrepresentative training data. For example, most of the Imagenet photos are
-well-framed so that the object is in the center of the picture, well-lit, and
-shot with a normal lens. Photos from mobile devices are often poorly framed,
-badly lit, and can have fisheye distortions, especially selfies.
-
-The solution is to expand your training set with data actually captured from
-your application. This step can involve extra work, since you’ll have to label
-the examples yourself, but even if you just use it to expand your original
-training data, it can help the training set dramatically. Improving the training
-set by doing this, and by fixing other quality issues like duplicates or badly
-labeled examples is the single best way to improve accuracy. It’s usually a
-bigger help than altering your model architecture or using different techniques.
-
-## Reducing model loading time and/or memory footprint
-
-Most operating systems allow you to load a file using memory mapping, rather
-than going through the usual I/O APIs. Instead of allocating an area of memory
-on the heap and then copying bytes from disk into it, you simply tell the
-operating system to make the entire contents of a file appear directly in
-memory. This has several advantages:
-
-* Speeds loading
-* Reduces paging (increases performance)
-* Does not count towards RAM budget for your app
-
-TensorFlow has support for memory mapping the weights that form the bulk of most
-model files. Because of limitations in the `ProtoBuf` serialization format, we
-have to make a few changes to our model loading and processing code. The
-way memory mapping works is that we have a single file where the first part is a
-normal `GraphDef` serialized into the protocol buffer wire format, but then the
-weights are appended in a form that can be directly mapped.
-
-To create this file, run the
-`tensorflow/contrib/util:convert_graphdef_memmapped_format` tool. This takes in
-a `GraphDef` file that’s been run through `freeze_graph` and converts it to the
-format that has the weights appended at the end. Since that file’s no longer a
-standard `GraphDef` protobuf, you then need to make some changes to the loading
-code. You can see an example of this in
-the
-[iOS Camera demo app](https://www.tensorflow.org/code/tensorflow/examples/ios/camera/tensorflow_utils.mm?l=147),
-in the `LoadMemoryMappedModel()` function.
-
-The same code (with the Objective C calls for getting the filenames substituted)
-can be used on other platforms too. Because we’re using memory mapping, we need
-to start by creating a special TensorFlow environment object that’s set up with
-the file we’ll be using:
-
-    std::unique_ptr<tensorflow::MemmappedEnv> memmapped_env;
-    memmapped_env->reset(
-          new tensorflow::MemmappedEnv(tensorflow::Env::Default()));
-    tensorflow::Status mmap_status =
-          (memmapped_env->get())->InitializeFromFile(file_path);
-
-You then need to pass in this environment to subsequent calls, like this one for
-loading the graph:
-
-    tensorflow::GraphDef tensorflow_graph;
-    tensorflow::Status load_graph_status = ReadBinaryProto(
-        memmapped_env->get(),
-        tensorflow::MemmappedFileSystem::kMemmappedPackageDefaultGraphDef,
-        &tensorflow_graph);
-
-You also need to create the session with a pointer to the environment you’ve
-created:
-
-    tensorflow::SessionOptions options;
-    options.config.mutable_graph_options()
-        ->mutable_optimizer_options()
-        ->set_opt_level(::tensorflow::OptimizerOptions::L0);
-    options.env = memmapped_env->get();
-
-    tensorflow::Session* session_pointer = nullptr;
-    tensorflow::Status session_status =
-        tensorflow::NewSession(options, &session_pointer);
-
-One thing to notice here is that we’re also disabling automatic optimizations,
-since in some cases these will fold constant sub-trees, and so create copies of
-tensor values that we don’t want and use up more RAM.
-
-Once you’ve gone through these steps, you can use the session and graph as
-normal, and you should see a reduction in loading time and memory usage.
-
-## Protecting model files from easy copying
-
-By default, your models will be stored in the standard serialized protobuf
-format on disk. In theory this means that anybody can copy your model, which you
-may not want. However, in practice, most models are so application-specific and
-obfuscated by optimizations that the risk is similar to that of competitors
-disassembling and reusing your code, but if you do want to make it tougher for
-casual users to access your files it is possible to take some basic steps.
-
-Most of our examples use
-the
-[ReadBinaryProto()](https://www.tensorflow.org/code/tensorflow/core/platform/env.cc?q=core/platform/env.cc&l=409) convenience
-call to load a `GraphDef` from disk. This does require an unencrypted protobuf on
-disk. Luckily though, the implementation of the call is pretty straightforward
-and it should be easy to write an equivalent that can decrypt in memory. Here's
-some code that shows how you can read and decrypt a protobuf using your own
-decryption routine:
-
-    Status ReadEncryptedProto(Env* env, const string& fname,
-                              ::tensorflow::protobuf::MessageLite* proto) {
-      string data;
-      TF_RETURN_IF_ERROR(ReadFileToString(env, fname, &data));
-
-      DecryptData(&data);  // Your own function here.
-
-      if (!proto->ParseFromString(&data)) {
-        TF_RETURN_IF_ERROR(stream->status());
-        return errors::DataLoss("Can't parse ", fname, " as binary proto");
-      }
-      return Status::OK();
-    }
-
-To use this you’d need to define the DecryptData() function yourself. It could
-be as simple as something like:
-
-    void DecryptData(string* data) {
-      for (int i = 0; i < data.size(); ++i) {
-        data[i] = data[i] ^ 0x23;
-      }
-    }
-
-You may want something more complex, but exactly what you’ll need is outside the
-current scope here.
diff --git a/tensorflow/lite/g3doc/tfmobile/prepare_models.md b/tensorflow/lite/g3doc/tfmobile/prepare_models.md
deleted file mode 100644
index 1d373251ddf3ba6a0119bd57bf14caf100ef371a..0000000000000000000000000000000000000000
--- a/tensorflow/lite/g3doc/tfmobile/prepare_models.md
+++ /dev/null
@@ -1,318 +0,0 @@
-# Preparing models for mobile deployment
-
-Warning: We expect to deprecate TensorFlow Mobile in early 2019
-
-<div class="caution">
-  <p>
-    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
-    working hard to close the feature gap between TensorFlow Mobile and
-    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
-    will give ample notice to our users when we get to that point and will
-    provide help and support to ensure easy migrations.
-  </p>
-  <p>
-    In the meantime, please use TensorFlow Lite. If you have a feature request,
-    such as a missing op, please post to our <a
-    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
-  </p>
-</div>
-
-The requirements for storing model information during training are very
-different from when you want to release it as part of a mobile app. This section
-covers the tools involved in converting from a training model to something
-releasable in production.
-
-## What is up with all the different saved file formats?
-
-You may find yourself getting very confused by all the different ways that
-TensorFlow can save out graphs. To help, here’s a rundown of some of the
-different components, and what they are used for. The objects are mostly defined
-and serialized as protocol buffers:
-
-- [NodeDef](https://www.tensorflow.org/code/tensorflow/core/framework/node_def.proto):
-  Defines a single operation in a model. It has a unique name, a list of the
-  names of other nodes it pulls inputs from, the operation type it implements
-  (for example `Add`, or `Mul`), and any attributes that are needed to control
-  that operation. This is the basic unit of computation for TensorFlow, and all
-  work is done by iterating through a network of these nodes, applying each one
-  in turn. One particular operation type that’s worth knowing about is `Const`,
-  since this holds information about a constant. This may be a single, scalar
-  number or string, but it can also hold an entire multi-dimensional tensor
-  array. The values for a `Const` are stored inside the `NodeDef`, and so large
-  constants can take up a lot of room when serialized.
-
-- [Checkpoint](https://www.tensorflow.org/code/tensorflow/core/util/tensor_bundle/tensor_bundle.h). Another
-  way of storing values for a model is by using `Variable` ops. Unlike `Const`
-  ops, these don’t store their content as part of the `NodeDef`, so they take up
-  very little space within the `GraphDef` file. Instead their values are held in
-  RAM while a computation is running, and then saved out to disk as checkpoint
-  files periodically. This typically happens as a neural network is being
-  trained and weights are updated, so it’s a time-critical operation, and it may
-  happen in a distributed fashion across many workers, so the file format has to
-  be both fast and flexible. They are stored as multiple checkpoint files,
-  together with metadata files that describe what’s contained within the
-  checkpoints. When you’re referring to a checkpoint in the API (for example
-  when passing a filename in as a command line argument), you’ll use the common
-  prefix for a set of related files. If you had these files:
-
-        /tmp/model/model-chkpt-1000.data-00000-of-00002
-        /tmp/model/model-chkpt-1000.data-00001-of-00002
-        /tmp/model/model-chkpt-1000.index
-        /tmp/model/model-chkpt-1000.meta
-
-    You would refer to them as `/tmp/model/chkpt-1000`.
-
-- [GraphDef](https://www.tensorflow.org/code/tensorflow/core/framework/graph.proto):
-  Has a list of `NodeDefs`, which together define the computational graph to
-  execute. During training, some of these nodes will be `Variables`, and so if
-  you want to have a complete graph you can run, including the weights, you’ll
-  need to call a restore operation to pull those values from
-  checkpoints. Because checkpoint loading has to be flexible to deal with all of
-  the training requirements, this can be tricky to implement on mobile and
-  embedded devices, especially those with no proper file system available like
-  iOS. This is where
-  the
-  [`freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py) script
-  comes in handy. As mentioned above, `Const` ops store their values as part of
-  the `NodeDef`, so if all the `Variable` weights are converted to `Const` nodes,
-  then we only need a single `GraphDef` file to hold the model architecture and
-  the weights. Freezing the graph handles the process of loading the
-  checkpoints, and then converts all Variables to Consts. You can then load the
-  resulting file in a single call, without having to restore variable values
-  from checkpoints. One thing to watch out for with `GraphDef` files is that
-  sometimes they’re stored in text format for easy inspection. These versions
-  usually have a ‘.pbtxt’ filename suffix, whereas the binary files end with
-  ‘.pb’.
-
-- [FunctionDefLibrary](https://www.tensorflow.org/code/tensorflow/core/framework/function.proto):
-  This appears in `GraphDef`, and is effectively a set of sub-graphs, each with
-  information about their input and output nodes. Each sub-graph can then be
-  used as an op in the main graph, allowing easy instantiation of different
-  nodes, in a similar way to how functions encapsulate code in other languages.
-
-- [MetaGraphDef](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto):
-  A plain `GraphDef` only has information about the network of computations, but
-  doesn’t have any extra information about the model or how it can be
-  used. `MetaGraphDef` contains a `GraphDef` defining the computation part of
-  the model, but also includes information like ‘signatures’, which are
-  suggestions about which inputs and outputs you may want to call the model
-  with, data on how and where any checkpoint files are saved, and convenience
-  tags for grouping ops together for ease of use.
-
-- [SavedModel](https://www.tensorflow.org/code/tensorflow/core/protobuf/saved_model.proto):
-  It’s common to want to have different versions of a graph that rely on a
-  common set of variable checkpoints. For example, you might need a GPU and a
-  CPU version of the same graph, but keep the same weights for both. You might
-  also need some extra files (like label names) as part of your
-  model. The
-  [SavedModel](https://www.tensorflow.org/code/tensorflow/python/saved_model/README.md) format
-  addresses these needs by letting you save multiple versions of the same graph
-  without duplicating variables, and also storing asset files in the same
-  bundle. Under the hood, it uses `MetaGraphDef` and checkpoint files, along
-  with extra metadata files. It’s the format that you’ll want to use if you’re
-  deploying a web API using TensorFlow Serving, for example.
-
-## How do you get a model you can use on mobile?
-
-In most situations, training a model with TensorFlow will give you a folder
-containing a `GraphDef` file (usually ending with the `.pb` or `.pbtxt` extension) and
-a set of checkpoint files. What you need for mobile or embedded deployment is a
-single `GraphDef` file that’s been ‘frozen’, or had its variables converted into
-inline constants so everything’s in one file.  To handle the conversion, you’ll
-need the `freeze_graph.py` script, that’s held in
-[`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this:
-
-    bazel build tensorflow/python/tools:freeze_graph
-    bazel-bin/tensorflow/python/tools/freeze_graph \
-    --input_graph=/tmp/model/my_graph.pb \
-    --input_checkpoint=/tmp/model/model.ckpt-1000 \
-    --output_graph=/tmp/frozen_graph.pb \
-    --output_node_names=output_node \
-
-The `input_graph` argument should point to the `GraphDef` file that holds your
-model architecture. It’s possible that your `GraphDef` has been stored in a text
-format on disk, in which case it’s likely to end in `.pbtxt` instead of `.pb`,
-and you should add an extra `--input_binary=false` flag to the command.
-
-The `input_checkpoint` should be the most recent saved checkpoint. As mentioned
-in the checkpoint section, you need to give the common prefix to the set of
-checkpoints here, rather than a full filename.
-
-`output_graph` defines where the resulting frozen `GraphDef` will be
-saved. Because it’s likely to contain a lot of weight values that take up a
-large amount of space in text format, it’s always saved as a binary protobuf.
-
-`output_node_names` is a list of the names of the nodes that you want to extract
-the results of your graph from. This is needed because the freezing process
-needs to understand which parts of the graph are actually needed, and which are
-artifacts of the training process, like summarization ops. Only ops that
-contribute to calculating the given output nodes will be kept. If you know how
-your graph is going to be used, these should just be the names of the nodes you
-pass into `Session::Run()` as your fetch targets. The easiest way to find the
-node names is to inspect the Node objects while building your graph in python.
-Inspecting your graph in TensorBoard is another simple way.  You can get some
-suggestions on likely outputs by running the [`summarize_graph` tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms/README.md#inspecting-graphs).
-
-Because the output format for TensorFlow has changed over time, there are a
-variety of other less commonly used flags available too, like `input_saver`, but
-hopefully you shouldn’t need these on graphs trained with modern versions of the
-framework.
-
-## Using the Graph Transform Tool
-
-A lot of the things you need to do to efficiently run a model on device are
-available through the [Graph Transform
-Tool](https://www.tensorflow.org/code/tensorflow/tools/graph_transforms/README.md). This
-command-line tool takes an input `GraphDef` file, applies the set of rewriting
-rules you request, and then writes out the result as a `GraphDef`. See the
-documentation for more information on how to build and run this tool.
-
-### Removing training-only nodes
-
-TensorFlow `GraphDefs` produced by the training code contain all of the
-computation that’s needed for back-propagation and updates of weights, as well
-as the queuing and decoding of inputs, and the saving out of checkpoints. All of
-these nodes are no longer needed during inference, and some of the operations
-like checkpoint saving aren’t even supported on mobile platforms. To create a
-model file that you can load on devices you need to delete those unneeded
-operations by running the `strip_unused_nodes` rule in the Graph Transform Tool.
-
-The trickiest part of this process is figuring out the names of the nodes you
-want to use as inputs and outputs during inference.  You'll need these anyway
-once you start to run inference, but you also need them here so that the
-transform can calculate which nodes are not needed on the inference-only
-path. These may not be obvious from the training code. The easiest way to
-determine the node name is to explore the graph with TensorBoard.
-
-Remember that mobile applications typically gather their data from sensors and
-have it as arrays in memory, whereas training typically involves loading and
-decoding representations of the data stored on disk. In the case of Inception v3
-for example, there’s a `DecodeJpeg` op at the start of the graph that’s designed
-to take JPEG-encoded data from a file retrieved from disk and turn it into an
-arbitrary-sized image. After that there’s a `BilinearResize` op to scale it to
-the expected size, followed by a couple of other ops that convert the byte data
-into float and scale the value magnitudes it in the way the rest of the graph
-expects. A typical mobile app will skip most of these steps because it’s getting
-its input directly from a live camera, so the input node you will actually
-supply will be the output of the `Mul` node in this case.
-
-<img src ="../images/inception_input.png" width="300">
-
-You’ll need to do a similar process of inspection to figure out the correct
-output nodes.
-
-If you’ve just been given a frozen `GraphDef` file, and are not sure about the
-contents, try using the `summarize_graph` tool to print out information
-about the inputs and outputs it finds from the graph structure. Here’s an
-example with the original Inception v3 file:
-
-    bazel run tensorflow/tools/graph_transforms:summarize_graph --
-    --in_graph=tensorflow_inception_graph.pb
-
-Once you have an idea of what the input and output nodes are, you can feed them
-into the graph transform tool as the `--input_names` and `--output_names`
-arguments, and call the `strip_unused_nodes` transform, like this:
-
-    bazel run tensorflow/tools/graph_transforms:transform_graph --
-    --in_graph=tensorflow_inception_graph.pb
-    --out_graph=optimized_inception_graph.pb --inputs='Mul' --outputs='softmax'
-    --transforms='
-      strip_unused_nodes(type=float, shape="1,299,299,3")
-      fold_constants(ignore_errors=true)
-      fold_batch_norms
-      fold_old_batch_norms'
-
-One thing to look out for here is that you need to specify the size and type
-that you want your inputs to be. This is because any values that you’re going to
-be passing in as inputs to inference need to be fed to special `Placeholder` op
-nodes, and the transform may need to create them if they don’t already exist. In
-the case of Inception v3 for example, a `Placeholder` node replaces the old
-`Mul` node that used to output the resized and rescaled image array, since we’re
-going to be doing that processing ourselves before we call TensorFlow. It keeps
-the original name though, which is why we always feed in inputs to `Mul` when we
-run a session with our modified Inception graph.
-
-After you’ve run this process, you’ll have a graph that only contains the actual
-nodes you need to run your prediction process. This is the point where it
-becomes useful to run metrics on the graph, so it’s worth running
-`summarize_graph` again to understand what’s in your model.
-
-## What ops should you include on mobile?
-
-There are hundreds of operations available in TensorFlow, and each one has
-multiple implementations for different data types. On mobile platforms, the size
-of the executable binary that’s produced after compilation is important, because
-app download bundles need to be as small as possible for the best user
-experience. If all of the ops and data types are compiled into the TensorFlow
-library then the total size of the compiled library can be tens of megabytes, so
-by default only a subset of ops and data types are included.
-
-That means that if you load a model file that’s been trained on a desktop
-machine, you may see the error “No OpKernel was registered to support Op” when
-you load it on mobile. The first thing to try is to make sure you’ve stripped
-out any training-only nodes, since the error will occur at load time even if the
-op is never executed. If you’re still hitting the same problem once that’s done,
-you’ll need to look at adding the op to your built library.
-
-The criteria for including ops and types fall into several categories:
-
-- Are they only useful in back-propagation, for gradients? Since mobile is
-  focused on inference, we don’t include these.
-
-- Are they useful mainly for other training needs, such as checkpoint saving?
-  These we leave out.
-
-- Do they rely on frameworks that aren’t always available on mobile, such as
-  libjpeg? To avoid extra dependencies we don’t include ops like `DecodeJpeg`.
-
-- Are there types that aren’t commonly used? We don’t include boolean variants
-  of ops for example, since we don’t see much use of them in typical inference
-  graphs.
-
-These ops are trimmed by default to optimize for inference on mobile, but it is
-possible to alter some build files to change the default.  After alternating the
-build files, you will need to recompile TensorFlow.  See below for more details
-on how to do this, and also see <a href="./optimizing.md">optimizing binary size</a>
-for more on reducing your binary size.
-
-### Locate the implementation
-
-Operations are broken into two parts. The first is the op definition, which
-declares the signature of the operation, which inputs, outputs, and attributes
-it has. These take up very little space, and so all are included by default. The
-implementations of the op computations are done in kernels, which live in the
-`tensorflow/core/kernels` folder. You need to compile the C++ file containing
-the kernel implementation of the op you need into the library. To figure out
-which file that is, you can search for the operation name in the source
-files.
-
-[Here’s an example search in github](https://github.com/search?utf8=%E2%9C%93&q=repo%3Atensorflow%2Ftensorflow+extension%3Acc+path%3Atensorflow%2Fcore%2Fkernels+REGISTER+Mul&type=Code&ref=searchresults).
-
-You’ll see that this search is looking for the `Mul` op implementation, and it
-finds it in `tensorflow/core/kernels/cwise_op_mul_1.cc`. You need to look for
-macros beginning with `REGISTER`, with the op name you care about as one of the
-string arguments.
-
-In this case, the implementations are actually broken up across multiple `.cc`
-files, so you’d need to include all of them in your build. If you’re more
-comfortable using the command line for code search, here’s a grep command that
-also locates the right files if you run it from the root of your TensorFlow
-repository:
-
-`grep 'REGISTER.*"Mul"' tensorflow/core/kernels/*.cc`
-
-### Add the implementation to the build
-
-If you’re using Bazel, and building for Android, you’ll want to add the files
-you’ve found to
-the
-[`android_extended_ops_group1`](https://www.tensorflow.org/code/tensorflow/core/kernels/BUILD#L3565) or
-[`android_extended_ops_group2`](https://www.tensorflow.org/code/tensorflow/core/kernels/BUILD#L3632) targets. You
-may also need to include any .cc files they depend on in there. If the build
-complains about missing header files, add the .h’s that are needed into
-the
-[`android_extended_ops`](https://www.tensorflow.org/code/tensorflow/core/kernels/BUILD#L3525) target.
-
-If you’re using a makefile targeting iOS, Raspberry Pi, etc, go to
-[`tensorflow/contrib/makefile/tf_op_files.txt`](https://www.tensorflow.org/code/tensorflow/contrib/makefile/tf_op_files.txt) and
-add the right implementation files there.
diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc
index 1cec0d0c290679c7755cbf84858317489c0ba159..a9091924c064341316d788704daa643bc8e247b5 100644
--- a/tensorflow/lite/graph_info.cc
+++ b/tensorflow/lite/graph_info.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/graph_info.h"
 #include <algorithm>
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -94,6 +95,10 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // been identified.
     for (int output_index : info_->outputs()) {
       int output_epoch = tensor_epochs_[output_index];
+      if (output_epoch == kEpochAlwaysReady) {
+        // This happens when an input of subgraph is also an output of subgraph.
+        continue;
+      }
       NodeSubset& output_subset = (*node_subsets_)[output_epoch];
       output_subset.output_tensors.push_back(output_index);
     }
@@ -138,7 +143,8 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // See if all dependencies of this node are already assigned to a
     // node sub set.
     for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
-      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+      if (input_tensor_index != kOptionalTensor &&
+          tensor_epochs_[input_tensor_index] == kEpochNotReady) {
         return false;
       }
     }
@@ -162,6 +168,9 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
       // Look at our inputs one more time to update that tensor's
       // epochs' outputs
       for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        if (input_tensor_index == kOptionalTensor) {
+          continue;
+        }
         int input_epoch = tensor_epochs_[input_tensor_index];
         int node_epoch = current_epoch;
         if (input_epoch != node_epoch) {
diff --git a/tensorflow/lite/graph_info_test.cc b/tensorflow/lite/graph_info_test.cc
index 4d8bbdc0eef49b3f79b3c74c1d07fd86467e1d65..b72728a9a9c94c4ee7312e5ff6f17e7b4d3b8a95 100644
--- a/tensorflow/lite/graph_info_test.cc
+++ b/tensorflow/lite/graph_info_test.cc
@@ -101,7 +101,7 @@ void CheckPartitionSubgraphs(
 }
 
 // Test an empty trivial graph with no partitions.
-TEST(PartitionTest, Nodes0_PartitionNodes0) {
+TEST(PartitionTest, Nodes0PartitionNodes0) {
   SimpleTestGraph graph;
   std::vector<int> nodes_to_partition = {};
   std::vector<NodeSubset> generated_subgraphs;
@@ -109,6 +109,20 @@ TEST(PartitionTest, Nodes0_PartitionNodes0) {
   CheckPartitionSubgraphs(generated_subgraphs, {});
 }
 
+// Test a trivial graph with no node and only 1 tensor.
+// The tensor is input & output of the graph at the same time.
+// Note: This is a regression test to ensure the partitioning logic
+// handles this case without crashing.
+TEST(PartitionTest, Nodes0PartitionNodes0Tensors1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(1);
+  graph.SetInputsAndOutputs({0}, {0});
+  std::vector<int> nodes_to_partition = {};
+  std::vector<NodeSubset> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+  CheckPartitionSubgraphs(generated_subgraphs, {});
+}
+
 // Test a 1 node graph with no partitions.
 // Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[]
 // Output: [kTfNoPartition, tensor(0) -> node(0) -> tensor(1)]
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index 60fa2130fabaa692d23c109f42fa8883f6e8de19..6888183d14ce3375443cc4b50ba00f339da6530e 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/graph_info.h"
 #include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/minimal_logging.h"
 #include "tensorflow/lite/nnapi_delegate.h"
 #include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -32,9 +34,36 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+
+// Gets the current TfLiteQuantization from the legacy fLiteQuantizationParams.
+TfLiteQuantization GetQuantizationFromLegacy(
+    const TfLiteQuantizationParams& legacy_quantization) {
+  TfLiteQuantization quantization;
+  quantization.type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  affine_quantization->scale->data[0] = legacy_quantization.scale;
+  affine_quantization->zero_point->data[0] = legacy_quantization.zero_point;
+  quantization.params = affine_quantization;
+
+  return quantization;
+}
+
+}  // namespace
+
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
+  // Only log initialization once per-process to avoid log spam.
+  static std::once_flag init_log_once_flag;
+  std::call_once(init_log_once_flag, []() {
+    // TODO(b/128420794): Include the TFLite runtime version in the log.
+    TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Initialized TensorFlow Lite runtime.");
+  });
+
   // There's always at least 1 subgraph which is the primary subgraph.
   AddSubgraphs(1);
   context_ = primary_subgraph().context();
@@ -71,7 +100,7 @@ TfLiteStatus Interpreter::AllocateTensors() {
 }
 
 void Interpreter::ReserveNodes(int count) {
-  primary_subgraph().nodes_and_registration().reserve(count);
+  primary_subgraph().ReserveNodes(count);
 }
 
 void Interpreter::AddSubgraphs(int subgraphs_to_add,
@@ -123,24 +152,49 @@ TfLiteStatus Interpreter::ResetVariableTensors() {
   return primary_subgraph().ResetVariableTensors();
 }
 
+TfLiteStatus Interpreter::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    const char* buffer, size_t bytes, const Allocation* allocation) {
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, dims.size(), dims.data(), quantization, buffer,
+      bytes, allocation);
+}
+
+TfLiteStatus Interpreter::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    bool is_variable) {
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, dims.size(), dims.data(), quantization,
+      is_variable);
+}
+
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
-  return primary_subgraph().SetTensorParametersReadOnly(
-      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
-      allocation);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadOnly(
+          tensor_index, type, name, rank, dims, new_quantization, buffer, bytes,
+          allocation) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
-// Set description of inputs/outputs/data/fptrs for node `node_index`.
-// This variant assumes an external buffer has been allocated of size
-// bytes. The lifetime of buffer must be ensured to be greater or equal
-// to Interpreter.
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  return primary_subgraph().SetTensorParametersReadWrite(
-      tensor_index, type, name, rank, dims, quantization, is_variable);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadWrite(
+          tensor_index, type, name, rank, dims, new_quantization,
+          is_variable) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
@@ -178,7 +232,10 @@ void Interpreter::SetCancellationFunction(void* data,
 }
 
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
-  return primary_subgraph().ModifyGraphWithDelegate(delegate);
+  for (auto& subgraph : subgraphs_) {
+    TF_LITE_ENSURE_OK(context_, subgraph->ModifyGraphWithDelegate(delegate));
+  }
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 754439c9667980454d5ee4ef61892a4869cd95be..806b66c12a0bf119985927e4e937c71fc6fed487 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -160,6 +160,12 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -179,6 +185,13 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name,
+                                            const std::vector<int>& dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable = false);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadWrite(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index 1ee993c6f9c65289c18f73e7974c5ff7df713d5b..f67733f89f258d931dfae68c9d5e9e83a4c522ce 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/interpreter.h"
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -51,13 +52,25 @@ TfLiteRegistration* Register_NEG();
 }  // namespace ops
 namespace {
 
+using ::testing::IsEmpty;
+
 // Make an interpreter that has no tensors and no nodes
 TEST(BasicInterpreter, ZeroInterpreter) {
+  testing::internal::CaptureStderr();
+
   Interpreter interpreter;
+  EXPECT_THAT(testing::internal::GetCapturedStderr(),
+              testing::HasSubstr("INFO: Initialized TensorFlow Lite runtime"));
+
   interpreter.SetInputs({});
   interpreter.SetOutputs({});
   ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+
+  // Creating a new interpreter should not redundantly log runtime init.
+  testing::internal::CaptureStderr();
+  Interpreter interpreter2;
+  EXPECT_THAT(testing::internal::GetCapturedStderr(), IsEmpty());
 }
 
 // Test various error conditions.
@@ -73,8 +86,9 @@ TEST(BasicInterpreter, TestAllocateTensorsResetVariableTensors) {
   int tensor_index;
   ASSERT_EQ(interpreter.AddTensors(1, &tensor_index), kTfLiteOk);
   constexpr int kTensorSize = 16;
+  TfLiteQuantizationParams quant;
   interpreter.SetTensorParametersReadWrite(tensor_index, kTfLiteFloat32, "",
-                                           {kTensorSize}, {}, true);
+                                           {kTensorSize}, quant, true);
   interpreter.SetVariables({tensor_index});
   ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
   TfLiteTensor* tensor = interpreter.tensor(tensor_index);
@@ -170,6 +184,53 @@ TEST(BasicInterpreter, CheckAllocate) {
   }
 }
 
+TEST(BasicInterpreter, CheckQuantization) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  interpreter.SetInputs({0, 1});
+  interpreter.SetOutputs({});
+  TfLiteType tensor_type = kTfLiteInt8;
+  const uint8_t int8s[] = {3, 4};
+  float scale = 0.5f;
+  int32_t zero_point = 12;
+
+  TfLiteQuantization rw_quantization;
+  rw_quantization.type = kTfLiteAffineQuantization;
+  auto* rw_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  rw_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  rw_affine_quantization->scale->data[0] = scale;
+  rw_affine_quantization->zero_point->data[0] = zero_point;
+  rw_quantization.params = rw_affine_quantization;
+
+  TfLiteQuantization ro_quantization;
+  ro_quantization.type = kTfLiteAffineQuantization;
+  auto* ro_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  ro_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  ro_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  ro_affine_quantization->scale->data[0] = scale;
+  ro_affine_quantization->zero_point->data[0] = zero_point;
+  ro_quantization.params = ro_affine_quantization;
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(0, tensor_type, "", {3},
+                                                     rw_quantization),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadOnly(
+                1, tensor_type, "", {2}, ro_quantization,
+                reinterpret_cast<const char*>(int8s), 2),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  // Check that the legacy scale and zero_point are set correctly.
+  ASSERT_EQ(interpreter.tensor(0)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(0)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(0)->quantization.type, rw_quantization.type);
+  ASSERT_EQ(interpreter.tensor(1)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(1)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(1)->quantization.type, ro_quantization.type);
+}
+
 TEST(BasicInterpreter, CheckResize) {
   const float floats[] = {-3., -4.};
   const int32_t int32s[] = {-3, -4};
diff --git a/tensorflow/lite/java/AndroidManifest.xml b/tensorflow/lite/java/AndroidManifest.xml
index b91c6d149a213926be90b9b131bd632d4f79a0fc..a76a727ec75d231a506b4ef693b3dcd681515b1a 100644
--- a/tensorflow/lite/java/AndroidManifest.xml
+++ b/tensorflow/lite/java/AndroidManifest.xml
@@ -3,7 +3,6 @@
     package="org.tensorflow.lite">
 
     <uses-sdk
-        android:minSdkVersion="4"
         android:targetSdkVersion="19" />
 
     <application />
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index a539a0cf771a898ec8d9eefb8d58590ccbeb7e22..8983079a31d7d99dbd666387c0a2c0ded63747e8 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -90,6 +90,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorFlowLiteTest",
     deps = [
         ":tensorflowlitelib",
@@ -103,6 +106,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.DataTypeTest",
     deps = [
         ":tensorflowlitelib",
@@ -126,6 +132,9 @@ java_test(
         "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
     deps = [
         ":tensorflowlitelib",
@@ -186,6 +195,9 @@ java_test(
         "src/testdata/add.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorTest",
     deps = [
         ":tensorflowlitelib",
@@ -199,6 +211,9 @@ filegroup(
     srcs = select({
         "//conditions:default": [":libtensorflowlite_jni.so"],
     }),
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 814d236872caff05e9fbd4dc5aa4a9a995eb586b..c6f315b545bbe8196999df07c6a4bcdfdaafa2d5 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -45,7 +45,6 @@ import android.os.Bundle;
 import android.os.Handler;
 import android.os.HandlerThread;
 import android.support.annotation.NonNull;
-import android.support.v13.app.FragmentCompat;
 import android.support.v4.content.ContextCompat;
 import android.text.SpannableString;
 import android.text.SpannableStringBuilder;
@@ -62,6 +61,7 @@ import android.widget.ListView;
 import android.widget.NumberPicker;
 import android.widget.TextView;
 import android.widget.Toast;
+import android.support.v13.app.FragmentCompat;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -71,7 +71,6 @@ import java.util.List;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 
-
 /** Basic fragments for the Camera. */
 public class Camera2BasicFragment extends Fragment
     implements FragmentCompat.OnRequestPermissionsResultCallback {
diff --git a/tensorflow/lite/java/jni/BUILD b/tensorflow/lite/java/jni/BUILD
index ce17ac4fa0d37cb0b790617c4258ea469d14a664..3121cda7fe65a245a544fc8ec74c617f91166177 100644
--- a/tensorflow/lite/java/jni/BUILD
+++ b/tensorflow/lite/java/jni/BUILD
@@ -39,7 +39,7 @@ genrule(
 genrule(
     name = "copy_jni_md_h",
     srcs = select({
-        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//tensorflow:macos": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
         "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
     }),
     outs = ["jni_md.h"],
diff --git a/tensorflow/lite/java/ovic/BUILD b/tensorflow/lite/java/ovic/BUILD
index 774320871eec9afb2fae31824dc021fb7d338e1e..b00c9cd05809c9a694f32a25ae4fde3c33d40a88 100644
--- a/tensorflow/lite/java/ovic/BUILD
+++ b/tensorflow/lite/java/ovic/BUILD
@@ -19,7 +19,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicClassifierTest",
     visibility = ["//visibility:public"],
     deps = [
@@ -87,7 +90,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicDetectorTest",
     visibility = ["//visibility:public"],
     deps = [
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 725bb326ba1d6a9d9c206cd4fb01bdf687b0a79c..16cca45f388953e0616dd0b1b4c24114e5a6108b 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -107,7 +107,7 @@ public final class Tensor {
       throw new IllegalArgumentException(
           "Null inputs are allowed only if the Tensor is bound to a buffer handle.");
     }
-    throwExceptionIfTypeIsIncompatible(src);
+    throwIfDataIsIncompatible(src);
     if (isByteBuffer(src)) {
       ByteBuffer srcBuffer = (ByteBuffer) src;
       // For direct ByteBuffer instances we support zero-copy. Note that this assumes the caller
@@ -138,7 +138,7 @@ public final class Tensor {
       throw new IllegalArgumentException(
           "Null outputs are allowed only if the Tensor is bound to a buffer handle.");
     }
-    throwExceptionIfTypeIsIncompatible(dst);
+    throwIfDataIsIncompatible(dst);
     if (dst instanceof ByteBuffer) {
       ByteBuffer dstByteBuffer = (ByteBuffer) dst;
       dstByteBuffer.put(buffer());
@@ -159,6 +159,7 @@ public final class Tensor {
     if (isByteBuffer(input)) {
       return null;
     }
+    throwIfTypeIsIncompatible(input);
     int[] inputShape = computeShapeOf(input);
     if (Arrays.equals(shapeCopy, inputShape)) {
       return null;
@@ -243,16 +244,14 @@ public final class Tensor {
     }
   }
 
-  private void throwExceptionIfTypeIsIncompatible(Object o) {
+  private void throwIfDataIsIncompatible(Object o) {
+    throwIfTypeIsIncompatible(o);
+    throwIfShapeIsIncompatible(o);
+  }
+
+  private void throwIfTypeIsIncompatible(Object o) {
+    // ByteBuffer payloads can map to any type, so exempt it from the check.
     if (isByteBuffer(o)) {
-      ByteBuffer oBuffer = (ByteBuffer) o;
-      if (oBuffer.capacity() != numBytes()) {
-        throw new IllegalArgumentException(
-            String.format(
-                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
-                    + "ByteBuffer with %d bytes.",
-                numBytes(), oBuffer.capacity()));
-      }
       return;
     }
     DataType oType = dataTypeOf(o);
@@ -263,7 +262,20 @@ public final class Tensor {
                   + "object of type %s (which is compatible with the TensorFlowLite type %s).",
               dtype, o.getClass().getName(), oType));
     }
+  }
 
+  private void throwIfShapeIsIncompatible(Object o) {
+    if (isByteBuffer(o)) {
+      ByteBuffer oBuffer = (ByteBuffer) o;
+      if (oBuffer.capacity() != numBytes()) {
+        throw new IllegalArgumentException(
+            String.format(
+                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
+                    + "ByteBuffer with %d bytes.",
+                numBytes(), oBuffer.capacity()));
+      }
+      return;
+    }
     int[] oShape = computeShapeOf(o);
     if (!Arrays.equals(oShape, shapeCopy)) {
       throw new IllegalArgumentException(
diff --git a/tensorflow/lite/java/src/main/native/exception_jni.cc b/tensorflow/lite/java/src/main/native/exception_jni.cc
index 5406c7197f0c6ba6fd17c3472a365ef2d56d07a4..74217d6b682c72917ed00d4ef8173ab0c0af0605 100644
--- a/tensorflow/lite/java/src/main/native/exception_jni.cc
+++ b/tensorflow/lite/java/src/main/native/exception_jni.cc
@@ -31,12 +31,14 @@ void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...) {
   va_start(args, fmt);
   const size_t max_msg_len = 512;
   auto* message = static_cast<char*>(malloc(max_msg_len));
-  if (vsnprintf(message, max_msg_len, fmt, args) >= 0) {
+  if (message && (vsnprintf(message, max_msg_len, fmt, args) >= 0)) {
     env->ThrowNew(env->FindClass(clazz), message);
   } else {
     env->ThrowNew(env->FindClass(clazz), "");
   }
-  free(message);
+  if (message) {
+    free(message);
+  }
   va_end(args);
 }
 
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index c5496e3a21e7f5d27c36d92e49dd6c8e622b0070..ff3325633c1d71a950682764b6d1576e3b75ed6a 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.fail;
 import java.io.File;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -247,6 +248,18 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testRunWithUnsupportedInputType() {
+    FloatBuffer floatBuffer = FloatBuffer.allocate(10);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    try (Interpreter interpreter = new Interpreter(MODEL_FILE)) {
+      interpreter.run(floatBuffer, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("DataType error: cannot resolve DataType of");
+    }
+  }
+
   @Test
   public void testRunWithWrongOutputType() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
diff --git a/tensorflow/lite/kernels/Android.bp b/tensorflow/lite/kernels/Android.bp
index f0c0d12834934ba1d81e3f854f21f5178e33a0dc..b9da5fb594ce543978fa5f10f2532d71ee00da6f 100644
--- a/tensorflow/lite/kernels/Android.bp
+++ b/tensorflow/lite/kernels/Android.bp
@@ -28,6 +28,8 @@ cc_library_static {
     ],
     cflags: [
         "-Wno-extern-c-compat",
+        "-Wno-sign-compare",
+        "-Wno-unused-function",
     ]
 }
 
@@ -37,12 +39,14 @@ cc_library_static {
     srcs: [
         "activations.cc",
         "add.cc",
+        "add_n.cc",
         "arg_min_max.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "ceil.cc",
         "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
@@ -62,10 +66,11 @@ cc_library_static {
         "floor_mod.cc",
         "fully_connected.cc",
         "gather.cc",
+        "gather_nd.cc",
         "hashtable_lookup.cc",
+        "if.cc",
         "kernel_util.cc",
         "l2norm.cc",
-        "layer_norm_lstm.cc",
         "local_response_norm.cc",
         "logical.cc",
         "lsh_projection.cc",
@@ -81,9 +86,11 @@ cc_library_static {
         "pooling.cc",
         "pow.cc",
         "range.cc",
+        "rank.cc",
         "reduce.cc",
-        "relu1.cc",
         "register.cc",
+        "reverse.cc",
+        "reverse_sequence.cc",
         "reshape.cc",
         "resize_bilinear.cc",
         "resize_nearest_neighbor.cc",
@@ -109,6 +116,8 @@ cc_library_static {
         "unidirectional_sequence_rnn.cc",
         "unique.cc",
         "unpack.cc",
+        "where.cc",
+        "while.cc",
         "zeros_like.cc",
 	"internal/kernel_utils.cc",
         "internal/tensor_utils.cc",
@@ -127,10 +136,13 @@ cc_library_static {
         "-Wno-array-bounds",
         "-Wno-extern-c-compat",
         "-Wno-invalid-partial-specialization",
+        "-Wno-ignored-attributes",
         "-Wno-missing-field-initializers",
         "-Wno-sign-compare",
         "-Wno-unused-local-typedef",
+        "-Wno-unused-function",
         "-Wno-unused-variable",
+        "-Wno-unused-private-field",
         "-Wno-mismatched-tags",
     ],
 }
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 7a4b6b8644be52274f298f6a23c55d677fcfdd35..4f04445d972eb6b1025e49d3c3148be7b736fa74 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -6,7 +6,7 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_android")
+load("//tensorflow:tensorflow.bzl", "tf_opts_nortti_if_android")
 
 # Suppress warnings that are introduced by Eigen Tensor.
 EXTRA_EIGEN_COPTS = select({
@@ -21,7 +21,7 @@ EXTRA_EIGEN_COPTS = select({
     "//conditions:default": ["-Wno-error=reorder"],
 })
 
-tf_cc_test(
+cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/tools/optimize:quantization_utils",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -110,12 +111,12 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:quantization_util",
         "//tensorflow/lite/kernels/internal:round",
-        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
@@ -126,7 +127,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "test_util_test",
     size = "small",
     srcs = ["test_util_test.cc"],
@@ -151,6 +152,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "add_n.cc",
         "arg_min_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
@@ -158,6 +160,7 @@ cc_library(
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "ceil.cc",
         "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
@@ -177,9 +180,10 @@ cc_library(
         "floor_mod.cc",
         "fully_connected.cc",
         "gather.cc",
+        "gather_nd.cc",
         "hashtable_lookup.cc",
+        "if.cc",
         "l2norm.cc",
-        "layer_norm_lstm.cc",
         "local_response_norm.cc",
         "logical.cc",
         "lsh_projection.cc",
@@ -195,18 +199,19 @@ cc_library(
         "pooling.cc",
         "pow.cc",
         "range.cc",
+        "rank.cc",
         "reduce.cc",
-        "relu1.cc",
         "reshape.cc",
         "resize_bilinear.cc",
         "resize_nearest_neighbor.cc",
+        "reverse.cc",
+        "reverse_sequence.cc",
         "select.cc",
         "shape.cc",
         "skip_gram.cc",
         "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
-        "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
         "split_v.cc",
@@ -223,6 +228,8 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
         "unique.cc",
         "unpack.cc",
+        "where.cc",
+        "while.cc",
         "zeros_like.cc",
     ],
     hdrs = [
@@ -285,14 +292,13 @@ cc_library(
     srcs = ["register_ref.cc"],
     hdrs = ["register_ref.h"],
     deps = [
-        ":builtin_op_kernels",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:util",
         "//tensorflow/lite/c:c_api_internal",
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
@@ -305,7 +311,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
@@ -318,7 +324,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "detection_postprocess_test",
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
@@ -331,37 +337,22 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "relu1_test",
-    size = "small",
-    srcs = ["relu1_test.cc"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "sparse_output_fully_connected_test",
+cc_test(
+    name = "activations_test",
     size = "small",
-    srcs = ["sparse_output_fully_connected_test.cc"],
+    srcs = ["activations_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
-        "//tensorflow/lite/kernels/internal:types",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
     ],
 )
 
-tf_cc_test(
-    name = "activations_test",
+cc_test(
+    name = "add_test",
     size = "small",
-    srcs = ["activations_test.cc"],
+    srcs = ["add_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -370,19 +361,19 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "add_test",
+cc_test(
+    name = "add_n_test",
     size = "small",
-    srcs = ["add_test.cc"],
+    srcs = ["add_n_test.cc"],
     deps = [
         ":builtin_ops",
+        ":test_util",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "arg_min_max_test",
     size = "small",
     srcs = ["arg_min_max_test.cc"],
@@ -394,7 +385,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
@@ -406,7 +397,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
@@ -418,7 +409,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
@@ -432,7 +423,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
@@ -444,7 +435,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
@@ -456,7 +447,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
@@ -468,7 +459,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
@@ -480,7 +471,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
@@ -493,7 +484,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
@@ -506,7 +497,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
@@ -520,7 +511,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
@@ -532,7 +523,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
@@ -545,7 +536,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "floor_test",
     size = "small",
     srcs = ["floor_test.cc"],
@@ -557,7 +548,22 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
+    name = "ceil_test",
+    size = "small",
+    srcs = ["ceil_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "elementwise_test",
     size = "small",
     srcs = ["elementwise_test.cc"],
@@ -569,7 +575,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
@@ -581,7 +587,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "bidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
@@ -593,7 +599,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
@@ -605,7 +611,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
@@ -617,7 +623,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
@@ -629,7 +635,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "fake_quant_test",
     size = "small",
     srcs = ["fake_quant_test.cc"],
@@ -641,7 +647,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
@@ -653,7 +659,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "reduce_test",
     size = "small",
     srcs = ["reduce_test.cc"],
@@ -665,7 +671,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
@@ -677,7 +683,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
@@ -689,7 +695,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
@@ -701,7 +707,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
@@ -714,7 +720,20 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
+    name = "gather_nd_test",
+    size = "small",
+    srcs = ["gather_nd_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
@@ -727,7 +746,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
@@ -739,7 +758,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "resize_nearest_neighbor_test",
     size = "small",
     srcs = ["resize_nearest_neighbor_test.cc"],
@@ -751,7 +770,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
@@ -763,7 +782,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
@@ -775,7 +794,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
@@ -787,7 +806,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
@@ -801,7 +820,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
@@ -813,7 +832,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
@@ -825,7 +844,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
@@ -838,7 +857,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
@@ -851,7 +870,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
@@ -863,7 +882,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
@@ -876,20 +895,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "layer_norm_lstm_test",
-    size = "small",
-    srcs = ["layer_norm_lstm_test.cc"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
+cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
@@ -901,7 +907,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
@@ -914,7 +920,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
@@ -926,7 +932,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
@@ -938,7 +944,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "split_v_test",
     size = "small",
     srcs = ["split_v_test.cc"],
@@ -950,7 +956,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
@@ -962,7 +968,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
@@ -974,7 +980,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "tile_test",
     size = "small",
     srcs = ["tile_test.cc"],
@@ -987,7 +993,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "comparisons_test",
     size = "small",
     srcs = [
@@ -1001,7 +1007,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "neg_test",
     size = "small",
     srcs = ["neg_test.cc"],
@@ -1013,7 +1019,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "select_test",
     size = "small",
     srcs = [
@@ -1027,7 +1033,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "slice_test",
     size = "small",
     srcs = [
@@ -1041,7 +1047,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "transpose_conv_test",
     size = "small",
     srcs = ["transpose_conv_test.cc"],
@@ -1054,7 +1060,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "expand_dims_test",
     size = "small",
     srcs = ["expand_dims_test.cc"],
@@ -1067,7 +1073,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "sparse_to_dense_test",
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
@@ -1080,7 +1086,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "shape_test",
     size = "small",
     srcs = ["shape_test.cc"],
@@ -1093,7 +1099,20 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
+    name = "rank_test",
+    size = "small",
+    srcs = ["rank_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "pow_test",
     size = "small",
     srcs = ["pow_test.cc"],
@@ -1106,7 +1125,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "pack_test",
     size = "small",
     srcs = ["pack_test.cc"],
@@ -1119,7 +1138,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "one_hot_test",
     size = "small",
     srcs = ["one_hot_test.cc"],
@@ -1131,7 +1150,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "logical_test",
     size = "small",
     srcs = ["logical_test.cc"],
@@ -1144,7 +1163,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "unpack_test",
     size = "small",
     srcs = ["unpack_test.cc"],
@@ -1157,7 +1176,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "floor_div_test",
     size = "small",
     srcs = ["floor_div_test.cc"],
@@ -1170,7 +1189,20 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
+    name = "where_test",
+    size = "small",
+    srcs = ["where_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
     name = "zeros_like_test",
     size = "small",
     srcs = ["zeros_like_test.cc"],
@@ -1183,7 +1215,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "floor_mod_test",
     size = "small",
     srcs = ["floor_mod_test.cc"],
@@ -1196,7 +1228,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "range_test",
     size = "small",
     srcs = ["range_test.cc"],
@@ -1209,19 +1241,54 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "squared_difference_test",
     size = "small",
     srcs = ["squared_difference_test.cc"],
     deps = [
         ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
     ],
 )
 
-tf_cc_test(
+cc_test(
+    name = "if_test",
+    size = "small",
+    srcs = ["if_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":subgraph_test_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":subgraph_test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_test(
     name = "fill_test",
     size = "small",
     srcs = ["fill_test.cc"],
@@ -1233,7 +1300,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "unique_test",
     srcs = ["unique_test.cc"],
     deps = [
@@ -1244,6 +1311,18 @@ tf_cc_test(
     ],
 )
 
+cc_test(
+    name = "reverse_test",
+    size = "small",
+    srcs = ["reverse_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -1258,7 +1337,7 @@ filegroup(
 
 tflite_portable_test_suite()
 
-tf_cc_test(
+cc_test(
     name = "mirror_pad_test",
     srcs = ["mirror_pad_test.cc"],
     deps = [
@@ -1268,3 +1347,43 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+cc_library(
+    name = "subgraph_test_util",
+    testonly = 1,
+    srcs = ["subgraph_test_util.cc"],
+    hdrs = ["subgraph_test_util.h"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_test(
+    name = "subgraph_test_util_test",
+    size = "small",
+    srcs = ["subgraph_test_util_test.cc"],
+    deps = [
+        ":subgraph_test_util",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "reverse_sequence_test",
+    size = "small",
+    srcs = ["reverse_sequence_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 4463a6c5a65bf848ad68635717750d3a214dd0a0..930eabaeccfde5c9fce824a58d28d14783dde419 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -23,7 +23,10 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -60,9 +63,9 @@ namespace {
 TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
                                     const TfLiteTensor* input,
                                     const TfLiteTensor* output) {
+  TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
   if (input->type == kTfLiteUInt8) {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
   } else {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
   }
@@ -118,7 +121,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     static constexpr int kInputIntegerBits = 4;
 
     const double input_real_multiplier =
@@ -177,8 +180,15 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<uint8_t>::min());
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<int8_t>::min());
+    }
     TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
 
     static constexpr int kInputIntegerBits = 4;
@@ -261,8 +271,13 @@ TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 127);
+    }
     TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256);
 
     static const double kBeta = 1.0;
@@ -353,6 +368,24 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+namespace {
+template <typename T>
+void QuantizedRelu6(const TfLiteTensor* input, TfLiteTensor* output) {
+  ActivationParams params;
+  params.activation_type = FusedActivationFunctionType::kRelu6;
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(0.f / output->params.scale)));
+  params.quantized_activation_max =
+      std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+  optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
+                       GetTensorShape(output), GetTensorData<T>(output));
+}
+}  // namespace
+
 TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -365,23 +398,16 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
-    case kTfLiteUInt8: {
-      ActivationParams params;
-      params.activation_type = FusedActivationFunctionType::kRelu6;
-      params.quantized_activation_min = std::max(
-          0, output->params.zero_point +
-                 static_cast<int32>(roundf(0.f / output->params.scale)));
-      params.quantized_activation_max = std::min(
-          255, output->params.zero_point +
-                   static_cast<int32>(roundf(6.f / output->params.scale)));
-      optimized_ops::ReluX(params, GetTensorShape(input),
-                           GetTensorData<uint8>(input), GetTensorShape(output),
-                           GetTensorData<uint8>(output));
+    case kTfLiteUInt8:
+      QuantizedRelu6<uint8_t>(input, output);
+      return kTfLiteOk;
+    case kTfLiteInt8: {
+      QuantizedRelu6<int8_t>(input, output);
       return kTfLiteOk;
     } break;
     default:
       context->ReportError(
-          context, "Only float32 and uint8 supported currently, got %s.",
+          context, "Only float32, uint8 and int8 supported currently, got %s.",
           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
@@ -436,6 +462,16 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       }
       return kTfLiteOk;
     } break;
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int size = MatchingFlatSize(input_shape, output_shape);
+      reference_integer_ops::Tanh(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
+      return kTfLiteOk;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently, got %s.",
                            TfLiteTypeGetName(input->type));
@@ -493,6 +529,15 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       }
       break;
     }
+    case kTfLiteInt8: {
+      const int input_size =
+          MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+      reference_integer_ops::Logistic(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, input_size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
+      break;
+    }
     default:
       context->ReportError(context, "Only float32 supported currently, got %s.",
                            TfLiteTypeGetName(input->type));
@@ -815,6 +860,21 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       }
       return kTfLiteOk;
     }
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int trailing_dim = input_shape.DimensionsCount() - 1;
+      const int outer_size =
+          MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+      const int depth =
+          MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+      reference_integer_ops::LogSoftmax(
+          data->input_multiplier, data->input_left_shift,
+          data->reverse_scaling_divisor, data->reverse_scaling_right_shift,
+          data->diff_min, outer_size, depth, GetTensorData<int8_t>(input),
+          GetTensorData<int8_t>(output));
+      return kTfLiteOk;
+    }
     default:
       context->ReportError(context, "Only float32 supported currently., got %s",
                            TfLiteTypeGetName(input->type));
@@ -884,8 +944,31 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::Elu(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
 }  // namespace activations
 
+TfLiteRegistration* Register_ELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::EluEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_RELU() {
   static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
                                  activations::GenericPrepare,
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 5e3c56ed5bf7092581fbbced6d3735958c19580c..25b17a9678728f0ee82ccf22e2a5b63eee2c3537 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -32,6 +32,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     input_ = AddInput(input);
     if (input.type == TensorType_UINT8) {
       output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256, -128});
     } else {
       output_ = AddOutput({input.type, {}});
     }
@@ -116,6 +118,20 @@ class QuantizedActivationsOpModel : public BaseActivationsOpModel {
   }
 };
 
+TEST(FloatActivationsOpTest, Elu) {
+  FloatActivationsOpModel m(BuiltinOperator_ELU,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, -4,     //
+      3, -2, 10, -0.1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.0, -0.997521, 2.0, -0.981684,    //
+                                 3.0, -0.864665, 10.0, -0.0951626,  //
+                             })));
+}
+
 TEST(FloatActivationsOpTest, Relu) {
   FloatActivationsOpModel m(BuiltinOperator_RELU,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
@@ -172,7 +188,7 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Relu6) {
+TEST(QuantizedActivationsOpTest, Relu6Uint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -195,7 +211,29 @@ TEST(QuantizedActivationsOpTest, Relu6) {
               ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
 }
 
-TEST(QuantizedActivationsOpTest, Tanh) {
+TEST(QuantizedActivationsOpTest, Relu6Int8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(), ElementsAreArray(ArrayFloatNear(
+                                                    {
+                                                        0, 0, 2, 4,  //
+                                                        3, 0, 6, 1,  //
+                                                    },
+                                                    kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, 0, 32, 64, 48, 0, 96, 16}));
+}
+
+TEST(QuantizedActivationsOpTest, TanhUint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -218,6 +256,29 @@ TEST(QuantizedActivationsOpTest, Tanh) {
               ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 225}));
 }
 
+TEST(QuantizedActivationsOpTest, TanhInt8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_TANH,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, kMin, kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      -4, -2, 8, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.999987, 0.964027, 0.999329,     //
+                      -0.999329, -0.96402, 0.99999, 0.76159,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -128, 123, 127, -128, -123, 127, 97}));
+}
+
 TEST(QuantizedActivationsOpTest, TanhInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -253,7 +314,7 @@ TEST(FloatActivationsOpTest, Sigmoid) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Sigmoid) {
+TEST(QuantizedActivationsOpTest, SigmoidUint8) {
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOGISTIC,
       /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
@@ -273,6 +334,26 @@ TEST(QuantizedActivationsOpTest, Sigmoid) {
               ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
+TEST(QuantizedActivationsOpTest, SigmoidInt8) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -127, 99, 123, 116, -99, 127, 60}));
+}
+
 TEST(QuantizedActivationsOpTest, SigmoidInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -705,7 +786,7 @@ TEST(FloatActivationsOpTest, LogSoftmax) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, LogSoftmax) {
+TEST(QuantizedActivationsOpTest, LogSoftmaxUint8) {
   const float kLogSoftmaxQuantizedTolerance = 16 / 256.0;
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOG_SOFTMAX,
@@ -727,6 +808,30 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
+TEST(QuantizedActivationsOpTest, LogSoftmaxInt8) {
+  const float kLogSoftmaxQuantizedTolerance = 0.06355;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOG_SOFTMAX,
+      /*input=*/{TensorType_INT8, {2, 4}, -10, 10},
+      /*output=*/{TensorType_INT8, {}, 0, 0, 16. / 256, 127});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -4.14297, -10.14297, -2.14297, -.142971,    //
+                      -7.00104, -12.00104, -.00104087, -9.00104,  //
+                  },
+                  kLogSoftmaxQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         61, -36, 93, 125,   //
+                                         15, -65, 127, -16,  //
+                                     }));
+}
+
 // A base class of PRelu op model. It provides the constructor for
 // FloatPReluOpModel and QuantizedPReluOpModel.
 class BasePReluOpModel : public SingleOpModel {
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 9867cc53b342d9fddda81db270c223de2ecda14f..4cfe435e9e2c4dab7253c4be4fffcb991cea4abd 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -92,7 +93,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     // 8bit -> 8bit general quantized path, with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
@@ -117,10 +118,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     QuantizeMultiplierSmallerThanOneExp(
         real_output_multiplier, &data->output_multiplier, &data->output_shift);
 
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
-
+    if (output->type == kTfLiteUInt8) {
+      CalculateActivationRangeUint8(params->activation, output,
+                                    &data->output_activation_min,
+                                    &data->output_activation_max);
+    } else {
+      CalculateActivationRangeInt8(params->activation, output,
+                                   &data->output_activation_min,
+                                   &data->output_activation_max);
+    }
   } else if (output->type == kTfLiteInt16) {
     // 16bit -> 16bit special quantized path, supporting only a rather
     // narrow case of quantization parameters: zero_points must all be 0
@@ -219,7 +225,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -235,25 +241,33 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                         data->output_activation_max, &op_params);
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_ADD(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output));
-    if (kernel_type == kReference) {
+#define TF_LITE_ADD(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output));
+    if (output->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
       } else {
-        TF_LITE_ADD(reference_ops, Add);
+        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
       }
     } else {
-      if (op_params.broadcast_category ==
-          BroadcastableOpCategory::kGenericBroadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
-      } else if (need_broadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
+        } else {
+          TF_LITE_ADD(reference_ops, Add, uint8_t);
+        }
       } else {
-        TF_LITE_ADD(optimized_ops, Add);
+        if (op_params.broadcast_category ==
+            BroadcastableOpCategory::kGenericBroadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, uint8_t);
+        } else if (need_broadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAddFivefold, uint8_t);
+        } else {
+          TF_LITE_ADD(optimized_ops, Add, uint8_t);
+        }
       }
     }
 #undef TF_LITE_ADD
@@ -292,7 +306,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(context,
                       EvalAddQuantized<kernel_type>(context, node, params, data,
                                                     input1, input2, output));
diff --git a/tensorflow/lite/kernels/add_n.cc b/tensorflow/lite/kernels/add_n.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9b2ea24afcd3eff107b110e7d5bb6226d95d3a
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace add_n {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int num_inputs = NumInputs(node);
+  TF_LITE_ENSURE(context, num_inputs >= 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = input1->type;
+
+  // Check that all input tensors have the same shape and type.
+  for (int i = kInputTensor1 + 1; i < num_inputs; ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE(context, HaveSameShapes(input1, input));
+    TF_LITE_ENSURE_EQ(context, input1->type, input->type);
+  }
+
+  // Use the first input node's dimension to be the dimension of the output
+  // node.
+  TfLiteIntArray* input1_dims = input1->dims;
+  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input1_dims);
+  return context->ResizeTensor(context, output, output_dims);
+}
+
+template <typename T>
+void EvalAddN(TfLiteContext* context, TfLiteNode* node) {
+  // TODO(haoliang): Initialize all_inputs only once during init.
+  VectorOfTensors<T> all_inputs(*context, *node->inputs);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  int num_inputs = NumInputs(node);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  reference_ops::AddN<T>(GetTensorShape(input1), num_inputs, all_inputs.data(),
+                         GetTensorData<T>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  if (output->type == kTfLiteFloat32) {
+    EvalAddN<float>(context, node);
+  } else if (output->type == kTfLiteInt32) {
+    EvalAddN<int32_t>(context, node);
+  } else {
+    context->ReportError(context,
+                         "AddN only supports FLOAT32|INT32 now, got %s.",
+                         TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace add_n
+
+TfLiteRegistration* Register_ADD_N() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 add_n::Prepare, add_n::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/add_n_test.cc b/tensorflow/lite/kernels/add_n_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee9477d2ff13c4e4f4e2da815d8f5660ab5b6c4e
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseAddNOpModel : public SingleOpModel {
+ public:
+  BaseAddNOpModel(const std::vector<TensorData>& inputs,
+                  const TensorData& output) {
+    int num_inputs = inputs.size();
+    std::vector<std::vector<int>> input_shapes;
+
+    for (int i = 0; i < num_inputs; ++i) {
+      inputs_.push_back(AddInput(inputs[i]));
+      input_shapes.push_back(GetShape(inputs_[i]));
+    }
+
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD_N, BuiltinOptions_AddNOptions,
+                 CreateAddNOptions(builder_).Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  int input(int i) { return inputs_[i]; }
+
+ protected:
+  std::vector<int> inputs_;
+  int output_;
+};
+
+class FloatAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatAddNOpModel, AddMultipleTensors) {
+  FloatAddNOpModel m({{TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}}},
+                     {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input(0), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input(1), {0.1, 0.2, 0.3, 0.5});
+  m.PopulateTensor<float>(m.input(2), {0.5, 0.1, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.4, 0.5, 1.1, 1.5}));
+}
+
+TEST(IntegerAddNOpModel, AddMultipleTensors) {
+  IntegerAddNOpModel m({{TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}}},
+                       {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input(0), {-20, 2, 7, 8});
+  m.PopulateTensor<int32_t>(m.input(1), {1, 2, 3, 5});
+  m.PopulateTensor<int32_t>(m.input(2), {10, -5, 1, -2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-9, -1, 11, 11}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
index 16045d457238e482bd7aad1077d0344632a7550b..2904f4a11a947264cb12fc2e8c0a7822df24c678 100644
--- a/tensorflow/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -63,9 +63,10 @@ class QuantizedAddOpModel : public BaseAddOpModel {
  public:
   using BaseAddOpModel::BaseAddOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -74,17 +75,15 @@ class QuantizedAddOpModel : public BaseAddOpModel {
   }
 };
 
-// for quantized Add, the error shouldn't exceed 2*step
+// for quantized Add, the error shouldn't exceed step
 float GetTolerance(float min, float max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 float GetToleranceInt16(float min, float max) {
   float kQuantizedStep = (max - min) / 32767.f;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 TEST(FloatAddOpModel, NoActivation) {
@@ -191,7 +190,8 @@ TEST(IntegerAddOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
@@ -200,19 +200,28 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   std::vector<std::vector<float>> results = {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -238,7 +247,8 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.3}};
@@ -247,53 +257,74 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-0.2, 0.6, 1.0, -0.1},
                                              {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear({-1.9, 0.5, 1.0, 1.3, 2.2, 2.1},
                                                 kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcast) {
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithScalarBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, {TensorType_UINT8, {}, -3.f, 3.f},
-        ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+        {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input2(), {0.1f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input2(),
+                                                     {0.1f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
                                         kQuantizedTolerance)))
         << "With shape number " << i;
@@ -301,22 +332,31 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcast) {
   // Re-run with exchanged inputs.
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, {}, -3.f, 3.f},
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(model_fixture.input1(), {0.1f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+        {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input1(),
+                                                     {0.1f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
                                         kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcast) {
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastUInt8) {
+  QuantizedWithScalarBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
+  QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   const std::vector<int> base_shape = {2, 3, 1, 2};
   std::vector<std::vector<int>> test_shapes = {
@@ -335,40 +375,48 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcast) {
       {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f,
        -1.3f}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, base_shape, -3.f, 3.f},
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
                                  2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
         << "With shape number " << i;
   }
   // Re-run with exchanged inputs.
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel model_fixture(
-        {TensorType_UINT8, test_shapes[i], -3.f, 3.f},
-        {TensorType_UINT8, base_shape, -3.f, 3.f},
-        {TensorType_UINT8, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input1(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
-    model_fixture.QuantizeAndPopulate<uint8_t>(
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
         model_fixture.input2(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
                                  2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
     model_fixture.Invoke();
     EXPECT_THAT(
-        model_fixture.GetDequantizedOutput(),
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
         ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastUInt8) {
+  QuantizedWithMixedBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
+  QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc
index eea2de27f74af8bf73df92c28ed6042e4d8fa4ff..e5223badc407059511f06cd538b6057c1e276966 100644
--- a/tensorflow/lite/kernels/arg_min_max.cc
+++ b/tensorflow/lite/kernels/arg_min_max.cc
@@ -36,9 +36,15 @@ TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* input,
     axis_value += NumDimensions(input);
   }
 
-  // Copy the input dimensions to output except make the axis dimension 1.
-  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
-  output_dims->data[axis_value] = 1;
+  // Copy the input dimensions to output except the axis dimension.
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(NumDimensions(input) - 1);
+  int j = 0;
+  for (int i = 0; i < NumDimensions(input); ++i) {
+    if (i != axis_value) {
+      output_dims->data[j] = SizeOfDimension(input, i);
+      ++j;
+    }
+  }
   return context->ResizeTensor(context, output, output_dims);
 }
 
@@ -74,13 +80,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt32:
       break;
 
     default:
       context->ReportError(
           context,
-          "Unkonwn input type: %d, only float32 and int types are supported",
+          "Unknown input type: %d, only float32 and int types are supported",
           input->type);
       return kTfLiteError;
   }
@@ -129,6 +136,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int32_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int32_t);
             break;
@@ -144,6 +154,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int64_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int64_t);
             break;
diff --git a/tensorflow/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc
index dcdff74cc6f376b3418b64c025e8eb4a36c429a0..01ea923f26d3ca32ec109a61d0484b0ecbd30c93 100644
--- a/tensorflow/lite/kernels/arg_min_max_test.cc
+++ b/tensorflow/lite/kernels/arg_min_max_test.cc
@@ -83,7 +83,29 @@ TEST(ArgMaxOpTest, GetMaxArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgUInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_UINT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<uint8_t>(model.input(), {1, 9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int8_t>(model.input(), {-1, -9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgInt) {
@@ -94,7 +116,7 @@ TEST(ArgMaxOpTest, GetMaxArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
@@ -105,7 +127,7 @@ TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
@@ -116,7 +138,7 @@ TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1, 0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgOutput64) {
@@ -127,7 +149,7 @@ TEST(ArgMaxOpTest, GetMaxArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgFloat) {
@@ -138,7 +160,7 @@ TEST(ArgMinOpTest, GetMinArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgInt) {
@@ -149,7 +171,7 @@ TEST(ArgMinOpTest, GetMinArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgMulDimensions) {
@@ -160,7 +182,7 @@ TEST(ArgMinOpTest, GetMinArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
@@ -171,7 +193,7 @@ TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0, 0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMinOpTest, GetMinArgOutput64) {
@@ -182,7 +204,7 @@ TEST(ArgMinOpTest, GetMinArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/batch_to_space_nd.cc b/tensorflow/lite/kernels/batch_to_space_nd.cc
index 34fdf34f70c9660266e23260bd5a6b645a3c5ccb..ce85aeddedcebdecf4d2944bade2ed5f823b0592 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd.cc
@@ -148,6 +148,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int8_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
index f33089559992c1a6a6fa34161122c43b7954fbdb..bd806b55ca48424e143a77d1f95640365af5fe77 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -26,8 +26,9 @@ using ::testing::ElementsAreArray;
 
 class BatchToSpaceNDOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -38,7 +39,10 @@ class BatchToSpaceNDOpModel : public SingleOpModel {
     PopulateTensor<int>(crops_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  protected:
@@ -58,11 +62,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
  public:
   BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
                              std::initializer_list<int> block_shape,
-                             std::initializer_list<int> crops) {
-    input_ = AddInput(TensorType_FLOAT32);
+                             std::initializer_list<int> crops,
+                             const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
     crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -81,11 +86,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
 //    m.Invoke();
 class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
  public:
-  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape,
+                               const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddInput(TensorType_INT32);
     crops_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -96,22 +102,47 @@ class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
 
 TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
   BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTestInt8) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0},
+                               TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
 TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTestInt8) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}, TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
 #ifdef GTEST_HAS_DEATH_TEST
@@ -127,7 +158,7 @@ TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
 
 TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, -1, 0});
   EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
index e807626108c2d45071adab62416c9c31f04d5a9a..0adf574bb0641b2ddd2774f1563a92a66023f7a2 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
@@ -53,7 +53,10 @@ constexpr int kBwWeightsTensor = 5;
 constexpr int kBwRecurrentWeightsTensor = 6;
 constexpr int kBwBiasTensor = 7;
 constexpr int kBwHiddenStateTensor = 8;
-// Auxiliary inputs.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 9;       // Optional.
 constexpr int kFwAuxWeightsTensor = 10;  // Optional.
 constexpr int kBwAuxWeightsTensor = 11;  // Optional.
@@ -113,13 +116,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_weights =
       GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) &&
+  const bool aux_inputs_weights_or_none =
+      ((fw_aux_input_weights != nullptr) &&
        (bw_aux_input_weights != nullptr)) ||
-      ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) &&
-       (bw_aux_input_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+      ((fw_aux_input_weights == nullptr) && (bw_aux_input_weights == nullptr));
+  TF_LITE_ENSURE(context, aux_inputs_weights_or_none);
+  const bool has_aux_input = (fw_aux_input_weights != nullptr);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -277,16 +279,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
-    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
-    const TfLiteTensor* bw_input_weights,
-    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
-    const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
-    const TfLiteTensor* bw_aux_input_weights,
-    const TfLiteBidirectionalSequenceRNNParams* params,
-    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
-    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
+TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* bw_input,
+                       const TfLiteTensor* fw_input_weights,
+                       const TfLiteTensor* fw_recurrent_weights,
+                       const TfLiteTensor* fw_bias,
+                       const TfLiteTensor* bw_input_weights,
+                       const TfLiteTensor* bw_recurrent_weights,
+                       const TfLiteTensor* bw_bias,
+                       const TfLiteTensor* aux_input,
+                       const TfLiteTensor* fw_aux_input_weights,
+                       const TfLiteTensor* bw_aux_input_weights,
+                       const TfLiteBidirectionalSequenceRNNParams* params,
+                       TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+                       TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -339,7 +344,7 @@ TfLiteStatus EvalFloat(
     float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
     for (int s = max_time - 1; s >= 0; s--) {
       const float* input_ptr_batch =
-          input->data.f + s * input_size * batch_size;
+          bw_input->data.f + s * input_size * batch_size;
       const float* aux_input_ptr_batch =
           (aux_input != nullptr)
               ? aux_input->data.f + s * input_size * batch_size
@@ -407,7 +412,8 @@ TfLiteStatus EvalFloat(
 }
 
 TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
+    const TfLiteTensor* input, const TfLiteTensor* bw_input,
+    const TfLiteTensor* fw_input_weights,
     const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
     const TfLiteTensor* bw_input_weights,
     const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
@@ -504,7 +510,7 @@ TfLiteStatus EvalHybrid(
       float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
       for (int s = max_time - 1; s >= 0; s--) {
         const float* input_ptr_batch =
-            input->data.f + s * input_size * batch_size;
+            bw_input->data.f + s * input_size * batch_size;
         const float* aux_input_ptr_batch =
             (aux_input != nullptr)
                 ? aux_input->data.f + s * input_size * batch_size
@@ -616,13 +622,35 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                 ? nullptr
                                 : GetOutput(context, node, kBwOutputTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_weights != nullptr);
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_weights->type) {
     case kTfLiteFloat32:
-      return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                       bw_input_weights, bw_recurrent_weights, bw_bias,
-                       aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                       params, fw_hidden_state, fw_output, bw_hidden_state,
-                       bw_output);
+      return EvalFloat(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                       fw_bias, bw_input_weights, bw_recurrent_weights, bw_bias,
+                       real_aux_input, fw_aux_input_weights,
+                       bw_aux_input_weights, params, fw_hidden_state, fw_output,
+                       bw_hidden_state, bw_output);
     case kTfLiteUInt8:
     case kTfLiteInt8: {
       TfLiteTensor* input_quantized =
@@ -634,17 +662,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* scaling_factors =
           GetTemporary(context, node, kScalingFactors);
       TfLiteTensor* aux_input_quantized =
-          (aux_input != nullptr)
-              ? GetTemporary(context, node, kAuxInputQuantized)
-              : nullptr;
-
-      return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                        bw_input_weights, bw_recurrent_weights, bw_bias,
-                        aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                        params, scaling_factors, input_quantized,
-                        aux_input_quantized, fw_hidden_state_quantized,
-                        fw_hidden_state, fw_output, bw_hidden_state_quantized,
-                        bw_hidden_state, bw_output);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
+
+      return EvalHybrid(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                        fw_bias, bw_input_weights, bw_recurrent_weights,
+                        bw_bias, real_aux_input, fw_aux_input_weights,
+                        bw_aux_input_weights, params, scaling_factors,
+                        input_quantized, aux_input_quantized,
+                        fw_hidden_state_quantized, fw_hidden_state, fw_output,
+                        bw_hidden_state_quantized, bw_hidden_state, bw_output);
     }
     default:
       context->ReportError(context, "Type not currently supported.");
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
index 5bad8e02c29608fa058d0d1104acbf09626f1b66..9b61f8238b558042e7a957d09dac162d8ea6450b 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -654,8 +654,8 @@ const std::initializer_list<float> recurrent_weights = {
 class BidirectionalRNNOpModel : public SingleOpModel {
  public:
   BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
-                          int bw_units, int input_size, bool time_major,
-                          bool merge_outputs)
+                          int bw_units, int input_size, bool use_aux_input,
+                          bool time_major, bool merge_outputs)
       : batches_(batches),
         sequence_len_(sequence_len),
         fw_units_(fw_units),
@@ -671,7 +671,13 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     bw_bias_ = AddInput(TensorType_FLOAT32);
     bw_hidden_state_ = AddInput(TensorType_FLOAT32, true);
 
-    aux_input_ = AddNullInput();
+    int aux_input_size = 0;
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+      aux_input_size = input_size_;
+    } else {
+      aux_input_ = AddNullInput();
+    }
     aux_fw_weights_ = AddNullInput();
     aux_bw_weights_ = AddNullInput();
 
@@ -691,18 +697,18 @@ class BidirectionalRNNOpModel : public SingleOpModel {
                      : std::vector<int>({batches_, sequence_len_, input_size_});
 
     BuildInterpreter({
-        input_shape,                   // input
-        {fw_units_, input_size_},      // fw_weights
-        {fw_units_, fw_units_},        // fw_recurrent_weights
-        {fw_units_},                   // fw_bias
-        {batches_, fw_units_},         // fw_hidden_state
-        {bw_units_, input_size_},      // bw_weights
-        {bw_units_, bw_units_},        // bw_recurrent_weights
-        {bw_units_},                   // bw_bias
-        {batches_, bw_units_},         // bw_hidden_state
-        {batches_, sequence_len_, 0},  // aux_input
-        {fw_units_, 0},                // aux_fw_weights
-        {bw_units_, 0},                // aux_bw_weights
+        input_shape,                                // input
+        {fw_units_, input_size_},                   // fw_weights
+        {fw_units_, fw_units_},                     // fw_recurrent_weights
+        {fw_units_},                                // fw_bias
+        {batches_, fw_units_},                      // fw_hidden_state
+        {bw_units_, input_size_},                   // bw_weights
+        {bw_units_, bw_units_},                     // bw_recurrent_weights
+        {bw_units_},                                // bw_bias
+        {batches_, bw_units_},                      // bw_hidden_state
+        {batches_, sequence_len_, aux_input_size},  // aux_input
+        {fw_units_, 0},                             // aux_fw_weights
+        {bw_units_, 0},                             // aux_bw_weights
     });
   }
 
@@ -738,6 +744,10 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -775,7 +785,8 @@ class BidirectionalRNNOpModel : public SingleOpModel {
 TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -813,7 +824,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -822,7 +834,6 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   rnn.SetFwRecurrentWeights(recurrent_weights);
   rnn.SetBwRecurrentWeights(recurrent_weights);
 
-  // const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
   // Insert the inputs in time_major format. The batch_major format is:
   // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
   // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
@@ -850,7 +861,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -888,7 +900,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -932,7 +945,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -979,7 +993,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
 TEST(BidirectionalRNNOpTest, EndToEndTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   const int output_size = 4;
   float dnn_weights[] = {
@@ -1046,6 +1061,137 @@ TEST(BidirectionalRNNOpTest, EndToEndTest) {
   }
 }
 
+// Same as BlackBox test, but has aux input.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInput) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has aux input is all zeros.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> bw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has input is all zeros, and aux input is the real
+// input. This is testing the bw path is functional.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> fw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput(2 * i * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_fw_units();
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  }
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/relu1.cc b/tensorflow/lite/kernels/ceil.cc
similarity index 55%
rename from tensorflow/lite/kernels/relu1.cc
rename to tensorflow/lite/kernels/ceil.cc
index 5a55631405b6b32a602cfe21ba863d0dc92213ea..6bb763255b136f1d5103dd2e72ce6aebf38f06d3 100644
--- a/tensorflow/lite/kernels/relu1.cc
+++ b/tensorflow/lite/kernels/ceil.cc
@@ -12,48 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/context.h"
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
-namespace custom {
-namespace relu1 {
+namespace builtin {
+namespace ceil {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TfLiteTensor* output = GetOutput(context, node, 0);
   output->type = input->type;
-  return context->ResizeTensor(context, output,
-                               TfLiteIntArrayCopy(input->dims));
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  return context->ResizeTensor(context, output, output_size);
 }
 
-// This is derived from lite/kernels/activations.cc.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  const int elements = NumElements(input);
-  const float* in = input->data.f;
-  const float* in_end = in + elements;
-  float* out = output->data.f;
-  for (; in < in_end; ++in, ++out) {
-    *out = std::min(std::max(0.f, *in), 1.f);
-  }
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  optimized_ops::Ceil(GetTensorShape(input), GetTensorData<float>(input),
+                      GetTensorShape(output), GetTensorData<float>(output));
+
   return kTfLiteOk;
 }
+}  // namespace ceil
 
-}  // namespace relu1
-
-TfLiteRegistration* Register_RELU_1() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 relu1::Prepare, relu1::Eval};
+TfLiteRegistration* Register_CEIL() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, ceil::Prepare, ceil::Eval};
   return &r;
 }
 
-}  // namespace custom
+}  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/ceil_test.cc b/tensorflow/lite/kernels/ceil_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e120105082751a732bb8812944c318ad9e5ecff5
--- /dev/null
+++ b/tensorflow/lite/kernels/ceil_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class CeilOpModel : public SingleOpModel {
+ public:
+  CeilOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_CEIL, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(CeilOpTest, SingleDim) {
+  CeilOpModel model({2}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(CeilOpTest, MultiDims) {
+  CeilOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {
+                                                 0.0001,
+                                                 8.0001,
+                                                 0.9999,
+                                                 9.9999,
+                                                 0.5,
+                                                 -0.0001,
+                                                 -8.0001,
+                                                 -0.9999,
+                                                 -9.9999,
+                                                 -0.5,
+                                             });
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 9, 1, 10, 1, 0, -8, 0, -9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
index a914449ae552e37249f2cecb5c88f3b49e83f133..e49348a5462d7efcaffad888b8714cadbfb64a10 100644
--- a/tensorflow/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -59,11 +59,12 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
 
 // TODO(ruic): optimize macros below to using template functions.
 #define TF_LITE_QUANTIZE_COMPARISON(opname)                                    \
+  template <typename input_dtype>                                              \
   void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node,         \
                              const TfLiteTensor* input1,                       \
                              const TfLiteTensor* input2, TfLiteTensor* output, \
                              bool requires_broadcast) {                        \
-    if (input1->type == kTfLiteUInt8) {                                        \
+    if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {         \
       auto input1_offset = -input1->params.zero_point;                         \
       auto input2_offset = -input2->params.zero_point;                         \
       const int left_shift = 8;                                                \
@@ -87,14 +88,16 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
       op_params.input2_shift = input2_shift;                                   \
       if (requires_broadcast) {                                                \
         reference_ops::Broadcast4DSlow##opname##WithScaling(                   \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       } else {                                                                 \
         reference_ops::opname##WithScaling(                                    \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       }                                                                        \
     }                                                                          \
   }
@@ -126,6 +129,9 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   bool requires_broadcast = !HaveSameShapes(input1, input2);
   switch (input1->type) {
+    case kTfLiteBool:
+      TF_LITE_COMPARISON(bool, Equal, requires_broadcast);
+      break;
     case kTfLiteFloat32:
       TF_LITE_COMPARISON(float, Equal, requires_broadcast);
       break;
@@ -136,13 +142,17 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedEqual(context, node, input1, input2, output,
-                         requires_broadcast);
+      EvalQuantizedEqual<uint8_t>(context, node, input1, input2, output,
+                                  requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedEqual<int8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
       break;
     default:
-      context->ReportError(context,
-                           "Does not support type %d, requires float|int|uint8",
-                           input1->type);
+      context->ReportError(
+          context, "Does not support type %d, requires bool|float|int|uint8",
+          input1->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
@@ -155,6 +165,9 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   bool requires_broadcast = !HaveSameShapes(input1, input2);
   switch (input1->type) {
+    case kTfLiteBool:
+      TF_LITE_COMPARISON(bool, NotEqual, requires_broadcast);
+      break;
     case kTfLiteFloat32:
       TF_LITE_COMPARISON(float, NotEqual, requires_broadcast);
       break;
@@ -165,13 +178,17 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedNotEqual(context, node, input1, input2, output,
-                            requires_broadcast);
+      EvalQuantizedNotEqual<uint8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedNotEqual<int8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
       break;
     default:
-      context->ReportError(context,
-                           "Does not support type %d, requires float|int|uint8",
-                           input1->type);
+      context->ReportError(
+          context, "Does not support type %d, requires bool|float|int|uint8",
+          input1->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
@@ -193,8 +210,12 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreater(context, node, input1, input2, output,
-                           requires_broadcast);
+      EvalQuantizedGreater<uint8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreater<int8_t>(context, node, input1, input2, output,
+                                   requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -221,8 +242,12 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreaterEqual(context, node, input1, input2, output,
-                                requires_broadcast);
+      EvalQuantizedGreaterEqual<uint8_t>(context, node, input1, input2, output,
+                                         requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreaterEqual<int8_t>(context, node, input1, input2, output,
+                                        requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -249,8 +274,12 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLess(context, node, input1, input2, output,
-                        requires_broadcast);
+      EvalQuantizedLess<uint8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLess<int8_t>(context, node, input1, input2, output,
+                                requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -277,8 +306,12 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLessEqual(context, node, input1, input2, output,
-                             requires_broadcast);
+      EvalQuantizedLessEqual<uint8_t>(context, node, input1, input2, output,
+                                      requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLessEqual<int8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
       break;
     default:
       context->ReportError(context,
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index ab10c959a4d6b234cb6ae0810174e8f1c48898d1..3f950a322059f6e22dd95740606098c32c0bd310 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -92,6 +92,17 @@ class ComparisonOpModel : public SingleOpModel {
   }
 };
 
+TEST(ComparisonsTest, EqualBool) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_BOOL,
+                          BuiltinOperator_EQUAL);
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<bool>(model.input2(), {true, true, false, false});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(ComparisonsTest, EqualFloat) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
                           BuiltinOperator_EQUAL);
@@ -137,6 +148,17 @@ TEST(ComparisonsTest, EqualBroadcastTwoD) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
+TEST(ComparisonsTest, NotEqualBool) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_BOOL,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<bool>(model.input2(), {true, true, false, false});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(ComparisonsTest, NotEqualFloat) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
                           BuiltinOperator_NOT_EQUAL);
@@ -363,7 +385,7 @@ TEST(ComparisonsTest, LessEqualBroadcastTwoD) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-TEST(QuantizedComparisonsTest, EqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -376,7 +398,20 @@ TEST(QuantizedComparisonsTest, EqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, false));
 }
 
-TEST(QuantizedComparisonsTest, NotEqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {-1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false));
+}
+
+TEST(QuantizedComparisonsTest, NotEqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -389,6 +424,19 @@ TEST(QuantizedComparisonsTest, NotEqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
 }
 
+TEST(QuantizedComparisonsTest, NotEqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_NOT_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
+}
+
 TEST(ComparisonsTest, GreaterQuantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
@@ -470,7 +518,7 @@ TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedUInt8NotEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -488,7 +536,25 @@ TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8NotEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_NOT_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {-20, 2, 7, -8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {2});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, true, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -506,7 +572,25 @@ TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, false, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -524,7 +608,25 @@ TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -542,7 +644,25 @@ TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8LessWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, false, false, false))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -560,6 +680,24 @@ TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
   }
 }
 
+TEST(ComparisonsTest, QuantizedInt8LessEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, true, false, false))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc
index a8dd160c8dbb42ba2c2363af55b30eb0b79f86af..76d906fa6deb0f7c1a3d67301f83ab4e02929dab 100644
--- a/tensorflow/lite/kernels/concatenation.cc
+++ b/tensorflow/lite/kernels/concatenation.cc
@@ -58,8 +58,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
   TF_LITE_ENSURE(context,
                  input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
-                     input_type == kTfLiteInt16 || input_type == kTfLiteInt32 ||
-                     input_type == kTfLiteInt64);
+                     input_type == kTfLiteInt8 || input_type == kTfLiteInt16 ||
+                     input_type == kTfLiteInt32 || input_type == kTfLiteInt64);
 
   // Output dimensions will match input dimensions, except 'axis', which
   // will be the sum of inputs
@@ -85,6 +85,18 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
 
+  if (input_type == kTfLiteInt8) {
+    // Make sure there is no re-scaling needed for Int8 quantized kernel. This
+    // is a restriction we introduced to Int8 kernels.
+    VectorOfTensors<int8_t> all_inputs(*context, *node->inputs);
+    for (int i = 0; i < node->inputs->size; ++i) {
+      TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
+      TF_LITE_ENSURE_EQ(context, t->params.scale, output->params.scale);
+      TF_LITE_ENSURE_EQ(context, t->params.zero_point,
+                        output->params.zero_point);
+    }
+  }
+
   return context->ResizeTensor(context, output, output_size);
 }
 
@@ -148,6 +160,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
+    case kTfLiteInt8: {
+      if (kernel_type == kReference) {
+        TF_LITE_CONCATENATION(reference_ops, int8_t);
+      } else {
+        TF_LITE_CONCATENATION(optimized_ops, int8_t);
+      }
+    } break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
         TF_LITE_CONCATENATION(reference_ops, int64_t);
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index 422380a03eaf9073958d4984eb2234890d555780..dab77d612dc5ab328849892fbee1115bc5324f44 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -78,13 +78,18 @@ class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
             .Union());
     BuildInterpreter(all_input_shapes);
   }
+  template <typename T>
   void SetInput(int index, std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(index, data);
+    QuantizeAndPopulate<T>(index, data);
   }
-  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 };
 
@@ -165,25 +170,47 @@ TEST(ConcatenationOpTest, FourInputs) {
               }));
 }
 
-TEST(ConcatenationOpTest, FourInputsQuantized) {
+TEST(ConcatenationOpTest, FourInputsQuantizedUint8) {
   QuantizedConcatenationOpModel m0({TensorType_UINT8, {2, 1, 2}, -12.7, 12.8},
                                    /*axis=*/2,
                                    /*num_inputs=*/4);
 
-  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
-  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
-  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
-  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.SetInput<uint8_t>(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput<uint8_t>(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput<uint8_t>(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput<uint8_t>(3, {1.3f, 3.3f, 4.3f, 7.3f});
   m0.Invoke();
-  EXPECT_THAT(m0.GetDequantizedOutput(),
+  EXPECT_THAT(m0.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({
                   1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
                   4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
               })));
-  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
-                                  137, 157, 138, 158, 139, 159, 140, 160,  //
-                                  167, 197, 168, 198, 169, 199, 170, 200,  //
-                              }));
+  EXPECT_THAT(m0.GetOutput<uint8_t>(),
+              ElementsAreArray({
+                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                  167, 197, 168, 198, 169, 199, 170, 200,  //
+              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantizedInt8) {
+  QuantizedConcatenationOpModel m0({TensorType_INT8, {2, 1, 2}, -12.7, 12.8},
+                                   /*axis=*/2,
+                                   /*num_inputs=*/4);
+
+  m0.SetInput<int8_t>(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput<int8_t>(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput<int8_t>(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput<int8_t>(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  1, 3, 1.1, 3.1, 1.2, 3.2, 1.3, 3.3,  //
+                  4, 7, 4.1, 7.1, 4.2, 7.2, 4.3, 7.3   //
+              })));
+  EXPECT_THAT(m0.GetOutput<int8_t>(), ElementsAreArray({
+                                          9, 29, 10, 30, 11, 31, 12, 32,   //
+                                          39, 69, 40, 70, 41, 71, 42, 72,  //
+                                      }));
 }
 
 TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
@@ -194,20 +221,21 @@ TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
                                    /*axis=*/2, /*num_inputs=*/4,
                                    {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
 
-  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
-  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
-  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
-  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.SetInput<uint8_t>(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput<uint8_t>(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput<uint8_t>(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput<uint8_t>(3, {1.3f, 3.3f, 4.3f, 7.3f});
   m0.Invoke();
-  EXPECT_THAT(m0.GetDequantizedOutput(),
+  EXPECT_THAT(m0.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({
                   1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
                   4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
               })));
-  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
-                                  137, 157, 138, 158, 139, 159, 140, 160,  //
-                                  167, 197, 168, 198, 169, 199, 170, 200,  //
-                              }));
+  EXPECT_THAT(m0.GetOutput<uint8_t>(),
+              ElementsAreArray({
+                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                  167, 197, 168, 198, 169, 199, 170, 200,  //
+              }));
 }
 
 TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
@@ -218,22 +246,23 @@ TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
                                    /*axis=*/2, /*num_inputs=*/4,
                                    {TensorType_UINT8, {2, 1, 2}, -1., 1.});
 
-  m0.SetInput(0, {1.0f, -3.0f, -4.0f, -7.0f});
-  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
-  m0.SetInput(2, {1.2f, -3.2f, -4.2f, 7.2f});
-  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.SetInput<uint8_t>(0, {1.0f, -3.0f, -4.0f, -7.0f});
+  m0.SetInput<uint8_t>(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput<uint8_t>(2, {1.2f, -3.2f, -4.2f, 7.2f});
+  m0.SetInput<uint8_t>(3, {1.3f, 3.3f, 4.3f, 7.3f});
   m0.Invoke();
-  EXPECT_THAT(m0.GetDequantizedOutput(),
+  EXPECT_THAT(m0.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f,   //
                       -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,  //
                   },
                   4e-3)));
-  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
-                                  255, 0, 255, 255, 255, 0, 255, 255,  //
-                                  0, 0, 255, 255, 0, 255, 255, 255,    //
-                              }));
+  EXPECT_THAT(m0.GetOutput<uint8_t>(),
+              ElementsAreArray({
+                  255, 0, 255, 255, 255, 0, 255, 255,  //
+                  0, 0, 255, 255, 0, 255, 255, 255,    //
+              }));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index 05368aa5edda8073d3ee5dbf0352468c7f4f6133..75e75fae6b09e6f3653719e3cf69c8cc1b4956f6 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -68,6 +69,11 @@ struct OpData {
   // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
+
+  // Per channel output multiplier and shift.
+  std::vector<int32_t> per_channel_output_multiplier;
+  std::vector<int> per_channel_output_shift;
+
   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
@@ -226,8 +232,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Check types. (We assume that UINT8 refers to quantized tensors)
   TfLiteType input_type = input->type;
-  TF_LITE_ENSURE(context,
-                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
+                              input_type == kTfLiteUInt8 ||
+                              input_type == kTfLiteInt8);
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
 
   TfLiteTensor* bias = nullptr;
@@ -238,7 +245,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   if (has_bias) {
     bias = &context->tensors[node->inputs->data[2]];
-    if (input_type == kTfLiteUInt8) {
+    if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
       TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
     } else {
@@ -296,18 +303,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, has_bias);
 
   // Note that full fixed-point inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
+  // parameters set. This is usually done during quantized training or
+  // calibration.
   if (input_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    const int number_channel = affine_quantization->scale->size;
+    data->per_channel_output_multiplier.resize(number_channel);
+    data->per_channel_output_shift.resize(number_channel);
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params->activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data()));
   }
 
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
@@ -481,6 +495,29 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
 }
 
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteConvParams* params, OpData* data,
+                             TfLiteTensor* input, TfLiteTensor* filter,
+                             TfLiteTensor* bias, TfLiteTensor* output) {
+  ConvParams op_params;
+  op_params.input_offset = input->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+
+  reference_integer_ops::ConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int8>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<int32>(bias), GetTensorShape(output),
+      GetTensorData<int8>(output));
+}
+
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
@@ -665,6 +702,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                  bias, im2col, hwcn_weights, output);
       break;
+    case kTfLiteInt8:
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
+                              output);
+      break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index d0350b2fa7f7bad804d4b1348f4d389cb102f68e..7c562a530abf53a174e7f55053d01e6ae2fa212f 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -58,9 +58,35 @@ class BaseConvolutionOpModel : public SingleOpModel {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
-      auto bias_scale = GetScale(input_) * GetScale(filter_);
-      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
+      if (filter.per_channel_quantization) {
+        // per channel quantization.
+        std::vector<float> bias_scale(
+            filter.per_channel_quantization_scales.size());
+        std::vector<int64_t> bias_zero_points(
+            filter.per_channel_quantization_scales.size());
+        for (int i = 0; i < filter.per_channel_quantization_scales.size();
+             ++i) {
+          bias_scale[i] =
+              input.scale * filter.per_channel_quantization_scales[i];
+          bias_zero_points[i] = 0;
+        }
+        TensorData bias{TensorType_INT32,
+                        {bias_size},
+                        /*min=*/0,
+                        /*max=*/0,
+                        /*scale=*/0,
+                        /*zero_point=*/0,
+                        true,
+                        /*per_channel_scale=*/bias_scale,
+                        /*per_channel_zero_point=*/bias_zero_points,
+                        /*channel_index==*/0};
+        bias_ = AddInput(bias);
+      } else {
+        // per tensor quantization.
+        auto bias_scale = GetScale(input_) * GetScale(filter_);
+        TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+        bias_ = AddInput(bias);
+      }
     }
 
     output_ = AddOutput(output);
@@ -338,6 +364,18 @@ TEST_P(ConvolutionOpTest, HandCalculatedFloat32) {
   // |  187  |  234  |  261  |  121  |
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357,
                                                178, 187, 234, 261, 121}));
+
+  // Add an additional test for the multi-threaded case, ensuring stability
+  // under different thread counts.
+  if (GetParam() == "MultithreadedOptimized") {
+    for (int i = 1; i < 4; ++i) {
+      m.SetNumThreads(i);
+      m.Invoke();
+      EXPECT_THAT(m.GetOutput(),
+                  ElementsAreArray({105, 150, 183, 95, 235, 312, 357, 178, 187,
+                                    234, 261, 121}));
+    }
+  }
 }
 
 TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
@@ -1069,6 +1107,76 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) {
                   0.0474)));
 }
 
+class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+  using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    PerChannelSymmetricQuantizeAndPopulate(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    PerChannelQuantizeBias(bias_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
+TEST_P(ConvolutionOpTest, SimpleTest) {
+  PerChannelQuantizedConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel]
+       {2, 2, 2, 2},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel=*/true,
+       /*per_channel_scales=*/{1, 2},
+       /*per_channel_zeros=*/{0, 0},
+       /*channel_index=*/0},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1},
+      /*stride_width=*/1, /*stride_height=*/1);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel]
+      {
+          1, 2,  // out channel = 0, y = 0, x = 0
+          3, 4,  // out channel = 0, y = 0, x = 1
+          3, 4,  // out channel = 0, y = 1, x = 0
+          5, 6,  // out channel = 0, y = 1, x = 1
+          7, 8,  // out channel = 1, y = 0, x = 0
+          5, 6,  // out channel = 1, y = 0, x = 1
+          3, 4,  // out channel = 1, y = 1, x = 0
+          1, 2,  // out channel = 1, y = 1, x = 1
+      });
+  m.SetBias({3, -2});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({28.5, 64, -59.5, -46})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({56, 127, -120, -93}));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     ConvolutionOpTest, ConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc
index 3f4ae5087b267a62d4d4237a8f5f534ff346a493..a349b2790531a674be1faa40d928677a9144e265 100644
--- a/tensorflow/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/kernels/depthwise_conv.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -58,6 +59,10 @@ struct OpData {
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
   int32_t output_activation_max;
+
+  // Per channel output multiplier and shift.
+  std::vector<int32_t> per_channel_output_multiplier;
+  std::vector<int> per_channel_output_shift;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -99,14 +104,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     SizeOfDimension(filter, 3));
 
   const TfLiteType data_type = input->type;
-  TF_LITE_ENSURE(context,
-                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, data_type == kTfLiteFloat32 ||
+                              data_type == kTfLiteUInt8 ||
+                              data_type == kTfLiteInt8);
   TF_LITE_ENSURE_EQ(context, output->type, data_type);
   TF_LITE_ENSURE_EQ(context, filter->type, data_type);
 
   if (hasBias) {
     bias = GetInput(context, node, kBiasTensor);
-    if (data_type == kTfLiteUInt8) {
+    if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) {
       TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
     } else {
@@ -150,17 +156,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                      filter_width, out_width);
 
   // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
+  // parameters set. This is usually done during quantized training or
+  // calibration.
   if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    const int number_channel = affine_quantization->scale->size;
+    data->per_channel_output_multiplier.resize(number_channel);
+    data->per_channel_output_shift.resize(number_channel);
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params->activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data()));
   }
 
   TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
@@ -250,6 +264,33 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                  GetTensorData<uint8_t>(output));
 }
 
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, OpData* data,
+                             const TfLiteTensor* input,
+                             const TfLiteTensor* filter,
+                             const TfLiteTensor* bias, TfLiteTensor* output) {
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.input_offset = input->params.zero_point;
+  op_params.weights_offset = 0;
+  op_params.output_offset = output->params.zero_point;
+
+  reference_integer_ops::DepthwiseConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int8>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<int32>(bias), GetTensorShape(output),
+      GetTensorData<int8>(output));
+}
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
@@ -273,6 +314,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                  bias, output);
       break;
+    case kTfLiteInt8: {
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
+                              output);
+      break;
+    }
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
index 75aed4cc4a96e76f35499d3c26cf0fc25f463160..5dc513262b2d71aaab458a815aacba473d788859 100644
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -56,9 +56,35 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
-      auto bias_scale = GetScale(input_) * GetScale(filter_);
-      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
+      if (filter.per_channel_quantization) {
+        // per channel quantization.
+        std::vector<float> bias_scale(
+            filter.per_channel_quantization_scales.size());
+        std::vector<int64_t> bias_zero_points(
+            filter.per_channel_quantization_scales.size());
+        for (int i = 0; i < filter.per_channel_quantization_scales.size();
+             ++i) {
+          bias_scale[i] =
+              input.scale * filter.per_channel_quantization_scales[i];
+          bias_zero_points[i] = 0;
+        }
+        TensorData bias{TensorType_INT32,
+                        {bias_size},
+                        /*min=*/0,
+                        /*max=*/0,
+                        /*scale=*/0,
+                        /*zero_point=*/0,
+                        true,
+                        /*per_channel_scale=*/bias_scale,
+                        /*per_channel_zero_point=*/bias_zero_points,
+                        /*channel_index==*/0};
+        bias_ = AddInput(bias);
+      } else {
+        // per tensor quantization.
+        auto bias_scale = GetScale(input_) * GetScale(filter_);
+        TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+        bias_ = AddInput(bias);
+      }
     }
 
     output_ = AddOutput(output);
@@ -437,6 +463,76 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
+class PerChannelQuantizedDepthwiseConvolutionOpModel
+    : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    PerChannelSymmetricQuantizeAndPopulate(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    PerChannelQuantizeBias(bias_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel=*/true,
+       /*per_channel_scales=*/{1, 2, 3, 4},
+       /*per_channel_zeros=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40.5, 48, 27, 40, 0.5, -4, -24, -36})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({80, 95, 53, 79, 0, -9, -49, -73}));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index be7caa31892a9dbb41eef2f88479c9f0051e2339..77254335fbde0ff4246af00291ccfba9ec8b0acf 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -30,13 +30,7 @@ class DequantizeOpModel : public SingleOpModel {
  public:
   DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
                     float scale, int32_t zero_point) {
-    TensorData input_tensor_data;
-    input_tensor_data.type = type;
-    input_tensor_data.shape = shape;
-    input_tensor_data.min = 0;
-    input_tensor_data.max = 0;
-    input_tensor_data.scale = scale;
-    input_tensor_data.zero_point = zero_point;
+    const TensorData input_tensor_data = {type, shape, 0, 0, scale, zero_point};
     input_ = AddInput(input_tensor_data);
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
index e2a2c4aac9456dfae2e26d75d903c300e382b1d0..6543cc7162d66eca2cd6da6a120f4feab19531a3 100644
--- a/tensorflow/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -24,8 +24,12 @@ namespace tflite {
 namespace eigen_support {
 namespace {
 
+// For legacy reasons, we use 4 threads by default unless the thread count is
+// explicitly specified by the context.
+const int kDefaultNumThreadpoolThreads = 4;
+
 #ifndef EIGEN_DONT_ALIGN
-// Eigen may require buffers to be algiend to 16, 32 or 64 bytes depending on
+// Eigen may require buffers to be aligned to 16, 32 or 64 bytes depending on
 // hardware architecture and build configurations.
 // If the static assertion fails, try to increase `kDefaultTensorAlignment` to
 // in `arena_planner.h` to 32 or 64.
@@ -63,9 +67,45 @@ class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
   std::unique_ptr<Eigen::ThreadPool> pool_;
 };
 
+// Utility class for lazily creating an Eigen thread pool/device only when used.
+class LazyEigenThreadPoolHolder {
+ public:
+  explicit LazyEigenThreadPoolHolder(int num_threads) {
+    SetNumThreads(num_threads);
+  }
+
+  // Gets the ThreadPoolDevice, creating if necessary.
+  const Eigen::ThreadPoolDevice* GetThreadPoolDevice() {
+    if (!device_) {
+      thread_pool_wrapper_.reset(new EigenThreadPoolWrapper(
+          new Eigen::ThreadPool(target_num_threads_)));
+      device_.reset(new Eigen::ThreadPoolDevice(thread_pool_wrapper_.get(),
+                                                target_num_threads_));
+    }
+    return device_.get();
+  }
+
+  // Updates the thread count, invalidating the ThreadPoolDevice if necessary.
+  void SetNumThreads(int num_threads) {
+    const int target_num_threads =
+        num_threads != -1 ? num_threads : kDefaultNumThreadpoolThreads;
+    if (target_num_threads_ != target_num_threads) {
+      target_num_threads_ = target_num_threads;
+      // As the device references the thread pool wrapper, destroy it first.
+      device_.reset();
+      thread_pool_wrapper_.reset();
+    }
+  }
+
+ private:
+  int target_num_threads_ = kDefaultNumThreadpoolThreads;
+  // Both device_ and thread_pool_wrapper_ are lazily created.
+  std::unique_ptr<Eigen::ThreadPoolDevice> device_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> thread_pool_wrapper_;
+};
+
 struct RefCountedEigenContext : public TfLiteExternalContext {
-  std::unique_ptr<Eigen::ThreadPoolInterface> thread_pool_wrapper;
-  std::unique_ptr<Eigen::ThreadPoolDevice> device;
+  std::unique_ptr<LazyEigenThreadPoolHolder> thread_pool_holder;
   int num_references = 0;
 };
 
@@ -74,24 +114,12 @@ RefCountedEigenContext* GetEigenContext(TfLiteContext* context) {
       context->GetExternalContext(context, kTfLiteEigenContext));
 }
 
-void InitDevice(TfLiteContext* context, RefCountedEigenContext* ptr) {
-  int num_threads = 4;
-  if (context->recommended_num_threads != -1) {
-    num_threads = context->recommended_num_threads;
-  }
-  ptr->device.reset();  // destroy before we invalidate the thread pool
-  ptr->thread_pool_wrapper.reset(
-      new EigenThreadPoolWrapper(new Eigen::ThreadPool(num_threads)));
-  ptr->device.reset(
-      new Eigen::ThreadPoolDevice(ptr->thread_pool_wrapper.get(), num_threads));
-}
-
 TfLiteStatus Refresh(TfLiteContext* context) {
   SetEigenNbThreads(context->recommended_num_threads);
 
   auto* ptr = GetEigenContext(context);
   if (ptr != nullptr) {
-    InitDevice(context, ptr);
+    ptr->thread_pool_holder->SetNumThreads(context->recommended_num_threads);
   }
 
   return kTfLiteOk;
@@ -108,8 +136,9 @@ void IncrementUsageCounter(TfLiteContext* context) {
     ptr = new RefCountedEigenContext;
     ptr->type = kTfLiteEigenContext;
     ptr->Refresh = Refresh;
+    ptr->thread_pool_holder.reset(
+        new LazyEigenThreadPoolHolder(context->recommended_num_threads));
     ptr->num_references = 0;
-    InitDevice(context, ptr);
     context->SetExternalContext(context, kTfLiteEigenContext, ptr);
   }
   ptr->num_references++;
@@ -134,7 +163,7 @@ const Eigen::ThreadPoolDevice* GetThreadPoolDevice(TfLiteContext* context) {
     TF_LITE_FATAL(
         "Call to GetFromContext() not preceded by IncrementUsageCounter()");
   }
-  return ptr->device.get();
+  return ptr->thread_pool_holder->GetThreadPoolDevice();
 }
 
 }  // namespace eigen_support
diff --git a/tensorflow/lite/kernels/eigen_support.h b/tensorflow/lite/kernels/eigen_support.h
index c24ae6896a7e9783ddd32bc510881ccc1a5d27bf..7e052e1f83cc9ddabac84d318d1639f478b6fb01 100644
--- a/tensorflow/lite/kernels/eigen_support.h
+++ b/tensorflow/lite/kernels/eigen_support.h
@@ -32,6 +32,11 @@ void IncrementUsageCounter(TfLiteContext* context);
 // usages all temporary Eigen objects will be deleted.
 void DecrementUsageCounter(TfLiteContext* context);
 
+// Fetch the ThreadPoolDevice associated with the provided context.
+//
+// Note: The caller must ensure that |IncrementUsageCounter()| has already been
+// called. Moreover, it is *not* safe to cache the returned device; it may be
+// invalidated if the context thread count changes.
 const EigenForTFLite::ThreadPoolDevice* GetThreadPoolDevice(
     TfLiteContext* context);
 
diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc
index a79388b900eb89b56a4d18f887dbe52e84fb123f..1cc188ae5f7bfe91bee48c60b692d9dca2b7cf0e 100644
--- a/tensorflow/lite/kernels/elementwise.cc
+++ b/tensorflow/lite/kernels/elementwise.cc
@@ -83,6 +83,10 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::sin);
 }
 
+TfLiteStatus CosEval(TfLiteContext* context, TfLiteNode* node) {
+  return EvalNumeric(context, node, std::cos);
+}
+
 TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::log);
 }
@@ -122,6 +126,14 @@ TfLiteRegistration* Register_SIN() {
   return &r;
 }
 
+TfLiteRegistration* Register_COS() {
+  static TfLiteRegistration r = {
+      /*init=*/nullptr, /*free=*/nullptr,
+      elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
+      elementwise::CosEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_LOG() {
   static TfLiteRegistration r = {
       /*init=*/nullptr, /*free=*/nullptr,
diff --git a/tensorflow/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc
index 7d24320081257925508b2aa53503c1cf71d0e913..89f2a506f0cc00df021d8b5113174833df7e33cb 100644
--- a/tensorflow/lite/kernels/elementwise_test.cc
+++ b/tensorflow/lite/kernels/elementwise_test.cc
@@ -65,6 +65,15 @@ TEST(ElementWise, Sin) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(ElementWise, Cos) {
+  ElementWiseOpFloatModel m(BuiltinOperator_COS, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({1, -1, -1, 0.54030})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
 TEST(ElementWise, Log) {
   ElementWiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1});
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index dfc9550ed600ac08407c4f07b6ad7d0be26bfe10..55cde983abccdd4c20c9c1cd24d44883d5b883cd 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -40,9 +41,8 @@ namespace fully_connected {
 // This file has four implementations of FullyConnected
 enum KernelType {
   kReference,
-  kGenericOptimized,  // Neon-free
-  kNeonOptimized,
-  kPie,  // Used by the PIE team
+  kGenericOptimized,
+  kLegacyPie,  // Legacy path used by the PIE team and related clients.
 };
 
 struct OpData {
@@ -212,7 +212,9 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE(context,
                  filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  if (bias) {
+    TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  }
   TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
 
   int total_input_size = 1;
@@ -286,6 +288,27 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
     macro_name(target_namespace, kRelu6);                            \
   }
 
+namespace {
+void FullyConnectedInt8(const OpData* data, const TfLiteTensor* input,
+                        const TfLiteTensor* filter, const TfLiteTensor* bias,
+                        TfLiteTensor* output,
+                        gemmlowp::GemmContext* gemm_context) {
+  FullyConnectedParams op_params;
+  op_params.input_offset = -input->params.zero_point;
+  op_params.weights_offset = -filter->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  reference_integer_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(filter), GetTensorData<int8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<int8_t>(output), gemm_context);
+}
+}  // namespace
+
 template <KernelType kernel_type>
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteFullyConnectedParams* params, OpData* data,
@@ -314,11 +337,20 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
         GetTensorShape(output), GetTensorData<output_data_type>(output), \
         gemm_context);                                                   \
   }
-  if (kernel_type == kReference) {
+  // Only the Pie path supports quantized models and float inputs/outputs.
+  if (input->type == kTfLiteFloat32) {
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/1);
+    return EvalHybrid(context, node, params, data, input, filter, bias,
+                      input_quantized, scaling_factors, output);
+  } else if (kernel_type == kReference) {
     switch (output->type) {
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
         break;
@@ -328,17 +360,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
             "Quantized FullyConnected expects output data type uint8 or int16");
         return kTfLiteError;
     }
-  } else if (kernel_type == kPie && input->type == kTfLiteFloat32) {
-    // Pie currently only supports quantized models and float inputs/outputs.
-    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/1);
-    return EvalHybrid(context, node, params, data, input, filter, bias,
-                      input_quantized, scaling_factors, output);
   } else {
     switch (output->type) {
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
         break;
@@ -418,7 +447,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
   }
   if (kernel_type == kReference) {
     TF_LITE_FULLY_CONNECTED(reference_ops);
-  } else if (kernel_type == kPie) {
+  } else if (kernel_type == kLegacyPie) {
     return EvalPie(context, node, params, data, input, filter, bias, output);
   } else {
     TF_LITE_FULLY_CONNECTED(optimized_ops);
@@ -488,13 +517,6 @@ TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
   return &r;
 }
 
-TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
-  static TfLiteRegistration r = {
-      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
-      fully_connected::Eval<fully_connected::kNeonOptimized>};
-  return &r;
-}
-
 TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
   static TfLiteRegistration r = {
       fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
@@ -502,24 +524,16 @@ TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
   return &r;
 }
 
+// Legacy path for PIE clients.
 TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
-  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
-                                 fully_connected::Prepare,
-                                 fully_connected::Eval<fully_connected::kPie>};
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kLegacyPie>};
   return &r;
 }
 
 TfLiteRegistration* Register_FULLY_CONNECTED() {
-  // TODO(ahentz): We don't have a dedicated quantized version of the PIE
-  // kernel. For now, the quantized version just defer to the corresponding
-  // optimized MINI kernel. At some point we will allow different libraries to
-  // be built with different kernels, but for now we have to pick one here.
-  return Register_FULLY_CONNECTED_PIE();
-#ifdef USE_NEON
-  return Register_FULLY_CONNECTED_NEON_OPT();
-#else
   return Register_FULLY_CONNECTED_GENERIC_OPT();
-#endif
 }
 
 }  // namespace builtin
diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc
index 03f4ea71430f5d578288d913e8ba1d0222467882..7a69b999f1f2207dad865616f33f07e71da8b54f 100644
--- a/tensorflow/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/kernels/fully_connected_test.cc
@@ -33,7 +33,6 @@ namespace ops {
 namespace builtin {
 
 TfLiteRegistration* Register_FULLY_CONNECTED_REF();
-TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
 TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
 TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
 
@@ -137,6 +136,7 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
   BaseFullyConnectedOpModel(
       TfLiteRegistration* registration, int units, int batches,
       const TensorData& input, const TensorData& output = {TensorType_FLOAT32},
+      bool bias_tensor_optional = false,
       ActivationFunctionType activation_func = ActivationFunctionType_RELU,
       FullyConnectedOptionsWeightsFormat weights_format =
           FullyConnectedOptionsWeightsFormat_DEFAULT)
@@ -151,7 +151,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
     weights_ =
         AddInput({input.type, {units_, input_size_}, input.min, input.max});
 
-    if (input.type == TensorType_FLOAT32) {
+    if (bias_tensor_optional) {
+      bias_ = AddNullInput();
+    } else if (input.type == TensorType_FLOAT32) {
       bias_ = AddInput({TensorType_FLOAT32, {units_}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
@@ -173,7 +175,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
             .Union());
     resolver_ = absl::make_unique<SingleOpResolver>(
         BuiltinOperator_FULLY_CONNECTED, registration);
-    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+    BuildInterpreter(
+        {GetShape(input_), GetShape(weights_),
+         (bias_ == kOptionalTensor) ? std::vector<int>() : GetShape(bias_)});
   }
 
   int input_size() { return input_size_; }
@@ -216,9 +220,12 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   void SetBias(const std::vector<float>& data) {
     QuantizeAndPopulate<int32_t>(bias_, data);
   }
+  template <typename T>
   void SetWeights(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(weights_, data);
+    QuantizeAndPopulate<T>(weights_, data);
   }
+
+  template <typename T>
   void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
                             int output_depth) {
     std::vector<float> shuffled_data(data.size());
@@ -237,15 +244,17 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
     }
     TfLiteTensor* t = interpreter_->tensor(weights_);
     auto quantized_data =
-        Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point);
-    for (uint8_t& q : quantized_data) {
+        Quantize<T>(shuffled_data, t->params.scale, t->params.zero_point);
+    for (T& q : quantized_data) {
       q ^= 0x80;
     }
     PopulateTensor(weights_, 0, quantized_data.data(),
                    quantized_data.data() + quantized_data.size());
   }
+
+  template <typename T>
   void SetInput(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   template <typename T>
@@ -320,7 +329,6 @@ class HybridFullyConnectedOpModel : public SingleOpModel {
 
 const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
     {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
-    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
     {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
     {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
 });
@@ -334,7 +342,6 @@ class FloatFullyConnectedOpTest : public SingleOpTest {
 
 const auto kKernelMapNoPie = new std::map<string, TfLiteRegistration*>({
     {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
-    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
     {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
 });
 
@@ -345,15 +352,18 @@ class QuantizedFullyConnectedOpTest : public SingleOpTest {
   }
 };
 
-const auto kKernelMapPie = new std::map<string, TfLiteRegistration*>({
+const auto kKernelMapHybrid = new std::map<string, TfLiteRegistration*>({
     {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+    // Only Pie supports the hybrid path, so the optimized kernel should fall
+    // back to the Pie path in such cases.
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
 });
 
 // Hybrid mode is used by the Pie quantized kernel.
 class HybridFullyConnectedOpTest : public SingleOpTest {
  protected:
   const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMapPie;
+    return *kKernelMapHybrid;
   }
 };
 
@@ -397,21 +407,42 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest2) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
+TEST(FloatFullyConnectedOpTest, SimpleTestNoBias) {
+  // The optimized kernel assumes that the bias is specified.
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/1, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 2}},
+                               /*output=*/{TensorType_FLOAT32},
+                               /*bias_tensor_optional=*/true);
+  m.SetWeights({
+      2, 4,  // u = 0
+  });
+
+  m.SetInput({
+      1, 2,  // b = 0
+      2, 1,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(10, 8));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -427,22 +458,48 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
               ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8) {
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60})));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(23, 24, 25, 57, 58, 59));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -458,6 +515,36 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(175, 177, 179, 243, 245, 247));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Int8) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -127, 128},
+      /*output=*/{TensorType_INT8, {}, -63.5, 64});
+
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(47, 49, 51, 115, 117, 119));
+}
+
 void SimpleTestQuantizedInt16OutputCase(
     TfLiteRegistration* registration, int input_depth, int output_depth,
     int batches, FullyConnectedOptionsWeightsFormat weights_format) {
@@ -477,6 +564,7 @@ void SimpleTestQuantizedInt16OutputCase(
       /*input=*/
       {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax},
       /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax},
+      /*bias_tensor_optional=*/false,
       /*activation_func=*/ActivationFunctionType_NONE, weights_format);
 
   std::mt19937 random_engine;
@@ -492,7 +580,7 @@ void SimpleTestQuantizedInt16OutputCase(
   // and set the (possibly shuffled) weights.
   switch (weights_format) {
     case FullyConnectedOptionsWeightsFormat_DEFAULT:
-      m.SetWeights(weights_data);
+      m.SetWeights<uint8_t>(weights_data);
       break;
     case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8:
       // The shuffled path currently supports only a restrictive subset of
@@ -500,7 +588,7 @@ void SimpleTestQuantizedInt16OutputCase(
       CHECK_EQ(input_depth % 16, 0);
       CHECK_EQ(output_depth % 4, 0);
       CHECK(batches == 1 || batches == 4);
-      m.ShuffleAndSetWeights(weights_data, input_depth, output_depth);
+      m.ShuffleAndSetWeights<uint8_t>(weights_data, input_depth, output_depth);
       break;
     default:
       LOG(FATAL) << "Unhandled weights format";
@@ -522,7 +610,7 @@ void SimpleTestQuantizedInt16OutputCase(
   }
 
   m.SetBias(bias_data);
-  m.SetInput(input_data);
+  m.SetInput<uint8_t>(input_data);
 
   m.Invoke();
 
@@ -664,21 +752,21 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -695,21 +783,21 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
 }
 
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index f205daae1343cb0abecc95e7d1b280c10f55d897..54d05adbcf161a2af88bea4a0de1eec06e70c09a 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -57,6 +57,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt64:
     case kTfLiteInt32:
       break;
@@ -135,6 +136,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int32_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt64:
@@ -153,6 +156,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int64_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt64:
diff --git a/tensorflow/lite/kernels/gather_nd.cc b/tensorflow/lite/kernels/gather_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20e98652ee57ec7b6b86a20cbc474b4b9c29b2aa
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd.cc
@@ -0,0 +1,154 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather_nd {
+constexpr int kParams = 0;
+constexpr int kIndices = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (params->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Params of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+  switch (indices->type) {
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+
+  const int params_rank = NumDimensions(params);
+  const int indices_rank = NumDimensions(indices);
+  const int indices_nd = SizeOfDimension(indices, indices_rank - 1);
+  if (params_rank < 1) {
+    context->ReportError(context, "Params must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_rank < 1) {
+    context->ReportError(context, "Indices must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_nd > params_rank) {
+    context->ReportError(
+        context, "Index innermost dimension length must be <= params rank.");
+    return kTfLiteError;
+  }
+
+  // Assign to output the input type.
+  output->type = params->type;
+
+  // The result shape is
+  // indices.shape[:-1] + params.shape[indices.shape[-1]:]
+  const int output_rank = indices_rank + params_rank - indices_nd - 1;
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(output_rank);
+  int output_index = 0;
+  for (int i = 0; i < indices_rank - 1; ++i) {
+    output_shape->data[output_index++] = indices->dims->data[i];
+  }
+  for (int i = indices_nd; i < params_rank; ++i) {
+    output_shape->data[output_index++] = params->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename ParamsT, typename IndicesT>
+TfLiteStatus GatherNd(const TfLiteTensor* params, const TfLiteTensor* indices,
+                      TfLiteTensor* output) {
+  reference_ops::GatherNd(
+      GetTensorShape(params), GetTensorData<ParamsT>(params),
+      GetTensorShape(indices), GetTensorData<IndicesT>(indices),
+      GetTensorShape(output), GetTensorData<ParamsT>(output));
+  return kTfLiteOk;
+}
+
+template <typename IndicesT>
+TfLiteStatus EvalGatherNd(TfLiteContext* context, const TfLiteTensor* params,
+                          const TfLiteTensor* indices, TfLiteTensor* output) {
+  switch (params->type) {
+    case kTfLiteFloat32:
+      return GatherNd<float, IndicesT>(params, indices, output);
+    case kTfLiteUInt8:
+      return GatherNd<uint8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt8:
+      return GatherNd<int8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt32:
+      return GatherNd<int32_t, IndicesT>(params, indices, output);
+    case kTfLiteInt64:
+      return GatherNd<int64_t, IndicesT>(params, indices, output);
+    default:
+      context->ReportError(context,
+                           "Params type '%s' are not supported by gather_nd.",
+                           TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (indices->type) {
+    case kTfLiteInt32:
+      return EvalGatherNd<int32_t>(context, params, indices, output);
+    case kTfLiteInt64:
+      return EvalGatherNd<int64_t>(context, params, indices, output);
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+}
+}  // namespace gather_nd
+
+TfLiteRegistration* Register_GATHER_ND() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 gather_nd::Prepare, gather_nd::Eval};
+  return &r;
+}
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5e93efb8ff468f9e1cd6d2cd8c4343c0fe62e79
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -0,0 +1,323 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherNdOpModel : public SingleOpModel {
+ public:
+  GatherNdOpModel(const TensorData& params, const TensorData& indices) {
+    params_ = AddInput(params);
+    indices_ = AddInput(indices);
+    output_ = AddOutput(params.type);
+    SetBuiltinOp(BuiltinOperator_GATHER_ND, BuiltinOptions_GatherNdOptions,
+                 CreateGatherNdOptions(builder_).Union());
+    BuildInterpreter({GetShape(params_), GetShape(indices_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(params_, data);
+  }
+
+  template <typename T>
+  void SetPositions(std::initializer_list<T> data) {
+    PopulateTensor<T>(indices_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int params_;
+  int indices_;
+  int output_;
+};
+
+TEST(GatherNdOpTest, ElementIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 0, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 1.1}));
+}
+
+TEST(GatherNdOpTest, ElementIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {1, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({1.1, -1.2, 1.3, -2.1, 2.2, 2.3, 5.1, -5.2, 5.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({3.1, 3.2, -3.3, -4.1, -4.2, 4.3, 1.1, -1.2, 1.3,
+                                -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor3) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0, 0, 0, 2, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3, 1.1, -1.2, 1.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor4) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, 3.2, 4.3, 6.3}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 0, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int32) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int64) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT64, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Int32Int32) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int32Int64) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int32) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int64) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int64_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Int8Int32) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int8Int64) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int64Int32) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+TEST(GatherNdOpTest, Int64Int64) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 7b5f84348903a3cc436f1bd6cf32b3175b2f5815..8fc6bd173da831d63ee9eea364b7f352ea679af3 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -96,6 +96,15 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) {
   EXPECT_TRUE(m.GetOutputShape().empty());
 }
 
+TEST(GatherOpTest, Test1DInput1DIndex) {
+  GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {1}});
+  m.SetInput<float>({1.0, 3.0, 5.0});
+  m.SetPositions<int32_t>({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3.0})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+}
+
 TEST(GatherOpTest, Test2DIndexWith2DResult) {
   GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {1, 2}});
   m.SetInput<float>({1.0, 2.0, 3.0});
@@ -137,6 +146,29 @@ TEST(FloatGatherOpTest, Axis1) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3}));
 }
 
+TEST(FloatGatherOpTest, Axis10DIndex) {
+  const int axis = 1;
+  GatherOpModel m({TensorType_FLOAT32, {1, 3, 2}}, {TensorType_INT32, {}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3, 4})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+}
+
+TEST(FloatGatherOpTest, Axis1Slice) {
+  const int axis = 1;
+  GatherOpModel m({TensorType_FLOAT32, {1, 4, 2}}, {TensorType_INT32, {2}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8});
+  m.SetPositions<int32_t>({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({7, 8, 3, 4})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2}));
+}
+
 TEST(FloatGatherOpTest, LastAxis) {
   const int axis = -1;
   GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {2}},
@@ -149,6 +181,17 @@ TEST(FloatGatherOpTest, LastAxis) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2}));
 }
 
+TEST(FloatGatherOpTest, LastAxis0DIndex) {
+  const int axis = -1;
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3, 6})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+}
+
 TEST(TypesGatherOpTest, Float32Int32) {
   GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2}});
   m.SetInput<float>({13.3, -13.4, -1.4, 1.5});
@@ -205,6 +248,24 @@ TEST(TypesGatherOpTest, Uint8Int64) {
   EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
 }
 
+TEST(TypesGatherOpTest, Int8Int32) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
+TEST(TypesGatherOpTest, Int8Int64) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
 TEST(TypesGatherOpTest, Int64Int32) {
   GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT32, {2}});
   m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1bd394e980073b73674ca972d28fafe04f7b8adf
--- /dev/null
+++ b/tensorflow/lite/kernels/if.cc
@@ -0,0 +1,200 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace if_kernel {
+
+struct OpData {
+  int then_subgraph_index;
+  int else_subgraph_index;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  op_data->then_subgraph_index = m["then_subgraph_index"].AsInt32();
+  op_data->else_subgraph_index = m["else_subgraph_index"].AsInt32();
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE(context, node->inputs->size > 0);
+
+  // The first input is the condition.
+  const TfLiteTensor* cond = GetInput(context, node, 0);
+  // Currently only bool is supported.
+  // TODO(ycling): Support other types since TensorFlow also support
+  // non-bool types as condition.
+  TF_LITE_ENSURE_EQ(context, cond->type, kTfLiteBool);
+  TF_LITE_ENSURE_EQ(context, NumElements(cond), 1);
+
+  // The first input of the node is the condition. The rest of inputs are
+  // passed to the branch subgraphs. Therefore, the number of subgraph inputs
+  // will be the number of node inputs - 1.
+  int num_inputs = node->inputs->size - 1;
+  int num_outputs = node->outputs->size;
+
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  TF_LITE_ENSURE(context, op_data->then_subgraph_index < subgraphs->size());
+  TF_LITE_ENSURE(context, op_data->else_subgraph_index < subgraphs->size());
+
+  Subgraph* then_subgraph = (*subgraphs)[op_data->then_subgraph_index].get();
+  Subgraph* else_subgraph = (*subgraphs)[op_data->else_subgraph_index].get();
+
+  for (auto* subgraph : {then_subgraph, else_subgraph}) {
+    TF_LITE_ENSURE_EQ(context, num_inputs, subgraph->inputs().size());
+    TF_LITE_ENSURE_EQ(context, num_outputs, subgraph->outputs().size());
+  }
+
+  bool has_dynamic_output_tensors = false;
+  for (auto* subgraph : {then_subgraph, else_subgraph}) {
+    for (int i = 0; i < num_inputs; ++i) {
+      // The first input of the node is the condition. The indices of the inputs
+      // passed to the subgraphs are offset by 1.
+      const TfLiteTensor* input = GetInput(context, node, i + 1);
+      std::vector<int> dims(input->dims->data,
+                            input->dims->data + input->dims->size);
+      subgraph->ResizeInputTensor(i, dims);
+      TfLiteTensor* subgraph_input = subgraph->tensor(subgraph->inputs()[i]);
+      TF_LITE_ENSURE_EQ(context, input->type, subgraph_input->type);
+    }
+    // Note: The `Prepare` function is responsible to run `AllocateTensors` on
+    // both subgraphs. It's intentionally not to break out of the loop when
+    // finding a dynamic output tensor.
+    TF_LITE_ENSURE_OK(context, subgraph->AllocateTensors());
+    has_dynamic_output_tensors |= subgraph->HasDynamicTensors();
+  }
+
+  if (!has_dynamic_output_tensors) {
+    for (int i = 0; i < num_outputs; ++i) {
+      TfLiteTensor* then_output =
+          then_subgraph->tensor(then_subgraph->outputs()[i]);
+      TfLiteTensor* else_output =
+          else_subgraph->tensor(else_subgraph->outputs()[i]);
+      // If the 2 subgraphs have static but different output shapes, the output
+      // tensors of the IF op have dynamic sizes.
+      if (!TfLiteIntArrayEqual(then_output->dims, else_output->dims)) {
+        has_dynamic_output_tensors = true;
+        break;
+      }
+    }
+  }
+
+  for (int i = 0; i < num_outputs; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (has_dynamic_output_tensors) {
+      SetTensorToDynamic(output);
+    } else {
+      // When there's no dynamic output tensors, the 2 subgraph has exactly
+      // the same static sized outputs.
+      TfLiteTensor* then_output =
+          then_subgraph->tensor(then_subgraph->outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(then_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* cond = GetInput(context, node, 0);
+  bool cond_value = cond->data.b[0];
+
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+
+  // Currently we copy the input / output between the subgraphs. This isn't
+  // optimized yet.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  int active_branch_subgraph_index =
+      cond_value ? op_data->then_subgraph_index : op_data->else_subgraph_index;
+  Subgraph& active_branch_subgraph =
+      *(*subgraphs)[active_branch_subgraph_index];
+  for (int i = 0; i < active_branch_subgraph.inputs().size(); ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i + 1);
+    TfLiteTensor* subgraph_input =
+        active_branch_subgraph.tensor(active_branch_subgraph.inputs()[i]);
+    TF_LITE_ENSURE_EQ(context, input->bytes, subgraph_input->bytes);
+    memcpy(subgraph_input->data.raw, input->data.raw, input->bytes);
+  }
+
+  // Note: It's guaranteed that the subgraphs' `AllocateTensors` are called
+  // in `Prepare`, so we don't need to do it here again.
+  TF_LITE_ENSURE_OK(context, active_branch_subgraph.Invoke());
+
+  for (int tensor_index : active_branch_subgraph.outputs()) {
+    active_branch_subgraph.EnsureTensorDataIsReadable(tensor_index);
+  }
+
+  bool has_dynamic_output_tensors = false;
+  for (int i = 0; i < node->outputs->size; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (IsDynamicTensor(output)) {
+      has_dynamic_output_tensors = true;
+      break;
+    }
+  }
+
+  if (has_dynamic_output_tensors) {
+    for (int i = 0; i < node->outputs->size; ++i) {
+      TfLiteTensor* output = GetOutput(context, node, i);
+      TfLiteTensor* subgraph_output =
+          active_branch_subgraph.tensor(active_branch_subgraph.outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(subgraph_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+
+  for (int i = 0; i < active_branch_subgraph.outputs().size(); ++i) {
+    const TfLiteTensor* subgraph_output =
+        active_branch_subgraph.tensor(active_branch_subgraph.outputs()[i]);
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_EQ(context, output->bytes, subgraph_output->bytes);
+    memcpy(output->data.raw, subgraph_output->data.raw, output->bytes);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace if_kernel
+
+TfLiteRegistration* Register_IF() {
+  static TfLiteRegistration r = {if_kernel::Init, if_kernel::Free,
+                                 if_kernel::Prepare, if_kernel::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/if_test.cc b/tensorflow/lite/kernels/if_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f90db131b0bc335b54f4f8c24fa5d8dd02862f4
--- /dev/null
+++ b/tensorflow/lite/kernels/if_test.cc
@@ -0,0 +1,113 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
+
+namespace {
+
+// A simple test that performs `ADD` if condition is true, and `MUL` otherwise.
+// The computation is: `cond ? a + b : a * b`.
+class SimpleIfTest : public ControlFlowOpTest {
+ protected:
+  void SetUp() override {
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildMulSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
+  }
+};
+
+TEST_F(SimpleIfTest, TestIfTrue) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = true;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(SimpleIfTest, TestIfFalse) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = false;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {5, 14});
+}
+
+// Test IF op using subgraphs with dynamically sized outputs.
+// The computation is: `cond ? a + b : pad(a, b)`.
+class DynamicSubgraphIfTest : public ControlFlowOpTest {
+ protected:
+  void SetUp() override {
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildPadSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
+  }
+};
+
+TEST_F(DynamicSubgraphIfTest, TestIfTrue) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = true;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  // Even if the true branch has a static type output, the output of the
+  // if op is dynamic because the other branch has dynamic output.
+  EXPECT_TRUE(IsDynamicTensor(output));
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(DynamicSubgraphIfTest, TestIfFalse) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = false;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  // The false branch has dynamic output.
+  EXPECT_TRUE(IsDynamicTensor(output));
+  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index b734b2d6cc30bb84eaa424ffed71747136f57c4c..4a18ee3c09720f5e2b0b1f0ddc906ade97973adc 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -1,4 +1,4 @@
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "transitive_hdrs")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
@@ -46,7 +46,6 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -60,10 +59,16 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
+config_setting(
+    name = "aarch64",
+    values = {
+        "cpu": "aarch64",
+    },
+)
+
 config_setting(
     name = "arm",
     values = {
@@ -246,6 +251,7 @@ cc_library(
         ":optimized_base",
         ":tensor",
         ":types",
+        "//tensorflow/core/kernels:eigen_spatial_convolutions-inl",
         "//tensorflow/lite/c:c_api_internal",
         "//third_party/eigen3",
     ],
@@ -254,9 +260,6 @@ cc_library(
 cc_test(
     name = "tensor_test",
     srcs = ["tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":tensor",
         "@com_google_googletest//:gtest",
@@ -286,9 +289,6 @@ cc_library(
 cc_test(
     name = "quantization_util_test",
     srcs = ["quantization_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":quantization_util",
         "@com_google_googletest//:gtest",
@@ -314,9 +314,19 @@ cc_library(
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
+        "reference/integer_ops/add.h",
+        "reference/integer_ops/conv.h",
+        "reference/integer_ops/depthwise_conv.h",
         "reference/integer_ops/dequantize.h",
+        "reference/integer_ops/fully_connected.h",
+        "reference/integer_ops/l2normalization.h",
+        "reference/integer_ops/log_softmax.h",
+        "reference/integer_ops/logistic.h",
+        "reference/integer_ops/mean.h",
+        "reference/integer_ops/mul.h",
         "reference/integer_ops/pooling.h",
         "reference/integer_ops/softmax.h",
+        "reference/integer_ops/tanh.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -413,6 +423,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:activation_functor",
         "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
@@ -497,6 +508,9 @@ cc_library(
         "//tensorflow/lite/kernels:op_macros",
         "@gemmlowp",
     ] + select({
+        ":aarch64": [
+            ":neon_tensor_utils",
+        ],
         ":arm": [
             ":neon_tensor_utils",
         ],
@@ -552,13 +566,10 @@ cc_library(
     }),
     deps = [
         ":types",
-        "//tensorflow/lite:string",
     ],
 )
 
-# TODO(b/122597976): Eliminate TF dependency from lite/kernels:test_util,
-# in turn eliminating the need to use tf_cc_test for any dependent tests.
-tf_cc_test(
+cc_test(
     name = "tensor_utils_test",
     srcs = ["tensor_utils_test.cc"],
     linkopts = select({
@@ -568,9 +579,6 @@ tf_cc_test(
         "//conditions:default": [],
     }),
     linkstatic = 1,
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":tensor_utils",
         "//tensorflow/lite/c:c_api_internal",
@@ -593,26 +601,25 @@ cc_test(
 
 cc_test(
     name = "depthwiseconv_quantized_test",
-    srcs = ["depthwiseconv_quantized_test.cc"],
-    shard_count = 2,
-    tags = [
-        "tflite_not_portable_ios",
+    srcs = [
+        "depthwiseconv_quantized_test.cc",
+        "optimized/depthwiseconv_uint8_transitional.h",
     ],
+    shard_count = 2,
     deps = [
         ":optimized_base",
         ":reference_base",
         ":test_util",
         ":types",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
+        "@gemmlowp",
     ],
 )
 
 cc_test(
     name = "resize_bilinear_test",
     srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -625,9 +632,6 @@ cc_test(
 cc_test(
     name = "resize_nearest_neighbor_test",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -660,7 +664,7 @@ cc_test(
     srcs = [
         "logsoftmax_quantized_test.cc",
     ],
-    shard_count = 3,
+    shard_count = 4,
     tags = [
         # TODO(b/122242739): Reenable after fixing the flakiness?
         "nomac",
@@ -679,6 +683,10 @@ cc_test(
 cc_test(
     name = "log_quantized_test",
     srcs = ["log_quantized_test.cc"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -714,4 +722,78 @@ cc_test(
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
 
+filegroup(
+    name = "optimized_op_headers",
+    srcs = glob([
+        "optimized/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "reference_op_headers",
+    srcs = glob([
+        "reference/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "headers",
+    srcs = glob([
+        "*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+transitive_hdrs(
+    name = "nnapi_external_headers",
+    visibility = ["//tensorflow/lite:__subpackages__"],
+    deps = [
+        "//third_party/eigen3",
+        "@gemmlowp",
+    ],
+)
+
+# ---------------------------------------------------------
+# The public target "install_nnapi_extra_headers" is only
+# used for external targets that requires exporting optmized
+# and reference op headers.
+
+genrule(
+    name = "install_nnapi_extra_headers",
+    srcs = [
+        ":nnapi_external_headers",
+        ":headers",
+        ":optimized_op_headers",
+        ":reference_op_headers",
+    ],
+    outs = ["include"],
+    cmd = """
+    mkdir $@
+    for f in $(SRCS); do
+      d="$${f%/*}"
+      d="$${d#bazel-out*genfiles/}"
+      d="$${d#*external/eigen_archive/}"
+
+      if [[ $${d} == *local_config_* ]]; then
+        continue
+      fi
+
+      if [[ $${d} == external* ]]; then
+        extname="$${d#*external/}"
+        extname="$${extname%%/*}"
+        if [[ $${TF_SYSTEM_LIBS:-} == *$${extname}* ]]; then
+          continue
+        fi
+      fi
+
+      mkdir -p "$@/$${d}"
+      cp "$${f}" "$@/$${d}/"
+    done
+    """,
+    tags = ["manual"],
+    visibility = ["//visibility:private"],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index bc30ac91220906588f204d6ff21c275faa2b6c25..2b8226c4977f489876f9073a4cd16c49f0673fa1 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -131,6 +131,223 @@ int CountLeadingZeros(T integer_input) {
 #endif
 }
 
+// TODO(b/77858996): Add these to gemmlowp.
+template <typename IntegerType>
+IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t sum = a64 + b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          sum)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
+}
+
+template <typename IntegerType>
+IntegerType SaturatingSub(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
+  std::int32_t a32 = a;
+  std::int32_t b32 = b;
+  std::int32_t diff = a32 - b32;
+  return static_cast<std::int16_t>(
+      std::min(static_cast<int32_t>(32767),
+               std::max(static_cast<int32_t>(-32768), diff)));
+}
+
+template <>
+inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t diff = a64 - b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          diff)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingSub(a.raw(), b.raw()));
+}
+// End section to be moved to gemmlowp.
+
+template <typename IntegerType>
+IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
+  if (exponent == 0) {
+    return x;
+  }
+  using ScalarIntegerType =
+      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
+  const IntegerType min =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
+  const IntegerType max =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
+  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
+
+  const std::int32_t threshold =
+      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
+  const IntegerType positive_mask =
+      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
+  const IntegerType negative_mask =
+      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
+
+  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
+  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
+  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
+  return result;
+}
+
+// If we want to leave IntegerBits fixed, then multiplication
+// by a power of two has to be saturating/rounding, not exact anymore.
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits>
+SaturatingRoundingMultiplyByPOTParam(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
+}
+
+// Minimum output bits to accommodate log of maximum input range.  It actually
+// does not matter if one considers, say, [-64,64] or [-64,64).
+//
+// For example, run this through Octave:
+// [0:127; ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
+constexpr int min_log_x_output_bits(int input_bits) {
+  return input_bits > 90
+             ? 7
+             : input_bits > 44
+                   ? 6
+                   : input_bits > 21
+                         ? 5
+                         : input_bits > 10
+                               ? 4
+                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
+}
+
+// Although currently the name of this function says that it cannot handle
+// values less than 1, in practice it can handle as low as 1/x_max, where
+// x_max is the largest representable input.  In other words, the output range
+// is symmetric.
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1_impl(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
+  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+  // The reason for accumulating the result with an extra bit of headroom is
+  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
+  // recip_denom will otherwise introduce an error.
+  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
+
+  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1488522236, std::log(2.0));
+  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
+  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1518500250, std::sqrt(0.5));
+  const FixedPoint0 one_quarter =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
+
+  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1057819769,
+      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
+
+  const FixedPointAccum shifted_quarter =
+      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
+
+  // Reinterpret the input value as Q0.31, because we will figure out the
+  // required shift "ourselves" instead of using, say, Rescale.
+  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
+  // z_a_pow_2 = input_integer_bits - z_a_headroom;
+  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
+  FixedPoint0 r_a_tmp =
+      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
+  const int32 r_a_raw =
+      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
+  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
+  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
+  //                   InputIntegerBits - z_b_headroom - 0.25);
+  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
+  FixedPoint0 z_b = z_a * sqrt_half;
+  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
+  const int32 r_b_raw =
+      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
+  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
+  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
+      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
+
+  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
+  FixedPoint0 q = r - sqrt_sqrt_half;
+  q = q + q;
+
+  const FixedPoint0 common_sq = q * q;
+  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
+  const FixedPoint0 denom_minus_one_0 =
+      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
+  const FixedPoint0 recip_denom =
+      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
+
+  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
+  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
+                                              num_scaled * recip_denom);
+}
+
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  static_assert(
+      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
+      "Output integer bits must be sufficent to accommodate logs of inputs.");
+  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                     InputIntegerBits>(
+      input_val);
+}
+
 inline int32 GetReciprocal(int32 x, int x_integer_digits,
                            int* num_bits_over_unit) {
   int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
@@ -148,6 +365,55 @@ inline int32 GetReciprocal(int32 x, int x_integer_digits,
   return shifted_scale.raw();
 }
 
+inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
+                                             int32* output_inv_sqrt,
+                                             int* output_shift) {
+  *output_shift = 11;
+  while (input >= (1 << 29)) {
+    input /= 4;
+    ++*output_shift;
+  }
+  TFLITE_DCHECK_GT(input, 0);
+  const unsigned max_left_shift_bits =
+      CountLeadingZeros(static_cast<uint32>(input)) - 1;
+  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+  *output_shift -= left_shift_bit_pairs;
+  input <<= 2 * left_shift_bit_pairs;
+  TFLITE_DCHECK_GE(input, (1 << 27));
+  TFLITE_DCHECK_LT(input, (1 << 29));
+  using gemmlowp::FixedPoint;
+  using gemmlowp::Rescale;
+  using gemmlowp::SaturatingRoundingMultiplyByPOT;
+  // Using 3 integer bits gives us enough room for the internal arithmetic in
+  // this Newton-Raphson iteration.
+  using F3 = FixedPoint<int32, 3>;
+  using F0 = FixedPoint<int32, 0>;
+  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+  const F3 fixedpoint_half_input =
+      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+  const F3 fixedpoint_half_three =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+  // Newton-Raphson iteration
+  // Naive unoptimized starting guess: x = 1
+  F3 x = F3::One();
+  // Naive unoptimized number of iterations: 5
+  for (int i = 0; i < 5; i++) {
+    const F3 x3 = Rescale<3>(x * x * x);
+    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+  }
+  const F0 fixedpoint_half_sqrt_2 =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+  x = x * fixedpoint_half_sqrt_2;
+  *output_inv_sqrt = x.raw();
+  if (*output_shift < 0) {
+    *output_inv_sqrt <<= -*output_shift;
+    *output_shift = 0;
+  }
+  // Convert right shift (right is positive) to left shift.
+  *output_shift *= reverse_shift;
+}
+
 // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
 // BROADCASTING.
 //
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 3682499d494cc4e63712b6c57d80482899b2185d..3e48d95a082ca285874c2dad01aaf3b845db9e88 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -19,40 +19,97 @@ limitations under the License.
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <string>
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/test_util.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#include "absl/strings/substitute.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 
 namespace tflite {
 namespace {
 
-enum class ForceKernelInvocation {
-  // Run all tests against kUseStandardEntry even if also testing another
-  // kernel, since we need to be sure that the main DepthwiseConv() function in
-  // optimized_ops.h dispatches to a correctly-executing kernel.
-  kNone = 0,  // The "default" option: use the normal DepthwiseConv
-              // kernel (entry) function.
-  kUseGenericKernel,
-  kUseNeon3x3,            // 3x3 kernel that uses NEON when available.
-  kUseNeon3x3DotProduct,  // 3x3 kernel that uses dot-product enabled NEON when
-                          // available.
+using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+using ::testing::Bool;
+using ::testing::Values;
+
+#if defined(__aarch64__)
+static constexpr bool kLooseIntrinsicsTolerance = false;
+#else
+static constexpr bool kLooseIntrinsicsTolerance = true;
+#endif
+
+// Currently, this is used in place of a Boolean "is symmetric?".
+enum class ParamsSpecialization {
+  kNone = 0,
+  kSymmetric,  // Symmetric quantization: zero represented by 128.
+};
+
+static constexpr int kSymmetricZeroPoint = 128;
+
+// Extend coverage distribution in a specific aspect, either explicitly chosen
+// or randomly chosen as in a mixture distribution.
+enum class CoverageExtension {
+  kNone = 0,
+  kLargeHeights = 1,
+  kLargeWidths = 2,
+  kNumOptions
+};
+
+// The TestParam structure below is the preferred parameterization of tests. A
+// tuple version is defined in order to support value-parameterized tests.
+typedef std::tuple<DepthwiseConvImplementation, int, bool, bool, bool,
+                   DepthwiseConvOutputRounding, bool>
+    TestParamTuple;
+
+struct TestParam {
+  TestParam() = default;
+
+  explicit TestParam(TestParamTuple param_tuple)
+      : forced_invocation(::testing::get<0>(param_tuple)),
+        tests_to_run(::testing::get<1>(param_tuple)),
+        test_stride(::testing::get<2>(param_tuple)),
+        test_pad(::testing::get<3>(param_tuple)),
+        test_depth_multiplier(::testing::get<4>(param_tuple)),
+        output_rounding(::testing::get<5>(param_tuple)),
+        loose_tolerance(::testing::get<6>(param_tuple)) {}
+
+  static std::string TestNameSuffix(
+      const ::testing::TestParamInfo<TestParamTuple>& info) {
+    const TestParam param(info.param);
+    return absl::Substitute("invocation_$0_stride_$1_pad_$2_depth_mult_$3",
+                            static_cast<int>(param.forced_invocation),
+                            param.test_stride, param.test_pad,
+                            param.test_depth_multiplier);
+  }
+
+  DepthwiseConvImplementation forced_invocation =
+      DepthwiseConvImplementation::kNone;
+  int tests_to_run = 0;
+  bool test_stride = false;
+  bool test_pad = false;
+  bool test_depth_multiplier = false;
+  DepthwiseConvOutputRounding output_rounding =
+      DepthwiseConvOutputRounding::kNone;
+  bool loose_tolerance = false;
 };
 
 inline void DispatchDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const DepthwiseParams& params,
+    const TestParam& test_param, const DepthwiseParams& params,
     const RuntimeShape& input_shape, const uint8* input_data,
     const RuntimeShape& filter_shape, const uint8* filter_data,
     const RuntimeShape& bias_shape, const int32* bias_data,
     const RuntimeShape& output_shape, uint8* output_data) {
-  switch (forced_invocation) {
-    case ForceKernelInvocation::kUseNeon3x3: {
+  switch (test_param.forced_invocation) {
+    case DepthwiseConvImplementation::kUseNeon3x3: {
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
@@ -67,20 +124,20 @@ inline void DispatchDepthwiseConv(
 
       // Check that parameter combination is supported.
       const bool basic_3x3_kernel_supported =
-          optimized_ops::Fast3x3FilterKernelSupported(
+          optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
               input_shape, filter_shape, stride_width, stride_height,
               dilation_width_factor, dilation_height_factor, pad_width,
               pad_height, depth_multiplier, output_shape, output_shift);
       ASSERT_TRUE(basic_3x3_kernel_supported)
           << "pad_width = " << params.padding_values.width
           << " pad_height = " << params.padding_values.height
-          << " input_width = " << input_shape.Dims(1)
-          << " input_height = " << input_shape.Dims(2)
-          << " output_width = " << output_shape.Dims(1)
-          << " output_height = " << output_shape.Dims(2);
+          << " input_width = " << input_shape.Dims(2)
+          << " input_height = " << input_shape.Dims(1)
+          << " output_width = " << output_shape.Dims(2)
+          << " output_height = " << output_shape.Dims(1);
 
       // Call kernel optimized for depthwise convolutions using 3x3 filters.
-      optimized_ops::DepthwiseConv3x3Filter(
+      optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
           params, input_shape, input_data, filter_shape, filter_data,
           bias_shape, bias_data, output_shape, output_data);
       return;
@@ -88,64 +145,148 @@ inline void DispatchDepthwiseConv(
       break;
 #endif
     }
-    case ForceKernelInvocation::kUseNeon3x3DotProduct: {
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) && \
-    !defined(GOOGLE_L4T)
-      using optimized_ops::DotProduct3x3KernelType;
+    case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)
       DotProduct3x3KernelType kernel_type =
-          optimized_ops::CategorizeDotProductKernel(params);
-      switch (kernel_type) {
-        case DotProduct3x3KernelType::kPlain:
-          // TODO(b/118430534): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithDepthMultiplication:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad0Stride2:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad1Stride1:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kNone:
-        default:
-          break;
-      }
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
+          << "Kernel type = " << static_cast<int>(kernel_type);
+
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
 #endif
       break;
     }
-    case ForceKernelInvocation::kUseGenericKernel: {
-      optimized_ops::DepthwiseConvGeneral(params, input_shape, input_data,
-                                          filter_shape, filter_data, bias_shape,
-                                          bias_data, output_shape, output_data);
+    case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
+          << "Kernel type = " << static_cast<int>(kernel_type)
+          << " depth_multiplier = " << params.depth_multiplier
+          << " pad_width = " << params.padding_values.width
+          << " pad_height = " << params.padding_values.height
+          << " stride_width = " << params.stride_width
+          << " stride_height = " << params.stride_height
+          << " input_width = " << input_shape.Dims(2)
+          << " input_height = " << input_shape.Dims(1)
+          << " output_width = " << output_shape.Dims(2)
+          << " output_height = " << output_shape.Dims(1)
+          << " depth = " << input_shape.Dims(3)
+          << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
+          << " input_offset = " << params.input_offset;
+
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseCModel3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
       return;
     }
-    case ForceKernelInvocation::kNone:
+    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct: {
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseUnwound3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+    }
+    case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
+#if defined(USE_NEON)
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+#else
+      break;
+#endif
+    }
+    case DepthwiseConvImplementation::kUseGenericKernel: {
+      optimized_ops::depthwise_conv::DepthwiseConvGeneral(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+    }
+    case DepthwiseConvImplementation::kNone:
+    default:
+      break;
+  }
+
+  EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
+      << "TODO(b/118426582) requested kernel was not invoked / available yet: "
+      << " forced_invocation = "
+      << static_cast<int>(test_param.forced_invocation)
+      << " depth_multiplier = " << params.depth_multiplier
+      << " pad_width = " << params.padding_values.width
+      << " pad_height = " << params.padding_values.height
+      << " stride_width = " << params.stride_width
+      << " stride_height = " << params.stride_height
+      << " input_width = " << input_shape.Dims(2)
+      << " input_height = " << input_shape.Dims(1)
+      << " output_width = " << output_shape.Dims(2)
+      << " output_height = " << output_shape.Dims(1)
+      << " depth = " << input_shape.Dims(3)
+      << " buffer need = " << input_shape.Dims(3) * input_shape.Dims(2) * 6
+      << " input_offset = " << params.input_offset;
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      optimized_ops::DepthwiseConvWithRounding<
+          DepthwiseConvOutputRounding::kAwayFromZero>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+    case DepthwiseConvOutputRounding::kUpward:
+      optimized_ops::DepthwiseConvWithRounding<
+          DepthwiseConvOutputRounding::kUpward>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
     default:
       break;
   }
-  optimized_ops::DepthwiseConv(params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               output_data);
 }
 
 // Runs the DepthwiseConv and compares against the reference implementation.
 int TestOneDepthwiseConvWithGivenOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -174,10 +315,31 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   op_params.output_offset = output_offset;
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
-  DispatchDepthwiseConv(forced_invocation, op_params, input_shape, input_data,
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kUpward:
+      reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
+          DepthwiseConvOutputRounding::kAwayFromZero>::Run(op_params,
+                                                           input_shape,
+                                                           input_data,
+                                                           filter_shape,
+                                                           filter_data,
+                                                           bias_shape,
+                                                           bias_data,
+                                                           output_shape,
+                                                           reference_output_data
+                                                               .data());
+      break;
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      reference_ops::DepthwiseConv(
+          op_params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, reference_output_data.data());
+      break;
+    case DepthwiseConvOutputRounding::kNone:
+    default:
+      EXPECT_NE(test_param.output_rounding, DepthwiseConvOutputRounding::kNone);
+      break;
+  }
+  DispatchDepthwiseConv(test_param, op_params, input_shape, input_data,
                         filter_shape, filter_data, bias_shape, bias_data,
                         output_shape, output_data.data());
   int saturated_min = 0;
@@ -201,15 +363,46 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
   const float mean_abs_diff =
       static_cast<float>(sum_abs_diff) / output_buffer_size;
+
+  int diff_mean_tolerance = 1;
+  int diff_median_tolerance = 0;
+  // The tolerance that we apply to means is tight, but we allow for a rounding
+  // difference in one pixel, and loosen by another 1% for float comparison.
+  float mean_tolerance = std::max(2e-5f, 1.01f * 3.f / output_buffer_size *
+                                             std::sqrt(1.f * depth_multiplier));
+  if (test_param.loose_tolerance) {
+    mean_tolerance = 500.f;
+    diff_mean_tolerance = 256;
+    diff_median_tolerance = 225;
+  }
+
   // Normally we should require bit-for-bit exact results. Unfortunately a bug
   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
-  EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f &&
-              std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 &&
-              std::abs(max_diff) <= 1);
+  // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
+  // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
+  // few off-by-one errors for now, yet still ensure that no more than a small
+  // minority of values are wrong.
+  EXPECT_LT(std::abs(mean_diff), mean_tolerance);
+  EXPECT_LT(mean_abs_diff, mean_tolerance);
+  EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
+  EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
+  EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
+  EXPECT_TRUE(std::abs(mean_diff) < mean_tolerance &&
+              mean_abs_diff < mean_tolerance &&
+              std::abs(median_diff) <= diff_median_tolerance &&
+              std::abs(min_diff) <= diff_mean_tolerance &&
+              std::abs(max_diff) <= diff_mean_tolerance)
+      << "pad_width = " << op_params.padding_values.width
+      << " pad_height = " << op_params.padding_values.height
+      << " input_width = " << input_shape.Dims(2)
+      << " input_height = " << input_shape.Dims(1)
+      << " output_width = " << output_shape.Dims(2)
+      << " output_height = " << output_shape.Dims(1)
+      << " depth = " << input_shape.Dims(3)
+      << " output_offset = " << op_params.output_offset
+      << " output_multiplier = " << op_params.output_multiplier
+      << " output_shift = " << op_params.output_shift;
+
   if (saturated_min > 2 * saturated_max) {
     return -1;
   }
@@ -221,13 +414,12 @@ int TestOneDepthwiseConvWithGivenOutputShift(
 
 // The point of this function is that we can't practically know which
 // output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
-// could do some
-// statistics for large size, but they would be fragile at smaller sizes), and
-// guessing wrong would mean that all the values get saturated so the test
-// becomes
-// vacuous. So we just bisect our way to reasonable output_shift values.
+// could do some statistics for large size, but they would be fragile at smaller
+// sizes), and guessing wrong would mean that all the values get saturated so
+// the test becomes vacuous. So we just bisect our way to reasonable
+// output_shift values.
 void TestOneDepthwiseConvBisectOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -242,7 +434,7 @@ void TestOneDepthwiseConvBisectOutputShift(
   int output_shift_bisect_midpoint =
       (output_activation_bisect_start + output_activation_bisect_end) / 2;
   int bisect_result = TestOneDepthwiseConvWithGivenOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       output_shift_bisect_midpoint, output_activation_min,
@@ -269,7 +461,7 @@ void TestOneDepthwiseConvBisectOutputShift(
                                              ? output_activation_bisect_end
                                              : output_shift_bisect_midpoint;
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       new_output_activation_bisect_start, new_output_activation_bisect_end,
@@ -277,7 +469,7 @@ void TestOneDepthwiseConvBisectOutputShift(
 }
 
 void TestOneDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -287,13 +479,14 @@ void TestOneDepthwiseConv(
     std::int32_t output_activation_min, std::int32_t output_activation_max,
     const RuntimeShape& output_shape) {
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       0, 32, output_activation_min, output_activation_max, output_shape);
 }
 
-bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
+bool TryTestDepthwiseConv(const TestParam& test_param,
+                          ParamsSpecialization params_specialization, int batch,
                           int input_depth, int input_width, int input_height,
                           int filter_width, int filter_height,
                           int depth_multiplier, int stride,
@@ -318,9 +511,12 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   }
   const std::int32_t output_multiplier =
       UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
-  const std::int32_t input_offset = UniformRandomInt(-256, 0);
-  const std::int32_t filter_offset = UniformRandomInt(-256, 0);
-  const std::int32_t output_offset = UniformRandomInt(-256, 0);
+  std::int32_t filter_offset = -kSymmetricZeroPoint;
+  if (params_specialization != ParamsSpecialization::kSymmetric) {
+    filter_offset = UniformRandomInt(-255, 0);
+  }
+  const std::int32_t input_offset = UniformRandomInt(-255, 0);
+  const std::int32_t output_offset = UniformRandomInt(0, 255);
   RuntimeShape input_shape_inference(
       {batch, input_height, input_width, input_depth});
   RuntimeShape output_shape_inference;
@@ -343,7 +539,7 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   FillRandom(&filter_data);
   FillRandom(&bias_data, -10000, 10000);
   TestOneDepthwiseConv(
-      forced_invocation, input_data.data(), input_shape_inference, input_offset,
+      test_param, input_data.data(), input_shape_inference, input_offset,
       filter_data.data(), filter_shape_inference, filter_offset,
       bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
       pad_height, depth_multiplier, output_offset, output_multiplier,
@@ -355,7 +551,8 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
 // be legal. If they're not legal, it returns false. If they're legal,
 // it runs the DepthwiseConv test and returns true. This allows the caller
 // to loop until a test has been run.
-bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv(const TestParam& test_param,
+                             ParamsSpecialization params_specialization) {
   // We have to pick a lot of positive values, where we are particularly
   // interested in small values because they are most likely to be special
   // cases in optimized implementations, and secondarily because they allow
@@ -375,13 +572,14 @@ bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests parameters for the 3x3 filter kernel.
-bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv3x3Filter(
+    const TestParam& test_param, ParamsSpecialization params_specialization) {
   const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
   const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
   int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
@@ -397,7 +595,7 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   // Adjust for, or reject, special cases.
-  if (forced_invocation != ForceKernelInvocation::kNone) {
+  if (test_param.forced_invocation != DepthwiseConvImplementation::kNone) {
     // With stride == 2 and SAME, padding width and height are the left and top
     // padding amounts. When there is an even input dimension, padding + 1 is
     // required on the right / bottom. This is not handled by these kernels, so
@@ -416,59 +614,77 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
   }
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
-bool TryTestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                          bool test_stride, bool test_pad,
-                          bool test_depth_multiplier) {
+bool TryTestOneNeonDot3x3(const TestParam& test_param,
+                          ParamsSpecialization params_specialization) {
+  const CoverageExtension coverage_extension = static_cast<CoverageExtension>(
+      UniformRandomInt(0, static_cast<int>(CoverageExtension::kNumOptions)));
+
   const int batch = 1;
-  const int input_depth = test_depth_multiplier
+  const int input_depth = test_param.test_depth_multiplier
                               ? 1
-                              : 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
-  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+                              : 8 * ExponentialRandomPositiveInt(0.9f, 3, 50);
+  const int input_width = coverage_extension == CoverageExtension::kLargeWidths
+                              ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+                              : ExponentialRandomPositiveInt(0.9f, 20, 60);
+  const int input_height =
+      coverage_extension == CoverageExtension::kLargeHeights
+          ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+          : ExponentialRandomPositiveInt(0.9f, 20, 60);
   const int filter_width = 3;
   const int filter_height = 3;
   const int depth_multiplier =
-      test_depth_multiplier ? 8 * ExponentialRandomPositiveInt(0.8f, 1, 6) : 1;
-  const int stride = test_stride ? 2 : 1;
+      test_param.test_depth_multiplier
+          ? 8 * ExponentialRandomPositiveInt(0.2f, 1, 9)
+          : 1;
+  const int stride = test_param.test_stride ? 2 : 1;
   // We don't support dilations in the 3x3 filter.
   const int dilation_width_factor = 1;
   const int dilation_height_factor = 1;
-  const auto padding_type = test_pad ? PaddingType::kSame : PaddingType::kValid;
+  const auto padding_type =
+      test_param.test_pad ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
-void TestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv(forced_invocation)) {
+void TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,
+                          DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv3x3Filter(forced_invocation)) {
+void TestOneDepthwiseConv3x3Filter(
+    DepthwiseConvImplementation forced_invocation,
+    DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv3x3Filter(test_param,
+                                           ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                       bool test_stride, bool test_pad,
-                       bool test_depth_multiplier) {
-  while (!TryTestOneNeonDot3x3(forced_invocation, test_stride, test_pad,
-                               test_depth_multiplier)) {
+void TestOneNeonDot3x3(const TestParam& test_param) {
+  while (!TryTestOneNeonDot3x3(test_param, ParamsSpecialization::kSymmetric)) {
   }
 }
 
 TEST(TestDepthwiseConv, TestDepthwiseConv) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kNone,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
@@ -476,69 +692,141 @@ TEST(TestDepthwiseConv, TestDepthwiseConv) {
 TEST(TestDepthwiseConv, TestGenericKernel) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kUseGenericKernel,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 TEST(TestDepthwiseConv, TestKernel3x3Filter) {
   const int kTestsToRun = 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
+#endif
 
-// While the 3x3 coverage test is primarily targeted at specialized kernels, we
-// also run it against the generic kernel, optionally with fewer invocations.
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv3x3Filter(
+        DepthwiseConvImplementation::kUseGenericKernel,
+        DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 TEST(TestDepthwiseConv, TestNeon3x3Filter) {
   const int kTestsToRun = 3 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseNeon3x3);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
+#endif
 
-// No stride, no depth multiplier, no pad.
-TEST(TestDepthwiseConv, TestNeonDot3x3Plain) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+class DepthwiseConvTest : public ::testing::TestWithParam<TestParamTuple> {};
 
-TEST(TestDepthwiseConv, TestNeonDot3x3DepthMultiplier) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/true);
+TEST_P(DepthwiseConvTest, NeonDot3x3) {
+  const TestParam param(GetParam());
+  for (int i = 0; i < param.tests_to_run; i++) {
+    TestOneNeonDot3x3(param);
   }
 }
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Stride2) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/true, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+INSTANTIATE_TEST_SUITE_P(
+    Neon3x3Kernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
+        Values(1000),                                      // tests_to_run
+        Bool(),                                            // test_stride
+        Values(false),                                     // test_pad
+        Values(false),  // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)                                        // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+#endif
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Pad1) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/true,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
+INSTANTIATE_TEST_SUITE_P(
+    GenericKernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseGenericKernel),  // forced_invocation
+        Values(100),                    // tests_to_run
+        Bool(),                         // test_stride
+        Bool(),                         // test_pad
+        Bool(),                         // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)                                        // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
+INSTANTIATE_TEST_SUITE_P(
+    CModel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseCModel3x3DotProduct),           // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
+INSTANTIATE_TEST_SUITE_P(
+    Unwound, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseUnwound3x3DotProduct),          // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
+#if defined(USE_NEON)
+INSTANTIATE_TEST_SUITE_P(
+    Intrinsics, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseIntrinsics3x3DotProduct),       // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(kLooseIntrinsicsTolerance)              // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+#endif
+
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)
+INSTANTIATE_TEST_SUITE_P(
+    NeonAsm, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseNeon3x3DotProduct),             // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+#endif
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/log_quantized_test.cc b/tensorflow/lite/kernels/internal/log_quantized_test.cc
index 8c39350ab1dd8996799e6539755f040399974106..c31c8e307751bcf1030e121eec23ac6cb217f461 100644
--- a/tensorflow/lite/kernels/internal/log_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/log_quantized_test.cc
@@ -121,8 +121,7 @@ void RunSingleTest(const std::vector<int32>& test_input,
                    const string& check_label, int tolerance) {
   const int n = test_input.size();
   std::vector<int32> float_gen_output(n, 0);
-  std::vector<int32> reference_output(n, 0);
-  std::vector<int32> optimized_output(n, 0);
+  std::vector<int32> quantized_output(n, 0);
 
   // Workaround the stupid things that intelligent humans do.
   // Consequence of __builtin_clz(0u) may equal 31 instead of 32.
@@ -132,45 +131,21 @@ void RunSingleTest(const std::vector<int32>& test_input,
   }
 
   for (int i = 0; i < n; ++i) {
-    reference_output[i] =
-        tflite::reference_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
-            gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
-                fudged_input[i]))
-            .raw();
-    optimized_output[i] =
-        tflite::optimized_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
+    quantized_output[i] =
+        tflite::log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                            InputIntegerBits>(
             gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
                 fudged_input[i]))
             .raw();
     float_gen_output[i] = LogPositiveValuesViaFloat(
         fudged_input[i], InputIntegerBits, OutputIntegerBits);
   }
-  // Note that first check is intolerant.
-  {
-    std::ostringstream label;
-    label << check_label << " / optimized vs reference / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, reference_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, 0);
-  }
   {
     std::ostringstream label;
     label << check_label << " / reference vs float-gen / InputIntegerBits="
           << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        reference_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
-  }
-  {
-    std::ostringstream label;
-    label << check_label << " optimized vs float-gen / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
+    CheckOutputData(quantized_output, float_gen_output, test_input, label.str(),
+                    InputIntegerBits, OutputIntegerBits, tolerance);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 945300dad1653257db69c3440f6db0589e0c1a7b..d0d2654d4123e5025d000a796907f675ca29b05c 100644
--- a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/test_util.h"
 #include "tensorflow/lite/string.h"
@@ -61,7 +63,42 @@ void RunLogSoftmaxFloatReference(const uint8* input_data,
   }
 }
 
-void CheckOutputData(const uint8* test_output, const uint8* reference_output,
+// Same as above except for the following change:
+// - input and output data type
+// - Dequnatize function
+// - clamping values
+void RunLogSoftmaxFloatReference(const int8* input_data,
+                                 const RuntimeShape& shape_common,
+                                 int32 input_offset, const double input_scale,
+                                 int stride, float beta,
+                                 int8* reference_output_data) {
+  const int ref_buffer_size = shape_common.FlatSize();
+  std::vector<float> reference_dequant_data(ref_buffer_size);
+  std::vector<float> reference_output_float_data(ref_buffer_size);
+
+  // Reference data generated via Dequant of input into float, and then applying
+  // float LogSoftmax.
+  DequantizationParams dq_params;
+  dq_params.zero_point = input_offset;
+  dq_params.scale = input_scale;
+  reference_integer_ops::Dequantize(dq_params, shape_common, input_data,
+                                    shape_common,
+                                    reference_dequant_data.data());
+  SoftmaxParams sm_params;
+  optimized_ops::LogSoftmax(sm_params, shape_common,
+                            reference_dequant_data.data(), shape_common,
+                            reference_output_float_data.data());
+  // Work with quantized scaling for LogSoftmax, under which 255 represents 0,
+  // and -16 gets nudged up to 0.
+  for (int i = 0; i < ref_buffer_size; i++) {
+    reference_output_data[i] = std::max(
+        -128, static_cast<int>(
+                  127 + std::round(16.0f * reference_output_float_data[i])));
+  }
+}
+
+template <typename T>
+void CheckOutputData(const T* test_output, const T* reference_output,
                      const RuntimeShape& shape_common,
                      const string& check_label, bool be_exacting) {
   const int buffer_size = shape_common.FlatSize();
@@ -144,15 +181,58 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
   reference_ops::LogSoftmax(params, shape_common, input_data, shape_common,
                             reference_quant_logsoftmax_output.data());
 
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Optimized vs float reference", false);
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_quant_logsoftmax_output.data(), shape_common,
-                  "Optimized vs quant reference", true);
-  CheckOutputData(reference_quant_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Quant reference vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Optimized vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_quant_logsoftmax_output.data(),
+                           shape_common, "Optimized vs quant reference", true);
+  CheckOutputData<uint8_t>(reference_quant_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Quant reference vs float reference",
+                           false);
+}
+
+// Runs the LogSoftmax and compares against the float reference implementation
+// and the int8 quantized reference implementation.
+void RunOneLogSoftmaxTest(const int8* input_data,
+                          const RuntimeShape& shape_common, int32 input_offset,
+                          const double input_scale, int stride, float beta) {
+  const int buffer_size = shape_common.FlatSize();
+  std::vector<int8> quantized_logsoftmax_reference_implementation(buffer_size);
+  std::vector<int8> float_logsoftmax_optimized_implementation(buffer_size);
+
+  RunLogSoftmaxFloatReference(input_data, shape_common, input_offset,
+                              input_scale, stride, beta,
+                              float_logsoftmax_optimized_implementation.data());
+
+  int32 input_beta_multiplier;
+  int input_beta_left_shift;
+  int32 reverse_scaling_divisor;
+  int reverse_scaling_right_shift;
+  static const int kScaledDiffIntegerBits = 5;
+  tflite::PreprocessLogSoftmaxScalingExp(
+      beta, input_scale, kScaledDiffIntegerBits, &input_beta_multiplier,
+      &input_beta_left_shift, &reverse_scaling_divisor,
+      &reverse_scaling_right_shift);
+  reverse_scaling_right_shift *= -1;
+  // diff_min has a negative value, and is used to limit the maximum magnitude
+  // of the diffs, which are <= 0.
+  const int diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                                     input_beta_left_shift);
+
+  const int outer_size =
+      shape_common.Dims(0) * shape_common.Dims(1) * shape_common.Dims(2);
+  const int inner_size = shape_common.Dims(3);
+  reference_integer_ops::LogSoftmax(
+      input_beta_multiplier, input_beta_left_shift, reverse_scaling_divisor,
+      reverse_scaling_right_shift, diff_min, outer_size, inner_size, input_data,
+      quantized_logsoftmax_reference_implementation.data());
+
+  CheckOutputData<int8_t>(quantized_logsoftmax_reference_implementation.data(),
+                          float_logsoftmax_optimized_implementation.data(),
+                          shape_common, "Quant reference vs float reference",
+                          false);
 }
 
 // This function picks some random LogSoftmax params, which are checked for
@@ -161,6 +241,7 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
 // to loop until a test has been run.
 //
 // Currently we do not reject for any reason.
+template <typename T>
 bool TryOneUniformLogSoftmax() {
   // We pick mostly positive values, on the whole emphasizing smaller values and
   // therefore faster tests.  We test a wider range of depths.  In the case of
@@ -178,7 +259,7 @@ bool TryOneUniformLogSoftmax() {
       RuntimeShape({batch, input_height, input_width, input_depth});
   const int buffer_size = shape_common.FlatSize();
 
-  std::vector<uint8> input_data(buffer_size);
+  std::vector<T> input_data(buffer_size);
   FillRandom(&input_data);
   RunOneLogSoftmaxTest(input_data.data(), shape_common, input_offset,
                        input_scale, stride, beta);
@@ -224,15 +305,23 @@ bool TryOneSkyscraperLogSoftmax(bool small_depth) {
   return true;
 }
 
-TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Tests) {
+  const int kTestsToRun = 100;
+  for (int i = 0; i < kTestsToRun; i++) {
+    while (!TryOneUniformLogSoftmax<uint8_t>()) {
+    }
+  }
+}
+
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Int8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    while (!TryOneUniformLogSoftmax()) {
+    while (!TryOneUniformLogSoftmax<int8_t>()) {
     }
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxUint8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(false)) {
@@ -240,7 +329,7 @@ TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) {
+TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxUint8Tests) {
   const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(true)) {
diff --git a/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
index 9748da39862edd7565fdb2bcce2ce92b9d767429..4f22517866eba964e5c51406022c377951c22252 100644
--- a/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
+++ b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
@@ -197,7 +197,7 @@ void MfccMelFilterbank::Compute(const std::vector<double> &input,
 }
 
 double MfccMelFilterbank::FreqToMel(double freq) const {
-  return 1127.0 * log(1.0 + (freq / 700.0));
+  return 1127.0 * log1p(freq / 700.0);
 }
 
 }  // namespace internal
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index d3dca799a7cca4a3048cd2d19477ba2b57fbcdac..d1a9d65aae825796ac390bd4682874ec23990a0c 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -19,10 +19,12 @@ limitations under the License.
 #include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
 
 // Implementation of quantized DepthwiseConv
 
@@ -1945,7 +1947,10 @@ inline void DepthwiseConvGeneral(
   }
 }
 
-inline void DepthwiseConv(
+}  // namespace depthwise_conv
+
+template <DepthwiseConvOutputRounding kOutputRounding>
+inline void DepthwiseConvWithRounding(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
     const uint8* filter_data, const RuntimeShape& bias_shape,
@@ -1979,20 +1984,34 @@ inline void DepthwiseConv(
 
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3x3FilterKernelSupported(
+  if (depthwise_conv::Fast3x3FilterKernelSupported(
           input_shape, filter_shape, stride_width, stride_height,
           dilation_width_factor, dilation_height_factor, pad_width, pad_height,
           depth_multiplier, output_shape, output_shift)) {
-    DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
-                           filter_data, bias_shape, bias_data, output_shape,
-                           output_data);
+    gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
+    depthwise_conv::DepthwiseConv3x3Filter(
+        params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+        bias_data, output_shape, output_data);
     return;
   }
 #endif
 
-  DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
-                       filter_data, bias_shape, bias_data, output_shape,
-                       output_data);
+  gemmlowp::ScopedProfilingLabel specialized_label(
+      "DepthwiseConv/8bit/General");
+  depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
+                                       filter_shape, filter_data, bias_shape,
+                                       bias_data, output_shape, output_data);
+}
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
+      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+      bias_data, output_shape, output_data);
 }
 
 }  // namespace optimized_ops
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 5859bcaed4ac2b991ca22e7d9c17d34d3267a120..50de905db1754db1c98b3dd721a427134dfaea3b 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,58 +15,263 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
 
+#include <memory>
+
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
+
+constexpr int kDepthwiseConvScratchWorkspaceSize = 10 * 10 * 64;
+constexpr int kDepthwiseConvAdjustedBiasLimit = 64;
+// In cases such as depth multiplication, we want to be able to load data from
+// the workspace that is beyond the valid range. Macro-block sizes are adjusted
+// to allow for this.
+constexpr int kWorkspaceExtension = 16;
+
+#ifdef USE_NEON
+// Lane operations are for clarity and convenience. We want to load and store
+// 4 8-bit lanes together. So these are treated much like 32-bit loads and
+// 32-bit stores. Stores require 32-bit alignment.
+
+#define vst1_lane_8x4(dst, reg, lane_num)                         \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+#define vst1q_lane_8x4(dst, reg, lane_num)                        \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+
+#define vld1q_lane_s8x8(src, reg, lane_num) \
+  vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
+#define vld1_lane_8x4(src, reg, lane_num) \
+  vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_lane_8x4(src, reg, lane_num) \
+  vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
+
+#ifndef __aarch64__
+inline int8x16_t vqtbl4q_s8(int8x16x4_t a, uint8x16_t b) {
+  const uint8x16_t mask = vtstq_u8(b, vdupq_n_u8(8));
+
+  // Delete bit 3 from the indices.
+  const uint8x16_t high_bits = vshrq_n_u8(b, 4);
+  uint8x16_t deleted_bit_3 = b;
+  deleted_bit_3 = vsliq_n_u8(deleted_bit_3, high_bits, 3);
+
+  int8x8x4_t repacked_data;
+
+  // Calculate for lower indices.
+  repacked_data.val[0] = vget_low_u8(a.val[0]);
+  repacked_data.val[1] = vget_low_u8(a.val[1]);
+  repacked_data.val[2] = vget_low_u8(a.val[2]);
+  repacked_data.val[3] = vget_low_u8(a.val[3]);
+  const int8x16_t output_for_lower =
+      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
+
+  // Calculate for high indices.
+  repacked_data.val[0] = vget_high_u8(a.val[0]);
+  repacked_data.val[1] = vget_high_u8(a.val[1]);
+  repacked_data.val[2] = vget_high_u8(a.val[2]);
+  repacked_data.val[3] = vget_high_u8(a.val[3]);
+  const int8x16_t output_for_higher =
+      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
+
+  // Merge.
+  int8x16_t output = mask;
+  output = vbslq_u8(output, output_for_higher, output_for_lower);
+  return output;
+}
+#endif  // !__aarch64__
+
+// Convenience-compatibility functions.
+// Compatibility: Intrinsics reflect a mixture of older and newer ARM
+//     instructions. This actually results in ZIP1 / ZIP2 asm instructions, but
+//     one intrinsic is provided. Also older instructions operated in place,
+//     and it seems more defensive to assume that some versions of intrinsics
+//     might reflect this
+// Convenience: Callers in these kernels want both ZIP1 and ZIP2, and we do not
+//     want the calling code to get cluttered with unpacking int8x16x2_t.
+inline void vzipq_s8_in_place(int8x16_t* a, int8x16_t* b) {
+  int8x16x2_t r8x16;
+  r8x16 = vzipq_s8(*a, *b);
+  *a = r8x16.val[0];
+  *b = r8x16.val[1];
+}
+
+inline void vzipq_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vzipq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+  *b = vreinterpretq_s8_s16(r16x8.val[1]);
+}
+
+// Similar rationale to the zip-in_place functions, but callers only actually
+// need the TRN1 asm instruction result.
+inline void vtrn1_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vtrnq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+}
+
+inline void biregister_rotate_8(int8x16_t* left, int8x16_t* right) {
+  *left = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*left), 8));
+  *left = vreinterpretq_s8_u32(vsliq_n_u32(vreinterpretq_u32_s8(*left),
+                                           vreinterpretq_u32_s8(*right), 24));
+  *right = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*right), 8));
+}
+
+#ifndef __aarch64__
+inline int32x4_t vpaddq_s32(int32x4_t a, int8x16_t b) {
+  int32x4x2_t deinterleaved = vuzpq_s32(a, b);
+  return vqaddq_s32(deinterleaved.val[0], deinterleaved.val[1]);
+}
+#endif  // !__aarch64__
+
+#ifdef __ARM_FEATURE_DOTPROD
+// The vdotq_lane_s32 takes int8x8t for the rhs parameter, whereas the actual
+// instruction selects from between 4 32-bit (4x8-bit packed) sub-registers, an
+// unusual interpretation of "lane".
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, const int lane) {
+  switch (lane) {
+    case 0:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);
+    case 1:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);
+    case 2:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            0);
+    case 3:
+    default:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            1);
+  }
+}
+
+#else
+
+inline int32x4_t vdotq_s32(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, int lane) {
+  int8x8_t lane_rhs;
+  if (lane == 0) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));
+  } else if (lane == 1) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 1));
+  } else if (lane == 2) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 0));
+  } else {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 1));
+  }
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), lane_rhs));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), lane_rhs));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+#endif  // !DOTPROD
+#endif  // ARM NEON
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DivideByPOT {};
+
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kAwayFromZero> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return RoundingDivideByPOT(x, exponent);
+  }
+};
+
+#ifdef USE_NEON
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kUpward> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return vqrshlq_s32(x, vdupq_n_s32(static_cast<int32>(-exponent)));
+  }
+};
+#endif  // ARM NEON
 
 // See CategorizeDotProductKernel for definitive taxonomy.
 enum class DotProduct3x3KernelType {
   kNone = 0,  // Parameter combination is not supported for dot product kernels.
   kPlain,
-  kWithDepthMultiplication,
-  kWithPad0Stride2,
-  kWithPad1Stride1,
+  kWithDepthMultiplicationStride1,
+  kWithDepthMultiplicationStride2,
+  kStride2,
 };
 
 inline DotProduct3x3KernelType CategorizeDotProductKernel(
+    const RuntimeShape& input_shape, const RuntimeShape& filter_shape,
     const DepthwiseParams& params) {
-  const int padding = params.padding_values.width;
+  constexpr int kSymmetricZeroPoint = 128;
+  const int padding =
+      std::max(params.padding_values.width, params.padding_values.height);
   const int stride = params.stride_width;
-  if (padding != params.padding_values.height ||
-      stride != params.stride_height) {
+  const int32 input_depth = input_shape.Dims(3);
+  const int32 depth_multiplier = params.depth_multiplier;
+  const int32 filter_height = filter_shape.Dims(1);
+  const int32 filter_width = filter_shape.Dims(2);
+
+  bool supported =
+      params.weights_offset == -kSymmetricZeroPoint &&
+      stride == params.stride_height && stride <= 2 && padding <= 1 &&
+      filter_width == 3 && filter_height == 3 && params.output_shift <= 0 &&
+      params.dilation_width_factor == 1 && params.dilation_height_factor == 1 &&
+      (((input_depth % 8) == 0 && depth_multiplier == 1) ||
+       (input_depth == 1 && depth_multiplier > 1));
+
+  if (!supported) {
     return DotProduct3x3KernelType::kNone;
   }
 
   if (params.depth_multiplier == 1) {
-    if (padding == 0 && stride == 1) {
+    if (stride == 1) {
       return DotProduct3x3KernelType::kPlain;
-    } else if (padding == 0 && stride == 2) {
-      return DotProduct3x3KernelType::kWithPad0Stride2;
-    } else if (padding == 1 && stride == 1) {
-      return DotProduct3x3KernelType::kWithPad1Stride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   } else {
-    if (padding == 0 && stride == 1) {
-      return DotProduct3x3KernelType::kWithDepthMultiplication;
+    if (stride == 1) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   }
 }
 
+#ifdef USE_NEON
+
+#define STR(s) STR_UNEXPANDED(s)
+#define STR_UNEXPANDED(s) #s
+
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 #include <stddef.h>
 
-#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
-
 // Encapsulates constant parameters used in DepthwiseConv.
 // 64-bit is used for types that will be added to 64-bit addresses in asm.
 struct DepthwiseConvParams {
@@ -90,9 +295,6 @@ struct DepthwiseConvParams {
   int32 output_height;
 };
 
-#define STR(s) STR_UNEXPANDED(s)
-#define STR_UNEXPANDED(s) #s
-
 // Represents the number of bytes offset from the start of the
 // DepthwiseConvParams struct. This is used in the asm to load parameters.
 // Keep these values in sync with the static_asserts below.
@@ -167,7 +369,49 @@ static_assert(offsetof(DepthwiseConvParams, output_width) ==
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
                   OFFSET_OUTPUT_HEIGHT,
               "");
+#endif  // __aarch64__
 
+#endif  // ARM NEON
+
+// Encapsulates constant parameters used in DepthwiseConv using dot-product ops.
+// 64-bit is used for types that will be added to 64-bit addresses in asm.
+//
+// This structure is specifically designed for use in asm.
+struct DepthwiseConvDotProdParams {
+  int64_t input_depth;
+  int64_t output_depth;
+  int32 workspace_height_stride;
+  int32 input_width_overall_micro_repeats;
+  int32 input_width_micro_repeats;
+  int32 depth_micro_repeats;
+  int32 inbound_block_height;
+  int32 residual_width;
+  int32 input_height_stride;
+  int32 stride;
+  int32 output_width_overall_micro_repeats;
+  int32 output_width_micro_repeats;
+  int32 output_residual_width;
+  int32 output_height_stride;
+  int32 bias_increment;
+  int32 padding_left;
+  int32 padding_right;
+  int32 padding_top;
+  int32 padding_bottom;
+  int32 height_macro_count;
+  int32 width_macro_count;
+  int32 outbound_block_height;
+  int32 workspace_width_micro_repeats;
+  int32 input_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int32 output_shift;
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  int32 four_over_stride;
+};
+
+#ifdef USE_NEON
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
 
@@ -229,7 +473,8 @@ struct DepthwiseConvWindow<8, 1, 1> {
         // Set "constant" registers. These registers may be replaced with temp
         // values from time to time when there are not enough NEON registers.
         // We use x9--x15 general purpose registers as they are caller-saved
-        // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
+        // temporary registers (see
+        // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
         "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
         "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
         "cmp %w[output_window_height], #2\n"
@@ -2963,8 +3208,6 @@ struct DepthwiseConvPartial<EdgeType::kVertical, 1, 1> {
 #undef OFFSET_INPUT_HEIGHT
 #undef OFFSET_OUTPUT_WIDTH
 #undef OFFSET_OUTPUT_HEIGHT
-#undef STR
-#undef STR_UNEXPANDED
 
 // Copies a subset of the input designated by |input_ptr| into |output_ptr|
 // with the specified output dimensions. Supports output depths of 64 only as
@@ -3045,9 +3288,9 @@ struct DepthwiseConvMultiRow {
     TFLITE_DCHECK(
         shuffle_params.input_width ==
         get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
-    TFLITE_DCHECK(64 * shuffle_params.input_width *
-                      shuffle_params.input_height <=
-                  DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+    TFLITE_DCHECK_LE(
+        64 * shuffle_params.input_width * shuffle_params.input_height,
+        kDepthwiseConvScratchWorkspaceSize);
 
     int32 out_x = start_x;
 
@@ -3375,7 +3618,7 @@ inline void DepthwiseConv3x3Filter(
   // allocated on the stack. Eventually we will want to move it to the heap
   // and have it allocated outside of this function, like the im2col_array
   // used in gemmlowp.
-  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
+  uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
 
   for (int32 b = 0; b < batches; ++b) {
     const uint8* input_ptr = input_data + b * input_batch_size;
@@ -3454,9 +3697,3840 @@ inline void DepthwiseConv3x3Filter(
     }
   }
 }
-
 #endif  // __aarch64__
 
+#endif
+
+// Permute filter data, and adjust bias data to account for symmetric input
+// offset. Details are provided in the implementation of the
+// kUseCModel3x3DotProduct version.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation>
+struct ProcessPerDepth {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Copy a macro block of data from the input buffer into the workspace,
+// permuting data within each micro block.
+//
+// (a) Copy a macro block of data, padding as required along the width and
+//     height.
+// (b) Transpose the data within each micro block.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication,
+          int32 max_padding>
+struct PackMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Apply filter to macro block of input data and store results. Details are
+// provided in the implementation of the kUseCModel3x3DotProduct version.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication, int32 stride>
+struct KernelMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+#if defined(USE_NEON) && defined(__aarch64__)
+// Experiments suggest that a modest performance improvement is seen, at least
+// on 855 chipset big cores, with cache hints.
+inline void PreloadInputBlock(
+    const uint8* input_block_data,
+    const DepthwiseConvDotProdParams* function_params) {
+  // Preload.
+  const int input_width_micro_repeats =
+      function_params->input_width_micro_repeats;
+  const int block_height = function_params->inbound_block_height;
+  const int residual_width = function_params->residual_width;
+  const int input_height_stride = function_params->input_height_stride;
+  const int input_depth = function_params->input_depth;
+
+  {
+    const int total_width = 4 * input_width_micro_repeats + residual_width;
+    const uint8* row_ptr = input_block_data;
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* ptr = row_ptr;
+      for (int j = 0; j < total_width; ++j) {
+        // Input data is loaded once.
+        asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+        ptr += input_depth;
+      }
+      row_ptr += input_height_stride;
+    }
+  }
+}
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
+  static void ProcessPerDepthIntrinsics(
+      const uint8* filter_data, const int32* bias_data,
+      int8* shuffled_filter_data, int32* adjusted_bias_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+
+    constexpr int kSymmetricZeroPoint = 128;
+    constexpr uint8 kSignBit = 0x80;
+    const int32 input_offset = function_params->input_offset;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+    const int8x16_t ones_vector = vdupq_n_s8(1);
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8x16_t filter_reg_0_a;
+    int8x16_t filter_reg_0_b;
+    int8x16_t filter_reg_1_a;
+    int8x16_t filter_reg_1_b;
+    int8x16_t filter_reg_2_a;
+    int8x16_t filter_reg_2_b;
+
+    // Register pairs for each height.
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    const uint8* filter_block = filter_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Filter data is provided as filter_block[3][3][depth/8][2][4].
+      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
+      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
+
+      // Load zero-point into effective position of zero-padding of filter
+      // (register B, upper part).
+      filter_reg_0_b = vdupq_n_u8(kSignBit);
+      filter_reg_1_b = vdupq_n_u8(kSignBit);
+      filter_reg_2_b = vdupq_n_u8(kSignBit);
+
+      const uint8* filter_block_ptr = filter_block;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 1);
+
+      filter_reg_0_a = veorq_s8(filter_reg_0_a, sign_bit);
+      filter_reg_0_b = veorq_s8(filter_reg_0_b, sign_bit);
+      filter_reg_1_a = veorq_s8(filter_reg_1_a, sign_bit);
+      filter_reg_1_b = veorq_s8(filter_reg_1_b, sign_bit);
+      filter_reg_2_a = veorq_s8(filter_reg_2_a, sign_bit);
+      filter_reg_2_b = veorq_s8(filter_reg_2_b, sign_bit);
+
+      vzipq_s8_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8_in_place(&filter_reg_2_a, &filter_reg_2_b);
+      vzipq_s8x2_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8x2_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8x2_in_place(&filter_reg_2_a, &filter_reg_2_b);
+
+      vst1q_s8(shuffled_filter_data, filter_reg_0_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_0_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_b);
+      shuffled_filter_data += 16;
+
+      int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      // For instance, if input_offset == 128, no adjustment is needed.
+
+      int32x4_t filter_sum_a = vdupq_n_s32(0);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_0_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_1_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_2_a, ones_vector);
+      int32x4_t filter_sum_b = vdupq_n_s32(0);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_0_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_1_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_2_b, ones_vector);
+
+      adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
+                                         input_offset_difference);
+      adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
+                                         input_offset_difference);
+
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
+      adjusted_bias_data += 4;
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
+      adjusted_bias_data += 4;
+
+      filter_block += 8;
+    }
+  }
+
+  static inline void Run(const uint8* filter_data, const int32* bias_data,
+                         int8* shuffled_filter_data, int32* adjusted_bias_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
+                              adjusted_bias_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockNeon(
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    static const uint8 perm_data[64] = {
+        0,  16, 32, 48, 1,  17, 33, 49, 2,  18, 34, 50, 3,  19, 35, 51,  //
+        4,  20, 36, 52, 5,  21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
+        8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+        12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
+    constexpr uint8 kSignBit = 0x80;
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+    const int8x16_t perm_data_0 = vld1q_u8(perm_data);
+    const int8x16_t perm_data_1 = vld1q_u8(perm_data + 16);
+    const int8x16_t perm_data_2 = vld1q_u8(perm_data + 32);
+    const int8x16_t perm_data_3 = vld1q_u8(perm_data + 48);
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        int i_depth = 0;
+        for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+          int8x16x4_t input_data;
+          input_data.val[0] = vld1q_u8(input_data_0);
+          input_data.val[1] = vld1q_u8(input_data_1);
+          input_data.val[2] = vld1q_u8(input_data_2);
+          input_data.val[3] = vld1q_u8(input_data_3);
+          input_data_1 += 16;
+          input_data_0 += 16;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 16;
+          input_data_3 += 16;
+
+          tmp_0 = vqtbl4q_s8(input_data, perm_data_2);
+          tmp_1 = vqtbl4q_s8(input_data, perm_data_3);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+        }
+        for (; i_depth < depth_micro_repeats; ++i_depth) {
+          int8x16x4_t input_data;
+          input_data.val[0] =
+              vld1q_lane_s8x8(input_data_0, input_data.val[0], 0);
+          input_data.val[1] =
+              vld1q_lane_s8x8(input_data_1, input_data.val[1], 0);
+          input_data.val[2] =
+              vld1q_lane_s8x8(input_data_2, input_data.val[2], 0);
+          input_data.val[3] =
+              vld1q_lane_s8x8(input_data_3, input_data.val[3], 0);
+          input_data_1 += 8;
+          input_data_0 += 8;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      if (width_overall_micro_repeats > input_width_micro_repeats) {
+        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
+                         input_width_micro_repeats + 1);
+        TFLITE_DCHECK_GT(residual_width, 0);
+        TFLITE_DCHECK_LT(residual_width, 4);
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          work_reg_a = vdupq_n_u8(kSignBit);
+          work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+          work_reg_b = vdupq_n_u8(kSignBit);
+          if (residual_width > 1) {
+            work_reg_b =
+                vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+            if (residual_width == 3) {
+              work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                           work_reg_a, 1);
+            }
+          }
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+          vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+          vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_0 += 8;
+          input_data_1 += 8;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    constexpr uint8 kSignBit = 0x80;
+
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+    constexpr int kSymmetricZeroPoint = 128;
+
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs zero-ed).
+        int adjusted_residual_width =
+            j_width == (input_width_micro_repeats) ? residual_width : 4;
+
+        if (trailing_width_padding &&
+            j_width == (width_overall_micro_repeats - 1)) {
+          adjusted_residual_width -= 1;
+        }
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+        }
+        if (start_width == 0) {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 0) {
+                work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+                if (adjusted_residual_width > 1) {
+                  work_reg_b = vld1q_lane_s8x8(input_data_0 + input_depth,
+                                               work_reg_b, 0);
+                  if (adjusted_residual_width == 3) {
+                    work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                                 work_reg_a, 1);
+                  }
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        } else {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              // Skip loading first column.
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              // Skip loading first column.
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 1) {
+                work_reg_b =
+                    vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+                if (adjusted_residual_width == 3) {
+                  work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                               work_reg_a, 1);
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        }
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+    int8x8_t padding_mask;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
+    padding_mask = vdup_n_s8(-1);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = (copy_size + start_width) & 0x7;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          work_reg = vld1q_u8(input_block_data + input_block_offset);
+          work_reg = vextq_s8(padding_reg, work_reg, 15);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          vst1q_s8(scratch_data, work_reg);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          vst1q_s8(scratch_data + start_width + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
+                                        half_work_reg, 0);
+          half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          vst1_lane_8x4(scratch_data, half_work_reg, 0);
+          copy_done += 3;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+          // lane-loading, with a obscure bug or mis-feature probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      // Special case of 1 + 3 + 1, padding + copy + padding.
+      // This is rarely executed in practice.
+      TFLITE_DCHECK_EQ(copy_size, 3);
+      TFLITE_DCHECK_EQ(start_width, 1);
+      TFLITE_DCHECK(leading_width_padding);
+      TFLITE_DCHECK(trailing_width_padding);
+      // ASM should use MOVI 64-bit set.
+      padding_mask = vcreate_u64(~0xffffff00L);
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
+                                         input_block_data + input_block_offset),
+                                     half_work_reg, 1);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 1),
+                         half_work_reg, 2);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 2),
+                         half_work_reg, 3);
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
+        vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      if (leading_width_padding) {
+        padding_mask = vset_lane_u8(255, padding_mask, 0);
+      }
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+        if (leading_width_padding) {
+          half_work_reg = vshl_n_s64(half_work_reg, 8);
+        }
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    const int copy_block_height = block_height;
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    const int copy_size =
+        (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = copy_size & 0x7;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ(copy_done % 16, 0);
+          vst1q_s8(scratch_data + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + copy_done, half_work_reg);
+        vst1_s8(scratch_data + copy_done + 8, half_work_reg);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = copy_size & 0x3;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+          // lane-loading, with a obscure bug or mis-feature probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        /*stride=*/1> {
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    const int8* input_data_depthwise = scratch_block_data;
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = input_data_base;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(next_input_data + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(next_input_data + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(next_input_data + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(next_input_data + 4 * workspace_height_stride);
+          int8x16_t left_bank_5_reg =
+              vld1q_s8(next_input_data + 5 * workspace_height_stride);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+          acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+          acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+          acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += width_micro_stride;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (i_width == output_width_micro_repeats) &&
+            //        ((residual_width - 1) * stride_val < 2)
+            const bool no_right_block =
+                i_width == output_width_micro_repeats && residual_width < 3;
+
+            if (no_right_block) {
+              // Only needed for santizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+              left_bank_5_reg = right_bank_5_reg;
+
+              output_data += depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += width_micro_stride;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (output_width - 1) * stride_val < 2.
+            const bool no_right_block = output_width < 3;
+
+            if (no_right_block) {
+              // Only needed for santizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+              biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+              biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+              biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
+              biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
+              biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+          input_data_base += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data = input_data_base;
+            uint8* output_data = output_data_base;
+
+            // Load first sub-micro block of data into operational banks.
+            int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+            int8x16_t left_bank_1_reg =
+                vld1q_s8(next_input_data + workspace_height_stride);
+            int8x16_t left_bank_2_reg =
+                vld1q_s8(next_input_data + 2 * workspace_height_stride);
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += width_micro_stride;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              int8x16_t right_bank_0_reg;
+              int8x16_t right_bank_1_reg;
+              int8x16_t right_bank_2_reg;
+              // Logic: (output_width - 1) * stride_val < 2.
+              const bool no_right_block = output_width < 3;
+
+              if (no_right_block) {
+                // Only needed for santizer checks.
+                right_bank_0_reg = vdupq_n_s8(0);
+                right_bank_1_reg = vdupq_n_s8(0);
+                right_bank_2_reg = vdupq_n_s8(0);
+              } else {
+                right_bank_0_reg = vld1q_s8(next_input_data);
+                right_bank_1_reg =
+                    vld1q_s8(next_input_data + workspace_height_stride);
+                right_bank_2_reg =
+                    vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              }
+              // Load next sub-micro block of data.
+
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_s32(acc, filter_reg_0_a, left_bank_0_reg);
+                acc = vdotq_s32(acc, filter_reg_1_a, left_bank_1_reg);
+                acc = vdotq_s32(acc, filter_reg_2_a, left_bank_2_reg);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+                biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+                biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+
+                output_data += depth;
+              }
+            }
+            input_data_base += workspace_height_stride;
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      input_data_depthwise += depth_micro_stride;
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        /*stride=*/2> {
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    // This version only does min/max on 64 bits.
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x8_t output_activation_min_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x8_t output_activation_max_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_max));
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    TFLITE_DCHECK_EQ(stride_val, 2);
+    TFLITE_DCHECK_LE(block_height, 2);
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const int8* filter_block =
+          filter_workspace + shuffled_filter_increment * j_depth;
+
+      if (block_height == 2) {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(input_data_0 + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(input_data_0 + 4 * workspace_height_stride);
+
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+          int8x16_t right_bank_3_reg;
+          int8x16_t right_bank_4_reg;
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+              right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
+                                          3 * workspace_height_stride);
+              right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
+                                          4 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + depth + output_height_stride,
+                            acc_u8, 1);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+
+          int32x4_t acc0;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      }
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        /*stride=*/1> {
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->stride, 1);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = scratch_block_data;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+          int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+          int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
+
+          // Load first sub-micro block of data into operational banks.
+          input_bank_a_reg =
+              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                // uninitialized variable.
+          input_bank_a_reg = vld1q_lane_8x4(
+              next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+          input_bank_b_reg = vld1q_dup_s8x4(
+              next_input_data +
+              2 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_b_reg =
+              vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                             input_bank_b_reg, 2);
+          input_bank_c_reg = vld1q_dup_s8x4(
+              next_input_data +
+              4 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_c_reg =
+              vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                             input_bank_c_reg, 2);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
+          acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
+          acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
+
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += 4;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += 4;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
+
+              output_data += output_depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+          // scratch_block_data += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        // Block height < 4.
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data =
+                scratch_block_data + k_height * workspace_height_stride;
+            uint8* output_data = output_data_base;
+
+            int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+            int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+
+            // Load first sub-micro block of data into operational banks.
+            input_bank_a_reg =
+                vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                  // uninitialized variable.
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+            input_bank_b_reg = vld1q_dup_s8x4(
+                next_input_data +
+                2 * workspace_height_stride);  // Load lane 0, avoiding
+                                               // uninitialized variable.
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += 4;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data + workspace_height_stride,
+                                 input_bank_a_reg, 3);
+              input_bank_b_reg =
+                  vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                                 input_bank_b_reg, 1);
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_four_lane_s32(acc, filter_reg_0_a, input_bank_a_reg,
+                                          0);
+                acc = vdotq_four_lane_s32(acc, filter_reg_1_a, input_bank_a_reg,
+                                          2);
+                acc = vdotq_four_lane_s32(acc, filter_reg_2_a, input_bank_b_reg,
+                                          0);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+                input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+
+                output_data += output_depth;
+              }
+            }
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        /*stride=*/2> {
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    for (int j_depth = 0; j_depth < (depth_micro_repeats * 1 + 0); ++j_depth) {
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      TFLITE_DCHECK_EQ(bias_increment, 4);
+      const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+
+      if (block_height == 2) {
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+        int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_b_reg = vld1q_lane_8x4(
+            scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
+        input_bank_c_reg = vld1q_dup_s8x4(
+            scratch_data +
+            4 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
+          input_bank_c_reg = vld1q_lane_8x4(
+              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      } else {
+        TFLITE_DCHECK_EQ(block_height, 1);
+        // Work through one slice, by row, at a time.
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        //
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width =
+              i_width == output_width_micro_repeats ? residual_width : 2;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      }
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+#endif  // USE_NEON && __aarch64__
+
+// Top-level implementation function for 3x3 depthwise convolution using NEON
+// dot-product instructions.
+//
+// MACRO & MICRO BLOCKS
+//
+// The task is divided into macro blocks. Data is copied first into a macro
+// block in a workspace. This has two purposes: (a) bringing data into
+// cache, and (b) permuting data so that it can be used much more easily in
+// a dot-product filter.
+//
+// When there is no depth multiplication:
+//
+// The permutations required for dot-products are local, within 4 data points
+// down the depth and 4 across the width. We want to pull in input data at least
+// 8-bytes at a time, down the depth, and so we divide the macro blocks into
+// 1x4x8 (height, width, depth) and further divide the micro blocks into
+// sub-blocks with shape (1x4x4).
+//
+// Each macro-block is constructed from micro-blocks that are internally
+// rearranged during loading into the macro-block workspace.
+//
+// In other words, the micro-block shape is
+//     {1, 1, 4, 8}
+// Each macro block is typically shape
+//     {1, height_block_size, 4 * workspace_width_micro_repeats, 64}
+// and workspace_width_micro_repeats is chosen so it fits into the workspace.
+//
+// However, if depth < 64, we decrease the macro block depth, enabling us to
+// increase the macro-block width.
+//
+// When there is depth multiplication:
+//
+// We require input-depth = 1 and exploit that instead.  Note that output data
+// is still full-depth, *as is the filter and bias data after certain
+// adjustments*, and so the filter stage in this case still proceeds in terms of
+// sub-blocks.
+//
+// The Magic of these numbers:
+//     4 is the number of input elements used in each dot-product.
+//     8 is the number of inputs we load at a time into a register.
+//     64 is min amount of data to be loaded in a stretch (when possible).
+//
+// FILTER DATA PREPARATION
+//
+// Filter data needs to be permuted in a fashion like that of input data, and
+// this is done in a preprocessing stage. In addition, this stage extends the
+// filter in the direction of width from 3 to 4. The extra filter taps are set
+// to zero so that input data does not have to be zeroed before applying
+// dot-products.
+//
+// OVERALL COUNTS: HANDLING TRAILING ITERATION
+//
+// Often it is necessary to handle the last iteration in a loop differently,
+// generally because the final item is shorter. The logic to detect the
+// special case can be a bit expensive. We use a scheme in which there are
+// two counts, in a pattern like xxx_yyy_repeats and
+// xxx_overall_yyy_repeats. The first gives the count of "normal"
+// iterations. The loop iterates over the second count, and the induction
+// variable is checked to see if it reaches xxx_yyy_repeats. If there is no
+// special trailing iteration, xxx_yyy_repeats = xxx_overall_yyy_repeats,
+// and the special code is not executed.
+//
+// Example:
+// Suppose that we characterize a size s as
+// f(s) -> (block-4-repetitions, remainder, overall_repetitions):
+// f(11) -> (2, 3, 3)
+// f(12) -> (3, 0, 3)
+// f(13) -> (3, 1, 4)
+//
+// POINTING OUTSIDE OF INPUT ARRAY.
+//
+// When there is padding, the input data pointer passed to the fill routines
+// points outside of the input array and into a kind-of virtual padded
+// margin. It turns out that this simplifies the code and removes
+// conditional statements. It is hard to explain why without comparing two
+// versions of the code. In summary, this way the adjustment into the margin
+// can be made unconditionally, and the correction back into the input array
+// is done where there is a conditional already.
+//
+// OVERLAP
+//
+// Since this is *depthwise* conv, neither the batch nor the depth have overlap.
+// The height and depth overlap by (filter_size - 1). Thus some data is used
+// twice on the borders of macro blocks.
+//
+template <DepthwiseConvImplementation implementation>
+inline void DepthwiseConvDotProduct3x3(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  // Check kernel restrictions.
+  constexpr int filter_size = 3;
+  constexpr int kMaxStride = 2;
+  constexpr int kMaxPadding = 1;
+  constexpr int kSymmetricZeroPoint = 128;
+  TFLITE_DCHECK_EQ(params.weights_offset, -kSymmetricZeroPoint);
+  TFLITE_DCHECK_LE(params.stride_width, kMaxStride);
+  TFLITE_DCHECK_EQ(params.stride_height, params.stride_width);
+  TFLITE_DCHECK_EQ(params.dilation_width_factor, 1);
+  TFLITE_DCHECK_EQ(params.dilation_height_factor, 1);
+  TFLITE_DCHECK_LE(params.padding_values.width, kMaxPadding);
+  TFLITE_DCHECK_LE(params.padding_values.height, kMaxPadding);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  // Key kernel parameters (along with padding handled later).
+  const int stride = params.stride_width;
+  const int depth_multiplier = params.depth_multiplier;
+  const bool has_depth_multiplication = depth_multiplier > 1;
+
+  // Extract task dimensions.
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  TFLITE_DCHECK(!has_depth_multiplication || input_depth == 1);
+  TFLITE_DCHECK(has_depth_multiplication || input_depth == output_depth);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  TFLITE_DCHECK_EQ(input_depth * depth_multiplier, output_depth);
+  TFLITE_DCHECK_EQ(MatchingDim(filter_shape, 1, filter_shape, 2), filter_size);
+
+  // Return now if nothing to do.
+  if (output_width == 0 || output_height == 0) {
+    return;
+  }
+
+  // Kernel parameter structure: set basic fields.
+  //
+  // In asm it is easier to pass a structure than more than, say, 8 parameters.
+  DepthwiseConvDotProdParams function_params;
+  function_params.input_depth = input_depth;
+  function_params.output_depth = output_depth;
+  function_params.input_offset = params.input_offset;
+  function_params.output_offset = params.output_offset;
+  function_params.output_multiplier = params.output_multiplier;
+  function_params.output_shift = params.output_shift;
+  function_params.quantized_activation_min = params.quantized_activation_min;
+  function_params.quantized_activation_max = params.quantized_activation_max;
+  function_params.stride = stride;
+
+  // Handle inbound bias data.
+  //
+  // Note that this data is adjusted in a per-depth process before the main
+  // filters. The adjustment accounts for a non-symmetric input offset.
+  //
+  // Kernel subroutines need to be able to operate consistently on an bias
+  // array. Where there is no bias, we provide one filled with zeros.
+  constexpr int kMinBiasLoad = 8;
+  int32 zero_bias_data[kMinBiasLoad];
+  int32 bias_increment;
+  if (bias_data) {
+    bias_increment = 4;
+  } else {
+    memset(zero_bias_data, 0, sizeof(zero_bias_data));
+    bias_data = &zero_bias_data[0];
+    bias_increment = 0;
+  }
+  function_params.bias_increment = bias_increment;
+  TFLITE_DCHECK_LE(2 * function_params.bias_increment, kMinBiasLoad);
+
+  // Process padding.
+  //
+  // Whether "correct" or not, this matches ComputeConvSizes. When there is
+  // stride > 1 there can be padding on the bottom or top, and therefore
+  // we need to consider padding. This is true even if one or other of the
+  // padding_values is 0.
+  const int padded_width = (output_width - 1) * stride + filter_size;
+  {
+    const int padding_left = params.padding_values.width;
+    // Right padding would be -1 if discarding input because of stride.
+    const int padding_right =
+        std::max(padded_width - input_width - padding_left, 0);
+    const int padding_top = params.padding_values.height;
+    const int padded_height = (output_height - 1) * stride + filter_size;
+    const int padding_bottom =
+        std::max(padded_height - input_height - padding_top, 0);
+
+    function_params.padding_left = padding_left;
+    function_params.padding_right = padding_right;
+    function_params.padding_top = padding_top;
+    function_params.padding_bottom = padding_bottom;
+
+    TFLITE_DCHECK_LE(padding_left, padding_right);
+    TFLITE_DCHECK_LE(padding_top, padding_bottom);
+  }
+  // When stride == 1 left or top padding may only be non-zero.
+  // This is when padding is specified but not needed on a trailing dimension.
+  // When stride == 2 right or bottom padding may only be non-zero.
+  // This is a result of the details of the padding calculations.
+  const bool padding_required =
+      function_params.padding_left > 0 || function_params.padding_top > 0 ||
+      function_params.padding_right > 0 || function_params.padding_bottom > 0;
+
+  // Choose parameter-specific kernel subroutines.
+  //
+  // The main part of the kernel has two stages. First, a temporary workspace is
+  // filled with padded and permuted data. Second, the filter is applied to the
+  // workspace data to generate output.
+  //
+  // The workspace fill stage handles padding so that the filter stage does not
+  // need to account for it. The workspace fill stage does not need to
+  // understand striding, and implicitly handles striding through the parameters
+  // that it is given.
+  using pack_macro_block_func_t = decltype(
+      &PackMacroBlock<implementation,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      0>::Run);
+  using kernel_macro_block_func_t = decltype(
+      &KernelMacroBlock<implementation,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        1>::Run);
+  pack_macro_block_func_t pack_macro_block_func;
+  kernel_macro_block_func_t kernel_macro_block_func;
+  {
+    if (has_depth_multiplication) {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/2>::Run;
+      }
+    } else {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/2>::Run;
+      }
+    }
+  }
+
+  // Stride-only variables.
+  //
+  // stride == 1 ? 4 : 2:
+  const int output_height_per_macro = 6 - 2 * stride;
+  // output_height_per_macro * stride:
+  constexpr int input_height_per_macro = 4;
+  // Number of rows per micro block (= rows per macro block) is
+  //   (output_height_per_macro - 1) * stride + 1 + (filter_size - 1)
+  //   = stride == 1 ? 3 + filter_size : 2 + filter_size:
+  const int height_block_size = 4 + filter_size - stride;
+  const int input_height_overlap = filter_size - stride;
+  // stride == 1 ? 4 : 2:
+  function_params.four_over_stride = output_height_per_macro;
+
+  TFLITE_DCHECK_EQ(stride * function_params.four_over_stride, 4);
+  TFLITE_DCHECK_EQ(height_block_size,
+                   input_height_per_macro + input_height_overlap);
+
+  // Create workspaces.
+  //
+  // Filter workspace is for shuffle: only first depth/8 is used.
+  // indexed as [depth/8][sub-block][height][depth][width].
+  TFLITE_DCHECK_EQ(kDepthwiseConvAdjustedBiasLimit % 8, 0);
+  int8 macroblock_workspace[kDepthwiseConvScratchWorkspaceSize];
+  int32 adjusted_bias_data[kDepthwiseConvAdjustedBiasLimit];
+  int8 filter_workspace[kDepthwiseConvAdjustedBiasLimit >> 3][3][2][4][4];
+
+  // Output depth characterization.
+  //
+  const int depth_macro_count = output_depth / 64;
+  const int depth_overall_macro_count = (output_depth + 63) / 64;
+  // Number of micro blocks down the depth in a final incomplete macro block.
+  const int depth_trailing_micro_repeats = output_depth / 8 % 8;
+  // The output_depth may not have a remainder: it must be a multiple of 8.
+  TFLITE_DCHECK_EQ(output_depth,
+                   64 * depth_macro_count + 8 * depth_trailing_micro_repeats);
+
+  // Characterize the first macro block depth, the largest.
+  //
+  // We base treatment of the width on the trailing macro block if there are
+  // no full blocks, in order to do more work together (that is, increase
+  // workspace_width_micro_repeats when largest_macro_depth < 64).
+  const int largest_macro_depth =
+      has_depth_multiplication
+          ? 1
+          : (depth_macro_count > 0 ? 64 : 8 * depth_trailing_micro_repeats);
+
+  // Characterize width, consumption of input and generation of output.
+  //
+  // In the case of depth multiplication, we ensure that some of the workspace
+  // at the end remains unused. This enables the filter routines to load the
+  // "next" data, of at least 16 bytes, even when at the end of the workspace.
+  // It is relatively expensive to detect the end micro block. It is also very
+  // difficult to test for (to trigger) erroneous reads (past end of array) in
+  // the depth multplication case.
+  int workspace_width_micro_repeats =
+      (has_depth_multiplication
+           ? kDepthwiseConvScratchWorkspaceSize - kWorkspaceExtension
+           : kDepthwiseConvScratchWorkspaceSize) /
+      (4 * largest_macro_depth * height_block_size);
+  // When there is no depth multiplication, the workspace depth is a multiple of
+  // 8, which ensures that workspace rows are 16-byte aligned. (Actually 32,
+  // because of the micro width of 4.) This is not necessarily the case under
+  // depth multiplication, so we adjust now to impose this restriction.
+  if (has_depth_multiplication) {
+    workspace_width_micro_repeats = (workspace_width_micro_repeats / 4) * 4;
+  }
+  TFLITE_DCHECK_EQ((workspace_width_micro_repeats * largest_macro_depth) % 4,
+                   0);
+  // Discount 1 of the micro-block repeats in each macro block to account for
+  // overlap.
+  const int consumed_width_per_macro_block =
+      4 * (workspace_width_micro_repeats - 1);
+  const int output_width_per_macro_block =
+      function_params.four_over_stride * (workspace_width_micro_repeats - 1);
+  TFLITE_DCHECK_GT(workspace_width_micro_repeats, 1);
+  TFLITE_DCHECK_EQ(output_width_per_macro_block * stride,
+                   consumed_width_per_macro_block);
+
+  // Width repetitions and residuals.
+  //
+  // Use of the workspace is characterized primarily in terms of *padded input*.
+  // Striding only matters in a few places.
+  //
+  // Simplifications: We require that there always be at least one full
+  // micro-block across the width. Since the maximum padding is 1, the trailing
+  // padding cannot span two micro blocks.
+  const int residual_micro_width = padded_width % 4;
+  // We base the count of macro blocks on the amount of padded input data each
+  // one consumes.
+  int width_overall_macro_count = (padded_width - residual_micro_width +
+                                   consumed_width_per_macro_block - 1) /
+                                  consumed_width_per_macro_block;
+  // Recall that we left a micro block at the end of each macro block for use as
+  // overlap. There is a special case in which we can use one fewer macro
+  // blocks, with the last one consuming extra input. (But not if the
+  // calculation thinks that we can use zero blocks.)
+  if (padded_width <=
+      ((width_overall_macro_count - 1) * consumed_width_per_macro_block + 4)) {
+    width_overall_macro_count -= 1;
+  }
+  width_overall_macro_count = std::max(width_overall_macro_count, 1);
+  // We always have to treat the final macro block along width as trailing,
+  // because even if it is full in terms of padded input, it will be incomplete
+  // in terms of output.
+  const int width_macro_count = width_overall_macro_count - 1;
+  // Micro blocks are traversed in terms of input in fill routines.
+  const int width_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count) / 4;
+  const int width_overall_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count + 3) /
+      4;
+  // Micro blocks are traversed in terms of output in filtering routines.
+  const int residual_output_micro_width =
+      (output_width - 1) % function_params.four_over_stride + 1;
+  const int output_width_trailing_micro_repeats =
+      residual_micro_width > (filter_size - 1)
+          ? width_trailing_micro_repeats
+          : width_trailing_micro_repeats - 1;
+  // Check results.
+  TFLITE_DCHECK_GT(width_overall_trailing_micro_repeats, 0);
+  TFLITE_DCHECK_EQ(padded_width,
+                   residual_micro_width +
+                       consumed_width_per_macro_block * width_macro_count +
+                       4 * width_trailing_micro_repeats);
+  TFLITE_DCHECK_LE(width_overall_macro_count, width_macro_count + 1);
+  TFLITE_DCHECK_GE(width_overall_macro_count, width_macro_count);
+
+  // Height repetitions and residuals.
+  //
+  const int height_macro_count = output_height / output_height_per_macro;
+  const int residual_output_height = output_height % output_height_per_macro;
+  const int height_overall_macro_count =
+      (output_height + output_height_per_macro - 1) / output_height_per_macro;
+  TFLITE_DCHECK_EQ(
+      output_height,
+      residual_output_height + output_height_per_macro * height_macro_count);
+  TFLITE_DCHECK_LE(height_overall_macro_count, height_macro_count + 1);
+  TFLITE_DCHECK_GE(height_overall_macro_count, height_macro_count);
+
+  // Data strides.
+  //
+  const int input_height_stride = input_width * input_depth;
+  const int output_height_stride = output_width * output_depth;
+  const int input_batch_stride = input_height_stride * input_height;
+  const int output_batch_stride = output_height_stride * output_height;
+  const int input_depth_macro_stride = has_depth_multiplication ? 0 : 64;
+  const int input_width_macro_stride =
+      input_depth * consumed_width_per_macro_block;
+  const int output_width_macro_stride =
+      output_depth * output_width_per_macro_block;
+
+  // Store parameters that do not vary across macro blocks.
+  //
+  function_params.workspace_width_micro_repeats = workspace_width_micro_repeats;
+  function_params.height_macro_count = height_overall_macro_count;
+  function_params.width_macro_count = width_overall_macro_count;
+  function_params.input_height_stride = input_height_stride;
+  function_params.output_height_stride = output_height_stride;
+  function_params.residual_width = residual_micro_width;
+
+  // Main process.
+  //
+  // Most kernels are nested batch-height-width-depth. Here we proceed over
+  // macro blocks batch-width-depth-height.
+  //
+  // Example of handling of trailing iteration: when there is trailing depth,
+  // depth_overall_macro_count = depth_macro_count + 1, so we can adjust the
+  // dimensions for trailing macro blocks by looking for
+  // j_depth == depth_macro_count.
+  for (int b = 0; b < batches; ++b) {
+    for (int k_width = 0; k_width < width_overall_macro_count; ++k_width) {
+      // Figure out the work to be done for this macro block. If it trails in
+      // any dimension, the work in that dimension is adjusted.
+      // The work to be done across widths has 3 cases:
+      // (a) A full macro block,
+      // (b) Partial terminal macro block, with input and output ending in
+      //     same micro block, and
+      // (c) Partial terminal macro block, with output corresponding to one
+      //     fewer micro blocks, because filter extends across micro-block
+      //     boundary.
+      if (k_width != width_macro_count) {
+        function_params.output_residual_width = 0;
+        function_params.input_width_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.output_width_micro_repeats =
+            workspace_width_micro_repeats - 1;
+      } else {
+        function_params.output_residual_width = residual_output_micro_width;
+        function_params.input_width_micro_repeats =
+            width_trailing_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            width_overall_trailing_micro_repeats;
+        function_params.output_width_micro_repeats =
+            output_width_trailing_micro_repeats;
+      }
+      function_params.output_width_overall_micro_repeats =
+          function_params.output_residual_width == 0
+              ? function_params.output_width_micro_repeats
+              : function_params.output_width_micro_repeats + 1;
+
+      for (int j_depth = 0; j_depth < depth_overall_macro_count; ++j_depth) {
+        const uint8* input_data_block =
+            input_data + b * input_batch_stride +
+            j_depth * input_depth_macro_stride +
+            k_width * input_width_macro_stride -
+            function_params.padding_left * input_depth -
+            function_params.padding_top * input_height_stride;
+        uint8* output_data_block = output_data + b * output_batch_stride +
+                                   j_depth * 64 +
+                                   k_width * output_width_macro_stride;
+
+        // Process filter and bias data.
+        //
+        function_params.depth_micro_repeats =
+            j_depth == depth_macro_count ? depth_trailing_micro_repeats : 8;
+        ProcessPerDepth<implementation>::Run(
+            filter_data + 64 * j_depth,
+            bias_data + 8 * 2 * bias_increment * j_depth,
+            filter_workspace[0][0][0][0], adjusted_bias_data, &function_params);
+
+        // Under depth multiplication the workspace_height_stride does not have
+        // to depend on input_width_overall_micro_repeats, but this improves the
+        // compactness of workspace use.
+        const int workspace_height_stride =
+            has_depth_multiplication
+                ? 16 * ((function_params.input_width_overall_micro_repeats +
+                         3) >>
+                        2)
+                : 4 * function_params.input_width_overall_micro_repeats * 8 *
+                      function_params.depth_micro_repeats;
+        TFLITE_DCHECK_EQ(workspace_height_stride % 16, 0);
+        function_params.workspace_height_stride = workspace_height_stride;
+
+        // For the first macro block for output rows we fill in the first few
+        // rows.  After this we will copy them (see below in loop.)
+        function_params.inbound_block_height = input_height_overlap;
+        pack_macro_block_func(-1, k_width, input_data_block,
+                              macroblock_workspace, &function_params);
+        input_data_block += input_height_stride * input_height_overlap;
+
+        for (int i_height = 0; i_height < height_overall_macro_count;
+             ++i_height) {
+          if (i_height != height_macro_count) {
+            function_params.inbound_block_height = input_height_per_macro;
+            function_params.outbound_block_height = output_height_per_macro;
+          } else {
+            function_params.inbound_block_height =
+                residual_output_height * stride;
+            function_params.outbound_block_height = residual_output_height;
+          }
+          TFLITE_DCHECK_LT(i_height * output_height_per_macro, output_height);
+          TFLITE_DCHECK_LT(i_height * input_height_per_macro, input_height);
+          TFLITE_DCHECK_LT(k_width * output_width_per_macro_block,
+                           output_width);
+          TFLITE_DCHECK_LT(k_width * consumed_width_per_macro_block,
+                           input_width);
+
+          // Macro blocks overlap by input_height_overlap rows, so we copy
+          // those instead of filling in afresh.  The first macro block across
+          // output rows was filled in outside of the loop (above).
+          if (i_height > 0) {
+            memcpy(macroblock_workspace,
+                   macroblock_workspace +
+                       input_height_per_macro * workspace_height_stride,
+                   input_height_overlap * workspace_height_stride);
+          }
+
+          pack_macro_block_func(
+              i_height, k_width, input_data_block,
+              macroblock_workspace +
+                  input_height_overlap * workspace_height_stride,
+              &function_params);
+
+          kernel_macro_block_func(
+              macroblock_workspace, filter_workspace[0][0][0][0],
+              adjusted_bias_data, output_data_block, &function_params);
+
+          input_data_block += input_height_stride * input_height_per_macro;
+          output_data_block += output_height_stride * output_height_per_macro;
+        }
+      }
+    }
+  }
+}
+
+#undef vst1_lane_8x4
+#undef vst1q_lane_8x4
+#undef vld1q_lane_s8x8
+#undef vld1_lane_8x4
+#undef vld1q_lane_8x4
+#undef vld1q_dup_s8x4
+
+#undef STR
+#undef STR_UNEXPANDED
+
+}  // namespace depthwise_conv
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7fafa0b1c89e1b83e351e8c1c19afd48d227e04
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
@@ -0,0 +1,5005 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+
+// This file provides kernel implementations that are not used in shipped
+// inference code, but rather (a) show how model C++ code is designed and then
+// transformed into asm code, and (b) aid with maintenance and later development
+// of variations. Many projects (even including, say, the classic NAG libraries)
+// develop highly optimized code, but do not maintain intermediate versions.
+// Often the result is incomprehensible final-version code.
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+namespace depthwise_conv {
+
+#ifdef USE_NEON
+// Lane operations are for clarity and convenience. We want to load and store
+// 4 8-bit lanes together. So these are treated much like 32-bit loads and
+// 32-bit stores. Stores require 32-bit alignment.
+
+#define vst1_lane_8x4(dst, reg, lane_num)                         \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+#define vst1q_lane_8x4(dst, reg, lane_num)                        \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+
+#define vld1q_lane_s8x8(src, reg, lane_num) \
+  vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
+#define vld1_lane_8x4(src, reg, lane_num) \
+  vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_lane_8x4(src, reg, lane_num) \
+  vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
+#endif  // USE_NEON
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct> {
+  // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
+  // width 3,  sub-block 0 or 1, depth 4. Filter data is written as
+  // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
+  //
+  // Note that this rearrangement is much like that performed on input data when
+  // filling the workspace, and optimized versions will be similar.
+  static inline void FillFilterBank(int depth, const uint8* filter_block,
+                                    int8 filter_bank[3][2][4][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    // Load filter data in, 8-bytes down depth / sub-block at a time.
+    //
+    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
+    // depth 4.
+    uint8 loaded_filter[3][4][2][4];
+    for (int y = 0; y < 3; ++y) {
+      for (int x = 0; x < 3; ++x) {
+        memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
+               8);
+      }
+      // Pad the filter with symmetric representation of 0, so that the values
+      // become 0 when the zero-poing is added below. Thus these filter taps are
+      // effectively disregarded in later filtering.
+      memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
+    }
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 4; ++z) {
+        for (int x = 0; x < 4; ++x) {
+          filter_bank[y][0][z][x] =
+              loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
+          filter_bank[y][1][z][x] =
+              loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
+        }
+      }
+    }
+  }
+
+  // Adjust the bias (weights) data according to the input offset.
+  //
+  // The output calculation is
+  // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
+  //                                 (filter[i][j][d] + filter_offset)
+  // (where offsets are expressed as differences from 128).
+  //
+  // Since we cannot efficiently handle varying offsets / bias across the image,
+  // we insist on filter_offset = 0.
+  //
+  // This function calculates
+  // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
+  // which accounts for input offset. If the bias is constant over the depth,
+  // the adjusted bias will vary.
+  static inline void AdjustBias(int32 input_offset,
+                                const int8 filter_bank[3][2][4][4],
+                                const int32* bias_data,
+                                int32 adjusted_bias_block[2][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    // For instance, if input_offset == 128, no adjustment is needed.
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    for (int s = 0; s < 2; ++s) {
+      for (int z = 0; z < 4; ++z) {
+        adjusted_bias_block[s][z] = bias_data[4 * s + z];
+        for (int i = 0; i < 9; ++i) {
+          adjusted_bias_block[s][z] +=
+              input_offset_difference * filter_bank[i % 3][s][z][i / 3];
+        }
+      }
+    }
+  }
+
+  static void Run(const uint8* filter_data, const int32* bias_data,
+                  int8* shuffled_filter_data, int32* adjusted_bias_data,
+                  const DepthwiseConvDotProdParams* function_params) {
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+    const int32 input_offset = function_params->input_offset;
+
+    int8 filter_bank[3][2][4][4];
+    int32 adjusted_bias_block[2][4];
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
+      AdjustBias(input_offset, filter_bank,
+                 bias_data + 2 * bias_increment * j_depth, adjusted_bias_block);
+
+      memcpy(shuffled_filter_data, filter_bank[0][0][0],
+             shuffled_filter_increment);
+      shuffled_filter_data += shuffled_filter_increment;
+      memcpy(adjusted_bias_data, adjusted_bias_block[0],
+             8 * sizeof(adjusted_bias_block[0][0]));
+      adjusted_bias_data += 8;
+    }
+  }
+};
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct> {
+  static inline void Run(const uint8* filter_data, const int32* bias_data,
+                         int8* shuffled_filter_data, int32* adjusted_bias_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+
+    // Load filter data in, essentially dropping the [depth/8] dimension, which
+    // is equivalent to loading just the depth needed for one micro-block.
+    //
+    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
+    // depth 4.
+    uint8 loaded_filter_0[4][2][4];
+    uint8 loaded_filter_1[4][2][4];
+    uint8 loaded_filter_2[4][2][4];
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset = function_params->input_offset;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const uint8* filter_block = filter_data + 8 * j_depth;
+
+      // Filter data is provided as filter_block[3][3][depth/8][2][4].
+      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
+      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
+      for (int x = 0; x < 3; ++x) {
+        memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
+               8);
+        memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
+               8);
+        memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
+               8);
+      }
+      // Pad the filter with -filter_offset, so that the values become 0 when
+      // the filter_offset is later added, and so the filter tap is effectively
+      // disregarded.
+      memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
+      memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
+      memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);
+
+      for (int z = 0; z < 4; ++z) {
+        for (int x = 0; x < 4; ++x) {
+          filter_bank_a_0[z][x] =
+              loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_0[z][x] =
+              loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
+          filter_bank_a_1[z][x] =
+              loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_1[z][x] =
+              loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
+          filter_bank_a_2[z][x] =
+              loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_2[z][x] =
+              loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
+        }
+      }
+
+      memcpy(shuffled_filter_data, filter_bank_a_0, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_0, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_a_1, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_1, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_a_2, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_2, 16);
+      shuffled_filter_data += 16;
+
+      int32 adjusted_bias_data_0[4];
+      int32 adjusted_bias_data_1[4];
+      // For instance, if input_offset == 128, no adjustment is needed.
+      for (int z = 0; z < 4; ++z) {
+        adjusted_bias_data_0[z] = bias_data[z];
+        adjusted_bias_data_1[z] = bias_data[4 + z];
+        for (int x = 0; x < 4; ++x) {
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_0[z][x];
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_1[z][x];
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_2[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_0[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_1[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_2[z][x];
+
+          adjusted_bias_data[z] = adjusted_bias_data_0[z];
+          adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
+        }
+      }
+      bias_data += 2 * bias_increment;
+      adjusted_bias_data += 8;
+    }
+  }
+};
+
+#ifdef USE_NEON
+template <>
+struct ProcessPerDepth<
+    DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct> {
+  static void ProcessPerDepthIntrinsics(
+      const uint8* filter_data, const int32* bias_data,
+      int8* shuffled_filter_data, int32* adjusted_bias_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+
+    constexpr int kSymmetricZeroPoint = 128;
+    constexpr uint8 kSignBit = 0x80;
+    const int32 input_offset = function_params->input_offset;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+    const int8x16_t ones_vector = vdupq_n_s8(1);
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8x16_t filter_reg_0_a;
+    int8x16_t filter_reg_0_b;
+    int8x16_t filter_reg_1_a;
+    int8x16_t filter_reg_1_b;
+    int8x16_t filter_reg_2_a;
+    int8x16_t filter_reg_2_b;
+
+    // Register pairs for each height.
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    const uint8* filter_block = filter_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Filter data is provided as filter_block[3][3][depth/8][2][4].
+      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
+      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
+
+      // Load zero-point into effective position of zero-padding of filter
+      // (register B, upper part).
+      filter_reg_0_b = vdupq_n_u8(kSignBit);
+      filter_reg_1_b = vdupq_n_u8(kSignBit);
+      filter_reg_2_b = vdupq_n_u8(kSignBit);
+
+      const uint8* filter_block_ptr = filter_block;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 1);
+
+      filter_reg_0_a = veorq_s8(filter_reg_0_a, sign_bit);
+      filter_reg_0_b = veorq_s8(filter_reg_0_b, sign_bit);
+      filter_reg_1_a = veorq_s8(filter_reg_1_a, sign_bit);
+      filter_reg_1_b = veorq_s8(filter_reg_1_b, sign_bit);
+      filter_reg_2_a = veorq_s8(filter_reg_2_a, sign_bit);
+      filter_reg_2_b = veorq_s8(filter_reg_2_b, sign_bit);
+
+      vzipq_s8_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8_in_place(&filter_reg_2_a, &filter_reg_2_b);
+      vzipq_s8x2_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8x2_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8x2_in_place(&filter_reg_2_a, &filter_reg_2_b);
+
+      vst1q_s8(shuffled_filter_data, filter_reg_0_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_0_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_b);
+      shuffled_filter_data += 16;
+
+      int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      // For instance, if input_offset == 128, no adjustment is needed.
+
+      int32x4_t filter_sum_a = vdupq_n_s32(0);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_0_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_1_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_2_a, ones_vector);
+      int32x4_t filter_sum_b = vdupq_n_s32(0);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_0_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_1_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_2_b, ones_vector);
+
+      adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
+                                         input_offset_difference);
+      adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
+                                         input_offset_difference);
+
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
+      adjusted_bias_data += 4;
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
+      adjusted_bias_data += 4;
+
+      filter_block += 8;
+    }
+  }
+
+  static inline void Run(const uint8* filter_data, const int32* bias_data,
+                         int8* shuffled_filter_data, int32* adjusted_bias_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
+                              adjusted_bias_data, function_params);
+  }
+};
+#endif
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      max_padding> {
+  // A straight copy of a macro block of input data into a scratch buffer.
+  //
+  // Requirement: depth_micro_repeats > 0.
+  static inline void CopyMacroBlock(
+      int32 height_block_number, int32 width_block_number,
+      const DepthwiseConvDotProdParams& function_params,
+      const uint8* input_block_data, int8* scratch_block_data) {
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The input depth and count of micro blocks provide the width strides.
+    const int input_height_stride = function_params.input_height_stride;
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int input_depth = function_params.input_depth;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    int input_width_micro_repeats = function_params.input_width_micro_repeats;
+    const int residual_width = function_params.residual_width;
+    const int block_height = function_params.inbound_block_height;
+
+    const int padding_left = function_params.padding_left;
+    const int padding_right = function_params.padding_right;
+    const int padding_top = function_params.padding_top;
+    const int padding_bottom = function_params.padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params.width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params.height_macro_count - 1);
+
+    // Modify the trailing case to reflect the input width.
+    int input_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+    if (trailing_width_padding) {
+      input_residual_width -= 1;
+      input_width_micro_repeats = width_overall_micro_repeats - 1;
+    }
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params.input_offset + kSymmetricZeroPoint;
+
+    // We load data into a temporary buffer and then save, to match subsequent
+    // processing. This will make it easier to combine stages into one ASM
+    // routine.
+    int8 tmp_load[4][2][4];
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs trailing padding).
+        int adjusted_residual_width =
+            j_width == input_width_micro_repeats ? input_residual_width : 4;
+
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+          memset(tmp_load[0][0], -input_offset_difference, 8);
+        }
+        if (adjusted_residual_width < 4) {
+          for (int x = adjusted_residual_width; x < 4; ++x) {
+            memset(tmp_load[x][0], -input_offset_difference, 8);
+          }
+        }
+
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // The inner 3 loops go through the sub-block, depth and width within
+          // each micro block.
+
+          // Load, and apply symmetric offset.
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          const uint8* input_data = input_block_data +
+                                    k_height * input_height_stride +
+                                    j_width * 4 * input_depth + i_depth * 8;
+          // Full-size macro blocks are 2*4*4 = 32 bytes.
+          for (int x = start_width; x < adjusted_residual_width; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+
+          // Save results.
+          memcpy(&scratch_data[0], tmp_load[0][0], 8);
+          memcpy(&scratch_data[8], tmp_load[1][0], 8);
+          memcpy(&scratch_data[16], tmp_load[2][0], 8);
+          memcpy(&scratch_data[24], tmp_load[3][0], 8);
+        }
+      }
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference, workspace_height_stride);
+    }
+  }
+
+  // Transpose 4x4 blocks within each sub-micro-block.
+  //
+  // Implemented somewhat like NEON register manipulation, so that we can see
+  // equivalence of the two approaches.
+  static inline void MicroTransposeBlocks(
+      const DepthwiseConvDotProdParams& function_params,
+      int8* scratch_block_data) {
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    const int block_height = function_params.inbound_block_height;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
+    // NEON code we are simulating.
+    int8 tmp_load[4][2][4];         // [width][sub-block][depth]
+    int8 tmp_transposed[4][2][4];   // [depth][sub-block][width]
+    int8 tmp_interleaved[2][4][4];  // [sub-block][depth][width]
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          // A. Load data
+          memcpy(tmp_load[0][0], &scratch_data[0], 8);
+          memcpy(tmp_load[1][0], &scratch_data[8], 8);
+          memcpy(tmp_load[2][0], &scratch_data[16], 8);
+          memcpy(tmp_load[3][0], &scratch_data[24], 8);
+
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
+        }
+      }
+    }
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    CopyMacroBlock(height_block_number, width_block_number, *function_params,
+                   input_block_data, scratch_block_data);
+    MicroTransposeBlocks(*function_params, scratch_block_data);
+  }
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      max_padding> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    // Currently support for padding is limited to 1 on any side.
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The count of micro blocks (below) provides the width strides.
+    const int input_height_stride = function_params->input_height_stride;
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int residual_width = function_params->residual_width;
+    const int block_height = function_params->inbound_block_height;
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params->input_offset + kSymmetricZeroPoint;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    // When there is unit input depth, the micro-block iteration need only be
+    // through the height. The micro blocks are contiguous across the width.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data =
+          input_block_data + k_height * input_height_stride;
+      int8* scratch_data =
+          scratch_block_data + k_height * workspace_height_stride;
+
+      // Handle leading padding. This is overwritten if there is no padding.
+      scratch_data[0] = -input_offset_difference;
+
+      memcpy(&scratch_data[start_width], input_data, copy_size);
+      for (int i = 0; i < copy_size; ++i) {
+        scratch_data[start_width + i] += -kSymmetricZeroPoint;
+      }
+
+      // Handle trailing padding, and fill in remainder of micro block.
+      memset(&scratch_data[start_width + copy_size], -input_offset_difference,
+             4 - adjusted_residual_width + kWorkspaceExtension);
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+    }
+  }
+};
+
+// Beginning of code section containing intermediate code transformation.
+//
+// This section is only compiled when kUseUnwound3x3DotProduct versions of
+// templated functions are selected.
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/0> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
+    constexpr int kSymmetricZeroPoint = 128;
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
+    // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
+    // down the depth.
+    int8 tmp_load[4][2][4];
+    int8 tmp_transposed[4][2][4];
+    int8 tmp_interleaved[2][4][4];
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data = scratch_block_data;
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* input_data = input_block_data;
+      input_block_data += input_height_stride;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        // Load, then zero.
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // A. Simulate register loading.
+          for (int x = 0; x < 4; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C and D are to be performed together as 4-byte stores in NEON code.
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
+          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
+          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
+
+          scratch_data += depth_advance;
+          input_data += 8;
+        }
+        scratch_data += width_advance;
+        input_data += input_depth_skip;
+      }
+      if (width_overall_micro_repeats > input_width_micro_repeats) {
+        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
+                         input_width_micro_repeats + 1);
+        TFLITE_DCHECK_GT(residual_width, 0);
+        // Figure out division of work (available input vs zero-ed).
+        const int adjusted_residual_width = residual_width;
+        // Load, then zero.
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // A. Simulate register loading.
+          for (int x = 0; x < adjusted_residual_width; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+          for (int x = adjusted_residual_width; x < 4; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = 0;
+              }
+            }
+          }
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C and D are to be performed together as 4-byte stores in NEON code.
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
+          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
+          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
+
+          scratch_data += depth_advance;
+          input_data += 8;
+        }
+        scratch_data += width_advance;
+        input_data += input_depth_skip;
+      }
+      scratch_data += height_advance;
+    }
+
+    TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
+                                       block_height * workspace_height_stride);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/1> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    // Just use C model code for case of padding. Optimized versions merge the
+    // modifications therein to handle padding.
+    PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                   DepthwiseConvDepthMultiplication::kNoMultiplication,
+                   /*max_padding=*/1>::Run(height_block_number,
+                                           width_block_number, input_block_data,
+                                           scratch_block_data, function_params);
+  }
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      max_padding> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    // This is used to simulate what should happen in registers.
+    int8 tmp_data[16];
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    if (copy_size >= 16) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          tmp_data[0] = -input_offset_difference;
+          memcpy(scratch_data, tmp_data, 16);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
+                 16);
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
+        }
+
+        const int copy_remaining = copy_size - copy_done;
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          memcpy(tmp_data,
+                 input_block_data + input_block_offset + copy_done -
+                     (16 - copy_remaining),
+                 16);
+          // Shift to select the part that we need.
+          for (int i = 0; i < copy_remaining; ++i) {
+            tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
+          }
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Apply padding to remainder, some unnecessary but costless in regs.
+          for (int i = copy_remaining; i < 16; ++i) {
+            tmp_data[i] = -input_offset_difference;
+          }
+          const int final_repeats =
+              width_overall_micro_repeats - (start_width + copy_done) / 4;
+          for (int i = 0; i < final_repeats; ++i) {
+            memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
+            copy_done += 4;
+          }
+        }
+        memset(scratch_data + start_width + copy_done, -input_offset_difference,
+               kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          tmp_data[0] = -input_offset_difference;
+          memcpy(scratch_data, tmp_data, 4);
+          copy_done += 3;
+        }
+
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
+                 4);
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Perform as 4 int32 stores, because that is our alignment.
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
+        }
+
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+        const int copy_remaining = copy_size - copy_done;
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          memcpy(tmp_data,
+                 input_block_data + input_block_offset + copy_done -
+                     (4 - copy_remaining),
+                 4);
+          // Shift to select the part that we need.
+          for (int i = 0; i < copy_remaining; ++i) {
+            tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
+          }
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Apply padding to remainder, some unnecessary but costless in regs.
+          for (int i = copy_remaining; i < 4; ++i) {
+            tmp_data[i] = -input_offset_difference;
+          }
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
+          copy_done += 4;
+        }
+        memset(scratch_data + start_width + copy_done, -input_offset_difference,
+               kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Apply padding by quick fill of whole reg.
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] = -input_offset;
+        }
+        for (int i = 0; i < copy_size; ++i) {
+          // Apply shift-left insert, tmp_data as both operands.
+          // The zero-index byte is left unchanged.
+          for (int i = 7; i > 0; --i) {
+            tmp_data[i] = tmp_data[i - 1];
+          }
+          tmp_data[1] =
+              input_block_data[input_block_offset + (copy_size - 1 - i)];
+        }
+        if (!leading_width_padding) {
+          // Remove leading padding, junking trailing byte, OK because max size
+          // is less than 8.
+          TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
+          for (int i = 0; i < 7; ++i) {
+            tmp_data[i] = tmp_data[i + 1];
+          }
+        }
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] += -kSymmetricZeroPoint;
+        }
+        memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
+        memset(scratch_data_base + scratch_data_offset + 8,
+               -input_offset_difference, kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      // This path is basically the same as the preceding, 2-micro-block one,
+      // but here we simply store fewer bytes.
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Apply padding by quick fill of whole reg.
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] = -input_offset;
+        }
+        for (int i = 0; i < copy_size; ++i) {
+          // Apply shift-left insert, tmp_data as both operands.
+          // The zero-index byte is left unchanged.
+          for (int i = 7; i > 0; --i) {
+            tmp_data[i] = tmp_data[i - 1];
+          }
+          tmp_data[1] =
+              input_block_data[input_block_offset + (copy_size - 1 - i)];
+        }
+        if (!leading_width_padding) {
+          // Remove leading padding, junking trailing byte, OK because max size
+          // is less than 8.
+          TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
+          for (int i = 0; i < 7; ++i) {
+            tmp_data[i] = tmp_data[i + 1];
+          }
+        }
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] += -kSymmetricZeroPoint;
+        }
+        memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
+        memset(scratch_data_base + scratch_data_offset + 4,
+               -input_offset_difference, kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+};
+// The preceding section is only compiled when kUseUnwound3x3DotProduct versions
+// of templated functions are selected.
+//
+// End of code section containing intermediate code transformation.
+
+#ifdef USE_NEON
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockIntrinsics(
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    static const uint8 perm_data[64] = {
+        0,  16, 32, 48, 1,  17, 33, 49, 2,  18, 34, 50, 3,  19, 35, 51,  //
+        4,  20, 36, 52, 5,  21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
+        8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+        12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
+    constexpr uint8 kSignBit = 0x80;
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+    const int8x16_t perm_data_0 = vld1q_u8(perm_data);
+    const int8x16_t perm_data_1 = vld1q_u8(perm_data + 16);
+    const int8x16_t perm_data_2 = vld1q_u8(perm_data + 32);
+    const int8x16_t perm_data_3 = vld1q_u8(perm_data + 48);
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        int i_depth = 0;
+        for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+          int8x16x4_t input_data;
+          input_data.val[0] = vld1q_u8(input_data_0);
+          input_data.val[1] = vld1q_u8(input_data_1);
+          input_data.val[2] = vld1q_u8(input_data_2);
+          input_data.val[3] = vld1q_u8(input_data_3);
+          input_data_1 += 16;
+          input_data_0 += 16;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 16;
+          input_data_3 += 16;
+
+          tmp_0 = vqtbl4q_s8(input_data, perm_data_2);
+          tmp_1 = vqtbl4q_s8(input_data, perm_data_3);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+        }
+        for (; i_depth < depth_micro_repeats; ++i_depth) {
+          int8x16x4_t input_data;
+          input_data.val[0] =
+              vld1q_lane_s8x8(input_data_0, input_data.val[0], 0);
+          input_data.val[1] =
+              vld1q_lane_s8x8(input_data_1, input_data.val[1], 0);
+          input_data.val[2] =
+              vld1q_lane_s8x8(input_data_2, input_data.val[2], 0);
+          input_data.val[3] =
+              vld1q_lane_s8x8(input_data_3, input_data.val[3], 0);
+          input_data_1 += 8;
+          input_data_0 += 8;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      if (width_overall_micro_repeats > input_width_micro_repeats) {
+        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
+                         input_width_micro_repeats + 1);
+        TFLITE_DCHECK_GT(residual_width, 0);
+        TFLITE_DCHECK_LT(residual_width, 4);
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          work_reg_a = vdupq_n_u8(kSignBit);
+          work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+          work_reg_b = vdupq_n_u8(kSignBit);
+          if (residual_width > 1) {
+            work_reg_b =
+                vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+            if (residual_width == 3) {
+              work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                           work_reg_a, 1);
+            }
+          }
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+          vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+          vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_0 += 8;
+          input_data_1 += 8;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+#ifdef __aarch64__
+    PreloadInputBlock(input_block_data, function_params);
+#endif
+
+    PackMacroBlockIntrinsics(input_block_data, scratch_block_data,
+                             function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockIntrinsics(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    constexpr uint8 kSignBit = 0x80;
+
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+    constexpr int kSymmetricZeroPoint = 128;
+
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs zero-ed).
+        int adjusted_residual_width =
+            j_width == (input_width_micro_repeats) ? residual_width : 4;
+
+        if (trailing_width_padding &&
+            j_width == (width_overall_micro_repeats - 1)) {
+          adjusted_residual_width -= 1;
+        }
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+        }
+        if (start_width == 0) {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 0) {
+                work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+                if (adjusted_residual_width > 1) {
+                  work_reg_b = vld1q_lane_s8x8(input_data_0 + input_depth,
+                                               work_reg_b, 0);
+                  if (adjusted_residual_width == 3) {
+                    work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                                 work_reg_a, 1);
+                  }
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        } else {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              // Skip loading first column.
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              // Skip loading first column.
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 1) {
+                work_reg_b =
+                    vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+                if (adjusted_residual_width == 3) {
+                  work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                               work_reg_a, 1);
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        }
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+#ifdef __aarch64__
+    PreloadInputBlock(input_block_data, function_params);
+#endif
+
+    PackMacroBlockIntrinsics(height_block_number, width_block_number,
+                             input_block_data, scratch_block_data,
+                             function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockIntrinsics(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+    int8x8_t padding_mask;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
+    padding_mask = vdup_n_s8(-1);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = (copy_size + start_width) & 0x7;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          work_reg = vld1q_u8(input_block_data + input_block_offset);
+          work_reg = vextq_s8(padding_reg, work_reg, 15);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          vst1q_s8(scratch_data, work_reg);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          vst1q_s8(scratch_data + start_width + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
+                                        half_work_reg, 0);
+          half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          vst1_lane_8x4(scratch_data, half_work_reg, 0);
+          copy_done += 3;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+          // lane-loading, with a obscure bug or mis-feature probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      // Special case of 1 + 3 + 1, padding + copy + padding.
+      // This is rarely executed in practice.
+      TFLITE_DCHECK_EQ(copy_size, 3);
+      TFLITE_DCHECK_EQ(start_width, 1);
+      TFLITE_DCHECK(leading_width_padding);
+      TFLITE_DCHECK(trailing_width_padding);
+      // ASM should use MOVI 64-bit set.
+      padding_mask = vcreate_u64(~0xffffff00L);
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
+                                         input_block_data + input_block_offset),
+                                     half_work_reg, 1);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 1),
+                         half_work_reg, 2);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 2),
+                         half_work_reg, 3);
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
+        vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      if (leading_width_padding) {
+        padding_mask = vset_lane_u8(255, padding_mask, 0);
+      }
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+        if (leading_width_padding) {
+          half_work_reg = vshl_n_s64(half_work_reg, 8);
+        }
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+#ifdef __aarch64__
+    PreloadInputBlock(input_block_data, function_params);
+#endif
+
+    PackMacroBlockIntrinsics(height_block_number, width_block_number,
+                             input_block_data, scratch_block_data,
+                             function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockIntrinsics(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    const int copy_block_height = block_height;
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    const int copy_size =
+        (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = copy_size & 0x7;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ(copy_done % 16, 0);
+          vst1q_s8(scratch_data + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + copy_done, half_work_reg);
+        vst1_s8(scratch_data + copy_done + 8, half_work_reg);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = copy_size & 0x3;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+          // lane-loading, with a obscure bug or mis-feature probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+#ifdef __aarch64__
+    PreloadInputBlock(input_block_data, function_params);
+#endif
+
+    PackMacroBlockIntrinsics(height_block_number, width_block_number,
+                             input_block_data, scratch_block_data,
+                             function_params);
+  }
+};
+
+#endif  // ARM NEON
+
+// Apply filter to macro block of input data and store results.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, depth 4, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
+                                               int workspace_height_stride,
+                                               int width_micro_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+
+    // The input banks have same format as selected_data.
+    int8 left_bank[3][4][4];
+    int8 right_bank[3][4][4];
+
+    // Work through one slice, by row, at a time.
+    for (int k_height = 0; k_height < 3; ++k_height) {
+      // Simulate demangling of mangled storage arrangement.
+      const int8* left_input_block =
+          &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
+      memcpy(left_bank[k_height][0], left_input_block, 16);
+      if (no_right_block) {
+        memset(right_bank[k_height][0], 0, 16);
+      } else {
+        const int8* right_input_block =
+            &input_block[k_height * workspace_height_stride +
+                         sub_block * 2 * 8 + width_micro_stride];
+        memcpy(right_bank[k_height][0], right_input_block, 16);
+      }
+      for (int depth_index = 0; depth_index < 4; ++depth_index) {
+        memcpy(selected_data[k_height][depth_index],
+               &left_bank[k_height][depth_index][offset], 4 - offset);
+        memcpy(&selected_data[k_height][depth_index][4 - offset],
+               right_bank[k_height][depth_index], offset);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& params, int sub_block,
+      const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 output_multiplier = params.output_multiplier;
+    const int32 output_shift = params.output_shift;
+    const int32 output_offset = params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][d][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];                 // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val +
+              depth_micro_stride * j_depth;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = (output_width - 1) * stride_val < 2;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                scratch_data + width_micro_stride * i_width;
+            // Iterate over input width shifts within sub-micro blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, s,
+                                        workspace_height_stride,
+                                        width_micro_stride, no_right_block,
+                                        input_data, sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Apply filter to macro block of input data and store results.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset,
+                                               int workspace_height_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+    if (no_right_block) {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset],
+               4 - offset);
+      }
+    } else {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset], 4);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& function_params, int sub_block,
+      const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min =
+        function_params.quantized_activation_min;
+    const int32 output_activation_max =
+        function_params.quantized_activation_max;
+    const int32 output_multiplier = function_params.output_multiplier;
+    const int32 output_shift = function_params.output_shift;
+    const int32 output_offset = function_params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];              // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data = scratch_data + 4 * i_width;
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
+                                        no_right_block, input_data,
+                                        sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Beginning of code section containing intermediate code transformation.
+//
+// This section is only compiled when kUseUnwound3x3DotProduct versions of
+// templated functions are selected.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        stride> {
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+    // Simulate NEON-register input data concatenation + sub-selection.
+    // Also sub-block, height 3, depth 4, width 4.
+    uint8 output_values[4];  // Sub-block, depth 4.
+    // selected_data has format Depth 4, width 4.
+    int8 left_bank_0[4][4];
+    int8 left_bank_1[4][4];
+    int8 left_bank_2[4][4];
+    int8 right_bank_0[4][4];
+    int8 right_bank_1[4][4];
+    int8 right_bank_2[4][4];
+    memset(right_bank_0[0], 0, 16);
+    memset(right_bank_1[0], 0, 16);
+    memset(right_bank_2[0], 0, 16);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const int8* filter_block =
+          filter_workspace + shuffled_filter_increment * j_depth;
+
+      memcpy(filter_bank_a_0, filter_block, 16);
+      memcpy(filter_bank_b_0, filter_block + 16, 16);
+      memcpy(filter_bank_a_1, filter_block + 32, 16);
+      memcpy(filter_bank_b_1, filter_block + 48, 16);
+      memcpy(filter_bank_a_2, filter_block + 64, 16);
+      memcpy(filter_bank_b_2, filter_block + 80, 16);
+
+      for (int s = 0; s < 2; ++s) {
+        // Work through one slice, by row, at a time.
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val +
+              depth_micro_stride * j_depth;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          // Load first sub-micro block of data into operational banks.
+          memcpy(left_bank_0[0], input_data_0, 16);
+          memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
+          memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
+                 16);
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = (output_width - 1) * stride_val < 2;
+
+            // Load next sub-micro block of data.
+            if (!no_right_block) {
+              memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
+              memcpy(right_bank_1[0],
+                     input_data + workspace_height_stride + width_micro_stride,
+                     16);
+              memcpy(
+                  right_bank_2[0],
+                  input_data + 2 * workspace_height_stride + width_micro_stride,
+                  16);
+            }
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              // Operate on depth of 4 in batches.
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = 0;
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_0[d][x];
+                  int32 filter_val = filter_bank_a_0[d][x];
+                  acc += filter_val * input_val;
+                }
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_1[d][x];
+                  int32 filter_val = filter_bank_a_1[d][x];
+                  acc += filter_val * input_val;
+                }
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_2[d][x];
+                  int32 filter_val = filter_bank_a_2[d][x];
+                  acc += filter_val * input_val;
+                }
+                acc += bias_data[d];
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[d] = static_cast<uint8>(acc);
+              }
+
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+
+              // Simulate shifting instructions.
+              if (stride_val == 1) {
+                for (int depth_index = 0; depth_index < 4; ++depth_index) {
+                  for (int z = 0; z < 3; ++z) {
+                    left_bank_0[depth_index][z] =
+                        left_bank_0[depth_index][z + 1];
+                    left_bank_1[depth_index][z] =
+                        left_bank_1[depth_index][z + 1];
+                    left_bank_2[depth_index][z] =
+                        left_bank_2[depth_index][z + 1];
+                  }
+                  left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
+                  left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
+                  left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
+                  for (int z = 0; z < 3; ++z) {
+                    right_bank_0[depth_index][z] =
+                        right_bank_0[depth_index][z + 1];
+                    right_bank_1[depth_index][z] =
+                        right_bank_1[depth_index][z + 1];
+                    right_bank_2[depth_index][z] =
+                        right_bank_2[depth_index][z + 1];
+                  }
+                }
+              } else {
+                for (int depth_index = 0; depth_index < 4; ++depth_index) {
+                  for (int z = 0; z < 2; ++z) {
+                    left_bank_0[depth_index][z] =
+                        left_bank_0[depth_index][z + 2];
+                    left_bank_1[depth_index][z] =
+                        left_bank_1[depth_index][z + 2];
+                    left_bank_2[depth_index][z] =
+                        left_bank_2[depth_index][z + 2];
+                  }
+                  left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
+                  left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
+                  left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
+                  left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
+                  left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
+                  left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
+                  for (int z = 0; z < 2; ++z) {
+                    right_bank_0[depth_index][z] =
+                        right_bank_0[depth_index][z + 2];
+                    right_bank_1[depth_index][z] =
+                        right_bank_1[depth_index][z + 2];
+                    right_bank_2[depth_index][z] =
+                        right_bank_2[depth_index][z + 2];
+                  }
+                }
+              }
+            }
+          }
+        }
+        bias_data += bias_increment;
+
+        // Move filter for second sub-block into operational filter.
+        for (int z = 0; z < 4; ++z) {
+          for (int x = 0; x < 4; ++x) {
+            filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
+            filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
+            filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
+          }
+        }
+      }
+    }
+  }
+};
+
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        stride> {
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+    // Simulate NEON-register input data concatenation + sub-selection.
+    // Also sub-block, height 3, depth 4, width 4.
+
+    int8 input_bank_0[8];
+    int8 input_bank_1[8];
+    int8 input_bank_2[8];
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+
+    uint8 output_values[2][4];  // Sub-block, depth 4.
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank_a_0, filter_workspace, 16);
+      memcpy(filter_bank_b_0, filter_workspace + 16, 16);
+      memcpy(filter_bank_a_1, filter_workspace + 32, 16);
+      memcpy(filter_bank_b_1, filter_workspace + 48, 16);
+      memcpy(filter_bank_a_2, filter_workspace + 64, 16);
+      memcpy(filter_bank_b_2, filter_workspace + 80, 16);
+
+      // Work through one slice, by row, at a time.
+      for (int k_height = 0; k_height < block_height; ++k_height) {
+        const int8* scratch_data =
+            scratch_block_data +
+            workspace_height_stride * k_height * stride_val;
+        uint8* output_data =
+            output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+        memcpy(input_bank_0, scratch_data, 4);
+        memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
+        memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 * i_width;
+
+          memcpy(input_bank_0 + 4, input_data + 4, 4);
+          memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
+          memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
+                 4);
+
+          // Iterate over input width shifts within 4x4 blocks.
+          for (int w = 0; w < output_width; ++w) {
+            constexpr int offset =
+                0;  // Shift input instead of offset in multiply-accumulate.
+
+            {
+              const int s = 0;
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = bias_data[s * 4 + d];
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val_0 = input_bank_0[offset + x];
+                  int32 filter_val_0 = filter_bank_a_0[d][x];
+                  acc += filter_val_0 * input_val_0;
+                  int32 input_val_1 = input_bank_1[offset + x];
+                  int32 filter_val_1 = filter_bank_a_1[d][x];
+                  acc += filter_val_1 * input_val_1;
+                  int32 input_val_2 = input_bank_2[offset + x];
+                  int32 filter_val_2 = filter_bank_a_2[d][x];
+                  acc += filter_val_2 * input_val_2;
+                }
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[s][d] = static_cast<uint8>(acc);
+
+                output_data[s * 4 + d] = output_values[s][d];
+              }
+            }
+            {
+              const int s = 1;
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = bias_data[s * 4 + d];
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val_0 = input_bank_0[offset + x];
+                  int32 filter_val_0 = filter_bank_b_0[d][x];
+                  acc += filter_val_0 * input_val_0;
+                  int32 input_val_1 = input_bank_1[offset + x];
+                  int32 filter_val_1 = filter_bank_b_1[d][x];
+                  acc += filter_val_1 * input_val_1;
+                  int32 input_val_2 = input_bank_2[offset + x];
+                  int32 filter_val_2 = filter_bank_b_2[d][x];
+                  acc += filter_val_2 * input_val_2;
+                }
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[s][d] = static_cast<uint8>(acc);
+
+                output_data[s * 4 + d] = output_values[s][d];
+              }
+            }
+
+            // Simulate register shifts.
+            for (int i = 0; i < (8 - stride_val); ++i) {
+              input_bank_0[i] = input_bank_0[i + stride_val];
+              input_bank_1[i] = input_bank_1[i + stride_val];
+              input_bank_2[i] = input_bank_2[i + stride_val];
+            }
+
+            output_data += output_depth;
+          }
+        }
+      }
+      bias_data += 2 * bias_increment;
+      filter_workspace += shuffled_filter_increment;
+    }
+  }
+};
+// The preceding section is only compiled when kUseUnwound3x3DotProduct versions
+// of templated functions are selected.
+//
+// End of code section containing intermediate code transformation.
+
+#ifdef USE_NEON
+template <>
+struct KernelMacroBlock<
+    DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+    DepthwiseConvDepthMultiplication::kNoMultiplication,
+    /*stride=*/1> {
+  static inline void KernelMacroBlockIntrinsics(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    const int8* input_data_depthwise = scratch_block_data;
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = input_data_base;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(next_input_data + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(next_input_data + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(next_input_data + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(next_input_data + 4 * workspace_height_stride);
+          int8x16_t left_bank_5_reg =
+              vld1q_s8(next_input_data + 5 * workspace_height_stride);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+          acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+          acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+          acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += width_micro_stride;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (i_width == output_width_micro_repeats) &&
+            //        ((residual_width - 1) * stride_val < 2)
+            const bool no_right_block =
+                i_width == output_width_micro_repeats && residual_width < 3;
+
+            if (no_right_block) {
+              // Only needed for santizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+              left_bank_5_reg = right_bank_5_reg;
+
+              output_data += depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += width_micro_stride;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (output_width - 1) * stride_val < 2.
+            const bool no_right_block = output_width < 3;
+
+            if (no_right_block) {
+              // Only needed for santizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+              biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+              biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+              biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
+              biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
+              biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+          input_data_base += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data = input_data_base;
+            uint8* output_data = output_data_base;
+
+            // Load first sub-micro block of data into operational banks.
+            int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+            int8x16_t left_bank_1_reg =
+                vld1q_s8(next_input_data + workspace_height_stride);
+            int8x16_t left_bank_2_reg =
+                vld1q_s8(next_input_data + 2 * workspace_height_stride);
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += width_micro_stride;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              int8x16_t right_bank_0_reg;
+              int8x16_t right_bank_1_reg;
+              int8x16_t right_bank_2_reg;
+              // Logic: (output_width - 1) * stride_val < 2.
+              const bool no_right_block = output_width < 3;
+
+              if (no_right_block) {
+                // Only needed for santizer checks.
+                right_bank_0_reg = vdupq_n_s8(0);
+                right_bank_1_reg = vdupq_n_s8(0);
+                right_bank_2_reg = vdupq_n_s8(0);
+              } else {
+                right_bank_0_reg = vld1q_s8(next_input_data);
+                right_bank_1_reg =
+                    vld1q_s8(next_input_data + workspace_height_stride);
+                right_bank_2_reg =
+                    vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              }
+              // Load next sub-micro block of data.
+
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_s32(acc, filter_reg_0_a, left_bank_0_reg);
+                acc = vdotq_s32(acc, filter_reg_1_a, left_bank_1_reg);
+                acc = vdotq_s32(acc, filter_reg_2_a, left_bank_2_reg);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+                biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+                biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+
+                output_data += depth;
+              }
+            }
+            input_data_base += workspace_height_stride;
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      input_data_depthwise += depth_micro_stride;
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
+                               output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<
+    DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+    DepthwiseConvDepthMultiplication::kNoMultiplication,
+    /*stride=*/2> {
+  static inline void KernelMacroBlockIntrinsics(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    // This version only does min/max on 64 bits.
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x8_t output_activation_min_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x8_t output_activation_max_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_max));
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    TFLITE_DCHECK_EQ(stride_val, 2);
+    TFLITE_DCHECK_LE(block_height, 2);
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const int8* filter_block =
+          filter_workspace + shuffled_filter_increment * j_depth;
+
+      if (block_height == 2) {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(input_data_0 + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(input_data_0 + 4 * workspace_height_stride);
+
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+          int8x16_t right_bank_3_reg;
+          int8x16_t right_bank_4_reg;
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+              right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
+                                          3 * workspace_height_stride);
+              right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
+                                          4 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + depth + output_height_stride,
+                            acc_u8, 1);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+
+          int32x4_t acc0;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      }
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
+                               output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<
+    DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+    DepthwiseConvDepthMultiplication::kUnitInputDepth,
+    /*stride=*/1> {
+  static inline void KernelMacroBlockIntrinsics(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->stride, 1);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = scratch_block_data;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+          int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+          int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
+
+          // Load first sub-micro block of data into operational banks.
+          input_bank_a_reg =
+              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                // uninitialized variable.
+          input_bank_a_reg = vld1q_lane_8x4(
+              next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+          input_bank_b_reg = vld1q_dup_s8x4(
+              next_input_data +
+              2 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_b_reg =
+              vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                             input_bank_b_reg, 2);
+          input_bank_c_reg = vld1q_dup_s8x4(
+              next_input_data +
+              4 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_c_reg =
+              vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                             input_bank_c_reg, 2);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
+          acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
+          acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
+
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += 4;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += 4;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
+
+              output_data += output_depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+          // scratch_block_data += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        // Block height < 4.
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data =
+                scratch_block_data + k_height * workspace_height_stride;
+            uint8* output_data = output_data_base;
+
+            int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+            int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+
+            // Load first sub-micro block of data into operational banks.
+            input_bank_a_reg =
+                vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                  // uninitialized variable.
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+            input_bank_b_reg = vld1q_dup_s8x4(
+                next_input_data +
+                2 * workspace_height_stride);  // Load lane 0, avoiding
+                                               // uninitialized variable.
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += 4;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data + workspace_height_stride,
+                                 input_bank_a_reg, 3);
+              input_bank_b_reg =
+                  vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                                 input_bank_b_reg, 1);
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_four_lane_s32(acc, filter_reg_0_a, input_bank_a_reg,
+                                          0);
+                acc = vdotq_four_lane_s32(acc, filter_reg_1_a, input_bank_a_reg,
+                                          2);
+                acc = vdotq_four_lane_s32(acc, filter_reg_2_a, input_bank_b_reg,
+                                          0);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+                input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+
+                output_data += output_depth;
+              }
+            }
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
+                               output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<
+    DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
+    DepthwiseConvDepthMultiplication::kUnitInputDepth,
+    /*stride=*/2> {
+  static inline void KernelMacroBlockIntrinsics(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32878);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    for (int j_depth = 0; j_depth < (depth_micro_repeats * 1 + 0); ++j_depth) {
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      TFLITE_DCHECK_EQ(bias_increment, 4);
+      const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+
+      if (block_height == 2) {
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+        int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_b_reg = vld1q_lane_8x4(
+            scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
+        input_bank_c_reg = vld1q_dup_s8x4(
+            scratch_data +
+            4 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
+          input_bank_c_reg = vld1q_lane_8x4(
+              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      } else {
+        TFLITE_DCHECK_EQ(block_height, 1);
+        // Work through one slice, by row, at a time.
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        //
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width =
+              i_width == output_width_micro_repeats ? residual_width : 2;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      }
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
+                               output_block_data, function_params);
+  }
+};
+
+#undef vst1_lane_8x4
+#undef vst1q_lane_8x4
+#undef vld1q_lane_s8x8
+#undef vld1_lane_8x4
+#undef vld1q_lane_8x4
+#undef vld1q_dup_s8x4
+
+#endif  //  USE_NEON
+
+}  // namespace depthwise_conv
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index a7abf3e370c466d12be99c9d3dc5d35eca2caf77..01103d727fc0a390e82d56310c9d7614fd864b35 100644
--- a/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
-// TODO(petewarden) - move this to a common location in Eigen itself.
-
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 730d9b662a33eb3a2fe08fec887c9fb35671a116..ce5eb308e8744097c49987ce8a1e31f54dfff222 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
@@ -43,6 +49,49 @@ void* aligned_alloc(size_t alignment, size_t size, void** freeing_buffer) {
              : ((char*)*freeing_buffer + (alignment - offset));  // NOLINT
 }
 
+// Use /proc/cpuinfo to test whether we have the right processor.
+bool HasSdotInstruction() {
+  // TODO(strohman): Replace this with a proper API call once we are running
+  // on kernels that can tell us about this instruction: (b/119112014)
+  // Note that the C++ spec ensures that this variable will be initialized
+  // exactly once.
+  static bool has_sdot = []() -> bool {
+    char text[1024];
+    int fd = open("/proc/cpuinfo", O_RDONLY);
+    if (fd < 0) {
+      return false;
+    }
+
+    bool found = false;
+    int buffer = 0;
+    const char kSM8150[] = "Qualcomm Technologies, Inc SM8150";
+    while (true) {
+      int count = read(fd, text + buffer, sizeof(text) - buffer);
+      if (count <= 0) {
+        break;
+      }
+      int text_end = buffer + count;
+
+      if (memmem(text, text_end, kSM8150, sizeof(kSM8150) - 1) != nullptr) {
+        found = true;
+        break;
+      }
+
+      // Keep up to some bytes of the previous buffer state so that we
+      // can find a string match even if it occurs on a buffer boundary.
+      buffer = text_end;
+      if (text_end > sizeof(kSM8150)) {
+        buffer = sizeof(kSM8150);
+      }
+
+      memmove(text, text + text_end - buffer, buffer);
+    }
+    close(fd);
+    return found;
+  }();
+  return has_sdot;
+}
+
 }  // namespace
 
 void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
@@ -84,10 +133,266 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
   }
 }
 
+#ifdef __aarch64__
+
+// We interleave vector data to make the dot product logic more efficient.
+// Suppose that vectors is:
+//     a0 a1 a2 a3 a4 a5 ...
+//     b0 b1 b2 b3 b4 b5 ...
+//     c0 c1 c2 c3 c4 c5 ...
+//     d0 d1 d2 d3 d4 d5 ...
+//     e0 e1 e2 e3 e4 e5 ...
+// This code interleaves them like this:
+//     a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ...
+//     e0 e1 e2 e3 f0 f1 f2 f3 ...
+// Once the data is interleaved, each 16-byte read from the vectors pointer
+// contains 4 bytes from each of 4 vectors.
+const int8_t* ShuffleVectors(const int8_t* vectors, const int n_batch,
+                             const int m_cols, void** shuffled_vectors_free) {
+  const int kWeightsPerUint32 = 4;
+
+  int8* shuffled_vectors = reinterpret_cast<int8*>(aligned_alloc(
+      kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
+
+  for (int i = 0; i < n_batch; i += 4) {
+    int8* shuffled_vectors_ptr = shuffled_vectors + (i * m_cols);
+    const int8* unshuffled_vec0_ptr =
+        reinterpret_cast<const int8*>(vectors) + (i * m_cols);
+    const int8* unshuffled_vec1_ptr =
+        reinterpret_cast<const int8*>(vectors) + ((i + 1) * m_cols);
+    const int8* unshuffled_vec2_ptr =
+        reinterpret_cast<const int8*>(vectors) + ((i + 2) * m_cols);
+    const int8* unshuffled_vec3_ptr =
+        reinterpret_cast<const int8*>(vectors) + ((i + 3) * m_cols);
+    const int8* const end_vec0_ptr = unshuffled_vec1_ptr;
+
+    while (unshuffled_vec0_ptr != end_vec0_ptr) {
+      asm volatile(
+          // This code path requires that (n_cols % 16) == 0 so we can safely
+          // read in 16-byte chunks from each row.
+          "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
+          "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
+          "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
+          "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
+
+          "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
+          "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
+          "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
+          "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
+
+          : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr),
+            [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr),
+            [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr),
+            [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr),
+            [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr)
+          :
+          : "v0", "v1", "v2", "v3", "cc", "memory");
+    }
+  }
+
+  return reinterpret_cast<const int8_t*>(shuffled_vectors);
+}
+
+static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* vectors, const float* scaling_factors, int n_batch,
+    float* __restrict__ result) {
+  void* shuffled_vectors_free;
+
+  const int8_t* shuffled_vectors =
+      ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
+
+  for (int row = 0; row < m_rows; row += 2) {
+    for (int batch = 0; batch < n_batch; batch += 4) {
+      float* result_ptr = result + (batch * m_rows) + row;
+      const int8* mat_ptr0 = matrix + (row * m_cols);
+      const int8* mat_ptr1 = matrix + ((row + 1) * m_cols);
+      const int8* mat_ptr0_end = mat_ptr1;
+      const int8* vec_ptr = shuffled_vectors + (batch * m_cols);
+      const float* scaling_factors_ptr = scaling_factors + batch;
+      const uint64_t wide_rows = m_rows * sizeof(float);
+
+      asm volatile(
+          // Zero out the accumulator registers.
+          "dup v0.4s, wzr\n"
+          "dup v1.4s, wzr\n"
+          "dup v2.4s, wzr\n"
+          "dup v3.4s, wzr\n"
+
+          "1:\n"  // batch_cols_loop
+
+          // Read 16 more bytes from a pair of matrix rows.
+          "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
+
+          // Read from input vectors 4 times; 64 bytes total.
+          // Each 16-byte register contains parts of 4 vectors; see the
+          // shuffle logic above.
+
+          // From Benoit, places to look in the future:
+          // - Move load instructions further from sdot
+          // - Switch loop use-then-reload
+          // - Do partial unrolling to use register space better
+          "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
+          ".word 0x4f8ce100  // sdot v0.4s, v8.16b, v12.4b[0]\n"
+          "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
+          ".word 0x4face121  // sdot v1.4s, v9.16b, v12.4b[1]\n"
+          "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
+          ".word 0x4f8ce940  // sdot v0.4s, v10.16b, v12.4b[2]\n"
+          "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
+          ".word 0x4face961  // sdot v1.4s, v11.16b, v12.4b[3]\n"
+
+          // Re-use those vectors for the next row as well.
+          "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
+          ".word 0x4f8de102  // sdot v2.4s, v8.16b, v13.4b[0]\n"
+          ".word 0x4fade123  // sdot v3.4s, v9.16b, v13.4b[1]\n"
+          ".word 0x4f8de942  // sdot v2.4s, v10.16b, v13.4b[2]\n"
+          ".word 0x4fade963  // sdot v3.4s, v11.16b, v13.4b[3]\n"
+
+          // If we're not done with these rows, continue.
+          "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
+          "bne 1b\n"  // batch_cols_loop
+
+          // Done with the rows, sum the results.
+          "add v0.4s, v0.4s, v1.4s\n"
+          "add v2.4s, v2.4s, v3.4s\n"
+
+          // Convert the per-vector sums to floating point.
+          "scvtf v0.4s, v0.4s\n"
+          "scvtf v1.4s, v2.4s\n"
+
+          // Fetch scale factors.
+          "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
+
+          // Multiply scale factors times sums.
+          "fmul v0.4s, v4.4s, v0.4s\n"
+          "fmul v1.4s, v4.4s, v1.4s\n"
+
+          // Load previous result values.
+          // The result position is:
+          //   result[batch * m_rows + row]
+          // Here that is factored into:
+          //   result_ptr = result + row
+          //   *result_ptr = res[0]
+          //   (uint8*)result_ptr += (m_rows * sizeof(float))
+          //   *result_ptr = res[1]
+          //   ...
+          // Since we're reading two rows at a time, though, we read both
+          //   result[batch * m_rows + row]
+          // and
+          //   result[batch * m_rows + row + 1]
+          "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+          "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+          "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+          "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+
+          // Go back to the starting position (subtract wide_rows * 4).
+          "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
+
+          // Add previous result values.
+          "fadd v9.4s, v9.4s, v0.4s\n"
+          "fadd v10.4s, v10.4s, v1.4s\n"
+
+          // Store results.
+          "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+          "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+          "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+          "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+          : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1),
+            [ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr)
+          : [ mat_ptr0_end ] "r"(mat_ptr0_end),
+            [ scaling_factors_ptr ] "r"(scaling_factors_ptr),
+            [ wide_rows ] "r"(wide_rows)
+          : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+            "v10", "v11", "v12", "v13", "cc", "memory");
+    }
+  }
+
+  free(shuffled_vectors_free);
+}
+
+static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const uint8_t* ledger_ptr = ledger;
+  const int8* mat_ptr = matrix;
+
+  for (int row = 0; row < m_rows; row++) {
+    int num_nonzero_chunks = *ledger_ptr;
+    ledger_ptr++;
+    const uint8* ledger_start = ledger_ptr;
+    const uint8* ledger_end = ledger_ptr + num_nonzero_chunks;
+    const int8* mat_start = mat_ptr;
+
+    for (int batch = 0; batch < n_batch; batch++) {
+      const int8* vec_ptr = vectors + (batch * m_cols);
+      int64_t row_sum = 0;
+
+      mat_ptr = mat_start;
+      ledger_ptr = ledger_start;
+
+      if (ledger_ptr != ledger_end) {
+        asm volatile(
+            "dup v0.4s, wzr\n"
+            "dup v1.4s, wzr\n"
+            "dup v8.4s, wzr\n"
+            "mov x7, 0\n"
+
+            "1:\n"  // chunks_loop
+
+            // Single matrix chunk, 16 bytes
+            "ld1 {v8.16b}, [%[mat_ptr]], #16\n"
+
+            // Read the next ledger index and increment.
+            "ldrb w7, [%[ledger_ptr]], #1\n"
+
+            // Read 16 bytes of vector data from (vec_ptr + (ledger_index * 16))
+            "add x8, %[vec_ptr], x7, lsl #4\n"
+            "ld1 {v9.16b}, [x8]\n"
+
+            // Dot product of matrix row and vector.
+            ".word 0x4e889520  // sdot v0.4s, v9.16b, v8.16b\n"
+
+            "cmp %[ledger_ptr], %[ledger_end]\n"
+            "blt 1b\n"  // chunks_loop
+
+            // Sum the 4 vector components into a 32-bit value.
+            "addv s1, v0.4s\n"
+            // row_sum is 64-bit, so we copy 64 bits of v1 into it.
+            // We have to be careful to cast this value to 32 bits in order
+            // to interpret the sign bit properly.
+            "mov %[row_sum], v1.d[0]\n"
+            : [ row_sum ] "=r"(row_sum), [ ledger_ptr ] "+r"(ledger_ptr),
+              [ mat_ptr ] "+r"(mat_ptr), [ vec_ptr ] "+r"(vec_ptr)
+            : [ ledger_end ] "r"(ledger_end)
+            : "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory");
+      }
+      result[(batch * m_rows + row) * result_stride] +=
+          static_cast<int32>(row_sum) * scaling_factors[batch];
+    }
+  }
+}
+
+#endif  // __aarch64__
+
 void NeonMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride) {
+#ifdef __aarch64__
+  if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 &&
+      m_rows >= n_batch) {
+    if (n_batch % 4 == 0 && result_stride == 1) {
+      // Benchmarks suggest that it's always better to use the batch code
+      // when we can, even on small matrices.
+      DotprodMatrixBatchFourVectorMultiplyAccumulate(
+          matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
+      return;
+    }
+  }
+#endif  // __aarch64__
+
   const int kWeightsPerUint32 = 4;
   const int kWeightsPerNeonLane = 16;
   // Assuming *matrix is kWeightsPerUint32-byte aligned,
@@ -202,6 +507,127 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
   free(aligned_vec_free);
 }
 
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  const int kNeonLanesPerBlock = 4;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+        const float* vector_in_batch = vector + b * m_cols;
+
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+
+          for (int c = 0; c < kNeonLanesPerBlock; c++) {
+            // Load 4 float values from the vector and matrix row.
+            float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr +
+                                                 c * kFloatWeightsPerNeonLane);
+            float32x4_t matrix_f32x4 =
+                vld1q_f32(matrix_ptr + c * kFloatWeightsPerNeonLane);
+            // Multiply the vector and matrix row and add to accumulator.
+            acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+          }
+          matrix_ptr += kBlockSize;
+        }
+        *result_in_batch +=
+            (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+             vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+      }
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+#ifdef __aarch64__
+  if (HasSdotInstruction() && m_cols % 16 == 0) {
+    DotprodSparseMatrixBatchVectorMultiplyAccumulate(
+        matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch,
+        result, result_stride);
+    return;
+  }
+#endif  // __aarch64__
+
+  const int kWeightsPerUint32 = 4;
+  const int kWeightsPerNeonLane = 16;
+  const int kBlockSize = kWeightsPerNeonLane;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  void* aligned_vec_free = nullptr;
+  int8_t* aligned_vec =
+      (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                             &aligned_vec_free);
+
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Copy the vector data to an aligned vector.
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
+
+    const uint8_t* ledger_ptr = ledger;
+    const int8_t* row_ptr = matrix;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32x4_t dotprod = vmovq_n_s32(0);
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        // Prefetch the row to cache.
+        __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                           3 /* temporal locality */);
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int col_index = *ledger_ptr++ * kBlockSize;
+          // Load 16 8-bit values from the row and vector, each, to operate on.
+          // Here the assumption is that each buffer is 4-byte aligned.
+          // Otherwise, performance may suffer significantly.
+          TFLITE_DCHECK_EQ(  // NOLINT
+              (uintptr_t)(&row_ptr) & (kWeightsPerUint32 - 1), 0);
+          const int8x16_t s1_8x16 =
+              vld1q_s8((const int8_t*)(aligned_vec + col_index));
+          const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr));
+          // Multiply the low bits (i.e. the lower 8 8bit numbers in the
+          // registers).
+          int16x8_t prod_16x8 =
+              vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+          // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+          // registers), and accumulate with the result of the low bits product.
+          // The assumption here is that overflow will not happen as we quantize
+          // our values to be in the range [-127, 127]. As such the sum of the 2
+          // products is always strictly smaller than 15-bits (32767 in absolute
+          // value).
+          prod_16x8 =
+              vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+          dotprod = vpadalq_s16(dotprod, prod_16x8);
+          row_ptr += kBlockSize;
+        }
+        // Add the 4 intermediate sum values to get the final dot-prod value for
+        // this row.
+        int64x2_t pairwiseAdded = vpaddlq_s32(dotprod);
+        int32 neon_sum =
+            vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+        *result += neon_sum * batch_scaling_factor;
+      }
+    }  // for row
+  }    // for batch
+  free(aligned_vec_free);
+}
+
 void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                   int v_size, float* result) {
   // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 903f4c80139cd326b354ef6292a393c75af11608..a86457dba745dbe94ce3e1dc718012545f258804 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -40,6 +40,24 @@ void MatrixBatchVectorMultiplyAccumulate(
                    vectors, scaling_factors, n_batch, result, result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const float* vector, int n_batch, float* result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(matrix, ledger, m_rows, m_cols,
+                                                vectors, scaling_factors,
+                                                n_batch, result, result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 7009323a9b2ec7d1e5310f89ef6977a376f536eb..7d8ab2c6029b8e3b7fdbc38c591cf02f24019c83 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <sys/types.h>
 #include <algorithm>
 #include <cmath>
+#include <cstdint>
 #include <limits>
 #include <memory>
 #include <tuple>
@@ -63,6 +64,7 @@ using reference_ops::ConcatenationWithScaling;
 using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
+using reference_ops::Elu;
 using reference_ops::FakeQuant;
 using reference_ops::Fill;
 using reference_ops::Gather;
@@ -85,6 +87,7 @@ using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
 using reference_ops::StridedSlice;
+using reference_ops::Sub16;
 using reference_ops::Transpose;
 
 // TODO(b/80247582) Remove this constant.
@@ -181,45 +184,6 @@ MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
   return MatrixMap<Scalar>(data, rows, cols);
 }
 
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. The template version is to be preferred,
-// since some target hardware optimizations depend on the range of the exponent.
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. See raw-integer version for further comments.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                              float output_activation_max,
                                              const RuntimeShape& bias_shape,
@@ -810,7 +774,7 @@ inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const float* input_data, const RuntimeShape& weights_shape,
     const float* weights_data, const RuntimeShape& bias_shape,
-    const float* bias_data, const RuntimeShape& output_shape,
+    const float* optional_bias_data, const RuntimeShape& output_shape,
     float* output_data) {
   gemmlowp::ScopedProfilingLabel label("FullyConnected");
   const float output_activation_min = params.float_activation_min;
@@ -834,30 +798,36 @@ inline void FullyConnected(
       MapAsMatrixWithLastDimAsRows(output_data, output_shape);
 
   Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
-  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
-                                   bias_shape, bias_data, output_shape,
-                                   output_data);
+
+  if (optional_bias_data != nullptr) {
+    AddBiasAndEvalActivationFunction(
+        output_activation_min, output_activation_max, bias_shape,
+        optional_bias_data, output_shape, output_data);
+  } else {
+    const int flat_size = output_shape.FlatSize();
+    for (int i = 0; i < flat_size; ++i) {
+      output_data[i] = ActivationFunctionWithMinMax(
+          output_data[i], output_activation_min, output_activation_max);
+    }
+  }
 }
 
 #ifdef USE_NEON
-inline void FullyConnectedAsGEMV(
+inline void FullyConnectedAsGEMVWorkerImpl(
     const RuntimeShape& input_shape, const uint8* input_data,
     int32 input_offset, const RuntimeShape& filter_shape,
     const uint8* filter_data, int32 filter_offset,
     const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
     int32 output_multiplier, int output_shift, int32 output_activation_min,
     int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int row_start, int row_end) {
   gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   const int output_dim_count = output_shape.DimensionsCount();
-  const int filter_dim_count = filter_shape.DimensionsCount();
   TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1);
   const int input_size = FlatSizeSkipDim(input_shape, 0);
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
   static constexpr int kPeel = 4;
   const bool shift_left = (output_shift > 0);
   for (int k = 0; k < input_size; k += 64) {
@@ -866,81 +836,139 @@ inline void FullyConnectedAsGEMV(
   for (int k = 0; k < kPeel * input_size; k += 64) {
     optimized_ops_preload_l1_stream(filter_data + k);
   }
-  TFLITE_DCHECK(!(output_size % kPeel));
-  const int32* bias_ptr = bias_data;
-  uint8* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += kPeel) {
-    int32x4_t acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      acc[k] = vdupq_n_s32(0);
-    }
+
+  TFLITE_DCHECK_GE(row_end - row_start, kPeel);
+
+  for (int out = row_start; out < row_end; out += kPeel) {
+    out = std::min(out, row_end - kPeel);
+    int32x4_t acc0 = vdupq_n_s32(0);
+    int32x4_t acc1 = acc0;
+    int32x4_t acc2 = acc0;
+    int32x4_t acc3 = acc0;
     const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
     const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
     int in = 0;
     for (; in <= input_size - 16; in += 16) {
       const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      uint8x16_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1q_u8(filter_ptr);
-        optimized_ops_preload_l1_stream(filter_ptr + 64);
-      }
-      int16x8_t input_val[2];
-      const uint8x8_t low = vget_low_u8(input_val_u8);
-      const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
-      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
-      int16x8_t filter_val[kPeel][2];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
-        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
-        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
-        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
-        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
-        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
-      }
-      for (int p = 0; p < 2; p++) {
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
-                             vget_low_s16(input_val[p]));
-        }
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
-                             vget_high_s16(input_val[p]));
-        }
-      }
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_1 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_2 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_3 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      int16x8_t input_val_0, input_val_1;
+      uint8x8_t low = vget_low_u8(input_val_u8);
+      uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      low = vget_low_u8(filter_val_u8_0);
+      high = vget_high_u8(filter_val_u8_0);
+      int16x8_t filter_val_0_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_0_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec);
+      filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_1);
+      high = vget_high_u8(filter_val_u8_1);
+      int16x8_t filter_val_1_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_1_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec);
+      filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_2);
+      high = vget_high_u8(filter_val_u8_2);
+      int16x8_t filter_val_2_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_2_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec);
+      filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_3);
+      high = vget_high_u8(filter_val_u8_3);
+      int16x8_t filter_val_3_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_3_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec);
+      filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec);
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0),
+                       vget_low_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0),
+                       vget_low_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0),
+                       vget_low_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0),
+                       vget_low_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1),
+                       vget_low_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1),
+                       vget_low_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1),
+                       vget_low_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1),
+                       vget_low_s16(input_val_1));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0),
+                       vget_high_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0),
+                       vget_high_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0),
+                       vget_high_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0),
+                       vget_high_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1),
+                       vget_high_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1),
+                       vget_high_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1),
+                       vget_high_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1),
+                       vget_high_s16(input_val_1));
     }
     for (; in <= input_size - 8; in += 8) {
       const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      uint8x8_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1_u8(filter_ptr);
-      }
-      int16x8_t input_val;
-      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_2 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_3 = vld1_u8(filter_ptr);
+      int16x8_t input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
       input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t filter_val[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
-        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
-                           vget_low_s16(input_val));
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
-                           vget_high_s16(input_val));
-      }
+      int16x8_t filter_val_0 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_0));
+      filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec);
+      int16x8_t filter_val_1 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_1));
+      filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec);
+      int16x8_t filter_val_2 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_2));
+      filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec);
+      int16x8_t filter_val_3 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_3));
+      filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec);
+      acc0 =
+          vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val));
+      acc1 =
+          vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val));
+      acc2 =
+          vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val));
+      acc3 =
+          vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0),
+                       vget_high_s16(input_val));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1),
+                       vget_high_s16(input_val));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2),
+                       vget_high_s16(input_val));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3),
+                       vget_high_s16(input_val));
     }
     if (in < input_size) {
-      int32 buf[4 * kPeel];
-      for (int k = 0; k < 4; k++) {
-        vst1q_s32(buf + 4 * k, acc[k]);
-      }
+      int32 buf[16];
+      vst1q_s32(buf + 0, acc0);
+      vst1q_s32(buf + 4, acc1);
+      vst1q_s32(buf + 8, acc2);
+      vst1q_s32(buf + 12, acc3);
       for (; in < input_size; in++) {
         int lane = (in + 8 - input_size) % 4;
         const int32 input_val = input_data[in] + input_offset;
@@ -950,26 +978,28 @@ inline void FullyConnectedAsGEMV(
           buf[lane + 4 * k] += filter_val * input_val;
         }
       }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vld1q_s32(buf + 4 * k);
-      }
+      acc0 = vld1q_s32(buf + 0);
+      acc1 = vld1q_s32(buf + 4);
+      acc2 = vld1q_s32(buf + 8);
+      acc3 = vld1q_s32(buf + 12);
     }
 
     // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      pairwise_reduced_acc[k] =
-          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
-    }
-    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+    int32x2_t pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
+    int32x2_t pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
+    int32x2_t pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
+    int32x2_t pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
     const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
+    int32x4_t bias_vec = vld1q_s32(bias_data + out);
     reduced = vaddq_s32(reduced, bias_vec);
     if (shift_left) {
       const int32 multiplier_power_of_two = 1 << output_shift;
@@ -992,11 +1022,116 @@ inline void FullyConnectedAsGEMV(
     // Apply the clamping from the activation function
     res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
     res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
-    // Store results to destination. Assumes 32bit alignment.
-    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
-                  vreinterpret_u32_u8(res8), 0);
-    output_ptr += kPeel;
+    // Store results to destination.
+    vst1_lane_u8(output_data + out + 0, res8, 0);
+    vst1_lane_u8(output_data + out + 1, res8, 1);
+    vst1_lane_u8(output_data + out + 2, res8, 2);
+    vst1_lane_u8(output_data + out + 3, res8, 3);
+  }
+}
+
+struct FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+  FullyConnectedAsGEMVWorkerTask(const RuntimeShape& input_shape,
+                                 const uint8* input_data, int32 input_offset,
+                                 const RuntimeShape& filter_shape,
+                                 const uint8* filter_data, int32 filter_offset,
+                                 const RuntimeShape& bias_shape,
+                                 const int32* bias_data, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data, int row_start, int row_end)
+      : input_shape_(input_shape),
+        input_data_(input_data),
+        input_offset_(input_offset),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        filter_offset_(filter_offset),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_offset_(output_offset),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_activation_min_(output_activation_min),
+        output_activation_max_(output_activation_max),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        row_start_(row_start),
+        row_end_(row_end) {}
+
+  void Run() override {
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape_, input_data_, input_offset_, filter_shape_, filter_data_,
+        filter_offset_, bias_shape_, bias_data_, output_offset_,
+        output_multiplier_, output_shift_, output_activation_min_,
+        output_activation_max_, output_shape_, output_data_, row_start_,
+        row_end_);
   }
+
+  const RuntimeShape& input_shape_;
+  const uint8* input_data_;
+  int32 input_offset_;
+  const RuntimeShape& filter_shape_;
+  const uint8* filter_data_;
+  int32 filter_offset_;
+  const RuntimeShape& bias_shape_;
+  const int32* bias_data_;
+  int32 output_offset_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int32 output_activation_min_;
+  int32 output_activation_max_;
+  const RuntimeShape& output_shape_;
+  uint8* output_data_;
+  gemmlowp::GemmContext* gemm_context_;
+  int row_start_;
+  int row_end_;
+};
+
+inline void FullyConnectedAsGEMV(
+    const RuntimeShape& input_shape, const uint8* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const uint8* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemm_context) {
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemm_context->max_num_threads(), output_rows, batches, input_size);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, 0, output_rows);
+    return;
+  }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker =
+      gemmlowp::RoundUp<kKernelRows>(output_rows / thread_count);
+  int row_start = 0;
+  for (int i = 0; i < thread_count; ++i) {
+    int row_end = std::min(output_rows, row_start + kRowsPerWorker);
+    tasks[i] = new FullyConnectedAsGEMVWorkerTask(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, row_start, row_end);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_rows);
+  gemm_context->workers_pool()->Execute(tasks);
 }
 #endif  // USE_NEON
 
@@ -1053,14 +1188,16 @@ inline void FullyConnected(
   const int filter_dim_count = filter_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
 #ifdef USE_NEON
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
-  if (batches == 1 && !(output_size % 4)) {
-    return FullyConnectedAsGEMV(
-        input_shape, input_data, input_offset, filter_shape, filter_data,
-        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max,
-        output_shape, output_data);
+  if (batches == 1) {
+    const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
+                                        output_shape, output_dim_count - 1);
+    if (output_size >= 4) {
+      return FullyConnectedAsGEMV(
+          input_shape, input_data, input_offset, filter_shape, filter_data,
+          filter_offset, bias_shape, bias_data, output_offset,
+          output_multiplier, output_shift, output_activation_min,
+          output_activation_max, output_shape, output_data, gemm_context);
+    }
   }
 #endif  // USE_NEON
   const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
@@ -1616,6 +1753,222 @@ inline void ShuffledFullyConnected(
   gemm_context->workers_pool()->Execute(tasks);
 }
 
+inline void MeanImpl(const tflite::MeanParams& op_params,
+                     const RuntimeShape& input_shape, const uint8_t* input_data,
+                     int32 input_zero_point, float input_scale,
+                     const RuntimeShape& output_shape, uint8_t* output_data,
+                     int32 output_zero_point, float output_scale,
+                     int start_depth, int end_depth) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8/MeanImpl");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(2);
+  const int output_width = output_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const float num_elements_in_axis = input_width * input_height;
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  const bool ordinary_mean =
+      (input_zero_point == output_zero_point && input_scale == output_scale);
+  float scale, bias;
+  if (!ordinary_mean) {
+    scale = input_scale / output_scale;
+    bias = -input_zero_point * scale + 0.5;
+  }
+
+#ifdef USE_NEON
+  const float32x4_t num_elements_dup = vdupq_n_f32(num_elements_in_axis);
+  // This is only an approximation as NEON does not offer division instruction.
+  const float32x4_t num_elements_reverse = vrecpeq_f32(num_elements_dup);
+  const float32x4_t kRounding = vdupq_n_f32(0.5);
+  float32x4_t bias_dup;
+  float32x4_t output_zero_point_dup;
+  if (!ordinary_mean) {
+    bias_dup = vdupq_n_f32(bias);
+    output_zero_point_dup = vdupq_n_f32(output_zero_point);
+  }
+#endif
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    int out_d = start_depth;
+#ifdef USE_NEON
+
+    for (; out_d < end_depth - 8; out_d += 8) {
+      float32x4_t temp_sum_1 = vdupq_n_f32(0);
+      float32x4_t temp_sum_2 = vdupq_n_f32(0);
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          const uint8_t* input_data_ptr =
+              input_data + Offset(input_shape, out_b, in_h, in_w, out_d);
+          uint8x8_t input_data_val = vld1_u8(input_data_ptr);
+          int16x8_t input_data_val_shift =
+              vreinterpretq_s16_u16(vmovl_u8(input_data_val));
+          float32x4_t input_float_1 =
+              vcvtq_f32_s32(vmovl_s16(vget_high_s16(input_data_val_shift)));
+          float32x4_t input_float_2 =
+              vcvtq_f32_s32(vmovl_s16(vget_low_s16(input_data_val_shift)));
+          temp_sum_1 = vaddq_f32(temp_sum_1, input_float_1);
+          temp_sum_2 = vaddq_f32(temp_sum_2, input_float_2);
+        }
+      }
+
+      float32x4_t mean_1 = vmulq_f32(temp_sum_1, num_elements_reverse);
+      float32x4_t mean_2 = vmulq_f32(temp_sum_2, num_elements_reverse);
+
+      if (!ordinary_mean) {
+        // maq is not supported, break down into two ops.
+        mean_1 = vmulq_n_f32(mean_1, scale);
+        mean_1 = vaddq_f32(mean_1, bias_dup);
+        mean_2 = vmulq_n_f32(mean_2, scale);
+        mean_2 = vaddq_f32(mean_2, bias_dup);
+      }
+
+      if (!ordinary_mean) {
+        mean_1 = vaddq_f32(mean_1, output_zero_point_dup);
+        mean_2 = vaddq_f32(mean_2, output_zero_point_dup);
+      }
+
+      // Rounding.
+      mean_1 = vaddq_f32(mean_1, kRounding);
+      mean_2 = vaddq_f32(mean_2, kRounding);
+      uint32x4_t casted_mean_1 = vcvtq_u32_f32(mean_1);
+      uint16x4_t narrow_range_mean_1 = vmovn_u32(casted_mean_1);
+      uint32x4_t casted_mean_2 = vcvtq_u32_f32(mean_2);
+      uint16x4_t narrow_range_mean_2 = vmovn_u32(casted_mean_2);
+      uint16x8_t combined_mean =
+          vcombine_u16(narrow_range_mean_2, narrow_range_mean_1);
+      uint8x8_t narrowed_combined_mean = vmovn_u16(combined_mean);
+      uint8_t* output_data_ptr =
+          output_data + Offset(output_shape, out_b, 0, 0, out_d);
+      vst1_u8(output_data_ptr, narrowed_combined_mean);
+    }
+#endif
+
+    for (; out_d < end_depth; ++out_d) {
+      float temp_value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          temp_value +=
+              input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+
+      temp_value = temp_value / num_elements_in_axis;
+      if (ordinary_mean) {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value));
+      } else {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value * scale + bias)) +
+            output_zero_point;
+      }
+    }
+  }
+}
+
+struct MeanWorkerTask : public gemmlowp::Task {
+  MeanWorkerTask(const tflite::MeanParams& op_params,
+                 const RuntimeShape& input_shape, const uint8_t* input_data,
+                 int32 input_zero_point, float input_scale,
+                 const RuntimeShape& output_shape, uint8_t* output_data,
+                 int32 output_zero_point, float output_scale, int start_height,
+                 int end_height)
+      : op_params_(op_params),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        input_zero_point_(input_zero_point),
+        input_scale_(input_scale),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        output_zero_point_(output_zero_point),
+        output_scale_(output_scale),
+        start_height_(start_height),
+        end_height_(end_height) {}
+
+  void Run() override {
+    MeanImpl(op_params_, input_shape_, input_data_, input_zero_point_,
+             input_scale_, output_shape_, output_data_, output_zero_point_,
+             output_scale_, start_height_, end_height_);
+  }
+
+ private:
+  const tflite::MeanParams& op_params_;
+  const RuntimeShape& input_shape_;
+  const uint8_t* input_data_;
+  int32 input_zero_point_;
+  float input_scale_;
+  const RuntimeShape& output_shape_;
+  uint8_t* output_data_;
+  int32 output_zero_point_;
+  float output_scale_;
+  int start_height_;
+  int end_height_;
+  gemmlowp::GemmContext* gemm_context_;
+};
+
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const uint8_t* input_data, int32 input_zero_point,
+                 float input_scale, const RuntimeShape& unextended_output_shape,
+                 uint8_t* output_data, int32 output_zero_point,
+                 float output_scale, gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  constexpr int kMinDepthPerThread = 8;
+  int thread_count = output_depth / kMinDepthPerThread;
+  thread_count = thread_count > 0 ? thread_count : 1;
+  const int capped_thread_count =
+      std::min(thread_count, gemm_context->max_num_threads());
+
+  if (thread_count == 1) {
+    MeanImpl(op_params, input_shape, input_data, input_zero_point, input_scale,
+             output_shape, output_data, output_zero_point, output_scale, 0,
+             output_depth);
+  } else {
+    // Instead parrallel for batch, we loop for the output_depth since batch
+    // is typical 1.
+    std::vector<gemmlowp::Task*> tasks(capped_thread_count);
+    int depth_start = 0;
+    for (int i = 0; i < capped_thread_count; ++i) {
+      // Try to distribute the tasks as even as possible.
+      int depth_end = depth_start +
+                      (output_depth - depth_start) / (capped_thread_count - i);
+      tasks[i] = new MeanWorkerTask(op_params, input_shape, input_data,
+                                    input_zero_point, input_scale, output_shape,
+                                    output_data, output_zero_point,
+                                    output_scale, depth_start, depth_end);
+      depth_start = depth_end;
+    }
+    gemm_context->workers_pool()->Execute(tasks);
+  }
+}
+
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
                                          int h, int b, int kheight, int kwidth,
@@ -2083,6 +2436,21 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
   TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+#ifdef USE_NEON
+  if (gemm_input_cols == 1 && output_rows >= 4) {
+    RuntimeShape fc_filter_shape{
+        filter_shape.Dims(0),
+        filter_shape.Dims(filter_shape.DimensionsCount() - 1)};
+
+    return FullyConnectedAsGEMV(
+        *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape,
+        filter_data, filter_offset, bias_shape, bias_data, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_shape, output_data, gemm_context);
+  }
+#endif
+
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
       filter_data, filter_rows, filter_cols);
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
@@ -2215,55 +2583,6 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
   }
 }
 
-inline void GetInvSqrtQuantizedMultiplierExp(int32 input,
-                                             int32* output_inv_sqrt,
-                                             int* output_shift) {
-  *output_shift = 11;
-  while (input >= (1 << 29)) {
-    input /= 4;
-    ++*output_shift;
-  }
-  TFLITE_DCHECK_GT(input, 0);
-  const unsigned max_left_shift_bits =
-      CountLeadingZeros(static_cast<uint32>(input)) - 1;
-  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
-  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
-  *output_shift -= left_shift_bit_pairs;
-  input <<= 2 * left_shift_bit_pairs;
-  TFLITE_DCHECK_GE(input, (1 << 27));
-  TFLITE_DCHECK_LT(input, (1 << 29));
-  using gemmlowp::FixedPoint;
-  using gemmlowp::Rescale;
-  using gemmlowp::SaturatingRoundingMultiplyByPOT;
-  // Using 3 integer bits gives us enough room for the internal arithmetic in
-  // this Newton-Raphson iteration.
-  using F3 = FixedPoint<int32, 3>;
-  using F0 = FixedPoint<int32, 0>;
-  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
-  const F3 fixedpoint_half_input =
-      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
-  const F3 fixedpoint_half_three =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
-  // Newton-Raphson iteration
-  // Naive unoptimized starting guess: x = 1
-  F3 x = F3::One();
-  // Naive unoptimized number of iterations: 5
-  for (int i = 0; i < 5; i++) {
-    const F3 x3 = Rescale<3>(x * x * x);
-    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
-  }
-  const F0 fixedpoint_half_sqrt_2 =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
-  x = x * fixedpoint_half_sqrt_2;
-  *output_inv_sqrt = x.raw();
-  if (*output_shift < 0) {
-    *output_inv_sqrt <<= -*output_shift;
-    *output_shift = 0;
-  }
-  // Convert right shift (right is positive) to left shift.
-  *output_shift *= kReverseShift;
-}
-
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const uint8* input_data,
@@ -2285,8 +2604,8 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
     }
     int32 inv_l2norm_multiplier;
     int inv_l2norm_shift;
-    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, &inv_l2norm_multiplier,
-                                     &inv_l2norm_shift);
+    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
+                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
 
     for (int c = 0; c < depth; c++) {
       int32 diff = *input_data - input_zero_point;
@@ -2678,7 +2997,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
     // dimension.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -2707,7 +3026,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
     // for y4 == 1 and the loop over y3 is contained within the
     // AddScalarBroadcast function.
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3064,7 +3383,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
   int y4 = params.broadcast_shape[4];
   if (y4 > 1) {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3081,7 +3400,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
     }
   } else {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3709,6 +4028,14 @@ inline void AveragePool(const PoolParams& params,
                         const uint8* input_data,
                         const RuntimeShape& output_shape, uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3721,69 +4048,76 @@ inline void AveragePool(const PoolParams& params,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint16 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        const int filter_count =
-            (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint16 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          const int filter_count =
+              (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint16x8_t acc_reg[2];
-              for (int i = 0; i < 2; i++) {
-                acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint16x8_t acc_reg[2];
+                for (int i = 0; i < 2; i++) {
+                  acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+                }
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+                acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+                for (int i = 0; i < 2; i++) {
+                  vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+                }
               }
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
-              acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
-              for (int i = 0; i < 2; i++) {
-                vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint16x8_t acc_reg = vld1q_u16(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vaddw_u8(acc_reg, input_reg);
+                vst1q_u16(acc + channel, acc_reg);
               }
-            }
-            for (; channel <= depth - 8; channel += 8) {
-              uint16x8_t acc_reg = vld1q_u16(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vaddw_u8(acc_reg, input_reg);
-              vst1q_u16(acc + channel, acc_reg);
-            }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] += *input_row_ptr++;
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] += *input_channel_ptr++;
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
 #define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                               \
   if (filter_count == FILTER_COUNT) {                                   \
-    for (; channel <= depth - 8; channel += 8) {                        \
+    for (; channel <= tranche_depth - 8; channel += 8) {                \
       uint16 buf[8];                                                    \
       for (int i = 0; i < 8; i++) {                                     \
         buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT;  \
@@ -3794,25 +4128,26 @@ inline void AveragePool(const PoolParams& params,
       vst1_u8(output_ptr + channel, buf8);                              \
     }                                                                   \
   }
-        AVGPOOL_DIVIDING_BY(9)
-        AVGPOOL_DIVIDING_BY(15)
+          AVGPOOL_DIVIDING_BY(9)
+          AVGPOOL_DIVIDING_BY(15)
 #undef AVGPOOL_DIVIDING_BY
-        for (; channel <= depth - 8; channel += 8) {
-          uint16 buf[8];
-          for (int i = 0; i < 8; i++) {
-            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint16 buf[8];
+            for (int i = 0; i < 8; i++) {
+              buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+            }
+            uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+            buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+            buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, buf8);
           }
-          uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
-          buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
-          buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, buf8);
-        }
 #endif
-        for (; channel < depth; ++channel) {
-          uint16 a = (acc[channel] + filter_count / 2) / filter_count;
-          a = std::max<uint16>(a, params.quantized_activation_min);
-          a = std::min<uint16>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+            a = std::max<uint16>(a, params.quantized_activation_min);
+            a = std::min<uint16>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -3877,6 +4212,14 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                     const uint8* input_data, const RuntimeShape& output_shape,
                     uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3889,77 +4232,85 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint8 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint8 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint8x16_t acc_reg = vld1q_u8(acc + channel);
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg = vmaxq_u8(acc_reg, input_reg);
-              vst1q_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint8x16_t acc_reg = vld1q_u8(acc + channel);
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg = vmaxq_u8(acc_reg, input_reg);
+                vst1q_u8(acc + channel, acc_reg);
+              }
 
-            for (; channel <= depth - 8; channel += 8) {
-              uint8x8_t acc_reg = vld1_u8(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vmax_u8(acc_reg, input_reg);
-              vst1_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint8x8_t acc_reg = vld1_u8(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vmax_u8(acc_reg, input_reg);
+                vst1_u8(acc + channel, acc_reg);
+              }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] = std::max(acc[channel], *input_row_ptr++);
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] = std::max(acc[channel], *input_channel_ptr++);
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
-        for (; channel <= depth - 16; channel += 16) {
-          uint8x16_t a = vld1q_u8(acc + channel);
-          a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
-          a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
-          vst1q_u8(output_ptr + channel, a);
-        }
-        for (; channel <= depth - 8; channel += 8) {
-          uint8x8_t a = vld1_u8(acc + channel);
-          a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
-          a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, a);
-        }
+          for (; channel <= tranche_depth - 16; channel += 16) {
+            uint8x16_t a = vld1q_u8(acc + channel);
+            a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
+            a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
+            vst1q_u8(output_ptr + channel, a);
+          }
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint8x8_t a = vld1_u8(acc + channel);
+            a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
+            a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, a);
+          }
 #endif
-        for (; channel < depth; ++channel) {
-          uint8 a = acc[channel];
-          a = std::max<uint8>(a, params.quantized_activation_min);
-          a = std::min<uint8>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint8 a = acc[channel];
+            a = std::max<uint8>(a, params.quantized_activation_min);
+            a = std::min<uint8>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -4345,119 +4696,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
-  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 // Currently just a copy of the reference code.
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
@@ -5052,6 +5290,14 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   output_map.array() = Eigen::floor(input_map.array());
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Ceil");
+  auto input_map = MapAsVector(input_data, input_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  output_map.array() = Eigen::ceil(input_map.array());
+}
+
 #ifdef USE_NEON
 inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                  float scale, float* output_ptr) {
diff --git a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
index 8f52ef131dedf4d0270c0346b1094add57f52dfc..00b2d7e063254e2941fd3453f15dbaf2dbd4451e 100644
--- a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -54,6 +54,25 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 002444b6810925910a651dd5c919a46ac8e5fb47..c38f37416dde30cf16a41d6cc6f08dc40f3dfe7d 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -23,90 +23,173 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
+
+// Used in tests and template parameters to control which version of depthwise
+// convolution is called. Primarily for reference code, and specializations
+// forced in tests.
+enum class DepthwiseConvImplementation {
+  // Run all tests against kUseStandardEntry even if also testing another
+  // kernel, since we need to be sure that the main DepthwiseConv() function in
+  // optimized_ops.h dispatches to a correctly-executing kernel.
+  kNone = 0,                 // The "default" option: use the normal
+                             // DepthwiseConv kernel (entry) function.
+  kUseGenericKernel,         // Forced use of generic kernel.
+  kUseNeon3x3,               // 3x3 kernel that uses NEON when available.
+  kUseNeon3x3DotProduct,     // 3x3 kernel that uses dot-product enabled NEON
+                             // when available.
+  kUseCModel3x3DotProduct,   // 3x3 kernel, reference C model that is intended
+                             // to match overall design NEON code.
+  kUseUnwound3x3DotProduct,  // 3x3 kernel, reference C model with unwound loops
+                             // and some arrays.
+  kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
+};
+
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication {
+  kNoMultiplication = 0,  // Depth multiplier = 1.
+  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
+};
+
 namespace reference_ops {
+namespace depthwise_conv {
 
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
+template <DepthwiseConvOutputRounding output_rounding>
+inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
+                                int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(const DepthwiseParams& params,
+                         const RuntimeShape& input_shape,
+                         const uint8* input_data,
+                         const RuntimeShape& filter_shape,
+                         const uint8* filter_data,
+                         const RuntimeShape& bias_shape, const int32* bias_data,
+                         const RuntimeShape& output_shape, uint8* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 input_offset = params.input_offset;
+    const int32 filter_offset = params.weights_offset;
+    const int32 output_offset = params.output_offset;
+    const int32 output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32 acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32 input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32 filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
                 }
               }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8>(acc);
             }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
           }
         }
       }
     }
   }
+};
+
+}  // namespace depthwise_conv
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  return depthwise_conv::DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
 }
 
-}  // end namespace reference_ops
+}  // namespace reference_ops
 }  // end namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
new file mode 100644
index 0000000000000000000000000000000000000000..a694ba2aaa993b0631958e0b338a7a62e154de75
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -0,0 +1,144 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+
+#include <limits>
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int8");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32_t shifted_input2_val =
+              input2_val * (1 << params.left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
+          const int32_t clamped_output =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ad2a70b31c23cb9e316b88d7b53f6c065675e9f
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -0,0 +1,128 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+inline void ConvPerChannel(
+    const ConvParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8* output_data) {
+  // Get parameters.
+  const int32 input_offset = params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32 output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
+  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+
+  // Sanity check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32 acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8, even though it
+                  // is represented using int32.
+                  // int32 += int8 * (int8 - int8) so the highest value we can
+                  // get from each accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val - input_offset);
+                }
+              }
+            }
+          }
+
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6195c0da93a47c3034a50a167761ec8b5850c44
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -0,0 +1,125 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 input_offset = params.input_offset;
+  const int32 output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
+  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarentees that the input_offset is a int8, even though it
+                  // is represented using int32.
+                  // int32 += int8 * (int8 - int8) so the highest value we can
+                  // get from each accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val - input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] = static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
new file mode 100644
index 0000000000000000000000000000000000000000..36b349f4d49bb66d6f60ed92629945640e24cb9c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32 input_val = input_data[b * accum_depth + d];
+        int32 filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h b/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e7c7f317602d78a661500049eab736207aafcd9
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h
@@ -0,0 +1,65 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
+                            int32_t depth, const int8* input_data,
+                            int8* output_data) {
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  // The output scale must be in sync with Prepare().
+  // Output is in 1/128 scale so the actual output range is nudged from [-1, 1]
+  // to [-1, 127/128].
+  static constexpr int32_t kOutputScale = 7;
+  for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
+    // int32 = (int8 - int8) ^ 2.
+    // ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
+    // safe from overflowing in at least 2^16 steps.
+    int32_t acc = 0;
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input =
+          input_data[depth * outer_index + inner_index] - input_zero_point;
+      acc += input * input;
+    }
+    int32_t inv_l2norm_multiplier;
+    int inv_l2norm_shift;
+    GetInvSqrtQuantizedMultiplierExp(acc, /*reverse_shift*/ -1,
+                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
+
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input =
+          input_data[depth * outer_index + inner_index] - input_zero_point;
+
+      // Rescale and downcast. Rescale is folded into the division.
+      int32_t output_in_q24 = MultiplyByQuantizedMultiplier(
+          input, inv_l2norm_multiplier, inv_l2norm_shift + kOutputScale);
+      output_in_q24 =
+          std::min(static_cast<int32_t>(kMaxInt8),
+                   std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
+      output_data[depth * outer_index + inner_index] =
+          static_cast<int8>(output_in_q24);
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_L2NORMALIZATION_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..f22bb4f13803cf4e14c8b4fd18b9c301fab07359
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
@@ -0,0 +1,111 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void LogSoftmax(int32_t input_multiplier, int32_t input_shift,
+                       int32_t reverse_multiplier, int32_t reverse_shift,
+                       int32_t diff_min, int32_t outer_size, int32_t depth,
+                       const int8* input_data, int8* output_data) {
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
+
+  // [-16, 0] is mapped to [-128, 127] with 1/16 as scale and 127 as zero
+  // point. This nudges the output to [-255/16, 0].
+  static constexpr int32_t kOutputZeroPoint = 127;
+
+  // All IntegerBits must agree with Prepare function.
+  // Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
+  static constexpr int kInputIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using F5 = gemmlowp::FixedPoint<int32, kInputIntegerBits>;
+  using F12 = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+
+  for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
+    int8 max_in_row = kMinInt8;
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      max_in_row =
+          std::max(max_in_row, input_data[outer_index * depth + inner_index]);
+    }
+
+    // Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
+    F12 sum_of_exps_in_q12 = F12::FromRaw(0);
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+        sum_of_exps_in_q12 =
+            sum_of_exps_in_q12 +
+            gemmlowp::Rescale<kAccumulationIntegerBits>(
+                exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
+      }
+    }
+
+    const int32_t log_sum_of_exps_in_q5 =
+        log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
+            sum_of_exps_in_q12)
+            .raw();
+
+    // Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
+    // smallest representable in Q5.26 plus the log_sum_of_exps.
+    const int32_t shifted_log_sum_of_exps_in_q5 =
+        log_sum_of_exps_in_q5 + kMinInt32;
+    const int32_t adjusted_diff_min = std::max(
+        diff_min - 1,
+        MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
+                                      reverse_multiplier, -reverse_shift));
+
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      // Note use of > below instead of >= above.
+      if (input_diff > adjusted_diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+
+        // Rescale and downcast.
+        int32_t output_in_q27 =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_in_q5 - log_sum_of_exps_in_q5),
+                31 - kInputIntegerBits - kOutputIntegerBits) +
+            kOutputZeroPoint;
+
+        output_in_q27 =
+            std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxInt8)),
+                     static_cast<int32_t>(kMinInt8));
+        output_data[outer_index * depth + inner_index] =
+            static_cast<int8_t>(output_in_q27);
+      } else {
+        output_data[outer_index * depth + inner_index] = kMinInt8;
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
new file mode 100644
index 0000000000000000000000000000000000000000..8277c3b3d565d845da4cc8931a4256c1005db77c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -0,0 +1,64 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
+                     int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int8_t* input_data,
+                     int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputIntegerBits = 8;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kOutputZeroPoint = -128;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
+          input, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+      const int32_t output_in_q0 =
+          gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q23 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
+      output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
+                                        static_cast<int32_t>(kMinInt8)),
+                               static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q23);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h
new file mode 100644
index 0000000000000000000000000000000000000000..72885d1b467ba57ec163876b1a1f271d879d36c7
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h
@@ -0,0 +1,76 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
+                 int32_t shift, const RuntimeShape& unextended_input_shape,
+                 const int8_t* input_data, int32 input_zero_point,
+                 const RuntimeShape& unextended_output_shape,
+                 int8_t* output_data, int32 output_zero_point) {
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int num_elements_in_axis = input_width * input_height;
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  static constexpr int32_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int32_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      int32 acc = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
+                 input_zero_point;
+        }
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
+      acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
+                    : (acc - num_elements_in_axis / 2) / num_elements_in_axis;
+      acc += output_zero_point;
+      acc = std::min(std::max(acc, kMinInt8), kMaxInt8);
+      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+          static_cast<int8_t>(acc);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e33d089945a2907e489c51c117eec77b194ed7e
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+// Mul with 16 bit inputs and int8_t outputs.
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Int8");
+  int32 output_offset = params.output_offset;
+  int32 output_activation_min = params.quantized_activation_min;
+  int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
index 22750bc91a856b360459fbf9b5ed0519e4ac6c88..2762bec8e6c3c8d69198456cbd16b04dc45ef2ab 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
 
+#include <limits>
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
@@ -77,6 +78,63 @@ inline void AveragePool(const PoolParams& params,
   }
 }
 
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const int8* input_data, const RuntimeShape& output_shape,
+                    int8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min,
+                   std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(params.quantized_activation_max,
+                   std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int8_t max = std::numeric_limits<int8_t>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<int8_t>(max, params.quantized_activation_min);
+          max = std::min<int8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8_t>(max);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_integer_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
index 3f6bf1cb73e40b2bc396a59f5b47cefaea071d02..892b38630a64e6c56b3216b74e9210b66fc8fd24 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
@@ -85,9 +85,8 @@ inline void Softmax(const SoftmaxParams& params,
             (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
         const int32 shifted_output = unsat_output - 128;
 
-        output_data[i * depth + c] = static_cast<int8>(
-            std::max(std::min(shifted_output, static_cast<int32>(127)),
-                     static_cast<int32>(-128)));
+        output_data[i * depth + c] =
+            static_cast<int8>(std::max(std::min(shifted_output, 127), -128));
 
       } else {
         output_data[i * depth + c] = -128;
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
new file mode 100644
index 0000000000000000000000000000000000000000..081928bc88d9c59e15b5ed857daf4a144abe2ad7
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
+                 int32_t input_multiplier, int32_t input_shift,
+                 int32_t input_size, const int8_t* input_data,
+                 int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputScale = 7;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 =
+          MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
+      const int32_t output_in_q0 =
+          gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q24 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
+      output_in_q24 =
+          std::min(std::max(output_in_q24, static_cast<int32_t>(kMinInt8)),
+                   static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q24);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index 380fc8f98ebbdd90bb68144a46903640734bff08..390bf08e30300625471f8fe0bfceac21fc43756d 100644
--- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -2033,7 +2034,16 @@ template <typename T1, typename T2, typename T3>
 void ArgMax(const T3* axis, const T1* input_data,
             const tflite::Dims<4>& input_dims, T2* output_data,
             const tflite::Dims<4>& output_dims) {
-  ArgMinMax(DimsToShape(input_dims), input_data, axis, DimsToShape(output_dims),
+  // Assumes the input always has 4 dimensions, and therefore,
+  // output always has three dimensions.
+  auto output_shape = RuntimeShape(
+      {output_dims.sizes[2], output_dims.sizes[1], output_dims.sizes[0]});
+  // Another way to interpret this is that output_dims.sizes[4] is always 1.
+  TFLITE_DCHECK_EQ(output_shape.FlatSize(),
+                   DimsToShape(output_dims).FlatSize());
+  // Legacy path only supported this.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+  ArgMinMax(DimsToShape(input_dims), input_data, axis, output_shape,
             output_data, std::greater<T1>());
 }
 
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index 1acf0caad0db8481965fcba0bc1fafb41bd23f47..f5c4b78dc1429f45e477ecc9528e976aeda2ab1f 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
@@ -109,6 +110,73 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
   }    // for batch
 }
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        const float* vector_in_batch = vector + b * m_cols;
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+          }
+        }
+      }
+      *result_in_batch += dot_prod;
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const int8_t* vector_block_ptr = vectors + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dotprod += (*row_ptr++) * (*vector_block_ptr++);
+          }  // for block
+        }
+      }
+      *result += (dotprod * batch_scaling_factor);
+    }  // for row
+  }    // for batch
+}
+
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result) {
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index a06ebc1600d4fe47cf054b4e157bc21a5f70ddfc..49b59da0bbaf7aec6ba1b66b499df8d5426f5951 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -48,6 +48,16 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
@@ -165,6 +175,23 @@ void MatrixBatchVectorMultiplyAccumulate(
                                               result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+      result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index b6a8f3859f0aa36184304ee9d3af32d9f77d6f57..a028ab105807a4043b0b6f1d6f1d43720d5d4116 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -36,68 +36,6 @@ limitations under the License.
 
 namespace tflite {
 
-// TODO(b/77858996): Add these to gemmlowp.
-template <typename IntegerType>
-IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t sum = a64 + b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          sum)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
-}
-
-template <typename IntegerType>
-IntegerType SaturatingSub(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
-  std::int32_t a32 = a;
-  std::int32_t b32 = b;
-  std::int32_t diff = a32 - b32;
-  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff)));
-}
-
-template <>
-inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t diff = a64 - b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          diff)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingSub(a.raw(), b.raw()));
-}
-// End section to be moved to gemmlowp.
-
 namespace reference_ops {
 
 // Return true for broadcast case, false otherwise.
@@ -192,59 +130,6 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
   return true;
 }
 
-template <typename T>
-int CountLeadingZeros(T integer_input) {
-  static_assert(std::is_unsigned<T>::value,
-                "Only unsigned integer types handled.");
-  if (integer_input == 0) {
-    return std::numeric_limits<T>::digits;
-  }
-  const T one_in_leading_positive = static_cast<T>(1)
-                                    << (std::numeric_limits<T>::digits - 1);
-  int leading_zeros = 0;
-  while (integer_input < one_in_leading_positive) {
-    integer_input <<= 1;
-    ++leading_zeros;
-  }
-  return leading_zeros;
-}
-
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// If we want to leave IntegerBits fixed, then multiplication
-// by a power of two has to be saturating/rounding, not exact anymore.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const float* input_data, const RuntimeShape& filter_shape,
                  const float* filter_data, const RuntimeShape& bias_shape,
@@ -506,6 +391,15 @@ inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
   }
 }
 
+inline void Elu(const RuntimeShape& input_shape, const float* input_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
+  }
+}
+
 inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -543,16 +437,17 @@ inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+template <typename T>
 inline void ReluX(const tflite::ActivationParams& params,
-                  const RuntimeShape& input_shape, const uint8* input_data,
-                  const RuntimeShape& output_shape, uint8* output_data) {
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
-  const uint8 max_value = params.quantized_activation_max;
-  const uint8 min_value = params.quantized_activation_min;
+  const T max_value = params.quantized_activation_max;
+  const T min_value = params.quantized_activation_min;
   for (int i = 0; i < flat_size; ++i) {
-    const uint8 val = input_data[i];
-    const uint8 clamped =
+    const T val = input_data[i];
+    const T clamped =
         val > max_value ? max_value : val < min_value ? min_value : val;
     output_data[i] = clamped;
   }
@@ -594,55 +489,6 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
   }
 }
 
-inline void GetInvSqrtQuantizedMultiplierExp(int32 input,
-                                             int32* output_inv_sqrt,
-                                             int* output_shift) {
-  *output_shift = 11;
-  while (input >= (1 << 29)) {
-    input /= 4;
-    ++*output_shift;
-  }
-  TFLITE_DCHECK_GT(input, 0);
-  const unsigned max_left_shift_bits =
-      CountLeadingZeros(static_cast<uint32>(input)) - 1;
-  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
-  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
-  *output_shift -= left_shift_bit_pairs;
-  input <<= 2 * left_shift_bit_pairs;
-  TFLITE_DCHECK_GE(input, (1 << 27));
-  TFLITE_DCHECK_LT(input, (1 << 29));
-  using gemmlowp::FixedPoint;
-  using gemmlowp::Rescale;
-  using gemmlowp::SaturatingRoundingMultiplyByPOT;
-  // Using 3 integer bits gives us enough room for the internal arithmetic in
-  // this Newton-Raphson iteration.
-  using F3 = FixedPoint<int32, 3>;
-  using F0 = FixedPoint<int32, 0>;
-  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
-  const F3 fixedpoint_half_input =
-      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
-  const F3 fixedpoint_half_three =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
-  // Newton-Raphson iteration
-  // Naive unoptimized starting guess: x = 1
-  F3 x = F3::One();
-  // Naive unoptimized number of iterations: 5
-  for (int i = 0; i < 5; i++) {
-    const F3 x3 = Rescale<3>(x * x * x);
-    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
-  }
-  const F0 fixedpoint_half_sqrt_2 =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
-  x = x * fixedpoint_half_sqrt_2;
-  *output_inv_sqrt = x.raw();
-  if (*output_shift < 0) {
-    *output_inv_sqrt <<= -*output_shift;
-    *output_shift = 0;
-  }
-  // Convert right shift (right is positive) to left shift.
-  *output_shift *= kReverseShift;
-}
-
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const uint8* input_data,
@@ -662,9 +508,8 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
     }
     int32 inv_l2norm_multiplier;
     int inv_l2norm_shift;
-    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, &inv_l2norm_multiplier,
-                                     &inv_l2norm_shift);
-
+    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
+                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
     for (int c = 0; c < depth; c++) {
       int32 diff = input_data[depth * i + c] - input_zero_point;
       int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
@@ -702,6 +547,22 @@ inline void Add(const ArithmeticParams& params,
   }
 }
 
+// T is expected to be either float or int.
+template <typename T>
+inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
+                 T* const* input_data, T* output_data) {
+  // All inputs and output should have the same shape, this is checked during
+  // Prepare stage.
+  const size_t size = input_shape.FlatSize();
+  for (int i = 0; i < size; ++i) {
+    T x = 0;
+    for (int j = 0; j < num_inputs; ++j) {
+      x += input_data[j][i];
+    }
+    output_data[i] = x;
+  }
+}
+
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
 inline void AddElementwise(int size, const ArithmeticParams& params,
@@ -1688,6 +1549,54 @@ inline void SubWithActivation(const ArithmeticParams& params,
   }
 }
 
+inline void Sub16(const ArithmeticParams& params,
+                  const RuntimeShape& input1_shape, const int16_t* input1_data,
+                  const RuntimeShape& input2_shape, const int16_t* input2_data,
+                  const RuntimeShape& output_shape, int16_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Sub/Int16");
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingFlatSize(output_shape, input1_shape, input2_shape);
+  const int16 output_activation_min = params.quantized_activation_min;
+  const int16 output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
+  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+  if (input1_shift == 0) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    for (int i = 0; i < flat_size; ++i) {
+      F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+      F0 scaled_input = F0::FromRaw(
+          gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+      F0 result = SaturatingSub(input_ready_scaled, scaled_input);
+      const int16 raw_output = result.raw();
+      const int16 clamped_output = std::min(
+          output_activation_max, std::max(output_activation_min, raw_output));
+      output_data[i] = clamped_output;
+    }
+  } else {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    for (int i = 0; i < flat_size; ++i) {
+      F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+      F0 scaled_input = F0::FromRaw(
+          gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+      F0 result = SaturatingSub(scaled_input, input_ready_scaled);
+      const int16 raw_output = result.raw();
+      const int16 clamped_output = std::min(
+          output_activation_max, std::max(output_activation_min, raw_output));
+      output_data[i] = clamped_output;
+    }
+  }
+}
+
 template <typename Scalar>
 inline void Concatenation(const ConcatenationParams& params,
                           const RuntimeShape* const* input_shapes,
@@ -1837,11 +1746,17 @@ void Unpack(const UnpackParams& params, const RuntimeShape& input_shape,
   const int outputs_count = params.num_split;
 
   int outer_size = 1;
-  for (int i = 0; i < params.axis; i++) {
+  int axis = params.axis;
+  if (axis < 0) {
+    axis += dimensions;
+  }
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, dimensions);
+  for (int i = 0; i < axis; ++i) {
     outer_size *= input_shape.Dims(i);
   }
   int copy_size = 1;
-  for (int i = params.axis + 1; i < dimensions; i++) {
+  for (int i = axis + 1; i < dimensions; ++i) {
     copy_size *= input_shape.Dims(i);
   }
   TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
@@ -2681,121 +2596,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// Although currently the name of this function says that it cannot handle
-// values less than 1, in practice it can handle as low as 1/x_max, where
-// x_max is the largest representable input.  In other words, the output range
-// is symmetric.
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
                        const RuntimeShape& output_shape, uint8* output_data) {
@@ -3122,6 +2922,16 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::ceil(input_data[offset]);
+  }
+}
+
 template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
@@ -3159,6 +2969,43 @@ inline void Gather(const tflite::GatherParams& op_params,
   }
 }
 
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape& params_shape,
+                     const ParamsT* params_data,
+                     const RuntimeShape& indices_shape,
+                     const IndicesT* indices_data,
+                     const RuntimeShape& output_shape, ParamsT* output_data) {
+  gemmlowp::ScopedProfilingLabel label("GatherNd");
+
+  int n_slices = 1;
+  int slice_size = 1;
+  const int indices_dims = indices_shape.DimensionsCount();
+  const int indices_nd = indices_shape.Dims(indices_dims - 1);
+  const int params_dims = params_shape.DimensionsCount();
+  for (int i = 0; i < indices_dims - 1; ++i) {
+    n_slices *= indices_shape.Dims(i);
+  }
+  for (int i = indices_nd; i < params_dims; ++i) {
+    slice_size *= params_shape.Dims(i);
+  }
+
+  int remain_flat_size = params_shape.FlatSize();
+  std::vector<int> dims_to_count(indices_nd, 0);
+  for (int i = 0; i < indices_nd; ++i) {
+    dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+    remain_flat_size = dims_to_count[i];
+  }
+
+  for (int i = 0; i < n_slices; ++i) {
+    int from_pos = 0;
+    for (int j = 0; j < indices_nd; ++j) {
+      from_pos += indices_data[i * indices_nd + j] * dims_to_count[j];
+    }
+    std::memcpy(output_data + i * slice_size, params_data + from_pos,
+                sizeof(ParamsT) * slice_size);
+  }
+}
+
 template <typename T>
 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                            const RuntimeShape& unextended_input_shape,
@@ -3458,6 +3305,16 @@ inline void PadImageStyle(const tflite::PadParams& op_params,
       output_data);
 }
 
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const int8_t* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          int8_t* output_data) {
+  Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+      output_data);
+}
+
 template <typename P>
 inline void PadImageStyle(const tflite::PadParams& op_params,
                           const RuntimeShape& input_shape,
@@ -3787,6 +3644,65 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const uint8_t* input_data, int32 input_zero_point,
+                 float input_scale, const RuntimeShape& unextended_output_shape,
+                 uint8_t* output_data, int32 output_zero_point,
+                 float output_scale) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const float num_elements_in_axis = input_width * input_height;
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  const bool ordinary_mean =
+      (input_zero_point == output_zero_point && input_scale == output_scale);
+  float scale, bias;
+  if (!ordinary_mean) {
+    scale = input_scale / output_scale;
+    bias = -input_zero_point * scale + 0.5;
+  }
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      float temp_value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          temp_value +=
+              input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+      temp_value = temp_value / num_elements_in_axis;
+      if (ordinary_mean) {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value));
+      } else {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value * scale + bias)) +
+            output_zero_point;
+      }
+    }
+  }
+}
+
 // Computes the mean of elements across dimensions given in axis.
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
@@ -3950,11 +3866,8 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
                const T3* input2_data, const RuntimeShape& output_shape,
                T2* output_data, const Cmp& cmp) {
   gemmlowp::ScopedProfilingLabel label("ArgMinMax");
-  // For ArgMax, the number of output dimensions = (number of input dimensions -
-  // 1). For the sake of simplicity, the output dimensions are equal to the
-  // input dimensions here. We enforce the constraint that the axis dimension
-  // must always be 1.
-  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(),
+  TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
+  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
                    output_shape.DimensionsCount());
 
   int axis = input2_data[0];
@@ -3963,7 +3876,6 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   }
 
   const int axis_size = input1_shape.Dims(axis);
-  TFLITE_DCHECK_EQ(output_shape.Dims(axis), 1);
 
   int outer_size = 1;
   for (int i = 0; i < axis; ++i) {
@@ -3974,7 +3886,7 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   int inner_size = 1;
   const int dims_count = input1_shape.DimensionsCount();
   for (int i = axis + 1; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
     inner_size *= input1_shape.Dims(i);
   }
 
@@ -4419,6 +4331,34 @@ void RankOneSelect(const RuntimeShape& input_condition_shape,
   }
 }
 
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape& input_condition_shape,
+                      const D* input_condition_data, T* output_data) {
+  const size_t size = input_condition_shape.FlatSize();
+  const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+  std::vector<int> dims_to_count(cond_rank, 0);
+  int cur_flat_size = size;
+  for (int i = 0; i < cond_rank; ++i) {
+    dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+    cur_flat_size = dims_to_count[i];
+  }
+
+  int output_index = 0;
+  for (int i = 0; i < size; ++i) {
+    if (input_condition_data[i]) {
+      // Insert the coordinate of the current item (row major) into output.
+      int flat_index = i;
+      for (int j = 0; j < cond_rank; ++j) {
+        int coord_j = flat_index / dims_to_count[j];
+        output_data[output_index * cond_rank + j] = coord_j;
+        flat_index %= dims_to_count[j];
+      }
+      output_index++;
+    }
+  }
+}
+
 // For easy implementation, the indices is always a vector of size-4 vectors.
 template <typename T, typename TI>
 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
@@ -4714,6 +4654,112 @@ void Fill(const RuntimeShape& value_shape, const T* value_data,
   }
 }
 
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape& input_shape,
+             const Scalar* input_data, const RuntimeShape& output_shape,
+             Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Reverse");
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_axis = input_shape.Dims(axis);
+  for (int i = 0; i < outer_size; ++i) {
+    for (int j = 0; j < dims_at_axis; ++j) {
+      const int start_pos = (i * dims_at_axis + j) * copy_size;
+      Scalar* output_ptr = output_data + start_pos;
+      int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+      memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+template <typename Scalar, typename TS>
+void ReverseSequence(const TS* seq_lengths, const int seq_dim,
+                     const int batch_dim, const RuntimeShape& input_shape,
+                     const Scalar* input_data, const RuntimeShape& output_shape,
+                     Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("ReverseSequence");
+
+  int outer_size = 1;
+  int outer_dim = std::min(batch_dim, seq_dim);
+  int medium_dim = std::max(batch_dim, seq_dim);
+  for (int i = 0; i < outer_dim; ++i) {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int medium_size = 1;
+  for (int i = outer_dim + 1; i < medium_dim; ++i) {
+    medium_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1;
+  for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i) {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+  const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+  Scalar* output_ptr;
+  if (batch_dim > seq_dim) {
+    for (int i = 0; i < outer_size; ++i) {
+      for (int j = 0; j < dims_at_outer_dim; ++j) {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p) {
+          for (int q = 0; q < dims_at_medium_dim; ++q) {
+            const int in_pos =
+                ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar* in_ptr = input_data + in_pos;
+            int sl = seq_lengths[q] - 1;
+            if (j > sl) {
+              output_ptr = output_data + in_pos;
+            } else {
+              const int out_pos_base =
+                  (i * dims_at_outer_dim + sl - j) * medium_size;
+              const int out_pos =
+                  ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  } else if (batch_dim < seq_dim) {
+    for (int i = 0; i < outer_size; ++i) {
+      for (int j = 0; j < dims_at_outer_dim; ++j) {
+        const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        int sl = seq_lengths[j] - 1;
+        const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+        for (int p = 0; p < medium_size; ++p) {
+          for (int q = 0; q < dims_at_medium_dim; ++q) {
+            const int in_pos =
+                ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+            const Scalar* in_ptr = input_data + in_pos;
+            if (q > sl) {
+              output_ptr = output_data + in_pos;
+            } else {
+              const int out_pos =
+                  ((out_pos_base + p) * dims_at_medium_dim + sl - q) *
+                  copy_size;
+              output_ptr = output_data + out_pos;
+            }
+            memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h
index 71ae69522f9a45745a9ed9eae211db3d048ba43d..4f18f283b6094c66fb89080115d359ffce776dd8 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@@ -55,6 +55,21 @@ void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                          int n_batch, float* result,
                                          int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row and follows with column indexes of the first element
+//      of each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
 // Same as the function above, but for values quantized using symmetric
 // quantization (e.g. by calling SymmetricQuantizeFloats).
 // The passed scaling factors is a buffer of the quantization scaling factors
@@ -67,6 +82,23 @@ void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row followed by column index of the first element of
+//      each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result);
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 29866d066406e58e06e6caa2e5b410460564c966..0918c8d27727408899f13ed866dfe737202f527b 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -17,6 +17,10 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/test_util.h"
 
+#ifdef DOTPROD_BENCHMARKS
+#include "testing/base/public/benchmark.h"
+#endif  // DOTPROD_BENCHMARKS
+
 namespace tflite {
 namespace tensor_utils {
 
@@ -143,12 +147,228 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
                                                -1., 3., 7., 3., 23., 3.})));
 }
 
+struct MatrixVectorData {
+  // Contains dense parameters.
+  std::vector<int8_t> matrix;
+
+  // Like matrix, but with about half of the parameters set to zero.
+  // Use this to create golden output for sparse matrix tests.
+  std::vector<int8_t> zeroed_matrix;
+
+  // zeroed_matrix described in sparse form.
+  std::vector<int8_t> sparse_matrix;
+  std::vector<uint8_t> ledger;
+
+  std::vector<int8_t> vectors;
+  std::vector<float> scale_factors;
+  std::vector<float> results;
+
+  int rows;
+  int cols;
+  int batch;
+};
+
+MatrixVectorData SetupMatrixVectorData(int rows, int cols, int batch,
+                                       bool negative = false) {
+  MatrixVectorData data;
+  data.rows = rows;
+  data.cols = cols;
+  data.batch = batch;
+
+  for (int i = 0; i < rows * cols; i++) {
+    int sign = 1;
+    if ((i % 3) == 0 && negative) sign = -1;
+    data.matrix.push_back(sign * (i % 70));
+  }
+  for (int i = 0; i < cols * batch; i++) {
+    int sign = 1;
+    if ((i % 5) == 0 && negative) sign = -1;
+    data.vectors.push_back(sign * (i % 50));
+  }
+  data.scale_factors = {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
+  data.results.resize(rows * batch, 0);
+
+  data.zeroed_matrix = data.matrix;
+
+  // Make a sparsification ledger.
+  for (int i = 0; i < rows; i++) {
+    int max_chunks = cols / 16;
+    int selected_chunks = (max_chunks / 2);
+    bool row_is_odd = (i % 2) > 0;
+    bool max_chunks_is_odd = (max_chunks % 2) > 0;
+
+    data.ledger.push_back(selected_chunks);
+    if (max_chunks_is_odd && row_is_odd) {
+      selected_chunks++;
+    }
+
+    // In odd rows, use odd chunk indexes.
+    // In even rows, use even chunk indexes.
+    for (int j = 0; j < max_chunks; j++) {
+      const int chunk_start = i * cols + (j * 16);
+      const int chunk_end = i * cols + (j * 16) + 16;
+      if ((j % 2) == (i % 2)) {
+        // Copy this chunk into the sparse matrix.
+        data.ledger.push_back(j);
+        for (int k = chunk_start; k < chunk_end; k++) {
+          data.sparse_matrix.push_back(data.matrix[k]);
+        }
+      } else {
+        // Zero this part out of zeroed_matrix.
+        for (int k = chunk_start; k < chunk_end; k++) {
+          data.zeroed_matrix[k] = 0;
+        }
+      }
+    }
+  }
+  return data;
+}
+
+std::vector<float> TestDotprodMatrixBatchVectorMultiply(int rows, int cols,
+                                                        int batch,
+                                                        bool negative = false) {
+  MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative);
+
+  // All partial sums in this computation are small enough to fit in the
+  // mantissa of a float, and the scale factors are all integers, so we expect
+  // an exact result.
+  MatrixBatchVectorMultiplyAccumulate(
+      data.matrix.data(), rows, cols, data.vectors.data(),
+      data.scale_factors.data(), batch, &data.results[0], 1);
+  return data.results;
+}
+
+std::vector<float> TestSparseDotprodMatrixBatchVectorMultiply(
+    int rows, int cols, int batch, bool negative = false) {
+  MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      data.sparse_matrix.data(), data.ledger.data(), rows, cols,
+      data.vectors.data(), data.scale_factors.data(), batch, &data.results[0],
+      1);
+  return data.results;
+}
+
+TEST(uKernels, DotprodMatrixBatchVectorMultiplyAccumulateTest) {
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 16, 1),
+              testing::ElementsAre(1240, 3160, 5080, 7000));
+
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 32, 2),
+              testing::ElementsAre(10416, 26288, 8490, 23312, 18276, 70756,
+                                   37416, 60916));
+
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 32, 3),
+              testing::ElementsAre(10416, 26288, 8490, 23312, 18276, 70756,
+                                   37416, 60916, 52080, 142704, 55878, 125712));
+
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(8, 1024, 3),
+              testing::ElementsAreArray(
+                  {841094,  853168,  866642,  840286,  860760,  862754,
+                   843678,  872552,  1724476, 1769072, 1747588, 1738844,
+                   1758240, 1742916, 1761612, 1755808, 2506896, 2564262,
+                   2629188, 2515824, 2598390, 2569236, 2537352, 2645118}));
+
+  const bool kNegative = true;
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 64, 1, kNegative),
+              testing::ElementsAre(13696, 6904, 7764, 11806));
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(4, 32, 2, kNegative),
+      testing::ElementsAre(3436, 3522, 1590, 6972, 2516, 20520, 456, 10628));
+}
+
+TEST(uKernels, DotprodMatrixBatchFourVectorMultiplyAccumulateDotprodTest) {
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 16, 4),
+              testing::ElementsAreArray(
+                  {1240, 3160, 6320, 18352, 15240, 45576, 4200, 16232}));
+  ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 64, 4),
+              testing::ElementsAreArray({45794, 38948, 88536, 84252, 157626,
+                                         165312, 209864, 246128}));
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(2, 64, 8),
+      testing::ElementsAreArray({45794, 38948, 88536, 84252, 157626, 165312,
+                                 209864, 246128, 219700, 195550, 279684, 278928,
+                                 413616, 445662, 374896, 365952}));
+
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(4, 64, 8),
+      testing::ElementsAreArray(
+          {45794,  38948,  34622,  32816,  88536,  84252,  85008,  90804,
+           157626, 165312, 180558, 203364, 209864, 246128, 236472, 208896,
+           219700, 195550, 184000, 185050, 279684, 278928, 293292, 322776,
+           413616, 445662, 495348, 513674, 374896, 365952, 321168, 296544}));
+
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(16, 1024, 4),
+      testing::ElementsAreArray(
+          {841094,  853168,  866642,  840286,  860760,  862754,  843678,
+           872552,  837586,  851270,  877414,  834188,  863062,  857846,
+           841780,  879054,  1724476, 1769072, 1747588, 1738844, 1758240,
+           1742916, 1761612, 1755808, 1737684, 1750780, 1747356, 1754152,
+           1748348, 1753324, 1743320, 1754316, 2506896, 2564262, 2629188,
+           2515824, 2598390, 2569236, 2537352, 2645118, 2508444, 2571480,
+           2610576, 2510442, 2618208, 2566584, 2544570, 2614536, 3458904,
+           3502688, 3474792, 3505976, 3499360, 3488264, 3485848, 3512832,
+           3500616, 3482520, 3489624, 3469008, 3495992, 3524376, 3465680,
+           3526264}));
+
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(4, 128, 4),
+      testing::ElementsAreArray({87920, 80024, 92288, 103712, 228148, 224820,
+                                 233812, 213124, 271284, 271788, 332772, 328236,
+                                 419328, 431328, 411968, 417248}));
+
+  ASSERT_THAT(
+      TestDotprodMatrixBatchVectorMultiply(4, 128, 8),
+      testing::ElementsAreArray(
+          {87920,  80024,  92288,  103712, 228148, 224820, 233812, 213124,
+           271284, 271788, 332772, 328236, 419328, 431328, 411968, 417248,
+           482680, 523840, 560800, 593560, 563940, 609924, 566868, 644772,
+           743708, 857780, 818972, 823284, 708384, 695008, 730912, 872096}));
+
+  const bool kNegative = true;
+  EXPECT_THAT(TestDotprodMatrixBatchVectorMultiply(1, 16, 1, kNegative),
+              testing::ElementsAre(450));
+  EXPECT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 64, 8, kNegative),
+              testing::ElementsAreArray({13696, 6904, 9952, 12368, 22848, 61632,
+                                         40424, 46776, 57630, 38670, 62976,
+                                         49824, 39032, 71988, 60128, 148992}));
+
+  std::vector<float> results =
+      TestDotprodMatrixBatchVectorMultiply(256, 1024, 8);
+  int64_t sum = 0;
+  for (int i = 0; i < results.size(); i++) {
+    sum += static_cast<int64_t>(results[i]);
+  }
+  EXPECT_EQ(7980076336, sum);
+}
+
+TEST(uKernels, DotprodSparseMatrixBatchVectorMultiplyAccumulate) {
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 16, 1),
+              testing::ElementsAre(0));
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 32, 1),
+              testing::ElementsAre(1240));
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 1),
+              testing::ElementsAre(26544));
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 2),
+              testing::ElementsAre(26544, 24344));
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(4, 64, 4),
+              testing::ElementsAreArray(
+                  {26544, 15866, 22140, 11408, 24344, 53248, 42704, 39900,
+                   48000, 94146, 101892, 81876, 87712, 105160, 148304, 75936}));
+
+  const bool kNegative = true;
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 1, kNegative),
+              testing::ElementsAre(8764));
+  EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(2, 64, 2, kNegative),
+              testing::ElementsAre(8764, 5196, 7204, 11148));
+}
+
 #ifdef __ANDROID__
 TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   // Note we use 29 columns as this exercises all the neon kernel: the
   // 16-block SIMD code, the 8-block postamble, and the leftover postamble.
   const int a_rows = 4, a_cols = 29;
   const int kWeightsPerUint32 = 4;
+  /* clang-format off */
   const float a_float_data[] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
@@ -174,126 +394,18 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min,
                           &a_max, &scaling_factor_a);
   const int8_t expected_a_int8_data[] = {
-      /* 1st row */
-      5,
-      10,
-      15,
-      20,
-      25,
-      30,
-      35,
-      40,
-      44,
-      45,
-      50,
-      54,
-      59,
-      64,
-      68,
-      73,
-      77,
-      82,
-      86,
-      91,
-      95,
-      100,
-      104,
-      109,
-      113,
-      118,
-      122,
-      127,
-      0,
-      /* 2nd row */
-      -5,
-      -10,
-      -15,
-      -20,
-      -25,
-      -30,
-      -35,
-      -40,
-      -44,
-      -45,
-      -50,
-      -54,
-      -59,
-      -64,
-      -68,
-      -73,
-      -77,
-      -82,
-      -86,
-      -91,
-      -95,
-      -100,
-      -104,
-      -109,
-      -113,
-      -118,
-      -122,
-      -127,
-      0,
-      /* 3rd row */
-      5,
-      -10,
-      15,
-      -20,
-      25,
-      -30,
-      35,
-      -40,
-      44,
-      -45,
-      50,
-      -54,
-      59,
-      -64,
-      68,
-      -73,
-      77,
-      -82,
-      86,
-      -91,
-      95,
-      -100,
-      104,
-      -109,
-      113,
-      -118,
-      122,
-      -127,
-      0,
-      /* 4th row */
-      -5,
-      10,
-      -15,
-      20,
-      -25,
-      30,
-      -35,
-      40,
-      -44,
-      45,
-      -50,
-      54,
-      -59,
-      64,
-      -68,
-      73,
-      -77,
-      82,
-      -86,
-      91,
-      -95,
-      100,
-      -104,
-      109,
-      -113,
-      118,
-      -122,
-      127,
-      0,
+    /* 1st row */
+    5, 10, 15, 20, 25, 30, 35, 40, 44, 45, 50, 54, 59, 64, 68, 73, 77, 82, 86,
+    91, 95, 100, 104, 109, 113, 118, 122, 127, 0,
+    /* 2nd row */
+    -5, -10, -15, -20, -25, -30, -35, -40, -44, -45, -50, -54, -59, -64, -68,
+    -73, -77, -82, -86, -91, -95, -100, -104, -109, -113, -118, -122, -127, 0,
+    /* 3rd row */
+    5, -10, 15, -20, 25, -30, 35, -40, 44, -45, 50, -54, 59, -64, 68, -73, 77,
+    -82, 86, -91, 95, -100, 104, -109, 113, -118, 122, -127, 0,
+    /* 4th row */
+    -5, 10, -15, 20, -25, 30, -35, 40, -44, 45, -50, 54, -59, 64, -68, 73, -77,
+    82, -86, 91, -95, 100, -104, 109, -113, 118, -122, 127, 0,
   };
   for (int i = 0; i < a_rows * a_cols; ++i) {
     EXPECT_EQ(expected_a_int8_data[i], a_int8_data[i]);
@@ -301,66 +413,14 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 
   const int b_rows = 29, b_cols = 1, batches = 2;
   const float b_float_data[] = {
-      /* batch 1 */
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      /* batch 2 */
-      2.5,
-      -2.1,
-      3.0,
-      -1.3,
-      1.3,
-      -1.1,
-      2.0,
-      -1.7,
-      1.9,
-      -1.5,
-      0.5,
-      -0.7,
-      0.8,
-      -0.3,
-      2.8,
-      -2.8,
-      1.1,
-      -2.3,
-      1.9,
-      -1.9,
-      2.1,
-      -0.5,
-      2.4,
-      -0.1,
-      1.0,
-      -2.5,
-      0.7,
-      -1.9,
-      0.2,
+    /* batch 1 */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0,
+    /* batch 2 */
+    2.5, -2.1, 3.0, -1.3, 1.3, -1.1, 2.0, -1.7, 1.9, -1.5, 0.5, -0.7, 0.8, -0.3,
+    2.8, -2.8, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5, 0.7, -1.9,
+    0.2,
   };
 
   // Quantized values of B:
@@ -374,67 +434,15 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
                           &scaling_factor_b[1]);
 
   const int8_t expected_b_int8_data[] = {
-      /* batch 1 */
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      /* batch 2 */
-      106,
-      -89,
-      127,
-      -55,
-      55,
-      -47,
-      85,
-      -72,
-      80,
-      -64,
-      21,
-      -30,
-      34,
-      -13,
-      119,
-      -119,
-      47,
-      -97,
-      80,
-      -80,
-      89,
-      -21,
-      102,
-      -4,
-      42,
-      -106,
-      30,
-      -80,
-      8,
+    /* batch 1 */
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127,
+    /* batch 2 */
+    106, -89, 127, -55, 55, -47, 85, -72, 80, -64, 21, -30, 34, -13, 119, -119,
+    47, -97, 80, -80, 89, -21, 102, -4, 42, -106, 30, -80, 8,
   };
+  /* clang-format on */
   for (int i = 0; i < b_rows * b_cols * batches; ++i) {
     EXPECT_EQ(expected_b_int8_data[i], b_int8_data[i]);
   }
@@ -468,6 +476,176 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 }
 #endif  // __ANDROID__
 
+TEST(uKernels, SparseMatrixBatchVectorMultiplyAccumulateTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  float matrix[kRow * kCol] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38,
+      39.39, 40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24,
+      -25.25, -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 3rd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25,
+      -26.26, 27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -33.33, 34.34, -35.35, 36.36, -37.37,
+      38.38, -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+
+  // BCSR format of the above matrix.
+  float matrix_values[] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38, 39.39,
+      40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24, -25.25,
+      -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 3rd row */
+      17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25, -26.26,
+      27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, -33.33, 34.34, -35.35, 36.36, -37.37, 38.38,
+      -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float vector[kBatch * kCol] = {
+    /* 1st batch */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    /* 2nd batch */
+    2.5, 0.0, -2.1, 0.0, 3.0, 0.0, -1.3, 0.0, 1.3, 0.0, -1.1, 0.0, 2.0, 0.0,
+    -1.7, 0.0, 1.9, 0.0, -1.5, 0.0, 0.5, 0.0, -0.7, 0.0, 0.8, 0.0, -0.3, 0.0,
+    2.8, 0.0, -2.8, 0.0, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5,
+    0.7, -1.9, 0.2, 0.0, 0.1, 0.2,
+  };
+  /* clang-format on */
+
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+                                      dense_output.data(), /*result_stride=*/1);
+
+  EXPECT_THAT(dense_output, ElementsAreArray(ArrayFloatNear(
+                                {-13.69, 6.06001, 272.7, -608.03, -9.66602,
+                                 -10.201, 10.201, -713.897949},
+                                1e-4)));
+
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      matrix_values, ledger, kRow, kCol, vector, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+
+  EXPECT_THAT(sparse_output,
+              ElementsAreArray(ArrayFloatNear(dense_output, 1e-4)));
+}
+
+#ifdef __ANDROID__
+TEST(uKernels,
+     SparseMatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  const int8_t quantized_matrix[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 98, 101, 104, 107, 110, 113, 115,
+      118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -49, -52, -55, -58, -61,
+      -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0,
+      /* 3rd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, -52, 55, -58, 61, -64,
+      66, -69, 72, -75, 78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -95, 98, -101, 104, -107, 110,
+      -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  const int8_t quantized_matrix_values[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 95, 98, 101,
+      104, 107, 110, 113, 115, 118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      -49, -52, -55, -58, -61, -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0,
+      /* 3rd row */
+      49, -52, 55, -58, 61, -64, 66, -69, 72, -75, 78, -81, 0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, -95,
+      98, -101, 104, -107, 110, -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float matrix_scaling_factor = 0.349921;
+
+  const int8_t quantized_vector[] = {
+      /* 1st batch */
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      /* 2nd batch */
+      106, 0, -89, 0, 127, 0, -55, 0, 55, 0, -47, 0, 85, 0, -72, 0, 80, 0,
+      -64, 0, 21, 0, -30, 0, 34, 0, -13, 0, 119, 0, -119, 0, 47, -97, 80, -80,
+      89, -21, 102, -4, 42, -106, 30, -80, 8, 1, 2, 3,
+  };
+  float vector_scaling_factor[2] = {0.00787402, 0.023622};
+
+  /* clang-format on */
+  float result_scaling_factor[2] = {
+      matrix_scaling_factor * vector_scaling_factor[0],
+      matrix_scaling_factor * vector_scaling_factor[1],
+  };
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(quantized_matrix, kRow, kCol,
+                                      quantized_vector, result_scaling_factor,
+                                      kBatch, dense_output.data(),
+                                      /*result_stride=*/1);
+
+  EXPECT_THAT(dense_output,
+              ElementsAreArray(ArrayFloatNear(
+                  {-13.646927, 6.298582, 272.938538, -607.813110, -6.637464,
+                   -9.381721, 9.381721, -713.845642})));
+
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      quantized_matrix_values, ledger, kRow, kCol, quantized_vector,
+      result_scaling_factor, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+
+  EXPECT_THAT(sparse_output,
+              ElementsAreArray(ArrayFloatNear(
+                  {-13.646927, 6.298582, 272.938538, -607.813110, -6.637464,
+                   -9.381721, 9.381721, -713.845642})));
+}
+#endif  // __ANDROID__
+
 TEST(uKernels, VectorVectorCwiseProductTest) {
   constexpr int kVectorSize = 10;
   static float input1[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
@@ -804,3 +982,109 @@ TEST(uKernels, MeanStddevNormalizationSmallValue) {
 
 }  // namespace tensor_utils
 }  // namespace tflite
+
+#ifdef DOTPROD_BENCHMARKS
+
+// Compile with --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" and
+// --copt="-DDOTPROD_BENCHMARKS"
+// Run with --benchmarks=all
+void BM_DotprodBatchOneMultiply(benchmark::State& state) {
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  const int batch = state.range(2);
+
+  tflite::tensor_utils::MatrixVectorData data =
+      tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
+  for (auto _ : state) {
+    for (int i = 0; i < batch; i++) {
+      tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          data.matrix.data(), data.rows, data.cols,
+          data.vectors.data() + (data.cols * i), data.scale_factors.data(), 1,
+          &data.results[0], 1);
+      testing::DoNotOptimize(data.results[2]);
+    }
+  }
+}
+BENCHMARK(BM_DotprodBatchOneMultiply)
+    ->Args({16, 16, 1})
+    ->Args({16, 16, 4})
+    ->Args({32, 32, 1})
+    ->Args({32, 32, 4})
+    ->Args({64, 64, 1})
+    ->Args({64, 64, 4})
+    ->Args({128, 128, 1})
+    ->Args({128, 128, 4})
+    ->Args({992, 992, 1})
+    ->Args({992, 992, 8})
+    ->Args({1024, 1024, 1})
+    ->Args({1024, 1024, 4})
+    ->Args({1024, 1024, 8})
+    ->Args({640, 2048, 1})
+    ->Args({640, 2048, 4})
+    ->Args({640, 2048, 8})
+    ->Args({2048, 2048, 1})
+    ->Args({2048, 2048, 8});
+
+void BM_DotprodBatchFourMultiply(benchmark::State& state) {
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  const int batch = state.range(2);
+
+  tflite::tensor_utils::MatrixVectorData data =
+      tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
+  for (auto _ : state) {
+    tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        data.matrix.data(), data.rows, data.cols, data.vectors.data(),
+        data.scale_factors.data(), data.batch, &data.results[0], 1);
+    testing::DoNotOptimize(data.results[2]);
+  }
+}
+BENCHMARK(BM_DotprodBatchFourMultiply)
+    ->Args({16, 16, 4})
+    ->Args({32, 32, 4})
+    ->Args({64, 64, 4})
+    ->Args({64, 256, 64})
+    ->Args({64, 256, 256})
+    ->Args({64, 256, 1024})
+    ->Args({64, 256, 12544})
+    ->Args({128, 128, 4})
+    ->Args({640, 640, 4})
+    ->Args({992, 992, 8})
+    ->Args({1024, 1024, 4})
+    ->Args({1024, 1024, 8})
+    ->Args({1024, 1024, 256})
+    ->Args({640, 2048, 4})
+    ->Args({640, 2048, 8})
+    ->Args({2048, 2048, 4})
+    ->Args({2048, 2048, 8});
+
+void BM_DotprodSparseMultiply(benchmark::State& state) {
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  const int batch = state.range(2);
+
+  tflite::tensor_utils::MatrixVectorData data =
+      tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
+  for (auto _ : state) {
+    tflite::tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate(
+        data.sparse_matrix.data(), data.ledger.data(), data.rows, data.cols,
+        data.vectors.data(), data.scale_factors.data(), data.batch,
+        &data.results[0], 1);
+    testing::DoNotOptimize(data.results[2]);
+  }
+}
+BENCHMARK(BM_DotprodSparseMultiply)
+    ->Args({128, 128, 1})
+    ->Args({128, 128, 4})
+    ->Args({640, 640, 4})
+    ->Args({992, 992, 8})
+    ->Args({1024, 1024, 1})
+    ->Args({1024, 1024, 4})
+    ->Args({1024, 1024, 8})
+    ->Args({640, 2048, 1})
+    ->Args({640, 2048, 4})
+    ->Args({640, 2048, 8})
+    ->Args({2048, 2048, 1})
+    ->Args({2048, 2048, 8});
+
+#endif  // DOTPROD_BENCHMARKS
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index 57f4bfa9fa29ca39aa2506a08870ef6b2d61ab09..deb484b70f029d86e85495e4f6d8ad62efcf3ed4 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -18,10 +18,91 @@ limitations under the License.
 #include <cmath>
 #include <memory>
 
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 
 namespace tflite {
 
+void GuardedQuantizeMultiplier(double effective_output_scale,
+                               int32_t* significand, int* shift) {
+  QuantizeMultiplier(effective_output_scale, significand, shift);
+  // Additional guard to make sure RoundingDivideByPOT does not fail.
+  if (*shift < -31) {
+    // If shift is less than -31, RoundingDivideByPOT fails. This happens when
+    // min and max are close and small. For this particular case, both
+    // significand and shift are set to zero.
+    *significand = 0;
+    *shift = 0;
+  }
+}
+
+TfLiteStatus PopulateConvolutionQuantizationParams(
+    TfLiteContext* context, const TfLiteTensor* input,
+    const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
+    const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
+    int32_t* output_activation_min, int32_t* output_activation_max,
+    int32_t* per_channel_multiplier, int* per_channel_shift) {
+  TF_LITE_ENSURE_EQ(context, input->quantization.type,
+                    kTfLiteAffineQuantization);
+  TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                    kTfLiteAffineQuantization);
+  // TODO(jianlijianli): Enable bias type check and bias scale == input scale
+  // * filter scale for each channel in affine quantization once bias
+  // quantization is properly populated.
+  // TF_LITE_ENSURE_EQ(context, bias->quantization.type,
+  // kTfLiteAffineQuantization);
+
+  // Check data type.
+  const auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+  TF_LITE_ENSURE(context, affine_quantization);
+  TF_LITE_ENSURE(context, affine_quantization->scale);
+  const bool is_per_channel = affine_quantization->scale->size > 1;
+  if (is_per_channel) {
+    //  Currently only Int8 is supported for per channel quantization.
+    TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(
+        context, affine_quantization->scale->size,
+        filter->dims->data[affine_quantization->quantized_dimension]);
+  }
+
+  // Populate multiplier and shift using affine quantization.
+  const int num_channels = affine_quantization->scale->size;
+  const float input_scale = input->params.scale;
+  const float output_scale = output->params.scale;
+  const float* filter_scales = affine_quantization->scale->data;
+  for (int i = 0; i < num_channels; ++i) {
+    const double filter_scale = static_cast<double>(filter_scales[i]);
+    const double effective_output_scale = static_cast<double>(input_scale) *
+                                          filter_scale /
+                                          static_cast<double>(output_scale);
+    int32_t significand;
+    int shift;
+    GuardedQuantizeMultiplier(effective_output_scale, &significand, &shift);
+    per_channel_multiplier[i] = significand;
+    per_channel_shift[i] = shift;
+  }
+
+  // Populate scalar quantization parameters.
+  // This check on legacy quantization parameters is kept only for backward
+  // compatibility.
+  if (input->type == kTfLiteUInt8) {
+    // Check bias scale == input scale * filter scale.
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+
+    // Populate quantization parameteters with multiplier and shift.
+    QuantizeMultiplier(real_multiplier, multiplier, &exponent);
+    *shift = -exponent;
+    CalculateActivationRangeUint8(activation, output, output_activation_min,
+                                  output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
 TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
                                               const TfLiteTensor* input,
                                               const TfLiteTensor* filter,
@@ -81,6 +162,9 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     qmin = std::numeric_limits<uint8_t>::min();
     qmax = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt8) {
+    qmin = std::numeric_limits<int8_t>::min();
+    qmax = std::numeric_limits<int8_t>::max();
   } else if (output->type == kTfLiteInt16) {
     qmin = std::numeric_limits<int16_t>::min();
     qmax = std::numeric_limits<int16_t>::max();
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 4cfc885f8939481f1515b445dfc9e261a4e79ed9..423832c047c6f1cf3b8427f2b4eb8fec3d70d2cb 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -84,6 +84,18 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
   }
 }
 
+// Check dimensionality match and populate OpData for Conv and DepthwiseConv.
+TfLiteStatus PopulateConvolutionQuantizationParams(
+    TfLiteContext* context, const TfLiteTensor* input,
+    const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
+    const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
+    int32_t* output_activation_min, int32_t* output_activation_max,
+    int32_t* per_channel_multiplier, int* per_channel_shift);
+
+// QuantizedMultiplier with the guard that shift will not be smaller than -31.
+void GuardedQuantizeMultiplier(double effective_output_scale,
+                               int32_t* significand, int* shift);
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 70eb18365891097686d579bde4a5457703e84aee..a31befbcd16a29cf4eb5d45602c3882138f54d09 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -28,6 +28,8 @@ class KernelUtilTest : public ::testing::Test {
   KernelUtilTest() {
     context_.ReportError = ReportError;
 
+    memset(&tensor1_, 0, sizeof(TfLiteTensor));
+    memset(&tensor2_, 0, sizeof(TfLiteTensor));
     tensor1_.dims = nullptr;
     tensor2_.dims = nullptr;
     tensor1_.allocation_type = kTfLiteMmapRo;
@@ -142,6 +144,222 @@ TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
   TfLiteIntArrayFree(output);
 }
 
+// TODO(jianlijianli): Add more test cases.
+TEST_F(KernelUtilTest, CheckAndPopulate) {
+  // Create input.
+  TfLiteTensor input;
+  input.type = kTfLiteInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {0.5, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 0.5;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter.
+  TfLiteTensor filter;
+  filter.type = kTfLiteInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {0.25, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(3);
+  filter_params->scale->data[0] = 0.25;
+  filter_params->scale->data[1] = 0.125;
+  filter_params->scale->data[2] = 0.25;
+  filter_params->zero_point = TfLiteIntArrayCreate(3);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->zero_point->data[1] = 0;
+  filter_params->zero_point->data[2] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create bias.
+  TfLiteTensor bias;
+  bias.type = kTfLiteInt32;
+  bias.allocation_type = kTfLiteArenaRw;
+  bias.dims = TfLiteIntArrayCreate(4);
+  TfLiteQuantizationParams bias_quant = {0.125, 9};
+  bias.params = bias_quant;
+  bias.quantization.type = kTfLiteAffineQuantization;
+  auto* bias_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  bias_params->scale = TfLiteFloatArrayCreate(3);
+  bias_params->scale->data[0] = 0.125;
+  bias_params->scale->data[1] = 0.0625;
+  bias_params->scale->data[2] = 0.125;
+  bias_params->zero_point = TfLiteIntArrayCreate(3);
+  bias_params->zero_point->data[0] = 11;
+  bias_params->zero_point->data[1] = 12;
+  bias_params->zero_point->data[2] = 15;
+  bias.quantization.params = reinterpret_cast<void*>(bias_params);
+
+  // Create output.
+  TfLiteTensor output;
+  output.type = kTfLiteInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {0.5, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 0.5;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters.
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(3);
+  std::vector<int> per_channel_shift(3);
+
+  // Call and verify results for per channel case.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, &bias, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier,
+              ::testing::ElementsAre(1073741824, 1073741824, 1073741824));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-1, -2, -1));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&bias);
+  TfLiteTensorFree(&output);
+}
+
+TEST_F(KernelUtilTest, CheckAndPopulateZeroValue) {
+  // Create input.
+  TfLiteTensor input;
+  input.type = kTfLiteInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {1, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 1;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter.
+  TfLiteTensor filter;
+  filter.type = kTfLiteInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {4.6566129e-10, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(3);
+  int32_t two_pow_neg_31 = 0x30000000;  // 2^-31 so shift = -30.
+  int32_t two_pow_neg_32 = 0x2F800000;  // 2^-32 so shift = -31.
+  int32_t two_pow_neg_33 = 0x2F000000;  // 2^-33 so shift = -32.
+  filter_params->scale->data[0] = *reinterpret_cast<float*>(&two_pow_neg_31);
+  filter_params->scale->data[1] = *reinterpret_cast<float*>(&two_pow_neg_32);
+  filter_params->scale->data[2] = *reinterpret_cast<float*>(&two_pow_neg_33);
+  filter_params->zero_point = TfLiteIntArrayCreate(3);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->zero_point->data[1] = 0;
+  filter_params->zero_point->data[2] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create bias.
+  TfLiteTensor bias;
+  bias.type = kTfLiteInt32;
+  bias.allocation_type = kTfLiteArenaRw;
+  bias.dims = TfLiteIntArrayCreate(4);
+  TfLiteQuantizationParams bias_quant = {4.6566129e-10, 9};
+  bias.params = bias_quant;
+  bias.quantization.type = kTfLiteAffineQuantization;
+  auto* bias_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  bias_params->scale = TfLiteFloatArrayCreate(3);
+  bias_params->scale->data[0] = 4.6566129e-10;  // 2^-31
+  bias_params->scale->data[1] = 2.3283064e-10;  // 2^-32
+  bias_params->scale->data[2] = 1.1641532e-10;  // 2^-33
+  bias_params->zero_point = TfLiteIntArrayCreate(3);
+  bias_params->zero_point->data[0] = 11;
+  bias_params->zero_point->data[1] = 12;
+  bias_params->zero_point->data[2] = 15;
+  bias.quantization.params = reinterpret_cast<void*>(bias_params);
+
+  // Create output.
+  TfLiteTensor output;
+  output.type = kTfLiteInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {1, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 1;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters.
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(3);
+  std::vector<int> per_channel_shift(3);
+
+  // Call and verify results for per channel case.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, &bias, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier,
+              ::testing::ElementsAre(1073741824, 1073741824, 0));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-30, -31, 0));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&bias);
+  TfLiteTensorFree(&output);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/l2norm.cc b/tensorflow/lite/kernels/l2norm.cc
index 19a4824e9398decec862bb7f5d20ac05b2652226..5eeda0858f4781bf9c47e3eab80c093bae1bcf70 100644
--- a/tensorflow/lite/kernels/l2norm.cc
+++ b/tensorflow/lite/kernels/l2norm.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -45,13 +46,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
-  TF_LITE_ENSURE(
-      context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 ||
+                              output->type == kTfLiteUInt8 ||
+                              output->type == kTfLiteInt8);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+    if (output->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+    }
+    if (output->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    }
   }
 
   // TODO(ahentz): For some reason our implementations don't support
@@ -97,6 +104,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_L2NORM(optimized_ops);
     }
 #undef TF_LITE_L2NORM
+  } else if (output->type == kTfLiteInt8) {
+    const auto input_shape = GetTensorShape(input);
+    const auto output_shape = GetTensorShape(output);
+    const int trailing_dim = input_shape.DimensionsCount() - 1;
+    const int depth =
+        MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+    const int outer_size =
+        MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+    reference_integer_ops::L2Normalization(input->params.zero_point, outer_size,
+                                           depth, GetTensorData<int8>(input),
+                                           GetTensorData<int8>(output));
   } else {
     context->ReportError(context, "Output type is %d, requires float.",
                          output->type);
diff --git a/tensorflow/lite/kernels/l2norm_test.cc b/tensorflow/lite/kernels/l2norm_test.cc
index 50108a5a264c3624bbd9c230f50c65f5897480bb..4cd63155b95b7dfa412a77cf85d7eb33b4a68e23 100644
--- a/tensorflow/lite/kernels/l2norm_test.cc
+++ b/tensorflow/lite/kernels/l2norm_test.cc
@@ -55,9 +55,10 @@ class L2NormOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
   int input() const { return input_; }
@@ -100,7 +101,20 @@ TEST(L2NormOpTest, SimpleUint8Test) {
   m.Invoke();
   EXPECT_THAT(m.GetOutput<uint8_t>(),
               ElementsAreArray({58, 166, 173, 205, 83, 134}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
+}
+
+TEST(L2NormOpTest, SimpleInt8Test) {
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_INT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<int8_t>(m.input(), {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({-70, 38, 45, 77, -45, 6}));
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
 }
@@ -121,7 +135,32 @@ TEST(L2NormOpTest, MultipleBatchUint8Test) {
                   58, 166, 173, 205, 83, 134,  // batch 2
                   58, 166, 173, 205, 83, 134,  // batch 3
               }));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 1
+                      -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 2
+                      -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 3
+                  },
+                  0.1)));
+}
+
+TEST(L2NormOpTest, MultipleBatchInt8Test) {
+  L2NormOpModel m({3, 1, 1, 6}, TensorType_INT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<int8_t>(m.input(),
+                                {
+                                    -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 1
+                                    -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 2
+                                    -1.1, 0.6, 0.7, 1.2, -0.7, 0.1,  // batch 3
+                                });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         -70, 38, 45, 77, -45, 6,  // batch 1
+                                         -70, 38, 45, 77, -45, 6,  // batch 2
+                                         -70, 38, 45, 77, -45, 6,  // batch 3
+                                     }));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       -0.55, 0.3, 0.35, 0.6, -0.35, 0.05,  // batch 1
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
deleted file mode 100644
index ce0c21dfcba770b72f144c272d7ab12b2e77e399..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ /dev/null
@@ -1,1324 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// DEPRECATED: Tensorflow Lite has implemented layer norm lstm as builtin Op and
-// the implementation of layer norm lstm as custom Op in this file is
-// deprecated. It is only kept for backward compatibility.
-//
-// Layer Normalization LSTM op that applies normalization by mean and standard
-// deviation to the activation of the LSTM layers. Please see
-// https://arxiv.org/abs/1607.06450 for details.
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace layer_norm_lstm {
-
-// Struct to hold Layer Norm LSTM option data.
-struct OpData {
-  TfLiteFusedActivation activation;
-  float cell_clip;
-  float proj_clip;
-  int scratch_tensor_index;
-};
-
-// Input Tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kInputToInputWeightsTensor = 1;  // Optional
-constexpr int kInputToForgetWeightsTensor = 2;
-constexpr int kInputToCellWeightsTensor = 3;
-constexpr int kInputToOutputWeightsTensor = 4;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
-constexpr int kRecurrentToForgetWeightsTensor = 6;
-constexpr int kRecurrentToCellWeightsTensor = 7;
-constexpr int kRecurrentToOutputWeightsTensor = 8;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kCellToInputWeightsTensor = 9;    // Optional
-constexpr int kCellToForgetWeightsTensor = 10;  // Optional
-constexpr int kCellToOutputWeightsTensor = 11;  // Optional
-
-// Layer norm weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kInputLayerNormWeightsTensor = 12;  // Optional
-constexpr int kForgetLayerNormWeightsTensor = 13;
-constexpr int kCellLayerNormWeightsTensor = 14;
-constexpr int kOutputLayerNormWeightsTensor = 15;
-
-// Gates bias tensors of size {n_cell}
-constexpr int kInputGateBiasTensor = 16;  // Optional
-constexpr int kForgetGateBiasTensor = 17;
-constexpr int kCellGateBiasTensor = 18;
-constexpr int kOutputGateBiasTensor = 19;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kProjectionWeightsTensor = 20;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kProjectionBiasTensor = 21;  // Optional
-
-// State tensors.
-constexpr int kInputActivationStateTensor = 22;
-constexpr int kInputCellStateTensor = 23;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-// Total number of scratch tensors for hybrid Op.
-constexpr int kTensorsToAdd = 7;
-
-// Small float to avoid divergence during calculation of deviation.
-const float kLayerNormEpsilon = 1e-8;
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-
-  // Turn custom option data into flexbuffer map format.
-  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
-  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
-
-  // Get activation function, cell_clip and proj_clip from the flexbuffer.
-  // TODO(b/113824099): make activation more generic.
-  assert(m["fused_activation_function"].ToString() == "TANH");
-  data->activation = kTfLiteActTanh;
-  data->cell_clip = m["cell_clip"].AsFloat();
-  data->proj_clip = m["proj_clip"].AsFloat();
-
-  // Populate scratch_tensor_index.
-  context->AddTensors(context, /*tensors_to_add=*/kTensorsToAdd,
-                      &data->scratch_tensor_index);
-  return data;
-}
-
-// Check that input tensor dimensions matches with each other.
-TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
-                                        TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
-  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  // Making sure clipping parameters have valid values.
-  // == 0 means no clipping
-  //  > 0 means clipping
-  TF_LITE_ENSURE(context, op_data->cell_clip >= 0);
-  TF_LITE_ENSURE(context, op_data->proj_clip >= 0);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  if (!use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
-  }
-
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
-                      n_cell);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
-                      n_output);
-  }
-
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
-                    n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  if (cell_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  if (cell_to_forget_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-  if (cell_to_output_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
-  }
-
-  // Making sure the peephole weights are there all or none.
-  const bool peephole_weights_all_or_none =
-      ((cell_to_input_weights != nullptr || use_cifg) &&
-       (cell_to_forget_weights != nullptr) &&
-       (cell_to_output_weights != nullptr)) ||
-      ((cell_to_input_weights == nullptr) &&
-       (cell_to_forget_weights == nullptr) &&
-       (cell_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
-
-  // Making sure layer norm weights are not null and have the right dimension.
-  const TfLiteTensor* input_layer_norm_weights =
-      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights, nullptr);
-  } else {
-    TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_layer_norm_weights =
-      GetInput(context, node, kForgetLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, forget_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_layer_norm_weights =
-      GetInput(context, node, kCellLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, cell_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_layer_norm_weights =
-      GetInput(context, node, kOutputLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, output_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->data[0], n_cell);
-
-  // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
-  }
-
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
-  }
-
-  // Making sure the projection tensors are consistent:
-  // 1) If projection weight is not present, then projection bias should not be
-  // present.
-  // 2) If projection weight is present, then projection bias is optional.
-  const bool projection_tensors_consistent =
-      ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
-
-  return kTfLiteOk;
-}
-
-// Resize the output, state tensors based on the sizes of the input tensors.
-// Allocate a temporary scratch tensor. Also check that the sizes of the input
-// tensors match each other.
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 24);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  // Inferring batch size, number of outputs and number of cells from the
-  // input tensors.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE(context, input->dims->size > 1);
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-  const int n_cell = input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
-                    n_cell);
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
-                                                        n_output, n_cell));
-
-  // Get the pointer to output, activation_state and cell_state tensors.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const TfLiteTensor* activation_state =
-      GetInput(context, node, kInputActivationStateTensor);
-  const TfLiteTensor* cell_state =
-      GetInput(context, node, kInputCellStateTensor);
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
-  // Resize the output tensors.
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
-  output_size->data[0] = n_batch;
-  output_size->data[1] = n_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size));
-
-  // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
-                             input->type == kTfLiteFloat32);
-
-  TfLiteIntArrayFree(node->temporaries);
-  if (is_hybrid_op) {
-    node->temporaries = TfLiteIntArrayCreate(7);
-  } else {
-    node->temporaries = TfLiteIntArrayCreate(1);
-  }
-  node->temporaries->data[0] = op_data->scratch_tensor_index;
-
-  // Create a scratch buffer tensor.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-  scratch_buffer->type = input->type;
-  scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-  scratch_buffer_size->data[0] = n_batch;
-  if (use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                   scratch_buffer_size));
-
-  if (is_hybrid_op) {
-    // Allocate temporary tensors to store quantized values of input,
-    // activation_state and cell_state tensors.
-    node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
-    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-    node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
-    TfLiteTensor* activation_state_quantized =
-        GetTemporary(context, node, /*index=*/2);
-    activation_state_quantized->type = kTfLiteUInt8;
-    activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
-                             activation_state->dims)) {
-      TfLiteIntArray* activation_state_quantized_size =
-          TfLiteIntArrayCopy(activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, activation_state_quantized,
-                                         activation_state_quantized_size));
-    }
-    node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
-    TfLiteTensor* cell_state_quantized =
-        GetTemporary(context, node, /*index=*/3);
-    cell_state_quantized->type = kTfLiteUInt8;
-    cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
-      TfLiteIntArray* cell_state_quantized_size =
-          TfLiteIntArrayCopy(cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, cell_state_quantized,
-                                              cell_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors and product scaling
-    // factors. The latter is a convenience storage which allows to quantize
-    // a vector once (which produces the scaling factors) and multiply it with
-    // different matrices (which requires multiplying the scaling factors with
-    // the scaling factor of the matrix).
-    node->temporaries->data[4] = op_data->scratch_tensor_index + 4;
-    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    int scaling_dims[1] = {n_batch};
-    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
-      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-      scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-    node->temporaries->data[5] = op_data->scratch_tensor_index + 5;
-    TfLiteTensor* prod_scaling_factors =
-        GetTemporary(context, node, /*index=*/5);
-    prod_scaling_factors->type = kTfLiteFloat32;
-    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
-                                   scaling_dims)) {
-      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-      prod_scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, prod_scaling_factors,
-                                              prod_scaling_factors_size));
-    }
-
-    // Allocate a temporary tensor to store the recovered weights. Since
-    // this is used for diagonal matrices, only need to store n_cell values.
-    node->temporaries->data[6] = op_data->scratch_tensor_index + 6;
-    TfLiteTensor* recovered_weights = GetTemporary(context, node, /*index=*/6);
-    recovered_weights->type = kTfLiteFloat32;
-    recovered_weights->allocation_type = kTfLiteArenaRw;
-    int recovered_dims[1] = {n_cell};
-    if (!TfLiteIntArrayEqualsArray(recovered_weights->dims, 1,
-                                   recovered_dims)) {
-      TfLiteIntArray* recovered_weights_size = TfLiteIntArrayCreate(1);
-      recovered_weights_size->data[0] = n_cell;
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, recovered_weights,
-                                              recovered_weights_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-void LayerNormLstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr,
-    const float* input_layer_norm_weight_ptr,
-    const float* forget_layer_norm_weight_ptr,
-    const float* cell_layer_norm_weight_ptr,
-    const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, float cell_clip, float proj_clip,
-    const TfLiteFusedActivation& activation, int n_batch, int n_cell,
-    int n_input, int n_output, float* output_state_ptr, float* cell_state_ptr,
-    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
-    float* output_gate_scratch, float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-
-  // Initialize scratch buffers with 0.
-  if (!use_cifg) {
-    tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
-  }
-  tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
-
-  // For each batch and cell: compute input_weight * input.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-        input_gate_scratch, /*result_stride=*/1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      output_gate_scratch, /*result_stride=*/1);
-
-  // For each batch and cell: compute recurrent_weight * output_state.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
-        n_batch, input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, forget_gate_scratch,
-      /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, output_gate_scratch,
-      /*result_stride=*/1);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::MeanStddevNormalization(input_gate_scratch,
-                                          input_gate_scratch, n_cell, n_batch,
-                                          kLayerNormEpsilon);
-    tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr,
-                                                n_cell, input_gate_scratch,
-                                                n_batch, input_gate_scratch);
-    tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
-                                       input_gate_scratch);
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(forget_gate_scratch,
-                                        forget_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr,
-                                              n_cell, forget_gate_scratch,
-                                              n_batch, forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
-                                     forget_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
-                                        n_batch, kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(
-      cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch);
-  tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
-                                     cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip,
-                             cell_state_ptr);
-  }
-
-  // For each batch and cell: update the output gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(output_gate_scratch,
-                                        output_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr,
-                                              n_cell, output_gate_scratch,
-                                              n_batch, output_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
-                                     output_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_ptr_batch, /*result_stride=*/1);
-    if (proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip,
-                               output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-void LayerNormLstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale,
-    const float* input_layer_norm_weight_ptr,
-    const float* forget_layer_norm_weight_ptr,
-    const float* cell_layer_norm_weight_ptr,
-    const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    int n_batch, int n_cell, int n_input, int n_output,
-    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
-    float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-
-  // Initialize scratch buffers with 0.
-  if (!use_cifg) {
-    tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
-  }
-  tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors[b]);
-    }
-    // For each batch and cell: compute input_weight * input.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * input_to_input_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          input_to_input_weights_ptr, n_cell, n_input,
-          quantized_input_ptr_batch, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, forget_gate_scratch,
-        /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, output_gate_scratch,
-        /*result_stride=*/1);
-  }
-
-  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_output;
-      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
-                                            quantized_output_state_ptr + offset,
-                                            &unused_min, &unused_max,
-                                            &scaling_factors[b]);
-    }
-    // For each batch and cell: compute recurrent_weight * output_state.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * recurrent_to_input_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          recurrent_to_input_weights_ptr, n_cell, n_output,
-          quantized_output_state_ptr, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_forget_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        forget_gate_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_cell_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_output_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        output_gate_scratch, /*result_stride=*/1);
-  }
-
-  // Save quantization and matmul computation for all zero input.
-  bool is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole && !is_cell_state_all_zeros) {
-      tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
-                                         cell_to_input_weights_scale,
-                                         recovered_weights);
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          recovered_weights, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::MeanStddevNormalization(input_gate_scratch,
-                                          input_gate_scratch, n_cell, n_batch,
-                                          kLayerNormEpsilon);
-    tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr,
-                                                n_cell, input_gate_scratch,
-                                                n_batch, input_gate_scratch);
-    tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
-                                       input_gate_scratch);
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
-                                       cell_to_forget_weights_scale,
-                                       recovered_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_weights, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(forget_gate_scratch,
-                                        forget_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr,
-                                              n_cell, forget_gate_scratch,
-                                              n_batch, forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
-                                     forget_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
-                                        n_batch, kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(
-      cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch);
-  tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
-                                     cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip,
-                             cell_state_ptr);
-  }
-
-  is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-  // For each batch and cell: update the output gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
-                                       cell_to_output_weights_scale,
-                                       recovered_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_weights, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(output_gate_scratch,
-                                        output_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr,
-                                              n_cell, output_gate_scratch,
-                                              n_batch, output_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
-                                     output_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
-      // Save quantization and matmul computation for all zero input.
-      float unused_min, unused_max;
-      for (int b = 0; b < n_batch; ++b) {
-        const int offset = b * n_cell;
-        tensor_utils::SymmetricQuantizeFloats(
-            output_gate_scratch + offset, n_cell,
-            quantized_cell_state_ptr + offset, &unused_min, &unused_max,
-            &scaling_factors[b]);
-      }
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * projection_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
-          product_scaling_factors, n_batch, output_ptr_batch,
-          /*result_stride=*/1);
-    }
-    if (proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip,
-                               output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-// The LayerNormLSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_layer_norm_weights,
-    const TfLiteTensor* forget_layer_norm_weights,
-    const TfLiteTensor* cell_layer_norm_weights,
-    const TfLiteTensor* output_layer_norm_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-  const float* input_layer_norm_weight_ptr =
-      (input_layer_norm_weights == nullptr) ? nullptr
-                                            : input_layer_norm_weights->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
-  const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
-  const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  LayerNormLstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_layer_norm_weight_ptr,
-      forget_layer_norm_weight_ptr, cell_layer_norm_weight_ptr,
-      output_layer_norm_weight_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell,
-      n_input, n_output, activation_state_ptr, cell_state_ptr,
-      input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_layer_norm_weights,
-    const TfLiteTensor* forget_layer_norm_weights,
-    const TfLiteTensor* cell_layer_norm_weights,
-    const TfLiteTensor* output_layer_norm_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
-    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_weights,
-    TfLiteTensor* input_quantized, TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-  const float* input_layer_norm_weight_ptr =
-      (input_layer_norm_weights == nullptr) ? nullptr
-                                            : input_layer_norm_weights->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
-  const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
-  const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_weights_ptr = recovered_weights->data.f;
-
-  LayerNormLstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_layer_norm_weight_ptr, forget_layer_norm_weight_ptr,
-      cell_layer_norm_weight_ptr, output_layer_norm_weight_ptr,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell,
-      n_input, n_output, input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
-      recovered_weights_ptr, quantized_input_ptr,
-      quantized_activation_state_ptr, quantized_cell_state_ptr,
-      activation_state_ptr, cell_state_ptr, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-
-  const TfLiteTensor* input_layer_norm_weights =
-      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
-  const TfLiteTensor* forget_layer_norm_weights =
-      GetInput(context, node, kForgetLayerNormWeightsTensor);
-  const TfLiteTensor* cell_layer_norm_weights =
-      GetInput(context, node, kCellLayerNormWeightsTensor);
-  const TfLiteTensor* output_layer_norm_weights =
-      GetInput(context, node, kOutputLayerNormWeightsTensor);
-
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-
-  TfLiteTensor* activation_state =
-      &context->tensors[node->inputs->data[kInputActivationStateTensor]];
-  TfLiteTensor* cell_state =
-      &context->tensors[node->inputs->data[kInputCellStateTensor]];
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (input_to_output_weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_layer_norm_weights,
-                       forget_layer_norm_weights, cell_layer_norm_weights,
-                       output_layer_norm_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, op_data->cell_clip,
-                       op_data->proj_clip, op_data->activation, scratch_buffer,
-                       activation_state, cell_state, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-      TfLiteTensor* activation_state_quantized =
-          GetTemporary(context, node, /*index=*/2);
-      TfLiteTensor* cell_state_quantized =
-          GetTemporary(context, node, /*index=*/3);
-      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-      TfLiteTensor* prod_scaling_factors =
-          GetTemporary(context, node, /*index=*/5);
-      TfLiteTensor* recovered_weights =
-          GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
-          input, input_to_input_weights, input_to_forget_weights,
-          input_to_cell_weights, input_to_output_weights,
-          recurrent_to_input_weights, recurrent_to_forget_weights,
-          recurrent_to_cell_weights, recurrent_to_output_weights,
-          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_layer_norm_weights, forget_layer_norm_weights,
-          cell_layer_norm_weights, output_layer_norm_weights, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, op_data->cell_clip, op_data->proj_clip,
-          op_data->activation, scratch_buffer, scaling_factors,
-          prod_scaling_factors, recovered_weights, input_quantized,
-          activation_state_quantized, cell_state_quantized, activation_state,
-          cell_state, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           input_to_output_weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-}  // namespace layer_norm_lstm
-
-TfLiteRegistration* Register_LAYER_NORM_LSTM() {
-  static TfLiteRegistration r = {layer_norm_lstm::Init, layer_norm_lstm::Free,
-                                 layer_norm_lstm::Prepare,
-                                 layer_norm_lstm::Eval};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
deleted file mode 100644
index 5aed818f2407a96acb8893654971fc5bb91a81ed..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ /dev/null
@@ -1,885 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite Layer Norm LSTM op.
-
-#include <memory>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class LayerNormLSTMOpModel : public SingleOpModel {
- public:
-  LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                       bool use_cifg, bool use_peephole,
-                       bool use_projection_weights, bool use_projection_bias,
-                       float cell_clip, float proj_clip,
-                       const std::vector<std::vector<int>>& input_shapes,
-                       const TensorType& weight_type = TensorType_FLOAT32)
-      : n_batch_(n_batch),
-        n_input_(n_input),
-        n_cell_(n_cell),
-        n_output_(n_output) {
-    input_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_to_input_weights_ = AddNullInput();
-    } else {
-      input_to_input_weights_ = AddInput(weight_type);
-    }
-
-    input_to_forget_weights_ = AddInput(weight_type);
-    input_to_cell_weights_ = AddInput(weight_type);
-    input_to_output_weights_ = AddInput(weight_type);
-
-    if (use_cifg) {
-      recurrent_to_input_weights_ = AddNullInput();
-    } else {
-      recurrent_to_input_weights_ = AddInput(weight_type);
-    }
-
-    recurrent_to_forget_weights_ = AddInput(weight_type);
-    recurrent_to_cell_weights_ = AddInput(weight_type);
-    recurrent_to_output_weights_ = AddInput(weight_type);
-
-    if (use_peephole) {
-      if (use_cifg) {
-        cell_to_input_weights_ = AddNullInput();
-      } else {
-        cell_to_input_weights_ = AddInput(weight_type);
-      }
-      cell_to_forget_weights_ = AddInput(weight_type);
-      cell_to_output_weights_ = AddInput(weight_type);
-    } else {
-      cell_to_input_weights_ = AddNullInput();
-      cell_to_forget_weights_ = AddNullInput();
-      cell_to_output_weights_ = AddNullInput();
-    }
-
-    if (use_cifg) {
-      input_layer_norm_weights_ = AddNullInput();
-    } else {
-      input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    }
-    forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_gate_bias_ = AddNullInput();
-    } else {
-      input_gate_bias_ = AddInput(TensorType_FLOAT32);
-    }
-    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
-    output_gate_bias_ = AddInput(TensorType_FLOAT32);
-
-    if (use_projection_weights) {
-      projection_weights_ = AddInput(weight_type);
-      if (use_projection_bias) {
-        projection_bias_ = AddInput(TensorType_FLOAT32);
-      } else {
-        projection_bias_ = AddNullInput();
-      }
-    } else {
-      projection_weights_ = AddNullInput();
-      projection_bias_ = AddNullInput();
-    }
-
-    // Adding the 2 state tensors.
-    output_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true);
-    cell_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
-
-    output_ = AddOutput(TensorType_FLOAT32);
-
-    // Set up and pass in custom options using flexbuffer.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {
-      fbb.Int("cell_clip", cell_clip);
-      fbb.Int("proj_clip", proj_clip);
-      fbb.String("fused_activation_function", "TANH");
-    });
-    fbb.Finish();
-    SetCustomOp("LAYER_NORM_LSTM", fbb.GetBuffer(), Register_LAYER_NORM_LSTM);
-    BuildInterpreter(input_shapes);
-  }
-
-  void SetInputToInputWeights(const std::vector<float>& f) {
-    PopulateTensor(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(const std::vector<float>& f) {
-    PopulateTensor(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(const std::vector<float>& f) {
-    PopulateTensor(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(const std::vector<float>& f) {
-    PopulateTensor(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(const std::vector<float>& f) {
-    PopulateTensor(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
-    PopulateTensor(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(const std::vector<float>& f) {
-    PopulateTensor(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
-    PopulateTensor(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(const std::vector<float>& f) {
-    PopulateTensor(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(const std::vector<float>& f) {
-    PopulateTensor(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(const std::vector<float>& f) {
-    PopulateTensor(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetInputGateBias(const std::vector<float>& f) {
-    PopulateTensor(input_gate_bias_, f);
-  }
-
-  void SetForgetGateBias(const std::vector<float>& f) {
-    PopulateTensor(forget_gate_bias_, f);
-  }
-
-  void SetCellBias(const std::vector<float>& f) {
-    PopulateTensor(cell_bias_, f);
-  }
-
-  void SetOutputGateBias(const std::vector<float>& f) {
-    PopulateTensor(output_gate_bias_, f);
-  }
-
-  void SetProjectionWeights(const std::vector<float>& f) {
-    PopulateTensor(projection_weights_, f);
-  }
-
-  void SetProjectionBias(const std::vector<float>& f) {
-    PopulateTensor(projection_bias_, f);
-  }
-
-  void SetInput(int offset, const float* begin, const float* end) {
-    PopulateTensor(input_, offset, const_cast<float*>(begin),
-                   const_cast<float*>(end));
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
-  int num_inputs() { return n_input_; }
-  int num_outputs() { return n_output_; }
-  int num_cells() { return n_cell_; }
-  int num_batches() { return n_batch_; }
-
- protected:
-  int input_;
-  int input_to_input_weights_;
-  int input_to_forget_weights_;
-  int input_to_cell_weights_;
-  int input_to_output_weights_;
-
-  int recurrent_to_input_weights_;
-  int recurrent_to_forget_weights_;
-  int recurrent_to_cell_weights_;
-  int recurrent_to_output_weights_;
-
-  int cell_to_input_weights_;
-  int cell_to_forget_weights_;
-  int cell_to_output_weights_;
-
-  int input_layer_norm_weights_;
-  int forget_layer_norm_weights_;
-  int cell_layer_norm_weights_;
-  int output_layer_norm_weights_;
-
-  int input_gate_bias_;
-  int forget_gate_bias_;
-  int cell_bias_;
-  int output_gate_bias_;
-
-  int projection_weights_;
-  int projection_bias_;
-
-  int output_state_;
-  int cell_state_;
-
-  int output_;
-
-  int n_batch_;
-  int n_input_;
-  int n_cell_;
-  int n_output_;
-};
-
-class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
- public:
-  HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                             bool use_cifg, bool use_peephole,
-                             bool use_projection_weights,
-                             bool use_projection_bias, float cell_clip,
-                             float proj_clip,
-                             const std::vector<std::vector<int>>& input_shapes)
-      : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
-                             use_peephole, use_projection_weights,
-                             use_projection_bias, cell_clip, proj_clip,
-                             input_shapes, TensorType_UINT8) {}
-
-  void SetInputToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(const std::vector<float>& f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetProjectionWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
-  }
-};
-
-class BaseLayerNormLstmTest : public ::testing::Test {
- protected:
-  // Weights of the Layer Norm LSTM model. Some are optional.
-  std::vector<float> input_to_input_weights_;
-  std::vector<float> input_to_cell_weights_;
-  std::vector<float> input_to_forget_weights_;
-  std::vector<float> input_to_output_weights_;
-  std::vector<float> input_gate_bias_;
-  std::vector<float> cell_gate_bias_;
-  std::vector<float> forget_gate_bias_;
-  std::vector<float> output_gate_bias_;
-  std::vector<float> recurrent_to_input_weights_;
-  std::vector<float> recurrent_to_cell_weights_;
-  std::vector<float> recurrent_to_forget_weights_;
-  std::vector<float> recurrent_to_output_weights_;
-  std::vector<float> cell_to_input_weights_;
-  std::vector<float> cell_to_forget_weights_;
-  std::vector<float> cell_to_output_weights_;
-  std::vector<float> input_layer_norm_weights_;
-  std::vector<float> forget_layer_norm_weights_;
-  std::vector<float> cell_layer_norm_weights_;
-  std::vector<float> output_layer_norm_weights_;
-  std::vector<float> projection_weights_;
-
-  // Layer Norm LSTM input is stored as num_batch x num_inputs vector.
-  std::vector<std::vector<float>> layer_norm_lstm_input_;
-
-  // Compares output up to tolerance to the result of the layer_norm_lstm given
-  // the input.
-  void VerifyGoldens(const std::vector<std::vector<float>>& input,
-                     const std::vector<std::vector<float>>& output,
-                     LayerNormLSTMOpModel* layer_norm_lstm,
-                     float tolerance = 1e-5) {
-    const int num_batches = input.size();
-    EXPECT_GT(num_batches, 0);
-    const int num_inputs = layer_norm_lstm->num_inputs();
-    EXPECT_GT(num_inputs, 0);
-    const int input_sequence_size = input[0].size() / num_inputs;
-    EXPECT_GT(input_sequence_size, 0);
-    for (int i = 0; i < input_sequence_size; ++i) {
-      for (int b = 0; b < num_batches; ++b) {
-        const float* batch_start = input[b].data() + i * num_inputs;
-        const float* batch_end = batch_start + num_inputs;
-
-        layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(),
-                                  batch_start, batch_end);
-      }
-
-      layer_norm_lstm->Invoke();
-
-      const int num_outputs = layer_norm_lstm->num_outputs();
-      std::vector<float> expected;
-      for (int b = 0; b < num_batches; ++b) {
-        const float* golden_start_batch = output[b].data() + i * num_outputs;
-        const float* golden_end_batch = golden_start_batch + num_outputs;
-        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
-      }
-      EXPECT_THAT(layer_norm_lstm->GetOutput(),
-                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
-    }
-  }
-};
-
-class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest
-    : public BaseLayerNormLstmTest {
-  void SetUp() override {
-    input_to_input_weights_ = {0.5,  0.6,  0.7,  -0.8, -0.9, 0.1,  0.2,
-                               0.3,  -0.4, 0.5,  -0.8, 0.7,  -0.6, 0.5,
-                               -0.4, -0.5, -0.4, -0.3, -0.2, -0.1};
-
-    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
-                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
-                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
-
-    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
-                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
-                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
-
-    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
-                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
-                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
-
-    input_gate_bias_ = {0.03, 0.15, 0.22, 0.38};
-
-    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
-
-    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
-
-    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
-
-    recurrent_to_input_weights_ = {-0.2, -0.3, 0.4,  0.1,  -0.5, 0.9,
-                                   -0.2, -0.3, -0.7, 0.05, -0.2, -0.6};
-
-    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
-                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
-
-    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
-                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
-
-    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
-                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
-
-    cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15};
-
-    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
-
-    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
-
-    input_layer_norm_weights_ = {0.1, 0.2, 0.3, 0.5};
-    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
-    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
-    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
-
-    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
-                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
-
-    layer_norm_lstm_input_ = {
-        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
-         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
-         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
-         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
-
-        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
-         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
-         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
-         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
-    };
-  }
-};
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       LayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  LayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244077, 0.128027, -0.00170918,  // seq 0
-          0.0137642, 0.140751, 0.0395835,    // seq 1
-          -0.00459231, 0.155278, 0.0837377,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00692428, 0.0848741, 0.063445,  // seq 0
-          -0.00403912, 0.139963, 0.072681,   // seq 1
-          0.00752706, 0.161903, 0.0561371,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  HybridLayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244576, 0.127847, -0.00181765,  // seq 0
-          0.0137518, 0.140892, 0.0402234,    // seq 1
-          -0.0048839, 0.155096, 0.0840309,   // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00728636, 0.0843957, 0.0634786,  // seq 0
-          -0.00448382, 0.139278, 0.0737372,   // seq 1
-          0.00734616, 0.161793, 0.0560238,    // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-class CifgPeepholeProjectionNoClippingLayerNormLstmTest
-    : public BaseLayerNormLstmTest {
-  void SetUp() override {
-    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
-                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
-                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
-    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
-                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
-                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
-    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
-                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
-                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
-
-    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
-    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
-    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
-
-    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
-                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
-    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
-                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
-    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
-                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
-
-    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
-    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
-
-    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
-    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
-    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
-    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
-                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
-
-    layer_norm_lstm_input_ = {
-        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
-         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
-         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
-         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
-
-        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
-         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
-         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
-         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
-    };
-  }
-};
-
-TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       LayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  LayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/true, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {0, 0},             // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {0, 0},              // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {0},       // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {0},       // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {0},       // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.02129706, 0.140816242, 0.0112733059,     // seq 0
-          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
-          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
-          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
-          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  HybridLayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/true, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {0, 0},             // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {0, 0},              // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {0},       // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {0},       // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {0},       // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
-          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
-          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
-          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
-          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index 470c74d207d51688c3c48de0fc8bdecda43097a7..ea22ed56941cd4ca4e625e6a9e6be5f74f69c996 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -840,6 +840,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
       return full::Init(context, buffer, length);
     case kTfLiteLSTMBasicKernel:
       return basic::Init(context, buffer, length);
+    default:
+      return nullptr;
   }
 }
 void Free(TfLiteContext* context, void* buffer) {
@@ -855,6 +857,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       return full::Prepare(context, node);
     case kTfLiteLSTMBasicKernel:
       return basic::Prepare(context, node);
+    default:
+      return kTfLiteError;
   }
 }
 
@@ -865,6 +869,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return full::Eval(context, node);
     case kTfLiteLSTMBasicKernel:
       return basic::Eval(context, node);
+    default:
+      return kTfLiteError;
   }
 }
 
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 244cfae4a20b93b32022bee412f241397df53c49..045f41bd91eccc24c54726d67d295b85c90185ec 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -119,7 +119,7 @@ inline void LstmStepWithAuxInput(
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch) {
   // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
+  // check the existence of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
   const bool is_layer_norm_lstm =
@@ -473,7 +473,7 @@ inline void LstmStepWithAuxInput(
     int8_t* quantized_cell_state_ptr, float* output_state_ptr,
     float* cell_state_ptr, float* output_ptr_batch) {
   // Since we have already checked that weights are all there or none, we
-  // can check the existense of only one to the get the condition.
+  // can check the existence of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
   const bool is_layer_norm_lstm =
@@ -913,7 +913,7 @@ TfLiteStatus EvalFloat(
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
+  // check the existence of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
   const bool is_layer_norm_lstm = (forget_layer_norm_coefficients != nullptr);
diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc
index 3bcaabf675eba4f528fe73b01610d915e7780f85..6ae836bc3fba78f0a1941fe6f9df83fbd5e474c2 100644
--- a/tensorflow/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/kernels/maximum_minimum.cc
@@ -26,7 +26,7 @@ namespace ops {
 namespace builtin {
 namespace maximum_minimum {
 
-// This file has a reference implemenation of TFMaximum/TFMinimum.
+// This file has a reference implementation of TFMaximum/TFMinimum.
 enum KernelType {
   kReference,
 };
@@ -108,6 +108,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       case kTfLiteUInt8:
         TFLiteOperation<uint8_t, OpType>(context, node, op_context);
         break;
+      case kTfLiteInt8:
+        TFLiteOperation<int8_t, OpType>(context, node, op_context);
+        break;
       case kTfLiteInt32:
        TFLiteOperation<int32_t, OpType>(context, node, op_context);
         break;
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index acb74e09d3fb47c33c6c146af4d0b1b1030491be..6567c8f3611204af3bdeecbdb11a07f6f16be908 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -112,6 +112,17 @@ TEST(MaxMinOpTest, Uint8Test) {
                      {0, 0, 1, 11, 2, 1});
 }
 
+TEST(MaxMinOpTest, Int8Test) {
+  std::initializer_list<int8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<int8_t> data2 = {0, 0, 1, 12, 123, 1};
+  TestModel<int8_t>(BuiltinOperator_MAXIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {1, 0, 2, 12, 123, 23});
+  TestModel<int8_t>(BuiltinOperator_MINIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {0, 0, 1, 11, 2, 1});
+}
+
 TEST(MaximumOpTest, FloatWithBroadcastTest) {
   std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
   std::initializer_list<float> data2 = {0.5, 2.0};
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
index e74e47f7a37b0f449fb2a63237e95066bb452de6..f54d925d777380b154a0041452872778ba314999 100644
--- a/tensorflow/lite/kernels/mirror_pad.cc
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <vector>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
@@ -21,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
@@ -30,118 +32,37 @@ namespace builtin {
 namespace mirror_pad {
 namespace {
 
-// Simple class that represents a mirror padded tensor - which is the output
-// from the Op.
-struct PaddedTensor {
-  // If not null that means this is a scalar value.
-  // Note: This is not owned by default. It will point to the value
-  // in the input tensor.
-  const void* value = nullptr;
-  // If this tensor is not one value, then this vector will have
-  // all the tensors that belongs to this tensor.
-  // Pointers are owned.
-  std::vector<std::unique_ptr<PaddedTensor>> values;
-  // Pointers to PaddedTensors that are padded on the left of the current
-  // tensor.
-  std::vector<PaddedTensor*> left_pad_ptrs;
-  // Pointers to PaddedTensors that are padded on the right of the current
-  // tensor.
-  std::vector<PaddedTensor*> right_pad_ptrs;
-
-  // Returns mutable pointer to the tensor identified by 'indices'.
-  PaddedTensor* GetMutable(const std::vector<int>& indices) {
-    auto* result = this;
-    for (int i = 0; i < indices.size(); ++i) {
-      if (indices[i] >= result->values.size()) {
-        return nullptr;
-      }
-      result = result->values[indices[i]].get();
-      if (result == nullptr) break;
-    }
-    return result;
-  }
-};
-
-// Util method to initialize the memory of the padded tensor.
-void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
-                            int dims_size, PaddedTensor* padded_tensor) {
-  if (dim_index >= dims_size) {
-    return;
-  }
-  padded_tensor->values.reserve(dims->data[dim_index]);
-  for (int i = 0; i < dims->data[dim_index]; ++i) {
-    padded_tensor->values.emplace_back(new PaddedTensor());
-    InitializeTensorMemory(dims, dim_index + 1, dims_size,
-                           padded_tensor->values.back().get());
-  }
-}
-
-// Returns pointer to the value at the specified index in 'data'.
-inline const void* GetValuePointerAtIndex(const void* data, int index,
-                                          const TfLiteType data_type) {
-  switch (data_type) {
-    case kTfLiteFloat32:
-      return static_cast<const float*>(data) + index;
-    case kTfLiteInt32:
-      return static_cast<const int32_t*>(data) + index;
-    case kTfLiteUInt8:
-      return static_cast<const uint8_t*>(data) + index;
-    case kTfLiteInt64:
-      return static_cast<const int64_t*>(data) + index;
-    case kTfLiteBool:
-      return static_cast<const bool*>(data) + index;
-    case kTfLiteInt16:
-      return static_cast<const int16_t*>(data) + index;
-    case kTfLiteInt8:
-      return static_cast<const int8_t*>(data) + index;
-    // Unsupported types ?
-    default:
-      return nullptr;
-  }
-  return nullptr;
-}
+// Nil value for paddingMode/offset.
+const int kUnsetOffset = -1;
 
-// Util method that increment index in the N-d array.
-void IncrementTensorIndex(const TfLiteIntArray* dims,
-                          std::vector<int>* tensor_index_ptr) {
-  int dimension_index = dims->size - 1;
-  auto& tensor_index = *tensor_index_ptr;
-  tensor_index[dimension_index]++;
-  while (dimension_index >= 0 &&
-         tensor_index[dimension_index] == dims->data[dimension_index]) {
-    tensor_index[dimension_index] = 0;
-    dimension_index--;
-    if (dimension_index >= 0) tensor_index[dimension_index]++;
-  }
-}
-
-// Fills the 'padded_tensor' with data from 'input_tensor'.
-TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
-                                 PaddedTensor* padded_tensor) {
-  const auto* dims = input_tensor->dims;
-  const auto data_type = input_tensor->type;
-  const void* data = static_cast<const void*>(input_tensor->data.raw_const);
-  // Either invalid input or unsupported type.+
-  if (data == nullptr) {
-    return kTfLiteError;
-  }
-  // Index of current processing tensor.
-  std::vector<int> tensor_index(dims->size, 0);
-  int flat_index = 0;
-  const int num_elements = NumElements(input_tensor);
-  while (flat_index < num_elements) {
-    auto* tensor = padded_tensor->GetMutable(tensor_index);
-    if (tensor == nullptr) {
-      return kTfLiteError;
-    }
-    tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
-    IncrementTensorIndex(dims, &tensor_index);
-    ++flat_index;
-  }
+// Wrapper for data used by the op.
+struct OpData {
+  // Holds computed value (memoized value) of an internal fill state of a
+  // subarray.
+  // State is (Dimension to fill, index in tensor as flattened array)
+  // The value is start and end in the output array which has the padded result.
+  std::vector<std::pair<int, int>> cache;
+};
 
-  return kTfLiteOk;
-}
+// Wrapper for params passed to the Eval<T> function.
+template <typename T>
+struct EvalData {
+  OpData* op_data = nullptr;
+  const TfLiteTensor* padding_matrix = nullptr;
+  const TfLiteIntArray* input_dims = nullptr;
+  // Holds number of elements at the nth dimension.
+  // value at last dimension = 1, at second to last = sizeof last dimension.
+  const std::vector<int>* dimension_num_elements = nullptr;
+  const T* input_data = nullptr;
+
+  int offset = kUnsetOffset;
+  T* output_data = nullptr;
+  int input_size = 0;
+  int output_size = 0;
+  int num_dims = 0;
+};
 
+// Helper method that fills the left and right pads.
 template <typename T>
 inline void GetPadding(const T* data, int offset, int64_t* left_pad,
                        int64_t* right_pad) {
@@ -149,9 +70,8 @@ inline void GetPadding(const T* data, int offset, int64_t* left_pad,
   *right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
 }
 
-inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix,
-                               int dimension, int64_t* left_pad,
-                               int64_t* right_pad) {
+inline void GetPadding(const TfLiteTensor* padding_matrix, int dimension,
+                       int64_t* left_pad, int64_t* right_pad) {
   switch (padding_matrix->type) {
     case kTfLiteInt32:
       GetPadding(padding_matrix->data.i32, dimension, left_pad, right_pad);
@@ -160,93 +80,59 @@ inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix,
       GetPadding(padding_matrix->data.i64, dimension, left_pad, right_pad);
       break;
     default:
-      return kTfLiteError;
+      return;
   }
-  return kTfLiteOk;
 }
 
-TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset,
-                            int dimension_index, PaddedTensor* padded_tensor,
-                            TfLiteContext* context) {
-  if (dimension_index >= padding_matrix->dims->data[0]) {
-    return kTfLiteOk;
-  }
-
-  int64_t left_pad = 0, right_pad = 0;
-  TF_LITE_ENSURE_STATUS(
-      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
-  // If we are not going to include border we must have enough values
-  // to use.
-  if (left_pad + offset > padded_tensor->values.size()) {
-    context->ReportError(
-        context, "Not enough values for Mirror Pad, required %d, available %d.",
-        left_pad + offset, padded_tensor->values.size());
-    return kTfLiteError;
-  }
-  if (right_pad + offset > padded_tensor->values.size()) {
-    context->ReportError(
-        context, "Not enough values for Mirror Pad, required %d, available %d.",
-        right_pad + offset, padded_tensor->values.size());
-    return kTfLiteError;
+template <typename T>
+int Eval(EvalData<T>* eval_data, int current_dim, int flat_index,
+         int output_index) {
+  if (current_dim == eval_data->num_dims) {
+    // Base case if we finished evaluating.
+    if (output_index >= eval_data->output_size) {
+      return output_index;
+    }
+    eval_data->output_data[output_index] = eval_data->input_data[flat_index];
+    return output_index + 1;
   }
-  if (!padded_tensor->values.empty()) {
-    ValidateTensor(padding_matrix, offset, dimension_index + 1,
-                   padded_tensor->values[0].get(), context);
+  // Check if the value is computed already.
+  const int cache_index = current_dim * eval_data->input_size + flat_index;
+  auto& cache_entry = eval_data->op_data->cache[cache_index];
+  if (cache_entry.first != -1) {
+    // Cache value is (start, end) interval. We can just copy the interval
+    // directly.
+    const int count = cache_entry.second - cache_entry.first;
+    memcpy(eval_data->output_data + output_index,
+           eval_data->output_data + cache_entry.first, count * sizeof(T));
+    return output_index + count;
   }
-  return kTfLiteOk;
-}
-
-// Fills 'padded_tensor' with the padding information based on
-// 'padding_matrix'.
-// 'dimension_index' represents which dimension the function is operating on.
-TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset,
-                       int dimension_index, PaddedTensor* padded_tensor,
-                       TfLiteContext* context) {
-  if (dimension_index >= padding_matrix->dims->data[0]) return kTfLiteOk;
-
+  cache_entry.first = output_index;
   int64_t left_pad = 0, right_pad = 0;
-  TF_LITE_ENSURE_STATUS(
-      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+  const int multiplier = (*eval_data->dimension_num_elements)[current_dim];
+  const TfLiteTensor* padding_matrix = eval_data->padding_matrix;
+  const auto offset = eval_data->offset;
+  auto* dims = eval_data->input_dims;
 
+  GetPadding(padding_matrix, current_dim, &left_pad, &right_pad);
+  // Left padding
   for (int i = left_pad + offset - 1; i >= offset && left_pad > 0;
        --i, --left_pad) {
-    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get());
+    output_index = Eval(eval_data, current_dim + 1, flat_index + i * multiplier,
+                        output_index);
   }
-  for (int i = padded_tensor->values.size() - (1 + offset);
-       i >= 0 && right_pad > 0; --i, --right_pad) {
-    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get());
+  // Original values.
+  for (int i = 0; i < dims->data[current_dim]; ++i) {
+    output_index = Eval(eval_data, current_dim + 1, flat_index + i * multiplier,
+                        output_index);
   }
-
-  for (auto& tensor : padded_tensor->values) {
-    TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1,
-                                    tensor.get(), context));
+  // Right padding.
+  for (int i = dims->data[current_dim] - (1 + offset); i >= 0 && right_pad > 0;
+       --i, --right_pad) {
+    output_index = Eval(eval_data, current_dim + 1, flat_index + i * multiplier,
+                        output_index);
   }
-  return kTfLiteOk;
-}
-
-// Fills 'output_data' with data from 'padded_tensor'.
-// The function does this recursively by setting left padding first then
-// original data, followed by the right padding.
-template <typename T>
-int FillOutput(const PaddedTensor* padded_tensor, T* output_data,
-               int index_in_output) {
-  if (padded_tensor == nullptr || output_data == nullptr) {
-    return -1;
-  }
-  if (padded_tensor->value != nullptr) {
-    output_data[index_in_output] = *static_cast<const T*>(padded_tensor->value);
-    return index_in_output + 1;
-  }
-  for (const auto* tensor : padded_tensor->left_pad_ptrs) {
-    index_in_output = FillOutput(tensor, output_data, index_in_output);
-  }
-  for (const auto& tensor : padded_tensor->values) {
-    index_in_output = FillOutput(tensor.get(), output_data, index_in_output);
-  }
-  for (const auto* tensor : padded_tensor->right_pad_ptrs) {
-    index_in_output = FillOutput(tensor, output_data, index_in_output);
-  }
-  return index_in_output;
+  cache_entry.second = output_index;
+  return output_index;
 }
 
 // Returns the shape of the final output after padding.
@@ -271,6 +157,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
   auto* params =
       reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   if (params == nullptr) {
     return kTfLiteError;
@@ -287,27 +174,34 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         context->ResizeTensor(context, output_tensor, output_size.release()));
   }
 
-  PaddedTensor padded_tensor;
-  // Initialize memory.
-  InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor);
-  // Set the values from the input_tensor.
-  TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
+  std::vector<int> dimension_num_elements(input_dims, 1);
+  for (int i = input_dims - 2; i >= 0; i--) {
+    dimension_num_elements[i] =
+        dimension_num_elements[i + 1] * input_tensor->dims->data[i + 1];
+  }
+  const int input_size = NumElements(input_tensor);
 
   const int offset =
       params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
                                                                            : 1;
-  // Make sure padding values are sufficient and valid to use.
-  TF_LITE_ENSURE_STATUS(
-      ValidateTensor(padding_matrix, offset, 0, &padded_tensor, context));
-  // Apply padding.
-  TF_LITE_ENSURE_STATUS(
-      PadTensor(padding_matrix, offset, 0, &padded_tensor, context));
-
-  // Fill the output tensor from the padded tensor.
   TfLiteStatus status = kTfLiteOk;
-
-#define TF_LITE_MIRROR_PAD(type) \
-  FillOutput(&padded_tensor, GetTensorData<type>(output_tensor), 0);
+  int output_index = 0;
+  // Reset cache array.
+  std::fill(op_data->cache.begin(), op_data->cache.end(),
+            std::make_pair(-1, -1));
+#define TF_LITE_MIRROR_PAD(type)                              \
+  EvalData<type> eval_data;                                   \
+  eval_data.input_data = GetTensorData<type>(input_tensor);   \
+  eval_data.input_dims = input_tensor->dims;                  \
+  eval_data.input_size = input_size;                          \
+  eval_data.dimension_num_elements = &dimension_num_elements; \
+  eval_data.num_dims = input_dims;                            \
+  eval_data.offset = offset;                                  \
+  eval_data.op_data = op_data;                                \
+  eval_data.output_data = GetTensorData<type>(output_tensor); \
+  eval_data.output_size = NumElements(output_tensor);         \
+  eval_data.padding_matrix = padding_matrix;                  \
+  Eval<type>(&eval_data, 0, 0, output_index);
 
   switch (output_tensor->type) {
     case kTfLiteFloat32: {
@@ -335,20 +229,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  return nullptr;
+  return new OpData;
 }
 
-void Free(TfLiteContext* context, void* buffer) {}
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input_tensor = GetInput(context, node, 0);
   const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
   TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
                     NumDimensions(input_tensor));
 
+  int num_elements = NumElements(input_tensor) * NumDimensions(input_tensor);
+  op_data->cache.resize(num_elements + 1);
+
   if (!IsConstantTensor(padding_matrix)) {
     SetTensorToDynamic(output_tensor);
     return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc
index fd09e6e4493d3a29bffecfcd4a4d1946840a4e5e..91e48fa68aa352d5f23857fe32b3feb9e3e1b73e 100644
--- a/tensorflow/lite/kernels/mirror_pad_test.cc
+++ b/tensorflow/lite/kernels/mirror_pad_test.cc
@@ -185,5 +185,18 @@ TEST(MirrorPadTest, Pad_1D_Symmetric) {
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2}));
 }
 
+TEST(MirrorPadTest, Pad_1D_Symmetric_Multiple_Invoke) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2}));
+  model.PopulateTensor<int>(model.input_tensor_id(), {4, 5, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 5, 6, 6, 5}));
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
index 01039a705438af2a92a68b01c2146daf69c46250..e0ff6724ea2f3ea0fd4693571d6c509f5385a5d4 100644
--- a/tensorflow/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -87,8 +88,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                   &data->output_activation_min,
                                   &data->output_activation_max);
   }
+  if (output->type == kTfLiteInt8) {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &data->output_activation_min,
+                                 &data->output_activation_max);
+  }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      output->type == kTfLiteInt16) {
     double real_multiplier =
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
@@ -151,8 +158,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteMulParams* params, const OpData* data,
                            const TfLiteTensor* input1,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
-  if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
-      output->type == kTfLiteUInt8) {
+  if (input1->type == input2->type && input1->type == output->type &&
+      (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8)) {
     tflite::ArithmeticParams op_params;
     SetActivationParams(data->output_activation_min,
                         data->output_activation_max, &op_params);
@@ -163,23 +170,31 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     op_params.output_shift = data->output_shift;
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_MUL(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-
-    if (kernel_type == kReference) {
+#define TF_LITE_MUL(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output))
+    if (input1->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+        TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
       } else {
-        TF_LITE_MUL(reference_ops, Mul);
+        TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
       }
     } else {
-      if (need_broadcast) {
-        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      // type == kTfLiteUInt8
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
+        } else {
+          TF_LITE_MUL(reference_ops, Mul, uint8_t);
+        }
       } else {
-        TF_LITE_MUL(optimized_ops, Mul);
+        if (need_broadcast) {
+          TF_LITE_MUL(optimized_ops, BroadcastMulFivefold, uint8_t);
+        } else {
+          TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+        }
       }
     }
 #undef TF_LITE_MUL
@@ -198,8 +213,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
-             output->type == kTfLiteUInt8) {
-#define TF_LITE_MUL(type, opname)                                      \
+             (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8)) {
+#define TF_LITE_MUL(type, opname, output_dtype)                        \
   tflite::ArithmeticParams op_params;                                  \
   SetActivationParams(data->output_activation_min,                     \
                       data->output_activation_max, &op_params);        \
@@ -207,11 +222,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, Mul);
+               GetTensorData<output_dtype>(output))
+    if (output->type == kTfLiteInt8) {
+      TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
     } else {
-      TF_LITE_MUL(optimized_ops, Mul);
+      if (kernel_type == kReference) {
+        TF_LITE_MUL(reference_ops, Mul, uint8_t);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+      }
     }
 #undef TF_LITE_MUL
   } else {
@@ -233,14 +252,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalMul<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(
         context, EvalQuantized<kernel_type>(context, node, params, data, input1,
                                             input2, output));
   } else {
     context->ReportError(context,
-                         "Mul only supports FLOAT32, INT32 and quantized UINT8 "
-                         "and INT16 now, got %d.",
+                         "Mul only supports FLOAT32, INT32 and quantized UINT8,"
+                         " INT8 and INT16 now, got %d.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
index 200cc26dadc3527813a7dabd3b9ca4811d4c8856..96f5a8a0e07e730394510f432b3313724e6c9172 100644
--- a/tensorflow/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -73,9 +73,10 @@ class QuantizedMulOpModel : public BaseMulOpModel {
  public:
   using BaseMulOpModel::BaseMulOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -191,19 +192,28 @@ TEST(IntegerMulOpTest, WithBroadcast) {
   }
 }
 
-TEST(QuantizedMulOpTest, NoActivation) {
-  QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {}, -1.0, 1.0},
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivation() {
+  QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
-  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
-  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+  m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+  m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationUInt8) {
+  NoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt8) {
+  NoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedMulOpTest, NoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -219,23 +229,32 @@ TEST(QuantizedMulOpTest, NoActivationInt16) {
                                               kQuantizedToleranceInt16)));
 }
 
-TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivationInt16With8BitOutput() {
   const float kMinInt16 = -1.f;
   const float kMaxInt16 = 32767.f / 32768.f;
   const float kMinUint8 = -1.f;
   const float kMaxUint8 = 127.f / 128.f;
   QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
                         {TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
-                        {TensorType_UINT8, {}, kMinUint8, kMaxUint8},
+                        {tensor_type, {}, kMinUint8, kMaxUint8},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
   m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt16Withint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_INT8, int8_t>();
+}
+
 // for quantized Mul, the error shouldn't exceed 2*step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
@@ -243,25 +262,35 @@ float GetTolerance(int min, int max) {
   return kQuantizedTolerance;
 }
 
-TEST(QuantizedMulOpTest, WithBroadcast) {
+template <TensorType tensor_type, typename integer_dtype>
+void WithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedMulOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},  // always a scalar
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedMulOpTest, WithBroadcastUInt8) {
+  WithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, WithBroadcastInt8) {
+  WithBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index d15a5a08af38672cbdaef76ff8a37c42d6e6f226..e26abaaff1e5c9e460621048eb15d0549b81fb36 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -39,8 +39,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
   if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
-      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 &&
-      input0->type != kTfLiteInt64) {
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt8 &&
+      input0->type != kTfLiteInt16 && input0->type != kTfLiteInt64) {
     context->ReportError(context, "Type '%s' is not supported by pack.",
                          TfLiteTypeGetName(input0->type));
     return kTfLiteError;
@@ -106,6 +106,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<uint8_t>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteInt8: {
+      PackImpl<int8_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     case kTfLiteInt32: {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index 530cc2e50f0fe640cc5b120b8bbb1bade7e996fc..f44111567fc34f17912af7db352b47e57f8704f3 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -191,6 +191,37 @@ TEST(PackOpTest, Uint8MultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+// int8
+TEST(PackOpTest, Int8ThreeInputs) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, Int8ThreeInputsDifferentAxis) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, Int8MultilDimensions) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc
index 8e6ed6e741f782f070714164a7af7b4f98a1558f..b60b3dd9c871bf864492505dd9fa4aabf496364c 100644
--- a/tensorflow/lite/kernels/pad.cc
+++ b/tensorflow/lite/kernels/pad.cc
@@ -214,6 +214,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         }
       }
     } break;
+    case kTfLiteInt8: {
+      int8_t pad_value;
+      if (op_context.constant_values == nullptr) {
+        // Quantized Pad requires that 0 is represented in the quantized
+        // range.
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                    std::numeric_limits<int8_t>::min());
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                    std::numeric_limits<int8_t>::max());
+        pad_value = static_cast<int8_t>(op_context.output->params.zero_point);
+      } else {
+        // Quantized Pad requires that 'constant_values' is represented in the
+        // same quantized range as the input and output tensors.
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
+                          op_context.constant_values->params.zero_point);
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.scale,
+                          op_context.constant_values->params.scale);
+        pad_value = *GetTensorData<int8_t>(op_context.constant_values);
+      }
+      if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+        TF_LITE_PAD(reference_ops, PadImageStyle, int8_t, pad_value);
+      } else {
+        TF_LITE_PAD(reference_ops, Pad, int8_t, pad_value);
+      }
+    } break;
     case kTfLiteInt32: {
       int32_t pad_value =
           op_context.constant_values == nullptr
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 3caa4065dcbadd699ee9e61b8e97a42281d32309..ca246e9c3464664f492d85e8f41497da583a3989 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
@@ -24,31 +25,37 @@ namespace {
 using ::testing::ElementsAreArray;
 using ::testing::Matcher;
 
-template <typename T>
+template <typename RegularInputOuput>
 class PadOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<T> data) {
-    PopulateTensor<T>(input_, data);
+  void SetInput(std::initializer_list<RegularInputOuput> data) {
+    PopulateTensor<RegularInputOuput>(input_, data);
   }
 
+  template <typename QuantizedInputOutput>
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<QuantizedInputOutput>(input_, data);
   }
 
+  template <typename QuantizedInputOutput>
   void SetQuantizedPadValue(float data) {
-    QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+    QuantizeAndPopulate<QuantizedInputOutput>(constant_values_, {data});
   }
 
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
 
-  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<RegularInputOuput> GetOutput() {
+    return ExtractVector<RegularInputOuput>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  template <typename QuantizedInputOutput>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<QuantizedInputOutput>(
+        ExtractVector<QuantizedInputOutput>(output_), GetScale(output_),
+        GetZeroPoint(output_));
   }
 
  protected:
@@ -59,18 +66,18 @@ class PadOpModel : public SingleOpModel {
 };
 
 // Tests case where paddings is a const tensor. Type T is the dtype.
-template <typename T>
-class PadV2OpConstModel : public PadOpModel<T> {
+template <typename T1>
+class PadV2OpConstModel : public PadOpModel<T1> {
  public:
   PadV2OpConstModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
-                    std::initializer_list<int> paddings, T constant_values,
+                    std::initializer_list<int> paddings, T1 constant_values,
                     const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ =
         this->AddConstInput(TensorType_INT32, paddings, paddings_shape);
     this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+        this->AddConstInput(GetTensorType<T1>(), {constant_values}, {1});
 
     this->output_ = this->AddOutput(output);
 
@@ -109,7 +116,7 @@ class PadOpConstModel : public PadOpModel<float> {
                   std::initializer_list<int> paddings_shape,
                   std::initializer_list<int> paddings,
                   const TensorData& output) {
-    input_ = AddInput(input);
+    this->input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
     constant_values_ = AddNullInput();
     output_ = AddOutput(output);
@@ -121,16 +128,17 @@ class PadOpConstModel : public PadOpModel<float> {
 };
 
 // Test case where paddings is a non-const tensor.
-template <typename T>
-class PadV2OpDynamicModel : public PadOpModel<T> {
+template <typename RegularInputOuput>
+class PadV2OpDynamicModel : public PadOpModel<RegularInputOuput> {
  public:
   PadV2OpDynamicModel(const TensorData& input,
                       std::initializer_list<int> paddings_shape,
-                      T constant_values, const TensorData& output) {
+                      RegularInputOuput constant_values,
+                      const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ = this->AddInput(TensorType_INT32);
-    this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+    this->constant_values_ = this->AddConstInput(
+        GetTensorType<RegularInputOuput>(), {constant_values}, {1});
     this->output_ = this->AddOutput(output);
 
     this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
@@ -164,14 +172,14 @@ class PadOpDynamicModel : public PadOpModel<float> {
   PadOpDynamicModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
                     const TensorData& output) {
-    input_ = AddInput(input);
-    paddings_ = AddInput(TensorType_INT32);
-    constant_values_ = AddNullInput();
-    output_ = AddOutput(output);
+    this->input_ = this->AddInput(input);
+    this->paddings_ = this->AddInput(TensorType_INT32);
+    this->constant_values_ = this->AddNullInput();
+    this->output_ = this->AddOutput(output);
 
-    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
-                 CreatePadOptions(builder_).Union());
-    BuildInterpreter({input.shape, paddings_shape});
+    this->SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                       CreatePadOptions(this->builder_).Union());
+    this->BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
@@ -299,61 +307,83 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-class QuantizedPadOpTest : public ::testing::Test {
- protected:
-  std::vector<Matcher<float>> DequantizedArrayNear(
-      const std::vector<float>& values, const float min, const float max) {
-    const float quantization_tolerance = (max - min) / 255.0;
-    return ArrayFloatNear(values, quantization_tolerance);
-  }
-};
+std::vector<Matcher<float>> DequantizedArrayNear(
+    const std::vector<float>& values, const float min, const float max) {
+  const float quantization_tolerance = (max - min) / 255.0;
+  return ArrayFloatNear(values, quantization_tolerance);
+}
+
+class QuantizedPadOpTest : public ::testing::Test {};
 
 #ifdef GTEST_HAS_DEATH_TEST
-TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+template <typename integer_type, TensorType tensor_dtype>
+void ZeroNotInQuantizationRange() {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
-  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
-                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                                 {TensorType_UINT8, {}, 1.0, 2.0}),
-               ".*Check failed: f_min <= 0.*");
+  EXPECT_DEATH(
+      PadOpConstModel m({tensor_dtype, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+                        {0, 0, 1, 1, 1, 1, 0, 0}, {tensor_dtype, {}, 1.0, 2.0}),
+      ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, UInt8ZeroNotInQuantizationRange) {
+  ZeroNotInQuantizationRange<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadOpTest, Int8ZeroNotInQuantizationRange) {
+  ZeroNotInQuantizationRange<int8_t, TensorType_INT8>();
 }
 #endif
 
-TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleConstTest() {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
-                    {0, 0, 1, 1, 1, 1, 0, 0},
-                    {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  PadOpConstModel m({tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
                   -1.0, 1.0)));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
-                      {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+TEST_F(QuantizedPadOpTest, UInt8SimpleConstTest) {
+  SimpleConstTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadOpTest, Int8SimpleConstTest) {
+  SimpleConstTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleDynamicTest() {
+  PadOpDynamicModel m({tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
                   -1.0, 1.0)));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
-                    {0, 0, 0, 2, 1, 3, 0, 0},
-                    {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+TEST_F(QuantizedPadOpTest, UInt8SimpleDynamicTest) {
+  SimpleDynamicTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadOpTest, Int8SimpleDynamicTest) {
+  SimpleDynamicTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedConstTest() {
+  PadOpConstModel m({tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
                    0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
@@ -361,13 +391,21 @@ TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
-                      {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+TEST_F(QuantizedPadOpTest, UInt8AdvancedConstTest) {
+  AdvancedConstTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadOpTest, Int8AdvancedConstTest) {
+  AdvancedConstTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedDynamicTest() {
+  PadOpDynamicModel m({tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
                    0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
@@ -375,31 +413,38 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+TEST_F(QuantizedPadOpTest, UInt8AdvancedDynamicTest) {
+  AdvancedDynamicTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadOpTest, Int8AdvancedDynamicTest) {
+  AdvancedDynamicTest<int8_t, TensorType_INT8>();
+}
+
 #ifdef GTEST_HAS_DEATH_TEST
 TEST(PadV2OpTest, TooManyDimensions) {
-  EXPECT_DEATH(PadV2OpConstModel<float>(
-                   {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
-                   {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
-                   {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                 {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
+                 {TensorType_FLOAT32}),
                "dims <= 4");
 }
 
 TEST(PadV2OpTest, UnequalDimensions) {
-  EXPECT_DEATH(
-      PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
-                               {1, 1, 2, 2, 3, 3}, 0.0, {TensorType_FLOAT32}),
-      "3 != 4");
+  typedef PadV2OpConstModel<float> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2}, {1, 1, 2, 2, 3, 3},
+                 0.0, {TensorType_FLOAT32}),
+               "3 != 4");
 }
 
 TEST(PadV2OpTest, InvalidPadValue) {
-  EXPECT_DEATH(PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}},
-                                        {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}, 0.0,
-                                        {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                 {0, 0, 1, -1, 2, -1, 0, 0}, 0.0, {TensorType_FLOAT32}),
                "Pad value has to be greater than equal to 0.");
 }
 #endif
 
-TEST(PadV2OpTest, SimpleConstTest) {
+TEST(PadV2OpTest, SimpleConstTestUint8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
   PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
@@ -412,7 +457,32 @@ TEST(PadV2OpTest, SimpleConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
+TEST(PadV2OpTest, SimpleConstTestInt8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                             {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                             {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestUint8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                             {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestInt8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
   PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
@@ -502,58 +572,80 @@ class QuantizedPadV2OpTest : public ::testing::Test {
 };
 
 #ifdef GTEST_HAS_DEATH_TEST
-TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
+template <TensorType tensor_dtype>
+void ZeroNotInQuantizationRangeV2() {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
-  EXPECT_DEATH(
-      PadV2OpConstModel<float> m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
-                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0,
-                                 {TensorType_UINT8, {}, 1.0, 2.0}),
-      ".*Check failed: f_min <= 0.*");
+  typedef PadV2OpConstModel<float> f;
+  EXPECT_DEATH(f({tensor_dtype, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+                 {0, 0, 1, 1, 1, 1, 0, 0}, 0, {tensor_dtype, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadV2OpTest, UInt8ZeroNotInQuantizationRange) {
+  ZeroNotInQuantizationRangeV2<TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8ZeroNotInQuantizationRange) {
+  ZeroNotInQuantizationRangeV2<TensorType_INT8>();
 }
 #endif
 
-TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleConstTestV2() {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.SetQuantizedPadValue(0);
+  PadV2OpConstModel<integer_type> m(
+      {tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+      {tensor_dtype, {1}, -1.0, 1.0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
+  m.template SetQuantizedPadValue<integer_type>(0);
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
                   -1.0, 1.0)));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.SetQuantizedPadValue(0);
+TEST_F(QuantizedPadV2OpTest, UInt8SimpleConstTest) {
+  SimpleConstTestV2<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8SimpleConstTest) {
+  SimpleConstTestV2<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleDynamicTestV2() {
+  PadV2OpDynamicModel<integer_type> m({tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0},
+                                      {4, 2}, {tensor_dtype, {1}, -1.0, 1.0},
+                                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
+  m.template SetQuantizedPadValue<integer_type>(0);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
                   -1.0, 1.0)));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.SetQuantizedPadValue(0);
+TEST_F(QuantizedPadV2OpTest, UInt8SimpleDynamicTest) {
+  SimpleDynamicTestV2<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8SimpleDynamicTest) {
+  SimpleDynamicTestV2<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedConstTestV2() {
+  PadV2OpConstModel<integer_type> m(
+      {tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
+      {tensor_dtype, {1}, -1.0, 1.0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.template SetQuantizedPadValue<integer_type>(0);
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
                    0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
@@ -561,15 +653,23 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.SetQuantizedPadValue(0);
+TEST_F(QuantizedPadV2OpTest, UInt8AdvancedConstTest) {
+  AdvancedConstTestV2<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8AdvancedConstTest) {
+  AdvancedConstTestV2<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedDynamicTestV2() {
+  PadV2OpDynamicModel<integer_type> m({tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0},
+                                      {4, 2}, {tensor_dtype, {1}, -1.0, 1.0},
+                                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.template SetQuantizedPadValue<integer_type>(0);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
                    0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
@@ -577,17 +677,24 @@ TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
+TEST_F(QuantizedPadV2OpTest, UInt8AdvancedDynamicTest) {
+  AdvancedDynamicTestV2<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8AdvancedDynamicTest) {
+  AdvancedDynamicTestV2<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleConstValuedTest() {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.SetQuantizedPadValue(-0.5);
+  PadV2OpConstModel<integer_type> m(
+      {tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+      {tensor_dtype, {1}, -1.0, 1.0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
+  m.template SetQuantizedPadValue<integer_type>(-0.5);
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {-0.5, -0.5, -0.5, -0.5, -0.5, -0.8, 0.2, -0.5, -0.5, 0.9,
                    0.7, -0.5, -0.5, -0.5, -0.5, -0.5},
@@ -595,15 +702,23 @@ TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
-  m.SetQuantizedPadValue(-0.5);
+TEST_F(QuantizedPadV2OpTest, UInt8SimpleConstValuedTest) {
+  SimpleConstValuedTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8SimpleConstValuedTest) {
+  SimpleConstValuedTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void SimpleDynamicValuedTest() {
+  PadV2OpDynamicModel<integer_type> m({tensor_dtype, {1, 2, 2, 1}, -1.0, 1.0},
+                                      {4, 2}, {tensor_dtype, {1}, -1.0, 1.0},
+                                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7});
+  m.template SetQuantizedPadValue<integer_type>(-0.5);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {-0.5, -0.5, -0.5, -0.5, -0.5, -0.8, 0.2, -0.5, -0.5, 0.9,
                    0.7, -0.5, -0.5, -0.5, -0.5, -0.5},
@@ -611,15 +726,22 @@ TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.SetQuantizedPadValue(-0.5);
+TEST_F(QuantizedPadV2OpTest, UInt8SimpleDynamicValuedTest) {
+  SimpleDynamicValuedTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8SimpleDynamicValuedTest) {
+  SimpleDynamicValuedTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedConstValuedTest() {
+  PadV2OpConstModel<integer_type> m(
+      {tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
+      {tensor_dtype, {1}, -1.0, 1.0}, {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.template SetQuantizedPadValue<integer_type>(-0.5);
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {-0.5, -0.8, 0.2,  0.9,  -0.5, -0.5, -0.5, -0.5, 0.7,  0.1,
                    -0.3, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5,
@@ -628,15 +750,23 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
-TEST_F(QuantizedPadV2OpTest, AdvancedDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
-  m.SetQuantizedPadValue(-0.5);
+TEST_F(QuantizedPadV2OpTest, UInt8AdvancedConstValuedTest) {
+  AdvancedConstValuedTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8AdvancedConstValuedTest) {
+  AdvancedConstValuedTest<int8_t, TensorType_INT8>();
+}
+
+template <typename integer_type, TensorType tensor_dtype>
+void AdvancedDynamicValuedTest() {
+  PadV2OpDynamicModel<integer_type> m({tensor_dtype, {1, 2, 3, 1}, -1.0, 1.0},
+                                      {4, 2}, {tensor_dtype, {1}, -1.0, 1.0},
+                                      {tensor_dtype, {}, -1.0, 1.0});
+  m.template SetQuantizedInput<integer_type>({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.template SetQuantizedPadValue<integer_type>(-0.5);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.template GetDequantizedOutput<integer_type>(),
               ElementsAreArray(DequantizedArrayNear(
                   {-0.5, -0.8, 0.2,  0.9,  -0.5, -0.5, -0.5, -0.5, 0.7,  0.1,
                    -0.3, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5,
@@ -645,6 +775,13 @@ TEST_F(QuantizedPadV2OpTest, AdvancedDynamicValuedTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+TEST_F(QuantizedPadV2OpTest, UInt8AdvancedDynamicValuedTest) {
+  AdvancedDynamicValuedTest<uint8_t, TensorType_UINT8>();
+}
+TEST_F(QuantizedPadV2OpTest, Int8AdvancedDynamicValuedTest) {
+  AdvancedDynamicValuedTest<int8_t, TensorType_INT8>();
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index e6155fcb8c67ed3b5e676c2530ec7966d6cec56f..bdf736dcfb278ad93f43c25b9ae1c0b4038b695f 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -226,9 +226,9 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 template <KernelType kernel_type>
-void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLitePoolParams* params, OpData* data,
-                      const TfLiteTensor* input, TfLiteTensor* output) {
+void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
+                           TfLitePoolParams* params, OpData* data,
+                           const TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -254,6 +254,31 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_MAX_POOL
 }
 
+template <KernelType kernel_type>
+void MaxEvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
+                          TfLitePoolParams* params, OpData* data,
+                          const TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeInt8(params->activation, output, &activation_min,
+                               &activation_max);
+#define TF_LITE_MAX_POOL(type)                                        \
+  tflite::PoolParams op_params;                                       \
+  op_params.stride_height = params->stride_height;                    \
+  op_params.stride_width = params->stride_width;                      \
+  op_params.filter_height = params->filter_height;                    \
+  op_params.filter_width = params->filter_width;                      \
+  op_params.padding_values.height = data->padding.height;             \
+  op_params.padding_values.width = data->padding.width;               \
+  op_params.quantized_activation_min = activation_min;                \
+  op_params.quantized_activation_max = activation_max;                \
+  type::MaxPool(op_params, GetTensorShape(input),                     \
+                GetTensorData<int8_t>(input), GetTensorShape(output), \
+                GetTensorData<int8_t>(output))
+  TF_LITE_MAX_POOL(reference_integer_ops);
+#undef TF_LITE_MAX_POOL
+}
+
 template <KernelType kernel_type>
 void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
                  TfLitePoolParams* params, OpData* data,
@@ -321,7 +346,12 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
       MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
       break;
     case kTfLiteUInt8:
-      MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
+      MaxEvalQuantizedUInt8<kernel_type>(context, node, params, data, input,
+                                         output);
+      break;
+    case kTfLiteInt8:
+      MaxEvalQuantizedInt8<kernel_type>(context, node, params, data, input,
+                                        output);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index e1b79340115ad18e50ecdb6944904bf2ab7c9e84..4627d7a5f0c2803635b9df85dd9275cc7851e8fb 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -97,6 +97,24 @@ class SymmetricQuantizedPoolingOpModel : public BasePoolingOpModel {
   }
 };
 
+// Replicate each entry in a vector n times along depth (innermost dimension).
+// The values are incremented by delta, creating ramps offset by each input
+// value. This is used to create simple and predicatable variation.
+std::vector<float> ReplicateDepthRamp(const std::vector<float>& image_plane,
+                                      int n, float delta) {
+  const int size = image_plane.size();
+  std::vector<float> ramped_data(n * size);
+  // The input is treated as a 1-D even if logically it is multi-dimensional.
+  for (int input_index = 0; input_index < size; ++input_index) {
+    for (int depth = 0; depth < n; ++depth) {
+      ramped_data[n * input_index + depth] =
+          image_plane[input_index] + depth * delta;
+    }
+  }
+
+  return ramped_data;
+}
+
 TEST(FloatPoolingOpTest, AveragePool) {
   FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
@@ -147,6 +165,31 @@ TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
   EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
 }
 
+TEST(QuantizedPoolingOpTest, AveragePoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({2.75f, 5.75f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
 // Test quantized AveragePool with int8 input and output. The input is the same
 // as the uint8 test QuantizedPoolingOpTest.AveragePool. The float output is
 // identical to uint8 test and quantized output is identical to uint8 test with
@@ -204,7 +247,7 @@ TEST(FloatPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 10}));
 }
 
-TEST(QuantizedPoolingOpTest, MaxPool) {
+TEST(QuantizedUInt8PoolingOpTest, MaxPool) {
   // Choose the input ranges carefully so that the dequantized output matches
   // the results of the float model above.
   QuantizedPoolingOpModel m(
@@ -223,6 +266,50 @@ TEST(QuantizedPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({96, 160}));
 }
 
+TEST(QuantizedPoolingOpTest, MaxPoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({6.f, 10.f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
+TEST(QuantizedInt8PoolingOpTest, MaxPool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  SymmetricQuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_INT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, 2, -10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({3, 7})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-80, -16}));
+}
+
 TEST(FloatPoolingOpTest, L2Pool) {
   FloatPoolingOpModel m(BuiltinOperator_L2_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cef1f53a024b833034deb497909beac4b4753e6
--- /dev/null
+++ b/tensorflow/lite/kernels/rank.cc
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace rank {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = kTfLiteInt32;
+
+  // Rank produces a 0-D int32 Tensor representing the rank of input.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(0);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 0);
+
+  if (output->type == kTfLiteInt32) {
+    int32_t* output_data = GetTensorData<int32_t>(output);
+    *output_data = NumDimensions(input);
+  } else {
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace rank
+
+TfLiteRegistration* Register_RANK() {
+  static TfLiteRegistration r = {nullptr, nullptr, rank::Prepare, rank::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c31fc5866931708eb8155c2dc88026b623039ed
--- /dev/null
+++ b/tensorflow/lite/kernels/rank_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class RankOpModel : public SingleOpModel {
+ public:
+  RankOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    TensorType output_type = TensorType_INT32;
+    input_ = AddInput(input_type);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_RANK, BuiltinOptions_RankOptions,
+                 CreateRankOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+  int input() { return input_; }
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(RankOpTest, InputTypeFloat) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, InputTypeInt) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, ScalarTensor) {
+  RankOpModel model({}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, EmptyTensor) {
+  RankOpModel model({1, 0}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index 336e827ca4c76abf3a08492249dfc0ce9cd81439..a0f1126048ea43458eae9ce327d31f6c1c9ead17 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -17,7 +17,10 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
@@ -35,6 +38,13 @@ enum KernelType {
   kReference,
 };
 
+struct OpData {
+  int32_t multiplier;
+  int shift;
+  // The index of the temporary tensor where the quantized inputs are cached.
+  int scratch_tensor_index;
+};
+
 struct OpContext {
   OpContext(TfLiteContext* context, TfLiteNode* node) {
     params = reinterpret_cast<TfLiteReducerParams*>(node->builtin_data);
@@ -49,15 +59,17 @@ struct OpContext {
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  gemm_support::IncrementUsageCounter(context);
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
-  auto* scratch_tensor_index = new int;
-  context->AddTensors(context, 3, scratch_tensor_index);
-  return scratch_tensor_index;
+  auto* op_data = new OpData();
+  context->AddTensors(context, 3, &op_data->scratch_tensor_index);
+  return op_data;
 }
 
 void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<int*>(buffer);
+  gemm_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<OpData*>(buffer);
 }
 
 // Resizes the temp tensor that stores resolved axis.
@@ -148,10 +160,10 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context, OpContext* op_context) {
 TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
                                    OpContext* op_context) {
   // Creates a temp index to iterate through input data.
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
   TfLiteIntArrayFree(node->temporaries);
   node->temporaries = TfLiteIntArrayCreate(3);
-  node->temporaries->data[0] = *scratch_tensor_index;
+  node->temporaries->data[0] = op_data->scratch_tensor_index;
   TfLiteTensor* scratch_tensor = GetTemporary(context, node, /*index=*/0);
   scratch_tensor->type = kTfLiteInt32;
   scratch_tensor->allocation_type = kTfLiteArenaRw;
@@ -161,11 +173,11 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
                     context->ResizeTensor(context, scratch_tensor, index_size));
 
   // Creates a temp tensor to store resolved axis given input data.
-  node->temporaries->data[1] = *scratch_tensor_index + 1;
+  node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
   TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1);
   resolved_axis->type = kTfLiteInt32;
   // Creates a temp tensor to store temp sums when calculating mean.
-  node->temporaries->data[2] = *scratch_tensor_index + 2;
+  node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
   TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2);
   switch (op_context->input->type) {
     case kTfLiteFloat32:
@@ -180,6 +192,9 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
     case kTfLiteUInt8:
       temp_sum->type = kTfLiteInt32;
       break;
+    case kTfLiteInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
     case kTfLiteBool:
       temp_sum->type = kTfLiteBool;
       break;
@@ -219,9 +234,18 @@ TfLiteStatus PrepareAny(TfLiteContext* context, TfLiteNode* node) {
 
 TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, PrepareSimple(context, node));
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   // reduce_mean requires a buffer to store intermediate sum result.
   OpContext op_context(context, node);
+  if (op_context.input->type == kTfLiteInt8) {
+    const double real_multiplier =
+        static_cast<double>(op_context.input->params.scale) /
+        static_cast<double>(op_context.output->params.scale);
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->multiplier, &exponent);
+    data->shift = exponent;
+  }
   TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2);
   if (!IsConstantTensor(op_context.axis)) {
     SetTensorToDynamic(temp_sum);
@@ -245,6 +269,8 @@ void ResolveAxis(const int* axis_data, int axis_count,
 template <KernelType kernel_type>
 TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0);
   TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1);
@@ -257,6 +283,51 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum));
   }
 
+  // Defer to specialized implementation for 4D Mean across axes 1 & 2.
+  if (op_context.input->type == kTfLiteFloat32 ||
+      op_context.input->type == kTfLiteUInt8) {
+    tflite::MeanParams op_params;
+    op_params.axis_count = num_axis;
+    ResolveAxis(GetTensorData<int>(op_context.axis), num_axis, &op_params);
+    const TfLiteTensor* input = op_context.input;
+    if (op_context.params->keep_dims && NumDimensions(input) == 4 &&
+        op_params.axis_count == 2 &&
+        ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+         (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
+      if (op_context.input->type == kTfLiteUInt8) {
+        gemmlowp::GemmContext* gemm_context =
+            gemm_support::GetFromContext(context);
+        optimized_ops::Mean(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            op_context.input->params.zero_point, op_context.input->params.scale,
+            GetTensorShape(op_context.output),
+            GetTensorData<uint8_t>(op_context.output),
+            op_context.output->params.zero_point,
+            op_context.output->params.scale, gemm_context);
+      } else {
+        reference_ops::Mean(op_params, GetTensorShape(input),
+                            GetTensorData<float>(input),
+                            GetTensorShape(op_context.output),
+                            GetTensorData<float>(op_context.output));
+      }
+      return kTfLiteOk;
+    }
+  }
+
+  if (op_context.input->type == kTfLiteInt8) {
+    tflite::MeanParams op_params;
+    op_params.axis_count = num_axis;
+    ResolveAxis(GetTensorData<int>(op_context.axis), num_axis, &op_params);
+    const TfLiteTensor* input = op_context.input;
+    reference_integer_ops::Mean(
+        op_params, data->multiplier, data->shift, GetTensorShape(input),
+        GetTensorData<int8_t>(input), op_context.input->params.zero_point,
+        GetTensorShape(op_context.output),
+        GetTensorData<int8_t>(op_context.output),
+        op_context.output->params.zero_point);
+    return kTfLiteOk;
+  }
+
 #define TF_LITE_MEAN(kernel_type, data_type, temp_data_type)        \
   kernel_type::Mean<>(                                              \
       GetTensorData<data_type>(op_context.input),                   \
@@ -436,6 +507,9 @@ TfLiteStatus EvalGeneric(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:
       return EvalType<uint8_t>(context, node, &op_context, reduce_type);
       break;
+    case kTfLiteInt8:
+      return EvalType<int8_t>(context, node, &op_context, reduce_type);
+      break;
     case kTfLiteBool:
       return EvalType<bool>(context, node, &op_context, reduce_type);
       break;
diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
index c1526bddb719e74a6396dc4aeac4b5827220a65a..dd852b90aef13a307649f8843ec1b3056860e030 100644
--- a/tensorflow/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -38,9 +38,10 @@ class BaseOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
@@ -256,6 +257,66 @@ TEST(ConstFloatMeanOpTest, KeepDims) {
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
+// Uses a set of reduction conditions that trigger the specialized 4D version
+// of Mean.
+TEST(ConstFloatMeanOpTest, KeepDims4DMean) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {2, 2, 3, 2}},
+                     {TensorType_FLOAT32, {3}}, {2}, {1, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({6, 7, 18, 19})));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanUInt8) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
+                             0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 2, 3}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 3}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({0.25098, 0.25098, 0.25098},
+                                              kQuantizedTolerance)));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanLargeDepthUInt8) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1,
+                             0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9,
+                             0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3,
+                             0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 2, 9}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 9}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.35, 0.325, 0.2, 0.35, 0.375, 0.325, 0.225, 0.45, 0.425},
+                  kQuantizedTolerance)));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanQuantized) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
+                             0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 3, 2}, 0.0, 1.0},
+                     {TensorType_UINT8, {3}, -5.0, 5.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.235294, 0.313726}, kQuantizedTolerance)));
+}
+
 TEST(ConstFloatMeanOpTest, Scalar) {
   std::vector<float> data = {3.27};
   MeanOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {},
@@ -318,8 +379,9 @@ TEST(ConstUint8MeanOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {0.4, 0.4}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.4, 0.4}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8MeanOpTest, KeepDims) {
@@ -331,10 +393,44 @@ TEST(ConstUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
+TEST(ConstInt8MeanOpTest, QuantizedSameScale) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1,
+                             0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9,
+                             0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3,
+                             0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_INT8, {1, 2, 2, 9}, -1.0, 1.0},
+                     {TensorType_INT8, {2}, -1.0, 1.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 9}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.35, 0.325, 0.2, 0.35, 0.375, 0.325, 0.225, 0.45, 0.425},
+                  kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MeanOpTest, QuantizedDifferentScale) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1,
+                             0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9,
+                             0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3,
+                             0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_INT8, {1, 2, 2, 9}, -1.0, 1.0},
+                     {TensorType_INT8, {2}, -4.0, 4.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 9}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.35, 0.325, 0.2, 0.35, 0.375, 0.325, 0.225, 0.45, 0.425},
+                  kQuantizedTolerance)));
+}
+
 TEST(DynamicUint8MeanOpTest, NotKeepDims) {
   float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
   std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
@@ -347,7 +443,7 @@ TEST(DynamicUint8MeanOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
 }
 
@@ -363,7 +459,7 @@ TEST(DynamicUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
@@ -377,7 +473,7 @@ TEST(DynamicUint8MeanOpTest, QuantizedScalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance)));
 }
 
@@ -390,7 +486,7 @@ TEST(ConstUint8MeanOpTest, QuantizedKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
@@ -483,7 +579,7 @@ TEST(ConstUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({-0.823529, -0.815686}, kQuantizedTolerance)));
 }
@@ -496,8 +592,9 @@ TEST(ConstUint8SumOpTest, NotKeepDimsRescaling) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {1.2, 1.2}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({1.2, 1.2}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8SumOpTest, KeepDims) {
@@ -508,7 +605,7 @@ TEST(ConstUint8SumOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({-0.407843, -0.313726, 0.0941177},
                                               kQuantizedTolerance)));
 }
@@ -524,7 +621,7 @@ TEST(DynamicUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.48235, 1.64706}, kQuantizedTolerance)));
 }
@@ -541,7 +638,7 @@ TEST(DynamicUint8SumOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({6.47059, 10.698}, kQuantizedTolerance)));
 }
 
@@ -698,7 +795,20 @@ TEST(ConstUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
 }
@@ -711,7 +821,20 @@ TEST(ConstUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
 }
@@ -727,7 +850,23 @@ TEST(DynamicUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
 }
@@ -743,7 +882,23 @@ TEST(DynamicUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
 }
@@ -758,7 +913,21 @@ TEST(DynamicUint8MaxOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MaxOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
@@ -840,7 +1009,20 @@ TEST(ConstUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
 }
 
@@ -853,7 +1035,20 @@ TEST(ConstUint8MinOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
 }
 
@@ -869,7 +1064,23 @@ TEST(DynamicUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
 }
 
@@ -884,7 +1095,23 @@ TEST(DynamicUint8MinOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
 }
@@ -899,7 +1126,21 @@ TEST(DynamicUint8MinOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MinOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index df2b15fe83ad7e86ac87e64cf97953f3f77f2b59..b11cb9938c8d2410b6093d92ed6693f346d9c1d3 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -22,10 +22,10 @@ namespace ops {
 namespace custom {
 
 TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
 TfLiteRegistration* Register_MFCC();
 TfLiteRegistration* Register_DETECTION_POSTPROCESS();
-TfLiteRegistration* Register_RELU_1();
+TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
 
 }  // namespace custom
 
@@ -94,6 +94,7 @@ TfLiteRegistration* Register_GREATER_EQUAL();
 TfLiteRegistration* Register_LESS();
 TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
+TfLiteRegistration* Register_CEIL();
 TfLiteRegistration* Register_TILE();
 TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SUM();
@@ -104,6 +105,7 @@ TfLiteRegistration* Register_REDUCE_ANY();
 TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_COS();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
 TfLiteRegistration* Register_EXPAND_DIMS();
 TfLiteRegistration* Register_SPARSE_TO_DENSE();
@@ -112,6 +114,7 @@ TfLiteRegistration* Register_NOT_EQUAL();
 TfLiteRegistration* Register_SQRT();
 TfLiteRegistration* Register_RSQRT();
 TfLiteRegistration* Register_SHAPE();
+TfLiteRegistration* Register_RANK();
 TfLiteRegistration* Register_POW();
 TfLiteRegistration* Register_FAKE_QUANT();
 TfLiteRegistration* Register_PACK();
@@ -130,6 +133,12 @@ TfLiteRegistration* Register_SQUARED_DIFFERENCE();
 TfLiteRegistration* Register_FILL();
 TfLiteRegistration* Register_MIRROR_PAD();
 TfLiteRegistration* Register_UNIQUE();
+TfLiteRegistration* Register_REVERSE_V2();
+TfLiteRegistration* Register_ADD_N();
+TfLiteRegistration* Register_GATHER_ND();
+TfLiteRegistration* Register_WHERE();
+TfLiteRegistration* Register_ELU();
+TfLiteRegistration* Register_REVERSE_SEQUENCE();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -162,18 +171,26 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_ABS, Register_ABS());
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
   AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
-  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
-  AddBuiltin(BuiltinOperator_TANH, Register_TANH());
-  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
-  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
-  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
   AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(),
              /* min_version */ 1,
              /* max_version */ 2);
@@ -195,16 +212,28 @@ BuiltinOpResolver::BuiltinOpResolver() {
              Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
              /* min_version */ 1,
-             /* max_version */ 3);
+             /* max_version */ 4);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
-  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
-  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
-  AddBuiltin(BuiltinOperator_ADD, Register_ADD());
-  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND());
-  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
+  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MUL, Register_MUL());
-  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
+  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
@@ -215,62 +244,115 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
              Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_PAD, Register_PAD());
-  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
-  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
-             Register_RESIZE_NEAREST_NEIGHBOR());
+             Register_RESIZE_NEAREST_NEIGHBOR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
-  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
-  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
-  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
-  AddBuiltin(BuiltinOperator_SUB, Register_SUB());
-  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(), /* min_version */ 1,
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
-  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
-  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
-  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
              /* min_version */ 1,
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
-  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
-  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
-  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
-  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
-  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
-  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
-  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
-  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
+  AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
-  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
-  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_COS, Register_COS());
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
   AddBuiltin(BuiltinOperator_TILE, Register_TILE());
   AddBuiltin(BuiltinOperator_SUM, Register_SUM());
   AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
-  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
-  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
+  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY());
   AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
-  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
-  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
   AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
   AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+  AddBuiltin(BuiltinOperator_RANK, Register_RANK());
   AddBuiltin(BuiltinOperator_POW, Register_POW());
   AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
-  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
   AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
   AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
@@ -286,16 +368,24 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_FILL, Register_FILL());
   AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
   AddBuiltin(BuiltinOperator_UNIQUE, Register_UNIQUE());
+  AddBuiltin(BuiltinOperator_REVERSE_V2, Register_REVERSE_V2());
+  AddBuiltin(BuiltinOperator_ADD_N, Register_ADD_N());
+  AddBuiltin(BuiltinOperator_GATHER_ND, Register_GATHER_ND());
+  AddBuiltin(BuiltinOperator_WHERE, Register_WHERE());
+  AddBuiltin(BuiltinOperator_ELU, Register_ELU());
+  AddBuiltin(BuiltinOperator_REVERSE_SEQUENCE, Register_REVERSE_SEQUENCE());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
   // AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
   // AddCustom("AudioSpectrogram",
   //          tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
-  AddCustom("LayerNormLstm", tflite::ops::custom::Register_LAYER_NORM_LSTM());
-  AddCustom("Relu1", tflite::ops::custom::Register_RELU_1());
   AddCustom("TFLite_Detection_PostProcess",
             tflite::ops::custom::Register_DETECTION_POSTPROCESS());
+
+  // WARNING: Control flow ops are experimental and subject to change.
+  AddCustom("Experimental_If", tflite::ops::custom::Register_IF());
+  AddCustom("Experimental_While", tflite::ops::custom::Register_WHILE());
 }
 
 }  // namespace builtin
diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc
index 6840ea39bf243f476f7935ed85a53aacb044e498..faa864b0e236e4a61453c6fcecafd2ca09f41ce1 100644
--- a/tensorflow/lite/kernels/register_ref.cc
+++ b/tensorflow/lite/kernels/register_ref.cc
@@ -22,10 +22,8 @@ namespace ops {
 namespace custom {
 
 TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
 TfLiteRegistration* Register_MFCC();
 TfLiteRegistration* Register_DETECTION_POSTPROCESS();
-TfLiteRegistration* Register_RELU_1();
 
 }  // namespace custom
 
@@ -286,8 +284,6 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() {
   AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
   AddCustom("AudioSpectrogram",
             tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
-  AddCustom("LayerNormLstm", tflite::ops::custom::Register_LAYER_NORM_LSTM());
-  AddCustom("Relu1", tflite::ops::custom::Register_RELU_1());
   AddCustom("TFLite_Detection_PostProcess",
             tflite::ops::custom::Register_DETECTION_POSTPROCESS());
 }
diff --git a/tensorflow/lite/kernels/relu1_test.cc b/tensorflow/lite/kernels/relu1_test.cc
deleted file mode 100644
index f52d10b0b7f32af3444c702835f0674d7181bb7a..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/relu1_test.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_RELU_1();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseActivationsOpModel : public SingleOpModel {
- public:
-  explicit BaseActivationsOpModel(const TensorData& input) {
-    input_ = AddInput(input);
-    output_ = AddOutput({input.type, {}});
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {});
-    fbb.Finish();
-    SetCustomOp("RELU_1", fbb.GetBuffer(), Register_RELU_1);
-    BuildInterpreter({GetShape(input_)});
-  }
-
- protected:
-  int input_;
-  int output_;
-};
-
-class FloatActivationsOpModel : public BaseActivationsOpModel {
- public:
-  using BaseActivationsOpModel::BaseActivationsOpModel;
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-TEST(FloatActivationsOpTest, Relu1) {
-  FloatActivationsOpModel m(/*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
-  m.SetInput({
-      0.0, -0.6, 0.2, -0.4,  //
-      0.3, -2.0, 1.1, -0.1,  //
-  });
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 0.0, 0.0, 0.2, 0.0,  //
-                                 0.3, 0.0, 1.0, 0.0,  //
-                             }));
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc
index e2210aeaf09395fec8fa9462096f61c4fbf19804..eb05eb6d4e1fbb51a07645ecf4788188a8ad9f7c 100644
--- a/tensorflow/lite/kernels/reshape_test.cc
+++ b/tensorflow/lite/kernels/reshape_test.cc
@@ -33,7 +33,7 @@ enum ShapeSpecificationType {
   // Const node, which is guaranteed not to change once inference starts. The
   // shape is also hardcoded as in kAsReshapeOption.
   kAsConstantTensor,
-  // The output shape is specifed as an input tensor that can change based on
+  // The output shape is specified as an input tensor that can change based on
   // external input. That is, the shape is not know before the inference
   // starts. The shape is also hardcoded as in kAsReshapeOption.
   kAsTensor,
@@ -137,21 +137,11 @@ TEST_P(ReshapeOpTest, MismatchedDimensions) {
 #endif
 
 TEST_P(ReshapeOpTest, TooManyDimensions) {
-  if (GetParam() == kAsReshapeOption) {
 #ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
                                        {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam()),
                  "Found too many dimensions");
 #endif
-  } else {
-    ReshapeOpModel<float> m({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
-                            {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam());
-    m.SetInput({3, 4});
-    m.Invoke();
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 4}));
-    EXPECT_THAT(m.GetOutputShape(),
-                ElementsAreArray({1, 1, 1, 1, 1, 1, 1, 1, 2}));
-  }
 }
 
 #ifdef GTEST_HAS_DEATH_TEST
diff --git a/tensorflow/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
index d42cb188669587a957dd085f9ecb123f44b59437..7383d03438c65a710efbfe30f3d3c0ce261f0ca8 100644
--- a/tensorflow/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -109,6 +109,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
       TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t);
     }
+  } else if (output->type == kTfLiteInt8) {
+    TF_LITE_RESIZE_BILINEAR(reference_ops, int8_t);
 #undef TF_LITE_RESIZE_BILINEAR
   } else {
     context->ReportError(context, "Output type is %d, requires float.",
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index d3f4837a287accd93c23e17fa3a361efd4120101..b7b7fcd41578967e9b96797e6a3c1ed23f29d75a 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -64,7 +64,7 @@ class ResizeBilinearOpModel : public SingleOpModel {
 };
 
 TEST(ResizeBilinearOpTest, HorizontalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}}, {});
   m.SetInput<float>({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
@@ -78,8 +78,8 @@ TEST(ResizeBilinearOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
-TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
+TEST(ResizeBilinearOpTest, HorizontalResizeUInt8) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}}, {});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
@@ -93,8 +93,23 @@ TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
+TEST(ResizeBilinearOpTest, HorizontalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 1, 2, 1}}, {});
+  m.SetInput<int8_t>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
 TEST(ResizeBilinearOpTest, VerticalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}}, {});
   m.SetInput<float>({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
@@ -108,8 +123,8 @@ TEST(ResizeBilinearOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
-TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
+TEST(ResizeBilinearOpTest, VerticalResizeUInt8) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}}, {});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
@@ -123,8 +138,23 @@ TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
+TEST(ResizeBilinearOpTest, VerticalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 1, 1}}, {});
+  m.SetInput<int8_t>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {});
   m.SetInput<float>({
       3, 6,  //
       9, 12  //
@@ -150,8 +180,8 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeUInt8) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}}, {});
   m.SetInput<uint8>({
       3, 6,  //
       9, 12  //
@@ -177,8 +207,35 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 1}}, {});
+  m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,    //
+                                         7, 9, 10,   //
+                                         9, 11, 12,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,    //
+                                               7, 9, 10,   //
+                                               9, 11, 12,  //
+                                           })));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}, {});
   m.SetInput<float>({
       3, 6,   //
       9, 12,  //
@@ -215,7 +272,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, {});
   m.SetInput<float>({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
@@ -241,8 +298,8 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}}, {});
   m.SetInput<uint8>({
       3, 6,   //
       9, 12,  //
@@ -278,8 +335,45 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {2, 2, 2, 1}}, {});
+  m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,     //
+                                         7, 9, 10,    //
+                                         9, 11, 12,   //
+                                         4, 8, 10,    //
+                                         9, 12, 13,   //
+                                         12, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,     //
+                                               7, 9, 10,    //
+                                               9, 11, 12,   //
+                                               4, 8, 10,    //
+                                               9, 12, 13,   //
+                                               12, 14, 16,  //
+                                           })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeUInt8) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}}, {});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
       10, 12, 14, 16,  //
@@ -304,6 +398,33 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
                                               10, 12, 12, 14, 14, 16,  //
                                           })));
 }
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 2}}, {});
+  m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 5, 8, 6, 10,       //
+                                         7, 9, 10, 12, 11, 13,    //
+                                         10, 12, 12, 14, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 5, 8, 6, 10,       //
+                                               7, 9, 10, 12, 11, 13,    //
+                                               10, 12, 12, 14, 14, 16,  //
+                                           })));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
index a48d8004f8b6cead177286328082310237af515a..3030a4f28e22396cbc51e55ff04562fa76a0264e 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
@@ -106,8 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           GetTensorShape(size), GetTensorData<int32>(size),
           GetTensorShape(output), GetTensorData<uint8_t>(output));
     }
+  } else if (output->type == kTfLiteInt8) {
+    reference_ops::ResizeNearestNeighbor(
+        op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+        GetTensorShape(size), GetTensorData<int32>(size),
+        GetTensorShape(output), GetTensorData<int8_t>(output));
   } else {
-    context->ReportError(context, "Output type is %d, requires float or uint8.",
+    context->ReportError(context,
+                         "Output type is %d, requires float, uint8 or int8.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
index 03e2effd84c4adb13db1bb3ada4f5cfe1c0b12c9..4d4cec9101c3a2e0561e5b6b1313a5da96276ca8 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
@@ -64,7 +64,7 @@ class ResizeNearestNeighborOpModel : public SingleOpModel {
 };
 
 TEST(ResizeNearestNeighborOpTest, HorizontalResize) {
-  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}}, {});
   m.SetInput<float>({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
@@ -79,8 +79,8 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
-TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
-  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeUInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 1, 2, 1}}, {});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
@@ -95,8 +95,23 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 1, 2, 1}}, {});
+  m.SetInput<int8_t>({-3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({-3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+}
+
 TEST(ResizeNearestNeighborOpTest, VerticalResize) {
-  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}}, {});
   m.SetInput<float>({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
@@ -111,8 +126,8 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
-TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
-  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
+TEST(ResizeNearestNeighborOpTest, VerticalResizeUInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 1, 1}}, {});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
@@ -127,8 +142,23 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
+TEST(ResizeNearestNeighborOpTest, VerticalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 1, 1}}, {});
+  m.SetInput<int8_t>({3, -9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, -9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
-  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {});
   m.SetInput<float>({
       3, 6,  //
       9, 12  //
@@ -155,8 +185,8 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
-  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeUInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 1}}, {});
   m.SetInput<uint8>({
       3, 6,  //
       9, 12  //
@@ -183,8 +213,35 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 1}}, {});
+  m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, -6,  //
+                                         3, 3, -6,  //
+                                         9, 9, 12,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, -6,  //
+                                               3, 3, -6,  //
+                                               9, 9, 12,  //
+                                           })));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}, {});
   m.SetInput<float>({
       3, 6,   //
       9, 12,  //
@@ -222,7 +279,7 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches) {
 }
 
 TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize) {
-  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, {});
   m.SetInput<float>({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
@@ -249,8 +306,8 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
-  ResizeNearestNeighborOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {2, 2, 2, 1}}, {});
   m.SetInput<uint8>({
       3, 6,   //
       9, 12,  //
@@ -287,8 +344,45 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
-  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {2, 2, 2, 1}}, {});
+  m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, 6,     //
+                                         3, 3, 6,     //
+                                         9, 9, -12,   //
+                                         -4, -4, 10,  //
+                                         -4, -4, 10,  //
+                                         12, 12, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, 6,     //
+                                               3, 3, 6,     //
+                                               9, 9, -12,   //
+                                               -4, -4, 10,  //
+                                               -4, -4, 10,  //
+                                               12, 12, 16,  //
+                                           })));
+}
+
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeUInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 2}}, {});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
       10, 12, 14, 16,  //
@@ -315,6 +409,33 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 2}}, {});
+  m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 3, 4, -6, 10,       //
+                                         3, 4, 3, 4, -6, 10,       //
+                                         10, 12, 10, 12, -14, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 3, 4, -6, 10,       //
+                                               3, 4, 3, 4, -6, 10,       //
+                                               10, 12, 10, 12, -14, 16,  //
+                                           })));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/reverse.cc b/tensorflow/lite/kernels/reverse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..855aee8df1c0969bba9ec7d32bee78e04aeccbca
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reverse {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxisTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis = GetInput(context, node, kAxisTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(axis), 1);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= NumElements(axis));
+
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32 &&
+      input->type != kTfLiteUInt8 && input->type != kTfLiteInt16 &&
+      input->type != kTfLiteInt64) {
+    context->ReportError(context, "Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+
+  if (axis->type != kTfLiteInt32) {
+    context->ReportError(context, "Axis Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(axis->type));
+    return kTfLiteError;
+  }
+
+  // TODO(renjieliu): support multi-axis case.
+  if (NumElements(axis) > 1) {
+    context->ReportError(context, "Current does not support more than 1 axis.");
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims);
+  TF_LITE_ENSURE_EQ(context, output->type, input->type);
+
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis_tensor = GetInput(context, node, kAxisTensor);
+  int axis = GetTensorData<int32_t>(axis_tensor)[0];
+
+  TF_LITE_ENSURE(context, axis >= 0 && axis < NumDimensions(input));
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (output->type) {
+    case kTfLiteFloat32: {
+      reference_ops::Reverse<float>(
+          axis, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      break;
+    }
+    case kTfLiteUInt8: {
+      reference_ops::Reverse<uint8_t>(
+          axis, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      break;
+    }
+    case kTfLiteInt16: {
+      reference_ops::Reverse<int16_t>(
+          axis, GetTensorShape(input), GetTensorData<int16_t>(input),
+          GetTensorShape(output), GetTensorData<int16_t>(output));
+      break;
+    }
+    case kTfLiteInt32: {
+      reference_ops::Reverse<int32_t>(
+          axis, GetTensorShape(input), GetTensorData<int32_t>(input),
+          GetTensorShape(output), GetTensorData<int32_t>(output));
+      break;
+    }
+    case kTfLiteInt64: {
+      reference_ops::Reverse<int64_t>(
+          axis, GetTensorShape(input), GetTensorData<int64_t>(input),
+          GetTensorShape(output), GetTensorData<int64_t>(output));
+      break;
+    }
+    default: {
+      context->ReportError(context, "Type '%s' is not supported by reverse.",
+                           TfLiteTypeGetName(output->type));
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+}  // namespace reverse
+
+TfLiteRegistration* Register_REVERSE_V2() {
+  static TfLiteRegistration r = {nullptr, nullptr, reverse::Prepare,
+                                 reverse::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reverse_sequence.cc b/tensorflow/lite/kernels/reverse_sequence.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b698bdb16c9ccc5d00bc0c0c93311d8fd3b271d
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse_sequence.cc
@@ -0,0 +1,158 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reverse_sequence {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kSeqLengthsTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* seq_lengths = GetInput(context, node, kSeqLengthsTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(seq_lengths), 1);
+
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32 &&
+      input->type != kTfLiteUInt8 && input->type != kTfLiteInt16 &&
+      input->type != kTfLiteInt64) {
+    context->ReportError(context,
+                         "Type '%s' is not supported by reverse_sequence.",
+                         TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+
+  if (seq_lengths->type != kTfLiteInt32 && seq_lengths->type != kTfLiteInt64) {
+    context->ReportError(
+        context, "Seq_lengths type '%s' is not supported by reverse_sequence.",
+        TfLiteTypeGetName(seq_lengths->type));
+    return kTfLiteError;
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims);
+  TF_LITE_ENSURE_EQ(context, output->type, input->type);
+
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename T, typename TS>
+TfLiteStatus ReverseSequenceImpl(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* seq_lengths_tensor =
+      GetInput(context, node, kSeqLengthsTensor);
+  const TS* seq_lengths = GetTensorData<TS>(seq_lengths_tensor);
+
+  auto* params =
+      reinterpret_cast<TfLiteReverseSequenceParams*>(node->builtin_data);
+  int seq_dim = params->seq_dim;
+  int batch_dim = params->batch_dim;
+
+  TF_LITE_ENSURE(context, seq_dim >= 0);
+  TF_LITE_ENSURE(context, batch_dim >= 0);
+  TF_LITE_ENSURE(context, seq_dim != batch_dim);
+  TF_LITE_ENSURE(context, seq_dim < NumDimensions(input));
+  TF_LITE_ENSURE(context, batch_dim < NumDimensions(input));
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(seq_lengths_tensor, 0),
+                    SizeOfDimension(input, batch_dim));
+  for (int i = 0; i < NumDimensions(seq_lengths_tensor); ++i) {
+    TF_LITE_ENSURE(context, seq_lengths[i] <= SizeOfDimension(input, seq_dim));
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  reference_ops::ReverseSequence<T, TS>(
+      seq_lengths, seq_dim, batch_dim, GetTensorShape(input),
+      GetTensorData<T>(input), GetTensorShape(output),
+      GetTensorData<T>(output));
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+TfLiteStatus ReverseSequenceHelper(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* seq_lengths_tensor =
+      GetInput(context, node, kSeqLengthsTensor);
+  switch (seq_lengths_tensor->type) {
+    case kTfLiteInt32: {
+      return ReverseSequenceImpl<T, int32_t>(context, node);
+    }
+    case kTfLiteInt64: {
+      return ReverseSequenceImpl<T, int64_t>(context, node);
+    }
+    default: {
+      context->ReportError(
+          context,
+          "Seq_lengths type '%s' is not supported by reverse_sequence.",
+          TfLiteTypeGetName(seq_lengths_tensor->type));
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (output->type) {
+    case kTfLiteFloat32: {
+      return ReverseSequenceHelper<float>(context, node);
+    }
+    case kTfLiteUInt8: {
+      return ReverseSequenceHelper<uint8_t>(context, node);
+    }
+    case kTfLiteInt16: {
+      return ReverseSequenceHelper<int16_t>(context, node);
+    }
+    case kTfLiteInt32: {
+      return ReverseSequenceHelper<int32_t>(context, node);
+    }
+    case kTfLiteInt64: {
+      return ReverseSequenceHelper<int64_t>(context, node);
+    }
+    default: {
+      context->ReportError(context,
+                           "Type '%s' is not supported by reverse_sequence.",
+                           TfLiteTypeGetName(output->type));
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}  // namespace
+
+}  // namespace
+}  // namespace reverse_sequence
+
+TfLiteRegistration* Register_REVERSE_SEQUENCE() {
+  static TfLiteRegistration r = {nullptr, nullptr, reverse_sequence::Prepare,
+                                 reverse_sequence::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reverse_sequence_test.cc b/tensorflow/lite/kernels/reverse_sequence_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e81f1380290de3f926ab8fae98e23c072935ca32
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse_sequence_test.cc
@@ -0,0 +1,211 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ReverseSequenceOpModel : public SingleOpModel {
+ public:
+  ReverseSequenceOpModel(const TensorData& input, const TensorData& seq_lengths,
+                         int seq_dim, int batch_dim) {
+    input_ = AddInput(input);
+    seq_lengths_ = AddInput(seq_lengths);
+
+    output_ = AddOutput({input.type, {}});
+
+    SetBuiltinOp(
+        BuiltinOperator_REVERSE_SEQUENCE, BuiltinOptions_ReverseSequenceOptions,
+        CreateReverseSequenceOptions(builder_, seq_dim, batch_dim).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+  int seq_lengths() { return seq_lengths_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int seq_lengths_;
+  int output_;
+};
+
+// float32 tests
+TEST(ReverseSequenceOpTest, FloatSeqDimIsGreater) {
+  ReverseSequenceOpModel<float> model({TensorType_FLOAT32, {4, 3, 2}},
+                                      {TensorType_INT32, {4}}, 1, 0);
+  model.PopulateTensor<float>(model.input(),
+                              {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  9,  10, 7,  8,  11, 12,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+TEST(ReverseSequenceOpTest, FloatBatchDimIsGreater) {
+  ReverseSequenceOpModel<float> model({TensorType_FLOAT32, {4, 3, 2}},
+                                      {TensorType_INT32, {2}}, 0, 2);
+  model.PopulateTensor<float>(model.input(),
+                              {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({13, 20, 15, 22, 17, 24, 7, 14, 9, 16, 11, 18, 1,
+                                8,  3,  10, 5,  12, 19, 2, 21, 4, 23, 6}));
+}
+
+// int32 tests
+TEST(ReverseSequenceOpTest, Int32SeqDimIsGreater) {
+  ReverseSequenceOpModel<int32_t> model({TensorType_INT32, {4, 3, 2}},
+                                        {TensorType_INT32, {4}}, 1, 0);
+  model.PopulateTensor<int32_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  9,  10, 7,  8,  11, 12,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+TEST(ReverseSequenceOpTest, Int32BatchDimIsGreater) {
+  ReverseSequenceOpModel<int32_t> model({TensorType_INT32, {4, 3, 2}},
+                                        {TensorType_INT32, {2}}, 0, 2);
+  model.PopulateTensor<int32_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({13, 20, 15, 22, 17, 24, 7, 14, 9, 16, 11, 18, 1,
+                                8,  3,  10, 5,  12, 19, 2, 21, 4, 23, 6}));
+}
+
+// int64 tests
+TEST(ReverseSequenceOpTest, Int64SeqDimIsGreater) {
+  ReverseSequenceOpModel<int64_t> model({TensorType_INT64, {4, 3, 2}},
+                                        {TensorType_INT32, {4}}, 1, 0);
+  model.PopulateTensor<int64_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  9,  10, 7,  8,  11, 12,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+TEST(ReverseSequenceOpTest, Int64BatchDimIsGreater) {
+  ReverseSequenceOpModel<int64_t> model({TensorType_INT64, {4, 3, 2}},
+                                        {TensorType_INT32, {2}}, 0, 2);
+  model.PopulateTensor<int64_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({13, 20, 15, 22, 17, 24, 7, 14, 9, 16, 11, 18, 1,
+                                8,  3,  10, 5,  12, 19, 2, 21, 4, 23, 6}));
+}
+
+// uint8 tests
+TEST(ReverseSequenceOpTest, Uint8SeqDimIsGreater) {
+  ReverseSequenceOpModel<uint8_t> model({TensorType_UINT8, {4, 3, 2}},
+                                        {TensorType_INT32, {4}}, 1, 0);
+  model.PopulateTensor<uint8_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  9,  10, 7,  8,  11, 12,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+TEST(ReverseSequenceOpTest, Uint8BatchDimIsGreater) {
+  ReverseSequenceOpModel<uint8_t> model({TensorType_UINT8, {4, 3, 2}},
+                                        {TensorType_INT32, {2}}, 0, 2);
+  model.PopulateTensor<uint8_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({13, 20, 15, 22, 17, 24, 7, 14, 9, 16, 11, 18, 1,
+                                8,  3,  10, 5,  12, 19, 2, 21, 4, 23, 6}));
+}
+
+// int16 tests
+TEST(ReverseSequenceOpTest, Int16SeqDimIsGreater) {
+  ReverseSequenceOpModel<int16_t> model({TensorType_INT16, {4, 3, 2}},
+                                        {TensorType_INT32, {4}}, 1, 0);
+  model.PopulateTensor<int16_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  9,  10, 7,  8,  11, 12,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+TEST(ReverseSequenceOpTest, Int16BatchDimIsGreater) {
+  ReverseSequenceOpModel<int16_t> model({TensorType_INT16, {4, 3, 2}},
+                                        {TensorType_INT32, {2}}, 0, 2);
+  model.PopulateTensor<int16_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.seq_lengths(), {3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({13, 20, 15, 22, 17, 24, 7, 14, 9, 16, 11, 18, 1,
+                                8,  3,  10, 5,  12, 19, 2, 21, 4, 23, 6}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc0c24b64c197d5c9a60ff74bdd53c5ae0352b9
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -0,0 +1,199 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ReverseOpModel : public SingleOpModel {
+ public:
+  ReverseOpModel(const TensorData& input, const TensorData& axis) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+
+    output_ = AddOutput({input.type, {}});
+
+    SetBuiltinOp(BuiltinOperator_REVERSE_V2, BuiltinOptions_ReverseV2Options,
+                 CreateReverseV2Options(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+// float32 tests.
+TEST(ReverseOpTest, FloatOneDimension) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, FloatMultiDimensions) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4, 3, 2}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(),
+                              {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int32 tests
+TEST(ReverseOpTest, Int32OneDimension) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int32MultiDimensions) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int64 tests
+TEST(ReverseOpTest, Int64OneDimension) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int64MultiDimensions) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// uint8 tests
+TEST(ReverseOpTest, Uint8OneDimension) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Uint8MultiDimensions) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int16 tests
+TEST(ReverseOpTest, Int16OneDimension) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int16MultiDimensions) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc
index 4687ab44171fab73ff1b4ef93592b25680f3a59f..d1c63d887db00143fb6b154306313411643cf2b8 100644
--- a/tensorflow/lite/kernels/select.cc
+++ b/tensorflow/lite/kernels/select.cc
@@ -89,6 +89,9 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:                                                         \
       TF_LITE_SELECT(uint8_t, op);                                             \
       break;                                                                   \
+    case kTfLiteInt8:                                                          \
+      TF_LITE_SELECT(int8_t, op);                                              \
+      break;                                                                   \
     case kTfLiteInt16:                                                         \
       TF_LITE_SELECT(int16_t, op);                                             \
       break;                                                                   \
diff --git a/tensorflow/lite/kernels/select_test.cc b/tensorflow/lite/kernels/select_test.cc
index 5111300e479a92ad9cbf00628750dc61effc50d3..d7cadeb51eb3ee0645eaccc1bbcea59bd279e0d7 100644
--- a/tensorflow/lite/kernels/select_test.cc
+++ b/tensorflow/lite/kernels/select_test.cc
@@ -96,6 +96,19 @@ TEST(SelectOpTest, SelectUInt8) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
+TEST(SelectOpTest, SelectInt8) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_INT8);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true, false, false});
+  model.PopulateTensor<int8_t>(model.input2(), {1, -2, 3, 4});
+  model.PopulateTensor<int8_t>(model.input3(), {5, 6, 7, -8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int8_t>(), ElementsAreArray({5, -2, 7, -8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
 TEST(SelectOpTest, SelectInt16) {
   SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
                       TensorType_INT16);
diff --git a/tensorflow/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc
index 5fca7a3ea71aa41c6e466b7814921e2e1ac6293d..8472572d7e2a4ad1ab60b144f9d06fbfc7bc7387 100644
--- a/tensorflow/lite/kernels/slice.cc
+++ b/tensorflow/lite/kernels/slice.cc
@@ -117,7 +117,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                  begin->type == kTfLiteInt32 || begin->type == kTfLiteInt64);
   TF_LITE_ENSURE(context,
                  size->type == kTfLiteInt32 || size->type == kTfLiteInt64);
-  TF_LITE_ENSURE(context, NumDimensions(begin) == NumDimensions(size) == 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(begin), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
   TF_LITE_ENSURE_MSG(context, NumDimensions(input) <= kMaxDim,
                      "Slice op only supports 1D-4D input arrays.");
 
@@ -204,6 +205,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt64:
       TF_LITE_SLICE(int64_t, kernel_type);
       break;
+    case kTfLiteInt8:
+      TF_LITE_SLICE(int8_t, kernel_type);
+      break;
     case kTfLiteUInt8:
       TF_LITE_SLICE(uint8_t, kernel_type);
       break;
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index 563329ddb164d3aa5f13c8ee0d6482d79b84ed32..102218ba23c105014ee6d501d2941f8b4755a44e 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -163,6 +163,28 @@ TEST(SliceOpTest, SizeMinus1) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
 }
 
+TEST(SliceOpTest, SliceUint8) {
+  SliceOpModel<uint8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_UINT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
+TEST(SliceOpTest, SliceInt8) {
+  SliceOpModel<int8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                  TensorType_INT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc
index 1c61b2ef30379e808085f3b0d16a5b1157bea314..2fb7198cd67e8b9d13873d25a2eaa04fd2ff2ae0 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd.cc
@@ -141,6 +141,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   op_context.output->params.zero_point);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0);
diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
index c5d6e9a53062d97801b518f15305e2052f861e7c..52a77984d935d04a79807707729754abd21d3be6 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -31,8 +31,9 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  template <typename T>
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -46,9 +47,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
  protected:
@@ -233,29 +235,62 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
 }
 #endif
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestUint8) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 0, 2, 0},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
                   -1.0, 1.0)));
 }
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestInt8) {
+  SpaceToBatchNDOpConstModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                               {3, 2}, {1, 0, 2, 0},
+                               {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestUint8) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestInt8) {
+  SpaceToBatchNDOpDynamicModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                                 {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 0, 2, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
@@ -266,10 +301,10 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 1, 2, 4},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
@@ -282,12 +317,12 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
 TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 1, 2, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
diff --git a/tensorflow/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc
index 79e28bf47d98b64572d9e7404f8d69788cd30e08..cf6b0bd4d3d4b61b87a14d1090a7e89d9b77a0f2 100644
--- a/tensorflow/lite/kernels/space_to_depth.cc
+++ b/tensorflow/lite/kernels/space_to_depth.cc
@@ -50,7 +50,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   auto data_type = output->type;
   TF_LITE_ENSURE(context,
                  data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
-                     data_type == kTfLiteInt32 || data_type == kTfLiteInt64);
+                     data_type == kTfLiteInt8 || data_type == kTfLiteInt32 ||
+                     data_type == kTfLiteInt64);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
   const int block_size = params->block_size;
@@ -100,6 +101,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_SPACE_TO_DEPTH(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, int8_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_DEPTH(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
index 3fa8d86348ef899b9bd42c19f5b1510b4c4e33d3..58665fc9d83007d7bed638418cba058e4ff189c5 100644
--- a/tensorflow/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -74,6 +74,14 @@ TEST(SpaceToDepthOpModel, Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
+TEST(SpaceToDepthOpModel, int8) {
+  SpaceToDepthOpModel m({TensorType_INT8, {1, 2, 2, 1}}, 2);
+  m.SetInput<int8_t>({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({1, 2, 3, 4}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(SpaceToDepthOpModel, Int32) {
   SpaceToDepthOpModel m({TensorType_INT32, {1, 2, 2, 3}}, 2);
   m.SetInput<int32_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/lite/kernels/sparse_output_fully_connected.cc
deleted file mode 100644
index 248969535c66a31be9e15ea366cf461e93206161..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected.cc
+++ /dev/null
@@ -1,252 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// SparseOutputFullyConnected is a fully connected layer that uses a single
-// row in the weights and bias via a lookup.
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace sparse_output_fully_connected {
-
-// Input tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-// Auxiliary input tensor of size { 1 }
-constexpr int kInputLookupTensor = 1;
-
-// Weights tensor of size { n_embeddings , n_input }
-constexpr int kWeightsTensor = 2;
-// Bias tensor of size { n_embeddings }
-constexpr int kBiasTensor = 3;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-// Temporary tensors.
-enum TemporaryTensor {
-  kInputQuantized = 0,
-  kScalingFactors = 1,
-  kNumTemporaryTensors = 2
-};
-
-// Struct to hold op data.
-struct OpData {
-  int scratch_tensor_index;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-  context->AddTensors(context, /*tensors_to_add=*/kNumTemporaryTensors,
-                      &data->scratch_tensor_index);
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
-  // Only support single lookup.
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(lookup, 0), 1);
-
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 2);
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(weights, 1), n_input);
-
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0));
-
-  const bool is_hybrid_op =
-      ((weights->type == kTfLiteUInt8 || weights->type == kTfLiteInt8) &&
-       input->type == kTfLiteFloat32);
-
-  // Resize output.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(1);
-  output_size_array->data[0] = 1;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size_array));
-
-  if (is_hybrid_op) {
-    TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-
-    // Allocate temporary tensors to store quantized values of input.
-    node->temporaries->data[kInputQuantized] = op_data->scratch_tensor_index;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, /*index=*/kInputQuantized);
-    input_quantized->type = weights->type;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-
-    // Tell interpreter to allocate temporary tensors to store scaling factors.
-    node->temporaries->data[kScalingFactors] =
-        op_data->scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, /*index=*/kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    int scaling_dims[1] = {n_batch};
-    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
-      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-      scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                       const TfLiteTensor* weights, const TfLiteTensor* bias,
-                       TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-
-  // Initialize pointer to right row according to lookup value.
-  int32 lookup_index = lookup->data.i32[0];
-  const float* weights_ptr = weights->data.f + lookup_index * n_input;
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      weights_ptr, /*m_rows=*/1, n_input, input_ptr_batch, n_batch,
-      output->data.f, /*result_stride=*/1);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                        const TfLiteTensor* weights, const TfLiteTensor* bias,
-                        TfLiteTensor* scaling_factors,
-                        TfLiteTensor* input_quantized, TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-
-  // Initialize the pointer to storage for scaling factors.
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  int32 lookup_index = lookup->data.i32[0];
-
-  // Initialize the pointer to storage for quantized values and a pointer to
-  // the row according to lookup value.
-  int8_t *quantized_input_ptr_batch, *weights_ptr;
-  if (weights->type == kTfLiteUInt8) {
-    quantized_input_ptr_batch =
-        reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-    weights_ptr =
-        reinterpret_cast<int8_t*>(weights->data.uint8) + lookup_index * n_input;
-  } else {
-    quantized_input_ptr_batch = input_quantized->data.int8;
-    weights_ptr = weights->data.int8 + lookup_index * n_input;
-  }
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Quantize input from float to int8.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors_ptr[b]);
-      scaling_factors_ptr[b] *= weights->params.scale;
-    }
-
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        weights_ptr, /*m_rows=*/1, n_input, quantized_input_ptr_batch,
-        scaling_factors_ptr, n_batch, output->data.f, /*result_stride=*/1);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, lookup, weights, bias, output);
-    }
-    case kTfLiteUInt8:
-    case kTfLiteInt8: {
-      TfLiteTensor* input_quantized =
-          GetTemporary(context, node, /*index=*/kInputQuantized);
-      TfLiteTensor* scaling_factors =
-          GetTemporary(context, node, /*index=*/kScalingFactors);
-      return EvalHybrid(input, lookup, weights, bias, scaling_factors,
-                        input_quantized, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace sparse_output_fully_connected
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED() {
-  static TfLiteRegistration r = {sparse_output_fully_connected::Init,
-                                 sparse_output_fully_connected::Free,
-                                 sparse_output_fully_connected::Prepare,
-                                 sparse_output_fully_connected::Eval};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
deleted file mode 100644
index 7d5fec192ce6b103c41f47ed60eb1283f72da45f..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite sparse output fully connected op.
-#include <iomanip>
-#include <random>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-
-namespace tflite {
-
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel {
- public:
-  BaseSparseOutputFullyConnectedOpModel(const TensorData& input,
-                                        const TensorData& weights,
-                                        const TensorData& output = {
-                                            TensorType_FLOAT32}) {
-    input_ = AddInput(input);
-    lookup_ = AddInput({TensorType_INT32, {1}});
-    weights_ = AddInput(weights);
-    int bias_size = GetShape(weights_)[0];
-    bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
-    output_ = AddOutput(output);
-
-    // Create empty (required) options map.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {});
-    fbb.Finish();
-
-    SetCustomOp("SPARSE_OUTPUT_FULLY_CONNECTED", fbb.GetBuffer(),
-                Register_SPARSE_OUTPUT_FULLY_CONNECTED);
-    BuildInterpreter({GetShape(input_), GetShape(lookup_), GetShape(weights_),
-                      GetShape(bias_)});
-  }
-
-  void SetInput(const std::vector<float>& data) {
-    PopulateTensor(input_, data);
-  }
-
-  void SetLookup(const std::vector<int32_t>& f) { PopulateTensor(lookup_, f); }
-
-  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
- protected:
-  int input_;
-  int lookup_;
-  int weights_;
-  int bias_;
-  int output_;
-};
-
-class FloatSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
-};
-
-class HybridSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
-  }
-
-  void SetSignedWeights(const std::vector<float>& f) {
-    SignedSymmetricQuantizeAndPopulate(weights_, f);
-  }
-};
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestFloat) {
-  FloatSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                           {TensorType_FLOAT32, {3, 5}},
-                                           {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({28}));
-}
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybridUint8) {
-  HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                            {TensorType_UINT8, {3, 5}},
-                                            {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  // We get 28.0552 instead of 28.
-  //
-  // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3.
-  // Looked up weights ->  25, 51, 76, 102, 127 with scale factor of 127/5.
-  //
-  // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0
-  // gives us the expected result.
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553)));
-}
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybridInt8) {
-  HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                            {TensorType_INT8, {3, 5}},
-                                            {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetSignedWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  // We get 28.0552 instead of 28.
-  //
-  // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3.
-  // Looked up weights ->  25, 51, 76, 102, 127 with scale factor of 127/5.
-  //
-  // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0
-  // gives us the expected result.
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553)));
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc
index 7902ed2a46d297cca6f076bf1bb48580f3c4bf40..c0f701f55dd096279e1e9f1e54817490cb0c230b 100644
--- a/tensorflow/lite/kernels/split.cc
+++ b/tensorflow/lite/kernels/split.cc
@@ -76,9 +76,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
 
   auto input_type = op_context.input->type;
-  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
-                              input_type == kTfLiteUInt8 ||
-                              input_type == kTfLiteInt16);
+  TF_LITE_ENSURE(context,
+                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
+                     input_type == kTfLiteInt8 || input_type == kTfLiteInt16 ||
+                     input_type == kTfLiteInt32);
   for (int i = 0; i < NumOutputs(node); ++i) {
     GetOutput(context, node, i)->type = input_type;
   }
@@ -137,15 +138,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_SPLIT(uint8_t);
       break;
     }
+    case kTfLiteInt8: {
+      TF_LITE_SPLIT(int8_t);
+      break;
+    }
     case kTfLiteInt16: {
       TF_LITE_SPLIT(int16_t);
       break;
     }
+    case kTfLiteInt32: {
+      TF_LITE_SPLIT(int32_t);
+      break;
+    }
     default:
-      context->ReportError(
-          context,
-          "Only float32, uint8 and int16 are currently supported, got %d.",
-          op_context.input->type);
+      context->ReportError(context,
+                           "Only float32, uint8, int8, int16 and int32 are "
+                           "currently supported, got %d.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPLIT
diff --git a/tensorflow/lite/kernels/split_test.cc b/tensorflow/lite/kernels/split_test.cc
index f3d9ea3bf4158dd51b5102b942125b7561024c19..fa313d4b18f803dc5060425d8162af25129dd5d9 100644
--- a/tensorflow/lite/kernels/split_test.cc
+++ b/tensorflow/lite/kernels/split_test.cc
@@ -47,13 +47,15 @@ class SplitOpModel : public SingleOpModel {
     }
   }
 
-  void SetInput(std::initializer_list<float> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
   void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
 
-  std::vector<float> GetOutput(int i) {
-    return ExtractVector<float>(outputs_[i]);
+  template <typename T>
+  std::vector<T> GetOutput(int i) {
+    return ExtractVector<T>(outputs_[i]);
   }
   std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
 
@@ -63,33 +65,34 @@ class SplitOpModel : public SingleOpModel {
   std::vector<int> outputs_;
 };
 
-using TensorValues = std::initializer_list<float>;
-
+template <typename T>
 void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
            std::initializer_list<int> output_shape,
-           const TensorValues& input_data,
-           const std::vector<TensorValues>& output_data) {
+           const std::initializer_list<T>& input_data,
+           const std::vector<std::initializer_list<T>>& output_data,
+           const TensorType& type = TensorType_FLOAT32) {
   auto debug = [&](int i) {
     std::stringstream ss;
     ss << "for output tensor " << i << " axis=" << axis
        << " and num_splits=" << num_splits;
     return ss.str();
   };
-  SplitOpModel m({TensorType_FLOAT32, input_shape}, num_splits);
+  SplitOpModel m({type, input_shape}, num_splits);
   m.SetInput(input_data);
   m.SetAxis(axis);
   m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i])) << debug(i);
+    EXPECT_THAT(m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
+        << debug(i);
     EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
   }
 
-  SplitOpModel const_m({TensorType_FLOAT32, input_shape}, num_splits, axis);
+  SplitOpModel const_m({type, input_shape}, num_splits, axis);
   const_m.SetInput(input_data);
   const_m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]))
+    EXPECT_THAT(const_m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
         << debug(i);
     EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
@@ -97,44 +100,106 @@ void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
 }
 
 TEST(SplitOpTest, FourDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 9, 10, 11, 12},
-            {5, 6, 7, 8, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 5, 6, 9, 10, 13, 14},
-            {3, 4, 7, 8, 11, 12, 15, 16},
-        });
-  Check(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 3, 5, 7, 9, 11, 13, 15},
-            {2, 4, 6, 8, 10, 12, 14, 16},
-        });
+  Check<float>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 9, 10, 11, 12},
+                   {5, 6, 7, 8, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 5, 6, 9, 10, 13, 14},
+                   {3, 4, 7, 8, 11, 12, 15, 16},
+               });
+  Check<float>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 3, 5, 7, 9, 11, 13, 15},
+                   {2, 4, 6, 8, 10, 12, 14, 16},
+               });
+}
+
+TEST(SplitOpTest, FourDimensionalInt8) {
+  Check<int8_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 5, 6, 7, 8},
+                    {9, 10, 11, 12, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 9, 10, 11, 12},
+                    {5, 6, 7, 8, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 5, 6, 9, 10, 13, 14},
+                    {3, 4, 7, 8, 11, 12, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 3, 5, 7, 9, 11, 13, 15},
+                    {2, 4, 6, 8, 10, 12, 14, 16},
+                },
+                TensorType_INT8);
+}
+
+TEST(SplitOpTest, FourDimensionalInt32) {
+  Check<int32_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 5, 6, 7, 8},
+                     {9, 10, 11, 12, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 9, 10, 11, 12},
+                     {5, 6, 7, 8, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 5, 6, 9, 10, 13, 14},
+                     {3, 4, 7, 8, 11, 12, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 3, 5, 7, 9, 11, 13, 15},
+                     {2, 4, 6, 8, 10, 12, 14, 16},
+                 },
+                 TensorType_INT32);
 }
 
 TEST(SplitOpTest, OneDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+  Check<float>(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
+               {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 }
 
 TEST(SplitOpTest, NegativeAxis) {
-  Check(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
+  Check<float>(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/split_v.cc b/tensorflow/lite/kernels/split_v.cc
index 060e3c5f79c808cd3c8d4b21efd7f2595a68b8e8..c95396c621b988930208ea62cca03ea9aa67a1f9 100644
--- a/tensorflow/lite/kernels/split_v.cc
+++ b/tensorflow/lite/kernels/split_v.cc
@@ -183,10 +183,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(
-          context,
-          "Only float32, uint8 and int16 are currently supported, got %d.",
-          op_context.input->type);
+      context->ReportError(context, "Type %s currently not supported.",
+                           TfLiteTypeGetName(op_context.input->type));
       return kTfLiteError;
   }
 #undef TF_LITE_SPLIT_V
diff --git a/tensorflow/lite/kernels/split_v_test.cc b/tensorflow/lite/kernels/split_v_test.cc
index 2d1d36d6851c12d1b05374cda5ef32255e162875..27fed63f0eea452104a26dd3d0527ae98ce81dc7 100644
--- a/tensorflow/lite/kernels/split_v_test.cc
+++ b/tensorflow/lite/kernels/split_v_test.cc
@@ -50,16 +50,18 @@ class SplitVOpModel : public SingleOpModel {
     }
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
   void SetSizeSplits(std::initializer_list<int> data) {
     PopulateTensor(size_splits_, data);
   }
   void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
 
-  std::vector<float> GetOutput(int i) {
-    return ExtractVector<float>(outputs_[i]);
+  template <typename T>
+  std::vector<T> GetOutput(int i) {
+    return ExtractVector<T>(outputs_[i]);
   }
   std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
 
@@ -70,99 +72,132 @@ class SplitVOpModel : public SingleOpModel {
   std::vector<int> outputs_;
 };
 
-// TODO(ruic): Add tests to test quantized values. b/119638735
-using TensorValues = std::initializer_list<float>;
-
+template <typename T, TensorType T1>
 void Check(int axis, std::initializer_list<int> input_shape,
            std::initializer_list<int> size_splits_shape,
            std::vector<std::initializer_list<int>> output_shapes,
-           const TensorValues& input_data,
+           const std::initializer_list<T>& input_data,
            const std::initializer_list<int>& size_splits_data,
-           const std::vector<TensorValues>& output_data) {
+           const std::vector<std::initializer_list<T>>& output_data) {
   int num_splits = size_splits_data.size();
-  SplitVOpModel m({TensorType_FLOAT32, input_shape},
-                  {TensorType_INT32, size_splits_shape}, num_splits,
-                  kAxisIsATensor);
-  m.SetInput(input_data);
+  SplitVOpModel m({T1, input_shape}, {TensorType_INT32, size_splits_shape},
+                  num_splits, kAxisIsATensor);
+  m.SetInput<T>(input_data);
   m.SetSizeSplits(size_splits_data);
   m.SetAxis(axis);
   m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(m.GetOutput<T>(i), ElementsAreArray(output_data[i]));
     EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
   }
 
-  SplitVOpModel const_m({TensorType_FLOAT32, input_shape},
+  SplitVOpModel const_m({T1, input_shape},
                         {TensorType_INT32, size_splits_shape}, num_splits,
                         axis);
-  const_m.SetInput(input_data);
+  const_m.SetInput<T>(input_data);
   const_m.SetSizeSplits(size_splits_data);
   const_m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(const_m.GetOutput<T>(i), ElementsAreArray(output_data[i]));
     EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
   }
 }
 
 TEST(SplitVOpTest, TwoDimensional) {
   // Input shape: {4, 3}
-  // size_splits: {1, 1, 3}
+  // size_splits: {1, 1, 2}
   // axis: 0
   // We should have 3 outpus with shapes respectively:
-  //  output 0 : {1, 3}
   //  output 1 : {1, 3}
-  //  output 1 : {2, 3}
-  Check(/*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
-        {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
+  //  output 2 : {1, 3}
+  //  output 3 : {2, 3}
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
+      {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
 }
 
 TEST(SplitVOpTest, FourDimensional) {
-  Check(/*axis=*/0, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/1, {2, 2, 2, 2}, {2}, {{2, 1, 2, 2}, {2, 1, 2, 2}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, -1},
-        {
-            {1, 2, 3, 4, 9, 10, 11, 12},
-            {5, 6, 7, 8, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/2, {2, 2, 2, 2}, {2}, {{2, 2, 1, 2}, {2, 2, 1, 2}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
-        {
-            {1, 2, 5, 6, 9, 10, 13, 14},
-            {3, 4, 7, 8, 11, 12, 15, 16},
-        });
-  Check(/*axis=*/3, {2, 2, 2, 2}, {2}, {{2, 2, 2, 1}, {2, 2, 2, 1}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
-        {
-            {1, 3, 5, 7, 9, 11, 13, 15},
-            {2, 4, 6, 8, 10, 12, 14, 16},
-        });
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/0, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+      {
+          {1, 2, 3, 4, 5, 6, 7, 8},
+          {9, 10, 11, 12, 13, 14, 15, 16},
+      });
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/1, {2, 2, 2, 2}, {2}, {{2, 1, 2, 2}, {2, 1, 2, 2}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, -1},
+      {
+          {1, 2, 3, 4, 9, 10, 11, 12},
+          {5, 6, 7, 8, 13, 14, 15, 16},
+      });
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/2, {2, 2, 2, 2}, {2}, {{2, 2, 1, 2}, {2, 2, 1, 2}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+      {
+          {1, 2, 5, 6, 9, 10, 13, 14},
+          {3, 4, 7, 8, 11, 12, 15, 16},
+      });
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/3, {2, 2, 2, 2}, {2}, {{2, 2, 2, 1}, {2, 2, 2, 1}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+      {
+          {1, 3, 5, 7, 9, 11, 13, 15},
+          {2, 4, 6, 8, 10, 12, 14, 16},
+      });
 }
 
 TEST(SplitVOpTest, OneDimensional) {
-  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1}},
-        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 1, 1},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1}},
+      {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 1, 1},
+      {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 }
 
 TEST(SplitVOpTest, OneDimensional2) {
-  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {2}, {0}},
-        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 2, -1},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7, 8}, {}});
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {2}, {0}},
+      {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 2, -1},
+      {{1}, {2}, {3}, {4}, {5}, {6}, {7, 8}, {}});
 }
 
 TEST(SplitVOpTest, NegativeAxis) {
-  Check(/*axis=*/-4, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
+  Check<float, TensorType_FLOAT32>(
+      /*axis=*/-4, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+      {
+          {1, 2, 3, 4, 5, 6, 7, 8},
+          {9, 10, 11, 12, 13, 14, 15, 16},
+      });
+}
+
+TEST(SplitVOpTest, TwoDimensionalUint8) {
+  // Input shape: {4, 3}
+  // size_splits: {1, 1, 2}
+  // axis: 0
+  // We should have 3 outpus with shapes respectively:
+  //  output 1 : {1, 3}
+  //  output 2 : {1, 3}
+  //  output 3 : {2, 3}
+  Check<uint8_t, TensorType_UINT8>(
+      /*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
+      {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
+}
+
+TEST(SplitVOpTest, TwoDimensionalInt16) {
+  // Input shape: {4, 3}
+  // size_splits: {1, 1, 2}
+  // axis: 0
+  // We should have 3 outpus with shapes respectively:
+  //  output 1 : {1, 3}
+  //  output 2 : {1, 3}
+  //  output 3 : {2, 3}
+  Check<int16_t, TensorType_INT16>(
+      /*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
+      {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
index 59b53a6287dbbc863a61875be82090c1b9c6d442..3661cf9f98c5d0133090ae926f8d76e54f428eba 100644
--- a/tensorflow/lite/kernels/squared_difference.cc
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -105,10 +105,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   } else if (output->type == kTfLiteInt32) {
     EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
   } else {
-    context->ReportError(context,
-                         "SquaredDifference only supports FLOAT32, INT32 and "
-                         "quantized UINT8 now, got %d.",
-                         output->type);
+    context->ReportError(
+        context,
+        "SquaredDifference only supports FLOAT32 and INT32 now, got %d.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc
index c797a98e9f1bda8595e6822638949bab48cb2eab..8c25ffa3a1a669684d9fb1b552893de3a450264f 100644
--- a/tensorflow/lite/kernels/strided_slice.cc
+++ b/tensorflow/lite/kernels/strided_slice.cc
@@ -234,6 +234,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type %d is currently not supported "
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 34875bf0497a000da02f3d0212b042399046a492..cac9e1672f871268d6d37b3488d00a0c1399aaa7 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -577,6 +577,18 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1int8) {
+  StridedSliceOpModel<int8_t, TensorType_INT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({1, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 06a3b3499a005f19bfd1461dfe861835f8331b96..8bd6052307cc0e032a566e437923cac2f16be69e 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <limits>
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -39,6 +41,23 @@ constexpr int kOutputTensor = 0;
 
 struct OpData {
   bool requires_broadcast;
+
+  // These fields are used in both the general 8-bit -> 8bit quantized path,
+  // and the special 16-bit -> 16bit quantized path
+  int input1_shift;
+  int input2_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+
+  // These fields are used only in the general 8-bit -> 8bit quantized path
+  int32 input1_multiplier;
+  int32 input2_multiplier;
+  int32 output_multiplier;
+  int output_shift;
+  int left_shift;
+  int32 input1_offset;
+  int32 input2_offset;
+  int32 output_offset;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -51,8 +70,126 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
+TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input_1,
+                              const TfLiteTensor* input_2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* op_params,
+                              int op_sign) {
+  TF_LITE_ENSURE(context,
+                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+  const auto& input1_quantization_params = input_1->params;
+  const auto& input2_quantization_params = input_2->params;
+  const auto& output_quantization_params = output->params;
+  int32_t integer_type_min = 0;
+  int32_t integer_type_max = 0;
+  if (output->type == kTfLiteUInt8) {
+    integer_type_min = std::numeric_limits<uint8_t>::min();
+    integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else {
+    // output->type == kTfLiteInt8
+    integer_type_min = std::numeric_limits<int8_t>::min();
+    integer_type_max = std::numeric_limits<int8_t>::max();
+  }
+
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point <= integer_type_max);
+
+  op_params->input1_offset = -input1_quantization_params.zero_point;
+  op_params->input2_offset = -input2_quantization_params.zero_point;
+  op_params->output_offset = output_quantization_params.zero_point;
+  op_params->left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1_quantization_params.scale,
+                   input2_quantization_params.scale);
+  const double real_input1_multiplier =
+      input1_quantization_params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2_quantization_params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale /
+      ((1 << op_params->left_shift) * output_quantization_params.scale);
+
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
+                                              &op_params->input1_multiplier,
+                                              &op_params->input1_shift);
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
+                                              &op_params->input2_multiplier,
+                                              &op_params->input2_shift);
+  op_params->input2_multiplier *= op_sign;
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
+                                              &op_params->output_multiplier,
+                                              &op_params->output_shift);
+  if (output->type == kTfLiteUInt8) {
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &op_params->output_activation_min,
+                                  &op_params->output_activation_max);
+  } else {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &op_params->output_activation_min,
+                                 &op_params->output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
+                               const TfLiteTensor* input1,
+                               const TfLiteTensor* input2, TfLiteTensor* output,
+                               TfLiteSubParams* params, OpData* data) {
+  // 16bit -> 16bit special quantized path, supporting only a rather
+  // narrow case of quantization parameters: zero_points must all be 0
+  // ("symmetric quantization") and scales must be power-of-two (which
+  // we abbreviate as "POT" below). The intended use case for this path
+  // is in LSTM cells, where, due to the constraints of implementing
+  // some of the math in these LSTM cells in fixed-point arithmetic,
+  // we need to have such symmetric, power-of-two quantization
+  // (Fixed-point formats are inherently symmetric, power-of-two).
+  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
+  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
+  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+
+  int input1_scale_log2_rounded;
+  bool input1_scale_is_pot =
+      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+  TF_LITE_ENSURE(context, input1_scale_is_pot);
+
+  int input2_scale_log2_rounded;
+  bool input2_scale_is_pot =
+      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+  TF_LITE_ENSURE(context, input2_scale_is_pot);
+
+  int output_scale_log2_rounded;
+  bool output_scale_is_pot =
+      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+  TF_LITE_ENSURE(context, output_scale_is_pot);
+
+  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
+  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;
+
+  // Shifting of one input is supported. The graph quantization should ensure
+  // that the other input matches the output.
+  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
+  TF_LITE_ENSURE(context, data->input1_shift <= 0);
+  TF_LITE_ENSURE(context, data->input2_shift <= 0);
+
+  CalculateActivationRangeQuantized(context, params->activation, output,
+                                    &data->output_activation_min,
+                                    &data->output_activation_max);
+  return kTfLiteOk;
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
+  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -74,6 +211,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
+                                                params, data, -1));
+  } else if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
+                                                 output, params, data));
+  }
+
   return context->ResizeTensor(context, output, output_size);
 }
 
@@ -129,60 +274,67 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                    TfLiteSubParams* params, const OpData* data,
                    const TfLiteTensor* input1, const TfLiteTensor* input2,
                    TfLiteTensor* output) {
-  auto input1_offset = -input1->params.zero_point;
-  auto input2_offset = -input2->params.zero_point;
-  auto output_offset = output->params.zero_point;
-  const int left_shift = 20;
-  const double twice_max_input_scale =
-      2 * std::max(input1->params.scale, input2->params.scale);
-  const double real_input1_multiplier =
-      input1->params.scale / twice_max_input_scale;
-  const double real_input2_multiplier =
-      input2->params.scale / twice_max_input_scale;
-  const double real_output_multiplier =
-      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = data->left_shift;
+  op_params.input1_offset = data->input1_offset;
+  op_params.input1_multiplier = data->input1_multiplier;
+  op_params.input1_shift = data->input1_shift;
+  op_params.input2_offset = data->input2_offset;
+  op_params.input2_multiplier = data->input2_multiplier;
+  op_params.input2_shift = data->input2_shift;
+  op_params.output_offset = data->output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+  SetActivationParams(data->output_activation_min, data->output_activation_max,
+                      &op_params);
 
-  int32 input1_multiplier;
-  int input1_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
-                                      &input1_multiplier, &input1_shift);
-  int32 input2_multiplier;
-  int input2_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
-                                      &input2_multiplier, &input2_shift);
-  int32 output_multiplier;
-  int output_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
-                                      &output_multiplier, &output_shift);
-
-  int32 output_activation_min, output_activation_max;
-  CalculateActivationRangeUint8(params->activation, output,
-                                &output_activation_min, &output_activation_max);
-
-#define TF_LITE_SUB(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.left_shift = left_shift;                                   \
-  op_params.input1_offset = input1_offset;                             \
-  op_params.input1_multiplier = input1_multiplier;                     \
-  op_params.input1_shift = input1_shift;                               \
-  op_params.input2_offset = input2_offset;                             \
-  op_params.input2_multiplier = input2_multiplier;                     \
-  op_params.input2_shift = input2_shift;                               \
-  op_params.output_offset = output_offset;                             \
-  op_params.output_multiplier = output_multiplier;                     \
-  op_params.output_shift = output_shift;                               \
-  SetActivationParams(output_activation_min, output_activation_max,    \
-                      &op_params);                                     \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-  // The quantized version of Sub doesn't support activations, so we
-  // always use BroadcastSub.
-  if (kernel_type == kReference) {
-    TF_LITE_SUB(reference_ops, BroadcastSub4DSlow);
+  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+      GetTensorShape(input1), GetTensorShape(input2), &op_params);
+
+#define TF_LITE_SUB(type, opname, data_type)                             \
+  type::opname(op_params, GetTensorShape(input1),                        \
+               GetTensorData<data_type>(input1), GetTensorShape(input2), \
+               GetTensorData<data_type>(input2), GetTensorShape(output), \
+               GetTensorData<data_type>(output))
+    // NOTE: We are using the add kernels. This is possible as the second values
+    // multiplier is negated before being passed down.
+  if (output->type == kTfLiteInt8) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
+    } else {
+      TF_LITE_SUB(reference_integer_ops, Add, int8_t);
+    }
+  } else if (output->type == kTfLiteUInt8) {
+    if (kernel_type == kReference) {
+      if (need_broadcast) {
+        TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, uint8_t);
+      } else {
+        TF_LITE_SUB(reference_ops, Add, uint8_t);
+      }
+    } else {
+      if (op_params.broadcast_category ==
+          BroadcastableOpCategory::kGenericBroadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastAdd4DSlow, uint8_t);
+      } else if (need_broadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastAddFivefold, uint8_t);
+      } else {
+        TF_LITE_SUB(optimized_ops, Add, uint8_t);
+      }
+    }
   } else {
-    TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow);
+    if (kernel_type == kReference) {
+      if (need_broadcast) {
+        TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, int16_t);
+      } else {
+        TF_LITE_SUB(reference_ops, Sub16, int16_t);
+      }
+    } else {
+      if (need_broadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, int16_t);
+      } else {
+        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
+      }
+    }
   }
 #undef TF_LITE_SUB
 }
@@ -198,7 +350,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalSub<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index 41503300ab599fbfcfee425c41033dd3bc10d2ea..3c19678b20f21894461f5ef79b1df6c45e1cac5a 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -63,17 +63,27 @@ class QuantizedSubOpModel : public BaseSubOpModel {
  public:
   using BaseSubOpModel::BaseSubOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
+  }
+
+  std::vector<float> GetDequantizedOutputInt16() {
+    return Dequantize<int16_t>(ExtractVector<int16_t>(output_),
                                GetScale(output_), GetZeroPoint(output_));
   }
 };
 
-// for quantized Sub, the error shouldn't exceed 2*step
+// for quantized Sub, the error shouldn't exceed step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
+}
+
+float GetToleranceInt16(float min, float max) {
+  float kQuantizedStep = (max - min) / std::numeric_limits<int16_t>::max();
+  return kQuantizedStep;
 }
 
 TEST(FloatSubOpModel, NoActivation) {
@@ -183,7 +193,8 @@ TEST(IntegerSubOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
@@ -193,20 +204,30 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
                                              {-0.8, -0.2, -0.1, 0.9},
                                              {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.5}};
@@ -215,57 +236,185 @@ TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
                                              {-1.0, -0.2, 1.0, 0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
 
-TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedSubOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.1, -0.1, 0.4, 0.3, 0.0, 1.9}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    QuantizedSubOpModel m(
+        {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0},
+        {tensor_type, {}, -3.0, 3.0}, ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.7});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.7, -0.5, 0.0, 0.1, 0.4, 1.3}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastUInt8) {
+  QuantizedWithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
+  QuantizedWithBroadcast<TensorType_INT8, int8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
+  const float kMin = -1.f;
+  const float kMax =
+      static_cast<float>(std::numeric_limits<int16_t>::max() - 1) /
+      std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<float>> inputs1 = {
+      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.3, 0.8}};
+  std::vector<std::vector<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, 0.8, 0.5}};
+  std::vector<std::vector<float>> results = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, -1.0, 0.3}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<int16_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetDequantizedOutputInt16(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsReluActivationInt16) {
+  const float kMin = -2.f;
+  const float kMax = 2.0 * (std::numeric_limits<int16_t>::max() - 1) /
+                     std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                             {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                             {-1.0, -0.2, 1.0, 0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<int16_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetDequantizedOutputInt16(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationBroadcastInt16) {
+  const float kMin = -1.f;
+  const float kMax =
+      static_cast<float>(std::numeric_limits<int16_t>::max() - 1) /
+      std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, test_shapes[i], kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<int16_t>(m.input1(),
+                                   {-0.9, -0.7, -0.3, 0.0, 0.3, 0.5});
+    m.QuantizeAndPopulate<int16_t>(m.input2(), {0.2});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutputInt16(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-1.0, -0.9, -0.5, -0.2, 0.1, 0.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsReluActivationBroadcastInt16) {
+  const float kMin = -2.f;
+  const float kMax = 2.0 * (std::numeric_limits<int16_t>::max() - 1) /
+                     std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, test_shapes[i], kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<int16_t>(m.input1(),
+                                   {-0.9, -0.7, -0.3, 0.0, 0.3, 0.5});
+    m.QuantizeAndPopulate<int16_t>(m.input2(), {0.2});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutputInt16(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-1.0, -0.9, -0.5, -0.2, 0.1, 0.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/kernels/subgraph_test_util.cc b/tensorflow/lite/kernels/subgraph_test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e712be1b516ed0dca5097e66c2d1f20e63a78038
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.cc
@@ -0,0 +1,409 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+// ADD and MUL are used to test simple branch.
+TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_MUL();
+// ADD and MUL are used to test dynamic sized subgraphs.
+TfLiteRegistration* Register_PAD();
+TfLiteRegistration* Register_LESS_EQUAL();
+}  // namespace builtin
+namespace custom {
+TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
+}  // namespace custom
+}  // namespace ops
+
+namespace subgraph_test_util {
+
+namespace {
+
+void SetupTensor(Subgraph* subgraph, int tensor_index, TfLiteType type) {
+  ASSERT_EQ(subgraph->SetTensorParametersReadWrite(tensor_index, type, "", 0,
+                                                   nullptr, {}, false),
+            kTfLiteOk);
+}
+
+}  // namespace
+
+SubgraphBuilder::~SubgraphBuilder() {
+  for (auto buffer : buffers_) {
+    free(buffer);
+  }
+}
+
+void SubgraphBuilder::BuildAddSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |ADD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+}
+
+// Build a subgraph with an mul op. Helper function for testing.
+void SubgraphBuilder::BuildMulSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |MUL| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteMulParams* params =
+      reinterpret_cast<TfLiteMulParams*>(malloc(sizeof(TfLiteMulParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_MUL(), &node_index);
+}
+
+// Build a subgraph with a pad op. Helper function for testing.
+void SubgraphBuilder::BuildPadSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |PAD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLitePadParams* params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLitePadParams)));
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildIfSubgraph(Subgraph* subgraph) {
+  const int kCondInput = 0;
+  const int kInput1 = 1;
+  const int kInput2 = 2;
+  const int kOutput = 3;
+  const int kTensorCount = 4;
+
+  // kCondInput(0) --> +----+
+  // kInput1(1)  ----> | IF | --> kOutput(3)
+  // kInput2(2)  ----> +----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kCondInput, kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kCondInput, kTfLiteBool);
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("then_subgraph_index", 1);
+    fbb.Int("else_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kCondInput, kInput1, kInput2}, {kOutput},
+      reinterpret_cast<const char*>(buffer.data()), buffer.size(), nullptr,
+      ::tflite::ops::custom::Register_IF(), &node_index);
+}
+
+void SubgraphBuilder::BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kConstRhs = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) ----> +------------+
+  //                  | LESS_EQUAL | --> kOutput(2)
+  // kConstRhs(3) --> +------------+
+  //
+  // kInput2(1) --> (unused)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteBool);
+
+  CreateConstantInt32Tensor(subgraph, kConstRhs, {1}, {rhs});
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kConstRhs}, {kOutput}, nullptr, 0, nullptr,
+      ::tflite::ops::builtin::Register_LESS_EQUAL(), &node_index);
+}
+
+void SubgraphBuilder::BuildAccumulateLoopBodySubgraph(Subgraph* subgraph) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kTensorCount = 5;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+            |
+  //                                         |
+  //                                         v
+  //                                      +-----+
+  //                                      | ADD | --> kOutputValue(3)
+  // kInputValue(1) ----------------------+-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+
+  int node_index;
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({0, 4}, {2}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+  params = reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({2, 1}, {3}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+}
+
+void SubgraphBuilder::BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                               const std::vector<int> padding) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kConstPadding = 5;
+  const int kTensorCount = 6;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+
+  //
+  // kInputValue(1) ----> +-----+
+  //                      | PAD | --> kOutputValue(3)
+  // kConstPadding(5) --> +-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+  ASSERT_EQ(padding.size() % 2, 0);
+  int padding_dims = padding.size();
+  CreateConstantInt32Tensor(subgraph, kConstPadding, {1, padding_dims},
+                            padding);
+
+  int node_index;
+  TfLiteAddParams* add_params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  add_params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters(
+      {kInputCounter, kConstStep}, {kOutputCounter}, nullptr, 0, add_params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+  TfLitePadParams* pad_params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLiteAddParams)));
+  subgraph->AddNodeWithParameters(
+      {kInputValue, kConstPadding}, {kOutputValue}, nullptr, 0, pad_params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildWhileSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput1 = 2;
+  const int kOutput2 = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) --> +-------+ --> kOutput1(2)
+  //                | WHILE |
+  // kInput2(1) --> +-------+ --> kOutput2(3)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput1, kOutput2}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput1, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput2, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("cond_subgraph_index", 1);
+    fbb.Int("body_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {0, 1}, {2, 3}, reinterpret_cast<const char*>(buffer.data()),
+      buffer.size(), nullptr, ::tflite::ops::custom::Register_WHILE(),
+      &node_index);
+}
+
+void SubgraphBuilder::CreateConstantInt32Tensor(Subgraph* subgraph,
+                                                int tensor_index,
+                                                const std::vector<int>& shape,
+                                                const std::vector<int>& data) {
+  ASSERT_GT(shape.size(), 0);
+  int num_elements = 1;
+  for (int dim : shape) {
+    num_elements *= dim;
+  }
+  ASSERT_EQ(data.size(), num_elements);
+  size_t size_in_bytes = sizeof(int32_t) * num_elements;
+  // Maybe aligned.
+  int32_t* buffer = reinterpret_cast<int32_t*>(malloc(size_in_bytes));
+  for (int i = 0; i < num_elements; ++i) {
+    buffer[i] = data[i];
+  }
+  buffers_.push_back(buffer);
+  ASSERT_EQ(subgraph->SetTensorParametersReadOnly(
+                tensor_index, kTfLiteInt32, "", shape, {},
+                reinterpret_cast<const char*>(buffer), size_in_bytes),
+            kTfLiteOk);
+}
+
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data) {
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    tensor->data.i32[i] = data[i];
+  }
+}
+
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteInt32);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.i32[i], data[i]);
+  }
+}
+
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteBool);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.b[i], data[i]);
+  }
+}
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/subgraph_test_util.h b/tensorflow/lite/kernels/subgraph_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..972f1381af2804252461bf81dfbce3563be41c3b
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.h
@@ -0,0 +1,123 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This module provides helper functions for testing the interaction between
+// control flow ops and subgraphs.
+// For convenience, we mostly only use `kTfLiteInt32` in this module.
+
+#ifndef TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/interpreter.h"
+
+namespace tflite {
+namespace subgraph_test_util {
+
+// TODO(ycling): This file should be renamed as
+// `control_flow_test_util` to avoid confusion. I'll do it immediately
+// in a separated change.
+class SubgraphBuilder {
+ public:
+  ~SubgraphBuilder();
+
+  // Build a subgraph with a single Add op.
+  // 2 inputs. 1 output.
+  void BuildAddSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Mul op.
+  // 2 inputs. 1 output.
+  void BuildMulSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Pad op.
+  // 2 inputs. 1 output.
+  void BuildPadSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single If op.
+  // 3 inputs:
+  //   The 1st input is condition with boolean type.
+  //   The 2nd and 3rd inputs are feed input the branch subgraphs.
+  // 1 output.
+  void BuildIfSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Less op.
+  // The subgraph is used as the condition subgraph for testing `While` op.
+  // 2 inputs:
+  //   The 1st input is a counter with `kTfLiteInt32` type.
+  //   The 2nd input is ignored in this subgraph.
+  // 1 output with `kTfLiteBool` type.
+  //   Equivalent to (input < rhs).
+  void BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs);
+
+  // An accumulate loop body subgraph. Used to produce triangle number
+  // seqeuence. 2 inputs and 2 outpus
+  //   Equivalent to (counter, value) -> (counter + 1, counter + 1 + value)
+  void BuildAccumulateLoopBodySubgraph(Subgraph* subgraph);
+
+  // A pad loop body subgraph. When used in a loop it will repeatively enlarge
+  // the
+  //   tensor.
+  // 2 inputs and 2 outputs.
+  //   Equivalent to (counter, value) -> (counter + 1, tf.pad(value, padding))
+  // Note the padding is created as a constant tensor.
+  void BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                const std::vector<int> padding);
+
+  // Build a subgraph with a single While op.
+  // 2 inputs, 2 outputs.
+  void BuildWhileSubgraph(Subgraph* subgraph);
+
+ private:
+  void CreateConstantInt32Tensor(Subgraph* subgraph, int tensor_index,
+                                 const std::vector<int>& shape,
+                                 const std::vector<int>& data);
+  std::vector<void*> buffers_;
+};
+
+class ControlFlowOpTest : public ::testing::Test {
+ public:
+  ControlFlowOpTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~ControlFlowOpTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+// Fill a `TfLiteTensor` with a 32-bits integer vector.
+// Preconditions:
+// * The tensor must have `kTfLiteInt32` type.
+// * The tensor must be allocated.
+// * The element count of the tensor must be equal to the length or
+//   the vector.
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data);
+
+// Check if the shape and int32 data of a tensor is as expected.
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data);
+// Check if the shape and bool data of a tensor is as expected.
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data);
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
diff --git a/tensorflow/lite/kernels/subgraph_test_util_test.cc b/tensorflow/lite/kernels/subgraph_test_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04e5118b543c1723e1de1875ffd9315991a4dd69
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/test_util.h"
+
+namespace tflite {
+
+namespace subgraph_test_util {
+
+namespace {
+
+class SubgraphBuilderTest : public ::testing::Test {
+ public:
+  SubgraphBuilderTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~SubgraphBuilderTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  void TestAccumelateLoopBody(int input1, int input2, int output1,
+                              int output2) {
+    interpreter_.reset(new Interpreter);
+    builder_->BuildAccumulateLoopBodySubgraph(
+        &interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {input1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {input2});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output_tensor1 =
+        interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output_tensor1, {1}, {output1});
+    TfLiteTensor* output_tensor2 =
+        interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output_tensor2, {1}, {output2});
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+TEST_F(SubgraphBuilderTest, TestBuildAddSubgraph) {
+  builder_->BuildAddSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildMulSubgraph) {
+  builder_->BuildMulSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {5, 14});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadSubgraph) {
+  builder_->BuildPadSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildLessEqualCondSubgraph) {
+  builder_->BuildLessEqualCondSubgraph(&interpreter_->primary_subgraph(), 3);
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {5});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {10, 10});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  // Test [1, 2, 3, 4, 5] <= 3 == [true, true, true, false, false]
+  // (with broadcasting).
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]),
+                {1, 2, 3, 4, 5});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckBoolTensor(output, {5}, {true, true, true, false, false});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildAccumulateLoopBodySubgraph) {
+  TestAccumelateLoopBody(1, 1, 2, 3);
+  TestAccumelateLoopBody(2, 3, 3, 6);
+  TestAccumelateLoopBody(3, 6, 4, 10);
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadLoopBodySubgraph) {
+  builder_->BuildPadLoopBodySubgraph(&interpreter_->primary_subgraph(), {1, 2});
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {5});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]),
+                {0, 5, 7, 0, 0});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {2});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {8}, {0, 0, 5, 7, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 19d7e37409cba2f4b018082d13a2d3e130a3c5c4..dd8d9ed21830f852c9a13f09626278f4ca4938e2 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -47,7 +47,12 @@ std::vector<Matcher<std::complex<float>>> ArrayComplex64Near(
 }
 
 int SingleOpModel::AddInput(const TensorData& t, bool is_variable) {
-  int id = AddTensor<float>(t, {}, is_variable);
+  int id = 0;
+  if (t.per_channel_quantization) {
+    id = AddTensorPerChannelQuant(t);
+  } else {
+    id = AddTensor<float>(t, {}, is_variable);
+  }
   inputs_.push_back(id);
   return id;
 }
@@ -119,7 +124,7 @@ void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
 
   CHECK(interpreter_ != nullptr);
 
-  for (int i = 0; i < input_shapes.size(); ++i) {
+  for (size_t i = 0; i < input_shapes.size(); ++i) {
     const int input_idx = interpreter_->inputs()[i];
     if (input_idx == kOptionalTensor) continue;
     const auto& shape = input_shapes[i];
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 4a442f9fa7554fbc5c149e1dd20f82c162d392d4..08c027f9d9d4fb469abecd460a09cdd543214215 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -21,13 +21,14 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
 
 namespace tflite {
 
@@ -82,14 +83,36 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 // A helper struct to construct test tensors. This is particularly useful for
 // quantized tensor which must have their scale and zero_point defined before
 // the actual data is known. This mimics what happens in practice: quantization
-// parameters are calculated during training.
+// parameters are calculated during training or post training..
 struct TensorData {
+  TensorData(TensorType type = TensorType_FLOAT32, std::vector<int> shape = {},
+             float min = 0.0f, float max = 0.0f, float scale = 0.0f,
+             int32_t zero_point = 0, bool per_channel_quantization = false,
+             std::vector<float> per_channel_quantization_scales = {},
+             std::vector<int64_t> per_channel_quantization_offsets = {},
+             int32_t channel_index = 0)
+      : type(type),
+        shape(shape),
+        min(min),
+        max(max),
+        scale(scale),
+        zero_point(zero_point),
+        per_channel_quantization(per_channel_quantization),
+        per_channel_quantization_scales(
+            std::move(per_channel_quantization_scales)),
+        per_channel_quantization_offsets(
+            std::move(per_channel_quantization_offsets)),
+        channel_index(channel_index) {}
   TensorType type;
   std::vector<int> shape;
   float min;
   float max;
   float scale;
   int32_t zero_point;
+  bool per_channel_quantization;
+  std::vector<float> per_channel_quantization_scales;
+  std::vector<int64_t> per_channel_quantization_offsets;
+  int32_t channel_index;
 };
 
 class SingleOpResolver : public OpResolver {
@@ -172,6 +195,46 @@ class SingleOpModel {
     PopulateTensor(index, /*offset=*/0, q.data(), q.data() + q.size());
   }
 
+  // Quantize and populate data for filter with per channel quantization.
+  void PerChannelSymmetricQuantizeAndPopulate(
+      int index, const std::vector<float>& input_data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    const int channel_index = params->quantized_dimension;
+
+    std::vector<int32_t> shape(t->dims->size);
+    for (int i = 0; i < shape.size(); ++i) {
+      shape[i] = t->dims->data[i];
+    }
+    const int32_t num_inputs = input_data.size();
+    const int32_t num_channel = shape[channel_index];
+    std::vector<int8_t> quantized_output(num_inputs);
+    std::vector<float> scales_inv(num_channel);
+    for (int i = 0; i < num_channel; ++i) {
+      scales_inv[i] = 1.0f / params->scale->data[i];
+    }
+    optimize::utils::SymmetricPerChannelQuantizeValues(
+        input_data.data(), scales_inv, shape, channel_index, &quantized_output);
+
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
+  // Quantize and populate data for bias with per channel quantization.
+  void PerChannelQuantizeBias(int index, const std::vector<float>& input_data) {
+    const int32_t num_inputs = input_data.size();
+    std::vector<int32_t> quantized_output(num_inputs);
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    for (int i = 0; i < num_inputs; ++i) {
+      quantized_output[i] = input_data[i] * params->scale->data[i];
+    }
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
   const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
 
   float GetScale(int id) { return tensor_data_.at(id).scale; }
@@ -263,6 +326,10 @@ class SingleOpModel {
     return result;
   }
 
+  void SetNumThreads(int num_threads) {
+    interpreter_->SetNumThreads(num_threads);
+  }
+
   void SetResolver(std::unique_ptr<OpResolver> resolver) {
     resolver_ = std::move(resolver);
   }
@@ -292,6 +359,24 @@ class SingleOpModel {
     return {scale, zero_point};
   }
 
+  int AddTensorPerChannelQuant(TensorData t) {
+    const int id = tensors_.size();
+    flatbuffers::Offset<QuantizationParameters> q_params = 0;
+    q_params = CreateQuantizationParameters(
+        builder_, /*min=*/0, /*max=*/0,
+        /*scale=*/
+        builder_.CreateVector<float>(t.per_channel_quantization_scales),
+        /*zero point=*/
+        builder_.CreateVector<int64_t>(t.per_channel_quantization_offsets),
+        QuantizationDetails_NONE, 0, t.channel_index);
+    tensors_.push_back(
+        CreateTensor(builder_, builder_.CreateVector<int>(t.shape), t.type,
+                     /*buffer=*/0,
+                     /*name=*/0, q_params, /*is_variable=*/false));
+    tensor_data_[id] = t;
+    return id;
+  }
+
   template <typename T>
   int AddTensor(TensorData t, std::initializer_list<T> data,
                 bool is_variable = false) {
@@ -367,6 +452,17 @@ class SingleOpModel {
     // Update quantization params.
     t->params.scale = scaling_factor;
     t->params.zero_point = 0;
+    // Populate the new quantization params.
+    TfLiteQuantizationFree(&t->quantization);
+    t->quantization.type = kTfLiteAffineQuantization;
+    auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+        malloc(sizeof(TfLiteAffineQuantization)));
+    affine_quantization->quantized_dimension = 0;
+    affine_quantization->scale = TfLiteFloatArrayCreate(1);
+    affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+    affine_quantization->scale->data[0] = scaling_factor;
+    affine_quantization->zero_point->data[0] = 0;
+    t->quantization.params = affine_quantization;
     return q;
   }
 
diff --git a/tensorflow/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc
index 444b01e7b2e055ab4e26a2ea1dce28642dc430b7..64973d7b860fc3089850cc3648ee4fb9da11047e 100644
--- a/tensorflow/lite/kernels/topk_v2.cc
+++ b/tensorflow/lite/kernels/topk_v2.cc
@@ -207,6 +207,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TopK(row_size, num_rows, input->data.uint8, k, output_indexes->data.i32,
            output_values->data.uint8);
       break;
+    case kTfLiteInt8:
+      TopK(row_size, num_rows, input->data.int8, k, output_indexes->data.i32,
+           output_values->data.int8);
+      break;
     case kTfLiteInt32:
       TopK(row_size, num_rows, input->data.i32, k, output_indexes->data.i32,
            output_values->data.i32);
diff --git a/tensorflow/lite/kernels/topk_v2_test.cc b/tensorflow/lite/kernels/topk_v2_test.cc
index 108b8123666aaddcc8ba8438bac82c91ce98d50d..0097ae2f9aece116c963a4b460c2e3ff0fc127c4 100644
--- a/tensorflow/lite/kernels/topk_v2_test.cc
+++ b/tensorflow/lite/kernels/topk_v2_test.cc
@@ -46,6 +46,10 @@ class TopKV2OpModel : public SingleOpModel {
     PopulateTensor<uint8_t>(input_, data);
   }
 
+  void SetInputInt8(std::initializer_list<int8_t> data) {
+    PopulateTensor<int8_t>(input_, data);
+  }
+
   void SetInputInt32(std::initializer_list<int32_t> data) {
     PopulateTensor<int32_t>(input_, data);
   }
@@ -66,6 +70,10 @@ class TopKV2OpModel : public SingleOpModel {
     return ExtractVector<uint8_t>(output_values_);
   }
 
+  std::vector<int8_t> GetValuesInt8() {
+    return ExtractVector<int8_t>(output_values_);
+  }
+
   std::vector<int32_t> GetValuesInt32() {
     return ExtractVector<int32_t>(output_values_);
   }
@@ -128,6 +136,14 @@ TEST(TopKV2OpTest, TypeUint8) {
   EXPECT_THAT(m.GetValuesUInt8(), ElementsAreArray({3, 2, 251, 250}));
 }
 
+TEST(TopKV2OpTest, TypeInt8) {
+  TopKV2OpModel m({2, 3}, TensorType_INT8, 2);
+  m.SetInputInt8({1, 2, 3, -126, 125, -24});
+  m.Invoke();
+  EXPECT_THAT(m.GetIndexes(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(m.GetValuesInt8(), ElementsAreArray({3, 2, 125, -24}));
+}
+
 // Check that int32_t works.
 TEST(TopKV2OpTest, TypeInt32) {
   TopKV2OpModel m({2, 3}, TensorType_INT32, 2);
diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc
index 7a6d320674ad1c8302f8bf3a9d1d5153223deed3..0ef4972d1a856f84d3511657ec9d9f2f3cc36182 100644
--- a/tensorflow/lite/kernels/transpose.cc
+++ b/tensorflow/lite/kernels/transpose.cc
@@ -117,6 +117,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_TRANSPOSE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_TRANSPOSE(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/transpose_conv_test.cc b/tensorflow/lite/kernels/transpose_conv_test.cc
index 44d1336b99fe03535451c7dbacfe77be58fd6fad..05d3451d0056649b8cc958974b105774a1bdcfc3 100644
--- a/tensorflow/lite/kernels/transpose_conv_test.cc
+++ b/tensorflow/lite/kernels/transpose_conv_test.cc
@@ -57,7 +57,7 @@ class TransposeConvOpModel : public SingleOpModel {
     resolver_ = absl::make_unique<SingleOpResolver>(
         BuiltinOperator_TRANSPOSE_CONV, registration);
     BuildInterpreter(
-        {GetShape(output_shape_), GetShape(input_), GetShape(filter_)});
+        {GetShape(output_shape_), GetShape(filter_), GetShape(input_)});
   }
 
   void SetOutputShape(std::initializer_list<int> i) {
@@ -97,8 +97,8 @@ class TransposeConvOpTest : public SingleOpTest {
 //     [1, 1, 1, 1 ],
 //     "SAME")
 TEST_P(TransposeConvOpTest, SimpleTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 1}},
-                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {1, 4, 4, 1}},
                          {TensorType_FLOAT32, {}}, Padding_SAME, 1, 1);
   m.SetOutputShape({1, 4, 4, 1});
   m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
@@ -125,8 +125,8 @@ TEST_P(TransposeConvOpTest, SimpleTest) {
 // And filter value is derived by:
 // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1])
 TEST_P(TransposeConvOpTest, TwoFiltersTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 2}},
-                         {TensorType_FLOAT32, {1, 3, 3, 2}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 3, 2}},
+                         {TensorType_FLOAT32, {1, 4, 4, 2}},
                          {TensorType_FLOAT32, {}}, Padding_SAME, 1, 1);
   m.SetOutputShape({1, 4, 4, 1});
   m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
@@ -153,8 +153,8 @@ TEST_P(TransposeConvOpTest, TwoFiltersTest) {
 // And filter value is derived by:
 // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18])
 TEST_P(TransposeConvOpTest, PaddingValidTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 2}},
-                         {TensorType_FLOAT32, {1, 3, 3, 2}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 3, 2}},
+                         {TensorType_FLOAT32, {1, 4, 4, 2}},
                          {TensorType_FLOAT32, {}}, Padding_VALID, 1, 1);
   m.SetOutputShape({1, 6, 6, 1});
   m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
@@ -182,8 +182,8 @@ TEST_P(TransposeConvOpTest, PaddingValidTest) {
 //     [1, 2, 2, 1 ],
 //     "VALID")
 TEST_P(TransposeConvOpTest, StrideValidTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 2, 2, 1}},
-                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {1, 2, 2, 1}},
                          {TensorType_FLOAT32, {}}, Padding_VALID, 2, 2);
   m.SetOutputShape({1, 5, 5, 1});
   m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
@@ -208,8 +208,8 @@ TEST_P(TransposeConvOpTest, StrideValidTest) {
 //     [1, 2, 2, 1 ],
 //     "VALID")
 TEST_P(TransposeConvOpTest, MultiChannelTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 2, 2, 1}},
-                         {TensorType_FLOAT32, {2, 3, 3, 1}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 3, 3, 1}},
+                         {TensorType_FLOAT32, {1, 2, 2, 1}},
                          {TensorType_FLOAT32, {}}, Padding_VALID, 2, 2);
   m.SetOutputShape({1, 5, 5, 2});
   m.SetFilter({1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18});
@@ -238,8 +238,8 @@ TEST_P(TransposeConvOpTest, MultiChannelTest) {
 // And filter value is derived by:
 // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[-1])
 TEST_P(TransposeConvOpTest, AccuracyTest) {
-  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 1, 2, 1}},
-                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {1, 1, 2, 1}},
                          {TensorType_FLOAT32, {}}, Padding_SAME, 3, 3);
   m.SetOutputShape({1, 3, 4, 1});
   m.SetFilter({9, 5, 6, 9, 8, 5, 3, 1, 4});
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index 93df2c81db8c17de7a36d155c7d26b826c859c99..71644159209cc289329f65d1cac929585f2f4200 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -25,16 +25,17 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
+template <typename T>
 void RunTestPermutation(const std::vector<int>& shape,
                         const std::vector<int>& perms,
-                        std::vector<float>* input_transposed) {
+                        std::vector<T>* input_transposed) {
   // Count elements and allocate output.
   int count = 1;
   for (auto factor : shape) count *= factor;
   input_transposed->resize(count);
 
   // Create the dummy data
-  std::vector<float> input(count);
+  std::vector<T> input(count);
   for (int i = 0; i < input.size(); i++) {
     input[i] = i;
   }
@@ -64,8 +65,8 @@ void RunTestPermutation(const std::vector<int>& shape,
     params.perm[i] = perms[i];
   }
 
-  reference_ops::Transpose<float>(params, input_shape, input.data(),
-                                  output_shape, input_transposed->data());
+  reference_ops::Transpose<T>(params, input_shape, input.data(), output_shape,
+                              input_transposed->data());
 }
 
 TEST(TransposeTest, TestRefOps1D) {
@@ -125,6 +126,28 @@ TEST(TransposeTest, TestRefOps4D) {
   ASSERT_EQ(out, ref);
 }
 
+TEST(TransposeTest, TestRefOps4DInt8) {
+  std::vector<int8_t> out;
+  // Basic 4d.
+  RunTestPermutation({2, 3, 4, 5}, {2, 0, 1, 3}, &out);
+  ASSERT_EQ(
+      out,
+      std::vector<int8_t>(
+          {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+           60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+           5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+           65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+           10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+           70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+           15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+           75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}));
+  RunTestPermutation({2, 3, 4, 5}, {0, 1, 2, 3}, &out);
+  // Basic identity.
+  std::vector<int8_t> ref(out.size());
+  for (int k = 0; k < ref.size(); k++) ref[k] = k;
+  ASSERT_EQ(out, ref);
+}
+
 class TransposeOpModel : public SingleOpModel {
  public:
   void SetInput(std::initializer_list<float> data) {
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
index e2fc73ba29b5c96ad83536fb8752c11d70191d4d..8c2d0d57c7b794c74d5b48f8d902a69efba89645 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
@@ -246,7 +246,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 }
 
 // Resize the output and  state tensors based on the sizes of the input tensors.
-// Allocate a temprory scratch tensor. Also check that the sizes of the input
+// Allocate a temporary scratch tensor. Also check that the sizes of the input
 // tensors match each other.
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc
index 1caffe14f90b8ce9d13d8c781e87bf918c02b9f4..99ad4bb4e817ed435043fb17469381192db843ff 100644
--- a/tensorflow/lite/kernels/unpack.cc
+++ b/tensorflow/lite/kernels/unpack.cc
@@ -52,9 +52,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
   TF_LITE_ENSURE(context, NumDimensions(input) > 1);
-  TF_LITE_ENSURE(context, NumDimensions(input) > data->axis);
-  // TODO(renjieliu): Support negative axis.
-  TF_LITE_ENSURE(context, data->axis >= 0);
+  int axis = data->axis;
+  if (axis < 0) {
+    axis += NumDimensions(input);
+  }
+  TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input));
   if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) {
     context->ReportError(context,
                          "Currently pack only supports int32 and float32.");
@@ -67,12 +69,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_shape = TfLiteIntArrayCreate(NumDimensions(input) - 1);
   int o = 0;
   for (int index = 0; index < NumDimensions(input); ++index) {
-    if (index != data->axis) {
+    if (index != axis) {
       output_shape->data[o++] = input_shape->data[index];
     }
   }
 
-  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[data->axis]);
+  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[axis]);
   for (int i = 0; i < data->num; ++i) {
     TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape);
     TfLiteTensor* output = GetOutput(context, node, i);
diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc
index 9b60cce549804a59e343f3e26f978679a1624c00..76f7dff93e395414f0e5a69fe4cef151a7517315 100644
--- a/tensorflow/lite/kernels/unpack_test.cc
+++ b/tensorflow/lite/kernels/unpack_test.cc
@@ -28,14 +28,16 @@ template <typename T>
 class UnpackOpModel : public SingleOpModel {
  public:
   UnpackOpModel(const TensorData& input, int axis) {
-    CHECK_LE(axis, input.shape.size());
+    if (axis < 0) {
+      axis += input.shape.size();
+    }
     const int num_outputs = input.shape[axis];
     input_ = AddInput(input);
     for (int i = 0; i < num_outputs; ++i) {
       outputs_.push_back(AddOutput(input.type));
     }
     SetBuiltinOp(BuiltinOperator_UNPACK, BuiltinOptions_UnpackOptions,
-                 CreatePackOptions(builder_, num_outputs, axis).Union());
+                 CreateUnpackOptions(builder_, num_outputs, axis).Union());
     BuildInterpreter({GetShape(input_)});
   }
 
@@ -104,6 +106,44 @@ TEST(UnpackOpTest, FloatThreeOutputsAxisOne) {
   EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
 }
 
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisOne) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisTwo) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -2);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
 TEST(UnpackOpTest, FloatOneOutput) {
   UnpackOpModel<float> model({TensorType_FLOAT32, {1, 6}}, 0);
   model.SetInput({1, 2, 3, 4, 5, 6});
diff --git a/tensorflow/lite/kernels/where.cc b/tensorflow/lite/kernels/where.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96ee36f08bc0144ce0e4a66d3d7350a791d26d86
--- /dev/null
+++ b/tensorflow/lite/kernels/where.cc
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace where {
+
+constexpr int kInputConditionTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* cond_tensor,
+                                TfLiteTensor* output_tensor) {
+  // Output tensor should have shape:
+  // (num_true, cond_rank), where num_true denotes the number of true values
+  // in condition.
+  const RuntimeShape& cond_shape = GetTensorShape(cond_tensor);
+  const int size = cond_shape.FlatSize();
+  const int cond_rank = cond_shape.DimensionsCount();
+  const bool* cond_data = GetTensorData<bool>(cond_tensor);
+
+  int true_count = 0;
+  for (int i = 0; i < size; ++i) {
+    if (cond_data[i]) {
+      true_count++;
+    }
+  }
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(2);
+  output_dims->data[0] = true_count;
+  output_dims->data[1] = cond_rank;
+  return context->ResizeTensor(context, output_tensor, output_dims);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (cond_tensor->type != kTfLiteBool) {
+    context->ReportError(context,
+                         "Condition tensor must be of type bool, but saw '%s'.",
+                         TfLiteTypeGetName(cond_tensor->type));
+    return kTfLiteError;
+  }
+
+  // As output will be a 2D tensor of indices, we use int32 as data type.
+  output->type = kTfLiteInt32;
+
+  // Exit early if cond is a non-const tensor. Set output tensor to dynamic so
+  // output size can be determined in Eval.
+  if (!IsConstantTensor(cond_tensor)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, cond_tensor, output);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, cond_tensor, output));
+  }
+
+  reference_ops::SelectTrueCoords(GetTensorShape(cond_tensor),
+                                  GetTensorData<bool>(cond_tensor),
+                                  GetTensorData<int32_t>(output));
+  return kTfLiteOk;
+}
+}  // namespace where
+
+TfLiteRegistration* Register_WHERE() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 where::Prepare, where::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/where_test.cc b/tensorflow/lite/kernels/where_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89bd7c43646f80f8b0adb4ef4026f1d9bc7b43c4
--- /dev/null
+++ b/tensorflow/lite/kernels/where_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseWhereOpModel : public SingleOpModel {
+ public:
+  BaseWhereOpModel(const TensorData& input, const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_WHERE, BuiltinOptions_WhereOptions,
+                 CreateWhereOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class IntegerWhereOpModel : public BaseWhereOpModel {
+ public:
+  using BaseWhereOpModel::BaseWhereOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(WhereOpTest, SelectFromVectorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromVector) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrixNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false,  //
+                                     false, false, false,  //
+                                     false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromMatrix1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 1}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               2, 0}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false,   //
+                                     true, false, false,  //
+                                     true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 1,  //
+                                               1, 0,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 5}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, false, true, true,   //
+                                     false, true, true, false, false,  //
+                                     true, false, true, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 3,  //
+                                               0, 4,  //
+                                               1, 1,  //
+                                               1, 2,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3TensorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false, false,  //
+                                     false, false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 1, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true,  //
+                                     false, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 2,  //
+                                               1, 0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true,  //
+                                     false, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 1, 1}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true, false, false,  //
+                                     false, false, true, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 2, 0,  //
+                                               1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3f00d3fe13c40fb9caa2a168c2b56d9ef1a73c0
--- /dev/null
+++ b/tensorflow/lite/kernels/while.cc
@@ -0,0 +1,312 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace while_kernel {
+
+namespace {
+
+// Propagate tensor shapes and types from `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsShapeAndType(TfLiteContext* context,
+                                     Subgraph* src_subgraph,
+                                     const SrcVector& src_tensor_indices,
+                                     Subgraph* dst_subgraph,
+                                     const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    std::vector<int> dims(src_tensor->dims->data,
+                          src_tensor->dims->data + src_tensor->dims->size);
+    dst_subgraph->ResizeInputTensor(dst_tensor_indices[i], dims);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    dst_tensor->type = src_tensor->type;
+  }
+  return kTfLiteOk;
+}
+
+// Copy the tensors data from tensors `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsData(TfLiteContext* context, Subgraph* src_subgraph,
+                             const SrcVector& src_tensor_indices,
+                             Subgraph* dst_subgraph,
+                             const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    TF_LITE_ENSURE_EQ(context, src_tensor->bytes, dst_tensor->bytes);
+    memcpy(dst_tensor->data.raw, src_tensor->data.raw, src_tensor->bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckCondOutput(TfLiteContext* context,
+                             const TfLiteTensor* cond_output) {
+  // The condition output must be a single boolean value.
+  TF_LITE_ENSURE_EQ(context, cond_output->type, kTfLiteBool);
+  if (cond_output->dims->size == 0) {
+    // It's okay if it's a 0D scalar.
+    return kTfLiteOk;
+  }
+  // Otherwise it must be 1D with shape [1].
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->data[0], 1);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+struct OpData {
+  int cond_subgraph_index;
+  int body_subgraph_index;
+  bool cond_has_dynamic_output_tensors;
+  bool body_has_dynamic_output_tensors;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  op_data->cond_subgraph_index = m["cond_subgraph_index"].AsInt32();
+  op_data->body_subgraph_index = m["body_subgraph_index"].AsInt32();
+  op_data->cond_has_dynamic_output_tensors = false;
+  op_data->body_has_dynamic_output_tensors = false;
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  int num_inputs = node->inputs->size;
+  // The number of outputs should be the same as number of inputs.
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, num_inputs);
+
+  // Check subgraph indices and get subgraphs.
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  TF_LITE_ENSURE(context, op_data->cond_subgraph_index < subgraphs->size());
+  TF_LITE_ENSURE(context, op_data->body_subgraph_index < subgraphs->size());
+
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // Check input & output count of the condition subgraph.
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->outputs().size(), 1);
+
+  // Check input & output count of the body subgraph.
+  TF_LITE_ENSURE_EQ(context, body_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, body_subgraph->outputs().size(), num_inputs);
+
+  // Prepare and check the condition subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       cond_subgraph, cond_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+  TfLiteTensor* cond_output =
+      cond_subgraph->tensor(cond_subgraph->outputs()[0]);
+  // TODO(ycling): Handle the case the cond subgraph has dynamic tensor outputs.
+  // This should rarely happens. In most cases the output is static with shape
+  // [1]. However theoretically intermediate tensors in the cond subgraph
+  // can be dynamic.
+  if (IsDynamicTensor(cond_output)) {
+    op_data->cond_has_dynamic_output_tensors = true;
+  } else {
+    TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+  }
+
+  // Prepare and check the body subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       body_subgraph, body_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+  if (body_subgraph->HasDynamicTensors()) {
+    op_data->body_has_dynamic_output_tensors = true;
+  } else {
+    for (int i = 0; i < num_inputs; ++i) {
+      TfLiteTensor* body_input =
+          body_subgraph->tensor(body_subgraph->inputs()[i]);
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TF_LITE_ENSURE_EQ(context, body_input->type, body_output->type);
+
+      // TODO(ycling): Support dynamic sized body subgraph.
+      TF_LITE_ENSURE(context, !IsDynamicTensor(body_output));
+      if (!TfLiteIntArrayEqual(body_input->dims, body_output->dims)) {
+        // If the output shape of the body subgraph is static w.r.t. a fixed
+        // input size, but it's different from input size, it's still considered
+        // dynamic. For example: If a subgraph keeps padding its input with a
+        // fixed padding, the output shape is static w.r.t the input shape and
+        // padding, but running it in a loop will keep bloating the tensor.
+        op_data->body_has_dynamic_output_tensors = true;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < num_inputs; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (op_data->body_has_dynamic_output_tensors) {
+      SetTensorToDynamic(output);
+    } else {
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(body_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // The follow graph illustrates the current implementation.
+  //
+  // This Subgraph          Cond Subgraph         Body Subgraph
+  // +-----------+   (1)   +------------+   (3)   +------------+
+  // |   WHILE   |-------->|  SUBGRAPH  |-------->|  SUBGRAPH  |
+  // |   INPUT   |        /|   INPUT    |<-----   |   INPUT    |
+  // +-----------+       / +------------+      \  +------------+
+  //                    /        |              \       |
+  //               (6) /         | (2)       (5) \      | (4)
+  //                  /          v                \     v
+  // +-----------+   /     +------------+         +------------+
+  // |   WHILE   |<--      |  SUBGRAPH  |         |  SUBGRAPH  |
+  // |   OUTPUT  |         |   OUTPUT   |         |   OUTPUT   |
+  // +-----------+         +------------+         +------------+
+  //
+  // (1) Copy the inputs of WHILE op to the inputs of condition subgraph.
+  // (2) Invoke condition subgraph.
+  //     Jump to step 5 if result is false.
+  // (3) Copy the inputs of condition subgraph to the inputs of body subgraph.
+  // (4) Invoke body subgraph.
+  // (5) Copy the outputs of body subgraph to the inputs condition subgraph.
+  //     Jump back to step 2!
+  // (6) Copy the inputs of condition subgraph to the outputs of WHILE op.
+  //
+  // If the body subgraph has dynamic sized outputs, it's required to resize the
+  // tensor before copying in step 1, 3, 4 and 6.
+  //
+  // Note the flow is carefully designed to handle the dynamic sized output
+  // case. The loop invariant is: The newest value is in the inputs of condition
+  // subgraph. This is always true before step 2.
+  //
+  // This is the best we can do without sharing tensor buffer across subgraph
+  // boundary. Currently we copy the input / output between the subgraphs. This
+  // isn't optimized yet and a lot of redundant copies are made.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, this_subgraph, TfLiteIntArrayView(node->inputs),
+                      cond_subgraph, cond_subgraph->inputs()));
+
+  while (true) {
+    TF_LITE_ENSURE_OK(context, cond_subgraph->Invoke());
+    int cond_subgraph_output_index = cond_subgraph->outputs()[0];
+    cond_subgraph->EnsureTensorDataIsReadable(cond_subgraph_output_index);
+    TfLiteTensor* cond_output =
+        cond_subgraph->tensor(cond_subgraph_output_index);
+    if (op_data->cond_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+    }
+
+    if (!cond_output->data.b[0]) {
+      break;
+    }
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, cond_subgraph, cond_subgraph->inputs(),
+                            body_subgraph, body_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                        body_subgraph, body_subgraph->inputs()));
+
+    TF_LITE_ENSURE_OK(context, body_subgraph->Invoke());
+
+    for (int tensor_index : body_subgraph->outputs()) {
+      body_subgraph->EnsureTensorDataIsReadable(tensor_index);
+    }
+
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, body_subgraph, body_subgraph->outputs(),
+                            cond_subgraph, cond_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, body_subgraph, body_subgraph->outputs(),
+                        cond_subgraph, cond_subgraph->inputs()));
+  }
+
+  // Note that copying from body's output will fail if body is never invoked.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  if (op_data->body_has_dynamic_output_tensors) {
+    TF_LITE_ENSURE_OK(
+        context, CopyTensorsShapeAndType(context, cond_subgraph,
+                                         cond_subgraph->inputs(), this_subgraph,
+                                         TfLiteIntArrayView(node->outputs)));
+  }
+
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                      this_subgraph, TfLiteIntArrayView(node->outputs)));
+  return kTfLiteOk;
+}
+
+}  // namespace while_kernel
+
+TfLiteRegistration* Register_WHILE() {
+  static TfLiteRegistration r = {while_kernel::Init, while_kernel::Free,
+                                 while_kernel::Prepare, while_kernel::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/while_test.cc b/tensorflow/lite/kernels/while_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9946b4a3280116d7cb176f54b94b73bb956a5f71
--- /dev/null
+++ b/tensorflow/lite/kernels/while_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
+
+namespace {
+
+class WhileTest : public ControlFlowOpTest {};
+
+// The test builds a model that produces the i-th number of
+// triangular number sequence.
+//
+// TODO(ycling): Consider to improve this test case by adding a
+// concat into the body subgraph.
+TEST_F(WhileTest, TestTriangularNumberSequence) {
+  const std::vector<int> expected = {1, 3, 6, 10, 15, 21, 28};
+  for (int i = 0; i < expected.size(); ++i) {
+    interpreter_.reset(new Interpreter);
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), i);
+    builder_->BuildAccumulateLoopBodySubgraph(interpreter_->subgraph(2));
+    builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output1, {1}, {i + 1});
+    TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output2, {1}, {expected[i]});
+  }
+}
+
+// This requires dynamic sized subgraphs and it's not supported right now.
+// TODO(ycling): Support dynamic sized subgraphs.
+TEST_F(WhileTest, TestPadLoop) {
+  interpreter_.reset(new Interpreter);
+  interpreter_->AddSubgraphs(2);
+  builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), 3);
+  builder_->BuildPadLoopBodySubgraph(interpreter_->subgraph(2), {1, 2});
+  builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+  // This is not supported yet. The test ensures thatit doesn't crash and raises
+  // an error properly.
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {4});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {11}, {0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/minimal_logging.cc b/tensorflow/lite/minimal_logging.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8768ef6e312ec80c9b3653983421e07c662f8e5e
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.cc
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::Log(LogSeverity severity, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  VLog(severity, format, args);
+  va_end(args);
+}
+
+const char* MinimalLogger::GetSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return "INFO";
+    case TFLITE_LOG_WARNING:
+      return "WARNING";
+    case TFLITE_LOG_ERROR:
+      return "ERROR";
+    default:
+      return "<Unknown severity>";
+  }
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging.h b/tensorflow/lite/minimal_logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..7682ed8edc401762613a9cae582fdf1e16e61f51
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.h
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+#define TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+
+#include <cstdarg>
+
+namespace tflite {
+
+enum LogSeverity {
+  TFLITE_LOG_INFO = 0,
+  TFLITE_LOG_WARNING = 1,
+  TFLITE_LOG_ERROR = 2,
+};
+
+namespace logging_internal {
+
+// Helper class for simple platform-specific console logging. Note that we
+// explicitly avoid the convenience of ostream-style logging to minimize binary
+// size impact.
+class MinimalLogger {
+ public:
+  // Logging hook that takes variadic args.
+  static void Log(LogSeverity severity, const char* format, ...);
+
+  // Logging hook that takes a formatted va_list.
+  static void VLog(LogSeverity severity, const char* format, va_list args);
+
+ private:
+  static const char* GetSeverityName(LogSeverity severity);
+};
+
+}  // namespace logging_internal
+}  // namespace tflite
+
+// Convenience macro for basic internal logging in production builds.
+// Note: This should never be used for debug-type logs, as it will *not* be
+// stripped in release optimized builds. In general, prefer the error reporting
+// APIs for developer-facing errors, and only use this for diagnostic output
+// that should always be logged in user builds.
+#define TFLITE_LOG_PROD(severity, format, ...) \
+  tflite::logging_internal::MinimalLogger::Log(severity, format, ##__VA_ARGS__);
+
+#endif  // TENSORFLOW_LITE_MINIMAL_LOGGING_H_
diff --git a/tensorflow/lite/minimal_logging_android.cc b/tensorflow/lite/minimal_logging_android.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f87e6fa18e1dac7e2b4e093f6d4e91a1e652ba3d
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_android.cc
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <android/log.h>
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return ANDROID_LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return ANDROID_LOG_WARN;
+    case TFLITE_LOG_ERROR:
+      return ANDROID_LOG_ERROR;
+    default:
+      return ANDROID_LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // First log to Android's explicit log(cat) API.
+  va_list args_for_android_log;
+  va_copy(args_for_android_log, args);
+  __android_log_vprint(GetPlatformSeverity(severity), "tflite", format, args);
+  va_end(args_for_android_log);
+
+  // Also print to stderr for standard console applications.
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_default.cc b/tensorflow/lite/minimal_logging_default.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa13e47e63a01b5c15ada3b09498fdb755f6376
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_default.cc
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_ios.cc b/tensorflow/lite/minimal_logging_ios.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a774682a5b42f71d0cc77c49bbcf9a4ec6ef21b7
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_ios.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <syslog.h>
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return LOG_WARNING;
+    case TFLITE_LOG_ERROR:
+      return LOG_ERR;
+    default:
+      return LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // TODO(b/123704468): Use os_log when available.
+  vsyslog(GetPlatformSeverity(severity), format, args);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_test.cc b/tensorflow/lite/minimal_logging_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e59425a2b264b72d44477c6484fc0ffea014a750
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+
+TEST(MinimalLogging, Basic) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo");
+  EXPECT_EQ("INFO: Foo\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, BasicFormatted) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo %s %s", "Bar", "Baz");
+  EXPECT_EQ("INFO: Foo Bar Baz\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Warn) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_WARNING, "One", "");
+  EXPECT_EQ("WARNING: One\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Error) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Two");
+  EXPECT_EQ("ERROR: Two\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, UnknownSeverity) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(static_cast<LogSeverity>(-1), "Three");
+  EXPECT_EQ("<Unknown severity>: Three\n",
+            testing::internal::GetCapturedStderr());
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 663ee38280ed4d65d9dafb8353dd4746c6da6292..e333138fb78e81316e29a7c37b1fba2df7b1408a 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 #include "tensorflow/lite/model.h"
@@ -245,11 +246,11 @@ class MallocDataAllocator : public BuiltinDataAllocator {
 
 TfLiteStatus InterpreterBuilder::ParseNodes(
     const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
-    Interpreter* interpreter) {
+    Subgraph* subgraph) {
   TfLiteStatus status = kTfLiteOk;
 
   // Reduce the number of redundant allocations
-  interpreter->ReserveNodes(operators->Length());
+  subgraph->ReserveNodes(operators->Length());
 
   for (int i = 0; i < operators->Length(); ++i) {
     const auto* op = operators->Get(i);
@@ -279,7 +280,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
     }
 
     if (op->custom_options()) {
-      interpreter->AddNodeWithParameters(
+      subgraph->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
           FlatBufferIntArrayToVector(op->outputs()),
           reinterpret_cast<const char*>(op->custom_options()->data()),
@@ -289,24 +290,73 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
       MallocDataAllocator malloc_allocator;
       TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_,
                                         &malloc_allocator, &builtin_data));
-      interpreter->AddNodeWithParameters(
-          FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
-          registration);
+      subgraph->AddNodeWithParameters(FlatBufferIntArrayToVector(op->inputs()),
+                                      FlatBufferIntArrayToVector(op->outputs()),
+                                      nullptr, 0, builtin_data, registration);
     }
   }
 
   return status;
 }
 
+TfLiteStatus InterpreterBuilder::ParseQuantization(
+    const QuantizationParameters* src_quantization,
+    TfLiteQuantization* quantization) {
+  quantization->type = kTfLiteNoQuantization;
+  if (!src_quantization || !src_quantization->scale() ||
+      src_quantization->scale()->size() == 0) {
+    return kTfLiteOk;
+  }
+  if (!src_quantization->zero_point()) {
+    error_reporter_->Report(
+        "Quantization parameters has non-null scale but null zero_point.");
+    return kTfLiteError;
+  }
+
+  // Ensure that the number of scales matches the number of zero_points.
+  if (src_quantization->scale()->size() !=
+      src_quantization->zero_point()->size()) {
+    error_reporter_->Report(
+        "QuantizationParam has %d zero_point values and %d scale values. Must "
+        "have same number.",
+        src_quantization->zero_point()->size(),
+        src_quantization->scale()->size());
+    return kTfLiteError;
+  }
+
+  // Affine-quantization.
+  quantization->type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  const size_t num_scales = src_quantization->scale()->size();
+  affine_quantization->scale = TfLiteFloatArrayCreate(num_scales);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(num_scales);
+  for (size_t i = 0; i < num_scales; ++i) {
+    affine_quantization->scale->data[i] = src_quantization->scale()->Get(i);
+    affine_quantization->zero_point->data[i] =
+        src_quantization->zero_point()->Get(i);
+  }
+  if (src_quantization->quantized_dimension() < 0 ||
+      src_quantization->quantized_dimension() >= num_scales) {
+    error_reporter_->Report(
+        "quantized_dimension must be in range [0, %d). Was %d.", num_scales,
+        src_quantization->quantized_dimension());
+    return kTfLiteError;
+  }
+  affine_quantization->quantized_dimension =
+      src_quantization->quantized_dimension();
+  quantization->params = reinterpret_cast<void*>(affine_quantization);
+  return kTfLiteOk;
+}
+
 TfLiteStatus InterpreterBuilder::ParseTensors(
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
-    Interpreter* interpreter) {
+    Subgraph* subgraph) {
   TfLiteStatus status = kTfLiteOk;
 
   // A little helper to get the names of inputs and outputs. Note that they
-  // must outlive the interpreter.
+  // must outlive the subgraph.
   auto get_name = [](const tflite::Tensor* t) -> const char* {
     auto name = t->name();
     if (name) return name->c_str();
@@ -317,36 +367,11 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
     const auto* tensor = tensors->Get(i);
     std::vector<int> dims = FlatBufferIntArrayToVector(tensor->shape());
 
-    TfLiteQuantizationParams quantization;
-    quantization.scale = 0;
-    quantization.zero_point = 0;
-    auto* q_params = tensor->quantization();
-    if (q_params) {
-      // Note that the schema could hold per-channel quantization parameters
-      // but we really only support one value for the whole tensor.
-      // TODO(aselle): This breaks as well if these are nullptr's.
-      // TODO(aselle): This assumes non per-channel quantization.
-
-      if (q_params->scale()) {
-        if (q_params->scale()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d scale values (only 1 is supported).",
-              q_params->scale()->size());
-          return kTfLiteError;
-        }
-        quantization.scale = q_params->scale()->Get(0);
-      }
-
-      if (q_params->zero_point()) {
-        if (q_params->zero_point()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d zero_point values"
-              " (only 1 is supported).",
-              q_params->zero_point()->size());
-          return kTfLiteError;
-        }
-        quantization.zero_point = q_params->zero_point()->Get(0);
-      }
+    const auto* src_quantization = tensor->quantization();
+    TfLiteQuantization quantization;
+    if (ParseQuantization(src_quantization, &quantization) != kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
 
     TfLiteType type;
@@ -392,7 +417,7 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
         status = kTfLiteError;
       }
 
-      if (interpreter->SetTensorParametersReadOnly(
+      if (subgraph->SetTensorParametersReadOnly(
               i, type, get_name(tensor), dims, quantization, buffer_ptr,
               buffer_size, allocation_) != kTfLiteOk) {
         error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
@@ -400,9 +425,9 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
         status = kTfLiteError;
       }
     } else {
-      if (interpreter->SetTensorParametersReadWrite(i, type, get_name(tensor),
-                                                    dims, quantization,
-                                                    is_variable) != kTfLiteOk) {
+      if (subgraph->SetTensorParametersReadWrite(i, type, get_name(tensor),
+                                                 dims, quantization,
+                                                 is_variable) != kTfLiteOk) {
         error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
                                 i);
         status = kTfLiteError;
@@ -484,42 +509,56 @@ TfLiteStatus InterpreterBuilder::operator()(
   // Construct interpreter with correct number of tensors and operators.
   auto* subgraphs = model_->subgraphs();
   auto* buffers = model_->buffers();
-  if (subgraphs->size() != 1) {
-    error_reporter_->Report("Only 1 subgraph is currently supported.\n");
-    return cleanup_and_error();
-  }
-  const tflite::SubGraph* subgraph = (*subgraphs)[0];
-  auto operators = subgraph->operators();
-  auto tensors = subgraph->tensors();
-  if (!operators || !tensors || !buffers) {
-    error_reporter_->Report(
-        "Did not get operators, tensors, or buffers in input flat buffer.\n");
+
+  if (subgraphs->size() == 0) {
+    error_reporter_->Report("No subgraph in the model.\n");
     return cleanup_and_error();
   }
+
   interpreter->reset(new Interpreter(error_reporter_));
-  if ((**interpreter).AddTensors(tensors->Length()) != kTfLiteOk) {
-    return cleanup_and_error();
+  (*interpreter)->SetNumThreads(num_threads);
+  if (subgraphs->Length() > 1) {
+    (*interpreter)->AddSubgraphs(subgraphs->Length() - 1);
   }
-  // Set num threads
-  (**interpreter).SetNumThreads(num_threads);
-  // Parse inputs/outputs
-  (**interpreter).SetInputs(FlatBufferIntArrayToVector(subgraph->inputs()));
-  (**interpreter).SetOutputs(FlatBufferIntArrayToVector(subgraph->outputs()));
-
-  // Finally setup nodes and tensors
-  if (ParseNodes(operators, interpreter->get()) != kTfLiteOk)
-    return cleanup_and_error();
-  if (ParseTensors(buffers, tensors, interpreter->get()) != kTfLiteOk)
-    return cleanup_and_error();
 
-  std::vector<int> variables;
-  for (int i = 0; i < (*interpreter)->tensors_size(); ++i) {
-    auto* tensor = (*interpreter)->tensor(i);
-    if (tensor->is_variable) {
-      variables.push_back(i);
+  for (int subgraph_index = 0; subgraph_index < subgraphs->Length();
+       ++subgraph_index) {
+    const tflite::SubGraph* subgraph = (*subgraphs)[subgraph_index];
+    tflite::Subgraph* modified_subgraph =
+        (*interpreter)->subgraph(subgraph_index);
+    auto operators = subgraph->operators();
+    auto tensors = subgraph->tensors();
+    if (!operators || !tensors || !buffers) {
+      error_reporter_->Report(
+          "Did not get operators, tensors, or buffers in subgraph %d.\n",
+          subgraph_index);
+      return cleanup_and_error();
+    }
+    if (modified_subgraph->AddTensors(tensors->Length()) != kTfLiteOk) {
+      return cleanup_and_error();
+    }
+    // Set num threads
+    // Parse inputs/outputs
+    modified_subgraph->SetInputs(
+        FlatBufferIntArrayToVector(subgraph->inputs()));
+    modified_subgraph->SetOutputs(
+        FlatBufferIntArrayToVector(subgraph->outputs()));
+
+    // Finally setup nodes and tensors
+    if (ParseNodes(operators, modified_subgraph) != kTfLiteOk)
+      return cleanup_and_error();
+    if (ParseTensors(buffers, tensors, modified_subgraph) != kTfLiteOk)
+      return cleanup_and_error();
+
+    std::vector<int> variables;
+    for (int i = 0; i < modified_subgraph->tensors_size(); ++i) {
+      auto* tensor = modified_subgraph->tensor(i);
+      if (tensor->is_variable) {
+        variables.push_back(i);
+      }
     }
+    modified_subgraph->SetVariables(std::move(variables));
   }
-  (**interpreter).SetVariables(std::move(variables));
 
   if (ApplyDelegates(interpreter->get()) != kTfLiteOk)
     return cleanup_and_error();
diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h
index 069cefabf91ceceaa6da79fdc8ebbdb31cf9a6d3..bae4229cbab672397392349437e3c1e7e871c5d9 100644
--- a/tensorflow/lite/model.h
+++ b/tensorflow/lite/model.h
@@ -35,6 +35,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_MODEL_H_
 
 #include <memory>
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/interpreter.h"
@@ -197,12 +198,14 @@ class InterpreterBuilder {
   TfLiteStatus BuildLocalIndexToRegistrationMapping();
   TfLiteStatus ParseNodes(
       const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
-      Interpreter* interpreter);
+      Subgraph* subgraph);
   TfLiteStatus ParseTensors(
       const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
       const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
-      Interpreter* interpreter);
+      Subgraph* subgraph);
   TfLiteStatus ApplyDelegates(Interpreter* interpreter);
+  TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization,
+                                 TfLiteQuantization* quantization);
 
   const ::tflite::Model* model_;
   const OpResolver& op_resolver_;
diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc
index e677ea94a71b979a01fd4b56e331d592cef76cd5..d58dbf4d45fefcf787b3349426d794146ac84e52 100644
--- a/tensorflow/lite/model_test.cc
+++ b/tensorflow/lite/model_test.cc
@@ -87,20 +87,21 @@ TEST(BasicFlatBufferModel, TestEmptyModelsAndNullDestination) {
 
 // Make sure currently unsupported # of subgraphs are checked
 // TODO(aselle): Replace this test when multiple subgraphs are supported.
-TEST(BasicFlatBufferModel, TestZeroAndMultipleSubgraphs) {
-  auto m1 = FlatBufferModel::BuildFromFile(
+TEST(BasicFlatBufferModel, TestZeroSubgraphs) {
+  auto m = FlatBufferModel::BuildFromFile(
       "tensorflow/lite/testdata/0_subgraphs.bin");
-  ASSERT_TRUE(m1);
-  std::unique_ptr<Interpreter> interpreter1;
-  ASSERT_NE(InterpreterBuilder(*m1, TrivialResolver())(&interpreter1),
-            kTfLiteOk);
+  ASSERT_TRUE(m);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_NE(InterpreterBuilder(*m, TrivialResolver())(&interpreter), kTfLiteOk);
+}
 
-  auto m2 = FlatBufferModel::BuildFromFile(
+TEST(BasicFlatBufferModel, TestMultipleSubgraphs) {
+  auto m = FlatBufferModel::BuildFromFile(
       "tensorflow/lite/testdata/2_subgraphs.bin");
-  ASSERT_TRUE(m2);
-  std::unique_ptr<Interpreter> interpreter2;
-  ASSERT_NE(InterpreterBuilder(*m2, TrivialResolver())(&interpreter2),
-            kTfLiteOk);
+  ASSERT_TRUE(m);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*m, TrivialResolver())(&interpreter), kTfLiteOk);
+  EXPECT_EQ(interpreter->subgraphs_size(), 2);
 }
 
 // Test what happens if we cannot bind any of the ops.
@@ -115,7 +116,7 @@ TEST(BasicFlatBufferModel, TestModelWithoutNullRegistrations) {
   ASSERT_EQ(interpreter, nullptr);
 }
 
-// Make sure model is read to interpreter propelrly
+// Make sure model is read to interpreter properly
 TEST(BasicFlatBufferModel, TestModelInInterpreter) {
   auto model = FlatBufferModel::BuildFromFile(
       "tensorflow/lite/testdata/test_model.bin");
diff --git a/tensorflow/lite/models/smartreply/BUILD b/tensorflow/lite/models/smartreply/BUILD
index 9b2f0da64c9c47d649216d64e13d99c6a7541aad..100e1d36b2bbf4679cacad3ef0e285d92bc5d467 100644
--- a/tensorflow/lite/models/smartreply/BUILD
+++ b/tensorflow/lite/models/smartreply/BUILD
@@ -50,6 +50,7 @@ cc_library(
     ],
 )
 
+# TODO(b/118895218): Make this test compatible with oss.
 tf_cc_test(
     name = "predictor_test",
     srcs = ["predictor_test.cc"],
@@ -57,6 +58,7 @@ tf_cc_test(
         "//tensorflow/lite/models:testdata/smartreply_samples.tsv",
         "@tflite_smartreply//:smartreply.tflite",
     ],
+    tags = ["no_oss"],
     deps = [
         ":predictor_lib",
         "//tensorflow/core:test",
@@ -67,10 +69,11 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "extract_feature_op_test",
     size = "small",
     srcs = ["ops/extract_feature_test.cc"],
+    tags = ["no_oss"],
     deps = [
         ":custom_ops",
         "//tensorflow/lite:framework",
@@ -81,10 +84,11 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "normalize_op_test",
     size = "small",
     srcs = ["ops/normalize_test.cc"],
+    tags = ["no_oss"],
     deps = [
         ":custom_ops",
         "//tensorflow/lite:framework",
@@ -95,10 +99,11 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+cc_test(
     name = "predict_op_test",
     size = "small",
     srcs = ["ops/predict_test.cc"],
+    tags = ["no_oss"],
     deps = [
         ":custom_ops",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/models/speech_test.cc b/tensorflow/lite/models/speech_test.cc
index f3509d1ecedfca4eea58343abdef00188bca5cff..a3713c55312cb7cb6526b7e82606cb949e5c2af4 100644
--- a/tensorflow/lite/models/speech_test.cc
+++ b/tensorflow/lite/models/speech_test.cc
@@ -139,7 +139,7 @@ TEST_P(SpeechTest, DISABLED_SpeakerIdOkGoogleTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, DISABLED_AsrAmTest) {
+TEST_P(SpeechTest, AsrAmTest) {
   std::stringstream os;
   ASSERT_TRUE(
       ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
@@ -152,6 +152,19 @@ TEST_P(SpeechTest, DISABLED_AsrAmTest) {
       << test_driver.GetErrorMessage();
 }
 
+TEST_P(SpeechTest, AsrAmQuantizedTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_asr_am_model_int8.tflite", "speech_asr_am_model_in.csv",
+      "speech_asr_am_model_int8_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"104",
+      /*persistent_tensors=*/"18,19,38,39,58,59,78,79,98,99",
+      /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
+      << test_driver.GetErrorMessage();
+}
+
 // The original version of speech_asr_lm_model_test.cc ran a few sequences
 // through the interpreter and stored the sum of all the output, which was them
 // compared for correctness. In this test we are comparing all the intermediate
diff --git a/tensorflow/lite/models/testdata/g3doc/README.md b/tensorflow/lite/models/testdata/g3doc/README.md
index 2a4f1c143a21722945e8e396b81bd23e3312e87e..afe5f16b383b26efd7aab866c3215a8d2a203f4c 100644
--- a/tensorflow/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/lite/models/testdata/g3doc/README.md
@@ -3,6 +3,42 @@
 Sample test data has been provided for speech related models in Tensorflow Lite
 to help users working with speech models to verify and test their models.
 
+### Models and Inputs and Outputs:
+
+[ASR AM model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model.tflite)
+
+[ASR AM quantized model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8.tflite)
+
+[ASR AM test inputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_in.csv)
+
+[ASR AM test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_out.csv)
+
+[ASR AM int8 test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8_out.csv)
+
+The models below are not maintained.
+
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+
+### Test Bench
+
+[Model tests](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_test.cc)
+
+Download the ASR AM test models and inputs and output files to the
+models/testdata directory to run the tests.
+
+
+## Speech Model Architectures
+
 For the hotword, speaker-id and automatic speech recognition sample models, the
 architecture assumes that the models receive their input from a speech
 pre-processing module. The speech pre-processing module receives the audio
@@ -87,57 +123,3 @@ The model consists of a convolutional layer, followed by a fully-connected
 layer, two LSTM layers, and two additional fully-connected layers.
 The corresponding parameters as shown in the figure.
 ![endpointer_model](endpointer.svg "Endpointer model")
-
-
-## Speech models test input/output generation
-
-As mentioned above the input to models are generated from a pre-processing
-module (output of a log-mel filterbank, or linguistic features), and the outputs
-are generated by running the equivalent TensorFlow model by feeding them the
-same input.
-
-## Link to the open source code
-
-### Models:
-
-[Speech hotword model (Svdf
-rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
-
-[Speech hotword model (Svdf
-rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
-
-[Speaker-id
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
-
-[TTS
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
-
-[ASR AM
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
-
-### Test benches
-
-[Speech hotword model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_hotword_model_test.cc)
-
-[Speaker-id model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_speakerid_model_test.cc)
-
-[TTS model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_tts_model_test.cc)
-
-[ASR AM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_am_model_test.cc)
-
-[ASR LM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_lm_model_test.cc)
-
-[Endpointer model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_endpointer_model_test.cc)
-
-## Android Support
-The models have been tested on Android phones, using the following tests:
-
-[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=25)
-
-[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD
index bd3a8a69af66a1cd2c043e76779db7d3d26cc17e..662754d59bfcd23e9164754c473cb3129a055852 100644
--- a/tensorflow/lite/nnapi/BUILD
+++ b/tensorflow/lite/nnapi/BUILD
@@ -12,3 +12,40 @@ cc_library(
     ],
     linkopts = ["-ldl"],
 )
+
+cc_library(
+    name = "nnapi_implementation",
+    srcs = select({
+        "//tensorflow:ios": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//tensorflow:windows": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//conditions:default": [
+            "nnapi_implementation.cc",
+        ],
+    }),
+    hdrs = [
+        "nnapi_implementation.h",
+    ],
+    linkopts = ["-ldl"] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:macos": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lrt"],
+    }),
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_lib",
+    ],
+)
+
+cc_test(
+    name = "nnapi_implementation_test",
+    srcs = ["nnapi_implementation_test.cc"],
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_implementation",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/nnapi/NeuralNetworksShim.h b/tensorflow/lite/nnapi/NeuralNetworksShim.h
index 2ce6e50de6ef4c2b530ef6239f5dde94e68988cb..c48528fa2da5e7992beb9b029d2d112a8e48ba4c 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksShim.h
@@ -22,6 +22,11 @@ limitations under the License.
 
 #include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
 
+// This interface is now deprecated. You should use instead
+// nnapi_implementation.
+
+// TODO(b/123017568): Update all current usages of this file.
+
 // helpers
 
 #define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
@@ -54,7 +59,8 @@ inline int ASharedMemory_create(const char* name, size_t size) {
       handle != nullptr ? reinterpret_cast<ASharedMemory_create_fn>(
                               dlsym(handle, "ASharedMemory_create"))
                         : nullptr;
-  return fn(name, size);
+  int fd = fn != nullptr ? fn(name, size) : -1;
+  return fd;
 }
 
 inline void* getLibraryHandle() {
@@ -250,6 +256,32 @@ inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model,
   EXECUTE_FUNCTION_RETURN(model, index, buffer, length);
 }
 
+/**
+ * Sets an operand's per channel quantization parameters.
+ *
+ * Sets parameters required by a tensor of type
+ * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}.
+ * This function must be called for every tensor of type
+ * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} before
+ * calling {@link ANeuralNetworksModel_finish}.
+ *
+ * Available since API level 29.
+ *
+ * @param model The model to be modified.
+ * @param index The index of the model operand we're setting.
+ * @param channelQuant The per channel quantization parameters for the operand.
+ *                    No memory in this struct needs to outlive the call to
+ *                    this function.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) {
+  LOAD_FUNCTION(ANeuralNetworksModel_setOperandSymmPerChannelQuantParams);
+  EXECUTE_FUNCTION_RETURN(model, index, channelQuant);
+}
+
 /**
  * Sets an operand to a value stored in a memory object.
  *
@@ -681,6 +713,445 @@ inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
   EXECUTE_FUNCTION(event);
 }
 
+/**
+ * Get the number of available devices.
+ *
+ * @param numDevices Used to return the number of devices.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworks_getDeviceCount(uint32_t* numDevices) {
+  LOAD_FUNCTION(ANeuralNetworks_getDeviceCount);
+  EXECUTE_FUNCTION_RETURN(numDevices);
+}
+
+/**
+ * Get the representation of the specified device.
+ *
+ * @param devIndex The index of the specified device. Must be less than the
+ *                 number of available devices.
+ * @param device The representation of the specified device.
+ *               The same representation will always be returned for the
+ *               specified device.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+
+inline int ANeuralNetworks_getDevice(uint32_t devIndex,
+                                     ANeuralNetworksDevice** device) {
+  LOAD_FUNCTION(ANeuralNetworks_getDevice);
+  EXECUTE_FUNCTION_RETURN(devIndex, device);
+}
+
+/**
+ * Get the name of the specified device.
+ *
+ * @param device The representation of the specified device.
+ * @param name   The returned name of the specified device. The name will be in
+ *               UTF-8 and will be null-terminated. It will be recognizable as a
+ *               known device name rather than a cryptic string. For devices
+ *               with API level 29 and above, the format of the name is
+ *               {VENDOR}-{DEVICE}, e.g. “google-ipu”. For devices with feature
+ *               level 28 or lower, the name will always be “unknown-device”.
+ *               The name will remain valid for the duration of the application.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getName(const ANeuralNetworksDevice* device,
+                                         const char** name) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getName);
+  EXECUTE_FUNCTION_RETURN(device, name);
+}
+
+/**
+ * Get the version of the driver implementation of the specified device.
+ *
+ * It’s the responsibility of the driver implementor to insure that this version
+ * string uniquely distinguishes this implementation from all previous
+ * implementations.
+ *
+ * This version string must not be confused with the feature level which is
+ * solely defined by {@link ANeuralNetworksDevice_getFeatureLevel}. There is no
+ * implicit ordering of the versions. For example, it is not possible to filter
+ * all drivers older than a certain version.
+ *
+ * Application developers may use this version string to avoid or prefer
+ * specific driver implementations. For example, an application may want to do
+ * so because:
+ *     - A specific version of the driver does not provide the required
+ * performance, perhaps because of a performance regression.
+ *     - A specific version of the driver has a bug or returns results that
+ * don’t match the minimum precision requirement for the application.
+ *
+ * @param device  The representation of the specified device.
+ * @param version The returned version string of the driver for the specified
+ *                device. The string will be in UTF-8 and will be
+ *                null-terminated. For devices with feature level 28 or lower,
+ *                "UNKNOWN" will be returned. The version string will remain
+ *                valid for the duration of the application.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getVersion(const ANeuralNetworksDevice* device,
+                                            const char** version) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getVersion);
+  EXECUTE_FUNCTION_RETURN(device, version);
+}
+
+/**
+ * Get the supported NNAPI version of the specified device.
+ *
+ * Each device has a supported feature level, which is the most advanced feature
+ * this driver implements. For example, if the driver implements the features
+ * introduced in Android P, but does not implement the features introduced after
+ * Android P, the value would be 28. Developers could decide whether or not the
+ * specified device should be used for a Model that has certain feature
+ * requirements.
+ *
+ * @param device       The representation of the specified device.
+ * @param featureLevel The API level of the most advanced feature this driver
+ *                     implements.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getFeatureLevel(
+    const ANeuralNetworksDevice* device, int64_t* featureLevel) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getFeatureLevel);
+  EXECUTE_FUNCTION_RETURN(device, featureLevel);
+}
+
+/**
+ * Get the supported operations for a specified set of devices. If multiple
+ * devices are selected, the supported operation list is a union of supported
+ * operations of all selected devices.
+ *
+ * @param model        The model to be queried.
+ * @param devices      The set of devices. Must not contain duplicates.
+ * @param numDevices   The number of devices in the set.
+ * @param supportedOps The boolean array to be filled. True means supported. The
+ *                     size of the boolean array must be at least as large as
+ *                     the number of operations in the model. The order of
+ *                     elements in the supportedOps array matches the order in
+ *                     which the corresponding operations were added to the
+ *                     model.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksModel_getSupportedOperationsForDevices(
+    const ANeuralNetworksModel* model,
+    const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+    bool* supportedOps) {
+  LOAD_FUNCTION(ANeuralNetworksModel_getSupportedOperationsForDevices);
+  EXECUTE_FUNCTION_RETURN(model, devices, numDevices, supportedOps);
+}
+
+/**
+ * Create a {@link ANeuralNetworksCompilation} to compile the given model for a
+ * specified set of devices. If more than one device is specified, the
+ * compilation will distribute the workload automatically across the devices.
+ * The model must be fully supported by the specified set of devices. This means
+ * that ANeuralNetworksModel_getSupportedOperationsForDevices() must have
+ * returned true for every operation for that model/devices pair.
+ *
+ * @param model       The {@link ANeuralNetworksModel} to be compiled.
+ * @param devices     The set of devices. Must not contain duplicates.
+ * @param numDevices  The number of devices in the set.
+ * @param compilation The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the model is invalid.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksCompilation_createForDevices(
+    ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+    uint32_t numDevices, ANeuralNetworksCompilation** compilation) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_createForDevices);
+  EXECUTE_FUNCTION_RETURN(model, devices, numDevices, compilation);
+}
+
+/**
+ * Sets the compilation caching signature and the cache directory.
+ *
+ * Provides optional caching information to the runtime for faster repeated
+ * compilation.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @param compilation The compilation to be modified.
+ * @param cacheDir The cache directory to store and retrieve caching data. It is
+ *                 recommended to use the code_cache provided by the Android
+ *                 runtime. If not using the code_cache, the user should choose
+ *                 a directory local to the application, and is responsible to
+ *                 manage and clean the cache entries.
+ * @param token The token provided by the user to specify a model, must be of
+ *              length ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user should
+ *              ensure that the token is unique to a model within the
+ *              application. The NNAPI runtime will not detected token
+ *              collisions. If there is a collision, the compilation outcome may
+ *              be incorrect without notifying with error.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksCompilation_setCaching(
+    ANeuralNetworksCompilation* compilation, const char* cacheDir,
+    const uint8_t* token) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_setCaching);
+  EXECUTE_FUNCTION_RETURN(compilation, cacheDir, token);
+}
+
+/**
+ * Schedule synchronous evaluation of the execution.
+ *
+ * <p>Schedules synchronous evaluation of the execution. Returns once the
+ * execution has completed and the outputs are ready to be consumed.
+ * </p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * See {@link ANeuralNetworksExecution_startCompute} for asynchronous execution.
+ * Synchronous execution incurs lower overhead than asynchronous execution.
+ *
+ * Available since API level 29.
+ *
+ * @param execution The execution to be scheduled and executed.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+ *         ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory
+ *         cannot be properly mapped.
+ */
+inline int ANeuralNetworksExecution_compute(
+    ANeuralNetworksExecution* execution) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_compute);
+  EXECUTE_FUNCTION_RETURN(execution);
+}
+
+/**
+ * Get the dimensional information of the specified output operand of the model
+ * of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * On asynchronous execution initiated by {@link
+ * ANeuralNetworksExecution_startCompute},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+ * recuperate the resources used by the execution.
+ *
+ * @param execution The execution to be queried.
+ * @param index The index of the output argument we are querying. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param rank The rank of the output operand.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful,
+ * ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is provided an
+ * insufficient buffer at execution time, ANEURALNETWORKS_BAD_DATA if the index
+ * is invalid.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksExecution_getOutputOperandRank(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getOutputOperandRank);
+  EXECUTE_FUNCTION_RETURN(execution, index, rank);
+}
+
+/**
+ * Get the dimensional information of the specified output operand of the model
+ * of the
+ * {@link ANeuralNetworksExecution}. The target output operand cannot be a
+ * scalar.
+ *
+ * On asynchronous execution initiated by
+ * {@link ANeuralNetworksExecution_startCompute},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+ * recuperate the resources used by the execution.
+ *
+ * @param execution The execution to be queried.
+ * @param index The index of the output argument we are querying. It is an index
+ *              into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with
+ *              {@link ANeuralNetworksModel_addOperand}.
+ * @param dimensions The dimension array to be filled. The size of the array
+ *                   must be exactly as large as the rank of the output operand
+ *                   to be queried in the model.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful,
+ * ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is provided an
+ * insufficient buffer at execution time, ANEURALNETWORKS_BAD_DATA if the index
+ * is invalid or if the target is a scalar.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksExecution_getOutputOperandDimensions(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getOutputOperandDimensions);
+  EXECUTE_FUNCTION_RETURN(execution, index, dimensions);
+}
+
+/**
+ * Create a {@link ANeuralNetworksBurst} to apply the given compilation.
+ * This only creates the burst object. Computation is only performed once
+ * {@link ANeuralNetworksExecution_burstCompute} is invoked with a valid
+ * {@link ANeuralNetworksExecution} and {@link ANeuralNetworksBurst}.
+ *
+ * <p>The provided compilation must outlive the burst object.</p>
+ *
+ * Available since API level 29.
+ *
+ * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+ * @param burst The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the compilation is invalid.
+ */
+inline int ANeuralNetworksBurst_create(ANeuralNetworksCompilation* compilation,
+                                       ANeuralNetworksBurst** burst) {
+  LOAD_FUNCTION(ANeuralNetworksBurst_create);
+  EXECUTE_FUNCTION_RETURN(compilation, burst);
+}
+
+/**
+ * Destroys the burst object.
+ *
+ * Available since API level 29.
+ *
+ * @param burst The burst object to be destroyed. Passing NULL is acceptable and
+ *              results in no operation.
+ */
+inline void ANeuralNetworksBurst_free(ANeuralNetworksBurst* burst) {
+  LOAD_FUNCTION(ANeuralNetworksBurst_free);
+  EXECUTE_FUNCTION(burst);
+}
+
+/**
+ * Schedule synchronous evaluation of the execution on a burst object.
+ *
+ * <p>Schedules synchronous evaluation of the execution. Returns once the
+ * execution has completed and the outputs are ready to be consumed.</p>
+ *
+ * <p>There must be at most one {@link ANeuralNetworksExecution} processing at
+ * any given time for any given burst object. Any
+ * {@link ANeuralNetworksExecution} launched before the previous has finished
+ * will result in ANEURALNETWORKS_BAD_STATE.</p>
+ *
+ * Available since API level 29.
+ *
+ * @param burst The burst object to execute on.
+ * @param execution The execution to be scheduled and executed. The execution
+ *                  must be created from the same {@link
+ *                  ANeuralNetworksCompilation} as the burst object.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+ */
+inline int ANeuralNetworksExecution_burstCompute(
+    ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_burstCompute);
+  EXECUTE_FUNCTION_RETURN(execution, burst);
+}
+
+/**
+ * Creates a shared memory object from an AHardwareBuffer handle.
+ *
+ * If the shared memory is backed by an AHardwareBuffer of
+ * AHARDWAREBUFFER_FORMAT_BLOB format, it can be used the same way as shared
+ * memory created from a file handle. See
+ * {@link ANeuralNetworksMemory} for a description on how to use this shared
+ * memory.
+ *
+ * If the shared memory is backed by an AHardwareBuffer of a format other than
+ * AHARDWAREBUFFER_FORMAT_BLOB, it can only be used for Model inputs and
+ * outputs. When calling {@link ANeuralNetworksExecution_setInputFromMemory} or
+ * {@link ANeuralNetworksExecution_setOutputFromMemory} with the shared memory,
+ * both offset and length must be set to zero and the entire memory region will
+ * be associated with the specified input or output operand. There is no
+ * guarantee that an arbitrary AHardwareBuffer_Format and
+ * AHardwareBuffer_UsageFlags combination can be used by arbitrary devices. The
+ * execution will fail if selected set of devices cannot consume the buffer.
+ *
+ * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with shared
+ * memory backed by an AHardwareBuffer of a format other than
+ * AHARDWAREBUFFER_FORMAT_BLOB is disallowed.
+ *
+ * TODO(miaowang): add documentation about intended usage with introspection
+ * API.
+ *
+ * Available since API level 29.
+ *
+ * @param ahwb The AHardwareBuffer handle.
+ * @param memory The memory object to be created.
+ *               Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+ *
+ * @see AHardwareBuffer
+ */
+inline int ANeuralNetworksMemory_createFromAHardwareBuffer(
+    const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory) {
+  LOAD_FUNCTION(ANeuralNetworksMemory_createFromAHardwareBuffer);
+  EXECUTE_FUNCTION_RETURN(ahwb, memory);
+}
+
+/**
+ * Specifies whether duration of the {@link ANeuralNetworksExecution} is to be
+ * measured. By default, duration is not measured.
+ *
+ * The {@link ANeuralNetworksExecution} must have been created with
+ * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * Available since API level 29.
+ *
+ * @param execution The execution to be modified.
+ * @param measure 'true' if duration is to be measured, 'false' if not.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksExecution_setMeasureTiming(
+    ANeuralNetworksExecution* execution, bool measure) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setMeasureTiming);
+  EXECUTE_FUNCTION_RETURN(execution, measure);
+}
+
+/**
+ * Get the time spent in the specified {@link ANeuralNetworksExecution}, in
+ * nanoseconds. The execution must have completed.
+ *
+ * @param execution The execution to be queried.
+ * @param durationCode The measurement to be queried, specified by {@link
+ * DurationCode}.
+ * @param duration The returned duration. If no measurement was requested by
+ *                 {@link ANeuralNetworksExecution_setMeasureTiming}, or for
+ * some other reason the duration is not available, UINT64_MAX will be returned.
+ *                 A particular device need not support any given measurement.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksExecution_getDuration(
+    const ANeuralNetworksExecution* execution, int32_t durationCode,
+    uint64_t* duration) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getDuration);
+  EXECUTE_FUNCTION_RETURN(execution, durationCode, duration);
+}
+
 /**/
 
 #endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
index de8b84a8234340cda3c1ae5942c1863a09e3c228..109c6b001438a7963d13acaaa6da455b02cdac27 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -18,6 +18,8 @@ limitations under the License.
 #include <stdint.h>
 #include <stdio.h>
 
+typedef struct AHardwareBuffer AHardwareBuffer;
+
 // NN api types based on NNAPI header file
 // https://developer.android.com/ndk/reference/group/neural-networks
 
@@ -37,6 +39,7 @@ enum {
   ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
   ANEURALNETWORKS_TENSOR_INT32 = 4,
   ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
+  ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
 };
 
 /**
@@ -115,8 +118,10 @@ enum {
   ANEURALNETWORKS_UNEXPECTED_NULL = 3,
   ANEURALNETWORKS_BAD_DATA = 4,
   ANEURALNETWORKS_OP_FAILED = 5,
-  ANEURALNETWORKS_UNMAPPABLE = 5,
   ANEURALNETWORKS_BAD_STATE = 6,
+  ANEURALNETWORKS_UNMAPPABLE = 7,
+  ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8,
+  ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
 };
 
 /**
@@ -127,6 +132,25 @@ enum {
   ANEURALNETWORKS_PADDING_VALID = 2,
 };
 
+/**
+ * Device types.
+ *
+ * The type of NNAPI device.
+ */
+enum {
+  /** The device type cannot be provided. */
+  ANEURALNETWORKS_DEVICE_UNKNOWN = 0,
+  /** The device does not fall into any category below. */
+  ANEURALNETWORKS_DEVICE_OTHER = 1,
+  /** The device runs NNAPI models on single or multi-core CPU. */
+  ANEURALNETWORKS_DEVICE_CPU = 2,
+  /** The device can run NNAPI models and also accelerate graphics APIs such
+   * as OpenGL ES and Vulkan. */
+  ANEURALNETWORKS_DEVICE_GPU = 3,
+  /** Dedicated accelerator for Machine Learning workloads. */
+  ANEURALNETWORKS_DEVICE_ACCELERATOR = 4,
+};
+
 /**
  * ANeuralNetworksMemory is an opaque type that represents memory.
  *
@@ -239,6 +263,53 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
  */
 typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
 
+/**
+ * Parameters for ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL operand.
+ */
+typedef struct ANeuralNetworksSymmPerChannelQuantParams {
+  /* The index of the channel dimension. */
+  uint32_t channelDim;
+  /** The size of the scale array. Should be equal to dimension[channelDim] of
+   * the Operand. */
+  uint32_t scaleCount;
+  /** The array of scaling values for each channel. Each value must be greater
+   * than zero. */
+  const float* scales;
+} ANeuralNetworksSymmPerChannelQuantParams;
+
+/**
+ * ANeuralNetworksBurst is an opaque type that can be used to reduce the latency
+ * of a rapid sequence of executions. It will likely cause overhead if only used
+ * for a single execution.
+ *
+ * ANeuralNetworksBurst serves as a context object for any number of inferences
+ * using {@link ANeuralNetworksExecution} objects. An ANeuralNetworksBurst
+ * object and the {@link ANeuralNetworksExecution} objects used with it must all
+ * have been created from the same {@link ANeuralNetworksCompilation} object.
+ *
+ * This object is also used as a hint to drivers, providing insight to the
+ * lifetime of a rapid sequence of executions. For example, a driver may choose
+ * to increase the clock frequency of its accelerator for the lifetime of a
+ * burst object.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new burst object by calling the
+ *        {@link ANeuralNetworksBurst_create} function.</li>
+ *    <li>For each execution:</li><ul>
+ *        <li>Create {@link ANeuralNetworksExecution} and configure its
+ *            properties (see {@link ANeuralNetworksExecution} for
+ * details).</li> <li>Apply the model synchronously with
+ *            {@link ANeuralNetworksExecution_burstCompute}, reusing the same
+ *            {@link ANeuralNetworksBurst} with the new
+ *            {@link ANeuralNetworksExecution}.</li>
+ *        <li>Use and free the {@link ANeuralNetworksExecution}.</li></ul>
+ *    <li>Destroy the burst with
+ *        {@link ANeuralNetworksBurst_free}.</li></ul></p>
+ *
+ * Available since API level 29.
+ */
+typedef struct ANeuralNetworksBurst ANeuralNetworksBurst;
+
 /**
  * ANeuralNetworksOperandType describes the type of an operand.
  * This structure is used to describe both scalars and tensors.
@@ -266,6 +337,16 @@ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
 
 typedef int32_t ANeuralNetworksOperationType;
 
+/**
+ * ANeuralNetworksDevice is an opaque type that represents a device.
+ *
+ * This type is used to query basic properties and supported operations of the
+ * corresponding device, and control which device(s) a model is to be run on.
+ *
+ * Available since API level 29.
+ */
+typedef struct ANeuralNetworksDevice ANeuralNetworksDevice;
+
 // nn api function types
 
 typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
@@ -299,6 +380,10 @@ typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
     ANeuralNetworksModel* model, int32_t index, const void* buffer,
     size_t length);
 
+typedef int (*ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_fn)(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
+
 typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
     ANeuralNetworksModel* model, int32_t index,
     const ANeuralNetworksMemory* memory, size_t offset, size_t length);
@@ -349,4 +434,70 @@ typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
 
 typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
 
+typedef int (*ANeuralNetworks_getDeviceCount_fn)(uint32_t* numDevices);
+
+typedef int (*ANeuralNetworks_getDevice_fn)(uint32_t devIndex,
+                                            ANeuralNetworksDevice** device);
+
+typedef int (*ANeuralNetworksDevice_getName_fn)(
+    const ANeuralNetworksDevice* device, const char** name);
+
+typedef int (*ANeuralNetworksDevice_getType_fn)(
+    const ANeuralNetworksDevice* device, int32_t* type);
+
+typedef int (*ANeuralNetworksDevice_getVersion_fn)(
+    const ANeuralNetworksDevice* device, const char** version);
+
+typedef int (*ANeuralNetworksDevice_getFeatureLevel_fn)(
+    const ANeuralNetworksDevice* device, int64_t* featureLevel);
+
+typedef int (*ANeuralNetworksModel_getSupportedOperationsForDevices_fn)(
+    const ANeuralNetworksModel* model,
+    const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+    bool* supportedOps);
+
+typedef int (*ANeuralNetworksCompilation_createForDevices_fn)(
+    ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+    uint32_t numDevices, ANeuralNetworksCompilation** compilation);
+
+typedef int (*ANeuralNetworksCompilation_setCaching_fn)(
+    ANeuralNetworksCompilation* compilation, const char* cacheDir,
+    const uint8_t* token);
+
+typedef int (*ANeuralNetworksExecution_compute_fn)(
+    ANeuralNetworksExecution* execution);
+
+typedef int (*ANeuralNetworksExecution_getOutputOperandRank_fn)(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank);
+
+typedef int (*ANeuralNetworksExecution_getOutputOperandDimensions_fn)(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions);
+
+typedef int (*ANeuralNetworksBurst_create_fn)(
+    ANeuralNetworksCompilation* compilation, ANeuralNetworksBurst** burst);
+
+typedef void (*ANeuralNetworksBurst_free_fn)(ANeuralNetworksBurst* burst);
+
+typedef int (*ANeuralNetworksExecution_burstCompute_fn)(
+    ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
+
+typedef int (*ANeuralNetworksMemory_createFromAHardwareBuffer_fn)(
+    const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
+
+typedef int (*ANeuralNetworksExecution_setMeasureTiming_fn)(
+    ANeuralNetworksExecution* execution, bool measure);
+
+typedef enum {
+  // Execution time on hardware (not driver, which runs on host processor).
+  ANEURALNETWORKS_DURATION_ON_HARDWARE = 0,
+  // Execution time in driver (including time on hardware).  Excludes overhead
+  // such as that of the runtime itself and the IPC needed for the runtime to
+  // communicate with the driver.
+  ANEURALNETWORKS_DURATION_IN_DRIVER = 1,
+} DurationCode;
+
+typedef int (*ANeuralNetworksExecution_getDuration_fn)(
+    const ANeuralNetworksExecution* execution, int32_t durationCode,
+    uint64_t* duration);
+
 #endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3de3ca5946f0f96e8da06ada26b4e0c46d0dce
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.cc
@@ -0,0 +1,203 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdlib>
+
+#ifdef __ANDROID__
+#include <sys/system_properties.h>
+#endif  // __ANDROID__
+
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
+
+namespace {
+
+#ifdef __ANDROID__
+int32_t GetAndroidSdkVersion() {
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    int32_t result = 0;
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher than expected;
+        return 0xffff;
+      }
+      result = result * 10 + digit;
+    }
+    // TODO(levp): remove once SDK gets updated to 29th level
+    // Upgrade SDK version for pre-release Q to be able to test functionality
+    // available from SDK level 29.
+    if (result == 28) {
+      char versionCodename[PROP_VALUE_MAX];
+      const char* versionCodenameProp = "ro.build.version.codename";
+      length = __system_property_get(versionCodenameProp, versionCodename);
+      if (length != 0) {
+        if (versionCodename[0] == 'Q') {
+          return 29;
+        }
+      }
+    }
+    return result;
+  }
+  return 0;
+}
+#endif  // __ANDROID__
+
+void* LoadFunction(void* handle, const char* name, bool optional) {
+  if (handle == nullptr) {
+    return nullptr;
+  }
+  void* fn = dlsym(handle, name);
+  if (fn == nullptr && !optional) {
+    NNAPI_LOG("nnapi error: unable to open function %s", name);
+  }
+  return fn;
+}
+
+#ifndef __ANDROID__
+// Add /dev/shm implementation of shared memory for non-Android platforms
+int ASharedMemory_create(const char* name, size_t size) {
+  int fd = shm_open(name, O_RDWR | O_CREAT, 0644);
+  if (fd < 0) {
+    return fd;
+  }
+  int result = ftruncate(fd, size);
+  if (result < 0) {
+    close(fd);
+    return -1;
+  }
+  return fd;
+}
+#endif  // __ANDROID__
+
+#define LOAD_FUNCTION(handle, name)         \
+  nnapi.name = reinterpret_cast<name##_fn>( \
+      LoadFunction(handle, #name, /*optional*/ false));
+
+#define LOAD_FUNCTION_OPTIONAL(handle, name) \
+  nnapi.name = reinterpret_cast<name##_fn>(  \
+      LoadFunction(handle, #name, /*optional*/ true));
+
+const NnApi LoadNnApi() {
+  NnApi nnapi = {};
+  nnapi.android_sdk_version = 0;
+
+#ifdef __ANDROID__
+  void* libandroid = nullptr;
+  nnapi.android_sdk_version = GetAndroidSdkVersion();
+  if (nnapi.android_sdk_version < 27) {
+    NNAPI_LOG("nnapi error: requires android sdk version to be at least %d",
+              27);
+    nnapi.nnapi_exists = false;
+    return nnapi;
+  }
+  libandroid = dlopen("libandroid.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libandroid == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libandroid.so");
+  }
+#endif  // __ANDROID__
+
+  void* libneuralnetworks = nullptr;
+  // TODO(b/123243014): change RTLD_LOCAL? Assumes there can be multiple
+  // instances of nn api RT
+  libneuralnetworks = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libneuralnetworks == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libneuralnetworks.so");
+  }
+
+  nnapi.nnapi_exists = libneuralnetworks != nullptr;
+
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_createFromFd);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperand);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_setOperandValue);
+  LOAD_FUNCTION_OPTIONAL(
+      libneuralnetworks,
+      ANeuralNetworksModel_setOperandSymmPerChannelQuantParams);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_setOperandValueFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperation);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_identifyInputsAndOutputs);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_relaxComputationFloat32toFloat16);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_setPreference);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInput);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setOutput);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksExecution_setOutputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_startCompute);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_wait);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_free);
+#ifdef __ANDROID__
+  LOAD_FUNCTION(libandroid, ASharedMemory_create);
+#else
+  nnapi.ASharedMemory_create = ASharedMemory_create;
+#endif  // __ANDROID__
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworks_getDeviceCount);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworks_getDevice);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getName);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getVersion);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksDevice_getFeatureLevel);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getType);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksModel_getSupportedOperationsForDevices);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksCompilation_createForDevices);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksCompilation_setCaching);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksExecution_compute);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getOutputOperandRank);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getOutputOperandDimensions);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksBurst_create);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksBurst_free);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_burstCompute);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksMemory_createFromAHardwareBuffer);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_setMeasureTiming);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getDuration);
+  return nnapi;
+}
+
+}  // namespace
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = LoadNnApi();
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..b42c189d523dd9f5a5d014dc318d3e8b46936fde
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.h
@@ -0,0 +1,1017 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+#define TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+struct NnApi {
+  bool nnapi_exists;
+  int32_t android_sdk_version;
+
+  /**
+   * Creates a shared memory object from a file descriptor.
+   *
+   * The shared memory is backed by a file descriptor via mmap.
+   * See {@link ANeuralNetworksMemory} for a description on how to use
+   * this shared memory.
+   *
+   * @param size The requested size in bytes.
+   *             Must not be larger than the file size.
+   * @param prot The desired memory protection for the mapping.
+   *             It is either PROT_NONE or the bitwise OR of one or
+   *             more of the following flags: PROT_READ, PROT_WRITE.
+   * @param fd The requested file descriptor.
+   *           The file descriptor has to be mmap-able. The file
+   *           descriptor will be duplicated.
+   * @param offset The offset to the beginning of the file of the area to map.
+   *               The offset has to be aligned to a page size.
+   * @param memory The memory object to be created.
+   *               Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+   */
+  int (*ANeuralNetworksMemory_createFromFd)(size_t size, int protect, int fd,
+                                            size_t offset,
+                                            ANeuralNetworksMemory** memory);
+
+  /**
+   * Delete a memory object.
+   *
+   * Destroys the object used by the run time to keep track of the memory.
+   * This will free the underlying actual memory if no other code has open
+   * handles to this memory.
+   *
+   * @param memory The memory object to be freed.
+   */
+  void (*ANeuralNetworksMemory_free)(ANeuralNetworksMemory* memory);
+
+  /**
+   * Create an empty {@link ANeuralNetworksModel}.
+   *
+   * <p>This only creates the object. Computation is performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * The model should be constructed with calls to
+   * {@link ANeuralNetworksModel_addOperation} and
+   * {@link ANeuralNetworksModel_addOperand}
+   *
+   * <p>{@link ANeuralNetworksModel_finish} should be called once the model
+   * has been fully constructed.</p>
+   *
+   * <p>{@link ANeuralNetworksModel_free} should be called once the model
+   * is no longer needed.</p>
+   *
+   * @param model The {@link ANeuralNetworksModel} to be created.
+   *              Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_create)(ANeuralNetworksModel** model);
+
+  /**
+   * Destroy a model.
+   *
+   * The model need not have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be destroyed. Passing NULL is acceptable and
+   *              results in no operation.
+   */
+  void (*ANeuralNetworksModel_free)(ANeuralNetworksModel* model);
+
+  /**
+   * Indicate that we have finished modifying a model. Required before
+   * calling {@link ANeuralNetworksCompilation_compile}.
+   *
+   * An application is responsible to make sure that no other thread uses
+   * the model at the same time.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be finished.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_finish)(ANeuralNetworksModel* model);
+
+  /**
+   * Add an operand to a model.
+   *
+   * The order in which the operands are added is important. The first one added
+   * to a model will have the index value 0, the second 1, etc. These indexes
+   * are used as operand identifiers in
+   * {@link ANeuralNetworksModel_addOperation},
+   * {@link ANeuralNetworksExecution_setInput},
+   * {@link ANeuralNetworksExecution_setInputFromMemory},
+   * {@link ANeuralNetworksExecution_setOutput},
+   * {@link ANeuralNetworksExecution_setOutputFromMemory} and
+   * {@link ANeuralNetworksExecution_setOperandValue}.
+   *
+   * To build a model that can accommodate inputs of various sizes, as you may
+   * want to do for a CNN, set the size of the dimensions that will vary at run
+   * time to 0. If you do so, provide the full dimensions when calling
+   * {@link ANeuralNetworksExecution_setInput} or {@link
+   * ANeuralNetworksExecution_setInputFromMemory}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param type The {@link ANeuralNetworksOperandType} that describes the shape
+   * of the operand.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperand)(
+      ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+  /**
+   * Sets an operand to a constant value.
+   *
+   * For scalar values, the content of buffer is copied into the model.
+   *
+   * For tensor values, a pointer to the buffer is stored within the model.
+   * The application is responsible for not changing the content of this region
+   * until all executions using this model have completed. As the data may
+   * be copied during processing, modifying the data after this call yields
+   * undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValue)(ANeuralNetworksModel* model,
+                                              int32_t index, const void* buffer,
+                                              size_t length);
+
+  /**
+   * Sets an operand's per channel quantization parameters.
+   *
+   * Sets parameters required by a tensor of type
+   * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}.
+   * This function must be called for every tensor of type
+   * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} before
+   * calling {@link ANeuralNetworksModel_finish}.
+   *
+   * Available since API level 29.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param channelQuant The per channel quantization parameters for the
+   *                     operand. No memory in this struct needs to outlive the
+   *                     call to this function.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandSymmPerChannelQuantParams)(
+      ANeuralNetworksModel* model, int32_t index,
+      const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
+
+  /**
+   * Sets an operand to a value stored in a memory object.
+   *
+   * The content of the memory is not copied. A reference to that memory is
+   * stored inside the model. The application is responsible for not changing
+   * the content of the memory region until all executions using this model have
+   * completed.
+   * As the data may be copied during processing, modifying the data after this
+   * call yields undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValueFromMemory)(
+      ANeuralNetworksModel* model, int32_t index,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Add an operation to a model.
+   *
+   * @param model The model to be modified.
+   * @param type The type of the operation.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying each operand.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying each operand.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperation)(ANeuralNetworksModel* model,
+                                           ANeuralNetworksOperationType type,
+                                           uint32_t inputCount,
+                                           const uint32_t* inputs,
+                                           uint32_t outputCount,
+                                           const uint32_t* outputs);
+
+  /**
+   * Specifies which operands will be the model's inputs and outputs.
+   *
+   * An operand cannot be used for both input and output. Doing so will
+   * return an error.
+   *
+   * @param model The model to be modified.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying the input operands.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying the output operands.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   */
+  int (*ANeuralNetworksModel_identifyInputsAndOutputs)(
+      ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+      uint32_t outputCount, const uint32_t* outputs);
+
+  /**
+   * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be
+   * calculated with range and/or precision as low as that of the
+   * IEEE 754 16-bit floating-point format. By default,
+   * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using at least
+   * the range and precision of the IEEE 754 32-bit floating-point format.
+   *
+   * @param model The model to be modified.
+   * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
+   *              calculated with range and/or precision as low as that of the
+   *              IEEE 754 16-bit floating point format. 'false' indicates
+   *              {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated
+   *              using at least the range and precision of the IEEE 754 32-bit
+   *              floating point format.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * Available since API level 28.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   */
+  int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16)(
+      ANeuralNetworksModel* model, bool allow);
+
+  /**
+   * Create a {@link ANeuralNetworksCompilation} to compile the given model.
+   * This only creates the object. Compilation is only performed once
+   * {@link ANeuralNetworksCompilation_start} is invoked.
+   *
+   * <p>The provided model must outlive the compilation.</p>
+   *
+   * The model must already have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param model The {@link ANeuralNetworksModel} to be compiled.
+   * @param compilation The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the model is invalid.
+   */
+  int (*ANeuralNetworksCompilation_create)(
+      ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+  /**
+   * Destroy a compilation.
+   *
+   * <p>If called on a compilation for which
+   * {@link ANeuralNetworksCompilation_start} has been called, the
+   * function will return immediately but will mark the compilation to be
+   * deleted once the compilation completes. The
+   * {@link ANeuralNetworksCompilation_wait} will return ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be destroyed. Passing NULL is
+   * acceptable and results in no operation.
+   */
+  void (*ANeuralNetworksCompilation_free)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Sets the execution preference.
+   *
+   * <p>Provides guidance to the runtime when trade-offs are possible.</p>
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be modified.
+   * @param preference Either {@link PREFER_LOW_POWER},
+   *                  {@link PREFER_SINGLE_FAST_ANSWER}, or
+   *                  {@link PREFER_SUSTAINED_SPEED}.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksCompilation_setPreference)(
+      ANeuralNetworksCompilation* compilation, int32_t preference);
+
+  /**
+   * Waits until the compilation completes.
+   *
+   * More than one thread can wait on a compilation. When the compilation
+   * completes, all threads will be released.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally.
+   */
+  int (*ANeuralNetworksCompilation_finish)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
+   * This only creates the object. Computation is only performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * <p>The provided compilation must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+   * @param execution The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the compilation is invalid.
+   */
+  int (*ANeuralNetworksExecution_create)(
+      ANeuralNetworksCompilation* compilation,
+      ANeuralNetworksExecution** execution);
+
+  /**
+   * Destroy an execution.
+   *
+   * <p>If called on an execution for which
+   * {@link ANeuralNetworksExecution_startCompute} has been called, the
+   * function will return immediately but will mark the execution to be deleted
+   * once the computation completes.   The {link ANeuralNetworksExecution_wait}
+   * will return ANEURALNETWORKS_ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be destroyed. Passing NULL is acceptable
+   * and results in no operation.
+   */
+  void (*ANeuralNetworksExecution_free)(ANeuralNetworksExecution* execution);
+
+  /**
+   * Associate a user buffer with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This should be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other properties of the type must be the same as
+   *             specified in the model. If the type is the same as specified
+   *             when the model was built, NULL can be passed.
+   * @param buffer The buffer containing the data.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, const void* buffer,
+      size_t length);
+
+  /**
+   * Associate part of a memory object with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Associate a user buffer with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param buffer The buffer where the data is to be written.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+  /**
+   * Associate part of a memory object with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory where the data is to be stored.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The length in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Schedule evaluation of the execution.
+   *
+   * <p>Schedules evaluation of the execution. Once the model has been
+   * applied and the outputs are ready to be consumed, the execution will be
+   * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that
+   * signal.
+   * </p>
+   *
+   * Multiple executions can be scheduled and evaluated concurrently, and
+   * compilations can be performed concurrently with executions. The runtime
+   * makes no guarantee on the ordering of the completion of compilations and
+   * executions. If it's important to the application, the application should
+   * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and
+   * {@link ANeuralNetworksExecution_wait}.
+   *
+   * ANeuralNetworksExecution_wait must be called to recuperate the resources
+   * used by the execution.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be scheduled and executed.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_startCompute)(
+      ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+  /**
+   * Waits until the execution completes.
+   *
+   * More than one thread can wait on an event. When the execution completes,
+   * all threads will be released.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   */
+  int (*ANeuralNetworksEvent_wait)(ANeuralNetworksEvent* event);
+
+  /**
+   * Destroys the event.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   */
+  void (*ANeuralNetworksEvent_free)(ANeuralNetworksEvent* event);
+
+  // ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
+  // which was added in 8.1.
+  int (*ASharedMemory_create)(const char* name, size_t size);
+
+  /**
+   * Get the number of available devices.
+   *
+   * @param numDevices Used to return the number of devices.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworks_getDeviceCount)(uint32_t* numDevices);
+
+  /**
+   * Get the representation of the specified device.
+   *
+   * @param devIndex The index of the specified device. Must be less than the
+   *                 number of available devices.
+   * @param device The representation of the specified device.
+   *               The same representation will always be returned for the
+   *               specified device.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+
+  int (*ANeuralNetworks_getDevice)(uint32_t devIndex,
+                                   ANeuralNetworksDevice** device);
+
+  /**
+   * Get the name of the specified device.
+   *
+   * @param device The representation of the specified device.
+   * @param name The returned name of the specified device. The name will be
+   *             in UTF-8 and will be null-terminated. It will be recognizable
+   *             as a known device name rather than a cryptic string. For
+   *             devices with API level 29 and above, the format of the name is
+   *             {VENDOR}-{DEVICE}, e.g. “google-ipu”. For devices with feature
+   *             level 28 or lower, the name will always be “unknown-device”.
+   *             The name will remain valid for the duration of the application.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getName)(const ANeuralNetworksDevice* device,
+                                       const char** name);
+
+  /**
+   * Get the version of the driver implementation of the specified device.
+   *
+   * It’s the responsibility of the driver implementor to insure that this
+   * version string uniquely distinguishes this implementation from all previous
+   * implementations.
+   *
+   * This version string must not be confused with the feature level which is
+   * solely defined by {@link ANeuralNetworksDevice_getFeatureLevel}. There is
+   * no implicit ordering of the versions. For example, it is not possible to
+   * filter all drivers older than a certain version.
+   *
+   * Application developers may use this version string to avoid or prefer
+   * specific driver implementations. For example, an application may want to do
+   * so because:
+   *     - A specific version of the driver does not provide the required
+   * performance, perhaps because of a performance regression.
+   *     - A specific version of the driver has a bug or returns results that
+   * don’t match the minimum precision requirement for the application.
+   *
+   * @param device  The representation of the specified device.
+   * @param version The returned version string of the driver for the specified
+   *                device. The string will be in UTF-8 and will be
+   *                null-terminated. For devices with feature level 28 or lower,
+   *                "UNKNOWN" will be returned. The version string will remain
+   *                valid for the duration of the application.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getVersion)(const ANeuralNetworksDevice* device,
+                                          const char** version);
+
+  /**
+   * Get the supported NNAPI version of the specified device.
+   *
+   * Each device has a supported feature level, which is the most advanced
+   * feature this driver implements. For example, if the driver implements the
+   * features introduced in Android P, but does not implement the features
+   * introduced after Android P, the value would be 28. Developers could decide
+   * whether or not the specified device should be used for a Model that has
+   * certain feature requirements.
+   *
+   * @param device       The representation of the specified device.
+   * @param featureLevel The API level of the most advanced feature this driver
+   *                     implements.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getFeatureLevel)(
+      const ANeuralNetworksDevice* device, int64_t* featureLevel);
+
+  /**
+   * Get the type of a given device.
+   *
+   * The device type can be used to help application developers to distribute
+   * Machine Learning workloads and other workloads such as graphical rendering.
+   * E.g., for an app which renders AR scenes based on real time object
+   * detection results, the developer could choose an ACCELERATOR type device
+   * for ML workloads, and reserve GPU for graphical rendering.
+   *
+   * @param device The representation of the specified device.
+   * @param type The returned {@link DeviceTypeCode} of the specified device.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getType)(const ANeuralNetworksDevice* device,
+                                       int32_t* type);
+
+  /**
+   * Get the supported operations for a specified set of devices. If multiple
+   * devices are selected, the supported operation list is a union of supported
+   * operations of all selected devices.
+   *
+   * @param model        The model to be queried.
+   * @param devices      The set of devices. Must not contain duplicates.
+   * @param numDevices   The number of devices in the set.
+   * @param supportedOps The boolean array to be filled. True means supported.
+   *                     The size of the boolean array must be at least as large
+   *                     as the number of operations in the model. The order of
+   *                     elements in the supportedOps array matches the order in
+   *                     which the corresponding operations were added to the
+   *                     model.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksModel_getSupportedOperationsForDevices)(
+      const ANeuralNetworksModel* model,
+      const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+      bool* supportedOps);
+
+  /**
+   * Create a {@link ANeuralNetworksCompilation} to compile the given model for
+   * a specified set of devices. If more than one device is specified, the
+   * compilation will distribute the workload automatically across the devices.
+   * The model must be fully supported by the specified set of devices. This
+   * means that ANeuralNetworksModel_getSupportedOperationsForDevices() must
+   * have returned true for every operation for that model/devices pair.
+   *
+   * @param model       The {@link ANeuralNetworksModel} to be compiled.
+   * @param devices     The set of devices. Must not contain duplicates.
+   * @param numDevices  The number of devices in the set.
+   * @param compilation The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the model is invalid.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksCompilation_createForDevices)(
+      ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+      uint32_t numDevices, ANeuralNetworksCompilation** compilation);
+
+  /**
+   * Sets the compilation caching signature and the cache directory.
+   *
+   * Provides optional caching information to the runtime for faster repeated
+   * compilation.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be modified.
+   * @param cacheDir The cache directory to store and retrieve caching data. It
+   *                 is recommended to use the code_cache provided by the
+   *                 Android runtime. If not using the code_cache, the user
+   *                 should choose a directory local to the application, and is
+   *                 responsible to manage and clean the cache entries.
+   * @param token The token provided by the user to specify a model, must be of
+   *              length ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user
+   *              should ensure that the token is unique to a model within the
+   *              application. The NNAPI runtime will not detected token
+   *              collisions. If there is a collision, the compilation outcome
+   *              may be incorrect without notifying with error.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksCompilation_setCaching)(
+      ANeuralNetworksCompilation* compilation, const char* cacheDir,
+      const uint8_t* token);
+
+  /**
+   * Schedule synchronous evaluation of the execution.
+   *
+   * <p>Schedules synchronous evaluation of the execution. Returns once the
+   * execution has completed and the outputs are ready to be consumed.
+   * </p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * See {@link ANeuralNetworksExecution_startCompute} for asynchronous
+   * execution. Synchronous execution incurs lower overhead than asynchronous
+   * execution.
+   *
+   * Available since API level 29.
+   *
+   * @param execution The execution to be scheduled and executed.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   *         ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory
+   *         cannot be properly mapped.
+   */
+  int (*ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution);
+
+  /**
+   * Get the dimensional information of the specified output operand of the
+   * model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * On asynchronous execution initiated by {@link
+   * ANeuralNetworksExecution_startCompute},
+   * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+   * recuperate the resources used by the execution.
+   *
+   * @param execution The execution to be queried.
+   * @param index The index of the output argument we are querying. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with
+   *              {@link ANeuralNetworksModel_addOperand}.
+   * @param rank The rank of the output operand.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful,
+   *         ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is
+   *         provided an insufficient buffer at execution time,
+   *         ANEURALNETWORKS_BAD_DATA if the index is invalid.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksExecution_getOutputOperandRank)(
+      ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank);
+
+  /**
+   * Get the dimensional information of the specified output operand of the
+   * model of the
+   * {@link ANeuralNetworksExecution}. The target output operand cannot be a
+   * scalar.
+   *
+   * On asynchronous execution initiated by {@link
+   * ANeuralNetworksExecution_startCompute},
+   * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+   * recuperate the resources used by the execution.
+   *
+   * @param execution The execution to be queried.
+   * @param index The index of the output argument we are querying. It is an
+   *              index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with
+   *              {@link ANeuralNetworksModel_addOperand}.
+   * @param dimensions The dimension array to be filled. The size of the array
+   *                   must be exactly as large as the rank of the output
+   *                   operand to be queried in the model.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful,
+   *         ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is
+   *         provided an insufficient buffer at execution time,
+   *         ANEURALNETWORKS_BAD_DATA if the index is invalid or if the target
+   *         is a scalar.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksExecution_getOutputOperandDimensions)(
+      ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions);
+
+  /**
+   * Create a {@link ANeuralNetworksBurst} to apply the given compilation.
+   * This only creates the burst object. Computation is only performed once
+   * {@link ANeuralNetworksExecution_burstCompute} is invoked with a valid
+   * {@link ANeuralNetworksExecution} and {@link ANeuralNetworksBurst}.
+   *
+   * <p>The provided compilation must outlive the burst object.</p>
+   *
+   * Available since API level 29.
+   *
+   * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+   * @param burst The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the compilation is invalid.
+   */
+  int (*ANeuralNetworksBurst_create)(ANeuralNetworksCompilation* compilation,
+                                     ANeuralNetworksBurst** burst);
+
+  /**
+   * Destroys the burst object.
+   *
+   * Available since API level 29.
+   *
+   * @param burst The burst object to be destroyed. Passing NULL is acceptable
+   * and results in no operation.
+   */
+  void (*ANeuralNetworksBurst_free)(ANeuralNetworksBurst* burst);
+
+  /**
+   * Schedule synchronous evaluation of the execution on a burst object.
+   *
+   * <p>Schedules synchronous evaluation of the execution. Returns once the
+   * execution has completed and the outputs are ready to be consumed.</p>
+   *
+   * <p>There must be at most one {@link ANeuralNetworksExecution} processing at
+   * any given time for any given burst object. Any
+   * {@link ANeuralNetworksExecution} launched before the previous has finished
+   * will result in ANEURALNETWORKS_BAD_STATE.</p>
+   *
+   * Available since API level 29.
+   *
+   * @param burst The burst object to execute on.
+   * @param execution The execution to be scheduled and executed. The execution
+   *                  must be created from the same {@link
+   *                  ANeuralNetworksCompilation} as the burst object.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   */
+  int (*ANeuralNetworksExecution_burstCompute)(
+      ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
+
+  /**
+   * Creates a shared memory object from an AHardwareBuffer handle.
+   *
+   * If the shared memory is backed by an AHardwareBuffer of
+   * AHARDWAREBUFFER_FORMAT_BLOB format, it can be used the same way as
+   * shared memory created from a file handle. See
+   * {@link ANeuralNetworksMemory} for a description on how to use this
+   * shared memory.
+   *
+   * If the shared memory is backed by an AHardwareBuffer of a format other
+   * than AHARDWAREBUFFER_FORMAT_BLOB, it can only be used for Model inputs
+   * and outputs. When calling
+   * {@link ANeuralNetworksExecution_setInputFromMemory} or
+   * {@link ANeuralNetworksExecution_setOutputFromMemory} with the shared
+   * memory, both offset and length must be set to zero and the entire
+   * memory region will be associated with the specified input or output
+   * operand. There is no guarantee that an arbitrary AHardwareBuffer_Format
+   * and AHardwareBuffer_UsageFlags combination can be used by arbitrary
+   * devices. The execution will fail if selected set of devices cannot
+   * consume the buffer.
+   *
+   * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with
+   * shared memory backed by an AHardwareBuffer of a format other than
+   * AHARDWAREBUFFER_FORMAT_BLOB is disallowed.
+   *
+   * TODO(miaowang): add documentation about intended usage with
+   * introspection API.
+   *
+   * Available since API level 29.
+   *
+   * @param ahwb The AHardwareBuffer handle.
+   * @param memory The memory object to be created.
+   *               Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+   *
+   * @see AHardwareBuffer
+   */
+  int (*ANeuralNetworksMemory_createFromAHardwareBuffer)(
+      const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
+
+  /**
+   * Specifies whether duration of the {@link ANeuralNetworksExecution} is to be
+   * measured. By default, duration is not measured.
+   *
+   * The {@link ANeuralNetworksExecution} must have been created with
+   * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * Available since API level 29.
+   *
+   * @param execution The execution to be modified.
+   * @param measure 'true' if duration is to be measured, 'false' if not.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_setMeasureTiming)(
+      ANeuralNetworksExecution* execution, bool measure);
+
+  /**
+   * Get the time spent in the specified {@link ANeuralNetworksExecution}, in
+   * nanoseconds. The execution must have completed.
+   *
+   * @param execution The execution to be queried.
+   * @param durationCode The measurement to be queried, specified by {@link
+   * DurationCode}.
+   * @param duration The returned duration. If no measurement was requested by
+   *                 {@link ANeuralNetworksExecution_setMeasureTiming}, or for
+   * some other reason the duration is not available, UINT64_MAX will be
+   * returned. A particular device need not support any given measurement.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_getDuration)(
+      const ANeuralNetworksExecution* execution, int32_t durationCode,
+      uint64_t* duration);
+
+  /**/
+};
+
+/**
+ * Load the NNAPI implementation from the shared libraries.
+ * The NnApi structure is filled with all the pointers. If one function doesn't
+ * exist, a null pointer is stored.
+ */
+const NnApi* NnApiImplementation();
+
+#endif  // TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc78e53da64b209d53bfcfc97e194e7430f016c
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
@@ -0,0 +1,20 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = {};
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_test.cc b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f30b95ec37e3c878d3bdbc1acc96026dfeef9e1
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+#include <gtest/gtest.h>
+
+namespace {
+
+TEST(NnapiLibTest, NnApiImplementation) {
+  const NnApi* nnapi = NnApiImplementation();
+  EXPECT_NE(nnapi, nullptr);
+#ifdef __ANDROID__
+  EXPECT_GT(nnapi->android_sdk_version, 0);
+  if (nnapi.android_sdk_version < 27) {
+    EXPECT_FALSE(nnapi->nnapi_exists);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+              nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_EQ(nnapi->ASharedMemory_create, nullptr);
+  } else {
+    EXPECT_TRUE(nnapi->nnapi_exists);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    if (nnapi->android_sdk_version >= 28) {
+      // relaxComputationFloat32toFloat16 only available with Android 9.0 (P).
+      EXPECT_NE(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    } else {
+      EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    }
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+    // TODO(b/123423795): Test Q-specific APIs after release.
+  }
+#else
+  EXPECT_FALSE(nnapi->nnapi_exists);
+  EXPECT_EQ(nnapi->android_sdk_version, 0);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+  EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworks_getDeviceCount, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworks_getDevice, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getName, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getVersion, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getFeatureLevel, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_getSupportedOperationsForDevices,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_createForDevices, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setCaching, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_compute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getOutputOperandRank, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getOutputOperandDimensions,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksBurst_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksBurst_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_burstCompute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromAHardwareBuffer, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setMeasureTiming, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getDuration, nullptr);
+#endif
+}
+
+}  // namespace
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index dc8e81cde758f6d187046d865d42141200f753bc..443651b9910fc0c4b5388409ce450f9638de4898 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
 #include <android/log.h>
@@ -84,56 +84,27 @@ void logError(const char* format, ...) {
 static const int64_t kOperandIdNotSet = -1;
 static const int64_t kOperandNotNeeded = -2;
 
-namespace {
-
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return 0xFFFF;
-      }
-    }
-    return atoi(sdkVersion);
-  }
-  FATAL("No %s prop", sdkProp);
-#endif  // __ANDROID__
-  return 0;
-}
-
-int32_t GetAndroidSdkVersionCached() {
-  static int32_t androidSdkVersion = GetAndroidSdkVersion();
-  return androidSdkVersion;
-}
-
-}  // namespace
-
 NNAPIAllocation::NNAPIAllocation(const char* filename,
                                  ErrorReporter* error_reporter)
     : MMAPAllocation(filename, error_reporter) {
   if (mmapped_buffer_ != MAP_FAILED)
-    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
-                                                mmap_fd_, 0, &handle_));
+    CHECK_NN(NnApiImplementation()->ANeuralNetworksMemory_createFromFd(
+        buffer_size_bytes_, PROT_READ, mmap_fd_, 0, &handle_));
 }
 
 NNAPIAllocation::~NNAPIAllocation() {
   if (handle_) {
-    ANeuralNetworksMemory_free(handle_);
+    NnApiImplementation()->ANeuralNetworksMemory_free(handle_);
   }
 }
 
 NNAPIDelegate::~NNAPIDelegate() {
   if (nn_compiled_model_) {
-    ANeuralNetworksCompilation_free(nn_compiled_model_);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(nn_compiled_model_);
     nn_compiled_model_ = nullptr;
   }
   if (nn_model_) {
-    ANeuralNetworksModel_free(nn_model_);
+    NnApiImplementation()->ANeuralNetworksModel_free(nn_model_);
     nn_model_ = nullptr;
     // TODO(aselle): Is this thread-safe and callable multiple times?
   }
@@ -145,6 +116,7 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
                                ANeuralNetworksModel* nn_model,
                                uint32_t* no_of_operands_added,
                                std::vector<int64_t>* nnapi_ids) {
+  const NnApi* nnapi = NnApiImplementation();
   uint32_t next_id = 0;
   for (size_t i = 0; i < subgraph->tensors_size(); i++) {
     // Skip temporaries and RNN back-edges.
@@ -198,24 +170,24 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
     RETURN_ERROR_IF_NN_FAILED(
-        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+        nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
     // TODO(aselle): Based on Michael's suggestion, limiting this to read
     // only memory
     if (tensor->allocation_type == kTfLiteMmapRo) {
       if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
               static_cast<const Allocation*>(tensor->allocation))) {
         RETURN_ERROR_IF_NN_FAILED(
-            ANeuralNetworksModel_setOperandValueFromMemory(
+            nnapi->ANeuralNetworksModel_setOperandValueFromMemory(
                 nn_model, next_id, alloc->memory(),
                 alloc->offset(tensor->data.raw), tensor->bytes));
       } else {
-        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
+        RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
             nn_model, next_id, tensor->data.raw, tensor->bytes));
       }
     } else if (tensor->bytes == 0) {
       // These size 0 tensors are optional tensors reserved.
-      RETURN_ERROR_IF_NN_FAILED(
-          ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+      RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, nullptr, 0));
     }
 
     ++next_id;
@@ -244,6 +216,7 @@ TfLiteStatus AddOpsAndParams(
     uint32_t next_id, std::vector<int>* model_state_inputs,
     std::vector<int>* model_state_outputs,
     const std::vector<int64_t>& tensor_id_to_nnapi_id) {
+  const NnApi* nnapi = NnApiImplementation();
   for (size_t i = 0; i < subgraph->nodes_size(); i++) {
     const auto* node_and_registration = subgraph->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
@@ -258,21 +231,21 @@ TfLiteStatus AddOpsAndParams(
     MapAndAddTensorIds(node.outputs->data, node.outputs->size,
                        &augmented_outputs, tensor_id_to_nnapi_id);
 
-    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_int32 = [nnapi, &nn_model, &augmented_inputs,
                              &next_id](int value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(int32_t)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(int32_t)))
       augmented_inputs.push_back(next_id++);
     };
 
-    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_float32 = [nnapi, &nn_model, &augmented_inputs,
                                &next_id](float value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(float)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(float)))
       augmented_inputs.push_back(next_id++);
     };
 
@@ -281,8 +254,8 @@ TfLiteStatus AddOpsAndParams(
           .type = ANEURALNETWORKS_TENSOR_INT32,
           .dimensionCount = 1,
           .dimensions = &num_values};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
           nn_model, next_id, values, sizeof(int32_t) * num_values));
       augmented_inputs.push_back(next_id++);
     };
@@ -291,15 +264,16 @@ TfLiteStatus AddOpsAndParams(
     // For each state_out tensor, a corresponding state_in operand needs to be
     // created for NNAPI.
     auto duplicate_state_tensor_float32 =
-        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
-         &model_state_outputs](int tensor_id) {
+        [nnapi, subgraph, &nn_model, &next_id, &augmented_inputs,
+         &model_state_inputs, &model_state_outputs](int tensor_id) {
           const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
           ANeuralNetworksOperandType operand_type{
               ANEURALNETWORKS_TENSOR_FLOAT32,
               static_cast<uint32_t>(tensor->dims->size),
               reinterpret_cast<uint32_t*>(tensor->dims->data),
               tensor->params.scale, tensor->params.zero_point};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
           augmented_inputs.push_back(next_id);
           model_state_inputs->push_back(next_id);
           model_state_outputs->push_back(tensor_id);
@@ -388,7 +362,7 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
+    auto add_lstm_scratch_tensor_float32 = [nnapi, subgraph, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
       if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
@@ -398,7 +372,7 @@ TfLiteStatus AddOpsAndParams(
           static_cast<uint32_t>(tensor->dims->size),
           reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
           tensor->params.zero_point};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
       augmented_outputs.insert(augmented_outputs.begin(), next_id++);
     };
 
@@ -427,15 +401,16 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // Handle optional input tensors.
-    auto add_optional_tensors = [&nn_model, &augmented_inputs,
+    auto add_optional_tensors = [nnapi, &nn_model, &augmented_inputs,
                                  &next_id](int nn_type) {
       for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
         if (augmented_inputs[idx] == kOptionalTensor) {
           const std::vector<uint32_t> dim = {0, 0};
           ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-          CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
-                                                        nullptr, 0))
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+          CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+              nn_model, next_id, nullptr, 0))
           augmented_inputs[idx] = next_id++;
         }
       }
@@ -635,6 +610,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_SPLIT:
       case tflite::BuiltinOperator_STRIDED_SLICE:
       case tflite::BuiltinOperator_EXP:
+      case tflite::BuiltinOperator_COS:
       case tflite::BuiltinOperator_LOG_SOFTMAX:
       case tflite::BuiltinOperator_DEQUANTIZE:
       case tflite::BuiltinOperator_DELEGATE:
@@ -687,6 +663,14 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_ABS:
       case tflite::BuiltinOperator_SPLIT_V:
       case tflite::BuiltinOperator_UNIQUE:
+      case tflite::BuiltinOperator_CEIL:
+      case tflite::BuiltinOperator_REVERSE_V2:
+      case tflite::BuiltinOperator_ADD_N:
+      case tflite::BuiltinOperator_GATHER_ND:
+      case tflite::BuiltinOperator_WHERE:
+      case tflite::BuiltinOperator_RANK:
+      case tflite::BuiltinOperator_ELU:
+      case tflite::BuiltinOperator_REVERSE_SEQUENCE:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
@@ -696,13 +680,13 @@ TfLiteStatus AddOpsAndParams(
         break;
     }
 
-    if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
+    if (nnapi_version == 11 && nnapi->android_sdk_version < 28) {
       logError("Op %d needs NNAPI1.1", builtin);
       return kTfLiteError;
     }
 
     // Add the operation.
-    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
+    RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_addOperation(
         nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
         augmented_inputs.data(),
         static_cast<uint32_t>(augmented_outputs.size()),
@@ -714,9 +698,10 @@ TfLiteStatus AddOpsAndParams(
 TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   if (nn_model_ && nn_compiled_model_) return model_status_;
 
+  const NnApi* nnapi = NnApiImplementation();
   // TODO(aselle): This is not correct. need to handle resize invalidation.
   if (!nn_model_) {
-    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_create(&nn_model_));
 
     // Find which tensors should be added to NNAPI. TFLite has temporaries
     // and RNN back-edges which are are not valid for NNAPI. We look through all
@@ -763,21 +748,22 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
                        model_states_outputs_.size(), &augmented_outputs,
                        tensor_id_to_nnapi_id);
 
-    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+    CHECK_NN(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs(
         nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
         static_cast<uint32_t>(augmented_outputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
 
-    if (GetAndroidSdkVersionCached() >= 28) {
-      CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+    if (nnapi->android_sdk_version >= 28) {
+      CHECK_NN(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
           nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
     }
-    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_finish(nn_model_));
   }
   if (!nn_compiled_model_) {
-    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
-    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_create(nn_model_,
+                                                      &nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_finish(nn_compiled_model_));
   }
   return kTfLiteOk;
 }
@@ -793,8 +779,10 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     return model_status_;
   }
 
+  const NnApi* nnapi = NnApiImplementation();
   ANeuralNetworksExecution* execution = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+  CHECK_NN(
+      nnapi->ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
 
   // Currently perform deep copy of input buffer
   for (size_t i = 0; i < subgraph->inputs().size(); i++) {
@@ -802,7 +790,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // TODO(aselle): Is this what we want or do we want input instead?
     // TODO(aselle): This should be called setInputValue maybe to be cons.
     TfLiteTensor* tensor = subgraph->tensor(input);
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -810,7 +798,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   for (size_t i = 0; i < subgraph->outputs().size(); i++) {
     int output = subgraph->outputs()[i];
     TfLiteTensor* tensor = subgraph->tensor(output);
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -822,21 +810,21 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // Here we are using a deep copy for state_in tensors so that we are not
     // reading and writing into the same buffer during a invocation.
     // TODO(miaowang): using double shared buffer to minimize the copies.
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
     // Tell NNAPI where to output the state_out.
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
   }
 
   // Currently use blocking compute.
   ANeuralNetworksEvent* event = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
-  CHECK_NN(ANeuralNetworksEvent_wait(event));
-  ANeuralNetworksEvent_free(event);
-  ANeuralNetworksExecution_free(execution);
+  CHECK_NN(nnapi->ANeuralNetworksExecution_startCompute(execution, &event));
+  CHECK_NN(nnapi->ANeuralNetworksEvent_wait(event));
+  nnapi->ANeuralNetworksEvent_free(event);
+  nnapi->ANeuralNetworksExecution_free(execution);
 
 #if 0
   printf("From the NN API:\n");
@@ -854,6 +842,8 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   return kTfLiteOk;
 }
 
-bool NNAPIDelegate::IsSupported() { return NNAPIExists(); }
+bool NNAPIDelegate::IsSupported() {
+  return NnApiImplementation()->nnapi_exists;
+}
 
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index 64723ba3856b75a614ded2a134a6a61254b38657..bbc252045baad0316333bf9bc19dd78b8bd58590 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -74,7 +74,6 @@ tf_cc_test(
         ":profile_summarizer",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:test_util",
         "//tensorflow/lite/testing:util",
diff --git a/tensorflow/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h
index 9aa9e411314b2f389fda1bedaa290a87021ee254..2202df2dbe3613aab1f700b88398cae5b9aa01a0 100644
--- a/tensorflow/lite/profiling/profile_buffer.h
+++ b/tensorflow/lite/profiling/profile_buffer.h
@@ -128,7 +128,7 @@ class ProfileBuffer {
   // Returns the profile event at the given index. If the index is invalid a
   // nullptr is returned. The return event may get overwritten if more events
   // are added to buffer.
-  const struct ProfileEvent* const At(int index) const {
+  const struct ProfileEvent* const At(size_t index) const {
     size_t size = Size();
     if (index >= size) {
       return nullptr;
diff --git a/tensorflow/lite/profiling/profile_buffer_test.cc b/tensorflow/lite/profiling/profile_buffer_test.cc
index 6642a15884fdf57cb385e186fd75620183098375..92973302e73c22ad24707983df87001ec81db667 100644
--- a/tensorflow/lite/profiling/profile_buffer_test.cc
+++ b/tensorflow/lite/profiling/profile_buffer_test.cc
@@ -27,7 +27,7 @@ namespace {
 
 std::vector<const ProfileEvent*> GetProfileEvents(const ProfileBuffer& buffer) {
   std::vector<const ProfileEvent*> events;
-  for (auto i = 0; i < buffer.Size(); i++) {
+  for (size_t i = 0; i < buffer.Size(); i++) {
     events.push_back(buffer.At(i));
   }
   return events;
@@ -69,7 +69,7 @@ TEST(ProfileBufferTest, OverFlow) {
     EXPECT_EQ(expected_size, buffer.Size());
   }
   EXPECT_EQ(max_size, buffer.Size());
-  for (int j = 0; j < buffer.Size(); ++j) {
+  for (size_t j = 0; j < buffer.Size(); ++j) {
     auto event = buffer.At(j);
     EXPECT_EQ(eventNames[j % 4], event->tag);
     EXPECT_EQ(ProfileEvent::EventType::DEFAULT, event->event_type);
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index 4949d7c92eb6fcf9fcb2c6244a9c888d3a0559d6..a31f6cec707718d0a9c9ba5a96c7625f09cd724e 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -4,12 +4,6 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "interpreter_test_data",
-    srcs = glob(["**/testdata/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "interpreter",
     srcs = [
@@ -19,7 +13,6 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
-        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
@@ -27,9 +20,11 @@ py_library(
 py_test(
     name = "interpreter_test",
     srcs = ["interpreter_test.py"],
-    data = [":interpreter_test_data"],
+    data = ["//tensorflow/lite/python/testdata:interpreter_test_data"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":interpreter",
         "//tensorflow/python:client_testlib",
@@ -44,6 +39,22 @@ py_binary(
     srcs = ["tflite_convert.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":tflite_convert_main_lib"],
+)
+
+py_library(
+    name = "tflite_convert_main_lib",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":tflite_convert_lib"],
+)
+
+py_library(
+    name = "tflite_convert_lib",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":lite",
     ],
@@ -60,6 +71,8 @@ py_library(
         ":interpreter",
         ":lite_constants",
         ":op_hint",
+        "//tensorflow/lite/experimental/examples/lstm:tflite_lstm_ops",
+        "//tensorflow/lite/python/optimize:calibrator",
         "//tensorflow/python:graph_util",
         "//tensorflow/python:tf_optimizer",
         "//tensorflow/python/keras",
@@ -75,6 +88,36 @@ py_test(
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "lite_v2_test",
+    srcs = ["lite_v2_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "lite_flex_test",
+    srcs = ["lite_flex_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        # TODO(b/111881877): Enable in oss after resolving op registry issues.
         "no_oss",
         "no_windows",
     ],
@@ -145,7 +188,6 @@ py_library(
     srcs = ["convert_saved_model.py"],
     srcs_version = "PY2AND3",
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 9c603998717019ac8624868b16d720e300a30efd..c3f15816e256a8da491dda4b702d68f12e2dc59c 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -214,7 +214,17 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
 
 
 def tensor_name(x):
-  return x.name.split(":")[0]
+  """Returns name of the input tensor."""
+  parts = x.name.split(":")
+  if len(parts) > 2:
+    raise ValueError("Tensor name invalid. Expect 0 or 1 colon, got {0}".format(
+        len(parts) - 1))
+
+  # To be consistent with the tensor naming scheme in tensorflow, we need
+  # drop the ':0' suffix for the first tensor.
+  if len(parts) > 1 and parts[1] != "0":
+    return x.name
+  return parts[0]
 
 
 # Don't expose these for now.
@@ -244,7 +254,7 @@ def build_toco_convert_protos(input_tensors,
 
   Args:
     input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
+      `foo.shape` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
     inference_type: Target data type of real-number arrays in the output file.
       Must be `{tf.float32, tf.uint8}`.  (default tf.float32)
@@ -347,7 +357,7 @@ def build_toco_convert_protos(input_tensors,
                          "inference_input_type is QUANTIZED_UINT8.")
       input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
     if input_shapes is None:
-      shape = input_tensor.get_shape()
+      shape = input_tensor.shape
     else:
       shape = input_shapes[idx]
     input_array.shape.dims.extend(map(int, shape))
@@ -423,7 +433,7 @@ def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
   Args:
     input_data: Input data (i.e. often `sess.graph_def`),
     input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
+      `foo.shape` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
     *args: See `build_toco_convert_protos`,
     **kwargs: See `build_toco_convert_protos`.
@@ -443,7 +453,7 @@ def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
   return data
 
 
-@_tf_export("lite.toco_convert")
+@_tf_export(v1=["lite.toco_convert"])
 @deprecation.deprecated(None, "Use `lite.TFLiteConverter` instead.")
 def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
   """Convert a model using TOCO.
@@ -456,7 +466,7 @@ def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
   Args:
     input_data: Input data (i.e. often `sess.graph_def`),
     input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
+      `foo.shape` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
     *args: See `build_toco_convert_protos`,
     **kwargs: See `build_toco_convert_protos`.
diff --git a/tensorflow/lite/python/convert_saved_model.py b/tensorflow/lite/python/convert_saved_model.py
index f8d986b746911c68e0589b587ce0beceafc0c534..b085a106f993b0bff63b3ce52ac45f19bbe7c4f2 100644
--- a/tensorflow/lite/python/convert_saved_model.py
+++ b/tensorflow/lite/python/convert_saved_model.py
@@ -215,8 +215,7 @@ def set_tensor_shapes(tensors, shapes):
           tensor.set_shape(shape)
         except ValueError as error:
           message = ("The shape of tensor '{0}' cannot be changed from {1} to "
-                     "{2}. {3}".format(name, tensor.get_shape(), shape,
-                                       str(error)))
+                     "{2}. {3}".format(name, tensor.shape, shape, str(error)))
           raise ValueError(message)
 
 
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index e270abaa5afa0f2b3bb255e896c706794277c26e..12d8d494c1f1845e5a5f3bd11307cc9c4c1a761e 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -55,6 +55,17 @@ class ConvertTest(test_util.TensorFlowTestCase):
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
     #   result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
 
+  def testTensorName(self):
+    in_tensor = array_ops.placeholder(shape=[4], dtype=dtypes.float32)
+    # out_tensors should have names: "split:0", "split:1", "split:2", "split:3".
+    out_tensors = array_ops.split(
+        value=in_tensor, num_or_size_splits=[1, 1, 1, 1], axis=0)
+    expect_names = ["split", "split:1", "split:2", "split:3"]
+
+    for i in range(len(expect_names)):
+      got_name = convert.tensor_name(out_tensors[i])
+      self.assertEqual(got_name, expect_names[i])
+
   def testQuantization(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
                                       dtype=dtypes.float32)
@@ -323,6 +334,7 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       self.assertEqual(self._get_input_index(a), 0)
       self.assertEqual(self._get_sort_index(a), 0)
       self.assertEqual(self._get_input_index(b), 1)
+      self.assertEqual(self._get_sort_index(b), 0)
       self.assertEqual(self._get_input_index(c), 0)
       self.assertEqual(self._get_sort_index(c), 1)
 
diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
index a1325f0b1ff8bec11f0ad90846154401b1bb0134..9b9516f6d0bdedb30e9ddcb419639920fe6e000f 100644
--- a/tensorflow/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -216,7 +216,8 @@ class Interpreter(object):
   def get_tensor(self, tensor_index):
     """Gets the value of the input tensor (get a copy).
 
-    If you wish to avoid the copy, use `tensor()`.
+    If you wish to avoid the copy, use `tensor()`. This function cannot be used
+    to read intermediate results.
 
     Args:
       tensor_index: Tensor index of tensor to get. This value can be gotten from
@@ -233,7 +234,8 @@ class Interpreter(object):
     This allows reading and writing to this tensors w/o copies. This more
     closely mirrors the C++ Interpreter class interface's tensor() member, hence
     the name. Be careful to not hold these output references through calls
-    to `allocate_tensors()` and `invoke()`.
+    to `allocate_tensors()` and `invoke()`. This function cannot be used to read
+    intermediate results.
 
     Usage:
 
diff --git a/tensorflow/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py
index 7ec56a21c9ffa82e1893d3846d92564539ac34ae..b21779226f62ead3fd4bde5aacdfc393a4d5bff9 100644
--- a/tensorflow/lite/python/interpreter_test.py
+++ b/tensorflow/lite/python/interpreter_test.py
@@ -91,6 +91,41 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     output_data = interpreter.get_tensor(output_details[0]['index'])
     self.assertTrue((expected_output == output_data).all())
 
+  def testString(self):
+    interpreter = interpreter_wrapper.Interpreter(
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/gather_string.tflite'))
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(2, len(input_details))
+    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
+    self.assertTrue(([10] == input_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[0]['quantization'])
+    self.assertEqual('indices', input_details[1]['name'])
+    self.assertEqual(np.int64, input_details[1]['dtype'])
+    self.assertTrue(([3] == input_details[1]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[1]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
+    self.assertTrue(([3] == output_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), output_details[0]['quantization'])
+
+    test_input = np.array([1, 2, 3], dtype=np.int64)
+    interpreter.set_tensor(input_details[1]['index'], test_input)
+
+    test_input = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
+    expected_output = np.array([b'b', b'c', b'd'])
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
+
 
 class InterpreterTestErrorPropagation(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index 767a9fc476398dd8fb60128f73f8ae7c518d9a21..6ec7ce497a51b9b7b66d680ea9a81ef47df51718 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -6,12 +6,26 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 
+cc_library(
+    name = "numpy",
+    srcs = ["numpy.cc"],
+    hdrs = ["numpy.h"],
+    deps = [
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 cc_library(
     name = "interpreter_wrapper_lib",
     srcs = ["interpreter_wrapper.cc"],
     hdrs = ["interpreter_wrapper.h"],
     deps = [
+        ":numpy",
+        ":python_error_reporter",
+        ":python_utils",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
@@ -19,6 +33,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "python_error_reporter",
+    srcs = ["python_error_reporter.cc"],
+    hdrs = ["python_error_reporter.h"],
+    deps = [
+        "//tensorflow/lite/core/api",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+cc_library(
+    name = "python_utils",
+    srcs = ["python_utils.cc"],
+    hdrs = ["python_utils.h"],
+    deps = [
+        ":numpy",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tf_py_wrap_cc(
     name = "tensorflow_wrap_interpreter_wrapper",
     srcs = [
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index d14af439ec0ab600ea260da17ef0041cca25d629..6023587d3b191d8c486dac78b889510ff1c22805 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -21,22 +21,10 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-
-// Disallow Numpy 1.7 deprecated symbols.
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ufuncobject.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define CPP_TO_PYSTRING PyBytes_FromStringAndSize
-#else
-#define PY_TO_CPPSTRING PyString_AsStringAndSize
-#define CPP_TO_PYSTRING PyString_FromStringAndSize
-#endif
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/string_util.h"
 
 #define TFLITE_PY_CHECK(x)               \
   if ((x) != kTfLiteOk) {                \
@@ -60,43 +48,9 @@ limitations under the License.
 namespace tflite {
 namespace interpreter_wrapper {
 
-class PythonErrorReporter : public tflite::ErrorReporter {
- public:
-  PythonErrorReporter() {}
-
-  // Report an error message
-  int Report(const char* format, va_list args) override {
-    char buf[1024];
-    int formatted = vsnprintf(buf, sizeof(buf), format, args);
-    buffer_ << buf;
-    return formatted;
-  }
-
-  // Set's a Python runtime exception with the last error.
-  PyObject* exception() {
-    std::string last_message = message();
-    PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
-    return nullptr;
-  }
-
-  // Gets the last error message and clears the buffer.
-  std::string message() {
-    std::string value = buffer_.str();
-    buffer_.clear();
-    return value;
-  }
-
- private:
-  std::stringstream buffer_;
-};
-
 namespace {
 
-// Calls PyArray's initialization to initialize all the API pointers. Note that
-// this usage implies only this translation unit can use the pointers. See
-// tensorflow/python/core/numpy.cc for a strategy if we ever need to extend
-// this further.
-void ImportNumpy() { import_array1(); }
+using python_utils::PyDecrefDeleter;
 
 std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     const tflite::FlatBufferModel* model,
@@ -105,7 +59,7 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     return nullptr;
   }
 
-  ImportNumpy();
+  ::tflite::python::ImportNumpy();
 
   std::unique_ptr<tflite::Interpreter> interpreter;
   if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk) {
@@ -114,65 +68,6 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
   return interpreter;
 }
 
-int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
-  switch (tf_lite_type) {
-    case kTfLiteFloat32:
-      return NPY_FLOAT32;
-    case kTfLiteInt32:
-      return NPY_INT32;
-    case kTfLiteInt16:
-      return NPY_INT16;
-    case kTfLiteUInt8:
-      return NPY_UINT8;
-    case kTfLiteInt8:
-      return NPY_INT8;
-    case kTfLiteInt64:
-      return NPY_INT64;
-    case kTfLiteString:
-      return NPY_OBJECT;
-    case kTfLiteBool:
-      return NPY_BOOL;
-    case kTfLiteComplex64:
-      return NPY_COMPLEX64;
-    case kTfLiteNoType:
-      return NPY_NOTYPE;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return NPY_NOTYPE;
-}
-
-TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
-  int pyarray_type = PyArray_TYPE(array);
-  switch (pyarray_type) {
-    case NPY_FLOAT32:
-      return kTfLiteFloat32;
-    case NPY_INT32:
-      return kTfLiteInt32;
-    case NPY_INT16:
-      return kTfLiteInt16;
-    case NPY_UINT8:
-      return kTfLiteUInt8;
-    case NPY_INT8:
-      return kTfLiteInt8;
-    case NPY_INT64:
-      return kTfLiteInt64;
-    case NPY_BOOL:
-      return kTfLiteBool;
-    case NPY_OBJECT:
-    case NPY_STRING:
-    case NPY_UNICODE:
-      return kTfLiteString;
-    case NPY_COMPLEX64:
-      return kTfLiteComplex64;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return kTfLiteNoType;
-}
-
-struct PyDecrefDeleter {
-  void operator()(PyObject* p) const { Py_DECREF(p); }
-};
-
 PyObject* PyArrayFromIntVector(const int* data, npy_intp size) {
   void* pydata = malloc(size * sizeof(int));
   memcpy(pydata, data, size * sizeof(int));
@@ -307,7 +202,7 @@ PyObject* InterpreterWrapper::TensorType(int i) const {
     return nullptr;
   }
 
-  int code = TfLiteTypeToPyArrayType(tensor->type);
+  int code = python_utils::TfLiteTypeToPyArrayType(tensor->type);
   if (code == -1) {
     PyErr_Format(PyExc_ValueError, "Invalid tflite type code %d", code);
     return nullptr;
@@ -350,38 +245,53 @@ PyObject* InterpreterWrapper::SetTensor(int i, PyObject* value) {
   }
 
   PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
-  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  TfLiteTensor* tensor = interpreter_->tensor(i);
 
-  if (TfLiteTypeFromPyArray(array) != tensor->type) {
+  if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
     PyErr_Format(PyExc_ValueError,
                  "Cannot set tensor:"
                  " Got tensor of type %d"
                  " but expected type %d for input %d ",
-                 TfLiteTypeFromPyArray(array), tensor->type, i);
+                 python_utils::TfLiteTypeFromPyArray(array), tensor->type, i);
     return nullptr;
   }
 
   if (PyArray_NDIM(array) != tensor->dims->size) {
-    PyErr_SetString(PyExc_ValueError, "Cannot set tensor: Dimension mismatch");
+    PyErr_Format(PyExc_ValueError,
+                 "Cannot set tensor: Dimension mismatch."
+                 " Got %d"
+                 " but expected %d for input %d.",
+                 PyArray_NDIM(array), tensor->dims->size, i);
     return nullptr;
   }
 
   for (int j = 0; j < PyArray_NDIM(array); j++) {
     if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
-      PyErr_SetString(PyExc_ValueError,
-                      "Cannot set tensor: Dimension mismatch");
+      PyErr_Format(PyExc_ValueError,
+                   "Cannot set tensor: Dimension mismatch."
+                   " Got %ld"
+                   " but expected %d for dimension %d of input %d.",
+                   PyArray_SHAPE(array)[j], tensor->dims->data[j], j, i);
       return nullptr;
     }
   }
 
-  size_t size = PyArray_NBYTES(array);
-  if (size != tensor->bytes) {
-    PyErr_Format(PyExc_ValueError,
-                 "numpy array had %zu bytes but expected %zu bytes.", size,
-                 tensor->bytes);
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    size_t size = PyArray_NBYTES(array);
+    if (size != tensor->bytes) {
+      PyErr_Format(PyExc_ValueError,
+                   "numpy array had %zu bytes but expected %zu bytes.", size,
+                   tensor->bytes);
+      return nullptr;
+    }
+    memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  } else {
+    DynamicBuffer dynamic_buffer;
+    if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+      return nullptr;
+    }
+    dynamic_buffer.WriteToTensor(tensor, nullptr);
   }
-  memcpy(tensor->data.raw, PyArray_DATA(array), size);
   Py_RETURN_NONE;
 }
 
@@ -400,7 +310,7 @@ PyObject* CheckGetTensorArgs(Interpreter* interpreter_, int tensor_index,
     return nullptr;
   }
 
-  *type_num = TfLiteTypeToPyArrayType((*tensor)->type);
+  *type_num = python_utils::TfLiteTypeToPyArrayType((*tensor)->type);
   if (*type_num == -1) {
     PyErr_SetString(PyExc_ValueError, "Unknown tensor type.");
     return nullptr;
@@ -428,19 +338,51 @@ PyObject* InterpreterWrapper::GetTensor(int i) const {
 
   std::vector<npy_intp> dims(tensor->dims->data,
                              tensor->dims->data + tensor->dims->size);
-  // Make a buffer copy but we must tell Numpy It owns that data or else
-  // it will leak.
-  void* data = malloc(tensor->bytes);
-  if (!data) {
-    PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    // Make a buffer copy but we must tell Numpy It owns that data or else
+    // it will leak.
+    void* data = malloc(tensor->bytes);
+    if (!data) {
+      PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
+      return nullptr;
+    }
+    memcpy(data, tensor->data.raw, tensor->bytes);
+    PyObject* np_array =
+        PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
+    PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
+                        NPY_ARRAY_OWNDATA);
+    return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+  } else {
+    // Create a C-order array so the data is contiguous in memory.
+    const int32_t kCOrder = 0;
+    PyObject* py_object =
+        PyArray_EMPTY(dims.size(), dims.data(), NPY_OBJECT, kCOrder);
+
+    if (py_object == nullptr) {
+      PyErr_SetString(PyExc_MemoryError, "Failed to allocate PyArray.");
+      return nullptr;
+    }
+
+    PyArrayObject* py_array = reinterpret_cast<PyArrayObject*>(py_object);
+    PyObject** data = reinterpret_cast<PyObject**>(PyArray_DATA(py_array));
+    auto num_strings = GetStringCount(tensor->data.raw);
+    for (int j = 0; j < num_strings; ++j) {
+      auto ref = GetString(tensor->data.raw, j);
+
+      PyObject* bytes = PyBytes_FromStringAndSize(ref.str, ref.len);
+      if (bytes == nullptr) {
+        Py_DECREF(py_object);
+        PyErr_Format(PyExc_ValueError,
+                     "Could not create PyBytes from string %d of input %d.", j,
+                     i);
+        return nullptr;
+      }
+      // PyArray_EMPTY produces an array full of Py_None, which we must decref.
+      Py_DECREF(data[j]);
+      data[j] = bytes;
+    }
+    return py_object;
   }
-  memcpy(data, tensor->data.raw, tensor->bytes);
-  PyObject* np_array =
-      PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
-  PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
-                      NPY_ARRAY_OWNDATA);
-  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
 }
 
 PyObject* InterpreterWrapper::tensor(PyObject* base_object, int i) {
@@ -477,7 +419,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
   char * buf = nullptr;
   Py_ssize_t length;
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
     return nullptr;
   }
   std::unique_ptr<tflite::FlatBufferModel> model =
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
index f52ef1eeca7db397d84d249b74445a3276bc65fb..ef4b28f04723ab8d7f4f395a028bb565b4ca9cf3 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
 %}
 
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.cc b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff5403d2a60a66886681db73c4aa69bf43369170
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
@@ -0,0 +1,25 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define TFLITE_IMPORT_NUMPY  // See numpy.h for explanation.
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy() { import_array1(); }
+
+}  // namespace python
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.h b/tensorflow/lite/python/interpreter_wrapper/numpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3b013fcb27ad1837dfb83efbcec2ae800850058
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+
+#ifdef PyArray_Type
+#error "Numpy cannot be included before numpy.h."
+#endif
+
+// Disallow Numpy 1.7 deprecated symbols.
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+
+// To handle PyArray_* calles, numpy defines a static lookup table called
+// PyArray_API, or PY_ARRAY_UNIQUE_SYMBOL, if defined. This causes the
+// PyArray_* pointers to be different for different translation units, unless
+// we take care of selectivel defined NO_IMPORT_ARRAY.
+//
+// Virtually every usage will define NO_IMPORT_ARRAY, and will have access to
+// the lookup table via:
+//   extern void **PyArray_API;
+// In numpy.cc we will define TFLITE_IMPORT_NUMPY, effectively disabling that
+// and instead using:
+//   void **PyArray_API;
+// which is initialized when ImportNumpy() is called.
+//
+// If we don't define PY_ARRAY_UNIQUE_SYMBOL then PyArray_API is a static
+// variable, which causes strange crashes when the pointers are used across
+// translation unit boundaries.
+//
+// For mone info see https://sourceforge.net/p/numpy/mailman/message/5700519
+// See also tensorflow/python/lib/core/numpy.h for a similar approach.
+#define PY_ARRAY_UNIQUE_SYMBOL _tensorflow_numpy_api
+#ifndef TFLITE_IMPORT_NUMPY
+#define NO_IMPORT_ARRAY
+#endif
+
+#include <Python.h>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy();
+
+}  // namespace python
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..803a4c29345a44bcdba41d851884fa86d6e87d3e
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+// Report an error message
+int PythonErrorReporter::Report(const char* format, va_list args) {
+  char buf[1024];
+  int formatted = vsnprintf(buf, sizeof(buf), format, args);
+  buffer_ << buf;
+  return formatted;
+}
+
+// Set's a Python runtime exception with the last error.
+PyObject* PythonErrorReporter::exception() {
+  std::string last_message = message();
+  PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
+  return nullptr;
+}
+
+// Gets the last error message and clears the buffer.
+std::string PythonErrorReporter::message() {
+  std::string value = buffer_.str();
+  buffer_.clear();
+  return value;
+}
+}  // namespace interpreter_wrapper
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d4e308834a21b795644f0c1f89607a3b75ad7ce
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+
+#include <Python.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+class PythonErrorReporter : public tflite::ErrorReporter {
+ public:
+  PythonErrorReporter() {}
+
+  // Report an error message
+  int Report(const char* format, va_list args) override;
+
+  // Sets a Python runtime exception with the last error and
+  // clears the error message buffer.
+  PyObject* exception();
+
+  // Gets the last error message and clears the buffer.
+  std::string message();
+
+ private:
+  std::stringstream buffer_;
+};
+
+}  // namespace interpreter_wrapper
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3d713630f6d39dd21b3c01cc4c75d4408243827
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
@@ -0,0 +1,180 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+
+#include <memory>
+
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
+namespace tflite {
+namespace python_utils {
+
+struct PyObjectDereferencer {
+  void operator()(PyObject* py_object) const { Py_DECREF(py_object); }
+};
+
+using UniquePyObjectRef = std::unique_ptr<PyObject, PyObjectDereferencer>;
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
+  switch (tf_lite_type) {
+    case kTfLiteFloat32:
+      return NPY_FLOAT32;
+    case kTfLiteInt32:
+      return NPY_INT32;
+    case kTfLiteInt16:
+      return NPY_INT16;
+    case kTfLiteUInt8:
+      return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
+    case kTfLiteInt64:
+      return NPY_INT64;
+    case kTfLiteString:
+      return NPY_STRING;
+    case kTfLiteBool:
+      return NPY_BOOL;
+    case kTfLiteComplex64:
+      return NPY_COMPLEX64;
+    case kTfLiteNoType:
+      return NPY_NOTYPE;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return NPY_NOTYPE;
+}
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
+  int pyarray_type = PyArray_TYPE(array);
+  switch (pyarray_type) {
+    case NPY_FLOAT32:
+      return kTfLiteFloat32;
+    case NPY_INT32:
+      return kTfLiteInt32;
+    case NPY_INT16:
+      return kTfLiteInt16;
+    case NPY_UINT8:
+      return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
+    case NPY_INT64:
+      return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE:
+      return kTfLiteString;
+    case NPY_COMPLEX64:
+      return kTfLiteComplex64;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return kTfLiteNoType;
+}
+
+#if PY_VERSION_HEX >= 0x03030000
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  Py_ssize_t len = -1;
+  const char* buf = PyUnicode_AsUTF8AndSize(value, &len);
+  if (buf == NULL) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8AndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#else
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  UniquePyObjectRef utemp(PyUnicode_AsUTF8String(value));
+  if (!utemp) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8String() failed.");
+    return false;
+  }
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(utemp.get(), &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#endif
+
+bool FillStringBufferFromPyString(PyObject* value,
+                                  DynamicBuffer* dynamic_buffer) {
+  if (PyUnicode_Check(value)) {
+    return FillStringBufferFromPyUnicode(value, dynamic_buffer);
+  }
+
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(value, &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer) {
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+  switch (PyArray_TYPE(array)) {
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE: {
+      UniquePyObjectRef iter(PyArray_IterNew(value));
+      while (PyArray_ITER_NOTDONE(iter.get())) {
+        UniquePyObjectRef item(PyArray_GETITEM(
+            array, reinterpret_cast<char*>(PyArray_ITER_DATA(iter.get()))));
+
+        if (!FillStringBufferFromPyString(item.get(), dynamic_buffer)) {
+          return false;
+        }
+
+        PyArray_ITER_NEXT(iter.get());
+      }
+      return true;
+    }
+    default:
+      break;
+  }
+
+  PyErr_Format(PyExc_ValueError,
+               "Cannot use numpy array of type %d for string tensor.",
+               PyArray_TYPE(array));
+  return false;
+}
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_AsStringAndSize(obj, data, length);
+#else
+  return PyString_AsStringAndSize(obj, data, length);
+#endif
+}
+
+PyObject* ConvertToPyString(const char* data, size_t length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_FromStringAndSize(data, length);
+#else
+  return PyString_FromStringAndSize(data, length);
+#endif
+}
+
+}  // namespace python_utils
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.h b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4677378cbc177b42c1b802b40beeba86ed605c4
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace python_utils {
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type);
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array);
+
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer);
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length);
+PyObject* ConvertToPyString(const char* data, size_t length);
+
+}  // namespace python_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 3b0aa02b7c1c5215908c86b35525566669a0cd30..fc8ae7790fcc87713b171362516144376919ba6c 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -12,31 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow Lite tooling helper functionality.
+"""TensorFlow Lite tooling helper functionality."""
 
-EXPERIMENTAL: APIs here are unstable and likely to change without notice.
-
-@@TocoConverter
-@@TFLiteConverter
-@@toco_convert
-@@toco_convert_protos
-@@Interpreter
-@@OpHint
-@@convert_op_hints_to_stubs
-@@build_toco_convert_protos
-
-@@TFLITE
-@@GRAPHVIZ_DOT
-
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+import enum
 from six import PY3
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
+from tensorflow.lite.experimental.examples.lstm.rnn import dynamic_rnn  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TFLiteLSTMCell  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TfLiteRNNCell  # pylint: disable=unused-import
 from tensorflow.lite.python import lite_constants as constants
 from tensorflow.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.lite.python.convert import ConverterError  # pylint: disable=unused-import
@@ -52,12 +42,17 @@ from tensorflow.lite.python.convert_saved_model import set_tensor_shapes as _set
 from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
+from tensorflow.lite.python.optimize import calibrator as _calibrator
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2 as _rewriter_config_pb2
 from tensorflow.core.protobuf import config_pb2 as _config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2 as _meta_graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
+from tensorflow.python.eager import def_function as _def_function
+from tensorflow.python.eager import function as _function
+from tensorflow.python.framework import convert_to_constants as _convert_to_constants
+from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import graph_util as _tf_graph_util
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework.errors_impl import NotFoundError as _NotFoundError
@@ -71,18 +66,20 @@ from tensorflow.python.util import deprecation as _deprecation
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
-def _run_graph_optimizations(graph_def, input_arrays, output_arrays):
+def _run_graph_optimizations(graph_def, input_arrays, output_arrays,
+                             graph=None):
   """Apply standard TensorFlow optimizations to the graph_def.
 
   Args:
     graph_def: Frozen GraphDef to be optimized.
     input_arrays: List of arrays that are considered inputs of the graph.
     output_arrays: List of arrays that are considered outputs of the graph.
+    graph: TensorFlow Graph. Required when Eager mode is enabled. (default None)
 
   Returns:
     A new, optimized GraphDef.
   """
-  meta_graph = _export_meta_graph(graph_def=graph_def)
+  meta_graph = _export_meta_graph(graph_def=graph_def, graph=graph)
 
   # We need to add a collection called 'train_op' so that grappler
   # knows what the outputs are.
@@ -100,7 +97,220 @@ def _run_graph_optimizations(graph_def, input_arrays, output_arrays):
   return _tf_optimizer.OptimizeGraph(config, meta_graph)
 
 
-@_tf_export("lite.TFLiteConverter")
+@_tf_export("lite.Optimize")
+class Optimize(enum.Enum):
+  """Enum defining the optimizations to apply when generating tflite graphs.
+
+  Some optimizations may come at the cost of accuracy.
+  """
+
+  # Optimize for size.
+  #
+  # Optimizations that reduce the size of the model.
+  # The model size will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_SIZE = "OPTIMIZE_FOR_SIZE"
+
+  # Optimize for latency.
+  #
+  # Optimizations that reduce the latency of the model.
+  # The model latency will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_LATENCY = "OPTIMIZE_FOR_LATENCY"
+
+  def __str__(self):
+    return self.value
+
+
+@_tf_export("lite.RepresentativeDataset")
+class RepresentativeDataset(object):
+  """Representative dataset to evaluate optimizations.
+
+  A representative dataset that can be used to evaluate optimizations by the
+  converter. E.g. converter can use these examples to estimate (min, max) ranges
+  by calibrating the model on inputs. This can allow converter to quantize a
+  converted floating point model.
+  """
+
+  def __init__(self, input_gen, output_gen=None):
+    """Creates a representative dataset.
+
+    Args:
+      input_gen: an input generator that can be used to generate input samples
+        for the model. This must be a callable object that returns an object
+        that supports the `iter()` protocol (e.g. a generator function). The
+        elements generated must have same type and shape as inputs to the model.
+      output_gen: (optional) an output generator that can be used to generate
+        output samples for the model. This must be a callable object that
+        returns an object that supports the `iter()` protocol (e.g. a generator
+        function). The elements generated must have same type and shape as
+        outputs to the model. (default None)
+    """
+    self.input_gen = input_gen
+    self.output_gen = output_gen
+
+
+@_tf_export("lite.TargetSpec")
+class TargetSpec(object):
+  """Specification of target device.
+
+  Details about target device. Converter optimizes the generated model for
+  specific device.
+
+  Attributes:
+    supported_ops: Experimental flag, subject to change. Set of OpsSet options
+      supported by the device. (default set([OpsSet.TFLITE_BUILTINS]))
+  """
+
+  def __init__(self, supported_ops=None):
+    if supported_ops is None:
+      supported_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.supported_ops = supported_ops
+
+
+@_tf_export("lite.TFLiteConverter", v1=[])
+class TFLiteConverterV2(object):
+  """Converts a TensorFlow model into TensorFlow Lite model.
+
+  Attributes:
+    allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver. (default
+      False)
+    target_spec: Experimental flag, subject to change. Specification of target
+      device.
+    optimizations: Experimental flag, subject to change, A list of optimizations
+      to apply when converting the model. The converter applies the
+      optimizations by giving priority to the optimizations specified earlier in
+      the list. E.g. `[optimize.OPTIMIZE_FOR_SIZE,
+      optimize.OPTIMIZE_FOR_LATENCY]` requires the converter to do both size and
+      latency optimizations giving priority to size optimizations over latency
+      optimizations.
+    representative_dataset: A representative dataset that can be used to
+      generate input and output samples for the model. The converter can use the
+      dataset to evaluate different optimizations.
+
+  Example usage:
+
+    ```python
+    # Converting a GraphDef from a ConcreteFunction.
+    converter = lite.TFLiteConverter.from_concrete_function(func)
+    tflite_model = converter.convert()
+    open("converted_model.tflite", "wb").write(tflite_model)
+    ```
+  """
+
+  def __init__(self, func):
+    """Constructor for TFLiteConverter.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+    """
+    self._func = func
+    self.allow_custom_ops = False
+    self.target_spec = TargetSpec()
+    self.representative_dataset = None
+    self.optimizations = []
+
+  @classmethod
+  def from_concrete_function(cls, func):
+    """Creates a TFLiteConverter class from a ConcreteFunction.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+
+    Returns:
+      TFLiteConverter class.
+    """
+    if not isinstance(func, _function.ConcreteFunction):
+      message = "This function takes in a ConcreteFunction."
+      if isinstance(func, _def_function.Function):
+        message += (" To get the ConcreteFunction from a Function,"
+                    " call from_concrete_function.")
+      raise ValueError(message)
+    return cls(func)
+
+  def convert(self):
+    """Converts a TensorFlow GraphDef based on instance variables.
+
+    Returns:
+      The converted data in serialized format.
+
+    Raises:
+      ValueError:
+        Input shape is not specified.
+        None value for dimension in input_tensor.
+    """
+    frozen_func = _convert_to_constants.convert_variables_to_constants_v2(
+        self._func)
+    input_tensors = [
+        tensor for tensor in frozen_func.inputs
+        if tensor.dtype != _dtypes.resource
+    ]
+    output_tensors = frozen_func.outputs
+
+    # Run a Grappler pass.
+    graph_def = _run_graph_optimizations(frozen_func.graph.as_graph_def(),
+                                         input_tensors, output_tensors,
+                                         frozen_func.graph)
+
+    # Checks dimensions in input tensor.
+    for tensor in input_tensors:
+      # Note that shape_list might be empty for scalar shapes.
+      shape_list = tensor.shape.as_list()
+      if None in shape_list[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
+      elif shape_list and shape_list[0] is None:
+        # Set the batch size to 1 if undefined.
+        shape = tensor.shape.as_list()
+        shape[0] = 1
+        tensor.set_shape(shape)
+
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError("`representative_dataset` must be an instance of "
+                        "`RepresentativeDataset`")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for `representative_dataset`")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(
+            set(self.optimizations)
+            & set([Optimize.OPTIMIZE_FOR_LATENCY, Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
+
+    converter_kwargs = {
+        "input_format": constants.TENSORFLOW_GRAPHDEF,
+        "allow_custom_ops": self.allow_custom_ops,
+        "post_training_quantize": weights_only_quantize_flag,
+        "target_ops": self.target_spec.supported_ops,
+    }
+
+    # Converts model.
+    result = _toco_convert_impl(
+        input_data=graph_def,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
+    return result
+
+
+@_tf_export(v1=["lite.TFLiteConverter"])
 class TFLiteConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
@@ -141,10 +351,11 @@ class TFLiteConverter(object):
       created for any op that is unknown. The developer will need to provide
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
-    post_training_quantize: Boolean indicating whether to quantize the weights
-      of the converted float model. Model size will be reduced and there will be
-      latency improvements (at the cost of accuracy).
-      (default False)
+    post_training_quantize: deprecated, please specify
+     `[optimize.OPTIMIZE_FOR_SIZE]` for `optimizations` instead. Boolean
+     indicating whether to quantize the weights of the converted float model.
+     Model size will be reduced and there will be latency improvements
+     (at the cost of accuracy). (default False)
     dump_graphviz_dir: Full filepath of folder to dump the graphs at various
       stages of processing GraphViz .dot files. Preferred over
       --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
@@ -154,6 +365,16 @@ class TFLiteConverter(object):
     target_ops: Experimental flag, subject to change. Set of OpsSet
       options indicating which converter to use.
       (default set([OpsSet.TFLITE_BUILTINS]))
+    optimizations: Experimental flag, subject to change, A list of
+      optimizations to apply when converting the model. The converter applies
+      the optimizations by giving priority to the optimizations specified
+      earlier in the list. E.g.
+      `[optimize.OPTIMIZE_FOR_SIZE, optimize.OPTIMIZE_FOR_LATENCY]` requires
+      the converter to do both size and latency optimizations giving priority
+      to size optimizations over latency optimizations.
+    representative_dataset: A representative dataset that can be used to
+      generate input and output samples for the model. The converter can use
+      the dataset to evaluate different optimizations.
 
   Example usage:
 
@@ -190,7 +411,7 @@ class TFLiteConverter(object):
     Args:
       graph_def: Frozen TensorFlow GraphDef.
       input_tensors: List of input tensors. Type and shape are computed using
-        `foo.get_shape()` and `foo.dtype`.
+        `foo.shape` and `foo.dtype`.
       output_tensors: List of output tensors (only .name is used from this).
       input_arrays_with_shape: Tuple of strings representing input tensor names
         and list of integers representing input shapes
@@ -216,10 +437,12 @@ class TFLiteConverter(object):
     self.reorder_across_fake_quant = False
     self.change_concat_input_ranges = False
     self.allow_custom_ops = False
-    self.post_training_quantize = False
+    self._post_training_quantize = False
     self.dump_graphviz_dir = None
     self.dump_graphviz_video = False
     self.target_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.representative_dataset = None
+    self.optimizations = []
 
     # Attributes are used by models that cannot be loaded into TensorFlow.
     if not self._has_valid_tensors():
@@ -237,7 +460,7 @@ class TFLiteConverter(object):
     Args:
       sess: TensorFlow Session.
       input_tensors: List of input tensors. Type and shape are computed using
-        `foo.get_shape()` and `foo.dtype`.
+        `foo.shape` and `foo.dtype`.
       output_tensors: List of output tensors (only .name is used from this).
 
     Returns:
@@ -419,6 +642,27 @@ class TFLiteConverter(object):
     graph_def = _freeze_graph(sess, output_tensors)
     return cls(graph_def, input_tensors, output_tensors)
 
+  def __setattr__(self, name, value):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      if value:
+        # Use OPTIMIZE_FOR_SIZE for post training for now.
+        self.optimizations = [Optimize.OPTIMIZE_FOR_SIZE]
+      else:
+        self.optimizations = []
+      return
+    object.__setattr__(self, name, value)
+
+  def __getattribute__(self, name):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      return Optimize.OPTIMIZE_FOR_SIZE in set(self.optimizations)
+    return object.__getattribute__(self, name)
+
   def convert(self):
     """Converts a TensorFlow GraphDef based on instance variables.
 
@@ -434,7 +678,7 @@ class TFLiteConverter(object):
     # Checks dimensions in input tensor.
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
-        shape = tensor.get_shape()
+        shape = tensor.shape
         if not shape:
           raise ValueError("Provide an input shape for input array "
                            "'{0}'.".format(_tensor_name(tensor)))
@@ -463,6 +707,24 @@ class TFLiteConverter(object):
                          "tensors '{0}'.".format(",".join(invalid_stats)))
     else:
       quantized_stats = None
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError(
+            "representative_dataset must be an instance of "
+            "RepresentativeDataset")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(set(self.optimizations) & set([Optimize.OPTIMIZE_FOR_LATENCY,
+                                           Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
 
     converter_kwargs = {
         "inference_type": self.inference_type,
@@ -475,7 +737,7 @@ class TFLiteConverter(object):
         "reorder_across_fake_quant": self.reorder_across_fake_quant,
         "change_concat_input_ranges": self.change_concat_input_ranges,
         "allow_custom_ops": self.allow_custom_ops,
-        "post_training_quantize": self.post_training_quantize,
+        "post_training_quantize": weights_only_quantize_flag,
         "target_ops": self.target_ops,
         "dump_graphviz_dir": self.dump_graphviz_dir,
         "dump_graphviz_video": self.dump_graphviz_video
@@ -504,6 +766,12 @@ class TFLiteConverter(object):
           input_arrays_with_shape=self._input_arrays_with_shape,
           output_arrays=self._output_arrays,
           **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
     return result
 
   def get_input_arrays(self):
@@ -540,12 +808,12 @@ class TFLiteConverter(object):
                        "use input_shapes parameter.")
 
     for tensor in self._input_tensors:
-      shape = tensor.get_shape().as_list()
+      shape = tensor.shape.as_list()
       shape[0] = batch_size
       tensor.set_shape(shape)
 
 
-@_tf_export("lite.TocoConverter")
+@_tf_export(v1=["lite.TocoConverter"])
 class TocoConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ae629413e782d011fafdb3b7e294cd884a301c
--- /dev/null
+++ b/tensorflow/lite/python/lite_flex_test.py
@@ -0,0 +1,58 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lite.py functionality related to select TF op usage."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.lite.python import lite
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_v1_only('b/120545219')
+class FromSessionTest(test_util.TensorFlowTestCase):
+
+  def testFlexMode(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    converter.target_ops = set([lite.OpsSet.SELECT_TF_OPS])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Ensures the model contains TensorFlow ops.
+    # TODO(nupurgarg): Check values once there is a Python delegate interface.
+    interpreter = Interpreter(model_content=tflite_model)
+    with self.assertRaises(RuntimeError) as error:
+      interpreter.allocate_tensors()
+    self.assertIn(
+        'Regular TensorFlow ops are not supported by this interpreter. Make '
+        'sure you invoke the Flex delegate before inference.',
+        str(error.exception))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index 83fd56bf1d2617b7132d0eb2314c80460e968c18..14d08ec70a6f05fd8a971dce3caf8740375e7f3a 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
 from tensorflow.python.platform import gfile
@@ -131,13 +132,13 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     input_details = interpreter.get_input_details()
     self.assertEqual(1, len(input_details))
     self.assertEqual('Placeholder', input_details[0]['name'])
-    self.assertEqual(np.object_, input_details[0]['dtype'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
     self.assertTrue(([4] == input_details[0]['shape']).all())
 
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('Reshape', output_details[0]['name'])
-    self.assertEqual(np.object_, output_details[0]['dtype'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
     self.assertTrue(([2, 2] == output_details[0]['shape']).all())
     # TODO(b/122659643): Test setting/getting string data via the python
     # interpreter API after support has been added.
@@ -481,6 +482,29 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
+  def testPostTrainingQuantizeDeprecatedAttribute(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[33, 33], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = constant_op.constant(
+        np.random.uniform(low=-10., high=10., size=(33, 33)),
+        shape=[33, 33],
+        dtype=dtypes.float32,
+        name='inputB')
+    out_tensor = math_ops.matmul(in_tensor_1, in_tensor_2, name='output')
+    sess = session.Session()
+
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [in_tensor_1], [out_tensor])
+    self.assertFalse(quantized_converter.post_training_quantize)
+
+    quantized_converter.post_training_quantize = True
+    self.assertTrue(quantized_converter.post_training_quantize)
+    self.assertEqual(quantized_converter.optimizations,
+                     [lite.Optimize.OPTIMIZE_FOR_SIZE])
+
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
   def testPostTrainingQuantize(self):
     np.random.seed(0)
     # We need the tensor to have more than 1024 elements for quantize_weights
@@ -504,35 +528,58 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Convert quantized weights model.
     quantized_converter = lite.TFLiteConverter.from_session(
         sess, [in_tensor_1], [out_tensor])
-    quantized_converter.post_training_quantize = True
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
 
     # Ensure that the quantized weights tflite model is smaller.
     self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
-  def testFlexMode(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    out_tensor = in_tensor + in_tensor
+  def testPostTrainingCalibrateAndQuantize(self):
+    np.random.seed(0)
+    # Create a mobilenet like model.
+    output_channel = 16
+    depth_multiplier = 1
+    inp = array_ops.placeholder(dtype=dtypes.float32, shape=(1, 5, 5, 3))
+    conv = nn_ops.conv2d(
+        inp,
+        filter=array_ops.zeros([3, 3, 3, output_channel]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    dconv = nn_ops.depthwise_conv2d_native(
+        conv,
+        filter=array_ops.zeros(
+            [16, 16, output_channel, output_channel * depth_multiplier]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    pool = nn_ops.pool(
+        dconv, window_shape=[2, 2], pooling_type='AVG', padding='SAME')
+    max_pool = nn_ops.pool(
+        pool, window_shape=[2, 2], pooling_type='MAX', padding='SAME')
+    output = nn_ops.softmax(max_pool)
+
+    def calibration_gen():
+      for _ in range(10):
+        yield [np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)]
+
     sess = session.Session()
 
-    # Convert model and ensure model is not None.
-    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
-                                                  [out_tensor])
-    converter.target_ops = set([lite.OpsSet.SELECT_TF_OPS])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+    # Convert float model.
+    float_converter = lite.TFLiteConverter.from_session(sess, [inp], [output])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
 
-    # Ensures the model contains TensorFlow ops.
-    # TODO(nupurgarg): Check values once there is a Python delegate interface.
-    interpreter = Interpreter(model_content=tflite_model)
-    with self.assertRaises(RuntimeError) as error:
-      interpreter.allocate_tensors()
-    self.assertIn(
-        'Regular TensorFlow ops are not supported by this interpreter. Make '
-        'sure you invoke the Flex delegate before inference.',
-        str(error.exception))
+    # Convert quantized weights model.
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [inp], [output])
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.representative_dataset = lite.RepresentativeDataset(
+        calibration_gen)
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
   def testFloatTocoConverter(self):
     """Tests deprecated test TocoConverter."""
@@ -550,6 +597,35 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     interpreter = Interpreter(model_content=tflite_model)
     interpreter.allocate_tensors()
 
+  def testMultipleOutputNodeNames(self):
+    """Tests converting a graph with an op that have multiple outputs."""
+    input_tensor = array_ops.placeholder(shape=[4], dtype=dtypes.float32)
+    out0, out1, out2, out3 = array_ops.split(input_tensor, [1, 1, 1, 1], axis=0)
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [input_tensor],
+                                                  [out0, out1, out2, out3])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    interpreter.set_tensor(input_details[0]['index'],
+                           np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32))
+    interpreter.invoke()
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(4, len(output_details))
+    self.assertEqual(1.0, interpreter.get_tensor(output_details[0]['index']))
+    self.assertEqual(2.0, interpreter.get_tensor(output_details[1]['index']))
+    self.assertEqual(3.0, interpreter.get_tensor(output_details[2]['index']))
+    self.assertEqual(4.0, interpreter.get_tensor(output_details[3]['index']))
+
 
 @test_util.run_v1_only('b/120545219')
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
@@ -694,7 +770,7 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
   # TODO(nupurgarg): Test model loading in open source.
   def _initObjectDetectionArgs(self):
     # Initializes the arguments required for the object detection model.
-    # Looks for the model file which is saved in a different location interally
+    # Looks for the model file which is saved in a different location internally
     # and externally.
     filename = resource_loader.get_path_to_datafile('testdata/tflite_graph.pb')
     if not os.path.exists(filename):
diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2b0d8235a192abc0a459a7f85e40b1c893ec99c
--- /dev/null
+++ b/tensorflow/lite/python/lite_v2_test.py
@@ -0,0 +1,215 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lite.py functionality related to TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.lite.python import lite
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.python import keras
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+class FromConcreteFunctionTest(test_util.TensorFlowTestCase):
+
+  def _evaluateTFLiteModel(self, tflite_model, input_data):
+    """Evaluates the model on the `input_data`."""
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    for input_tensor, tensor_data in zip(input_details, input_data):
+      interpreter.set_tensor(input_tensor['index'], tensor_data.numpy())
+    interpreter.invoke()
+    return interpreter.get_tensor(output_details[0]['index'])
+
+  @test_util.run_v2_only
+  def testTypeInvalid(self):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+
+    with self.assertRaises(ValueError) as error:
+      _ = lite.TFLiteConverterV2.from_concrete_function(root.f)
+    self.assertIn('call from_concrete_function', str(error.exception))
+
+  @test_util.run_v2_only
+  def testFloat(self):
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testSizeNone(self):
+    # Test with a shape of None
+    input_data = constant_op.constant(1., shape=None)
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.f = def_function.function(lambda x: root.v1 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    concrete_func = root.add.get_concrete_function(input_data)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testKerasModel(self):
+    input_data = constant_op.constant(1., shape=[1, 1])
+
+    # Create a simple Keras model.
+    x = [-1, 0, 1, 2, 3, 4]
+    y = [-3, -1, 1, 3, 5, 7]
+
+    model = keras.models.Sequential(
+        [keras.layers.Dense(units=1, input_shape=[1])])
+    model.compile(optimizer='sgd', loss='mean_squared_error')
+    model.fit(x, y, epochs=1)
+
+    # Get the concrete function from the Keras model.
+    @def_function.function
+    def to_save(x):
+      return model(x)
+
+    concrete_func = to_save.get_concrete_function(
+        tensor_spec.TensorSpec([None, 1], dtypes.float32))
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = to_save(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
index 6ec050171fc39308c36ec8f43af639f59f4f387c..aec4b28a33e1e6a3242929a40de9f809c64d23c4 100644
--- a/tensorflow/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -36,9 +36,7 @@ Example:
   session = tf.Session()
 
   graphdef_to_convert = tf.lite.convert_op_hints_to_stubs(session)
-  tflite_graph = tf.lite.toco_convert(graphdef_to_convert,
-                                              [image], [output])
-                                              [image], [output])
+  tflite_graph = tf.lite.toco_convert(graphdef_to_convert, [image], [output])
   with open("/tmp/graph.fb", "wb") as fp:
     fp.write(tflite_graph)
 
@@ -73,6 +71,7 @@ from __future__ import print_function
 
 import collections as _collections
 import copy as _copy
+import json as _json
 import uuid as _uuid
 import six as _six
 
@@ -134,6 +133,14 @@ class OpHint(object):
   # "stuff", "foo", "bar", -1 (where -1 is unused). So you would set this
   # attribute to [2, 0, 1, -1].
   TFLITE_INPUT_INDICES = "_tflite_input_indices"
+  # OpHint level.
+  FUNCTION_LEVEL_ATTR = "_tflite_ophint_level"
+  # Ophint internal mapping, this is for high level Ophint only.
+  # This basically contains three kinds of mapping:
+  #   1) How parental ophinted inputs map to the first child ophinted inputs;
+  #   2) How internal children nodes are connected;
+  #   3) How parental ophinted outputs map to the last child ophinted outputs.
+  CHILDREN_INPUTS_MAPPINGS = "_tflite_children_ophint_inputs_mapping"
 
   # Types of aggregations
   #  stack: stacks all ophints with matching tags. i.e. for a static rnn.
@@ -151,10 +158,16 @@ class OpHint(object):
     """Conceptually tracks indices of arguments of "OpHint functions".
 
     The inputs and arguments of these functions both use an instance
-    of the class so they can have independent numbering."""
+    of the class so they can have independent numbering.
+    """
 
-    def __init__(self, function_name, unique_function_id, node_name_prefix,
-                 attr_name):
+    def __init__(self,
+                 function_name,
+                 unique_function_id,
+                 node_name_prefix,
+                 attr_name,
+                 level=1,
+                 children_inputs_mappings=None):
       """Initialize ophint argument.
 
       Args:
@@ -163,6 +176,8 @@ class OpHint(object):
         node_name_prefix: How identities that are created are named.
         attr_name: Name of attribute to use to store the index for this hint.
           i.e. FUNCTION_INPUT_INDEX or FUNCTION_OUTPUT_INDEX
+        level: Hierarchical level of the Ophint node, a number.
+        children_inputs_mappings: Inputs/Outputs mapping for children hints.
       """
 
       # The global index is the argument index of the op. This is in contrast
@@ -178,6 +193,8 @@ class OpHint(object):
       self._tag_to_next_sort_index = {}  # The current index for each tag
       self._node_name_prefix = node_name_prefix
       self._attr_name = attr_name
+      self._level = level
+      self._children_inputs_mappings = children_inputs_mappings
 
     def _get_new_global_index(self, index_override):
       """Return the next unused argument index in order or use an override.
@@ -217,7 +234,7 @@ class OpHint(object):
           and OpHint.AGGREGATE_STACK.
           Note, aggregate is only valid if tag is specified.
         index_override: Specify what input/output index should this be in the
-          final stub. i.e. add(arg0, index=1); add(arg1, index=0) wil make the
+          final stub. i.e. add(arg0, index=1); add(arg1, index=0) will make the
           final stub be as stub_func(inputs[arg1, arg0], outputs=[]) rather than
           the default call order based ordering.
 
@@ -253,6 +270,7 @@ class OpHint(object):
       uuid = self._unique_function_id
       name = "%s-%s-%s-%r-%r-%s" % (self._node_name_prefix, self._function_name,
                                     uuid, global_index, sort_index, name)
+
       identity_op = _array_ops.identity(arg, name=name)
 
       # pylint: disable=protected-access
@@ -266,6 +284,15 @@ class OpHint(object):
               s=_compat.as_bytes(self._unique_function_id)))
       identity_op.op._set_attr(
           self._attr_name, _attr_value_pb2.AttrValue(i=global_index))
+      identity_op.op._set_attr(OpHint.FUNCTION_LEVEL_ATTR,
+                               _attr_value_pb2.AttrValue(i=self._level))
+      if self._children_inputs_mappings:
+        identity_op.op._set_attr(
+            OpHint.CHILDREN_INPUTS_MAPPINGS,
+            _attr_value_pb2.AttrValue(
+                s=_compat.as_bytes(_json.dumps(
+                    self._children_inputs_mappings))))
+
       if sort_index is not None:
         identity_op.op._set_attr(
             OpHint.FUNCTION_SORT_INDEX_ATTR,
@@ -277,23 +304,74 @@ class OpHint(object):
       # pylint: enable=protected-access
       return identity_op
 
-  def __init__(self, function_name, **kwargs):
+  def __init__(self,
+               function_name,
+               level=1,
+               children_inputs_mappings=None,
+               **kwargs):
     """Create a OpHint.
 
     Args:
       function_name: Name of the function (the custom op name in tflite)
+      level: OpHint level.
+      children_inputs_mappings: Children OpHint inputs/outputs mapping.
+        children_inputs_mappings should like below:
+        "parent_first_child_input":
+            [{"parent_input_index": num, "child_input_index": num}, ...]
+        "parent_last_child_output":
+            [{"parent_output_index": num, "child_output_index": num}, ...]
+        "internal_children_input_output":
+            [{"child_input_index": num, "child_output_index": num}, ...]
       **kwargs: Keyword arguments of any constant attributes for the function.
     """
     self._function_name = function_name
+    self._level = level
+    if self._level == 1:
+      assert children_inputs_mappings is None
+    else:
+      assert isinstance(children_inputs_mappings, dict)
+    self._children_inputs_mappings = children_inputs_mappings
+    if self._children_inputs_mappings is not None:
+      self._validate_children_inputs_mappings(self._children_inputs_mappings)
     self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
     self._attrs_to_store_later = kwargs
     self._stored_attrs = False
     self._inputs = OpHint.OpHintArgumentTracker(
         self._function_name, self._unique_function_id, "InputHint",
-        OpHint.FUNCTION_INPUT_INDEX_ATTR)
+        OpHint.FUNCTION_INPUT_INDEX_ATTR, level, self._children_inputs_mappings)
     self._outputs = OpHint.OpHintArgumentTracker(
         self._function_name, self._unique_function_id, "OutputHint",
-        OpHint.FUNCTION_OUTPUT_INDEX_ATTR)
+        OpHint.FUNCTION_OUTPUT_INDEX_ATTR, level,
+        self._children_inputs_mappings)
+
+  def _validate_children_inputs_mappings(self, children_inputs_mappings):
+    """Validate children inputs mappings is in the right format.
+
+    Args:
+      children_inputs_mappings: the Children ophint inputs/outputs mapping.
+    """
+    assert isinstance(children_inputs_mappings, dict)
+    assert "parent_first_child_input" in children_inputs_mappings
+    assert "parent_last_child_output" in children_inputs_mappings
+    assert "internal_children_input_output" in children_inputs_mappings
+
+    # validate parent_first_child_input.
+
+    def assert_dictlist_has_keys(dictlist, keys):
+      for dikt in dictlist:
+        assert isinstance(dikt, dict)
+        for key in keys:
+          assert key in dikt
+
+    assert_dictlist_has_keys(
+        children_inputs_mappings["parent_first_child_input"],
+        ["parent_ophint_input_index", "first_child_ophint_input_index"])
+    assert_dictlist_has_keys(
+        children_inputs_mappings["parent_last_child_output"],
+        ["parent_output_index", "child_output_index"])
+    assert_dictlist_has_keys(
+        children_inputs_mappings["internal_children_input_output"],
+        ["child_input_index", "child_output_index"])
 
   def _setattr(self, dest_op, name, value):
     tensor_value = _ops.convert_to_tensor(value)
@@ -384,7 +462,7 @@ class OpHint(object):
 
 
 class _LiteOperand(object):
-  """Abstract operand for a tflite hint function.
+  """Abstract operand for a tflite hint function._dynamic_rnn_loop.
 
   This is a base class that handles representing arguments to an OpHint.
   It also is able to serialize operands to the stubbed graph_def.
@@ -582,15 +660,18 @@ class _LiteFuncCall(object):
   This is uses to accumulate found hints in the graphdef into a single
   conceptual unit.
 
-  Properties:
-    self.inputs: inputs to the op (hash from index # to argument)
-    self.outputs: outputs to the op (hash from index # to argument)
-    self.function_name: the tflite custom op name to use
-    self.uuid: a unique call id for this particular call  (i.e.
+  Attributes:
+    inputs: inputs to the op (hash from index # to argument)
+    outputs: outputs to the op (hash from index # to argument)
+    function_name: the tflite custom op name to use
+    uuid: a unique call id for this particular call  (i.e.
       multiple function calls would have the same function_name but different
       uuids.
-    self.params: A param name to key value for op constant data. I.e. for
+    params: A param name to key value for op constant data. I.e. for
       axis on a reduction, strides on a convolution, etc.
+    level: Level of the OpHint.
+    children_inputs_mappings: If the Ophint has children, children inputs
+      mappings indicate how their inputs & outputs are mapped.
   """
 
   def __init__(self):
@@ -599,6 +680,8 @@ class _LiteFuncCall(object):
     self.function_name = None
     self.uuid = None
     self.params = {}
+    self.level = -1
+    self.children_inputs_mappings = {}
 
   def flattened_inputs_and_outputs(self):
     """Return a list of inputs and outputs in a flattened format.
@@ -624,22 +707,25 @@ class _LiteFuncCall(object):
     inputs_str = "\tInputs\n" + format_args(self.inputs)
     outputs_str = "\tOutputs\n" + format_args(self.outputs)
 
-    return ("tflite function %s call %s\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s"
-            % (self.function_name, self.uuid, inputs_str, outputs_str))
+    return (
+        "tflite function %s call %s level %d "
+        "\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s" %
+        (self.function_name, self.uuid, self.level, inputs_str, outputs_str))
 
 
-def _find_all_hints_in_graph_def(graphdef):
-  """Look at the current default graph and return a list of LiteFuncCall objs.
+def _find_all_hints_in_nodes(nodes):
+  """Look at the all the input nodes and return a list of LiteFuncCall objs.
 
   Args:
-    graphdef: A TensorFlow graph_def to look for LiteFuncCalls.
+    nodes: A TensorFlow graph_def to look for LiteFuncCalls.
+
   Returns:
     a list of `LifeFuncCall` objects in the form
 
   """
   func_calls = _collections.defaultdict(_LiteFuncCall)
 
-  for node in graphdef.node:
+  for node in nodes:
     attr = node.attr
     # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip
     uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
@@ -651,6 +737,7 @@ def _find_all_hints_in_graph_def(graphdef):
     call_def = func_calls[uuid]
     call_def.uuid = uuid
     call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+    call_def.level = attr[OpHint.FUNCTION_LEVEL_ATTR].i
     # Get sorting and aggregation information
 
     sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i
@@ -660,6 +747,10 @@ def _find_all_hints_in_graph_def(graphdef):
     if OpHint.FUNCTION_AGGREGATE_ATTR in attr:
       aggregation = _compat.as_text(attr[OpHint.FUNCTION_AGGREGATE_ATTR].s)
 
+    if OpHint.CHILDREN_INPUTS_MAPPINGS in attr:
+      call_def.children_inputs_mappings = _json.loads(
+          _compat.as_text(attr[OpHint.CHILDREN_INPUTS_MAPPINGS].s))
+
     # Add the input or output
     def put_operand(stuff, index, sort, operand, aggregation):
       """Add a given index into the function structure."""
@@ -685,6 +776,98 @@ def _find_all_hints_in_graph_def(graphdef):
   return func_calls
 
 
+def _extract_topology_sequence_mapping(nodes):
+  return dict(
+      (_tensor_name_base(node.name), idx) for idx, node in enumerate(nodes))
+
+
+def _find_children_hints_in_while_loop(function_def, nodes_mapping):
+  """Find children hints and all nodes inside the while loop.
+
+  Args:
+    function_def: Function def of the while loop.
+    nodes_mapping: While loop input_arg : real node name.
+
+  Returns:
+    Ordered children hints and all re-mapped nodes inside the while loop.
+  """
+  new_nodes = []
+
+  # Make nodes inside function def inputs point to the real nodes.
+  for node in function_def.node_def:
+    for i in range(len(node.input)):
+      if node.input[i] in nodes_mapping:
+        node.input[i] = nodes_mapping[node.input[i]]
+    new_nodes.append(_copy.deepcopy(node))
+  name_to_seq_num = _extract_topology_sequence_mapping(function_def.node_def)
+  children_hints = _find_all_hints_in_nodes(new_nodes)
+  children_hints_q = []
+  # Ordered by the outputs.
+  for hint in _six.itervalues(children_hints):
+    _, output_names = hint.flattened_inputs_and_outputs()
+    seq = name_to_seq_num[output_names[0]]
+    for output_name in output_names:
+      seq = min(seq, name_to_seq_num[output_name])
+    children_hints_q.append((seq, hint))
+  children_hints_q.sort(key=lambda tup: tup[0])
+  ordered_children_hints = [x[1] for x in children_hints_q]
+  return ordered_children_hints, new_nodes
+
+
+def _find_children_hints(call, graph_def):
+  """Find all children hints.
+
+  For a given OpHint, we find all children hints inside it, we also copy all the
+  nodes inside function defs (if applicable) to the original graph_def, they are
+  returned in a list as well.
+
+  Args:
+    call: Parent OpHint that contains children ophints.
+    graph_def: Original graph def.
+
+  Returns:
+    Ordered children hints inside the parent ophint; new graph def that contains
+    nodes inside function defs (if applicable); nodes inside function defs.
+  """
+  name_to_input_name, _, _ = _extract_graph_summary(graph_def)
+  input_names, output_names = call.flattened_inputs_and_outputs()
+
+  reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
+  reachable_by_output = _bfs_for_reachable_nodes(output_names,
+                                                 name_to_input_name)
+  output_nodes_set = set(output_names)
+  children_hints = []
+  out = _graph_pb2.GraphDef()
+  out.library.CopyFrom(graph_def.library)
+  out.versions.CopyFrom(graph_def.versions)
+  function_def_nodes = set()
+  for node in graph_def.node:
+    out.node.extend([_copy.deepcopy(node)])
+    n = _tensor_name_base(node.name)
+    if n in reachable_by_output:
+      if n not in reachable_by_input and n not in output_nodes_set:
+        # special handle for while loop function def.
+        if node.op == "While":
+          body_name = node.attr["body"].func.name
+          inputs_outside_loop = node.input
+          for function_def in graph_def.library.function:
+            if function_def.signature.name == body_name:
+              function_inputs = function_def.signature.input_arg
+              assert len(inputs_outside_loop) == len(function_inputs)
+              nodes_mapping = {}
+              for i in range(len(function_inputs)):
+                nodes_mapping[function_inputs[i].name] = inputs_outside_loop[i]
+              # TODO(b/123050804): Consider use grappler.
+              (children_hints_in_loop,
+               new_nodes) = _find_children_hints_in_while_loop(
+                   function_def, nodes_mapping)
+              function_def_nodes.update([x.name for x in new_nodes])
+              children_hints.extend(children_hints_in_loop)
+              out.node.extend(new_nodes)
+
+  return children_hints, out, function_def_nodes
+
+
 def _tensor_name_base(full_tensor_name):
   """Removes the device assignment code from a tensor.
 
@@ -737,12 +920,20 @@ def _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
 
 
 # TODO(aselle): This should be converted to grappler in the future.
-def _convert_single_op_hint_to_stub(call, graph_def):
+def _convert_single_op_hint_to_stub(call,
+                                    graph_def,
+                                    function_def_nodes=None,
+                                    is_last_run=True):
   """Given a graph_def, converts `call` into a stub and returns a new graph_def.
 
   Args:
     call: A single function call to be converted.
-    graph_def: A graph_def to use as input (that hass call obviously).
+    graph_def: A graph_def to use as input (that has call obviously).
+    function_def_nodes: Nodes inside the function def those are not connected to
+      the graph.
+    is_last_run: Whether it is the last run for a given pass (for OpHint has
+      children).
+
   Returns:
     A new transformed graph-def that has call as a stub (single op).
 
@@ -750,6 +941,8 @@ def _convert_single_op_hint_to_stub(call, graph_def):
       the tensorflow runtime, so all future manipulations are done in graph_def
       level.
   """
+  if function_def_nodes is None:
+    function_def_nodes = set()
   name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
       graph_def)
   input_names, output_names = call.flattened_inputs_and_outputs()
@@ -757,7 +950,6 @@ def _convert_single_op_hint_to_stub(call, graph_def):
   reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
   reachable_by_output = _bfs_for_reachable_nodes(output_names,
                                                  name_to_input_name)
-  input_nodes_set = set(input_names)
   output_nodes_set = set(output_names)
   nodes_after_fuse = []
   nodes_deleted_by_fuse = set()
@@ -768,19 +960,16 @@ def _convert_single_op_hint_to_stub(call, graph_def):
     n = _tensor_name_base(node.name)
     if n in reachable_by_output:
       if n not in reachable_by_input and n not in output_nodes_set:
-        # n is an internal node. Check to make sure it is really internal.
-        # TODO(aselle): this could be done more efficiently by flooding
-        # the graph first.
-        _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
-                               name_to_input_name)
         nodes_deleted_by_fuse.add(n)
-    elif n not in reachable_by_input:
+    elif n not in reachable_by_input and n not in function_def_nodes:
       # n is a node that after all the fusings, so keep it.
       nodes_after_fuse.append(n)
     else:
-      # n is a node that is randomly in the graph but not connected to
-      # the chain of dependencies.
-      pass
+      # In the last run, n is a node that is randomly in the graph but not
+      # connected to the chain of dependencies, we will delete n, otherwise
+      # we keep them.
+      if not is_last_run:
+        nodes_after_fuse.append(n)
 
   # Make a new graphdef with all the pre-input and input nodes
   out = _graph_pb2.GraphDef()
@@ -802,10 +991,11 @@ def _convert_single_op_hint_to_stub(call, graph_def):
   # non-fused things.
   for input_index in sorted_input_indices:
     inputs = call.inputs[input_index]
-    new_node.input.append(inputs.aggregate_and_return_name_for_input(out))
+    input_name = inputs.aggregate_and_return_name_for_input(out)
+    new_node.input.append(input_name)
   new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices)
 
-  # Ceate the function
+  # Create the function
   new_node.op = call.function_name
   new_node.name = call.uuid
   out.node.extend([new_node])
@@ -938,7 +1128,18 @@ def _remove_redundant_stack_unstack(graph_def):
   return curr
 
 
-@_tf_export("lite.convert_op_hints_to_stubs")
+def _get_correct_mapping(original_index, nodes):
+  # Special handle for the index is -1 case.
+  # If it is -1, return the last index.
+  if original_index == -1:
+    node_indices = nodes.keys()
+    node_indices = sorted(node_indices)
+    return node_indices[-1]
+  else:
+    return original_index
+  return original_index
+
+
 def _convert_op_hints_to_stubs_helper(
     graph_def, write_callback=lambda sess, graph_def: None):
   """Converts a graph_def to a new graph_def where all op hints are stubbed.
@@ -950,14 +1151,67 @@ def _convert_op_hints_to_stubs_helper(
   Returns:
     A new stubbed graph_def.
   """
+  hints = _find_all_hints_in_nodes(graph_def.node)
+
+  hints_q = []
+  for hint in _six.itervalues(hints):
+    hints_q.append((hint.level, hint.uuid))
+
+  hints_q.sort(key=lambda tup: tup[0])
+  for i in range(len(hints_q) - 1, -1, -1):
+    level, hint_uuid = hints_q[i]
 
-  hints = _find_all_hints_in_graph_def(graph_def)
   curr_graph_def = graph_def
   del graph_def  # prevent using graph_def again (common source of error)
-  for hint in _six.itervalues(hints):
-    curr_graph_def = _convert_single_op_hint_to_stub(
-        hint, curr_graph_def)
-    write_callback(curr_graph_def, "initial")
+  for i in range(len(hints_q) - 1, -1, -1):
+    level, hint_uuid = hints_q[i]
+    if level >= 2:
+      children_hints, curr_graph_def, function_def_nodes = _find_children_hints(
+          hints[hint_uuid], curr_graph_def)
+      # pylint: disable=superfluous-parens
+      assert (len(children_hints) > 0)  #  pylint: disable=g-explicit-length-test
+      # pylint: enable=superfluous-parens
+
+      # Re-wire the children hints inputs/outputs, so latter child's inputs
+      # connect to previous child node's outputs.
+      children_inputs_mappings = hints[hint_uuid].children_inputs_mappings
+      for j in range(len(children_hints)):
+        child_hint = children_hints[j]
+        if j == 0:
+          for mapping in children_inputs_mappings["parent_first_child_input"]:
+            parent_input_index = _get_correct_mapping(
+                mapping["parent_ophint_input_index"], hints[hint_uuid].inputs)
+            child_input_index = _get_correct_mapping(
+                mapping["first_child_ophint_input_index"], child_hint.inputs)
+            child_hint.inputs[child_input_index] = hints[hint_uuid].inputs[
+                parent_input_index]
+        else:
+          for mapping in children_inputs_mappings[
+              "internal_children_input_output"]:
+            input_index = _get_correct_mapping(mapping["child_input_index"],
+                                               child_hint.inputs)
+            output_index = _get_correct_mapping(mapping["child_output_index"],
+                                                children_hints[j - 1].outputs)
+            child_hint.inputs[input_index] = children_hints[
+                j - 1].outputs[output_index]
+        if j == len(children_hints) - 1:
+          for mapping in children_inputs_mappings["parent_last_child_output"]:
+            parent_output_index = _get_correct_mapping(
+                mapping["parent_output_index"], hints[hint_uuid].outputs)
+            child_output_index = _get_correct_mapping(
+                mapping["child_output_index"], child_hint.outputs)
+            child_hint.outputs[child_output_index] = hints[hint_uuid].outputs[
+                parent_output_index]
+
+      for j in range(len(children_hints)):
+        child_hint = children_hints[j]
+        curr_graph_def = _convert_single_op_hint_to_stub(
+            child_hint, curr_graph_def, function_def_nodes,
+            j == len(children_hints) - 1)
+    else:
+      curr_graph_def = _convert_single_op_hint_to_stub(hints[hint_uuid],
+                                                       curr_graph_def)
+      write_callback(curr_graph_def, "initial")
   # The stubbing process can create stacks/unstacks in the case of LSTMs
   # remove them.
   curr_graph_def = _remove_redundant_stack_unstack(curr_graph_def)
@@ -984,15 +1238,16 @@ def find_all_hinted_output_nodes(session=None, graph_def=None):
     raise ValueError("Provide only one of session and graph_def.")
   hinted_outputs_nodes = []
   if session is not None:
-    hints = _find_all_hints_in_graph_def(session.graph_def)
+    hints = _find_all_hints_in_nodes(session.graph_def.node)
   elif graph_def is not None:
-    hints = _find_all_hints_in_graph_def(graph_def)
+    hints = _find_all_hints_in_nodes(graph_def.node)
   for hint in _six.itervalues(hints):
     _, ouput_nodes = hint.flattened_inputs_and_outputs()
     hinted_outputs_nodes.extend(ouput_nodes)
   return hinted_outputs_nodes
 
 
+@_tf_export("lite.experimental.convert_op_hints_to_stubs")
 def convert_op_hints_to_stubs(session=None,
                               graph_def=None,
                               write_callback=lambda graph_def, comments: None):
diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..069612ba9f4cf00b445e40d459adb5244d3fc218
--- /dev/null
+++ b/tensorflow/lite/python/optimize/BUILD
@@ -0,0 +1,70 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+cc_library(
+    name = "calibration_wrapper_lib",
+    srcs = ["calibration_wrapper.cc"],
+    hdrs = ["calibration_wrapper.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
+        "//tensorflow/lite/python/interpreter_wrapper:python_error_reporter",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
+        "//tensorflow/lite/tools/optimize:quantize_model",
+        "//tensorflow/lite/tools/optimize/calibration:calibration_reader",
+        "//tensorflow/lite/tools/optimize/calibration:calibrator_lib",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_lite_wrap_calibration_wrapper",
+    srcs = [
+        "calibration_wrapper.i",
+    ],
+    deps = [
+        ":calibration_wrapper_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+py_library(
+    name = "calibrator",
+    srcs = [
+        "calibrator.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/python/optimize:tensorflow_lite_wrap_calibration_wrapper",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.py"],
+    data = [
+        ":test_data",
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    srcs_version = "PY2AND3",
+    tags = ["no_oss"],
+    deps = [
+        ":calibrator",
+        "//tensorflow/lite/python/optimize:tensorflow_lite_wrap_calibration_wrapper",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12bcd6a6283ccc71f4df7758b46aec298a87bb7d
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc
@@ -0,0 +1,212 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/python/optimize/calibration_wrapper.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+
+#define TFLITE_PY_CHECK(x)               \
+  if ((x) != kTfLiteOk) {                \
+    return error_reporter_->exception(); \
+  }
+
+#define TFLITE_PY_ENSURE_VALID_INTERPRETER()                               \
+  if (!interpreter_) {                                                     \
+    PyErr_SetString(PyExc_ValueError, "Interpreter was not initialized."); \
+    return nullptr;                                                        \
+  }
+
+namespace tflite {
+namespace calibration_wrapper {
+
+namespace {
+
+using python_utils::PyDecrefDeleter;
+
+std::unique_ptr<tflite::ModelT> CreateMutableModel(const tflite::Model& model) {
+  std::unique_ptr<tflite::ModelT> copied_model =
+      absl::make_unique<tflite::ModelT>();
+  model.UnPackTo(copied_model.get(), nullptr);
+  return copied_model;
+}
+
+}  // namespace
+
+CalibrationWrapper::CalibrationWrapper(
+    std::unique_ptr<tflite::Interpreter> interpreter,
+    std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+    std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+        error_reporter,
+    std::unique_ptr<tflite::FlatBufferModel> model,
+    std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader)
+    : interpreter_(std::move(interpreter)),
+      error_reporter_(std::move(error_reporter)),
+      resolver_(std::move(resolver)),
+      model_(std::move(model)),
+      reader_(std::move(reader)) {}
+
+CalibrationWrapper::~CalibrationWrapper() {}
+
+PyObject* CalibrationWrapper::Prepare() {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+  TFLITE_PY_CHECK(interpreter_->AllocateTensors());
+  TFLITE_PY_CHECK(interpreter_->ResetVariableTensors());
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::FeedTensor(PyObject* input_value) {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+  if (!PyList_Check(input_value)) {
+    PyErr_Format(PyExc_ValueError,
+                 "Invalid input type: expected input to be a list.");
+    return nullptr;
+  }
+
+  const size_t inputs_size = PyList_Size(input_value);
+
+  if (inputs_size != interpreter_->inputs().size()) {
+    PyErr_Format(PyExc_ValueError,
+                 "Invalid input size: expected %ld items got %ld items.",
+                 interpreter_->inputs().size(), inputs_size);
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < inputs_size; i++) {
+    PyObject* input = PyList_GetItem(input_value, i);
+    if (!input) {
+      return nullptr;
+    }
+    int input_tensor_idx = interpreter_->inputs()[i];
+    if (!SetTensor(input_tensor_idx, input)) {
+      return nullptr;
+    }
+  }
+
+  TFLITE_PY_CHECK(interpreter_->Invoke());
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::SetTensor(int index, PyObject* value) {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+
+  std::unique_ptr<PyObject, PyDecrefDeleter> array_safe(
+      PyArray_FromAny(value, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr));
+  if (!array_safe) {
+    PyErr_SetString(PyExc_ValueError,
+                    "Failed to convert value into readable tensor.");
+    return nullptr;
+  }
+
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
+  const TfLiteTensor* tensor = interpreter_->tensor(index);
+
+  if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
+    PyErr_Format(PyExc_ValueError,
+                 "Cannot set tensor:"
+                 " Got tensor of type %d"
+                 " but expected type %d for input %d, name: %s ",
+                 python_utils::TfLiteTypeFromPyArray(array), tensor->type,
+                 index, tensor->name);
+    return nullptr;
+  }
+
+  if (PyArray_NDIM(array) != tensor->dims->size) {
+    PyErr_SetString(PyExc_ValueError, "Cannot set tensor: Dimension mismatch");
+    return nullptr;
+  }
+
+  for (int j = 0; j < PyArray_NDIM(array); j++) {
+    if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
+      PyErr_SetString(PyExc_ValueError,
+                      "Cannot set tensor: Dimension mismatch");
+      return nullptr;
+    }
+  }
+
+  size_t size = PyArray_NBYTES(array);
+  if (size != tensor->bytes) {
+    PyErr_Format(PyExc_ValueError,
+                 "numpy array had %zu bytes but expected %zu bytes.", size,
+                 tensor->bytes);
+    return nullptr;
+  }
+  memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::QuantizeModel() {
+  auto tflite_model = CreateMutableModel(*model_->GetModel());
+  reader_->AddCalibrationToModel(tflite_model.get());
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = tflite::optimize::QuantizeModel(&builder, tflite_model.get(),
+                                                error_reporter_.get());
+  if (status != kTfLiteOk) {
+    error_reporter_->exception();
+    return nullptr;
+  }
+
+  return python_utils::ConvertToPyString(
+      reinterpret_cast<const char*>(builder.GetCurrentBufferPointer()),
+      builder.GetSize());
+}
+
+/*static*/ CalibrationWrapper* CalibrationWrapper::CreateWrapperCPPFromBuffer(
+    PyObject* data) {
+  using tflite::interpreter_wrapper::PythonErrorReporter;
+  char* buf = nullptr;
+  Py_ssize_t length;
+  std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
+  ::tflite::python::ImportNumpy();
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
+    return nullptr;
+  }
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromBuffer(buf, length,
+                                               error_reporter.get());
+  if (!model) {
+    PyErr_Format(PyExc_ValueError, "Invalid model");
+    return nullptr;
+  }
+  auto resolver = absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader;
+  auto status = tflite::optimize::calibration::BuildLoggingInterpreter(
+      *model, *resolver, &interpreter, &reader);
+  if (status != kTfLiteOk) {
+    error_reporter->exception();
+    return nullptr;
+  }
+
+  auto wrapper = new CalibrationWrapper(
+      std::move(interpreter), std::move(resolver), std::move(error_reporter),
+      std::move(model), std::move(reader));
+  return wrapper;
+}
+
+}  // namespace calibration_wrapper
+}  // namespace tflite
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.h b/tensorflow/lite/python/optimize/calibration_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..213bc4a182d348e5a19b5c2624cca375d367aba7
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.h
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
+#define TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+// Place `<locale>` before <Python.h> to avoid build failures in macOS.
+#include <locale>
+
+// The empty line above is on purpose as otherwise clang-format will
+// automatically move <Python.h> before <locale>.
+#include <Python.h>
+
+// We forward declare TFLite classes here to avoid exposing them to SWIG.
+namespace tflite {
+namespace ops {
+namespace builtin {
+class BuiltinOpResolver;
+}  // namespace builtin
+}  // namespace ops
+
+class FlatBufferModel;
+class Interpreter;
+
+namespace interpreter_wrapper {
+class PythonErrorReporter;
+}  // namespace interpreter_wrapper
+
+namespace optimize {
+namespace calibration {
+class CalibrationReader;
+}  // namespace calibration
+}  // namespace optimize
+
+namespace calibration_wrapper {
+
+class CalibrationWrapper {
+ public:
+  // SWIG caller takes ownership of pointer.
+  static CalibrationWrapper* CreateWrapperCPPFromBuffer(PyObject* data);
+  ~CalibrationWrapper();
+
+  PyObject* Prepare();
+
+  PyObject* FeedTensor(PyObject* input_value);
+
+  PyObject* QuantizeModel();
+
+ private:
+  // CalibrationWrapper is not copyable or assignable. We avoid the use of
+  // CalibrationWrapper() = delete here for SWIG compatibility.
+  CalibrationWrapper(
+      std::unique_ptr<tflite::Interpreter> interpreter,
+      std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+      std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+          error_reporter,
+      std::unique_ptr<tflite::FlatBufferModel> model,
+      std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader);
+
+  CalibrationWrapper(const CalibrationWrapper& rhs);
+
+  PyObject* SetTensor(int index, PyObject* value);
+
+  std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+      error_reporter_;
+  std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver_;
+  std::unique_ptr<tflite::FlatBufferModel> model_;
+  std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader_;
+};
+
+}  // namespace calibration_wrapper
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.i b/tensorflow/lite/python/optimize/calibration_wrapper.i
new file mode 100644
index 0000000000000000000000000000000000000000..094ac20733abc3797d5d325b838215c2909045ba
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.i
@@ -0,0 +1,27 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "std_string.i"
+
+
+%{
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/optimize/calibration_wrapper.h"
+%}
+
+
+%include "tensorflow/lite/python/optimize/calibration_wrapper.h"
\ No newline at end of file
diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py
new file mode 100644
index 0000000000000000000000000000000000000000..940987932e5469065e01e86e18a35be81990ff5a
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibrator.py
@@ -0,0 +1,68 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrapper for post training quantization with calibration."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies. Must use double quotes to match code internal rewrite
+# rule.
+_calibration_wrapper = LazyLoader(
+    "_calibration_wrapper", globals(),
+    "tensorflow.lite.python.optimize."
+    "tensorflow_lite_wrap_calibration_wrapper")
+
+
+class Calibrator(object):
+  """Calibrates a floating point model and then quantizes it.
+
+  This is an internal class, not a public interface.
+  """
+
+  def __init__(self, model_content):
+    """Constructor.
+
+    Args:
+      model_content: Content of a TF-Lite Flatbuffer file.
+
+    Raises:
+      ValueError: If the calibrator was unable to open the model.
+    """
+    if not model_content:
+      raise ValueError("`model_content` must be specified.")
+    try:
+      self._calibrator = (_calibration_wrapper.CalibrationWrapper
+                          .CreateWrapperCPPFromBuffer(model_content))
+    except Exception as e:
+      raise ValueError("Failed to parse the model: %s." % e)
+    if not self._calibrator:
+      raise ValueError("Failed to parse the model.")
+
+  def calibrate_and_quantize(self, dataset_gen):
+    """Calibrates the model with specified generator and then quantizes it.
+
+    Returns:
+      A quantized model.
+
+    Args:
+      dataset_gen: A generator that generates calibration samples.
+    """
+    self._calibrator.Prepare()
+    for calibration_sample in dataset_gen():
+      self._calibrator.FeedTensor(calibration_sample)
+    return self._calibrator.QuantizeModel()
diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e0c1efbff3023b0386c53b8eb612bb89c2f19b
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibrator_test.py
@@ -0,0 +1,109 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Calibrator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+
+from tensorflow.lite.python.optimize import calibrator as _calibrator
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+
+
+class CalibratorTest(test_util.TensorFlowTestCase):
+
+  def test_calibration_with_quantization(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator for the model.
+    def input_gen():
+      for _ in range(10):
+        yield [np.ones(shape=(1, 5, 5, 3), dtype=np.float32)]
+
+    quantized_model = quantizer.calibrate_and_quantize(input_gen)
+    self.assertIsNotNone(quantized_model)
+
+  def test_calibration_with_quantization_multiple_inputs(self):
+    # Load multi add model from test data.
+    # This model has 4 inputs of size (1, 8, 8, 3).
+    model_path = resource_loader.get_path_to_datafile(
+        '../../testdata/multi_add.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator for the model.
+    def input_gen():
+      for _ in range(10):
+        yield [np.ones(shape=(1, 8, 8, 3), dtype=np.float32) for _ in range(4)]
+
+    quantized_model = quantizer.calibrate_and_quantize(input_gen)
+    self.assertIsNotNone(quantized_model)
+
+  def test_invalid_model_buffer(self):
+    float_model = b'\0' * 100
+    with self.assertRaisesWithRegexpMatch(ValueError,
+                                          'Failed to parse the model'):
+      _calibrator.Calibrator(float_model)
+
+  def test_empty_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    def empty_input_gen():
+      for i in ():
+        yield i
+
+    with self.assertRaises(RuntimeError):
+      quantizer.calibrate_and_quantize(empty_input_gen)
+
+  def test_invalid_shape_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator with incorrect shape.
+    def input_gen():
+      for _ in range(10):
+        yield [np.ones(shape=(1, 2, 2, 3), dtype=np.float32)]
+
+    with self.assertRaisesWithRegexpMatch(ValueError, 'Dimension mismatch'):
+      quantizer.calibrate_and_quantize(input_gen)
+
+  def test_invalid_type_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator with incorrect shape.
+    def input_gen():
+      for _ in range(10):
+        yield np.ones(shape=(1, 5, 5, 3), dtype=np.int32)
+
+    with self.assertRaises(ValueError):
+      quantizer.calibrate_and_quantize(input_gen)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin b/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a2909249ffd2675fad9c0cd60a6ff75f940b3fb0
Binary files /dev/null and b/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin differ
diff --git a/tensorflow/lite/python/testdata/BUILD b/tensorflow/lite/python/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2fa08e5326990ecda1857fec8eb9caadac1f4102
--- /dev/null
+++ b/tensorflow/lite/python/testdata/BUILD
@@ -0,0 +1,53 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/lite:build_def.bzl", "tf_to_tflite")
+
+exports_files(glob(["*.pb"]))
+
+tf_to_tflite(
+    name = "permute_float",
+    src = "permute.pbtxt",
+    out = "permute_float.tflite",
+    options = [
+        "--input_arrays=input",
+        "--output_arrays=output",
+    ],
+)
+
+tf_to_tflite(
+    name = "permute_uint8",
+    src = "permute.pbtxt",
+    out = "permute_uint8.tflite",
+    options = [
+        "--input_arrays=input",
+        "--output_arrays=output",
+        "--inference_type=QUANTIZED_UINT8",
+        "--std_values=1",
+        "--mean_values=0",
+        "--default_ranges_min=0",
+        "--default_ranges_max=255",
+    ],
+)
+
+tf_to_tflite(
+    name = "gather_string",
+    src = "gather.pbtxt",
+    out = "gather_string.tflite",
+    options = [
+        "--input_arrays=input,indices",
+        "--output_arrays=output",
+    ],
+)
+
+filegroup(
+    name = "interpreter_test_data",
+    srcs = [
+        ":gather_string",
+        ":permute_float",
+        ":permute_uint8",
+    ],
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/lite/python/testdata/gather.pbtxt b/tensorflow/lite/python/testdata/gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b1193c475d3b4b663accf036753bfbe9d8adb7d
--- /dev/null
+++ b/tensorflow/lite/python/testdata/gather.pbtxt
@@ -0,0 +1,93 @@
+node {
+  name: "input"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+}
+node {
+  name: "indices"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+      }
+    }
+  }
+}
+node {
+  name: "axis"
+  op: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "output"
+  op: "GatherV2"
+  input: "input"
+  input: "indices"
+  input: "axis"
+  device: "/device:CPU:0"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/lite/python/testdata/permute.pbtxt b/tensorflow/lite/python/testdata/permute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..482b5c64828f4f5ef2057b4552a81425485d0841
--- /dev/null
+++ b/tensorflow/lite/python/testdata/permute.pbtxt
@@ -0,0 +1,98 @@
+node {
+  name: "input"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 4
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 4
+          }
+        }
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 1.0
+
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 1.0
+        float_val: 0.0
+
+        float_val: 0.0
+        float_val: 1.0
+        float_val: 0.0
+        float_val: 0.0
+
+        float_val: 1.0
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "output"
+  op: "MatMul"
+  input: "input"
+  input: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 401a592273c9c76f1f371bb8972f7f9a3d494278..ad4b04c4f3e3aefee69085856f62868ba667e5e7 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -25,6 +25,7 @@ import sys
 from tensorflow.lite.python import lite
 from tensorflow.lite.python import lite_constants
 from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.python import tf2
 from tensorflow.python.platform import app
 
 
@@ -258,6 +259,11 @@ def _check_flags(flags, unparsed):
 
 def run_main(_):
   """Main in toco_convert.py."""
+  if tf2.enabled():
+    raise ValueError("tflite_convert is currently unsupported in 2.0. "
+                     "Please use the Python API "
+                     "tf.lite.TFLiteConverter.from_concrete_function().")
+
   parser = argparse.ArgumentParser(
       description=("Command line tool to run TensorFlow Lite Optimizing "
                    "Converter (TOCO)."))
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index ea516764c929080bc42e48a7cfcdd171f2d6cc57..e55419186e16f62f27f9df0201e814cb8936fc27 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -9,6 +9,12 @@ load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 py_binary(
     name = "upgrade_schema",
+    srcs = ["upgrade_schema.py"],
+    deps = [":upgrade_schema_main_lib"],
+)
+
+py_library(
+    name = "upgrade_schema_main_lib",
     srcs = [
         "upgrade_schema.py",
     ],
@@ -39,7 +45,7 @@ py_test(
         "notap",
     ],
     deps = [
-        ":upgrade_schema",
+        ":upgrade_schema_main_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 13f47d2cfcc8bae23ffc34183adde5e8770cc8e7..c6c61a602a8191e2d9c611338c1c5cf4b7cd814d 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -217,6 +217,15 @@ enum BuiltinOperator : byte {
   ABS = 101,
   SPLIT_V = 102,
   UNIQUE = 103,
+  CEIL = 104,
+  REVERSE_V2 = 105,
+  ADD_N = 106,
+  GATHER_ND = 107,
+  COS = 108,
+  WHERE = 109,
+  RANK = 110,
+  ELU = 111,
+  REVERSE_SEQUENCE = 112,
 }
 
 // Options for the builtin operators.
@@ -301,6 +310,13 @@ union BuiltinOptions {
   AbsOptions,
   SplitVOptions,
   UniqueOptions,
+  ReverseV2Options,
+  AddNOptions,
+  GatherNdOptions,
+  CosOptions,
+  WhereOptions,
+  RankOptions,
+  ReverseSequenceOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -544,6 +560,9 @@ table TransposeOptions {
 table ExpOptions {
 }
 
+table CosOptions {
+}
+
 table ReducerOptions {
   keep_dims: bool;
 }
@@ -638,6 +657,9 @@ table ShapeOptions {
   out_type : TensorType;
 }
 
+table RankOptions {
+}
+
 table PowOptions {
 }
 
@@ -718,6 +740,22 @@ table UniqueOptions {
   idx_out_type:TensorType = INT32;
 }
 
+table ReverseV2Options {
+}
+
+table AddNOptions {
+}
+
+table GatherNdOptions {
+}
+
+table WhereOptions {
+}
+
+table ReverseSequenceOptions {
+  seq_dim:int;
+  batch_dim:int = 0;
+}
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index a1174e79f722dc2e6dca59a5b2071b8d461378f1..2a55698a616a52af50e19f9ae0ef14e70e14930e 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -139,6 +139,9 @@ struct TransposeOptionsT;
 struct ExpOptions;
 struct ExpOptionsT;
 
+struct CosOptions;
+struct CosOptionsT;
+
 struct ReducerOptions;
 struct ReducerOptionsT;
 
@@ -214,6 +217,9 @@ struct NotEqualOptionsT;
 struct ShapeOptions;
 struct ShapeOptionsT;
 
+struct RankOptions;
+struct RankOptionsT;
+
 struct PowOptions;
 struct PowOptionsT;
 
@@ -271,6 +277,21 @@ struct MirrorPadOptionsT;
 struct UniqueOptions;
 struct UniqueOptionsT;
 
+struct ReverseV2Options;
+struct ReverseV2OptionsT;
+
+struct AddNOptions;
+struct AddNOptionsT;
+
+struct GatherNdOptions;
+struct GatherNdOptionsT;
+
+struct WhereOptions;
+struct WhereOptionsT;
+
+struct ReverseSequenceOptions;
+struct ReverseSequenceOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -524,11 +545,20 @@ enum BuiltinOperator {
   BuiltinOperator_ABS = 101,
   BuiltinOperator_SPLIT_V = 102,
   BuiltinOperator_UNIQUE = 103,
+  BuiltinOperator_CEIL = 104,
+  BuiltinOperator_REVERSE_V2 = 105,
+  BuiltinOperator_ADD_N = 106,
+  BuiltinOperator_GATHER_ND = 107,
+  BuiltinOperator_COS = 108,
+  BuiltinOperator_WHERE = 109,
+  BuiltinOperator_RANK = 110,
+  BuiltinOperator_ELU = 111,
+  BuiltinOperator_REVERSE_SEQUENCE = 112,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_UNIQUE
+  BuiltinOperator_MAX = BuiltinOperator_REVERSE_SEQUENCE
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[103] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[112] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -632,7 +662,16 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[103] {
     BuiltinOperator_MIRROR_PAD,
     BuiltinOperator_ABS,
     BuiltinOperator_SPLIT_V,
-    BuiltinOperator_UNIQUE
+    BuiltinOperator_UNIQUE,
+    BuiltinOperator_CEIL,
+    BuiltinOperator_REVERSE_V2,
+    BuiltinOperator_ADD_N,
+    BuiltinOperator_GATHER_ND,
+    BuiltinOperator_COS,
+    BuiltinOperator_WHERE,
+    BuiltinOperator_RANK,
+    BuiltinOperator_ELU,
+    BuiltinOperator_REVERSE_SEQUENCE
   };
   return values;
 }
@@ -743,6 +782,15 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "ABS",
     "SPLIT_V",
     "UNIQUE",
+    "CEIL",
+    "REVERSE_V2",
+    "ADD_N",
+    "GATHER_ND",
+    "COS",
+    "WHERE",
+    "RANK",
+    "ELU",
+    "REVERSE_SEQUENCE",
     nullptr
   };
   return names;
@@ -835,11 +883,18 @@ enum BuiltinOptions {
   BuiltinOptions_AbsOptions = 78,
   BuiltinOptions_SplitVOptions = 79,
   BuiltinOptions_UniqueOptions = 80,
+  BuiltinOptions_ReverseV2Options = 81,
+  BuiltinOptions_AddNOptions = 82,
+  BuiltinOptions_GatherNdOptions = 83,
+  BuiltinOptions_CosOptions = 84,
+  BuiltinOptions_WhereOptions = 85,
+  BuiltinOptions_RankOptions = 86,
+  BuiltinOptions_ReverseSequenceOptions = 87,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_UniqueOptions
+  BuiltinOptions_MAX = BuiltinOptions_ReverseSequenceOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[81] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[88] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -921,7 +976,14 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[81] {
     BuiltinOptions_MirrorPadOptions,
     BuiltinOptions_AbsOptions,
     BuiltinOptions_SplitVOptions,
-    BuiltinOptions_UniqueOptions
+    BuiltinOptions_UniqueOptions,
+    BuiltinOptions_ReverseV2Options,
+    BuiltinOptions_AddNOptions,
+    BuiltinOptions_GatherNdOptions,
+    BuiltinOptions_CosOptions,
+    BuiltinOptions_WhereOptions,
+    BuiltinOptions_RankOptions,
+    BuiltinOptions_ReverseSequenceOptions
   };
   return values;
 }
@@ -1009,6 +1071,13 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "AbsOptions",
     "SplitVOptions",
     "UniqueOptions",
+    "ReverseV2Options",
+    "AddNOptions",
+    "GatherNdOptions",
+    "CosOptions",
+    "WhereOptions",
+    "RankOptions",
+    "ReverseSequenceOptions",
     nullptr
   };
   return names;
@@ -1343,6 +1412,34 @@ template<> struct BuiltinOptionsTraits<UniqueOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions;
 };
 
+template<> struct BuiltinOptionsTraits<ReverseV2Options> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReverseV2Options;
+};
+
+template<> struct BuiltinOptionsTraits<AddNOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AddNOptions;
+};
+
+template<> struct BuiltinOptionsTraits<GatherNdOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GatherNdOptions;
+};
+
+template<> struct BuiltinOptionsTraits<CosOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_CosOptions;
+};
+
+template<> struct BuiltinOptionsTraits<WhereOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_WhereOptions;
+};
+
+template<> struct BuiltinOptionsTraits<RankOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RankOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ReverseSequenceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReverseSequenceOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -2014,6 +2111,62 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_UniqueOptions ?
       reinterpret_cast<const UniqueOptionsT *>(value) : nullptr;
   }
+  ReverseV2OptionsT *AsReverseV2Options() {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<ReverseV2OptionsT *>(value) : nullptr;
+  }
+  const ReverseV2OptionsT *AsReverseV2Options() const {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<const ReverseV2OptionsT *>(value) : nullptr;
+  }
+  AddNOptionsT *AsAddNOptions() {
+    return type == BuiltinOptions_AddNOptions ?
+      reinterpret_cast<AddNOptionsT *>(value) : nullptr;
+  }
+  const AddNOptionsT *AsAddNOptions() const {
+    return type == BuiltinOptions_AddNOptions ?
+      reinterpret_cast<const AddNOptionsT *>(value) : nullptr;
+  }
+  GatherNdOptionsT *AsGatherNdOptions() {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<GatherNdOptionsT *>(value) : nullptr;
+  }
+  const GatherNdOptionsT *AsGatherNdOptions() const {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<const GatherNdOptionsT *>(value) : nullptr;
+  }
+  CosOptionsT *AsCosOptions() {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<CosOptionsT *>(value) : nullptr;
+  }
+  const CosOptionsT *AsCosOptions() const {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<const CosOptionsT *>(value) : nullptr;
+  }
+  WhereOptionsT *AsWhereOptions() {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<WhereOptionsT *>(value) : nullptr;
+  }
+  const WhereOptionsT *AsWhereOptions() const {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<const WhereOptionsT *>(value) : nullptr;
+  }
+  RankOptionsT *AsRankOptions() {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<RankOptionsT *>(value) : nullptr;
+  }
+  const RankOptionsT *AsRankOptions() const {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<const RankOptionsT *>(value) : nullptr;
+  }
+  ReverseSequenceOptionsT *AsReverseSequenceOptions() {
+    return type == BuiltinOptions_ReverseSequenceOptions ?
+      reinterpret_cast<ReverseSequenceOptionsT *>(value) : nullptr;
+  }
+  const ReverseSequenceOptionsT *AsReverseSequenceOptions() const {
+    return type == BuiltinOptions_ReverseSequenceOptions ?
+      reinterpret_cast<const ReverseSequenceOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -4946,6 +5099,46 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(
 
 flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct CosOptionsT : public flatbuffers::NativeTable {
+  typedef CosOptions TableType;
+  CosOptionsT() {
+  }
+};
+
+struct CosOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CosOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  CosOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CosOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CosOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit CosOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CosOptionsBuilder &operator=(const CosOptionsBuilder &);
+  flatbuffers::Offset<CosOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CosOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  CosOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ReducerOptionsT : public flatbuffers::NativeTable {
   typedef ReducerOptions TableType;
   bool keep_dims;
@@ -6192,6 +6385,46 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(
 
 flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct RankOptionsT : public flatbuffers::NativeTable {
+  typedef RankOptions TableType;
+  RankOptionsT() {
+  }
+};
+
+struct RankOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef RankOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  RankOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RankOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct RankOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit RankOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  RankOptionsBuilder &operator=(const RankOptionsBuilder &);
+  flatbuffers::Offset<RankOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<RankOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  RankOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct PowOptionsT : public flatbuffers::NativeTable {
   typedef PowOptions TableType;
   PowOptionsT() {
@@ -7110,6 +7343,232 @@ inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(
 
 flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct ReverseV2OptionsT : public flatbuffers::NativeTable {
+  typedef ReverseV2Options TableType;
+  ReverseV2OptionsT() {
+  }
+};
+
+struct ReverseV2Options FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ReverseV2OptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  ReverseV2OptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReverseV2Options> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReverseV2OptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit ReverseV2OptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ReverseV2OptionsBuilder &operator=(const ReverseV2OptionsBuilder &);
+  flatbuffers::Offset<ReverseV2Options> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ReverseV2Options>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  ReverseV2OptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct AddNOptionsT : public flatbuffers::NativeTable {
+  typedef AddNOptions TableType;
+  AddNOptionsT() {
+  }
+};
+
+struct AddNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef AddNOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  AddNOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(AddNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AddNOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct AddNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit AddNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  AddNOptionsBuilder &operator=(const AddNOptionsBuilder &);
+  flatbuffers::Offset<AddNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<AddNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  AddNOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct GatherNdOptionsT : public flatbuffers::NativeTable {
+  typedef GatherNdOptions TableType;
+  GatherNdOptionsT() {
+  }
+};
+
+struct GatherNdOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GatherNdOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  GatherNdOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GatherNdOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GatherNdOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit GatherNdOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GatherNdOptionsBuilder &operator=(const GatherNdOptionsBuilder &);
+  flatbuffers::Offset<GatherNdOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GatherNdOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  GatherNdOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct WhereOptionsT : public flatbuffers::NativeTable {
+  typedef WhereOptions TableType;
+  WhereOptionsT() {
+  }
+};
+
+struct WhereOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef WhereOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  WhereOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<WhereOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct WhereOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit WhereOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  WhereOptionsBuilder &operator=(const WhereOptionsBuilder &);
+  flatbuffers::Offset<WhereOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<WhereOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  WhereOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ReverseSequenceOptionsT : public flatbuffers::NativeTable {
+  typedef ReverseSequenceOptions TableType;
+  int32_t seq_dim;
+  int32_t batch_dim;
+  ReverseSequenceOptionsT()
+      : seq_dim(0),
+        batch_dim(0) {
+  }
+};
+
+struct ReverseSequenceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ReverseSequenceOptionsT NativeTableType;
+  enum {
+    VT_SEQ_DIM = 4,
+    VT_BATCH_DIM = 6
+  };
+  int32_t seq_dim() const {
+    return GetField<int32_t>(VT_SEQ_DIM, 0);
+  }
+  int32_t batch_dim() const {
+    return GetField<int32_t>(VT_BATCH_DIM, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_SEQ_DIM) &&
+           VerifyField<int32_t>(verifier, VT_BATCH_DIM) &&
+           verifier.EndTable();
+  }
+  ReverseSequenceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ReverseSequenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReverseSequenceOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReverseSequenceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_seq_dim(int32_t seq_dim) {
+    fbb_.AddElement<int32_t>(ReverseSequenceOptions::VT_SEQ_DIM, seq_dim, 0);
+  }
+  void add_batch_dim(int32_t batch_dim) {
+    fbb_.AddElement<int32_t>(ReverseSequenceOptions::VT_BATCH_DIM, batch_dim, 0);
+  }
+  explicit ReverseSequenceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ReverseSequenceOptionsBuilder &operator=(const ReverseSequenceOptionsBuilder &);
+  flatbuffers::Offset<ReverseSequenceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ReverseSequenceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ReverseSequenceOptions> CreateReverseSequenceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t seq_dim = 0,
+    int32_t batch_dim = 0) {
+  ReverseSequenceOptionsBuilder builder_(_fbb);
+  builder_.add_batch_dim(batch_dim);
+  builder_.add_seq_dim(seq_dim);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ReverseSequenceOptions> CreateReverseSequenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -7483,6 +7942,27 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const UniqueOptions *builtin_options_as_UniqueOptions() const {
     return builtin_options_type() == BuiltinOptions_UniqueOptions ? static_cast<const UniqueOptions *>(builtin_options()) : nullptr;
   }
+  const ReverseV2Options *builtin_options_as_ReverseV2Options() const {
+    return builtin_options_type() == BuiltinOptions_ReverseV2Options ? static_cast<const ReverseV2Options *>(builtin_options()) : nullptr;
+  }
+  const AddNOptions *builtin_options_as_AddNOptions() const {
+    return builtin_options_type() == BuiltinOptions_AddNOptions ? static_cast<const AddNOptions *>(builtin_options()) : nullptr;
+  }
+  const GatherNdOptions *builtin_options_as_GatherNdOptions() const {
+    return builtin_options_type() == BuiltinOptions_GatherNdOptions ? static_cast<const GatherNdOptions *>(builtin_options()) : nullptr;
+  }
+  const CosOptions *builtin_options_as_CosOptions() const {
+    return builtin_options_type() == BuiltinOptions_CosOptions ? static_cast<const CosOptions *>(builtin_options()) : nullptr;
+  }
+  const WhereOptions *builtin_options_as_WhereOptions() const {
+    return builtin_options_type() == BuiltinOptions_WhereOptions ? static_cast<const WhereOptions *>(builtin_options()) : nullptr;
+  }
+  const RankOptions *builtin_options_as_RankOptions() const {
+    return builtin_options_type() == BuiltinOptions_RankOptions ? static_cast<const RankOptions *>(builtin_options()) : nullptr;
+  }
+  const ReverseSequenceOptions *builtin_options_as_ReverseSequenceOptions() const {
+    return builtin_options_type() == BuiltinOptions_ReverseSequenceOptions ? static_cast<const ReverseSequenceOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7834,6 +8314,34 @@ template<> inline const UniqueOptions *Operator::builtin_options_as<UniqueOption
   return builtin_options_as_UniqueOptions();
 }
 
+template<> inline const ReverseV2Options *Operator::builtin_options_as<ReverseV2Options>() const {
+  return builtin_options_as_ReverseV2Options();
+}
+
+template<> inline const AddNOptions *Operator::builtin_options_as<AddNOptions>() const {
+  return builtin_options_as_AddNOptions();
+}
+
+template<> inline const GatherNdOptions *Operator::builtin_options_as<GatherNdOptions>() const {
+  return builtin_options_as_GatherNdOptions();
+}
+
+template<> inline const CosOptions *Operator::builtin_options_as<CosOptions>() const {
+  return builtin_options_as_CosOptions();
+}
+
+template<> inline const WhereOptions *Operator::builtin_options_as<WhereOptions>() const {
+  return builtin_options_as_WhereOptions();
+}
+
+template<> inline const RankOptions *Operator::builtin_options_as<RankOptions>() const {
+  return builtin_options_as_RankOptions();
+}
+
+template<> inline const ReverseSequenceOptions *Operator::builtin_options_as<ReverseSequenceOptions>() const {
+  return builtin_options_as_ReverseSequenceOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -9379,6 +9887,29 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferB
       _fbb);
 }
 
+inline CosOptionsT *CosOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CosOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CosOptions::UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<CosOptions> CosOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCosOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CosOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateCosOptions(
+      _fbb);
+}
+
 inline ReducerOptionsT *ReducerOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ReducerOptionsT();
   UnPackTo(_o, _resolver);
@@ -10008,6 +10539,29 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBuf
       _out_type);
 }
 
+inline RankOptionsT *RankOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RankOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void RankOptions::UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<RankOptions> RankOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRankOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RankOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateRankOptions(
+      _fbb);
+}
+
 inline PowOptionsT *PowOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new PowOptionsT();
   UnPackTo(_o, _resolver);
@@ -10481,6 +11035,127 @@ inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatB
       _idx_out_type);
 }
 
+inline ReverseV2OptionsT *ReverseV2Options::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReverseV2OptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReverseV2Options::UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<ReverseV2Options> ReverseV2Options::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReverseV2Options(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReverseV2OptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateReverseV2Options(
+      _fbb);
+}
+
+inline AddNOptionsT *AddNOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AddNOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void AddNOptions::UnPackTo(AddNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<AddNOptions> AddNOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAddNOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AddNOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateAddNOptions(
+      _fbb);
+}
+
+inline GatherNdOptionsT *GatherNdOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GatherNdOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GatherNdOptions::UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<GatherNdOptions> GatherNdOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGatherNdOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GatherNdOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateGatherNdOptions(
+      _fbb);
+}
+
+inline WhereOptionsT *WhereOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new WhereOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void WhereOptions::UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<WhereOptions> WhereOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateWhereOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const WhereOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateWhereOptions(
+      _fbb);
+}
+
+inline ReverseSequenceOptionsT *ReverseSequenceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReverseSequenceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReverseSequenceOptions::UnPackTo(ReverseSequenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = seq_dim(); _o->seq_dim = _e; };
+  { auto _e = batch_dim(); _o->batch_dim = _e; };
+}
+
+inline flatbuffers::Offset<ReverseSequenceOptions> ReverseSequenceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReverseSequenceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReverseSequenceOptions> CreateReverseSequenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReverseSequenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReverseSequenceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _seq_dim = _o->seq_dim;
+  auto _batch_dim = _o->batch_dim;
+  return tflite::CreateReverseSequenceOptions(
+      _fbb,
+      _seq_dim,
+      _batch_dim);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -11059,6 +11734,34 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ReverseSequenceOptions: {
+      auto ptr = reinterpret_cast<const ReverseSequenceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -11397,6 +12100,34 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ReverseSequenceOptions: {
+      auto ptr = reinterpret_cast<const ReverseSequenceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -11723,6 +12454,34 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const UniqueOptionsT *>(value);
       return CreateUniqueOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2OptionsT *>(value);
+      return CreateReverseV2Options(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptionsT *>(value);
+      return CreateAddNOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptionsT *>(value);
+      return CreateGatherNdOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptionsT *>(value);
+      return CreateCosOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptionsT *>(value);
+      return CreateWhereOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptionsT *>(value);
+      return CreateRankOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ReverseSequenceOptions: {
+      auto ptr = reinterpret_cast<const ReverseSequenceOptionsT *>(value);
+      return CreateReverseSequenceOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -12049,6 +12808,34 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new UniqueOptionsT(*reinterpret_cast<UniqueOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_ReverseV2Options: {
+      value = new ReverseV2OptionsT(*reinterpret_cast<ReverseV2OptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_AddNOptions: {
+      value = new AddNOptionsT(*reinterpret_cast<AddNOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      value = new GatherNdOptionsT(*reinterpret_cast<GatherNdOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      value = new CosOptionsT(*reinterpret_cast<CosOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      value = new WhereOptionsT(*reinterpret_cast<WhereOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      value = new RankOptionsT(*reinterpret_cast<RankOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ReverseSequenceOptions: {
+      value = new ReverseSequenceOptionsT(*reinterpret_cast<ReverseSequenceOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -12456,6 +13243,41 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<ReverseV2OptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<AddNOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<GatherNdOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<CosOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<WhereOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<RankOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ReverseSequenceOptions: {
+      auto ptr = reinterpret_cast<ReverseSequenceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/stderr_reporter.cc b/tensorflow/lite/stderr_reporter.cc
index 09eb1d254a608ba2d19c824a323f0b5173afe15f..366a1816ef2b2ef62e093bbe99690eae52fdc8c4 100644
--- a/tensorflow/lite/stderr_reporter.cc
+++ b/tensorflow/lite/stderr_reporter.cc
@@ -13,28 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/stderr_reporter.h"
-#include <cstdarg>
-#include <cstdio>
 
-#ifdef __ANDROID__
-#include <android/log.h>
-#endif
+#include "tensorflow/lite/minimal_logging.h"
 
 namespace tflite {
 
 int StderrReporter::Report(const char* format, va_list args) {
-#ifdef __ANDROID__
-  // On Android stderr is not captured for applications, only for code run from
-  // the shell. Rather than assume all users will set up a custom error
-  // reporter, let's output to logcat here
-  va_list args_for_log;
-  va_copy(args_for_log, args);
-  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
-  va_end(args_for_log);
-#endif
-  const int result = vfprintf(stderr, format, args);
-  fputc('\n', stderr);
-  return result;
+  logging_internal::MinimalLogger::VLog(TFLITE_LOG_ERROR, format, args);
+  return 0;
 }
 
 ErrorReporter* DefaultErrorReporter() {
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h
index adb67c6d4e748ab98e71e6519bddb237e985ebfb..4b56bcaca0157b715704a57560fa2394c6fa3be1 100644
--- a/tensorflow/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 //   buf.AddString("AB", 2);
 //   # Write content of DynamicBuffer to tensor in format of string tensor
 //   # described above.
-//   buf.WriteToTensor(tensor)
+//   buf.WriteToTensor(tensor, nullptr)
 
 #ifndef TENSORFLOW_LITE_STRING_UTIL_H_
 #define TENSORFLOW_LITE_STRING_UTIL_H_
@@ -83,10 +83,6 @@ class DynamicBuffer {
   // Fill content into a string tensor. Set shape to {num_strings}.
   void WriteToTensorAsVector(TfLiteTensor* tensor);
 
-  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
-  // TODO(b/120230709): remove when people migrate away.
-  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
-
  private:
   // Data buffer to store contents of strings, not including headers.
   std::vector<char> data_;
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
index cbf1d7b226af20251d5f70a354a21f1eb40ae1c6..6fc7de90ea534f9c8c4f61b4607ff7d2d8647d00 100644
--- a/tensorflow/lite/string_util_test.cc
+++ b/tensorflow/lite/string_util_test.cc
@@ -35,8 +35,11 @@ TEST(StringUtil, TestStringUtil) {
 
   char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
 
-  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
-                                          15);
+  TfLiteQuantization quant;
+  quant.type = kTfLiteNoQuantization;
+  quant.params = nullptr;
+  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, quant,
+                                          data, 15);
   TfLiteTensor* t2 = interpreter.tensor(2);
   interpreter.AllocateTensors();
 
diff --git a/tensorflow/lite/testdata/test_input.csv b/tensorflow/lite/testdata/test_input.csv
new file mode 100644
index 0000000000000000000000000000000000000000..33894d3063f35a885fb34c3c5b85bb6a4d8e711e
--- /dev/null
+++ b/tensorflow/lite/testdata/test_input.csv
@@ -0,0 +1 @@
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
\ No newline at end of file
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index ce85a393851898fc6eb27ffbf37b3e7cbb44e2d7..68512b952a9a34c67452b676db97534b1fb3c733 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -10,6 +10,7 @@ load(
     "generated_test_models_all",
 )
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
@@ -78,6 +79,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":generate_examples_report",
+        ":string_util_wrapper",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:graph_util",
         "//third_party/py/numpy",
@@ -159,6 +161,7 @@ cc_library(
     srcs = ["tflite_driver.cc"],
     hdrs = ["tflite_driver.h"],
     deps = [
+        ":join",
         ":split",
         ":test_runner",
         "//tensorflow/lite:builtin_op_data",
@@ -237,7 +240,7 @@ tf_cc_binary(
     deps = [
         ":parse_testdata_lib",
         ":tflite_driver",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
@@ -391,4 +394,29 @@ tf_cc_binary(
     ],
 )
 
+cc_library(
+    name = "string_util_lib",
+    srcs = ["string_util.cc"],
+    hdrs = ["string_util.h"],
+    deps = [
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "string_util_wrapper",
+    srcs = [
+        "string_util.i",
+    ],
+    deps = [
+        ":string_util_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index 12b5a8b210137ce19e1321042293b8ac6375be37..213d214c132948910eec923dc18b60703a8ca571 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -36,6 +36,7 @@ import operator
 import os
 import random
 import re
+import string
 import sys
 import tempfile
 import traceback
@@ -52,6 +53,8 @@ import tensorflow as tf
 from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
 from tensorflow.lite.testing import generate_examples_report as report_lib
+from tensorflow.lite.testing import string_util_wrapper
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.ops import rnn
 
@@ -163,6 +166,16 @@ def toco_options(data_types,
   return s
 
 
+def format_result(t):
+  """Convert a tensor to a format that can be used in test specs."""
+  if t.dtype.kind not in [np.dtype(np.string_).kind, np.dtype(np.object_).kind]:
+    # Output 9 digits after the point to ensure the precision is good enough.
+    values = ["{:.9f}".format(value) for value in list(t.flatten())]
+    return ",".join(values)
+  else:
+    return string_util_wrapper.SerializeAsHexString(t.flatten())
+
+
 def write_examples(fp, examples):
   """Given a list `examples`, write a text format representation.
 
@@ -179,9 +192,7 @@ def write_examples(fp, examples):
     """Write tensor in file format supported by TFLITE example."""
     fp.write("dtype,%s\n" % x.dtype)
     fp.write("shape," + ",".join(map(str, x.shape)) + "\n")
-    # Output 9 digits after the point to ensure the precision is good enough.
-    values = ["{:.9f}".format(value) for value in list(x.flatten())]
-    fp.write("values," + ",".join(values) + "\n")
+    fp.write("values," + format_result(x) + "\n")
 
   fp.write("test_cases,%d\n" % len(examples))
   for example in examples:
@@ -214,11 +225,9 @@ def write_test_cases(fp, model_name, examples):
     fp.write("invoke {\n")
 
     for t in example["inputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  input: \"" + ",".join(values) + "\"\n")
+      fp.write("  input: \"" + format_result(t) + "\"\n")
     for t in example["outputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  output: \"" + ",".join(values) + "\"\n")
+      fp.write("  output: \"" + format_result(t) + "\"\n")
     fp.write("}\n")
 
 
@@ -230,6 +239,7 @@ _TF_TYPE_INFO = {
     tf.int16: (np.int16, "QUANTIZED_INT16"),
     tf.int64: (np.int64, "INT64"),
     tf.bool: (np.bool, "BOOL"),
+    tf.string: (np.string_, "STRING"),
 }
 
 
@@ -245,6 +255,10 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
     value = np.random.randint(min_value, max_value+1, shape)
   elif dtype == tf.bool:
     value = np.random.choice([True, False], size=shape)
+  elif dtype == np.string_:
+    # Not the best strings, but they will do for some basic testing.
+    letters = list(string.ascii_uppercase)
+    return np.random.choice(letters, size=shape).astype(dtype)
   return np.dtype(dtype).type(value) if np.isscalar(value) else value.astype(
       dtype)
 
@@ -300,8 +314,13 @@ def make_control_dep_tests(zip_path):
 
   extra_toco_options = ExtraTocoOptions()
   extra_toco_options.drop_control_dependency = True
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    extra_toco_options)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      extra_toco_options,
+      expected_tf_failures=3)
 
 
 def toco_convert(graph_def_str, input_tensors, output_tensors,
@@ -369,7 +388,7 @@ def make_zip_of_tests(zip_path,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
                       use_frozen_graph=False,
-                      expected_tf_success=None):
+                      expected_tf_failures=0):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -389,8 +408,9 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
-    expected_tf_success: Number of times tensorflow is supposed to succeed in
-      executing the input graphs. `None` means "unknown".
+    expected_tf_failures: Number of times tensorflow is expected to fail in
+      executing the input graphs. In some cases it is OK for TensorFlow to
+      fail because the one or more combination of parameters is invalid.
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -472,9 +492,8 @@ def make_zip_of_tests(zip_path,
         report["toco"] = report_lib.FAILED
         report["tf"] = report_lib.SUCCESS
         # Convert graph to toco
-        input_tensors = [(input_tensor.name.split(":")[0],
-                          input_tensor.get_shape(), input_tensor.dtype)
-                         for input_tensor in inputs]
+        input_tensors = [(input_tensor.name.split(":")[0], input_tensor.shape,
+                          input_tensor.dtype) for input_tensor in inputs]
         output_tensors = [normalize_output_name(out.name) for out in outputs]
         graph_def = freeze_graph(
             sess,
@@ -485,6 +504,10 @@ def make_zip_of_tests(zip_path,
           extra_toco_options.split_tflite_lstm_inputs = param_dict_real[
               "split_tflite_lstm_inputs"]
 
+        # Convert ophint ops if presented.
+        graph_def = tf.lite.experimental.convert_op_hints_to_stubs(
+            graph_def=graph_def)
+        graph_def = tf.graph_util.remove_training_nodes(graph_def)
         tflite_model_binary, toco_log = toco_convert(
             graph_def.SerializeToString(), input_tensors, output_tensors,
             extra_toco_options)
@@ -551,10 +574,17 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
-  if expected_tf_success is not None and tf_success != expected_tf_success:
-    raise RuntimeError(
-        "Expected TF to succeed %d times, but that happened %d times" %
-        (expected_tf_success, tf_success))
+  tf_failures = parameter_count - tf_success
+
+  if tf_failures / parameter_count > 0.8:
+    raise RuntimeError(("Test for '%s' is not very useful. "
+                        "TensorFlow fails in %d percent of the cases.") %
+                       (zip_path, int(100 * tf_failures / parameter_count)))
+
+  if tf_failures != expected_tf_failures:
+    raise RuntimeError(("Expected TF to fail %d times while generating '%s', "
+                        "but that happened %d times") % (expected_tf_failures,
+                                                         zip_path, tf_failures))
 
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
@@ -573,11 +603,12 @@ def make_pool_tests(pool_op_in):
 
   pool_op = pool_op_in
 
-  def f(zip_path):
+  def f(zip_path, expected_tf_failures=0):
     """Actual function that generates examples.
 
     Args:
       zip_path: path to write zip to.
+      expected_tf_failures: number of expected tensorflow failures.
     """
 
     # Chose a set of parameters
@@ -606,20 +637,26 @@ def make_pool_tests(pool_op_in):
       return [input_values], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
+
   return f
 
 
 def make_l2_pool_tests(zip_path):
-  make_pool_tests(make_l2_pool)(zip_path)
+  make_pool_tests(make_l2_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_avg_pool_tests(zip_path):
-  make_pool_tests(tf.nn.avg_pool)(zip_path)
+  make_pool_tests(tf.nn.avg_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_max_pool_tests(zip_path):
-  make_pool_tests(tf.nn.max_pool)(zip_path)
+  make_pool_tests(tf.nn.max_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_abs_tests(zip_path):
@@ -645,6 +682,32 @@ def make_abs_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+def make_elu_tests(zip_path):
+  """Make a set of tests to do (float) tf.nn.elu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                          [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.elu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
@@ -814,6 +877,9 @@ def make_constant_tests(zip_path):
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
       "constant_is_also_output": [True, False],
+      # This is a regression test for a bug where Toco rejects models with
+      # unread inputs.
+      "has_unread_input": [True, False],
   }]
 
   def build_graph(parameters):
@@ -823,22 +889,28 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    out = [tf.maximum(dummy_input, constant)]
+    outputs = [tf.maximum(dummy_input, constant)]
     if parameters["constant_is_also_output"]:
-      out.append(constant)
+      outputs.append(constant)
+    inputs = [dummy_input]
+    if parameters["has_unread_input"]:
+      unread_input = tf.placeholder(
+          dtype=parameters["dtype"],
+          name="unread_input",
+          shape=parameters["input_shape"])
+      inputs.append(unread_input)
 
-    return [dummy_input], out
+    return inputs, outputs
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=20)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_binary_op_tests(zip_path, binary_operator):
+def make_binary_op_tests(zip_path, binary_operator, expected_tf_failures=0):
   """Make a set of tests to do binary ops with and without broadcast."""
 
   test_parameters = [
@@ -908,7 +980,12 @@ def make_binary_op_tests(zip_path, binary_operator):
             inputs[1]: input2
         })
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_reduce_tests(reduce_op,
@@ -1074,6 +1151,34 @@ def make_exp_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_cos_tests(zip_path):
+  """Make a set of tests to do cos."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the cos op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    out = tf.cos(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"],
+                           min_value=-np.pi, max_value=np.pi)
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_log_softmax_tests(zip_path):
   """Make a set of tests to do log_softmax."""
 
@@ -1137,7 +1242,12 @@ def make_maximum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_minimum_tests(zip_path):
@@ -1172,7 +1282,12 @@ def make_minimum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_binary_op_tests_func(binary_operator):
@@ -1184,6 +1299,51 @@ def make_add_tests(zip_path):
   make_binary_op_tests(zip_path, tf.add)
 
 
+def make_add_n_tests(zip_path):
+  """Make a set of tests for AddN op."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[2, 5, 3, 1]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[5]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Builds the graph given the current parameters."""
+    input_tensors = []
+    for i in range(parameters["num_inputs"]):
+      input_tensors.append(
+          tf.placeholder(
+              dtype=parameters["dtype"],
+              name="input_{}".format(i),
+              shape=parameters["input_shape"]))
+    out = tf.add_n(input_tensors)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Builds operand inputs for op."""
+    input_data = []
+    for i in range(parameters["num_inputs"]):
+      input_data.append(
+          create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+    return input_data, sess.run(
+        outputs, feed_dict={i: d for i, d in zip(inputs, input_data)})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_div_tests(zip_path):
   make_binary_op_tests(zip_path, tf.div)
 
@@ -1197,7 +1357,7 @@ def make_mul_tests(zip_path):
 
 
 def make_pow_tests(zip_path):
-  make_binary_op_tests(zip_path, tf.pow)
+  make_binary_op_tests(zip_path, tf.pow, expected_tf_failures=7)
 
 
 def make_floor_div_tests(zip_path):
@@ -1215,16 +1375,23 @@ def make_squared_difference_tests(zip_path):
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
-  test_parameters = [{
-      # TODO(mgubin): add string tests when they are supported by Toco.
-      # TODO(mgubin): add tests for Nd indices when they are supported by
-      # TfLite.
-      "params_dtype": [tf.float32, tf.int32, tf.int64],
-      "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32, tf.int64],
-      "indices_shape": [[3], [5]],
-      "axis": [-1, 0, 1],
-  }]
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[10], [1, 2, 20]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3], [5]],
+          "axis": [-1, 0, 1],
+      },
+      {
+          # TODO(b/123895910): add Nd support for strings.
+          "params_dtype": [tf.string],
+          "params_shape": [[8]],
+          "indices_dtype": [tf.int32],
+          "indices_shape": [[3]],
+          "axis": [0],
+      }
+  ]
 
   def build_graph(parameters):
     """Build the gather op testing graph."""
@@ -1255,7 +1422,56 @@ def make_gather_tests(zip_path):
       test_parameters,
       build_graph,
       build_inputs,
-      expected_tf_success=60)
+      expected_tf_failures=12)
+
+
+def make_gather_nd_tests(zip_path):
+  """Make a set of tests to do gather_nd."""
+
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 1]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[1, 1]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[2, 1], [2, 2]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5, 10]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3, 1], [2, 2], [2, 3], [2, 1, 3]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the gather_nd op testing graph."""
+    params = tf.placeholder(
+        dtype=parameters["params_dtype"],
+        name="params",
+        shape=parameters["params_shape"])
+    indices = tf.placeholder(
+        dtype=parameters["indices_dtype"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    out = tf.gather_nd(params, indices)
+    return [params, indices], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    params = create_tensor_data(parameters["params_dtype"],
+                                parameters["params_shape"])
+    indices = create_tensor_data(parameters["indices_dtype"],
+                                 parameters["indices_shape"], 0,
+                                 parameters["params_shape"][0] - 1)
+    return [params, indices], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [params, indices])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_gather_with_constant_tests(zip_path):
@@ -1284,8 +1500,7 @@ def make_gather_with_constant_tests(zip_path):
     return [reference_values], sess.run(
         outputs, feed_dict={inputs[0]: reference_values})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=2)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -1421,7 +1636,12 @@ def make_conv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=40)
 
 
 # Note: This is a regression test for a bug (b/122651451) that Toco incorrectly
@@ -1663,7 +1883,12 @@ def make_depthwiseconv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_split_tests(zip_path):
@@ -1686,7 +1911,12 @@ def make_split_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=112)
 
 
 def make_splitv_tests(zip_path):
@@ -1709,7 +1939,12 @@ def make_splitv_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=158)
 
 
 def make_concat_tests(zip_path):
@@ -1751,7 +1986,12 @@ def make_concat_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=60)
 
 
 def make_fully_connected_tests(zip_path):
@@ -1812,7 +2052,12 @@ def make_fully_connected_tests(zip_path):
       values.append(create_tensor_data(np.float32, parameters["shape2"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=10)
 
 
 def make_l2norm_tests(zip_path):
@@ -1842,7 +2087,12 @@ def make_l2norm_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_local_response_norm_tests(zip_path):
@@ -2068,6 +2318,29 @@ def make_shape_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_rank_tests(zip_path):
+  """Make a set of tests to do rank."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[], [0], [1, 1, 1, 3], [2, 3, 4, 5], [5, 5], [10]],
+  }]
+
+  def build_graph(parameters):
+    """Build the rank op testing graph."""
+    input_value = tf.placeholder(dtype=parameters["input_dtype"], name="input")
+    out = tf.rank(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_one_hot_tests(zip_path):
   """Make a set of tests to do one_hot."""
 
@@ -2327,7 +2600,12 @@ def make_space_to_batch_nd_tests(zip_path):
       values.append(np.array(parameters["paddings"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=56)
 
 
 def make_batch_to_space_nd_tests(zip_path):
@@ -2440,7 +2718,12 @@ def make_transpose_tests(zip_path):
       values.append(np.array(parameters["perm"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_squeeze_tests(zip_path):
@@ -2478,10 +2761,48 @@ def make_squeeze_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
+
 
+def make_squeeze_transpose_tests(zip_path):
+  """Make a set of tests to do squeeze followed by transpose."""
 
-def _make_strided_slice_tests(zip_path, test_parameters):
+  test_parameters = [{
+      "dtype": [tf.int32, tf.float32, tf.int64],
+      "input_shape": [[1, 4, 10, 1]],
+      "axis": [[-1], [3]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    out = tf.squeeze(input_tensor, axis=parameters["axis"])
+    out = tf.transpose(out, perm=[1, 2])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=0)
+
+
+def _make_strided_slice_tests(zip_path, test_parameters,
+                              expected_tf_failures=0):
   """Utility function to make strided_slice_tests based on parameters."""
 
   def build_graph(parameters):
@@ -2541,7 +2862,12 @@ def _make_strided_slice_tests(zip_path, test_parameters):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_strided_slice_tests(zip_path):
@@ -2615,7 +2941,7 @@ def make_strided_slice_tests(zip_path):
           "constant_indices": [False],
       },
   ]
-  _make_strided_slice_tests(zip_path, test_parameters)
+  _make_strided_slice_tests(zip_path, test_parameters, expected_tf_failures=2)
 
 
 def make_strided_slice_1d_exhaustive_tests(zip_path):
@@ -2638,7 +2964,10 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
-def make_strided_slice_buggy_tests(zip_path):
+# For verifying https://github.com/tensorflow/tensorflow/issues/23599
+# TODO(chaomei): refactor the test to cover more cases, like negative stride,
+# negative array index etc.
+def make_resolve_constant_strided_slice_tests(zip_path):
   """Make a set of tests to show strided_slice yields incorrect results."""
 
   test_parameters = [{
@@ -2693,7 +3022,7 @@ def make_lstm_tests(zip_path):
           shape=[num_batchs, input_vec_size])
       inputs_after_split.append(one_timestamp_input)
     # Currently lstm identifier has a few limitations: only supports
-    # forget_bias == 0, inner state activiation == tanh.
+    # forget_bias == 0, inner state activation == tanh.
     # TODO(zhixianyan): Add another test with forget_bias == 1.
     # TODO(zhixianyan): Add another test with relu as activation.
     lstm_cell = tf.contrib.rnn.BasicLSTMCell(
@@ -2817,7 +3146,12 @@ def make_arg_min_max_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_equal_tests(zip_path):
@@ -2852,7 +3186,12 @@ def make_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_not_equal_tests(zip_path):
@@ -2886,7 +3225,12 @@ def make_not_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_tests(zip_path):
@@ -2920,7 +3264,12 @@ def make_greater_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_equal_tests(zip_path):
@@ -2954,7 +3303,12 @@ def make_greater_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_tests(zip_path):
@@ -2988,7 +3342,12 @@ def make_less_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_equal_tests(zip_path):
@@ -3022,7 +3381,12 @@ def make_less_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_floor_tests(zip_path):
@@ -3030,7 +3394,7 @@ def make_floor_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32],
-      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+      "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
   }]
 
   def build_graph(parameters):
@@ -3042,6 +3406,31 @@ def make_floor_tests(zip_path):
     out = tf.floor(input_value)
     return [input_value], [out]
 
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_ceil_tests(zip_path):
+  """Make a set of tests to do ceil."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the ceil op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.ceil(input_value)
+    return [input_value], [out]
+
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
@@ -3249,7 +3638,12 @@ def make_slice_tests(zip_path):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=18)
 
 
 def make_conv2d_transpose_tests(zip_path):
@@ -3286,12 +3680,7 @@ def make_conv2d_transpose_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=4)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 # Since compute output_shape is fairly complicated for
@@ -3541,7 +3930,12 @@ def make_pack_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=72)
 
 
 def make_unpack_tests(zip_path):
@@ -3632,13 +4026,18 @@ def make_fill_tests(zip_path):
     return [input1, input2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input1, input2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
 
 
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
-  def logical(zip_path):
+  def logical(zip_path, expected_tf_failures=0):
     """Generate examples."""
     test_parameters = [{
         "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]),
@@ -3663,19 +4062,24 @@ def _make_logical_tests(op):
       return [input_value1, input_value2], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
 
   return logical
 
 
 def make_logical_or_tests(zip_path):
   """Make a set of tests to do logical_or."""
-  return _make_logical_tests(tf.logical_or)(zip_path)
+  return _make_logical_tests(tf.logical_or)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_and_tests(zip_path):
   """Make a set of tests to do logical_and."""
-  return _make_logical_tests(tf.logical_and)(zip_path)
+  return _make_logical_tests(tf.logical_and)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_xor_tests(zip_path):
@@ -3683,7 +4087,7 @@ def make_logical_xor_tests(zip_path):
 
     Test logical_not as well.
   """
-  return _make_logical_tests(tf.logical_xor)(zip_path)
+  return _make_logical_tests(tf.logical_xor)(zip_path, expected_tf_failures=1)
 
 
 def make_mirror_pad_tests(zip_path):
@@ -3732,6 +4136,12 @@ def make_mirror_pad_tests(zip_path):
           "mode": ["REFLECT"],
           "type": ["const"]
       },
+      {
+          "input_shape": [[3, 2, 4, 5]],
+          "padding_matrix": [[[1, 1], [2, 2], [1, 1], [1, 1]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
   ]
 
   def build_graph(parameters):
@@ -3760,18 +4170,23 @@ def make_mirror_pad_tests(zip_path):
     return input_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, input_values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=7)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_unroll_batch_matmul_tests(zip_path):
   """Make a set of tests to test unroll_batch_matmul."""
 
-  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "shape": [[(2, 2, 3), (2, 3, 2), False, False],
+                [(2, 2, 3), (2, 3, 2), True, True],
+                [(2, 2, 3), (2, 2, 3), False, True],
+                [(2, 2, 3), (2, 2, 3), True, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), False, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), True, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), False, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), True, False]]
+  }]
 
   def build_graph(parameters):
     """Build the batch_matmul op testing graph."""
@@ -3780,7 +4195,11 @@ def make_unroll_batch_matmul_tests(zip_path):
     input_tensor2 = tf.placeholder(
         dtype=parameters["dtype"], shape=parameters["shape"][1])
     # Should be unrolled and replaced with fully_connected ops in the end.
-    out = tf.matmul(input_tensor1, input_tensor2)
+    out = tf.matmul(
+        input_tensor1,
+        input_tensor2,
+        transpose_a=parameters["shape"][2],
+        transpose_b=parameters["shape"][3])
     return [input_tensor1, input_tensor2], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -3817,8 +4236,7 @@ def make_placeholder_with_default_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=3)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_unique_tests(zip_path):
@@ -3863,12 +4281,235 @@ def make_unique_tests(zip_path):
     return input_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, input_values)))
 
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_reverse_v2_tests(zip_path):
+  """Make a set of tests to do reverse_v2."""
+
+  test_parameters = [{
+      "base_shape": [[3, 4, 3], [3, 4], [5, 6, 7, 8]],
+      "axis": [0, 1, 2, 3],
+  }]
+
+  def get_valid_axis(parameters):
+    """Return a tweaked version of 'axis'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    while axis > len(shape) - 1:
+      axis -= 1
+    return axis
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name=("input"), shape=parameters["base_shape"])
+    outs = tf.reverse(input_tensor, axis=[get_valid_axis(parameters)])
+    return [input_tensor], [outs]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(np.float32, shape=parameters["base_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_reverse_sequence_tests(zip_path):
+  """Make a set of tests to do reverse_sequence."""
+
+  test_parameters = [
+      {
+          "input_dtype": [tf.float32, tf.int32, tf.int64],
+          "input_shape": [[8, 4, 5, 5, 6], [4, 4, 3, 5]],
+          "seq_lengths": [[2, 2, 2, 2], [2, 1, 1, 0]],
+          "seq_axis": [0, 3],
+          "batch_axis": [1]
+      },
+      {
+          "input_dtype": [tf.float32],
+          "input_shape": [[2, 4, 5, 5, 6]],
+          "seq_lengths": [[2, 1]],
+          "seq_axis": [2],
+          "batch_axis": [0]
+      },
+      {
+          "input_dtype": [tf.float32],
+          "input_shape": [[4, 2]],
+          "seq_lengths": [[3, 1]],
+          "seq_axis": [0],
+          "batch_axis": [1]
+      }]
+
+  def build_graph(parameters):
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    outs = tf.reverse_sequence(
+        input_value,
+        seq_lengths=parameters["seq_lengths"],
+        batch_axis=parameters["batch_axis"],
+        seq_axis=parameters["seq_axis"])
+    return [input_value], [outs]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+@test_util.enable_control_flow_v2
+def make_unidirectional_sequence_lstm_tests(zip_path):
+  """Make a set of tests to do unidirectional_sequence_lstm."""
+
+  test_parameters = [{
+      "batch_size": [2, 4, 6],
+      "seq_length": [1, 3],
+      "units": [4, 5],
+      "use_peepholes": [False, True],
+      "is_dynamic_rnn": [False, True]
+  }]
+
+  def build_graph(parameters):
+    input_values = []
+    if parameters["is_dynamic_rnn"]:
+      shape = [
+          parameters["seq_length"], parameters["batch_size"],
+          parameters["units"]
+      ]
+      input_value = tf.placeholder(dtype=tf.float32, name="input", shape=shape)
+      input_values.append(input_value)
+      lstm_cell = tf.lite.experimental.nn.TFLiteLSTMCell(
+          parameters["units"],
+          use_peepholes=parameters["use_peepholes"])
+      outs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          lstm_cell, input_value, dtype=tf.float32, time_major=True)
+      outs = tf.unstack(outs, axis=1)
+    else:
+      shape = [parameters["batch_size"], parameters["units"]]
+      for i in range(parameters["seq_length"]):
+        input_value = tf.placeholder(
+            dtype=tf.float32, name=("input_%d" % i), shape=shape)
+        input_values.append(input_value)
+      lstm_cell = tf.lite.experimental.nn.TFLiteLSTMCell(
+          parameters["units"], use_peepholes=parameters["use_peepholes"])
+      outs, _ = tf.nn.static_rnn(lstm_cell, input_values, dtype=tf.float32)
+
+    real_output = tf.zeros([1], dtype=tf.float32) + outs[-1]
+    return input_values, [real_output]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = []
+    if parameters["is_dynamic_rnn"]:
+      shape = [
+          parameters["seq_length"], parameters["batch_size"],
+          parameters["units"]
+      ]
+      input_value = create_tensor_data(tf.float32, shape)
+      input_values.append(input_value)
+    else:
+      shape = [parameters["batch_size"], parameters["units"]]
+      for i in range(parameters["seq_length"]):
+        input_value = create_tensor_data(tf.float32, shape)
+        input_values.append(input_value)
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    # Tflite fused kernel takes input as [time, batch, input].
+    # For static unidirectional sequence lstm, the input is an array sized of
+    # time, and pack the array together, however, for time = 1, the input is
+    # not packed.
+    tflite_input_values = input_values
+    if not parameters["is_dynamic_rnn"] and parameters["seq_length"] == 1:
+      tflite_input_values = [
+          input_values[0].reshape((1, parameters["batch_size"],
+                                   parameters["units"]))
+      ]
+    return tflite_input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      use_frozen_graph=True)
+
+
+@test_util.enable_control_flow_v2
+def make_unidirectional_sequence_rnn_tests(zip_path):
+  """Make a set of tests to do unidirectional_sequence_rnn."""
+
+  test_parameters = [{
+      "batch_size": [2, 4, 6],
+      "seq_length": [1, 3],
+      "units": [4, 5],
+      "is_dynamic_rnn": [False, True]
+  }]
+
+  def build_graph(parameters):
+    input_values = []
+    if parameters["is_dynamic_rnn"]:
+      shape = [
+          parameters["seq_length"], parameters["batch_size"],
+          parameters["units"]
+      ]
+      input_value = tf.placeholder(dtype=tf.float32, name="input", shape=shape)
+      input_values.append(input_value)
+      rnn_cell = tf.lite.experimental.nn.TfLiteRNNCell(parameters["units"])
+      outs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          rnn_cell, input_value, dtype=tf.float32, time_major=True)
+      outs = tf.unstack(outs, axis=1)
+    else:
+      shape = [parameters["batch_size"], parameters["units"]]
+      for i in range(parameters["seq_length"]):
+        input_value = tf.placeholder(
+            dtype=tf.float32, name=("input_%d" % i), shape=shape)
+        input_values.append(input_value)
+      rnn_cell = tf.lite.experimental.nn.TfLiteRNNCell(parameters["units"])
+      outs, _ = tf.nn.static_rnn(rnn_cell, input_values, dtype=tf.float32)
+
+    real_output = tf.zeros([1], dtype=tf.float32) + outs[-1]
+    return input_values, [real_output]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = []
+    if parameters["is_dynamic_rnn"]:
+      shape = [
+          parameters["seq_length"], parameters["batch_size"],
+          parameters["units"]
+      ]
+      input_value = create_tensor_data(tf.float32, shape)
+      input_values.append(input_value)
+    else:
+      shape = [parameters["batch_size"], parameters["units"]]
+      for i in range(parameters["seq_length"]):
+        input_value = create_tensor_data(tf.float32, shape)
+        input_values.append(input_value)
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    # Tflite fused kernel takes input as [time, batch, input].
+    # For static unidirectional sequence rnn, the input is an array sized of
+    # time, and pack the array together, however, for time = 1, the input is
+    # not packed.
+    tflite_input_values = input_values
+    if not parameters["is_dynamic_rnn"] and parameters["seq_length"] == 1:
+      tflite_input_values = [
+          input_values[0].reshape((1, parameters["batch_size"],
+                                   parameters["units"]))
+      ]
+    return tflite_input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
   make_zip_of_tests(
       zip_path,
       test_parameters,
       build_graph,
       build_inputs,
-      expected_tf_success=9)
+      use_frozen_graph=True)
+
 
 # Toco binary path provided by the generate rule.
 bin_path = None
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index 45bd59a67d10baf61ad981f2fef29e948c2e77d2..fb98cc9b1725f8295bb060ae60ceb151569616e6 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -102,9 +102,6 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/add.*dtype=tf\.int64)", "119126484"},
     {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
     {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
-
-    // Strided Slice chooses the wrong dimension.
-    {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
 // Additional list of tests that are expected to fail when
diff --git a/tensorflow/lite/testing/join.h b/tensorflow/lite/testing/join.h
index d1c314608687f045b346cc5526ea46c8149c2755..d10d2909b5ec4a269fd1a67d7a22f4c1e76f707e 100644
--- a/tensorflow/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -24,7 +24,21 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-// Join a list of data separated by delimiter.
+// Join a list of data with default precision separated by delimiter.
+template <typename T>
+string JoinDefault(T* data, size_t len, const string& delimiter) {
+  if (len == 0 || data == nullptr) {
+    return "";
+  }
+  std::stringstream result;
+  result << data[0];
+  for (int i = 1; i < len; i++) {
+    result << delimiter << data[i];
+  }
+  return result.str();
+}
+
+// Join a list of data with fixed precision separated by delimiter.
 template <typename T>
 string Join(T* data, size_t len, const string& delimiter) {
   if (len == 0 || data == nullptr) {
diff --git a/tensorflow/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
index 0b3c07f37e14e3815ac1eb4acd0aefac3515064c..476a7f20591691ccddff6829c894c640608f6471 100644
--- a/tensorflow/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -26,6 +26,11 @@ TEST(JoinTest, JoinInt) {
   EXPECT_EQ(Join(data.data(), data.size(), ","), "1,2,3");
 }
 
+TEST(JoinDefaultTest, JoinFloat) {
+  float data[] = {1.0, -3, 2.3, 1e-5};
+  EXPECT_EQ(JoinDefault(data, 4, " "), "1 -3 2.3 1e-05");
+}
+
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
   EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c46e80cc360043158928544a54c0221a7b405ad0
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/BUILD
@@ -0,0 +1,124 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "util",
+    hdrs = ["util.h"],
+    deps = [
+        ":input_generator",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+        "//tensorflow/lite/testing:tflite_driver",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":util",
+        "//tensorflow/lite/testing:tflite_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_binary(
+    name = "tflite_kernel_runner",
+    srcs = ["tflite_kernel_runner.cc"],
+    deps = [
+        ":util",
+    ],
+)
+
+tf_cc_binary(
+    name = "generate_diff_report",
+    srcs = ["generate_diff_report.cc"],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:framework_internal",
+    ],
+)
+
+cc_library(
+    name = "input_generator",
+    srcs = ["input_generator.cc"],
+    hdrs = ["input_generator.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:join",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "input_generator_test",
+    size = "small",
+    srcs = ["input_generator_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":input_generator",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "diff_analyzer",
+    srcs = ["diff_analyzer.cc"],
+    hdrs = ["diff_analyzer.h"],
+    deps = [
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "diff_analyzer_test",
+    size = "small",
+    srcs = ["diff_analyzer_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d6fcc80be17b4020f53dddb8215a083031fd501
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
@@ -0,0 +1,115 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <cmath>
+#include <fstream>
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+float CalculateNormalizedMaxDiff(const std::vector<float>& base,
+                                 const std::vector<float>& test) {
+  float diff = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    diff = std::max(diff, std::abs(base[i] - test[i]));
+    base_max = std::max(base_max, base[i]);
+  }
+
+  return diff / base_max;
+}
+
+float CalculateNormalizedL2Norm(const std::vector<float>& base,
+                                const std::vector<float>& test) {
+  float l2_error = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    float diff = base[i] - test[i];
+    l2_error += diff * diff;
+    base_max = std::max(base_max, base[i]);
+  }
+
+  l2_error /= base.size();
+
+  return std::sqrt(l2_error) / base_max;
+}
+
+TfLiteStatus Populate(const string& filename,
+                      std::vector<std::vector<float>>* tensors) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream file(filename);
+  string content;
+  while (std::getline(file, content, '\n')) {
+    tensors->push_back(Split<float>(content, ","));
+  }
+
+  file.close();
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteStatus DiffAnalyzer::ReadFiles(const string& base, const string& test) {
+  TF_LITE_ENSURE_STATUS(Populate(base, &base_tensors_));
+  TF_LITE_ENSURE_STATUS(Populate(test, &test_tensors_));
+
+  if (base_tensors_.size() != test_tensors_.size()) {
+    fprintf(stderr, "Golden and test tensor dimensions don't match.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus DiffAnalyzer::WriteReport(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty output file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  output_file << "Normalized L2 Error"
+              << ","
+              << "Normalized Max Diff"
+              << "\n";
+  for (int i = 0; i < base_tensors_.size(); i++) {
+    float l2_error =
+        CalculateNormalizedL2Norm(base_tensors_[i], test_tensors_[i]);
+    float max_diff =
+        CalculateNormalizedMaxDiff(base_tensors_[i], test_tensors_[i]);
+    output_file << l2_error << "," << max_diff << "\n";
+  }
+
+  output_file.close();
+  return kTfLiteOk;
+}
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.h b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aecbaea449bda3edd1e5176b9a91b4542afc64f3
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Reads the baseline and test files with output tensor values, and calculates
+// the diff metrics.
+class DiffAnalyzer {
+ public:
+  DiffAnalyzer() = default;
+  TfLiteStatus ReadFiles(const string& base, const string& test);
+  TfLiteStatus WriteReport(const string& filename);
+
+ private:
+  std::vector<std::vector<float>> base_tensors_;
+  std::vector<std::vector<float>> test_tensors_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da54b38acf82a9647c545e8577b09188ea54934f
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <fstream>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(DiffAnalyzerTest, ZeroDiff) {
+  DiffAnalyzer diff_analyzer;
+  string filename = "tensorflow/lite/testdata/test_input.csv";
+  ASSERT_EQ(diff_analyzer.ReadFiles(filename, filename), kTfLiteOk);
+
+  string output_file =
+      tensorflow::io::JoinPath(FLAGS_test_tmpdir + "diff_report.csv");
+  ASSERT_EQ(diff_analyzer.WriteReport(output_file), kTfLiteOk);
+
+  std::string content;
+  std::ifstream file(output_file);
+  std::getline(file, content);
+  std::getline(file, content);
+  ASSERT_EQ(content, "0,0");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/generate_diff_report.cc b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afa6a9a94ec2ffd824d66a363c53b69455706d06
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+int main(int argc, char** argv) {
+  string base, test, output;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("base", &base, "Path to the base serialized tensor."),
+      tensorflow::Flag("test", &test, "Path to the test serialized tensor."),
+      tensorflow::Flag("output", &output, "Path to the output file."),
+  };
+  tensorflow::Flags::Parse(&argc, argv, flag_list);
+
+  tflite::testing::DiffAnalyzer diff_analyzer;
+  diff_analyzer.ReadFiles(base, test);
+  diff_analyzer.WriteReport(output);
+  return 0;
+}
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.cc b/tensorflow/lite/testing/kernel_test/input_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c69bdff86bbacbbb343e48b8c45d20811463620
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.cc
@@ -0,0 +1,208 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <limits>
+#include <random>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/testing/join.h"
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+template <typename T>
+std::vector<T> GenerateRandomTensor(TfLiteIntArray* dims,
+                                    const std::function<T(int)>& random_func) {
+  int64_t num_elements = 1;
+  for (int i = 0; i < dims->size; i++) {
+    num_elements *= dims->data[i];
+  }
+
+  std::vector<T> result(num_elements);
+  for (int i = 0; i < num_elements; i++) {
+    result[i] = random_func(i);
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<T> GenerateUniform(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    // TODO(yunluli): Change seed for each invocation if needed.
+    // Used rand() instead of rand_r() here to make it runnable on android.
+    return min + (max - min) * static_cast<float>(rand()) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+template <typename T>
+std::vector<T> GenerateGaussian(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    static std::default_random_engine generator;
+    // We generate a float number within [0, 1) following a mormal distribution
+    // with mean = 0.5 and stddev = 1/3, and use it to scale the final random
+    // number into the desired range.
+    static std::normal_distribution<double> distribution(0.5, 1.0 / 3);
+    auto rand_n = distribution(generator);
+    while (rand_n < 0 || rand_n >= 1) {
+      rand_n = distribution(generator);
+    }
+
+    return min + (max - min) * static_cast<float>(rand_n);
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+}  // namespace
+
+TfLiteStatus InputGenerator::LoadModel(const string& model_dir) {
+  model_ = FlatBufferModel::BuildFromFile(model_dir.c_str());
+  if (!model_) {
+    fprintf(stderr, "Cannot load model %s", model_dir.c_str());
+    return kTfLiteError;
+  }
+
+  ::tflite::ops::builtin::BuiltinOpResolver builtin_ops;
+  InterpreterBuilder(*model_, builtin_ops)(&interpreter_);
+  if (!interpreter_) {
+    fprintf(stderr, "Failed to build interpreter.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::ReadInputsFromFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream input_file(filename);
+  string input;
+  while (std::getline(input_file, input, '\n')) {
+    inputs_.push_back(input);
+  }
+  input_file.close();
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::WriteInputsToFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  for (const auto& input : inputs_) {
+    output_file << input << "\n";
+  }
+  output_file.close();
+
+  return kTfLiteOk;
+}
+
+// TODO(yunluli): Support more tensor types when needed.
+TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
+  auto input_tensor_ids = interpreter_->inputs();
+  for (auto id : input_tensor_ids) {
+    auto* tensor = interpreter_->tensor(id);
+    if (distribution == "UNIFORM") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateUniform<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateUniform<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateUniform<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else if (distribution == "GAUSSIAN") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateGaussian<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateGaussian<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateGaussian<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else {
+      fprintf(stderr, "Unsupported distribution %s.", distribution.c_str());
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+std::vector<string> InputGenerator::GetInputs() { return inputs_; }
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.h b/tensorflow/lite/testing/kernel_test/input_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..859c7068e5448c837580fe79e89918fbd34c2a66
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Generate random input, or read input from a file for kernel diff test.
+// Needs to load the tflite graph to get information like tensor shape and
+// data type.
+class InputGenerator {
+ public:
+  InputGenerator() = default;
+  TfLiteStatus LoadModel(const string& model_dir);
+  TfLiteStatus ReadInputsFromFile(const string& filename);
+  TfLiteStatus GenerateInput(const string& distribution);
+  std::vector<string> GetInputs();
+  TfLiteStatus WriteInputsToFile(const string& filename);
+
+ private:
+  std::unique_ptr<FlatBufferModel> model_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::vector<string> inputs_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
diff --git a/tensorflow/lite/testing/kernel_test/input_generator_test.cc b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2223a9196ff7d6b790e2e7a0170d42c5d5468a5f
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <map>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(InputGeneratorTest, LoadModel) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+}
+
+TEST(InputGeneratorTest, ReadWriteSimpleFile) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.ReadInputsFromFile(
+                "tensorflow/lite/testdata/test_input.csv"),
+            kTfLiteOk);
+
+  std::vector<string> inputs;
+  std::string content = "1";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    content.append(",1");
+  }
+  inputs.push_back(content);
+  ASSERT_EQ(input_generator.GetInputs(), inputs);
+
+  auto output_filename = FLAGS_test_tmpdir + "/out.csv";
+  ASSERT_EQ(input_generator.WriteInputsToFile(output_filename), kTfLiteOk);
+
+  std::ifstream in(output_filename);
+  std::string out;
+  std::getline(in, out, '\n');
+  ASSERT_EQ(out, content);
+}
+
+TEST(InputGeneratorTest, GenerateUniformInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("UNIFORM");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+TEST(InputGeneratorTest, GenerateGaussianInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("GAUSSIAN");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c1728ed1da6ec962989479dccfdc64bc8ca6cd
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+int main(int argc, char** argv) {
+  tflite::testing::kernel_test::TestOptions options =
+      tflite::testing::kernel_test::ParseTfliteKernelTestFlags(&argc, argv);
+  const bool run_reference_kernel = options.kernel_type == "REFERENCE";
+  const bool use_nnapi = options.kernel_type == "NNAPI";
+
+  auto runner = absl::make_unique<tflite::testing::TfLiteDriver>(
+      use_nnapi, "", run_reference_kernel);
+  if (tflite::testing::kernel_test::RunKernelTest(options, runner.get()) ==
+      kTfLiteOk) {
+    return 0;
+  }
+
+  return -1;
+}
diff --git a/tensorflow/lite/testing/kernel_test/util.h b/tensorflow/lite/testing/kernel_test/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d940e5ad12f497ec827ce0dc6be9e6311078b1a9
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util.h
@@ -0,0 +1,122 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+
+#include <fstream>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+
+struct TestOptions {
+  // Path of tensorflow lite model.
+  string tflite_model;
+  // Path of the input file. If empty, generate at runtime.
+  string read_input_from_file;
+  // Path to dump the input file.
+  string dump_input_to_file;
+  // Path to dump the output.
+  string dump_output_to_file;
+  // Input distribution.
+  string input_distribution;
+  // Kernel type.
+  string kernel_type;
+};
+
+TestOptions ParseTfliteKernelTestFlags(int* argc, char** argv) {
+  TestOptions options;
+  std::vector<tensorflow::Flag> flags = {
+      tensorflow::Flag("tflite_model", &options.tflite_model,
+                       "Path of tensorflow lite model."),
+      tensorflow::Flag("read_input_from_file", &options.read_input_from_file,
+                       "File to read input data from. If empty, generates "
+                       "input at runtime."),
+      tensorflow::Flag("dump_input_to_file", &options.dump_input_to_file,
+                       "File to dump randomly generated input."),
+      tensorflow::Flag("dump_output_to_file", &options.dump_output_to_file,
+                       "File to dump output."),
+      tensorflow::Flag("input_distribution", &options.input_distribution,
+                       "Input distribution. Default: Gaussian."),
+      tensorflow::Flag("kernel_type", &options.kernel_type, "Kernel type."),
+  };
+
+  tensorflow::Flags::Parse(argc, argv, flags);
+
+  return options;
+}
+
+TfLiteStatus RunKernelTest(const kernel_test::TestOptions& options,
+                           TestRunner* runner) {
+  InputGenerator input_generator;
+
+  if (options.read_input_from_file.empty()) {
+    TF_LITE_ENSURE_STATUS(input_generator.LoadModel(options.tflite_model));
+    TF_LITE_ENSURE_STATUS(
+        input_generator.GenerateInput(options.input_distribution));
+  } else {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.ReadInputsFromFile(options.read_input_from_file));
+  }
+
+  runner->LoadModel(options.tflite_model);
+  runner->AllocateTensors();
+  if (!runner->IsValid()) return kTfLiteError;
+  auto input_tensor_ids = runner->GetInputs();
+  auto inputs = input_generator.GetInputs();
+  if (inputs.size() != input_tensor_ids.size()) {
+    fprintf(stderr,
+            "Number of input tensors generated doesn't match what the model "
+            "asks for.");
+  }
+  for (int i = 0; i < inputs.size(); i++) {
+    runner->SetInput(input_tensor_ids[i], inputs[i]);
+  }
+
+  runner->Invoke();
+
+  if (!options.dump_input_to_file.empty()) {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.WriteInputsToFile(options.dump_input_to_file));
+  }
+
+  if (!options.dump_output_to_file.empty()) {
+    std::ofstream output_file;
+    output_file.open(options.dump_output_to_file,
+                     std::fstream::out | std::fstream::trunc);
+    if (!output_file) {
+      return kTfLiteError;
+    }
+
+    for (auto id : runner->GetOutputs()) {
+      output_file << runner->ReadOutput(id) << "\n";
+    }
+    output_file.close();
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
diff --git a/tensorflow/lite/testing/kernel_test/util_test.cc b/tensorflow/lite/testing/kernel_test/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbec66092837726e90fb6d37135ba990c7cbdb86
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util_test.cc
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+#include <fstream>
+#include <memory>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+namespace {
+
+TEST(UtilTest, SimpleE2ETest) {
+  TestOptions options;
+  options.tflite_model = "tensorflow/lite/testdata/add.bin";
+  options.read_input_from_file =
+      "tensorflow/lite/testdata/test_input.csv";
+  options.dump_output_to_file = FLAGS_test_tmpdir + "/test_out.csv";
+  options.kernel_type = "REFERENCE";
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(false, "", true));
+  RunKernelTest(options, runner.get());
+  std::string expected = "3";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    expected.append(",3");
+  }
+  std::string content;
+  std::ifstream file(options.dump_output_to_file);
+  std::getline(file, content);
+  EXPECT_EQ(content, expected);
+}
+
+}  // namespace
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
index 804e328d9da248859e806bd070de26a8f5aa37b4..95bda42cd71abe4cecb41f34859cf6990e851ed7 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
@@ -25,13 +26,50 @@ from tensorflow.lite.python import convert_saved_model as _convert_saved_model
 from tensorflow.lite.python import lite as _lite
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
+from tensorflow.python.keras.preprocessing import image
 from tensorflow.python.lib.io import file_io as _file_io
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.saved_model import load as _load
 from tensorflow.python.saved_model import loader as _loader
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
 
 
+def get_filepath(filename, base_dir=None):
+  """Returns the full path of the filename.
+
+  Args:
+    filename: Subdirectory and name of the model file.
+    base_dir: Base directory containing model file.
+
+  Returns:
+    str.
+  """
+  if base_dir is None:
+    base_dir = "learning/brain/mobile/tflite_compat_models"
+  return os.path.join(_resource_loader.get_root_dir_with_all_resources(),
+                      base_dir, filename)
+
+
+def get_image(size):
+  """Returns an image loaded into an np.ndarray with dims [1, size, size, 3].
+
+  Args:
+    size: Size of image.
+
+  Returns:
+    np.ndarray.
+  """
+  img_filename = _resource_loader.get_path_to_datafile(
+      "testdata/grace_hopper.jpg")
+  img = image.load_img(img_filename, target_size=(size, size))
+  img_array = image.img_to_array(img)
+  img_array = np.expand_dims(img_array, axis=0)
+  return img_array
+
+
 def _convert(converter, **kwargs):
   """Converts the model.
 
@@ -186,6 +224,32 @@ def compare_models(tflite_model, tf_eval_func, input_data=None, tolerance=5):
     np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
 
 
+def compare_models_v2(tflite_model, concrete_func, input_data=None,
+                      tolerance=5):
+  """Compares TensorFlow and TFLite models for TensorFlow 2.0.
+
+  Unless the input data is provided, the models are compared with random data.
+  Currently only 1 input and 1 output are supported by this function.
+
+  Args:
+    tflite_model: Serialized TensorFlow Lite model.
+    concrete_func: TensorFlow ConcreteFunction.
+    input_data: np.ndarray to pass into models during inference. (default None)
+    tolerance: Decimal place to check accuracy to. (default 5)
+  """
+  if input_data is None:
+    input_data = _generate_random_input_data(tflite_model)
+  input_data_func = constant_op.constant(input_data[0])
+
+  # Gets the TensorFlow results as a map from the output names to outputs.
+  # Converts the map into a list that is equivalent to the TFLite list.
+  tf_results_map = concrete_func(input_data_func)
+  tf_results = [tf_results_map[tf_results_map.keys()[0]]]
+  tflite_results = _evaluate_tflite_model(tflite_model, input_data)
+  for tf_result, tflite_result in zip(tf_results, tflite_results):
+    np.testing.assert_almost_equal(tf_result, tflite_result, tolerance)
+
+
 def test_frozen_graph_quant(filename,
                             input_arrays,
                             output_arrays,
@@ -315,6 +379,39 @@ def test_saved_model(directory,
   compare_models(tflite_model, tf_eval_func, input_data=input_data)
 
 
+# TODO(nupurgarg): Remove input_shape parameter after bug with shapes is fixed.
+def test_saved_model_v2(directory,
+                        input_shape=None,
+                        tag_set=None,
+                        signature_key=None,
+                        input_data=None,
+                        **kwargs):
+  """Validates the TensorFlow SavedModel converts to a TFLite model.
+
+  Converts the TensorFlow SavedModel to TFLite and checks the accuracy of the
+  model on random data.
+
+  Args:
+    directory: SavedModel directory to convert.
+    input_shape: Input shape for the single input array as a list of integers.
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present.
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    input_data: np.ndarray to pass into models during inference. (default None)
+    **kwargs: Additional arguments to be passed into the converter.
+  """
+  model = _load.load(directory, tags=tag_set)
+  if not signature_key:
+    signature_key = _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  concrete_func = model.signatures[signature_key]
+  concrete_func.inputs[0].set_shape(input_shape)
+
+  converter = _lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+  tflite_model = _convert(converter, **kwargs)
+
+  compare_models_v2(tflite_model, concrete_func, input_data=input_data)
+
+
 def test_keras_model(filename,
                      input_arrays=None,
                      input_shapes=None,
diff --git a/tensorflow/lite/testing/nnapi_example.cc b/tensorflow/lite/testing/nnapi_example.cc
index 0ff1fbcd411d86738faaddfb740ddf1fc500590a..309cb19628cd54a39ea926a6f3506cf570ff3679 100644
--- a/tensorflow/lite/testing/nnapi_example.cc
+++ b/tensorflow/lite/testing/nnapi_example.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include <iostream>
 #include <sstream>
 #include <string>
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 #include "tensorflow/lite/testing/parse_testdata.h"
 #include "tensorflow/lite/testing/tflite_driver.h"
 
diff --git a/tensorflow/lite/testing/string_util.cc b/tensorflow/lite/testing/string_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf9d5087644cc52415a83dd80b457249b85765b5
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.cc
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/testing/string_util.h"
+
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* value) {
+  DynamicBuffer dynamic_buffer;
+  if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+    return nullptr;
+  }
+
+  char* char_buffer = nullptr;
+  size_t size = dynamic_buffer.WriteToBuffer(&char_buffer);
+  string s = absl::BytesToHexString({char_buffer, size});
+  free(char_buffer);
+
+  return python_utils::ConvertToPyString(s.data(), s.size());
+}
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/string_util.h b/tensorflow/lite/testing/string_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..56c024d918df37641c12851a2a02187d12e03b7d
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+
+#include <Python.h>
+#include <string>
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+// Take a python string array, convert it to TF Lite dynamic buffer format and
+// serialize it as a HexString.
+PyObject* SerializeAsHexString(PyObject* value);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
diff --git a/tensorflow/lite/testing/string_util.i b/tensorflow/lite/testing/string_util.i
new file mode 100644
index 0000000000000000000000000000000000000000..574abb79653ff858721e28d0d33225e3e24cbbfd
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.i
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/lite/testing/string_util.h"
+
+%}
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* string_tensor);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index a637dc86c020d4e16fb4fc02e9f62e8dec6a3a25..55670858338bda0bfe04828c33da6c64982a6656 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/join.h"
 #include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
@@ -383,5 +384,34 @@ void TfLiteDriver::ResetLSTMStateTensors() {
   interpreter_->ResetVariableTensors();
 }
 
+string TfLiteDriver::ReadOutput(int id) {
+  auto* tensor = interpreter_->tensor(id);
+  int num_elements = 1;
+
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    num_elements *= tensor->dims->data[i];
+  }
+
+  switch (tensor->type) {
+    case kTfLiteFloat32:
+      return JoinDefault(tensor->data.f, num_elements, ",");
+    case kTfLiteInt32:
+      return JoinDefault(tensor->data.i32, num_elements, ",");
+    case kTfLiteInt64:
+      return JoinDefault(tensor->data.i64, num_elements, ",");
+    case kTfLiteUInt8:
+      return Join(tensor->data.uint8, num_elements, ",");
+    case kTfLiteInt8:
+      return JoinDefault(tensor->data.int8, num_elements, ",");
+    case kTfLiteBool:
+      return JoinDefault(tensor->data.b, num_elements, ",");
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::ReadOutput"));
+      return "";
+  }
+}
+
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 537f20dfbfd6c6fe0fbefd854358146129d33b7a..3cce6c4222ec36f5eac2f144062b5b850c326345 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -49,7 +49,7 @@ class TfLiteDriver : public TestRunner {
   void SetExpectation(int id, const string& csv_values) override;
   void Invoke() override;
   bool CheckResults() override;
-  string ReadOutput(int id) override { return "no-op"; }
+  string ReadOutput(int id) override;
 
  private:
   void DeallocateStringTensor(TfLiteTensor* t) {
diff --git a/tensorflow/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
index 81bf6700cb898796a72bea38ea0711556a7215a5..e80816bdf5ecd21d4f147e824188dd3a206d68dd 100644
--- a/tensorflow/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -54,6 +54,8 @@ TEST(TfliteDriverTest, SimpleTest) {
   ASSERT_TRUE(runner->IsValid());
 
   ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
 }
 
 TEST(TfliteDriverTest, SingleAddOpTest) {
@@ -88,6 +90,8 @@ TEST(TfliteDriverTest, SingleAddOpTest) {
   ASSERT_TRUE(runner->IsValid());
 
   ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tflite_exported_symbols.lds b/tensorflow/lite/tflite_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..b145204aa1e2b039aa7075047b1fd9ca73157320
--- /dev/null
+++ b/tensorflow/lite/tflite_exported_symbols.lds
@@ -0,0 +1,3 @@
+*TfLite*
+*tflite*
+*TFL_*
diff --git a/tensorflow/lite/tflite_static.bp b/tensorflow/lite/tflite_static.bp
index e9a8378b810b411cd0d0daaba437a4337131393b..3aca8f9c68c8c3ccde454b77fa974c72bf41638a 100644
--- a/tensorflow/lite/tflite_static.bp
+++ b/tensorflow/lite/tflite_static.bp
@@ -28,6 +28,8 @@ cc_library_static {
         "delegates/nnapi/nnapi_delegate.cc",
 	"graph_info.cc",
         "interpreter.cc",
+        "minimal_logging.cc",
+        "minimal_logging_android.cc",
         "mmap_allocation.cc",
         "model.cc",
         "mutable_op_resolver.cc",
@@ -42,12 +44,14 @@ cc_library_static {
 	"kernels/topk_v2.cc",
         "kernels/activations.cc",
         "kernels/add.cc",
+        "kernels/add_n.cc",
         "kernels/arg_min_max.cc",
         "kernels/basic_rnn.cc",
         "kernels/batch_to_space_nd.cc",
         "kernels/bidirectional_sequence_lstm.cc",
         "kernels/bidirectional_sequence_rnn.cc",
         "kernels/cast.cc",
+        "kernels/ceil.cc",
         "kernels/comparisons.cc",
         "kernels/concatenation.cc",
         "kernels/conv.cc",
@@ -67,11 +71,12 @@ cc_library_static {
         "kernels/floor_mod.cc",
         "kernels/fully_connected.cc",
         "kernels/gather.cc",
+        "kernels/gather_nd.cc",
         "kernels/gemm_support.cc",
         "kernels/hashtable_lookup.cc",
+        "kernels/if.cc",
         "kernels/kernel_util.cc",
         "kernels/l2norm.cc",
-        "kernels/layer_norm_lstm.cc",
         "kernels/local_response_norm.cc",
         "kernels/logical.cc",
         "kernels/lsh_projection.cc",
@@ -87,12 +92,14 @@ cc_library_static {
         "kernels/pooling.cc",
         "kernels/pow.cc",
         "kernels/range.cc",
+        "kernels/rank.cc",
         "kernels/reduce.cc",
-        "kernels/relu1.cc",
         "kernels/register.cc",
         "kernels/reshape.cc",
         "kernels/resize_bilinear.cc",
         "kernels/resize_nearest_neighbor.cc",
+        "kernels/reverse.cc",
+        "kernels/reverse_sequence.cc",
         "kernels/select.cc",
         "kernels/shape.cc",
         "kernels/skip_gram.cc",
@@ -113,12 +120,15 @@ cc_library_static {
         "kernels/unidirectional_sequence_rnn.cc",
         "kernels/unique.cc",
         "kernels/unpack.cc",
+        "kernels/where.cc",
+        "kernels/while.cc",
         "kernels/zeros_like.cc",
         "kernels/internal/kernel_utils.cc",
         "kernels/internal/tensor_utils.cc",
         "kernels/internal/quantization_util.cc",
         "kernels/internal/reference/portable_tensor_utils.cc",
         "kernels/internal/optimized/neon_tensor_utils.cc",
+        "nnapi/nnapi_implementation.cc",
     ],
     include_dirs: [
         "external/eigen",
@@ -143,9 +153,11 @@ cc_library_static {
         "-Wno-missing-field-initializers",
         "-Wno-sign-compare",
         "-Wno-typedef-redefinition",
+        "-Wno-unused-function",
         "-Wno-unused-lambda-capture",
         "-Wno-unused-local-typedef",
         "-Wno-unused-parameter",
+        "-Wno-unused-private-field",
         "-Wno-unused-variable",
         "-Wno-invalid-partial-specialization",
         "-Wno-mismatched-tags",
diff --git a/tensorflow/lite/tflite_version_script.lds b/tensorflow/lite/tflite_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..1df70705ebf4a85d2f4c9f2301c53d48e623dff7
--- /dev/null
+++ b/tensorflow/lite/tflite_version_script.lds
@@ -0,0 +1,8 @@
+VERS_1.0 {
+  global:
+    *TfLite*;
+    *tflite*;
+    *TFL_*;
+  local:
+    *;
+};
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 40bceedd6a1e8398d25a4c58a3ee69228ae8d868..8481b0b754c370934860212b4d9d2e45899bb6cd 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -133,6 +133,7 @@ cc_library(
 cc_library(
     name = "model_cmdline_flags",
     srcs = [
+        "args.cc",
         "model_cmdline_flags.cc",
     ],
     hdrs = [
@@ -192,6 +193,7 @@ cc_library(
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
         "graph_transformations/fuse_broadcast_into_following_binary.cc",
         "graph_transformations/graph_transformations.cc",
+        "graph_transformations/group_bidirectional_sequence_ops.cc",
         "graph_transformations/hardcode_min_max.cc",
         "graph_transformations/identify_dilated_conv.cc",
         "graph_transformations/identify_l2_normalization.cc",
@@ -308,7 +310,7 @@ cc_library(
         "toco_tooling.h",
     ],
     copts = tf_copts() + select({
-        "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
+        "//tensorflow:macos": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
         "//conditions:default": [],
     }),
     visibility = ["//visibility:public"],
@@ -333,7 +335,7 @@ cc_library(
         "//tensorflow/lite/toco/tflite:export",
         "//tensorflow/lite/toco/tflite:import",
     ] + select({
-        # Placeholder for internal darwin rule.
+        # Placeholder for internal macOS rule.
         "//conditions:default": [],
     }),
 )
@@ -376,6 +378,7 @@ cc_library(
         ":types_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/lite/kernels/internal:types",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@protobuf_archive//:protobuf_headers",
@@ -477,3 +480,16 @@ tf_cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
+
+tf_cc_test(
+    name = "model_cmdline_flags_test",
+    srcs = [
+        "model_cmdline_flags_test.cc",
+    ],
+    deps = [
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/toco/args.cc b/tensorflow/lite/toco/args.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da8debc49a697fb77832c93940b60c0bebe1a7f9
--- /dev/null
+++ b/tensorflow/lite/toco/args.cc
@@ -0,0 +1,169 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/toco/args.h"
+#include "absl/strings/str_split.h"
+
+namespace toco {
+namespace {
+
+// Helper class for SplitStructuredLine parsing.
+class ClosingSymbolLookup {
+ public:
+  explicit ClosingSymbolLookup(const char* symbol_pairs)
+      : closing_(), valid_closing_() {
+    // Initialize the opening/closing arrays.
+    for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) {
+      unsigned char opening = *symbol;
+      ++symbol;
+      // If the string ends before the closing character has been found,
+      // use the opening character as the closing character.
+      unsigned char closing = *symbol != 0 ? *symbol : opening;
+      closing_[opening] = closing;
+      valid_closing_[closing] = true;
+      if (*symbol == 0) break;
+    }
+  }
+
+  ClosingSymbolLookup(const ClosingSymbolLookup&) = delete;
+  ClosingSymbolLookup& operator=(const ClosingSymbolLookup&) = delete;
+
+  // Returns the closing character corresponding to an opening one,
+  // or 0 if the argument is not an opening character.
+  char GetClosingChar(char opening) const {
+    return closing_[static_cast<unsigned char>(opening)];
+  }
+
+  // Returns true if the argument is a closing character.
+  bool IsClosing(char c) const {
+    return valid_closing_[static_cast<unsigned char>(c)];
+  }
+
+ private:
+  // Maps an opening character to its closing. If the entry contains 0,
+  // the character is not in the opening set.
+  char closing_[256];
+  // Valid closing characters.
+  bool valid_closing_[256];
+};
+
+bool SplitStructuredLine(absl::string_view line, char delimiter,
+                         const char* symbol_pairs,
+                         std::vector<absl::string_view>* cols) {
+  ClosingSymbolLookup lookup(symbol_pairs);
+
+  // Stack of symbols expected to close the current opened expressions.
+  std::vector<char> expected_to_close;
+
+  ABSL_RAW_CHECK(cols != nullptr, "");
+  cols->push_back(line);
+  for (size_t i = 0; i < line.size(); ++i) {
+    char c = line[i];
+    if (expected_to_close.empty() && c == delimiter) {
+      // We don't have any open expression, this is a valid separator.
+      cols->back().remove_suffix(line.size() - i);
+      cols->push_back(line.substr(i + 1));
+    } else if (!expected_to_close.empty() && c == expected_to_close.back()) {
+      // Can we close the currently open expression?
+      expected_to_close.pop_back();
+    } else if (lookup.GetClosingChar(c)) {
+      // If this is an opening symbol, we open a new expression and push
+      // the expected closing symbol on the stack.
+      expected_to_close.push_back(lookup.GetClosingChar(c));
+    } else if (lookup.IsClosing(c)) {
+      // Error: mismatched closing symbol.
+      return false;
+    }
+  }
+  if (!expected_to_close.empty()) {
+    return false;  // Missing closing symbol(s)
+  }
+  return true;  // Success
+}
+
+inline bool TryStripPrefixString(absl::string_view str,
+                                 absl::string_view prefix, string* result) {
+  bool res = absl::ConsumePrefix(&str, prefix);
+  result->assign(str.begin(), str.end());
+  return res;
+}
+
+inline bool TryStripSuffixString(absl::string_view str,
+                                 absl::string_view suffix, string* result) {
+  bool res = absl::ConsumeSuffix(&str, suffix);
+  result->assign(str.begin(), str.end());
+  return res;
+}
+
+}  // namespace
+
+bool Arg<toco::IntList>::Parse(string text) {
+  parsed_value_.elements.clear();
+  specified_ = true;
+  // strings::Split("") produces {""}, but we need {} on empty input.
+  // TODO(aselle): Moved this from elsewhere, but ahentz recommends we could
+  // use absl::SplitLeadingDec32Values(text.c_str(), &parsed_values_.elements)
+  if (!text.empty()) {
+    int32 element;
+    for (absl::string_view part : absl::StrSplit(text, ',')) {
+      if (!SimpleAtoi(part, &element)) return false;
+      parsed_value_.elements.push_back(element);
+    }
+  }
+  return true;
+}
+
+bool Arg<toco::StringMapList>::Parse(string text) {
+  parsed_value_.elements.clear();
+  specified_ = true;
+
+  if (text.empty()) {
+    return true;
+  }
+
+  std::vector<absl::string_view> outer_vector;
+  absl::string_view text_disposable_copy = text;
+  // TODO(aselle): Change argument parsing when absl supports structuredline.
+  SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector);
+  for (const absl::string_view& outer_member_stringpiece : outer_vector) {
+    string outer_member(outer_member_stringpiece);
+    if (outer_member.empty()) {
+      continue;
+    }
+    string outer_member_copy = outer_member;
+    absl::StripAsciiWhitespace(&outer_member);
+    if (!TryStripPrefixString(outer_member, "{", &outer_member)) return false;
+    if (!TryStripSuffixString(outer_member, "}", &outer_member)) return false;
+    const std::vector<string> inner_fields_vector =
+        absl::StrSplit(outer_member, ',');
+
+    std::unordered_map<string, string> element;
+    for (const string& member_field : inner_fields_vector) {
+      std::vector<string> outer_member_key_value =
+          absl::StrSplit(member_field, ':');
+      if (outer_member_key_value.size() != 2) return false;
+      string& key = outer_member_key_value[0];
+      string& value = outer_member_key_value[1];
+      absl::StripAsciiWhitespace(&key);
+      absl::StripAsciiWhitespace(&value);
+      if (element.count(key) != 0) return false;
+      element[key] = value;
+    }
+    parsed_value_.elements.push_back(element);
+  }
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/args.h b/tensorflow/lite/toco/args.h
index 188f2f7e7af61c6c9e94da42d528d3fcff4b5e39..c6eeb2859a91643c3e87bdeb25c32a8ef5611c87 100644
--- a/tensorflow/lite/toco/args.h
+++ b/tensorflow/lite/toco/args.h
@@ -22,10 +22,6 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 #include "tensorflow/lite/toco/toco_port.h"
-#if defined(PLATFORM_GOOGLE)
-#include "strings/split.h"
-#include "strings/strip.h"
-#endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/lite/toco/toco_types.h"
@@ -64,7 +60,7 @@ class Arg final {
   const T& value() const { return value_; }
 
   // Parsing callback for the tensorflow::Flags code
-  bool parse(T value_in) {
+  bool Parse(T value_in) {
     value_ = value_in;
     specified_ = true;
     return true;
@@ -72,7 +68,7 @@ class Arg final {
 
   // Bind the parse member function so tensorflow::Flags can call it.
   std::function<bool(T)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
  private:
@@ -90,24 +86,10 @@ class Arg<toco::IntList> final {
   // Return true if the command line argument was specified on the command line.
   bool specified() const { return specified_; }
   // Bind the parse member function so tensorflow::Flags can call it.
-  bool parse(string text) {
-    parsed_value_.elements.clear();
-    specified_ = true;
-    // strings::Split("") produces {""}, but we need {} on empty input.
-    // TODO(aselle): Moved this from elsewhere, but ahentz recommends we could
-    // use absl::SplitLeadingDec32Values(text.c_str(), &parsed_values_.elements)
-    if (!text.empty()) {
-      int32 element;
-      for (absl::string_view part : absl::StrSplit(text, ',')) {
-        if (!SimpleAtoi(part, &element)) return false;
-        parsed_value_.elements.push_back(element);
-      }
-    }
-    return true;
-  }
+  bool Parse(string text);
 
   std::function<bool(string)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
   const toco::IntList& value() const { return parsed_value_; }
@@ -126,57 +108,10 @@ class Arg<toco::StringMapList> final {
   bool specified() const { return specified_; }
   // Bind the parse member function so tensorflow::Flags can call it.
 
-  bool parse(string text) {
-    parsed_value_.elements.clear();
-    specified_ = true;
-
-    if (text.empty()) {
-      return true;
-    }
-
-#if defined(PLATFORM_GOOGLE)
-    std::vector<absl::string_view> outer_vector;
-    absl::string_view text_disposable_copy = text;
-    SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector);
-    for (const absl::string_view& outer_member_stringpiece : outer_vector) {
-      string outer_member(outer_member_stringpiece);
-      if (outer_member.empty()) {
-        continue;
-      }
-      string outer_member_copy = outer_member;
-      absl::StripAsciiWhitespace(&outer_member);
-      if (!strings::TryStripPrefixString(outer_member, "{", &outer_member))
-        return false;
-      if (!strings::TryStripSuffixString(outer_member, "}", &outer_member))
-        return false;
-      const std::vector<string> inner_fields_vector =
-          absl::StrSplit(outer_member, ',');
-
-      std::unordered_map<string, string> element;
-      for (const string& member_field : inner_fields_vector) {
-        std::vector<string> outer_member_key_value =
-            absl::StrSplit(member_field, ':');
-        if (outer_member_key_value.size() != 2) return false;
-        string& key = outer_member_key_value[0];
-        string& value = outer_member_key_value[1];
-        absl::StripAsciiWhitespace(&key);
-        absl::StripAsciiWhitespace(&value);
-        if (element.count(key) != 0) return false;
-        element[key] = value;
-      }
-      parsed_value_.elements.push_back(element);
-    }
-    return true;
-#else
-    // TODO(aselle): Fix argument parsing when absl supports structuredline
-    fprintf(stderr, "%s:%d StringMapList arguments not supported\n", __FILE__,
-            __LINE__);
-    abort();
-#endif
-  }
+  bool Parse(string text);
 
   std::function<bool(string)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
   const toco::StringMapList& value() const { return parsed_value_; }
diff --git a/tensorflow/lite/toco/dump_graphviz.cc b/tensorflow/lite/toco/dump_graphviz.cc
index 8896893f3579abcefa87e3411f9b186ca7a45a1b..ad69e4f7b7a4285f36750c60291d7a6a97e7e9f7 100644
--- a/tensorflow/lite/toco/dump_graphviz.cc
+++ b/tensorflow/lite/toco/dump_graphviz.cc
@@ -15,17 +15,21 @@ limitations under the License.
 #include "tensorflow/lite/toco/dump_graphviz.h"
 
 #include <cmath>
+#include <functional>
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
+#include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
+#include "re2/re2.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 using toco::port::AppendF;
 using toco::port::StringF;
@@ -33,72 +37,158 @@ using toco::port::StringF;
 namespace toco {
 namespace {
 
+// 'nslimit' is a graphviz (dot) paramater that limits the iterations during
+// the layout phase. Omitting it allows infinite iterations, causing some
+// complex graphs to never finish. A value of 125 produces good graphs
+// while allowing complex graphs to finish.
+constexpr char kGraphFmt[] = R"CODE(digraph Computegraph { tooltip = "/"
+    nslimit=125 margin=36 ranksep = 2 labelloc="t" label=%s
+)CODE";
+// Note: tooltip's are only supported on SVGs in Chrome.
+constexpr char kSubgraphFmt[] =
+    R"CODE(    subgraph "cluster_%s" { style=rounded bgcolor="%s" penwidth=0.0 label=%s
+)CODE";
+constexpr char kArrayNodeFmt[] =
+    R"CODE(        "%s" [label=%s tooltip="%s" shape=%s style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kOpNodeFmt[] =
+    R"CODE(        %s [label=%s tooltip=" " shape=box margin=0 style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kInputEdgeFmt[] =
+    R"CODE(        "%s"%s -> %s:i%d:n [penwidth=%f weight=%f];
+)CODE";
+constexpr char kOutputEdgeFmt[] =
+    R"CODE(        %s:o%d:s -> "%s"%s [penwidth=%f weight=%f];
+)CODE";
+constexpr char kRNNBackEdgeFmt[] =
+    R"CODE(        "%s":s -> "%s":n [color="#0F9D58" constraint=false];
+)CODE";
+constexpr char kUnicodeMult[] = "\u00D7";
+constexpr char kUnicodeEllipsis[] = " \u2026 ";
+
 class Color {
  public:
   Color() {}
   Color(uint8 r, uint8 g, uint8 b) : r_(r), g_(g), b_(b) {}
+  explicit Color(uint32 word)
+      : r_((word & 0x00FF0000) >> 16),
+        g_((word & 0x0000FF00) >> 8),
+        b_((word & 0x000000FF) >> 0) {}
+
   // Returns the string serialization of this color in graphviz format,
   // for use as 'fillcolor' in boxes.
-  string FillColorString() const { return StringF("%.2X%.2X%.2X", r_, g_, b_); }
+  string AsHexString() const { return StringF("#%.2X%.2X%.2X", r_, g_, b_); }
+  // The color to use for this node; will be used as 'fillcolor'
+  // for its box. See Color::AsHexString. A suitable, different
+  // color will be chosen for the 'fontcolor' for the inside text
+  // label, see Color::TextColorString.
   // Returns the serialization in graphviz format of a suitable color to use
   // 'fontcolor' in the same boxes. It should black or white, whichever offers
-  // the better contrast from FillColorString().
+  // the better contrast from AsHexString().
   string TextColorString() const {
     // https://en.wikipedia.org/wiki/Relative_luminance
     const float luminance = 0.2126f * r_ + 0.7152f * g_ + 0.0722f * b_;
     const uint8 l = luminance > 128.f ? 0 : 255;
-    return StringF("%.2X%.2X%.2X", l, l, l);
+    return StringF("#%.2X%.2X%.2X", l, l, l);
   }
 
  private:
   uint8 r_ = 0, g_ = 0, b_ = 0;
 };
 
-struct NodeProperties {
-  // The text to display inside the box for this node.
-  string label;
-  // The color to use for this node; will be used as 'fillcolor'
-  // for its box. See Color::FillColorString. A suitable, different
-  // color will be chosen for the 'fontcolor' for the inside text
-  // label, see Color::TextColorString.
-  Color color;
-  float log2_buffer_size;
-};
-
-// All colors in this file are from:
-// https://material.io/guidelines/style/color.html
+Color HashStringToColor(string s) {
+  // Return a unique color for a name.
+  //
+  // This function removes Tensorflow anti-collision suffixes (eg "_2"), hashes
+  // the string to a uint_32, then twiddles some bits to get a light and subtle
+  // color. This seems to be a good heuristic for keeping enough of the name to
+  // hash to a unique color while still revealing structure through naming
+  // similarities.
+  //
+  // The regular expression "_\d+" matches any underscore followed by numbers,
+  // which we strip out. Examples:
+  //
+  //     "Conv"      -> "Conv"
+  //     "Conv_2"    -> "Conv"
+  //     "Conv_72"   -> "Conv"
+  //     "Pad_1_bias -> "Pad_bias"
+  //     "Conv_abc"  -> "Conv_abc"
+
+  RE2::GlobalReplace(&s, R"CODE(_\d+)CODE", "");
+  uint32 color_word = std::hash<std::string>{}(s);
+  color_word |= 0x00E0E0E0;
+  return Color(color_word);
+}
 
-Color GetColorForArray(const Model& model, const string& array_name) {
+void GetArrayColorAndShape(const Model& model, const string& array_name,
+                           Color* color, string* shape) {
+  // All colors in this file are from:
+  // https://material.io/guidelines/style/color.html
   // Arrays involved in RNN back-edges have a different color
   for (const auto& rnn_state : model.flags.rnn_states()) {
     // RNN state, fed by a back-edge. Bold color.
     if (array_name == rnn_state.state_array()) {
-      return Color(0x0F, 0x9D, 0x58);
+      *color = Color(0x0F, 0x9D, 0x58);
+      *shape = "invhouse";
+      return;
     }
     // RNN back-edge source, feeding a RNN state.
     // Light tone of the same color as RNN states.
     if (array_name == rnn_state.back_edge_source_array()) {
-      return Color(0xB7, 0xE1, 0xCD);
+      *color = Color(0xB7, 0xE1, 0xCD);
+      *shape = "house";
+      return;
     }
   }
   // Constant parameter arrays have their own bold color
   if (model.GetArray(array_name).buffer) {
-    return Color(0x42, 0x85, 0xF4);
+    *color = Color(0x42, 0x85, 0xF4);
+    *shape = "cylinder";
+    return;
   }
   // Remaining arrays are activations.
   // We use gray colors for them because they are the majority
   // of arrays so we want to highlight other arrays instead of them.
   // First, we use a bolder gray for input/output arrays:
   if (IsInputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "invhouse";
+    return;
   }
   if (IsOutputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "house";
+    return;
   }
   // Remaining arrays are intermediate activation arrays.
   // Lighter tone of the same grey as for input/output arrays:
   // We want these to be very discrete.
-  return Color(0xF5, 0xF5, 0xF5);
+  *color = Color(0xF5, 0xF5, 0xF5);
+  *shape = "box";
+}
+
+string GetArrayCompassPt(const Model& model, const string& array_name) {
+  // The "compass point" is the point on the node where edge connections are
+  // made. For most arrays we don't care, but input's and outputs look better
+  // connected at the tip of the "house" and "invhouse" shapes used. So we
+  // append ":n" and ":s" respectively for those.
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    // RNN state is essentially an input
+    if (array_name == rnn_state.state_array()) {
+      return ":s";
+    }
+    // RNN back-edge source is essentially an output
+    if (array_name == rnn_state.back_edge_source_array()) {
+      return ":n";
+    }
+  }
+  if (IsInputArray(model, array_name)) {
+    return ":s";
+  }
+  if (IsOutputArray(model, array_name)) {
+    return ":n";
+  }
+  return "";
 }
 
 void AppendArrayVal(string* string, Array const& array, int index) {
@@ -141,239 +231,550 @@ void AppendArrayVal(string* string, Array const& array, int index) {
   }
 }
 
-NodeProperties GetPropertiesForArray(const Model& model,
-                                     const string& array_name) {
-  NodeProperties node_properties;
-  node_properties.color = GetColorForArray(model, array_name);
-  node_properties.label = absl::StrReplaceAll(array_name, {{"/", "/\\n"}});
-  node_properties.log2_buffer_size = 0.0f;
+typedef std::map<string, string> Attributes;
+
+string AttributesToHtml(Attributes attributes) {
+  string html;
+  for (const auto& attr : attributes) {
+    html += R"CODE(<TR><TD CELLPADDING="1" ALIGN="RIGHT">)CODE";
+    html += attr.first;
+    html += R"CODE(:</TD><TD CELLPADDING="1" ALIGN="LEFT">)CODE";
+    html += attr.second;
+    html += "</TD></TR>";
+  }
+  return html;
+}
+
+string GetArrayLabel(const Model& model, const string& array_id) {
+  string html;
 
-  // Append array shape to the label.
-  auto& array = model.GetArray(array_name);
-  AppendF(&node_properties.label, "\\nType: %s",
-          ArrayDataTypeName(array.data_type));
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  html += "<";
 
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html += R"CODE(<TABLE BORDER="0" CELLSPACING="2" CELLPADDING="0">)CODE";
+
+  auto& array = model.GetArray(array_id);
+  if (array.buffer) {
+    // "cylinder" shapes require some extra head room.
+    html += R"CODE(<TR><TD COLSPAN="2"> </TD></TR>)CODE";
+  }
+
+  // "Primary" name of array (last non-slash delimited group of characters).
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><I>)CODE";
+  AppendF(&html, R"CODE(%s)CODE",
+          std::vector<string>(absl::StrSplit(array_id, '/')).back());
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Array data type and dimensions
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="14" FACE="Courier"><B>)CODE";
+  // Type
+  html += ArrayDataTypeName(array.data_type);
+  // Shape
   if (array.has_shape()) {
     auto& array_shape = array.shape();
-    node_properties.label += "\\n[";
-    for (int id = 0; id < array_shape.dimensions_count(); id++) {
-      if (id == 0) {
-        AppendF(&node_properties.label, "%d", array_shape.dims(id));
-      } else {
-        // 0x00D7 is the unicode multiplication symbol
-        AppendF(&node_properties.label, "\u00D7%d", array_shape.dims(id));
+    html += "[";
+    for (int dim = 0; dim < array_shape.dimensions_count(); dim++) {
+      AppendF(&html, "%d", array_shape.dims(dim));
+      if (dim + 1 < array_shape.dimensions_count()) {
+        html += kUnicodeMult;
       }
     }
-    node_properties.label += "]";
+    html += "]";
+  }
 
-    int buffer_size = 0;
-    if (IsNonEmpty(array.shape())) {
-      buffer_size = RequiredBufferSizeForShape(array.shape());
-      node_properties.log2_buffer_size =
-          std::log2(static_cast<float>(buffer_size));
+  // Small buffer sample
+  int buffer_size = 0;
+  if (array.buffer) {
+    buffer_size = RequiredBufferSizeForShape(array.shape());
+  }
+  if ((buffer_size > 0) && (buffer_size <= 4)) {
+    html += " = ";
+    if (array.shape().dimensions_count() > 0) {
+      html += "{";
     }
-
-    if (array.buffer) {
-      const auto& array = model.GetArray(array_name);
-      if (buffer_size <= 4) {
-        AppendF(&node_properties.label, " = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        for (int i = 0; i < buffer_size; i++) {
-          AppendArrayVal(&node_properties.label, array, i);
-          if (i + 1 < buffer_size) {
-            AppendF(&node_properties.label, ", ");
-          }
-        }
-      } else {
-        AppendF(&node_properties.label, "\\n = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        AppendArrayVal(&node_properties.label, array, 0);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, 1);
-        // 0x2026 is the unicode ellipsis symbol
-        AppendF(&node_properties.label, " \u2026 ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 2);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 1);
-      }
-      if (array.shape().dimensions_count() > 0) {
-        AppendF(&node_properties.label, "}");
+    for (int i = 0; i < buffer_size; i++) {
+      AppendArrayVal(&html, array, i);
+      if (i + 1 < buffer_size) {
+        html += ", ";
       }
     }
+    if (array.shape().dimensions_count() > 0) {
+      html += "}";
+    }
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Large buffer samples get their own line
+  if (buffer_size > 4) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER"> = {)CODE";
+    AppendArrayVal(&html, array, 0);
+    html += ", ";
+    AppendArrayVal(&html, array, 1);
+    html += kUnicodeEllipsis;
+    AppendArrayVal(&html, array, buffer_size - 2);
+    html += ", ";
+    AppendArrayVal(&html, array, buffer_size - 1);
+    html += "}</TD></TR>";
   }
 
+  // Other array properties
+  Attributes attrs;
   if (array.minmax) {
-    AppendF(&node_properties.label, "\\nMinMax: [%.7g, %.7g]",
-            array.minmax->min, array.minmax->max);
+    attrs["minmax"] =
+        StringF("[%.7g, %.7g]", array.minmax->min, array.minmax->max);
   }
-
   if (array.quantization_params) {
-    AppendF(&node_properties.label, "\\nQuantization: %7g * (x - %d)",
-            array.quantization_params->scale,
-            array.quantization_params->zero_point);
+    attrs["quant"] = StringF("%7g\u00B7(x-%d)",  // Unicode "cdot"
+                             array.quantization_params->scale,
+                             array.quantization_params->zero_point);
   }
-
   if (array.alloc) {
-    AppendF(&node_properties.label, "\\nTransient Alloc: [%d, %d)",
-            array.alloc->start, array.alloc->end);
+    attrs["alloc"] = StringF("[%d, %d)", array.alloc->start, array.alloc->end);
   }
-
-  return node_properties;
+  html += AttributesToHtml(attrs);
+
+  // output array_id in ultra-small font so it can be searched and copied.
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="3" FACE="">)CODE";
+  AppendF(&html, R"CODE("%s")CODE", array_id);
+  html += R"CODE(</FONT>)CODE";
+  html += "</TD></TR>";
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+  return html;
 }
 
-NodeProperties GetPropertiesForOperator(const Operator& op) {
-  NodeProperties node_properties;
-  if (op.type == OperatorType::kUnsupported) {
-    node_properties.label =
-        static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
-  } else {
-    node_properties.label =
-        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
-  }
+Attributes GetOpAttributes(const Model& model, const Operator& op) {
+  Attributes attrs;
   switch (op.fused_activation_function) {
     case FusedActivationFunctionType::kRelu:
-      AppendF(&node_properties.label, "\\nReLU");
+      attrs["func"] = "ReLU";
       break;
     case FusedActivationFunctionType::kRelu6:
-      AppendF(&node_properties.label, "\\nReLU6");
+      attrs["func"] = "ReLU6";
       break;
     case FusedActivationFunctionType::kRelu1:
-      AppendF(&node_properties.label, "\\nReLU1");
+      attrs["func"] = "ReLU1";
       break;
     default:
       break;
   }
-  // Additional information for some of the operators.
+  // Output state of member vars on derived operators.
   switch (op.type) {
     case OperatorType::kConv: {
       const auto& conv_op = static_cast<const ConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
+      string stride;
+      AppendF(&stride, "%d", conv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", conv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (conv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kDepthwiseConv: {
-      const auto& conv_op = static_cast<const DepthwiseConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
-      break;
-    }
-    case OperatorType::kFullyConnected: {
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      const auto& depthconv_op = static_cast<const ConvOperator&>(op);
+      string stride;
+      AppendF(&stride, "%d", depthconv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", depthconv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (depthconv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kFakeQuant: {
       const auto& fakequant_op = static_cast<const FakeQuantOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      attrs["bits"] = StringF("%d", fakequant_op.num_bits);
       if (fakequant_op.minmax) {
-        AppendF(&node_properties.label, "\\n%dbit [%g,%g]",
-                fakequant_op.num_bits, fakequant_op.minmax->min,
-                fakequant_op.minmax->max);
+        attrs["range"] = StringF("[%g,%g]", fakequant_op.minmax->min,
+                                 fakequant_op.minmax->max);
       } else {
-        AppendF(&node_properties.label, "\\n%dbit [?,?]",
-                fakequant_op.num_bits);
+        attrs["range"] = "[?,?]";
       }
       break;
     }
     default:
-      node_properties.color = Color(0xDB, 0x44, 0x37);
       break;
   }
+  int64 math_ops_count;
+  if (EstimateArithmeticOpsCount(model, op, &math_ops_count) &&
+      (math_ops_count != 0)) {
+    attrs["math"] = FormattedNumber(math_ops_count) + "ops";
+  }
 
-  return node_properties;
+  return attrs;
 }
 
-}  // namespace
+Color GetOpColor(const Operator& op) {
+  if ((op.type == OperatorType::kDepthwiseConv) ||
+      (op.type == OperatorType::kConv) ||
+      (op.type == OperatorType::kFullyConnected) ||
+      (op.type == OperatorType::kFakeQuant)) {
+    // Give some ops a bolder red
+    return Color(0xC5, 0x39, 0x29);
+  } else {
+    return Color(0xDB, 0x44, 0x37);
+  }
+}
+
+string GetOpLabel(const Model& model, const Operator& op) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Input Ports
+  if (!op.inputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.inputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "i%d", i);
+      html += R"CODE(">)CODE";
+      if (op.inputs.size() > 1) {
+        // Only number inputs when op has two or more inputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
+  }
 
-void DumpGraphviz(const Model& model, string* output_file_contents) {
-  AppendF(output_file_contents, "digraph Computegraph {\n");
-  // 'nslimit' is a graphviz (dot) paramater that limits the iterations during
-  // the layout phase. Omitting it allows infinite iterations, causing some
-  // complex graphs to never finish. A value of 125 produces good graphs
-  // while allowing complex graphs to finish.
-  AppendF(output_file_contents, "\t nslimit=125;\n");
-
-  constexpr char kNodeFormat[] =
-      "\t \"%s\" [label=\"%s\", shape=%s, style=filled, fillcolor=\"#%s\", "
-      "fontcolor = \"#%sDD\"];\n";
-
-  constexpr char kEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [penwidth=%f, weight=%f];\n";
-
-  constexpr char kRNNBackEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [color=\"#0F9D58\"];\n";
-
-  for (const auto& array_kv : model.GetArrayMap()) {
-    // Add node for array.
-    const string& array_name = array_kv.first;
-    const auto& array_properties = GetPropertiesForArray(model, array_name);
-    AppendF(output_file_contents, kNodeFormat, array_name,
-            array_properties.label, "octagon",
-            array_properties.color.FillColorString().c_str(),
-            array_properties.color.TextColorString().c_str());
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><B>)CODE";
+  if (op.type == OperatorType::kUnsupported) {
+    html += static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
+  } else {
+    html += string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs = GetOpAttributes(model, op);
+  html += AttributesToHtml(attrs);
+
+  // Output Ports
+  if (!op.outputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.outputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "o%d", i);
+      html += R"CODE(">)CODE";
+      if (op.outputs.size() > 1) {
+        // Only number outputs when op has two or more outputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
   }
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+float GetLog2BufferSize(const Model& model, const string& array_id) {
+  auto& array = model.GetArray(array_id);
+  if (array.has_shape()) {
+    int buffer_size = 0;
+    if (IsNonEmpty(array.shape())) {
+      buffer_size = RequiredBufferSizeForShape(array.shape());
+      return std::log2(static_cast<float>(buffer_size));
+    }
+  }
+  return 0.0f;
+}
+
+string GetOpId(int op_index) { return StringF("op%05d", op_index); }
+
+void DumpOperator(const Model& model, string* output_file, int op_index) {
+  // Dump node for operator.
+  const Operator& op = *model.operators[op_index];
+  Color color = GetOpColor(op);
+  string label = GetOpLabel(model, op);
+  string op_id = GetOpId(op_index);
+  AppendF(output_file, kOpNodeFmt, op_id, label, color.AsHexString(),
+          color.TextColorString());
+}
+
+void DumpOperatorEdges(const Model& model, string* output_file, int op_index) {
+  // Inputs
+  const Operator& op = *model.operators[op_index];
+  string op_id = GetOpId(op_index);
+  for (int i = 0; i < op.inputs.size(); i++) {
+    const auto& input = op.inputs[i];
+    if (!model.HasArray(input)) {
+      // Connected arrays should _always_ exist. Except, perhaps, during
+      // development.
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, input);
+    // Draw lines that transport more data thicker (Otherwise, where would the
+    // data fit? right?).
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    // Keep edges that transport more data shorter than those with less.
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsInputArray(model, input) &&
+        GetOpWithOutput(model, input) == nullptr) {
+      // Give the main line of data flow a straighter path by penalizing edges
+      // to standalone buffers. Weights are generally very large buffers that
+      // would otherwise skew the layout.
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, input);
+    AppendF(output_file, kInputEdgeFmt, input, compass_pt, op_id, i, line_width,
+            weight);
+  }
+  // Outputs
+  for (int i = 0; i < op.outputs.size(); i++) {
+    const auto& output = op.outputs[i];
+    if (!model.HasArray(output)) {
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, output);
+    // See comments above regarding weight and line_width calculations.
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsArrayConsumed(model, output)) {
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, output);
+    AppendF(output_file, kOutputEdgeFmt, op_id, i, output, compass_pt,
+            line_width, weight);
+  }
+}
+
+struct Node {
+  Node() : math_ops(0) {}
+  // Name used as a key in the model's array map
+  string array_id;
+
+  // Estimated number of math ops incurred by this node (the sum of the op
+  // with this array as 1st output, plus all children nodes).
+  int64 math_ops;
+
+  // A map of child nodes keyed by name.
+  std::map<const string, std::unique_ptr<Node>> children;
+};
+
+string GetSubgraphLabel(Node const& node, const string& subgraph) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="12" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="18" FACE="Helvetica"><I>)CODE";
+  html += subgraph;
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  if (node.math_ops > 0) {
+    attrs["math"] = FormattedNumber(node.math_ops) + "ops";
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+void DumpSubgraphHeader(string* output_file, Node const& node,
+                        const string& node_name) {
+  Color color = HashStringToColor(node_name);
+  string label = GetSubgraphLabel(node, node_name);
+  AppendF(output_file, kSubgraphFmt, node_name, color.AsHexString(), label);
+}
+
+void DumpArray(const Model& model, string* output_file,
+               const string& array_id) {
+  Color color;
+  string shape;
+  GetArrayColorAndShape(model, array_id, &color, &shape);
+  string label = GetArrayLabel(model, array_id);
+  AppendF(output_file, kArrayNodeFmt, array_id, label, array_id, shape,
+          color.AsHexString(), color.TextColorString());
+
+  // Ops are placed in the same subgraph as their first output.
   for (int op_index = 0; op_index < model.operators.size(); op_index++) {
     const Operator& op = *model.operators[op_index];
-    // Add node for operator.
-    auto op_properties = GetPropertiesForOperator(op);
-    string operator_id = StringF("op%05d", op_index);
-    AppendF(output_file_contents, kNodeFormat, operator_id, op_properties.label,
-            "box", op_properties.color.FillColorString().c_str(),
-            op_properties.color.TextColorString().c_str());
-    // Add edges for all inputs of the operator.
-    for (const auto& input : op.inputs) {
-      if (!model.HasArray(input)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, input);
-      // Draw lines that transport more data thicker (Otherwise, where would the
-      // data fit? right?).
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      // Keep edges that transport more data shorter than those with less.
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsInputArray(model, input) &&
-          GetOpWithOutput(model, input) == nullptr) {
-        // Give the main line of data flow a straighter path by penalizing edges
-        // to standalone buffers. Weights are generally very large buffers that
-        // otherwise skew the layout without this.
-        weight = 1.0f;
-      }
-      AppendF(output_file_contents, kEdgeFormat, input, operator_id, line_width,
-              weight);
+    if (!op.outputs.empty() && (op.outputs[0] == array_id)) {
+      DumpOperator(model, output_file, op_index);
     }
-    // Add edges for all outputs of the operator.
-    for (const auto& output : op.outputs) {
-      if (!model.HasArray(output)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, output);
-      // See comments above regarding weight and line_width calculations.
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsArrayConsumed(model, output)) {
-        weight = 1.0f;
+  }
+}
+
+void DumpNode(const Model& model, string* output_file, const string& node_name,
+              Node const& node) {
+  bool not_root = !node_name.empty();
+  if (not_root) {
+    DumpSubgraphHeader(output_file, node, node_name);
+  }
+
+  for (const auto& child : node.children) {
+    if (!child.second->array_id.empty()) {
+      // Dump array if this node posesses one.
+      DumpArray(model, output_file, child.second->array_id);
+    }
+    // Note that it is always possible to have children. Unlike a filesystem,
+    // the existence of array "foo/bar" does _not_ prevent other arrays, such as
+    // and "foo/bar/baz", from being nested beneath it.
+    DumpNode(model, output_file, child.first, *child.second);
+  }
+
+  if (not_root) {
+    // End subgraph
+    AppendF(output_file, "    }\n");
+  }
+}
+
+int64 GetArithmeticOpsCount(const Model& model, const string& array_id) {
+  for (const auto& op : model.operators) {
+    if (!op->outputs.empty() && op->outputs[0] == array_id) {
+      int64 count;
+      if (EstimateArithmeticOpsCount(model, *op, &count)) {
+        return count;
+      } else {
+        return 0;
       }
-      AppendF(output_file_contents, kEdgeFormat, operator_id, output,
-              line_width, weight);
     }
   }
+  return 0;
+}
 
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    AppendF(output_file_contents, kRNNBackEdgeFormat,
-            rnn_state.back_edge_source_array(), rnn_state.state_array());
+void InsertNode(const Model& model, const string& array_id, Node* node,
+                std::vector<string> prefixes, int64* math_ops) {
+  if (prefixes.empty()) {
+    // Base case: store array in this node.
+    node->array_id = array_id;
+    *math_ops = GetArithmeticOpsCount(model, array_id);
+  } else {
+    // Insert into the sub-tree for that prefix.
+    string prefix = prefixes.back();
+    prefixes.pop_back();
+    if (node->children.count(prefix) == 0) {
+      // Create a new node if this prefix is unseen.
+      node->children[prefix] = absl::make_unique<Node>();
+    }
+    InsertNode(model, array_id, node->children[prefix].get(), prefixes,
+               math_ops);
   }
+  // Sum estimated math ops into all nodes.
+  node->math_ops += *math_ops;
+}
 
-  AppendF(output_file_contents, "}\n");
+void BuildArrayTree(const Model& model, Node* tree) {
+  // Delimit array names by path "/", then place into a tree based on this path.
+  for (const auto& array_id : model.GetArrayMap()) {
+    std::vector<string> prefixes = absl::StrSplit(array_id.first, '/');
+    std::reverse(prefixes.begin(), prefixes.end());
+    int64 math_ops;  // Temporary storage for math ops used during recursion.
+    InsertNode(model, array_id.first, tree, prefixes, &math_ops);
+  }
+}
+
+string GetGraphLabel(const Model& model, const string& graph_name) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="36" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="64" FACE="Helvetica"><B><I>)CODE";
+  html += graph_name;
+  html += R"CODE(</I></B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  attrs["arrays"] = StringF("%d", model.GetArrayMap().size());
+  if (!model.optional_arrays.empty()) {
+    attrs["optional arrays"] = StringF("%d", model.optional_arrays.size());
+  }
+  attrs["operators"] = StringF("%d", model.operators.size());
+  int64 ops_count;
+  if (EstimateArithmeticOpsCount(model, &ops_count) && (ops_count > 0)) {
+    attrs["math"] = FormattedNumber(ops_count) + "ops";
+  }
+  if (model.transient_data_size > 0) {
+    attrs["transient data size"] =
+        StringF("%d KiB", model.transient_data_size / 1024);
+  }
+  if (model.transient_data_alignment > 0) {
+    attrs["transient data alignment"] =
+        StringF("%d bytes", model.transient_data_alignment);
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+}  // namespace
+
+void DumpGraphviz(const Model& model, string* output_file,
+                  const string& graph_name) {
+  // Start graphviz format
+  AppendF(output_file, kGraphFmt, GetGraphLabel(model, graph_name));
+
+  // Organize arrays into a tree for subgraphing
+  Node tree;
+  BuildArrayTree(model, &tree);
+  DumpNode(model, output_file, "", tree);
+
+  // Dump edges outside all subgraphs (otherwise the referred-to nodes are
+  // implicitly included in that subgraph).
+  for (int op_index = 0; op_index < model.operators.size(); op_index++) {
+    DumpOperatorEdges(model, output_file, op_index);
+  }
+
+  // Dump RNN Backedges
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    AppendF(output_file, kRNNBackEdgeFmt, rnn_state.back_edge_source_array(),
+            rnn_state.state_array());
+  }
+  // End graphviz format
+  AppendF(output_file, "}\n");
 }
 }  // namespace toco
diff --git a/tensorflow/lite/toco/dump_graphviz.h b/tensorflow/lite/toco/dump_graphviz.h
index 9697bd6f0dc434aaf98762698c64fb60cb97f2ee..9bb74dac3f8fb34fb2a440e499c4ed0066ffea4d 100644
--- a/tensorflow/lite/toco/dump_graphviz.h
+++ b/tensorflow/lite/toco/dump_graphviz.h
@@ -21,7 +21,8 @@ limitations under the License.
 
 namespace toco {
 
-void DumpGraphviz(const Model& model, string* output_file_contents);
+void DumpGraphviz(const Model& model, string* output_file_contents,
+                  const string& graph_name);
 
 }  // namespace toco
 
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 9fff0015527ebadf501f571bdd5ed0a7643d66e0..d426a690678fabf2ca344d2fc80cef88b08f196a 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -22,11 +22,6 @@ limitations under the License.
 #include "google/protobuf/text_format.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/lite/toco/model.h"
-#include "tensorflow/lite/toco/model_flags.pb.h"
-#include "tensorflow/lite/toco/runtime/types.h"
-#include "tensorflow/lite/toco/tensorflow_util.h"
-#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -34,6 +29,11 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tensorflow_util.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 using tensorflow::DT_BOOL;
 using tensorflow::DT_COMPLEX64;
@@ -1205,6 +1205,16 @@ void ConvertFloorOperator(const Model& model, const FloorOperator& src_op,
   (*floor_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertCeilOperator(const Model& model, const CeilOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* ceil_op = tensorflow_graph->add_node();
+  ceil_op->set_op("Ceil");
+  ceil_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *ceil_op->add_input() = src_op.inputs[0];
+  (*ceil_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
                            GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* gather_op = tensorflow_graph->add_node();
@@ -1295,7 +1305,8 @@ void ConvertTensorFlowShapeOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.outputs[0]));
 }
 
-void ConvertRankOperator(const Model& model, const RankOperator& src_op,
+void ConvertRankOperator(const Model& model,
+                         const TensorFlowRankOperator& src_op,
                          GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* rank_op = tensorflow_graph->add_node();
   rank_op->set_op("Rank");
@@ -2052,6 +2063,33 @@ void ConvertZerosLikeOperator(const Model& model,
   (*zeros_like_op->mutable_attr())["T"].set_type(data_type);
 }
 
+void ConvertReverseV2Operator(const Model& model,
+                              const ReverseV2Operator& src_op,
+                              const char* op_name, GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* reverse_v2_op = tensorflow_graph->add_node();
+  reverse_v2_op->set_op(op_name);
+  reverse_v2_op->set_name(src_op.outputs[0]);
+  DCHECK_EQ(src_op.inputs.size(), 2);
+  *reverse_v2_op->add_input() = src_op.inputs[0];
+  *reverse_v2_op->add_input() = src_op.inputs[1];
+  const tensorflow::DataType data_type =
+      GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*reverse_v2_op->mutable_attr())["T"].set_type(data_type);
+}
+
+void ConvertReverseSequenceOperator(const Model& model,
+                                    const ReverseSequenceOperator& src_op,
+                                    GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* reverse_seq_op = tensorflow_graph->add_node();
+  reverse_seq_op->set_op("ReverseSequence");
+  reverse_seq_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *reverse_seq_op->add_input() = src_op.inputs[0];
+  *reverse_seq_op->add_input() = src_op.inputs[1];
+  (*reverse_seq_op->mutable_attr())["seq_dim"].set_i(src_op.seq_dim);
+  (*reverse_seq_op->mutable_attr())["batch_dim"].set_i(src_op.batch_dim);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -2169,6 +2207,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kFloor) {
     ConvertFloorOperator(model, static_cast<const FloorOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kCeil) {
+    ConvertCeilOperator(model, static_cast<const CeilOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kGather) {
     ConvertGatherOperator(model, static_cast<const GatherOperator&>(src_op),
                           tensorflow_graph);
@@ -2247,7 +2288,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
         model, static_cast<const TensorFlowShapeOperator&>(src_op),
         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRank) {
-    ConvertRankOperator(model, static_cast<const RankOperator&>(src_op),
+    ConvertRankOperator(model,
+                        static_cast<const TensorFlowRankOperator&>(src_op),
                         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRange) {
     ConvertRangeOperator(model, static_cast<const RangeOperator&>(src_op),
@@ -2328,6 +2370,14 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertZerosLikeOperator(
         model, static_cast<const TensorFlowZerosLikeOperator&>(src_op),
         "ZerosLike", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kReverseV2) {
+    ConvertReverseV2Operator(model,
+                             static_cast<const ReverseV2Operator&>(src_op),
+                             "Reverse_V2", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kReverseSequence) {
+    ConvertReverseSequenceOperator(
+        model, static_cast<const ReverseSequenceOperator&>(src_op),
+        tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
index a0260e24013bfda8718e0dc04052abb49b65debf..e4eb7698597f588947bc19f5ab449c9d3ff14adc 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
@@ -128,7 +128,8 @@ void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
 }
 
 bool GraphTransformationsPass(int increment, Model* model,
-                              const GraphTransformationsSet& transformations) {
+                              const GraphTransformationsSet& transformations,
+                              tensorflow::Status* status) {
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
   if (model->operators.empty()) {
@@ -142,7 +143,10 @@ bool GraphTransformationsPass(int increment, Model* model,
     for (const auto& transformation : transformations) {
       CHECK(!changed_now);
       CHECK(transformation->Messages().empty());
-      CHECK(transformation->Run(model, op_index, &changed_now).ok());
+      *status = transformation->Run(model, op_index, &changed_now);
+      if (!status->ok()) {
+        return false;
+      }
       const char* made_a_change_msg =
           changed_now ? "made a change" : "did NOT make a change";
       const int log_level =
@@ -186,18 +190,21 @@ bool GraphTransformationsPass(int increment, Model* model,
 
 }  // namespace
 
-void RunGraphTransformations(Model* model, const string& msg,
-                             const GraphTransformationsSet& transformations) {
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
   PrintModelStats(toco::port::StringF("Before %s", msg), *model);
   int pass_index = 0;
+  tensorflow::Status status;
   while (GraphTransformationsPass((pass_index % 2) ? -1 : 1, model,
-                                  transformations)) {
+                                  transformations, &status)) {
     pass_index++;
     const auto& label =
         toco::port::StringF("After %s pass %d", msg, pass_index);
     PrintModelStats(label, *model);
     CheckInvariants(*model);
   }
+  return status;
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 187b584b6989cc55894160fc5508c13474a1d2d3..d92733ba3b5490b0b77e88e3beb1bbe9d4508a3a 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -102,8 +102,16 @@ class GraphTransformationsSet {
 // construct GraphTransformation objects by using 'new', pass us
 // the resulting raw pointers, and this RunGraphTransformations
 // takes care of delete'ing these pointers.
-void RunGraphTransformations(Model* model, const string& message,
-                             const GraphTransformationsSet& transformations);
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations);
+
+inline void RunGraphTransformations(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
+  auto s = RunGraphTransformationsWithStatus(model, msg, transformations);
+  CHECK(s.ok()) << s.error_message();
+}
 
 #define DECLARE_GRAPH_TRANSFORMATION(GTName)                     \
   class GTName : public GraphTransformation {                    \
@@ -127,6 +135,10 @@ DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBroadcastIntoFollowingBinary)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceRnn)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceRnn)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03dbf3cb1db6949c865dced5d0b4b8a34f86c406
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
@@ -0,0 +1,654 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <iterator>
+#include <memory>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator& op) {
+  return std::find_if(
+      model->operators.begin(), model->operators.end(),
+      [&op](const std::unique_ptr<Operator>& ptr) { return ptr.get() == &op; });
+}
+
+bool MatchTwoUnpackOps(const Operator& op, const Model& model,
+                       Operator** fw_output, Operator** bw_output) {
+  if (op.inputs.size() != 2) {
+    return false;
+  }
+
+  *fw_output = GetOpWithOutput(model, op.inputs[0]);
+  *bw_output = GetOpWithOutput(model, op.inputs[1]);
+  if (*fw_output == nullptr || *bw_output == nullptr) {
+    return false;
+  }
+
+  if ((*fw_output)->type != OperatorType::kUnpack ||
+      (*bw_output)->type != OperatorType::kUnpack) {
+    return false;
+  }
+
+  // TODO(renjieliu): Check the shapes are matching.
+
+  return true;
+}
+
+bool MatchDynamicBidirectionalSequenceOutputs(Operator* op, const Model& model,
+                                              Operator** fw_output,
+                                              Operator** bw_output) {
+  if (op->inputs.size() != 2) {
+    return false;
+  }
+
+  // The concat op is already the fw_rnn_output.
+  *fw_output = op;
+  auto* reverse_output = GetOpWithOutput(model, op->inputs[1]);
+  if (*fw_output == nullptr || reverse_output == nullptr) {
+    return false;
+  }
+
+  if (reverse_output->type != OperatorType::kReverseV2 &&
+      reverse_output->type != OperatorType::kReverseSequence) {
+    return false;
+  }
+
+  *bw_output = reverse_output;
+
+  return true;
+}
+
+bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
+                                  OperatorType operator_type,
+                                  std::stack<Operator*>* sequence_ops,
+                                  Operator** input_op) {
+  Operator* op_it = nullptr;
+  op_it = GetOpWithOutput(model, output_op.inputs[0]);
+  if (op_it == nullptr) {
+    return false;
+  }
+
+  while (op_it->type == operator_type) {
+    sequence_ops->push(op_it);
+    // Check the first input of the unidirectional sequence op.
+    op_it = GetOpWithOutput(model, op_it->inputs[0]);
+    if (op_it == nullptr) {
+      return false;
+    }
+  }
+
+  *input_op = op_it;
+  return true;
+}
+
+bool CheckTwoUnidirectionalSequenceOpsAreValid(
+    const Model& model, std::stack<Operator*> fw_unidirectional_sequence_ops,
+    std::stack<Operator*> bw_unidirectional_sequence_ops,
+    const Operator* first_fw_sequence_op_input,
+    const Operator* first_bw_sequence_op_input, bool is_dynamic_rnn) {
+  if (fw_unidirectional_sequence_ops.size() !=
+          bw_unidirectional_sequence_ops.size() ||
+      fw_unidirectional_sequence_ops.empty()) {
+    return false;
+  }
+
+  // Fw & bw sequence ops are allowed to have different input shapes, but they
+  // need to have the same data type.
+  while (!fw_unidirectional_sequence_ops.empty()) {
+    Operator* fw_sequence_op = fw_unidirectional_sequence_ops.top();
+    Operator* bw_sequence_op = bw_unidirectional_sequence_ops.top();
+
+    if (fw_sequence_op->inputs.size() != bw_sequence_op->inputs.size() ||
+        fw_sequence_op->outputs.size() != bw_sequence_op->outputs.size())
+      return false;
+
+    // Make sure the inputs datatype matches.
+    for (int i = 0; i < fw_sequence_op->inputs.size(); ++i) {
+      const auto& fw_input_array_name = fw_sequence_op->inputs[i];
+      const auto& bw_input_array_name = bw_sequence_op->inputs[i];
+      if (model.HasArray(fw_input_array_name) &&
+          model.HasArray(bw_input_array_name)) {
+        if (model.GetArray(fw_input_array_name).data_type !=
+            model.GetArray(bw_input_array_name).data_type)
+          return false;
+      }
+    }
+
+    // Make sure the outputs datatype matches.
+    for (int i = 0; i < fw_sequence_op->outputs.size(); ++i) {
+      const auto& fw_output_array_name = fw_sequence_op->outputs[i];
+      const auto& bw_output_array_name = bw_sequence_op->outputs[i];
+      if (model.HasArray(fw_output_array_name) &&
+          model.HasArray(bw_output_array_name)) {
+        if (model.GetArray(fw_output_array_name).data_type !=
+            model.GetArray(bw_output_array_name).data_type)
+          return false;
+      }
+    }
+
+    fw_unidirectional_sequence_ops.pop();
+    bw_unidirectional_sequence_ops.pop();
+  }
+
+  if (is_dynamic_rnn) {
+    // For dynamic bidirectional sequence ops, bw_sequence will have a reverse
+    // op.
+    if (first_bw_sequence_op_input->type != OperatorType::kReverseV2 &&
+        first_bw_sequence_op_input->type != OperatorType::kReverseSequence) {
+      return false;
+    }
+
+    const auto* bw_real_input_op =
+        GetOpWithOutput(model, first_bw_sequence_op_input->inputs[0]);
+    if (first_fw_sequence_op_input != bw_real_input_op) {
+      return false;
+    }
+
+  } else {
+    // For static bidirectional sequence ops, we should have two pack ops.
+    if (first_fw_sequence_op_input->type != OperatorType::kPack ||
+        first_bw_sequence_op_input->type != OperatorType::kPack) {
+      return false;
+    }
+
+    // fw_lstm & bw_lstm should point to the same input, but reversed sequence.
+    for (size_t i = 0; i < first_fw_sequence_op_input->inputs.size(); ++i) {
+      if (first_fw_sequence_op_input->inputs[i] !=
+          first_bw_sequence_op_input
+              ->inputs[first_fw_sequence_op_input->inputs.size() - i - 1]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void ConstructBidirectionalSequenceOp(
+    const Operator& fw_lstm_op, const Operator& bw_lstm_op, Model* model,
+    BidirectionalSequenceLstmOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceLstmInputsCount = 47;
+  constexpr int kFwLstmInputsStartIndex = 1;
+  constexpr int kBwLstmInputsStartIndex = 18;
+  constexpr int kFwInputActivationStartIndex = 35;
+  constexpr int kBwInputActivationStartIndex = 37;
+  constexpr int kAuxInputStartIndex = 39;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceLstmInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+  // Fill in the fw_lstm weights.
+  for (; i < kBwLstmInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_lstm_op.inputs[i]);
+  }
+
+  // Fill in the bw_lstm weights. bidirectional lstm backward weights start
+  // from 18.
+  for (; i < kFwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op
+            .inputs[i - (kBwLstmInputsStartIndex - kFwLstmInputsStartIndex)]);
+  }
+
+  // Fill in fw_lstm previous states.
+  for (; i < kBwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        fw_lstm_op.inputs[i - (kFwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // Fill in bw_lstm previous states.
+  for (; i < kAuxInputStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op.inputs[i - (kBwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with Auxiliary input and weights for 39 - 47.
+  for (; i <= kBidirectionalSequenceLstmInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_lstm_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+}
+
+void ConstructBidirectionalSequenceOp(
+    const Operator& fw_rnn_op, const Operator& bw_rnn_op, Model* model,
+    BidirectionalSequenceRnnOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceRnnInputsCount = 12;
+  constexpr int kFwInputsStartIndex = 1;
+  constexpr int kBwInputsStartIndex = 5;
+  constexpr int kAuxInputsStartIndex = 9;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceRnnInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+
+  // Fill in the fw_rnn weights.
+  for (; i < kBwInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_rnn_op.inputs[i]);
+  }
+
+  // Fill in the bw_rnn weights.
+  for (; i < kAuxInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_rnn_op.inputs[i - (kBwInputsStartIndex - kFwInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with optional weights.
+  for (; i < kBidirectionalSequenceRnnInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_rnn_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+}
+
+template <typename T>
+void GroupFwBwSequenceOps(Model* model, std::stack<Operator*> fw_sequence_ops,
+                          std::stack<Operator*> bw_sequence_ops,
+                          std::vector<T*>* bidirectional_sequence_ops) {
+  while (!fw_sequence_ops.empty()) {
+    Operator* fw_sequence_op = fw_sequence_ops.top();
+    Operator* bw_sequence_op = bw_sequence_ops.top();
+    T* bidirectional_sequence_op = new T;
+    ConstructBidirectionalSequenceOp(*fw_sequence_op, *bw_sequence_op, model,
+                                     &bidirectional_sequence_op);
+
+    bidirectional_sequence_ops->push_back(bidirectional_sequence_op);
+    fw_sequence_ops.pop();
+    bw_sequence_ops.pop();
+  }
+}
+
+template <typename T>
+void RewireBidirectionalSequenceSequenceOpsConnections(
+    OperatorType operator_type, const string& input_array_name,
+    const std::vector<T*>& bidirectional_sequence_ops,
+    std::vector<std::unique_ptr<Operator>>::iterator* op_it, Model* model) {
+  int aux_input_index = -1;
+  switch (operator_type) {
+    case OperatorType::kBidirectionalSequenceLstm:
+      aux_input_index = 39;
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      aux_input_index = 9;
+      break;
+    default:
+      // Should not reach here.
+      DCHECK(false);
+  }
+  string cur_fw_input = input_array_name;
+  string cur_bw_input = input_array_name;
+  for (size_t i = 0; i < bidirectional_sequence_ops.size(); ++i) {
+    DeleteArrayIfUsedOnce(bidirectional_sequence_ops[i]->inputs[0], model);
+    bidirectional_sequence_ops[i]->inputs[0] = cur_fw_input;
+    if (i != 0) {
+      DeleteArrayIfUsedOnce(
+          bidirectional_sequence_ops[i]->inputs[aux_input_index], model);
+      bidirectional_sequence_ops[i]->inputs[aux_input_index] = cur_bw_input;
+    }
+    cur_fw_input = bidirectional_sequence_ops[i]->outputs[0];
+    cur_bw_input = bidirectional_sequence_ops[i]->outputs[1];
+    if (i != (bidirectional_sequence_ops.size() - 1)) {
+      bidirectional_sequence_ops[i]->merge_outputs = false;
+    } else {
+      // TODO(renjieliu): We need to check whether the outputs of the last bidi
+      // lstms needs merged outputs or not.
+      bidirectional_sequence_ops[i]->merge_outputs = true;
+      DeleteArrayIfUnused(bidirectional_sequence_ops[i]->outputs[1], model);
+      bidirectional_sequence_ops[i]->outputs.pop_back();
+    }
+    model->operators.emplace(*op_it, bidirectional_sequence_ops[i]);
+    *op_it += 1;
+  }
+}
+
+template <typename T>
+void RewireFinalUnpackOutputs(const UnpackOperator& original_unpack_operator,
+                              UnpackOperator** final_unpack_operator,
+                              T** final_bidi_sequence_operator, Model* model) {
+  (*final_unpack_operator)
+      ->inputs.push_back((*final_bidi_sequence_operator)->outputs[0]);
+  (*final_unpack_operator)->axis = original_unpack_operator.axis;
+  (*final_unpack_operator)->num = original_unpack_operator.num;
+
+  for (size_t i = 0; i < original_unpack_operator.outputs.size(); ++i) {
+    const string& output_array_name = original_unpack_operator.outputs[i];
+    const string& final_unpack_output_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_unpack_" + std::to_string(i));
+    model->GetOrCreateArray(final_unpack_output_array_name);
+    (*final_unpack_operator)->outputs.push_back(final_unpack_output_array_name);
+    Operator* unpack_following_op = GetOpWithInput(*model, output_array_name);
+    if (unpack_following_op != nullptr) {
+      // If there's a following op after the unpack, it must be a concat op.
+      DCHECK(unpack_following_op->type == OperatorType::kConcatenation);
+      // For every output of the concat, rewire the outputs.
+      for (const string& concat_output : unpack_following_op->outputs) {
+        (*final_unpack_operator)->outputs[i] = concat_output;
+      }
+      // Remove the concat op.
+      model->operators.erase(FindOperator(model, *unpack_following_op));
+    }
+  }
+}
+
+void RemoveUnpackOperator(const Operator& unpack_op, Model* model) {
+  for (const string& output_array_name : unpack_op.outputs) {
+    DeleteArrayIfUnused(output_array_name, model);
+  }
+  model->operators.erase(FindOperator(model, unpack_op));
+}
+
+void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
+                                     Model* model) {
+  while (!uni_sequence_ops.empty()) {
+    Operator* uni_sequence_op = uni_sequence_ops.top();
+    DeleteArrayIfUnused(uni_sequence_op->outputs[0], model);
+    model->operators.erase(FindOperator(model, *uni_sequence_op));
+    uni_sequence_ops.pop();
+  }
+}
+
+template <typename T>
+::tensorflow::Status GroupDynamicSequenceOps(Model* model, std::size_t op_index,
+                                             OperatorType operator_type,
+                                             bool* modified) {
+  *modified = false;
+
+  // We assume there's a concatenation right after the bidirectional sequence
+  // ops, it may not be the case.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // for bw, there will be a reverse op at the end.
+  Operator *fw_sequence_output, *bw_sequence_output;
+  if (!MatchDynamicBidirectionalSequenceOutputs(
+          final_concat_op, *model, &fw_sequence_output, &bw_sequence_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional sequence ops.
+  std::stack<Operator*> fw_unidirectional_sequence_ops,
+      bw_unidirectional_sequence_ops;
+  OperatorType unidirectional_op_type;
+  if (operator_type == OperatorType::kBidirectionalSequenceLstm) {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceLstm;
+  } else {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceRnn;
+  }
+  Operator *first_fw_sequence_input, *first_bw_sequence_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_sequence_output, unidirectional_op_type,
+          &fw_unidirectional_sequence_ops, &first_fw_sequence_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_sequence_output, unidirectional_op_type,
+          &bw_unidirectional_sequence_ops, &first_bw_sequence_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_ops,
+          bw_unidirectional_sequence_ops, first_fw_sequence_input,
+          first_bw_sequence_input, /*is_dynamic_rnn=*/true)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<T> bidirectional_sequence_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_ops,
+                       bw_unidirectional_sequence_ops,
+                       &bidirectional_sequence_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_sequence_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      operator_type, current_input, bidirectional_sequence_ops, &op_it, model);
+
+  // Change last bidirectional sequence rnn output to the concat output.
+  bidirectional_sequence_ops[bidirectional_sequence_ops.size() - 1]
+      ->outputs[0] = final_concat_op->outputs[0];
+
+  // Delete unused ops.
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_ops, model);
+
+  DeleteArrayIfUnused(final_concat_op->inputs[0], model);
+  DeleteArrayIfUnused(final_concat_op->inputs[1], model);
+  model->operators.erase(FindOperator(model, *final_concat_op));
+
+  // Only keep the fw lstm's input.
+  DeleteArrayIfUnused(first_bw_sequence_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_sequence_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace
+
+::tensorflow::Status GroupBidirectionalSequenceLstm::Run(Model* model,
+                                                         std::size_t op_index,
+                                                         bool* modified) {
+  *modified = false;
+  // Bidirectional sequence lstm will generate two separate unidirectional
+  // sequence lstm ops, for static bidirectional sequence lstm, there will be
+  // a concatenation op at very end; for dynamic bidirectional sequence lstm,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional lstm outputs and bw unidirectional lstm outputs:
+  // should be two unstack ops.
+  Operator *fw_lstm_output, *bw_lstm_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_lstm_output,
+                         &bw_lstm_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional lstm ops.
+  std::stack<Operator*> fw_unidirectional_sequence_lstm_ops,
+      bw_unidirectional_sequence_lstm_ops;
+  Operator *first_fw_lstm_input, *first_bw_lstm_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &fw_unidirectional_sequence_lstm_ops, &first_fw_lstm_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &bw_unidirectional_sequence_lstm_ops, &first_bw_lstm_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_lstm_ops,
+          bw_unidirectional_sequence_lstm_ops, first_fw_lstm_input,
+          first_bw_lstm_input, /*is_dynamic_rnn=*/false)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<BidirectionalSequenceLstmOperator*>
+      bidirectional_sequence_lstm_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_lstm_ops,
+                       bw_unidirectional_sequence_lstm_ops,
+                       &bidirectional_sequence_lstm_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_lstm_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceLstm, current_input,
+      bidirectional_sequence_lstm_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_lstm_output), &unpack_operator,
+      &bidirectional_sequence_lstm_ops[bidirectional_sequence_lstm_ops.size() -
+                                       1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_lstm_output, model);
+  RemoveUnpackOperator(*bw_lstm_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_lstm_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_lstm_ops, model);
+  // Only keep the fw lstm's pack input.
+  DeleteArrayIfUnused(first_bw_lstm_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_lstm_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+::tensorflow::Status GroupBidirectionalSequenceRnn::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
+  // Bidirectional sequence rnn will generate two separate unidirectional
+  // sequence rnn ops, for static bidirectional sequence rnn, there will be
+  // a concatenation op at very end; for dynamic bidirectional sequence rnn,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional rnn outputs and bw unidirectional rnn outputs:
+  // should be two unstack ops.
+  Operator *fw_rnn_output, *bw_rnn_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_rnn_output,
+                         &bw_rnn_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional rnn ops.
+  std::stack<Operator*> fw_unidirectional_sequence_rnn_ops,
+      bw_unidirectional_sequence_rnn_ops;
+  Operator *first_fw_rnn_input, *first_bw_rnn_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &fw_unidirectional_sequence_rnn_ops, &first_fw_rnn_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &bw_unidirectional_sequence_rnn_ops, &first_bw_rnn_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_rnn_ops,
+          bw_unidirectional_sequence_rnn_ops, first_fw_rnn_input,
+          first_bw_rnn_input, /*is_dynamic_rnn=*/false)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<BidirectionalSequenceRnnOperator*> bidirectional_sequence_rnn_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_rnn_ops,
+                       bw_unidirectional_sequence_rnn_ops,
+                       &bidirectional_sequence_rnn_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_rnn_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceRnn, current_input,
+      bidirectional_sequence_rnn_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_rnn_output), &unpack_operator,
+      &bidirectional_sequence_rnn_ops[bidirectional_sequence_rnn_ops.size() -
+                                      1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_rnn_output, model);
+  RemoveUnpackOperator(*bw_rnn_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_rnn_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_rnn_ops, model);
+  // Only keep the fw rnn's pack input.
+  DeleteArrayIfUnused(first_bw_rnn_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_rnn_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceRnn::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceRnnOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceRnn, modified);
+}
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceLstm::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceLstmOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceLstm, modified);
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
index 089ecee959a3ab80474782a88fa176b7a9f42001..65dbb8a1766a6aae4347435b392ff4af49e3d44e 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
@@ -147,12 +147,26 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   if (final_output_mul->type != OperatorType::kMul) {
     return ::tensorflow::Status::OK();
   }
+  // final_output_mul->outputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(final_output_mul->outputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
   Operator *state_output_tanh, *fc_output_sig;
   if (!MatchOperatorInputs(*final_output_mul, *model, OperatorType::kTanh,
                            &state_output_tanh, OperatorType::kLogistic,
                            &fc_output_sig)) {
     return ::tensorflow::Status::OK();
   }
+  // state_output_tanh->inputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(state_output_tanh->inputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
 
   // State output TanH
   // (We don't count an operator as ID'd until we verify it has the correct
@@ -262,11 +276,15 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
       lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT]));
   const string& concat_temp_array_name =
       AvailableArrayName(*model, base_name + "concat_temp");
-  model->GetOrCreateArray(concat_temp_array_name);
+  auto& concat_temp_array = model->GetOrCreateArray(concat_temp_array_name);
+  concat_temp_array.data_type =
+      model->GetArray(concat_inputs->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] = concat_temp_array_name;
   const string& activ_temp_array_name =
       AvailableArrayName(*model, base_name + "activ_temp");
-  model->GetOrCreateArray(activ_temp_array_name);
+  auto& activ_temp_array = model->GetOrCreateArray(activ_temp_array_name);
+  activ_temp_array.data_type =
+      model->GetArray(fully_connected->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] = activ_temp_array_name;
   AddMessageF("Created temp outputs %s and %s on operator %s",
               concat_temp_array_name, activ_temp_array_name,
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index 7aec6728da6bc51779e0ec15377fd0c12696e94c..cb66a2372fdd3edf484902c336821b35befae48d 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -266,6 +266,26 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
       model->GetArray(op->outputs[1]).data_type = unique_op->idx_out_type;
       break;
     }
+    case OperatorType::kBidirectionalSequenceLstm: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kBidirectionalSequenceRnn: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kLstmCell: {
+      // It's tricky to propagate data types through a LstmCell, as that has
+      // multiple inputs and outputs, and there are quantized cases with
+      // mixed (8bit vs 16bit) cases. Fortunately, that should never be needed,
+      // as the data formats, such as TFLITE, that have LstmCell nodes, also
+      // have data type fields for all their arrays.
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
index 04a5a1c1687b4caae2f31548ec549cb95e153df5..38becd6f63386f568ab2d9ff6244ecabb84f8e7f 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
@@ -110,6 +110,13 @@ bool DoesOpBlockBackwardPropagation(const Operator& op) {
     case OperatorType::kSelect:
     case OperatorType::kTile:
       // Reshapes and transposes don't change values.
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+      // Relus only clamp the output. If min/max of parent is unknown, just
+      // prop the range backward. This only happens for cases where activations
+      // are not fused to avoid a default being set on the RELU input and
+      // propagating forward to the RELU output.
       return false;
     default:
       return true;
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 1b1780a73b057bc2da91af65e342aff77546a11a..ca72d0037a9e67d549ae3e337a2891c75648c021 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -21,11 +21,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
@@ -1081,6 +1081,18 @@ void ProcessUnidirectionalSequenceLstmOperator(
 
   // TODO(renjieliu): check the inputs, as well as all kinds of weights.
   const auto& input_array = model->GetArray(op->inputs[0]);
+
+  constexpr int kInputActivationStateTensor = 18;
+  constexpr int kInputCellStateTensor = 19;
+
+  // TFlite intepreter does not support array which is variable and contains a
+  // buffer (see b/115961645 for more discussion).
+  // The follow block remove buffer from the array to work around the
+  // restriction, as a consequence, downstream applications should not
+  // read lstm state as input to other operations.
+  model->GetArray(op->inputs[kInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kInputCellStateTensor]).buffer.reset();
+
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -1096,12 +1108,6 @@ void ProcessUnidirectionalSequenceLstmOperator(
     return;
   }
 
-  constexpr int kInputActivationStateTensor = 18;
-  constexpr int kInputCellStateTensor = 19;
-  // b(115961645): This is a hack to work around.
-  model->GetArray(op->inputs[kInputActivationStateTensor]).buffer.reset();
-  model->GetArray(op->inputs[kInputCellStateTensor]).buffer.reset();
-
   const auto& output_weights_shape = recurrent_to_output_weights_array.shape();
   const int output_size = output_weights_shape.dims(1);
 
@@ -1122,6 +1128,14 @@ void ProcessUnidirectionalSequenceRnnOperator(
     return;
   }
 
+  constexpr int kHiddenStateTensor = 4;
+  // TFlite intepreter does not support array which is variable and contains a
+  // buffer (see b/115961645 for more discussion).
+  // The follow block remove buffer from the array to work around the
+  // restriction, as a consequence, downstream applications should not
+  // read lstm state as input to other operations.
+  model->GetArray(op->inputs[kHiddenStateTensor]).buffer.reset();
+
   // TODO(renjieliu): check the inputs, as well as all kinds of weights.
   const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
@@ -1138,10 +1152,6 @@ void ProcessUnidirectionalSequenceRnnOperator(
     return;
   }
 
-  constexpr int kHiddenStateTensor = 4;
-  // b(115961645): This is a hack to work around.
-  model->GetArray(op->inputs[kHiddenStateTensor]).buffer.reset();
-
   const auto& bias_shape = bias_array.shape();
   const int output_size = bias_shape.dims(0);
 
@@ -1149,6 +1159,114 @@ void ProcessUnidirectionalSequenceRnnOperator(
   output_shape->ReplaceDims({timestamp, batch_size, output_size});
 }
 
+void ProcessBidirectionalSequenceLstmOperator(
+    Model* model, BidirectionalSequenceLstmOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kBwRecurrentToOutputWeightsTensor = 25;
+  const auto& recurrent_to_output_weights_array =
+      model->GetArray(op->inputs[kBwRecurrentToOutputWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!recurrent_to_output_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwInputActivationStateTensor = 35;
+  constexpr int kFwInputCellStateTensor = 36;
+  constexpr int kBwInputActivationStateTensor = 37;
+  constexpr int kBwInputCellStateTensor = 38;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kFwInputCellStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputCellStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = recurrent_to_output_weights_array.shape();
+  const int output_size = output_weights_shape.dims(1);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
+void ProcessBidirectionalSequenceRnnOperator(
+    Model* model, BidirectionalSequenceRnnOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kFwWeightsTensor = 1;
+  const auto& forward_weights_array =
+      model->GetArray(op->inputs[kFwWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!forward_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwHiddenStateTensor = 4;
+  constexpr int kBwHiddenStateTensor = 8;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwHiddenStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwHiddenStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = forward_weights_array.shape();
+  const int output_size = output_weights_shape.dims(0);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
@@ -1292,6 +1410,38 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
   }
 }
 
+void ProcessGatherNdOperator(Model* model, GatherNdOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& indices_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+
+  // Bail if we already know the output shape.
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape() || !indices_array.has_shape()) {
+    return;
+  }
+
+  const auto& input_shape = input_array.shape();
+  const auto& indices_shape = indices_array.shape();
+  QCHECK_GE(input_shape.dimensions_count(), 1);
+  QCHECK_GE(indices_shape.dimensions_count(), 1);
+  const int indices_nd =
+      indices_shape.dims(indices_shape.dimensions_count() - 1);
+  QCHECK_LE(indices_nd, input_shape.dimensions_count());
+
+  auto output_dims = output_array.mutable_shape()->mutable_dims();
+  for (int dim = 0; dim < indices_shape.dimensions_count() - 1; ++dim) {
+    output_dims->push_back(indices_shape.dims(dim));
+  }
+  for (int dim = indices_nd; dim < input_shape.dimensions_count(); ++dim) {
+    output_dims->push_back(input_shape.dims(dim));
+  }
+}
+
 void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) {
   const auto& input_values = model->GetArray(op->inputs[0]);
   const auto& input_k = model->GetArray(op->inputs[1]);
@@ -1377,7 +1527,7 @@ void ProcessPadV2Operator(Model* model, PadV2Operator* op) {
   output_array.copy_shape(output_shape);
 }
 
-void ProcessRankOperator(Model* model, RankOperator* op) {
+void ProcessRankOperator(Model* model, TensorFlowRankOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
@@ -1567,11 +1717,16 @@ void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
   const std::vector<int>& input_dims = input_array.shape().dims();
   std::vector<int> output_dims;
 
-  for (int i = 0; i < input_dims.size(); ++i) {
+  std::vector<int> squeeze_dims;
+  const int input_num_dims = input_dims.size();
+  for (int i : op->squeeze_dims) {
+    squeeze_dims.push_back(i < 0 ? i + input_num_dims : i);
+  }
+  for (int i = 0; i < input_num_dims; ++i) {
     if (input_dims[i] != 1 ||
-        (!op->squeeze_dims.empty() &&
-         std::find(op->squeeze_dims.begin(), op->squeeze_dims.end(), i) ==
-             op->squeeze_dims.end())) {
+        (!squeeze_dims.empty() &&
+         std::find(squeeze_dims.begin(), squeeze_dims.end(), i) ==
+             squeeze_dims.end())) {
       output_dims.push_back(input_dims[i]);
     }
   }
@@ -1656,14 +1811,37 @@ void ProcessArgMinMaxOperator(Model* model, Op* op) {
     return;
   }
 
+  const Array& axis_array = model->GetArray(op->inputs[1]);
+  // Yield until input axis array shape has been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
   const std::vector<int>& input_dims = input_array.shape().dims();
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32 ||
+        axis_array.data_type == ArrayDataType::kInt64)
+      << "axis_array must be int32, int64";
+
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int64 axis;
+  if (axis_array.data_type == ArrayDataType::kInt32) {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  } else {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt64>().data[0];
+  }
+
   std::vector<int> output_dims;
 
-  output_dims.reserve(input_dims.size());
-  for (int i = 0; i < input_dims.size() - 1; ++i) {
-    output_dims.push_back(input_dims[i]);
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size(); ++i) {
+    if (i != axis) {
+      output_dims.push_back(input_dims[i]);
+    }
   }
-  output_dims.push_back(1);
+
   const string& output_name = op->outputs[0];
   auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
@@ -1902,6 +2080,7 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kBatchNormalization:
     case OperatorType::kL2Normalization:
     case OperatorType::kDequantize:
+    case OperatorType::kElu:
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
@@ -1923,17 +2102,24 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kAssert:
     case OperatorType::kCast:
     case OperatorType::kFloor:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kSin:
+    case OperatorType::kCos:
     case OperatorType::kLogicalAnd:
     case OperatorType::kLogicalNot:
     case OperatorType::kLogicalOr:
     case OperatorType::kZerosLike:
+    case OperatorType::kReverseV2:
+    case OperatorType::kReverseSequence:
       ProcessSimpleOperator(model, op, 0);
       break;
     case OperatorType::kGather:
       ProcessGatherOperator(model, static_cast<GatherOperator*>(op));
       break;
+    case OperatorType::kGatherNd:
+      ProcessGatherNdOperator(model, static_cast<GatherNdOperator*>(op));
+      break;
     case OperatorType::kTopK_V2:
       ProcessTopkV2Operator(model, static_cast<TopKV2Operator*>(op));
       break;
@@ -2050,7 +2236,7 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
       ProcessRangeOperator(model, static_cast<RangeOperator*>(op));
       break;
     case OperatorType::kRank:
-      ProcessRankOperator(model, static_cast<RankOperator*>(op));
+      ProcessRankOperator(model, static_cast<TensorFlowRankOperator*>(op));
       break;
     case OperatorType::kShape:
       ProcessShapeOperator(model, static_cast<TensorFlowShapeOperator*>(op));
@@ -2081,6 +2267,14 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
       ProcessUnidirectionalSequenceRnnOperator(
           model, static_cast<UnidirectionalSequenceRnnOperator*>(op));
       break;
+    case OperatorType::kBidirectionalSequenceLstm:
+      ProcessBidirectionalSequenceLstmOperator(
+          model, static_cast<BidirectionalSequenceLstmOperator*>(op));
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      ProcessBidirectionalSequenceRnnOperator(
+          model, static_cast<BidirectionalSequenceRnnOperator*>(op));
+      break;
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
@@ -2164,6 +2358,11 @@ void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
     case OperatorType::kUnique:
       ProcessUniqueOperator(model, static_cast<UniqueOperator*>(op));
       break;
+    case OperatorType::kWhere:
+      // The size of the output can only be known after evaluating the cond
+      // tensor. Ignore shape propagation here and defer that to the
+      // interpreter.
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index ee65f92e00cd9f9347e62db314ca3a3f5e8bb396..cef1774d4131fed2ab52850a2ebe53634f34a15a 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -106,7 +106,7 @@ const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
     // We always want [min, max] to contain 0.
     float min = 0.f;
     float max = 0.f;
-    for (auto val : data) {
+    for (const auto& val : data) {
       min = std::min(min, val);
       max = std::max(max, val);
     }
@@ -121,7 +121,7 @@ const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
     // weights arrays for which fake-quantization would make sense, rather
     // they tend to be hardcoded arrays of zeros or ones used in some graphs.
     bool is_quantization_trivially_exact = true;
-    for (auto val : data) {
+    for (const auto& val : data) {
       is_quantization_trivially_exact &= (val == min || val == max);
     }
     if (!is_quantization_trivially_exact) {
@@ -489,20 +489,20 @@ void FixMinMaxPostQuantization(GraphTransformation* transformation,
     }
   }
   if (!SupportsQuantization(op)) {
-    LOG(FATAL) << "Unimplemented: this graph contains an operator of type "
-               << HelpfulOperatorTypeName(op)
-               << " for which the quantized form is not yet implemented. "
-                  "Sorry, and patches welcome (that's a relatively fun patch "
-                  "to write, mostly providing the actual quantized arithmetic "
-                  "code for this op).";
+    return tensorflow::errors::InvalidArgument(
+        "Unimplemented: this graph contains an operator of type ",
+        HelpfulOperatorTypeName(op),
+        " for which the quantized form is not yet implemented. Sorry, and "
+        "patches welcome (that's a relatively fun patch to write, mostly "
+        "providing the actual quantized arithmetic code for this op).");
   }
 
   for (const auto& input : op.inputs) {
     const auto& array = model->GetArray(input);
     if (array.data_type == ArrayDataType::kFloat) {
       if (!array.minmax && !array.buffer) {
-        LOG(ERROR) << "Can't quantize input array " << input
-                   << " because it lacks min/max info";
+        LOG(WARNING) << "Can't quantize input array " << input
+                     << " because it lacks min/max info";
         return ::tensorflow::Status::OK();
       }
       const auto* other_op = GetOpWithOutput(*model, input);
diff --git a/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
index 8879a7cd2664ed3f32e32435f9d45c0744dfbea2..b9405e1fa057944bd2498ce196ae8ee5e357d872 100644
--- a/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -31,7 +31,7 @@ namespace {
 template <typename Scalar>
 bool AreAllBufferElementsEqualTo(const std::vector<Scalar>& buffer_data,
                                  Scalar value) {
-  for (auto x : buffer_data) {
+  for (const auto& x : buffer_data) {
     if (x != value) {
       return false;
     }
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
index 6a4b9198548956217d24693bceff2bd6b3b8f0a6..98105d384e176573b248ffc3fd75710768002750 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -30,6 +30,7 @@ namespace {
 bool IsElementwiseOperator(OperatorType optype) {
   switch (optype) {
     case OperatorType::kCast:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kFloor:
     case OperatorType::kNeg:
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
index fdd411c84c2678bc483b00849d5142665e706fac..77803d580e98aea94f0a7191666212cb15f58a7a 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -218,6 +218,7 @@ std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
   CHECK_EQ(input_dims.size(), new_perm.size());
 
   auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.data_type = ArrayDataType::kInt32;
   transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
   *(transpose_array.mutable_shape()->mutable_dims()) = {
       static_cast<int>(new_perm.size())};
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 0c9effee1fd364fa83f61339251e48070f503d1e..a46bb803eba49f8488c83993f79c3d69bd91ef8e 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -71,34 +71,29 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   // Each "digit" is incremented individually (by the stride). When it overflows
   // (becomes greater than the stop), that digit is reset and a carry flag is
   // used to increment the next digit.
-  int dst_offset = 0;
-  do {
+  for (size_t dst_offset = 0; dst_offset < output_data.size(); ++dst_offset) {
     // Copy element.
     output_data[dst_offset] = input_buffer.data[Offset(input_shape, src_coord)];
 
-    // Compute next source input coordinates.
-    bool carry = true;
-    for (int axis = 0; axis < num_input_axes; axis++) {
+    // Note we consider elements in the highest dimension are stored
+    // contiguously. So, we increment the stride starting from the highest
+    // dimension.
+    for (int axis = num_input_axes - 1; axis >= 0; --axis) {
       int stride = op.strides[axis];
-      // Increment this axis if we carried from the previous one
-      if (carry) {
-        src_coord[axis] += stride;
-      }
+      src_coord[axis] += stride;
 
-      // Check if we've overflowed.
+      // Check if we've overflowed. If not, we just break from the loop to
+      // continue w/ the element copy. Otherwise, reset the starting coordinate
+      // for this axis and move to the next lower axis.
       int stop = stop_for_axis[axis];
-      if (tflite::strided_slice::LoopCondition(src_coord[axis], stop, stride)) {
-        // Reset axis and set carry
-        src_coord[axis] = tflite::strided_slice::StartForAxis(
-            strided_slice_params, ToRuntimeShape(input_shape), axis);
-        carry = true;
-      } else {
-        carry = false;
+      if (!tflite::strided_slice::LoopCondition(src_coord[axis], stop,
+                                                stride)) {
+        break;
       }
+      src_coord[axis] = tflite::strided_slice::StartForAxis(
+          strided_slice_params, ToRuntimeShape(input_shape), axis);
     }
-    // increment destination buffer offset
-    dst_offset++;
-  } while (dst_offset < output_data.size());
+  }
 }
 
 }  // anonymous namespace
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
index 43070b063c4a426907e80f444e00da44417c0e18..ce0854b4721ce5878b0f91f114aa535784e81cff 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -30,7 +30,7 @@ namespace toco {
 namespace {
 
 // Using the function reducer, reduce input along all axes in axes.
-// Put the reduced data in output, which should aleady be appropriately sized.
+// Put the reduced data in output, which should already be appropriately sized.
 // check_output_shape is set to what this code computes the final shape
 // to be, so it can be cross checked with the shape computation logic.
 void ReduceGeneric(bool keep_dims, const std::vector<int>& axes,
@@ -54,7 +54,7 @@ void ReduceGeneric(bool keep_dims, const std::vector<int>& axes,
   // Reduction mask will be elementwise multiplied against the input
   // indices to figure out the output index for the element.
   std::vector<int> reduction_mask(input_shape.dimensions_count(), 1);
-  for (int axis : axes) {
+  for (const auto& axis : axes) {
     CHECK_GE(axis, 0);
     CHECK_LT(axis, input_shape.dimensions_count());
     reduction_mask[axis] = 0;
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
index c0becaf7d39cdbc01217bbb9b5a6b50017cc2eaa..2c860c30974766a093ef1bf2d9a93fb29bb65949 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
@@ -61,11 +61,11 @@ namespace toco {
   minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
   // We always want [min, max] to contain 0.
   if (minmax.min > 0 || minmax.max < 0) {
-    LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
-               << "[" << minmax.min << ", " << minmax.max
-               << "] does not contain 0. "
-               << "Proceeding by tweaking it to contain 0, which will result "
-                  "in poor accuracy.";
+    LOG(WARNING) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
   }
   minmax.min = std::min(minmax.min, 0.);
   minmax.max = std::max(minmax.max, 0.);
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index 51c724dd1ab058e08a3a29a5408e5d584831a3d9..e65a0dc48581dc6c5768b94cb9b45bcc99cb9cd1 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -49,8 +49,8 @@ void FillArrayWithZeros(Array* array) {
 }  // namespace
 
 // Removes a multiplication by array of constant zeros by making the output
-// array an array of constant zeros and removing the input arrays if they are no
-// longer needed.
+// array to an array of constant zeros and removing the input arrays if they
+// are no longer needed.
 ::tensorflow::Status ResolveMultiplyByZero::Run(Model* model,
                                                 std::size_t op_index,
                                                 bool* modified) {
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index 41a735394d714b65a4c9fc309927e34a7f610431..7492f3e116c60ca2c574bf8d2fd4b08f5914f3d0 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -13,17 +13,192 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
+namespace {
+
+void UnrollBatchMatMul3D(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, const std::vector<int> batch,
+    Model* model, std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    std::vector<string>* pack_inputs) {
+  const std::string batch_name =
+      absl::StrCat(batch_op->outputs[0], "_b", absl::StrJoin(batch, "-"));
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& input_array_b = model->GetArray(input_rhs);
+  const int dims_count = input_array_a.shape().dimensions_count();
+
+  // tf.slice(a, ...).
+  std::vector<int> begin_indices_a = batch;
+  begin_indices_a.resize(dims_count);
+  std::vector<int> slice_size_a = input_array_a.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_a[i] = 1;
+  }
+  auto* slice_a_op = new SliceOperator;
+  slice_a_op->inputs = {
+      input_lhs,
+      CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
+                       begin_indices_a),
+      CreateInt32Array(model, batch_name + "/slice_a/slice/size", slice_size_a),
+  };
+  slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
+  auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
+  slice_a_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
+  slice_a_reshape_op->inputs = {
+      slice_a_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
+                       {-1, input_array_a.shape().dims(dims_count - 1)})};
+  slice_a_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
+  auto& slice_a_reshape_op_output =
+      model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
+  slice_a_reshape_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_reshape_op) + 1;
+
+  // tf.slice(b, ...).
+  std::vector<int> begin_indices_b = batch;
+  begin_indices_b.resize(dims_count);
+  std::vector<int> slice_size_b = input_array_b.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_b[i] = 1;
+  }
+  auto* slice_b_op = new SliceOperator;
+  slice_b_op->inputs = {
+      input_rhs,
+      CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                       begin_indices_b),
+      CreateInt32Array(model, batch_name + "/slice_b/slice/size", slice_size_b),
+  };
+  slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
+  auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
+  slice_b_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
+  slice_b_reshape_op->inputs = {
+      slice_b_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
+                       {-1, input_array_b.shape().dims(dims_count - 1)})};
+  slice_b_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
+  auto& slice_b_reshape_op_output =
+      model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
+  slice_b_reshape_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_reshape_op) + 1;
+
+  // tf.matmul(slice_a, slice_b).
+  auto* matmul_op = new TensorFlowMatMulOperator;
+  matmul_op->inputs = {slice_a_reshape_op->outputs[0],
+                       slice_b_reshape_op->outputs[0]};
+  matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+  auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+  matmul_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, matmul_op) + 1;
+
+  // Add to stack.
+  pack_inputs->push_back(matmul_op->outputs[0]);
+}
+
+std::vector<string> UnrollBatchMatMulRecursion(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, Model* model,
+    std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    const std::vector<int>& batch_prefix) {
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& dims_vec = input_array_a.shape().dims();
+  const int current_dim_size = dims_vec[batch_prefix.size()];
+  std::vector<string> batch_pack_inputs;
+
+  if (batch_prefix.size() + 3 == dims_vec.size()) {
+    // Base case
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      UnrollBatchMatMul3D(input_lhs, input_rhs, batch_op, new_batch_prefix,
+                          model, tail_it, &batch_pack_inputs);
+    }
+  } else {
+    // Recursion
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+          input_lhs, input_rhs, batch_op, model, tail_it, new_batch_prefix);
+
+      // The pack that will join all the individual matmul results together.
+      auto* pack_op = new PackOperator;
+      std::string batch_name = absl::StrCat(
+          batch_op->outputs[0], "_b", absl::StrJoin(new_batch_prefix, "-"));
+      pack_op->inputs = pack_inputs;
+      pack_op->outputs = {AvailableArrayName(*model, batch_name + "/pack")};
+      auto& pack_op_output = model->GetOrCreateArray(pack_op->outputs[0]);
+      pack_op_output.data_type = input_array_a.data_type;
+      pack_op->axis = 0;
+      pack_op->values_count = pack_inputs.size();
+      *tail_it = model->operators.emplace(*tail_it, pack_op) + 1;
+
+      batch_pack_inputs.push_back(pack_op->outputs[0]);
+    }
+  }
+  return batch_pack_inputs;
+}
+
+std::vector<int32> GetTransposePerm(const Array& input_array) {
+  const int32 dims = input_array.shape().dimensions_count();
+  std::vector<int32> perm_array_val(dims);
+  for (int i = 0; i < dims; ++i) {
+    perm_array_val[i] = i;
+  }
+  perm_array_val[dims - 2] = dims - 1;
+  perm_array_val[dims - 1] = dims - 2;
+  return perm_array_val;
+}
+
+std::vector<int32> GetTransposeShape(const Shape& input_shape,
+                                     const std::vector<int32>& perm_array_val) {
+  const int32 dims = input_shape.dimensions_count();
+  std::vector<int32> output_shape(dims);
+  for (int i = 0; i < dims; ++i) {
+    output_shape[i] = input_shape.dims(perm_array_val[i]);
+  }
+  return output_shape;
+}
+
+TransposeOperator* TransposeInput(const string& input, Model* model) {
+  const auto& input_array = model->GetArray(input);
+  const auto perm_array = GetTransposePerm(input_array);
+  const string perm_array_name = CreateInt32Array(
+      model, AvailableArrayName(*model, input + "/transpose/perm"), perm_array);
+  auto* transpose_op = new TransposeOperator;
+  transpose_op->inputs = {input, perm_array_name};
+  transpose_op->outputs = {AvailableArrayName(*model, input + "/transpose")};
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->outputs[0]);
+  *transpose_array.mutable_shape()->mutable_dims() =
+      GetTransposeShape(input_array.shape(), perm_array);
+  model->GetOrCreateArray(transpose_op->outputs[0]);
+  return transpose_op;
+}
+
+}  // namespace
+
 // Unrolls a BatchMatMul on the batch dimension.
 // We need to slice each batch out of the inputs, matmul them individually, then
 // stack them all back together at the end.
@@ -46,115 +221,67 @@ namespace toco {
   const auto* batch_op =
       static_cast<const BatchMatMulOperator*>(batch_op_it->get());
 
-  // We must have the shape of at least one input to know our batch size.
-  const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
-  const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
-  if (!input_array_a.has_shape() || !input_array_b.has_shape())
+  auto& tail_it = batch_op_it;
+
+  string input_lhs = batch_op->inputs[0];
+  string input_rhs = batch_op->inputs[1];
+  const auto& input_lhs_array = model->GetArray(input_lhs);
+  const auto& input_rhs_array = model->GetArray(input_rhs);
+  if (!input_lhs_array.has_shape() || !input_rhs_array.has_shape())
     return ::tensorflow::Status::OK();
 
-  // We only support the rank 3 case. If you are batching on rank > 3 you'll
-  // have to figure that out.
-  CHECK_EQ(input_array_a.shape().dimensions_count(),
-           input_array_b.shape().dimensions_count())
-      << "Input dimensions must have the same rank";
-  if (input_array_a.shape().dimensions_count() == 2) {
+  // Transpose LHS input if necessary.
+  if (batch_op->adj_x) {
+    TransposeOperator* transpose_op = TransposeInput(input_lhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_lhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_a = model->GetArray(input_lhs);
+
+  // Transpose RHS input if necessary.
+  if (batch_op->adj_y) {
+    TransposeOperator* transpose_op = TransposeInput(input_rhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_rhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_b = model->GetArray(input_rhs);
+
+  const int dims = input_array_a.shape().dimensions_count();
+  for (int i = 0; i < dims - 2; ++i) {
+    CHECK_EQ(input_array_a.shape().dims(i), input_array_b.shape().dims(i))
+        << "input array not consistent at index " << i;
+  }
+  CHECK_EQ(input_array_a.shape().dims(dims - 1),
+           input_array_b.shape().dims(dims - 2))
+      << "Input dimensions must be compatible for multipication. shape a = ["
+      << absl::StrJoin(input_array_a.shape().dims(), ", ") << "], shape b = ["
+      << absl::StrJoin(input_array_b.shape().dims(), ", ") << "]";
+
+  if (dims == 2) {
     // This is really just a MatMul. This likely means that someone hand-crafted
     // a graphdef with a BatchMatMul when they really wanted a MatMul.
     AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
                 LogName(*batch_op));
     auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = batch_op->inputs;
+    matmul_op->inputs = {input_lhs, input_rhs};
     matmul_op->outputs = batch_op->outputs;
-    const auto matmul_op_it = model->operators.emplace(batch_op_it, matmul_op);
-    batch_op_it = matmul_op_it + 1;
-    CHECK_EQ(batch_op_it->get(), batch_op);
-    model->operators.erase(batch_op_it);
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+    CHECK_EQ(tail_it->get(), batch_op);
+    model->operators.erase(tail_it);
     *modified = true;
     return ::tensorflow::Status::OK();
   }
-  CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
-      << "Input arrays must have rank 3";
 
-  // Perform the matmul for each slice of the batch.
-  int batch_count = input_array_a.shape().dims(0);
-  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
-              batch_count);
-  auto tail_it = batch_op_it;
-  std::vector<string> pack_inputs;
-  for (int batch = 0; batch < batch_count; ++batch) {
-    std::string batch_name =
-        std::string(batch_op->outputs[0]) + "_b" + std::to_string(batch);
-
-    // tf.slice(a, ...).
-    auto* slice_a_op = new SliceOperator;
-    slice_a_op->inputs = {
-        batch_op->inputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_a/slice/size",
-            {1, input_array_a.shape().dims(1), input_array_a.shape().dims(2)}),
-    };
-    slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
-    auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
-    slice_a_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
-    slice_a_reshape_op->inputs = {
-        slice_a_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
-                         {-1, input_array_a.shape().dims(2)})};
-    slice_a_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
-    auto& slice_a_reshape_op_output =
-        model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
-    slice_a_reshape_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_reshape_op) + 1;
-
-    // tf.slice(b, ...).
-    auto* slice_b_op = new SliceOperator;
-    slice_b_op->inputs = {
-        batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_b/slice/size",
-            {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
-    };
-    slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
-    auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
-    slice_b_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
-    slice_b_reshape_op->inputs = {
-        slice_b_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
-                         {-1, input_array_b.shape().dims(2)})};
-    slice_b_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
-    auto& slice_b_reshape_op_output =
-        model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
-    slice_b_reshape_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_reshape_op) + 1;
-
-    // tf.matmul(slice_a, slice_b).
-    auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = {slice_a_reshape_op->outputs[0],
-                         slice_b_reshape_op->outputs[0]};
-    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
-    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
-    matmul_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+  CHECK_GE(input_array_a.shape().dimensions_count(), 3)
+      << "Input arrays must have rank >= 3";
 
-    // Add to stack.
-    pack_inputs.push_back(matmul_op->outputs[0]);
-  }
+  const auto& dims_vec = input_array_a.shape().dims();
+  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
+              std::accumulate(dims_vec.begin(), dims_vec.end() - 2, 1,
+                              std::multiplies<int>()));
 
-  // The pack that will join all the individual matmul results together.
+  std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+      input_lhs, input_rhs, batch_op, model, &tail_it, {});
   auto* pack_op = new PackOperator;
   pack_op->inputs = pack_inputs;
   pack_op->outputs = {batch_op->outputs[0]};
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index dac106b398870f497087fc14fa4c318eb8156408..fdf72bde05754c4af07f368ffac15366be303557 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -1092,11 +1092,14 @@ tensorflow::Status ConvertBatchMatMulOperator(
     Model* model) {
   TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
 
-  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
-  CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
-  CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
-
   auto* batch_matmul = new BatchMatMulOperator;
+  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+  if (HasAttr(node, "adj_x")) {
+    batch_matmul->adj_x = GetBoolAttr(node, "adj_x");
+  }
+  if (HasAttr(node, "adj_y")) {
+    batch_matmul->adj_y = GetBoolAttr(node, "adj_y");
+  }
   batch_matmul->inputs = {node.input(0), node.input(1)};
   batch_matmul->outputs = {node.name()};
 
@@ -1346,7 +1349,7 @@ tensorflow::Status ConvertUnsupportedOperator(
   }
 
   // Parse outputs. Name them after the node's name, plus an ordinal suffix.
-  // Note that some outputs are to be multipled by a named attribute.
+  // Note that some outputs are to be multiplied by a named attribute.
   const tensorflow::OpDef* op_def = nullptr;
   if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
     GetOutputNamesFromNodeDef(node, *op_def, op);
@@ -1480,7 +1483,7 @@ tensorflow::Status ConvertPlaceholderOperator(
   if (node.attr().count("shape")) {
     const auto& shape = GetShapeAttr(node, "shape");
     auto num_dims = shape.dim_size();
-    // TODO(b/62716978): This logic needs to be revisted.  During dims
+    // TODO(b/62716978): This logic needs to be revisited.  During dims
     // refactoring it is an interim fix.
     if (num_dims > 0 && !HasWildcardDimension(shape)) {
       auto& dst_array_dims = *array.mutable_shape()->mutable_dims();
@@ -1529,6 +1532,20 @@ tensorflow::Status ConvertFloorOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertCeilOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "Ceil");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  const auto data_type = GetDataTypeAttr(node, "T");
+  CHECK(data_type == DT_FLOAT);
+  auto* op = new CeilOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertGatherOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1558,6 +1575,21 @@ tensorflow::Status ConvertGatherOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertGatherNdOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "GatherNd");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
+  const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
+  CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
+  auto* op = new GatherNdOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 template <typename Op>
 tensorflow::Status ConvertArgMinMaxOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
@@ -1993,6 +2025,27 @@ tensorflow::Status ConvertShapeOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertReverseSequenceOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "ReverseSequence");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
+  auto op = absl::make_unique<ReverseSequenceOperator>();
+  if (HasAttr(node, "seq_dim")) {
+    op->seq_dim = GetIntAttr(node, "seq_dim");
+  }
+  // In tf.reverse_sequence, batch_dim defaults to 0.
+  op->batch_dim =
+      HasAttr(node, "batch_dim") ? GetIntAttr(node, "batch_dim") : 0;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.push_back(std::move(op));
+  return tensorflow::Status::OK();
+}
+
 void StripCaretFromArrayNames(Model* model) {
   for (auto& op : model->operators) {
     for (auto& input : op->inputs) {
@@ -2361,7 +2414,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
       {"Abs", ConvertSimpleOperator<AbsOperator, kAnyNumInputs, 1>},
       {"Add", ConvertSimpleOperator<AddOperator, 2, 1>},
-      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator, kAnyNumInputs, 1>},
+      {"AddN", ConvertSimpleOperator<AddNOperator, kAnyNumInputs, 1>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator, kAnyNumInputs, 1>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
@@ -2375,18 +2428,21 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"BatchToSpaceND", ConvertBatchToSpaceNDOperator},
       {"BiasAdd", ConvertBiasAddOperator},
       {"Cast", ConvertCastOperator},
+      {"Ceil", ConvertCeilOperator},
       {"CheckNumerics", ConvertIdentityOperator},
       {"Concat", ConvertConcatOperator},
       {"ConcatV2", ConvertConcatOperator},
       {"Const", ConvertConstOperator},
       {"Conv2D", ConvertConvOperator},
       {"Conv2DBackpropInput", ConvertTransposeConvOperator},
+      {"Cos", ConvertSimpleOperator<CosOperator, 1, 1>},
       {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator},
       {"DepthToSpace", ConvertDepthToSpaceOperator},
       {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator},
       {"Div", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"DynamicPartition", ConvertDynamicPartitionOperator},
       {"DynamicStitch", ConvertDynamicStitchOperator},
+      {"Elu", ConvertSimpleOperator<EluOperator, 1, 1>},
       {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2, 1>},
       {"Exp", ConvertSimpleOperator<ExpOperator, 1, 1>},
       {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2, 1>},
@@ -2399,6 +2455,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"FusedBatchNorm", ConvertFusedBatchNormOperator},
       {"Gather", ConvertGatherOperator},
       {"GatherV2", ConvertGatherOperator},
+      {"GatherNd", ConvertGatherNdOperator},
       {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2, 1>},
       {"GreaterEqual",
        ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2, 1>},
@@ -2418,7 +2475,8 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"MaxPool", ConvertMaxPoolOperator},
       {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2, 1>},
       {"Mean", ConvertReduceOperator<MeanOperator>},
-      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2, 1>},
+      {"Merge",
+       ConvertSimpleOperator<TensorFlowMergeOperator, kAnyNumInputs, 1>},
       {"Min", ConvertReduceOperator<TensorFlowMinOperator>},
       {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2, 1>},
       {"Mul", ConvertSimpleOperator<MulOperator, 2, 1>},
@@ -2437,13 +2495,15 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Prod", ConvertReduceOperator<TensorFlowProdOperator>},
       {"RandomUniform", ConvertRandomUniform},
       {"Range", ConvertRangeOperator},
-      {"Rank", ConvertSimpleOperator<RankOperator, 1, 1>},
+      {"Rank", ConvertSimpleOperator<TensorFlowRankOperator, 1, 1>},
       {"RealDiv", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"Relu", ConvertSimpleOperator<ReluOperator, 1, 1>},
       {"Relu6", ConvertSimpleOperator<Relu6Operator, 1, 1>},
       {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2, 1>},
       {"ResizeBilinear", ConvertResizeBilinearOperator},
       {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
+      {"ReverseSequence", ConvertReverseSequenceOperator},
+      {"ReverseV2", ConvertSimpleOperator<ReverseV2Operator, 2, 1>},
       {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1, 1>},
       {"Select", ConvertSimpleOperator<SelectOperator, 3, 1>},
       {"Shape", ConvertShapeOperator},
@@ -2478,6 +2538,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"UnidirectionalSequenceRnn", ConvertUnidirectionalSequenceRnn},
       {"MirrorPad", ConvertMirrorPadOperator},
       {"Unique", ConvertSimpleOperator<UniqueOperator, 1, 2>},
+      {"Where", ConvertSimpleOperator<WhereOperator, 1, 1>},
   });
 }
 
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
index 8ff3f7733afb4355a8e7863594633a6555287c10..b620ade756e457c0b25829d282ea4bc027fc2681 100644
--- a/tensorflow/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -115,7 +115,6 @@ void BuildConstNode(std::initializer_list<int64_t> shape,
     s->add_dim()->set_size(d);
   }
 
-  // TODO(ahentz): also need to test via tensor_content()
   switch (dtype) {
     case DT_FLOAT:
       for (int64_t i = 0; i < num_elements; ++i) {
@@ -385,6 +384,127 @@ std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
           {DT_INT64, ArrayDataType::kInt64}};
 }
 
+class TensorContentTest : public ::testing::Test {
+ public:
+  template <ArrayDataType T>
+  std::vector<DataType<T>> ImportAndGetData(const NodeDef& node) {
+    Model model;
+    auto status = ImportNode(node, &model);
+    CHECK(status.ok()) << status.error_message();
+    const auto& nodearray = model.GetArray("Node1");
+    return nodearray.GetBuffer<T>().data;
+  }
+  template <class T>
+  void NodeWithTensorContent(std::initializer_list<int64_t> shape,
+                             tensorflow::DataType dtype, int64_t num_elements,
+                             NodeDef* node) {
+    node->set_op("Const");
+    node->set_name("Node1");
+
+    // An attribute describing the type of this const node.
+    AttrValue dtype_attr;
+    SetAttrValue(dtype, &dtype_attr);
+    (*node->mutable_attr())["dtype"] = dtype_attr;
+
+    auto allocated_content = absl::make_unique<T[]>(num_elements);
+
+    // An attribute describing the content of this const node.
+    tensorflow::TensorProto t;
+    t.set_dtype(dtype);
+    auto* s = t.mutable_tensor_shape();
+    for (const auto& d : shape) {
+      s->add_dim()->set_size(d);
+    }
+
+    switch (dtype) {
+      case DT_FLOAT:
+        for (int64_t i = 0; i < num_elements; ++i) {
+          allocated_content[i] = i / 10000.0 + 1;
+        }
+        break;
+      case DT_INT32:
+        for (int64_t i = 0; i < num_elements; ++i) {
+          allocated_content[i] = i % std::numeric_limits<int>::max() + 1;
+        }
+        break;
+      case DT_QUINT8:
+        for (int64_t i = 0; i < num_elements; ++i) {
+          allocated_content[i] = i % std::numeric_limits<uint8_t>::max() + 1;
+        }
+        break;
+      case DT_INT64:
+        for (int64_t i = 0; i < num_elements; ++i) {
+          allocated_content[i] = i + 1;
+        }
+        break;
+      case DT_STRING:
+        break;
+      case DT_BOOL:
+        for (int64_t i = 0; i < num_elements; ++i) {
+          allocated_content[i] = ((i % 2) == 0);
+        }
+        break;
+      default:
+        break;
+    }
+    t.set_tensor_content(
+        string(reinterpret_cast<const char*>(allocated_content.get()),
+               num_elements * sizeof(T)));
+
+    AttrValue value_attr;
+    SetAttrValue(t, &value_attr);
+    (*node->mutable_attr())["value"] = value_attr;
+
+    allocated_content.reset();
+  }
+};
+
+TEST_F(TensorContentTest, Int64) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt64;
+
+  NodeDef node;
+  NodeWithTensorContent<int64_t>({1, 2, 3}, DT_INT64, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST_F(TensorContentTest, Int32) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt32;
+
+  NodeDef node;
+  NodeWithTensorContent<int>({1, 2, 3}, DT_INT32, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST_F(TensorContentTest, Float) {
+  constexpr ArrayDataType kType = ArrayDataType::kFloat;
+
+  NodeDef node;
+  NodeWithTensorContent<float>({1, 2, 3}, DT_FLOAT, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0001, 1.0002, 1.0003, 1.0004, 1.0005));
+}
+
+TEST_F(TensorContentTest, Quint8) {
+  constexpr ArrayDataType kType = ArrayDataType::kUint8;
+
+  NodeDef node;
+  NodeWithTensorContent<uint8_t>({1, 2, 3}, DT_QUINT8, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST_F(TensorContentTest, Bool) {
+  constexpr ArrayDataType kType = ArrayDataType::kBool;
+
+  NodeDef node;
+  NodeWithTensorContent<bool>({1, 2, 3}, DT_BOOL, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 0, 1, 0, 1, 0));
+}
+
 class TypeImportTest : public ::testing::TestWithParam<
                            std::pair<tensorflow::DataType, ArrayDataType>> {
  protected:
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index 296ed9fc747aa2e2d79e3d706d27ff3bdaacdd30..46f70c9e379de6fcfa5405fd9ef26fb3819432df 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -24,11 +24,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/optional.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/runtime/types.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
@@ -42,8 +42,10 @@ enum class OperatorType : uint8 {
   kAveragePool,
   kBatchMatMul,
   kBatchNormalization,
+  kCeil,
   kConv,
   kConcatenation,
+  kCos,
   kDepthwiseConv,
   kDepthToSpace,
   kSpaceToDepth,
@@ -159,7 +161,14 @@ enum class OperatorType : uint8 {
   kAbs,
   kMirrorPad,
   kUnique,
-  kUnidirectionalSequenceRnn
+  kUnidirectionalSequenceRnn,
+  kBidirectionalSequenceLstm,
+  kReverseV2,
+  kBidirectionalSequenceRnn,
+  kGatherNd,
+  kWhere,
+  kElu,
+  kReverseSequence
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -649,6 +658,18 @@ struct UnidirectionalSequenceLstmOperator : Operator {
       : Operator(OperatorType::kUnidirectionalSequenceLstm) {}
 };
 
+struct BidirectionalSequenceLstmOperator : Operator {
+  BidirectionalSequenceLstmOperator()
+      : Operator(OperatorType::kBidirectionalSequenceLstm) {}
+  bool merge_outputs;
+};
+
+struct BidirectionalSequenceRnnOperator : Operator {
+  BidirectionalSequenceRnnOperator()
+      : Operator(OperatorType::kBidirectionalSequenceRnn) {}
+  bool merge_outputs;
+};
+
 // Element-wise multiplication operator.
 //
 // Inputs:
@@ -671,6 +692,17 @@ struct AbsOperator : Operator {
   AbsOperator() : Operator(OperatorType::kAbs) {}
 };
 
+// Elu
+//   f(x) -> exp(x) - 1 for x < 0, x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Elu
+struct EluOperator : Operator {
+  EluOperator() : Operator(OperatorType::kElu) {}
+};
+
 // Element-wise Relu operator:
 //   x -> max(0, x)
 //
@@ -948,6 +980,8 @@ struct TensorFlowIdentityOperator : Operator {
 // TensorFlow equivalent: MatMul
 struct BatchMatMulOperator : Operator {
   BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+  bool adj_x = false;
+  bool adj_y = false;
 };
 
 // General matrix multiplication operator. We don't want to support general
@@ -1150,6 +1184,17 @@ struct ExpOperator : Operator {
   ExpOperator() : Operator(OperatorType::kExp) {}
 };
 
+// Given a tensor input, this operation calculates element-wise exponential
+// (y = cos(x)).
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//
+// TensorFlow equivalent: Cos
+struct CosOperator : Operator {
+  CosOperator() : Operator(OperatorType::kCos) {}
+};
+
 // Given a tensor input, this operation inserts a dimension of 1 at the
 // dimension index axis of input's shape. The dimension index axis starts at
 // zero; if you specify a negative number for axis it is counted backward from
@@ -1228,13 +1273,12 @@ struct RangeOperator : Operator {
 // Inputs:
 //   inputs[0]: required: the input array
 //
-// This operation outputs a 0-D integer tensor representing the rank of
-// the input.
+// This operation outputs a 0-D int32 Tensor representing the rank of input.
 //
-// TensorFlow equivalent: Rank.  We currently assume that the output is int32
-// and not int64.  The output type could be stored herein.
-struct RankOperator : Operator {
-  RankOperator() : Operator(OperatorType::kRank) {}
+// TensorFlow equivalent: Rank.
+struct TensorFlowRankOperator : Operator {
+  TensorFlowRankOperator() : Operator(OperatorType::kRank) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt32;
 };
 
 // Element-wise negation (-x) operator.
@@ -1660,6 +1704,16 @@ struct FloorOperator : Operator {
   FloorOperator() : Operator(OperatorType::kFloor) {}
 };
 
+// Ceil operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Ceil
+struct CeilOperator : Operator {
+  CeilOperator() : Operator(OperatorType::kCeil) {}
+};
+
 // Gather operator. It gathers slices from params according to indices.
 // Only 1-D indices are supported at the moment.
 //
@@ -1681,10 +1735,22 @@ struct GatherOperator : Operator {
   int input_rank = 0;
 };
 
+// GatherNd operator. It gathers slices from params according to indices.
+//
+// Inputs:
+//   inputs[0]: required: the params array
+//   inputs[1]: required: the indices to gather
+//
+// TensorFlow equivalent: GatherNd
+struct GatherNdOperator : Operator {
+  GatherNdOperator() : Operator(OperatorType::kGatherNd) {}
+};
+
 // ArgMax operator. It returns the index of the maximum value along axis.
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMax
 struct ArgMaxOperator : Operator {
@@ -1696,6 +1762,7 @@ struct ArgMaxOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMin
 struct ArgMinOperator : Operator {
@@ -1938,6 +2005,16 @@ struct TensorFlowZerosLikeOperator : Operator {
   TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
 };
 
+// ReverseV2 operator:
+//
+// Inputs:
+// Inputs[0]: required: the input array.
+//
+// TensorFlow equivalent: ReverseV2.
+struct ReverseV2Operator : Operator {
+  ReverseV2Operator() : Operator(OperatorType::kReverseV2) {}
+};
+
 enum class MirrorPadMode { kNone, kSymmetric, kReflect };
 
 // MirrorPad Operator:
@@ -1955,6 +2032,19 @@ struct MirrorPadOperator : Operator {
   MirrorPadMode mode;
 };
 
+// ReverseSequence operator:
+//
+// Inputs:
+// Inputs[0]: required: the input array.
+// Inputs[1]: required: the lengths of the elements to be reversed.
+//
+// TensorFlow equivalent: tf.reverse_sequence.
+struct ReverseSequenceOperator : Operator {
+  ReverseSequenceOperator() : Operator(OperatorType::kReverseSequence) {}
+  int seq_dim;
+  int batch_dim = 0;
+};
+
 // Unique Operator:
 //
 // Inputs:
@@ -1973,6 +2063,18 @@ struct UnidirectionalSequenceRnnOperator : Operator {
   FusedActivationFunctionType fused_activation_function;
 };
 
+// Where Operator:
+// Return the coordinates of the true values in condition tensor in row-major
+// order.
+//
+// Inputs:
+//  inputs[0]: required: boolean condition tensor
+//
+//  TensorFlow equivalent: Where
+struct WhereOperator : Operator {
+  WhereOperator() : Operator(OperatorType::kWhere) {}
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc
index 717a28bc615e0a142c41efb3afaa49f64d2a1e14..7e48bd9542b0cc0de4c0218465edfc75e97d0049 100644
--- a/tensorflow/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/lite/toco/model_cmdline_flags.cc
@@ -261,7 +261,7 @@ void ReadModelFlagsFromCommandLineFlags(
     std::vector<string> mean_values =
         absl::StrSplit(parsed_model_flags.mean_values.value(), ',');
     QCHECK(mean_values.size() == model_flags->input_arrays_size());
-    for (int i = 0; i < mean_values.size(); ++i) {
+    for (size_t i = 0; i < mean_values.size(); ++i) {
       char* last = nullptr;
       model_flags->mutable_input_arrays(i)->set_mean_value(
           strtod(mean_values[i].data(), &last));
@@ -278,7 +278,7 @@ void ReadModelFlagsFromCommandLineFlags(
     std::vector<string> std_values =
         absl::StrSplit(parsed_model_flags.std_values.value(), ',');
     QCHECK(std_values.size() == model_flags->input_arrays_size());
-    for (int i = 0; i < std_values.size(); ++i) {
+    for (size_t i = 0; i < std_values.size(); ++i) {
       char* last = nullptr;
       model_flags->mutable_input_arrays(i)->set_std_value(
           strtod(std_values[i].data(), &last));
@@ -296,7 +296,7 @@ void ReadModelFlagsFromCommandLineFlags(
     std::vector<string> input_data_types =
         absl::StrSplit(parsed_model_flags.input_data_types.value(), ',');
     QCHECK(input_data_types.size() == model_flags->input_arrays_size());
-    for (int i = 0; i < input_data_types.size(); ++i) {
+    for (size_t i = 0; i < input_data_types.size(); ++i) {
       IODataType type;
       QCHECK(IODataType_Parse(input_data_types[i], &type));
       model_flags->mutable_input_arrays(i)->set_data_type(type);
@@ -319,7 +319,7 @@ void ReadModelFlagsFromCommandLineFlags(
     std::vector<string> input_shapes =
         absl::StrSplit(parsed_model_flags.input_shapes.value(), ':');
     QCHECK(input_shapes.size() == model_flags->input_arrays_size());
-    for (int i = 0; i < input_shapes.size(); ++i) {
+    for (size_t i = 0; i < input_shapes.size(); ++i) {
       auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
       shape->clear_dims();
       // Treat an empty input shape as a scalar.
diff --git a/tensorflow/lite/toco/model_cmdline_flags_test.cc b/tensorflow/lite/toco/model_cmdline_flags_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2f8dc59852d06af001c7e084f1eeedcb040b7a8
--- /dev/null
+++ b/tensorflow/lite/toco/model_cmdline_flags_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <unordered_map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+
+namespace toco {
+namespace {
+
+TEST(ModelCmdlineFlagsTest, ParseArgsStringMapList) {
+  int args_count = 3;
+  const char* args[] = {
+      "toco",
+      "--input_arrays=input_1",
+      "--rnn_states={state_array:rnn/BasicLSTMCellZeroState/zeros,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Add_1,size:4},"
+      "{state_array:rnn/BasicLSTMCellZeroState/zeros_1,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Mul_2,size:4}",
+  };
+
+  string expected_input_arrays = "input_1";
+  std::vector<std::unordered_map<string, string>> expected_rnn_states;
+  expected_rnn_states.push_back(
+      {{"state_array", "rnn/BasicLSTMCellZeroState/zeros"},
+       {"back_edge_source_array", "rnn/basic_lstm_cell/Add_1"},
+       {"size", "4"}});
+  expected_rnn_states.push_back(
+      {{"state_array", "rnn/BasicLSTMCellZeroState/zeros_1"},
+       {"back_edge_source_array", "rnn/basic_lstm_cell/Mul_2"},
+       {"size", "4"}});
+
+  string message;
+  ParsedModelFlags result_flags;
+
+  EXPECT_TRUE(ParseModelFlagsFromCommandLineFlags(
+      &args_count, const_cast<char**>(args), &message, &result_flags));
+  EXPECT_EQ(result_flags.input_arrays.value(), expected_input_arrays);
+  EXPECT_EQ(result_flags.rnn_states.value().elements, expected_rnn_states);
+}
+
+}  // namespace
+}  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/model_flags.proto b/tensorflow/lite/toco/model_flags.proto
index bcdac295d261c0e7cc04c5a8c3e2e5d88736cd88..dfc425073f51333c8ab4bf9d1dc62b0e88b92ce5 100644
--- a/tensorflow/lite/toco/model_flags.proto
+++ b/tensorflow/lite/toco/model_flags.proto
@@ -85,6 +85,7 @@ message RnnState {
   // Will be expanded with 1's to fit the model.
   // TODO(benoitjacob): should allow a generic, explicit shape.
   optional int32 size = 3;
+  optional int32 num_dims = 4;
 }
 
 // An ArraysExtraInfo message stores a collection of additional Information
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index 8a6e82ec46445b5ec5440de129177eae836f8db8..2f5654c56e0acca57a2d644a7c50e87c185f721b 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -1,5 +1,4 @@
 package(default_visibility = [
-    "//tensorflow/contrib/lite:__subpackages__",
     "//tensorflow/lite:__subpackages__",
     "//tensorflow/tools/pip_package:__subpackages__",
 ])
@@ -26,6 +25,7 @@ cc_library(
     deps = [
         "//third_party/python_runtime:headers",
         "//tensorflow/core:lib",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
         "//tensorflow/lite/toco:model_flags_proto_cc",
         "//tensorflow/lite/toco:toco_flags_proto_cc",
         "//tensorflow/lite/toco:toco_graphviz_dump_options",
@@ -47,7 +47,6 @@ tf_py_wrap_cc(
     visibility = [
         "//learning/expander/pod/deep_pod/utils:__subpackages__",
         "//research/handwriting/converters/tflite:__subpackages__",
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/lite/toco/python/toco_from_protos_test.py b/tensorflow/lite/toco/python/toco_from_protos_test.py
index 34cfd2c59fdc3aa3c83728f622fbf5b8d02d7e00..cc0d6f748acb94f8f5c8248bd4cdcc78129e0e17 100644
--- a/tensorflow/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/lite/toco/python/toco_from_protos_test.py
@@ -54,7 +54,7 @@ class TocoFromProtosTest(googletest.TestCase):
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
-    input_array.shape.dims.extend(map(int, in_tensor.get_shape()))
+    input_array.shape.dims.extend(map(int, in_tensor.shape))
     model_flags.output_arrays.append(TensorName(out_tensor))
     # Shell out to run toco (in case it crashes)
     with tempfile.NamedTemporaryFile() as fp_toco, \
diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc
index ce8e3c9df88ba511fcca9d9a256896624194463b..6fad092f35aa386757885f9320f47e9f372e9f47 100644
--- a/tensorflow/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/lite/toco/python/toco_python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/platform/logging.h"
 
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/python/toco_python_api.h"
 #include "tensorflow/lite/toco/toco_flags.pb.h"
@@ -26,14 +27,6 @@ limitations under the License.
 
 namespace toco {
 
-#if PY_MAJOR_VERSION >= 3
-#define TOCO_PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyBytes_FromStringAndSize
-#else
-#define TOCO_PY_TO_CPPSTRING PyString_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyString_FromStringAndSize
-#endif
-
 // NOTE(aselle): We are using raw PyObject's here because we want to make
 // sure we input and output bytes rather than unicode strings for Python3.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
@@ -44,7 +37,7 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   auto ConvertArg = [&](PyObject* obj, bool* error) {
     char* buf;
     Py_ssize_t len;
-    if (TOCO_PY_TO_CPPSTRING(obj, &buf, &len) == -1) {
+    if (::tflite::python_utils::ConvertFromPyString(obj, &buf, &len) == -1) {
       *error = true;
       return std::string();
     } else {
@@ -96,15 +89,15 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
     PyObject* dict = PyDict_New();
     PyDict_SetItemString(
         dict, "flatbuffer",
-        TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                  output_file_contents_txt.size()));
+        ::tflite::python_utils::ConvertToPyString(
+            output_file_contents_txt.data(), output_file_contents_txt.size()));
     PyDict_SetItemString(dict, "arithmetic_ops",
                          PyLong_FromLong(model->ArithmeticOpsCount()));
     return dict;
   }
   // Convert arguments back to byte (py3) or str (py2)
-  return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                   output_file_contents_txt.size());
+  return ::tflite::python_utils::ConvertToPyString(
+      output_file_contents_txt.data(), output_file_contents_txt.size());
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
index fcd9ee45d984f05eabf5d51c223b45433e801308..2f9f9a8c9b0b179c62c9dafb23edc1cdc5f108a3 100644
--- a/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
@@ -186,7 +186,7 @@ void SvdfCluster::MaybeMergeConstNodes(
       allocated_tensor->mutable_tensor_shape();
   auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
   int allocated_content_flat_size = 0;
-  for (int i = 0; i < const_node_parts.size(); i++) {
+  for (size_t i = 0; i < const_node_parts.size(); i++) {
     const auto& value_attr = const_node_parts[i]->attr().at("value");
     const tensorflow::TensorProto& tensor = value_attr.tensor();
     if (i == 0) {
@@ -214,7 +214,7 @@ void SvdfCluster::MaybeMergeConstNodes(
   std::unique_ptr<char[]> allocated_content(
       new char[allocated_content_flat_size]);
   char* content_ptr = allocated_content.get();
-  for (int i = 0; i < const_node_parts.size(); i++) {
+  for (size_t i = 0; i < const_node_parts.size(); i++) {
     const auto& value_attr = const_node_parts[i]->attr().at("value");
     const tensorflow::TensorProto& tensor = value_attr.tensor();
     port::CopyToBuffer(tensor.tensor_content(), content_ptr);
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 53f5ecef872774e83fbcb1abe394a2dbaa189f4c..8b0d38da0688a998ca99d3b50e217a89d1d8fe84 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -63,12 +63,12 @@ bool IsControlFlowOp(const string& tensorflow_op) {
   return false;
 }
 
-// Check if a TensorFlow Op is unsupportred by the Flex runtime.
+// Check if a TensorFlow Op is unsupported by the Flex runtime.
 bool IsUnsupportedFlexOp(const string& tensorflow_op) {
   if (IsControlFlowOp(tensorflow_op)) {
     return true;
   }
-  // `HashTableV2` isn't supported for now since it requires an additinonal
+  // `HashTableV2` isn't supported for now since it requires an additional
   // initialization step.
   // TODO(b/117651199): Support `HashTableV2` with Flex runtime.
   if (tensorflow_op == "HashTableV2") {
@@ -157,7 +157,7 @@ OperatorKey::OperatorKey(
         string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
   } else {
     // If Flex is disabled or the original TensorFlow NodeDef isn't available,
-    // we produce a custom op. This gives developers a chance to implemenr
+    // we produce a custom op. This gives developers a chance to implement
     // custom ops.
     custom_code_ = name;
   }
@@ -222,7 +222,8 @@ Offset<Vector<Offset<Tensor>>> ExportTensors(
 
     std::vector<int> shape;
     if (array.has_shape()) {
-      for (int d : array.shape().dims()) {
+      shape.reserve(array.shape().dims().size());
+      for (const auto& d : array.shape().dims()) {
         shape.push_back(d);
       }
     }
@@ -384,7 +385,7 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
       mutating_input_variables = tflite_op->GetMutatingInputVariables(*op);
 
       if (!mutating_input_variables.empty()) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
+        for (size_t i = 0; i < op->inputs.size(); ++i) {
           if (!mutating_input_variables[i]) {
             continue;
           }
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index 58cfb4987ff67d87d330688cd7bf75ca2eb98ebd..fb640f776abdef3e5a59d075d3bc15e8d0f9565f 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -51,9 +51,27 @@ class ExportTest : public ::testing::Test {
         output_array.data_type = ArrayDataType::kFloat;
         input_model_.operators.emplace_back(op);
       } else if (name == "Add") {
-        input_model_.operators.emplace_back(new AddOperator);
+        auto* op = new AddOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Sub") {
-        input_model_.operators.emplace_back(new SubOperator);
+        auto* op = new SubOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Assert") {
         auto* op = new TensorFlowAssertOperator;
 
@@ -114,7 +132,18 @@ class ExportTest : public ::testing::Test {
       output_array.data_type = ArrayDataType::kFloat;
       input_model_.operators.emplace_back(op);
     }
-    input_model_.operators.emplace_back(new AddOperator);
+    {
+      auto* op = new AddOperator;
+      op->inputs = {"input1", "input2"};
+      op->outputs = {"output"};
+      Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+      Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+      Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+      input1_array.data_type = ArrayDataType::kFloat;
+      input2_array.data_type = ArrayDataType::kFloat;
+      output_array.data_type = ArrayDataType::kFloat;
+      input_model_.operators.emplace_back(op);
+    }
   }
 
   std::vector<string> ExportAndSummarizeOperators(const ExportParams& params) {
@@ -445,7 +474,7 @@ TEST_F(VersionedOpExportTest, Export) {
   auto* model = ::tflite::GetModel(result.data());
   auto operator_codes = model->operator_codes();
 
-  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
+  // Verify that 2 operator codes are populated. Both are CONV_2D but with
   // different versions.
   EXPECT_EQ(2, operator_codes->size());
   EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
diff --git a/tensorflow/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
index 93ab5141abe81c4ed4c1ff0ac7ca5e89577c71fb..b00c4124d83ae558b4aa6f5ecc2ba9eb06e5dac0 100644
--- a/tensorflow/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -60,7 +60,7 @@ class ImportTest : public ::testing::Test {
                                builder_.CreateString("tensor_one"), q);
     auto t2 =
         ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
-                               ::tflite::TensorType_FLOAT32, 2,
+                               ::tflite::TensorType_FLOAT32, 0,
                                builder_.CreateString("tensor_two"), q);
     return builder_.CreateVector(
         std::vector<Offset<::tflite::Tensor>>({t1, t2}));
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 088673fd95460519d655ff86861bbee24ecf2d75..2919f81571a2c23dce09476268769aa20e07a537 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
+
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -108,6 +109,12 @@ class Convolution
     const Array& input_array = op_signature.model->GetArray(input_name);
     const Array& filter_array = op_signature.model->GetArray(filter_name);
     const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op has signed int8 inputs and outputs, its version 3.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        filter_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 3;
+    }
     // If the op is a signed int8 hybrid operation, we need to return
     // version 2.
     if (input_array.data_type == ArrayDataType::kFloat &&
@@ -153,6 +160,18 @@ class DepthwiseConvolution
   int GetVersion(const OperatorSignature& op_signature) const override {
     const auto& conv_op =
         static_cast<const DepthwiseConvOperator&>(*op_signature.op);
+    const string& input_name = op_signature.op->inputs[0];
+    const string& filter_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& filter_array = op_signature.model->GetArray(filter_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op has signed int8 inputs and outputs, its version 3.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        filter_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 3;
+    }
     if (conv_op.dilation_width_factor != 1 ||
         conv_op.dilation_height_factor != 1) {
       return 2;
@@ -180,6 +199,31 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class AddN : public BuiltinOperator<AddNOperator, ::tflite::AddNOptions,
+                                    ::tflite::BuiltinOptions_AddNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateAddNOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
@@ -202,6 +246,12 @@ class SpaceToBatchND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -226,6 +276,12 @@ class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -271,6 +327,12 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -316,6 +378,12 @@ class Concatenation
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -409,18 +477,29 @@ class FullyConnected
     }
   }
 
+  // +-----------------+--------------------+--------------------------+
+  // |                 |    Weight::Default | Weight::Shuffled4x16Int8 |
+  // +-----------------+--------------------+--------------------------+
+  // | Float           |                  1 |                        2 |
+  // | Quantized Uint8 |                  1 |                        2 |
+  // | Hybrid          |                  3 |                        3 |
+  // | Quantized Int8  |                  4 |                        4 |
+  // +-----------------+--------------------+--------------------------+
   int GetVersion(const OperatorSignature& op_signature) const override {
     const auto& fc_op =
         static_cast<const FullyConnectedOperator&>(*op_signature.op);
-    if (fc_op.weights_format == FullyConnectedWeightsFormat::kDefault) {
-      return 1;
-    }
     const string& input_name = op_signature.op->inputs[0];
     const string& weights_name = op_signature.op->inputs[1];
     const string& output_name = op_signature.op->outputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     const Array& weights_array = op_signature.model->GetArray(weights_name);
     const Array& output_array = op_signature.model->GetArray(output_name);
+    // Int8 fully fixed point kernel is at version 4.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 4;
+    }
     // If the op is a signed int8 hybrid operation, we need to return
     // version 3.
     if (input_array.data_type == ArrayDataType::kFloat &&
@@ -428,7 +507,15 @@ class FullyConnected
         output_array.data_type == ArrayDataType::kFloat) {
       return 3;
     }
-    return 2;
+    // For float and uint8 fixed point kernels, if the weight is
+    // Shuffled4x16Int8, is is version 2.
+    if (fc_op.weights_format ==
+        FullyConnectedWeightsFormat::kShuffled4x16Int8) {
+      return 2;
+    }
+
+    // Otherwise (weight is default), the version is 1.
+    return 1;
   }
 };
 
@@ -448,6 +535,32 @@ class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
     op->axis = {options.axis()};
   }
 
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GatherNd
+    : public BuiltinOperator<GatherNdOperator, ::tflite::GatherNdOptions,
+                             ::tflite::BuiltinOptions_GatherNdOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateGatherNdOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
   int GetVersion(const OperatorSignature& op_signature) const override {
     return 1;
   }
@@ -511,6 +624,12 @@ class L2Normalization
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // Version 2 supports signed int8 input types.
+    if (output_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -600,6 +719,39 @@ class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Maximum : public SimpleOperator<TensorFlowMaximumOperator> {
+ public:
+  explicit Maximum() : SimpleOperator("MAXIMUM", OperatorType::kMaximum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Minimum : public SimpleOperator<TensorFlowMinimumOperator> {
+ public:
+  explicit Minimum() : SimpleOperator("MINIMUM", OperatorType::kMinimum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -624,6 +776,12 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -643,6 +801,12 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -680,6 +844,12 @@ class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -753,6 +923,12 @@ class SpaceToDepth
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -772,6 +948,12 @@ class Transpose
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -783,7 +965,7 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    ::tflite::LSTMKernelType kernel_type;
+    ::tflite::LSTMKernelType kernel_type = ::tflite::LSTMKernelType_FULL;
     switch (op.kernel_type) {
       case LstmCellOperator::KERNEL_BASIC:
         kernel_type = ::tflite::LSTMKernelType_BASIC;
@@ -791,6 +973,8 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
       case LstmCellOperator::KERNEL_FULL:
         kernel_type = ::tflite::LSTMKernelType_FULL;
         break;
+      default:
+        return -1;
     }
 
     // Current toco converter only supports tanh, no clip.
@@ -915,6 +1099,94 @@ class UnidirectionalSequenceLstm
   }
 };
 
+class BidirectionalSequenceLstm
+    : public BuiltinOperator<
+          BidirectionalSequenceLstmOperator,
+          ::tflite::BidirectionalSequenceLSTMOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceLSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceLSTMOptions(
+        *builder, /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*cell_clip=*/0.0,
+        /*proj_clip=*/0.0,
+        /*merge_outputs=*/op.merge_outputs,
+        /*time_major=*/true);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward input activation state.
+    mutating_input_variables[35] = true;
+    // Forward input cell state.
+    mutating_input_variables[36] = true;
+    // Backward input activation state.
+    mutating_input_variables[37] = true;
+    // Backward input cell state.
+    mutating_input_variables[38] = true;
+    return mutating_input_variables;
+  }
+};
+
+class BidirectionalSequenceRnn
+    : public BuiltinOperator<
+          BidirectionalSequenceRnnOperator,
+          ::tflite::BidirectionalSequenceRNNOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceRNNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceRNNOptions(
+        *builder, /*time_major=*/true,
+        /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*merge_outputs=*/op.merge_outputs);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward hidden state.
+    mutating_input_variables[4] = true;
+    // Backward hidden state.
+    mutating_input_variables[8] = true;
+    return mutating_input_variables;
+  }
+};
+
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::ReducerOptions,
                                     ::tflite::BuiltinOptions_ReducerOptions> {
  public:
@@ -973,6 +1245,12 @@ class ReduceMax
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -994,6 +1272,12 @@ class ReduceMin
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1040,6 +1324,20 @@ class ReduceAny
   }
 };
 
+class Relu6 : public SimpleOperator<Relu6Operator> {
+ public:
+  explicit Relu6() : SimpleOperator("RELU6", OperatorType::kRelu6) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class ResizeBilinear
     : public BuiltinOperator<ResizeBilinearOperator,
                              ::tflite::ResizeBilinearOptions,
@@ -1058,6 +1356,12 @@ class ResizeBilinear
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op takes int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1081,6 +1385,12 @@ class ResizeNearestNeighbor
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1128,6 +1438,14 @@ class Split
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2, for int32 it's version 3.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    } else if (input_array.data_type == ArrayDataType::kInt32) {
+      return 3;
+    }
     return 1;
   }
 };
@@ -1178,6 +1496,12 @@ class StridedSlice
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1196,6 +1520,11 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1217,6 +1546,12 @@ class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
     return 1;
   }
 };
@@ -1238,6 +1573,12 @@ class ArgMin : public BuiltinOperator<ArgMinOperator, ::tflite::ArgMinOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
     return 1;
   }
 };
@@ -1330,6 +1671,12 @@ class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1356,6 +1703,34 @@ class Shape
   }
 };
 
+class Slice : public SimpleOperator<SliceOperator> {
+ public:
+  explicit Slice() : SimpleOperator("SLICE", OperatorType::kSlice) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Tanh : public SimpleOperator<TanhOperator> {
+ public:
+  explicit Tanh() : SimpleOperator("TANH", OperatorType::kTanh) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
                                       ::tflite::BuiltinOptions_OneHotOptions> {
  public:
@@ -1438,6 +1813,35 @@ class LeakyRelu
   }
 };
 
+class Logistic : public SimpleOperator<LogisticOperator> {
+ public:
+  explicit Logistic() : SimpleOperator("LOGISTIC", OperatorType::kLogistic) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LogSoftmax : public SimpleOperator<LogSoftmaxOperator> {
+ public:
+  explicit LogSoftmax()
+      : SimpleOperator("LOG_SOFTMAX", OperatorType::kLogSoftmax) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 class SquaredDifference
     : public BuiltinOperator<
           SquaredDifferenceOperator, ::tflite::SquaredDifferenceOptions,
@@ -1525,7 +1929,7 @@ class UnidirectionalSequenceRnn
   }
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    // Only support tanh actication, so check that tflite type is tanh.
+    // Only support tanh activation, so check that tflite type is tanh.
     DCHECK(options.fused_activation_function() ==
            ::tflite::ActivationFunctionType_TANH);
   }
@@ -1542,6 +1946,25 @@ class UnidirectionalSequenceRnn
   }
 };
 
+class Where : public BuiltinOperator<WhereOperator, ::tflite::WhereOptions,
+                                     ::tflite::BuiltinOptions_WhereOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateWhereOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
     const string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
@@ -1640,7 +2063,14 @@ class TensorFlowUnsupported : public BaseOperator {
           has_valid_attr = true;
           break;
         case tensorflow::AttrValue::kList:
-          if (attr.list().i_size() > 0) {
+          if (attr.list().s_size() > 0) {
+            auto start = fbb->StartVector(key);
+            for (const string& v : attr.list().s()) {
+              fbb->Add(v);
+            }
+            fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
+            has_valid_attr = true;
+          } else if (attr.list().i_size() > 0) {
             auto start = fbb->StartVector(key);
             for (const int64_t v : attr.list().i()) {
               fbb->Add(v);
@@ -1722,6 +2152,14 @@ class TensorFlowUnsupported : public BaseOperator {
           }
           break;
         }
+        case 15: {  // flexbuffers::FBT_VECTOR_STRING: {
+          auto* list = (*attr)[key].mutable_list();
+          const auto& vector = value.AsTypedVector();
+          for (size_t i = 0; i < vector.size(); i++) {
+            list->add_s(vector[i].AsString().str());
+          }
+          break;
+        }
         default:
           LOG(WARNING) << "Ignoring unsupported attribute type with key '"
                        << key << "'";
@@ -1767,6 +2205,131 @@ class Dequantize
   }
 };
 
+class ReverseSequence
+    : public BuiltinOperator<ReverseSequenceOperator,
+                             ::tflite::ReverseSequenceOptions,
+                             ::tflite::BuiltinOptions_ReverseSequenceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateReverseSequenceOptions(*builder, op.seq_dim,
+                                                  op.batch_dim);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->seq_dim = options.seq_dim();
+    op->batch_dim = options.batch_dim();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class Equal : public SimpleOperator<TensorFlowEqualOperator> {
+ public:
+  explicit Equal() : SimpleOperator("EQUAL", OperatorType::kEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class NotEqual : public SimpleOperator<TensorFlowNotEqualOperator> {
+ public:
+  explicit NotEqual() : SimpleOperator("NOT_EQUAL", OperatorType::kNotEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Greater : public SimpleOperator<TensorFlowGreaterOperator> {
+ public:
+  explicit Greater() : SimpleOperator("GREATER", OperatorType::kGreater) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GreaterEqual : public SimpleOperator<TensorFlowGreaterEqualOperator> {
+ public:
+  explicit GreaterEqual()
+      : SimpleOperator("GREATER_EQUAL", OperatorType::kGreaterEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Less : public SimpleOperator<TensorFlowLessOperator> {
+ public:
+  explicit Less() : SimpleOperator("LESS", OperatorType::kLess) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LessEqual : public SimpleOperator<TensorFlowLessEqualOperator> {
+ public:
+  explicit LessEqual()
+      : SimpleOperator("LESS_EQUAL", OperatorType::kLessEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Select : public SimpleOperator<SelectOperator> {
+ public:
+  explicit Select() : SimpleOperator("SELECT", OperatorType::kSelect) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
@@ -1776,6 +2339,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // Builtin Operators.
   ops.push_back(
       MakeUnique<Add>(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.push_back(
+      MakeUnique<AddN>(::tflite::BuiltinOperator_ADD_N, OperatorType::kAddN));
   ops.push_back(
       MakeUnique<Div>(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
   ops.push_back(
@@ -1802,6 +2367,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                  OperatorType::kFullyConnected));
   ops.push_back(MakeUnique<Gather>(::tflite::BuiltinOperator_GATHER,
                                    OperatorType::kGather));
+  ops.push_back(MakeUnique<GatherNd>(::tflite::BuiltinOperator_GATHER_ND,
+                                     OperatorType::kGatherNd));
   ops.push_back(
       MakeUnique<L2Normalization>(::tflite::BuiltinOperator_L2_NORMALIZATION,
                                   OperatorType::kL2Normalization));
@@ -1882,6 +2449,12 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.emplace_back(MakeUnique<UnidirectionalSequenceLstm>(
       ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
       OperatorType::kUnidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceLstm>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+      OperatorType::kBidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceRnn>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+      OperatorType::kBidirectionalSequenceRnn));
   ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
@@ -1898,6 +2471,11 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(MakeUnique<UnidirectionalSequenceRnn>(
       ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
       OperatorType::kUnidirectionalSequenceRnn));
+  ops.push_back(
+      MakeUnique<Where>(::tflite::BuiltinOperator_WHERE, OperatorType::kWhere));
+  ops.push_back(
+      MakeUnique<ReverseSequence>(::tflite::BuiltinOperator_REVERSE_SEQUENCE,
+                                  OperatorType::kReverseSequence));
 
   // Custom Operators.
   ops.push_back(
@@ -1915,44 +2493,36 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // builtins.
   ops.push_back(
       MakeUnique<SimpleOperator<FloorOperator>>("FLOOR", OperatorType::kFloor));
+  ops.push_back(
+      MakeUnique<SimpleOperator<CeilOperator>>("CEIL", OperatorType::kCeil));
+  ops.push_back(
+      MakeUnique<SimpleOperator<EluOperator>>("ELU", OperatorType::kElu));
   ops.push_back(
       MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
   ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
       "RELU_N1_TO_1", OperatorType::kRelu1));
-  ops.push_back(
-      MakeUnique<SimpleOperator<Relu6Operator>>("RELU6", OperatorType::kRelu6));
+  ops.push_back(MakeUnique<Relu6>());
   ops.push_back(
       MakeUnique<SimpleOperator<PReluOperator>>("PRELU", OperatorType::kPRelu));
-  ops.push_back(MakeUnique<SimpleOperator<LogisticOperator>>(
-      "LOGISTIC", OperatorType::kLogistic));
-  ops.push_back(
-      MakeUnique<SimpleOperator<TanhOperator>>("TANH", OperatorType::kTanh));
+  ops.push_back(MakeUnique<Logistic>());
+  ops.push_back(MakeUnique<Tanh>());
   ops.push_back(
       MakeUnique<SimpleOperator<ExpOperator>>("EXP", OperatorType::kExp));
-  ops.push_back(MakeUnique<SimpleOperator<LogSoftmaxOperator>>(
-      "LOG_SOFTMAX", OperatorType::kLogSoftmax));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMaximumOperator>>(
-      "MAXIMUM", OperatorType::kMaximum));  //  Element-wise Maximum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMinimumOperator>>(
-      "MINIMUM", OperatorType::kMinimum));  //  Element-wise Minimum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterOperator>>(
-      "GREATER", OperatorType::kGreater));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterEqualOperator>>(
-      "GREATER_EQUAL", OperatorType::kGreaterEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessOperator>>(
-      "LESS", OperatorType::kLess));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessEqualOperator>>(
-      "LESS_EQUAL", OperatorType::kLessEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowEqualOperator>>(
-      "EQUAL", OperatorType::kEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowNotEqualOperator>>(
-      "NOT_EQUAL", OperatorType::kNotEqual));
   ops.push_back(
-      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
-  ops.push_back(MakeUnique<SimpleOperator<SelectOperator>>(
-      "SELECT", OperatorType::kSelect));
+      MakeUnique<SimpleOperator<CosOperator>>("COS", OperatorType::kCos));
+  ops.push_back(MakeUnique<LogSoftmax>());
+  ops.push_back(MakeUnique<Maximum>());  //  Element-wise Maximum
+  ops.push_back(MakeUnique<Minimum>());  //  Element-wise Minimum
+  ops.push_back(MakeUnique<Greater>());
+  ops.push_back(MakeUnique<GreaterEqual>());
+  ops.push_back(MakeUnique<Less>());
+  ops.push_back(MakeUnique<LessEqual>());
+  ops.push_back(MakeUnique<Equal>());
+  ops.push_back(MakeUnique<NotEqual>());
   ops.push_back(
-      MakeUnique<SimpleOperator<SliceOperator>>("SLICE", OperatorType::kSlice));
+      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
+  ops.push_back(MakeUnique<Select>());
+  ops.push_back(MakeUnique<Slice>());
   ops.push_back(
       MakeUnique<SimpleOperator<PowOperator>>("POW", OperatorType::kPow));
   ops.push_back(MakeUnique<SimpleOperator<LogicalOrOperator>>(
@@ -1984,6 +2554,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
   ops.push_back(
       MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
+  ops.push_back(MakeUnique<SimpleOperator<ReverseV2Operator>>(
+      "REVERSE_V2", OperatorType::kReverseV2));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowRankOperator>>(
+      "RANK", OperatorType::kRank));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index f77780488ac318b973e8a531fa0194a8401e9da1..1b13f8076a0655577724934ae918f14c98de1cb2 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -112,12 +112,15 @@ class OperatorTest : public ::testing::Test {
 
 TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
+  CheckSimpleOperator<CeilOperator>("CEIL", OperatorType::kCeil);
+  CheckSimpleOperator<EluOperator>("ELU", OperatorType::kElu);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
   CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
+  CheckSimpleOperator<CosOperator>("COS", OperatorType::kCos);
   CheckSimpleOperator<LogSoftmaxOperator>("LOG_SOFTMAX",
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
@@ -150,6 +153,9 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
   CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
   CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
+  CheckSimpleOperator<ReverseV2Operator>("REVERSE_V2",
+                                         OperatorType::kReverseV2);
+  CheckSimpleOperator<TensorFlowRankOperator>("RANK", OperatorType::kRank);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -161,6 +167,13 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinAddN) {
+  AddNOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("ADD_N", OperatorType::kAddN), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinReducerOps) {
   CheckReducerOperator<MeanOperator>("MEAN", OperatorType::kMean);
   CheckReducerOperator<TensorFlowSumOperator>("SUM", OperatorType::kSum);
@@ -229,6 +242,20 @@ TEST_F(OperatorTest, BuiltinGather) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinGatherNd) {
+  GatherNdOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("GATHER_ND", OperatorType::kGatherNd), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
+TEST_F(OperatorTest, BuiltinWhere) {
+  WhereOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("WHERE", OperatorType::kWhere), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinL2Pool) {
   L2PoolOperator op;
   op.stride_width = 123;
@@ -569,6 +596,14 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   (*attr)["str_attr"].set_s("Hello World");
   (*attr)["int_attr"].set_i(17);
   (*attr)["bool_attr"].set_b(true);
+  {
+    auto* list = (*attr)["list_string_attr"].mutable_list();
+    list->add_s("abcde");
+    list->add_s("1234");
+    list->add_s("");
+    list->add_s("zyxwv");
+    list->add_s("!-.");
+  }
   {
     auto* list = (*attr)["list_float_attr"].mutable_list();
     list->add_f(std::numeric_limits<float>::min());
@@ -594,6 +629,15 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   EXPECT_EQ("Hello World", output_attr.at("str_attr").s());
   EXPECT_EQ(17, output_attr.at("int_attr").i());
   EXPECT_EQ(true, output_attr.at("bool_attr").b());
+  {
+    const auto& list = output_attr.at("list_string_attr").list();
+    ASSERT_EQ(5, list.s_size());
+    EXPECT_EQ("abcde", list.s(0));
+    EXPECT_EQ("1234", list.s(1));
+    EXPECT_EQ("", list.s(2));
+    EXPECT_EQ("zyxwv", list.s(3));
+    EXPECT_EQ("!-.", list.s(4));
+  }
   {
     const auto& list = output_attr.at("list_float_attr").list();
     ASSERT_EQ(3, list.f_size());
@@ -650,6 +694,194 @@ TEST_F(OperatorTest, BuiltinUnique) {
   EXPECT_EQ(output_toco_op->idx_out_type, op.idx_out_type);
 }
 
+TEST_F(OperatorTest, BuiltinReverseSequence) {
+  ReverseSequenceOperator op;
+  op.seq_dim = 3;
+  op.batch_dim = 1;
+  std::unique_ptr<toco::ReverseSequenceOperator> output_toco_op =
+      SerializeAndDeserialize(
+          GetOperator("REVERSE_SEQUENCE", OperatorType::kReverseSequence), op);
+  EXPECT_EQ(op.seq_dim, output_toco_op->seq_dim);
+  EXPECT_EQ(op.batch_dim, output_toco_op->batch_dim);
+}
+
+// Test version for a simple Op with 2 versions and the input type controls the
+// version.
+template <typename Op>
+void SimpleVersioningTest() {
+  Op op;
+  op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.op = &op, .model = &uint8_model};
+  EXPECT_EQ(base_op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.op = &op, .model = &int8_model};
+  EXPECT_EQ(base_op->GetVersion(int8_signature), 2);
+}
+
+// Test version for a simple Op with 2 versions and the output type controls the
+// version.
+template <typename Op>
+void SimpleOutputVersioningTest() {
+  Op op;
+  op.outputs = {"output1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(op.outputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.op = &op, .model = &uint8_model};
+  EXPECT_EQ(base_op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(op.outputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.op = &op, .model = &int8_model};
+  EXPECT_EQ(base_op->GetVersion(int8_signature), 2);
+}
+
+TEST_F(OperatorTest, VersioningEqualTest) {
+  SimpleVersioningTest<TensorFlowEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningNotEqualTest) {
+  SimpleVersioningTest<TensorFlowNotEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessTest) {
+  SimpleVersioningTest<TensorFlowLessOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessEqualTest) {
+  SimpleVersioningTest<TensorFlowLessEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterTest) {
+  SimpleVersioningTest<TensorFlowGreaterOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterEqualTest) {
+  SimpleVersioningTest<TensorFlowGreaterEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToBatchNDTest) {
+  SimpleVersioningTest<SpaceToBatchNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogSoftmaxTest) {
+  SimpleVersioningTest<LogSoftmaxOperator>();
+}
+
+TEST_F(OperatorTest, VersioningPackTest) {
+  SimpleVersioningTest<PackOperator>();
+}
+
+TEST_F(OperatorTest, VersioningBatchToSpaceNDTest) {
+  SimpleVersioningTest<BatchToSpaceNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningTanhTest) {
+  SimpleVersioningTest<TanhOperator>();
+}
+
+TEST_F(OperatorTest, VersioningStridedSliceTest) {
+  SimpleVersioningTest<StridedSliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToDepthTest) {
+  SimpleVersioningTest<SpaceToDepthOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSliceTest) {
+  SimpleVersioningTest<SliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogisticTest) {
+  SimpleVersioningTest<LogisticOperator>();
+}
+
+TEST_F(OperatorTest, VersioningL2NormTest) {
+  SimpleOutputVersioningTest<L2NormalizationOperator>();
+}
+
+TEST_F(OperatorTest, VersioningMaxTest) {
+  SimpleVersioningTest<TensorFlowMaximumOperator>();
+}
+
+TEST_F(OperatorTest, VersioningMinTest) {
+  SimpleVersioningTest<TensorFlowMinimumOperator>();
+}
+
+TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
+
+TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
+
+TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadV2Test) {
+  SimpleVersioningTest<PadV2Operator>();
+}
+
+TEST_F(OperatorTest, VersioningConcatenationTest) {
+  SimpleVersioningTest<ConcatenationOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSelectTest) {
+  SimpleVersioningTest<SelectOperator>();
+}
+
+TEST_F(OperatorTest, VersioningRelu6Test) {
+  SimpleVersioningTest<Relu6Operator>();
+}
+
+TEST_F(OperatorTest, VersioningFullyConnectedTest) {
+  FullyConnectedOperator fully_connected_op;
+  fully_connected_op.inputs = {"input", "weight"};
+  fully_connected_op.outputs = {"output"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op =
+      operator_by_type_map.at(fully_connected_op.type).get();
+
+  Model uint8_model;
+  Array& input_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& weight_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& output_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.op = &fully_connected_op,
+                                       .model = &uint8_model};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& input_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_int8_array.data_type = ArrayDataType::kInt8;
+  Array& weight_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_int8_array.data_type = ArrayDataType::kInt8;
+  Array& output_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.op = &fully_connected_op,
+                                      .model = &int8_model};
+  EXPECT_EQ(op->GetVersion(int8_signature), 4);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/types.cc b/tensorflow/lite/toco/tflite/types.cc
index f878dafc1ed3c85197e6b161290ab4da548090f5..96cad557baf24112cc43bd4a6a6170fbc3a8cae2 100644
--- a/tensorflow/lite/toco/tflite/types.cc
+++ b/tensorflow/lite/toco/tflite/types.cc
@@ -37,7 +37,7 @@ DataBuffer::FlatBufferOffset CopyStringToBuffer(
 }
 
 // vector<bool> may be implemented using a bit-set, so we can't just
-// reinterpret_cast, accesing it data as vector<bool> and let flatbuffer
+// reinterpret_cast, accessing its data as vector<bool> and let flatbuffer
 // CreateVector handle it.
 // Background: https://isocpp.org/blog/2012/11/on-vectorbool
 DataBuffer::FlatBufferOffset CopyBoolToBuffer(
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index 3611c5d2f2c87ef382dc4a94e8d6641817bdcea2..1b337ebc85f627b2ee90824cacd2a1f9a090428c 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -68,6 +68,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "BroadcastArgs",
           "BroadcastGradientArgs",
           "Cast",
+          "Ceil",
           "CheckNumerics",
           "ComplexAbs",
           "Concat",
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
index 4a3d6a5848751f4c1d526153bd6f6d08a9f882af..aa7e43350caca295e027a433da1d96af76bb6686 100644
--- a/tensorflow/lite/toco/toco.cc
+++ b/tensorflow/lite/toco/toco.cc
@@ -49,5 +49,10 @@ int main(int argc, char** argv) {
   }
   toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
   auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
-  return status.ok() ? 0 : -1;
+  if (!status.ok()) {
+    fprintf(stderr, "%s\n", status.error_message().c_str());
+    fflush(stderr);
+    return 1;
+  }
+  return 0;
 }
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
index 28e7b10ecd056815c8ca6d7a74f324a18d307451..2adfc1dd236bfe3ba8ee1de70e0dbdba08d9f283 100644
--- a/tensorflow/lite/toco/toco_convert.cc
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -77,7 +77,7 @@ tensorflow::Status Convert(const string& graph_def_contents,
                            string* output_file_contents) {
   std::unique_ptr<Model> model =
       Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
+  TF_RETURN_IF_ERROR(TransformWithStatus(toco_flags, model.get()));
   return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
                 output_file_contents);
 }
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 55a454e66de4d0afce18421450d875911bea01f4..c66ef1db915b0b055982c06e24a9706b1943c804 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -178,6 +178,23 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
       // Ignore non-real data types.
       continue;
     }
+    // The enum value QUANTIZED_UINT8 for --inference_type and
+    // --inference_input_type has long meant just 'QUANTIZED', being used as
+    // well in mixed 8-bit / 16-bit quantized models. However,
+    // ConvertIODataTypeToArrayDataType still interpretes it as meaning 8bit,
+    // and people have run into issues in the situation where they have an
+    // already mixed 8-bit / 16-bit quantized model in TFLITE format and
+    // want to run it again through toco, without having to re-specify all the
+    // extra array info that was used in the (complicated) process of initially
+    // quantizing that model. In order to have --inference_type=QUANTIZED_UINT8
+    // just work in that case, we implement the logic that when an array is
+    // already quantized, if  --inference_type is quantized (so we're not
+    // asking to dequantize here), no change of quantized data type is to be
+    // recorded.
+    if (array->data_type != toco::ArrayDataType::kFloat &&
+        type != toco::ArrayDataType::kFloat) {
+      continue;
+    }
 
     array->final_data_type = type;
   }
@@ -219,7 +236,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
   return model;
 }
 
-void Transform(const TocoFlags& toco_flags, Model* model) {
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -241,8 +259,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
   // and BatchNorm layers.
-  RunGraphTransformations(model, "Removing unused ops",
-                          {new toco::RemoveUnusedOp});
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Removing unused ops", {new toco::RemoveUnusedOp}));
 
   GraphTransformationsSet transformations;
   MakeGeneralGraphTransformationsSet(&transformations);
@@ -290,22 +308,36 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     identify_dilated_conv->set_identify_depthwise_conv(false);
   }
   transformations.Add(identify_dilated_conv);
-  RunGraphTransformations(model, "general graph transformations",
-                          transformations);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "general graph transformations", transformations));
 
   if (quantize_output) {
     if (toco_flags.propagate_fake_quant_num_bits()) {
-      RunGraphTransformations(model,
-                              "fake quant propagation graph transformations",
-                              {new PropagateFakeQuantNumBits});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "fake quant propagation graph transformations",
+          {new PropagateFakeQuantNumBits}));
     }
-    RunGraphTransformations(model, "pre-quantization graph transformations",
-                            {
-                                new HardcodeMinMax,
-                                new DropFakeQuant,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "pre-quantization graph transformations",
+        {
+            new HardcodeMinMax,
+            new DropFakeQuant,
+        }));
   }
 
+  // Try to merge bidirectional sequence lstm or rnn if present.
+  GraphTransformationsSet bidirectional_transformations;
+  bidirectional_transformations.Add(new RemoveUnusedOp);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceLstm);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceRnn);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceRnn);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceLstm);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Group bidirectional sequence lstm/rnn",
+      bidirectional_transformations));
+
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
@@ -332,12 +364,12 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
           toco_flags.default_int16_ranges_max());
     }
     if (propagate_default_min_max->has_any_ranges_defined()) {
-      RunGraphTransformations(
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
           model, "default min-max range propagation graph transformations",
           {
               propagate_default_min_max.release(),
               new HardcodeMinMax,
-          });
+          }));
     }
 
     CheckIsReadyForQuantization(*model);
@@ -347,17 +379,18 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
     ensure_safe_for_int8_kernels->set_has_default_ranges_flag(
         has_default_ranges_flag);
-    RunGraphTransformations(model, "quantization graph transformations",
-                            {
-                                new RemoveTrivialQuantizedActivationFunc,
-                                new RemoveTrivialQuantizedMinMax,
-                                new Quantize,
-                                new RemoveFinalDequantizeOp,
-                                ensure_safe_for_int8_kernels,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "quantization graph transformations",
+        {
+            new RemoveTrivialQuantizedActivationFunc,
+            new RemoveTrivialQuantizedMinMax,
+            new Quantize,
+            new RemoveFinalDequantizeOp,
+            ensure_safe_for_int8_kernels,
+        }));
     if (SupportsShuffledFCWeights(output_format)) {
-      RunGraphTransformations(model, "shuffling of FC weights",
-                              {new ShuffleFCWeights});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "shuffling of FC weights", {new ShuffleFCWeights}));
     }
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
@@ -367,8 +400,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
       dequantization_transformations.Add(new DropFakeQuant);
     }
 
-    RunGraphTransformations(model, "dequantization graph transformations",
-                            dequantization_transformations);
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "dequantization graph transformations",
+        dequantization_transformations));
   }
 
   if (output_format == TENSORFLOW_GRAPHDEF) {
@@ -400,6 +434,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
               << " billion (note that a multiply-add is counted as 2 ops).";
   }
   model->ops_count = ops_count;
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
@@ -423,7 +458,7 @@ tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
       return status;
     } break;
     case GRAPHVIZ_DOT:
-      DumpGraphviz(model, output_file_contents);
+      DumpGraphviz(model, output_file_contents, "Computation Graph");
       break;
     default:
       LOG(FATAL) << "Unhandled output_format='"
diff --git a/tensorflow/lite/toco/toco_tooling.h b/tensorflow/lite/toco/toco_tooling.h
index 742e3769269859c62522707ba415cd509e8df629..369961519499027ee4e3b04e4ebee6aadfd7c21c 100644
--- a/tensorflow/lite/toco/toco_tooling.h
+++ b/tensorflow/lite/toco/toco_tooling.h
@@ -31,7 +31,12 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 
 // Transforms a Model. The resulting Model is ready to be passed
 // to Export with the exact same toco_flags.
-void Transform(const TocoFlags& toco_flags, Model* model);
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model);
+inline void Transform(const TocoFlags& toco_flags, Model* model) {
+  auto s = TransformWithStatus(toco_flags, model);
+  CHECK(s.ok()) << s.error_message();
+}
 
 // Exports the Model, which must be of the 'lowered' form returned by
 // Transform, to a file of the format given by
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index 2c98ff1d929c9d17ceb8d74a22461b5f484d7be9..ca2477fed1a17b3c8999956d6c21bedefa3b4d8e 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -27,11 +27,11 @@ limitations under the License.
 #include "absl/strings/str_replace.h"
 #include "absl/strings/str_split.h"
 #include "re2/re2.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/dump_graphviz.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
@@ -66,29 +66,29 @@ string LogName(const Operator& op) {
 string ArrayDataTypeName(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kFloat:
-      return "Float";
+      return "float";
     case ArrayDataType::kInt8:
-      return "Int8";
+      return "int8";
     case ArrayDataType::kUint8:
-      return "Uint8";
+      return "uint8";
     case ArrayDataType::kInt16:
-      return "Int16";
+      return "int16";
     case ArrayDataType::kUint16:
-      return "Uint16";
+      return "uint16";
     case ArrayDataType::kInt32:
-      return "Int32";
+      return "int32";
     case ArrayDataType::kUint32:
-      return "Uint32";
+      return "uint32";
     case ArrayDataType::kInt64:
-      return "Int64";
+      return "int64";
     case ArrayDataType::kUint64:
-      return "Uint64";
+      return "uint64";
     case ArrayDataType::kString:
-      return "String";
+      return "string";
     case ArrayDataType::kBool:
-      return "Bool";
+      return "bool";
     case ArrayDataType::kComplex64:
-      return "Complex64";
+      return "complex64";
     case ArrayDataType::kNone:
       return "None";
     default:
@@ -331,6 +331,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
     HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
+    HANDLE_OPERATORTYPENAME_CASE(Elu)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
@@ -385,7 +386,9 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ConcatV2)
     HANDLE_OPERATORTYPENAME_CASE(Cast)
     HANDLE_OPERATORTYPENAME_CASE(Floor)
+    HANDLE_OPERATORTYPENAME_CASE(Ceil)
     HANDLE_OPERATORTYPENAME_CASE(Gather)
+    HANDLE_OPERATORTYPENAME_CASE(GatherNd)
     HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
     HANDLE_OPERATORTYPENAME_CASE(SpaceToBatchND)
     HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
@@ -412,12 +415,18 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Unpack)
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceRnn)
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
     HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
     HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
     HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
     HANDLE_OPERATORTYPENAME_CASE(Unique)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceRnn)
+    HANDLE_OPERATORTYPENAME_CASE(ReverseV2)
+    HANDLE_OPERATORTYPENAME_CASE(Cos)
+    HANDLE_OPERATORTYPENAME_CASE(Where)
+    HANDLE_OPERATORTYPENAME_CASE(ReverseSequence)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -534,7 +543,8 @@ void DumpGraphvizVideoFrame(const Model& model) {
   static int dump_id = 0;
   static std::unordered_set<std::size_t> dump_hashes;
   string graphviz_dump;
-  DumpGraphviz(model, &graphviz_dump);
+  DumpGraphviz(model, &graphviz_dump,
+               toco::port::StringF("VIDEO frame:%05d", dump_id));
   std::size_t hash = std::hash<string>{}(graphviz_dump);
   if (!dump_hashes.count(hash)) {
     LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
@@ -557,7 +567,7 @@ void LogDump(int log_level, const string& message, const Model& model) {
   if (!dump_options.dump_graphviz.empty()) {
     string graphviz_dump;
 
-    DumpGraphviz(model, &graphviz_dump);
+    DumpGraphviz(model, &graphviz_dump, message);
     const auto result = port::file::SetContents(
         port::file::JoinPath(
             dump_options.dump_graphviz,
@@ -895,11 +905,6 @@ void CheckNonExistentIOArrays(const Model& model) {
   static constexpr char general_comment[] =
       "Is it a typo? To silence this message, pass this flag:  "
       "allow_nonexistent_arrays";
-  for (const auto& input_array : model.flags.input_arrays()) {
-    QCHECK(GetOpWithInput(model, input_array.name()))
-        << "Specified input array \"" << input_array.name()
-        << "\" is not consumed by any op in this graph. " << general_comment;
-  }
   for (const string& output_array : model.flags.output_arrays()) {
     if (IsConstantParameterArray(model, output_array)) {
       continue;  // It is OK to request that a constant be an output.
@@ -1092,7 +1097,7 @@ void FixOperatorOrdering(Model* model) {
   std::unordered_map<string, string> reason_why_leftover;
   while (true) {
     bool inserted_something = false;
-    for (auto i : remaining) {
+    for (const auto& i : remaining) {
       bool can_insert = true;
       auto& op = old_operators[i];
       CHECK(op);
@@ -1162,7 +1167,7 @@ void FixOperatorOrdering(Model* model) {
       }
       bad_inputs_already_traced.insert(bad_input);
       bad_op = nullptr;
-      for (auto i : remaining) {
+      for (const auto& i : remaining) {
         const Operator* op = old_operators[i].get();
         for (const string& output : op->outputs) {
           if (bad_input == output) {
@@ -1265,7 +1270,7 @@ void FixEdgeArrays(Model* model) {
 
 void DedupeConstantArrays(Model* model, size_t min_size) {
   // Walk all 0..N and compare with the remaining n+1..N.
-  // This lets us avoid N^2 comparisions and erase duplicate arrays while
+  // This lets us avoid N^2 comparisons and erase duplicate arrays while
   // iterating.
   const auto& array_map = model->GetArrayMap();
   for (auto lhs_array_it = array_map.begin(); lhs_array_it != array_map.end();
@@ -1457,16 +1462,22 @@ void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
   }
 }
 
-void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
+void CreateOrCheckRnnStateArray(const string& name, int size,
+                                int state_num_dims, Model* model) {
   int batch = 1;
   int num_dims = -1;
-  for (const auto& input_array : model->flags.input_arrays()) {
-    // Pick 'num_dims' and 'batch' from the first input_arrays, unless we find
-    // a better match by name.
-    if (input_array.name() == name || num_dims == -1) {
-      num_dims = input_array.shape().dims_size();
-      if (num_dims > 0) {
-        batch = input_array.shape().dims(0);
+  if (state_num_dims > 0) {
+    num_dims = state_num_dims;
+  } else {
+    // state_num_dims is not given. We will infer it from an input tensor.
+    for (const auto& input_array : model->flags.input_arrays()) {
+      // Pick 'num_dims' and 'batch' from the first input_arrays, unless we find
+      // a better match by name.
+      if (input_array.name() == name || num_dims == -1) {
+        num_dims = input_array.shape().dims_size();
+        if (num_dims > 0) {
+          batch = input_array.shape().dims(0);
+        }
       }
     }
   }
@@ -1629,7 +1640,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       if (input_array_proto.has_shape()) {
         auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
         CheckValidShapeDimensions(input_array_proto.shape().dims());
-        for (auto dim : input_array_proto.shape().dims()) {
+        for (const auto& dim : input_array_proto.shape().dims()) {
           input_array_dims.push_back(dim);
         }
       }
@@ -1670,7 +1681,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
-                               model);
+                               rnn_state.num_dims(), model);
   }
 
   model->flags.set_change_concat_input_ranges(
@@ -1859,119 +1870,140 @@ string CreateInt32Array(Model* model, const string& param_name,
   return param_array_name;
 }
 
-bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
-  int64 total = 0;
-  for (const auto& op : model.operators) {
-    switch (op->type) {
-      case OperatorType::kFullyConnected:
-      case OperatorType::kConv:
-      case OperatorType::kDepthwiseConv: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        const auto& weights_array = model.GetArray(op->inputs[1]);
-        if (!output_array.has_shape() || !weights_array.has_shape()) {
-          return false;
-        }
-        int cols = 1;
-        for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
-          cols *= output_array.shape().dims(i);
-        }
-        const int64 cost_per_col =
-            2 * RequiredBufferSizeForShape(weights_array.shape());
-        total += cost_per_col * cols;
-        if (op->inputs.size() > 2) {
-          // There is a bias vector. One more op per output value.
-          total += RequiredBufferSizeForShape(output_array.shape());
-        }
-        break;
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result) {
+  switch (op.type) {
+    case OperatorType::kFullyConnected:
+    case OperatorType::kConv:
+    case OperatorType::kDepthwiseConv: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      const auto& weights_array = model.GetArray(op.inputs[1]);
+      if (!output_array.has_shape() || !weights_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAdd:
-      case OperatorType::kSub:
-      case OperatorType::kMul: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape());
-        break;
+      int64 cols = 1;
+      for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
+        cols *= output_array.shape().dims(i);
       }
-      case OperatorType::kAddN: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // AddN cost is roughly the same cost as N-1 Adds.
-        const int num_adds = op->inputs.size() - 1;
-        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      const int64 cost_per_col =
+          2 * RequiredBufferSizeForShape(weights_array.shape());
+      *result = cost_per_col * cols;
+      if (op.inputs.size() > 2) {
+        // There is a bias vector. One more op per output value.
+        *result += RequiredBufferSizeForShape(output_array.shape());
       }
-      case OperatorType::kLogistic:
-      case OperatorType::kSoftmax:
-      case OperatorType::kLogSoftmax:
-      case OperatorType::kTanh: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // As a very rough ballpark, the cost of evaluating a math function
-        // such as tanh or logistic is about 32 multiplications, and about as
-        // many additions/subtractions. (Just a power-of-two order-of-magnitude
-        // from looking at actual implementations that we use in runtime/ code).
-        total += 64 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      break;
+    }
+    case OperatorType::kAdd:
+    case OperatorType::kSub:
+    case OperatorType::kMul: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kMaxPool: {
-        const auto& maxpool = *static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 maxpool.kheight * maxpool.kwidth;
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kAddN: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAveragePool: {
-        const auto& avgpool =
-            *static_cast<const AveragePoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 avgpool.kheight * avgpool.kwidth;
-        break;
+      // AddN cost is roughly the same cost as N-1 Adds.
+      const int64 num_adds = op.inputs.size() - 1;
+      *result = num_adds * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kLogistic:
+    case OperatorType::kSoftmax:
+    case OperatorType::kLogSoftmax:
+    case OperatorType::kTanh: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Pool: {
-        const auto* maxpool = static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // The sum of squares requires (kheight*kwidth) multiply-adds,
-        // and then there is the sqrt which we ballpark at 32 ops.
-        const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
-        total +=
-            RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
-        break;
+      // As a very rough ballpark, the cost of evaluating a math function
+      // such as tanh or logistic is about 32 multiplications, and about as
+      // many additions/subtractions. (Just a power-of-two order-of-magnitude
+      // from looking at actual implementations that we use in runtime/ code).
+      *result = 64 * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kMaxPool: {
+      const auto& maxpool = *static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Normalization: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // Computing the squared L2 norm is N multiply-adds so 2N ops,
-        // then the single inverse-sqrt is negligible, then we multiply each
-        // value by the resulting multiplier, so an extra N ops. Total 3N ops.
-        total += 3 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                maxpool.kheight * maxpool.kwidth;
+      break;
+    }
+    case OperatorType::kAveragePool: {
+      const auto& avgpool = *static_cast<const AveragePoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      default:
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                avgpool.kheight * avgpool.kwidth;
+      break;
     }
+    case OperatorType::kL2Pool: {
+      const auto* maxpool = static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // The sum of squares requires (kheight*kwidth) multiply-adds,
+      // and then there is the sqrt which we ballpark at 32 ops.
+      const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
+      *result = RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
+      break;
+    }
+    case OperatorType::kL2Normalization: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // Computing the squared L2 norm is N multiply-adds so 2N ops,
+      // then the single inverse-sqrt is negligible, then we multiply each
+      // value by the resulting multiplier, so an extra N ops. count 3N ops.
+      *result = 3 * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    default:
+      *result = 0;
+      break;
+  }
+  return true;
+}
+
+bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
+  int64 total = 0;
+  for (const auto& op : model.operators) {
+    int64 num_ops;
+    if (!EstimateArithmeticOpsCount(model, *op, &num_ops)) {
+      return false;
+    }
+    total += num_ops;
   }
   *result = total;
   return true;
 }
 
+string FormattedNumber(int64 x) {
+  const int64 million = 1000000;
+  const int64 billion = 1000000000;
+  if (x < 10000) {
+    return toco::port::StringF("%d ", x);
+  } else if (x < billion) {
+    return toco::port::StringF("%.3f M", static_cast<double>(x) / million);
+  } else {
+    return toco::port::StringF("%.3f G", static_cast<double>(x) / billion);
+  }
+}
+
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle) {
   CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
@@ -2301,7 +2333,7 @@ void UseArraysExtraInfo(Model* model, bool quantize_output) {
         // Make sure to create the shape even if there are no dims, to
         // correctly record 0-D shapes.
         array.mutable_shape();
-        for (int dim : entry.shape().dims()) {
+        for (const auto& dim : entry.shape().dims()) {
           array.mutable_shape()->mutable_dims()->push_back(dim);
         }
       }
diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
index 517da784d0e6395eb06a0bf0fb9004645d292e42..b8a3dfca933273fbfb990d5229b94b67a9f907ca 100644
--- a/tensorflow/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -250,7 +250,8 @@ void DropMinMax(Model* model, const string& array_name);
 
 bool IsAllocatableTransientArray(const Model& model, const string& array_name);
 
-void CreateOrCheckRnnStateArray(const string& name, int size, Model* model);
+void CreateOrCheckRnnStateArray(const string& name, int size,
+                                int state_num_dims, Model* model);
 
 string AvailableArrayName(const Model& model, const string& name);
 
@@ -267,7 +268,10 @@ void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
 string CreateInt32Array(Model* model, const string& param_name,
                         const std::vector<int>& value);
 
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result);
 bool EstimateArithmeticOpsCount(const Model& model, int64* result);
+string FormattedNumber(int64 x);
 
 int AxesCount(AxesOrder axes_order);
 
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 1d141b5dd01a4a03c65d0c8a119ad62eea224d52..f67b3f98e9beafd1548a2033289ffbc9e3b86356 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -70,7 +70,9 @@ cc_library(
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -86,6 +88,7 @@ cc_test(
         "//tensorflow/core:framework_lite",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
index ac3a1566e2a2c834260acbfbee8908cc13efa42a..28ad2e407f331023ebc22a5692693f5669feaff3 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/README.md
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
@@ -16,18 +16,25 @@ The binary takes the following parameters:
     The path to the directory containing ground truth images.
 
 *   `ground_truth_labels`: `string` \
-    Path to ground truth labels file. This file should contain the same number of labels as    the number images in the ground truth directory. The labels are assumed to be in the
-    same order as the sorted filename of images. See [ground truth label generation](#ground-truth-label-generation)
-    section for more information about how to generate labels for images.
+    Path to ground truth labels file. This file should contain the same number
+    of labels as the number images in the ground truth directory. The labels are
+    assumed to be in the same order as the sorted filename of images. See
+    [ground truth label generation](#ground-truth-label-generation) section for
+    more information about how to generate labels for images.
 
-*    `model_output_labels`: `string` \
+*   `model_output_labels`: `string` \
     Path to the file containing labels, that is used to interpret the output of
     the model. E.g. in case of mobilenets, this is the path to
     `mobilenet_labels.txt` where each label is in the same order as the output
     1001 dimension tensor.
 
 *   `output_path`: `string` \
-    This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of output file is the cumulative accuracy after processing images in a sorted order. So first line is accuracy after processing the first image, second line is accuracy after procesing first two images. The last line of the file is accuracy after processing the entire validation set.
+    This is the path to the output file. The output is a CSV file that has
+    top-10 accuracies in each row. Each line of output file is the cumulative
+    accuracy after processing images in a sorted order. So first line is
+    accuracy after processing the first image, second line is accuracy after
+    processing first two images. The last line of the file is accuracy after
+    processing the entire validation set.
 
 and the following optional parameters:
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 04b6cb755892bd218d899587bd81b818a51f85d8..b730b0804e0df3d559ec99552fb443efc3e867eb 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index e68997a00b6e8bd61bf43185b53afdbfff71084f..ce31eaf42f170b6ce52a961bb984197313e63f96 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -141,10 +141,6 @@ cc_library(
         ":logging",
         "//tensorflow/core:stats_calculator_portable",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/profiling:profile_summarizer",
-        "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/profiling:time",
     ],
 )
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index a4d9c879eb645019a7626502207e9a3f4e89b1c1..e6ba818c71f23f39e511b7866ce2356848d46493 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -5,7 +5,7 @@
 A simple C++ binary to benchmark a TFLite model and its individual operators,
 both on desktop machines and on Android. The binary takes a TFLite model,
 generates random inputs and then repeatedly runs the model for specified number
-of runs. Aggregrate latency statistics are reported after running the benchmark.
+of runs. Aggregate latency statistics are reported after running the benchmark.
 
 The instructions below are for running the binary on Desktop and Android,
 for iOS please use the
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
index f5b67e3f79aa669c5424d46c23f053213ad3a101..db82c59acd3de38bbd8ffcf1542f34adf02c9098 100644
--- a/tensorflow/lite/tools/benchmark/android/README.md
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -51,7 +51,7 @@ and can be appended to the `args` string alongside the required `--graph` flag
 args key).
 
 ```
-adb shell am start -S -n
+adb shell am start -S -n \
   org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
   --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
 ```
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
index 70f4c94d3588b1645ce6c8422ca3cfe94eddc8e6..a971e645e8b20dedb16e2ce15566518b3beb0eab 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -108,7 +108,9 @@ void BenchmarkModel::LogParams() {
                    << params_.Get<float>("warmup_min_secs") << "]";
 }
 
-void BenchmarkModel::PrepareInputsAndOutputs() {}
+void BenchmarkModel::PrepareInputData() {}
+
+void BenchmarkModel::ResetInputsAndOutputs() {}
 
 Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
                                   RunType run_type) {
@@ -120,7 +122,7 @@ Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
   for (int run = 0;
        run < min_num_times || profiling::time::NowMicros() < min_finish_us;
        run++) {
-    PrepareInputsAndOutputs();
+    ResetInputsAndOutputs();
     listeners_.OnSingleRunStart(run_type);
     int64_t start_us = profiling::time::NowMicros();
     RunImpl();
@@ -151,7 +153,6 @@ void BenchmarkModel::Run() {
   ValidateParams();
   LogParams();
 
-  listeners_.OnBenchmarkStart(params_);
   int64_t initialization_start_us = profiling::time::NowMicros();
   Init();
   int64_t initialization_end_us = profiling::time::NowMicros();
@@ -159,7 +160,9 @@ void BenchmarkModel::Run() {
   TFLITE_LOG(INFO) << "Initialized session in " << startup_latency_us / 1e3
                    << "ms";
 
+  PrepareInputData();
   uint64_t input_bytes = ComputeInputBytes();
+  listeners_.OnBenchmarkStart(params_);
   Stat<int64_t> warmup_time_us =
       Run(params_.Get<int32_t>("warmup_runs"),
           params_.Get<float>("warmup_min_secs"), WARMUP);
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.h b/tensorflow/lite/tools/benchmark/benchmark_model.h
index 31ee5c92aa3f1ffb53f1a39fbc6e1344d92a260c..783122153d8f5a82185c27843f8797a7a797e3c7 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.h
@@ -66,9 +66,15 @@ class BenchmarkResults {
 
 class BenchmarkListener {
  public:
+  // Called before the (outer) inference loop begins.
+  // Note that this is called *after* the interpreter has been initialized, but
+  // *before* any warmup runs have been executed.
   virtual void OnBenchmarkStart(const BenchmarkParams& params) {}
+  // Called before a single (inner) inference call starts.
   virtual void OnSingleRunStart(RunType runType) {}
+  // Called before a single (inner) inference call ends.
   virtual void OnSingleRunEnd() {}
+  // Called after the (outer) inference loop begins.
   virtual void OnBenchmarkEnd(const BenchmarkResults& results) {}
   virtual ~BenchmarkListener() {}
 };
@@ -152,7 +158,11 @@ class BenchmarkModel {
   virtual uint64_t ComputeInputBytes() = 0;
   virtual tensorflow::Stat<int64_t> Run(int min_num_times, float min_secs,
                                         RunType run_type);
-  virtual void PrepareInputsAndOutputs();
+  // Prepares input data for benchmark. This can be used to initialize input
+  // data that has non-trivial cost.
+  virtual void PrepareInputData();
+
+  virtual void ResetInputsAndOutputs();
   virtual void RunImpl() = 0;
   BenchmarkParams params_;
   BenchmarkListeners listeners_;
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
index a4f830122f65bcacb0eae4783998cf8bb5611fb9..a8f7eff8dadde22d758dd588bb1d661512be86e9 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_test.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -44,6 +44,7 @@ BenchmarkParams CreateParams() {
   params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
   params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
   params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+  params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false));
   params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
   return params;
 }
@@ -54,7 +55,10 @@ class TestBenchmark : public BenchmarkTfLiteModel {
       : BenchmarkTfLiteModel(std::move(params)) {}
   const tflite::Interpreter* GetInterpreter() { return interpreter.get(); }
 
-  void Prepare() { PrepareInputsAndOutputs(); }
+  void Prepare() {
+    PrepareInputData();
+    ResetInputsAndOutputs();
+  }
 };
 
 TEST(BenchmarkTest, DoesntCrash) {
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 0bc7565e82c0471c439c0a0ab84e09dd39c7b9a9..046aa92845f9e40cd7560d16cd095fdee8a09dd8 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -118,12 +118,8 @@ bool SplitAndParse(const std::string& str, char delim, std::vector<T>* values) {
 }
 
 template <typename T>
-void FillRandomValue(T* ptr, const std::vector<int>& sizes,
+void FillRandomValue(T* ptr, int num_elements,
                      const std::function<T()>& random_func) {
-  int num_elements = 1;
-  for (int dim : sizes) {
-    num_elements *= dim;
-  }
   for (int i = 0; i < num_elements; ++i) {
     *ptr++ = random_func();
   }
@@ -200,6 +196,7 @@ BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
   default_params.AddParam("input_layer_shape",
                           BenchmarkParam::Create<std::string>(""));
   default_params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+  default_params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false));
   return default_params;
 }
 
@@ -212,6 +209,19 @@ BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
   AddListener(&gemmlowp_profiling_listener_);
 }
 
+void BenchmarkTfLiteModel::CleanUp() {
+  if (inputs_data_.empty()) {
+    return;
+  }
+  // Free up any pre-allocated tensor data during PrepareInputData.
+  for (int i = 0; i < inputs_data_.size(); ++i) {
+    delete[] inputs_data_[i].data.raw;
+  }
+  inputs_data_.clear();
+}
+
+BenchmarkTfLiteModel::~BenchmarkTfLiteModel() { CleanUp(); }
+
 std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
   std::vector<Flag> flags = BenchmarkTfLiteModel::BenchmarkModel::GetFlags();
   std::vector<Flag> specific_flags = {
@@ -219,7 +229,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
       CreateFlag<std::string>("input_layer", &params_, "input layer names"),
       CreateFlag<std::string>("input_layer_shape", &params_,
                               "input layer shape"),
-      CreateFlag<bool>("use_nnapi", &params_, "use nnapi api")};
+      CreateFlag<bool>("use_nnapi", &params_, "use nnapi api"),
+      CreateFlag<bool>("allow_fp16", &params_, "allow fp16")};
 
   flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
   return flags;
@@ -233,6 +244,8 @@ void BenchmarkTfLiteModel::LogParams() {
   TFLITE_LOG(INFO) << "Input shapes: ["
                    << params_.Get<std::string>("input_layer_shape") << "]";
   TFLITE_LOG(INFO) << "Use nnapi : [" << params_.Get<bool>("use_nnapi") << "]";
+  TFLITE_LOG(INFO) << "Allow fp16 : [" << params_.Get<bool>("allow_fp16")
+                   << "]";
 }
 
 bool BenchmarkTfLiteModel::ValidateParams() {
@@ -256,38 +269,78 @@ uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
   return total_input_bytes;
 }
 
-void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
+void BenchmarkTfLiteModel::PrepareInputData() {
   auto interpreter_inputs = interpreter->inputs();
-  // Set the values of the input tensors.
-  for (int j = 0; j < interpreter_inputs.size(); ++j) {
+  const size_t input_size = interpreter_inputs.size();
+  CleanUp();
+
+  for (int j = 0; j < input_size; ++j) {
     int i = interpreter_inputs[j];
     TfLiteTensor* t = interpreter->tensor(i);
     std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
-    // TODO(ahentz): below we ignore the O-th dimension (number of batches).
+    int num_elements = 1;
+    // TODO(haoliang): Ignore the 0-th dimension (number of batches).
+    for (int i = 1; i < sizes.size(); ++i) {
+      num_elements *= sizes[i];
+    }
+    InputTensorData t_data;
     if (t->type == kTfLiteFloat32) {
-      FillRandomValue<float>(
-          interpreter->typed_tensor<float>(i),
-          std::vector<int>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<float>(rand()) / RAND_MAX - 0.5f; });
+      t_data.bytes = sizeof(float) * num_elements;
+      t_data.data.raw = new char[t_data.bytes];
+      FillRandomValue<float>(t_data.data.f, num_elements, []() {
+        return static_cast<float>(rand()) / RAND_MAX - 0.5f;
+      });
     } else if (t->type == kTfLiteInt32) {
       // TODO(yunluli): This is currently only used for handling embedding input
       // for speech models. Generalize if necessary.
-      FillRandomValue<int32_t>(
-          interpreter->typed_tensor<int32_t>(i),
-          std::vector<int32_t>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<int32_t>(rand()) % 100; });
+      t_data.bytes = sizeof(int32_t) * num_elements;
+      t_data.data.raw = new char[t_data.bytes];
+      FillRandomValue<int32_t>(t_data.data.i32, num_elements, []() {
+        return static_cast<int32_t>(rand()) % 100;
+      });
+    } else if (t->type == kTfLiteUInt8) {
+      t_data.bytes = sizeof(uint8_t) * num_elements;
+      t_data.data.raw = new char[t_data.bytes];
+      FillRandomValue<uint8_t>(t_data.data.uint8, num_elements, []() {
+        return static_cast<uint8_t>(rand()) % 255;
+      });
+    } else if (t->type == kTfLiteInt8) {
+      t_data.bytes = sizeof(int8_t) * num_elements;
+      t_data.data.raw = new char[t_data.bytes];
+      FillRandomValue<int8_t>(t_data.data.int8, num_elements, []() {
+        return static_cast<int8_t>(rand()) % 255 - 127;
+      });
+    } else if (t->type == kTfLiteString) {
+      // TODO(haoliang): No need to cache string tensors right now.
+    } else {
+      TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name
+                        << " of type " << t->type;
+    }
+    inputs_data_.push_back(t_data);
+  }
+}
+
+void BenchmarkTfLiteModel::ResetInputsAndOutputs() {
+  auto interpreter_inputs = interpreter->inputs();
+  // Set the values of the input tensors from inputs_data_.
+  for (int j = 0; j < interpreter_inputs.size(); ++j) {
+    int i = interpreter_inputs[j];
+    TfLiteTensor* t = interpreter->tensor(i);
+    if (t->type == kTfLiteFloat32) {
+      std::memcpy(interpreter->typed_tensor<float>(i), inputs_data_[j].data.f,
+                  inputs_data_[j].bytes);
+    } else if (t->type == kTfLiteInt32) {
+      std::memcpy(interpreter->typed_tensor<int32_t>(i),
+                  inputs_data_[j].data.i32, inputs_data_[j].bytes);
     } else if (t->type == kTfLiteUInt8) {
-      FillRandomValue<uint8_t>(
-          interpreter->typed_tensor<uint8_t>(i),
-          std::vector<int>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<uint8_t>(rand()) % 255; });
+      std::memcpy(interpreter->typed_tensor<uint8_t>(i),
+                  inputs_data_[j].data.uint8, inputs_data_[j].bytes);
     } else if (t->type == kTfLiteInt8) {
-      FillRandomValue<int8_t>(
-          interpreter->typed_tensor<int8_t>(i),
-          std::vector<int>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<int8_t>(rand()) % 255 - 127; });
+      std::memcpy(interpreter->typed_tensor<int8_t>(i),
+                  inputs_data_[j].data.int8, inputs_data_[j].bytes);
     } else if (t->type == kTfLiteString) {
       tflite::DynamicBuffer buffer;
+      std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
       FillRandomString(&buffer, sizes, []() {
         return "we're have some friends over saturday to hang out in the yard";
       });
@@ -328,6 +381,10 @@ void BenchmarkTfLiteModel::Init() {
   interpreter->UseNNAPI(use_nnapi);
   ApplyDelegates();
 
+  bool allow_fp16 = params_.Get<bool>("allow_fp16");
+
+  interpreter->SetAllowFp16PrecisionForFp32(allow_fp16);
+
   auto interpreter_inputs = interpreter->inputs();
 
   if (!inputs.empty()) {
@@ -336,14 +393,17 @@ void BenchmarkTfLiteModel::Init() {
         << " expected: " << inputs.size();
   }
 
-  // TFLITE_BENCHMARK_CHECK that all names and types match
+  // Check if the tensor names match, and log a warning if it doesn't.
+  // TODO(ycling): Consider to make this an error again when the new converter
+  // create tensors with consistent naming.
   for (int j = 0; j < inputs.size(); ++j) {
     const InputLayerInfo& input = inputs[j];
     int i = interpreter_inputs[j];
     TfLiteTensor* t = interpreter->tensor(i);
-    TFLITE_BENCHMARK_CHECK_EQ(t->name, input.name)
-        << "Tensor # " << i << " is named " << t->name << " but flags call it "
-        << input.name;
+    if (input.name != t->name) {
+      TFLITE_LOG(WARN) << "Tensor # " << i << " is named " << t->name
+                       << " but flags call it " << input.name;
+    }
   }
 
   // Resize all non-string tensors.
@@ -356,11 +416,23 @@ void BenchmarkTfLiteModel::Init() {
     }
   }
 
-  if (interpreter->AllocateTensors() != kTfLiteOk) {
+  // Don't allocate tensors if we have delegates.
+  if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
   }
 }
 
+void BenchmarkTfLiteModel::ApplyDelegates() {
+  for (int i = 0; i < delegates_.size(); ++i) {
+    if (interpreter->ModifyGraphWithDelegate(delegates_[i].get()) !=
+        kTfLiteOk) {
+      TFLITE_LOG(FATAL) << "Failed to apply delegate # " << i;
+    } else {
+      TFLITE_LOG(INFO) << "Applied Delegate # " << i;
+    }
+  }
+}
+
 void BenchmarkTfLiteModel::RunImpl() {
   if (interpreter->Invoke() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to invoke!";
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 83599e644d1f41f70fd96f3a73f9155d6e62deef..87b00a543f26d0cf67afebe4787f699e6f9a38ab 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -60,9 +60,14 @@ class GemmlowpProfilingListener : public BenchmarkListener {
 // Benchmarks a TFLite model by running tflite interpreter.
 class BenchmarkTfLiteModel : public BenchmarkModel {
  public:
+  struct InputLayerInfo {
+    std::string name;
+    std::vector<int> shape;
+  };
+
   BenchmarkTfLiteModel();
   explicit BenchmarkTfLiteModel(BenchmarkParams params);
-  virtual ~BenchmarkTfLiteModel() {}
+  virtual ~BenchmarkTfLiteModel();
 
   std::vector<Flag> GetFlags() override;
   void LogParams() override;
@@ -70,26 +75,32 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   uint64_t ComputeInputBytes() override;
   void Init() override;
   void RunImpl() override;
-
-  struct InputLayerInfo {
-    std::string name;
-    std::vector<int> shape;
-  };
+  void SetDelegates(std::vector<std::unique_ptr<TfLiteDelegate>> delegates) {
+    delegates_ = std::move(delegates);
+  }
 
  protected:
   static BenchmarkParams DefaultParams();
-  void PrepareInputsAndOutputs() override;
+  void PrepareInputData() override;
+  void ResetInputsAndOutputs() override;
+  void CleanUp();
 
   // Allows installation of custom delegates during initialization
-  virtual void ApplyDelegates() {}
+  virtual void ApplyDelegates();
 
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
 
  private:
+  struct InputTensorData {
+    TfLitePtrUnion data;
+    size_t bytes;
+  };
   std::vector<InputLayerInfo> inputs;
+  std::vector<InputTensorData> inputs_data_;
   ProfilingListener profiling_listener_;
   GemmlowpProfilingListener gemmlowp_profiling_listener_;
+  std::vector<std::unique_ptr<TfLiteDelegate>> delegates_;
 };
 
 }  // namespace benchmark
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index fed9e7ea7e8633e00413118fa3e9e4f12d5188a4..ee880f005dfaec1cd27d5dc093720f5de5433bfa 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -24,11 +24,12 @@ to build TFLite.
 Running
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh
+tensorflow/lite/tools/make/build_ios_universal_lib.sh
 ```
-will also build `tensorflow/lite/gen/lib/benchmark-lib.a` .
 
-- Now copy the downloaded model file to `benchmark_data` directory. 
+will also build `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a` .
+
+- Now copy the downloaded model file to `benchmark_data` directory.
 
 - Modify `benchmark_params.json` change the `input_layer`, `input_layer_shape`
 and other benchmark parameters.
@@ -36,8 +37,8 @@ and other benchmark parameters.
 - Change `Build Phases -> Copy Bundle Resources` and add the model file to the
 resources that need to be copied.
 
-- Ensure that `Build Phases -> Link Binary With Library` contains the 
-`Accelerate framework` and `tensorflow/lite/gen/lib/benchmark-lib.a`.
+- Ensure that `Build Phases -> Link Binary With Library` contains the
+`Accelerate framework` and `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a`.
 
 - Now try running the app. The app has a single button that runs the benchmark
   on the model and displays results in a text view below.
@@ -47,7 +48,7 @@ resources that need to be copied.
 If you want detailed profiling, use the following command:
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh -p
+tensorflow/lite/tools/make/build_ios_universal_lib.sh -p
 ```
 
 Then following the same steps above and run the benchmark app. You will see the
diff --git a/tensorflow/lite/tools/evaluation/BUILD b/tensorflow/lite/tools/evaluation/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dc81b7521bdd6f66f8f9b4f2db2a241e4eddd6a6
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/BUILD
@@ -0,0 +1,72 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "evaluation_stage",
+    srcs = ["evaluation_stage.cc"],
+    hdrs = ["evaluation_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_proto_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "identity_stage",
+    srcs = ["identity_stage.cc"],
+    hdrs = ["identity_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":evaluation_stage",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_proto_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
+tf_cc_test(
+    name = "evaluation_stage_test",
+    srcs = ["evaluation_stage_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":evaluation_stage",
+        ":identity_stage",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_proto_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage.cc b/tensorflow/lite/tools/evaluation/evaluation_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..215f170e57a142c84ef8b01c29b47aeb3745e05b
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+
+#include <string>
+
+#include "absl/strings/str_split.h"
+
+namespace tflite {
+namespace evaluation {
+
+bool EvaluationStage::Init(
+    absl::flat_hash_map<std::string, void*>& object_map) {
+  // Process & validate configuration of tags.
+  std::vector<std::string> initializers, inputs, outputs;
+  for (const auto& init : config_.initializers()) {
+    initializers.emplace_back(init);
+  }
+  for (const auto& in : config_.inputs()) {
+    inputs.emplace_back(in);
+  }
+  for (const auto& out : config_.outputs()) {
+    outputs.emplace_back(out);
+  }
+  if (!ProcessExpectedTags(GetInitializerTags(), initializers) ||
+      !ProcessExpectedTags(GetInputTags(), inputs) ||
+      !ProcessExpectedTags(GetOutputTags(), outputs)) {
+    return false;
+  }
+  // Class-specific stuff.
+  return DoInit(object_map);
+}
+
+bool EvaluationStage::ProcessExpectedTags(
+    const std::vector<std::string>& expected_tags,
+    std::vector<std::string>& tag_to_name_mappings) {
+  // Validate format of each TAG:name mapping in tag_to_name_mappings, and add
+  // it to tags_to_names_map_.
+  for (const std::string& tag_name_mapping : tag_to_name_mappings) {
+    if (!std::regex_match(tag_name_mapping, kTagNameMappingPattern)) {
+      LOG(ERROR) << "Invalid TAG:name mapping: " << tag_name_mapping;
+      return false;
+    }
+    std::vector<std::string> tag_and_name =
+        absl::StrSplit(tag_name_mapping, ':');
+    tags_to_names_map_[tag_and_name[0]] = tag_and_name[1];
+  }
+
+  // Ensure each expected TAG is valid & has been mapped to a name.
+  for (const std::string& tag : expected_tags) {
+    if (!std::regex_match(std::string(tag), kTagPattern)) {
+      LOG(ERROR) << "Invalid expected TAG: " << tag;
+      return false;
+    }
+    if (tags_to_names_map_.find(tag) == tags_to_names_map_.end()) {
+      LOG(ERROR) << "TAG " << tag << " has not been mapped to a name in config "
+                 << config_.name();
+      return false;
+    }
+  }
+  return true;
+}
+
+std::map<ProcessClass, FactoryFunc>*
+    EvaluationStage::process_class_to_factory_map_ =
+        new std::map<ProcessClass, FactoryFunc>();
+
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage.h b/tensorflow/lite/tools/evaluation/evaluation_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..68e6e5442305d14611ae0b31429b49b6143de5ee
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage.h
@@ -0,0 +1,240 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
+
+#include <functional>
+#include <map>
+#include <regex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+class EvaluationStage;
+
+typedef std::function<std::unique_ptr<EvaluationStage>(
+    const EvaluationStageConfig&)>
+    FactoryFunc;
+
+// Superclass for a single stage of an EvaluationPipeline.
+// Provides basic functionality for construction and accessing
+// initializers/inputs/outputs.
+// Every subclass of EvaluationStage will define its own behavior by specifying
+// appropriate accessor TAGs and implementing the Init, Run and Close methods.
+class EvaluationStage {
+ public:
+  // Initializes an EvaluationStage. Returns false if initialization failed,
+  // true otherwise.
+  // Should be called only once, before any call to Run().
+  // object_map should contain {initializer name : object pointer} mappings
+  // required for initialization.
+  //
+  // NOTE: EvaluationStage will not take ownership of any elements of
+  // object_map.
+  bool Init(absl::flat_hash_map<std::string, void*>& object_map);
+
+  // An individual run of the EvaluationStage. Returns false if there was a
+  // failure, true otherwise.
+  // Init() should be called before any calls to run().
+  // Inputs are acquired from and outputs are written to the incoming
+  // object_map, using appropriate TAGs.
+  //
+  // NOTE: The EvaluationStage should maintain ownership of outputs it
+  // populates into object_map. Ownership of inputs must be maintained
+  // elsewhere.
+  virtual bool Run(absl::flat_hash_map<std::string, void*>& object_map) = 0;
+
+  // Returns the latest metrics based on all Run() calls made so far.
+  virtual EvaluationStageMetrics LatestMetrics() = 0;
+
+  // The canonical way to instantiate EvaluationStages.
+  // Remember to call <classname>_ENABLE() first.
+  static std::unique_ptr<EvaluationStage> Create(
+      const EvaluationStageConfig& config) {
+    if (!config.has_specification() ||
+        !config.specification().has_process_class()) {
+      LOG(ERROR) << "Process specification not present in config: "
+                 << config.name();
+      return nullptr;
+    }
+    auto& factory_ptr =
+        (*GetFactoryMapPtr())[config.specification().process_class()];
+    if (!factory_ptr) return nullptr;
+    return factory_ptr(config);
+  }
+
+  // Used by DEFINE_REGISTRATION.
+  // This method takes ownership of factory.
+  // Should only be used via DEFINE_REGISTRATION macro.
+  static void RegisterStage(const ProcessClass& process_class,
+                            FactoryFunc class_factory) {
+    (*GetFactoryMapPtr())[process_class] = std::move(class_factory);
+  }
+
+  virtual ~EvaluationStage() = default;
+
+ protected:
+  // Constructs an EvaluationStage.
+  // Each subclass constructor must invoke this constructor.
+  //
+  // NOTE: Do NOT use constructors to obtain new EvaluationStages. Use
+  // EvaluationStage::Create instead.
+  explicit EvaluationStage(const EvaluationStageConfig& config)
+      : config_(config) {}
+
+  // Class-specific initialization, to be overridden by EvaluationStage
+  // sub-classes. Gets called in EvaluationStage::Init().
+  //
+  // NOTE: This object should not take ownership of any elements of object_map.
+  virtual bool DoInit(absl::flat_hash_map<std::string, void*>& object_map) = 0;
+
+  // The three following functions return the initializer/input/output TAGs used
+  // by an EvaluationStage. These should be mapped to meaningful names in the
+  // EvaluationStageConfig, and to required objects during calls to Init/Run.
+  // Format for TAGs: [A-Z0-9_]+ (Uppercase letters, numbers, "_")
+  // Refer docs in tflite.evaluation.EvaluationStageConfig for more information.
+
+  // Returns the expected initializer TAGs.
+  virtual std::vector<std::string> GetInitializerTags() = 0;
+
+  // Returns the expected input TAGs.
+  virtual std::vector<std::string> GetInputTags() = 0;
+
+  // Returns the expected output TAGs.
+  virtual std::vector<std::string> GetOutputTags() = 0;
+
+  // Populates a pointer to the object corresponding to provided TAG.
+  // Returns true if success, false otherwise.
+  // object_map contain a {name : object pointer} mapping, with the
+  // name being mapped to the expected TAG in the EvaluationStageConfig.
+  // NOTE: object pointer must be non-NULL.
+  template <class T>
+  bool GetObjectFromTag(const std::string& tag,
+                        absl::flat_hash_map<std::string, void*>& object_map,
+                        T** object_ptr) {
+    *object_ptr = nullptr;
+    // Find name corresponding to TAG.
+    auto mapping_iter = tags_to_names_map_.find(tag);
+    if (mapping_iter == tags_to_names_map_.end()) {
+      LOG(ERROR) << "Unexpected TAG to GetObjectFromTag: " << tag;
+      return false;
+    }
+    const std::string& expected_name = mapping_iter->second;
+
+    // Find object from name.
+    auto object_iter = object_map.find(expected_name);
+    if (object_iter == object_map.end()) {
+      LOG(ERROR) << "Could not find object for name: " << expected_name;
+      return false;
+    }
+    if (!object_iter->second) {
+      LOG(ERROR) << "Found null pointer for name: " << expected_name;
+      return false;
+    }
+    *object_ptr = static_cast<T*>(object_iter->second);
+    return true;
+  }
+
+  // Maps the appropriate name to a given object in object_map. The name is
+  // derived from mappings provided in the EvaluationStageConfig.
+  // Returns false if tag is invalid, true otherwise.
+  //
+  // NOTE: The EvaluationStage must maintain ownership of object for the
+  // lifetime of object_map
+  bool AssignObjectToTag(const std::string& tag, void* object_ptr,
+                         absl::flat_hash_map<std::string, void*>& object_map) {
+    // Find name corresponding to TAG.
+    auto mapping_iter = tags_to_names_map_.find(tag);
+    if (mapping_iter == tags_to_names_map_.end()) {
+      LOG(ERROR) << "Unexpected TAG to AssignObjectToTag: " << tag;
+      return false;
+    }
+    const std::string& expected_name = mapping_iter->second;
+
+    object_map[expected_name] = object_ptr;
+    return true;
+  }
+
+  EvaluationStageConfig config_;
+
+ private:
+  // Verifies that all TAGs from expected_tags are present in
+  // tag_to_name_mappings, and then populates tags_to_names_map_ with the
+  // appropriate entries. Returns false in case any TAG/mapping is invalid, true
+  // otherwise.
+  // expected_tags should be a list of TAG-strings.
+  // tag_to_name_mappings should be RepeatedPtrField of strings mapping TAGs to
+  // names in the form "SOME_TAG:some_name".
+  bool ProcessExpectedTags(const std::vector<std::string>& expected_tags,
+                           std::vector<std::string>& tag_to_name_mappings);
+
+  static std::map<ProcessClass, FactoryFunc>* GetFactoryMapPtr() {
+    return process_class_to_factory_map_;
+  }
+
+  // Used by factories.
+  static std::map<ProcessClass, FactoryFunc>* process_class_to_factory_map_;
+
+  // Maps expected TAGs to their names as defined by the EvaluationStageConfig.
+  absl::flat_hash_map<std::string, std::string> tags_to_names_map_;
+
+  // To ensure correct formatting in the config.
+  const std::regex kTagNameMappingPattern{"^([A-Z0-9_]+):([a-z0-9_]+)$",
+                                          std::regex::optimize};
+
+  // To ensure correct formatting in TAG names.
+  const std::regex kTagPattern{"^[A-Z0-9_]+$", std::regex::optimize};
+};
+
+// Add this to headers of new EvaluationStages.
+#define DECLARE_FACTORY(classname) void classname##_ENABLE();
+
+// Add this to implementation files of new EvaluationStages.
+// Call <stage_name>_ENABLE() before using EvaluationStage::Create for the
+// class.
+#define DEFINE_FACTORY(classname, processclass)                                \
+  void classname##_ENABLE() {                                                  \
+    FactoryFunc classname##Factory = [](const EvaluationStageConfig& config) { \
+      return absl::make_unique<classname>(config);                             \
+    };                                                                         \
+    EvaluationStage::RegisterStage(processclass, classname##Factory);          \
+  }
+
+// Use this to assign a non-nullptr pointer to tag in object_map.
+#define ASSIGN_OBJECT(tag, ptr, object_map)       \
+  if (!AssignObjectToTag(tag, ptr, object_map)) { \
+    return false;                                 \
+  }
+
+// Use this to obtain pointers to required object.
+// Will return false if name corresponding to tag is not found, or if the
+// pointer found is nullptr.
+#define GET_OBJECT(tag, object_map, location)         \
+  if (!GetObjectFromTag(tag, object_map, location)) { \
+    return false;                                     \
+  }
+
+}  // namespace evaluation
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc b/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe8e287d3f0102efda652260dec662b0771ac52e
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc
@@ -0,0 +1,175 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/lite/tools/evaluation/identity_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+
+namespace tflite {
+namespace evaluation {
+namespace {
+
+constexpr char kIdentityStageName[] = "identity_stage";
+constexpr char kDefaultValueName[] = "default";
+constexpr char kInputValueName[] = "in";
+constexpr char kOutputValueName[] = "out";
+constexpr char kInitializerMapping[] = "DEFAULT_VALUE:default";
+constexpr char kInputMapping[] = "INPUT_VALUE:in";
+constexpr char kOutputMapping[] = "OUTPUT_VALUE:out";
+
+EvaluationStageConfig GetIdentityStageConfig() {
+  IdentityStage_ENABLE();
+  EvaluationStageConfig config;
+  config.set_name(kIdentityStageName);
+  config.mutable_specification()->set_process_class(IDENTITY);
+  config.add_initializers(kInitializerMapping);
+  config.add_inputs(kInputMapping);
+  config.add_outputs(kOutputMapping);
+  return config;
+}
+
+TEST(EvaluationStage, CreateFailsForMissingSpecification) {
+  // Construct
+  EvaluationStageConfig config;
+  config.set_name(kIdentityStageName);
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  EXPECT_EQ(stage_ptr, nullptr);
+}
+
+TEST(EvaluationStage, StageEnableRequired) {
+  // Construct
+  EvaluationStageConfig config;
+  config.set_name(kIdentityStageName);
+  config.mutable_specification()->set_process_class(IDENTITY);
+  config.add_initializers(kInitializerMapping);
+  config.add_inputs(kInputMapping);
+  config.add_outputs(kOutputMapping);
+  config.clear_inputs();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  EXPECT_EQ(stage_ptr, nullptr);
+  IdentityStage_ENABLE();
+  stage_ptr = EvaluationStage::Create(config);
+  EXPECT_NE(stage_ptr, nullptr);
+}
+
+TEST(EvaluationStage, IncompleteConfig) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.clear_inputs();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  float default_value = 1.0;
+  object_map[kDefaultValueName] = &default_value;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, IncorrectlyFormattedConfig) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.clear_initializers();
+  config.add_initializers("DEFAULT_VALUE-default");
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  float default_value = 1.0;
+  object_map[kDefaultValueName] = &default_value;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, ConstructFromConfig_UnknownProcess) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.mutable_specification()->clear_process_class();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  EXPECT_EQ(stage_ptr.get(), nullptr);
+}
+
+TEST(EvaluationStage, NoInitializer) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, InvalidInitializer) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  object_map[kDefaultValueName] = nullptr;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, NoInputs) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  float default_value = 1.0;
+  object_map[kDefaultValueName] = &default_value;
+  EXPECT_TRUE(stage_ptr->Init(object_map));
+
+  // Run
+  EXPECT_FALSE(stage_ptr->Run(object_map));
+}
+
+TEST(EvaluationStage, ExpectedIdentityOutput) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr = EvaluationStage::Create(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  float default_value = 1.0;
+  object_map[kDefaultValueName] = &default_value;
+  EXPECT_TRUE(stage_ptr->Init(object_map));
+
+  // Input Data
+  float input_value = 2.0f;
+  // Run
+  object_map[kInputValueName] = &input_value;
+  EXPECT_TRUE(stage_ptr->Run(object_map));
+  EvaluationStageMetrics metrics = stage_ptr->LatestMetrics();
+  // Check output
+  float* output_value_ptr = static_cast<float*>(object_map[kOutputValueName]);
+  EXPECT_NE(output_value_ptr, nullptr);
+  EXPECT_FLOAT_EQ(*output_value_ptr, input_value);
+  EXPECT_EQ(metrics.num_runs(), 1);
+
+  // Input Data
+  input_value = 0.0f;
+  // Run
+  object_map[kInputValueName] = &input_value;
+  EXPECT_TRUE(stage_ptr->Run(object_map));
+  metrics = stage_ptr->LatestMetrics();
+  // Check output
+  output_value_ptr = static_cast<float*>(object_map[kOutputValueName]);
+  EXPECT_NE(output_value_ptr, nullptr);
+  EXPECT_FLOAT_EQ(*output_value_ptr, default_value);
+  EXPECT_EQ(metrics.num_runs(), 2);
+}
+
+}  // namespace
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/identity_stage.cc b/tensorflow/lite/tools/evaluation/identity_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34932f004484f3df780327965504e5eefb096cc2
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/identity_stage.cc
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/identity_stage.h"
+
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+bool IdentityStage::DoInit(
+    absl::flat_hash_map<std::string, void*>& object_map) {
+  float* default_value;
+  if (!GetObjectFromTag(kDefaultValueTag, object_map, &default_value)) {
+    return false;
+  }
+  default_value_ = *default_value;
+  return true;
+}
+
+bool IdentityStage::Run(absl::flat_hash_map<std::string, void*>& object_map) {
+  float* current_value;
+  GET_OBJECT(kInputValueTag, object_map, &current_value);
+  current_value_ = *current_value ? *current_value : default_value_;
+  ASSIGN_OBJECT(kOutputValueTag, &current_value_, object_map);
+  ++num_runs_;
+  return true;
+}
+
+EvaluationStageMetrics IdentityStage::LatestMetrics() {
+  EvaluationStageMetrics metrics;
+  metrics.set_num_runs(num_runs_);
+  return metrics;
+}
+
+const char IdentityStage::kDefaultValueTag[] = "DEFAULT_VALUE";
+const char IdentityStage::kInputValueTag[] = "INPUT_VALUE";
+const char IdentityStage::kOutputValueTag[] = "OUTPUT_VALUE";
+
+DEFINE_FACTORY(IdentityStage, IDENTITY);
+
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/identity_stage.h b/tensorflow/lite/tools/evaluation/identity_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..c89a178e810eb950857f71284586311e01847c60
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/identity_stage.h
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+// Simple EvaluationStage that passes INPUT_VALUE to OUTPUT_VALUE if the former
+// is non-zero, DEFAULT_VALUE otherwise. Primarily used for tests.
+// Initializer TAGs (Object Class): DEFAULT_VALUE (float)
+// Input TAGs (Object Class): INPUT_VALUE (float)
+// Output TAGs (Object Class): OUTPUT_VALUE (float)
+class IdentityStage : public EvaluationStage {
+ public:
+  explicit IdentityStage(const EvaluationStageConfig& config)
+      : EvaluationStage(config) {}
+
+  bool Run(absl::flat_hash_map<std::string, void*>& object_map) override;
+
+  EvaluationStageMetrics LatestMetrics() override;
+
+  ~IdentityStage() {}
+
+ protected:
+  bool DoInit(absl::flat_hash_map<std::string, void*>& object_map) override;
+
+  std::vector<std::string> GetInitializerTags() override {
+    return {kDefaultValueTag};
+  }
+  std::vector<std::string> GetInputTags() override { return {kInputValueTag}; }
+  std::vector<std::string> GetOutputTags() override {
+    return {kOutputValueTag};
+  }
+
+ private:
+  float default_value_ = 0;
+  float current_value_ = 0;
+  int num_runs_ = 0;
+
+  static const char kDefaultValueTag[];
+  static const char kInputValueTag[];
+  static const char kOutputValueTag[];
+};
+
+DECLARE_FACTORY(IdentityStage);
+
+}  // namespace evaluation
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
diff --git a/tensorflow/tools/dist_test/Dockerfile.local b/tensorflow/lite/tools/evaluation/proto/BUILD
similarity index 51%
rename from tensorflow/tools/dist_test/Dockerfile.local
rename to tensorflow/lite/tools/evaluation/proto/BUILD
index 795aeee1b5d21f973dfa5856969ef3a85d2571ca..6c747357040a968b35ff99ca97a3b8a6677340e6 100644
--- a/tensorflow/tools/dist_test/Dockerfile.local
+++ b/tensorflow/lite/tools/evaluation/proto/BUILD
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,26 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-#
-# Docker image for testing distributed (GRPC) TensorFlow on a single machine.
-#
-# See ./local_test.sh for usage example.
 
-FROM ubuntu:16.04
+licenses(["notice"])  # Apache 2.0
 
-LABEL maintainer="Shanqing Cai <cais@google.com>"
+package(default_visibility = ["//visibility:public"])
 
-# Pick up some TF dependencies.
-RUN apt-get update && apt-get install -y \
-        python-pip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+)
 
-# Install TensorFlow pip whl
-# TODO(cais): Should we build it locally instead?
-COPY tensorflow-*.whl /
-RUN pip install /tensorflow-*.whl
-RUN rm -f /tensorflow-*.whl
+tf_proto_library_cc(
+    name = "evaluation_stages_proto",
+    srcs = [
+        "evaluation_stages.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
 
-ADD . /var/tf_dist_test
+tf_proto_library_cc(
+    name = "evaluation_config_proto",
+    srcs = [
+        "evaluation_config.proto",
+    ],
+    protodeps = [":evaluation_stages_proto"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..37ba96b22079b74af7c5ea62ee8d4833a81644d6
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto2";
+
+package tflite.evaluation;
+
+import "tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto";
+
+// Next ID: 6
+message EvaluationStageConfig {
+  optional string name = 1;
+
+  // Specification defining what this stage does, and any required parameters.
+  optional ProcessSpecification specification = 2;
+
+  // initializers, inputs and outputs are strings that define colon-separated
+  // mappings between TAGs and their corresponding names.
+  // These names help EvaluationStages communicate with each other during runs.
+  // Format for TAGs: [A-Z0-9_]+ (Uppercase letters, numbers, "_")
+  // Format for names: [a-z0-9_]+ (Lowercase letters, numbers, "_")
+  // Example mapping: "BITMAP1:image_in"
+  // It is up to individual EvaluationStage sub-classes to specify the
+  // initializer/input TAGs they require, and outputs TAGs they provide.
+  // For more information: go/mlperflite-framework#heading=h.fxpk50cps4zs
+  repeated string initializers = 3;
+  repeated string inputs = 4;
+  repeated string outputs = 5;
+}
+
+// Metrics returned from EvaluationStage.LatestMetrics() need not have all
+// fields set.
+message EvaluationStageMetrics {
+  // Total number of times the EvaluationStage is run.
+  // Aka number of calls to EvaluationStage::Run().
+  optional int32 num_runs = 1;
+
+  // Process-specific numbers such as latencies, accuracy, etc.
+  optional ProcessMetrics process_metrics = 2;
+}
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
new file mode 100644
index 0000000000000000000000000000000000000000..f45d96fafc3d712677a53d8f8321bb0753d60525
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto2";
+
+package tflite.evaluation;
+
+// All EvaluationStage sub-classes must add a value here.
+// A corresponding entry must also be present in EvaluationStage.FromConfig
+enum ProcessClass {
+  // Default/Unknown
+  UNKNOWN = 0;
+  // Identity
+  IDENTITY = 1;
+}
+
+// Defines the functionality executed by an EvaluationStage.
+// TODO(b/122482115): Add stage-specific options using oneof.
+message ProcessSpecification {
+  optional ProcessClass process_class = 1;
+}
+
+// Contains specific metrics, which may differ based on what an EvaluationStage
+// does.
+// TODO(b/122482115): Add stage-specific metrics using oneof.
+message ProcessMetrics {}
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 994f660dba7742de162525dcf6a8c6a288ee71c6..8428e0d2e6bee30bfe260514ce81e7aa4db5c2a7 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -1,3 +1,7 @@
+# Make uses /bin/sh by default, which is incompatible with the bashisms seen
+# below.
+SHELL := /bin/bash
+
 # Find where we're running from, so we can store generated files here.
 ifeq ($(origin MAKEFILE_DIR), undefined)
 	MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
@@ -51,7 +55,7 @@ LIBS := \
 # There are no rules for compiling objects for the host system (since we don't
 # generate things like the protobuf compiler that require that), so all of
 # these settings are for the target compiler.
-CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS := -O3 -DNDEBUG -fPIC
 CXXFLAGS += $(EXTRA_CXXFLAGS)
 CCFLAGS := ${CXXFLAGS}
 CXXFLAGS += --std=c++11
@@ -86,6 +90,8 @@ CORE_CC_ALL_SRCS := \
 $(wildcard tensorflow/lite/*.cc) \
 $(wildcard tensorflow/lite/*.c) \
 $(wildcard tensorflow/lite/c/*.c) \
+$(wildcard tensorflow/lite/experimental/c/*.c) \
+$(wildcard tensorflow/lite/experimental/c/*.cc) \
 $(wildcard tensorflow/lite/core/*.cc) \
 $(wildcard tensorflow/lite/core/api/*.cc)
 ifneq ($(BUILD_TYPE),micro)
@@ -109,17 +115,37 @@ $(wildcard tensorflow/lite/*test.cc) \
 $(wildcard tensorflow/lite/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*/*test.cc) \
-$(wildcard tensorflow/lite/kernels/test_util.cc) \
+$(wildcard tensorflow/lite/kernels/*test_util.cc) \
 $(MINIMAL_SRCS)
+
 ifeq ($(BUILD_TYPE),micro)
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation.cc \
-tensorflow/lite/nnapi_delegate.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation.cc
 else
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation_disabled.cc \
-tensorflow/lite/nnapi_delegate_disabled.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation_disabled.cc
 endif
+
+BUILD_WITH_NNAPI=true
+ifeq ($(BUILD_TYPE),micro)
+	BUILD_WITH_NNAPI=false
+endif
+ifeq ($(TARGET),ios)
+	BUILD_WITH_NNAPI=false
+endif
+ifeq ($(BUILD_WITH_NNAPI),true)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate.cc
+endif
+
+ifeq ($(TARGET),ios)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_default.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_ios.cc
+endif
+
+
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 
diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server_wrapper.sh b/tensorflow/lite/tools/make/build_aarch64_lib.sh
similarity index 66%
rename from tensorflow/tools/dist_test/server/grpc_tensorflow_server_wrapper.sh
rename to tensorflow/lite/tools/make/build_aarch64_lib.sh
index 5380f610ee9fca993bd236ce7dc7cbf979386e26..054b3daedf8240fac96d9d67ea7b52f721c8c303 100755
--- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server_wrapper.sh
+++ b/tensorflow/lite/tools/make/build_aarch64_lib.sh
@@ -1,5 +1,5 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,13 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-#
-# Wrapper script for grpc_tensorflow_server.py in test server
-
-LOG_FILE="/tmp/grpc_tensorflow_server.log"
 
-SCRIPT_DIR=$( cd ${0%/*} && pwd -P )
+set -e
 
-touch "${LOG_FILE}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../../.."
 
-python ${SCRIPT_DIR}/grpc_tensorflow_server.py $@ 2>&1 | tee "${LOG_FILE}"
+CC_PREFIX=aarch64-linux-gnu- make -j 3 -f tensorflow/lite/tools/make/Makefile TARGET=aarch64 TARGET_ARCH=armv8-a
diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh
index fa3d5d3d3b6657ff327dd6ec34bd65823da13cd2..8c4992a84304ded382e36e9e18e452100d94a391 100755
--- a/tensorflow/lite/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/tools/make/download_dependencies.sh
@@ -100,5 +100,6 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+cat "$SCRIPT_DIR/../../../../third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/tools/make/targets/aarch64_makefile.inc b/tensorflow/lite/tools/make/targets/aarch64_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..d89440651e6257621548fbe578780c60eba3e145
--- /dev/null
+++ b/tensorflow/lite/tools/make/targets/aarch64_makefile.inc
@@ -0,0 +1,33 @@
+# Settings for generic aarch64 boards such as Odroid C2 or Pine64.
+ifeq ($(TARGET),aarch64)
+  # The aarch64 architecture covers all 64-bit ARM chips. This arch mandates
+  # NEON, so FPU flags are not needed below.
+  TARGET_ARCH := armv8-a
+  TARGET_TOOLCHAIN_PREFIX := aarch64-linux-gnu-
+
+  CXXFLAGS += \
+    -march=armv8-a \
+    -funsafe-math-optimizations \
+    -ftree-vectorize \
+    -fPIC
+
+  CCFLAGS += \
+    -march=armv8-a \
+    -funsafe-math-optimizations \
+    -ftree-vectorize \
+    -fPIC
+
+  LDFLAGS := \
+    -Wl,--no-export-dynamic \
+    -Wl,--exclude-libs,ALL \
+    -Wl,--gc-sections \
+    -Wl,--as-needed
+
+       
+  LIBS := \
+    -lstdc++ \
+    -lpthread \
+    -lm \
+    -ldl
+
+endif
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index 2cdec0043346794f0cff592b2cdfe0437ae0e341..3011de4b3999e77f276a777c45e284ef9d96a8ef 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -1,14 +1,15 @@
-# TODO(suharshs): Write quantize_weights tests that use small exportable files.
-# Then we can remove this file.
-package(
-    default_visibility = ["//visibility:public"],
-)
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-licenses(["notice"])  # Apache 2.0
+package(default_visibility = [
+    "//visibility:public",
+])
 
-exports_files(["LICENSE"])
+licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+exports_files(glob([
+    "testdata/*.bin",
+]))
 
 cc_library(
     name = "quantization_utils",
@@ -16,22 +17,33 @@ cc_library(
     hdrs = ["quantization_utils.h"],
     deps = [
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:round",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/kernels/internal:types",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/memory",
-        "@flatbuffers",
     ],
 )
 
-cc_test(
-    name = "quantizatin_utils_test",
+tf_cc_test(
+    name = "quantization_utils_test",
     srcs = ["quantization_utils_test.cc"],
-    copts = ["-Wall"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
     tags = [
         "tflite_not_portable_android",
         "tflite_not_portable_ios",
     ],
     deps = [
         ":quantization_utils",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest",
@@ -45,11 +57,137 @@ cc_library(
     hdrs = ["quantize_weights.h"],
     deps = [
         ":quantization_utils",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+        "//tensorflow/lite:framework",
+        # TODO(suharshs): Move the relevant quantization utils to a non-internal location.
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/core:tflite_portable_logging",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_weights_test",
+    srcs = ["quantize_weights_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/weight_shared_between_convs.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_weights",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "subgraph_quantizer",
+    srcs = ["subgraph_quantizer.cc"],
+    hdrs = ["subgraph_quantizer.h"],
+    deps = [
+        ":quantization_utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels/internal:round",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/memory",
         "@flatbuffers",
     ],
 )
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "subgraph_quantizer_test",
+    srcs = ["subgraph_quantizer_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_minus_127_max_plus_127.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_softmax_min_minus_5_max_plus_5.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":subgraph_quantizer",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "quantize_model",
+    srcs = ["quantize_model.cc"],
+    hdrs = ["quantize_model.h"],
+    deps = [
+        ":subgraph_quantizer",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_model_test",
+    srcs = ["quantize_model_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_model",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration/BUILD b/tensorflow/lite/tools/optimize/calibration/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c1d2ad2bca8f76b1e07dfe6d6027ec69cd821c8a
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/BUILD
@@ -0,0 +1,138 @@
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "calibrator_lib",
+    srcs = ["calibrator.cc"],
+    hdrs = ["calibrator.h"],
+    deps = [
+        ":calibration_common",
+        ":calibration_logger",
+        ":calibration_reader",
+        ":logging_op_resolver",
+        ":node_info_delegate",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":calibrator_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "logging_op_resolver",
+    srcs = ["logging_op_resolver.cc"],
+    hdrs = ["logging_op_resolver.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_test(
+    name = "logging_op_resolver_test",
+    srcs = ["logging_op_resolver_test.cc"],
+    deps = [
+        ":logging_op_resolver",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "calibration_reader",
+    srcs = ["calibration_reader.cc"],
+    hdrs = ["calibration_reader.h"],
+    deps = [
+        ":calibration_logger",
+        "//tensorflow/lite:framework",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "calibration_logger",
+    hdrs = ["calibration_logger.h"],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "calibration_common",
+    hdrs = ["calibration_common.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "node_info_delegate",
+    srcs = ["node_info_delegate.cc"],
+    hdrs = ["node_info_delegate.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "node_info_delegate_test",
+    srcs = ["node_info_delegate_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":node_info_delegate",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/tools/optimize:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration_common.h b/tensorflow/lite/tools/optimize/calibration/calibration_common.h
similarity index 100%
rename from tensorflow/lite/tools/optimize/calibration_common.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_common.h
diff --git a/tensorflow/lite/tools/optimize/calibration_logger.h b/tensorflow/lite/tools/optimize/calibration/calibration_logger.h
similarity index 100%
rename from tensorflow/lite/tools/optimize/calibration_logger.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_logger.h
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.cc b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibration_reader.cc
rename to tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
index b01a62bd6c15dee5b60edf5f3abdd40ba4c3a56b..69e9c5aed8dc3a6a27225fc55d87b900dc9d4730 100644
--- a/tensorflow/lite/tools/optimize/calibration_reader.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
 
 #include "absl/memory/memory.h"
 
diff --git a/tensorflow/lite/tools/optimize/calibration_reader.h b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibration_reader.h
rename to tensorflow/lite/tools/optimize/calibration/calibration_reader.h
index af0da1bb3835493e69ef7a6bccb7149ef14b1db9..0120d841900e4432fcee49e285ade46007bd3660 100644
--- a/tensorflow/lite/tools/optimize/calibration_reader.h
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/lite/context.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/calibrator.cc b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
similarity index 95%
rename from tensorflow/lite/tools/optimize/calibrator.cc
rename to tensorflow/lite/tools/optimize/calibration/calibrator.cc
index 0e817f934618ba7759d23e8a038653834488d2cc..eead4e590f8a42c5362b4efb952511b48e51d2de 100644
--- a/tensorflow/lite/tools/optimize/calibrator.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/calibrator.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
 
 #include <fstream>
 #include <memory>
@@ -30,11 +30,11 @@ limitations under the License.
 #include "tensorflow/lite/op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_util.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
-#include "tensorflow/lite/tools/optimize/calibration_logger.h"
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
 
 namespace tflite {
 namespace optimize {
@@ -199,8 +199,10 @@ std::vector<int> GetLoggableTensorIndices(
   for (auto tensor_index : tensor_indices) {
     auto tensor = tensors->Get(tensor_index);
     auto buffer_index = tensor->buffer();
-    bool has_no_buffer =
-        buffer_index == 0 || (tensor_buffers->Get(buffer_index) == nullptr);
+    const bool has_no_buffer =
+        (tensor_buffers->Get(buffer_index) == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data() == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data()->size() == 0);
     if (has_no_buffer && tensor->type() == tflite::TensorType_FLOAT32) {
       loggable.push_back(tensor_index);
     }
diff --git a/tensorflow/lite/tools/optimize/calibrator.h b/tensorflow/lite/tools/optimize/calibration/calibrator.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/calibrator.h
rename to tensorflow/lite/tools/optimize/calibration/calibrator.h
index ab3cb27eb7518b7327655023739e310e2a6b0249..fb7e03f5ce71f3601d6a1b0f8c912f570f67b1c9 100644
--- a/tensorflow/lite/tools/optimize/calibrator.h
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
similarity index 86%
rename from tensorflow/lite/tools/optimize/calibrator_test.cc
rename to tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
index bbbcc70fae1a775cf49bedd809799d3472e3d060..60e652ec7a1fcc0d3844f0254fa6ff6072a861ce 100644
--- a/tensorflow/lite/tools/optimize/calibrator_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
@@ -16,18 +16,31 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/calibrator.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
+
+namespace {
+tensorflow::string* g_test_model_file = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace calibration {
 namespace {
 
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  if (g_test_model_file) {
+    return FlatBufferModel::BuildFromFile(g_test_model_file->c_str());
+  }
+  return nullptr;
+}
+
 TEST(CalibratorTest, CalibrationStatsAreCollected) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  auto model = ReadModel();
   ASSERT_TRUE(model);
   std::unique_ptr<Interpreter> interpreter;
   std::unique_ptr<CalibrationReader> reader;
@@ -105,8 +118,7 @@ TEST(CalibratorTest, CalibrationStatsAreCollected) {
 }
 
 TEST(CalibratorTest, MultipleInvokes) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  auto model = ReadModel();
   ASSERT_TRUE(model);
   std::unique_ptr<Interpreter> interpreter;
   std::unique_ptr<CalibrationReader> reader;
@@ -183,7 +195,18 @@ TEST(CalibratorTest, MultipleInvokes) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_file = new tensorflow::string(model_file);
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
similarity index 96%
rename from tensorflow/lite/tools/optimize/logging_op_resolver.cc
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
index 7633ebb8dd9d7aee0b8a5befa5d51911f68a7e32..d2a09e898ae213c9a2aaa6e7e26adb6eda638a67 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver.cc
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
 
 #include "absl/memory/memory.h"
 
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver.h b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/logging_op_resolver.h
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
index 58a3a0fe3c08288ccba6881a64b1fd581103da10..af4127e42f76dcdcfff00bee4b811dd20111165d 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver.h
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
 #include "tensorflow/lite/op_resolver.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
similarity index 82%
rename from tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
rename to tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
index 18c29abec65de748184cc24c31d5ddd81ce21b0f..d8d29ad8eff0cea0967a6d0e91e84714b5fbe80f 100644
--- a/tensorflow/lite/tools/optimize/logging_op_resolver_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/mutable_op_resolver.h"
@@ -44,16 +44,16 @@ TfLiteStatus WrappingInvoke(TfLiteContext* context, TfLiteNode* node) {
 
 TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) {
   MutableOpResolver base_resolver;
-  TfLiteRegistration conv_registration = {
-      .prepare = ConvPrepare,
-      .invoke = ConvEval,
-  };
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
 
-  TfLiteRegistration add_registration = {
-      .prepare = AddPrepare,
-      .invoke = AddEval,
-  };
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
   BuiltinOpsSet ops_to_replace = {
       {BuiltinOperator_CONV_2D, /*version*/ 1},
@@ -77,16 +77,16 @@ TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) {
 
 TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) {
   MutableOpResolver base_resolver;
-  TfLiteRegistration conv_registration = {
-      .prepare = ConvPrepare,
-      .invoke = ConvEval,
-  };
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
 
-  TfLiteRegistration add_registration = {
-      .prepare = AddPrepare,
-      .invoke = AddEval,
-  };
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
   BuiltinOpsSet ops_to_replace = {
       {BuiltinOperator_CONV_2D, /*version*/ 1},
@@ -103,16 +103,16 @@ TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) {
 
 TEST(LoggingOpResolverTest, OnlyOpsInReplacementSetAreReplaces) {
   MutableOpResolver base_resolver;
-  TfLiteRegistration conv_registration = {
-      .prepare = ConvPrepare,
-      .invoke = ConvEval,
-  };
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
 
-  TfLiteRegistration add_registration = {
-      .prepare = AddPrepare,
-      .invoke = AddEval,
-  };
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
   base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
   // Only replace conv2d
   BuiltinOpsSet ops_to_replace = {
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
similarity index 88%
rename from tensorflow/lite/tools/optimize/node_info_delegate.cc
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
index ccaa69373fcf55adaef21a948089ea59821ca763..2b9197498b03dad6a37b7370ce2a0d2751ac9bcd 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate.cc
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
 
 namespace tflite {
 namespace optimize {
@@ -33,11 +33,11 @@ TfLiteStatus NodeInfoDelegatePrepare(TfLiteContext* context,
 }  // namespace
 
 TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params) {
-  return {.data_ = params,
-          .Prepare = NodeInfoDelegatePrepare,
-          .CopyFromBufferHandle = nullptr,
-          .CopyToBufferHandle = nullptr,
-          .FreeBufferHandle = nullptr};
+  return {/*data_ */ params,
+          /* Prepare */ NodeInfoDelegatePrepare,
+          /* CopyFromBufferHandle*/ nullptr,
+          /* CopyToBufferHandle*/ nullptr,
+          /* FreeBufferHandle*/ nullptr};
 }
 
 TfLiteStatus NodeInfoDelegateObserver::OnDelegatePrepareCalled(
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate.h b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
similarity index 96%
rename from tensorflow/lite/tools/optimize/node_info_delegate.h
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
index 8ee2ce1978cf87b104518c4b64e84df166cef32d..56f6141f21dc3f807c53ac5e92833597f6cef4a9 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate.h
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
@@ -18,7 +18,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/lite/context.h"
-#include "tensorflow/lite/tools/optimize/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
 
 namespace tflite {
 namespace optimize {
diff --git a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
similarity index 80%
rename from tensorflow/lite/tools/optimize/node_info_delegate_test.cc
rename to tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
index e762d5c0144fe7f37782dca2dc4bca57b1553450..b110174b6325a8daadacfd472e62321ef69425f7 100644
--- a/tensorflow/lite/tools/optimize/node_info_delegate_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
@@ -16,15 +16,32 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/tools/optimize/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace calibration {
 namespace {
 
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  return ReadModel(internal::kConvModelWith0Plus10Weights);
+}
+
 class TestDelegateObserver : public DelegateObserver {
  public:
   explicit TestDelegateObserver(TfLiteStatus status_to_return)
@@ -45,8 +62,7 @@ TEST(NodeInfoDelegateTest, DelegateObserverIsCalled) {
   TestDelegateObserver observer(kTfLiteOk);
   NodeInfoDelegateParams params;
   params.delegate_observer = &observer;
-  auto model = FlatBufferModel::BuildFromFile(
-      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  auto model = ReadModel();
   ASSERT_TRUE(model);
   std::unique_ptr<Interpreter> interpreter;
   ASSERT_EQ(InterpreterBuilder(*model,
@@ -66,8 +82,7 @@ TEST(NodeInfoDelegateTest, ObserverErrorCausesModifyGraphFailure) {
   TestDelegateObserver observer(kTfLiteError);
   NodeInfoDelegateParams params;
   params.delegate_observer = &observer;
-  auto model = FlatBufferModel::BuildFromFile(
-      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  auto model = ReadModel();
   ASSERT_TRUE(model);
   std::unique_ptr<Interpreter> interpreter;
   ASSERT_EQ(InterpreterBuilder(*model,
@@ -81,8 +96,7 @@ TEST(NodeInfoDelegateTest, ObserverErrorCausesModifyGraphFailure) {
 }
 
 TEST(NodeInfoDelegateTest, NodeInfoDelegateObserver) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "third_party/tensorflow/lite/testdata/multi_add.bin");
+  auto model = ReadModel();
   ASSERT_TRUE(model);
 
   std::unordered_map<int, OperatorInfo> index_to_opinfo;
@@ -146,7 +160,19 @@ TEST(NodeInfoDelegateTest, NodeInfoDelegateObserver) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc
index ebac65ed105da6faa6c043d77cc50b0ab4bdad0b..a5b9b00b8a969399ccdde278ab5386b0be03b4a8 100644
--- a/tensorflow/lite/tools/optimize/quantization_utils.cc
+++ b/tensorflow/lite/tools/optimize/quantization_utils.cc
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 #include <cmath>
 #include <cstdint>
@@ -21,6 +26,11 @@ namespace tflite {
 namespace optimize {
 namespace utils {
 
+namespace {
+const int8_t kMinQuantizedValue = -127;
+const int8_t kMaxQuantizedValue = 127;
+}  // namespace
+
 TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements) {
   if (tensor.shape.empty()) {
     return kTfLiteError;
@@ -47,7 +57,11 @@ void GetAsymmetricQuantizationParams(
   min = std::min(static_cast<float>(min), 0.0f);
   max = std::max(static_cast<float>(max), 0.0f);
   const float scale = (max - min) / (quant_max_float - quant_min_float);
-  const float zero_point_from_min = quant_min_float - min / scale;
+  // Scale can be zero if min and max are exactly 0.0f.
+  float zero_point_from_min = quant_min_float;
+  if (scale != 0) {
+    zero_point_from_min = quant_min_float - min / scale;
+  }
   int64_t zero_point;
   if (zero_point_from_min < quant_min_float) {
     zero_point = static_cast<int64_t>(quant_min);
@@ -62,6 +76,127 @@ void GetAsymmetricQuantizationParams(
   quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
 }
 
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value) {
+  const int32_t channel_dim_size = dimension[channel_dim_index];
+  std::vector<float> min_vals(channel_dim_size);
+  std::vector<float> max_vals(channel_dim_size);
+  std::vector<bool> has_min_max_value(channel_dim_size, false);
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+
+  // Compute min max ranges per channel
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          const float val = input[Offset(tensor_dims, indices)];
+          if (has_min_max_value[channel_idx]) {
+            if (min_vals[channel_idx] > val) {
+              min_vals[channel_idx] = val;
+            } else if (max_vals[channel_idx] < val) {
+              max_vals[channel_idx] = val;
+            }
+          } else {
+            min_vals[channel_idx] = val;
+            max_vals[channel_idx] = val;
+            has_min_max_value[channel_idx] = true;
+          }
+        }
+      }
+    }
+  }
+
+  // Calculate scales per channel
+  std::vector<float> scale_invs(channel_dim_size);
+  const float half_scale = kMaxQuantizedValue;
+  for (size_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
+    const float half_range = std::max(std::abs(min_vals[channel_idx]),
+                                      std::abs(max_vals[channel_idx]));
+    output_scales->at(channel_idx) = half_range / half_scale;
+    if (half_range == 0) {
+      scale_invs[channel_idx] = 0;
+    } else {
+      scale_invs[channel_idx] = half_scale / half_range;
+    }
+  }
+
+  // Quantize the values.
+  SymmetricPerChannelQuantizeValues(input, scale_invs, dimension,
+                                    channel_dim_index, output_value);
+}
+
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value) {
+  // Quantize the values.
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          int index = Offset(tensor_dims, indices);
+          const float val = input[index];
+          const int32_t quantized_value =
+              static_cast<int32_t>(TfLiteRound(val * scales_inv[channel_idx]));
+          output_value->at(index) = std::min<int8_t>(
+              kMaxQuantizedValue,
+              std::max<int8_t>(kMinQuantizedValue, quantized_value));
+        }
+      }
+    }
+  }
+}
+
+TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
+  if (model == nullptr || tensor == nullptr) {
+    return kTfLiteError;
+  }
+
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  if (buffer == nullptr) {
+    return kTfLiteError;
+  }
+  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+
+  std::vector<int8_t> quantized_buffer;
+  quantized_buffer.resize(num_elements);
+
+  float min_value, max_value, scaling_factor;
+  tensor_utils::SymmetricQuantizeFloats(float_data, num_elements,
+                                        quantized_buffer.data(), &min_value,
+                                        &max_value, &scaling_factor);
+
+  if (tensor->quantization == nullptr) {
+    tensor->quantization = absl::make_unique<QuantizationParametersT>();
+  }
+  tensor->quantization->scale = std::vector<float>(1, scaling_factor);
+  tensor->quantization->zero_point = std::vector<int64_t>(1, 0);
+
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized_buffer.data());
+  model->buffers[tensor->buffer]->data.assign(uint8_buffer,
+                                              uint8_buffer + num_elements);
+
+  // Update the tensor type.
+  tensor->type = TensorType_INT8;
+
+  return kTfLiteOk;
+}
+
 }  // namespace utils
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.h b/tensorflow/lite/tools/optimize/quantization_utils.h
index 8e05c69a40079894230d8543881ae5124b82015a..010bcb931fb075aae2b60dadad6c9fa28dc6ee81 100644
--- a/tensorflow/lite/tools/optimize/quantization_utils.h
+++ b/tensorflow/lite/tools/optimize/quantization_utils.h
@@ -35,6 +35,35 @@ void GetAsymmetricQuantizationParams(
     float min, float max, const int quant_min, const int quant_max,
     QuantizationParametersT* quantization_params);
 
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+// Parameters:
+// - input is the float input data to be quantized.
+// - dimension is the dimension of the input data. Only supports dimension of
+//   size 4.
+// - channel_dim_index is the channel index within "dimension".
+//   dimension[channel_dim_index] gives the number of channels.
+// - output_scale is the output scale, the size of which equals the number of
+//   channels.
+// - output_value is the output data, the size of which equals the number of
+//   inputs.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value);
+
+// Quantize the values given an array of scales.
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value);
+
+// Quantizes tensor using symmetric quantization with the min and max elements
+// of the tensor.
+TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor);
+
 }  // namespace utils
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
index 482ec70470c5b4e308d2a3f6db7ff83d1dc32992..1562309a9c810a5a5db1eef62e53cea060f7e7e9 100644
--- a/tensorflow/lite/tools/optimize/quantization_utils_test.cc
+++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
@@ -15,12 +15,33 @@ limitations under the License.
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace utils {
 namespace {
 
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadConvModel() {
+  return ReadModel(internal::kConvModelWith0Plus10Weights);
+}
+
+using ::testing::ElementsAreArray;
+
 TEST(QuantizationUtilsTest, NumElements) {
   TensorT tensor;
   tensor.shape = {1, 2, 3, 4};
@@ -124,12 +145,136 @@ TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroInRange) {
   EXPECT_LT(zero_point, quant_max);
 }
 
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroMinMax) {
+  const float float_min = 0;
+  const float float_max = 0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_NEAR(scale, 0, eps);
+  EXPECT_NEAR(zero_point, quant_min, eps);
+  EXPECT_LT(zero_point, quant_max);
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantization) {
+  // Set up an input with [3, 2, 2, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      3.0, 2.0, 5.0,  -2.0, 3.0,  2.0,  5.0,  -2.0,  // Channel 1.
+      1.0, 2.0, 3.0,  4.0,  5.0,  6.0,  7.0,  8.0,   // Channel 2.
+      1.0, 0.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0,  // Channel 3.
+  };
+  const std::vector<int32_t> dimension = {3, 2, 2, 2};
+  const int channel_index = 0;
+
+  // Create holder for output scale and data.
+  std::vector<float> output_scales(3);
+  std::vector<int8_t> output_data(3 * 2 * 2 * 2);
+
+  // Call SymmetricPerChannelQuantization and verify the result.
+  SymmetricPerChannelQuantization(input.data(), dimension, channel_index,
+                                  &output_scales, &output_data);
+  const std::vector<float> expected_output_scales = {0.0393700786, 0.0629921257,
+                                                     0.0472440943};
+  const std::vector<int8_t> expected_output_data = {
+      76, 51, 127, -51, 76,  51,  127,  -51,   // Channel 1.
+      16, 32, 48,  64,  79,  95,  111,  127,   // Channel 2.
+      21, 0,  -21, -42, -64, -85, -106, -127,  // Channel 3.
+  };
+  EXPECT_THAT(output_scales, ElementsAreArray(expected_output_scales));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantizeValues) {
+  // Set up an input with [3, 1, 1, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      13.0, 21.0,  // Channel 1.
+      21.0, 22.0,  // Channel 2.
+      31.0, 40.0,  // Channel 3.
+  };
+  const std::vector<float> scales_inv = {2, 0.5, 3};
+  const std::vector<int32_t> dimension = {3, 1, 1, 2};
+  const int channel_index = 0;
+
+  // Create holder for output data.
+  std::vector<int8_t> output_data(3 * 1 * 1 * 2);
+
+  // Call SymmetricPerChannelQuantizeValues and verify the result.
+  SymmetricPerChannelQuantizeValues(input.data(), scales_inv, dimension,
+                                    channel_index, &output_data);
+  const std::vector<int8_t> expected_output_data = {
+      26, 42,   // Channel 1.
+      11, 11,   // Channel 2.
+      93, 120,  // Channel 3.
+  };
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+TEST(QuantizationUtilsTest, SymmetricQuantizeTensorNullInputs) {
+  EXPECT_EQ(SymmetricQuantizeTensor(nullptr, nullptr), kTfLiteError);
+}
+
+TEST(QuantizationUtilsTest, SymmetricQuantizeTensor) {
+  // Conv model has weights between 0 and 10.
+  // Quantize the weights tensor.
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadConvModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  auto conv_op = subgraph->operators.at(0).get();
+  ASSERT_EQ(model.operator_codes.at(conv_op->opcode_index)->builtin_code,
+            BuiltinOperator_CONV_2D);
+  int32_t weights_tensor_idx = conv_op->inputs[1];
+  TensorT* weights_tensor = subgraph->tensors.at(weights_tensor_idx).get();
+
+  EXPECT_EQ(weights_tensor->type, TensorType_FLOAT32);
+  size_t float_buffer_size =
+      model.buffers.at(weights_tensor->buffer)->data.size();
+
+  EXPECT_EQ(SymmetricQuantizeTensor(&model, weights_tensor), kTfLiteOk);
+
+  size_t quant_buffer_size =
+      model.buffers.at(weights_tensor->buffer)->data.size();
+  EXPECT_EQ(weights_tensor->type, TensorType_INT8);
+  EXPECT_EQ(quant_buffer_size * 4, float_buffer_size);
+}
+
 }  // namespace
 }  // namespace utils
 }  // namespace optimize
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc
index 55a9b1c580a4c08a2f9dabeee527dbc919c74467..bf53e6414a85f711d6fd0e1e5713eb56251145ca 100644
--- a/tensorflow/lite/tools/optimize/quantize_model.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@@ -38,7 +38,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
        subgraph_idx++) {
     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
     internal::SubgraphQuantizer quantizer(model, subgraph, error_reporter);
-    for (int op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
+    for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
       auto status = quantizer.QuantizeOperator(op_idx);
       if (status != kTfLiteOk) {
         OperatorT* op = subgraph->operators[op_idx].get();
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
index 52ed16c0b85416847984b79bdcaef29f02a3a444..cf3eb2dde6c3aa95963178041545b9cd8a1909c7 100644
--- a/tensorflow/lite/tools/optimize/quantize_model_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -142,7 +142,10 @@ int main(int argc, char** argv) {
   };
 
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  CHECK(parse_result) << "Required test_model_file";
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
   g_test_model_dir =
       new tensorflow::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
index b376d465e16b12e3ece963babfbe58d1224ddc0d..c8be07ec33ca14d24b5aecd7f541f9c50cd6bf36 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -34,34 +34,37 @@ namespace optimize {
 namespace {
 
 typedef struct {
-  TensorT* tensor;
+  OperatorT* op;
+  // The index of the op in the operators vector.
+  int32_t op_idx;
   // The index of the tensor to quantize in subgraph->tensors.
-  int32_t tensor_idx;
-  // The index of the tensor of the weight tensor to be quantize in op->inputs.
   int32_t op_input_idx;
-  // True if the tensor supports hybrid evaluation.
-  bool eval_hybrid;
-} TensorInfo;
+} ConsumerOpInfo;
 
 // The default minimum number of elements a weights array must have to be
 // quantized by this transformation.
 const int kWeightsMinNumElementsDefault = 1024;
 
-uint64_t CountTensorConsumers(const ModelT* model, const SubGraphT* subgraph,
-                              int32_t tensor_idx) {
-  uint64_t count = 0;
-  for (int op_idx = 0; op_idx < subgraph->operators.size(); ++op_idx) {
-    const OperatorT* op = subgraph->operators[op_idx].get();
+// Gets the operators that consume tensor_idx.
+std::vector<ConsumerOpInfo> GetTensorConsumers(const ModelT* model,
+                                               const SubGraphT* subgraph,
+                                               int32_t tensor_idx) {
+  // TODO(suharshs): If this proves to be too slow, avoid calling it per tensor,
+  // instead doing one sweep for the entire model.
+  std::vector<ConsumerOpInfo> consumer_ops;
+  for (size_t op_idx = 0; op_idx < subgraph->operators.size(); ++op_idx) {
+    OperatorT* op = subgraph->operators[op_idx].get();
     if (op == nullptr) {
       continue;
     }
-    for (int i = 0; i < op->inputs.size(); ++i) {
+    for (size_t i = 0; i < op->inputs.size(); ++i) {
       if (op->inputs[i] == tensor_idx) {
-        count++;
+        consumer_ops.push_back(
+            {op, static_cast<int>(op_idx), static_cast<int>(i)});
       }
     }
   }
-  return count;
+  return consumer_ops;
 }
 
 // Gets the list of op->inputs indices of the weights inputs to be quantized for
@@ -119,23 +122,39 @@ bool IsHybridEvaluationOp(const OperatorT* op, const BuiltinOperator& op_code) {
   return eval_hybrid;
 }
 
-// Populates a vector of TensorInfos for each input tensor of op that should be
-// quantized.
-TfLiteStatus GetQuantizableTensorsFromOperator(
+// Returns true if all of the op's inputs are quantized.
+bool CheckAllOpInputsQuantized(const SubGraphT* subgraph, const OperatorT* op,
+                               const BuiltinOperator& op_code) {
+  std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
+  for (const int32_t op_input_idx : op_input_indices) {
+    int32_t tensor_idx = op->inputs[op_input_idx];
+
+    if (tensor_idx == -1) {
+      // Optional tensor.
+      continue;
+    }
+
+    TensorT* tensor = subgraph->tensors[tensor_idx].get();
+
+    if (tensor->type != TensorType_INT8) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Inserts Tensors for each input tensor of op that should be
+// quantized into tensor_map.
+TfLiteStatus InsertQuantizableInputTensorsFromOperator(
     const ModelT* model, const OperatorT* op, uint64_t weights_min_num_elements,
-    bool use_hybrid_evaluation, std::vector<TensorInfo>* tensor_infos) {
+    std::unordered_map<int32_t, TensorT*>* tensor_map) {
   SubGraphT* subgraph = model->subgraphs.at(0).get();
   const BuiltinOperator op_code =
       model->operator_codes[op->opcode_index]->builtin_code;
 
-  tensor_infos->clear();
-
-  bool eval_hybrid = use_hybrid_evaluation && IsHybridEvaluationOp(op, op_code);
-
   std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
   for (const int32_t op_input_idx : op_input_indices) {
     int32_t tensor_idx = op->inputs[op_input_idx];
-
     if (tensor_idx == -1) {
       LOG(INFO) << "Skipping optional tensor input " << op_input_idx
                 << " of operation " << EnumNameBuiltinOperator(op_code);
@@ -143,14 +162,6 @@ TfLiteStatus GetQuantizableTensorsFromOperator(
     }
 
     TensorT* tensor = subgraph->tensors[tensor_idx].get();
-    // TODO(suharshs): Support shared weights, i.e. If two tensors share the
-    // same weight array, things may break. (i.e. SSD object detection)
-    if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
-      LOG(INFO) << "Skipping quantization of tensor " << tensor->name
-                << " that is shared between multiple multiple operations.";
-      continue;
-    }
-
     if (tensor->type != TensorType_FLOAT32) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " that is not type float.";
@@ -163,9 +174,6 @@ TfLiteStatus GetQuantizableTensorsFromOperator(
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " because it has fewer than " << weights_min_num_elements
                 << " elements (" << num_elements << ").";
-      // If one of the weights isn't quantized, then we cannot use the hybrid
-      // kernel for this operation, since it expects everything to be quantized.
-      eval_hybrid = false;
       continue;
     }
 
@@ -177,93 +185,8 @@ TfLiteStatus GetQuantizableTensorsFromOperator(
       continue;
     }
 
-    TensorInfo tensor_info;
-    tensor_info.eval_hybrid = eval_hybrid;
-    tensor_info.op_input_idx = op_input_idx;
-    tensor_info.tensor_idx = tensor_idx;
-    tensor_info.tensor = tensor;
-
-    tensor_infos->push_back(tensor_info);
-  }
-
-  return kTfLiteOk;
-}
-
-// Quantizes tensor using asymmetric quantization with the min and max elements
-// of the tensor. This is needed to pass to Dequantize operations.
-TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
-  BufferT* buffer = model->buffers[tensor->buffer].get();
-  float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  uint64_t num_elements;
-  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
-  LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for float evaluation.";
-
-  // Compute the quantization params.
-  float min_value = *std::min_element(float_data, float_data + num_elements);
-  float max_value = *std::max_element(float_data, float_data + num_elements);
-
-  if (tensor->quantization == nullptr) {
-    tensor->quantization = absl::make_unique<QuantizationParametersT>();
-  }
-  utils::GetAsymmetricQuantizationParams(min_value, max_value, 0, 255,
-                                         tensor->quantization.get());
-
-  // Quantize the buffer.
-  std::vector<uint8_t> quantized_buffer;
-  quantized_buffer.resize(num_elements);
-  const double inverse_scale = 1. / tensor->quantization->scale[0];
-  for (std::size_t i = 0; i < num_elements; i++) {
-    const float src_val = float_data[i];
-    double scaled_val;
-    if (tensor->quantization->scale[0] == 0) {
-      scaled_val = tensor->quantization->zero_point[0];
-    } else {
-      scaled_val =
-          tensor->quantization->zero_point[0] + inverse_scale * src_val;
-    }
-    uint8_t integer_val = static_cast<uint8_t>(std::round(scaled_val));
-    quantized_buffer[i] = integer_val;
-  }
-  model->buffers[tensor->buffer]->data = quantized_buffer;
-
-  // Update the tensor type.
-  tensor->type = TensorType_UINT8;
-
-  return kTfLiteOk;
-}
-
-// Quantizes tensor using symmetric quantization with the min and max elements
-// of the tensor. This is need for operations with hybrid evaluation
-// implemented.
-TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
-  BufferT* buffer = model->buffers[tensor->buffer].get();
-  float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  uint64_t num_elements;
-  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
-  LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for hybrid evaluation.";
-
-  std::vector<int8_t> quantized_buffer;
-  quantized_buffer.resize(num_elements);
-
-  float min_value, max_value, scaling_factor;
-  tensor_utils::SymmetricQuantizeFloats(float_data, num_elements,
-                                        quantized_buffer.data(), &min_value,
-                                        &max_value, &scaling_factor);
-
-  if (tensor->quantization == nullptr) {
-    tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    tensor_map->insert({tensor_idx, tensor});
   }
-  tensor->quantization->scale = std::vector<float>(1, scaling_factor);
-  tensor->quantization->zero_point = std::vector<int64_t>(1, 0);
-
-  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(quantized_buffer.data());
-  model->buffers[tensor->buffer]->data.assign(uint8_buffer,
-                                              uint8_buffer + num_elements);
-
-  // Update the tensor type.
-  tensor->type = TensorType_UINT8;
 
   return kTfLiteOk;
 }
@@ -271,7 +194,7 @@ TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
 // Returns the index of the Dequantize op_code.
 // If a Dequantize op_code doesn't exist, adds it and returns its index.
 int32_t GetOrInsertDequantizeOpCodeIndex(ModelT* model) {
-  for (int i = 0; i < model->operator_codes.size(); ++i) {
+  for (size_t i = 0; i < model->operator_codes.size(); ++i) {
     if (model->operator_codes[i]->builtin_code == BuiltinOperator_DEQUANTIZE) {
       return i;
     }
@@ -279,7 +202,8 @@ int32_t GetOrInsertDequantizeOpCodeIndex(ModelT* model) {
   model->operator_codes.push_back(absl::make_unique<OperatorCodeT>());
   int op_code_idx = model->operator_codes.size() - 1;
   model->operator_codes[op_code_idx]->builtin_code = BuiltinOperator_DEQUANTIZE;
-  // TODO(suharshs): How should the version be set in this op_code?
+  // Version 2 and onwards supports INT8 inputs.
+  model->operator_codes[op_code_idx]->version = 2;
 
   // Return the index of the newly placed OperatorCodeT.
   return op_code_idx;
@@ -306,6 +230,26 @@ void MakeTensor(const string& name, const std::vector<int32_t>& shape,
   tensor->reset(tensor_raw);
 }
 
+// Updates operator code versions for the operators with INT8 inputs.
+void UpdateInt8OperatorVersions(ModelT* model) {
+  for (size_t i = 0; i < model->operator_codes.size(); ++i) {
+    const BuiltinOperator& op_code = model->operator_codes[i]->builtin_code;
+    if (op_code == BuiltinOperator_CONV_2D || op_code == BuiltinOperator_SVDF ||
+        op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
+        op_code == BuiltinOperator_RNN ||
+        op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN) {
+      model->operator_codes[i]->version = 2;
+
+    } else if (op_code == BuiltinOperator_FULLY_CONNECTED ||
+               op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM ||
+               op_code == BuiltinOperator_LSTM) {
+      model->operator_codes[i]->version = 3;
+    }
+  }
+}
+
 TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
                                      const Model* input_model,
                                      bool use_hybrid_evaluation,
@@ -323,50 +267,82 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
   SubGraphT* subgraph = model->subgraphs.at(0).get();
 
   std::vector<std::unique_ptr<OperatorT>> new_operators;
-  for (int i = 0; i < subgraph->operators.size(); ++i) {
+  std::unordered_map<int32_t, TensorT*> tensor_map;
+  for (size_t i = 0; i < subgraph->operators.size(); ++i) {
     OperatorT* op = subgraph->operators[i].get();
+    TF_LITE_ENSURE_STATUS(InsertQuantizableInputTensorsFromOperator(
+        model.get(), op, weights_min_num_elements, &tensor_map));
+  }
 
-    std::vector<TensorInfo> tensor_infos;
-    TF_LITE_ENSURE_STATUS(GetQuantizableTensorsFromOperator(
-        model.get(), op, weights_min_num_elements, use_hybrid_evaluation,
-        &tensor_infos));
-
-    for (const TensorInfo& tensor_info : tensor_infos) {
-      if (tensor_info.eval_hybrid) {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            SymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-      } else {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            AsymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-
-        // Create a new tensor to be the output of the dequantize op.
-        std::unique_ptr<TensorT> dequantize_output;
-        MakeTensor(tensor_info.tensor->name + "_dequantize",
-                   tensor_info.tensor->shape, &dequantize_output);
-        const int32_t dequantize_output_idx = subgraph->tensors.size();
-        subgraph->tensors.push_back(std::move(dequantize_output));
-
-        // Create the Dequantize operation.
-        std::unique_ptr<OperatorT> dequantize_op;
-        MakeDequantizeOperator(model.get(), &dequantize_op,
-                               tensor_info.tensor_idx, dequantize_output_idx);
-
-        // Update the op_input of tensor_idx to dequantize_output_idx.
-        op->inputs[tensor_info.op_input_idx] = dequantize_output_idx;
-
-        // Insert the newly created Dequantize operation.
-        new_operators.push_back(std::move(dequantize_op));
+  // The unordered_map ensures that we quantize each tensor exactly once.
+  // TODO(suharshs): This map key isn't sufficient when we support multiple
+  // subgraphs.
+  for (std::pair<int32_t, TensorT*> tensor_pair : tensor_map) {
+    // Quantize the tensor.
+    TF_LITE_ENSURE_STATUS(
+        utils::SymmetricQuantizeTensor(model.get(), tensor_pair.second));
+  }
+
+  // Examine the tensor consumers to determine which require dequantize ops.
+  for (const auto& tensor_pair : tensor_map) {
+    const int32_t tensor_idx = tensor_pair.first;
+    TensorT* tensor = tensor_pair.second;
+    std::vector<ConsumerOpInfo> consumer_op_infos =
+        GetTensorConsumers(model.get(), subgraph, tensor_idx);
+
+    std::vector<ConsumerOpInfo> dequant_op_infos;  // Ops that need dequants.
+    for (ConsumerOpInfo& consumer_op_info : consumer_op_infos) {
+      OperatorT* consumer_op = consumer_op_info.op;
+      const BuiltinOperator consumer_op_code =
+          model->operator_codes[consumer_op->opcode_index]->builtin_code;
+      // If the op is a hybrid op and all the required tensors are quantized,
+      // we have no further work to do, but for all ops that require
+      // dequantization we need to add a Dequantize op.
+      bool eval_hybrid =
+          use_hybrid_evaluation &&
+          IsHybridEvaluationOp(consumer_op, consumer_op_code) &&
+          CheckAllOpInputsQuantized(subgraph, consumer_op, consumer_op_code);
+      if (!eval_hybrid) {
+        dequant_op_infos.push_back(consumer_op_info);
       }
     }
-    // After (maybe) quantizing inputs, we copy the operator into the new list.
-    new_operators.push_back(std::move(subgraph->operators[i]));
+
+    // If no ops require dequant, we are done for this tensor.
+    if (dequant_op_infos.empty()) {
+      continue;
+    }
+
+    // Create a new tensor to be the output of the dequantize op.
+    std::unique_ptr<TensorT> dequantize_output;
+    const string dequant_name = tensor->name + "_dequantize";
+    MakeTensor(dequant_name, tensor->shape, &dequantize_output);
+    const int32_t dequantize_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(dequantize_output));
+
+    // Create the Dequantize operation.
+    std::unique_ptr<OperatorT> dequantize_op;
+    MakeDequantizeOperator(model.get(), &dequantize_op, tensor_idx,
+                           dequantize_output_idx);
+
+    LOG(INFO) << "Creating Dequantize op with name " << dequant_name << ".";
+
+    // Update the op_input of all the ops that need the created dequantize
+    // operation.
+    int32_t min_op_idx = 0;
+    for (ConsumerOpInfo& dequant_op_info : dequant_op_infos) {
+      dequant_op_info.op->inputs[dequant_op_info.op_input_idx] =
+          dequantize_output_idx;
+      min_op_idx = std::min(dequant_op_info.op_idx, min_op_idx);
+    }
+
+    // Insert the newly created Dequantize operation before the earliest
+    // consumer, since TFLite requires operators to be topo-sorted.
+    subgraph->operators.insert(subgraph->operators.begin() + min_op_idx,
+                               std::move(dequantize_op));
   }
 
-  // At this point all unique_ptrs in the original operators are invalid, and
-  // we need to replace it with the new_operators vector.
-  subgraph->operators = std::move(new_operators);
+  // Update the modified operator code versions.
+  UpdateInt8OperatorVersions(model.get());
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model.get());
@@ -380,11 +356,12 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
 namespace internal {
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation) {
   // By default we require that only weights with more than
   // kWeightsMinSizeDefault elements are quantized.
   return QuantizeWeightsInternal(builder, input_model, use_hybrid_evaluation,
-                                 kWeightsMinNumElementsDefault);
+                                 weights_min_num_elements);
 }
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.h b/tensorflow/lite/tools/optimize/quantize_weights.h
index c2c0b0ce83435dc423a62cea598e35ba45a0561f..6baecc210fa0b52ddccace05a3fc7d6a9908712d 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.h
+++ b/tensorflow/lite/tools/optimize/quantize_weights.h
@@ -48,6 +48,7 @@ namespace internal {
 // evaluation disabled.
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation);
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
index 32725e5ee29c364d56754c08a2cb1084ef049fdb..a18b3bb7ffecfa71f24890fb0cbfbdc94d66c0c2 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
@@ -12,215 +12,346 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/quantize_weights.h"
-
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 
-#include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace {
 
-class QuantizeWeightsTest : public ::testing::Test {
+std::unique_ptr<FlatBufferModel> ReadTestModel() {
+  auto model_path = tensorflow::io::JoinPath(
+      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadSharedWeightsTestModel() {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir,
+                                             internal::kModelWithSharedWeights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+template <typename T>
+std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
+  return std::vector<T>(vec->begin(), vec->end());
+}
+
+class QuantizeWeightsTest : public testing::Test {
  protected:
-  int GetElementsNum(const TensorT* tensor) {
-    int tensor_size = 1;
-    for (const int dim : tensor->shape) {
-      tensor_size *= dim;
-    }
-    return tensor_size;
+  QuantizeWeightsTest() {}
+
+  void LoadBasicModel() {
+    input_model_ = ReadTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  const OperatorT* GetOpWithOutput(const SubGraphT* subgraph,
-                                   int32_t output_tensor_idx) {
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      if (std::find(op->outputs.begin(), op->outputs.end(),
-                    output_tensor_idx) != op->outputs.end()) {
-        return op;
-      }
-    }
-    return nullptr;
+  void LoadSharedWeightsModel() {
+    input_model_ = ReadSharedWeightsTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  void SymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                     const BufferT* output_buffer,
-                                     float scale) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const int8_t* output_buffer_data =
-        reinterpret_cast<const int8_t*>(output_buffer->data.data());
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff = input_buffer_data[i] - (output_buffer_data[i] * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  std::unique_ptr<FlatBufferModel> input_model_;
+  const Model* model_;
+
+  bool IsModelInputOrOutput(const Model* model, uint32_t tensor_idx) {
+    for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+         ++subgraph_idx) {
+      const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+      for (size_t i = 0; i < subgraph->inputs()->size(); ++i) {
+        if (subgraph->inputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
+      for (size_t i = 0; i < subgraph->outputs()->size(); ++i) {
+        if (subgraph->outputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
     }
+    return false;
   }
 
-  void AsymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                      const BufferT* output_buffer, float scale,
-                                      int64_t zero_point) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const uint8_t* output_buffer_data = output_buffer->data.data();
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff =
-          input_buffer_data[i] - ((output_buffer_data[i] - zero_point) * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  // Returns the producer op code of the specified tensor_idx.
+  bool GetProducerOpCode(const Model* model, uint32_t subgraph_idx,
+                         uint32_t tensor_idx,
+                         tflite::BuiltinOperator* op_code) {
+    const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+    for (size_t op_idx = 0; op_idx < subgraph->operators()->size(); ++op_idx) {
+      const auto op = subgraph->operators()->Get(op_idx);
+      for (size_t i = 0; i < op->outputs()->size(); ++i) {
+        if (op->outputs()->Get(i) == tensor_idx) {
+          const uint32_t op_code_idx = op->opcode_index();
+          *op_code = model->operator_codes()->Get(op_code_idx)->builtin_code();
+          return true;
+        }
+      }
     }
+    return false;
   }
+};
 
-  void CheckWeights(const Model* input_model_packed,
-                    const Model* output_model_packed,
-                    bool use_hybrid_evaluation,
-                    uint64_t weights_min_num_elements = 1024) {
-    std::unique_ptr<ModelT> input_model;
-    input_model.reset(input_model_packed->UnPack());
-
-    std::unique_ptr<ModelT> output_model;
-    output_model.reset(output_model_packed->UnPack());
-
-    SubGraphT* subgraph = output_model->subgraphs.at(0).get();
-
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      const BuiltinOperator op_code =
-          output_model->operator_codes[op->opcode_index]->builtin_code;
-
-      // These are the operations that should be quantized.
-      // TODO(suharshs): Right now this test only checks the relevant operations
-      // for the mobilenet v1 model used in the tests below.
-      int32_t tensor_idx;
-      if (op_code == BuiltinOperator_CONV_2D ||
-          op_code == BuiltinOperator_DEPTHWISE_CONV_2D ||
-          op_code == BuiltinOperator_FULLY_CONNECTED) {
-        tensor_idx = op->inputs[1];
-      } else {
-        continue;
-      }
+TEST_F(QuantizeWeightsTest, QuantizationSucceeds) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
-      bool eval_hybrid = false;
-      // These are the ops that support hybrid evaluation.
-      if (op_code == BuiltinOperator_FULLY_CONNECTED ||
-          op_code == BuiltinOperator_CONV_2D) {
-        eval_hybrid = true;
-      }
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+}
 
-      const TensorT* tensor = subgraph->tensors[tensor_idx].get();
-      int tensor_size = GetElementsNum(tensor);
-      // If the tensor_size is less than 1024 we expect the tensor to remain
-      // unquantized.
-      if (tensor_size < weights_min_num_elements) {
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32)
-            << tensor->name << " of type " << tensor->type;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        // The weight tensor should not come from a dequantize op.
-        ASSERT_TRUE(preceding_op == nullptr);
-      } else if (use_hybrid_evaluation && eval_hybrid) {
-        // The input to the op should still be uint8.
-        ASSERT_TRUE(tensor->type == TensorType_UINT8) << tensor->name;
-        // The weight tensor should not come from a dequantize op.
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op == nullptr);
-
-        // Test symmetric quantization.
-        SymmetricDequantizeAndCompare(
-            input_model->buffers[tensor->buffer].get(),
-            output_model->buffers[tensor->buffer].get(),
-            tensor->quantization->scale[0]);
+TEST_F(QuantizeWeightsTest, WeightsMinNumElements) {
+  LoadBasicModel();
+  // Make weights_min_size sufficiently large such that no quantization should
+  // happen, i.e. the original model is the same size as the old one.
+  flatbuffers::FlatBufferBuilder builder;
+  const uint64_t kWeightsMinNumElements = 1000000;
+  EXPECT_EQ(QuantizeWeights(&builder, model_, kWeightsMinNumElements),
+            kTfLiteOk);
 
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      // Everything should remain equal between the two graphs.
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      EXPECT_EQ(quant_tensor->type(), float_tensor->type());
+    }
+  }
+}
+
+TEST_F(QuantizeWeightsTest, HybridConv) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
+
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  // Nothing should change.
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    // Make sure the graph only has one Conv operation.
+    ASSERT_EQ(quantized_graph->operators()->size(), 1);
+    const auto op = quantized_graph->operators()->Get(0);
+    const uint32_t op_code_idx = op->opcode_index();
+    ASSERT_EQ(output_model->operator_codes()->Get(op_code_idx)->builtin_code(),
+              BuiltinOperator_CONV_2D);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      // If the tensor is a weight, it should have type INT8, otherwise it
+      // should stay with type FLOAT32.
+      // If the tensor is a bias, it should have type FLOAT32.
+      if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8)
+            << quant_tensor->name()->str();
       } else {
-        // The input to the op should still be float.
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32) << tensor->name;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op != nullptr);
-        // The float input should be the dequantize output.
-        ASSERT_TRUE(output_model->operator_codes[preceding_op->opcode_index]
-                        ->builtin_code == BuiltinOperator_DEQUANTIZE);
-        // Finally, ensure that the input to the dequantize operation is
-        // quantized.
-        const TensorT* quantized_tensor =
-            subgraph->tensors[preceding_op->inputs[0]].get();
-        ASSERT_TRUE(quantized_tensor->type == TensorType_UINT8);
-
-        // Test the assymetric quantization.
-        AsymmetricDequantizeAndCompare(
-            input_model->buffers[quantized_tensor->buffer].get(),
-            output_model->buffers[quantized_tensor->buffer].get(),
-            quantized_tensor->quantization->scale[0],
-            quantized_tensor->quantization->zero_point[0]);
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
       }
     }
   }
-};
-
-TEST_F(QuantizeWeightsTest, SimpleTestWithHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
+}
 
+TEST_F(QuantizeWeightsTest, DequantizeConv) {
+  LoadBasicModel();
   flatbuffers::FlatBufferBuilder builder;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model), kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation=*/false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, true);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    // The output graph should have an extra tensor from the added dequantize
+    // op.
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size() + 1);
+    // Check that a dequantize op exists.
+    int32_t dequant_input_idx = -1;
+    int32_t dequant_output_idx = -1;
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      if (output_model->operator_codes()->Get(op_code_idx)->builtin_code() ==
+          BuiltinOperator_DEQUANTIZE) {
+        dequant_input_idx = op->inputs()->Get(0);
+        dequant_output_idx = op->outputs()->Get(0);
+      }
+    }
+    ASSERT_GT(dequant_input_idx, -1);
+    ASSERT_GT(dequant_output_idx, -1);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); ++i) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      // If the tensor is a weight, it should have type INT8.
+      // If the tensor is a bias, it should have type FLOAT32.
+      // If the tensor is an input or output it should have type FLOAT32.
+      // The input to dequantize should be INT8, and all other tensors should be
+      // FLOAT32.
+      if (i == dequant_input_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else if (i == dequant_output_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        // If its a non-bias constant tensor, is must be the weight.
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      }
+    }
+  }
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithoutHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Hybrid) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Disable hybrid evaluation.
-  EXPECT_EQ(internal::QuantizeWeights(&builder, input_model, false), kTfLiteOk);
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, false);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is now INT8.
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(op->inputs()->Get(1));
+        EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithWeightsMinNumElements) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Dequantize) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Make weights_min_size sufficiently large such that no quantization should
-  // happen, i.e. the original model is the same size as the old one.
-  const uint64_t kWeightsMinNumElements = 1000000;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model, kWeightsMinNumElements),
-            kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation*/ false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-  CheckWeights(input_model, output_model, true, kWeightsMinNumElements);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is still FLOAT
+        // (the output of the dequantize).
+        uint32_t weights_tensor_index = op->inputs()->Get(1);
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(weights_tensor_index);
+        EXPECT_EQ(weights_tensor->type(), TensorType_FLOAT32);
+
+        // Check that it comes from a dequantize operation.
+        BuiltinOperator producer_op_code;
+        ASSERT_TRUE(GetProducerOpCode(output_model, subgraph_idx,
+                                      weights_tensor_index, &producer_op_code));
+        EXPECT_EQ(producer_op_code, BuiltinOperator_DEQUANTIZE);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-// TODO(suharshs): Add tests that run the resulting model.
-
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
index bd6e6e81b29a7649892e0f63b4abef98b1e9ed68..9133c136f325e8d250f479453af486f683e97c63 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
@@ -33,9 +33,6 @@ namespace optimize {
 namespace internal {
 
 namespace {
-const int8_t kMinQuantizedValue = -127;
-const int8_t kMaxQuantizedValue = 127;
-
 TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
                                    const std::vector<int64_t>& zero_point,
                                    int quantized_dimension,
@@ -63,24 +60,24 @@ bool OpHasOptionalBiasTensor(BuiltinOperator op_code) {
 
 struct OpWithBiasTensors {
   int activation_input_index;
-  int bias_input_index;
   int weights_input_index;
+  int bias_input_index;
   int index_for_channel_in_weights;
 };
 
 const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
   if (op_code == BuiltinOperator_CONV_2D) {
-    static OpWithBiasTensors op_info = {.activation_input_index = 0,
-                                        .weights_input_index = 1,
-                                        .bias_input_index = 2,
-                                        .index_for_channel_in_weights = 0};
+    static OpWithBiasTensors op_info = {/* activation_input_index */ 0,
+                                        /* weights_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 0};
     return &op_info;
   }
   if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) {
-    static OpWithBiasTensors op_info = {.activation_input_index = 0,
-                                        .weights_input_index = 1,
-                                        .bias_input_index = 2,
-                                        .index_for_channel_in_weights = 3};
+    static OpWithBiasTensors op_info = {/* bias_input_index */ 0,
+                                        /* bias_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 3};
     return &op_info;
   }
 
@@ -91,88 +88,34 @@ const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
 TfLiteStatus SymmetricPerChannelQuantizeTensor(ModelT* model, TensorT* tensor,
                                                int32_t channel_dim_index,
                                                ErrorReporter* error_reporter) {
-  int32_t channel_dim_size = tensor->shape[channel_dim_index];
   if (tensor->shape.size() != 4) {
     error_reporter->Report("Only dims=4 is supported, tensor dims: %d",
                            tensor->shape.size());
     return kTfLiteError;
   }
 
+  // Get dimensions.
   uint64_t num_elements;
   TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
-  const uint64_t num_elements_per_channel = num_elements / channel_dim_size;
-
-  std::vector<float> min_vals(channel_dim_size);
-  std::vector<float> max_vals(channel_dim_size);
-  std::vector<bool> has_min_max_value(channel_dim_size, false);
-  int indices[4];
-  RuntimeShape tensor_dims{tensor->shape[0], tensor->shape[1], tensor->shape[2],
-                           tensor->shape[3]};
-  BufferT* buffer = model->buffers[tensor->buffer].get();
-  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  const int32_t channel_dim_size = tensor->shape[channel_dim_index];
 
-  // Compute min max ranges per channel
-  for (indices[0] = 0; indices[0] < tensor->shape[0]; indices[0]++) {
-    for (indices[1] = 0; indices[1] < tensor->shape[1]; indices[1]++) {
-      for (indices[2] = 0; indices[2] < tensor->shape[2]; indices[2]++) {
-        for (indices[3] = 0; indices[3] < tensor->shape[3]; indices[3]++) {
-          int channel_idx = indices[channel_dim_index];
-          const float val = float_data[Offset(tensor_dims, indices)];
-          if (has_min_max_value[channel_idx]) {
-            if (min_vals[channel_idx] > val) {
-              min_vals[channel_idx] = val;
-            } else if (max_vals[channel_idx] < val) {
-              max_vals[channel_idx] = val;
-            }
-          } else {
-            min_vals[channel_idx] = val;
-            max_vals[channel_idx] = val;
-            has_min_max_value[channel_idx] = true;
-          }
-        }
-      }
-    }
-  }
+  // Get input float data.
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_input_data = reinterpret_cast<float*>(buffer->data.data());
 
-  // Calculate scales per channel
+  // Create container for output scale and output data.
   std::vector<float> scales(channel_dim_size);
-  std::vector<float> scale_invs(channel_dim_size);
-  const float half_scale = kMaxQuantizedValue;
-  for (size_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
-    const float half_range = std::max(std::abs(min_vals[channel_idx]),
-                                      std::abs(max_vals[channel_idx]));
-    scales[channel_idx] = half_range / half_scale;
-    if (half_range == 0) {
-      scale_invs[channel_idx] = 0;
-    } else {
-      scale_invs[channel_idx] = half_scale / half_range;
-    }
-  }
-
-  // Quantize the values.
-  std::vector<int8_t> quantized_buffer(num_elements_per_channel);
   std::vector<int8_t> final_buffer(num_elements);
-  memset(indices, 0, 4 * sizeof(int));
-  for (indices[0] = 0; indices[0] < tensor->shape[0]; indices[0]++) {
-    for (indices[1] = 0; indices[1] < tensor->shape[1]; indices[1]++) {
-      for (indices[2] = 0; indices[2] < tensor->shape[2]; indices[2]++) {
-        for (indices[3] = 0; indices[3] < tensor->shape[3]; indices[3]++) {
-          int channel_idx = indices[channel_dim_index];
-          int index = Offset(tensor_dims, indices);
-          const float val = float_data[index];
-          const int32_t quantized_value =
-              static_cast<int32_t>(TfLiteRound(val * scale_invs[channel_idx]));
-          final_buffer[index] = std::min<int8_t>(
-              kMaxQuantizedValue,
-              std::max<int8_t>(kMinQuantizedValue, quantized_value));
-        }
-      }
-    }
-  }
+
+  // Quantize the input data with respect to channel_dim_index.
+  const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1],
+                                        tensor->shape[2], tensor->shape[3]};
+  utils::SymmetricPerChannelQuantization(
+      float_input_data, tensor_dims, channel_dim_index, &scales, &final_buffer);
 
   // Set the buffers and output type.
   uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
-  size_t buffer_size = num_elements * sizeof(int8_t);
+  const size_t buffer_size = num_elements * sizeof(int8_t);
   std::vector<int64_t> zero_point(scales.size(), 0);
   return AddQuantizationParams(scales, zero_point, channel_dim_index,
                                uint8_buffer, buffer_size, TensorType_INT8,
@@ -243,14 +186,10 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(const TensorT* input_tensor,
   // Set the buffers and output type.
   uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
   size_t buffer_size = num_elements * sizeof(int32_t);
-  // For Bias we only set the quantized values, the scale and quantized
-  // dimension is implicit.
-  tensor->quantization = nullptr;
-  model->buffers[tensor->buffer]->data.assign(uint8_buffer,
-                                              uint8_buffer + buffer_size);
-
-  tensor->type = TensorType_INT32;
-  return kTfLiteOk;
+  std::vector<int64_t> zero_point(scales.size(), 0);
+  return AddQuantizationParams(scales, zero_point, channel_dim_index,
+                               uint8_buffer, buffer_size, TensorType_INT32,
+                               model, tensor);
 }
 }  // namespace
 
@@ -368,7 +307,7 @@ TfLiteStatus SubgraphQuantizer::PropagateMinMaxForAvgAndMaxPool(
   return kTfLiteOk;
 }
 
-TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSingleInputOutputOp(
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSoftmax(
     BuiltinOperator op_code, OperatorT* op) {
   TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
   TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);
@@ -381,8 +320,34 @@ TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSingleInputOutputOp(
   if (output_tensor->type != TensorType_FLOAT32) {
     return kTfLiteOk;
   }
-  auto quant_params = absl::make_unique<QuantizationParametersT>();
-  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->outputs[0]));
+
+  // Softmax output is hardcoded to have 1/256 as scale and -128 as zero point.
+  output_tensor->type = TensorType_INT8;
+  output_tensor->quantization->scale = {1.0f / 256.0f};
+  output_tensor->quantization->zero_point = {-128};
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeInputsAndOutputs(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE(this->error_reporter_, !op->inputs.empty());
+  TF_LITE_ENSURE(this->error_reporter_, !op->outputs.empty());
+  for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
+    auto input_tensor = subgraph_->tensors[op->inputs[input_idx]].get();
+    if (IsSubgraphInput(op->inputs[input_idx]) &&
+        input_tensor->type == TensorType_FLOAT32) {
+      TF_LITE_ENSURE_STATUS(
+          AsymmetricQuantizeTensor(op_code, op->inputs[input_idx]));
+    }
+  }
+
+  for (size_t output_idx = 0; output_idx < op->outputs.size(); ++output_idx) {
+    auto output_tensor = subgraph_->tensors[op->outputs[output_idx]].get();
+    if (output_tensor->type == TensorType_FLOAT32) {
+      TF_LITE_ENSURE_STATUS(
+          AsymmetricQuantizeTensor(op_code, op->outputs[output_idx]));
+    }
+  }
   return kTfLiteOk;
 }
 
@@ -403,8 +368,11 @@ TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) {
     case BuiltinOperator_MAX_POOL_2D:
       return PropagateMinMaxForAvgAndMaxPool(op_code, op);
     case BuiltinOperator_SQUEEZE:
+    case BuiltinOperator_RESHAPE:
+    case BuiltinOperator_ADD:
+      return AsymmetricQuantizeInputsAndOutputs(op_code, op);
     case BuiltinOperator_SOFTMAX:
-      return AsymmetricQuantizeSingleInputOutputOp(op_code, op);
+      return AsymmetricQuantizeSoftmax(op_code, op);
     default:
       return kTfLiteError;
   }
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.h b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
index 9d6ca7fad594f4831847c2b2f9de5d6bc0be5e6d..217f1b66757741bf382976c18eeb609b52cfd540 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer.h
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
@@ -46,10 +46,15 @@ class SubgraphQuantizer {
   TfLiteStatus PropagateMinMaxForAvgAndMaxPool(BuiltinOperator op_code,
                                                OperatorT* op);
 
-  // Asymmetric quantizes inputs and outputs of an Op that has single input and
-  // single output. E.g. Squeeze.
-  TfLiteStatus AsymmetricQuantizeSingleInputOutputOp(BuiltinOperator op_code,
-                                                     OperatorT* op);
+  // Asymmetric quantizes inputs and outputs of an Softmax Op.
+  // Input is quantized with the min-max range and output is hardcoded to have
+  // 1/256 as scale and -128 as zero point.
+  TfLiteStatus AsymmetricQuantizeSoftmax(BuiltinOperator op_code,
+                                         OperatorT* op);
+
+  // Asymmetric quantizes an Op with multiple inputs and outputs. E.g Add.
+  TfLiteStatus AsymmetricQuantizeInputsAndOutputs(BuiltinOperator op_code,
+                                                  OperatorT* op);
 
   TfLiteStatus AsymmetricQuantizeTensor(BuiltinOperator op_code,
                                         int32_t tensor_idx);
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
index 1ea49da899f64b2c2ff65ddf0540daa33a5b28a4..265686c363b4c5692d99a74ebf176a1a54cce9af 100644
--- a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
@@ -53,6 +53,10 @@ std::unique_ptr<FlatBufferModel> ReadAvgPoolModel() {
   return ReadModel(kSingleAvgPoolModelMinMinus5MaxPlus5);
 }
 
+std::unique_ptr<FlatBufferModel> ReadMultiInputAddWithReshapeModel() {
+  return ReadModel(kMultiInputAddWithReshape);
+}
+
 TEST(SubgraphQuantizerTest, VerifyConvQuantizationWithUnitScale) {
   ASSERT_TRUE(g_test_model_dir);
   ASSERT_FALSE(g_test_model_dir->empty());
@@ -90,16 +94,16 @@ TEST(SubgraphQuantizerTest, VerifyConvQuantizationWithUnitScale) {
 
   ASSERT_TRUE(weights_tensor->quantization);
   const int out_channel_size = weights_tensor->shape[0];
-
-  // Bias tensor doesn't contain quantization info.
-  ASSERT_FALSE(bias_tensor->quantization);
-
+  ASSERT_TRUE(bias_tensor->quantization);
+  ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
   const std::vector<float>& weights_scales =
       weights_tensor->quantization->scale;
 
   const std::vector<int64_t>& weights_zero_points =
       weights_tensor->quantization->zero_point;
 
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
   ASSERT_EQ(weights_scales.size(), out_channel_size);
   ASSERT_EQ(weights_zero_points.size(), out_channel_size);
   ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
@@ -108,6 +112,7 @@ TEST(SubgraphQuantizerTest, VerifyConvQuantizationWithUnitScale) {
 
   for (size_t i = 0; i < out_channel_size; i++) {
     EXPECT_EQ(weights_scales[i], 1);
+    EXPECT_EQ(bias_scales[i], 1);
     EXPECT_EQ(weights_zero_points[i], 0);
   }
 
@@ -188,21 +193,28 @@ TEST(SubgraphQuantizerTest, VerifyConvQuantization) {
 
   ASSERT_TRUE(weights_tensor->quantization);
   const int out_channel_size = weights_tensor->shape[0];
-
-  // Bias tensor doesn't contain quantization info.
-  ASSERT_FALSE(bias_tensor->quantization);
-
+  ASSERT_TRUE(bias_tensor->quantization);
   ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
   const std::vector<float>& weights_scales =
       weights_tensor->quantization->scale;
   const std::vector<int64_t>& weights_zero_points =
       weights_tensor->quantization->zero_point;
 
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
   ASSERT_EQ(weights_scales.size(), out_channel_size);
   ASSERT_EQ(weights_zero_points.size(), out_channel_size);
   ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
   ASSERT_EQ(output_tensor->quantization->scale.size(), 1);
 
+  const float eps = 1e-7;
+
+  // Bias scale should be input * per_channel_weight_scale.
+  for (size_t i = 0; i < out_channel_size; i++) {
+    EXPECT_NEAR(bias_scales[i],
+                input_tensor->quantization->scale[0] * weights_scales[i], eps);
+  }
+
   const auto bias_buffer = model.buffers[bias_tensor->buffer].get();
   ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]);
   const int32_t* bias_values =
@@ -213,10 +225,8 @@ TEST(SubgraphQuantizerTest, VerifyConvQuantization) {
       reinterpret_cast<const float*>(original_bias_buffer->data()->data());
 
   for (size_t i = 0; i < out_channel_size; i++) {
-    const float bias_scale =
-        input_tensor->quantization->scale[0] * weights_scales[i];
-    auto dequantized_value = bias_values[i] * bias_scale;
-    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scale / 2);
+    auto dequantized_value = bias_values[i] * bias_scales[i];
+    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2);
   }
 
   const auto weights_buffer = model.buffers[weights_tensor->buffer].get();
@@ -285,6 +295,7 @@ TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
   ASSERT_EQ(op->outputs.size(), 1);
   auto float_graph = readonly_model->subgraphs()->Get(0);
 
+  // Verify input.
   ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
             TensorType_FLOAT32);
   ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
@@ -300,12 +311,18 @@ TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
   VerifyAsymmetricQuantizationScale(*float_input_quant_params,
                                     *input_quant_params);
 
+  // Verify output.
   auto float_output_quant_params =
       float_graph->tensors()->Get(op->outputs[0])->quantization();
   auto output_quant_params =
       subgraph->tensors[op->outputs[0]]->quantization.get();
-  VerifyAsymmetricQuantizationScale(*float_output_quant_params,
-                                    *output_quant_params);
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+
+  ASSERT_EQ(output_quant_params->scale.size(), 1);
+  ASSERT_EQ(output_quant_params->zero_point.size(), 1);
+  ASSERT_EQ(1.0f / 256.0f, output_quant_params->scale[0]);
+  ASSERT_EQ(-128, output_quant_params->zero_point[0]);
 }
 
 TEST(SubgraphQuantizerTest, VerifyAvgPoolQuantization) {
@@ -365,6 +382,120 @@ TEST(SubgraphQuantizerTest, VerifyAvgPoolQuantization) {
   EXPECT_EQ(input_quant_params->scale[0], output_quant_params->scale[0]);
 }
 
+TEST(SubgraphQuantizerTest, VerifyReshapeQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadMultiInputAddWithReshapeModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  // 2 operators RESHAPE and ADD
+  ASSERT_EQ(subgraph->operators.size(), 2);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+  status = quantizer.QuantizeOperator(1);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  // Verify Reshape is quantized.
+  auto op = subgraph->operators[1].get();
+  ASSERT_EQ(model.operator_codes[op->opcode_index].get()->builtin_code,
+            BuiltinOperator_RESHAPE);
+
+  ASSERT_EQ(op->inputs.size(), 2);
+  ASSERT_EQ(op->outputs.size(), 1);
+
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  auto float_input_quant_params =
+      float_graph->tensors()->Get(op->inputs[0])->quantization();
+  auto input_quant_params =
+      subgraph->tensors[op->inputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                    *input_quant_params);
+
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+  ASSERT_EQ(output_quant_params->min.size(), 1);
+  ASSERT_EQ(output_quant_params->max.size(), 1);
+}
+
+TEST(SubgraphQuantizerTest, VerifyAddQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadMultiInputAddWithReshapeModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  // 2 operators RESHAPE and ADD
+  ASSERT_EQ(subgraph->operators.size(), 2);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+  status = quantizer.QuantizeOperator(1);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  // Verify ADD is quantized.
+  auto op = subgraph->operators[0].get();
+  ASSERT_EQ(model.operator_codes[op->opcode_index].get()->builtin_code,
+            BuiltinOperator_ADD);
+
+  ASSERT_EQ(op->inputs.size(), 2);
+  ASSERT_EQ(op->outputs.size(), 1);
+
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[1])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  for (size_t input_idx = 0; input_idx < 2; ++input_idx) {
+    EXPECT_EQ(subgraph->tensors[op->inputs[input_idx]].get()->type,
+              TensorType_INT8);
+    auto float_input_quant_params =
+        float_graph->tensors()->Get(op->inputs[input_idx])->quantization();
+    auto input_quant_params =
+        subgraph->tensors[op->inputs[input_idx]]->quantization.get();
+    VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                      *input_quant_params);
+  }
+
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+  ASSERT_EQ(output_quant_params->min.size(), 1);
+  ASSERT_EQ(output_quant_params->max.size(), 1);
+}
+
 }  // namespace
 }  // namespace internal
 }  // namespace optimize
@@ -378,7 +509,10 @@ int main(int argc, char** argv) {
   };
 
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  CHECK(parse_result) << "Required test_model_file";
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
   g_test_model_dir =
       new tensorflow::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc
index 350614290185a06e016c02e7d5de9b6a6ad0a50a..e0f18730d5e52caffe08adc065c5e7906405d60e 100644
--- a/tensorflow/lite/tools/optimize/test_util.cc
+++ b/tensorflow/lite/tools/optimize/test_util.cc
@@ -20,16 +20,20 @@ namespace tflite {
 namespace optimize {
 namespace internal {
 const char* kConvModelWithMinus128Plus127Weights =
-    "single_conv_weights_min_minus_127_max_plus_127.tflite";
+    "single_conv_weights_min_minus_127_max_plus_127.bin";
 
 const char* kConvModelWith0Plus10Weights =
-    "single_conv_weights_min_0_max_plus_10.tflite";
+    "single_conv_weights_min_0_max_plus_10.bin";
 
 const char* kSingleSoftmaxModelMinMinus5MaxPlus5 =
-    "single_softmax_min_minus_5_max_plus_5.tflite";
+    "single_softmax_min_minus_5_max_plus_5.bin";
 
 const char* kSingleAvgPoolModelMinMinus5MaxPlus5 =
-    "single_avg_pool_min_minus_5_max_plus_5.tflite";
+    "single_avg_pool_min_minus_5_max_plus_5.bin";
+
+const char* kModelWithSharedWeights = "weight_shared_between_convs.bin";
+
+const char* kMultiInputAddWithReshape = "multi_input_add_reshape.bin";
 
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h
index 1981068acc0649d9b800d093127675ce4979cedd..199769ca55248ae1a3af63029b9fe27b531636d0 100644
--- a/tensorflow/lite/tools/optimize/test_util.h
+++ b/tensorflow/lite/tools/optimize/test_util.h
@@ -42,6 +42,13 @@ extern const char* kSingleSoftmaxModelMinMinus5MaxPlus5;
 // and max in range [-5, 5], not necessarily -5 or +5.
 extern const char* kSingleAvgPoolModelMinMinus5MaxPlus5;
 
+// Test model with a weights variable that is shared between a convolution layer
+// and an add operation.
+extern const char* kModelWithSharedWeights;
+
+// Test model with Add followed by a reshape. Model has 2 inputs for add.
+extern const char* kMultiInputAddWithReshape;
+
 // An error reporter that fails on testing.
 class FailOnErrorReporter : public ErrorReporter {
  public:
diff --git a/tensorflow/lite/tools/optimize/testdata/README.md b/tensorflow/lite/tools/optimize/testdata/README.md
index b6b331b828991299a39b32ca9d2a296a4eb45ade..0a924816f996899f005163f38c38e15c2a051c98 100644
--- a/tensorflow/lite/tools/optimize/testdata/README.md
+++ b/tensorflow/lite/tools/optimize/testdata/README.md
@@ -4,20 +4,24 @@ This directory contains test models for testing quantization.
 
 ## Models
 
-* `single_conv_weights_min_0_max_plus_10.tflite` \
+* `single_conv_weights_min_0_max_plus_10.bin` \
    A floating point model with single convolution where all weights are
    integers between [0, 10] weights are randomly distributed. It is not
    guaranteed that min max for weights are going to appear in each channel.
    All activations have min maxes and activations are in range [0,10].
-* `single_conv_weights_min_minus_127_max_plus_127.tflite` \
+* `single_conv_weights_min_minus_127_max_plus_127.bin` \
    A floating point model with a single convolution where weights of the model
    are all integers that lie in range[-127, 127]. The weights have been put in
    such a way that each channel has at least one weight as -127 and one weight
    as 127. The activations are all in range: [-128, 127].
    This means all bias computations should result in 1.0 scale.
-* `single_softmax_min_minus_5_max_5.tflite` \
+* `single_softmax_min_minus_5_max_5.bin` \
    A floating point model with a single softmax. The input tensor has min
    and max in range [-5, 5], not necessarily -5 or +5.
-* `single_avg_pool_input_min_minus_5_max_5.tflite` \
+* `single_avg_pool_input_min_minus_5_max_5.bin` \
    A floating point model with a single average pool. The input tensor has min
    and max in range [-5, 5], not necessarily -5 or +5.
+* `weight_shared_between_convs.bin` \
+   A floating point model with two convs that have a use the same weight tensor.
+* `multi_input_add_reshape.bin` \
+   A floating point model with two inputs with an add followed by a reshape.
diff --git a/tensorflow/lite/tools/optimize/testdata/multi_input_add_reshape.bin b/tensorflow/lite/tools/optimize/testdata/multi_input_add_reshape.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a9217c52314b4f64d2457e656a1edabe4f645f80
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/multi_input_add_reshape.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a65f39ee29514b27ea3af861c10dd452ab9e5ce2
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..70cbc0620ad7222817cf241030acb98387083154
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29b9f47097d466b65831514cec3a00f19f5cbdf3
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3caba63492e174229ef605bfbb0d2ddeda2ba61d
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c903c82eec32df8aa0d3462262b61daa30fc251
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin differ
diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc
index 02d6e6b23cdd66c9dd87700e4be6bb2cfbee407f..ffc56c19658986e39c1d8085761120b8373c919e 100644
--- a/tensorflow/lite/tools/verifier.cc
+++ b/tensorflow/lite/tools/verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/verifier.h"
 #include <climits>
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/version.h"
@@ -53,7 +54,7 @@ const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
 
 // Verifies string tensor has legit buffer contents that follow the schema
 // defined in lite/string_util.h
-bool VerifyStringTensorBuffer(const Buffer& buffer,
+bool VerifyStringTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                               ErrorReporter* error_reporter) {
   uint32_t buffer_size = buffer.data()->size();
   const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
@@ -61,7 +62,8 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
   uint32_t num_strings = *GetIntPtr(buffer_ptr);
   if (num_strings > kMaxNumString) {
     ReportError(error_reporter,
-                "String tensor has invalid num of string set: %d", num_strings);
+                "String tensor %s has invalid num of string set: %d",
+                tensor.name()->c_str(), num_strings);
     return false;
   }
   uint32_t header_offsets =
@@ -69,9 +71,9 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (buffer_size < header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer requires at least %d bytes, but is "
+                "String tensor %s buffer requires at least %d bytes, but is "
                 "allocated with %d bytes",
-                header_offsets, buffer_size);
+                tensor.name()->c_str(), header_offsets, buffer_size);
     return false;
   }
 
@@ -80,22 +82,24 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer initial offset must be: %d",
-                header_offsets);
+                "String tensor %s buffer initial offset must be: %d",
+                tensor.name()->c_str(), header_offsets);
     return false;
   }
   offset += sizeof(int32_t);
   for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
     int string_offset = *GetIntPtr(buffer_ptr + offset);
     if (string_offset < prev_ptr || string_offset > buffer_size) {
-      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
-                  i);
+      ReportError(error_reporter,
+                  "String tensor %s buffer is invalid: index %d",
+                  tensor.name()->c_str(), i);
       return false;
     }
   }
   if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
-    ReportError(error_reporter, "String tensor buffer last offset must be %d",
-                buffer_size);
+    ReportError(error_reporter,
+                "String tensor %s buffer last offset must be %d",
+                tensor.name()->c_str(), buffer_size);
     return false;
   }
   return true;
@@ -105,10 +109,15 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                                ErrorReporter* error_reporter) {
   uint64_t bytes_required = 1;
+  if (!tensor.shape()) {
+    // Empty tensor. Avoid further checks.
+    return true;
+  }
   for (int dim : *tensor.shape()) {
     bytes_required *= dim;
     if (bytes_required > UINT_MAX) {
-      ReportError(error_reporter, "Tensor dimension overflow");
+      ReportError(error_reporter, "Tensor %s dimension overflow",
+                  tensor.name()->c_str());
       return false;
     }
   }
@@ -116,31 +125,36 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
     case TensorType_FLOAT32:
       bytes_required *= sizeof(float);
       break;
-    case TensorType_INT32:
-      bytes_required *= sizeof(int32_t);
+    case TensorType_INT8:
+      bytes_required *= sizeof(int8_t);
       break;
     case TensorType_UINT8:
       bytes_required *= sizeof(uint8_t);
       break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
     case TensorType_INT64:
       bytes_required *= sizeof(int64_t);
       break;
     case TensorType_FLOAT16:
       // FALLTHROUGH_INTENDED;
     default:
-      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      ReportError(error_reporter, "Tensor %s invalid type: %d",
+                  tensor.name()->c_str(), tensor.type());
       return false;
   }
   if (bytes_required > UINT_MAX) {
-    ReportError(error_reporter, "Tensor dimension overflow");
+    ReportError(error_reporter, "Tensor %s dimension overflow",
+                tensor.name()->c_str());
     return false;
   }
 
   if (bytes_required != buffer.data()->size()) {
     ReportError(
         error_reporter,
-        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
-        bytes_required, buffer.data()->size());
+        "Tensor %s requires %d bytes, but is allocated with %d bytes buffer",
+        tensor.name()->c_str(), bytes_required, buffer.data()->size());
     return false;
   }
   return true;
@@ -166,6 +180,86 @@ bool VerifyOperators(const Vector<Offset<Operator>>& operators,
   return true;
 }
 
+bool IsConstantTensor(const Tensor& tensor, const Model& model) {
+  if (!tensor.buffer() || !model.buffers()) return false;
+  if (tensor.buffer() > 0 && tensor.buffer() < model.buffers()->size()) {
+    auto* buffer = model.buffers()->Get(tensor.buffer());
+    if (buffer && buffer->data()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Performs basic consistency checks on a sub-graph.
+bool VerifySubGraphConsistency(const Model& model, const SubGraph& subgraph,
+                               ErrorReporter* error_reporter) {
+  absl::flat_hash_set<int> subgraph_input_tensors, constant_tensors,
+      variable_tensors, output_tensors;
+  for (int i = 0; i < subgraph.tensors()->Length(); ++i) {
+    const auto* tensor = subgraph.tensors()->Get(i);
+    if (IsConstantTensor(*tensor, model)) {
+      constant_tensors.insert(i);
+    } else if (tensor->is_variable()) {
+      variable_tensors.insert(i);
+    }
+  }
+  for (const int tensor_idx : *subgraph.inputs()) {
+    subgraph_input_tensors.insert(tensor_idx);
+  }
+
+  for (int op_idx = 0; op_idx < subgraph.operators()->Length(); ++op_idx) {
+    const auto* op = subgraph.operators()->Get(op_idx);
+    const auto& opcode = model.operator_codes()->Get(op->opcode_index());
+    // Check for invalid inputs by ensuring all exist in produced_tensors.
+    for (const int input_idx : *op->inputs()) {
+      if (input_idx == kOptionalTensor) continue;
+      if (constant_tensors.find(input_idx) == constant_tensors.end() &&
+          variable_tensors.find(input_idx) == variable_tensors.end() &&
+          subgraph_input_tensors.find(input_idx) ==
+              subgraph_input_tensors.end() &&
+          output_tensors.find(input_idx) == output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Input tensor %d to op %d (%s) is not produced", input_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+    }
+    // Check for cycles/invalid outputs by ensuring that none exist in
+    // produced_tensors.
+    for (const int output_idx : *op->outputs()) {
+      if (constant_tensors.find(output_idx) != constant_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a constant", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (variable_tensors.find(output_idx) != variable_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a variable", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (subgraph_input_tensors.find(output_idx) !=
+                 subgraph_input_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a subgraph input",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (output_tensors.find(output_idx) != output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is an output from "
+                    "another op. There is a cycle in the graph",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+      // This can be an input to a subsequent op.
+      output_tensors.insert(output_idx);
+    }
+  }
+  return true;
+}
+
 bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
   if (!model.subgraphs()) {
     ReportError(error_reporter, "Missing 'subgraphs' section.");
@@ -180,6 +274,10 @@ bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
     if (!VerifyOperators(*subgraph->operators(), error_reporter)) {
       return false;
     }
+
+    if (!VerifySubGraphConsistency(model, *subgraph, error_reporter)) {
+      return false;
+    }
   }
   return true;
 }
@@ -203,14 +301,14 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
         continue;
       }
       if (tensor->buffer() >= model.buffers()->size()) {
-        ReportError(error_reporter, "Invalid tensor buffer index: %d",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s invalid buffer index: %d",
+                    tensor->name(), tensor->buffer());
         return false;
       }
       auto* buffer = model.buffers()->Get(tensor->buffer());
       if (!buffer) {
-        ReportError(error_reporter, "Tensor buffer %d not set",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s buffer %d not set",
+                    tensor->name(), tensor->buffer());
         return false;
       }
 
@@ -218,7 +316,7 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
       // buffers will be allocated by the interpreter at run-time.
       if (buffer->data()) {
         if (tensor->type() == TensorType_STRING) {
-          if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          if (!VerifyStringTensorBuffer(*tensor, *buffer, error_reporter)) {
             return false;
           }
         } else {
diff --git a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc
index 98abafad927ae45cd7de428d0011e234f345dd6e..1c1f764ee24153124c8d38d2c3f97d5f44f8a21b 100644
--- a/tensorflow/lite/tools/verifier_test.cc
+++ b/tensorflow/lite/tools/verifier_test.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/error_reporter.h"
 #include "tensorflow/lite/op_resolver.h"
@@ -25,13 +27,29 @@ limitations under the License.
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/verifier.h"
 #include "tensorflow/lite/version.h"
-#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
 using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 
+class MockErrorReporter : public ErrorReporter {
+ public:
+  MockErrorReporter() : buffer_size_(0) {}
+  int Report(const char* format, va_list args) override {
+    buffer_size_ = vsnprintf(buffer_, kBufferSize, format, args);
+    return buffer_size_;
+  }
+  int GetBufferSize() { return buffer_size_; }
+
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
+ private:
+  static constexpr int kBufferSize = 256;
+  char buffer_[kBufferSize];
+  int buffer_size_;
+};
+
 // Build single subgraph model.
 class TfLiteFlatbufferModelBuilder {
  public:
@@ -54,14 +72,22 @@ class TfLiteFlatbufferModelBuilder {
   }
 
   void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
-                 const std::vector<uint8_t>& buffer, const char* name) {
+                 const std::vector<uint8_t>& buffer, const char* name,
+                 const bool is_variable = false) {
     int buffer_index = 0;
     if (!buffer.empty()) {
       buffer_index = buffers_.size();
       buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
     }
+    if (shape.empty()) {
+      tensors_.push_back(CreateTensorDirect(builder_, /*shape=*/nullptr, type,
+                                            buffer_index, name,
+                                            /*quantization=*/0, is_variable));
+      return;
+    }
     tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
-                                          name, /*quantization=*/0));
+                                          name, /*quantization=*/0,
+                                          is_variable));
   }
 
   void AddOperator(const std::vector<int32_t>& inputs,
@@ -92,13 +118,16 @@ class TfLiteFlatbufferModelBuilder {
 
   bool Verify() {
     return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
-                          resolver_, DefaultErrorReporter());
+                          resolver_, &mock_reporter_);
   }
 
+  string GetErrorString() { return mock_reporter_.GetAsString(); }
+
  private:
   FlatBufferBuilder builder_;
   MutableOpResolver resolver_;
   TfLiteRegistration fake_op_;
+  MockErrorReporter mock_reporter_;
   std::vector<Offset<Operator>> operators_;
   std::vector<Offset<OperatorCode>> operator_codes_;
   std::vector<Offset<Tensor>> tensors_;
@@ -112,8 +141,25 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Missing 'subgraphs' section."));
+}
+
+TEST(VerifyModel, TestEmptyVector) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor({}, TensorType_UINT8, {}, "empty_vector");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {3});
+  ASSERT_TRUE(builder.Verify());
 }
 
 TEST(VerifyModel, TestSimpleModel) {
@@ -127,12 +173,16 @@ TEST(VerifyModel, TestSimpleModel) {
   builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
   builder.FinishModel({0, 1}, {2});
   ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   std::string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size(), MutableOpResolver{},
-                      /*error_reporter=*/nullptr));
+  MockErrorReporter mock_reporter;
+  ASSERT_FALSE(
+      Verify(model.data(), model.size(), MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid flatbuffer format"));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -140,8 +190,11 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid model version 1"));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -153,7 +206,7 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
 
   std::string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
                             builder.GetSize());
-  for (int i = 0; i < model_content.size(); i++) {
+  for (size_t i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
     EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
                         MutableOpResolver{}, DefaultErrorReporter()))
@@ -166,6 +219,9 @@ TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
   builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 6 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
@@ -173,6 +229,9 @@ TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
   builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 2 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeOverflow) {
@@ -181,6 +240,8 @@ TEST(VerifyModel, TestIntTensorShapeOverflow) {
                     "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input dimension overflow"));
 }
 
 TEST(VerifyModel, TensorBufferIsNotValid) {
@@ -203,8 +264,12 @@ TEST(VerifyModel, TensorBufferIsNotValid) {
                            builder.CreateString("SmartReply"), buffers);
 
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(
+      mock_reporter.GetAsString(),
+      ::testing::ContainsRegex("Missing 'operators' section in subgraph."));
 }
 
 TEST(VerifyModel, StringTensorHasInvalidNumString) {
@@ -215,6 +280,11 @@ TEST(VerifyModel, StringTensorHasInvalidNumString) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex(
+          "String tensor input buffer requires at least -2147483640 bytes, "
+          "but is allocated with 18 bytes"));
 }
 
 TEST(VerifyModel, StringTensorOffsetTooSmall) {
@@ -224,6 +294,9 @@ TEST(VerifyModel, StringTensorOffsetTooSmall) {
       {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer initial offset must be: 16"));
 }
 
 TEST(VerifyModel, StringTensorOffsetOutOfRange) {
@@ -233,6 +306,9 @@ TEST(VerifyModel, StringTensorOffsetOutOfRange) {
       {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer is invalid: index 2"));
 }
 
 TEST(VerifyModel, StringTensorIsLargerThanRequired) {
@@ -243,37 +319,144 @@ TEST(VerifyModel, StringTensorIsLargerThanRequired) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer last offset must be 19"));
 }
 
 TEST(VerifyModel, AllOpsAreSupported) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output2");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
-  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "CustomOp");
   builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, UseUnsupportedBuiltinOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_SUB}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex("Unsupported builtin op: ADD, version: 1"));
 }
 
 TEST(VerifyModel, UseUnsupportedCustomOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"NewOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "Not supported");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "Unsupported custom op: Not supported, version: 1"));
+}
+
+TEST(VerifyModel, UnpopulatedInputToOp) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({1, 2}, {3}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  // This tensor will never be populated.
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "invalid_input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 2}, {3});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Input tensor 1 to op 0 (CUSTOM) is not produced",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, MultipleOpsOutputToSameTensor) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
+  // This can't output to "output1", since the first operator does that.
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ(
+      "Output tensor 2 to op 1 (CUSTOM) is an output from another op. "
+      "There is a cycle in the graph",
+      builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAConstantTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be populated with constant value.
+  builder.AddTensor({2, 3}, TensorType_INT32, {1, 2, 3, 4, 5, 6}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a constant",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsSubgraphInput) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  // Output shouldn't be a subgraph input.
+  builder.FinishModel({0, 1, 2}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a subgraph input",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAVariable) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be a variable.
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output", /*variable*/ true);
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a variable",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OpWithOptionalTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({kOptionalTensor, 0, 1}, {2}, BuiltinOperator_CUSTOM,
+                      "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 // TODO(yichengfan): make up malicious files to test with.
diff --git a/tensorflow/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
index 394ab0760b5672978e0638c0ff01a8f00442302c..8bc02eedf68551036cf81eba568118e6f7e32639 100644
--- a/tensorflow/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -301,7 +301,7 @@
         "id": "7BONhYtYocQY"
       },
       "source": [
-        "To quantize the model on export, set the `post_training_quantize` flag:"
+        "To quantize the model on export, set the `optimizations` flag to optimize for size:"
       ]
     },
     {
@@ -313,11 +313,11 @@
         "id": "g8PUvLWDlmmz"
       },
       "outputs": [],
-      "source": [
+     "source": [
         "# Note: If you don't have a recent tf-nightly installed, the\n",
-        "# \"post_training_quantize\" line will have no effect.\n",
+        "# \"optimizations\" line will have no effect.\n",
         "tf.logging.set_verbosity(tf.logging.INFO)\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "tflite_quant_model = converter.convert()\n",
         "tflite_model_quant_file = tflite_models_dir/\"mnist_model_quant.tflite\"\n",
         "tflite_model_quant_file.write_bytes(tflite_quant_model)"
@@ -329,8 +329,8 @@
         "colab_type": "text",
         "id": "PhMmUTl4sbkz"
       },
-      "source": [
-        "Note how the resulting file, with `post_training_quantize` set, is approximately `1/4` the size."
+    "source": [
+        "Note how the resulting file, is approximately `1/4` the size."
       ]
     },
     {
@@ -383,7 +383,7 @@
       "source": [
         "import numpy as np\n",
         "mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()\n",
-        "images, labels = tf.to_float(mnist_test[0])/255.0, mnist_test[1]\n",
+        "images, labels = tf.cast(mnist_test[0], tf.float32)/255.0, mnist_test[1]\n",
         "\n",
         "# Note: If you change the batch size, then use \n",
         "# `tf.lite.Interpreter.resize_tensor_input` to also change it for\n",
@@ -489,7 +489,7 @@
         "plt.imshow(img[0])\n",
         "template = \"True:{true}, predicted:{predict}\"\n",
         "_ = plt.title(template.format(true= str(label[0].numpy()),\n",
-        "                              predict=str(predictions[0,0])))\n",
+        "                              predict=str(predictions[0])))\n",
         "plt.grid(False)"
       ]
     },
@@ -650,7 +650,7 @@
         "output_arrays = [\"output\"]\n",
         "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
         "resnet_tflite_file.write_bytes(converter.convert())\n"
       ]
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 9c4dd2f4b868e837ac19296bdae1404e2193527e..3be09b70f175ce630dce5075cd57645f3b512a33 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -37,6 +37,7 @@ tensorflow/third_party/toolchains/clang6/README.md
 tensorflow/third_party/toolchains/clang6/repo.bzl
 tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
@@ -47,7 +48,6 @@ tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUI
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
-tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
 tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
 tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
 tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
@@ -60,12 +60,6 @@ tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
 tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/BUILD
-tensorflow/third_party/toolchains/gpus/cuda/build_defs.bzl
-tensorflow/third_party/toolchains/gpus/cuda/BUILD
-tensorflow/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
-tensorflow/third_party/toolchains/gpus/crosstool/BUILD
-tensorflow/third_party/toolchains/gpus/crosstool/CROSSTOOL
-tensorflow/third_party/toolchains/gpus/py/BUILD
 tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
 tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/cpus/arm/BUILD
@@ -120,6 +114,7 @@ tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProdu
 tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
 tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
 tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/gpu_packet_math.patch
 tensorflow/third_party/eigen3/LICENSE
 tensorflow/third_party/eigen3/BUILD
 tensorflow/third_party/systemlibs/build_defs.bzl.tpl
@@ -148,7 +143,6 @@ tensorflow/third_party/systemlibs/cython.BUILD
 tensorflow/third_party/systemlibs/double_conversion.BUILD
 tensorflow/third_party/systemlibs/zlib.BUILD
 tensorflow/third_party/systemlibs/jsoncpp.BUILD
-tensorflow/third_party/systemlibs/enum34.BUILD
 tensorflow/third_party/systemlibs/re2.BUILD
 tensorflow/third_party/systemlibs/lmdb.BUILD
 tensorflow/third_party/systemlibs/googleapis.BUILD
@@ -214,6 +208,7 @@ tensorflow/third_party/git/BUILD.tpl
 tensorflow/third_party/git/BUILD
 tensorflow/third_party/git/git_configure.bzl
 tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/enum34.BUILD
 tensorflow/third_party/tflite_mobilenet.BUILD
 tensorflow/third_party/py/BUILD
 tensorflow/third_party/py/BUILD.tpl
@@ -245,5 +240,6 @@ tensorflow/third_party/eigen.BUILD
 tensorflow/stream_executor/build_defs.bzl
 tensorflow/api_template_v1.__init__.py
 tensorflow/compat_template_v1.__init__.py
+tensorflow/compat_template.__init__.py
 tensorflow/api_template.__init__.py
 tensorflow/__init__.py
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index d4e6cbf804d6ac706d54394d8ad5c80b3a0dd1f8..e3c026c81c5f1f3d62cc884bf9662de2474b4e93 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -12,6 +12,7 @@ visibility = [
     "//tensorflow_models:__subpackages__",
     "//tensorflow_model_optimization:__subpackages__",
     "//third_party/py/cleverhans:__subpackages__",
+    "//third_party/py/tensorflow_examples:__subpackages__",
     # TODO(aselle): to pass open source test.
     "//bazel_pip/tensorflow/lite/toco/python:__pkg__",
 ]
@@ -72,6 +73,7 @@ py_library(
     deps = [
         ":no_contrib",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/tpu:tpu_estimator",
     ] + if_not_v2(["//tensorflow/contrib:contrib_py"]),
 )
 
@@ -118,8 +120,10 @@ py_library(
         ":check_ops",
         ":client",
         ":client_testlib",
+        ":clustering_ops",
         ":collective_ops",
         ":cond_v2",
+        ":config",
         ":confusion_matrix",
         ":control_flow_ops",
         ":cudnn_rnn_ops_gen",
@@ -139,9 +143,9 @@ py_library(
         ":lib",
         ":list_ops",
         ":manip_ops",
+        ":map_fn",
         ":math_ops",
         ":metrics",
-        ":mode_keys",
         ":nccl_ops",
         ":nn",
         ":ops",
@@ -172,11 +176,14 @@ py_library(
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
         "//tensorflow/python/compat:v2_compat",
+        "//tensorflow/python/compiler",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:profiler",
+        "//tensorflow/python/eager:profiler_client",
         "//tensorflow/python/eager:remote",
         "//tensorflow/python/module",
         "//tensorflow/python/ops/distributions",
@@ -213,6 +220,7 @@ py_library(
         ":pywrap_tensorflow",
         ":util",
         "//tensorflow/core:protos_all_py",
+        "@absl_py//absl:app",
         "@absl_py//absl/flags",
         "@six_archive//:six",
     ],
@@ -233,7 +241,10 @@ py_library(
     name = "platform_test",
     srcs = ["platform/googletest.py"],
     srcs_version = "PY2AND3",
-    deps = [":platform_benchmark"],
+    deps = [
+        ":platform_benchmark",
+        "@absl_py//absl/testing:absltest",
+    ],
 )
 
 tf_py_test(
@@ -264,7 +275,10 @@ tf_py_test(
         ":client_testlib",
         ":platform",
     ],
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
 )
 
 tf_py_test(
@@ -489,7 +503,7 @@ tf_cc_shared_object(
         "//conditions:default": [
             "-lm",
         ],
-        "//tensorflow:darwin": [],
+        "//tensorflow:macos": [],
         "//tensorflow:windows": [],
     }),
     deps = [
@@ -644,6 +658,7 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:execute",
+        "//tensorflow/tools/docs:doc_controls",
     ],
 )
 
@@ -671,6 +686,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":common_shapes",
+        ":composite_tensor",
+        ":convert_to_constants",
         ":cpp_shape_inference_proto_py",
         ":errors",
         ":framework_fast_tensor_util",
@@ -846,6 +863,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "convert_to_constants",
+    srcs = [
+        "framework/convert_to_constants.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":platform",
+        ":tensor_util",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_library(
     name = "kernels",
     srcs = [
@@ -909,6 +942,24 @@ py_library(
     ],
 )
 
+py_library(
+    name = "map_fn",
+    srcs = ["ops/map_fn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
+        ":tensor_shape",
+        ":util",
+        ":variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "func_graph",
     srcs = ["framework/func_graph.py"],
@@ -945,7 +996,33 @@ tf_py_test(
     additional_deps = [
         ":auto_control_deps",
         ":client_testlib",
+        "//tensorflow/python/keras",
+    ],
+)
+
+py_library(
+    name = "config",
+    srcs = ["framework/config.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_ops",
+        ":util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "config_test",
+    size = "small",
+    srcs = ["framework/config_test.py"],
+    additional_deps = [
+        ":config",
+        ":constant_op",
+        ":client_testlib",
+        ":platform",
+        ":util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -995,6 +1072,18 @@ py_library(
     name = "sparse_tensor",
     srcs = ["framework/sparse_tensor.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":dtypes",
+        ":framework_ops",
+        ":tensor_util",
+    ],
+)
+
+py_library(
+    name = "composite_tensor",
+    srcs = ["framework/composite_tensor.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":dtypes",
         ":framework_ops",
@@ -1002,6 +1091,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "framework_composite_tensor_test",
+    srcs = ["framework/composite_tensor_test.py"],
+    main = "framework/composite_tensor_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform_test",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 # This target is maintained separately from :util to provide separate visibility
 # for legacy users who were granted visibility when the functions were private
 # members of ops.Graph.
@@ -1082,6 +1186,9 @@ py_library(
     name = "framework_test_lib",
     srcs = ["framework/test_util.py"],
     srcs_version = "PY2AND3",
+    visibility = visibility + [
+        "//tensorflow_estimator/python/estimator:__subpackages__",
+    ],
     deps = [
         ":array_ops",
         ":client",
@@ -1142,6 +1249,7 @@ tf_py_test(
     srcs = ["framework/registry_test.py"],
     additional_deps = [
         ":framework_for_generated_wrappers",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
     main = "framework/registry_test.py",
@@ -1197,6 +1305,9 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
     main = "framework/contrib_test.py",
+    tags = [
+        "no_pip",
+    ],
 )
 
 tf_py_test(
@@ -1256,8 +1367,8 @@ py_library(
     ],
 )
 
-cuda_py_tests(
-    name = "framework_function_test",
+cuda_py_test(
+    name = "function_test",
     size = "medium",
     srcs = ["framework/function_test.py"],
     additional_deps = [
@@ -1287,6 +1398,7 @@ cuda_py_tests(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1759,7 +1871,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
-        "//tensorflow/python/training/checkpointable:__pkg__",
+        "//tensorflow/python/training/tracking:__pkg__",
     ],
 )
 
@@ -1932,6 +2044,27 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tpu_ops_gen",
+    visibility = [
+        "//smartass/brain/configure/python:__pkg__",
+        "//tensorflow/contrib/tpu:__pkg__",
+        "//tensorflow/python/tpu:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/core:tpu_configuration_ops_op_lib",
+        "//tensorflow/core:tpu_cross_replica_ops_op_lib",
+        "//tensorflow/core:tpu_embedding_ops_op_lib",
+        "//tensorflow/core:tpu_functional_ops_op_lib",
+        "//tensorflow/core:tpu_heartbeat_ops_op_lib",
+        "//tensorflow/core:tpu_host_compute_ops_op_lib",
+        "//tensorflow/core:tpu_infeed_ops_op_lib",
+        "//tensorflow/core:tpu_ordinal_selector_ops_op_lib",
+        "//tensorflow/core:tpu_outfeed_ops_op_lib",
+        "//tensorflow/core:tpu_replication_ops_op_lib",
+    ],
+)
+
 py_library(
     name = "array_grad",
     srcs = ["ops/array_grad.py"],
@@ -2084,6 +2217,30 @@ tf_py_test(
     ],
 )
 
+py_library(
+    name = "clustering_ops",
+    srcs = ["ops/clustering_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":clustering_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+    ],
+)
+
+tf_py_test(
+    name = "clustering_ops_test",
+    size = "medium",
+    srcs = ["ops/clustering_ops_test.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":clustering_ops",
+        ":framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "collective_ops",
     srcs = ["ops/collective_ops.py"],
@@ -2188,7 +2345,7 @@ py_library(
         ":function_def_to_graph",
         ":functional_ops_gen",
         ":gradients",
-        ":gradients_impl",
+        ":gradients_util",
         ":graph_to_function_def",
         ":pywrap_tensorflow",
         ":util",
@@ -2201,6 +2358,7 @@ py_library(
     name = "while_v2",
     srcs = [
         "ops/while_v2.py",
+        "ops/while_v2_indexed_slices_rewriter.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -2213,7 +2371,7 @@ py_library(
         ":framework_ops",
         ":function_def_to_graph",
         ":functional_ops_gen",
-        ":gradients_impl",
+        ":gradients_util",
         ":list_ops",
         ":tensor_array_ops",
         ":tensor_shape",
@@ -2304,6 +2462,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gradients_impl",
+        ":gradients_util",
         ":unconnected_gradients",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:tape",
@@ -2327,7 +2486,6 @@ py_library(
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_ops",
-        ":functional_ops",
         ":image_grad",
         ":linalg_grad",
         ":linalg_ops",
@@ -2339,15 +2497,34 @@ py_library(
         ":optional_grad",
         ":platform",
         ":random_grad",
-        ":resource_variable_ops",
         ":tensor_array_ops",
+        ":unconnected_gradients",
+        ":util",
+    ],
+)
+
+py_library(
+    name = "gradients_util",
+    srcs = [
+        "ops/gradients_util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":control_flow_util",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":framework_ops",
+        ":functional_ops",
+        ":math_ops",
+        ":platform",
+        ":resource_variable_ops",
         ":tensor_util",
         ":unconnected_gradients",
         ":util",
-        ":variable_scope",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -2489,9 +2666,9 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
-        ":functional_ops",
         ":linalg_ops_gen",
         ":linalg_ops_impl",
+        ":map_fn",
         ":math_ops",
         "//third_party/py/numpy",
     ],
@@ -2639,6 +2816,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "critical_section_ops",
+    srcs = ["ops/critical_section_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":resource_variable_ops_gen",
+        ":tensor_array_ops",
+        ":util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "list_ops",
     srcs = ["ops/list_ops.py"],
@@ -2798,6 +2991,7 @@ cuda_py_test(
         ":logging_ops",
         ":random_ops_gen",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -2923,11 +3117,14 @@ tf_py_test(
     name = "sparse_ops_test",
     srcs = ["ops/sparse_ops_test.py"],
     additional_deps = [
+        ":array_grad",
         ":constant_op",
         ":dtypes",
         ":framework_test_lib",
         ":sparse_ops",
         ":sparse_tensor",
+        ":sparse_grad",
+        ":gradient_checker_v2",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -3050,6 +3247,7 @@ py_library(
         ":clip_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":critical_section_ops",
         ":cudnn_rnn_grad",
         ":data_flow_grad",
         ":data_flow_ops",
@@ -3144,10 +3342,12 @@ py_library(
         ":smart_cond",
         ":summary_op_util",
         ":summary_ops_gen",
+        ":tensor_util",
         ":training_util",
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:profiler",
         "@six_archive//:six",
     ],
 )
@@ -3234,7 +3434,7 @@ py_library(
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -3287,6 +3487,7 @@ cuda_py_test(
         ":framework_test_lib",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3314,6 +3515,7 @@ cuda_py_test(
         ":while_v2",
         "//tensorflow/python/eager:def_function",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3330,6 +3532,7 @@ cuda_py_test(
         ":platform",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3346,6 +3549,7 @@ cuda_py_test(
         ":platform",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3377,6 +3581,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_oss"],  # b/118709825
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3392,6 +3597,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3405,6 +3611,7 @@ cuda_py_test(
         ":image_ops",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3430,6 +3637,7 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3444,6 +3652,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python/eager:context",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3459,6 +3668,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python/eager:context",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3476,6 +3686,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3494,6 +3705,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3513,6 +3725,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3529,6 +3742,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     shard_count = 16,
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -3549,6 +3763,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3563,6 +3778,7 @@ cuda_py_test(
         ":nn_grad",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3579,6 +3795,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -3587,7 +3804,7 @@ py_library(
         ["training/**/*.py"],
         exclude = [
             "**/*test*",
-            "training/checkpointable/**/*.py",
+            "training/tracking/**/*.py",
             "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
@@ -3646,9 +3863,11 @@ py_library(
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras/optimizer_v2:learning_rate_schedule",
         "//tensorflow/python/ops/losses",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:python_state",
+        "//tensorflow/python/training/tracking:util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3728,9 +3947,9 @@ py_library(
         ":variables",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3811,6 +4030,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = visibility + [
         "//tensorflow:__pkg__",
+        "//third_party/py/tf_agents:__subpackages__",
     ],
     deps = [
         "//third_party/py/numpy",
@@ -3849,16 +4069,6 @@ tf_py_test(
     main = "util/serialization_test.py",
 )
 
-tf_py_test(
-    name = "future_api_test",
-    size = "small",
-    srcs = ["util/future_api_test.py"],
-    additional_deps = [
-        ":util",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 tf_py_test(
     name = "function_utils_test",
     srcs = ["util/function_utils_test.py"],
@@ -4051,6 +4261,7 @@ cuda_py_tests(
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cc_library(
@@ -4063,7 +4274,6 @@ cc_library(
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:replay_log_proto_cc",
-        "@com_google_absl//absl/time",
     ],
 )
 
@@ -4143,7 +4353,7 @@ tf_py_wrap_cc(
         "util/transform_graph.i",
         "util/util.i",
     ],
-    # add win_def_file
+    # add win_def_file for pywrap_tensorflow
     win_def_file = select({
         "//tensorflow:windows": ":pywrap_tensorflow_filtered_def_file",
         "//conditions:default": None,
@@ -4201,43 +4411,27 @@ tf_py_wrap_cc(
 # ** Targets for Windows build (start) **
 # We need the following targets to expose symbols from _pywrap_tensorflow.dll
 
-# Build a cc_binary from tf_custom_op_library_additional_deps_impl,
-# it contains all object code from its dependencies.
-tf_native_cc_binary(
-    name = "tf_custom_op_library_additional_deps.so",
-    linkshared = 1,
-    linkstatic = 1,
-    deps = tf_custom_op_library_additional_deps_impl(),
-)
-
-# Get a DEF file generated by parsing all object files
-# of tf_custom_op_library_additional_deps.so
-filegroup(
-    name = "pywrap_tensorflow_def_file",
-    srcs = [":tf_custom_op_library_additional_deps.so"],
-    output_group = "def_file",
-)
-
 # Filter the DEF file to reduce the number of symbols to 64K or less.
 # Note that we also write the name of the pyd file into DEF file so that
 # the dynamic libraries of custom ops can find it at runtime.
 genrule(
     name = "pywrap_tensorflow_filtered_def_file",
-    srcs = [":pywrap_tensorflow_def_file"],
+    srcs = ["//tensorflow:tensorflow_def_file"],
     outs = ["pywrap_tensorflow_filtered_def_file.def"],
     cmd = select({
         "//tensorflow:windows": """
               $(location @local_config_def_file_filter//:def_file_filter) \\
-              --input $(location :pywrap_tensorflow_def_file) \\
+              --input $(location //tensorflow:tensorflow_def_file) \\
               --output $@ \\
               --target _pywrap_tensorflow_internal.pyd
           """,
         "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
     }),
     tools = ["@local_config_def_file_filter//:def_file_filter"],
+    visibility = ["//visibility:public"],
 )
 
-# Get the import library of  _pywrap_tensorflow_internal.dll
+# Get the import library of _pywrap_tensorflow_internal.pyd
 filegroup(
     name = "get_pywrap_tensorflow_import_lib_file",
     srcs = [":_pywrap_tensorflow_internal.so"],
@@ -4446,6 +4640,7 @@ cuda_py_test(
         "no_oss",  # Test flaky due to port collisions.
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -4515,7 +4710,7 @@ tf_py_test(
     ],
     grpc_enabled = True,
     tags = [
-        "no_gpu",
+        "no_gpu",  # b/127001953
         "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
         "no_windows",
     ],
@@ -4604,6 +4799,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    xla_enable_strict_auto_jit = False,  # Graph structure is different with autojit
 )
 
 cuda_py_test(
@@ -4617,7 +4813,11 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
-    tags = ["no_windows_gpu"],
+    tags = [
+        "no_gpu",  # b/127386241
+        "no_windows_gpu",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -4648,6 +4848,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "convert_to_constants_test",
+    size = "small",
+    srcs = ["framework/convert_to_constants_test.py"],
+    additional_deps = [
+        ":convert_to_constants",
+        "client_testlib",
+        "framework_test_lib",
+    ],
+)
+
 tf_py_test(
     name = "bfloat16_test",
     size = "small",
@@ -4697,6 +4908,7 @@ cuda_py_test(
         ":client_testlib",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -4717,6 +4929,7 @@ cuda_py_test(
         ":variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -4732,7 +4945,6 @@ cuda_py_tests(
         "training/ftrl_test.py",
         "training/gradient_descent_test.py",
         "training/learning_rate_decay_test.py",
-        "training/learning_rate_decay_v2_test.py",
         "training/momentum_test.py",
         "training/optimizer_test.py",
         "training/proximal_adagrad_test.py",
@@ -4780,6 +4992,7 @@ cuda_py_tests(
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -4827,6 +5040,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:dataset_ops",
     ],
     tags = ["multi_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -4863,6 +5077,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -4920,6 +5135,7 @@ cuda_py_test(
     ],
     grpc_enabled = True,
     main = "training/session_manager_test.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -4968,6 +5184,7 @@ tf_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     tags = [
+        "no_pip",  # Relies on contrib
         "no_windows",
         "notsan",  # intermittent races on a few percent of runs
     ],
@@ -5058,6 +5275,7 @@ tf_py_test(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
     tags = [
@@ -5124,7 +5342,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":distribute",
         ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
@@ -5153,6 +5370,7 @@ py_library(
         ":summary_ops_gen",
         ":summary_ops_v2",
         ":util",
+        "//tensorflow/python/distribute:summary_op_util",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -5399,7 +5617,7 @@ tf_py_test(
 
 cuda_py_test(
     name = "accumulate_n_benchmark",
-    size = "large",
+    size = "medium",
     srcs = ["ops/accumulate_n_benchmark.py"],
     additional_deps = [
         ":array_ops",
@@ -5414,6 +5632,8 @@ cuda_py_test(
         ":state_ops_gen",
     ],
     main = "ops/accumulate_n_benchmark.py",
+    shard_count = 6,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5434,6 +5654,7 @@ cuda_py_test(
         ":variables",
     ],
     main = "ops/batch_norm_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5451,6 +5672,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     main = "ops/concat_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5464,6 +5686,7 @@ cuda_py_test(
         "//tensorflow/python/eager:function",
     ],
     main = "ops/control_flow_ops_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5484,6 +5707,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     main = "ops/conv2d_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5502,6 +5726,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     main = "ops/split_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5521,6 +5746,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     main = "ops/transpose_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5542,6 +5768,7 @@ cuda_py_test(
         "//tensorflow/core:protos_all_py",
     ],
     main = "ops/matmul_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5564,6 +5791,7 @@ cuda_py_test(
     ],
     main = "ops/matmul_benchmark_test.py",
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5581,6 +5809,7 @@ cuda_py_test(
     ],
     grpc_enabled = True,
     main = "client/session_benchmark.py",
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5594,6 +5823,7 @@ cuda_py_test(
         ":nn_ops",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -5679,6 +5909,8 @@ cuda_py_test(
         "grappler",
         "no_pip",  # tf_optimizer is not available in pip.
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -5789,6 +6021,7 @@ cuda_py_test(
     tags = [
         "grappler",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -5819,6 +6052,8 @@ cuda_py_test(
     tags = [
         "grappler",
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -5871,6 +6106,7 @@ tf_py_test(
         "grappler",
         "no_cuda_on_cpu_tap",
         "no_pip",
+        "nomac",
     ],
 )
 
@@ -5959,29 +6195,6 @@ py_binary(
     ],
 )
 
-py_library(
-    name = "mode_keys",
-    srcs = [
-        "training/mode_keys.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":util",
-    ],
-)
-
-tf_py_test(
-    name = "mode_keys_test",
-    size = "small",
-    srcs = [
-        "training/mode_keys_test.py",
-    ],
-    additional_deps = [
-        ":client_testlib",
-        ":mode_keys",
-    ],
-)
-
 pyx_library(
     name = "framework_fast_tensor_util",
     srcs = ["framework/fast_tensor_util.pyx"],
@@ -5994,3 +6207,12 @@ py_library(
     srcs = ["tf2.py"],
     srcs_version = "PY2AND3",
 )
+
+cuda_py_test(
+    name = "raw_ops_test",
+    srcs = ["ops/raw_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 398fb375e1453866f3f1953a53012aaae2c22dd6..0016b5beaa52610a52fde00cd0b6f8152f0d1f8d 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -62,6 +62,7 @@ from tensorflow.core.util.event_pb2 import *
 # Framework
 from tensorflow.python.framework.framework_lib import *  # pylint: disable=redefined-builtin
 from tensorflow.python.framework.versions import *
+from tensorflow.python.framework import config
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_util
 
@@ -84,12 +85,14 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.module import module
 from tensorflow.python.ops import bitwise_ops as bitwise
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import ragged
 from tensorflow.python.ops import sets
+from tensorflow.python.ops import stateful_random_ops
 from tensorflow.python.ops.distributions import distributions
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.losses import losses
diff --git a/tensorflow/python/autograph/LIMITATIONS.md b/tensorflow/python/autograph/LIMITATIONS.md
index d8b1cb7616ac348981bf2b69d6e2fd8d8a6e6b78..b4e4ca661ad7a4c6d69019ce56a0832fd1cbb03f 100644
--- a/tensorflow/python/autograph/LIMITATIONS.md
+++ b/tensorflow/python/autograph/LIMITATIONS.md
@@ -8,39 +8,39 @@ Python is a large language, so hoping to convert arbitrary Python code directly
 
 Note: as more complex features in TensorFlow are made more accessible using AutoGraph, we expect to come across use cases that haven't been tried before, some of which might reveal rare bugs. If we do find any such bugs, we may add additional restrictions for the affected configurations, until those bugs are resolved.
 
- Construct | Supported now? | Plan to support? | Notes
- :--------- | :--------------: | :----------------: | :-----
-If statement | Yes |  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
-For statement | Yes | | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
-While statement | Yes | | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
-Continue and break | Yes | | Converts to boolean flags and extra predicates in loop tests.
-Composition of control flow | Yes | | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
-Iterators | Some | Yes | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
-Multiple return values | Yes | | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
-Print expression | Yes | | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
-Static function calls | Yes | | Non-recursive function calls
-Nested call trees | Yes | | For example, `f` calls `g` which calls `h`, all of which need conversion.
-Recursive function calls | No | Maybe | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
-Python built-ins | Some | Yes | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
-List operations | Yes | | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
-Function variables | Yes | | e.g. `f_new = f_orig; f_new()`
-Lambda functions | No | Yes | Planned feature.
-Classes | Yes | | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
-Subclasses | Yes | | Subclassing library objects like tf.keras.Model is also supported.
-Dynamic types | Some | | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
-Dynamic code / exec | No | |
-Reflection | No | |
-Try / Except | No | No | No current sane TF equivalent.
-Global variables | Restricted | | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
-Functions with side effects | Some | | Side effects are allowed, under certain circumstances.
-Collections | Some | Yes | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
-List Comprehensions | Yes | | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
-Custom context managers | No | Yes | Currently low priority. Left unconverted currently.
-Generators | No | Maybe | Could be achievable using queues; very low priority.
-Assertions | Yes | | As `tf.Assert`
-Deletion | Yes | Maybe | Currently unconverted. If new semanti cs are required for `del`, we are able to add it in.
-Inline imports | No | Yes | For example, `import numpy as np; np.eye(3)`. Currently low priority.
-Async | No | No |
+Construct                   | Supported now? | Plan to support? | Notes
+:-------------------------- | :------------: | :--------------: | :----
+If statement                | Yes            |                  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
+For statement               | Yes            |                  | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
+While statement             | Yes            |                  | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
+Continue and break          | Yes            |                  | Converts to boolean flags and extra predicates in loop tests.
+Composition of control flow | Yes            |                  | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
+Iterators                   | Some           | Yes              | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
+Multiple return values      | Yes            |                  | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
+Print expression            | Yes            |                  | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
+Static function calls       | Yes            |                  | Non-recursive function calls
+Nested call trees           | Yes            |                  | For example, `f` calls `g` which calls `h`, all of which need conversion.
+Recursive function calls    | No             | Maybe            | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
+Python built-ins            | Some           | Yes              | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
+List operations             | Yes            |                  | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
+Function variables          | Yes            |                  | e.g. `f_new = f_orig; f_new()`
+Lambda functions            | No             | Yes              | Planned feature.
+Classes                     | Yes            |                  | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
+Subclasses                  | Yes            |                  | Subclassing library objects like tf.keras.Model is also supported.
+Dynamic types               | Some           |                  | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
+Dynamic code / exec         | No             |                  |
+Reflection                  | No             |                  |
+Try / Except                | No             | No               | No current sane TF equivalent.
+Global variables            | Restricted     |                  | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
+Functions with side effects | Some           |                  | Side effects are allowed, under certain circumstances.
+Collections                 | Some           | Yes              | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
+List Comprehensions         | Yes            |                  | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
+Custom context managers     | No             | Yes              | Currently low priority. Left unconverted currently.
+Generators                  | No             | Maybe            | Could be achievable using queues; very low priority.
+Assertions                  | Yes            |                  | As `tf.Assert`
+Deletion                    | Yes            | Maybe            | Currently unconverted. If new semantics are required for `del`, we are able to add it in.
+Inline imports              | No             | Yes              | For example, `import numpy as np; np.eye(3)`. Currently low priority.
+Async                       | No             | No               |
 
 ## Extra capabilities
 
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 8fd6e72da62f08375ef86f21d151697ccc9c625f..5fb9bcb74cd8c395c877626ad7c89d64d55c662b 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -36,7 +36,6 @@ from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core.converter import ConversionOptions
 from tensorflow.python.autograph.core.converter import Feature
-from tensorflow.python.autograph.core.converter import Verbosity
 from tensorflow.python.autograph.core.errors import GraphConstructionError
 from tensorflow.python.autograph.core.errors import improved_errors
 from tensorflow.python.autograph.core.errors import TfRuntimeError
@@ -49,8 +48,9 @@ from tensorflow.python.autograph.impl.api import to_graph
 from tensorflow.python.autograph.lang.directives import set_element_type
 from tensorflow.python.autograph.lang.directives import set_loop_options
 from tensorflow.python.autograph.lang.special_functions import stack
+from tensorflow.python.autograph.pyct.errors import AutoGraphError
 from tensorflow.python.autograph.lang.special_functions import tensor_list
-from tensorflow.python.autograph.pyct.transformer import AutoGraphParseError
+from tensorflow.python.autograph.utils import ag_logging
 from tensorflow.python.util.all_util import remove_undocumented
 
 # TODO(mdan): Revisit this list once we finalize the generated code mechanism.
@@ -70,14 +70,13 @@ _allowed_symbols = [
     'improved_errors',
     'GraphConstructionError',
     'TfRuntimeError',
-    'Verbosity',
     # Python language "extensions"
     'set_element_type',
     'set_loop_options',
     'stack',
     'tensor_list',
     # Exceptions
-    'AutoGraphParseError',
+    'AutoGraphError',
     # Utilities: to be removed
     'utils',
 ]
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index 3ac446db02c6ef1946e76a8b549a85c67fed2872..bafc5b0ca7c203255f098f6e03fa8b417b74d4f6 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -25,7 +25,6 @@ py_library(
         "conditional_expressions.py",
         "continue_statements.py",
         "control_flow.py",
-        "decorators.py",
         "directives.py",
         "error_handlers.py",
         "function_scopes.py",
@@ -139,21 +138,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "decorators_test",
-    srcs = ["decorators_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        ":converters",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python/autograph/core:test_lib",
-    ],
-)
-
 py_test(
     name = "directives_test",
     srcs = ["directives_test.py"],
diff --git a/tensorflow/python/autograph/converters/break_statements.py b/tensorflow/python/autograph/converters/break_statements.py
index e4e32ab9761aa13b5a7eefbc297ad3ea79412e99..c2ced26d8d7a40aff052232553ce0d374c0ffc57 100644
--- a/tensorflow/python/autograph/converters/break_statements.py
+++ b/tensorflow/python/autograph/converters/break_statements.py
@@ -53,7 +53,7 @@ class BreakTransformer(converter.Base):
       return block
 
     template = """
-        if not var_name:
+        if ag__.not_(var_name):
           block
       """
     node = templates.replace(
@@ -86,7 +86,7 @@ class BreakTransformer(converter.Base):
 
       template = """
         var_name = False
-        while test and not var_name:
+        while ag__.and_(lambda: test, lambda: ag__.not_(var_name)):
           body
         else:
           orelse
@@ -115,7 +115,7 @@ class BreakTransformer(converter.Base):
       # break did not trigger).
       guarded_orelse = self._guard_if_present(node.orelse, break_var)
       extra_test = templates.replace_as_expression(
-          'not var_name', var_name=break_var)
+          'ag__.not_(var_name)', var_name=break_var)
 
       # The extra test is hidden in the AST, which will confuse the static
       # analysis. To mitigate that, we insert a no-op statement that ensures
diff --git a/tensorflow/python/autograph/converters/builtin_functions_test.py b/tensorflow/python/autograph/converters/builtin_functions_test.py
index 2683be16ec7ffa91b1df3cd272336366502d9f4f..2e6cf16b9c5af5aad32e6746bf7c5503917200dd 100644
--- a/tensorflow/python/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/python/autograph/converters/builtin_functions_test.py
@@ -55,7 +55,9 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
       with self.session() as sess:
         with self.assertPrints('a\n'):
-          sess.run(result.test_fn('a'))
+          sess.run(result.test_fn(constant_op.constant('a')))
+      with self.assertPrints('a\n'):
+        result.test_fn('a')
 
   @test_util.run_deprecated_v1
   def test_print_multiple_values(self):
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index d4eb17e976f6fdf321903a878326e668aeb6ea49..8366e19c050831d1adeb9160016afd5a16527ec3 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,318 +22,118 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
 import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
-from tensorflow.python.util import tf_inspect
-
-
-class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
-  pass
-
-
-# TODO(mdan): Move this to a separate transformer.
-KNOWN_NUMPY_FUNCTIONS = {
-    ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
-}
 
 
-# TODO(mdan): Get rid of these interfaces. Can now depend directly on Namer.
+# TODO(mdan): Rename to FunctionCallsTransformer.
 
 
-class FunctionNamer(object):
-  """Describes the interface for CallTreeTransformer's namer."""
+class _Function(object):
 
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    """Generate the name corresponding to the compiled version of a function.
-
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: Callable, the actual target function, if known.
-      owner_type: Optional object. If present, it indicates that the function is
-          a member of the given type.
-    Returns:
-      string, bool
-    """
-    raise NotImplementedError()
-
-  def compiled_class_name(self, original_fqn, live_entity=None):
-    """Generate the name corresponding to the compiled version of a class.
-
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: The actual target class, if known.
-    Returns:
-      string
-    """
-    raise NotImplementedError()
-
-
-# TODO(mdan): Rename to CallsTransformer.
+  no_root = True
 
 
 class CallTreeTransformer(converter.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def _resolve_decorator_name(self, node):
-    """Used to resolve decorator info."""
-    if isinstance(node, gast.Call):
-      return self._resolve_decorator_name(node.func)
-    if isinstance(node, gast.Name):
-      # TODO(mdan): Add test coverage for this branch.
-      return self.ctx.info.namespace.get(node.id)
-    if isinstance(node, gast.Attribute):
-      parent = self._resolve_decorator_name(node.value)
-      if parent is not None:
-        return getattr(parent, node.attr)
-      return None
-    raise ValueError(node)
-
-  def _try_resolve_target(self, node):
-    """Works for methods of objects of known type."""
-    if anno.hasanno(node, 'live_val'):
-      return anno.getanno(node, 'live_val')
-    if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
-      owner_type = anno.getanno(node, 'type')
-      if hasattr(owner_type, node.attr):
-        return getattr(owner_type, node.attr)
-      else:
-        # TODO(mdan): We should probably return None here rather than an error.
-        raise ValueError('Type "%s" has no attribute "%s". Is it dynamic?' %
-                         (owner_type, node.attr))
-    return None
-
-  def _function_is_compilable(self, target_entity):
-    """Determines whether an entity can be compiled at all."""
-    # TODO(mdan): Expand.
-
-    if target_entity.__module__ is None:
-      # Functions like builtins and NumPy don't expose a module.
-      # Those in general should not be compiled.
-      return False
-
-    if inspect_utils.isbuiltin(target_entity):
-      return False
-
-    if inspect_utils.isnamedtuple(target_entity):
-      # namedtuple doesn't expose its source code, making it uncompilable.
-      return False
-
-    return True
-
-  def _should_compile(self, node, fqn):
-    """Determines whether an entity should be compiled in the context."""
-    # TODO(mdan): Needs cleanup. We should remove the use of fqn altogether.
-    module_name = fqn[0]
-    for mod in self.ctx.program.uncompiled_modules:
-      if module_name.startswith(mod[0] + '.'):
-        return False
-
-    for i in range(1, len(fqn)):
-      if fqn[:i] in self.ctx.program.uncompiled_modules:
-        return False
-
-    target_entity = self._try_resolve_target(node.func)
-
-    if target_entity is not None:
-
-      # Currently, lambdas are always converted.
-      # TODO(mdan): Allow markers of the kind f = ag.do_not_convert(lambda: ...)
-      if inspect_utils.islambda(target_entity):
-        return True
-
-      # This may be reached when "calling" a callable attribute of an object.
-      # For example:
-      #
-      #   self.fc = tf.keras.layers.Dense()
-      #   self.fc()
-      #
-      for mod in self.ctx.program.uncompiled_modules:
-        if target_entity.__module__.startswith(mod[0] + '.'):
-          return False
-
-      # Inspect the target function decorators. If any include a @convert
-      # or @do_not_convert annotation, then they must be called as they are.
-      # TODO(mdan): This may be quite heavy. Perhaps always dynamically convert?
-      # To parse and re-analyze each function for every call site could be quite
-      # wasteful. Maybe we could cache the parsed AST?
-      try:
-        target_node, _ = parser.parse_entity(target_entity)
-        target_node = target_node.body[0]
-      except TypeError:
-        # Functions whose source we cannot access are compilable (e.g. wrapped
-        # to py_func).
-        return True
-
-      # This attribute is set when the decorator was applied before the
-      # function was parsed. See api.py.
-      if hasattr(target_entity, '__ag_compiled'):
-        return False
-
-      for dec in target_node.decorator_list:
-        decorator_fn = self._resolve_decorator_name(dec)
-        if (decorator_fn is not None and
-            self.ctx.program.options.should_strip(decorator_fn)):
-          return False
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    node.args = self.visit(node.args)
+    node.body = self.visit_block(node.body)
 
-    return True
+    if self.state[_Function].level < 2:
+      # Top-level functions lose their decorator because the conversion is
+      # always just-in-time and by the time it happens the decorators are
+      # already set to be applied.
+      node.decorator_list = []
+    else:
+      # Inner functions are converted already, so we insert a decorator to
+      # prevent double conversion. Double conversion would work too, but this
+      # saves the overhead.
+      node.decorator_list.append(
+          parser.parse_expression('ag__.do_not_convert_internal'))
 
-  def _rename_compilable_function(self, node):
-    assert anno.hasanno(node.func, 'live_val')
-    assert anno.hasanno(node.func, 'fqn')
-    target_entity = anno.getanno(node.func, 'live_val')
-    target_fqn = anno.getanno(node.func, 'fqn')
+    if node.returns:
+      node.returns = self.visit(node.returns)
 
-    if anno.hasanno(node, 'is_constructor'):
-      new_name = self.ctx.namer.compiled_class_name(
-          target_fqn, live_entity=target_entity)
-      do_rename = True
-    else:
-      if anno.hasanno(node.func, 'parent_type'):
-        owner_type = anno.getanno(node.func, 'parent_type')
-      else:
-        # Fallback - not reliable.
-        owner_type = inspect_utils.getmethodclass(target_entity)
-      new_name, do_rename = self.ctx.namer.compiled_function_name(
-          target_fqn, live_entity=target_entity, owner_type=owner_type)
+    self.state[_Function].exit()
+    return node
 
-    if do_rename:
-      if target_entity is not None:
-        if tf_inspect.ismethod(target_entity):
-          # The renaming process will transform it into a regular function.
-          # TODO(mdan): Is this complete? How does it work with nested members?
-          node.args = [node.func.value] + node.args
-      node.func = templates.replace_as_expression(
-          'func_name', func_name=new_name)
+  def visit_With(self, node):
+    # Context manager calls (in node.items) are not converted.
+    node.body = self.visit_block(node.body)
     return node
 
-  def _wrap_to_py_func_single_return(self, node, dtype):
-    # TODO(mdan): Properly handle varargs, etc.
-    template = """
-      ag__.utils.wrap_py_func(func, dtype, (args,), kwargs, False)
-    """
-    return templates.replace_as_expression(
-        template,
-        func=node.func,
-        dtype=parser.parse_expression(dtype),
-        args=node.args,
-        kwargs=ast_util.keywords_to_dict(node.keywords))
+  def visit_Call(self, node):
+    # TODO(mdan): Refactor converted_call as a 'Call' operator.
+
+    # Calls to the internal 'ag__' module are never converted (though their
+    # arguments might be).
+    full_name = str(anno.getanno(node.func, anno.Basic.QN, default=''))
+    if full_name.startswith('ag__.'):
+      return self.generic_visit(node)
+    if (full_name == 'print' and
+        not self.ctx.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS)):
+      return self.generic_visit(node)
 
-  def _insert_dynamic_conversion(self, node):
-    """Inlines a dynamic conversion for a dynamic function."""
-    # TODO(mdan): Pass information on the statically compiled functions.
-    # Having access to the statically compiled functions can help avoid
-    # unnecessary compilation.
-    # For example, this would lead to function `a` being compiled twice:
-    #
-    #   def a():
-    #     v = b
-    #     b()
-    #   def b():
-    #     a()
-    #
-    # This is really a problem with recursive calls, which currently can
-    # only be gated by a static condition, and should be rare.
-    # TODO(mdan): It probably makes sense to use dynamic conversion every time.
-    # Before we could convert all the time though, we'd need a reasonable
-    # caching mechanism.
-    template = """
-      ag__.converted_call(func, owner, options, args)
-    """
     if isinstance(node.func, gast.Attribute):
       func = gast.Str(node.func.attr)
       owner = node.func.value
     else:
       func = node.func
       owner = parser.parse_expression('None')
+
+    starred_arg = None
+    normal_args = []
+    for a in node.args:
+      if isinstance(a, gast.Starred):
+        assert starred_arg is None, 'Multiple *args should be impossible.'
+        starred_arg = a
+      else:
+        normal_args.append(a)
+    if starred_arg is None:
+      args = templates.replace_as_expression('(args,)', args=normal_args)
+    else:
+      args = templates.replace_as_expression(
+          '(args,) + tuple(stararg)',
+          stararg=starred_arg.value,
+          args=normal_args)
+
+    kwargs_arg = None
+    normal_keywords = []
+    for k in node.keywords:
+      if k.arg is None:
+        assert kwargs_arg is None, 'Multiple **kwargs should be impossible.'
+        kwargs_arg = k
+      else:
+        normal_keywords.append(k)
+    if kwargs_arg is None:
+      kwargs = ast_util.keywords_to_dict(normal_keywords)
+    else:
+      kwargs = templates.replace_as_expression(
+          'dict(kwargs, **keywords)',
+          kwargs=kwargs_arg.value,
+          keywords=ast_util.keywords_to_dict(normal_keywords))
+
+    template = """
+      ag__.converted_call(func, owner, options, args, kwargs)
+    """
     new_call = templates.replace_as_expression(
         template,
         func=func,
         owner=owner,
         options=self.ctx.program.options.to_ast(
-            self.ctx,
             internal_convert_user_code=self.ctx.program.options.recursive),
-        args=node.args)
-    # TODO(mdan): Improve the template mechanism to better support this.
-    new_call.keywords = node.keywords
-    return new_call
+        args=args,
+        kwargs=kwargs)
 
-  def _visit_decorators(self, decorator_list):
-    if not self.ctx.program.options.uses(converter.Feature.DECORATORS):
-      # When not processing decorators, strip everything that is encountered.
-      return []
-
-    return self.visit_block(decorator_list)
-
-  def visit_FunctionDef(self, node):
-    node.args = self.visit(node.args)
-    node.body = self.visit_block(node.body)
-    node.decorator_list = self._visit_decorators(node.decorator_list)
-    node.returns = self.visit_block(node.returns)
-    return node
-
-  def visit_Call(self, node):
-    if anno.hasanno(node.func, 'live_val'):
-      target_entity = anno.getanno(node.func, 'live_val')
-
-      if anno.hasanno(node.func, 'fqn'):
-        target_fqn = anno.getanno(node.func, 'fqn')
-      else:
-        target_fqn = None
-
-      if self._function_is_compilable(target_entity):
-        if self._should_compile(node, target_fqn):
-          node = self._rename_compilable_function(node)
-        else:
-          node = self.generic_visit(node)
-          return node
-
-      elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
-        # TODO(mdan): Should we replace these with equivalent TF ops instead?
-        node = self._wrap_to_py_func_single_return(
-            node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
-
-      elif inspect_utils.isbuiltin(target_entity):
-        # Note: Any builtin that passed the builtins converter is assumed to be
-        # safe for graph mode.
-        return node
-
-      elif inspect_utils.isnamedtuple(target_entity):
-        # Although not compilable, we assume they are safe for graph mode.
-        node = self.generic_visit(node)
-        return node
-
-      else:
-        # TODO(mdan): Instert dynamic conversion here instead.
-        raise NotImplementedError(
-            'py_func with return values (unknown function)')
-    else:
-      # Special cases
-      # TODO(mdan): These need a systematic review - there may be more.
-
-      # 1. super() calls - these are preserved. The class conversion mechanism
-      # will ensure that they return the correct value.
-      if ast_util.matches(node, parser.parse_expression('super(_)')):
-        return node
-
-      # 2. super().method calls - these are preserved as well, when the
-      # conversion processes the entire class.
-      if (ast_util.matches(node, parser.parse_expression('super(_)._(_)')) and
-          self.ctx.info.owner_type is not None):
-        return node
-
-      node = self._insert_dynamic_conversion(node)
-    return node
+    return new_call
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 454d75d755c7273d11e1f89e4138cd997eb6e49a..654682edc737f8de291f50259c28a51c131aef58 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,147 +18,97 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-import numpy as np
-
 from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class CallTreesTest(converter_testing.TestCase):
 
-  def test_basic(self):
-
-    def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled version.')
-
-    def other_test_fn_1(a):
-      return a + 1
-
-    def test_fn_2(a):
-      return test_fn_1(a) + 1
-
-    ns = {'test_fn_1': test_fn_1}
-    node, ctx = self.prepare(test_fn_2, ns)
-    node = call_trees.transform(node, ctx)
+  def test_normal_function(self):
 
-    with self.compiled(node, ns) as result:
-      new_name, _ = ctx.namer.compiled_function_name(('test_fn_1',))
-      setattr(result, new_name, other_test_fn_1)
-      self.assertEquals(result.test_fn_2(1), 3)
-
-  def test_dynamic_function(self):
-
-    def test_fn_1():
-      raise ValueError('This should be masked by the mock in self.compiled.')
-
-    def test_fn_2(f):
+    def test_fn(f):
       return f() + 3
 
-    with self.converted(test_fn_2, call_trees, {}) as result:
-      # 10 = 7 (from the mock) + 3 (from test_fn_2)
-      self.assertEquals(10, result.test_fn_2(test_fn_1))
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((), {})])
 
-  def test_basic_method(self):
-
-    class TestClass(object):
+  def test_function_with_kwarg(self):
 
-      def test_fn_1(self, a):
-        return a + 1
+    def test_fn(f, a, b):
+      return f(a, c=b) + 3
 
-      def test_fn_2(self, a):
-        return self.test_fn_1(a) + 1
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'c': 2})])
 
-    ns = {'TestClass': TestClass}
-    node, ctx = self.prepare(
-        TestClass.test_fn_2,
-        ns,
-        namer=converter_testing.FakeNoRenameNamer(),
-        arg_types={'self': (TestClass.__name__, TestClass)})
-    node = call_trees.transform(node, ctx)
+  def test_function_with_kwargs_starargs(self):
 
-    with self.compiled(node, ns) as result:
-      tc = TestClass()
-      self.assertEquals(3, result.test_fn_2(tc, 1))
+    def test_fn(f, a, *args, **kwargs):
+      return f(a, *args, **kwargs) + 5
 
-  def test_known_called_lambda(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, *[2, 3], **{'b': 4, 'c': 5}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {'b': 4, 'c': 5})])
 
-    l = lambda x: x
+  def test_function_with_kwargs_starargs_only(self):
 
-    def test_fn(a):
-      return l(a)
+    def f(*unused_args):  # Will not be called.
+      pass
 
-    ns = {'l': l}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(1, result.test_fn(1))
-
-  def test_known_called_namedtuple(self):
-
-    nt = collections.namedtuple('TestNamedTuple', ['a'])
-
-    def test_fn(a):
-      return nt(a)
-
-    ns = {'nt': nt}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(nt(1), result.test_fn(1))
+    def test_fn():
+      args = [1, 2, 3]
+      return f(*args) + 11
 
-  def test_py_func_known_function(self):
+    with self.converted(test_fn, call_trees, {'f': f}) as result:
+      self.assertEquals(
+          result.test_fn(),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 11)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {})])
 
-    def test_fn():
-      return np.random.binomial(2, 0.5)
+  def test_function_with_kwargs_keywords(self):
 
-    with self.converted(test_fn, call_trees, {'np': np},
-                        dtypes.int64) as result:
-      with self.cached_session() as sess:
-        self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
-        self.assertIn(self.evaluate(result.test_fn()), (0, 1, 2))
+    def test_fn(f, a, b, **kwargs):
+      return f(a, b=b, **kwargs) + 5
 
-  def test_uncompiled_modules(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2, **{'c': 3}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'b': 2, 'c': 3})])
 
-    def test_fn(a):
-      a = math_ops.multiply(a, constant_op.constant(2))
-      a = math_ops.add(a, constant_op.constant(1))
-      return a
+  def test_class_method(self):
 
-    ns = {'math_ops': math_ops, 'constant_op': constant_op}
-    node, ctx = self.prepare(
-        test_fn,
-        ns,
-        arg_types=set(((math_ops.__name__,), (constant_op.__name__,))))
-    node = call_trees.transform(node, ctx)
+    class TestClass(object):
 
-    with self.compiled(node, ns) as result:
-      with self.cached_session() as sess:
-        result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(self.evaluate(result_tensor), 3)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-  def test_call_to_decorated_function(self):
+    tc = TestClass()
+    with self.converted(TestClass.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
-    def decorator(f):
-      return f
+  def test_object_method(self):
 
-    @decorator
-    def called_fn(a):
-      return a
+    class TestClass(object):
 
-    def test_fn(a):
-      return called_fn(a)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-    node, ctx = self.prepare(test_fn, {'called_fn': called_fn})
-    node = call_trees.transform(node, ctx)
+    tc = TestClass()
+    with self.converted(tc.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/conditional_expressions.py b/tensorflow/python/autograph/converters/conditional_expressions.py
index a4eef7e6a1f7c162f5fa19891a3466c23dc86fe9..4538b16660c62df84ad796b5d8824901ef226ecb 100644
--- a/tensorflow/python/autograph/converters/conditional_expressions.py
+++ b/tensorflow/python/autograph/converters/conditional_expressions.py
@@ -27,7 +27,8 @@ class ConditionalExpressionTransformer(converter.Base):
 
   def visit_IfExp(self, node):
     return templates.replace_as_expression(
-        'ag__.if_stmt(test, lambda: true_expr, lambda: false_expr)',
+        '''ag__.if_stmt(test, lambda: true_expr,
+                        lambda: false_expr, lambda: (), lambda _: None)''',
         test=node.test,
         true_expr=node.body,
         false_expr=node.orelse)
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index 05e19e59fc6701db618e925e1d305f299b270e33..780f837fa3966c68383ab0ba4acdfcb7b221d005 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -29,11 +29,17 @@ class _Continue(object):
   def __init__(self):
     self.used = False
     self.control_var_name = None
-    self.create_guard = False
-    self.guard_created = False
 
   def __repr__(self):
-    return 'used: %s, var: %s' % (self.used, self.control_var_name)
+    return '<_Continue(used: {}, var: {})>'.format(self.used,
+                                                   self.control_var_name)
+
+
+class _Block(object):
+
+  def __init__(self):
+    self.guard_created = False
+    self.create_guard = False
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
@@ -68,17 +74,17 @@ class ContinueCanonicalizationTransformer(converter.Base):
     #    |                #         created if node)
 
     if self.state[_Continue].used:
-      if self.state[_Continue].guard_created:
+      if self.state[_Block].guard_created:
         return node, None
 
-      elif not self.state[_Continue].create_guard:
-        self.state[_Continue].create_guard = True
+      elif not self.state[_Block].create_guard:
+        self.state[_Block].create_guard = True
         return node, None
 
       else:
-        self.state[_Continue].guard_created = True
+        self.state[_Block].guard_created = True
         template = """
-          if not var_name:
+          if ag__.not_(var_name):
             original_node
         """
         cond, = templates.replace(
@@ -90,6 +96,7 @@ class ContinueCanonicalizationTransformer(converter.Base):
 
   def _visit_loop_body(self, node, nodes):
     self.state[_Continue].enter()
+    self.state[_Block].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
     self.state[_Continue].control_var_name = continue_var
@@ -103,14 +110,21 @@ class ContinueCanonicalizationTransformer(converter.Base):
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
+    self.state[_Block].exit()
     self.state[_Continue].exit()
     return nodes
 
+  def _visit_non_loop_body(self, nodes):
+    self.state[_Block].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    self.state[_Block].exit()
+    return nodes
+
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -118,7 +132,29 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_If(self, node):
+    node.body = self.visit_block(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_non_loop_body(node.body)
+    return node
+
+  def visit_Try(self, node):
+    node.body = self._visit_non_loop_body(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    # In Python 3.8 and later continue is allowed in finally blocks
+    node.finalbody = self._visit_non_loop_body(node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_non_loop_body(node.body)
     return node
 
 
diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py
index d6aaa504436aa13007142bc87623605be15667d2..5a1828e3189db7c2ae81991951d153074ff4904c 100644
--- a/tensorflow/python/autograph/converters/continue_statements_test.py
+++ b/tensorflow/python/autograph/converters/continue_statements_test.py
@@ -20,15 +20,15 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.eager import context as tfe_ctx
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
 class ContinueCanonicalizationTest(converter_testing.TestCase):
 
   def assertTransformedEquivalent(self, test_fn, *inputs):
-    with self.converted(test_fn, continue_statements, {},
+    with self.converted(test_fn, continue_statements, {'ops': ops},
                         constant_op.constant) as result:
       self.assertEqual(test_fn(*inputs), result.test_fn(*inputs))
 
@@ -43,11 +43,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_for_loop(self):
 
@@ -60,11 +59,89 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, [])
-      self.assertTransformedEquivalent(test_fn, [1])
-      self.assertTransformedEquivalent(test_fn, [2])
-      self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+    self.assertTransformedEquivalent(test_fn, [])
+    self.assertTransformedEquivalent(test_fn, [1])
+    self.assertTransformedEquivalent(test_fn, [2])
+    self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+
+  def test_nested_with(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_statements(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_nested_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          with ops.name_scope(''):
+            v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_nested(self):
 
@@ -83,11 +160,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u, w
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index a39a0b0cdb16280312b830c9c9bbe78c06ab77b0..c8dde8095068b5b4d3b34f7b1832e3aaf718e1a2 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -23,7 +23,6 @@ import gast
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis import annos
 
@@ -89,23 +88,33 @@ class ControlFlowTransformer(converter.Base):
       return templates.replace(
           template, body_name=body_name, body=body, return_stmt=return_stmt)
 
-  def _create_cond_expr(self, results, test, body_name, orelse_name):
+  def _create_cond_expr(self, results, test, body_name, orelse_name,
+                        state_getter_name,
+                        state_setter_name):
     if results is not None:
       template = """
-        results = ag__.if_stmt(test, body_name, orelse_name)
+        results = ag__.if_stmt(test, body_name, orelse_name,
+                               state_getter_name, state_setter_name)
       """
       return templates.replace(
           template,
           test=test,
           results=results,
           body_name=body_name,
-          orelse_name=orelse_name)
+          orelse_name=orelse_name,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name)
     else:
       template = """
-        ag__.if_stmt(test, body_name, orelse_name)
+        ag__.if_stmt(test, body_name, orelse_name, getter_name, setter_name)
       """
       return templates.replace(
-          template, test=test, body_name=body_name, orelse_name=orelse_name)
+          template,
+          test=test,
+          body_name=body_name,
+          orelse_name=orelse_name,
+          getter_name=state_getter_name,
+          setter_name=state_setter_name)
 
   def _fmt_symbols(self, symbol_set):
     if not symbol_set:
@@ -139,6 +148,47 @@ class ControlFlowTransformer(converter.Base):
           block_live_in.add(s)
     return scope.modified & node_defined_in & block_live_in
 
+  def _create_state_functions(self, composites,
+                              state_getter_name, state_setter_name):
+    if composites:
+      composite_tuple = tuple(composites)
+      template = """
+        def state_getter_name():
+          return composite_tuple,
+        def state_setter_name(vals):
+          composite_tuple, = vals
+      """
+      node = templates.replace(
+          template,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name,
+          composite_tuple=composite_tuple)
+    else:
+      template = """
+        def state_getter_name():
+          return ()
+        def state_setter_name(_):
+          pass
+        """
+      node = templates.replace(
+          template,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name)
+
+    return node
+
+  def _create_undefined_assigns(self, undefined_symbols):
+    assignments = []
+    for s in undefined_symbols:
+      template = '''
+        var = ag__.Undefined(symbol_name)
+      '''
+      assignments += templates.replace(
+          template,
+          var=s,
+          symbol_name=gast.Str(s.ssf()))
+    return assignments
+
   def visit_If(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE)
@@ -157,14 +207,17 @@ class ControlFlowTransformer(converter.Base):
 
     modified_in_cond = body_scope.modified | orelse_scope.modified
     returned_from_cond = set()
+    composites = set()
     for s in modified_in_cond:
       if s in live_out:
         returned_from_cond.add(s)
-      elif s.is_composite():
-        # Special treatment for compound objects: if any of their owner entities
-        # are live, then they are outputs as well.
-        if live_out & s.owner_set:
-          returned_from_cond.add(s)
+      if s.is_composite():
+        # Special treatment for compound objects, always return them.
+        # This allows special handling within the if_stmt itself.
+        # For example, in TensorFlow we need to restore the state of composite
+        # symbols to ensure that only effects from the executed branch are seen.
+        returned_from_cond.add(s)
+        composites.add(s)
 
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
@@ -173,14 +226,12 @@ class ControlFlowTransformer(converter.Base):
         s for s in created_in_body if not s.is_composite())
     basic_created_in_orelse = tuple(
         s for s in created_in_orelse if not s.is_composite())
-    if basic_created_in_body != basic_created_in_orelse:
-      raise ValueError(
-          'if statement may not initialize all variables: the true branch'
-          ' creates %s, while the false branch creates %s. Make sure all'
-          ' these variables are initialized either in both'
-          ' branches or before the if statement.' %
-          (self._fmt_symbols(basic_created_in_body),
-           self._fmt_symbols(basic_created_in_orelse)))
+
+    # These variables are defined only in a single branch. This is fine in
+    # Python so we pass them through. Another backend, e.g. Tensorflow, may need
+    # to handle these cases specially or throw an Error.
+    possibly_undefined = (set(basic_created_in_body) ^
+                          set(basic_created_in_orelse))
 
     # Alias the closure variables inside the conditional functions, to allow
     # the functions access to the respective variables.
@@ -205,6 +256,9 @@ class ControlFlowTransformer(converter.Base):
     cond_var_name = self.ctx.namer.new_symbol('cond', body_scope.referenced)
     body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
     orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
+    all_referenced = body_scope.referenced | orelse_scope.referenced
+    state_getter_name = self.ctx.namer.new_symbol('get_state', all_referenced)
+    state_setter_name = self.ctx.namer.new_symbol('set_state', all_referenced)
 
     returned_from_cond = tuple(returned_from_cond)
     if returned_from_cond:
@@ -247,10 +301,16 @@ class ControlFlowTransformer(converter.Base):
         aliased_new_names=aliased_orelse_new_names,
         body=node_orelse,
         returns=returned_from_orelse)
+    undefined_assigns = self._create_undefined_assigns(possibly_undefined)
+    composite_defs = self._create_state_functions(
+        composites, state_getter_name, state_setter_name)
+
     cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name,
-                                       orelse_name)
+                                       orelse_name, state_getter_name,
+                                       state_setter_name)
 
-    return cond_assign + body_def + orelse_def + cond_expr
+    return (undefined_assigns + cond_assign + composite_defs + body_def +
+            orelse_def + cond_expr)
 
   def _get_loop_state(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
@@ -259,35 +319,40 @@ class ControlFlowTransformer(converter.Base):
     live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
     reserved_symbols = body_scope.referenced
 
-    # Note that it doesn't matter whether the variables are live after the loop.
-    # If the loop modifies them nonlocally (e.g. the result of an iteration
-    # depends on the previous iteration), then they need to be included in
-    # the loop state, regardless of whether they are later used or not.
-    loop_state = body_scope.modified & live_in
-
+    loop_state = []
+    for s in body_scope.modified:
+
+      # Variables not live into or out of the loop are considered local to the
+      # loop.
+      if s not in live_in and s not in live_out:
+        continue
+
+      # Mutations made to objects created inside the loop will appear as writes
+      # to composite symbols. Because these mutations appear as modifications
+      # made to composite symbols, we check whether the composite's parent is
+      # actually live into the loop.
+      # Example:
+      #   while cond:
+      #     x = Foo()
+      #     x.foo = 2 * x.foo  # x.foo is live into the loop, but x is not.
+      if s.is_composite() and not all(p in live_in for p in s.support_set):
+        continue
+
+      loop_state.append(s)
+    loop_state = frozenset(loop_state)
+
+    # Variable that are used or defined inside the loop, but not defined
+    # before entering the loop
     undefined_lives = loop_state - defined_in
+
     # Only simple variables must be defined. The composite ones will be
     # implicitly checked at runtime.
-    undefined_simple_lives = {v for v in undefined_lives if v.is_simple()}
-    if undefined_simple_lives:
-      raise NameError(
-          'cannot convert loop: it includes symbols that are undefined'
-          ' when entering the loop: {}'.format(
-              self._fmt_symbols(undefined_simple_lives)))
-
-    live_defs_in_loop = (body_scope.modified - live_in) & live_out
-    if live_defs_in_loop:
-      # TODO(mdan): Include reference to explanation why.
-      raise NotImplementedError(
-          'cannot convert loop: it includes symbols that are defined'
-          ' inside the loop, but used later: {}. To fix, initialize'
-          ' these symbols before the loop'.format(
-              self._fmt_symbols(live_defs_in_loop)))
-
-    return loop_state, reserved_symbols
+    possibly_undefs = {v for v in undefined_lives if v.is_simple()}
+
+    return loop_state, reserved_symbols, possibly_undefs
 
   def _state_constructs(self, loop_state, reserved_symbols):
-    loop_state = list(loop_state)
+    loop_state = tuple(loop_state)
     state_ssf = [
         self.ctx.namer.new_symbol(s.ssf(), reserved_symbols) for s in loop_state
     ]
@@ -297,19 +362,18 @@ class ControlFlowTransformer(converter.Base):
         if str(name) != ssf
     }
 
+    state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
+
     if len(loop_state) == 1:
       loop_state = loop_state[0]
       state_ssf = state_ssf[0]
-      state_ast_tuple = loop_state
-    else:
-      state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
 
     return loop_state, state_ssf, state_ast_tuple, ssf_map
 
   def visit_While(self, node):
     self.generic_visit(node)
 
-    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, reserved_symbols, possibly_undefs = self._get_loop_state(node)
 
     # Note: one might expect we can dispatch based on the loop condition.
     # But because that is dependent on the state, it cannot be evaluated ahead
@@ -327,8 +391,7 @@ class ControlFlowTransformer(converter.Base):
     cond_scope = anno.getanno(node, annos.NodeAnno.COND_SCOPE)
     cond_closure = set()
     for s in cond_scope.read:
-      cond_closure.update(s.support_set)
-    cond_closure -= loop_state
+      cond_closure |= s.support_set
 
     loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
         loop_state, reserved_symbols)
@@ -374,67 +437,108 @@ class ControlFlowTransformer(converter.Base):
           extra_deps=tuple(s.ast() for s in cond_closure),
       )
 
-    return node
+    undefined_assigns = self._create_undefined_assigns(possibly_undefs)
+    return undefined_assigns + node
+
+  def _create_for_loop_early_stopping(self, loop_state, state_ssf,
+                                      state_ast_tuple, original_node,
+                                      extra_test_name, extra_test,
+                                      body_name, loop_body):
+    """Create node for for-loop with early stopping (e.g. break or return)."""
+    template = """
+      def extra_test_name(state_ssf):
+        return extra_test_expr
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return state_ssf,
+      state_ast_tuple = ag__.for_stmt(
+          iter_, extra_test_name, body_name, (state,))
+    """
+    return templates.replace(
+        template,
+        state=loop_state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        extra_test_name=extra_test_name,
+        extra_test_expr=extra_test,
+        body_name=body_name,
+        body=loop_body)
+
+  def _create_for_loop_with_state(self, loop_state, state_ssf, state_ast_tuple,
+                                  original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
+    template = """
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return state_ssf,
+      state_ast_tuple = ag__.for_stmt(
+          iter_, None, body_name, (state,))
+    """
+    return templates.replace(
+        template,
+        state=loop_state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
+
+  def _create_for_loop_without_state(self, original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
+    template = """
+      def body_name(loop_vars):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return ()
+      ag__.for_stmt(iter_, None, body_name, ())
+    """
+    return templates.replace(
+        template,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
 
   def visit_For(self, node):
     self.generic_visit(node)
 
-    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, reserved_symbols, possibly_undefs = self._get_loop_state(node)
     loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
         loop_state, reserved_symbols)
     node_body = ast_util.rename_symbols(node.body, ssf_map)
-    if anno.hasanno(node, 'extra_test'):
-      extra_test = anno.getanno(node, 'extra_test')
-      extra_test = ast_util.rename_symbols(extra_test, ssf_map)
-    else:
-      extra_test = parser.parse_expression('True')
+    body_name = self.ctx.namer.new_symbol('loop_body', reserved_symbols)
 
+    has_extra_test = anno.hasanno(node, 'extra_test')
     if loop_state:
-      template = """
-        def extra_test_name(state_ssf):
-          return extra_test_expr
-        def body_name(loop_vars, state_ssf):
-          # Workaround for PEP-3113
-          iterate = loop_vars
-          body
-          return state_ssf,
-        state_ast_tuple = ag__.for_stmt(
-            iter_, extra_test_name, body_name, (state,))
-      """
-      node = templates.replace(
-          template,
-          state=loop_state,
-          state_ssf=state_ssf,
-          state_ast_tuple=state_ast_tuple,
-          iter_=node.iter,
-          iterate=node.target,
-          extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                    reserved_symbols),
-          extra_test_expr=extra_test,
-          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-          body=node_body)
+      if has_extra_test:
+        # Loop with early stopping (e.g. break or return)
+        extra_test = anno.getanno(node, 'extra_test')
+        extra_test = ast_util.rename_symbols(extra_test, ssf_map)
+        extra_test_name = self.ctx.namer.new_symbol('extra_test',
+                                                    reserved_symbols)
+        node = self._create_for_loop_early_stopping(
+            loop_state, state_ssf, state_ast_tuple, node, extra_test_name,
+            extra_test, body_name, node_body)
+      else:
+        # Loop with loop-carried state and no early stopping
+        node = self._create_for_loop_with_state(
+            loop_state, state_ssf, state_ast_tuple, node, body_name, node_body)
     else:
-      template = """
-        def extra_test_name():
-          return extra_test_expr
-        def body_name(loop_vars):
-          # Workaround for PEP-3113
-          iterate = loop_vars
-          body
-          return ()
-        ag__.for_stmt(iter_, extra_test_name, body_name, ())
-      """
-      node = templates.replace(
-          template,
-          iter_=node.iter,
-          iterate=node.target,
-          extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                    reserved_symbols),
-          extra_test_expr=extra_test,
-          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-          body=node_body)
+      # Loop with no loop-carried state and no early stopping
+      assert not has_extra_test, ('Early stoppiong (e.g. break and/or return) '
+                                  'should create state variables.')
+      node = self._create_for_loop_without_state(node, body_name, node_body)
 
-    return node
+    undefined_assigns = self._create_undefined_assigns(possibly_undefs)
+    return undefined_assigns + node
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 1a38d0db4d93d331a6ece51a0abc738abace5fa3..9ad229c5e7f1ce7942e9cd2e2d7efb049862db1e 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -18,9 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -29,13 +30,14 @@ from tensorflow.python.platform import test
 
 class ControlFlowTest(converter_testing.TestCase):
 
-  def assertTransformedResult(self, test_fn, inputs, expected):
+  def assertTransformedResult(self, test_fn, inputs, expected, symbols=None):
     if not isinstance(inputs, tuple):
       inputs = (inputs,)
-    with self.converted(test_fn, control_flow, {},
+    if not symbols:
+      symbols = {}
+    with self.converted(test_fn, control_flow, symbols,
                         constant_op.constant) as result:
-      with self.cached_session() as sess:
-        self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
+      self.assertEqual(self.evaluate(result.test_fn(*inputs)), expected)
 
   @test_util.run_deprecated_v1
   def test_while_basic(self):
@@ -79,16 +81,89 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), 0)
 
-  def test_while_variable_defined_in_body(self):
-    def bad_while_loop(n):
+  def test_while_local_composite(self):
+
+    class TestClass(object):
+
+      def __init__(self):
+        self.x = constant_op.constant(3)
+
+    def test_fn(n):
+      while n > 0:
+        tc = TestClass()
+        tc.x = tc.x
+        n -= 1
+      return n
+
+    self.assertTransformedResult(
+        test_fn, constant_op.constant(5), 0, symbols={'TestClass': TestClass})
+
+  # TODO(b/127642077): Add tests for x.y.z = 2*x.y.z and x.y[z] = 2*x.y[z].
+  def test_while_local_composite_complex_nestable(self):
+
+    # This class is ok to be in a tf.while_loop's state.
+    class TestClass(collections.namedtuple('TestClass', ('x'))):
+      pass
+
+    def test_fn(n):
+      tc = TestClass([constant_op.constant(0)])
+      while n > 0:
+        tc = TestClass([constant_op.constant(3)])
+        tc.x[0] = tc.x[0] + 1
+        n -= 1
+      return tc.x[0]
+
+    ns = {'TestClass': TestClass, 'constant_op': constant_op}
+    self.assertTransformedResult(
+        test_fn, constant_op.constant(5), 4, symbols=ns)
+
+  def test_while_local_composite_complex_illegal(self):
+
+    class TestClass(object):
+
+      def __init__(self):
+        self.x = [constant_op.constant(3)]
+
+    def test_fn(n):
+      while n > 0:
+        tc = TestClass()
+        tc.x[0] = tc.x[0] + 1
+        n -= 1
+      return tc.x[0]
+
+    with self.converted(
+        test_fn, control_flow, {'TestClass': TestClass}) as result:
+      # The tested function would require `tc` to become part of the while loop
+      # state, but TensorFlow doesn't support classes at the moment.
+      with self.assertRaisesRegexp(ValueError, 'must.*initialize.*Tensor.*tc'):
+        result.test_fn(constant_op.constant(5))
+
+  @test_util.run_deprecated_v1
+  def test_while_dispatches_by_cond_only(self):
+
+    class TensorIncompatibleNumeric(object):
+      """Works in arithmetic expression, but errors out with TF ops."""
+
+      def __init__(self, val):
+        self.val = val
+
+      def __add__(self, other):
+        return TensorIncompatibleNumeric(self.val + other)
+
+    def test_fn(n, s):
       while n > 0:
         n -= 1
-        s = n
+        s += n
       return s
 
-    node, ctx = self.prepare(bad_while_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
+    self.assertTransformedResult(test_fn, (constant_op.constant(5), 0), 10)
+    with self.converted(test_fn, control_flow, {}) as result:
+      # n alone controls the staging. When the loop is not staged, Python
+      # knows how to add the two objects. But when staged, tf.while_loop will
+      # not know how to deal with the TensorIncompatibleNumeric object.
+      self.assertEqual(result.test_fn(5, TensorIncompatibleNumeric(0)).val, 10)
+      with self.assertRaises(TypeError):
+        result.test_fn(constant_op.constant(5), TensorIncompatibleNumeric(0))
 
   @test_util.run_deprecated_v1
   def test_if_basic(self):
@@ -124,11 +199,10 @@ class ControlFlowTest(converter_testing.TestCase):
       return obj
 
     with self.converted(test_fn, control_flow, {}) as result:
-      with self.cached_session() as sess:
-        res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (-1, 0))
-        res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (0, -2))
+      res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (-1, 0))
+      res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (0, -2))
 
   @test_util.run_deprecated_v1
   def test_if_single_output(self):
@@ -176,16 +250,51 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), 1)
     self.assertTransformedResult(test_fn, constant_op.constant(-1), -1)
 
-  def test_if_imbalanced_outputs(self):
+  @test_util.run_deprecated_v1
+  def test_if_unbalanced_multiple_composites(self):
+
+    class Foo(object):
 
-    def test_fn(n):
-      if n > 0:
-        b = 4
-      return b
+      def __init__(self):
+        self.b = 2
+        self.c = 3
+
+    def test_fn(x, condition):
+
+      z = 5
+      if condition:
+        x.b = 7
+        x.c = 11
+        z = 13
+
+      return x.b, x.c, z
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(transformer.AutoGraphParseError):
-      control_flow.transform(node, ctx)
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(True)),
+                                 (7, 11, 13))
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(False)),
+                                 (2, 3, 5))
+
+  @test_util.run_deprecated_v1
+  def test_if_unbalanced_composite(self):
+
+    class Foo(object):
+
+      def __init__(self):
+        self.b = 2
+
+    def test_fn(x, condition):
+
+      z = 5
+      if condition:
+        x.b = 7
+        z = 13
+
+      return x.b, z
+
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(True)),
+                                 (7, 13))
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(False)),
+                                 (2, 5))
 
   @test_util.run_deprecated_v1
   def test_simple_for(self):
@@ -237,16 +346,6 @@ class ControlFlowTest(converter_testing.TestCase):
       self.assertEqual(result.test_fn(5), 10)
       self.assertEqual(eval_count[0], 1)
 
-  def test_for_variable_defined_in_body(self):
-    def bad_for_loop(n):
-      for i in range(n):
-        s = i
-      return s
-
-    node, ctx = self.prepare(bad_for_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
-
   @test_util.run_deprecated_v1
   def test_for_tuple_unpacking(self):
     def test_fn(x_list):
@@ -256,5 +355,7 @@ class ControlFlowTest(converter_testing.TestCase):
       return z
 
     self.assertTransformedResult(test_fn, [3, 3], 7)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/decorators.py b/tensorflow/python/autograph/converters/decorators.py
deleted file mode 100644
index f0ea51277468499937089c89eedb344149cb1ae7..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Handles decorators.
-
-Note: this module only deals with functions whose decorators are still recorded
-in the AST. This does not always happen. See the unit test for an example.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.util import tf_inspect
-
-
-class DecoratorsTransformer(converter.Base):
-  """Converts or removes decorators."""
-
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    kept_decorators = []
-    for dec in node.decorator_list:
-      if isinstance(dec, gast.Call):
-        dec_func = dec.func
-      else:
-        dec_func = dec
-
-      # Special cases.
-      # TODO(mdan): Is there any way we can treat these more generically?
-      # We may want to forego using decorators altogether if we can't
-      # properly support them.
-      if isinstance(dec_func, gast.Name) and dec_func.id in ('classmethod',):
-        # Assumption: decorators are only visible in the AST when converting
-        # a function inline (via another decorator).
-        # In that case, the converted function is no longer part of the
-        # original object that it was declared into.
-        # This is currently verified by tests.
-        continue
-
-      if not anno.hasanno(dec_func, 'live_val'):
-        raise ValueError('could not resolve the decorator "@%s"' %
-                         (anno.getanno(dec_func, anno.Basic.QN)))
-
-      original_dec = anno.getanno(dec_func, anno.Basic.QN)
-      dec_value = anno.getanno(dec_func, 'live_val')
-
-      if dec_value in self.ctx.program.options.strip_decorators:
-        continue
-
-      # When using foo.bar.baz, we only really need to grab foo and import
-      # that.
-      dec_support_node = dec_func
-      while isinstance(dec_support_node, gast.Attribute):
-        dec_support_node = dec_support_node.value
-
-      if not anno.hasanno(dec_support_node, 'live_val'):
-        raise ValueError(
-            'could not resolve symbol "%s" when looking up decorator "%s"' %
-            (anno.getanno(dec_support_node, anno.Basic.QN), original_dec))
-
-      dec_support = anno.getanno(dec_support_node, 'live_val')
-      # The tuple contains:
-      #  * the AST that represents the decorator
-      #  * the entity supporting the decorator (i.e., what we need to import)
-      #  * the name of the module that needs to be imported for this decorator
-      #    to properly resolve.
-      # Examples:
-      #  for foo.bar, the tuple is (<ast>, <module foo>, 'foo')
-      #  for baz, the tuple is (<ast>, <module baz.__module__>, 'baz')
-      kept_decorators.append((dec, dec_support,
-                              anno.getanno(dec_support_node, anno.Basic.QN)))
-
-    for _, dec_support, name in kept_decorators:
-      if tf_inspect.ismodule(dec_support):
-        self.ctx.program.additional_imports.add(
-            'import %s as %s' % (dec_support.__name__, name))
-      else:
-        if dec_support.__module__ == '__main__':
-          raise ValueError(
-              'decorator "%s" was not allowed because it is declared '
-              'in the module "%s". To fix this, declare it in a separate '
-              'module that we can import it from.' % (dec_support,
-                                                      dec_support.__module__))
-        self.ctx.program.additional_imports.add(
-            'from %s import %s' % (dec_support.__module__, name))
-
-    node.decorator_list = [dec for dec, _, _ in kept_decorators]
-    return node
-
-
-def transform(node, ctx):
-  return DecoratorsTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/decorators_test.py b/tensorflow/python/autograph/converters/decorators_test.py
deleted file mode 100644
index bcf502c62b33883450062484ec030f840b0e6b25..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators_test.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for decorators module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-import imp
-
-from tensorflow.python import autograph
-from tensorflow.python.autograph.converters import decorators
-from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import test
-
-
-# The Python parser only briefly captures decorators into the AST.
-# The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is normally what you would expect, since
-# they are meant to be transparent).
-# However, decorators are still visible when you analyze the function
-# from inside a decorator, before it was applied - as is the case
-# with our conversion decorators.
-
-
-def simple_decorator(f):
-  return lambda a: f(a) + 1
-
-
-def self_transform_decorator(transform):
-
-  def decorator(f):
-    @wraps(f)
-    def wrapper(*args):
-      # This removing wrapper is defined in the test below. This setup is so
-      # intricate in order to simulate how we use the transformer in practice.
-      transformed_f = transform(f, (self_transform_decorator,))
-      return transformed_f(*args) + 1
-    return wrapper
-  return decorator
-
-
-class DecoratorsTest(converter_testing.TestCase):
-
-  def _transform(self, f, strip_decorators):
-    namespace = {
-        'self_transform_decorator': self_transform_decorator,
-        'simple_decorator': simple_decorator,
-        'converter_testing': converter_testing,
-    }
-    node, ctx = self.prepare(
-        f, namespace, recursive=False, strip_decorators=strip_decorators)
-    node = decorators.transform(node, ctx)
-    import_line = '\n'.join(ctx.program.additional_imports)
-    result, _ = compiler.ast_to_object(node, source_prefix=import_line)
-    return getattr(result, f.__name__)
-
-  def test_noop(self):
-
-    def test_fn(a):
-      return a
-
-    with self.converted(test_fn, decorators, {}) as result:
-      self.assertEqual(1, result.test_fn(1))
-
-  def test_function(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, test_fn(1))
-
-  def test_method(self):
-
-    class TestClass(object):
-
-      @self_transform_decorator(self._transform)
-      def test_fn(self, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass().test_fn(1))
-
-  def test_multiple_decorators(self):
-
-    class TestClass(object):
-
-      # Note that reversing the order of this two doesn't work.
-      @classmethod
-      @self_transform_decorator(self._transform)
-      def test_fn(cls, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass.test_fn(1))
-
-  def test_nested_decorators_local(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      @simple_decorator
-      def inner_fn(b):
-        return b + 11
-      return inner_fn(a)
-
-    # Expected to fail because simple_decorator could not be imported.
-    with self.assertRaises(transformer.AutoGraphParseError):
-      test_fn(1)
-
-  def test_nested_decorators_imported(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-
-      @converter_testing.imported_decorator
-      def inner_fn(b):
-        return b + 11
-
-      return inner_fn(a)
-
-    # Work around TensorFlow's symbol suppression mechanism that causes core to
-    # be invisible in the generated code.
-    core_mod = imp.new_module('core')
-    core_mod.converter_testing = converter_testing
-    autograph.core = core_mod
-
-    # 14 = 1 (a) + 1 (simple_decorator) + 11 (inner_fn)
-    self.assertEqual(14, test_fn(1))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/converters/directives_test.py b/tensorflow/python/autograph/converters/directives_test.py
index 570fb8e379b522472afd7e96b85e05fea1d26d53..870a491ccdf8db0119da5b29ecda7b9e96e70ff4 100644
--- a/tensorflow/python/autograph/converters/directives_test.py
+++ b/tensorflow/python/autograph/converters/directives_test.py
@@ -84,9 +84,9 @@ class DirectivesTest(converter_testing.TestCase):
     def call_invalid_directive():
       invalid_directive(1)
 
-    node, _ = parser.parse_entity(call_invalid_directive)
+    node, _, _ = parser.parse_entity(call_invalid_directive)
     # Find the call to the invalid directive
-    node = node.body[0].body[0].value
+    node = node.body[0].value
     with self.assertRaisesRegexp(ValueError, 'Unexpected keyword.*'):
       directives_converter._map_args(node, invalid_directive)
 
diff --git a/tensorflow/python/autograph/converters/function_scopes_test.py b/tensorflow/python/autograph/converters/function_scopes_test.py
index 5a1248c8015c36882136421bfe4efc7d3dd58831..0eccf39db7dc4bf3cd50293f82921cd59fcadd66 100644
--- a/tensorflow/python/autograph/converters/function_scopes_test.py
+++ b/tensorflow/python/autograph/converters/function_scopes_test.py
@@ -92,7 +92,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
         return l, inner_fn(l)
 
     ns = {'TestClass': TestClass}
-    node, ctx = self.prepare(TestClass, ns, owner_type=TestClass)
+    node, ctx = self.prepare(TestClass, ns)
     node = function_scopes.transform(node, ctx)
 
     with self.compiled(node, {}, ops.name_scope) as result:
diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py
index 496c99e3b5247c174f8a74e9b3f23517ddc649f3..3173e676e5dc383f399ca89cdc7814406afb28eb 100644
--- a/tensorflow/python/autograph/converters/return_statements.py
+++ b/tensorflow/python/autograph/converters/return_statements.py
@@ -22,310 +22,391 @@ import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# TODO(mdan): Move this logic into transformer_base.
-class BodyVisitor(converter.Base):
-  """Walks breadth- or depth-first the list-of-nodes bodies of AST nodes."""
+BODY_DEFINITELY_RETURNS = 'BODY_DEFINITELY_RETURNS'
+ORELSE_DEFINITELY_RETURNS = 'ORELSE_DEFINITELY_RETURNS'
+STMT_DEFINITELY_RETURNS = 'STMT_DEFINITELY_RETURNS'
 
-  def __init__(self, ctx, depth_first=False):
-    super(BodyVisitor, self).__init__(ctx)
-    self.depth_first = depth_first
-    self.changes_made = False
 
-  def visit_nodelist(self, nodelist):
-    for node in nodelist:
-      if isinstance(node, list):
-        node = self.visit_nodelist(node)
+class _Block(object):
+
+  def __init__(self):
+    self.definitely_returns = False
+
+
+class ConditionalReturnRewriter(converter.Base):
+  """Rewrites a a pattern where it's unbovious that all paths return a value.
+
+  This rewrite allows avoiding intermediate None return values.
+
+  The following pattern:
+
+      if cond:
+        <block 1>
+        return
       else:
-        node = self.generic_visit(node)
-    return nodelist
+        <block 2>
+      <block 3>
 
-  def visit_If(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
-    return node
+  is converted to:
 
-  def visit_For(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+      if cond:
+        <block 1>
+        return
+      else:
+        <block 2>
+        <block 3>
+
+  and vice-versa (if the else returns, subsequent statements are moved under the
+  if branch).
+  """
+
+  def visit_Return(self, node):
+    self.state[_Block].definitely_returns = True
     return node
 
+  def _postprocess_statement(self, node):
+    # If the node definitely returns (e.g. it's a with statement with a
+    # return stateent in it), then the current block also definitely returns.
+    if anno.getanno(node, STMT_DEFINITELY_RETURNS, default=False):
+      self.state[_Block].definitely_returns = True
+
+    # The special case: collapse a typical conditional return pattern into
+    # a single conditional with possibly returns on both branches. This
+    # reduces the use of None return values, which don't work with TF
+    # conditionals.
+    if (isinstance(node, gast.If)
+        and anno.getanno(node, BODY_DEFINITELY_RETURNS, default=False)):
+      return node, node.orelse
+    elif (isinstance(node, gast.If)
+          and anno.getanno(node, ORELSE_DEFINITELY_RETURNS, default=False)):
+      return node, node.body
+
+    return node, None
+
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Block].enter()
+    new_nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    block_definitely_returns = self.state[_Block].definitely_returns
+    self.state[_Block].exit()
+    return new_nodes, block_definitely_returns
+
   def visit_While(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+    node.test = self.visit(node.test)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
-  def visit_Try(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    node.finalbody = self.visit_nodelist(node.finalbody)
-    for i in range(len(node.handlers)):
-      node.handlers[i].body = self.visit_nodelist(node.handlers[i].body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
   def visit_With(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+    node.items = self.visit_block(node.items)
+    node.body, definitely_returns = self._visit_statement_block(node, node.body)
+    if definitely_returns:
+      anno.setanno(node, STMT_DEFINITELY_RETURNS, True)
     return node
 
-  def visit_FunctionDef(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    self.generic_visit(node)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+  def visit_Try(self, node):
+    # We could decide whether a 'try' DEFINITELY_RETURNS based on its components
+    # It is not clear whether we want to do anything with this given
+    # a 'try' is likely to throw an exception in some circumstances.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
+    node.finalbody, _ = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
     return node
 
-
-class FoldElse(BodyVisitor):
-
-  def visit_nodelist(self, nodelist):
-    for i in range(len(nodelist)):
-      node = nodelist[i]
-      if isinstance(node, gast.If):
-        true_branch_returns = isinstance(node.body[-1], gast.Return)
-        false_branch_returns = len(node.orelse) and isinstance(
-            node.orelse[-1], gast.Return)
-        # If the last node in the if body is a return,
-        # then every line after this if statement effectively
-        # belongs in the else.
-        if true_branch_returns and not false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].orelse.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif not true_branch_returns and false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].body.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif true_branch_returns and false_branch_returns:
-          if nodelist[i + 1:]:
-            raise ValueError(
-                'Unreachable code after conditional where both branches return.'
-            )
-          return nodelist
-      elif isinstance(node, gast.Return) and nodelist[i + 1:]:
-        raise ValueError(
-            'Cannot have statements after a return in the same basic block')
-    return nodelist
-
-
-def contains_return(node):
-  for n in gast.walk(node):
-    if isinstance(n, gast.Return):
-      return True
-  return False
-
-
-class LiftReturn(converter.Base):
-  """Move return statements out of If and With blocks."""
-
-  def __init__(self, ctx):
-    super(LiftReturn, self).__init__(ctx)
-    self.changes_made = False
-    self.common_return_name = None
+  def visit_ExceptHandler(self, node):
+    # To determine whether `try` DEFINITELY_RETURNS we need to revisit this.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    return node
 
   def visit_If(self, node):
-    # Depth-first traversal of if statements
-    node = self.generic_visit(node)
-
-    # We check if both branches return, and if so, lift the return out of the
-    # conditional. We don't enforce that the true and false branches either
-    # both return or both do not, because FoldElse might move a return
-    # into a branch after this transform completes. FoldElse and LiftReturn
-    # are alternately run until the code reaches a fixed point.
-    true_branch_returns = isinstance(node.body[-1], gast.Return)
-    false_branch_returns = len(node.orelse) and isinstance(
-        node.orelse[-1], gast.Return)
-    if true_branch_returns and false_branch_returns:
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      node.orelse[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.orelse[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
+    node.test = self.visit(node.test)
 
-  def visit_With(self, node):
-    # Depth-first traversal of syntax
-    node = self.generic_visit(node)
-
-    # If the with statement returns, lift the return
-    if isinstance(node.body[-1], gast.Return):
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      node = self.generic_visit(node)
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
+    node.body, body_definitely_returns = self._visit_statement_block(
+        node, node.body)
+    if body_definitely_returns:
+      anno.setanno(node, BODY_DEFINITELY_RETURNS, True)
+
+    node.orelse, orelse_definitely_returns = self._visit_statement_block(
+        node, node.orelse)
+    if orelse_definitely_returns:
+      anno.setanno(node, ORELSE_DEFINITELY_RETURNS, True)
+
+    if body_definitely_returns and orelse_definitely_returns:
+      self.state[_Block].definitely_returns = True
+
+    return node
 
   def visit_FunctionDef(self, node):
-    # Ensure we're doing depth-first traversal
-    last_return_name = self.common_return_name
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    referenced_names = body_scope.referenced
-    self.common_return_name = self.ctx.namer.new_symbol('return_',
-                                                        referenced_names)
-    node = self.generic_visit(node)
-    self.common_return_name = last_return_name
+    node.args = self.visit(node.args)
+    node.body, _ = self._visit_statement_block(node, node.body)
     return node
 
 
-class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
-  """Throws an error if code returns inside loops or try/except."""
+class _Return(object):
+
+  def __init__(self):
+    self.used = False
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: {}'.format(
+        self.used)
 
-  # First, throw an error if we detect a return statement in a loop.
-  # TODO(alexbw): we need to learn to handle returns inside a loop,
-  # but don't currently have the TF constructs to do so (need something
-  # that looks vaguely like a goto).
+
+class _Function(object):
 
   def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInUnsupportedControlFlow, self).__init__()
+    self.do_return_var_name = None
+    self.retval_var_name = None
 
-  def visit_While(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+  def __repr__(self):
+    return 'return control: {}, return value: {}'.format(
+        self.do_return_var_name, self.retval_var_name)
 
-  def visit_For(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
 
-  def visit_Try(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+class ReturnStatementsTransformer(converter.Base):
+  """Lowers return statements into variables and conditionals.
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+  Specifically, the following pattern:
 
-  def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          '`return` statements are not supported in loops. '
-          'Try assigning to a variable in the while loop, and returning '
-          'outside of the loop')
+      <block 1>
+      return val
+      <block 2>
 
+  is converted to:
 
-class DetectReturnInConditional(gast.NodeVisitor):
-  """Assert that no return statements are present in conditionals."""
+      do_return = False
+      retval = None
 
-  def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInConditional, self).__init__()
+      <block 1>
 
-  def visit_If(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+      do_return = True
+      retval = val
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+      if not do_return:
+        <block 2>
 
-  def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          'After transforms, a conditional contained a `return `statement, '
-          'which is not allowed. This is a bug, and should not happen.')
+      return retval
 
+  The conversion adjusts loops as well:
 
-class DetectReturnInFunctionDef(gast.NodeVisitor):
+      <block 1>
+      while cond:
+        <block 2>
+        return retval
 
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    if not contains_return(node):
-      raise ValueError(
-          'Each function definition should contain at least one return.')
-
-
-def transform(node, ctx):
-  """Ensure a function has only a single return.
-
-  This transforms an AST node with multiple returns successively into containing
-  only a single return node.
-  There are a few restrictions on what we can handle:
-   - An AST being transformed must contain at least one return.
-   - No returns allowed in loops. We have to know the type of the return value,
-   and we currently don't have either a type inference system to discover it,
-   nor do we have a mechanism for late type binding in TensorFlow.
-   - After all transformations are finished, a Return node is not allowed inside
-   control flow. If we were unable to move a return outside of control flow,
-   this is an error.
-
-  Args:
-     node: ast.AST
-     ctx: converter.EntityContext
-
-  Returns:
-     new_node: an AST with a single return value
-
-  Raises:
-    ValueError: if the AST is structured so that we can't perform the
-   transform.
+  is converted to:
+
+      <block 1>
+      while not do_return and cond:
+        <block 2>
+        do_return = True
+        retval = val
   """
-  # Make sure that the function has at least one return statement
-  # TODO(alexbw): turning off this assertion for now --
-  # we need to not require this in e.g. class constructors.
-  # DetectReturnInFunctionDef().visit(node)
 
-  # Make sure there's no returns in unsupported locations (loops, try/except)
-  DetectReturnInUnsupportedControlFlow().visit(node)
+  def __init__(self, ctx, default_to_null_return):
+    super(ReturnStatementsTransformer, self).__init__(ctx)
+    self.default_to_null_return = default_to_null_return
+
+  def visit_Return(self, node):
+    self.state[_Return].used = True
+
+    retval = node.value if node.value else parser.parse_expression('None')
+
+    template = """
+      do_return_var_name = True
+      retval_var_name = retval
+    """
+    node = templates.replace(
+        template,
+        do_return_var_name=self.state[_Function].do_return_var_name,
+        retval_var_name=self.state[_Function].retval_var_name,
+        retval=retval)
+
+    return node
+
+  def _postprocess_statement(self, node):
+    # Example of how the state machine below works:
+    #
+    #   1| stmt           # State: _Return.used = False
+    #    |                # Action: none
+    #   3| return         # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = False
+    #    |                # Action: _Return.create_guard = True
+    #   4| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = True
+    #    |                # Action: create `if not return_used`,
+    #    |                #         set _Return.guard_created = True
+    #   5| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = True
+    #    |                # Action: none (will be wrapped under previously
+    #    |                #         created if node)
+    if self.state[_Return].used:
+      if self.state[_Return].guard_created:
+        return node, None
+
+      elif not self.state[_Return].create_guard:
+        self.state[_Return].create_guard = True
+        return node, None
+
+      elif (not self.state[_Return].guard_created and
+            self.state[_Return].create_guard):
+        self.state[_Return].guard_created = True
+        template = """
+          if ag__.not_(do_return_var_name):
+            original_node
+        """
+        cond, = templates.replace(
+            template,
+            do_return_var_name=self.state[_Function].do_return_var_name,
+            original_node=node)
+        return cond, cond.body
+
+      else:
+        assert False, 'should handle all states'
+
+    return node, None
+
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Return].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    return_used = self.state[_Return].used
+    self.state[_Return].exit()
+    if return_used:
+      self.state[_Return].used = True
+    return nodes
+
+  def visit_While(self, node):
+    node.test = self.visit(node.test)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      node.test = templates.replace_as_expression(
+          'ag__.and_(lambda: ag__.not_(control_var), lambda: test)',
+          test=node.test,
+          control_var=self.state[_Function].do_return_var_name)
+
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      extra_test = anno.getanno(node, 'extra_test', default=None)
+      if extra_test is not None:
+        extra_test = templates.replace_as_expression(
+            'ag__.and_(lambda: ag__.not_(control_var), lambda: extra_test)',
+            extra_test=extra_test,
+            control_var=self.state[_Function].do_return_var_name)
+      else:
+        extra_test = templates.replace_as_expression(
+            'ag__.not_(control_var)',
+            control_var=self.state[_Function].do_return_var_name)
+      anno.setanno(node, 'extra_test', extra_test)
+
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_statement_block(node, node.body)
+    return node
+
+  def visit_Try(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    node.finalbody = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    return node
+
+  def visit_If(self, node):
+    node.test = self.visit(node.test)
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    self.state[_Return].enter()
+
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    do_return_var_name = self.ctx.namer.new_symbol(
+        'do_return', scope.referenced)
+    retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced)
+    self.state[_Function].do_return_var_name = do_return_var_name
+    self.state[_Function].retval_var_name = retval_var_name
+
+    converted_body = self._visit_statement_block(node, node.body)
+
+    # Avoid placing statements before any eventual docstring.
+    # TODO(mdan): Should a docstring even be included in the output?
+    docstring = None
+    if converted_body:
+      if (isinstance(converted_body[0], gast.Expr) and
+          isinstance(converted_body[0].value, gast.Str)):
+        docstring = converted_body[0]
+        converted_body = converted_body[1:]
+
+    if self.state[_Return].used:
+      if self.default_to_null_return:
+        template = """
+          do_return_var_name = False
+          retval_var_name = None
+          body
+          return retval_var_name
+        """
+      else:
+        template = """
+          body
+          return retval_var_name
+        """
+      node.body = templates.replace(
+          template,
+          body=converted_body,
+          do_return_var_name=do_return_var_name,
+          retval_var_name=retval_var_name)
+
+      if docstring:
+        node.body.insert(0, docstring)
+
+    self.state[_Return].exit()
+    self.state[_Function].exit()
+    return node
 
-  while True:
 
-    # Try to lift all returns out of if statements and with blocks
-    lr = LiftReturn(ctx)
-    node = lr.visit(node)
-    changes_made = lr.changes_made
-    fe = FoldElse(ctx)
-    node = fe.visit(node)
-    changes_made = changes_made or fe.changes_made
+def transform(node, ctx, default_to_null_return=True):
+  """Ensure a function has only a single return."""
+  # Note: Technically, these two could be merged into a single walk, but
+  # keeping them separate helps with readability.
 
-    if not changes_made:
-      break
+  node = ConditionalReturnRewriter(ctx).visit(node)
 
-  # Make sure we've scrubbed all returns from conditionals
-  DetectReturnInConditional().visit(node)
+  transformer = ReturnStatementsTransformer(
+      ctx, default_to_null_return=default_to_null_return)
+  node = transformer.visit(node)
 
   return node
diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py
index 762fbc6f607f56ed6d80dd82f59f8c7653c7312a..b2d3d1b92055216d45071fef1fe9f36553a7fb42 100644
--- a/tensorflow/python/autograph/converters/return_statements_test.py
+++ b/tensorflow/python/autograph/converters/return_statements_test.py
@@ -49,17 +49,16 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse(self):
+  def test_missing_else(self):
 
     def test_fn(x):
       if x > 0:
         return x
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse_recovrable(self):
+  def test_missing_else_then_default(self):
 
     def test_fn(x):
       if x > 0:
@@ -69,7 +68,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_branch_return_recoverable(self):
+  def test_else_only_then_default(self):
 
     def test_fn(x):
       if x < 0:
@@ -136,7 +135,7 @@ class SingleReturnTest(converter_testing.TestCase):
 
     self.assertTransformedEquivalent(test_fn, 2)
 
-  def test_nested_functions(self):
+  def test_nested_function(self):
 
     def test_fn(x):
 
@@ -151,7 +150,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_nested_functions_in_control_flow(self):
+  def test_nested_function_in_control_flow(self):
 
     def test_fn(x):
 
@@ -163,16 +162,59 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_loop(self):
+  def test_for_loop(self):
 
-    def test_fn(x):
-      for _ in range(10):
-        return x
-      return x
+    def test_fn(n):
+      for _ in range(n):
+        return 1
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 0)
+
+  def test_while_loop(self):
+
+    def test_fn(n):
+      i = 0
+      s = 0
+      while i < n:
+        i += 1
+        s += i
+        if s > 4:
+          return s
+      return -1
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_null_return(self):
+
+    def test_fn(n):
+      if n > 4:
+        return
+      return
+
+    self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 5)
+
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            return v
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index d7c0951fcc68318ff82e4873deef8707e7018f73..7e556d95139366cb9747544fbaafe4a4039d82cd 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -125,6 +125,10 @@ class SideEffectGuardTransformer(converter.Base):
     node.orelse = self._visit_and_reindent(node.orelse)
     return node
 
+  # TODO(b/123995141) Remove once ExceptionHandlers are in the CFG
+  def visit_ExceptHandler(self, node):
+    return node
+
   def visit_Expr(self, node):
     self.generic_visit(node)
     if isinstance(node.value, gast.Call):
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index d674f266c8cb7b663ed1b2f63e76466ecb4b4521..11e3736d4fb9e8d06d5f02c991ea66410b35b374 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -23,7 +23,6 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.lang import directives
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import list_ops
@@ -68,7 +67,7 @@ class SliceTest(converter_testing.TestCase):
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.float32')
     }
-    with self.assertRaises(transformer.AutoGraphParseError):
+    with self.assertRaises(ValueError):
       slices.transform(node, ctx)
 
 
diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index 3ab2e7b1bcacf7efe136b01a10de2bb7728e2d90..fae327e50db57474f2f72fddbc57f04f90ca4f1e 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -22,6 +22,7 @@ py_library(
         "errors.py",
         "function_wrapping.py",
         "naming.py",
+        "unsupported_features_checker.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -30,6 +31,7 @@ py_library(
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
+        "@gast_archive//:gast",
     ],
 )
 
diff --git a/tensorflow/python/autograph/core/config.py b/tensorflow/python/autograph/core/config.py
index 574f819504e526420dd1956359dc974869d735f3..f038704a0741ef31d8701b41566d236f7caff0d8 100644
--- a/tensorflow/python/autograph/core/config.py
+++ b/tensorflow/python/autograph/core/config.py
@@ -28,21 +28,34 @@ PYTHON_LITERALS = {
     'float': float,
 }
 
+
+def _internal_name(name):
+  """This function correctly resolves internal and external names."""
+  reference_name = utils.__name__
+
+  reference_root = 'tensorflow.'
+  # If the TF module is foo.tensorflow, then all other modules
+  # are then assumed to be prefixed by 'foo'.
+
+  if reference_name.startswith(reference_root):
+    return name
+
+  reference_begin = reference_name.find('.' + reference_root)
+  assert reference_begin > 0
+
+  root_prefix = reference_name[:reference_begin]
+  return root_prefix + '.' + name
+
+
 DEFAULT_UNCOMPILED_MODULES = set((
     ('tensorflow',),
-    (utils.__name__,),
-
-    # All of tensorflow's subpackages. Unlike the root tf module, they don't
-    # have well-known names. Not referring to the module directly to avoid
-    # circular imports.
-    (
-        utils.__name__[:-len('.python.autograph.utils')],),
+    (_internal_name('tensorflow'),),
+    # TODO(mdan): Remove once the conversion process is optimized.
+    ('tensorflow_probability',),
+    (_internal_name('tensorflow_probability'),),
 ))
 
-NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names.
-# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
 )
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index e6d626f215927941dffae9da45ce6b4d24b6402f..5b7880a29995bec355a13d858eb54c700831b40d 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -63,17 +63,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import weakref
-
 import enum
 
 from tensorflow.python.autograph.core import config
-from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
@@ -83,7 +79,6 @@ from tensorflow.python.autograph.pyct.static_analysis import live_values
 from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
-from tensorflow.python.eager import function
 from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): These contexts can be refactored into first class objects.
@@ -93,19 +88,6 @@ from tensorflow.python.util.tf_export import tf_export
 # TODO(mdan): Add a test specific to this converter.
 
 
-@tf_export('autograph.experimental.Verbosity')
-class Verbosity(enum.IntEnum):
-  """Represents conversion verbosity levels.
-
-  Attributes:
-    BRIEF: No logging, minimal error messages.
-    VERBOSE: Detailed logging of generated code, detailed error messages.
-  """
-
-  BRIEF = 0
-  VERBOSE = 1
-
-
 @tf_export('autograph.experimental.Feature')
 class Feature(enum.Enum):
   """Represents conversion options that can be toggled on or off.
@@ -113,11 +95,14 @@ class Feature(enum.Enum):
   Attributes:
     ALL: Enable all features.
     AUTO_CONTROL_DEPS: Insert of control dependencies in the generated code.
-    DECORATORS: Allow decorators in local functions. Note that special
-      decorators, like `tf.function`, are allowed regardless of this toggle.
+    ASSERT_STATEMENTS: Convert Tensor-dependent assert statements to tf.Assert.
+    BUILTIN_FUNCTIONS: Convert builtin functions applied to Tensors to
+      their TF counterparts.
     ERROR_REWRITING: Rewrite errors that occur in the generated code to
       indicate the source code to which the failing code corresponds.
     LISTS: Convert list idioms, like initializers, slices, append, etc.
+    LOGICAL_EXPRESSIONS: Convert data-dependent logical expressions applied to
+      Tensors to their TF counterparts.
     NAME_SCOPES: Insert name scopes that name ops according to context, like the
       function they were defined in.
   """
@@ -125,11 +110,25 @@ class Feature(enum.Enum):
   ALL = 'ALL'
 
   AUTO_CONTROL_DEPS = 'AUTO_CONTROL_DEPS'
-  DECORATORS = 'DECORATORS'
+  ASSERT_STATEMENTS = 'ASSERT_STATEMENTS'
+  BUILTIN_FUNCTIONS = 'BUILTIN_FUNCTIONS'
   ERROR_REWRITING = 'ERROR_REWRITING'
   LISTS = 'LISTS'
+  LOGICAL_EXPRESSIONS = 'LOGICAL_EXPRESSIONS'
   NAME_SCOPES = 'NAME_SCOPES'
 
+  @classmethod
+  def all(cls):
+    """Returns a tuple that enables all options."""
+    return tuple(cls.__members__.values())
+
+  @classmethod
+  def all_but(cls, exclude):
+    """Returns a tuple that enables all but the excluded options."""
+    if not isinstance(exclude, (list, tuple, set)):
+      exclude = (exclude,)
+    return tuple(set(cls.all()) - set(exclude) - {cls.ALL})
+
 
 class ConversionOptions(object):
   """Immutable container for global conversion flags.
@@ -137,11 +136,6 @@ class ConversionOptions(object):
   Attributes:
     recursive: bool, whether to recursively convert any user functions or
       classes that the converted function may use.
-    verbose: Verbosity, the level of verbosity to use.
-    strip_decorators: Tuple[Callable], contains decorators that should be in
-      excluded from the compiled output. By default, when converting a function
-      before the decorators are applied, the compiled output will include those
-      decorators.
     force_conversion: bool, whether to force convertinng the target entity. When
       force_conversion is turned off, the converter may decide to return the
       function as-is.
@@ -152,14 +146,10 @@ class ConversionOptions(object):
 
   def __init__(self,
                recursive=False,
-               verbose=Verbosity.VERBOSE,
-               strip_decorators=None,
                force_conversion=False,
                internal_convert_user_code=True,
                optional_features=Feature.ALL):
     self.recursive = recursive
-    self.verbose = verbose
-    self._strip_decorators = strip_decorators or ()
     self.force_conversion = force_conversion
     # TODO(mdan): Rename to conversion_recursion_depth?
     self.internal_convert_user_code = internal_convert_user_code
@@ -171,34 +161,17 @@ class ConversionOptions(object):
     optional_features = frozenset(optional_features)
     self.optional_features = optional_features
 
-  @property
-  def strip_decorators(self):
-    # A few decorators are included by default.
-    # TODO(mdan): Revert if function.defun becomes a public symbol.
-    return self._strip_decorators + (function.defun,)
-
-  def should_strip(self, decorator):
-    for blacklisted in self.strip_decorators:
-      if blacklisted is decorator:
-        return True
-      if isinstance(blacklisted, weakref.ref):
-        blacklisted_deref = blacklisted()
-        if (blacklisted_deref is not None and blacklisted_deref is decorator):
-          return True
-    return False
-
   def uses(self, feature):
     return (Feature.ALL in self.optional_features or
             feature in self.optional_features)
 
-  def to_ast(self, ctx, internal_convert_user_code=None):
+  def to_ast(self, internal_convert_user_code=None):
     """Returns a representation of this object as an AST node.
 
     The AST node encodes a constructor that would create an object with the
     same contents.
 
     Args:
-      ctx: EntityContext, the entity with which this AST needs to be consistent.
       internal_convert_user_code: Optional[bool], allows ovrriding the
         corresponding value.
 
@@ -206,48 +179,23 @@ class ConversionOptions(object):
       ast.Node
     """
     template = """
-      constructor_name(
+      ag__.ConversionOptions(
           recursive=recursive_val,
-          verbose=verbose_val,
-          strip_decorators=strip_decorators_val,
           force_conversion=force_conversion_val,
           optional_features=optional_features_val,
           internal_convert_user_code=internal_convert_user_code_val)
     """
 
-    def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
-      if not name:
-        if isinstance(o, weakref.ref):
-          # `o` might already be a weak reference, if this object was
-          # constructed from code generated by `to_ast` itself.
-          # If so, unpack it.
-          o = o()
-        # TODO(mdan): This needs to account for the symbols defined locally.
-        name = ctx.namer.new_symbol(o.__name__, ())
-        ctx.program.add_symbol(name, weakref.ref(o))
-      return name
-
-    def list_of_names(values):
-      return parser.parse_expression('({})'.format(', '.join(
-          tuple(as_qualified_name(v) for v in values))))
-
     def list_of_features(values):
       return parser.parse_expression('({})'.format(', '.join(
-          'ag__.Feature.{}'.format(v)
-          for v in Feature.__members__
-          if v in values)))
+          'ag__.{}'.format(str(v)) for v in values)))
 
-    if internal_convert_user_code is not None:
+    if internal_convert_user_code is None:
       internal_convert_user_code = self.internal_convert_user_code
 
     expr_ast = templates.replace(
         template,
-        constructor_name=parser.parse_expression(
-            as_qualified_name(ConversionOptions)),
         recursive_val=parser.parse_expression(str(self.recursive)),
-        verbose_val=parser.parse_expression(str(int(self.verbose))),
-        strip_decorators_val=list_of_names(self._strip_decorators),
         force_conversion_val=parser.parse_expression(
             str(self.force_conversion)),
         internal_convert_user_code_val=parser.parse_expression(
@@ -263,18 +211,8 @@ class ProgramContext(object):
 
   Attributes:
     options: ConversionOptions
-    dependency_cache: Dict[Any, ast.AST], the original entities mapped to their
-      converted AST
-    additional_imports: Set[Any], additional entities which for any reason
-      cannot be attached after loading and need to be explicitly imported in the
-      generated code
-    name_map: Dict[str, str], map of original entity name to the name of their
-      converted counterparts
     autograph_module: Module, a reference to the autograph module. This needs to
       be specified by the caller to avoid circular dependencies.
-    uncompiled_modules: Set[Tuple[str, ...]], with each tuple representing the
-      fully qualified name of a package containing functions that will not be
-      compiled.
     required_imports: str, containing an import statement on each line. These
       are all the imports necessary for the compiled code to run, in addition to
       the closures of each entity, which are attached dynamically.
@@ -283,73 +221,19 @@ class ProgramContext(object):
   def __init__(
       self,
       options,
-      partial_types,
       autograph_module,
-      uncompiled_modules,
   ):
     self.options = options
-    self.partial_types = partial_types if partial_types else ()
     self.autograph_module = autograph_module
-    self.uncompiled_modules = uncompiled_modules
-
-    self.conversion_order = []
-    self.dependency_cache = {}
-    self.additional_imports = set()
-    self.name_map = {}
-    self.additional_symbols = {}
 
   @property
   def required_imports(self):
     """Returns a block containing all imports required by the converted code."""
     # TODO(mdan): Check that these don't clobber one another.
-    return '\n'.join(config.COMPILED_IMPORT_STATEMENTS +
-                     tuple(self.additional_imports))
-
-  def new_namer(self, namespace):
-    return naming.Namer(namespace, self.options.recursive, self.name_map,
-                        self.partial_types)
+    return '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
 
-  def update_name_map(self, namer):
-    """Updates renamed_calls based on the recent activity from the namer.
 
-    Whenever we convert a new entity, any references to other entities are being
-    renamed to match their soon-to-be-converted counterparts. The namer keeps
-    track of these renames. When conversion is complete, we copy those renames
-    so that when those referenced entities are being converted, their new name
-    matches.
-
-    Args:
-      namer: naming.Namer
-
-    Raises:
-      ValueError: when an entity was renamed twice and to different names.
-    """
-    # TODO(mdan): Have call_trees do this directly.
-    # This is done so indirectly, via the namer, for historic reasons. But
-    # now we can have the converter that does the rename record the new name
-    # as well and skip this step altogether.
-    for o, name in namer.renamed_calls.items():
-      if o in self.name_map:
-        if self.name_map[o] != name:
-          raise ValueError(
-              'Calls to %s were converted using multiple names (%s). This is '
-              'possible when an entity with one of these names already '
-              'existed. To fix, avoid using any of these names.' %
-              (o, (name, self.name_map[o])))
-      else:
-        self.name_map[o] = name
-
-  def add_symbol(self, name, value):
-    if name in self.additional_symbols:
-      assert self.additional_symbols[name] is value
-    self.additional_symbols[name] = value
-
-  def add_to_cache(self, original_entity, converted_ast):
-    self.conversion_order.append(original_entity)
-    self.dependency_cache[original_entity] = converted_ast
-
-
-class EntityContext(object):
+class EntityContext(transformer.Context):
   """Tracks the conversion of a single entity.
 
   This object is mutable, and is updated during conversion. Not thread safe.
@@ -361,8 +245,8 @@ class EntityContext(object):
   """
 
   def __init__(self, namer, entity_info, program_ctx):
+    super(EntityContext, self).__init__(entity_info)
     self.namer = namer
-    self.info = entity_info
     self.program = program_ctx
 
 
@@ -374,8 +258,7 @@ class Base(transformer.Base):
   """
 
   def __init__(self, ctx):
-    super(Base, self).__init__(ctx.info)
-    self.ctx = ctx  # Keeping this short because it's used frequently.
+    super(Base, self).__init__(ctx)
 
     self._used = False
     self._ast_depth = 0
@@ -475,13 +358,13 @@ def standard_analysis(node, context, is_initial=False):
   # TODO(mdan): Don't return a node because it's modified by reference.
   graphs = cfg.build(node)
   node = qual_names.resolve(node)
-  node = activity.resolve(node, context.info, None)
-  node = reaching_definitions.resolve(node, context.info, graphs, AnnotatedDef)
-  node = liveness.resolve(node, context.info, graphs)
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, context.info)
+  node = activity.resolve(node, context, None)
+  node = reaching_definitions.resolve(node, context, graphs, AnnotatedDef)
+  node = liveness.resolve(node, context, graphs)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, context)
   # This second call allows resolving first-order class attributes.
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
   if is_initial:
     anno.dup(
         node,
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
index 864ea6c7d2b891cd1f21f4b1c83f66949cd6ab9b..938569b8e00d4ed080907995628212c38228f400 100644
--- a/tensorflow/python/autograph/core/converter_test.py
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for lists module."""
+"""Tests for converter module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import weakref
-
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import anno
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.platform import test
 
 
@@ -31,34 +31,30 @@ class TestConverter(converter.Base):
   pass
 
 
-class ConversionOptionsTest(test.TestCase):
-
-  def test_should_strip_weakrefs(self):
-    def test_fn():
-      pass
-
-    def weak_test_fn_a():
-      pass
-
-    def weak_test_fn_b():
-      pass
-
-    def weak_test_fn_c():
-      pass
+class ConversionOptionsTest(converter_testing.TestCase):
 
-    wr_a = weakref.ref(weak_test_fn_a)
-    # Create an extra weakref to check whether the existence of multiple weak
-    # references influences the process.
-    _ = weakref.ref(weak_test_fn_b)
-    wr_b = weakref.ref(weak_test_fn_b)
-    _ = weakref.ref(weak_test_fn_c)
+  def test_to_ast(self):
+    opts = converter.ConversionOptions()
+    opts_ast = opts.to_ast()
 
-    opts = converter.ConversionOptions(strip_decorators=(test_fn, wr_a, wr_b))
-
-    self.assertTrue(opts.should_strip(test_fn))
-    self.assertTrue(opts.should_strip(weak_test_fn_a))
-    self.assertTrue(opts.should_strip(weak_test_fn_b))
-    self.assertFalse(opts.should_strip(weak_test_fn_c))
+    template = '''
+    def test_fn():
+      return opts_ast
+    '''
+    opts_packed = templates.replace(template, opts_ast=opts_ast)
+
+    reparsed, _ = compiler.ast_to_object(opts_packed)
+    reparsed.__dict__['ag__'] = self.make_fake_mod(
+        'fake_ag', converter.ConversionOptions, converter.Feature)
+
+    reparsed_opts = reparsed.test_fn()
+
+    self.assertEqual(opts.recursive, reparsed_opts.recursive)
+    self.assertEqual(opts.force_conversion, reparsed_opts.force_conversion)
+    self.assertEqual(
+        opts.internal_convert_user_code,
+        reparsed_opts.internal_convert_user_code)
+    self.assertEqual(opts.optional_features, reparsed_opts.optional_features)
 
 
 class ConverterBaseTest(converter_testing.TestCase):
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index f1374081d3c6e0dd93c39d331c76404859b2f40a..e2d95b89095fab11228d7a41c5605da5ece6c845 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -26,53 +26,19 @@ import six
 
 from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
-
-def imported_decorator(f):
-  return lambda a: f(a) + 1
-
-
-# TODO(mdan): We should use the real namer here.
-class FakeNamer(object):
-  """A fake namer that uses a global counter to generate unique names."""
-
-  def __init__(self):
-    self.i = 0
-
-  def new_symbol(self, name_root, used):
-    while True:
-      self.i += 1
-      name = '%s%d' % (name_root, self.i)
-      if name not in used:
-        return name
-
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    if inspect_utils.islambda(live_entity):
-      return None, False
-    if owner_type is not None:
-      return None, False
-    return ('renamed_%s' % '_'.join(original_fqn)), True
-
-
-class FakeNoRenameNamer(FakeNamer):
-
-  def compiled_function_name(self, original_fqn, **_):
-    return str(original_fqn), False
+RESULT_OF_MOCK_CONVERTED_CALL = 7
 
 
 class TestCase(test.TestCase):
@@ -95,8 +61,8 @@ class TestCase(test.TestCase):
     self.dynamic_calls = []
     def converted_call(*args):
       """Mock version of api.converted_call."""
-      self.dynamic_calls.append(args)
-      return 7
+      self.dynamic_calls.append(args[3:])  # args only; see api.converted_call
+      return RESULT_OF_MOCK_CONVERTED_CALL
 
     try:
       result, source = compiler.ast_to_object(node, include_source_map=True)
@@ -107,11 +73,13 @@ class TestCase(test.TestCase):
                                    converter.ConversionOptions)
       fake_ag.__dict__.update(operators.__dict__)
       fake_ag.__dict__.update(special_functions.__dict__)
-      fake_ag.__dict__['utils'] = utils
-      fake_ag.__dict__['rewrite_graph_construction_error'] = (
+      fake_ag.ConversionOptions = converter.ConversionOptions
+      fake_ag.Feature = converter.Feature
+      fake_ag.utils = utils
+      fake_ag.rewrite_graph_construction_error = (
           errors.rewrite_graph_construction_error)
-      fake_ag.__dict__['function_scope'] = function_wrapping.function_scope
-      result.__dict__['ag__'] = fake_ag
+      fake_ag.function_scope = function_wrapping.function_scope
+      result.ag__ = fake_ag
       for k, v in namespace.items():
         result.__dict__[k] = v
       yield result
@@ -151,35 +119,20 @@ class TestCase(test.TestCase):
     for k, v in ns.items():
       setattr(module, k, v)
 
-  def prepare(self,
-              test_fn,
-              namespace,
-              namer=None,
-              arg_types=None,
-              owner_type=None,
-              recursive=True,
-              strip_decorators=()):
+  def prepare(self, test_fn, namespace, arg_types=None, recursive=True):
     namespace['ConversionOptions'] = converter.ConversionOptions
 
-    node, source = parser.parse_entity(test_fn)
-    node = node.body[0]
-    if namer is None:
-      namer = FakeNamer()
+    node, source, _ = parser.parse_entity(test_fn)
+    namer = naming.Namer(namespace)
     program_ctx = converter.ProgramContext(
-        options=converter.ConversionOptions(
-            recursive=recursive,
-            strip_decorators=strip_decorators,
-            verbose=True),
-        partial_types=None,
-        autograph_module=None,
-        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
+        options=converter.ConversionOptions(recursive=recursive),
+        autograph_module=None)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file='<fragment>',
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=owner_type)
+        arg_types=arg_types)
     ctx = converter.EntityContext(namer, entity_info, program_ctx)
     origin_info.resolve(node, source, test_fn)
     node = converter.standard_analysis(node, ctx, is_initial=True)
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index 245795c3d2e1c8c33f7de6ee01e17f43433bd410..aa23779dfb5eb5e1441b004475eb6706c64d1e4f 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import enum
 
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.utils import misc
 
@@ -31,27 +30,10 @@ class _NamingStyle(enum.Enum):
 
 
 class Namer(object):
-  """Implementation of the namer interfaces required by various converters.
+  """Symbol name generartor."""
 
-  This implementation performs additional tasks like keeping track of the
-  function calls that have been encountered and replaced with calls to their
-  corresponding compiled counterparts.
-
-  Interfaces currently implemented:
-    * call_trees.FunctionNamer
-    * control_flow.SymbolNamer
-    * side_effect_guards.SymbolNamer
-  """
-
-  def __init__(self, global_namespace, recursive, name_map, partial_types):
+  def __init__(self, global_namespace):
     self.global_namespace = global_namespace
-    self.recursive = recursive
-    self.partial_types = partial_types
-
-    self.renamed_calls = {}
-    if name_map is not None:
-      self.renamed_calls.update(name_map)
-
     self.generated_names = set()
 
   def _as_symbol_name(self, fqn, style=_NamingStyle.SNAKE):
@@ -92,11 +74,8 @@ class Namer(object):
     elif style == _NamingStyle.SNAKE:
       return '_'.join(pieces)
 
-  def compiled_class_name(self, original_fqn, live_entity=None):
-    """See call_trees.FunctionNamer.compiled_class_name."""
-    if live_entity is not None and live_entity in self.renamed_calls:
-      return self.renamed_calls[live_entity]
-
+  def class_name(self, original_fqn):
+    """Returns the name of a converted class."""
     canonical_name = self._as_symbol_name(
         original_fqn, style=_NamingStyle.CAMEL)
     new_name_root = 'Tf%s' % canonical_name
@@ -105,30 +84,11 @@ class Namer(object):
     while new_name in self.global_namespace:
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
-
     self.generated_names.add(new_name)
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     return new_name
 
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    """See call_trees.FunctionNamer.compiled_function_name."""
-    if not self.recursive:
-      return None, False
-
-    if (live_entity is not None and inspect_utils.islambda(live_entity)):
-      return None, False
-
-    if owner_type is not None and owner_type not in self.partial_types:
-      # Members are not renamed when part of an entire converted class.
-      return None, False
-
-    if live_entity is not None and live_entity in self.renamed_calls:
-      return self.renamed_calls[live_entity], True
-
+  def function_name(self, original_fqn):
+    """Returns the name of a converted function."""
     canonical_name = self._as_symbol_name(
         original_fqn, style=_NamingStyle.SNAKE)
     new_name_root = 'tf__%s' % canonical_name
@@ -137,12 +97,8 @@ class Namer(object):
     while new_name in self.global_namespace:
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
-
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
-
-    return new_name, True
+    return new_name
 
   def new_symbol(self, name_root, reserved_locals):
     """See control_flow.SymbolNamer.new_symbol."""
diff --git a/tensorflow/python/autograph/core/naming_test.py b/tensorflow/python/autograph/core/naming_test.py
index cc8c4314a700ac43ff5d21ad32706a0c3d5be0f5..49526ed77f34f38e74b88bccdef3bc029d146603 100644
--- a/tensorflow/python/autograph/core/naming_test.py
+++ b/tensorflow/python/autograph/core/naming_test.py
@@ -24,64 +24,47 @@ from tensorflow.python.platform import test
 
 class NamerTest(test.TestCase):
 
-  def test_compiled_function_name_tracks_names(self):
-    def bar():
-      pass
-
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo'))
-    self.assertEqual(('tf__bar', True), namer.compiled_function_name(
-        'bar', bar))
-    self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls)
+  def test_function_name_tracks_names(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo', namer.function_name('foo'))
+    self.assertEqual('tf__bar', namer.function_name('bar'))
     self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names)
 
-  def test_compiled_function_name_consistent(self):
-    def foo():
-      pass
-
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
-        'foo', foo))
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
-        'foo', foo))
-
-  def test_compiled_function_name_unsanitized_fqn(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo_bar', True),
-                     namer.compiled_function_name('foo.bar'))
-    self.assertEqual(('tf__foo_bar_baz', True), namer.compiled_function_name(
-        ('foo.bar', 'baz')))
+  def test_function_name_consistent(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo', namer.function_name('foo'))
+    self.assertEqual('tf__foo', namer.function_name('foo'))
 
-  def test_compiled_class_name_basic(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual('TfFooBar', namer.compiled_class_name(('foo', 'Bar')))
+  def test_function_name_unsanitized_fqn(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo_bar', namer.function_name('foo.bar'))
+    self.assertEqual('tf__foo_bar_baz', namer.function_name(('foo.bar', 'baz')))
 
-  def test_compiled_class_name_unsanitized_fqn(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual('TfFooBarBaz',
-                     namer.compiled_class_name(('foo.bar', 'Baz')))
+  def test_class_name_basic(self):
+    namer = naming.Namer({})
+    self.assertEqual('TfFooBar', namer.class_name(('foo', 'Bar')))
 
-  def test_compiled_function_name_avoids_global_conflicts(self):
-    def foo():
-      pass
+  def test_class_name_unsanitized_fqn(self):
+    namer = naming.Namer({})
+    self.assertEqual('TfFooBarBaz', namer.class_name(('foo.bar', 'Baz')))
 
-    namer = naming.Namer({'tf__foo': 1}, True, None, ())
-    self.assertEqual(('tf__foo_1', True),
-                     namer.compiled_function_name('foo', foo))
+  def test_function_name_avoids_global_conflicts(self):
+    namer = naming.Namer({'tf__foo': 1})
+    self.assertEqual('tf__foo_1', namer.function_name('foo'))
 
   def test_new_symbol_tracks_names(self):
-    namer = naming.Namer({}, True, None, ())
+    namer = naming.Namer({})
     self.assertEqual('temp', namer.new_symbol('temp', set()))
     self.assertItemsEqual(('temp',), namer.generated_names)
 
   def test_new_symbol_avoids_duplicates(self):
-    namer = naming.Namer({}, True, None, ())
+    namer = naming.Namer({})
     self.assertEqual('temp', namer.new_symbol('temp', set()))
     self.assertEqual('temp_1', namer.new_symbol('temp', set()))
     self.assertItemsEqual(('temp', 'temp_1'), namer.generated_names)
 
   def test_new_symbol_avoids_conflicts(self):
-    namer = naming.Namer({'temp': 1}, True, None, ())
+    namer = naming.Namer({'temp': 1})
     # temp is reserved in the global namespace
     self.assertEqual('temp_1', namer.new_symbol('temp', set()))
     # temp_2 is reserved in the local namespace
diff --git a/tensorflow/python/autograph/core/unsupported_features_checker.py b/tensorflow/python/autograph/core/unsupported_features_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccbb76fea1c8c9068b1bc1f64cc0f00a0ca2e35
--- /dev/null
+++ b/tensorflow/python/autograph/core/unsupported_features_checker.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkers for detecting unsupported Python features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.python.autograph.pyct import errors
+
+
+class UnsupportedFeaturesChecker(gast.NodeTransformer):
+  """Quick check for Python features we know we don't support.
+
+  Any features detected will cause AutoGraph to not compile a function.
+  """
+
+  # TODO(b/124103128): Implement support for `global` statements
+  def visit_Global(self, node):
+    raise errors.AutoGraphError(
+        'The global keyword is not yet supported.')
+
+  def visit_Nonlocal(self, node):
+    raise errors.AutoGraphError(
+        'The nonlocal keyword is not yet supported.')
+
+  # These checks could potentially be replaced with inspect.isgeneratorfunction
+  # to avoid a getsource/parse/ast-walk round trip.
+  def visit_Yield(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+  def visit_YieldFrom(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+
+def verify(node):
+  UnsupportedFeaturesChecker().visit(node)
+
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 122ea1726b60c641d702fe737db524205b70e389..67fcfb96c9a1adef2d9d8d73f6c88a3dfc4ad27a 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -18,30 +18,40 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import copy
 import functools
+import os
+import pdb
 import sys
 
 from enum import Enum
 
 # pylint:disable=g-bad-import-order
 import numpy as np
+import six
 # pylint:enable=g-bad-import-order
 
 
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import conversion
 from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
+
+def is_autograph_strict_conversion_mode():
+  return int(os.environ.get('AUTOGRAPH_STRICT_CONVERSION', '0')) > 0
+
+
 # TODO(mdan): Properly document the type hints.
 # TODO(mdan): Reduce the type hint information to (module, type).
 # (currently we require (module + class name, type))
@@ -52,7 +62,6 @@ from tensorflow.python.util.tf_export import tf_export
 # to write converter.
 def convert(
     recursive=False,
-    verbose=converter.Verbosity.BRIEF,
     optional_features=converter.Feature.ALL):
   """Decorator that compiles a function to use TensorFlow ops.
 
@@ -64,7 +73,6 @@ def convert(
   Args:
     recursive: bool, whether to recursively convert any functions or classes
       that the converted function may use.
-    verbose: converter.Verbosity, the level of verbosity.
     optional_features: converted.Feature, allows toggling optional or
       experimental features. When set to None, only the core features are
       enabled.
@@ -83,10 +91,9 @@ def convert(
           f, None,
           converter.ConversionOptions(
               recursive=recursive,
-              verbose=verbose,
               force_conversion=True,
               optional_features=optional_features,
-          ), *args, **kwargs)
+          ), args, kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
 
@@ -112,6 +119,12 @@ class RunMode(Enum):
   PY_FUNC = 2
 
 
+def do_not_convert_internal(f):
+  """Decorator that marks internal functions which do not need conversion."""
+  setattr(f, '__ag_compiled', True)
+  return f
+
+
 def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
   """Decorator that suppresses the conversion of a function.
 
@@ -150,17 +163,52 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
     else:
       raise ValueError('unknown value for run_as: %s' % run_as)
 
-    # Sometimes the decorator is just desugared, making it impossible to detect.
-    # This attribute makes detection easier.
     setattr(wrapper, '__ag_compiled', True)
     return wrapper
 
   return decorator
 
 
-def converted_call(f, owner, options, *args, **kwargs):
+def _call_unconverted(f, args, kwargs):
+  """Calls the original function without converting with AutoGraph."""
+  if inspect_utils.istfmethodtarget(f):
+    return f.__self__.call(args, kwargs)
+
+  return f(*args, **kwargs)
+
+
+def _is_known_loaded_type(f, module_name, entity_name):
+  """Tests whether the function or method is an instance of a known type."""
+  if (module_name not in sys.modules or
+      not hasattr(sys.modules[module_name], entity_name)):
+    return False
+  type_entity = getattr(sys.modules[module_name], entity_name)
+  if isinstance(f, type_entity):
+    # The method if of this type. Example:
+    #
+    # o = ClassType()
+    # function(o.method)()
+    return True
+  if tf_inspect.ismethod(f):
+    f = six.get_unbound_function(f)
+    # The the unbound method if of this type. Example:
+    #
+    # class ClassType:
+    #   @function
+    #   def method(self):
+    #     ...
+    # o = ClassType()
+    # o.method()
+    if isinstance(f, type_entity):
+      return True
+  return False
+
+
+def converted_call(f, owner, options, args, kwargs):
   """Compiles a function call inline. For internal use only."""
-  logging.vlog(logging.DEBUG, 'Converted call: %s; owner: %s', f, owner)
+  logging.log(1,
+              'Converted call: %s; owner: %s\n    args: %s\n    kwargs: %s\n',
+              f, owner, args, kwargs)
 
   if owner is not None:
     if not isinstance(f, str):
@@ -179,118 +227,128 @@ def converted_call(f, owner, options, *args, **kwargs):
   if inspect_utils.isbuiltin(f):
     return py_builtins.overload_of(f)(*args, **kwargs)
 
-  # TODO(mdan): This needs cleanup.
-  # In particular, we may want to avoid renaming functions altogether.
-  if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
-
-    # TODO(mdan): This may be inconsistent in certain situations.
-    # If the function had already been annotated with @tf.function, it
-    # may be bound to the incorrect object. It's unclear if those situations
-    # are possible, but if they happen, we need to check if f is bound
-    # to a shim like WeakrefSelf and unpack it.
+  if _is_known_loaded_type(f, 'weakref', 'ref'):
+    logging.log(2, 'Permanently whitelisted: %s: weakref', f)
+    return _call_unconverted(f, args, kwargs)
+
+  # TODO(b/122265385): Remove this bypass.
+  if (_is_known_loaded_type(f, 'wrapt', 'FunctionWrapper') or
+      _is_known_loaded_type(f, 'wrapt', 'BoundFunctionWrapper')):
+    logging.warn(
+        'Entity {} appears to be decorated by wrapt, which is not yet supported'
+        ' by AutoGraph. The function will be called without transformation.'
+        ' You may however apply AutoGraph before the decorator.'.format(f))
+    logging.log(2, 'Permanently whitelisted: %s: wrapt decorated', f)
+    return _call_unconverted(f, args, kwargs)
+
+  # Constructors are permanently whitelisted.
+  # TODO(mdan): Toggle as experimental feature instead.
+  # TODO(b/124016764): Remove this limitation.
+  if tf_inspect.isclass(f):
+    logging.log(2, 'Permanently whitelisted: %s: constructor', f)
+    return _call_unconverted(f, args, kwargs)
+
+  # Other built-in modules are permanently whitelisted.
+  # TODO(mdan): Figure out how to do this consistently for all stdlib modules.
+  # Note: TF linter disallows importing inspect.
+  if any(f in m.__dict__.values()
+         for m in (collections, pdb, copy, tf_inspect._inspect)):  # pylint:disable=protected-access
+    logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f)
+    return _call_unconverted(f, args, kwargs)
 
-    # Args typically include `self`, as required by the conversion process.
-    # When conversion is skipped, `self` is not necessary, because the
-    # original bound method is being executed. This code removes it.
-    if tf_inspect.ismethod(f) and args:
-      f_self = inspect_utils.getmethodself(f)
-      if args[0] is f_self:
-        args = args[1:]
-
-    return f(*args, **kwargs)
+  if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
+    return _call_unconverted(f, args, kwargs)
 
   # internal_convert_user_code is for example turned off when issuing a dynamic
   # call conversion from generated code while in nonrecursive mode. In that
   # case we evidently don't want to recurse, but we still have to convert
   # things like builtins.
   if not options.internal_convert_user_code:
-    return f(*args, **kwargs)
-
-  # Unwrap functools.partial objects
-  # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
-  while isinstance(f, functools.partial):
-    args = f.args + args
-    new_kwargs = {}
-    if f.keywords is not None:
-      new_kwargs.update(f.keywords)
-    new_kwargs.update(kwargs)
-    kwargs = new_kwargs
-    f = f.func
-
-  if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
-    # Regular functions
-    target_entity = f
-    arg_map_target = f
-    f_self = inspect_utils.getmethodself(f)
-
-    # TODO(b/119246461): This may be more elegantly handled using __get__?
-    if f_self is not None:
-      # If this is a method call, it may or may not include self.
-      #
-      # Example when self is included:
-      #   converted_call(to_graph(foo.bar), foo)
-      #
-      # Example when self is not included:
-      #   super(...).foo(args)
-      #
-      if owner is not None and (not args or args[0] is not owner):
-        effective_args = (owner,) + args
+    return _call_unconverted(f, args, kwargs)
+
+  # TODO(mdan): Move this entire block inside to_graph.
+  try:  # Begin of transformation error guards
+
+    # Unwrap functools.partial objects
+    # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
+    while isinstance(f, functools.partial):
+      args = f.args + args
+      new_kwargs = {}
+      if f.keywords is not None:
+        new_kwargs.update(f.keywords)
+      new_kwargs.update(kwargs)
+      kwargs = new_kwargs
+      f = f.func
+
+    if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
+      # Regular functions
+      target_entity = f
+      arg_map_target = f
+      f_self = inspect_utils.getmethodself(f)
+
+      # TODO(b/119246461): This may be more elegantly handled using __get__?
+      if f_self is not None:
+        effective_args = (f_self,) + args
       else:
-        # When the owner is not specified, use the result of
-        # inspect_utils.getmethodclass.
-        # TODO(b/119246461): Make sure an owner is always specified.
-        if not args or args[0] is not f_self:
-          effective_args = (f_self,) + args
-        else:
-          effective_args = (f_self,) + args[1:]
-      partial_types = (f_self,)
-    else:
+        effective_args = args
+
+    elif tf_inspect.isclass(f):
+      # Constructors
+      # Note: Until we support class constructurs, and enable whole-class
+      # conversion with an experimental flag, this branch is dead code.
+      # TODO(mdan): Consider removing unless there is a compelling use case.
+      target_entity = f
+      arg_map_target = f.__init__
       effective_args = args
-      partial_types = ()
-
-  elif tf_inspect.isclass(f):
-    # Constructors
-    target_entity = f
-    arg_map_target = f.__init__
-    effective_args = args
-    partial_types = ()
-
-  elif hasattr(f, '__call__') and hasattr(f, '__class__'):
-    # Callable objects
-    target_entity = f.__call__
-    arg_map_target = f.__call__
-    effective_args = (f,) + args
-    partial_types = (f.__class__,)
-
-  else:
-    raise NotImplementedError('unknown callable type "%s"' % type(f))
-
-  arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
-  arg_types = {}
-  for name, arg in arg_values.items():
-    arg_class = arg.__class__
-    arg_types[name] = (arg_class.__name__, arg_class)
-
-  # When called from within a decorator, this is the only indication that
-  # the function is a method - it appears that the decorator is applied
-  # before the method is bound.
-  if not partial_types:
-    if 'self' in arg_values:
-      if tf_inspect.isclass(arg_values['self'].__class__):
-        partial_types = (arg_values['self'].__class__,)
-    elif 'cls' in arg_values:
-      if tf_inspect.isclass(arg_values['cls']):
-        partial_types = (arg_values['cls'],)
-
-  converted_f = to_graph(
-      target_entity,
-      recursive=options.recursive,
-      arg_values=arg_values,
-      arg_types=arg_types,
-      experimental_optional_features=options.optional_features,
-      experimental_strip_decorators=options.strip_decorators,
-      experimental_verbose=options.verbose,
-      experimental_partial_types=partial_types)
+
+    elif hasattr(f, '__call__') and hasattr(f, '__class__'):
+      # Callable objects
+      target_entity = f.__call__
+      arg_map_target = f.__call__
+      effective_args = (f,) + args
+
+    else:
+      target_entity = f
+      raise NotImplementedError('unknown callable type "%s"' % type(f))
+
+    arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
+    arg_types = {}
+    for name, arg in arg_values.items():
+      arg_class = arg.__class__
+      arg_types[name] = (arg_class.__name__, arg_class)
+
+    converted_f = to_graph(
+        target_entity,
+        recursive=options.recursive,
+        arg_values=arg_values,
+        arg_types=arg_types,
+        experimental_optional_features=options.optional_features)
+
+    if logging.has_verbosity(2):
+      logging.log(2, 'Defaults of %s : %s', converted_f,
+                  converted_f.__defaults__)
+      callargs = tf_inspect.getcallargs(converted_f, *effective_args, **kwargs)
+      formatted_callargs = '\n'.join(
+          '    {}: {}'.format(k, v) for k, v in callargs.items())
+      logging.log(2, 'Calling %s with\n%s\n', converted_f, formatted_callargs)
+
+  # TODO(mdan): Reduce this list.
+  except (errors.AutoGraphError, AssertionError, AttributeError, IndexError,
+          KeyError, NameError, NotImplementedError, SyntaxError, TypeError,
+          ValueError, IOError) as e:
+
+    logging.log(1, 'Error transforming entity %s', target_entity, exc_info=True)
+
+    if is_autograph_strict_conversion_mode():
+      raise
+
+    logging.warn(
+        'Entity %s could not be transformed and will be staged without change.'
+        ' Error details can be found in the logs when running with the env'
+        ' variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the'
+        ' AutoGraph team. Cause: %s', target_entity, e)
+
+    return _call_unconverted(f, args, kwargs)
 
   result = converted_f(*effective_args, **kwargs)
 
@@ -324,10 +382,7 @@ def to_graph(entity,
              recursive=True,
              arg_values=None,
              arg_types=None,
-             experimental_optional_features=converter.Feature.ALL,
-             experimental_strip_decorators=None,
-             experimental_verbose=converter.Verbosity.BRIEF,
-             experimental_partial_types=None):
+             experimental_optional_features=converter.Feature.ALL):
   """Converts a Python entity into a TensorFlow graph.
 
   Also see: `tf.autograph.to_code`, `tf.function`.
@@ -383,14 +438,6 @@ def to_graph(entity,
     experimental_optional_features: `None`, a tuple of, or a single
       `tf.autograph.experimental.Feature` value. Controls the use of
       optional features in the conversion process.
-    experimental_strip_decorators: A tuple specifying decorators that should be
-      excluded from the compiled output. By default, when converting a function
-      before the decorators are applied, the compiled output will include those
-      decorators.
-    experimental_verbose: The level of printing verbosity to use, as a
-      `tf.autograph.experimental.Verbosity` value.
-    experimental_partial_types: A `set` of `type` values, reserved for internal
-      use.
 
   Returns:
     Same as `entity`, the converted Python function or class.
@@ -398,66 +445,62 @@ def to_graph(entity,
   Raises:
     ValueError: If the entity could not be converted.
   """
-  if experimental_strip_decorators is None:
-    experimental_strip_decorators = ()
-  experimental_strip_decorators += (convert, do_not_convert, converted_call)
-
-  program_ctx = converter.ProgramContext(
-      options=converter.ConversionOptions(
-          recursive=recursive,
-          verbose=experimental_verbose,
-          strip_decorators=experimental_strip_decorators,
-          optional_features=experimental_optional_features),
-      partial_types=experimental_partial_types,
-      autograph_module=tf_inspect.getmodule(to_graph),
-      uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  _, name, namespace = conversion.entity_to_graph(entity, program_ctx,
-                                                  arg_values, arg_types)
-
-  nodes = []
-  for dep in reversed(program_ctx.conversion_order):
-    nodes.extend(program_ctx.dependency_cache[dep])
-
-  compiled_module, _ = compiler.ast_to_object(
-      nodes,
-      source_prefix=program_ctx.required_imports,
-      include_source_map=True)
-
-  # The compiled code should see everything the entry entity saw.
-  # TODO(mdan): This might not work well if the call tree spans modules?
-  for key, val in namespace.items():
-    # Avoid overwriting entities that have been transformed.
-    if key not in compiled_module.__dict__:
-      compiled_module.__dict__[key] = val
-  for key, val in program_ctx.additional_symbols.items():
-    if key not in compiled_module.__dict__:
-      compiled_module.__dict__[key] = val
-  compiled = getattr(compiled_module, name)
-
-  if tf_inspect.isfunction(entity):
-    compiled.__defaults__ = entity.__defaults__
-
-  if hasattr(compiled, '__globals__'):
-    # Remove self to avoid circular references. This will probably only work
-    # so long as the function is not reentrant.
-    del compiled.__globals__[name]
-
-  # Need this so the source_mapping attribute is available for the context
-  # manager to access for runtime errors.
-  #
-  # Note that compiler.ast_to_object attaches the source map 'ag_source_map__'
-  # symbol to the compiled module.
-  # TODO(mdan): Record this statically in the generated code.
-  # TODO(mdan): Rename this attribute to 'autograph_info__'
-  source_map_attribute_name = 'ag_source_map'
-  if getattr(compiled, source_map_attribute_name, None) is not None:
-    raise ValueError('cannot convert %s because is has an attribute '
-                     '"%s", which is reserved for AutoGraph.' %
-                     (compiled, source_map_attribute_name))
-  setattr(compiled, source_map_attribute_name,
-          compiled_module.__dict__['ag_source_map__'])
-
-  return compiled
+  try:
+    program_ctx = converter.ProgramContext(
+        options=converter.ConversionOptions(
+            recursive=recursive,
+            optional_features=experimental_optional_features),
+        autograph_module=tf_inspect.getmodule(to_graph))
+    nodes, name, namespace = conversion.entity_to_graph(entity, program_ctx,
+                                                        arg_values, arg_types)
+
+    compiled_module, _ = compiler.ast_to_object(
+        nodes,
+        source_prefix=program_ctx.required_imports,
+        include_source_map=True)
+
+    # The compiled code should see everything the entry entity saw.
+    # TODO(mdan): This might not work well if the call tree spans modules?
+    for key, val in namespace.items():
+      # Avoid overwriting entities that have been transformed.
+      if key not in compiled_module.__dict__:
+        compiled_module.__dict__[key] = val
+    compiled = getattr(compiled_module, name)
+
+    if hasattr(entity, '__defaults__'):
+      logging.log(3, 'Default args mapping: %s has: %s', entity,
+                  entity.__defaults__)
+      compiled.__defaults__ = entity.__defaults__
+    else:
+      logging.log(3, 'Default args mapping: %s has no __defaults__', entity)
+
+    logging.log(3, 'Namespace of %s includes: %s', compiled,
+                compiled_module.__dict__.keys())
+
+    if hasattr(compiled, '__globals__'):
+      # Remove self to avoid circular references. This will probably only work
+      # so long as the function is not reentrant.
+      del compiled.__globals__[name]
+
+    # Need this so the source_mapping attribute is available for the context
+    # manager to access for runtime errors.
+    #
+    # Note that compiler.ast_to_object attaches the source map 'ag_source_map__'
+    # symbol to the compiled module.
+    # TODO(mdan): Record this statically in the generated code.
+    # TODO(mdan): Rename this attribute to 'autograph_info__'
+    source_map_attribute_name = 'ag_source_map'
+    if getattr(compiled, source_map_attribute_name, None) is not None:
+      # TODO(znado): change input problem errors into TransformError
+      raise ValueError('cannot convert %s because is has an attribute '
+                       '"%s", which is reserved for AutoGraph.' %
+                       (compiled, source_map_attribute_name))
+    setattr(compiled, source_map_attribute_name,
+            compiled_module.__dict__['ag_source_map__'])
+
+    return compiled
+  except (ValueError, AttributeError, KeyError, NameError, AssertionError) as e:
+    errors.report_internal_error(entity, e)
 
 
 @tf_export('autograph.to_code')
@@ -466,8 +509,7 @@ def to_code(entity,
             arg_values=None,
             arg_types=None,
             indentation='  ',
-            experimental_optional_features=converter.Feature.ALL,
-            experimental_partial_types=None):
+            experimental_optional_features=converter.Feature.ALL):
   """Similar to `to_graph`, but returns Python source code as a string.
 
   Also see: `tf.autograph.to_graph`.
@@ -490,8 +532,6 @@ def to_code(entity,
     experimental_optional_features: `None`, a tuple of, or a single
       `tf.autograph.experimental.Feature` value. Controls the use of
       optional features in the conversion process.
-    experimental_partial_types: A `set` of `type` values, reserved for internal
-      use.
 
   Returns:
     The converted code as string.
@@ -499,16 +539,11 @@ def to_code(entity,
   program_ctx = converter.ProgramContext(
       options=converter.ConversionOptions(
           recursive=recursive,
-          verbose=converter.Verbosity.BRIEF,
-          strip_decorators=(convert, do_not_convert, converted_call),
           optional_features=experimental_optional_features),
-      partial_types=experimental_partial_types,
-      autograph_module=tf_inspect.getmodule(to_graph),
-      uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  conversion.entity_to_graph(entity, program_ctx, arg_values, arg_types)
-
-  code = '\n'.join(
-      compiler.ast_to_source(program_ctx.dependency_cache[dep], indentation)
-      for dep in reversed(program_ctx.conversion_order))
+      autograph_module=tf_inspect.getmodule(to_graph))
+  nodes, _, _ = conversion.entity_to_graph(entity, program_ctx, arg_values,
+                                           arg_types)
+
+  code = compiler.ast_to_source(nodes, indentation)
 
   return program_ctx.required_imports + '\n\n' + code
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index d5561ba8249f539e720fa1ecb5800b76c61a8c2f..79a29ca6edf8c15eea4a4f60bc30d49ce4a4cb59 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
 import gc
 
@@ -26,6 +27,8 @@ import numpy as np
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
+from tensorflow.python.autograph.pyct import errors
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
@@ -39,6 +42,9 @@ from tensorflow.python.util import tf_inspect
 tf = utils.fake_tf()
 
 
+testing_global_numeric = 2
+
+
 class TestResource(str):
   pass
 
@@ -46,7 +52,7 @@ class TestResource(str):
 class ApiTest(test.TestCase):
 
   @test_util.run_deprecated_v1
-  def test_decorator_recurses(self):
+  def test_decorator_recursive(self):
 
     class TestClass(object):
 
@@ -69,7 +75,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_does_not_recurse(self):
+  def test_decorator_not_recursive(self):
 
     class TestClass(object):
 
@@ -90,7 +96,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_graph(self):
+  def test_convert_then_do_not_convert_graph(self):
 
     class TestClass(object):
 
@@ -105,14 +111,13 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_py_func(self):
+  def test_convert_then_do_not_convert_py_func(self):
 
     class TestClass(object):
 
@@ -132,11 +137,10 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_decorator_calls_decorated(self):
@@ -192,18 +196,17 @@ class ApiTest(test.TestCase):
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
           x //= api.converted_call(self.called_member, None,
-                                   converter.ConversionOptions(), self, a)
+                                   converter.ConversionOptions(), (a,), {})
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant([2, 4]), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
-    x = api.converted_call(range, None, converter.ConversionOptions(), 3)
+    x = api.converted_call(range, None, converter.ConversionOptions(), (3,), {})
     self.assertEqual((0, 1, 2), tuple(x))
 
   def test_converted_call_function(self):
@@ -213,10 +216,9 @@ class ApiTest(test.TestCase):
         return -x
       return x
 
-    with self.cached_session() as sess:
-      x = api.converted_call(test_fn, None, converter.ConversionOptions(),
-                             constant_op.constant(-1))
-      self.assertEqual(1, self.evaluate(x))
+    x = api.converted_call(test_fn, None, converter.ConversionOptions(),
+                           (constant_op.constant(-1),), {})
+    self.assertEqual(1, self.evaluate(x))
 
   @test_util.run_v1_only('b/120545219')
   def test_converted_call_functools_partial(self):
@@ -227,16 +229,14 @@ class ApiTest(test.TestCase):
       return x, y, z
 
     x = api.converted_call(
-        functools.partial(test_fn, constant_op.constant(-1), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+        functools.partial(test_fn, constant_op.constant(-1), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
     x = api.converted_call(
         functools.partial(
-            functools.partial(test_fn, constant_op.constant(-1)), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+            functools.partial(test_fn, constant_op.constant(-1)), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
@@ -259,11 +259,54 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None, converter.ConversionOptions(),
+                           (), {})
+    self.assertEqual(1, self.evaluate(x))
+
+  def test_converted_call_method_as_object_attribute(self):
+
+    class AnotherClass(object):
+
+      def __init__(self):
+        self.another_class_attr = constant_op.constant(1)
+
+      def method(self):
+        if self.another_class_attr > 0:
+          return self.another_class_attr + 1
+        return self.another_class_attr + 10
+
+    class TestClass(object):
+
+      def __init__(self, another_obj_method):
+        self.another_obj_method = another_obj_method
+
+    obj = AnotherClass()
+    tc = TestClass(obj.method)
+
+    x = api.converted_call('another_obj_method', tc,
+                           converter.ConversionOptions(), (), {})
+    self.assertEqual(self.evaluate(x), 2)
+
+  def test_converted_call_method_converts_recursively(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def other_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+      def test_method(self):
+        return self.other_method()
+
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None,
+                           converter.ConversionOptions(recursive=True), (), {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -277,11 +320,10 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(TestClass.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(TestClass.test_method, None,
+                           converter.ConversionOptions(), (tc,), {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -295,11 +337,11 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc, None, converter.ConversionOptions())
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc, None, converter.ConversionOptions(), (), {})
+    self.assertEqual(1, self.evaluate(x))
 
+  @test_util.run_deprecated_v1
   def test_converted_call_constructor(self):
 
     class TestClass(object):
@@ -312,27 +354,45 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
-                              constant_op.constant(-1))
-      # tc is now a converted object.
-      x = tc.test_method()
-      self.assertEqual(1, self.evaluate(x))
+    tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
+                            (constant_op.constant(-1),), {})
+    # tc is still a TestClass - constructors are whitelisted.
+    # TODO(b/124016764): Support this use case.
+    # The error below is specific to the `if` statement not being converted.
+    with self.assertRaisesRegex(
+        TypeError, 'Using a `tf.Tensor` as a Python `bool`'):
+      tc.test_method()
 
   def test_converted_call_already_converted(self):
 
     def f(x):
       return x == 0
 
-    with self.cached_session() as sess:
-      x = api.converted_call(f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+    converted_f = api.to_graph(
+        f, experimental_optional_features=converter.Feature.ALL)
+    x = api.converted_call(converted_f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+  def test_converted_call_then_already_converted_dynamic(self):
+
+    @api.convert()
+    def g(x):
+      if x > 0:
+        return x
+      else:
+        return -x
 
-      converted_f = api.to_graph(f)
-      x = api.converted_call(converted_f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    def f(g, x):
+      return g(x)
+
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (g, constant_op.constant(1)), {})
+    self.assertEqual(self.evaluate(x), 1)
 
   @test_util.run_deprecated_v1
   def test_converted_call_no_user_code(self):
@@ -345,10 +405,10 @@ class ApiTest(test.TestCase):
     # f should not be converted, causing len to error out.
     with self.assertRaisesRegexp(Exception,
                                  'object of type \'Tensor\' has no len()'):
-      api.converted_call(f, None, opts, constant_op.constant([0]))
+      api.converted_call(f, None, opts, (constant_op.constant([0]),), {})
 
     # len on the other hand should work fine.
-    x = api.converted_call(len, None, opts, constant_op.constant([0]))
+    x = api.converted_call(len, None, opts, (constant_op.constant([0]),), {})
     # The constant has static shape so the result is a primitive not a Tensor.
     self.assertEqual(x, 1)
 
@@ -361,13 +421,12 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call(model.call, None, opts,
-                           constant_op.constant([[0.0]]), training=True)
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
-  def test_converted_call_whitelisted_method_extra_self(self):
+  def test_converted_call_whitelisted_method_via_owner(self):
 
     opts = converter.ConversionOptions()
 
@@ -375,27 +434,29 @@ class ApiTest(test.TestCase):
         core.Dense(2)
     ])
 
-    x = api.converted_call(model.call, None, opts,
-                           model, constant_op.constant([[0.0]]), training=True)
+    x = api.converted_call('call', model, opts,
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
-  def test_converted_call_whitelisted_method_via_owner(self):
+  def test_converted_call_namedtuple(self):
 
     opts = converter.ConversionOptions()
 
-    model = sequential.Sequential([
-        core.Dense(2)
-    ])
+    x = api.converted_call(collections.namedtuple, None, opts,
+                           ('TestNamedtuple', ('a', 'b')), {})
 
-    x = api.converted_call('call', model, opts,
-                           constant_op.constant([[0.0]]), training=True)
+    self.assertTrue(inspect_utils.isnamedtuple(x))
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+  def test_converted_call_namedtuple_via_collections(self):
+
+    opts = converter.ConversionOptions()
+
+    x = api.converted_call('namedtuple', collections, opts, ('TestNamedtuple',
+                                                             ('a', 'b')), {})
+
+    self.assertTrue(inspect_utils.isnamedtuple(x))
 
   def test_converted_call_lambda(self):
 
@@ -403,11 +464,10 @@ class ApiTest(test.TestCase):
 
     l = lambda x: x == 0
 
-    x = api.converted_call(l, None, opts, constant_op.constant(0))
+    x = api.converted_call(l, None, opts, (constant_op.constant(0),), {})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual(True, self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(True, self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_to_graph_basic(self):
@@ -439,6 +499,45 @@ class ApiTest(test.TestCase):
       x = compiled_fn(constant_op.constant([4, 8]))
       self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
+  def test_to_graph_with_globals(self):
+
+    def test_fn(x):
+      global testing_global_numeric
+      testing_global_numeric = x + testing_global_numeric
+      return testing_global_numeric
+
+    # TODO(b/122368197)
+    with self.assertRaisesRegex(
+        errors.AutoGraphError, 'global keyword is not yet supported'):
+      api.to_graph(test_fn)
+
+  def test_to_graph_with_kwargs_clashing_converted_call(self):
+
+    def called_fn(**kwargs):
+      return kwargs['f'] + kwargs['owner']
+
+    def test_fn():
+      # These arg names intentionally match converted_call's
+      return called_fn(f=1, owner=2)
+
+    compiled_fn = api.to_graph(test_fn)
+
+    self.assertEqual(compiled_fn(), 3)
+
+  def test_to_graph_with_kwargs_clashing_unconverted_call(self):
+
+    @api.do_not_convert()
+    def called_fn(**kwargs):
+      return kwargs['f'] + kwargs['owner']
+
+    def test_fn():
+      # These arg names intentionally match _call_unconverted's
+      return called_fn(f=1, owner=2)
+
+    compiled_fn = api.to_graph(test_fn)
+
+    self.assertEqual(compiled_fn(), 3)
+
   def test_to_code_basic(self):
 
     def test_fn(x, s):
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 0ca84b1f7a488e28f1900cb3ba76577814562094..bb9464c3361fa3507366bba15d9a937ae392cb4c 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import imp
+import unittest
 
 import gast
 
@@ -33,7 +34,6 @@ from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.converters import conditional_expressions
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.converters import control_flow
-from tensorflow.python.autograph.converters import decorators
 from tensorflow.python.autograph.converters import directives
 from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.converters import function_scopes
@@ -44,18 +44,22 @@ from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.converters import slices
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.core import errors
+from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import naming
+from tensorflow.python.autograph.core import unsupported_features_checker
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -80,25 +84,33 @@ def is_whitelisted_for_graph(o):
     m = functools
   else:
     m = tf_inspect.getmodule(o)
-  if not hasattr(m, '__name__'):
-    logging.vlog(1, '%s is NOT whitelisted for graph: unknown module name', o)
-    return False
 
-  for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
-    if m.__name__.startswith(prefix):
-      logging.vlog(1, '%s is whitelisted: name starts with "%s"', o, prefix)
+  if hasattr(m, '__name__'):
+    # Builtins typically have unnamed modules.
+    for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
+      if m.__name__.startswith(prefix):
+        logging.log(2, 'Whitelisted: %s: name starts with "%s"', o, prefix)
+        return True
+
+    # Temporary -- whitelist tensorboard modules.
+    # TODO(b/122731813): Remove.
+    if m.__name__ == 'tensorboard' or '.tensorboard' in m.__name__:
+      logging.log(2, 'Whitelisted: %s: name contains "tensorboard"', o)
       return True
 
-  if hasattr(o, 'autograph_info__'):
+  if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'):
+    logging.log(2, 'Whitelisted: %s: already converted', o)
     return True
 
-  if (not inspect_utils.isweakrefself(o) and not tf_inspect.isclass(o) and
-      hasattr(o, '__call__') and hasattr(o, '__class__')):
+  if hasattr(o, '__call__'):
     # Callable objects: whitelisted if their __call__ method is.
-    retval = is_whitelisted_for_graph(o.__call__)
-    logging.vlog(1, '%s is whitelisted: object __call__ whitelisted', o)
-    return retval
+    # The type check avoids infinite recursion around the __call__ method
+    # of function objects.
+    if (type(o) != type(o.__call__)) and is_whitelisted_for_graph(o.__call__):  # pylint: disable=unidiomatic-typecheck
+      logging.log(2, 'Whitelisted: %s: object __call__ whitelisted', o)
+      return True
 
+  owner_class = None
   if tf_inspect.ismethod(o):
     # Methods of whitelisted classes are also whitelisted, even if they are
     # bound via user subclasses.
@@ -117,10 +129,14 @@ def is_whitelisted_for_graph(o):
 
     owner_class = inspect_utils.getmethodclass(o)
     if owner_class is not None:
+      if issubclass(owner_class, unittest.TestCase):
+        logging.log(2, 'Whitelisted: %s: method of TestCase subclass', o)
+        return True
+
       owner_class = inspect_utils.getdefiningclass(o, owner_class)
       if is_whitelisted_for_graph(owner_class):
-        logging.vlog(1, '%s is whitelisted: owner is whitelisted %s', o,
-                     owner_class)
+        logging.log(2, 'Whitelisted: %s: owner is whitelisted %s', o,
+                    owner_class)
         return True
 
   if inspect_utils.isnamedtuple(o):
@@ -128,14 +144,14 @@ def is_whitelisted_for_graph(o):
     # because they don't expose source code. But we assume they are safe for
     # graph mode since they are just containers.
     if tf_inspect.isclass(o) and len(o.__bases__) > 1:
-      logging.log_first_n(
-          logging.level_warning(),
-          'Entity {} looks like a namedtuple subclass. If it has any custom'
-          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
-    logging.vlog(1, '%s is whitelisted: named tuple', o)
+      logging.warn(
+          'Entity {} looks like a namedtuple subclass. Its constructor will'
+          ' not be converted by AutoGraph, but if it has any custom methods,'
+          ' those will be.'.format(o), 1)
+    logging.log(2, 'Whitelisted: %s: named tuple', o)
     return True
 
-  logging.vlog(1, '%s is NOT whitelisted for graph', o)
+  logging.log(2, 'Not whitelisted: %s: default rule', o)
   return False
 
 
@@ -167,14 +183,14 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
-  logging.vlog(logging.DEBUG, 'Converting %s', o)
+  logging.log(1, 'Converting %s', o)
 
   if tf_inspect.isclass(o):
-    node, name, ns = class_to_graph(o, program_ctx)
+    nodes, name, ns = class_to_graph(o, program_ctx)
   elif tf_inspect.isfunction(o):
-    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    nodes, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    nodes, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   # TODO(mdan,yashkatariya): Remove when object conversion is implemented.
   elif hasattr(o, '__class__'):
     raise NotImplementedError(
@@ -184,7 +200,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
         'conversion. For example, instead of converting the method '
         'of a class, try converting the entire class instead. '
         'See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/'
-        'contrib/autograph/README.md#using-the-functional-api '
+        'python/autograph/README.md#using-the-functional-api '
         'for more information.')
   else:
     raise ValueError(
@@ -197,35 +213,22 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   template = '''
       entity.autograph_info__ = {}
   '''
-  node.extend(templates.replace(template, entity=name))
-
-  program_ctx.add_to_cache(o, node)
+  nodes.extend(templates.replace(template, entity=name))
 
-  if logging.get_verbosity() <= logging.DEBUG:
-    logging.vlog(logging.DEBUG, 'Compiled output of %s:\n\n%s\n', o,
-                 compiler.ast_to_source(node))
+  if logging.has_verbosity(2):
+    logging.log(2, 'Compiled output of %s:\n\n%s\n', o,
+                compiler.ast_to_source(nodes))
+  if logging.has_verbosity(4):
+    for n in nodes:
+      logging.log(4, 'Compiled AST of %s:\n\n%s\n\n', o,
+                  pretty_printer.fmt(n, color=False))
 
-  if program_ctx.options.recursive:
-    while True:
-      candidate = None
-      for obj in program_ctx.name_map.keys():
-        if obj not in program_ctx.dependency_cache:
-          candidate = obj
-          break
-      if candidate is None:
-        break
-      if (hasattr(candidate, 'im_class') and
-          getattr(candidate, 'im_class') not in program_ctx.partial_types):
-        # Class members are converted with their objects, unless they're
-        # only converted partially.
-        continue
-      entity_to_graph(candidate, program_ctx, {}, {})
-
-  return node, name, ns
+  return nodes, name, ns
 
 
 def class_to_graph(c, program_ctx):
   """Specialization of `entity_to_graph` for classes."""
+  # TODO(mdan): Revisit this altogether. Not sure we still need it.
   converted_members = {}
   method_filter = lambda m: tf_inspect.isfunction(m) or tf_inspect.ismethod(m)
   members = tf_inspect.getmembers(c, predicate=method_filter)
@@ -237,25 +240,22 @@ def class_to_graph(c, program_ctx):
     # Only convert the members that are directly defined by the class.
     if inspect_utils.getdefiningclass(m, c) is not c:
       continue
-    node, _, namespace = function_to_graph(
+    nodes, _, namespace = function_to_graph(
         m,
         program_ctx=program_ctx,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
-        owner_type=c)
+        do_rename=False)
     if class_namespace is None:
       class_namespace = namespace
     else:
       class_namespace.update(namespace)
-    converted_members[m] = node[0]
-  namer = program_ctx.new_namer(class_namespace)
-  class_name = namer.compiled_class_name(c.__name__, c)
+    converted_members[m] = nodes[0]
+  namer = naming.Namer(class_namespace)
+  class_name = namer.class_name(c.__name__)
 
-  # TODO(mdan): This needs to be explained more thoroughly.
   # Process any base classes: if the superclass if of a whitelisted type, an
-  # absolute import line is generated. Otherwise, it is marked for conversion
-  # (as a side effect of the call to namer.compiled_class_name() followed by
-  # program_ctx.update_name_map(namer)).
+  # absolute import line is generated.
   output_nodes = []
   renames = {}
   base_names = []
@@ -271,11 +271,12 @@ def class_to_graph(c, program_ctx):
               names=[gast.alias(name=base.__name__, asname=alias)],
               level=0))
     else:
-      # This will trigger a conversion into a class with this name.
-      alias = namer.compiled_class_name(base.__name__, base)
+      raise NotImplementedError(
+          'Conversion of classes that do not directly extend classes from'
+          ' whitelisted modules is temporarily suspended. If this breaks'
+          ' existing code please notify the AutoGraph team immediately.')
     base_names.append(alias)
     renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
-  program_ctx.update_name_map(namer)
 
   # Generate the definition of the converted class.
   bases = [gast.Name(n, gast.Load(), None) for n in base_names]
@@ -307,6 +308,7 @@ def _add_reserved_symbol(namespace, name, entity):
 ag_internal = None
 
 
+# TODO(mdan): Move into core or replace with an actual importable module.
 def _add_self_references(namespace, autograph_module):
   """Adds namespace references to the module that exposes the api itself."""
   global ag_internal
@@ -315,10 +317,12 @@ def _add_self_references(namespace, autograph_module):
     # internal modules.
     ag_internal = imp.new_module('autograph')
     ag_internal.__dict__.update(autograph_module.__dict__)
+    ag_internal.ConversionOptions = converter.ConversionOptions
+    ag_internal.Feature = converter.Feature
     ag_internal.utils = utils
     ag_internal.function_scope = function_wrapping.function_scope
     ag_internal.rewrite_graph_construction_error = (
-        errors.rewrite_graph_construction_error)
+        ag_errors.rewrite_graph_construction_error)
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
@@ -328,70 +332,59 @@ def _add_self_references(namespace, autograph_module):
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
 
 
-def function_to_graph(f,
-                      program_ctx,
-                      arg_values,
-                      arg_types,
-                      owner_type=None):
+def function_to_graph(f, program_ctx, arg_values, arg_types, do_rename=True):
   """Specialization of `entity_to_graph` for callable functions."""
 
-  node, source = parser.parse_entity(f)
-  node = node.body[0]
-
-  # In general, the output of inspect.getsource is inexact because it uses
-  # regex matching to adjust the exact location around the line number that
-  # CPython records. This is particularly problematic for lambda functions,
-  # where the entire containing lines are returned.
-  nodes = ast_util.find_matching_definitions(node, f)
-  if len(nodes) != 1:
-    if f.__name__ == '<lambda>':
+  node, source, _ = parser.parse_entity(f)
+  logging.log(3, 'Source code of %s:\n\n%s\n', f, source)
+
+  # In general, the output of inspect.getsource is inexact for lambdas because
+  # it uses regex matching to adjust the exact location around the line number
+  # that CPython records. Then, the entire containing line is returned, which
+  # we may have trouble disambiguating. For example:
+  # x, y = lambda: 1, lambda: 2
+  if f.__name__ == '<lambda>':
+    nodes = ast_util.find_matching_definitions(node, f)
+    if len(nodes) != 1:
       raise ValueError(
           'Unable to identify source code of lambda function {}. It was'
           ' defined on this line: {}, which must contain a single lambda with'
           ' matching signature. To avoid ambiguity, define each lambda'
           ' in a separate expression.'.format(f, source))
-    else:
-      raise ValueError(
-          'Unable to identify source code of function {}({}). The source code'
-          ' reported by Python did not include exactly one matching signature:'
-          '\n{}\n. This is an extremely rare occurrence. Please report it to'
-          ' the TensorFlow team.'.format(f, tf_inspect.getfullargspec(f),
-                                         source))
-  node, = nodes
+    node, = nodes
 
   # TODO(znado): Place inside standard_analysis.
   origin_info.resolve(node, source, f)
   namespace = inspect_utils.getnamespace(f)
   _add_self_references(namespace, program_ctx.autograph_module)
-  namer = program_ctx.new_namer(namespace)
+  namer = naming.Namer(namespace)
 
   entity_info = transformer.EntityInfo(
       source_code=source,
       source_file='<fragment>',
       namespace=namespace,
       arg_values=arg_values,
-      arg_types=arg_types,
-      owner_type=owner_type)
+      arg_types=arg_types)
   context = converter.EntityContext(namer, entity_info, program_ctx)
-  node = node_to_graph(node, context)
+  try:
+    node = node_to_graph(node, context)
+  except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
+    logging.error(1, 'Error converting %s', f, exc_info=True)
+    raise errors.InternalError('conversion', e)
+    # TODO(mdan): Catch and rethrow syntax errors.
 
   if isinstance(node, gast.Lambda):
     new_name = namer.new_symbol('tf__lambda', ())
     node = gast.Assign(
         targets=[gast.Name(new_name, gast.Store(), None)], value=node)
 
-  else:
+  elif do_rename:
     # TODO(mdan): This somewhat duplicates the renaming logic in call_trees.py
-    new_name, did_rename = namer.compiled_function_name(f.__name__, f,
-                                                        owner_type)
-    if did_rename:
-      node.name = new_name
-    else:
-      new_name = f.__name__
-      assert node.name == new_name
-
-  program_ctx.update_name_map(namer)
-  # TODO(mdan): Use this at compilation.
+    new_name = namer.function_name(f.__name__)
+    node.name = new_name
+  else:
+    new_name = f.__name__
+    assert node.name == new_name
 
   return [node], new_name, namespace
 
@@ -410,19 +403,18 @@ def node_to_graph(node, context):
             dependencies that this node has.
   """
   # TODO(mdan): Insert list_comprehensions somewhere.
+  unsupported_features_checker.verify(node)
 
   node = converter.standard_analysis(node, context, is_initial=True)
   # Past this point, line numbers are no longer accurate so we ignore the
   # source.
   # TODO(mdan): Is it feasible to reconstruct intermediate source code?
   context.info.source_code = None
-
-  if context.program.options.uses(converter.Feature.DECORATORS):
-    node = converter.apply_(node, context, decorators)
   node = converter.apply_(node, context, arg_defaults)
   node = converter.apply_(node, context, directives)
   node = converter.apply_(node, context, break_statements)
-  node = converter.apply_(node, context, asserts)
+  if context.program.options.uses(converter.Feature.ASSERT_STATEMENTS):
+    node = converter.apply_(node, context, asserts)
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
@@ -431,11 +423,13 @@ def node_to_graph(node, context):
   if context.program.options.uses(converter.Feature.LISTS):
     node = converter.apply_(node, context, lists)
     node = converter.apply_(node, context, slices)
-  node = converter.apply_(node, context, builtin_functions)
+  if context.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS):
+    node = converter.apply_(node, context, builtin_functions)
   node = converter.apply_(node, context, call_trees)
   node = converter.apply_(node, context, control_flow)
   node = converter.apply_(node, context, conditional_expressions)
-  node = converter.apply_(node, context, logical_expressions)
+  if context.program.options.uses(converter.Feature.LOGICAL_EXPRESSIONS):
+    node = converter.apply_(node, context, logical_expressions)
   if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS):
     node = converter.apply_(node, context, side_effect_guards)
   # TODO(mdan): If function scopes ever does more, the toggle will need moving.
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index 9a4fbdad8c1994d8c8cc534b6e0b4af45f5c4c80..7902fa697f6ac2fe46c7e67612dce698a7632d97 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -21,11 +21,10 @@ from __future__ import print_function
 import gast
 
 from tensorflow.python.autograph import utils
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
-from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.impl import conversion
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.keras.engine import training
 from tensorflow.python.platform import test
@@ -36,9 +35,7 @@ class ConversionTest(test.TestCase):
   def _simple_program_ctx(self):
     return converter.ProgramContext(
         options=converter.ConversionOptions(recursive=True),
-        partial_types=(),
-        autograph_module=api,
-        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
+        autograph_module=api)
 
   def test_is_whitelisted_for_graph(self):
 
@@ -89,15 +86,9 @@ class ConversionTest(test.TestCase):
       return g(a)
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(f, program_ctx, None, None)
-
-    self.assertTrue(f in program_ctx.dependency_cache)
-    self.assertTrue(g in program_ctx.dependency_cache)
-    f_node = program_ctx.dependency_cache[f][0]
-    g_node = program_ctx.dependency_cache[g][0]
+    nodes, _, _ = conversion.entity_to_graph(f, program_ctx, None, None)
+    f_node = nodes[0]
     self.assertEqual('tf__f', f_node.name)
-    self.assertEqual('tf__g', f_node.body[0].body[0].body[0].value.func.id)
-    self.assertEqual('tf__g', g_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
 
@@ -125,16 +116,8 @@ class ConversionTest(test.TestCase):
         return self.y
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
-
-    self.assertTrue(TestBase in program_ctx.dependency_cache)
-    self.assertTrue(TestSubclass in program_ctx.dependency_cache)
-    # The returned nodes will include:
-    # <import nodes>, <class node>, <assignment node>
-    self.assertEqual('TfTestBase',
-                     program_ctx.dependency_cache[TestBase][-2].name)
-    self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass][-2].name)
+    with self.assertRaisesRegex(NotImplementedError, 'classes.*whitelisted'):
+      conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
 
   def test_entity_to_graph_class_hierarchy_whitelisted(self):
 
@@ -148,16 +131,12 @@ class ConversionTest(test.TestCase):
         return 3 * x
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
+    nodes, name, _ = conversion.entity_to_graph(TestSubclass, program_ctx, None,
+                                                None)
+    class_node = nodes[-2]  # TODO(mdan): This is brittle.
 
-    self.assertTrue(TestSubclass in program_ctx.dependency_cache)
-    self.assertFalse(training.Model in program_ctx.dependency_cache)
-    self.assertEqual(
-        'Model', program_ctx.dependency_cache[TestSubclass][0].names[0].name)
-    # The returned nodes will include:
-    # <import nodes>, <class node>, <assignment node>
-    self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass][-2].name)
+    self.assertEqual(name, 'TfTestSubclass')
+    self.assertEqual(class_node.name, 'TfTestSubclass')
 
   def test_entity_to_graph_lambda(self):
     b = 2
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index 21a66c86b79e2116319bb240b138c6757484c6e0..f046d2d6a174fccb15996f944260b67ddeb80f1c 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -26,6 +26,7 @@ py_library(
         "logical.py",
         "py_builtins.py",
         "slices.py",
+        "special_values.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -59,6 +60,9 @@ py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_gpu",  # b/127001953
+    ],
     deps = [
         ":operators",
         "//tensorflow/python:client_testlib",
@@ -105,3 +109,13 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "special_values_test",
+    srcs = ["special_values_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 35f8028c295550443b98ca430d459967e03a6edf..5b3f45de056bf0354c3864aa51fd485fbc891624 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -71,3 +71,5 @@ from tensorflow.python.autograph.operators.py_builtins import range_
 from tensorflow.python.autograph.operators.slices import get_item
 from tensorflow.python.autograph.operators.slices import GetItemOpts
 from tensorflow.python.autograph.operators.slices import set_item
+from tensorflow.python.autograph.operators.special_values import is_undefined
+from tensorflow.python.autograph.operators.special_values import Undefined
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 035ea1bd9277a8dc66d9766cd00f5b8ccd6ad272..365afdd4b3b18dfc51fad784ff386530c277a6b2 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -19,7 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
+from tensorflow.python.autograph.operators import special_values
+from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -62,6 +65,17 @@ def for_stmt(iter_, extra_test, body, init_state):
   if tensor_util.is_tensor(iter_):
     return _known_len_for_stmt(iter_, extra_test, body, init_state)
   elif isinstance(iter_, dataset_ops.DatasetV2):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
+
     return _dataset_for_stmt(iter_, extra_test, body, init_state)
   else:
     return _py_for_stmt(iter_, extra_test, body, init_state)
@@ -71,13 +85,9 @@ def _py_for_stmt(iter_, extra_test, body, init_state):
   """Overload of for_stmt that executes a Python for loop."""
   state = init_state
   for target in iter_:
-    if not extra_test(*state):
+    if extra_test is not None and not extra_test(*state):
       break
     state = body(target, *state)
-
-  # TODO(mdan): Remove this special case.
-  if len(state) == 1:
-    return state[0]
   return state
 
 
@@ -88,13 +98,17 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
   def while_body(iterate_index, *state):
     iterate = iter_[iterate_index]
     new_state = body(iterate, *state)
+
+    state = (iterate_index + 1,)
     if new_state:
-      return (iterate_index + 1,) + new_state
-    else:
-      return iterate_index + 1
+      state += new_state
+
+    return state
 
   def while_cond(iterate_index, *state):
-    return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    if extra_test is not None:
+      return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    return iterate_index < n
 
   results = while_stmt(
       while_cond,
@@ -109,9 +123,6 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
     assert len(results) >= 1  # Has at least the iterate.
     if len(results) > 1:
       results = results[1:]
-    if len(results) == 1:
-      # TODO(mdan): Remove this special case.
-      results, = results
   else:
     results = ()
 
@@ -120,20 +131,26 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  if extra_test(*init_state) is not True:
+
+  if extra_test is not None:
     raise NotImplementedError(
-        'break statements are not yet supported in for/Dataset loops')
+        'break and return statements are not yet supported in '
+        'for/Dataset loops.')
 
   def reduce_body(state, iterate):
     new_state = body(iterate, *state)
     return new_state
 
-  results = ds.reduce(init_state, reduce_body)
+  if init_state:
+    return ds.reduce(init_state, reduce_body)
 
-  # TODO(mdan): Remove this special case.
-  if len(results) == 1:
-    return results[0]
-  return results
+  # Workaround for Datset.reduce not allowing empty state tensors - create
+  # a dummy state variable that remains unused.
+  def reduce_body_with_dummy_state(state, iterate):
+    reduce_body((), iterate)
+    return state
+  ds.reduce((constant_op.constant(0),), reduce_body_with_dummy_state)
+  return ()
 
 
 def while_stmt(test, body, init_state, extra_deps, opts=None):
@@ -161,18 +178,45 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
   # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
-  if any(tensor_util.is_tensor(v)
-         for v in nest.flatten(init_state + extra_deps)):
+  if any(
+      tensor_util.is_tensor(v) or isinstance(v, data_flow_ops.QueueBase)
+      for v in nest.flatten(extra_deps)):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
     return _tf_while_stmt(test, body, init_state, opts)
   else:
     return _py_while_stmt(test, body, init_state, opts)
 
 
+def _filter_undefined(all_symbols):
+  """Returns the names of undefined symbols contained in all_symbols."""
+  undefined_symbols = [
+      s.symbol_name
+      for s in all_symbols
+      if special_values.is_undefined(s)
+  ]
+  return undefined_symbols
+
+
 def _tf_while_stmt(test, body, init_state, opts):
   """Overload of while_stmt that stages a TF while_stmt."""
   if opts is None:
     opts = {}
-  return control_flow_ops.while_loop(test, body, init_state, **opts)
+
+  # Non-v2 while_loop unpacks the results when there is only one return value.
+  # This enforces consistency across versions.
+  opts['return_same_structure'] = True
+
+  retval = control_flow_ops.while_loop(test, body, init_state, **opts)
+  return retval
 
 
 def _py_while_stmt(test, body, init_state, opts):
@@ -184,7 +228,7 @@ def _py_while_stmt(test, body, init_state, opts):
   return state
 
 
-def if_stmt(cond, body, orelse):
+def if_stmt(cond, body, orelse, get_state, set_state):
   """Functional form of an if statement.
 
   Args:
@@ -193,19 +237,71 @@ def if_stmt(cond, body, orelse):
         as return type.
     orelse: Callable with no arguments, and outputs of the negative (else)
         branch as return type.
+    get_state: Function that returns a tuple containing the values of all
+        composite symbols modified within the conditional. This allows access to
+        state that branches may mutate through side effects. This function is
+        not needed and should not be called when dispatching to code matching
+        Python's default semantics. This is useful for checkpointing to avoid
+        unintended side-effects when staging requires evaluating all code-paths.
+    set_state: Function to set the values of all composite symbols modified
+        within the conditional. This is the complement to get_state, used to
+        restore checkpointed values. The single argument a tuple containing
+        values for each composite symbol that may be modified in a branch of the
+        conditional. The is usually the result of a call to get_state.
 
   Returns:
     Tuple containing the statement outputs.
   """
   if tensor_util.is_tensor(cond):
-    return tf_if_stmt(cond, body, orelse)
+    return tf_if_stmt(cond, body, orelse, get_state, set_state)
   else:
     return _py_if_stmt(cond, body, orelse)
 
 
-def tf_if_stmt(cond, body, orelse):
+def tf_if_stmt(cond, body, orelse, get_state, set_state):
   """Overload of if_stmt that stages a TF cond."""
-  return control_flow_ops.cond(cond, body, orelse)
+  checkpointed_body = _wrap_in_state_isolation(body, get_state, set_state)
+  checkpointed_orelse = _wrap_in_state_isolation(orelse, get_state,
+                                                 set_state)
+  protected_body = _wrap_in_protection_from_undefined(
+      checkpointed_body, branch_name='if')
+  protected_orelse = _wrap_in_protection_from_undefined(
+      checkpointed_orelse, branch_name='else')
+
+  return control_flow_ops.cond(cond, protected_body, protected_orelse)
+
+
+def _wrap_in_state_isolation(func, get_state, set_state):
+  """Wraps function to checkpoint the value of modified composites."""
+  def checkpoint_func():
+    init_values = get_state()
+    ret_values = func()
+    set_state(init_values)
+    return ret_values
+
+  return checkpoint_func
+
+
+def _wrap_in_protection_from_undefined(func, branch_name):
+  """Wraps function to raise useful error when it returns undefined symbols."""
+  def protected_func():
+    """Calls function and raises an error if undefined symbols are returned."""
+    results = func()
+    undefined_symbols = None
+    if isinstance(results, tuple):
+      undefined_symbols = _filter_undefined(results)
+    elif special_values.is_undefined(results):
+      # Single return value
+      undefined_symbols = results.symbol_name
+
+    if undefined_symbols:
+      message = ('The following symbols must also be initialized in the %s '
+                 'branch: {}. Alternatively, you may initialize them before '
+                 'the if statement.') % branch_name
+      message = message.format(undefined_symbols)
+      raise ValueError(message)
+    return results
+  return protected_func
 
 
 def _py_if_stmt(cond, body, orelse):
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index f9e006f7ad330aed3a130f2f1198f236aef15eea..e17d548a43ce01280851ddd392b23bd8a208fa40 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -23,7 +23,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -45,18 +45,28 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    self.assertEqual(10, s)
+    self.assertEqual((10,), s)
 
-  @test_util.run_deprecated_v1
   def test_dataset(self):
-    to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
     s = control_flow.for_stmt(
-        dataset_ops.Dataset.range(5).map(to_int32),
-        extra_test=lambda s: True,
+        dataset_ops.Dataset.range(5),
+        extra_test=None,
         body=lambda i, s: (s + i,),
-        init_state=(0,))
-    with self.cached_session():
-      self.assertEqual((10,), self.evaluate(s))
+        init_state=(constant_op.constant(0, dtype=dtypes.int64),))
+    self.assertEqual(self.evaluate(s), (10,))
+
+  @test_util.run_v2_only
+  def test_dataset_no_state(self):
+    v = variables.Variable(0, dtype=dtypes.int64)
+    def stateless_with_side_effects(i):
+      v.assign(v.read_value() + i)
+    s = control_flow.for_stmt(
+        dataset_ops.Dataset.range(5),
+        extra_test=None,
+        body=stateless_with_side_effects,
+        init_state=())
+    self.evaluate(s)
+    self.assertEqual(self.evaluate(v.read_value()), 10)
 
 
 class WhileLoopTest(test.TestCase):
@@ -65,30 +75,41 @@ class WhileLoopTest(test.TestCase):
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
-        test=lambda i, sum: i < n,
-        body=lambda i, sum: (i + 1, sum + i,),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.cached_session():
-      self.assertEqual((5, 10), self.evaluate(results))
+    self.assertEqual((5, 10), self.evaluate(results))
 
   @test_util.run_deprecated_v1
-  def test_tensor_dict_state(self):
+  def test_python_with_tensor_state(self):
     n = 5
-    init_state = {'i': constant_op.constant(0), 'sum': constant_op.constant(0)}
     results = control_flow.while_stmt(
-        test=lambda s: s['i'] < n,
-        body=lambda s: ({'i': s['i'] + 1, 'sum': s['sum'] + s['i']},),
-        init_state=(init_state,),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i),
+        init_state=(0, constant_op.constant(0)),
         extra_deps=())
-    with self.cached_session():
-      self.assertEqual(({'i': 5, 'sum': 10},), self.evaluate(results))
+    result_i, result_s = results
+    self.assertEqual(5, result_i)
+    self.assertEqual(10, self.evaluate(result_s))
+
+  @test_util.run_deprecated_v1
+  def test_python_due_to_hidden_cond_type(self):
+    n = 5
+
+    # TODO(b/124002646): Improve the error message.
+    with self.assertRaises(Exception):
+      control_flow.while_stmt(
+          test=lambda i, s: i < n,
+          body=lambda i, s: (i + 1, s + i),
+          init_state=(constant_op.constant(0), constant_op.constant(0)),
+          extra_deps=())
 
   def test_python(self):
     n = 5
     results = control_flow.while_stmt(
-        test=lambda i, sum: i < n,
-        body=lambda i, sum: (i + 1, sum + i),
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i),
         init_state=(0, 0),
         extra_deps=(n,))
     self.assertEqual((5, 10), results)
@@ -97,11 +118,20 @@ class WhileLoopTest(test.TestCase):
 class IfStmtTest(test.TestCase):
 
   def single_return_if_stmt(self, cond):
-    return control_flow.if_stmt(cond=cond, body=lambda: 1, orelse=lambda: -1)
+    return control_flow.if_stmt(
+        cond=cond,
+        body=lambda: 1,
+        orelse=lambda: -1,
+        get_state=lambda: (),
+        set_state=lambda _: None)
 
   def multi_return_if_stmt(self, cond):
     return control_flow.if_stmt(
-        cond=cond, body=lambda: (1, 2), orelse=lambda: (-1, -2))
+        cond=cond,
+        body=lambda: (1, 2),
+        orelse=lambda: (-1, -2),
+        get_state=lambda: (),
+        set_state=lambda _: None)
 
   @test_util.run_deprecated_v1
   def test_tensor(self):
diff --git a/tensorflow/python/autograph/operators/logical.py b/tensorflow/python/autograph/operators/logical.py
index dadb0daf1ae22016d0cff2889472423149258ffb..cafb0583e8f66841f0d905f5d98bfc3cb1780513 100644
--- a/tensorflow/python/autograph/operators/logical.py
+++ b/tensorflow/python/autograph/operators/logical.py
@@ -25,10 +25,25 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
 
 
+# Note: the implementations in this file are split into very small-grained
+# functions in preparation for the factoring out the more generic pyct library.
+# At that time, the py_* and tf_* functions will reside in different libraries.
+
+
 def not_(a):
   """Functional form of "not"."""
   if tensor_util.is_tensor(a):
-    return gen_math_ops.logical_not(a)
+    return _tf_not(a)
+  return _py_not(a)
+
+
+def _tf_not(a):
+  """Implementation of the "not_" operator for TensorFlow."""
+  return gen_math_ops.logical_not(a)
+
+
+def _py_not(a):
+  """Default Python implementation of the "not_" operator."""
   return not a
 
 
@@ -37,7 +52,7 @@ def and_(a, b):
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_and(a_val, b)
-  return a_val and b()
+  return _py_lazy_and(a_val, b)
 
 
 def _tf_lazy_and(cond, b):
@@ -46,12 +61,17 @@ def _tf_lazy_and(cond, b):
   return control_flow_ops.cond(cond, b, lambda: cond)
 
 
+def _py_lazy_and(cond, b):
+  """Lazy-eval equivalent of "and" in Python."""
+  return cond and b()
+
+
 def or_(a, b):
   """Functional form of "or". Uses lazy evaluation semantics."""
   a_val = a()
   if tensor_util.is_tensor(a_val):
     return _tf_lazy_or(a_val, b)
-  return a_val or b()
+  return _py_lazy_or(a_val, b)
 
 
 def _tf_lazy_or(cond, b):
@@ -60,11 +80,16 @@ def _tf_lazy_or(cond, b):
   return control_flow_ops.cond(cond, lambda: cond, b)
 
 
+def _py_lazy_or(cond, b):
+  """Lazy-eval equivalent of "or" in Python."""
+  return cond or b()
+
+
 def eq(a, b):
   """Functional form of "equal"."""
   if tensor_util.is_tensor(a) or tensor_util.is_tensor(b):
     return _tf_equal(a, b)
-  return a == b
+  return _py_equal(a, b)
 
 
 def _tf_equal(a, b):
@@ -72,25 +97,30 @@ def _tf_equal(a, b):
   return gen_math_ops.equal(a, b)
 
 
+def _py_equal(a, b):
+  """Overload of "equal" that falls back to Python's default implementation."""
+  return a == b
+
+
 def not_eq(a, b):
   """Functional form of "not-equal"."""
   return not_(eq(a, b))
 
 
-# Default implementation for the remainings.
+# Default implementation for the rest.
 
 is_ = operator.is_
 is_not = operator.is_not
 
 
 def in_(a, b):
-  """Functional form of "less-than"."""
+  """Functional form of "in"."""
   # TODO(mdan): in and not_in should probably be convertible for some types.
   return a in b
 
 
 def not_in(a, b):
-  """Functional form of "less-than"."""
+  """Functional form of "not-in"."""
   return a not in b
 
 gt = operator.gt
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index ddf05f73f37821c6ff7e246051cd82a560f370e3..fe9486ca1ed41ce55f2219b3771639eb081a6afe 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 
 
-UNDEFINED = object()
+UNSPECIFIED = object()
 
 
 def overload_of(f):
@@ -77,14 +77,14 @@ def _py_float(x):
   return float(x)
 
 
-def int_(x=0, base=UNDEFINED):
+def int_(x=0, base=UNSPECIFIED):
   if tensor_util.is_tensor(x):
     return _tf_int(x, base)
   return _py_int(x, base)
 
 
 def _tf_int(x, base):
-  if base not in (10, UNDEFINED):
+  if base not in (10, UNSPECIFIED):
     raise NotImplementedError('base {} not supported for int'.format(base))
 
   # TODO(mdan): We shouldn't assume int32.
@@ -94,7 +94,7 @@ def _tf_int(x, base):
 
 
 def _py_int(x, base):
-  if base is UNDEFINED:
+  if base is UNSPECIFIED:
     return int(x)
   return int(x, base)
 
@@ -155,19 +155,28 @@ def _py_len(s):
 
 
 def print_(*objects, **kwargs):
+  """Overload of the print builtin."""
   # Note: Python 2.6 doesn't support explicit keywords after starargs.
   unknown_kwargs = tuple(
       set(kwargs.keys()) - set(('sep', 'end', 'file', 'flush')))
   if unknown_kwargs:
     raise ValueError('invalid keyword arguments: {}'.format(unknown_kwargs))
 
-  # TODO(mdan): use logging_ops.Print when py_func is not supported.
-  return _tf_py_func_print(objects, kwargs)
+  # TODO(mdan): Use next.flatten(objects) instead?
+  if any(tensor_util.is_tensor(o) for o in objects):
+    # TODO(mdan): use tf.print instead.
+    return _tf_py_func_print(objects, kwargs)
+  else:
+    _py_print(*objects, **kwargs)
+
+
+def _py_print(*objects, **kwargs):
+  print(*objects, **kwargs)
 
 
 def _tf_py_func_print(objects, kwargs):
   """Overload of print_ as a py_func implementation."""
-  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNDEFINED}
+  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNSPECIFIED}
   if 'flush' not in override_kwargs:
     # Defaulting to flushing the console in graph mode, which helps reduce
     # garbled output in IPython.
@@ -187,7 +196,7 @@ def _tf_py_func_print(objects, kwargs):
       print_wrapper, None, objects, use_dummy_return=True)
 
 
-def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
+def range_(start_or_stop, stop=UNSPECIFIED, step=UNSPECIFIED):
   if any(tensor_util.is_tensor(s) for s in (start_or_stop, stop, step)):
     return _tf_range(start_or_stop, stop, step)
   return _py_range(start_or_stop, stop, step)
@@ -200,10 +209,10 @@ def _tf_range(start_or_stop, stop, step):
   # graph construction error aligns the semantics with Python.
 
   # TODO(mdan): We should optimize this when a full tensor is not required.
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     # TODO(mdan): Add argument coercion similar to other cases.
     return math_ops.range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     stop = math_ops.maximum(start_or_stop, stop)
     return math_ops.range(start_or_stop, stop)
   start_or_stop = math_ops.maximum(start_or_stop, 0)
@@ -211,9 +220,9 @@ def _tf_range(start_or_stop, stop, step):
 
 
 def _py_range(start_or_stop, stop, step):
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     return range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     return range(start_or_stop, stop)
   return range(start_or_stop)
 
diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/special_values.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c1b3d1f30b36c98b969e92bd2587ab62fbfc2a9
--- /dev/null
+++ b/tensorflow/python/autograph/operators/special_values.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used to capture Python idioms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class Undefined(object):
+  """Represents an undefined symbol in Python.
+
+  This is used to reify undefined symbols, which is required to use the
+  functional form of loops.
+  Example:
+
+    while n > 0:
+      n = n - 1
+      s = n
+    return s  # Runtime error if n == 0
+
+  This is valid Python code and will not result in an error as long as n
+  is positive. The use of this class is to stay as close to Python semantics
+  as possible for staged code of this nature.
+
+  Converted version of the above showing the possible usage of this class:
+
+    s = Undefined('s')
+    init_state = (s,)
+    s = while_loop(cond, body, init_state)
+    return s  # s is an instance of Undefined if the loop never runs
+
+  Attributes:
+    symbol_name: Text, identifier for the undefined symbol
+  """
+
+  def __init__(self, symbol_name):
+    self.symbol_name = symbol_name
+
+
+def is_undefined(value):
+  """Checks whether Autograph has determined that a given value is undefined.
+
+  This only works in places where Autograph reifies undefined symbols. Note that
+  if this function is passed a truly undefined symbol the call-site will raise
+  NameError.
+
+  Args:
+    value: value to test for undefinedness
+  Returns:
+    Boolean, whether the input value is undefined.
+  """
+  return isinstance(value, Undefined)
diff --git a/tensorflow/python/autograph/operators/special_values_test.py b/tensorflow/python/autograph/operators/special_values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1e087a9f3f586b646c9a73877d9bb4470c6f3e
--- /dev/null
+++ b/tensorflow/python/autograph/operators/special_values_test.py
@@ -0,0 +1,38 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for python_lang_utils module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.operators import special_values
+from tensorflow.python.platform import test
+
+
+class SpecialValuesTest(test.TestCase):
+
+  def test_undefined(self):
+    undefined_symbol = special_values.Undefined('name')
+    self.assertEqual(undefined_symbol.symbol_name, 'name')
+
+    undefined_symbol2 = special_values.Undefined('name')
+    self.assertNotEqual(undefined_symbol, undefined_symbol2)
+
+    self.assertTrue(special_values.is_undefined(undefined_symbol))
+    self.assertTrue(special_values.is_undefined(undefined_symbol2))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index ba8ec271394981ec878473205a8dbbd19d255f3b..67ea42aa051d9883163a9ffabe33184d5bc341f5 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -24,6 +24,7 @@ py_library(
         "ast_util.py",
         "cfg.py",
         "compiler.py",
+        "errors.py",
         "inspect_utils.py",
         "origin_info.py",
         "parser.py",
@@ -94,6 +95,7 @@ py_test(
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/autograph/pyct/testing:test_modules",
         "@gast_archive//:gast",
     ],
 )
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index f7723412abf2e3bf0c45305e1282b4a2d032112d..b091285cab6f2f643d7f99f3063a903c1e5efdb8 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -283,13 +283,18 @@ def parallel_walk(node, other):
     n = node_stack.pop()
     o = other_stack.pop()
 
-    if (not isinstance(n, (ast.AST, gast.AST)) or
-        not isinstance(o, (ast.AST, gast.AST)) or
+    if (not isinstance(n, (ast.AST, gast.AST, str)) or
+        not isinstance(o, (ast.AST, gast.AST, str)) or
         n.__class__.__name__ != o.__class__.__name__):
-      raise ValueError('inconsistent nodes: {} and {}'.format(n, o))
+      raise ValueError('inconsistent nodes: {} ({}) and {} ({})'.format(
+          n, n.__class__.__name__, o, o.__class__.__name__))
 
     yield n, o
 
+    if isinstance(n, str):
+      assert isinstance(o, str), 'The check above should have ensured this'
+      continue
+
     for f in n._fields:
       n_child = getattr(n, f, None)
       o_child = getattr(o, f, None)
@@ -315,8 +320,8 @@ def parallel_walk(node, other):
                 f, n_child, o_child))
 
 
-class FunctionDefMatcher(gast.NodeVisitor):
-  """Finds nodes that match a given function's signature."""
+class LambdaDefinitionMatcher(gast.NodeVisitor):
+  """Finds lambda nodes that match a given lambda's signature."""
 
   def __init__(self, fn):
     self.fn = fn
@@ -349,26 +354,6 @@ class FunctionDefMatcher(gast.NodeVisitor):
 
     return True
 
-  def _argspec_compatible(self, node):
-    arg_spec = tf_inspect.getfullargspec(self.fn)
-
-    node_args = tuple(self._arg_name(arg) for arg in node.args.args)
-    if len(node_args) != len(arg_spec.args) and node.args.vararg is None:
-      return False
-
-    if arg_spec.varargs is not None and node.args.vararg is None:
-      return False
-
-    if arg_spec.varkw is not None and node.args.kwarg is None:
-      return False
-
-    node_kwonlyargs = tuple(self._arg_name(arg) for arg in node.args.kwonlyargs)
-    if (len(node_kwonlyargs) != len(arg_spec.kwonlyargs) and
-        node.args.kwarg is None):
-      return False
-
-    return True
-
   def visit_Lambda(self, node):
     self.generic_visit(node)
 
@@ -379,27 +364,8 @@ class FunctionDefMatcher(gast.NodeVisitor):
 
     self.matching_nodes.append(node)
 
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-
-    if self.fn.__name__ != node.name:
-      return
-
-    # Decorators have the ability to modify a function's signature. They usually
-    # claim that the result is indistinguishable from the original function,
-    # but it's very difficult to fool this test. As a consequence, we relax the
-    # verification and just check that the arguments are compatible.
-    if node.decorator_list:
-      if not self._argspec_compatible(node):
-        return
-    else:
-      if not self._argspec_matches(node):
-        return
-
-    self.matching_nodes.append(node)
-
 
 def find_matching_definitions(node, f):
-  matcher = FunctionDefMatcher(f)
+  matcher = LambdaDefinitionMatcher(f)
   matcher.visit(node)
   return tuple(matcher.matching_nodes)
diff --git a/tensorflow/python/autograph/pyct/ast_util_test.py b/tensorflow/python/autograph/pyct/ast_util_test.py
index d4f1e1c7cdc21cc0aaea978c22081a33e9c6d2a0..c6c1132dd64351dde9274d8d3c408d4bc4988b03 100644
--- a/tensorflow/python/autograph/pyct/ast_util_test.py
+++ b/tensorflow/python/autograph/pyct/ast_util_test.py
@@ -159,11 +159,20 @@ class AstUtilTest(test.TestCase):
     })
 
   def test_parallel_walk(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
+    src = """
       def f(a):
         return a + 1
-    """))
+    """
+    node = parser.parse_str(textwrap.dedent(src))
+    for child_a, child_b in ast_util.parallel_walk(node, node):
+      self.assertEqual(child_a, child_b)
+
+  def test_parallel_walk_string_leaves(self):
+    src = """
+      def f(a):
+        global g
+    """
+    node = parser.parse_str(textwrap.dedent(src))
     for child_a, child_b in ast_util.parallel_walk(node, node):
       self.assertEqual(child_a, child_b)
 
@@ -230,99 +239,6 @@ class AstUtilTest(test.TestCase):
     nodes = ast_util.find_matching_definitions(node, f)
     self.assertLambdaNodes(nodes, ('(2)',))
 
-  def assertFunctionDefNodes(self, matching_nodes, expected_bodies):
-    self.assertEqual(len(matching_nodes), len(expected_bodies))
-    for node in matching_nodes:
-      self.assertIsInstance(node, gast.FunctionDef)
-      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
-
-  def test_find_matching_definitions_function(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x):
-        return 1
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_nested_functions_same_name(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x, *args, **kwargs):
-        def f(x, y):
-          return 1
-        return 2
-    """))
-
-    def f(x, y):
-      return x + y
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_nested_functions_same_args(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def g(x):
-        def f(x):
-          return 1
-        return 2
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_multiple_matches(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x):
-        return 1
-      def f(x):
-        return 2
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1', 'return 2'))
-
-  def test_find_matching_definitions_decorated_compatible(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      @sneaky_decorator
-      def f(x, *args, **kwargs):
-        return 1
-    """))
-
-    def f(a, b, c, d=1):
-      return a + b + c + d
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_decorated_incompatible(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      @sneaky_decorator
-      def f(x, y, z):
-        return 1
-    """))
-
-    def f(a, b, c, d, *args):
-      del args
-      return a + b + c + d
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ())
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py
index fdfcd4dcc15b0c6238dcdc3fedef60f2984c33a4..0cedfa84ab33cc3d0931c3998214b017a6907cfc 100644
--- a/tensorflow/python/autograph/pyct/cfg.py
+++ b/tensorflow/python/autograph/pyct/cfg.py
@@ -393,6 +393,8 @@ class GraphBuilder(object):
   def _connect_jump_to_finally_sections(self, node):
     """Connects a jump node to the finally sections protecting it."""
     cursor = set((node,))
+    if node not in self.finally_sections:
+      return cursor
     for guard_section_id in self.finally_sections[node]:
       guard_begin, guard_ends = self.finally_section_subgraphs[guard_section_id]
       self._connect_nodes(cursor, guard_begin)
@@ -620,10 +622,10 @@ class AstToCfg(gast.NodeVisitor):
     leaving_node = self.lexical_scopes.pop()
     assert node == leaving_node
 
-  def _get_enclosing_scopes(self, include, stop_at):
+  def _get_enclosing_finally_scopes(self, stop_at):
     included = []
     for node in reversed(self.lexical_scopes):
-      if isinstance(node, include):
+      if isinstance(node, gast.Try) and node.finalbody:
         included.append(node)
       if isinstance(node, stop_at):
         return node, included
@@ -635,10 +637,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_exit_statement(self, node, *exits_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(exits_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(exits_nodes_of_type))
     if try_node is None:
       raise ValueError(
           '%s that is not enclosed by any of %s' % (node, exits_nodes_of_type))
@@ -646,10 +646,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_continue_statement(self, node, *loops_to_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(loops_to_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(loops_to_nodes_of_type))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any of %s' %
                        (node, loops_to_nodes_of_type))
@@ -698,10 +696,7 @@ class AstToCfg(gast.NodeVisitor):
     self._process_basic_statement(node)
 
   def visit_Raise(self, node):
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=(gast.FunctionDef,),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes((gast.FunctionDef,))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any FunctionDef' % node)
     self.builder.add_error_node(node, guards)
@@ -797,16 +792,13 @@ class AstToCfg(gast.NodeVisitor):
     for stmt in node.orelse:
       self.visit(stmt)
 
-    if node.handlers:
-      # TODO(mdan): Should we still support bare try/except? Might be confusing.
-      raise NotImplementedError('exceptions are not yet supported')
-
     self._exit_lexical_scope(node)
 
-    self.builder.enter_finally_section(node)
-    for stmt in node.finalbody:
-      self.visit(stmt)
-    self.builder.exit_finally_section(node)
+    if node.finalbody:
+      self.builder.enter_finally_section(node)
+      for stmt in node.finalbody:
+        self.visit(stmt)
+      self.builder.exit_finally_section(node)
 
   def visit_With(self, node):
     # TODO(mdan): Mark the context manager's exit call as exit guard.
diff --git a/tensorflow/python/autograph/pyct/cfg_test.py b/tensorflow/python/autograph/pyct/cfg_test.py
index d5870124bcec1989af27949b70e490a7a0899461..8fb66ca7a76d97767cc90b565b2766b6a536f511 100644
--- a/tensorflow/python/autograph/pyct/cfg_test.py
+++ b/tensorflow/python/autograph/pyct/cfg_test.py
@@ -40,7 +40,7 @@ class CountingVisitor(cfg.GraphVisitor):
 class GraphVisitorTest(test.TestCase):
 
   def _build_cfg(self, fn):
-    node, _ = parser.parse_entity(fn)
+    node, _, _ = parser.parse_entity(fn)
     cfgs = cfg.build(node)
     return cfgs, node
 
@@ -57,15 +57,14 @@ class GraphVisitorTest(test.TestCase):
     graph, = graphs.values()
     visitor = CountingVisitor(graph)
     visitor.visit_forward()
-    fn_node = node.body[0]
 
-    self.assertEqual(visitor.counts[fn_node.args], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].test], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].body[0]], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].body[1]], 1)
+    self.assertEqual(visitor.counts[node.args], 1)
+    self.assertEqual(visitor.counts[node.body[0].test], 1)
+    self.assertEqual(visitor.counts[node.body[0].body[0]], 1)
+    self.assertEqual(visitor.counts[node.body[0].body[1]], 1)
     # The return node should be unreachable in forward direction.
-    self.assertTrue(fn_node.body[0].body[2] not in visitor.counts)
-    self.assertEqual(visitor.counts[fn_node.body[1]], 1)
+    self.assertNotIn(node.body[0].body[2], visitor.counts)
+    self.assertEqual(visitor.counts[node.body[1]], 1)
 
   def test_basic_coverage_reverse(self):
 
@@ -80,20 +79,19 @@ class GraphVisitorTest(test.TestCase):
     graph, = graphs.values()
     visitor = CountingVisitor(graph)
     visitor.visit_reverse()
-    fn_node = node.body[0]
 
-    self.assertEqual(visitor.counts[fn_node.args], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].test], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].body[0]], 1)
-    self.assertEqual(visitor.counts[fn_node.body[0].body[1]], 1)
-    self.assertTrue(visitor.counts[fn_node.body[0].body[2]], 1)
-    self.assertEqual(visitor.counts[fn_node.body[1]], 1)
+    self.assertEqual(visitor.counts[node.args], 1)
+    self.assertEqual(visitor.counts[node.body[0].test], 1)
+    self.assertEqual(visitor.counts[node.body[0].body[0]], 1)
+    self.assertEqual(visitor.counts[node.body[0].body[1]], 1)
+    self.assertTrue(visitor.counts[node.body[0].body[2]], 1)
+    self.assertEqual(visitor.counts[node.body[1]], 1)
 
 
 class AstToCfgTest(test.TestCase):
 
   def _build_cfg(self, fn):
-    node, _ = parser.parse_entity(fn)
+    node, _, _ = parser.parse_entity(fn)
     cfgs = cfg.build(node)
     return cfgs
 
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf.py b/tensorflow/python/autograph/pyct/common_transformers/anf.py
index 192621b1cd329acec56c9517f3c885ee622b62e9..246c26833f0c30c757526209b710ef6df90eebf0 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf.py
@@ -36,10 +36,10 @@ from tensorflow.python.autograph.pyct import transformer
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,19 +68,19 @@ class AnfTransformer(transformer.Base):
   # processing the `body` and the `orelse` need to be kept together with them,
   # and not accidentally lifted out of the `if`.
 
-  def __init__(self, entity_info, gensym_source=None):
+  def __init__(self, ctx, gensym_source=None):
     """Creates an ANF transformer.
 
     Args:
-      entity_info: transformer.EntityInfo
+      ctx: transformer.Context
       gensym_source: An optional object with the same interface as `DummyGensym`
         for generating unique names
     """
-    super(AnfTransformer, self).__init__(entity_info)
+    super(AnfTransformer, self).__init__(ctx)
     if gensym_source is None:
-      self._gensym = DummyGensym(entity_info)
+      self._gensym = DummyGensym(ctx)
     else:
-      self._gensym = gensym_source(entity_info)
+      self._gensym = gensym_source(ctx)
     self._pending_statements = []
 
   def _consume_pending_statements(self):
@@ -406,7 +406,7 @@ class AnfTransformer(transformer.Base):
     return node
 
 
-def transform(node, entity_info, gensym_source=None):
+def transform(node, ctx, gensym_source=None):
   """Converts the given node to A-normal form (ANF).
 
   The general idea of A-normal form: https://en.wikipedia.org/wiki/A-normal_form
@@ -416,9 +416,9 @@ def transform(node, entity_info, gensym_source=None):
 
   Args:
     node: The node to transform.
-    entity_info: transformer.EntityInfo.  TODO(mdan): What information does this
+    ctx: transformer.EntityInfo.  TODO(mdan): What information does this
       argument provide?
     gensym_source: An optional object with the same interface as `DummyGensym`
       for generating unique names.
   """
-  return AnfTransformer(entity_info, gensym_source=gensym_source).visit(node)
+  return AnfTransformer(ctx, gensym_source=gensym_source).visit(node)
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
index 525d4886dee37c79d4087a293fa9ce5424a74c15..d7750604778e1a20b61a49856d1de9eb62306329 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
@@ -30,10 +30,10 @@ from tensorflow.python.platform import test
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,21 +68,21 @@ def exec_expected_result():
 
 class AnfTransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
+    return transformer.Context(entity_info)
 
   def test_basic(self):
     def test_function():
       a = 0
       return a
-    node, _ = parser.parse_entity(test_function)
-    node = anf.transform(node.body[0], self._simple_source_info())
+    node, _, _ = parser.parse_entity(test_function)
+    node = anf.transform(node, self._simple_context())
     result, _ = compiler.ast_to_object(node)
     self.assertEqual(test_function(), result.test_function())
 
@@ -97,19 +97,19 @@ class AnfTransformerTest(test.TestCase):
     # Testing the code bodies only.  Wrapping them in functions so the
     # syntax highlights nicely, but Python doesn't try to execute the
     # statements.
-    exp_node, _ = parser.parse_entity(expected_fn)
-    node, _ = parser.parse_entity(test_fn)
+    exp_node, _, _ = parser.parse_entity(expected_fn)
+    node, _, _ = parser.parse_entity(test_fn)
     node = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
-    exp_name = exp_node.body[0].name
+        node, self._simple_context(), gensym_source=DummyGensym)
+    exp_name = exp_node.name
     # Ignoring the function names in the result because they can't be
     # the same (because both functions have to exist in the same scope
     # at the same time).
-    node.body[0].name = exp_name
+    node.name = exp_name
     self.assert_same_ast(exp_node, node)
     # Check that ANF is idempotent
     node_repeated = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
+        node, self._simple_context(), gensym_source=DummyGensym)
     self.assert_same_ast(node_repeated, node)
 
   def test_binop_basic(self):
diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py
index 420f3bb22388801c54f27e8bf1701febb90ad34a..76e86d0d632194f0bf6586a53314e298fea39a8e 100644
--- a/tensorflow/python/autograph/pyct/compiler.py
+++ b/tensorflow/python/autograph/pyct/compiler.py
@@ -46,8 +46,8 @@ def ast_to_source(node, indentation='  '):
   """
   if not isinstance(node, (list, tuple)):
     node = (node,)
-  generator = astor.codegen.SourceGenerator(indentation, False,
-                                            astor.string_repr.pretty_string)
+  generator = astor.code_gen.SourceGenerator(indentation, False,
+                                             astor.string_repr.pretty_string)
 
   for n in node:
     if isinstance(n, gast.AST):
@@ -77,6 +77,17 @@ def ast_to_source(node, indentation='  '):
   return code
 
 
+def _source_to_module(source, delete_on_exit):
+  with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+    module_name = os.path.basename(f.name[:-3])
+    f.write(source)
+
+  # TODO(mdan): Try flush() and delete=False instead.
+  if delete_on_exit:
+    atexit.register(lambda: os.remove(f.name))
+  return imp.load_source(module_name, f.name), f.name
+
+
 def ast_to_object(nodes,
                   indentation='  ',
                   include_source_map=False,
@@ -98,8 +109,7 @@ def ast_to_object(nodes,
         compilation on exit.
 
   Returns:
-    compiled_nodes: A module object containing the compiled source code.
-    source: The source code of the compiled object
+    (module, source): A compiled module, and the source code of the module.
   Raises:
     ValueError: If ag_source_map__ is already in the namespace of the compiled
     nodes.
@@ -112,33 +122,25 @@ def ast_to_object(nodes,
   if source_prefix:
     source = source_prefix + '\n' + source
 
-  with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
-    module_name = os.path.basename(f.name[:-3])
-    f.write(source)
+  module, filename = _source_to_module(source, delete_on_exit)
 
+  if include_source_map:
     if isinstance(nodes, (list, tuple)):
       indices = range(-len(nodes), 0)
     else:
       indices = (-1,)
 
-    if include_source_map:
-      source_map = origin_info.create_source_map(nodes, source, f.name, indices)
+    source_map = origin_info.create_source_map(nodes, source, filename, indices)
 
-  # TODO(mdan): Try flush() and delete=False instead.
-  if delete_on_exit:
-    atexit.register(lambda: os.remove(f.name))
-  compiled_nodes = imp.load_source(module_name, f.name)
-
-  # TODO(znado): Clean this up so we don't need to attach it to the namespace.
-  # We cannot get the rewritten function name until it is too late so templating
-  # is hard, and this cleanly fixes the issues encountered with nested functions
-  # because this is attached to the outermost one.
-  if include_source_map:
+    # TODO(znado): Clean this up so we don't need to attach it to the namespace.
+    # We cannot get the rewritten function name until it is too late so
+    # templating is hard, and this cleanly fixes the issues encountered with
+    # nested functions because this is attached to the outermost one.
     # TODO(mdan): This name should be decided by the caller.
     source_map_name = 'ag_source_map__'
-    assert source_map_name not in compiled_nodes.__dict__, (
+    assert source_map_name not in module.__dict__, (
         'cannot convert %s because is has namespace attribute "%s", which is '
-        'reserved for AutoGraph.') % (compiled_nodes, source_map_name)
-    compiled_nodes.__dict__[source_map_name] = source_map
+        'reserved for AutoGraph.') % (module, source_map_name)
+    module.__dict__[source_map_name] = source_map
 
-  return compiled_nodes, source
+  return module, source
diff --git a/tensorflow/python/autograph/pyct/compiler_test.py b/tensorflow/python/autograph/pyct/compiler_test.py
index 6fa289d3cc34a391296060d8edd0a21e4d80561b..29e8a198fe6bd29203d98a671c77980d7e1a169e 100644
--- a/tensorflow/python/autograph/pyct/compiler_test.py
+++ b/tensorflow/python/autograph/pyct/compiler_test.py
@@ -39,11 +39,12 @@ class CompilerTest(test.TestCase):
         b = x + 1
       return b
 
+    _, _, all_nodes = parser.parse_entity(test_fn)
+
     self.assertEqual(
         textwrap.dedent(tf_inspect.getsource(test_fn)),
         tf_inspect.getsource(
-            compiler.ast_to_object(
-                parser.parse_entity(test_fn)[0].body[0])[0].test_fn))
+            compiler.ast_to_object(all_nodes)[0].test_fn))
 
   def test_ast_to_source(self):
     node = gast.If(
diff --git a/tensorflow/python/autograph/pyct/errors.py b/tensorflow/python/autograph/pyct/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f2049c40bb5fb72f7dd8d3191bc3163abdcf309
--- /dev/null
+++ b/tensorflow/python/autograph/pyct/errors.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code transformation exceptions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.utils import ag_logging
+
+
+class AutoGraphError(Exception):
+  pass
+
+
+class InternalError(AutoGraphError):
+  """Raised when AutoGraph finds an unexpected error."""
+
+  def __init__(self, message, original_exc):
+    super(InternalError, self).__init__()
+    self.message = message
+    self.original_exc = original_exc
+
+  def __str__(self):
+    return '{} during {}: {}'.format(
+        type(self.original_exc).__name__, self.message, self.original_exc)
+
+
+def report_internal_error(entity, exception):
+  ag_logging.log(1, 'Error transforming %s', entity, exc_info=True)
+  # TODO(znado): Add external bug reporting instructions.
+  raise AutoGraphError(
+      'Unexpected error transforming %s. If you believe this is due to a bug,'
+      ' please set the verbosity to 10 (on Linux, `export '
+      'AUTOGRAPH_VERBOSITY=10`) and attach the full output when filing the bug '
+      'report. Caused by: %s' % (entity, exception))
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 6d9bc43d34652f2fd67b74faf4bff77afad54119..f2b780d7fcde7d56e1465e80fc3fc82ae0eb090c 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -31,7 +31,7 @@ from tensorflow.python.util import tf_inspect
 
 # These functions test negative for isinstance(*, types.BuiltinFunctionType)
 # and inspect.isbuiltin, and are generally not visible in globals().
-# TODO(mdan): Find a more generic way to test this - just enumerate __builtin__?
+# TODO(mdan): Remove this.
 SPECIAL_BUILTINS = {
     'dict': dict,
     'enumerate': enumerate,
@@ -42,6 +42,7 @@ SPECIAL_BUILTINS = {
     'print': print,
     'range': range,
     'tuple': tuple,
+    'type': type,
     'zip': zip
 }
 
@@ -73,7 +74,7 @@ def isnamedtuple(f):
 
 def isbuiltin(f):
   """Returns True if the argument is a built-in function."""
-  if f in SPECIAL_BUILTINS.values():
+  if f in six.moves.builtins.__dict__.values():
     return True
   if isinstance(f, types.BuiltinFunctionType):
     return True
@@ -125,6 +126,10 @@ def getqualifiedname(namespace, object_, max_depth=5, visited=None):
   if visited is None:
     visited = set()
 
+  # Copy the dict to avoid "changed size error" during concurrent invocations.
+  # TODO(mdan): This is on the hot path. Can we avoid the copy?
+  namespace = dict(namespace)
+
   for name in namespace:
     # The value may be referenced by more than one symbol, case in which
     # any symbol will be fine. If the program contains symbol aliases that
@@ -186,9 +191,12 @@ def getdefiningclass(m, owner_class):
   return owner_class
 
 
-def isweakrefself(m):
-  """Tests whether an object is a "weakref self" wrapper, see getmethodself."""
-  return hasattr(m, '__self__') and hasattr(m.__self__, 'ag_self_weakref__')
+def istfmethodtarget(m):
+  """Tests whether an object is a `function.TfMethodTarget`."""
+  # See eager.function.TfMethodTarget for more details.
+  return (hasattr(m, '__self__') and
+          hasattr(m.__self__, 'weakrefself_target__') and
+          hasattr(m.__self__, 'weakrefself_func__'))
 
 
 def getmethodself(m):
@@ -201,8 +209,8 @@ def getmethodself(m):
   # A fallback allowing methods to be actually bound to a type different
   # than __self__. This is useful when a strong reference from the method
   # to the object is not desired, for example when caching is involved.
-  if isweakrefself(m):
-    return m.__self__.ag_self_weakref__()
+  if istfmethodtarget(m):
+    return m.__self__.target
 
   return m.__self__
 
@@ -281,6 +289,21 @@ def getmethodclass(m):
   return None
 
 
+def getfutureimports(entity):
+  """Detects what future imports are necessary to safely execute entity source.
+
+  Args:
+    entity: Any object
+
+  Returns:
+    A tuple of future strings
+  """
+  if not tf_inspect.isfunction(entity):
+    return tuple()
+  return tuple(sorted(name for name, value in entity.__globals__.items()
+                      if getattr(value, '__module__', None) == '__future__'))
+
+
 class SuperWrapperForDynamicAttrs(object):
   """A wrapper that supports dynamic attribute lookup on the super object.
 
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index 4c4c0977b0fef2fdfee69d2e7c608ad1a412aa21..75b41d226f68f5a64a1eba6fbad9e7a256e62ff0 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -28,6 +28,8 @@ import six
 
 from tensorflow.python import lib
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.pyct.testing import future_import_module
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
@@ -358,15 +360,13 @@ class InspectUtilsTest(test.TestCase):
   def test_getmethodclass_weakref_mechanism(self):
     test_obj = TestClass()
 
-    class WeakrefWrapper(object):
-
-      def __init__(self):
-        self.ag_self_weakref__ = weakref.ref(test_obj)
-
     def test_fn(self):
       return self
 
-    bound_method = types.MethodType(test_fn, WeakrefWrapper())
+    bound_method = types.MethodType(
+        test_fn,
+        function.TfMethodTarget(
+            weakref.ref(test_obj), test_obj.member_function))
     self.assertEqual(inspect_utils.getmethodclass(bound_method), TestClass)
 
   def test_getmethodclass_no_bool_conversion(self):
@@ -415,6 +415,12 @@ class InspectUtilsTest(test.TestCase):
     self.assertTrue(inspect_utils.isbuiltin(zip))
     self.assertFalse(inspect_utils.isbuiltin(function_decorator))
 
+  def test_getfutureimports_simple_case(self):
+    expected_imports = ('absolute_import', 'division', 'print_function',
+                        'with_statement')
+    self.assertEqual(inspect_utils.getfutureimports(future_import_module.f),
+                     expected_imports)
+
   def test_super_wrapper_for_dynamic_attrs(self):
 
     a = object()
diff --git a/tensorflow/python/autograph/pyct/origin_info.py b/tensorflow/python/autograph/pyct/origin_info.py
index 102bd42c91ca8189355fe39d014521151c0a6377..82e3ed69ddb3b012f3e414b684a2c311a49c0485 100644
--- a/tensorflow/python/autograph/pyct/origin_info.py
+++ b/tensorflow/python/autograph/pyct/origin_info.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import difflib
+import os
 import tokenize
 
 import gast
@@ -26,6 +28,8 @@ import six
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import pretty_printer
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -48,6 +52,7 @@ class Location(
     filename: Text
     lineno: int, 1-based
     col_offset: int
+    line_loc: LineLocation
   """
 
   @property
@@ -73,6 +78,13 @@ class OriginInfo(
     return (self.loc.filename, self.loc.lineno, self.function_name,
             self.source_code_line)
 
+  def __repr__(self):
+    if self.loc.filename:
+      return '{}:{}:{}'.format(
+          os.path.split(self.loc.filename)[1], self.loc.lineno,
+          self.loc.col_offset)
+    return '<no file>:{}:{}'.format(self.loc.lineno, self.loc.col_offset)
+
 
 # TODO(mdan): This source map should be a class - easier to refer to.
 def create_source_map(nodes, code, filename, indices_in_code):
@@ -88,66 +100,79 @@ def create_source_map(nodes, code, filename, indices_in_code):
         which the corresponding of node should appear.
 
   Returns:
-    Dict[CodeLocation, OriginInfo], mapping locations in code to locations
+    Dict[LineLocation, OriginInfo], mapping locations in code to locations
     indicated by origin annotations in node.
   """
   reparsed_nodes = parser.parse_str(code)
   reparsed_nodes = [reparsed_nodes.body[i] for i in indices_in_code]
+  for node in reparsed_nodes:
+    resolve(node, code)
 
-  resolve(reparsed_nodes, code)
   result = {}
 
-  for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
-    # Note: generated code might not be mapped back to its origin.
-    # TODO(mdan): Generated code should always be mapped to something.
-    origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
-    final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
-    if origin_info is None or final_info is None:
-      continue
+  try:
+    for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
+      # Note: generated code might not be mapped back to its origin.
+      # TODO(mdan): Generated code should always be mapped to something.
+      origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
+      final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
+      if origin_info is None or final_info is None:
+        continue
 
-    line_loc = LineLocation(filename, final_info.loc.lineno)
+      line_loc = LineLocation(filename, final_info.loc.lineno)
 
-    existing_origin = result.get(line_loc)
-    if existing_origin is not None:
-      # Overlaps may exist because of child nodes, but almost never to
-      # different line locations. Exception make decorated functions, where
-      # both lines are mapped to the same line in the AST.
+      existing_origin = result.get(line_loc)
+      if existing_origin is not None:
+        # Overlaps may exist because of child nodes, but almost never to
+        # different line locations. Exception make decorated functions, where
+        # both lines are mapped to the same line in the AST.
 
-      # Line overlaps: keep bottom node.
-      if existing_origin.loc.line_loc == origin_info.loc.line_loc:
-        if existing_origin.loc.lineno >= origin_info.loc.lineno:
-          continue
+        # Line overlaps: keep bottom node.
+        if existing_origin.loc.line_loc == origin_info.loc.line_loc:
+          if existing_origin.loc.lineno >= origin_info.loc.lineno:
+            continue
 
-      # In case of overlaps, keep the leftmost node.
-      if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
-        continue
+        # In case of overlaps, keep the leftmost node.
+        if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
+          continue
 
-    result[line_loc] = origin_info
+      result[line_loc] = origin_info
+  except ValueError:
+    if logging.has_verbosity(3):
+      for n, rn in zip(nodes, reparsed_nodes):
+        nodes_str = pretty_printer.fmt(n, color=False, noanno=True)
+        reparsed_nodes_str = pretty_printer.fmt(rn, color=False, noanno=True)
+        diff = difflib.context_diff(
+            nodes_str.split('\n'),
+            reparsed_nodes_str.split('\n'),
+            fromfile='Original nodes',
+            tofile='Reparsed nodes',
+            n=7)
+        diff = '\n'.join(diff)
+        logging.log(3, 'AST seems to lack integrity. Diff:\n%s', diff)
+    raise
 
   return result
 
 
 # TODO(znado): Consider refactoring this into a Visitor.
 # TODO(mdan): Does this work correctly with inner functions?
-def resolve(nodes, source, function=None):
-  """Adds an origin information to all nodes inside the body of function.
+def resolve(node, source, function=None):
+  """Adds an origin information to node and its subnodes.
 
-  Args:
-    nodes: Union[ast.AST, Iterable[ast.AST, ...]]
-    source: Text, the source code string for the function whose body nodes will
-      be annotated.
-    function: Callable, the function that will have all nodes inside of it
-      annotation with an OriginInfo annotation with key anno.Basic.ORIGIN.  If
-      it is None then only the line numbers and column offset will be set in the
-      annotation, with the rest of the information being None.
+  This allows us to map the original source code line numbers to generated
+  source code.
 
-  Returns:
-    A tuple of the AST node for function and a String containing its source
-    code.
+  Args:
+    node: gast.AST node. Should be a gast.FunctionDef. This is the node we
+        annotate with origin information.
+    source: Text, the source code. Should satisfy relationship
+        `node in iter_tree(gast.parse(source))`; otherwise the lineno will be
+        unreliable.
+    function: The original function. If it is None then only the line numbers
+        and column offset will be set in the annotation, with the rest of the
+        information being None.
   """
-  if not isinstance(nodes, (list, tuple)):
-    nodes = (nodes,)
-
   if function:
     _, function_lineno = tf_inspect.getsourcelines(function)
     function_filepath = tf_inspect.getsourcefile(function)
@@ -165,22 +190,21 @@ def resolve(nodes, source, function=None):
       comment_map[srow] = tok_string.strip()[1:].strip()
 
   source_lines = source.split('\n')
-  for node in nodes:
-    for n in gast.walk(node):
-      if not hasattr(n, 'lineno'):
-        continue
+  for n in gast.walk(node):
+    if not hasattr(n, 'lineno'):
+      continue
 
-      lineno_in_body = n.lineno
+    within_body_offset = n.lineno - node.lineno
 
-      source_code_line = source_lines[lineno_in_body - 1]
-      if function:
-        source_lineno = function_lineno + lineno_in_body
-        function_name = function.__name__
-      else:
-        source_lineno = lineno_in_body
-        function_name = None
+    source_code_line = source_lines[n.lineno - 1]
+    if function:
+      source_lineno = function_lineno + within_body_offset
+      function_name = function.__name__
+    else:
+      source_lineno = n.lineno
+      function_name = None
 
-      location = Location(function_filepath, source_lineno, n.col_offset)
-      origin = OriginInfo(location, function_name,
-                          source_code_line, comment_map.get(source_lineno))
-      anno.setanno(n, anno.Basic.ORIGIN, origin)
+    location = Location(function_filepath, source_lineno, n.col_offset)
+    origin = OriginInfo(location, function_name,
+                        source_code_line, comment_map.get(source_lineno))
+    anno.setanno(n, anno.Basic.ORIGIN, origin)
diff --git a/tensorflow/python/autograph/pyct/origin_info_test.py b/tensorflow/python/autograph/pyct/origin_info_test.py
index 3b1d5f2040e691bcb8ff47f3a16d3bcbc6936704..a3dc2f827168acf2a1372c866489e18f3cde6a53 100644
--- a/tensorflow/python/autograph/pyct/origin_info_test.py
+++ b/tensorflow/python/autograph/pyct/origin_info_test.py
@@ -32,18 +32,17 @@ class OriginInfoTest(test.TestCase):
     def test_fn(x):
       return x + 1
 
-    node, _ = parser.parse_entity(test_fn)
+    node, _, _ = parser.parse_entity(test_fn)
     fake_origin = origin_info.OriginInfo(
         loc=origin_info.Location('fake_filename', 3, 7),
         function_name='fake_function_name',
         source_code_line='fake source line',
         comment=None)
-    fn_node = node.body[0]
-    anno.setanno(fn_node.body[0], anno.Basic.ORIGIN, fake_origin)
-    converted_code = compiler.ast_to_source(fn_node)
+    anno.setanno(node.body[0], anno.Basic.ORIGIN, fake_origin)
+    converted_code = compiler.ast_to_source(node)
 
     source_map = origin_info.create_source_map(
-        fn_node, converted_code, 'test_filename', [0])
+        node, converted_code, 'test_filename', [0])
 
     loc = origin_info.LineLocation('test_filename', 2)
     self.assertIn(loc, source_map)
@@ -54,12 +53,11 @@ class OriginInfoTest(test.TestCase):
     def test_fn(x):
       return x + 1
 
-    node, _ = parser.parse_entity(test_fn)
-    fn_node = node.body[0]
-    converted_code = compiler.ast_to_source(fn_node)
+    node, _, _ = parser.parse_entity(test_fn)
+    converted_code = compiler.ast_to_source(node)
 
     source_map = origin_info.create_source_map(
-        fn_node, converted_code, 'test_filename', [0])
+        node, converted_code, 'test_filename', [0])
 
     self.assertEqual(len(source_map), 0)
 
@@ -69,29 +67,57 @@ class OriginInfoTest(test.TestCase):
       """Docstring."""
       return x  # comment
 
-    node, source = parser.parse_entity(test_fn)
-    fn_node = node.body[0]
+    node, source, _ = parser.parse_entity(test_fn)
 
-    origin_info.resolve(fn_node, source)
+    origin_info.resolve(node, source)
 
-    origin = anno.getanno(fn_node, anno.Basic.ORIGIN)
+    origin = anno.getanno(node, anno.Basic.ORIGIN)
     self.assertEqual(origin.loc.lineno, 1)
     self.assertEqual(origin.loc.col_offset, 0)
     self.assertEqual(origin.source_code_line, 'def test_fn(x):')
     self.assertIsNone(origin.comment)
 
-    origin = anno.getanno(fn_node.body[0], anno.Basic.ORIGIN)
+    origin = anno.getanno(node.body[0], anno.Basic.ORIGIN)
     self.assertEqual(origin.loc.lineno, 2)
     self.assertEqual(origin.loc.col_offset, 2)
     self.assertEqual(origin.source_code_line, '  """Docstring."""')
     self.assertIsNone(origin.comment)
 
-    origin = anno.getanno(fn_node.body[1], anno.Basic.ORIGIN)
+    origin = anno.getanno(node.body[1], anno.Basic.ORIGIN)
     self.assertEqual(origin.loc.lineno, 3)
     self.assertEqual(origin.loc.col_offset, 2)
     self.assertEqual(origin.source_code_line, '  return x  # comment')
     self.assertEqual(origin.comment, 'comment')
 
+  def disabled_test_resolve_with_future_imports(self):
+
+    def test_fn(x):
+      """Docstring."""
+      print(x)
+      return x  # comment
+
+    node, source, _ = parser.parse_entity(test_fn)
+
+    origin_info.resolve(node, source)
+
+    origin = anno.getanno(node, anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 2)
+    self.assertEqual(origin.loc.col_offset, 0)
+    self.assertEqual(origin.source_code_line, 'def test_fn(x):')
+    self.assertIsNone(origin.comment)
+
+    origin = anno.getanno(node.body[0], anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 3)
+    self.assertEqual(origin.loc.col_offset, 2)
+    self.assertEqual(origin.source_code_line, '  """Docstring."""')
+    self.assertIsNone(origin.comment)
+
+    origin = anno.getanno(node.body[2], anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 5)
+    self.assertEqual(origin.loc.col_offset, 2)
+    self.assertEqual(origin.source_code_line, '  return x  # comment')
+    self.assertEqual(origin.comment, 'comment')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index d04a40157e7ef59c887b2e3af0870ab087fd93d0..d6f517418094eaab0d0b8f3dc016517292bc3942 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -21,7 +21,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import textwrap
+import threading
 
 import gast
 import six
@@ -29,11 +31,34 @@ import six
 from tensorflow.python.util import tf_inspect
 
 
+_parse_lock = threading.Lock()  # Prevents linecache concurrency errors.
+
+
 def parse_entity(entity):
-  """Returns the AST of given entity."""
-  source = tf_inspect.getsource(entity)
+  """Returns the AST and source code of given entity.
+
+  Args:
+    entity: A python function/method/class
 
-  def fail(comment):
+  Returns:
+    gast.AST, str, gast.ModuleNode: a tuple of the AST node corresponding
+    exactly to the entity; the string that was parsed to generate the AST; and
+    the containing module AST node, which might contain extras like future
+    import nodes.
+  """
+  try:
+    with _parse_lock:
+      source = tf_inspect.getsource_no_unwrap(entity)
+  except (IOError, OSError) as e:
+    raise ValueError(
+        'Unable to locate the source code of {}. Note that functions defined'
+        ' in certain environments, like the interactive Python shell do not'
+        ' expose their source code. If that is the case, you should to define'
+        ' them in a .py source file. If you are certain the code is'
+        ' graph-compatible, wrap the call using'
+        ' @tf.autograph.do_not_convert. Original error: {}'.format(entity, e))
+
+  def raise_parse_failure(comment):
     raise ValueError(
         'Failed to parse source code of {}, which Python reported as:\n{}\n'
         '{}'.format(entity, source, comment))
@@ -44,13 +69,16 @@ def parse_entity(entity):
   source = textwrap.dedent(source)
 
   try:
-    return parse_str(source), source
+    module_node = parse_str(source)
+    assert len(module_node.body) == 1
+    return module_node.body[0], source, module_node
 
   except IndentationError:
     # The text below lists the causes of this error known to us. There may
     # be more.
-    fail('This may be caused by multiline strings or comments not indented at'
-         'the same level as the code.')
+    raise_parse_failure(
+        'This may be caused by multiline strings or comments not indented at'
+        ' the same level as the code.')
 
   except SyntaxError as e:
     if not tf_inspect.isfunction(entity) or entity.__name__ != '<lambda>':
@@ -71,8 +99,9 @@ def parse_entity(entity):
 
     # Give up if there's nothing we can chip away.
     if len(lines) == lineno and len(lines[-1]) == offset:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement.')
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement.')
 
     # Drop all lines following the error location
     # TODO(mdan): What's with the pylint errors?
@@ -82,18 +111,20 @@ def parse_entity(entity):
     new_source = '\n'.join(lines)
 
     try:
-      return parse_str(new_source), new_source
+      module_node = parse_str(new_source)
+      return module_node.body[0], new_source, module_node
     except SyntaxError as e:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement. Tried to strip down the'
-           ' source to:\n{}\nBut that did not work.'.format(new_source))
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement. Tried to strip down the'
+          ' source to:\n{}\nBut that did not work.'.format(new_source))
 
 
 def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
 
-  if six.PY2 and '.print(' in src:
+  if six.PY2 and re.search('\\Wprint\\s*\\(', src):
     # This special treatment is required because gast.parse is not aware of
     # whether print_function was present in the original context.
     src = 'from __future__ import print_function\n' + src
diff --git a/tensorflow/python/autograph/pyct/parser_test.py b/tensorflow/python/autograph/pyct/parser_test.py
index d3a7b7a014646601339a79e6cf97461853bccbb2..ee3e2808259e7b746b1c936318115aa5e2259a10 100644
--- a/tensorflow/python/autograph/pyct/parser_test.py
+++ b/tensorflow/python/autograph/pyct/parser_test.py
@@ -31,8 +31,8 @@ class ParserTest(test.TestCase):
     def f(x):
       return x + 1
 
-    mod, _ = parser.parse_entity(f)
-    self.assertEqual('f', mod.body[0].name)
+    node, _, _ = parser.parse_entity(f)
+    self.assertEqual('f', node.name)
 
   def test_parse_str(self):
     mod = parser.parse_str(
@@ -42,6 +42,24 @@ class ParserTest(test.TestCase):
     """))
     self.assertEqual('f', mod.body[0].name)
 
+  def test_parse_str_print(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+            def f(x):
+              print(x)
+              return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
+  def test_parse_str_weird_print(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+            def f(x):
+              print (x)
+              return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
   def test_parse_comments(self):
     def f():
 # unindented comment
diff --git a/tensorflow/python/autograph/pyct/pretty_printer.py b/tensorflow/python/autograph/pyct/pretty_printer.py
index bacc1e4a7774ec5b84495255042392fe089150d5..9a4f509ec36bd5077a8dca439a06bdc3132870df 100644
--- a/tensorflow/python/autograph/pyct/pretty_printer.py
+++ b/tensorflow/python/autograph/pyct/pretty_printer.py
@@ -18,17 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+
 import gast
+import six
 import termcolor
 
 
 class PrettyPrinter(gast.NodeVisitor):
   """Print AST nodes."""
 
-  def __init__(self, color):
+  def __init__(self, color, noanno):
     self.indent_lvl = 0
     self.result = ''
     self.color = color
+    self.noanno = noanno
 
   def _color(self, string, color, attrs=None):
     if self.color:
@@ -55,6 +58,15 @@ class PrettyPrinter(gast.NodeVisitor):
     self.result += '\n'
 
   def generic_visit(self, node, name=None):
+    # In very rare instances, a list can contain something other than a Node.
+    # e.g. Global contains a list of strings.
+    if isinstance(node, str):
+      if name:
+        self._print('%s%s="%s"' % (self._indent(), name, node))
+      else:
+        self._print('%s"%s"' % (self._indent(), node))
+      return
+
     if node._fields:
       cont = ':'
     else:
@@ -68,6 +80,8 @@ class PrettyPrinter(gast.NodeVisitor):
 
     self.indent_lvl += 1
     for f in node._fields:
+      if self.noanno and f.startswith('__'):
+        continue
       if not hasattr(node, f):
         self._print('%s%s' % (self._indent(), self._warning('%s=<unset>' % f)))
         continue
@@ -94,17 +108,20 @@ class PrettyPrinter(gast.NodeVisitor):
           self._print('%s%s=()' % (self._indent(), self._field(f)))
       elif isinstance(v, gast.AST):
         self.generic_visit(v, f)
-      elif isinstance(v, str):
+      elif isinstance(v, six.binary_type):
+        self._print('%s%s=%s' % (self._indent(), self._field(f),
+                                 self._value('b"%s"' % v)))
+      elif isinstance(v, six.text_type):
         self._print('%s%s=%s' % (self._indent(), self._field(f),
-                                 self._value('"%s"' % v)))
+                                 self._value('u"%s"' % v)))
       else:
         self._print('%s%s=%s' % (self._indent(), self._field(f),
                                  self._value(v)))
     self.indent_lvl -= 1
 
 
-def fmt(node, color=True):
-  printer = PrettyPrinter(color)
+def fmt(node, color=True, noanno=False):
+  printer = PrettyPrinter(color, noanno)
   if isinstance(node, (list, tuple)):
     for n in node:
       printer.visit(n)
diff --git a/tensorflow/python/autograph/pyct/pretty_printer_test.py b/tensorflow/python/autograph/pyct/pretty_printer_test.py
index 1c76744547f5842736e02dae8284161f8825f449..26d70f2e6006feb75c117c0034e4c5d570f76c2a 100644
--- a/tensorflow/python/autograph/pyct/pretty_printer_test.py
+++ b/tensorflow/python/autograph/pyct/pretty_printer_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
+import textwrap
 
 from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.platform import test
@@ -26,6 +27,14 @@ from tensorflow.python.platform import test
 
 class PrettyPrinterTest(test.TestCase):
 
+  def test_unicode_bytes(self):
+    source = textwrap.dedent('''
+    def f():
+      return b'b', u'u', 'depends_py2_py3'
+    ''')
+    node = ast.parse(source)
+    self.assertIsNotNone(pretty_printer.fmt(node))
+
   def test_format(self):
     node = ast.FunctionDef(
         name='f',
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity.py b/tensorflow/python/autograph/pyct/static_analysis/activity.py
index 4359e0a2682f0f6818a0c2e0aaffeaa12718c514..dd3d1d5d1365c6a5aa5f1a7f16a485d40c3da6a1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity.py
@@ -25,6 +25,7 @@ import copy
 import weakref
 
 import gast
+import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import qual_names
@@ -149,6 +150,14 @@ class _Lambda(object):
     self.args = set()
 
 
+class _Comprehension(object):
+
+  no_root = True
+
+  def __init__(self):
+    self.targets = set()
+
+
 class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information.
 
@@ -199,12 +208,27 @@ class ActivityAnalyzer(transformer.Base):
       if qn.owner_set & set(l.args):
         return
 
+    # When inside a comprehension, ignore any of the comprehensions's targets.
+    # This includes attributes or slices of those arguments.
+    # This is not true in Python2, which leaks symbols.
+    if six.PY3:
+      for l in self.state[_Comprehension]:
+        if qn in l.targets:
+          return
+        if qn.owner_set & set(l.targets):
+          return
+
     if isinstance(node.ctx, gast.Store):
-      self.scope.mark_modified(qn)
-      if qn.is_composite and composite_writes_alter_parent:
-        self.scope.mark_modified(qn.parent)
-      if self._in_aug_assign:
-        self.scope.mark_read(qn)
+      # In comprehensions, modified symbols are the comprehension targets.
+      if six.PY3 and self.state[_Comprehension].level > 0:
+        # Like a lambda's args, they are tracked separately in Python3.
+        self.state[_Comprehension].targets.add(qn)
+      else:
+        self.scope.mark_modified(qn)
+        if qn.is_composite and composite_writes_alter_parent:
+          self.scope.mark_modified(qn.parent)
+        if self._in_aug_assign:
+          self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Load):
       self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
@@ -241,10 +265,10 @@ class ActivityAnalyzer(transformer.Base):
     self._exit_scope()
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
   def visit_Expr(self, node):
@@ -338,12 +362,41 @@ class ActivityAnalyzer(transformer.Base):
     self.state[_Lambda].exit()
     return node
 
+  def _process_iterable_comprehension(self, node):
+    # This handles ListComp, SetComp, GeneratorExp.
+    self.state[_Comprehension].enter()
+    # Note: it's important to visit the generators first to properly account
+    # for the variables local to these generators. Example: `x` is local to the
+    # expression `x for x in y`.
+    node.generators = self.visit_block(node.generators)
+    node.elt = self.visit(node.elt)
+    self.state[_Comprehension].exit()
+    return node
+
+  def visit_DictComp(self, node):
+    # Identical to _process_iterable_comprehension, different node names.
+    self.state[_Comprehension].enter()
+    node.generators = self.visit_block(node.generators)
+    node.key = self.visit(node.key)
+    node.value = self.visit(node.value)
+    self.state[_Comprehension].exit()
+    return node
+
+  def visit_ListComp(self, node):
+    return self._process_iterable_comprehension(node)
+
+  def visit_SetComp(self, node):
+    return self._process_iterable_comprehension(node)
+
+  def visit_GeneratorExp(self, node):
+    return self._process_iterable_comprehension(node)
+
   def visit_arguments(self, node):
     return self._process_statement(node)
 
   def visit_FunctionDef(self, node):
     # The FunctionDef node itself has a Scope object that tracks the creation
-    # of its name, along with the usage of any decorator accompany it.
+    # of its name, along with the usage of any decorator accompanying it.
     self._enter_scope(False)
     node.decorator_list = self.visit_block(node.decorator_list)
     self.scope.mark_modified(qual_names.QN(node.name))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
index 997d9a8aff111dfb0c223840da642ce8b2f138ce..ef3390e03fa1a76d0033827b9fedfb075eab78e7 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
@@ -112,16 +112,16 @@ class ScopeTest(test.TestCase):
 class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
-    node, source = parser.parse_entity(test_fn)
+    node, source, _ = parser.parse_entity(test_fn)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     return node, entity_info
 
   def assertSymbolSetsAre(self, expected, actual, name):
@@ -149,7 +149,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return c
 
     node, _ = self._parse_and_analyze(test_fn)
-    print_node = node.body[0].body[2]
+    print_node = node.body[2]
     if isinstance(print_node, gast.Print):
       # Python 2
       print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
@@ -172,7 +172,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return c
 
     node, _ = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[2].value
+    call_node = node.body[2].value
     # We basically need to detect which variables are captured by the call
     # arguments.
     self.assertScopeIs(
@@ -189,7 +189,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return a.d
 
     node, _ = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[1].value
+    call_node = node.body[1].value
     self.assertScopeIs(
         anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), ())
 
@@ -205,7 +205,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return a[c]
 
     node, _ = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[2].value
+    call_node = node.body[2].value
     self.assertScopeIs(
         anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
         ('a', 'a[0]', 'a[b]', 'b'), ())
@@ -220,7 +220,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return b, c
 
     node, _ = self._parse_and_analyze(test_fn)
-    while_node = node.body[0].body[1]
+    while_node = node.body[1]
     self.assertScopeIs(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'))
     self.assertScopeIs(
@@ -239,7 +239,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return b, c
 
     node, _ = self._parse_and_analyze(test_fn)
-    for_node = node.body[0].body[1]
+    for_node = node.body[1]
     self.assertScopeIs(
         anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'))
     self.assertScopeIs(
@@ -260,7 +260,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return z, u
 
     node, _ = self._parse_and_analyze(test_fn)
-    if_node = node.body[0].body[0]
+    if_node = node.body[0]
     self.assertScopeIs(
         anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'))
     self.assertScopeIs(
@@ -285,7 +285,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return d
 
     node, _ = self._parse_and_analyze(test_fn)
-    if_node = node.body[0].body[0]
+    if_node = node.body[0]
     self.assertScopeIs(
         anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a', 'a.c'), ('a.b', 'd'))
     self.assertScopeIs(
@@ -307,7 +307,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return d
 
     node, _ = self._parse_and_analyze(test_fn)
-    if_node = node.body[0].body[0]
+    if_node = node.body[0]
     self.assertScopeIs(
         anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a', 'b', 'c', 'a[c]'),
         ('a[b]', 'd'))
@@ -329,7 +329,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return a
 
     node, _ = self._parse_and_analyze(test_fn)
-    inner_if_node = node.body[0].body[0].body[0]
+    inner_if_node = node.body[0].body[0]
     self.assertScopeIs(
         anno.getanno(inner_if_node, NodeAnno.BODY_SCOPE), ('b',), ('a',))
     self.assertScopeIs(
@@ -350,7 +350,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return b, c
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_def_node = node.body[0].body[0]
+    fn_def_node = node.body[0]
 
     self.assertScopeIs(
         anno.getanno(fn_def_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',))
@@ -364,7 +364,7 @@ class ActivityAnalyzerTest(test.TestCase):
         self.b.c = 1
 
     node, _ = self._parse_and_analyze(TestClass)
-    init_node = node.body[0].body[0]
+    init_node = node.body[0]
     self.assertScopeIs(
         anno.getanno(init_node, NodeAnno.BODY_SCOPE), ('self', 'a', 'self.b'),
         ('self', 'self.b', 'self.b.c'))
@@ -375,7 +375,7 @@ class ActivityAnalyzerTest(test.TestCase):
       a[0] += 1
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     self.assertScopeIs(
         anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('a', 'a[0]'), ('a[0]',))
 
@@ -385,7 +385,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return c
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     self.assertScopeIs(anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('c',), ())
 
   def test_aug_assign(self):
@@ -394,7 +394,7 @@ class ActivityAnalyzerTest(test.TestCase):
       a += b
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     self.assertScopeIs(
         anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('a', 'b'), ('a'))
 
@@ -409,7 +409,7 @@ class ActivityAnalyzerTest(test.TestCase):
       foo()['bar'] += x
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     self.assertScopeIs(
         anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('foo', 'x'), ())
 
@@ -419,7 +419,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return b
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
     self.assertScopeIs(body_scope, ('b',), ())
     self.assertScopeIs(body_scope.parent, ('b',), ('a', 'b'))
@@ -433,7 +433,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return lambda: a + b
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
     self.assertScopeIs(body_scope, ('a', 'b'), ())
     # Nothing local to the lambda is tracked.
@@ -445,7 +445,7 @@ class ActivityAnalyzerTest(test.TestCase):
       return lambda a: a + b
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
     self.assertScopeIs(body_scope, ('b',), ())
     self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
@@ -456,7 +456,7 @@ class ActivityAnalyzerTest(test.TestCase):
       a = (lambda a, b, c: a + b + c)(d, 1, 2) + b
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
     self.assertScopeIs(body_scope, ('b', 'd'), ('a',))
     self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
@@ -467,7 +467,7 @@ class ActivityAnalyzerTest(test.TestCase):
       a = lambda a, b: d(lambda b: a + b + c)  # pylint: disable=undefined-variable
 
     node, _ = self._parse_and_analyze(test_fn)
-    fn_node = node.body[0]
+    fn_node = node
     body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
     self.assertScopeIs(body_scope, ('c', 'd'), ('a',))
     self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values.py b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
index e8e3d229bea4bb505d58cdae24de87377b1b50e6..eca4571d38977905cc51387e47ee9a7d763f6703 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
@@ -39,7 +39,8 @@ class LiveValueResolver(transformer.Base):
 
   def visit_ClassDef(self, node):
     self.generic_visit(node)
-    anno.setanno(node, 'live_val', self.entity_info.namespace[node.name])
+    anno.setanno(
+        node, 'live_val', self.ctx.info.namespace[node.name])
     return node
 
   def visit_Name(self, node):
@@ -53,8 +54,8 @@ class LiveValueResolver(transformer.Base):
       if not is_defined:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
-        elif node.id in self.entity_info.namespace:
-          obj = self.entity_info.namespace[node.id]
+        elif node.id in self.ctx.info.namespace:
+          obj = self.ctx.info.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
           if hasattr(obj, '__name__'):
             anno.setanno(node, 'fqn', (obj.__name__,))
@@ -86,8 +87,8 @@ class LiveValueResolver(transformer.Base):
         def_, = defs
         # Note: param_of is a weakref.
         if def_.param_of and def_.param_of() is self.enclosing_entities[0]:
-          if node.id in self.entity_info.arg_values:
-            obj = self.entity_info.arg_values[node.id]
+          if node.id in self.ctx.info.arg_values:
+            obj = self.ctx.info.arg_values[node.id]
             anno.setanno(node, 'live_val', obj)
             anno.setanno(node, 'fqn', (obj.__class__.__name__,))
     return node
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
index 882c380b7888250560e0bf69ca44c3e7f4264979..14bb3682e3b1f12364abf4dd40fe8fdcaaa6c41c 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
@@ -41,22 +41,22 @@ class LiveValuesResolverTest(test.TestCase):
                          literals=None,
                          arg_types=None):
     literals = literals or {}
-    node, source = parser.parse_entity(test_fn)
+    node, source, _ = parser.parse_entity(test_fn)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=None)
+        arg_types=arg_types)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, literals)
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, literals)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
     return node
 
   def test_literals(self):
@@ -67,7 +67,7 @@ class LiveValuesResolverTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn, {}, literals={'a': 'bar'})
-    retval_node = node.body[0].body[0].value
+    retval_node = node.body[0].value
     self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
 
   def test_primitive_values(self):
@@ -78,7 +78,7 @@ class LiveValuesResolverTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn, {'a': True})
-    retval_node = node.body[0].body[0].value
+    retval_node = node.body[0].value
     if six.PY2:
       self.assertEqual(
           anno.getanno(retval_node, 'fqn'), ('__builtin__', 'bool'))
@@ -94,7 +94,7 @@ class LiveValuesResolverTest(test.TestCase):
       return foo()
 
     node = self._parse_and_analyze(test_fn, {'foo': foo})
-    func_node = node.body[0].body[0].value.func
+    func_node = node.body[0].value.func
     self.assertEquals(foo, anno.getanno(func_node, 'live_val'))
     self.assertEquals(('foo',), anno.getanno(func_node, 'fqn'))
 
@@ -104,7 +104,7 @@ class LiveValuesResolverTest(test.TestCase):
       return constant_op.constant(0)
 
     node = self._parse_and_analyze(test_fn, {'constant_op': constant_op})
-    func_node = node.body[0].body[0].value.func
+    func_node = node.body[0].value.func
     self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val'))
     self.assertEquals((constant_op.__name__, 'constant'),
                       anno.getanno(func_node, 'fqn'))
@@ -122,7 +122,7 @@ class LiveValuesResolverTest(test.TestCase):
     node = self._parse_and_analyze(
         TestClass.test_fn, {'constant_op': constant_op},
         arg_types={'self': (TestClass.__name__, TestClass)})
-    func_node = node.body[0].body[0].value.func
+    func_node = node.body[0].value.func
     self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val'))
     self.assertEquals(TestClass, anno.getanno(func_node, 'parent_type'))
     self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn'))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5..ad567a0a4fc97e246461274f33fa403634638ed8 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -144,10 +144,10 @@ class WholeTreeAnalyzer(transformer.Base):
     self.current_analyzer = parent_analyzer
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
 
@@ -219,6 +219,10 @@ class Annotator(transformer.Base):
                  frozenset(self.current_analyzer.out[cfg_node]))
     return node
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
 
 def resolve(node, source_info, graphs):
   """Resolves the live symbols at the exit of control flow statements.
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
index 4366808d4962394b98cb3d939abed9666899a6d3..c32abb9efd1771fb966645b6d2f85762beef8777 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import parser
@@ -31,18 +33,18 @@ from tensorflow.python.platform import test
 class LivenessTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
-    node, source = parser.parse_entity(test_fn)
+    node, source, _ = parser.parse_entity(test_fn)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    liveness.resolve(node, entity_info, graphs)
+    liveness.resolve(node, ctx, graphs)
     return node
 
   def assertHasLiveOut(self, node, expected):
@@ -73,7 +75,7 @@ class LivenessTest(test.TestCase):
       return x
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], ('a', 'x'))
     self.assertHasLiveOut(fn_body[1], 'x')
@@ -90,7 +92,7 @@ class LivenessTest(test.TestCase):
       return x
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], 'a')
     self.assertHasLiveOut(fn_body[1], 'x')
@@ -103,7 +105,7 @@ class LivenessTest(test.TestCase):
       return x
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], 'x')
 
@@ -115,7 +117,7 @@ class LivenessTest(test.TestCase):
       return x.y
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], ('x.y', 'x'))
 
@@ -131,7 +133,7 @@ class LivenessTest(test.TestCase):
       foo()
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], 'a')
 
@@ -149,7 +151,7 @@ class LivenessTest(test.TestCase):
       child()
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], 'max')
 
@@ -163,7 +165,7 @@ class LivenessTest(test.TestCase):
           y = 0
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveOut(fn_body[0], ())
 
@@ -177,7 +179,7 @@ class LivenessTest(test.TestCase):
       return x
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveIn(fn_body[0], ('a', 'b', 'c', 'x'))
     self.assertHasLiveIn(fn_body[1], ('c', 'x'))
@@ -194,7 +196,7 @@ class LivenessTest(test.TestCase):
       return x
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveIn(fn_body[0], ('a', 'b', 'c', 'd'))
     self.assertHasLiveIn(fn_body[1], ('d', 'x'))
@@ -209,7 +211,7 @@ class LivenessTest(test.TestCase):
       return y, z
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveIn(fn_body[0], ('a', 'y', 'z'))
 
@@ -224,7 +226,7 @@ class LivenessTest(test.TestCase):
       return y, z
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveIn(fn_body[0], ('a', 'y', 'z'))
 
@@ -238,10 +240,66 @@ class LivenessTest(test.TestCase):
           y = 0
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasLiveIn(fn_body[0], ('a', 'x', 'y'))
 
+  def test_live_in_generator_comprehension(self):
+
+    def test_fn(y):
+      if all(x for x in y):
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('all', 'x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('all', 'y'))
+
+  def test_live_in_list_comprehension(self):
+
+    def test_fn(y):
+      if [x for x in y]:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
+  def test_live_in_set_comprehension(self):
+
+    def test_fn(y):
+      if {x for x in y}:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
+  def test_live_in_dict_comprehension(self):
+
+    def test_fn(y):
+      if {k: v for k, v in y}:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('k', 'v', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
index d1587d81780780f56ab0ec1fb0dbb9942a3d4539..ce6f3c528477713bb3ac04af00baffb9a1b7a145 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
@@ -217,12 +217,16 @@ class TreeAnnotator(transformer.Base):
 
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
   def visit_Name(self, node):
     if self.current_analyzer is None:
       # Names may appear outside function defs - for example in class
@@ -232,7 +236,8 @@ class TreeAnnotator(transformer.Base):
     analyzer = self.current_analyzer
     cfg_node = self.current_cfg_node
 
-    assert cfg_node is not None, 'name node outside of any statement?'
+    assert cfg_node is not None, ('name node, %s, outside of any statement?'
+                                  % node.id)
 
     qn = anno.getanno(node, anno.Basic.QN)
     if isinstance(node.ctx, gast.Load):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
index 8c0d51850770e90c6755951e4ca5b01bb0987c51..3359886f50db892c908be66f87f612e7e3c30bd8 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import parser
@@ -31,18 +33,18 @@ from tensorflow.python.platform import test
 class DefinitionInfoTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
-    node, source = parser.parse_entity(test_fn)
+    node, source, _ = parser.parse_entity(test_fn)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
     return node
 
@@ -84,7 +86,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasDefs(fn_body[0].targets[0], 1)
     self.assertHasDefs(fn_body[1].test, 1)
@@ -103,7 +105,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasDefs(fn_body[0].value.args[0], 1)
     self.assertHasDefs(fn_body[1].body[0].targets[0], 1)
@@ -126,7 +128,7 @@ class DefinitionInfoTest(test.TestCase):
       return x, y
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasDefs(fn_body[0].targets[0], 1)
     self.assertHasDefs(fn_body[1].test, 2)
@@ -151,7 +153,7 @@ class DefinitionInfoTest(test.TestCase):
       return x, y
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasDefs(fn_body[0].targets[0], 1)
     self.assertHasDefs(fn_body[1].target, 1)
@@ -176,7 +178,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
     def_of_a_in_if = fn_body[1].body[0].targets[0]
 
     self.assertHasDefs(fn_body[0].targets[0], 1)
@@ -200,7 +202,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     parent_return = fn_body[3]
     child_return = fn_body[1].body[1]
@@ -217,7 +219,7 @@ class DefinitionInfoTest(test.TestCase):
         return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     self.assertHasDefs(fn_body[0].items[0].context_expr.func, 0)
     self.assertHasDefs(fn_body[0].items[0].context_expr.args[0], 1)
@@ -230,7 +232,7 @@ class DefinitionInfoTest(test.TestCase):
       return l
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     creation = fn_body[0].targets[0]
     mutation = fn_body[1].targets[0].value
@@ -249,7 +251,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     first_def = fn_body[0].targets[0]
     second_def = fn_body[1].orelse[0].targets[0]
@@ -268,7 +270,7 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
     use = fn_body[2].value
     self.assertHasDefs(use, 0)
@@ -283,9 +285,9 @@ class DefinitionInfoTest(test.TestCase):
       return a
 
     node = self._parse_and_analyze(test_fn)
-    fn_body = node.body[0].body
+    fn_body = node.body
 
-    param = node.body[0].args.args[0]
+    param = node.args.args[0]
     source = fn_body[0].value.args[0]
     target = fn_body[0].targets[0]
     retval = fn_body[1].value
@@ -293,6 +295,24 @@ class DefinitionInfoTest(test.TestCase):
     self.assertNotSameDef(source, target)
     self.assertSameDef(target, retval)
 
+  def test_comprehension_leaking(self):
+
+    def test_fn(a):
+      all(x for x in a)
+      return x  # pylint:disable=undefined-variable
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body
+
+    listcomp_target = fn_body[0].value.args[0].generators[0].target
+    retval = fn_body[1].value
+
+    # Python2 leaks comprehension symbols. Python3 doesn't.
+    if six.PY2:
+      self.assertSameDef(retval, listcomp_target)
+    else:
+      self.assertHasDefs(retval, 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info.py b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
index edb2ef0e274c53136560ce508bfa862781e380b8..68a53661d3701960f56033edfb75fabc2a6d6956 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
@@ -45,6 +45,7 @@ import gast
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
@@ -141,10 +142,11 @@ class TypeInfoResolver(transformer.Base):
     arg_name = str(qn)
     self.scope.setval(qn, arg_node)
     if (len(self.enclosing_entities) == 1 and
-        arg_name in self.entity_info.arg_types):
+        arg_name in self.ctx.info.arg_types):
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
-      type_string, type_obj = self.entity_info.arg_types[arg_name]
+      type_string, type_obj = self.ctx.info.arg_types[
+          arg_name]
       anno.setanno(arg_node, 'type', type_obj)
       anno.setanno(arg_node, 'type_fqn', tuple(type_string.split('.')))
 
@@ -177,7 +179,8 @@ class TypeInfoResolver(transformer.Base):
       func = value.func
       if anno.hasanno(func, 'live_val'):
         func_obj = anno.getanno(func, 'live_val')
-        if tf_inspect.isclass(func_obj):
+        if (tf_inspect.isclass(func_obj) and
+            not inspect_utils.isbuiltin(func_obj)):
           anno.setanno(value, 'is_constructor', True)
           anno.setanno(value, 'type', func_obj)
           anno.setanno(value, 'type_fqn', anno.getanno(func, 'fqn'))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
index 34ba3d2f13889273ac9351b6194a46762a4ac39b..42e52a6b3b9f3160f15bb7d8d6c2d02d7ee3b7f1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
@@ -62,22 +62,22 @@ class TypeInfoResolverTest(test.TestCase):
                          test_fn,
                          namespace,
                          arg_types=None):
-    node, source = parser.parse_entity(test_fn)
+    node, source, _ = parser.parse_entity(test_fn)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=None)
+        arg_types=arg_types)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, {})
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, {})
+    node = live_values.resolve(node, ctx, {})
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     return node
 
   def test_constructor_detection(self):
@@ -87,12 +87,23 @@ class TypeInfoResolverTest(test.TestCase):
       return opt
 
     node = self._parse_and_analyze(test_fn, {'training': training})
-    call_node = node.body[0].body[0].value
+    call_node = node.body[0].value
+    self.assertTrue(anno.getanno(call_node, 'is_constructor'))
     self.assertEquals(training.GradientDescentOptimizer,
                       anno.getanno(call_node, 'type'))
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(call_node, 'type_fqn'))
 
+  def test_constructor_detection_builtin_class(self):
+
+    def test_fn(x):
+      res = zip(x)
+      return res
+
+    node = self._parse_and_analyze(test_fn, {})
+    call_node = node.body[0].value
+    self.assertFalse(anno.hasanno(call_node, 'is_constructor'))
+
   def test_class_members_of_detected_constructor(self):
 
     def test_fn():
@@ -100,7 +111,7 @@ class TypeInfoResolverTest(test.TestCase):
       opt.minimize(0)
 
     node = self._parse_and_analyze(test_fn, {'training': training})
-    method_call = node.body[0].body[1].value.func
+    method_call = node.body[1].value.func
     self.assertEquals(training.GradientDescentOptimizer.minimize,
                       anno.getanno(method_call, 'live_val'))
 
@@ -111,12 +122,12 @@ class TypeInfoResolverTest(test.TestCase):
         sess.run(x)
 
     node = self._parse_and_analyze(test_fn, {'session': session})
-    constructor_call = node.body[0].body[0].items[0].context_expr
+    constructor_call = node.body[0].items[0].context_expr
     self.assertEquals(session.Session, anno.getanno(constructor_call, 'type'))
     self.assertEquals((session.__name__, 'Session'),
                       anno.getanno(constructor_call, 'type_fqn'))
 
-    method_call = node.body[0].body[0].body[0].value.func
+    method_call = node.body[0].body[0].value.func
     self.assertEquals(session.Session.run, anno.getanno(method_call,
                                                         'live_val'))
 
@@ -130,7 +141,7 @@ class TypeInfoResolverTest(test.TestCase):
       opt.minimize(0)
 
     node = self._parse_and_analyze(test_fn, {'training': training})
-    method_call = node.body[0].body[1].value.func
+    method_call = node.body[1].value.func
     self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members(self):
@@ -139,7 +150,7 @@ class TypeInfoResolverTest(test.TestCase):
       opt.minimize(0)
 
     node = self._parse_and_analyze(test_fn, {})
-    method_call = node.body[0].body[0].value.func
+    method_call = node.body[0].value.func
     self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members_with_value_hints(self):
@@ -154,7 +165,7 @@ class TypeInfoResolverTest(test.TestCase):
                     training.GradientDescentOptimizer)
         })
 
-    method_call = node.body[0].body[0].value.func
+    method_call = node.body[0].value.func
     self.assertEquals(training.GradientDescentOptimizer.minimize,
                       anno.getanno(method_call, 'live_val'))
 
@@ -168,7 +179,7 @@ class TypeInfoResolverTest(test.TestCase):
       foo()
 
     node = self._parse_and_analyze(test_fn, {'bar': bar})
-    method_call = node.body[0].body[1].value.func
+    method_call = node.body[1].value.func
     self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_nested_members(self):
@@ -178,7 +189,7 @@ class TypeInfoResolverTest(test.TestCase):
       foo.bar.baz()
 
     node = self._parse_and_analyze(test_fn, {'training': training})
-    method_call = node.body[0].body[1].value.func
+    method_call = node.body[1].value.func
     self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_nested_unpacking(self):
@@ -194,7 +205,7 @@ class TypeInfoResolverTest(test.TestCase):
       return a, b, c
 
     node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'Bar': Bar})
-    a, b, c = node.body[0].body[1].value.elts
+    a, b, c = node.body[1].value.elts
     self.assertEquals(anno.getanno(a, 'type'), Foo)
     self.assertEquals(anno.getanno(b, 'type'), Bar)
     self.assertEquals(anno.getanno(c, 'type'), Foo)
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 43279b3ca0111b8ea3860f1c467df1c602b3de74..b682a21bec16bcfae4c873dcd9c6ab8f0f3eb73b 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -91,6 +91,18 @@ class ContextAdjuster(gast.NodeTransformer):
     self._ctx_override = None
     return self.generic_visit(node)
 
+  def visit_comprehension(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Lambda(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
 
 class ReplaceTransformer(gast.NodeTransformer):
   """Replace AST nodes."""
@@ -108,6 +120,7 @@ class ReplaceTransformer(gast.NodeTransformer):
         anno.Basic.ORIGIN,
         anno.Basic.SKIP_PROCESSING,
         anno.Static.ORIG_DEFINITIONS,
+        'extra_test',
     }
 
   def _prepare_replacement(self, replaced, key):
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index cdb44b822e84ad5822c78d50c2f958b1fba9ec18..4762aaf3ff68391bf4cfdee46ba88ff69cd7e8c0 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -238,6 +238,26 @@ class TemplatesTest(test.TestCase):
     source = parser.parse_expression('[a(b(1))]')
     templates.replace_as_expression(template, bar=source)
 
+  def test_star_comprehension_in_function_call(self):
+    template = """
+      a = foo(func, args)
+    """
+    source = parser.parse_expression('bar(*[i for i in range(j)])')
+    node = templates.replace(template, func=source.func, args=source.args)
+    arg_node = node[0].value.args[1].value
+    self.assertIsInstance(arg_node.generators[0].target.ctx, gast.Store)
+    self.assertIsInstance(arg_node.elt.ctx, gast.Load)
+
+  def test_lambda_in_function_call(self):
+    template = """
+      a = foo(arg)
+    """
+    source = parser.parse_expression('[lambda i: i]')
+    node = templates.replace(template, arg=source)
+    lambda_arg = node[0].value.args[0].elts[0]
+    self.assertIsInstance(lambda_arg.args.args[0].ctx, gast.Param)
+    self.assertIsInstance(lambda_arg.body.ctx, gast.Load)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/testing/BUILD b/tensorflow/python/autograph/pyct/testing/BUILD
index c244cbd747eaa25bd7e74a2d26bc7ae325a52b65..231c35d7687cac4cd762f2001291e48364045cf5 100644
--- a/tensorflow/python/autograph/pyct/testing/BUILD
+++ b/tensorflow/python/autograph/pyct/testing/BUILD
@@ -15,7 +15,16 @@ filegroup(
 )
 
 py_library(
-    name = "testing",
+    name = "test_modules",
+    srcs = [
+        "future_import_module.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "codegen",
     srcs = [
         "codegen.py",
     ],
@@ -40,7 +49,7 @@ py_test(
         "notap",
     ],
     deps = [
-        ":testing",
+        ":codegen",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/autograph/pyct",
         "@gast_archive//:gast",
diff --git a/tensorflow/examples/autograph/integration_tests/list_literals_test.py b/tensorflow/python/autograph/pyct/testing/future_import_module.py
similarity index 63%
rename from tensorflow/examples/autograph/integration_tests/list_literals_test.py
rename to tensorflow/python/autograph/pyct/testing/future_import_module.py
index e85d4abcfc9adfbb4bc6390589b846f7e59f3739..a167322dbfed4abb6f3fa913f6aa8d4b595a8140 100644
--- a/tensorflow/examples/autograph/integration_tests/list_literals_test.py
+++ b/tensorflow/python/autograph/pyct/testing/future_import_module.py
@@ -12,30 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests of functions that use list literals."""
+"""Module with print function."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+# This import is useless, but serves to distinguish this module's future imports
+# from the standard set of future imports used in TensorFlow.
+from __future__ import with_statement
 
-import tensorflow as tf
 
-from tensorflow.python import autograph as ag
-
-
-def list_used_as_tuple():
-  return tf.constant([1, 2, 3])
-
-
-class ListLiteralsTest(tf.test.TestCase):
-
-  def test_basic(self):
-    converted = ag.to_graph(list_used_as_tuple)
-    result = converted()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual(self.evaluate(result), [1, 2, 3])
-
-
-if __name__ == '__main__':
-  tf.test.main()
+def f():
+  print('foo')
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index 9c776a737aa9cf010f629115cf623f13a532510f..150719258f94f55bc3656cf4fed6862abd6055c5 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -18,10 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
 
 import gast
-import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
@@ -29,35 +27,48 @@ from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import templates
 
 
-class AutoGraphParseError(SyntaxError):
-  pass
+# TODO(znado): Use namedtuple.
+class Context(object):
+  """Contains information about a source code transformation.
+
+  This object is mutable, and is updated during conversion. Not thread safe.
+
+  Attributes:
+    info: EntityInfo, immutable.
+    current_origin: origin_info.OriginInfo, holds the OriginInfo of the last
+      AST node to be processed successfully. Useful for error handling.
+  """
+
+  def __init__(self, info):
+    self.info = info
+    self.current_origin = None
 
 
 # TODO(mdan): Use namedtuple.
 class EntityInfo(object):
-  """Contains information about a Python entity. Immutable.
+  """Contains information about a Python entity.
+
+  Immutable.
 
   Examples of entities include functions and classes.
 
   Attributes:
     source_code: The entity's source code.
     source_file: The entity's source file.
-    namespace: Dict[str, ], containing symbols visible to the entity
-        (excluding parameters).
+    namespace: Dict[str, ], containing symbols visible to the entity (excluding
+      parameters).
     arg_values: dict[str->*], containing parameter values, if known.
     arg_types: dict[str->*], containing parameter types, if known.
-    owner_type: The surrounding class type of the function, if present.
   """
 
   # TODO(mdan): Remove the default and update tests.
-  def __init__(self, source_code, source_file, namespace, arg_values, arg_types,
-               owner_type):
+  def __init__(self, source_code, source_file, namespace, arg_values,
+               arg_types):
     self.source_code = source_code
     self.source_file = source_file
     self.namespace = namespace
     self.arg_values = {} if arg_values is None else arg_values
     self.arg_types = {} if arg_types is None else arg_types
-    self.owner_type = owner_type
 
 
 class _StateStack(object):
@@ -198,17 +209,17 @@ class Base(gast.NodeTransformer):
 
   # TODO(mdan): Document all extra features.
 
-  def __init__(self, entity_info):
-    """Initialize the transformer. Subclasses should call this.
+  def __init__(self, ctx):
+    """Initialize the transformer.
+
+    Subclasses should call this.
 
     Args:
-      entity_info: An EntityInfo object.
+      ctx: A Context object.
     """
-    self._current_origin = None
     self._lineno = 0
     self._col_offset = 0
-    # TODO(znado): remove this from the constructor of all Transformers.
-    self.entity_info = entity_info
+    self.ctx = ctx
     self._enclosing_entities = []
 
     # A stack that allows keeping mutable, scope-local state where scopes may be
@@ -232,13 +243,15 @@ class Base(gast.NodeTransformer):
     return len(self._local_scope_state)
 
   def enter_local_scope(self, inherit=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks entry into a new local scope.
 
     Args:
-      inherit: Optional enumerable of variable names to copy from the
-          parent scope.
+      inherit: Optional enumerable of variable names to copy from the parent
+        scope.
     """
     scope_entered = {}
     if inherit:
@@ -249,13 +262,15 @@ class Base(gast.NodeTransformer):
     self._local_scope_state.append(scope_entered)
 
   def exit_local_scope(self, keep=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks exit from the current local scope.
 
     Args:
-      keep: Optional enumerable of variable names to copy into the
-          parent scope.
+      keep: Optional enumerable of variable names to copy into the parent scope.
+
     Returns:
       A dict containing the scope that has just been exited.
     """
@@ -276,11 +291,17 @@ class Base(gast.NodeTransformer):
     return self._local_scope_state[-1].get(name, default)
 
   def debug_print(self, node):
-    """Helper method useful for debugging."""
+    """Helper method useful for debugging. Prints the AST."""
     if __debug__:
       print(pretty_printer.fmt(node))
     return node
 
+  def debug_print_src(self, node):
+    """Helper method useful for debugging. Prints the AST as code."""
+    if __debug__:
+      print(compiler.ast_to_source(node))
+    return node
+
   def create_assignment(self, target, expression):
     template = """
       target = expression
@@ -390,11 +411,11 @@ class Base(gast.NodeTransformer):
 
     Args:
       targets: list, tuple of or individual AST node. Should be used with the
-          targets field of an ast.Assign node.
+        targets field of an ast.Assign node.
       values: an AST node.
       apply_fn: a function of a single argument, which will be called with the
-          respective nodes of each single assignment. The signature is
-          apply_fn(target, value), no return value.
+        respective nodes of each single assignment. The signature is
+        apply_fn(target, value), no return value.
     """
     if not isinstance(targets, (list, tuple)):
       targets = (targets,)
@@ -429,75 +450,54 @@ class Base(gast.NodeTransformer):
       # call `visit`.  The error needs to be raised before the exception handler
       # below is installed, because said handler will mess up if `node` is not,
       # in fact, a node.
-      msg = (
-          'invalid value for "node": expected "ast.AST", got "{}"; to'
-          ' visit lists of nodes, use "visit_block" instead').format(type(node))
+      msg = ('invalid value for "node": expected "ast.AST", got "{}"; to'
+             ' visit lists of nodes, use "visit_block" instead').format(
+                 type(node))
       raise ValueError(msg)
 
     did_enter_function = False
     local_scope_size_at_entry = len(self._local_scope_state)
     processing_expr_node = False
 
-    try:
-      parent_origin = self._current_origin
-      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
-        did_enter_function = True
-      elif isinstance(node, gast.Expr):
-        processing_expr_node = True
-
-      if did_enter_function:
-        self._enclosing_entities.append(node)
-
-      if anno.hasanno(node, anno.Basic.ORIGIN):
-        self._current_origin = anno.getanno(node, anno.Basic.ORIGIN)
-
-      if processing_expr_node:
-        entry_expr_value = node.value
-
-      if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
-        result = super(Base, self).visit(node)
-      self._current_origin = parent_origin
-
-      # Adjust for consistency: replacing the value of an Expr with
-      # an Assign node removes the need for the Expr node.
-      if processing_expr_node:
-        if isinstance(result, gast.Expr) and result.value != entry_expr_value:
-          # When the replacement is a list, it is assumed that the list came
-          # from a template that contained a number of statements, which
-          # themselves are standalone and don't require an enclosing Expr.
-          if isinstance(result.value,
-                        (list, tuple, gast.Assign, gast.AugAssign)):
-            result = result.value
-
-      # On exception, the local scope integrity is not guaranteed.
-      if did_enter_function:
-        self._enclosing_entities.pop()
-
-      if local_scope_size_at_entry != len(self._local_scope_state):
-        raise AssertionError(
-            'Inconsistent local scope stack. Before entering node %s, the'
-            ' stack had length %d, after exit it has length %d. This'
-            ' indicates enter_local_scope and exit_local_scope are not'
-            ' well paired.' % (node, local_scope_size_at_entry,
-                               len(self._local_scope_state)))
-      return result
-
-    except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
-      if not self._current_origin:
-        raise e
-      original_file_path = self._current_origin.loc.filename
-      original_line_number = self._current_origin.loc.lineno
-      original_col_offset = self._current_origin.loc.col_offset
-      original_source_line = self._current_origin.source_code_line
-      msg = '%s: %s.' % (e.__class__.__name__, str(e))
-
-      # TODO(mdan): Avoid the printing of the original exception.
-      # In other words, we need to find how to suppress the "During handling
-      # of the above exception, another exception occurred" message.
-      six.reraise(
-          AutoGraphParseError,
-          AutoGraphParseError(msg, (original_file_path, original_line_number,
-                                    original_col_offset, original_source_line)),
-          sys.exc_info()[2])
-    finally:
-      self._current_origin = parent_origin
+    parent_origin = self.ctx.current_origin
+    if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+      did_enter_function = True
+    elif isinstance(node, gast.Expr):
+      processing_expr_node = True
+
+    if did_enter_function:
+      self._enclosing_entities.append(node)
+
+    if anno.hasanno(node, anno.Basic.ORIGIN):
+      self.ctx.current_origin = anno.getanno(node, anno.Basic.ORIGIN)
+
+    if processing_expr_node:
+      entry_expr_value = node.value
+
+    if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+      result = super(Base, self).visit(node)
+    self.ctx.current_origin = parent_origin
+
+    # Adjust for consistency: replacing the value of an Expr with
+    # an Assign node removes the need for the Expr node.
+    if processing_expr_node:
+      if isinstance(result, gast.Expr) and result.value != entry_expr_value:
+        # When the replacement is a list, it is assumed that the list came
+        # from a template that contained a number of statements, which
+        # themselves are standalone and don't require an enclosing Expr.
+        if isinstance(result.value,
+                      (list, tuple, gast.Assign, gast.AugAssign)):
+          result = result.value
+
+    # On exception, the local scope integrity is not guaranteed.
+    if did_enter_function:
+      self._enclosing_entities.pop()
+
+    if local_scope_size_at_entry != len(self._local_scope_state):
+      raise AssertionError(
+          'Inconsistent local scope stack. Before entering node %s, the'
+          ' stack had length %d, after exit it has length %d. This'
+          ' indicates enter_local_scope and exit_local_scope are not'
+          ' well paired.' % (node, local_scope_size_at_entry,
+                             len(self._local_scope_state)))
+    return result
diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py
index 0c68d2a7648ccd3f44fb53db994bd0bb94a813eb..bd19ebad5c5e0b09ed045835fb9a263f8381feac 100644
--- a/tensorflow/python/autograph/pyct/transformer_test.py
+++ b/tensorflow/python/autograph/pyct/transformer_test.py
@@ -28,14 +28,14 @@ from tensorflow.python.platform import test
 
 class TransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
+    return transformer.Context(entity_info)
 
   def test_entity_scope_tracking(self):
 
@@ -52,7 +52,7 @@ class TransformerTest(test.TestCase):
         anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
         return self.generic_visit(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function():
       a = 0
@@ -68,10 +68,10 @@ class TransformerTest(test.TestCase):
           return b, inner_function
       return a, TestClass
 
-    node, _ = parser.parse_entity(test_function)
+    node, _, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
 
-    test_function_node = node.body[0]
+    test_function_node = node
     test_class = test_function_node.body[1]
     test_method = test_class.body[0]
     inner_function = test_method.body[1]
@@ -126,7 +126,7 @@ class TransformerTest(test.TestCase):
         self.state[CondState].exit()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       a = 1
@@ -141,10 +141,10 @@ class TransformerTest(test.TestCase):
           while True:
             raise '1'
 
-    node, _ = parser.parse_entity(test_function)
+    node, _, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
 
-    fn_body = node.body[0].body
+    fn_body = node.body
     outer_while_body = fn_body[1].body
     self.assertSameAnno(fn_body[0], outer_while_body[0], 'cond_state')
     self.assertDifferentAnno(fn_body[0], outer_while_body[0], 'loop_state')
@@ -192,7 +192,7 @@ class TransformerTest(test.TestCase):
       def visit_For(self, node):
         return self._annotate_result(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       """Docstring."""
@@ -207,10 +207,10 @@ class TransformerTest(test.TestCase):
             raise '1'
       return 'nor this'
 
-    node, _ = parser.parse_entity(test_function)
+    node, _, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
 
-    for_node = node.body[0].body[2]
+    for_node = node.body[2]
     while_node = for_node.body[1].orelse[1]
 
     self.assertFalse(anno.hasanno(for_node, 'string'))
@@ -231,14 +231,14 @@ class TransformerTest(test.TestCase):
         self.exit_local_scope()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def no_exit(a):
       if a > 0:
         print(a)
       return None
 
-    node, _ = parser.parse_entity(no_exit)
+    node, _, _ = parser.parse_entity(no_exit)
     with self.assertRaises(AssertionError):
       tr.visit(node)
 
@@ -246,7 +246,7 @@ class TransformerTest(test.TestCase):
       for _ in a:
         print(a)
 
-    node, _ = parser.parse_entity(no_entry)
+    node, _, _ = parser.parse_entity(no_entry)
     with self.assertRaises(AssertionError):
       tr.visit(node)
 
@@ -270,11 +270,10 @@ class TransformerTest(test.TestCase):
       z = y
       return z
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
-    node, _ = parser.parse_entity(test_function)
+    node, _, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
-    node = node.body[0]
 
     self.assertEqual(len(node.body), 2)
     self.assertTrue(isinstance(node.body[0], gast.Assign))
@@ -301,11 +300,11 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
-    node, _ = parser.parse_entity(test_function)
+    _, _, all_nodes = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
-      node = tr.visit(node)
+      all_nodes = tr.visit(all_nodes)
     obtained_message = str(cm.exception)
     expected_message = r'expected "ast.AST", got "\<(type|class) \'list\'\>"'
     self.assertRegexpMatches(obtained_message, expected_message)
@@ -332,11 +331,11 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
-    node, _ = parser.parse_entity(test_function)
+    _, _, all_nodes = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
-      node = tr.visit(node)
+      all_nodes = tr.visit(all_nodes)
     obtained_message = str(cm.exception)
     # The message should reference the exception actually raised, not anything
     # from the exception handler.
diff --git a/tensorflow/python/autograph/utils/BUILD b/tensorflow/python/autograph/utils/BUILD
index 790c661661dabab7c5e1d5dd097a60562c8cc358..f5e0dbf00bf5ce35ae049755b32b47d12e5c9960 100644
--- a/tensorflow/python/autograph/utils/BUILD
+++ b/tensorflow/python/autograph/utils/BUILD
@@ -20,6 +20,7 @@ py_library(
     name = "utils",
     srcs = [
         "__init__.py",
+        "ag_logging.py",
         "context_managers.py",
         "misc.py",
         "py_func.py",
@@ -33,7 +34,9 @@ py_library(
     deps = [
         "//tensorflow/python:dtypes",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
diff --git a/tensorflow/python/autograph/utils/ag_logging.py b/tensorflow/python/autograph/utils/ag_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..8229b828d305dc5acd8e61ceacb325d3f681487f
--- /dev/null
+++ b/tensorflow/python/autograph/utils/ag_logging.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logging and debugging utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# TODO(mdan): Use a custom logger class.
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+VERBOSITY_VAR_NAME = 'AUTOGRAPH_VERBOSITY'
+DEFAULT_VERBOSITY = 0
+
+verbosity_level = None  # vlog-like. Takes precedence over the env variable.
+echo_log_to_stdout = False
+
+# In interactive Python, logging echo is enabled by default.
+if hasattr(sys, 'ps1') or hasattr(sys, 'ps2'):
+  echo_log_to_stdout = True
+
+
+@tf_export('autograph.set_verbosity')
+def set_verbosity(level, alsologtostdout=False):
+  """Sets the AutoGraph verbosity level.
+
+  _Debug logging in AutoGraph_
+
+  More verbose logging is useful to enable when filing bug reports or doing
+  more in-depth debugging.
+
+  There are two controls that control the logging verbosity:
+
+   * The `set_verbosity` function
+
+   * The `AUTOGRAPH_VERBOSITY` environment variable
+
+  `set_verbosity` takes precedence over the environment variable.
+
+  For example:
+
+  ```python
+  import os
+  import tensorflow as tf
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 5
+  # Verbosity is now 5
+
+  tf.autograph.set_verbosity(0)
+  # Verbosity is now 0
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 1
+  # No effect, because set_verbosity was already called.
+  ```
+
+  Logs entries are output to [absl](https://abseil.io)'s default output,
+  with `INFO` level.
+  Logs can be mirrored to stdout by using the `alsologtostdout` argument.
+  Mirroring is enabled by default when Python runs in interactive mode.
+
+  Args:
+    level: int, the verbosity level; larger values specify increased verbosity;
+      0 means no logging. When reporting bugs, it is recommended to set this
+      value to a larges number, like 10.
+    alsologtostdout: bool, whether to also output log messages to `sys.stdout`.
+  """
+  global verbosity_level
+  global echo_log_to_stdout
+  verbosity_level = level
+  echo_log_to_stdout = alsologtostdout
+
+
+@tf_export('autograph.trace')
+def trace(*args):
+  """Traces argument information at compilation time.
+
+  `trace` is useful when debugging, and it always executes during the tracing
+  phase, that is, when the TF graph is constructed.
+
+  _Example usage_
+
+  ```python
+  import tensorflow as tf
+
+  for i in tf.range(10):
+    tf.autograph.trace(i)
+  # Output: <Tensor ...>
+  ```
+
+  Args:
+    *args: Arguments to print to `sys.stdout`.
+  """
+  print(*args)
+
+
+def get_verbosity():
+  global verbosity_level
+  if verbosity_level is not None:
+    return verbosity_level
+  return int(os.getenv(VERBOSITY_VAR_NAME, DEFAULT_VERBOSITY))
+
+
+def has_verbosity(level):
+  return get_verbosity() >= level
+
+
+def error(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.error(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def log(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.info(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def warn(msg, *args, **kwargs):
+  logging.warn(msg, *args, **kwargs)
+  if echo_log_to_stdout:
+    print('WARNING:', msg % args)
+
+
+def warn_first_n(msg, *args, **kwargs):
+  logging.log_first_n(logging.WARN, msg, *args, **kwargs)
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index b9056f86e6d0465a8521f054a459c06eb5aeb37c..244820f41a85778a01cd811d96c3e8228d8b7c8c 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -12,22 +12,26 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 # consumers of the tf_gen_op_wrapper_py rule would be simplified if we don't
 # hard code the ops/ directory.
 
-def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
-                                 require_shape_functions=True,
-                                 visibility=[]):
-  if not name.endswith("_gen"):
-    fail("name must end in _gen")
-  if not visibility:
-    visibility = ["//visibility:private"]
-  bare_op_name = name[:-4] # Strip off the _gen
-  tf_gen_op_wrapper_py(name=bare_op_name,
-    out=out,
-    visibility=visibility,
-    deps=deps,
-    require_shape_functions=require_shape_functions,
-    generated_target_name=name,
-    api_def_srcs = [
-        "//tensorflow/core/api_def:base_api_def",
-        "//tensorflow/core/api_def:python_api_def",
-    ],
-  )
+def tf_gen_op_wrapper_private_py(
+        name,
+        out = None,
+        deps = [],
+        require_shape_functions = True,
+        visibility = []):
+    if not name.endswith("_gen"):
+        fail("name must end in _gen")
+    if not visibility:
+        visibility = ["//visibility:private"]
+    bare_op_name = name[:-4]  # Strip off the _gen
+    tf_gen_op_wrapper_py(
+        name = bare_op_name,
+        out = out,
+        visibility = visibility,
+        deps = deps,
+        require_shape_functions = require_shape_functions,
+        generated_target_name = name,
+        api_def_srcs = [
+            "//tensorflow/core/api_def:base_api_def",
+            "//tensorflow/core/api_def:python_api_def",
+        ],
+    )
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index b97eb884b36ed2246d6bf59f215786114d719e0f..4f3eb61d4fddbdad2758e0aef00727ede5d37b74 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1532,7 +1532,7 @@ class Session(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
@@ -1590,7 +1590,21 @@ class Session(BaseSession):
     self._default_session_context_manager = None
     self._default_graph_context_manager = None
 
-    self.close()
+    # If we are closing due to an exception, set a time limit on our Close() to
+    # avoid blocking forever.
+    # TODO(b/120204635) remove this when deadlock is fixed.
+    if exec_type:
+      close_thread = threading.Thread(
+          name='SessionCloseThread', target=self.close)
+      close_thread.daemon = True
+      close_thread.start()
+      close_thread.join(30.0)
+      if close_thread.is_alive():
+        logging.error(
+            'Session failed to close after 30 seconds. Continuing after this '
+            'point may leave your program in an undefined state.')
+    else:
+      self.close()
 
   @staticmethod
   def reset(target, containers=None, config=None):
@@ -1675,7 +1689,7 @@ class InteractiveSession(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
diff --git a/tensorflow/python/client/session_ref.cc b/tensorflow/python/client/session_ref.cc
index 895624f5929fbffb9613a51093f61c450ede51ed..6639cf506e0a2f3d53373959b47cf98e5fcb0887 100644
--- a/tensorflow/python/client/session_ref.cc
+++ b/tensorflow/python/client/session_ref.cc
@@ -18,9 +18,6 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -480,8 +477,6 @@ Status SessionRef::ReleaseCallable(CallableHandle handle) {
   LOG_AND_RUN_OPERATION(ReleaseCallable, handle);
 }
 
-static const absl::Duration kMaxCloseWaitTime = absl::Seconds(60);
-
 Status SessionRef::Close(const RunOptions& run_options) {
   TF_RETURN_IF_ERROR(CheckNotClosed());
   mutex_lock l(run_lock_);
@@ -492,17 +487,8 @@ Status SessionRef::Close(const RunOptions& run_options) {
     status = session_->Close(run_options);
   }
   session_.reset();
-
-  // Wait no more than kMaxCloseWaitTime for all pending operations to finish.
-  absl::Time start_time = absl::Now();
-  absl::Duration wait_time = start_time + kMaxCloseWaitTime - absl::Now();
-  while (run_count_ > 0 && wait_time > absl::ZeroDuration()) {
-    if (run_finished_.wait_for(l, absl::ToChronoMilliseconds(wait_time)) ==
-        std::cv_status::timeout) {
-      status.Update(errors::DeadlineExceeded("Timeout closing session."));
-      return status;
-    }
-    wait_time = start_time + kMaxCloseWaitTime - absl::Now();
+  while (run_count_ > 0) {
+    run_finished_.wait(l);
   }
   return status;
 }
@@ -517,17 +503,8 @@ Status SessionRef::Close() {
     status = session_->Close();
   }
   session_.reset();
-
-  // Wait no more than kMaxCloseWaitTime for all pending operations to finish.
-  absl::Time start_time = absl::Now();
-  absl::Duration wait_time = start_time + kMaxCloseWaitTime - absl::Now();
-  while (run_count_ > 0 && wait_time > absl::ZeroDuration()) {
-    if (run_finished_.wait_for(l, absl::ToChronoMilliseconds(wait_time)) ==
-        std::cv_status::timeout) {
-      status.Update(errors::DeadlineExceeded("Timeout closing session."));
-      return status;
-    }
-    wait_time = start_time + kMaxCloseWaitTime - absl::Now();
+  while (run_count_ > 0) {
+    run_finished_.wait(l);
   }
   return status;
 }
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index da6218663de8b02fcda3f3e67e68bb46e47e914a..bd4def565e7f308280bbf89391b41d37528951b8 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -33,6 +33,7 @@ from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as framework_device_lib
@@ -1880,7 +1881,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
-  @test_util.run_v1_only('b/120545219')
   def testDefaultLogDevicePlacement(self):
 
     class CaptureStderr(str):
@@ -1916,19 +1916,26 @@ class SessionTest(test_util.TensorFlowTestCase):
       def __str__(self):
         return self._output
 
-    # Passing the config to the server, but not the session should still result
-    # in logging device placement.
-    config = config_pb2.ConfigProto(log_device_placement=True)
-    server = server_lib.Server.create_local_server(config=config)
-    a = constant_op.constant(1)
-    b = constant_op.constant(2)
-    c = a + b
-    with session.Session(server.target) as sess:
+    if context.executing_eagerly():
+      context.set_log_device_placement(True)
       with CaptureStderr() as log:
-        sess.run(c)
-      # Ensure that we did log device placement.
-      self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in str(log),
-                      str(log))
+        a = constant_op.constant(1)
+        b = constant_op.constant(2)
+        c = a + b
+    else:
+      # Passing the config to the server, but not the session should still
+      # result in logging device placement.
+      config = config_pb2.ConfigProto(log_device_placement=True)
+      server = server_lib.Server.create_local_server(config=config)
+      a = constant_op.constant(1)
+      b = constant_op.constant(2)
+      c = a + b
+      with session.Session(server.target) as sess:
+        with CaptureStderr() as log:
+          sess.run(c)
+
+    # Ensure that we did log device placement.
+    self.assertTrue('/replica:0/task:0/device:CPU:0' in str(log), str(log))
 
   @test_util.run_v1_only('b/120545219')
   def testLocalMasterSessionTimeout(self):
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index ef7527d887f062621d1fb21511e08c5f7ea389c0..3b9677bf251cad98e1ed54403f93e9de2741e1b5 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -604,6 +604,27 @@ def TF_Reset(target, containers=None, config=None):
   }
 }
 
+// $input is a Python list of wrapped TF_Operations
+%typemap(in) (const std::vector<TF_Operation*>* control_outputs)
+    (std::vector<TF_Operation*> control_outputs) {
+  if ($input != Py_None) {
+    if (!PyList_Check($input)) {
+      SWIG_exception_fail(SWIG_TypeError, "$symname: expected list");
+    }
+    size_t size = PyList_Size($input);
+    for (int i = 0; i < size; ++i) {
+      PyObject* item = PyList_GetItem($input, i);
+      TF_Operation* oper_ptr;
+      SWIG_ConvertPtr(item, reinterpret_cast<void**>(&oper_ptr),
+                      $descriptor(TF_Operation*), 0);
+      control_outputs.push_back(oper_ptr);
+    }
+    $1 = &control_outputs;
+  } else {
+    $1 = nullptr;
+  }
+}
+
 // Typemaps for TF_GraphGetTensorShapeHelper.
 
 // Convert from C++ integer vector to Python list of ints.
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index dc0c10bab74635e240502e2f8e762b61e533b319..56b4eec98e314dd6474acec51b4208d5120f2fa4 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -590,7 +590,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status) {
   if (!output_names.empty() && output_names.size() != outputs.size()) {
     Set_TF_Status_from_Status(
@@ -613,10 +615,18 @@ TF_Function* TF_GraphToFunction_wrapper(
       output_names.empty() ? nullptr
                            : const_cast<const char**>(output_names.data());
 
-  return TF_GraphToFunction(fn_body, fn_name, append_hash_to_fn_name, nopers,
-                            opers_array, inputs.size(), inputs.data(),
-                            outputs.size(), outputs.data(), output_names_ptr,
-                            opts, description, out_status);
+  const char** control_output_names_ptr =
+      control_output_names.empty()
+          ? nullptr
+          : const_cast<const char**>(control_output_names.data());
+
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, nopers, opers_array,
+      inputs.size(), inputs.data(), outputs.size(), outputs.data(),
+      output_names_ptr,
+      control_outputs == nullptr ? 0 : control_outputs->size(),
+      control_outputs == nullptr ? nullptr : control_outputs->data(),
+      control_output_names_ptr, opts, description, out_status);
 }
 
 void TF_GraphSetOutputHandleShapesAndTypes_wrapper(
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index dab7e71aac5a7f4cbf9f8825ad6dd5d3f556bd43..d2c7dc34d8d54f384a69954db37f7ba18b527197 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -208,7 +208,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 61c0da01b836843a756c90fee20fbcb0ee94f59c..e7d60de690523d4e81b8609b2eb9c48b17145e6d 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -57,7 +57,7 @@ class TimelineTest(test.TestCase):
     ctf = tl.generate_chrome_trace_format()
     self._validateTrace(ctf)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testTimelineCpu(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -86,6 +86,7 @@ class TimelineTest(test.TestCase):
         show_memory=False, show_dataflow=False)
     self._validateTrace(ctf)
 
+  @test_util.deprecated_graph_mode_only
   def testTimelineGpu(self):
     if not test.is_gpu_available(cuda_only=True):
       return
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
index e82ee0666c30f8dcf71d3e6609fc7d7a8ec7eeed..f6dee3bfd8e35db1683a9ad941d185efd89021e9 100644
--- a/tensorflow/python/client/virtual_gpu_test.py
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -198,6 +198,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
     super(VirtualGpuTest, self).__init__(method_name)
     self._util = VirtualGpuTestUtil()
 
+  @test_util.deprecated_graph_mode_only
   def testStatsContainAllDeviceNames(self):
     with self.session(config=self._util.config) as sess:
       # TODO(laigd): b/70811538. The is_gpu_available() call will invoke
@@ -231,6 +232,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
     self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:1' in devices)
     self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:2' in devices)
 
+  @test_util.deprecated_graph_mode_only
   def testLargeRandomGraph(self):
     with self.session(config=self._util.config) as sess:
       if not test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index c558722061cbf67a08d397a85e286f13ddf71fa0..e08015088b8422fc1d7800a486fd05cb5ae041b2 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -27,7 +27,7 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 1, 17)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 3, 15)
 
 
 @tf_export("compat.forward_compatible")
diff --git a/tensorflow/python/compat/v2_compat.py b/tensorflow/python/compat/v2_compat.py
index 8a94939ae11dbf28146ae12ab21d11990dbb2516..9961cae11c5b43f5253c7af9eb48ddd3ed142412 100644
--- a/tensorflow/python/compat/v2_compat.py
+++ b/tensorflow/python/compat/v2_compat.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.python import tf2
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.layers import normalization
 from tensorflow.python.ops import variable_scope
 
 from tensorflow.python.util.tf_export import tf_export
@@ -43,7 +42,6 @@ def enable_v2_behavior():
   ops.enable_eager_execution()
   tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
   variable_scope.enable_resource_variables()
-  normalization.enable_v2_batch_normalization()
 
 
 @tf_export(v1=["disable_v2_behavior"])
@@ -61,4 +59,3 @@ def disable_v2_behavior():
   ops.disable_eager_execution()
   tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
   variable_scope.disable_resource_variables()
-  normalization.disable_v2_batch_normalization()
diff --git a/tensorflow/python/compiler/BUILD b/tensorflow/python/compiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..07209a9eca976e049f6e44eef6a75952dd8a1823
--- /dev/null
+++ b/tensorflow/python/compiler/BUILD
@@ -0,0 +1,19 @@
+# Description:
+# Python APIs for various Tensorflow backends.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "compiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = if_not_windows([
+        "//tensorflow/python/compiler/tensorrt:init_py",
+    ]),
+)
diff --git a/tensorflow/tools/dist_test/__init__.py b/tensorflow/python/compiler/__init__.py
similarity index 100%
rename from tensorflow/tools/dist_test/__init__.py
rename to tensorflow/python/compiler/__init__.py
diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..eb269b4db41a7856e759590b0fe924722e73417f
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/BUILD
@@ -0,0 +1,168 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_copts",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+exports_files(glob([
+    "test/testdata/*",
+]))
+
+py_library(
+    name = "init_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tf_trt_integration_test_base",
+        ":trt_convert_py",
+    ],
+)
+
+py_library(
+    name = "trt_convert_py",
+    srcs = ["trt_convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":wrap_conversion",
+        "//tensorflow/compiler/tf2tensorrt:trt_ops_loader",
+        "//tensorflow/python:convert_to_constants",
+        "//tensorflow/python:func_graph",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tf_optimizer",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/saved_model:builder",
+        "//tensorflow/python/saved_model:load",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:save",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:tag_constants",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "wrap_conversion",
+    srcs = ["trt_conversion.i"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/compiler/tf2tensorrt:py_utils",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+py_library(
+    name = "tf_trt_integration_test_base",
+    srcs = ["test/tf_trt_integration_test_base.py"],
+    deps = [
+        ":trt_convert_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+cuda_py_test(
+    name = "trt_convert_test",
+    srcs = ["trt_convert_test.py"],
+    additional_deps = [
+        ":trt_convert_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python/saved_model:builder",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//tensorflow/python/saved_model:utils",
+        "//tensorflow/python/tools:freeze_graph_lib",
+        "//tensorflow/python/tools:saved_model_utils",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_tests(
+    name = "tf_trt_integration_test",
+    srcs = [
+        "test/base_test.py",
+        "test/batch_matmul_test.py",
+        "test/biasadd_matmul_test.py",
+        "test/binary_tensor_weight_broadcast_test.py",
+        "test/concatenation_test.py",
+        "test/const_broadcast_test.py",
+        "test/conv2d_test.py",
+        "test/dynamic_input_shapes_test.py",
+        "test/identity_output_test.py",
+        "test/int32_test.py",
+        "test/lru_cache_test.py",
+        "test/memory_alignment_test.py",
+        "test/multi_connection_neighbor_engine_test.py",
+        "test/neighboring_engine_test.py",
+        "test/quantization_test.py",
+        "test/rank_two_test.py",
+        "test/reshape_transpose_test.py",
+        "test/topk_test.py",
+        "test/unary_test.py",
+        "test/vgg_block_nchw_test.py",
+        "test/vgg_block_test.py",
+    ],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "quantization_mnist_test",
+    srcs = ["test/quantization_mnist_test.py"],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/keras:keras",
+        "//tensorflow/python/estimator:estimator",
+    ],
+    data = [
+        "test/testdata/checkpoint",
+        "test/testdata/model.ckpt-46900.data-00000-of-00001",
+        "test/testdata/model.ckpt-46900.index",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",  # TODO(b/125290478): allow running in at least some OSS configurations.
+        "no_pip",
+        "no_tap",  # It is not able to download the mnist data.
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/python/compiler/tensorrt/README.md
similarity index 100%
rename from tensorflow/contrib/tensorrt/README.md
rename to tensorflow/python/compiler/tensorrt/README.md
diff --git a/tensorflow/python/training/mode_keys.py b/tensorflow/python/compiler/tensorrt/__init__.py
similarity index 68%
rename from tensorflow/python/training/mode_keys.py
rename to tensorflow/python/compiler/tensorrt/__init__.py
index ef64554bd5783e7e3ac802708099424ff8244fd8..db3540ba45d8082079a04db9e9de5bf7aa178f93 100644
--- a/tensorflow/python/training/mode_keys.py
+++ b/tensorflow/python/compiler/tensorrt/__init__.py
@@ -11,23 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Model modeKeys for TensorFlow and Estimator."""
+# =============================================================================
+"""Exposes the python wrapper for TensorRT graph transforms."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-class ModeKeys(object):
-  """Standard names for model modes.
-
-  The following standard keys are defined:
-
-  * `TRAIN`: training/fitting mode.
-  * `TEST`: testing/evaluation mode.
-  * `PREDICT`: prediction/inference mode.
-  """
-
-  TRAIN = 'train'
-  TEST = 'test'
-  PREDICT = 'predict'
+# pylint: disable=unused-import,line-too-long
+from tensorflow.python.compiler.tensorrt.trt_convert import create_inference_graph
+# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/python/compiler/tensorrt/test/base_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/base_test.py
rename to tensorflow/python/compiler/tensorrt/test/base_test.py
index 17e0b6f4d2c4bbaf56ef143b78c543c9e130b458..6aa32f73676e343156b1f5c506a61ed6b8d2decb 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/base_test.py
@@ -20,8 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -148,13 +147,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
         rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
 
-class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
-
-  def setUp(self):
-    """Setup method."""
-    super(PartiallyConvertedTestA, self).setUp()
-    # Let it fail to build the second engine.
-    trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
+class SimpleMultiEnginesTest2(trt_test.TfTrtIntegrationTestBase):
 
   def GetParams(self):
     """Create a graph containing two segment."""
@@ -190,8 +183,8 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        # Only the first engine is built.
-        "TRTEngineOp_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
+        "TRTEngineOp_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"],
+        "TRTEngineOp_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
     }
 
   def ShouldRunTest(self, run_params):
@@ -203,23 +196,6 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
                  not run_params.use_calibration))
 
 
-class PartiallyConvertedTestB(PartiallyConvertedTestA):
-
-  def setUp(self):
-    """Setup method."""
-    super(PartiallyConvertedTestB, self).setUp()
-    # Let it fail to build the first engine.
-    trt_convert.clear_test_values("")
-    trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
-
-  def ExpectedEnginesToBuild(self, run_params):
-    """Return the expected engines to build."""
-    return {
-        # Only the second engine is built.
-        "TRTEngineOp_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
-    }
-
-
 class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetParams(self):
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
similarity index 87%
rename from tensorflow/contrib/tensorrt/test/batch_matmul_test.py
rename to tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
index 46e3407d9669085a9737bacbeec1a20765ef88cc..cd72b3fa68e98fb5bbd7119d5919328778da7b18 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -69,7 +69,7 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
         out = x1 + x2 + x3
       array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
-        gdef=g.as_graph_def(),
+        gdef=g.as_graph_def(add_shapes=True),
         input_names=[input_name, w1_name, w2_name],
         input_dims=[[input_dims, w1_dims, w2_dims]],
         output_names=[output_name],
@@ -77,14 +77,7 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    if (run_params.dynamic_engine and
-        not trt_test.IsQuantizationMode(run_params.precision_mode)):
-      return ["TRTEngineOp_0", "TRTEngineOp_1"]
-    return ["TRTEngineOp_1"]
-
-  def ExpectedEnginesToRun(self, run_params):
-    """Return the expected engines to run."""
-    return ["TRTEngineOp_1"]
+    return ["TRTEngineOp_0", "TRTEngineOp_1"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
similarity index 98%
rename from tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
rename to tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
index ca23629efacba1df27ffb466d24b189d6074a917..2b7bbbc960558a5020eca48af855885f4251a748 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
rename to tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
index 846fd009db07b151e1eca794e9a8a38ff834a465..7e1d3afdd9388813b2cf030274a3d2bd4a08b994 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/python/compiler/tensorrt/test/concatenation_test.py
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/concatenation_test.py
rename to tensorflow/python/compiler/tensorrt/test/concatenation_test.py
index 5d8742ae356c091ba831bbd48741dee34cd39d08..f30324e7dba2392fb0d2c1a058bf3bc53c8493c6 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/concatenation_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/const_broadcast_test.py
rename to tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
index 9137d0072f66321d8420b7caac6acc329541123f..2d764665beffa4198c87ef5816f352288310ec4f 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/conv2d_test.py b/tensorflow/python/compiler/tensorrt/test/conv2d_test.py
similarity index 79%
rename from tensorflow/contrib/tensorrt/test/conv2d_test.py
rename to tensorflow/python/compiler/tensorrt/test/conv2d_test.py
index e7993b4620931736cd872bfffb4ebe177555fcd2..326cad529740335310a4851cdbbea8b21cdd244e 100644
--- a/tensorflow/contrib/tensorrt/test/conv2d_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/conv2d_test.py
@@ -20,12 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
@@ -187,5 +188,46 @@ class Conv2DStridedNCHWTest(trt_test.TfTrtIntegrationTestBase):
     return ["TRTEngineOp_0"]
 
 
+class Conv2DTranposeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of conv2d_transpose (AKA Conv2DBackpropInput)"""
+    np.random.seed(1234)
+    dtype = dtypes.float32
+    input_name = "input"
+    n, c, h, w = 13, 3, 7, 11
+    num_filters = 8
+    input_dims = [n, c, h, w]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        weights_shape = [2, 2, num_filters, c]
+        weights = constant_op.constant(
+            np.random.randn(*weights_shape), dtype=dtype)
+        output_shape = constant_op.constant([n, num_filters, h * 2, w * 2],
+                                            dtype=dtypes.int32)
+        output = nn_ops.conv2d_transpose(
+            inp,
+            weights,
+            output_shape,
+            strides=[1, 1, 2, 2],
+            padding="SAME",
+            data_format="NCHW")
+        output = array_ops.identity(output, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[n, num_filters, h * 2, w * 2]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py
rename to tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
index cc28cd6087997359e81ffaa6dc8bd958109cc565..cb358d4f9bd91ddd1d45c5e7555652c5c2bca157 100644
--- a/tensorflow/contrib/tensorrt/test/dynamic_input_shapes_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/identity_output_test.py b/tensorflow/python/compiler/tensorrt/test/identity_output_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/identity_output_test.py
rename to tensorflow/python/compiler/tensorrt/test/identity_output_test.py
index b568eeda945d997a832b7f71a5bfd8c42e127e65..23a72c5b0b75994fb0662dd22618b95c02cfff55 100644
--- a/tensorflow/contrib/tensorrt/test/identity_output_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/identity_output_test.py
@@ -25,7 +25,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/int32_test.py b/tensorflow/python/compiler/tensorrt/test/int32_test.py
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/int32_test.py
rename to tensorflow/python/compiler/tensorrt/test/int32_test.py
index 8cf538703880b130322a7dd504094c7a298e6522..6d4446940aadf252b3d81f9978a374e4aedbb247 100644
--- a/tensorflow/contrib/tensorrt/test/int32_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/int32_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/lru_cache_test.py b/tensorflow/python/compiler/tensorrt/test/lru_cache_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/lru_cache_test.py
rename to tensorflow/python/compiler/tensorrt/test/lru_cache_test.py
index 7702413e6cee667796b7fbf4121c6e0d9118d35c..18e6d32dfe5fde6ff6f70522df599d3a8fb142a5 100644
--- a/tensorflow/contrib/tensorrt/test/lru_cache_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/lru_cache_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/memory_alignment_test.py
rename to tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
index cc64329bbd53eaaebf7929e48bbfa8d8beeeadff..89625aa629b1aa824bd95bfbf31e93174294faa8 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
rename to tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
index a14bb0396ece74c8de73008d2007bce5c763b0ed..d04c6958fbc466faba7c4de4be53710aedc8b3b2 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
rename to tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
index 06a86bbb8df4c11a471475c040b74099a6fe2361..1f7189f0eb2cd452882d79b1371e7e6baa9b629a 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
rename to tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
index 753b47f4d9fe3c30c75143bfa3c7225ab774645b..3eb7f5fd73247f86184fdca69cc8765551eec3ad 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import data
 from tensorflow.python import keras
+from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.estimator.estimator import Estimator
 from tensorflow.python.estimator.model_fn import EstimatorSpec
 from tensorflow.python.estimator.model_fn import ModeKeys
@@ -139,9 +138,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
     if use_trt:
       logging.info('Number of nodes before TF-TRT conversion: %d',
                    len(graph_def.node))
-      graph_def = trt_convert.create_inference_graph(
-          graph_def,
-          outputs=[OUTPUT_NODE_NAME],
+      converter = trt_convert.TrtGraphConverter(
+          input_graph_def=graph_def,
+          nodes_blacklist=[OUTPUT_NODE_NAME],
           max_batch_size=max_batch_size,
           precision_mode='INT8',
           # There is a 2GB GPU memory limit for each test, so we set
@@ -150,7 +149,8 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
           max_workspace_size_bytes=1 << 28,
           minimum_segment_size=2,
           use_calibration=False,
-      )
+          use_function_backup=False)
+      graph_def = converter.convert()
       logging.info('Number of nodes after TF-TRT conversion: %d',
                    len(graph_def.node))
       num_engines = len(
@@ -263,9 +263,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
   #     num_epochs=100,
   #     model_dir=model_dir)
   def testEval(self):
-    if not trt_convert.is_tensorrt_enabled():
+    if not is_tensorrt_enabled():
       return
-    model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata')
+    model_dir = test.test_src_dir_path('python/compiler/tensorrt/test/testdata')
 
     accuracy_tf_native = self._Run(
         is_training=False,
@@ -274,9 +274,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_native: %f', accuracy_tf_native)
-    self.assertAllClose(0.9662, accuracy_tf_native, rtol=1e-3, atol=1e-3)
+    self.assertAllClose(0.9662, accuracy_tf_native, rtol=3e-3, atol=3e-3)
 
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return
 
     accuracy_tf_trt = self._Run(
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/quantization_test.py
rename to tensorflow/python/compiler/tensorrt/test/quantization_test.py
index ce1b25ebf3c52ac5710dea654134925bb5b6ceca..3e1c9ff8ddc70469ba3516111b9d3821f1bbb6bc 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -72,7 +72,7 @@ class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=False)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Only test static engine mode, with or without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
@@ -96,7 +96,7 @@ class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=True)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Test static/dynamic engine with/without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/python/compiler/tensorrt/test/rank_two_test.py
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/rank_two_test.py
rename to tensorflow/python/compiler/tensorrt/test/rank_two_test.py
index 97159bb008068efbbcdb0fc6844890a42a08f46c..a951638b5055b66255bc93291ae906220590e64a 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/rank_two_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
similarity index 98%
rename from tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
rename to tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
index 7fb2cbde07c4987d925e9abc915ede52119ec6df..423d70f2e4ed7a6728bc3f77a8d598566c209d41 100644
--- a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/python/compiler/tensorrt/test/testdata/checkpoint
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/checkpoint
rename to tensorflow/python/compiler/tensorrt/test/testdata/checkpoint
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
rename to tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.index
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index
rename to tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.index
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
similarity index 75%
rename from tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
rename to tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
index db435814048e3810cdc7fc92a94d1d71dc36b962..3409f25e12a0c04c645b07bf05d4724dbc830e79 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
@@ -25,12 +25,10 @@ import warnings
 import numpy as np
 import six
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_ops
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -71,6 +69,10 @@ def IsQuantizationMode(mode):
   return mode == "INT8"
 
 
+def IsQuantizationWithCalibration(params):
+  return IsQuantizationMode(params.precision_mode) and params.use_calibration
+
+
 class GraphState(object):
   ORIGINAL = 0
   CALIBRATE = 1
@@ -110,7 +112,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   @property
   def trt_incompatible_op(self):
-    return math_ops.sin
+    return math_ops.erf
 
   @property
   def precision_modes(self):
@@ -147,12 +149,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         return s
       return s.decode("utf-8")
 
-  @classmethod
-  def setUpClass(cls):
-    """Setup method for the module."""
-    super(TfTrtIntegrationTestBase, cls).setUpClass()
-    trt_convert.enable_test_value()
-
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TfTrtIntegrationTestBase, self).__init__(methodName)
     self._trt_test_params = None
@@ -161,7 +157,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     """Setup method."""
     super(TfTrtIntegrationTestBase, self).setUp()
     warnings.simplefilter("always")
-    trt_convert.clear_test_values("")
 
   def GetParams(self):
     """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
@@ -197,39 +192,10 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     return not (IsQuantizationMode(run_params.precision_mode) and
                 not run_params.use_calibration)
 
-  def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True):
-    """Verify the state of a particular engine after sess.run()."""
-    if graph_state == GraphState.ORIGINAL:
-      self._ExpectCalibration(engine_name, "")
-      self._ExpectNativeSegment(engine_name, "")
-      self._ExpectTrtEngine(engine_name, "")
-    elif graph_state == GraphState.CALIBRATE:
-      self._ExpectCalibration(engine_name, "done")
-      self._ExpectNativeSegment(engine_name, "done")
-      self._ExpectTrtEngine(engine_name, "")
-    elif graph_state == GraphState.INFERENCE:
-      self._ExpectCalibration(engine_name, "")
-      if expect_run:
-        self._ExpectNativeSegment(engine_name, "")
-        self._ExpectTrtEngine(engine_name, "done")
-      else:
-        self._ExpectNativeSegment(engine_name, "done")
-        self._ExpectTrtEngine(engine_name, "")
-
-  def VerifyRun(self, run_params, graph_state):
-    """Verify the state of all engines after sess.run()."""
-    for engine_name in self.ExpectedEnginesToBuild(run_params):
-      expect_run = (engine_name in self.ExpectedEnginesToRun(run_params))
-      self.VerifyRunForEngine(engine_name, graph_state, expect_run)
-
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build, implemented by subclass."""
     raise NotImplementedError()
 
-  def ExpectedEnginesToRun(self, run_params):
-    """Return the expected engines to run."""
-    return self.ExpectedEnginesToBuild(run_params)
-
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
     return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-02
@@ -243,13 +209,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       self._trt_test_params = self.GetParams()
     return self._trt_test_params
 
-  def _PrepareRun(self, graph_state):
-    """Set up necessary testing environment before calling sess.run()."""
-    # Clear test values added by TRTEngineOp.
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
-
   def _GetGPUOptions(self):
     gpu_options = config_pb2.GPUOptions()
     gpu_options.allow_growth = True
@@ -258,16 +217,18 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     conversion_params = self.GetConversionParams(run_params)
-    if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
-          conversion_params.rewriter_config, conversion_params.max_batch_size,
+    if graph_state == GraphState.INFERENCE and run_params.use_optimizer:
+      rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
+          conversion_params.rewriter_config,
+          conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
           conversion_params.precision_mode,
           conversion_params.minimum_segment_size,
           conversion_params.is_dynamic_op,
           conversion_params.maximum_cached_engines,
           conversion_params.cached_engine_batches,
-          conversion_params.use_calibration)
+          conversion_params.use_calibration,
+          use_function_backup=IsQuantizationWithCalibration(conversion_params))
 
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
@@ -280,23 +241,22 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         gpu_options=self._GetGPUOptions(), graph_options=graph_options)
     return config
 
-  def _ExpectTestValue(self, engine_name, method, expected_value):
-    label = "%s:%s" % (engine_name, method)
-    actual_value = trt_convert.get_test_value(label)
-    self.assertEqual(
-        expected_value,
-        actual_value,
-        msg="Unexpected test value with label %s. Actual: %s; expected: %s" %
-        (label, actual_value, expected_value))
-
-  def _ExpectCalibration(self, engine_name, value):
-    self._ExpectTestValue(engine_name, "ExecuteCalibration", value)
-
-  def _ExpectTrtEngine(self, engine_name, value):
-    self._ExpectTestValue(engine_name, "ExecuteTrtEngine", value)
+  def _GetFeedNames(self):
+    params = self._GetParamsCached()
+    # Construct the feeds tensor names by appending :0 to the node names.
+    return [input_name + ":0" for input_name in params.input_names]
 
-  def _ExpectNativeSegment(self, engine_name, value):
-    self._ExpectTestValue(engine_name, "ExecuteNativeSegment", value)
+  def _GetFetchNames(self):
+    params = self._GetParamsCached()
+    # Construct the fetches tensor names by appending :0 to the node names.
+    return [output_name + ":0" for output_name in params.output_names]
+
+  def _GetFeedDict(self, inputs_data, input_shape_index):
+    assert input_shape_index < len(inputs_data)
+    feeds = self._GetFeedNames()
+    return {
+        feeds[i]: inputs_data[input_shape_index][i] for i in range(len(feeds))
+    }
 
   def _RunGraph(self,
                 run_params,
@@ -307,32 +267,22 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
                 num_runs=2):
     """Run given graphdef multiple times."""
     params = self._GetParamsCached()
-    for current_input_data in inputs_data:
-      assert len(params.input_names) == len(current_input_data)
+    for data in inputs_data:
+      assert len(params.input_names) == len(data)
 
-    vals = []
+    fetches = self._GetFetchNames()
     g = ops.Graph()
     with g.as_default():
-      io_ops = importer.import_graph_def(
-          graph_def=gdef,
-          return_elements=params.input_names + params.output_names,
-          name="")
-      inputs = [op.outputs[0] for op in io_ops[:len(params.input_names)]]
-      for current_input_data in inputs_data:
-        assert len(inputs) == len(current_input_data)
-      outputs = [op.outputs[0] for op in io_ops[len(params.input_names):]]
-      with self.test_session(
+      importer.import_graph_def(graph_def=gdef, name="")
+      with self.session(
           graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+        vals = []
         # Run for each input(s) shape
         for shape_index in range(len(inputs_data)):
           val = None
-          # Defaults to 2 runs to verify result across multiple runs is same.
           for _ in range(num_runs):
-            self._PrepareRun(graph_state)
-            new_val = sess.run(outputs, {
-                inputs[i]: inputs_data[shape_index][i]
-                for i in range(len(inputs))
-            })
+            new_val = sess.run(fetches,
+                               self._GetFeedDict(inputs_data, shape_index))
             output_len = len(params.expected_output_dims[shape_index])
             self.assertEqual(output_len, len(new_val))
             for i in range(output_len):
@@ -342,27 +292,16 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             if val is not None:
               self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
             val = new_val
-            self.VerifyRun(run_params, graph_state)
           vals.append(val)
-    return vals
-
-  # Use real data that is representative of the inference dataset
-  # for calibration. For this test script it is random data.
-  def _RunCalibration(self, run_params, gdef, inputs_data, config):
-    """Run calibration on given graph."""
-    return self._RunGraph(
-        run_params, gdef, inputs_data, config, GraphState.CALIBRATE, num_runs=5)
+        return vals
 
-  def _GetTrtGraphDef(self, run_params, graph_state, gdef):
-    """Return trt converted graphdef."""
+  def _CreateConverter(self, gdef, session_config, conversion_params):
+    """Return a TrtGraphConverter."""
     params = self._GetParamsCached()
-    conversion_params = self.GetConversionParams(run_params)
-    logging.info(conversion_params)
-
-    config_for_trt = self._GetConfigProto(run_params, graph_state)
-    return trt_convert.create_inference_graph(
+    converter = trt_convert.TrtGraphConverter(
         input_graph_def=gdef,
-        outputs=params.input_names + params.output_names,
+        nodes_blacklist=params.input_names + params.output_names,
+        session_config=session_config,
         max_batch_size=conversion_params.max_batch_size,
         max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
         precision_mode=conversion_params.precision_mode,
@@ -371,7 +310,42 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         maximum_cached_engines=conversion_params.maximum_cached_engines,
         cached_engine_batches=conversion_params.cached_engine_batches,
         use_calibration=conversion_params.use_calibration,
-        session_config=config_for_trt)
+        use_function_backup=IsQuantizationWithCalibration(conversion_params))
+    return converter
+
+  def _GetCalibratedInferGraph(self, run_params, gdef, inputs_data):
+    """Return trt converted graphdef in INT8 mode."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+    assert conversion_params.precision_mode == "INT8"
+    assert conversion_params.is_dynamic_op
+    assert conversion_params.maximum_cached_engines == 1
+    assert not conversion_params.cached_engine_batches
+    assert conversion_params.use_calibration
+    assert len(inputs_data) == 1  # We only support calibrating single engine.
+
+    session_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
+    logging.info("Running calibration graph, config:\n%s", str(session_config))
+
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    int8_gdef = converter.convert()
+    self._VerifyGraphDef(run_params, int8_gdef, GraphState.CALIBRATE)
+
+    return converter.calibrate(
+        fetch_names=self._GetFetchNames(),
+        num_runs=5,
+        feed_dict_fn=lambda: self._GetFeedDict(inputs_data, 0))
+
+  def _GetInferGraph(self, run_params, gdef):
+    """Return trt converted graphdef."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+
+    session_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
+    logging.info("Creating TRT graph for inference, config\n%s",
+                 str(session_config))
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    return converter.convert()
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -446,23 +420,29 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     self.assertEqual(
         expected_input_map,
         actual_input_map,
-        msg="expected:\n%s\nvs actual:\n%s" % (sorted(
-            expected_input_map.items()), sorted(actual_input_map.items())))
+        msg="expected:\n%s\nvs actual:\n%s" %
+        (sorted(expected_input_map.items()), sorted(actual_input_map.items())))
 
   def _VerifyGraphDef(self, run_params, gdef, graph_state):
     self._WriteGraph(run_params, gdef, graph_state)
 
     expected_engines = self.ExpectedEnginesToBuild(run_params)
     num_engines = 0
+    functions = [f.signature.name for f in gdef.library.function]
     for node in gdef.node:
       if node.op == "TRTEngineOp":
         logging.info("Found TRTEngineOp: " + node.name)
-    for node in gdef.node:
-      if node.op == "TRTEngineOp":
         num_engines += 1
-        self.assertTrue(node.name in expected_engines, node.name)
+        segment_funcdef_name = node.attr["segment_funcdef_name"].s
+        function_name = node.name + "_native_segment"
+        if IsQuantizationWithCalibration(run_params):
+          self.assertNotEmpty(segment_funcdef_name, node.name)
+          self.assertIn(function_name, functions)
+        else:
+          self.assertEmpty(segment_funcdef_name, node.name)
+          self.assertNotIn(function_name, functions)
+        self.assertIn(node.name, expected_engines)
         self.assertTrue(len(node.attr["serialized_segment"].s), node.name)
-        self.assertTrue(len(node.attr["segment_funcdef_name"].s), node.name)
         self.assertEqual(
             self._ToBytes(run_params.precision_mode),
             node.attr["precision_mode"].s, node.name)
@@ -521,9 +501,10 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             (scale * np.random.random_sample(dims)).astype(dtype))
       inputs_data.append(current_input_data)
 
+    # Verify original graph.
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
-    # Get reference result without running trt.
+    # Run original graph without trt to get reference result.
     config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
     logging.info("Running original graph w/o trt, config:\n%s",
                  str(config_no_trt))
@@ -533,27 +514,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     # Run calibration if necessary.
     if (IsQuantizationMode(run_params.precision_mode) and
         run_params.use_calibration):
-
-      calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
-      logging.info("Running calibration graph, config:\n%s", str(calib_config))
-      if run_params.use_optimizer:
-        result = self._RunCalibration(run_params, input_gdef, inputs_data,
-                                      calib_config)
-      else:
-        calib_gdef = self._GetTrtGraphDef(run_params, GraphState.CALIBRATE,
-                                          input_gdef)
-        self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
-        result = self._RunCalibration(run_params, calib_gdef, inputs_data,
-                                      calib_config)
-      infer_gdef = trt_convert.calib_graph_to_infer_graph(
-          calib_gdef, run_params.dynamic_engine)
+      infer_gdef = self._GetCalibratedInferGraph(run_params, input_gdef,
+                                                 inputs_data)
+      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
+    elif not run_params.use_optimizer:
+      infer_gdef = self._GetInferGraph(run_params, input_gdef)
       self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
-      self.assertAllClose(
-          ref_result,
-          result,
-          atol=self.ExpectedAbsoluteTolerance(run_params),
-          rtol=self.ExpectedRelativeTolerance(run_params))
     else:
       infer_gdef = input_gdef
 
@@ -561,11 +527,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
-    if not run_params.use_optimizer:
-      infer_gdef = self._GetTrtGraphDef(run_params, GraphState.INFERENCE,
-                                        infer_gdef)
-      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
     result = self._RunGraph(run_params, infer_gdef, inputs_data, infer_config,
                             GraphState.INFERENCE)
     self.assertAllClose(
@@ -610,9 +571,8 @@ def _AddTests(test_class):
   for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts:
     if IsQuantizationMode(precision_mode):
       if use_optimizer:
-        # TODO(aaroey): if use_optimizer is True we need to get the inference
-        # graphdef using custom python wrapper class, which is not currently
-        # supported yet.
+        # We ignore the use_optimizer option and always use TrtGraphConverter
+        # for INT8 mode, so no need to run it twice.
         continue
       if use_calibration and not dynamic_engine:
         # Static engine with use_calibration=False will be static, so we want to
@@ -639,5 +599,5 @@ def _AddTests(test_class):
     setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params))
 
 
-if trt_convert.is_tensorrt_enabled():
+if is_tensorrt_enabled():
   _AddTests(TfTrtIntegrationTestBase)
diff --git a/tensorflow/contrib/tensorrt/test/topk_test.py b/tensorflow/python/compiler/tensorrt/test/topk_test.py
similarity index 60%
rename from tensorflow/contrib/tensorrt/test/topk_test.py
rename to tensorflow/python/compiler/tensorrt/test/topk_test.py
index 633a51982b9a6acf1926033628793c1edbd2d118..1e2bf3b65c32054693e92a83d1ba2d9074387f2d 100644
--- a/tensorflow/contrib/tensorrt/test/topk_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/topk_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import constant_op
@@ -54,5 +54,36 @@ class TopKTest(trt_test.TfTrtIntegrationTestBase):
     return {"TRTEngineOp_0": ["Const", "TopK"]}
 
 
+class TopKOutputTypeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing that output type of engine using Top-K is set correctly."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 100]
+    k = 5
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      k_tensor = constant_op.constant(k, dtype=dtypes.int32, name="Const")
+      values, indices = nn_ops.top_k(x, k_tensor, name="TopK")
+      # Reshape will act as a layer between the TopK output and the engine
+      # output, requiring the output tensor of reshape to be set explicitly to
+      # int32.
+      indices = array_ops.reshape(indices, [100, 1, 5], name="Reshape")
+      values = array_ops.identity(values, name="output_values")
+      indices = array_ops.identity(indices, name="output_indices")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=["output_values", "output_indices"],
+        expected_output_dims=[[[100, k], [100, 1, k]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {"TRTEngineOp_0": ["Const", "TopK", "Reshape", "Reshape/shape"]}
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/python/compiler/tensorrt/test/unary_test.py
similarity index 97%
rename from tensorflow/contrib/tensorrt/test/unary_test.py
rename to tensorflow/python/compiler/tensorrt/test/unary_test.py
index 497ea2848aae42a61db4f8f5a5c973525d5892d9..83569bcfbf12a27fec8590d18a1b016b92a9cf86 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/unary_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
rename to tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
index b5fed73d2d75271e2c5c533670923d42f233e80b..97ee11747e889e4821c64c1cbafcfcee78d4405b 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
similarity index 96%
rename from tensorflow/contrib/tensorrt/test/vgg_block_test.py
rename to tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
index 307128f1a89c46d63e851b6a7cd2d6abe7e39ff8..a4fa1d67059093e93da1bda55a20ba75f45776ff 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/compiler/tensorrt/trt_conversion.i b/tensorflow/python/compiler/tensorrt/trt_conversion.i
new file mode 100644
index 0000000000000000000000000000000000000000..d6e8eac5836cc73a61db8c9aec4bb8f5da5f89b0
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/trt_conversion.i
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* Wrap trt_conversion */
+%{
+#define SWIG_FILE_WITH_INIT
+%}
+
+%{
+struct version_struct{
+  int vmajor;
+  int vminor;
+  int vpatch;
+};
+
+PyObject* version_helper(version_struct* in) {
+  PyObject *tuple(nullptr);
+  tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch);
+  if (!tuple) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Tuple creation from version structure failed!");
+    }
+    return NULL;
+  }
+  return tuple;
+}
+
+%}
+
+%typemap(out) version_struct {
+  PyObject *tuple = version_helper(&$1);
+  if (!tuple) SWIG_fail;
+  $result = tuple;
+}
+
+%{
+#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h"
+%}
+
+%ignore "";
+%rename("%s") get_linked_tensorrt_version;
+%rename("%s") get_loaded_tensorrt_version;
+%rename("%s") is_tensorrt_enabled;
+
+%{
+
+version_struct get_linked_tensorrt_version() {
+  // Return the version at the link time.
+  version_struct s;
+  tensorflow::tensorrt::GetLinkedTensorRTVersion(
+      &s.vmajor, &s.vminor, &s.vpatch);
+  return s;
+}
+
+version_struct get_loaded_tensorrt_version() {
+  // Return the version from the loaded library.
+  version_struct s;
+  tensorflow::tensorrt::GetLoadedTensorRTVersion(
+      &s.vmajor, &s.vminor, &s.vpatch);
+  return s;
+}
+
+bool is_tensorrt_enabled() {
+  return tensorflow::tensorrt::IsGoogleTensorRTEnabled();
+}
+
+%}
+
+version_struct get_linked_tensorrt_version();
+version_struct get_loaded_tensorrt_version();
+bool is_tensorrt_enabled();
+
+%rename("%s") "";
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..1363c8728121666869702b3b89a20ec45fc8a0ea
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/trt_convert.py
@@ -0,0 +1,876 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper conversion to trt_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six as _six
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import convert_to_constants
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.saved_model import builder
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import saver
+
+
+def _to_bytes(s):
+  """Encode s if it is a sequence of chars."""
+  if isinstance(s, _six.text_type):
+    return s.encode("utf-8", errors="surrogateescape")
+  return s
+
+
+def _to_string(s):
+  """Decode s if it is a sequence of bytes."""
+  if isinstance(s, _six.binary_type):
+    return s.decode("utf-8")
+  return s
+
+
+class GraphConverter(object):
+  """Base class for offline converters to optimize SavedModels/GraphDefs.
+
+  A `GraphConverter` object encapsulates the environment to convert (optimize) a
+  TensorFlow SavedModel or GraphDef.
+
+  To create a custom GraphConverter:
+
+  ```python
+  class MyGraphConverter(GraphConverter):
+    ...
+
+    def get_rewriter_config(self, rewriter_config_template=None):
+      my_rewriter_config = ...
+      return my_rewriter_config
+  ```
+
+  Then to run the conversion without quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  converted_graph_def = my_converter.convert()
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+
+  To run the conversion with quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  my_converter.convert()
+
+  # Run calibration 10 times.
+  converted_graph_def = my_converter.calibrate(
+      fetch_names=['output:0'],
+      num_runs=10,
+      feed_dict_fn=lambda: {'input:0': my_next_data()})
+
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+  """
+
+  # TODO(laigd): clean up the parameters.
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_saved_model_signature_key=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_saved_model_signature_key: the key of the signature to optimize the
+        graph for.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a RewriterConfig for conversion. If not
+        specified, a default ConfigProto will be used.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+    """
+    if context.executing_eagerly():
+      if input_graph_def or not input_saved_model_dir:
+        raise ValueError(
+            "TF 2.0 only supports conversion of SavedModel, please specify "
+            "input_saved_model_dir as input.")
+    else:
+      if input_graph_def and input_saved_model_dir:
+        raise ValueError(
+            "Can only specify one of input_graph_def and input_saved_model_dir")
+      if not input_graph_def and not input_saved_model_dir:
+        raise ValueError("Must specify one of input_graph_def and "
+                         "input_saved_model_dir")
+
+      self._input_graph_def = input_graph_def
+      self._nodes_blacklist = nodes_blacklist
+
+    self._input_saved_model_dir = input_saved_model_dir
+    self._converted = False
+    self._grappler_meta_graph_def = None
+
+    self._input_saved_model_tags = (
+        input_saved_model_tags or [tag_constants.SERVING])
+    self._input_saved_model_signature_key = (
+        input_saved_model_signature_key or
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
+    self._session_config = session_config or config_pb2.ConfigProto()
+
+    # For calibration usage.
+    self._calibration_graph = None
+    self._calibration_sess = None
+    self._calibration_data_collected = False
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        RewriterConfig for the conversion. The implementation should not modify
+        the template. If None, it will use a default one.
+
+    Returns:
+      A RewriterConfig proto which will be used to run the conversion using
+      Grappler.
+    """
+    raise NotImplementedError("get_rewriter_config")
+
+  def _run_conversion(self):
+    """Run Grappler's OptimizeGraph() tool to convert the graph."""
+    # Create custom ConfigProto for Grappler.
+    grappler_session_config = config_pb2.ConfigProto()
+    grappler_session_config.CopyFrom(self._session_config)
+    rewriter_config = None
+    if (grappler_session_config.HasField("graph_options") and
+        grappler_session_config.graph_options.HasField("rewrite_options")):
+      rewriter_config = grappler_session_config.graph_options.rewrite_options
+    custom_rewriter_config = self.get_rewriter_config(rewriter_config)
+    grappler_session_config.graph_options.rewrite_options.CopyFrom(
+        custom_rewriter_config)
+
+    # Run Grappler.
+    self._converted_graph_def = tf_optimizer.OptimizeGraph(
+        grappler_session_config,
+        self._grappler_meta_graph_def,
+        graph_id=b"tf_graph")
+    self._converted = True
+
+  def _add_nodes_blacklist(self):
+    if self._nodes_blacklist:
+      collection_def = self._grappler_meta_graph_def.collection_def["train_op"]
+      blacklist = collection_def.node_list.value
+      for i in self._nodes_blacklist:
+        if isinstance(i, ops.Tensor):
+          blacklist.append(_to_bytes(i.name))
+        else:
+          blacklist.append(_to_bytes(i))
+
+  def _convert_graph_def(self):
+    """Convert the input GraphDef."""
+    graph = ops.Graph()
+    with graph.as_default():
+      importer.import_graph_def(self._input_graph_def, name="")
+    self._grappler_meta_graph_def = saver.export_meta_graph(
+        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
+    self._add_nodes_blacklist()
+
+    self._run_conversion()
+
+  def _convert_saved_model(self):
+    """Convert the input SavedModel."""
+    graph = ops.Graph()
+    with session.Session(graph=graph, config=self._session_config) as sess:
+      input_meta_graph_def = loader.load(sess, self._input_saved_model_tags,
+                                         self._input_saved_model_dir)
+      input_signature_def = input_meta_graph_def.signature_def[
+          self._input_saved_model_signature_key]
+
+      def _gather_names(tensor_info):
+        """Get the node names from a TensorInfo."""
+        return set([tensor_info[key].name.split(":")[0] for key in tensor_info])
+
+      # Get input and outputs from all SignatureDef.
+      output_node_names = _gather_names(input_signature_def.inputs).union(
+          _gather_names(input_signature_def.outputs))
+
+      # Freeze the variables in the SavedModel graph and copy the frozen
+      # graph over.
+      frozen_graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph.as_graph_def(add_shapes=True),
+          list(output_node_names))
+      self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
+      self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
+
+      # Copy the collections that are not variables.
+      for key in input_meta_graph_def.collection_def:
+        # TODO(laigd): currently we use the collection key to filter out
+        # collections that depend on variable ops, but this may miss some
+        # other user-defined collections. A better way would be to use
+        # CollectionDef::NodeList for the filtering.
+        if key not in [
+            "variables", "local_variables", "model_variables",
+            "trainable_variables", "train_op", "table_initializer"
+        ]:
+          self._grappler_meta_graph_def.collection_def[key].CopyFrom(
+              input_meta_graph_def.collection_def[key])
+
+      self._add_nodes_blacklist()
+
+      # Copy other information.
+      self._grappler_meta_graph_def.meta_info_def.CopyFrom(
+          input_meta_graph_def.meta_info_def)
+      self._grappler_meta_graph_def.signature_def[
+          self._input_saved_model_signature_key].CopyFrom(input_signature_def)
+      # TODO(laigd): maybe add back AssetFileDef.
+
+    self._run_conversion()
+
+  # TODO(laigd): provide a utility function to optimize a ConcreteFunction and
+  # use it here (b/124792963).
+  def _convert_saved_model_v2(self):
+    """Convert the input SavedModel in 2.0 format."""
+    self._saved_model = load.load(self._input_saved_model_dir,
+                                  self._input_saved_model_tags)
+    func = self._saved_model.signatures[self._input_saved_model_signature_key]
+    frozen_func = convert_to_constants.convert_variables_to_constants_v2(func)
+    self._grappler_meta_graph_def = saver.export_meta_graph(
+        graph_def=frozen_func.graph.as_graph_def(), graph=frozen_func.graph)
+
+    # Add a collection 'train_op' so that Grappler knows the outputs.
+    fetch_collection = meta_graph_pb2.CollectionDef()
+    for array in func.inputs + func.outputs:
+      fetch_collection.node_list.value.append(array.name)
+    self._grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
+        fetch_collection)
+
+    # Run TRT optimizer in Grappler to convert the graph.
+    self._run_conversion()
+
+    def _get_tensor(graph, tensors):
+      new_tensors = []
+      for tensor in tensors:
+        new_tensor = graph.get_tensor_by_name(tensor.name)
+        new_tensor.set_shape(tensor.shape)
+        new_tensors.append(new_tensor)
+      return new_tensors
+
+    # TODO(laigd): do we need to use different name e.g. "trt_func_graph"?
+    converted_graph = func_graph.FuncGraph(func.graph.name)
+    with converted_graph.as_default():
+      importer.import_graph_def(self._converted_graph_def, name="")
+
+    converted_graph.inputs = _get_tensor(converted_graph, func.graph.inputs)
+    converted_graph.outputs = _get_tensor(converted_graph, func.graph.outputs)
+    converted_graph.structured_outputs = func.graph.structured_outputs
+    converted_graph.structured_input_signature = (
+        func.graph.structured_input_signature)
+
+    # pylint: disable=protected-access
+    # TODO(laigd): should we set up the signature as well?
+    self._converted_func = function.ConcreteFunction(
+        converted_graph, attrs=None, signature=None)
+    self._converted_func.add_to_graph()
+    self._converted_func._arg_keywords = func._arg_keywords
+    self._converted_func._num_positional_args = func._num_positional_args
+    self._converted_func._captured_inputs = func._captured_inputs
+    self._converted_func.graph.variables = func.graph.variables
+    # pylint: enable=protected-access
+
+  def convert(self):
+    """Run the conversion.
+
+    Returns:
+      The converted GraphDef for TF 1.x, or the converted ConcreteFunction in TF
+      2.0+.
+    """
+    assert not self._converted
+
+    if context.executing_eagerly():
+      self._convert_saved_model_v2()
+      return self._converted_func
+    else:
+      if self._input_graph_def:
+        self._convert_graph_def()
+      else:
+        self._convert_saved_model()
+      return self._converted_graph_def
+
+  def calibrate(self,
+                fetch_names,
+                num_runs,
+                feed_dict_fn=None,
+                input_map_fn=None):
+    """Run the calibration and return the calibrated GraphDef.
+
+    Args:
+      fetch_names: a list of output tensor name to fetch during calibration.
+      num_runs: number of runs of the graph during calibration.
+      feed_dict_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to values (e.g. Python list,
+        numpy arrays, etc). One and only one of `feed_dict_fn` and
+        `input_map_fn` should be specified.
+      input_map_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to Tensor objects. The values
+        of the named input tensors in the GraphDef to be calibrated will be
+        re-mapped to the respective `Tensor` values during calibration. One and
+        only one of `feed_dict_fn` and `input_map_fn` should be specified.
+
+    Raises:
+      ValueError: if the input combination is invalid.
+      RuntimeError: if this method is called in eager mode.
+
+    Returns:
+      The GraphDef after the calibration.
+    """
+    assert self._converted
+    assert not self._calibration_sess
+
+    if context.executing_eagerly():
+      raise RuntimeError("Calibration for TF 2.0 is not supported yet.")
+
+    if (feed_dict_fn and input_map_fn) or (not feed_dict_fn and
+                                           not input_map_fn):
+      raise ValueError(
+          "Should specify one and only one of feed_dict_fn and input_map_fn.")
+
+    self._calibration_graph = ops.Graph()
+    with self._calibration_graph.as_default():
+      fetches = importer.import_graph_def(
+          self._converted_graph_def,
+          input_map=input_map_fn() if input_map_fn else None,
+          return_elements=fetch_names,
+          name="")
+    self._calibration_sess = session.Session(
+        graph=self._calibration_graph, config=self._session_config)
+
+    for _ in range(num_runs):
+      self._calibration_sess.run(
+          fetches, feed_dict=feed_dict_fn() if feed_dict_fn else None)
+
+    self.finalize_calibration()
+    return self._converted_graph_def
+
+  def finalize_calibration(self):
+    """Clean up calibration resources and finalize the calibration.
+
+    Implementations need to close self._calibration_sess before returning.
+    """
+    raise NotImplementedError("finalize_calibration")
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel.
+
+    Args:
+      output_saved_model_dir: construct a SavedModel using the converted
+        GraphDef and save it to the specified directory. This option only works
+        when the input graph is loaded from a SavedModel, i.e. when
+        input_saved_model_dir is specified and input_graph_def is None in
+        __init__().
+
+    Raises:
+      ValueError: if the input to the converter is a GraphDef instead of a
+      SavedModel.
+    """
+    assert self._converted
+
+    if context.executing_eagerly():
+      # Rewrite the signature map using the optimized ConcreteFunction.
+      signatures = {
+          key: value for key, value in self._saved_model.signatures.items()
+      }
+      signatures[self._input_saved_model_signature_key] = self._converted_func
+      save.save(self._saved_model, output_saved_model_dir, signatures)
+    else:
+      if self._input_graph_def:
+        raise ValueError(
+            "Not able to save to a SavedModel since input is a GraphDef")
+
+      # Write the transformed graphdef as SavedModel.
+      saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
+      with ops.Graph().as_default():
+        importer.import_graph_def(self._converted_graph_def, name="")
+        # We don't use any specific converter here.
+        with session.Session(config=self._session_config) as sess:
+          saved_model_builder.add_meta_graph_and_variables(
+              sess,
+              self._input_saved_model_tags,
+              signature_def_map=self._grappler_meta_graph_def.signature_def)
+      # Ignore other meta graphs from the input SavedModel.
+      saved_model_builder.save()
+
+
+class TrtPrecisionMode(object):
+  FP32 = "FP32"
+  FP16 = "FP16"
+  INT8 = "INT8"
+
+  @staticmethod
+  def supported_precision_modes():
+    return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
+
+
+# Use a large enough number as the default max_workspace_size for TRT engines,
+# so it can produce reasonable performance results with the default.
+DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30
+
+
+class TrtGraphConverter(GraphConverter):
+  """A GraphConverter for TRT transformation."""
+
+  _TRT_CALIBRATION_RESOURCE_CONTAINER_NAME = "TF_TRT_Calibration"
+
+  @classmethod
+  def get_tensorrt_rewriter_config(
+      cls,
+      rewriter_config_template=None,
+      max_batch_size=1,
+      max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+      precision_mode=TrtPrecisionMode.FP32,
+      minimum_segment_size=3,
+      is_dynamic_op=False,
+      maximum_cached_engines=1,
+      cached_engine_batches=None,
+      use_calibration=True,
+      use_function_backup=True):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        TRT-enabled RewriterConfig. If None, it will use a default one.
+      max_batch_size: max size for the input batch
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph by running calibration with calibrate(). If set to False,
+        quantization nodes will be expected for every tensor in the graph
+        (exlcuding those which will be fused). If a range is missing, an error
+        will occur. Please note that accuracy may be negatively affected if
+        there is a mismatch between which tensors TRT quantizes and which
+        tensors were trained with fake quantization.
+      use_function_backup: if set to True, it will create a FunctionDef for each
+        subgraph that is converted to TRT op, and if TRT ops fail to execute at
+        runtime, it'll invoke that function as a fallback.
+
+    Returns:
+      A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
+
+    Raises:
+      TypeError: if any of the parameters are of unexpected type.
+      ValueError: if any of the parameters are of unexpected value.
+    """
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+    # Import a random symbol to trigger loading of TRT library.
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    # pylint: enable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+
+    if rewriter_config_template is not None and not isinstance(
+        rewriter_config_template, rewriter_config_pb2.RewriterConfig):
+      raise TypeError(
+          "rewriter_config_template should be a RewriterConfig proto.")
+
+    rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
+    if rewriter_config_template is None:
+      # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
+      # need to run constant folding again.
+      rewriter_config_with_trt.optimizers.extend(
+          ["constfold", "layout", "constfold"])
+      rewriter_config_with_trt.meta_optimizer_iterations = (
+          rewriter_config_pb2.RewriterConfig.ONE)
+    else:
+      rewriter_config_with_trt.CopyFrom(rewriter_config_template)
+
+    optimizer = rewriter_config_with_trt.custom_optimizers.add()
+    optimizer.name = "TensorRTOptimizer"
+    optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
+    optimizer.parameter_map["max_batch_size"].i = max_batch_size
+    optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
+    optimizer.parameter_map[
+        "max_workspace_size_bytes"].i = max_workspace_size_bytes
+    optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
+    optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
+    if cached_engine_batches:
+      optimizer.parameter_map["cached_engine_batches"].list.i.extend(
+          cached_engine_batches)
+    optimizer.parameter_map["use_calibration"].b = use_calibration
+    optimizer.parameter_map["use_function_backup"].b = use_function_backup
+    return rewriter_config_with_trt
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_saved_model_signature_key=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None,
+               max_batch_size=1,
+               max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+               precision_mode=TrtPrecisionMode.FP32,
+               minimum_segment_size=3,
+               is_dynamic_op=False,
+               maximum_cached_engines=1,
+               cached_engine_batches=None,
+               use_calibration=True,
+               use_function_backup=True):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_saved_model_signature_key: the key of the signature to optimize the
+        graph for.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a TRT-enabled ConfigProto for conversion. If not
+        specified, a default ConfigProto will be used.
+      max_batch_size: max size for the input batch.
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph by running calibration with calibrate(). If set to False,
+        quantization nodes will be expected for every tensor in the graph
+        (exlcuding those which will be fused). If a range is missing, an error
+        will occur. Please note that accuracy may be negatively affected if
+        there is a mismatch between which tensors TRT quantizes and which
+        tensors were trained with fake quantization.
+      use_function_backup: if set to True, it will create a FunctionDef for each
+        subgraph that is converted to TRT op, and if TRT ops fail to execute at
+        runtime, it'll invoke that function as a fallback.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+      RuntimeError: if the TensorRT library version is incompatible.
+    """
+    super(TrtGraphConverter, self).__init__(
+        input_saved_model_dir=input_saved_model_dir,
+        input_saved_model_tags=input_saved_model_tags,
+        input_saved_model_signature_key=input_saved_model_signature_key,
+        input_graph_def=input_graph_def,
+        nodes_blacklist=nodes_blacklist,
+        session_config=session_config)
+
+    # TODO(laigd): move all the validations below to
+    # get_tensorrt_rewriter_config().
+
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # Check compatibility of TensorRT version.
+    compiled_version = get_linked_tensorrt_version()
+    loaded_version = get_loaded_tensorrt_version()
+    tf_logging.info("Linked TensorRT version: %s" % str(compiled_version))
+    tf_logging.info("Loaded TensorRT version: %s" % str(loaded_version))
+    version_mismatch = False
+    if loaded_version[0] < compiled_version[0]:
+      tf_logging.error(
+          "TensorRT version mismatch. Tensorflow was compiled against " +
+          "TensorRT %s but library loaded from environment is TensorRT %s" %
+          (".".join([str(x) for x in compiled_version]),
+           ".".join([str(x) for x in loaded_version])) +
+          ". Please make sure that correct version of TensorRT " +
+          "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
+      raise RuntimeError("Incompatible TensorRT library version")
+    for i in zip(loaded_version, compiled_version):
+      if i[0] != i[1]:
+        tf_logging.warn("TensorRT mismatch. Compiled against version " +
+                        "%s, but loaded %s. Things may not work" %
+                        (".".join([str(x) for x in compiled_version]),
+                         ".".join([str(x) for x in loaded_version])))
+        version_mismatch = True
+        break
+    if not version_mismatch:
+      tf_logging.info("Running against TensorRT version %s" %
+                      ".".join([str(x) for x in loaded_version]))
+
+    # Check input arguments.
+    supported_precision_modes = TrtPrecisionMode.supported_precision_modes()
+    if precision_mode not in supported_precision_modes:
+      raise ValueError(("precision mode '{}' is not supported."
+                        "It should be one of {}").format(
+                            precision_mode, supported_precision_modes))
+
+    if cached_engine_batches:
+      if not isinstance(cached_engine_batches, list):
+        raise TypeError("cached_engine_batches should be a list.")
+      if len(cached_engine_batches) > maximum_cached_engines:
+        raise ValueError("cached_engine_batches should not contain more than "
+                         "maximum_cached_engines items.")
+
+    self._need_calibration = (
+        precision_mode == TrtPrecisionMode.INT8 and use_calibration)
+    self._use_function_backup = use_function_backup
+
+    # TODO(laigd): consider provide a mechanism to remove the fallback path
+    # after calibration is done.
+    if self._need_calibration and not use_function_backup:
+      raise ValueError(
+          "Calibration requires enabling fallback to TF function execution.")
+
+    # TODO(laigd):
+    # - Get rid of is_dynamic_op option, it should always be True, and it should
+    #   accept N shapes as input.
+    # - Verify in int8 mode that maximum_cached_engines and
+    #   cached_engine_batches are set appropriately.
+    # - If it fails to build the int8 engine it should return error.
+    self._max_batch_size = max_batch_size
+    self._max_workspace_size_bytes = max_workspace_size_bytes
+    self._precision_mode = precision_mode
+    self._minimum_segment_size = minimum_segment_size
+    self._is_dynamic_op = is_dynamic_op
+    self._maximum_cached_engines = maximum_cached_engines
+    self._cached_engine_batches = cached_engine_batches
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    return TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template,
+        max_batch_size=self._max_batch_size,
+        max_workspace_size_bytes=self._max_workspace_size_bytes,
+        precision_mode=self._precision_mode,
+        minimum_segment_size=self._minimum_segment_size,
+        is_dynamic_op=self._is_dynamic_op,
+        maximum_cached_engines=self._maximum_cached_engines,
+        cached_engine_batches=self._cached_engine_batches,
+        use_calibration=self._need_calibration,
+        use_function_backup=self._use_function_backup)
+
+  def finalize_calibration(self):
+    assert self._need_calibration
+    assert self._converted
+    assert not self._calibration_data_collected
+
+    # Lazily load the op, since it's not available in cpu-only builds. Importing
+    # this at top will cause tests that imports TF-TRT fail when they're built
+    # and run without CUDA/GPU.
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import get_serialized_resource_op
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # TODO(laigd): a better way would be to use self._calibration_sess to list
+    # all the devices, add one get_serialized_resource_op for each device, and
+    # fetch each such op for every resource until its found. This can work
+    # even when the device of the TRTEngineOp is empty or not fully specified.
+
+    # Maps device name to the corresponding get_serialized_resource_op.
+    device_to_get_resource_op_map = {}
+
+    with self._calibration_graph.as_default():
+      container_input = array_ops.placeholder(dtypes.string)
+      resource_name_input = array_ops.placeholder(dtypes.string)
+
+      for node in self._converted_graph_def.node:
+        if node.op == "TRTEngineOp":
+          # Adds the get_serialized_resource_op for the device if not done
+          # before. We only add one such op for each device.
+          # TODO(laigd): What if the device is empty?????
+          if node.device not in device_to_get_resource_op_map:
+            with self._calibration_graph.device(node.device):
+              serialized_resources_output = (
+                  get_serialized_resource_op(container_input,
+                                             resource_name_input))
+            device_to_get_resource_op_map[node.device] = (
+                serialized_resources_output)
+
+          # Get the calibration resource.
+          calibration_result = self._calibration_sess.run(
+              device_to_get_resource_op_map[node.device],
+              feed_dict={
+                  container_input:
+                      TrtGraphConverter
+                      ._TRT_CALIBRATION_RESOURCE_CONTAINER_NAME,
+                  resource_name_input:
+                      node.name
+              })
+          node.attr["calibration_data"].s = calibration_result
+
+    self._calibration_data_collected = True
+    self._calibration_sess.close()
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel."""
+    if self._need_calibration:
+      assert self._calibration_data_collected
+    super(TrtGraphConverter, self).save(output_saved_model_dir)
+
+
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    input_saved_model_signature_key=None,
+    output_saved_model_dir=None,
+    session_config=None):
+  """Python wrapper for the TRT transformation.
+
+  Args:
+    input_graph_def: a GraphDef object containing a model to be transformed. If
+      set to None, the graph will be read from the SavedModel loaded from
+      input_saved_model_dir.
+    outputs: list of tensors or node names for the model outputs. Only used when
+      input_graph_def is not None.
+    max_batch_size: max size for the input batch.
+    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+      engine can use at execution time. This corresponds to the 'workspaceSize'
+      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+    minimum_segment_size: the minimum number of nodes required for a subgraph to
+      be replaced by TRTEngineOp.
+    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
+      network and engine at run time.
+    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
+      If the number of cached engines is already at max but none of them can
+      serve the input, the TRTEngineOp will fall back to run the TF function
+      based on which the TRTEngineOp is created.
+    cached_engine_batches: a list of batch sizes used to create cached engines,
+      only used when is_dynamic_op is True. The length of the list should be <=
+      maximum_cached_engines, and the dynamic TRT op will use this list to
+      determine the batch sizes of the cached engines, instead of making the
+      decision on the fly. This is useful when we know the most common batch
+      size(s) the application is going to generate.
+    input_saved_model_dir: the directory to load the SavedModel which contains
+      the input graph to transforms. Used only when input_graph_def is None.
+    input_saved_model_tags: list of tags to load the SavedModel.
+    input_saved_model_signature_key: the key of the signature to optimize the
+      graph for.
+    output_saved_model_dir: if not None, construct a SavedModel using the
+      returned GraphDef and save it to the specified directory. This option only
+      works when the input graph is loaded from a SavedModel, i.e. when
+      input_saved_model_dir is specified and input_graph_def is None.
+    session_config: the ConfigProto used to create a Session. It's also used as
+      a template to create a TRT-enabled ConfigProto for conversion. If not
+      specified, a default ConfigProto will be used.
+
+  Returns:
+    A GraphDef transformed from input_graph_def (or the SavedModel graph def
+    loaded from input_saved_model_dir, if input_graph_def is not present), where
+    all TRT compatible subgraphs are replaced with TRTEngineOps, and a TF
+    function is added for each of the subgraphs.
+
+    If is_dynamic_op is True, each TRTEngineOp will contain a serialized
+    subgraph GraphDef, which will be converted to a TRT engine at execution time
+    and the TRT engine will be cached for future usage. A new TRT engine will be
+    created each time when none of the cached engines match the input shapes. If
+    it fails to execute the TRT engine or the number of cached engines reaches
+    maximum_cached_engines, the op will fall back to call the corresponding TF
+    function.
+
+    If is_dynamic_op is False, each TRTEngineOp will contain a serialized TRT
+    engine created from the corresponding subgraph. No more engines will be
+    created on the fly, and the op will fall back to call the corresponding TF
+    function when it fails to execute the engine.
+
+  Raises:
+    ValueError: if the combination of the parameters is invalid.
+  """
+  trt_converter = TrtGraphConverter(
+      input_saved_model_dir=input_saved_model_dir,
+      input_saved_model_tags=input_saved_model_tags,
+      input_saved_model_signature_key=input_saved_model_signature_key,
+      input_graph_def=input_graph_def,
+      nodes_blacklist=outputs,
+      session_config=session_config,
+      max_batch_size=max_batch_size,
+      max_workspace_size_bytes=max_workspace_size_bytes,
+      precision_mode=precision_mode,
+      minimum_segment_size=minimum_segment_size,
+      is_dynamic_op=is_dynamic_op,
+      maximum_cached_engines=maximum_cached_engines,
+      cached_engine_batches=cached_engine_batches,
+      use_calibration=False)
+  converted_graph_def = trt_converter.convert()
+  if output_saved_model_dir:
+    trt_converter.save(output_saved_model_dir)
+  return converted_graph_def
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74fb5eadd1a54d99a3ea4868a391824dc55c6d30
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -0,0 +1,472 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import builder
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.saved_model import utils
+from tensorflow.python.tools import saved_model_utils
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.training.tracking import tracking
+
+
+class TrtConvertTest(test_util.TensorFlowTestCase):
+  """Class to test Tensorflow-TensorRT integration python API."""
+
+  # Use a small max_workspace_size for tests so they don't consume too much GPU
+  # memory.
+  _TRT_MAX_WORKSPACE_SIZE_BYTES = 2 << 20
+
+  def testGetTensorrtRewriterConfig(self):
+    """Test case for TrtGraphConverter.get_tensorrt_rewriter_config()."""
+    if not is_tensorrt_enabled():
+      return
+    rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template=None,
+        max_batch_size=128,
+        max_workspace_size_bytes=1234,
+        precision_mode="INT8",
+        minimum_segment_size=10,
+        is_dynamic_op=True,
+        maximum_cached_engines=2,
+        cached_engine_batches=[1, 128])
+    self.assertEqual(["constfold", "layout", "constfold"],
+                     rewriter_cfg.optimizers)
+    self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
+                     rewriter_cfg.meta_optimizer_iterations)
+    trt_optimizer = None
+    for optimizer in rewriter_cfg.custom_optimizers:
+      if optimizer.name == "TensorRTOptimizer":
+        self.assertTrue(trt_optimizer is None)
+        trt_optimizer = optimizer
+    self.assertTrue(trt_optimizer is not None)
+    for key in [
+        "minimum_segment_size", "max_batch_size", "is_dynamic_op",
+        "max_workspace_size_bytes", "precision_mode", "maximum_cached_engines",
+        "cached_engine_batches"
+    ]:
+      self.assertTrue(key in trt_optimizer.parameter_map)
+    self.assertEqual(10, trt_optimizer.parameter_map["minimum_segment_size"].i)
+    self.assertEqual(128, trt_optimizer.parameter_map["max_batch_size"].i)
+    self.assertEqual(True, trt_optimizer.parameter_map["is_dynamic_op"].b)
+    self.assertEqual(1234,
+                     trt_optimizer.parameter_map["max_workspace_size_bytes"].i)
+    self.assertEqual(
+        trt_convert._to_bytes("INT8"),
+        trt_optimizer.parameter_map["precision_mode"].s)
+    self.assertEqual(2, trt_optimizer.parameter_map["maximum_cached_engines"].i)
+    self.assertEqual(
+        [1, 128], trt_optimizer.parameter_map["cached_engine_batches"].list.i)
+
+  def _GetConfigProto(self):
+    """Get ConfigProto for session creation."""
+    config = config_pb2.ConfigProto(
+        gpu_options=config_pb2.GPUOptions(allow_growth=True))
+    return config
+
+  def _GetGraph(self):
+    """Get the graph for testing."""
+    # The graph computes (input+1)^2, it looks like:
+    #
+    # input (Placeholder)  v1 (Variable)
+    #               |   \ /
+    #                \   +
+    #                 \ / \
+    #                  *   |
+    #                   \ /
+    #                    +
+    #                    |
+    #                 output (Identity)
+    g = ops.Graph()
+    with g.as_default():
+      with g.device("/GPU:0"):
+        inp = array_ops.placeholder(
+            dtype=dtypes.float32, shape=[None, 1, 1], name="input")
+        var = variables.VariableV1([[[1.0]]],
+                                   dtype=dtypes.float32,
+                                   name="v1",
+                                   use_resource=False)
+        add = inp + var.value()
+        mul = inp * add
+        add = mul + add
+        out = array_ops.identity(add, name="output")
+    return g, var, inp, out
+
+  def _GetGraphDef(self):
+    """Get the graph def for testing."""
+    g, var, _, _ = self._GetGraph()
+    with self.session(graph=g, config=self._GetConfigProto()) as sess:
+      sess.run(var.initializer)
+      graph_def = graph_util.convert_variables_to_constants(
+          sess, g.as_graph_def(add_shapes=True), ["output"])
+    node_name_to_op = {node.name: node.op for node in graph_def.node}
+    self.assertEqual({
+        "v1": "Const",
+        "v1/read": "Identity",
+        "input": "Placeholder",
+        "add": "Add",
+        "mul": "Mul",
+        "add_1": "Add",
+        "output": "Identity"
+    }, node_name_to_op)
+    return graph_def
+
+  def _WriteInputSavedModel(self, input_saved_model_dir):
+    """Write the saved model as an input for testing."""
+    g, var, inp, out = self._GetGraph()
+    signature_def = signature_def_utils.build_signature_def(
+        inputs={"myinput": utils.build_tensor_info(inp)},
+        outputs={"myoutput": utils.build_tensor_info(out)},
+        method_name=signature_constants.PREDICT_METHOD_NAME)
+    saved_model_builder = builder.SavedModelBuilder(input_saved_model_dir)
+    with self.session(graph=g, config=self._GetConfigProto()) as sess:
+      sess.run(var.initializer)
+      saved_model_builder.add_meta_graph_and_variables(
+          sess, [tag_constants.SERVING],
+          signature_def_map={"mypredict": signature_def})
+    saved_model_builder.save()
+
+  def _ConvertGraph(self,
+                    input_saved_model_dir=None,
+                    output_saved_model_dir=None,
+                    need_calibration=False,
+                    max_batch_size=1,
+                    minimum_segment_size=3,
+                    is_dynamic_op=False,
+                    maximum_cached_engines=1,
+                    use_function_backup=False):
+    """Helper method to convert a GraphDef or SavedModel using TF-TRT."""
+    converter = trt_convert.TrtGraphConverter(
+        input_saved_model_dir=input_saved_model_dir,
+        input_saved_model_signature_key="mypredict",
+        input_graph_def=None if input_saved_model_dir else self._GetGraphDef(),
+        nodes_blacklist=["output"],
+        session_config=self._GetConfigProto(),
+        max_batch_size=max_batch_size,
+        max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
+        precision_mode=(trt_convert.TrtPrecisionMode.INT8 if need_calibration
+                        else trt_convert.TrtPrecisionMode.FP32),
+        minimum_segment_size=minimum_segment_size,
+        is_dynamic_op=is_dynamic_op,
+        maximum_cached_engines=maximum_cached_engines,
+        use_function_backup=use_function_backup)
+    conversion_result = converter.convert()
+
+    if context.executing_eagerly():
+      output_graph_def = conversion_result.graph.as_graph_def()
+    else:
+      output_graph_def = conversion_result
+
+      if need_calibration:
+
+        class CalibrationData(object):
+
+          def __init__(self):
+            self._data = 0
+
+          def next(self):
+            self._data += 1
+            return {"input:0": [[[self._data]]]}
+
+        output_graph_def = converter.calibrate(
+            fetch_names=["output:0"],
+            num_runs=10,
+            feed_dict_fn=CalibrationData().next)
+
+    if output_saved_model_dir is not None:
+      converter.save(output_saved_model_dir=output_saved_model_dir)
+    return output_graph_def
+
+  def _TestTrtGraphConverter(self,
+                             input_saved_model_dir=None,
+                             output_saved_model_dir=None,
+                             need_calibration=False,
+                             is_dynamic_op=False):
+    """General method to test trt_convert.TrtGraphConverter()."""
+    output_graph_def = self._ConvertGraph(
+        input_saved_model_dir=input_saved_model_dir,
+        output_saved_model_dir=output_saved_model_dir,
+        need_calibration=need_calibration,
+        is_dynamic_op=is_dynamic_op,
+        use_function_backup=need_calibration)
+    graph_defs_to_verify = [output_graph_def]
+
+    if output_saved_model_dir:
+      if context.executing_eagerly():
+        root = load.load(output_saved_model_dir)
+        saved_model_graph_def = root.signatures[
+            signature_constants
+            .DEFAULT_SERVING_SIGNATURE_DEF_KEY].graph.as_graph_def()
+      else:
+        saved_model_graph_def = saved_model_utils.get_meta_graph_def(
+            output_saved_model_dir, tag_constants.SERVING).graph_def
+      self.assertTrue(isinstance(saved_model_graph_def, graph_pb2.GraphDef))
+      graph_defs_to_verify.append(saved_model_graph_def)
+
+    for graph_def in graph_defs_to_verify:
+      node_name_to_op = {node.name: node.op for node in graph_def.node}
+      if context.executing_eagerly():
+        # In V2 the actual graph could be inside a function.
+        for func in graph_def.library.function:
+          node_name_to_op.update({node.name: node.op for node in func.node_def})
+        self.assertIn("TRTEngineOp_0", node_name_to_op)
+        self.assertEqual("TRTEngineOp", node_name_to_op["TRTEngineOp_0"])
+      else:
+        self.assertEqual({
+            "input": "Placeholder",
+            "TRTEngineOp_0": "TRTEngineOp",
+            "output": "Identity"
+        }, node_name_to_op)
+
+      if need_calibration:
+        trt_engine_nodes = [
+            node for node in graph_def.node if node.op == "TRTEngineOp"
+        ]
+        self.assertNotEmpty(trt_engine_nodes)
+        for node in trt_engine_nodes:
+          self.assertTrue(len(node.attr["calibration_data"].s))
+        # Run the calibrated graph.
+        # TODO(laigd): consider having some input where the answer is different.
+        with ops.Graph().as_default():
+          importer.import_graph_def(graph_def, name="")
+          with self.session(config=self._GetConfigProto()) as sess:
+            for test_data in range(10):
+              self.assertEqual((test_data + 1.0)**2,
+                               sess.run(
+                                   "output:0",
+                                   feed_dict={"input:0": [[[test_data]]]}))
+
+  @test_util.deprecated_graph_mode_only
+  def testTrtGraphConverter_BasicConversion(self):
+    """Test case for trt_convert.TrtGraphConverter()."""
+    if not is_tensorrt_enabled():
+      return
+
+    tmp_dir = self.get_temp_dir()
+    input_saved_model_dir = os.path.join(tmp_dir, "in_dir1")
+    self._WriteInputSavedModel(input_saved_model_dir)
+
+    for need_calibration in [False, True]:
+      # Use GraphDef as input.
+      self._TestTrtGraphConverter()
+
+      # Use SavedModel as input.
+      output_saved_model_dir = os.path.join(
+          tmp_dir, "out_dir1%s" % ("_int8" if need_calibration else ""))
+      self._TestTrtGraphConverter(
+          input_saved_model_dir=input_saved_model_dir,
+          output_saved_model_dir=output_saved_model_dir,
+          need_calibration=need_calibration)
+
+  @test_util.run_v2_only
+  def testTrtGraphConverter_BasicConversion_v2(self):
+    """Test case for trt_convert.TrtGraphConverter()."""
+    if not is_tensorrt_enabled():
+      return
+
+    # TODO(laigd): we need to use ops like conv2d so Grappler can infer the
+    # shapes (at least rank) of the tensors, so we're able to build an TRT
+    # engine in dynamic mode. Currently shape information is not propagate from
+    # ConcreteFunction to GraphDef, need to investigate and fix it.
+    class SimpleModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.v = None
+
+      @def_function.function(input_signature=[
+          tensor_spec.TensorSpec(shape=[None, 24, 24, 2], dtype=dtypes.float32)
+      ])
+      def run(self, inp):
+        if self.v is None:
+          self.v = variables.Variable([[[[1., 0.5, 4., 6., 0.5, 1.],
+                                         [1., 0.5, 1., 1., 0.5, 1.]]]])
+        conv = gen_nn_ops.conv2d(
+            input=inp, filter=self.v, strides=[1, 2, 2, 1], padding="SAME")
+        identity = array_ops.identity(conv)
+        return identity
+
+    tmp_dir = self.get_temp_dir()
+    input_saved_model_dir = os.path.join(tmp_dir, "in_dir1_v2")
+    root = SimpleModel()
+    save.save(root, input_saved_model_dir)
+
+    # Convert the SavedModel and verify the result.
+    output_saved_model_dir = os.path.join(tmp_dir, "out_dir1_v2")
+    self._TestTrtGraphConverter(
+        input_saved_model_dir=input_saved_model_dir,
+        output_saved_model_dir=output_saved_model_dir,
+        is_dynamic_op=True)
+
+  def _TestRun(self,
+               sess,
+               batch_size,
+               use_function_backup=False,
+               expect_engine_is_run=True):
+    try:
+      result = sess.run(
+          "output:0", feed_dict={"input:0": [[[1.0]]] * batch_size})
+      self.assertAllEqual([[[4.0]]] * batch_size, result)
+    except errors.OpError as e:
+      # This should happen only when fallback path is disabled and TRT engine
+      # fails to run.
+      self.assertTrue(not use_function_backup and not expect_engine_is_run)
+      self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e))
+
+  @test_util.deprecated_graph_mode_only
+  def testTrtGraphConverter_MinimumSegmentSize(self):
+    if not is_tensorrt_enabled():
+      return
+    output_graph_def = self._ConvertGraph(minimum_segment_size=5)
+    node_name_to_op = {node.name: node.op for node in output_graph_def.node}
+    self.assertEqual({
+        "v1/read": "Const",
+        "input": "Placeholder",
+        "add": "Add",
+        "mul": "Mul",
+        "add_1": "Add",
+        "output": "Identity"
+    }, node_name_to_op)
+
+  @test_util.deprecated_graph_mode_only
+  def testTrtGraphConverter_DynamicOp(self):
+    if not is_tensorrt_enabled():
+      return
+
+    tmp_dir = self.get_temp_dir()
+    input_saved_model_dir = os.path.join(tmp_dir, "in_dir2")
+    output_saved_model_dir = os.path.join(tmp_dir, "out_dir2")
+    self._WriteInputSavedModel(input_saved_model_dir)
+    output_graph_def = self._ConvertGraph(
+        input_saved_model_dir=input_saved_model_dir,
+        output_saved_model_dir=output_saved_model_dir,
+        is_dynamic_op=True,
+        maximum_cached_engines=2,
+        use_function_backup=False)  # Disallow fallback.
+
+    # Test the output GraphDef.
+    with ops.Graph().as_default():
+      importer.import_graph_def(output_graph_def, name="")
+      with self.session(config=self._GetConfigProto()) as sess:
+        # Run with batch size 1, a new engine is created and cached.
+        self._TestRun(sess, 1)
+        # Run with batch size 2, a new engine is created and cached.
+        self._TestRun(sess, 2)
+        # Run with batch size 3, since the number of cached engines has reached
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3)
+
+    # Test the output SavedModel
+    with ops.Graph().as_default():
+      with self.session(config=self._GetConfigProto()) as sess:
+        loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
+        # Run with batch size 1, a new engine is created and cached.
+        self._TestRun(sess, 1)
+        # Run with batch size 2, a new engine is created and cached.
+        self._TestRun(sess, 2)
+        # Run with batch size 3, since the number of cached engines has reached
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3)
+
+  def _TestStaticOp(self, use_function_backup):
+    if not is_tensorrt_enabled():
+      return
+
+    tmp_dir = self.get_temp_dir()
+    input_saved_model_dir = os.path.join(tmp_dir, "in_dir3")
+    output_saved_model_dir = os.path.join(tmp_dir, "out_dir3")
+    self._WriteInputSavedModel(input_saved_model_dir)
+    output_graph_def = self._ConvertGraph(
+        input_saved_model_dir=input_saved_model_dir,
+        output_saved_model_dir=output_saved_model_dir,
+        maximum_cached_engines=2,  # This is noop, added just for testing.
+        use_function_backup=use_function_backup)
+
+    # Test the output GraphDef.
+    with ops.Graph().as_default():
+      importer.import_graph_def(output_graph_def, name="")
+      with self.session(config=self._GetConfigProto()) as sess:
+        # Run with batch size 1, the default engine embedded in the graphdef
+        # will be used.
+        self._TestRun(
+            sess,
+            1,
+            use_function_backup=use_function_backup,
+            expect_engine_is_run=True)
+        # Run with batch size 2, which exceed the max_batch_size, it should try
+        # to fall back to TF function.
+        self._TestRun(
+            sess,
+            2,
+            use_function_backup=use_function_backup,
+            expect_engine_is_run=False)
+
+    # Test the output SavedModel
+    with ops.Graph().as_default():
+      with self.session(config=self._GetConfigProto()) as sess:
+        loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
+        # Run with batch size 1, the default engine embedded in the graphdef
+        # will be used.
+        self._TestRun(
+            sess,
+            1,
+            use_function_backup=use_function_backup,
+            expect_engine_is_run=True)
+        # Run with batch size 2, which exceed the max_batch_size, it should try
+        # to fall back to TF function.
+        self._TestRun(
+            sess,
+            2,
+            use_function_backup=use_function_backup,
+            expect_engine_is_run=False)
+
+  @test_util.deprecated_graph_mode_only
+  def testTrtGraphConverter_StaticOp_NoFallback(self):
+    self._TestStaticOp(use_function_backup=False)
+
+  @test_util.deprecated_graph_mode_only
+  def testTrtGraphConverter_StaticOp_WithFallback(self):
+    self._TestStaticOp(use_function_backup=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
index fd0eca9dd7012ce44435dbbf6749121022c7ba29..031476100f448528503b5bc9b7c6c360caf9f8b1 100644
--- a/tensorflow/python/data/benchmarks/BUILD
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -6,15 +6,34 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_test(
+    name = "meta_benchmark",
+    srcs = ["meta_benchmark.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "benchmark_base",
+    srcs = ["benchmark_base.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "batch_benchmark",
     srcs = ["batch_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -26,12 +45,8 @@ py_test(
     srcs = ["filter_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -40,9 +55,7 @@ py_test(
     srcs = ["from_tensor_slices_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
@@ -53,6 +66,7 @@ py_test(
     srcs = ["list_files_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":benchmark_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
@@ -67,11 +81,8 @@ py_test(
     srcs = ["map_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -80,8 +91,7 @@ py_test(
     srcs = ["range_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index 0ccf5c57d1954078bea1fca02885824a796236f5..8cad91212a7c3699be8fcc0140505a9c8824723e 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -17,70 +17,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
-# TODO(b/119837791): Add eager benchmarks.
-class BatchBenchmark(test.Benchmark):
+class BatchBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.batch()`."""
 
-  def benchmarkBatchSparse(self):
+  def benchmark_batch_sparse(self):
     non_zeros_per_row_values = [0, 1, 5, 10, 100]
     batch_size_values = [1, 32, 64, 128, 1024]
 
-    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
-    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
-
-    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
-        ).batch(batch_size_placeholder)
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
     for non_zeros_per_row in non_zeros_per_row_values:
 
-      sparse_value = sparse_tensor.SparseTensorValue(
+      tensor = sparse_tensor.SparseTensor(
           indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
           values=np.arange(non_zeros_per_row, dtype=np.int64),
           dense_shape=[1000])
 
       for batch_size in batch_size_values:
-
-        with session.Session() as sess:
-          sess.run(iterator.initializer, feed_dict={
-              sparse_placeholder: sparse_value,
-              batch_size_placeholder: batch_size})
-          # Run five steps to warm up the session caches before taking the
-          # first measurement.
-          for _ in range(5):
-            sess.run(next_element.indices.op)
-          deltas = []
-          for _ in range(100):
-            start = time.time()
-            for _ in range(100):
-              sess.run(next_element.indices.op)
-            end = time.time()
-            deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100.0
-
-        self.report_benchmark(
-            iters=10000,
-            wall_time=median_wall_time,
-            name="sparse_num_elements_%d_batch_size_%d" %
-            (non_zeros_per_row, batch_size))
+        dataset = dataset_ops.Dataset.from_tensors(tensor).repeat().batch(
+            batch_size)
+        self.run_and_report_benchmark(
+            dataset,
+            num_elements=100000 // batch_size,
+            iters=1,
+            name="sparse_num_elements_%d_batch_size_%d" % (non_zeros_per_row,
+                                                           batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/benchmark_base.py b/tensorflow/python/data/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..11aaebacc08f9ad745514462cf8f7aaaa0731d5d
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/benchmark_base.py
@@ -0,0 +1,89 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data benchmarking functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class DatasetBenchmarkBase(test.Benchmark):
+  """Base class for dataset benchmarks."""
+
+  def run_benchmark(self, dataset, num_elements, iters=1):
+    """Benchmarks the dataset.
+
+    Runs the dataset `iters` times. In each iteration, the benchmark measures
+    the time it takes to go through `num_elements` elements of the dataset.
+
+    Args:
+      dataset: Dataset to benchmark.
+      num_elements: Number of dataset elements to iterate through each benchmark
+        iteration.
+      iters: Number of times to repeat the timing.
+
+    Returns:
+      A float, representing the per-element wall time of the dataset in seconds.
+      This is the median time (with respect to `iters`) it takes for the dataset
+      to go through `num_elements` elements, divided by `num_elements.`
+    """
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    # NOTE: We use `dataset.skip()` to perform the iterations in C++, avoiding
+    # the overhead of multiple `session.run()` calls. Note that this relies on
+    # the underlying implementation of `skip`: if it is optimized in the future,
+    # we will have to change this code.
+    dataset = dataset.skip(num_elements - 1)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+    next_element = nest.flatten(next_element)[0]
+
+    deltas = []
+    for _ in range(iters):
+      with session.Session() as sess:
+        # Run once to warm up the session caches.
+        sess.run(iterator.initializer)
+        sess.run(next_element)
+
+        sess.run(iterator.initializer)
+        start = time.time()
+        sess.run(next_element.op)
+        end = time.time()
+      deltas.append(end - start)
+    return np.median(deltas) / float(num_elements)
+
+  def run_and_report_benchmark(self,
+                               dataset,
+                               num_elements,
+                               name,
+                               iters=5,
+                               extras=None):
+    # Measure the per-element wall time.
+    wall_time = self.run_benchmark(dataset, num_elements, iters)
+
+    if extras is None:
+      extras = {}
+    extras["num_elements"] = num_elements
+    self.report_benchmark(
+        wall_time=wall_time, iters=iters, name=name, extras=extras)
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
index e0ecf19e11f95f0f2726eb0959ddc23ac9141283..eb47b4089c7f57f9426fd5dcc15b2296fdb0bd25 100644
--- a/tensorflow/python/data/benchmarks/filter_benchmark.py
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -17,51 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FilterBenchmark(test.Benchmark):
+class FilterBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.filter()`."""
 
   def _benchmark(self, predicate, name):
-    with ops.Graph().as_default():
-      dataset = (
-          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      options = dataset_ops.Options()
-      options.experimental_optimization.apply_default_optimizations = False
-      dataset = dataset.with_options(options)
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        self.report_benchmark(iters=100, wall_time=median_wall_time, name=name)
+    dataset = (
+        dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+    self.run_and_report_benchmark(dataset, num_elements=100000, name=name)
 
-  def benchmarkSimpleFunction(self):
+  def benchmark_simple_function(self):
     self._benchmark(array_ops.identity, "simple_function")
 
-  def benchmarkReturnComponentOptimization(self):
+  def benchmark_return_component_optimization(self):
     self._benchmark(lambda x: x, "return_component")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
index 4e5559ddbafb2ee0501ec9c87a98b314594cdc75..3af174acc320186ae368f23145bb9700e4d3aaa1 100644
--- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -17,170 +17,70 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FromTensorSlicesBenchmark(test.Benchmark):
+class FromTensorSlicesBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
 
-  def benchmarkSliceRepeatBatch(self):
+  def benchmark_slice_repeat_batch(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        sess.run(next_element)
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          sess.run(next_element)
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
+        dataset_ops.Dataset.from_tensor_slices(input_data).repeat(
+            num_epochs).batch(batch_size))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
         name="slice_repeat_batch_input_%d_batch_%d" % (input_size, batch_size))
 
-  def benchmarkSliceRepeatBatchCallable(self):
+  def benchmark_reshape_slice_repeat(self):
     input_size = 10000
-    batch_size = 100
+    reshape_dim = [100, 100]
     num_epochs = 100
 
+    num_elements = num_epochs * reshape_dim[0]
+
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_repeat_batch_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkReshapeSliceRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
+        dataset_ops.Dataset.from_tensor_slices(
+            input_data.reshape(*reshape_dim)).repeat(num_epochs))
 
-    input_data = np.random.randn(input_size)
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="reshape_slice_repeat_input_%d" % input_size,
+    )
 
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
-        .repeat(num_epochs + 1))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="reshape_slice_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkSliceBatchCacheRepeatCallable(self):
+  def benchmark_slice_batch_cache_repeat(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
-        .cache().repeat(num_epochs + 1))
-    options = dataset_ops.Options()
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_batch_cache_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(
+            batch_size).cache().repeat(num_epochs))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="slice_batch_cache_repeat_input_%d_batch_%d" % (input_size,
+                                                             batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
index b620eaaed52c5bdea4fab776442ddd6bc2801605..75b71fffac4d493cb0d2e4d579597de302ad89c2 100644
--- a/tensorflow/python/data/benchmarks/map_benchmark.py
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -17,114 +17,51 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class MapBenchmark(test.Benchmark):
-  """Bechmarks for `tf.data.Dataset.map()`."""
-
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = True
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = False
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda x: x
-          use_inter_op_parallelism = True  # should not have any significance
-          benchmark_label = "_short_circuit"
+class MapBenchmark(benchmark_base.DatasetBenchmarkBase):
+  """Benchmarks for `tf.data.Dataset.map()`."""
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-          for _ in range(chain_length):
-            dataset = dataset_ops.MapDataset(
-                dataset,
-                map_fn,
-                use_inter_op_parallelism=use_inter_op_parallelism)
-          options = dataset_ops.Options()
-          options.experimental_optimization.apply_default_optimizations = False
-          dataset = dataset.with_options(options)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+  def benchmark_chain_of_maps(self):
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element.op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element.op)
-              end = time.time()
-              deltas.append(end - start)
+    def benchmark_helper(chain_length, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset_ops.MapDataset(
+            dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="chain_length_%d%s" % (chain_length, label))
 
-            median_wall_time = np.median(deltas) / 100
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="chain_length_%d%s" % (chain_length, benchmark_label))
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      benchmark_helper(chain_length, lambda x: x + 1, True, "")
+      benchmark_helper(chain_length, lambda x: x + 1, False, "_single_threaded")
+      benchmark_helper(chain_length, lambda x: x, True, "_short_circuit")
 
-  def benchmarkMapFanOut(self):
+  def benchmark_map_fan_out(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
-    for fan_out in fan_outs:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = True
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = False
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda *xs: xs
-          use_inter_op_parallelism = True  # should not have any significance
-          benchmark_label = "_short_circuit"
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(
-              tuple(0 for _ in range(fan_out))).repeat(None)
-          dataset = dataset_ops.MapDataset(
-              dataset,
-              map_fn,
-              use_inter_op_parallelism=use_inter_op_parallelism)
-          options = dataset_ops.Options()
-          options.experimental_optimization.apply_default_optimizations = False
-          dataset = dataset.with_options(options)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+    def benchmark_helper(fan_out, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(
+          tuple(0 for _ in range(fan_out))).repeat(None)
+      dataset = dataset_ops.MapDataset(
+          dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="fan_out_%d%s" % (fan_out, label))
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element[0].op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element[0].op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="fan_out_%d%s" % (fan_out, benchmark_label))
+    for fan_out in fan_outs:
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], True, "")
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], False,
+                       "_single_threaded")
+      benchmark_helper(fan_out, lambda *xs: xs, True, "_short_circuit")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/meta_benchmark.py b/tensorflow/python/data/benchmarks/meta_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d888b2df0925140623c655d53ea473e08868af
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/meta_benchmark.py
@@ -0,0 +1,151 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data benchmarking functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import timeit
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import sleep
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.platform import test
+
+
+class MetaBenchmark(test.Benchmark):
+  """Benchmark that compares various ways of running tf.data benchmarks."""
+
+  # Note that each of these benchmarks is a separate method so that we can
+  # run them independently and collect a performance profile.
+
+  def setup_fast_dataset(self):
+    self.num_reps = 15
+    self.iters = 100000
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    return dataset_ops.Dataset.range(10000**2).with_options(options)
+
+  def benchmarkFastDatasetWithOnlyCppIterations(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_only_cpp_iterations(dataset)
+
+  def benchmarkFastDatasetWithSessionRun(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_session_run(dataset)
+
+  def benchmarkFastDatasetWithSessionCallable(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_session_run(dataset, make_callable=True)
+
+  def benchmarkFastDatasetInEager(self):
+    with context.eager_mode():
+      dataset = self.setup_fast_dataset()
+      self.run_benchmark_in_eager(dataset)
+
+  def setup_slow_dataset(self):
+    dataset = self.setup_fast_dataset()
+    self.iters = 1000
+    # sleep for 1e-3s per iteration
+    return dataset.apply(sleep.sleep(1000))
+
+  def benchmarkSlowDatasetWithOnlyCppIterations(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_only_cpp_iterations(dataset)
+
+  def benchmarkSlowDatasetWithSessionRun(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_session_run(dataset)
+
+  def benchmarkSlowDatasetWithSessionCallable(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_session_run(dataset, make_callable=True)
+
+  def benchmarkSlowDatasetInEager(self):
+    with context.eager_mode():
+      dataset = self.setup_slow_dataset()
+      self.run_benchmark_in_eager(dataset)
+
+  def report(self, deltas):
+    # Each `delta` is the time taken for `self.iters` iterations. Divide by the
+    # number of iterations here to get per-element iteration time.
+    deltas = np.array(deltas) / self.iters
+    # Discard the first 5 results from "warming up" the session.
+    deltas = deltas[5:]
+
+    median = np.median(deltas)
+    mean = np.mean(deltas)
+    min_val = np.min(deltas)
+    max_val = np.max(deltas)
+    extras = {
+        "iters_per_second": 1 / median,
+        "median": median,
+        "mean": mean,
+        "min": min_val,
+        "max": max_val,
+        "num_reps": self.num_reps - 5,
+    }
+    self.report_benchmark(wall_time=median, iters=self.iters, extras=extras)
+
+  def run_benchmark_in_eager(self, dataset):
+    deltas = []
+    for _ in range(self.num_reps):
+      iterator = iter(dataset)
+      deltas.append(timeit.timeit(lambda: next(iterator), number=self.iters))  # pylint: disable=cell-var-from-loop
+
+    self.report(deltas)
+
+  def run_benchmark_with_session_run(self, dataset, make_callable=False):
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      deltas = []
+      for _ in range(self.num_reps):
+        if make_callable:
+          get_next_element = sess.make_callable(next_element)
+        else:
+          # Note: session.run(next_element.op) is more performant than
+          # session.run(next_element) because we avoid the cost of copying the
+          # tensor from C++ to python.
+          get_next_element = lambda: sess.run(next_element.op)
+
+        sess.run(iterator.initializer)
+        deltas.append(timeit.timeit(get_next_element, number=self.iters))
+    self.report(deltas)
+
+  def run_benchmark_with_only_cpp_iterations(self, dataset):
+    """Benchmarks the dataset with the iterations performed in C++."""
+    # NOTE: We use `dataset.skip()` to perform the iterations in C++, avoiding
+    # the overhead of multiple `session.run()` calls. Note that this relies on
+    # the underlying implementation of `skip`: if it is optimized in the future,
+    # we will have to change this code.
+    dataset = dataset.skip(self.iters - 1)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      deltas = []
+      for _ in range(self.num_reps):
+        sess.run(iterator.initializer)
+        deltas.append(
+            timeit.timeit(lambda: sess.run(next_element.op), number=1))
+    self.report(deltas)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index 375ff339a82207f8c5662ecf67ac47fc8c79c2a6..4eb7c94b564646964e236ed46c015f8f319f474e 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -17,54 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.platform import test
-
-_NUMPY_RANDOM_SEED = 42
 
 
-class RangeBenchmark(test.Benchmark):
+class RangeBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.range()`."""
 
-  def _benchmarkRangeHelper(self, modeling_enabled):
-    num_elements = 10000000 if modeling_enabled else 50000000
-
-    # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
-    # C++, and focus on the minimal overheads (excluding Python invocation
-    # costs).
-    dataset = dataset_ops.Dataset.range(num_elements).skip(
-        num_elements - 1).take(1)
-    options = dataset_ops.Options()
-    options.experimental_autotune = modeling_enabled
-    options.experimental_optimization.apply_default_optimizations = False
-    dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      # Run once to warm up the session caches.
-      sess.run(iterator.initializer)
-      sess.run(next_element)
-
-      # Run once for timing.
-      sess.run(iterator.initializer)
-      start = time.time()
-      sess.run(next_element)
-      end = time.time()
-
-      time_per_element = (end - start) / num_elements
-      self.report_benchmark(
-          iters=num_elements,
-          wall_time=time_per_element,
-          name="modeling_%s" % ("on" if modeling_enabled else "off"))
-
-  def benchmarkRange(self):
+  def benchmark_range(self):
     for modeling_enabled in [False, True]:
-      self._benchmarkRangeHelper(modeling_enabled)
+      num_elements = 10000000 if modeling_enabled else 50000000
+      options = dataset_ops.Options()
+      options.experimental_optimization.autotune = modeling_enabled
+      dataset = dataset_ops.Dataset.range(num_elements)
+      dataset = dataset.with_options(options)
+
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=num_elements,
+          name="modeling_%s" % ("on" if modeling_enabled else "off"))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 3c1d798bd23fec5990d6d1f3080e5a8557240aed..a5da41bdf4ac224dee042c1da6c6e147e0c721ae 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -42,12 +42,12 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@ThreadingOptions
 
 @@bucket_by_sequence_length
+@@bytes_produced_stats
 @@cardinality
 @@choose_from_datasets
 @@copy_to_device
 @@dense_to_sparse_batch
 @@enumerate_dataset
-@@filter_for_shard
 @@get_next_as_optional
 @@get_single_element
 @@group_by_reducer
@@ -58,6 +58,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
+@@map_and_batch_with_legacy_function
 @@parallel_interleave
 @@parse_example_dataset
 @@prefetch_to_device
@@ -82,6 +83,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
+from tensorflow.python.data.experimental.ops.batching import map_and_batch_with_legacy_function
 from tensorflow.python.data.experimental.ops.batching import unbatch
 from tensorflow.python.data.experimental.ops.cardinality import cardinality
 from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
@@ -89,7 +91,6 @@ from tensorflow.python.data.experimental.ops.cardinality import UNKNOWN as UNKNO
 from tensorflow.python.data.experimental.ops.counter import Counter
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
-from tensorflow.python.data.experimental.ops.filter_for_shard_ops import filter_for_shard
 from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
 from tensorflow.python.data.experimental.ops.grouping import bucket_by_sequence_length
 from tensorflow.python.data.experimental.ops.grouping import group_by_reducer
@@ -114,6 +115,7 @@ from tensorflow.python.data.experimental.ops.resampling import rejection_resampl
 from tensorflow.python.data.experimental.ops.scan_ops import scan
 from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
+from tensorflow.python.data.experimental.ops.stats_ops import bytes_produced_stats
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
 from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
 from tensorflow.python.data.experimental.ops.take_while_ops import take_while
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index 4f2117ec9b07a7d22391d8e856588fe34ed4086f..39567d31529c4b7b4ed67cff46488bac9f4cf87b 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -124,6 +124,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_branch_benchmark",
+    srcs = ["choose_fastest_branch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/benchmarks:benchmark_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "optimize_benchmark",
     srcs = ["optimize_benchmark.py"],
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
index bda7d38792a4aaaff6622f32f2101ad345eaa6da..4d9e625818262a6fb419b948dff0b0e8250c8e2d 100644
--- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -45,7 +45,8 @@ class AutotuneBenchmark(test.Benchmark):
     dataset = dataset.map(
         math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
     options = dataset_ops.Options()
-    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = autotune
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
@@ -83,8 +84,8 @@ class AutotuneBenchmark(test.Benchmark):
             num_parallel_calls=optimization.AUTOTUNE,
             batch_size=batch_size))
     options = dataset_ops.Options()
-    options.experimental_autotune = autotune
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = autotune
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
@@ -121,8 +122,8 @@ class AutotuneBenchmark(test.Benchmark):
         cycle_length=10,
         num_parallel_calls=optimization.AUTOTUNE)
     options = dataset_ops.Options()
-    options.experimental_autotune = autotune
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = autotune
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
@@ -181,8 +182,8 @@ class AutotuneBenchmark(test.Benchmark):
     dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
     dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
     options = dataset_ops.Options()
-    options.experimental_autotune = autotune
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = autotune
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
diff --git a/tensorflow/python/data/experimental/benchmarks/choose_fastest_branch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/choose_fastest_branch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6f8efedf6c1b1991b3d0d9a709982a01dc36dcb
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/choose_fastest_branch_benchmark.py
@@ -0,0 +1,69 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for ChooseFastestBranchDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.benchmarks import benchmark_base
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+
+
+class ChooseFastestBranchBenchmark(benchmark_base.DatasetBenchmarkBase):
+  """Benchmarks for ChooseFastestBranchDatast."""
+
+  def make_benchmark_datasets(self):
+
+    dataset = dataset_ops.Dataset.range(1000**2).repeat()
+
+    def branch_0(dataset):
+      return dataset.map(lambda x: x + 1).batch(100)
+
+    def branch_1(dataset):
+      return dataset.batch(100).map(lambda x: x + 1)
+
+    map_batch_dataset = branch_0(dataset)
+    batch_map_dataset = branch_1(dataset)
+    choose_fastest_dataset = optimization._ChooseFastestBranchDataset(  # pylint: disable=protected-access
+        dataset, [branch_0, branch_1],
+        ratio_numerator=100)
+    return map_batch_dataset, batch_map_dataset, choose_fastest_dataset
+
+  def benchmarkChooseFastest(self):
+    map_batch, batch_map, choose_fastest = self.make_benchmark_datasets()
+
+    def benchmark(dataset, name):
+      self.run_and_report_benchmark(dataset, 5000, name, iters=1)
+
+    benchmark(map_batch, "map_batch_dataset")
+    benchmark(batch_map, "batch_map_dataset")
+    benchmark(choose_fastest, "choose_fastest_dataset")
+
+  def benchmarkChooseFastestFirstNIterations(self):
+
+    map_batch, batch_map, choose_fastest = self.make_benchmark_datasets()
+
+    def benchmark(dataset, name):
+      self.run_and_report_benchmark(
+          dataset, num_elements=10, name="%s_first_10" % name, iters=5)
+
+    benchmark(map_batch, "map_batch_dataset")
+    benchmark(batch_map, "batch_map_dataset")
+    benchmark(choose_fastest, "choose_fastest_dataset")
+
+
+if __name__ == "__main__":
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
index 49297ca7c58f4ce3127e6e64944a09d0837cea3f..ac6d7d0360292f74cdd8b57eeab7450e362a0f27 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
@@ -25,7 +25,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -56,14 +56,14 @@ class MapDefunBenchmark(test.Benchmark):
     def defun(x):
       return array_ops.identity(x)
 
-    def map_fn(x):
+    def fn(x):
       return array_ops.identity(x)
 
     base = math_ops.range(100)
     for input_size in [10, 100, 1000, 10000]:
       num_iters = 100000 // input_size
       map_defun_op = map_defun.map_defun(defun, [base], [dtypes.int32], [()])
-      map_fn_op = functional_ops.map_fn(map_fn, base)
+      map_fn_op = map_fn.map_fn(fn, base)
 
       self._run(
           map_defun_op, "with_defun_size_%d" % input_size, num_iters=num_iters)
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
index 50e3a5c469232e2ff3ea8f0bd74866d829c31770..4e3d58658fb34942d1367ab30e822d72a134d617 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -159,7 +159,7 @@ class MapVectorizationBenchmark(test.Benchmark):
 
   def benchmarkCast(self):
     self._benchmark_helper(
-        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
+        lambda *args: [math_ops.cast(x, dtypes.float32) for x in args], "cast")
 
   def benchmarkReshape(self):
     self._benchmark_helper(
@@ -191,7 +191,8 @@ class MapVectorizationBenchmark(test.Benchmark):
       base_dataset = base_dataset.repeat()
       input_size = [
           tuple(shape.as_list())
-          for shape in nest.flatten(base_dataset.output_shapes)
+          for shape in nest.flatten(
+              dataset_ops.get_legacy_output_shapes(base_dataset))
       ]
       self._compare(base_dataset, map_fn, batch_size, input_size, str_id)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 04819130642d9558d5fbe247524b8a32bddefaf2..50626e4817839f9a4d448054a106b8a97ae611ce 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -45,6 +45,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:iterator_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_test(
@@ -113,6 +114,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "auto_shard_dataset_test",
+    size = "medium",
+    srcs = ["auto_shard_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":reader_dataset_ops_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:distribute",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "enumerate_dataset_test",
     size = "small",
@@ -322,6 +341,8 @@ py_test(
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:map_defun",
         "//tensorflow/python/data/kernel_tests:test_base",
     ],
@@ -441,19 +462,15 @@ cuda_py_test(
         "//tensorflow/python/data/ops:dataset_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
     name = "reader_dataset_ops_test_base",
-    testonly = 1,
     srcs = [
         "reader_dataset_ops_test_base.py",
     ],
     srcs_version = "PY2AND3",
-    visibility = [
-        "//tensorflow/python/data/experimental/kernel_tests:__pkg__",
-        "//tensorflow/python/data/experimental/kernel_tests/serialization:__pkg__",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -470,6 +487,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "rebatch_dataset_test",
+    size = "small",
+    srcs = ["rebatch_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "rejection_resample_test",
     size = "medium",
@@ -589,7 +621,7 @@ py_library(
 
 py_test(
     name = "sql_dataset_test",
-    size = "small",
+    size = "medium",
     srcs = ["sql_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -729,4 +761,5 @@ cuda_py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6b2c181cbb342de5b91eae7665462c5cb550e5e
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
@@ -0,0 +1,209 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `_AutoShardDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import distribute
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.experimental.ops import unique
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+def chunk(l, n):
+  for i in range(0, len(l), n):
+    yield l[i:i + n]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AutoShardDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase,
+                           parameterized.TestCase):
+
+  def setUp(self):
+    super(AutoShardDatasetTest, self).setUp()
+    self._num_files = 10
+    self._num_records = 10
+    self.test_filenames = self._createFiles()
+
+  def testFlatMapReaderPipeline(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.flat_map(core_readers.TFRecordDataset)
+    dataset = dataset.batch(5)
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for f in (3, 8)
+        for r in range(0, 10)
+    ]
+    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
+
+  def testZipReaderPipeline(self):
+    dataset1 = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset1 = dataset1.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset2 = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset2 = dataset2.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        (b"Record %d of file %d" % (r, f), b"Record %d of file %d" % (r, f))  # pylint:disable=g-complex-comprehension
+        for r in range(0, 10)
+        for f in (3, 8)
+    ]
+
+    self.assertDatasetProduces(dataset, expected)
+
+  def testConcatenateReaderPipeline(self):
+    dataset1 = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset1 = dataset1.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset1 = dataset1.batch(5)
+    dataset2 = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset2 = dataset2.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset2 = dataset2.batch(5)
+
+    dataset = dataset1.concatenate(dataset2)
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for r in range(0, 10)
+        for f in (3, 8)
+    ]
+    expected += expected
+    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
+
+  def testPipelineWithMap(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset = dataset.map(lambda x: string_ops.substr_v2(x, 2, 1000))
+    dataset = dataset.batch(5)
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for r in range(0, 10)
+        for f in (3, 8)
+    ]
+    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
+
+  def testValidPipelineWithRangeDataset(self):
+    dataset = dataset_ops.Dataset.range(self._num_files)
+    dataset = dataset.map(lambda n: string_ops.string_join(  # pylint:disable=g-long-lambda
+        [self.get_temp_dir(),
+         string_ops.string_format("/tf_record.{}.txt", [n])]))
+    dataset = dataset.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset = dataset.map(lambda x: string_ops.substr_v2(x, 2, 1000))
+    dataset = dataset.batch(5)
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for r in range(0, 10)
+        for f in (3, 8)
+    ]
+    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
+
+  @parameterized.parameters((1, 0, 10, 10), (2, 1, 20, 5), (10, 1, 1, 10))
+  def testStandardReaderPipeline(self, num_epochs, index, batch_size,
+                                 parallel_reads):
+    dataset = readers.make_tf_record_dataset(
+        file_pattern=self.test_filenames,
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        parser_fn=None,
+        num_parallel_reads=parallel_reads,
+        drop_final_batch=True,
+        shuffle=False)
+    dataset = distribute._AutoShardDataset(dataset, 2, index)
+    outputs = self.getNext(dataset)
+    self._verify_records(
+        outputs,
+        batch_size=batch_size,
+        file_index=[i for i in range(index, self._num_records, 2)],
+        num_epochs=num_epochs,
+        interleave_cycle_length=parallel_reads,
+        drop_final_batch=True,
+        use_parser_fn=None)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(outputs())
+
+  def testSampleResNetPipeline(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset = dataset.batch(5)
+    dataset = distribute._AutoShardDataset(dataset, 5, 3)
+
+    expected = [
+        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
+        for r in range(0, 10)
+        for f in (3, 8)
+    ]
+    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
+
+  def testWorkersGreaterThanNumFiles(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.apply(
+        interleave_ops.parallel_interleave(core_readers.TFRecordDataset, 10))
+    dataset = dataset.batch(5)
+    dataset = distribute._AutoShardDataset(dataset, 500, 499)
+    self.assertDatasetProduces(dataset, [])
+
+  def testNoReaderPipelines(self):
+    dataset = dataset_ops.Dataset.range(1024)
+    with self.assertRaises(errors.NotFoundError):
+      dataset = distribute._AutoShardDataset(dataset, 2, 0)
+      self.evaluate(self.getNext(dataset)())
+
+  def testUnsupportedOpInPipeline(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.flat_map(core_readers.TFRecordDataset)
+    dataset = dataset.batch(5)
+    dataset = dataset.apply(unique.unique())
+
+    with self.assertRaises(errors.NotFoundError):
+      dataset = distribute._AutoShardDataset(dataset, 2, 0)
+      self.evaluate(self.getNext(dataset)())
+
+  def testInvalidWorkerIndex(self):
+    dataset = dataset_ops.Dataset.list_files(self.test_filenames, shuffle=True)
+    dataset = dataset.flat_map(core_readers.TFRecordDataset)
+    dataset = dataset.batch(5)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = distribute._AutoShardDataset(dataset, 2, 2)
+      self.evaluate(self.getNext(dataset)())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 0bbf0e9a12ba3170bd3c69e43824322b8b1eb059..4839bc8e49b843574e0824a929a462ac8ecd192f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -375,7 +375,7 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase,
           bucket_batch_sizes=[2, 2, 2],
           bucket_boundaries=[0, 8],
           no_padding=no_padding))
-      shapes = dataset.output_shapes
+      shapes = dataset_ops.get_legacy_output_shapes(dataset)
       self.assertEqual([None, None], shapes[0].as_list())
       self.assertEqual([None], shapes[1].as_list())
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
index 4a8296d08482d4d800eb3bb0b94bbae940264da6..993b511d5e3635b38e6e0a73f86c873a39a6c127 100644
--- a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -49,8 +49,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
            dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
       ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
-       concatenate(dataset_ops.Dataset.range(5)),
-       cardinality.INFINITE),
+       concatenate(dataset_ops.Dataset.range(5)), cardinality.INFINITE),
       ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
           dataset_ops.Dataset.range(5).filter(lambda _: True)),
        cardinality.UNKNOWN),
@@ -70,8 +69,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
            dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
       ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
-          lambda _: dataset_ops.Dataset.from_tensors(0)),
-       cardinality.UNKNOWN),
+          lambda _: dataset_ops.Dataset.from_tensors(0)), cardinality.UNKNOWN),
       ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
        cardinality.UNKNOWN),
       ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
@@ -117,6 +115,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        cardinality.INFINITE),
       ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
        5),
+      ("Shard1", lambda: dataset_ops.Dataset.range(5).shard(2, 0), 3),
+      ("Shard2", lambda: dataset_ops.Dataset.range(5).shard(8, 7), 0),
+      ("Shard3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).shard(2, 0),
+       cardinality.UNKNOWN),
+      ("Shard4", lambda: dataset_ops.Dataset.range(5).repeat().shard(2, 0),
+       cardinality.INFINITE),
       ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
       ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
       ("Skip3",
@@ -138,15 +143,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        5),
       ("Zip2", lambda: dataset_ops.Dataset.zip(
           (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
-      ("Zip3", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).repeat())), 5),
-      ("Zip4", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5).repeat(),
-           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
-      ("Zip5", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+      ("Zip3", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5).repeat(), dataset_ops.Dataset.range(3).repeat())),
+       cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).filter(lambda _: True))),
        cardinality.UNKNOWN),
       # pylint: enable=g-long-lambda
   )
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index d9fbe9e0e18c526e7e0bf88b9c3b477bf0917fe5..5d9c44f4de4f8bfd8b40800b1ef583632672863a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -36,7 +36,7 @@ from tensorflow.python.util import compat as util_compat
 # TODO(b/117581999): add eager coverage when supported.
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -46,12 +46,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -63,7 +61,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceInt32(self):
     host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3])
     device_dataset = host_dataset.apply(
@@ -73,12 +71,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int32, next_element.dtype)
     self.assertEqual((4,), next_element.shape)
@@ -89,7 +85,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -99,12 +95,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -116,7 +110,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -126,12 +120,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -143,7 +135,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
@@ -153,12 +145,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element["a"].dtype)
     self.assertEqual([], next_element["a"].shape)
@@ -170,7 +160,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyDictToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
@@ -180,12 +170,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element["a"].dtype)
     self.assertEqual([], next_element["a"].shape)
@@ -197,7 +185,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopySparseTensorsToDevice(self):
 
     def make_tensor(i):
@@ -213,12 +201,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
 
@@ -232,7 +218,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopySparseTensorsToDeviceWithPrefetch(self):
 
     def make_tensor(i):
@@ -248,12 +234,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
 
@@ -267,6 +251,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -287,6 +272,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuWithPrefetch(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -307,6 +293,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuWithMap(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -326,7 +313,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     device_dataset = device_dataset.apply(
         prefetching_ops.map_on_gpu(gpu_map_func))
     options = dataset_ops.Options()
-    options.experimental_autotune = False
+    options.experimental_optimization.autotune = False
     device_dataset = device_dataset.with_options(options)
 
     with ops.device("/gpu:0"):
@@ -344,6 +331,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuInt32(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -363,6 +351,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuInt32AndPrefetch(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -382,6 +371,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuStrings(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -401,6 +391,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuStringsAndPrefetch(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -420,6 +411,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDevicePingPongCPUGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -443,7 +435,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         with self.assertRaises(errors.OutOfRangeError):
           self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -453,12 +445,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -474,7 +464,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceWithReInitAndPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -484,12 +474,10 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -505,6 +493,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -528,6 +517,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testCopyToDeviceGpuWithReInitAndPrefetch(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -551,6 +541,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testIteratorGetNextAsOptionalOnGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index 436fa506c419dd73bf1836b9ba5486f9d435105b..79e4523ea4302e76357ffd93df431819d1fbd3fa 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -30,8 +31,9 @@ class CounterTest(test_base.DatasetTestBase):
   def testCounter(self):
     """Test dataset construction using `count`."""
     dataset = counter.Counter(start=3, step=4)
-    self.assertEqual([], dataset.output_shapes.as_list())
-    self.assertEqual(dtypes.int64, dataset.output_types)
+    self.assertEqual(
+        [], dataset_ops.get_legacy_output_shapes(dataset).as_list())
+    self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset))
     get_next = self.getNext(dataset)
 
     negative_dataset = counter.Counter(start=0, step=-1)
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index cbc048e3ab460c9bc3bf4efa63221f814075f4ac..8842f552e79dff5ae556999699a3f9eee37adc83 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -37,10 +37,12 @@ class EnumerateDatasetTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
         enumerate_ops.enumerate_dataset(start))
 
-    self.assertEqual(dtypes.int64, dataset.output_types[0])
-    self.assertEqual((), dataset.output_shapes[0])
+    self.assertEqual(dtypes.int64,
+                     dataset_ops.get_legacy_output_types(dataset)[0])
+    dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+    self.assertEqual((), dataset_output_shapes[0])
     self.assertEqual([tensor_shape.TensorShape([])] * 3,
-                     [shape for shape in dataset.output_shapes[1]])
+                     [shape for shape in dataset_output_shapes[1]])
 
     self.assertDatasetProduces(dataset, [(20, (b"a", 1, 37.0)),
                                          (21, (b"b", 2, 38.0))])
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index 3e2cf779a3f9d138e83986abcf5b8387d7c19412..f65740c56518c2c0baa1d1d56cac5e0314db4b97 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -22,10 +22,12 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import get_single_element
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -71,6 +73,52 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(
         dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
 
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 1)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def increment_fn(x):
+      counter_var.assign(counter_var + 1)
+      return x
+
+    def multiply_fn(x):
+      counter_var.assign(counter_var * 2)
+      return x
+
+    def dataset1_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    def dataset2_fn():
+      return dataset_ops.Dataset.range(1).map(multiply_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset1_fn())
+      _ = get_single_element.get_single_element(dataset2_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index 4194f06a34a8008ac2ed835b5300959bda9e3f78..60b493b5d775672e4baf176397c9d1ac43675d99 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -120,8 +120,9 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     for i in range(1, 11):
       dataset = dataset_ops.Dataset.from_tensors(np.int64(0)).repeat(i).apply(
           grouping.group_by_reducer(lambda x: x, reducer))
-      self.assertEqual([None], dataset.output_shapes[0].as_list())
-      self.assertIs(None, dataset.output_shapes[1].ndims)
+      dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+      self.assertEqual([None], dataset_output_shapes[0].as_list())
+      self.assertIs(None, dataset_output_shapes[1].ndims)
       get_next = self.getNext(dataset)
       x, y = self.evaluate(get_next())
       self.assertAllEqual([0] * (2**i), x)
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index 1d02f4fb773537de3800d4039d10112e465df285..dc3139812f3b2df0a0a647474e0414d379b4cc9d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -24,8 +24,10 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
@@ -100,6 +102,30 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  def testTFRecordDatasetIgnoreError(self):
+    filenames = []
+    for i in range(5):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(10):
+        writer.write(b"record")
+      writer.close()
+      # Append corrupted data
+      with open(fn, "a") as f:
+        f.write("corrupted data")
+
+    dataset = readers.TFRecordDataset(filenames).apply(
+        error_ops.ignore_errors())
+    get_next = self.getNext(dataset)
+
+    # All of the files are present.
+    for filename in filenames:
+      for j in range(10):
+        self.assertEqual(b"record", self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 1fb6971ecdec90964a6f860a797d7bf8ddf8bfb8..2ddff457bc461c55437977457332c38c98af3504 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
@@ -206,8 +207,9 @@ class MakeBatchedFeaturesDatasetTest(
         label_key="label",
         num_epochs=None,
         batch_size=32)
-    for shape, clazz in zip(nest.flatten(dataset.output_shapes),
-                            nest.flatten(dataset.output_classes)):
+    for shape, clazz in zip(
+        nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)),
+        nest.flatten(dataset_ops.get_legacy_output_classes(dataset))):
       if issubclass(clazz, ops.Tensor):
         self.assertEqual(32, shape[0])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index 3b7b335e7066175fba6ef190b977362bc461ca1d..267e3e894874ded71cf04c369d9719fc17f9166e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -449,6 +450,28 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         header=True,
     )
 
+  def testMakeCSVDataset_withNAValuesAndFieldDelim(self):
+    """Tests that datasets can be created from different delim and na_value."""
+    column_names = ["col%d" % i for i in range(5)]
+    inputs = [["0 1 2 3 4", "5 6 7 8 9"], ["10 11 12 13 14", "15 16 17 ? 19"]]
+    expected_output = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14],
+                       [15, 16, 17, 0, 19]]
+    label = "col0"
+
+    self._test_dataset(
+        inputs,
+        expected_output=expected_output,
+        expected_keys=column_names,
+        column_names=column_names,
+        label_name=label,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False,
+        header=False,
+        na_value="?",
+        field_delim=" ",
+    )
+
   def testMakeCSVDataset_withSelectCols(self):
     record_defaults = [
         constant_op.constant([], dtypes.int32),
@@ -646,7 +669,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
     ]]
     filenames = self._setup_files(inputs)
     dataset = self._make_csv_dataset(filenames, batch_size=32, num_epochs=None)
-    for shape in nest.flatten(dataset.output_shapes):
+    for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)):
       self.assertEqual(32, shape[0])
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index 9f35aa69a834dc82d50550a99665d5d248e02e0f..31b9cd65c4c011970ee31fc9a96ae15226775fcf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
@@ -30,84 +31,6 @@ from tensorflow.python.platform import test
 class MakeTFRecordDatasetTest(
     reader_dataset_ops_test_base.TFRecordDatasetTestBase):
 
-  def _interleave(self, iterators, cycle_length):
-    pending_iterators = iterators
-    open_iterators = []
-    num_open = 0
-    for i in range(cycle_length):
-      if pending_iterators:
-        open_iterators.append(pending_iterators.pop(0))
-        num_open += 1
-
-    while num_open:
-      for i in range(min(cycle_length, len(open_iterators))):
-        if open_iterators[i] is None:
-          continue
-        try:
-          yield next(open_iterators[i])
-        except StopIteration:
-          if pending_iterators:
-            open_iterators[i] = pending_iterators.pop(0)
-          else:
-            open_iterators[i] = None
-            num_open -= 1
-
-  def _next_expected_batch(self,
-                           file_indices,
-                           batch_size,
-                           num_epochs,
-                           cycle_length,
-                           drop_final_batch,
-                           use_parser_fn):
-
-    def _next_record(file_indices):
-      for j in file_indices:
-        for i in range(self._num_records):
-          yield j, i
-
-    def _next_record_interleaved(file_indices, cycle_length):
-      return self._interleave([_next_record([i]) for i in file_indices],
-                              cycle_length)
-
-    record_batch = []
-    batch_index = 0
-    for _ in range(num_epochs):
-      if cycle_length == 1:
-        next_records = _next_record(file_indices)
-      else:
-        next_records = _next_record_interleaved(file_indices, cycle_length)
-      for f, r in next_records:
-        record = self._record(f, r)
-        if use_parser_fn:
-          record = record[1:]
-        record_batch.append(record)
-        batch_index += 1
-        if len(record_batch) == batch_size:
-          yield record_batch
-          record_batch = []
-          batch_index = 0
-    if record_batch and not drop_final_batch:
-      yield record_batch
-
-  def _verify_records(self,
-                      outputs,
-                      batch_size,
-                      file_index,
-                      num_epochs,
-                      interleave_cycle_length,
-                      drop_final_batch,
-                      use_parser_fn):
-    if file_index is not None:
-      file_indices = [file_index]
-    else:
-      file_indices = range(self._num_files)
-
-    for expected_batch in self._next_expected_batch(
-        file_indices, batch_size, num_epochs, interleave_cycle_length,
-        drop_final_batch, use_parser_fn):
-      actual_batch = self.evaluate(outputs())
-      self.assertAllEqual(expected_batch, actual_batch)
-
   def _read_test(self, batch_size, num_epochs, file_index=None,
                  num_parallel_reads=1, drop_final_batch=False, parser_fn=False):
     if file_index is None:
@@ -234,7 +157,7 @@ class MakeTFRecordDatasetTest(
   def testIndefiniteRepeatShapeInference(self):
     dataset = readers.make_tf_record_dataset(
         file_pattern=self.test_filenames, num_epochs=None, batch_size=32)
-    for shape in nest.flatten(dataset.output_shapes):
+    for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)):
       self.assertEqual(32, shape[0])
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 775dc61e480f56f60b54a1334e51e6e2c5a133e7..9109e6d84ec2fde41fabf7fda9c1bf6eb85ba4d0 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -83,8 +83,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     # total number of elements.
     dataset = dataset_fn(14, 28)
     get_next = self.getNext(dataset)
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [shape.as_list() for shape in dataset.output_shapes])
+    self.assertEqual(
+        [[None] + list(c.shape[1:]) for c in components],
+        [shape.as_list()
+         for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     num_batches = (28 * 7) // 14
     for i in range(num_batches):
       result = self.evaluate(get_next())
@@ -143,9 +145,11 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       dataset = dataset.with_options(options)
 
     if drop_remainder:
-      self.assertEqual([4, 1], dataset.output_shapes.as_list())
+      self.assertEqual(
+          [4, 1], dataset_ops.get_legacy_output_shapes(dataset).as_list())
     else:
-      self.assertEqual([None, 1], dataset.output_shapes.as_list())
+      self.assertEqual(
+          [None, 1], dataset_ops.get_legacy_output_shapes(dataset).as_list())
     expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]]]
     if not drop_remainder:
       expected_output.append([[64], [81]])
@@ -164,7 +168,8 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    self.assertEqual([None, 1], dataset.output_shapes.as_list())
+    self.assertEqual(
+        [None, 1], dataset_ops.get_legacy_output_shapes(dataset).as_list())
     expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]],
                        [[64], [81]]]
     self.assertDatasetProduces(dataset, expected_output=expected_output)
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 19830a23bb2ea7ace55a458351d4eda556ba3bf8..f93f8f6686bf5afbc5f3761ca5d5ce23686ab060 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -27,17 +27,34 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): add eager coverage.
+@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage")
 class MapDefunTest(test_base.DatasetTestBase):
 
+  def testNoIntraOpLimit(self):
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
+    def simple_fn(x):
+      return x * 2 + 3
+
+    nums = [[1, 2], [3, 4], [5, 6]]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    r = map_defun.map_defun(
+        simple_fn, [elems], [dtypes.int32], [(2,)],
+        max_intra_op_parallelism=0)[0]
+    expected = elems * 2 + 3
+    self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
+
   def testMapDefunSimple(self):
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
@@ -253,6 +270,70 @@ class MapDefunTest(test_base.DatasetTestBase):
     expected = x + c
     self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op))
 
+  def testMapDefunWithVariantTensor(self):
+
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.variant)])
+    def fn(x):
+      return x
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.variant],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithVariantTensorAsCaptured(self):
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
+    def fn(x):
+      del x
+      return serialized
+
+    x = constant_op.constant([0, 0])
+    map_defun_op = map_defun.map_defun(fn, [x], [dtypes.variant], [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithStrTensor(self):
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    def fn(x):
+      return x
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.string)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.string],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index 703d3350db3f9f8cca490799c9e4fe4b0c984612..396f7ea93e3cc5454492ff748b264af08b899349 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -207,7 +207,7 @@ py_test(
 
 py_test(
     name = "map_vectorization_test",
-    size = "medium",
+    size = "small",
     srcs = ["map_vectorization_test.py"],
     shard_count = 8,
     srcs_version = "PY2AND3",
@@ -233,6 +233,7 @@ py_test(
         "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -262,6 +263,29 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_branch_dataset_test",
+    size = "small",
+    srcs = ["choose_fastest_branch_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "model_dataset_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
index e05dcbd9d582da05a4049e76d4f8c057a53b3161..83cbb3cbcaa14b852aec467cd02c77411114d375 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -53,8 +53,8 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
     options = dataset_ops.Options()
-    options.experimental_autotune = False
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = False
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_branch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_branch_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ee34e8eb1f37234872f7f7b6ecebbee826fe33d
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_branch_dataset_test.py
@@ -0,0 +1,176 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental._ChooseFastestBranchDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ChooseFastestBranchDatasetTest(test_base.DatasetTestBase,
+                                     parameterized.TestCase):
+
+  def testSimple(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
+
+    def branch(dataset):
+      return dataset.map(lambda x: x)
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch, branch])
+
+    self.assertDatasetProduces(
+        choose_fastest,
+        expected_output=[0, 1, 2, 3, 4],
+        expected_shapes=dataset.output_shapes)
+
+  def testCaptureSimple(self):
+    dataset = dataset_ops.Dataset.range(10)
+
+    const_64 = constant_op.constant(1, dtypes.int64)
+    const_32 = constant_op.constant(1, dtypes.int32)
+
+    def branch_0(dataset):
+      return dataset.map(lambda x: x + const_64)
+
+    def branch_1(dataset):
+      return dataset.map(lambda x: x + math_ops.cast(const_32, dtypes.int64))
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch_0, branch_1])
+
+    self.assertDatasetProduces(
+        choose_fastest, expected_output=list(range(1, 11)))
+
+  def testDifferentFunctions(self):
+    dataset = dataset_ops.Dataset.range(100)
+
+    def branch_0(dataset):
+      return dataset.map(lambda x: x).batch(10)
+
+    def branch_1(dataset):
+      return dataset.batch(10).map(lambda x: x)
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch_0, branch_1], ratio_numerator=10)
+
+    self.assertDatasetProduces(
+        choose_fastest,
+        expected_output=[list(range(10 * x, 10 * x + 10)) for x in range(10)])
+
+  def testWithRepeatBeforeAndAfter(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+
+    def branch_0(dataset):
+      return dataset.map(lambda x: x).batch(10)
+
+    def branch_1(dataset):
+      return dataset.batch(10).map(lambda x: x)
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch_0, branch_1], ratio_numerator=10)
+    choose_fastest = choose_fastest.repeat(10)
+
+    self.assertDatasetProduces(
+        choose_fastest, expected_output=[[0] * 10 for _ in range(10)])
+
+  def testWithPrefetch(self):
+    """Should maintain ordering even if the branches do prefetching."""
+    dataset = dataset_ops.Dataset.range(100)
+
+    def branch_0(dataset):
+      return dataset.prefetch(1)
+
+    def branch_1(dataset):
+      return dataset.prefetch(2)
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch_0, branch_1])
+
+    self.assertDatasetProduces(choose_fastest, expected_output=list(range(100)))
+
+  def testWithMoreOutputThanInput(self):
+
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(1000).batch(100)
+
+    def branch(dataset):
+      return dataset.apply(batching.unbatch())
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch, branch],
+        ratio_denominator=100,
+        num_elements_per_branch=100)
+
+    self.assertDatasetProduces(choose_fastest, expected_output=[0] * 1000)
+
+  def testWithBadNumElements(self):
+
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(1000).batch(100)
+
+    def branch(dataset):
+      return dataset.apply(batching.unbatch())
+
+    def make_dataset():
+      return optimization._ChooseFastestBranchDataset(
+          dataset, [branch, branch],
+          ratio_denominator=100,
+          num_elements_per_branch=10)
+
+    expected_error_msg = ("`num_elements_per_branch` must be divisible by "
+                          "`ratio_denominator`")
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg):
+        make_dataset()
+    else:
+      choose_fastest = make_dataset()
+      self.assertDatasetProduces(
+          choose_fastest,
+          expected_error=(errors.InvalidArgumentError, expected_error_msg))
+
+  def testErrorWithRepeat(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+
+    def branch(dataset):
+      return dataset.repeat(10)
+
+    choose_fastest = optimization._ChooseFastestBranchDataset(
+        dataset, [branch, branch],
+        ratio_denominator=10,
+        num_elements_per_branch=10)
+    self.assertDatasetProduces(
+        choose_fastest,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Cannot create more than one WrapperIterator per WrapperDataset."),
+        expected_error_iter=2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
index ec7a85ae113d0d517434827e5dae64804861070a..3e51de9f1eecb2a9b88efafe2212b7f94f412281 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
@@ -38,7 +38,7 @@ class ChooseFastestDatasetTest(test_base.DatasetTestBase,
     self.assertDatasetProduces(
         merge,
         expected_output=[0, 1, 2, 3, 4],
-        expected_shapes=dataset.output_shapes)
+        expected_shapes=dataset_ops.get_legacy_output_shapes(dataset))
 
   def testChooseFastestManyInputs(self):
     dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
@@ -46,7 +46,7 @@ class ChooseFastestDatasetTest(test_base.DatasetTestBase,
     self.assertDatasetProduces(
         merge,
         expected_output=[0, 1, 2, 3, 4],
-        expected_shapes=dataset.output_shapes)
+        expected_shapes=dataset_ops.get_legacy_output_shapes(dataset))
 
   def testChooseFastest(self):
     dataset = dataset_ops.Dataset.range(600)
@@ -59,7 +59,7 @@ class ChooseFastestDatasetTest(test_base.DatasetTestBase,
         expected_output=[
             [i * 2 for i in range(j * 50, (j + 1) * 50)] for j in range(12)
         ],
-        expected_shapes=dataset_a.output_shapes)
+        expected_shapes=dataset_ops.get_legacy_output_shapes(dataset_a))
 
   @parameterized.named_parameters(
       ("Shapes", [0], [[1, 2, 3]], "must have compatible output shapes."),
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index ec543aeb76c27258835c8eb34749ac283f94aa5c..8dfcdc7e4b5108aca8773eede2bdb2da0a3b2e18 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -22,9 +22,11 @@ import numpy as np
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -225,6 +227,10 @@ def _generate_csv_test_case():
 
 
 def _generate_parse_single_example_test_case():
+  # When sparse tensors are used, map_vectorization is not
+  # attempted because the output_shapes of the map dataset are not defined.
+  # TODO(rachelim): Consider being more lax with checking the output_shapes of
+  # the map node.
 
   def parse_example_factory():
 
@@ -243,8 +249,6 @@ def _generate_parse_single_example_test_case():
                     feature={
                         "dense_int": _int64_feature(i),
                         "dense_str": _bytes_feature(str(i)),
-                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
-                        "sparse_str": _bytes_feature(*["abc"] * i)
                     })).SerializeToString() for i in range(10)
         ]))
 
@@ -252,8 +256,6 @@ def _generate_parse_single_example_test_case():
     features = {
         "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
         "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
-        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
-        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
     }
     return parsing_ops.parse_single_example(x, features)
 
@@ -355,7 +357,9 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       return dataset
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
-    optimized = _make_dataset(["Batch", map_node_name]
+    # Note that because of the `ChooseDataset` fork, we can't use `assert_next`
+    # to verify the optimization result.
+    optimized = _make_dataset(["ChooseFastestBranch"]
                               if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
     options.experimental_optimization.apply_default_optimizations = False
@@ -375,6 +379,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
       return array_ops.gather(x, 10)
+
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
       base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
@@ -384,7 +389,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.evaluate(nxt)
 
   def testOptimizationWithCapturedInputs(self):
-    # Tests that vectorization works with captured inputs
+    # Tests that vectorization works with captured inputs.
     y = constant_op.constant(1, shape=(2,))
     z = constant_op.constant(2, shape=(2,))
 
@@ -397,8 +402,85 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreStateful(self):
+  def testOptimizationWithMapAndBatchFusion(self):
+    # Tests that vectorization works on fused map and batch.
+    y = constant_op.constant(1, shape=(2,))
+    z = constant_op.constant(2, shape=(2,))
+
+    def map_fn(x):
+      return x, y, z
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    base_dataset = base_dataset.with_options(options)
+
+    def _make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.apply(batching.map_and_batch(map_fn, 100))
+      return dataset
+
+    unoptimized = _make_dataset(["MapAndBatch"])
+    optimized = _make_dataset(["ChooseFastestBranch"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+    self.assertDatasetsEqual(optimized, unoptimized)
+
+  @parameterized.named_parameters(
+      ("1", True, True),
+      ("2", True, False),
+      ("3", False, True),
+      ("4", False, False),
+  )
+  def testOptimizationWithChainedMapAndBatch(self, fuse_first, fuse_second):
+    # Tests that vectorization works on chained map and batch functions.
+    def map_fn(x):
+      return x * 2
+
+    unoptimized_seq = []
+
+    def make_apply_fn(is_fused):
+      if is_fused:
+        unoptimized_seq.append("MapAndBatch")
+
+        def apply_fn(dataset):
+          return dataset.apply(
+              batching.map_and_batch(map_fn, 2, 12, drop_remainder=True))
+
+        return apply_fn
+      else:
+        unoptimized_seq.extend(["ParallelMap", "Batch"])
+
+        def apply_fn(dataset):
+          return dataset.map(map_fn, 12).batch(2, drop_remainder=True)
+
+        return apply_fn
+
+    base_dataset = dataset_ops.Dataset.range(1000)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = base_dataset.with_options(options)
+
+    apply_fn_1 = make_apply_fn(fuse_first)
+    apply_fn_2 = make_apply_fn(fuse_second)
+
+    def make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = apply_fn_1(dataset)
+      dataset = apply_fn_2(dataset)
+      return dataset
+
+    unoptimized = make_dataset(unoptimized_seq)
+    optimized = make_dataset(["ChooseFastestBranch", "ChooseFastestBranch"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+
+    self.assertDatasetsEqual(optimized, unoptimized)
+
+  def testOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -408,10 +490,13 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                            [3, 4]]).repeat(5)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
 
   def testOptimizationIgnoreRagged(self):
     # Make sure we ignore inputs that might not be uniformly sized
@@ -424,8 +509,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreRaggedMap(self):
+  def testOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
@@ -433,10 +517,58 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
+
+  def testOptimizationWithUnknownBatchShape(self):
+    tensor = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    # Datasets with sparse tensors have unknown output shapes.
+    base_dataset = dataset_ops.Dataset.from_tensors(tensor)
+    unoptimized = base_dataset.apply(batching.map_and_batch(lambda x: x, 2))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized = unoptimized.with_options(options)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = unoptimized.with_options(options)
+    self.assertDatasetsEqual(unoptimized, optimized)
+
+  def testOptimizationWithSparseTensor(self):
+    base_dataset = dataset_ops.Dataset.from_tensors(0)
+
+    def map_fn(x):
+      del x
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    # Datasets with sparse tensors have unknown output shapes.
+    unoptimized = base_dataset.apply(batching.map_and_batch(map_fn, 2))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized = unoptimized.with_options(options)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = unoptimized.with_options(options)
+    self.assertDatasetsEqual(unoptimized, optimized)
+
+  def testOptimizationWithPrefetch(self):
+    dataset = dataset_ops.Dataset.range(10)
+    dataset = dataset.map(lambda x: x)
+    dataset = dataset.prefetch(1)
+    dataset = dataset.batch(10)
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(dataset, [list(range(10))])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index 5c1ae7a98a2326f61518b1550d0678da50e78401..dd2031f7b02b468d8791a7647cc75c857a2d3cde 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -35,8 +35,8 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset.map(lambda x: x).apply(
         optimization.assert_next(["Model"]))
     options = dataset_ops.Options()
-    options.experimental_autotune = True
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = True
     dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index f44ade049b42d41d47db32b1bbb20b1485a244c2..a85e0cf801cda08cfe997c4ebce6497ae806aecd 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -29,7 +29,6 @@ from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -113,47 +112,35 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     get_next = self.getNext(dataset)
     self.evaluate(get_next())
 
-  def testOptimizationLargeInputFromTensor(self):
-    def dataset_fn(input_t):
-      dataset = dataset_ops.Dataset.from_tensors(input_t)
-      options = dataset_ops.Options()
-      options.experimental_optimization.apply_default_optimizations = False
-      return dataset.with_options(options)
-
-    if context.executing_eagerly():
-      input_t = np.ones([512, 1024, 1025], np.int32)
-      get_next = self.getNext(dataset_fn(input_t))
-      self.evaluate(get_next())
-    else:
-      input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
-      iterator = dataset_ops.make_initializable_iterator(dataset_fn(input_t))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      with self.cached_session() as sess:
-        sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
-        self.evaluate(get_next)
-
-  def testOptimizationLargeInputFromTensorSlices(self):
-    def dataset_fn(input_t):
-      dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
-      options = dataset_ops.Options()
-      options.experimental_optimization.apply_default_optimizations = False
-      return dataset.with_options(options)
-
-    if context.executing_eagerly():
-      input_t = np.ones([1, 512, 1024, 1025], np.int32)
-      get_next = self.getNext(dataset_fn(input_t))
-      self.evaluate(get_next())
-    else:
-      input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
-      iterator = dataset_ops.make_initializable_iterator(dataset_fn(input_t))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      with self.cached_session() as sess:
-        sess.run(init_op, {input_t: np.ones([1, 512, 1024, 1025], np.int32)})
-        self.evaluate(get_next)
+  @test_util.run_v1_only("b/123902160")
+  def testSkipEagerOptimizationLargeInputFromTensor(self):
+    input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
+    dataset = dataset_ops.Dataset.from_tensors(input_t)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
+      self.evaluate(get_next)
+
+  @test_util.run_v1_only("b/123902160")
+  def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
+    input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op, {input_t: np.ones([1, 512, 1024, 1025], np.int32)})
+      self.evaluate(get_next)
 
   def testOptimizationNestedDataset(self):
 
@@ -232,7 +219,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   @parameterized.named_parameters(_generate_captured_refvar_test_cases())
-  # Skip eager because RefVariables are not supported in eager mode.
+  @test_util.run_v1_only("RefVariables are not supported in eager mode.")
   def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
     """Tests that default optimizations are disabled with ref variables."""
     variable = variable_scope.get_variable(
@@ -249,7 +236,7 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_optimization.noop_elimination = True
       options.experimental_optimization.map_and_batch_fusion = True
       optimized_dataset = unoptimized_dataset.with_options(options)
-      optimized_it = optimized_dataset.make_initializable_iterator()
+      optimized_it = dataset_ops.make_initializable_iterator(optimized_dataset)
 
     self.assertGreaterEqual(len(w), 1)
     expected = ("tf.data static optimizations are not compatible with "
@@ -261,7 +248,8 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # Check that outputs are the same in the optimized and unoptimized cases,
     # when the variable value is changing.
-    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    unoptimized_it = dataset_ops.make_initializable_iterator(
+        unoptimized_dataset)
     with ops.control_dependencies([assign_op]):
       unoptimized_output = unoptimized_it.get_next()
       optimized_output = optimized_it.get_next()
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index 4dbb188f2cffa08ff47cb4bd85ea6d3672edd222..5d6787090a751200e51f6f72b7655a671f02ae7c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -103,9 +103,12 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         else np.asarray(input_tensor).size)
     for k, f in feature_val.items():
       if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
-        self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
+        self.assertEqual(
+            dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[0],
+            batch_size)
       elif isinstance(f, parsing_ops.VarLenFeature):
-        self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
+        self.assertEqual(
+            dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[1], None)
 
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index 238c5cd5060cafe7590fde72e4ac1e7b9b4ea6f4..8b5ddf403f45a09c52274a322d9a3901ce1f7278 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.platform import test
 # TODO(b/117581999): add eager coverage when supported.
 class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -42,12 +42,10 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -59,7 +57,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testPrefetchToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -70,12 +68,10 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -86,7 +82,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testPrefetchDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
@@ -96,12 +92,10 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element["a"].dtype)
     self.assertEqual([], next_element["a"].shape)
@@ -113,7 +107,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testPrefetchSparseTensorsToDevice(self):
     def make_tensor(i):
       return sparse_tensor.SparseTensorValue(
@@ -127,12 +121,10 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
 
@@ -146,6 +138,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -165,7 +158,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testPrefetchToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -175,12 +168,10 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
-    self.assertEqual(host_dataset.output_types, iterator.output_types)
-    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
-    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
-    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
-    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(device_dataset)))
+    self.assertTrue(dataset_ops.get_structure(host_dataset).is_compatible_with(
+        dataset_ops.get_structure(iterator)))
 
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
@@ -196,6 +187,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(next_element)
 
+  @test_util.deprecated_graph_mode_only
   def testPrefetchToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index f36f94c02fec98f95d9cb718ae2d1dd19905b454..a739e7485e5e0d9d3bb8ecd69aa8104960063ee3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -311,6 +311,76 @@ class TextLineDatasetTestBase(test_base.DatasetTestBase):
 class TFRecordDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing TFRecordDataset."""
 
+  def _interleave(self, iterators, cycle_length):
+    pending_iterators = iterators
+    open_iterators = []
+    num_open = 0
+    for i in range(cycle_length):
+      if pending_iterators:
+        open_iterators.append(pending_iterators.pop(0))
+        num_open += 1
+
+    while num_open:
+      for i in range(min(cycle_length, len(open_iterators))):
+        if open_iterators[i] is None:
+          continue
+        try:
+          yield next(open_iterators[i])
+        except StopIteration:
+          if pending_iterators:
+            open_iterators[i] = pending_iterators.pop(0)
+          else:
+            open_iterators[i] = None
+            num_open -= 1
+
+  def _next_expected_batch(self, file_indices, batch_size, num_epochs,
+                           cycle_length, drop_final_batch, use_parser_fn):
+
+    def _next_record(file_indices):
+      for j in file_indices:
+        for i in range(self._num_records):
+          yield j, i
+
+    def _next_record_interleaved(file_indices, cycle_length):
+      return self._interleave([_next_record([i]) for i in file_indices],
+                              cycle_length)
+
+    record_batch = []
+    batch_index = 0
+    for _ in range(num_epochs):
+      if cycle_length == 1:
+        next_records = _next_record(file_indices)
+      else:
+        next_records = _next_record_interleaved(file_indices, cycle_length)
+      for f, r in next_records:
+        record = self._record(f, r)
+        if use_parser_fn:
+          record = record[1:]
+        record_batch.append(record)
+        batch_index += 1
+        if len(record_batch) == batch_size:
+          yield record_batch
+          record_batch = []
+          batch_index = 0
+    if record_batch and not drop_final_batch:
+      yield record_batch
+
+  def _verify_records(self, outputs, batch_size, file_index, num_epochs,
+                      interleave_cycle_length, drop_final_batch, use_parser_fn):
+    if file_index is not None:
+      if isinstance(file_index, list):
+        file_indices = file_index
+      else:
+        file_indices = [file_index]
+    else:
+      file_indices = range(self._num_files)
+
+    for expected_batch in self._next_expected_batch(
+        file_indices, batch_size, num_epochs, interleave_cycle_length,
+        drop_final_batch, use_parser_fn):
+      actual_batch = self.evaluate(outputs())
+      self.assertAllEqual(expected_batch, actual_batch)
+
   def setUp(self):
     super(TFRecordDatasetTestBase, self).setUp()
     self._num_files = 2
diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..38d3abe6f9dfe2dafba410ecc68a62a9831d62b1
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
@@ -0,0 +1,324 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `_RebatchDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def _flat_shapes(dataset):
+  return nest.flatten(dataset_ops.get_legacy_output_shapes(dataset))
+
+
+@parameterized.named_parameters(("WithDropRemainder", True),
+                                ("WithoutDropRemainder", False))
+@test_util.run_all_in_graph_and_eager_modes
+class RebatchDatasetTest(test_base.DatasetTestBase):
+
+  def testBasic(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testScalarInputError(self, _):
+    dataset = dataset_ops.Dataset.range(1024)
+    with self.assertRaisesRegexp(ValueError, "at least one dimension"):
+      batching._RebatchDataset(dataset, num_workers=4)
+
+  def testNotDivisibleError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "not divisible by"):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=5)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
+
+  def testTupleOutput(self, drop_remainder):
+    dataset = (
+        dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(
+            32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [([k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        [k for k in range(i, i + 8)])
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testNestedDictionaryOutput(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).map(
+        lambda x: {"a": x, "b": {"c": x}}).batch(
+            32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [{"a": [k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        "b": {"c": [k for k in range(i, i + 8)]}}
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchOriginal(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1032).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1032, 8)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchAfterRebatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(34).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 32, 8)]  # pylint: disable=g-complex-comprehension
+    if not drop_remainder:
+      expected_output += [[32, 33]]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMultipleBatches(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(
+        4, drop_remainder=drop_remainder)
+    dataset = dataset.batch(8, drop_remainder=drop_remainder)
+    self.assertEqual(
+        [[8, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    # Each element is a list of 8 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, 4)
+    self.assertEqual(
+        [[2, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # Each element is a list of 2 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMapAndBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).apply(
+        batching.map_and_batch(
+            math_ops.square, 32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = [[k**2 for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testPaddedBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(4).padded_batch(
+        8, padded_shapes=[5], drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    # Each element is a list of 8 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+    self.assertEqual(
+        [[2, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # Each element is a list of 2 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenate(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[2 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = ([[i, i + 1] for i in range(0, 64, 2)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenateDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[None]], [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = ([[i, i + 1, i + 2, i + 3] for i in range(0, 64, 4)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZip(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[2], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZipDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[16], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    self.assertEqual(
+        [[4], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    expected_output = [([2 * i, 2 * i + 1, 2 * i + 2, 2 * i + 3], [i, i + 1])
+                       for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testUnsupportedTransformError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder).apply(
+            scan_ops.scan([0], lambda _, a: ([0], a)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
+
+  def testFlatMapBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).flat_map(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder))
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # Two elements where each element is a list of 4 elements where each element
+    # is a list of 8.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for _ in range(2)
+                       for i in range(0, 32, 8)]  # generates 4 elements
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testParallelInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2,
+                      num_parallel_calls=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(dataset)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice in collated fashion i.e [0...8], [0...8] etc.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 4d35b160fdc15e22b9b62718af9407978d20d7e2..063e12309080ed2776f217a532fde48b49a1c930 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -44,7 +44,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
     initial_dist = [0.2] * 5 if initial_known else None
-    classes = math_ops.to_int64(classes)  # needed for Windows build.
+    classes = math_ops.cast(classes, dtypes.int64)  # needed for Windows build.
     dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
         200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index ddac02b9e29fc54efd962d9697be66cd7e756354..88c14f0a6ead8e4c07487264bc17af0b6f81cc7b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -48,10 +48,11 @@ class RestructuredDatasetTest(test_base.DatasetTestBase):
       # pylint: disable=protected-access
       new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
       # pylint: enable=protected-access
-      self.assertEqual(new_types, new.output_types)
+      self.assertEqual(new_types, dataset_ops.get_legacy_output_types(new))
       if new_shape_lists is not None:
         for expected_shape_list, shape in zip(
-            nest.flatten(new_shape_lists), nest.flatten(new.output_shapes)):
+            nest.flatten(new_shape_lists),
+            nest.flatten(dataset_ops.get_legacy_output_shapes(new))):
           if expected_shape_list is None:
             self.assertIs(None, shape.ndims)
           else:
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 38e9b1e128157e4ff284ae0065ee474b20bad86c..24221a1f0f03e07e6ce7e7dae7638d357236fb8d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -108,9 +108,12 @@ class ScanTest(test_base.DatasetTestBase):
 
     dataset = dataset_ops.Dataset.from_tensors(0).repeat(5).apply(
         scan_ops.scan(([0], 1), _scan_fn))
-    self.assertEqual([None], dataset.output_shapes[0][0].as_list())
-    self.assertIs(None, dataset.output_shapes[0][1].ndims)
-    self.assertEqual([], dataset.output_shapes[1].as_list())
+    self.assertEqual(
+        [None], dataset_ops.get_legacy_output_shapes(dataset)[0][0].as_list())
+    self.assertIs(
+        None, dataset_ops.get_legacy_output_shapes(dataset)[0][1].ndims)
+    self.assertEqual(
+        [], dataset_ops.get_legacy_output_shapes(dataset)[1].as_list())
 
     next_element = self.getNext(dataset)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 4fd2a2ec4bfc4ca44b1b421bbe00ebf16bc55936..293dc4864cc193b2579ee41676efe423a0b14bac 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -93,6 +93,27 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_branch_dataset_serialization_test",
+    size = "medium",
+    srcs = ["choose_fastest_branch_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "choose_fastest_dataset_serialization_test",
     size = "small",
@@ -166,6 +187,29 @@ py_test(
     ],
 )
 
+py_test(
+    name = "auto_shard_dataset_serialization_test",
+    size = "medium",
+    srcs = ["auto_shard_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:distribute",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "filter_dataset_serialization_test",
     size = "medium",
@@ -408,6 +452,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "rebatch_dataset_serialization_test",
+    size = "small",
+    srcs = ["rebatch_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "padded_batch_dataset_serialization_test",
     size = "medium",
@@ -605,6 +667,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "shard_dataset_serialization_test",
+    size = "medium",
+    srcs = ["shard_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "shuffle_and_repeat_dataset_serialization_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/auto_shard_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/auto_shard_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bf83651ef5093726e007e06eb817a7303cc944b
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/auto_shard_dataset_serialization_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the _AutoShard dataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import distribute
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class AutoShardDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _record(self, f, r):
+    return compat.as_bytes("Record %d of file %d" % (r, f))
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(10):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(10):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def setUp(self):
+    self._filenames = self._createFiles()
+
+  def testCore(self):
+
+    def build_dataset():
+      dataset = dataset_ops.Dataset.list_files(self._filenames, shuffle=False)
+      dataset = dataset.apply(
+          interleave_ops.parallel_interleave(readers.TFRecordDataset, 10))
+      dataset = distribute._AutoShardDataset(dataset, 5, 3)
+      return dataset
+
+    self.run_core_tests(build_dataset, None, 20)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 8cc66d0c29392b206015ad886780d854fb2b5d5c..84b8e5ca3647a0597f6823249743a678900751b8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import iterator_ops
-from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -35,7 +34,8 @@ from tensorflow_estimator.python.estimator import estimator
 from tensorflow_estimator.python.estimator import model_fn
 
 
-class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
+@test_util.run_v1_only('b/123904664')
+class CheckpointInputPipelineHookTest(test.TestCase):
 
   @staticmethod
   def _model_fn(features, labels, mode, config):
@@ -69,7 +69,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
   def _build_iterator_saver_hook(self, est):
     return iterator_ops.CheckpointInputPipelineHook(est)
 
-  @test_util.run_deprecated_v1
   def testReturnDatasetFromInputFn(self):
 
     def _input_fn():
@@ -82,7 +81,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testBuildIteratorInInputFn(self):
 
     def _input_fn():
@@ -97,7 +95,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testDoNotRestore(self):
 
     def _input_fn():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_branch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_branch_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaedcae421014ab52a2c56740e91c669594c579a
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_branch_dataset_serialization_test.py
@@ -0,0 +1,104 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ChooseFastestBranchDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ChooseFastestBranchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+
+    def build_ds(size):
+      dataset = dataset_ops.Dataset.range(size)
+
+      def branch_0(dataset):
+        return dataset.map(lambda x: x).batch(10)
+
+      def branch_1(dataset):
+        return dataset.batch(10).map(lambda x: x)
+
+      return optimization._ChooseFastestBranchDataset(  # pylint: disable=protected-access
+          dataset, [branch_0, branch_1],
+          ratio_numerator=10)
+
+    for size in [100, 1000]:
+      self.run_core_tests(lambda: build_ds(size), None, size // 10)  # pylint: disable=cell-var-from-loop
+
+  def testWithCapture(self):
+
+    def build_ds():
+      dataset = dataset_ops.Dataset.range(10)
+      const_64 = constant_op.constant(1, dtypes.int64)
+      const_32 = constant_op.constant(1, dtypes.int32)
+
+      def branch_0(dataset):
+        return dataset.map(lambda x: x + const_64)
+
+      def branch_1(dataset):
+        return dataset.map(lambda x: x + math_ops.cast(const_32, dtypes.int64))
+
+      return optimization._ChooseFastestBranchDataset(
+          dataset, [branch_0, branch_1], num_elements_per_branch=3)
+
+    self.run_core_tests(build_ds, None, 10)
+
+  def testWithPrefetch(self):
+
+    def build_ds():
+      dataset = dataset_ops.Dataset.range(10)
+      const_64 = constant_op.constant(1, dtypes.int64)
+      const_32 = constant_op.constant(1, dtypes.int32)
+
+      def branch_0(dataset):
+        return dataset.map(lambda x: x + const_64)
+
+      def branch_1(dataset):
+        return dataset.map(lambda x: x + math_ops.cast(const_32, dtypes.int64))
+
+      return optimization._ChooseFastestBranchDataset(
+          dataset, [branch_0, branch_1], num_elements_per_branch=3)
+
+    self.run_core_tests(build_ds, None, 10)
+
+  def testWithMoreOutputThanInput(self):
+
+    def build_ds():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(1000).batch(100)
+
+      def branch(dataset):
+        return dataset.apply(batching.unbatch())
+
+      return optimization._ChooseFastestBranchDataset(
+          dataset, [branch, branch],
+          ratio_denominator=10,
+          num_elements_per_branch=100)
+
+    self.run_core_tests(build_ds, None, 1000)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index bdbd8702b7f8d315a730c5cd2b000218ea5e19be..0398effdc1db8aa91719924458a54e4eb855d93b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,7 +23,6 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
@@ -78,7 +77,6 @@ class DatasetSerializationTestBase(test.TestCase):
     # NOTE: We disable all default optimizations in serialization tests in order
     # to test the actual dataset in question.
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.apply_default_optimizations = False
 
     def ds_fn1_no_opt():
@@ -670,15 +668,15 @@ class DatasetSerializationTestBase(test.TestCase):
 
   def _get_output_types(self, ds_fn):
     with ops.Graph().as_default():
-      return ds_fn().output_types
+      return dataset_ops.get_legacy_output_types(ds_fn())
 
   def _get_output_shapes(self, ds_fn):
     with ops.Graph().as_default():
-      return ds_fn().output_shapes
+      return dataset_ops.get_legacy_output_shapes(ds_fn())
 
   def _get_output_classes(self, ds_fn):
     with ops.Graph().as_default():
-      return ds_fn().output_classes
+      return dataset_ops.get_legacy_output_classes(ds_fn())
 
   def _ckpt_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
index c30534a9e9a8986a17f2445a89f5d88fc960906d..e18cfa5002d6ca67a187bc27ce27eee886c4a0a8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/flat_map_dataset_serialization_test.py
@@ -52,7 +52,7 @@ class FlatMapDatasetSerializationTest(
       def flat_map_fn(_):
 
         def map_fn(y):
-          return 10 * math_ops.to_int32(y)
+          return 10 * math_ops.cast(y, dtypes.int32)
 
         return dataset_ops.Dataset.range(100).map(map_fn)
 
@@ -68,7 +68,7 @@ class FlatMapDatasetSerializationTest(
 
         @function.Defun(dtypes.int64)
         def defun_fn(x):
-          return constant_op.constant(1000) + math_ops.to_int32(x)
+          return constant_op.constant(1000) + math_ops.cast(x, dtypes.int32)
 
         return dataset_ops.Dataset.from_tensor_slices([defun_fn(x)])
 
@@ -94,7 +94,7 @@ class FlatMapDatasetSerializationTest(
 
         def map_fn(x):
           return random_ops.random_uniform(
-              (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+              (), 0, 10, dtype=dtypes.int32) * math_ops.cast(x, dtypes.int32)
 
         return dataset_ops.Dataset.range(100).map(map_fn)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
index b93156a96cb34b5cfb46bfd5ce0aa24a91de1880..a8667c2aad0bd7c6d354dfbc0ba8860feb9653c0 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_dataset_serialization_test.py
@@ -64,7 +64,7 @@ class MapDatasetSerializationTest(
 
       def _map_fn(x):
         return random_ops.random_uniform(
-            (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+            (), 0, 10, dtype=dtypes.int32) * math_ops.cast(x, dtypes.int32)
 
       return dataset_ops.Dataset.range(100).map(_map_fn)
 
@@ -96,7 +96,7 @@ class MapDatasetSerializationTest(
 
       @function.Defun(dtypes.int64)
       def defun_fn(x):
-        return constant_op.constant(1000) + math_ops.to_int32(x)
+        return constant_op.constant(1000) + math_ops.cast(x, dtypes.int32)
 
       return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
@@ -112,9 +112,10 @@ class MapDatasetSerializationTest(
 
         @function.Defun(dtypes.int32)
         def defun_fn_deep(x):
-          return constant_op.constant(1000) + math_ops.to_int32(x)
+          return constant_op.constant(1000) + math_ops.cast(x, dtypes.int32)
 
-        return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
+        return constant_op.constant(11000) + defun_fn_deep(
+            math_ops.cast(x, dtypes.int32))
 
       return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
index ed4a1da59679c8c85141cb38e46ad95441b71b73..aaa46bacefed1865ff85bf5478fbd0f22c65c227 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
@@ -34,6 +34,20 @@ class OptimizeDatasetSerializationTest(
 
     self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
 
+  def testWithNewFunction(self):
+    """Tests that optimized datasets with new functions work."""
+
+    def build_dataset():
+      dataset = dataset_ops.Dataset.range(100)
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(5)
+      # map_vectorization adds a new vectorized function to the function
+      # library.
+      dataset = dataset.apply(optimization.optimize(["map_vectorization"]))
+      return dataset
+
+    self.run_core_tests(build_dataset, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
index a0bdd4fa59bba7db6064d20b1ce991caec6a0dba..4e4ed6870464bf52a0c8edc7d888c2cfbe63ba65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/parallel_map_dataset_serialization_test.py
@@ -74,7 +74,7 @@ class ParallelMapDatasetSerializationTest(
 
       def _map_fn(x):
         return random_ops.random_uniform(
-            (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+            (), 0, 10, dtype=dtypes.int32) * math_ops.cast(x, dtypes.int32)
 
       return dataset_ops.Dataset.range(100).map(
           _map_fn, num_parallel_calls=2).prefetch(2)
@@ -108,7 +108,7 @@ class ParallelMapDatasetSerializationTest(
 
       @function.Defun(dtypes.int64)
       def defun_fn(x):
-        return constant_op.constant(1000) + math_ops.to_int32(x)
+        return constant_op.constant(1000) + math_ops.cast(x, dtypes.int32)
 
       return dataset_ops.Dataset.range(num_outputs).map(
           defun_fn, num_parallel_calls=2).prefetch(2)
@@ -125,9 +125,10 @@ class ParallelMapDatasetSerializationTest(
 
         @function.Defun(dtypes.int32)
         def defun_fn_deep(x):
-          return constant_op.constant(1000) + math_ops.to_int32(x)
+          return constant_op.constant(1000) + math_ops.cast(x, dtypes.int32)
 
-        return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
+        return constant_op.constant(11000) + defun_fn_deep(
+            math_ops.cast(x, dtypes.int32))
 
       return dataset_ops.Dataset.range(num_outputs).map(
           defun_fn, num_parallel_calls=2).prefetch(2)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30db589069a26cf9f5322e3bde498413ca39108
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
@@ -0,0 +1,41 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the _RebatchDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class RebatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+
+    def build_dataset(num_elements, batch_size):
+      return batching._RebatchDataset(
+          dataset_ops.Dataset.range(num_elements).batch(
+              4 * batch_size, drop_remainder=True),
+          num_workers=4)
+
+    self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99674b6910312c35f065fc3dd2cdd738fe544615
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
@@ -0,0 +1,42 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ShardDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class ShardDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase,
+    parameterized.TestCase):
+
+  def _build_dataset(self, num_elements, num_shards, index):
+    return dataset_ops.Dataset.range(num_elements).shard(num_shards, index)
+
+  @parameterized.parameters((10, 5, 2, 3), (10, 10, 0, 9), (100, 2, 0, 1))
+  def testCore(self, elems, num_shards, index1, index2):
+    self.run_core_tests(lambda: self._build_dataset(elems, num_shards, index1),
+                        lambda: self._build_dataset(elems, num_shards, index2),
+                        elems // num_shards)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
index a4fe847f04baa0f8dd7c45bae4e02617e33053ca..4733c2a8330c377a6860c4207f6b50b7d83dc9ef 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -33,6 +33,7 @@ _NUMPY_RANDOM_SEED = 42
 class SleepTest(test_base.DatasetTestBase):
 
   def testSleep(self):
+    self.skipTest("b/123597912")
     sleep_microseconds = 100
     dataset = dataset_ops.Dataset.range(10).apply(
         sleep.sleep(sleep_microseconds))
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 8b330559f5f927bed1c0a206c962f4350868b276..a97824a3b70f75007c3bff2801802b40e41f4fdf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -35,7 +35,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
 def function_set_stats_aggregator(dataset,
                                   aggregator,
                                   prefix="",
@@ -53,6 +52,7 @@ def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   return dataset.with_options(options)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
     ("SetStatsAggregator", function_set_stats_aggregator),
     ("StatsOptions", function_apply_options),
@@ -66,7 +66,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
             stats_ops.bytes_produced_stats("bytes_produced"))
     dataset = dataset_transformation(dataset, aggregator)
     next_element = self.getNext(dataset, requires_initialization=True)
-    summary_t = aggregator.get_summary()
 
     expected_sum = 0.0
     for i in range(100):
@@ -78,8 +77,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
-    # TODO(shivaniagrawal): ntentional breaking case
-    summary_str = self.evaluate(summary_t)
+    summary_str = self.evaluate(aggregator.get_summary())
     self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
     self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
@@ -110,17 +108,24 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       self.assertAllEqual(
           np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
       summary_str = self.evaluate(aggregator.get_summary())
-      self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                  float(i + 1))
-      self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
-      self._assertSummaryContains(summary_str, "Prefetch::buffer_size")
-      self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
-                                  0, 1)
+      self._assertSummaryHasCount(
+          summary_str,
+          self.regexForNodeName("PrefetchDataset", "buffer_utilization"),
+          float(i + 1))
+      self._assertSummaryContains(
+          summary_str,
+          self.regexForNodeName("PrefetchDataset", "buffer_capacity"))
+      self._assertSummaryContains(
+          summary_str, self.regexForNodeName("PrefetchDataset", "buffer_size"))
+      self._assertSummaryHasRange(
+          summary_str,
+          self.regexForNodeName("PrefetchDataset", "buffer_utilization"), 0, 1)
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     summary_str = self.evaluate(aggregator.get_summary())
-    self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                100)
+    self._assertSummaryHasCount(
+        summary_str,
+        self.regexForNodeName("PrefetchDataset", "buffer_utilization"), 100)
 
   def testPrefetchBufferScalars(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
@@ -133,9 +138,12 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
       self.assertAllEqual(
           np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
       summary_str = self.evaluate(aggregator.get_summary())
-      self._assertSummaryHasScalarValue(summary_str,
-                                        "Prefetch::buffer_capacity", 1)
-      self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size", 1)
+      self._assertSummaryHasScalarValue(
+          summary_str,
+          self.regexForNodeName("PrefetchDataset", "buffer_capacity"), 1)
+      self._assertSummaryHasScalarValue(
+          summary_str, self.regexForNodeName("PrefetchDataset", "buffer_size"),
+          1)
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
 
@@ -149,19 +157,24 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     for i in range(34):
       self.assertEqual(i * 3, self.evaluate(next_element()))
       summary_str = self.evaluate(aggregator.get_summary())
-      if i is not 0:
-        self._assertSummaryHasScalarValue(summary_str,
-                                          "Filter::dropped_elements",
-                                          float(i * 2))
+      if i != 0:
+        self._assertSummaryHasScalarValue(
+            summary_str,
+            self.regexForNodeName("FilterDataset", "dropped_elements"),
+            float(i * 2))
       self._assertSummaryHasScalarValue(
-          summary_str, "Filter::filtered_elements", float(i + 1))
+          summary_str,
+          self.regexForNodeName("FilterDataset", "filtered_elements"),
+          float(i + 1))
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     summary_str = self.evaluate(aggregator.get_summary())
-    self._assertSummaryHasScalarValue(summary_str, "Filter::dropped_elements",
-                                      67.0)
-    self._assertSummaryHasScalarValue(summary_str, "Filter::filtered_elements",
-                                      34.0)
+    self._assertSummaryHasScalarValue(
+        summary_str, self.regexForNodeName("FilterDataset", "dropped_elements"),
+        67.0)
+    self._assertSummaryHasScalarValue(
+        summary_str, self.regexForNodeName("FilterDataset",
+                                           "filtered_elements"), 34.0)
 
   def testMapBufferUtilization(self, dataset_transformation):
 
@@ -171,8 +184,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
           num_parallel_calls=4)
 
     self._testParallelCallsStats(
-        dataset_fn,
-        "ParallelMap",
+        dataset_fn, {self.regexForNodeName("ParallelMapDataset")},
         10,
         dataset_transformation,
         function_processing_time=True)
@@ -180,16 +192,12 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
   def testMapAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
-      dataset = dataset_ops.Dataset.range(10).map(
+      return dataset_ops.Dataset.range(10).map(
           lambda x: array_ops.tile([x], ops.convert_to_tensor([x])),
           num_parallel_calls=optimization.AUTOTUNE)
-      options = dataset_ops.Options()
-      options.experimental_autotune = True
-      return dataset.with_options(options)
 
     self._testParallelCallsStats(
-        dataset_fn,
-        "ParallelMap",
+        dataset_fn, {self.regexForNodeName("ParallelMapDataset")},
         10,
         dataset_transformation,
         function_processing_time=True)
@@ -202,33 +210,27 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         return dataset_ops.Dataset.range(
             10).map(lambda x: array_ops.tile([x], ops.convert_to_tensor([x])))
 
-      dataset = dataset_ops.Dataset.range(1).interleave(
+      return dataset_ops.Dataset.range(1).interleave(
           interleave_fn,
           cycle_length=1,
           num_parallel_calls=optimization.AUTOTUNE)
-      options = dataset_ops.Options()
-      options.experimental_autotune = True
-      return dataset.with_options(options)
 
-    self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
-                                 dataset_transformation)
+    self._testParallelCallsStats(
+        dataset_fn, {self.regexForNodeName("ParallelInterleaveDatasetV2")}, 10,
+        dataset_transformation)
 
   def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
-      dataset = dataset_ops.Dataset.range(100).apply(
+      return dataset_ops.Dataset.range(100).apply(
           batching.map_and_batch(
               lambda x: array_ops.tile([x], ops.convert_to_tensor([2])),
               num_parallel_calls=optimization.AUTOTUNE,
               batch_size=16))
-      options = dataset_ops.Options()
-      options.experimental_autotune = True
-      return dataset.with_options(options)
 
     num_output = 100 // 16 + 1
     self._testParallelCallsStats(
-        dataset_fn,
-        "MapAndBatch",
+        dataset_fn, {self.regexForNodeName("ExperimentalMapAndBatchDataset")},
         num_output,
         dataset_transformation,
         check_elements=False,
@@ -356,14 +358,38 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         self.evaluate(aggregator.get_summary()), "dataset2_record_latency",
         100.0)
 
+  def testMultiplePrefetchStats(self, dataset_transformation):
 
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_ops.Dataset.range(10).prefetch(
+        2).map(lambda x: math_ops.add(x, 2)).prefetch(1)
+
+    dataset = dataset_transformation(dataset, aggregator)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(10):
+      self.assertEqual(i + 2, self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      # TODO(shivaniagarwal): using exact name of prefetch node than the regex,
+      # to differentiate between two prefetch. This might break in future, at
+      # which point, it would be best to disable this test.
+      self._assertSummaryHasScalarValue(
+          summary_str, "PrefetchDataset/_5::buffer_capacity", 2)
+      self._assertSummaryContains(summary_str,
+                                  "PrefetchDataset/_5::buffer_size")
+      self._assertSummaryHasScalarValue(
+          summary_str, "PrefetchDataset/_8::buffer_capacity", 1)
+      self._assertSummaryContains(summary_str,
+                                  "PrefetchDataset/_8::buffer_size")
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
-    dict(
-        testcase_name="SetStatsAggregator",
-        dataset_transformation=function_set_stats_aggregator),
-    dict(
-        testcase_name="StatsOptions",
-        dataset_transformation=function_apply_options))
+    ("SetStatsAggregator", function_set_stats_aggregator),
+    ("StatsOptions", function_apply_options)
+)
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
@@ -388,8 +414,7 @@ class FeatureStatsDatasetTest(
       num_output = total_records // batch_size + 1
 
     self._testParallelCallsStats(
-        dataset_fn,
-        "ParseExample",
+        dataset_fn, {self.regexForNodeName("ExperimentalParseExampleDataset")},
         num_output,
         dataset_transformation,
         check_elements=False)
@@ -405,16 +430,21 @@ class FeatureStatsDatasetTest(
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     self._assertSummaryHasCount(
-        self.evaluate(aggregator.get_summary()), "record_stats_features",
-        total_records)
+        self.evaluate(aggregator.get_summary()),
+        self.regexForNodeName("record_stats_ExperimentalParseExampleDataset",
+                              "features_count"), total_records)
     self._assertSummaryHasCount(
-        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
-        total_records)
+        self.evaluate(aggregator.get_summary()),
+        self.regexForNodeName("record_stats_ExperimentalParseExampleDataset",
+                              "feature_values_count"), total_records)
     self._assertSummaryHasSum(
-        self.evaluate(aggregator.get_summary()), "record_stats_features",
-        total_records * 4)
+        self.evaluate(aggregator.get_summary()),
+        self.regexForNodeName("record_stats_ExperimentalParseExampleDataset",
+                              "features_count"), total_records * 4)
     self._assertSummaryHasSum(
-        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
+        self.evaluate(aggregator.get_summary()),
+        self.regexForNodeName("record_stats_ExperimentalParseExampleDataset",
+                              "feature_values_count"),
         self._sum_keywords(1) * num_epochs + 3 * total_records)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index f5a15f4c848c536ac07636469ea1f8b762bd317e..f55b96a903e51dc88c12a169259242287084f7b8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import numpy as np
 
 from tensorflow.core.framework import summary_pb2
@@ -28,30 +29,30 @@ from tensorflow.python.framework import errors
 class StatsDatasetTestBase(test_base.DatasetTestBase):
   """Base class for testing statistics gathered in `StatsAggregator`."""
 
-  def _assertSummaryContains(self, summary_str, tag):
-    summary_proto = summary_pb2.Summary()
-    summary_proto.ParseFromString(summary_str)
-    for value in summary_proto.value:
-      if tag == value.tag:
-        return
-    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+  def regexForNodeName(self, op_name, stats_type=""):
+    return "".join([op_name, r"/_\d+::", stats_type])
 
-  def _assertSummaryHasCount(self, summary_str, tag, expected_value):
+  def _assertSummaryContains(self, summary_str, tag):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
     for value in summary_proto.value:
-      if tag == value.tag:
-        self.assertEqual(expected_value, value.histo.num)
+      if re.match(tag, value.tag):
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
-  def _assertSummaryHasCountMoreOrEqualGeneralisedTag(self, summary_str, tag,
-                                                      expected_value):
+  def _assertSummaryHasCount(self,
+                             summary_str,
+                             tag,
+                             expected_value,
+                             greater_than=False):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
     for value in summary_proto.value:
-      if tag in value.tag:
-        self.assertGreaterEqual(value.histo.num, expected_value)
+      if re.match(tag, value.tag):
+        if greater_than:
+          self.assertGreaterEqual(value.histo.num, expected_value)
+        else:
+          self.assertEqual(expected_value, value.histo.num)
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
@@ -59,7 +60,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
     for value in summary_proto.value:
-      if tag == value.tag:
+      if re.match(tag, value.tag):
         self.assertLessEqual(min_value, value.histo.min)
         self.assertGreaterEqual(max_value, value.histo.max)
         return
@@ -69,7 +70,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
     for value in summary_proto.value:
-      if tag == value.tag:
+      if re.match(tag, value.tag):
         self.assertEqual(expected_value, value.histo.sum)
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
@@ -78,14 +79,14 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
     for value in summary_proto.value:
-      if tag == value.tag:
+      if re.match(tag, value.tag):
         self.assertEqual(expected_value, value.simple_value)
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
   def _testParallelCallsStats(self,
                               dataset_fn,
-                              dataset_name,
+                              dataset_names,
                               num_output,
                               dataset_transformation,
                               function_processing_time=False,
@@ -100,14 +101,22 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
       if check_elements:
         self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
       summary_str = self.evaluate(aggregator.get_summary())
-      if function_processing_time:
-        self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-            summary_str, "::execution_time", float(i + 1))
-      self._assertSummaryContains(summary_str,
-                                  dataset_name + "::thread_utilization")
+      for dataset_name in dataset_names:
+        if function_processing_time:
+          self._assertSummaryHasCount(
+              summary_str,
+              r"(.*)::execution_time$",
+              float(i + 1),
+              greater_than=True)
+        self._assertSummaryContains(summary_str,
+                                    dataset_name + "thread_utilization")
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
     if function_processing_time:
       summary_str = self.evaluate(aggregator.get_summary())
-      self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-          summary_str, "::execution_time", float(num_output))
+      for dataset_name in dataset_names:
+        self._assertSummaryHasCount(
+            summary_str,
+            r"(.*)::execution_time$",
+            float(num_output),
+            greater_than=True)
diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
index 14a4241ec2e6930622aaf9e35ae70e18eaaa004f..783b2e6e22ae618f255673011d72201c993e0a85 100644
--- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.experimental.ops import writers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.lib.io import tf_record
@@ -94,6 +95,20 @@ class TFRecordWriterTest(test_base.DatasetTestBase):
     with self.assertRaises(TypeError):
       writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
 
+  def testSideEffect(self):
+    def writer_fn():
+      input_dataset = readers.TFRecordDataset(self._createFile())
+      return writers.TFRecordWriter(self._outputFilename()).write(input_dataset)
+
+    @function.defun
+    def fn():
+      _ = writer_fn()
+      return "hello"
+
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index 613fe0da6b3d3db81a969a3cea261f238951fab4..dfcc14e960e28532c50173e3b0248bece031d60f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -49,9 +49,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = dataset_ops.Dataset.from_tensor_slices(data)
     expected_types = (dtypes.int32,) * 3
     data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
     data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
 
     self.assertDatasetProduces(data, [(i,) * 3 for i in range(10)])
 
@@ -61,9 +61,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
     expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
     data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
     data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
 
     self.assertDatasetProduces(
         data, [(i, compat.as_bytes(str(i)), i) for i in range(10)])
@@ -100,9 +100,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = dataset_ops.Dataset.from_tensor_slices(data)
     expected_types = ((dtypes.int32,),) * 3
     data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
     data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
+    self.assertEqual(expected_types, dataset_ops.get_legacy_output_types(data))
 
     self.assertDatasetProduces(data, [((i,),) * 3 for i in range(10)])
 
@@ -112,9 +112,11 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = dataset_ops.Dataset.from_tensor_slices(data)
     expected_types = ((dtypes.int32, dtypes.string),) * 3
     data = data.batch(2)
-    self.assertAllEqual(expected_types, data.output_types)
+    self.assertAllEqual(expected_types,
+                        dataset_ops.get_legacy_output_types(data))
     data = data.apply(batching.unbatch())
-    self.assertAllEqual(expected_types, data.output_types)
+    self.assertAllEqual(expected_types,
+                        dataset_ops.get_legacy_output_types(data))
 
     self.assertDatasetProduces(
         data,
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
index a8f50501517c24d5aea78d7dda18240f54921197..e6e77575a6f98b5becc9fe4ceb3126e22403b471 100644
--- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -31,7 +31,7 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase):
 
   def testBasic(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
 
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
     unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
@@ -42,10 +42,10 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase):
     for i in range(100):
       self.assertEqual(i, self.evaluate(get_next()))
 
-  # TODO(b/117581999): add eager coverage when supported.
+  @test_util.run_v1_only("b/123901304")
   def testSkipEagerGPU(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
 
     with ops.device("/gpu:0"):
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 56bf59344f8881d96525c197268ad9dac988166a..d79d45272e85ebd8f043b74a6aa69a60b7e8e908 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -140,24 +140,25 @@ py_library(
 )
 
 py_library(
-    name = "enumerate_ops",
-    srcs = ["enumerate_ops.py"],
+    name = "distribute",
+    srcs = [
+        "distribute.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
 py_library(
-    name = "filter_for_shard_ops",
-    srcs = ["filter_for_shard_ops.py"],
+    name = "enumerate_ops",
+    srcs = ["enumerate_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -450,9 +451,9 @@ py_library(
         ":batching",
         ":cardinality",
         ":counter",
+        ":distribute",
         ":enumerate_ops",
         ":error_ops",
-        ":filter_for_shard_ops",
         ":get_single_element",
         ":grouping",
         ":indexed_dataset_ops",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index f0cf7f0a9954044e20a1487fb357aa8b4c974263..5ad917eec4053b97b9969950fa592f130414fc33 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
@@ -38,6 +39,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -50,14 +52,15 @@ def batch_window(dataset):
   Returns:
     A `Tensor` representing the batch of the entire input dataset.
   """
-  if isinstance(dataset.output_classes, tuple):
+  dataset_output_classes = dataset_ops.get_legacy_output_classes(dataset)
+  if isinstance(dataset_output_classes, tuple):
     raise TypeError("Input dataset expected to have a single component")
-  if dataset.output_classes is ops.Tensor:
+  if dataset_output_classes is ops.Tensor:
     return _batch_dense_window(dataset)
-  elif dataset.output_classes is sparse_tensor.SparseTensor:
+  elif dataset_output_classes is sparse_tensor.SparseTensor:
     return _batch_sparse_window(dataset)
   else:
-    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
+    raise TypeError("Unsupported dataset type: %s" % dataset_output_classes)
 
 
 def _batch_dense_window(dataset):
@@ -76,8 +79,9 @@ def _batch_dense_window(dataset):
   def finalize_fn(state):
     return state
 
-  if dataset.output_shapes.is_fully_defined():
-    shape = dataset.output_shapes
+  dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+  if dataset_output_shapes.is_fully_defined():
+    shape = dataset_output_shapes
   else:
     first_element = get_single_element.get_single_element(dataset.take(1))
     shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
@@ -87,7 +91,8 @@ def _batch_dense_window(dataset):
 
   def batch_init_fn(_):
     batch_shape = array_ops.concat([[0], shape], 0)
-    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
+    return gen_array_ops.empty(
+        batch_shape, dtype=dataset_ops.get_legacy_output_types(dataset))
 
   def batch_reduce_fn(state, value):
     return array_ops.concat([state, [value]], 0)
@@ -113,8 +118,9 @@ def _batch_sparse_window(dataset):
   def finalize_fn(state):
     return state
 
-  if dataset.output_shapes.is_fully_defined():
-    shape = dataset.output_shapes
+  dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+  if dataset_output_shapes.is_fully_defined():
+    shape = dataset_output_shapes
   else:
     first_element = get_single_element.get_single_element(dataset.take(1))
     shape_reducer = grouping.Reducer(shape_init_fn, shape_reduce_fn,
@@ -126,7 +132,8 @@ def _batch_sparse_window(dataset):
     indices_shape = array_ops.concat([[0], [array_ops.size(shape) + 1]], 0)
     return sparse_tensor.SparseTensor(
         indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
+        values=constant_op.constant(
+            [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset)),
         dense_shape=array_ops.concat(
             [np.array([0], dtype=np.int64),
              math_ops.cast(shape, dtypes.int64)], 0))
@@ -217,17 +224,18 @@ def padded_batch_window(dataset, padded_shape, padding_value=None):
   Raises:
     ValueError: if invalid arguments are provided.
   """
-  if not issubclass(dataset.output_classes,
+  dataset_output_classes = dataset_ops.get_legacy_output_classes(dataset)
+  if not issubclass(dataset_output_classes,
                     (ops.Tensor, sparse_tensor.SparseTensor)):
     raise TypeError("Input dataset expected to have a single tensor component")
-  if issubclass(dataset.output_classes, (ops.Tensor)):
+  if issubclass(dataset_output_classes, (ops.Tensor)):
     return _padded_batch_dense_window(dataset, padded_shape, padding_value)
-  elif issubclass(dataset.output_classes, (sparse_tensor.SparseTensor)):
+  elif issubclass(dataset_output_classes, (sparse_tensor.SparseTensor)):
     if padding_value is not None:
       raise ValueError("Padding value not allowed for sparse tensors")
     return _padded_batch_sparse_window(dataset, padded_shape)
   else:
-    raise TypeError("Unsupported dataset type: %s" % dataset.output_classes)
+    raise TypeError("Unsupported dataset type: %s" % dataset_output_classes)
 
 
 def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
@@ -263,12 +271,13 @@ def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
   padded_shape = get_single_element.get_single_element(
       dataset.apply(grouping.group_by_reducer(key_fn, max_reducer)))
 
+  dataset_output_types = dataset_ops.get_legacy_output_types(dataset)
   if padding_value is None:
-    if dataset.output_types == dtypes.string:
+    if dataset_output_types == dtypes.string:
       padding_value = ""
-    elif dataset.output_types == dtypes.bool:
+    elif dataset_output_types == dtypes.bool:
       padding_value = False
-    elif dataset.output_types == dtypes.variant:
+    elif dataset_output_types == dtypes.variant:
       raise TypeError("Unable to create padding for field of type 'variant'")
     else:
       padding_value = 0
@@ -276,7 +285,7 @@ def _padded_batch_dense_window(dataset, padded_shape, padding_value=None):
   def batch_init_fn(_):
     batch_shape = array_ops.concat(
         [np.array([0], dtype=np.int32), padded_shape], 0)
-    return gen_array_ops.empty(batch_shape, dtype=dataset.output_types)
+    return gen_array_ops.empty(batch_shape, dtype=dataset_output_types)
 
   def batch_reduce_fn(state, value):
     return array_ops.concat([state, [value]], 0)
@@ -329,7 +338,8 @@ def _padded_batch_sparse_window(dataset, padded_shape):
                                      0)
     return sparse_tensor.SparseTensor(
         indices=gen_array_ops.empty(indices_shape, dtype=dtypes.int64),
-        values=constant_op.constant([], shape=[0], dtype=dataset.output_types),
+        values=constant_op.constant(
+            [], shape=[0], dtype=dataset_ops.get_legacy_output_types(dataset)),
         dense_shape=array_ops.concat(
             [np.array([0], dtype=np.int64), padded_shape], 0))
 
@@ -352,7 +362,8 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset):
     """See `unbatch()` for more details."""
-    flat_shapes = nest.flatten(input_dataset.output_shapes)
+    input_shapes = dataset_ops.get_legacy_output_shapes(input_dataset)
+    flat_shapes = nest.flatten(input_shapes)
     if any(s.ndims == 0 for s in flat_shapes):
       raise ValueError("Cannot unbatch an input with scalar components.")
     known_batch_dim = tensor_shape.Dimension(None)
@@ -365,9 +376,9 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
 
     self._structure = structure.convert_legacy_structure(
-        input_dataset.output_types,
-        nest.map_structure(lambda s: s[1:], input_dataset.output_shapes),
-        input_dataset.output_classes)
+        dataset_ops.get_legacy_output_types(input_dataset),
+        nest.map_structure(lambda s: s[1:], input_shapes),
+        dataset_ops.get_legacy_output_classes(input_dataset))
 
     variant_tensor = ged_ops.experimental_unbatch_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -425,9 +436,9 @@ def unbatch():
     # original dataset.
     restructured_dataset = _RestructuredDataset(
         normalized_dataset,
-        dataset.output_types,
-        dataset.output_shapes,
-        dataset.output_classes,
+        dataset_ops.get_legacy_output_types(dataset),
+        dataset_ops.get_legacy_output_shapes(dataset),
+        dataset_ops.get_legacy_output_classes(dataset),
         allow_unsafe_cast=True)
     return _UnbatchDataset(restructured_dataset)
 
@@ -439,15 +450,16 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, batch_size, row_shape):
     """See `Dataset.dense_to_sparse_batch()` for more details."""
-    if not isinstance(input_dataset.output_types, dtypes.DType):
+    if not isinstance(
+        dataset_ops.get_legacy_output_types(input_dataset), dtypes.DType):
       raise TypeError("DenseToSparseDataset requires an input whose elements "
                       "have a single component, whereas the input has %r." %
-                      input_dataset.output_types)
+                      dataset_ops.get_legacy_output_types(input_dataset))
     self._input_dataset = input_dataset
     self._batch_size = batch_size
     self._row_shape = row_shape
     self._structure = structure.SparseTensorStructure(
-        input_dataset.output_types,
+        dataset_ops.get_legacy_output_types(input_dataset),
         tensor_shape.vector(None).concatenate(self._row_shape))
 
     variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
@@ -500,25 +512,28 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
     """
     self._input_dataset = dataset
 
+    input_types = dataset_ops.get_legacy_output_types(dataset)
     if not allow_unsafe_cast:
       # Validate that the types are compatible.
       output_types = nest.map_structure(dtypes.as_dtype, output_types)
-      flat_original_types = nest.flatten(dataset.output_types)
+      flat_original_types = nest.flatten(input_types)
       flat_new_types = nest.flatten(output_types)
       if flat_original_types != flat_new_types:
         raise ValueError(
             "Dataset with output types %r cannot be restructured to have "
-            "output types %r" % (dataset.output_types, output_types))
+            "output types %r" %
+            (dataset_ops.get_legacy_output_types(dataset), output_types))
 
+    input_shapes = dataset_ops.get_legacy_output_shapes(dataset)
     if output_shapes is None:
       # Inherit shapes from the original `dataset`.
       output_shapes = nest.pack_sequence_as(
-          output_types, nest.flatten(dataset.output_shapes))
+          output_types, nest.flatten(input_shapes))
     else:
       if not allow_unsafe_cast:
         # Validate that the shapes are compatible.
         nest.assert_same_structure(output_types, output_shapes)
-        flat_original_shapes = nest.flatten(dataset.output_shapes)
+        flat_original_shapes = nest.flatten(input_shapes)
         flat_new_shapes = nest.flatten_up_to(output_types, output_shapes)
 
         for original_shape, new_shape in zip(flat_original_shapes,
@@ -526,14 +541,16 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
           if not original_shape.is_compatible_with(new_shape):
             raise ValueError(
                 "Dataset with output shapes %r cannot be restructured to have "
-                "incompatible output shapes %r" % (dataset.output_shapes,
+                "incompatible output shapes %r" % (input_shapes,
                                                    output_shapes))
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+
+    input_classes = dataset_ops.get_legacy_output_classes(dataset)
     if output_classes is None:
       # Inherit class types from the original `dataset`.
       output_classes = nest.pack_sequence_as(
-          output_types, nest.flatten(dataset.output_classes))
+          output_types, nest.flatten(input_classes))
 
     self._structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
@@ -549,12 +566,15 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
-               drop_remainder):
+               drop_remainder, use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
 
     self._map_func = dataset_ops.StructuredFunctionWrapper(
-        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
+        map_func,
+        "tf.data.experimental.map_and_batch()",
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -589,6 +609,67 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
     return self._structure
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.map_and_batch()")
+@tf_export(v1=["data.experimental.map_and_batch_with_legacy_function"])
+def map_and_batch_with_legacy_function(map_func,
+                                       batch_size,
+                                       num_parallel_batches=None,
+                                       drop_remainder=False,
+                                       num_parallel_calls=None):
+  """Fused implementation of `map` and `batch`.
+
+  NOTE: This is an escape hatch for existing uses of `map_and_batch` that do not
+  work with V2 functions. New uses are strongly discouraged and existing uses
+  should migrate to `map_and_batch` as this method will not be removed in V2.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors to another
+      nested structure of tensors.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+      representing the number of batches to create in parallel. On one hand,
+      higher values can help mitigate the effect of stragglers. On the other
+      hand, higher values can increase contention if CPU is scarce.
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in case its size is smaller than
+      desired; the default behavior is not to drop the smaller batch.
+    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
+      specified.
+  """
+
+  if num_parallel_batches is None and num_parallel_calls is None:
+    num_parallel_calls = batch_size
+  elif num_parallel_batches is not None and num_parallel_calls is None:
+    num_parallel_calls = batch_size * num_parallel_batches
+  elif num_parallel_batches is not None and num_parallel_calls is not None:
+    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
+                     "arguments are mutually exclusive.")
+
+  def _apply_fn(dataset):
+    return _MapAndBatchDataset(dataset, map_func, batch_size,
+                               num_parallel_calls, drop_remainder,
+                               use_legacy_function=True)
+
+  return _apply_fn
+
+
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by "
+    "`tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data "
+    "optimizations will take care of using the fused implementation.")
 @tf_export("data.experimental.map_and_batch")
 def map_and_batch(map_func,
                   batch_size,
@@ -645,3 +726,41 @@ def map_and_batch(map_func,
                                num_parallel_calls, drop_remainder)
 
   return _apply_fn
+
+
+class _RebatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that divides the batch size by `num_workers`."""
+
+  def __init__(self, input_dataset, num_workers):
+    self._input_dataset = input_dataset
+
+    def recalculate_output_shapes(output_shapes):
+      """Recalculates the output_shapes after dividing it by num_workers."""
+      if len(output_shapes) < 1:
+        raise ValueError("Input shape should have at least one dimension.")
+      if (tensor_shape.dimension_value(output_shapes[0]) and
+          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
+        raise errors.InvalidArgumentError(
+            None, None,
+            "First dim of input shape: %d is not divisible by num_workers: %d" %
+            (output_shapes[0], num_workers))
+      output_dims = [d for d in output_shapes.dims]
+      output_dims[0] = output_dims[0] // num_workers
+      return tensor_shape.TensorShape(output_dims)
+
+    input_types = dataset_ops.get_legacy_output_types(self._input_dataset)
+    input_shapes = dataset_ops.get_legacy_output_shapes(self._input_dataset)
+    input_classes = dataset_ops.get_legacy_output_classes(self._input_dataset)
+    output_shapes = nest.map_structure(recalculate_output_shapes, input_shapes)
+
+    self._structure = structure.convert_legacy_structure(
+        input_types, output_shapes, input_classes)
+    variant_tensor = ged_ops.experimental_rebatch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_workers=num_workers,
+        **dataset_ops.flat_structure(self))
+    super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/python/data/experimental/ops/distribute.py b/tensorflow/python/data/experimental/ops/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c0934e03be649e74baf2cc1aa2501970f76749
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/distribute.py
@@ -0,0 +1,56 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution Strategy-related dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+
+
+class _AutoShardDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that shards the `Dataset` automatically.
+
+  This dataset takes in an existing dataset and tries to automatically figure
+  out how to shard the dataset in a multi-worker scenario. Currently, it uses
+  Grappler to walk up the dataset graph until it finds a reader dataset (e.g.
+  CSVDataset, TFRecordDataset), then inserts a ShardDataset op before that node
+  so that each worker only sees some files.
+
+  Args:
+    num_workers: Total number of workers to shard this dataset across.
+    index: The current worker index (out of the total number of workers) this
+      dataset is for.
+
+  Raises:
+    NotFoundError: If we cannot find a suitable reader dataset to begin
+      automatically sharding the dataset.
+  """
+
+  def __init__(self, input_dataset, num_workers, index):
+    self._input_dataset = input_dataset
+
+    self._structure = input_dataset._element_structure  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_auto_shard_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_workers=num_workers,
+        index=index,
+        **dataset_ops.flat_structure(self))
+    super(_AutoShardDataset, self).__init__(input_dataset, variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
deleted file mode 100644
index 91d3dca3e9a883cf5eeacb368bbbf1af4420f3a1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Naive shard dataset transformation."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export("data.experimental.filter_for_shard")
-def filter_for_shard(num_shards, shard_index):
-  """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-  This dataset operator is very useful when running distributed training, as
-  it allows each worker to read a unique subset.
-
-  When reading a single input file, you can skip elements as follows:
-
-  ```python
-  d = tf.data.TFRecordDataset(FLAGS.input_file)
-  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
-                                               FLAGS.worker_index))
-  d = d.repeat(FLAGS.num_epochs)
-  d = d.shuffle(FLAGS.shuffle_buffer_size)
-  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-  ```
-
-  Important caveats:
-
-  - Be sure to shard before you use any randomizing operator (such as
-    shuffle).
-  - Generally it is best if the shard operator is used early in the dataset
-    pipeline. For example, when reading from a set of TFRecord files, shard
-    before converting the dataset to input samples. This avoids reading every
-    file on every worker. The following is an example of an efficient
-    sharding strategy within a complete pipeline:
-
-  ```python
-  d = Dataset.list_files(FLAGS.pattern)
-  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
-                                               FLAGS.worker_index))
-  d = d.repeat(FLAGS.num_epochs)
-  d = d.shuffle(FLAGS.shuffle_buffer_size)
-  d = d.interleave(tf.data.TFRecordDataset,
-                   cycle_length=FLAGS.num_readers, block_length=1)
-  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-  ```
-
-  Args:
-    num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-      shards operating in parallel.
-    shard_index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-
-  Raises:
-    ValueError: if `num_shards` or `shard_index` are illegal values. Note: error
-      checking is done on a best-effort basis, and errors aren't guaranteed to
-      be caught upon dataset creation. (e.g. providing in a placeholder tensor
-      bypasses the early checking, and will instead result in an error during
-      a session.run call.)
-  """
-  num_shards = ops.convert_to_tensor(
-      num_shards, name="num_shards", dtype=dtypes.int64)
-  num_shards_static = tensor_util.constant_value(num_shards)
-  shard_index = ops.convert_to_tensor(shard_index, name="shard_index",
-                                      dtype=dtypes.int64)
-  shard_index_static = tensor_util.constant_value(shard_index)
-
-  if num_shards_static is not None and num_shards_static < 1:
-    raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
-  if shard_index_static is not None and shard_index_static < 0:
-    raise ValueError("shard_index must be >= 0; got: %s" % shard_index_static)
-  if (shard_index_static is not None and num_shards_static is not None and
-      shard_index_static >= num_shards_static):
-    raise ValueError("shard_index must be < num_shards; %s is not < %s" %
-                     (shard_index_static, num_shards_static))
-
-  def filter_fn(elem_index, _):
-    mod_result = math_ops.mod(elem_index, num_shards)
-    return math_ops.equal(mod_result, shard_index)
-
-  def _apply_fn(dataset):
-    # pylint: disable=protected-access
-    return dataset._enumerate().filter(filter_fn).map(lambda _, elem: elem)
-
-  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 4e83acf6bbadc065adae1a6fe3da81bc6ff19d0e..a4a324e5a1b4317c714a81bd95b1c74e38f9a829 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -229,9 +229,9 @@ def bucket_by_sequence_length(element_length_func,
                                             dtype=dtypes.int64)
           bucket_boundary = boundaries[bucket_id]
           none_filler = bucket_boundary - 1
-      shapes = make_padded_shapes(
-          padded_shapes or grouped_dataset.output_shapes,
-          none_filler=none_filler)
+      input_shapes = dataset_ops.get_legacy_output_shapes(grouped_dataset)
+      shapes = make_padded_shapes(padded_shapes or input_shapes,
+                                  none_filler=none_filler)
       return grouped_dataset.padded_batch(
           batch_size, shapes, padding_values, drop_remainder=drop_remainder)
 
@@ -276,6 +276,7 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
           "`key_func` must return a single tf.int64 tensor. "
           "Got type=%s and shape=%s"
           % (self._key_func.output_types, self._key_func.output_shapes))
+
   def _make_init_func(self, init_func):
     """Make wrapping defun for init_func."""
     self._init_func = dataset_ops.StructuredFunctionWrapper(
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index f4b7123df119dddd65ea07b0c3afab8ad05d202c..0480ac41a884c3636a86dcb15a4cdb74727df1a9 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -24,14 +24,19 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import gen_stateless_random_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, "
+    "num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy "
+    "execution is desired, use `tf.data.Options.experimental_determinstic`.")
 @tf_export("data.experimental.parallel_interleave")
 def parallel_interleave(map_func,
                         cycle_length,
@@ -97,22 +102,25 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     self._selector_input = selector_input
     self._data_inputs = list(data_inputs)
 
+    first_output_types = dataset_ops.get_legacy_output_types(data_inputs[0])
+    first_output_classes = dataset_ops.get_legacy_output_classes(data_inputs[0])
+
     for data_input in data_inputs[1:]:
-      if (data_input.output_types != data_inputs[0].output_types or
-          data_input.output_classes != data_inputs[0].output_classes):
+      if (dataset_ops.get_legacy_output_types(data_input) != first_output_types
+          or dataset_ops.get_legacy_output_classes(data_input)
+          != first_output_classes):
         raise TypeError("All datasets must have the same type and class.")
 
-    output_shapes = self._data_inputs[0].output_shapes
+    output_shapes = dataset_ops.get_legacy_output_shapes(self._data_inputs[0])
     for data_input in self._data_inputs[1:]:
       output_shapes = nest.pack_sequence_as(output_shapes, [
           ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
               nest.flatten(output_shapes),
-              nest.flatten(data_input.output_shapes))
+              nest.flatten(dataset_ops.get_legacy_output_shapes(data_input)))
       ])
 
     self._structure = structure.convert_legacy_structure(
-        data_inputs[0].output_types, output_shapes,
-        data_inputs[0].output_classes)
+        first_output_types, output_shapes, first_output_classes)
     super(_DirectedInterleaveDataset, self).__init__()
 
   def _as_variant_tensor(self):
@@ -259,10 +267,8 @@ def choose_from_datasets_v2(datasets, choice_dataset):
     TypeError: If the `datasets` or `choice_dataset` arguments have the wrong
       type.
   """
-  if not (choice_dataset.output_types == dtypes.int64
-          and choice_dataset.output_shapes.is_compatible_with(
-              tensor_shape.scalar())
-          and choice_dataset.output_classes == ops.Tensor):
+  if not dataset_ops.get_structure(choice_dataset).is_compatible_with(
+      structure.TensorStructure(dtypes.int64, [])):
     raise TypeError("`choice_dataset` must be a dataset of scalar "
                     "`tf.int64` tensors.")
   return _DirectedInterleaveDataset(choice_dataset, datasets)
diff --git a/tensorflow/python/data/experimental/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py
index 5d729d392ac5ec9745cbfdd269bc536a74f3e865..429ef60db4ce9540a62c8bd7cd2b2cefcbf0c1c8 100644
--- a/tensorflow/python/data/experimental/ops/map_defun.py
+++ b/tensorflow/python/data/experimental/ops/map_defun.py
@@ -23,7 +23,11 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 
 
-def map_defun(fn, elems, output_dtypes, output_shapes):
+def map_defun(fn,
+              elems,
+              output_dtypes,
+              output_shapes,
+              max_intra_op_parallelism=1):
   """Map a function on the list of tensors unpacked from `elems` on dimension 0.
 
   Args:
@@ -35,8 +39,10 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
     elems: A list of tensors.
     output_dtypes: A list of dtypes corresponding to the output types of the
       function.
-    output_shapes: A list of `TensorShape`s corresponding to the output
-      shapes from each invocation of the function on slices of inputs.
+    output_shapes: A list of `TensorShape`s corresponding to the output shapes
+      from each invocation of the function on slices of inputs.
+    max_intra_op_parallelism: An integer. If positive, sets the max parallelism
+      limit of each function call to this.
 
   Raises:
     ValueError: if any of the inputs are malformed.
@@ -58,4 +64,5 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
   elems = [ops.convert_to_tensor(e) for e in elems]
   output_shapes = [tensor_shape.TensorShape(s) for s in output_shapes]
   return gen_dataset_ops.map_defun(elems, concrete_fn.captured_inputs,
-                                   output_dtypes, output_shapes, concrete_fn)
+                                   output_dtypes, output_shapes, concrete_fn,
+                                   max_intra_op_parallelism)
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index 75769b899890f3e41ec4ff452077a2e43ea1284e..feb25383ae47f2a93c38281a6bb820427ef2962c 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -18,12 +18,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
-
 # A constant that can be used to enable auto-tuning.
 AUTOTUNE = -1
 tf_export("data.experimental.AUTOTUNE").export_constant(__name__, "AUTOTUNE")
@@ -162,10 +162,12 @@ class _ChooseFastestDataset(dataset_ops.DatasetV2):
       A `Dataset` that has the same elements the inputs.
     """
     self._datasets = list(datasets)
+    self._structure = self._datasets[0]._element_structure  # pylint: disable=protected-access
     variant_tensor = (
         gen_experimental_dataset_ops.experimental_choose_fastest_dataset(
             [dataset._variant_tensor for dataset in self._datasets],  # pylint: disable=protected-access
-            num_experiments=num_experiments))
+            num_experiments=num_experiments,
+            **dataset_ops.flat_structure(self)))
     super(_ChooseFastestDataset, self).__init__(variant_tensor)
 
   def _inputs(self):
@@ -174,3 +176,117 @@ class _ChooseFastestDataset(dataset_ops.DatasetV2):
   @property
   def _element_structure(self):
     return self._datasets[0]._element_structure  # pylint: disable=protected-access
+
+
+class _ChooseFastestBranchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that merges two input datasets."""
+
+  def __init__(self,
+               input_dataset,
+               functions,
+               ratio_numerator=1,
+               ratio_denominator=1,
+               num_elements_per_branch=None):
+    """Chooses the fastest of some dataset functions.
+
+    Given dataset functions that take input_dataset as input and output
+    another dataset, produces elements as quickly as the fastest of these
+    output datasets. Note that datasets in the dataset functions are assumed
+    to be stateless, and the iterators created by the functions' output datasets
+    will, given the same input elements, all produce the same output elements.
+    Datasets in the functions are also expected to iterate over the input
+    dataset at most once. The violation of these conditions may lead to
+    undefined behavior.
+
+    For example:
+    ```python
+    dataset = tf.data.Dataset.range(100)
+    dataset = _ChooseFastestDataset(
+        dataset,
+        [
+            lambda ds: ds.map(lambda x: tf.reshape(x, [1])).batch(10),
+            lambda ds: ds.batch(10).map(lambda x: tf.reshape(x, [10, 1]))
+        ],
+        ratio=10,
+        num_elements_per_branch=10
+    )
+    ```
+    The resulting dataset will produce elements equivalent to
+    `tf.data.Dataset.range(100).map(lambda x: tf.reshape(x, [1])).batch(10)`, or
+    `tf.data.Dataset.range(100).batch(10).map(lambda x: tf.reshape(x, [10, 1]))`
+
+    Note that the first `num_elements_per_branch` iterations may be slower due
+    to the
+    overhead of dynamically picking the fastest dataset. Namely, for these
+    iterations, the dataset will produce elements from any of branches to
+    determine which input is the fastest. For all subsequent iterations, that
+    input will be used.
+
+    Args:
+      input_dataset: A `Dataset` that can be used as input to `functions`.
+      functions: A list of callables, each of which takes a `Dataset` as input
+        and returns a `Dataset`.
+      ratio_numerator: The numerator in the ratio of input elements consumed to
+        output elements produced for each function. This should be the same for
+        all functions. For example, if the function is
+        `lambda ds: ds.batch(10)`, the ratio is 10:1, i.e. the input dataset
+          must produce 10 elements for every element of the output dataset. In
+          this case, ratio_numerator should be 10.
+      ratio_denominator: The denominator in the ratio of input elements consumed
+        to output elements produced for each function. This should be the same
+        for all functions. For example, if the function is
+        `lambda ds: ds.batch(10)`, the ratio is 10:1, i.e. the input dataset
+          must produce 10 elements for every element of the output dataset. In
+          this case, ratio_denominator should be 1.
+      num_elements_per_branch: The number of elements to get from each branch
+        before deciding which dataset is fastest. In the first len(functions) *
+        num_elements_per_branch iterations, the dataset will call from one of
+        the branches, and update its knowledge of which input is the fastest.
+        Note that (num_elements_per_branch * ratio) is expected to be an
+        integer.
+
+    Returns:
+      A `Dataset` that has the same elements the inputs.
+    """
+    nested_structure = structure_lib.NestedStructure(
+        dataset_ops.DatasetStructure(
+            structure_lib.convert_legacy_structure(
+                input_dataset.output_types, input_dataset.output_shapes,
+                input_dataset.output_classes)))
+    self._funcs = [
+        dataset_ops.StructuredFunctionWrapper(
+            f, "ChooseFastestV2", input_structure=nested_structure)
+        for f in functions
+    ]
+    self._structure = self._funcs[0].output_structure._element_structure  # pylint: disable=protected-access
+
+    self._captured_arguments = []
+    for f in self._funcs:
+      self._captured_arguments.extend(f.function.captured_inputs)
+    self._capture_lengths = [
+        len(f.function.captured_inputs) for f in self._funcs
+    ]
+
+    if ratio_numerator <= 0 or ratio_denominator <= 0:
+      raise ValueError("ratio must be positive.")
+
+    if num_elements_per_branch is None:
+      # Pick a sensible default based on `ratio_denominator`
+      num_elements_per_branch = 10 * ratio_denominator
+
+    variant_tensor = (
+        gen_experimental_dataset_ops.choose_fastest_branch_dataset(
+            input_dataset._variant_tensor,  # pylint: disable=protected-access
+            ratio_numerator=ratio_numerator,
+            ratio_denominator=ratio_denominator,
+            other_arguments=self._captured_arguments,
+            num_elements_per_branch=num_elements_per_branch,
+            branches=[f.function for f in self._funcs],
+            other_arguments_lengths=self._capture_lengths,
+            **dataset_ops.flat_structure(self)))
+    super(_ChooseFastestBranchDataset, self).__init__(input_dataset,
+                                                      variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index be1fb4c7cacdbfffa43fa801e2f30d9e1d16ade9..e11fa884409e0c55b4239ace3d30731f124ab805 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -44,6 +44,22 @@ class OptimizationOptions(options.OptionsBase):
       "Whether to apply default static optimizations. If False, only static "
       "optimizations that have been explicitly enabled will be applied.")
 
+  autotune = options.create_option(
+      name="autotune",
+      ty=bool,
+      docstring=
+      "Whether to automatically tune performance knobs. If None, defaults to "
+      "True.")
+
+  autotune_cpu_budget = options.create_option(
+      name="autotune_cpu_budget",
+      ty=int,
+      docstring=
+      "When autotuning is enabled (through `autotune`), determines the CPU "
+      "budget to use. Values greater than the number of schedulable CPU cores "
+      "are allowed but may result in CPU contention. If None, defaults to the "
+      "number of schedulable CPU cores.")
+
   filter_fusion = options.create_option(
       name="filter_fusion",
       ty=bool,
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index a5ca96e89b5eb10160d59fd3e36489488d986422..f6cf2cea2b945f99b40c0ca60f26a2f5f85780aa 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -57,14 +57,12 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
     self._dense_defaults = dense_defaults_vec
     self._dense_shapes = dense_shapes
     self._dense_types = dense_types
-    dense_output_shapes = [
-        self._input_dataset.output_shapes.concatenate(shape)
-        for shape in dense_shape_as_shape
-    ]
-    sparse_output_shapes = [
-        self._input_dataset.output_shapes.concatenate([None])
-        for _ in range(len(sparse_keys))
-    ]
+    input_dataset_shape = dataset_ops.get_legacy_output_shapes(
+        self._input_dataset)
+    dense_output_shapes = [input_dataset_shape.concatenate(shape)
+                           for shape in dense_shape_as_shape]
+    sparse_output_shapes = [input_dataset_shape.concatenate([None])
+                            for _ in range(len(sparse_keys))]
 
     output_shapes = dict(
         zip(self._dense_keys + self._sparse_keys,
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index ef9db2f2d06c5a01b02ef7bd8cbd6d25e58be94d..af351c6b6b933a5168c6c7d829bbea103fb0e983 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -70,8 +70,8 @@ def copy_to_device(target_device, source_device="/cpu:0"):
 
   def _apply_fn(dataset):
     options = dataset_ops.Options()
-    options.experimental_autotune = False
     options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = False
     return _CopyToDeviceDataset(
         dataset, target_device=target_device,
         source_device=source_device).with_options(options)
@@ -141,13 +141,17 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
       """
       with ops.device(self._source_device_string):
         iterator = iterator_ops.Iterator.from_string_handle(
-            string_handle, self.output_types, self.output_shapes,
-            self.output_classes)
+            string_handle,
+            dataset_ops.get_legacy_output_types(self),
+            dataset_ops.get_legacy_output_shapes(self),
+            dataset_ops.get_legacy_output_classes(self))
       return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    @function.defun_with_attributes(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        attributes={"experimental_ints_on_device": True})
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index 177886e64beabfd404864ffe75371d742a8d1385..a631fa61a91b84193117baf15baa266fe1fda812 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -24,6 +24,7 @@ import functools
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import parsing_ops
@@ -328,6 +329,7 @@ def make_csv_dataset_v2(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):
   """Reads CSV files into a dataset.
 
@@ -402,6 +404,10 @@ def make_csv_dataset_v2(
       the files. Defaults to 100.
     compression_type: (Optional.) A `tf.string` scalar evaluating to one of
       `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.
+    ignore_errors: (Optional.) If `True`, ignores errors with CSV file parsing,
+      such as malformed data or empty lines, and moves on to the next valid
+      CSV record. Otherwise, the dataset raises an error and stops processing
+      when encountering any invalid records. Defaults to `False`.
 
   Returns:
     A dataset, where each element is a (features, labels) tuple that corresponds
@@ -457,7 +463,7 @@ def make_csv_dataset_v2(
     raise ValueError("`label_name` provided must be one of the columns.")
 
   def filename_to_dataset(filename):
-    return CsvDataset(
+    dataset = CsvDataset(
         filename,
         record_defaults=column_defaults,
         field_delim=field_delim,
@@ -465,8 +471,11 @@ def make_csv_dataset_v2(
         na_value=na_value,
         select_cols=select_columns,
         header=header,
-        compression_type=compression_type,
+        compression_type=compression_type
     )
+    if ignore_errors:
+      dataset = dataset.apply(error_ops.ignore_errors())
+    return dataset
 
   def map_fn(*columns):
     """Organizes columns into a features dictionary.
@@ -528,13 +537,14 @@ def make_csv_dataset_v1(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):  # pylint: disable=missing-docstring
   return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
       file_pattern, batch_size, column_names, column_defaults, label_name,
       select_columns, field_delim, use_quote_delim, na_value, header,
       num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
       prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
-      compression_type))
+      compression_type, ignore_errors))
 make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
 
 
@@ -571,6 +581,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
     ```
 
     We can construct a CsvDataset from it as follows:
+
     ```python
     tf.enable_eager_execution()
 
@@ -585,6 +596,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
     ```
 
     The expected output of its iterations is:
+
     ```python
     for element in dataset:
       print(element)
@@ -823,7 +835,8 @@ def make_batched_features_dataset_v2(file_pattern,
           sloppy=sloppy_ordering))
 
   # Extract values if the `Example` tensors are stored as key-value tuples.
-  if dataset.output_types == (dtypes.string, dtypes.string):
+  if dataset_ops.get_legacy_output_types(dataset) == (
+      dtypes.string, dtypes.string):
     dataset = dataset_ops.MapDataset(
         dataset, lambda _, v: v, use_inter_op_parallelism=False)
 
@@ -951,7 +964,7 @@ class SqlDatasetV2(dataset_ops.DatasetSource):
             lambda dtype: structure.TensorStructure(dtype, []), output_types))
     variant_tensor = gen_experimental_dataset_ops.experimental_sql_dataset(
         self._driver_name, self._data_source_name, self._query,
-        nest.flatten(self.output_types), nest.flatten(self.output_shapes))
+        **dataset_ops.flat_structure(self))
     super(SqlDatasetV2, self).__init__(variant_tensor)
 
   @property
diff --git a/tensorflow/python/data/experimental/ops/resampling.py b/tensorflow/python/data/experimental/ops/resampling.py
index 3a3040ae9a4b072ae5c1a2dc218863246b6310e6..6676085ae593bf98d7c7c3cc9bd7fdbdb1db90ff 100644
--- a/tensorflow/python/data/experimental/ops/resampling.py
+++ b/tensorflow/python/data/experimental/ops/resampling.py
@@ -168,8 +168,7 @@ def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
 def _estimate_initial_dist_ds(
     target_dist_t, class_values_ds, dist_estimation_batch_size=32,
     smoothing_constant=10):
-  num_classes = (target_dist_t.shape[0].value or
-                 array_ops.shape(target_dist_t)[0])
+  num_classes = (target_dist_t.shape[0] or array_ops.shape(target_dist_t)[0])
   initial_examples_per_class_seen = array_ops.fill(
       [num_classes], np.int64(smoothing_constant))
 
@@ -207,7 +206,7 @@ def _estimate_data_distribution(c, num_examples_per_class_seen):
       `[num_classes]`.
     dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
   """
-  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  num_classes = num_examples_per_class_seen.get_shape()[0]
   # Update the class-count based on what labels are seen in batch.
   num_examples_per_class_seen = math_ops.add(
       num_examples_per_class_seen, math_ops.reduce_sum(
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index 86a615d52400afca84b4c2537044f2adb35b574d..98f682e01af0388ef62564d01796f22411d830d5 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -50,6 +51,11 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
                                                    variant_tensor)
 
 
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by "
+    "`tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take "
+    "care of using the fused implementation.")
 @tf_export("data.experimental.shuffle_and_repeat")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
   """Shuffles and repeats a Dataset returning a new permutation for each epoch.
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index 3e4c66be27018d25d4877d26ac565b4500633d0d..0c6e68648115d94566598ac838628c77cd20865c 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -44,7 +44,7 @@ class StatsAggregator(object):
   dataset = ...
 
   # Apply `StatsOptions` to associate `dataset` with `aggregator`.
-  options = dataset_ops.Options()
+  options = tf.data.Options()
   options.experimental_stats.aggregator = aggregator
   dataset = dataset.with_options(options)
   ```
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index 13dcb92fa0643c0f89110307f2c13cb6e8425a56..dff4286dabffcac8f3274d8e45e07f9a7a284fe3 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -48,8 +48,7 @@ def set_stats_aggregator(stats_aggregator, prefix="", counter_prefix=""):
   return _apply_fn
 
 
-# TODO(b/38416882): Properly export in the `tf.data.experimental` API when
-# stable or make private / remove.
+@tf_export("data.experimental.bytes_produced_stats")
 def bytes_produced_stats(tag):
   """Records the number of bytes produced by each element of the input dataset.
 
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
index dd26cfa4ee9fe19153a99fb3c732546d777ba12f..cefc21d6bdf13f87e933364aa63a7363f90ff812 100644
--- a/tensorflow/python/data/experimental/ops/unique.py
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -54,8 +54,8 @@ class _UniqueDataset(dataset_ops.UnaryUnchangedStructureDataset):
   def __init__(self, input_dataset):
     """See `unique()` for details."""
     self._input_dataset = input_dataset
-    if input_dataset.output_types not in (dtypes.int32, dtypes.int64,
-                                          dtypes.string):
+    if dataset_ops.get_legacy_output_types(input_dataset) not in (
+        dtypes.int32, dtypes.int64, dtypes.string):
       raise TypeError(
           "`tf.data.experimental.unique()` only supports inputs with a single "
           "`tf.int32`, `tf.int64`, or `tf.string` component.")
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index 49eae14652377ed652e5bb71b57f38244ef25749..de1c636263e183661f0cbe4dd01130c3922817e4 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -50,11 +50,12 @@ class TFRecordWriter(object):
     """
     if not isinstance(dataset, dataset_ops.DatasetV2):
       raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-    if (dataset.output_types != dtypes.string or
-        dataset.output_shapes != tensor_shape.scalar()):
+    if not dataset_ops.get_structure(dataset).is_compatible_with(
+        structure.TensorStructure(dtypes.string, [])):
       raise TypeError(
           "`dataset` must produce scalar `DT_STRING` tensors whereas it "
-          "produces shape {0} and types {1}".format(dataset.output_shapes,
-                                                    dataset.output_types))
+          "produces shape {0} and types {1}".format(
+              dataset_ops.get_legacy_output_shapes(dataset),
+              dataset_ops.get_legacy_output_types(dataset)))
     return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
         dataset._variant_tensor, self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 8206ce382cef7ccfd9cd36c478c9df69e26a64ec..b56049f32dab6e2edea361ae2b9a55df3a1ce103 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -108,8 +108,26 @@ tf_py_test(
     size = "small",
     srcs = ["filter_test.py"],
     additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "filter_with_legacy_function_test",
+    size = "small",
+    srcs = ["filter_with_legacy_function_test.py"],
+    additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "filter_test_base",
+    srcs = ["filter_test_base.py"],
+    deps = [
         ":test_base",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -118,6 +136,7 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -272,7 +291,7 @@ tf_py_test(
         ":test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -287,6 +306,7 @@ tf_py_test(
     size = "small",
     srcs = ["iterator_cluster_test.py"],
     additional_deps = [
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -324,7 +344,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -350,6 +370,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -422,6 +443,23 @@ cuda_py_test(
         "no_oss",  # TODO(b/117920141): Investigate breakage and re-enable.
         "no_windows_gpu",
     ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "memory_cleanup_test",
+    size = "medium",
+    srcs = ["memory_cleanup_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -443,6 +481,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:tensor_shape",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
index 2551250346745b6030d11e4af12ffd8e30ef6021..30fdd4b39feff472778422cc94fd75284a81b9ae 100644
--- a/tensorflow/python/data/kernel_tests/batch_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -70,7 +70,8 @@ class BatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     else:
       dim0 = None
     self.assertEqual(
-        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)],
+        [ts.as_list() for ts in nest.flatten(
+            dataset_ops.get_legacy_output_shapes(dataset))],
         [[dim0] + list(c.shape[1:]) for c in components])
 
     num_full_batches = (count * 7) // batch_size
@@ -116,7 +117,7 @@ class BatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=array_ops.expand_dims(
               math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
+          values=array_ops.fill([math_ops.cast(i, dtypes.int32)], i),
           dense_shape=[i])
 
     dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5)
diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py
index 4806101d8c7e3dcaaf3d698727d863b3bcccc3ed..aa1fd6e5aea2d0a3194f128ba1f3f83739507099 100644
--- a/tensorflow/python/data/kernel_tests/cache_test.py
+++ b/tensorflow/python/data/kernel_tests/cache_test.py
@@ -59,7 +59,7 @@ class FileCacheTest(test_base.DatasetTestBase):
 
     self.assertEqual(
         tuple([c.shape[1:] for c in components]),
-        dataset_fn().output_shapes)
+        dataset_ops.get_legacy_output_shapes(dataset_fn()))
 
     get_next = self.getNext(dataset_fn())
 
diff --git a/tensorflow/python/data/kernel_tests/concatenate_test.py b/tensorflow/python/data/kernel_tests/concatenate_test.py
index 5d8bfdc8f3afc2aed265f3907c22ff442ba590c4..384fd289f1614a69d0b90e8f4322a9183abbb426 100644
--- a/tensorflow/python/data/kernel_tests/concatenate_test.py
+++ b/tensorflow/python/data/kernel_tests/concatenate_test.py
@@ -45,8 +45,10 @@ class ConcatenateTest(test_base.DatasetTestBase):
     dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
         to_concatenate_components)
     concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
-        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
+    self.assertEqual(
+        dataset_ops.get_legacy_output_shapes(concatenated),
+        (tensor_shape.TensorShape([20]), tensor_shape.TensorShape([15]),
+         tensor_shape.TensorShape([])))
 
     get_next = self.getNext(concatenated)
 
@@ -76,7 +78,9 @@ class ConcatenateTest(test_base.DatasetTestBase):
     concatenated = input_dataset.concatenate(dataset_to_concatenate)
     self.assertEqual(
         [ts.as_list()
-         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
+         for ts in nest.flatten(
+             dataset_ops.get_legacy_output_shapes(concatenated))],
+        [[20], [None]])
     get_next = self.getNext(concatenated)
     for i in range(9):
       result = self.evaluate(get_next())
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index f319b24bee87d127cda11f84c75fa295a1cb67c3..1e764b3e25205d4fb369e8fb76f8908a76ed4c02 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -92,15 +92,16 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("TFRecord", lambda: readers.TFRecordDataset(""), 1),
   )
   def testDatasetSimpleSourceInputs(self, dataset_fn, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset_fn()._inputs()))
+    self.assertLen(dataset_fn()._inputs(), num_inputs)
 
+  @test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
   def testDatasetComplexSourceInputs(self):
     dataset_fn = dataset_ops.Dataset.from_sparse_tensor_slices(
         sparse_tensor.SparseTensor(
             indices=np.array([[0, 0], [1, 0], [2, 0]]),
             values=np.array([0, 0, 0]),
             dense_shape=np.array([3, 1])))
-    self.assertEqual(0, len(dataset_fn._inputs()))
+    self.assertEmpty(dataset_fn._inputs())
 
   @parameterized.named_parameters(
       ("Batch",
@@ -266,27 +267,24 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           round_trip_dataset, [self.evaluate(tf_value_fn())],
           requires_initialization=True)
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorOneShot(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
         dataset = dataset.batch(2)
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorOneShotSimple(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
       with test.mock.patch.object(logging, "warning") as mock_log:
-        _ = dataset.make_one_shot_iterator()
+        _ = dataset_ops.make_one_shot_iterator(dataset)
         self.assertRegexpMatches(
             str(mock_log.call_args), "Please ensure that all datasets in the "
             "pipeline are created in the same graph as the iterator.")
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
   def testSkipEagerSameGraphErrorInitializable(self):
     dataset = dataset_ops.Dataset.range(10)
     with ops.Graph().as_default():
diff --git a/tensorflow/python/data/kernel_tests/filter_test.py b/tensorflow/python/data/kernel_tests/filter_test.py
index afaf954cbc6a96984239cb22665bbe1f17d6d40d..b81e9a892dfbb0baded27cbfb36ec94a0101d78f 100644
--- a/tensorflow/python/data/kernel_tests/filter_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_test.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,111 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.data.kernel_tests import filter_test_base
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FilterTest(test_base.DatasetTestBase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    def do_test(count, modulus):
-      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
-          _map_fn).repeat(count).filter(
-              lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-      self.assertEqual([c.shape[1:] for c in components],
-                       [shape for shape in dataset.output_shapes])
-      get_next = self.getNext(dataset)
-      for _ in range(count):
-        for i in [x for x in range(7) if x**2 % modulus == 0]:
-          result = self.evaluate(get_next())
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next())
-
-    do_test(14, 2)
-    do_test(4, 18)
-
-    # Test an empty dataset.
-    do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(4).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
-
-  def testFilterDict(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
-            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
-                lambda d: d["foo"] + d["bar"])
-    self.assertDatasetProduces(
-        dataset,
-        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        [[1, 2, 3], [4, 5, 6]]).filter(_predicate)
-    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    dataset = dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-        lambda x, i: x)
-    self.assertDatasetProduces(
-        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
-
-  def testShortCircuit(self):
-    dataset = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(10),
-         dataset_ops.Dataset.from_tensors(True).repeat(None)
-        )).filter(lambda x, y: y)
-    self.assertDatasetProduces(
-        dataset, expected_output=[(i, True) for i in range(10)])
+class FilterTest(filter_test_base.FilterTestBase):
 
-  def testParallelFilters(self):
-    dataset = dataset_ops.Dataset.range(10).filter(
-        lambda x: math_ops.equal(x % 2, 0))
-    next_elements = [self.getNext(dataset) for _ in range(10)]
-    self.assertEqual([0 for _ in range(10)],
-                     self.evaluate(
-                         [next_element() for next_element in next_elements]))
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter(predicate)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/filter_test_base.py b/tensorflow/python/data/kernel_tests/filter_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e5d285f2c716e865dffce88f756b3ca82d8945
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_test_base.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+
+
+class FilterTestBase(test_base.DatasetTestBase):
+  """Base class for FilterDataset tests."""
+
+  def apply_filter(self, input_dataset, predicate):
+    raise NotImplementedError("FilterTestBase._apply_filter")
+
+  def testFilterDataset(self):
+    components = (
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    )
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def do_test(count, modulus):  # pylint: disable=missing-docstring
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+          _map_fn).repeat(count)
+      # pylint: disable=g-long-lambda
+      dataset = self.apply_filter(
+          dataset, lambda x, _y, _z: math_ops.equal(
+              math_ops.mod(x, modulus), 0))
+      # pylint: enable=g-long-lambda
+      self.assertEqual(
+          [c.shape[1:] for c in components],
+          [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
+      get_next = self.getNext(dataset)
+      for _ in range(count):
+        for i in [x for x in range(7) if x**2 % modulus == 0]:
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    do_test(14, 2)
+    do_test(4, 18)
+
+    # Test an empty dataset.
+    do_test(0, 1)
+
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(4)
+    dataset = self.apply_filter(
+        dataset, lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
+
+  def testFilterDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2})
+    dataset = self.apply_filter(
+        dataset, lambda d: math_ops.equal(d["bar"] % 2, 0))
+    dataset = dataset.map(lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
+
+  def testUseStepContainerInFilter(self):
+    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+
+    # Define a predicate that returns true for the first element of
+    # the sequence and not the second, and uses `tf.map_fn()`.
+    def _predicate(xs):
+      squared_xs = map_fn.map_fn(lambda x: x * x, xs)
+      summed = math_ops.reduce_sum(squared_xs)
+      return math_ops.equal(summed, 1 + 4 + 9)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [[1, 2, 3], [4, 5, 6]])
+    dataset = self.apply_filter(dataset, _predicate)
+    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    dataset = self.apply_filter(dataset, _filter_fn)
+    dataset = dataset.map(lambda x, i: x)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
+
+  def testShortCircuit(self):
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10),
+         dataset_ops.Dataset.from_tensors(True).repeat(None)
+        ))
+    dataset = self.apply_filter(dataset, lambda x, y: y)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, True) for i in range(10)])
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10)
+    dataset = self.apply_filter(dataset, lambda x: math_ops.equal(x % 2, 0))
+    next_elements = [self.getNext(dataset) for _ in range(10)]
+    self.assertEqual([0 for _ in range(10)],
+                     self.evaluate(
+                         [next_element() for next_element in next_elements]))
diff --git a/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a283fb3302318ca526c0d43f8b025749b52c2fc
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
@@ -0,0 +1,33 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter_with_legacy_function()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import filter_test_base
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_v1_only
+class FilterWithLegacyFunctionTest(filter_test_base.FilterTestBase):
+
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter_with_legacy_function(predicate)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
index ff52821b10740196286c30d19b0cda3b4b44bae5..69b5fd0d77fe743c02f441f1d65ae0bc9d731dae 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -65,11 +65,11 @@ class FlatMapTest(test_base.DatasetTestBase):
     repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
     components = np.array(repeats, dtype=np.int64)
     iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
+        dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+                lambda x: dataset_ops.Dataset.from_tensor_slices(x).flat_map(
+                    lambda y: dataset_ops.Dataset.from_tensors(y).repeat(y))),
+            shared_name="shared_flat_map_iterator"))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
index 546c2fb2ed3f7584001e9aa2dbeb93ac82ca7709..2ce9c9a061c63b6acea899aef0518e516befb388 100644
--- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -29,10 +29,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-# NOTE: deprecated method in V2, no eager coverage added.
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
index 72db6387718712b97442eb3f7ddc3befcbbf6a12..ef46f8eef74b0f88fb657815ee6435eded0d1d76 100644
--- a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -43,8 +43,9 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components)
     get_next = self.getNext(dataset)
 
-    self.assertEqual([c.shape[1:] for c in components],
-                     [shape for shape in dataset.output_shapes])
+    self.assertEqual(
+        [c.shape[1:] for c in components],
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
 
     for i in range(4):
       results = self.evaluate(get_next())
@@ -68,7 +69,7 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
 
     self.assertEqual(
         [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
-        [shape for shape in dataset.output_shapes])
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
 
     expected = [
         (sparse_tensor.SparseTensorValue(
@@ -117,7 +118,7 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
     self.assertEqual([
         tensor_shape.TensorShape(c.dense_shape[1:])
         if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
-    ], [shape for shape in dataset.output_shapes])
+    ], [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
 
     expected = [
         (sparse_tensor.SparseTensorValue(
@@ -161,10 +162,12 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components)
     get_next = self.getNext(dataset)
 
-    self.assertEqual(dtypes.int32, dataset.output_types["foo"])
-    self.assertEqual(dtypes.float32, dataset.output_types["bar"])
-    self.assertEqual((), dataset.output_shapes["foo"])
-    self.assertEqual((1,), dataset.output_shapes["bar"])
+    self.assertEqual(dtypes.int32,
+                     dataset_ops.get_legacy_output_types(dataset)["foo"])
+    self.assertEqual(dtypes.float32,
+                     dataset_ops.get_legacy_output_types(dataset)["bar"])
+    self.assertEqual((), dataset_ops.get_legacy_output_shapes(dataset)["foo"])
+    self.assertEqual((1,), dataset_ops.get_legacy_output_shapes(dataset)["bar"])
 
     for i in range(3):
       results = self.evaluate(get_next())
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
index 82ccdebc7ff7adec439791f205c30e3011afa996..e9f1084e0429f6e84182b9495a394a3714d9e8bf 100644
--- a/tensorflow/python/data/kernel_tests/from_tensors_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -45,8 +45,9 @@ class FromTensorsTest(test_base.DatasetTestBase):
 
     dataset = dataset_ops.Dataset.from_tensors(components)
 
-    self.assertEqual([c.shape for c in components],
-                     nest.flatten(dataset.output_shapes))
+    self.assertEqual(
+        [c.shape for c in components],
+        nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)))
 
     self.assertDatasetProduces(dataset, expected_output=[components])
 
@@ -65,7 +66,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
 
     self.assertEqual(
         [tensor_shape.TensorShape(c.dense_shape) for c in components],
-        [shape for shape in dataset.output_shapes])
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     self.assertDatasetProduces(dataset, expected_output=[components])
 
   def testFromTensorsMixed(self):
@@ -84,7 +85,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
     self.assertEqual([
         tensor_shape.TensorShape(c.dense_shape)
         if sparse_tensor.is_sparse(c) else c.shape for c in components
-    ], [shape for shape in dataset.output_shapes])
+    ], [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
 
     self.assertDatasetProduces(dataset, expected_output=[components])
 
@@ -95,51 +96,67 @@ class FromTensorsTest(test_base.DatasetTestBase):
                   np.array([8, 9, 10], dtype=np.int64))
 
     dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+    self.assertEqual((dtypes.int64, (dtypes.float64, dtypes.float64),
+                      dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([3], ([2], [2]), [3]),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.shuffle(10, 10)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+    self.assertEqual((dtypes.int64, (dtypes.float64, dtypes.float64),
+                      dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([3], ([2], [2]), [3]),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.repeat(-1)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+    self.assertEqual((dtypes.int64, (dtypes.float64, dtypes.float64),
+                      dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([3], ([2], [2]), [3]),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.filter(lambda x, y, z: True)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+    self.assertEqual((dtypes.int64, (dtypes.float64, dtypes.float64),
+                      dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([3], ([2], [2]), [3]),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.take(5)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+    self.assertEqual((dtypes.int64, (dtypes.float64, dtypes.float64),
+                      dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([3], ([2], [2]), [3]),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+    self.assertEqual(((dtypes.int64, dtypes.int64),
+                      (dtypes.float64, dtypes.float64)),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual((([3], [3]), ([2], [2])),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.flat_map(
         lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
                                                        (y[0], y[1])))
     )
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+    self.assertEqual(((dtypes.int64, dtypes.int64),
+                      (dtypes.float64, dtypes.float64)),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual((([3], [3]), ([2], [2])),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.batch(32)
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
-                      nest.pack_sequence_as(dataset.output_shapes, [
-                          s.as_list()
-                          for s in nest.flatten(dataset.output_shapes)
-                      ]))
+    self.assertEqual(((dtypes.int64, dtypes.int64),
+                      (dtypes.float64, dtypes.float64)),
+                     dataset_ops.get_legacy_output_types(dataset))
+    dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+    self.assertEqual((([None, 3], [None, 3]), ([None, 2], [None, 2])),
+                     nest.pack_sequence_as(dataset_output_shapes, [
+                         s.as_list()
+                         for s in nest.flatten(dataset_output_shapes)
+                     ]))
 
     # Define a separate set of components with matching leading
     # dimension for the from-slices constructor.
@@ -148,10 +165,11 @@ class FromTensorsTest(test_base.DatasetTestBase):
                              np.array([10, 11, 12], dtype=np.int64))
 
     dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
-    self.assertEquals((dtypes.int64,
-                       (dtypes.float64, dtypes.float64), dtypes.int64),
-                      dataset.output_types)
-    self.assertEquals(([], ([], []), []), dataset.output_shapes)
+    self.assertEqual((dtypes.int64,
+                      (dtypes.float64, dtypes.float64), dtypes.int64),
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual(([], ([], []), []),
+                     dataset_ops.get_legacy_output_shapes(dataset))
 
   # TODO(b/117581999): more specific shapes in eager mode.
   @test_util.run_deprecated_v1
@@ -169,60 +187,70 @@ class FromTensorsTest(test_base.DatasetTestBase):
 
     get_next = self.getNext(dataset)
     (w, x), (y, z) = get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
+    self.assertEqual(dtypes.int64, w.dtype)
+    self.assertEqual(dtypes.int64, x.dtype)
+    self.assertEqual(dtypes.float64, y.dtype)
+    self.assertEqual(dtypes.float64, z.dtype)
+    self.assertEqual([None, 3], w.shape.as_list())
+    self.assertEqual([None, 3], x.shape.as_list())
+    self.assertEqual([None, 2], y.shape.as_list())
+    self.assertEqual([None, 2], z.shape.as_list())
 
     get_next = self.getNext(dataset)
     (w, x), (y, z) = get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
+    self.assertEqual(dtypes.int64, w.dtype)
+    self.assertEqual(dtypes.int64, x.dtype)
+    self.assertEqual(dtypes.float64, y.dtype)
+    self.assertEqual(dtypes.float64, z.dtype)
+    self.assertEqual([None, 3], w.shape.as_list())
+    self.assertEqual([None, 3], x.shape.as_list())
+    self.assertEqual([None, 2], y.shape.as_list())
+    self.assertEqual([None, 2], z.shape.as_list())
 
   def testNestedDict(self):
     components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
     dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
-    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
-    self.assertEquals(dtypes.int32, dataset.output_types["b"])
-    self.assertEquals([], dataset.output_shapes["a"]["aa"])
-    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
-    self.assertEquals([3], dataset.output_shapes["b"])
+    self.assertEqual(dtypes.int32,
+                     dataset_ops.get_legacy_output_types(dataset)["a"]["aa"])
+    self.assertEqual(dtypes.float32,
+                     dataset_ops.get_legacy_output_types(dataset)["a"]["ab"])
+    self.assertEqual(dtypes.int32,
+                     dataset_ops.get_legacy_output_types(dataset)["b"])
+    self.assertEqual([],
+                     dataset_ops.get_legacy_output_shapes(dataset)["a"]["aa"])
+    self.assertEqual([2],
+                     dataset_ops.get_legacy_output_shapes(dataset)["a"]["ab"])
+    self.assertEqual([3],
+                     dataset_ops.get_legacy_output_shapes(dataset)["b"])
 
   def testNonSequenceNestedStructure(self):
     components = np.array([1, 2, 3], dtype=np.int64)
 
     dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
+    self.assertEqual(dtypes.int64,
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.filter(
         lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
+    self.assertEqual(dtypes.int64,
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.map(lambda x: array_ops.stack([x, x]))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([2, 3], dataset.output_shapes)
+    self.assertEqual(dtypes.int64,
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual([2, 3], dataset_ops.get_legacy_output_shapes(dataset))
 
     dataset = dataset.flat_map(
         lambda x: dataset_ops.Dataset.from_tensor_slices(x))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
+    self.assertEqual(dtypes.int64,
+                     dataset_ops.get_legacy_output_types(dataset))
+    self.assertEqual([3], dataset_ops.get_legacy_output_shapes(dataset))
 
     get_next = self.getNext(dataset)
-    self.assertEquals(dtypes.int64, get_next().dtype)
-    self.assertEquals([3], get_next().shape)
+    self.assertEqual(dtypes.int64, get_next().dtype)
+    self.assertEqual([3], get_next().shape)
 
   # TODO(b/121264236): needs mechanism for multiple device in eager mode.
   def testSkipEagerSplitPipelineFailsWithPlacementError(self):
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
index 91b356691b75eb337ad61643646ba717e4929ab9..dfb54b50ad6b2dd8f242fba09218d6eae871a49c 100644
--- a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -43,7 +43,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], get_next())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], get_next())
@@ -73,7 +73,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset_2)
     get_next_3 = iterator_3.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_3.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], get_next_1())
     self.assertAllEqual(0, get_next_3())
@@ -96,7 +96,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual(0, get_next())
     self.assertAllEqual(1, get_next())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -115,7 +115,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     iterator = iter(dataset) if context.executing_eagerly(
     ) else dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     for i in range(5):
       checkpoint.restore(
           checkpoint_management.latest_checkpoint(
diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
index 20088234953b1cdc8f85381ded45cf22aa93c75a..ef198869e4ef9bd75877beca2ca1ffd31c1e96df 100644
--- a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib import lookup as lookup_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
@@ -31,7 +32,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -53,7 +53,8 @@ class IteratorClusterTest(test.TestCase):
 
     with ops.device("/job:worker/replica:0/task:0/cpu:0"):
       remote_it = iterator_ops.Iterator.from_string_handle(
-          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
+          iterator_3_handle, dataset_ops.get_legacy_output_types(dataset_3),
+          dataset_ops.get_legacy_output_shapes(dataset_3))
       get_next_op = remote_it.get_next()
 
     with session.Session(worker[0].target) as sess:
@@ -69,7 +70,8 @@ class IteratorClusterTest(test.TestCase):
     @function.Defun(dtypes.string)
     def _remote_fn(h):
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
+          h, dataset_ops.get_legacy_output_types(dataset_3),
+          dataset_ops.get_legacy_output_shapes(dataset_3))
       return remote_iterator.get_next()
 
     with ops.device(device0):
diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
index 916cf8bb45ce7dbf55261d3f67ca17c0cdbb10fd..f5a3645dd2916af9eb4fad7006675cc5b65a9e42 100644
--- a/tensorflow/python/data/kernel_tests/iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -55,7 +55,7 @@ from tensorflow.python.util import compat
 
 class IteratorTest(test.TestCase, parameterized.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testNoGradients(self):
     component = constant_op.constant([1.])
     side = constant_op.constant(0.)
@@ -66,7 +66,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     self.assertIsNone(gradients_impl.gradients(value, side)[0])
     self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testCapturingStateInOneShotRaisesException(self):
     var = variables.Variable(37.0, name="myvar")
     dataset = (
@@ -77,7 +77,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         "datasets that capture stateful objects.+myvar"):
       dataset_ops.make_one_shot_iterator(dataset)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testOneShotIterator(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -103,7 +103,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testOneShotIteratorCaptureByValue(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -166,7 +166,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testOneShotIteratorNonBlocking(self):
     dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
@@ -205,7 +205,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                        len([None for r in results if r is None]))
       self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testOneShotIteratorInitializerFails(self):
     # Define a dataset whose initialization will always fail.
     dataset = dataset_ops.Dataset.from_tensors(
@@ -286,7 +286,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testNotInitializedError(self):
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
     iterator = dataset_ops.make_initializable_iterator(
@@ -298,7 +298,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                                    "iterator has not been initialized"):
         sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testReinitializableIterator(self):
     dataset_3 = dataset_ops.Dataset.from_tensors(
         constant_op.constant([1, 2, 3]))
@@ -313,7 +313,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     self.assertEqual(dataset_3.output_types, iterator.output_types)
     self.assertEqual(dataset_4.output_types, iterator.output_types)
-    self.assertEqual([None], iterator.output_shapes.as_list())
+    self.assertEqual(
+        [None], dataset_ops.get_legacy_output_shapes(iterator).as_list())
 
     with self.cached_session() as sess:
       # The iterator is initially uninitialized.
@@ -338,7 +339,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testReinitializableIteratorWithFunctions(self):
 
     def g():
@@ -398,7 +399,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
               (constant_op.constant([1, 2, 3], dtype=dtypes.int64),
                constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float64))))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testIteratorStringHandle(self):
     dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
     dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
@@ -408,12 +409,14 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
     feedable_iterator = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
+        handle_placeholder, dataset_ops.get_legacy_output_types(dataset_3),
+        dataset_ops.get_legacy_output_shapes(dataset_3))
     next_element = feedable_iterator.get_next()
 
-    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-    self.assertEqual([], feedable_iterator.output_shapes)
+    self.assertTrue(dataset_ops.get_structure(dataset_3).is_compatible_with(
+        dataset_ops.get_structure(feedable_iterator)))
+    self.assertTrue(dataset_ops.get_structure(dataset_4).is_compatible_with(
+        dataset_ops.get_structure(feedable_iterator)))
 
     with self.cached_session() as sess:
       iterator_3_handle = sess.run(iterator_3.string_handle())
@@ -454,7 +457,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         sess.run(
             next_element, feed_dict={handle_placeholder: iterator_4_handle})
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testIteratorStringHandleFuture(self):
     with forward_compat.forward_compatibility_horizon(2018, 8, 4):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
@@ -465,12 +468,14 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
       handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
       feedable_iterator = iterator_ops.Iterator.from_string_handle(
-          handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
+          handle_placeholder, dataset_ops.get_legacy_output_types(dataset_3),
+          dataset_ops.get_legacy_output_shapes(dataset_3))
       next_element = feedable_iterator.get_next()
 
-      self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-      self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-      self.assertEqual([], feedable_iterator.output_shapes)
+      self.assertTrue(dataset_ops.get_structure(dataset_3).is_compatible_with(
+          dataset_ops.get_structure(feedable_iterator)))
+      self.assertTrue(dataset_ops.get_structure(dataset_4).is_compatible_with(
+          dataset_ops.get_structure(feedable_iterator)))
 
       with self.cached_session() as sess:
         iterator_3_handle = sess.run(iterator_3.string_handle())
@@ -518,7 +523,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
           sess.run(
               next_element, feed_dict={handle_placeholder: iterator_4_handle})
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testIteratorStringHandleReuseTensorObject(self):
     dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
     one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
@@ -547,7 +552,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     self.assertEqual("foo_1", handle_with_same_name.op.name)
     self.assertIsNot(handle_with_name, handle_with_same_name)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testIteratorStringHandleError(self):
     dataset_int_scalar = (
         dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).repeat())
@@ -588,7 +593,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
             feedable_int_vector.get_next(),
             feed_dict={handle_placeholder: handle_float_vector}))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 3
@@ -601,7 +606,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     @function.Defun(dtypes.string)
     def _remote_fn(h):
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
+          h, dataset_ops.get_legacy_output_types(dataset_3),
+          dataset_ops.get_legacy_output_shapes(dataset_3))
       return remote_iterator.get_next()
 
     with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
@@ -645,7 +651,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                 target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
             })
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testRemoteIteratorUsingRemoteCallOpMultiWorkers(self):
     s1 = server_lib.Server.create_local_server()
     s2 = server_lib.Server.create_local_server()
@@ -677,7 +683,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     @function.Defun(dtypes.string)
     def loading_func(h):
       remote_itr = iterator_ops.Iterator.from_string_handle(
-          h, itr.output_types, itr.output_shapes)
+          h, dataset_ops.get_legacy_output_types(itr),
+          dataset_ops.get_legacy_output_shapes(itr))
       return remote_itr.get_next()
 
     def map_fn(target, handle):
@@ -698,6 +705,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(n)
 
+  @test_util.deprecated_graph_mode_only
   def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -714,7 +722,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _remote_fn(h):
       handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, dataset_3.output_types, dataset_3.output_shapes)
+          handle, dataset_ops.get_legacy_output_types(dataset_3),
+          dataset_ops.get_legacy_output_shapes(dataset_3))
       return remote_iterator.get_next()
 
     with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
@@ -753,7 +762,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                 target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
             })
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testIncorrectIteratorRestore(self):
 
     def _path():
@@ -812,7 +821,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testRepeatedGetNextWarning(self):
     iterator = dataset_ops.make_one_shot_iterator(dataset_ops.Dataset.range(10))
     warnings.simplefilter("always")
@@ -865,9 +874,12 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     self.assertTrue(iterator._element_structure.is_compatible_with(
         expected_element_structure))
 
-    self.assertEqual(expected_output_classes, iterator.output_classes)
-    self.assertEqual(expected_output_types, iterator.output_types)
-    self.assertEqual(expected_output_shapes, iterator.output_shapes)
+    self.assertEqual(expected_output_classes,
+                     dataset_ops.get_legacy_output_classes(iterator))
+    self.assertEqual(expected_output_types,
+                     dataset_ops.get_legacy_output_types(iterator))
+    self.assertEqual(expected_output_shapes,
+                     dataset_ops.get_legacy_output_shapes(iterator))
 
   def testIteratorGetNextName(self):
     with ops.Graph().as_default():
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index 97ab6b21bc27283d0e3630690b9d7cbf20b09b47..e68c0e008e833e8362f68eee58200f7c60561380 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -25,6 +25,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
@@ -39,8 +40,8 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import script_ops
@@ -83,6 +84,7 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
   return dataset, coordination_events
 
 
+# TODO(jsimsa): Add tests for `map_with_legacy_function`.
 @test_util.run_all_in_graph_and_eager_modes
 class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
 
@@ -93,8 +95,9 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
         _map_fn).repeat(count)
-    self.assertEqual([c.shape[1:] for c in components],
-                     [shape for shape in dataset.output_shapes])
+    self.assertEqual(
+        [c.shape[1:] for c in components],
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     return dataset
 
   def testMapDataset(self):
@@ -159,8 +162,9 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
         _map_fn, num_parallel_calls=num_parallel_calls).prefetch(
             output_buffer_size).repeat(count)
 
-    self.assertEqual([c.shape[1:] for c in components],
-                     [shape for shape in dataset.output_shapes])
+    self.assertEqual(
+        [c.shape[1:] for c in components],
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     return dataset
 
   def testParallelMapDataset(self):
@@ -312,8 +316,8 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
       if context.executing_eagerly():
         captured_iterator = iter(dataset_ops.Dataset.range(10))
       else:
-        captured_iterator = dataset_ops.Dataset.range(
-            10).make_initializable_iterator()
+        captured_iterator = dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.range(10))
       ds = _build_ds(captured_iterator)
       return captured_iterator, ds
 
@@ -350,6 +354,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  @test_util.run_v1_only("b/123904513")
   def testCaptureQueue(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
@@ -391,36 +396,6 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
-  def testCaptureVariable(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
-        10).map(lambda _: counter_var.assign_add(1))
-    get_next = self.getNext(dataset, requires_initialization=True)
-
-    self.evaluate(counter_var.initializer)
-
-    for i in range(10):
-      self.assertEqual(i, self.evaluate(counter_var))
-      self.assertEqual(i + 1, self.evaluate(get_next()))
-    self.assertEqual(10, self.evaluate(counter_var))
-    with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next())
-    self.assertEqual(10, self.evaluate(counter_var))
-
-  # TODO(b/117581999): error not captured for eager mode, debug.
-  @test_util.run_v1_only("b/120545219")
-  def testSkipEagerCaptureUninitializedVariableError(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
-        10).map(lambda _: counter_var.assign_add(1))
-
-    get_next = self.getNext(dataset, requires_initialization=True)
-
-    with self.assertRaises(errors.NotFoundError):
-      self.evaluate(get_next())
-
   def testSeededStatefulOperatorIsProperlyStateful(self):
     dataset = dataset_ops.Dataset.from_tensors(0).repeat(
         10).map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
@@ -522,7 +497,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testUseStepContainerInMap(self):
     row = np.arange(6)
     dataset = dataset_ops.Dataset.from_tensors(
-        row).map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
+        row).map(lambda elems: map_fn.map_fn(lambda x: x * x, elems))
     self.assertDatasetProduces(dataset, expected_output=[row**2])
 
   def testCaseAndCondInMap(self):
@@ -586,7 +561,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     def build_dataset(row, num):
       # pylint: disable=g-long-lambda
       dataset = dataset_ops.Dataset.from_tensors(
-          row).map(lambda elems: functional_ops.map_fn(
+          row).map(lambda elems: map_fn.map_fn(
               lambda x: control_map_fn(x, num), elems))
       return self.getNext(dataset)
 
@@ -628,7 +603,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     num = 2
     # pylint: disable=g-long-lambda
     dataset = dataset_ops.Dataset.from_tensors(
-        row).map(lambda elems: functional_ops.map_fn(
+        row).map(lambda elems: map_fn.map_fn(
             lambda x: control_map_fn(x, num), elems))
     # pylint: enable=g-long-lambda
     get_next = self.getNext(dataset)
@@ -639,6 +614,13 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(get_next())
 
+  def testNestedListMapDataset(self):
+    dataset = dataset_ops.Dataset.from_tensors(
+        [0, 1, 2]).repeat(10).map(lambda a: ([a[1], a[0] + a[2]], a[1]))
+
+    expected_output = [(np.array([1, 2]), 1)] * 10
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
   def testPrefetch(self):
     # We will use this event to test that `_map_py_func()` has been
     # invoked a certain number of times (6 times, to be exact) after
@@ -746,6 +728,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
         dataset,
         expected_output=[self.evaluate(_check(_sparse(i))) for i in range(10)])
 
+  @test_util.run_v1_only("b/123904513")
   def testParallelMapOutOfRangeError(self):
     def raising_py_func(i):
       if i == 100:
@@ -769,7 +752,7 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testWarnOnLookupTable(self):
     def collecting_function(x):
       _ = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer([], []), 0.0, name="t1")
+          lookup_ops.KeyValueTensorInitializer(["a"], [1.]), 0.0, name="t1")
       return x
 
     warnings.simplefilter("always")
@@ -780,12 +763,93 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertGreaterEqual(len(w), 1)
     found_warning = False
     for warning in w:
-      if ("Creating lookup tables inside a function passed to Dataset.map() is "
+      if ("Creating resources inside a function passed to Dataset.map() is "
+          "not supported." in str(warning)):
+        found_warning = True
+        break
+    self.assertTrue(found_warning)
+
+  @test_util.run_v1_only("map_with_legacy_function v1 only")
+  def testWarnOnLookupTableLegacyFunction(self):
+
+    def collecting_function(x):
+      _ = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(["a"], [1.]), 0.0, name="t1")
+      return x
+
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      _ = dataset_ops.Dataset.range(10).map_with_legacy_function(
+          collecting_function)
+    # NOTE(mrry): Python 3 prints other warnings in addition to the one we are
+    # testing, so we search for the expected warning.
+    self.assertGreaterEqual(len(w), 1)
+    found_warning = False
+    for warning in w:
+      if ("Creating resources inside a function passed to Dataset.map() is "
           "not supported." in str(warning)):
         found_warning = True
         break
     self.assertTrue(found_warning)
 
+  def testWarnOnSeedFromOuterGraph(self):
+    with ops.Graph().as_default() as g:
+      g.seed = 10
+      warnings.simplefilter("always")
+
+      # map_fun doesn't use seed, so no warning is generated.
+      with warnings.catch_warnings(record=True) as w:
+        _ = dataset_ops.Dataset.range(10).map(math_ops.square)
+      found_warning = False
+      for warning in w:
+        if ("Explicitly set the seed in the function if this is not the "
+            "intended behavior" in str(warning)):
+          found_warning = True
+          break
+      self.assertFalse(found_warning)
+
+      def random_func(x):
+        x = math_ops.add(x, 1)
+        random_ops.random_shuffle([x, math_ops.square(x)])
+        return x
+
+      with warnings.catch_warnings(record=True) as w:
+        _ = dataset_ops.Dataset.range(10).map(random_func)
+      self.assertGreaterEqual(len(w), 1)
+      found_warning = False
+      for warning in w:
+        if ("Explicitly set the seed in the function if this is not the "
+            "intended behavior" in str(warning)):
+          found_warning = True
+          break
+      self.assertTrue(found_warning)
+
+      def random_func_seeded(x):
+        ops.get_default_graph().seed = None
+        random_ops.random_shuffle(x)
+        return x
+
+      with warnings.catch_warnings(record=True) as w:
+        _ = dataset_ops.Dataset.range(10).batch(2).map(random_func_seeded)
+      found_warning = False
+      for warning in w:
+        if ("Explicitly set the seed in the function if this is not the "
+            "intended behavior" in str(warning)):
+          found_warning = True
+          break
+      self.assertFalse(found_warning)
+
+      with warnings.catch_warnings(record=True) as w:
+        _ = dataset_ops.Dataset.range(10).batch(
+            2).map(lambda x: random_ops.random_shuffle(x, seed=37))
+      found_warning = False
+      for warning in w:
+        if ("Explicitly set the seed in the function if this is not the "
+            "intended behavior" in str(warning)):
+          found_warning = True
+          break
+      self.assertFalse(found_warning)
+
   def testNestedDatasetMap(self):
     # TODO(b/110122868): When iterators can yield a `tf.data.Dataset`, remove
     # the `get_single_element()` call.
@@ -955,6 +1019,190 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(get_next())
 
+  # NOTE: collection test is specific to graph mode only, no eager coverage.
+  @test_util.run_v1_only("graph specific test")
+  def testSkipEagerCollectionCopy(self):
+    w = variable_scope.get_variable("w", [])
+    self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+
+    def func(x):
+      self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+      return x
+
+    dataset = dataset_ops.Dataset.from_tensors(constant_op.constant(1.0))
+    dataset.map(func)
+
+# TODO(shivaniagarwal): separate out `map` and `map_with_legacy_function` tests
+# as later would not work in v2.
+@test_util.run_all_in_graph_and_eager_modes
+class MapWithCapturedVariableTests(test_base.DatasetTestBase,
+                                   parameterized.TestCase):
+
+  # TODO(b/126553094): map doesnt work with variable defined inside function in
+  # eager mode, possible Graph tensors leak out of the function building context
+  # from function graph in eager mode as variables are created in init_scope.
+  @test_util.run_v1_only("b/126553094")
+  def testSkipEagerCreateVariableInsideFunctionWithGetter(self):
+
+    def func(_):
+      with variable_scope.variable_scope(
+          "variable", reuse=variable_scope.AUTO_REUSE):
+        counter_var = variable_scope.get_variable(
+            "counter", (), dtypes.int32, use_resource=True)
+      return counter_var.assign_add(1)
+
+    # NOTE: In the legacy function, resource is captured by value for variable
+    # getter.
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+    with self.assertRaisesWithPredicateMatch(
+        AttributeError, "'Tensor' object has no attribute 'assign_add'"):
+      dataset.map_with_legacy_function(func)
+
+    dataset = dataset.map(func)
+    self.evaluate(variables.global_variables_initializer())
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(10):
+      self.assertEqual(i + 1, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @parameterized.named_parameters(
+      ("MapLegacyFunction",
+       lambda dataset, func: dataset.map_with_legacy_function(func)),
+      ("Map", lambda dataset, func: dataset.map(func)),
+  )
+  @test_util.run_v1_only("map_with_legacy_function is only available in v1.")
+  def testCaptureVariable(self, transformation_function):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+    dataset = transformation_function(
+        dataset, lambda _: counter_var.assign_add(1))
+    get_next = self.getNext(dataset, requires_initialization=True)
+
+    self.evaluate(counter_var.initializer)
+
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(counter_var))
+      self.assertEqual(i + 1, self.evaluate(get_next()))
+    self.assertEqual(10, self.evaluate(counter_var))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(10, self.evaluate(counter_var))
+
+  # NOTE: no need to explicitly initialize variables in eager mode.
+  @parameterized.named_parameters(
+      ("MapLegacyFunction",
+       lambda dataset, func: dataset.map_with_legacy_function(func)),
+      ("Map", lambda dataset, func: dataset.map(func)),
+  )
+  @test_util.run_v1_only("this test is meant to run in graph mode only.")
+  def testSkipEagerCaptureUninitializedVariableError(self,
+                                                     transformation_function):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+    dataset = transformation_function(
+        dataset, lambda _: counter_var.assign_add(1))
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+    with self.assertRaises(errors.NotFoundError):
+      self.evaluate(get_next())
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @parameterized.named_parameters(
+      ("MapLegacyFunction",
+       lambda dataset, func: dataset.map_with_legacy_function(func)),
+      ("Map", lambda dataset, func: dataset.map(func)),
+  )
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerCaptureConstantsWithConflictingDevices(
+      self, transformation_function):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.cached_session(config=config):
+      with ops.device("/device:CPU:0"):
+        a = constant_op.constant(3.0)
+      with ops.device("/device:CPU:1"):
+        b = constant_op.constant(5.0)
+
+      def func(_):
+        return math_ops.add(a, b)
+
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = transformation_function(dataset, func)
+      expected_output = [8.0] * 10
+      self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerRefVariablesWithConflictingDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.cached_session(config=config):
+
+      def func(_):
+        with ops.device("/device:CPU:0"):
+          a = variables.VariableV1(3.0)
+        with ops.device("/device:CPU:1"):
+          b = variables.VariableV1(5.0)
+        return math_ops.add(a, b)
+
+      # NOTE: Use the legacy function implementation as eager function will
+      # convert RefVariables to ResourceVariables.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = dataset.map_with_legacy_function(func)
+      self.evaluate(variables.global_variables_initializer())
+      expected_output = [8.0] * 10
+      self.assertDatasetProduces(
+          dataset,
+          expected_output=expected_output,
+          requires_initialization=True)
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerResourceVariablesWithConflictingDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+
+    def func(_):
+      with variable_scope.variable_scope(
+          "variable", reuse=variable_scope.AUTO_REUSE):
+        with ops.device("/device:CPU:0"):
+          a = variable_scope.get_variable(
+              "a", (), dtypes.int32, use_resource=True)
+          a = math_ops.add(a, 1)
+        with ops.device("/device:CPU:1"):
+          b = variable_scope.get_variable(
+              "b", (), dtypes.int32, use_resource=True)
+      return math_ops.add(a, b)
+
+    g_1 = ops.Graph()
+    with self.session(config=config, graph=g_1):
+      # The MapDataset node ends up with two ResourceVariable inputs, one on
+      # device CPU:0 and the other on device CPU:1. The placer cannot resolve
+      # this as it cannot place the MapDatasetOp on both devices.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = dataset.map(func)
+      expected_error = (
+          errors.InvalidArgumentError,
+          "Cannot place the graph because a reference or resource edge "
+          "connects colocation groups with incompatible assigned devices")
+      self.assertDatasetProduces(
+          dataset, expected_error=expected_error, requires_initialization=True)
+
+    g_2 = ops.Graph()
+    with self.session(config=config, graph=g_2):
+      # In old-Defun variable is captured as value, hence there is no colocation
+      # error.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = dataset.map_with_legacy_function(func)
+      self.evaluate(variables.global_variables_initializer())
+      expected_output = [1] * 10
+      self.assertDatasetProduces(
+          dataset,
+          expected_output=expected_output,
+          requires_initialization=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/memory_cleanup_test.py b/tensorflow/python/data/kernel_tests/memory_cleanup_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b48c1fe37c9b430759c42d801bec36a687b66525
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/memory_cleanup_test.py
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Verify that memory usage is minimal in eager mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import six
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+
+
+# memory_profiler might not be available in the OSS version of TensorFlow.
+try:
+  import memory_profiler  # pylint:disable=g-import-not-at-top
+except ImportError:
+  memory_profiler = None
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MemoryCleanupTest(test_base.DatasetTestBase):
+
+  def assertNotIncreasingMemory(self,
+                                f,
+                                num_iters=100000,
+                                increase_threshold_absolute_mb=10):
+    """Assert memory usage doesn't increase beyond given threshold for f."""
+    with context.eager_mode():
+      # Warm up.
+      f()
+      # Wait for background threads to start up and take over memory.
+      # FIXME: The nature of this test leaves few other options. Maybe there
+      # is a better way to do this.
+      time.sleep(4)
+      initial = memory_profiler.memory_usage(-1)[0]
+      for _ in six.moves.range(num_iters):
+        f()
+      increase = memory_profiler.memory_usage(-1)[0] - initial
+      logging.info("Memory increase observed: %f MB" % increase)
+      assert increase < increase_threshold_absolute_mb, (
+          "Increase is too high. Initial memory usage: %f MB. Increase: %f MB. "
+          "Maximum allowed increase: %f") % (initial, increase,
+                                             increase_threshold_absolute_mb)
+
+  @test_util.run_v1_only("b/121264236")
+  def testEagerMemoryUsageWithReset(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only eager mode test")
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+
+    def f():
+      self.evaluate(multi_device_iterator.get_next())
+      multi_device_iterator._eager_reset()
+
+    self.assertNotIncreasingMemory(
+        f, num_iters=100, increase_threshold_absolute_mb=350)
+
+  @test_util.run_v1_only("b/121264236")
+  def testEagerMemoryUsageWithRecreation(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only eager mode test")
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    dataset = dataset_ops.Dataset.range(10)
+
+    def f():
+      multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/cpu:2"])
+      self.evaluate(multi_device_iterator.get_next())
+      del multi_device_iterator
+
+    # TODO(b/123316347): Reduce threshold once bug is fixed.
+    self.assertNotIncreasingMemory(
+        f, num_iters=100, increase_threshold_absolute_mb=500)
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={"CPU": 3, "GPU": 1}))
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 0c88d7533f146c6fdf33a9dea2baba653adbd588..c379afcb160c3b6a55a247fc8dc228766ac0fc44 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -34,15 +34,12 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-# TODO(b/121264236): Once we have a mechanism to have multiple devices in eager
-# / V2 mode, we should remove this annotation and the run_v1_only annotations
-# as well.
 @test_util.run_all_in_graph_and_eager_modes
 class MultiDeviceIteratorTest(test_base.DatasetTestBase,
                               parameterized.TestCase):
 
-  @test_util.run_v1_only
   @parameterized.parameters(0, 1, 42,)
+  @test_util.run_v1_only("b/121264236")
   def testInitOnly(self, num_inits):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -53,7 +50,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
       for _ in range(num_inits):
         self.evaluate(multi_device_iterator.initializer)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -71,7 +68,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
@@ -90,7 +87,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
@@ -115,7 +112,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_3)
         self.evaluate(elem_on_4)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -135,7 +132,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptional(self):
     if context.executing_eagerly():
       return
@@ -172,7 +169,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -192,7 +189,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testMultipleInitializationsGraph(self):
     if context.executing_eagerly():
       return
@@ -214,7 +211,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.assertEqual([(i, 0), (i, 1)], self.evaluate([elem_on_1,
                                                           elem_on_2]))
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testMultipleInitializationsEager(self):
     if not context.executing_eagerly():
       return
@@ -224,13 +221,13 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
       dataset2 = dataset_ops.Dataset.range(1000)
       dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
 
-    for _ in range(1000):
+    for _ in range(5):
       multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
           dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
       elem_on_1, elem_on_2 = multi_device_iterator.get_next()
       self.assertEqual([(0, 0), (1, 1)], self.evaluate([elem_on_1, elem_on_2]))
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testBasicGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -251,7 +248,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testUnevenGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -274,7 +271,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptionalGpu(self):
     if not test_util.is_gpu_available() or context.executing_eagerly():
       self.skipTest("No GPU available")
@@ -311,7 +308,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase,
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only
+  @test_util.run_v1_only("b/121264236")
   def testOptimization(self):
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
index 2269bb8724dba40f73ceef8797206adb513a2f60..cd1015b08dca422e74a4d0d0b3d3291319f86e6c 100644
--- a/tensorflow/python/data/kernel_tests/optional_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import structure
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -303,7 +304,6 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(
           self.evaluate(tf_value), self.evaluate(round_trip_opt.get_value()))
 
-  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @parameterized.named_parameters(
       ("Tensor", np.array([1, 2, 3], dtype=np.int32),
        lambda: constant_op.constant([4, 5, 6], dtype=dtypes.int32), True),
@@ -323,42 +323,62 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
                     indices=[[0, 1], [1, 0]], values=[37.0, 42.0],
                     dense_shape=[2, 2])}, False),
   )
-  @test_util.run_deprecated_v1
-  def testSkipEagerIteratorGetNextAsOptional(self, np_value, tf_value_fn,
-                                             works_on_gpu):
+  def testIteratorGetNextAsOptional(self, np_value, tf_value_fn,
+                                    works_on_gpu):
     if not works_on_gpu and test.is_gpu_available():
       self.skipTest("Test case not yet supported on GPU.")
     ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
-    iterator = ds.make_initializable_iterator()
-    next_elem = iterator_ops.get_next_as_optional(iterator)
-    self.assertIsInstance(next_elem, optional_ops.Optional)
-    self.assertTrue(
-        next_elem.value_structure.is_compatible_with(
-            structure.Structure.from_value(tf_value_fn())))
-    elem_has_value_t = next_elem.has_value()
-    elem_value_t = next_elem.get_value()
-    with self.cached_session() as sess:
+
+    if context.executing_eagerly():
+      iterator = dataset_ops.make_one_shot_iterator(ds)
+      # For each element of the dataset, assert that the optional evaluates to
+      # the expected value.
+      for _ in range(3):
+        next_elem = iterator_ops.get_next_as_optional(iterator)
+        self.assertIsInstance(next_elem, optional_ops.Optional)
+        self.assertTrue(
+            next_elem.value_structure.is_compatible_with(
+                structure.Structure.from_value(tf_value_fn())))
+        self.assertTrue(next_elem.has_value())
+        self._assertElementValueEqual(np_value, next_elem.get_value())
+      # After exhausting the iterator, `next_elem.has_value()` will evaluate to
+      # false, and attempting to get the value will fail.
+      for _ in range(2):
+        next_elem = iterator_ops.get_next_as_optional(iterator)
+        self.assertFalse(self.evaluate(next_elem.has_value()))
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_elem.get_value())
+    else:
+      iterator = dataset_ops.make_initializable_iterator(ds)
+      next_elem = iterator_ops.get_next_as_optional(iterator)
+      self.assertIsInstance(next_elem, optional_ops.Optional)
+      self.assertTrue(
+          next_elem.value_structure.is_compatible_with(
+              structure.Structure.from_value(tf_value_fn())))
       # Before initializing the iterator, evaluating the optional fails with
-      # a FailedPreconditionError.
+      # a FailedPreconditionError. This is only relevant in graph mode.
+      elem_has_value_t = next_elem.has_value()
+      elem_value_t = next_elem.get_value()
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_has_value_t)
+        self.evaluate(elem_has_value_t)
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_value_t)
-
+        self.evaluate(elem_value_t)
+      # Now we initialize the iterator.
+      self.evaluate(iterator.initializer)
       # For each element of the dataset, assert that the optional evaluates to
       # the expected value.
-      sess.run(iterator.initializer)
       for _ in range(3):
-        elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
+        elem_has_value, elem_value = self.evaluate(
+            [elem_has_value_t, elem_value_t])
         self.assertTrue(elem_has_value)
         self._assertElementValueEqual(np_value, elem_value)
 
       # After exhausting the iterator, `next_elem.has_value()` will evaluate to
       # false, and attempting to get the value will fail.
       for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
+        self.assertFalse(self.evaluate(elem_has_value_t))
         with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(elem_value_t)
+          self.evaluate(elem_value_t)
 
   def testFunctionBoundaries(self):
     @def_function.function
diff --git a/tensorflow/python/data/kernel_tests/options_test.py b/tensorflow/python/data/kernel_tests/options_test.py
index f5bad3e7ae58885a5d013b0dc0f9dec41e0204c8..222d8c6f1a61e4231317784e5c3b19d99e697676 100644
--- a/tensorflow/python/data/kernel_tests/options_test.py
+++ b/tensorflow/python/data/kernel_tests/options_test.py
@@ -39,40 +39,40 @@ class OptionsTest(test_base.DatasetTestBase):
 
   def testOptionsTwiceSame(self):
     options = dataset_ops.Options()
-    options.experimental_autotune = True
+    options.experimental_optimization.autotune = True
     ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
         options)
     self.assertEqual(options, ds.options())
 
   def testOptionsTwiceDifferent(self):
     options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
+    options1.experimental_optimization.autotune = True
     options2 = dataset_ops.Options()
     options2.experimental_deterministic = False
     ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
         options2)
-    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_optimization.autotune)
     # Explicitly check that flag is False since assertFalse allows None
     self.assertIs(ds.options().experimental_deterministic, False)
 
   def testOptionsTwiceDifferentError(self):
     options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
+    options1.experimental_optimization.autotune = True
     options2 = dataset_ops.Options()
-    options2.experimental_autotune = False
+    options2.experimental_optimization.autotune = False
     with self.assertRaisesRegexp(ValueError,
                                  "Cannot merge incompatible values"):
       dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
 
   def testOptionsMergeOptionsFromMultipleInputs(self):
     options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
+    options1.experimental_optimization.autotune = True
     options2 = dataset_ops.Options()
     options2.experimental_deterministic = True
     ds = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(0).with_options(options1),
          dataset_ops.Dataset.range(0).with_options(options2)))
-    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_optimization.autotune)
     self.assertTrue(ds.options().experimental_deterministic)
 
   def testOptionsHaveDefaults(self):
diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py
index 042af7a6f9fb19b25fd9b01c509ed267833720f9..b54749002ccd0d4336a37db32498cea94a7309f8 100644
--- a/tensorflow/python/data/kernel_tests/padded_batch_test.py
+++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py
@@ -185,9 +185,10 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
         dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
         dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
     ]:
-      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
-      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
-      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
+      dataset_output_shapes = dataset_ops.get_legacy_output_shapes(dataset)
+      self.assertEqual([None, None], dataset_output_shapes[0].as_list())
+      self.assertEqual([None, None, None], dataset_output_shapes[1].as_list())
+      self.assertEqual([None, 37], dataset_output_shapes[2].as_list())
 
   def testPaddedBatchSparseError(self):
 
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
index 93acc1565fd34beb3b0be1eaf0408272c81effed..846d9a6cef9cd362eca269fa44824436766afa2a 100644
--- a/tensorflow/python/data/kernel_tests/reduce_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -22,12 +22,14 @@ import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -123,6 +125,71 @@ class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(((i + 1) * i) // 2, result["dense"])
       self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
 
+  def testDatasetSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10).map(increment_fn)
+
+    def reduce_fn(state, value):
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10)
+
+    def reduce_fn(state, value):
+      counter_var.assign_add(1)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1)
+
+    def reduce1_fn(state, value):
+      counter_var.assign(counter_var + 1)
+      return state + value
+
+    def reduce2_fn(state, value):
+      counter_var.assign(counter_var * 2)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce1_fn)
+      _ = dataset_fn().reduce(np.int64(0), reduce2_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/repeat_test.py b/tensorflow/python/data/kernel_tests/repeat_test.py
index 4ef2fc1bfc8fb139cb855305f4e4f2ec70221ce2..8a8537b30cf63df048c7e67fd9f708f7cd64ebe0 100644
--- a/tensorflow/python/data/kernel_tests/repeat_test.py
+++ b/tensorflow/python/data/kernel_tests/repeat_test.py
@@ -37,8 +37,9 @@ class RepeatTest(test_base.DatasetTestBase):
 
     def do_test(count):
       dataset = dataset_ops.Dataset.from_tensors(components).repeat(count)
-      self.assertEqual([c.shape for c in components],
-                       [shape for shape in dataset.output_shapes])
+      self.assertEqual(
+          [c.shape for c in components],
+          [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
       self.assertDatasetProduces(dataset, [components] * count)
 
     # Test a finite repetition.
@@ -54,8 +55,9 @@ class RepeatTest(test_base.DatasetTestBase):
     # NOTE(mrry): There's not a good way to test that the sequence
     # actually is infinite.
     dataset = dataset_ops.Dataset.from_tensors(components).repeat(-1)
-    self.assertEqual([c.shape for c in components],
-                     [shape for shape in dataset.output_shapes])
+    self.assertEqual(
+        [c.shape for c in components],
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     get_next = self.getNext(dataset)
     for _ in range(17):
       results = self.evaluate(get_next())
@@ -69,8 +71,9 @@ class RepeatTest(test_base.DatasetTestBase):
 
     dataset = dataset_ops.Dataset.from_tensors(components).repeat(
         inner_count).repeat(outer_count)
-    self.assertEqual([c.shape for c in components],
-                     [shape for shape in dataset.output_shapes])
+    self.assertEqual(
+        [c.shape for c in components],
+        [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
     self.assertDatasetProduces(dataset,
                                [components] * (inner_count * outer_count))
 
diff --git a/tensorflow/python/data/kernel_tests/shard_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
index 928550676d5b05c2e5a459af355acebe2f1f1cc4..9fc70ff60752c02ec626ee5f89606b428fc183fd 100644
--- a/tensorflow/python/data/kernel_tests/shard_test.py
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -19,11 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ShardTest(test_base.DatasetTestBase):
 
   def testSimpleCase(self):
@@ -41,20 +42,24 @@ class ShardTest(test_base.DatasetTestBase):
     self.assertDatasetProduces(dataset, expected_output=[0, 5])
 
   def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, 7)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, -3)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(-3, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(0, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testIteratorEndsBeforeFirstElem(self):
     dataset = dataset_ops.Dataset.range(1).shard(5, 2)
@@ -72,5 +77,10 @@ class ShardTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).shard(4, 3)
     self.assertDatasetProduces(dataset, expected_output=[3, 7])
 
+  def testNumShardsLargerThanDataset(self):
+    dataset = dataset_ops.Dataset.range(10).shard(20, 5)
+    self.assertDatasetProduces(dataset, expected_output=[5])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_test.py b/tensorflow/python/data/kernel_tests/shuffle_test.py
index 13df870938d1cee7b29e0189b9b1db1731bb4114..ea0eeaf686fcb7faf20dac7c9abe54202781d42b 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_test.py
@@ -51,7 +51,7 @@ class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
 
         self.assertEqual(
             tuple([c.shape[1:] for c in components]),
-            shuffle_dataset.output_shapes)
+            dataset_ops.get_legacy_output_shapes(shuffle_dataset))
         return shuffle_dataset
       else:
         return repeat_dataset
diff --git a/tensorflow/python/data/kernel_tests/skip_test.py b/tensorflow/python/data/kernel_tests/skip_test.py
index c22be576921c6d8e569ecb60c90925d004a0e5de..74dc8b7f55c715db874812268037c82820167349 100644
--- a/tensorflow/python/data/kernel_tests/skip_test.py
+++ b/tensorflow/python/data/kernel_tests/skip_test.py
@@ -33,8 +33,9 @@ class SkipTest(test_base.DatasetTestBase):
 
     def do_test(count):
       dataset = dataset_ops.Dataset.from_tensor_slices(components).skip(count)
-      self.assertEqual([c.shape[1:] for c in components],
-                       [shape for shape in dataset.output_shapes])
+      self.assertEqual(
+          [c.shape[1:] for c in components],
+          [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
       start_range = min(count, 10) if count != -1 else 10
       self.assertDatasetProduces(
           dataset,
diff --git a/tensorflow/python/data/kernel_tests/take_test.py b/tensorflow/python/data/kernel_tests/take_test.py
index 03a7ece2d8c8ea88d4504a4341ae3bb13ee2c3bf..665ed59a7bceaf1940a8b161dfd7fb6d823981e2 100644
--- a/tensorflow/python/data/kernel_tests/take_test.py
+++ b/tensorflow/python/data/kernel_tests/take_test.py
@@ -33,8 +33,9 @@ class TakeTest(test_base.DatasetTestBase):
 
     def do_test(count):
       dataset = dataset_ops.Dataset.from_tensor_slices(components).take(count)
-      self.assertEqual([c.shape[1:] for c in components],
-                       [shape for shape in dataset.output_shapes])
+      self.assertEqual(
+          [c.shape[1:] for c in components],
+          [shape for shape in dataset_ops.get_legacy_output_shapes(dataset)])
       num_output = min(count, 10) if count != -1 else 10
       self.assertDatasetProduces(
           dataset, [tuple(components[0][i:i + 1]) for i in range(num_output)])
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index 7aa7f33003cf7195f5ecde406e181b26644c8038..01315e790dc35b1ab71f5c8025c34195b6c27d78 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
@@ -32,6 +33,13 @@ from tensorflow.python.platform import test
 class DatasetTestBase(test.TestCase):
   """Base class for dataset tests."""
 
+  @classmethod
+  def setUpClass(cls):
+    if tf2.enabled():
+      dataset_ops.Dataset = dataset_ops.DatasetV2
+    else:
+      dataset_ops.Dataset = dataset_ops.DatasetV1
+
   def assertSparseValuesEqual(self, a, b):
     """Asserts that two SparseTensors/SparseTensorValues are equal."""
     self.assertAllEqual(a.indices, b.indices)
@@ -92,7 +100,8 @@ class DatasetTestBase(test.TestCase):
                             expected_error=None,
                             requires_initialization=False,
                             num_test_iterations=1,
-                            assert_items_equal=False):
+                            assert_items_equal=False,
+                            expected_error_iter=1):
     """Asserts that a dataset produces the expected output / error.
 
     Args:
@@ -114,6 +123,8 @@ class DatasetTestBase(test.TestCase):
         to 2.
       assert_items_equal: Tests expected_output has (only) the same elements
         regardless of order.
+      expected_error_iter: How many times to iterate before expecting an error,
+        if an error is expected.
     """
     self.assertTrue(
         expected_error is not None or expected_output is not None,
@@ -127,10 +138,12 @@ class DatasetTestBase(test.TestCase):
                                                expected_error[1]):
         get_next = self.getNext(
             dataset, requires_initialization=requires_initialization)
-        self.evaluate(get_next())
+        for _ in range(expected_error_iter):
+          self.evaluate(get_next())
       return
     if expected_shapes:
-      self.assertEqual(expected_shapes, dataset.output_shapes)
+      self.assertEqual(expected_shapes,
+                       dataset_ops.get_legacy_output_shapes(dataset))
     self.assertGreater(num_test_iterations, 0)
     for _ in range(num_test_iterations):
       get_next = self.getNext(
@@ -146,9 +159,12 @@ class DatasetTestBase(test.TestCase):
 
   def assertDatasetsEqual(self, dataset1, dataset2):
     """Checks that datasets are equal. Supports both graph and eager mode."""
-    self.assertEqual(dataset1.output_types, dataset2.output_types)
-    self.assertEqual(dataset1.output_classes, dataset2.output_classes)
-    flattened_types = nest.flatten(dataset1.output_types)
+    self.assertTrue(dataset_ops.get_structure(dataset1).is_compatible_with(
+        dataset_ops.get_structure(dataset2)))
+    self.assertTrue(dataset_ops.get_structure(dataset2).is_compatible_with(
+        dataset_ops.get_structure(dataset1)))
+    flattened_types = nest.flatten(
+        dataset_ops.get_legacy_output_types(dataset1))
 
     next1 = self.getNext(dataset1)
     next2 = self.getNext(dataset2)
@@ -178,6 +194,8 @@ class DatasetTestBase(test.TestCase):
                                    exception_class,
                                    replacements=None):
     """Checks that datasets raise the same error on the first get_next call."""
+    if replacements is None:
+      replacements = []
     next1 = self.getNext(dataset1)
     next2 = self.getNext(dataset2)
     try:
diff --git a/tensorflow/python/data/kernel_tests/window_test.py b/tensorflow/python/data/kernel_tests/window_test.py
index a7b4d86fcf958b1ec06781380724c6f48dcf2a24..1eeaeb5fc1d2dbdb99ca057a66893e2bea77237a 100644
--- a/tensorflow/python/data/kernel_tests/window_test.py
+++ b/tensorflow/python/data/kernel_tests/window_test.py
@@ -81,9 +81,9 @@ class WindowTest(test_base.DatasetTestBase, parameterized.TestCase):
             drop_remainder=drop_remainder).flat_map(_flat_map_fn)
     get_next = self.getNext(dataset)
 
-    self.assertEqual(
-        [[None] + list(c.shape[1:]) for c in components],
-        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+                     [ts.as_list() for ts in nest.flatten(
+                         dataset_ops.get_legacy_output_shapes(dataset))])
 
     num_full_batches = max(0,
                            (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
@@ -147,7 +147,7 @@ class WindowTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=array_ops.expand_dims(
               math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
+          values=array_ops.fill([math_ops.cast(i, dtypes.int32)], i),
           dense_shape=[i])
 
     dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
diff --git a/tensorflow/python/data/kernel_tests/zip_test.py b/tensorflow/python/data/kernel_tests/zip_test.py
index 477c9fa7da14276f5ad0b503402e24711b139832..72f739e4e4ef760f6280224dba98a38890461f59 100644
--- a/tensorflow/python/data/kernel_tests/zip_test.py
+++ b/tensorflow/python/data/kernel_tests/zip_test.py
@@ -81,7 +81,7 @@ class ZipTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
 
     self.assertEqual(
-        dataset.output_shapes,
+        dataset_ops.get_legacy_output_shapes(dataset),
         (tensor_shape.TensorShape([20]),
          (tensor_shape.TensorShape([22]), tensor_shape.TensorShape([]))))
 
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 112aa926ae5c1f6cedb967de7943dc8d1ec4048d..d018ba21708ec22b4776418bf36471e977d8a5eb 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -26,7 +26,6 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/experimental/ops:threading_options",
@@ -74,7 +73,7 @@ py_library(
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 45e732be0d79a27105aa0d6ca2880bb7340c261b..a86f74542f0bcf41889c7d6b07fbf6fccc06a7c1 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -28,7 +28,6 @@ from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 
 
 from tensorflow.python.compat import compat
-from tensorflow.python.data.experimental.ops import filter_for_shard_ops
 from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.experimental.ops import threading_options
@@ -40,6 +39,7 @@ from tensorflow.python.data.util import sparse
 from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.data.util import traverse
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -48,6 +48,7 @@ from tensorflow.python.framework import random_seed as core_random_seed
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -58,6 +59,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
@@ -86,12 +88,12 @@ class DatasetV2(object):
     Args:
       variant_tensor: A DT_VARIANT tensor that represents the dataset.
     """
-    self._dataset_variant_tensor = variant_tensor
+    self._variant_tensor_attr = variant_tensor
     self._graph_attr = ops.get_default_graph()
 
   @property
   def _variant_tensor(self):
-    return self._dataset_variant_tensor
+    return self._variant_tensor_attr
 
   @_variant_tensor.setter
   def _variant_tensor(self, _):
@@ -143,6 +145,8 @@ class DatasetV2(object):
     return any(
         [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
 
+  # TODO(jsimsa): Change this to be the transitive closure of functions used
+  # by this dataset and its inputs.
   def _functions(self):
     """Returns a list of functions associated with this dataset.
 
@@ -171,12 +175,12 @@ class DatasetV2(object):
     options = self.options()
     if options.experimental_threading is not None:
       t_options = options.experimental_threading
-      if t_options.private_threadpool_size is not None:
-        dataset = _PrivateThreadPoolDataset(dataset,
-                                            t_options.private_threadpool_size)
       if t_options.max_intra_op_parallelism is not None:
         dataset = _MaxIntraOpParallelismDataset(
             dataset, t_options.max_intra_op_parallelism)
+      if t_options.private_threadpool_size is not None:
+        dataset = _PrivateThreadPoolDataset(dataset,
+                                            t_options.private_threadpool_size)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
       if self._has_captured_ref():
@@ -189,8 +193,17 @@ class DatasetV2(object):
       else:
         dataset = _OptimizeDataset(dataset, static_optimizations)
 
-    if options.experimental_autotune is not False:
-      dataset = _ModelDataset(dataset)
+    autotune = True
+    cpu_budget = 0  # Indicates that all CPU cores should be used.
+    if options.experimental_optimization is not None:
+      if options.experimental_optimization.autotune is False:  # pylint: disable=g-bool-id-comparison
+        autotune = False
+      if options.experimental_optimization.autotune_cpu_budget is not None:
+        cpu_budget = options.experimental_optimization.autotune_cpu_budget
+
+    if autotune:
+      dataset = _ModelDataset(dataset, cpu_budget)
+
     if options.experimental_stats and options.experimental_stats.aggregator:  # pylint: disable=line-too-long
       dataset = _SetStatsAggregatorDataset(  # pylint: disable=protected-access
           dataset, options.experimental_stats.aggregator,
@@ -226,42 +239,10 @@ class DatasetV2(object):
     """
     raise NotImplementedError("Dataset._element_structure")
 
-  @property
-  def output_classes(self):
-    """Returns the class of each component of an element of this dataset.
-
-    The expected values are `tf.Tensor` and `tf.SparseTensor`.
-
-    Returns:
-      A nested structure of Python `type` objects corresponding to each
-      component of an element of this dataset.
-    """
-    return self._element_structure._to_legacy_output_classes()  # pylint: disable=protected-access
-
-  @property
-  def output_shapes(self):
-    """Returns the shape of each component of an element of this dataset.
-
-    Returns:
-      A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this dataset.
-    """
-    return self._element_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
-
-  @property
-  def output_types(self):
-    """Returns the type of each component of an element of this dataset.
-
-    Returns:
-      A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this dataset.
-    """
-    return self._element_structure._to_legacy_output_types()  # pylint: disable=protected-access
-
   def __repr__(self):
-    output_shapes = nest.map_structure(str, self.output_shapes)
+    output_shapes = nest.map_structure(str, get_legacy_output_shapes(self))
     output_shapes = str(output_shapes).replace("'", "")
-    output_types = nest.map_structure(repr, self.output_types)
+    output_types = nest.map_structure(repr, get_legacy_output_types(self))
     output_types = str(output_types).replace("'", "")
     return ("<%s shapes: %s, types: %s>" % (type(self).__name__, output_shapes,
                                             output_types))
@@ -747,6 +728,12 @@ class DatasetV2(object):
     elements. For perfect shuffling, a buffer size greater than or equal to the
     full size of the dataset is required.
 
+    For instance, if your dataset contains 10,000 elements but `buffer_size` is
+    set to 1,000, then `shuffle` will initially select a random element from
+    only the first 1,000 elements in the buffer. Once an element is selected,
+    its space in the buffer is replaced by the next (i.e. 1,001-st) element,
+    maintaining the 1,000 element buffer.
+
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
         number of elements from this dataset from which the new
@@ -805,6 +792,59 @@ class DatasetV2(object):
     """
     return SkipDataset(self, count)
 
+  def shard(self, num_shards, index):
+    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+    This dataset operator is very useful when running distributed training, as
+    it allows each worker to read a unique subset.
+
+    When reading a single input file, you can skip elements as follows:
+
+    ```python
+    d = tf.data.TFRecordDataset(input_file)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Important caveats:
+
+    - Be sure to shard before you use any randomizing operator (such as
+      shuffle).
+    - Generally it is best if the shard operator is used early in the dataset
+      pipeline. For example, when reading from a set of TFRecord files, shard
+      before converting the dataset to input samples. This avoids reading every
+      file on every worker. The following is an example of an efficient
+      sharding strategy within a complete pipeline:
+
+    ```python
+    d = Dataset.list_files(pattern)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.interleave(tf.data.TFRecordDataset,
+                     cycle_length=num_readers, block_length=1)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Args:
+      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        shards operating in parallel.
+      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+    Returns:
+      Dataset: A `Dataset`.
+
+    Raises:
+      InvalidArgumentError: if `num_shards` or `index` are illegal values.
+        Note: error checking is done on a best-effort basis, and errors aren't
+        guaranteed to be caught upon dataset creation. (e.g. providing in a
+        placeholder tensor bypasses the early checking, and will instead result
+        in an error during a session.run call.)
+    """
+    return ShardDataset(self, num_shards, index)
+
   def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
@@ -1100,6 +1140,18 @@ class DatasetV2(object):
   def filter(self, predicate):
     """Filters this dataset according to `predicate`.
 
+    ```python
+    d = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+
+    d = d.filter(lambda x: x < 3) # [1, 2]
+
+    # `tf.math.equal(x, y)` is required for equality comparison
+    def filter_fn(x):
+      return tf.math.equal(x, 1)
+
+    d = d.filter(filter_fn) # [1]
+    ```
+
     Args:
       predicate: A function mapping a nested structure of tensors (having shapes
         and types defined by `self.output_shapes` and `self.output_types`) to a
@@ -1136,7 +1188,9 @@ class DatasetV2(object):
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, DatasetV2):
-      raise TypeError("`transformation_func` must return a Dataset.")
+      raise TypeError(
+          "`transformation_func` must return a Dataset. Got {}.".format(
+              dataset))
     dataset._input_datasets = [self]  # pylint: disable=protected-access
     return dataset
 
@@ -1364,6 +1418,9 @@ class DatasetV1(DatasetV2):
     Returns:
       An `Iterator` over the elements of this dataset.
     """
+    return self._make_one_shot_iterator()
+
+  def _make_one_shot_iterator(self):  # pylint: disable=missing-docstring
     if context.executing_eagerly():
       return iterator_ops.EagerIterator(self)
 
@@ -1407,10 +1464,12 @@ class DatasetV1(DatasetV2):
       else:
         six.reraise(ValueError, err)
 
+    # pylint: disable=protected-access
     return iterator_ops.Iterator(
         gen_dataset_ops.one_shot_iterator(
             dataset_factory=_make_dataset, **flat_structure(self)),
-        None, self.output_types, self.output_shapes, self.output_classes)
+        None, get_legacy_output_types(self), get_legacy_output_shapes(self),
+        get_legacy_output_classes(self))
 
   @deprecation.deprecated(
       None, "Use `for ... in dataset:` to iterate over a dataset. If using "
@@ -1441,6 +1500,10 @@ class DatasetV1(DatasetV2):
     Raises:
       RuntimeError: If eager execution is enabled.
     """
+
+    return self._make_initializable_iterator(shared_name)
+
+  def _make_initializable_iterator(self, shared_name=None):  # pylint: disable=missing-docstring
     if context.executing_eagerly():
       raise RuntimeError(
           "dataset.make_initializable_iterator is not supported when eager "
@@ -1459,9 +1522,42 @@ class DatasetV1(DatasetV2):
       initializer = gen_dataset_ops.make_iterator(
           dataset._variant_tensor,  # pylint: disable=protected-access
           iterator_resource)
-    return iterator_ops.Iterator(iterator_resource, initializer,
-                                 dataset.output_types, dataset.output_shapes,
-                                 dataset.output_classes)
+    # pylint: disable=protected-access
+    return iterator_ops.Iterator(
+        iterator_resource, initializer, get_legacy_output_types(dataset),
+        get_legacy_output_shapes(dataset), get_legacy_output_classes(dataset))
+
+  @property
+  def output_classes(self):
+    """Returns the class of each component of an element of this dataset.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._element_structure._to_legacy_output_classes()  # pylint: disable=protected-access
+
+  @property
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this dataset.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._element_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
+
+  @property
+  def output_types(self):
+    """Returns the type of each component of an element of this dataset.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
+    """
+    return self._element_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   @property
   def _element_structure(self):
@@ -1543,60 +1639,9 @@ class DatasetV1(DatasetV2):
   def skip(self, count):
     return DatasetV1Adapter(super(DatasetV1, self).skip(count))
 
-  @deprecation.deprecated(
-      None, "Use `dataset.apply(tf.data.experimental.filter_for_shard(...))`.")
+  @functools.wraps(DatasetV2.shard)
   def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      Dataset: A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and errors aren't guaranteed
-        to be caught upon dataset creation. (e.g. providing in a placeholder
-        tensor bypasses the early checking, and will instead result in an error
-        during a session.run call.)
-    """
-    return self.apply(filter_for_shard_ops.filter_for_shard(num_shards, index))
+    return DatasetV1Adapter(super(DatasetV1, self).shard(num_shards, index))
 
   @functools.wraps(DatasetV2.batch)
   def batch(self, batch_size, drop_remainder=False):
@@ -1622,6 +1667,43 @@ class DatasetV1(DatasetV2):
           ParallelMapDataset(
               self, map_func, num_parallel_calls, preserve_cardinality=False))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.map()")
+  def map_with_legacy_function(self, map_func, num_parallel_calls=None):
+    """Maps `map_func` across the elements of this dataset.
+
+    NOTE: This is an escape hatch for existing uses of `map` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `map` as this method will be removed in V2.
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to
+        another nested structure of tensors.
+      num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
+
+    Returns:
+      Dataset: A `Dataset`.
+    """
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(
+              self,
+              map_func,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self,
+              map_func,
+              num_parallel_calls,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+
   @functools.wraps(DatasetV2.flat_map)
   def flat_map(self, map_func):
     return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
@@ -1639,6 +1721,25 @@ class DatasetV1(DatasetV2):
   def filter(self, predicate):
     return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.filter()")
+  def filter_with_legacy_function(self, predicate):
+    """Filters this dataset according to `predicate`.
+
+    NOTE: This is an escape hatch for existing uses of `filter` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `filter` as this method will be removed in V2.
+
+    Args:
+      predicate: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        scalar `tf.bool` tensor.
+
+    Returns:
+      Dataset: The `Dataset` containing the elements of this dataset for which
+          `predicate` is `True`.
+    """
+    return FilterDataset(self, predicate, use_legacy_function=True)
+
   @functools.wraps(DatasetV2.apply)
   def apply(self, transformation_func):
     return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
@@ -1725,15 +1826,15 @@ def make_one_shot_iterator(dataset):
     A `tf.data.Iterator` over the elements of this dataset.
   """
   try:
-    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # Call the defined `_make_one_shot_iterator()` if there is one, because some
     # datasets (e.g. for prefetching) override its behavior.
-    return dataset.make_one_shot_iterator()
+    return dataset._make_one_shot_iterator()  # pylint: disable=protected-access
   except AttributeError:
-    return DatasetV1Adapter(dataset).make_one_shot_iterator()
+    return DatasetV1Adapter(dataset)._make_one_shot_iterator()  # pylint: disable=protected-access
 
 
 @tf_export(v1=["data.make_initializable_iterator"])
-def make_initializable_iterator(dataset):
+def make_initializable_iterator(dataset, shared_name=None):
   """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
 
   Note: The returned iterator will be in an uninitialized state,
@@ -1741,13 +1842,16 @@ def make_initializable_iterator(dataset):
 
   ```python
   dataset = ...
-  iterator = dataset.make_initializable_iterator()
+  iterator = tf.data.make_initializable_iterator(dataset)
   # ...
   sess.run(iterator.initializer)
   ```
 
   Args:
     dataset: A `tf.data.Dataset`.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+      shared under the given name across multiple sessions that share the
+      same devices (e.g. when using a remote server).
 
   Returns:
     A `tf.data.Iterator` over the elements of `dataset`.
@@ -1756,11 +1860,91 @@ def make_initializable_iterator(dataset):
     RuntimeError: If eager execution is enabled.
   """
   try:
-    # Call the defined `make_initializable_iterator()` if there is one, because
+    # Call the defined `_make_initializable_iterator()` if there is one, because
     # some datasets (e.g. for prefetching) override its behavior.
-    return dataset.make_initializable_iterator()
+    return dataset._make_initializable_iterator(shared_name)  # pylint: disable=protected-access
+  except AttributeError:
+    return DatasetV1Adapter(dataset)._make_initializable_iterator(shared_name)  # pylint: disable=protected-access
+
+
+# TODO(b/110122868): Replace this method with a public API for reflecting on
+# dataset structure.
+def get_structure(dataset_or_iterator):
+  """Returns the `tf.data.experimental.Structure` of a `Dataset` or `Iterator`.
+
+  Args:
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
+    `EagerIterator`.
+
+  Returns:
+    A `tf.data.experimental.Structure` representing the structure of the
+    elements of `dataset_or_iterator`.
+
+  Raises:
+    TypeError: If `dataset_or_iterator` is not a dataset or iterator object.
+  """
+  try:
+    ret = dataset_or_iterator._element_structure  # pylint: disable=protected-access
+    if isinstance(ret, structure_lib.Structure):
+      return ret
   except AttributeError:
-    return DatasetV1Adapter(dataset).make_initializable_iterator()
+    pass
+  raise TypeError("`dataset_or_iterator` must be a Dataset or Iterator object, "
+                  "but got %s." % type(dataset_or_iterator))
+
+
+# TODO(b/110122868): Remove all uses of this method.
+def get_legacy_output_shapes(dataset_or_iterator):
+  """Returns the output shapes of a `Dataset` or `Iterator`.
+
+  This utility method replaces the deprecated-in-V2
+  `tf.compat.v1.Dataset.output_shapes` property.
+
+  Args:
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
+    `EagerIterator`.
+
+  Returns:
+    A nested structure of `tf.TensorShape` objects corresponding to each
+    component of an element of the given dataset or iterator.
+  """
+  return get_structure(dataset_or_iterator)._to_legacy_output_shapes()  # pylint: disable=protected-access
+
+
+# TODO(b/110122868): Remove all uses of this method.
+def get_legacy_output_types(dataset_or_iterator):
+  """Returns the output shapes of a `Dataset` or `Iterator`.
+
+  This utility method replaces the deprecated-in-V2
+  `tf.compat.v1.Dataset.output_types` property.
+
+  Args:
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
+    `EagerIterator`.
+
+  Returns:
+    A nested structure of `tf.DType` objects corresponding to each component
+    of an element of this dataset.
+  """
+  return get_structure(dataset_or_iterator)._to_legacy_output_types()  # pylint: disable=protected-access
+
+
+# TODO(b/110122868): Remove all uses of this method.
+def get_legacy_output_classes(dataset_or_iterator):
+  """Returns the output classes of a `Dataset` or `Iterator`.
+
+  This utility method replaces the deprecated-in-V2
+  `tf.compat.v1.Dataset.output_classes` property.
+
+  Args:
+    dataset_or_iterator: A `tf.data.Dataset`, `tf.data.Iterator`, or
+    `EagerIterator`.
+
+  Returns:
+    A nested structure of Python `type` or `tf.data.experimental.Structure`
+    objects corresponding to each component of an element of this dataset.
+  """
+  return get_structure(dataset_or_iterator)._to_legacy_output_classes()  # pylint: disable=protected-access
 
 
 @tf_export("data.Options")
@@ -1773,13 +1957,6 @@ class Options(options_lib.OptionsBase):
   `tf.data.Dataset.interleave`.
   """
 
-  experimental_autotune = options_lib.create_option(
-      name="experimental_autotune",
-      ty=bool,
-      docstring=
-      "Whether to dynamically adjust the values of tunable parameters (e.g. "
-      "degrees of parallelism). If None, defaults to True.")
-
   experimental_deterministic = options_lib.create_option(
       name="experimental_deterministic",
       ty=bool,
@@ -1946,7 +2123,9 @@ class SparseTensorSliceDataset(DatasetSource):
   def __init__(self, sparse_tensor):
     """See `Dataset.from_sparse_tensor_slices()` for details."""
     if not isinstance(sparse_tensor, sparse_tensor_lib.SparseTensor):
-      raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
+      raise TypeError(
+          "`sparse_tensor` must be a `tf.SparseTensor` object. Was {}.".format(
+              sparse_tensor))
     self._sparse_tensor = sparse_tensor
 
     indices_shape = self._sparse_tensor.indices.get_shape()
@@ -2047,9 +2226,9 @@ structure_lib.Structure._register_custom_converter(DatasetV2,
 
 
 class StructuredFunctionWrapper(object):
-  """A wrapper for `Defun` that supports structured arguments and return values.
-  """
+  """A function wrapper that supports structured arguments and return values."""
 
+  # pylint: disable=protected-access
   def __init__(self,
                func,
                transformation_name,
@@ -2059,6 +2238,7 @@ class StructuredFunctionWrapper(object):
                input_types=None,
                input_structure=None,
                add_to_graph=True,
+               use_legacy_function=False,
                defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
@@ -2080,9 +2260,12 @@ class StructuredFunctionWrapper(object):
         defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
+      use_legacy_function: (Optional.) A boolean that determines whether the
+        function be created using `tensorflow.python.eager.function.defun`
+        (default behavior) or `tensorflow.python.framework.function.Defun`
+        (legacy beheavior).
       defun_kwargs: (Optional.) A dictionary mapping string argument names to
-        values. If supplied, will be passed to `function.Defun()` as keyword
-        arguments.
+        values. If supplied, will be passed to `function` as keyword arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
@@ -2102,7 +2285,7 @@ class StructuredFunctionWrapper(object):
           raise ValueError("Either `dataset`, `input_structure` or all of "
                            "`input_classes`, `input_shapes`, and `input_types` "
                            "must be specified.")
-        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
+        self._input_structure = dataset._element_structure
     else:
       if not (dataset is None and input_classes is None and input_shapes is None
               and input_types is None):
@@ -2111,24 +2294,34 @@ class StructuredFunctionWrapper(object):
                          "must be specified.")
       self._input_structure = input_structure
 
-    self._transformation_name = transformation_name
+    if defun_kwargs is None:
+      defun_kwargs = {}
+
     readable_transformation_name = transformation_name.replace(
         ".", "_")[:-2] if len(transformation_name) > 2 else ""
-    self._func_name = "_".join([
-        readable_transformation_name,
-        function_utils.get_func_name(func),
-        str(ops.uid())
-    ])
 
-    if defun_kwargs is None:
-      defun_kwargs = {}
+    func_name = "_".join(
+        [readable_transformation_name,
+         function_utils.get_func_name(func)])
+
+    def _warn_if_collections(transformation_name):
+      """Prints a warning if the given graph uses common graph collections.
+
+      NOTE(mrry): Currently a warning is only generated for resources. Any
+      variables created will be automatically hoisted out to the outermost scope
+      using `init_scope()`. Some collections (such as for control-flow contexts)
+      are benign and should not generate a warning.
 
-    @function.Defun(
-        *self._input_structure._flat_types, func_name=self._func_name,  # pylint: disable=protected-access
-        **defun_kwargs)
-    def tf_data_structured_function_wrapper(*args):
+      Args:
+        transformation_name: A human-readable name for the transformation.
+      """
+      warnings.warn("Creating resources inside a function passed to %s "
+                    "is not supported. Create each resource outside the "
+                    "function, and capture it inside the function to use it." %
+                    transformation_name, stacklevel=5)
+
+    def _wrapper_helper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
-      # pylint: disable=protected-access
       nested_args = self._input_structure._from_compatible_tensor_list(args)
       if not _should_unpack_args(nested_args):
         nested_args = (nested_args,)
@@ -2152,18 +2345,68 @@ class StructuredFunctionWrapper(object):
       except (ValueError, TypeError):
         raise TypeError("Unsupported return value from function passed to "
                         "%s: %s." % (transformation_name, ret))
+      return ret
+
+    if use_legacy_function:
+      func_name = func_name + "_" + str(ops.uid())
+
+      @function.Defun(
+          *self._input_structure._flat_types,
+          func_name=func_name,
+          **defun_kwargs)
+      def wrapper_fn(*args):
+        ret = _wrapper_helper(*args)
+        # _warn_if_collections(transformation_name, ops.get_default_graph(), 0)
+        return self._output_structure._to_tensor_list(ret)
+
+      self._function = wrapper_fn
+      resource_tracker = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker):
+        if add_to_graph:
+          self._function.add_to_graph(ops.get_default_graph())
+        else:
+          # Use the private method that will execute `wrapper_fn` but delay
+          # adding it to the graph in case (e.g.) we need to rerun the function.
+          self._function._create_definition_if_needed()
+      if resource_tracker.resources:
+        _warn_if_collections(transformation_name)
 
-      _warn_if_collections(transformation_name)
-      return self._output_structure._to_tensor_list(ret)
-
-    self._function = tf_data_structured_function_wrapper
-    if add_to_graph:
-      self._function.add_to_graph(ops.get_default_graph())
     else:
-      # Use the private method that will execute
-      # `tf_data_structured_function_wrapper` but delay adding it to the graph
-      # in case (e.g.) we need to rerun the function.
-      self._function._create_definition_if_needed()  # pylint: disable=protected-access
+      defun_kwargs.update({"func_name": func_name})
+
+      # TODO(b/124254153): Enable autograph once the overhead is low enough.
+      # TODO(mdan): Make sure autograph recurses into _wrapper_helper when on.
+      @eager_function.defun_with_attributes(
+          input_signature=[
+              tensor_spec.TensorSpec(input_shape, input_type)  # pylint: disable=g-complex-comprehension
+              for input_shape, input_type in zip(
+                  self._input_structure._flat_shapes,
+                  self._input_structure._flat_types)
+          ],
+          autograph=False,
+          attributes=defun_kwargs)
+      def wrapper_fn(*args):  # pylint: disable=missing-docstring
+        ret = _wrapper_helper(*args)
+        ret = self._output_structure._to_tensor_list(ret)
+        return [ops.convert_to_tensor(t) for t in ret]
+
+      resource_tracker = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker):
+        self._function = wrapper_fn._get_concrete_function_internal()
+        if add_to_graph:
+          self._function.add_to_graph(ops.get_default_graph())
+      if resource_tracker.resources:
+        _warn_if_collections(transformation_name)
+
+      outer_graph_seed = ops.get_default_graph().seed
+      if outer_graph_seed and self._function.graph.seed == outer_graph_seed:
+        if self._function.graph._seed_used:
+          warnings.warn(
+              "Seed %s from outer graph might be getting used by function %s, "
+              "if the random op has not been provided any seed. Explicitly set "
+              "the seed in the function if this is not the intended behavior."
+              %(outer_graph_seed, func_name), stacklevel=4)
+  # pylint: enable=protected-access
 
   @property
   def output_structure(self):
@@ -2309,24 +2552,25 @@ class ConcatenateDataset(DatasetV2):
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
 
-    output_types = input_dataset.output_types
-    if output_types != dataset_to_concatenate.output_types:
+    output_types = get_legacy_output_types(input_dataset)
+    if output_types != get_legacy_output_types(dataset_to_concatenate):
       raise TypeError(
           "Two datasets to concatenate have different types %s and %s" %
-          (output_types, dataset_to_concatenate.output_types))
+          (output_types, get_legacy_output_types(dataset_to_concatenate)))
 
-    output_classes = input_dataset.output_classes
-    if output_classes != dataset_to_concatenate.output_classes:
+    output_classes = get_legacy_output_classes(input_dataset)
+    if output_classes != get_legacy_output_classes(dataset_to_concatenate):
       raise TypeError(
           "Two datasets to concatenate have different classes %s and %s" %
-          (output_classes, dataset_to_concatenate.output_classes))
+          (output_classes, get_legacy_output_classes(dataset_to_concatenate)))
 
-    input_shapes = self._input_dataset.output_shapes
+    input_shapes = get_legacy_output_shapes(self._input_dataset)
     output_shapes = nest.pack_sequence_as(input_shapes, [
         ts1.most_specific_compatible_shape(ts2)
         for (ts1, ts2) in zip(
             nest.flatten(input_shapes),
-            nest.flatten(self._dataset_to_concatenate.output_shapes))
+            nest.flatten(get_legacy_output_shapes(
+                self._dataset_to_concatenate)))
     ])
 
     self._structure = structure_lib.convert_legacy_structure(
@@ -2494,6 +2738,23 @@ class SkipDataset(UnaryUnchangedStructureDataset):
     super(SkipDataset, self).__init__(input_dataset, variant_tensor)
 
 
+class ShardDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` for sharding its input."""
+
+  def __init__(self, input_dataset, num_shards, index):
+    """See `Dataset.shard()` for details."""
+    self._input_dataset = input_dataset
+    self._num_shards = ops.convert_to_tensor(
+        num_shards, dtype=dtypes.int64, name="num_shards")
+    self._index = ops.convert_to_tensor(index, dtype=dtypes.int64, name="index")
+    variant_tensor = gen_dataset_ops.shard_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_shards=self._num_shards,
+        index=self._index,
+        **flat_structure(self))
+    super(ShardDataset, self).__init__(input_dataset, variant_tensor)
+
+
 class BatchDataset(UnaryDataset):
   """A `Dataset` that batches contiguous elements from its input."""
 
@@ -2628,11 +2889,16 @@ def _default_padding(input_dataset):
     if t.base_dtype == dtypes.string:
       return ""
     elif t.base_dtype == dtypes.variant:
-      raise TypeError("Unable to create padding for field of type 'variant'")
+      error_msg = ("Unable to create padding for field of type 'variant' "
+                   "because t.base_type == dtypes.variant == "
+                   "{}.".format(
+                       t.base_dtype))
+      raise TypeError(error_msg)
     else:
       return np.zeros_like(t.as_numpy_dtype())
 
-  return nest.map_structure(make_zero, input_dataset.output_types)
+  return nest.map_structure(
+      make_zero, get_legacy_output_types(input_dataset))
 
 
 class PaddedBatchDataset(UnaryDataset):
@@ -2642,7 +2908,7 @@ class PaddedBatchDataset(UnaryDataset):
                drop_remainder):
     """See `Dataset.batch()` for details."""
     self._input_dataset = input_dataset
-    if sparse.any_sparse(input_dataset.output_classes):
+    if sparse.any_sparse(get_legacy_output_classes(input_dataset)):
       # TODO(b/63669786): support batching of sparse tensors
       raise TypeError(
           "Batching of padded sparse tensors is not currently supported")
@@ -2653,22 +2919,22 @@ class PaddedBatchDataset(UnaryDataset):
         padding_values
         if padding_values is not None else _default_padding(input_dataset))
 
-    flat_padded_shapes = nest.flatten_up_to(input_dataset.output_shapes,
-                                            padded_shapes)
+    input_shapes = get_legacy_output_shapes(input_dataset)
+    flat_padded_shapes = nest.flatten_up_to(input_shapes, padded_shapes)
 
     flat_padded_shapes_as_tensors = []
 
     for input_component_shape, padded_shape in zip(
-        nest.flatten(input_dataset.output_shapes), flat_padded_shapes):
+        nest.flatten(input_shapes), flat_padded_shapes):
       flat_padded_shapes_as_tensors.append(
           _padded_shape_to_tensor(padded_shape, input_component_shape))
 
-    self._padded_shapes = nest.pack_sequence_as(input_dataset.output_shapes,
+    self._padded_shapes = nest.pack_sequence_as(input_shapes,
                                                 flat_padded_shapes_as_tensors)
 
     self._padding_values = nest.map_structure_up_to(
-        input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
-        input_dataset.output_types)
+        input_shapes, _padding_value_to_tensor, padding_values,
+        get_legacy_output_types(input_dataset))
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
@@ -2681,8 +2947,8 @@ class PaddedBatchDataset(UnaryDataset):
     output_shapes = nest.map_structure(
         _padded_shape_to_batch_shape, self._padded_shapes)
     self._structure = structure_lib.convert_legacy_structure(
-        self._input_dataset.output_types, output_shapes,
-        self._input_dataset.output_classes)
+        get_legacy_output_types(self._input_dataset), output_shapes,
+        get_legacy_output_classes(self._input_dataset))
 
     # pylint: disable=protected-access
     # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
@@ -2719,24 +2985,6 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
-def _warn_if_collections(transformation_name):
-  """Prints warning message if the current graph uses common graph collections.
-
-  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
-  variables created will be automatically hoisted out to the outermost scope
-  using `init_scope()`. Some collections (such as for control-flow contexts)
-  are benign and should not generate a warning.
-
-  Args:
-    transformation_name: A human-readable name for the transformation.
-  """
-  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
-    warnings.warn("Creating lookup tables inside a function passed to %s is not"
-                  " supported. Create each table outside the function, and "
-                  "capture it inside the function to use it."
-                  % transformation_name)
-
-
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
@@ -2744,13 +2992,17 @@ class MapDataset(UnaryDataset):
                input_dataset,
                map_func,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
     self._preserve_cardinality = preserve_cardinality
     self._map_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     variant_tensor = gen_dataset_ops.map_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
@@ -2779,12 +3031,16 @@ class ParallelMapDataset(UnaryDataset):
                map_func,
                num_parallel_calls,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
     self._map_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
     self._preserve_cardinality = preserve_cardinality
@@ -2818,7 +3074,9 @@ class FlatMapDataset(UnaryDataset):
     self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
     if not isinstance(self._map_func.output_structure, DatasetStructure):
-      raise TypeError("`map_func` must return a `Dataset` object.")
+      raise TypeError(
+          "`map_func` must return a `Dataset` object. Got {}".format(
+              type(self._map_func.output_structure)))
     self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
     variant_tensor = gen_dataset_ops.flat_map_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -2848,7 +3106,9 @@ class InterleaveDataset(UnaryDataset):
     self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
     if not isinstance(self._map_func.output_structure, DatasetStructure):
-      raise TypeError("`map_func` must return a `Dataset` object.")
+      raise TypeError(
+          "`map_func` must return a `Dataset` object. Got {}".format(
+              type(self._map_func.output_structure)))
     self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
@@ -2876,8 +3136,7 @@ class InterleaveDataset(UnaryDataset):
 
 
 class ParallelInterleaveDataset(UnaryDataset):
-  """A `Dataset` that maps a function over its input and interleaves the result.
-  """
+  """A `Dataset` that maps a function over its input and interleaves the result."""
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
                num_parallel_calls):
@@ -2886,7 +3145,9 @@ class ParallelInterleaveDataset(UnaryDataset):
     self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
     if not isinstance(self._map_func.output_structure, DatasetStructure):
-      raise TypeError("`map_func` must return a `Dataset` object.")
+      raise TypeError(
+          "`map_func` must return a `Dataset` object. Got {}".format(
+              type(self._map_func.output_structure)))
     self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
@@ -2919,14 +3180,20 @@ class ParallelInterleaveDataset(UnaryDataset):
 class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
-  def __init__(self, input_dataset, predicate):
+  def __init__(self, input_dataset, predicate, use_legacy_function=False):
     """See `Dataset.filter()` for details."""
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
-        predicate, self._transformation_name(), dataset=input_dataset)
+        predicate,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     if not wrapped_func.output_structure.is_compatible_with(
         structure_lib.TensorStructure(dtypes.bool, [])):
-      raise ValueError("`predicate` must return a scalar boolean tensor.")
+      error_msg = ("`predicate` return type must be convertible to a scalar "
+                   "boolean tensor. Was {}.").format(
+                       wrapped_func.output_structure)
+      raise ValueError(error_msg)
     self._predicate = wrapped_func
     variant_tensor = gen_dataset_ops.filter_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -2972,14 +3239,14 @@ class WindowDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
     nest_of_structures = nest.pack_sequence_as(
-        input_dataset.output_classes,
+        get_legacy_output_classes(input_dataset),
         [
             DatasetStructure(structure_lib.convert_legacy_structure(
                 output_type, output_shape, output_class))
             for output_class, output_shape, output_type in zip(
-                nest.flatten(input_dataset.output_classes),
-                nest.flatten(input_dataset.output_shapes),
-                nest.flatten(input_dataset.output_types))
+                nest.flatten(get_legacy_output_classes(input_dataset)),
+                nest.flatten(get_legacy_output_shapes(input_dataset)),
+                nest.flatten(get_legacy_output_types(input_dataset)))
         ])
     self._structure = structure_lib.NestedStructure(nest_of_structures)
     variant_tensor = gen_dataset_ops.window_dataset(
@@ -3016,10 +3283,11 @@ class _OptionsDataset(UnaryUnchangedStructureDataset):
 class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
-  def __init__(self, input_dataset):
+  def __init__(self, input_dataset, cpu_budget):
     self._input_dataset = input_dataset
     variant_tensor = gen_dataset_ops.model_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
+        cpu_budget=cpu_budget,
         **flat_structure(self))
     super(_ModelDataset, self).__init__(input_dataset, variant_tensor)
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d6fb73813cd06e440d69f900e6b1076606a068c0..7521d5b8175c36c3e2754dd394f84671febdb8fc 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -31,8 +31,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -68,7 +68,7 @@ def _device_stack_is_empty():
 
 
 @tf_export(v1=["data.Iterator"])
-class Iterator(checkpointable.Checkpointable):
+class Iterator(trackable.Trackable):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
@@ -333,28 +333,39 @@ class Iterator(checkpointable.Checkpointable):
         element structure.
     """
     with ops.name_scope(name, "make_initializer") as name:
-      nest.assert_same_structure(self.output_types, dataset.output_types)
-      nest.assert_same_structure(self.output_shapes, dataset.output_shapes)
+      # pylint: disable=protected-access
+      # NOTE(mrry): Cannot depend on `dataset_ops.get_legacy_output*()` due
+      # to that creating a circular dependency.
+      dataset_output_types = (
+          dataset._element_structure._to_legacy_output_types())
+      dataset_output_shapes = (
+          dataset._element_structure._to_legacy_output_shapes())
+      dataset_output_classes = (
+          dataset._element_structure._to_legacy_output_classes())
+      # pylint: enable=protected-access
+
+      nest.assert_same_structure(self.output_types, dataset_output_types)
+      nest.assert_same_structure(self.output_shapes, dataset_output_shapes)
       for iterator_class, dataset_class in zip(
           nest.flatten(self.output_classes),
-          nest.flatten(dataset.output_classes)):
+          nest.flatten(dataset_output_classes)):
         if iterator_class is not dataset_class:
           raise TypeError(
               "Expected output classes %r but got dataset with output class %r."
-              % (self.output_classes, dataset.output_classes))
+              % (self.output_classes, dataset_output_classes))
       for iterator_dtype, dataset_dtype in zip(
-          nest.flatten(self.output_types), nest.flatten(dataset.output_types)):
+          nest.flatten(self.output_types), nest.flatten(dataset_output_types)):
         if iterator_dtype != dataset_dtype:
           raise TypeError(
               "Expected output types %r but got dataset with output types %r." %
-              (self.output_types, dataset.output_types))
+              (self.output_types, dataset_output_types))
       for iterator_shape, dataset_shape in zip(
           nest.flatten(self.output_shapes), nest.flatten(
-              dataset.output_shapes)):
+              dataset_output_shapes)):
         if not iterator_shape.is_compatible_with(dataset_shape):
           raise TypeError("Expected output shapes compatible with %r but got "
                           "dataset with output shapes %r." %
-                          (self.output_shapes, dataset.output_shapes))
+                          (self.output_shapes, dataset_output_shapes))
     with ops.colocate_with(self._iterator_resource):
       return gen_dataset_ops.make_iterator(
           dataset._variant_tensor, self._iterator_resource, name=name)  # pylint: disable=protected-access
@@ -491,7 +502,7 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class EagerIterator(checkpointable.Checkpointable):
+class EagerIterator(trackable.Trackable):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
 
   def __init__(self, dataset):
@@ -525,18 +536,17 @@ class EagerIterator(checkpointable.Checkpointable):
       # pylint: disable=protected-access
       dataset = dataset._apply_options()
       ds_variant = dataset._variant_tensor
-      self._structure = structure_lib.convert_legacy_structure(
-          dataset.output_types, dataset.output_shapes, dataset.output_classes)
+      self._structure = dataset._element_structure
       self._flat_output_types = self._structure._flat_types
       self._flat_output_shapes = self._structure._flat_shapes
       with ops.colocate_with(ds_variant):
-        self._resource = gen_dataset_ops.anonymous_iterator(
+        self._iterator_resource = gen_dataset_ops.anonymous_iterator(
             output_types=self._flat_output_types,
             output_shapes=self._flat_output_shapes)
-        gen_dataset_ops.make_iterator(ds_variant, self._resource)
+        gen_dataset_ops.make_iterator(ds_variant, self._iterator_resource)
         # Delete the resource when this object is deleted
         self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-            handle=self._resource, handle_device=self._device)
+            handle=self._iterator_resource, handle_device=self._device)
       # pylint: enable=protected-access
 
   def __iter__(self):
@@ -562,7 +572,7 @@ class EagerIterator(checkpointable.Checkpointable):
         # to a background thread, and can achieve a small constant performance
         # boost by invoking the iterator synchronously.
         ret = gen_dataset_ops.iterator_get_next_sync(
-            self._resource,
+            self._iterator_resource,
             output_types=self._flat_output_types,
             output_shapes=self._flat_output_shapes)
 
@@ -636,12 +646,12 @@ class EagerIterator(checkpointable.Checkpointable):
   def _gather_saveables_for_checkpoint(self):
 
     def _saveable_factory(name):
-      return _IteratorSaveable(self._resource, name)
+      return _IteratorSaveable(self._iterator_resource, name)
 
     return {"ITERATOR": _saveable_factory}
 
 
-# TODO(b/71645805): Expose checkpointable stateful objects from dataset
+# TODO(b/71645805): Expose trackable stateful objects from dataset
 # attributes(potential).
 class _IteratorSaveable(BaseSaverBuilder.SaveableObject):
   """SaveableObject for saving/restoring iterator state."""
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 44aebb60cf5df12c5762a0522ab5f468df8a3875..efa8a11b75ba4c4c7094fc415c3b445076ebb337 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 class _PerDeviceGenerator(dataset_ops.DatasetV2):
@@ -41,13 +42,15 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
         gen_dataset_ops.multi_device_iterator_to_string_handle(
             multi_device_iterator_resource))
 
-    @function.defun()
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
+    @function.defun(autograph=False)  # Pure graph code.
     def _init_func():
       return multi_device_iterator_string_handle
 
     init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.defun()
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
+    @function.defun(autograph=False)  # Pure graph code.
     def _remote_init_func():
       return functional_ops.remote_call(
           target=source_device,
@@ -58,7 +61,10 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._init_captured_args = self._init_func.captured_inputs
 
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        autograph=False)  # Pure graph code.
     def _next_func(string_handle):
       # pylint: disable=protected-access
       multi_device_iterator = (
@@ -75,9 +81,11 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
     @function.defun_with_attributes(
         input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
-        attributes={"experimental_ints_on_device": True})
+        attributes={"experimental_ints_on_device": True},
+        autograph=False)  # Pure graph code.
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
@@ -88,13 +96,24 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._next_captured_args = self._next_func.captured_inputs
 
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    self._incarnation_id_index = -1
+    for i, arg in enumerate(self._next_captured_args):
+      if arg == incarnation_id:
+        self._incarnation_id_index = i
+
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        autograph=False)  # Pure graph code.
     def _finalize_func(unused_string_handle):
       return array_ops.constant(0, dtypes.int64)
 
     finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    # TODO(b/124254153): Enable autograph once the overhead is low enough.
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        autograph=False)  # Pure graph code.
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
@@ -125,15 +144,52 @@ class _PerDeviceGenerator(dataset_ops.DatasetV2):
     return self._structure
 
 
-class MultiDeviceIterator(object):
-  """An iterator over multiple devices.
+class _ReincarnatedPerDeviceGenerator(dataset_ops.DatasetV2):
+  """Creates a _PerDeviceGenerator-like dataset with a new incarnation_id.
 
-  @compatibility(eager)
-  MultiDeviceIterator isn't currently supported in Eager mode but support is
-  coming soon.
-  @end_compatibility
+  Re-uses the functions from the provided per_device_dataset and just switches
+  out the function argument corresponding to the incarnation_id.
   """
 
+  def __init__(self, per_device_dataset, incarnation_id):
+    # pylint: disable=protected-access
+    self._structure = per_device_dataset._structure
+
+    self._init_func = per_device_dataset._init_func
+    self._init_captured_args = self._init_func.captured_inputs
+
+    self._next_func = per_device_dataset._next_func
+    self._next_captured_args = per_device_dataset._next_captured_args
+    # The captured arguments to the next_func are string_handle, incarnation_id.
+    # We update the incarnation id to the new one.
+    self._next_captured_args[
+        per_device_dataset._incarnation_id_index] = incarnation_id
+
+    self._finalize_func = per_device_dataset._finalize_func
+    self._finalize_captured_args = per_device_dataset._finalize_captured_args
+
+    variant_tensor = gen_dataset_ops.generator_dataset(
+        self._init_captured_args,
+        self._next_captured_args,
+        self._finalize_captured_args,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
+    super(_ReincarnatedPerDeviceGenerator, self).__init__(variant_tensor)
+
+  def _inputs(self):
+    # TODO(b/116506223): Determine which datasets should be used as inputs here.
+    return []
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
+class MultiDeviceIterator(object):
+  """An iterator over multiple devices."""
+
   def __init__(self,
                dataset,
                devices,
@@ -150,6 +206,10 @@ class MultiDeviceIterator(object):
         to prefetch into.
       source_device: The host device to place the `dataset` on.
 
+      In order to prevent deadlocks, if the prefetch_buffer_size is greater
+      than the max_buffer_size, we set the max_buffer_size to
+      prefetch_buffer_size.
+
     Raises:
       RuntimeError: If run in Eager mode.
     """
@@ -157,28 +217,44 @@ class MultiDeviceIterator(object):
     self._devices = devices
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
+    self._max_buffer_size = max_buffer_size
+    self._prefetch_buffer_size = prefetch_buffer_size
+
+    if self._prefetch_buffer_size > self._max_buffer_size:
+      self._max_buffer_size = self._prefetch_buffer_size
 
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
       # TODO(b/121378567): Get rid of this shared_name hack.
       shared_name = ""
       if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        shared_name += str(ops.uid())
+        shared_name = context.shared_name()
       self._multi_device_iterator_resource = (
           gen_dataset_ops.multi_device_iterator(
               devices=self._devices,
               shared_name=shared_name,
               container="",
-              **dataset_ops.flat_structure(dataset)))
+              **dataset_ops.flat_structure(self._dataset)))
+      if context.executing_eagerly():
+        # Delete the resource when this object is deleted
+        self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+            handle=self._multi_device_iterator_resource,
+            handle_device=self._source_device)
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
       self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
           self._dataset._variant_tensor,  # pylint: disable=protected-access
           self._multi_device_iterator_resource,
-          max_buffer_size=max_buffer_size)
+          max_buffer_size=self._max_buffer_size)
+
+    self._prototype_device_datasets = []
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = _PerDeviceGenerator(
+            i, self._multi_device_iterator_resource, self._incarnation_id,
+            self._source_device_tensor, self._dataset._element_structure)  # pylint: disable=protected-access
+        self._prototype_device_datasets.append(ds)
 
     # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to
     # initialize the device side of the pipeline. This would allow the
@@ -188,17 +264,7 @@ class MultiDeviceIterator(object):
     self._device_iterators = []
     for i, device in enumerate(self._devices):
       with ops.device(device):
-        ds = _PerDeviceGenerator(
-            i, self._multi_device_iterator_resource, self._incarnation_id,
-            self._source_device_tensor, dataset._element_structure)  # pylint: disable=protected-access
-        if prefetch_buffer_size > 0:
-          ds = ds.prefetch(prefetch_buffer_size)
-        # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
-        # non-CPU devices.
-        options = dataset_ops.Options()
-        options.experimental_autotune = False
-        options.experimental_optimization.apply_default_optimizations = False
-        ds = ds.with_options(options)
+        ds = self._create_device_dataset(i)
         if context.executing_eagerly():
           self._device_iterators.append(dataset_ops.make_one_shot_iterator(ds))
         else:
@@ -211,6 +277,20 @@ class MultiDeviceIterator(object):
       ]
       self._initializer = control_flow_ops.group(*device_iterator_initializers)
 
+  def _create_device_dataset(self, i):
+    """Uses _prototype_device_datasets[i] to build a dataset for the device."""
+    ds = self._prototype_device_datasets[i]
+    ds = _ReincarnatedPerDeviceGenerator(ds, self._incarnation_id)
+    if self._prefetch_buffer_size > 0:
+      ds = ds.prefetch(self._prefetch_buffer_size)
+    # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+    # non-CPU devices.
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.autotune = False
+    ds = ds.with_options(options)
+    return ds
+
   def get_next(self, device=None):
     """Returns the next element given a `device`, else returns all in a list."""
     if device is not None:
@@ -237,14 +317,23 @@ class MultiDeviceIterator(object):
       return control_flow_ops.no_op()
     return self._initializer
 
-  @property
-  def output_types(self):
-    return self._dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._dataset.output_shapes
+  def _eager_reset(self):
+    """Resets the MultiDeviceIterator in eager mode."""
+    if not context.executing_eagerly():
+      raise ValueError("Eager reset is only supported in eager mode.")
+    # pylint: disable=protected-access
+    self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
+        self._dataset._variant_tensor,
+        self._multi_device_iterator_resource,
+        max_buffer_size=self._max_buffer_size)
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = self._create_device_dataset(i)
+        # Reset the device iterator resources with the new dataset.
+        ds_variant = ds._variant_tensor
+        gen_dataset_ops.make_iterator(
+            ds_variant, self._device_iterators[i]._iterator_resource)
 
   @property
-  def output_classes(self):
-    return self._dataset.output_classes
+  def _element_structure(self):
+    return dataset_ops.get_structure(self._dataset)
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 5e61bcf6be0a099b75d9190aad17a6046e70c665..477ff94a8d3b5784ab3129fae2398af0fd151944 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -197,10 +197,11 @@ class TFRecordDatasetV2(dataset_ops.DatasetV2):
       ValueError: If any argument does not have the expected shape.
     """
     if isinstance(filenames, dataset_ops.DatasetV2):
-      if filenames.output_types != dtypes.string:
+      if dataset_ops.get_legacy_output_types(filenames) != dtypes.string:
         raise TypeError(
             "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
-      if not filenames.output_shapes.is_compatible_with(tensor_shape.scalar()):
+      if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with(
+          tensor_shape.scalar()):
         raise ValueError(
             "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` "
             "elements.")
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e5abc654da77cd9409f52d3ba5c8868c0916c712..ebfd8af34233516d3f447d03735371b7e2be8f22 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -46,7 +46,7 @@ from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(list(dict_))
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -68,7 +68,7 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif (isinstance(instance, tuple) and
         hasattr(instance, "_fields") and
         isinstance(instance._fields, _collections.Sequence) and
@@ -317,8 +317,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
         raise ValueError(
             "The two structures don't have the same keys. Input "
             "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+            (list(input_tree), list(shallow_tree)))
       input_tree = list(sorted(_six.iteritems(input_tree)))
       shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 27a700f813cf0fd3896a85fd799b02776672795c..b757aa0629917e38cbe4903927e51affa07a26df 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -47,7 +47,9 @@ py_library(
         ":cli_test_utils",
         ":debug_py",
         ":grpc_debug_test_server",
-        ":offline_analyzer",
+        ":grpc_tensorflow_server",
+        ":grpc_tensorflow_server_lib",
+        ":offline_analyzer_lib",
         ":session_debug_testlib",
         ":source_remote",
     ] + if_not_windows([
@@ -111,6 +113,26 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_binary(
+    name = "grpc_tensorflow_server",
+    srcs = ["lib/grpc_tensorflow_server.py"],
+    srcs_version = "PY2AND3",
+    deps = [":grpc_tensorflow_server_lib"],
+)
+
+py_library(
+    name = "grpc_tensorflow_server_lib",
+    srcs = [
+        "lib/grpc_tensorflow_server.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+    ],
+)
+
 py_library(
     name = "source_utils",
     srcs = ["lib/source_utils.py"],
@@ -393,6 +415,13 @@ py_binary(
     name = "offline_analyzer",
     srcs = ["cli/offline_analyzer.py"],
     srcs_version = "PY2AND3",
+    deps = [":offline_analyzer_lib"],
+)
+
+py_library(
+    name = "offline_analyzer_lib",
+    srcs = ["cli/offline_analyzer.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":analyzer_cli",
         ":debug_data",
@@ -404,12 +433,12 @@ py_binary(
 py_library(
     name = "debug_examples",
     deps = [
-        ":debug_errors",
-        ":debug_fibonacci",
-        ":debug_keras",
+        ":debug_errors_lib",
+        ":debug_fibonacci_lib",
+        ":debug_keras_lib",
     ] + if_not_v2([
-        ":debug_mnist",
-        ":debug_tflearn_iris",
+        ":debug_mnist_lib",
+        ":debug_tflearn_iris_lib",
     ]),
 )
 
@@ -417,6 +446,13 @@ py_binary(
     name = "debug_fibonacci",
     srcs = ["examples/debug_fibonacci.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_fibonacci_lib"],
+)
+
+py_library(
+    name = "debug_fibonacci_lib",
+    srcs = ["examples/debug_fibonacci.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -429,6 +465,13 @@ py_binary(
     name = "debug_errors",
     srcs = ["examples/debug_errors.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_errors_lib"],
+)
+
+py_library(
+    name = "debug_errors_lib",
+    srcs = ["examples/debug_errors.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -440,6 +483,13 @@ py_binary(
     name = "debug_mnist",
     srcs = ["examples/debug_mnist.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_mnist_lib"],
+)
+
+py_library(
+    name = "debug_mnist_lib",
+    srcs = ["examples/debug_mnist.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -451,6 +501,13 @@ py_binary(
     name = "debug_tflearn_iris",
     srcs = ["examples/debug_tflearn_iris.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_tflearn_iris_lib"],
+)
+
+py_library(
+    name = "debug_tflearn_iris_lib",
+    srcs = ["examples/debug_tflearn_iris.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -462,6 +519,13 @@ py_binary(
     name = "debug_keras",
     srcs = ["examples/debug_keras.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_keras_lib"],
+)
+
+py_library(
+    name = "debug_keras_lib",
+    srcs = ["examples/debug_keras.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -527,6 +591,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -615,6 +680,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -774,6 +840,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -791,6 +858,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -808,6 +876,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -933,6 +1002,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows"],  # TODO: needs investigation on Windows
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -973,6 +1043,13 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    tags = [
+        "manual",
+        "no_pip",
+        "no_windows",
+        "notap",
+    ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -1000,6 +1077,7 @@ cuda_py_test(
         "notsan",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -1025,6 +1103,7 @@ cuda_py_test(
         "optonly",  # Test flaky (b/80130873)
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 # TODO(cais): Run the test in OSS, perhaps through a sh_test.
@@ -1046,13 +1125,14 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
-    data = ["//tensorflow/tools/dist_test/server:grpc_tensorflow_server"],
+    data = [":grpc_tensorflow_server"],
     grpc_enabled = True,
     tags = [
         "no_oss",  # Incompatible with bazel_pip.
         "no_windows",
         "notsan",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
diff --git a/tensorflow/python/debug/cli/stepper_cli.py b/tensorflow/python/debug/cli/stepper_cli.py
index 94eb2754da21b2a6c66271f53a2a0917deb25515..fe1a012a5444a0140edd15b9e66a4de0449a5e47 100644
--- a/tensorflow/python/debug/cli/stepper_cli.py
+++ b/tensorflow/python/debug/cli/stepper_cli.py
@@ -251,6 +251,9 @@ class NodeStepperCLI(object):
       lines.extend(
           ["Topologically-sorted transitive input(s) and fetch(es):", ""])
 
+    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+    self._add_deprecation_warning(output)
+
     for i, element_name in enumerate(self._sorted_nodes):
       if i < index_range[0] or i >= index_range[1]:
         continue
@@ -269,15 +272,36 @@ class NodeStepperCLI(object):
           override_names,
           dirty_variable_names)
 
-      lines.append(node_prefix + "] " + element_name)
-
-    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+      output.append_rich_line(node_prefix + "] " + element_name)
 
     if verbose:
       output.extend(self._node_status_label_legend())
 
     return output
 
+  def _add_deprecation_warning(self, message):
+    """Add deprecation warning as RichTextLines."""
+    color = "yellow"
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "WARNING: the invoke_stepper feature of tfdbg has been deprecated ",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "and will be removed in the next release of TensorFlow.",
+            color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "There now exist better alternatives of stepping debugging, "
+            "including:",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- TensorBoard Debugger Plugin", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- Eager Execution", color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+
   def _get_status_labels(self,
                          element_name,
                          handle_node_names,
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 5cf69d0168b70a4d03162512b5024736c50cf23a..c728373ae2bf75b216415034ec275fc2bd29b15a 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -235,6 +235,9 @@ class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
       ], output.lines)
 
   def testContToValidNodeShouldUpdateStatus(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with stepper.NodeStepper(self.sess, self.e) as node_stepper:
       cli = stepper_cli.NodeStepperCLI(node_stepper)
 
@@ -275,6 +278,9 @@ class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
       self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[index_d])
 
   def testSteppingOneStepAtATimeShouldUpdateStatus(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with stepper.NodeStepper(self.sess, self.e) as node_stepper:
       cli = stepper_cli.NodeStepperCLI(node_stepper)
 
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 2405e29aaa51c2e0c422fa6f950ec46553ae75c0..67bac8533f5a1ddb33152a2cc6a08df92020ffa5 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -60,8 +60,7 @@ class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
     cluster_spec = "worker|localhost:%d" % worker_port
     tf_logging.info("cluster_spec: %s", cluster_spec)
 
-    server_bin = test.test_src_dir_path(
-        "tools/dist_test/server/grpc_tensorflow_server")
+    server_bin = test.test_src_dir_path("python/debug/grpc_tensorflow_server")
 
     cls.server_target = "grpc://localhost:%d" % worker_port
 
diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/python/debug/lib/grpc_tensorflow_server.py
similarity index 94%
rename from tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
rename to tensorflow/python/debug/lib/grpc_tensorflow_server.py
index bd6700a0b1f43208b317e14953c1110cbe39248b..312ba687c55636d73b562cbae000eb8d0a48df9a 100644
--- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
+++ b/tensorflow/python/debug/lib/grpc_tensorflow_server.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -39,6 +38,7 @@ import sys
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python.platform import app
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
 
@@ -74,7 +74,7 @@ def parse_cluster_spec(cluster_spec, cluster, verbose=False):
     job_def.name = job_name
 
     if verbose:
-      print("Added job named \"%s\"" % job_name)
+      logging.info("Added job named \"%s\"", job_name)
 
     job_tasks = job_string.split("|")[1].split(";")
     for i in range(len(job_tasks)):
@@ -84,7 +84,8 @@ def parse_cluster_spec(cluster_spec, cluster, verbose=False):
       job_def.tasks[i] = job_tasks[i]
 
       if verbose:
-        print("  Added task \"%s\" to job \"%s\"" % (job_tasks[i], job_name))
+        logging.info("  Added task \"%s\" to job \"%s\"",
+                     job_tasks[i], job_name)
 
 
 def main(unused_args):
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
index 29add425e946aadfe941c73e9f9cef4aef3c8a9c..dce400c9ab0b6be3cabaea7c465baa1a6d2f471d 100644
--- a/tensorflow/python/debug/lib/source_remote_test.py
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -48,7 +48,8 @@ class SendTracebacksTest(test_util.TensorFlowTestCase):
     test_util.TensorFlowTestCase.setUpClass()
     (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
      cls._server_thread,
-     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
     cls._server_address = "localhost:%d" % cls._server_port
     (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
      cls._server_thread_2,
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 9e78e207b80a99f3812c5909cf3753d90eab3680..bec858a1ba6ce1df58a8fc8d18f7a4f802f7d87e 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -94,6 +94,9 @@ class StepperTest(test_util.TensorFlowTestCase):
       self.assertAllClose(6.0, stepper.cont("c"))
 
   def testUsingNamesNotUsingIntermediateTensors(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, "e:0") as stepper:
       # The first cont() call should have used no feeds.
       result = stepper.cont("c:0")
@@ -119,6 +122,9 @@ class StepperTest(test_util.TensorFlowTestCase):
       }, stepper.last_feed_types())
 
   def testUsingNodesNotUsingIntermediateTensors(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, self.e) as stepper:
       # There should be no handles before any cont() calls.
       self.assertEqual([], stepper.handle_names())
@@ -493,6 +499,9 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
       self.assertSetEqual({"ph0", "ph1"}, set(stepper.placeholders()))
 
   def testContWithPlaceholders(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(
         self.sess,
         self.y,
@@ -739,6 +748,9 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
     ops.reset_default_graph()
 
   def testContToUpdateA(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, "optim") as stepper:
       result = stepper.cont("a:0")
       self.assertAllClose(1.0, result)
@@ -887,6 +899,8 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
     "clean" means no Variables have been updated by preceding cont() calls.
     """
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
 
     with NodeStepper(self.sess, "optim") as stepper:
       # First, call cont() on the two tensors on the intermediate level: e and
@@ -979,6 +993,8 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
   def testOverrideThenContToUpdateThenRemoveOverrideThenUpdateAgain(self):
     """Test cont() to update nodes after overriding tensor values."""
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
 
     with NodeStepper(self.sess, "optim") as stepper:
       result = stepper.cont("d:0")
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index a6a1c470b413958d524eed7488c35961b55d9912..9f9e285cce209e7bb104f3594f3f16298c07969f 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:context",
+        "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
 )
@@ -101,6 +102,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -110,8 +112,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":cross_device_ops",
         ":distribute_lib",
         ":mirrored_strategy",
+        ":one_device_strategy",
+        "//tensorflow/python/distribute/experimental",
     ],
 )
 
@@ -138,7 +143,6 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
-        "//tensorflow/python/ops/losses",
         "//tensorflow/tools/docs:doc_controls",
     ],
 )
@@ -185,9 +189,6 @@ py_test(
     name = "distribute_coordinator_test",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":distribute_coordinator",
         "//tensorflow/core:protos_all_py",
@@ -267,6 +268,47 @@ py_library(
     ],
 )
 
+py_library(
+    name = "one_device_strategy",
+    srcs = ["one_device_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":distribute_lib",
+        ":input_lib",
+        ":numpy_dataset",
+        ":reduce_util",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "collective_all_reduce_strategy",
+    srcs = ["collective_all_reduce_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":mirrored_strategy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:collective_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:numpy_dataset",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -316,10 +358,7 @@ py_library(
         ":distribute_lib",
         ":input_ops",
         ":values",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/eager:context",
     ],
@@ -330,7 +369,6 @@ py_library(
     srcs = ["input_ops.py"],
     deps = [
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
         "//tensorflow/python/data/util:nest",
     ],
 )
@@ -350,16 +388,13 @@ cuda_py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:util",
     ],
-    tags = [
-        "no_pip",
-    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
     deps = [
         ":multi_worker_util",
         "//tensorflow/core:protos_all_py",
@@ -374,6 +409,26 @@ py_test(
     ],
 )
 
+py_library(
+    name = "tpu_strategy",
+    srcs = ["tpu_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":one_device_strategy",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/tpu:tpu_lib",
+    ],
+)
+
 # Used only by estimator.
 py_library(
     name = "estimator_training",
@@ -414,6 +469,16 @@ py_test(
     ],
 )
 
+py_library(
+    name = "summary_op_util",
+    srcs = ["summary_op_util.py"],
+    deps = [
+        ":distribute_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "values",
     srcs = ["values.py"],
@@ -427,7 +492,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
index 4ff912ae10d8336cfeeb42d060bd0d9c52e24482..f9d0a95ea580a8bb125e6610c232d1eabfe105a6 100644
--- a/tensorflow/python/distribute/__init__.py
+++ b/tensorflow/python/distribute/__init__.py
@@ -19,7 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute import cross_device_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.distribute.experimental import collective_all_reduce_strategy
+from tensorflow.python.distribute.experimental import parameter_server_strategy
 # pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py
index ef87f59b7fd7ef1774ed97370c75e16f3ec4e295..39ea191fb04a9e6a8c091eabff9fb5aeec888dfd 100644
--- a/tensorflow/python/distribute/cluster_resolver/__init__.py
+++ b/tensorflow/python/distribute/cluster_resolver/__init__.py
@@ -18,40 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.distribute.cluster_resolver import cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import gce_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import kubernetes_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import slurm_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tfconfig_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
-
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'cluster_resolver',
-    'gce_cluster_resolver',
-    'kubernetes_cluster_resolver',
-    'slurm_cluster_resolver',
-    'tfconfig_cluster_resolver',
-    'tpu_cluster_resolver',
-    'ClusterResolver',
-    'SimpleClusterResolver',
-    'UnionClusterResolver',
-    'GceClusterResolver',
-    'KubernetesClusterResolver',
-    'TFConfigClusterResolver',
-    'TPUClusterResolver',
-    'SlurmClusterResolver',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
-
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
index 73188bd7caaeb8f60e1e19dc11ce20e0a4349433..c636c98254c19b70720b252e0c52d56c87b41572 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -20,11 +20,18 @@ from __future__ import print_function
 
 import abc
 
+import collections
+import re
 import six
 
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
+
+
+DEVICE_TYPE_REGEX = re.compile('.*device:([^:]+).*')
 
 
 def format_master_url(master, rpc_layer=None):
@@ -35,13 +42,27 @@ def format_master_url(master, rpc_layer=None):
 
 
 def get_accelerator_devices(master, config_proto):
-  # TODO(frankchn): Add support for eager mode as well as graph mode.
-  with ops.Graph().as_default():
-    with session.Session(master, config=config_proto) as s:
-      devices = s.list_devices()
-  return devices
+  """Returns accelerator devices given a master and a configuration."""
+  if context.executing_eagerly():
+    device_names = context.list_devices()  # list_devices returns list(string)
+    devices = []
+    for name in device_names:
+      device_type = 'GPU'  # default device type is GPU
+      device_match = DEVICE_TYPE_REGEX.match(name)
+      if device_match:
+        device_type = device_match.group(1)
+      if device_type == 'CPU' or device_type == 'XLA_CPU':  # Filter CPUs
+        continue
+      devices.append(session._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
+    return devices
+  else:
+    with ops.Graph().as_default():
+      with session.Session(master, config=config_proto) as s:
+        devices = s.list_devices()
+    return devices
 
 
+@tf_export('distribute.cluster_resolver.ClusterResolver')
 @six.add_metaclass(abc.ABCMeta)
 class ClusterResolver(object):
   """Abstract class for all implementations of ClusterResolvers.
@@ -56,13 +77,13 @@ class ClusterResolver(object):
   underlying machine failures and scale TensorFlow worker clusters up and down.
 
   Note to Implementors: In addition to these abstract methods, you must also
-  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  implement the task_type, task_id, and rpc_layer attributes. You may choose
   to implement them either as properties with getters or setters or directly
   set the attributes.
 
   - task_type is the name of the server's current named job (e.g. 'worker',
      'ps' in a distributed parameterized training job).
-  - task_index is the ordinal index of the server within the task type.
+  - task_id is the ordinal index of the server within the task type.
   - rpc_layer is the protocol used by TensorFlow to communicate with other
       TensorFlow servers in a distributed environment.
   """
@@ -84,12 +105,12 @@ class ClusterResolver(object):
     raise NotImplementedError()
 
   @abc.abstractmethod
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Retrieves the name or URL of the session master.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
@@ -103,53 +124,74 @@ class ClusterResolver(object):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
+                       task_id=None,
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
 
     This returns the number of accelerator cores (such as GPUs and TPUs)
-    available per worker. If workers only has CPU cores available, then this
-    should return 0. This method will query the master for this information
-    if it is not otherwise known.
+    available per worker.
 
-    Optionally, we allow callers to specify the task_type, task_index, and
-    rpc_layer, if they want to target a specific TensorFlow process to query
+    Optionally, we allow callers to specify the task_type, and task_id, for
+    if they want to target a specific TensorFlow process to query
     the number of accelerators. This is to support heterogenous environments,
     where the number of accelerators cores per host is different.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the machine we
         want to query.
-      task_index: (Optional) The index of the TensorFlow task of the machine we
+      task_id: (Optional) The index of the TensorFlow task of the machine we
         want to query.
-      accelerator_type: (Optional) The type of accelerator we are trying to
-        query (defaults to 'GPU').
       config_proto: (Optional) Configuration for starting a new session to
         query how many accelerator cores it has.
+
+    Returns:
+      A map of accelerator types to number of cores.
     """
-    master = self.master(task_type, task_index)
+    master = self.master(task_type, task_id)
     devices = get_accelerator_devices(master, config_proto)
-    return sum(1 for d in devices if d.device_type == accelerator_type)
+    mapping = collections.defaultdict(int)
+    for device in devices:
+      if task_type is not None and task_id is not None:
+        job_path = '/job:%s' % task_type
+        task_path = '/task:%s' % task_id
+        if job_path not in device.name or task_path not in device.name:
+          continue
+      mapping[device.device_type] += 1
+    return mapping
 
-  @abc.abstractproperty
+  @property
   def environment(self):
-    """Returns the current environment which TensorFlow is running in."""
-    raise NotImplementedError()
+    """Returns the current environment which TensorFlow is running in.
+
+    There are two possible return values, "google" (when TensorFlow is running
+    in a Google-internal environment) or an empty string (when TensorFlow is
+    running elsewhere).
+
+    If you are implementing a ClusterResolver that works in both the Google
+    environment and the open-source world (for instance, a TPU ClusterResolver
+    or similar), you will have to return the appropriate string depending on the
+    environment, which you will have to detect.
+
+    Otherwise, if you are implementing a ClusterResolver that will only work
+    in open-source TensorFlow, you do not need to implement this property.
+    """
+    return ''
 
 
+@tf_export('distribute.cluster_resolver.SimpleClusterResolver')
 class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
-  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
-               environment='', num_accelerators=0,
+  def __init__(self, cluster_spec, master='', task_type=None, task_id=None,
+               environment='', num_accelerators=None,
                rpc_layer=None):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
 
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._environment = environment
+
     self._num_accelerators = num_accelerators
     self._rpc_layer = rpc_layer
 
@@ -165,22 +207,22 @@ class SimpleClusterResolver(ClusterResolver):
     """Returns the ClusterSpec passed into the constructor."""
     return self._cluster_spec
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC used by distributed TensorFlow.
 
     Returns:
       The name or URL of the session master.
 
-    If a task_type and task_index is given, this will override the `master`
+    If a task_type and task_id is given, this will override the `master`
     string passed into the initialization function.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
     else:
       master = self._master
 
@@ -191,16 +233,16 @@ class SimpleClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -208,24 +250,24 @@ class SimpleClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
+                       task_id=None,
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
 
     The SimpleClusterResolver does not do automatic detection of accelerators,
     so a TensorFlow session will never be created, and thus all arguments are
-    unused and we simply return whatever was passed in when this object was
-    initialized.
+    unused and we simply assume that the type of accelerator is a GPU and return
+    the value in provided to us in the constructor.
 
     Args:
       task_type: Unused.
-      task_index: Unused.
-      accelerator_type: Unused.
+      task_id: Unused.
       config_proto: Unused.
     """
     # Unused
-    del task_type, task_index, accelerator_type, config_proto
+    del task_type, task_id, config_proto
+    if self._num_accelerators is None:
+      return {}
     return self._num_accelerators
 
   @property
@@ -237,6 +279,7 @@ class SimpleClusterResolver(ClusterResolver):
     self._rpc_layer = rpc_layer
 
 
+@tf_export('distribute.cluster_resolver.UnionResolver')
 class UnionClusterResolver(ClusterResolver):
   """Performs a union on underlying ClusterResolvers.
 
@@ -259,7 +302,7 @@ class UnionClusterResolver(ClusterResolver):
         rpc_layer - (Optional) Override value for the RPC layer used by
           TensorFlow.
         task_type - (Optional) Override value for the current task type.
-        task_index - (Optional) Override value for the current task index.
+        task_id - (Optional) Override value for the current task index.
 
     Raises:
       TypeError: If any argument is not a subclass of `ClusterResolvers`.
@@ -269,7 +312,7 @@ class UnionClusterResolver(ClusterResolver):
 
     self._rpc_layer = kwargs.pop('rpc_layer', None)
     self._task_type = kwargs.pop('task_type', None)
-    self._task_index = kwargs.pop('task_index', None)
+    self._task_id = kwargs.pop('task_id', None)
 
     if kwargs:
       raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
@@ -357,22 +400,22 @@ class UnionClusterResolver(ClusterResolver):
 
     return ClusterSpec(merged_cluster)
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     This usually returns the master from the first ClusterResolver passed in,
-    but you can override this by specifying the task_type and task_index.
+    but you can override this by specifying the task_type and task_id.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       return format_master_url(master, rpc_layer or self._rpc_layer)
 
     return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
@@ -382,16 +425,16 @@ class UnionClusterResolver(ClusterResolver):
     return self._task_type or self._cluster_resolvers[0].task_type
 
   @property
-  def task_index(self):
-    return self._task_index or self._cluster_resolvers[0].task_index
+  def task_id(self):
+    return self._task_id or self._cluster_resolvers[0].task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -399,11 +442,10 @@ class UnionClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
+                       task_id=None,
                        config_proto=None):
     return self._cluster_resolvers[0].num_accelerators(
-        task_type, task_index, accelerator_type, config_proto)
+        task_type, task_id, config_proto)
 
   @property
   def rpc_layer(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
index 0ff6b6be62122b3a7b71124613a694d9bb5fd357..c9aebbb46850f051eb02f2589c00eb3f29c0882d 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -18,10 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import eager
 from tensorflow.python.client import session
 from tensorflow.python.distribute.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -33,17 +35,20 @@ class MockBaseClusterResolver(ClusterResolver):
   def cluster_spec(self):
     return None
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     return ""
 
   def environment(self):
     return ""
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BaseClusterResolverTest(test.TestCase):
 
+  @mock.patch.object(eager.context, "list_devices")
   @mock.patch.object(session.BaseSession, "list_devices")
-  def testNumAcceleratorsSuccess(self, mock_list_devices):
+  def testNumAcceleratorsSuccess(self, mock_list_devices,
+                                 mock_eager_list_devices):
     device_names = [
         "/job:worker/task:0/device:GPU:0",
         "/job:worker/task:0/device:GPU:1",
@@ -51,30 +56,67 @@ class BaseClusterResolverTest(test.TestCase):
         "/job:worker/task:0/device:GPU:3",
     ]
     device_list = [
-        session._DeviceAttributes(
-            name, "GPU", 1024, 0) for name in device_names
+        session._DeviceAttributes(name, "GPU", 1024, 0)
+        for name in device_names
     ]
+    mock_eager_list_devices.return_value = device_names
     mock_list_devices.return_value = device_list
 
     resolver = MockBaseClusterResolver()
-    self.assertEqual(resolver.num_accelerators(), 4)
+    self.assertEqual(resolver.num_accelerators(), {"GPU": 4})
 
+  @mock.patch.object(eager.context, "list_devices")
   @mock.patch.object(session.BaseSession, "list_devices")
-  def testNumAcceleratorsFilterSuccess(self, mock_list_devices):
+  def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices,
+                                            mock_eager_list_devices):
     device_names = [
         "/job:worker/task:0/device:TPU:0",
         "/job:worker/task:0/device:TPU:1",
         "/job:worker/task:0/device:TPU:2",
         "/job:worker/task:0/device:TPU:3",
+        "/job:worker/task:0/device:GPU:0",
+        "/job:worker/task:0/device:GPU:1",
+        "/job:worker/task:0/device:GPU:2",
+        "/job:worker/task:0/device:GPU:3",
+    ]
+    device_list = [
+        session._DeviceAttributes(name, name[26:29], 1024, 0)
+        for name in device_names
+    ]
+    mock_eager_list_devices.return_value = device_names
+    mock_list_devices.return_value = device_list
+
+    resolver = MockBaseClusterResolver()
+    self.assertEqual(resolver.num_accelerators(), {"TPU": 4, "GPU": 4})
+
+  @mock.patch.object(eager.context, "list_devices")
+  @mock.patch.object(session.BaseSession, "list_devices")
+  def testNumAcceleratorsFilterTasks(self, mock_list_devices,
+                                     mock_eager_list_devices):
+    device_names = [
+        "/job:worker1/task:0/device:TPU:0",
+        "/job:worker1/task:0/device:TPU:1",
+        "/job:worker1/task:0/device:GPU:0",
+        "/job:worker1/task:0/device:GPU:1",
+        "/job:worker2/task:1/device:TPU:2",
+        "/job:worker2/task:2/device:TPU:3",
+        "/job:worker2/task:3/device:GPU:2",
+        "/job:worker2/task:4/device:GPU:3",
     ]
     device_list = [
-        session._DeviceAttributes(
-            name, "TPU", 1024, 0) for name in device_names
+        session._DeviceAttributes(name, name[27:30], 1024, 0)
+        for name in device_names
     ]
+    mock_eager_list_devices.return_value = device_names
     mock_list_devices.return_value = device_list
 
     resolver = MockBaseClusterResolver()
-    self.assertEqual(resolver.num_accelerators(), 0)
+    self.assertEqual(resolver.num_accelerators(task_type="worker1", task_id=0),
+                     {"TPU": 2, "GPU": 2})
+    self.assertEqual(resolver.num_accelerators(task_type="worker2", task_id=3),
+                     {"GPU": 1})
+    self.assertEqual(resolver.num_accelerators(task_type="worker2", task_id=4),
+                     {"GPU": 1})
 
 
 class UnionClusterResolverTest(test.TestCase):
@@ -117,14 +159,14 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
-                                            num_accelerators=8,
+                                            task_id=1, environment="cloud",
+                                            num_accelerators={"GPU": 8},
                                             rpc_layer="grpc")
 
     self.assertEqual(simple_resolver.task_type, "ps")
-    self.assertEqual(simple_resolver.task_index, 1)
+    self.assertEqual(simple_resolver.task_id, 1)
     self.assertEqual(simple_resolver.environment, "cloud")
-    self.assertEqual(simple_resolver.num_accelerators(), 8)
+    self.assertEqual(simple_resolver.num_accelerators(), {"GPU": 8})
     self.assertEqual(simple_resolver.rpc_layer, "grpc")
 
   def testOverrideSimpleClusterResolver(self):
@@ -134,16 +176,16 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
-                                            num_accelerators=8,
+                                            task_id=1, environment="cloud",
+                                            num_accelerators={"GPU": 8},
                                             rpc_layer="grpc")
 
     simple_resolver.task_type = "worker"
-    simple_resolver.task_index = 2
+    simple_resolver.task_id = 2
     simple_resolver.rpc_layer = "http"
 
     self.assertEqual(simple_resolver.task_type, "worker")
-    self.assertEqual(simple_resolver.task_index, 2)
+    self.assertEqual(simple_resolver.task_id, 2)
     self.assertEqual(simple_resolver.rpc_layer, "http")
 
   def testSimpleOverrideMasterWithTaskIndexZero(self):
@@ -182,8 +224,8 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
     resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
-                                      task_index=1, environment="cloud",
-                                      num_accelerators=8,
+                                      task_id=1, environment="cloud",
+                                      num_accelerators={"GPU": 8},
                                       rpc_layer="grpc")
 
     cluster_spec_2 = server_lib.ClusterSpec({
@@ -191,24 +233,24 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker3:2222", "worker4:2222", "worker5:2222"]
     })
     resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
-                                      task_index=2, environment="local",
-                                      num_accelerators=16,
+                                      task_id=2, environment="local",
+                                      num_accelerators={"GPU": 16},
                                       rpc_layer="http")
 
     union_resolver = UnionClusterResolver(resolver1, resolver2)
 
     self.assertEqual(union_resolver.task_type, "ps")
-    self.assertEqual(union_resolver.task_index, 1)
+    self.assertEqual(union_resolver.task_id, 1)
     self.assertEqual(union_resolver.environment, "cloud")
-    self.assertEqual(union_resolver.num_accelerators(), 8)
+    self.assertEqual(union_resolver.num_accelerators(), {"GPU": 8})
     self.assertEqual(union_resolver.rpc_layer, "grpc")
 
     union_resolver.task_type = "worker"
-    union_resolver.task_index = 2
+    union_resolver.task_id = 2
     union_resolver.rpc_layer = "http"
 
     self.assertEqual(union_resolver.task_type, "worker")
-    self.assertEqual(union_resolver.task_index, 2)
+    self.assertEqual(union_resolver.task_id, 2)
     self.assertEqual(union_resolver.rpc_layer, "http")
 
   def testTwoNonOverlappingJobMergedClusterResolver(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
index 06512613cbe34b09730dd7c6914ea9d7098204d5..9d7dfdd1ea9078ae4fd5fcf1da0f56a3f8b91a1f 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
+
 
 _GOOGLE_API_CLIENT_INSTALLED = True
 try:
@@ -29,11 +31,8 @@ except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
 
-def _format_master_url(master, rpc_layer=None):
-  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
-
-
-class GceClusterResolver(ClusterResolver):
+@tf_export('distribute.cluster_resolver.GCEClusterResolver')
+class GCEClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Compute Engine.
 
   This is an implementation of cluster resolvers for the Google Compute Engine
@@ -49,13 +48,13 @@ class GceClusterResolver(ClusterResolver):
                instance_group,
                port,
                task_type='worker',
-               task_index=0,
+               task_id=0,
                rpc_layer='grpc',
                credentials='default',
                service=None):
-    """Creates a new GceClusterResolver object.
+    """Creates a new GCEClusterResolver object.
 
-    This takes in a few parameters and creates a GceClusterResolver project. It
+    This takes in a few parameters and creates a GCEClusterResolver project. It
     will then use these parameters to query the GCE API for the IP addresses of
     each instance in the instance group.
 
@@ -66,7 +65,7 @@ class GceClusterResolver(ClusterResolver):
       port: Port of the listening TensorFlow server (default: 8470)
       task_type: Name of the TensorFlow job this GCE instance group of VM
         instances belong to.
-      task_index: The task index for this particular VM, within the GCE
+      task_id: The task index for this particular VM, within the GCE
         instance group. In particular, every single instance should be assigned
         a unique ordinal index within an instance group manually so that they
         can be distinguished from each other.
@@ -85,7 +84,7 @@ class GceClusterResolver(ClusterResolver):
     self._zone = zone
     self._instance_group = instance_group
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
     self._port = port
     self._credentials = credentials
@@ -149,12 +148,12 @@ class GceClusterResolver(ClusterResolver):
     worker_list.sort()
     return ClusterSpec({self._task_type: worker_list})
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     task_type = task_type if task_type is not None else self._task_type
-    task_index = task_index if task_index is not None else self._task_index
+    task_id = task_id if task_id is not None else self._task_id
 
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       if rpc_layer or self._rpc_layer:
         return '%s://%s' % (rpc_layer or self._rpc_layer, master)
       else:
@@ -167,28 +166,18 @@ class GceClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     raise RuntimeError(
-        'You cannot reset the task_type of the GceClusterResolver after it has '
+        'You cannot reset the task_type of the GCEClusterResolver after it has '
         'been created.')
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
-
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the GCE environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def rpc_layer(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
index d4f0660c922d593d81c0927dea0d6271e89c53e1..47d1cdc0da9689d78647d8a584267707c6e85e64 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for GceClusterResolver."""
+"""Tests for GCEClusterResolver."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.distribute.cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -27,7 +27,7 @@ from tensorflow.python.training import server_lib
 mock = test.mock
 
 
-class GceClusterResolverTest(test.TestCase):
+class GCEClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
     self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
@@ -121,7 +121,7 @@ class GceClusterResolverTest(test.TestCase):
     return self.standard_mock_service_client(mock_instance_group, mock_instance)
 
   def testSimpleSuccessfulRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -136,11 +136,11 @@ class GceClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
   def testMasterRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        task_index=0,
+        task_id=0,
         port=8470,
         credentials=None,
         service=self.standard_mock_service_client())
@@ -153,7 +153,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -172,7 +172,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -181,11 +181,11 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
-    gce_cluster_resolver.task_index = 1
+    gce_cluster_resolver.task_id = 1
     gce_cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual(gce_cluster_resolver.task_type, 'testworker')
-    self.assertEqual(gce_cluster_resolver.task_index, 1)
+    self.assertEqual(gce_cluster_resolver.task_id, 1)
     self.assertEqual(gce_cluster_resolver.rpc_layer, 'test')
     self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470')
 
@@ -196,21 +196,21 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
         task_type='',
-        task_index=1,
+        task_id=1,
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
     self.assertEqual(gce_cluster_resolver.master(
-        task_type='', task_index=0), 'grpc://10.1.2.3:8470')
+        task_type='', task_id=0), 'grpc://10.1.2.3:8470')
 
   def testCustomJobNameAndPortRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -232,7 +232,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -266,7 +266,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'ps2', 'ip': '10.100.2.3'},
     ]
 
-    worker1_gce_cluster_resolver = GceClusterResolver(
+    worker1_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -275,7 +275,7 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(worker1_name_to_ip))
 
-    worker2_gce_cluster_resolver = GceClusterResolver(
+    worker2_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -284,7 +284,7 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(worker2_name_to_ip))
 
-    ps_gce_cluster_resolver = GceClusterResolver(
+    ps_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
index 7ff6ec0f2d5c6f6d2315e98cf5e7250b118fbadd..28b2712590d0519f1dbbdde1b43fab829238fa25 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.tf_export import tf_export
 
 _KUBERNETES_API_CLIENT_INSTALLED = True
 try:
@@ -30,6 +31,7 @@ except ImportError:
   _KUBERNETES_API_CLIENT_INSTALLED = False
 
 
+@tf_export('distribute.cluster_resolver.KubernetesClusterResolver')
 class KubernetesClusterResolver(ClusterResolver):
   """Cluster Resolver for Kubernetes.
 
@@ -88,31 +90,31 @@ class KubernetesClusterResolver(ClusterResolver):
     self._override_client = override_client
 
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
-    You must have set the task_type and task_index object properties before
-    calling this function, or pass in the `task_type` and `task_index`
+    You must have set the task_type and task_id object properties before
+    calling this function, or pass in the `task_type` and `task_id`
     parameters when using this function. If you do both, the function parameters
     will override the object properties.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    if task_type is not None and task_index is not None:
+    if task_type is not None and task_id is not None:
       return format_master_url(
-          self.cluster_spec().task_address(task_type, task_index),
+          self.cluster_spec().task_address(task_type, task_id),
           rpc_layer or self.rpc_layer)
 
     return ''
@@ -154,13 +156,3 @@ class KubernetesClusterResolver(ClusterResolver):
       cluster_map[tf_job] = all_pods
 
     return server_lib.ClusterSpec(cluster_map)
-
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the Cloud environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
index a9750fa60b993a3504bbd01f0663cfdf868a2f01..f4e4cd82129a807cc62b81e7b7ac07d6b7c8d92c 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
@@ -119,9 +119,9 @@ class KubernetesClusterResolverTest(test.TestCase):
         override_client=_mock_kubernetes_client(
             {'job-name=tensorflow': ret}))
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 0
+    cluster_resolver.task_id = 0
     self.assertEqual(cluster_resolver.task_type, 'worker')
-    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.task_id, 0)
     self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
     self.assertEqual(cluster_resolver.master('worker', 2),
                      'grpc://10.1.2.5:8470')
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
index 9dbe25b613447fde2140585742d005dab82fb018..0e49cebee2b5d4602ff8025126e9cd506647030f 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -25,8 +25,10 @@ import subprocess
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('distribute.cluster_resolver.SlurmClusterResolver')
 class SlurmClusterResolver(ClusterResolver):
   """Cluster Resolver for system with Slurm workload manager.
 
@@ -112,7 +114,7 @@ class SlurmClusterResolver(ClusterResolver):
 
     self._auto_set_gpu = auto_set_gpu
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
     self._gpu_allocation = []
@@ -170,7 +172,7 @@ class SlurmClusterResolver(ClusterResolver):
 
       if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
         self.task_type = task_type
-        self.task_index = self._rank - cluster_rank_offset_start
+        self.task_id = self._rank - cluster_rank_offset_start
 
       cluster_rank_offset_start = cluster_rank_offset_end
 
@@ -180,7 +182,7 @@ class SlurmClusterResolver(ClusterResolver):
     return ClusterSpec(self._cluster_allocation)
 
   def get_task_info(self):
-    """Returns job name and task_index for the process which calls this.
+    """Returns job name and task_id for the process which calls this.
 
     This returns the job name and task index for the process which calls this
     function according to its rank and cluster specification. The job name and
@@ -191,14 +193,14 @@ class SlurmClusterResolver(ClusterResolver):
       A string specifying job name the process belongs to and an integner
         specifying the task index the process belongs to in that job.
     """
-    return self.task_type, self.task_index
+    return self.task_type, self.task_id
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master string for connecting to a TensorFlow master.
 
     Args:
       task_type: (Optional) Overrides the default auto-selected task type.
-      task_index: (Optional) Overrides the default auto-slected task index.
+      task_id: (Optional) Overrides the default auto-slected task index.
       rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
         to communicate across nodes.
 
@@ -206,30 +208,19 @@ class SlurmClusterResolver(ClusterResolver):
       A connection string for connecting to a TensorFlow master.
     """
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    if task_type is not None and task_index is not None:
+    if task_type is not None and task_id is not None:
       return format_master_url(
-          self.cluster_spec().task_address(task_type, task_index),
+          self.cluster_spec().task_address(task_type, task_id),
           rpc_layer or self.rpc_layer)
 
     return ''
 
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the Slurm environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
-
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
+                       task_id=None,
                        config_proto=None):
     # Unused, since this is set in __init__ manually.
-    del task_type, task_index, accelerator_type, config_proto
-    return self._gpus_per_node
+    del task_type, task_id, config_proto
+    return {'GPU': self._gpus_per_node}
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
index 076539d16f17d64a9a28052960b61a5b99a7c9c6..c641fe60853a4b131cb6035c48e3d9f6ef9ddadf 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
@@ -83,7 +83,7 @@ class SlurmClusterResolverTest(test.TestCase):
         auto_set_gpu=False)
 
     slurm_cluster_resolver.task_type = 'worker'
-    slurm_cluster_resolver.task_index = 1
+    slurm_cluster_resolver.task_id = 1
     self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
 
     slurm_cluster_resolver.rpc_layer = 'ab'
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
index 8d530cc15a035afcf2d3356599ed06e0b9d9a4cd..c9b6191a1c0e3d3b06ad20c537ae4e5229a880b2 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -24,6 +24,7 @@ import os
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
 
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _SESSION_MASTER_KEY = 'session_master'
@@ -47,12 +48,13 @@ def _get_value_in_tfconfig(key, default=None):
   return tf_config[key] if key in tf_config else default
 
 
+@tf_export('distribute.cluster_resolver.TFConfigClusterResolver')
 class TFConfigClusterResolver(ClusterResolver):
   """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
 
   def __init__(self,
                task_type=None,
-               task_index=None,
+               task_id=None,
                rpc_layer=None,
                environment=None):
     """Creates a new TFConfigClusterResolver.
@@ -60,14 +62,14 @@ class TFConfigClusterResolver(ClusterResolver):
     Args:
       task_type: (String, optional) Overrides the task type specified in the
         TF_CONFIG environment variable.
-      task_index: (Integer, optional) Overrides the task index specified in the
+      task_id: (Integer, optional) Overrides the task index specified in the
         TF_CONFIG environment variable.
       rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
       environment: (String, optional) Overrides the environment TensorFlow
         operates in.
     """
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
     self._environment = environment
 
@@ -75,25 +77,25 @@ class TFConfigClusterResolver(ClusterResolver):
   def task_type(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['type'] if 'type' in task_info else None
+      return str(task_info['type']) if 'type' in task_info else None
     else:
-      return self._task_type
+      return str(self._task_type)
 
   @property
-  def task_index(self):
+  def task_id(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['index'] if 'index' in task_info else None
+      return int(task_info['index']) if 'index' in task_info else None
     else:
-      return self._task_index
+      return int(self._task_id)
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -110,6 +112,15 @@ class TFConfigClusterResolver(ClusterResolver):
   def rpc_layer(self, rpc_layer):
     self._rpc_layer = rpc_layer
 
+  def num_accelerators(self,
+                       task_type=None,
+                       task_id=None,
+                       config_proto=None):
+    task_type = self.task_type if task_type is None else task_type
+    task_id = self.task_id if task_id is None else task_id
+    return super(TFConfigClusterResolver, self).num_accelerators(
+        task_type, task_id, config_proto)
+
   def cluster_spec(self):
     """Returns a ClusterSpec based on the TF_CONFIG environment variable.
 
@@ -121,13 +132,13 @@ class TFConfigClusterResolver(ClusterResolver):
       return ClusterSpec({})
     return ClusterSpec(tf_config['cluster'])
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a TensorFlow session.
 
     Args:
       task_type: (String, optional) Overrides and sets the task_type of the
         master.
-      task_index: (Integer, optional) Overrides and sets the task id of the
+      task_id: (Integer, optional) Overrides and sets the task id of the
         master.
       rpc_layer: (String, optional) Overrides and sets the protocol over which
         TensorFlow nodes communicate with each other.
@@ -155,7 +166,7 @@ class TFConfigClusterResolver(ClusterResolver):
     # We try to auto-detect the task type and id, but uses the user-supplied one
     # where available
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    return format_master_url(cluster_spec.task_address(task_type, task_index),
+    return format_master_url(cluster_spec.task_address(task_type, task_id),
                              self.rpc_layer)
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index 36b3bb9c1e1a32960525f8cff7f852e204c72211..b68d8bcd0ef1768f0c24012bb2bc773c8ce4fad4 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -1,13 +1,13 @@
 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the 'License');
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an 'AS IS' BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
@@ -20,11 +20,17 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.python import eager
+from tensorflow.python.client import session
 from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
+mock = test.mock
 
+
+@test_util.run_all_in_graph_and_eager_modes
 class TFConfigClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
@@ -150,7 +156,7 @@ class TFConfigClusterResolverTest(test.TestCase):
 
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(0, cluster_resolver.task_id)
     self.assertEqual('grpc', cluster_resolver.rpc_layer)
 
   def testParameterOverrides(self):
@@ -168,21 +174,55 @@ class TFConfigClusterResolverTest(test.TestCase):
     }
     """
 
-    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0)
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_id=0)
 
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(0, cluster_resolver.task_id)
 
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 1
+    cluster_resolver.task_id = 1
     cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual('test://worker1:2222', cluster_resolver.master())
     self.assertEqual('worker', cluster_resolver.task_type)
-    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual(1, cluster_resolver.task_id)
     self.assertEqual('test', cluster_resolver.rpc_layer)
 
+  def testTaskTypeCastToString(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "123456": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": 123456,
+        "index": 0
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('123456', cluster_resolver.task_type)
+
+  def testTaskIndexCastToInteger(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": "1"
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual(1, cluster_resolver.task_id)
+
   def testZeroItemsInClusterSpecMasterRead(self):
     os.environ['TF_CONFIG'] = """
     {}
@@ -203,6 +243,50 @@ class TFConfigClusterResolverTest(test.TestCase):
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('', cluster_resolver.master())
 
+  @mock.patch.object(eager.context, 'list_devices')
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsFilterTasksByEnvVar(self, mock_list_devices,
+                                             mock_eager_list_devices):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "worker1": ["w10:2222"],
+        "worker2": ["w21:2222", "w22:2222", "w23:2222", "w24:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "worker1",
+        "index": "0"
+      }
+    }
+    """
+
+    device_names = [
+        '/job:worker1/task:0/device:TPU:0',
+        '/job:worker1/task:0/device:TPU:1',
+        '/job:worker1/task:0/device:GPU:0',
+        '/job:worker1/task:0/device:GPU:1',
+        '/job:worker2/task:1/device:TPU:2',
+        '/job:worker2/task:2/device:TPU:3',
+        '/job:worker2/task:3/device:GPU:2',
+        '/job:worker2/task:4/device:GPU:3',
+    ]
+    device_list = [
+        session._DeviceAttributes(name, name[27:30], 1024, 0)
+        for name in device_names
+    ]
+    mock_eager_list_devices.return_value = device_names
+    mock_list_devices.return_value = device_list
+
+    resolver = TFConfigClusterResolver()
+
+    # By default we read from TF_CONFIG
+    self.assertEqual(resolver.num_accelerators(), {'TPU': 2, 'GPU': 2})
+
+    # Override still works when we want it to
+    self.assertEqual(resolver.num_accelerators(task_type='worker2', task_id=3),
+                     {'GPU': 1})
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index 635948fd1994dd896dae1293617f591effcf5d13..9a36dc77ae7d424cbb8e3e610ab6dcb976c59dfd 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 _GOOGLE_API_CLIENT_INSTALLED = True
 try:
@@ -42,7 +43,6 @@ try:
 except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
-
 _GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
 _ENDPOINTS_SEPARATOR = ','
 _DEFAULT_ENV_VARIABLE = 'TPU_NAME'
@@ -56,38 +56,7 @@ DeviceDetails = collections.namedtuple(
     'DeviceDetails', ['device_map', 'total_cores'])
 
 
-def _get_device_dict_and_cores(devices):
-  """Returns a dict of hosts to cores and total cores given devices names.
-
-  Returns a namedtuple with two attributes:
-    device_map: A map of host_ids to a list of core_ids.
-    total_cores: The total number of cores within the TPU system.
-
-  Args:
-    devices: A list of devices returned by session.list_devices()
-  """
-  device_map = collections.defaultdict(list)
-  num_cores = 0
-  for device in devices:
-    match = _TPU_DEVICE_REGEX.match(device.name)
-    if match:
-      host_id = match.group('host_id')
-      core_id = match.group('core_id')
-      device_map[host_id].append(core_id)
-      num_cores += 1
-  return DeviceDetails(device_map, num_cores)
-
-
-def _verify_and_return_same_core_count(device_dict):
-  """Verifies that every device in device_dict has the same number of cores."""
-  num_cores_per_host_set = (
-      {len(core_ids) for core_ids in device_dict.values()})
-  if len(num_cores_per_host_set) != 1:
-    raise RuntimeError('TPU cores on each device is not the same. This '
-                       'should never happen. Devices: {}'.format(device_dict))
-  return num_cores_per_host_set.pop()
-
-
+@tf_export('distribute.cluster_resolver.TPUClusterResolver')
 class TPUClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Cloud TPUs.
 
@@ -143,6 +112,38 @@ class TPUClusterResolver(ClusterResolver):
       return False
     return True
 
+  @staticmethod
+  def _get_device_dict_and_cores(devices):
+    """Returns a dict of hosts to cores and total cores given devices names.
+
+    Returns a namedtuple with two attributes:
+      device_map: A map of host_ids to a list of core_ids.
+      total_cores: The total number of cores within the TPU system.
+
+    Args:
+      devices: A list of devices returned by session.list_devices()
+    """
+    device_map = collections.defaultdict(list)
+    num_cores = 0
+    for device in devices:
+      match = _TPU_DEVICE_REGEX.match(device.name)
+      if match:
+        host_id = match.group('host_id')
+        core_id = match.group('core_id')
+        device_map[host_id].append(core_id)
+        num_cores += 1
+    return DeviceDetails(device_map, num_cores)
+
+  @staticmethod
+  def _verify_and_return_same_core_count(device_dict):
+    """Verifies that every device in device_dict has the same # of cores."""
+    num_cores_per_host_set = (
+        {len(core_ids) for core_ids in device_dict.values()})
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError('TPU cores on each device is not the same. This '
+                         'should never happen. Devices: {}'.format(device_dict))
+    return num_cores_per_host_set.pop()
+
   @staticmethod
   def _inGke():
     """When running in GKE, the environment variable will be set."""
@@ -254,10 +255,10 @@ class TPUClusterResolver(ClusterResolver):
       raise RuntimeError('You need to specify a TPU Name if you are running in '
                          'the Google Cloud environment.')
 
-    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # By default the task_type is 'worker` and the task_id is 0 (which is the
     # first worker in the task).
     self.task_type = job_name
-    self.task_index = 0
+    self.task_id = 0
 
     if tpu.startswith('grpc://'):
       # Cloud environment, where we are using GRPC to communicate to TPUs.
@@ -285,7 +286,7 @@ class TPUClusterResolver(ClusterResolver):
     # in later in self.master().
     if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
       tpu = tpu[len(self.rpc_layer + '://'):]
-      self._tpu = tpu
+      self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
       self._should_resolve_override = False
 
     # Whether we should actually attempt to contact Cloud APIs
@@ -327,7 +328,7 @@ class TPUClusterResolver(ClusterResolver):
     else:
       self._coordinator_address = coordinator_address
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Get the Master string to be used for the session.
 
     In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
@@ -341,7 +342,7 @@ class TPUClusterResolver(ClusterResolver):
     Args:
       task_type: (Optional, string) The type of the TensorFlow task of the
         master.
-      task_index: (Optional, integer) The index of the TensorFlow task of the
+      task_id: (Optional, integer) The index of the TensorFlow task of the
         master.
       rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
         communicate with TPUs.
@@ -355,12 +356,12 @@ class TPUClusterResolver(ClusterResolver):
     if self._shouldResolve():
       # We are going to communicate with the Cloud TPU APIs to get a Cluster.
       cluster_spec = self.cluster_spec()
-      if task_type is not None and task_index is not None:
-        # task_type and task_index is from the function parameter
-        master = cluster_spec.task_address(task_type, task_index)
-      elif self.task_type is not None and self.task_index is not None:
-        # task_type and task_index is from the object
-        master = cluster_spec.task_address(self.task_type, self.task_index)
+      if task_type is not None and task_id is not None:
+        # task_type and task_id is from the function parameter
+        master = cluster_spec.task_address(task_type, task_id)
+      elif self.task_type is not None and self.task_id is not None:
+        # task_type and task_id is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_id)
       else:
         # by default we take the first item in the cluster with the right name
         job_tasks = cluster_spec.job_tasks(self.task_type)
@@ -369,7 +370,7 @@ class TPUClusterResolver(ClusterResolver):
         master = job_tasks[0]
     else:
       if isinstance(self._tpu, (bytes, bytearray)):
-        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+        master = compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR)[0]
       else:
         master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
     return format_master_url(master, rpc_layer or self.rpc_layer)
@@ -379,7 +380,7 @@ class TPUClusterResolver(ClusterResolver):
 
   def get_job_name(self):
     if (self._shouldResolve() or
-        self._tpu.startswith(compat.as_bytes('grpc://'))):
+        self._isRunningInGCE()):
       return self.task_type
 
   def cluster_spec(self):
@@ -418,10 +419,6 @@ class TPUClusterResolver(ClusterResolver):
         raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
                            (compat.as_text(self._tpu), response['state']))
 
-      if 'health' in response and response['health'] != 'HEALTHY':
-        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
-                           (compat.as_text(self._tpu), response['health']))
-
       if 'networkEndpoints' in response:
         worker_list = [
             '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
@@ -439,7 +436,7 @@ class TPUClusterResolver(ClusterResolver):
         return None
       # Case 2.
       tpus = []
-      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+      for tpu in compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR):
         # We are working around the fact that GKE environment variable that is
         # supplied to us has the protocol string embedded in it, but we want
         # to strip it out for the ClusterSpec.
@@ -458,8 +455,7 @@ class TPUClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='TPU',
+                       task_id=None,
                        config_proto=None):
     """Returns the number of TPU cores per worker.
 
@@ -469,8 +465,7 @@ class TPUClusterResolver(ClusterResolver):
 
     Args:
       task_type: Unused.
-      task_index: Unused.
-      accelerator_type: Unused.
+      task_id: Unused.
       config_proto: Used to create a connection to a TPU master in order to
         retrieve the system metadata.
 
@@ -482,7 +477,7 @@ class TPUClusterResolver(ClusterResolver):
     # TODO(b/120564445): Replace with standard library for retries.
     while True:
       try:
-        device_details = _get_device_dict_and_cores(
+        device_details = TPUClusterResolver._get_device_dict_and_cores(
             get_accelerator_devices(self.master(), config_proto=config_proto))
         break
       except errors.DeadlineExceededError:
@@ -497,8 +492,9 @@ class TPUClusterResolver(ClusterResolver):
           raise RuntimeError(error_message)
 
     if device_details.total_cores:
-      return _verify_and_return_same_core_count(device_details.device_map)
-    return 0
+      return {'TPU': TPUClusterResolver._verify_and_return_same_core_count(
+          device_details.device_map)}
+    return {'TPU': 0}
 
   @property
   def environment(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
index 58c332a5098d34cca361e0920ce0a22d12cc0ffd..a0027b211c735350f549183604863ed09e130ff8 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -23,10 +23,11 @@ import os
 import six
 from six.moves.urllib.error import URLError
 
+from tensorflow.python import eager
 from tensorflow.python.client import session
-from tensorflow.python.distribute import cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
@@ -89,6 +90,7 @@ def mock_not_running_in_gce_urlopen(cls, *args, **kwargs):
   raise URLError(reason='Host does not exist.')
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TPUClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
@@ -129,26 +131,26 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_isRunningInGCE',
                      mock_is_running_in_gce)
   def testCheckRunningInGceWithNoTpuName(self):
     with self.assertRaisesRegexp(RuntimeError, '.*Google Cloud.*'):
-      cluster_resolver.TPUClusterResolver(tpu='')
+      TPUClusterResolver(tpu='')
 
   @mock.patch.object(six.moves.urllib.request,
                      'urlopen',
                      mock_running_in_gce_urlopen)
   def testIsRunningInGce(self):
-    self.assertTrue(cluster_resolver.TPUClusterResolver._isRunningInGCE())
+    self.assertTrue(TPUClusterResolver._isRunningInGCE())
 
   @mock.patch.object(six.moves.urllib.request,
                      'urlopen',
                      mock_not_running_in_gce_urlopen)
   def testIsNotRunningInGce(self):
-    self.assertFalse(cluster_resolver.TPUClusterResolver._isRunningInGCE())
+    self.assertFalse(TPUClusterResolver._isRunningInGCE())
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadata(self):
@@ -160,7 +162,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -182,7 +184,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
     self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
@@ -194,7 +196,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -209,30 +211,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
     self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
-                     '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testUnhealthyCloudTpu(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'UNHEALTHY'
-        }
-    }
-
-    resolver = cluster_resolver.TPUClusterResolver(
-        project=None,
-        zone=None,
-        tpu='test-tpu-1',
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    with self.assertRaises(RuntimeError):
-      resolver.cluster_spec()
-
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testNotReadyCloudTpu(self):
@@ -244,7 +223,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -264,7 +243,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
@@ -292,7 +271,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -309,7 +288,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
     self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testPodResolution(self):
@@ -338,7 +317,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map),
@@ -387,7 +366,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -412,7 +391,7 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {}
 
     with self.assertRaises(ValueError):
-      cluster_resolver.TPUClusterResolver(
+      TPUClusterResolver(
           project='test-project',
           zone='us-central1-c',
           tpu=[],
@@ -422,7 +401,7 @@ class TPUClusterResolverTest(test.TestCase):
 
   # TODO(saeta): Convert to parameterized test when included in OSS TF.
   def verifyShouldResolve(self, tpu, should_resolve):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=tpu,
@@ -432,7 +411,7 @@ class TPUClusterResolverTest(test.TestCase):
     self.assertEqual(should_resolve, resolver._shouldResolve(),
                      "TPU: '%s'" % tpu)
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_isRunningInGCE',
                      mock_is_not_running_in_gce)
   def testShouldResolveNoName(self):
@@ -457,22 +436,21 @@ class TPUClusterResolverTest(test.TestCase):
     self.verifyShouldResolve('grpctpu', True)
 
   def testNoCallComputeMetadata(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='/bns/foo/bar')
-    self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual('/bns/foo/bar', resolver.master())
     self.assertEqual(None, resolver.cluster_spec())
 
   def testGkeEnvironmentForDonut(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
 
-    resolver = cluster_resolver.TPUClusterResolver()
+    resolver = TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
         compat.as_bytes(resolver.master()))
@@ -494,15 +472,15 @@ class TPUClusterResolverTest(test.TestCase):
                                                      'grpc://10.120.27.8:8470')
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470,'
                         'grpc://10.120.27.6:8470,'
                         'grpc://10.120.27.7:8470,'
                         'grpc://10.120.27.8:8470'),
-        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
 
-    resolver = cluster_resolver.TPUClusterResolver()
+    resolver = TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
         compat.as_bytes(resolver.master()))
@@ -523,17 +501,17 @@ class TPUClusterResolverTest(test.TestCase):
   def testEnvironmentDiscoveryUrl(self):
     os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
     self.assertEqual('https://{api}.internal/{apiVersion}',
-                     (cluster_resolver.TPUClusterResolver.
+                     (TPUClusterResolver.
                       _environmentDiscoveryUrl()))
 
   def testEnvironmentAndRpcDetectionForGoogle(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='/bns/ab/cd/ef')
     self.assertEqual(resolver.environment, 'google')
     self.assertEqual(resolver.rpc_layer, None)
 
   def testEnvironmentAndRpcDetectionForGrpcString(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='grpc://10.1.2.3:8470')
     self.assertEqual(resolver.environment, '')
     self.assertEqual(resolver.rpc_layer, 'grpc')
@@ -565,7 +543,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -576,12 +554,12 @@ class TPUClusterResolverTest(test.TestCase):
     self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
     resolver.task_type = 'worker'
-    resolver.task_index = 3
+    resolver.task_id = 3
     self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
 
     self.assertEqual(
         resolver.master(
-            task_type='worker', task_index=2, rpc_layer='test'),
+            task_type='worker', task_id=2, rpc_layer='test'),
         'test://10.2.3.6:8470')
 
   def testGetDeviceDictAndCoresWithTPUs(self):
@@ -600,7 +578,7 @@ class TPUClusterResolverTest(test.TestCase):
             name, 'TPU', 1024, 0) for name in device_names
     ]
 
-    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
+    device_details = TPUClusterResolver._get_device_dict_and_cores(
         device_list)
     self.assertEqual(device_details.total_cores, 8)
     self.assertEqual(device_details.device_map,
@@ -625,27 +603,29 @@ class TPUClusterResolverTest(test.TestCase):
             name, 'XLA', 1024, 0) for name in device_names
     ]
 
-    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
+    device_dict, num_cores = TPUClusterResolver._get_device_dict_and_cores(
         device_list)
     self.assertEqual(num_cores, 0)
     self.assertEqual(device_dict, {})
 
   def testVerifySameCoreCount(self):
     self.assertEqual(
-        tpu_cluster_resolver._verify_and_return_same_core_count(
+        TPUClusterResolver._verify_and_return_same_core_count(
             {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
     self.assertEqual(
-        tpu_cluster_resolver._verify_and_return_same_core_count(
+        TPUClusterResolver._verify_and_return_same_core_count(
             {0: [0, 1], 1: [2, 3]}), 2)
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver._verify_and_return_same_core_count(
+      TPUClusterResolver._verify_and_return_same_core_count(
           {0: [0], 1: [1, 2]})
 
+  @mock.patch.object(eager.context, 'list_devices')
   @mock.patch.object(session.BaseSession, 'list_devices')
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_isRunningInGCE',
                      mock_is_not_running_in_gce)
-  def testNumAcceleratorsSuccess(self, mock_list_devices):
+  def testNumAcceleratorsSuccess(self, mock_list_devices,
+                                 mock_eager_list_devices):
     device_names = [
         '/job:tpu_worker/task:0/device:TPU:0',
         '/job:tpu_worker/task:1/device:TPU:1',
@@ -660,19 +640,24 @@ class TPUClusterResolverTest(test.TestCase):
         session._DeviceAttributes(
             name, 'TPU', 1024, 0) for name in device_names
     ]
+    mock_eager_list_devices.return_value = device_names
     mock_list_devices.return_value = device_list
 
-    resolver = cluster_resolver.TPUClusterResolver(tpu='')
-    self.assertEqual(resolver.num_accelerators(), 2)
+    resolver = TPUClusterResolver(tpu='')
+    self.assertEqual(resolver.num_accelerators(), {'TPU': 2})
 
+  @mock.patch.object(eager.context, 'list_devices')
   @mock.patch.object(session.BaseSession, 'list_devices')
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_isRunningInGCE',
                      mock_is_not_running_in_gce)
-  def testNumAcceleratorsRetryFailure(self, mock_list_devices):
-    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+  def testNumAcceleratorsRetryFailure(self, mock_list_devices,
+                                      mock_eager_list_devices):
+    resolver = TPUClusterResolver(tpu='')
     mock_list_devices.side_effect = errors.DeadlineExceededError(
         None, None, 'timeout')
+    mock_eager_list_devices.side_effect = errors.DeadlineExceededError(
+        None, None, 'timeout')
     with self.assertRaises(RuntimeError):
       resolver.num_accelerators()
 
diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..26998faa96170e8a28d16a2790884fe42108d42a
--- /dev/null
+++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py
@@ -0,0 +1,483 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class CollectiveAllReduceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import tensorflow_server_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import collective_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(yuefengz): support in-graph replication.
+@tf_export("distribute.experimental.MultiWorkerMirroredStrategy")
+class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
+  """Distribution strategy that uses collective ops for all-reduce.
+
+  It is similar to MirroredStrategy but it uses collective ops for reduction.
+
+  By default it uses all local GPUs or CPU for single-worker training.
+
+  When 'TF_CONFIG' environment variable is given, it parses cluster_spec,
+  task_type and task_id from 'TF_CONFIG' and turns into a multi-worker strategy
+  which mirrores models on GPUs of all machines in a cluster. In the current
+  implementation, it uses all GPUs in a cluster and it assumes all workers have
+  the same number of GPUs.
+
+  It supports both eager mode and graph mode. However, for eager mode, it has to
+  set up the eager context in its constructor and therefore all ops in eager
+  mode have to run after the strategy object is created.
+
+  Args:
+    communication: optional Enum of type
+      `distribute.experimental.CollectiveCommunication`.  This provides a way
+      for the user to override the choice of collective op communication.
+      Possible values include `AUTO`, `RING`, and `NCCL`.
+  """
+
+  def __init__(
+      self,
+      communication=cross_device_ops_lib.CollectiveCommunication.AUTO):
+    """Initializes the object."""
+    super(CollectiveAllReduceStrategy, self).__init__(
+        CollectiveAllReduceExtended(
+            self,
+            communication=communication))
+
+
+class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+  """Implementation of CollectiveAllReduceStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               communication,
+               cluster_resolver=TFConfigClusterResolver()):
+    distribute_lib.DistributionStrategyExtended.__init__(
+        self, container_strategy)
+    assert isinstance(
+        communication,
+        cross_device_ops_lib.CollectiveCommunication)
+    self._communication = communication
+    self._initialize_strategy(cluster_resolver)
+    assert isinstance(self._get_cross_device_ops(),
+                      cross_device_ops_lib.CollectiveAllReduce)
+
+  def _initialize_strategy(self, cluster_resolver):
+    if cluster_resolver.cluster_spec().as_dict():
+      self._initialize_multi_worker(cluster_resolver)
+    else:
+      self._initialize_local(cluster_resolver)
+
+  def _initialize_local(self, cluster_resolver):
+    """Initializes the object for local training."""
+    self._is_chief = True
+    self._num_workers = 1
+
+    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
+    # some cases.
+    if isinstance(cluster_resolver, TFConfigClusterResolver):
+      num_gpus = context.num_gpus()
+    else:
+      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
+
+    if num_gpus:
+      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
+    else:
+      local_devices = ("/device:CPU:0",)
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
+
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
+    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus,
+        collective_keys=self._collective_keys)
+
+    self._cluster_spec = None
+    self._task_type = None
+    self._task_id = None
+
+    # This is a mark to tell whether we are running with standalone client or
+    # independent worker. Right now with standalone client, strategy object is
+    # created as local strategy and then turn into multi-worker strategy via
+    # configure call.
+    self._local_or_standalone_client_mode = True
+
+    # Save the num_gpus_per_worker and rpc_layer for configure method.
+    self._num_gpus_per_worker = num_gpus
+    self._rpc_layer = cluster_resolver.rpc_layer
+
+    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
+                 local_devices)
+
+  def _initialize_multi_worker(self, cluster_resolver):
+    """Initializes the object for multi-worker training."""
+    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
+    # assumes all workers have the same number of GPUs. We should remove this
+    # assumption by querying all tasks for their numbers of GPUs.
+    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
+    # some cases.
+    if isinstance(cluster_resolver, TFConfigClusterResolver):
+      num_gpus = context.num_gpus()
+    else:
+      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
+
+    cluster_spec = multi_worker_util.normalize_cluster_spec(
+        cluster_resolver.cluster_spec())
+    task_type = cluster_resolver.task_type
+    task_id = cluster_resolver.task_id
+    if task_type is None or task_id is None:
+      raise ValueError("When `cluster_spec` is given, you must also specify "
+                       "`task_type` and `task_id` in the `cluster_resolver`.")
+    if task_type not in ("chief", "worker"):
+      raise ValueError(
+          "Unrecognized task_type: %r, valid task types are: \"chief\", "
+          "\"worker\"." % task_type)
+
+    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
+    if not self._num_workers:
+      raise ValueError("No `worker` or `chief` tasks can be found in "
+                       "`cluster_spec`.")
+
+    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                task_id)
+
+    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
+    if num_gpus:
+      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
+                            for i in range(num_gpus))
+    else:
+      local_devices = (self._worker_device,)
+
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(self._worker_device, self.worker_devices)])
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus,
+        collective_keys=self._collective_keys)
+
+    # Add a default device so that ops without specified devices will not end up
+    # on other workers.
+    self._default_device = "/job:%s/task:%d" % (task_type, task_id)
+
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+
+    # Save the num_gpus_per_worker and rpc_layer for configure method.
+    self._num_gpus_per_worker = num_gpus
+    self._rpc_layer = cluster_resolver.rpc_layer
+
+    logging.info(
+        "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
+        "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
+        "communication = %s", cluster_spec.as_dict(), task_type,
+        task_id, self._num_workers, local_devices,
+        self._communication)
+
+    if (context.executing_eagerly() and
+        not getattr(self, "_std_server_started", False) and
+        not getattr(self, "_local_or_standalone_client_mode", False)):
+      # Checking _local_or_standalone_client_mode as well because we should not
+      # create the std server in standalone client mode.
+      config_proto = config_pb2.ConfigProto()
+      config_proto = self._update_config_proto(config_proto)
+      server_def = tensorflow_server_pb2.ServerDef(
+          cluster=cluster_spec.as_cluster_def(),
+          default_session_config=config_proto,
+          job_name=task_type,
+          task_index=task_id,
+          protocol=cluster_resolver.rpc_layer or "grpc")
+      context.context().enable_collective_ops(server_def)
+      self._std_server_started = True
+      logging.info(
+          "Enabled multi-worker collective ops with available devices: %r",
+          context.context().devices())
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      device_map = self._device_map
+      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      device_map = colocate_with.device_map
+      logical_device = colocate_with.logical_device
+
+    def _real_mirrored_creator(devices, *args, **kwargs):
+      """Creates one MirroredVariable on the current worker."""
+      unique_var_name = ops.get_default_graph().unique_name(
+          kwargs["name"], mark_as_used=False).rstrip("/")
+      # pylint: disable=protected-access
+      collective_instance_key = self._collective_keys.get_instance_key(
+          key_id=unique_var_name)
+      # Only the first device participles in the broadcast of initial values.
+      group_key = self._collective_keys.get_group_key([devices[0]])
+      group_size = self._num_workers
+      if "initial_value" not in kwargs:
+        raise ValueError("Initial value must be specified.")
+      initial_value = kwargs["initial_value"]
+      if callable(initial_value):
+        initial_value_fn = initial_value
+      else:
+        initial_value_fn = lambda: initial_value
+
+      value_list = []
+      for i, d in enumerate(devices):
+        with ops.init_scope(), ops.device(d):
+          if i == 0:
+            # The initial value fn makes sure variables all initialized to
+            # same values. The first device of the chief worker will send their
+            # variable values to other workers.
+            def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
+              with ops.device(device):
+                initial_value = initial_value_fn()
+                assert not callable(initial_value)
+                initial_value = ops.convert_to_tensor(initial_value)
+
+                assert index == 0, index
+                if self._num_workers > 1:
+                  if self._is_chief:
+                    bcast_send = collective_ops.broadcast_send(
+                        initial_value, initial_value.shape, initial_value.dtype,
+                        group_size, group_key, collective_instance_key)
+                    with ops.control_dependencies([bcast_send]):
+                      return array_ops.identity(initial_value)
+                  else:
+                    return collective_ops.broadcast_recv(
+                        initial_value.shape, initial_value.dtype, group_size,
+                        group_key, collective_instance_key)
+                return initial_value
+          else:
+            # Give replicas meaningful distinct names:
+            var0name = value_list[0].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+
+            # Variables on non-first replica get initial values from the
+            # variables created on the first device of each worker.
+            def _overridden_initial_value_fn(device=d, index=i):
+              assert index > 0
+              with ops.device(device):
+                if context.executing_eagerly():
+                  return array_ops.identity(value_list[0].value())
+                else:
+                  return array_ops.identity(value_list[0].initial_value)
+
+          kwargs["initial_value"] = _overridden_initial_value_fn
+          with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
+
+          if i == 0:
+            actual_var_name = v.name.split(":")[0]
+            assert unique_var_name == actual_var_name, "%r vs %r" % (
+                unique_var_name, actual_var_name)
+          assert not isinstance(v, values.DistributedVariable)
+          value_list.append(v)
+      return value_list
+
+    # pylint: disable=protected-access
+    return mirrored_strategy._create_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
+
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec is None:
+      input_pipeline_id = 0
+    else:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=self._num_workers,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, [input_context])
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the object.
+
+    Args:
+      session_config: a `tf.ConfigProto`
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type, such as "worker".
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `task_type` is not in the `cluster_spec`.
+    """
+    if cluster_spec:
+      # Use the num_gpus_per_worker recorded in constructor since _configure
+      # doesn't take num_gpus.
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators={"GPU": self._num_gpus_per_worker},
+          rpc_layer=self._rpc_layer)
+      self._initialize_multi_worker(cluster_resolver)
+      assert isinstance(self._get_cross_device_ops(),
+                        cross_device_ops_lib.CollectiveAllReduce)
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    # Enable the scoped allocator optimization for CollectiveOps.  This
+    # optimization converts many small all-reduces into fewer larger
+    # all-reduces.
+    rewrite_options = updated_config.graph_options.rewrite_options
+    rewrite_options.scoped_allocator_optimization = (
+        rewriter_config_pb2.RewriterConfig.ON)
+    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
+    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
+    # clear and then append.
+    del rewrite_options.scoped_allocator_opts.enable_op[:]
+    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
+
+    if ((self._communication ==
+         cross_device_ops_lib.CollectiveCommunication.NCCL) and
+        self._num_gpus_per_worker > 0):
+      updated_config.experimental.collective_nccl = True
+
+    if not self._cluster_spec:
+      return updated_config
+
+    assert self._task_type
+    assert self._task_id is not None
+
+    # Collective group leader is needed for collective ops to coordinate
+    # workers.
+    if "chief" in self._cluster_spec.jobs:
+      updated_config.experimental.collective_group_leader = (
+          "/job:chief/replica:0/task:0")
+    else:
+      if "worker" not in self._cluster_spec.jobs:
+        raise ValueError(
+            "You must have `chief` or `worker` jobs in the `cluster_spec`.")
+      updated_config.experimental.collective_group_leader = (
+          "/job:worker/replica:0/task:0")
+
+    # The device filters prevent communication between workers.
+    del updated_config.device_filters[:]
+    updated_config.device_filters.append(
+        "/job:%s/task:%d" % (self._task_type, self._task_id))
+
+    return updated_config
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
+    assert not isinstance(value, values.Mirrored)
+
+    if (isinstance(value, values.DistributedValues) and
+        len(self.worker_devices) == 1):
+      value = value.values[0]
+
+    # When there are multiple workers, we need to reduce across workers using
+    # collective ops.
+    if (not isinstance(value, values.DistributedValues) and
+        self._num_workers == 1):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
+  @property
+  def experimental_between_graph(self):
+    return True
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self.worker_devices) * self._num_workers
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 9729302c6dc1e22772c1a80a25eff17720c50994..ef124baf378e890a21441574a7627f3e03d13446 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import enum
 import six
 
 from tensorflow.python.client import device_lib
@@ -32,6 +33,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 def check_destinations(destinations):
@@ -114,6 +117,9 @@ def _make_tensor_into_per_replica(input_tensor):
 def _normalize_value_destination_pairs(value_destination_pairs):
   """Converts each tensor into a PerReplica object in the input list."""
   result = []
+
+  value_destination_pairs = list(value_destination_pairs)
+
   if not isinstance(value_destination_pairs, (list, tuple)):
     raise ValueError("`value_destination_pairs` should be a list or tuple")
   for pair in value_destination_pairs:
@@ -204,7 +210,7 @@ def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
   count = len(all_values)
 
   with ops.device(reduce_to_device):
-    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+    with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
       reduced = cross_device_utils.aggregate_tensors_or_indexed_slices(
           all_values, accumulation_fn)
       if reduce_op == reduce_util.ReduceOp.MEAN:
@@ -215,6 +221,7 @@ def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
   return reduced
 
 
+@tf_export("distribute.CrossDeviceOps")
 class CrossDeviceOps(object):
   """Base class for cross-device reduction and broadcasting algorithms."""
 
@@ -244,7 +251,8 @@ class CrossDeviceOps(object):
       per_replica_value = _make_tensor_into_per_replica(per_replica_value)
 
     validate_destinations(destinations)
-    return self._reduce(reduce_op, per_replica_value, destinations)
+    return self.reduce_implementation(reduce_op, per_replica_value,
+                                      destinations)
 
   def batch_reduce(self, reduce_op, value_destination_pairs):
     """Reduce PerReplica objects in a batch.
@@ -265,6 +273,8 @@ class CrossDeviceOps(object):
       ValueError: if `value_destination_pairs` is not a list or a tuple of
         tuples of PerReplica objects and destinations
     """
+    # TODO(yuefengz): if destinations are different, split into several
+    # `_batch_reduce` invocations.
     if not _validate_value_destination_pairs(value_destination_pairs):
       # If the first element of each pair is a tensor, we try to turn it into a
       # PerReplica object.
@@ -274,7 +284,7 @@ class CrossDeviceOps(object):
     for _, d in value_destination_pairs:
       validate_destinations(d)
 
-    return self._batch_reduce(reduce_op, value_destination_pairs)
+    return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
 
   def broadcast(self, tensor, destinations):
     """Broadcast the `tensor` to destinations.
@@ -287,41 +297,93 @@ class CrossDeviceOps(object):
       a Mirrored object.
     """
     validate_destinations(destinations)
-    return self._broadcast(tensor, destinations)
+    return self.broadcast_implementation(tensor, destinations)
+
+  @doc_controls.for_subclass_implementers
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    """The implementation of reduce of `per_replica_value` to `destinations`.
+
+    It runs the reduction operation defined by `reduce_op` and put the
+    result on `destinations`.
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      per_replica_value: a PerReplica object or a tensor with device set.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_replica_value can't be converted to a PerReplica
+        object.
+    """
     raise NotImplementedError(
         "_reduce method must be implemented in descendants.")
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  @doc_controls.for_subclass_implementers
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
+    """Implementation of reduce PerReplica objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      value_destination_pairs: a list or a tuple of tuples of PerReplica objects
+        (or tensors with device set if there is one device) and destinations.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerReplica objects and destinations
+    """
     raise NotImplementedError(
         "_batch_reduce method must be implemented in descendants.")
 
-  def _broadcast(self, tensor, destinations):
+  @doc_controls.for_subclass_implementers
+  def broadcast_implementation(self, tensor, destinations):
+    """Implementation of broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
     return simple_broadcast(tensor, destinations, always_mirrored=True)
 
 
-class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
+@tf_export("distribute.ReductionToOneDevice")
+class ReductionToOneDevice(CrossDeviceOps):
   """Always do reduction to one device first and then do broadcasting.
 
     Batch reduction is done by reduction on each element one by one.
   """
 
-  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+  def __init__(self, reduce_to_device=None, accumulation_fn=None):
     """Constructor.
 
     Args:
       reduce_to_device: the intermediate device to reduce to. If None, reduce
         to the first device in `destinations` of the reduce() method.
-      accumulation_fn: a function that does accumulation.
+      accumulation_fn: a function that does accumulation.  If None, then
+        `tf.math.add_n` is used.
     """
     self.reduce_to_device = reduce_to_device
-    self.accumulation_fn = accumulation_fn
-    super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
+    self.accumulation_fn = accumulation_fn or math_ops.add_n
+    super(ReductionToOneDevice, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
-    assert check_destinations(destinations)
-    devices = get_devices_from(destinations)
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    if check_destinations(destinations):
+      devices = get_devices_from(destinations)
+    else:
+      devices = get_devices_from(per_replica_value)
     reduce_to_device = self.reduce_to_device or devices[0]
     logging.log_first_n(
         logging.INFO,
@@ -330,9 +392,9 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
                              self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, destinations)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     return [
-        self._reduce(reduce_op, t, destinations=v)
+        self.reduce_implementation(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
@@ -399,11 +461,11 @@ def _ungroup_and_make_mirrored(grouped_reduced,
   return [value_lib.Mirrored(device_map, v, logical_device) for v in index]
 
 
-class ConcatAndSplitPacker(object):
+class _ConcatAndSplitPacker(object):
   """Concatenate and split tensors for reduction."""
 
   def __init__(self, num_packs=1):
-    """Initialize the ConcatAndSplitPacker object.
+    """Initialize the _ConcatAndSplitPacker object.
 
     Args:
       num_packs: specifies the number of split packs that will be
@@ -503,13 +565,13 @@ class ConcatAndSplitPacker(object):
     return aggregated_device_grads
 
 
-class AggregateSmallTensorPacker(object):
+class _AggregateSmallTensorPacker(object):
   """Concatenate small gradient tensors together for reduction."""
 
   def __init__(self,
                agg_small_grads_max_bytes=1048576,
                agg_small_grads_max_group=16):
-    """Initialize the AggregateSmallTensorPacker object.
+    """Initialize the _AggregateSmallTensorPacker object.
 
     Args:
       agg_small_grads_max_bytes: largest tensor eligible for aggregation,
@@ -549,11 +611,11 @@ def _pack_tensors(device_grads,
                   agg_small_grads_max_group=0):
   """Pack tensors if specified."""
   if num_packs > 0:
-    tensor_packer = ConcatAndSplitPacker(num_packs)
+    tensor_packer = _ConcatAndSplitPacker(num_packs)
     device_grad_packs = tensor_packer.pack(device_grads)
   elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
-    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
-                                               agg_small_grads_max_group)
+    tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes,
+                                                agg_small_grads_max_group)
     device_grad_packs = tensor_packer.pack(device_grads)
   else:
     tensor_packer = None
@@ -569,7 +631,7 @@ def _unpack_tensors(reduced, tensor_packer=None):
 
 
 class AllReduceCrossDeviceOps(CrossDeviceOps):
-  """Reduction using all reduce."""
+  """Reduction using all-reduce."""
 
   def __init__(self,
                all_reduce_alg="nccl",
@@ -594,37 +656,22 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
       num_packs: see above.
       agg_small_grads_max_bytes: see above.
       agg_small_grads_max_group: see above.
-        tensors.
     """
     self._all_reduce_alg = all_reduce_alg
     self._num_packs = num_packs
     self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
     self._agg_small_grads_max_group = agg_small_grads_max_group
+    self._simple_cross_replica_ops = ReductionToOneDevice()
     super(AllReduceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
-    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
-        per_replica_value)
-    if (_devices_match(per_replica_value, destinations)
-        and not context.executing_eagerly()
-        and not contains_indexed_slices):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    if _devices_match(per_replica_value, destinations):
       return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     else:
-      if contains_indexed_slices:
-        logging.log_first_n(
-            logging.WARN,
-            "Efficient allreduce is not supported for IndexedSlices.", 10)
-
-      if check_destinations(destinations):
-        devices = get_devices_from(destinations)
-      else:
-        devices = get_devices_from(per_replica_value)
-      reduce_to_device = devices[0]
-      reduced = _simple_reduce(per_replica_value, reduce_to_device,
-                               math_ops.add_n, reduce_op)
-      return self.broadcast(reduced, destinations)
+      return self._simple_cross_replica_ops.reduce(reduce_op, per_replica_value,
+                                                   destinations)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     all_devices_match = _all_devices_match(value_destination_pairs)
     contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         value_destination_pairs)
@@ -640,20 +687,37 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
                             10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
+    dense_values, dense_indices, sparse_values, sparse_indices = (
+        cross_device_utils.split_by_sparsity(per_replica_values))
+    if dense_values:
+      dense_results = self._do_batch_all_reduce(reduce_op, dense_values)
+    else:
+      dense_results = []
+    if sparse_values:
+      sparse_results = self._do_batch_all_reduce_sparse(reduce_op,
+                                                        sparse_values)
+    else:
+      sparse_results = []
+    return cross_device_utils.stitch_values(((dense_results, dense_indices),
+                                             (sparse_results, sparse_indices)))
+
+  def _do_batch_all_reduce(self, reduce_op, dense_values):
+    """Run batch all-reduces."""
     logging.log_first_n(
         logging.INFO, "batch_all_reduce invoked for batches size = %d with "
         "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
         "agg_small_grads_max_group = %d" %
-        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
+        (len(dense_values), self._all_reduce_alg, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-    destinations = per_replica_values[0].devices
-    grouped = _group_value_by_device(per_replica_values)
+
+    destinations = dense_values[0].devices
+    grouped = _group_value_by_device(dense_values)
 
     device_grad_packs, tensor_packer = _pack_tensors(
         grouped, self._num_packs, self._agg_small_grads_max_bytes,
@@ -674,7 +738,18 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
               destinations, device_grad_packs))
 
     reduced = _unpack_tensors(reduced, tensor_packer)
-    return _ungroup_and_make_mirrored(reduced, per_replica_values[0], reduce_op)
+    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
+
+  def _do_batch_all_reduce_sparse(self, reduce_op, sparse_values):
+    """Run batch all-reduce for sparse values."""
+    logging.log_first_n(
+        logging.WARN,
+        "Efficient allreduce is not supported for %d IndexedSlices" %
+        len(sparse_values), 10)
+    # Use `sparse_values` as destinations to do all-reduces. It is effectively
+    # an allgather under the hood but not an efficient one.
+    return self._simple_cross_replica_ops.batch_reduce(
+        reduce_op, zip(sparse_values, sparse_values))
 
 
 # For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
@@ -685,6 +760,49 @@ AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
                                             "alg shards limit")
 
 
+@tf_export("distribute.NcclAllReduce")
+class NcclAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using NCCL all-reduce."""
+
+  def __init__(self, num_packs=1):
+    """NCCL all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    assert num_packs > 0, (
+        "NCLL all-reduce requires num_packs > 0, but {} is specified".format(
+            num_packs))
+    super(NcclAllReduce, self).__init__(
+        all_reduce_alg="nccl", num_packs=num_packs)
+
+
+@tf_export("distribute.HierarchicalCopyAllReduce")
+class HierarchicalCopyAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using hierarchical copy all-reduce.
+
+  This is a good reduction for configurations like Nvidia DGX-1.
+  """
+
+  def __init__(self, num_packs=1):
+    """Hierarchical copy all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    super(HierarchicalCopyAllReduce, self).__init__(
+        all_reduce_alg="hierarchical_copy",
+        num_packs=num_packs)
+
+
 class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
   """All-reduce algorithms for distributed TensorFlow."""
 
@@ -758,7 +876,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO,
         "distributed batch_all_reduce invoked for batches size = %d with "
@@ -769,7 +887,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
 
     device_grads = _group_value_by_device(per_replica_values)
 
-    # The all reduce library requires fully defined shapes.
+    # The all-reduce library requires fully defined shapes.
     # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
     # required as well.
     for device_grad in device_grads:
@@ -807,6 +925,21 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
                                       reduce_op)
 
 
+@tf_export("distribute.experimental.CollectiveCommunication")
+class CollectiveCommunication(enum.Enum):
+  """Communication choices for CollectiveOps.
+
+  * `AUTO`: Default to runtime's automatic choices.
+  * `RING`: TensorFlow's ring algorithms for all-reduce and
+    all-gather.
+  * `NCCL`: Use ncclAllReduce for all-reduce, and ring algorithms for
+    all-gather.  TODO(ayushd): add ncclAllGather implementation.
+  """
+  AUTO = "AUTO"
+  RING = "RING"
+  NCCL = "NCCL"
+
+
 # TODO(yuefengz): support in-graph collective all-reduce.
 class CollectiveAllReduce(CrossDeviceOps):
   """All-reduce cross device ops using collective ops.
@@ -838,7 +971,7 @@ class CollectiveAllReduce(CrossDeviceOps):
     super(CollectiveAllReduce, self).__init__()
 
   # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
     if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
@@ -861,7 +994,7 @@ class CollectiveAllReduce(CrossDeviceOps):
 
     return value_lib.Mirrored(device_map, index, logical_device)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
@@ -877,7 +1010,7 @@ class CollectiveAllReduce(CrossDeviceOps):
             "destinations are different.", 10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
@@ -930,7 +1063,7 @@ _dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
 def _has_dgx1_like_links(gpu_links):
   if not gpu_links:
     return False
-  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # TODO(yuefengz): figure out the right topology for hierarchical copy if
   # number of gpus are less than 8.
   if len(gpu_links) < 8:
     return False
@@ -943,10 +1076,9 @@ def _has_dgx1_like_links(gpu_links):
 
 def _choose_all_reduce_algorithm(device_links):
   if _has_dgx1_like_links(device_links):
-    return AllReduceCrossDeviceOps(
-        "hierarchical_copy", num_packs=len(device_links))
+    return HierarchicalCopyAllReduce(num_packs=len(device_links))
   else:
-    return AllReduceCrossDeviceOps("nccl", num_packs=1)
+    return NcclAllReduce(num_packs=1)
 
 
 def choose_the_best(devices, session_config=None):
@@ -973,12 +1105,12 @@ def choose_the_best(devices, session_config=None):
   if len(using_devices) != len(requested_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   if any(d.device_type.lower() != "gpu" for d in using_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   device_links = [[] for _ in range(len(using_devices))]
   for i, device in enumerate(using_devices):
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index e8066dd467c285c50cb39b98450f5150756d6db9..612a958ebba3c989c6f873a978b889061cdbe1b6 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nccl_ops
 
@@ -350,7 +350,7 @@ def build_collective_reduce(input_tensors,
   """
   group_size = len(input_tensors) * num_workers
   if group_size < 2:
-    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
+    return input_tensors
   devices = [t.device for t in input_tensors]
   num_devices = len(devices)
   group_key = collective_keys.get_group_key(devices)
@@ -645,14 +645,14 @@ def unpack_small_tensors(replica_grads, packing):
 def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
   """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
   if any(isinstance(v, ops.IndexedSlices) for v in values):
-    return gradients_impl._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
+    return gradients_util._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
   else:
     return accumulation_fn(values)
 
 
 def divide_by_n_tensors_or_indexed_slices(value, n):
   if isinstance(value, ops.IndexedSlices):
-    value = gradients_impl._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
+    value = gradients_util._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
     return ops.IndexedSlices(
         value.values / n, value.indices, value.dense_shape)
   else:
@@ -681,3 +681,58 @@ def contains_indexed_slices(value):
     return contains_indexed_slices(value.values)
   else:
     return False
+
+
+def is_indexed_slices(value):
+  if isinstance(value, ops.IndexedSlices):
+    return True
+  assert isinstance(value, value_lib.DistributedValues)
+  return all([isinstance(v, ops.IndexedSlices) for v in value.values])
+
+
+def split_by_sparsity(values):
+  """Split values into dense and sparse values.
+
+  Args:
+    values: a list of tensors or `PerReplica`s.
+
+  Returns:
+    Four lists:
+      a list of dense values, a list of their indices in `values` and
+      a list of sparse values, a list of their indices in `values`.
+  """
+  dense_values = []
+  dense_indices = []
+  sparse_values = []
+  sparse_indices = []
+  for i, v in enumerate(values):
+    if is_indexed_slices(v):
+      sparse_values.append(v)
+      sparse_indices.append(i)
+    else:
+      dense_values.append(v)
+      dense_indices.append(i)
+  return dense_values, dense_indices, sparse_values, sparse_indices
+
+
+def stitch_values(values_and_indices_list):
+  """Stitch values together according to their indices.
+
+  Args:
+    values_and_indices_list: a list of tuples of values and indices indicating
+      the values and postions in the returned list.
+
+  Returns:
+    a stitched list of values.
+  """
+  length = 0
+  for values_and_indices in values_and_indices_list:
+    length += len(values_and_indices[0])
+
+  result = [None] * length
+  for values_and_indices in values_and_indices_list:
+    if values_and_indices and values_and_indices[0]:
+      for v, i in zip(*values_and_indices):
+        assert result[i] is None
+        result[i] = v
+  return result
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index 89bd8537b4c9a9dd9265d2979acdc039b76c6be5..345de0dc50da91ab5af6f0a398bf13a4ee43296a 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -34,6 +34,9 @@ from tensorflow.python.training import monitored_session
 from tensorflow.python.training import server_lib
 
 
+_thread_local = threading.local()
+
+
 class _TaskType(object):
   PS = "ps"
   WORKER = "worker"
@@ -77,8 +80,6 @@ class _Barrier(object):
 
   def wait(self):
     """Waits until all other callers reach the same wait call."""
-    if not hasattr(self._local_sense, "value"):
-      self._local_sense.value = False
     self._local_sense.value = not self._flag
     with self._lock:
       self._counter += 1
@@ -210,8 +211,8 @@ class _WorkerContext(object):
       ValueError: if `worker_barrier` is not passed to the __init__ method.
     """
     if not self._worker_barrier:
-      raise ValueError("`worker_barrier is not set in the worker context.` \t" +
-                       self._debug_message())
+      # TODO(yuefengz): we should throw an error in independent worker mode.
+      return
     self._worker_barrier.wait()
 
   def session_creator(self,
@@ -385,6 +386,27 @@ def _run_std_server(cluster_spec=None,
                     rpc_layer=None,
                     environment=None):
   """Runs a standard server."""
+  # Check if the Server is already running. If so, assert that no configuration
+  # options have changed, and return the existing Server. This allows us to
+  # call `run_distribute_coordinator` multiple times.
+  if getattr(_thread_local, "server", None) is not None:
+    assert _thread_local.cluster_spec == cluster_spec
+    assert _thread_local.task_type == task_type
+    assert _thread_local.task_id == task_id
+    assert _thread_local.session_config_str == repr(session_config)
+    assert _thread_local.rpc_layer == rpc_layer
+    assert _thread_local.environment == environment
+    return _thread_local.server
+  else:
+    # This method is not thread-safe.
+    _thread_local.server_started = True
+    _thread_local.cluster_spec = cluster_spec
+    _thread_local.task_type = task_type
+    _thread_local.task_id = task_id
+    _thread_local.session_config_str = repr(session_config)
+    _thread_local.rpc_layer = rpc_layer
+    _thread_local.environment = environment
+
   assert cluster_spec
   target = cluster_spec.task_address(task_type, task_id)
   if rpc_layer:
@@ -406,8 +428,6 @@ def _run_std_server(cluster_spec=None,
 
   if environment == "google":
     server = _FakeServer()
-    server.start()
-    return server
   else:
     if session_config:
       logging.info(
@@ -422,8 +442,10 @@ def _run_std_server(cluster_spec=None,
         task_index=task_id,
         config=session_config,
         protocol=rpc_layer)
-    server.start()
-    return server
+
+  server.start()
+  _thread_local.server = server
+  return server
 
 
 def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
@@ -650,7 +672,7 @@ def run_distribute_coordinator(worker_fn,
   for a task. The distribute coordinator will make a copy of the `strategy`
   object, call its `configure` method and pass it to `worker_fn` as an argument.
 
-  The `worker_fn` defines the training logic and is called under a its own
+  The `worker_fn` defines the training logic and is called under its own
   worker context which can be accessed to via `get_current_worker_context`. A
   worker context provides access to configurations for each task, e.g. the
   task_type, task_id, master target and so on. Since `worker_fn` will be called
@@ -676,7 +698,7 @@ def run_distribute_coordinator(worker_fn,
   the worker context.
 
   The `cluster_spec` can be either passed by the argument or parsed from the
-  "TF_CONFIG" envrionment variable. Example of a TF_CONFIG:
+  "TF_CONFIG" environment variable. Example of a TF_CONFIG:
   ```
     cluster = {'chief': ['host0:2222'],
                'ps': ['host1:2222', 'host2:2222'],
@@ -691,19 +713,19 @@ def run_distribute_coordinator(worker_fn,
   will be created to call `eval_fn` with its `task_type` set to "evaluator". If
   `eval_fn` is not defined, fall back to `worker_fn`. This implies that
   evaluation will be done on a single machine if there is an "evaluator" task.
-  If "evaluator" doesn't exit in the cluster_spec, it entirely depends on the
+  If "evaluator" doesn't exist in the cluster_spec, it entirely depends on the
   `worker_fn` for how to do evaluation.
 
   Args:
     worker_fn: the function to be called. The function should accept a
       `strategy` object and will be given access to a context object via a
       context manager scope.
-    strategy: a DistributionStrategy object which specifying whether it should
+    strategy: a DistributionStrategy object specifying whether it should
       run between-graph replicated training or not, whether to run init ops,
       etc. This object will also be configured given `session_config`,
       `cluster_spec`, `task_type` and `task_id`.
     eval_fn: optional function for "evaluator" task. If `eval_fn` is not passed
-      in but a "evaluator" task found in the `cluster_spec`, the `worker_fn`
+      in but a "evaluator" task is found in the `cluster_spec`, the `worker_fn`
       will be used for this task.
     eval_strategy: optional DistributionStrategy object for "evaluator" task.
     mode: in which mode this distribute coordinator runs.
@@ -739,7 +761,7 @@ def run_distribute_coordinator(worker_fn,
   rpc_layer = tf_config.get("rpc_layer", rpc_layer)
   environment = tf_config.get("environment", None)
 
-  # Setting the session config is necessary for some strategies such
+  # Setting the session config is necessary for some strategies such as
   # CollectiveAllReduceStrategy.
   session_config = session_config or config_pb2.ConfigProto(
       allow_soft_placement=True)
@@ -809,14 +831,18 @@ def run_distribute_coordinator(worker_fn,
     _configure_session_config_for_std_servers(strategy, eval_strategy,
                                               session_config, cluster_spec,
                                               task_type, task_id)
-    server = _run_std_server(
-        cluster_spec=cluster_spec,
-        task_type=task_type,
-        task_id=task_id,
-        session_config=session_config,
-        rpc_layer=rpc_layer,
-        environment=environment)
 
+    if not getattr(strategy.extended, "_std_server_started", False):
+      # Right now, with eager mode, context is configured with a std server at
+      # the very beginning while with graph mode the std server is started when
+      # distribute coordinator is called. We should consolidate these two paths.
+      server = _run_std_server(
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id,
+          session_config=session_config,
+          rpc_layer=rpc_layer,
+          environment=environment)
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
       if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index ceb4483ebbc2086ddad43a14521a2eedd2bd6fb6..22997169fc1477c9cbf8753d1e36f21101ff8fe8 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -864,6 +864,9 @@ class StrategyConfigureTest(test.TestCase):
     cluster_spec = {"worker": ["localhost:0"]}
     tf_config = {"cluster": cluster_spec}
 
+    # Reset the saved Server state.
+    distribute_coordinator._thread_local = threading.local()  # pylint: disable=protected-access
+
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(tf_config)}):
       distribute_coordinator.run_distribute_coordinator(
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index e7d52c6c985c1076eea76154159df60da7b698de..05b928b9a2ba9d229e7a80ac75d39f52a074c325 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -35,9 +35,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -79,26 +79,28 @@ class UpdateContext(object):
 # Public utility functions.
 
 
-@tf_export("distribute.get_loss_reduction")
+@tf_export(v1=["distribute.get_loss_reduction"])
 def get_loss_reduction():
-  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
-  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
-  if (loss_reduction == losses_impl.Reduction.SUM or
-      loss_reduction == losses_impl.ReductionV2.SUM):
-    return reduce_util.ReduceOp.SUM
-  return reduce_util.ReduceOp.MEAN
+  """DEPRECATED: Now always returns `tf.distribute.ReduceOp.SUM`.
+
+  We now always make the complete adjustment when computing the loss, so
+  code should always add gradients/losses across replicas, never average.
+  """
+  return reduce_util.ReduceOp.SUM
 
 
 # ------------------------------------------------------------------------------
 # Internal API for validating the current thread mode
 
 
-def _require_cross_replica_context_extended(extended):
+def _require_cross_replica_or_default_context_extended(extended):
   """Verify in cross-replica context."""
   context = _get_per_thread_mode()
   cross_replica = context.cross_replica_context
   if cross_replica is not None and cross_replica.extended is extended:
     return
+  if context is _get_default_replica_mode():
+    return
   strategy = extended._container_strategy()  # pylint: disable=protected-access
   # We have an error to report, figure out the right message.
   if context.strategy is not strategy:
@@ -333,36 +335,6 @@ class DistributionStrategy(object):
     """DEPRECATED: use extended.colocate_vars_with() instead."""
     return self._extended.colocate_vars_with(colocate_with_variable)
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all replicas.  DEPRECATED.
-
-    DEPRECATED: Please use `make_dataset_iterator` or
-    `make_input_fn_iterator` instead.
-
-    Suitable for providing input to `extended.call_for_each_replica()` by
-    creating an iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-
-    with strategy.scope():
-      distributed_dataset = strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_initializable_iterator()
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, args=(iterator.get_next(),))
-    ```
-
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset` with per-replica
-        batching.
-
-    Returns:
-      A `PerReplicaDataset` that will produce data for each replica.
-    """
-    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
-
   def make_dataset_iterator(self, dataset):
     """Makes an iterator for input provided via `dataset`.
 
@@ -427,8 +399,9 @@ class DistributionStrategy(object):
     if replication_mode != InputReplicationMode.PER_WORKER:
       raise ValueError(
           "Input replication mode not supported: %r" % replication_mode)
-    return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
-        input_fn, replication_mode=replication_mode)
+    with self.scope():
+      return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
+          input_fn, replication_mode=replication_mode)
 
   def experimental_make_numpy_iterator(
       self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
@@ -468,7 +441,7 @@ class DistributionStrategy(object):
     """Runs ops in `fn` on each replica, with inputs from `input_iterator`.
 
     When eager execution is enabled, executes ops specified by `fn` on each
-    replica.  Otherwise, builds a graph to execute the ops on each replica.
+    replica. Otherwise, builds a graph to execute the ops on each replica.
 
     Each replica will take a single, different input from the inputs provided by
     one `get_next` call on the input iterator.
@@ -476,13 +449,13 @@ class DistributionStrategy(object):
     `fn` may call `tf.distribute.get_replica_context()` to access members such
     as `replica_id_in_sync_group`.
 
-    IMPORTANT: Depending on the `DistributionStrategy` being used, and whether
-    eager execution is enabled, `fn` may be called one or more times (once for
-    each replica).
+    IMPORTANT: Depending on the `tf.distribute.Strategy` implementation being
+    used, and whether eager execution is enabled, `fn` may be called one or more
+    times (once for each replica).
 
     Args:
-      fn: function to run. The inputs to the function must match the outputs of
-        `input_iterator.get_next()`. The output must be a `tf.nest` of
+      fn: The function to run. The inputs to the function must match the outputs
+        of `input_iterator.get_next()`. The output must be a `tf.nest` of
         `Tensor`s.
       input_iterator: (Optional) input iterator from which the inputs are taken.
 
@@ -494,17 +467,36 @@ class DistributionStrategy(object):
       single replica).
     """
     with self.scope():
-      if input_iterator is None:
-        return self._extended.call_for_each_replica(fn)
-      else:
-        inputs = input_iterator.get_next()
-        return self._extended.call_for_each_replica(fn, args=(inputs,))
+      args = (input_iterator.get_next(),) if input_iterator is not None else ()
+    return self.experimental_run_v2(fn, args=args)
+
+  def experimental_run_v2(self, fn, args=(), kwargs=None):
+    """Runs ops in `fn` on each replica, with the given arguments.
+
+    When eager execution is enabled, executes ops specified by `fn` on each
+    replica. Otherwise, builds a graph to execute the ops on each replica.
+
+    `fn` may call `tf.distribute.get_replica_context()` to access members such
+    as `replica_id_in_sync_group`.
+
+    IMPORTANT: Depending on the `tf.distribute.Strategy` implementation being
+    used, and whether eager execution is enabled, `fn` may be called one or more
+    times (once for each replica).
+
+    Args:
+      fn: The function to run. The output must be a `tf.nest` of `Tensor`s.
+      args: (Optional) Positional arguments to `fn`.
+      kwargs: (Optional) Keyword arguments to `fn`.
 
-  # TODO(b/121296772,b/121300973): Add logical_device argument (default of 0).
-  def broadcast(self, tensor):
-    """Broadcasts `tensor` to all replicas, returning a per-replica value."""
-    _require_cross_replica_context_extended(self._extended)
-    return self._extended._broadcast(tensor)  # pylint: disable=protected-access
+    Returns:
+      Merged return value of `fn` across replicas. The structure of the return
+      value is the same as the return value from `fn`. Each element in the
+      structure can either be `PerReplica` (if the values are unsynchronized),
+      `Mirrored` (if the values are kept in sync), or `Tensor` (if running on a
+      single replica).
+    """
+    with self.scope():
+      return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
 
   def reduce(self, reduce_op, value):
     """Reduce `value` across replicas.
@@ -517,26 +509,53 @@ class DistributionStrategy(object):
     Returns:
       A `Tensor`.
     """
-    _require_cross_replica_context_extended(self._extended)
+    _require_cross_replica_or_default_context_extended(self._extended)
     return self._extended._reduce(reduce_op, value)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  @doc_controls.do_not_generate_docs  # DEPRECATED
   def unwrap(self, value):
-    """Returns the list of all per-replica values contained in `value`.
+    """Returns the list of all local per-replica values contained in `value`.
+
+    DEPRECATED: Please use `experimental_local_results` instead.
+
+    Note: This only returns values on the workers initiated by this client.
+    When using a `Strategy` like
+    `tf.distribute.experimental.MultiWorkerMirroredStrategy`, each worker
+    will be its own client, and this function will only return values
+    computed on that worker.
 
     Args:
-      value: A value returned by `extended.call_for_each_replica()` or a
-        variable created in `scope`.
+      value: A value returned by `experimental_run()`,
+        `extended.call_for_each_replica()`, or a variable created in `scope`.
 
     Returns:
       A tuple of values contained in `value`. If `value` represents a single
       value, this returns `(value,).`
     """
-    return self._extended._unwrap(value)  # pylint: disable=protected-access
+    return self._extended._local_results(value)  # pylint: disable=protected-access
+
+  def experimental_local_results(self, value):
+    """Returns the list of all local per-replica values contained in `value`.
+
+    Note: This only returns values on the workers initiated by this client.
+    When using a `Strategy` like
+    `tf.distribute.experimental.MultiWorkerMirroredStrategy`, each worker
+    will be its own client, and this function will only return values
+    computed on that worker.
+
+    Args:
+      value: A value returned by `experimental_run()`, `experimental_run_v2()`,
+        `extended.call_for_each_replica()`, or a variable created in `scope`.
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+    Returns:
+      A tuple of values contained in `value`. If `value` represents a single
+      value, this returns `(value,).`
+    """
+    return self._extended._local_results(value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED: TF v1.x only
   def group(self, value, name=None):
-    """Shortcut for `tf.group(self.unwrap(value))`."""
+    """Shortcut for `tf.group(self.experimental_local_results(value))`."""
     return self._extended._group(value, name)  # pylint: disable=protected-access
 
   @property
@@ -756,10 +775,8 @@ class DistributionStrategyExtended(object):
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.make_dataset_iterator(dataset)` (or the deprecated
-    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
+  * `d.make_dataset_iterator(dataset)`: in cross-replica
     context, produces an iterator with locality T
-  * `d.broadcast(t)`: in cross-replica context, produces a value with locality M
   * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
     with locality V(`v`)
   * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
@@ -845,7 +862,7 @@ class DistributionStrategyExtended(object):
   def _scope(self, strategy):
     """Implementation of DistributionStrategy.scope()."""
     if distribution_strategy_context.has_strategy():
-      _require_cross_replica_context_extended(self)
+      _require_cross_replica_or_default_context_extended(self)
       return _SameScopeAgainContext(strategy)
 
     def creator_with_resource_vars(*args, **kwargs):
@@ -965,21 +982,6 @@ class DistributionStrategyExtended(object):
     """Validate `colocate_with_variable` argument to `colocate_vars_with`."""
     pass
 
-  def _call_dataset_fn(self, dataset_fn):
-    """Call the `dataset_fn` with `input_context` as argument."""
-    result = dataset_fn()
-    if not isinstance(result, dataset_ops.DatasetV2):
-      raise ValueError(
-          "dataset_fn() must return a tf.data.Dataset when using a "
-          "tf.distribute.Strategy.")
-    return result
-
-  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
-  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
-  # Extend to implement more functionality of datasets.
-  def _distribute_dataset(self, dataset_fn):
-    raise NotImplementedError("must be implemented in descendants")
-
   def _make_dataset_iterator(self, dataset):
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1003,7 +1005,7 @@ class DistributionStrategyExtended(object):
     Returns:
       A `tf.data.Dataset` representing `numpy_input`.
     """
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     return self._experimental_make_numpy_dataset(numpy_input, session=session)
 
   def _experimental_make_numpy_dataset(self, numpy_input, session):
@@ -1020,14 +1022,12 @@ class DistributionStrategyExtended(object):
     Returns:
       A value mirrored to `destinations` devices.
     """
+    assert destinations is not None  # from old strategy.broadcast()
     # TODO(josh11b): More docstring
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     assert not isinstance(destinations, (list, tuple))
     return self._broadcast_to(tensor, destinations)
 
-  def _broadcast(self, tensor):
-    return self._broadcast_to(tensor, None)  # Default implementation
-
   def _broadcast_to(self, tensor, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
@@ -1068,9 +1068,10 @@ class DistributionStrategyExtended(object):
         - non_tensor_outputs: A dictionatry containing anything that was set by
           `fn` by calling `context.set_non_tensor_output`.
     """
-    _require_cross_replica_context_extended(self)
-    return self._experimental_run_steps_on_iterator(
-        fn, iterator, iterations, initial_loop_values)
+    _require_cross_replica_or_default_context_extended(self)
+    with self._container_strategy().scope():
+      return self._experimental_run_steps_on_iterator(
+          fn, iterator, iterations, initial_loop_values)
 
   def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
                                           initial_loop_values):
@@ -1093,7 +1094,7 @@ class DistributionStrategyExtended(object):
     # Called once in "cross-replica" context.
     def merge_fn(distribution, three_plus_replica_id):
       # sum the values across replicas
-      return sum(distribution.unwrap(three_plus_replica_id))
+      return sum(distribution.experimental_local_results(three_plus_replica_id))
 
     # Called once per replica in `distribution`, in a "replica" context.
     def fn(three):
@@ -1108,7 +1109,8 @@ class DistributionStrategyExtended(object):
       ...
       merged_results = distribution.call_for_each_replica(fn, args=[3])
       # merged_results has the values from every replica execution of `fn`.
-      print(distribution.unwrap(merged_results))  # Prints a list
+      # This statement prints a list:
+      print(distribution.experimental_local_results(merged_results))
     ```
 
     Args:
@@ -1119,18 +1121,20 @@ class DistributionStrategyExtended(object):
     Returns:
       Merged return value of `fn` across all replicas.
     """
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     if kwargs is None:
       kwargs = {}
-    return self._call_for_each_replica(fn, args, kwargs)
+    with self._container_strategy().scope():
+      return self._call_for_each_replica(fn, args, kwargs)
 
   def _call_for_each_replica(self, fn, args, kwargs):
     raise NotImplementedError("must be implemented in descendants")
 
   def _reduce(self, reduce_op, value):
     # Default implementation until we have an implementation for each strategy.
-    return self._unwrap(self._reduce_to(
-        reduce_op, value, device_util.current() or "/device:CPU:0"))[0]
+    return self._local_results(
+        self._reduce_to(reduce_op, value,
+                        device_util.current() or "/device:CPU:0"))[0]
 
   def reduce_to(self, reduce_op, value, destinations):
     """Combine (via e.g. sum or mean) values across replicas.
@@ -1147,7 +1151,7 @@ class DistributionStrategyExtended(object):
       A value mirrored to `destinations`.
     """
     # TODO(josh11b): More docstring
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     assert not isinstance(destinations, (list, tuple))
     assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     assert (reduce_op == reduce_util.ReduceOp.SUM or
@@ -1169,7 +1173,7 @@ class DistributionStrategyExtended(object):
       A list of mirrored values, one per pair in `value_destination_pairs`.
     """
     # TODO(josh11b): More docstring
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     return self._batch_reduce_to(reduce_op, value_destination_pairs)
 
@@ -1216,10 +1220,11 @@ class DistributionStrategyExtended(object):
       where each list has an element per replica, and the caller is responsible
       for ensuring all elements are executed.
     """
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     if kwargs is None:
       kwargs = {}
-    return self._update(var, fn, args, kwargs, group)
+    with self._container_strategy().scope():
+      return self._update(var, fn, args, kwargs, group)
 
   def _update(self, var, fn, args, kwargs, group):
     raise NotImplementedError("must be implemented in descendants")
@@ -1239,15 +1244,16 @@ class DistributionStrategyExtended(object):
     Returns:
       Return value of `fn`, possibly merged across devices.
     """
-    _require_cross_replica_context_extended(self)
+    _require_cross_replica_or_default_context_extended(self)
     if kwargs is None:
       kwargs = {}
-    return self._update_non_slot(colocate_with, fn, args, kwargs, group)
+    with self._container_strategy().scope():
+      return self._update_non_slot(colocate_with, fn, args, kwargs, group)
 
   def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     raise NotImplementedError("must be implemented in descendants")
 
-  def _unwrap(self, distributed_value):
+  def _local_results(self, distributed_value):
     raise NotImplementedError("must be implemented in descendants")
 
   def value_container(self, value):
@@ -1261,13 +1267,14 @@ class DistributionStrategyExtended(object):
       A container that `value` belongs to.
       If value does not belong to any container (including the case of
       container having been destroyed), returns the value itself.
-      `value in unwrap(value_container(value))` will always be true.
+      `value in experimental_local_results(value_container(value))` will
+      always be true.
     """
     raise NotImplementedError("must be implemented in descendants")
 
   def _group(self, value, name=None):
-    """Shortcut for `tf.group(distribution.unwrap(value))`."""
-    value = nest.flatten(self._unwrap(value))
+    """Implementation of `group`."""
+    value = nest.flatten(self._local_results(value))
 
     if len(value) != 1 or name is not None:
       return control_flow_ops.group(value, name=name)
@@ -1380,11 +1387,24 @@ class ReplicaContext(object):
     self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
         self)
     self._replica_id_in_sync_group = replica_id_in_sync_group
+    self._summary_recording_distribution_strategy = None
 
   def __enter__(self):
     _push_per_thread_mode(self._thread_context)
+    ctx = eager_context.context()
+
+    def replica_id_is_zero():
+      return math_ops.equal(self._replica_id_in_sync_group,
+                            constant_op.constant(0))
+
+    self._summary_recording_distribution_strategy = (
+        ctx.summary_recording_distribution_strategy)
+    ctx.summary_recording_distribution_strategy = replica_id_is_zero
 
   def __exit__(self, exception_type, exception_value, traceback):
+    ctx = eager_context.context()
+    ctx.summary_recording_distribution_strategy = (
+        self._summary_recording_distribution_strategy)
     _pop_per_thread_mode()
 
   def merge_call(self, merge_fn, args=(), kwargs=None):
@@ -1549,9 +1569,6 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   def variable_created_in_scope(self, v):
     return v._distribute_strategy is None  # pylint: disable=protected-access
 
-  def _distribute_dataset(self, dataset_fn):
-    return self._call_dataset_fn(dataset_fn)
-
   def _make_dataset_iterator(self, dataset):
     return _DefaultDistributionExtended.DefaultInputIterator(dataset)
 
@@ -1603,12 +1620,12 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
       if should_group:
         return result
       else:
-        return nest.map_structure(self._unwrap, result)
+        return nest.map_structure(self._local_results, result)
 
   def read_var(self, replica_local_var):
     return array_ops.identity(replica_local_var)
 
-  def _unwrap(self, distributed_value):
+  def _local_results(self, distributed_value):
     return (distributed_value,)
 
   def value_container(self, value):
@@ -1685,3 +1702,5 @@ resource_variable_ops._from_proto_fn = _from_proto_fn
 _push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
 _get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
 _pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
+_get_default_replica_mode = (
+    distribution_strategy_context._get_default_replica_mode)  # pylint: disable=protected-access
diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py
index c147849e5de62659b91ccdbf38c35611aad3fd91..391a70c562f458a12c917fa8e7f2d1a7257d0f5c 100644
--- a/tensorflow/python/distribute/distribute_lib_test.py
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
@@ -60,13 +60,12 @@ class _TestExtended(distribute_lib.DistributionStrategyExtended):
 
 
 def _assert_in_default_state(t):
-  t.assertIs(distribution_strategy_context._get_default_replica_context(),
-             distribution_strategy_context.get_replica_context())
-  t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
-  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
-  t.assertIs(distribution_strategy_context._get_default_strategy(),
-             distribution_strategy_context.get_strategy())
-  t.assertFalse(distribution_strategy_context.has_strategy())
+  t.assertIs(ds_context._get_default_replica_context(),
+             ds_context.get_replica_context())
+  t.assertIs(None, ds_context.get_cross_replica_context())
+  t.assertFalse(ds_context.in_cross_replica_context())
+  t.assertIs(ds_context._get_default_strategy(), ds_context.get_strategy())
+  t.assertFalse(ds_context.has_strategy())
 
 
 class TestStrategyTest(test.TestCase):
@@ -76,14 +75,12 @@ class TestStrategyTest(test.TestCase):
     dist = _TestStrategy()
 
     def run_fn():
-      replica_context = distribution_strategy_context.get_replica_context()
+      replica_context = ds_context.get_replica_context()
       self.assertTrue(replica_context is not None)
-      self.assertIs(None,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
+      self.assertIs(None, ds_context.get_cross_replica_context())
+      self.assertFalse(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       self.assertEqual("foo", replica_context.merge_call(None, test_arg="foo"))
       expected_value = _get_test_variable(
           "bar", variable_scope.VariableSynchronization.AUTO,
@@ -91,8 +88,7 @@ class TestStrategyTest(test.TestCase):
       self.assertDictEqual(expected_value,
                            variable_scope.variable(1.0, name="bar"))
 
-    with self.assertRaises(RuntimeError):
-      dist.extended.call_for_each_replica(run_fn)
+    dist.extended.call_for_each_replica(run_fn)
     with dist.scope():
       dist.extended.call_for_each_replica(run_fn)
     _assert_in_default_state(self)
@@ -101,13 +97,11 @@ class TestStrategyTest(test.TestCase):
     _assert_in_default_state(self)
     dist = _TestStrategy()
     with dist.scope():
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       expected_value = _get_test_variable(
           "baz", variable_scope.VariableSynchronization.AUTO,
           variable_scope.VariableAggregation.NONE)
@@ -138,22 +132,16 @@ class DefaultDistributionStrategyTest(test.TestCase):
     _assert_in_default_state(self)
 
     def merge_fn(dist, s):
-      self.assertIs(
-          distribution_strategy_context._get_default_strategy(),
-          dist)
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_strategy())
-      self.assertFalse(
-          distribution_strategy_context.has_strategy())
+      self.assertIs(ds_context._get_default_strategy(), dist)
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertIs(dist, ds_context.get_strategy())
+      self.assertFalse(ds_context.has_strategy())
       return "foo_" + s
 
-    replica_ctx = distribution_strategy_context.get_replica_context()
-    self.assertIs(distribution_strategy_context._get_default_replica_context(),
-                  replica_ctx)
+    replica_ctx = ds_context.get_replica_context()
+    self.assertIs(ds_context._get_default_replica_context(), replica_ctx)
     self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
     _assert_in_default_state(self)
 
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 7d5f231c37da41f10f945adc468f40ffd0ecc743..0ec6703b8692fe313f12b6e9952e19f43e1e7adb 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
@@ -296,10 +297,11 @@ def estimator_train(estimator, train_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._train_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
@@ -344,10 +346,11 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._eval_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
diff --git a/tensorflow/python/distribute/experimental/BUILD b/tensorflow/python/distribute/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0631b99cc95204a22b10f8809196a4396bcb25a4
--- /dev/null
+++ b/tensorflow/python/distribute/experimental/BUILD
@@ -0,0 +1,20 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "experimental",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:parameter_server_strategy",
+        "//tensorflow/python/distribute:tpu_strategy",
+    ],
+)
diff --git a/tensorflow/python/util/future_api_test.py b/tensorflow/python/distribute/experimental/__init__.py
similarity index 68%
rename from tensorflow/python/util/future_api_test.py
rename to tensorflow/python/distribute/experimental/__init__.py
index 7cafdec6f0ae40d70fa5f2bfc74a56dfcf337984..e76897ce31a99ea192b5e19d26d665f960af1e23 100644
--- a/tensorflow/python/util/future_api_test.py
+++ b/tensorflow/python/distribute/experimental/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,24 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for future_api."""
+"""Experimental Distribution Strategy library."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import tensorflow as tf
-
 # pylint: disable=unused-import
-from tensorflow.python.util import future_api
+from tensorflow.python.distribute import collective_all_reduce_strategy
+from tensorflow.python.distribute import parameter_server_strategy
+from tensorflow.python.distribute import tpu_strategy
 # pylint: enable=unused-import
-
-
-class ExampleParserConfigurationTest(tf.test.TestCase):
-
-  def testBasic(self):
-    self.assertFalse(hasattr(tf, 'arg_max'))
-    self.assertTrue(hasattr(tf, 'argmax'))
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
index 3ba80c9287180e96d51d29c6da0141b82be34f6e..7aa861dee97dfaec990437589aecfd035c821901 100644
--- a/tensorflow/python/distribute/input_lib.py
+++ b/tensorflow/python/distribute/input_lib.py
@@ -21,16 +21,20 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import input_ops
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import nest
 
 
@@ -91,237 +95,6 @@ class InputWorkers(object):
         self.__class__.__name__, debug_repr, self._device_map)
 
 
-class PerReplicaDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
-
-  def __init__(self, iterator, input_workers, worker_index, prefetch_on_device):
-    assert isinstance(input_workers, InputWorkers)
-    self._iterator = iterator
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-    self._prefetch_on_device = prefetch_on_device
-
-  @property
-  def initializer(self):
-    return self._iterator.initializer
-
-  def get_next_as_list(self, name=None):
-    """Scatter the input across devices."""
-    if self._prefetch_on_device:
-      data_list = self._iterator.get_next()
-    else:
-      batch = self._iterator.get_next(name=name)
-      data_list = []
-      def get_ith(i):
-        return lambda x: x[i]
-
-      devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-      for i, d in enumerate(devices):
-        v = nest.map_structure(get_ith(i), batch)
-        if context.executing_eagerly():
-          with ops.device(d):
-            v = nest.map_structure(array_ops.identity, v)
-        data_list.append(v)
-
-    return data_list
-
-  def get_next(self, name=None):
-    assert self._input_workers.num_workers == 1
-    data_list = self.get_next_as_list(name)
-    return values.regroup(self._input_workers.device_map, data_list)
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-class PerReplicaDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
-
-  def __init__(self, dataset, input_workers, worker_index,
-               prefetch_on_device=None):
-    assert isinstance(input_workers, InputWorkers)
-    assert worker_index is not None
-    assert worker_index is not True  # pylint: disable=g-bool-id-comparison
-    assert worker_index is not False  # pylint: disable=g-bool-id-comparison
-    self._input_workers = input_workers
-    self._worker_index = worker_index
-
-    # Default to using prefetching, unless specified.
-    self._prefetch_on_device = prefetch_on_device
-    if self._prefetch_on_device is None:
-      self._prefetch_on_device = True
-
-    self._dataset = dataset
-    if not self._prefetch_on_device:
-      # TODO(priyag): If dropping remainder is not appropriate, find another
-      # approach to distributing the dataset when not possible to divide evenly.
-      # Possibly not an issue when we start using PartitionedDataset.
-      num_replicas = len(
-          self._input_workers.compute_devices_for_worker(self._worker_index))
-      self._dataset = self._dataset.batch(num_replicas, drop_remainder=True)
-    else:
-      self._replica_devices = self._input_workers.compute_devices_for_worker(
-          self._worker_index)
-
-  def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerReplicaDataset."""
-    # Graph mode with one shot iterator is disabled.
-    if not context.executing_eagerly():
-      raise ValueError("Cannot create a one shot iterator. Please use "
-                       "`make_initializable_iterator()` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._replica_devices)
-    else:
-      dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator,
-        self._input_workers,
-        self._worker_index,
-        prefetch_on_device=self._prefetch_on_device)
-
-  def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerReplicaDataset."""
-    # Eager mode generates already initialized iterators. Hence we cannot create
-    # an initializable iterator.
-    if context.executing_eagerly():
-      raise ValueError("Cannot create initializable iterator in Eager mode. "
-                       "Please use `make_one_shot_iterator` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._replica_devices)
-    else:
-      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator, self._input_workers, self._worker_index,
-        prefetch_on_device=self._prefetch_on_device)
-
-
-class MultiWorkerDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
-
-  def __init__(self, iterators, input_workers):
-    """Initialize the `MultiWorkerDataIterator` object.
-
-    Args:
-      iterators: a list of worker, iterator pairs.
-      input_workers: an `InputWorkers` object.
-
-    Raises:
-      ValueError: if iterators and input_workers are not compatible.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    workers = tuple(d for d, _ in iterators)
-    if workers != input_workers.worker_devices:
-      raise ValueError("iterators and input_workers are not compatible. "
-                       "iterator workers: %r input_workers devices: %r" %
-                       (workers, input_workers.worker_devices))
-    self._iterators = tuple(i for _, i in iterators)
-    self._input_workers = input_workers
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        tuple(iterator.initializer for iterator in self._iterators))
-
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._input_workers.worker_devices):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  def get_next(self, name=None):
-    """Scatter the input across hosts and devices."""
-    replicas = []
-    for worker, iterator in zip(self._input_workers.worker_devices,
-                                self._iterators):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = iterator.get_next_as_list(name=new_name)
-        # Append to replicas to get a flat list of values indexed by replica.
-        replicas.extend(data_per_worker)
-
-    return values.regroup(self._input_workers.device_map, replicas)
-
-
-class MultiWorkerDataset(object):
-  """Like a `tf.data.Dataset` that distributes data to different workers.
-
-  Each worker gets one shard of the input dataset. This currently does not work
-  in eager mode.
-  """
-
-  def __init__(self, dataset_fn, input_workers, prefetch_on_device=None,
-               auto_shard=False):
-    """Initialize the MultiWorkerDataset object.
-
-    Args:
-      dataset_fn: a function or a list of functions that returns a
-        `tf.data.Dataset`.
-      input_workers: an `InputWorkers` object.
-      prefetch_on_device: whether to prefetch to devices.
-      auto_shard: whether to auto-shard the dataset.
-    """
-    assert isinstance(input_workers, InputWorkers)
-    if isinstance(dataset_fn, (list, tuple)):
-      if len(dataset_fn) != input_workers.num_workers:
-        raise ValueError("If `dataset_fn` is a list, it must have one entry "
-                         "per worker")
-    # TODO(rohanj): b/120673685 to track re-enabling auto sharding.
-    if auto_shard:
-      raise ValueError("Currently autosharding is not supported.")
-    self._input_workers = input_workers
-    self._datasets = []
-    # TODO(yuefengz, priyag): support different set of jobs for input
-    # processing.
-    for i, worker in enumerate(input_workers.worker_devices):
-      with ops.device(worker):
-        if isinstance(dataset_fn, (list, tuple)):
-          worker_input = dataset_fn[i]()
-        else:
-          worker_input = dataset_fn()
-        dataset = PerReplicaDataset(worker_input, input_workers, i,
-                                    prefetch_on_device=prefetch_on_device)
-        self._datasets.append((worker, dataset))
-
-  def make_one_shot_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-  def make_initializable_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append(
-            (worker, dataset_ops.make_initializable_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._input_workers)
-
-
 class InputIterator(object):
   """An input iterator, intended to be passed to `DistributionStrategy.run`."""
 
@@ -360,6 +133,7 @@ class InputIteratorImpl(InputIterator):
   def get_next(self, name=None):
     """Returns the next input from the iterator for all replicas."""
     replicas = []
+    worker_has_values = []
     for i, worker in enumerate(self._input_workers.worker_devices):
       if name is not None:
         d = tf_device.DeviceSpec.from_string(worker)
@@ -367,8 +141,61 @@ class InputIteratorImpl(InputIterator):
       else:
         new_name = None
       with ops.device(worker):
+        worker_has_value, next_element = (
+            self._iterators[i].get_next_as_list(new_name))
+        worker_has_values.append(worker_has_value)
         # Make `replicas` a flat list of values across all replicas.
-        replicas.extend(self._iterators[i].get_next_as_list(new_name))
+        replicas.append(next_element)
+
+    out_of_range_replicas = []
+
+    def out_of_range_fn(worker_index, device):
+      """This function will throw an OutOfRange error."""
+      # As this will be only called when there is no data left, so calling
+      # get_next() will trigger an OutOfRange error.
+      data = self._iterators[worker_index].get_next(device)
+      out_of_range_replicas.append(data)
+      return data
+
+    # `global_has_value` indicates whether there is data in this global batch.
+    # We do a all-reduce across all the workers in the multi-worker case.
+    # TODO(b/126259107): Do strategy.reduce for CollectiveAllReduceStrategy.
+    if len(worker_has_values) > 1:
+      with ops.device(self._input_workers.compute_devices_for_worker(0)[0]):
+        # Place the tf.reduce_any op in device 0 to minimize communication
+        # cost.
+        # TODO(b/128545270): Investigate why placing it on worker 0 will cause
+        # the entire data to copy back from device to host.
+        global_has_value = math_ops.reduce_any(worker_has_values)
+    else:
+      global_has_value = worker_has_values[0]
+
+    results = []
+    for i, worker in enumerate(self._input_workers.worker_devices):
+      with ops.device(worker):
+        devices = self._input_workers.compute_devices_for_worker(i)
+        for j, device in enumerate(devices):
+          with ops.device(device):
+            # pylint: disable=undefined-loop-variable
+            # pylint: disable=cell-var-from-loop
+            # It is fine for the lambda to capture variables from the loop as
+            # the lambda is executed in the loop as well.
+            result = control_flow_ops.cond(global_has_value,
+                                           lambda: replicas[i][j],
+                                           lambda: out_of_range_fn(i, device))
+            # pylint: enable=cell-var-from-loop
+            # pylint: enable=undefined-loop-variable
+            results.append(result)
+    replicas = results
+
+    # Some dimensions in `replicas` will become unknown after we conditionally
+    # return the real tensors or the dummy tensors. We fix the input shapes by
+    # using the shapes from `out_of_range_replicas` because it is calling
+    # get_next() inside.
+    flattened_replicas = nest.flatten(replicas)
+    for i, replica_data in enumerate(nest.flatten(out_of_range_replicas)):
+      flattened_replicas[i].set_shape(replica_data.get_shape())
+    replicas = nest.pack_sequence_as(replicas, flattened_replicas)
 
     return values.regroup(self._input_workers.device_map, replicas)
 
@@ -416,8 +243,6 @@ class InputFunctionIterator(InputIteratorImpl):
     once on each worker.
 
     TODO(priyag): Add other replication modes.
-    TODO(priyag): Allow taking input function that returns a callable that
-    returns nest of tensors.
 
     Args:
       input_fn: Input function that returns a `tf.data.Dataset` object.
@@ -438,10 +263,14 @@ class InputFunctionIterator(InputIteratorImpl):
       worker = input_workers.worker_devices[i]
       with ops.device(worker):
         result = input_fn(ctx)
-        if not isinstance(result, dataset_ops.DatasetV2):
-          raise ValueError("input_fn must return a tf.data.Dataset.")
         devices = input_workers.compute_devices_for_worker(i)
-        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        if isinstance(result, dataset_ops.DatasetV2):
+          iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        elif callable(result):
+          iterator = _SingleWorkerCallableIterator(result, worker, devices)
+        else:
+          raise ValueError(
+              "input_fn must return a tf.data.Dataset or a callable.")
         iterators.append(iterator)
 
     super(InputFunctionIterator, self).__init__(input_workers, iterators)
@@ -478,7 +307,7 @@ class DatasetIterator(InputIteratorImpl):
     """
     assert isinstance(input_workers, InputWorkers)
     if split_batch_by:
-      dataset = _split_dataset_batch(dataset, split_batch_by)
+      dataset = batching._RebatchDataset(dataset, split_batch_by)  # pylint: disable=protected-access
 
     iterators = []
     for i, worker in enumerate(input_workers.worker_devices):
@@ -487,13 +316,55 @@ class DatasetIterator(InputIteratorImpl):
         cloned_dataset = dataset
         if not context.executing_eagerly():
           cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
+          cloned_dataset = cloned_dataset.with_options(dataset.options())
         iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
                                                 worker_devices)
         iterators.append(iterator)
 
+    self._element_structure = dataset._element_structure  # pylint: disable=protected-access
+
     super(DatasetIterator, self).__init__(input_workers, iterators)
 
 
+def _dummy_tensor_fn(value_structure):
+  """A function to create dummy tensors from `value_structure`."""
+
+  def create_dummy_tensor(feature_shape, feature_type):
+    """Create a dummy tensor with possible batch dimensions set to 0."""
+
+    # Ideally we should set the batch dimension to 0, however as in
+    # DistributionStrategy we don't know the batch dimension, we try to
+    # guess it as much as possible. If the feature has unknown dimensions, we
+    # will set them to 0. If the feature shape is already static, we guess the
+    # first dimension as batch dimension and set it to 0.
+    dims = []
+    for dim in feature_shape.dims:
+      if dim.value is None:
+        dims.append(tensor_shape.Dimension(0))
+      else:
+        dims.append(dim)
+    if feature_shape.is_fully_defined() and dims:
+      dims[0] = tensor_shape.Dimension(0)
+
+    # Create the dummy tensor.
+    dummy_tensor = array_ops.zeros(tensor_shape.TensorShape(dims), feature_type)
+    return dummy_tensor
+
+  result = []
+  # pylint: disable=protected-access
+  for feature_shape, feature_type in zip(value_structure._flat_shapes,
+                                         value_structure._flat_types):
+    result.append(create_dummy_tensor(feature_shape, feature_type))
+
+  if isinstance(value_structure, structure.NestedStructure):
+    result = nest.pack_sequence_as(value_structure._nested_structure, result)
+  else:
+    result = result[0]
+  # pylint: enable=protected-access
+
+  return result
+
+
 class _SingleWorkerDatasetIterator(object):
   """Iterator for a single `tf.data.Dataset`."""
 
@@ -519,12 +390,51 @@ class _SingleWorkerDatasetIterator(object):
       self._iterator = multi_device_iterator_ops.MultiDeviceIterator(
           self._dataset, self._devices)
 
+  def get_next(self, device, name=None):
+    """Get next element for the given device."""
+    del name
+    with ops.device(self._worker):
+      return self._iterator.get_next(device)
+
   def get_next_as_list(self, name=None):
-    """Get next element from the underlying iterator."""
+    """Get next element from underlying iterator.
+
+    If there is no data left, a list of dummy tensors with possible batch
+    dimensions set to 0 will be returned.
+
+    Args:
+      name: not used.
+
+    Returns:
+      A boolean tensor indicates whether there is any data in next element and
+      the real data as the next element or a list of dummy tensors if no data
+      left.
+    """
     del name
     with ops.device(self._worker):
-      data_list = self._iterator.get_next()
-      return data_list
+      data_list = self._iterator.get_next_as_optional()
+      result = []
+      for i, data in enumerate(data_list):
+        # Place the condition op in the same device as the data so the data
+        # doesn't need to be sent back to the worker.
+        with ops.device(self._devices[i]):
+          # As MultiDeviceIterator will fetch data in order, so we only need to
+          # check if the first replica has value to see whether there is data
+          # left for this single worker.
+          if i == 0:
+            worker_has_value = data.has_value()
+
+          # pylint: disable=unnecessary-lambda
+          # pylint: disable=cell-var-from-loop
+          real_data = control_flow_ops.cond(
+              data.has_value(),
+              lambda: data.get_value(),
+              lambda: _dummy_tensor_fn(data.value_structure))
+          result.append(real_data)
+          # pylint: enable=cell-var-from-loop
+          # pylint: enable=unnecessary-lambda
+
+      return worker_has_value, result
 
   def initialize(self):
     """Initialze underlying iterator.
@@ -537,56 +447,80 @@ class _SingleWorkerDatasetIterator(object):
       A list of any initializer ops that should be run.
     """
     if context.executing_eagerly():
-      self._make_iterator()
+      self._iterator._eager_reset()  # pylint: disable=protected-access
       return []
     else:
       return [self._iterator.initializer]
 
   @property
   def output_classes(self):
-    return self._iterator.output_classes
+    return dataset_ops.get_legacy_output_classes(self._iterator)
 
   @property
   def output_shapes(self):
-    return self._iterator.output_shapes
+    return dataset_ops.get_legacy_output_shapes(self._iterator)
 
   @property
   def output_types(self):
-    return self._iterator.output_types
+    return dataset_ops.get_legacy_output_types(self._iterator)
+
+
+class _SingleWorkerCallableIterator(object):
+  """Iterator for a single tensor-returning callable."""
 
+  def __init__(self, fn, worker, devices):
+    self._fn = fn
+    self._worker = worker
+    self._devices = devices
 
-def _split_dataset_batch(dataset, split_batch_by):
-  """Divide a batch-ed dataset's batches into smaller batches."""
-  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  def get_next(self, device, name=None):
+    """Get next element for the given device from the callable."""
+    del device, name
+    with ops.device(self._worker):
+      return self._fn()
+
+  def get_next_as_list(self, name=None):
+    """Get next element from the callable."""
+    del name
+    with ops.device(self._worker):
+      data_list = [self._fn() for _ in self._devices]
+      return constant_op.constant(True), data_list
+
+  def initialize(self):
+    # TODO(petebu) Should this throw an exception instead?
+    return []
+
+
+# TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+def _get_batched_dataset(d):
+  """Get the batched dataset from `d`."""
   # pylint: disable=protected-access
-  def _get_batch_dataset(d):
-    """Get the underlying batch dataset from the dataset object."""
-    if isinstance(d, dataset_ops.DatasetV1Adapter):
-      d = d._dataset
-
-    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
-      return d
-    elif isinstance(d, dataset_ops.PrefetchDataset):
-      return _get_batch_dataset(d._input_dataset)
-    raise ValueError(
-        "Unable to get batched dataset from the input dataset. `batch` "
-        "`map_and_batch` need to be the last operations on the dataset. "
-        "The batch operations can be followed by a prefetch.")
-
-  batched_dataset = _get_batch_dataset(dataset)
-  if isinstance(batched_dataset, dataset_ops.BatchDataset):
-    batch_size = batched_dataset._batch_size
-    drop_remainder = batched_dataset._drop_remainder
-  elif isinstance(batched_dataset, batching._MapAndBatchDataset):
-    batch_size = batched_dataset._batch_size_t
-    drop_remainder = batched_dataset._drop_remainder_t
+  if isinstance(d, dataset_ops.DatasetV1Adapter):
+    d = d._dataset
 
-  prefetch_buffer = None
-  if isinstance(dataset, dataset_ops.PrefetchDataset):
-    prefetch_buffer = dataset._buffer_size
-  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
-        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
-    prefetch_buffer = dataset._dataset._buffer_size
+  if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+    return d
+  elif isinstance(d, (dataset_ops.PrefetchDataset,
+                      dataset_ops._OptionsDataset)):
+    return _get_batched_dataset(d._input_dataset)
+
+  raise ValueError(
+      "Unable to get batched dataset from the input dataset. `batch` "
+      "`map_and_batch` need to be the last operations on the dataset. "
+      "The batch operations can be followed by a prefetch.")
+
+
+def _get_batched_dataset_attributes(d):
+  """Get `batch_size`, `drop_remainder` of dataset."""
+  # pylint: disable=protected-access
+  assert isinstance(d,
+                    (dataset_ops.BatchDataset, batching._MapAndBatchDataset))
+  if isinstance(d, dataset_ops.BatchDataset):
+    batch_size = d._batch_size
+    drop_remainder = d._drop_remainder
+  elif isinstance(d, batching._MapAndBatchDataset):
+    batch_size = d._batch_size_t
+    drop_remainder = d._drop_remainder_t
   # pylint: enable=protected-access
 
   if tensor_util.is_tensor(batch_size):
@@ -595,17 +529,29 @@ def _split_dataset_batch(dataset, split_batch_by):
   if tensor_util.is_tensor(drop_remainder):
     drop_remainder = tensor_util.constant_value(drop_remainder)
 
-  if batch_size % split_batch_by:
-    raise ValueError(
-        "Batch size %s cannot be sharded evenly across replicas %s" % (
-            batch_size, split_batch_by))
-  new_batch_size = batch_size // split_batch_by
+  return batch_size, drop_remainder
+
+
+# TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+def _get_dataset_attributes(dataset):
+  """Get the underlying attributes from the dataset object."""
+  # pylint: disable=protected-access
+
+  # First, get batch_size and drop_remainder from the dataset. We need
+  # to walk back the dataset creation process and find the batched version in
+  # order to get the attributes.
+  batched_dataset = _get_batched_dataset(dataset)
+  batch_size, drop_remainder = _get_batched_dataset_attributes(batched_dataset)
+
+  # Second, prefetch buffer should be get from the original dataset.
+  prefetch_buffer = None
+  if isinstance(dataset, dataset_ops.PrefetchDataset):
+    prefetch_buffer = dataset._buffer_size
+  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
+        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
+    prefetch_buffer = dataset._dataset._buffer_size
 
-  dataset = dataset.apply(batching.unbatch())
-  dataset = dataset.batch(new_batch_size, drop_remainder=drop_remainder)
-  if prefetch_buffer is not None:
-    dataset = dataset.prefetch(prefetch_buffer)
-  return dataset
+  return batch_size, drop_remainder, prefetch_buffer
 
 
 class MultiStepContext(object):
@@ -702,6 +648,7 @@ class MultiStepContext(object):
       def merge_fn(distribution, value):
         # NOTE(priyag): For non tensor outputs, we simply return all the values
         # in a list as reduction doesn't make sense on non tensors.
-        self._non_tensor_outputs[name] = distribution.unwrap(value)
+        self._non_tensor_outputs[name] = (
+            distribution.experimental_local_results(value))
       distribution_strategy_context.get_replica_context().merge_call(
           merge_fn, args=(output,))
diff --git a/tensorflow/python/distribute/input_ops.py b/tensorflow/python/distribute/input_ops.py
index d9e833b6bc6b123b6875440df7c35b0af02d0941..5121bd934435ad4408e960ece3dcb32508124fa1 100644
--- a/tensorflow/python/distribute/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -40,10 +40,9 @@ def auto_shard_dataset(dataset, num_shards, index):
     dataset: A `tf.data.Dataset` instance, typically the result of a bunch of
       dataset transformations.
     num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel. Same usage as in
-        `tf.data.experimental.filter_for_shard`.
+        shards operating in parallel. Same usage as in `tf.data.Dataset.shard`.
     index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-      Same usage as in `Dataset.shard`.
+      Same usage as in `tf.data.Dataset.shard`.
 
   Returns:
     A modified `Dataset` obtained by updating the pipeline sharded by the
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index f279c3ba1b8ef5d6ec28b7acfc847241a447b5a6..3b34732cee374799a75534ac3060ee891e995649 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import contextlib
 import copy
-import functools
 import threading
 
 from tensorflow.python import pywrap_tensorflow
@@ -33,6 +32,7 @@ from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
@@ -214,15 +214,16 @@ def _create_mirrored_variable(strategy, device_map, logical_device,  # pylint: d
                      kwargs["name"])
   elif synchronization == variable_scope.VariableSynchronization.ON_READ:
     # Variables that are to be synced on read are replica local.
-    is_replica_local = True
+    is_sync_on_read = True
     kwargs["trainable"] = False
   elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
         synchronization == variable_scope.VariableSynchronization.AUTO):
     # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
-    is_replica_local = False
+    is_sync_on_read = False
   else:
-    raise ValueError("Invalid variable synchronization mode: " +
-                     synchronization + " for variable: " + kwargs["name"])
+    raise ValueError(
+        "Invalid variable synchronization mode: %s for variable: %s" %
+        (synchronization, kwargs["name"]))
 
   # Get aggregation value
   aggregation = kwargs.pop("aggregation",
@@ -233,8 +234,9 @@ def _create_mirrored_variable(strategy, device_map, logical_device,  # pylint: d
       variable_scope.VariableAggregation.MEAN,
       variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
   ):
-    raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                     " for variable: " + kwargs["name"])
+    raise ValueError(
+        "Invalid variable aggregation mode: %s for variable: %s" %
+        (aggregation, kwargs["name"]))
 
   # Ignore user-specified caching device, not needed for mirrored variables.
   kwargs.pop("caching_device", None)
@@ -246,8 +248,8 @@ def _create_mirrored_variable(strategy, device_map, logical_device,  # pylint: d
     devices = device_map.logical_to_actual_devices(logical_device)
     value_list = real_mirrored_creator(devices, *args, **kwargs)
 
-    if is_replica_local:
-      result = values.ReplicaLocalVariable(
+    if is_sync_on_read:
+      result = values.SyncOnReadVariable(
           strategy, device_map, value_list, aggregation,
           logical_device=logical_device)
     else:
@@ -317,7 +319,7 @@ def _cluster_spec_to_device_list(cluster_spec, num_gpus_per_worker):
   devices = []
   for task_type in ("chief", "worker"):
     for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
-      if num_gpus_per_worker is 0:
+      if num_gpus_per_worker == 0:
         devices.append("/job:%s/task:%d" % (task_type, task_id))
       else:
         devices.extend([
@@ -407,6 +409,15 @@ def all_local_devices(num_gpus=None):
           ("/device:CPU:0",))
 
 
+def _all_devices():
+  devices = []
+  tfconfig = TFConfigClusterResolver()
+  if tfconfig.cluster_spec().as_dict():
+    devices = _cluster_spec_to_device_list(tfconfig.cluster_spec(),
+                                           context.num_gpus())
+  return devices if devices else all_local_devices()
+
+
 @tf_export("distribute.MirroredStrategy")
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices and machines.
@@ -414,7 +425,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   This strategy uses one replica per device and sync replication for its
   multi-GPU version.
 
-  The multi-worker version will be added in the fture.
+  The multi-worker version will be added in the future.
 
   Args:
     devices: a list of device strings.
@@ -434,7 +445,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   def __init__(self, container_strategy, devices=None, cross_device_ops=None):
     super(MirroredExtended, self).__init__(container_strategy)
     if devices is None:
-      devices = all_local_devices()
+      devices = _all_devices()
     if not devices:
       raise ValueError("Got an empty `devices` list. Please make sure the "
                        "`devices` you pass in is not empty.")
@@ -455,7 +466,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     assert devices, "Must specify at least one device."
     devices = tuple(device_util.resolve(d) for d in devices)
     assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument: %s" % devices)
+        "No duplicates allowed in `devices` argument: %s" % (devices,))
     # TODO(josh11b): Require at least 2 devices?
     self._device_map = values.ReplicaDeviceMap(devices)
     self._input_workers = input_lib.InputWorkers(self._device_map)
@@ -495,8 +506,15 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     self._device_map = values.ReplicaDeviceMap(devices)
     self._input_workers = input_lib.InputWorkers(
         self._device_map, worker_devices)
-    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
-        workers, _infer_num_gpus_per_worker(devices))
+
+    if len(workers) > 1:
+      self._inferred_cross_device_ops = (
+          cross_device_ops_lib.MultiWorkerAllReduce(
+              workers, _infer_num_gpus_per_worker(devices)))
+    else:
+      # TODO(yuefengz): make `choose_the_best` work with device strings
+      # containing job names.
+      self._inferred_cross_device_ops = cross_device_ops_lib.NcclAllReduce()
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a mirrored variable. See `DistributionStrategy.scope`."""
@@ -532,7 +550,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
                   init_value = value_list[0].initial_value
                   return array_ops.identity(init_value)
             kwargs["initial_value"] = initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
             # Don't record operations (e.g. other variable reads) during
             # variable creation.
             with tape.stop_recording():
@@ -548,17 +566,6 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   def _validate_colocate_with_variable(self, colocate_with_variable):
     values.validate_colocate_distributed_variable(colocate_with_variable, self)
 
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      worker_index = 0
-      return input_lib.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._input_workers, worker_index)
-    else:
-      return input_lib.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._input_workers,
-          auto_shard=False)
-
   def _make_dataset_iterator(self, dataset):
     return input_lib.DatasetIterator(
         dataset, self._input_workers, self._num_replicas_in_sync)
@@ -595,7 +602,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       fn_result = fn(ctx, iterator.get_next())
       for (name, output) in ctx.last_step_outputs.items():
         # Convert all outputs to tensors, potentially from `DistributedValues`.
-        ctx.last_step_outputs[name] = self._unwrap(output)
+        ctx.last_step_outputs[name] = self._local_results(output)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
@@ -729,12 +736,12 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   def read_var(self, replica_local_var):
     """Read the aggregate value of a replica-local variable."""
-    if isinstance(replica_local_var, values.ReplicaLocalVariable):
+    if isinstance(replica_local_var, values.SyncOnReadVariable):
       return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
     assert isinstance(replica_local_var, values.Mirrored)
     return array_ops.identity(replica_local_var.get())
 
-  def _unwrap(self, val):
+  def _local_results(self, val):
     if isinstance(val, values.DistributedValues):
       return val.values
     return (val,)
@@ -784,8 +791,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
   def _global_batch_size(self):
     """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
 
-    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
-    batching.
+    `make_input_fn_iterator` assumes per-replica batching.
 
     Returns:
       Boolean.
@@ -831,6 +837,7 @@ class _MirroredReplicaThread(threading.Thread):
     # parent thread:
     ctx = context.context()
     self.in_eager = ctx.executing_eagerly()
+    self.record_thread_local_context_fields()
     # pylint: disable=protected-access
     if not ctx._context_handle:
       ctx._initialize_handle_and_devices()
@@ -859,12 +866,13 @@ class _MirroredReplicaThread(threading.Thread):
     try:
       if self.coord.should_stop():
         return
+      self.restore_thread_local_context_fields()
       # TODO(josh11b): Use current logical device instead of 0 here.
       with self.coord.stop_on_exception(), \
           _enter_graph(self._init_graph, self._init_in_eager), \
           _enter_graph(self.graph, self.in_eager,
                        self._variable_creator_stack), \
-          context.context().device_policy(self.context_device_policy), \
+          context.device_policy(self.context_device_policy), \
           MirroredReplicaContext(self.distribution, constant_op.constant(
               self.replica_id, dtypes.int32)), \
           ops.device(self.device_map.logical_to_actual_devices(0)[
@@ -878,6 +886,24 @@ class _MirroredReplicaThread(threading.Thread):
     finally:
       self.has_paused.set()
 
+  def record_thread_local_context_fields(self):
+    """Record thread local fields of context.context() in self."""
+    ctx = context.context()
+    self._summary_writer = ctx.summary_writer
+    self._summary_recording = ctx.summary_recording
+    self._summary_recording_distribution_strategy = (
+        ctx.summary_recording_distribution_strategy)
+    # TODO(b/125892694): record other fields in EagerContext.
+
+  def restore_thread_local_context_fields(self):
+    """Restore thread local fields of context.context() from self."""
+    ctx = context.context()
+    ctx.summary_writer = self._summary_writer
+    ctx.summary_recording = self._summary_recording
+    ctx.summary_recording_distribution_strategy = (
+        self._summary_recording_distribution_strategy)
+    # TODO(b/125892694): restore other fields in EagerContext.
+
 
 class MirroredReplicaContext(distribute_lib.ReplicaContext):
   """ReplicaContext used in MirroredStrategy.extended.call_for_each_replica().
@@ -901,6 +927,28 @@ class MirroredReplicaContext(distribute_lib.ReplicaContext):
       t.captured_name_scope += "/"
 
     t.captured_control_deps = t.graph._current_control_dependencies()  # pylint: disable=protected-access
+
+    # NOTE(priyag): Throw an error if there is a merge call in the middle of a
+    # `fn` passed to call_for_each_replica which changes the graph being used
+    # while calling `fn`. This can happen when the `fn` is decorated with
+    # `tf.function` and there is a merge_call in `fn`. This breaks because each
+    # thread tries to create a distinct tf.function. Each tf.function creation
+    # takes a lock, and so if there is a merge call in the middle, the lock is
+    # never releases and subsequent replica threads cannot proceed to define
+    # their own functions. Checking for the graph being the same is one way for
+    # us to check this didn't happen.
+    if ops.get_default_graph() != t.graph:
+      raise RuntimeError(
+          "`merge_call` called while defining a new graph. "
+          "This can happen if the function `fn` passed to "
+          "`strategy.experimental_run()` or "
+          "`strategy.extended.call_for_each_replica()` is decorated with "
+          "`@tf.function`. In this case, wrap the call to "
+          "`strategy.experimental_run()` or "
+          "`strategy.extended.call_for_each_replica()` with `@tf.function` "
+          "instead of `fn`. This will avoid mismatching graphs and also "
+          "improve performance.")
+
     t.has_paused.set()
     t.should_run.wait()
     t.should_run.clear()
diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..576db795a0b5385b06573976d9f435e305e72919
--- /dev/null
+++ b/tensorflow/python/distribute/one_device_strategy.py
@@ -0,0 +1,219 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class OneDeviceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import values
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+@tf_export("distribute.OneDeviceStrategy")
+class OneDeviceStrategy(distribute_lib.DistributionStrategy):
+  """A distribution strategy for running on a single device."""
+  # TODO(josh11b): Do we wrap values in types to generate errors if you are
+  # doing something that won't work with other DistributionStrategy
+  # implementations?
+
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
+
+
+class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of OneDeviceStrategy."""
+
+  def __init__(self, container_strategy, device):
+    super(OneDeviceExtended, self).__init__(container_strategy)
+    self._device = device
+    self._input_device = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(self._input_device, [self._device])]
+    device_map = values.SingleDeviceMap(device)
+    self._input_workers = input_lib.InputWorkers(
+        device_map, worker_device_pairs)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      with ops.device(self._device):
+        return next_creator(*args, **kwargs)
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      with ops.colocate_with(colocate_with):
+        return next_creator(*args, **kwargs)
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch."""
+    # Note that split_batch_by argument is not passed because it is always 1 in
+    # this strategy, and adding it adds unnecessary overhead to the dataset.
+    return input_lib.DatasetIterator(dataset, self._input_workers)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, [distribute_lib.InputContext()])
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self._input_device), session)
+
+  def _broadcast_to(self, tensor, destinations):
+    del destinations
+    return tensor
+
+  # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = input_lib.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_result = fn(ctx, iterator.get_next())
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    # TODO(priyag): Use max_iterations instead of an explicit counter.
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    strategy = self._container_strategy()
+    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
+      return fn(*args, **kwargs)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    del reduce_op, destinations
+    return value
+
+  def _update(self, var, fn, args, kwargs, group):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    del colocate_with
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._local_results, result)
+
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    return array_ops.identity(replica_local_var)
+
+  def _local_results(self, value):
+    return (value,)
+
+  def value_container(self, value):
+    return value
+
+  @property
+  def _num_replicas_in_sync(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    return (self._device,)
+
+  @property
+  def parameter_devices(self):
+    return (self._device,)
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return (self._device,)
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for OneDeviceStrategy."""
+    return True
+
+
+class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext for OneDeviceStrategy."""
+
+  def __init__(self, strategy):
+    zero = constant_op.constant(0, dtypes.int32)
+    distribute_lib.ReplicaContext.__init__(
+        self, strategy, replica_id_in_sync_group=zero)
+
+  @property
+  def devices(self):
+    return self._strategy.extended.worker_devices
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
index 8093f8ccb036776aa2c8ea9c15530560be2f9f03..b4c8bee53c78fcbbffc3535168ab14846ca9df49 100644
--- a/tensorflow/python/distribute/parameter_server_strategy.py
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 import copy
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
+
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import values
@@ -39,12 +40,14 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_setter
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 _LOCAL_CPU = "/device:CPU:0"
 _LOCAL_GPU_0 = "/device:GPU:0"
 
 
 # TODO(yuefengz): maybe cache variables on local CPU.
+@tf_export("distribute.experimental.ParameterServerStrategy")
 class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   """A parameter server DistributionStrategy.
 
@@ -101,16 +104,13 @@ class ParameterServerStrategyExtended(
 
     # We typically don't need to do all-reduce in this strategy.
     self._cross_device_ops = (
-        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
-            reduce_to_device=_LOCAL_CPU))
+        cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
 
   def _initialize_strategy(self, cluster_resolver):
     if cluster_resolver.cluster_spec().as_dict():
       self._initialize_multi_worker(cluster_resolver)
     else:
       self._initialize_local(cluster_resolver)
-    # Save the num_gpus_per_worker for configure method.
-    self._num_gpus_per_worker = cluster_resolver.num_accelerators()
 
   def _initialize_multi_worker(self, cluster_resolver):
     """Initialize devices for multiple workers.
@@ -127,10 +127,19 @@ class ParameterServerStrategyExtended(
     Raises:
       ValueError: if the cluster doesn't have ps jobs.
     """
-    num_gpus = cluster_resolver.num_accelerators()
+    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
+    # some cases.
+    if isinstance(cluster_resolver, TFConfigClusterResolver):
+      num_gpus = context.num_gpus()
+    else:
+      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
+
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = num_gpus
+
     cluster_spec = cluster_resolver.cluster_spec()
     task_type = cluster_resolver.task_type
-    task_id = cluster_resolver.task_index
+    task_id = cluster_resolver.task_id
     if not task_type or task_id is None:
       raise ValueError("When `cluster_spec` is given, you must also specify "
                        "`task_type` and `task_id`")
@@ -198,7 +207,17 @@ class ParameterServerStrategyExtended(
     """Initialize internal devices for local training."""
     worker_device = device_util.canonicalize("/device:CPU:0")
     self._input_host_device = numpy_dataset.SingleDevice(worker_device)
-    num_gpus = cluster_resolver.num_accelerators()
+
+    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
+    # some cases.
+    if isinstance(cluster_resolver, TFConfigClusterResolver):
+      num_gpus = context.num_gpus()
+    else:
+      num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
+
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = num_gpus
+
     # Define compute devices which is a list of device strings and one for each
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
@@ -233,14 +252,6 @@ class ParameterServerStrategyExtended(
   def _validate_colocate_with_variable(self, colocate_with_variable):
     values.validate_colocate(colocate_with_variable, self)
 
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    return input_lib.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn),
-        self._input_workers,
-        0,
-        prefetch_on_device=True)
-
   def _make_dataset_iterator(self, dataset):
     return input_lib.DatasetIterator(dataset, self._input_workers,
                                      self._num_replicas_in_sync)
@@ -326,7 +337,8 @@ class ParameterServerStrategyExtended(
           if kwargs.get("trainable", True):
             collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
             l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l.remove(v)
+            if v in l:
+              l.remove(v)
           g.add_to_collections(collections, wrapped)
         elif ops.GraphKeys.GLOBAL_STEP in collections:
           ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
@@ -414,7 +426,7 @@ class ParameterServerStrategyExtended(
       if group:
         return result
       else:
-        return nest.map_structure(self._unwrap, result)
+        return nest.map_structure(self._local_results, result)
 
   # TODO(yuefengz): does it need to call _select_single_value?
   def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
@@ -424,9 +436,9 @@ class ParameterServerStrategyExtended(
       if group:
         return result
       else:
-        return nest.map_structure(self._unwrap, result)
+        return nest.map_structure(self._local_results, result)
 
-  def _unwrap(self, val):
+  def _local_results(self, val):
     if isinstance(val, values.DistributedValues):
       return val.values
     return (val,)
@@ -471,8 +483,8 @@ class ParameterServerStrategyExtended(
       cluster_resolver = SimpleClusterResolver(
           cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
           task_type=task_type,
-          task_index=task_id,
-          num_accelerators=self._num_gpus_per_worker)
+          task_id=task_id,
+          num_accelerators={"GPU": self._num_gpus_per_worker})
       self._initialize_multi_worker(cluster_resolver)
 
     if session_config:
@@ -538,8 +550,7 @@ class ParameterServerStrategyExtended(
   def _global_batch_size(self):
     """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
 
-    `distribute_dataset` and `make_input_fn_iterator` assume per-replica
-    batching.
+    `make_input_fn_iterator` assumes per-replica batching.
 
     Returns:
       Boolean.
diff --git a/tensorflow/python/distribute/summary_op_util.py b/tensorflow/python/distribute/summary_op_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c7086b365b6299afe36775409e5e6312a8517d5
--- /dev/null
+++ b/tensorflow/python/distribute/summary_op_util.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#==============================================================================
+"""Contains utility functions used by summary ops in distribution strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+
+
+def skip_summary():
+  """Determines if summary should be skipped.
+
+  If using multiple replicas in distributed strategy, skip summaries on all
+  replicas except the first one (replica_id=0).
+
+  Returns:
+    True if the summary is skipped; False otherwise.
+  """
+
+  # TODO(priyag): Add a new optional argument that will provide multiple
+  # alternatives to override default behavior. (e.g. run on last replica,
+  # compute sum or mean across replicas).
+  replica_context = distribution_strategy_context.get_replica_context()
+  if not replica_context:
+    return False
+  # TODO(b/118385803): when replica_id of _TPUReplicaContext is properly
+  # initialized, remember to change here as well.
+  replica_id = replica_context.replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id and replica_id > 0
diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7902815b136a4fcfe31b5a1131256f3ba58b458
--- /dev/null
+++ b/tensorflow/python/distribute/tpu_strategy.py
@@ -0,0 +1,672 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU Strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.tpu import device_assignment as device_assignment_lib
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_strategy_util
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.tpu import training_loop
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+def get_tpu_system_metadata(tpu_cluster_resolver):
+  """Retrieves TPU system metadata given a TPUClusterResolver."""
+  master = tpu_cluster_resolver.master()
+
+  # pylint: disable=protected-access
+  cluster_spec = tpu_cluster_resolver.cluster_spec()
+  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
+  tpu_system_metadata = (
+      tpu_system_metadata_lib._query_tpu_system_metadata(
+          master,
+          cluster_def=cluster_def,
+          query_topology=False))
+
+  return tpu_system_metadata
+
+
+# TODO(jhseu): Deduplicate with MirroredStrategy?
+def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
+    strategy, device_map, logical_device, real_mirrored_creator,
+    *args, **kwargs):
+  # Figure out what collections this variable should be added to.
+  # We'll add the TPUMirroredVariable to those collections instead.
+  var_collections = kwargs.pop("collections", None)
+  if var_collections is None:
+    var_collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  kwargs["collections"] = []
+
+  # TODO(jhseu): Should we have different behavior for different
+  # synchronization settings?
+
+  # Get aggregation value
+  # TODO(jhseu): Support aggregation in a replica context.
+  aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+  if aggregation not in [
+      vs.VariableAggregation.NONE,
+      vs.VariableAggregation.SUM,
+      vs.VariableAggregation.MEAN,
+      vs.VariableAggregation.ONLY_FIRST_REPLICA,
+  ]:
+    raise ValueError("Invalid variable aggregation mode: {} for variable: {}"
+                     .format(aggregation, kwargs["name"]))
+
+  # Ignore user-specified caching device, not needed for mirrored variables.
+  kwargs.pop("caching_device", None)
+
+  # TODO(josh11b,apassos): It would be better if variable initialization
+  # was never recorded on the tape instead of having to do this manually
+  # here.
+  with tape.stop_recording():
+    devices = device_map.logical_to_actual_devices(logical_device)
+    value_list = real_mirrored_creator(devices, *args, **kwargs)
+    result = values.TPUMirroredVariable(
+        strategy, device_map, value_list, aggregation,
+        logical_device=logical_device)
+
+  if not (context.executing_eagerly() or ops.inside_function()):
+    g = ops.get_default_graph()
+    # If "trainable" is True, next_creator() will add the member variables
+    # to the TRAINABLE_VARIABLES collection, so we manually remove
+    # them and replace with the MirroredVariable. We can't set
+    # "trainable" to False for next_creator() since that causes functions
+    # like implicit_gradients to skip those variables.
+    if kwargs.get("trainable", True):
+      var_collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+      for v in value_list:
+        l.remove(v)
+    g.add_to_collections(var_collections, result)
+  return result
+
+
+@tf_export("distribute.experimental.TPUStrategy")
+class TPUStrategy(distribute_lib.DistributionStrategy):
+  """TPU distribution strategy implementation."""
+
+  def __init__(self,
+               tpu_cluster_resolver=None,
+               steps_per_run=None,
+               device_assignment=None):
+    """Initializes the TPUStrategy object.
+
+    Args:
+      tpu_cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
+          which provides information about the TPU cluster.
+      steps_per_run: Number of steps to run on device before returning to the
+          host. Note that this can have side-effects on performance, hooks,
+          metrics, summaries etc.
+          This parameter is only used when Distribution Strategy is used with
+          estimator or keras.
+      device_assignment: Optional `tf.contrib.tpu.DeviceAssignment` to specify
+          the placement of replicas on the TPU cluster. Currently only supports
+          the usecase of using a single core within a TPU cluster.
+    """
+    super(TPUStrategy, self).__init__(TPUExtended(
+        self, tpu_cluster_resolver, steps_per_run, device_assignment))
+
+  @property
+  def steps_per_run(self):
+    """DEPRECATED: use .extended.steps_per_run instead."""
+    return self._extended.steps_per_run
+
+  # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this
+  # can use the default implementation.
+  # This implementation runs a single step. It does not use infeed or outfeed.
+  def experimental_run_v2(self, fn, args=(), kwargs=None):
+    """See base class."""
+    if context.executing_eagerly() and not ops.inside_function():
+      raise NotImplementedError(
+          "Eager mode not supported in TPUStrategy outside TF functions.")
+
+    if kwargs is None:
+      kwargs = {}
+
+    result = [None]
+    def replicated_fn(replica_id, replica_args, replica_kwargs):
+      """Wraps user function to provide replica ID and `Tensor` inputs."""
+      with _TPUReplicaContext(self, replica_id_in_sync_group=replica_id):
+        result[0] = fn(*replica_args, **replica_kwargs)
+      return result[0]
+
+    replicate_inputs = []  # By replica.
+    for i in range(self.num_replicas_in_sync):
+      replicate_inputs.append(
+          [constant_op.constant(i, dtype=dtypes.int32),
+           values.select_replica(i, args),
+           values.select_replica(i, kwargs)])
+
+    with self.scope():
+      replicate_outputs = tpu.replicate(replicated_fn, replicate_inputs)
+
+    # Workaround for `tpu.replicate` behaviour when single `Tensor` returned.
+    replicate_outputs = [
+        nest.pack_sequence_as(result[0], nest.flatten(replica_outputs))
+        for replica_outputs in replicate_outputs]
+
+    device_map = self.extended._device_map  # pylint: disable=protected-access
+    return values.regroup(device_map, replicate_outputs)
+
+
+class TPUExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of TPUStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               tpu_cluster_resolver=None,
+               steps_per_run=None,
+               device_assignment=None):
+    super(TPUExtended, self).__init__(container_strategy)
+
+    if tpu_cluster_resolver is None:
+      tpu_cluster_resolver = TPUClusterResolver("")
+
+    if steps_per_run is None:
+      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
+      # not specified.
+      steps_per_run = 1
+
+    self._tpu_cluster_resolver = tpu_cluster_resolver
+    self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
+    self._device_assignment = device_assignment
+
+    # Device assignment is currently only supported for 1 core case.
+    if self._device_assignment:
+      assert isinstance(self._device_assignment,
+                        device_assignment_lib.DeviceAssignment)
+      if self._device_assignment.num_replicas != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if self._device_assignment.num_cores_per_replica != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+
+    # TODO(jhseu): Switch to DeviceAssignment to support pods and model
+    # parallelism.
+    self._tpu_devices = [d.name for d in self._tpu_metadata.devices
+                         if "device:TPU:" in d.name]
+
+    self._host_device = tpu_strategy_util.get_first_tpu_host_device(
+        self._tpu_cluster_resolver)
+
+    # Only create variables for the number of replicas we're running.
+    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
+    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)
+
+    # Preload the data onto the TPUs.
+    input_worker_devices = collections.OrderedDict()
+    for tpu_device in self._tpu_devices:
+      host_device = _get_host_for_device(tpu_device)
+      input_worker_devices.setdefault(host_device, [])
+      input_worker_devices[host_device].append(tpu_device)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, tuple(input_worker_devices.items()))
+
+    # TODO(sourabhbajaj): Remove this once performance of running one step
+    # at a time is comparable to multiple steps.
+    self.steps_per_run = steps_per_run
+    self._require_static_shapes = True
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate_tpu_variable(colocate_with_variable, self)
+
+  def _make_dataset_iterator(self, dataset):
+    """Make iterators for each of the TPU hosts."""
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    num_workers = self._input_workers.num_workers
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, input_contexts)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self._host_device),
+        session)
+
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
+  # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
+  # a mechanism to infer the outputs of `fn`. Pending b/110550782.
+  def _experimental_run_steps_on_iterator(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
+    output_shapes = multi_worker_iterator.output_shapes
+    shapes = nest.flatten(output_shapes)
+    if any(not s.is_fully_defined() for s in shapes):
+      raise ValueError(
+          "TPU currently requires fully defined shapes. Either use "
+          "set_shape() on the input tensors or use "
+          "dataset.batch(..., drop_remainder=True).")
+
+    # Wrap `fn` for repeat.
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+    ctx = input_lib.MultiStepContext()
+
+    def run_fn(inputs):
+      """Single step on the TPU device."""
+      fn_result = fn(ctx, inputs)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      if flat_last_step_outputs:
+        with ops.control_dependencies([fn_result]):
+          return [array_ops.identity(f) for f in flat_last_step_outputs]
+      else:
+        return fn_result
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop and TPU replicate context. This is useful in cases
+    # where we might need to exit these contexts and get back to the outer
+    # context to do some things, for e.g. create an op which should be
+    # evaluated only once at the end of the loop on the host. One such usage
+    # is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    def rewrite_fn(*args):
+      """The rewritten step fn running on TPU."""
+      del args
+
+      per_replica_inputs = multi_worker_iterator.get_next()
+      replicate_inputs = []
+      for replica_id in range(self._num_replicas_in_sync):
+        select_replica = lambda x: values.select_replica(replica_id, x)  # pylint: disable=cell-var-from-loop
+        replicate_inputs.append((nest.map_structure(
+            select_replica, per_replica_inputs),))
+
+      replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
+
+      # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
+      # will flatten it in this case. If run_fn has no tensor outputs,
+      # tpu.replicate returns a list of no_ops, we will keep the output as it
+      # is.
+      if isinstance(replicate_outputs[0], list):
+        replicate_outputs = nest.flatten(replicate_outputs)
+
+      return replicate_outputs
+
+    # TODO(sourabhbajaj): The input to while loop should be based on the
+    # output type of the step_fn
+    assert isinstance(initial_loop_values, list)
+    initial_loop_values = initial_loop_values * self._num_replicas_in_sync
+
+    # Put the while loop op on TPU host 0.
+    with ops.device(self._host_device):
+      if self.steps_per_run == 1:
+        replicate_outputs = rewrite_fn()
+      else:
+        replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
+                                                 initial_loop_values)
+
+    del self._outer_control_flow_context
+    ctx.run_op = control_flow_ops.group(replicate_outputs)
+
+    if isinstance(replicate_outputs, list):
+      # Filter out any ops from the outputs, typically this would be the case
+      # when there were no tensor outputs.
+      last_step_tensor_outputs = [
+          x for x in replicate_outputs if not isinstance(x, ops.Operation)
+      ]
+
+      # Outputs are currently of the structure (flattened)
+      # [output0_device0, output1_device0, output2_device0,
+      #  output0_device1, output1_device1, output2_device1,
+      #  ...]
+      # Convert this to the following structure instead: (grouped by output)
+      # [[output0_device0, output0_device1],
+      #  [output1_device0, output1_device1],
+      #  [output2_device0, output2_device1]]
+      output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
+      last_step_tensor_outputs = [
+          last_step_tensor_outputs[i::output_num] for i in range(output_num)
+      ]
+    else:
+      # no tensors returned.
+      last_step_tensor_outputs = []
+
+    _set_last_step_outputs(ctx, last_step_tensor_outputs)
+    return ctx
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    # TODO(jhseu): Consider making it so call_for_each_replica implies that
+    # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
+    with _TPUReplicaContext(self._container_strategy()):
+      return fn(*args, **kwargs)
+
+  def _experimental_initialize_system(self):
+    """Experimental method added to be used by Estimator.
+
+    This is a private method only to be used by Estimator. Other frameworks
+    should directly be calling `tf.contrib.distribute.initialize_tpu_system`
+    """
+    tpu_strategy_util.initialize_tpu_system(self._tpu_cluster_resolver)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      device_map = self._device_map
+      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      device_map = colocate_with.device_map
+      logical_device = colocate_with.logical_device
+
+    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
+      value_list = []
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = value_list[0].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+            # Initialize replicas with the same value:
+            if context.executing_eagerly() or ops.inside_function():
+              with ops.init_scope():
+                kwargs["initial_value"] = array_ops.identity(
+                    value_list[0].value())
+            else:
+              def initial_value_fn(device=d):
+                with ops.device(device):
+                  return array_ops.identity(value_list[0].initial_value)
+              kwargs["initial_value"] = initial_value_fn
+          with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
+            v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.TPUMirroredVariable)
+          value_list.append(v)
+      return value_list
+
+    return _create_tpu_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        # TODO(jhseu):  Revisit once we support model-parallelism.
+        value *= (1. / self._num_replicas_in_sync)
+      elif reduce_op != reduce_util.ReduceOp.SUM:
+        raise NotImplementedError(
+            "Currently only support sum & mean in TPUStrategy.")
+      return tpu_ops.cross_replica_sum(value)
+
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+
+    devices = cross_device_ops_lib.get_devices_from(destinations)
+    if len(devices) != 1:
+      raise ValueError("Multiple devices are not supported for TPUStrategy")
+
+    # Always performs the reduction on the TPU host.
+    with ops.device(self._host_device):
+      output = math_ops.add_n(value.values)
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        output *= (1. / len(value.values))
+
+    # If necessary, copy to requested destination.
+    dest_canonical = device_util.canonicalize(devices[0])
+    host_canonical = device_util.canonicalize(self._host_device)
+
+    if dest_canonical != host_canonical:
+      with ops.device(devices[0]):
+        output = array_ops.identity(output)
+
+    return output
+
+  def _update(self, var, fn, args, kwargs, group):
+    assert isinstance(var, values.TPUMirroredVariable)
+    if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
+      if group:
+        return fn(var, *args, **kwargs)
+      else:
+        return (fn(var, *args, **kwargs),)
+
+    # Otherwise, we revert to MirroredStrategy behavior and update each variable
+    # directly.
+    updates = []
+    for i, (d, v) in enumerate(zip(var.devices, var.values)):
+      name = "update_%d" % i
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        # If args and kwargs are not mirrored, the value is returned as is.
+        updates.append(fn(v,
+                          *values.select_device_mirrored(d, args),
+                          **values.select_device_mirrored(d, kwargs)))
+    return values.update_regroup(self, self._device_map, updates, group)
+
+  def read_var(self, var):
+    assert isinstance(var, values.TPUMirroredVariable)
+    return var.read_value()
+
+  def _local_results(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    elif isinstance(val, list):
+      # TODO(josh11b): We need to remove this case; per device values should
+      # be represented using a PerReplica wrapper instead of a list with
+      # one entry per device.
+      return tuple(val)
+    elif isinstance(val, values.TPUMirroredVariable):
+      # pylint: disable=protected-access
+      if values._enclosing_tpu_context() is not None:
+        return (val,)
+      return val.values
+    return (val,)
+
+  def value_container(self, value):
+    return value
+
+  def _broadcast_to(self, tensor, destinations):
+    del destinations
+    return tensor
+
+  @property
+  def num_hosts(self):
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_hosts
+
+    return len(set([self._device_assignment.host_device(r)
+                    for r in range(self._device_assignment.num_replicas)]))
+
+  @property
+  def num_replicas_per_host(self):
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_of_cores_per_host
+
+    # TODO(sourabhbajaj): Remove this method we use inputs and remove infeed
+    # as the computation of num_replicas_per_host is not a constant
+    # when using device_assignment. This is a temporary workaround to support
+    # StatefulRNN as everything is 1 in that case.
+    # This method needs to take host_id as input for correct computation.
+    max_models_per_host = (self._tpu_metadata.num_of_cores_per_host //
+                           self._device_assignment.num_cores_per_replica)
+    models_per_host = min(self._device_assignment.num_replicas,
+                          max_models_per_host)
+    return models_per_host * self._device_assignment.num_cores_per_replica
+
+  @property
+  def _num_replicas_in_sync(self):
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_cores
+    return (self._device_assignment.num_replicas *
+            self._device_assignment.num_cores_per_replica)
+
+  @property
+  def experimental_between_graph(self):
+    return False
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  @property
+  def worker_devices(self):
+    return self._tpu_devices
+
+  @property
+  def parameter_devices(self):
+    return self._tpu_devices
+
+  def non_slot_devices(self, var_list):
+    return self._host_device
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    del colocate_with
+    with ops.device(self._host_device), distribute_lib.UpdateContext(
+        self._host_device):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._local_results, result)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    del cluster_spec, task_type, task_id
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    updated_config.isolate_session_state = True
+    cluster_spec = self._tpu_cluster_resolver.cluster_spec()
+    if cluster_spec:
+      updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+    return updated_config
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
+
+
+class _TPUReplicaContext(distribute_lib.ReplicaContext):
+  """Replication Context class for TPU Strategy."""
+
+  # TODO(sourabhbajaj): Call for each replica should be updating this.
+  # TODO(b/118385803): Always properly initialize replica_id.
+  def __init__(self, strategy, replica_id_in_sync_group=None):
+    if replica_id_in_sync_group is None:
+      replica_id_in_sync_group = constant_op.constant(0, dtypes.int32)
+    distribute_lib.ReplicaContext.__init__(
+        self, strategy, replica_id_in_sync_group=replica_id_in_sync_group)
+
+  @property
+  def devices(self):
+    distribute_lib.require_replica_context(self)
+    ds = self._strategy
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+
+    if replica_id is None:  # Non-constant `Tensor` inside `tpu.replicate`.
+      # TODO(cjfj): Return other devices when model parallelism is supported.
+      return (tpu.core(0),)
+    else:
+      return (ds.extended.worker_devices[replica_id],)
+
+
+def _get_host_for_device(device):
+  spec = tf_device.DeviceSpec.from_string(device)
+  return tf_device.DeviceSpec(
+      job=spec.job, replica=spec.replica, task=spec.task,
+      device_type="CPU", device_index=0).to_string()
+
+
+def _set_last_step_outputs(ctx, last_step_tensor_outputs):
+  """Sets the last step outputs on the given context."""
+  # Convert replicate_outputs to the original dict structure of
+  # last_step_outputs.
+  last_step_tensor_outputs_dict = nest.pack_sequence_as(
+      ctx.last_step_outputs, last_step_tensor_outputs)
+
+  for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+    output = last_step_tensor_outputs_dict[name]
+    # For outputs that have already been reduced, take the first value
+    # from the list as each value should be the same. Else return the full
+    # list of values.
+    # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
+    # value.
+    if reduce_op is not None:
+      # TODO(priyag): Should this return the element or a list with 1 element
+      last_step_tensor_outputs_dict[name] = output[0]
+  ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index 585ae1bd6c627fc270a9617c37f03fd97d9040e5..fc74fc67bd873dc9e86d7d6091ff2de092685181 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
@@ -272,7 +272,7 @@ class DistributedValues(object):
   def device_map(self):
     return self._device_map
 
-  # TODO(josh11b): Replace unwrap with this?
+  # TODO(josh11b): Replace experimental_local_results with this?
   @property
   def values(self):
     return self._values
@@ -422,6 +422,16 @@ def _assert_strategy(strategy):
         (current_strategy, strategy))
 
 
+@contextlib.contextmanager
+def _enter_or_assert_strategy(strategy):
+  if not distribution_strategy_context.has_strategy():
+    with strategy.scope():
+      yield
+  else:
+    _assert_strategy(strategy)
+    yield
+
+
 DistributedVarOp = collections.namedtuple(
     "DistributedVarOp", ["name", "graph", "type"])
 
@@ -534,6 +544,10 @@ class DistributedVariable(DistributedDelegate):
   def shape(self):
     return self.primary.shape
 
+  @property
+  def trainable(self):
+    return self.primary.trainable
+
   @property
   def distribute_strategy(self):
     return self._distribute_strategy
@@ -608,8 +622,9 @@ def validate_colocate(v, extended):
 
 def _apply_aggregation(strategy, value, aggregation, destinations):
   if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-    return strategy.extended.broadcast_to(strategy.unwrap(value)[0],
-                                          destinations=destinations)
+    return strategy.extended.broadcast_to(
+        strategy.experimental_local_results(value)[0],
+        destinations=destinations)
   reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
   return strategy.extended.reduce_to(reduce_op, value, destinations)
 
@@ -630,7 +645,7 @@ class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
 
 
 class MirroredVariable(DistributedVariable, Mirrored,
-                       checkpointable.Checkpointable):
+                       trackable.Trackable):
   """Holds a map from device to variables whose values are kept in sync."""
 
   def __init__(
@@ -646,39 +661,39 @@ class MirroredVariable(DistributedVariable, Mirrored,
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
-    _assert_strategy(self._distribute_strategy)
-    f = kwargs.pop("f")
-    if distribution_strategy_context.in_cross_replica_context():
-      update_device = distribute_lib.get_update_device()
-      if update_device is not None:
-        # We are calling an assign function on the mirrored variable in an
-        # update context.
-        v = self.get(device=update_device)
-        return f(v, *args, **kwargs)
-
-      # We are calling assign on the mirrored variable in cross replica context,
-      # use `strategy.extended.update()` to update the variable.
-      return self._distribute_strategy.extended.update(
-          self, f, args=args, kwargs=kwargs)
-    else:
-      _assert_replica_context(self._distribute_strategy)
-      # We are calling an assign function on the mirrored variable in replica
-      # context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function on each of the mirrored variables with the reduced
-      # value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "MirroredVariable in Replica Context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.extended.update(
-            self, f, args=(v,) + other_args, kwargs=other_kwargs)
-
-      return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=args, kwargs=kwargs)
+    with _enter_or_assert_strategy(self._distribute_strategy):
+      f = kwargs.pop("f")
+      if distribution_strategy_context.in_cross_replica_context():
+        update_device = distribute_lib.get_update_device()
+        if update_device is not None:
+          # We are calling an assign function on the mirrored variable in an
+          # update context.
+          v = self.get(device=update_device)
+          return f(v, *args, **kwargs)
+
+        # We are calling assign on the mirrored variable in cross replica
+        # context, use `strategy.extended.update()` to update the variable.
+        return self._distribute_strategy.extended.update(
+            self, f, args=args, kwargs=kwargs)
+      else:
+        _assert_replica_context(self._distribute_strategy)
+        # We are calling an assign function on the mirrored variable in replica
+        # context.
+        # We reduce the value we want to assign/add/sub. More details about how
+        # we handle the different use cases can be found in the _reduce method.
+        # We call the function on each of the mirrored variables with the
+        # reduced value.
+        if self._aggregation == vs.VariableAggregation.NONE:
+          raise ValueError("You must specify an aggregation method to update a "
+                           "MirroredVariable in Replica Context.")
+
+        def merge_fn(strategy, value, *other_args, **other_kwargs):
+          v = _apply_aggregation(strategy, value, self._aggregation, self)
+          return strategy.extended.update(
+              self, f, args=(v,) + other_args, kwargs=other_kwargs)
+
+        return distribution_strategy_context.get_replica_context().merge_call(
+            merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -710,7 +725,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -720,7 +735,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
     """
     def _saveable_factory(name=self._common_name):
       return _MirroredSaveable(self, self.primary, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
 # Register a conversion function which reads the value of the variable,
@@ -752,7 +767,7 @@ def _enclosing_tpu_context():
 # tpu.replicate() because it assumes that you're in a device context where you
 # can operate on a single version of the variable, but a tpu.replicate()
 # operates on all variables and is replicated during a rewrite pass.
-class TPUMirroredVariable(checkpointable.Checkpointable):
+class TPUMirroredVariable(trackable.Trackable):
   """Holds a map from device to TPU variables whose values are kept in sync."""
 
   def __init__(
@@ -810,7 +825,7 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
   def device_map(self):
     return self._device_map
 
-  # TODO(josh11b): Replace unwrap with this?
+  # TODO(josh11b): Replace experimental_local_results with this?
   @property
   def values(self):
     return self._values
@@ -904,7 +919,7 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
 
   @property
   def device(self):
-    return self._get().device
+    return self.handle.device
 
   def eval(self, session=None):
     return self.primary.eval(session)
@@ -916,42 +931,43 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
-    _assert_strategy(self._distribute_strategy)
-    f = kwargs.pop("f")
-    if distribution_strategy_context.in_cross_replica_context():
-      if _enclosing_tpu_context() is not None:
+    with _enter_or_assert_strategy(self._distribute_strategy):
+      f = kwargs.pop("f")
+      if distribution_strategy_context.in_cross_replica_context():
+        if _enclosing_tpu_context() is not None:
+          return self._distribute_strategy.extended.update(
+              self, f, args=args, kwargs=kwargs)
+
+        update_device = distribute_lib.get_update_device()
+        # We are calling update on the mirrored variable in cross replica
+        # context.
+        if update_device is not None:
+          # We are calling an assign function on the mirrored variable in cross
+          # replica context.
+          v = self._get(device=update_device)
+          return f(v, *args, **kwargs)
+
         return self._distribute_strategy.extended.update(
             self, f, args=args, kwargs=kwargs)
-
-      update_device = distribute_lib.get_update_device()
-      # We are calling update on the mirrored variable in cross replica context.
-      if update_device is not None:
-        # We are calling an assign function on the mirrored variable in cross
-        # replica context.
-        v = self._get(device=update_device)
-        return f(v, *args, **kwargs)
-
-      return self._distribute_strategy.extended.update(
-          self, f, args=args, kwargs=kwargs)
-    else:
-      _assert_replica_context(self._distribute_strategy)
-      # We are calling an assign function on the mirrored variable in replica
-      # context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function on each of the mirrored variables with the reduced
-      # value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "TPUMirroredVariable in Replica Context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.extended.update(
-            self, f, args=(v,) + other_args, kwargs=other_kwargs)
-
-      return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=args, kwargs=kwargs)
+      else:
+        _assert_replica_context(self._distribute_strategy)
+        # We are calling an assign function on the mirrored variable in replica
+        # context.
+        # We reduce the value we want to assign/add/sub. More details about how
+        # we handle the different use cases can be found in the _reduce method.
+        # We call the function on each of the mirrored variables with the
+        # reduced value.
+        if self._aggregation == vs.VariableAggregation.NONE:
+          raise ValueError("You must specify an aggregation method to update a "
+                           "TPUMirroredVariable in Replica Context.")
+
+        def merge_fn(strategy, value, *other_args, **other_kwargs):
+          v = _apply_aggregation(strategy, value, self._aggregation, self)
+          return strategy.extended.update(
+              self, f, args=(v,) + other_args, kwargs=other_kwargs)
+
+        return distribution_strategy_context.get_replica_context().merge_call(
+            merge_fn, args=args, kwargs=kwargs)
 
   @contextlib.contextmanager
   def _handle_graph(self, handle):
@@ -983,7 +999,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._read_variable_op()
 
   def assign_sub(self, *args, **kwargs):
-    def assign_sub_fn(var, delta, **kw):
+    def assign_sub_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -997,7 +1014,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._assign_func(f=assign_sub_fn, *args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
-    def assign_add_fn(var, delta, **kw):
+    def assign_add_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -1011,7 +1029,8 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     return self._assign_func(f=assign_add_fn, *args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    def assign_fn(var, value, **kw):
+    def assign_fn(var, value, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -1080,12 +1099,14 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.in_cross_replica_context():
-      return self.primary._as_graph_element()
-    return self._read_variable_op()
+    if _enclosing_tpu_context() is None:
+      if distribution_strategy_context.in_cross_replica_context():
+        return self.primary._as_graph_element()
+      return self._get()._as_graph_element()
+    return None
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -1095,7 +1116,7 @@ class TPUMirroredVariable(checkpointable.Checkpointable):
     """
     def _saveable_factory(name=self._common_name):
       return _MirroredSaveable(self, self.primary, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -1170,28 +1191,28 @@ ops.register_tensor_conversion_function(TPUMirroredVariable,
 ops.register_dense_tensor_like_type(TPUMirroredVariable)
 
 
-class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
-  """Class for defining how to restore a ReplicaLocalVariable."""
+class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Class for defining how to restore a SyncOnReadVariable."""
 
-  def __init__(self, replica_local_variable, name):
-    self._replica_local_variable = replica_local_variable
+  def __init__(self, sync_on_read_variable, name):
+    self._sync_on_read_variable = sync_on_read_variable
     # We use a callable so that we don't have to evaluate this expression
     # in the case where we are trying to restore instead of save.
     def tensor():
-      strategy = replica_local_variable._distribute_strategy  # pylint: disable=protected-access
-      return strategy.extended.read_var(replica_local_variable)
+      strategy = sync_on_read_variable._distribute_strategy  # pylint: disable=protected-access
+      return strategy.extended.read_var(sync_on_read_variable)
 
     spec = saver.BaseSaverBuilder.SaveSpec(
         tensor=tensor,
         slice_spec="",
         name=name,
-        dtype=replica_local_variable.dtype)
-    super(_ReplicaLocalSaveable, self).__init__(tensor, [spec], name)
+        dtype=sync_on_read_variable.dtype)
+    super(_SyncOnReadSaveable, self).__init__(tensor, [spec], name)
 
   def restore(self, restored_tensors, restored_shapes):
     """Restore the same value into all variables."""
     tensor, = restored_tensors
-    return self._replica_local_variable.assign(tensor)
+    return self._sync_on_read_variable.assign(tensor)
 
 
 def _assert_replica_context(strategy):
@@ -1204,14 +1225,13 @@ def _assert_replica_context(strategy):
         "Replica-local variables may only be assigned in a replica context.")
 
 
-class ReplicaLocalVariable(DistributedVariable, PerReplica,
-                           checkpointable.Checkpointable):
+class SyncOnReadVariable(DistributedVariable, PerReplica, trackable.Trackable):
   """Holds a map from device to variables whose values are reduced on save."""
 
   def __init__(
       self, strategy, device_map, values, aggregation, logical_device=None):
     self._aggregation = aggregation
-    super(ReplicaLocalVariable, self).__init__(
+    super(SyncOnReadVariable, self).__init__(
         strategy, device_map, values, logical_device=logical_device)
 
   def assign_sub(self, *args, **kwargs):
@@ -1243,11 +1263,8 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
   def _get_cross_replica(self):
     if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
       return self.primary
-    # TODO(josh11b): Use a strategy-specific method.
-    total = math_ops.add_n(self._values)
-    if self._aggregation == vs.VariableAggregation.MEAN:
-      return total * (1./ len(self._values))
-    return total
+    return self._distribute_strategy.reduce(
+        reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self)
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
@@ -1256,28 +1273,28 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
-    ReplicaLocalVariables.
+    `SyncOnReadVariable`s.
 
     Returns:
       A dictionary mapping attribute names to `SaveableObject` factories.
     """
     def _saveable_factory(name=self._common_name):
-      return _ReplicaLocalSaveable(self, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+      return _SyncOnReadSaveable(self, name)
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
-# Register a conversion function for ReplicaLocalVariable which allows as_ref to
+# Register a conversion function for SyncOnReadVariable which allows as_ref to
 # be true.
-def _tensor_conversion_replica_local(var, dtype=None, name=None, as_ref=False):
+def _tensor_conversion_sync_on_read(var, dtype=None, name=None, as_ref=False):
   return ops.internal_convert_to_tensor(
       var.get(), dtype=dtype, name=name, as_ref=as_ref)
 
 
-ops.register_tensor_conversion_function(ReplicaLocalVariable,
-                                        _tensor_conversion_replica_local)
+ops.register_tensor_conversion_function(SyncOnReadVariable,
+                                        _tensor_conversion_sync_on_read)
 
 
 def regroup(device_map, values, wrap_class=PerReplica):
@@ -1326,7 +1343,7 @@ def regroup(device_map, values, wrap_class=PerReplica):
       break
   # Consider three cases where same_id is true:
   # * If v0 is a DistributedVariable (a MirroredVariable or
-  #   ReplicaLocalVariable, and same_id means it is the same across all
+  #   SyncOnReadVariable, and same_id means it is the same across all
   #   devices), we want to return it. We check DistributedVariable
   #   specifically since it can look like it has a
   #   _distributed_container member since its members do.
@@ -1341,7 +1358,7 @@ def regroup(device_map, values, wrap_class=PerReplica):
     return v0
 
   # Detect the case where each device has a parallel component of the
-  # same MirroredVariable (or ReplicaLocalVariable). In this case we
+  # same MirroredVariable (or SyncOnReadVariable). In this case we
   # want to return the containing MirroredVariable, after a bunch of
   # sanity checking. In particular, each component should have the
   # same container, and the devices of the variables should match the
@@ -1394,7 +1411,7 @@ def update_regroup(extended, device_map, updates, group):
   # so we can avoid all these nest operations.
   regrouped = regroup(device_map, updates, Mirrored)
   if not group:
-    return nest.map_structure(extended._unwrap, regrouped)  # pylint: disable=protected-access
+    return nest.map_structure(extended._local_results, regrouped)  # pylint: disable=protected-access
   grouped_flat = []
   for u in nest.flatten(regrouped):
     if isinstance(u, DistributedValues):
@@ -1436,7 +1453,7 @@ def value_container(val):
 
 
 # TODO(josh11b): Descend from Variable.
-class AggregatingVariable(checkpointable.Checkpointable):
+class AggregatingVariable(trackable.Trackable):
   """A wrapper around a variable that aggregates updates across replicas."""
 
   def __init__(self, strategy, v, aggregation):
@@ -1458,35 +1475,35 @@ class AggregatingVariable(checkpointable.Checkpointable):
     return getattr(self._v, name)
 
   def _assign_func(self, *args, **kwargs):
-    _assert_strategy(self._distribute_strategy)
-    f = kwargs.pop("f")
-    if distribution_strategy_context.in_cross_replica_context():
-      update_device = distribute_lib.get_update_device()
-      if update_device is not None:
-        # We are calling an assign function in an update context.
-        return f(self._v, *args, **kwargs)
-
-      # We are calling an assign function in cross replica context, wrap it in
-      # an update call.
-      return self._distribute_strategy.extended.update(
-          self, f, args=args, kwargs=kwargs)
-    else:
-      replica_context = distribution_strategy_context.get_replica_context()
-      assert replica_context
-      # We are calling an assign function in replica context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function with the reduced value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "a variable in replica context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.extended.update(
-            self, f, args=(v,) + other_args, kwargs=other_kwargs)
-
-      return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs)
+    with _enter_or_assert_strategy(self._distribute_strategy):
+      f = kwargs.pop("f")
+      if distribution_strategy_context.in_cross_replica_context():
+        update_device = distribute_lib.get_update_device()
+        if update_device is not None:
+          # We are calling an assign function in an update context.
+          return f(self._v, *args, **kwargs)
+
+        # We are calling an assign function in cross replica context, wrap it in
+        # an update call.
+        return self._distribute_strategy.extended.update(
+            self, f, args=args, kwargs=kwargs)
+      else:
+        replica_context = distribution_strategy_context.get_replica_context()
+        assert replica_context
+        # We are calling an assign function in replica context.
+        # We reduce the value we want to assign/add/sub. More details about how
+        # we handle the different use cases can be found in the _reduce method.
+        # We call the function with the reduced value.
+        if self._aggregation == vs.VariableAggregation.NONE:
+          raise ValueError("You must specify an aggregation method to update a "
+                           "a variable in replica context.")
+
+        def merge_fn(strategy, value, *other_args, **other_kwargs):
+          v = _apply_aggregation(strategy, value, self._aggregation, self)
+          return strategy.extended.update(
+              self, f, args=(v,) + other_args, kwargs=other_kwargs)
+
+        return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -1514,7 +1531,7 @@ class AggregatingVariable(checkpointable.Checkpointable):
 
   # TODO(josh11b): Test saving & restoring.
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
+    return {trackable.VARIABLE_VALUE_KEY: self._v}
 
   # pylint: disable=multiple-statements
   def __add__(self, o): return self._v + o
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 3a35734051e0e129b098726ecaa73d73b617a3e2..0c3db5114d718fba2551e578cc950528f5ae7418 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -6,6 +6,7 @@ load(
     "//tensorflow/tools/test:performance.bzl",
     "tf_py_logged_benchmark",
 )
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 cc_library(
     name = "pywrap_tfe_lib",
@@ -54,9 +55,11 @@ py_library(
         ":core",
         ":def_function",
         ":execute",
+        ":execution_callbacks",
         ":function",
         ":graph_only_ops",
         ":profiler",
+        ":profiler_client",
         ":tape",
         ":test",
         ":wrap_function",
@@ -99,6 +102,7 @@ py_library(
     deps = [
         ":context",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -108,8 +112,19 @@ cuda_py_test(
     additional_deps = [
         ":profiler",
         ":test",
-        "//tensorflow/core/profiler:protos_all_py",
         "//tensorflow/python:constant_op",
+        "//tensorflow/core/profiler:protos_all_py",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+py_library(
+    name = "profiler_client",
+    srcs = ["profiler_client.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
     ],
 )
 
@@ -129,6 +144,7 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -150,6 +166,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
     ],
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -167,6 +184,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:pywrap_tensorflow",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -181,6 +199,7 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -196,6 +215,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -213,6 +233,7 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
     ],
     shard_count = 5,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -313,6 +334,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -383,12 +405,14 @@ cuda_py_test(
         ":context",
         ":function",
         ":test",
+        ":profiler",
         "//third_party/py/numpy",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
         "//tensorflow/python/keras",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_logged_benchmark(
@@ -422,6 +446,7 @@ cuda_py_test(
         ":test",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:config",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -434,6 +459,7 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -479,6 +505,7 @@ cuda_py_test(
     tags = [
         "optonly",  # The test is too slow in non-opt mode
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -497,7 +524,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:while_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -509,6 +536,18 @@ py_library(
     deps = [
         ":context",
         "//tensorflow/python:framework_ops",
+        "@six_archive//:six",
+    ],
+)
+
+tf_py_test(
+    name = "lift_to_graph_test",
+    size = "medium",
+    srcs = ["lift_to_graph_test.py"],
+    additional_deps = [
+        "lift_to_graph",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:framework_ops",
     ],
 )
 
@@ -523,6 +562,23 @@ tf_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "def_function_xla_test",
+    srcs = ["def_function_xla_test.py"],
+    tags = [
+        "no_pip",
+        "no_rocm",
+        "nomac",
+    ],
+    deps = [
+        ":def_function",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_library(
     name = "wrap_function",
     srcs = ["wrap_function.py"],
@@ -535,7 +591,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 6117d8a4ea154fc09acc77f8dbd5daa5afea81e0..0bce60d69979a774ee56c000d50d2b022143c9ae 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -80,6 +80,8 @@ def make_attr(attr_type, value):
     return tensor_shape.as_shape(value).as_proto()
   elif attr_type == [pywrap_tensorflow.TF_ATTR_SHAPE]:
     return [tensor_shape.as_shape(v).as_proto() for v in value]
+  elif isinstance(value, str):
+    return value.encode()
   return value
 
 
@@ -622,13 +624,14 @@ def _zeros(shape, dtype):
 
 
 def _ones(shape, dtype):
-  if dtypes.as_dtype(dtype) == dtypes.string:
+  as_dtype = dtypes.as_dtype(dtype)
+  if as_dtype == dtypes.string:
     return None
 
   if not context.context().executing_eagerly():
     return array_ops.ones(shape, dtype)
 
-  if dtypes.as_dtype(dtype).is_bool:
+  if as_dtype.is_bool:
     value = True
   else:
     value = 1
@@ -971,13 +974,15 @@ class GradientTape(object):
     definition of a Jacobian.
 
     Example usage:
-
+    
+    ```python
     with tf.GradientTape() as g:
       x  = tf.constant([1.0, 2.0])
       g.watch(x)
       y = x * x
     jacobian = g.jacobian(y, x)
     # jacobian value is [[2., 0.], [0., 4.]]
+    ```
 
     Args:
       target: Tensor to be differentiated.
@@ -1080,12 +1085,14 @@ class GradientTape(object):
     result in the jacobian computation given the independence assumption.
 
     Example usage:
+    ```python
     with tf.GradientTape() as g:
       x = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
       g.watch(x)
       y = x * x
     batch_jacobian = g.batch_jacobian(y, x)
     # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
+    ```
 
     Args:
       target: A tensor with rank 2 or higher and with shape [b, y1, ..., y_n].
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 5f4fda8897b3913ffeb165819a4b7859821ec3b8..7d887d3806c3e25a898dc98f3837eb9118744b3a 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -132,6 +132,20 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(dx, 2.0)
     self.assertAllEqual(dy, 3.0)
 
+  def testCustomGradientEmptyError(self):
+
+    @custom_gradient.custom_gradient
+    def identity(x):
+      def grad(_):
+        return []  # This return value is wrong!
+      return x, grad
+
+    x = variables.Variable(1.0)
+    with backprop.GradientTape() as t:
+      y = identity(x)
+    with self.assertRaises(ValueError):
+      t.gradient(y, [x])
+
   def testOutputGradUsedInComputation(self):
     with backprop.GradientTape() as t:
       x = constant_op.constant(3.0)
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 62c4a12cbfade450cf7c2acff2ec4d14c30ab1aa..8a1319f9efa847911b799cb54007f5971973ebb2 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import function
+from tensorflow.python.eager import profiler
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -145,7 +146,7 @@ class MicroBenchmarks(test.Benchmark):
   def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
     ctx = context.context()
-    with ctx.execution_mode(execution_mode):
+    with context.execution_mode(execution_mode):
       func()
       if execution_mode == context.ASYNC:
         ctx.async_wait()
@@ -815,10 +816,26 @@ class MicroBenchmarks(test.Benchmark):
       model = make_keras_model(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
+  def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self):
+    profiler.start()
+    with context.graph_mode():
+      model = make_keras_model(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_functional_fit_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model, run_eagerly=True)
 
+  def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler(
+      self):
+    profiler.start()
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_sequential_fit(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model)
@@ -890,10 +907,6 @@ class MicroBenchmarks(test.Benchmark):
     self._run(scan, 100)
 
   def benchmarkScanDefun(self):
-    if context.num_gpus():
-      # TODO(b/122081934): Re-enable this after figuring out why this became
-      # really slow with control flow V2
-      return
     elems = math_ops.range(1600)
 
     @function.defun
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 6c5b191cf9d502593e94c845fae19cec356cef15..0092ab5430311f5de9750b40a4cdc92d6dbc5a07 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -44,6 +44,7 @@ default_execution_mode = EAGER_MODE if tf2.enabled() else GRAPH_MODE
 # Note that we do not protect this with a lock and instead rely on python's GIL
 # and the idempotent nature of writes to provide thread safety.
 _device_parsing_cache = {}
+_starting_device_spec = pydev.DeviceSpec.from_string("")
 
 _MAXINT32 = 2**31 - 1
 
@@ -82,6 +83,7 @@ class _EagerTensorCache(object):
 
 class FunctionCallOptions(object):
   """Options applied at call sites of eager functions.
+
   Eager functions are functions decorated with tf.contrib.eager.defun.
   """
 
@@ -129,36 +131,42 @@ class FunctionCallOptions(object):
                        "proto or None. got: {}".format(type(config)))
 
 
-# TODO(agarwal): better name ?
-class _EagerContext(threading.local):
-  """Thread local eager context."""
+class _ThreadLocalData(threading.local):
+  """Thread local storage for the eager context."""
 
-  def __init__(self, config=None):
-    super(_EagerContext, self).__init__()
-    self.device_spec = pydev.DeviceSpec.from_string("")
-    self.device_name = self.device_spec.to_string()
+  def __init__(self):
+    super(_ThreadLocalData, self).__init__()
+    self.device_spec = _starting_device_spec
+    self.device_name = ""
     self.mode = default_execution_mode
     self.is_eager = default_execution_mode == EAGER_MODE
     self.scope_name = ""
-    self.recording_summaries = False
-    self.summary_writer_resource = None
+    self.summary_writer = None
+    self.summary_recording = None
+    self.summary_recording_distribution_strategy = True
+    self.summary_step = None
     self.scalar_cache = {}
-    self.ones_rank_cache = _EagerTensorCache()
-    self.zeros_cache = _EagerTensorCache()
-    self.execution_mode = None
-
-    # Default rewriter config corresponds to turning all default grappler
-    # optimizations on.
-    base_config = config_pb2.ConfigProto()
+    self._ones_rank_cache = None
+    self._zeros_cache = None
+    self.execution_mode = SYNC
+    self.function_call_options = None
 
-    if config is not None:
-      base_config.MergeFrom(config)
+  @property
+  def ones_rank_cache(self):
+    if not self._ones_rank_cache:
+      self._ones_rank_cache = _EagerTensorCache()
+    return self._ones_rank_cache
 
-    self.function_call_options = FunctionCallOptions(config_proto=base_config)
+  @property
+  def zeros_cache(self):
+    if not self._zeros_cache:
+      self._zeros_cache = _EagerTensorCache()
+    return self._zeros_cache
 
 
 ContextSwitch = collections.namedtuple(
-    "ContextSwitch", ["is_building_function", "enter_context_fn"])
+    "ContextSwitch", ["is_building_function", "enter_context_fn",
+                      "device_stack"])
 
 
 # `_ContextSwitchStack` is a `threading.local` to match the semantics of
@@ -175,9 +183,10 @@ class _ContextSwitchStack(threading.local):
       # across threads, since (1) `enable_eager_execution` modifies a
       # process-level flag (`default_execution_mode`) and (2) `__init__` is
       # called each time a threading.local object is used in a separate thread.
-      self.push(is_building_function=False, enter_context_fn=eager_mode)
+      self.push(is_building_function=False, enter_context_fn=eager_mode,
+                device_stack=None)
 
-  def push(self, is_building_function, enter_context_fn):
+  def push(self, is_building_function, enter_context_fn, device_stack):
     """Push metadata about a context switch onto the stack.
 
     A context switch can take any one of the two forms: installing a graph as
@@ -188,10 +197,14 @@ class _ContextSwitchStack(threading.local):
       is_building_function: (bool.) Whether the context is building a function.
       enter_context_fn: (function.) A callable that executes the context switch.
         For example, `graph.as_default` or `eager_mode`.
+      device_stack: If applicable, the device function stack for this
+        graph. When breaking out of graphs in init_scope, the innermost nonempty
+        device stack is used. Eager contexts put `None` here and the value is
+        never used.
     """
 
     self.stack.append(
-        ContextSwitch(is_building_function, enter_context_fn))
+        ContextSwitch(is_building_function, enter_context_fn, device_stack))
 
   def pop(self):
     """Pop the stack."""
@@ -218,27 +231,27 @@ class Context(object):
         options for the Context. Note that a lot of these options may be
         currently unimplemented or irrelevant when eager execution is enabled.
       device_policy: (Optional.) What policy to use when trying to run an
-         operation on a device with inputs which are not on that device.
-         When set to None, an appropriate value will be picked automatically.
-         The value picked may change between TensorFlow releases.
-
-         Defaults to tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32.
-         Valid values:
-         - tfe.DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is
-           not correct.
-         - tfe.DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
-           right device but raises a warning.
-         - tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
-           hide performance problems.
-         - tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
-           raising errors on the other ones.
+        operation on a device with inputs which are not on that device.
+        When set to None, an appropriate value will be picked automatically.
+        The value picked may change between TensorFlow releases.
+
+        Defaults to DEVICE_PLACEMENT_SILENT.
+        Valid values:
+        - DEVICE_PLACEMENT_EXPLICIT: raises an error if the placement is
+          not correct.
+        - DEVICE_PLACEMENT_WARN: copies the tensors which are not on the
+          right device but raises a warning.
+        - DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
+          hide performance problems.
+        - DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+          raising errors on the other ones.
       execution_mode: (Optional.) Policy controlling how operations dispatched
         are actually executed. When set to None, an appropriate value will be
         picked automatically. The value picked may change between TensorFlow
         releases.
         Valid values:
-        - tf.contrib.eager.SYNC: executes each operation synchronously.
-        - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+        - SYNC: executes each operation synchronously.
+        - ASYNC: executes each operation asynchronously. These
           operations may return "non-ready" handles.
       server_def: (Optional.) A tensorflow::ServerDef proto.
         Enables execution on remote devices. GrpcServers need to be started by
@@ -249,14 +262,21 @@ class Context(object):
     Raises:
      ValueError: If execution_mode is not valid.
     """
-    self._eager_context = _EagerContext(config)
+    if config is None:
+      config = config_pb2.ConfigProto(
+          allow_soft_placement=True,
+          log_device_placement=False,
+      )
+    self._config = config
+    self._thread_local_data = _ThreadLocalData()
     self._context_switches = _ContextSwitchStack(self.executing_eagerly())
     self._context_handle = None
     self._context_devices = None
     self._post_execution_callbacks = []
-    self._config = config
     self._seed = None
     self._initialize_lock = threading.Lock()
+    if device_policy is None:
+      device_policy = DEVICE_PLACEMENT_SILENT
     self._device_policy = device_policy
     if execution_mode not in (None, SYNC, ASYNC):
       raise ValueError(
@@ -433,7 +453,7 @@ class Context(object):
   @tf_contextlib.contextmanager
   def _mode(self, mode):
     """A context manager to allow setting the mode to EAGER/GRAPH."""
-    ctx = self._eager_context
+    ctx = self._thread_local_data
     old_mode = ctx.mode
     old_is_eager = ctx.is_eager
     ctx.mode = mode
@@ -442,7 +462,7 @@ class Context(object):
       # Entering graph mode does not provide us with sufficient information to
       # record a context switch; graph-based context switches are only logged
       # when a graph is registered as the default graph.
-      self.context_switches.push(False, eager_mode)
+      self.context_switches.push(False, eager_mode, None)
     try:
       yield
     finally:
@@ -453,49 +473,79 @@ class Context(object):
 
   def executing_eagerly(self):
     """Returns True if current thread has eager executing enabled."""
-    return self._eager_context.is_eager
+    return self._thread_local_data.is_eager
 
   def scalar_cache(self):
     """Per-device cache for scalars."""
-    return self._eager_context.scalar_cache
+    return self._thread_local_data.scalar_cache
 
   def ones_rank_cache(self):
     """Per-device cache for scalars."""
-    return self._eager_context.ones_rank_cache
+    return self._thread_local_data.ones_rank_cache
 
   def zeros_cache(self):
     """Per-device cache for scalars."""
-    return self._eager_context.zeros_cache
+    return self._thread_local_data.zeros_cache
 
   @property
   def scope_name(self):
     """Returns scope name for the current thread."""
-    return self._eager_context.scope_name
+    return self._thread_local_data.scope_name
 
   @scope_name.setter
   def scope_name(self, s):
     """Sets scope name for the current thread."""
-    self._eager_context.scope_name = s
+    self._thread_local_data.scope_name = s
+
+  @property
+  def summary_writer(self):
+    """Returns default summary writer for the current thread."""
+    return self._thread_local_data.summary_writer
+
+  @summary_writer.setter
+  def summary_writer(self, writer):
+    """Sets default summary writer for the current thread."""
+    self._thread_local_data.summary_writer = writer
+
+  @property
+  def summary_recording(self):
+    """Returns summary recording condition."""
+    return self._thread_local_data.summary_recording
+
+  @summary_recording.setter
+  def summary_recording(self, condition):
+    """Sets summary recording condition."""
+    self._thread_local_data.summary_recording = condition
+
+  @property
+  def summary_recording_distribution_strategy(self):
+    """Returns summary recording condition for distribution strategy."""
+    return self._thread_local_data.summary_recording_distribution_strategy
+
+  @summary_recording_distribution_strategy.setter
+  def summary_recording_distribution_strategy(self, condition):
+    """Sets summary recording condition for distribution strategy."""
+    self._thread_local_data.summary_recording_distribution_strategy = condition
 
   @property
-  def summary_writer_resource(self):
-    """Returns summary writer resource."""
-    return self._eager_context.summary_writer_resource
+  def summary_step(self):
+    """Returns summary step variable."""
+    return self._thread_local_data.summary_step
 
-  @summary_writer_resource.setter
-  def summary_writer_resource(self, resource):
-    """Sets summary writer resource."""
-    self._eager_context.summary_writer_resource = resource
+  @summary_step.setter
+  def summary_step(self, step):
+    """Sets summary step variable."""
+    self._thread_local_data.summary_step = step
 
   @property
   def device_name(self):
     """Returns the device name for the current thread."""
-    return self._eager_context.device_name
+    return self._thread_local_data.device_name
 
   @property
   def device_spec(self):
     """Returns the device spec for the current thread."""
-    return self._eager_context.device_spec
+    return self._thread_local_data.device_spec
 
   @tf_contextlib.contextmanager
   def device(self, name):
@@ -510,7 +560,7 @@ class Context(object):
     Raises:
       ValueError: If name is not a string or is an invalid device name.
     """
-    eager_context = self._eager_context
+    eager_context = self._thread_local_data
     old_device_name = eager_context.device_name
     old_device_spec = eager_context.device_spec
     cache_key = (old_device_name, name)
@@ -551,60 +601,58 @@ class Context(object):
     """List of the names of devices available to execute operations."""
     return self._devices
 
-  def get_execution_mode(self):
-    mode = self._eager_context.execution_mode
+  @property
+  def execution_mode(self):
+    """Gets execution mode for current thread."""
+    # Only get the execution mode from the context if it has already been
+    # initialized
+    if self._context_handle is None:
+      return self._execution_mode
+
+    mode = self._thread_local_data.execution_mode
     if mode is None:
       mode = self._execution_mode
     return mode
 
-  def set_execution_mode(self, mode):
+  @execution_mode.setter
+  def execution_mode(self, mode):
     """Sets execution mode for current thread."""
     if mode not in (None, SYNC, ASYNC):
       raise ValueError(
           "Execution mode should be None/SYNC/ASYNC. Got %s" % mode)
     if mode is None:
       mode = SYNC
-    self._eager_context.execution_mode = mode
-    pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle, mode == ASYNC)
 
-  @tf_contextlib.contextmanager
-  def execution_mode(self, mode):
-    """Context manager for setting execution mode for current thread."""
-    old_mode = self.get_execution_mode()
-    try:
-      self.set_execution_mode(mode)
-      yield
-    finally:
-      self.set_execution_mode(old_mode)
+    if self._thread_local_data.execution_mode != mode:
+      self._thread_local_data.execution_mode = mode
+
+      # Only set the execution mode if the context has already been initialized
+      if self._context_handle is not None:
+        pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._context_handle,
+                                                       mode == ASYNC)
+      else:
+        self._execution_mode = mode
 
-  def get_function_call_options(self):
+  @property
+  def function_call_options(self):
     """Returns function call options for current thread.
 
     Note that the returned object is still referenced by the eager context.
 
     Returns: the FunctionCallOptions for current thread.
     """
-    return self._eager_context.function_call_options
-
-  @tf_contextlib.contextmanager
-  def function_call_options(self, set_options_func):
-    """Context manager for setting function call options of current thread.
+    if self._thread_local_data.function_call_options is None:
+      base_config = config_pb2.ConfigProto()
+      base_config.CopyFrom(self._config)
+      self._thread_local_data.function_call_options = FunctionCallOptions(
+          config_proto=base_config)
 
-    Args:
-      set_options_func: A callable that takes one argument of type
-        FunctionCallOptions. It should set the properties of that
-        FunctionCallOptions.
+    return self._thread_local_data.function_call_options
 
-    Yields:
-      Nothing.
-    """
-    current_options = self.get_function_call_options()
-    old_options = copy.copy(current_options)
-    try:
-      set_options_func(current_options)
-      yield
-    finally:
-      self._eager_context.function_call_options = old_options
+  @function_call_options.setter
+  def function_call_options(self, options):
+    """Returns function call options for current thread."""
+    self._thread_local_data.function_call_options = options
 
   def async_wait(self):
     """Waits for ops dispatched in ASYNC mode to finish."""
@@ -643,6 +691,10 @@ class Context(object):
     pywrap_tensorflow.TFE_ContextAddFunctionDef(
         self._handle, fdef_string, len(fdef_string))
 
+  def has_function(self, name):
+    """Check if a function `name` is registered."""
+    return bool(pywrap_tensorflow.TFE_ContextHasFunction(self._handle, name))
+
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
 
@@ -678,6 +730,97 @@ class Context(object):
     """Get the list of post-execution callbacks added to the context."""
     return self._post_execution_callbacks
 
+  @property
+  def gpu_per_process_memory_fraction(self):
+    return self._config.gpu_options.per_process_gpu_memory_fraction
+
+  @gpu_per_process_memory_fraction.setter
+  def gpu_per_process_memory_fraction(self, fraction):
+    if self._context_handle is not None:
+      raise RuntimeError(
+          "GPU options must be set at program startup")
+
+    self._config.gpu_options.per_process_gpu_memory_fraction = fraction
+
+  @property
+  def gpu_per_process_memory_growth(self):
+    return self._config.gpu_options.allow_growth
+
+  @gpu_per_process_memory_growth.setter
+  def gpu_per_process_memory_growth(self, enabled):
+    if self._context_handle is not None:
+      raise RuntimeError(
+          "GPU options must be set at program startup")
+
+    self._config.gpu_options.allow_growth = enabled
+
+  @property
+  def intra_op_parallelism_threads(self):
+    return self._config.intra_op_parallelism_threads
+
+  @intra_op_parallelism_threads.setter
+  def intra_op_parallelism_threads(self, num_threads):
+    if self._context_handle is not None:
+      raise RuntimeError(
+          "Intra op parallelism must be set at program startup")
+
+    self._config.intra_op_parallelism_threads = num_threads
+
+  @property
+  def inter_op_parallelism_threads(self):
+    return self._config.inter_op_parallelism_threads
+
+  @inter_op_parallelism_threads.setter
+  def inter_op_parallelism_threads(self, num_threads):
+    if self._context_handle is not None:
+      raise RuntimeError(
+          "Inter op parallelism must be set at program startup")
+
+    self._config.inter_op_parallelism_threads = num_threads
+
+  @property
+  def soft_device_placement(self):
+    return self._config.allow_soft_placement
+
+  @soft_device_placement.setter
+  def soft_device_placement(self, enabled):
+    self._config.allow_soft_placement = enabled
+
+    self._thread_local_data.function_call_options = None
+
+  @property
+  def log_device_placement(self):
+    return self._config.log_device_placement
+
+  @log_device_placement.setter
+  def log_device_placement(self, enabled):
+    if self._context_handle is not None:
+      raise RuntimeError(
+          "Device placement logging must be set at program startup")
+
+    self._config.log_device_placement = enabled
+
+  @property
+  def device_policy(self):
+    # Only get the policy from the context if it has already been initialized
+    if self._context_handle is not None:
+      return pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(self._handle)
+
+    return self._device_policy
+
+  @device_policy.setter
+  def device_policy(self, policy):
+    if policy is None:
+      policy = DEVICE_PLACEMENT_SILENT
+
+    if self._device_policy != policy:
+      self._device_policy = policy
+
+      # Only set the policy if the context has already been initialized
+      if self._context_handle is not None:
+        pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
+            self._handle, self._device_policy)
+
   def enable_run_metadata(self):
     """Enables tracing of op execution via RunMetadata.
 
@@ -686,24 +829,26 @@ class Context(object):
     """
     pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
 
-  @tf_contextlib.contextmanager
-  def device_policy(self, policy):
-    handle = self._handle
-    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(handle)
-    pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-        handle, policy)
-    try:
-      yield
-    finally:
-      pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-          handle, old)
-
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
     if not self._context_handle:
       return
     pywrap_tensorflow.TFE_ContextDisableRunMetadata(self._context_handle)
 
+  def enable_graph_collection(self):
+    """Enables graph collection of executed functions.
+
+    To retrieve the accumulated graphs call context.export_run_metadata()
+    and to stop collecting graphs call context.disable_graph_collection().
+    """
+    pywrap_tensorflow.TFE_ContextEnableGraphCollection(self._handle)
+
+  def disable_graph_collection(self):
+    """Disables graph collections of executed functions."""
+    if not self._context_handle:
+      return
+    pywrap_tensorflow.TFE_ContextDisableGraphCollection(self._context_handle)
+
   def export_run_metadata(self):
     """Returns a RunMetadata proto with accumulated information.
 
@@ -860,6 +1005,7 @@ def device(name):
   return context().device(name)
 
 
+@tf_export("config.experimental_list_devices")
 def list_devices():
   """List the names of the available devices.
 
@@ -869,33 +1015,76 @@ def list_devices():
   return context().devices()
 
 
+@tf_export("debugging.get_log_device_placement")
+def get_log_device_placement():
+  """Get if device placements are logged.
+
+  Returns:
+    If device placements are logged.
+  """
+  return context().log_device_placement
+
+
+@tf_export("debugging.set_log_device_placement")
+def set_log_device_placement(enabled):
+  """Set if device placements should be logged.
+
+  Args:
+    enabled: Whether to enabled device placement logging.
+  """
+  context().log_device_placement = enabled
+
+
+@tf_contextlib.contextmanager
+def device_policy(policy):
+  """Context manager for setting device placement policy for current thread."""
+  ctx = context()
+  old_policy = ctx.device_policy
+  try:
+    ctx.device_policy = policy
+    yield
+  finally:
+    ctx.device_policy = old_policy
+
+
 def set_execution_mode(mode):
   """Sets execution mode for the current thread."""
-  context().set_execution_mode(mode)
+  context().execution_mode = mode
 
 
+@tf_contextlib.contextmanager
 def execution_mode(mode):
   """Context manager for setting execution mode for current thread."""
-  return context().execution_mode(mode)
+  ctx = context()
+  old_mode = ctx.execution_mode
+  try:
+    ctx.execution_mode = mode
+    yield
+  finally:
+    ctx.execution_mode = old_mode
 
 
 @tf_export("experimental.function_executor_type")
+@tf_contextlib.contextmanager
 def function_executor_type(executor_type):
-  """Context manager for setting the executor of eagar defined functions.
+  """Context manager for setting the executor of eager defined functions.
 
   Eager defined functions are functions decorated by tf.contrib.eager.defun.
 
   Args:
-    executor_type: a string for the name of the executor to be used
-    to execute functions defined by tf.contrib.eager.defun.
+    executor_type: a string for the name of the executor to be used to execute
+      functions defined by tf.contrib.eager.defun.
 
-  Returns:
+  Yields:
     Context manager for setting the executor of eager defined functions.
   """
-  def _set_options_func(options):
-    options.executor_type = executor_type
-
-  return context().function_call_options(_set_options_func)
+  current_options = context().function_call_options
+  old_options = copy.copy(current_options)
+  try:
+    current_options.executor_type = executor_type
+    yield
+  finally:
+    context().function_call_options = old_options
 
 
 def async_wait():
@@ -931,6 +1120,20 @@ def disable_run_metadata():
   context().disable_run_metadata()
 
 
+def enable_graph_collection():
+  """Enables tracing of op execution via RunMetadata.
+
+  To retrieve the accumulated metadata call context.export_run_metadata()
+  and to stop tracing call context.disable_run_metadata().
+  """
+  context().enable_graph_collection()
+
+
+def disable_graph_collection():
+  """Disables tracing of op execution via RunMetadata."""
+  context().disable_graph_collection()
+
+
 def export_run_metadata():
   """Returns a RunMetadata proto with accumulated information.
 
@@ -943,25 +1146,6 @@ def export_run_metadata():
   return context().export_run_metadata()
 
 
-def function_config_proto(config_proto):
-  """Context manager for setting the grappler rewrite config.
-
-  This config is used by Grappler when optimizing the function graph.
-
-  Args:
-    config_proto: a `config_pb2.ConfigProto` proto or
-      a serialized string of that proto or None. If None, the default instance
-      of `config_pb2.ConfigProto` will be used.
-
-  Returns:
-    A context manager.
-  """
-  def _set_options_func(options):
-    options.config_proto_serialized = config_proto
-
-  return context().function_call_options(_set_options_func)
-
-
 def set_server_def(server_def):
   context().set_server_def(server_def)
 
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index e601aa376fa2ef8e0e240e4da03bfcd9ea227bd9..4cddf84232025bb8392f2befa9f63f8ce10442d0 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -65,19 +65,21 @@ class TFETest(test_util.TensorFlowTestCase):
     ctx.scope_name = 'foo'
     self.assertEqual('foo', ctx.scope_name)
 
-    self.assertEqual(context.SYNC, ctx.get_execution_mode())
-    ctx.set_execution_mode(context.ASYNC)
-    self.assertEqual(context.ASYNC, ctx.get_execution_mode())
-    ctx.set_execution_mode(context.SYNC)
-    self.assertEqual(context.SYNC, ctx.get_execution_mode())
-    with ctx.execution_mode(context.ASYNC):
-      self.assertEqual(context.ASYNC, ctx.get_execution_mode())
-    ctx.set_execution_mode(context.SYNC)
-    self.assertEqual(context.SYNC, ctx.get_execution_mode())
-
-    self.assertIsNone(ctx.summary_writer_resource)
-    ctx.summary_writer_resource = 'mock'
-    self.assertEqual('mock', ctx.summary_writer_resource)
+    self.assertEqual(context.SYNC, ctx.execution_mode)
+    ctx.execution_mode = context.ASYNC
+    self.assertEqual(context.ASYNC, ctx.execution_mode)
+    ctx.execution_mode = context.SYNC
+    self.assertEqual(context.SYNC, ctx.execution_mode)
+
+    self.assertIsNone(ctx.summary_writer)
+    ctx.summary_writer = 'mock'
+    self.assertEqual('mock', ctx.summary_writer)
+    self.assertIsNone(ctx.summary_recording)
+    ctx.summary_recording = 'mock'
+    self.assertEqual('mock', ctx.summary_recording)
+    self.assertIsNone(ctx.summary_step)
+    ctx.summary_step = 'mock'
+    self.assertEqual('mock', ctx.summary_step)
 
     self.assertEqual('', ctx.device_name)
     self.assertEqual(ctx.device_name, ctx.device_spec.to_string())
@@ -167,7 +169,11 @@ class TFETest(test_util.TensorFlowTestCase):
 
     def get_context_values(ctx):
       return [
-          ctx.executing_eagerly(), ctx.scope_name, ctx.summary_writer_resource,
+          ctx.executing_eagerly(),
+          ctx.scope_name,
+          ctx.summary_writer,
+          ctx.summary_recording,
+          ctx.summary_step,
           ctx.device_name,
           ctx.num_gpus()
       ]
@@ -259,7 +265,7 @@ class TFETest(test_util.TensorFlowTestCase):
       self.skipTest('No GPUs found')
     constant = constant_op.constant(1.0)
     with ops.device('gpu:0'):
-      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+      with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
         c = constant + 1.0
     self.assertAllEqual(c, 2.0)
 
@@ -315,7 +321,7 @@ class TFETest(test_util.TensorFlowTestCase):
                  three.dtype.as_datatype_enum))
       context.async_wait()
     context.async_clear_error()
-    context.set_execution_mode(context.SYNC)
+    context.context().execution_mode = context.SYNC
 
   def testExecuteTooManyNumOutputs(self):
     # num_outputs provided is 50, but only one output is produced.
@@ -631,7 +637,8 @@ class TFETest(test_util.TensorFlowTestCase):
     for t in tensors:
       self.assertIsInstance(t, ops.EagerTensor)
 
-  def testSmallIntegerOpsForcedToCPU(self):
+  # TODO(b/123637108): re-enable
+  def disabled_testSmallIntegerOpsForcedToCPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
 
@@ -660,6 +667,39 @@ class TFETest(test_util.TensorFlowTestCase):
     # Op not forced to CPU since the constants are not integers.
     self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:GPU:0')
 
+  def testExecutionModeIsStoredThreadLocal(self):
+    cv = threading.Condition()
+    count = [0]
+    num_threads = 10
+
+    def execution_mode_test(cond, count, num_threads, ctx, mode):
+      cond.acquire()
+      # Ensure that all threads set their mode simultaneously
+      # Note that this is not a simple assignment, as the execution_mode is an
+      # @property with a custom setter.
+      ctx.execution_mode = mode
+      count[0] = count[0] + 1
+      if count[0] < num_threads:
+        cond.wait()
+      else:
+        cond.notify_all()
+      cond.release()
+      self.assertEqual(ctx.execution_mode, mode)
+
+    ctx = context.Context()
+    threads = []
+    for i in range(num_threads):
+      t = threading.Thread(
+          target=execution_mode_test,
+          args=(cv, count, num_threads, ctx,
+                context.SYNC if i % 2 == 0 else context.ASYNC))
+      t.start()
+      threads.append(t)
+
+    for t in threads:
+      t.join()
+
+
 class SendRecvTest(test_util.TensorFlowTestCase):
 
   cpu_device = '/job:localhost/replica:0/task:0/device:CPU:0'
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index e5950f54545d38721e506052a4175e350f2e7d09..4b85b6af4ca02fb81b16b7ca60296e17e0426a88 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -30,9 +30,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
@@ -56,6 +55,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                dtype=None,
                constraint=None,
                add_initializers_to=None,
+               lifted_initializer_graph=None,
                **unused_kwargs):
     """Creates a variable.
 
@@ -87,15 +87,16 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
       add_initializers_to: if not None and not in legacy graph mode, the
-        initializer tensor will be added to this map instead of adding the
+        initializer tensor will be added to this map in addition to adding the
         assignment to the function.
+      lifted_initializer_graph: FuncGraph to try to lift initializers to.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
         shape and `validate_shape` is `True`.
       RuntimeError: If called outside of a function definition.
     """
-    if context.executing_eagerly():
+    if not ops.inside_function():
       # If we've been init_scope()d out of the function definition nothing to do
       # here; we can't really do the capturing or conditional logic.
       resource_variable_ops.ResourceVariable.__init__(
@@ -112,8 +113,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -142,8 +143,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             name="initial_value", dtype=dtype)
       with ops.init_scope():
         self._handle = resource_variable_ops.eager_safe_variable_handle(
-            shape=initial_value.get_shape(),
-            dtype=initial_value.dtype.base_dtype,
+            initial_value=initial_value,
             shared_name=shared_name,
             name=name,
             graph_mode=self._in_graph_mode)
@@ -156,8 +156,14 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       if self._in_graph_mode:
         with ops.init_scope():
           outer_graph = ops.get_default_graph()
+        func_graph = ops.get_default_graph()
+        function_placeholders = (
+            func_graph.inputs + func_graph.internal_captures)
+        placeholder_ops = set(
+            [tensor.op for tensor in function_placeholders])
         lifted_initializer = lift_to_graph.lift_to_graph(
-            initial_value, outer_graph)[initial_value]
+            [initial_value], outer_graph,
+            disallowed_placeholders=placeholder_ops)[initial_value]
         with ops.init_scope():
           self._initial_value = lifted_initializer
           with ops.name_scope("IsInitialized"):
@@ -177,22 +183,21 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       else:
         if add_initializers_to is not None:
           add_initializers_to[self] = initial_value
-        else:
-          def assign_fn():
-            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
-              resource_variable_ops.assign_variable_op(
-                  self._handle,
-                  initial_value,
-                  name=n)
-              # Returning values to keep tf.cond happy.
-            return ops.convert_to_tensor(1)
-          def not_assign_fn():
-            return ops.convert_to_tensor(0)
-          # Note: this cond is always guaranteed to run because we're inside a
-          # defun which will insert automatic control dependencies.
-          control_flow_ops.cond(
-              resource_variable_ops.var_is_initialized_op(self._handle),
-              not_assign_fn, assign_fn)
+        def assign_fn():
+          with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+            resource_variable_ops.assign_variable_op(
+                self._handle,
+                initial_value,
+                name=n)
+            # Returning values to keep tf.cond happy.
+          return ops.convert_to_tensor(1)
+        def not_assign_fn():
+          return ops.convert_to_tensor(0)
+        # Note: this cond is always guaranteed to run because we're inside a
+        # defun which will insert automatic control dependencies.
+        control_flow_ops.cond(
+            resource_variable_ops.var_is_initialized_op(self._handle),
+            not_assign_fn, assign_fn)
 
     # After the handle has been created, set up a way to clean it up when
     # executing eagerly. We'll hold the only reference to the deleter, so that
@@ -205,6 +210,42 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     self._cached_shape_as_list = None
 
 
+RUN_FUNCTIONS_EAGERLY = False
+
+
+@tf_export("config.experimental_run_functions_eagerly")
+def run_functions_eagerly(run_eagerly):
+  """Enables / disables eager execution of `tf.function`s.
+
+  After calling `tf.config.experimental_run_functions_eagerly(True)` all
+  invocations of tf.function will run eagerly instead of running through a graph
+  function.
+
+  This can be useful for debugging or profiling.
+
+  Similarly, calling `tf.config.experimental_run_functions_eagerly(False)` will
+  revert the behavior of all functions to graph functions.
+
+  Args:
+    run_eagerly: Boolean. Whether to run functions eagerly.
+  """
+  global RUN_FUNCTIONS_EAGERLY
+  RUN_FUNCTIONS_EAGERLY = bool(run_eagerly)
+
+
+class FunctionDeleter(object):
+
+  def __init__(self, func_graph):
+    self.func_graph = func_graph
+
+  def __del__(self):
+    try:
+      func_graph_module.dismantle_func_graph(self.func_graph)
+    except:  # pylint: disable=bare-except
+      # Note: bare except here because this can be noisy at shutdown time.
+      pass
+
+
 class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
@@ -239,15 +280,12 @@ class Function(object):
         argspec has keyword arguments.
     """
     self._python_function = python_function
-    self._input_signature = input_signature
     # TODO(vbardiovsky): Both _stateful_fn and _stateless_fn are populating the
     # same FunctionSpec. Consider removing it from both and passing in instead.
     self._function_spec = function_lib.FunctionSpec.from_function_and_signature(
         python_function, input_signature)
     self._autograph = autograph
     self._experimental_autograph_options = experimental_autograph_options
-    if self._experimental_autograph_options is not None:
-      raise NotImplementedError()
     self._created_variables = None
     self._stateful_fn = None
     self._stateless_fn = None
@@ -259,7 +297,20 @@ class Function(object):
 
     weak_wrapped_fn = None
     def wrapped_fn(*args, **kwds):
-      with variable_scope.variable_creator_scope(scope):
+      """Wraps `self._python_function` in a variable creator scope."""
+      # We register a variable creator with reduced priority. If an outer
+      # variable creator is just modifying keyword arguments to the variable
+      # constructor, this will work harmoniously. Since the `scope` registered
+      # here actually creates the variable, it taking priority would otherwise
+      # ignore the outer creator.
+      #
+      # If an outer variable creator calls the variable constructor manually,
+      # for example creating a MirroredVariable, then they won't call our
+      # creator. This means we won't be able to trace the initialization graph,
+      # and so variable initializers can't depend on function arguments. This is
+      # better than the alternative, tracing the initialization graph but giving
+      # the user a variable type they didn't want.
+      with ops.get_default_graph()._variable_creator_scope(scope, priority=50):  # pylint: disable=protected-access
         # __wrapped__ allows AutoGraph to swap in a converted function. We give
         # the function a weak reference to itself to avoid a reference cycle.
         return weak_wrapped_fn().__wrapped__(*args, **kwds)
@@ -267,18 +318,13 @@ class Function(object):
 
     # TODO(mdan): Pipe self._experimental_autograph_options through.
     return function_lib.defun(
-        tf_decorator.make_decorator(self._python_function, wrapped_fn),
-        input_signature=self._input_signature,
-        autograph=self._autograph)
-
-  def _canonicalize_function_inputs(self, args, kwds):
-    """Canonicalize the inputs to the Python function."""
-    if self._input_signature is None or args or kwds:
-      return self._function_spec.canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
-    # If an input signature is defined, we may need to fetch a concrete function
-    # without any inputs specified. In this case args and kwds should be ignored
-    # but running _canonicalize_function_inputs would raise an exception.
-    return (), {}
+        tf_decorator.make_decorator(
+            self._python_function,
+            wrapped_fn,
+            decorator_argspec=self._function_spec.fullargspec),
+        input_signature=self.input_signature,
+        autograph=self._autograph,
+        experimental_autograph_options=self._experimental_autograph_options)
 
   def _initialize(self, args, kwds, add_initializers_to=None):
     """Initializes, on the first call.
@@ -296,11 +342,13 @@ class Function(object):
     """
 
     created_variables = []
+    lifted_initializer_graph = func_graph_module.FuncGraph("initializer")
 
     def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
       v = UnliftedInitializerVariable(
-          add_initializers_to=add_initializers_to, **kwds)
+          add_initializers_to=add_initializers_to,
+          lifted_initializer_graph=lifted_initializer_graph, **kwds)
       created_variables.append(weakref.ref(v))
       return v
 
@@ -308,6 +356,8 @@ class Function(object):
     self._stateful_fn = self._defun_with_scope(variable_capturing_scope)
     self._stateful_fn._name = self._name  # pylint: disable=protected-access
     # Force the definition of the function for these arguments
+    self._lifted_initializer_graph = lifted_initializer_graph
+    self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
     self._concrete_stateful_fn = (
         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
             *args, **kwds))
@@ -321,8 +371,36 @@ class Function(object):
     self._stateless_fn = self._defun_with_scope(invalid_creator_scope)
     self._stateless_fn._name = self._name  # pylint: disable=protected-access
 
+  def _decorate(self, decorator):
+    """Allows the captured Python function to be decorated in place.
+
+    This method is only safe to call when the Function has not been called by a
+    user. It makes sense to use this method to push a decorator into the
+    function rather than wrapping the function in the decorator.
+
+    We use this in tf.Module to allow user annotated `tf.functions` to remain as
+    `Function` objects but still automatically enter the Module name_scope
+    when they are evaluated like all other methods.
+
+    Args:
+      decorator: A callable accepting a single argument which is the function
+        to decorate and returning a callable result.
+
+    Raises:
+      ValueError: If the function has been called a ValueError is raised.
+    """
+    if self._stateful_fn is not None or self._stateless_fn is not None:
+      raise ValueError(
+          "Functions cannot be decorated after they have been traced.")
+
+    self._python_function = decorator(self._python_function)
+    self._function_spec = function_lib.FunctionSpec.from_function_and_signature(
+        self._python_function, self.input_signature)
+
   def __call__(self, *args, **kwds):
     """Calls the graph function."""
+    if RUN_FUNCTIONS_EAGERLY:
+      return self._python_function(*args, **kwds)
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
       # defunned version which is guaranteed to never create variables.
@@ -337,10 +415,24 @@ class Function(object):
       return results
 
     # This is the first call of __call__, so we have to initialize.
-    self._initialize(args, kwds)
-    canon_args, canon_kwds = self._canonicalize_function_inputs(args, kwds)
-
-    if not self._created_variables:
+    initializer_map = {}
+    self._initialize(args, kwds, add_initializers_to=initializer_map)
+    if self._created_variables:
+      try:
+        # Attempt to initialize variables eagerly and without conds by lifting
+        # out initialization graphs. This is the only initialization strategy
+        # compatible with XLA at the moment.
+        self._initialize_uninitialized_variables(initializer_map)
+      except lift_to_graph.UnliftableError:
+        pass  # Fall through to cond-based initialization.
+      else:
+        # Lifting succeeded, so variables are initialized and we can run the
+        # stateless function.
+        return self._stateless_fn(*args, **kwds)
+    else:
+      canon_args, canon_kwds = \
+          self._stateful_fn._function_spec.canonicalize_function_inputs(  # pylint: disable=protected-access
+              *args, **kwds)
       # If we did not create any variables the trace we have is good enough.
       return self._concrete_stateful_fn._filtered_call(canon_args, canon_kwds)  # pylint: disable=protected-access
 
@@ -395,6 +487,11 @@ class Function(object):
           functools.partial(self._concrete_stateful_fn._filtered_call,  # pylint: disable=protected-access
                             inner_args, inner_kwds))
 
+    # We've created variables and are unable to lift the initialization graphs,
+    # so we fall back to initializing with conds while running the function.
+    canon_args, canon_kwds = \
+        self._stateful_fn._function_spec.canonicalize_function_inputs(  # pylint: disable=protected-access
+            *args, **kwds)
     return function_lib.defun(fn_with_cond)(*canon_args, **canon_kwds)
 
   @property
@@ -404,12 +501,30 @@ class Function(object):
 
   @property
   def input_signature(self):
-    return self._input_signature
+    return self._function_spec.input_signature
 
   @property
   def function_spec(self):
     return self._function_spec
 
+  def _initialize_uninitialized_variables(self, initializer_map):
+    """Make and call a `ConcreteFunction` which initializes variables."""
+
+    # Note: using defun here avoids an infinite recursion.
+    # Note: there is no reason not to autograph once the overhead is negligible.
+    @function_lib.defun(autograph=False)  # tf.function internal, pure graph
+    def initialize_variables():
+      for v, init in initializer_map.items():
+        with ops.init_scope():
+          if resource_variable_ops.var_is_initialized_op(v.handle):
+            # Ignore variables which are already initialized at trace time.
+            continue
+        v.assign(lift_to_graph.lift_to_graph(
+            [init], ops.get_default_graph())[init])
+
+    with ops.init_scope():
+      return initialize_variables.get_concrete_function()()
+
   def get_initialization_function(self, *args, **kwargs):
     """Returns a `ConcreteFunction` which initializes this function's variables.
 
@@ -418,6 +533,9 @@ class Function(object):
     function which does not depend on the concrete values of the inputs to this
     function.
 
+    Note that running this function will overwrite any values currently assigned
+    to variables, for example restores from a checkpoint.
+
     Args:
       *args: arguments to the underlying python callable.
       **kwargs: keyword arguments to the python callable.
@@ -443,7 +561,7 @@ class Function(object):
     def initialize_variables():
       for v, init in initializer_map.items():
         v.assign(lift_to_graph.lift_to_graph(
-            init, ops.get_default_graph())[init])
+            [init], ops.get_default_graph())[init])
 
     return initialize_variables.get_concrete_function()
 
@@ -453,14 +571,16 @@ class Function(object):
     Returns:
       A list of instances of `Function`.
     """
-    if self._input_signature is not None:
+    if self.input_signature is not None:
       self.get_concrete_function()
     concrete_functions = []
     # pylint: disable=protected-access
     if self._stateful_fn:
-      concrete_functions.extend(self._stateful_fn._function_cache.values())
+      concrete_functions.extend(
+          self._stateful_fn._function_cache.all_values())
     if self._stateless_fn:
-      concrete_functions.extend(self._stateless_fn._function_cache.values())
+      concrete_functions.extend(
+          self._stateless_fn._function_cache.all_values())
     # pylint: enable=protected-access
     deduplicated_concrete_functions = list()
     seen_signatures = list()
@@ -560,9 +680,10 @@ class Function(object):
     Raises:
       ValueError: if this object has not yet been called on concrete values.
     """
-    assert context.executing_eagerly()
     if self._stateful_fn is None:
-      self.get_initialization_function(*args, **kwargs)()
+      initializer_map = {}
+      self._initialize(args, kwargs, add_initializers_to=initializer_map)
+      self._initialize_uninitialized_variables(initializer_map)
 
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
@@ -605,8 +726,7 @@ class Function(object):
     return self._descriptor_cache[instance]
 
 
-# In TensorFlow 1.x, exported as tf.contrib.eager.function
-@tf_export("function", v1=[])
+@tf_export("function")
 def function(func=None,
              input_signature=None,
              autograph=True,
@@ -663,6 +783,9 @@ def function(func=None,
       l.append(i)                           # Caution! Doesn't work.
   ```
 
+  Note that unlike other TensorFlow operations, we don't convert python
+  numerical inputs to tensors.
+
   _Referencing `tf.Variable`s_
 
   The Python function `func` may reference stateful objects (such as
@@ -763,8 +886,8 @@ def function(func=None,
   def f(x): return tf.add(x, 1.)
   ```
 
-  When an `input_signature` is specified, the callable will only accept `Tensor`
-  (or NumPy `ndarray`) objects as arguments.
+  When an `input_signature` is specified, the callable will convert the inputs
+  to the specified TensorSpecs.
 
   _Tracing and staging_
 
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index e0a82bb0be6fb2fbc0791dfc4f7150dfc2b1337e..193f8195937ccfed886166087389e3275a9856d1 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -21,16 +21,23 @@ import functools
 import weakref
 
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
@@ -61,6 +68,21 @@ class _HasDecoratedMethod(object):
   def f(self, x):
     return x * 3.
 
+# pylint: disable=bad-continuation,anomalous-backslash-in-string
+MIXING_GRAPH_EAGER_TENSORS_ERROR = (
+"""An op outside of the function building code is being passed
+a "Graph" tensor. It is possible to have Graph tensors
+leak out of the function building context by including a
+tf.init_scope in your function building code.
+For example, the following function will fail:
+  @tf.function
+  def has_init_scope\(\):
+    my_constant = tf.constant\(1.\)
+    with tf.init_scope\(\):
+      added = my_constant \* 2
+The graph tensor has name: Const:0""")
+# pylint: enable=bad-continuation,anomalous-backslash-in-string
+
 
 class DefFunctionTest(test.TestCase):
 
@@ -192,7 +214,8 @@ class DefFunctionTest(test.TestCase):
           state.append(variables.Variable(2.0 * x))
         return state[0] * x
 
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(
+          lift_to_graph.UnliftableError, r'transitively.* mul .* x'):
         fn(constant_op.constant(3.0))
 
   def testMethod(self):
@@ -223,6 +246,14 @@ class DefFunctionTest(test.TestCase):
         def_function.function(functools.partial(lambda x, y: x + y, 1.))(
             constant_op.constant(2.)))
 
+  def test_functools_partial_keywords(self):
+    def f(x, y):
+      return x + y
+
+    func = def_function.function(
+        functools.partial(f, x=array_ops.zeros([1]), y=array_ops.zeros([1])))
+    self.assertAllEqual(func(), [0.0])
+
   def test_unspecified_default_argument(self):
     wrapped = def_function.function(
         lambda x, y=2: x + y,
@@ -249,7 +280,73 @@ class DefFunctionTest(test.TestCase):
     self.assertAllClose(4., concrete(constant_op.constant(2.)))
     signature_args, _ = concrete.structured_input_signature
     self.assertEqual(signature_args,
-                     (tensor_spec.TensorSpec(None, dtypes.float32),))
+                     (tensor_spec.TensorSpec(
+                         None, dtypes.float32, name='x'),))
+
+  def test_concrete_function_keyword_arguments(self):
+    @def_function.function
+    def f(x):
+      return x
+
+    conc = f.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.float32, 'y'))
+    conc(y=constant_op.constant(3.0))
+    signature_args, _ = conc.structured_input_signature
+    self.assertEqual('y', signature_args[0].name)
+
+    conc = f.get_concrete_function(tensor_spec.TensorSpec(None, dtypes.float32))
+    conc(x=constant_op.constant(3.0))
+    signature_args, _ = conc.structured_input_signature
+    self.assertEqual('x', signature_args[0].name)
+
+    @def_function.function
+    def g(x):
+      return x[0]
+
+    conc = g.get_concrete_function(
+        [tensor_spec.TensorSpec(None, dtypes.float32, 'z'), 2])
+    conc(z=constant_op.constant(3.0))
+    signature_args, _ = conc.structured_input_signature
+    self.assertEqual('z', signature_args[0][0].name)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'either zero or all names have to be specified'):
+      conc = g.get_concrete_function([
+          tensor_spec.TensorSpec(None, dtypes.float32, 'z'),
+          tensor_spec.TensorSpec(None, dtypes.float32),
+      ])
+
+  def test_error_inner_capture(self):
+
+    @def_function.function
+    def f(inputs):
+      num_steps, _ = inputs.shape[:2]
+      outputs = []
+      for t in math_ops.range(num_steps):
+        outputs.append(inputs[t])
+      return outputs
+
+    with self.assertRaisesRegexp(ValueError, 'inner'):
+      f(array_ops.zeros(shape=(8, 42, 3)))
+
+  def testRuntimeErrorNotSticky(self):
+
+    @def_function.function
+    def fail(i):
+      control_flow_ops.Assert(math_ops.equal(i, 0), ['ick'])
+
+    fail(constant_op.constant(0))  # OK
+    with self.assertRaises(errors.InvalidArgumentError):
+      fail(constant_op.constant(1))  # InvalidArgument: "ick"
+    fail(constant_op.constant(0))  # OK
+
+  def testUnderscoreName(self):
+
+    @def_function.function
+    def f(_):
+      return _ + _
+
+    self.assertAllEqual(2.0, f(constant_op.constant(1.0)))
 
   def test_serialization_signature_cache(self):
 
@@ -269,10 +366,10 @@ class DefFunctionTest(test.TestCase):
 
     self.assertEqual(
         signatures_args,
-        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32),
-              tensor_spec.TensorSpec([1], dtypes.float32)),
-             (tensor_spec.TensorSpec([1, 3], dtypes.int32),
-              tensor_spec.TensorSpec([1], dtypes.int32)))))
+        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.float32, name='y')),
+             (tensor_spec.TensorSpec([1, 3], dtypes.int32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.int32, name='y')))))
 
   @test_util.assert_no_garbage_created
   def testFunctionReferenceCycles(self):
@@ -296,6 +393,152 @@ class DefFunctionTest(test.TestCase):
     # function itself is not involved in a reference cycle.
     self.assertIs(None, weak_fn())
 
+  def testErrorMessageWhenGraphTensorIsPassedToEager(self):
+
+    @def_function.function
+    def failing_function():
+      a = constant_op.constant(1.)
+
+      with ops.init_scope():
+        _ = a + a
+
+    with self.assertRaisesRegexp(TypeError, MIXING_GRAPH_EAGER_TENSORS_ERROR):
+      failing_function()
+
+  def testVariableCreatorScope(self):
+    created_variables = []
+    captured_variables = []
+
+    @def_function.function
+    def f():
+      if not created_variables:
+        created_variables.append(variables.Variable(1.))
+      return created_variables[0] + 1.
+
+    def capture_creator(next_creator, **kwargs):
+      created = next_creator(**kwargs)
+      captured_variables.append(created)
+      return created
+
+    with variable_scope.variable_creator_scope(capture_creator):
+      f()
+    self.assertEqual(created_variables, captured_variables)
+
+  def testVarAlreadyInitializedNoClobbering(self):
+    v_holder = []
+
+    @def_function.function
+    def add_var(x):
+      if not v_holder:
+        v = variables.Variable([1., 2.])
+        v_holder.append(v)
+        already_initialized = variables.Variable(3.)
+        with ops.init_scope():
+          already_initialized.assign(10.)
+        v_holder.append(already_initialized)
+      return v_holder[0] + v_holder[1] + x
+
+    add_var.get_concrete_function(constant_op.constant(2.))
+    self.assertAllClose([13., 14.], add_var(constant_op.constant(2.)))
+
+  def testSameVariableTwice(self):
+
+    v = variables.Variable(1.0)
+
+    @def_function.function
+    def add(a, b):
+      return a + b
+
+    self.assertAllEqual(add(v, v), 2.0)
+
+  def testShapeCache(self):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    func_a = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+    func_b = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertIs(func_a, func_b)
+
+  def testInitializationInNestedCall(self):
+    v_holder = []
+
+    @def_function.function
+    def add_var(x):
+      if not v_holder:
+        v = variables.Variable([1., 2.])
+        v_holder.append(v)
+        already_initialized = variables.Variable(3.)
+        with ops.init_scope():
+          already_initialized.assign(10.)
+        v_holder.append(already_initialized)
+      return v_holder[0] + v_holder[1] + x
+
+    @def_function.function
+    def wrapper(x):
+      return add_var(x)
+
+    self.assertAllClose([13., 14.], wrapper(constant_op.constant(2.)))
+    v_holder[1].assign(11.)
+    self.assertAllClose([14., 15.], wrapper(constant_op.constant(2.)))
+
+  def testDeviceAnnotationRespected(self):
+    if not context.num_gpus():
+      self.skipTest("Needs multiple devices")
+
+    a = []
+
+    @def_function.function()
+    def create_variable():
+      with ops.init_scope():
+        initial_value = random_ops.random_uniform(
+            (2, 2), maxval=1000000, dtype=dtypes.int64)
+
+      if not a:
+        with ops.device("CPU:0"):
+          a.append(resource_variable_ops.ResourceVariable(initial_value))
+
+      return a[0].read_value()
+
+    created_variable_read = create_variable()
+    self.assertRegexpMatches(created_variable_read.device, "CPU")
+
+  def testDecorate(self):
+    func = def_function.function(lambda: 1)
+    def decorator(f):
+      return lambda: 1 + f()
+
+    func._decorate(decorator)
+    self.assertEqual(func().numpy(), 2)
+
+  def testLiftPlaceholderInitializedVariable(self):
+    with ops.Graph().as_default():
+      var_list = []
+
+      @def_function.function
+      def use_variable():
+        if not var_list:
+          initial_value = array_ops.placeholder(shape=[], dtype=dtypes.float32)
+          v = variables.Variable(initial_value)
+          var_list.append(v)
+        return var_list[0] + 1.
+
+      var_plus_one = use_variable()
+      with self.session() as session:
+        init_op = var_list[0].initializer
+        session.run(init_op, feed_dict={init_op.inputs[1]: 2.})
+        self.assertEqual(3., session.run(var_plus_one))
+
+  def testDecorate_rejectedAfterTrace(self):
+    func = def_function.function(lambda: 1)
+    self.assertEqual(func().numpy(), 1)
+    msg = 'Functions cannot be decorated after they have been traced.'
+    with self.assertRaisesRegexp(ValueError, msg):
+      func._decorate(lambda f: f)
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/eager/def_function_xla_test.py b/tensorflow/python/eager/def_function_xla_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9115d8a6943532fb87f1514ee20354067015a7d8
--- /dev/null
+++ b/tensorflow/python/eager/def_function_xla_test.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class DefFunctionTests(xla_test.XLATestCase):
+
+  def testVarInitializedInFunction(self):
+    with self.test_scope():
+      v_holder = []
+
+      @def_function.function
+      def add_var(x):
+        if not v_holder:
+          v = variables.Variable([1., 2.])
+          v_holder.append(v)
+          already_initialized = variables.Variable(3.)
+          with ops.init_scope():
+            already_initialized.assign(10.)
+          v_holder.append(already_initialized)
+        return v_holder[0] + v_holder[1] + x
+
+      self.assertAllClose([13., 14.], add_var(constant_op.constant(2.)))
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 9c3e20f93669b75b6b349844673f2fcb166b9980..ff92f8c3dbfd1913509d5cc97c85963f9060e110 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 
 import collections
 import functools
-import re
-import sys
 import threading
 import types as types_lib
 import weakref
@@ -40,15 +38,15 @@ from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
@@ -58,23 +56,132 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-# This is to avoid a circular dependency with gradients_impl
-gradients_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
 
 FORWARD_FUNCTION_ATTRIBUTE_NAME = "forward_function_name"
 BACKWARD_FUNCTION_ATTRIBUTE_NAME = "backward_function_name"
 
-# TODO(scottzhu): Update this to allow arbitrary attribute names in future.
-WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
-    "experimental_.*",
-    FORWARD_FUNCTION_ATTRIBUTE_NAME,
-    BACKWARD_FUNCTION_ATTRIBUTE_NAME
-]
 
 CacheKey = collections.namedtuple("CacheKey", [
-    "input_signature", "parent_graph", "device_functions", "colocation_stack",
-    "uses_xla"
-])
+    "input_signature", "parent_graph", "device_functions",
+    "colocation_stack"])
+
+CacheKey.replace = CacheKey._replace  # pylint: disable=protected-access
+
+
+def _flat_shape_list(*params):
+  """Return a flat list of TensorShapes, one for each tensor[spec] in `*params`.
+
+  Args:
+    *params: Set of nested entries containing Tensors, TensorSpec, and
+      non-tensors.
+
+  Returns:
+    A list of entries containing either `None` or `TensorShape`.
+  """
+  return [tensor_shape.TensorShape(x.shape)
+          if isinstance(x, (ops.Tensor, tensor_spec.TensorSpec)) else None
+          for x in nest.flatten(params)]
+
+
+def _shape_less_specific_than(relaxed, to_check):
+  """Checks if `relaxed` is less specific than `to_check`.
+
+  This is an asymmetric check, unlike `TensorShape.is_compatible_with`. If
+  `to_check` has a dimension with an undefined shape, `relaxed` must also have
+  an undefined shape for that dimension.
+
+  Args:
+    relaxed: A `TensorShape` to check against.
+    to_check: A second `TensorShape`.
+
+  Returns:
+    True if `to_check` represents a set of shapes which is a subset of
+    `relaxed`'s shapes and False otherwise.
+  """
+  if to_check.dims is not None and relaxed.dims is not None:
+    if to_check.rank != relaxed.rank:
+      return False
+    for check_dim, relaxed_dim in zip(to_check.dims, relaxed.dims):
+      if check_dim.value is None and relaxed_dim.value is not None:
+        return False
+      if not relaxed_dim.is_compatible_with(check_dim):
+        return False
+  return True
+
+
+def _compatible_shapes(flat_relaxed, flat_to_check):
+  """Check if lists of TensorShapes contain compatible shapes.
+
+  Checks that each `flat_relaxed` shape covers a superset of the shapes of the
+  corresponding `flat_to_check` shape.
+
+  Args:
+    flat_relaxed: List of TensorShape or None.
+    flat_to_check: List of TensorShape or None.
+
+  Returns:
+    A python bool.
+
+  Raises:
+    RuntimeError:
+      if `len(flat_relaxed) != len(flat_to_check)`.
+    RuntimeError:
+      if `flat_relaxed[i] is None != flat_to_check[i] is None` for any `i`.
+  """
+
+  if len(flat_relaxed) != len(flat_to_check):
+    raise RuntimeError("Expected shape lists of identical lengths, but saw: "
+                       "%s and %s" % (flat_relaxed, flat_to_check))
+  def is_compatible(relaxed, to_check):
+    """Internal help function.
+
+    Args:
+      relaxed: TensorShape or None.
+      to_check: TensorShape or None.
+
+    Returns:
+      Python bool.
+
+    Raises:
+      RuntimeError: If `relaxed is None != to_check is None`.
+    """
+    # If both x and y are None, there is no shape to compare.  Otherwise check
+    # if they are compatible with each other.  Either way, both input signatures
+    # must have have Tensors in the same entries.  If not, raise an assertion
+    # error.
+    if relaxed is None != to_check is None:
+      raise RuntimeError(
+          "Expected signature type matches between flattened input shapes "
+          "%s and %s; but saw that (%s is None) != (%s is None)"
+          % (flat_relaxed, flat_to_check, relaxed, to_check))
+    return relaxed is None or _shape_less_specific_than(relaxed, to_check)
+  return all(is_compatible(relaxed, to_check)
+             for relaxed, to_check in zip(flat_relaxed, flat_to_check))
+
+
+def _common_shape(x, y):
+  """Find a `TensorShape` that is compatible with both `x` and `y`."""
+  if x is None != y is None:
+    raise RuntimeError(
+        "Cannot find a common shape when LHS shape is None but RHS shape "
+        "is not (or vice versa): %s vs. %s" % (x, y))
+  if x is None:
+    return None  # The associated input was not a Tensor, no shape generated.
+  if not isinstance(x, tensor_shape.TensorShape):
+    raise TypeError("Expected x to be a TensorShape but saw %s" % (x,))
+  if not isinstance(y, tensor_shape.TensorShape):
+    raise TypeError("Expected y to be a TensorShape but saw %s" % (y,))
+  if x.rank != y.rank or x.rank is None:
+    return tensor_shape.TensorShape(None)
+  dims = []
+  for dim_x, dim_y in zip(x.dims, y.dims):
+    if (dim_x != dim_y
+        or tensor_shape.dimension_value(dim_x) is None
+        or tensor_shape.dimension_value(dim_y) is None):
+      dims.append(None)
+    else:
+      dims.append(tensor_shape.dimension_value(dim_x))
+  return tensor_shape.TensorShape(dims)
 
 
 def is_same_structure(structure1,
@@ -111,12 +218,6 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not any(re.match(reg, key)
-               for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX):
-      raise ValueError("Attribute name is not whitelisted. "
-                       "Whitelisted: prefix %s, got: %s" %
-                       (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
-
     if isinstance(value, attr_value_pb2.AttrValue):
       attrs[key] = value
     # bool type check has to happen before int since bool is a subclass of int.
@@ -222,6 +323,8 @@ class _EagerDefinedFunction(object):
         [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
         [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
         [],
+        [o._c_op for o in graph.control_outputs],  # pylint: disable=protected-access
+        [],  # control_output_names
         None,
         compat.as_str(""))
 
@@ -248,6 +351,7 @@ class _EagerDefinedFunction(object):
     self._num_outputs = len(self.signature.output_arg)
     self._output_types = [o.type for o in self.signature.output_arg]
     self._output_shapes = [o.shape for o in outputs]
+    self._control_captures = graph.control_captures
     self._func_graph_outputs = outputs
     self.grad_func_name = None
     self.python_grad_func = None
@@ -293,7 +397,7 @@ class _EagerDefinedFunction(object):
           "Arguments and signature arguments do not match: %s %s " %
           (len(args), len(list(self.signature.input_arg))))
 
-    function_call_options = ctx.get_function_call_options()
+    function_call_options = ctx.function_call_options
     if function_call_options.config_proto_serialized is None:
       config = function_utils.get_disabled_rewriter_config()
     else:
@@ -312,36 +416,19 @@ class _EagerDefinedFunction(object):
             ctx=ctx)
       # Replace empty list with None
       outputs = outputs or None
-    elif self._graph._xla_compile:  # pylint: disable=protected-access
-      g = ops.get_default_graph()
-      self.add_to_graph(g)
-      signature = self.signature
-      op = g.create_op(
-          signature.name,
-          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
-          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
-          op_def=signature,
-          name="FunctionCall",
-          compute_shapes=False)
-      outputs = op.outputs
-      if not outputs:
-        return op
-      if isinstance(outputs, (ops.Tensor, type(None))):
-        outputs = [outputs]
-      else:
-        outputs = list(outputs)
     else:
       # TODO(akshayka): Either remove this if the FunctionLibraryRuntime
       # creates `PartitionedCallOp` kernels by default, or remove the previous
       # branch if a TPU kernel is registered for `PartitionedCall`.
       with _InterpolateFunctionError(self):
-        outputs = functional_ops.partitioned_call(
-            args=args,
-            f=self,
-            tout=self._output_types,
-            executing_eagerly=executing_eagerly,
-            config=config,
-            executor_type=executor_type)
+        with ops.control_dependencies(self._control_captures):
+          outputs = functional_ops.partitioned_call(
+              args=args,
+              f=self,
+              tout=self._output_types,
+              executing_eagerly=executing_eagerly,
+              config=config,
+              executor_type=executor_type)
 
     if executing_eagerly:
       return outputs
@@ -426,21 +513,30 @@ class ConcreteFunction(object):
           "through the public interface. Use get_concrete_function instead.")
     if len(args) > self._num_positional_args:
       raise TypeError(
-          ("Expected at most {} positional arguments ({}), got {}. When "
-           "calling a concrete function, positional arguments may not be bound "
-           "to Tensors within nested structures.").format(
-               self._num_positional_args,
-               self._arg_keywords[:self._num_positional_args],
-               args))
+          ("Expected at most {} positional arguments (and the rest keywords, "
+           "of {}), got {}. When calling a concrete function, positional "
+           "arguments may not be bound to Tensors within nested structures."
+          ).format(self._num_positional_args, self._arg_keywords, args))
     args = list(args)
     for keyword in self._arg_keywords[len(args):]:
-      args.append(kwargs.pop(compat.as_str(keyword)))
+      try:
+        args.append(kwargs.pop(compat.as_str(keyword)))
+      except KeyError:
+        specified_keywords = (list(self._arg_keywords[:len(args)])
+                              + list(kwargs.keys()))
+        raise TypeError(
+            "Expected argument names {} but got values for {}. Missing: {}."
+            .format(
+                list(self._arg_keywords),
+                specified_keywords,
+                list(set(self._arg_keywords) - set(specified_keywords))))
     if kwargs:
       positional_arg_keywords = set(self._arg_keywords[:len(args)])
       for unused_key in kwargs:
         if unused_key in positional_arg_keywords:
           raise TypeError("Got two values for keyword '{}'.".format(unused_key))
-      raise TypeError("Keyword arguments {} unknown.".format(kwargs.keys()))
+      raise TypeError("Keyword arguments {} unknown. Expected {}.".format(
+          list(kwargs.keys()), list(self._arg_keywords)))
     return self._call_flat(args)
 
   def _filtered_call(self, args, kwargs):
@@ -478,11 +574,17 @@ class ConcreteFunction(object):
     tape.variables_accessed(self._func_graph.variables)
 
     tensor_inputs = []
+    variables_used = set([])
     for i, arg in enumerate(args):
       if isinstance(arg, resource_variable_ops.ResourceVariable):
+        # We can pass a variable more than once, and in this case we need to
+        # pass its handle only once.
+        if arg.handle in variables_used:
+          continue
         if arg.trainable:
           tape.variable_accessed(arg)
         tensor_inputs.append(arg.handle)
+        variables_used.add(arg.handle)
       elif isinstance(arg, ops.Tensor):
         tensor_inputs.append(arg)
       elif (self._signature is not None and
@@ -633,7 +735,7 @@ class ConcreteFunction(object):
     # In case of eager execution, function definition gets added to context
     # during construction itself.
 
-    # TODO(allel/shivaniagrawal): rename this to register to reflect the
+    # TODO(allenl/shivaniagrawal): rename this to register to reflect the
     # method's functionality better. Remove register_gradient_functions argument
     # and figure out if these needs to be registered.
 
@@ -664,12 +766,12 @@ class ConcreteFunction(object):
         _backward_name(self._func_graph.name))
     forward_function_name = _forward_name(self._func_graph.name)
     outputs = [x for x in self._func_graph.outputs
-               if gradients_impl.IsTrainable(x)]
+               if gradients_util.IsTrainable(x)]
     with backwards_graph.as_default():
       gradients_wrt_outputs = [
           graph_placeholder(x.dtype, x.shape) for x in outputs
       ]
-      gradients_wrt_inputs = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
+      gradients_wrt_inputs = gradients_util._GradientsHelper(  # pylint: disable=protected-access
           outputs,
           self._func_graph.inputs,
           grad_ys=gradients_wrt_outputs,
@@ -689,7 +791,8 @@ class ConcreteFunction(object):
     # Clear captures, since we pass them in as inputs.
     backwards_graph.captures = {}
     backwards_graph.outputs.extend(
-        grad for grad in func_graph_module.flatten(gradients_wrt_inputs)
+        grad
+        for grad in nest.flatten(gradients_wrt_inputs, expand_composites=True)
         if grad is not None)
     backwards_graph.structured_outputs = gradients_wrt_inputs
     self._backward_graph_function = ConcreteFunction(
@@ -737,7 +840,7 @@ class ConcreteFunction(object):
     # the forward graph function so that we can compute its gradient.
     real_outputs = outputs[:self._num_outputs]
     skip_positions = [i for i, t in enumerate(real_outputs)
-                      if not gradients_impl.IsTrainable(t)]
+                      if not gradients_util.IsTrainable(t)]
     side_outputs = outputs[self._num_outputs:]
 
     def backward_function(*args):
@@ -843,6 +946,10 @@ class FunctionSpec(object):
       python_function_to_inspect = python_function.func
       args_to_prepend = python_function.args or tuple()
       kwargs_to_include = python_function.keywords or {}
+      if input_signature is not None:
+        # TODO(b/124441704): Add support for input_signature + partial.
+        raise NotImplementedError(
+            "Missing support for input_signature when using partial functions.")
     else:
       python_function_to_inspect = python_function
       args_to_prepend = tuple()
@@ -944,6 +1051,21 @@ class FunctionSpec(object):
         argument when an input signature is specified, or when the inputs
         do not conform to the input signature.
     """
+    if self._input_signature is not None:
+      if len(args) > len(self._input_signature):
+        raise TypeError(
+            "When input_signature is provided, only pass arguments "
+            "covered by it. Received %d argument(s)." % len(args))
+      for arg in six.iterkeys(kwargs):
+        index = self._args_to_indices.get(arg, None)
+        if index is None:
+          raise TypeError(
+              "Function got an unexpected keyword argument %s" % arg)
+        if index >= len(self._input_signature):
+          raise TypeError(
+              "When input_signature is provided, only pass arguments "
+              "covered by it. Received argument %s." % arg)
+
     args = self._args_to_prepend + args
     kwargs = dict(kwargs, **self._kwargs_to_include)
     if not kwargs:
@@ -975,42 +1097,110 @@ class FunctionSpec(object):
         # opposed to named arguments called in a keyword-like fashion.
         kwargs.pop(arg)
       inputs = args + _deterministic_dict_values(arg_indices_to_values)
-    flat_inputs = nest.flatten(inputs)
-
-    # Check for NumPy arrays in arguments and convert them to Tensors.
-    # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
-    # finding a way to store them directly in the cache key (currently not
-    # possible since ndarrays are not hashable).
-    need_packing = False
-    for index, value in enumerate(flat_inputs):
-      if type(value) == np.ndarray:
-        flat_inputs[index] = constant_op.constant(value)
-        need_packing = True
-    if need_packing:
-      inputs = nest.pack_sequence_as(
-          structure=inputs, flat_sequence=flat_inputs)
+
     if self._input_signature is None:
+      inputs = _convert_numpy_inputs(inputs)
       return inputs, kwargs
     else:
       assert not kwargs
-      signature_relevant_inputs = inputs[:len(self._input_signature)]
-      if not is_same_structure(self._input_signature,
-                               signature_relevant_inputs):
-        raise ValueError("Structure of Python function inputs does not match "
-                         "input_signature.")
-      signature_inputs_flat = nest.flatten(signature_relevant_inputs)
-      if any(
-          not pywrap_tensorflow.IsTensor(arg) for arg in signature_inputs_flat):
-        raise ValueError("When input_signature is provided, all inputs to "
-                         "the Python function must be Tensors.")
-      if any(not spec.is_compatible_with(other) for spec, other in zip(
-          self._flat_input_signature, signature_inputs_flat)):
-        raise ValueError("Python inputs incompatible with input_signature: "
-                         "inputs (%s), input_signature (%s)" %
-                         (str(inputs), str(self._input_signature)))
+      inputs = _convert_inputs_to_signature(
+          inputs,
+          self._input_signature,
+          self._flat_input_signature)
       return inputs, {}
 
 
+def _convert_numpy_inputs(inputs):
+  """Convert numpy array inputs to tensors."""
+  flat_inputs = nest.flatten(inputs)
+
+  # Check for NumPy arrays in arguments and convert them to Tensors.
+  # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
+  # finding a way to store them directly in the cache key (currently not
+  # possible since ndarrays are not hashable).
+  need_packing = False
+  for index, value in enumerate(flat_inputs):
+    if type(value) == np.ndarray:
+      flat_inputs[index] = constant_op.constant(value)
+      need_packing = True
+  if need_packing:
+    return nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+  else:
+    return inputs
+
+
+def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature):
+  """Convert inputs to pass into a function with an explicit signature."""
+  try:
+    # TODO(b/124370185): Use all elements as inputs to throw an error if there
+    # are ignored arguments. Calling with arguments that are not part of the
+    # signature should throw an error.
+    flatten_inputs = nest.flatten_up_to(
+        input_signature,
+        inputs[:len(input_signature)])
+  except ValueError:
+    raise ValueError("Structure of Python function inputs does not match "
+                     "input_signature. Inputs (%s), input_signature(%s)." %
+                     (str(inputs), str(input_signature)))
+
+  need_packing = False
+  for index, (value, spec) in enumerate(zip(flatten_inputs,
+                                            flat_input_signature)):
+    if not pywrap_tensorflow.IsTensor(value):
+      try:
+        flatten_inputs[index] = ops.convert_to_tensor(
+            value, dtype_hint=spec.dtype)
+        need_packing = True
+      except ValueError:
+        raise ValueError("When input_signature is provided, all inputs to "
+                         "the Python function must be convertible to tensors."
+                         "Inputs (%s), input_signature(%s)." %
+                         (str(inputs), str(input_signature)))
+
+  if any(not spec.is_compatible_with(other) for spec, other in zip(
+      flat_input_signature,
+      flatten_inputs)):
+    raise ValueError("Python inputs incompatible with input_signature: "
+                     "inputs (%s), input_signature (%s)" %
+                     (str(inputs), str(input_signature)))
+
+  if need_packing:
+    inputs = nest.pack_sequence_as(
+        structure=input_signature,
+        flat_sequence=flatten_inputs)
+
+  return inputs
+
+
+class FunctionCache(object):
+  """A lightweight container for cached functions.
+  """
+
+  def __init__(self):
+    # The set of functions that have been missed; entries are CacheKey with
+    # input_signature `None` (e.g. a "call context key")
+    self.missed = set()
+    # The primary cache, mapping a fully shaped CacheKey to a function.
+    self.primary = collections.OrderedDict()
+    # A cache key lookup, mapping a CacheKey generated without shape info to a
+    # flat list of relaxed shapes (one for each argument).  Arguments that are
+    # not Tensors contain a `None` for the corresponding relaxed shape.
+    self.arg_relaxed_shapes = collections.OrderedDict()
+    # The secondary cache, mapping a CacheKey generated without shape info to a
+    # function.
+    self.arg_relaxed = collections.OrderedDict()
+    # All OrderedDicts require manual garbage collection.
+    self._garbage_collectors = [
+        _FunctionGarbageCollector(self.primary),
+        _FunctionGarbageCollector(self.arg_relaxed),
+        _FunctionGarbageCollector(self.arg_relaxed_shapes)]
+
+  def all_values(self):
+    """A set of all `ConcreteFunction` instances held by this cache."""
+    return set(self.primary.values()) | set(self.arg_relaxed.values())
+
+
 class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
@@ -1028,7 +1218,9 @@ class Function(object):
                name,
                input_signature=None,
                attributes=None,
-               autograph=True):
+               autograph=True,
+               autograph_options=None,
+               capture_by_value=None):
     """Initializes a `Function`.
 
     Args:
@@ -1041,7 +1233,13 @@ class Function(object):
         of the function.
       autograph: whether to use autograph to compile
         `python_function`. See https://www.tensorflow.org/guide/autograph for
-          more information.
+        more information.
+      autograph_options: Experimental knobs to control behavior
+        `when autograph=True`. See https://www.tensorflow.org/guide/autograph
+        for more information.
+      capture_by_value: Experimental. Whether to capture resource variables by
+        value or reference. If None, will inherit from a parent context or
+        default to False.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
@@ -1055,9 +1253,10 @@ class Function(object):
         python_function, input_signature)
     self._name = name
     self._autograph = autograph
-    self._function_cache = collections.OrderedDict()
-    self._garbage_collector = _FunctionGarbageCollector(self._function_cache)
+    self._autograph_options = autograph_options
+    self._function_cache = FunctionCache()
     self._function_attributes = attributes or {}
+    self._capture_by_value = capture_by_value
 
     self._lock = threading.Lock()
     # _descriptor_cache is a of instance of a class to an instance-specific
@@ -1080,18 +1279,18 @@ class Function(object):
     return self._function_spec
 
   @property
-  def _input_signature(self):
-    """Returns the wrapped Python function."""
-    return self._function_spec.input_signature  # pylint: disable=protected-access
+  def input_signature(self):
+    """Returns the input signature."""
+    return self._function_spec.input_signature
 
   @property
-  def _flat_input_signature(self):
-    """Returns the wrapped Python function."""
-    return self._function_spec.flat_input_signature  # pylint: disable=protected-access
+  def flat_input_signature(self):
+    """Returns the flattened input signature."""
+    return self._function_spec.flat_input_signature
 
   def _get_concrete_function_internal_garbage_collected(self, *args, **kwargs):
     """Returns a concrete function which cleans up its graph function."""
-    if self._input_signature:
+    if self.input_signature:
       args, kwargs = None, None
     graph_function, _, _ = self._maybe_define_function(args, kwargs)
     return graph_function
@@ -1114,14 +1313,14 @@ class Function(object):
       *args: inputs to specialize on.
       **kwargs: inputs to specialize on.
     """
-    if self._input_signature:
+    if self.input_signature:
       if kwargs:
         raise ValueError("Cannot define a TensorFlow function from a Python "
                          "function with keyword arguments when "
                          "input_signature is provided.")
       if args:
         # If args are provided, they must match the input signature.
-        if not is_same_structure(self._input_signature, args):
+        if not is_same_structure(self.input_signature, args):
           raise ValueError("Structure of Python function inputs does not match "
                            "input_signature.")
         flat_inputs = nest.flatten(args)
@@ -1131,14 +1330,14 @@ class Function(object):
                            "the Python function must be Tensors or "
                            "tf.TensorSpec objects.")
         if any(not spec.is_compatible_with(other)
-               for spec, other in zip(self._flat_input_signature, flat_inputs)):
+               for spec, other in zip(self.flat_input_signature, flat_inputs)):
           raise ValueError("Python inputs incompatible with input_signature: "
                            "inputs (%s), input_signature (%s)" %
-                           (str(args), str(self._input_signature)))
+                           (str(args), str(self.input_signature)))
       args, kwargs = None, None
     graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-    if self._input_signature:
-      args = self._input_signature
+    if self.input_signature:
+      args = self.input_signature
       kwargs = {}
     seen_names = set()
     captured = frozenset(graph_function.graph.internal_captures)
@@ -1212,14 +1411,16 @@ class Function(object):
     # Return the cached `Function` for the instance
     return self._descriptor_cache[instance]
 
-  def _cache_key(self, args, kwargs):
+  def _cache_key(self, args, kwargs, include_tensor_ranks_only=False):
     """Computes the cache key given inputs and execution context."""
-    if self._input_signature is None:
+    if self.input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(
+          inputs, include_tensor_ranks_only)
     else:
       del args, kwargs
-      input_signature = self._flat_input_signature
+      assert not include_tensor_ranks_only
+      input_signature = self.flat_input_signature
 
     ctx = context.context()
 
@@ -1243,16 +1444,13 @@ class Function(object):
         default_graph._distribution_strategy_stack)
     if executing_eagerly:
       colocation_stack = ()
-      uses_xla = ctx.device_spec.device_type == "TPU"
-      if uses_distribution_strategy or uses_xla:
+      if uses_distribution_strategy:
         device_functions = (pydev.merge_device(ctx.device_name),)
       else:
         device_functions = ()
     else:
       colocation_stack = tuple(default_graph._colocation_stack.peek_objs())
-      uses_xla = getattr(default_graph, "_xla_compile", False)
       if (uses_distribution_strategy
-          or uses_xla
           or func_graph_module.device_stack_has_callable(
               default_graph._device_function_stack)):
         # Putting the device in the cache key ensures that call-site device
@@ -1262,7 +1460,47 @@ class Function(object):
         device_functions = ()
     # pylint: enable=protected-access
     return CacheKey(input_signature, parent_graph, device_functions,
-                    colocation_stack, uses_xla)
+                    colocation_stack)
+
+  def _create_graph_function(self, args, kwargs, override_flat_arg_shapes=None):
+    """Create a `ConcreteFunction` from `args` and `kwargs`."""
+    if self.input_signature is None:
+      arglen = len(args)
+    else:
+      arglen = len(self.input_signature)
+    base_arg_names = self._function_spec.arg_names[:arglen]
+    num_missing_args = arglen - len(self._function_spec.arg_names)
+    missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
+    # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
+    # where arg is based on the self._function_spec.vararg_name.
+    missing_arg_names = [
+        "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
+    ]
+    arg_names = base_arg_names + missing_arg_names
+    graph_function = ConcreteFunction(
+        func_graph_module.func_graph_from_py_func(
+            self._name,
+            self._python_function,
+            args,
+            kwargs,
+            self.input_signature,
+            autograph=self._autograph,
+            autograph_options=self._autograph_options,
+            arg_names=arg_names,
+            override_flat_arg_shapes=override_flat_arg_shapes,
+            capture_by_value=self._capture_by_value),
+        self._function_attributes)
+
+    # pylint: disable=protected-access
+    # Tell the ConcreteFunction to clean up its graph once it goes out of
+    # scope. ConcreteFunction does not do this in its constructor since it
+    # gets used in some places (like Keras) where the FuncGraph lives
+    # longer than the ConcreteFunction.
+    graph_function._garbage_collector = ConcreteFunctionGarbageCollector(
+        graph_function.graph)
+    # pylint: enable=protected-access
+
+    return graph_function
 
   def _maybe_define_function(self, args, kwargs):
     """Gets a function for these inputs, defining it if necessary.
@@ -1281,53 +1519,76 @@ class Function(object):
     Raises:
       ValueError: If inputs are incompatible with the input signature.
       TypeError: If the function inputs include non-hashable objects
+      RuntimeError: If there's an internal bug (inconsistency) in handling
+        shape relaxation retracing.
     """
-    if self._input_signature is None or args is not None or kwargs is not None:
+    if self.input_signature is None or args is not None or kwargs is not None:
       args, kwargs = self._function_spec.canonicalize_function_inputs(
           *args, **kwargs)
     cache_key = self._cache_key(args, kwargs)
+
+    try:
+      hash(cache_key)
+    except TypeError as e:
+      raise TypeError(
+          "Arguments supplied to `defun`-generated functions must be"
+          " hashable.  Original error: %s" % e)
+
     with self._lock:
-      try:
-        graph_function = self._function_cache.get(cache_key, None)
-      except TypeError:
-        raise TypeError("Arguments supplied to `defun`-generated functions "
-                        "must be hashable.")
-
-      if graph_function is None:
-        logging.vlog(1,
-                     "Creating new FuncGraph for Python function %r (key: %r)",
-                     self._python_function, cache_key)
-        if self._input_signature is None:
-          arglen = len(args)
-        else:
-          arglen = len(self._input_signature)
-        base_arg_names = self._function_spec.arg_names[:arglen]
-        num_missing_args = arglen - len(self._function_spec.arg_names)
-        missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
-        # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
-        # where arg is based on the self._function_spec.vararg_name.
-        missing_arg_names = [
-            "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
-        ]
-        arg_names = base_arg_names + missing_arg_names
-        graph_function = ConcreteFunction(
-            func_graph_module.func_graph_from_py_func(
-                self._name,
-                self._python_function,
-                args,
-                kwargs,
-                self._input_signature,
-                autograph=self._autograph,
-                arg_names=arg_names), self._function_attributes)
-        # pylint: disable=protected-access
-        # Tell the ConcreteFunction to clean up its graph once it goes out of
-        # scope. ConcreteFunction does not do this in its constructor since it
-        # gets used in some places (like Keras) where the FuncGraph lives
-        # longer than the ConcreteFunction.
-        graph_function._garbage_collector = _ConcreteFunctionGarbageCollector(
-            graph_function.graph)
-        # pylint: enable=protected-access
-        self._function_cache[cache_key] = graph_function
+      graph_function = self._function_cache.primary.get(cache_key, None)
+      if graph_function is not None:
+        return graph_function, args, kwargs
+
+      logging.vlog(1,
+                   "Creating new FuncGraph for Python function %r (key: %r)",
+                   self._python_function, cache_key)
+      logging.vlog(2,
+                   "Python function signature [args: %s] [kwargs: %s]",
+                   args,
+                   kwargs)
+
+      call_context_key = cache_key.replace(input_signature=None)
+
+      # If there's a provided input signature, or
+      # there's no cache miss for this calling context so far, go ahead and
+      # build the function and bypass shape relaxation retracing.
+      if (self.input_signature is not None
+          or call_context_key not in self._function_cache.missed):
+        self._function_cache.missed.add(call_context_key)
+        graph_function = self._create_graph_function(args, kwargs)
+        self._function_cache.primary[cache_key] = graph_function
+        return graph_function, args, kwargs
+
+      rank_only_cache_key = self._cache_key(
+          args, kwargs, include_tensor_ranks_only=True)
+
+      arg_shapes = _flat_shape_list(args, kwargs)
+      relaxed_arg_shapes = self._function_cache.arg_relaxed_shapes.get(
+          rank_only_cache_key, None)
+      relaxed_arg_function = self._function_cache.arg_relaxed.get(
+          rank_only_cache_key, None)
+
+      if (relaxed_arg_function is not None
+          and _compatible_shapes(flat_relaxed=relaxed_arg_shapes,
+                                 flat_to_check=arg_shapes)):
+        return relaxed_arg_function, args, kwargs
+
+      if relaxed_arg_shapes is None:
+        relaxed_arg_shapes = arg_shapes
+      else:
+        if len(arg_shapes) != len(relaxed_arg_shapes):
+          raise RuntimeError("Expected arg_shapes len to match "
+                             "relaxed_arg_shapes len: %d vs. %d"
+                             % (len(arg_shapes), len(relaxed_arg_shapes)))
+        relaxed_arg_shapes = [
+            _common_shape(x, y) for (x, y) in zip(
+                arg_shapes, relaxed_arg_shapes)]
+      self._function_cache.arg_relaxed_shapes[rank_only_cache_key] = (
+          relaxed_arg_shapes)
+      graph_function = self._create_graph_function(
+          args, kwargs, override_flat_arg_shapes=relaxed_arg_shapes)
+      self._function_cache.arg_relaxed[rank_only_cache_key] = graph_function
+
       return graph_function, args, kwargs
 
 
@@ -1364,7 +1625,10 @@ def validate_signature(signature):
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None, autograph=True):
+def defun(func=None,
+          input_signature=None,
+          autograph=True,
+          experimental_autograph_options=None):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") compiles a Python function
@@ -1678,6 +1942,9 @@ def defun(func=None, input_signature=None, autograph=True):
     autograph: Whether `func` should be compiled before
       constructing the graph. See https://www.tensorflow.org/guide/autograph
       for more information.
+    experimental_autograph_options: Experimental knobs (in the form of a tuple
+      of tensorflow.autograph.Feature values) to control behavior when
+      autograph=True.
 
 
   Returns:
@@ -1693,13 +1960,15 @@ def defun(func=None, input_signature=None, autograph=True):
   return defun_with_attributes(
       func=func,
       input_signature=input_signature,
-      autograph=autograph)
+      autograph=autograph,
+      experimental_autograph_options=experimental_autograph_options)
 
 
 def defun_with_attributes(func=None,
                           input_signature=None,
                           attributes=None,
-                          autograph=True):
+                          autograph=True,
+                          experimental_autograph_options=None):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
@@ -1717,6 +1986,8 @@ def defun_with_attributes(func=None,
       the whitelisted argument which is a python string, and sets the name for
       this `ConcreteFunction` in the graph.
     autograph: same as defun()'s autograph.
+    experimental_autograph_options: same as defun()'s
+      experimental_autograph_options.
 
   Returns:
     Same as the return value of defun, with attributes added to the function in
@@ -1741,7 +2012,8 @@ def defun_with_attributes(func=None,
             name,
             input_signature=input_signature,
             attributes=attributes,
-            autograph=autograph))
+            autograph=autograph,
+            autograph_options=experimental_autograph_options))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
@@ -1758,13 +2030,26 @@ def defun_with_attributes(func=None,
 
 
 # When a method is bound to objects of this type, it allows AutoGraph to
-# recover a weak reference the original method's self pointer. This uses the
-# mechanism from pyct.inspect_utils.getmethodclass.
+# recover a weak reference the original method's self pointer, so that it can
+# execute it consistent with class_method_to_instance_method's
+# bound_method_wrapper.
 # TODO(b/119246461): This is not pretty. Use a descriptor instead?
-class _WeakrefSelf(object):
+class TfMethodTarget(object):
+  """Binding target for methods replaced by function and defun."""
+
+  def __init__(self, target, original_python_function):
+    self.weakrefself_target__ = target
+    self.weakrefself_func__ = weakref.ref(original_python_function)
+
+  @property
+  def target(self):
+    return self.weakrefself_target__()
 
-  def __init__(self, target):
-    self.ag_self_weakref__ = target
+  def call(self, args, kwargs):
+    wrapped_fn = self.weakrefself_func__()
+    if tf_inspect.ismethod(wrapped_fn):
+      wrapped_fn = six.get_unbound_function(wrapped_fn)
+    return wrapped_fn(self.weakrefself_target__(), *args, **kwargs)
 
 
 def class_method_to_instance_method(original_function, instance):
@@ -1773,14 +2058,15 @@ def class_method_to_instance_method(original_function, instance):
 
   # Note: while we could bind to a weakref proxy instead, that causes the
   # bound method to be unhashable.
-  bound_method = types_lib.MethodType(original_function.python_function,
-                                      _WeakrefSelf(weak_instance))
+  bound_method = types_lib.MethodType(
+      original_function.python_function,
+      TfMethodTarget(weak_instance, original_function.python_function))
 
   # original_function is expected to be of one of the two `Function` types
   # (defined either in function.py or def_function.py).
   assert hasattr(original_function, "_name")
   assert hasattr(original_function, "_autograph")
-  assert hasattr(original_function, "_input_signature")
+  assert hasattr(original_function, "_function_spec")
   assert hasattr(original_function, "python_function")
 
   weak_bound_method_wrapper = None
@@ -1792,14 +2078,16 @@ def class_method_to_instance_method(original_function, instance):
 
     if wrapped_fn is strong_bound_method_wrapper.__original_wrapped__:
       # If __wrapped__ was not replaced, then call original_function.
+      # TODO(mdan): For better consistency, use the wrapper's call().
       wrapped_fn = original_function.python_function
       if tf_inspect.ismethod(wrapped_fn):
         wrapped_fn = six.get_unbound_function(wrapped_fn)
       return wrapped_fn(weak_instance(), *args, **kwargs)
 
-    # If __wrapped__ was replaced, then it is always an unbound function
-    # that takes self as first argument.
-    return wrapped_fn(weak_instance(), *args, **kwargs)
+    # If __wrapped__ was replaced, then it is always an unbound function.
+    # However, the replacer is still responsible for attaching self properly.
+    # TODO(mdan): Is it possible to do it here instead?
+    return wrapped_fn(*args, **kwargs)
   weak_bound_method_wrapper = weakref.ref(bound_method_wrapper)
 
   # pylint: disable=protected-access
@@ -1810,7 +2098,7 @@ def class_method_to_instance_method(original_function, instance):
       tf_decorator.make_decorator(bound_method, bound_method_wrapper),
       name=original_function._name,
       autograph=original_function._autograph,
-      input_signature=original_function._input_signature)
+      input_signature=original_function.input_signature)
   # pylint: enable=protected-access
 
   # And we wrap the function with tf_decorator so inspection works correctly
@@ -1836,7 +2124,7 @@ class _FunctionGarbageCollector(object):
       pass
 
 
-class _ConcreteFunctionGarbageCollector(object):
+class ConcreteFunctionGarbageCollector(object):
   """Cleans up reference cycles when a `ConcreteFunction` goes out of scope."""
 
   def __init__(self, func_graph):
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index a2da088d639c7ad447095fe21903777ad44c0991..96184f6656c17af57f27eb16e6d960e1d8c0d6e0 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -49,6 +49,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
@@ -64,6 +66,13 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
+def total_function_cache(defined):
+  # pylint: disable=protected-access
+  return (set(defined._function_cache.primary)
+          | set(defined._function_cache.arg_relaxed))
+  # pylint: enable=protected-access
+
+
 class MiniModel(keras_training.Model):
   """Minimal model for mnist.
 
@@ -97,19 +106,142 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20])
     self.assertAllEqual(sq2.numpy().reshape(-1), [52, 76, 74, 108])
 
-  def testWastedAdd(self):
+  def testVariable(self):
+    v1 = variables.Variable(1.0)
+    add = def_function.function(lambda x, v: x + v1 + v)
+    v2 = variables.Variable(1.0)
+    x = constant_op.constant(1.0)
+    r = add(x, v2)
+    self.assertEqual(3.0, self.evaluate(r))
 
-    @def_function.function()
-    def add(x, y):
-      _ = x * y
-      return x + y
+  def testExternalControlDependency(self):
+    with ops.Graph().as_default(), self.test_session():
+      v = variables.Variable(1.0)
+      v.initializer.run()
+
+      op = v.assign_add(1.0)
+
+      @function.defun
+      def f():
+        with ops.control_dependencies([op]):
+          return 1.0
+
+      self.evaluate(f())
+      self.assertAllEqual(self.evaluate(v), 2.0)
+
+  def testInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
 
-    # The default config allows all rewrites.
-    config_proto = config_pb2.ConfigProto()
+    @function.defun
+    def func(a):
+      if a._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return a + 1
+
+    func(constant_op.constant([]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(constant_op.constant([1.0]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(constant_op.constant([1.0, 2.0]))
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+  def testNestedInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
+
+    @function.defun
+    def func(a_, b_=None):
+      del a_  # Only used to check which cache is used.
+      self.assertEqual(b_[0]._shape_tuple(), ())
+      if b_[1]._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return b_[0] + 1
+
+    a = 'hi'
+    b0 = constant_op.constant(1.0)
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(a, b_=[b0, constant_op.constant([1.0, 1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    unknown_dim[0] = False
+
+    # Now do the same except with a new a which is not a tensor; this should
+    # change the cache key.
+    a = 'bye'
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+    # Since we already marked a cache miss for a function with the same
+    # non-input signatures, here we will immediately start relaxing shapes.
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+  def testFunctionRelaxationLosesInnerDimWithKerasLayer(self):
+    layer = keras.layers.Dense(1)
+    fn = def_function.function()(layer)
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      fn(array_ops.ones((3, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+    with self.captureWritesToStream(sys.stderr) as printed:
+      # Use batch size 2 to trigger a second cache miss on the shape.
+      fn(array_ops.ones((2, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+
+    # Shape relaxation passes TensorShape([None, None]), which causes layer
+    # matmul to fail, due to incompatible dims.  What would have been a graph
+    # build time error (layer would complain about the inner dim being 4).
+    with self.captureWritesToStream(sys.stderr) as printed:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, r'MatMul'):
+        fn(array_ops.ones((3, 4)))
+
+  def testNestedShapeFunctionRelaxation(self):
+
+    got_shape = [None]
+
+    # The inner function will go through shape relaxation because the shapes it
+    # receives will be [1], [2], [3], ...
+    @def_function.function
+    def bar(x_shape):
+      got_shape[0] = x_shape._shape_tuple()
+      return x_shape
+
+    # The outer function will not go through shape relaxation because the shapes
+    # it receives will be [1], [[1]], [[[1]]], ...
+    @def_function.function
+    def foo(ones):
+      return bar(array_ops.shape(ones))
+
+    for rank in range(1, 6):
+      x_shape = self.evaluate(foo(array_ops.ones([1] * rank)))
+      self.assertAllEqual(x_shape, [1] * rank)
+      if rank < 3:
+        self.assertEqual(got_shape[0], (rank,))
+      else:
+        self.assertEqual(got_shape[0], (None,))
 
-    with context.function_config_proto(config_proto):
-      t = constant_op.constant(1.0)
-      self.assertAllEqual(add(t, t).numpy(), 2.0)
+  def testNoHash(self):
+
+    @def_function.function()
+    def f(_):
+      return 1.0
+
+    with self.assertRaisesRegexp(TypeError, 'set'):
+      f(set([]))
 
   def testFuncName(self):
 
@@ -371,13 +503,13 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     x = random_ops.random_uniform([2, 2]).numpy()
     defined = function.defun(f)
     defined(x)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     x = random_ops.random_uniform([2, 2]).numpy()
     defined(x)
     # A NumPy array with different values but the same shape and dtype
     # shouldn't trigger another function definition.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # Test that the numpy array is properly an argument to the graph function.
     self.assertEqual(1., defined(numpy.ones([])).numpy())
@@ -456,6 +588,50 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     value = tensor_init()
     self.assertAllEqual(value, 2.0)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGetConcreteFunctionCreatesVariables(self):
+
+    v_holder = []
+
+    @def_function.function
+    def tensor_init():
+      if not v_holder:
+        v_holder.append(variables.Variable(5.))
+      return v_holder[0].read_value()
+
+    concrete = tensor_init.get_concrete_function()
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(5., self.evaluate(concrete()))
+    self.assertAllEqual(5., self.evaluate(tensor_init()))
+
+  def testFuncGraphCaptureByValue(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return v.read_value()
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
+  def testFuncGraphCaptureByValueNested(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return control_flow_ops.cond(
+          array_ops.placeholder_with_default(True, ()),
+          v.read_value, v.read_value)
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
 
@@ -474,6 +650,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype)
     self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2]))
 
+  def testShapeInferenceForMoreSpecificInput(self):
+    self.skipTest('b/124219898')
+
+    def f(a):
+      return array_ops.reshape(a, [-1, 3])
+
+    signature = [tensor_spec.TensorSpec(None, dtypes.float32)]
+    compiled = def_function.function(f, input_signature=signature)
+
+    with ops.Graph().as_default():
+      inputs = array_ops.zeros([10, 10, 3])
+      self.assertAllEqual(f(inputs).shape, compiled(inputs).shape)
+
+  def testFuncListAttr(self):
+
+    @function.defun
+    def test_function(val):
+
+      def fn1():
+        return array_ops.ones([10])
+
+      fn2 = lambda: array_ops.ones([10]) * 2
+
+      def fn3(x=2):
+        return array_ops.ones([10]) * x
+      fn3 = functools.partial(fn3, x=3)
+
+      return gen_functional_ops.case(val, [], [dtypes.float32],
+                                     [function.defun(f).get_concrete_function()
+                                      for f in (fn1, fn2, fn3)])
+
+    ones = array_ops.ones([10])
+    self.assertAllEqual([ones], test_function(0))
+    self.assertAllEqual([ones * 2], test_function(1))
+    self.assertAllEqual([ones * 3], test_function(2))
+    self.assertAllEqual([ones * 3], test_function(22))  # default branch
+
   @test_util.enable_control_flow_v2
   def testVariableInLoopInFunction(self):
 
@@ -569,7 +782,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     run_metadata = context.export_run_metadata()
     context.disable_run_metadata()
     step_stats = run_metadata.step_stats
-    self.assertGreater(len(step_stats.dev_stats), 0)
+    self.assertNotEmpty(step_stats.dev_stats)
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
@@ -578,10 +791,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     # arbitrarily many (placeholders, return identities, etc, might be included
     # or not in the future, so shouldn't be tested for exactly.
     self.assertGreaterEqual(len(cpu_stats.node_stats), 2)
-    self.assertEqual(len(run_metadata.partition_graphs), 1)
+    self.assertLen(run_metadata.partition_graphs, 1)
 
   def testGraphModeCaptureVariable(self):
-    with context.graph_mode(), self.cached_session() as sess:
+    with context.graph_mode(), self.cached_session():
 
       class HasAVar(object):
 
@@ -797,8 +1010,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       return None
 
     with self.assertRaisesRegexp(
-        errors.InvalidArgumentError, 'Could not colocate node with its '
-        'resource and reference inputs.*'):
+        errors.InvalidArgumentError,
+        'Cannot place the graph because a reference or resource edge connects '
+        'colocation groups with incompatible assigned devices'):
       if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
       self.evaluate(resource_apply_adam())
@@ -905,7 +1119,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                   constant_op.constant(4)],
         constant_op.constant(5)
     ])
-    self.assertEqual(len(ret), 2)
+    self.assertLen(ret, 2)
     self.assertAllEqual(ret[0][0], 2)
     self.assertAllEqual(ret[0][1][0][0], 8)
     self.assertAllEqual(ret[0][1][0][1], 4)
@@ -954,6 +1168,32 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllClose([[[[4.0]]]], self.evaluate(y))
 
+  # Variable lifting is somewhat different between defun/tf.function, so testing
+  # device placement on both makes sense.
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun',
+           function_decorator=function.defun),
+      dict(testcase_name='DefFunction',
+           function_decorator=def_function.function))
+  @test_util.run_in_graph_and_eager_modes
+  def testVariablesPlacedOnOutsideDevice(self, function_decorator):
+
+    class _Obj(object):
+
+      def __init__(self):
+        self.v = None
+
+      @function_decorator
+      def f(self):
+        if self.v is None:
+          self.v = variables.Variable(1.)
+        return self.v + 1.
+
+    has_device = _Obj()
+    with ops.device('cpu:0'):
+      has_device.f()
+    self.assertIn('CPU', has_device.v.device)
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDefunKerasModelCall(self):
     model = MiniModel()
@@ -991,7 +1231,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(multi_device_fn)
     outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -999,7 +1239,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with ops.device('/cpu:3'):
       outputs = self.evaluate(defined())
     # All function definitions are agnostic to call site devices.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1007,7 +1247,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     with ops.device('/cpu:0'):
       outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1021,7 +1261,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def func():
       return constant_op.constant(0)
 
-    defined = function.defun(func)
+    defined = def_function.function(func)
     with ops.device('cpu:0'):
       cpu_graph_function = defined.get_concrete_function()
 
@@ -1092,10 +1332,38 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(func)
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorDtypeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex128)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorShapeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([1.0], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorShapeDtypeCollision(self):
 
@@ -1105,11 +1373,11 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(func)
     t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     t = constant_op.constant([1.0], dtype=dtypes.complex128)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorUnknownShapesCollision(self):
 
@@ -1119,21 +1387,34 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with context.graph_mode(), self.cached_session():
       defined = function.defun(func)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 1)
+      self.assertLen(total_function_cache(defined), 1)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[1])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 2)
+      self.assertLen(total_function_cache(defined), 2)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[2])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 3)
-
-      t = constant_op.constant(1.0, dtype=dtypes.float32)
+      # Gradual shape relaxation is performed; and the common shape between
+      # [1] and [2] is one containing unknown dimensions.
+      self.assertLen(total_function_cache(defined), 2)
+
+      # pylint: disable=protected-access
+      self.assertLen(defined._function_cache.arg_relaxed_shapes, 1)
+      relaxed_shapes = (
+          list(defined._function_cache.arg_relaxed_shapes.values())[0])
+      self.assertEqual(len(relaxed_shapes), 1)
+      relaxed_shape = relaxed_shapes[0]
+      # pylint: enable=protected-access
+      self.assertEqual(relaxed_shape.rank, 1)
+      self.assertEqual(tensor_shape.dimension_value(relaxed_shape[0]), None)
+
+      t = constant_op.constant([1.0, 1.0, 1.0], dtype=dtypes.float32)
       defined(t)
-      self.assertEqual(len(defined._function_cache), 4)
+      # Shape (3,) matches the relaxed shape TensorShape([None])
+      self.assertLen(total_function_cache(defined), 2)
 
   def testPythonFunctionWithDefaultArgs(self):
 
@@ -1148,35 +1429,36 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     def cache_keys():
       """Sanitizes cache keys of non-input metadata."""
-      return tuple(key[0] for key in defined._function_cache)
+      return tuple(key[0] for key in total_function_cache(defined))
 
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn(('URRR', (0, 1, 20)), cache_keys())
+    self.assertIn(('URRRu', (0, 1, 20)), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn(('URRR', (1, 1, 2)), cache_keys())
+    self.assertIn(('URRRu', (1, 1, 2)), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     defined(1, 2, 3)
-    self.assertIn(('URRR', (1, 2, 3)), cache_keys())
+    self.assertLen(total_function_cache(defined), 3)
+    self.assertIn(('URRRu', (1, 2, 3)), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
     # This matches the previous call.
     defined(1, baz=3, bar=2)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
   def testFunctoolsPartialUnwrappedCorrectly(self):
 
     def full_function(a, b, c=3):
       return a, b, c
 
-    partial = functools.partial(full_function, 1, c=3)
+    partial = functools.partial(full_function, 1, c=4)
     a, b, c = partial(2)
 
     defined = function.defun(partial)
@@ -1185,7 +1467,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(func_b.numpy(), b)
     self.assertEqual(func_c.numpy(), c)
 
-  def testInputSignatureWithCompatibleInputs(self):
+  def testInputSignatureWithMatchingInputs(self):
 
     def foo(a):
       self.assertEqual(a.shape, (2,))
@@ -1195,12 +1477,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2])
     self.assertAllEqual(a, defined(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(a, defined.get_concrete_function()(a))
     self.assertAllEqual(a, defined.get_concrete_function(a)(a))
     self.assertAllEqual(a, defined.get_concrete_function(
         tensor_spec.TensorSpec((2,), dtype=dtypes.float32))(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     def bar(a):
       self.assertEqual(a._shape_tuple(), (2, None))
@@ -1210,31 +1492,55 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(bar, input_signature=signature)
     a = array_ops.ones([2, 1])
     out = defined(a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, a)
 
     # Changing the second dimension shouldn't create a new function.
     b = array_ops.ones([2, 3])
     out = defined(b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, b)
 
+  def testInputSignatureWithCompatibleInputs(self):
+
+    rank2_spec = tensor_spec.TensorSpec(shape=(None, None),
+                                        dtype=dtypes.float32)
+
+    @function.defun(input_signature=[rank2_spec])
+    def func(a):
+      self.assertEqual([None, None], a.shape.as_list())
+      return array_ops.shape(a)
+
+    self.assertAllEqual([3, 1], func([[0], [1.0], [1]]))
+    self.assertAllEqual([2, 2], func(numpy.array([[1, 1], [2, 2]])))
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([0.0, 1.0, 2.0])  # Wrong shape.
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([['wrong dtype']])
+
   def testNestedInputSignatures(self):
 
+    def expected_foo(a, b):
+      return [a, b]
+
+    @function.defun(input_signature=[
+        [tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
+        tensor_spec.TensorSpec((1,), dtypes.float32),
+    ])
     def foo(a, b):
       self.assertEqual(a[0]._shape_tuple(), (2, None))
       self.assertEqual(a[1]._shape_tuple(), (2, None))
       self.assertEqual(b._shape_tuple(), (1,))
       return [a, b]
 
-    signature = [[tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
-                 tensor_spec.TensorSpec((1,), dtypes.float32)]
-    defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2, 1])
     b = array_ops.ones([1])
-    out = defined([a, a], b)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, a], b])
+    expected = expected_foo([a, a], b)
+    out = foo([a, a], b)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], a)
     self.assertAllEqual(out[1], b)
@@ -1243,33 +1549,58 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = array_ops.ones([2, 3])
     b = array_ops.ones([2, 5])
     c = array_ops.ones([1])
-    out = defined([a, b], c)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, b], c])
+    expected = expected_foo([a, b], c)
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out[0][0], a)
+    self.assertAllEqual(out[0][1], b)
+    self.assertAllEqual(out[1], c)
+
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    c = c.numpy().tolist()
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], b)
     self.assertAllEqual(out[1], c)
 
+  def testNestedInputSignaturesWithDict(self):
+    def expected_bar(a):
+      return a
+
+    @function.defun(input_signature=[{
+        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'c': tensor_spec.TensorSpec((1,), dtypes.float32)}])
     def bar(a):
       self.assertEqual(a['a']._shape_tuple(), (2, None))
       self.assertEqual(a['b']._shape_tuple(), (2, None))
       self.assertEqual(a['c']._shape_tuple(), (1,))
       return a
 
-    signature = [{
-        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'c': tensor_spec.TensorSpec((1,), dtypes.float32)
-    }]
     a = array_ops.ones([2, 3])
     b = array_ops.ones([1])
     inputs = {'a': a, 'b': a, 'c': b}
-    defined = def_function.function(bar, input_signature=signature)
-    out = defined(inputs)
-    nest.assert_same_structure(out, inputs)
-    self.assertAllEqual(out['a'], inputs['a'])
-    self.assertAllEqual(out['b'], inputs['b'])
-    self.assertAllEqual(out['c'], inputs['c'])
+    expected = expected_bar(inputs)
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
+
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    inputs = {'a': a, 'b': a, 'c': b}
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
 
   def testInputSignatureMustBeSequenceOfTensorSpecs(self):
 
@@ -1288,6 +1619,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                  'tuple or a list.*'):
       function.defun(foo, input_signature=signature)
 
+  @test_util.run_in_graph_and_eager_modes
   def testInputsIncompatibleWithSignatureRaisesError(self):
 
     def foo(a):
@@ -1304,9 +1636,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined(array_ops.ones([2, 1]))
 
     # Wrong number of arguments.
-    with self.assertRaisesRegexp(
-        ValueError,
-        'Arguments and signature arguments do not match.*'):
+    with self.assertRaisesRegexp(TypeError, 'Received 2 argument\(s\)'):
       defined(array_ops.ones([2]), array_ops.ones([2]))
     with self.assertRaisesRegexp(ValueError,
                                  'Structure of Python function inputs.*'):
@@ -1317,7 +1647,60 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined.get_concrete_function(
           tensor_spec.TensorSpec(shape=(3,), dtype=dtypes.float32))
 
-  def testInputSignatureForFunctionWithNonTensorInputsNotAllowed(self):
+  def testInputsIncompatibleWithNestedSignatureRaisesError(self):
+
+    def foo(a, b):
+      return [a, b]
+
+    signature = [[tensor_spec.TensorSpec((1,), dtypes.float32)] * 2,
+                 [tensor_spec.TensorSpec((1,), dtypes.float32)] * 2]
+    defined = function.defun(foo, input_signature=signature)
+    a = array_ops.ones([1])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a, a, a], [a])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a], [a, a, a])
+    defined([a, a], [a, a])
+
+  def testUnderspecifiedInputSignature(self):
+    @function.defun(input_signature=[
+        tensor_spec.TensorSpec([], dtypes.float32),
+    ])
+    def foo(a, training=True):
+      if training:
+        return a
+      else:
+        return -1.0 * a
+
+    x = constant_op.constant(1.0)
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=True)
+
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=False)
+
+    self.assertAllEqual(x.numpy(), foo(x).numpy())
+
+  def testInputSignatureWithPartialFunction(self):
+    self.skipTest('b/124441704')
+    def full_function(a, b, c=3.0):
+      return a, b, c
+
+    partial = functools.partial(full_function, 1, c=4)
+    a, b, c = partial(2.0)
+    signature = [tensor_spec.TensorSpec([], dtypes.float32)]
+    defined = function.defun(partial, input_signature=signature)
+    x = constant_op.constant(2.0)
+    func_a, func_b, func_c = defined(x)
+    self.assertEqual(func_a.numpy(), a)
+    self.assertEqual(func_b.numpy(), b)
+    self.assertEqual(func_c.numpy(), c)
+
+  def testInputSignatureConversionWithDefaultArg(self):
 
     def foo(a, training=True):
       if training:
@@ -1331,11 +1714,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     ]
     defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
-    with self.assertRaisesRegexp(
-        ValueError,
-        'When input_signature is provided, all inputs to '
-        'the Python function must be Tensors.'):
-      defined(a, training=True)
+    self.assertAllEqual(a.numpy(), defined(a))
+    self.assertAllEqual(a.numpy(), defined(a, training=True))
+    self.assertAllEqual(-a.numpy(), defined(a, training=False))
 
   def testInputSignatureWithKeywordPositionalArgs(self):
 
@@ -1350,22 +1731,22 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     integer = constant_op.constant(2, dtypes.int64)
 
     out1, out2 = foo(flt, integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt=flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(integer=integer, flt=flt)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
@@ -1395,27 +1776,27 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = constant_op.constant(2.0)
     b = constant_op.constant([1.0, 2.0])
     one = defined(a, b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     two = defined(a=a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     three = defined(b=b, a=a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     four = defined(a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # The next call corresponds to a new input signature, hence
     # we expect another function to be defined.
     five = defined(b, a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     six = defined(a=b, b=a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     seven = defined(b=a, a=b)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     self.assertAllEqual(one, [1.0, 2.0])
     self.assertAllEqual(two, [1.0, 2.0])
@@ -1476,6 +1857,25 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     side_effecting_function.python_function()
     self.assertAllEqual(state, [0, 0])
 
+  def testFunctionWithNestedFunctionCallAndSideEffects(self):
+    v1 = variables.Variable(1.0)
+    v2 = variables.Variable(1.0)
+
+    @def_function.function
+    def add_one(a):
+      a.assign_add(1.0)
+
+    # Grappler will inline calls to `add_one` into the function body, we check
+    # that all side-effects were executed.
+    @def_function.function
+    def side_effecting_function(a, b):
+      add_one(a)
+      add_one(b)
+      return a + b
+
+    result = side_effecting_function(v1, v2)
+    self.assertEqual(result.numpy(), 4.0)
+
   def testFunctionWithExtraAttributes(self):
     @function.defun_with_attributes(attributes={'experimental_1': 'value1',
                                                 'experimental_2': 2})
@@ -1497,35 +1897,24 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 2)
+        self.assertLen(graph._functions, 2)
         functions = list(graph._functions.values())
         self.assertRegexpMatches(
             functions[0].definition.signature.name, '.*matmul.*')
         attrs = functions[0].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_1'].s, b'value1')
         self.assertEqual(attrs['experimental_2'].i, 2)
 
         self.assertRegexpMatches(
             functions[1].definition.signature.name, '.*add.*')
         attrs = functions[1].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_3'].b, True)
         self.assertEqual(attrs['experimental_4'].f, 1.0)
         # pylint: enable=protected-access
 
   def testFunctionWithInvalidAttribute(self):
-    @function.defun_with_attributes(attributes={'attr1': 'value1'})
-    def matmul(x, y):
-      return math_ops.matmul(x, y)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 '.*Attribute name is not whitelisted.*'):
-      with context.graph_mode(), self.cached_session():
-        with ops.get_default_graph().as_default():
-          t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-          matmul(t, t)
-
     @function.defun_with_attributes(attributes={'experimental_1': ['value1']})
     def add(x, y):
       return math_ops.add(x, y)
@@ -1555,7 +1944,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1594,7 +1983,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual(double.eval().reshape(-1), [2, 4, 6, 8])
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         functions = list(graph._functions.values())
         for i in range(len(functions)):
           self.assertEqual(captured_function_names[i],
@@ -1631,7 +2020,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1657,7 +2046,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual([[1, 2], [3, 4]], self.evaluate(composite_t))
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
 
   def testRegisterFunctionWithInputSignature(self):
     def matmul(x, y):
@@ -1675,12 +2064,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
         # Test register function with cache, note inputs are ignored.
         function.register(defun_matmul)
         graph = ops.get_default_graph()
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
@@ -1697,7 +2086,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         graph = ops.get_default_graph()
         # Only one function is registered since the input param are in same type
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testCallingFunctionWithDifferentVariables(self):
 
@@ -1708,8 +2097,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     v = resource_variable_ops.ResourceVariable(0.0)
     graph_function = foo.get_concrete_function(v)
-    self.assertEqual(len(graph_function.inputs), 1)
-    self.assertEqual(len(graph_function.captured_inputs), 0)
+    self.assertLen(graph_function.inputs, 1)
+    self.assertEmpty(graph_function.captured_inputs)
 
     self.assertEqual(float(graph_function(v)), 1.0)
     self.assertEqual(float(graph_function(v)), 2.0)
@@ -1737,34 +2126,30 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       graph_function('Not a Tensor.')
 
   def testSwapImplementationWithGrapplerPlugin(self):
+    # Set the min_graph_nodes to -1 since the graph in this test is too small,
+    # and will be ignored by grappler if don't set this.
     rewrites = rewriter_config_pb2.RewriterConfig()
-    # function_optimizer has to be turn off, otherwise it will delete the
-    # registered function if it does not get called.
-    # TODO(scottzhu): Move the ExperimentalImplementationSelector to be called
-    # before function_optimizer in future.
-    rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-    customer_optimizer = rewrites.custom_optimizers.add()
-    customer_optimizer.name = 'ExperimentalImplementationSelector'
+    rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
     rewrites.min_graph_nodes = -1
     graph_options = config_pb2.GraphOptions(
         rewrite_options=rewrites, build_cost_model=1)
     config = config_pb2.ConfigProto(graph_options=graph_options)
 
     with context.graph_mode(), self.cached_session(
-        config=config, graph=ops.Graph(), use_gpu=True) as sess:
+        config=config, graph=ops.Graph(), use_gpu=True):
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'CPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'CPU'
           })
       def cpu_boost(x):
         return math_ops.add(x, 2.0)
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'GPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'GPU'
           })
       def gpu_boost(x):
         return math_ops.add(x, 4.0)
@@ -1798,18 +2183,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 1)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 1)
+        self.assertLen(total_function_cache(add), 1)
 
         maybe_add(x, False)
-        self.assertEqual(len(maybe_add._function_cache), 2)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 2)
+        self.assertLen(total_function_cache(add), 1)
 
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 3)
-        self.assertEqual(len(add._function_cache), 2)
+        self.assertLen(total_function_cache(maybe_add), 3)
+        self.assertLen(total_function_cache(add), 2)
+
+  def testCacheKeyOverlappingShapes(self):
+    @function.defun
+    def defined(t):
+      return t
+
+    defined(array_ops.zeros([12, 1]))
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined(array_ops.zeros([1, 21]))
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheKeyNestedLists(self):
+    @function.defun
+    def defined(l):
+      return l
+
+    a = constant_op.constant(1.)
+    b = constant_op.constant(2.)
+    c = constant_op.constant(3.)
+    defined([[a], b, c])
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined([[a, b], c])
+    self.assertLen(total_function_cache(defined), 2)
 
   def testDecoratedMethod(self):
     m = DefunnedMiniModel()
@@ -2031,7 +2441,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     m = DefunnedMiniModel()
     m(array_ops.ones([1, 2]))
     weak_variables = weakref.WeakSet(m.variables)
-    self.assertEqual(2, len(weak_variables))
+    self.assertLen(weak_variables, 2)
     del m
     self.assertEqual([], list(weak_variables))
 
@@ -2086,13 +2496,65 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def fn(x):
       return fn2(x)
 
-    try:
+    with self.assertRaises(errors.InvalidArgumentError) as cm:
       fn(2)
-      self.assertFail()
-    except errors.InvalidArgumentError as e:
-      self.assertIn('fn -> fn2', e.message)
-      self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
-      self.assertNotIn('fn3', e.message)
+    e = cm.exception
+    self.assertIn('fn -> fn2', e.message)
+    self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
+    self.assertNotIn('fn3', e.message)
+
+  def testFunctionIsNotPinned(self):
+    """Tests that functions aren't pinned to the CPU by the eager runtime."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+    seed1, seed2 = 79, 25
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float32
+
+    @def_function.function
+    def func():
+      with ops.device('GPU:0'):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    with ops.device('GPU:0'):
+      x = func()
+      self.assertRegexpMatches(x.device, 'GPU')
+
+  @test_util.run_in_graph_and_eager_modes
+  def testShapeCaching(self):
+
+    @function.defun
+    def func(x):
+      return array_ops.shape(x)
+
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([None, None], dtypes.float32)])
+    def calls_func(x):
+      return func(x)
+
+    self.assertAllEqual([1, 1], self.evaluate(func(array_ops.zeros([1, 1]))))
+    self.assertAllEqual([2, 2], self.evaluate(func(array_ops.zeros([2, 2]))))
+    self.assertAllEqual(
+        [3, 3],
+        self.evaluate(calls_func(array_ops.zeros([3, 3]))))
+
+  def testLimitedRetracing(self):
+    trace_count = [0]
+    @function.defun
+    def func(x):
+      trace_count[0] += 1
+      return x
+
+    for _ in range(50):
+      func(constant_op.constant(3.))
+      func(constant_op.constant(4.))
+      func(constant_op.constant([[1., 2.]]))
+      func(constant_op.constant([[]]))
+      func(constant_op.constant([[3., 4.], [5., 6.]]))
+      func(constant_op.constant([[3., 4.], [5., 6.], [7., 8.]]))
+    # Tracing more than twice per input doesn't make sense.
+    self.assertLess(trace_count[0], 13)
 
 
 class MultiDeviceTest(test.TestCase, parameterized.TestCase):
@@ -2354,6 +2816,7 @@ class MultiDeviceTest(test.TestCase, parameterized.TestCase):
     result = func(g1, g2, c1, g3, c2)
     self.assertEqual(result.numpy(), 5.0 * 7.0 * 17.0)
 
+
 if __name__ == '__main__':
   ops.enable_eager_execution(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py
index 2e9d24f61ea110bb01c9a80174e6eb3618b765de..949ff5ef49f6a6fdcfb30d099914f3c346759987 100644
--- a/tensorflow/python/eager/lift_to_graph.py
+++ b/tensorflow/python/eager/lift_to_graph.py
@@ -20,9 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import six
 
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 def _graph_inputs(op):
@@ -35,35 +38,231 @@ def _as_operation(op_or_tensor):
   return op_or_tensor
 
 
-def lift_to_graph(init_tensor, graph, sources=None):
-  """Copies the tensor and all its inputs recursively to the outer graph."""
-  # Check that the initializer does not depend on any placeholders.
-  if sources is None:
-    sources = set([])
+class UnliftableError(Exception):
+  """Raised if a Tensor cannot be lifted from the graph."""
+  pass
+
+
+def _constant_inputs(op_or_tensor):
+  return all(_as_operation(i).type == u"Const"
+             and not _as_operation(i).control_inputs
+             for i in _graph_inputs(_as_operation(op_or_tensor)))
+
+
+def _path_from(from_op, tensor, sources):
+  """Find one path from `from_op` to `tensor`, ignoring `sources`.
+
+  Args:
+    from_op: A `tf.Operation`.
+    tensor: A `tf.Operation` or `tf.Tensor`.
+    sources: A list of `tf.Tensor`.
+
+  Returns:
+    A python string containing the path, or "??" if none is found.
+  """
   visited_ops = set([x.op for x in sources])
+  ops_to_visit = [_as_operation(tensor)]
+  some_op_output = {}
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in visited_ops:
+      continue
+    visited_ops.add(op)
+    if op == from_op:
+      path_op = op
+      path = [path_op]
+      final_op = _as_operation(tensor)
+      while path_op != final_op:
+        path_op = some_op_output[path_op]
+        path.append(path_op)
+      return " <- ".join(["%s (%s)" % (x.name, x.type) for x in reversed(path)])
+    else:
+      for inp in _graph_inputs(op):
+        if inp not in visited_ops and inp not in sources:
+          some_op_output[inp] = op
+          ops_to_visit.append(inp)
+  return "??"
+
+
+def _map_subgraph(init_tensor, sources, disallowed_placeholders, visited_ops,
+                  op_outputs, add_sources):
+  """Walk a Graph and capture the subgraph between init_tensor and sources.
+
+  Note: This function mutates visited_ops and op_outputs.
+
+  Arguments:
+    init_tensor:  A Tensor or Operation where the subgraph terminates.
+    sources:  A set of Tensors where subgraph extraction should stop.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    visited_ops: A set of operations which were visited in a prior pass.
+    op_outputs: A defaultdict containing the outputs of an op which are to be
+      copied into the new subgraph.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+
+  Returns:
+    The set of placeholders upon which init_tensor depends and are not in
+    sources.
+
+  Raises:
+    UnliftableError: if init_tensor depends on a placeholder which is not in
+      sources and add_sources is False.
+  """
   ops_to_visit = [_as_operation(init_tensor)]
-  op_outputs = collections.defaultdict(set)
+  extra_sources = set()
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in visited_ops:
       continue
     visited_ops.add(op)
-    # TODO(apassos) distinguish arg placeholders, capture placeholders,
-    # and placeholders the user might directly use to initialize
-    # variables.
-    if op.type == "Placeholder":
-      raise ValueError(
-          "Unable to lift tensor", init_tensor,
-          "because it depends transitively on placeholder ", op)
+
+    should_raise = False
+    if disallowed_placeholders is not None and op in disallowed_placeholders:
+      should_raise = True
+    elif op.type == "Placeholder":
+      if disallowed_placeholders is None and not add_sources:
+        should_raise = True
+      extra_sources.update(op.outputs)
+
+    if should_raise:
+      raise UnliftableError(
+          "Unable to lift tensor %s because it depends transitively on "
+          "placeholder %s via at least one path, e.g.: %s"
+          % (repr(init_tensor), repr(op), _path_from(op, init_tensor, sources)))
     for inp in _graph_inputs(op):
       op_outputs[inp].add(op)
-      if inp not in visited_ops and inp not in sources:
+      if inp not in visited_ops and inp not in (sources or extra_sources):
         ops_to_visit.append(inp)
+
+  return extra_sources
+
+
+def _copy_non_source(op, graph, op_map):
+  """Copy an op directly to a given graph.
+
+  This function assumes that all of the inputs to an op have already been
+  copied.
+
+  Args:
+    op: The op to be copied.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+  """
+  copied_inputs = [op_map[x] for x in op.inputs]
+  copied_control_inputs = [op_map[x] for x in op.control_inputs]
+  with ops.control_dependencies(copied_control_inputs), ops.device(op.device):
+    copied_op = graph.create_op(
+        op_type=op.type,
+        inputs=copied_inputs,
+        dtypes=[x.dtype for x in op.outputs],
+        attrs=op.node_def.attr,
+        name=op.name)
+  op_map[op] = copied_op
+  for i, o in enumerate(op.outputs):
+    op_map[o] = copied_op.outputs[i]
+
+
+def _copy_source(s, graph, op_map, handle_captures, inverse_captures):
+  """Create a source in a graph based on a Tensor from a different graph.
+
+  This function creates a placeholder analog of `s` in a graph with the
+  following behavior:
+
+  1) If s is a captured Tensor or Variable and handle_captures is set to True,
+     simply capture it in the new graph as well.
+
+  2) If s is a PlaceholderWithDefault whose default is a constant, preserve
+     said default in the new graph.
+
+  3) When applicable, copy resource variable metadata from `s` to the newly
+     created placeholder.
+
+  Args:
+    s: The source of interest.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    inverse_captures: A dict mapping s back to the Tensor or Variable that it
+      captures.
+  """
+  if handle_captures and s in inverse_captures:
+    copied_placeholder = graph.capture(inverse_captures[s], name=s.op.name)
+  elif s.op.type == "PlaceholderWithDefault" and _constant_inputs(s):
+    # Copy the default value to the graph.
+    default_value = s.op.inputs[0]
+    _copy_non_source(op=default_value.op, graph=graph, op_map=op_map)
+
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder_with_default(
+          input=op_map[default_value], shape=s.shape, name=s.op.name)
+  else:
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder(
+          dtype=s.dtype, shape=s.shape, name=s.op.name)
+
+  base_handle = resource_variable_ops.get_resource_handle_data(s)
+  if base_handle.shape_and_type:
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        copied_placeholder,
+        base_handle,
+        graph_mode=True)
+
+  op_map[s] = copied_placeholder
+
+
+def lift_to_graph(init_tensors, graph, sources=None,
+                  disallowed_placeholders=None, add_sources=False,
+                  handle_captures=False, base_graph=None):
+  """Copies the tensor and all its inputs recursively to the outer graph.
+
+  Args:
+    init_tensors: The Tensor to lift.
+    graph: The graph to lift to.
+    sources: Optional sequence of nodes to start from. If omitted the whole
+      subgraph which feeds into `init_tensor` is lifted.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    base_graph: The graph from which to lift ops. This will be inferred if not
+      specified.
+
+  Returns:
+    A mapping from ops in the current default graph to ops in `graph`.
+
+  Raises:
+    UnliftableError: If a placeholder blocks lifting.
+  """
+  variable_init_tensors = {i for i in init_tensors if isinstance(
+      i, resource_variable_ops.ResourceVariable)}
+  init_tensors = set(init_tensors).difference(variable_init_tensors)
+  base_graph = base_graph or list(init_tensors)[0].graph
+
+  # Check that the initializer does not depend on any placeholders.
+  sources = set(sources or [])
+  visited_ops = set([x.op for x in sources])
+  op_outputs = collections.defaultdict(set)
+
+  # First we extract the subgraph between init_tensors and sources.
+  for init_tensor in init_tensors:
+    sources.update(_map_subgraph(
+        init_tensor=init_tensor,
+        sources=sources,
+        disallowed_placeholders=disallowed_placeholders,
+        visited_ops=visited_ops,
+        op_outputs=op_outputs,
+        add_sources=add_sources))
+
   # Topologically sort the nodes we've extracted. Now we know how many of their
   # outputs are part of this subgraph.
   ops_to_copy = []
   marked_ops = set([])
-  ops_to_visit = [_as_operation(init_tensor)]
+  ops_to_visit = [_as_operation(t) for t in init_tensors
+                  if not op_outputs[_as_operation(t)]]
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in marked_ops:
@@ -71,27 +270,48 @@ def lift_to_graph(init_tensor, graph, sources=None):
     marked_ops.add(op)
     ops_to_copy.append(op)
     for inp in _graph_inputs(op):
-      if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources:
+      if (all(x in marked_ops for x in op_outputs[inp]) and
+          inp not in sources):
         ops_to_visit.append(inp)
+
+  # When lifting from one FuncGraph to another, we will need to capture the
+  # relevant tensors as well.
+  captures = collections.OrderedDict()
+  if (isinstance(base_graph, func_graph.FuncGraph) and
+      isinstance(graph, func_graph.FuncGraph)):
+    captures = base_graph.captures
+  inverse_captures = {v: k for k, v in captures.items()}
+
   # ops_to_copy now holds a reverse topologically sorted list of ops which
   # ends in the initializer. We copy those to the outermost graph and
   # build the initialization op there.
   with graph.as_default():
-    op_map = {}
+    op_map = {i: i for i in variable_init_tensors}  # Pass through variables.
     source_ops = set()
+    # Add the sources in the same order as the original graph.
+    for s in six.itervalues(captures):
+      if s in sources:
+        sources.remove(s)
+        source_ops.add(s.op)
+        _copy_source(
+            s=s,
+            graph=graph,
+            op_map=op_map,
+            handle_captures=handle_captures,
+            inverse_captures=inverse_captures)
     for s in sources:
       source_ops.add(s.op)
-      op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape)
+      _copy_source(
+          s=s,
+          graph=graph,
+          op_map=op_map,
+          handle_captures=handle_captures,
+          inverse_captures=inverse_captures)
+
     for op in reversed(ops_to_copy):
       if op in source_ops:
         continue
-      copied_inputs = [op_map[x] for x in op.inputs]
-      copied_control_inputs = [op_map[x] for x in op.control_inputs]
-      with ops.control_dependencies(copied_control_inputs):
-        copied_op = graph.create_op(
-            op.type, copied_inputs, [x.dtype for x in op.outputs],
-            attrs=op.node_def.attr)
-      op_map[op] = copied_op
-      for i, o in enumerate(op.outputs):
-        op_map[o] = copied_op.outputs[i]
+
+      _copy_non_source(op=op, graph=graph, op_map=op_map)
+
     return op_map
diff --git a/tensorflow/python/eager/lift_to_graph_test.py b/tensorflow/python/eager/lift_to_graph_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..27bcfd1852818892b6e5a908e041a0f194c2faad
--- /dev/null
+++ b/tensorflow/python/eager/lift_to_graph_test.py
@@ -0,0 +1,56 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lift_to_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import lift_to_graph
+from tensorflow.python.eager import test
+from tensorflow.python.framework import func_graph
+from tensorflow.python.ops import resource_variable_ops
+
+
+class LiftToGraphTest(test.TestCase):
+
+  def testCaptureOrdering(self):
+    v1 = resource_variable_ops.ResourceVariable(1.0)
+    v2 = resource_variable_ops.ResourceVariable(2.0)
+    v3 = resource_variable_ops.ResourceVariable(3.0)
+
+    @def_function.function
+    def fn():
+      return v1 + v2 + v3
+
+    concrete_fn = fn.get_concrete_function()
+    original_captures = concrete_fn.graph.captures
+    outputs = concrete_fn.graph.outputs
+
+    for _ in range(100):
+      g = func_graph.FuncGraph('lifted')
+
+      lift_to_graph.lift_to_graph(
+          outputs, g, add_sources=True, handle_captures=True)
+      lifted_captures = g.captures
+      self.assertLen(lifted_captures, 3)
+      for original_capture, lifted_capture in zip(original_captures.values(),
+                                                  lifted_captures.values()):
+        self.assertEqual(original_capture.name, lifted_capture.name)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index ab4bdaa601d94bee077dd9567fef0415164eb821..0eb0e6cfbeaed23c51102851bbf7ba09e7da5713 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -22,10 +22,10 @@ import weakref
 
 import numpy as np
 
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
+from tensorflow.python.framework import config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -274,9 +274,9 @@ class OpsTest(test_util.TensorFlowTestCase):
     # Temporarily replace the context
     # pylint: disable=protected-access
     del context._context
+    context._context = context.Context()
     try:
-      context._context = context.Context(
-          device_policy=context.DEVICE_PLACEMENT_SILENT)
+      config.set_device_policy('silent')
       cpu_tensor = constant_op.constant(1.0)
       gpu_tensor = cpu_tensor.gpu()
       self.assertAllEqual(cpu_tensor + gpu_tensor, 2.0)
@@ -291,10 +291,10 @@ class OpsTest(test_util.TensorFlowTestCase):
     # Temporarily replace the context
     # pylint: disable=protected-access
     del context._context
+    context._context = context.Context()
     try:
-      context._context = context.Context(
-          device_policy=context.DEVICE_PLACEMENT_SILENT,
-          config=config_pb2.ConfigProto(allow_soft_placement=True))
+      config.set_device_policy('silent')
+      config.set_soft_device_placement(True)
       cpu_tensor = constant_op.constant(1.0)
       result = cpu_tensor + cpu_tensor
       self.assertEqual(result.device,
diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py
index f1d4373085abd86889f2cac48c6f8844d3a58c27..b04749d3fb4d134677b50a154588bca3a89fc85f 100644
--- a/tensorflow/python/eager/profiler.py
+++ b/tensorflow/python/eager/profiler.py
@@ -12,82 +12,178 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Profiler for eager mode."""
+"""TensorFlow 2.0 Profiler for both Eager Mode and Graph Mode.
+
+The profiler has two mode:
+- Programmatic Mode: start(), stop() and Profiler class. It will perform
+                    when calling start() or create Profiler class and will stop
+                    when calling stop() or destroying Profiler class.
+- On-demand Mode: start_profiler_server(). It will perform profiling when
+                  receive profiling request.
+
+NOTE: Only one active profiler session is allowed. Use of simultaneous
+Programmatic Mode and On-demand Mode is undefined and will likely fail.
+
+NOTE: The Keras TensorBoard callback will automatically perform sampled
+profiling. Before enabling customized profiling, set the callback flag
+"profile_batches=[]" to disable automatic sampled profiling.
+customized profiling.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import datetime
+import os
 import threading
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 
 _profiler = None
 _profiler_lock = threading.Lock()
+_run_num = 0
+# This suffix should be kept in sync with kProfileEmptySuffix in
+# tensorflow/core/profiler/rpc/client/capture_profile.cc.
+_EVENT_FILE_SUFFIX = '.profile-empty'
+
+
+class ProfilerAlreadyRunningError(Exception):
+  pass
+
+
+class ProfilerNotRunningError(Exception):
+  pass
 
 
 def start():
   """Start profiling.
 
-  Only one active profiling session is allowed.
-
   Raises:
-    AssertionError: If another profiling session is running.
+    ProfilerAlreadyRunningError: If another profiling session is running.
   """
   global _profiler
-  if _profiler is not None:
-    raise AssertionError('Another profiler is running.')
   with _profiler_lock:
-    _profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle)  # pylint: disable=protected-access
+    if _profiler is not None:
+      raise ProfilerAlreadyRunningError('Another profiler is running.')
+    profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+    if context.default_execution_mode == context.EAGER_MODE:
+      pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+          profiler_context,
+          context.context()._handle)  # pylint: disable=protected-access
+    _profiler = pywrap_tensorflow.TFE_NewProfiler(profiler_context)
+    pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
+    if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
+      logging.warning('Another profiler session is running which is probably '
+                      'created by profiler server. Please avoid using profiler '
+                      'server and profiler APIs at the same time.')
 
 
 def stop():
   """Stop current profiling session and return its result.
 
   Returns:
-    A binary string of tensorflow.tfprof.ProfileProto. User can write the string
-    to file for offline analysis by tfprof command-line tools or graphical user
-    interface.
+    A binary string of tensorflow.tpu.Trace. User can write the string
+    to file for offline analysis by tensorboard.
 
   Raises:
-    AssertionError: If there is no active profiling session.
+    ProfilerNotRunningError: If there is no active profiling session.
   """
   global _profiler
-  if _profiler is None:
-    raise AssertionError('Cannot stop profiling. No profiler is running.')
-  with c_api_util.tf_buffer() as buffer_:
-    pywrap_tensorflow.TFE_ProfilerSerializeToString(
-        context.context()._handle,  # pylint: disable=protected-access
-        _profiler,
-        buffer_)
-    result = pywrap_tensorflow.TF_GetBuffer(buffer_)
+  global _run_num
   with _profiler_lock:
+    if _profiler is None:
+      raise ProfilerNotRunningError(
+          'Cannot stop profiling. No profiler is running.')
+    with c_api_util.tf_buffer() as buffer_:
+      pywrap_tensorflow.TFE_ProfilerSerializeToString(
+          context.context()._handle,  # pylint: disable=protected-access
+          _profiler,
+          buffer_)
+      result = pywrap_tensorflow.TF_GetBuffer(buffer_)
     pywrap_tensorflow.TFE_DeleteProfiler(_profiler)
     _profiler = None
+    _run_num += 1
   return result
 
 
+def maybe_create_event_file(logdir):
+  """Create an empty event file if not already exists.
+
+  This event file indicates that we have a plugins/profile/ directory in the
+  current logdir.
+
+  Args:
+    logdir: log directory.
+  """
+  for file_name in gfile.ListDirectory(logdir):
+    if file_name.endswith(_EVENT_FILE_SUFFIX):
+      return
+  # TODO(b/127330388): Use summary_ops_v2.create_file_writer instead.
+  event_writer = pywrap_tensorflow.EventsWriter(
+      compat.as_bytes(os.path.join(logdir, 'events')))
+  event_writer.InitWithSuffix(compat.as_bytes(_EVENT_FILE_SUFFIX))
+
+
+def save(logdir, result):
+  """Save profile result to TensorBoard logdir.
+
+  Args:
+    logdir: log directory read by TensorBoard.
+    result: profiling result returned by stop().
+  """
+  plugin_dir = os.path.join(
+      logdir, 'plugins', 'profile',
+      datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+  gfile.MakeDirs(plugin_dir)
+  maybe_create_event_file(logdir)
+  with gfile.Open(os.path.join(plugin_dir, 'local.trace'), 'wb') as f:
+    f.write(result)
+
+
+def start_profiler_server(port):
+  """Start a profiler grpc server that listens to given port.
+
+  The profiler server will keep the program running even the training finishes.
+  Please shutdown the server with CTRL-C. It can be used in both eager mode and
+  graph mode. The service defined in
+  tensorflow/core/profiler/profiler_service.proto. Please use
+  tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+  file following https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace
+
+  Args:
+    port: port profiler server listens to.
+  """
+  profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+  if context.default_execution_mode == context.EAGER_MODE:
+    pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+        profiler_context,
+        context.context()._handle)  # pylint: disable=protected-access
+  pywrap_tensorflow.TFE_StartProfilerServer(profiler_context, port)
+  pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
+
+
 class Profiler(object):
   """Context-manager eager profiler api.
 
   Example usage:
   ```python
-  with Profiler("/path/to/save/result"):
+  with Profiler("/path/to/logdir"):
     # do some work
   ```
   """
 
-  def __init__(self, filename):
-    self._filename = filename
+  def __init__(self, logdir):
+    self._logdir = logdir
 
   def __enter__(self):
     start()
 
   def __exit__(self, typ, value, tb):
     result = stop()
-    with gfile.Open(self._filename, 'wb') as f:
-      f.write(result)
-
+    save(self._logdir, result)
diff --git a/tensorflow/python/eager/profiler_client.py b/tensorflow/python/eager/profiler_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f09d8b63419f4f837f74cd59fb1b3083b7d968b
--- /dev/null
+++ b/tensorflow/python/eager/profiler_client.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler client APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors
+
+
+def start_tracing(service_addr,
+                  logdir,
+                  duration_ms,
+                  worker_list='',
+                  include_dataset_ops=True,
+                  num_tracing_attempts=3):
+  """Sending grpc requests to profiler server to perform on-demand profiling.
+
+  Note: This method will block caller thread until receives tracing result.
+
+  Args:
+    service_addr: Address of profiler service e.g. localhost:6009.
+    logdir: Path of TensorBoard log directory e.g. /tmp/tb_log.
+    duration_ms: Duration of tracing or monitoring in ms.
+    worker_list: The list of worker TPUs that we are about to profile in the
+      current session. (TPU only)
+    include_dataset_ops: Set to false to profile longer traces.
+    num_tracing_attempts: Automatically retry N times when no trace event is
+      collected.
+
+  Raises:
+    UnavailableError: If no trace event is collected.
+  """
+  # TODO(fishx): Uses errors.raise_exception_on_not_ok_status instead.
+  if not pywrap_tensorflow.TFE_ProfilerClientStartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts):
+    raise errors.UnavailableError(None, None, 'No trace event is collected.')
diff --git a/tensorflow/python/eager/profiler_test.py b/tensorflow/python/eager/profiler_test.py
index b3fe2efabed01f44502301b94f701355cb2dcc4f..b6c4ec09663c19c269e5f463c2dcf8a77f35fc44 100644
--- a/tensorflow/python/eager/profiler_test.py
+++ b/tensorflow/python/eager/profiler_test.py
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.profiler import tfprof_log_pb2
+import os
+
+from tensorflow.core.profiler import trace_events_pb2
 from tensorflow.python.eager import profiler
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
+from tensorflow.python.platform import gfile
 
 
 class ProfilerTest(test_util.TensorFlowTestCase):
@@ -33,11 +36,29 @@ class ProfilerTest(test_util.TensorFlowTestCase):
     five = constant_op.constant(5)
     product = three * five
     self.assertAllEqual(15, product)
+    with self.assertRaises(profiler.ProfilerAlreadyRunningError):
+      profiler.start()
+
     profile_result = profiler.stop()
-    profile_pb = tfprof_log_pb2.ProfileProto()
+    profile_pb = trace_events_pb2.Trace()
     profile_pb.ParseFromString(profile_result)
     profile_pb_str = '%s' % profile_pb
     self.assertTrue('Mul' in profile_pb_str)
+    with self.assertRaises(profiler.ProfilerNotRunningError):
+      profiler.stop()
+
+  def test_save_profile(self):
+    logdir = self.get_temp_dir()
+    profile_pb = trace_events_pb2.Trace()
+    profile_result = profile_pb.SerializeToString()
+    profiler.save(logdir, profile_result)
+    file_list = gfile.ListDirectory(logdir)
+    self.assertEqual(len(file_list), 2)
+    for file_name in gfile.ListDirectory(logdir):
+      if gfile.IsDirectory(os.path.join(logdir, file_name)):
+        self.assertEqual(file_name, 'plugins')
+      else:
+        self.assertTrue(file_name.endswith('.profile-empty'))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 8d6f212499f80513eeb2a20cee8b2e0d7be21e3f..1db1b23d4c94ad911a2ffbd475134615f370af22 100755
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -231,7 +231,12 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 PyObject* TFE_Py_TensorShapeOnDevice(PyObject* tensor);
 
 // Encodes the object as a tuple that is meant to be used as part of the key
-// for the defun function cache.
-PyObject* TFE_Py_EncodeArg(PyObject*);
+// for the defun function cache.  If `include_tensor_ranks_only` is true,
+// then the encoding only stores tensor ranks, and the key is
+// agnostic to dimension sizes.  Otherwise, full tensor shape encodings are
+// returned.
+PyObject* TFE_Py_EncodeArg(PyObject*, bool include_tensor_ranks_only);
+
+void TFE_Py_EnableInteractivePythonLogging();
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 9ce500bc08e478815f2dbe1d5d5353eefa4f17a8..e31a55f93965c8d5524bc583d0d3017ac4ab58a4 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <cstring>
 #include <thread>
 
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "absl/strings/str_cat.h"
@@ -25,6 +24,7 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/tape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -264,7 +264,8 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
 }
 
 bool SetOpAttrList(
-    TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type,
+    TFE_Context* ctx, TFE_Op* op, const char* key, PyObject* py_list,
+    TF_AttrType type,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
   if (!PySequence_Check(py_list)) {
@@ -369,6 +370,40 @@ bool SetOpAttrList(
     TFE_OpSetAttrShapeList(op, key, dims.get(), num_dims.get(), num_values,
                            status);
     if (TF_GetCode(status) != TF_OK) return false;
+  } else if (type == TF_ATTR_FUNC) {
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
+    for (int i = 0; i < num_values; ++i) {
+      tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));
+      // Allow:
+      // (1) String function name, OR
+      // (2) A Python object with a .name attribute
+      //     (A crude test for being a
+      //     tensorflow.python.framework.function._DefinedFunction)
+      //     (which is what the various "defun" or "Defun" decorators do).
+      // And in the future also allow an object that can encapsulate
+      // the function name and its attribute values.
+      tensorflow::StringPiece func_name;
+      if (!ParseStringValue(key, py_value.get(), status, &func_name)) {
+        PyObject* name_attr = PyObject_GetAttrString(py_value.get(), "name");
+        if (name_attr == nullptr ||
+            !ParseStringValue(key, name_attr, status, &func_name)) {
+          TF_SetStatus(
+              status, TF_INVALID_ARGUMENT,
+              tensorflow::strings::StrCat(
+                  "unable to set function value attribute from a ",
+                  py_value.get()->ob_type->tp_name,
+                  " object. If you think this is an error, please file an "
+                  "issue at "
+                  "https://github.com/tensorflow/tensorflow/issues/new")
+                  .c_str());
+          return false;
+        }
+      }
+      funcs[i] = TFE_NewOp(ctx, func_name.data(), status);
+      if (TF_GetCode(status) != TF_OK) return false;
+    }
+    TFE_OpSetAttrFunctionList(op, key, funcs.get(), num_values);
+    if (TF_GetCode(status) != TF_OK) return false;
   } else {
     TF_SetStatus(status, TF_UNIMPLEMENTED,
                  tensorflow::strings::StrCat("Attr ", key,
@@ -619,7 +654,8 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
     const TF_AttrType type = TFE_OpGetAttrType(op, key, &is_list, out_status);
     if (TF_GetCode(out_status) != TF_OK) return;
     if (is_list != 0) {
-      if (!SetOpAttrList(op, key, py_value, type, nullptr, out_status)) return;
+      if (!SetOpAttrList(ctx, op, key, py_value, type, nullptr, out_status))
+        return;
     } else {
       if (!SetOpAttrScalar(ctx, op, key, py_value, type, nullptr, out_status))
         return;
@@ -649,7 +685,8 @@ void SetOpAttrWithDefaults(
     }
   } else {
     if (is_list != 0) {
-      SetOpAttrList(op, attr_name, attr_value, type, attr_list_sizes, status);
+      SetOpAttrList(ctx, op, attr_name, attr_value, type, attr_list_sizes,
+                    status);
     } else {
       SetOpAttrScalar(ctx, op, attr_name, attr_value, type, attr_list_sizes,
                       status);
@@ -835,15 +872,15 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status,
 }
 
 const char* TFE_GetPythonString(PyObject* o) {
+#if PY_MAJOR_VERSION >= 3
   if (PyBytes_Check(o)) {
     return PyBytes_AsString(o);
-  }
-#if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(o)) {
+  } else {
     return PyUnicode_AsUTF8(o);
   }
+#else
+  return PyBytes_AsString(o);
 #endif
-  return nullptr;
 }
 
 int64_t get_uid() {
@@ -1011,8 +1048,18 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   void MarkAsResult(PyObject* gradient) const final { Py_INCREF(gradient); }
 
   PyObject* Zeros(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_dtype = tensor.GetDType();
+    if (PyErr_Occurred()) {
+      Py_DECREF(py_shape);
+      return nullptr;
+    }
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
     PyObject* result = PyEval_CallObject(zeros_fn_, arg_list);
     Py_DECREF(arg_list);
@@ -1022,6 +1069,9 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   }
 
   PyObject* Ones(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
     PyObject* py_dtype = tensor.GetDType();
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
@@ -1903,8 +1953,6 @@ bool OpGradientDoesntRequireOutputIndices(
           {"SparseSegmentSum", {true, {}}},
           {"SparseSegmentMean", {true, {}}},
           {"SparseSegmentSqrtN", {true, {}}},
-          {"SegmentMin", {true, {}}},
-          {"SegmentMax", {true, {}}},
           {"UnsortedSegmentSum", {true, {}}},
           {"UnsortedSegmentMax", {true, {}}},
           {"Abs", {true, {}}},
@@ -2086,6 +2134,9 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
         PyBackwardFunction* function =
             new PyBackwardFunction([op_name, attrs, num_inputs, op_inputs,
                                     op_outputs](PyObject* output_grads) {
+              if (PyErr_Occurred()) {
+                return static_cast<PyObject*>(nullptr);
+              }
               tensorflow::Safe_PyObjectPtr callback_args(
                   Py_BuildValue("OOOOOO", op_name, attrs, num_inputs, op_inputs,
                                 op_outputs, output_grads));
@@ -2188,31 +2239,9 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
   TFE_Execute(op, &output_handle, &num_retvals, status);
   if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
 
-  if (!PyObject_HasAttrString(input, "_read_dtype")) {
-    // Always create the py object (and correctly DECREF it) from the returned
-    // value, else the data will leak.
-    output->reset(EagerTensorFromHandle(output_handle));
-  } else {
-    // This is a _MixedPrecisionVariable which potentially does casting when
-    // being read.
-    tensorflow::Safe_PyObjectPtr read_dtype(
-        PyObject_GetAttrString(input, "_read_dtype"));
-    int desired_dtype = -1;
-    if (!ParseTypeValue("_read_dtype", read_dtype.get(), status,
-                        &desired_dtype)) {
-      return false;
-    }
-
-    auto safe_output_handle = tensorflow::make_safe(output_handle);
-    // Retires output_handle in the future.
-    output_handle = nullptr;
-    if (!CastTensor(parent_op_exec_info,
-                    static_cast<TF_DataType>(desired_dtype),
-                    &safe_output_handle, status)) {
-      return false;
-    }
-    output->reset(EagerTensorFromHandle(safe_output_handle.release()));
-  }
+  // Always create the py object (and correctly DECREF it) from the returned
+  // value, else the data will leak.
+  output->reset(EagerTensorFromHandle(output_handle));
 
   // TODO(nareshmodi): Should we run post exec callbacks here?
   if (parent_op_exec_info.run_gradient_callback) {
@@ -2411,14 +2440,14 @@ bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
 
 bool RunCallbacks(
     const FastPathOpExecInfo& op_exec_info, PyObject* args,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_inputs,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_attrs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_inputs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_attrs,
     PyObject* flattened_result) {
   if (!op_exec_info.run_callbacks) return true;
 
-  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs.size()));
-  for (int i = 0; i < flattened_inputs.size(); i++) {
-    PyObject* input = flattened_inputs[i].get();
+  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs->size()));
+  for (int i = 0; i < flattened_inputs->size(); i++) {
+    PyObject* input = (*flattened_inputs)[i].get();
     Py_INCREF(input);
     PyTuple_SET_ITEM(inputs.get(), i, input);
   }
@@ -2426,7 +2455,7 @@ bool RunCallbacks(
   int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
                                op_exec_info.op_def->input_arg_size() -
                                kFastPathExecuteInputStartIndex;
-  int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
+  int num_attrs = flattened_attrs->size() + num_non_inferred_attrs;
   tensorflow::Safe_PyObjectPtr attrs(PyTuple_New(num_attrs));
 
   for (int i = 0; i < num_non_inferred_attrs; i++) {
@@ -2438,7 +2467,7 @@ bool RunCallbacks(
   }
   for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
     PyObject* attr_or_name =
-        flattened_attrs.at(i - num_non_inferred_attrs).get();
+        flattened_attrs->at(i - num_non_inferred_attrs).get();
     Py_INCREF(attr_or_name);
     PyTuple_SET_ITEM(attrs.get(), i, attr_or_name);
   }
@@ -2676,9 +2705,10 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       for (Py_ssize_t j = 0; j < len; j++) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
-        if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             []() { Py_RETURN_NONE; },
-                             [](const TF_DataType& dtype) {}, status)) {
+        if (!ConvertToTensor(
+                op_exec_info, py_input, &py_eager_tensor,
+                []() { Py_RETURN_NONE; }, [](const TF_DataType& dtype) {},
+                status)) {
           return nullptr;
         }
 
@@ -2757,8 +2787,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     PyList_SET_ITEM(flat_result.get(), i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (!RunCallbacks(op_exec_info, args, *flattened_inputs, *flattened_attrs,
-                    flat_result.get())) {
+  if (!RunCallbacks(op_exec_info, args, flattened_inputs.get(),
+                    flattened_attrs.get(), flat_result.get())) {
     return nullptr;
   }
 
@@ -2823,10 +2853,13 @@ namespace {
 const char kTensor[] = "T";
 const char kIndexedSlices[] = "I";
 const char kList[] = "L";
+const char kListEnd[] = "l";
 const char kTuple[] = "U";
+const char kTupleEnd[] = "u";
 const char kDict[] = "D";
 const char kRaw[] = "R";
 const char kShape[] = "s";
+const char kShapeDelim[] = "-";
 const char kDType[] = "d";
 const char kNone[] = "n";
 
@@ -2856,7 +2889,9 @@ struct EncodeResult {
   }
 };
 
-tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg,
+                                       bool include_tensor_ranks_only,
+                                       EncodeResult* result) {
   if (EagerTensor_CheckExact(arg)) {
     TFE_TensorHandle* t = EagerTensor_Handle(arg);
     tensorflow::TensorShape tensor_shape;
@@ -2865,10 +2900,13 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
     absl::StrAppend(&result->str, kDType, t->handle->dtype);
 
     absl::StrAppend(&result->str, kShape);
-    for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
-      absl::StrAppend(&result->str, dim_size);
+    if (include_tensor_ranks_only) {
+      absl::StrAppend(&result->str, tensor_shape.dim_sizes().size());
+    } else {
+      for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
+        absl::StrAppend(&result->str, dim_size, kShapeDelim);
+      }
     }
-
     return tensorflow::Status::OK();
   }
 
@@ -2892,6 +2930,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       static_cast<tensorflow::DataType>(MakeInt(dtype_enum.get()));
 
   absl::StrAppend(&result->str, kDType, dtype);
+
   static char _shape_tuple[] = "_shape_tuple";
   tensorflow::Safe_PyObjectPtr shape_tuple(
       PyObject_CallMethod(arg, _shape_tuple, nullptr));
@@ -2912,22 +2951,30 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       shape_tuple.get(), "shape_tuple didn't return a sequence"));
 
   int len = PySequence_Fast_GET_SIZE(shape_seq.get());
-  for (int i = 0; i < len; ++i) {
-    PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
-    if (item == Py_None) {
-      absl::StrAppend(&result->str, kNone);
-    } else {
-      absl::StrAppend(&result->str, MakeInt(item));
+
+  if (include_tensor_ranks_only) {
+    absl::StrAppend(&result->str, len);
+  } else {
+    for (int i = 0; i < len; ++i) {
+      PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
+      if (item == Py_None) {
+        absl::StrAppend(&result->str, kNone);
+      } else {
+        absl::StrAppend(&result->str, MakeInt(item));
+      }
     }
   }
-
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result);
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result);
 
 // This function doesn't set the type of sequence before
 tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
+                                         const char* end_type,
+                                         bool include_tensor_ranks_only,
                                          EncodeResult* result) {
   tensorflow::Safe_PyObjectPtr arg_seq(
       PySequence_Fast(arg, "unable to create seq from list/tuple"));
@@ -2939,17 +2986,22 @@ tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
     if (item == Py_None) {
       absl::StrAppend(&result->str, kNone);
     } else {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(item, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(item, include_tensor_ranks_only, result));
     }
   }
+  absl::StrAppend(&result->str, end_type);
 
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result) {
   if (tensorflow::swig::IsTensor(arg)) {
     absl::StrAppend(&result->str, kTensor);
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(arg, result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(arg, include_tensor_ranks_only, result));
   } else if (tensorflow::swig::IsIndexedSlices(arg)) {
     absl::StrAppend(&result->str, kIndexedSlices);
     tensorflow::Safe_PyObjectPtr values(PyObject_GetAttrString(arg, "values"));
@@ -2958,7 +3010,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a values attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(values.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(values.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr indices(
         PyObject_GetAttrString(arg, "indices"));
@@ -2967,7 +3020,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a indices attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(indices.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(indices.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr dense_shape(
         PyObject_GetAttrString(arg, "dense_shape"));
@@ -2977,12 +3031,15 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
           "IndexedSlices does not have a dense_shape attr");
     }
     if (dense_shape.get() != Py_None) {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(dense_shape.get(), result));
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(
+          dense_shape.get(), include_tensor_ranks_only, result));
     }
   } else if (PyList_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kList, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kList, kListEnd, include_tensor_ranks_only, result));
   } else if (PyTuple_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kTuple, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kTuple, kTupleEnd, include_tensor_ranks_only, result));
   } else if (PyDict_Check(arg)) {
     tensorflow::Safe_PyObjectPtr keys(PyDict_Keys(arg));
     if (PyList_Sort(keys.get()) == -1) {
@@ -2994,9 +3051,11 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 
     for (int i = 0; i < len; i++) {
       PyObject* key = PyList_GetItem(keys.get(), i);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(key, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(key, include_tensor_ranks_only, result));
       PyObject* value = PyDict_GetItem(arg, key);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(value, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(value, include_tensor_ranks_only, result));
     }
   } else {
     PyObject* object = PyWeakref_NewRef(arg, nullptr);
@@ -3023,13 +3082,51 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 // on known shapes to produce slimmer graphs, and correctness, as some
 // high-level APIs require shapes to be fully-known.
 //
+// `include_tensor_ranks_only` allows caching on arguments excluding shape info,
+// so that a slow path using relaxed shape can rely on a cache key that excludes
+// shapes.
+//
 // TODO(nareshmodi): Add support for sparse tensors.
-PyObject* TFE_Py_EncodeArg(PyObject* arg) {
+PyObject* TFE_Py_EncodeArg(PyObject* arg, bool include_tensor_ranks_only) {
   EncodeResult result;
-  const auto status = TFE_Py_EncodeArgHelper(arg, &result);
+  const auto status =
+      TFE_Py_EncodeArgHelper(arg, include_tensor_ranks_only, &result);
   if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
     return nullptr;
   }
 
   return result.ToPyTuple();
 }
+
+// A method prints incoming messages directly to Python's
+// stdout using Python's C API. This is necessary in Jupyter notebooks
+// and colabs where messages to the C stdout don't go to the notebook
+// cell outputs, but calls to Python's stdout do.
+void PrintToPythonStdout(const char* msg) {
+  if (Py_IsInitialized()) {
+    PyGILState_STATE py_threadstate;
+    py_threadstate = PyGILState_Ensure();
+
+    string string_msg = msg;
+    // PySys_WriteStdout truncates strings over 1000 bytes, so
+    // we write the message in chunks small enough to not be truncated.
+    int CHUNK_SIZE = 900;
+    auto len = string_msg.length();
+    for (int i = 0; i < len; i += CHUNK_SIZE) {
+      PySys_WriteStdout("%s", string_msg.substr(i, CHUNK_SIZE).c_str());
+    }
+    PySys_WriteStdout("\n");
+
+    PyGILState_Release(py_threadstate);
+  }
+}
+
+// Register PrintToPythonStdout as a log listener, to allow
+// printing in colabs and jupyter notebooks to work.
+void TFE_Py_EnableInteractivePythonLogging() {
+  static bool enabled_interactive_logging = false;
+  if (!enabled_interactive_logging) {
+    enabled_interactive_logging = true;
+    TF_RegisterLogListener(PrintToPythonStdout);
+  }
+}
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 2abd4678af5bf4ec6d5a8fabb72473067e06708f..8204473745e3ada6cdb6c8269847ef58fb29475b 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -22,6 +22,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -71,25 +72,6 @@ class Tests(test.TestCase):
 
     self.assertAllEqual(x, y)
 
-  @test_util.assert_no_new_tensors
-  @test_util.assert_no_garbage_created
-  def testFastpathExecute_MixedPrecisionVariableMatMulCorrectResponse(self):
-    ctx = context.context()
-    a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
-    a_2_by_2_fp16 = math_ops.cast(a_2_by_2, dtype=dtypes.float16)
-    m = resource_variable_ops.ResourceVariable(a_2_by_2)
-    m = resource_variable_ops._MixedPrecisionVariable(
-        m, read_dtype=dtypes.float16)
-    x = pywrap_tensorflow.TFE_Py_FastPathExecute(
-        ctx._handle, ctx.device_name, "MatMul", None, None, m, m, "transpose_a",
-        False, "transpose_b", False)
-    y = pywrap_tensorflow.TFE_Py_FastPathExecute(
-        ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2_fp16,
-        a_2_by_2_fp16, "transpose_a", False, "transpose_b", False)
-
-    self.assertEqual(x.dtype, dtypes.float16)
-    self.assertAllEqual(x, y)
-
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
   def testFastpathExecute_TapeWrite(self):
@@ -119,29 +101,6 @@ class Tests(test.TestCase):
     self.assertAllEqual(dz_dy.numpy(),
                         constant_op.constant(4.0, shape=[2, 2]).numpy())
 
-  @test_util.assert_no_new_tensors
-  @test_util.assert_no_garbage_created
-  def testFastpathExecute_MixedPrecisionVariableTapeWrite(self):
-    ctx = context.context()
-    with backprop.GradientTape(persistent=True) as tape:
-      a_2_by_2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]],
-                                      dtype=dtypes.float32)
-      a_2_by_2_fp16 = math_ops.cast(a_2_by_2, dtype=dtypes.float16)
-      m1 = resource_variable_ops.ResourceVariable(a_2_by_2)
-      m2 = resource_variable_ops._MixedPrecisionVariable(
-          m1, read_dtype=dtypes.float16)
-      tape.watch(m2)
-      z = pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2_fp16, m2,
-          "transpose_a", False, "transpose_b", False)
-    dz_dy = tape.gradient(z, [m2])[0]
-    self.assertEqual(dz_dy.dtype, dtypes.float16)
-
-    expected_grads = math_ops.matmul(
-        array_ops.transpose(a_2_by_2_fp16),
-        constant_op.constant(1., shape=[2, 2], dtype=dtypes.float16)).numpy()
-    self.assertAllEqual(dz_dy.numpy(), expected_grads)
-
   # Tests homogeneous list op
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
@@ -259,6 +218,16 @@ class Tests(test.TestCase):
         "Value for attr 'num_split' of 0 must be at least minimum 1"):
       array_ops.split(value=[1, 2, 3], num_or_size_splits=0)
 
+  def testIsFunction(self):
+    ctx = context.context()
+    self.assertFalse(ctx.has_function("not_a_function"))
+
+    @def_function.function
+    def f():
+      return 1.
+
+    self.assertTrue(ctx.has_function(f.get_concrete_function().name))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/remote.py b/tensorflow/python/eager/remote.py
index fdea95fa8038c7ce63257d5651f1ccd6fc3de3bd..5a01e3da1aedc09c89949f14c876fe429eeae5fc 100644
--- a/tensorflow/python/eager/remote.py
+++ b/tensorflow/python/eager/remote.py
@@ -58,6 +58,11 @@ def connect_to_remote_host(remote_host=None, job_name="worker"):
   """
   if remote_host is None:
     raise ValueError("Must provide an remote_host")
+
+  grpc_prefix = "grpc://"
+  if remote_host.startswith(grpc_prefix):
+    remote_host = remote_host[len(grpc_prefix):]
+
   cluster_def = ClusterDef()
   job_def = cluster_def.job.add()
   job_def.name = job_name
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0ee2ff68c209aa13aaeb32be610302c11616b9d7..23fb983767beb377e499b7eb4c5fd2c435a37b88 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -66,11 +66,11 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     device = ctx.device_name
     # Missing context.
     with self.assertRaisesRegexp(
-        TypeError, r"Required argument 'context' \(pos 2\) not found"):
+        TypeError, r".*argument 'context' \(pos 2\).*"):
       ops.EagerTensor(1, device=device)
     # Missing device.
     with self.assertRaisesRegexp(
-        TypeError, r"Required argument 'device' \(pos 3\) not found"):
+        TypeError, r".*argument 'device' \(pos 3\).*"):
       ops.EagerTensor(1, context=handle)
     # Bad dtype type.
     with self.assertRaisesRegexp(TypeError,
@@ -339,6 +339,24 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def testConvertToTensorAllowsOverflow(self):
     _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyZeroDim(self):
+    for np_type, dtype in [(np.int32, dtypes.int32),
+                           (np.half, dtypes.half),
+                           (np.float32, dtypes.float32)]:
+      x = ops.convert_to_tensor([np.array(65, dtype=np_type),
+                                 np.array(16, dtype=np_type)])
+      self.assertEqual(x.dtype, dtype)
+      self.assertAllEqual(x, [65, 16])
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyScalar(self):
+    x = ops.convert_to_tensor([np.asscalar(np.array(321, dtype=np.int)),
+                               np.asscalar(np.array(16, dtype=np.int))])
+    self.assertAllEqual(x, [321, 16])
+
   def testEagerTensorError(self):
     with self.assertRaisesRegexp(
         TypeError,
@@ -347,7 +365,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
       _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
 
 
-
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
   def testListOfThree(self):
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 61b48768302751341cd1017eed8fdbb46089669d..b4ece94848c1f4b5c0efad36f522ee2d3b3b2b23 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -19,12 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -33,21 +37,40 @@ from tensorflow.python.util.tf_export import tf_export
 class VariableHolder(object):
   """Holds variables for a python function."""
 
-  def __init__(self, fn):
+  def __init__(self, fn=None, share_variables=False):
     self._fn = fn
+
     self._variables = []
 
+    self._share_variables = share_variables
+    self._variables_by_name = {}
+
+  @property
+  def variables(self):
+    return self._variables
+
   def variable_creator_scope(self, next_creator, **kwargs):
     """Creates variables & adds them to collections to match legacy code."""
-    v = next_creator(**kwargs)
-    self._variables.append(v)
+    collections = kwargs.pop("collections", None)
+    v = None
 
-    collections = kwargs.get("collections")
-    trainable = v.trainable
+    # Get expected variable name.
+    name = kwargs.get("name", None)
+    with ops.name_scope(name, "Variable") as name_scope:
+      name = name_scope
+
+    if self._share_variables:
+      v = self._variables_by_name.get(name, None)
+
+    if v is None:
+      v = next_creator(**kwargs)
+      self._variables.append(v)
+      if self._share_variables:
+        self._variables_by_name[name] = v
 
     if collections is None:
       collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-    if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
+    if v.trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
 
     ops.add_to_collections(collections, v)
@@ -55,11 +78,16 @@ class VariableHolder(object):
     return v
 
   def __call__(self, *args, **kwargs):
-    with variable_scope.variable_creator_scope(self.variable_creator_scope):
-      return self._fn(*args, **kwargs)
+    return self.call_with_variable_creator_scope(self._fn)(*args, **kwargs)
+
+  def call_with_variable_creator_scope(self, fn):
+    def wrapped(*args, **kwargs):
+      with variable_scope.variable_creator_scope(self.variable_creator_scope):
+        return fn(*args, **kwargs)
+    return wrapped
 
 
-# TODO(allenl): make this checkpointable
+# TODO(allenl): make this trackable
 class WrappedFunction(function.ConcreteFunction):
   """Wraps a tf V1 piece of code in a function."""
 
@@ -67,8 +95,57 @@ class WrappedFunction(function.ConcreteFunction):
     super(WrappedFunction, self).__init__(
         fn_graph, attrs=attrs, signature=signature)
     self._variable_holder = variable_holder
-
-  def prune(self, feeds, fetches):
+    if ops.executing_eagerly_outside_functions():
+      # TODO(allenl): Make this work in 1.x?
+      self._lift_unlifted_variables()
+
+  def _lift_unlifted_variables(self):
+    """Finds resource variables and lifts them into the outer context.
+
+    When we import a GraphDef inside a wrap_function, no Python graph building
+    code runs. This means we get VarHandleOps which create variable resources,
+    but no corresponding Python objects. Leaving them like this works but gives
+    the user no way to interact with or modify the variables outside the graph.
+
+    This method searches for variables and lifts them out as regular variable
+    objects when possible, indicating to the FuncGraph that they are captures.
+    """
+    with self.graph.as_default():
+      collection_variables = (
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+          + ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+      existing_captures = set(self.graph.internal_captures)
+      lifted_variables = {}
+      for old_variable in collection_variables:
+        if (old_variable._in_graph_mode  # pylint: disable=protected-access
+            and isinstance(old_variable,
+                           resource_variable_ops.ResourceVariable)):
+          if old_variable.handle in existing_captures:
+            continue
+          new_variable = def_function.UnliftedInitializerVariable(
+              array_ops.placeholder(
+                  name="unused_{}_initializer".format(old_variable.op.name),
+                  shape=old_variable.shape,
+                  dtype=old_variable.dtype),
+              name=old_variable.op.name,
+              trainable=old_variable.trainable)
+          self.graph.captures[new_variable.handle] = old_variable.handle
+          existing_captures.add(old_variable.handle)
+          lifted_variables[old_variable] = new_variable
+          # pylint: disable=protected-access
+          self._variable_holder._variables.append(new_variable)
+          self.graph._weak_variables.append(weakref.ref(new_variable))
+          # pylint: enable=protected-access
+      # Update the graph's collections, partly for the user and partly so this
+      # function is idempotent when it runs again in prune() calls.
+      for collection_name in [ops.GraphKeys.GLOBAL_VARIABLES,
+                              ops.GraphKeys.LOCAL_VARIABLES]:
+        mutable_collection = ops.get_collection_ref(collection_name)
+        for index, current in enumerate(mutable_collection):
+          mutable_collection[index] = lifted_variables.get(current, current)
+
+  def prune(self, feeds, fetches, name=None):
+    name = name or "pruned"
     flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
     for f in flat_feeds:
       if not isinstance(f, ops.Tensor):
@@ -96,22 +173,23 @@ class WrappedFunction(function.ConcreteFunction):
             "are from this graph (%s). Tensor %s from graph %s" % (
                 self._func_graph, f, f.graph))
     with self._func_graph.as_default():
-      pruned_graph = func_graph.FuncGraph("pruned")
+      pruned_graph = func_graph.FuncGraph(name)
       with ops.control_dependencies(operation_fetches):
         if tensor_fetches:
           identity_fetches = array_ops.identity_n(tensor_fetches)
           sink_tensor = identity_fetches[0]
         else:
           identity_fetches = []
-          sink_tensor = control_flow_ops.no_op()
+          sink_tensor = array_ops.zeros([])
     lift_map = lift_to_graph.lift_to_graph(
-        sink_tensor, pruned_graph,
-        sources=flat_feeds + internal_captures)
+        [sink_tensor], pruned_graph, sources=flat_feeds + internal_captures)
     for original_fetch, identity_fetch in zip(
         tensor_fetches, identity_fetches):
       lift_map[original_fetch] = lift_map[identity_fetch]
     pruned_graph.outputs.extend(
         lift_map[x] for x in flat_fetches if isinstance(x, ops.Tensor))
+    if not tensor_fetches:
+      pruned_graph.outputs.append(lift_map[sink_tensor])
     for external_capture, internal_capture in self.graph.captures.items():
       pruned_graph.captures[external_capture] = lift_map[internal_capture]
     pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
@@ -134,6 +212,89 @@ class WrappedFunction(function.ConcreteFunction):
     return pruned_fn
 
 
+class WrappedGraph(object):
+  """Class for wrapping multiple TF 1.X functions in a single graph.
+
+  Maintains a dictionary mapping names to wrapped functions. See
+  `tf.compat.v1.wrap_function` to learn more about wrapping V1 functions.
+
+  Functions wrapped using this class have access to variables and collections
+  created in other wrapped functions, using the standard TF 1.X API (
+  `tf.compat.v1.get_variable` or
+  `tf.compat.v1.get_default_graph().get_collection(...)`)
+
+  Outside a function, variables and collections may be accessed using the
+  `variables` and `graph` properties.
+
+  Example:
+
+  ```
+  def add_v1(x):
+    with tf.compat.v1.variable_scope('vars', reuse=tf.AUTO_REUSE):
+      v = tf.compat.v1.get_variable('v', shape=[], dtype=tf.int32)
+    return v + x
+
+  def increment_var_v1(x):
+    with tf.compat.v1.variable_scope('vars', reuse=tf.AUTO_REUSE):
+      v = tf.compat.v1.get_variable('v', shape=[], dtype=tf.int32)
+    return v.assign_add(x)
+
+  g = WrappedGraph()
+  add = g.wrap_function(add_v1, [tf.TensorSpec([], tf.int32)])
+  increment_var = g.wrap_function(increment_var_v1,
+                                  [tf.TensorSpec([], tf.int32)])
+
+  assert len(g.variables) == 1
+  assert g.variables[0].numpy() == 0
+  increment_var(tf.constant(5))
+  assert g.variables[0].numpy() == 5
+
+  ```
+  """
+
+  def __init__(self, variable_holder=None, **kwargs):
+    self._variable_holder = (
+        variable_holder or VariableHolder(share_variables=True))
+
+    name = kwargs.pop("name", "wrapped_function_graph")
+    # Always start with empty collections, unless otherwise specified. Setting
+    # `collections=None` will copy the collections from the outer graph.
+    collections = kwargs.pop("collections", {})
+    self.graph = func_graph.FuncGraph(name, collections=collections, **kwargs)
+
+    self._wrapped_function = WrappedFunction(self.graph, self._variable_holder)
+    self._functions = {}
+
+  @property
+  def functions(self):
+    return self._functions
+
+  @property
+  def variables(self):
+    return self._variable_holder.variables
+
+  def wrap_function(self, fn, signature, name=None):
+    """Wrap a TF 1.X function and save to functions dictionary."""
+    func_graph.func_graph_from_py_func(
+        None,  # Name is unused.
+        self._variable_holder.call_with_variable_creator_scope(fn),
+        args=None, kwargs=None, signature=signature,
+        add_control_dependencies=False,
+        func_graph=self.graph)
+
+    # This code relies on questional behavior from `func_graph_from_py_func`.
+    # If an existing FuncGraph is passed into the `func_graph` arg, the inputs
+    # and structured outputs are overwritten. Pretty sure this is a bug,
+    # because structured outputs doesn't match up with the outputs...
+    fn_inputs = self.graph.inputs[:-len(self.graph.captures)]
+    fn_outputs = self.graph.structured_outputs
+
+    wrapped_function = self._wrapped_function.prune(fn_inputs, fn_outputs)
+    name = name or fn.__name__
+    self._functions[name] = wrapped_function
+    return wrapped_function
+
+
 @tf_export(v1=["wrap_function"])
 def wrap_function(fn, signature, name=None):
   """Wraps the TF 1.x function fn into a graph function.
@@ -192,12 +353,38 @@ def wrap_function(fn, signature, name=None):
     the wrapped graph function.
   """
   holder = VariableHolder(fn)
+  func_graph_name = "wrapped_function"
+  if name is not None:
+    func_graph_name = "wrapped_function_" + name
   return WrappedFunction(
       func_graph.func_graph_from_py_func(
-          name,
+          func_graph_name,
           holder,
           args=None, kwargs=None, signature=signature,
           add_control_dependencies=False,
           collections={}),
       variable_holder=holder,
       signature=signature)
+
+
+def function_from_graph_def(graph_def, inputs, outputs):
+  """Creates a ConcreteFunction from a GraphDef.
+
+  Args:
+    graph_def: A GraphDef to make a function out of.
+    inputs: A Tensor name or nested structure of names in `graph_def` which
+      should be inputs to the function.
+    outputs: A Tensor name or nested structure of names in `graph_def` which
+      should be outputs of the function.
+
+  Returns:
+    A ConcreteFunction.
+  """
+  def _imports_graph_def():
+    importer.import_graph_def(graph_def, name="")
+
+  wrapped_import = wrap_function(_imports_graph_def, [])
+  import_graph = wrapped_import.graph
+  return wrapped_import.prune(
+      nest.map_structure(import_graph.as_graph_element, inputs),
+      nest.map_structure(import_graph.as_graph_element, outputs))
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index a6e1931fcdac796fe5851211f8aae4b21c7ed83b..fa3d5823d9d1795ba2b972921b474ddc886d00c6 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -19,12 +19,15 @@ from __future__ import print_function
 
 
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -206,7 +209,7 @@ class WrapFunctionTest(test.TestCase):
         fetches=(f_wrapped.graph.get_operation_by_name('increment'),
                  f_wrapped.graph.get_tensor_by_name('other:0')))
     first_output, second_output = increments(constant_op.constant(2))
-    self.assertEqual(['Placeholder:0', 'Placeholder_1:0'],
+    self.assertEqual(['step:0', 'increment/resource:0'],
                      [t.name for t in increments.inputs])
     self.assertIs(None, first_output)
     self.assertEqual(1, second_output.numpy())
@@ -217,6 +220,192 @@ class WrapFunctionTest(test.TestCase):
     self.assertEqual(1, does_not_increment(constant_op.constant(3)).numpy())
     self.assertEqual(3, v.numpy())
 
+  def testPruneStatefulOpsFromWrappedFunc(self):
+
+    v0 = variables.Variable(0)
+    v1 = variables.Variable(0)
+
+    # When we wrap a function, we expect it to be executed with 'tf.Graph`
+    # rules: it's allowed to prune all ops that are not in transitive fanin of
+    # the fetches.
+    def f(x):
+      v0.assign_add(1, name='increment_v0')
+      v1.assign_add(1, name='increment_v1')
+      return x
+
+    f_wrapped = wrap_function.wrap_function(f, [1])
+
+    self.assertEqual(1, f_wrapped().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
+    f_wrapped_with_name = wrap_function.wrap_function(f, [2], name='func')
+
+    self.assertEqual(2, f_wrapped_with_name().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
+  def test_function_from_graph_def(self):
+    @def_function.function
+    def make_graph_def(x):
+      return x + 1.
+
+    original_func_graph = make_graph_def.get_concrete_function(
+        tensor_spec.TensorSpec([None, 2], dtypes.float32)).graph
+    graph_def = original_func_graph.as_graph_def()
+    revived_function = wrap_function.function_from_graph_def(
+        graph_def, inputs=original_func_graph.inputs[0].name,
+        outputs=original_func_graph.outputs[0].name)
+    self.assertEqual(2., revived_function(constant_op.constant(1.)).numpy())
+
+
+class WrappedGraphTest(test.TestCase):
+
+  def testAddFunction(self):
+
+    def fn(x):
+      v = variables.Variable(3, name='v')
+      v2 = variable_scope.get_variable(
+          'v', initializer=init_ops.Constant(4), shape=[], dtype=dtypes.int32)
+      return v + v2 + x
+
+    with self.cached_session() as sess:
+      result = fn(constant_op.constant(5))
+      sess.run(variables.global_variables_initializer())
+      expected = sess.run(result)
+
+    g = wrap_function.WrappedGraph()
+    signature = [tensor_spec.TensorSpec([], dtypes.int32)]
+    wrapped_fn = g.wrap_function(fn, signature)
+    self.assertEqual(expected, wrapped_fn(constant_op.constant(5)).numpy())
+
+  def testCollections(self):
+
+    def fn(x):
+      v = variables.VariableV1(3, name='v', trainable=False, collections=['a'])
+      v2 = variable_scope.get_variable(
+          'v', initializer=init_ops.Constant(4), shape=[], dtype=dtypes.int32,
+          collections=['a', 'b'])
+      return v + v2 + x
+
+    def assert_collections(graph):
+      self.assertLen(graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES), 1)
+      self.assertLen(graph.get_collection('a'), 2)
+      self.assertLen(graph.get_collection('b'), 1)
+
+    g = wrap_function.WrappedGraph()
+    g.wrap_function(fn, [tensor_spec.TensorSpec([], dtypes.int32)])
+    assert_collections(g.graph)
+
+    def assert_fn():
+      assert_collections(ops.get_default_graph())
+      return 1  # Return is required
+
+    # Assert that collections are accessible within a wrapped function.
+    g.wrap_function(assert_fn, [])
+
+  def testShareVariablesSameGraph(self):
+
+    def add_v1(x):
+      with variable_scope.variable_scope(
+          'reuse', reuse=variable_scope.AUTO_REUSE):
+        v = variable_scope.get_variable(
+            'v', initializer=init_ops.Constant(3), shape=[], dtype=dtypes.int32)
+      return v + x
+
+    def subtract_v1(x):
+      with variable_scope.variable_scope(
+          'reuse', reuse=variable_scope.AUTO_REUSE):
+        v = variable_scope.get_variable(
+            'v', initializer=init_ops.Constant(4), shape=[], dtype=dtypes.int32)
+      return v - x
+
+    def different_variable_fn_v1(x):
+      with variable_scope.variable_scope(
+          'no_reuse', reuse=variable_scope.AUTO_REUSE):
+        v = variable_scope.get_variable(
+            'v', initializer=init_ops.Constant(5), shape=[], dtype=dtypes.int32)
+      return v * x
+
+    def increment_variable_v1(x):
+      with variable_scope.variable_scope(
+          'reuse', reuse=variable_scope.AUTO_REUSE):
+        v = variable_scope.get_variable(
+            'v', initializer=init_ops.Constant(6), shape=[], dtype=dtypes.int32)
+      return v.assign_add(x)
+
+    g = wrap_function.WrappedGraph()
+    signature = [tensor_spec.TensorSpec([], dtypes.int32)]
+    add = g.wrap_function(add_v1, signature)
+    subtract = g.wrap_function(subtract_v1, signature)
+    different_variable_fn = g.wrap_function(different_variable_fn_v1, signature)
+    increment_variable = g.wrap_function(increment_variable_v1, signature)
+
+    self.assertEqual(10, add(constant_op.constant(7)).numpy())
+    self.assertEqual(35, different_variable_fn(constant_op.constant(7)).numpy())
+
+    # The shared variable has a starting value of 3 because add_v1 was wrapped
+    # first.
+    self.assertEqual(-4, subtract(constant_op.constant(7)).numpy())
+    self.assertEqual(10, increment_variable(constant_op.constant(7)).numpy())
+
+    # Check that variable updates
+    self.assertEqual(17, add(constant_op.constant(7)).numpy())
+    self.assertEqual(3, subtract(constant_op.constant(7)).numpy())
+
+    # Sanity check - result from this function shouldn't change.
+    self.assertEqual(35, different_variable_fn(constant_op.constant(7)).numpy())
+
+    self.assertAllEqual({'reuse/v:0', 'no_reuse/v:0'},
+                        set([v.name for v in g.variables]))
+
+  def testShareVariablesDifferentGraphs(self):
+
+    def add_v1(x):
+      v = variables.Variable(3, name='v')
+      return v + x
+
+    def subtract_v1(x):
+      v = variables.Variable(4, name='v')
+      return v - x
+
+    def different_variable_fn_v1(x):
+      with ops.name_scope('different_scope'):
+        v = variables.Variable(5, name='v')
+      return v * x
+
+    def increment_variable_v1(x):
+      v = variables.Variable(6, name='v')
+      return v.assign_add(x)
+
+    signature = [tensor_spec.TensorSpec([], dtypes.int32)]
+    vh = wrap_function.VariableHolder(share_variables=True)
+    new_graph = lambda: wrap_function.WrappedGraph(variable_holder=vh)
+
+    add = new_graph().wrap_function(add_v1, signature)
+    subtract = new_graph().wrap_function(subtract_v1, signature)
+    different_variable_fn = new_graph().wrap_function(
+        different_variable_fn_v1, signature)
+    increment_variable = new_graph().wrap_function(
+        increment_variable_v1, signature)
+
+    self.assertEqual(10, add(constant_op.constant(7)).numpy())
+    self.assertEqual(35, different_variable_fn(constant_op.constant(7)).numpy())
+
+    # Because the variable in add_v1 was created first, its starting value is 3
+    # instead of the values defined in subtract_v1 or increment_variable_v1.
+    self.assertEqual(-4, subtract(constant_op.constant(7)).numpy())
+    self.assertEqual(10, increment_variable(constant_op.constant(7)).numpy())
+
+    # Check that variable updates
+    self.assertEqual(17, add(constant_op.constant(7)).numpy())
+    self.assertEqual(3, subtract(constant_op.constant(7)).numpy())
+
+    # Sanity check - result from this function shouldn't change.
+    self.assertEqual(35, different_variable_fn(constant_op.constant(7)).numpy())
+
+    self.assertAllEqual({'v:0', 'different_scope/v:0'},
+                        set([v.name for v in vh.variables]))
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 2b986348b7879554daf741cf7bda8f031a4572c2..d696f7cb4627deb4880dbdaf16822d3bf340cd27 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -13,6 +13,7 @@ py_library(
     deps = [
         ":feature_column",
         ":feature_column_v2",
+        ":sequence_feature_column",
         "//tensorflow/python:util",
     ],
 )
@@ -22,6 +23,7 @@ py_library(
     srcs = ["feature_column.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -29,6 +31,7 @@ py_library(
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers_base",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
@@ -55,6 +58,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_column",
+        ":utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -163,3 +167,73 @@ tf_py_test(
         "no_windows",
     ],
 )
+
+py_library(
+    name = "sequence_feature_column",
+    srcs = ["sequence_feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column_v2",
+        ":utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "utils",
+    srcs = ["utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_feature_column_test",
+    srcs = ["sequence_feature_column_test.py"],
+    additional_deps = [
+        ":feature_column_v2",
+        ":feature_column_v2_test",
+        ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+    ],
+    tags = ["no_pip"],
+)
+
+py_test(
+    name = "sequence_feature_column_integration_test",
+    srcs = ["sequence_feature_column_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":feature_column_v2",
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:layers",
+    ],
+)
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 42a07cd9275927f69d4795ffd51404998560672e..a9fdc13e7a2e77a6107ba2095bc468ef5cba1ad8 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -138,8 +138,8 @@ import math
 import numpy as np
 import six
 
-
 from tensorflow.python.eager import context
+from tensorflow.python.feature_column import utils as fc_utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -982,13 +982,14 @@ def _numeric_column(key,
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
-  default_value = _check_default_value(shape, default_value, dtype, key)
+  default_value = fc_utils.check_default_value(
+      shape, default_value, dtype, key)
 
   if normalizer_fn is not None and not callable(normalizer_fn):
     raise TypeError(
         'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
-  _assert_key_is_string(key)
+  fc_utils.assert_key_is_string(key)
   return _NumericColumn(
       key,
       shape=shape,
@@ -1080,19 +1081,6 @@ def _bucketized_column(source_column, boundaries):
   return _BucketizedColumn(source_column, tuple(boundaries))
 
 
-def _assert_string_or_int(dtype, prefix):
-  if (dtype != dtypes.string) and (not dtype.is_integer):
-    raise ValueError(
-        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
-
-
-def _assert_key_is_string(key):
-  if not isinstance(key, six.string_types):
-    raise ValueError(
-        'key must be a string. Got: type {}. Given key: {}.'.format(
-            type(key), key))
-
-
 def _categorical_column_with_hash_bucket(key,
                                          hash_bucket_size,
                                          dtype=dtypes.string):
@@ -1145,8 +1133,8 @@ def _categorical_column_with_hash_bucket(key,
                      'hash_bucket_size: {}, key: {}'.format(
                          hash_bucket_size, key))
 
-  _assert_key_is_string(key)
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
 
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
@@ -1259,8 +1247,8 @@ def _categorical_column_with_vocabulary_file(key,
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
           num_oov_buckets, key))
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
-  _assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
   return _VocabularyFileCategoricalColumn(
       key=key,
       vocabulary_file=vocabulary_file,
@@ -1367,7 +1355,7 @@ def _categorical_column_with_vocabulary_list(key,
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
           num_oov_buckets, key))
-  _assert_string_or_int(
+  fc_utils.assert_string_or_int(
       vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   if dtype is None:
     dtype = vocabulary_dtype
@@ -1375,8 +1363,8 @@ def _categorical_column_with_vocabulary_list(key,
     raise ValueError(
         'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
             dtype, vocabulary_dtype, key))
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
-  _assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
 
   return _VocabularyListCategoricalColumn(
       key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
@@ -1445,7 +1433,7 @@ def _categorical_column_with_identity(key, num_buckets, default_value=None):
     raise ValueError(
         'default_value {} not in range [0, {}), column_name {}'.format(
             default_value, num_buckets, key))
-  _assert_key_is_string(key)
+  fc_utils.assert_key_is_string(key)
   return _IdentityCategoricalColumn(
       key=key, num_buckets=num_buckets, default_value=default_value)
 
@@ -2304,7 +2292,7 @@ class _NumericColumn(_DenseColumn,
           'SparseTensor is not supported. key: {}'.format(self.key))
     if self.normalizer_fn is not None:
       input_tensor = self.normalizer_fn(input_tensor)
-    return math_ops.to_float(input_tensor)
+    return math_ops.cast(input_tensor, dtypes.float32)
 
   @property
   def _variable_shape(self):
@@ -2495,7 +2483,7 @@ class _EmbeddingColumn(
         trainable=trainable)
 
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = _sequence_length_from_sparse_tensor(
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -2637,25 +2625,12 @@ class _SharedEmbeddingColumn(
         weight_collections=weight_collections,
         trainable=trainable)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = _sequence_length_from_sparse_tensor(
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
 
-def _create_tuple(shape, value):
-  """Returns a tuple with given shape and filled with value."""
-  if shape:
-    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
-  return value
-
-
-def _as_tuple(value):
-  if not nest.is_sequence(value):
-    return value
-  return tuple([_as_tuple(v) for v in value])
-
-
 def _check_shape(shape, key):
   """Returns shape if it's valid, raises error otherwise."""
   assert shape is not None
@@ -2672,82 +2647,6 @@ def _check_shape(shape, key):
   return shape
 
 
-def _is_shape_and_default_value_compatible(default_value, shape):
-  """Verifies compatibility of shape and default_value."""
-  # Invalid condition:
-  #  * if default_value is not a scalar and shape is empty
-  #  * or if default_value is an iterable and shape is not empty
-  if nest.is_sequence(default_value) != bool(shape):
-    return False
-  if not shape:
-    return True
-  if len(default_value) != shape[0]:
-    return False
-  for i in range(shape[0]):
-    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
-      return False
-  return True
-
-
-def _check_default_value(shape, default_value, dtype, key):
-  """Returns default value as tuple if it's valid, otherwise raises errors.
-
-  This function verifies that `default_value` is compatible with both `shape`
-  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
-  it casts default_value to a tuple and returns it. `key` is used only
-  for error message.
-
-  Args:
-    shape: An iterable of integers specifies the shape of the `Tensor`.
-    default_value: If a single value is provided, the same value will be applied
-      as the default value for every item. If an iterable of values is
-      provided, the shape of the `default_value` should be equal to the given
-      `shape`.
-    dtype: defines the type of values. Default value is `tf.float32`. Must be a
-      non-quantized, real integer or floating point type.
-    key: Column name, used only for error messages.
-
-  Returns:
-    A tuple which will be used as default value.
-
-  Raises:
-    TypeError: if `default_value` is an iterable but not compatible with `shape`
-    TypeError: if `default_value` is not compatible with `dtype`.
-    ValueError: if `dtype` is not convertible to `tf.float32`.
-  """
-  if default_value is None:
-    return None
-
-  if isinstance(default_value, int):
-    return _create_tuple(shape, default_value)
-
-  if isinstance(default_value, float) and dtype.is_floating:
-    return _create_tuple(shape, default_value)
-
-  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
-    default_value = default_value.tolist()
-
-  if nest.is_sequence(default_value):
-    if not _is_shape_and_default_value_compatible(default_value, shape):
-      raise ValueError(
-          'The shape of default_value must be equal to given shape. '
-          'default_value: {}, shape: {}, key: {}'.format(
-              default_value, shape, key))
-    # Check if the values in the list are all integers or are convertible to
-    # floats.
-    is_list_all_int = all(
-        isinstance(v, int) for v in nest.flatten(default_value))
-    is_list_has_float = any(
-        isinstance(v, float) for v in nest.flatten(default_value))
-    if is_list_all_int:
-      return _as_tuple(default_value)
-    if is_list_has_float and dtype.is_floating:
-      return _as_tuple(default_value)
-  raise TypeError('default_value must be compatible with dtype. '
-                  'default_value: {}, dtype: {}, key: {}'.format(
-                      default_value, dtype, key))
-
-
 class _HashedCategoricalColumn(
     _CategoricalColumn,
     collections.namedtuple('_HashedCategoricalColumn',
@@ -2767,7 +2666,7 @@ class _HashedCategoricalColumn(
     if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
       raise ValueError('SparseColumn input must be a SparseTensor.')
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -2822,7 +2721,7 @@ class _VocabularyFileCategoricalColumn(
           'key: {}, column dtype: {}, tensor dtype: {}'.format(
               self.key, self.dtype, input_tensor.dtype))
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -2874,7 +2773,7 @@ class _VocabularyListCategoricalColumn(
           'key: {}, column dtype: {}, tensor dtype: {}'.format(
               self.key, self.dtype, input_tensor.dtype))
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -3003,7 +2902,7 @@ class _WeightedCategoricalColumn(
       weight_tensor = _to_sparse_input_and_drop_ignore_values(
           weight_tensor, ignore_value=0.0)
     if not weight_tensor.dtype.is_floating:
-      weight_tensor = math_ops.to_float(weight_tensor)
+      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
     return (inputs.get(self.categorical_column), weight_tensor)
 
   def _get_sparse_tensors(
@@ -3210,7 +3109,7 @@ class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = _sequence_length_from_sparse_tensor(
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3242,31 +3141,6 @@ def _verify_static_batch_size_equality(tensors, columns):
                 expected_batch_size, tensors[i].shape.dims[0]))
 
 
-def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
-  """Returns a [batch_size] Tensor with per-example sequence length."""
-  with ops.name_scope(None, 'sequence_length') as name_scope:
-    row_ids = sp_tensor.indices[:, 0]
-    column_ids = sp_tensor.indices[:, 1]
-    # Add one to convert column indices to element length
-    column_ids += array_ops.ones_like(column_ids)
-    # Get the number of elements we will have per example/row
-    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
-
-    # The raw values are grouped according to num_elements;
-    # how many entities will we have after grouping?
-    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
-    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
-    # these will get grouped, and the final seq_length is [1, 1]
-    seq_length = math_ops.cast(
-        math_ops.ceil(seq_length / num_elements), dtypes.int64)
-
-    # If the last n rows do not have ids, seq_length will have shape
-    # [batch_size - n]. Pad the remaining values with zeros.
-    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
-    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
-    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
-
-
 class _SequenceCategoricalColumn(
     _CategoricalColumn,
     collections.namedtuple(
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 68a2712425c56ae4b3e42c6bd7ae497c0358a074..15950403566b00025d93e643e6be880dac9bbb3d 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -21,4 +21,5 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
 from tensorflow.python.feature_column.feature_column_v2 import *
+from tensorflow.python.feature_column.sequence_feature_column import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 5976f647af0ceebe577ca5a85f902a8714da0935..d8bcea29487213ba05142ebd1a799d559fb6cdc0 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -137,15 +137,16 @@ import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.feature_column import feature_column as fc_old
+from tensorflow.python.feature_column import utils as fc_utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.engine.base_layer import Layer
 # TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
 # of the main repo.
 from tensorflow.python.keras import utils
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -162,14 +163,14 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-_FEATURE_COLUMN_DEPRECATION_DATE = '2018-11-30'
+_FEATURE_COLUMN_DEPRECATION_DATE = None
 _FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
                                'deprecated. Please use the new FeatureColumn '
                                'APIs instead.')
@@ -304,8 +305,84 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
-@keras_export('keras.layers.DenseFeatures', v1=[])
-class DenseFeatures(Layer):
+class _BaseFeaturesLayer(Layer):
+  """Base class for DenseFeatures and SequenceFeatures.
+
+  Defines common methods and helpers.
+
+  Args:
+    feature_columns: An iterable containing the FeatureColumns to use as
+      inputs to your model.
+    expected_column_type: Expected class for provided feature columns.
+    trainable:  Boolean, whether the layer's variables will be updated via
+      gradient descent during training.
+    name: Name to give to the DenseFeatures.
+    **kwargs: Keyword arguments to construct a layer.
+
+  Raises:
+    ValueError: if an item in `feature_columns` doesn't match
+      `expected_column_type`.
+  """
+  def __init__(self, feature_columns, expected_column_type, trainable, name,
+               **kwargs):
+    super(_BaseFeaturesLayer, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    for column in self._feature_columns:
+      if not isinstance(column, expected_column_type):
+        raise ValueError(
+            'Items of feature_columns must be a {}. '
+            'You can wrap a categorical column with an '
+            'embedding_column or indicator_column. Given: {}'.format(
+                expected_column_type, column))
+
+  def build(self, _):
+    for column in self._feature_columns:
+      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          column.create_state(self._state_manager)
+    super(_BaseFeaturesLayer, self).build(None)
+
+  def _output_shape(self, input_shape, num_elements):
+    """Computes expected output shape of the layer or a column's dense tensor.
+
+    Args:
+      input_shape: Tensor or array with batch shape.
+      num_elements: Size of the last dimension of the output.
+
+    Returns:
+      Tuple with output shape.
+    """
+    raise NotImplementedError('Calling an abstract method.')
+
+  def compute_output_shape(self, input_shape):
+    total_elements = 0
+    for column in self._feature_columns:
+      total_elements += column.variable_shape.num_elements()
+    return self._target_shape(input_shape, total_elements)
+
+  def _process_dense_tensor(self, column, tensor):
+    """Reshapes the dense tensor output of a column based on expected shape.
+
+    Args:
+      column: A DenseColumn or SequenceDenseColumn object.
+      tensor: A dense tensor obtained from the same column.
+
+    Returns:
+      Reshaped dense tensor."""
+    num_elements = column.variable_shape.num_elements()
+    target_shape = self._target_shape(array_ops.shape(tensor), num_elements)
+    return array_ops.reshape(tensor, shape=target_shape)
+
+  def _verify_and_concat_tensors(self, output_tensors):
+    """Verifies and concatenates the dense output of several columns."""
+    _verify_static_batch_size_equality(output_tensors, self._feature_columns)
+    return array_ops.concat(output_tensors, -1)
+
+
+@keras_export('keras.layers.DenseFeatures')
+class DenseFeatures(_BaseFeaturesLayer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -345,8 +422,8 @@ class DenseFeatures(Layer):
         `bucketized_column`, `indicator_column`. If you have categorical
         features, you can wrap them with an `embedding_column` or
         `indicator_column`.
-      trainable: If `True` also add the variable to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      trainable:  Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
       name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
@@ -354,28 +431,18 @@ class DenseFeatures(Layer):
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
     super(DenseFeatures, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-
-    self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
-    self._state_manager = _StateManagerImpl(self, self.trainable)
-    for column in self._feature_columns:
-      if not isinstance(column, DenseColumn):
-        raise ValueError(
-            'Items of feature_columns must be a DenseColumn. '
-            'You can wrap a categorical column with an '
-            'embedding_column or indicator_column. Given: {}'.format(column))
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=DenseColumn,
+        **kwargs)
 
   @property
   def _is_feature_layer(self):
     return True
 
-  def build(self, _):
-    for column in self._feature_columns:
-      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-          column.create_state(self._state_manager)
-      super(DenseFeatures, self).build(None)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], total_elements)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -401,27 +468,15 @@ class DenseFeatures(Layer):
                        features)
     transformation_cache = FeatureTransformationCache(features)
     output_tensors = []
-    ordered_columns = []
     for column in self._feature_columns:
       with ops.name_scope(column.name):
-        ordered_columns.append(column)
         tensor = column.get_dense_tensor(transformation_cache,
                                          self._state_manager)
-        num_elements = column.variable_shape.num_elements()
-        batch_size = array_ops.shape(tensor)[0]
-        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-        output_tensors.append(tensor)
+        processed_tensors = self._process_dense_tensor(column, tensor)
         if cols_to_output_tensors is not None:
-          cols_to_output_tensors[column] = tensor
-
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
-
-  def compute_output_shape(self, input_shape):
-    total_elements = 0
-    for column in self._feature_columns:
-      total_elements += column.variable_shape.num_elements()
-    return (input_shape[0], total_elements)
+          cols_to_output_tensors[column] = processed_tensors
+        output_tensors.append(processed_tensors)
+    return self._verify_and_concat_tensors(output_tensors)
 
 
 class _LinearModelLayer(Layer):
@@ -438,7 +493,6 @@ class _LinearModelLayer(Layer):
         name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
     for column in self._feature_columns:
       if not isinstance(column, (DenseColumn, CategoricalColumn)):
         raise ValueError(
@@ -694,7 +748,7 @@ def _transform_features_v2(features, feature_columns, state_manager):
   with ops.name_scope(
       None, default_name='transform_features', values=features.values()):
     transformation_cache = FeatureTransformationCache(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
+    for column in feature_columns:
       with ops.name_scope(None, default_name=column.name):
         outputs[column] = transformation_cache.get(column, state_manager)
   return outputs
@@ -1038,7 +1092,7 @@ def shared_embedding_columns(categorical_columns,
   return result
 
 
-@tf_export('feature_column.shared_embedding_columns', v1=[])
+@tf_export('feature_column.shared_embeddings', v1=[])
 def shared_embedding_columns_v2(categorical_columns,
                                 dimension,
                                 combiner='mean',
@@ -1265,13 +1319,14 @@ def numeric_column(key,
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
-  default_value = _check_default_value(shape, default_value, dtype, key)
+  default_value = fc_utils.check_default_value(
+      shape, default_value, dtype, key)
 
   if normalizer_fn is not None and not callable(normalizer_fn):
     raise TypeError(
         'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
-  _assert_key_is_string(key)
+  fc_utils.assert_key_is_string(key)
   return NumericColumn(
       key,
       shape=shape,
@@ -1365,19 +1420,6 @@ def bucketized_column(source_column, boundaries):
   return BucketizedColumn(source_column, tuple(boundaries))
 
 
-def _assert_string_or_int(dtype, prefix):
-  if (dtype != dtypes.string) and (not dtype.is_integer):
-    raise ValueError(
-        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
-
-
-def _assert_key_is_string(key):
-  if not isinstance(key, six.string_types):
-    raise ValueError(
-        'key must be a string. Got: type {}. Given key: {}.'.format(
-            type(key), key))
-
-
 @tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
@@ -1431,8 +1473,8 @@ def categorical_column_with_hash_bucket(key,
                      'hash_bucket_size: {}, key: {}'.format(
                          hash_bucket_size, key))
 
-  _assert_key_is_string(key)
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
 
   return HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
@@ -1637,8 +1679,8 @@ def categorical_column_with_vocabulary_file_v2(key,
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
           num_oov_buckets, key))
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
-  _assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
   return VocabularyFileCategoricalColumn(
       key=key,
       vocabulary_file=vocabulary_file,
@@ -1746,7 +1788,7 @@ def categorical_column_with_vocabulary_list(key,
     if num_oov_buckets < 0:
       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
           num_oov_buckets, key))
-  _assert_string_or_int(
+  fc_utils.assert_string_or_int(
       vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   if dtype is None:
     dtype = vocabulary_dtype
@@ -1754,8 +1796,8 @@ def categorical_column_with_vocabulary_list(key,
     raise ValueError(
         'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
             dtype, vocabulary_dtype, key))
-  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
-  _assert_key_is_string(key)
+  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  fc_utils.assert_key_is_string(key)
 
   return VocabularyListCategoricalColumn(
       key=key,
@@ -1828,7 +1870,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
     raise ValueError(
         'default_value {} not in range [0, {}), column_name {}'.format(
             default_value, num_buckets, key))
-  _assert_key_is_string(key)
+  fc_utils.assert_key_is_string(key)
   return IdentityCategoricalColumn(
       key=key, number_buckets=num_buckets, default_value=default_value)
 
@@ -2660,7 +2702,7 @@ def _normalize_feature_columns(feature_columns):
                                                name_to_column[column.name]))
     name_to_column[column.name] = column
 
-  return feature_columns
+  return sorted(feature_columns, key=lambda x: x.name)
 
 
 class NumericColumn(
@@ -2702,7 +2744,7 @@ class NumericColumn(
           'SparseTensor is not supported. key: {}'.format(self.key))
     if self.normalizer_fn is not None:
       input_tensor = self.normalizer_fn(input_tensor)
-    return math_ops.to_float(input_tensor)
+    return math_ops.cast(input_tensor, dtypes.float32)
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                           _FEATURE_COLUMN_DEPRECATION)
@@ -2846,7 +2888,7 @@ class BucketizedColumn(
 
   def _get_dense_tensor_for_input_tensor(self, input_tensor):
     return array_ops.one_hot(
-        indices=math_ops.to_int64(input_tensor),
+        indices=math_ops.cast(input_tensor, dtypes.int64),
         depth=len(self.boundaries) + 1,
         on_value=1.,
         off_value=0.)
@@ -2893,9 +2935,10 @@ class BucketizedColumn(
         array_ops.reshape(input_tensor, (-1,)) +
         (len(self.boundaries) + 1) * i2)
 
-    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
-    dense_shape = math_ops.to_int64(array_ops.stack(
-        [batch_size, source_dimension]))
+    indices = math_ops.cast(
+        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
+    dense_shape = math_ops.cast(
+        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         indices=indices,
         values=bucket_indices,
@@ -3068,10 +3111,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Get sparse IDs and weights.
@@ -3088,10 +3131,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -3105,7 +3148,7 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3113,7 +3156,7 @@ class EmbeddingColumn(
         transformation_cache, state_manager)
     dense_tensor = self._get_dense_tensor_internal(sparse_tensors,
                                                    state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3129,8 +3172,8 @@ class EmbeddingColumn(
         (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
       raise ValueError(
           'In embedding_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'categorical_column must be of type SequenceCategoricalColumn '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3139,7 +3182,7 @@ class EmbeddingColumn(
         sparse_tensors,
         weight_collections=weight_collections,
         trainable=trainable)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3175,7 +3218,7 @@ def _raise_shared_embedding_column_error():
                    '`DenseFeatures` or `LinearModel` instead.')
 
 
-class SharedEmbeddingColumnCreator(tracking.AutoCheckpointable):
+class SharedEmbeddingColumnCreator(tracking.AutoTrackable):
 
   def __init__(self,
                dimension,
@@ -3298,10 +3341,10 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'SequenceFeatureLayer instead of FeatureLayer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     return self._get_dense_tensor_internal(transformation_cache, state_manager)
@@ -3315,7 +3358,7 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use SequenceFeatureLayer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3323,7 +3366,7 @@ class SharedEmbeddingColumn(
                                                    state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3349,19 +3392,6 @@ class SharedEmbeddingColumn(
     raise NotImplementedError()
 
 
-def _create_tuple(shape, value):
-  """Returns a tuple with given shape and filled with value."""
-  if shape:
-    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
-  return value
-
-
-def _as_tuple(value):
-  if not nest.is_sequence(value):
-    return value
-  return tuple([_as_tuple(v) for v in value])
-
-
 def _check_shape(shape, key):
   """Returns shape if it's valid, raises error otherwise."""
   assert shape is not None
@@ -3378,82 +3408,6 @@ def _check_shape(shape, key):
   return shape
 
 
-def _is_shape_and_default_value_compatible(default_value, shape):
-  """Verifies compatibility of shape and default_value."""
-  # Invalid condition:
-  #  * if default_value is not a scalar and shape is empty
-  #  * or if default_value is an iterable and shape is not empty
-  if nest.is_sequence(default_value) != bool(shape):
-    return False
-  if not shape:
-    return True
-  if len(default_value) != shape[0]:
-    return False
-  for i in range(shape[0]):
-    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
-      return False
-  return True
-
-
-def _check_default_value(shape, default_value, dtype, key):
-  """Returns default value as tuple if it's valid, otherwise raises errors.
-
-  This function verifies that `default_value` is compatible with both `shape`
-  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
-  it casts default_value to a tuple and returns it. `key` is used only
-  for error message.
-
-  Args:
-    shape: An iterable of integers specifies the shape of the `Tensor`.
-    default_value: If a single value is provided, the same value will be applied
-      as the default value for every item. If an iterable of values is
-      provided, the shape of the `default_value` should be equal to the given
-      `shape`.
-    dtype: defines the type of values. Default value is `tf.float32`. Must be a
-      non-quantized, real integer or floating point type.
-    key: Column name, used only for error messages.
-
-  Returns:
-    A tuple which will be used as default value.
-
-  Raises:
-    TypeError: if `default_value` is an iterable but not compatible with `shape`
-    TypeError: if `default_value` is not compatible with `dtype`.
-    ValueError: if `dtype` is not convertible to `tf.float32`.
-  """
-  if default_value is None:
-    return None
-
-  if isinstance(default_value, int):
-    return _create_tuple(shape, default_value)
-
-  if isinstance(default_value, float) and dtype.is_floating:
-    return _create_tuple(shape, default_value)
-
-  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
-    default_value = default_value.tolist()
-
-  if nest.is_sequence(default_value):
-    if not _is_shape_and_default_value_compatible(default_value, shape):
-      raise ValueError(
-          'The shape of default_value must be equal to given shape. '
-          'default_value: {}, shape: {}, key: {}'.format(
-              default_value, shape, key))
-    # Check if the values in the list are all integers or are convertible to
-    # floats.
-    is_list_all_int = all(
-        isinstance(v, int) for v in nest.flatten(default_value))
-    is_list_has_float = any(
-        isinstance(v, float) for v in nest.flatten(default_value))
-    if is_list_all_int:
-      return _as_tuple(default_value)
-    if is_list_has_float and dtype.is_floating:
-      return _as_tuple(default_value)
-  raise TypeError('default_value must be compatible with dtype. '
-                  'default_value: {}, dtype: {}, key: {}'.format(
-                      default_value, dtype, key))
-
-
 class HashedCategoricalColumn(
     CategoricalColumn,
     fc_old._CategoricalColumn,  # pylint: disable=protected-access
@@ -3486,7 +3440,7 @@ class HashedCategoricalColumn(
     if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
       raise ValueError('SparseColumn input must be a SparseTensor.')
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -3598,7 +3552,7 @@ class VocabularyFileCategoricalColumn(
           'key: {}, column dtype: {}, tensor dtype: {}'.format(
               self.key, self.dtype, input_tensor.dtype))
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -3606,7 +3560,7 @@ class VocabularyFileCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_file` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     # TODO(rohanj): Use state manager to manage the index table creation.
     return lookup_ops.index_table_from_file(
@@ -3710,7 +3664,7 @@ class VocabularyListCategoricalColumn(
           'key: {}, column dtype: {}, tensor dtype: {}'.format(
               self.key, self.dtype, input_tensor.dtype))
 
-    _assert_string_or_int(
+    fc_utils.assert_string_or_int(
         input_tensor.dtype,
         prefix='column_name: {} input_tensor'.format(self.key))
 
@@ -3718,7 +3672,7 @@ class VocabularyListCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_tensor` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     # TODO(rohanj): Use state manager to manage the index table creation.
     return lookup_ops.index_table_from_tensor(
@@ -3819,9 +3773,10 @@ class IdentityCategoricalColumn(
           'Invalid input, not integer. key: {} dtype: {}'.format(
               self.key, input_tensor.dtype))
 
-    values = math_ops.to_int64(input_tensor.values, name='values')
-    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
-    zero = math_ops.to_int64(0, name='zero')
+    values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
+    num_buckets = math_ops.cast(
+        self.num_buckets, dtypes.int64, name='num_buckets')
+    zero = math_ops.cast(0, dtypes.int64, name='zero')
     if self.default_value is None:
       # Fail if values are out-of-range.
       assert_less = check_ops.assert_less(
@@ -3839,9 +3794,8 @@ class IdentityCategoricalColumn(
               values < zero, values >= num_buckets, name='out_of_range'),
           array_ops.fill(
               dims=array_ops.shape(values),
-              value=math_ops.to_int64(self.default_value),
-              name='default_values'),
-          values)
+              value=math_ops.cast(self.default_value, dtypes.int64),
+              name='default_values'), values)
 
     return sparse_tensor_lib.SparseTensor(
         indices=input_tensor.indices,
@@ -3964,18 +3918,14 @@ class WeightedCategoricalColumn(
       weight_tensor = _to_sparse_input_and_drop_ignore_values(
           weight_tensor, ignore_value=0.0)
     if not weight_tensor.dtype.is_floating:
-      weight_tensor = math_ops.to_float(weight_tensor)
+      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
     return weight_tensor
 
   def transform_feature(self, transformation_cache, state_manager):
     """Applies weights to tensor generated from `categorical_column`'."""
-    print('WeightedCategoricalColumn.transform_feature: ', self.name)
-    print('Weight feature key: ', self.weight_feature_key)
     weight_tensor = transformation_cache.get(self.weight_feature_key,
                                              state_manager)
-    print('Weight tensor before: ', weight_tensor)
     weight_tensor = self._transform_weight_tensor(weight_tensor)
-    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
@@ -3989,9 +3939,7 @@ class WeightedCategoricalColumn(
 
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
-    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
-    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
@@ -4333,10 +4281,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4354,10 +4302,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4370,7 +4318,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4379,7 +4327,7 @@ class IndicatorColumn(
     dense_tensor = transformation_cache.get(self, state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4400,7 +4348,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4408,7 +4356,7 @@ class IndicatorColumn(
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 137d5e0a8c85867261921ab9f073c3cc8551ecb3..b3de056afd68dc46f0d5d4d1c1944128d0712296 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -1838,6 +1838,22 @@ class LinearModelTest(test.TestCase):
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
+  def test_sparse_combiner_sqrtn(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      model = fc.LinearModel([wire_cast], sparse_combiner='sqrtn')
+      predictions = model(features)
+      wire_cast_var, bias = model.variables
+      with _initialized_session() as sess:
+        self.evaluate(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.evaluate(bias.assign([5.]))
+        self.assertAllClose([[1005.], [7083.139]], self.evaluate(predictions))
+
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
@@ -3262,7 +3278,7 @@ class DenseFeaturesTest(test.TestCase):
       fc.DenseFeatures(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
+    with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'):
       fc.DenseFeatures(feature_columns=[
           fc.categorical_column_with_hash_bucket('wire_cast', 4)
       ])(
@@ -3423,7 +3439,7 @@ class DenseFeaturesTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
+      with self.assertRaisesRegexp(Exception, 'must be a .*DenseColumn'):
         fc.DenseFeatures([animal])(features)
 
   def test_static_batch_size_mismatch(self):
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/python/feature_column/sequence_feature_column.py
similarity index 71%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
rename to tensorflow/python/feature_column/sequence_feature_column.py
index 83b93ec332044f754f9dcde8d7c5c19b26e53a4a..7e31497db877dd942c9e2fa3fe97a3f0085f52e2 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/python/feature_column/sequence_feature_column.py
@@ -25,8 +25,8 @@ from __future__ import print_function
 import collections
 
 
-from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import utils as fc_utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -34,107 +34,118 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import variable_scope
+from tensorflow.python.util.tf_export import keras_export
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 
 
-def sequence_input_layer(
-    features,
-    feature_columns,
-    weight_collections=None,
-    trainable=True):
-  """"Builds input layer for sequence input.
+@keras_export('keras.experimental.SequenceFeatures')
+class SequenceFeatures(fc._BaseFeaturesLayer):
+  """A layer for sequence input.
 
-  All `feature_columns` must be sequence dense columns with the same
-  `sequence_length`. The output of this method can be fed into sequence
-  networks, such as RNN.
+    All `feature_columns` must be sequence dense columns with the same
+    `sequence_length`. The output of this method can be fed into sequence
+    networks, such as RNN.
 
-  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
-  `T` is the maximum sequence length for this batch, which could differ from
-  batch to batch.
+    The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+    `T` is the maximum sequence length for this batch, which could differ from
+    batch to batch.
 
-  If multiple `feature_columns` are given with `Di` `num_elements` each, their
-  outputs are concatenated. So, the final `Tensor` has shape
-  `[batch_size, T, D0 + D1 + ... + Dn]`.
+    If multiple `feature_columns` are given with `Di` `num_elements` each, their
+    outputs are concatenated. So, the final `Tensor` has shape
+    `[batch_size, T, D0 + D1 + ... + Dn]`.
 
-  Example:
+    Example:
 
-  ```python
-  rating = sequence_numeric_column('rating')
-  watches = sequence_categorical_column_with_identity(
-      'watches', num_buckets=1000)
-  watches_embedding = embedding_column(watches, dimension=10)
-  columns = [rating, watches]
+    ```python
+    rating = sequence_numeric_column('rating')
+    watches = sequence_categorical_column_with_identity(
+        'watches', num_buckets=1000)
+    watches_embedding = embedding_column(watches, dimension=10)
+    columns = [rating, watches_embedding]
 
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+    sequence_input_layer = SequenceFeatures(columns)
+    features = tf.parse_example(..., features=make_parse_example_spec(columns))
+    sequence_input, sequence_length = sequence_input_layer(features)
+    sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
-  ```
+    rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+    rnn_layer = tf.keras.layers.RNN(rnn_cell)
+    outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
+    ```
+  """
 
-  Args:
-    features: A dict mapping keys to tensors.
-    feature_columns: An iterable of dense sequence columns. Valid columns are
-      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
-      - `sequence_numeric_column`.
-    weight_collections: A list of collection names to which the Variable will be
-      added. Note that variables will also be added to collections
-      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-    trainable: If `True` also add the variable to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES`.
+  def __init__(
+      self,
+      feature_columns,
+      trainable=True,
+      name=None,
+      **kwargs):
+    """"Constructs a SequenceFeatures layer.
 
-  Returns:
-    An `(input_layer, sequence_length)` tuple where:
-    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
-        `T` is the maximum sequence length for this batch, which could differ
-        from batch to batch. `D` is the sum of `num_elements` for all
-        `feature_columns`.
-    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
-        length for each example.
+    Args:
+      feature_columns: An iterable of dense sequence columns. Valid columns are
+        - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+        - `sequence_numeric_column`.
+      trainable: Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
+      name: Name to give to the SequenceFeatures.
+      **kwargs: Keyword arguments to construct a layer.
+
+    Raises:
+      ValueError: If any of the `feature_columns` is not a
+        `SequenceDenseColumn`.
+    """
+    super(SequenceFeatures, self).__init__(
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=fc.SequenceDenseColumn,
+        **kwargs)
 
-  Raises:
-    ValueError: If any of the `feature_columns` is the wrong type.
-  """
-  feature_columns = fc_old._normalize_feature_columns(feature_columns)
-  for c in feature_columns:
-    if not isinstance(c, fc_old._SequenceDenseColumn):
-      raise ValueError(
-          'All feature_columns must be of type _SequenceDenseColumn. '
-          'You can wrap a sequence_categorical_column with an embedding_column '
-          'or indicator_column. '
-          'Given (type {}): {}'.format(type(c), c))
-
-  with variable_scope.variable_scope(
-      None, default_name='sequence_input_layer', values=features.values()):
-    builder = fc_old._LazyBuilder(features)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], input_shape[1], total_elements)
+
+  def call(self, features):
+    """Returns sequence input corresponding to the `feature_columns`.
+
+    Args:
+      features: A dict mapping keys to tensors.
+
+    Returns:
+      An `(input_layer, sequence_length)` tuple where:
+      - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+          `T` is the maximum sequence length for this batch, which could differ
+          from batch to batch. `D` is the sum of `num_elements` for all
+          `feature_columns`.
+      - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+          length for each example.
+
+    Raises:
+      ValueError: If features are not a dictionary.
+    """
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: ',
+                       features)
+    transformation_cache = fc.FeatureTransformationCache(features)
     output_tensors = []
     sequence_lengths = []
-    ordered_columns = []
-
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):
-        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
+
+    for column in self._feature_columns:
+      with ops.name_scope(column.name):
+        dense_tensor, sequence_length = column.get_sequence_dense_tensor(
+            transformation_cache, self._state_manager)
         # Flattens the final dimension to produce a 3D Tensor.
-        num_elements = column._variable_shape.num_elements()
-        shape = array_ops.shape(dense_tensor)
-        target_shape = [shape[0], shape[1], num_elements]
-        output_tensors.append(
-            array_ops.reshape(dense_tensor, shape=target_shape))
+        output_tensors.append(self._process_dense_tensor(column, dense_tensor))
         sequence_lengths.append(sequence_length)
 
-    fc_old._verify_static_batch_size_equality(output_tensors, ordered_columns)
-    fc_old._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    # Check and process sequence lengths.
+    fc._verify_static_batch_size_equality(sequence_lengths,
+                                          self._feature_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
 
-    return array_ops.concat(output_tensors, -1), sequence_length
+    return self._verify_and_concat_tensors(output_tensors), sequence_length
 
 
 def concatenate_context_input(context_input, sequence_input):
@@ -186,6 +197,7 @@ def concatenate_context_input(context_input, sequence_input):
   return array_ops.concat([sequence_input, tiled_context_input], 2)
 
 
+@tf_export('feature_column.sequence_categorical_column_with_identity')
 def sequence_categorical_column_with_identity(
     key, num_buckets, default_value=None):
   """Returns a feature column that represents sequences of integers.
@@ -203,12 +215,13 @@ def sequence_categorical_column_with_identity(
   columns = [watches_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -233,6 +246,7 @@ def sequence_categorical_column_with_identity(
           default_value=default_value))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
 def sequence_categorical_column_with_hash_bucket(
     key, hash_bucket_size, dtype=dtypes.string):
   """A sequence of categorical terms where ids are set by hashing.
@@ -250,12 +264,13 @@ def sequence_categorical_column_with_hash_bucket(
   columns = [tokens_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -277,6 +292,7 @@ def sequence_categorical_column_with_hash_bucket(
           dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
 def sequence_categorical_column_with_vocabulary_file(
     key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
     default_value=None, dtype=dtypes.string):
@@ -296,12 +312,13 @@ def sequence_categorical_column_with_vocabulary_file(
   columns = [states_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -340,6 +357,7 @@ def sequence_categorical_column_with_vocabulary_file(
           dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
 def sequence_categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A sequence of categorical terms where ids use an in-memory list.
@@ -358,12 +376,13 @@ def sequence_categorical_column_with_vocabulary_list(
   columns = [colors_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -400,6 +419,7 @@ def sequence_categorical_column_with_vocabulary_list(
           num_oov_buckets=num_oov_buckets))
 
 
+@tf_export('feature_column.sequence_numeric_column')
 def sequence_numeric_column(
     key,
     shape=(1,),
@@ -415,12 +435,13 @@ def sequence_numeric_column(
   columns = [temperature]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -445,7 +466,7 @@ def sequence_numeric_column(
     ValueError: if any dimension in shape is not a positive integer.
     ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  shape = fc_old._check_shape(shape=shape, key=key)
+  shape = fc._check_shape(shape=shape, key=key)
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
@@ -540,9 +561,11 @@ class SequenceNumericColumn(
     # For the 2D case, the raw values are grouped according to num_elements;
     # for the 3D case, the grouping happens in the third dimension, and
     # sequence length is not affected.
-    num_elements = (self.variable_shape.num_elements()
-                    if sp_tensor.shape.ndims == 2 else 1)
-    seq_length = fc_old._sequence_length_from_sparse_tensor(
+    if sp_tensor.shape.ndims == 2:
+      num_elements = self.variable_shape.num_elements()
+    else:
+      num_elements = 1
+    seq_length = fc_utils.sequence_length_from_sparse_tensor(
         sp_tensor, num_elements=num_elements)
 
     return fc.SequenceDenseColumn.TensorSequenceLengthPair(
diff --git a/tensorflow/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c67945c6bc05f1f0ff6be356e3cf7e844ee29b
--- /dev/null
+++ b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
@@ -0,0 +1,283 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for sequence feature columns with SequenceExamples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import tempfile
+
+from google.protobuf import text_format
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import sequence_feature_column as sfc
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class SequenceFeatureColumnIntegrationTest(test.TestCase):
+
+  def _make_sequence_example(self):
+    example = example_pb2.SequenceExample()
+    example.context.feature['int_ctx'].int64_list.value.extend([5])
+    example.context.feature['float_ctx'].float_list.value.extend([123.6])
+    for val in range(0, 10, 2):
+      feat = feature_pb2.Feature()
+      feat.int64_list.value.extend([val] * val)
+      example.feature_lists.feature_list['int_list'].feature.extend([feat])
+    for val in range(1, 11, 2):
+      feat = feature_pb2.Feature()
+      feat.bytes_list.value.extend([compat.as_bytes(str(val))] * val)
+      example.feature_lists.feature_list['str_list'].feature.extend([feat])
+
+    return example
+
+  def _build_feature_columns(self):
+    col = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
+    ctx_cols = [
+        fc.embedding_column(col, dimension=10),
+        fc.numeric_column('float_ctx')
+    ]
+
+    identity_col = sfc.sequence_categorical_column_with_identity(
+        'int_list', num_buckets=10)
+    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
+        'bytes_list', hash_bucket_size=100)
+    seq_cols = [
+        fc.embedding_column(identity_col, dimension=10),
+        fc.embedding_column(bucket_col, dimension=20)
+    ]
+
+    return ctx_cols, seq_cols
+
+  def test_sequence_example_into_input_layer(self):
+    examples = [_make_sequence_example().SerializeToString()] * 100
+    ctx_cols, seq_cols = self._build_feature_columns()
+
+    def _parse_example(example):
+      ctx, seq = parsing_ops.parse_single_sequence_example(
+          example,
+          context_features=fc.make_parse_example_spec_v2(ctx_cols),
+          sequence_features=fc.make_parse_example_spec_v2(seq_cols))
+      ctx.update(seq)
+      return ctx
+
+    ds = dataset_ops.Dataset.from_tensor_slices(examples)
+    ds = ds.map(_parse_example)
+    ds = ds.batch(20)
+
+    # Test on a single batch
+    features = ds.make_one_shot_iterator().get_next()
+
+    # Tile the context features across the sequence features
+    sequence_input_layer = sfc.SequenceFeatures(seq_cols)
+    seq_layer, _ = sequence_input_layer(features)
+    input_layer = fc.DenseFeatures(ctx_cols)
+    ctx_layer = input_layer(features)
+    input_layer = sfc.concatenate_context_input(ctx_layer, seq_layer)
+
+    rnn_layer = recurrent.RNN(recurrent.SimpleRNNCell(10))
+    output = rnn_layer(input_layer)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      features_r = sess.run(features)
+      self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6])
+
+      output_r = sess.run(output)
+      self.assertAllEqual(output_r.shape, [20, 10])
+
+
+class SequenceExampleParsingTest(test.TestCase):
+
+  def test_seq_ex_in_sequence_categorical_column_with_identity(self):
+    self._test_parsed_sequence_example(
+        'int_list', sfc.sequence_categorical_column_with_identity,
+        10, [3, 6], [2, 4, 6])
+
+  def test_seq_ex_in_sequence_categorical_column_with_hash_bucket(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_hash_bucket,
+        10, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_list(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_list,
+        list(string.ascii_lowercase), [3, 4],
+        [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_file(self):
+    _, fname = tempfile.mkstemp()
+    with open(fname, 'w') as f:
+      f.write(string.ascii_lowercase)
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_file,
+        fname, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def _test_parsed_sequence_example(
+      self, col_name, col_fn, col_arg, shape, values):
+    """Helper function to check that each FeatureColumn parses correctly.
+
+    Args:
+      col_name: string, name to give to the feature column. Should match
+        the name that the column will parse out of the features dict.
+      col_fn: function used to create the feature column. For example,
+        sequence_numeric_column.
+      col_arg: second arg that the target feature column is expecting.
+      shape: the expected dense_shape of the feature after parsing into
+        a SparseTensor.
+      values: the expected values at index [0, 2, 6] of the feature
+        after parsing into a SparseTensor.
+    """
+    example = _make_sequence_example()
+    columns = [
+        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc.numeric_column('float_ctx'),
+        col_fn(col_name, col_arg)
+    ]
+    context, seq_features = parsing_ops.parse_single_sequence_example(
+        example.SerializeToString(),
+        context_features=fc.make_parse_example_spec_v2(columns[:2]),
+        sequence_features=fc.make_parse_example_spec_v2(columns[2:]))
+
+    with self.cached_session() as sess:
+      ctx_result, seq_result = sess.run([context, seq_features])
+      self.assertEqual(list(seq_result[col_name].dense_shape), shape)
+      self.assertEqual(
+          list(seq_result[col_name].values[[0, 2, 6]]), values)
+      self.assertEqual(list(ctx_result['int_ctx'].dense_shape), [1])
+      self.assertEqual(ctx_result['int_ctx'].values[0], 5)
+      self.assertEqual(list(ctx_result['float_ctx'].shape), [1])
+      self.assertAlmostEqual(ctx_result['float_ctx'][0], 123.6, places=1)
+
+
+_SEQ_EX_PROTO = """
+context {
+  feature {
+    key: "float_ctx"
+    value {
+      float_list {
+        value: 123.6
+      }
+    }
+  }
+  feature {
+    key: "int_ctx"
+    value {
+      int64_list {
+        value: 5
+      }
+    }
+  }
+}
+feature_lists {
+  feature_list {
+    key: "bytes_list"
+    value {
+      feature {
+        bytes_list {
+          value: "a"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "b"
+          value: "c"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "d"
+          value: "e"
+          value: "f"
+          value: "g"
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "float_list"
+    value {
+      feature {
+        float_list {
+          value: 1.0
+        }
+      }
+      feature {
+        float_list {
+          value: 3.0
+          value: 3.0
+          value: 3.0
+        }
+      }
+      feature {
+        float_list {
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "int_list"
+    value {
+      feature {
+        int64_list {
+          value: 2
+          value: 2
+        }
+      }
+      feature {
+        int64_list {
+          value: 4
+          value: 4
+          value: 4
+          value: 4
+        }
+      }
+      feature {
+        int64_list {
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def _make_sequence_example():
+  example = example_pb2.SequenceExample()
+  return text_format.Parse(_SEQ_EX_PROTO, example)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py
similarity index 71%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
rename to tensorflow/python/feature_column/sequence_feature_column_test.py
index be012a87690c24c6d9b7808790393e1aa6d01211..0c8f37b107122882e1f72c0bbb10ebe2c2885f5e 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/sequence_feature_column_test.py
@@ -22,23 +22,24 @@ import os
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
-from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column.feature_column_v2_test import _TestStateManager
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import feature_column_v2_test as fc_test
+from tensorflow.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
 
 
-class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -82,6 +83,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -111,37 +113,36 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a,
         dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc_old._embedding_column(
+    embedding_column_b = fc.embedding_column(
         categorical_column_b,
         dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[embedding_column_b, embedding_column_a])
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [embedding_column_b, embedding_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b,})
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.evaluate(variables_lib.global_variables_initializer())
+    weights = sequence_input_layer.weights
     self.assertCountEqual(
-        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
-         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
-      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
+        ('sequence_features/aaa_embedding/embedding_weights:0',
+         'sequence_features/bbb_embedding/embedding_weights:0'),
+        tuple([v.name for v in weights]))
+    self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
+    self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
+
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence embedding column."""
     vocabulary_size = 3
@@ -152,86 +153,87 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([embedding_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
+  @test_util.run_in_graph_and_eager_modes
   def test_shared_embedding_column(self):
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 4.),  # id 1
-        (5., 6.)  # id 2
-    )
-
-    def _get_initializer(embedding_dimension, embedding_values):
-
-      def _initializer(shape, dtype, partition_info):
-        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-        self.assertEqual(dtypes.float32, dtype)
-        self.assertIsNone(partition_info)
-        return embedding_values
-
-      return _initializer
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
-    ]
-    expected_sequence_length = [1, 2]
-
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
-        [categorical_column_b, categorical_column_a],
-        dimension=embedding_dimension,
-        initializer=_get_initializer(embedding_dimension, embedding_values))
-
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=shared_embedding_columns)
-
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertCountEqual(
-        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [1]
+          # example 1, ids [2, 0]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(1, 2, 0),
+          dense_shape=(2, 2))
+
+      embedding_dimension = 2
+      embedding_values = (
+          (1., 2.),  # id 0
+          (3., 4.),  # id 1
+          (5., 6.)  # id 2
+      )
 
+      def _get_initializer(embedding_dimension, embedding_values):
+
+        def _initializer(shape, dtype, partition_info):
+          self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+          self.assertEqual(dtypes.float32, dtype)
+          self.assertIsNone(partition_info)
+          return embedding_values
+
+        return _initializer
+
+      expected_input_layer = [
+          # example 0, ids_a [2], ids_b [1]
+          [[5., 6., 3., 4.], [0., 0., 0., 0.]],
+          # example 1, ids_a [0, 1], ids_b [2, 0]
+          [[1., 2., 5., 6.], [3., 4., 1., 2.]],
+      ]
+      expected_sequence_length = [1, 2]
+
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      # Test that columns are reordered alphabetically.
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_b, categorical_column_a],
+          dimension=embedding_dimension,
+          initializer=_get_initializer(embedding_dimension, embedding_values))
+
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      input_layer, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+
+      global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      self.assertCountEqual(
+          ('aaa_bbb_shared_embedding:0',),
+          tuple([v.name for v in global_vars]))
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(embedding_values,
+                            global_vars[0].eval(session=sess))
+        self.assertAllEqual(expected_input_layer,
+                            input_layer.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length, sequence_length.eval(session=sess))
+
+  @test_util.run_deprecated_v1
   def test_shared_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence shared embedding column."""
     vocabulary_size = 3
@@ -248,23 +250,20 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old._categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_shared_embedding\. categorical_column must '
-        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={
-              'aaa': sparse_input_a,
-              'bbb': sparse_input_b
-          },
-          feature_columns=shared_embedding_columns)
+        r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      _, _ = sequence_input_layer({'aaa': sparse_input_a,
+                                   'bbb': sparse_input_b})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -308,6 +307,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -319,23 +319,21 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc_old._indicator_column(categorical_column_b)
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[indicator_column_b, indicator_column_a])
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    indicator_column_b = fc.indicator_column(categorical_column_b)
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [indicator_column_b, indicator_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
+
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence categorical column."""
     vocabulary_size = 3
@@ -346,17 +344,16 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([indicator_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -375,27 +372,26 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            # feature 0, ids [[20, 3], [5]]
            # feature 1, ids [[3], [8]]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
-           'values': (20, 3, 5., 3., 8.),
+           'values': (20., 3., 5., 3., 8.),
            'dense_shape': (2, 2, 2)},
        'expected_input_layer': [
            [[20.], [3.], [5.], [0.]],
            [[3.], [0.], [8.], [0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa')
+    numeric_column = sfc.sequence_numeric_column('aaa')
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -426,22 +422,22 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
        'expected_sequence_length': [2, 1]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column_multi_dim(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
-    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    """Tests SequenceFeatures for multi-dimensional numeric_column."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequence_length_not_equal(self):
     """Tests that an error is raised when sequence lengths are not equal."""
     # Input a with sequence_length = [2, 1]
@@ -454,23 +450,17 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         indices=((0, 0), (1, 0)),
         values=(1., 10.),
         dense_shape=(2, 2))
-    numeric_column_a = sfc_old.sequence_numeric_column('aaa')
-    numeric_column_b = sfc_old.sequence_numeric_column('bbb')
-
-    _, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=[numeric_column_a, numeric_column_b])
-
-    with monitored_session.MonitoredSession() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Condition x == y did not hold element-wise:\] '
-          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
-          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
-        sess.run(sequence_length)
+    numeric_column_a = sfc.sequence_numeric_column('aaa')
+    numeric_column_b = sfc.sequence_numeric_column('bbb')
+
+    sequence_input_layer = sfc.SequenceFeatures(
+        [numeric_column_a, numeric_column_b])
+
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, r'Condition x == y did not hold.*'):
+      _, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+      self.evaluate(sequence_length)
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -487,21 +477,21 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
            # example 1, [[10., 11., 12., 13.], []]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
-                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
                        (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
            'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
            'dense_shape': (2, 2, 4)},
        'expected_shape': [2, 2, 4]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_numeric(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
@@ -528,20 +518,58 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            'dense_shape': (4, 2, 2)},
        'expected_shape': [4, 2, 3]}
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_indicator(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    sequence_input_layer = sfc.SequenceFeatures([indicator_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
-
+  @test_util.run_in_graph_and_eager_modes
+  def test_compute_output_shape(self):
+    price1 = sfc.sequence_numeric_column('price1', shape=2)
+    price2 = sfc.sequence_numeric_column('price2')
+    features = {
+        'price1': sparse_tensor.SparseTensor(
+            indices=[[0, 0, 0], [0, 0, 1],
+                     [0, 1, 0], [0, 1, 1],
+                     [1, 0, 0], [1, 0, 1],
+                     [2, 0, 0], [2, 0, 1],
+                     [3, 0, 0], [3, 0, 1]],
+            values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
+            dense_shape=(4, 3, 2)),
+        'price2': sparse_tensor.SparseTensor(
+            indices=[[0, 0],
+                     [0, 1],
+                     [1, 0],
+                     [2, 0],
+                     [3, 0]],
+            values=[10., 11., 20., 30., 40.],
+            dense_shape=(4, 3))}
+    sequence_features = sfc.SequenceFeatures([price1, price2])
+    seq_input, seq_len = sequence_features(features)
+    self.assertEqual(
+        sequence_features.compute_output_shape((None, None)),
+        (None, None, 3))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
+                         [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
+                         [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
+                         [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
+                        self.evaluate(seq_input))
+    self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
   """Tests the utility fn concatenate_context_input."""
 
@@ -556,9 +584,8 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
         [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
         [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
     ], dtype=np.float32)
-    with monitored_session.MonitoredSession() as sess:
-      output = sess.run(input_layer)
-      self.assertAllEqual(expected, output)
+    output = self.evaluate(input_layer)
+    self.assertAllEqual(expected, output)
 
   @parameterized.named_parameters(
       {'testcase_name': 'rank_lt_3',
@@ -605,8 +632,9 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
       sfc.concatenate_context_input(context_input, seq_input)
 
 
-class InputLayerTest(test.TestCase):
-  """Tests input_layer with sequence feature columns."""
+@test_util.run_all_in_graph_and_eager_modes
+class DenseFeaturesTest(test.TestCase):
+  """Tests DenseFeatures with sequence feature columns."""
 
   def test_embedding_column(self):
     """Tests that error is raised for sequence embedding column."""
@@ -620,16 +648,15 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([embedding_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
   def test_indicator_column(self):
     """Tests that error is raised for sequence indicator column."""
@@ -643,15 +670,14 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([indicator_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
 
 def _assert_sparse_tensor_value(test_case, expected, actual):
@@ -676,10 +702,11 @@ def _get_sequence_dense_tensor(column, features):
 
 
 def _get_sequence_dense_tensor_state(column, features):
-  state_manager = _TestStateManager()
+  state_manager = fc._StateManagerImpl(Layer(), trainable=True)
   column.create_state(state_manager)
-  return column.get_sequence_dense_tensor(
+  dense_tensor, lengths = column.get_sequence_dense_tensor(
       fc.FeatureTransformationCache(features), state_manager)
+  return dense_tensor, lengths, state_manager
 
 
 def _get_sparse_tensors(column, features):
@@ -687,6 +714,7 @@ def _get_sparse_tensors(column, features):
       fc.FeatureTransformationCache(features), None)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithIdentityTest(
     test.TestCase, parameterized.TestCase):
 
@@ -718,11 +746,11 @@ class SequenceCategoricalColumnWithIdentityTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithHashBucketTest(
     test.TestCase, parameterized.TestCase):
 
@@ -757,11 +785,11 @@ class SequenceCategoricalColumnWithHashBucketTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_indices_shape(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_indices_shape(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyFileTest(
     test.TestCase, parameterized.TestCase):
 
@@ -810,37 +838,40 @@ class SequenceCategoricalColumnWithVocabularyFileTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
   def test_get_sparse_tensors_dynamic_zero_length(self):
     """Tests _get_sparse_tensors with a dynamic sequence length."""
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
-    expected = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 3)),
-        values=np.array((), dtype=np.int64),
-        dense_shape=(2, 0, 1))
-    column = sfc.sequence_categorical_column_with_vocabulary_file(
-        key='aaa',
-        vocabulary_file=self._wire_vocabulary_file_name,
-        vocabulary_size=self._wire_vocabulary_size)
-    input_placeholder_shape = list(inputs.dense_shape)
-    # Make second dimension (sequence length) dynamic.
-    input_placeholder_shape[1] = None
-    input_placeholder = array_ops.sparse_placeholder(
-        dtypes.string, shape=input_placeholder_shape)
-    id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
-
-    self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      result = id_weight_pair.id_tensor.eval(
-          session=sess, feed_dict={input_placeholder: inputs})
-      _assert_sparse_tensor_value(
-          self, expected, result)
-
-
+    with ops.Graph().as_default():
+      inputs = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+      expected = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 3)),
+          values=np.array((), dtype=np.int64),
+          dense_shape=(2, 0, 1))
+      column = sfc.sequence_categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=self._wire_vocabulary_size)
+      input_placeholder_shape = list(inputs.dense_shape)
+      # Make second dimension (sequence length) dynamic.
+      input_placeholder_shape[1] = None
+      input_placeholder = array_ops.sparse_placeholder(
+          dtypes.string, shape=input_placeholder_shape)
+      id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
+
+      self.assertIsNone(id_weight_pair.weight_tensor)
+      with fc_test._initialized_session() as sess:
+        result = id_weight_pair.id_tensor.eval(
+            session=sess, feed_dict={input_placeholder: inputs})
+        _assert_sparse_tensor_value(
+            self, expected, result)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyListTest(
     test.TestCase, parameterized.TestCase):
 
@@ -874,11 +905,13 @@ class SequenceCategoricalColumnWithVocabularyListTest(
     id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceEmbeddingColumnTest(
     test.TestCase, parameterized.TestCase):
 
@@ -942,15 +975,15 @@ class SequenceEmbeddingColumnTest(
         categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup, _ = _get_sequence_dense_tensor_state(
+    embedding_lookup, _, state_manager = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': inputs})
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+    variables = state_manager._layer.weights
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.assertCountEqual(
+        ('embedding_weights:0',), tuple([v.name for v in variables]))
+    self.assertAllEqual(embedding_values, self.evaluate(variables[0]))
+    self.assertAllEqual(expected, self.evaluate(embedding_lookup))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -979,13 +1012,12 @@ class SequenceEmbeddingColumnTest(
     embedding_column = fc.embedding_column(
         categorical_column, dimension=2)
 
-    _, sequence_length = _get_sequence_dense_tensor_state(
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1007,16 +1039,16 @@ class SequenceEmbeddingColumnTest(
     embedding_column = fc.embedding_column(
         categorical_column, dimension=2)
 
-    _, sequence_length = _get_sequence_dense_tensor_state(
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
         embedding_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 class SequenceSharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_get_sequence_dense_tensor(self):
     vocabulary_size = 3
     embedding_dimension = 2
@@ -1085,100 +1117,102 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     embedding_lookup_b = _get_sequence_dense_tensor(
         shared_embedding_columns[1], {'bbb': sparse_input_b})[0]
 
+    self.evaluate(variables_lib.global_variables_initializer())
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_a, embedding_lookup_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_b, embedding_lookup_b.eval(session=sess))
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(
+        expected_lookups_a, self.evaluate(embedding_lookup_a))
+    self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
   def test_sequence_length(self):
-    vocabulary_size = 3
-
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_a = [1, 2]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [0, 2]
-        # example 1, ids [1]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0, 2, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_b = [2, 1]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = _get_sequence_dense_tensor(
-        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
-    sequence_length_b = _get_sequence_dense_tensor(
-        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length_a = sess.run(sequence_length_a)
-      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
-      self.assertEqual(np.int64, sequence_length_a.dtype)
-      sequence_length_b = sess.run(sequence_length_b)
-      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
-      self.assertEqual(np.int64, sequence_length_b.dtype)
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_a = [1, 2]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [0, 2]
+          # example 1, ids [1]
+          indices=((0, 0), (0, 1), (1, 0)),
+          values=(0, 2, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_b = [2, 1]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        sequence_length_a = sess.run(sequence_length_a)
+        self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
+        self.assertEqual(np.int64, sequence_length_a.dtype)
+        sequence_length_b = sess.run(sequence_length_b)
+        self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
+        self.assertEqual(np.int64, sequence_length_b.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids []
-        # example 1, ids [2]
-        # example 2, ids [0, 1]
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids []
-        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids []
-        # example 2, ids []
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids [0, 1]
-        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
-        values=(2, 1, 0, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-
-    shared_embedding_columns = fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = _get_sequence_dense_tensor(
-        shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
-    sequence_length_b = _get_sequence_dense_tensor(
-        shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length_a, sequence_length_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length_b, sequence_length_b.eval(session=sess))
-
-
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids []
+          # example 1, ids [2]
+          # example 2, ids [0, 1]
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids []
+          indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+          values=(2, 0, 1, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids []
+          # example 2, ids []
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids [0, 1]
+          indices=((0, 0), (4, 0), (5, 0), (5, 1)),
+          values=(2, 1, 0, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(
+            expected_sequence_length_a, sequence_length_a.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length_b, sequence_length_b.eval(session=sess))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -1231,8 +1265,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     indicator_tensor, _ = _get_sequence_dense_tensor(
         indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(indicator_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1263,10 +1296,9 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1290,11 +1322,11 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         indicator_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
@@ -1355,8 +1387,7 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(dense_tensor))
 
   def test_get_sequence_dense_tensor_with_normalizer_fn(self):
 
@@ -1389,9 +1420,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1427,9 +1457,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1476,10 +1505,9 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1499,9 +1527,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/feature_column/utils.py b/tensorflow/python/feature_column/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dd17aadc2963af570dc915b0bab3b9ef3587c5f
--- /dev/null
+++ b/tensorflow/python/feature_column/utils.py
@@ -0,0 +1,154 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines functions common to multiple feature column files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import nest
+
+
+def sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
+  """Returns a [batch_size] Tensor with per-example sequence length."""
+  with ops.name_scope(None, 'sequence_length') as name_scope:
+    row_ids = sp_tensor.indices[:, 0]
+    column_ids = sp_tensor.indices[:, 1]
+    # Add one to convert column indices to element length
+    column_ids += array_ops.ones_like(column_ids)
+    # Get the number of elements we will have per example/row
+    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
+
+    # The raw values are grouped according to num_elements;
+    # how many entities will we have after grouping?
+    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
+    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
+    # these will get grouped, and the final seq_length is [1, 1]
+    seq_length = math_ops.cast(
+        math_ops.ceil(seq_length / num_elements), dtypes.int64)
+
+    # If the last n rows do not have ids, seq_length will have shape
+    # [batch_size - n]. Pad the remaining values with zeros.
+    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
+    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
+    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
+
+
+def assert_string_or_int(dtype, prefix):
+  if (dtype != dtypes.string) and (not dtype.is_integer):
+    raise ValueError(
+        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
+
+
+def assert_key_is_string(key):
+  if not isinstance(key, six.string_types):
+    raise ValueError(
+        'key must be a string. Got: type {}. Given key: {}.'.format(
+            type(key), key))
+
+
+def check_default_value(shape, default_value, dtype, key):
+  """Returns default value as tuple if it's valid, otherwise raises errors.
+
+  This function verifies that `default_value` is compatible with both `shape`
+  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
+  it casts default_value to a tuple and returns it. `key` is used only
+  for error message.
+
+  Args:
+    shape: An iterable of integers specifies the shape of the `Tensor`.
+    default_value: If a single value is provided, the same value will be applied
+      as the default value for every item. If an iterable of values is
+      provided, the shape of the `default_value` should be equal to the given
+      `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    key: Column name, used only for error messages.
+
+  Returns:
+    A tuple which will be used as default value.
+
+  Raises:
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  if default_value is None:
+    return None
+
+  if isinstance(default_value, int):
+    return _create_tuple(shape, default_value)
+
+  if isinstance(default_value, float) and dtype.is_floating:
+    return _create_tuple(shape, default_value)
+
+  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
+    default_value = default_value.tolist()
+
+  if nest.is_sequence(default_value):
+    if not _is_shape_and_default_value_compatible(default_value, shape):
+      raise ValueError(
+          'The shape of default_value must be equal to given shape. '
+          'default_value: {}, shape: {}, key: {}'.format(
+              default_value, shape, key))
+    # Check if the values in the list are all integers or are convertible to
+    # floats.
+    is_list_all_int = all(
+        isinstance(v, int) for v in nest.flatten(default_value))
+    is_list_has_float = any(
+        isinstance(v, float) for v in nest.flatten(default_value))
+    if is_list_all_int:
+      return _as_tuple(default_value)
+    if is_list_has_float and dtype.is_floating:
+      return _as_tuple(default_value)
+  raise TypeError('default_value must be compatible with dtype. '
+                  'default_value: {}, dtype: {}, key: {}'.format(
+                      default_value, dtype, key))
+
+
+def _create_tuple(shape, value):
+  """Returns a tuple with given shape and filled with value."""
+  if shape:
+    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
+  return value
+
+
+def _as_tuple(value):
+  if not nest.is_sequence(value):
+    return value
+  return tuple([_as_tuple(v) for v in value])
+
+
+def _is_shape_and_default_value_compatible(default_value, shape):
+  """Verifies compatibility of shape and default_value."""
+  # Invalid condition:
+  #  * if default_value is not a scalar and shape is empty
+  #  * or if default_value is an iterable and shape is not empty
+  if nest.is_sequence(default_value) != bool(shape):
+    return False
+  if not shape:
+    return True
+  if len(default_value) != shape[0]:
+    return False
+  for i in range(shape[0]):
+    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
+      return False
+  return True
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index da76a84e55e5f299bb324eeb1b3e6050fb46eb54..a8ba4ea50d144854c9b38bca427ae9f820994fdd 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -32,12 +32,65 @@ from tensorflow.python.util import tf_decorator
 # Op types that should not run in program order, e.g. because they need to run
 # asynchronously to avoid deadlock.
 ASYNC_STATEFUL_OPS = [
+    "CollectiveGather",
     "CollectiveReduce",
     "CollectiveBcastSend",
     "CollectiveBcastRecv",
     "NcclAllReduce",
 ]
 
+LEGACY_RANDOM_OPS = [
+    # These may be used in variable initializers -- thus their execution should
+    # not be dependent on other stateful operations.  This is because although
+    # according to program order, tf.Variables may be created in sequence,
+    # their initialization happens outside of the program order (specifically,
+    # in graph mode their initialization happens by calling a grouped
+    # initializer operation or in eager mode, where initialization is lifted
+    # out of the tf.function and executed the first time the function is
+    # executed).
+    #
+    # Unless there is a specific dependency between the initializers
+    # themselves (e.g. one initializer depends on a Variable whose value depends
+    # on another initializer), the initialization can happen in any order so
+    # long as it's before the associated Variable read operations.
+    #
+    # Note that in general the randomness of legacy random operations is only
+    # guaranteed by providing a graph-level and op-level seed (and ordering of
+    # the same op across multiple iterations of a while_loop is specifically not
+    # guaranteed; see the discussion below).
+    #
+    # There is a possible race condition inside while_loop where the same
+    # random OpKernel instantiation is reused across multiple steps
+    # of the loop.  Since legacy Random OpKernels have an internal rng state,
+    # automatic dependency tracking across loop steps would likely
+    # fix this race; and for that case this blacklist is problematic.
+    # However, since automatic dependency tracking inside while loops is not
+    # currently supported, and there are no other examples of OpKernel reuse
+    # (each OpKernel is associated with a unique op in graph mode),
+    # this blacklist has no effect on the aforementioned behavior.
+    #
+    # TODO(ebrevdo,skyewm): Modify the check against this blacklist to
+    # only occur when the op is inside a "variable initialization scope"; and
+    # add proper autodeps inside while_loops that respects this updated check.
+    "RandomUniform",
+    "RandomUniformInt",
+    "RandomStandardNormal",
+    "ParameterizedTruncatedNormal",
+    "TruncatedNormal",
+    "RandomShuffle",
+    "Multinomial",
+    "RandomGamma",
+    "RandomGammaGrad",
+    "RandomPoisson",
+    "RandomPoissonV2",
+]
+
+_ALL_BLACKLISTED_OPS = set(ASYNC_STATEFUL_OPS) | set(LEGACY_RANDOM_OPS)
+
+
+def op_is_stateful(op_def):
+  return op_def.is_stateful and op_def.name not in _ALL_BLACKLISTED_OPS
+
 
 class AutomaticControlDependencies(object):
   """Context manager to automatically add control dependencies.
@@ -45,7 +98,7 @@ class AutomaticControlDependencies(object):
   Code under this context manager will act as if a sensible set of control
   dependencies were present. More specifically:
     1. All stateful ops in the scope will execute (with the exception of ops in
-       ASYNC_STATEFUL_OPS)
+       ASYNC_STATEFUL_OPS and LEGACY_RANDOM_OPS)
     2. Stateful ops which modify the same resource will execute in program order
 
   Note: creating variables in an automatic control dependencies context is not
@@ -57,6 +110,7 @@ class AutomaticControlDependencies(object):
 
   def __init__(self):
     self._returned_tensors = set()
+    self.ops_which_must_run = set()
 
   def mark_as_return(self, tensor):
     """Acts like identity but marks the `Tensor` as a return value.
@@ -233,8 +287,7 @@ class AutomaticControlDependencies(object):
       control_inputs = set()
       # Ensure stateful ops run
       if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
-          or (self._graph._registered_ops[op.type].is_stateful   # pylint: disable=protected-access
-              and op.type not in ASYNC_STATEFUL_OPS)):
+          or op_is_stateful(self._graph._registered_ops[op.type])):  # pylint: disable=protected-access
         ops_which_must_run.add(op)
       # Ignore switches (they're handled separately)
       if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
@@ -249,25 +302,30 @@ class AutomaticControlDependencies(object):
         ops_which_must_run = set([op])
         continue
       found_resource = False
-      for inp in op.inputs:
-        if inp.dtype == dtypes_module.resource:
-          found_resource = True
-          # Deal with switches, finally.
-          if inp.op.type == "Switch":
-            self._process_switch(inp.op, ops_which_must_run,
-                                 last_op_using_resource_tensor,
-                                 merge_for_resource)
-          # Ensure uses of resources are serialized
-          if inp in last_op_using_resource_tensor:
-            if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
-                is op._control_flow_context):  # pylint: disable=protected-access
-              control_inputs.add(last_op_using_resource_tensor[inp])
-          # Ensure merges happen after the closing of a cond block
-          if inp in merge_for_resource:
-            merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
-          last_op_using_resource_tensor[inp] = op
-      if (op.op_def.is_stateful and op.type not in ASYNC_STATEFUL_OPS
-          and not found_resource and op._control_flow_context is None):  # pylint: disable=protected-access
+      # Check for any resource inputs. If we find any, we update control_inputs
+      # and last_op_using_resource_tensor. Note that we dedup op.inputs in case
+      # op receives the same resource tensor twice as input, which would result
+      # in op getting a control dependency on itself.
+      for inp in set(op.inputs):
+        if inp.dtype != dtypes_module.resource:
+          continue
+        found_resource = True
+        # Deal with switches, finally.
+        if inp.op.type == "Switch":
+          self._process_switch(inp.op, ops_which_must_run,
+                               last_op_using_resource_tensor,
+                               merge_for_resource)
+        # Ensure uses of resources are serialized
+        if inp in last_op_using_resource_tensor:
+          if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
+              is op._control_flow_context):  # pylint: disable=protected-access
+            control_inputs.add(last_op_using_resource_tensor[inp])
+        # Ensure merges happen after the closing of a cond block
+        if inp in merge_for_resource:
+          merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
+        last_op_using_resource_tensor[inp] = op
+      if (op_is_stateful(op.op_def) and not found_resource
+          and op._control_flow_context is None):  # pylint: disable=protected-access
         if None in last_op_using_resource_tensor:
           op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
         last_op_using_resource_tensor[None] = op
@@ -276,10 +334,11 @@ class AutomaticControlDependencies(object):
       op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
 
     # Ensure all ops which must run do run
+    self.ops_which_must_run.update(ops_which_must_run)
     for r in self._returned_tensors:
-      if ops_which_must_run:
+      if self.ops_which_must_run:
         r.op._add_control_inputs(  # pylint: disable=protected-access
-            [o for o in ops_which_must_run
+            [o for o in self.ops_which_must_run
              if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
 
 
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
index d81adef26a06ca231d640a9d4e0c4262926aad58..d9df96f6d70c36ddd9b942f66929b1033e4542f6 100644
--- a/tensorflow/python/framework/auto_control_deps_test.py
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -19,12 +19,15 @@ from __future__ import print_function
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import auto_control_deps as acd
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -281,6 +284,44 @@ class AutomaticControlDependenciesTest(test.TestCase):
     train()
     self.assertEqual(v.numpy(), -1.0)
 
+  def testRepeatedResourceInput(self):
+    var = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def inner(var1, var2):
+      return (resource_variable_ops.read_variable_op(var1, dtypes.float32) +
+              resource_variable_ops.read_variable_op(var2, dtypes.float32))
+
+    @def_function.function
+    def outer():
+      return inner(var.handle, var.handle)
+
+    self.assertEqual(self.evaluate(outer()), 2.0)
+
+  def testVariableInitializersCanBeLifted(self):
+    # The initializer is a stateful op, but using it inside a function should
+    # *not* create additional dependencies.  That's what we're testing.
+    layer = keras_core.Dense(1, kernel_initializer="glorot_uniform")
+
+    @def_function.function
+    def fn(x):
+      # Stateful operation
+      control_flow_ops.Assert(x, ["Error"])
+      # Variable initialization should be lifted.  Prior to the change that
+      # added this test, the lifting would crash because of an auto control dep
+      # added on `x`.  Note, the error did not happen if we
+      # manually created a tf.Variable outside of function and used it
+      # here.  Alternatively, creating a tf.Variable inside fn() causes
+      # a different sort of error that is out of scope for this test.
+      return layer(ops.convert_to_tensor([[1.0, 1.0]]))
+
+    true = ops.convert_to_tensor(True)
+
+    concrete = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.bool))
+    self.evaluate(concrete(true))
+    self.evaluate(fn(True))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e9985506eab288517a60d7d643c9c0322c811b
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor.py
@@ -0,0 +1,117 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tensor-like objects that are composed from tf.Tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util import nest
+
+
+@six.add_metaclass(abc.ABCMeta)
+class CompositeTensor(object):
+  """Abstract base class for Tensor-like objects that are composed from Tensors.
+
+  Each `CompositeTensor` can be decomposed into a structured collection of
+  component `tf.Tensor`s, and reconstructed from those components.
+
+  The `tensorflow.python.util.nest` module has support for treating composite
+  tensors as structure, which makes it easy to flatten and reconstruct
+  composite tensors (or larger structures that contain composite tensors).
+  E.g.:
+
+  ```python
+  ct = ...  # Create a composite tensor.
+  flat_list_of_tensors = nest.flatten(ct, expand_composites=True)
+  transformed_list_of_tensors = ...  # do something with the flat tensors.
+  result = nest.pack_sequence_as(ct, transformed_list_of_tensors)
+  ```
+  """
+
+  @abc.abstractmethod
+  def _to_components(self):
+    """Decomposes this composite tensor into its components.
+
+    Returns:
+      The components that comprise this composite tensor: a nested structure
+      (as defined by `tf.python.util.nest`) whose values are `tf.Tensor`s or
+      `CompositeTensor`s.
+    """
+    raise NotImplementedError("CompositeTensor._to_components")
+
+  @abc.abstractmethod
+  def _from_components(cls, components):  # pylint: disable=no-self-argument
+    """Creates a composite tensor of type `cls` from components.
+
+    Args:
+      components: The components that should be used to form the
+        composite tensor: a nested structure (as defined by
+        `tf.python.util.nest`) whose values are tf.Tensors or composite
+        tensors.
+
+    Returns:
+      A `CompositeTensor` of type `cls`.
+    """
+    raise NotImplementedError("CompositeTensor._from_components")
+
+  @abc.abstractmethod
+  def _shape_invariant_to_components(self, shape=None):
+    """Converts a shape invariant into invariants for individual components.
+
+    Args:
+      shape: A `tf.TensorShape` object.  The shape invariant for this
+        `CompositeTensor`, or `None` if a default shape invariant should be
+        used (based on the value of this `CompositeTensor`).
+
+    Returns:
+      A nested structure whose values are `tf.TensorShape` objects, specifying
+      the shape invariants for the tensors that comprise this `CompositeTensor`.
+    """
+    raise NotImplementedError("CompositeTensor._shape_invariant_to_components")
+
+  @abc.abstractproperty
+  def _is_graph_tensor(self):
+    """Returns True if this tensor's components belong to a TF graph."""
+    raise NotImplementedError("CompositeTensor._is_symbolic_tensor")
+
+  def consumers(self):
+    """Returns a list of `Operation`s that consume this `CompositeTensor`.
+
+    Returns:
+      A list of `Operation`s.
+
+    Raises:
+      RuntimeError: If this method is called while executing eagerly.
+    """
+    consumers = nest.flatten([
+        component.consumers()
+        for component in self._to_components()
+        if getattr(component, "graph", None) is not None
+    ])
+    return list(set(consumers))
+
+
+pywrap_tensorflow.RegisterType("CompositeTensor", CompositeTensor)
+
+
+# @TODO(edloper): Can we replace convert_to_tensor_or_xyz with just
+# convert_to_tensor_or_composite?  Alternatively, should composite tensors
+# register a dispatch override for tf.convert_to_tensor?
diff --git a/tensorflow/python/framework/composite_tensor_test.py b/tensorflow/python/framework/composite_tensor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f249faa5d685b411742a65025000e00c2edadbc5
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor_test.py
@@ -0,0 +1,101 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.python.framework.composite_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import composite_tensor
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import nest
+
+
+class TestCompositeTensor(composite_tensor.CompositeTensor):
+
+  def __init__(self, *components):
+    self._components = components
+
+  def _to_components(self):
+    return self._components
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    raise NotImplementedError('CompositeTensor._shape_invariant_to_components')
+
+  def _is_graph_tensor(self):
+    return True
+
+
+class CompositeTensorTest(test_util.TensorFlowTestCase):
+
+  def assertNestEqual(self, a, b, expand_composites=False):
+    if isinstance(a, dict):
+      self.assertIsInstance(b, dict)
+      self.assertEqual(set(a), set(b))
+      for key in a:
+        self.assertNestEqual(a[key], b[key])
+    elif isinstance(a, (list, tuple)):
+      self.assertIsInstance(b, (list, tuple))
+      self.assertEqual(len(a), len(b))
+      for a_val, b_val in zip(a, b):
+        self.assertNestEqual(a_val, b_val)
+    elif expand_composites and isinstance(a, composite_tensor.CompositeTensor):
+      self.assertIsInstance(b, composite_tensor.CompositeTensor)
+      self.assertNestEqual(a._to_components(),
+                           b._to_components())
+
+  def testNestFlatten(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure = [[st1], 'foo', {'y': [st2]}]
+    x = nest.flatten(structure, expand_composites=True)
+    self.assertEqual(len(x), 7)
+    self.assertIs(x[0], st1.indices)
+    self.assertIs(x[1], st1.values)
+    self.assertIs(x[2], st1.dense_shape)
+    self.assertEqual(x[3], 'foo')
+    self.assertIs(x[4], st2.indices)
+    self.assertIs(x[5], st2.values)
+    self.assertIs(x[6], st2.dense_shape)
+
+  def testNestPackSequenceAs(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure1 = [[st1], 'foo', {'y': [st2]}]
+    flat = [st2.indices, st2.values, st2.dense_shape, 'bar',
+            st1.indices, st1.values, st1.dense_shape]
+    result = nest.pack_sequence_as(structure1, flat, expand_composites=True)
+    expected = [[st2], 'bar', {'y': [st1]}]
+    self.assertNestEqual(expected, result)
+
+  def testAssertSameStructure(self):
+    st1 = sparse_tensor.SparseTensor([[0]], [0], [100])
+    st2 = sparse_tensor.SparseTensor([[0, 3]], ['x'], [100, 100])
+    test = TestCompositeTensor(st1.indices, st1.values, st1.dense_shape)
+    nest.assert_same_structure(st1, st2, expand_composites=False)
+    nest.assert_same_structure(st1, st2, expand_composites=True)
+    nest.assert_same_structure(st1, test, expand_composites=False)
+    with self.assertRaises(TypeError):
+      nest.assert_same_structure(st1, test, expand_composites=True)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..de79ca3b2ab2d1fa06ff844c9cce7a96d3126b3f
--- /dev/null
+++ b/tensorflow/python/framework/config.py
@@ -0,0 +1,259 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for configuring TensorFlow execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('config.gpu.get_per_process_memory_fraction')
+def get_gpu_per_process_memory_fraction():
+  """Get fraction of the available GPU memory to allocate for each process.
+
+  1.0 means to allocate all of the GPU memory, 0.5 means the process allocates
+  up to half of the available GPU memory.
+
+  Returns:
+    Current GPU per process memory fraction
+  """
+  return context.context().gpu_per_process_memory_fraction
+
+
+@tf_export('config.gpu.set_per_process_memory_fraction')
+def set_gpu_per_process_memory_fraction(fraction):
+  """Set fraction of the available GPU memory to allocate for each process.
+
+  1.0 means to allocate all of the GPU memory, 0.5 means the process allocates
+  up to half of the available GPU memory.
+
+  Args:
+    fraction: Fraction of GPU memory to allocate
+  """
+  context.context().gpu_per_process_memory_fraction = fraction
+
+
+@tf_export('config.gpu.get_per_process_memory_growth')
+def get_gpu_per_process_memory_growth():
+  """Get if GPU memory should be pre-allocated or allowed to grow.
+
+  Returns:
+    If GPU memory growth should be enabled
+  """
+  return context.context().gpu_per_process_memory_growth
+
+
+@tf_export('config.gpu.set_per_process_memory_growth')
+def set_gpu_per_process_memory_growth(enabled):
+  """Set if GPU memory should be pre-allocated or allowed to grow.
+
+  Args:
+    enabled: Indicates if GPU memory growth should be enabled
+  """
+  context.context().gpu_per_process_memory_growth = enabled
+
+
+@tf_export('config.threading.intra_op_parallelism_threads')
+def get_intra_op_parallelism_threads():
+  """Get number of threads used within an individual op for parallelism.
+
+  Certain operations like matrix multiplication and reductions can utilize
+  parellel threads for speed ups. A value of 0 means the system picks an
+  appropriate number.
+
+  Returns:
+    Number of parallel threads
+  """
+  return context.context().intra_op_parallelism_threads
+
+
+@tf_export('config.threading.set_intra_op_parallelism_threads')
+def set_intra_op_parallelism_threads(num_threads):
+  """Set number of threads used within an individual op for parallelism.
+
+  Certain operations like matrix multiplication and reductions can utilize
+  parellel threads for speed ups. A value of 0 means the system picks an
+  appropriate number.
+
+  Args:
+    num_threads: Number of parallel threads
+  """
+  context.context().intra_op_parallelism_threads = num_threads
+
+
+@tf_export('config.threading.inter_op_parallelism_threads')
+def get_inter_op_parallelism_threads():
+  """Get number of threads used for parallelism between independent operations.
+
+  Determines the number of threads used by independent non-blokcing operations.
+  0 means the system picks an appropriate number.
+
+  Returns:
+    Number of parallel threads
+  """
+  return context.context().inter_op_parallelism_threads
+
+
+@tf_export('config.threading.set_inter_op_parallelism_threads')
+def set_inter_op_parallelism_threads(num_threads):
+  """Set number of threads used for parallelism between independent operations.
+
+  Determines the number of threads used by independent non-blokcing operations.
+  0 means the system picks an appropriate number.
+
+  Args:
+    num_threads: Number of parallel threads
+  """
+  context.context().inter_op_parallelism_threads = num_threads
+
+
+@tf_export('config.get_soft_device_placement')
+def get_soft_device_placement():
+  """Get if soft device placement is enabled.
+
+  If enabled, an op will be placed on CPU if any of the following are true
+    1. there's no GPU implementation for the OP
+    2. no GPU devices are known or registered
+    3. need to co-locate with reftype input(s) which are from CPU
+
+  Returns:
+    If soft placement is enabled.
+  """
+  return context.context().soft_device_placement
+
+
+@tf_export('config.set_soft_device_placement')
+def set_soft_device_placement(enabled):
+  """Set if soft device placement is enabled.
+
+  If enabled, an op will be placed on CPU if any of the following are true
+    1. there's no GPU implementation for the OP
+    2. no GPU devices are known or registered
+    3. need to co-locate with reftype input(s) which are from CPU
+
+  Args:
+    enabled: Whether to enabled soft placement.
+  """
+  context.context().soft_device_placement = enabled
+
+
+@tf_export('config.experimental.get_device_policy')
+def get_device_policy():
+  """Gets the current device policy.
+
+  The device policy controls how operations requiring inputs on a specific
+  device (e.g., on GPU:0) handle inputs on a different device (e.g. GPU:1).
+
+  This function only gets the device policy for the current thread. Any
+  subsequently started thread will again use the default policy.
+
+  Returns:
+    Current thread device policy
+  """
+  device_policy = context.context().device_policy
+  if device_policy == context.DEVICE_PLACEMENT_SILENT:
+    return 'silent'
+  elif device_policy == context.DEVICE_PLACEMENT_SILENT_FOR_INT32:
+    return 'silent_for_int32'
+  elif device_policy == context.DEVICE_PLACEMENT_WARN:
+    return 'warn'
+  elif device_policy == context.DEVICE_PLACEMENT_EXPLICIT:
+    return 'explicit'
+  else:
+    raise ValueError('Not a valid device policy: %r' % device_policy)
+
+
+@tf_export('config.experimental.set_device_policy')
+def set_device_policy(device_policy):
+  """Sets the current thread device policy.
+
+  The device policy controls how operations requiring inputs on a specific
+  device (e.g., on GPU:0) handle inputs on a different device (e.g. GPU:1).
+
+  When using the default, an appropriate policy will be picked automatically.
+  The default policy may change over time.
+
+  This function only sets the device policy for the current thread. Any
+  subsequently started thread will again use the default policy.
+
+  Args:
+    device_policy: A device policy.
+      Valid values:
+      - None: Switch to a system default.
+      - 'warn': Copies the tensors which are not on the right device and logs
+          a warning.
+      - 'explicit': Raises an error if the placement is not as required.
+      - 'silent': Silently copies the tensors. Note that this may hide
+          performance problems as there is no notification provided when
+          operations are blocked on the tensor being copied between devices.
+      - 'silent_for_int32': silently copies `int32` tensors, raising errors on
+          the other ones.
+
+  Raises:
+      ValueError: If an invalid `device_policy` is passed.
+  """
+  if device_policy == 'silent':
+    context.context().device_policy = context.DEVICE_PLACEMENT_SILENT
+  elif device_policy == 'silent_for_int32':
+    context.context().device_policy = context.DEVICE_PLACEMENT_SILENT_FOR_INT32
+  elif device_policy == 'warn':
+    context.context().device_policy = context.DEVICE_PLACEMENT_WARN
+  elif device_policy == 'explicit':
+    context.context().device_policy = context.DEVICE_PLACEMENT_EXPLICIT
+  elif device_policy is None:
+    context.context().device_policy = None
+  else:
+    raise ValueError('Not a valid device policy: %r' % device_policy)
+
+
+@tf_export('config.experimental.get_synchronous_execution')
+def get_synchronous_execution():
+  """Gets whether operations are executed synchronously or asynchronously.
+
+  TensorFlow can execute operations synchronously or asynchronously. If
+  asynchronous execution is enabled, operations may return "non-ready" handles.
+
+  Returns:
+    Current thread execution mode
+  """
+  return context.context().execution_mode == context.SYNC
+
+
+@tf_export('config.experimental.set_synchronous_execution')
+def set_synchronous_execution(enable):
+  """Specifies whether operations are executed synchronously or asynchronously.
+
+  TensorFlow can execute operations synchronously or asynchronously. If
+  asynchronous execution is enabled, operations may return "non-ready" handles.
+
+  When `enable` is set to None, an appropriate value will be picked
+  automatically. The value picked may change between TensorFlow releases.
+
+  Args:
+    enable: Whether operations should be dispatched synchronously.
+      Valid values:
+      - None: sets the system default.
+      - True: executes each operation synchronously.
+      - False: executes each operation asynchronously.
+  """
+  if enable is None:
+    context.context().execution_mode = None
+  elif enable:
+    context.context().execution_mode = context.SYNC
+  else:
+    context.context().execution_mode = context.ASYNC
diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7287c84dbb59eccd2d4224a6e1bc226b4fd18e7
--- /dev/null
+++ b/tensorflow/python/framework/config_test.py
@@ -0,0 +1,229 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that the system configuration methods work properly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def reset_eager(fn):
+  def wrapper(*args, **kwargs):
+    try:
+      return fn(*args, **kwargs)
+    finally:
+      del context._context
+      context._context = context.Context()
+      ops.enable_eager_execution()
+
+  return wrapper
+
+
+class ConfigTest(test.TestCase):
+
+  @test_util.run_gpu_only
+  @reset_eager
+  def testDevicePolicy(self):
+    self.assertEqual(context.DEVICE_PLACEMENT_SILENT,
+                     context.context().device_policy)
+
+    # If no op has been executed we should be able to set the device policy as
+    # well as any init-time configs.
+    config.set_intra_op_parallelism_threads(1)
+    config.set_device_policy('silent')
+    config.set_intra_op_parallelism_threads(2)
+
+    # Excute a dummy op to ensure that the context has been initialized
+    constant_op.constant(1)
+
+    def copy_tensor(dtype=dtypes.int32):
+      cpu_tensor = constant_op.constant(1, dtype=dtype)
+      gpu_tensor = cpu_tensor.gpu()
+      self.assertAllEqual(cpu_tensor + gpu_tensor, 2.0)
+
+    config.set_device_policy('silent')
+    self.assertEqual(config.get_device_policy(), 'silent')
+    self.assertEqual(context.DEVICE_PLACEMENT_SILENT,
+                     context.context().device_policy)
+    copy_tensor()
+
+    config.set_device_policy('silent_for_int32')
+    self.assertEqual(config.get_device_policy(), 'silent_for_int32')
+    self.assertEqual(context.DEVICE_PLACEMENT_SILENT_FOR_INT32,
+                     context.context().device_policy)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'Tensors on conflicting devices'):
+      copy_tensor(dtypes.float32)
+    copy_tensor()
+
+    config.set_device_policy('warn')
+    self.assertEqual(config.get_device_policy(), 'warn')
+    self.assertEqual(context.DEVICE_PLACEMENT_WARN,
+                     context.context().device_policy)
+    copy_tensor()
+
+    config.set_device_policy('explicit')
+    self.assertEqual(config.get_device_policy(), 'explicit')
+    self.assertEqual(context.DEVICE_PLACEMENT_EXPLICIT,
+                     context.context().device_policy)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'Tensors on conflicting devices'):
+      copy_tensor()
+
+    config.set_device_policy(None)
+    self.assertEqual(config.get_device_policy(), 'silent')
+
+  @reset_eager
+  def testExecutionMode(self):
+    self.assertTrue(config.get_synchronous_execution())
+    self.assertEqual(context.SYNC, context.context().execution_mode)
+
+    # If no op has been executed we should be able to set the execution mode as
+    # well as any init-time configs.
+    config.set_intra_op_parallelism_threads(1)
+    config.set_synchronous_execution(False)
+    config.set_intra_op_parallelism_threads(2)
+
+    config.set_synchronous_execution(True)
+    self.assertTrue(config.get_synchronous_execution())
+    self.assertEqual(context.SYNC, context.context().execution_mode)
+    config.set_synchronous_execution(False)
+    self.assertFalse(config.get_synchronous_execution())
+    self.assertEqual(context.ASYNC, context.context().execution_mode)
+
+  @reset_eager
+  def testGpuPerProcessMemoryFraction(self):
+    config.set_gpu_per_process_memory_fraction(0.5)
+    self.assertEqual(
+        config.get_gpu_per_process_memory_fraction(),
+        context.context().gpu_per_process_memory_fraction)
+
+    constant_op.constant(1)
+    with self.assertRaises(RuntimeError):
+      config.set_gpu_per_process_memory_fraction(0.5)
+
+  @reset_eager
+  def testGpuPerProcessMemoryGrowth(self):
+    self.assertFalse(config.get_gpu_per_process_memory_growth())
+
+    config.set_gpu_per_process_memory_growth(True)
+    self.assertTrue(config.get_gpu_per_process_memory_growth())
+    self.assertEqual(
+        config.get_gpu_per_process_memory_growth(),
+        context.context().gpu_per_process_memory_growth)
+
+    config.set_gpu_per_process_memory_growth(False)
+    self.assertFalse(config.get_gpu_per_process_memory_growth())
+    self.assertEqual(
+        config.get_gpu_per_process_memory_growth(),
+        context.context().gpu_per_process_memory_growth)
+
+    constant_op.constant(1)
+    with self.assertRaises(RuntimeError):
+      config.set_gpu_per_process_memory_growth(True)
+
+  @reset_eager
+  def testIntraOpParallelismThreads(self):
+    config.set_intra_op_parallelism_threads(10)
+    self.assertEqual(
+        config.get_intra_op_parallelism_threads(),
+        context.context().intra_op_parallelism_threads)
+
+    constant_op.constant(1)
+    with self.assertRaises(RuntimeError):
+      config.set_intra_op_parallelism_threads(1)
+
+  @reset_eager
+  def testInterOpParallelismThreads(self):
+    config.set_inter_op_parallelism_threads(10)
+    self.assertEqual(
+        config.get_inter_op_parallelism_threads(),
+        context.context().inter_op_parallelism_threads)
+
+    constant_op.constant(1)
+    with self.assertRaises(RuntimeError):
+      config.set_inter_op_parallelism_threads(1)
+
+  @test_util.run_gpu_only
+  @reset_eager
+  def testSoftPlacement(self):
+    self.assertEqual(config.get_soft_device_placement(), True)
+
+    @def_function.function
+    def mod():
+      with ops.device('/device:GPU:0'):
+        a = constant_op.constant(1.0)
+        b = constant_op.constant(1.0)
+        return math_ops.mod(a, b)
+
+    # Since soft placement is enabled, the mod operation should work with CPU
+    mod()
+
+    config.set_soft_device_placement(False)
+    self.assertEqual(config.get_soft_device_placement(), False)
+    self.assertEqual(
+        config.get_soft_device_placement(),
+        context.context().soft_device_placement)
+
+    # Since soft placement is disabled, the mod operation should fail on GPU
+    with self.assertRaises(errors.InvalidArgumentError):
+      mod()
+
+    config.set_soft_device_placement(True)
+    self.assertEqual(config.get_soft_device_placement(), True)
+    self.assertEqual(
+        config.get_soft_device_placement(),
+        context.context().soft_device_placement)
+
+    # Since soft placement is re-enabled, the mod operation should work with CPU
+    mod()
+
+  @reset_eager
+  def testLogDevicePlacement(self):
+    self.assertEqual(context.get_log_device_placement(), False)
+
+    context.set_log_device_placement(True)
+    self.assertEqual(context.get_log_device_placement(), True)
+    self.assertEqual(
+        context.get_log_device_placement(),
+        context.context().log_device_placement)
+
+    context.set_log_device_placement(False)
+    self.assertEqual(context.get_log_device_placement(), False)
+    self.assertEqual(
+        context.get_log_device_placement(),
+        context.context().log_device_placement)
+
+    constant_op.constant(1)
+    with self.assertRaises(RuntimeError):
+      context.set_log_device_placement(True)
+    with self.assertRaises(RuntimeError):
+      context.set_log_device_placement(False)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/framework/convert_to_constants.py b/tensorflow/python/framework/convert_to_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..30c6f0b8faab26ce495fc8aaf9a255020a5b34f9
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants.py
@@ -0,0 +1,211 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to convert variables to constants in TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import function
+from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.saver import export_meta_graph
+
+
+def _run_inline_graph_optimization(func):
+  """Apply function inline optimization to the graph.
+
+  Returns the GraphDef after Grappler's function inlining optimization is
+  applied. This optimization does not work on models with control flow.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    GraphDef
+  """
+  meta_graph = export_meta_graph(
+      graph_def=func.graph.as_graph_def(), graph=func.graph)
+
+  # Add a collection 'train_op' so that Grappler knows the outputs.
+  fetch_collection = meta_graph_pb2.CollectionDef()
+  for array in func.inputs + func.outputs:
+    fetch_collection.node_list.value.append(array.name)
+  meta_graph.collection_def["train_op"].CopyFrom(fetch_collection)
+
+  # Initialize RewriterConfig with everything disabled except function inlining.
+  config = config_pb2.ConfigProto()
+  rewrite_options = config.graph_options.rewrite_options
+  rewrite_options.optimizers.append("function")
+  return tf_optimizer.OptimizeGraph(config, meta_graph)
+
+
+def _get_tensors_from_graph(graph, tensors):
+  """Gets the Tensors in `graph` with the name of the tensors in `tensors`.
+
+  Args:
+    graph: TensorFlow Graph.
+    tensors: List of Tensors.
+
+  Returns:
+    List of Tensors.
+  """
+  new_tensors = []
+  for orig_tensor in tensors:
+    new_tensor = graph.get_tensor_by_name(orig_tensor.name)
+    if new_tensor.shape.rank is None:
+      new_tensor.set_shape(orig_tensor.shape)
+    new_tensors.append(new_tensor)
+  return new_tensors
+
+
+def _construct_concrete_function(input_func, graph_def):
+  """Creates a ConcreteFunction from the input function and frozen graph.
+
+  Args:
+    input_func: ConcreteFunction.
+    graph_def: TensorFlow GraphDef.
+
+  Returns:
+    ConcreteFunction containing the graph_def.
+  """
+  output_graph = func_graph.FuncGraph(input_func.graph.name)
+  with output_graph.as_default():
+    importer.import_graph_def(graph_def, name="")
+    output_graph.inputs = _get_tensors_from_graph(output_graph,
+                                                  input_func.inputs)
+    output_graph.outputs = _get_tensors_from_graph(output_graph,
+                                                   input_func.outputs)
+
+  output_graph.structured_outputs = input_func.graph.structured_outputs
+  output_graph.structured_input_signature = (
+      input_func.graph.structured_input_signature)
+
+  # Create the ConcreteFunction and add it to the global context.
+  output_func = function.ConcreteFunction(output_graph)
+  output_func.add_to_graph()
+
+  # Inject the captured inputs into the ConcreteFunction.
+  output_func._captured_inputs = input_func.captured_inputs  # pylint: disable=protected-access
+  output_func.graph.variables = input_func.graph.variables
+
+  output_func._arg_keywords = input_func._arg_keywords  # pylint: disable=protected-access
+  output_func._num_position_args = input_func._num_positional_args  # pylint: disable=protected-access
+
+  # Register the gradients in the current root context.
+  with ops.init_scope():
+    output_func._register_gradient()  # pylint: disable=protected-access
+  return output_func
+
+
+def convert_variables_to_constants_v2(func):
+  """Replaces all the variables in a graph with constants of the same values.
+
+  TensorFlow 2.0 function for converting all Variable ops into Const ops holding
+  the same values. This makes it possible to describe the network fully with a
+  single GraphDef file, and allows the removal of a lot of ops related to
+  loading and saving the variables. This function runs Grappler's function
+  inlining optimization in order to return a single subgraph.
+
+  The current implementation only works for graphs that do not contain any
+  control flow or embedding related ops.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    ConcreteFunction containing a simplified version of the original.
+  """
+  # TODO(nupurgarg): Replace ResourceGather with Gather.
+  # TODO(nupurgarg): Change attr for Variables in control flow and functions.
+  graph_def = _run_inline_graph_optimization(func)
+
+  # Identify the ReadVariableOps.
+  get_name = lambda name: name.split(":")[0]
+  map_name_to_node = {get_name(node.name): node for node in graph_def.node}
+
+  # TODO(b/125838789): Use `func.graph.captures`.
+  # Get mapping from input name to variable value.
+  tensor_data = {}
+  input_tensors = func.inputs[-len(func.captured_inputs):]
+  for var in func.graph.variables:
+    index = func.captured_inputs.index(var.handle)
+    tensor = input_tensors[index]
+    tensor_data[get_name(tensor.name)] = var.numpy()
+
+  resource_identities = {}
+  resource_placeholders = {}
+  for node in graph_def.node:
+    if node.op == "ReadVariableOp":
+      # Get name of Placeholder op associated with ReadVariableOp. There can be
+      # an Identity in between the ReadVariableOp and Placeholder. Store the
+      # Identity ops with the associated dtypes.
+      input_name = get_name(node.input[0])
+      while map_name_to_node[input_name].op == "Identity":
+        resource_identities[input_name] = node.attr["dtype"]
+        input_name = get_name(map_name_to_node[input_name].input[0])
+      if map_name_to_node[input_name].op != "Placeholder":
+        raise ValueError("Cannot find the Placeholder op that is an input "
+                         "to the ReadVariableOp.")
+      # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
+      # variable's dtype and data.
+      resource_placeholders[input_name] = {
+          "dtype": node.attr["dtype"],
+          "data": tensor_data[input_name],
+      }
+
+  # Reconstruct the graph with constants in place of variables.
+  output_graph_def = graph_pb2.GraphDef()
+  how_many_converted = 0
+
+  for input_node in graph_def.node:
+    output_node = output_graph_def.node.add()
+    # Convert Placeholder ops that are inputs to ReadVariableOps into Const ops.
+    if input_node.name in resource_placeholders:
+      dtype = resource_placeholders[input_node.name]["dtype"]
+      data = resource_placeholders[input_node.name]["data"]
+
+      output_node.op = "Const"
+      output_node.name = input_node.name
+      output_node.attr["dtype"].CopyFrom(dtype)
+      output_node.attr["value"].tensor.CopyFrom(
+          tensor_util.make_tensor_proto(
+              data, dtype=dtype.type, shape=data.shape))
+      how_many_converted += 1
+    # Change the dtype for Identity ops that are inputs to ReadVariableOps.
+    elif input_node.name in resource_identities:
+      output_node.CopyFrom(input_node)
+      output_node.attr["T"].CopyFrom(resource_identities[input_node.name])
+    # Convert ReadVariableOps into Identity ops.
+    elif input_node.op == "ReadVariableOp":
+      output_node.op = "Identity"
+      output_node.name = input_node.name
+      output_node.input.extend([input_node.input[0]])
+      output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
+    else:
+      output_node.CopyFrom(input_node)
+
+  logging.info("Converted %d variables to const ops.", how_many_converted)
+  # TODO(b/126613403): Use wrap_function.function_from_graph_def.
+  return _construct_concrete_function(func, output_graph_def)
diff --git a/tensorflow/python/framework/convert_to_constants_test.py b/tensorflow/python/framework/convert_to_constants_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c99bc019203f2190567297e853649ef63002a1
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants_test.py
@@ -0,0 +1,260 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convert_to_constants.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python import keras
+from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import convert_to_constants
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+# TODO(nupurgarg): Simplify the test cases to use the ConcreteFunction.
+class VariablesToConstantsTest(test.TestCase):
+
+  def _hasStatefulPartitionedCallOp(self, graph_def):
+    """Determines if a StatefulPartitionedCall op exists in the graph."""
+    for node in graph_def.node:
+      if node.op == "StatefulPartitionedCall":
+        return True
+    return False
+
+  def _getNumVariables(self, graph_def):
+    """Returns the number of ReadVariableOp in the graph."""
+    return sum(node.op == "ReadVariableOp" for node in graph_def.node)
+
+  def _getTensors(self, sess, tensor_list):
+    """Returns a list of Tensor objects from the Session."""
+    return [
+        sess.graph.get_tensor_by_name(tensor.name) for tensor in tensor_list
+    ]
+
+  def _evaluateGraphDef(self, graph_def, func, input_data):
+    """Evaluates the GraphDef using Sessions."""
+    with ops.Graph().as_default() as graph:
+      importer.import_graph_def(graph_def, name="")
+      func.add_to_graph(graph)
+      sess = session.Session(graph=graph)
+
+    input_tensors = self._getTensors(sess, func.inputs)
+    output_tensors = self._getTensors(sess, func.outputs)
+    return sess.run(
+        output_tensors, feed_dict=dict(zip(input_tensors, input_data)))
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    input_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(variable_graph_def))
+    self.assertTrue(variable_graph_def.library.function)
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(constant_graph_def.library.function)
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableModel(self):
+    """Test a basic model with Variables."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    input_func = root.f.get_concrete_function(input_data)
+
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertEqual(2, self._getNumVariables(variable_graph_def))
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    input_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertTrue(self._hasStatefulPartitionedCallOp(variable_graph_def))
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    input_func = root.add.get_concrete_function(input_data)
+
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertEqual(1, self._getNumVariables(variable_graph_def))
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testConstructConcreteFunction(self):
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    func = root.f.get_concrete_function(input_data)
+
+    input_func = convert_to_constants._construct_concrete_function(
+        func, func.graph.as_graph_def())
+
+    # Test if model has enough metadata to be frozen afterwards.
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertEqual(2, self._getNumVariables(variable_graph_def))
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testKerasModel(self):
+    input_data = constant_op.constant(1., shape=[1, 1])
+
+    # Create a simple Keras model.
+    x = [-1, 0, 1, 2, 3, 4]
+    y = [-3, -1, 1, 3, 5, 7]
+
+    model = keras.models.Sequential(
+        [keras.layers.Dense(units=1, input_shape=[1])])
+    model.compile(optimizer="sgd", loss="mean_squared_error")
+    model.fit(x, y, epochs=1)
+
+    # Get the concrete function from the Keras model.
+    @def_function.function
+    def to_save(x):
+      return model(x)
+
+    input_func = to_save.get_concrete_function(input_data)
+
+    variable_graph_def = input_func.graph.as_graph_def()
+    self.assertEqual(2, self._getNumVariables(variable_graph_def))
+
+    output_func = convert_to_constants.convert_variables_to_constants_v2(
+        input_func)
+    constant_graph_def = output_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = to_save(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, input_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/framework/device_test.py b/tensorflow/python/framework/device_test.py
index 0859e956ffd5a2c905837c5f6e68658d11403ae5..cd4b4ea51e62dd1c022316b30cb9203f089a92d3 100644
--- a/tensorflow/python/framework/device_test.py
+++ b/tensorflow/python/framework/device_test.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import device
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -116,6 +119,20 @@ class DeviceTest(test_util.TensorFlowTestCase):
         "/job:muu/device:MyFunnyDevice:2"))
     self.assertEquals("/job:muu/task:1/device:MyFunnyDevice:2", d.to_string())
 
+    if not context.executing_eagerly():
+      with ops.device(device.merge_device("/device:GPU:0")):
+        var1 = variables.Variable(1.0)
+        self.assertEquals("/device:GPU:0", var1.device)
+        with ops.device(device.merge_device("/job:worker")):
+          var2 = variables.Variable(1.0)
+          self.assertEquals("/job:worker/device:GPU:0", var2.device)
+          with ops.device(device.merge_device("/device:CPU:0")):
+            var3 = variables.Variable(1.0)
+            self.assertEquals("/job:worker/device:CPU:0", var3.device)
+            with ops.device(device.merge_device("/job:ps")):
+              var4 = variables.Variable(1.0)
+              self.assertEquals("/job:ps/device:CPU:0", var4.device)
+
   def testCanonicalName(self):
     self.assertEqual("/job:foo/replica:0",
                      device.canonical_name("/job:foo/replica:0"))
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 465ea4229be2b782f10a5121fe09c880285e9f58..8440e82a5952b0305e6740166b9811bc11fa6e9b 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -282,9 +282,6 @@ class DType(object):
     """Returns the string name for this `DType`."""
     return _TYPE_TO_STRING[self._type_enum]
 
-  def __int__(self):
-    return self._type_enum
-
   def __str__(self):
     return "<dtype: %r>" % self.name
 
@@ -561,9 +558,14 @@ _NP_TO_TF = {
     _np_bfloat16: bfloat16,
 }
 
-# On Python 2.X `np.longlong` could be a distinct type used for long
-# integers e.g. 42L. See numpy/numpy#9799.
+# Map (some) NumPy platform dtypes to TF ones using their fixed-width
+# synonyms. Note that platform dtypes are not always simples aliases,
+# i.e. reference equality is not guaranteed. See e.g. numpy/numpy#9799.
 for pdt in [
+    np.intc,
+    np.uintc,
+    np.int_,
+    np.uint,
     np.longlong,
     np.ulonglong,
 ]:
@@ -723,4 +725,4 @@ def as_dtype(type_value):
     pass
 
   raise TypeError(
-      "Cannot convert value %r to a TensorFlow DType." % type_value)
+      "Cannot convert value %r to a TensorFlow DType." % (type_value,))
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 7dd2a792d1254027401d03b9dacddbb815cf4858..126516a66e707d7c1f1dc73e6043a8b9988cc219 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -311,6 +311,10 @@ class TypesTest(test_util.TensorFlowTestCase):
       reconstructed = ctor(*args)
       self.assertEquals(reconstructed, dtype)
 
+  def testAsDtypeInvalidArgument(self):
+    with self.assertRaises(TypeError):
+      dtypes.as_dtype((dtypes.int32, dtypes.float32))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index af83b70a465cd061c2ed713639cc4a5d531f388d..b671dfbfaa12ed47b2ca5de0a923280af95de2ef 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -41,6 +41,8 @@ _ParseTag = collections.namedtuple("_ParseTag", ["type", "name"])
 _BAD_FILE_SUBSTRINGS = [
     os.path.join("tensorflow", "python"),
     os.path.join("tensorflow", "contrib"),
+    os.path.join("tensorflow_estimator", "python"),
+    os.path.join("tensorflow_estimator", "contrib"),
     "<embedded",
 ]
 
@@ -210,6 +212,39 @@ def _get_defining_frame_from_op(op):
   frame_index = _find_index_of_defining_frame_for_op(op)
   return op.traceback[frame_index]
 
+def compute_useful_stack(op):
+  """Return a list of line name and lineno pairs, which form a 'useful' stack.
+
+  Starting from the defining frame to the outermost one, this method computes
+  the contiguous portion of the 'useful' stack trace and returns each line as
+  a line name and lineno pair.
+
+  Args:
+    op: op.Operation object having a _traceback member.
+
+  Returns:
+    A list of line name and lineno pairs. Below is an example of returned list:
+    [("tool_utils.py", "124", "func1", "a={}"), ("tool_utils.py", "21", "func2",
+    "for i in range(10):"), ....]
+  """
+  defining_frame_index = _find_index_of_defining_frame_for_op(op)
+  stack_trace = []
+  # The stack trace is collected from the defining (included) to the outermost.
+  # Include `frame_num` frames at most.
+  # Two lines from the TensorFlow library are included to show the node
+  # definition.
+  frame_num = 10
+  innermost_excluded = min(defining_frame_index + 2 + 1, len(op.traceback))
+  outermost_included = max(innermost_excluded - frame_num, 0)
+  for index in reversed(range(outermost_included, innermost_excluded)):
+    frame = op.traceback[index]
+    filename = frame[tf_stack.TB_FILENAME]
+    lineno = frame[tf_stack.TB_LINENO]
+    func = frame[tf_stack.TB_FUNCNAME]
+    code = frame[tf_stack.TB_CODEDICT]
+    stack_trace.append((filename, lineno, func, code))
+  return stack_trace
+
 
 def compute_field_dict(op, strip_file_prefix=""):
   """Return a dictionary mapping interpolation tokens to values.
@@ -288,21 +323,18 @@ def traceback_files_common_prefix(all_ops):
   return os.path.split(os.path.commonprefix(list(files)))[0]
 
 
-def _sources_for_node(name, graph):
-  """Gets the top-level root input nodes for 'name' node.
-
-  We recursively traverse the graph from 'name' node to its inputs and collect
-  all the nodes which don't have any inputs.
+def _sources_for_node(node, graph):
+  """Gets the input op nodes for 'node'.
 
   Args:
-    name: The name of the node.
+    node: The node.
     graph: The graph containing the node.
 
   Returns:
-    The unique top-level root input nodes.
+    The unique input nodes.
   """
-  def _helper(name, graph, seen_names, inputs):
-    """Recursive helper. 'seen_names' and 'inputs' are mutated."""
+  inputs = set()
+  for name in node.node_def.input:
     if name.startswith("^"):
       name = name[1:]
     try:
@@ -312,20 +344,9 @@ def _sources_for_node(name, graph):
       try:
         op = graph.get_operation_by_name(name)
       except KeyError:
-        return
-    name = op.name
-    if name in seen_names:
-      return
-    seen_names.add(name)
-    if not op.node_def.input:
-      inputs.add(op)
-      return
-    for n in op.node_def.input:
-      _helper(n, graph, seen_names, inputs)
-
-  names = set()
-  inputs = set()
-  _helper(name, graph, names, inputs)
+        continue
+    inputs.add(op)
+
   return list(inputs)
 
 
@@ -389,7 +410,7 @@ def interpolate(error_message, graph):
     if op is None:
       tagged_ops.append(None)
     else:
-      tagged_ops.append([op] + _sources_for_node(op.name, graph))
+      tagged_ops.append([op] + _sources_for_node(op, graph))
 
   common_prefix = traceback_files_common_prefix(tagged_ops)
   for tag, ops in zip(tags, tagged_ops):
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 922b9e2bd308a03b0b6b28aa741c4e6f54c1b347..c473dfeedf8d232d5b5211fe5982ab4f8ea41fee 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -511,7 +511,11 @@ def exception_type_from_error_code(error_code):
 
 @tf_export("errors.error_code_from_exception_type")
 def error_code_from_exception_type(cls):
-  return _EXCEPTION_CLASS_TO_CODE[cls]
+  try:
+    return _EXCEPTION_CLASS_TO_CODE[cls]
+  except KeyError:
+    warnings.warn("Unknown class exception")
+    return UnknownError(None, None, "Unknown class exception", None)
 
 
 def _make_specific_exception(node_def, op, message, error_code):
diff --git a/tensorflow/python/framework/errors_test.py b/tensorflow/python/framework/errors_test.py
index 574b126caeef87c5e05f4f08a9432b22d2f8040d..c044202d92ad549d48fd0f4d9ace79b4e9a8ef97 100644
--- a/tensorflow/python/framework/errors_test.py
+++ b/tensorflow/python/framework/errors_test.py
@@ -70,6 +70,10 @@ class ErrorsTest(test.TestCase):
           isinstance(
               errors_impl._make_specific_exception(None, None, None,
                                                    error_code), exc_type))
+      # error_code_from_exception_type and exception_type_from_error_code should
+      # be consistent with operation result.
+      self.assertEqual(error_code,
+                       errors_impl.error_code_from_exception_type(exc_type))
       # pylint: enable=protected-access
 
   def testKnownErrorClassForEachErrorCodeInProto(self):
@@ -98,6 +102,14 @@ class ErrorsTest(test.TestCase):
     self.assertTrue("Unknown error code: 37" in str(w[0].message))
     self.assertTrue(isinstance(exc, errors_impl.OpError))
 
+    with warnings.catch_warnings(record=True) as w:
+      # pylint: disable=protected-access
+      exc = errors_impl.error_code_from_exception_type("Unknown")
+      # pylint: enable=protected-access
+    self.assertEqual(1, len(w))
+    self.assertTrue("Unknown class exception" in str(w[0].message))
+    self.assertTrue(isinstance(exc, errors_impl.OpError))
+
   def testStatusDoesNotLeak(self):
     try:
       with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 2e3e15f53a919bac669b56e4a8f27c1808da345a..dc14361d666637f6fc37fb6b39ed2ea313b2286a 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -131,4 +131,4 @@ def AppendBoolArrayToTensorProto(tensor_proto, nparray):
   cdef long i, n
   n = nparray.size
   for i in range(n):
-    tensor_proto.bool_val.append(np.asscalar(nparray[i]))
+    tensor_proto.bool_val.append(nparray.item(i))
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index 36f7aa67c79981d83f7fb95a2f5d94ba7c28d592..aafab297ca196ac04678b834b7069a816ea2113e 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -19,14 +19,16 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as py_collections
+import itertools
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
 from tensorflow.python.ops import array_ops
@@ -63,12 +65,6 @@ class UnknownArgument(object):
   pass
 
 
-# TODO(vbardiovsky): Remove this when nest is updated with new
-# flatten_with_tuple_paths.
-def flatten_with_tuple_paths(structure):
-  return list(zip(nest.yield_flat_paths(structure), nest.flatten(structure)))
-
-
 def convert_structure_to_signature(structure, arg_names=None):
   """Convert a potentially nested structure to a signature.
 
@@ -82,30 +78,48 @@ def convert_structure_to_signature(structure, arg_names=None):
     Identical structure that has TensorSpec objects instead of Tensors and
     UknownArgument instead of any unsupported types.
   """
-
-  def encode_arg(arg, name=None):
+  def encode_arg(arg, path):
     """A representation for this argument, for converting into signatures."""
     if isinstance(arg, ops.Tensor):
+      user_specified_name = None
+      try:
+        user_specified_name = compat.as_str(
+            arg.op.get_attr("_user_specified_name"))
+      except ValueError:
+        pass
+
+      if path and user_specified_name and user_specified_name != path[0]:
+        # The user has explicitly named the argument differently than the name
+        # of the function argument.
+        name = user_specified_name
+      else:
+        name = "/".join([str(p) for p in path])
       return tensor_spec.TensorSpec(arg.shape, arg.dtype, name)
-    if isinstance(arg, (int, float, bool, tensor_spec.TensorSpec)):
+    if isinstance(arg, (
+        int,
+        float,
+        bool,
+        type(None),
+        dtypes.DType,
+        tensor_spec.TensorSpec,
+    )):
       return arg
     return UnknownArgument()
 
   # We are using the flattened paths to name the TensorSpecs. We need an
   # explicit name for them downstream.
-  flattened = flatten_with_tuple_paths(structure)
+  flattened = nest.flatten_with_tuple_paths(structure)
   if arg_names:
     if len(arg_names) != len(structure):
       raise ValueError(
           "Passed in arg_names don't match actual signature (%s)." % arg_names)
     # Replace all top-level names with their actual arg_names. If a path before
     # was "(2,'a',1)", it will become "(arg_names[2],'a',1)".
-    flattened = [((arg_names[0],) + path[1:], arg) for path, arg in flattened]
+    flattened = [
+        ((arg_names[path[0]],) + path[1:], arg) for path, arg in flattened
+    ]
 
-  mapped = [
-      encode_arg(arg, "/".join([str(p) for p in path]))
-      for path, arg in flattened
-  ]
+  mapped = [encode_arg(arg, path) for path, arg in flattened]
   return nest.pack_sequence_as(structure, mapped)
 
 
@@ -120,6 +134,8 @@ class FuncGraph(ops.Graph):
       inputs coming first.
     outputs: Tensors that will be returned by this function. The tensors are in
       this FuncGraph.
+    control_outputs: Operations that must be executed before the function
+      represented by this graph can be said to have been executed.
     structured_input_signature: A tuple of (args, kwargs), which are both
       possibly-nested python objects that were received by this function. Note
       that these structures might contain Python `None`s.
@@ -131,10 +147,14 @@ class FuncGraph(ops.Graph):
       or the global default Graph.
     captures: Maps external tensor -> internal tensor (i.e. input placeholder).
       The entries are in the order they were captured.
+    control_captures: Set of external ops on which this graph has a control
+      dependency.
     seed: The graph-level random seed.
+    capture_by_value: If True, the func graph will capture Variables by value
+      instead of reference.
   """
 
-  def __init__(self, name, collections=None):
+  def __init__(self, name, collections=None, capture_by_value=None):
     """Construct a new FuncGraph.
 
     The graph will inherit its graph key, collections, seed, and distribution
@@ -149,17 +169,30 @@ class FuncGraph(ops.Graph):
         The current whitelisted collections are the global variables, the
         local variables, and the trainable variables.
         Defaults to None.
+      capture_by_value: An optional boolean. If True, the func graph will
+        capture Variables by value instead of reference. By default inherit
+        from outer graphs, and failing that will default to False.
     """
     super(FuncGraph, self).__init__()
 
     self.name = name
     self.inputs = []
     self.outputs = []
+    self.control_outputs = []
+    self.control_captures = set()
     self.structured_input_signature = None
     self.structured_outputs = None
     self._weak_variables = []
     self.outer_graph = ops.get_default_graph()
     self.captures = py_collections.OrderedDict()
+    # Inherit capture-by-value from outer graph.
+    if capture_by_value is not None:
+      self.capture_by_value = capture_by_value
+    elif self.outer_graph is not None and isinstance(
+        self.outer_graph, FuncGraph):
+      self.capture_by_value = self.outer_graph.capture_by_value
+    else:
+      self.capture_by_value = False
 
     self._building_function = True
     # Map from resource tensor name to last op (in program order) which uses
@@ -171,12 +204,13 @@ class FuncGraph(ops.Graph):
 
     if context.executing_eagerly():
       self.seed = context.global_seed()
-      device_type = context.context().device_spec.device_type
-      self._xla_compile = (device_type == "TPU" or device_type == "XLA_GPU"
-                           or device_type == "XLA_CPU")
+      # [for tf-data user migration from TF1.0 to 2.0] seed_used keep track of
+      # any None op_seed for random_op in the function, in which case we end up
+      # using function seed, which could be unintended behavior for the op.
+      self._seed_used = False
     else:
       self.seed = graph.seed
-      self._xla_compile = getattr(graph, "_xla_compile", False)
+      self._seed_used = False
       # TODO(allenl): Figure out if we can remove colocation stack
       # specialization (currently used in cond_v2), here and in the cache key.
       self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
@@ -192,6 +226,50 @@ class FuncGraph(ops.Graph):
     else:
       self._collections = collections
 
+  def __str__(self):
+    return "FuncGraph(name=%s, id=%s)" % (self.name, id(self))
+
+  def control_dependencies(self, control_inputs):
+    """Handles control dependencies.
+
+    FuncGraph wraps Graph's control_dependencies logic by first filtering out
+    any external tensors / operations and storing them in the graph's
+    control_captures member. Any consumers of this function graph must then
+    decide how to handle the control captures.
+
+    Args:
+      control_inputs: A list of `Operation` or `Tensor` objects which
+        must be executed or computed before running the operations
+        defined in the context.  Can also be `None` to clear the control
+        dependencies.
+
+    Returns:
+     A context manager that specifies control dependencies for all
+     operations constructed within the context.
+
+    Raises:
+      TypeError: If `control_inputs` is not a list of `Operation` or
+        `Tensor` objects.
+    """
+    if control_inputs is None:
+      return super(FuncGraph, self).control_dependencies(control_inputs)
+
+    filtered_control_inputs = []
+    for c in control_inputs:
+      # Check for _UnreadVariable
+      if (isinstance(c, ops.IndexedSlices) or
+          (hasattr(c, "_handle") and hasattr(c, "op"))):
+        c = c.op
+      graph_element = ops._as_graph_element(c)  # pylint: disable=protected-access
+      if graph_element is None:
+        graph_element = c
+      if graph_element is not None and getattr(
+          graph_element, "graph", None) is not self:
+        self.control_captures.add(graph_element)
+      else:
+        filtered_control_inputs.append(graph_element)
+    return super(FuncGraph, self).control_dependencies(filtered_control_inputs)
+
   def as_default(self):
     outer_cm = super(FuncGraph, self).as_default()
 
@@ -215,11 +293,10 @@ class FuncGraph(ops.Graph):
       # restored.
       old_device_stack = self._device_function_stack
       if context.executing_eagerly():
-        if self._distribution_strategy_stack or self._xla_compile:
+        if self._distribution_strategy_stack:
           self._add_device_to_stack(context.context().device_name)
       else:
         if (self._distribution_strategy_stack
-            or self._xla_compile
             or device_stack_has_callable(graph._device_function_stack)):
           # Hard-code devices from device functions in the function body
           self._device_function_stack = graph._device_function_stack.copy()
@@ -275,11 +352,39 @@ class FuncGraph(ops.Graph):
   def variables(self, var_list):
     self._weak_variables = [weakref.ref(v) for v in var_list]
 
+  def _capture_by_value(
+      self,
+      op_type,
+      inputs,
+      dtypes,  # pylint: disable=redefined-outer-name
+      input_types=None,
+      name=None,
+      attrs=None,
+      op_def=None,
+      compute_shapes=True,
+      compute_device=True):
+    # When capturing by value, do the read outside
+    reverse_captures = dict((v, k) for k, v in self.captures.items())
+    uncaptured_inputs = [reverse_captures.get(t, t) for t in inputs]
+    with ops.init_scope():
+      if context.executing_eagerly():
+        attr_list = ("dtype", int(attrs["dtype"].type))
+        value, = execute.execute(
+            compat.as_bytes(op_type), 1, uncaptured_inputs, attr_list,
+            context.context())
+      else:
+        op = ops.get_default_graph().create_op(
+            op_type, uncaptured_inputs, dtypes, input_types, name, attrs,
+            op_def, compute_shapes, compute_device)
+        value = op.outputs[0]
+    captured_value = self.capture(value)
+    return captured_value.op
+
   def create_op(
       self,
       op_type,
       inputs,
-      dtypes,
+      dtypes=None,  # pylint: disable=redefined-outer-name
       input_types=None,
       name=None,
       attrs=None,
@@ -296,8 +401,8 @@ class FuncGraph(ops.Graph):
       op_type: The `Operation` type to create. This corresponds to the
         `OpDef.name` field for the proto that defines the operation.
       inputs: A list of `Tensor` objects that will be inputs to the `Operation`.
-      dtypes: A list of `DType` objects that will be the types of the tensors
-        that the operation produces.
+      dtypes: (Optional) A list of `DType` objects that will be the types of the
+        tensors that the operation produces.
       input_types: (Optional.) A list of `DType`s that will be the types of
         the tensors that the operation consumes. By default, uses the base
         `DType` of each input in `inputs`. Operations that expect
@@ -318,6 +423,12 @@ class FuncGraph(ops.Graph):
     Returns:
       An `Operation` object.
     """
+    if self.capture_by_value and op_type in ["ReadVariableOp",
+                                             "ResourceGather"]:
+      return self._capture_by_value(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+
     # This capturing logic interacts poorly with control flow contexts which
     # want to replace inputs of ops far too late in the process. This can lead
     # the context to get confused and try to create an Enter for an Enter. We
@@ -363,6 +474,19 @@ class FuncGraph(ops.Graph):
     if tensor.graph is not self:
       if name is None:
         name = tensor.op.name
+      inner_graph = tensor.graph
+      while inner_graph is not None and isinstance(inner_graph, FuncGraph):
+        if inner_graph is self:
+          raise ValueError(
+              "Trying to capture a tensor from an inner function. This can be "
+              "caused by accessing a tensor defined inside a loop or "
+              "conditional body, or a subfunction, from a calling function, "
+              "without going through the proper return value mechanism. "
+              "Consider using TensorFlow mechanisms such as TensorArrays "
+              "to return tensors from inner functions or loop / conditional "
+              "bodies. Tensor: %s; tensor graph: %s; this graph: %s"
+              % (tensor, tensor.graph, self))
+        inner_graph = inner_graph.outer_graph
       return self._capture_helper(tensor, name)
     return tensor
 
@@ -395,10 +519,13 @@ def func_graph_from_py_func(name,
                             signature=None,
                             func_graph=None,
                             autograph=False,
+                            autograph_options=None,
                             add_control_dependencies=True,
                             arg_names=None,
                             op_return_value=None,
-                            collections=None):
+                            collections=None,
+                            capture_by_value=None,
+                            override_flat_arg_shapes=None):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -417,6 +544,8 @@ def func_graph_from_py_func(name,
       this graph else a new one is built and returned.
     autograph: whether to use autograph to compile `python_func`.
       See https://www.tensorflow.org/guide/autograph for more information.
+    autograph_options: additional knobs to control when `autograph=True`.
+      See https://www.tensorflow.org/guide/autograph for more information.
     add_control_dependencies: If True, automatically adds control dependencies
       to ensure program order matches execution order and stateful ops always
       execute.
@@ -432,6 +561,15 @@ def func_graph_from_py_func(name,
       The current whitelisted collections are the global variables, the
       local variables, and the trainable variables.
       Defaults to None.
+    capture_by_value: An optional boolean. If True, the func graph will capture
+      Variables by value instead of reference. By default inherit from outer
+      graphs, and failing that will default to False.
+    override_flat_arg_shapes: An optional list of instances that are either
+      `None` or `TensorShape`.  The length must match that of
+      `nest.flatten((args, kwargs))`.  The entries containing value `None`
+      must match entries in flattened arguments containing non-tensors, while
+      entries containing a `TensorShape` must match entries in the flattened
+      arguments containing tensors.
 
   Returns:
     A FuncGraph.
@@ -439,28 +577,45 @@ def func_graph_from_py_func(name,
   Raises:
     TypeError: If any of `python_func`'s return values is neither `None` nor a
       `Tensor`.
+    ValueError: If both `signature` and `override_flat_arg_shapes` are
+      passed in.
   """
   if op_return_value is not None:
     assert isinstance(op_return_value, ops.Tensor), op_return_value
   if func_graph is None:
-    func_graph = FuncGraph(name, collections=collections)
+    func_graph = FuncGraph(name, collections=collections,
+                           capture_by_value=capture_by_value)
   assert isinstance(func_graph, FuncGraph)
   if add_control_dependencies:
-    control_manager = AutomaticControlDependencies
+    control_manager = AutomaticControlDependencies()
   else:
-    control_manager = ops.NullContextmanager
-  with func_graph.as_default(), control_manager() as a:
+    control_manager = ops.NullContextmanager()
+  with func_graph.as_default(), control_manager as a:
     current_scope = variable_scope.get_variable_scope()
     default_use_recource = current_scope.use_resource
     current_scope.set_use_resource(True)
 
+    if signature is not None and override_flat_arg_shapes is not None:
+      raise ValueError(
+          "Passed both signature and override_flat_arg_shapes: %s and %s."
+          % (signature, override_flat_arg_shapes))
+
     if signature is not None:
       args = signature
       kwargs = {}
 
     # Creates and names placeholders for all arguments.
-    func_args = _get_defun_inputs_from_args(args, arg_names)
-    func_kwargs = _get_defun_inputs_from_kwargs(kwargs)
+    if override_flat_arg_shapes is not None:
+      flat_args = nest.flatten(args)
+      arg_shapes = override_flat_arg_shapes[:len(flat_args)]
+      kwarg_shapes = override_flat_arg_shapes[len(flat_args):]
+    else:
+      arg_shapes = None
+      kwarg_shapes = None
+    func_args = _get_defun_inputs_from_args(
+        args, arg_names, flat_shapes=arg_shapes)
+    func_kwargs = _get_defun_inputs_from_kwargs(
+        kwargs, flat_shapes=kwarg_shapes)
 
     # Convert all Tensors into TensorSpecs before saving the structured inputs.
     # If storing pure concrete functions that are not called through polymorphic
@@ -470,12 +625,19 @@ def func_graph_from_py_func(name,
         convert_structure_to_signature(func_args, arg_names),
         convert_structure_to_signature(func_kwargs))
 
+    flat_func_args = nest.flatten(func_args)
+    flat_func_kwargs = nest.flatten(func_kwargs)
+    # Temporarily set inputs to allow graph building code to inspect
+    # them. Reassigned below.
+    func_graph.inputs = [arg for arg in flat_func_args + flat_func_kwargs
+                         if isinstance(arg, ops.Tensor)]
+
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
-    func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
+    func_args_before = nest.pack_sequence_as(func_args, flat_func_args)
     func_kwargs_before = nest.pack_sequence_as(
-        func_kwargs, nest.flatten(func_kwargs))
+        func_kwargs, flat_func_kwargs)
 
     def convert(x):
       """Converts a function output to a Tensor."""
@@ -489,7 +651,7 @@ def func_graph_from_py_func(name,
           x = array_ops.identity(op_return_value)
       elif not isinstance(x, tensor_array_ops.TensorArray):
         try:
-          x = ops.convert_to_tensor_or_indexed_slices(x)
+          x = ops.convert_to_tensor_or_composite(x)
         except (ValueError, TypeError):
           raise TypeError(
               "To be compatible with tf.contrib.eager.defun, Python functions "
@@ -516,17 +678,16 @@ def func_graph_from_py_func(name,
           return autograph.converted_call(
               original_func, None,
               autograph.ConversionOptions(
-                  verbose=autograph.Verbosity.BRIEF,
                   recursive=True,
-                  strip_decorators=(def_function.function,),
-                  optional_features=(),
+                  optional_features=autograph_options,
                   force_conversion=True,
-              ), *args, **kwargs)
+              ), args, kwargs)
 
         # Wrapping around a decorator allows checks like tf_inspect.getargspec
         # to be accurate.
         converted_func = tf_decorator.make_decorator(original_func, wrapper)
-        tf_decorator.rewrap(python_func, original_func, converted_func)
+        python_func = tf_decorator.rewrap(python_func, original_func,
+                                          converted_func)
 
       func_outputs = python_func(*func_args, **func_kwargs)
 
@@ -550,7 +711,9 @@ def func_graph_from_py_func(name,
         # Even if an argument variable was not used in the function, we've
         # already manually captured the resource Tensor when creating argument
         # placeholders.
-        resource_placeholder = func_graph.captures.pop(arg.handle)
+        resource_placeholder = func_graph.captures.pop(arg.handle, None)
+        if resource_placeholder is None:
+          continue
         arg_variables.add(arg)
         inputs.append(resource_placeholder)
       elif isinstance(arg, ops.Tensor):
@@ -567,7 +730,10 @@ def func_graph_from_py_func(name,
 
     func_graph.variables = variables
 
-  # Register any other functions defined in the graph.
+  if add_control_dependencies:
+    func_graph.control_outputs.extend(control_manager.ops_which_must_run)
+
+# Register any other functions defined in the graph.
   with ops.init_scope():
     if context.executing_eagerly():
       for f in func_graph._functions.values():  # pylint: disable=protected-access
@@ -623,36 +789,25 @@ def flatten(sequence):
   Flattens non-tensor objects into their constituent tensors.
 
   Args:
-    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+    sequence: A nested structure of Tensors, CompositeTensors, and
       TensorArrays.
 
   Returns:
     A list of tensors.
   """
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
-  flat_sequence = nest.flatten(sequence)
-  outputs = []
-  for item in flat_sequence:
-    if isinstance(item, ops.IndexedSlices):
-      if item.dense_shape is not None:
-        outputs.extend([item.values, item.indices, item.dense_shape])
-      else:
-        outputs.extend([item.values, item.indices])
-    elif isinstance(item, sparse_tensor.SparseTensor):
-      outputs.extend([item.indices, item.values, item.dense_shape])
-    elif isinstance(item, tensor_array_ops.TensorArray):
-      outputs.append(item.flow)
-    else:
-      outputs.append(item)
-  return outputs
+  flat_sequence = nest.flatten(sequence, expand_composites=True)
+  return [
+      item.flow if isinstance(item, tensor_array_ops.TensorArray) else item
+      for item in flat_sequence]
 
 
 def pack_sequence_as(structure, flat_sequence):
   """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
 
   Args:
-    structure: The structure to pack into. May contain Tensors, IndexedSlices,
-      TensorArrays or SparseTensors.
+    structure: The structure to pack into. May contain Tensors,
+      CompositeTensors, or TensorArrays.
     flat_sequence: An iterable containing tensors.
 
   Returns:
@@ -661,33 +816,16 @@ def pack_sequence_as(structure, flat_sequence):
   Raises:
     AssertionError if `structure` and `flat_sequence` are not compatible.
   """
-  flattened_structure = nest.flatten(structure)
-  flat_sequence_with_slices_and_tas = []
-  index = 0
-  for t in flattened_structure:
-    if isinstance(t, ops.IndexedSlices):
-      if t.dense_shape is not None:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 3]))
-        index += 3
-      else:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 2]))
-        index += 2
-    elif isinstance(t, sparse_tensor.SparseTensor):
-      flat_sequence_with_slices_and_tas.append(
-          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
-      index += 3
-    elif isinstance(t, tensor_array_ops.TensorArray):
-      flow = flat_sequence[index]
-      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
-      flat_sequence_with_slices_and_tas.append(ta)
-      index += 1
-    else:
-      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
-      index += 1
-  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
-  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+  flat_sequence = list(flat_sequence)
+  flattened_structure = nest.flatten(structure, expand_composites=True)
+  if len(flattened_structure) != len(flat_sequence):
+    raise ValueError("Mismatch in element count")
+  for i in range(len(flat_sequence)):
+    if isinstance(flattened_structure[i], tensor_array_ops.TensorArray):
+      flat_sequence[i] = tensor_array_ops.build_ta_with_new_flow(
+          old_ta=flattened_structure[i], flow=flat_sequence[i])
+  return nest.pack_sequence_as(structure, flat_sequence, expand_composites=True)
+
 
 
 def _create_substitute_placeholder(value, name=None, dtype=None):
@@ -701,37 +839,82 @@ def _create_substitute_placeholder(value, name=None, dtype=None):
   return placeholder
 
 
-def _get_defun_inputs_from_args(args, names):
+def _get_defun_inputs_from_args(args, names, flat_shapes=None):
   """Maps Python function positional args to graph-construction inputs."""
-  return _get_defun_inputs(args, names, structure=args)
+  return _get_defun_inputs(
+      args, names, structure=args, flat_shapes=flat_shapes)
 
 
-def _get_defun_inputs(flat_args, names, structure):
+def _get_defun_inputs(args, names, structure, flat_shapes=None):
   """Maps python function args to graph-construction inputs.
 
   Args:
-    flat_args: A flat list of user-specified arguments.
+    args: A flat list of user-specified arguments.
     names: A list of strings with user-specified argument names, same length as
-      `flat_args`. May be `None`, in which case a generic name is used.
+      `args`. May be `None`, in which case a generic name is used.
     structure: The original argument list or dictionary.
+    flat_shapes: A flat list of values that are either `None` or
+      instances of `TensorShape`.  If provided, then length must match
+      that of `nest.flatten(args)`; and locations where `args` are
+      instances of `Tensor` must have a corresponding `TensorShape` in
+      `flat_shapes`.  May be `None`, in which case exact shapes are read
+      directly from the args.
 
   Returns:
     Placeholders with the same structure as `structure`.
+
+  Raises:
+    RuntimeError: if `flat_shapes` is provided, but
+     `len(flat_shapes) != len(nest.flatten(args))`.
+    RuntimeError: if a shape from `flat_shapes` is not None
+     for an argument that is not a `Tensor`, `TensorSpec`,
+     or `ResourceVariable`.
   """
   func_graph = ops.get_default_graph()
   function_inputs = []
   if names is None:
-    names = [None] * len(flat_args)
-  for arg_value, name in zip(flat_args, names):
-    for arg in nest.flatten(arg_value):
+    names = [None] * len(args)
+  if flat_shapes is None:
+    shapes_iter = itertools.repeat(None)
+  else:
+    len_flat_args = len(nest.flatten(args))
+    if len_flat_args != len(flat_shapes):
+      raise RuntimeError(
+          "Length of fully flat shapes (%d) must match that of "
+          "flatten(args) (%d).  args: %s, flat_shapes: %s"
+          % (len(flat_shapes),
+             len_flat_args,
+             args,
+             flat_shapes))
+    shapes_iter = iter(flat_shapes)
+  for arg_value, name in zip(args, names):
+    flattened = nest.flatten(arg_value)
+    tensor_specs = [
+        arg for arg in flattened if isinstance(arg, tensor_spec.TensorSpec)
+    ]
+    specified_names = [arg.name for arg in tensor_specs if arg.name]
+    if specified_names and len(specified_names) < len(tensor_specs):
+      raise ValueError("If specifying TensorSpec names for nested structures, "
+                       "either zero or all names have to be specified.")
+
+    for arg in flattened:
+      # We have a shape entry for each arg, regadless of whether it's a real
+      # Tensor or not.  For non-tensor entries it should be None.
+      shape = next(shapes_iter)
       if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec)):
         if isinstance(arg, tensor_spec.TensorSpec) and arg.name:
           requested_name = arg.name
         else:
           requested_name = name
-        placeholder = graph_placeholder(
-            arg.dtype, arg.shape,
-            name=requested_name)
+        placeholder_shape = shape if shape is not None else arg.shape
+        try:
+          placeholder = graph_placeholder(
+              arg.dtype, placeholder_shape,
+              name=requested_name)
+        except ValueError:
+          # Sometimes parameter names are not valid op names, so fall back to
+          # unnamed placeholders.
+          placeholder = graph_placeholder(arg.dtype, placeholder_shape)
         if name is not None:
           # Record the requested/user-specified name in case it's different than
           # the uniquified name, for validation when exporting signatures.
@@ -750,18 +933,24 @@ def _get_defun_inputs(flat_args, names, structure):
             attr_value_pb2.AttrValue(s=compat.as_bytes(name)))
         function_inputs.append(arg)
       else:
+        if shape is not None:
+          raise RuntimeError(
+              "Expected provided shape override to be None for arg that isn't "
+              "a Tensor, but saw arg: '%s', shape: '%s'.  args: %s"
+              % (arg, shape, args))
         function_inputs.append(arg)
   return nest.pack_sequence_as(structure, function_inputs)
 
 
-def _get_defun_inputs_from_kwargs(kwargs):
+def _get_defun_inputs_from_kwargs(kwargs, flat_shapes):
   """Maps Python function keyword args to graph-construction inputs."""
   if kwargs:
-    names, flat_args = zip(*sorted(kwargs.items()))
+    names, args = zip(*sorted(kwargs.items()))
   else:
     names = []
-    flat_args = []
-  return _get_defun_inputs(flat_args, names, structure=kwargs)
+    args = []
+  return _get_defun_inputs(
+      args, names, structure=kwargs, flat_shapes=flat_shapes)
 
 
 def dismantle_func_graph(func_graph):
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index afc11b17bfd1447e502906bb973eb5746dfe0274..3a00444b74623c0788515f311b6f5de915fa2259 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,8 @@
 # =============================================================================
 """Python front-end supports for functions.
 
-NOTE: functions are currently experimental and subject to change!
+NOTE: At this time, functions are experimental and subject to change!. Proceed
+with caution.
 """
 
 from __future__ import absolute_import
@@ -59,8 +60,8 @@ class Defun(object):
       def foo(x, y):
         ...
 
-  When you call the decorated function it will add `call` ops to the
-  default graph and adds the definition of the function into the
+  When you call the decorated function, it adds the `call` ops to the
+  default graph. In addition, it adds the definition of the function into the
   default graph. Because the addition of the function into the graph
   is deferred, the decorator can be used anywhere in the program.
 
@@ -129,13 +130,15 @@ class Defun(object):
   def __call__(self, func):
     # Various sanity checks on the callable func.
     if not callable(func):
-      raise ValueError("func %s must be callable" % func)
+      raise ValueError("function %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
     argspec = tf_inspect.getargspec(func)
     if argspec.keywords or argspec.defaults:
-      raise ValueError("Functions with argument defaults or keywords "
-                       "arguments are not supported.")
+      raise ValueError(
+          "function with argument defaults or keywords arguments are not"
+          " supported. {} has defaults {} and keywords {}.".format(
+              func, argspec.defaults, argspec.keywords))
 
     # Computes how many arguments 'func' has.
     min_args = len(argspec.args)
@@ -210,6 +213,7 @@ class _DefinedFunction(object):
                shape_func=None,
                capture_by_value=False,
                whitelisted_stateful_ops=None,
+               capture_resource_var_by_value=True,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -232,6 +236,8 @@ class _DefinedFunction(object):
         will be copied into the function body.
       whitelisted_stateful_ops: A set of ops that if stateful we ignore and
         copy into the function body, when `capture_by_value` is True.
+      capture_resource_var_by_value: Boolean (defaults to True). If False,
+        captured resource variable returns the handle instead of value.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -250,6 +256,7 @@ class _DefinedFunction(object):
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
     if self._whitelisted_stateful_ops is None:
       self._whitelisted_stateful_ops = set()
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -306,7 +313,7 @@ class _DefinedFunction(object):
 
   @property
   def grad_func_name(self):
-    """Its gradient function's name."""
+    """Returns the name of the gradient function."""
     return self._grad_func.name if self._grad_func else None
 
   @property
@@ -352,7 +359,8 @@ class _DefinedFunction(object):
         self._func_name,
         self._capture_by_value,
         self._caller_device,
-        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops,
+        capture_resource_var_by_value=self._capture_resource_var_by_value)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -407,6 +415,8 @@ class _DefinedFunction(object):
           [t._as_tf_output() for t in temp_graph.inputs],
           [t._as_tf_output() for t in temp_graph.outputs],
           output_names,
+          [], # control_outputs
+          [], # control_output_names
           None,  # opts
           description)
       self._c_func = c_api_util.ScopedTFFunction(c_func)
@@ -636,11 +646,12 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
-               **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops,
+               capture_resource_var_by_value, *args, **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -735,7 +746,8 @@ class _FuncGraph(ops.Graph):
           collections=collections,
           use_resource=use_resource)
       self.extra_vars.append(var)
-      if isinstance(var, resource_variable_ops.ResourceVariable):
+      if (isinstance(var, resource_variable_ops.ResourceVariable) and
+          self._capture_resource_var_by_value):
         # For resource-based variables read the variable outside the function
         # and pass in the value. This ensures that the function is pure and
         # differentiable. TODO(apassos) this may have performance problems if
@@ -743,12 +755,12 @@ class _FuncGraph(ops.Graph):
         return var.value()
       return var
 
-  def create_op(self, op_type, inputs, data_types, **kwargs):
+  def create_op(self, op_type, inputs, dtypes=None, **kwargs):  # pylint: disable=redefined-outer-name
     for i, x in enumerate(inputs):
       if isinstance(x, ops.EagerTensor) or x.graph is not self:
         inputs[i] = self.capture(x)
-    return super(_FuncGraph, self).create_op(op_type, inputs, data_types,
-                                             **kwargs)
+    return super(_FuncGraph, self).create_op(op_type, inputs,
+                                             dtypes=dtypes, **kwargs)
 
   def capture(self, tensor, name=None):
     """Adds the given tensor to this graph and returns the captured tensor."""
@@ -830,7 +842,8 @@ def func_graph_from_py_func(func,
                             container=None,
                             collections_ref=None,
                             arg_shapes=None,
-                            whitelisted_stateful_ops=None):
+                            whitelisted_stateful_ops=None,
+                            capture_resource_var_by_value=True):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -850,6 +863,8 @@ def func_graph_from_py_func(func,
     arg_shapes: A sequence of the function's argument shapes.
     whitelisted_stateful_ops: A set of ops that if stateful we ignore and
       re-create.
+    capture_resource_var_by_value: Boolean (defaults to True). If False,
+      captured resource variable returns the handle instead of value.
 
   Returns:
     A _FuncGraph.
@@ -859,7 +874,8 @@ def func_graph_from_py_func(func,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops,
+                          capture_resource_var_by_value)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 10ad7ad541f2f0eb15776deb0c3225421bf47a17..79cc72918ee9b8a6d2b511a2e1ac06412346a817 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
@@ -72,10 +73,22 @@ def function_def_to_graph(fdef, input_shapes=None):
     func_graph.outputs = [
         func_graph.get_tensor_by_name(name) for name in output_tensor_names
     ]
+    func_graph.control_outputs = [
+        func_graph.get_operation_by_name(fdef.control_ret[ret_name])
+        for ret_name in fdef.signature.control_output
+    ]
 
   return func_graph
 
 
+def _is_function(fname):
+  """Checks for a function definition with `fname` in the current context."""
+  if context.executing_eagerly():
+    return context.context().has_function(fname)
+  else:
+    return ops.get_default_graph()._is_function(fname)  # pylint: disable=protected-access
+
+
 def function_def_to_graph_def(fdef, input_shapes=None):
   """Convert a FunctionDef to a GraphDef.
 
@@ -147,12 +160,12 @@ def function_def_to_graph_def(fdef, input_shapes=None):
     for attr in op_def.attr:
       if attr.type == "func":
         fname = node_def.attr[attr.name].func.name
-        if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+        if not _is_function(fname):
           raise ValueError("%s function not found." % fname)
       elif attr.type == "list(func)":
         for fn in node_def.attr[attr.name].list.func:
           fname = fn.name
-          if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+          if not _is_function(fname):
             raise ValueError("%s function not found." % fname)
 
     # Iterate over output_args in op_def to build the map.
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 3d5a5fe79758d43e54a7acaa689bd7d7fe902c56..cd623223e32c29c48b4b338bf508a9cabd02c643 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -284,7 +284,7 @@ class FunctionTest(test.TestCase):
         out, = sess.run(dlogits, {logits: x, labels: y})
       self.assertAllClose(out, np.exp(prob - y))
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/124286351")  # No error is raised
   def testCustomGradientError(self):
     dtype = dtypes.float32
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 1b61ac925ce3d555525c9086172d43c75a3af10c..62964384c719180f530d7cd8c79e7a53055559db 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -53,7 +53,7 @@ def _is_variable_op(op):
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.must_run_on_cpu")
+    instructions="Use `tf.compat.v1.graph_util.must_run_on_cpu`")
 @tf_export(v1=["graph_util.must_run_on_cpu"])
 def must_run_on_cpu(node, pin_variables_on_cpu=False):
   """Returns True if the given node_def must run on CPU, otherwise False.
@@ -143,19 +143,20 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   # Breadth first search to find all the nodes that we should keep.
   next_to_visit = target_nodes[:]
   while next_to_visit:
-    n = next_to_visit[0]
+    node = next_to_visit[0]
     del next_to_visit[0]
-    if n in nodes_to_keep:
+    if node in nodes_to_keep:
       # Already visited this node.
       continue
-    nodes_to_keep.add(n)
-    next_to_visit += name_to_input_name[n]
+    nodes_to_keep.add(node)
+    if node in name_to_input_name:
+      next_to_visit += name_to_input_name[node]
   return nodes_to_keep
 
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.extract_sub_graph")
+    instructions="Use `tf.compat.v1.graph_util.extract_sub_graph`")
 @tf_export(v1=["graph_util.extract_sub_graph"])
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
@@ -196,7 +197,8 @@ def extract_sub_graph(graph_def, dest_nodes):
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+    instructions="Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`"
+)
 @tf_export(v1=["graph_util.tensor_shape_from_node_def_name"])
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
@@ -214,7 +216,7 @@ def tensor_shape_from_node_def_name(graph, input_name):
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.convert_variables_to_constants")
+    instructions="Use `tf.compat.v1.graph_util.convert_variables_to_constants`")
 @tf_export(v1=["graph_util.convert_variables_to_constants"])
 def convert_variables_to_constants(sess,
                                    input_graph_def,
@@ -304,7 +306,7 @@ def convert_variables_to_constants(sess,
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+    instructions="Use `tf.compat.v1.graph_util.remove_training_nodes`")
 @tf_export(v1=["graph_util.remove_training_nodes"])
 def remove_training_nodes(input_graph, protected_nodes=None):
   """Prunes out nodes that aren't needed for inference.
@@ -352,19 +354,27 @@ def remove_training_nodes(input_graph, protected_nodes=None):
     nodes_after_removal.append(new_node)
 
   types_to_splice = {"Identity": True}
+  control_input_names = set()
+  node_names_with_control_input = set()
+  for node in nodes_after_removal:
+    for node_input in node.input:
+      if "^" in node_input:
+        control_input_names.add(node_input.replace("^", ""))
+        node_names_with_control_input.add(node.name)
+
   names_to_splice = {}
   for node in nodes_after_removal:
     if node.op in types_to_splice and node.name not in protected_nodes:
       # We don't want to remove nodes that have control edge inputs, because
       # they might be involved in subtle dependency issues that removing them
       # will jeopardize.
-      has_control_edge = False
-      for input_name in node.input:
-        if re.match(r"^\^", input_name):
-          has_control_edge = True
-      if not has_control_edge:
+      if node.name not in node_names_with_control_input:
         names_to_splice[node.name] = node.input[0]
 
+  # We also don't want to remove nodes which are used as control edge inputs.
+  names_to_splice = {name: value for name, value in names_to_splice.items()
+                     if name not in control_input_names}
+
   nodes_after_splicing = []
   for node in nodes_after_removal:
     if node.name in names_to_splice:
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index dd26b8a78e9d2e13b34770775fcb1219745396e0..78777dc87724ab202e267a3aab4666c81465de59 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -308,8 +308,9 @@ class DeviceFunctionsTest(test.TestCase):
       new_node.input.extend([input_name])
     return new_node
 
-  def create_constant_node_def(self, name, value, dtype, shape=None):
-    node = self.create_node_def("Const", name, [])
+  def create_constant_node_def(self, name, value, dtype,
+                               shape=None, inputs=None):
+    node = self.create_node_def("Const", name, inputs or [])
     self.set_attr_dtype(node, "dtype", dtype)
     self.set_attr_tensor(node, "value", value, dtype, shape)
     return node
@@ -393,6 +394,18 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertProtoEquals(expected_graph_def,
                            graph_util.remove_training_nodes(graph_def))
 
+  def testRemoveIdentityUsedAsControlInputInConst(self):
+    """Check that Identity nodes used as control inputs are not removed."""
+    graph_def = graph_pb2.GraphDef()
+    graph_def.node.extend([
+        self.create_constant_node_def("C", 1, dtypes.float32, inputs=["^I"]),
+        self.create_node_def("Identity", "I", ["Base"]),
+        self.create_node_def("BaseOp", "Base", [])
+    ])
+
+    self.assertProtoEquals(graph_def,
+                           graph_util.remove_training_nodes(graph_def))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 727f6aa44c2ed11414e805eb635a9adbc5519da6..a31146008ccca1ff5df540d90a621f48485b89ac 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -84,7 +84,8 @@ def load_op_library(library_filename):
   return module
 
 
-@deprecation.deprecated(date=None, instructions='Use tf.load_library instead.')
+@deprecation.deprecated(date=None,
+                        instructions='Use `tf.load_library` instead.')
 @tf_export(v1=['load_file_system_library'])
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index ddf6f66e8ab5e17aa611cce40b01953fb7a5d3b1..fc566ce0b24fa52c712fe5f64357b066e5e41a08 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -29,10 +29,12 @@ from google.protobuf import text_format
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import op_def_pb2
+from tensorflow.core.protobuf import graph_debug_info_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
+from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import op_def_registry
@@ -509,6 +511,53 @@ def strip_graph_default_valued_attrs(meta_graph_def):
   meta_graph_def.meta_info_def.stripped_default_attrs = True
 
 
+def create_graph_debug_info_def(operations):
+  """Construct and returns a `GraphDebugInfo` protocol buffer.
+
+  Args:
+    operations: An iterable of op.Operation objects having _traceback members.
+
+  Returns:
+    GraphDebugInfo protocol buffer.
+
+  Raises:
+    TypeError: If the arguments are not of the correct proto buffer type.
+  """
+  # Creates an empty GraphDebugInfoDef proto.
+  graph_debug_info_def = graph_debug_info_pb2.GraphDebugInfo()
+
+  # Gets the file names and line numbers for the exported node names. Also
+  # collects the unique file names.
+  all_file_names = set()
+  node_to_trace = {}
+  for op in operations:
+    # Gets the stack trace of the operation and then the file location.
+    node_name = op.name
+    node_to_trace[node_name] = error_interpolation.compute_useful_stack(op)
+    for trace in node_to_trace[node_name]:
+      all_file_names.add(trace[0])
+
+  # Sets the `files` field in the GraphDebugInfo proto
+  graph_debug_info_def.files.extend(all_file_names)
+
+  # Builds a mapping between file names and index of the `files` field, so we
+  # only store the indexes for the nodes in the GraphDebugInfo.
+  file_to_index = dict(
+      [(y, x) for x, y in enumerate(graph_debug_info_def.files)])
+
+  # Creates the FileLineCol proto for each node and sets the value in the
+  # GraphDebugInfo proto. We only store the file name index for each node to
+  # save the storage space.
+  for node_name, trace in node_to_trace.items():
+    trace_def = graph_debug_info_def.traces[node_name]
+    for file_name, line, func, code in trace:
+      file_index = file_to_index[file_name]
+      trace_def.file_line_cols.add(
+          file_index=file_index, line=line, func=func, code=code)
+
+  return graph_debug_info_def
+
+
 def create_meta_graph_def(meta_info_def=None,
                           graph_def=None,
                           saver_def=None,
@@ -881,6 +930,7 @@ def export_scoped_meta_graph(filename=None,
                              saver_def=None,
                              clear_extraneous_savers=False,
                              strip_default_attrs=False,
+                             save_debug_info=False,
                              **kwargs):
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
 
@@ -910,7 +960,10 @@ def export_scoped_meta_graph(filename=None,
         graph (both Save/Restore ops and SaverDefs) that are not associated
         with the provided SaverDef.
     strip_default_attrs: Set to true if default valued attributes must be
-        removed while exporting the GraphDef.
+      removed while exporting the GraphDef.
+    save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+      which in the same directory of filename and with `_debug` added before the
+      file extension.
     **kwargs: Optional keyed arguments, including meta_info_def and
         collection_list.
 
@@ -920,8 +973,11 @@ def export_scoped_meta_graph(filename=None,
 
   Raises:
     ValueError: When the `GraphDef` is larger than 2GB.
+    ValueError: When executing in Eager mode and either `graph_def` or `graph`
+      is undefined.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "Eager Execution is enabled.")
   graph = graph or ops.get_default_graph()
@@ -1005,6 +1061,24 @@ def export_scoped_meta_graph(filename=None,
         os.path.dirname(filename),
         os.path.basename(filename),
         as_text=as_text)
+    if save_debug_info:
+      name, _ = os.path.splitext(filename)
+      debug_filename = "{name}{ext}".format(name=name, ext=".debug")
+
+      # Gets the operation from the graph by the name. Exludes variable nodes,
+      # so only the nodes in the frozen models are included.
+      ops_to_export = []
+      for node in scoped_meta_graph_def.graph_def.node:
+        scoped_op_name = ops.prepend_name_scope(node.name, export_scope)
+        ops_to_export.append(graph.get_operation_by_name(scoped_op_name))
+
+      graph_debug_info = create_graph_debug_info_def(ops_to_export)
+
+      graph_io.write_graph(
+          graph_debug_info,
+          os.path.dirname(debug_filename),
+          os.path.basename(debug_filename),
+          as_text=as_text)
 
   return scoped_meta_graph_def, var_list
 
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index e6e87881649729ca65db8cba9914e29b5a0d064e..3a0f338e23a414862eda0ec0836ee6e4e18dfb32 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -707,6 +707,26 @@ class ScopedMetaGraphTest(test.TestCase):
     test_util.assert_meta_graph_protos_equal(self, orig_meta_graph,
                                              new_meta_graph)
 
+  def testExportDebugInfo(self):
+    graph1 = ops.Graph()
+    with graph1.as_default():
+      with ops.name_scope("hidden1/hidden2/hidden3"):
+        images = constant_op.constant(
+            1.0, dtypes.float32, shape=[3, 2], name="images")
+        weights1 = variables.Variable([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
+                                      name="weights")
+        biases1 = resource_variable_ops.ResourceVariable(
+            [0.1] * 3, name="biases")
+        nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
+    debug_info_def = meta_graph.create_graph_debug_info_def(
+        operations=graph1.get_operations())
+
+    # The unique file names in all the stack traces should be larger or equal
+    # than 1.
+    self.assertTrue(len(debug_info_def.files) >= 1)
+    # All the nodes from the exported graphdef are included.
+    self.assertEqual(len(debug_info_def.traces), len(graph1.get_operations()))
+
   # Verifies that we can export a subgraph in a nested name scope containing a
   # "hidden1/hidden2" and import it into "new_hidden1/new_hidden2" in a new
   # graph.
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 2318b32ef10d67c48950061d2c489f6c7dfb20a0..1a4243b5f5adfcfa67819fe6ce697a4143cbf51e 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -212,6 +212,22 @@ def _MakeTensor(v, arg_name):
       (repr(v), arg_name))
 
 
+def _MakeFunc(v, arg_name):
+  """Ensure v is a func."""
+  if isinstance(v, attr_value_pb2.NameAttrList):
+    return v
+  fn_attr = attr_value_pb2.NameAttrList()
+  if isinstance(v, compat.bytes_or_text_types):
+    fn_attr.name = v
+  elif hasattr(v, "add_to_graph"):
+    v.add_to_graph(ops.get_default_graph())
+    fn_attr.name = v.name
+  else:
+    raise TypeError("Don't know how to convert {} to a func for "
+                    "argument {}".format(v, arg_name))
+  return fn_attr
+
+
 class _OpInfo(object):
   """All per-Op state we would like to precompute/validate."""
 
@@ -515,9 +531,9 @@ class OpDefLibrary(object):
             else:
               raise TypeError(
                   "Expected %s passed to parameter '%s' of op '%s', got %s of "
-                  "type '%s' instead." %
+                  "type '%s' instead. Error: %s" %
                   (dtypes.as_dtype(dtype).name, input_arg.name, op_type_name,
-                   repr(values), type(values).__name__))
+                   repr(values), type(values).__name__, err))
           except ValueError:
             # What type does convert_to_tensor think it has?
             try:
@@ -733,13 +749,9 @@ class OpDefLibrary(object):
           attr_value.list.tensor.extend(
               [_MakeTensor(x, key) for x in value])
         elif attr_def.type == "func":
-          if isinstance(value, attr_value_pb2.NameAttrList):
-            attr_value.func.CopyFrom(value)
-          elif isinstance(value, compat.bytes_or_text_types):
-            attr_value.func.name = value
-          else:
-            value.add_to_graph(ops.get_default_graph())
-            attr_value.func.name = value.name
+          attr_value.func.CopyFrom(_MakeFunc(value, key))
+        elif attr_def.type == "list(func)":
+          attr_value.list.func.extend([_MakeFunc(x, key) for x in value])
         else:
           raise TypeError("Unrecognized Attr type " + attr_def.type)
 
@@ -747,31 +759,19 @@ class OpDefLibrary(object):
       del attrs  # attrs is no longer authoritative, use attr_protos instead
 
       # Determine output types (possibly using attrs)
-      output_types = []
       output_structure = []
       for arg in op_def.output_arg:
-        types = []
         if arg.number_attr:
           n = _AttrValue(attr_protos, arg.number_attr).i
-          if arg.type_attr:
-            types = [_AttrValue(attr_protos, arg.type_attr).type] * n
-          else:
-            types = [arg.type] * n
           output_structure.append(n)
         elif arg.type_attr:
           t = _AttrValue(attr_protos, arg.type_attr)
-          types = [t.type]
           output_structure.append(None)
         elif arg.type_list_attr:
           t = _AttrValue(attr_protos, arg.type_list_attr)
-          types = t.list.type
-          output_structure.append(len(types))
+          output_structure.append(len(t.list.type))
         else:
-          types = [arg.type]
           output_structure.append(None)
-        if arg.is_ref:
-          types = [dtypes.as_dtype(x)._as_ref for x in types]  # pylint: disable=protected-access
-        output_types.extend(types)
 
       if keywords:
         raise TypeError("apply_op() got unexpected keyword arguments: " +
@@ -783,7 +783,7 @@ class OpDefLibrary(object):
                               if arg.is_ref]
       with _MaybeColocateWith(must_colocate_inputs):
         # Add Op to graph
-        op = g.create_op(op_type_name, inputs, output_types, name=scope,
+        op = g.create_op(op_type_name, inputs, dtypes=None, name=scope,
                          input_types=input_types, attrs=attr_protos,
                          op_def=op_def)
       return output_structure, op_def.is_stateful, op
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 66cfe213b3cc943de4cd423e8e2ffffbe0b49f8b..71d708dd89ecfbda9d64240b707563cd3fb2a9e9 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -24,6 +24,7 @@ from google.protobuf import text_format
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
@@ -140,40 +141,43 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default():
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a="Bad string")
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got 'Bad string' of type 'str' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got 'Bad string' of type 'str' instead." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=self.Tensor(dtypes.string))
-      self.assertEqual(str(cm.exception),
-                       "Input 'a' of 'Simple' Op has type string "
-                       "that does not match expected type of int32.")
+      self.assertTrue(
+          "Input 'a' of 'Simple' Op has type string "
+          "that does not match expected type of int32." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra="bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra"
+          in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra1="bogus", extra2="also_bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra1, "
-                       "extra2")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra1, "
+          "extra2" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple")
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", wrong=7)
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a={"label": 1})
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got {'label': 1} of type 'dict' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got {'label': 1} of type 'dict' instead." in str(cm.exception))
 
   def testReservedInput(self):
     with ops.Graph().as_default():
@@ -268,19 +272,13 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
         attr { key: 'T' value { type: DT_STRING } }
         """, out.op.node_def)
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary", a="left", b=12)
-      self.assertEqual(str(cm.exception),
-                       "Expected string passed to parameter 'b' of op 'Binary',"
-                       " got 12 of type 'int' instead.")
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary",
                            a=self.Tensor(dtypes.string),
                            b=self.Tensor(dtypes.int32))
-      self.assertEqual(str(cm.exception),
-                       "Input 'b' of 'Binary' Op has type int32 "
-                       "that does not match type string of argument 'a'.")
 
   def testRestrict(self):
     with ops.Graph().as_default():
@@ -466,6 +464,46 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
       self.assertEqual(str(cm.exception),
                        "Expected float for argument 'a' not 'bad'.")
 
+  def testAttrFunc(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn(x):
+        return 2 + x
+      op = self._lib.apply_op("FuncAttr", f=fn, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncAttr' attr { key: 'f'
+                                        value { func { name: 'MyFn' } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncAttr", f=3)
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
+  def testAttrFuncList(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn1(x):
+        return 2 + x
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="MyFn2")
+      def fn2(x, y):
+        return 2 + x, y * 3
+      @function.Defun(dtypes.int32, func_name="MyFn3")
+      def fn3(y):
+        return 2 + y
+      op = self._lib.apply_op("FuncListAttr", f=[fn1, fn2, fn3], name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncListAttr'
+        attr { key: 'f' value { list { func { name: 'MyFn' }
+                                       func { name: 'MyFn2' }
+                                       func { name: 'MyFn3' } } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncListAttr", f=[fn1, 3, fn2])
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
   def testAttrBool(self):
     with ops.Graph().as_default():
       op = self._lib.apply_op("AttrBool", a=True, name="t")
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 6da6637b0bf19f36ec85e4109d3ec3ea97591bce..8dfcf381626802e3de1899a74e7e384a50594962 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -41,6 +41,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -94,9 +95,12 @@ class _UserDeviceSpec(object):
         lineno = -1
       self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)
 
+    self.raw_string = None
+
     self.function = self._device_name_or_function
     if not (self._device_name_or_function is None or
             callable(self._device_name_or_function)):
+      self.raw_string = self._device_name_or_function
       self.function = pydev.merge_device(self._device_name_or_function)
 
 
@@ -990,7 +994,8 @@ register_dense_tensor_like_type(Tensor)
 
 
 @tf_export(v1=["convert_to_tensor"])
-def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
+def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None,
+                      dtype_hint=None):
   """Converts the given `value` to a `Tensor`.
 
   This function converts Python objects of various types to `Tensor`
@@ -1030,15 +1035,18 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
       dtype in mind when converting to a tensor, so preferred_dtype
       can be used as a soft preference.  If the conversion to
       `preferred_dtype` is not possible, this argument has no effect.
+    dtype_hint: same meaning as preferred_dtype, and overrides it.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
     ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
+  preferred_dtype = deprecation.deprecated_argument_lookup(
+      "dtype_hint", dtype_hint, "preferred_dtype", preferred_dtype)
   return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
 
 
@@ -1085,7 +1093,7 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
     name: Optional name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
@@ -1228,7 +1236,7 @@ def internal_convert_n_to_tensor(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   if ctx is None: ctx = context.context()
   for i, value in enumerate(values):
@@ -1292,7 +1300,7 @@ def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
     name: (Optional.) A name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1305,7 +1313,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
                                                  dtype=None,
                                                  name=None,
                                                  as_ref=False):
-  """Converts the given object to an `Tensor` or an `IndexedSlices`.
+  """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
   If `value` is an `IndexedSlices` or `SparseTensor` it is returned
   unmodified. Otherwise, it is converted to a `Tensor` using
@@ -1320,7 +1328,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1351,7 +1359,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
   Args:
     values: A list of `None`, `IndexedSlices`, `SparseTensor`, or objects that
       can be consumed by `convert_to_tensor()`.
-    dtype: (Optional.) The required `DType` of the returned `Tensor`
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
       `IndexedSlices`.
     name: (Optional.) A name prefix to used when a new `Tensor` is
       created, in which case element `i` will be given the name `name
@@ -1359,7 +1367,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    A list of `Tensor`, `IndexedSlices`, and/or `SparseTensor` objects.
+    A list of `Tensor`, `IndexedSlices`, `SparseTensor` and/or `None` objects.
 
   Raises:
     TypeError: If no conversion function is registered for an element in
@@ -1368,7 +1376,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   for i, value in enumerate(values):
     if value is None:
@@ -1409,6 +1417,132 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None):
       values=values, dtype=dtype, name=name, as_ref=False)
 
 
+def convert_to_tensor_or_composite(value, dtype=None, name=None):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified. Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor` or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  return internal_convert_to_tensor_or_composite(
+      value=value, dtype=dtype, name=name, as_ref=False)
+
+
+def internal_convert_to_tensor_or_composite(value,
+                                            dtype=None,
+                                            name=None,
+                                            as_ref=False):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified.  Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor`, or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  if isinstance(value, composite_tensor.CompositeTensor):
+    value_dtype = getattr(value, "dtype", None)
+    if dtype and not dtypes.as_dtype(dtype).is_compatible_with(value_dtype):
+      raise ValueError(
+          "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
+          (dtypes.as_dtype(dtype).name, value.dtype.name, str(value)))
+    return value
+  else:
+    return internal_convert_to_tensor(
+        value, dtype=dtype, name=name, as_ref=as_ref)
+
+
+def internal_convert_n_to_tensor_or_composite(values,
+                                              dtype=None,
+                                              name=None,
+                                              as_ref=False):
+  """Converts `values` to a list of `Tensor` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor`, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A list of `Tensor`, `CompositeTensor`, and/or `None` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  if not isinstance(values, collections.Sequence):
+    raise TypeError("values must be a sequence.")
+  ret = []
+  for i, value in enumerate(values):
+    if value is None:
+      ret.append(value)
+    else:
+      n = None if name is None else "%s_%d" % (name, i)
+      ret.append(
+          internal_convert_to_tensor_or_composite(
+              value, dtype=dtype, name=n, as_ref=as_ref))
+  return ret
+
+
+def convert_n_to_tensor_or_composite(values, dtype=None, name=None):
+  """Converts `values` to a list of `Output` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor``, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+
+  Returns:
+    A list of `Tensor` and/or `CompositeTensor` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  return internal_convert_n_to_tensor_or_composite(
+      values=values, dtype=dtype, name=name, as_ref=False)
+
+
 # TODO(josh11b): Add ctx argument to conversion_func() signature.
 @tf_export("register_tensor_conversion_function")
 def register_tensor_conversion_function(base_type,
@@ -1488,7 +1622,7 @@ def register_tensor_conversion_function(base_type,
 
 
 @tf_export("IndexedSlices")
-class IndexedSlices(_TensorLike):
+class IndexedSlices(_TensorLike, composite_tensor.CompositeTensor):
   """A sparse representation of a set of tensor slices at given indices.
 
   This class is a simple wrapper for a pair of `Tensor` objects:
@@ -1571,6 +1705,29 @@ class IndexedSlices(_TensorLike):
   def __neg__(self):
     return IndexedSlices(-self.values, self.indices, self.dense_shape)
 
+  def _to_components(self):
+    if self._dense_shape is None:
+      return (self._values, self._indices)
+    else:
+      return (self._values, self._indices, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self._values.shape
+    if self._dense_shape is None:
+      return [shape, shape[:1]]  # values, indices
+    else:
+      # values, indices, dense_shape
+      return [shape, shape[:1], tensor_shape.TensorShape([shape.ndims])]
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 IndexedSlicesValue = collections.namedtuple(
     "IndexedSlicesValue", ["values", "indices", "dense_shape"])
@@ -2394,6 +2551,12 @@ class Operation(object):
     shapes_list = attr_value_pb2.AttrValue.ListValue(shape=shapes)
     self._set_attr(attr_name, attr_value_pb2.AttrValue(list=shapes_list))
 
+  def _clear_attr(self, attr_name):
+    """Private method used to clear an attribute in the node_def."""
+    # pylint: disable=protected-access
+    c_api.ClearAttr(self._graph._c_graph, self._c_op, attr_name)
+    # pylint: enable=protected-access
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -2894,9 +3057,6 @@ class Graph(object):
     # being called inside function definitions behave as if they were seeing the
     # actual outside graph).
     self._graph_key = "grap-key-%d/" % (uid(),)
-    # A string with the last reduction method passed to
-    # losses.compute_weighted_loss(), or None.
-    self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
     # Set to True if this graph is being built in an
@@ -2916,11 +3076,27 @@ class Graph(object):
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
   @tf_contextlib.contextmanager
-  def _variable_creator_scope(self, creator):
+  def _variable_creator_scope(self, creator, priority=100):
+    """Scope which defines a variable creation function.
+
+    Args:
+      creator: A callable taking `next_creator` and `kwargs`. See the
+        `tf.variable_creator_scope` docstring.
+      priority: Creators with a higher `priority` are called first. Within the
+        same priority, creators are called inner-to-outer.
+
+    Yields:
+      `_variable_creator_scope` is a context manager with a side effect, but
+      doesn't return a value.
+    """
     # This step makes a copy of the existing stack, and it also initializes
     # self._thread_local._variable_creator_stack if it doesn't exist yet.
     old = list(self._variable_creator_stack)
-    self._thread_local._variable_creator_stack.append(creator)  # pylint: disable=protected-access
+    stack = self._thread_local._variable_creator_stack  # pylint: disable=protected-access
+    stack.append((priority, creator))
+    # Sorting is stable, so we'll put higher-priority creators later in the list
+    # but otherwise maintain registration order.
+    stack.sort(key=lambda item: item[0])
     try:
       yield
     finally:
@@ -3226,7 +3402,7 @@ class Graph(object):
       self,
       op_type,
       inputs,
-      dtypes,  # pylint: disable=redefined-outer-name
+      dtypes=None,  # pylint: disable=redefined-outer-name
       input_types=None,
       name=None,
       attrs=None,
@@ -3244,8 +3420,8 @@ class Graph(object):
       op_type: The `Operation` type to create. This corresponds to the
         `OpDef.name` field for the proto that defines the operation.
       inputs: A list of `Tensor` objects that will be inputs to the `Operation`.
-      dtypes: A list of `DType` objects that will be the types of the tensors
-        that the operation produces.
+      dtypes: (Optional) A list of `DType` objects that will be the types of the
+        tensors that the operation produces.
       input_types: (Optional.) A list of `DType`s that will be the types of
         the tensors that the operation consumes. By default, uses the base
         `DType` of each input in `inputs`. Operations that expect
@@ -4878,6 +5054,48 @@ class Graph(object):
     self._thread_local._distribution_strategy_stack = (  # pylint: disable=protected-access
         _distribution_strategy_stack)
 
+  @property
+  def _auto_cast_variable_read_dtype(self):
+    """The dtype that instances of `AutoCastVariable` will be casted to.
+
+    This is None if `AutoCastVariables` should not be casted.
+
+    See `AutoCastVariable` for more information.
+
+    Returns:
+      The dtype that instances of `AutoCastVariable` will be casted to.
+    """
+    if not hasattr(self._thread_local, "_auto_cast_variable_read_dtype"):
+      self._thread_local._auto_cast_variable_read_dtype = None  # pylint: disable=protected-access
+    return self._thread_local._auto_cast_variable_read_dtype  # pylint: disable=protected-access
+
+  @_auto_cast_variable_read_dtype.setter
+  def _auto_cast_variable_read_dtype(self, _auto_cast_variable_read_dtype):
+    self._thread_local._auto_cast_variable_read_dtype = (  # pylint: disable=protected-access
+        _auto_cast_variable_read_dtype)
+
+  @tf_contextlib.contextmanager
+  def _enable_auto_casting_variables(self, dtype):
+    """Context manager to automatically cast AutoCastVariables.
+
+    If an AutoCastVariable `var` is used under this context manager, it will be
+    casted to `dtype` before being used.
+
+    See `AutoCastVariable` for more information.
+
+    Args:
+      dtype: The dtype that AutoCastVariables should be casted to.
+
+    Yields:
+      Nothing.
+    """
+    prev_read_dtype = self._auto_cast_variable_read_dtype
+    try:
+      self._auto_cast_variable_read_dtype = dtype
+      yield
+    finally:
+      self._auto_cast_variable_read_dtype = prev_read_dtype
+
   def _mutation_lock(self):
     """Returns a lock to guard code that creates & mutates ops.
 
@@ -4995,12 +5213,19 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
         op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
 
 
+# Internal interface to colocate_with. colocate_with has been deprecated from
+# public API. There are still a few internal uses of colocate_with. Add internal
+# only API for those uses to avoid deprecation warning.
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+
+
 @deprecation.deprecated(
     date=None,
     instructions="Colocations handled automatically by placer.")
 @tf_export(v1=["colocate_with"])
-def colocate_with(op, ignore_existing=False):
-  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+def _colocate_with(op, ignore_existing=False):
+  return colocate_with(op, ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -5253,7 +5478,8 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     context.context().context_switches.push(
-        default.building_function, default.as_default)
+        default.building_function, default.as_default,
+        default._device_function_stack)
     try:
       with super(_DefaultGraphStack, self).get_controller(
           default) as g, context.graph_mode():
@@ -5333,7 +5559,7 @@ def init_scope():
       # Names that end with trailing slashes are treated by `name_scope` as
       # absolute.
       scope = scope + "/"
-    inner_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
+    innermost_nonempty_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
 
     outer_context = None
     if not _default_graph_stack.stack:
@@ -5346,6 +5572,8 @@ def init_scope():
     else:
       # Find a context that is not building a function.
       for stack_entry in reversed(context.context().context_switches.stack):
+        if not innermost_nonempty_device_stack:
+          innermost_nonempty_device_stack = stack_entry.device_stack
         if not stack_entry.is_building_function:
           outer_context = stack_entry.enter_context_fn
           break
@@ -5367,6 +5595,8 @@ def init_scope():
     try:
       with outer_context(), name_scope(scope), control_dependencies(
           None), tape.stop_recording():
+        context_manager = NullContextmanager
+        context_manager_input = None
         if not context.executing_eagerly():
           # The device stack is preserved when lifting into a graph. Eager
           # execution doesn't implement device stacks and in particular it
@@ -5374,8 +5604,22 @@ def init_scope():
           # to do the same when lifting into the eager context.
           outer_graph = get_default_graph()
           outer_device_stack = outer_graph._device_function_stack  # pylint: disable=protected-access
-          outer_graph._device_function_stack = inner_device_stack  # pylint: disable=protected-access
-        yield
+          outer_graph._device_function_stack = innermost_nonempty_device_stack  # pylint: disable=protected-access
+        elif innermost_nonempty_device_stack is not None:
+          for device_spec in innermost_nonempty_device_stack.peek_objs():
+            if device_spec.function is None:
+              break
+            if device_spec.raw_string:
+              context_manager = context.device
+              context_manager_input = device_spec.raw_string
+              break
+            # It is currently not possible to have a device function in V2,
+            # but in V1 we are unable to apply device functions in eager mode.
+            # This means that we will silently skip some of the entries on the
+            # device stack in V1 + eager mode.
+
+        with context_manager(context_manager_input):
+          yield
     finally:
       # If an exception is raised here it may be hiding a related exception in
       # try-block (just above).
@@ -5479,7 +5723,7 @@ def disable_eager_execution():
   context.default_execution_mode = context.GRAPH_MODE
   c = context.context_safe()
   if c is not None:
-    c._eager_context.is_eager = False  # pylint: disable=protected-access
+    c._thread_local_data.is_eager = False  # pylint: disable=protected-access
 
 
 def enable_eager_execution_internal(config=None,
@@ -5998,7 +6242,7 @@ name_scope_cache = {}
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-@tf_export("name_scope")
+@tf_export(v1=["name_scope"])
 class name_scope(object):  # pylint: disable=invalid-name
   """A context manager for use when defining a Python op.
 
@@ -6119,6 +6363,47 @@ class name_scope(object):  # pylint: disable=invalid-name
     return False  # False values do not suppress exceptions
 
 
+@tf_export("name_scope", v1=[])
+class name_scope_v2(name_scope):
+  """A context manager for use when defining a Python op.
+
+  This context manager pushes a name scope, which will make the name of all
+  operations added within it have a prefix.
+
+  For example, to define a new Python op called `my_op`:
+
+  ```python
+  def my_op(a, b, c, name=None):
+    with tf.name_scope("MyOp") as scope:
+      a = tf.convert_to_tensor(a, name="a")
+      b = tf.convert_to_tensor(b, name="b")
+      c = tf.convert_to_tensor(c, name="c")
+      # Define some computation that uses `a`, `b`, and `c`.
+      return foo_op(..., name=scope)
+  ```
+
+  When executed, the Tensors `a`, `b`, `c`, will have names `MyOp/a`, `MyOp/b`,
+  and `MyOp/c`.
+
+  If the scope name already exists, the name will be made unique by appending
+  `_n`. For example, calling `my_op` the second time will generate `MyOp_1/a`,
+  etc.
+  """
+
+  def __init__(self, name):
+    """Initialize the context manager.
+
+    Args:
+      name: The prefix to use on all names created within the name scope.
+
+    Raises:
+      ValueError: If name is None, or not a string.
+    """
+    if name is None or not isinstance(name, six.string_types):
+      raise ValueError("name for name_scope must be a string.")
+    super(name_scope_v2, self).__init__(name=None, default_name=name)
+
+
 def strip_name_scope(name, export_scope):
   """Removes name scope from a name.
 
diff --git a/tensorflow/python/framework/ops_enable_eager_test.py b/tensorflow/python/framework/ops_enable_eager_test.py
index 99d06f1c2d4cee7e9265d934caea4d0ec82fd45e..4da0798dd32818acd9b6ed4f9b8227bd16528c86 100644
--- a/tensorflow/python/framework/ops_enable_eager_test.py
+++ b/tensorflow/python/framework/ops_enable_eager_test.py
@@ -23,9 +23,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.platform import googletest
 
 
-class OpsEnableEagerTest(googletest.TestCase):
+class OpsEnableAndDisableEagerTest(googletest.TestCase):
 
-  def test_enable_eager_execution_multiple_times(self):
+  def setUp(self):
+    # test for enable eager test
     ops.enable_eager_execution()
     self.assertTrue(context.executing_eagerly())
 
@@ -33,6 +34,15 @@ class OpsEnableEagerTest(googletest.TestCase):
     ops.enable_eager_execution()
     self.assertTrue(context.executing_eagerly())
 
+  def tearDown(self):
+    # test for disable eager test
+    ops.disable_eager_execution()
+    self.assertFalse(context.executing_eagerly())
+
+    # Calling disable eager execution a second time should not cause an error.
+    ops.disable_eager_execution()
+    self.assertFalse(context.executing_eagerly())
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 58d311fe4e7e645d1a9965208638c505195a2563..7d9799a1a7e28c3317ddca1ce3ffada51517b508 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -1587,6 +1587,8 @@ class CollectionTest(test_util.TensorFlowTestCase):
     self.assertSequenceEqual(g.collections, ["key"])
     g.add_to_collection("other", "foo")
     self.assertSequenceEqual(sorted(g.collections), ["key", "other"])
+    self.assertSequenceEqual(
+        sorted(g.get_all_collection_keys()), ["key", "other"])
 
   def test_add_to_collection(self):
     g = ops.Graph()
@@ -2153,13 +2155,19 @@ class InitScopeTest(test_util.TensorFlowTestCase):
     with g0.as_default(), ops.device("CPU:0"):
       g1 = ops.Graph()
       g1._building_function = True  # pylint: disable=protected-access
-      with g1.as_default(), ops.device("GPU:0"):
+      with g1.as_default():
+        with ops.device("GPU:0"):
+          with ops.init_scope():
+            # init_scope should preserve device set under `g1`.
+            on_gpu = constant_op.constant(1.0)
+            self.assertEqual(on_gpu.device, "/device:GPU:0")
+          still_on_gpu = constant_op.constant(1.0)
+          self.assertEqual(still_on_gpu.device, "/device:GPU:0")
+        blank = constant_op.constant(1.0)
+        self.assertEqual(blank.device, "")
         with ops.init_scope():
-          # init_scope should preserve device set under `g1`.
-          on_gpu = constant_op.constant(1.0)
-          self.assertEqual(on_gpu.device, "/device:GPU:0")
-        still_on_gpu = constant_op.constant(1.0)
-        self.assertEqual(still_on_gpu.device, "/device:GPU:0")
+          now_on_cpu = constant_op.constant(1.0)
+          self.assertEqual(now_on_cpu.device, "/device:CPU:0")
       on_cpu = constant_op.constant(1.0)
       self.assertEqual(on_cpu.device, "/device:CPU:0")
 
@@ -2348,7 +2356,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           math_ops.add(c, c)
         c2 = constant_op.constant(2.0)
       with self.assertRaisesRegexp(
-          TypeError, "contains objects other than 'EagerTensor'"):
+          TypeError, "Graph tensors"):
         math_ops.add(c2, c2)
 
   def testPreservesNameScopeInEagerExecution(self):
@@ -2408,17 +2416,22 @@ class GraphTest(test_util.TensorFlowTestCase):
 
   def testDefaultGraph(self):
     orig = ops.get_default_graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     g0 = ops.Graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     context_manager_0 = g0.as_default()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     with context_manager_0 as g0:
       self._AssertDefault(g0)
       with ops.Graph().as_default() as g1:
+        self.assertTrue(ops.has_default_graph())
         self._AssertDefault(g1)
       self._AssertDefault(g0)
     self._AssertDefault(orig)
+    self.assertFalse(ops.has_default_graph())
 
   def testPreventFeeding(self):
     g = ops.Graph()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index d460168631c3032bb91894c9997b2de29bf026e6..c8338a344dc946f11a78d703894429f1d48925b4 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -144,6 +144,8 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                        const string& num_outputs_expr);
   void AddDispatch(const string& prefix);
 
+  void AddRawOpExport(const string& parameters);
+
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
                             op_def_.input_arg(arg_index).name());
@@ -298,6 +300,7 @@ string GenEagerPythonOp::Code() {
     attrs_.push_back(p.first.GetName());
   }
 
+  // TODO(slebedev): call AvoidPythonReserved on each param?
   param_names_.reserve(params_no_default_.size() + params_with_default_.size());
   param_names_.insert(param_names_.begin(), params_no_default_.begin(),
                       params_no_default_.end());
@@ -315,8 +318,7 @@ string GenEagerPythonOp::Code() {
     strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
                        param_and_default.second);
   }
-  if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-  strings::StrAppend(&parameters, "name=None");
+  strings::StrAppend(&parameters, parameters.empty() ? "" : ", ", "name=None");
 
   // Add attr_expressions_ for attrs that are params.
   for (int i = 0; i < attrs_.size(); ++i) {
@@ -545,7 +547,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_tensor(_t, \"", attr_api_name,
                          "\") for _t in ", attr_api_name, "]\n");
-    } else if (attr_type != "func") {
+    } else if (attr_type != "func" && attr_type != "list(func)") {
       *function_setup =
           strings::StrCat("# No definition for ", function_name_,
                           " since we don't support attrs with type\n"
@@ -637,6 +639,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   if (api_def_.visibility() == ApiDef::VISIBLE) {
     strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n");
   }
+
   AddExport();
   AddDefLine(function_name_, parameters);
   AddDocStringDescription();
@@ -648,10 +651,11 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   AddDocStringOutputs();
   strings::StrAppend(&result_, "  \"\"\"\n");
 
-  strings::StrAppend(&result_,
-                     "  _ctx = _context._context\n"
-                     "  if _ctx is not None and _ctx._eager_context.is_eager:",
-                     "\n");
+  strings::StrAppend(
+      &result_,
+      "  _ctx = _context._context or _context.context()\n"
+      "  if _ctx is not None and _ctx._thread_local_data.is_eager:",
+      "\n");
   if (eager_not_allowed_error.empty()) {
     AddEagerFastPathExecute();
   } else {
@@ -668,6 +672,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   AddEagerFunctionTeardown("  ", output_sizes,
                            true /* execute_record_gradient */);
 
+  AddRawOpExport(parameters);
   strings::StrAppend(&result_, "\n\n");
   return true;
 }
@@ -675,8 +680,9 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
 bool GenEagerPythonOp::AddEagerFallbackCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& num_outputs_expr, const string& eager_not_allowed_error) {
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
-             strings::StrCat(parameters, ", ctx=None"));
+  AddDefLine(
+      strings::StrCat(function_name_, kEagerFallbackSuffix),
+      strings::StrCat(parameters, parameters.empty() ? "" : ", ", "ctx=None"));
 
   if (!eager_not_allowed_error.empty()) {
     strings::StrAppend(&result_, "  ", eager_not_allowed_error);
@@ -712,7 +718,7 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
 
 void GenEagerPythonOp::AddEagerFastPathExecute() {
   string fastpath_execute_params = strings::StrCat(
-      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      "_ctx._context_handle, _ctx._thread_local_data.device_name, \"",
       op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
   string fallback_params;
 
@@ -921,6 +927,35 @@ void GenEagerPythonOp::AddDispatch(const string& prefix) {
   strings::StrAppend(&result_, prefix, "  raise\n");
 }
 
+void GenEagerPythonOp::AddRawOpExport(const string& parameters) {
+  string arguments;
+  for (const auto& param_names : param_names_) {
+    const string renamed = param_names.GetRenameTo();
+    strings::StrAppend(&arguments, arguments.empty() ? "" : ", ", renamed, "=",
+                       renamed);
+  }
+  strings::StrAppend(&arguments, arguments.empty() ? "" : ", ", "name=name");
+
+  const string raw_function_name =
+      python_op_gen_internal::AvoidPythonReserved(op_def_.name());
+
+  strings::StrAppend(&result_, "def ", raw_function_name, "(", parameters,
+                     "):\n");
+  strings::StrAppend(&result_, "  return ", function_name_, "(", arguments,
+                     ")\n");
+
+  // Copy the __doc__ from the original op and apply the decorators.
+  strings::StrAppend(&result_, raw_function_name, ".__doc__", " = ",
+                     function_name_, ".__doc__\n");
+  strings::StrAppend(&result_, raw_function_name, " = ",
+                     "_doc_controls.do_not_generate_docs(_kwarg_only(",
+                     raw_function_name, "))\n");
+
+  // Export.
+  strings::StrAppend(&result_, "tf_export(\"raw_ops.", raw_function_name,
+                     "\")(", raw_function_name, ")\n");
+}
+
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                     const std::vector<string>& hidden_ops, bool require_shapes,
                     const string& source_file_name = "") {
@@ -962,6 +997,8 @@ from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.deprecation import deprecated_endpoints
 from tensorflow.python.util import dispatch as _dispatch
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import kwarg_only as _kwarg_only
+from tensorflow.tools.docs import doc_controls as _doc_controls
 
 )");
 
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 6b7f56a92cc02fd9f44a541ed3536b35653031d9..53d47c04d419e8e63a6423421865298663c6929b 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -64,6 +64,8 @@ def get_seed(op_seed):
   if global_seed is not None:
     if op_seed is None:
       # pylint: disable=protected-access
+      if hasattr(ops.get_default_graph(), '_seed_used'):
+        ops.get_default_graph()._seed_used = True
       if eager:
         op_seed = context.internal_operation_seed()
       else:
diff --git a/tensorflow/python/framework/registry.py b/tensorflow/python/framework/registry.py
index 4357c76bd6cc8ccac55b5e123fa0ce7cf3c0d19d..53c68b046192818da31ece0c3e9181986e671829 100644
--- a/tensorflow/python/framework/registry.py
+++ b/tensorflow/python/framework/registry.py
@@ -64,8 +64,12 @@ class Registry(object):
     # stack trace is [this_function, Register(), user_function,...]
     # so the user function is #2.
     stack = tf_stack.extract_stack()
-    user_function = stack[2]
-    location_tag = tf_stack.convert_stack([user_function])[0]
+    stack_index = min(2, len(stack)-1)
+    if stack_index >= 0:
+      user_function = stack[stack_index]
+      location_tag = tf_stack.convert_stack([user_function])[0]
+    else:
+      location_tag = "UNKNOWN"
     self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: location_tag}
 
   def list(self):
diff --git a/tensorflow/python/framework/registry_test.py b/tensorflow/python/framework/registry_test.py
index 1a0d3f200d9427363ae36c19b6214ac6c9b75bec..5adf12fdacf5fa1e8ea096e3d6494824f26d282e 100644
--- a/tensorflow/python/framework/registry_test.py
+++ b/tensorflow/python/framework/registry_test.py
@@ -19,28 +19,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python.framework import registry
 from tensorflow.python.platform import test
 
 
-class RegistryTest(test.TestCase):
+def bar():
+  pass
+
+
+class RegistryTest(test.TestCase, parameterized.TestCase):
 
   class Foo(object):
     pass
 
-  def testRegisterClass(self):
-    myreg = registry.Registry('testfoo')
+  # Test the registry basics on both classes (Foo) and functions (bar).
+  @parameterized.parameters([Foo, bar])
+  def testRegistryBasics(self, candidate):
+    myreg = registry.Registry('testRegistry')
     with self.assertRaises(LookupError):
-      myreg.lookup('Foo')
-    myreg.register(RegistryTest.Foo, 'Foo')
-    assert myreg.lookup('Foo') == RegistryTest.Foo
-
-  def testRegisterFunction(self):
-    myreg = registry.Registry('testbar')
-    with self.assertRaises(LookupError):
-      myreg.lookup('Bar')
-    myreg.register(bar, 'Bar')
-    assert myreg.lookup('Bar') == bar
+      myreg.lookup('testKey')
+    myreg.register(candidate)
+    self.assertEqual(myreg.lookup(candidate.__name__), candidate)
+    myreg.register(candidate, 'testKey')
+    self.assertEqual(myreg.lookup('testKey'), candidate)
+    self.assertEqual(
+        sorted(myreg.list()), sorted(['testKey', candidate.__name__]))
 
   def testDuplicate(self):
     myreg = registry.Registry('testbar')
@@ -51,9 +56,5 @@ class RegistryTest(test.TestCase):
       myreg.register(bar, 'Bar')
 
 
-def bar():
-  pass
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 8546c2299aad8f6145f8dd59c3c51410038d8847..b21b109de6d426ff9089a048297504c64b05ab8c 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes and functions used to construct graphs."""
+"""Sparse tensors."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,10 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.util.tf_export import tf_export
 
@@ -34,7 +36,7 @@ _override_helper = ops._override_helper
 
 
 @tf_export("sparse.SparseTensor", "SparseTensor")
-class SparseTensor(_TensorLike):
+class SparseTensor(_TensorLike, composite_tensor.CompositeTensor):
   """Represents a sparse tensor.
 
   TensorFlow represents a sparse tensor as three separate dense tensors:
@@ -191,18 +193,6 @@ class SparseTensor(_TensorLike):
     """The `Graph` that contains the index, value, and dense_shape tensors."""
     return self._indices.graph
 
-  def consumers(self):
-    """Returns a list of `Operation`s that consume this `SparseTensor`.
-
-    Returns:
-      A list of `Operation`s.
-    """
-    values_consumers = set(self._values.consumers())
-    indices_consumers = set(self._indices.consumers())
-    dense_shape_consumers = set(self._dense_shape.consumers())
-    return list(values_consumers \
-                .union(indices_consumers, dense_shape_consumers))
-
   def __str__(self):
     return "SparseTensor(indices=%s, values=%s, dense_shape=%s)" % (
         self._indices, self._values, self._dense_shape)
@@ -237,6 +227,30 @@ class SparseTensor(_TensorLike):
   def _override_operator(operator, func):
     _override_helper(SparseTensor, operator, func)
 
+  def _to_components(self):
+    return (self._indices, self._values, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self.dense_shape.shape
+    if shape.ndims is None:
+      shape = tensor_shape.TensorShape([None])
+    if shape.ndims != 1:
+      raise ValueError("Shape invariant for SparseTensor must have the form "
+                       "TensorShape([r]), got %r" % shape)
+    rank = tensor_shape.dimension_value(shape[0])
+    return [tensor_shape.TensorShape([None, rank]),  # indices
+            tensor_shape.TensorShape([None]),  # values
+            tensor_shape.TensorShape([rank])]  # dense_shape
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index a999c12ca89b0c1746751eb04e9abfe380abf336..03aa63b624eb2bfdc6d8a9e546a200161d187af2 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -65,18 +66,18 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         sparse_tensor.is_sparse(
             sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
-  @test_util.run_deprecated_v1
   def testConsumers(self):
-    sp = sparse_tensor.SparseTensor([[0, 0], [1, 2]], [1.0, 3.0], [3, 4])
-    w = ops.convert_to_tensor(np.ones([4, 1], np.float32))
-    out = sparse_ops.sparse_tensor_dense_matmul(sp, w)
-    self.assertEqual(len(sp.consumers()), 1)
-    self.assertEqual(sp.consumers()[0], out.op)
-
-    dense = sparse_ops.sparse_tensor_to_dense(sp)
-    self.assertEqual(len(sp.consumers()), 2)
-    self.assertTrue(dense.op in sp.consumers())
-    self.assertTrue(out.op in sp.consumers())
+    with context.graph_mode():
+      sp = sparse_tensor.SparseTensor([[0, 0], [1, 2]], [1.0, 3.0], [3, 4])
+      w = ops.convert_to_tensor(np.ones([4, 1], np.float32))
+      out = sparse_ops.sparse_tensor_dense_matmul(sp, w)
+      self.assertEqual(len(sp.consumers()), 1)
+      self.assertEqual(sp.consumers()[0], out.op)
+
+      dense = sparse_ops.sparse_tensor_to_dense(sp)
+      self.assertEqual(len(sp.consumers()), 2)
+      self.assertIn(dense.op, sp.consumers())
+      self.assertIn(out.op, sp.consumers())
 
 
 class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index a7537bb5f1adfe70018f50cb9a627bfffe176226..0dc3dde4f6e95dbe4156a29d03f465e95cb4a5f6 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -74,9 +74,8 @@ def enable_v2_tensorshape():
   # in `tensor_shape[i]`, but they would not be.
   ```
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = True
-  TensorShape = TensorShapeV2
 
 
 @tf_export(v1=["disable_v2_tensorshape"])
@@ -85,9 +84,8 @@ def disable_v2_tensorshape():
 
   See docstring for `enable_v2_tensorshape` for details about the new behavior.
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = False
-  TensorShape = TensorShapeV1
 
 
 @tf_export("compat.dimension_value",
@@ -470,6 +468,54 @@ class Dimension(object):
     """
     return self // other
 
+  def __rdiv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __truediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'Dimension' and 'int'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: 'Dimension' and '{}', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __rtruediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
   def __mod__(self, other):
     """Returns `self` modulo `other`.
 
@@ -635,8 +681,8 @@ def as_dimension(value):
     return Dimension(value)
 
 
-@tf_export(v1=["TensorShape"])
-class TensorShapeV1(object):
+@tf_export("TensorShape")
+class TensorShape(object):
   """Represents the shape of a `Tensor`.
 
   A `TensorShape` represents a possibly-partial shape specification for a
@@ -695,7 +741,7 @@ class TensorShapeV1(object):
   @property
   def _v2_behavior(self):
     if _TENSORSHAPE_V2_OVERRIDE is None:
-      return False
+      return tf2.enabled()
     return _TENSORSHAPE_V2_OVERRIDE
 
   def __repr__(self):
@@ -1151,22 +1197,6 @@ def unknown_shape(rank=None, **kwargs):
     return TensorShape([Dimension(None)] * rank)
 
 
-@tf_export("TensorShape", v1=[])
-class TensorShapeV2(TensorShapeV1):
-
-  @property
-  def _v2_behavior(self):
-    if _TENSORSHAPE_V2_OVERRIDE is None:
-      return True
-    return _TENSORSHAPE_V2_OVERRIDE
-
-
-if tf2.enabled():
-  TensorShape = TensorShapeV2
-else:
-  TensorShape = TensorShapeV1
-
-
 def scalar():
   """Returns a shape representing a scalar."""
   return TensorShape([])
diff --git a/tensorflow/python/framework/tensor_shape_div_test.py b/tensorflow/python/framework/tensor_shape_div_test.py
index 8e63d7f54705bb5c8384315f068598a86c047599..5160c75e5272d9326a35a0813809387605cca1ea 100644
--- a/tensorflow/python/framework/tensor_shape_div_test.py
+++ b/tensorflow/python/framework/tensor_shape_div_test.py
@@ -35,6 +35,16 @@ class DimensionDivTest(test_util.TensorFlowTestCase):
         for y in values:
           self.assertEqual((x / y).value, (x // y).value)
 
+  def testRDivFail(self):
+    # Note: This test is related to GitHub issue 25790.
+    """Without from __future__ import division, __rdiv__ is used."""
+    if six.PY2:  # Old division exists only in Python 2
+      two = tensor_shape.Dimension(2)
+      message = (r"unsupported operand type\(s\) for /: "
+                 r"'int' and 'Dimension', please use // instead")
+      with self.assertRaisesRegexp(TypeError, message):
+        _ = 6 / two
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 7d85e0a99e662512b29e4134091658190a3bc500..770573f86d11d4daaf789ee9073b507f8a1252ad 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -200,10 +200,27 @@ class DimensionTest(test_util.TensorFlowTestCase):
   def testReduce(self):
     dim = tensor_shape.Dimension(5)
     ctor, args = dim.__reduce__()
-    self.assertEquals(ctor, tensor_shape.Dimension)
-    self.assertEquals(args, (5,))
+    self.assertEqual(ctor, tensor_shape.Dimension)
+    self.assertEqual(args, (5,))
     reconstructed = ctor(*args)
-    self.assertEquals(reconstructed, dim)
+    self.assertEqual(reconstructed, dim)
+
+  def testDiv(self):
+    # Note: This test is related to GitHub issue 25790.
+    six = tensor_shape.Dimension(6)
+    two = tensor_shape.Dimension(2)
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / two
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'int', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / 2
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'int' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = 6 / two
 
 
 class ShapeTest(test_util.TensorFlowTestCase):
@@ -440,11 +457,12 @@ class ShapeTest(test_util.TensorFlowTestCase):
   def testReduce(self):
     shape = tensor_shape.TensorShape([2, 3])
     ctor, args = shape.__reduce__()
-    self.assertEquals(ctor, tensor_shape.TensorShape)
-    self.assertEquals(args, ([tensor_shape.Dimension(2),
-                              tensor_shape.Dimension(3)],))
+    self.assertEqual(ctor, tensor_shape.TensorShape)
+    self.assertEqual(args,
+                     ([tensor_shape.Dimension(2),
+                       tensor_shape.Dimension(3)],))
     reconstructed = ctor(*args)
-    self.assertEquals(reconstructed, shape)
+    self.assertEqual(reconstructed, shape)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index c44636edc4ec5101c588766714c98a7da15793e4..55ae043c6b534819fcf2675f5874091950e72f08 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -37,7 +37,7 @@ class TensorSpec(object):
 
   __slots__ = ["_shape", "_shape_tuple", "_dtype", "_name"]
 
-  def __init__(self, shape, dtype, name=None):
+  def __init__(self, shape, dtype=dtypes.float32, name=None):
     """Creates a TensorSpec.
 
     Args:
@@ -108,7 +108,9 @@ class TensorSpec(object):
     return hash((self._shape_tuple, self.dtype))
 
   def __eq__(self, other):
-    return self.shape == other.shape and self.dtype == other.dtype
+    return (self._shape_tuple == other._shape_tuple  # pylint: disable=protected-access
+            and self.dtype == other.dtype
+            and self._name == other._name)  # pylint: disable=protected-access
 
   def __ne__(self, other):
     return not self == other
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index 75c197df09e97b8e5c9ebf15ffb33206f69a172f..175aaebe67a15ffda526f3b83fb87f522a6374c1 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -33,6 +33,10 @@ from tensorflow.python.platform import googletest
 
 class TensorSpecTest(test_util.TensorFlowTestCase):
 
+  def testDefaultDType(self):
+    desc = tensor_spec.TensorSpec([1])
+    self.assertEqual(desc.dtype, dtypes.float32)
+
   def testAcceptsNumpyDType(self):
     desc = tensor_spec.TensorSpec([1], np.float32)
     self.assertEqual(desc.dtype, dtypes.float32)
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index ca8b067935c067f9ff8fe39b72f4ba32400b03bd..b3621a47580ebfba2b7e064b33e621f5bda22c66 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,6 +22,7 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -42,7 +43,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 def ExtractBitsFromFloat16(x):
-  return np.asscalar(np.asarray(x, dtype=np.float16).view(np.uint16))
+  return np.asarray(x, dtype=np.float16).view(np.uint16).item()
 
 
 def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
@@ -58,8 +59,8 @@ def _MediumAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
 
 
 def ExtractBitsFromBFloat16(x):
-  return np.asscalar(
-      np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+  return np.asarray(
+      x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16).item()
 
 
 def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
@@ -122,39 +123,39 @@ if _FAST_TENSOR_UTIL_AVAILABLE:
 else:
 
   def SlowAppendFloat32ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.float_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.float_val.extend([x.item() for x in proto_values])
 
   def SlowAppendFloat64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.double_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.double_val.extend([x.item() for x in proto_values])
 
   def SlowAppendIntArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.int_val.extend([x.item() for x in proto_values])
 
   def SlowAppendInt64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int64_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.int64_val.extend([x.item() for x in proto_values])
 
   def SlowAppendQIntArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int_val.extend([np.asscalar(x[0]) for x in proto_values])
+    tensor_proto.int_val.extend([x.item()[0] for x in proto_values])
 
   def SlowAppendUInt32ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.uint32_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.uint32_val.extend([x.item() for x in proto_values])
 
   def SlowAppendUInt64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.uint64_val.extend([x.item() for x in proto_values])
 
   def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.scomplex_val.extend(
-        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
+        [v.item() for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.dcomplex_val.extend(
-        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
+        [v.item() for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
 
   def SlowAppendBoolArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.bool_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.bool_val.extend([x.item() for x in proto_values])
 
   _NP_TO_APPEND_FN = {
       dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
@@ -781,19 +782,16 @@ def _ConstantValue(tensor, partial):
     return None
 
 
+@tf_export('get_static_value')
 def constant_value(tensor, partial=False):  # pylint: disable=invalid-name
   """Returns the constant value of the given tensor, if efficiently calculable.
 
   This function attempts to partially evaluate the given tensor, and
   returns its value as a numpy ndarray if this succeeds.
 
-  TODO(mrry): Consider whether this function should use a registration
-  mechanism like gradients and ShapeFunctions, so that it is easily
-  extensible.
-
-  NOTE: If `constant_value(tensor)` returns a non-`None` result, it will no
-  longer be possible to feed a different value for `tensor`. This allows the
-  result of this function to influence the graph that is constructed, and
+  Compatibility(V1): If `constant_value(tensor)` returns a non-`None` result, it
+  will no longer be possible to feed a different value for `tensor`. This allows
+  the result of this function to influence the graph that is constructed, and
   permits static shape optimizations.
 
   Args:
@@ -810,6 +808,10 @@ def constant_value(tensor, partial=False):  # pylint: disable=invalid-name
   """
   if isinstance(tensor, ops.EagerTensor):
     return tensor.numpy()
+  if not is_tensor(tensor):
+    return tensor
+  if not isinstance(tensor, ops.Tensor):
+    return None
   ret = _ConstantValue(tensor, partial)
   if ret is not None:
     # The caller may now depend on the constant value of `tensor`, so we
@@ -935,13 +937,15 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   return ret
 
 
+@tf_export("is_tensor")
 def is_tensor(x):  # pylint: disable=invalid-name
   """Check whether `x` is of tensor type.
 
-  Check whether an object is a tensor. This check is equivalent to calling
-  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.Variable))` and also checks
-  if all the component variables of a MirroredVariable or a ReplicaLocalVariable
-  are tensors.
+  Check whether an object is a tensor or a composite tensor. This check is
+  equivalent to calling
+  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.RaggedTensor, tf.Variable))`
+  and also checks if all the component variables of a MirroredVariable or a
+  SyncOnReadVariable are tensors.
 
   Args:
     x: A python object to check.
@@ -950,4 +954,5 @@ def is_tensor(x):  # pylint: disable=invalid-name
     `True` if `x` is a tensor, `False` if not.
   """
   return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or  # pylint: disable=protected-access
+          isinstance(x, composite_tensor.CompositeTensor) or
           (hasattr(x, "is_tensor_like") and x.is_tensor_like))
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index cdacdfaaada96e21d4f4d6a9fb2a9247e332969f..92a0ae2cdc97c933b27e8dd9034742cb894e7c0d 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -773,6 +774,16 @@ class TensorUtilTest(test.TestCase):
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
 
+class IsTensorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantTensor(self):
+    np_val = np.random.rand(3).astype(np.int32)
+    tf_val = constant_op.constant(np_val)
+    self.assertFalse(tensor_util.is_tensor(np_val))
+    self.assertTrue(tensor_util.is_tensor(tf_val))
+
+
 class ConstantValueTest(test.TestCase):
 
   def testConstant(self):
@@ -943,6 +954,22 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertAllEqual(c_val, [[False, True], [True, False]])
 
+  def testLiteral(self):
+    x = "hi"
+    self.assertIs(x, tensor_util.constant_value(x))
+
+  def testNumpyNdarray(self):
+    np_val = np.random.rand(3, 4, 7).astype(np.float32)
+    self.assertIs(np_val, tensor_util.constant_value(np_val))
+
+  def testVariable(self):
+    var = variables.Variable(1.0, name="variable_node")
+    self.assertIsNone(tensor_util.constant_value(var))
+
+  def testVariableV1(self):
+    var = variables.VariableV1(1.0, name="variable_node")
+    self.assertIsNone(tensor_util.constant_value(var))
+
 
 class ConstantValueAsShapeTest(test.TestCase):
 
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 1d0145f61c84969cf1b52eb070ec3f933d25741a..5d1386c26d73816772936bac9fe57c575a399066 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -406,6 +406,10 @@ REGISTER_OP("FuncAttr")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("FuncListAttr")
+    .Attr("f: list(func)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("Simple")
     .Input("a: int32")
     .Output("out: float")
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 732fd3138977caa64a3e133c5fb2386e584b4398..21e48591bf9364e0d34b309d60af62f008e34f62 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -132,7 +132,7 @@ def assert_ops_in_graph(expected_ops, graph):
 
 
 @tf_export("test.assert_equal_graph_def", v1=[])
-def assert_equal_graph_def_v2(actual, expected):
+def assert_equal_graph_def_v2(expected, actual):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
   Compares two `GraphDef` protos for equality, ignoring versions and ordering of
@@ -141,8 +141,8 @@ def assert_equal_graph_def_v2(actual, expected):
   ignores randomized attribute values that may appear in V2 checkpoints.
 
   Args:
-    actual: The `GraphDef` we have.
     expected: The `GraphDef` we expected.
+    actual: The `GraphDef` we have.
 
   Raises:
     AssertionError: If the `GraphDef`s do not match.
@@ -650,7 +650,7 @@ def _find_reference_cycle(objects, idx):
     return None
 
   # Note: this function is meant to help with diagnostics. Its output is purely
-  # a human readable representation, so you may freely modify it to suit your
+  # a human-readable representation, so you may freely modify it to suit your
   # needs.
   def describe(obj, blacklist, leaves_only=False):
     """Returns a custom human-readable summary of obj.
@@ -1012,10 +1012,12 @@ def py_func_if_in_function(f):
     if not ops.get_default_graph()._building_function:
       return f(*args, **kwds)
 
-    tensor_args, tensor_indices = zip(*[(x, i)
-                                        for i, x in enumerate(args)
-                                        if isinstance(x, (ops.Tensor,
-                                                          variables.Variable))])
+    tensor_args = []
+    tensor_indices = []
+    for i, arg in enumerate(args):
+      if isinstance(arg, (ops.Tensor, variables.Variable)):
+        tensor_args.append(arg)
+        tensor_indices.append(i)
 
     def inner_f(*inner_tensor_args):
       my_args = list(args)
@@ -1200,7 +1202,7 @@ def run_v2_only(func=None):
 def run_gpu_only(func=None):
   """Execute the decorated test only if a GPU is available.
 
-  This function is intended to be applied to tests that require the precense
+  This function is intended to be applied to tests that require the presence
   of a GPU. If a GPU is absent, it will simply be skipped.
 
   Args:
@@ -1273,7 +1275,7 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
       CUDA compute capability required, or None if no requirement.
 
   Returns:
-    True iff a gpu device of the requested kind is available.
+    True if a gpu device of the requested kind is available.
   """
 
   def compute_capability_from_device_desc(device_desc):
@@ -1374,7 +1376,7 @@ class FakeEagerSession(object):
 
   Since the feed_dict is empty when not using placeholders we should be able to
   call self.evaluate(), however this requires rewriting the test case.
-  This class shold be considered a stop-gap solution to get tests running with
+  This class should be considered a stop-gap solution to get tests running with
   eager with minimal changes to the actual test.
   """
 
@@ -1426,6 +1428,36 @@ class ErrorLoggingSession(session.Session):
       raise
 
 
+def use_deterministic_cudnn(func):
+  """Disable autotuning during the call to this function.
+
+  Some tests want to base assertions on a graph being isomorphic with a copy.
+  To ensure this, this decorator disables autotuning.
+
+  Args:
+    func: Function to run with CUDNN autotuning turned off.
+
+  Returns:
+    Decorated function.
+  """
+
+  def decorator(f):
+
+    def decorated(self, *args, **kwargs):
+      original_var = os.environ.get("TF_CUDNN_DETERMINISTIC", "")
+      os.environ["TF_CUDNN_DETERMINISTIC"] = "true"
+      result = f(self, *args, **kwargs)
+      os.environ["TF_CUDNN_DETERMINISTIC"] = original_var
+      return result
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
 # The description is just for documentation purposes.
 def disable_xla(description):
 
@@ -1460,7 +1492,7 @@ def disable_all_xla(description):
       value = getattr(cls, name)
       if callable(value) and name.startswith(
           "test") and not name == "test_session":
-        setattr(cls, name, base_decorator(value))
+        setattr(cls, name, base_decorator(description)(value))
     return cls
 
   return disable_all_impl
@@ -1487,7 +1519,8 @@ class TensorFlowTestCase(googletest.TestCase):
     if is_xla_enabled():
       os.putenv(
           "TF_XLA_FLAGS", "--tf_xla_auto_jit=2 --tf_xla_min_cluster_size=1 "
-          "--tf_xla_enable_lazy_compilation=false")
+          "--tf_xla_enable_lazy_compilation=false " +
+          os.getenv("TF_XLA_FLAGS", ""))
     self._threads = []
     self._tempdir = None
     self._cached_session = None
@@ -1661,6 +1694,10 @@ class TensorFlowTestCase(googletest.TestCase):
           return sparse_tensor.SparseTensorValue(tensor.indices.numpy(),
                                                  tensor.values.numpy(),
                                                  tensor.dense_shape.numpy())
+        elif isinstance(tensor, ops.IndexedSlices):
+          return ops.IndexedSlicesValue(values=tensor.values.numpy(),
+                                        indices=tensor.indices.numpy(),
+                                        dense_shape=tensor.dense_shape.numpy())
         return tensor.numpy()
       except AttributeError as e:
         six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e)
@@ -2170,7 +2207,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
   @py_func_if_in_function
   def assertNotAllClose(self, a, b, **kwargs):
-    """Assert that two numpy arrays, or or Tensors, do not have near values.
+    """Assert that two numpy arrays, or Tensors, do not have near values.
 
     Args:
       a: the first value to compare.
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index af9276c508b1db1e57a0dc8690cd5d6dfd0574e5..2a53da734a657b9f472861ef85a7a750e32c0a6c 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -153,7 +153,7 @@ static GCluster TF_NewVirtualCluster(
   for (const auto& named_device : named_devices) {
     devices[named_device.name()]= named_device.properties();
   }
-  tensorflow::grappler::Cluster*cluster_ =
+  tensorflow::grappler::Cluster* cluster_ =
       new tensorflow::grappler::VirtualCluster(devices);
   PyGILState_STATE gstate = PyGILState_Ensure();
   tensorflow::Status status = cluster_->Provision();
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index 541747867fa81b49e48ddc86e1daf8e522b577d3..2014c0dde3fbcab528f76908a89c4ae4411808f7 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -99,9 +99,7 @@ class ClusterTest(test.TestCase):
           type='GPU',
           frequency=1000,
           num_cores=60,
-          environment={
-              'architecture': '7'
-          })
+          environment={'architecture': '7'})
       named_device = device_properties_pb2.NamedDevice(
           properties=device_properties, name='/device:GPU:0')
       grappler_cluster = cluster.Cluster(
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
index 30c1e1468146ce58216acbfbb1aef1ab1408027f..3ba5b7418a75157acf24425d7e22069137a5e6a1 100644
--- a/tensorflow/python/grappler/constant_folding_test.py
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -60,9 +59,9 @@ class ConstantFoldingTest(test.TestCase):
           loop_vars=[0, init_y],
           back_prop=False,
           parallel_iterations=1)
-      with session.Session() as sess:
-        y_v = self.evaluate(y)
-        self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
+
+      y_v = self.evaluate(y)
+      self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index 9aa5fbca383d126ebb927a7e47fc714503fcefed..de4b82c84dc5604ed5cf8cb662e6f5f84e6c73e8 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -27,7 +27,8 @@ CostAnalyzer::CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                            const string& suffix)
     : item_(&item),
       measure_estimator_(cluster, 10, 0),
-      analytical_estimator_(cluster, false),
+      analytical_estimator_(cluster, /*use_static_shapes=*/false,
+                            /*use_aggressive_shape_inference=*/true),
       suffix_(suffix) {}
 
 Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report,
@@ -125,7 +126,6 @@ void CostAnalyzer::PreprocessCosts() {
   }
 }
 
-
 void CostAnalyzer::SortOpsByTime(std::map<string, OpPerfSummary> ops) {
   for (const auto& op : ops) {
     ops_.push_back(op.second);
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index ee3e289f65d05e96a580a62adb7f39552e6ced1c..1a988db11b3fa17235f472623d5b4a5fbbf977ed 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -124,7 +124,7 @@ class CostAnalysisTest(test.TestCase):
       op_count = int(m.group(1))
       # upper = int(m.group(5))
       lower = int(m.group(6))
-      if op_type is b"MatMul":
+      if op_type == b"MatMul":
         self.assertEqual(3, op_count)
       else:
         self.assertEqual(1, op_count)
diff --git a/tensorflow/python/grappler/hierarchical_controller.py b/tensorflow/python/grappler/hierarchical_controller.py
index c0866c1069ac7f7e25cbd12cb5a490e2ed5e4bec..c62fd5c4a86de93d92f98b83d4cc331f75bf8fe4 100644
--- a/tensorflow/python/grappler/hierarchical_controller.py
+++ b/tensorflow/python/grappler/hierarchical_controller.py
@@ -572,7 +572,7 @@ class HierarchicalController(Controller):
     logits = array_ops.reshape(logits,
                                [batch_size * self.num_ops, self.num_groups])
     actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
-    actions = math_ops.to_int32(actions)
+    actions = math_ops.cast(actions, dtypes.int32)
     actions = array_ops.reshape(actions, [batch_size, self.num_ops])
     action_label = array_ops.reshape(actions, [-1])
     log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
@@ -924,7 +924,7 @@ class HierarchicalController(Controller):
         next_y = array_ops.slice(y, [0, i], [-1, 1])
       else:
         raise NotImplementedError
-      next_y = math_ops.to_int32(next_y)
+      next_y = math_ops.cast(next_y, dtypes.int32)
       next_y = array_ops.reshape(next_y, [self.hparams.num_children])
       actions = actions.write(i, next_y)
       log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 593d38206d127978f1982a0f2cc22e17daee1a3d..0d4f7de9f55b2bd13cd1ab7988b7f6c43d3e018c 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -272,7 +272,6 @@ static PyObject* TF_GetColocationGroups(GItem item) {
     if (!s.ok()) {
       continue;
     }
-    int i = 0;
     for (const auto& arg : op_def->input_arg()) {
       if (!arg.is_ref()) {
         continue;
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 98f2e6d71816a4b6d8cd3f7fc836b09e5cc058a4..3b6d2ce26af8bf8966e0173eea317f0067b20cd1 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -34,10 +34,10 @@ from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -120,7 +120,7 @@ def _loop():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
   return outputs
 
 
@@ -131,8 +131,7 @@ def _loop_with_branch():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(
-      _model_with_branch, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_model_with_branch, elems, dtype=dtypes.float32)
   return outputs
 
 
@@ -143,8 +142,7 @@ def _loop_with_vec_and_4d():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(
-      _model_with_vec_and_4d, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_model_with_vec_and_4d, elems, dtype=dtypes.float32)
   return outputs
 
 
@@ -256,6 +254,7 @@ class LayoutOptimizerTest(test.TestCase):
       else:
         saver.save(sess, checkpoint_path)
 
+  @test_util.deprecated_graph_mode_only
   def testTwoConvLayers(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -285,6 +284,7 @@ class LayoutOptimizerTest(test.TestCase):
 
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSplitWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -320,6 +320,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_map_nhwc_to_nchw('split-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSplitVWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -354,6 +355,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_map_nhwc_to_nchw('SplitV-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testPadWithConstPaddings(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -388,6 +390,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('Pad-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSum(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -417,6 +420,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testCast(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -447,6 +451,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Cast-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSqueeze(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -477,6 +482,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSqueezeAlongHW(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -507,6 +513,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSqueezeAlongNHW(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -537,6 +544,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongHWC(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -566,6 +574,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongNHW(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -595,6 +604,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongC(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -624,6 +634,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongCKeepDims(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -654,6 +665,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Sum-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongHKeepDims(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -683,6 +695,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReduceSumAlongWCKeepDims(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -712,6 +725,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testConcatWithControlDependency(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -747,6 +761,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('concat-2-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testFill(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -790,6 +805,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Fill-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testTile(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -826,6 +842,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_vec_nhwc_to_nchw('Tile-1', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReverseWithConstDims(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -858,6 +875,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('ReverseV2-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testReverseWithNonConstDims(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -894,6 +912,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_map_nhwc_to_nchw('ReverseV2-1', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSelectOp(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -925,6 +944,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSelectOpConditionUnknownShape(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -956,6 +976,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSelectOpScalarCondition(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -986,6 +1007,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testPadWithNonConstPaddings(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1022,6 +1044,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_vec_nhwc_to_nchw('Pad-1', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testMaxPoolV2(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1058,6 +1081,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('MaxPoolV2-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testMaxPoolGradV2(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1095,6 +1119,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('MaxPoolGradV2-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testSliceWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1131,6 +1156,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_vec_nhwc_to_nchw('Slice-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testStridedSliceWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1169,6 +1195,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('StridedSlice-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testStridedSliceWithMask1011(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1204,6 +1231,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testStridedSliceWithMask0111(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1239,6 +1267,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testStridedSliceGradWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
@@ -1281,6 +1310,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertIn('StridedSlice-2-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testShapeN(self):
     if test.is_gpu_available(cuda_only=True):
       x = array_ops.placeholder(dtype='float32')
@@ -1312,6 +1342,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_vec_nchw_to_nhwc('ShapeN-0-0', nodes)
       self.assertAllEqual(output_val_ref, output_val)
 
+  @test_util.deprecated_graph_mode_only
   def testShapeNFollowedByNotConvertibleNodeReshape(self):
     if test.is_gpu_available(cuda_only=True):
       x = array_ops.placeholder(dtype='float32')
@@ -1343,6 +1374,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testLoop(self):
     if test.is_gpu_available(cuda_only=True):
       output = _loop()
@@ -1370,6 +1402,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('map/while/MaxPool_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testLoopWithBranch(self):
     if test.is_gpu_available(cuda_only=True):
       output = _loop_with_branch()
@@ -1394,6 +1427,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testLoopWithVecAnd4D(self):
     if test.is_gpu_available(cuda_only=True):
       output = _loop_with_vec_and_4d()
@@ -1418,6 +1452,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.deprecated_graph_mode_only
   def testBinaryOpSecondPort(self):
     if test.is_gpu_available(cuda_only=True):
       output = _model_with_second_port()
@@ -1442,7 +1477,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Add-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testGradient(self):
     meta_graph = _simple_metagraph()
     config = config_pb2.ConfigProto()
@@ -1460,7 +1495,7 @@ class LayoutOptimizerTest(test.TestCase):
         self.assertEqual(node.attr['data_format'].s, b'NCHW')
     self.assertEqual(found, 5)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testDepthwise(self):
     meta_graph = _simple_metagraph(depthwise=True)
     config = config_pb2.ConfigProto()
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index c4c403119cf97070f1c36cc30008ee1dfa71f56f..2404c15e78e4bedcda748c01948ffaafea947c16 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -63,6 +63,7 @@ py_library(
         ":pil_for_keras",
         ":saving",
         "//tensorflow/python:training",
+        "//tensorflow/python/keras/mixed_precision/experimental:mixed_precision_experimental",
         "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/saved_model",
         "@keras_applications_archive//:keras_applications",
@@ -91,6 +92,7 @@ py_library(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:ctc_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
@@ -100,6 +102,7 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:init_ops_v2",
         "//tensorflow/python:logging_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:nn",
@@ -113,8 +116,10 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_coordinator",
     ],
 )
 
@@ -134,6 +139,7 @@ py_library(
         "engine/input_layer.py",
         "engine/input_spec.py",
         "engine/network.py",
+        "engine/partial_batch_padding_handler.py",
         "engine/saving.py",
         "engine/sequential.py",
         "engine/training.py",
@@ -151,17 +157,23 @@ py_library(
         ":activations",
         ":backend",
         ":callbacks",
+        ":callbacks_v1",
         ":constraints",
         ":engine_utils",
         ":initializers",
         ":losses",
+        ":mode_keys",
         ":optimizers",
         ":regularizers",
         ":saving",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable",
+        "//tensorflow/python/keras/mixed_precision/experimental:policy",
+        "//tensorflow/python/training/tracking:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
@@ -180,9 +192,9 @@ py_library(
     deps = [
         ":backend",
         ":engine_utils",
+        ":mode_keys",
         ":optimizers",
         "//tensorflow/python:lib",
-        "//tensorflow/python:mode_keys",
         "//tensorflow/python:saver",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/saved_model/model_utils",
@@ -210,6 +222,20 @@ py_library(
     deps = [
         ":backend",
         ":engine_utils",
+        ":mode_keys",
+    ],
+)
+
+py_library(
+    name = "callbacks_v1",
+    srcs = [
+        "callbacks_v1.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        "//tensorflow/python/eager:profiler",
     ],
 )
 
@@ -285,6 +311,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -297,38 +324,68 @@ py_library(
         "layers/convolutional_recurrent.py",
         "layers/core.py",
         "layers/cudnn_recurrent.py",
+        "layers/dense_attention.py",
         "layers/embeddings.py",
+        "layers/kernelized.py",
         "layers/local.py",
         "layers/merge.py",
         "layers/noise.py",
         "layers/normalization.py",
+        "layers/normalization_v2.py",
         "layers/pooling.py",
         "layers/recurrent.py",
+        "layers/recurrent_v2.py",
         "layers/serialization.py",
         "layers/wrappers.py",
-        "utils/generic_utils.py",
+        "utils/kernelized_utils.py",
         "utils/layer_utils.py",
         "utils/tf_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":engine",
+        ":generic_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:cudnn_rnn_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:standard_ops",
-        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/distribute:distribute_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "generic_utils",
+    srcs = [
+        "utils/generic_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "mode_keys",
+    srcs = [
+        "utils/mode_keys.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/saved_model/model_utils:mode_keys",
     ],
 )
 
@@ -341,6 +398,7 @@ tf_py_test(
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:nn_ops",
     ],
     shard_count = 12,
     tags = ["notsan"],
@@ -355,6 +413,7 @@ tf_py_test(
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:nn_ops",
     ],
 )
 
@@ -385,7 +444,7 @@ tf_py_test(
 
 tf_py_test(
     name = "regularizers_test",
-    size = "small",
+    size = "medium",
     srcs = ["regularizers_test.py"],
     additional_deps = [
         ":keras",
@@ -458,16 +517,29 @@ tf_py_test(
     shard_count = 4,
 )
 
+tf_py_test(
+    name = "metrics_correctness_test",
+    size = "medium",
+    srcs = ["metrics_correctness_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
 tf_py_test(
     name = "applications_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["applications/applications_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 2,
+    shard_count = 11,
 )
 
 tf_py_test(
@@ -490,11 +562,12 @@ tf_py_test(
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
 )
 
 tf_py_test(
     name = "convolutional_recurrent_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/convolutional_recurrent_test.py"],
     additional_deps = [
         ":keras",
@@ -502,7 +575,7 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 2,
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -516,6 +589,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -528,6 +602,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -541,7 +616,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 4,
-    tags = ["no_windows_gpu"],
+    tags = [
+        "no_rocm",
+        "no_windows_gpu",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -554,6 +633,10 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    # TODO(b/127881287): Re-enable.
+    tags = [
+        "no_windows_gpu",
+    ],
 )
 
 tf_py_test(
@@ -569,6 +652,18 @@ tf_py_test(
     shard_count = 3,
 )
 
+tf_py_test(
+    name = "dense_attention_test",
+    size = "medium",
+    srcs = ["layers/dense_attention_test.py"],
+    additional_deps = [
+        ":keras",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "embeddings_test",
     size = "medium",
@@ -578,6 +673,7 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -686,7 +782,20 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 4,
+    shard_count = 10,
+)
+
+cuda_py_test(
+    name = "recurrent_v2_test",
+    size = "medium",
+    srcs = ["layers/recurrent_v2_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 2,
 )
 
 cuda_py_test(
@@ -699,12 +808,13 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
-    name = "unified_lstm_test",
+    name = "lstm_v2_test",
     size = "medium",
-    srcs = ["layers/unified_lstm_test.py"],
+    srcs = ["layers/lstm_v2_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
@@ -712,19 +822,21 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     shard_count = 8,
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
-    name = "unified_gru_test",
+    name = "gru_v2_test",
     size = "medium",
-    srcs = ["layers/unified_gru_test.py"],
+    srcs = ["layers/gru_v2_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 6,
+    shard_count = 8,
+    tags = ["no_rocm"],
 )
 
 tf_py_test(
@@ -738,6 +850,32 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "kernelized_test",
+    size = "small",
+    srcs = ["layers/kernelized_test.py"],
+    additional_deps = [
+        ":backend",
+        ":initializers",
+        ":keras",
+        ":layers",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 tf_py_test(
     name = "wrappers_test",
     size = "medium",
@@ -755,6 +893,21 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "time_distributed_learning_phase_test",
+    size = "small",
+    srcs = ["layers/time_distributed_learning_phase_test.py"],
+    additional_deps = [
+        ":keras",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "noasan",  # http://b/78599823
+        "notsan",
+    ],
+)
+
 tf_py_test(
     name = "scikit_learn_test",
     size = "small",
@@ -770,7 +923,7 @@ tf_py_test(
 
 tf_py_test(
     name = "data_utils_test",
-    size = "large",
+    size = "medium",
     srcs = ["utils/data_utils_test.py"],
     additional_deps = [
         ":keras",
@@ -778,6 +931,7 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 6,
     tags = [
         "no_oss",
         "no_windows",
@@ -808,6 +962,27 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "composite_tensor_support_test",
+    size = "medium",
+    srcs = ["utils/composite_tensor_support_test.py"],
+    additional_deps = [
+        ":engine",
+        ":layers",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+    ],
+)
+
 tf_py_test(
     name = "io_utils_test",
     size = "small",
@@ -836,6 +1011,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "kernelized_utils_test",
+    size = "small",
+    srcs = ["utils/kernelized_utils_test.py"],
+    additional_deps = [
+        ":layers",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+    ],
+)
+
 cuda_py_test(
     name = "multi_gpu_utils_test",
     srcs = ["utils/multi_gpu_utils_test.py"],
@@ -849,6 +1036,7 @@ cuda_py_test(
         "guitar",
         "multi_gpu",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -861,6 +1049,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -921,6 +1110,20 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "callbacks_v1_test",
+    size = "medium",
+    srcs = ["callbacks_v1_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = ["notsan"],
 )
 
@@ -965,9 +1168,23 @@ tf_py_test(
     shard_count = 4,
 )
 
+tf_py_test(
+    name = "training_arrays_test",
+    size = "small",
+    srcs = ["engine/training_arrays_test.py"],
+    additional_deps = [
+        ":keras",
+        ":layers",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 tf_py_test(
     name = "training_generator_test",
-    size = "large",
+    size = "medium",
     srcs = ["engine/training_generator_test.py"],
     additional_deps = [
         ":keras",
@@ -975,9 +1192,10 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 3,
+    shard_count = 6,
     tags = [
         "no_oss",
+        "notap",  #TODO(b/123544294): Re-enable this test.
         "notsan",
     ],
 )
@@ -1053,7 +1271,7 @@ tf_py_test(
 
 tf_py_test(
     name = "base_layer_test",
-    size = "small",
+    size = "medium",
     srcs = ["engine/base_layer_test.py"],
     additional_deps = [
         ":keras",
@@ -1061,6 +1279,8 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 8,
+    tags = ["no_rocm"],
 )
 
 tf_py_test(
@@ -1073,6 +1293,7 @@ tf_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
 tf_py_test(
@@ -1113,6 +1334,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:util",
     ],
+    shard_count = 4,
 )
 
 tf_py_test(
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index f024b9b59a21df6e6771e89ef00428bcaaf49524..b7ec63837d92d11258a88b870e5af5be04c32e5e 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -25,6 +25,7 @@ from tensorflow.python.keras import activations
 from tensorflow.python.keras import applications
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import callbacks_v1
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import datasets
 from tensorflow.python.keras import estimator
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index 8f10aca02098d481153dc9e647ba24d076021028..3f5f125087305aed006b73098a059ccc2fca8dd0 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -26,6 +26,19 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import keras_export
 
+# b/123041942
+# In TF 2.x, if the `tf.nn.softmax` is used as an activation function in Keras
+# layers, it gets serialized as 'softmax_v2' instead of 'softmax' as the
+# internal method name is returned in serialization. This results in errors in
+# model exporting and loading as Keras can't find any activation function with
+# the name of `softmax_v2`.
+
+# This dict maps the activation function name from its v2 version to its
+# canonical name.
+_TF_ACTIVATIONS_V2 = {
+    'softmax_v2': 'softmax',
+}
+
 
 @keras_export('keras.activations.softmax')
 def softmax(x, axis=-1):
@@ -152,16 +165,41 @@ def relu(x, alpha=0., max_value=None, threshold=0):
 
 @keras_export('keras.activations.tanh')
 def tanh(x):
+  """Hyperbolic Tangent activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The tanh activation: `tanh(x) = sinh(x)/cosh(x) = ((exp(x) -
+      exp(-x))/(exp(x) + exp(-x)))`.
+  """
   return nn.tanh(x)
 
 
 @keras_export('keras.activations.sigmoid')
 def sigmoid(x):
+  """Sigmoid activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The sigmoid activation: `(1.0 / (1.0 + exp(-x)))`.
+  """
   return nn.sigmoid(x)
 
 
 @keras_export('keras.activations.exponential')
 def exponential(x):
+  """Exponential activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The exponential activation: `exp(x)`.
+  """
   return math_ops.exp(x)
 
 
@@ -185,11 +223,21 @@ def hard_sigmoid(x):
 
 @keras_export('keras.activations.linear')
 def linear(x):
+  """Linear activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The linear activation: `x`.
+  """
   return x
 
 
 @keras_export('keras.activations.serialize')
 def serialize(activation):
+  if activation.__name__ in _TF_ACTIVATIONS_V2:
+    return _TF_ACTIVATIONS_V2[activation.__name__]
   return activation.__name__
 
 
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index 33001f419ef076b1473b5407bc6a5ba4ee788104..a23b13bb5c8df0dc760aa493f3fddec6c264b43c 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import nn_ops as nn
 from tensorflow.python.platform import test
 
 
@@ -46,6 +47,14 @@ class KerasActivationsTest(test.TestCase):
       fn = keras.activations.deserialize(config)
       assert fn == ref_fn
 
+  def test_serialization_v2(self):
+    activation_map = {nn.softmax_v2: 'softmax'}
+    for fn_v2_key in activation_map:
+      fn_v2 = keras.activations.get(fn_v2_key)
+      config = keras.activations.serialize(fn_v2)
+      fn = keras.activations.deserialize(config)
+      assert fn.__name__ == activation_map[fn_v2_key]
+
   def test_softmax(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.softmax(x)])
@@ -136,10 +145,14 @@ class KerasActivationsTest(test.TestCase):
   def test_relu(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.relu(x)])
-    test_values = np.random.random((2, 5))
-    result = f([test_values])[0]
-    # No negative values in test values...
-    self.assertAllClose(result, test_values, rtol=1e-05)
+    positive_values = np.random.random((2, 5))
+    result = f([positive_values])[0]
+    self.assertAllClose(result, positive_values, rtol=1e-05)
+
+    negative_values = np.random.uniform(-1, 0, (2, 5))
+    result = f([negative_values])[0]
+    expected = np.zeros((2, 5))
+    self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_elu(self):
     x = keras.backend.placeholder(ndim=2)
diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py
index b15ca5990aef9bed088cccd0dea1be049386eaf2..ad6b58992a9195975fbee8d4b81d8f810ab6e3df 100644
--- a/tensorflow/python/keras/applications/applications_test.py
+++ b/tensorflow/python/keras/applications/applications_test.py
@@ -32,13 +32,11 @@ MODEL_LIST = [
     (applications.InceptionV3, 2048),
     (applications.InceptionResNetV2, 1536),
     (applications.MobileNet, 1024),
-    # TODO(fchollet): enable MobileNetV2 tests when a new TensorFlow test image
-    # is released with keras_applications upgraded to 1.0.5 or above.
+    (applications.MobileNetV2, 1280),
     (applications.DenseNet121, 1024),
     (applications.DenseNet169, 1664),
     (applications.DenseNet201, 1920),
     (applications.NASNetMobile, 1056),
-    (applications.NASNetLarge, 4032),
 ]
 
 
@@ -47,7 +45,8 @@ class ApplicationsTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters(*MODEL_LIST)
   def test_feature_extration_model(self, model_fn, output_dim):
     model = model_fn(include_top=False, weights=None)
-    self.assertEqual(model.output_shape, (None, None, None, output_dim))
+    self.assertLen(model.output_shape, 4)
+    self.assertEqual(model.output_shape[-1], output_dim)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 9677f12d34470e0be3678c1f05bb9de6ff83f44a..fa34927eba50233932eee16f4c1ad54be9a274e4 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -32,8 +32,12 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import func_graph
@@ -51,6 +55,7 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn as map_fn_lib
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -60,7 +65,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-
+from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
@@ -75,6 +80,9 @@ py_sum = sum
 # while executing eagerly (such as the functional API for model-building).
 _GRAPH = None
 
+# A graph which is used for constructing functions in eager mode.
+_CURRENT_SCRATCH_GRAPH = None
+
 # This is a thread local object that will hold the default internal TF session
 # used by Keras. It can be set manually via `set_session(sess)`.
 _SESSION = threading.local()
@@ -87,11 +95,8 @@ _GRAPH_LEARNING_PHASES = weakref.WeakKeyDictionary()
 
 # _DUMMY_EAGER_GRAPH is used as a key in _GRAPH_LEARNING_PHASES.
 # We keep a separate reference to it to make sure it does not get removed from
-# _GRAPH_LEARNING_PHASES. We use a dummy class instead of something like a
-# string because strings are not weakly-referencable.
-class _DummyEagerGraph(object):
-  pass
-_DUMMY_EAGER_GRAPH = _DummyEagerGraph()
+# _GRAPH_LEARNING_PHASES.
+_DUMMY_EAGER_GRAPH = threading.local()
 
 # This boolean flag can be set to True to leave variable initialization
 # up to the user.
@@ -218,8 +223,9 @@ def clear_session():
   _SESSION.session = None
   graph = get_graph()
   with graph.as_default():
-    phase = array_ops.placeholder_with_default(
-        False, shape=(), name='keras_learning_phase')
+    with ops.name_scope(''):
+      phase = array_ops.placeholder_with_default(
+          False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES = {}
     _GRAPH_LEARNING_PHASES[graph] = phase
     _GRAPH_VARIABLES.pop(graph, None)
@@ -254,20 +260,33 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  if context.executing_eagerly():
-    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-      # Fallback to inference mode as default.
-      return 0
-    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
-  return symbolic_learning_phase()
+  if ops.get_default_graph() is _GRAPH:
+    # Don't enter an init_scope for the learning phase if eager execution
+    # is enabled but we're inside the Keras workspace graph.
+    return symbolic_learning_phase()
+  with ops.init_scope():
+    # We always check & set the learning phase inside the init_scope,
+    # otherwise the wrong default_graph will be used to look up the learning
+    # phase inside of functions & defuns.
+    #
+    # This is because functions & defuns (both in graph & in eager mode)
+    # will always execute non-eagerly using a function-specific default
+    # subgraph.
+    if context.executing_eagerly():
+      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+        # Fallback to inference mode as default.
+        return 0
+      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+    return symbolic_learning_phase()
 
 
 def symbolic_learning_phase():
   graph = get_graph()
   with graph.as_default():
     if graph not in _GRAPH_LEARNING_PHASES:
-      phase = array_ops.placeholder_with_default(
-          False, shape=(), name='keras_learning_phase')
+      with ops.name_scope(''):
+        phase = array_ops.placeholder_with_default(
+            False, shape=(), name='keras_learning_phase')
       _GRAPH_LEARNING_PHASES[graph] = phase
     return _GRAPH_LEARNING_PHASES[graph]
 
@@ -287,11 +306,25 @@ def set_learning_phase(value):
     raise ValueError('Expected learning phase to be 0 or 1.')
   with ops.init_scope():
     if context.executing_eagerly():
+      # In an eager context, the learning phase values applies to both the eager
+      # context and the internal Keras graph.
       _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
-    else:
-      _GRAPH_LEARNING_PHASES[get_graph()] = value
+    _GRAPH_LEARNING_PHASES[get_graph()] = value
 
 
+def set_eager_learning_phase(value):
+  """Internal utility that sets the learning phase in eager execution only.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+
+
+@keras_export('keras.backend.learning_phase_scope')
 @tf_contextlib.contextmanager
 def learning_phase_scope(value):
   """Provides a scope within which the learning phase is equal to `value`.
@@ -302,47 +335,102 @@ def learning_phase_scope(value):
      value: Learning phase value, either 0 or 1 (integers).
 
   Yields:
-    The provided value.
+    None.
 
   Raises:
      ValueError: if `value` is neither `0` nor `1`.
   """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be 0 or 1.')
-  previous_value = learning_phase()
+
+  with ops.init_scope():
+    if context.executing_eagerly():
+      previous_eager_value = _GRAPH_LEARNING_PHASES.get(
+          _DUMMY_EAGER_GRAPH, None)
+    previous_graph_value = _GRAPH_LEARNING_PHASES.get(get_graph(), None)
+
   try:
     set_learning_phase(value)
-    yield value
+    yield
   finally:
     # Restore learning phase to initial value.
     with ops.init_scope():
       if context.executing_eagerly():
-        _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
-      else:
-        _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
+        if previous_eager_value is not None:
+          _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_eager_value
+        elif _DUMMY_EAGER_GRAPH in _GRAPH_LEARNING_PHASES:
+          del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+
+      graph = get_graph()
+      if previous_graph_value is not None:
+        _GRAPH_LEARNING_PHASES[graph] = previous_graph_value
+      elif graph in _GRAPH_LEARNING_PHASES:
+        del _GRAPH_LEARNING_PHASES[graph]
 
+@tf_contextlib.contextmanager
+def eager_learning_phase_scope(value):
+  """Internal scope that sets the learning phase in eager execution only.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
 
-def _get_session():
+  Yields:
+    None.
+
+  Raises:
+     ValueError: if `value` is neither `0` nor `1`.
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  previous_value = learning_phase()
+  try:
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+    yield
+  finally:
+    # Restore learning phase to initial value.
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
+
+
+def _current_graph(op_input_list):
+  """Return the graph members of `op_input_list`, or the current graph."""
+  return ops._get_graph_from_inputs(op_input_list)
+
+
+def _get_session(op_input_list=()):
   """Returns the session object for the current thread."""
   global _SESSION
   default_session = ops.get_default_session()
   if default_session is not None:
     session = default_session
   else:
-    if getattr(_SESSION, 'session', None) is None:
-      _SESSION.session = session_module.Session(
-          config=get_default_session_config())
+    if ops.inside_function():
+      raise RuntimeError('Cannot get session inside Tensorflow graph function.')
+    # If we don't have a session, or that session does not match the current
+    # graph, create and cache a new session.
+    if (getattr(_SESSION, 'session', None) is None or
+        _SESSION.session.graph is not _current_graph(op_input_list)):
+      # If we are creating the Session inside a tf.distribute.Strategy scope,
+      # we ask the strategy for the right session options to use.
+      if distribution_strategy_context.has_strategy():
+        configure_and_create_distributed_session(
+            distribution_strategy_context.get_strategy())
+      else:
+        _SESSION.session = session_module.Session(
+            config=get_default_session_config())
     session = _SESSION.session
   return session
 
 
 @keras_export(v1=['keras.backend.get_session'])
-def get_session():
+def get_session(op_input_list=()):
   """Returns the TF session to be used by the backend.
 
   If a default TensorFlow session is available, we will return it.
 
-  Else, we will return the global Keras session.
+  Else, we will return the global Keras session assuming it matches
+  the current graph.
 
   If no global Keras session exists at this point:
   we will create a new global session.
@@ -350,10 +438,15 @@ def get_session():
   Note that you can manually set the global session
   via `K.set_session(sess)`.
 
+  Arguments:
+      op_input_list: An option sequence of tensors or ops, which will be used
+        to determine the current graph. Otherwise the default graph will be
+        used.
+
   Returns:
       A TensorFlow session.
   """
-  session = _get_session()
+  session = _get_session(op_input_list)
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
@@ -370,6 +463,40 @@ def get_graph():
     return ops.get_default_graph()
 
 
+@tf_contextlib.contextmanager
+def _scratch_graph(graph=None):
+  """Retrieve a shared and temporary func graph.
+
+  The eager execution path lifts a subgraph from the keras global graph into
+  a scratch graph in order to create a function. DistributionStrategies, in
+  turn, constructs multiple functions as well as a final combined function. In
+  order for that logic to work correctly, all of the functions need to be
+  created on the same scratch FuncGraph.
+
+  Args:
+    graph: A graph to be used as the current scratch graph. If not set then
+      a scratch graph will either be retrieved or created:
+
+  Yields:
+    The current scratch graph.
+  """
+  global _CURRENT_SCRATCH_GRAPH
+  if (_CURRENT_SCRATCH_GRAPH is not None and graph is not None and
+      _CURRENT_SCRATCH_GRAPH is not graph):
+    raise ValueError('Multiple scratch graphs specified.')
+
+  if _CURRENT_SCRATCH_GRAPH:
+    yield _CURRENT_SCRATCH_GRAPH
+    return
+
+  graph = graph or func_graph.FuncGraph('keras_scratch_graph')
+  try:
+    _CURRENT_SCRATCH_GRAPH = graph
+    yield graph
+  finally:
+    _CURRENT_SCRATCH_GRAPH = None
+
+
 @keras_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
@@ -387,7 +514,9 @@ def get_default_session_config():
   else:
     num_thread = int(os.environ.get('OMP_NUM_THREADS'))
     config = config_pb2.ConfigProto(
-        intra_op_parallelism_threads=num_thread, allow_soft_placement=True)
+        intra_op_parallelism_threads=num_thread,
+        inter_op_parallelism_threads=num_thread,
+        allow_soft_placement=True)
   return config
 
 
@@ -657,6 +786,14 @@ def constant(value, dtype=None, shape=None, name=None):
   """
   if dtype is None:
     dtype = floatx()
+
+  # If the outer context is eager but we are executing under the keras
+  # FuncGraph, we create EagerTensors and use them as constants.
+  if (ops.executing_eagerly_outside_functions() and
+      getattr(get_graph(), 'name', '') == 'keras_graph'):
+    with ops.init_scope():
+      return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
+
   return constant_op.constant(value, dtype=dtype, shape=shape, name=name)
 
 
@@ -883,10 +1020,10 @@ def dtype(x):
       # Keras variable
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]))
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]), dtype='float32')
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
   ```
   """
   return x.dtype.base_dtype.name
@@ -1517,7 +1654,7 @@ def min(x, axis=None, keepdims=False):
           the reduced dimension is retained with length 1.
 
   Returns:
-      A tensor with miminum values of `x`.
+      A tensor with minimum values of `x`.
   """
   return math_ops.reduce_min(x, axis, keepdims)
 
@@ -1558,6 +1695,7 @@ def prod(x, axis=None, keepdims=False):
   return math_ops.reduce_prod(x, axis, keepdims)
 
 
+@keras_export('keras.backend.cumsum')
 def cumsum(x, axis=0):
   """Cumulative sum of the values in a tensor, alongside the specified axis.
 
@@ -1571,6 +1709,7 @@ def cumsum(x, axis=0):
   return math_ops.cumsum(x, axis=axis)
 
 
+@keras_export('keras.backend.cumprod')
 def cumprod(x, axis=0):
   """Cumulative product of the values in a tensor, alongside the specified axis.
 
@@ -2678,7 +2817,7 @@ def get_value(x):
       return x.numpy()
   elif ops.inside_function():
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
-  return x.eval(session=get_session())
+  return x.eval(session=get_session((x,)))
 
 
 @keras_export('keras.backend.batch_get_value')
@@ -2699,7 +2838,7 @@ def batch_get_value(tensors):
   elif ops.inside_function():  # pylint: disable=protected-access
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   if tensors:
-    return get_session().run(tensors)
+    return get_session(tensors).run(tensors)
   else:
     return []
 
@@ -2818,8 +2957,11 @@ class GraphExecutionFunction(object):
                       'should be a list or tuple.')
     self.inputs = nest.flatten(inputs)
     self._outputs_structure = outputs
-    self.outputs = nest.flatten(outputs)
-    with ops.control_dependencies(self.outputs):
+    self.outputs = cast_variables_to_tensor(nest.flatten(outputs))
+    # TODO(b/127668432): Consider using autograph to generate these
+    # dependencies in call.
+    # Index 0 = total loss or model output for `predict`.
+    with ops.control_dependencies([self.outputs[0]]):
       updates_ops = []
       for update in updates:
         if isinstance(update, tuple):
@@ -2917,7 +3059,7 @@ class GraphExecutionFunction(object):
   def __call__(self, inputs):
     inputs = nest.flatten(inputs)
 
-    session = get_session()
+    session = get_session(inputs)
     feed_arrays = []
     array_vals = []
     feed_symbols = []
@@ -2974,48 +3116,79 @@ class EagerExecutionFunction(object):
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None):
+    self.name = name
+    self._outputs_structure = outputs
+    inputs = nest.flatten(inputs)
+    outputs = nest.flatten(outputs)
+
     updates = updates or []
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = nest.flatten(inputs)
-    self._outputs_structure = outputs
-    self.outputs = nest.flatten(outputs)
-    self.name = name
 
-    graph = get_graph()
+    if updates and not outputs:
+      # Edge case; never happens in practice
+      raise ValueError('Cannot create a Keras backend function with updates'
+                       ' but no outputs during eager execution.')
+
+    graphs = {i.graph for i in nest.flatten([inputs, outputs, updates])
+              if hasattr(i, 'graph')}
+    if len(graphs) > 1:
+      raise ValueError('Cannot create an execution function which is comprised '
+                       'of elements from multiple graphs.')
+
+    source_graph = graphs.pop()
+    global_graph = get_graph()
+
+    updates_ops = []
+    legacy_update_ops = []
+    for update in updates:
+      # For legacy reasons it is allowed to pass an update as a tuple
+      # `(variable, new_value)` (this maps to an assign op). Otherwise it
+      # is assumed to already be an op -- we cannot control its execution
+      # order.
+      if isinstance(update, tuple):
+        legacy_update_ops.append(update)
+      else:
+        if hasattr(update, 'op'):
+          update = update.op
+        updates_ops.append(update)
+
+    with _scratch_graph() as exec_graph:
+      global_graph = get_graph()
+      if source_graph not in (exec_graph, global_graph):
+        raise ValueError('Unknown graph. Aborting.')
+
+      if source_graph is global_graph and exec_graph is not global_graph:
+        init_tensors = (
+            outputs + updates_ops + [p for [p, _] in legacy_update_ops] +
+            [p_new for [_, p_new] in legacy_update_ops
+             if isinstance(p_new, ops.Tensor)])
+        lifted_map = lift_to_graph.lift_to_graph(
+            init_tensors=init_tensors, graph=exec_graph, sources=inputs,
+            add_sources=True, handle_captures=True, base_graph=source_graph)
+
+        inputs = [lifted_map[i] for i in inputs]
+        outputs = [lifted_map[i] for i in outputs]
+        updates_ops = [lifted_map[i] for i in updates_ops]
+        legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new))
+                             for p, p_new in legacy_update_ops]
+
     # Consolidate updates
-    with graph.as_default():
-      with ops.control_dependencies(self.outputs):
-        # In general, updates should be run after the outputs have been
-        # computed. However, we can only ensure this when we create
-        # the updates here (i.e. when updates are passed as tuples).
-        # We cannot modify the control dependencies of preexisting update ops.
-        updates_ops = []
-        for update in updates:
-          # For legacy reasons it is allowed to pass an update as a tuple
-          # `(variable, new_value)` (this maps to an assign op).
-          if isinstance(update, tuple):
-            p, new_p = update
-            updates_ops.append(state_ops.assign(p, new_p))
-          else:
-            # Assumed already an op -- we cannot control its execution order.
-            updates_ops.append(update)
-
-      # We set the update ops to run at the end by conditioning it on output[0]
-      if updates and not self.outputs:
-        # Edge case; never happens in practice
-        raise ValueError('Cannot create a Keras backend function with updates'
-                         ' but no outputs during eager execution.')
+    with exec_graph.as_default():
+      outputs = cast_variables_to_tensor(outputs)
+      with ops.control_dependencies(outputs):
+        for p, p_new in legacy_update_ops:
+          updates_ops.append(state_ops.assign(p, p_new))
+
+      self.inputs, self.outputs = inputs, outputs
       with ops.control_dependencies(updates_ops):
         self.outputs[0] = array_ops.identity(self.outputs[0])
 
-    # Prepare graph function
-    # TODO(fchollet): can we restrict `captures` to variables actually used in
-    # the relevant subgraph?
-    graph.inputs = self.inputs + list(graph.captures.values())
-    graph.outputs = self.outputs
-    graph_fn = eager_function.ConcreteFunction(graph)
+      exec_graph.inputs = self.inputs + list(exec_graph.captures.values())
+      exec_graph.outputs = self.outputs
+      graph_fn = eager_function.ConcreteFunction(exec_graph)
+
     graph_fn._num_positional_args = len(self.inputs)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
@@ -3023,7 +3196,7 @@ class EagerExecutionFunction(object):
     # Handle placeholders with default
     # (treated as required placeholder by graph functions)
     self._placeholder_default_values = {}
-    with graph.as_default():
+    with exec_graph.as_default():
       for x in self.inputs:
         if x.op.type == 'PlaceholderWithDefault':
           self._placeholder_default_values[x] = tensor_util.constant_value(
@@ -3234,7 +3407,7 @@ def rnn(step_function,
   if unroll:
     if not time_steps:
       raise ValueError('Unrolling requires a fixed number of timesteps.')
-    states = initial_states
+    states = tuple(initial_states)
     successive_states = []
     successive_outputs = []
 
@@ -3266,7 +3439,8 @@ def rnn(step_function,
       for i in range(time_steps):
         inp = _get_input_tensor(i)
         mask_t = mask_list[i]
-        output, new_states = step_function(inp, states + constants)
+        output, new_states = step_function(inp,
+                                           tuple(states) + tuple(constants))
         tiled_mask_t = _expand_mask(mask_t, output)
 
         if not successive_outputs:
@@ -3301,7 +3475,7 @@ def rnn(step_function,
     else:
       for i in range(time_steps):
         inp = _get_input_tensor(i)
-        output, states = step_function(inp, states + constants)
+        output, states = step_function(inp, tuple(states) + tuple(constants))
         successive_outputs.append(output)
         successive_states.append(states)
       last_output = successive_outputs[-1]
@@ -3739,11 +3913,11 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1):
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
   if not from_logits:
-    if context.executing_eagerly() or output.op.type != 'Softmax':
+    if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or
+        output.op.type != 'Softmax'):
       axis = axis % len(output.shape)
       # scale preds so that the class probas of each sample sum to 1
       output = output / math_ops.reduce_sum(output, axis, True)
-
       # Compute cross entropy from probabilities.
       epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
       output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
@@ -3780,7 +3954,8 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
   if not from_logits:
-    if context.executing_eagerly() or output.op.type != 'Softmax':
+    if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or
+        output.op.type != 'Softmax'):
       epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
       output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
       output = math_ops.log(output)
@@ -3825,7 +4000,8 @@ def binary_crossentropy(target, output, from_logits=False):
       A tensor.
   """
   if not from_logits:
-    if context.executing_eagerly() or output.op.type != 'Sigmoid':
+    if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or
+        output.op.type != 'Sigmoid'):
       epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
       output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
 
@@ -3904,12 +4080,9 @@ def dropout(x, level, noise_shape=None, seed=None):
   Returns:
       A tensor.
   """
-  retain_prob = 1. - level
   if seed is None:
     seed = np.random.randint(10e6)
-  # the dummy 1. works around a TF bug
-  # (float32_ref vs. float32 incompatibility)
-  return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
+  return nn.dropout_v2(x, rate=level, noise_shape=noise_shape, seed=seed)
 
 
 @keras_export('keras.backend.l2_normalize')
@@ -4069,8 +4242,8 @@ def conv1d(x,
   x = nn.convolution(
       input=x,
       filter=kernel,
-      dilation_rate=(dilation_rate,),
-      strides=(strides,),
+      dilation_rate=dilation_rate,
+      strides=strides,
       padding=padding,
       data_format=tf_data_format)
   if data_format == 'channels_first' and tf_data_format == 'NWC':
@@ -4291,6 +4464,7 @@ def separable_conv2d(x,
   Raises:
       ValueError: if `data_format` is neither `channels_last` or
       `channels_first`.
+      ValueError: if `strides` is not a tuple of 2 integers.
   """
   if data_format is None:
     data_format = image_data_format()
@@ -4498,6 +4672,8 @@ def pool2d(x,
   Raises:
       ValueError: if `data_format` is neither `"channels_last"` or
       `"channels_first"`.
+      ValueError: if `pool_size` is not a tuple of 2 integers.
+      ValueError: if `strides` is not a tuple of 2 integers.
       ValueError: if `pool_mode` is neither `"max"` or `"avg"`.
   """
   if data_format is None:
@@ -4662,6 +4838,7 @@ def local_conv(inputs,
   return permute_dimensions(output, permutation)
 
 
+@keras_export('keras.backend.local_conv1d')
 def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
   """Apply 1D conv with un-shared weights.
 
@@ -4696,6 +4873,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
                     data_format)
 
 
+@keras_export('keras.backend.local_conv2d')
 def local_conv2d(inputs,
                  kernel,
                  kernel_size,
@@ -4955,7 +5133,8 @@ def ctc_label_dense_to_sparse(labels, label_lengths):
   vals_sparse = array_ops.gather_nd(labels, indices)
 
   return sparse_tensor.SparseTensor(
-      math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
+      math_ops.cast(indices, dtypes_module.int64), vals_sparse,
+      math_ops.cast(label_shape, dtypes_module.int64))
 
 
 @keras_export('keras.backend.ctc_batch_cost')
@@ -4976,10 +5155,12 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
       Tensor with shape (samples,1) containing the
           CTC loss of each element.
   """
-  label_length = math_ops.to_int32(array_ops.squeeze(label_length, axis=-1))
-  input_length = math_ops.to_int32(array_ops.squeeze(input_length, axis=-1))
-  sparse_labels = math_ops.to_int32(
-      ctc_label_dense_to_sparse(y_true, label_length))
+  label_length = math_ops.cast(
+      array_ops.squeeze(label_length, axis=-1), dtypes_module.int32)
+  input_length = math_ops.cast(
+      array_ops.squeeze(input_length, axis=-1), dtypes_module.int32)
+  sparse_labels = math_ops.cast(
+      ctc_label_dense_to_sparse(y_true, label_length), dtypes_module.int32)
 
   y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
 
@@ -5018,7 +5199,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
               the log probability of each decoded sequence.
   """
   y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
-  input_length = math_ops.to_int32(input_length)
+  input_length = math_ops.cast(input_length, dtypes_module.int32)
 
   if greedy:
     (decoded, log_prob) = ctc.ctc_greedy_decoder(
@@ -5053,7 +5234,7 @@ def map_fn(fn, elems, name=None, dtype=None):
   Returns:
       Tensor with dtype `dtype`.
   """
-  return functional_ops.map_fn(fn, elems, name=name, dtype=dtype)
+  return map_fn_lib.map_fn(fn, elems, name=name, dtype=dtype)
 
 
 @keras_export('keras.backend.foldl')
@@ -5135,3 +5316,70 @@ if not os.path.exists(_config_path):
   except IOError:
     # Except permission denied.
     pass
+
+
+def in_multi_worker_mode():
+  """Whether we are operating in a Multi-Worker setting."""
+  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
+  cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
+  return tf_config and 'master' not in cluster_spec.jobs
+
+
+def configure_and_create_distributed_session(distribution_strategy):
+  """Configure session config and create a session with it."""
+
+  def _create_session(distribution_strategy):
+    """Create the Distributed Strategy session."""
+    session_config = get_default_session_config()
+
+    # If a session already exists, merge in its config; in the case there is a
+    # conflict, take values of the existing config.
+    global _SESSION
+    if getattr(_SESSION, 'session', None) and _SESSION.session._config:
+      session_config.MergeFrom(_SESSION.session._config)
+
+    if is_tpu_strategy(distribution_strategy):
+      # TODO(priyag, yuefengz): Remove this workaround when Distribute
+      # Coordinator is integrated with keras and we can create a session from
+      # there.
+      distribution_strategy.configure(session_config)
+      master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
+      session = session_module.Session(config=session_config, target=master)
+    else:
+      worker_context = dc_context.get_current_worker_context()
+      if worker_context:
+        dc_session_config = worker_context.session_config
+        # Merge the default session config to the one from distribute
+        # coordinator, which is fine for now since they don't have
+        # conflicting configurations.
+        dc_session_config.MergeFrom(session_config)
+        session = session_module.Session(
+            config=dc_session_config, target=worker_context.master_target)
+      else:
+        distribution_strategy.configure(session_config)
+        session = session_module.Session(config=session_config)
+
+    set_session(session)
+
+  if in_multi_worker_mode():
+    dc.run_distribute_coordinator(
+        _create_session,
+        distribution_strategy,
+        mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+  else:
+    _create_session(distribution_strategy)
+
+
+def is_tpu_strategy(strategy):
+  """We're executing TPU Strategy."""
+  return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def cast_variables_to_tensor(tensors):
+
+  def _cast_variables_to_tensor(tensor):
+    if isinstance(tensor, variables_module.Variable):
+      return array_ops.identity(tensor)
+    return tensor
+
+  return nest.map_structure(_cast_variables_to_tensor, tensors)
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index e3f86ee2c09d8a8e93bc0e11d9e7a915313461c1..1084c32bdd05e99da1e43bae72c2d52342dad483 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -108,34 +108,61 @@ class BackendUtilsTest(test.TestCase):
 
   def test_learning_phase(self):
     with self.cached_session() as sess:
-      keras.backend.set_learning_phase(1)
-      self.assertEqual(keras.backend.learning_phase(), 1)
       with self.assertRaises(ValueError):
         keras.backend.set_learning_phase(2)
 
       # Test running with a learning-phase-consuming layer
-      keras.backend.set_learning_phase(0)
-      x = keras.Input((3,))
-      y = keras.layers.BatchNormalization()(x)
-      if not context.executing_eagerly():
-        self.evaluate(variables.global_variables_initializer())
-        sess.run(y, feed_dict={x: np.random.random((2, 3))})
+      with keras.backend.learning_phase_scope(0):
+        x = keras.Input((3,))
+        y = keras.layers.BatchNormalization()(x)
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          sess.run(y, feed_dict={x: np.random.random((2, 3))})
+
+  def test_learning_phase_name(self):
+    with ops.name_scope('test_scope'):
+      # Test that outer name scopes do not affect the learning phase's name.
+      lp = keras.backend.symbolic_learning_phase()
+    self.assertEqual(lp.name, 'keras_learning_phase:0')
 
   def test_learning_phase_scope(self):
-    with self.cached_session():
-      initial_learning_phase = keras.backend.learning_phase()
-      with keras.backend.learning_phase_scope(1) as lp:
-        self.assertEqual(lp, 1)
-        self.assertEqual(keras.backend.learning_phase(), 1)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with keras.backend.learning_phase_scope(0) as lp:
-        self.assertEqual(lp, 0)
-        self.assertEqual(keras.backend.learning_phase(), 0)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with self.assertRaises(ValueError):
-        with keras.backend.learning_phase_scope(None):
-          pass
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    initial_learning_phase = keras.backend.learning_phase()
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with keras.backend.learning_phase_scope(0):
+      self.assertEqual(keras.backend.learning_phase(), 0)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with self.assertRaises(ValueError):
+      with keras.backend.learning_phase_scope(None):
+        pass
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+
+    new_learning_phase = 0
+    keras.backend.set_learning_phase(new_learning_phase)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+
+  def test_learning_phase_scope_in_graph(self):
+    initial_learning_phase_outside_graph = keras.backend.learning_phase()
+    with keras.backend.get_graph().as_default():
+      initial_learning_phase_in_graph = keras.backend.learning_phase()
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+
+    with keras.backend.get_graph().as_default():
+      self.assertEqual(keras.backend.learning_phase(),
+                       initial_learning_phase_in_graph)
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
 
   def test_int_shape(self):
     x = keras.backend.ones(shape=(3, 4))
@@ -146,21 +173,20 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(keras.backend.int_shape(x), (None, 4))
 
   def test_in_train_phase(self):
-    with self.cached_session():
-      y1 = keras.backend.variable(1)
-      y2 = keras.backend.variable(2)
-      if context.executing_eagerly():
-        with keras.backend.learning_phase_scope(0):
-          y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
-        with keras.backend.learning_phase_scope(1):
-          y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
-      else:
-        y = keras.backend.in_train_phase(y1, y2)
-        f = keras.backend.function([keras.backend.learning_phase()], [y])
-        y_val_test = f([0])[0]
-        y_val_train = f([1])[0]
-      self.assertAllClose(y_val_test, 2)
-      self.assertAllClose(y_val_train, 1)
+    y1 = keras.backend.variable(1)
+    y2 = keras.backend.variable(2)
+    if context.executing_eagerly():
+      with keras.backend.learning_phase_scope(0):
+        y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
+      with keras.backend.learning_phase_scope(1):
+        y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
+    else:
+      y = keras.backend.in_train_phase(y1, y2)
+      f = keras.backend.function([keras.backend.learning_phase()], [y])
+      y_val_test = f([0])[0]
+      y_val_train = f([1])[0]
+    self.assertAllClose(y_val_test, 2)
+    self.assertAllClose(y_val_train, 1)
 
   def test_is_keras_tensor(self):
     x = keras.backend.variable(1)
@@ -187,74 +213,63 @@ class BackendUtilsTest(test.TestCase):
 class BackendVariableTest(test.TestCase):
 
   def test_zeros(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones(self):
-    with self.cached_session():
-      x = keras.backend.ones((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.ones((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_eye(self):
-    with self.cached_session():
-      x = keras.backend.eye(4)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.eye(4))
+    x = keras.backend.eye(4)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.eye(4))
 
   def test_zeros_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.zeros_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.zeros_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.ones_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.ones_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_random_uniform_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1.5, atol=1e-1)
-      self.assertAllClose(val.max(), 2., atol=1e-1)
-      self.assertAllClose(val.min(), 1., atol=1e-1)
+    x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1.5, atol=1e-1)
+    self.assertAllClose(val.max(), 2., atol=1e-1)
+    self.assertAllClose(val.min(), 1., atol=1e-1)
 
   def test_random_normal_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_normal_variable((30, 20), 1., 0.5,
-                                               seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1., atol=1e-1)
-      self.assertAllClose(val.std(), 0.5, atol=1e-1)
+    x = keras.backend.random_normal_variable((30, 20), 1., 0.5, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1., atol=1e-1)
+    self.assertAllClose(val.std(), 0.5, atol=1e-1)
 
   def test_count_params(self):
-    with self.cached_session():
-      x = keras.backend.zeros((4, 5))
-      val = keras.backend.count_params(x)
-      self.assertAllClose(val, 20)
+    x = keras.backend.zeros((4, 5))
+    val = keras.backend.count_params(x)
+    self.assertAllClose(val, 20)
 
   def test_constant(self):
-    with self.cached_session():
-      ref_val = np.random.random((3, 4)).astype('float32')
-      x = keras.backend.constant(ref_val)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, ref_val)
+    ref_val = np.random.random((3, 4)).astype('float32')
+    x = keras.backend.constant(ref_val)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, ref_val)
 
   def test_sparse_variable(self):
-    with self.cached_session():
-      val = scipy.sparse.eye(10)
-      x = keras.backend.variable(val)
-      self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
+    val = scipy.sparse.eye(10)
+    x = keras.backend.variable(val)
+    self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
 
-      y = keras.backend.to_dense(x)
-      self.assertFalse(keras.backend.is_sparse(y))
+    y = keras.backend.to_dense(x)
+    self.assertFalse(keras.backend.is_sparse(y))
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -264,18 +279,18 @@ class BackendLinearAlgebraTest(test.TestCase):
     x = keras.backend.ones(shape=(2, 3))
     y = keras.backend.ones(shape=(3, 4))
     xy = keras.backend.dot(x, y)
-    self.assertEqual(xy.get_shape().as_list(), [2, 4])
+    self.assertEqual(xy.shape.as_list(), [2, 4])
 
     x = keras.backend.ones(shape=(32, 28, 3))
     y = keras.backend.ones(shape=(3, 4))
     xy = keras.backend.dot(x, y)
-    self.assertEqual(xy.get_shape().as_list(), [32, 28, 4])
+    self.assertEqual(xy.shape.as_list(), [32, 28, 4])
 
   def test_batch_dot(self):
     x = keras.backend.ones(shape=(32, 20, 1))
     y = keras.backend.ones(shape=(32, 30, 20))
     xy = keras.backend.batch_dot(x, y, axes=[1, 2])
-    self.assertEqual(xy.get_shape().as_list(), [32, 1, 30])
+    self.assertEqual(xy.shape.as_list(), [32, 1, 30])
 
     # TODO(fchollet): insufficiently tested.
 
@@ -292,20 +307,19 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.argmax, np.argmax),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': 1},
-                                         np_kwargs={'axis': 1})
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': -1},
-                                         np_kwargs={'axis': -1})
-        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
-          compare_single_input_op_to_numpy(keras_op, np_op,
-                                           input_shape=(4, 7, 5),
-                                           keras_kwargs={'axis': 1,
-                                                         'keepdims': True},
-                                           np_kwargs={'axis': 1,
-                                                      'keepdims': True})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': 1},
+                                       np_kwargs={'axis': 1})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': -1},
+                                       np_kwargs={'axis': -1})
+      if 'keepdims' in tf_inspect.getargspec(keras_op).args:
+        compare_single_input_op_to_numpy(keras_op, np_op,
+                                         input_shape=(4, 7, 5),
+                                         keras_kwargs={'axis': 1,
+                                                       'keepdims': True},
+                                         np_kwargs={'axis': 1,
+                                                    'keepdims': True})
 
   def test_elementwise_ops(self):
     ops_to_test = [
@@ -318,32 +332,28 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.exp, np.exp),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
 
     ops_to_test = [
         (keras.backend.sqrt, np.sqrt),
         (keras.backend.log, np.log),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op,
-                                         input_shape=(4, 7),
-                                         negative_values=False)
+      compare_single_input_op_to_numpy(keras_op, np_op,
+                                       input_shape=(4, 7),
+                                       negative_values=False)
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.clip, np.clip,
-          input_shape=(6, 4),
-          keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
-          np_kwargs={'a_min': 0.1, 'a_max': 1.4})
+    compare_single_input_op_to_numpy(
+        keras.backend.clip, np.clip,
+        input_shape=(6, 4),
+        keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
+        np_kwargs={'a_min': 0.1, 'a_max': 1.4})
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.pow, np.power,
-          input_shape=(6, 4),
-          keras_args=[3],
-          np_args=[3])
+    compare_single_input_op_to_numpy(
+        keras.backend.pow, np.power,
+        input_shape=(6, 4),
+        keras_args=[3],
+        np_args=[3])
 
   def test_two_tensor_ops(self):
     ops_to_test = [
@@ -357,98 +367,95 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.minimum, np.minimum),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                       input_shape_a=(4, 7),
-                                       input_shape_b=(4, 7))
+      compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                     input_shape_a=(4, 7),
+                                     input_shape_b=(4, 7))
 
   def test_relu(self):
     x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
-    with self.cached_session():
-      # standard relu
-      relu_op = keras.backend.relu(x)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # alpha (leaky relu used)
-      relu_op = keras.backend.relu(x, alpha=0.5)
-      if not context.executing_eagerly():
-        self.assertTrue('LeakyRelu' in relu_op.name)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+    # standard relu
+    relu_op = keras.backend.relu(x)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+    # alpha (leaky relu used)
+    relu_op = keras.backend.relu(x, alpha=0.5)
+    if not context.executing_eagerly():
+      self.assertTrue('LeakyRelu' in relu_op.name)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
 
-      # max_value < some elements
-      relu_op = keras.backend.relu(x, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+    # max_value < some elements
+    relu_op = keras.backend.relu(x, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
 
-      # nn.relu6 used
-      relu_op = keras.backend.relu(x, max_value=6)
-      if not context.executing_eagerly():
-        self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+    # nn.relu6 used
+    relu_op = keras.backend.relu(x, max_value=6)
+    if not context.executing_eagerly():
+      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
 
-      # max value > 6
-      relu_op = keras.backend.relu(x, max_value=10)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # max value > 6
+    relu_op = keras.backend.relu(x, max_value=10)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # max value is float
-      relu_op = keras.backend.relu(x, max_value=4.3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+    # max value is float
+    relu_op = keras.backend.relu(x, max_value=4.3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
 
-      # max value == 0
-      relu_op = keras.backend.relu(x, max_value=0)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+    # max value == 0
+    relu_op = keras.backend.relu(x, max_value=0)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
 
-      # alpha and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+    # alpha and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
 
-      # threshold
-      relu_op = keras.backend.relu(x, threshold=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+    # threshold
+    relu_op = keras.backend.relu(x, threshold=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
 
-      # threshold is float
-      relu_op = keras.backend.relu(x, threshold=1.5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # threshold is float
+    relu_op = keras.backend.relu(x, threshold=1.5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # threshold is negative
-      relu_op = keras.backend.relu(x, threshold=-5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+    # threshold is negative
+    relu_op = keras.backend.relu(x, threshold=-5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
 
-      # threshold and max_value
-      relu_op = keras.backend.relu(x, threshold=3, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+    # threshold and max_value
+    relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
 
-      # threshold and alpha
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+    # threshold and alpha
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
 
-      # threshold, alpha, and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+    # threshold, alpha, and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendShapeOpsTest(test.TestCase):
 
   def test_reshape(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
-                                       input_shape=(4, 7),
-                                       keras_args=[(2, 14)],
-                                       np_args=[(2, 14)])
+    compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
+                                     input_shape=(4, 7),
+                                     keras_args=[(2, 14)],
+                                     np_args=[(2, 14)])
 
   def test_concatenate(self):
     a = keras.backend.variable(np.ones((1, 2, 3)))
     b = keras.backend.variable(np.ones((1, 2, 2)))
     y = keras.backend.concatenate([a, b], axis=-1)
-    self.assertEqual(y.get_shape().as_list(), [1, 2, 5])
+    self.assertEqual(y.shape.as_list(), [1, 2, 5])
 
   def test_permute_dimensions(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
-                                       np.transpose,
-                                       input_shape=(4, 7),
-                                       keras_args=[(1, 0)],
-                                       np_args=[(1, 0)])
+    compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
+                                     np.transpose,
+                                     input_shape=(4, 7),
+                                     keras_args=[(1, 0)],
+                                     np_args=[(1, 0)])
 
   def test_resize_images(self):
     height_factor = 2
@@ -459,7 +466,7 @@ class BackendShapeOpsTest(test.TestCase):
                                     height_factor,
                                     width_factor,
                                     data_format)
-    self.assertEqual(y.get_shape().as_list(), [1, 4, 4, 3])
+    self.assertEqual(y.shape.as_list(), [1, 4, 4, 3])
 
     data_format = 'channels_first'
     x = keras.backend.variable(np.ones((1, 3, 2, 2)))
@@ -467,7 +474,7 @@ class BackendShapeOpsTest(test.TestCase):
                                     height_factor,
                                     width_factor,
                                     data_format)
-    self.assertEqual(y.get_shape().as_list(), [1, 3, 4, 4])
+    self.assertEqual(y.shape.as_list(), [1, 3, 4, 4])
 
     # Invalid use:
     with self.assertRaises(ValueError):
@@ -487,7 +494,7 @@ class BackendShapeOpsTest(test.TestCase):
                                      height_factor,
                                      width_factor,
                                      data_format)
-    self.assertEqual(y.get_shape().as_list(), [1, 4, 4, 4, 3])
+    self.assertEqual(y.shape.as_list(), [1, 4, 4, 4, 3])
 
     data_format = 'channels_first'
     x = keras.backend.variable(np.ones((1, 3, 2, 2, 2)))
@@ -496,7 +503,7 @@ class BackendShapeOpsTest(test.TestCase):
                                      height_factor,
                                      width_factor,
                                      data_format)
-    self.assertEqual(y.get_shape().as_list(), [1, 3, 4, 4, 4])
+    self.assertEqual(y.shape.as_list(), [1, 3, 4, 4, 4])
 
     # Invalid use:
     with self.assertRaises(ValueError):
@@ -509,32 +516,30 @@ class BackendShapeOpsTest(test.TestCase):
   def test_repeat_elements(self):
     x = keras.backend.variable(np.ones((1, 3, 2)))
     y = keras.backend.repeat_elements(x, 3, axis=1)
-    self.assertEqual(y.get_shape().as_list(), [1, 9, 2])
+    self.assertEqual(y.shape.as_list(), [1, 9, 2])
 
     # Use with a dynamic axis:
     if not context.executing_eagerly():
       x = keras.backend.placeholder(shape=(2, None, 2))
       y = keras.backend.repeat_elements(x, 3, axis=1)
-      self.assertEqual(y.get_shape().as_list(), [2, None, 2])
+      self.assertEqual(y.shape.as_list(), [2, None, 2])
 
   def test_repeat(self):
     x = keras.backend.variable(np.ones((1, 3)))
     y = keras.backend.repeat(x, 2)
-    self.assertEqual(y.get_shape().as_list(), [1, 2, 3])
+    self.assertEqual(y.shape.as_list(), [1, 2, 3])
 
   def test_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4 * 7 * 6,)])
+    compare_single_input_op_to_numpy(keras.backend.flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4 * 7 * 6,)])
 
   def test_batch_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.batch_flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4, 7 * 6)])
+    compare_single_input_op_to_numpy(keras.backend.batch_flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4, 7 * 6)])
 
   def test_temporal_padding(self):
 
@@ -545,12 +550,11 @@ class BackendShapeOpsTest(test.TestCase):
       y[:, padding[0]:-padding[1], :] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.temporal_padding,
-                                       ref_op,
-                                       input_shape=(4, 7, 6),
-                                       keras_args=[(2, 3)],
-                                       np_args=[(2, 3)])
+    compare_single_input_op_to_numpy(keras.backend.temporal_padding,
+                                     ref_op,
+                                     input_shape=(4, 7, 6),
+                                     keras_args=[(2, 3)],
+                                     np_args=[(2, 3)])
 
   def test_spatial_2d_padding(self):
 
@@ -568,23 +572,22 @@ class BackendShapeOpsTest(test.TestCase):
         y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_first'})
 
   def test_spatial_3d_padding(self):
 
@@ -611,73 +614,70 @@ class BackendShapeOpsTest(test.TestCase):
           padding[2][0]:-padding[2][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_first'})
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
   def test_bias_add(self):
-    with self.cached_session():
-      keras_op = keras.backend.bias_add
-      np_op = np.add
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 2, 7),
-                                     input_shape_b=(7,))
-
-      with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((3, 4))
-        keras.backend.bias_add(x, b)
-      with self.assertRaises(ValueError):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((4,))
-        keras.backend.bias_add(x, b, data_format='unknown')
+    keras_op = keras.backend.bias_add
+    np_op = np.add
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 2, 7),
+                                   input_shape_b=(7,))
+
+    with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((3, 4))
+      keras.backend.bias_add(x, b)
+    with self.assertRaises(ValueError):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((4,))
+      keras.backend.bias_add(x, b, data_format='unknown')
 
   def test_bias_add_channels_first(self):
-    with self.cached_session():
 
-      def keras_op(x, b):
-        return keras.backend.bias_add(x, b, data_format='channels_first')
+    def keras_op(x, b):
+      return keras.backend.bias_add(x, b, data_format='channels_first')
 
-      def np_op(x, b):
-        if x.ndim == 3:
-          b = b.reshape((1, b.shape[0], 1))
-        if x.ndim == 4:
-          b = b.reshape((1, b.shape[0], 1, 1))
-        return x + b
+    def np_op(x, b):
+      if x.ndim == 3:
+        b = b.reshape((1, b.shape[0], 1))
+      if x.ndim == 4:
+        b = b.reshape((1, b.shape[0], 1, 1))
+      return x + b
 
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(3,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(3,))
 
   def test_pool2d(self):
     val = np.random.random((10, 3, 10, 10))
@@ -685,30 +685,30 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     y = keras.backend.pool2d(x, (2, 2), strides=(1, 1),
                              padding='valid', data_format='channels_first',
                              pool_mode='max')
-    self.assertEqual(y.get_shape().as_list(), [10, 3, 9, 9])
+    self.assertEqual(y.shape.as_list(), [10, 3, 9, 9])
 
     y = keras.backend.pool2d(x, (2, 2), strides=(1, 1),
                              padding='valid', data_format='channels_first',
                              pool_mode='avg')
-    self.assertEqual(y.get_shape().as_list(), [10, 3, 9, 9])
+    self.assertEqual(y.shape.as_list(), [10, 3, 9, 9])
 
     val = np.random.random((10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool2d(x, (2, 2), strides=(1, 1),
                              padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 9, 9, 3])
+    self.assertEqual(y.shape.as_list(), [10, 9, 9, 3])
 
     val = np.random.random((10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool2d(x, (2, 2), strides=(1, 1),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 10, 3])
+    self.assertEqual(y.shape.as_list(), [10, 10, 10, 3])
 
     val = np.random.random((10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool2d(x, (2, 2), strides=(2, 2),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5, 3])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5, 3])
 
     with self.assertRaises(ValueError):
       y = keras.backend.pool2d(x, (2, 2), strides=(2, 2),
@@ -729,30 +729,30 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     y = keras.backend.pool3d(x, (2, 2, 2), strides=(1, 1, 1),
                              padding='valid', data_format='channels_first',
                              pool_mode='max')
-    self.assertEqual(y.get_shape().as_list(), [10, 3, 9, 9, 9])
+    self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9])
 
     y = keras.backend.pool3d(x, (2, 2, 2), strides=(1, 1, 1),
                              padding='valid', data_format='channels_first',
                              pool_mode='avg')
-    self.assertEqual(y.get_shape().as_list(), [10, 3, 9, 9, 9])
+    self.assertEqual(y.shape.as_list(), [10, 3, 9, 9, 9])
 
     val = np.random.random((10, 10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool3d(x, (2, 2, 2), strides=(1, 1, 1),
                              padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 9, 9, 9, 3])
+    self.assertEqual(y.shape.as_list(), [10, 9, 9, 9, 3])
 
     val = np.random.random((10, 10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool3d(x, (2, 2, 2), strides=(1, 1, 1),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 10, 10, 3])
+    self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 3])
 
     val = np.random.random((10, 10, 10, 10, 3))
     x = keras.backend.variable(val)
     y = keras.backend.pool3d(x, (2, 2, 2), strides=(2, 2, 2),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5, 5, 3])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 3])
 
   def test_conv1d(self):
     val = np.random.random((10, 4, 10))
@@ -761,25 +761,25 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     k = keras.backend.variable(kernel_val)
     y = keras.backend.conv1d(x, k, strides=(1,),
                              padding='valid', data_format='channels_first')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 8])
+    self.assertEqual(y.shape.as_list(), [10, 5, 8])
 
     val = np.random.random((10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv1d(x, k, strides=(1,),
                              padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 8, 5])
+    self.assertEqual(y.shape.as_list(), [10, 8, 5])
 
     val = np.random.random((10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv1d(x, k, strides=(1,),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 5])
+    self.assertEqual(y.shape.as_list(), [10, 10, 5])
 
     val = np.random.random((10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv1d(x, k, strides=(2,),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5])
 
   def test_local_conv_channels_dim(self):
     filters = 3
@@ -837,9 +837,9 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                              strides,
                                              output_shape,
                                              'channels_last')
-          with self.cached_session():
-            conv_cf = keras.backend.eval(conv_cf)
-            conv_cl = keras.backend.eval(conv_cl)
+
+          conv_cf = keras.backend.eval(conv_cf)
+          conv_cl = keras.backend.eval(conv_cl)
 
           self.assertAllCloseAccordingToType(
               conv_cf,
@@ -887,9 +887,8 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                   output_shape,
                                                   'channels_last')
 
-    with self.cached_session():
-      local_conv = keras.backend.eval(local_conv)
-      local_conv_dim = keras.backend.eval(local_conv_dim)
+    local_conv = keras.backend.eval(local_conv)
+    local_conv_dim = keras.backend.eval(local_conv_dim)
 
     self.assertAllCloseAccordingToType(local_conv, local_conv_dim)
 
@@ -900,25 +899,25 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     k = keras.backend.variable(kernel_val)
     y = keras.backend.conv2d(x, k,
                              padding='valid', data_format='channels_first')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 8, 8])
+    self.assertEqual(y.shape.as_list(), [10, 5, 8, 8])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv2d(x, k, strides=(1, 1),
                              padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 8, 8, 5])
+    self.assertEqual(y.shape.as_list(), [10, 8, 8, 5])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv2d(x, k, strides=(1, 1),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 10, 5])
+    self.assertEqual(y.shape.as_list(), [10, 10, 10, 5])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv2d(x, k, strides=(2, 2),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5, 5])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5, 5])
     with self.assertRaises(ValueError):
       y = keras.backend.conv2d(x, k, (2, 2),
                                padding='other', data_format='channels_last')
@@ -937,25 +936,25 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     pk = keras.backend.variable(pointwise_kernel_val)
     y = keras.backend.separable_conv2d(
         x, dk, pk, padding='valid', data_format='channels_first')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 8, 8])
+    self.assertEqual(y.shape.as_list(), [10, 5, 8, 8])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.separable_conv2d(
         x, dk, pk, strides=(1, 1), padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 8, 8, 5])
+    self.assertEqual(y.shape.as_list(), [10, 8, 8, 5])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.separable_conv2d(
         x, dk, pk, strides=(1, 1), padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 10, 5])
+    self.assertEqual(y.shape.as_list(), [10, 10, 10, 5])
 
     val = np.random.random((10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.separable_conv2d(
         x, dk, pk, strides=(2, 2), padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5, 5])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5, 5])
     with self.assertRaises(ValueError):
       y = keras.backend.separable_conv2d(
           x, dk, pk, (2, 2), padding='other', data_format='channels_last')
@@ -972,25 +971,25 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     k = keras.backend.variable(kernel_val)
     y = keras.backend.conv3d(x, k,
                              padding='valid', data_format='channels_first')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 8, 8, 8])
+    self.assertEqual(y.shape.as_list(), [10, 5, 8, 8, 8])
 
     val = np.random.random((10, 10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv3d(x, k, strides=(1, 1, 1),
                              padding='valid', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 8, 8, 8, 5])
+    self.assertEqual(y.shape.as_list(), [10, 8, 8, 8, 5])
 
     val = np.random.random((10, 10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv3d(x, k, strides=(1, 1, 1),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 10, 10, 10, 5])
+    self.assertEqual(y.shape.as_list(), [10, 10, 10, 10, 5])
 
     val = np.random.random((10, 10, 10, 10, 4))
     x = keras.backend.variable(val)
     y = keras.backend.conv3d(x, k, strides=(2, 2, 2),
                              padding='same', data_format='channels_last')
-    self.assertEqual(y.get_shape().as_list(), [10, 5, 5, 5, 5])
+    self.assertEqual(y.shape.as_list(), [10, 5, 5, 5, 5])
     with self.assertRaises(ValueError):
       y = keras.backend.conv3d(x, k, (2, 2, 2),
                                padding='other', data_format='channels_last')
@@ -1045,24 +1044,21 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        for state in new_states:
-          self.assertEqual(state.get_shape().as_list(),
-                           [num_samples, output_dim])
-
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 1)
-        state_list[i].append(keras.backend.eval(new_states[0]))
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.shape.as_list(), [num_samples, output_dim])
+      self.assertEqual(outputs.shape.as_list(),
+                       [num_samples, timesteps, output_dim])
+      for state in new_states:
+        self.assertEqual(state.shape.as_list(), [num_samples, output_dim])
+
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 1)
+      state_list[i].append(keras.backend.eval(new_states[0]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1144,29 +1140,26 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        # for state in new_states:
-        #   self.assertEqual(state.get_shape().as_list(),
-        #                     [num_samples, output_dim])
-        self.assertEqual(new_states[0].get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(new_states[1].get_shape().as_list(),
-                         [num_samples, 2 * output_dim])
-
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 2)
-        state_list[i].append(keras.backend.eval(new_states[0]))
-        additional_state_list[i].append(keras.backend.eval(new_states[1]))
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.shape.as_list(), [num_samples, output_dim])
+      self.assertEqual(outputs.shape.as_list(),
+                       [num_samples, timesteps, output_dim])
+      # for state in new_states:
+      #   self.assertEqual(state.shape.as_list(),
+      #                     [num_samples, output_dim])
+      self.assertEqual(new_states[0].shape.as_list(), [num_samples, output_dim])
+      self.assertEqual(new_states[1].shape.as_list(),
+                       [num_samples, 2 * output_dim])
+
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 2)
+      state_list[i].append(keras.backend.eval(new_states[0]))
+      additional_state_list[i].append(keras.backend.eval(new_states[1]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1331,78 +1324,100 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     beta = keras.backend.variable(b_val)
     normed, mean, var = keras.backend.normalize_batch_in_training(
         x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
+    self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10])
+    self.assertEqual(mean.shape.as_list(), [
+        3,
+    ])
+    self.assertEqual(var.shape.as_list(), [
+        3,
+    ])
 
     # case: gamma=None
     gamma = None
     normed, mean, var = keras.backend.normalize_batch_in_training(
         x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
+    self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10])
+    self.assertEqual(mean.shape.as_list(), [
+        3,
+    ])
+    self.assertEqual(var.shape.as_list(), [
+        3,
+    ])
 
     # case: beta=None
     beta = None
     normed, mean, var = keras.backend.normalize_batch_in_training(
         x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
+    self.assertEqual(normed.shape.as_list(), [10, 3, 10, 10])
+    self.assertEqual(mean.shape.as_list(), [
+        3,
+    ])
+    self.assertEqual(var.shape.as_list(), [
+        3,
+    ])
+
+  def test_dropout(self):
+    inputs = array_ops.ones((200, 200))
+    outputs = keras.backend.dropout(inputs, 0.2)
+    outputs_val = keras.backend.eval(outputs)
+    self.assertEqual(np.min(outputs_val), 0)
+    self.assertAllClose(np.count_nonzero(outputs_val), 32000, atol=1000)
+    # Test noise shape
+    outputs = keras.backend.dropout(inputs, 0.2, noise_shape=(200, 1))
+    outputs_val = keras.backend.eval(outputs)
+    self.assertAllClose(outputs_val[2, :], outputs_val[3, :], atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class TestCTC(test.TestCase):
 
   def test_ctc_decode(self):
-    with self.cached_session():
-      depth = 6
-      seq_len_0 = 5
-      input_prob_matrix_0 = np.asarray(
-          [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
-           [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
-           [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
-           [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
-           [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
-           # Random entry added in at time=5
-           [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
-          dtype=np.float32)
-
-      # len max_time_steps array of batch_size x depth matrices
-      inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
-                 for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
-                2 * [np.zeros((1, depth), dtype=np.float32)])
-
-      inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
-
-      # batch_size length vector of sequence_lengths
-      input_length = keras.backend.variable(
-          np.array([seq_len_0], dtype=np.int32))
-      # batch_size length vector of negative log probabilities
-      log_prob_truth = np.array([
-          -3.5821197,  # output beam 0
-          -3.777835    # output beam 1
-      ], np.float32)[np.newaxis, :]
-
-      decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
-      beam_width = 2
-      top_paths = 2
-
-      decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
-          inputs,
-          input_length,
-          greedy=False,
-          beam_width=beam_width,
-          top_paths=top_paths)
-
-      self.assertEqual(len(decode_pred_tf), top_paths)
-      log_prob_pred = keras.backend.eval(log_prob_pred_tf)
-      for i in range(top_paths):
-        self.assertTrue(
-            np.alltrue(
-                decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
-      self.assertAllClose(log_prob_truth, log_prob_pred)
+    depth = 6
+    seq_len_0 = 5
+    input_prob_matrix_0 = np.asarray(
+        [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
+         [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
+         [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
+         [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
+         [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
+         # Random entry added in at time=5
+         [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
+        dtype=np.float32)
+
+    # len max_time_steps array of batch_size x depth matrices
+    inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
+               for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
+              2 * [np.zeros((1, depth), dtype=np.float32)])
+
+    inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
+
+    # batch_size length vector of sequence_lengths
+    input_length = keras.backend.variable(
+        np.array([seq_len_0], dtype=np.int32))
+    # batch_size length vector of negative log probabilities
+    log_prob_truth = np.array([
+        -3.5821197,  # output beam 0
+        -3.777835    # output beam 1
+    ], np.float32)[np.newaxis, :]
+
+    decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
+    beam_width = 2
+    top_paths = 2
+
+    decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
+        inputs,
+        input_length,
+        greedy=False,
+        beam_width=beam_width,
+        top_paths=top_paths)
+
+    self.assertEqual(len(decode_pred_tf), top_paths)
+    log_prob_pred = keras.backend.eval(log_prob_pred_tf)
+    for i in range(top_paths):
+      self.assertTrue(
+          np.alltrue(
+              decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
+    self.assertAllClose(log_prob_truth, log_prob_pred)
 
   @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
@@ -1463,29 +1478,26 @@ class TestCTC(test.TestCase):
 class TestRandomOps(test.TestCase):
 
   def test_random_binomial(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.random_binomial((1000, 1000), p=0.5)
-      self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.random_binomial((1000, 1000), p=0.5)
+    self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
 
   def test_truncated_normal(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
-      y = keras.backend.eval(x)
-      self.assertAllClose(np.mean(y), 0., atol=0.1)
-      self.assertAllClose(np.std(y), 0.88, atol=0.1)
-      self.assertAllClose(np.max(y), 2., atol=0.1)
-      self.assertAllClose(np.min(y), -2., atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
+    y = keras.backend.eval(x)
+    self.assertAllClose(np.mean(y), 0., atol=0.1)
+    self.assertAllClose(np.std(y), 0.88, atol=0.1)
+    self.assertAllClose(np.max(y), 2., atol=0.1)
+    self.assertAllClose(np.min(y), -2., atol=0.1)
 
   def test_string_input(self):
-    with self.cached_session():
-      seq = keras.Sequential([
-          keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
-          keras.layers.Lambda(lambda x: x[0])
-      ])
-      preds = seq.predict([['tensorflow eager']])
-      self.assertEqual(preds.shape, (1,))
+    seq = keras.Sequential([
+        keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
+        keras.layers.Lambda(lambda x: x[0])
+    ])
+    preds = seq.predict([['tensorflow eager']])
+    self.assertEqual(preds.shape, (1,))
 
 
 class BackendGraphTests(test.TestCase):
@@ -1712,9 +1724,9 @@ class BackendGraphTests(test.TestCase):
 
   def test_placeholder(self):
     x = keras.backend.placeholder(shape=(3, 4))
-    self.assertEqual(x.get_shape().as_list(), [3, 4])
+    self.assertEqual(x.shape.as_list(), [3, 4])
     x = keras.backend.placeholder(shape=(3, 4), sparse=True)
-    self.assertEqual(x.get_shape().as_list(), [3, 4])
+    self.assertEqual(x.shape.as_list(), [3, 4])
 
   @test_util.run_deprecated_v1
   def test_batch_normalization(self):
@@ -1748,6 +1760,16 @@ class BackendGraphTests(test.TestCase):
         x, mean, var, beta, gamma, axis=1, epsilon=1e-3)
     self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5])
 
+  def test_get_session_different_graphs(self):
+    with ops.Graph().as_default():
+      x = keras.backend.constant(1)
+      session = keras.backend.get_session()
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIs(session, keras.backend.get_session())
+    with ops.Graph().as_default():
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIsNot(session, keras.backend.get_session())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index e8985c2306359c1e5eefa0f2ec740a5bd6aff1a6..89aa058ddce8927942f93c93a1a14cc928f53933 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -32,18 +32,14 @@ import six
 
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import summary_ops_v2
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary import summary as tf_summary
-from tensorflow.python.training import saver
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util.tf_export import keras_export
 
 try:
@@ -52,7 +48,6 @@ except ImportError:
   requests = None
 
 
-# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
@@ -91,20 +86,57 @@ def configure_callbacks(callbacks,
   # Add additional callbacks during training.
   if mode == ModeKeys.TRAIN:
     model.history = History()
-    stateful_metric_names = None
-    if hasattr(model, 'metrics_names'):
-      stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
-    callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
-                ] + (callbacks or []) + [model.history]
+    callbacks = [BaseLogger()] + (callbacks or []) + [model.history]
     if verbose:
-      callbacks.append(
-          ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+      callbacks.append(ProgbarLogger(count_mode))
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()
+  callback_model = model._get_callback_model()  # pylint: disable=protected-access
   callback_list.set_model(callback_model)
 
+  set_callback_parameters(
+      callback_list,
+      model,
+      do_validation=do_validation,
+      batch_size=batch_size,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      samples=samples,
+      verbose=verbose,
+      mode=mode)
+
+  callback_list.model.stop_training = False
+  return callback_list
+
+
+def set_callback_parameters(callback_list,
+                            model,
+                            do_validation=False,
+                            batch_size=None,
+                            epochs=None,
+                            steps_per_epoch=None,
+                            samples=None,
+                            verbose=1,
+                            mode=ModeKeys.TRAIN):
+  """Sets callback parameters.
+
+  Arguments:
+      callback_list: CallbackList instance.
+      model: Model being trained.
+      do_validation: Whether or not validation loop will be run.
+      batch_size: Number of samples per batch.
+      epochs: Number of epoch to train.
+      steps_per_epoch: Number of batches to run per training epoch.
+      samples: Number of training samples.
+      verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
+      mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT.
+        Which loop mode to configure callbacks for.
+  """
+  for cbk in callback_list:
+    if isinstance(cbk, (BaseLogger, ProgbarLogger)):
+      cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
+
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
@@ -123,9 +155,6 @@ def configure_callbacks(callbacks,
       'metrics': callback_metrics,
   }
   callback_list.set_params(callback_params)
-  callback_list.model.stop_training = False
-  return callback_list
-# pylint: enable=protected-access
 
 
 def _is_generator_like(data):
@@ -395,7 +424,7 @@ class Callback(object):
   take as argument will contain keys for quantities relevant to
   the current batch or epoch.
 
-  Currently, the `.fit()` method of the `Sequential` model class
+  Currently, the `.fit()` method of the `Model` class
   will include the following quantities in the `logs` that
   it passes to its callbacks:
 
@@ -412,6 +441,10 @@ class Callback(object):
   def __init__(self):
     self.validation_data = None
     self.model = None
+    # Whether this Callback should only run on the chief worker in a
+    # Multi-Worker setting.
+    # TODO(omalleyt): Make this attr public once solution is stable.
+    self._chief_worker_only = None
 
   def set_params(self, params):
     self.params = params
@@ -815,6 +848,17 @@ class ModelCheckpoint(Callback):
         self.monitor_op = np.less
         self.best = np.Inf
 
+    # Only the chief worker writes model checkpoints.
+    self._chief_worker_only = True
+
+  def set_model(self, model):
+    self.model = model
+    # Use name matching rather than `isinstance` to avoid circular dependencies.
+    if (not self.save_weights_only and
+        not model._is_graph_network and  # pylint: disable=protected-access
+        model.__class__.__name__ != 'Sequential'):
+      self.save_weights_only = True
+
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
     self.epochs_since_last_save += 1
@@ -1054,388 +1098,302 @@ class LearningRateScheduler(Callback):
     logs['lr'] = K.get_value(self.model.optimizer.lr)
 
 
-@keras_export('keras.callbacks.TensorBoard')
+@keras_export('keras.callbacks.TensorBoard', v1=[])
 class TensorBoard(Callback):
   # pylint: disable=line-too-long
-  """Tensorboard basic visualizations.
-
-  This callback writes a log for TensorBoard, which allows
-  you to visualize dynamic graphs of your training and test
-  metrics, as well as activation histograms for the different
-  layers in your model.
+  """Enable visualizations for TensorBoard.
 
   TensorBoard is a visualization tool provided with TensorFlow.
 
+  This callback logs events for TensorBoard, including:
+  * Metrics summary plots
+  * Training graph visualization
+  * Activation histograms
+  * Sampled profiling
+
   If you have installed TensorFlow with pip, you should be able
   to launch TensorBoard from the command line:
 
   ```sh
-  tensorboard --logdir=/full_path_to_your_logs
+  tensorboard --logdir=path_to_your_logs
   ```
 
   You can find more information about TensorBoard
   [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
 
   Arguments:
-      log_dir: the path of the directory where to save the log
-          files to be parsed by TensorBoard.
-      histogram_freq: frequency (in epochs) at which to compute activation
-          and weight histograms for the layers of the model. If set to 0,
-          histograms won't be computed. Validation data (or split) must be
-          specified for histogram visualizations.
-      write_graph: whether to visualize the graph in TensorBoard.
-          The log file can become quite large when
-          write_graph is set to True.
-      write_grads: whether to visualize gradient histograms in TensorBoard.
-          `histogram_freq` must be greater than 0.
-      batch_size: size of batch of inputs to feed to the network
-          for histograms computation.
-      write_images: whether to write model weights to visualize as
-          image in TensorBoard.
-      embeddings_freq: frequency (in epochs) at which selected embedding
-          layers will be saved. If set to 0, embeddings won't be computed.
-          Data to be visualized in TensorBoard's Embedding tab must be passed
-          as `embeddings_data`.
-      embeddings_layer_names: a list of names of layers to keep eye on. If
-          None or empty list all the embedding layer will be watched.
-      embeddings_metadata: a dictionary which maps layer name to a file name
-          in which metadata for this embedding layer is saved. See the
-          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
-          about metadata files format. In case if the same metadata file is
-          used for all embedding layers, string can be passed.
-      embeddings_data: data to be embedded at layers specified in
-          `embeddings_layer_names`. Numpy array (if the model has a single
-          input) or list of Numpy arrays (if the model has multiple inputs).
-          Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
       update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
-          writes the losses and metrics to TensorBoard after each batch.
-          The same applies for `'epoch'`. If using an integer, let's say `1000`,
-          the callback will write the metrics and losses to TensorBoard every
-          1000 samples. Note that writing too frequently to TensorBoard
-          can slow down your training.
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
+      profile_batch: Profile the batch to sample compute characteristics. By
+        default, it will profile the second batch. Set profile_batch=0 to
+        disable profiling. Must run in TensorFlow eager mode.
 
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
-
-  @compatibility(eager)
-  Using `Tensorboard` callback will work while eager execution is enabled,
-  however outputting histogram summaries of weights and gradients is not
-  supported, and thus `histogram_freq` will be ignored.
-  @end_compatibility
   """
 
   # pylint: enable=line-too-long
 
   def __init__(self,
-               log_dir='./logs',
+               log_dir='logs',
                histogram_freq=0,
-               batch_size=32,
                write_graph=True,
-               write_grads=False,
                write_images=False,
-               embeddings_freq=0,
-               embeddings_layer_names=None,
-               embeddings_metadata=None,
-               embeddings_data=None,
-               update_freq='epoch'):
+               update_freq='epoch',
+               profile_batch=2,
+               **kwargs):
     super(TensorBoard, self).__init__()
+    self._validate_kwargs(kwargs)
+
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
-    if self.histogram_freq and context.executing_eagerly():
-      logging.warning(
-          UserWarning('Weight and gradient histograms not supported for eager'
-                      'execution, setting `histogram_freq` to `0`.'))
-      self.histogram_freq = 0
-    self.merged = None
     self.write_graph = write_graph
-    self.write_grads = write_grads
     self.write_images = write_images
-    self.batch_size = batch_size
-    self._current_batch = 0
-    self._total_batches_seen = 0
-    self._total_val_batches_seen = 0
-    self.embeddings_freq = embeddings_freq
-    self.embeddings_layer_names = embeddings_layer_names
-    self.embeddings_metadata = embeddings_metadata
-    self.embeddings_data = embeddings_data
     if update_freq == 'batch':
       self.update_freq = 1
     else:
       self.update_freq = update_freq
+
     self._samples_seen = 0
     self._samples_seen_at_last_write = 0
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
 
-  def _init_writer(self):
-    """Sets file writer."""
-    if context.executing_eagerly():
-      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
-    elif self.write_graph:
-      self.writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
-    else:
-      self.writer = tf_summary.FileWriter(self.log_dir)
-
-  def _make_histogram_ops(self, model):
-    """Defines histogram ops when histogram_freq > 0."""
-    # only make histogram summary op if it hasn't already been made
-    if self.histogram_freq and self.merged is None:
-      for layer in self.model.layers:
-        for weight in layer.weights:
-          mapped_weight_name = weight.name.replace(':', '_')
-          tf_summary.histogram(mapped_weight_name, weight)
-          if self.write_images:
-            w_img = array_ops.squeeze(weight)
-            shape = K.int_shape(w_img)
-            if len(shape) == 2:  # dense layer kernel case
-              if shape[0] > shape[1]:
-                w_img = array_ops.transpose(w_img)
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
-            elif len(shape) == 3:  # convnet case
-              if K.image_data_format() == 'channels_last':
-                # switch to channels_first to display
-                # every kernel as a separate image
-                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img,
-                                        [shape[0], shape[1], shape[2], 1])
-            elif len(shape) == 1:  # bias case
-              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
-            else:
-              # not possible to handle 3D convnets etc.
-              continue
-
-            shape = K.int_shape(w_img)
-            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
-            tf_summary.image(mapped_weight_name, w_img)
-
-        if self.write_grads:
-          for weight in layer.trainable_weights:
-            mapped_weight_name = weight.name.replace(':', '_')
-            grads = model.optimizer.get_gradients(model.total_loss, weight)
-
-            def is_indexed_slices(grad):
-              return type(grad).__name__ == 'IndexedSlices'
-
-            grads = [
-                grad.values if is_indexed_slices(grad) else grad
-                for grad in grads
-            ]
-            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
-
-        if hasattr(layer, 'output'):
-          if isinstance(layer.output, list):
-            for i, output in enumerate(layer.output):
-              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
-          else:
-            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+    # A collection of file writers currently in use, to be closed when
+    # training ends for this callback. Writers are keyed by the
+    # directory name under the root logdir: e.g., "train" or
+    # "validation".
+    self._writers = {}
+    self._train_run_name = 'train'
+    self._validation_run_name = 'validation'
+
+    self._profile_batch = profile_batch
+    # True when a trace is running.
+    self._is_tracing = False
+
+    # TensorBoard should only write summaries on the chief when in a
+    # Multi-Worker setting.
+    self._chief_worker_only = True
+
+  def _validate_kwargs(self, kwargs):
+    """Handle arguments were supported in V1."""
+    if kwargs.get('write_grads', False):
+      logging.warning('`write_grads` will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+    if kwargs.get('embeddings_freq', False):
+      logging.warning('Embeddings will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+    if kwargs.get('batch_size', False):
+      logging.warning('`batch_size` is no longer needed in the '
+                      '`TensorBoard` Callback and will be ignored '
+                      'in TensorFlow 2.0.')
+
+    unrecognized_kwargs = set(kwargs.keys()) - {
+        'write_grads', 'embeddings_freq', 'embeddings_layer_names',
+        'embeddings_metadata', 'embeddings_data', 'batch_size'
+    }
+
+    # Only allow kwargs that were supported in V1.
+    if unrecognized_kwargs:
+      raise ValueError('Unrecognized arguments in `TensorBoard` '
+                       'Callback: ' + str(unrecognized_kwargs))
 
   def set_model(self, model):
-    """Sets Keras model and creates summary ops."""
-
+    """Sets Keras model and writes graph if specified."""
     self.model = model
-    self._init_writer()
-    # histogram summaries only enabled in graph mode
-    if not context.executing_eagerly():
-      self._make_histogram_ops(model)
-      self.merged = tf_summary.merge_all()
-
-    # If both embedding_freq and embeddings_data are available, we will
-    # visualize embeddings.
-    if self.embeddings_freq and self.embeddings_data is not None:
-      # Avoid circular dependency.
-      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
-      self.embeddings_data = training_utils.standardize_input_data(
-          self.embeddings_data, model.input_names)
-
-      # If embedding_layer_names are not provided, get all of the embedding
-      # layers from the model.
-      embeddings_layer_names = self.embeddings_layer_names
-      if not embeddings_layer_names:
-        embeddings_layer_names = [
-            layer.name
-            for layer in self.model.layers
-            if type(layer).__name__ == 'Embedding'
-        ]
-
-      self.assign_embeddings = []
-      embeddings_vars = {}
-
-      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
-      self.step = step = array_ops.placeholder(dtypes.int32)
+    with context.eager_mode():
+      self._close_writers()
+      if self.write_graph:
+        with self._get_writer(self._train_run_name).as_default():
+          with summary_ops_v2.always_record_summaries():
+            if not model.run_eagerly:
+              summary_ops_v2.graph(K.get_graph())
+
+            summary_writable = (
+                self.model._is_graph_network or  # pylint: disable=protected-access
+                self.model.__class__.__name__ == 'Sequential')  # pylint: disable=protected-access
+            if summary_writable:
+              summary_ops_v2.keras_model('keras', self.model, step=0)
+
+  def _close_writers(self):
+    """Close all remaining open file writers owned by this callback.
+
+    If there are no such file writers, this is a no-op.
+    """
+    with context.eager_mode():
+      for writer in six.itervalues(self._writers):
+        writer.close()
+      self._writers.clear()
 
-      for layer in self.model.layers:
-        if layer.name in embeddings_layer_names:
-          embedding_input = self.model.get_layer(layer.name).output
-          embedding_size = np.prod(embedding_input.shape[1:])
-          embedding_input = array_ops.reshape(embedding_input,
-                                              (step, int(embedding_size)))
-          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
-          embedding = variables.Variable(
-              array_ops.zeros(shape), name=layer.name + '_embedding')
-          embeddings_vars[layer.name] = embedding
-          batch = state_ops.assign(embedding[batch_id:batch_id + step],
-                                   embedding_input)
-          self.assign_embeddings.append(batch)
-
-      self.saver = saver.Saver(list(embeddings_vars.values()))
-
-      # Create embeddings_metadata dictionary
-      if isinstance(self.embeddings_metadata, str):
-        embeddings_metadata = {
-            layer_name: self.embeddings_metadata
-            for layer_name in embeddings_vars.keys()
-        }
-      else:
-        # If embedding_metadata is already a dictionary
-        embeddings_metadata = self.embeddings_metadata
-
-      try:
-        from tensorboard.plugins import projector
-      except ImportError:
-        raise ImportError('Failed to import TensorBoard. Please make sure that '
-                          'TensorBoard integration is complete."')
-
-      # TODO(psv): Add integration tests to test embedding visualization
-      # with TensorBoard callback. We are unable to write a unit test for this
-      # because TensorBoard dependency assumes TensorFlow package is installed.
-      config = projector.ProjectorConfig()
-      for layer_name, tensor in embeddings_vars.items():
-        embedding = config.embeddings.add()
-        embedding.tensor_name = tensor.name
-
-        if (embeddings_metadata is not None and
-            layer_name in embeddings_metadata):
-          embedding.metadata_path = embeddings_metadata[layer_name]
-
-      projector.visualize_embeddings(self.writer, config)
-
-  def _fetch_callback(self, summary):
-    self.writer.add_summary(summary, self._total_val_batches_seen)
-    self._total_val_batches_seen += 1
-
-  def _write_custom_summaries(self, step, logs=None):
-    """Writes metrics out as custom scalar summaries.
+  def _get_writer(self, writer_name):
+    """Get a summary writer for the given subdirectory under the logdir.
 
-    Arguments:
-        step: the global step to use for Tensorboard.
-        logs: dict. Keys are scalar summary names, values are
-            NumPy scalars.
+    A writer will be created if it does not yet exist.
 
+    Args:
+      writer_name: The name of the directory for which to create or
+        retrieve a writer. Should be either `self._train_run_name` or
+        `self._validation_run_name`.
+
+    Returns:
+      A `SummaryWriter` object.
     """
-    logs = logs or {}
-    if context.executing_eagerly():
-      # use v2 summary ops
-      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
-        for name, value in logs.items():
-          if isinstance(value, np.ndarray):
-            value = value.item()
-          summary_ops_v2.scalar(name, value, step=step)
-    else:
-      # use FileWriter from v1 summary
-      for name, value in logs.items():
-        if isinstance(value, np.ndarray):
-          value = value.item()
-        summary = tf_summary.Summary()
-        summary_value = summary.value.add()
-        summary_value.simple_value = value
-        summary_value.tag = name
-        self.writer.add_summary(summary, step)
-    self.writer.flush()
+    if writer_name not in self._writers:
+      path = os.path.join(self.log_dir, writer_name)
+      writer = summary_ops_v2.create_file_writer_v2(path)
+      self._writers[writer_name] = writer
+    return self._writers[writer_name]
+
+  def on_train_begin(self, logs=None):
+    if self._profile_batch == 1:
+      summary_ops_v2.trace_on(graph=True, profiler=True)
+      self._is_tracing = True
 
   def on_batch_end(self, batch, logs=None):
-    """Writes scalar summaries for metrics on every training batch."""
-    # Don't output batch_size and batch number as Tensorboard summaries
+    """Writes scalar summaries for metrics on every training batch.
+
+    Performs profiling if current batch is in profiler_batches.
+    """
+    # Don't output batch_size and batch number as TensorBoard summaries
     logs = logs or {}
     self._samples_seen += logs.get('size', 1)
     samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
     if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
-      batch_logs = {('batch_' + k): v
-                    for k, v in logs.items()
-                    if k not in ['batch', 'size', 'num_steps']}
-      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._log_metrics(logs, prefix='batch_', step=self._total_batches_seen)
       self._samples_seen_at_last_write = self._samples_seen
     self._total_batches_seen += 1
-
-  def on_epoch_begin(self, epoch, logs=None):
-    """Add histogram op to Model eval_function callbacks, reset batch count."""
-
-    # check if histogram summary should be run for this epoch
-    if self.histogram_freq and epoch % self.histogram_freq == 0:
-      self._epoch = epoch
-      # pylint: disable=protected-access
-      # add the histogram summary op if it should run this epoch
-      self.model._make_eval_function()
-      if self.merged not in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.append(self.merged)
-        self.model._eval_function.fetch_callbacks[
-            self.merged] = self._fetch_callback
-      # pylint: enable=protected-access
+    if self._is_tracing:
+      self._log_trace()
+    elif (not self._is_tracing and
+          self._total_batches_seen == self._profile_batch - 1):
+      self._enable_trace()
 
   def on_epoch_end(self, epoch, logs=None):
-    """Checks if summary ops should run next epoch, logs scalar summaries."""
-
-    # don't output batch_size and
-    # batch number as Tensorboard summaries
-    logs = {('epoch_' + k): v
-            for k, v in logs.items()
-            if k not in ['batch', 'size', 'num_steps']}
-    if self.update_freq == 'epoch':
-      step = epoch
-    else:
-      step = self._samples_seen
-    self._write_custom_summaries(step, logs)
-
-    # pop the histogram summary op after each epoch
-    if self.histogram_freq:
-      # pylint: disable=protected-access
-      if self.merged in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.remove(self.merged)
-      if self.merged in self.model._eval_function.fetch_callbacks:
-        self.model._eval_function.fetch_callbacks.pop(self.merged)
-      # pylint: enable=protected-access
-
-    if self.embeddings_data is None and self.embeddings_freq:
-      raise ValueError('To visualize embeddings, embeddings_data must '
-                       'be provided.')
-
-    if self.embeddings_freq and self.embeddings_data is not None:
-      if epoch % self.embeddings_freq == 0:
-        # We need a second forward-pass here because we're passing
-        # the `embeddings_data` explicitly. This design allows to pass
-        # arbitrary data as `embeddings_data` and results from the fact
-        # that we need to know the size of the `tf.Variable`s which
-        # hold the embeddings in `set_model`. At this point, however,
-        # the `validation_data` is not yet set.
-
-        embeddings_data = self.embeddings_data
-        n_samples = embeddings_data[0].shape[0]
-        i = 0
-        while i < n_samples:
-          step = min(self.batch_size, n_samples - i)
-          batch = slice(i, i + step)
-
-          if isinstance(self.model.input, list):
-            feed_dict = {
-                model_input: embeddings_data[idx][batch]
-                for idx, model_input in enumerate(self.model.input)
-            }
-          else:
-            feed_dict = {self.model.input: embeddings_data[0][batch]}
+    """Runs metrics and histogram summaries at epoch end."""
+    step = epoch if self.update_freq == 'epoch' else self._samples_seen
+    self._log_metrics(logs, prefix='epoch_', step=step)
 
-          feed_dict.update({self.batch_id: i, self.step: step})
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._log_weights(epoch)
 
-          if not isinstance(K.learning_phase(), int):
-            feed_dict[K.learning_phase()] = False
+  def on_train_end(self, logs=None):
+    if self._is_tracing:
+      self._log_trace()
+    self._close_writers()
 
-          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
-          self.saver.save(self.sess,
-                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
-                          epoch)
+  def _enable_trace(self):
+    if context.executing_eagerly():
+      summary_ops_v2.trace_on(graph=True, profiler=True)
+      self._is_tracing = True
 
-          i += self.batch_size
+  def _log_trace(self):
+    if context.executing_eagerly():
+      with self._get_writer(self._train_run_name).as_default(), \
+          summary_ops_v2.always_record_summaries():
+        # TODO(b/126388999): Remove step info in the summary name.
+        summary_ops_v2.trace_export(
+            name='batch_%d' % self._total_batches_seen,
+            step=self._total_batches_seen,
+            profiler_outdir=os.path.join(self.log_dir, 'train'))
+      self._is_tracing = False
+
+  def _log_metrics(self, logs, prefix, step):
+    """Writes metrics out as custom scalar summaries.
 
-  def on_train_end(self, logs=None):
-    self.writer.close()
+    Arguments:
+        logs: Dict. Keys are scalar summary names, values are NumPy scalars.
+        prefix: String. The prefix to apply to the scalar summary names.
+        step: Int. The global step to use for TensorBoard.
+    """
+    if logs is None:
+      logs = {}
+
+    # Group metrics by the name of their associated file writer. Values
+    # are lists of metrics, as (name, scalar_value) pairs.
+    logs_by_writer = {
+        self._train_run_name: [],
+        self._validation_run_name: [],
+    }
+    validation_prefix = 'val_'
+    for (name, value) in logs.items():
+      if name in ('batch', 'size', 'num_steps'):
+        # Scrub non-metric items.
+        continue
+      if name.startswith(validation_prefix):
+        name = name[len(validation_prefix):]
+        writer_name = self._validation_run_name
+      else:
+        writer_name = self._train_run_name
+      name = prefix + name  # assign batch or epoch prefix
+      logs_by_writer[writer_name].append((name, value))
+
+    with context.eager_mode():
+      with summary_ops_v2.always_record_summaries():
+        for writer_name in logs_by_writer:
+          these_logs = logs_by_writer[writer_name]
+          if not these_logs:
+            # Don't create a "validation" events file if we don't
+            # actually have any validation data.
+            continue
+          writer = self._get_writer(writer_name)
+          with writer.as_default():
+            for (name, value) in these_logs:
+              summary_ops_v2.scalar(name, value, step=step)
+
+  def _log_weights(self, epoch):
+    """Logs the weights of the Model to TensorBoard."""
+    writer = self._get_writer(self._train_run_name)
+    with context.eager_mode(), \
+          writer.as_default(), \
+          summary_ops_v2.always_record_summaries():
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          weight_name = weight.name.replace(':', '_')
+          with ops.init_scope():
+            weight = K.get_value(weight)
+          summary_ops_v2.histogram(weight_name, weight, step=epoch)
+          if self.write_images:
+            self._log_weight_as_image(weight, weight_name, epoch)
+      writer.flush()
+
+  def _log_weight_as_image(self, weight, weight_name, epoch):
+    """Logs a weight as a TensorBoard image."""
+    w_img = array_ops.squeeze(weight)
+    shape = K.int_shape(w_img)
+    if len(shape) == 1:  # Bias case
+      w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+    elif len(shape) == 2:  # Dense layer kernel case
+      if shape[0] > shape[1]:
+        w_img = array_ops.transpose(w_img)
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+    elif len(shape) == 3:  # ConvNet case
+      if K.image_data_format() == 'channels_last':
+        # Switch to channels_first to display every kernel as a separate
+        # image.
+        w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [shape[0], shape[1], shape[2], 1])
+
+    shape = K.int_shape(w_img)
+    # Not possible to handle 3D convnets etc.
+    if len(shape) == 4 and shape[-1] in [1, 3, 4]:
+      summary_ops_v2.image(weight_name, w_img, step=epoch)
 
 
 @keras_export('keras.callbacks.ReduceLROnPlateau')
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 7af9a9a2b20322e74736ad91cb0db5dfa75fe6ef..d25ef6360f07d329eda9d36869afc90cccc0728d 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -23,21 +23,22 @@ import csv
 import os
 import re
 import shutil
-import tempfile
+import sys
 import threading
 import unittest
 
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary_iterator
 from tensorflow.python.training import adam
 
 try:
@@ -218,173 +219,219 @@ class CallbackCountsTest(keras_parameterized.TestCase):
         })
 
 
-class KerasCallbacksTest(test.TestCase):
+class KerasCallbacksTest(keras_parameterized.TestCase):
 
-  def test_ModelCheckpoint(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
+  def _get_model(self, input_shape=None):
+    layers = [
+        keras.layers.Dense(3, activation='relu'),
+        keras.layers.Dense(2, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=input_shape)
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=[keras.metrics.CategoricalAccuracy(name='my_acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
 
-    with self.cached_session():
-      np.random.seed(1337)
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging(self):
+    model = self._get_model(input_shape=(3,))
 
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
 
-      filepath = os.path.join(temp_dir, 'checkpoint.h5')
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-      # case 1
-      monitor = 'val_loss'
-      save_best_only = False
-      mode = 'auto'
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
 
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+  @keras_parameterized.run_with_all_model_types(exclude_models='functional')
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging_deferred_model_build(self):
+    model = self._get_model()
+    self.assertFalse(model.built)
 
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
 
-      # case 2
-      mode = 'min'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
 
-      # case 3
-      mode = 'max'
-      monitor = 'val_acc'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
-
-      # case 4
-      save_best_only = True
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+  @keras_parameterized.run_with_all_model_types
+  def test_ModelCheckpoint(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
 
-      # Case: metric not available.
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor='unknown',
-              save_best_only=True)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      # File won't be written.
-      assert not os.path.exists(filepath)
+    layers = [
+        keras.layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'),
+        keras.layers.Dense(NUM_CLASSES, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
-      # case 5
-      save_best_only = False
-      period = 2
-      mode = 'auto'
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
 
-      filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode,
-              period=period)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=4,
-          verbose=1)
-      assert os.path.exists(filepath.format(epoch=2))
-      assert os.path.exists(filepath.format(epoch=4))
-      os.remove(filepath.format(epoch=2))
-      os.remove(filepath.format(epoch=4))
-      assert not os.path.exists(filepath.format(epoch=1))
-      assert not os.path.exists(filepath.format(epoch=3))
-
-      # Invalid use: this will raise a warning but not an Exception.
-      keras.callbacks.ModelCheckpoint(
-          filepath,
-          monitor=monitor,
-          save_best_only=save_best_only,
-          mode='unknown')
+    filepath = os.path.join(temp_dir, 'checkpoint')
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+    # case 1
+    monitor = 'val_loss'
+    save_best_only = False
+    mode = 'auto'
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Dense(
+            NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+    model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
+
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 2
+    mode = 'min'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 3
+    mode = 'max'
+    monitor = 'val_acc'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 4
+    save_best_only = True
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # Case: metric not available.
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor='unknown',
+            save_best_only=True)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    # File won't be written.
+    assert not os.path.exists(filepath)
+
+    # case 5
+    save_best_only = False
+    period = 2
+    mode = 'auto'
+
+    filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode,
+            period=period)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=4,
+        verbose=1)
+    assert os.path.exists(filepath.format(epoch=2))
+    assert os.path.exists(filepath.format(epoch=4))
+    os.remove(filepath.format(epoch=2))
+    os.remove(filepath.format(epoch=4))
+    assert not os.path.exists(filepath.format(epoch=1))
+    assert not os.path.exists(filepath.format(epoch=3))
+
+    # Invalid use: this will raise a warning but not an Exception.
+    keras.callbacks.ModelCheckpoint(
+        filepath,
+        monitor=monitor,
+        save_best_only=save_best_only,
+        mode='unknown')
 
   def test_EarlyStopping(self):
     with self.cached_session():
@@ -399,9 +446,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+          loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
       cases = [
           ('max', 'val_acc'),
@@ -459,7 +504,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=1, num_classes=1, input_dim=1)
       model.compile(
-          optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+          optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])
 
       stopper = keras.callbacks.EarlyStopping(monitor='acc',
                                               baseline=baseline)
@@ -585,13 +630,15 @@ class KerasCallbacksTest(test.TestCase):
             optimizer=keras.optimizers.SGD(lr=0.1))
         return model
 
+      # TODO(psv): Make sure the callback works correctly when min_delta is
+      # set as 0. Test fails when the order of this callback and assertion is
+      # interchanged.
       model = make_model()
-      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=10,
+              min_delta=0,
               patience=1,
               cooldown=5)
       ]
@@ -601,19 +648,18 @@ class KerasCallbacksTest(test.TestCase):
           batch_size=BATCH_SIZE,
           validation_data=(x_test, y_test),
           callbacks=cbks,
-          epochs=5,
+          epochs=2,
           verbose=0)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)),
-          0.01,
-          atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
 
       model = make_model()
+      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=0,
+              min_delta=10,
               patience=1,
               cooldown=5)
       ]
@@ -623,10 +669,10 @@ class KerasCallbacksTest(test.TestCase):
           batch_size=BATCH_SIZE,
           validation_data=(x_test, y_test),
           callbacks=cbks,
-          epochs=5,
+          epochs=2,
           verbose=2)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
 
   def test_ReduceLROnPlateau_patience(self):
 
@@ -835,310 +881,6 @@ class KerasCallbacksTest(test.TestCase):
       self.assertEqual(len(loss), 1)
       self.assertEqual(loss[0], np.inf)
 
-  @test_util.run_deprecated_v1
-  def test_TensorBoard(self):
-    np.random.seed(1337)
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    def data_generator(train):
-      if train:
-        max_batch_index = len(x_train) // BATCH_SIZE
-      else:
-        max_batch_index = len(x_test) // BATCH_SIZE
-      i = 0
-      while 1:
-        if train:
-          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        else:
-          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        i += 1
-        i %= max_batch_index
-
-    # case: Sequential
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=temp_dir, histogram_freq=1, write_images=True,
-          write_grads=True, batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      # fit with validation data and accuracy
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
-
-      # fit generator with validation data
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data
-      # histogram_freq must be zero
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator with validation data and accuracy
-      tsb.histogram_freq = 1
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data and accuracy
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
-      assert os.path.exists(temp_dir)
-
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_multi_input_output(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          else:
-            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          i += 1
-          i %= max_batch_index
-
-      inp1 = keras.Input((INPUT_DIM,))
-      inp2 = keras.Input((INPUT_DIM,))
-      inp = keras.layers.add([inp1, inp2])
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model([inp1, inp2], [output1, output2])
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(log_dir=filepath,
-                                            histogram_freq=histogram_freq,
-                                            write_images=True, write_grads=True,
-                                            batch_size=5)]
-
-      # fit without validation data
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
-
-      # fit with validation data and accuracy
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                validation_data=([x_test] * 2, [y_test] * 2),
-                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
-
-      # fit generator without validation data
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          callbacks=callbacks_factory(histogram_freq=0))
-
-      # fit generator with validation data and accuracy
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          validation_data=([x_test] * 2, [y_test] * 2),
-                          callbacks=callbacks_factory(histogram_freq=1))
-      assert os.path.isdir(filepath)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_histogram_summaries_in_test_function(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.steps_seen = []
-
-      def add_summary(self, summary, global_step):
-        summary_obj = summary_pb2.Summary()
-
-        # ensure a valid Summary proto is being sent
-        if isinstance(summary, bytes):
-          summary_obj.ParseFromString(summary)
-        else:
-          assert isinstance(summary, summary_pb2.Summary)
-          summary_obj = summary
-
-        # keep track of steps seen for the merged_summary op,
-        # which contains the histogram summaries
-        if len(summary_obj.value) > 1:
-          self.steps_seen.append(global_step)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    def _init_writer(obj):
-      obj.writer = FileWriterStub(obj.log_dir)
-
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      keras.callbacks.TensorBoard._init_writer = _init_writer
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_histogram_summaries_with_generator(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
-
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation generator
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=2,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=cbks,
-          verbose=0)
-
-      with self.assertRaises(ValueError):
-        # fit with validation generator but no
-        # validation_steps
-        model.fit_generator(
-            generator(),
-            steps_per_epoch=2,
-            epochs=2,
-            validation_data=generator(),
-            callbacks=cbks,
-            verbose=0)
-
-      self.assertTrue(os.path.exists(tmpdir))
-
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
@@ -1186,239 +928,463 @@ class KerasCallbacksTest(test.TestCase):
       t.join()
       assert not t.is_alive()
 
-  def test_TensorBoard_with_ReduceLROnPlateau(self):
+  def test_RemoteMonitorWithJsonPayload(self):
+    if requests is None:
+      self.skipTest('`requests` required to run this test')
     with self.cached_session():
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
       (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
           train_samples=TRAIN_SAMPLES,
           test_samples=TEST_SAMPLES,
           input_shape=(INPUT_DIM,),
           num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+      y_test = keras.utils.np_utils.to_categorical(y_test)
+      y_train = keras.utils.np_utils.to_categorical(y_train)
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
       model.compile(
-          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
-
-      cbks = [
-          keras.callbacks.ReduceLROnPlateau(
-              monitor='val_loss', factor=0.5, patience=4, verbose=1),
-          keras.callbacks.TensorBoard(log_dir=temp_dir)
-      ]
+          loss='categorical_crossentropy',
+          optimizer='rmsprop',
+          metrics=['accuracy'])
+      cbks = [keras.callbacks.RemoteMonitor(send_as_json=True)]
 
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
+      with test.mock.patch.object(requests, 'post'):
+        model.fit(
+            x_train,
+            y_train,
+            batch_size=BATCH_SIZE,
+            validation_data=(x_test, y_test),
+            callbacks=cbks,
+            epochs=1)
 
-      assert os.path.exists(temp_dir)
 
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_batch_logging(self):
+# A summary that was emitted during a test. Fields:
+#   logdir: str. The logdir of the FileWriter to which the summary was
+#     written.
+#   tag: str. The name of the summary.
+_ObservedSummary = collections.namedtuple('_ObservedSummary', ('logdir', 'tag'))
 
-    class FileWriterStub(object):
 
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batches_logged = []
-        self.summary_values = []
-        self.summary_tags = []
+class _SummaryFile(object):
+  """A record of summary tags and the files to which they were written.
 
-      def add_summary(self, summary, step):
-        self.summary_values.append(summary.value[0].simple_value)
-        self.summary_tags.append(summary.value[0].tag)
-        self.batches_logged.append(step)
+  Fields `scalars`, `images`, `histograms`, and `tensors` are sets
+  containing `_ObservedSummary` values.
+  """
 
-      def flush(self):
-        pass
+  def __init__(self):
+    self.scalars = set()
+    self.images = set()
+    self.histograms = set()
+    self.tensors = set()
 
-      def close(self):
-        pass
 
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+def list_summaries(logdir):
+  """Read all summaries under the logdir into a `_SummaryFile`.
 
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
+  Args:
+    logdir: A path to a directory that contains zero or more event
+      files, either as direct children or in transitive subdirectories.
+      Summaries in these events must only contain old-style scalars,
+      images, and histograms. Non-summary events, like `graph_def`s, are
+      ignored.
 
-    for batch in range(5):
-      tb_cbk.on_batch_end(batch, {'acc': batch})
-    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
-    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
-    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+  Returns:
+    A `_SummaryFile` object reflecting all summaries written to any
+    event files in the logdir or any of its descendant directories.
 
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_epoch_and_batch_logging(self):
+  Raises:
+    ValueError: If an event file contains an summary of unexpected kind.
+  """
+  result = _SummaryFile()
+  for (dirpath, dirnames, filenames) in os.walk(logdir):
+    del dirnames  # unused
+    for filename in filenames:
+      if not filename.startswith('events.out.'):
+        continue
+      path = os.path.join(dirpath, filename)
+      for event in summary_iterator.summary_iterator(path):
+        if not event.summary:  # (e.g., it's a `graph_def` event)
+          continue
+        for value in event.summary.value:
+          tag = value.tag
+          # Case on the `value` rather than the summary metadata because
+          # the Keras callback uses `summary_ops_v2` to emit old-style
+          # summaries. See b/124535134.
+          kind = value.WhichOneof('value')
+          container = {
+              'simple_value': result.scalars,
+              'image': result.images,
+              'histo': result.histograms,
+              'tensor': result.tensors,
+          }.get(kind)
+          if container is None:
+            raise ValueError(
+                'Unexpected summary kind %r in event file %s:\n%r'
+                % (kind, path, event))
+          container.add(_ObservedSummary(logdir=dirpath, tag=tag))
+  return result
 
-    class FileWriterStub(object):
 
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TestTensorBoardV2(keras_parameterized.TestCase):
 
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summary = (step, summary)
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summary = (step, summary)
+  def setUp(self):
+    super(TestTensorBoardV2, self).setUp()
+    self.logdir = os.path.join(self.get_temp_dir(), 'tb')
+    self.train_dir = os.path.join(self.logdir, 'train')
+    self.validation_dir = os.path.join(self.logdir, 'validation')
 
-      def flush(self):
-        pass
+  def _get_model(self):
+    layers = [
+        keras.layers.Conv2D(8, (3, 3)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1)
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10, 10, 1))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    return model
 
-      def close(self):
-        pass
+  def test_TensorBoard_default_logdir(self):
+    """Regression test for cross-platform pathsep in default logdir."""
+    os.chdir(self.get_temp_dir())
 
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard()  # no logdir specified
 
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+
+    summary_file = list_summaries(logdir='.')
+    train_dir = os.path.join('.', 'logs', 'train')
+    validation_dir = os.path.join('.', 'logs', 'validation')
+    self.assertEqual(
+        summary_file.scalars, {
+            _ObservedSummary(logdir=train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=validation_dir, tag='epoch_loss'),
+        })
 
-    tb_cbk.on_batch_end(0, {'acc': 5.0})
-    batch_step, batch_summary = tb_cbk.writer.batch_summary
-    self.assertEqual(batch_step, 0)
-    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+  def test_TensorBoard_basic(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir)
 
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-    tb_cbk.on_epoch_end(0, {'acc': 10.0})
-    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
-    self.assertEqual(epoch_step, 0)
-    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+
+    summary_file = list_summaries(self.logdir)
+    self.assertEqual(
+        summary_file.scalars, {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        })
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_Tensorboard_eager(self):
-    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+  def test_TensorBoard_across_invocations(self):
+    """Regression test for summary writer resource use-after-free.
 
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
+    See: <https://github.com/tensorflow/tensorflow/issues/25707>
+    """
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir)
 
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
-    model.compile(
-        loss='binary_crossentropy',
-        optimizer=adam.AdamOptimizer(0.01),
-        metrics=['accuracy'])
+    for _ in (1, 2):
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    summary_file = list_summaries(self.logdir)
+    self.assertEqual(
+        summary_file.scalars, {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        })
 
-    cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
+  def test_TensorBoard_no_spurious_event_files(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir)
 
     model.fit(
-        x_train,
-        y_train,
-        batch_size=BATCH_SIZE,
-        validation_data=(x_test, y_test),
-        callbacks=cbks,
+        x,
+        y,
+        batch_size=2,
         epochs=2,
-        verbose=0)
-
-    self.assertTrue(os.path.exists(temp_dir))
+        callbacks=[tb_cbk])
 
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_update_freq(self):
+    events_file_run_basenames = set()
+    for (dirpath, dirnames, filenames) in os.walk(self.logdir):
+      del dirnames  # unused
+      if any(fn.startswith('events.out.') for fn in filenames):
+        events_file_run_basenames.add(os.path.basename(dirpath))
+    self.assertEqual(events_file_run_basenames, {'train'})
 
-    class FileWriterStub(object):
+  def test_TensorBoard_batch_metrics(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1)
 
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batch_summaries = []
-        self.epoch_summaries = []
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+
+    summary_file = list_summaries(self.logdir)
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='batch_loss'),
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+
+  def test_TensorBoard_weight_histograms(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir, histogram_freq=1)
+    model_type = testing_utils.get_model_type()
 
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summaries.append((step, summary))
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summaries.append((step, summary))
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms, model_type),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+
+  def test_TensorBoard_weight_images(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(
+        self.logdir, histogram_freq=1, write_images=True)
+    model_type = testing_utils.get_model_type()
 
-      def flush(self):
-        pass
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms, model_type),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.images, model_type),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0/image/0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/1'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0/image/2'),
+        },
+    )
+
+  def _strip_layer_names(self, summaries, model_type):
+    """Deduplicate summary names modulo layer prefix.
+
+    This removes the first slash-component of each tag name: for
+    instance, "foo/bar/baz" becomes "bar/baz".
+
+    Args:
+      summaries: A `set` of `_ObservedSummary` values.
+      model_type: The model type currently being tested.
+
+    Returns:
+      A new `set` of `_ObservedSummary` values with layer prefixes
+      removed.
+    """
+    result = set()
+    for summary in summaries:
+      if '/' not in summary.tag:
+        raise ValueError('tag has no layer name: %r' % summary.tag)
+      start_from = 2 if 'subclass' in model_type else 1
+      new_tag = '/'.join(summary.tag.split('/')[start_from:])
+      result.add(summary._replace(tag=new_tag))
+    return result
+
+  def test_TensorBoard_invalid_argument(self):
+    with self.assertRaisesRegexp(ValueError, 'Unrecognized arguments'):
+      keras.callbacks.TensorBoard(wwrite_images=True)
+
+
+# Note that this test specifies model_type explicitly.
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TestTensorBoardV2NonParameterizedTest(keras_parameterized.TestCase):
+
+  def setUp(self):
+    super(TestTensorBoardV2NonParameterizedTest, self).setUp()
+    self.logdir = os.path.join(self.get_temp_dir(), 'tb')
+    self.train_dir = os.path.join(self.logdir, 'train')
+    self.validation_dir = os.path.join(self.logdir, 'validation')
+
+  def _get_seq_model(self):
+    model = keras.models.Sequential([
+        keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1),
+    ])
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    return model
 
-      def close(self):
-        pass
+  def fitModelAndAssertKerasModelWritten(self, model):
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir,
+                                         write_graph=True,
+                                         profile_batch=0)
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
+    self.assertEqual(
+        summary_file.tensors,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='keras'),
+        },
+    )
+
+  def test_TensorBoard_writeSequentialModel_noInputShape(self):
+    model = keras.models.Sequential([
+        keras.layers.Conv2D(8, (3, 3)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1),
+    ])
+    model.compile('sgd', 'mse', run_eagerly=False)
+    self.fitModelAndAssertKerasModelWritten(model)
+
+  def test_TensorBoard_writeSequentialModel_withInputShape(self):
+    model = keras.models.Sequential([
+        keras.layers.Conv2D(8, (3, 3), input_shape=(10, 10, 1)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1),
+    ])
+    model.compile('sgd', 'mse', run_eagerly=False)
+    self.fitModelAndAssertKerasModelWritten(model)
+
+  def test_TensoriBoard_writeModel(self):
+    inputs = keras.layers.Input([10, 10, 1])
+    x = keras.layers.Conv2D(8, (3, 3), activation='relu')(inputs)
+    x = keras.layers.Flatten()(x)
+    x = keras.layers.Dense(1)(x)
+    model = keras.models.Model(inputs=inputs, outputs=[x])
+    model.compile('sgd', 'mse', run_eagerly=False)
+    self.fitModelAndAssertKerasModelWritten(model)
+
+  def test_TensorBoard_autoTrace(self):
+    model = self._get_seq_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(
+        self.logdir, histogram_freq=1, profile_batch=1, write_graph=False)
 
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
+
+    self.assertEqual(
+        summary_file.tensors,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag=u'batch_1'),
+        },
+    )
+
+  def test_TensorBoard_autoTrace_tagNameWithBatchNum(self):
+    model = self._get_seq_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(
+        self.logdir, histogram_freq=1, profile_batch=2, write_graph=False)
 
-    # Epoch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(tb_cbk.writer.batch_summaries, [])
-    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
-
-    # Batch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
-
-    # Integer mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq=20)
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertFalse(tb_cbk.writer.batch_summaries)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
+
+    self.assertEqual(
+        summary_file.tensors,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag=u'batch_2'),
+        },
+    )
+
+  def test_TensorBoard_autoTrace_profile_batch_largerThanBatchCount(self):
+    model = self._get_seq_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(
+        self.logdir, histogram_freq=1, profile_batch=10000, write_graph=False)
 
-  def test_RemoteMonitorWithJsonPayload(self):
-    if requests is None:
-      self.skipTest('`requests` required to run this test')
-    with self.cached_session():
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.np_utils.to_categorical(y_test)
-      y_train = keras.utils.np_utils.to_categorical(y_train)
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
-      cbks = [keras.callbacks.RemoteMonitor(send_as_json=True)]
+    model.fit(
+        x,
+        y,
+        batch_size=2,
+        epochs=2,
+        validation_data=(x, y),
+        callbacks=[tb_cbk])
+    summary_file = list_summaries(self.logdir)
 
-      with test.mock.patch.object(requests, 'post'):
-        model.fit(
-            x_train,
-            y_train,
-            batch_size=BATCH_SIZE,
-            validation_data=(x_test, y_test),
-            callbacks=cbks,
-            epochs=1)
+    # Enabled trace only on the 10000th batch, thus it should be empty.
+    self.assertEmpty(summary_file.tensors)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/callbacks_v1.py b/tensorflow/python/keras/callbacks_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e403887181c92c0562269df8b66c0a5ee5d0a191
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1.py
@@ -0,0 +1,457 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import profiler
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export(v1=['keras.callbacks.TensorBoard'])
+class TensorBoard(callbacks.Callback):
+  # pylint: disable=line-too-long
+  """Enable visualizations for TensorBoard.
+
+  TensorBoard is a visualization tool provided with TensorFlow.
+
+  This callback logs events for TensorBoard, including:
+  * Metrics summary plots
+  * Training graph visualization
+  * Activation histograms
+  * Sampled profiling
+
+  If you have installed TensorFlow with pip, you should be able
+  to launch TensorBoard from the command line:
+
+  ```sh
+  tensorboard --logdir=path_to_your_logs
+  ```
+
+  You can find more information about TensorBoard
+  [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+  Arguments:
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_grads: whether to visualize gradient histograms in TensorBoard.
+        `histogram_freq` must be greater than 0.
+      batch_size: size of batch of inputs to feed to the network for histograms
+        computation.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
+      embeddings_freq: frequency (in epochs) at which selected embedding layers
+        will be saved. If set to 0, embeddings won't be computed. Data to be
+        visualized in TensorBoard's Embedding tab must be passed as
+        `embeddings_data`.
+      embeddings_layer_names: a list of names of layers to keep eye on. If None
+        or empty list all the embedding layer will be watched.
+      embeddings_metadata: a dictionary which maps layer name to a file name in
+        which metadata for this embedding layer is saved. See the
+          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
+            about metadata files format. In case if the same metadata file is
+            used for all embedding layers, string can be passed.
+      embeddings_data: data to be embedded at layers specified in
+        `embeddings_layer_names`. Numpy array (if the model has a single input)
+        or list of Numpy arrays (if the model has multiple inputs). Learn [more
+        about
+            embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
+      profile_batch: Profile the batch to sample compute characteristics. By
+        default, it will profile the second batch. Set profile_batch=0 to
+        disable profiling.
+
+  Raises:
+      ValueError: If histogram_freq is set and no validation data is provided.
+
+  @compatibility(eager)
+  Using the `TensorBoard` callback will work when eager execution is enabled,
+  with the restriction that outputting histogram summaries of weights and
+  gradients is not supported. Consequently, `histogram_freq` will be ignored.
+  @end_compatibility
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               log_dir='./logs',
+               histogram_freq=0,
+               batch_size=32,
+               write_graph=True,
+               write_grads=False,
+               write_images=False,
+               embeddings_freq=0,
+               embeddings_layer_names=None,
+               embeddings_metadata=None,
+               embeddings_data=None,
+               update_freq='epoch',
+               profile_batch=2):
+    super(TensorBoard, self).__init__()
+    self.log_dir = log_dir
+    self.histogram_freq = histogram_freq
+    if self.histogram_freq and context.executing_eagerly():
+      logging.warning(
+          UserWarning('Weight and gradient histograms not supported for eager'
+                      'execution, setting `histogram_freq` to `0`.'))
+      self.histogram_freq = 0
+    self.merged = None
+    self.write_graph = write_graph
+    self.write_grads = write_grads
+    self.write_images = write_images
+    self.batch_size = batch_size
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_layer_names = embeddings_layer_names
+    self.embeddings_metadata = embeddings_metadata
+    self.embeddings_data = embeddings_data
+    if update_freq == 'batch':
+      self.update_freq = 1
+    else:
+      self.update_freq = update_freq
+    self._samples_seen = 0
+    self._samples_seen_at_last_write = 0
+    # TODO(fishx): Add a link to the full profiler tutorial.
+    self._profile_batch = profile_batch
+    # One profiler session is running if it is True.
+    self._is_profiling = False
+
+    # TensorBoard should only write summaries on the chief when in a
+    # Multi-Worker setting.
+    self._chief_worker_only = True
+
+  def _init_writer(self, model):
+    """Sets file writer."""
+    if context.executing_eagerly():
+      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
+      if not model.run_eagerly and self.write_graph:
+        with self.writer.as_default():
+          summary_ops_v2.graph(K.get_graph())
+    elif self.write_graph:
+      self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
+    else:
+      self.writer = tf_summary.FileWriter(self.log_dir)
+
+  def _make_histogram_ops(self, model):
+    """Defines histogram ops when histogram_freq > 0."""
+    # only make histogram summary op if it hasn't already been made
+    if self.histogram_freq and self.merged is None:
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          mapped_weight_name = weight.name.replace(':', '_')
+          tf_summary.histogram(mapped_weight_name, weight)
+          if self.write_images:
+            w_img = array_ops.squeeze(weight)
+            shape = K.int_shape(w_img)
+            if len(shape) == 2:  # dense layer kernel case
+              if shape[0] > shape[1]:
+                w_img = array_ops.transpose(w_img)
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+            elif len(shape) == 3:  # convnet case
+              if K.image_data_format() == 'channels_last':
+                # switch to channels_first to display
+                # every kernel as a separate image
+                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img,
+                                        [shape[0], shape[1], shape[2], 1])
+            elif len(shape) == 1:  # bias case
+              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+            else:
+              # not possible to handle 3D convnets etc.
+              continue
+
+            shape = K.int_shape(w_img)
+            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
+            tf_summary.image(mapped_weight_name, w_img)
+
+        if self.write_grads:
+          for weight in layer.trainable_weights:
+            mapped_weight_name = weight.name.replace(':', '_')
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+
+            def is_indexed_slices(grad):
+              return type(grad).__name__ == 'IndexedSlices'
+
+            grads = [
+                grad.values if is_indexed_slices(grad) else grad
+                for grad in grads
+            ]
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
+
+        if hasattr(layer, 'output'):
+          if isinstance(layer.output, list):
+            for i, output in enumerate(layer.output):
+              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
+          else:
+            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+
+  def set_model(self, model):
+    """Sets Keras model and creates summary ops."""
+
+    self.model = model
+    self._init_writer(model)
+    # histogram summaries only enabled in graph mode
+    if not context.executing_eagerly():
+      self._make_histogram_ops(model)
+      self.merged = tf_summary.merge_all()
+
+    # If both embedding_freq and embeddings_data are available, we will
+    # visualize embeddings.
+    if self.embeddings_freq and self.embeddings_data is not None:
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
+
+      # If embedding_layer_names are not provided, get all of the embedding
+      # layers from the model.
+      embeddings_layer_names = self.embeddings_layer_names
+      if not embeddings_layer_names:
+        embeddings_layer_names = [
+            layer.name
+            for layer in self.model.layers
+            if type(layer).__name__ == 'Embedding'
+        ]
+
+      self.assign_embeddings = []
+      embeddings_vars = {}
+
+      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
+      self.step = step = array_ops.placeholder(dtypes.int32)
+
+      for layer in self.model.layers:
+        if layer.name in embeddings_layer_names:
+          embedding_input = self.model.get_layer(layer.name).output
+          embedding_size = np.prod(embedding_input.shape[1:])
+          embedding_input = array_ops.reshape(embedding_input,
+                                              (step, int(embedding_size)))
+          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
+          embedding = variables.Variable(
+              array_ops.zeros(shape), name=layer.name + '_embedding')
+          embeddings_vars[layer.name] = embedding
+          batch = state_ops.assign(embedding[batch_id:batch_id + step],
+                                   embedding_input)
+          self.assign_embeddings.append(batch)
+
+      self.saver = saver.Saver(list(embeddings_vars.values()))
+
+      # Create embeddings_metadata dictionary
+      if isinstance(self.embeddings_metadata, str):
+        embeddings_metadata = {
+            layer_name: self.embeddings_metadata
+            for layer_name in embeddings_vars.keys()
+        }
+      else:
+        # If embedding_metadata is already a dictionary
+        embeddings_metadata = self.embeddings_metadata
+
+      try:
+        from tensorboard.plugins import projector
+      except ImportError:
+        raise ImportError('Failed to import TensorBoard. Please make sure that '
+                          'TensorBoard integration is complete."')
+
+      # TODO(psv): Add integration tests to test embedding visualization
+      # with TensorBoard callback. We are unable to write a unit test for this
+      # because TensorBoard dependency assumes TensorFlow package is installed.
+      config = projector.ProjectorConfig()
+      for layer_name, tensor in embeddings_vars.items():
+        embedding = config.embeddings.add()
+        embedding.tensor_name = tensor.name
+
+        if (embeddings_metadata is not None and
+            layer_name in embeddings_metadata):
+          embedding.metadata_path = embeddings_metadata[layer_name]
+
+      projector.visualize_embeddings(self.writer, config)
+
+  def _fetch_callback(self, summary):
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
+
+  def _write_custom_summaries(self, step, logs=None):
+    """Writes metrics out as custom scalar summaries.
+
+    Arguments:
+        step: the global step to use for TensorBoard.
+        logs: dict. Keys are scalar summary names, values are
+            NumPy scalars.
+
+    """
+    logs = logs or {}
+    if context.executing_eagerly():
+      # use v2 summary ops
+      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
+        for name, value in logs.items():
+          if isinstance(value, np.ndarray):
+            value = value.item()
+          summary_ops_v2.scalar(name, value, step=step)
+    else:
+      # use FileWriter from v1 summary
+      for name, value in logs.items():
+        if isinstance(value, np.ndarray):
+          value = value.item()
+        summary = tf_summary.Summary()
+        summary_value = summary.value.add()
+        summary_value.simple_value = value
+        summary_value.tag = name
+        self.writer.add_summary(summary, step)
+    self.writer.flush()
+
+  def on_batch_end(self, batch, logs=None):
+    """Writes scalar summaries for metrics on every training batch.
+
+    Performs profiling if current batch is in profiler_batches.
+    """
+    # Don't output batch_size and batch number as TensorBoard summaries
+    logs = logs or {}
+    self._samples_seen += logs.get('size', 1)
+    samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
+    if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
+      batch_logs = {('batch_' + k): v
+                    for k, v in logs.items()
+                    if k not in ['batch', 'size', 'num_steps']}
+      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._samples_seen_at_last_write = self._samples_seen
+    self._total_batches_seen += 1
+    if self._is_profiling:
+      profiler.save(self.log_dir, profiler.stop())
+      self._is_profiling = False
+    elif (not self._is_profiling and
+          self._total_batches_seen == self._profile_batch - 1):
+      profiler.start()
+      self._is_profiling = True
+
+  def on_train_begin(self, logs=None):
+    if self._profile_batch == 1:
+      profiler.start()
+      self._is_profiling = True
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Add histogram op to Model eval_function callbacks, reset batch count."""
+
+    # check if histogram summary should be run for this epoch
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._epoch = epoch
+      # pylint: disable=protected-access
+      # add the histogram summary op if it should run this epoch
+      self.model._make_test_function()
+      if self.merged not in self.model.test_function.fetches:
+        self.model.test_function.fetches.append(self.merged)
+        self.model.test_function.fetch_callbacks[
+            self.merged] = self._fetch_callback
+      # pylint: enable=protected-access
+
+  def on_epoch_end(self, epoch, logs=None):
+    """Checks if summary ops should run next epoch, logs scalar summaries."""
+
+    # don't output batch_size and
+    # batch number as TensorBoard summaries
+    logs = {('epoch_' + k): v
+            for k, v in logs.items()
+            if k not in ['batch', 'size', 'num_steps']}
+    if self.update_freq == 'epoch':
+      step = epoch
+    else:
+      step = self._samples_seen
+    self._write_custom_summaries(step, logs)
+
+    # pop the histogram summary op after each epoch
+    if self.histogram_freq:
+      # pylint: disable=protected-access
+      if self.merged in self.model.test_function.fetches:
+        self.model.test_function.fetches.remove(self.merged)
+      if self.merged in self.model.test_function.fetch_callbacks:
+        self.model.test_function.fetch_callbacks.pop(self.merged)
+      # pylint: enable=protected-access
+
+    if self.embeddings_data is None and self.embeddings_freq:
+      raise ValueError('To visualize embeddings, embeddings_data must '
+                       'be provided.')
+
+    if self.embeddings_freq and self.embeddings_data is not None:
+      if epoch % self.embeddings_freq == 0:
+        # We need a second forward-pass here because we're passing
+        # the `embeddings_data` explicitly. This design allows to pass
+        # arbitrary data as `embeddings_data` and results from the fact
+        # that we need to know the size of the `tf.Variable`s which
+        # hold the embeddings in `set_model`. At this point, however,
+        # the `validation_data` is not yet set.
+
+        embeddings_data = self.embeddings_data
+        n_samples = embeddings_data[0].shape[0]
+        i = 0
+        sess = K.get_session()
+        while i < n_samples:
+          step = min(self.batch_size, n_samples - i)
+          batch = slice(i, i + step)
+
+          if isinstance(self.model.input, list):
+            feed_dict = {
+                model_input: embeddings_data[idx][batch]
+                for idx, model_input in enumerate(self.model.input)
+            }
+          else:
+            feed_dict = {self.model.input: embeddings_data[0][batch]}
+
+          feed_dict.update({self.batch_id: i, self.step: step})
+
+          if not isinstance(K.learning_phase(), int):
+            feed_dict[K.learning_phase()] = False
+
+          sess.run(self.assign_embeddings, feed_dict=feed_dict)
+          self.saver.save(sess,
+                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
+                          epoch)
+
+          i += self.batch_size
+
+  def on_train_end(self, logs=None):
+    if self._is_profiling:
+      profiler.save(self.log_dir, profiler.stop())
+      self._is_profiling = False
+    self.writer.close()
diff --git a/tensorflow/python/keras/callbacks_v1_test.py b/tensorflow/python/keras/callbacks_v1_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cb02b84c8f9a908dd203c1c01e4f8f5aafaa857
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1_test.py
@@ -0,0 +1,567 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import callbacks_v1
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+TRAIN_SAMPLES = 10
+TEST_SAMPLES = 10
+NUM_CLASSES = 2
+INPUT_DIM = 3
+NUM_HIDDEN = 5
+BATCH_SIZE = 5
+
+
+class TestTensorBoardV1(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard(self):
+    np.random.seed(1337)
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    def data_generator(train):
+      if train:
+        max_batch_index = len(x_train) // BATCH_SIZE
+      else:
+        max_batch_index = len(x_test) // BATCH_SIZE
+      i = 0
+      while 1:
+        if train:
+          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        else:
+          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        i += 1
+        i %= max_batch_index
+
+    # case: Sequential
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=temp_dir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      # fit with validation data and accuracy
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      # fit generator with validation data
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data
+      # histogram_freq must be zero
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator with validation data and accuracy
+      tsb.histogram_freq = 1
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data and accuracy
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_multi_input_output(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    with self.cached_session():
+      filepath = os.path.join(tmpdir, 'logs')
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      def data_generator(train):
+        if train:
+          max_batch_index = len(x_train) // BATCH_SIZE
+        else:
+          max_batch_index = len(x_test) // BATCH_SIZE
+        i = 0
+        while 1:
+          if train:
+            # simulate multi-input/output models
+            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          else:
+            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          i += 1
+          i %= max_batch_index
+
+      inp1 = keras.Input((INPUT_DIM,))
+      inp2 = keras.Input((INPUT_DIM,))
+      inp = keras.layers.add([inp1, inp2])
+      hidden = keras.layers.Dense(2, activation='relu')(inp)
+      hidden = keras.layers.Dropout(0.1)(hidden)
+      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      model = keras.models.Model([inp1, inp2], [output1, output2])
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='sgd',
+                    metrics=['accuracy'])
+
+      # we must generate new callbacks for each test, as they aren't stateless
+      def callbacks_factory(histogram_freq):
+        return [
+            callbacks_v1.TensorBoard(
+                log_dir=filepath,
+                histogram_freq=histogram_freq,
+                write_images=True,
+                write_grads=True,
+                batch_size=5)
+        ]
+
+      # fit without validation data
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
+
+      # fit with validation data and accuracy
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                validation_data=([x_test] * 2, [y_test] * 2),
+                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
+
+      # fit generator without validation data
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          callbacks=callbacks_factory(histogram_freq=0))
+
+      # fit generator with validation data and accuracy
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          validation_data=([x_test] * 2, [y_test] * 2),
+                          callbacks=callbacks_factory(histogram_freq=1))
+      assert os.path.isdir(filepath)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_in_test_function(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.steps_seen = []
+
+      def add_summary(self, summary, global_step):
+        summary_obj = summary_pb2.Summary()
+
+        # ensure a valid Summary proto is being sent
+        if isinstance(summary, bytes):
+          summary_obj.ParseFromString(summary)
+        else:
+          assert isinstance(summary, summary_pb2.Summary)
+          summary_obj = summary
+
+        # keep track of steps seen for the merged_summary op,
+        # which contains the histogram summaries
+        if len(summary_obj.value) > 1:
+          self.steps_seen.append(global_step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    def _init_writer(obj, _):
+      obj.writer = FileWriterStub(obj.log_dir)
+
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      callbacks_v1.TensorBoard._init_writer = _init_writer
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_with_generator(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    def generator():
+      x = np.random.randn(10, 100).astype(np.float32)
+      y = np.random.randn(10, 10).astype(np.float32)
+      while True:
+        yield x, y
+
+    with self.cached_session():
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=10, input_dim=100)
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation generator
+      model.fit_generator(
+          generator(),
+          steps_per_epoch=2,
+          epochs=2,
+          validation_data=generator(),
+          validation_steps=2,
+          callbacks=cbks,
+          verbose=0)
+
+      with self.assertRaises(ValueError):
+        # fit with validation generator but no
+        # validation_steps
+        model.fit_generator(
+            generator(),
+            steps_per_epoch=2,
+            epochs=2,
+            validation_data=generator(),
+            callbacks=cbks,
+            verbose=0)
+
+      self.assertTrue(os.path.exists(tmpdir))
+
+  def test_TensorBoard_with_ReduceLROnPlateau(self):
+    with self.cached_session():
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+      model.compile(
+          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
+
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss', factor=0.5, patience=4, verbose=1),
+          callbacks_v1.TensorBoard(log_dir=temp_dir)
+      ]
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batches_logged = []
+        self.summary_values = []
+        self.summary_tags = []
+
+      def add_summary(self, summary, step):
+        self.summary_values.append(summary.value[0].simple_value)
+        self.summary_tags.append(summary.value[0].tag)
+        self.batches_logged.append(step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    for batch in range(5):
+      tb_cbk.on_batch_end(batch, {'acc': batch})
+    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_epoch_and_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summary = (step, summary)
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summary = (step, summary)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0})
+    tb_cbk.on_train_end()
+    batch_step, batch_summary = tb_cbk.writer.batch_summary
+    self.assertEqual(batch_step, 0)
+    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+    tb_cbk.on_epoch_end(0, {'acc': 10.0})
+    tb_cbk.on_train_end()
+    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+    self.assertEqual(epoch_step, 0)
+    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_Tensorboard_eager(self):
+    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=adam.AdamOptimizer(0.01),
+        metrics=['accuracy'])
+
+    cbks = [callbacks_v1.TensorBoard(log_dir=temp_dir)]
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=2,
+        verbose=0)
+
+    self.assertTrue(os.path.exists(temp_dir))
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_update_freq(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batch_summaries = []
+        self.epoch_summaries = []
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summaries.append((step, summary))
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summaries.append((step, summary))
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    # Epoch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(tb_cbk.writer.batch_summaries, [])
+    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
+    tb_cbk.on_train_end()
+
+    # Batch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+    tb_cbk.on_train_end()
+
+    # Integer mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq=20)
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertFalse(tb_cbk.writer.batch_summaries)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+    tb_cbk.on_train_end()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 7bfc301239571525bc652257aaafb7abdd308dc2..cf70c28e75e8510370343751ca690c8d69e783ca 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -26,6 +26,7 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.distribute import values as distribute_values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -38,6 +39,8 @@ from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import input_spec
+from tensorflow.python.keras.mixed_precision.experimental import autocast_variable
+from tensorflow.python.keras.mixed_precision.experimental import policy
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 # A module that only depends on `keras.layers` import these from here.
@@ -45,9 +48,12 @@ from tensorflow.python.keras.utils.generic_utils import to_snake_case  # pylint:
 from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
+from tensorflow.python.training.tracking import object_identity
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -57,7 +63,7 @@ from tensorflow.tools.docs import doc_controls
 
 
 @keras_export('keras.layers.Layer')
-class Layer(checkpointable.Checkpointable):
+class Layer(trackable.Trackable):
   """Base layer class.
 
   This is the class from which all layers inherit.
@@ -110,7 +116,7 @@ class Layer(checkpointable.Checkpointable):
       constraints on inputs that can be accepted by the layer.
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, trainable=True, name=None, dtype=None, dynamic=False,
                **kwargs):
     # These properties should be set by the user via keyword arguments.
@@ -170,7 +176,9 @@ class Layer(checkpointable.Checkpointable):
     # A dictionary that maps metric names to metric result tensors. The results
     # are the running averages of metric values over an epoch.
     self._metrics_tensors = {}
-    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
+
+    self._set_dtype_and_policy(dtype)
+
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
                                    hasattr(self, 'compute_mask'))
@@ -213,6 +221,12 @@ class Layer(checkpointable.Checkpointable):
     else:
       self._initial_weights = None
 
+    # This flag is used to keep track of whether symbolic tensors are added to
+    # the model outside of the call context. This is required for disabling
+    # `run_eagerly` on compile.
+    # TODO(b/124303407): Remove this flag after we add support for the use case.
+    self._contains_symbolic_tensors = False
+
   def build(self, input_shape):
     """Creates the variables of the layer (optional, for subclass implementers).
 
@@ -244,8 +258,8 @@ class Layer(checkpointable.Checkpointable):
 
   @doc_controls.for_subclass_implementers
   def add_weight(self,
-                 name,
-                 shape,
+                 name=None,
+                 shape=None,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
@@ -259,8 +273,8 @@ class Layer(checkpointable.Checkpointable):
     """Adds a new variable to the layer.
 
     Arguments:
-      name: variable name.
-      shape: variable shape.
+      name: Variable name.
+      shape: Variable shape. Defaults to scalar if unspecified.
       dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
       initializer: initializer instance (callable).
       regularizer: regularizer instance (callable).
@@ -272,7 +286,7 @@ class Layer(checkpointable.Checkpointable):
         marked as non-trainable. `trainable` defaults to `True` unless
         `synchronization` is set to `ON_READ`.
       constraint: constraint instance (callable).
-      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      partitioner: Partitioner to be passed to the `Trackable` API.
       use_resource: Whether to use `ResourceVariable`.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
@@ -297,16 +311,23 @@ class Layer(checkpointable.Checkpointable):
       ValueError: When giving unsupported dtype and no initializer or when
         trainable has been set to True with synchronization set as `ON_READ`.
     """
+    if shape is None:
+      shape = ()
     # Validate optional keyword arguments.
     for kwarg in kwargs:
-      if kwarg not in ['getter', 'collections']:
+      if kwarg not in ['getter', 'collections', 'experimental_autocast']:
         raise TypeError('Unknown keyword argument:', kwarg)
     getter = kwargs.pop('getter', None)
     collections = kwargs.pop('collections', None)
+    # 'experimental_autocast' can be set to False by the caller to indicate an
+    # AutoCastVariable should never be created.
+    autocast = kwargs.pop('experimental_autocast', True)
 
     if dtype is None:
       dtype = self.dtype or backend.floatx()
     dtype = dtypes.as_dtype(dtype)
+    if self._dtype is None:
+      self._dtype = dtype.base_dtype.name
     initializer = initializers.get(initializer)
     regularizer = regularizers.get(regularizer)
     constraint = constraints.get(constraint)
@@ -342,9 +363,9 @@ class Layer(checkpointable.Checkpointable):
         name=name,
         shape=shape,
         # TODO(allenl): a `make_variable` equivalent should be added as a
-        # `Checkpointable` method.
+        # `Trackable` method.
         getter=getter or base_layer_utils.make_variable,
-        # Manage errors in Layer rather than Checkpointable.
+        # Manage errors in Layer rather than Trackable.
         overwrite=True,
         initializer=initializer,
         dtype=dtype,
@@ -357,12 +378,20 @@ class Layer(checkpointable.Checkpointable):
         aggregation=aggregation)
     backend.track_variable(variable)
 
+    if autocast and self._mixed_precision_policy.should_cast_variables:
+      if isinstance(variable, distribute_values.DistributedVariable):
+        variable = autocast_variable.AutoCastDistributedVariable(variable)
+      else:
+        variable = autocast_variable.AutoCastVariable(variable)
+
     if regularizer is not None:
       # TODO(fchollet): in the future, this should be handled at the
       # level of variable creation, and weight regularization losses
       # should be variable attributes.
-      self._handle_weight_regularization(name, variable, regularizer)
-
+      name_in_scope = variable.name[:variable.name.find(':')]
+      self._handle_weight_regularization(name_in_scope,
+                                         variable,
+                                         regularizer)
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -389,6 +418,7 @@ class Layer(checkpointable.Checkpointable):
       config['batch_input_shape'] = self._batch_input_shape
     if hasattr(self, 'dtype'):
       config['dtype'] = self.dtype
+    # TODO(reedwm): Handle serializing self._mixed_precision_policy.
     return config
 
   @classmethod
@@ -435,13 +465,9 @@ class Layer(checkpointable.Checkpointable):
       with context.graph_mode():
         graph = func_graph.FuncGraph('graph')
         with graph.as_default():
-          if isinstance(input_shape, list):
-            inputs = [base_layer_utils.generate_placeholders_from_shape(shape)
-                      for shape in input_shape]
-          else:
-            inputs = base_layer_utils.generate_placeholders_from_shape(
-                input_shape)
-
+          input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
+          inputs = nest.map_structure(
+              base_layer_utils.generate_placeholders_from_shape, input_shape)
           try:
             if self._expects_training_arg:
               outputs = self(inputs, training=False)
@@ -453,10 +479,7 @@ class Layer(checkpointable.Checkpointable):
                                       ' Please implement the '
                                       '`compute_output_shape` method on your '
                                       'layer (%s).' % self.__class__.__name__)
-      if isinstance(outputs, list):
-        return [output.shape for output in outputs]
-      else:
-        return outputs.shape
+      return nest.map_structure(lambda t: t.shape, outputs)
     raise NotImplementedError
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
@@ -471,15 +494,10 @@ class Layer(checkpointable.Checkpointable):
             one per output tensor of the layer).
     """
     if not self.supports_masking:
-      if mask is not None:
-        if isinstance(mask, list):
-          if any(m is not None for m in mask):
-            raise TypeError('Layer ' + self.name + ' does not support masking, '
-                            'but was passed an input_mask: ' + str(mask))
-        else:
-          raise TypeError('Layer ' + self.name + ' does not support masking, '
-                          'but was passed an input_mask: ' + str(mask))
-      # masking not explicitly supported: return None as mask
+      if any(m is not None for m in nest.flatten(mask)):
+        raise TypeError('Layer ' + self.name + ' does not support masking, '
+                        'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask.
       return None
     # if masking is explicitly supported, by default
     # carry over the input mask
@@ -511,11 +529,17 @@ class Layer(checkpointable.Checkpointable):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     input_list = nest.flatten(inputs)
-    if context.executing_eagerly():
-      # Accept NumPy inputs by converting to Tensors when executing eagerly.
-      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
-        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
-        input_list = nest.flatten(inputs)
+    # Accept NumPy inputs by converting to Tensors.
+    if any(isinstance(x, (np.ndarray, float, int)) for x in input_list):
+      # Don't call `ops.convert_to_tensor` on all `inputs` because
+      # `SparseTensors` can't be converted to `Tensor`.
+      def _convert_non_tensor(x):
+        if isinstance(x, (np.ndarray, float, int)):
+          return ops.convert_to_tensor(x)
+        return x
+
+      inputs = nest.map_structure(_convert_non_tensor, inputs)
+      input_list = nest.flatten(inputs)
 
     # We will attempt to build a TF graph if & only if all inputs are symbolic.
     # This is always the case in graph mode. It can also be the case in eager
@@ -527,52 +551,73 @@ class Layer(checkpointable.Checkpointable):
       # Only create Keras history if at least one tensor originates from a
       # `keras.Input`. Otherwise this Layer may be being used outside the Keras
       # framework.
-      if base_layer_utils.uses_keras_input_layers(inputs):
+      if base_layer_utils.needs_keras_history(inputs):
         base_layer_utils.create_keras_history(inputs)
 
     # Handle Keras mask propagation from previous layer to current layer.
     previous_mask = None
-    if build_graph and (not hasattr(self, '_compute_previous_mask') or
-                        self._compute_previous_mask):
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
       previous_mask = base_layer_utils.collect_previous_mask(inputs)
       if not hasattr(self, '_call_fn_args'):
         self._call_fn_args = function_utils.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
           not generic_utils.is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
+        # The previous layer generated a mask, and mask was not explicitly
+        # pass to __call__, hence we set previous_mask as the default value.
         kwargs['mask'] = previous_mask
 
-    with ops.name_scope(self._name_scope()):
-      if not self.built:
-        # Build layer if applicable (if the `build` method has been overridden).
-        self._maybe_build(inputs)
-        # We must set self.built since user defined build functions are not
-        # constrained to set self.built.
-        self.built = True
+    # Clear eager losses on top level model call.
+    # We are clearing the losses only on the top level model call and not on
+    # every layer/mode call because layer/model may be reused.
+    if (context.executing_eagerly() and
+        not base_layer_utils.is_in_call_context()):
+      self._clear_losses()
 
+    with base_layer_utils.call_context():
       # Check input assumptions set after layer building, e.g. input shape.
       if build_graph:
         # Symbolic execution on symbolic tensors. We will attempt to build
         # the corresponding TF subgraph inside `backend.get_graph()`
-        input_spec.assert_input_compatibility(
-            self.input_spec, inputs, self.name)
+        input_spec.assert_input_compatibility(self.input_spec, inputs,
+                                              self.name)
         graph = backend.get_graph()
-        with graph.as_default():
+        with graph.as_default(), ops.name_scope(self._name_scope()):
+          # Build layer if applicable (if the `build` method has been
+          # overridden).
+          self._maybe_build(inputs)
+          # Explicitly pass the learning phase placeholder to `call` if
+          # the `training` argument was left unspecified by the user.
+          # This behavior is restricted to the managed Keras FuncGraph.
+          learning_phase_passed_by_framework = False
+          if (self._expects_training_arg and
+              not base_layer_utils.training_arg_passed_to_call(
+                  tf_inspect.getfullargspec(self.call), args, kwargs) and
+              getattr(graph, 'name', None) == 'keras_graph'):
+            learning_phase_passed_by_framework = True
+            kwargs['training'] = backend.learning_phase()
           if not self.dynamic:
             try:
-              outputs = self.call(inputs, *args, **kwargs)
+              with base_layer_utils.autocast_context_manager(
+                  input_list,
+                  self._mixed_precision_policy.should_cast_variables), (
+                      base_layer_utils.AutoAddUpdates(self,
+                                                      inputs)) as auto_updater:
+                outputs = self.call(inputs, *args, **kwargs)
+                auto_updater.set_outputs(outputs)
+
             except TypeError as e:
-              messages = ['`tf.Tensor` as a Python `bool` is not allowed',
-                          'Tensor objects are only iterable when eager']
+              messages = ('`tf.Tensor` as a Python `bool` is not allowed',
+                          'Tensor objects are only iterable when eager')
+              exception_str = str(e)
               for msg in messages:
-                if msg in str(e):
+                if msg in exception_str:
                   raise TypeError('You are attempting to use Python control '
                                   'flow in a layer that was not declared to be '
                                   'dynamic. Pass `dynamic=True` to the class '
                                   'constructor.\nEncountered error:\n"""\n' +
-                                  str(e) + '\n"""')
-              raise e
+                                  exception_str + '\n"""')
+              raise
           else:
             # We will use static shape inference to return symbolic tensors
             # matching the specifications of the layer outputs.
@@ -586,11 +631,13 @@ class Layer(checkpointable.Checkpointable):
             raise ValueError('A layer\'s `call` method should return a '
                              'Tensor or a list of Tensors, not None '
                              '(layer: ' + self.name + ').')
-          self._handle_activity_regularization(inputs, outputs)
-          self._set_mask_metadata(inputs, outputs, previous_mask)
           if base_layer_utils.have_all_keras_metadata(inputs):
+            if learning_phase_passed_by_framework:
+              kwargs.pop('training')
             inputs, outputs = self._set_connectivity_metadata_(
                 inputs, outputs, args, kwargs)
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
           if hasattr(self, '_set_inputs') and not self.inputs:
             # Subclassed network: explicitly set metadata normally set by
             # a call to self._set_inputs().
@@ -600,9 +647,13 @@ class Layer(checkpointable.Checkpointable):
             self._set_inputs(inputs, outputs)
       else:
         # Eager execution on data tensors.
-        outputs = self.call(inputs, *args, **kwargs)
-        self._handle_activity_regularization(inputs, outputs)
-        return outputs
+        with ops.name_scope(self._name_scope()):
+          self._maybe_build(inputs)
+          with base_layer_utils.autocast_context_manager(
+              input_list, self._mixed_precision_policy.should_cast_variables):
+            outputs = self.call(inputs, *args, **kwargs)
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
 
     if not context.executing_eagerly():
       # Optionally load weight values specified at layer instantiation.
@@ -663,9 +714,7 @@ class Layer(checkpointable.Checkpointable):
 
   @property
   def updates(self):
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates + self._gather_children_attribute('updates')
+    return self._get_unfiltered_updates(check_trainable=True)
 
   @property
   def losses(self):
@@ -679,7 +728,12 @@ class Layer(checkpointable.Checkpointable):
       A list of tensors.
     """
     collected_losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       collected_losses.extend(self._eager_losses)
     else:
       collected_losses.extend(self._losses)
@@ -710,6 +764,7 @@ class Layer(checkpointable.Checkpointable):
     Arguments:
       losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
         may also be zero-argument callables which create a loss tensor.
+        Other types of input are ignored.
       inputs: Ignored when executing eagerly. If anything other than None is
         passed, it signals the losses are conditional on some of the layer's
         inputs, and thus they should only be run where these inputs are
@@ -735,10 +790,24 @@ class Layer(checkpointable.Checkpointable):
         self._callable_losses.append(
             functools.partial(_tag_unconditional, loss))
       else:
-        if context.executing_eagerly():
-          self._eager_losses.append(_tag_unconditional(loss))
-        else:
+        if not tensor_util.is_tensor(loss):
+          # Ignoring constant values as this does not affect the gradients.
+          return
+        if tf_utils.is_symbolic_tensor(loss):
+          if not base_layer_utils.is_in_call_context():
+            self._contains_symbolic_tensors = True
           self._losses.append(_tag_unconditional(loss))
+        else:
+          self._eager_losses.append(_tag_unconditional(loss))
+
+  @trackable.no_automatic_dependency_tracking
+  def _clear_losses(self):
+    """Used every step in eager to reset losses."""
+    self._eager_losses = []
+    if hasattr(self, '_layers'):
+      for layer in trackable_layer_utils.filter_empty_layer_containers(
+          self._layers):
+        layer._clear_losses()
 
   @doc_controls.for_subclass_implementers
   def add_metric(self, value, aggregation=None, name=None):
@@ -748,10 +817,11 @@ class Layer(checkpointable.Checkpointable):
       value: Metric tensor.
       aggregation: Sample-wise metric reduction function. If `aggregation=None`,
         it indicates that the metric tensor provided has been aggregated
-        already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
-        y_pred))`. If aggregation='mean', the given metric tensor will be
-        sample-wise reduced using `mean` function. eg, `model.add_metric(
-        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+        already. eg, `bin_acc = BinaryAccuracy(name='acc')` followed by
+        `model.add_metric(bin_acc(y_true, y_pred))`. If aggregation='mean', the
+        given metric tensor will be sample-wise reduced using `mean` function.
+        eg, `model.add_metric(tf.reduce_sum(outputs), name='output_mean',
+        aggregation='mean')`.
       name: String metric name.
 
     Raises:
@@ -762,8 +832,26 @@ class Layer(checkpointable.Checkpointable):
           'We currently support only `mean` sample-wise metric aggregation. '
           'You provided aggregation=`%s`' % aggregation)
 
-    if tf_utils.is_symbolic_tensor(value):
-      self._symbolic_add_metric(value, aggregation, name)
+    is_symbolic = tf_utils.is_symbolic_tensor(value)
+    if name is None and (not is_symbolic or not hasattr(value, '_metric_obj')):
+      # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')`
+      # In eager mode, we use metric name to lookup a metric. Without a name,
+      # a new Mean metric wrapper will be created on every model/layer call.
+      # So, we raise an error when no name is provided.
+      # We will do the same for symbolic mode for consistency although a name
+      # will be generated if no name is provided.
+
+      # We will not raise this error in the foll use case for the sake of
+      # consistency as name in provided in the metric constructor.
+      # mean = metrics.Mean(name='my_metric')
+      # model.add_metric(mean(outputs))
+      raise ValueError('Please provide a name for your metric like '
+                       '`self.add_metric(tf.reduce_sum(inputs), '
+                       'name=\'mean_activation\', aggregation=\'mean\')`')
+
+    if is_symbolic:
+      with backend.get_graph().as_default():
+        self._symbolic_add_metric(value, aggregation, name)
     else:
       self._eager_add_metric(value, aggregation, name)
 
@@ -877,16 +965,15 @@ class Layer(checkpointable.Checkpointable):
 
     if inputs is None:
       # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+      return [
+          x for x in self._get_unfiltered_updates() if x._unconditional_update  # pylint: disable=protected-access
+      ]
 
     # Requesting input-conditional updates.
     inputs = nest.flatten(inputs)
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
+    reachable = tf_utils.get_reachable_from_inputs(
+        inputs, self._get_unfiltered_updates())
+    return [u for u in self._get_unfiltered_updates() if u in reachable]  # pylint: disable=protected-access
 
   def get_losses_for(self, inputs):
     """Retrieves losses relevant to a specific set of inputs.
@@ -1255,6 +1342,24 @@ class Layer(checkpointable.Checkpointable):
   # Methods & attributes below are all private and only used by the framework. #
   ##############################################################################
 
+  def _set_dtype_and_policy(self, dtype):
+    """Sets self._dtype and self._mixed_precision_policy."""
+    if dtype:
+      if isinstance(dtype, policy.Policy):
+        self._mixed_precision_policy = dtype
+        self._dtype = self._mixed_precision_policy.default_variable_dtype
+      else:
+        # If a non-policy dtype is passed, no casting should be done. So we use
+        # the "infer" policy, which does no casting.
+        self._mixed_precision_policy = policy.Policy('infer')
+        self._dtype = dtypes.as_dtype(dtype).name
+    else:
+      self._mixed_precision_policy = policy.global_policy()
+      # If the global policy has not been set, it will be an "infer" policy
+      # without a default variable dtype, and so self._dtype will be None. In
+      # that case, self._dtype will be set when the layer is built or called.
+      self._dtype = self._mixed_precision_policy.default_variable_dtype
+
   def _name_scope(self):
     return self.name
 
@@ -1285,13 +1390,16 @@ class Layer(checkpointable.Checkpointable):
       match(value)  # Update the metric state.
       return
     else:
-      if aggregation is None:
-        raise ValueError('We do not support adding an aggregated metric tensor '
-                         'in `call` in eager execution.')
+      # Aggregation will always be set in this use case. If not we will raise
+      # error on model/layer call in graph function mode when model/layer is
+      # created.
+      assert aggregation is not None
       metric_obj, _ = base_layer_utils.create_mean_metric(value, name)
       self._metrics.append(metric_obj)
 
   def _symbolic_add_metric(self, value, aggregation=None, name=None):
+    if not base_layer_utils.is_in_call_context():
+      self._contains_symbolic_tensors = True
     if aggregation is None:
       # Iterate over the metrics and check if the given metric exists already.
       # This can happen when a metric instance is created in subclassed model
@@ -1306,10 +1414,20 @@ class Layer(checkpointable.Checkpointable):
         else:
           raise ValueError(
               'We currently do not support reusing a metric instance.')
-      else:
+      elif hasattr(value, '_metric_obj'):
         # We track the instance using the metadata on the result tensor.
         result_tensor = value
         metric_obj = result_tensor._metric_obj
+      else:
+        raise ValueError(
+            'We do not support adding an aggregated metric result tensor that '
+            'is not the output of a `tf.keras.metrics.Metric` metric instance. '
+            'Without having access to the metric instance we cannot reset the '
+            'state of a metric after every epoch during training. You can '
+            'create a `tf.keras.metrics.Metric` instance and pass the result '
+            'here or pass an un-aggregated result with `aggregation` parameter '
+            'set as `mean`. For example: `self.add_metric(tf.reduce_sum(inputs)'
+            ', name=\'mean_activation\', aggregation=\'mean\')`')
     else:
       # If a non-aggregated tensor is given as input (ie. `aggregation` is
       # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
@@ -1349,27 +1467,35 @@ class Layer(checkpointable.Checkpointable):
           self.add_loss(mean_activity_loss, inputs=inputs)
 
   def _set_mask_metadata(self, inputs, outputs, previous_mask):
-    # In some cases the mask of the outputs has already been computed by
-    # inner layers and does not need to be recomputed by this layer.
-    mask_already_computed = all(
-        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
-    if hasattr(self, 'compute_mask') and not mask_already_computed:
-      output_mask = self.compute_mask(inputs, previous_mask)
-    else:
-      output_mask = None
-    if isinstance(outputs, (list, tuple)):
-      if output_mask is None:
-        output_mask = [None for _ in range(len(outputs))]
-      for x, m in zip(outputs, output_mask):
+    flat_outputs = nest.flatten(outputs)
+    mask_already_computed = (
+        getattr(self, '_compute_output_and_mask_jointly', False) or
+        all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs))
+
+    if not mask_already_computed:
+      if hasattr(self, 'compute_mask'):
+        output_masks = self.compute_mask(inputs, previous_mask)
+        # `compute_mask` can return a single `None` even when a Layer
+        # has multiple outputs.
+        if output_masks is None:
+          flat_masks = [None for _ in flat_outputs]
+        else:
+          flat_masks = nest.flatten(output_masks)
+      else:
+        flat_masks = [None for _ in flat_outputs]
+
+      for output, mask in zip(flat_outputs, flat_masks):
         try:
-          x._keras_mask = m  # pylint: disable=protected-access
+          output._keras_mask = mask
         except AttributeError:
-          pass  # C type such as dict. Masking not supported in this case.
-    else:
-      try:
-        outputs._keras_mask = output_mask  # pylint: disable=protected-access
-      except AttributeError:
-        pass  # C type such as dict. Masking not supported in this case.
+          # C Type such as np.ndarray.
+          pass
+
+    if tf_utils.are_all_symbolic_tensors(flat_outputs):
+      for output in flat_outputs:
+        if getattr(output, '_keras_mask', None) is not None:
+          # Do not track masks for `TensorFlowOpLayer` construction.
+          output._keras_mask._keras_history_checked = True
 
   def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
     call_convention = getattr(
@@ -1564,6 +1690,9 @@ class Layer(checkpointable.Checkpointable):
 
   def _maybe_build(self, inputs):
     # Check input assumptions set before layer building, e.g. input rank.
+    if self.built:
+      return
+
     input_spec.assert_input_compatibility(
         self.input_spec, inputs, self.name)
     input_list = nest.flatten(inputs)
@@ -1578,26 +1707,95 @@ class Layer(checkpointable.Checkpointable):
     # Only call `build` if the user has manually overridden the build method.
     if not hasattr(self.build, '_is_default'):
       self.build(input_shapes)
+    # We must set self.built since user defined build functions are not
+    # constrained to set self.built.
+    self.built = True
 
   def _symbolic_call(self, inputs):
     input_shapes = nest.map_structure(lambda x: x.shape, inputs)
     output_shapes = self.compute_output_shape(input_shapes)
-    return nest.map_structure(
-        lambda shape: backend.placeholder(shape, dtype=self.dtype),
-        output_shapes)
+
+    def _make_placeholder_like(shape):
+      ph = backend.placeholder(shape=shape, dtype=self.dtype)
+      ph._keras_mask = None
+      return ph
+
+    return nest.map_structure(_make_placeholder_like, output_shapes)
+
+  @property
+  def _obj_reference_counts(self):
+    """A dictionary counting the number of attributes referencing an object."""
+    if not hasattr(self, '_obj_reference_counts_dict'):
+      super(Layer, self).__setattr__(
+          '_obj_reference_counts_dict',
+          object_identity.ObjectIdentityDictionary())
+    return self._obj_reference_counts_dict
+
+  def __delattr__(self, name):
+    existing_value = getattr(self, name, None)
+
+    # If this value is replacing an existing object assigned to an attribute, we
+    # should clean it out to avoid leaking memory. First we check if there are
+    # other attributes referencing it.
+    reference_counts = self._obj_reference_counts
+    if existing_value not in reference_counts:
+      super(Layer, self).__delattr__(name)
+      return
+
+    reference_count = reference_counts[existing_value]
+    if reference_count > 1:
+      # There are other remaining references. We can't remove this object from
+      # _layers etc.
+      reference_counts[existing_value] = reference_count - 1
+      super(Layer, self).__delattr__(name)
+      return
+    else:
+      # This is the last remaining reference.
+      del reference_counts[existing_value]
+
+    super(Layer, self).__delattr__(name)
+
+    if (isinstance(existing_value, Layer)
+        or trackable_layer_utils.has_weights(existing_value)):
+      super(Layer, self).__setattr__(
+          '_layers',
+          [l for l in self._layers if l is not existing_value])
+    if isinstance(existing_value, tf_variables.Variable):
+      super(Layer, self).__setattr__(
+          '_trainable_weights',
+          [w for w in self._trainable_weights if w is not existing_value])
+      super(Layer, self).__setattr__(
+          '_non_trainable_weights',
+          [w for w in self._non_trainable_weights if w is not existing_value])
 
   def __setattr__(self, name, value):
     if (not getattr(self, '_setattr_tracking', True) or
-        getattr(self, '_is_graph_network', False)):
+        getattr(self, '_is_graph_network', False) or
+        # Exclude @property.setters from tracking
+        hasattr(self.__class__, name)):
       super(Layer, self).__setattr__(name, value)
       return
 
+    # Keep track of trackable objects, for the needs of `Network.save_weights`.
+    value = data_structures.sticky_attribute_assignment(
+        trackable=self, value=value, name=name)
+
+    reference_counts = self._obj_reference_counts
+    reference_counts[value] = reference_counts.get(value, 0) + 1
+
+    # Clean out the old attribute, which clears _layers and _trainable_weights
+    # if necessary.
+    try:
+      self.__delattr__(name)
+    except AttributeError:
+      pass
+
     # Append value to self._layers if relevant
     if (isinstance(value, Layer) or
-        checkpointable_layer_utils.has_weights(value)):
+        trackable_layer_utils.has_weights(value)):
       # Initialize `_layers` here in case `__init__` has not yet been called.
       if not hasattr(self, '_layers'):
-        self._layers = []
+        super(Layer, self).__setattr__('_layers', [])
       # We need to check object identity to avoid de-duplicating empty
       # container types which compare equal.
       if not any((layer is value for layer in self._layers)):
@@ -1608,34 +1806,52 @@ class Layer(checkpointable.Checkpointable):
           value._use_resource_variables = True
 
     # Append value to list of trainable / non-trainable weights if relevant
-    if isinstance(value, tf_variables.Variable):
-      # Users may add extra weights/variables
-      # simply by assigning them to attributes (invalid for graph networks)
-      if not hasattr(self, '_trainable_weights'):
-        self._trainable_weights = []
-      if not hasattr(self, '_non_trainable_weights'):
-        self._non_trainable_weights = []
-      if value not in self._trainable_weights + self._non_trainable_weights:
-        if value.trainable:
-          self._trainable_weights.append(value)
-        else:
-          self._non_trainable_weights.append(value)
+    # TODO(b/125122625): This won't pick up on any variables added to a
+    # list/dict after creation.
+    for val in nest.flatten(value):
+      # TODO(b/126450014): Remove `_UnreadVariable` check here when assign ops
+      # no longer return True for isinstance Variable checks.
+      if (isinstance(val, tf_variables.Variable) and
+          not isinstance(val, resource_variable_ops._UnreadVariable)):  # pylint: disable=protected-access
+        # Users may add extra weights/variables
+        # simply by assigning them to attributes (invalid for graph networks)
+        if not hasattr(self, '_trainable_weights'):
+          super(Layer, self).__setattr__('_trainable_weights', [])
+        if not hasattr(self, '_non_trainable_weights'):
+          super(Layer, self).__setattr__('_non_trainable_weights', [])
+        if val not in self._trainable_weights + self._non_trainable_weights:
+          if val.trainable:
+            self._trainable_weights.append(val)
+          else:
+            self._non_trainable_weights.append(val)
+          backend.track_variable(val)
+
     super(Layer, self).__setattr__(name, value)
 
   def _gather_children_attribute(self, attribute):
-    assert attribute in {'weights', 'trainable_weights',
-                         'non_trainable_weights', 'updates', 'losses'}
+    assert attribute in {
+        'weights', 'trainable_weights', 'non_trainable_weights', 'updates',
+        'losses'
+    }
     if hasattr(self, '_layers'):
-      return list(itertools.chain.from_iterable(
-          getattr(layer, attribute) for layer in self._layers))
+      nested_layers = trackable_layer_utils.filter_empty_layer_containers(
+          self._layers)
+      return list(
+          itertools.chain.from_iterable(
+              getattr(layer, attribute) for layer in nested_layers))
     return []
 
   # This is a hack so that the is_layer (within
-  # training/checkpointable/layer_utils.py) check doesn't get the weights attr.
+  # training/trackable/layer_utils.py) check doesn't get the weights attr.
   # TODO(b/110718070): Remove when fixed.
   def _is_layer(self):
     return True
 
+  def _get_unfiltered_updates(self, check_trainable=True):
+    if check_trainable and not self.trainable and not self.stateful:
+      return []
+    return self._updates + self._gather_children_attribute('updates')
+
 
 class Node(object):
   """A `Node` describes the connectivity between two layers.
@@ -1797,6 +2013,9 @@ class TensorFlowOpLayer(Layer):
         name=name, trainable=trainable, dtype=dtype)
     self.node_def = node_def_pb2.NodeDef.FromString(node_def)
     self.constants = constants or {}
+    # Layer uses original op unless it is called on new inputs.
+    # This means `built` is not set in `__call__`.
+    self.built = True
 
   def call(self, inputs):
     if context.executing_eagerly():
@@ -1812,6 +2031,10 @@ class TensorFlowOpLayer(Layer):
         inputs.insert(index, constant)
 
       self.node_def.name = graph.unique_name(self.node_def.name)
+      # Check for case where first input should be a list of Tensors.
+      if 'N' in self.node_def.attr:
+        num_tensors = self.node_def.attr['N'].i
+        inputs = [inputs[:num_tensors]] + inputs[num_tensors:]
       c_op = ops._create_c_op(graph, self.node_def, inputs, control_inputs=[])
       op = graph._create_op_from_tf_operation(c_op)
 
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index ebee4a3043e57d149bb8d81812e7568aff8f8eb8..6180044b7093bbfdb737d55352234fcf8e9470b1 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import itertools as it
+import sys
+import traceback
 from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -71,7 +79,7 @@ class InvalidLayer(base_layer.Layer):
     raise ValueError('You did something wrong!')
 
 
-class BaseLayerTest(test.TestCase, parameterized.TestCase):
+class BaseLayerTest(keras_parameterized.TestCase):
 
   @parameterized.parameters(DynamicLayer1, DynamicLayer2)
   def test_dynamic_layer_in_functional_model_in_graph_mode(self, layer_class):
@@ -210,6 +218,156 @@ class BaseLayerTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
       _ = InvalidLayer()(inputs)
 
+  @keras_parameterized.run_with_all_model_types
+  @test_util.run_in_graph_and_eager_modes
+  def test_build_with_numpy_data(self):
+    model_layers = [
+        keras.layers.Dense(3, activation='relu', kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(model_layers, input_shape=(4,))
+    model(np.zeros((2, 4), dtype='float32'))
+    self.assertTrue(model.built)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_default_add_weight(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(TestLayer, self).__init__()
+        self.default_weight = self.add_weight()
+        self.weight_without_name = self.add_weight(shape=(3, 4))
+        self.regularized_weight_without_name = self.add_weight(
+            shape=(3, 4), regularizer='l2')
+
+    layer = TestLayer()
+    self.assertEqual(layer.default_weight.shape.as_list(), [])
+    self.assertEqual(layer.weight_without_name.shape.as_list(), [3, 4])
+    self.assertEqual(layer.default_weight.dtype.name, 'float32')
+    self.assertEqual(layer.weight_without_name.dtype.name, 'float32')
+    self.assertEqual(len(layer.losses), 1)
+    if not context.executing_eagerly():
+      # Cannot access tensor.name in eager execution.
+      self.assertTrue('Variable_2/Regularizer' in layer.losses[0].name)
+
+  def test_learning_phase_freezing_for_layers(self):
+    # This test is only meant to run in graph functions mode (ambient eager).
+    # In forced eager, `model.predict` ignores the global learning phase
+    # and just uses training=False. TODO(fchollet): consider unifying the
+    # behaviors.
+
+    class LearningPhaseLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return keras.backend.in_train_phase(
+            lambda: array_ops.ones_like(inputs),
+            lambda: array_ops.zeros_like(inputs))
+
+    def get_learning_phase_value():
+      model = keras.models.Sequential([LearningPhaseLayer(input_shape=(1,))])
+      return np.sum(model.predict(np.ones((1, 1))))
+
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test scope.
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(get_learning_phase_value(), 1)
+
+    # The effects of the scope end after exiting it.
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test setting.
+    keras.backend.set_learning_phase(1)
+    self.assertEqual(get_learning_phase_value(), 1)
+    keras.backend.set_learning_phase(0)
+    self.assertEqual(get_learning_phase_value(), 0)
+
+  # Cannot be enabled with `run_eagerly=True`, see b/123904578
+  @test_util.run_all_in_graph_and_eager_modes
+  def test_layer_can_return_variable(self):
+
+    class ComputeSum(keras.layers.Layer):
+
+      def __init__(self):
+        super(ComputeSum, self).__init__()
+        self.total = variables.Variable(
+            initial_value=array_ops.zeros((1, 1)), trainable=False)
+        if not context.executing_eagerly():
+          keras.backend.get_session().run(self.total.initializer)
+
+      def call(self, inputs):
+        self.total.assign_add(inputs)
+        return self.total
+
+    inputs = keras.Input(shape=(1,))
+    model = keras.Model(inputs, ComputeSum()(inputs))
+    model.predict(np.ones((1, 1)))
+
+  def _get_layer_with_training_arg(self):
+
+    class TrainingLayer(keras.layers.Layer):
+      """A layer with a `training` argument in a defuned `call`."""
+
+      @def_function.function
+      def call(self, inputs, training=None):
+        if training is None:
+          training = keras.backend.learning_phase()
+        return tf_utils.smart_cond(training,
+                                   lambda: array_ops.ones_like(inputs),
+                                   lambda: array_ops.zeros_like(inputs))
+
+    return TrainingLayer()
+
+  @keras_parameterized.run_with_all_model_types
+  # b/124459427: can't test with `run_eagerly=True` for now.
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_arg_in_defun(self):
+    layer = self._get_layer_with_training_arg()
+    model = testing_utils.get_model_from_layers([layer], input_shape=(1,))
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 1.)
+    loss = model.evaluate(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(loss, 0.)
+
+    # Test that the argument injection performed in `call` is not active
+    # when the argument is passed explicitly.
+    layer = self._get_layer_with_training_arg()
+    inputs = keras.Input(shape=(1,))
+    # Pass `training` by name
+    outputs = layer(inputs, training=False)
+    model = keras.Model(inputs, outputs)
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 0.)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_raw_variable_assignment(self):
+
+    class RawVariableLayer(keras.layers.Layer):
+
+      def __init__(self, **kwargs):
+        super(RawVariableLayer, self).__init__(**kwargs)
+        # Test variables in nested structure.
+        self.var_list = [variables.Variable(1.), {'a': variables.Variable(2.)}]
+
+      def call(self, inputs):
+        return inputs * self.var_list[0] * self.var_list[1]['a']
+
+    model = testing_utils.get_model_from_layers([RawVariableLayer()],
+                                                input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10)), np.ones((10, 10))
+    # Checks that variables get initialized.
+    model.fit(x, y, batch_size=2, epochs=2)
+
+
+class SymbolicSupportTest(test.TestCase):
+
   def test_using_symbolic_tensors_with_tf_ops(self):
     # Single-input.
     x = keras.Input((3,))
@@ -239,18 +397,14 @@ class BaseLayerTest(test.TestCase, parameterized.TestCase):
       x1 = array_ops.ones((3, 3))
     x2 = array_ops.ones((3, 3))
     self.assertIsInstance(x2, ops.EagerTensor)
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
   def test_mixing_numpy_arrays_and_graph_tensors(self):
     with ops.Graph().as_default():
       x1 = array_ops.ones((3, 3))
     x2 = np.ones((3, 3), dtype='float32')
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
   @test_util.run_in_graph_and_eager_modes
@@ -279,6 +433,31 @@ class BaseLayerTest(test.TestCase, parameterized.TestCase):
                         np.matmul(x_val, y_val),
                         atol=1e-5)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_reraising_exception(self):
+    # When layer is not dynamic, we have some pattern matching during exception
+    # handling to detect when the user is trying to use python control flow.
+    # When an exception is thrown but the pattern doesn't match, we want to
+    # preserve the originating stack trace. An early implementation of this
+    # logic lost the stack trace. We test the correct behavior here.
+
+    class TypeErrorLayer(base_layer.Layer):
+
+      def call(self, inputs):
+        def easily_identifiable_name():
+          raise TypeError('Non-matching TypeError message.')
+        easily_identifiable_name()
+
+    inputs = keras.Input((3,))
+
+    try:
+      _ = TypeErrorLayer()(inputs)
+    except TypeError:
+      tb = traceback.extract_tb(sys.exc_info()[2])
+      last_entry = tb[-1]
+      function_name = last_entry[2]
+      self.assertEqual(function_name, 'easily_identifiable_name')
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class NestedTrackingTest(test.TestCase):
@@ -322,6 +501,9 @@ class NestedTrackingTest(test.TestCase):
     self.assertEqual(len(layer.weights), 8)
     self.assertEqual(len(layer.trainable_weights), 0)
     self.assertEqual(len(layer.non_trainable_weights), 8)
+    self.assertEqual(
+        set([layer.dense1, layer.dense2, layer.v1, layer.v2]),
+        set([obj for unused_name, obj in layer._checkpoint_dependencies]))
 
   def test_nested_layer_updates_losses_tracking(self):
     # Test that updates and losses from nested sublayers are
@@ -365,6 +547,130 @@ class NestedTrackingTest(test.TestCase):
       self.assertEqual(len(layer.losses), 3)
       self.assertEqual(len(layer.updates), 3)
 
+  def test_attribute_reassignment(self):
+    l = keras.layers.Layer()
+    l.a = keras.layers.Layer()
+    l.a = []
+    l.a = variables.Variable(1.)
+    l.a = keras.layers.Layer()
+    last_assignment = keras.layers.Layer()
+    l.a = last_assignment
+    l.b = variables.Variable(1.)
+    del l.b
+    l.c = keras.layers.Layer()
+    del l.c
+    l.d = last_assignment
+    del l.d
+    self.assertEqual([last_assignment], l._layers)
+    self.assertEqual([], l.trainable_weights)
+    self.assertEqual([], l.non_trainable_weights)
+    self.assertEqual([], l.weights)
+    del l.a
+    self.assertEqual([], l._layers)
+
+  def test_assign_op_not_tracked_as_variable(self):
+
+    class LayerWithAssignAttr(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.v = variables.Variable(1.)
+        self.v_assign = self.v.assign_add(2.)
+
+    layer = LayerWithAssignAttr()
+    layer.build((10, 10))
+
+    self.assertEqual([layer.v], layer.variables)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class NameScopingTest(keras_parameterized.TestCase):
+
+  def test_name_scope_layer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(10, name='MyName')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName/kernel:0')
+
+  def test_name_scope_sublayer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName2')
+    y = layer(x)
+    self.assertEqual(layer.bias.name, 'MyName2/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName2/kernel:0')
+    self.assertEqual(y.name, 'MyName2/MyAct/Relu:0')
+
+  def test_name_scope_tf_tensor(self):
+    x = ops.convert_to_tensor(np.ones((10, 10)))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName3')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName3/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName3/kernel:0')
+
+
+_LAYERS_TO_TEST = [
+    (keras.layers.Dense, (1,), collections.OrderedDict(units=[1])),
+    (keras.layers.Activation, (2, 2),
+     collections.OrderedDict(activation=['relu'])),
+    (keras.layers.Dropout, (16,), collections.OrderedDict(rate=[0.25])),
+    (keras.layers.BatchNormalization, (8, 8, 3), collections.OrderedDict(
+        axis=[3], center=[True, False], scale=[True, False])),
+    (keras.layers.Conv1D, (8, 8), collections.OrderedDict(
+        filters=[1], kernel_size=[1, 3], strides=[1, 2],
+        padding=['valid', 'same'], use_bias=[True, False],
+        kernel_regularizer=[None, 'l2'])),
+    (keras.layers.Conv2D, (8, 8, 3), collections.OrderedDict(
+        filters=[1], kernel_size=[1, 3], strides=[1, 2],
+        padding=['valid', 'same'], use_bias=[True, False],
+        kernel_regularizer=[None, 'l2'])),
+    (keras.layers.LSTM, (8, 8), collections.OrderedDict(
+        units=[1],
+        activation=[None, 'relu'],
+        kernel_regularizer=[None, 'l2'],
+        dropout=[0, 0.5],
+        stateful=[True, False],
+        unroll=[True, False])),
+]
+
+OUTPUT_TEST_CASES = []
+for layer_type, inp_shape, arg_dict in _LAYERS_TO_TEST:
+  arg_combinations = [[(k, i) for i in v] for k, v in arg_dict.items()]  # pylint: disable=g-complex-comprehension
+  for args in it.product(*arg_combinations):
+    name = '_{}_{}'.format(
+        layer_type.__name__, '_'.join('{}_{}'.format(k, v) for k, v in args))
+    OUTPUT_TEST_CASES.append(
+        (name, layer_type, inp_shape, {k: v for k, v in args}))
+
+
+class OutputTypeTest(keras_parameterized.TestCase):
+  """Test that layers and models produce the correct tensor types."""
+
+  # In v1 graph there are only symbolic tensors.
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  @parameterized.named_parameters(*OUTPUT_TEST_CASES)
+  def test_layer_outputs(self, layer_to_test, input_shape, layer_kwargs):
+    layer = layer_to_test(**layer_kwargs)
+
+    input_data = np.ones(shape=(2,) + input_shape, dtype=np.float32)
+    layer_result = layer(input_data)
+
+    inp = keras.layers.Input(shape=input_shape, batch_size=2)
+    model = keras.models.Model(inp, layer_to_test(**layer_kwargs)(inp))
+    model_result = model(input_data)
+
+    for x in [layer_result, model_result]:
+      if not isinstance(x, ops.Tensor):
+        raise ValueError('Tensor or EagerTensor expected, got type {}'
+                         .format(type(x)))
+
+      if isinstance(x, ops.EagerTensor) != context.executing_eagerly():
+        expected_type = (ops.EagerTensor if context.executing_eagerly()
+                         else ops.Tensor)
+        raise ValueError('Expected type {}, got type {}'
+                         .format(expected_type, type(x)))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index 0d0257a1dd5ea7274fe369614e0bad750a49ff87..4697f8d1f9e796d2ed958bfd9b0d208cbc3df927 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -18,16 +18,24 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as collections_lib
+import threading
 import enum
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import auto_control_deps
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
+
+_call_context = threading.local()
 
 
 class CallConvention(enum.Enum):
@@ -72,7 +80,7 @@ def make_variable(name,
   that has fewer constraints (`variable_scope.variable()`).
 
   In the longer term, it seems like a similar "default variable creator" method
-  should exist in `CheckpointableBase` instead. When this happens, we can get
+  should exist in `Trackable` instead. When this happens, we can get
   rid of this temporary solution.
 
   TODO(fchollet): remove this method when no longer needed.
@@ -119,9 +127,9 @@ def make_variable(name,
       variable_dtype = None
     else:
       # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      elif isinstance(initializer, type(init_ops_v2.Initializer)):
+      if isinstance(
+          initializer,
+          (type(init_ops.Initializer), type(init_ops_v2.Initializer))):
         initializer = initializer()
       init_val = lambda: initializer(shape, dtype=dtype)
       variable_dtype = dtype.base_dtype
@@ -205,22 +213,16 @@ def collect_previous_mask(input_tensors):
   """Retrieves the output mask(s) of the previous node.
 
   Arguments:
-      input_tensors: A tensor or list of tensors.
+      input_tensors: An arbitrary structure of Tensors.
 
   Returns:
       A mask tensor or list of mask tensors.
   """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
+
+  def _collect_previous_mask(x):
+    return getattr(x, '_keras_mask', None)
+
+  return nest.map_structure(_collect_previous_mask, input_tensors)
 
 
 def have_all_keras_metadata(tensors):
@@ -237,8 +239,7 @@ def create_keras_history(tensors):
   This method checks to see if a Tensor in `tensors` is missing Keras metadata
   and has its origin in a Keras `Input` Layer. If so, this method will replace
   the raw TensorFlow Operations that created this tensor with
-  `TensorFlowOpLayer`
-  instances that create identical operations.
+  `TensorFlowOpLayer` instances that create identical operations.
 
   Any Tensors not originating from a Keras `Input` Layer will be treated as
   constants when constructing `TensorFlowOpLayer` instances.
@@ -247,7 +248,6 @@ def create_keras_history(tensors):
     tensors: A structure of Tensors, some of which come from raw TensorFlow
       operations and need to have Keras metadata assigned to them.
   """
-
   _create_keras_history_helper(tensors, set())
 
 
@@ -256,8 +256,8 @@ def _create_keras_history_helper(tensors, processed_ops=None):
 
   Arguments:
     tensors: A structure of Tensors for which to create Keras metadata.
-    processed_ops: TensorFlow operations that have already been wrapped in
-      `TensorFlowOpLayer` instances.
+    processed_ops: Set. TensorFlow operations that have already been wrapped
+      in `TensorFlowOpLayer` instances.
 
   Returns:
     The updated set of TensorFlow Operations that have been wrapped
@@ -278,14 +278,14 @@ def _create_keras_history_helper(tensors, processed_ops=None):
       constants = {}
       layer_inputs = []
       for i, op_input in enumerate(op_inputs):
-        if uses_keras_input_layers(op_input):
+        if uses_keras_history(op_input):
           layer_inputs.append(op_input)
         else:
           # Treat any value not originating from a `keras.Input` as
           # a constant (Variables currently have `Placeholder` op type
           # when originating from an eager context
           # so can't be supported.
-          constants[i] = backend.function([], [op_input])([])
+          constants[i] = backend.function([], op_input)([])
       processed_ops = _create_keras_history_helper(layer_inputs, processed_ops)
       name = op.name
       node_def = op.node_def.SerializeToString()
@@ -297,33 +297,250 @@ def _create_keras_history_helper(tensors, processed_ops=None):
   return processed_ops
 
 
-def uses_keras_input_layers(tensors):
-  """Checks if at least one Tensor in `tensors` originates from a Keras `Input`.
+def needs_keras_history(tensors):
+  """Check if any Tensors need to be wrapped in TensorFlowOpLayers.
 
-  If so, the Functional API is being used.
+  This will never return True inside a sublayer, because sublayers
+  do not need to create Keras History. Otherwise, this returns True
+  if one or more of `tensors` originates from a `keras.Input` and
+  does not have `_keras_history` set.
 
   Arguments:
     tensors: An arbitrary nested structure of Tensors.
 
   Returns:
-    Bool, whether at least one Tensor originates from a Keras `Input`.
+    Bool, whether at least one Tensor needs to be wrapped.
   """
-  checked_tensors = set()
   input_tensors = nest.flatten(tensors)
+  if is_in_call_context() or all(
+      getattr(tensor, '_keras_history', None) is not None
+      for tensor in input_tensors):
+    # KerasHistory already set.
+    return False
+  return uses_keras_history(tensors)
+
+
+def is_in_call_context():
+  """Returns true if inside of a model/layer '__call__'."""
+  return getattr(_call_context, 'in_call', False)
+
+
+def uses_keras_history(tensors):
+  """Check if at least one Tensor originates from a `keras.Input`.
+
+  This is `True` if at least one Tensor has its origin in a `keras.Input`.
+  Any Tensor that originates from a `keras.Input` will have a dependency
+  Tensor with a `_keras_history` attribute attached. Tensors that have
+  already been checked to not originate from a `keras.Input`
+  are marked as `_keras_history_checked`.
+
+  Arguments:
+    tensors: An arbitrary nested structure of Tensors.
+
+  Returns:
+    Bool, whether at least one Tensor originates from a `keras.Input`.
+  """
+  checked_tensors = set()
+  tensors_to_check = nest.flatten(tensors)
+
+  while tensors_to_check:
+    new_tensors_to_check = set()
+    for tensor in tensors_to_check:
+      if getattr(tensor, '_keras_history_checked', None) is not None:
+        continue
+      if getattr(tensor, '_keras_history', None) is not None:
+        return True
 
-  while input_tensors:
-    if any(
-        getattr(tensor, '_keras_history', None) is not None
-        for tensor in input_tensors):
-      return True
-    checked_tensors.update(input_tensors)
-    new_input_tensors = set()
-    for tensor in input_tensors:
       try:
-        new_input_tensors.update(tensor.op.inputs)
+        new_tensors_to_check.update(tensor.op.inputs)
       except AttributeError:
-        # In case `tensor` is a Variable created in an Eager
-        # context
+        # In case `tensor` is a Variable created in an Eager context.
         pass
-    input_tensors = list(new_input_tensors - checked_tensors)
+
+    checked_tensors.update(tensors_to_check)
+    tensors_to_check = list(new_tensors_to_check - checked_tensors)
+
+  # Mark that these Tensors have been checked once for `_keras_history`,
+  # and should not be checked again for performance reasons.
+  mark_checked(tensors)
   return False
+
+
+def mark_checked(tensors):
+  """Marks that these Tensors should not be tracked.
+
+  This prevents Layers from attempting to create TensorFlowOpLayers
+  for these Tensors.
+
+  Arguments:
+    tensors: An arbitrary structure of Tensors.
+  """
+
+  def _mark_checked(tensor):
+    tensor._keras_history_checked = True  # pylint: disable=protected-access
+
+  nest.map_structure(_mark_checked, tensors)
+
+
+@tf_contextlib.contextmanager
+def call_context():
+  """Scope that marks when we are currently inside a Layer/Model's `call`."""
+  was_in_call = is_in_call_context()
+  _call_context.in_call = True
+  try:
+    yield
+  finally:
+    _call_context.in_call = was_in_call
+
+
+def training_arg_passed_to_call(argspec, args, kwargs):
+  """Returns whether a user passed the `training` argument in `__call__`."""
+  # `argspec.args` starts with ['self', 'inputs']
+  full_args = dict(zip(argspec.args[2:], args))
+  full_args.update(kwargs)
+  return 'training' in full_args
+
+
+class AutoAddUpdates(object):
+  """Automatically track stateful ops with `add_update`.
+
+  This context manager is used to automatically add stateful ops to a Layer
+  or Model's `.updates`. This ensures that stateful ops are run in the Keras
+  training loop. It also allows for these stateful ops to be disabled by
+  setting `trainable=False`.
+
+  Example:
+
+  ```
+  with AutoAddUpdates(layer, inputs) as auto_updates:
+    outputs = layer.call(inputs)
+    auto_updates.set_outputs(outputs)
+  ```
+
+  Attributes:
+    layer: Layer or Model instance to add the updates to.
+    inputs: The inputs to this Layer or Model, to be used for input-conditional
+      updates.
+    outputs: The outputs of this Layer or Model.
+  """
+
+  def __init__(self, layer, inputs):
+    self.layer = layer
+    self.inputs = inputs
+    self.outputs = []
+
+  def set_outputs(self, outputs):
+    if self.outputs:
+      raise RuntimeError('`set_outputs` should only be called once on an'
+                         '`AutoAddUpdates` instance.')
+    self.outputs = outputs
+
+  def __enter__(self):
+    # Only run in V2 Function mode.
+    if (context.executing_eagerly() or
+        not ops.executing_eagerly_outside_functions()):
+      return self
+
+    self._graph = ops.get_default_graph()
+    self._num_operations = len(self._graph.get_operations())
+    return self
+
+  def __exit__(self, error_type, unused_value, unused_traceback):
+    if error_type:
+      # Allow errors that occurred inside this context manager to pass through
+      # normally.
+      return
+
+    # Only run in V2 Function mode.
+    if (context.executing_eagerly() or
+        not ops.executing_eagerly_outside_functions()):
+      return
+
+    if (self._graph is not ops.get_default_graph() or
+        self._graph.name != 'keras_graph'):
+      # Only auto-track updates when the Keras Graph is the only one used.
+      return
+
+    new_operations = self._graph.get_operations()[self._num_operations:]
+    new_stateful_ops = set()
+
+    # pylint: disable=protected-access
+    for op in new_operations:
+      # While loop is not supported in general for automatic control
+      # dependencies.
+      if control_flow_util.IsInWhileLoop(op):
+        continue
+
+      # Track stateful ops via `add_update`.
+      is_stateful_op = (
+          op.type not in self._graph._registered_ops or
+          auto_control_deps.op_is_stateful(
+              self._graph._registered_ops[op.type]))
+
+      # Ignore ReadVariableOps as they are not needed to be run separately.
+      # This ensures existing Layers don't get extra updates.
+      if is_stateful_op and op.type != 'ReadVariableOp':
+        new_stateful_ops.add(op)
+
+    explicit_updates = set([
+        u for u in self.layer._get_unfiltered_updates(check_trainable=False)
+        if not isinstance(u, tuple)
+    ])
+    # pylint: enable=protected-access
+
+    # Don't add updates that will already be run by virtue of being consumed by
+    # other stateful ops or by the Layer's outputs. This ensures that existing
+    # Layers like `BatchNormalization` continue to return the same values for
+    # `.update` calls.
+    minimum_ops = set()
+    targets = new_stateful_ops.union(
+        set(nest.flatten(self.outputs)), explicit_updates)
+    for op in new_stateful_ops:
+      # Scrub any ops that are consumed by the outputs or other stateful ops.
+      reachable = tf_utils.get_reachable_from_inputs(op)
+      if not (targets - {op}).intersection(reachable):
+        minimum_ops.add(op)
+    new_stateful_ops = minimum_ops
+
+    # Don't double-track updates added via explicitly calling `add_update`.
+    # Also don't double-track updates already tracked in sublayers.
+    new_stateful_ops = new_stateful_ops - explicit_updates
+
+    # Decide whether to track as input-conditional or unconditional.
+    input_reachable_ops = tf_utils.get_reachable_from_inputs(
+        self.inputs, targets=new_stateful_ops)
+    unconditional_updates = new_stateful_ops - input_reachable_ops
+    conditional_updates = new_stateful_ops - unconditional_updates
+
+    if unconditional_updates:
+      self.layer.add_update(list(unconditional_updates))
+    if conditional_updates:
+      self.layer.add_update(list(conditional_updates), inputs=self.inputs)
+
+
+def _get_var_read_dtype(input_list, should_cast):
+  """Gets the dtype that AutoCastVariables should be read in."""
+  if should_cast and input_list and input_list[0].dtype.is_floating:
+    return input_list[0].dtype.base_dtype
+  else:
+    return None
+
+
+def autocast_context_manager(input_list, should_cast):
+  """Returns a context manager to autocast AutoCastVariables.
+
+  Under this context manager, if `should_cast` is True, AutoCastVariables will
+  be casted. If `should_cast` is False, AutoCastVariables will not be casted,
+  which can be used to disable autocasting if nested under another
+  call to `autocast_context_manager`.
+
+  Args:
+    input_list: The inputs to the layer with the AutoCastVariables.
+    should_cast: Whether AutoCastVariables should be casted.
+
+  Returns:
+    A context manager to automatically cast AutoCastVariables.
+  """
+  var_read_dtype = _get_var_read_dtype(input_list, should_cast)
+  return ops.get_default_graph()._enable_auto_casting_variables(  # pylint: disable=protected-access
+      var_read_dtype)
diff --git a/tensorflow/python/keras/engine/correctness_test.py b/tensorflow/python/keras/engine/correctness_test.py
index c2f3b040de3269c6921d95d8a845869511ac0634..68634235d1b5731d4359ef0796eaa28eeb9ca002 100644
--- a/tensorflow/python/keras/engine/correctness_test.py
+++ b/tensorflow/python/keras/engine/correctness_test.py
@@ -66,7 +66,10 @@ class SimpleBiasTest(keras_parameterized.TestCase):
 
   def _get_simple_bias_model(self):
     model = testing_utils.get_model_from_layers([Bias()], input_shape=(1,))
-    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   def test_simple_bias_fit(self):
@@ -101,7 +104,10 @@ class MultipleInputTest(keras_parameterized.TestCase):
       model = MultiInputSubclassed()
     else:
       model = multi_input_functional()
-    model.compile(keras.optimizer_v2.gradient_descent.SGD(0.1), 'mae')
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
     return model
 
   @parameterized.named_parameters(('subclassed', True), ('functional', False))
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index d13e0b6fa90e27050abdc3fcd8ee0b8b758a53d2..79836f80179fe53f2bea2cf702bc33c343a7e63d 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -20,11 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
-from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,13 +32,15 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 
 
 def set_weights(distribution_strategy, dist_model, weights):
@@ -67,7 +68,7 @@ def set_weights(distribution_strategy, dist_model, weights):
     weights = weights[num_param:]
 
   if not ops.executing_eagerly_outside_functions():
-    K.get_session().run(assign_ops)
+    K.get_session(assign_ops).run(assign_ops)
 
 
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
@@ -104,7 +105,7 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                                         grouped_inputs)
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
+    loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
                                         grouped_outputs[0])
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
@@ -196,14 +197,14 @@ def validate_callbacks(input_callbacks, optimizer):
       # features of the callback that involve accessing model attributes and
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
-        if callback.__getattribute__('histogram_freq'):
+        if getattr(callback, 'histogram_freq', False):
           logging.warning(
               UserWarning(
                   '`histogram_freq` in the TensorBoard callback is not '
                   'supported when using DistributionStrategy. Setting '
                   '`histogram_freq` to `0`.'))
           callback.histogram_freq = 0
-        if callback.__getattribute__('write_grads'):
+        if getattr(callback, 'write_grads', False):
           logging.warning(
               UserWarning(
                   '`write_grads` in the TensorBoard callback is not supported '
@@ -344,41 +345,13 @@ def init_restore_or_wait_for_variables():
   session = K._get_session()  # pylint: disable=protected-access
   worker_context = dc_context.get_current_worker_context()
   if not worker_context or worker_context.experimental_should_init:
-    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    # TODO(yuefengz): if checkpoints exist, restore from checkpoint.
     K._initialize_variables(session)  # pylint: disable=protected-access
   else:
     _wait_for_variable_initialization(session)
 
 
-def configure_and_create_session(distribution_strategy):
-  """Configure session config and create a session with it."""
-  # TODO(priyag): Throw error if a session already exists.
-  session_config = K.get_default_session_config()
-
-  if is_tpu_strategy(distribution_strategy):
-    # TODO(priyag, yuefengz): Remove this workaround when Distribute
-    # Coordinator is integrated with keras and we can create a session from
-    # there.
-    distribution_strategy.configure(session_config)
-    master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
-    session = session_module.Session(config=session_config, target=master)
-  else:
-    worker_context = dc_context.get_current_worker_context()
-    if worker_context:
-      dc_session_config = worker_context.session_config
-      # Merge the default session config to the one from distribute coordinator,
-      # which is fine for now since they don't have conflicting configurations.
-      dc_session_config.MergeFrom(session_config)
-      session = session_module.Session(
-          config=dc_session_config, target=worker_context.master_target)
-    else:
-      distribution_strategy.configure(session_config)
-      session = session_module.Session(config=session_config)
-
-  K.set_session(session)
-
-
-def validate_inputs(x, y, distribution_strategy):
+def validate_inputs(x, y, distribution_strategy, allow_partial_batch=False):
   """Validate inputs when using DistributionStrategy.
 
   Args:
@@ -386,16 +359,13 @@ def validate_inputs(x, y, distribution_strategy):
     y: Model Targets.
     distribution_strategy: The DistributionStrategy with which the model is
       compiled.
+    allow_partial_batch: Boolean. If false, datasets must have fully
+      defined shapes.
 
   Raises:
     ValueError: if input is not a Dataset or a numpy array(when we use
       MirroredStrategy).
   """
-  if isinstance(x, dict) or isinstance(y, dict):
-    raise ValueError('`DistributionStrategy` does not support inputs of type '
-                     'dict. You must pass a `tf.data.Dataset` object or a '
-                     'numpy array as input.')
-
   if (isinstance(x, iterator_ops.Iterator) or
       isinstance(y, iterator_ops.Iterator)):
     raise ValueError('`DistributionStrategy` does not support inputs of type '
@@ -404,18 +374,13 @@ def validate_inputs(x, y, distribution_strategy):
 
   if is_tpu_strategy(distribution_strategy):
     for i in [x, y]:
-      if isinstance(i, dataset_ops.DatasetV2):
-        shapes = nest.flatten(i.output_shapes)
-        try:
-          s = next(s for s in shapes if not s.is_fully_defined())
-        except StopIteration:
-          continue
-        else:
+      if (isinstance(i, dataset_ops.DatasetV2) and not allow_partial_batch):
+        if not is_dataset_shape_fully_defined(i):
           raise ValueError(
               'Using TPUs currently requires fully defined shapes. Either use '
               'set_shape() on the input tensors or use '
               'dataset.batch(..., drop_remainder=True).'
-              'Found unknown shape {} in input {}.'.format(s, i))
+              'Found unknown shape in input {}.'.format(i))
 
 
 # TODO(b/118776054): Currently we support global batch size for TPUStrategy and
@@ -431,8 +396,15 @@ def is_tpu_strategy(strategy):
   return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
 
 
+def is_dataset_shape_fully_defined(dataset):
+  """Returns whether a dataset contains a final partial batch."""
+  shapes = nest.flatten(dataset_ops.get_legacy_output_shapes(dataset))
+  unknown_shapes = [s for s in shapes if not s.is_fully_defined()]
+  return not unknown_shapes
+
+
 def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
-                     is_training=False):
+                     mode=None):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
@@ -441,8 +413,10 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       model input.
     steps:  The specified number of steps.
     batch_size: The specified batch_size.
-    is_training: Boolean to relax the constraints on consuming all the training
-      samples to keep compatibility till we support partial batches.
+    mode: ModeKey representing whether input will be used for training,
+      evaluation, or prediction. This is used to relax the constraints on
+      consuming all the training samples to keep compatibility till we
+      support partial batches. If none, then partial batches are not allowed.
 
   Returns:
     steps: The steps or steps_per_epoch argument depending on if a user is
@@ -460,6 +434,14 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
   use_per_replica_batch = not global_batch_size_supported(
       distribution_strategy)
 
+  # Partial batches are allowed for training as we repeat the
+  # dataset when converting numpy arrays into a dataset.
+  # For other modes uneven batch sizes are not allowed except
+  # for `predict()` on TPUStrategy.
+  allow_partial_batch = (mode == ModeKeys.TRAIN or
+                         (mode == ModeKeys.PREDICT
+                          and is_tpu_strategy(distribution_strategy)))
+
   if steps is None:
     if batch_size is None:
       # If neither the batch size or number of steps are set. We choose the
@@ -472,10 +454,13 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       global_batch_size = batch_size
       if use_per_replica_batch:
         global_batch_size *= distribution_strategy.num_replicas_in_sync
-    if not is_training and num_samples % global_batch_size:
-      raise ValueError('The number of samples %s is not divisible by '
-                       'batch size %s.' % (num_samples, global_batch_size))
-    steps = num_samples // global_batch_size
+    if allow_partial_batch:
+      steps = np.ceil(num_samples / global_batch_size).astype(int)
+    else:
+      if num_samples % global_batch_size:
+        raise ValueError('The number of samples %s is not divisible by '
+                         'batch size %s.' % (num_samples, global_batch_size))
+      steps = num_samples // global_batch_size
   else:
     if batch_size is None:
       # We calculate the batch size based on the number of steps specified
@@ -492,7 +477,11 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       if use_per_replica_batch:
         global_batch_size *= distribution_strategy.num_replicas_in_sync
 
-      if num_samples < (global_batch_size * steps):
+      min_num_samples = global_batch_size * steps
+      if allow_partial_batch:
+        min_num_samples = global_batch_size * (steps-1) + 1 if steps > 1 else 0
+
+      if num_samples < min_num_samples:
         raise ValueError('Number of samples %s is less than samples required '
                          'for specified batch_size %s and steps %s' % (
                              num_samples, global_batch_size, steps))
@@ -512,7 +501,7 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
 
 
 def get_batch_dimension(iterator):
-  shapes = nest.flatten(iterator.output_shapes)
+  shapes = nest.flatten(dataset_ops.get_legacy_output_shapes(iterator))
   # Take the batch size from the first element, as it should be the same for
   # all.
   dims = shapes[0].dims
@@ -537,7 +526,7 @@ def initialize_iterator(iterator, distribution_strategy):
   with distribution_strategy.scope():
     init_op = control_flow_ops.group(iterator.initialize())
     if not context.executing_eagerly():
-      K.get_session().run(init_op)
+      K.get_session((init_op,)).run(init_op)
 
 
 def _get_input_from_iterator(iterator, model):
@@ -578,6 +567,11 @@ def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
   inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
   inputs = flatten_perdevice_values(strategy, inputs)
   targets = flatten_perdevice_values(strategy, targets)
+  # Expand 1-dimensional inputs.
+  # TODO(b/124535720): Remove once this standarize data logic is shared with
+  # main flow.
+  inputs, targets = nest.map_structure(training_utils.standardize_single_array,
+                                       (inputs, targets))
   if mode == ModeKeys.PREDICT:
     sample_weights = []
     targets = []
@@ -601,14 +595,12 @@ def _custom_compile_for_predict(model):
     return
   model._is_compiled = True
   model.total_loss = None
-  model._fit_function = None
-  model._eval_function = None
   model.train_function = None
   model.test_function = None
   model.predict_function = None
 
 
-def _build_network_on_replica(model, inputs=None, targets=None, mode=None):
+def _build_network_on_replica(model, mode, inputs=None, targets=None):
   """Build an updated model on replicas.
 
   We create a new Keras model while sharing the variables from the old graph.
@@ -626,9 +618,9 @@ def _build_network_on_replica(model, inputs=None, targets=None, mode=None):
 
   Args:
     model: Model to be replicated across Replicas
+    mode: Which of fit/eval/predict is building the distributed network
     inputs: Input variables to be passed to the model
     targets: Target tensor to be passed to model.compile
-    mode: Which of fit/eval/predict is building the distributed network
 
   Returns:
     A new model with shared layers with the old model.
@@ -660,7 +652,7 @@ def _build_network_on_replica(model, inputs=None, targets=None, mode=None):
   if isinstance(targets, tuple):
     targets = nest.flatten(targets)
 
-  if mode == ModeKeys.PREDICT:
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
     _custom_compile_for_predict(updated_model)
   else:
     updated_model.compile(
@@ -675,24 +667,17 @@ def _build_network_on_replica(model, inputs=None, targets=None, mode=None):
   return updated_model
 
 
-def _build_distributed_network(model, strategy, inputs=None, targets=None,
-                               mode=None):
+def _build_distributed_network(model, strategy, mode, inputs=None,
+                               targets=None):
   """Create a cloned model on each replica."""
   with K.get_graph().as_default(), strategy.scope():
     distributed_model = strategy.extended.call_for_each_replica(
         _build_network_on_replica,
-        args=(model, inputs, targets, mode))
-    if mode is ModeKeys.TRAIN:
-      model._distributed_model_train = distributed_model
-    elif mode is ModeKeys.TEST:
-      model._distributed_model_test = distributed_model
-    elif mode is ModeKeys.PREDICT:
-      model._distributed_model_predict = distributed_model
-    else:
-      model._distributed_model = distributed_model
+        args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
 
 
-def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
+def _clone_and_build_model(model, mode, inputs=None, targets=None):
   """Clone and build the given keras_model."""
   # We need to set the import here since we run into a circular dependency
   # error.
@@ -719,7 +704,7 @@ def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
 
   if isinstance(targets, tuple):
     targets = nest.flatten(targets)
-  if mode == ModeKeys.PREDICT:
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
     _custom_compile_for_predict(cloned_model)
   else:
     cloned_model.compile(
@@ -734,54 +719,69 @@ def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
   return cloned_model
 
 
-def clone_model_on_replicas(model, strategy, make_callback_model=False,
-                            inputs=None, targets=None, mode=None):
+def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None):
   """Create a cloned model on each replica."""
   with K.get_graph().as_default(), strategy.scope():
     distributed_model = strategy.extended.call_for_each_replica(
-        _clone_and_build_model, args=(model, inputs, targets, mode))
-    if mode is ModeKeys.TRAIN:
-      model._distributed_model_train = distributed_model
-    elif mode is ModeKeys.TEST:
-      model._distributed_model_test = distributed_model
-    elif mode is ModeKeys.PREDICT:
-      model._distributed_model_predict = distributed_model
-    else:
-      model._distributed_model = distributed_model
-  if make_callback_model:
+        _clone_and_build_model, args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
+  if mode == ModeKeys.TRAIN:
     model._make_callback_model(distributed_model)
 
 
 def _make_execution_function(model, mode):
-  """Makes function to run one step of distributed model execution."""
+  """Makes or reuses function to run one step of distributed model execution."""
+  strategy = model._distribution_strategy
+
+  distributed_model = get_distributed_model(model, mode)
+  # If distributed model for a particular `mode` is already built, use the
+  # `_distribution_function` on that distributed model.
+  if distributed_model:
+    return distributed_model._distributed_function
+
+  # If distributed_model is not built, create one for `mode`.
+  if model._compile_distribution:
+    clone_model_on_replicas(model, strategy, mode)
+  else:
+    _build_distributed_network(model, strategy, mode)
+
+  # We've just created the distributed model. So `distributed_model` should be
+  # not None.
+  distributed_model = get_distributed_model(model, mode)
+  assert distributed_model
+
+  # Also create an execution fuction on that distributed model.
   if context.executing_eagerly():
-    return _make_eager_execution_function(model, mode)
+    distributed_function = _make_eager_execution_function(model, mode)
+  else:
+    distributed_function = _make_graph_execution_function(model, mode)
+
+  # We cache the distributed execution function on the model since creating
+  # distributed models and exection functions are expensive.
+  distributed_model._distributed_function = distributed_function
+  return distributed_function
 
-  strategy = model._distribution_strategy
-  if not model._distributed_model:
-    if model._compile_distribution:
-      clone_model_on_replicas(
-          model, strategy, make_callback_model=(mode == ModeKeys.TRAIN))
-    else:
-      _build_distributed_network(model, strategy)
+
+def _make_graph_execution_function(model, mode):
+  """Makes function to run one step of distributed model in graph mode."""
 
   def _per_device_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
 
+  strategy = model._distribution_strategy
   with strategy.scope():
     # Create train ops on each of the devices when we call
     # `_per_device_fit_function`.
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_device_function, args=(model._distributed_model,))
+         _per_device_function, args=(get_distributed_model(model, mode),))
 
-    if mode == ModeKeys.TRAIN:
-      # Initialize the variables in the replicated model. This is necessary for
-      # multi-worker training because on some workers, initialization is not
-      # needed. This method does initialization or waiting for initialization
-      # according to the context object of distribute coordinator.
-      init_restore_or_wait_for_variables()
+    # Initialize the variables in the replicated model. This is necessary for
+    # multi-worker training because on some workers, initialization is not
+    # needed. This method does initialization or waiting for initialization
+    # according to the context object of distribute coordinator.
+    init_restore_or_wait_for_variables()
 
     # Unwrap all the per device values returned from `call_for_each_replica`.
     # Unwrapping per device values gives you a list of values that can be
@@ -805,58 +805,62 @@ def _make_execution_function(model, mode):
 
 def _make_eager_execution_function(model, mode):
   """Makes function to run one step of distributed model eager execution."""
-  strategy = model._distribution_strategy
-  if not model._distributed_model:
-    if model._compile_distribution:
-      clone_model_on_replicas(
-          model, strategy, make_callback_model=(mode == ModeKeys.TRAIN))
-    else:
-      _build_distributed_network(model, strategy)
-
   def _per_device_function(model):
     f = model._make_execution_function(mode)
     return (f.inputs, f.outputs)
 
   # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
   # the global one.
-  with K.get_graph().as_default(), strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs) = strategy.extended.call_for_each_replica(
-        _per_device_function, args=(model._distributed_model,))
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of inptus/outputs
-    # on all the devices over which the model is distributed.
-    (all_inputs, all_outputs, _, _) = unwrap_values(
-        strategy,
-        grouped_inputs,
-        grouped_outputs,
-        with_loss_tensor=(mode != ModeKeys.PREDICT))
-
+  strategy = model._distribution_strategy
+  global_graph = K.get_graph()
+
+  with global_graph.as_default(), strategy.scope():
+    # First we gather the relevant portions of the model across all replicas.
+    # `K._scratch_graph(global_graph)` signals to Keras that it should not
+    # lift to a separate graph when creating the per-replica functions.
+    with K._scratch_graph(global_graph):
+      # Create train ops on each of the devices when we call
+      # `_per_device_fit_function`.
+      grouped = strategy.extended.call_for_each_replica(
+          _per_device_function, args=(get_distributed_model(model, mode),))
+      grouped_inputs, grouped_outputs = grouped
+
+      # Unwrap all the per device values returned from `call_for_each_replica`.
+      # Unwrapping per device values gives you a list of values that can be
+      # used to construct a new train function that is composed of
+      # inputs/outputs on all the devices over which the model is distributed.
+      (all_inputs, all_outputs, _, _) = unwrap_values(
+          strategy,
+          grouped_inputs,
+          grouped_outputs,
+          with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    # Finally, a joint Keras function is created; this one will be created in
+    # a separate FuncGraph.
     return K.function(
         all_inputs,
         all_outputs,
         name='eager_distributed_{}_function'.format(mode))
 
 
-def _copy_weights_to_distributed_model(original_model, grouped_model):
+def _copy_weights_to_distributed_model(original_model, mode):
   """Copies weights from original model to distributed models."""
   strategy = original_model._distribution_strategy
+  distributed_model = get_distributed_model(original_model, mode)
   if strategy:
     # Copy the weights from the original model to each of the replicated
     # models.
     orig_model_weights = original_model.get_weights()
-    distributed_model = strategy.unwrap(grouped_model)[0]
-    set_weights(strategy, distributed_model, orig_model_weights)
+    first_model = strategy.unwrap(distributed_model)[0]
+    set_weights(strategy, first_model, orig_model_weights)
 
 
-def _copy_weights_to_original_model(model, grouped_model, mode):
+def _copy_weights_to_original_model(model, mode):
   """Copies weights from first distributed model back to original model."""
   if model._distribution_strategy and mode == ModeKeys.TRAIN:
+    distributed_model = get_distributed_model(model, mode)
     updated_weights = model._distribution_strategy.unwrap(
-        grouped_model)[0].get_weights()
+        distributed_model)[0].get_weights()
     model.set_weights(updated_weights)
 
 
@@ -872,9 +876,66 @@ def _per_device_aggregate_batch(batch_outs, model, mode):
   return batch_outs
 
 
-def _reset_metrics(model, distributed_model=None):
+def _reset_metrics(model):
   if model._distribution_strategy:
-    distributed_model = (
-        distributed_model or
-        model._distribution_strategy.unwrap(model._distributed_model)[0])
-    distributed_model.reset_metrics()
+    for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
+      distributed_model = get_distributed_model(model, mode)
+      if distributed_model:
+        first_model = model._distribution_strategy.unwrap(distributed_model)[0]
+        first_model.reset_metrics()
+
+
+def get_distributed_model(model, mode):
+  key = _generate_cache_key(mode)
+  return model._distributed_model_cache.get(key, None)
+
+
+def set_distributed_model(model, mode, distributed_model):
+  key = _generate_cache_key(mode)
+  model._distributed_model_cache[key] = distributed_model
+
+
+def _generate_cache_key(mode):
+  key = hash(mode)
+  return key
+
+
+@tf_contextlib.contextmanager
+def distributed_scope(strategy, learning_phase):
+  with strategy.scope(), K.learning_phase_scope(learning_phase):
+    yield
+
+
+def filter_distributed_callbacks(callbacks_list):
+  """Filter Callbacks based on the worker context when running multi-worker.
+
+  Arguments:
+    callbacks_list: A list of `Callback` instances.
+
+  Returns:
+    The list of `Callback` instances that should be run on this worker.
+  """
+
+  if not K.in_multi_worker_mode():
+    raise ValueError(
+        'filter_distributed_callbacks() should only be called when Keras '
+        'is in multi worker mode.')
+
+  worker_context = dc_context.get_current_worker_context()
+  callbacks_list = callbacks_list or []
+  if not [
+      c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
+  ]:
+    # TODO(rchao): Consider providing a ModelCheckpoint here if the user
+    # fails to.
+    logging.warning('ModelCheckpoint callback is not provided. '
+                    'Workers will need to restart training if any fails.')
+  # TODO(rchao): Add similar warning for restoring callback (to be designed).
+
+  if callbacks_list is None or worker_context.is_chief:
+    return callbacks_list
+
+  # Some Callbacks should only run on the chief worker.
+  return [
+      callback for callback in callbacks_list if not callback._chief_worker_only
+  ]  # pylint: disable=protected-access
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index c6dcedfce2f620b039fc8cfa7c3366d801e9c176..32fbbea8a162aaac592519739d3d8bb3ecbce57f 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -123,6 +123,7 @@ class InputLayer(base_layer.Layer):
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
+    input_tensor._keras_mask = None
     base_layer.Node(
         self,
         inbound_layers=[],
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 41f5f319bc625ef044964658e12daf720cd26a0a..fdda1141fd7d737c5f769ffa951cdea8f2030146 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -22,9 +22,7 @@ from __future__ import print_function
 import copy
 import json
 import os
-import weakref
 
-import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python import pywrap_tensorflow
@@ -38,6 +36,7 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.mixed_precision.experimental import policy
 from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
@@ -45,11 +44,12 @@ from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
+from tensorflow.python.util import serialization
 from tensorflow.python.util import tf_inspect
 
 
@@ -153,7 +153,7 @@ class Network(base_layer.Layer):
   # empty lists shouldn't cause issues; adding or removing them will not break
   # checkpoints, but may cause "all Python objects matched" assertions to fail
   # (in which case less strict assertions may be substituted if necessary).
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _base_init(self, name=None):
     # The following are implemented as property functions:
     # self.trainable_weights
@@ -170,15 +170,8 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
-    # In many internal cases one needs to compute both the model's output
-    # and its output mask without relying on `__call__` (which would do both and
-    # set mask metadata), but for models, computing the mask requires to
-    # recompute the output.
-    # Hence the pattern `output = model.call(); mask = model.compute_mask()`
-    # would be redundant, and internal logic
-    # (susceptible to use `call` directly) should prefer using the
-    # internal method `output, mask = _call_and_compute_mask()`.
-    # This is True for Sequential networks and graph networks.
+
+    # This is True for Sequential networks and Functional networks.
     self._compute_output_and_mask_jointly = False
 
     self.supports_masking = False
@@ -214,10 +207,16 @@ class Network(base_layer.Layer):
     self._outbound_nodes = []
     self._inbound_nodes = []
 
-    self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
-        weakref.ref(self))
+    self._trackable_saver = (
+        trackable_utils.saver_with_op_caching(self))
+
+    # Networks do not need to do any casting of inputs or variables, because
+    # each of its layers will handle casting through the layer's own
+    # implementation. Therefore networks use the 'infer' policy, which does no
+    # casting.
+    self._mixed_precision_policy = policy.Policy('infer')
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
@@ -234,9 +233,9 @@ class Network(base_layer.Layer):
     if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs):
       base_layer_utils.create_keras_history(self._nested_outputs)
 
+    self._base_init(name=name)
     self._validate_graph_inputs_and_outputs()
 
-    self._base_init(name=name)
     self._compute_previous_mask = (
         'mask' in tf_inspect.getfullargspec(self.call).args or
         hasattr(self, 'compute_mask'))
@@ -246,6 +245,9 @@ class Network(base_layer.Layer):
     self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
     self._dynamic = False
+    # `_expects_training_arg` is True since the `training` argument is always
+    # present in the signature of the `call` method of a graph network.
+    self._expects_training_arg = True
 
     self._input_layers = []
     self._output_layers = []
@@ -314,7 +316,7 @@ class Network(base_layer.Layer):
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _init_subclassed_network(self, name=None, dynamic=False):
     self._base_init(name=name)
     self._is_graph_network = False
@@ -375,20 +377,20 @@ class Network(base_layer.Layer):
       return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
 
   def _track_layers(self, layers):
-    """Add Checkpointable dependencies on a list of Layers."""
+    """Add Trackable dependencies on a list of Layers."""
     weight_layer_index = 0
     for layer_index, layer in enumerate(layers):
       if layer.weights:
         # Keep a separate index for layers which have weights. This allows users
         # to insert Layers without weights anywhere in the network without
         # breaking checkpoints.
-        self._track_checkpointable(
+        self._track_trackable(
             layer, name='layer_with_weights-%d' % weight_layer_index,
             overwrite=True)
         weight_layer_index += 1
       # Even if it doesn't have weights, we should still track everything in
-      # case it has/will have Checkpointable dependencies.
-      self._track_checkpointable(
+      # case it has/will have Trackable dependencies.
+      self._track_trackable(
           layer, name='layer-%d' % layer_index, overwrite=True)
 
   def __setattr__(self, name, value):
@@ -398,18 +400,15 @@ class Network(base_layer.Layer):
 
     if all(
         isinstance(v, (base_layer.Layer,
-                       data_structures.CheckpointableDataStructure)) or
-        checkpointable_layer_utils.has_weights(v) for v in nest.flatten(value)):
+                       data_structures.TrackableDataStructure)) or
+        trackable_layer_utils.has_weights(v) for v in nest.flatten(value)):
       try:
         self._is_graph_network
       except AttributeError:
         raise RuntimeError('It looks like you are subclassing `Model` and you '
                            'forgot to call `super(YourClass, self).__init__()`.'
                            ' Always start with this line.')
-    # Keep track of checkpointable objects,
-    # for the needs of `self.save/save_weights`.
-    value = data_structures.sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
+
     super(Network, self).__setattr__(name, value)
 
     # Keep track of metric instance created in subclassed model/layer.
@@ -478,12 +477,15 @@ class Network(base_layer.Layer):
     if not self._is_graph_network:
       return None
 
-    _, output_masks = self._run_internal_graph(inputs, mask=mask)
-    return output_masks
+    # TODO(omalleyt): b/123540974 This function is not really safe to call
+    # by itself because it will duplicate any updates and losses in graph
+    # mode by `call`ing the Layers again.
+    output_tensors = self._run_internal_graph(inputs, mask=mask)
+    return nest.map_structure(lambda t: t._keras_mask, output_tensors)
 
   @property
   def layers(self):
-    return checkpointable_layer_utils.filter_empty_layer_containers(
+    return trackable_layer_utils.filter_empty_layer_containers(
         self._layers)
 
   def get_layer(self, name=None, index=None):
@@ -519,21 +521,24 @@ class Network(base_layer.Layer):
         return layer
     raise ValueError('No such layer: ' + name)
 
-  @property
-  def _unfiltered_updates(self):
+  def _get_unfiltered_updates(self, check_trainable=True):
+    if check_trainable and not self.trainable and not self.stateful:
+      return []
     updates = []
     for layer in self.layers:
-      if isinstance(layer, Network):
-        updates += layer._unfiltered_updates
-      else:
-        updates += layer.updates
-    updates += self._updates
+      updates += layer._get_unfiltered_updates(check_trainable=check_trainable)
+    updates += list(self._updates)
     return updates
 
   @property
   def _unfiltered_losses(self):
     losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       losses.extend(self._eager_losses)
     else:
       losses.extend(self._losses)
@@ -544,15 +549,12 @@ class Network(base_layer.Layer):
         losses += layer.losses
     return losses
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _clear_losses(self):
     """Used every step in eager to reset losses."""
     self._eager_losses = []
     for layer in self.layers:
-      if isinstance(layer, Network):
-        layer._clear_losses()
-      else:
-        layer._eager_losses = []
+      layer._clear_losses()
 
   @property
   def updates(self):
@@ -604,10 +606,8 @@ class Network(base_layer.Layer):
     Returns:
         A list of update ops.
     """
-    if not self.trainable and not self.stateful:
-      return []
 
-    updates = self._unfiltered_updates
+    updates = self._get_unfiltered_updates(check_trainable=True)
 
     # `updates` might contain irrelevant updates, so it needs to be filtered
     # with respect to inputs the model has been called on.
@@ -684,14 +684,14 @@ class Network(base_layer.Layer):
 
   @property
   def trainable_weights(self):
-    return checkpointable_layer_utils.gather_trainable_weights(
+    return trackable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
-    return checkpointable_layer_utils.gather_non_trainable_weights(
+    return trackable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._non_trainable_weights + self._trainable_weights)
@@ -728,7 +728,7 @@ class Network(base_layer.Layer):
         A list of `InputSpec` instances (one per input to the model)
             or a single instance if the model has only one input.
     """
-    # If not a graph network, can't assume anything.
+    # If subclassed model, can't assume anything.
     if not self._is_graph_network:
       return None
 
@@ -866,10 +866,6 @@ class Network(base_layer.Layer):
       raise NotImplementedError('When subclassing the `Model` class, you should'
                                 ' implement a `call` method.')
 
-    outputs, _ = self._run_internal_graph(inputs, training=training, mask=mask)
-    return outputs
-
-  def _call_and_compute_mask(self, inputs, training=None, mask=None):
     return self._run_internal_graph(inputs, training=training, mask=mask)
 
   def compute_output_shape(self, input_shape):
@@ -972,92 +968,59 @@ class Network(base_layer.Layer):
     else:
       masks = nest.flatten(mask)
 
+    for input_t, mask in zip(inputs, masks):
+      input_t._keras_mask = mask
+
     # Dictionary mapping reference tensors to computed tensors.
     tensor_dict = {}
-    # Dictionary mapping reference tensors to computed masks.
-    mask_dict = {}
 
     for x, y, mask in zip(self.inputs, inputs, masks):
       tensor_dict[str(id(x))] = y
-      mask_dict[str(id(x))] = mask
 
     depth_keys = list(self._nodes_by_depth.keys())
     depth_keys.sort(reverse=True)
+    # Ignore the InputLayers when computing the graph.
+    depth_keys = depth_keys[1:]
+
     for depth in depth_keys:
       nodes = self._nodes_by_depth[depth]
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-        # node_input_tensors = node.input_tensors
-        # node_output_tensors = node.output_tensors
 
         if all(
             str(id(tensor)) in tensor_dict
             for tensor in nest.flatten(node.input_tensors)):
+
           # Call layer (reapplying ops to new inputs).
-          with ops.name_scope(layer.name):
-            computed_tensors = nest.map_structure(
-                lambda t: tensor_dict[str(id(t))], node.input_tensors)
-            computed_masks = nest.map_structure(lambda t: mask_dict[str(id(t))],
-                                                node.input_tensors)
-            kwargs = node.arguments or {}
-            # Ensure `training` arg propagation if applicable.
-            argspec = self._layer_call_argspecs[layer].args
-            if 'training' in argspec:
-              kwargs.setdefault('training', training)
-            if 'mask' in argspec:
-              kwargs.setdefault('mask', computed_masks)
-
-            # Compute outputs and masks.
-            output_masks = None
-            if (isinstance(layer, Network) and
-                layer._compute_output_and_mask_jointly):
-              output_tensors, output_masks = layer._call_and_compute_mask(
-                  computed_tensors, **kwargs)
-            else:
-              if context.executing_eagerly():
-                output_tensors = layer(computed_tensors, **kwargs)
-              elif layer.dynamic:
-                output_tensors = layer._symbolic_call(computed_tensors)  # pylint: disable=protected-call
-              else:
-                output_tensors = layer.call(computed_tensors, **kwargs)
-              if hasattr(layer, 'compute_mask'):
-                output_masks = layer.compute_mask(computed_tensors,
-                                                  computed_masks)
-            if output_masks is None:
-              output_masks = nest.pack_sequence_as(
-                  output_tensors, [None for _ in nest.flatten(output_tensors)])
-
-            if not context.executing_eagerly():
-              # Set mask metadata.
-              for x, m in zip(
-                  nest.flatten(output_tensors), nest.flatten(output_masks)):
-                try:
-                  x._keras_mask = m
-                except AttributeError:
-                  pass
-
-              # Apply activity regularizer if any.
-              layer._handle_activity_regularization(computed_tensors,
-                                                    output_tensors)
+          computed_tensors = nest.map_structure(
+              lambda t: tensor_dict[str(id(t))], node.input_tensors)
+
+          # Ensure `training` and `mask` arg propagation if applicable.
+          kwargs = node.arguments or {}
+          argspec = self._layer_call_argspecs[layer].args
+          if 'training' in argspec:
+            kwargs.setdefault('training', training)
+          if 'mask' in argspec:
+            computed_masks = nest.map_structure(lambda t: t._keras_mask,
+                                                computed_tensors)
+            kwargs.setdefault('mask', computed_masks)
+
+          # Compute outputs.
+          output_tensors = layer(computed_tensors, **kwargs)
 
           # Update tensor_dict.
-          for x, y, mask in zip(
-              nest.flatten(node.output_tensors), nest.flatten(output_tensors),
-              nest.flatten(output_masks)):
+          for x, y in zip(
+              nest.flatten(node.output_tensors), nest.flatten(output_tensors)):
             tensor_dict[str(id(x))] = y
-            mask_dict[str(id(x))] = mask
 
     output_tensors = []
-    output_masks = []
     output_shapes = []
     for x in self.outputs:
       assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)
       tensor = tensor_dict[str(id(x))]
-      mask = mask_dict[str(id(x))]
       output_shapes.append(x.shape)
       output_tensors.append(tensor)
-      output_masks.append(mask)
 
     if output_shapes is not None:
       input_shapes = [x.shape for x in inputs]
@@ -1066,8 +1029,7 @@ class Network(base_layer.Layer):
           self._nested_outputs, output_shapes)
 
     output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors)
-    output_masks = nest.pack_sequence_as(self._nested_outputs, output_masks)
-    return output_tensors, output_masks
+    return output_tensors
 
   def get_config(self):
     if not self._is_graph_network:
@@ -1144,6 +1106,9 @@ class Network(base_layer.Layer):
       model_inputs.append(
           tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
     model_inputs = nest.pack_sequence_as(self._nested_inputs, model_inputs)
+    # Preserve external Keras compat for Models with single input.
+    if not nest.is_sequence(model_inputs):
+      model_inputs = [model_inputs]
     model_inputs = tf_utils.convert_inner_node_data(model_inputs)
     config['input_layers'] = model_inputs
 
@@ -1157,6 +1122,9 @@ class Network(base_layer.Layer):
       model_outputs.append(
           tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
     model_outputs = nest.pack_sequence_as(self._nested_outputs, model_outputs)
+    # Preserve external Keras compat for Models with single output.
+    if not nest.is_sequence(model_outputs):
+      model_outputs = [model_outputs]
     model_outputs = tf_utils.convert_inner_node_data(model_outputs)
     config['output_layers'] = model_outputs
     return copy.deepcopy(config)
@@ -1341,7 +1309,10 @@ class Network(base_layer.Layer):
     """
     if not self._is_graph_network:
       raise NotImplementedError(
-          'Currently `save` requires model to be a graph network. Consider '
+          'The `save` method requires the model to be a Functional model or a '
+          'Sequential model. It does not work for subclassed models, '
+          'because such models are defined via the body of a Python method, '
+          'which isn\'t safely serializable. Consider '
           'using `save_weights`, in order to save the weights of the model.')
 
     from tensorflow.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
@@ -1434,7 +1405,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
       optimizer = getattr(self, 'optimizer', None)
       if (optimizer
-          and not isinstance(optimizer, checkpointable.Checkpointable)):
+          and not isinstance(optimizer, trackable.Trackable)):
         logging.warning(
             ('This model was compiled with a Keras optimizer (%s) but is being '
              'saved in TensorFlow format with `save_weights`. The model\'s '
@@ -1442,7 +1413,7 @@ class Network(base_layer.Layer):
              'the TensorFlow format the optimizer\'s state will not be '
              'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.')
             % (optimizer,))
-      self._checkpointable_saver.save(filepath, session=session)
+      self._trackable_saver.save(filepath, session=session)
       # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
       checkpoint_management.update_checkpoint_state_internal(
           save_dir=os.path.dirname(filepath),
@@ -1501,7 +1472,7 @@ class Network(base_layer.Layer):
         # The checkpoint is not readable in TensorFlow format. Try HDF5.
         save_format = 'h5'
     if save_format == 'tf':
-      status = self._checkpointable_saver.restore(filepath)
+      status = self._trackable_saver.restore(filepath)
       if by_name:
         raise NotImplementedError(
             'Weights may only be loaded based on topology into Models when '
@@ -1511,7 +1482,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
         # Restore existing variables (if any) immediately, and set up a
         # streaming restore for any variables created in the future.
-        checkpointable_utils.streaming_restore(status=status, session=session)
+        trackable_utils.streaming_restore(status=status, session=session)
       status.assert_nontrivial_match()
       return status
     if h5py is None:
@@ -1560,22 +1531,9 @@ class Network(base_layer.Layer):
     Returns:
         A JSON string.
     """
-    def get_json_type(obj):
-      # If obj is any numpy type
-      if type(obj).__module__ == np.__name__:
-        if isinstance(obj, np.ndarray):
-          return obj.tolist()
-        else:
-          return obj.item()
-
-      # If obj is a python 'type'
-      if type(obj).__name__ == type.__name__:
-        return obj.__name__
-
-      raise TypeError('Not JSON Serializable:', obj)
-
     model_config = self._updated_config()
-    return json.dumps(model_config, default=get_json_type, **kwargs)
+    return json.dumps(
+        model_config, default=serialization.get_json_type, **kwargs)
 
   def to_yaml(self, **kwargs):
     """Returns a yaml string containing the network configuration.
diff --git a/tensorflow/python/keras/engine/partial_batch_padding_handler.py b/tensorflow/python/keras/engine/partial_batch_padding_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3951ead6e1d75473d3847ca52a895e7f50aed3a
--- /dev/null
+++ b/tensorflow/python/keras/engine/partial_batch_padding_handler.py
@@ -0,0 +1,111 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility object to handler partial batches for TPUStrategy."""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend as K
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
+
+
+class PartialBatchPaddingHandler(object):
+  """A container that holds info about partial batches for `predict()`."""
+
+  def __init__(self, output_shape):
+    self.padded_batch_size = 0
+    self.padding_mask = array_ops.zeros(0)
+    self.output_shape = output_shape
+
+  def get_real_batch_size(self, dataset_batch):
+    """Returns the number of elements in a potentially partial batch."""
+    if isinstance(dataset_batch, (tuple, list)):
+      dataset_batch = dataset_batch[0]
+
+    assert nest.flatten(dataset_batch)
+
+    def _find_any_tensor(batch_features):
+      tensors = [
+          x for x in nest.flatten(batch_features) if tensor_util.is_tensor(x)
+      ]
+      if not tensors:
+        raise ValueError('Cannot find any Tensor in features dict.')
+      return tensors[0]
+
+    return K.cast(K.shape(_find_any_tensor(dataset_batch))[0],
+                  dtype='int64')
+
+  def update_mask(self, padding_mask, dataset_batch):
+    """Calculate and cache the amount of padding required for a batch."""
+    original_batch_size = self.get_real_batch_size(dataset_batch)
+    missing_count = self.padded_batch_size - original_batch_size
+    mask = K.concatenate([array_ops.ones(original_batch_size),
+                          array_ops.zeros(missing_count)], axis=0)
+    return K.concatenate([padding_mask, mask], axis=0)
+
+  def pad_batch(self, *dataset_batch_elements):
+    """Pads out the batch dimension of a tensor to the complete batch size."""
+    def _pad(batch):
+      """Helper function to pad nested data within each batch elements."""
+      padded_dict_batch = {}
+      if isinstance(batch, dict):
+        for key, value in six.iteritems(batch):
+          padded_dict_batch[key] = _pad(value)
+        return padded_dict_batch
+
+      rank = len(batch.shape)
+      assert rank > 0
+      missing_count = (self.padded_batch_size -
+                       self.get_real_batch_size(batch))
+      padding = K.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      return array_ops.pad(batch, padding, 'constant')
+
+    if len(dataset_batch_elements) == 1:
+      return _pad(dataset_batch_elements[0])
+
+    batch_elements = []
+    for batch_element in dataset_batch_elements:
+      batch_elements.append(_pad(batch_element))
+    return tuple(batch_elements)
+
+  def apply_mask(self, prediction_result):
+    """Removes prediction output that corresponds to padded input."""
+    padding_mask = K.get_value(self.padding_mask)
+    assert len(padding_mask.shape) == 1
+
+    if len(self.output_shape) == 1:
+      prediction = np.take(prediction_result,
+                           np.nonzero(
+                               padding_mask[:len(prediction_result)]),
+                           axis=0)
+      if prediction.shape[0] == 1:
+        prediction = np.squeeze(prediction, axis=0)
+      return prediction
+
+    else:
+      predictions = []
+      for i in range(len(self.output_shape)):
+        prediction = prediction_result[i]
+        prediction = np.take(prediction, np.nonzero(
+            padding_mask[:len(prediction)]), axis=0)
+        predictions.append(np.squeeze(prediction))
+
+      return predictions
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 37eb2840b3ba5d5574a326cdae14f1291ab39749..6c8f5c2f3984f4a445e43dc12f5817b25a3d63a4 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -21,25 +21,21 @@ from __future__ import print_function
 
 import copy
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.engine.input_layer import Input
-from tensorflow.python.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras.engine.network import Network
-from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import keras_export
 
 
 @keras_export('keras.models.Sequential', 'keras.Sequential')
-class Sequential(Model):
+class Sequential(training.Model):
   """Linear stack of layers.
 
   Arguments:
@@ -97,7 +93,7 @@ class Sequential(Model):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layers=None, name=None):
     super(Sequential, self).__init__(name=name)
     self.supports_masking = True
@@ -116,10 +112,10 @@ class Sequential(Model):
     # Historically, `sequential.layers` only returns layers that were added
     # via `add`, and omits the auto-generated `InputLayer` that comes at the
     # bottom of the stack.
-    # `CheckpointableBase` manages the `_layers` attributes and does filtering
+    # `Trackable` manages the `_layers` attributes and does filtering
     # over it.
     layers = super(Sequential, self).layers
-    if layers and isinstance(layers[0], InputLayer):
+    if layers and isinstance(layers[0], input_layer.InputLayer):
       return layers[1:]
     return layers[:]
 
@@ -127,7 +123,7 @@ class Sequential(Model):
   def dynamic(self):
     return any(layer.dynamic for layer in self.layers)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def add(self, layer):
     """Adds a layer instance on top of the layer stack.
 
@@ -142,6 +138,14 @@ class Sequential(Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
+    # If we are passed a Keras tensor created by keras.Input(), we can extract
+    # the input layer from its keras history and use that without any loss of
+    # generality.
+    if hasattr(layer, '_keras_history'):
+      origin_layer = layer._keras_history[0]
+      if isinstance(origin_layer, input_layer.InputLayer):
+        layer = origin_layer
+
     if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
@@ -149,7 +153,7 @@ class Sequential(Model):
     self.built = False
     set_inputs = False
     if not self._layers:
-      if isinstance(layer, InputLayer):
+      if isinstance(layer, input_layer.InputLayer):
         # Corner case where the user passes an InputLayer layer via `add`.
         assert len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) == 1
         set_inputs = True
@@ -157,10 +161,8 @@ class Sequential(Model):
         batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
         if batch_shape:
           # Instantiate an input layer.
-          x = Input(
-              batch_shape=batch_shape,
-              dtype=dtype,
-              name=layer.name + '_input')
+          x = input_layer.Input(
+              batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input')
           # This will build the current layer
           # and create the node connecting the current layer
           # to the input layer we just created.
@@ -199,7 +201,7 @@ class Sequential(Model):
 
     self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def pop(self):
     """Removes the last layer in the model.
 
@@ -233,18 +235,12 @@ class Sequential(Model):
       super(Sequential, self).build(input_shape)
     self.built = True
 
-  def call(self, inputs, training=None, mask=None):
+  def call(self, inputs, training=None, mask=None):  # pylint: disable=redefined-outer-name
     if self._is_graph_network:
+      if not self.built:
+        self._init_graph_network(self.inputs, self.outputs, name=self.name)
       return super(Sequential, self).call(inputs, training=training, mask=mask)
 
-    outputs, _ = self._call_and_compute_mask(
-        inputs, training=training, mask=mask)
-    return outputs
-
-  def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    if not self.built and self._is_graph_network:
-      self._init_graph_network(self.inputs, self.outputs, name=self.name)
-
     outputs = inputs  # handle the corner case where self.layers is empty
     for layer in self.layers:
       # During each iteration, `inputs` are the inputs to `layer`, and `outputs`
@@ -257,35 +253,13 @@ class Sequential(Model):
       if 'training' in argspec:
         kwargs['training'] = training
 
-      if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
-        outputs, mask = layer._call_and_compute_mask(inputs, **kwargs)
-      else:
-        if not layer.built:
-          # Build layer if applicable.
-          with ops.name_scope(layer._name_scope()):
-            layer._maybe_build(inputs)
-          layer.built = True
-        if layer.supports_masking:
-          mask = layer.compute_mask(inputs, mask)
-        else:
-          mask = None
-
-        if context.executing_eagerly():
-          # __call__ handles activity regularization.
-          outputs = layer(inputs, **kwargs)
-        elif layer.dynamic:
-          outputs = layer._symbolic_call(inputs)
-          layer._handle_activity_regularization(inputs, outputs)
-        else:
-          outputs = layer.call(inputs, **kwargs)
-          layer._handle_activity_regularization(inputs, outputs)
-
-      if not context.executing_eagerly():
-        outputs._keras_mask = mask
+      outputs = layer(inputs, **kwargs)
 
       # `outputs` will be the inputs to the next layer.
       inputs = outputs
-    return outputs, mask
+      mask = outputs._keras_mask
+
+    return outputs
 
   def compute_output_shape(self, input_shape):
     shape = input_shape
@@ -294,8 +268,11 @@ class Sequential(Model):
     return shape
 
   def compute_mask(self, inputs, mask):
-    _, mask = self._call_and_compute_mask(inputs, mask=mask)
-    return mask
+    # TODO(omalleyt): b/123540974 This function is not really safe to call
+    # by itself because it will duplicate any updates and losses in graph
+    # mode by `call`ing the Layers again.
+    outputs = self.call(inputs, mask=mask)
+    return outputs._keras_mask
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 8e5d6fe93a1830efe8bb8116b8a14be5311b7ccd..afd7d230f9a8e69ed45e374de90216580de5a367 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -47,6 +47,18 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
+  @keras_parameterized.run_all_keras_modes
+  def test_input_defined_first_layer(self):
+    model = keras.models.Sequential()
+    model.add(keras.Input(shape=(2,), name='input_layer'))
+    model.add(keras.layers.Dense(1))
+    model.add(keras.layers.Dropout(0.3, name='dp'))
+    model.add(keras.layers.Dense(2, kernel_regularizer='l2',
+                                 kernel_constraint='max_norm'))
+    self.assertLen(model.layers, 3)
+    self.assertLen(model.weights, 2 * 2)
+    self.assertEqual(model.get_layer(name='dp').name, 'dp')
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
@@ -307,6 +319,20 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 8)
 
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_sequential_deferred_manual_build(self):
+    model = testing_utils.get_small_sequential_mlp(4, 5)
+    self.assertFalse(model.built)
+    model(array_ops.zeros([1, 2]))
+    self.assertTrue(model.built)
+    self.assertEqual(len(model.outputs), 0)
+    model.compile('rmsprop',
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    self.assertEqual(len(model.outputs), 0)
+    model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5)))
+    self.assertEqual(len(model.outputs), 1)
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 951988d852fe361a6b50b558b64169150bba6f53..5b1c74adb9d71fc411057c1c96b05fdc9ca625d7 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -85,7 +85,6 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
 
     network = network_lib.Network(x2, y2)
     self.assertEqual(len(network.updates), 2)
-    self.assertEqual(len(network.get_updates_for(x1)), 0)
     self.assertEqual(len(network.get_updates_for(x2)), 1)
     self.assertEqual(len(network.get_updates_for(None)), 1)
 
@@ -294,12 +293,12 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     # test callability on Input
     x_2 = input_layer_lib.Input(shape=(32,))
     y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+    self.assertEqual(y_2.shape.as_list(), [None, 2])
 
     # test callability on regular tensor
     x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
     y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+    self.assertEqual(y_2.shape.as_list(), [None, 2])
 
     # test network `trainable` attribute
     network.trainable = False
@@ -380,7 +379,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     with self.assertRaises(ValueError):
       _ = keras.layers.Input(shape=(32,), unknown_kwarg=None)
 
-    self.assertListEqual(a.get_shape().as_list(), [None, 32])
+    self.assertListEqual(a.shape.as_list(), [None, 32])
     a_layer, a_node_index, a_tensor_index = a._keras_history
     b_layer, _, _ = b._keras_history
     self.assertEqual(len(a_layer._inbound_nodes), 1)
@@ -410,7 +409,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     # test layer properties
     test_layer = keras.layers.Dense(16, name='test_layer')
     a_test = test_layer(a)
-    self.assertListEqual(test_layer.kernel.get_shape().as_list(), [32, 16])
+    self.assertListEqual(test_layer.kernel.shape.as_list(), [32, 16])
     self.assertEqual(test_layer.input, a)
     self.assertEqual(test_layer.output, a_test)
     self.assertEqual(test_layer.input_shape, (None, 32))
@@ -441,7 +440,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
       b_2 = dense(b)
 
       merged = keras.layers.concatenate([a_2, b_2], name='merge')
-      self.assertListEqual(merged.get_shape().as_list(), [None, 16 * 2])
+      self.assertListEqual(merged.shape.as_list(), [None, 16 * 2])
       merge_layer, merge_node_index, merge_tensor_index = merged._keras_history
 
       self.assertEqual(merge_node_index, 0)
@@ -524,8 +523,8 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
       self.assertEqual(len(model.inputs), 2)
       self.assertEqual(g.name, 'model/dense_2/BiasAdd:0')
 
-      self.assertListEqual(g.get_shape().as_list(), c.get_shape().as_list())
-      self.assertListEqual(h.get_shape().as_list(), d.get_shape().as_list())
+      self.assertListEqual(g.shape.as_list(), c.shape.as_list())
+      self.assertListEqual(h.shape.as_list(), d.shape.as_list())
 
       # test separate manipulation of different layer outputs
       i = keras.layers.Dense(7, name='dense_4')(h)
@@ -588,10 +587,10 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
       p = keras.layers.Input(shape=(32,), name='input_p')
       q, _ = model([o, p])
 
-      self.assertListEqual(n.get_shape().as_list(), [None, 5])
-      self.assertListEqual(q.get_shape().as_list(), [None, 64])
+      self.assertListEqual(n.shape.as_list(), [None, 5])
+      self.assertListEqual(q.shape.as_list(), [None, 64])
       s = keras.layers.concatenate([n, q], name='merge_nq')
-      self.assertListEqual(s.get_shape().as_list(), [None, 64 + 5])
+      self.assertListEqual(s.shape.as_list(), [None, 64 + 5])
 
       # test with single output as 1-elem list
       multi_io_model = keras.models.Model([j, k, o, p], [s])
@@ -714,8 +713,8 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     j_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
     k_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
     m_tf, n_tf = tf_model([j_tf, k_tf])
-    self.assertListEqual(m_tf.get_shape().as_list(), [None, 64])
-    self.assertListEqual(n_tf.get_shape().as_list(), [None, 5])
+    self.assertListEqual(m_tf.shape.as_list(), [None, 64])
+    self.assertListEqual(n_tf.shape.as_list(), [None, 5])
 
     # test merge
     keras.layers.concatenate([j_tf, k_tf], axis=1)
@@ -733,7 +732,7 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
     a = keras.layers.Input(shape=(10, 32), name='input_a')
     b = keras.layers.Masking()(a)
     model = keras.models.Model(a, b)
-    self.assertEqual(model.output_mask.get_shape().as_list(), [None, 10])
+    self.assertEqual(model.output_mask.shape.as_list(), [None, 10])
 
   @test_util.run_deprecated_v1
   def testMaskingSingleInput(self):
@@ -766,12 +765,12 @@ class TopologyConstructionTest(keras_parameterized.TestCase):
       # test callability on Input
       x_2 = input_layer_lib.Input(shape=(32,))
       y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+      self.assertEqual(y_2.shape.as_list(), [None, 32])
 
       # test callability on regular tensor
       x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
       y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+      self.assertEqual(y_2.shape.as_list(), [None, 32])
 
   @test_util.run_deprecated_v1
   def test_activity_regularization_with_model_composition(self):
@@ -1181,6 +1180,18 @@ class DefaultShapeInferenceBehaviorTest(keras_parameterized.TestCase):
       self.assertAllClose(mask_outputs_val[0], np.any(model_input, axis=-1))
       self.assertAllClose(mask_outputs_val[1], np.any(model_input, axis=-1))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_external_keras_serialization_compat(self):
+    inputs = keras.Input(shape=(10,))
+    outputs = keras.layers.Dense(1)(inputs)
+    model = keras.Model(inputs, outputs)
+    config = model.get_config()
+    # Checks that single inputs and outputs are still saved as 1-element lists.
+    # Saving as 1-element lists or not is equivalent in TF Keras, but only the
+    # 1-element list format is supported in TF.js and keras-team/Keras.
+    self.assertLen(config['input_layers'], 1)
+    self.assertLen(config['output_layers'], 1)
+
 
 class GraphUtilsTest(test.TestCase):
 
@@ -1285,6 +1296,19 @@ class NestedNetworkTest(test.TestCase):
     output_shape = network.compute_output_shape([(None, 1), (None, 1)])
     self.assertListEqual(output_shape.as_list(), [None, 1])
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_updates_with_direct_call(self):
+    inputs = keras.Input(shape=(10,))
+    x = keras.layers.BatchNormalization()(inputs)
+    x = keras.layers.Dense(10)(x)
+    model = keras.Model(inputs, x)
+
+    ph = keras.backend.placeholder(shape=(10, 10))
+    model(ph)
+
+    self.assertLen(model.get_updates_for(ph), 2)
+    self.assertLen(model.get_updates_for(None), 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index b80b906f675665eeff395e97aaea685ecd473584..56c787b97aaeaaf88ba51a5e4f1c2c4a9c3bda99 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -25,7 +25,6 @@ from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.distribute import distribute_coordinator as dc
-from tensorflow.python.distribute import distribute_coordinator_context as dc_context
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
@@ -36,28 +35,26 @@ from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.engine import network
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_distributed
 from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.engine.network import Network
-from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.keras.utils import data_utils
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer as tf_optimizer_module
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.mode_keys import ModeKeys
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
 @keras_export('keras.models.Model', 'keras.Model')
-class Model(Network):
+class Model(network.Network):
   """`Model` groups layers into an object with training and inference features.
 
   There are two ways to instantiate a `Model`:
@@ -130,7 +127,6 @@ class Model(Network):
     # passing distribution strategy to compile rather than creating the model
     # under distribution strategy scope.
     self._compile_distribution = False
-    self._distributed_session_is_configured = False
 
     self.run_eagerly = None
 
@@ -145,7 +141,16 @@ class Model(Network):
         return super(Model, self).get_weights()
     return super(Model, self).get_weights()
 
-  @checkpointable.no_automatic_dependency_tracking
+  def load_weights(self, filepath, by_name=False):
+    """Loads all layer weights, either from a TensorFlow or an HDF5 file."""
+    if distributed_training_utils.is_tpu_strategy(self._distribution_strategy):
+      if (self._distribution_strategy.extended.steps_per_run > 1 and
+          (not network._is_hdf5_filepath(filepath))):  # pylint: disable=protected-access
+        raise ValueError('Load weights is not yet supported with TPUStrategy '
+                         'with steps_per_run greater than 1.')
+    return super(Model, self).load_weights(filepath, by_name)
+
+  @trackable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
               loss=None,
@@ -161,17 +166,20 @@ class Model(Network):
     Arguments:
         optimizer: String (name of optimizer) or optimizer instance.
             See `tf.keras.optimizers`.
-        loss: String (name of objective function) or objective function.
-            See `tf.losses`. If the model has multiple outputs, you can use a
-            different loss on each output by passing a dictionary or a list of
-            losses. The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
+        loss: String (name of objective function), objective function or
+            `tf.losses.Loss` instance. See `tf.losses`. If the model has
+            multiple outputs, you can use a different loss on each output by
+            passing a dictionary or a list of losses. The loss value that will
+            be minimized by the model will then be the sum of all individual
+            losses.
+        metrics: List of metrics to be evaluated by the model during training
+            and testing. Typically you will use `metrics=['accuracy']`.
             To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
+            multi-output model, you could also pass a dictionary, such as
+            `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
+            You can also pass a list (len = len(outputs)) of lists of metrics
+            such as `metrics=[['accuracy'], ['accuracy', 'mse']]` or
+            `metrics=['accuracy', ['accuracy', 'mse']]`.
         loss_weights: Optional list or dictionary specifying scalar
             coefficients (Python floats) to weight the loss contributions
             of different model outputs.
@@ -207,6 +215,12 @@ class Model(Network):
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
     run_eagerly = kwargs.pop('run_eagerly', None)
+    if run_eagerly and getattr(self, '_contains_symbolic_tensors', False):
+      raise ValueError(
+          'We currently do not support enabling `run_eagerly` on compile if '
+          '`model.add_loss(tensor)` or `model.add_metric(tensor)` '
+          'has been called.')
+
     self._run_eagerly = run_eagerly
     optimizer = optimizers.get(optimizer)
 
@@ -219,7 +233,6 @@ class Model(Network):
                       'create the model under the distribution strategy scope.')
       self._distribution_strategy = distribute
       self._compile_distribution = True
-      self._distributed_session_is_configured = False
     else:
       if distribution_strategy_context.has_strategy():
         # When the user builds the model in the DS scope and cross replica
@@ -233,12 +246,6 @@ class Model(Network):
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
     if self._distribution_strategy:
-      if not isinstance(optimizer,
-                        (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
-                         optimizer_v2.OptimizerV2)):
-        raise NotImplementedError(
-            'optimizer must be an instance of '
-            'tf.train.Optimizer, not a %s' % type(optimizer))
       if sample_weight_mode:
         raise NotImplementedError('sample_weight_mode is not supported with '
                                   'DistributionStrategy.')
@@ -249,20 +256,30 @@ class Model(Network):
         raise ValueError('target_tensors is not supported with '
                          'DistributionStrategy.')
 
+      if run_eagerly:
+        raise ValueError(
+            'We currently do not support enabling `run_eagerly` with '
+            'distribution strategy.')
+
+      if getattr(self, '_contains_symbolic_tensors', False):
+        raise ValueError(
+            'We currently do not support compiling the model with distribution '
+            'strategy if `model.add_loss(tensor)` or `model.add_metric(tensor)`'
+            ' has been called.')
+
+      if not self.built or not self.inputs or not self.outputs:
+        raise ValueError(
+            'We currently do not support distribution strategy with a '
+            '`Sequential` model that is created without `input_shape`/'
+            '`input_dim` set in its first layer or a subclassed model.')
+
     loss = loss or {}
-    if self.run_eagerly and not isinstance(
-        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer,
-                    optimizer_v2.OptimizerV2)):
-      raise ValueError(
-          'When running a model in eager execution, the optimizer must be an '
-          'instance of tf.train.Optimizer. Received: '
-          '%s' % optimizer)
 
     self.optimizer = optimizer
     # We've disabled automatic dependency tracking for this method, but do want
-    # to add a checkpoint dependency on the optimizer if it's checkpointable.
-    if isinstance(self.optimizer, checkpointable.Checkpointable):
-      self._track_checkpointable(
+    # to add a checkpoint dependency on the optimizer if it's trackable.
+    if isinstance(self.optimizer, trackable.Trackable):
+      self._track_trackable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
     self._compile_metrics = metrics or []
@@ -276,87 +293,45 @@ class Model(Network):
     self.target_tensors = target_tensors
 
     # Set DistributionStrategy specific parameters.
-    self._distributed_model = None
+    self._distributed_model_cache = {}
+
+    if self._distribution_strategy is not None:
+      # Ensures a Session is created and configured correctly for Distribution
+      # Strategy.
+      K.configure_and_create_distributed_session(self._distribution_strategy)
     # Initialize model metric attributes.
     self._init_metric_attributes()
-    if not self.built:
+    if not self.built or not self.inputs or not self.outputs:
       # Model is not compilable because it does not know its number of inputs
       # and outputs, nor their shapes and names. We will compile after the first
       # time the model gets called on training data.
       return
     self._is_compiled = True
 
-    # Prepare loss functions.
-    if isinstance(loss, dict):
-      for name in loss:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_functions = []
-      for name in self.output_names:
-        if name not in loss:
-          logging.warning(
-              'Output "' + name +
-              '" missing from loss dictionary. We assume '
-              'this was done on purpose. The fit and evaluate APIs will not be '
-              'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
-    elif isinstance(loss, list):
-      if len(loss) != len(self.outputs):
-        raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model outputs. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [training_utils.get_loss_function(l) for l in loss]
-    else:
-      loss_function = training_utils.get_loss_function(loss)
-      loss_functions = [loss_function for _ in range(len(self.outputs))]
-    self.loss_functions = loss_functions
+    # Prepare list of loss functions, same size of model outputs.
+    self.loss_functions = training_utils.prepare_loss_functions(
+        loss, self.output_names)
 
-    skip_target_indices = []
-    skip_target_weighing_indices = []
     self._feed_outputs = []
     self._feed_output_names = []
     self._feed_output_shapes = []
     self._feed_loss_fns = []
-    for i in range(len(loss_functions)):
-      if loss_functions[i] is None:
+    # if loss function is None, then this output will be skipped during total
+    # loss calculation and feed targets preparation.
+    skip_target_indices = []
+    skip_target_weighing_indices = []
+    for i, loss_function in enumerate(self.loss_functions):
+      if loss_function is None:
         skip_target_indices.append(i)
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
     if not self.run_eagerly:
       masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
-      if not isinstance(masks, list):
-        masks = [masks]
-
-    # Prepare loss weights.
-    if loss_weights is None:
-      loss_weights_list = [1. for _ in range(len(self.outputs))]
-    elif isinstance(loss_weights, dict):
-      for name in loss_weights:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss_weights '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_weights_list = []
-      for name in self.output_names:
-        loss_weights_list.append(loss_weights.get(name, 1.))
-    elif isinstance(loss_weights, list):
-      if len(loss_weights) != len(self.outputs):
-        raise ValueError(
-            'When passing a list as loss_weights, '
-            'it should have one entry per model output. '
-            'The model has ' + str(len(self.outputs)) +
-            ' outputs, but you passed loss_weights=' + str(loss_weights))
-      loss_weights_list = loss_weights
-    else:
-      raise TypeError('Could not interpret loss_weights argument: ' +
-                      str(loss_weights) + ' - expected a list of dicts.')
-    self.loss_weights_list = loss_weights_list
+
+    # Prepare list loss weights, same size of model outputs.
+    self.loss_weights_list = training_utils.prepare_loss_weights(
+        self.output_names, loss_weights)
 
     # Initialization for Eager mode execution.
     if self.run_eagerly:
@@ -370,15 +345,9 @@ class Model(Network):
         raise ValueError('target_tensors are not currently supported in Eager '
                          'mode.')
       self.total_loss = None
-      for i in range(len(self.outputs)):
-        if len(self.outputs) > 1:
-          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
       # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
 
       self.targets = []
       for i in range(len(self.outputs)):
@@ -451,73 +420,9 @@ class Model(Network):
       # Save all metric attributes per output of the model.
       self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-      # Compute total loss.
-      total_loss = None
-      with K.name_scope('loss'):
-        for i in range(len(self.outputs)):
-          if i in skip_target_indices:
-            continue
-          y_true = self.targets[i]
-          y_pred = self.outputs[i]
-          loss_fn = loss_functions[i]
-          sample_weight = self.sample_weights[i]
-          mask = masks[i]
-          loss_weight = loss_weights_list[i]
-          with K.name_scope(self.output_names[i] + '_loss'):
-            if isinstance(loss_fn, losses.Loss):
-              if mask is not None:
-                mask = math_ops.cast(mask, y_pred.dtype)
-                # Update weights with mask.
-                if sample_weight is None:
-                  sample_weight = mask
-                else:
-                  # Update dimensions of weights to match with mask if possible.
-                  mask, _, sample_weight = squeeze_or_expand_dimensions(
-                      mask, None, sample_weight)
-                  sample_weight *= mask
-              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
-            else:
-              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
-              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
-
-          if len(self.outputs) > 1:
-            # Keep track of the un-aggregated loss result tensor.
-            self._compile_metrics_tensors[self.output_names[i] +
-                                          '_loss'] = output_loss
-
-            # Keep track of stateful result tensor and function for the loss.
-            loss_name = loss_fn.name if isinstance(
-                loss_fn, losses.Loss) else loss_fn.__name__
-            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-                loss_fn, name=loss_name)
-            result_tensor = self._call_metric_fn(mean_wrapped_loss, y_true,
-                                                 y_pred, sample_weight, mask)
-            self._compile_stateful_metrics_tensors[self.output_names[i] +
-                                                   '_loss'] = result_tensor
-            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
-
-            self._compile_metrics_names.append(self.output_names[i] + '_loss')
-          if total_loss is None:
-            total_loss = loss_weight * output_loss
-          else:
-            total_loss += loss_weight * output_loss
-        if total_loss is None:
-          if not self.losses:
-            raise ValueError('The model cannot be compiled '
-                             'because it has no loss to optimize.')
-          else:
-            total_loss = 0.
-
-        # Add regularization penalties
-        # and other layer-specific losses.
-        for loss_tensor in self.losses:
-          total_loss += loss_tensor
-
       # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
+
       # Invoke metric functions for all the outputs.
       self._handle_metrics(
           self.outputs,
@@ -526,16 +431,18 @@ class Model(Network):
           skip_target_indices=skip_target_indices,
           sample_weights=self.sample_weights)
 
-      # Prepare gradient updates and state updates.
-      self.total_loss = total_loss
+      # Compute total loss.
+      # Used to keep track of the total loss value (stateless).
+      # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+      #                   loss_weight_2 * output_2_loss_fn(...) +
+      #                   layer losses.
+      self.total_loss = self._prepare_total_loss(skip_target_indices, masks)
 
       # Functions for train, test and predict will
       # be compiled lazily when required.
       # This saves time when the user is not using all functions.
       self._function_kwargs = kwargs
 
-      self._fit_function = None
-      self._eval_function = None
       self.train_function = None
       self.test_function = None
       self.predict_function = None
@@ -564,7 +471,7 @@ class Model(Network):
     """Returns the model's metrics added using `compile`, `add_metric` APIs."""
     metrics = []
     if self._is_compiled:
-      metrics += self._compile_stateful_metric_functions
+      metrics += self._compile_metric_functions
     return metrics + super(Model, self).metrics
 
   @property
@@ -736,11 +643,15 @@ class Model(Network):
             next epoch. When training with input tensors such as
             TensorFlow data tensors, the default `None` is equal to
             the number of samples in your dataset divided by
-            the batch size, or 1 if that cannot be determined.
+            the batch size, or 1 if that cannot be determined. If x is a
+            `tf.data` dataset or a dataset iterator, and 'steps_per_epoch'
+            is None, the epoch will run until the input dataset is exhausted.
         validation_steps: Only relevant if `validation_data` is provided and
             is a dataset or dataset iterator. Total number of steps (batches of
             samples) to draw before stopping when performing validation
-            at the end of every epoch.
+            at the end of every epoch. If validation_data is a `tf.data` dataset
+            or a dataset iterator, and 'validation_steps' is None, validation
+            will run until the `validation_data` dataset is exhausted.
         validation_freq: Only relevant if validation data is provided. Integer
             or `collections.Container` instance (e.g. list, tuple, etc.). If an
             integer, specifies how many training epochs to run before a new
@@ -784,19 +695,15 @@ class Model(Network):
     if kwargs:
       raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
 
-    # When the model expects dictionary inputs (i.e. FeatureColumn-based
-    # models), set run_eagerly to True as there's no support for graph
-    # functions.
-    training_utils.set_run_eagerly_for_dict_structure(self, x)
-
     # Case 1: distribution strategy.
     if self._distribution_strategy:
-      if training_utils.should_run_multi_worker():
+      if K.in_multi_worker_mode():
         # Multi-Worker mode runs the Keras training loop on multiple
         # servers via the Distribute Coordinator.
         def _worker_fn(_):
           """Run training inside the distributed coordinator."""
-          self._configure_distributed_session()
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
           return training_distributed.fit_distributed(
               self,
               x=x,
@@ -804,7 +711,7 @@ class Model(Network):
               batch_size=batch_size,
               epochs=epochs,
               verbose=verbose,
-              callbacks=callbacks,
+              callbacks=filtered_callbacks,
               validation_split=validation_split,
               validation_data=validation_data,
               shuffle=shuffle,
@@ -812,7 +719,8 @@ class Model(Network):
               sample_weight=sample_weight,
               initial_epoch=initial_epoch,
               steps_per_epoch=steps_per_epoch,
-              validation_steps=validation_steps)
+              validation_steps=validation_steps,
+              validation_freq=validation_freq)
 
         # Independent worker only for now.
         return dc.run_distribute_coordinator(
@@ -820,7 +728,6 @@ class Model(Network):
             self._distribution_strategy,
             mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
       else:
-        self._configure_distributed_session()
         return training_distributed.fit_distributed(
             self,
             x=x,
@@ -836,7 +743,8 @@ class Model(Network):
             sample_weight=sample_weight,
             initial_epoch=initial_epoch,
             steps_per_epoch=steps_per_epoch,
-            validation_steps=validation_steps)
+            validation_steps=validation_steps,
+            validation_freq=validation_freq)
 
     batch_size = self._validate_or_infer_batch_size(
         batch_size, steps_per_epoch, x)
@@ -1023,6 +931,8 @@ class Model(Network):
             Total number of steps (batches of samples)
             before declaring the evaluation round finished.
             Ignored with the default value of `None`.
+            If x is a `tf.data` dataset or a dataset iterator, and `steps` is
+            None, 'evaluate' will run until the dataset is exhausted.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during evaluation.
             See [callbacks](/api_docs/python/tf/keras/callbacks).
@@ -1051,16 +961,38 @@ class Model(Network):
     """
     # Case 1: distribution strategy.
     if self._distribution_strategy:
-      self._configure_distributed_session()
-      return training_distributed.evaluate_distributed(
-          self,
-          x=x,
-          y=y,
-          batch_size=batch_size,
-          verbose=verbose,
-          sample_weight=sample_weight,
-          steps=steps,
-          callbacks=callbacks)
+      if K.in_multi_worker_mode():
+        # Multi-Worker mode runs the Keras evaluation loop on multiple
+        # servers via the Distribute Coordinator.
+        def _worker_fn(_):
+          """Run evaluation inside the distributed coordinator."""
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
+          return training_distributed.evaluate_distributed(
+              self,
+              x=x,
+              y=y,
+              batch_size=batch_size,
+              verbose=verbose,
+              sample_weight=sample_weight,
+              steps=steps,
+              callbacks=filtered_callbacks)
+
+        # Independent worker only for now.
+        return dc.run_distribute_coordinator(
+            _worker_fn,
+            self._distribution_strategy,
+            mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+      else:
+        return training_distributed.evaluate_distributed(
+            self,
+            x=x,
+            y=y,
+            batch_size=batch_size,
+            verbose=verbose,
+            sample_weight=sample_weight,
+            steps=steps,
+            callbacks=callbacks)
 
     batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
@@ -1149,7 +1081,9 @@ class Model(Network):
         verbose: Verbosity mode, 0 or 1.
         steps: Total number of steps (batches of samples)
             before declaring the prediction round finished.
-            Ignored with the default value of `None`.
+            Ignored with the default value of `None`. If x is a `tf.data`
+            dataset or a dataset iterator, and `steps` is None, `predict` will
+            run until the input dataset is exhausted.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during prediction.
             See [callbacks](/api_docs/python/tf/keras/callbacks).
@@ -1179,7 +1113,6 @@ class Model(Network):
     """
     # Case 1: distribution strategy.
     if self._distribution_strategy:
-      self._configure_distributed_session()
       return training_distributed.predict_distributed(self,
                                                       x=x,
                                                       batch_size=batch_size,
@@ -1239,8 +1172,15 @@ class Model(Network):
     if hasattr(self, 'metrics'):
       for m in self.metrics:
         m.reset_states()
-      if self._distribution_strategy:
-        distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
+
+    # Reset the state of loss metric wrappers.
+    if getattr(self, '_output_loss_metrics', None) is not None:
+      for m in self._output_loss_metrics:
+        m.reset_states()
+
+    # Reset metrics on all the distributed (cloned) models.
+    if self._distribution_strategy:
+      distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
 
   def train_on_batch(self,
                      x,
@@ -1299,19 +1239,20 @@ class Model(Network):
 
     if self.run_eagerly:
       outputs = training_eager.train_on_batch(
-          self, x, y, sample_weights=sample_weights)
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          output_loss_metrics=self._output_loss_metrics)
     else:
+      x = training_utils.ModelInputs(x).as_list()
+      ins = x + (y or []) + (sample_weights or [])
+
       if not isinstance(K.symbolic_learning_phase(), int):
-        ins = x + y + sample_weights + [True]
-      else:
-        ins = x + y + sample_weights
+        ins += [True]  # Add learning phase value.
 
-      if reset_metrics:
-        self._make_train_function()
-        outputs = self.train_function(ins)  # pylint: disable=not-callable
-      else:
-        self._make_fit_function()
-        outputs = self._fit_function(ins)  # pylint: disable=not-callable
+      self._make_train_function()
+      outputs = self.train_function(ins)  # pylint: disable=not-callable
 
     if reset_metrics:
       self.reset_metrics()
@@ -1368,15 +1309,17 @@ class Model(Network):
 
     if self.run_eagerly:
       outputs = training_eager.test_on_batch(
-          self, x, y, sample_weights=sample_weights)
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          output_loss_metrics=self._output_loss_metrics)
     else:
-      inputs = x + y + sample_weights
-      if reset_metrics:
-        self._make_test_function()
-        outputs = self.test_function(inputs)  # pylint: disable=not-callable
-      else:
-        self._make_eval_function()
-        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
+      x = training_utils.ModelInputs(x).as_list()
+      inputs = x + (y or []) + (sample_weights or [])
+
+      self._make_test_function()
+      outputs = self.test_function(inputs)  # pylint: disable=not-callable
 
     if reset_metrics:
       self.reset_metrics()
@@ -1684,6 +1627,99 @@ class Model(Network):
         verbose=verbose,
         callbacks=callbacks)
 
+  def _prepare_total_loss(self, skip_target_indices=None, masks=None):
+    """Computes total loss from loss functions.
+
+    Arguments:
+        skip_target_indices: A list of indices of model outputs where loss
+          function is None.
+        masks: List of mask values corresponding to each model output.
+
+    Returns:
+        A list of loss weights of python floats.
+
+    Raises:
+        TypeError: If model run_eagerly is True.
+    """
+    if self.run_eagerly:
+      raise TypeError('total loss can not be computed when compiled with '
+                      'run_eagerly = True.')
+    skip_target_indices = skip_target_indices or []
+    total_loss = None
+    with K.name_scope('loss'):
+      zipped_inputs = zip(self.targets, self.outputs, self.loss_functions,
+                          self.sample_weights, masks, self.loss_weights_list)
+      for i, (y_true, y_pred, loss_fn, sample_weight, mask,
+              loss_weight) in enumerate(zipped_inputs):
+        if i in skip_target_indices:
+          continue
+        loss_name = self.output_names[i] + '_loss'
+        with K.name_scope(loss_name):
+          if mask is not None:
+            mask = math_ops.cast(mask, y_pred.dtype)
+            # Update weights with mask.
+            if sample_weight is None:
+              sample_weight = mask
+            else:
+              # Update dimensions of weights to match with mask if possible.
+              mask, _, sample_weight = (
+                  losses_utils.squeeze_or_expand_dimensions(
+                      mask, None, sample_weight))
+              sample_weight *= mask
+
+          # Reset reduction on the loss so that we can get the per sample loss
+          # value. We use this to get both the stateless and stateful loss
+          # values without having to compute the underlying loss function
+          # twice.
+          weighted_losses = None
+          if hasattr(loss_fn, 'reduction'):
+            current_loss_reduction = loss_fn.reduction
+            loss_fn.reduction = losses_utils.ReductionV2.NONE
+            weighted_losses = loss_fn(
+                y_true, y_pred, sample_weight=sample_weight)
+            loss_fn.reduction = current_loss_reduction
+
+            # Compute the stateless loss value.
+            output_loss = losses_utils.reduce_weighted_loss(
+                weighted_losses, reduction=current_loss_reduction)
+          else:
+            # Compute the stateless loss value for a custom loss class.
+            # Here we assume that the class takes care of loss reduction
+            # because if this class returns a vector value we cannot
+            # differentiate between use case where a custom optimizer
+            # expects a vector loss value vs unreduced per-sample loss value.
+            output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
+
+        if len(self.outputs) > 1:
+          # Keep track of stateful result tensor and function for the loss.
+          # Compute the stateful loss value.
+          if weighted_losses is not None:
+            # TODO(b/120571621): Directly call metric when the bug is fixed.
+            aggregated_output_loss = self._call_fn_for_each_replica(
+                self._output_loss_metrics[i], weighted_losses)
+          else:
+            # Custom loss class.
+            aggregated_output_loss = self._call_metric_fn(
+                self._output_loss_metrics[i], y_true, y_pred, sample_weight)
+          self._compile_metrics_tensors[loss_name] = aggregated_output_loss
+
+        if total_loss is None:
+          total_loss = loss_weight * output_loss
+        else:
+          total_loss += loss_weight * output_loss
+      if total_loss is None:
+        if not self.losses:
+          raise ValueError('The model cannot be compiled '
+                           'because it has no loss to optimize.')
+        else:
+          total_loss = 0.
+
+      # Add regularization penalties and other layer-specific losses.
+      if self.losses:
+        total_loss += losses_utils.scale_loss_for_distribution(
+            math_ops.add_n(self.losses))
+    return total_loss
+
   def _get_callback_model(self):
     """Returns the Callback Model for this Model."""
 
@@ -1747,7 +1783,7 @@ class Model(Network):
         if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
                           iterator_ops.EagerIterator)):
           ds_batch_size = tensor_shape.as_dimension(
-              nest.flatten(x.output_shapes)[0][0]).value
+              nest.flatten(dataset_ops.get_legacy_output_shapes(x))[0][0]).value
           if ds_batch_size is not None and ds_batch_size != static_batch_size:
             raise ValueError('The batch output shape of your `Dataset` is {}, '
                              'which is incompatible with the specified batch '
@@ -1763,9 +1799,10 @@ class Model(Network):
       batch_size = 32
     return batch_size
 
-  @property
-  def _default_save_signature(self):
-    return saving_utils.trace_model_call(self)
+  def _list_functions_for_serialization(self):
+    return {
+        '_default_save_signature': saving_utils.trace_model_call(self)
+    }
 
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
@@ -1795,10 +1832,13 @@ class Model(Network):
         output_shapes.append(output.shape.as_list())
     self._per_output_metrics = training_utils.collect_per_output_metric_info(
         metrics, self.output_names, output_shapes, self.loss_functions)
-    self._per_output_weighted_metrics = \
+    self._per_output_weighted_metrics = (
         training_utils.collect_per_output_metric_info(
-            weighted_metrics, self.output_names, output_shapes,
-            self.loss_functions, self.sample_weights)
+            weighted_metrics,
+            self.output_names,
+            output_shapes,
+            self.loss_functions,
+            is_weighted=True))
 
   def _add_unique_metric_name(self, metric_name, output_index):
     """Makes the metric name unique and adds it to the model's metric name list.
@@ -1827,37 +1867,29 @@ class Model(Network):
 
   @property
   def _all_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
-    metrics_tensors = {}
-    if self._is_compiled:
-      metrics_tensors.update(self._compile_metrics_tensors)
-    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
-    return metrics_tensors
+    """Returns a dictionary that maps metric names to metric result tensors.
 
-  @property
-  def _all_stateful_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
+    This maps metric names from `model.metric_names` to result tensors.
+    Just like model.metric_names, this includes loss names and tensors.
+    """
     metrics_tensors = {}
     if self._is_compiled:
-      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+      metrics_tensors.update(self._compile_metrics_tensors)
     metrics_tensors.update(super(Model, self)._all_metrics_tensors)
     return metrics_tensors
 
   def _init_metric_attributes(self):
     """Initialized model metric attributes."""
-    # List of all metric names in the model.
+    # List of all metric names in the model. This includes loss metrics.
     self._compile_metrics_names = ['loss']
     # List of stateful metric functions. Used for resetting metric state during
-    # training/eval.
-    # This includes loss functions when there are multiple outputs.
-    self._compile_stateful_metric_functions = []
+    # training/eval. This includes loss metric functions.
+    self._compile_metric_functions = []
     # Dict of all aggregated metric result tensors. This includes aggregated
-    # loss result tensors when there are multiple outputs.
-    self._compile_stateful_metrics_tensors = {}
-    # Dict of all metric result tensors (aggregated or not - based on the
-    # values given in compile.). This includes aggregated loss result tensors
-    # when there are multiple outputs.
+    # loss result tensors.
     self._compile_metrics_tensors = {}
+    # List of metric wrappers on output losses.
+    self._output_loss_metrics = None
 
   def _set_per_output_metric_attributes(self, metrics_dict, output_index):
     """Sets the metric attributes on the model for the given output.
@@ -1871,20 +1903,32 @@ class Model(Network):
       Metrics dict updated with unique metric names as keys.
     """
     updated_metrics_dict = collections.OrderedDict()
-    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
+    for metric_name, metric_fn in metrics_dict.items():
       metric_name = self._add_unique_metric_name(metric_name, output_index)
-      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
-      # Keep track of metric name, function and stateful function.
+
+      # Update the name on the metric class to be the unique generated name.
+      metric_fn._name = metric_name  # pylint: disable=protected-access
+      updated_metrics_dict[metric_name] = metric_fn
+      # Keep track of metric name and function.
       self._compile_metrics_names.append(metric_name)
-      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+      self._compile_metric_functions.append(metric_fn)
     return updated_metrics_dict
 
-  def _set_metric_attributes(self, outputs, skip_target_indices=None):
+  def _set_metric_attributes(self, skip_target_indices=None):
     """Sets the metric attributes on the model for all the model outputs."""
+    # Add loss metric names to the model metric names list.
+    if len(self.outputs) > 1:
+      output_names = [
+          self.output_names[i] + '_loss'
+          for i in range(len(self.outputs))
+          if i not in skip_target_indices
+      ]
+      self._compile_metrics_names.extend(output_names)
+
     skip_target_indices = skip_target_indices or []
     updated_per_output_metrics = []
     updated_per_output_weighted_metrics = []
-    for i in range(len(outputs)):
+    for i in range(len(self.outputs)):
       if i in skip_target_indices:
         updated_per_output_metrics.append(self._per_output_metrics[i])
         updated_per_output_weighted_metrics.append(
@@ -1897,11 +1941,29 @@ class Model(Network):
           self._set_per_output_metric_attributes(
               self._per_output_weighted_metrics[i], i))
 
+    # Create a metric wrapper for each output loss.
+    if len(self.outputs) > 1:
+      self._output_loss_metrics = [
+          metrics_module.SumOverBatchSize() if hasattr(loss_fn, 'reduction')
+          else metrics_module.SumOverBatchSizeMetricWrapper(loss_fn)
+          for loss_fn in self.loss_functions
+      ]
+
     self._per_output_metrics = updated_per_output_metrics
     self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
-  def _call_metric_fn(self, fn, y_true, y_pred, weights, mask):
+  def _call_metric_fn(self, metric_fn, y_true, y_pred, weights, mask=None):
+    # TODO(b/120571621): Remove this function when the bug is fixed.
     """Helper function to call metric function with distribution strategy."""
+    return self._call_fn_for_each_replica(
+        training_utils.call_metric_function,
+        metric_fn,
+        y_true,
+        y_pred,
+        weights=weights,
+        mask=mask)
+
+  def _call_fn_for_each_replica(self, fn, *args, **kwargs):
     # TODO(b/120571621): We want to avoid metric reductions here since
     # since TPUStrategy does not implement replica local variables.
     # Remove this hack once we support TPUReplicaLocalVariables.
@@ -1911,18 +1973,15 @@ class Model(Network):
         distribution_strategy_context.in_cross_replica_context()):
       with self._distribution_strategy.scope():
         return self._distribution_strategy.extended.call_for_each_replica(
-            training_utils.call_metric_function,
-            (fn, y_true, y_pred, weights, mask))
-    return training_utils.call_metric_function(
-        fn, y_true, y_pred, weights=weights, mask=mask)
+            fn, args, kwargs)
+    return fn(*args, **kwargs)
 
   def _handle_per_output_metrics(self,
                                  metrics_dict,
                                  y_true,
                                  y_pred,
                                  mask,
-                                 weights=None,
-                                 return_stateful_result=True):
+                                 weights=None):
     """Calls metric functions for a single output.
 
     Arguments:
@@ -1931,49 +1990,18 @@ class Model(Network):
       y_pred: Predicted output.
       mask: Computed mask value for the current output.
       weights: Weights to be applied on the current output.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
     """
     metric_results = []
-    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
+    for metric_name, metric_fn in metrics_dict.items():
       with K.name_scope(metric_name):
-
-        def _call_stateful_fn(fn):
-          """Create stateful metrics correctly."""
-          return self._call_metric_fn(fn, y_true, y_pred, weights, mask)
-
-        def _call_stateless_fn(fn):
-          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
-          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
-
-        def _track_metric_tensors(name, stateless_result, stateful_result):
-          self._compile_metrics_tensors[name] = stateless_result
-          self._compile_stateful_metrics_tensors[name] = stateful_result
-
-        if isinstance(metric_fn, metrics_module.Metric):
-          # If the given metric fn is stateful, call the fn and return result.
-          metric_result = _call_stateful_fn(metric_fn)
-          metric_results.append(metric_result)
-          if not self.run_eagerly:
-            _track_metric_tensors(metric_name, metric_result, metric_result)
-        elif self.run_eagerly:
-          # In eager mode, if the given metric fn is not stateful, we invoke the
-          # given fn or its stateful version based on the given flag.
-          if return_stateful_result:
-            metric_result = _call_stateful_fn(stateful_fn)
-          else:
-            metric_result = _call_stateless_fn(metric_fn)
-          metric_results.append(metric_result)
-        else:
-          # In graph mode, we build the sub-graph for both the stateful and the
-          # stateless fns.
-          stateful_metric_result = _call_stateful_fn(stateful_fn)
-          metric_result = _call_stateless_fn(metric_fn)
-          _track_metric_tensors(metric_name, metric_result,
-                                stateful_metric_result)
+        metric_result = self._call_metric_fn(metric_fn, y_true, y_pred, weights,
+                                             mask)
+        metric_results.append(metric_result)
+        if not self.run_eagerly:
+          self._compile_metrics_tensors[metric_name] = metric_result
 
     return metric_results
 
@@ -1982,8 +2010,7 @@ class Model(Network):
                       skip_target_indices=None,
                       targets=None,
                       sample_weights=None,
-                      masks=None,
-                      return_stateful_result=True):
+                      masks=None):
     """Handles calling metric functions.
 
     Arguments:
@@ -1992,8 +2019,6 @@ class Model(Network):
       targets: List of targets.
       sample_weights: Optional list of sample weight arrays.
       masks: List of computed output mask values.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
@@ -2009,25 +2034,20 @@ class Model(Network):
         target = targets[i] if targets else None
         output_mask = masks[i] if masks else None
         metric_results.extend(
-            self._handle_per_output_metrics(
-                self._per_output_metrics[i],
-                target,
-                output,
-                output_mask,
-                return_stateful_result=return_stateful_result))
+            self._handle_per_output_metrics(self._per_output_metrics[i], target,
+                                            output, output_mask))
         metric_results.extend(
             self._handle_per_output_metrics(
                 self._per_output_weighted_metrics[i],
                 target,
                 output,
                 output_mask,
-                weights=sample_weights[i],
-                return_stateful_result=return_stateful_result))
+                weights=sample_weights[i]))
 
     # Add metric results from the `add_metric` metrics in eager mode.
     if context.executing_eagerly():
       for m in self.metrics:
-        if m not in self._compile_stateful_metric_functions:
+        if m not in self._compile_metric_functions:
           metric_results.append(m.result())
     return metric_results
 
@@ -2049,11 +2069,14 @@ class Model(Network):
           ' trainable weights, did you set `model.trainable`'
           ' without calling `model.compile` after ?', 1)
 
-  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
+  def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
-    if getattr(self, fn_name) is None:
+    if getattr(self, 'train_function') is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
@@ -2070,70 +2093,37 @@ class Model(Network):
       updates += self.get_updates_for(None)
       # Conditional updates relevant to this model
       updates += self.get_updates_for(self.inputs)
-      # Add stateful metrics updates.
-      if metric_updates is not None:
-        updates += metric_updates
 
       with K.name_scope('training'):
         # Gets loss and metrics. Updates weights at each call.
         fn = K.function(
-            inputs,
-            outputs,
+            inputs, [self.total_loss] + metrics_tensors,
             updates=updates,
             name='train_function',
             **self._function_kwargs)
-        setattr(self, fn_name, fn)
+        setattr(self, 'train_function', fn)
 
-  def _make_train_function(self):
+  def _make_test_function(self):
     metrics_tensors = [
         self._all_metrics_tensors[m] for m in self.metrics_names[1:]
     ]
-    self._make_train_function_helper('train_function',
-                                     [self.total_loss] + metrics_tensors)
-
-  def _make_fit_function(self):
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_train_function_helper(
-        '_fit_function', [self.total_loss] + metrics_tensors)
-
-  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
     if not self._is_compiled:
       raise RuntimeError('You must compile your model before using it.')
-    if getattr(self, fn_name) is None:
+    if getattr(self, 'test_function') is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
 
       with K.name_scope('evaluation'):
         updates = self.state_updates
-        # Add stateful metrics updates.
-        if metric_updates is not None:
-          updates += metric_updates
         # Return loss and metrics, no gradient updates.
         # Does update the network states.
         fn = K.function(
-            inputs,
-            outputs,
+            inputs, [self.total_loss] + metrics_tensors,
             updates=updates,
             name='test_function',
             **self._function_kwargs)
-        setattr(self, fn_name, fn)
-
-  def _make_test_function(self):
-    metrics_tensors = [
-        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper('test_function',
-                                    [self.total_loss] + metrics_tensors)
-
-  def _make_eval_function(self):
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper(
-        '_eval_function', [self.total_loss] + metrics_tensors)
+        setattr(self, 'test_function', fn)
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
@@ -2153,11 +2143,11 @@ class Model(Network):
 
   def _make_execution_function(self, mode):
     if mode == ModeKeys.TRAIN:
-      self._make_fit_function()
-      return self._fit_function
+      self._make_train_function()
+      return self.train_function
     if mode == ModeKeys.TEST:
-      self._make_eval_function()
-      return self._eval_function
+      self._make_test_function()
+      return self.test_function
     if mode == ModeKeys.PREDICT:
       self._make_predict_function()
       return self.predict_function
@@ -2168,11 +2158,10 @@ class Model(Network):
                                           sample_weight=None,
                                           class_weight=None,
                                           batch_size=None,
-                                          check_steps=False,
-                                          steps_name='steps',
-                                          steps=None,
                                           validation_split=0,
-                                          shuffle=False):
+                                          shuffle=False,
+                                          repeat=False,
+                                          allow_partial_batch=False):
     """Runs validation checks on input and target data passed by the user.
 
     This is called when using DistributionStrategy to train, evaluate or serve
@@ -2188,14 +2177,13 @@ class Model(Network):
         to, as conveyed by `y`.
       batch_size: Integer batch size. If provided, it is used to run additional
         validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
       shuffle: Boolean whether to shuffle the training data before each epoch.
+      repeat: Boolean whether to repeat the numpy training data when converting
+        to training dataset.
+      allow_partial_batch: Boolean whether to enforce that all batches have the
+        same size.
 
     Returns:
       Dataset instance.
@@ -2214,6 +2202,20 @@ class Model(Network):
       raise NotImplementedError('`sample_weight` is currently not supported '
                                 'when using TPUStrategy.')
 
+    if (self.stateful and distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy) and self._distribution_strategy.
+        num_replicas_in_sync != 1):
+      raise ValueError('Single core must be used for computation on '
+                       'stateful models. Consider adding `device_assignment` '
+                       'parameter to TPUStrategy using\n'
+                       'topology = tf.contrib.distribute.'
+                       'initialize_tpu_system()\n'
+                       'device_assignment = tf.contrib.tpu.DeviceAssignment('
+                       'topology, core_assignment=tf.contrib.tpu.'
+                       'SINGLE_CORE_ASSIGNMENT)\n'
+                       'tpu_strategy = tf.contrib.distribute.TPUStrategy('
+                       'device_assignment=device_assignment)')
+
     # Validates `steps` and `shuffle` arguments right at the beginning
     # since we use it to construct the dataset object.
     # TODO(anjalisridhar): Remove this check once we refactor the
@@ -2223,18 +2225,15 @@ class Model(Network):
       if shuffle:
         training_utils.verify_dataset_shuffled(x)
 
-      if check_steps and steps is None:
-        raise ValueError('When using Datasets as input, '
-                         'you should specify the `{steps_name}` argument.'
-                         .format(steps_name=steps_name))
-
-    if ops.executing_eagerly_outside_functions():
-      session = None
-    else:
-      session = K.get_session()
-
     strategy = self._distribution_strategy
     with strategy.scope():
+      # We should be sure to call get_session() inside the strategy.scope()
+      # so the strategy can affect the session options.
+      if ops.executing_eagerly_outside_functions():
+        session = None
+      else:
+        session = K.get_session()
+
       first_x_value = nest.flatten(x)[0]
       if isinstance(first_x_value, np.ndarray):
         x = distributed_training_utils.list_to_tuple(x)
@@ -2249,22 +2248,21 @@ class Model(Network):
         else:
           in_tuple = x
 
-        if shuffle:
-          # 1024 is a good buffer size since it is much larger than the average
-          # batch size provided by the user and provides sufficient randomness.
-          # One thing to keep in mind is the memory usage based on the size of
-          # each sample.
-          shuffle_buffer = 1024
-        else:
-          shuffle_buffer = None
         ds = strategy.extended.experimental_make_numpy_dataset(in_tuple,
                                                                session=session)
-        if shuffle_buffer:
-          ds = ds.shuffle(shuffle_buffer)
-        ds = ds.repeat()
+        if shuffle:
+          # We want a buffer size that is larger than the batch size provided by
+          # the user and provides sufficient randomness. Note that larger
+          # numbers introduce more memory usage based on the size of each
+          # sample.
+          ds = ds.shuffle(max(1024, batch_size * 8))
+        if repeat:
+          ds = ds.repeat()
+
         # We need to use the drop_remainder argument to get a known static
         # input shape which is required for TPUs.
-        drop_remainder = strategy.extended.experimental_require_static_shapes
+        drop_remainder = (not allow_partial_batch and
+                          strategy.extended.experimental_require_static_shapes)
         x = ds.batch(batch_size, drop_remainder=drop_remainder)
       else:
         assert isinstance(x, dataset_ops.DatasetV2)
@@ -2525,18 +2523,21 @@ class Model(Network):
         feed_output_shapes = []
         for output_shape, loss_fn in zip(self._feed_output_shapes,
                                          self._feed_loss_fns):
-          if loss_fn is losses.sparse_categorical_crossentropy:
+          if ((isinstance(loss_fn, losses.LossFunctionWrapper) and
+               loss_fn.fn == losses.sparse_categorical_crossentropy)) or (
+                   isinstance(loss_fn, losses.SparseCategoricalCrossentropy)):
             if K.image_data_format() == 'channels_first':
               feed_output_shapes.append(
                   (output_shape[0], 1) + output_shape[2:])
             else:
               feed_output_shapes.append(output_shape[:-1] + (1,))
-          elif (not hasattr(loss_fn, '__name__') or
-                getattr(losses, loss_fn.__name__, None) is None):
-            # If `loss_fn` is not a function (e.g. callable class)
-            # or if it not in the `losses` module, then
-            # it is a user-defined loss and we make no assumptions
-            # about it.
+          elif (not isinstance(loss_fn, losses.Loss) or
+                (isinstance(loss_fn, losses.LossFunctionWrapper) and
+                 (getattr(losses, loss_fn.fn.__name__, None) is None))):
+            # If the given loss is not an instance of the `Loss` class (custom
+            # class) or if the loss function that is wrapped is not in the
+            # `losses` module, then it is a user-defined loss and we make no
+            # assumptions about it.
             feed_output_shapes.append(None)
           else:
             feed_output_shapes.append(output_shape)
@@ -2611,7 +2612,7 @@ class Model(Network):
           'However we received `validation_data=%s`' % validation_data)
     return val_x, val_y, val_sample_weight
 
-  @checkpointable.no_automatic_dependency_tracking
+  # TODO(omalleyt): Consider changing to a more descriptive function name.
   def _set_inputs(self, inputs, outputs=None, training=None):
     """Set model's input and output specs based on the input data received.
 
@@ -2638,6 +2639,22 @@ class Model(Network):
       ValueError: If dict inputs are passed to a Sequential Model where the
         first layer isn't FeatureLayer.
     """
+    inputs = self._set_input_attrs(inputs)
+
+    if outputs is None:
+      kwargs = {'training': training} if self._expects_training_arg else {}
+      try:
+        outputs = self(inputs, **kwargs)
+      except NotImplementedError:
+        # This Model or a submodel is dynamic and hasn't overridden
+        # `compute_output_shape`.
+        outputs = None
+
+    self._set_output_attrs(outputs)
+
+  @trackable.no_automatic_dependency_tracking
+  def _set_input_attrs(self, inputs):
+    """Sets attributes related to the inputs of the Model."""
     if self.inputs:
       raise ValueError('Model inputs are already set.')
 
@@ -2674,51 +2691,16 @@ class Model(Network):
         self._feed_inputs.append(v)
         self._feed_input_shapes.append(K.int_shape(v))
 
-    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
-    if outputs is None:
-      if not self._dynamic:
-        # The network may include dynamic layers but its `call`
-        # itself isn't dynamic.
-        # Obtain symbolic outputs by calling the model.
-        with K.get_graph().as_default():
-          if self._expects_training_arg:
-            outputs = self.call(inputs, training=training)
-          else:
-            outputs = self.call(inputs)
-      else:
-        # Case: network's `call` is dynamic.
-        try:
-          outputs = self._symbolic_call(inputs)
-        except NotImplementedError:
-          # Static shape inference was not implemented for this dynamic net.
-          # Do not specify symbolic outputs.
-          outputs = None
+    return inputs
 
+  @trackable.no_automatic_dependency_tracking
+  def _set_output_attrs(self, outputs):
+    """Sets attributes related to the outputs of the Model."""
     outputs = nest.flatten(outputs)
     self.outputs = outputs
     self.output_names = training_utils.generic_output_names(outputs)
     self.built = True
 
-  def _configure_distributed_session(self):
-    """Configure a Session for use with Distribution Strategies.
-
-    Raises:
-      ValueError: If a non-distributed Session has already been created.
-    """
-    if not self._distributed_session_is_configured:
-      if (dc_context.get_current_worker_context() is not None and
-          getattr(K._SESSION, 'session', None) is not None):  # pylint: disable=protected-access
-        raise ValueError('Session was created before `fit`, `evaluate`, '
-                         'or `predict` was called. With Multi-Worker '
-                         'mode, this is not allowed. Please avoid '
-                         'creating a Session outside of these methods. '
-                         'The Session may have been created by a call '
-                         'to `keras.backend.get_session()` or '
-                         'functions that use Sessions, like `load_weights`.')
-      distributed_training_utils.configure_and_create_session(
-          self._distribution_strategy)
-      self._distributed_session_is_configured = True
-
 
 class DistributedCallbackModel(Model):
   """Model that is used for callbacks with DistributionStrategy."""
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index d0881c11a50bd72b409d774ab72b717b8eee1acc..eff5f8dfa49fa9671279f9a0f81906fb86fe703e 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -33,8 +33,8 @@ from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -60,6 +60,7 @@ def model_iteration(model,
                     validation_freq=1,
                     mode=ModeKeys.TRAIN,
                     validation_in_fit=False,
+                    prepared_feed_values_from_dataset=False,
                     steps_name='steps',
                     **kwargs):
   """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
@@ -94,11 +95,14 @@ def model_iteration(model,
         which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
         validation at the end of the 1st, 2nd, and 10th epochs.
       mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
-      validation_in_fit: DEPRECATED: if true, then this method is invoked from
-        within training iteration (for validation). In this case, do not copy
-        weights when using a tf.distribute.Strategy. The input is deprecated as
-        it is not required if the user creates a distributed model under the
-        distribution strategy scope rather than passing it to compile.
+      validation_in_fit: if true, then this method is invoked from within
+        training iteration (for validation). In the case where `val_inputs` is a
+        dataset, this flag indicates that its iterator and feed values are
+        already created so should properly reuse resources.
+      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
+        tensors returned from `_prepare_feed_values` call on the validation
+        dataset, so do not call it again on `inputs`. Should only be used for
+        inline validation (i.e., only if `validation_in_fit` is also True).
       steps_name: The string name of the steps argument, either `steps`,
         `validation_steps`, or `steps_per_epoch`. Only used for error message
         formatting.
@@ -133,16 +137,14 @@ def model_iteration(model,
           inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)
     input_iterator = _get_iterator(inputs, model._distribution_strategy)
 
-  val_iterator = None
-  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
-    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
-
   if mode == ModeKeys.TRAIN:
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
 
   # Enter DistributionStrategy scope.
   if model._distribution_strategy:
-    scope = model._distribution_strategy.scope()
+    scope = distributed_training_utils.distributed_scope(
+        strategy=model._distribution_strategy,
+        learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
     scope.__enter__()
 
   # Get step function and loop type.
@@ -156,13 +158,38 @@ def model_iteration(model,
 
   # Prepare input data.
   inputs = input_iterator or inputs
-  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+  if validation_in_fit and prepared_feed_values_from_dataset:
+    # When invoking validation in training loop, avoid creating iterator and
+    # list of feed values for the same validation dataset multiple times (which
+    # essentially would call `iterator.get_next()` that slows down execution and
+    # leads to OOM errors eventually.
+    ins = inputs
+  else:
+    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
   if not is_dataset:
     num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                      steps_per_epoch)
   else:
     num_samples_or_steps = steps_per_epoch
 
+  # Prepare validation data. Hold references to the iterator and the input list
+  # to properly reinitialize and reuse in multiple validation passes.
+  val_iterator = None
+  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+    if validation_steps is None:
+      # Because we pass an iterator feed instead of a Dataset to the eval
+      # model_iteration() call, it will not trigger the dataset-input path
+      # that determines the number of steps required. To avoid this issue,
+      # set validation_steps here if validation_steps is None.
+      validation_steps = training_utils.infer_steps_for_dataset(
+          val_inputs,
+          validation_steps,
+          epochs=epochs,
+          steps_name='validation_steps')
+    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
+    val_inputs = _prepare_feed_values(
+        model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST)
+
   # Configure callbacks.
   count_mode = 'steps' if use_steps else 'samples'
   callbacks = cbks.configure_callbacks(
@@ -196,9 +223,8 @@ def model_iteration(model,
     aggregator = training_utils.MetricsAggregator(use_steps,
                                                   num_samples_or_steps)
 
-  if model._compile_distribution and not validation_in_fit:
-    distributed_training_utils._copy_weights_to_distributed_model(
-        model, model._distributed_model)
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -236,8 +262,29 @@ def model_iteration(model,
           actual_inputs = ins() if callable(ins) else ins
           batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
-          if not is_dataset:
+          if is_dataset:
+            # The dataset passed by the user ran out of batches.
+            # Now we know the cardinality of the dataset.
+            # If steps_per_epoch was specified, then running out of data is
+            # unexpected, so we stop training and inform the user.
+            if steps_per_epoch:
+              callbacks.model.stop_training = True
+              logging.warning(
+                  'Your dataset ran out of data; interrupting training. '
+                  'Make sure that your dataset can generate at least '
+                  '`%s * epochs` batches (in this case, %d batches). '
+                  'You may need to use the repeat() function when '
+                  'building your dataset.'
+                  % (steps_name, steps_per_epoch * epochs))
+            elif step > 0:
+              steps_per_epoch = step
+              aggregator.num_samples_or_steps = steps_per_epoch
+              if mode == ModeKeys.TRAIN:
+                progbar.params['steps'] = steps_per_epoch
+                progbar.progbar.target = steps_per_epoch
+          else:
             # We ran out of batches while the user passed an iterator (legacy).
+            callbacks.model.stop_training = True
             logging.warning(
                 'Your dataset iterator ran out of data; '
                 'interrupting training. Make sure that your iterator '
@@ -245,15 +292,6 @@ def model_iteration(model,
                 'batches (in this case, %d batches). You may need to'
                 'use the repeat() function when building your '
                 'dataset.' % (steps_name, steps_per_epoch * epochs))
-            callbacks.model.stop_training = True
-          else:
-            # The dataset passed by the user ran out of batches.
-            # Now we know the cardinality of the dataset.
-            if step > 0:
-              steps_per_epoch = step
-              aggregator.num_samples_or_steps = steps_per_epoch
-              progbar.params['steps'] = steps_per_epoch
-              progbar.progbar.target = steps_per_epoch
           break
 
         if not isinstance(batch_outs, list):
@@ -338,7 +376,13 @@ def model_iteration(model,
     if (do_validation and
         training_utils.should_run_validation(validation_freq, epoch) and
         not callbacks.model.stop_training):
-      val_inputs = val_iterator or val_inputs
+
+      if model._compile_distribution:
+        # Since we create a new clone from the original model we need to copy
+        # the weights back to the original model before we can run validation.
+        distributed_training_utils._copy_weights_to_original_model(
+            model, ModeKeys.TRAIN)
+
       val_results = model_iteration(
           model,
           val_inputs,
@@ -350,6 +394,7 @@ def model_iteration(model,
           verbose=0,
           mode=ModeKeys.TEST,
           validation_in_fit=True,
+          prepared_feed_values_from_dataset=(val_iterator is not None),
           steps_name='validation_steps')
       if not isinstance(val_results, list):
         val_results = [val_results]
@@ -366,15 +411,13 @@ def model_iteration(model,
     # Reinitialize dataset iterator for the next epoch.
     if reset_dataset_after_each_epoch and epoch < epochs - 1:
       _reinitialize_iterator(input_iterator, model._distribution_strategy)
-      ins = _prepare_feed_values(model, input_iterator, None, None, mode)
 
   callbacks._call_end_hook(mode)
 
   if model._distribution_strategy:
-    if model._compile_distribution and not validation_in_fit:
+    if model._compile_distribution:
       # TODO(priyag, psv): Copy back metrics to the original model as well?
-      distributed_training_utils._copy_weights_to_original_model(
-          model, model._distributed_model, mode)
+      distributed_training_utils._copy_weights_to_original_model(model, mode)
     scope.__exit__(None, None, None)
 
   if mode == ModeKeys.TRAIN:
diff --git a/tensorflow/python/keras/engine/training_arrays_test.py b/tensorflow/python/keras/engine/training_arrays_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..daa4735c838ed428e9e6d36eece6859b8bf47dea
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_arrays_test.py
@@ -0,0 +1,61 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for model.fit calls with a Dataset object passed as validation_data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import core
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ValidationDatasetNoLimitTest(keras_parameterized.TestCase):
+
+  def create_dataset(self, num_samples, batch_size):
+    input_data = np.random.rand(num_samples, 1)
+    expected_data = input_data * 3
+    dataset = dataset_ops.Dataset.from_tensor_slices((input_data,
+                                                      expected_data))
+    return dataset.shuffle(10 * batch_size).batch(batch_size)
+
+  def test_validation_dataset_with_no_step_arg(self):
+    # Create a model that learns y=Mx.
+    layers = [core.Dense(1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+    model.compile(loss="mse", optimizer="adam", metrics=["mean_absolute_error"])
+
+    train_dataset = self.create_dataset(num_samples=200, batch_size=10)
+    eval_dataset = self.create_dataset(num_samples=50, batch_size=25)
+
+    history = model.fit(x=train_dataset, validation_data=eval_dataset, epochs=2)
+    evaluation = model.evaluate(x=eval_dataset)
+
+    # If the fit call used the entire dataset, then the final val MAE error
+    # from the fit history should be equal to the final element in the output
+    # of evaluating the model on the same eval dataset.
+    self.assertAlmostEqual(history.history["val_mean_absolute_error"][-1],
+                           evaluation[-1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index 1c67100f2035d8e7c3d1bb52dba50ee8202198ba..c2e893dbe0dcc54590225dc8427b4249caed33f8 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -19,22 +19,33 @@ from __future__ import division
 from __future__ import print_function
 
 import logging
+import sys
 
 import numpy as np
+import six
 
 from tensorflow.python import keras
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 
 
+class BatchCounterCallback(callbacks.Callback):
+
+  def __init__(self):
+    self.batch_count = 0
+
+  def on_batch_end(self, *args, **kwargs):
+    self.batch_count += 1
+
+
 class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types
@@ -226,6 +237,66 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
                                  'the `steps` argument'):
       model.predict(dataset, verbose=0)
 
+  # TODO(b/123531973): Include tests using dataset_v1.
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_training_and_eval_methods_on_multi_input_output_dataset(self):
+    input_a = keras.layers.Input(shape=(3,), name='input_1')
+    input_b = keras.layers.Input(shape=(3,), name='input_2')
+    dense = keras.layers.Dense(4, name='dense')
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dropout]
+
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    input_a_np = np.random.random((10, 3)).astype(dtype=np.float32)
+    input_b_np = np.random.random((10, 3)).astype(dtype=np.float32)
+    output_d_np = np.random.random((10, 4)).astype(dtype=np.float32)
+    output_e_np = np.random.random((10, 4)).astype(dtype=np.float32)
+
+    # Test with tuples
+    dataset_tuple = dataset_ops.Dataset.from_tensor_slices((
+        (input_a_np, input_b_np), (output_d_np, output_e_np)))
+    dataset_tuple = dataset_tuple.repeat(100)
+    dataset_tuple = dataset_tuple.batch(10)
+
+    model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset_tuple, steps=2, verbose=1)
+
+    predict_dataset_tuple = dataset_ops.Dataset.from_tensor_slices(
+        (input_a_np, input_b_np))
+    # TODO(b/123360757): Remove below assertion once predict() supports
+    # muti-input datasets.
+    with self.assertRaisesRegexp(ValueError,
+                                 'Error when checking model input'):
+      model.predict(predict_dataset_tuple, steps=1)
+
+    # Test with dict
+    input_dict = {'input_1': input_a_np, 'input_2': input_b_np}
+    if testing_utils.get_model_type() == 'subclass':
+      output_dict = {'output_1': output_d_np, 'output_2': output_e_np}
+    else:
+      output_dict = {'dense': output_d_np, 'dropout': output_e_np}
+
+    dataset_dict = dataset_ops.Dataset.from_tensor_slices((
+        input_dict, output_dict))
+    dataset_dict = dataset_dict.repeat(100)
+    dataset_dict = dataset_dict.batch(10)
+
+    model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset_dict, steps=2, verbose=1)
+
+    predict_dataset_dict = dataset_ops.Dataset.from_tensor_slices(
+        input_dict)
+    predict_dataset_dict = predict_dataset_dict.repeat(100)
+    predict_dataset_dict = predict_dataset_dict.batch(10)
+    model.predict(predict_dataset_dict, steps=1)
+
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sample_weights(self):
@@ -253,18 +324,18 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   def test_dataset_with_sparse_labels(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
     optimizer = 'rmsprop'
-    for loss in ['sparse_categorical_crossentropy',
-                 losses_impl.sparse_softmax_cross_entropy]:
-      model.compile(optimizer, loss,
-                    run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        optimizer,
+        loss='sparse_categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
+    inputs = np.zeros((10, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
 
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
 
   @keras_parameterized.run_all_keras_modes
   def test_dataset_fit_correctness(self):
@@ -286,12 +357,32 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     inputs[20:30, :] = 1
     inputs[30:, :] = 4
     targets = np.zeros((40, 1), dtype=np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.batch(10)
-    history = model.fit(dataset,
-                        epochs=2, steps_per_epoch=2, verbose=1, shuffle=False)
+
+    # Test correctness with `steps_per_epoch`.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (inputs, targets)).batch(10)
+    val_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (inputs, targets)).batch(10)
+    history = model.fit(train_dataset,
+                        epochs=2, steps_per_epoch=2, verbose=1,
+                        validation_data=val_dataset, validation_steps=2)
     self.assertListEqual(history.history['loss'],
                          [inputs[:20].sum() / 2, inputs[20:].sum() / 2])
+    # The validation dataset will be reset at the end of each validation run.
+    self.assertListEqual(history.history['val_loss'],
+                         [inputs[:20].sum() / 2, inputs[:20].sum() / 2])
+
+    # Test correctness with dataset reset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (inputs, targets)).batch(10)
+    val_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (inputs, targets)).batch(10)
+    history = model.fit(train_dataset,
+                        epochs=2, verbose=1, validation_data=val_dataset)
+    self.assertListEqual(history.history['loss'],
+                         [inputs.sum() / 4, inputs.sum() / 4])
+    self.assertListEqual(history.history['val_loss'],
+                         [inputs.sum() / 4, inputs.sum() / 4])
 
   @tf_test_util.run_deprecated_v1
   def test_dataset_input_shape_validation(self):
@@ -326,8 +417,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_finite_dataset_known_cardinality_no_steps_arg(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = 'rmsprop'
-    model.compile(optimizer, 'mse',
+    model.compile('rmsprop', 'mse',
                   run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.zeros((100, 3), dtype=np.float32)
@@ -335,8 +425,11 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
     dataset = dataset.batch(10)
 
-    history = model.fit(dataset, epochs=2, verbose=1)
-    self.assertEqual(len(history.history['loss']), 2)
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
     model.evaluate(dataset)
     out = model.predict(dataset)
     self.assertEqual(out.shape[0], 100)
@@ -345,8 +438,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = 'rmsprop'
-    model.compile(optimizer, 'mse',
+    model.compile('rmsprop', 'mse',
                   run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.zeros((100, 3), dtype=np.float32)
@@ -356,8 +448,97 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     self.assertEqual(keras.backend.get_value(cardinality.cardinality(dataset)),
                      cardinality.UNKNOWN)
 
-    history = model.fit(dataset, epochs=2, verbose=1)
-    self.assertEqual(len(history.history['loss']), 2)
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self):
+
+    class CaptureStdout(object):
+
+      def __enter__(self):
+        self._stdout = sys.stdout
+        string_io = six.StringIO()
+        sys.stdout = string_io
+        self._stringio = string_io
+        return self
+
+      def __exit__(self, *args):
+        self.output = self._stringio.getvalue()
+        sys.stdout = self._stdout
+
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(
+        keras.backend.get_value(cardinality.cardinality(dataset)),
+        cardinality.UNKNOWN)
+
+    batch_counter = BatchCounterCallback()
+    with CaptureStdout() as capture:
+      history = model.fit(
+          dataset,
+          epochs=2,
+          callbacks=[batch_counter],
+          validation_data=dataset.take(3))
+
+    lines = capture.output.splitlines()
+
+    self.assertIn('1/Unknown', lines[2])
+    self.assertIn('10/10', lines[-1])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_unknown_cardinality_out_of_data(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(
+        keras.backend.get_value(cardinality.cardinality(dataset)),
+        cardinality.UNKNOWN)
+
+    batch_counter = BatchCounterCallback()
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      # steps_per_epoch (200) is greater than the dataset size (100). As this is
+      # unexpected, training will stop and not make it to the second epoch.
+      history = model.fit(
+          dataset,
+          epochs=2,
+          verbose=1,
+          callbacks=[batch_counter],
+          steps_per_epoch=200)
+      self.assertIn(
+          'Your dataset ran out of data; interrupting training. '
+          'Make sure that your dataset can generate at least '
+          '`steps_per_epoch * epochs` batches (in this case, 400 batches). '
+          'You may need to use the repeat() function when '
+          'building your dataset.', str(mock_log.call_args))
+
+    self.assertLen(history.history['loss'], 1)
+    self.assertEqual(batch_counter.batch_count, 10)
     model.evaluate(dataset)
     out = model.predict(dataset)
     self.assertEqual(out.shape[0], 100)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 9b11c721d4d68d15cbbc2920c6569cf93aa5dead..5cfd7e6dc7262697fd6324918f5adf62175dcfee 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -21,7 +21,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
@@ -29,12 +30,13 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.engine import partial_batch_padding_handler as padding_util
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
@@ -61,10 +63,13 @@ def fit_distributed(model,
 
   first_x_value = nest.flatten(x)[0]
   if isinstance(first_x_value, np.ndarray):
+    # Until support for partial batch is implemented across all
+    # functions and distribution strategy, we pass `mode` to selectively
+    # relax the costraint to consume all the training samples.
     steps_per_epoch, batch_size = (
         distributed_training_utils.get_input_params(
             model._distribution_strategy, first_x_value, steps_per_epoch,
-            batch_size, is_training=True))
+            batch_size, mode=ModeKeys.TRAIN))
   batch_size = model._validate_or_infer_batch_size(
       batch_size, steps_per_epoch, x)
   dataset = model._distribution_standardize_user_data(
@@ -72,11 +77,9 @@ def fit_distributed(model,
       sample_weight=sample_weight,
       class_weight=class_weight,
       batch_size=batch_size,
-      check_steps=True,
-      steps_name='steps_per_epoch',
-      steps=steps_per_epoch,
       validation_split=validation_split,
-      shuffle=shuffle)
+      shuffle=shuffle,
+      repeat=True)
 
   val_dataset = None
   if validation_data:
@@ -94,9 +97,6 @@ def fit_distributed(model,
         sample_weight=val_sample_weights,
         class_weight=None,
         batch_size=batch_size,
-        check_steps=True,
-        steps_name='validation_steps',
-        steps=validation_steps,
         validation_split=validation_split,
         shuffle=shuffle)
   elif validation_split:
@@ -114,7 +114,7 @@ def fit_distributed(model,
         initial_epoch=initial_epoch,
         steps_per_epoch=steps_per_epoch,
         validation_steps=validation_steps,
-        validation_freq=1)
+        validation_freq=validation_freq)
   else:
     return training_arrays.fit_loop(
         model,
@@ -128,7 +128,8 @@ def fit_distributed(model,
         initial_epoch=initial_epoch,
         steps_per_epoch=steps_per_epoch,
         validation_steps=validation_steps,
-        validation_freq=validation_freq)
+        validation_freq=validation_freq,
+        steps_name='steps_per_epoch')
 
 
 def evaluate_distributed(model,
@@ -149,15 +150,11 @@ def evaluate_distributed(model,
   dataset = model._distribution_standardize_user_data(
       x, y,
       sample_weight=sample_weight,
-      batch_size=batch_size,
-      check_steps=True,
-      steps_name='steps',
-      steps=steps)
+      batch_size=batch_size)
 
   if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
-    # TODO(fchollet): why aren't callbacks supported here?
     return experimental_tpu_test_loop(
-        model, dataset, verbose=verbose, steps=steps)
+        model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
   else:
     return training_arrays.test_loop(
         model,
@@ -176,22 +173,20 @@ def predict_distributed(model,
                         callbacks=None):
   """Predict loop for Distribution Strategies."""
   distributed_training_utils.validate_inputs(
-      x, None, model._distribution_strategy)
+      x, None, model._distribution_strategy, allow_partial_batch=True)
   first_x_value = nest.flatten(x)[0]
   if isinstance(first_x_value, np.ndarray):
     steps, batch_size = distributed_training_utils.get_input_params(
-        model._distribution_strategy, first_x_value, steps, batch_size)
+        model._distribution_strategy, first_x_value, steps,
+        batch_size, mode=ModeKeys.PREDICT)
   batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
   dataset = model._distribution_standardize_user_data(
       x,
       batch_size=batch_size,
-      check_steps=True,
-      steps_name='steps',
-      steps=steps)
+      allow_partial_batch=True)
   if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
-    # TODO(fchollet): why aren't callbacks supported here?
     return experimental_tpu_predict_loop(
-        model, dataset, verbose=verbose, steps=steps)
+        model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
   else:
     return training_arrays.predict_loop(
         model,
@@ -202,6 +197,73 @@ def predict_distributed(model,
         callbacks=callbacks)
 
 
+def _make_step_fn(model, mode, strategy, output_labels):
+  """Create step fn.
+
+  Arguments:
+    model: a Keras Model instance.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+    strategy: a `tf.distribute.Strategy` instance.
+    output_labels: the output labels for the step function.
+
+  Returns:
+    A step function to run by `tf.distribute.Strategy`.
+  """
+
+  def _per_device_execution_function(model):
+    exec_func = model._make_execution_function(mode)
+    return (exec_func.inputs, exec_func.outputs, exec_func.updates_op,
+            exec_func.session_kwargs)
+
+  def step_fn(ctx, inputs):
+    """A step fn that returns update ops."""
+    if mode == ModeKeys.PREDICT:
+      targets = None
+    else:
+      inputs, targets = inputs
+
+    if model._compile_distribution:
+      distributed_training_utils.clone_model_on_replicas(
+          model, strategy, mode, inputs=inputs, targets=targets)
+    else:
+      distributed_training_utils._build_distributed_network(
+          model, strategy, mode, inputs, targets)
+
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_execution_function,
+         args=(distributed_training_utils.get_distributed_model(model, mode),))
+    (all_inputs, all_outputs, all_updates,
+     all_session_args) = distributed_training_utils.unwrap_values(
+         strategy, grouped_inputs, grouped_outputs, grouped_updates,
+         grouped_session_args)
+    combined_fn = K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_' + str(mode) + '_function',
+        **all_session_args)
+
+    for label, output in zip(output_labels, combined_fn.outputs):
+      if mode == ModeKeys.PREDICT:
+        ctx.set_last_step_output(label, output)
+      else:
+        if label == 'loss':
+          reduce_op = ds_reduce_util.ReduceOp.SUM
+        else:
+          # We reduce all other metrics using mean for now. This is temporary
+          # workaround until new metrics are in place.
+          reduce_op = ds_reduce_util.ReduceOp.MEAN
+        ctx.set_last_step_output(label, output, reduce_op)
+
+    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
+    # feed_dict, session kwargs, run options, run_metadata for now. These should
+    # be handled appropriately
+    return combined_fn.updates_op
+
+  return step_fn
+
+
 def experimental_tpu_fit_loop(model,
                               dataset,
                               epochs=100,
@@ -243,89 +305,54 @@ def experimental_tpu_fit_loop(model,
   Raises:
       ValueError: in case of invalid arguments.
   """
+  mode = ModeKeys.TRAIN
   # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
   current_strategy = model._distribution_strategy
   iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
-  scope = current_strategy.scope()
-  scope.__enter__()
+  steps_per_epoch = training_utils.infer_steps_for_dataset(
+      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
+  if (current_strategy.extended.steps_per_run != 1 and
+      steps_per_epoch is None):
+    raise ValueError('`steps_per_epoch` should be specified when calling '
+                     '`fit` on the model with TPUStrategy when '
+                     '`steps_per_run` != 1 .')
 
-  def _per_device_fit_function(model):
-    model._make_fit_function()
-    return (model._fit_function.inputs, model._fit_function.outputs,
-            model._fit_function.updates_op, model._fit_function.session_kwargs)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=1)
+  scope.__enter__()
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(1)
   out_labels = model.metrics_names or []
 
-  def step_fn(ctx, inputs):
-    """Clones the model and calls make_fit_function."""
-    inputs, targets = inputs
-    if model._compile_distribution:
-      distributed_training_utils.clone_model_on_replicas(
-          model, current_strategy,
-          make_callback_model=True, inputs=inputs,
-          targets=targets, mode=distributed_training_utils.ModeKeys.TRAIN)
-    else:
-      distributed_training_utils._build_distributed_network(
-          model, current_strategy, inputs,
-          targets, mode=distributed_training_utils.ModeKeys.TRAIN)
-
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_fit_function, args=(model._distributed_model_train,))
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs,
-         grouped_updates, grouped_session_args)
-    combined_fn = K.function(
-        all_inputs,
-        all_outputs,
-        updates=all_updates,
-        name='distributed_fit_function',
-        **all_session_args)
-
-    for label, output in zip(out_labels, combined_fn.outputs):
-      if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
-      else:
-        # We reduce all other metrics using mean for now. This is temporary
-        # workaround until new metrics are in place.
-        reduce_op = ds_reduce_util.ReduceOp.MEAN
-      ctx.set_last_step_output(label, output, reduce_op)
-
-    # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
-    # feed_dict, session kwargs, run options, run_metadata for now. These should
-    # be handled appropriately
-    return combined_fn.updates_op
+  step_fn = _make_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels)
 
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
   for name in model.metrics_names[1:]:
-    tensor = model._all_stateful_metrics_tensors[name]
+    tensor = model._all_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
-  if steps_per_epoch is None:
-    raise ValueError('`steps_per_epoch` should be specified when calling '
-                     '`fit` on the model.')
+  use_steps = steps_per_epoch is not None
+  if use_steps:
+    iteration_value = min(steps_per_epoch,
+                          current_strategy.extended.steps_per_run)
+  else:
+    iteration_value = current_strategy.extended.steps_per_run
+
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
+      value=iteration_value,
       dtype='int32',
       name='steps_per_run')
-
   ctx = current_strategy.extended.experimental_run_steps_on_iterator(
       step_fn, iterator, iterations=steps_per_run,
       initial_loop_values=initial_loop_values)
-
   train_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
 
   do_validation = bool(validation_steps)
 
   if model._compile_distribution:
-    distributed_training_utils._copy_weights_to_distributed_model(
-        model, model._distributed_model_train)
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
   callbacks = cbks.configure_callbacks(
       callbacks,
@@ -333,42 +360,59 @@ def experimental_tpu_fit_loop(model,
       do_validation=do_validation,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
-      verbose=verbose)
+      verbose=verbose,
+      count_mode='steps',
+      mode=mode)
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.extended.steps_per_run] * (
-      steps_per_epoch // current_strategy.extended.steps_per_run)
-  if steps_per_epoch % current_strategy.extended.steps_per_run:
-    steps_to_run.append(
-        steps_per_epoch % current_strategy.extended.steps_per_run)
+  if use_steps:
+    steps_to_run = ([current_strategy.extended.steps_per_run] *
+                    (steps_per_epoch //
+                     current_strategy.extended.steps_per_run))
+    if steps_per_epoch % current_strategy.extended.steps_per_run:
+      steps_to_run.append(
+          steps_per_epoch % current_strategy.extended.steps_per_run)
+    target_steps = len(steps_to_run)
+  else:
+    target_steps = np.inf
 
-  callbacks.on_train_begin()
+  callbacks._call_begin_hook(mode)
   for epoch in range(initial_epoch, epochs):
-    distributed_training_utils._reset_metrics(
-        model, model._distributed_model_train)
+    distributed_training_utils._reset_metrics(model)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
     prev_step_count = None
-    for step_count in steps_to_run:
+    current_step = 0
+    while current_step < target_steps:
+      step_count = steps_to_run[current_step] if use_steps else 1
       batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
-      callbacks.on_batch_begin(step_index, batch_logs)
+      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
       if prev_step_count is None or step_count != prev_step_count:
         steps_per_run.load(step_count, K.get_session())
         prev_step_count = step_count
       try:
-        _, outputs = K.get_session().run([train_op, output_tensors])
+        _, outputs = K.batch_get_value([train_op, output_tensors])
       except errors.OutOfRangeError:
-        logging.warning('Your dataset iterator ran out of data; '
-                        'interrupting training. Make sure that your dataset '
-                        'can generate at least `steps_per_epoch * epochs` '
-                        'batches (in this case, %d batches).' %
-                        steps_per_epoch * epochs)
+        if use_steps:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+        else:
+          target_steps = current_step
+          logging.info('Dataset iterator ran out of data. Inferring the '
+                       'value of `steps_per_epoch` as %s  .' % target_steps)
+          distributed_training_utils.initialize_iterator(iterator,
+                                                         current_strategy)
         break
 
       batch_logs.update(outputs)
-      callbacks.on_batch_end(step_index, batch_logs)
+      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
       step_index = step_index + step_count
+      current_step += 1
+
       if callbacks.model.stop_training:
         break
 
@@ -380,13 +424,14 @@ def experimental_tpu_fit_loop(model,
         # Since we create a new clone from the original model we need to copy
         # the weights back to the original model before we can run validation.
         distributed_training_utils._copy_weights_to_original_model(
-            model, model._distributed_model_train, ModeKeys.TRAIN)
+            model, ModeKeys.TRAIN)
 
       val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
           model,
           val_dataset,
           steps=validation_steps,
-          verbose=verbose)
+          verbose=verbose,
+          callbacks=callbacks)
       if not isinstance(val_outs, list):
         val_outs = [val_outs]
       # Same labels assumed.
@@ -396,12 +441,12 @@ def experimental_tpu_fit_loop(model,
     callbacks.on_epoch_end(epoch, epoch_logs)
     if callbacks.model.stop_training:
       break
-  callbacks.on_train_end()
+  callbacks._call_end_hook(mode)
 
   if model._compile_distribution:
     # Copy the weights back from the replicated model to the original model.
     distributed_training_utils._copy_weights_to_original_model(
-        model, model._distributed_model_train, ModeKeys.TRAIN)
+        model, ModeKeys.TRAIN)
   scope.__exit__(None, None, None)
   return model.history
 
@@ -409,7 +454,8 @@ def experimental_tpu_fit_loop(model,
 def experimental_tpu_test_loop(model,
                                dataset,
                                verbose=0,
-                               steps=None):
+                               steps=None,
+                               callbacks=None):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
@@ -419,6 +465,7 @@ def experimental_tpu_test_loop(model,
       steps: Total number of steps (batches of samples)
           before declaring predictions finished.
           Ignored with the default value of `None`.
+      callbacks: List of callbacks to be called during training
 
   Returns:
       Scalar loss (if the model has a single output and no metrics)
@@ -426,64 +473,25 @@ def experimental_tpu_test_loop(model,
       and/or metrics). The attribute `model.metrics_names` will give you
       the display labels for the outputs.
   """
+  mode = ModeKeys.TEST
   current_strategy = model._distribution_strategy
-  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
-  scope = current_strategy.scope()
-  scope.__enter__()
-
-  def _per_device_eval_function(model):
-    model._make_eval_function()
-    return (model._eval_function.inputs, model._eval_function.outputs,
-            model._eval_function.updates_op,
-            model._eval_function.session_kwargs)
-
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
-
-  def step_fn(ctx, inputs):
-    """Clones the model and calls make_eval_function."""
-    inputs, targets = inputs
-    if model._compile_distribution:
-      distributed_training_utils. clone_model_on_replicas(
-          model, current_strategy,
-          make_callback_model=False, inputs=inputs,
-          targets=targets, mode=distributed_training_utils.ModeKeys.TEST)
-    else:
-      distributed_training_utils._build_distributed_network(
-          model, current_strategy, inputs, targets,
-          mode=distributed_training_utils.ModeKeys.TEST)
-
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_eval_function, args=(model._distributed_model_test,))
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args)
-
-    combined_fn = K.function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_test_function',
-        **all_session_args)
+  iterator = distributed_training_utils.get_iterator(dataset,
+                                                     current_strategy)
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
 
-    for label, output in zip(model.metrics_names, combined_fn.outputs):
-      if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
-      else:
-        # We reduce all other metrics using mean for now. This is temporary
-        # workaround until new metrics are in place.
-        reduce_op = ds_reduce_util.ReduceOp.MEAN
-      ctx.set_last_step_output(label, output, reduce_op)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
+  scope.__enter__()
 
-    return combined_fn.updates_op
+  out_labels = model.metrics_names
+  step_fn = _make_step_fn(model, ModeKeys.TEST, current_strategy, out_labels)
 
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
   for name in model.metrics_names[1:]:
-    tensor = model._all_stateful_metrics_tensors[name]
+    tensor = model._all_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   # TODO(priyag): Use steps_per_run when we use new metrics as they will
@@ -499,16 +507,44 @@ def experimental_tpu_test_loop(model,
     progbar = Progbar(target=steps)
 
   if model._compile_distribution:
-    distributed_training_utils._copy_weights_to_distributed_model(
-        model, model._distributed_model_test)
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
-  distributed_training_utils._reset_metrics(
-      model, model._distributed_model_test)
+  distributed_training_utils._reset_metrics(model)
+
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=False,
+      epochs=1,
+      steps_per_epoch=steps,
+      verbose=verbose,
+      count_mode='steps',
+      mode=ModeKeys.TEST)
+  callbacks._call_begin_hook(mode)
 
-  assert steps is not None
   outs = [0.] * len(model.metrics_names)
-  for step in range(steps):
-    _, batch_outs = K.get_session().run([test_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([test_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      target_steps = current_step
+      break
     for i, label in enumerate(model.metrics_names):
       if i == 0:
         # Loss is stateless metrics.
@@ -517,19 +553,28 @@ def experimental_tpu_test_loop(model,
         # For all stateful metrics, the aggregation is handled by mirrored vars.
         outs[i] = batch_outs[label]
 
+    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
+
+  callbacks._call_end_hook(mode)
 
   scope.__exit__(None, None, None)
   if len(outs) >= 0:
-    outs[0] /= (steps)
+    outs[0] /= (target_steps)
 
   if len(outs) == 1:
     return outs[0]
   return outs
 
 
-def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
+def experimental_tpu_predict_loop(model,
+                                  dataset,
+                                  verbose=0,
+                                  steps=None,
+                                  callbacks=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
@@ -539,58 +584,49 @@ def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
       steps: Total number of steps (batches of samples)
           before declaring `_predict_loop` finished.
           Ignored with the default value of `None`.
+      callbacks: List of callbacks to be called during training
 
   Returns:
       Array of predictions (if the model has a single output)
       or list of arrays of predictions
       (if the model has multiple outputs).
   """
+  mode = ModeKeys.PREDICT
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
+  dataset_fully_shaped = (distributed_training_utils.
+                          is_dataset_shape_fully_defined(dataset))
+  padding_handler = None
+  if not dataset_fully_shaped:
+    # TODO(hongjunchoi): Investigate whether operations from
+    # PartialBatchPaddingHandler are unnecessarily pruned out
+    # during graph optimization.
+    padding_handler = padding_util.PartialBatchPaddingHandler(
+        model._feed_output_shapes)
+    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
+    padding_handler.padded_batch_size = batch_size
+    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
+                                                  padding_handler.update_mask)
+
+    dataset = dataset.map(padding_handler.pad_batch)
+    dataset = dataset.apply(batching.unbatch())
+    # Upon this point, it is guaranteed that the dataset does not
+    # have partial batches. Thus, we set `drop_remainder=True` to
+    # get static shape information about the elements in the dataset.
+    dataset = dataset.batch(batch_size, drop_remainder=True)
+
+    if prefetch_buffer is not None:
+      dataset = dataset.prefetch(prefetch_buffer)
+
   current_strategy = model._distribution_strategy
   iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
-  scope = current_strategy.scope()
-  scope.__enter__()
-
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
-
-  def _per_device_predict_function(model):
-    model._make_predict_function()
-    return (model.predict_function.inputs,
-            model.predict_function.outputs,
-            model.predict_function.updates_op,
-            model.predict_function.session_kwargs)
-
-  def step_fn(ctx, inputs):
-    """Clones the model and calls make_predict_function."""
-    if model._compile_distribution:
-      distributed_training_utils. clone_model_on_replicas(
-          model, current_strategy,
-          make_callback_model=False, inputs=inputs,
-          mode=distributed_training_utils.ModeKeys.PREDICT)
-    else:
-      distributed_training_utils._build_distributed_network(
-          model, current_strategy, inputs,
-          mode=distributed_training_utils.ModeKeys.PREDICT)
-
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_predict_function, args=(model._distributed_model_predict,))
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args)
-
-    combined_fn = K.function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_predict_function',
-        **all_session_args)
 
-    for label, output in zip(model.output_names, combined_fn.outputs):
-      ctx.set_last_step_output(label, output)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
+  scope.__enter__()
 
-    return combined_fn.updates_op
+  out_labels = model.output_names
+  step_fn = _make_step_fn(model, ModeKeys.PREDICT, current_strategy, out_labels)
 
   # Add initial dummy values for outputs.
   initial_loop_values = {}
@@ -614,29 +650,69 @@ def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   if model._compile_distribution:
-    distributed_training_utils._copy_weights_to_distributed_model(
-        model, model._distributed_model_predict)
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
-  distributed_training_utils._reset_metrics(
-      model, model._distributed_model_predict)
+  distributed_training_utils._reset_metrics(model)
+
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=False,
+      epochs=1,
+      steps_per_epoch=steps,
+      verbose=verbose,
+      count_mode='steps',
+      mode=mode)
+  callbacks._call_begin_hook(mode)
 
-  assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
   # and concatenate them upon returning.
   unconcatenated_outs = [[] for _ in model.outputs]
-  for step in range(steps):
-    _, batch_outs = K.get_session().run([predict_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([predict_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      break
+
     # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
     for i, label in enumerate(model.output_names):
       unconcatenated_outs[i].extend(batch_outs[label])
+    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
+
+  callbacks._call_end_hook(mode)
 
   scope.__exit__(None, None, None)
+
   if len(unconcatenated_outs) == 1:
-    return np.concatenate(unconcatenated_outs[0], axis=0)
-  return [
-      np.concatenate(unconcatenated_outs[i], axis=0)
-      for i in range(len(unconcatenated_outs))
-  ]
+    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
+  else:
+    prediction_result = [
+        np.concatenate(unconcatenated_outs[i], axis=0)
+        for i in range(len(unconcatenated_outs))
+    ]
+
+  if padding_handler:
+    prediction_result = padding_handler.apply_mask(prediction_result)
+
+  return prediction_result
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index e62dd1ffa8d73dd956b20a7d19c6bf772cc5f0a3..228ed39d20b2ae6d4da027e803cd236ba8dd5d0b 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -21,13 +21,14 @@ from __future__ import print_function
 
 import collections
 
+import numpy as np
+
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import losses as losses_module
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -39,12 +40,7 @@ def _eager_loss_fn(outputs, targets, loss_fn, output_name):
   return loss
 
 
-def _eager_metrics_fn(model,
-                      outputs,
-                      targets,
-                      sample_weights=None,
-                      masks=None,
-                      return_stateful_result=True):
+def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
   """Calculates the metrics for each output of the given model.
 
   Arguments:
@@ -53,8 +49,6 @@ def _eager_metrics_fn(model,
       targets: The predictions or targets of the given model.
       sample_weights: Optional list of sample weights for each output.
       masks: Optional list of masks for each output.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
 
   Returns:
       Returns the metric results for each output of the model.
@@ -63,11 +57,7 @@ def _eager_metrics_fn(model,
   targets = nest.flatten(targets)
   # TODO(psv): Consider supporting skip target indices in eager mode?
   metric_results = model._handle_metrics(
-      outputs,
-      targets=targets,
-      sample_weights=sample_weights,
-      masks=masks,
-      return_stateful_result=return_stateful_result)
+      outputs, targets=targets, sample_weights=sample_weights, masks=masks)
   return [backend.mean(t) for t in metric_results]
 
 
@@ -95,6 +85,10 @@ def _model_loss(model,
      regularization losses and applies masking and sample weighting
      to the loss value.
   """
+  # Used to keep track of the total loss value (stateless).
+  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+  #                   loss_weight_2 * output_2_loss_fn(...) +
+  #                   layer losses.
   total_loss = 0
   kwargs = {}
   if model._expects_training_arg:
@@ -102,76 +96,84 @@ def _model_loss(model,
   if len(inputs) == 1 and not isinstance(inputs, dict):
     inputs = inputs[0]
 
-  if model._compute_output_and_mask_jointly:
-    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
-    masks = nest.flatten(masks)
-  else:
-    outs = model.call(inputs, **kwargs)
-    masks = None
+  # Allow mixed `NumPy` and `EagerTensor` input here.
+  if any(
+      isinstance(input_t, (np.ndarray, float, int))
+      for input_t in nest.flatten(inputs)):
+    inputs = nest.map_structure(ops.convert_to_tensor, inputs)
+
+  outs = model(inputs, **kwargs)
 
   outs = nest.flatten(outs)
-  if masks is None:
-    masks = [None for _ in outs]
+  # `None` by default for `EagerTensors`.
+  masks = [t._keras_mask for t in outs]
   targets = nest.flatten(targets)
 
-  loss_metrics = []
-  aggregated_loss_metrics = []
+  # Used to keep track of individual output losses.
+  output_losses = []
+
   with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
-      if sample_weights:
-        weights = sample_weights[i]
-      else:
-        weights = None
+      weights = sample_weights[i] if sample_weights else None
       mask = masks[i]
       with backend.name_scope(model.output_names[i] + '_loss'):
-        if isinstance(loss_fn, losses_module.Loss):
-          if mask is not None:
-            mask = math_ops.cast(mask, outs[i].dtype)
-            # Update weights with mask.
-            if weights is None:
-              weights = mask
-            else:
-              # Update dimensions of weights to match with mask if possible.
-              mask, _, weights = squeeze_or_expand_dimensions(
-                  mask, None, weights)
-              weights *= mask
-          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
+        if mask is not None:
+          mask = math_ops.cast(mask, outs[i].dtype)
+          # Update weights with mask.
+          if weights is None:
+            weights = mask
+          else:
+            # Update dimensions of weights to match with mask if possible.
+            mask, _, weights = (
+                losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
+            weights *= mask
+
+        # Reset reduction on the loss so that we can get the per sample loss
+        # value. We use this to get both the stateless and stateful loss
+        # values without having to compute the underlying loss function
+        # twice.
+        weighted_losses = None
+        if hasattr(loss_fn, 'reduction'):
+          current_loss_reduction = loss_fn.reduction
+          loss_fn.reduction = losses_utils.ReductionV2.NONE
+          weighted_losses = loss_fn(targets[i], outs[i], sample_weight=weights)
+          loss_fn.reduction = current_loss_reduction
+
+          # Compute the stateless loss value.
+          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
         else:
-          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
-          output_loss = weighted_masked_fn(
-              targets[i], outs[i], weights, mask=mask)
+          # Compute the stateless loss value for a custom loss class.
+          # Here we assume that the class takes care of loss reduction
+          # because if this class returns a vector value we cannot
+          # differentiate between use case where a custom optimizer
+          # expects a vector loss value vs unreduced per-sample loss value.
+          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
 
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
       # associated with a model, each output's loss is calculated and returned
       # as part of the loss_metrics.
       if len(model.outputs) > 1:
-        loss_metrics.append(backend.mean(output_loss))
-
-        if output_loss_metrics is not None:
-          # Keep track of the stateful loss result.
-          aggregated_loss_metrics.append(
-              training_utils.call_metric_function(
-                  output_loss_metrics[i],
-                  targets[i],
-                  outs[i],
-                  weights=weights,
-                  mask=mask))
-
-      loss_weight = model.loss_weights_list[i]
-      if total_loss is None:
-        total_loss = loss_weight * output_loss
-      else:
-        total_loss += loss_weight * output_loss
+        # Compute the stateful loss value.
+        if weighted_losses is not None:
+          aggregated_output_loss = output_loss_metrics[i](weighted_losses)
+        else:
+          # Custom loss class.
+          aggregated_output_loss = training_utils.call_metric_function(
+              output_loss_metrics[i], targets[i], outs[i], weights=weights)
+        # Keep track of the stateful output loss result.
+        output_losses.append(aggregated_output_loss)
+
+      total_loss += model.loss_weights_list[i] * output_loss
 
     total_loss = backend.mean(total_loss)
     # Add regularization losses
     custom_losses = model.losses
     if custom_losses:
-      total_loss += math_ops.add_n(custom_losses)
-    model._clear_losses()
+      total_loss += losses_utils.scale_loss_for_distribution(
+          math_ops.add_n(custom_losses))
 
-  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
+  return outs, total_loss, output_losses, masks
 
 
 def _process_single_batch(model,
@@ -202,32 +204,36 @@ def _process_single_batch(model,
   Raises:
       ValueError: If the model has no loss to optimize.
   """
-  with backend.learning_phase_scope(1 if training else 0):
+  with backend.eager_learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics, aggregated_loss_metrics, masks\
-        = _model_loss(
-            model,
-            inputs,
-            targets,
-            output_loss_metrics=output_loss_metrics,
-            sample_weights=sample_weights,
-            training=training)
-      if loss is None:
+      outs, total_loss, output_losses, masks = (
+          _model_loss(
+              model,
+              inputs,
+              targets,
+              output_loss_metrics=output_loss_metrics,
+              sample_weights=sample_weights,
+              training=training))
+      if total_loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
     if training:
-      if not model._collected_trainable_weights:
+      if not model.trainable_weights:
         logging.warning('The list of trainable weights is empty. Make sure that'
                         ' you are not setting model.trainable to False before '
                         'compiling the model.')
       else:
-        grads = tape.gradient(loss, model._collected_trainable_weights)
+        grads = tape.gradient(total_loss, model.trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
-                                            model._collected_trainable_weights))
-    return outs, loss, loss_metrics, aggregated_loss_metrics, masks
+                                            model.trainable_weights))
+    return outs, total_loss, output_losses, masks
 
 
-def train_on_batch(model, inputs, targets, sample_weights=None):
+def train_on_batch(model,
+                   inputs,
+                   targets,
+                   sample_weights=None,
+                   output_loss_metrics=None):
   """Calculates the loss and gradient updates for one input batch.
 
   Arguments:
@@ -235,6 +241,8 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss and the loss associated with each output.
@@ -244,38 +252,39 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, _, masks = _process_single_batch(
-      model, inputs, targets, sample_weights=sample_weights, training=True)
+  outs, total_loss, output_losses, masks = (
+      _process_single_batch(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=True,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model,
-      outs,
-      targets,
-      sample_weights=sample_weights,
-      masks=masks,
-      return_stateful_result=True)
-  loss = nest.flatten(loss)
+      model, outs, targets, sample_weights=sample_weights, masks=masks)
+  total_loss = nest.flatten(total_loss)
+  results = total_loss + output_losses + metrics_results
 
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
+  return [tensor_util.constant_value(v) for v in results]
 
 
-def test_on_batch(model, inputs, targets, sample_weights=None):
+def test_on_batch(model,
+                  inputs,
+                  targets,
+                  sample_weights=None,
+                  output_loss_metrics=None):
   """Calculates the loss for one input batch.
 
   Arguments:
@@ -283,6 +292,8 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss, loss and metrics associated with each output.
@@ -292,31 +303,28 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, _, masks = _model_loss(
-      model, inputs, targets, sample_weights=sample_weights, training=False)
+  outs, total_loss, output_losses, masks = (
+      _model_loss(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=False,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model,
-      outs,
-      targets,
-      sample_weights=sample_weights,
-      masks=masks,
-      return_stateful_result=True)
-  loss = nest.flatten(loss)
-
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
+      model, outs, targets, sample_weights=sample_weights, masks=masks)
+  total_loss = nest.flatten(total_loss)
+  results = total_loss + output_losses + metrics_results
+
+  return [tensor_util.constant_value(v) for v in results]
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 84f1fa0efcba08c227cc6eb4e3e2ad4623c7adc9..bda2b972014d8801f2fe45f4ec088b047b438e5e 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -24,6 +24,7 @@ from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
@@ -34,6 +35,32 @@ from tensorflow.python.platform import test
 
 class TrainingTest(keras_parameterized.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_dynamic_model_has_trainable_weights(self):
+    if not context.executing_eagerly():
+      # Only test Eager modes, as Graph mode is not relevant for dynamic models.
+      return
+
+    class DynamicModel(keras.Model):
+
+      def __init__(self):
+        super(DynamicModel, self).__init__(dynamic=True)
+        self.dense = keras.layers.Dense(
+            1, kernel_initializer='zeros', bias_initializer='ones')
+
+      def call(self, inputs):
+        return self.dense(inputs)
+
+    model = DynamicModel()
+    model.compile('rmsprop', 'mae')
+    hist = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(hist.history['loss'][-1], 1)
+    self.assertEqual(len(model.trainable_weights), 2)
+    loss = model.train_on_batch(np.zeros((1, 1)), np.zeros((1, 1)))
+    # The loss must have been updated if the trainable weights are taken into
+    # account during tracking.
+    self.assertLess(loss, 1)
+
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
   @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_multi_io(self):
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index b8defad8b2606ec666be25727c8f0587c917b710..d71bc6fe19b9ae6ebd52de1d07c8fec39e973bfc 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -33,8 +33,8 @@ from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.mode_keys import ModeKeys
 from tensorflow.python.util import nest
 
 
@@ -140,7 +140,6 @@ def model_iteration(model,
       shuffle=shuffle)
 
   do_validation = validation_data is not None
-  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   is_sequence = isinstance(generator, data_utils.Sequence)
   _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                       steps_per_epoch, validation_data, validation_steps, mode,
@@ -150,12 +149,14 @@ def model_iteration(model,
       model, mode, class_weight=class_weight)
 
   # Create the queue for the generator.
-  output_generator, enqueuer = _make_enqueued_generator(
-      generator,
-      workers=workers,
-      use_multiprocessing=use_multiprocessing,
-      max_queue_size=max_queue_size,
-      shuffle=shuffle)
+  enqueuer = None
+  if not is_dataset:
+    generator, enqueuer = _make_enqueued_generator(
+        generator,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        max_queue_size=max_queue_size,
+        shuffle=shuffle)
 
   num_samples_or_steps, use_steps = _get_num_samples_or_steps(
       data, steps_per_epoch)
@@ -181,9 +182,10 @@ def model_iteration(model,
   else:
     aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   if should_set_learning_phase:
     old_learning_phase = backend.learning_phase()
-    backend.set_learning_phase(1 if mode == ModeKeys.TRAIN else 0)
+    backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -208,10 +210,31 @@ def model_iteration(model,
 
     step = 0
     while step < target_steps:
-      batch_data = _get_next_batch(output_generator, mode)
+      batch_data = _get_next_batch(generator, mode)
       if batch_data is None:
-        if not is_dataset:
+        if is_dataset:
+          # The dataset passed by the user ran out of batches.
+          # Now we know the cardinality of the dataset.
+          # If steps_per_epoch was specified, then running out of data is
+          # unexpected, so we stop training and inform the user.
+          if steps_per_epoch:
+            callbacks.model.stop_training = True
+            logging.warning(
+                'Your dataset ran out of data; interrupting training. '
+                'Make sure that your dataset can generate at least '
+                '`%s * epochs` batches (in this case, %d batches). '
+                'You may need to use the repeat() function when '
+                'building your dataset.'
+                % (steps_name, steps_per_epoch * epochs))
+          elif step > 0:
+            steps_per_epoch = step
+            aggregator.num_samples_or_steps = steps_per_epoch
+            if mode == ModeKeys.TRAIN:
+              progbar.params['steps'] = steps_per_epoch
+              progbar.progbar.target = steps_per_epoch
+        else:
           # We ran out of batches while the user passed an iterator (legacy).
+          callbacks.model.stop_training = True
           logging.warning(
               'Your dataset iterator ran out of data; '
               'interrupting training. Make sure that your iterator '
@@ -219,16 +242,6 @@ def model_iteration(model,
               'batches (in this case, %d batches). You may need to'
               'use the repeat() function when building your '
               'dataset.' % (steps_name, steps_per_epoch * epochs))
-          callbacks.model.stop_training = True
-        else:
-          # The dataset passed by the user ran out of batches.
-          # Now we know the cardinality of the dataset.
-          # assert steps_per_epoch is None
-          if step > 0:
-            steps_per_epoch = step
-            aggregator.num_samples_or_steps = steps_per_epoch
-            progbar.params['steps'] = steps_per_epoch
-            progbar.progbar.target = steps_per_epoch
         break
 
       # `batch_size` used for validation data if validation
@@ -240,13 +253,32 @@ def model_iteration(model,
       callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
       progbar.on_batch_begin(step, batch_logs)
 
+      is_deferred = not model._is_compiled
       batch_outs = batch_function(*batch_data)
       if not isinstance(batch_outs, list):
         batch_outs = [batch_outs]
 
-      # Aggregate results.
       if step == 0:
         aggregator.create(batch_outs)
+
+        if is_deferred:
+          # Set callbacks params. We do this here when model is compiled only
+          # in the first iteration of this loop (deferred build scenario).
+          cbks.set_callback_parameters(
+              callbacks,
+              model,
+              do_validation=do_validation,
+              batch_size=batch_size,
+              epochs=epochs,
+              steps_per_epoch=steps_per_epoch,
+              samples=num_samples_or_steps,
+              verbose=verbose,
+              mode=mode)
+
+          progbar.params = callbacks.params
+          progbar.params['verbose'] = verbose
+
+      # Aggregate results.
       aggregator.aggregate(batch_outs)
 
       # Callbacks batch end.
@@ -302,7 +334,7 @@ def model_iteration(model,
     enqueuer.stop()
 
   if should_set_learning_phase:
-    backend.set_learning_phase(old_learning_phase)
+    backend.set_eager_learning_phase(old_learning_phase)
 
   if mode == ModeKeys.TRAIN:
     return model.history
@@ -317,10 +349,10 @@ predict_generator = functools.partial(
     model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
 
 
-def _get_next_batch(output_generator, mode):
+def _get_next_batch(generator, mode):
   """Retrieves the next batch of input data."""
   try:
-    generator_output = next(output_generator)
+    generator_output = next(generator)
   except (StopIteration, errors.OutOfRangeError):
     return None
   if not isinstance(generator_output, tuple):
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 6be4da70f6e60c4da4a0e96999e51cae1aabc005..9307a36140cd8c9e249143b567646b925d4fee5f 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -27,6 +27,7 @@ import numpy as np
 import six
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -34,18 +35,17 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
-from tensorflow.python.keras.engine.training_utils import set_run_eagerly_for_dict_structure
-from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
@@ -54,6 +54,129 @@ except ImportError:
   scipy_sparse = None
 
 
+class CompileTest(keras_parameterized.TestCase):
+
+  def _get_multi_output_model(self):
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    output_a = keras.layers.Dense(1, name='dense_1')(input_a)
+    output_b = keras.layers.Dense(1, name='dense_2')(input_a)
+    return keras.models.Model(input_a, [output_a, output_b])
+
+  def _do_test_compile_with_model_and_single_loss(self, model, loss):
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss, loss)
+
+    loss = losses.get(loss)
+    if not isinstance(loss, list):
+      loss_list = [loss] * len(model.outputs)
+
+    self.assertEqual(len(model.loss_functions), len(loss_list))
+    for i in range(len(loss_list)):
+      self.assertIsInstance(model.loss_functions[i], losses.LossFunctionWrapper)
+      if not isinstance(loss_list[i], losses.LossFunctionWrapper):
+        self.assertEqual(model.loss_functions[i].fn, loss_list[i])
+    self.assertAllEqual(model.loss_weights_list, [1.] * len(loss_list))
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_single_output(self, loss):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_multi_output(self, loss):
+    model = self._get_multi_output_model()
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_multi_loss(self):
+    model = self._get_multi_output_model()
+    # Test loss is a list.
+    loss = ['mse', 'mae']
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_squared_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_absolute_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+    # Test loss is a dict.
+    loss = {'dense_1': 'mae', 'dense_2': 'mse'}
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_absolute_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_squared_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_loss_weights_list(self):
+    model = self._get_multi_output_model()
+    loss_weights = [1., 2.]
+    model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+    self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+  def test_compile_with_multi_output_and_loss_weights_dict(self):
+    with context.graph_mode():
+      model = self._get_multi_output_model()
+      loss_weights = {'dense_1': 1., 'dense_2': 2.}
+      model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+      self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+      input_np = np.random.random((10, 3))
+      output_a_np = np.random.random((10, 1))
+      output_b_np = np.random.random((10, 1))
+
+      with self.cached_session() as sess:
+        sess.run(variables_lib.global_variables_initializer())
+        total_loss, y_preds = sess.run(
+            [model.total_loss, model.outputs],
+            feed_dict={
+                'input_a:0': input_np,
+                'dense_1_target:0': output_a_np,
+                'dense_2_target:0': output_b_np
+            })
+        self.assertAllClose(
+            total_loss,
+            np.mean(
+                np.add((output_a_np - y_preds[0])**2,
+                       2 * (output_b_np - y_preds[1])**2)))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError, 'The model has 1 outputs'):
+      model.compile(optimizer='adam', loss=['mse', 'mae'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss dictionary: unknown_output'):
+      model.compile(optimizer='adam', loss={'unknown_output': 'mse'})
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError,
+                                 'it should have one entry per model output'):
+      model.compile(optimizer='adam', loss='mse', loss_weights=[1., 2.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss_weights dictionary: unknown_output'):
+      model.compile(
+          optimizer='adam', loss='mse', loss_weights={'unknown_output': 1.})
+
+
 class TrainingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
@@ -431,6 +554,21 @@ class TrainingTest(keras_parameterized.TestCase):
     _ = model(x)
     self.assertEqual(1, len(model.losses))
 
+  @keras_parameterized.run_all_keras_modes
+  def test_custom_mapping_in_config(self):
+
+    class MyModel(keras.Model):
+
+      def call(self, inputs):
+        return inputs
+
+      def get_config(self):
+        self.a = {}
+        return {'a': self.a}
+
+    model = MyModel()
+    self.assertIn('{"a": {}}', model.to_json())
+
   @keras_parameterized.run_all_keras_modes
   def test_training_on_sparse_data_with_dense_placeholders(self):
     # TODO(kaftan) Test seems to not work, file ticket
@@ -566,14 +704,17 @@ class TrainingTest(keras_parameterized.TestCase):
           validation_data=(x_train, y_train))
       self.assertEqual(test_callback.batch_end_call_count, 10)
       self.assertEqual(test_callback.epoch_end_call_count, 2)
+
+      weighted_metric = ('mae'
+                         if tf2.enabled() else 'weighted_mean_absolute_error')
       self.assertSetEqual(
           set(test_callback.batch_end_logs.keys()),
-          set(['batch', 'size', 'acc', 'loss', 'weighted_mean_absolute_error']))
+          set(['batch', 'size', 'acc', 'loss', weighted_metric]))
       self.assertSetEqual(
           set(test_callback.epoch_end_logs.keys()),
           set([
-              'acc', 'loss', 'weighted_mean_absolute_error', 'val_acc',
-              'val_loss', 'val_weighted_mean_absolute_error'
+              'acc', 'loss', weighted_metric, 'val_acc', 'val_loss',
+              'val_' + weighted_metric
           ]))
 
   @keras_parameterized.run_all_keras_modes
@@ -832,6 +973,95 @@ class TrainingTest(keras_parameterized.TestCase):
         callbacks=[val_counter])
     self.assertEqual(val_counter.val_runs, expected_runs)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_add_loss_correctness(self):
+    class Bias(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+      def call(self, inputs):
+        return inputs + self.bias
+
+    inputs = keras.Input(shape=(1,))
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
+    targets = keras.Input(shape=(1,))
+
+    model.add_loss(
+        math_ops.reduce_mean(
+            keras.losses.mean_absolute_error(targets, outputs)))
+
+    # If we want to use the loss class instance as shown below, we will need to
+    # add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_loss(keras.losses.MeanAbsoluteError()(targets, outputs))
+
+    if testing_utils.should_run_eagerly():
+      with self.assertRaisesRegex(
+          ValueError,
+          'We currently do not support enabling `run_eagerly` on compile if '
+          r'`model.add_loss\(tensor\)` or `model.add_metric\(tensor\)` '
+          'has been called.'):
+        model.compile('sgd', run_eagerly=True)
+      return
+    else:
+      model.compile(
+          keras.optimizer_v2.gradient_descent.SGD(0.033333),
+          loss=keras.losses.MeanAbsoluteError(),
+          target_tensors=[targets],
+          run_eagerly=False)
+
+      x = np.array([[0.], [1.], [2.]])
+      y = np.array([[0.5], [2.], [3.5]])
+      history = model.fit(x, y, batch_size=3, epochs=5)
+      self.assertAllClose(history.history['loss'], [3., 2.7, 2.4, 2.1, 1.8],
+                          1e-3)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_clear_losses(self):
+
+    class LayerWithSharedNestedLossLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(LayerWithSharedNestedLossLayer, self).__init__()
+        self.loss_layer = keras.layers.ActivityRegularization()
+        self.add_weight(shape=(1,), regularizer='l2')
+
+      def call(self, x):
+        x = self.loss_layer(x)
+        return self.loss_layer(x)
+
+    inputs = keras.Input(shape=(1,))
+    outputs = LayerWithSharedNestedLossLayer()(inputs)
+    model = keras.Model(inputs, outputs)
+
+    model(array_ops.ones((1, 1)))
+    self.assertEqual(len(model.losses), 3)  # Weight loss + 2 activity losses.
+
+    model(array_ops.ones((1, 1)))
+    self.assertEqual(len(model.losses), 3)  # Losses are reset upon __call__.
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_layer_with_variable_output(self):
+
+    class VariableOutputLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.v = self.add_weight('output_var', shape=(2, 5), initializer='ones')
+
+      def call(self, inputs):
+        return self.v
+
+    model = testing_utils.get_model_from_layers(
+        [VariableOutputLayer(), keras.layers.Dense(1)], input_shape=(10,))
+    # TODO(omalleyt): Make this work with `run_eagerly=True`.
+    model.compile('sgd', 'mse', run_eagerly=False)
+    model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=5)
+
+    self.assertLen(model.trainable_variables, 3)
+
 
 class TestExceptionsAndWarnings(keras_parameterized.TestCase):
 
@@ -885,9 +1115,9 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase):
                 'dense_1': metrics_module.CategoricalAccuracy(),
             },
             run_eagerly=testing_utils.should_run_eagerly())
-        msg = ('Output "dense_1" missing from loss dictionary. We assume this '
+        msg = ('Output dense_1 missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
-               'expecting any data to be passed to "dense_1".')
+               'expecting any data to be passed to dense_1.')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
@@ -1247,60 +1477,38 @@ class LossWeightingTest(keras_parameterized.TestCase):
       model.fit(x, y, epochs=1, batch_size=10)
 
 
-class LossMaskingTest(keras_parameterized.TestCase):
+@keras_parameterized.run_all_keras_modes
+class MaskingTest(keras_parameterized.TestCase):
 
-  @keras_parameterized.run_all_keras_modes
-  def test_masking_graph_sequential(self):
-    if testing_utils.should_run_eagerly():
-      self.skipTest('b/120495761')
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    run_eagerly=testing_utils.should_run_eagerly())
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_masking_deferred_sequential(self):
-    if testing_utils.should_run_eagerly():
-      self.skipTest('b/120495761')
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(mask_value=0))
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    run_eagerly=testing_utils.should_run_eagerly())
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
+  def _get_model(self, input_shape=None):
+    layers = [
+        keras.layers.Masking(mask_value=0),
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(1, kernel_initializer='one'))
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape)
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
 
-  @keras_parameterized.run_all_keras_modes
-  def test_masking_functional(self):
-    if testing_utils.should_run_eagerly():
-      self.skipTest('b/120495761')
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.layers.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    run_eagerly=testing_utils.should_run_eagerly())
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
+  @keras_parameterized.run_with_all_model_types
+  def test_masking(self):
+    model = self._get_model(input_shape=(2, 1))
+    x = np.array([[[1], [1]], [[0], [0]]])
+    y = np.array([[[1], [1]], [[1], [1]]])
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(loss, 0)
+
+  @keras_parameterized.run_with_all_model_types(exclude_models='functional')
+  def test_masking_deferred(self):
+    model = self._get_model()
+    x = np.array([[[1], [1]], [[0], [0]]])
+    y = np.array([[[1], [1]], [[1], [1]]])
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(loss, 0)
 
-  @keras_parameterized.run_all_keras_modes
   def test_mask_argument_in_layer(self):
     # Test that the mask argument gets correctly passed to a layer in the
     # functional API.
@@ -1318,35 +1526,18 @@ class LossMaskingTest(keras_parameterized.TestCase):
       def compute_output_shape(self, input_shape):
         return input_shape
 
-    with self.cached_session():
-      x = np.random.random((5, 3))
-      inputs = keras.layers.Input((3,))
-      masked = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = CustomMaskedLayer()(masked)
-
-      model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
-                    run_eagerly=testing_utils.should_run_eagerly())
-      y = np.random.random((5, 3))
-      model.train_on_batch(x, y)
+    x = np.random.random((5, 3))
+    inputs = keras.layers.Input((3,))
+    masked = keras.layers.Masking(mask_value=0)(inputs)
+    outputs = CustomMaskedLayer()(masked)
 
-  def test_loss_masking(self):
-    with self.cached_session():
-      weighted_loss = weighted_masked_objective(keras.losses.get('mae'))
-      shape = (3, 4, 2)
-      x = np.arange(24).reshape(shape)
-      y = 2 * x
-
-      # Normally the trailing 1 is added by standardize_weights
-      weights = np.ones((3,))
-      mask = np.ones((3, 4))
-      mask[1, 0] = 0
-
-      keras.backend.eval(
-          weighted_loss(
-              keras.backend.variable(x),
-              keras.backend.variable(y),
-              keras.backend.variable(weights), keras.backend.variable(mask)))
+    model = keras.Model(inputs, outputs)
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
+    y = np.random.random((5, 3))
+    model.train_on_batch(x, y)
 
 
 class TestDynamicTrainability(keras_parameterized.TestCase):
@@ -2095,9 +2286,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     metrics = ['mse', metrics_module.BinaryAccuracy()]
     model.compile(optimizer, loss='mae', metrics=metrics,
                   run_eagerly=testing_utils.should_run_eagerly())
+
+    mse_metric = 'mse' if tf2.enabled() else 'mean_squared_error'
     reference_metric_names = [
-        'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
-        'dense_binary_accuracy', 'dropout_mean_squared_error',
+        'loss', 'dense_loss', 'dropout_loss', 'dense_' + mse_metric,
+        'dense_binary_accuracy', 'dropout_' + mse_metric,
         'dropout_binary_accuracy'
     ]
     self.assertEqual(reference_metric_names, model.metrics_names)
@@ -2114,69 +2307,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            3, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='mae',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        run_eagerly=testing_utils.should_run_eagerly())
-
-    # verify correctness of stateful and stateless metrics.
-    x = np.ones((100, 4))
-    y = np.ones((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 1.)
-    self.assertEqual(outs[2], 1.)
-
-    y = np.zeros((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness_with_weighted_metrics(self):
-    np.random.seed(1337)
-    x = np.array([[[1.], [1.]], [[0.], [0.]]])
-    model = keras.models.Sequential()
-    model.add(
-        keras.layers.TimeDistributed(
-            keras.layers.Dense(1, kernel_initializer='ones'),
-            input_shape=(2, 1)))
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001),
-        loss='mse',
-        sample_weight_mode='temporal',
-        weighted_metrics=['accuracy', 'mse'],
-        run_eagerly=testing_utils.should_run_eagerly())
-    y = np.array([[[1.], [1.]], [[1.], [1.]]])
-
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs, [0.5, 0.5, 0.5])
-
-    w = np.array([[0., 0.], [0., 0.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertEqual(outs, [0., 0., 0.])
-
-    w = np.array([[3., 4.], [1., 2.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.75, 0.7, 0.3], .001)
-
-    # Verify that metric value is same with arbitrary weights and batch size.
-    x = np.random.random((50, 2, 1))
-    y = np.random.random((50, 2, 1))
-    w = np.random.random((50, 2))
-    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[2]
-    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
-    self.assertNear(mse1, mse2, err=1e-7)
-
   @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
@@ -2199,6 +2329,67 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
+  @keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+  @keras_parameterized.run_all_keras_modes
+  def test_metrics_valid_compile_input_formats(self):
+    inp_1 = keras.layers.Input(shape=(1,), name='input_1')
+    inp_2 = keras.layers.Input(shape=(1,), name='input_2')
+    x = keras.layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    # list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[keras.metrics.MeanSquaredError()],
+        weighted_metrics=[keras.metrics.MeanSquaredError()],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # list of list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        weighted_metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # dict of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        weighted_metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        run_eagerly=testing_utils.should_run_eagerly())
+
   @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
@@ -2216,6 +2407,17 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
           metrics=metrics_module.CategoricalAccuracy(),
           run_eagerly=testing_utils.should_run_eagerly())
 
+    inp = keras.layers.Input(shape=(1,))
+    x = keras.layers.Dense(3, activation='relu')(inp)
+    out_1 = keras.layers.Dense(1, activation='sigmoid', name='output_1')(x)
+    out_2 = keras.layers.Dense(1, activation='sigmoid', name='output_2')(x)
+    model = keras.models.Model(inp, [out_1, out_2])
+    with self.assertRaisesRegex(
+        ValueError, 'When passing a list of lists as `metrics`, '
+        'it should have one entry per model output. '
+        'The model has 2 outputs, but you passed metrics='):
+      model.compile('rmsprop', loss='mse', metrics=[['mse']])
+
   @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
     if testing_utils.should_run_eagerly():
@@ -2244,40 +2446,50 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       scores = model.train_on_batch(x, y, sample_weight=w)
       self.assertArrayNear(scores, [0.3328, 0.8], 0.001)
 
-  @tf_test_util.run_deprecated_v1
-  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_with_tensor_on_model(self):
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
 
-      # test with a metric which does not have the standard signature:
-      # (y_true, y_pred, sample_Weight)
+    # test with a metric which does not have the standard signature:
+    # (y_true, y_pred, sample_Weight)
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse')
 
-      inputs = np.ones(shape=(10, 1))
-      targets = np.ones(shape=(10, 1))
-      history = model.fit(
-          inputs,
-          targets,
-          epochs=2,
-          batch_size=5,
-          validation_data=(inputs, targets))
-      self.assertEqual(history.history['metric_1'][-1], 5)
-      self.assertEqual(history.history['metric_2'][-1], 1)
-      self.assertEqual(history.history['val_metric_1'][-1], 5)
-      self.assertEqual(history.history['val_metric_2'][-1], 1)
+    if testing_utils.should_run_eagerly():
+      with self.assertRaisesRegex(
+          ValueError,
+          'We currently do not support enabling `run_eagerly` on compile if '
+          r'`model.add_loss\(tensor\)` or `model.add_metric\(tensor\)` '
+          'has been called.'):
+        model.compile('sgd', run_eagerly=True)
+      return
+    else:
+      model.compile('sgd', loss='mse', run_eagerly=False)
 
-      eval_results = model.evaluate(inputs, targets, batch_size=5)
-      self.assertEqual(eval_results[-1], 1)
-      self.assertEqual(eval_results[-2], 5)
+    inputs = np.ones(shape=(10, 1))
+    targets = np.ones(shape=(10, 1))
+    history = model.fit(
+        inputs,
+        targets,
+        epochs=2,
+        batch_size=5,
+        validation_data=(inputs, targets))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertEqual(history.history['metric_2'][-1], 1)
+    self.assertEqual(history.history['val_metric_1'][-1], 5)
+    self.assertEqual(history.history['val_metric_2'][-1], 1)
 
-      model.predict(inputs, batch_size=5)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
+    eval_results = model.evaluate(inputs, targets, batch_size=5)
+    self.assertEqual(eval_results[-1], 1)
+    self.assertEqual(eval_results[-2], 5)
+
+    model.predict(inputs, batch_size=5)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
 
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_model_call(self):
@@ -2317,6 +2529,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_layer_call(self):
 
@@ -2332,9 +2545,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
             math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
         return inputs + 1
 
-    model = keras.Sequential()
-    model.add(TestLayer(input_shape=(1,)))
-    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    layers = [
+        TestLayer(input_shape=(1,)),
+        keras.layers.Dense(2, kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
     model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
                   run_eagerly=testing_utils.should_run_eagerly())
 
@@ -2344,60 +2559,63 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     self.assertEqual(history.history['metric_1'][-1], 5)
     self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
 
-  @tf_test_util.run_deprecated_v1
+  @keras_parameterized.run_all_keras_modes
   def test_model_metrics_list(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse', metrics=['acc'])
-
-      # Verify that the metrics added using `compile` and `add_metric` API are
-      # included
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
-
-  def test_model_eager_metrics_list(self):
-    with context.eager_mode():
 
-      class TestModel(keras.Model):
+    if testing_utils.should_run_eagerly():
+      with self.assertRaisesRegex(
+          ValueError,
+          'We currently do not support enabling `run_eagerly` on compile if '
+          r'`model.add_loss\(tensor\)` or `model.add_metric\(tensor\)` '
+          'has been called.'):
+        model.compile('sgd', run_eagerly=True)
+      return
+    else:
+      model.compile(
+          'sgd',
+          loss='mse',
+          metrics=[metrics_module.Accuracy('acc')],
+          run_eagerly=False)
 
-        def __init__(self):
-          super(TestModel, self).__init__(name='test_model')
-          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+    # Verify that the metrics added using `compile` and `add_metric` API are
+    # included
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics],
+                     ['acc', 'metric_1', 'metric_2'])
 
-        def call(self, x):
-          self.add_metric(
-              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
-          return self.dense1(x)
+  @keras_parameterized.run_all_keras_modes
+  def test_model_metrics_list_in_call(self):
 
-      model = TestModel()
-      model.compile(
-          loss='mse',
-          optimizer=RMSPropOptimizer(0.01),
-          metrics=['acc'],
-          run_eagerly=True)
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(0.01),
+        metrics=[metrics_module.Accuracy('acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics], ['acc', 'metric_1'])
 
   @keras_parameterized.run_all_keras_modes
   def test_multiple_add_metric_calls(self):
@@ -2435,28 +2653,34 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
-  def test_invalid_metric_tensor_in_call(self):
-    with context.eager_mode():
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_invalid_metric_tensor(self):
 
-      class TestLayer(keras.layers.Layer):
+    class TestLayer(keras.layers.Layer):
 
-        def call(self, inputs):
-          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
-          return inputs + 1
+      def build(self, input_shape):
+        self.built = True
 
-      model = keras.Sequential()
-      model.add(TestLayer(input_shape=(1,)))
-      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
-      model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+      def call(self, inputs):
+        self.add_metric(math_ops.reduce_mean(inputs), name='metric_1')
+        return inputs + 1
 
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      with self.assertRaisesRegexp(
-          ValueError,
-          'We do not support adding an aggregated metric tensor in `call` in '
-          'eager execution.'):
-        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    layers = [TestLayer(input_shape=(1,))]
+    layers.append(keras.layers.Dense(2, kernel_initializer='ones'))
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'We do not support adding an aggregated metric result tensor that is '
+        'not the output of a `tf.keras.metrics.Metric` metric instance.'):
+      model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
   def test_duplicate_metric_name_in_add_metric(self):
@@ -2486,10 +2710,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
-  def test_multiple_no_name_input_to_add_metric(self):
-    # TODO(kaftan) Test seems to not work, file ticket
-    if testing_utils.should_run_eagerly() and context.executing_eagerly():
-      self.skipTest('Skipping running model eagerly.')
+  def test_add_metric_without_name(self):
 
     class TestModel(keras.Model):
 
@@ -2498,7 +2719,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
         self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
 
       def call(self, x):
-        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         return self.dense1(x)
 
@@ -2507,81 +2727,163 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
                   run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
-    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
-    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
 
-  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-  def test_a1_total_loss_available_with_dict_dataset(self):
+    with self.assertRaisesRegex(ValueError,
+                                'Please provide a name for your metric like'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
-    class TestModel(keras.models.Model):
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_correctness(self):
+    inputs = keras.Input(shape=(1,))
+    targets = keras.Input(shape=(1,))
 
-      def call(self, inputs, training=None, mask=None):
-        return math_ops.to_float(inputs['id'])
+    class Bias(keras.layers.Layer):
 
-    model = TestModel()
-    model.compile(
-        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
-        run_eagerly=testing_utils.should_run_eagerly())
-    dataset = dataset_ops.Dataset.from_tensor_slices(({
-        'id': [[6], [3], [1]]
-    }, [[0.7], [0.4], [0.2]]))
-    val_dataset = dataset_ops.Dataset.from_tensor_slices(({
-        'id': [[8], [5]]
-    }, [[0.9], [0.6]]))
-    history = model.fit(
-        dataset,
-        steps_per_epoch=2,
-        validation_data=val_dataset,
-        validation_steps=2)
-    self.assertAlmostEqual(history.history['val_loss'][0], 34.885, 2)
-    model.evaluate(dataset, steps=30)
-    model.predict([7])
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+        self.mae = metrics_module.MeanAbsoluteError(name='mae_1')
 
-  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-  def test_total_loss_available_with_dict_array(self):
+      def call(self, inputs):
+        outputs = inputs + self.bias
+        self.add_metric(self.mae(targets, outputs), name='mae_1')
+        return outputs
 
-    class TestModel(keras.models.Model):
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
 
-      def call(self, inputs, training=None, mask=None):
-        return math_ops.to_float(inputs['id'])
+    model.add_metric(
+        metrics_module.mean_absolute_error(targets, outputs),
+        name='mae_2',
+        aggregation='mean')
 
-    model = TestModel()
-    model.compile(
-        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
-        run_eagerly=testing_utils.should_run_eagerly())
-    x = {'id': np.array([[3], [1]])}
-    y = np.array([[4], [2]])
-    val_dataset = (x, y)
-    history = model.fit(
-        x,
-        y,
-        batch_size=32,
-        steps_per_epoch=2,
-        validation_data=val_dataset,
-        validation_steps=2)
-    self.assertAlmostEqual(history.history['val_loss'][0], 1.0, 2)
-    model.evaluate(x, y)
-    model.predict([7])
-
-  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-  def test_set_run_eagerly_for_dict_structure(self):
-    test_model = keras.models.Model()
-    self.assertFalse(test_model.run_eagerly)
-    set_run_eagerly_for_dict_structure(
-        test_model,
-        {'a': 2})
-    self.assertTrue(test_model.run_eagerly)
-
-  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-  def test_set_run_eagerly_for_dict_dataset(self):
-    test_model = keras.models.Model()
-    self.assertFalse(test_model.run_eagerly)
-    set_run_eagerly_for_dict_structure(
-        test_model,
-        dataset_ops.Dataset.from_tensor_slices(({
-            'id': [[3], [1]]
-        }, [[0.5], [0.2]])))
-    self.assertTrue(test_model.run_eagerly)
+    # If we want to use the metric class instance as shown below, we will need
+    # to add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_metric(
+          metrics_module.MeanAbsoluteError(name='mae_3')(targets, outputs))
+
+    if testing_utils.should_run_eagerly():
+      with self.assertRaisesRegex(
+          ValueError,
+          'We currently do not support enabling `run_eagerly` on compile if '
+          r'`model.add_loss\(tensor\)` or `model.add_metric\(tensor\)` '
+          'has been called.'):
+        model.compile('sgd', run_eagerly=True)
+      return
+    else:
+      model.compile(
+          loss='mae',
+          optimizer=keras.optimizer_v2.gradient_descent.SGD(0.1),
+          metrics=[metrics_module.MeanAbsoluteError(name='mae_4')],
+          target_tensors=[targets],
+          run_eagerly=False)
+
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    history = model.fit(x, y, batch_size=3, epochs=5)
+
+    expected_val = [1., 0.9, 0.8, 0.7, 0.6]
+    for key in ['loss', 'mae_1', 'mae_2', 'mae_3', 'mae_4']:
+      self.assertAllClose(history.history[key], expected_val, 1e-3)
+
+
+class BareUpdateLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.counter = self.add_weight(
+        'counter',
+        dtype='int32',
+        shape=(),
+        initializer='zeros',
+        trainable=False)
+
+  def call(self, inputs):
+    state_ops.assign_add(self.counter, 1)
+    return math_ops.cast(self.counter, inputs.dtype) * inputs
+
+
+class AddUpdateLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.counter = self.add_weight(
+        'counter',
+        dtype='int32',
+        shape=(),
+        initializer='zeros',
+        trainable=False)
+
+  def call(self, inputs):
+    # Make sure update isn't run twice.
+    self.add_update(state_ops.assign_add(self.counter, 1))
+    return math_ops.cast(self.counter, inputs.dtype) * inputs
+
+
+class NestedUpdateLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.layer = BareUpdateLayer()
+    self.layer.build(input_shape)
+
+  @property
+  def counter(self):
+    return self.layer.counter
+
+  def call(self, inputs):
+    return self.layer(inputs)
+
+
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TestAutoUpdates(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @parameterized.named_parameters(('bare_update', BareUpdateLayer()),
+                                  ('add_update', AddUpdateLayer()),
+                                  ('nested_update', NestedUpdateLayer()))
+  def test_updates_in_model(self, layer):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    model = testing_utils.get_model_from_layers(
+        [layer, keras.layers.Dense(1)], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, batch_size=2, epochs=1)
+    if not testing_utils.should_run_eagerly():
+      # Check that `trainable=False` disables updates.
+      layer.trainable = False
+      model.compile(
+          'sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, batch_size=2, epochs=1)
+    self.assertEqual(self.evaluate(layer.counter), 5)
+
+  @parameterized.named_parameters(('bare_update', BareUpdateLayer()),
+                                  ('add_update', AddUpdateLayer()),
+                                  ('nested_update', NestedUpdateLayer()))
+  def test_updates_standalone_layer(self, layer):
+    y = layer(np.ones((10, 10)))
+    self.evaluate(layer.counter.initializer)
+    self.evaluate(y)
+    self.assertEqual(self.evaluate(layer.counter), 1)
+
+  def test_trainable_false(self):
+    x = keras.backend.placeholder(shape=(10, 10), dtype='float32')
+    layer = NestedUpdateLayer()
+    layer.trainable = False
+    y = layer(x)
+    func = keras.backend.function([x], [y])
+    x_val = np.ones((10, 10))
+    func(x_val)
+    counter = keras.backend.get_value(layer.counter)
+    self.assertEqual(counter, 0)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_batchnorm_trainable_false(self):
+    bn = keras.layers.BatchNormalization()
+    bn.trainable = False
+    model = testing_utils.get_model_from_layers([bn, keras.layers.Dense(1)],
+                                                input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    model.fit(x, y, batch_size=2, epochs=1)
+    self.assertAllEqual(self.evaluate(bn.moving_mean), np.zeros((10,)))
+    self.assertAllEqual(self.evaluate(bn.moving_variance), np.ones((10,)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 7e5bc08e5e462f58743184b477fde52d7fb6c6aa..ee3bec11de5b7aa54e9f5b55627774a2eba5c7d8 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Training-related utilities.
-"""
+"""Training-related utilities."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -21,19 +20,18 @@ from __future__ import print_function
 import abc
 import collections
 from collections import OrderedDict
-import copy
-import json
-import os
 
 import numpy as np
 import six
 
+from tensorflow.python import tf2
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -42,14 +40,11 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 
 
@@ -189,10 +184,7 @@ def slice_arrays(arrays, indices, contiguous=True):
   return slices
 
 
-def check_num_samples(ins,
-                      batch_size=None,
-                      steps=None,
-                      steps_name='steps'):
+def check_num_samples(ins, batch_size=None, steps=None, steps_name='steps'):
   """Determine the number of samples provided for training and evaluation.
 
   The number of samples is not defined when running with `steps`,
@@ -201,9 +193,8 @@ def check_num_samples(ins,
   Arguments:
       ins: List of tensors to be fed to the Keras function.
       batch_size: Integer batch size or `None` if not defined.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
+      steps: Total number of steps (batches of samples) before declaring
+        `_predict_loop` finished. Ignored with the default value of `None`.
       steps_name: The public API's parameter name for `steps`.
 
   Raises:
@@ -217,13 +208,10 @@ def check_num_samples(ins,
       processed based on the size of the first dimension of the
       first input numpy array. When steps is not `None` and
       `batch_size` is `None`, returns `None`.
-
-  Raises:
-      ValueError: In case of invalid arguments.
   """
   if steps is not None and batch_size is not None:
-    raise ValueError(
-        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+    raise ValueError('If ' + steps_name +
+                     ' is set, the `batch_size` must be None.')
   if check_steps_argument(ins, steps, steps_name):
     return None
   if hasattr(ins[0], 'shape'):
@@ -236,9 +224,8 @@ def standardize_single_array(x, expected_shape=None):
   if x is None:
     return None
 
-  if (x.shape is not None
-      and len(x.shape) == 1
-      and (expected_shape is None or len(expected_shape) != 1)):
+  if (x.shape is not None and len(x.shape) == 1 and
+      (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
       x = array_ops.expand_dims(x, axis=1)
     else:
@@ -262,9 +249,8 @@ def standardize_input_data(data,
       data: User-provided input data (polymorphic).
       names: List of expected array names.
       shapes: Optional list of expected array shapes.
-      check_batch_axis: Boolean; whether to check that
-          the batch axis of the arrays matches the expected
-          value found in `shapes`.
+      check_batch_axis: Boolean; whether to check that the batch axis of the
+        arrays matches the expected value found in `shapes`.
       exception_prefix: String prefix used for exception formatting.
 
   Returns:
@@ -276,8 +262,9 @@ def standardize_input_data(data,
   if not names:
     if (data is not None and hasattr(data, '__len__') and len(data) and
         not isinstance(data, dict)):
-      raise ValueError('Error when checking model ' + exception_prefix + ': '
-                       'expected no data, but got:', data)
+      raise ValueError(
+          'Error when checking model ' + exception_prefix + ': '
+          'expected no data, but got:', data)
     return []
   if data is None:
     return [None for _ in range(len(names))]
@@ -305,8 +292,9 @@ def standardize_input_data(data,
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
   if shapes is not None:
-    data = [standardize_single_array(x, shape)
-            for (x, shape) in zip(data, shapes)]
+    data = [
+        standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
+    ]
   else:
     data = [standardize_single_array(x) for x in data]
 
@@ -319,11 +307,11 @@ def standardize_input_data(data,
                        'but instead got the following list of ' +
                        str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
     elif len(names) > 1:
-      raise ValueError(
-          'Error when checking model ' + exception_prefix +
-          ': you are passing a list as input to your model, '
-          'but the model expects a list of ' + str(len(names)) +
-          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': you are passing a list as input to your model, '
+                       'but the model expects a list of ' + str(len(names)) +
+                       ' Numpy arrays instead. The list you passed was: ' +
+                       str(data)[:200])
     elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
                       ': data should be a Numpy array, or list/dict of '
@@ -353,10 +341,10 @@ def standardize_input_data(data,
           shape = shape[1:]
         for dim, ref_dim in zip(data_shape, shape):
           if ref_dim != dim and ref_dim is not None and dim is not None:
-            raise ValueError(
-                'Error when checking ' + exception_prefix + ': expected ' +
-                names[i] + ' to have shape ' + str(shape) +
-                ' but got array with shape ' + str(data_shape))
+            raise ValueError('Error when checking ' + exception_prefix +
+                             ': expected ' + names[i] + ' to have shape ' +
+                             str(shape) + ' but got array with shape ' +
+                             str(data_shape))
   return data
 
 
@@ -398,10 +386,10 @@ def standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError(
-        'The model has multiple outputs, so `' + weight_type + '` '
-        'should be either a list or a dict. '
-        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
+    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
+                    'should be either a list or a dict. '
+                    'Provided `' + weight_type + '` type not understood: ' +
+                    str(x_weight))
 
 
 def standardize_class_weights(class_weight, output_names):
@@ -432,8 +420,11 @@ def check_array_lengths(inputs, targets, weights=None):
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x
-                  if y is not None and not tensor_util.is_tensor(y)])
+      return set([
+          y.shape[0]
+          for y in x
+          if y is not None and not tensor_util.is_tensor(y)
+      ])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -477,17 +468,20 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
       ValueError: if a loss function or target array
           is incompatible with an output.
   """
-  key_losses = {
+  key_loss_fns = {
       losses.mean_squared_error, losses.binary_crossentropy,
       losses.categorical_crossentropy
   }
+  key_loss_classes = (losses.MeanSquaredError, losses.BinaryCrossentropy,
+                      losses.CategoricalCrossentropy)
   for y, loss, shape in zip(targets, loss_fns, output_shapes):
     if y is None or loss is None or tensor_util.is_tensor(y):
       continue
-    if loss is losses.categorical_crossentropy:
+    if losses.is_categorical_crossentropy(loss):
       if y.shape[-1] == 1:
-        raise ValueError('You are passing a target array of shape ' + str(
-            y.shape) + ' while using as loss `categorical_crossentropy`. '
+        raise ValueError('You are passing a target array of shape ' +
+                         str(y.shape) +
+                         ' while using as loss `categorical_crossentropy`. '
                          '`categorical_crossentropy` expects '
                          'targets to be binary matrices (1s and 0s) '
                          'of shape (samples, classes). '
@@ -501,14 +495,20 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
                          'Alternatively, you can use the loss function '
                          '`sparse_categorical_crossentropy` instead, '
                          'which does expect integer targets.')
-    if loss in key_losses:
+
+    is_loss_wrapper = isinstance(loss, losses.LossFunctionWrapper)
+    if (isinstance(loss, key_loss_classes) or (is_loss_wrapper and
+                                               (loss.fn in key_loss_fns))):
       for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
         if out_dim is not None and target_dim != out_dim:
+          loss_name = loss.name
+          if loss_name is None:
+            loss_type = loss.fn if is_loss_wrapper else type(loss)
+            loss_name = loss_type.__name__
           raise ValueError('A target array with shape ' + str(y.shape) +
                            ' was passed for an output of shape ' + str(shape) +
-                           ' while using as loss `' + loss.__name__ + '`. '
-                           'This loss expects '
-                           'targets to have the same shape '
+                           ' while using as loss `' + loss_name + '`. '
+                           'This loss expects targets to have the same shape '
                            'as the output.')
 
 
@@ -516,45 +516,57 @@ def collect_per_output_metric_info(metrics,
                                    output_names,
                                    output_shapes,
                                    loss_fns,
-                                   sample_weights=None):
+                                   is_weighted=False):
   """Maps metric names and functions to model outputs.
 
   Arguments:
-      metrics: a list or dict of metric functions.
+      metrics: a list or a list of lists or a dict of metric functions.
       output_names: a list of the names (strings) of model outputs.
       output_shapes: a list of the shapes (strings) of model outputs.
       loss_fns: a list of the loss functions corresponding to the model outputs.
-      sample_weights: a list of weights to be applied on the model outputs.
+      is_weighted: Boolean indicating whether the given metrics are weighted.
 
   Returns:
       A list (one entry per model output) of dicts.
       For instance, if the model has 2 outputs, and for the first output
       we want to compute "binary_accuracy" and "binary_crossentropy",
       and just "binary_accuracy" for the second output,
-      the list would look like: `[
-        {
-          'acc': (binary_accuracy(), mean_obj_1),
-          'ce': (binary_crossentropy(), mean_obj_2)
-        },
-        {
-          'acc': (binary_accuracy(), mean_obj_3)
-        }
-      ]`
+      the list would look like: `[{
+          'acc': binary_accuracy(),
+          'ce': binary_crossentropy(),
+        }, {
+          'acc': binary_accuracy(),
+        }]`
 
   Raises:
       TypeError: if an incorrect type is passed for the `metrics` argument.
   """
   if not metrics:
     return [{} for _ in output_names]
+
   if isinstance(metrics, list):
-    # we then apply all metrics to all outputs.
-    nested_metrics = [copy.copy(metrics) for _ in output_names]
+    any_sub_list = any(isinstance(m, list) for m in metrics)
+    if any_sub_list:
+      if len(metrics) != len(output_names):
+        raise ValueError('When passing a list of lists as `metrics`, '
+                         'it should have one entry per model output. '
+                         'The model has ' + str(len(output_names)) +
+                         ' outputs, but you passed metrics=' + str(metrics))
+      # User has provided a list of len = len(outputs).
+      nested_metrics = [generic_utils.to_list(m) for m in metrics]
+    else:
+      # If it is a single list we then apply all metrics to all outputs.
+      if len(output_names) > 1:
+        nested_metrics = []
+        for _ in output_names:
+          nested_metrics.append(
+              [metrics_module.clone_metric(m) for m in metrics])
+      else:
+        nested_metrics = [metrics]
   elif isinstance(metrics, dict):
     nested_metrics = []
     for name in output_names:
-      output_metrics = metrics.get(name, [])
-      if not isinstance(output_metrics, list):
-        output_metrics = [output_metrics]
+      output_metrics = generic_utils.to_list(metrics.get(name, []))
       nested_metrics.append(output_metrics)
   else:
     raise TypeError('Type of `metrics` argument not understood. '
@@ -564,24 +576,15 @@ def collect_per_output_metric_info(metrics,
   for i, metrics in enumerate(nested_metrics):
     metrics_dict = OrderedDict()
     for metric in metrics:
-      weighted = False if (sample_weights is None) else (
-          sample_weights[i] is not None)
-      metric_name = get_metric_name(metric, weighted)
+      metric_name = get_metric_name(metric, is_weighted)
       metric_fn = get_metric_function(
           metric, output_shape=output_shapes[i], loss_fn=loss_fns[i])
 
-      # If the metric function is not stateful, we create a stateful version and
-      # return both the stateless and the stateful version together. For batch
-      # APIs like `train_on_batch` we will use the stateless version and for
-      # other APIs like `fit` we will use the stateful version.
-      is_stateful = isinstance(metric_fn,
-                               base_layer.Layer) and metric_fn.stateful
-      stateful_fn = metric_fn
-      if not is_stateful:
-        stateful_fn = metrics_module.MeanMetricWrapper(
-            metric_fn, name=metric_fn.__name__)
-
-      metrics_dict[metric_name] = (metric_fn, stateful_fn)
+      # If the metric function is not stateful, we create a stateful version.
+      if not isinstance(metric_fn, metrics_module.Metric):
+        metric_fn = metrics_module.MeanMetricWrapper(
+            metric_fn, name=metric_name)
+      metrics_dict[metric_name] = metric_fn
     per_output_metrics.append(metrics_dict)
 
   return per_output_metrics
@@ -611,71 +614,6 @@ def batch_shuffle(index_array, batch_size):
   return np.append(index_array, last_batch)
 
 
-def weighted_masked_objective(fn):
-  """Adds support for masking and sample-weighting to an objective function.
-
-  It transforms an objective function `fn(y_true, y_pred)`
-  into a sample-weighted, cost-masked objective function
-  `fn(y_true, y_pred, weights, mask)`.
-
-  Arguments:
-      fn: The objective function to wrap,
-          with signature `fn(y_true, y_pred)`.
-
-  Returns:
-      A function with signature `fn(y_true, y_pred, weights, mask)`.
-  """
-  if fn is None:
-    return None
-
-  def weighted(y_true, y_pred, weights, mask=None):
-    """Wrapper function.
-
-    Arguments:
-        y_true: `y_true` argument of `fn`.
-        y_pred: `y_pred` argument of `fn`.
-        weights: Weights tensor.
-        mask: Mask tensor.
-
-    Returns:
-        Scalar tensor.
-    """
-    # score_array has ndim >= 2
-    score_array = fn(y_true, y_pred)
-    if mask is not None:
-      mask = math_ops.cast(mask, y_pred.dtype)
-      # Update weights with mask.
-      if weights is None:
-        weights = mask
-      else:
-        # Update dimensions of weights to match with mask if possible.
-        mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
-        weights *= mask
-
-    # Apply sample weighting.
-    if weights is not None:
-
-      # Update dimensions of weights to match with values if possible.
-      score_array, _, weights = squeeze_or_expand_dimensions(
-          score_array, None, weights)
-      try:
-        # Broadcast weights if possible.
-        weights = weights_broadcast_ops.broadcast_weights(weights, score_array)
-      except ValueError:
-        # Reduce values to same ndim as weight array.
-        ndim = K.ndim(score_array)
-        weight_ndim = K.ndim(weights)
-        score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
-
-      score_array = math_ops.multiply(score_array, weights)
-      score_array = math_ops.reduce_sum(score_array)
-      weights = math_ops.reduce_sum(weights)
-      score_array = math_ops.div_no_nan(score_array, weights)
-    return K.mean(score_array)
-
-  return weighted
-
-
 def standardize_weights(y,
                         sample_weight=None,
                         class_weight=None,
@@ -690,10 +628,10 @@ def standardize_weights(y,
       y: Numpy array of model targets to be weighted.
       sample_weight: User-provided `sample_weight` argument.
       class_weight: User-provided `class_weight` argument.
-      sample_weight_mode: One of `None` or `"temporal"`.
-          `"temporal"` indicated that we expect 2D weight data
-          that will be applied to the last 2 dimensions of
-          the targets (i.e. we are weighting timesteps, not samples).
+      sample_weight_mode: One of `None` or `"temporal"`. `"temporal"` indicated
+        that we expect 2D weight data that will be applied to the last 2
+        dimensions of the targets (i.e. we are weighting timesteps, not
+        samples).
 
   Returns:
       A numpy array of target weights, one entry per sample to weight.
@@ -734,17 +672,17 @@ def standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError(
-          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
-          'Expected sample_weight with rank '
-          'less than or equal to ' + str(len(y.shape)))
+      raise ValueError('Found a sample_weight with shape' +
+                       str(sample_weight.shape) + '.'
+                       'Expected sample_weight with rank '
+                       'less than or equal to ' + str(len(y.shape)))
 
     if (not tensor_util.is_tensor(sample_weight) and
         y.shape[:sample_weight.ndim] != sample_weight.shape):
-      raise ValueError(
-          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
-          ' for an input with shape ' + str(y.shape) + '. '
-          'sample_weight cannot be broadcast.')
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + ' for an input with shape ' +
+                       str(y.shape) + '. '
+                       'sample_weight cannot be broadcast.')
 
   # Class weights applied per-sample.
   class_sample_weight = None
@@ -768,10 +706,10 @@ def standardize_weights(y,
       # subtract the sets to pick all missing classes
       existing_classes = set(y_classes)
       existing_class_weight = set(class_weight.keys())
-      raise ValueError('`class_weight` must contain all classes in the data.'
-                       ' The classes %s exist in the data but not in '
-                       '`class_weight`.' %
-                       (existing_classes - existing_class_weight))
+      raise ValueError(
+          '`class_weight` must contain all classes in the data.'
+          ' The classes %s exist in the data but not in '
+          '`class_weight`.' % (existing_classes - existing_class_weight))
 
   if class_sample_weight is not None and sample_weight is not None:
     # Multiply weights if both are provided.
@@ -807,21 +745,29 @@ def get_metric_name(metric, weighted=False):
   Returns:
       The metric name.
   """
-  metric_name_prefix = 'weighted_' if weighted else ''
-  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
-    if metric in ('accuracy', 'acc'):
-      suffix = 'acc'
-    elif metric in ('crossentropy', 'ce'):
-      suffix = 'ce'
+  if tf2.enabled():
+    # We keep the string that the user has set in compile as the metric name.
+    if isinstance(metric, six.string_types):
+      return metric
+
+    metric = metrics_module.get(metric)
+    return metric.name if hasattr(metric, 'name') else metric.__name__
   else:
-    metric_fn = metrics_module.get(metric)
-    # Get metric name as string
-    if hasattr(metric_fn, 'name'):
-      suffix = metric_fn.name
+    metric_name_prefix = 'weighted_' if weighted else ''
+    if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+      if metric in ('accuracy', 'acc'):
+        suffix = 'acc'
+      elif metric in ('crossentropy', 'ce'):
+        suffix = 'ce'
     else:
-      suffix = metric_fn.__name__
-  metric_name = metric_name_prefix + suffix
-  return metric_name
+      metric_fn = metrics_module.get(metric)
+      # Get metric name as string
+      if hasattr(metric_fn, 'name'):
+        suffix = metric_fn.name
+      else:
+        suffix = metric_fn.__name__
+    metric_name = metric_name_prefix + suffix
+    return metric_name
 
 
 def get_metric_function(metric, output_shape=None, loss_fn=None):
@@ -829,29 +775,41 @@ def get_metric_function(metric, output_shape=None, loss_fn=None):
 
   Arguments:
       metric: Metric function name or reference.
-      output_shape: The shape of the output that this metric
-          will be calculated for.
+      output_shape: The shape of the output that this metric will be calculated
+        for.
       loss_fn: The loss function used.
 
   Returns:
       The metric function.
   """
+  if metric not in ['accuracy', 'acc', 'crossentropy', 'ce']:
+    return metrics_module.get(metric)
+
+  is_sparse_categorical_crossentropy = (
+      isinstance(loss_fn, losses.SparseCategoricalCrossentropy) or
+      (isinstance(loss_fn, losses.LossFunctionWrapper) and
+       loss_fn.fn == losses.sparse_categorical_crossentropy))
+
+  is_binary_crossentropy = (
+      isinstance(loss_fn, losses.BinaryCrossentropy) or
+      (isinstance(loss_fn, losses.LossFunctionWrapper) and
+       loss_fn.fn == losses.binary_crossentropy))
+
   if metric in ['accuracy', 'acc']:
-    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
-      return metrics_module.binary_accuracy  # case: binary accuracy
-    elif loss_fn == losses.sparse_categorical_crossentropy:
-      # case: categorical accuracy with sparse targets
+    if output_shape[-1] == 1 or is_binary_crossentropy:
+      return metrics_module.binary_accuracy
+    elif is_sparse_categorical_crossentropy:
       return metrics_module.sparse_categorical_accuracy
-    return metrics_module.categorical_accuracy  # case: categorical accuracy
-  elif metric in ['crossentropy', 'ce']:
-    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
-      return metrics_module.binary_crossentropy  # case: binary cross-entropy
-    elif loss_fn == losses.sparse_categorical_crossentropy:
-      # case: categorical cross-entropy with sparse targets
+    # If the output_shape[-1] is not 1, then we know output is `categorical`.
+    # We assume it is sparse categorical only if loss is explicitly given
+    # as sparse categorical crossentropy loss.
+    return metrics_module.categorical_accuracy
+  else:
+    if output_shape[-1] == 1 or is_binary_crossentropy:
+      return metrics_module.binary_crossentropy
+    elif is_sparse_categorical_crossentropy:
       return metrics_module.sparse_categorical_crossentropy
-    # case: categorical cross-entropy
     return metrics_module.categorical_crossentropy
-  return metrics_module.get(metric)
 
 
 def call_metric_function(metric_fn, y_true, y_pred, weights=None, mask=None):
@@ -875,10 +833,18 @@ def get_loss_function(loss):
   if loss is None or isinstance(loss, losses.Loss):
     return loss
 
-  # TODO(psv): After we have added all V2 losses, update this function.
-  if loss in ['mse', 'MSE', 'mean_squared_error']:
-    return losses.MeanSquaredError()
-  return losses.get(loss)
+  # Deserialize loss configuration, if needed.
+  if isinstance(loss, collections.Mapping):
+    loss = losses.get(loss)
+
+  # Custom callable class.
+  if callable(loss) and not hasattr(loss, '__name__'):
+    return loss
+
+  # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
+  # in `LossFunctionWrapper` class.
+  loss_fn = losses.get(loss)
+  return losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
 
 
 def validate_dataset_input(x, y, sample_weight, validation_split=None):
@@ -887,13 +853,13 @@ def validate_dataset_input(x, y, sample_weight, validation_split=None):
   Arguments:
     x: Input data. A `tf.data` dataset or iterator.
     y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s).
-        Expected to be `None` when `x` is a dataset iterator.
-    sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`. Expected to be `None` when
-        `x` is a dataset iterator
-    validation_split: Float between 0 and 1. Fraction of the training data to
-        be used as validation data. Expected to be `None` when `x` is a dataset
-        iterator.
+      Expected to be `None` when `x` is a dataset iterator.
+    sample_weight: An optional sample-weight array passed by the user to weight
+      the importance of each sample in `x`. Expected to be `None` when `x` is a
+      dataset iterator
+    validation_split: Float between 0 and 1. Fraction of the training data to be
+      used as validation data. Expected to be `None` when `x` is a dataset
+      iterator.
 
   Raises:
     ValueError: if argument `y` or `sample_weight` or `validation_split` are
@@ -919,8 +885,7 @@ def validate_dataset_input(x, y, sample_weight, validation_split=None):
         'Received: x=%s, validation_split=%f' % (x, validation_split))
 
 
-def check_generator_arguments(y=None,
-                              sample_weight=None,
+def check_generator_arguments(y=None, sample_weight=None,
                               validation_split=None):
   """Validates arguments passed when using a generator."""
   if y is not None:
@@ -960,8 +925,8 @@ def check_steps_argument(input_data, steps, steps_name):
         but not provided.
   """
   # TODO(fchollet): allow datasets with steps=None if cardinality is known.
-  is_x_iterator = isinstance(input_data, (iterator_ops.Iterator,
-                                          iterator_ops.EagerIterator))
+  is_x_iterator = isinstance(
+      input_data, (iterator_ops.Iterator, iterator_ops.EagerIterator))
   if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
       (isinstance(input_data, list) and not input_data)):
     if steps is None:
@@ -1079,6 +1044,95 @@ def prepare_sample_weights(output_names, sample_weight_mode,
   return sample_weights, sample_weight_modes
 
 
+def prepare_loss_functions(loss, output_names):
+  """Converts loss to a list of loss functions.
+
+  Arguments:
+      loss: String (name of objective function), objective function or
+        `tf.losses.Loss` instance. See `tf.losses`. If the model has multiple
+        outputs, you can use a different loss on each output by passing a
+        dictionary or a list of losses. The loss value that will be minimized by
+        the model will then be the sum of all individual losses.
+      output_names: List of model output names.
+
+  Returns:
+      A list of loss objective functions.
+
+  Raises:
+      ValueError: If loss is a dict with keys not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if isinstance(loss, collections.Mapping):
+    for name in loss:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss dictionary: {}. Only expected '
+                         'following keys: {}'.format(name, output_names))
+    loss_functions = []
+    for name in output_names:
+      if name not in loss:
+        logging.warning(
+            'Output {0} missing from loss dictionary. We assume '
+            'this was done on purpose. The fit and evaluate APIs will not be '
+            'expecting any data to be passed to {0}.'.format(name))
+      loss_functions.append(get_loss_function(loss.get(name, None)))
+  elif isinstance(loss, six.string_types):
+    loss_functions = [get_loss_function(loss) for _ in output_names]
+  elif isinstance(loss, collections.Sequence):
+    if len(loss) != len(output_names):
+      raise ValueError('When passing a list as loss, it should have one entry '
+                       'per model outputs. The model has {} outputs, but you '
+                       'passed loss={}'.format(len(output_names), loss))
+    loss_functions = nest.map_structure(get_loss_function, loss)
+  else:
+    loss_functions = [get_loss_function(loss) for _ in range(len(output_names))]
+
+  return loss_functions
+
+
+def prepare_loss_weights(output_names, loss_weights=None):
+  """Converts loss weights to a list of loss weights.
+
+  Arguments:
+      output_names: List of model output names.
+      loss_weights: Optional list or dictionary specifying scalar coefficients
+        (Python floats) to weight the loss contributions of different model
+        outputs. The loss value that will be minimized by the model will then be
+        the *weighted sum* of all individual losses, weighted by the
+          `loss_weights` coefficients. If a list, it is expected to have a 1:1
+            mapping to the model's outputs. If a dict, it is expected to map
+            output names (strings) to scalar coefficients.
+
+  Returns:
+      A list of loss weights of python floats.
+
+  Raises:
+      ValueError: If loss weight is a dict with key not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if loss_weights is None:
+    weights_list = [1.] * len(output_names)
+  elif isinstance(loss_weights, dict):
+    for name in loss_weights:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss_weights dictionary: {}. '
+                         'Only expected the following keys: {}'.format(
+                             name, output_names))
+    weights_list = [loss_weights.get(name, 1.) for name in output_names]
+  elif isinstance(loss_weights, list):
+    if len(loss_weights) != len(output_names):
+      raise ValueError('When passing a list as loss_weights, '
+                       'it should have one entry per model output. '
+                       'The model has ' + str(len(output_names)) +
+                       ' outputs, but you passed loss_weights=' +
+                       str(loss_weights))
+    weights_list = loss_weights
+  else:
+    raise TypeError('Could not interpret loss_weights argument: ' +
+                    str(loss_weights) + ' - expected a list of dicts.')
+
+  return weights_list
+
+
 # TODO(rohanj): This is a hack to get around not depending on feature_column and
 # create a cyclical dependency. Figure out a cleaner solution
 def is_feature_layer(layer):
@@ -1088,8 +1142,7 @@ def is_feature_layer(layer):
 
 def is_eager_dataset_or_iterator(data):
   return context.executing_eagerly() and isinstance(
-      data, (dataset_ops.DatasetV1,
-             dataset_ops.DatasetV2,
+      data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
              iterator_ops.EagerIterator))
 
 
@@ -1226,10 +1279,8 @@ def verify_dataset_shuffled(x):
 
 
 def is_dataset_or_iterator(data):
-  return isinstance(data, (dataset_ops.DatasetV1,
-                           dataset_ops.DatasetV2,
-                           iterator_ops.EagerIterator,
-                           iterator_ops.Iterator))
+  return isinstance(data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+                           iterator_ops.EagerIterator, iterator_ops.Iterator))
 
 
 def get_iterator(dataset):
@@ -1242,7 +1293,7 @@ def get_iterator(dataset):
 def initialize_iterator(iterator):
   init_op = iterator.initializer
   if not context.executing_eagerly():
-    K.get_session().run(init_op)
+    K.get_session((init_op,)).run(init_op)
 
 
 def extract_tensors_from_dataset(dataset):
@@ -1317,7 +1368,7 @@ def infer_steps_for_dataset(dataset, steps, epochs=1, steps_name='steps'):
   if size == cardinality.INFINITE and steps is None:
     raise ValueError('When passing an infinitely repeating dataset, you '
                      'must specify the `%s` argument.' % (steps_name,))
-  if size != cardinality.UNKNOWN:
+  if size >= 0:
     if steps is not None and steps * epochs > size:
       if epochs > 1:
         raise ValueError('The dataset you passed contains %s batches, but you '
@@ -1389,7 +1440,10 @@ class ModelInputs(object):
         # we have. The user should call `model._set_inputs(placeholders)`
         # to specify custom placeholders if the need arises.
         shape = (None,) + tuple(v.shape[1:])
-        v = K.placeholder(shape=shape, name=k)
+        dtype = dtypes.as_dtype(v.dtype)
+        if dtype.is_floating:
+          dtype = K.floatx()
+        v = K.placeholder(shape=shape, name=k, dtype=dtype)
       elif isinstance(v, tensor_shape.TensorShape):
         shape = (None,) + tuple(v.as_list()[1:])
         v = K.placeholder(shape=shape, name=k)
@@ -1425,7 +1479,7 @@ def get_input_shape_and_dtype(layer):
       does not have a defined input shape.
 
   Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
+    ValueError: in case an empty Sequential or Functional model is passed.
   """
 
   def _is_graph_model(layer):
@@ -1467,28 +1521,6 @@ def generic_output_names(outputs_list):
   return ['output_%d' % (i + 1) for i in range(len(outputs_list))]
 
 
-def set_run_eagerly_for_dict_structure(model, x):
-  """Set model.run_eagerly to true if x is dict structure.
-
-  Set model.run_eagerly to true if x is dict or
-  Iterator/EagerIterator/Dataset of dict.
-
-  Args:
-    model: A Keras model.
-    x: Input data.
-  """
-  if not context.executing_eagerly():
-    return
-  if isinstance(x, dict):
-    model.run_eagerly = True
-  if (isinstance(x, (iterator_ops.Iterator, iterator_ops.EagerIterator,
-                     dataset_ops.DatasetV2))):
-    for item in x.output_shapes:
-      if isinstance(item, dict):
-        model.run_eagerly = True
-        return
-
-
 def convert_eager_tensors_to_numpy(structure):
   """Convert every EagerTensor in `structure` to NumPy.
 
@@ -1508,13 +1540,6 @@ def convert_eager_tensors_to_numpy(structure):
   return nest.map_structure(_convert, structure)
 
 
-def should_run_multi_worker():
-  """Whether a model should be run using DistributedCoordinator."""
-  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
-  cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
-  return tf_config and 'master' not in cluster_spec.jobs
-
-
 def should_run_validation(validation_freq, epoch):
   """Checks if validation should be run this epoch.
 
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index ec44252c9b4dc56bc535b851b7336856cb8765f6..30e3d1f3e4029ce837647352f19fd70486d7bf1a 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -25,8 +25,11 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend
 from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.platform import test
@@ -44,10 +47,11 @@ class ModelInputsTest(test.TestCase):
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
     self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
+    self.assertEqual(backend.floatx(), vals[0].dtype)
 
   def test_single_thing_eager(self):
     with context.eager_mode():
-      a = np.ones(10)
+      a = np.ones(10, dtype=np.int32)
       model_inputs = training_utils.ModelInputs(a)
       self.assertEqual(['input_1'], model_inputs.get_input_names())
       val = model_inputs.get_symbolic_inputs()
@@ -55,6 +59,7 @@ class ModelInputsTest(test.TestCase):
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
       self.assertEqual(1, len(vals))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
+      self.assertEqual(dtypes.int32, vals[0].dtype)
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
@@ -225,6 +230,25 @@ class StandardizeWeightsTest(keras_parameterized.TestCase):
     expected = sample_weights * np.array([0.5, 1., 0.5, 0.5, 1.5])
     self.assertAllClose(weights, expected)
 
+  def test_dataset_with_class_weight(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse')
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    class_weight_np = np.array([0.25, 0.25, 0.25, 0.25])
+    class_weight = dict(enumerate(class_weight_np))
+
+    model.fit(
+        dataset,
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=1,
+        class_weight=class_weight)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py
index ac55ff965e693905407a534f083c8fab3f679c21..572ac9f6e4f25091ec24ebaf90ce1a0e4e23ae51 100644
--- a/tensorflow/python/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import six
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import init_ops_v2
 
 # These imports are brought in so that keras.initializers.deserialize
 # has them available in module_objects.
@@ -160,9 +162,20 @@ def serialize(initializer):
 
 @keras_export('keras.initializers.deserialize')
 def deserialize(config, custom_objects=None):
+  """Return an `Initializer` object from its config."""
+  if tf2.enabled():
+    # Class names are the same for V1 and V2 but the V2 classes
+    # are aliased in this file so we need to grab them directly
+    # from `init_ops_v2`.
+    module_objects = {
+        obj_name: getattr(init_ops_v2, obj_name)
+        for obj_name in dir(init_ops_v2)
+    }
+  else:
+    module_objects = globals()
   return deserialize_keras_object(
       config,
-      module_objects=globals(),
+      module_objects=module_objects,
       custom_objects=custom_objects,
       printable_module_name='initializer')
 
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 36f2d405326f4bb96027d8022545c585072dcc98..f7ca58080b2b868793601b8f5e145abe0b6e6df9 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -21,11 +21,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class KerasInitializersTest(test.TestCase):
 
   def _runner(self, init, shape, target_mean=None, target_std=None,
@@ -39,146 +41,178 @@ class KerasInitializersTest(test.TestCase):
     output_2 = keras.backend.get_value(variable)
     self.assertAllClose(output, output_2, atol=1e-4)
 
-  @test_util.run_deprecated_v1
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
     with self.cached_session():
-      self._runner(keras.initializers.RandomUniform(minval=-1,
-                                                    maxval=1,
-                                                    seed=124),
-                   tensor_shape,
-                   target_mean=0., target_max=1, target_min=-1)
+      self._runner(
+          keras.initializers.RandomUniformV2(minval=-1, maxval=1, seed=124),
+          tensor_shape,
+          target_mean=0.,
+          target_max=1,
+          target_min=-1)
 
-  @test_util.run_deprecated_v1
   def test_normal(self):
     tensor_shape = (8, 12, 99)
     with self.cached_session():
-      self._runner(keras.initializers.RandomNormal(mean=0, stddev=1, seed=153),
-                   tensor_shape,
-                   target_mean=0., target_std=1)
+      self._runner(
+          keras.initializers.RandomNormalV2(mean=0, stddev=1, seed=153),
+          tensor_shape,
+          target_mean=0.,
+          target_std=1)
 
-  @test_util.run_deprecated_v1
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
     with self.cached_session():
-      self._runner(keras.initializers.TruncatedNormal(mean=0,
-                                                      stddev=1,
-                                                      seed=126),
-                   tensor_shape,
-                   target_mean=0., target_max=2, target_min=-2)
+      self._runner(
+          keras.initializers.TruncatedNormalV2(mean=0, stddev=1, seed=126),
+          tensor_shape,
+          target_mean=0.,
+          target_max=2,
+          target_min=-2)
 
-  @test_util.run_deprecated_v1
   def test_constant(self):
     tensor_shape = (5, 6, 4)
     with self.cached_session():
-      self._runner(keras.initializers.Constant(2), tensor_shape,
-                   target_mean=2, target_max=2, target_min=2)
+      self._runner(
+          keras.initializers.ConstantV2(2.),
+          tensor_shape,
+          target_mean=2,
+          target_max=2,
+          target_min=2)
 
-  @test_util.run_deprecated_v1
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
-      self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.lecun_uniformV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.GlorotUniformV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
-      self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.he_uniformV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
-      self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.lecun_normalV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.GlorotNormalV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
-      self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=std)
+      self._runner(
+          keras.initializers.he_normalV2(seed=123),
+          tensor_shape,
+          target_mean=0.,
+          target_std=std)
 
-  @test_util.run_deprecated_v1
   def test_orthogonal(self):
     tensor_shape = (20, 20)
     with self.cached_session():
-      self._runner(keras.initializers.orthogonal(seed=123), tensor_shape,
-                   target_mean=0.)
+      self._runner(
+          keras.initializers.OrthogonalV2(seed=123),
+          tensor_shape,
+          target_mean=0.)
 
-  @test_util.run_deprecated_v1
   def test_identity(self):
     with self.cached_session():
       tensor_shape = (3, 4, 5)
       with self.assertRaises(ValueError):
-        self._runner(keras.initializers.identity(), tensor_shape,
-                     target_mean=1. / tensor_shape[0], target_max=1.)
+        self._runner(
+            keras.initializers.IdentityV2(),
+            tensor_shape,
+            target_mean=1. / tensor_shape[0],
+            target_max=1.)
 
       tensor_shape = (3, 3)
-      self._runner(keras.initializers.identity(), tensor_shape,
-                   target_mean=1. / tensor_shape[0], target_max=1.)
+      self._runner(
+          keras.initializers.IdentityV2(),
+          tensor_shape,
+          target_mean=1. / tensor_shape[0],
+          target_max=1.)
 
-  @test_util.run_deprecated_v1
   def test_zero(self):
     tensor_shape = (4, 5)
     with self.cached_session():
-      self._runner(keras.initializers.zeros(), tensor_shape,
-                   target_mean=0., target_max=0.)
+      self._runner(
+          keras.initializers.ZerosV2(),
+          tensor_shape,
+          target_mean=0.,
+          target_max=0.)
 
-  @test_util.run_deprecated_v1
   def test_one(self):
     tensor_shape = (4, 5)
     with self.cached_session():
-      self._runner(keras.initializers.ones(), tensor_shape,
-                   target_mean=1., target_max=1.)
+      self._runner(
+          keras.initializers.OnesV2(),
+          tensor_shape,
+          target_mean=1.,
+          target_max=1.)
 
-  @test_util.run_deprecated_v1
   def test_default_random_uniform(self):
     ru = keras.initializers.get('uniform')
     self.assertEqual(ru.minval, -0.05)
     self.assertEqual(ru.maxval, 0.05)
 
-  @test_util.run_deprecated_v1
   def test_default_random_normal(self):
     rn = keras.initializers.get('normal')
     self.assertEqual(rn.mean, 0.0)
     self.assertEqual(rn.stddev, 0.05)
 
-  @test_util.run_deprecated_v1
   def test_default_truncated_normal(self):
     tn = keras.initializers.get('truncated_normal')
     self.assertEqual(tn.mean, 0.0)
     self.assertEqual(tn.stddev, 0.05)
 
+  def test_initializer_v2_get(self):
+    tf2_force_enabled = tf2._force_enable  # pylint: disable=protected-access
+    try:
+      tf2.enable()
+      rn = keras.initializers.get('random_normal')
+      self.assertIn('init_ops_v2', rn.__class__.__module__)
+    finally:
+      tf2._force_enable = tf2_force_enabled  # pylint: disable=protected-access
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index dc8d1deddb26172a724deaf51a0403302554d9f2..7250db2f99bcc68ca562564ce798c9f9f7020c35 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import nn_ops as nn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.platform import test
 
@@ -46,10 +49,11 @@ class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
          keras.layers.Dropout(0.1),
          keras.layers.Dense(y_train.shape[-1], activation='softmax')],
         input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=10, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -81,10 +85,11 @@ class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
     y = base_model(x)
     y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
     model = keras.models.Model(x, y)
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     if not testing_utils.should_run_eagerly():
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
@@ -118,10 +123,11 @@ class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
     ]
     model = testing_utils.get_model_from_layers(
         layers, input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=15, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -146,10 +152,11 @@ class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
     model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
                                                 activation='softmax',
                                                 dtype=dtypes.float32)))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=15, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -183,10 +190,11 @@ class ImageClassificationIntegrationTest(keras_parameterized.TestCase):
     ]
     model = testing_utils.get_model_from_layers(
         layers, input_shape=x_train.shape[1:])
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
-                  metrics=['accuracy'],
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
     history = model.fit(x_train, y_train, epochs=10, batch_size=10,
                         validation_data=(x_train, y_train),
                         verbose=2)
@@ -197,5 +205,47 @@ class ImageClassificationIntegrationTest(keras_parameterized.TestCase):
     self.assertEqual(predictions.shape, (x_train.shape[0], 2))
 
 
+@keras_parameterized.run_all_keras_modes
+class ActivationV2IntegrationTest(keras_parameterized.TestCase):
+  """Tests activation function V2 in model exporting and loading.
+
+  This test is to verify in TF 2.x, when 'tf.nn.softmax' is used as an
+  activition function, its model exporting and loading work as expected.
+  Check b/123041942 for details.
+  """
+
+  def test_serialization_v2_model(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = keras.Sequential([
+        keras.layers.Flatten(input_shape=x_train.shape[1:]),
+        keras.layers.Dense(10, activation=nn.relu),
+        # To mimic 'tf.nn.softmax' used in TF 2.x.
+        keras.layers.Dense(y_train.shape[-1], activation=nn.softmax_v2),
+    ])
+
+    # Check if 'softmax' is in model.get_config().
+    last_layer_activation = model.get_layer(index=2).get_config()['activation']
+    self.assertEqual(last_layer_activation, 'softmax')
+
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x_train, y_train, epochs=2, batch_size=10,
+              validation_data=(x_train, y_train),
+              verbose=2)
+
+    output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model')
+    keras.saving.saved_model.export_saved_model(model, output_path)
+    loaded_model = keras.saving.saved_model.load_from_saved_model(output_path)
+    self.assertEqual(model.summary(), loaded_model.summary())
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 285388f340fc9aa6890a7d141127d1192d565528..016cb116823d2a65a63f40ea23140c038bdc079d 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -109,7 +109,12 @@ from tensorflow.python.keras.layers.noise import GaussianNoise
 from tensorflow.python.keras.layers.noise import GaussianDropout
 
 # Normalization layers.
+from tensorflow.python.keras.layers.normalization import LayerNormalization
 from tensorflow.python.keras.layers.normalization import BatchNormalization
+from tensorflow.python.keras.layers.normalization_v2 import BatchNormalization as BatchNormalizationV2
+
+# Kernelized layers.
+from tensorflow.python.keras.layers.kernelized import RandomFourierFeatures
 
 # Pooling layers.
 from tensorflow.python.keras.layers.pooling import MaxPooling1D
@@ -141,16 +146,18 @@ from tensorflow.python.keras.layers.pooling import GlobalMaxPool3D
 
 # Recurrent layers.
 from tensorflow.python.keras.layers.recurrent import RNN
+from tensorflow.python.keras.layers.recurrent import AbstractRNNCell
 from tensorflow.python.keras.layers.recurrent import StackedRNNCells
 from tensorflow.python.keras.layers.recurrent import SimpleRNNCell
 from tensorflow.python.keras.layers.recurrent import GRUCell
 from tensorflow.python.keras.layers.recurrent import LSTMCell
 from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
+
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
-from tensorflow.python.keras.layers.recurrent import UnifiedGRU
-from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
+from tensorflow.python.keras.layers.recurrent_v2 import GRU as GRU_v2
+from tensorflow.python.keras.layers.recurrent_v2 import LSTM as LSTM_v2
 
 # Convolutional-recurrent layers.
 from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index 5095287430735b4d370b0545c3971da14a4c0b6d..b339de0fa0eea22a6a809e5d5cd2e3570d01b60f 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -38,15 +38,15 @@ class LeakyReLU(Layer):
   `f(x) = x for x >= 0`.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      alpha: float >= 0. Negative slope coefficient.
+    alpha: Float >= 0. Negative slope coefficient.
 
   """
 
@@ -78,26 +78,25 @@ class PReLU(Layer):
   where `alpha` is a learned array with the same shape as x.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      alpha_initializer: initializer function for the weights.
-      alpha_regularizer: regularizer for the weights.
-      alpha_constraint: constraint for the weights.
-      shared_axes: the axes along which to share learnable
-          parameters for the activation function.
-          For example, if the incoming feature maps
-          are from a 2D convolution
-          with output shape `(batch, height, width, channels)`,
-          and you wish to share parameters across space
-          so that each filter only has one set of parameters,
-          set `shared_axes=[1, 2]`.
-
+    alpha_initializer: Initializer function for the weights.
+    alpha_regularizer: Regularizer for the weights.
+    alpha_constraint: Constraint for the weights.
+    shared_axes: The axes along which to share learnable
+      parameters for the activation function.
+      For example, if the incoming feature maps
+      are from a 2D convolution
+      with output shape `(batch, height, width, channels)`,
+      and you wish to share parameters across space
+      so that each filter only has one set of parameters,
+      set `shared_axes=[1, 2]`.
   """
 
   def __init__(self,
@@ -139,7 +138,7 @@ class PReLU(Layer):
     self.input_spec = InputSpec(ndim=len(input_shape), axes=axes)
     self.built = True
 
-  def call(self, inputs, mask=None):
+  def call(self, inputs):
     pos = K.relu(inputs)
     neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -168,16 +167,15 @@ class ELU(Layer):
   `f(x) = x for x >= 0`.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      alpha: scale for the negative factor.
-
+    alpha: Scale for the negative factor.
   """
 
   def __init__(self, alpha=1.0, **kwargs):
@@ -207,16 +205,15 @@ class ThresholdedReLU(Layer):
   `f(x) = 0 otherwise`.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      theta: float >= 0. Threshold location of activation.
-
+    theta: Float >= 0. Threshold location of activation.
   """
 
   def __init__(self, theta=1.0, **kwargs):
@@ -224,7 +221,7 @@ class ThresholdedReLU(Layer):
     self.supports_masking = True
     self.theta = K.cast_to_floatx(theta)
 
-  def call(self, inputs, mask=None):
+  def call(self, inputs):
     return inputs * math_ops.cast(
         math_ops.greater(inputs, self.theta), K.floatx())
 
@@ -243,15 +240,15 @@ class Softmax(Layer):
   """Softmax activation function.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      axis: Integer, axis along which the softmax normalization is applied.
+    axis: Integer, axis along which the softmax normalization is applied.
   """
 
   def __init__(self, axis=-1, **kwargs):
@@ -284,17 +281,17 @@ class ReLU(Layer):
   `f(x) = negative_slope * (x - threshold)` otherwise.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as the input.
+    Same shape as the input.
 
   Arguments:
-      max_value: float >= 0. Maximum activation value.
-      negative_slope: float >= 0. Negative slope coefficient.
-      threshold: float. Threshold value for thresholded activation.
+    max_value: Float >= 0. Maximum activation value.
+    negative_slope: Float >= 0. Negative slope coefficient.
+    threshold: Float. Threshold value for thresholded activation.
   """
 
   def __init__(self, max_value=None, negative_slope=0, threshold=0, **kwargs):
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index f32bb457c825d9769c6dccf625d9318c07843237..f04185417effae2b705a610edddd97a2ccf2ad74 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.keras import keras_parameterized
@@ -88,6 +90,13 @@ class AdvancedActivationsTest(keras_parameterized.TestCase):
             kwargs={'negative_slope': -2},
             input_shape=(2, 3, 4))
 
+  @keras_parameterized.run_with_all_model_types
+  def test_layer_as_activation(self):
+    layer = keras.layers.Dense(1, activation=keras.layers.ReLU())
+    model = testing_utils.get_model_from_layers([layer], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 30b919cc0a9038cf0eeb10a240105fbabd591efa..fb6d175ac168f3c920546f30380348ada60009c1 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -46,7 +46,7 @@ from tensorflow.python.util.tf_export import keras_export
 
 
 class Conv(Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
+  """Abstract N-D convolution layer (private, used as implementation base).
 
   This layer creates a convolution kernel that is convolved
   (actually cross-correlated) with the layer input to produce a tensor of
@@ -91,8 +91,8 @@ class Conv(Layer):
         not safe to use when doing asynchronous distributed training.
     bias_constraint: Optional projection function to be applied to the
         bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    trainable: Boolean, if `True` the weights of this layer will be marked as
+      trainable (and listed in `layer.trainable_weights`).
     name: A string, the name of the layer.
   """
 
@@ -289,45 +289,45 @@ class Conv1D(Conv):
   or `(None, 128)` for variable-length sequences of 128-dimensional vectors.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of a single integer,
-          specifying the length of the 1D convolution window.
-      strides: An integer or tuple/list of a single integer,
-          specifying the stride length of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive).
-          `"causal"` results in causal (dilated) convolutions, e.g. output[t]
-          does not depend on input[t+1:]. Useful when modeling temporal data
-          where the model should not violate the temporal order.
-          See [WaveNet: A Generative Model for Raw Audio, section
-            2.1](https://arxiv.org/abs/1609.03499).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-      dilation_rate: an integer or tuple/list of a single integer, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to the kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer,
+      specifying the length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive).
+      `"causal"` results in causal (dilated) convolutions, e.g. output[t]
+      does not depend on input[t+1:]. Useful when modeling temporal data
+      where the model should not violate the temporal order.
+      See [WaveNet: A Generative Model for Raw Audio, section
+        2.1](https://arxiv.org/abs/1609.03499).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+    dilation_rate: an integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to the kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      3D tensor with shape: `(batch_size, steps, input_dim)`
+    3D tensor with shape: `(batch_size, steps, input_dim)`
 
   Output shape:
-      3D tensor with shape: `(batch_size, new_steps, filters)`
+    3D tensor with shape: `(batch_size, new_steps, filters)`
       `steps` value might have changed due to padding or strides.
   """
 
@@ -390,61 +390,61 @@ class Conv2D(Conv):
   in `data_format="channels_last"`.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          height and width of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the height and width.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: an integer or tuple/list of 2 integers, specifying
-          the dilation rate to use for dilated convolution.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any stride value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to the kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: an integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to the kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      4D tensor with shape:
-      `(samples, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(samples, rows, cols, channels)` if data_format='channels_last'.
+    4D tensor with shape:
+    `(samples, channels, rows, cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(samples, rows, cols, channels)` if data_format='channels_last'.
 
   Output shape:
-      4D tensor with shape:
-      `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
+    4D tensor with shape:
+    `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
   """
 
   def __init__(self,
@@ -502,67 +502,67 @@ class Conv3D(Conv):
   in `data_format="channels_last"`.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of 3 integers, specifying the
-          depth, height and width of the 3D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 3 integers,
-          specifying the strides of the convolution along each spatial
-            dimension.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: an integer or tuple/list of 3 integers, specifying
-          the dilation rate to use for dilated convolution.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any stride value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to the kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along each spatial
+        dimension.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: an integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to the kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      5D tensor with shape:
-      `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if
-        data_format='channels_first'
-      or 5D tensor with shape:
-      `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if
-        data_format='channels_last'.
+    5D tensor with shape:
+    `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if
+      data_format='channels_first'
+    or 5D tensor with shape:
+    `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if
+      data_format='channels_last'.
 
   Output shape:
-      5D tensor with shape:
-      `(samples, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if
-        data_format='channels_first'
-      or 5D tensor with shape:
-      `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if
-        data_format='channels_last'.
-      `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have
-        changed due to padding.
+    5D tensor with shape:
+    `(samples, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if
+      data_format='channels_first'
+    or 5D tensor with shape:
+    `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if
+      data_format='channels_last'.
+    `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have
+      changed due to padding.
   """
 
   def __init__(self,
@@ -621,75 +621,75 @@ class Conv2DTranspose(Conv2D):
   in `data_format="channels_last"`.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          height and width of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the height and width.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
-      output_padding: An integer or tuple/list of 2 integers,
-          specifying the amount of padding along the height and width
-          of the output tensor.
-          Can be a single integer to specify the same value for all
-          spatial dimensions.
-          The amount of output padding along a given dimension must be
-          lower than the stride along that same dimension.
-          If set to `None` (default), the output shape is inferred.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: an integer or tuple/list of 2 integers, specifying
-          the dilation rate to use for dilated convolution.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any stride value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to the kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    output_padding: An integer or tuple/list of 2 integers,
+      specifying the amount of padding along the height and width
+      of the output tensor.
+      Can be a single integer to specify the same value for all
+      spatial dimensions.
+      The amount of output padding along a given dimension must be
+      lower than the stride along that same dimension.
+      If set to `None` (default), the output shape is inferred.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: an integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to the kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      4D tensor with shape:
-      `(batch, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, rows, cols, channels)` if data_format='channels_last'.
+    4D tensor with shape:
+    `(batch, channels, rows, cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(batch, rows, cols, channels)` if data_format='channels_last'.
 
   Output shape:
-      4D tensor with shape:
-      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
+    4D tensor with shape:
+    `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
 
   References:
-      - [A guide to convolution arithmetic for deep
-        learning](https://arxiv.org/abs/1603.07285v1)
-      - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+    - [A guide to convolution arithmetic for deep
+      learning](https://arxiv.org/abs/1603.07285v1)
+    - [Deconvolutional
+      Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
@@ -893,86 +893,78 @@ class Conv3DTranspose(Conv3D):
   if `data_format="channels_last"`.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of 3 integers, specifying the
-          depth, height and width of the 3D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 3 integers,
-          specifying the strides of the convolution along the depth, height
-            and width.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
-      output_padding: An integer or tuple/list of 3 integers,
-          specifying the amount of padding along the depth, height, and
-          width.
-          Can be a single integer to specify the same value for all
-          spatial dimensions.
-          The amount of output padding along a given dimension must be
-          lower than the stride along that same dimension.
-          If set to `None` (default), the output shape is inferred.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, depth, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, depth, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: an integer or tuple/list of 3 integers, specifying
-          the dilation rate to use for dilated convolution.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any stride value != 1.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
-      kernel_constraint: Constraint function applied to the kernel matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+    filters: Integer, the dimensionality of the output space
+        (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+        depth, height and width of the 3D convolution window.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+        specifying the strides of the convolution along the depth, height
+          and width.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    output_padding: An integer or tuple/list of 3 integers,
+      specifying the amount of padding along the depth, height, and
+      width.
+      Can be a single integer to specify the same value for all
+      spatial dimensions.
+      The amount of output padding along a given dimension must be
+      lower than the stride along that same dimension.
+      If set to `None` (default), the output shape is inferred.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: an integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation").
+    kernel_constraint: Constraint function applied to the kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      5D tensor with shape:
-      `(batch, channels, depth, rows, cols)` if data_format='channels_first'
-      or 5D tensor with shape:
-      `(batch, depth, rows, cols, channels)` if data_format='channels_last'.
+    5D tensor with shape:
+    `(batch, channels, depth, rows, cols)` if data_format='channels_first'
+    or 5D tensor with shape:
+    `(batch, depth, rows, cols, channels)` if data_format='channels_last'.
 
   Output shape:
-      5D tensor with shape:
-      `(batch, filters, new_depth, new_rows, new_cols)` if
-        data_format='channels_first'
-      or 5D tensor with shape:
-      `(batch, new_depth, new_rows, new_cols, filters)` if
-        data_format='channels_last'.
-      `depth` and `rows` and `cols` values might have changed due to padding.
+    5D tensor with shape:
+    `(batch, filters, new_depth, new_rows, new_cols)` if
+      data_format='channels_first'
+    or 5D tensor with shape:
+    `(batch, new_depth, new_rows, new_cols, filters)` if
+      data_format='channels_last'.
+    `depth` and `rows` and `cols` values might have changed due to padding.
 
   References:
-      - [A guide to convolution arithmetic for deep
-        learning](https://arxiv.org/abs/1603.07285v1)
-      - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+    - [A guide to convolution arithmetic for deep
+      learning](https://arxiv.org/abs/1603.07285v1)
+    - [Deconvolutional
+      Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
@@ -1219,17 +1211,17 @@ class SeparableConv(Conv):
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
     depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
+      depthwise kernel after being updated by an `Optimizer` (e.g. used for
+      norm constraints or value constraints for layer weights). The function
+      must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are
+      not safe to use when doing asynchronous distributed training.
     pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
+      pointwise kernel after being updated by an `Optimizer`.
     bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` the weights of this layer will be marked as
+      trainable (and listed in `layer.trainable_weights`).
     name: A string, the name of the layer.
   """
 
@@ -1423,17 +1415,17 @@ class SeparableConv1D(SeparableConv):
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
     depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
+      depthwise kernel after being updated by an `Optimizer` (e.g. used for
+      norm constraints or value constraints for layer weights). The function
+      must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are
+      not safe to use when doing asynchronous distributed training.
     pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
+      pointwise kernel after being updated by an `Optimizer`.
     bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` the weights of this layer will be marked as
+      trainable (and listed in `layer.trainable_weights`).
     name: A string, the name of the layer.
   """
 
@@ -1541,69 +1533,69 @@ class SeparableConv2D(SeparableConv):
   or as an extreme version of an Inception block.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          height and width of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the height and width.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of 2 integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix.
-      pointwise_initializer: Initializer for the pointwise kernel matrix.
-      bias_initializer: Initializer for the bias vector.
-      depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix.
-      pointwise_regularizer: Regularizer function applied to
-          the pointwise kernel matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix.
-      pointwise_constraint: Constraint function applied to
-          the pointwise kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    depth_multiplier: The number of depthwise convolution output channels
+      for each input channel.
+      The total number of depthwise convolution output
+      channels will be equal to `filters_in * depth_multiplier`.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    depthwise_initializer: Initializer for the depthwise kernel matrix.
+    pointwise_initializer: Initializer for the pointwise kernel matrix.
+    bias_initializer: Initializer for the bias vector.
+    depthwise_regularizer: Regularizer function applied to
+      the depthwise kernel matrix.
+    pointwise_regularizer: Regularizer function applied to
+      the pointwise kernel matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    depthwise_constraint: Constraint function applied to
+      the depthwise kernel matrix.
+    pointwise_constraint: Constraint function applied to
+      the pointwise kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      4D tensor with shape:
-      `(batch, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, rows, cols, channels)` if data_format='channels_last'.
+    4D tensor with shape:
+    `(batch, channels, rows, cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(batch, rows, cols, channels)` if data_format='channels_last'.
 
   Output shape:
-      4D tensor with shape:
-      `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
+    4D tensor with shape:
+    `(batch, filters, new_rows, new_cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(batch, new_rows, new_cols, filters)` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
   """
 
   def __init__(self,
@@ -1688,43 +1680,43 @@ class DepthwiseConv2D(Conv2D):
 
   Arguments:
     kernel_size: An integer or tuple/list of 2 integers, specifying the
-        height and width of the 2D convolution window.
-        Can be a single integer to specify the same value for
-        all spatial dimensions.
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
     strides: An integer or tuple/list of 2 integers,
-        specifying the strides of the convolution along the height and width.
-        Can be a single integer to specify the same value for
-        all spatial dimensions.
-        Specifying any stride value != 1 is incompatible with specifying
-        any `dilation_rate` value != 1.
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
     padding: one of `'valid'` or `'same'` (case-insensitive).
     depth_multiplier: The number of depthwise convolution output channels
-        for each input channel.
-        The total number of depthwise convolution output
-        channels will be equal to `filters_in * depth_multiplier`.
+      for each input channel.
+      The total number of depthwise convolution output
+      channels will be equal to `filters_in * depth_multiplier`.
     data_format: A string,
-        one of `channels_last` (default) or `channels_first`.
-        The ordering of the dimensions in the inputs.
-        `channels_last` corresponds to inputs with shape
-        `(batch, height, width, channels)` while `channels_first`
-        corresponds to inputs with shape
-        `(batch, channels, height, width)`.
-        It defaults to the `image_data_format` value found in your
-        Keras config file at `~/.keras/keras.json`.
-        If you never set it, then it will be 'channels_last'.
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be 'channels_last'.
     activation: Activation function to use.
-        If you don't specify anything, no activation is applied
-        (ie. 'linear' activation: `a(x) = x`).
+      If you don't specify anything, no activation is applied
+      (ie. 'linear' activation: `a(x) = x`).
     use_bias: Boolean, whether the layer uses a bias vector.
     depthwise_initializer: Initializer for the depthwise kernel matrix.
     bias_initializer: Initializer for the bias vector.
     depthwise_regularizer: Regularizer function applied to
-        the depthwise kernel matrix.
+      the depthwise kernel matrix.
     bias_regularizer: Regularizer function applied to the bias vector.
     activity_regularizer: Regularizer function applied to
-        the output of the layer (its 'activation').
+      the output of the layer (its 'activation').
     depthwise_constraint: Constraint function applied to
-        the depthwise kernel matrix.
+      the depthwise kernel matrix.
     bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
@@ -1812,7 +1804,7 @@ class DepthwiseConv2D(Conv2D):
     self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
     self.built = True
 
-  def call(self, inputs, training=None):
+  def call(self, inputs):
     outputs = backend.depthwise_conv2d(
         inputs,
         self.depthwise_kernel,
@@ -1877,13 +1869,13 @@ class UpSampling1D(Layer):
   Repeats each temporal step `size` times along the time axis.
 
   Arguments:
-      size: integer. Upsampling factor.
+    size: Integer. Upsampling factor.
 
   Input shape:
-      3D tensor with shape: `(batch, steps, features)`.
+    3D tensor with shape: `(batch, steps, features)`.
 
   Output shape:
-      3D tensor with shape: `(batch, upsampled_steps, features)`.
+    3D tensor with shape: `(batch, upsampled_steps, features)`.
   """
 
   def __init__(self, size=2, **kwargs):
@@ -1911,36 +1903,36 @@ class UpSampling2D(Layer):
   """Upsampling layer for 2D inputs.
 
   Repeats the rows and columns of the data
-  by size[0] and size[1] respectively.
+  by `size[0]` and `size[1]` respectively.
 
   Arguments:
-      size: int, or tuple of 2 integers.
-          The upsampling factors for rows and columns.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      interpolation: A string, one of `nearest` or `bilinear`.
+    size: Int, or tuple of 2 integers.
+      The upsampling factors for rows and columns.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    interpolation: A string, one of `nearest` or `bilinear`.
 
   Input shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, rows, cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, rows, cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, rows, cols, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, rows, cols)`
 
   Output shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, upsampled_rows, upsampled_cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, upsampled_rows, upsampled_cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, upsampled_rows, upsampled_cols, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, upsampled_rows, upsampled_cols)`
   """
 
   def __init__(self,
@@ -1990,35 +1982,35 @@ class UpSampling3D(Layer):
   """Upsampling layer for 3D inputs.
 
   Repeats the 1st, 2nd and 3rd dimensions
-  of the data by size[0], size[1] and size[2] respectively.
+  of the data by `size[0]`, `size[1]` and `size[2]` respectively.
 
   Arguments:
-      size: int, or tuple of 3 integers.
-          The upsampling factors for dim1, dim2 and dim3.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    size: Int, or tuple of 3 integers.
+      The upsampling factors for dim1, dim2 and dim3.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, dim1, dim2, dim3, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, dim1, dim2, dim3)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, dim1, dim2, dim3, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, dim1, dim2, dim3)`
 
   Output shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, upsampled_dim1, upsampled_dim2, upsampled_dim3, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, upsampled_dim1, upsampled_dim2, upsampled_dim3)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, upsampled_dim1, upsampled_dim2, upsampled_dim3, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, upsampled_dim1, upsampled_dim2, upsampled_dim3)`
   """
 
   def __init__(self, size=(2, 2, 2), data_format=None, **kwargs):
@@ -2063,7 +2055,7 @@ class ZeroPadding1D(Layer):
   """Zero-padding layer for 1D input (e.g. temporal sequence).
 
   Arguments:
-      padding: int, or tuple of int (length 2), or dictionary.
+      padding: Int, or tuple of int (length 2), or dictionary.
           - If int:
           How many zeros to add at the beginning and end of
           the padding dimension (axis 1).
@@ -2107,40 +2099,40 @@ class ZeroPadding2D(Layer):
   at the top, bottom, left and right side of an image tensor.
 
   Arguments:
-      padding: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
-          - If int: the same symmetric padding
-              is applied to height and width.
-          - If tuple of 2 ints:
-              interpreted as two different
-              symmetric padding values for height and width:
-              `(symmetric_height_pad, symmetric_width_pad)`.
-          - If tuple of 2 tuples of 2 ints:
-              interpreted as
-              `((top_pad, bottom_pad), (left_pad, right_pad))`
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    padding: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+      - If int: the same symmetric padding
+        is applied to height and width.
+      - If tuple of 2 ints:
+        interpreted as two different
+        symmetric padding values for height and width:
+        `(symmetric_height_pad, symmetric_width_pad)`.
+      - If tuple of 2 tuples of 2 ints:
+        interpreted as
+        `((top_pad, bottom_pad), (left_pad, right_pad))`
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, rows, cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, rows, cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, rows, cols, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, rows, cols)`
 
   Output shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, padded_rows, padded_cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, padded_rows, padded_cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, padded_rows, padded_cols, channels)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, channels, padded_rows, padded_cols)`
   """
 
   def __init__(self, padding=(1, 1), data_format=None, **kwargs):
@@ -2206,45 +2198,45 @@ class ZeroPadding3D(Layer):
   """Zero-padding layer for 3D data (spatial or spatio-temporal).
 
   Arguments:
-      padding: int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints.
-          - If int: the same symmetric padding
-              is applied to height and width.
-          - If tuple of 3 ints:
-              interpreted as two different
-              symmetric padding values for height and width:
-              `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`.
-          - If tuple of 3 tuples of 2 ints:
-              interpreted as
-              `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
-                right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    padding: Int, or tuple of 3 ints, or tuple of 3 tuples of 2 ints.
+      - If int: the same symmetric padding
+        is applied to height and width.
+      - If tuple of 3 ints:
+        interpreted as two different
+        symmetric padding values for height and width:
+        `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`.
+      - If tuple of 3 tuples of 2 ints:
+        interpreted as
+        `((left_dim1_pad, right_dim1_pad), (left_dim2_pad,
+          right_dim2_pad), (left_dim3_pad, right_dim3_pad))`
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad,
-            depth)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, depth, first_axis_to_pad, second_axis_to_pad,
-            third_axis_to_pad)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad,
+          depth)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, depth, first_axis_to_pad, second_axis_to_pad,
+          third_axis_to_pad)`
 
   Output shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, first_padded_axis, second_padded_axis, third_axis_to_pad,
-            depth)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, depth, first_padded_axis, second_padded_axis,
-            third_axis_to_pad)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+        `(batch, first_padded_axis, second_padded_axis, third_axis_to_pad,
+          depth)`
+    - If `data_format` is `"channels_first"`:
+        `(batch, depth, first_padded_axis, second_padded_axis,
+          third_axis_to_pad)`
   """
 
   def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
@@ -2326,17 +2318,16 @@ class Cropping1D(Layer):
   It crops along the time dimension (axis 1).
 
   Arguments:
-      cropping: int or tuple of int (length 2)
-          How many units should be trimmed off at the beginning and end of
-          the cropping dimension (axis 1).
-          If a single int is provided,
-          the same value will be used for both.
+    cropping: Int or tuple of int (length 2)
+      How many units should be trimmed off at the beginning and end of
+      the cropping dimension (axis 1).
+      If a single int is provided, the same value will be used for both.
 
   Input shape:
-      3D tensor with shape `(batch, axis_to_crop, features)`
+    3D tensor with shape `(batch, axis_to_crop, features)`
 
   Output shape:
-      3D tensor with shape `(batch, cropped_axis, features)`
+    3D tensor with shape `(batch, cropped_axis, features)`
   """
 
   def __init__(self, cropping=(1, 1), **kwargs):
@@ -2371,52 +2362,52 @@ class Cropping2D(Layer):
   It crops along spatial dimensions, i.e. height and width.
 
   Arguments:
-      cropping: int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
-          - If int: the same symmetric cropping
-              is applied to height and width.
-          - If tuple of 2 ints:
-              interpreted as two different
-              symmetric cropping values for height and width:
-              `(symmetric_height_crop, symmetric_width_crop)`.
-          - If tuple of 2 tuples of 2 ints:
-              interpreted as
-              `((top_crop, bottom_crop), (left_crop, right_crop))`
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    cropping: Int, or tuple of 2 ints, or tuple of 2 tuples of 2 ints.
+      - If int: the same symmetric cropping
+        is applied to height and width.
+      - If tuple of 2 ints:
+        interpreted as two different
+        symmetric cropping values for height and width:
+        `(symmetric_height_crop, symmetric_width_crop)`.
+      - If tuple of 2 tuples of 2 ints:
+        interpreted as
+        `((top_crop, bottom_crop), (left_crop, right_crop))`
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, rows, cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, rows, cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+      `(batch, rows, cols, channels)`
+    - If `data_format` is `"channels_first"`:
+      `(batch, channels, rows, cols)`
 
   Output shape:
-      4D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, cropped_rows, cropped_cols, channels)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, channels, cropped_rows, cropped_cols)`
+    4D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+      `(batch, cropped_rows, cropped_cols, channels)`
+    - If `data_format` is `"channels_first"`:
+      `(batch, channels, cropped_rows, cropped_cols)`
 
   Examples:
 
   ```python
-      # Crop the input 2D images or feature maps
-      model = Sequential()
-      model.add(Cropping2D(cropping=((2, 2), (4, 4)),
-                           input_shape=(28, 28, 3)))
-      # now model.output_shape == (None, 24, 20, 3)
-      model.add(Conv2D(64, (3, 3), padding='same))
-      model.add(Cropping2D(cropping=((2, 2), (2, 2))))
-      # now model.output_shape == (None, 20, 16. 64)
+  # Crop the input 2D images or feature maps
+  model = Sequential()
+  model.add(Cropping2D(cropping=((2, 2), (4, 4)),
+                       input_shape=(28, 28, 3)))
+  # now model.output_shape == (None, 24, 20, 3)
+  model.add(Conv2D(64, (3, 3), padding='same))
+  model.add(Cropping2D(cropping=((2, 2), (2, 2))))
+  # now model.output_shape == (None, 20, 16. 64)
   ```
   """
 
@@ -2498,50 +2489,46 @@ class Cropping2D(Layer):
 
 @keras_export('keras.layers.Cropping3D')
 class Cropping3D(Layer):
-  """Cropping layer for 3D data (e.g.
-
-  spatial or spatio-temporal).
+  """Cropping layer for 3D data (e.g. spatial or spatio-temporal).
 
   Arguments:
-      cropping: int, or tuple of 23ints, or tuple of 3 tuples of 2 ints.
-          - If int: the same symmetric cropping
-              is applied to depth, height, and width.
-          - If tuple of 3 ints:
-              interpreted as two different
-              symmetric cropping values for depth, height, and width:
-              `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`.
-          - If tuple of 3 tuples of 2 ints:
-              interpreted as
-              `((left_dim1_crop, right_dim1_crop), (left_dim2_crop,
-                right_dim2_crop), (left_dim3_crop, right_dim3_crop))`
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    cropping: Int, or tuple of 23ints, or tuple of 3 tuples of 2 ints.
+      - If int: the same symmetric cropping
+        is applied to depth, height, and width.
+      - If tuple of 3 ints: interpreted as two different
+        symmetric cropping values for depth, height, and width:
+        `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`.
+      - If tuple of 3 tuples of 2 ints: interpreted as
+        `((left_dim1_crop, right_dim1_crop), (left_dim2_crop,
+          right_dim2_crop), (left_dim3_crop, right_dim3_crop))`
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop,
-            depth)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, depth, first_axis_to_crop, second_axis_to_crop,
-            third_axis_to_crop)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+      `(batch, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop,
+        depth)`
+    - If `data_format` is `"channels_first"`:
+      `(batch, depth, first_axis_to_crop, second_axis_to_crop,
+        third_axis_to_crop)`
 
   Output shape:
-      5D tensor with shape:
-      - If `data_format` is `"channels_last"`:
-          `(batch, first_cropped_axis, second_cropped_axis, third_cropped_axis,
-            depth)`
-      - If `data_format` is `"channels_first"`:
-          `(batch, depth, first_cropped_axis, second_cropped_axis,
-            third_cropped_axis)`
+    5D tensor with shape:
+    - If `data_format` is `"channels_last"`:
+      `(batch, first_cropped_axis, second_cropped_axis, third_cropped_axis,
+        depth)`
+    - If `data_format` is `"channels_first"`:
+      `(batch, depth, first_cropped_axis, second_cropped_axis,
+        third_cropped_axis)`
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index c0479e71a24dc4b8c7ed1e660f18d610784448e1..030908e51a26595c1b2acb55fcf3e7f82776059d 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -28,12 +28,13 @@ from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
-from tensorflow.python.keras.layers.recurrent import _generate_dropout_mask
 from tensorflow.python.keras.layers.recurrent import _standardize_args
+from tensorflow.python.keras.layers.recurrent import DropoutRNNCellMixin
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -42,30 +43,42 @@ class ConvRNN2D(RNN):
 
   Arguments:
     cell: A RNN cell instance. A RNN cell is a class that has:
-        - a `call(input_at_t, states_at_t)` method, returning
-            `(output_at_t, states_at_t_plus_1)`. The call method of the
-            cell can also take the optional argument `constants`, see
-            section "Note on passing external constants" below.
-        - a `state_size` attribute. This can be a single integer
-            (single state) in which case it is
-            the number of channels of the recurrent state
-            (which should be the same as the number of channels of the cell
-            output). This can also be a list/tuple of integers
-            (one size per state). In this case, the first entry
-            (`state_size[0]`) should be the same as
-            the size of the cell output.
+      - a `call(input_at_t, states_at_t)` method, returning
+        `(output_at_t, states_at_t_plus_1)`. The call method of the
+        cell can also take the optional argument `constants`, see
+        section "Note on passing external constants" below.
+      - a `state_size` attribute. This can be a single integer
+        (single state) in which case it is
+        the number of channels of the recurrent state
+        (which should be the same as the number of channels of the cell
+        output). This can also be a list/tuple of integers
+        (one size per state). In this case, the first entry
+        (`state_size[0]`) should be the same as
+        the size of the cell output.
     return_sequences: Boolean. Whether to return the last output.
-        in the output sequence, or the full sequence.
+      in the output sequence, or the full sequence.
     return_state: Boolean. Whether to return the last state
-        in addition to the output.
+      in addition to the output.
     go_backwards: Boolean (default False).
-        If True, process the input sequence backwards and return the
-        reversed sequence.
+      If True, process the input sequence backwards and return the
+      reversed sequence.
     stateful: Boolean (default False). If True, the last state
-        for each sample at index i in a batch will be used as initial
-        state for the sample of index i in the following batch.
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
     input_shape: Use this argument to specify the shape of the
-        input when this layer is the first one in a model.
+      input when this layer is the first one in a model.
+
+  Call arguments:
+    inputs: A 5D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is for use with cells that use dropout.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+    constants: List of constant tensors to be passed to the cell at each
+      timestep.
 
   Input shape:
     5D tensor with shape:
@@ -75,33 +88,31 @@ class ConvRNN2D(RNN):
     if data_format='channels_last'.
 
   Output shape:
-    - if `return_state`: a list of tensors. The first tensor is
-        the output. The remaining tensors are the last states,
-        each 5D tensor with shape:
-        `(samples, timesteps, filters, new_rows, new_cols)`
-        if data_format='channels_first'
-        or 5D tensor with shape:
-        `(samples, timesteps, new_rows, new_cols, filters)`
-        if data_format='channels_last'.
-        `rows` and `cols` values might have changed due to padding.
-    - if `return_sequences`: 5D tensor with shape:
-        `(samples, timesteps, filters, new_rows, new_cols)`
-        if data_format='channels_first'
-        or 5D tensor with shape:
-        `(samples, timesteps, new_rows, new_cols, filters)`
-        if data_format='channels_last'.
-    - else, 4D tensor with shape:
-        `(samples, filters, new_rows, new_cols)`
-        if data_format='channels_first'
-        or 4D tensor with shape:
-        `(samples, new_rows, new_cols, filters)`
-        if data_format='channels_last'.
+    - If `return_state`: a list of tensors. The first tensor is
+      the output. The remaining tensors are the last states,
+      each 4D tensor with shape:
+      `(samples, filters, new_rows, new_cols)`
+      if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, new_rows, new_cols, filters)`
+      if data_format='channels_last'.
+      `rows` and `cols` values might have changed due to padding.
+    - If `return_sequences`: 5D tensor with shape:
+      `(samples, timesteps, filters, new_rows, new_cols)`
+      if data_format='channels_first'
+      or 5D tensor with shape:
+      `(samples, timesteps, new_rows, new_cols, filters)`
+      if data_format='channels_last'.
+    - Else, 4D tensor with shape:
+      `(samples, filters, new_rows, new_cols)`
+      if data_format='channels_first'
+      or 4D tensor with shape:
+      `(samples, new_rows, new_cols, filters)`
+      if data_format='channels_last'.
 
   Masking:
     This layer supports masking for input data with a variable number
-    of timesteps. To introduce masks to your data,
-    use an Embedding layer with the `mask_zero` parameter
-    set to `True`.
+    of timesteps.
 
   Note on using statefulness in RNNs:
     You can set RNN layers to be 'stateful', which means that the states
@@ -109,19 +120,19 @@ class ConvRNN2D(RNN):
     for the samples in the next batch. This assumes a one-to-one mapping
     between samples in different successive batches.
     To enable statefulness:
-        - specify `stateful=True` in the layer constructor.
-        - specify a fixed batch size for your model, by passing
-             - if sequential model:
-                `batch_input_shape=(...)` to the first layer in your model.
-             - if functional model with 1 or more Input layers:
-                `batch_shape=(...)` to all the first layers in your model.
-                This is the expected shape of your inputs
-                *including the batch size*.
-                It should be a tuple of integers,
-                e.g. `(32, 10, 100, 100, 32)`.
-                Note that the number of rows and columns should be specified
-                too.
-        - specify `shuffle=False` when calling fit().
+      - Specify `stateful=True` in the layer constructor.
+      - Specify a fixed batch size for your model, by passing
+         - If sequential model:
+            `batch_input_shape=(...)` to the first layer in your model.
+         - If functional model with 1 or more Input layers:
+            `batch_shape=(...)` to all the first layers in your model.
+            This is the expected shape of your inputs
+            *including the batch size*.
+            It should be a tuple of integers,
+            e.g. `(32, 10, 100, 100, 32)`.
+            Note that the number of rows and columns should be specified
+            too.
+      - Specify `shuffle=False` when calling fit().
     To reset the states of your model, call `.reset_states()` on either
     a specific layer, or on your entire model.
 
@@ -272,7 +283,7 @@ class ConvRNN2D(RNN):
     shape = list(self.cell.kernel_shape)
     shape[-1] = self.cell.filters
     initial_state = self.cell.input_conv(initial_state,
-                                         K.zeros(tuple(shape)),
+                                         array_ops.zeros(tuple(shape)),
                                          padding=self.cell.padding)
 
     if hasattr(self.cell.state_size, '__len__'):
@@ -471,61 +482,68 @@ class ConvRNN2D(RNN):
         K.set_value(state, value)
 
 
-class ConvLSTM2DCell(Layer):
+class ConvLSTM2DCell(DropoutRNNCellMixin, Layer):
   """Cell class for the ConvLSTM2D layer.
 
-  # Arguments
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      unit_forget_bias: Boolean.
-          If True, add 1 to the bias of the forget gate at initialization.
-          Use in combination with `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et al.]
-          (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
+  Arguments:
+    filters: Integer, the dimensionality of the output space
+      (i.e. the number of output filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      dimensions of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the strides of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix,
+      used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+      If True, add 1 to the bias of the forget gate at initialization.
+      Use in combination with `bias_initializer="zeros"`.
+      This is recommended in [Jozefowicz et al.]
+      (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+
+  Call arguments:
+    inputs: A 4D tensor.
+    states:  List of state tensors corresponding to the previous timestep.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. Only relevant when `dropout` or
+      `recurrent_dropout` is used.
   """
 
   def __init__(self,
@@ -579,8 +597,6 @@ class ConvLSTM2DCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_size = (self.filters, self.filters)
-    self._dropout_mask = None
-    self._recurrent_dropout_mask = None
 
   def build(self, input_shape):
 
@@ -625,55 +641,19 @@ class ConvLSTM2DCell(Layer):
           initializer=bias_initializer,
           regularizer=self.bias_regularizer,
           constraint=self.bias_constraint)
-
     else:
       self.bias = None
-
-    self.kernel_i = self.kernel[:, :, :, :self.filters]
-    self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
-    self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2]
-    self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
-                                                    self.filters * 2]
-    self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3]
-    self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
-                                                    self.filters * 3]
-    self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
-    self.recurrent_kernel_o = self.recurrent_kernel[:, :, :, self.filters * 3:]
-
-    if self.use_bias:
-      self.bias_i = self.bias[:self.filters]
-      self.bias_f = self.bias[self.filters: self.filters * 2]
-      self.bias_c = self.bias[self.filters * 2: self.filters * 3]
-      self.bias_o = self.bias[self.filters * 3:]
-    else:
-      self.bias_i = None
-      self.bias_f = None
-      self.bias_c = None
-      self.bias_o = None
     self.built = True
 
   def call(self, inputs, states, training=None):
-    if 0 < self.dropout < 1 and self._dropout_mask is None:
-      self._dropout_mask = _generate_dropout_mask(
-          K.ones_like(inputs),
-          self.dropout,
-          training=training,
-          count=4)
-    if (0 < self.recurrent_dropout < 1 and
-        self._recurrent_dropout_mask is None):
-      self._recurrent_dropout_mask = _generate_dropout_mask(
-          K.ones_like(states[1]),
-          self.recurrent_dropout,
-          training=training,
-          count=4)
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
 
     # dropout matrices for input units
-    dp_mask = self._dropout_mask
+    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
     # dropout matrices for recurrent units
-    rec_dp_mask = self._recurrent_dropout_mask
-
-    h_tm1 = states[0]  # previous memory state
-    c_tm1 = states[1]  # previous carry state
+    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
+        h_tm1, training, count=4)
 
     if 0 < self.dropout < 1.:
       inputs_i = inputs * dp_mask[0]
@@ -697,22 +677,26 @@ class ConvLSTM2DCell(Layer):
       h_tm1_c = h_tm1
       h_tm1_o = h_tm1
 
-    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
-                          padding=self.padding)
-    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
-                          padding=self.padding)
-    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
-                          padding=self.padding)
-    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
-                          padding=self.padding)
-    h_i = self.recurrent_conv(h_tm1_i,
-                              self.recurrent_kernel_i)
-    h_f = self.recurrent_conv(h_tm1_f,
-                              self.recurrent_kernel_f)
-    h_c = self.recurrent_conv(h_tm1_c,
-                              self.recurrent_kernel_c)
-    h_o = self.recurrent_conv(h_tm1_o,
-                              self.recurrent_kernel_o)
+    (kernel_i, kernel_f,
+     kernel_c, kernel_o) = array_ops.split(self.kernel, 4, axis=3)
+    (recurrent_kernel_i,
+     recurrent_kernel_f,
+     recurrent_kernel_c,
+     recurrent_kernel_o) = array_ops.split(self.recurrent_kernel, 4, axis=3)
+
+    if self.use_bias:
+      bias_i, bias_f, bias_c, bias_o = array_ops.split(self.bias, 4)
+    else:
+      bias_i, bias_f, bias_c, bias_o = None, None, None, None
+
+    x_i = self.input_conv(inputs_i, kernel_i, bias_i, padding=self.padding)
+    x_f = self.input_conv(inputs_f, kernel_f, bias_f, padding=self.padding)
+    x_c = self.input_conv(inputs_c, kernel_c, bias_c, padding=self.padding)
+    x_o = self.input_conv(inputs_o, kernel_o, bias_o, padding=self.padding)
+    h_i = self.recurrent_conv(h_tm1_i, recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1_f, recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1_c, recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1_o, recurrent_kernel_o)
 
     i = self.recurrent_activation(x_i + h_i)
     f = self.recurrent_activation(x_f + h_f)
@@ -779,95 +763,106 @@ class ConvLSTM2D(ConvRNN2D):
 
   Arguments:
     filters: Integer, the dimensionality of the output space
-        (i.e. the number of output filters in the convolution).
+      (i.e. the number of output filters in the convolution).
     kernel_size: An integer or tuple/list of n integers, specifying the
-        dimensions of the convolution window.
+      dimensions of the convolution window.
     strides: An integer or tuple/list of n integers,
-        specifying the strides of the convolution.
-        Specifying any stride value != 1 is incompatible with specifying
-        any `dilation_rate` value != 1.
+      specifying the strides of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
     padding: One of `"valid"` or `"same"` (case-insensitive).
     data_format: A string,
-        one of `channels_last` (default) or `channels_first`.
-        The ordering of the dimensions in the inputs.
-        `channels_last` corresponds to inputs with shape
-        `(batch, time, ..., channels)`
-        while `channels_first` corresponds to
-        inputs with shape `(batch, time, channels, ...)`.
-        It defaults to the `image_data_format` value found in your
-        Keras config file at `~/.keras/keras.json`.
-        If you never set it, then it will be "channels_last".
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, time, ..., channels)`
+      while `channels_first` corresponds to
+      inputs with shape `(batch, time, channels, ...)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
     dilation_rate: An integer or tuple/list of n integers, specifying
-        the dilation rate to use for dilated convolution.
-        Currently, specifying any `dilation_rate` value != 1 is
-        incompatible with specifying any `strides` value != 1.
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
     activation: Activation function to use.
-        If you don't specify anything, no activation is applied
-        (ie. "linear" activation: `a(x) = x`).
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
     recurrent_activation: Activation function to use
-        for the recurrent step.
+      for the recurrent step.
     use_bias: Boolean, whether the layer uses a bias vector.
     kernel_initializer: Initializer for the `kernel` weights matrix,
-        used for the linear transformation of the inputs.
+      used for the linear transformation of the inputs.
     recurrent_initializer: Initializer for the `recurrent_kernel`
-        weights matrix,
-        used for the linear transformation of the recurrent state.
+      weights matrix,
+      used for the linear transformation of the recurrent state.
     bias_initializer: Initializer for the bias vector.
     unit_forget_bias: Boolean.
-        If True, add 1 to the bias of the forget gate at initialization.
-        Use in combination with `bias_initializer="zeros"`.
-        This is recommended in [Jozefowicz et al.]
-        (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+      If True, add 1 to the bias of the forget gate at initialization.
+      Use in combination with `bias_initializer="zeros"`.
+      This is recommended in [Jozefowicz et al.]
+      (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
     kernel_regularizer: Regularizer function applied to
-        the `kernel` weights matrix.
+      the `kernel` weights matrix.
     recurrent_regularizer: Regularizer function applied to
-        the `recurrent_kernel` weights matrix.
+      the `recurrent_kernel` weights matrix.
     bias_regularizer: Regularizer function applied to the bias vector.
     activity_regularizer: Regularizer function applied to.
     kernel_constraint: Constraint function applied to
-        the `kernel` weights matrix.
+      the `kernel` weights matrix.
     recurrent_constraint: Constraint function applied to
-        the `recurrent_kernel` weights matrix.
+      the `recurrent_kernel` weights matrix.
     bias_constraint: Constraint function applied to the bias vector.
     return_sequences: Boolean. Whether to return the last output
-        in the output sequence, or the full sequence.
+      in the output sequence, or the full sequence.
     go_backwards: Boolean (default False).
-        If True, process the input sequence backwards.
+      If True, process the input sequence backwards.
     stateful: Boolean (default False). If True, the last state
-        for each sample at index i in a batch will be used as initial
-        state for the sample of index i in the following batch.
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
     dropout: Float between 0 and 1.
-        Fraction of the units to drop for
-        the linear transformation of the inputs.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
     recurrent_dropout: Float between 0 and 1.
-        Fraction of the units to drop for
-        the linear transformation of the recurrent state.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+
+  Call arguments:
+    inputs: A 5D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or `recurrent_dropout`
+      are set.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
 
   Input shape:
-    - if data_format='channels_first'
+    - If data_format='channels_first'
         5D tensor with shape:
         `(samples, time, channels, rows, cols)`
-    - if data_format='channels_last'
+    - If data_format='channels_last'
         5D tensor with shape:
         `(samples, time, rows, cols, channels)`
 
   Output shape:
-    - if `return_sequences`
-         - if data_format='channels_first'
-            5D tensor with shape:
-            `(samples, time, filters, output_row, output_col)`
-         - if data_format='channels_last'
-            5D tensor with shape:
-            `(samples, time, output_row, output_col, filters)`
-    - else
-        - if data_format ='channels_first'
-            4D tensor with shape:
-            `(samples, filters, output_row, output_col)`
-        - if data_format='channels_last'
-            4D tensor with shape:
-            `(samples, output_row, output_col, filters)`
-        where o_row and o_col depend on the shape of the filter and
-        the padding
+    - If `return_sequences`
+       - If data_format='channels_first'
+          5D tensor with shape:
+          `(samples, time, filters, output_row, output_col)`
+       - If data_format='channels_last'
+          5D tensor with shape:
+          `(samples, time, output_row, output_col, filters)`
+    - Else
+      - If data_format ='channels_first'
+          4D tensor with shape:
+          `(samples, filters, output_row, output_col)`
+      - If data_format='channels_last'
+          4D tensor with shape:
+          `(samples, output_row, output_col, filters)`
+      where `o_row` and `o_col` depend on the shape of the filter and
+      the padding
 
   Raises:
     ValueError: in case of invalid constructor arguments.
@@ -877,7 +872,6 @@ class ConvLSTM2D(ConvRNN2D):
     Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
     The current implementation does not include the feedback loop on the
     cells output.
-
   """
 
   def __init__(self,
@@ -936,6 +930,8 @@ class ConvLSTM2D(ConvRNN2D):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
+    self.cell.reset_dropout_mask()
+    self.cell.reset_recurrent_dropout_mask()
     return super(ConvLSTM2D, self).call(inputs,
                                         mask=mask,
                                         training=training,
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
index 4a757938846767d0cff7ab312f211f17965c5971..d0da360ef5f75c138369ec4b223964311219eaeb 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-class ConvLSTMTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class ConvLSTMTest(keras_parameterized.TestCase):
 
-  def test_conv_lstm(self):
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          data_format=['channels_first', 'channels_last'],
+          return_sequences=[True, False]))
+  def test_conv_lstm(self, data_format, return_sequences):
     num_row = 3
     num_col = 3
     filters = 2
@@ -36,47 +44,44 @@ class ConvLSTMTest(test.TestCase):
     input_num_row = 5
     input_num_col = 5
     sequence_len = 2
-    for data_format in ['channels_first', 'channels_last']:
-      if data_format == 'channels_first':
-        inputs = np.random.rand(num_samples, sequence_len,
-                                input_channel,
-                                input_num_row, input_num_col)
-      else:
-        inputs = np.random.rand(num_samples, sequence_len,
-                                input_num_row, input_num_col,
-                                input_channel)
-
-      for return_sequences in [True, False]:
-        with self.cached_session():
-          # test for return state:
-          x = keras.Input(batch_shape=inputs.shape)
-          kwargs = {'data_format': data_format,
-                    'return_sequences': return_sequences,
-                    'return_state': True,
-                    'stateful': True,
-                    'filters': filters,
-                    'kernel_size': (num_row, num_col),
-                    'padding': 'valid'}
-          layer = keras.layers.ConvLSTM2D(**kwargs)
-          layer.build(inputs.shape)
-          outputs = layer(x)
-          _, states = outputs[0], outputs[1:]
-          self.assertEqual(len(states), 2)
-          model = keras.models.Model(x, states[0])
-          state = model.predict(inputs)
-
-          self.assertAllClose(
-              keras.backend.eval(layer.states[0]), state, atol=1e-4)
-
-          # test for output shape:
-          testing_utils.layer_test(
-              keras.layers.ConvLSTM2D,
-              kwargs={'data_format': data_format,
-                      'return_sequences': return_sequences,
-                      'filters': filters,
-                      'kernel_size': (num_row, num_col),
-                      'padding': 'valid'},
-              input_shape=inputs.shape)
+    if data_format == 'channels_first':
+      inputs = np.random.rand(num_samples, sequence_len,
+                              input_channel,
+                              input_num_row, input_num_col)
+    else:
+      inputs = np.random.rand(num_samples, sequence_len,
+                              input_num_row, input_num_col,
+                              input_channel)
+
+    # test for return state:
+    x = keras.Input(batch_shape=inputs.shape)
+    kwargs = {'data_format': data_format,
+              'return_sequences': return_sequences,
+              'return_state': True,
+              'stateful': True,
+              'filters': filters,
+              'kernel_size': (num_row, num_col),
+              'padding': 'valid'}
+    layer = keras.layers.ConvLSTM2D(**kwargs)
+    layer.build(inputs.shape)
+    outputs = layer(x)
+    _, states = outputs[0], outputs[1:]
+    self.assertEqual(len(states), 2)
+    model = keras.models.Model(x, states[0])
+    state = model.predict(inputs)
+
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+    # test for output shape:
+    testing_utils.layer_test(
+        keras.layers.ConvLSTM2D,
+        kwargs={'data_format': data_format,
+                'return_sequences': return_sequences,
+                'filters': filters,
+                'kernel_size': (num_row, num_col),
+                'padding': 'valid'},
+        input_shape=inputs.shape)
 
   def test_conv_lstm_statefulness(self):
     # Tests for statefulness
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index 9140ce426e6881b2abbc821e835c1e792c884343..ea0887836069872226b290bb3ca89741424d64fd 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
@@ -420,6 +421,7 @@ class ZeroPaddingTest(keras_parameterized.TestCase):
       keras.layers.ZeroPadding3D(padding=None)
 
 
+@test_util.disable_all_xla('align_corners=False not supported by XLA')
 @keras_parameterized.run_all_keras_modes
 class UpSamplingTest(keras_parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index dfbab80be3f806fd7463bb792993e00c90442c10..11f78e8b2e499394eb43872f483f3f0ff533ada0 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
@@ -45,6 +46,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -67,13 +70,13 @@ class Masking(Layer):
   You want to mask timestep #3 and #5 because you lack data for
   these timesteps. You can:
 
-      - set `x[:, 3, :] = 0.` and `x[:, 5, :] = 0.`
-      - insert a `Masking` layer with `mask_value=0.` before the LSTM layer:
+  - Set `x[:, 3, :] = 0.` and `x[:, 5, :] = 0.`
+  - Insert a `Masking` layer with `mask_value=0.` before the LSTM layer:
 
   ```python
-      model = Sequential()
-      model.add(Masking(mask_value=0., input_shape=(timesteps, features)))
-      model.add(LSTM(32))
+  model = Sequential()
+  model.add(Masking(mask_value=0., input_shape=(timesteps, features)))
+  model.add(LSTM(32))
   ```
   """
 
@@ -81,6 +84,7 @@ class Masking(Layer):
     super(Masking, self).__init__(**kwargs)
     self.supports_masking = True
     self.mask_value = mask_value
+    self._compute_output_and_mask_jointly = True
 
   def compute_mask(self, inputs, mask=None):
     return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
@@ -88,7 +92,10 @@ class Masking(Layer):
   def call(self, inputs):
     boolean_mask = K.any(
         math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
+    outputs = inputs * math_ops.cast(boolean_mask, inputs.dtype)
+    # Compute the mask and outputs simultaneously.
+    outputs._keras_mask = array_ops.squeeze(boolean_mask, axis=-1)  # pylint: disable=protected-access
+    return outputs
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -108,14 +115,19 @@ class Dropout(Layer):
   which helps prevent overfitting.
 
   Arguments:
-      rate: float between 0 and 1. Fraction of the input units to drop.
-      noise_shape: 1D integer tensor representing the shape of the
-          binary dropout mask that will be multiplied with the input.
-          For instance, if your inputs have shape
-          `(batch_size, timesteps, features)` and
-          you want the dropout mask to be the same for all timesteps,
-          you can use `noise_shape=(batch_size, 1, features)`.
-      seed: A Python integer to use as random seed.
+    rate: Float between 0 and 1. Fraction of the input units to drop.
+    noise_shape: 1D integer tensor representing the shape of the
+      binary dropout mask that will be multiplied with the input.
+      For instance, if your inputs have shape
+      `(batch_size, timesteps, features)` and
+      you want the dropout mask to be the same for all timesteps,
+      you can use `noise_shape=(batch_size, 1, features)`.
+    seed: A Python integer to use as random seed.
+
+  Call arguments:
+    inputs: Input tensor (of any rank).
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
@@ -138,9 +150,12 @@ class Dropout(Layer):
       training = K.learning_phase()
 
     def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
+      return nn.dropout(
+          inputs,
+          noise_shape=self._get_noise_shape(inputs),
+          seed=self.seed,
+          rate=self.rate)
+
     output = tf_utils.smart_cond(training,
                                  dropped_inputs,
                                  lambda: array_ops.identity(inputs))
@@ -172,18 +187,23 @@ class SpatialDropout1D(Dropout):
   between feature maps and should be used instead.
 
   Arguments:
-      rate: float between 0 and 1. Fraction of the input units to drop.
+    rate: Float between 0 and 1. Fraction of the input units to drop.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
 
   Input shape:
-      3D tensor with shape:
-      `(samples, timesteps, channels)`
+    3D tensor with shape:
+    `(samples, timesteps, channels)`
 
   Output shape:
-      Same as input
+    Same as input.
 
   References:
-      - [Efficient Object Localization Using Convolutional
-        Networks](https://arxiv.org/abs/1411.4280)
+    - [Efficient Object Localization Using Convolutional
+      Networks](https://arxiv.org/abs/1411.4280)
   """
 
   def __init__(self, rate, **kwargs):
@@ -209,27 +229,32 @@ class SpatialDropout2D(Dropout):
   between feature maps and should be used instead.
 
   Arguments:
-      rate: float between 0 and 1. Fraction of the input units to drop.
-      data_format: 'channels_first' or 'channels_last'.
-          In 'channels_first' mode, the channels dimension
-          (the depth) is at index 1,
-          in 'channels_last' mode is it at index 3.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    rate: Float between 0 and 1. Fraction of the input units to drop.
+    data_format: 'channels_first' or 'channels_last'.
+      In 'channels_first' mode, the channels dimension
+      (the depth) is at index 1,
+      in 'channels_last' mode is it at index 3.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
+
+  Call arguments:
+    inputs: A 4D tensor.
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
 
   Input shape:
-      4D tensor with shape:
-      `(samples, channels, rows, cols)` if data_format='channels_first'
-      or 4D tensor with shape:
-      `(samples, rows, cols, channels)` if data_format='channels_last'.
+    4D tensor with shape:
+    `(samples, channels, rows, cols)` if data_format='channels_first'
+    or 4D tensor with shape:
+    `(samples, rows, cols, channels)` if data_format='channels_last'.
 
   Output shape:
-      Same as input
+    Same as input.
 
   References:
-      - [Efficient Object Localization Using Convolutional
-        Networks](https://arxiv.org/abs/1411.4280)
+    - [Efficient Object Localization Using Convolutional
+      Networks](https://arxiv.org/abs/1411.4280)
   """
 
   def __init__(self, rate, data_format=None, **kwargs):
@@ -263,26 +288,31 @@ class SpatialDropout3D(Dropout):
   between feature maps and should be used instead.
 
   Arguments:
-      rate: float between 0 and 1. Fraction of the input units to drop.
-      data_format: 'channels_first' or 'channels_last'.
-          In 'channels_first' mode, the channels dimension (the depth)
-          is at index 1, in 'channels_last' mode is it at index 4.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    rate: Float between 0 and 1. Fraction of the input units to drop.
+    data_format: 'channels_first' or 'channels_last'.
+        In 'channels_first' mode, the channels dimension (the depth)
+        is at index 1, in 'channels_last' mode is it at index 4.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
+
+  Call arguments:
+    inputs: A 5D tensor.
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
 
   Input shape:
-      5D tensor with shape:
-      `(samples, channels, dim1, dim2, dim3)` if data_format='channels_first'
-      or 5D tensor with shape:
-      `(samples, dim1, dim2, dim3, channels)` if data_format='channels_last'.
+    5D tensor with shape:
+    `(samples, channels, dim1, dim2, dim3)` if data_format='channels_first'
+    or 5D tensor with shape:
+    `(samples, dim1, dim2, dim3, channels)` if data_format='channels_last'.
 
   Output shape:
-      Same as input
+    Same as input.
 
   References:
-      - [Efficient Object Localization Using Convolutional
-        Networks](https://arxiv.org/abs/1411.4280)
+    - [Efficient Object Localization Using Convolutional
+      Networks](https://arxiv.org/abs/1411.4280)
   """
 
   def __init__(self, rate, data_format=None, **kwargs):
@@ -308,16 +338,16 @@ class Activation(Layer):
   """Applies an activation function to an output.
 
   Arguments:
-      activation: name of activation function to use
-          or alternatively, a Theano or TensorFlow operation.
+    activation: Activation function, such as `tf.nn.relu`, or string name of
+      built-in activation function, such as "relu".
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
+    Same shape as input.
   """
 
   def __init__(self, activation, **kwargs):
@@ -342,34 +372,34 @@ class Reshape(Layer):
   """Reshapes an output to a certain shape.
 
   Arguments:
-      target_shape: target shape. Tuple of integers,
-          does not include the samples dimension (batch size).
+    target_shape: Target shape. Tuple of integers,
+      does not include the samples dimension (batch size).
 
   Input shape:
-      Arbitrary, although all dimensions in the input shaped must be fixed.
-      Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary, although all dimensions in the input shaped must be fixed.
+    Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      `(batch_size,) + target_shape`
+    `(batch_size,) + target_shape`
 
   Example:
 
   ```python
-      # as first layer in a Sequential model
-      model = Sequential()
-      model.add(Reshape((3, 4), input_shape=(12,)))
-      # now: model.output_shape == (None, 3, 4)
-      # note: `None` is the batch dimension
-
-      # as intermediate layer in a Sequential model
-      model.add(Reshape((6, 2)))
-      # now: model.output_shape == (None, 6, 2)
-
-      # also supports shape inference using `-1` as dimension
-      model.add(Reshape((-1, 2, 2)))
-      # now: model.output_shape == (None, 3, 2, 2)
+  # as first layer in a Sequential model
+  model = Sequential()
+  model.add(Reshape((3, 4), input_shape=(12,)))
+  # now: model.output_shape == (None, 3, 4)
+  # note: `None` is the batch dimension
+
+  # as intermediate layer in a Sequential model
+  model.add(Reshape((6, 2)))
+  # now: model.output_shape == (None, 6, 2)
+
+  # also supports shape inference using `-1` as dimension
+  model.add(Reshape((-1, 2, 2)))
+  # now: model.output_shape == (None, 3, 2, 2)
   ```
   """
 
@@ -384,21 +414,18 @@ class Reshape(Layer):
     `_fix_unknown_dimension` in `numpy/core/src/multiarray/shape.c`
 
     Arguments:
-        input_shape: shape of array being reshaped
-        output_shape: desired shape of the array with at most
-            a single -1 which indicates a dimension that should be
-            derived from the input shape.
+      input_shape: Shape of array being reshaped
+      output_shape: Desired shape of the array with at most
+        a single -1 which indicates a dimension that should be
+        derived from the input shape.
 
     Returns:
-        The new output shape with a -1 replaced with its computed value.
-
-        Raises a ValueError if the total array size of the output_shape is
-        different then the input_shape, or more than one unknown dimension
-        is specified.
+      The new output shape with a -1 replaced with its computed value.
 
     Raises:
-        ValueError: in case of invalid values
-            for `input_shape` or `input_shape`.
+      ValueError: If the total array size of the output_shape is
+      different than the input_shape, or more than one unknown dimension
+      is specified.
     """
     output_shape = list(output_shape)
     msg = 'total size of new array must be unchanged'
@@ -453,26 +480,26 @@ class Permute(Layer):
   Example:
 
   ```python
-      model = Sequential()
-      model.add(Permute((2, 1), input_shape=(10, 64)))
-      # now: model.output_shape == (None, 64, 10)
-      # note: `None` is the batch dimension
+  model = Sequential()
+  model.add(Permute((2, 1), input_shape=(10, 64)))
+  # now: model.output_shape == (None, 64, 10)
+  # note: `None` is the batch dimension
   ```
 
   Arguments:
-      dims: Tuple of integers. Permutation pattern, does not include the
-          samples dimension. Indexing starts at 1.
-          For instance, `(2, 1)` permutes the first and second dimensions
-          of the input.
+    dims: Tuple of integers. Permutation pattern, does not include the
+      samples dimension. Indexing starts at 1.
+      For instance, `(2, 1)` permutes the first and second dimensions
+      of the input.
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same as the input shape, but with the dimensions re-ordered according
-      to the specified pattern.
+    Same as the input shape, but with the dimensions re-ordered according
+    to the specified pattern.
   """
 
   def __init__(self, dims, **kwargs):
@@ -510,27 +537,27 @@ class Flatten(Layer):
   adds an extra channel dimension and output shapes are `(batch, 1)`.
 
   Arguments:
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, ..., channels)` while `channels_first` corresponds to
-          inputs with shape `(batch, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Example:
 
   ```python
-      model = Sequential()
-      model.add(Convolution2D(64, 3, 3,
-                              border_mode='same',
-                              input_shape=(3, 32, 32)))
-      # now: model.output_shape == (None, 64, 32, 32)
-
-      model.add(Flatten())
-      # now: model.output_shape == (None, 65536)
+  model = Sequential()
+  model.add(Convolution2D(64, 3, 3,
+                          border_mode='same',
+                          input_shape=(3, 32, 32)))
+  # now: model.output_shape == (None, 64, 32, 32)
+
+  model.add(Flatten())
+  # now: model.output_shape == (None, 65536)
   ```
   """
 
@@ -579,23 +606,23 @@ class RepeatVector(Layer):
   Example:
 
   ```python
-      model = Sequential()
-      model.add(Dense(32, input_dim=32))
-      # now: model.output_shape == (None, 32)
-      # note: `None` is the batch dimension
+  model = Sequential()
+  model.add(Dense(32, input_dim=32))
+  # now: model.output_shape == (None, 32)
+  # note: `None` is the batch dimension
 
-      model.add(RepeatVector(3))
-      # now: model.output_shape == (None, 3, 32)
+  model.add(RepeatVector(3))
+  # now: model.output_shape == (None, 3, 32)
   ```
 
   Arguments:
-      n: integer, repetition factor.
+    n: Integer, repetition factor.
 
   Input shape:
-      2D tensor of shape `(num_samples, features)`.
+    2D tensor of shape `(num_samples, features)`.
 
   Output shape:
-      3D tensor of shape `(num_samples, n, features)`.
+    3D tensor of shape `(num_samples, n, features)`.
   """
 
   def __init__(self, n, **kwargs):
@@ -618,53 +645,83 @@ class RepeatVector(Layer):
 
 @keras_export('keras.layers.Lambda')
 class Lambda(Layer):
-  """Wraps arbitrary expression as a `Layer` object.
+  """Wraps arbitrary expressions as a `Layer` object.
+
+  The `Lambda` layer exists so that aribtrary TensorFlow functions
+  can be used when constructing `Sequential` and Functional API
+  models. `Lambda` layers are best suited for simple operations or
+  quick experimentation. For more advanced use cases, subclassing
+  `keras.layers.Layer` is preferred. One reason for this is that
+  when saving a Model, `Lambda` layers are saved by serializing the
+  Python bytecode, whereas subclassed Layers are saved via overriding
+  their `get_config` method and are thus more portable. Models that rely
+  on subclassed Layers are also often easier to visualize and reason
+  about.
 
   Examples:
 
   ```python
-      # add a x -> x^2 layer
-      model.add(Lambda(lambda x: x ** 2))
+  # add a x -> x^2 layer
+  model.add(Lambda(lambda x: x ** 2))
   ```
   ```python
-      # add a layer that returns the concatenation
-      # of the positive part of the input and
-      # the opposite of the negative part
-
-      def antirectifier(x):
-          x -= K.mean(x, axis=1, keepdims=True)
-          x = K.l2_normalize(x, axis=1)
-          pos = K.relu(x)
-          neg = K.relu(-x)
-          return K.concatenate([pos, neg], axis=1)
-
-      model.add(Lambda(antirectifier))
+  # add a layer that returns the concatenation
+  # of the positive part of the input and
+  # the opposite of the negative part
+
+  def antirectifier(x):
+      x -= K.mean(x, axis=1, keepdims=True)
+      x = K.l2_normalize(x, axis=1)
+      pos = K.relu(x)
+      neg = K.relu(-x)
+      return K.concatenate([pos, neg], axis=1)
+
+  model.add(Lambda(antirectifier))
   ```
 
-  Arguments:
-      function: The function to be evaluated.
-          Takes input tensor as first argument.
-      output_shape: Expected output shape from function.
-            This argument can be inferred if not explicitly provided.
-            Can be a tuple or function.
-            If a tuple, it only specifies the first dimension onward;
-                 sample dimension is assumed either the same as the input:
-                 `output_shape = (input_shape[0], ) + output_shape`
-                 or, the input is `None` and
-                 the sample dimension is also `None`:
-                 `output_shape = (None, ) + output_shape`
-            If a function, it specifies the entire shape as a function of the
-            input shape: `output_shape = f(input_shape)`
-      arguments: optional dictionary of keyword arguments to be passed
-            to the function.
+  Variables can be created within a `Lambda` layer. Like with
+  other layers, these variables will be created only once and reused
+  if the `Lambda` layer is called on new inputs. If creating more
+  than one variable in a given `Lambda` instance, be sure to use
+  a different name for each variable. Note that calling sublayers
+  from within a `Lambda` is not supported.
 
-  Input shape:
-      Arbitrary. Use the keyword argument input_shape
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+  Example of variable creation:
 
-  Output shape:
-      Specified by `output_shape` argument
+  ```python
+  def linear_transform(x):
+    v1 = tf.Variable(1., name='multiplier')
+    v2 = tf.Variable(0., name='bias')
+    return x*v1 + v2
+
+  linear_layer = Lambda(linear_transform)
+  model.add(linear_layer)
+  model.add(keras.layers.Dense(10, activation='relu'))
+  model.add(linear_layer)  # Reuses existing Variables
+  ```
+
+  Note that creating two instances of `Lambda` using the same function
+  will *not* share Variables between the two instances. Each instance of
+  `Lambda` will create and manage its own weights.
+
+  Arguments:
+    function: The function to be evaluated. Takes input tensor as first
+      argument.
+    output_shape: Expected output shape from function. This argument can be
+      inferred if not explicitly provided. Can be a tuple or function. If a
+      tuple, it only specifies the first dimension onward;
+      sample dimension is assumed either the same as the input: `output_shape =
+        (input_shape[0], ) + output_shape` or, the input is `None` and
+      the sample dimension is also `None`: `output_shape = (None, ) +
+        output_shape` If a function, it specifies the entire shape as a function
+        of the
+      input shape: `output_shape = f(input_shape)`
+    arguments: Optional dictionary of keyword arguments to be passed to the
+      function.
+  Input shape: Arbitrary. Use the keyword argument input_shape (tuple of
+    integers, does not include the samples axis) when using this layer as the
+    first layer in a model.
+  Output shape: Specified by `output_shape` argument
   """
 
   def __init__(self, function, output_shape=None, mask=None, arguments=None,
@@ -675,72 +732,60 @@ class Lambda(Layer):
     if mask is not None:
       self.supports_masking = True
     self.mask = mask
-    if (output_shape is not None and not isinstance(output_shape,
-                                                    (tuple, list)) and
-        not callable(output_shape)):
-      raise TypeError('In Lambda, `output_shape` '
-                      'must be a list, a tuple, or a function.')
-    # Convert a list representing a single shape into a tuple.
-    if (isinstance(output_shape, list) and isinstance(output_shape[0],
-                                                      (int, type(None)))):
-      output_shape = tuple(output_shape)
     self._output_shape = output_shape
+    self._variable_dict = {}
+    # These attributes are inherited from `Layer`.
+    self._trainable_weights = []
+    self._non_trainable_weights = []
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self._output_shape is None:
-      if context.executing_eagerly():
-        # Make use of existing autocomputation for Eager mode but provide
-        # Lambda-specific error message.
+      # Make use of existing autocomputation but provide Lambda-specific
+      # error message. This is always safe to run even whn the outer context
+      # is Graph mode because Lambda layers don't have side effects such as
+      # `add_loss`.
+      with context.eager_mode():
         try:
           return super(Lambda, self).compute_output_shape(input_shape)
         except NotImplementedError:
-          raise NotImplementedError('We could not automatically infer '
-                                    'the static shape of the Lambda\'s output.'
-                                    ' Please specify the `output_shape` for'
-                                    ' this Lambda.')
-      if isinstance(input_shape, list):
-        x = [K.placeholder(shape=shape) for shape in input_shape]
-      else:
-        x = K.placeholder(shape=input_shape)
-      x = self.call(x)
-      if isinstance(x, list):
-        return [tensor_shape.TensorShape(K.int_shape(x_elem)) for x_elem in x]
-      else:
-        return tensor_shape.TensorShape(K.int_shape(x))
-    elif isinstance(self._output_shape, (tuple, list)):
-      if isinstance(input_shape, list):
-        num_samples = input_shape[0][0]
-      else:
-        num_samples = input_shape[0] if input_shape else None
-      # List here represents multiple outputs.
-      if isinstance(self._output_shape, list):
-        return [
-            tensor_shape.TensorShape((num_samples,) + tuple(single_shape))
-            for single_shape in self._output_shape
-        ]
-      return tensor_shape.TensorShape((num_samples,) + self._output_shape)
-    else:
-      shape = self._output_shape(input_shape)
-      if not isinstance(shape, (list, tuple)):
-        raise ValueError(
-            '`output_shape` function must return a tuple or a list of tuples.')
-      # List here can represent multiple outputs or single output.
-      if isinstance(shape, list):
-        # Convert list representing single output into a tuple.
-        if isinstance(shape[0], (int, type(None))):
-          shape = tuple(shape)
-        else:
-          return [
-              tensor_shape.TensorShape(single_shape) for single_shape in shape
-          ]
-      return tensor_shape.TensorShape(shape)
+          raise NotImplementedError(
+              'We could not automatically infer the shape of the Lambda\'s '
+              'output. Please specify `output_shape` for this Lambda.')
+
+    if callable(self._output_shape):
+      output_shapes = self._output_shape(input_shape)
+      return tf_utils.convert_shapes(output_shapes, to_tuples=False)
+
+    # Output shapes are passed directly and don't include batch dimension.
+    input_tensor_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
+    batch_size = nest.flatten(input_tensor_shape)[0][0] if input_shape else None
+
+    def _add_batch(shape):
+      return tensor_shape.TensorShape([batch_size] + shape.as_list())
+
+    output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False)
+    return nest.map_structure(_add_batch, output_shapes)
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
     if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
-    return self.function(inputs, **arguments)
+    with variable_scope.variable_creator_scope(self._variable_creator):
+      return self.function(inputs, **arguments)
+
+  def _variable_creator(self, next_creator, **kwargs):
+    name = kwargs['name']
+    if name in self._variable_dict:
+      return self._variable_dict[name]
+    var = next_creator(**kwargs)
+    self._variable_dict[name] = var
+    if var.trainable:
+      self._trainable_weights.append(var)
+    else:
+      self._non_trainable_weights.append(var)
+    K.track_variable(var)
+    return var
 
   def compute_mask(self, inputs, mask=None):
     if callable(self.mask):
@@ -856,49 +901,49 @@ class Dense(Layer):
   created by the layer, and `bias` is a bias vector created by the layer
   (only applicable if `use_bias` is `True`).
 
-  Note: if the input to the layer has a rank greater than 2, then
+  Note: If the input to the layer has a rank greater than 2, then
   it is flattened prior to the initial dot product with `kernel`.
 
   Example:
 
   ```python
-      # as first layer in a sequential model:
-      model = Sequential()
-      model.add(Dense(32, input_shape=(16,)))
-      # now the model will take as input arrays of shape (*, 16)
-      # and output arrays of shape (*, 32)
-
-      # after the first layer, you don't need to specify
-      # the size of the input anymore:
-      model.add(Dense(32))
+  # as first layer in a sequential model:
+  model = Sequential()
+  model.add(Dense(32, input_shape=(16,)))
+  # now the model will take as input arrays of shape (*, 16)
+  # and output arrays of shape (*, 32)
+
+  # after the first layer, you don't need to specify
+  # the size of the input anymore:
+  model.add(Dense(32))
   ```
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
-      nD tensor with shape: `(batch_size, ..., input_dim)`.
-      The most common situation would be
-      a 2D input with shape `(batch_size, input_dim)`.
+    N-D tensor with shape: `(batch_size, ..., input_dim)`.
+    The most common situation would be
+    a 2D input with shape `(batch_size, input_dim)`.
 
   Output shape:
-      nD tensor with shape: `(batch_size, ..., units)`.
-      For instance, for a 2D input with shape `(batch_size, input_dim)`,
-      the output would have shape `(batch_size, units)`.
+    N-D tensor with shape: `(batch_size, ..., units)`.
+    For instance, for a 2D input with shape `(batch_size, input_dim)`,
+    the output would have shape `(batch_size, units)`.
   """
 
   def __init__(self,
@@ -932,6 +977,10 @@ class Dense(Layer):
     self.input_spec = InputSpec(min_ndim=2)
 
   def build(self, input_shape):
+    dtype = dtypes.as_dtype(self.dtype or K.floatx())
+    if not (dtype.is_floating or dtype.is_complex):
+      raise TypeError('Unable to build `Dense` layer with non-floating point '
+                      'dtype %s' % (dtype,))
     input_shape = tensor_shape.TensorShape(input_shape)
     if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
@@ -972,6 +1021,11 @@ class Dense(Layer):
         output_shape = shape[:-1] + [self.units]
         outputs.set_shape(output_shape)
     else:
+      # Cast the inputs to self.dtype, which is the variable dtype. We do not
+      # cast if `should_cast_variables` is True, as in that case the variable
+      # will be automatically casted to inputs.dtype.
+      if not self._mixed_precision_policy.should_cast_variables:
+        inputs = math_ops.cast(inputs, self.dtype)
       outputs = gen_math_ops.mat_mul(inputs, self.kernel)
     if self.use_bias:
       outputs = nn.bias_add(outputs, self.bias)
@@ -1011,16 +1065,16 @@ class ActivityRegularization(Layer):
   """Layer that applies an update to the cost function based input activity.
 
   Arguments:
-      l1: L1 regularization factor (positive float).
-      l2: L2 regularization factor (positive float).
+    l1: L1 regularization factor (positive float).
+    l2: L2 regularization factor (positive float).
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
+    Same shape as input.
   """
 
   def __init__(self, l1=0., l2=0., **kwargs):
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 9df40f806fa2cd78699218298b6d31199ed126d6..92ddaa9ee9611a864d792abd40a63244baf02526 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -22,9 +22,12 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.mixed_precision.experimental import policy
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -170,6 +173,15 @@ class LambdaLayerTest(keras_parameterized.TestCase):
     output_shape = l.compute_output_shape([(10, 10), (10, 20)])
     self.assertAllEqual([(10, 10), (10, 20)], output_shape)
 
+  def test_lambda_output_shape_nested(self):
+
+    def lambda_fn(inputs):
+      return (inputs[1]['a'], {'b': inputs[0]})
+
+    l = keras.layers.Lambda(lambda_fn)
+    output_shape = l.compute_output_shape(((10, 20), {'a': (10, 5)}))
+    self.assertAllEqual(((10, 5), {'b': (10, 20)}), output_shape)
+
   def test_lambda_config_serialization(self):
     # Test serialization with output_shape and output_shape_type
     layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
@@ -181,6 +193,40 @@ class LambdaLayerTest(keras_parameterized.TestCase):
     })
     layer = keras.layers.Lambda.from_config(config)
 
+  def test_lambda_with_variable(self):
+
+    def fn(x):
+      return x * variables.Variable(2., name='multiplier')
+
+    layer = keras.layers.Lambda(fn)
+    for _ in range(10):
+      layer(np.ones((10, 10), 'float32'))
+    self.assertLen(layer.trainable_weights, 1)
+    self.assertEqual(layer.trainable_weights[0].name, 'lambda/multiplier:0')
+
+
+class TestStatefulLambda(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
+  def test_lambda_with_variable_in_model(self):
+
+    def lambda_fn(x):
+      # Variable will only get created once.
+      v = variables.Variable(1., trainable=True)
+      return x * v
+
+    model = testing_utils.get_model_from_layers(
+        [keras.layers.Lambda(lambda_fn)], input_shape=(10,))
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10), 'float32'), 2 * np.ones((10, 10), 'float32')
+    model.fit(x, y, batch_size=2, epochs=2, validation_data=(x, y))
+    self.assertLen(model.trainable_weights, 1)
+    self.assertAllClose(keras.backend.get_value(model.trainable_weights[0]), 2.)
+
 
 @keras_parameterized.run_all_keras_modes
 class CoreLayersTest(keras_parameterized.TestCase):
@@ -189,6 +235,13 @@ class CoreLayersTest(keras_parameterized.TestCase):
     testing_utils.layer_test(
         keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
 
+  def test_keras_mask(self):
+    x = np.ones((10, 10))
+    y = keras.layers.Masking(1.)(x)
+    self.assertTrue(hasattr(y, '_keras_mask'))
+    self.assertTrue(y._keras_mask is not None)
+    self.assertAllClose(self.evaluate(y._keras_mask), np.zeros((10,)))
+
   def test_activation(self):
     # with string argument
     testing_utils.layer_test(
@@ -283,6 +336,21 @@ class CoreLayersTest(keras_parameterized.TestCase):
     testing_utils.layer_test(
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
 
+  def test_dense_dtype(self):
+    inputs = ops.convert_to_tensor(
+        np.random.randint(low=0, high=7, size=(2, 2)))
+    layer = keras.layers.Dense(5, dtype='float32')
+    outputs = layer(inputs)
+    self.assertEqual(outputs.dtype, 'float32')
+
+  def test_dense_with_policy(self):
+    inputs = ops.convert_to_tensor(
+        np.random.randint(low=0, high=7, size=(2, 2)), dtype='float16')
+    layer = keras.layers.Dense(5, dtype=policy.Policy('infer_float32_vars'))
+    outputs = layer(inputs)
+    self.assertEqual(outputs.dtype, 'float16')
+    self.assertEqual(layer.kernel.dtype, 'float32')
+
   def test_dense_regularization(self):
     layer = keras.layers.Dense(
         3,
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index a74308f69cd6cbfccec1eb044c208149de214450..193447c08efba4ebd9aefded17d8a83c65d191ae 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -26,7 +26,7 @@ from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.engine.input_spec import InputSpec
-from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.keras.layers import recurrent_v2
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
@@ -275,7 +275,7 @@ class CuDNNGRU(_CuDNNRNN):
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
-    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
+    params = recurrent_v2._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, self.units:self.units * 2],
             self.kernel[:, :self.units],
@@ -470,7 +470,7 @@ class CuDNNLSTM(_CuDNNRNN):
     input_h = array_ops.expand_dims(input_h, axis=0)
     input_c = array_ops.expand_dims(input_c, axis=0)
 
-    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
+    params = recurrent_v2._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, :self.units],
             self.kernel[:, self.units:self.units * 2],
diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffa265a4afef78193debf79e0ecde1d2f12b1bb8
--- /dev/null
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -0,0 +1,254 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Attention layers that can be used in sequence DNN/CNN models.
+
+This file follows the terminology of https://arxiv.org/abs/1706.03762 Figure 2.
+Attention is formed by three tensors: Query, Key and Value.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+class BaseDenseAttention(Layer):
+  """Base Attention class for Dense networks.
+
+  This class is suitable for Dense or CNN networks, and not for RNN networks.
+
+  Implementations of attention mechanisms should inherit from this class, and
+  reuse the `apply_attention_scores()` method.
+
+  Call Arguments:
+
+    inputs: List of the following tensors:
+      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
+      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
+      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
+        given, will use `value` for both `key` and `value`, which is the
+        most common case.
+    mask: List of the following tensors:
+      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
+        If given, the output will be zero at the positions where
+        `mask==False`.
+      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+  Output shape:
+
+    Attention outputs of shape `[batch_size, Tq, dim]`.
+  """
+
+  def _calculate_scores(self, query, key):
+    """Calculates attention scores.
+
+    Args:
+      query: Query tensor of shape `[batch_size, Tq, dim]`.
+      key: Key tensor of shape `[batch_size, Tv, dim]`.
+    Returns:
+      Tensor of shape `[batch_size, Tq, Tv]`.
+    """
+    return NotImplementedError
+
+  def _apply_scores(self, scores, value, value_mask=None):
+    """Applies attention scores to the given value tensor.
+
+    To use this method in your attention layer, follow the steps:
+
+    * Use `query` tensor of shape `[batch_size, Tq]` and `key` tensor of shape
+      `[batch_size, Tv]` to calculate the attention `scores`.
+    * Pass `scores` and `value` tensors to this method. The method applies
+      `value_mask`, calculates `attention_distribution = softmax(scores)`, then
+      returns `matmul(attention_distribution, value).
+    * Apply `query_mask` and return the result.
+
+    Args:
+      scores: Scores float tensor of shape `[batch_size, Tq, Tv]`.
+      value: Value tensor of shape `[batch_size, Tv, dim]`.
+      value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+    Returns:
+      Tensor of shape `[batch_size, Tq, dim]`.
+    """
+    if value_mask is not None:
+      # Mask of shape [batch_size, 1, Tv] that is True in padding position.
+      padding_mask = array_ops.expand_dims(
+          math_ops.logical_not(value_mask), axis=1)
+      # Bias so padding positions do not contribute to attention distribution.
+      scores -= 1.e9 * math_ops.cast(padding_mask, dtype=K.floatx())
+    attention_distribution = nn.softmax(scores)
+    return math_ops.matmul(attention_distribution, value)
+
+  # TODO(b/125916026): Consider exposing a __call__ method with named args.
+  def call(self, inputs, mask=None):
+    self._validate_call_args(inputs=inputs, mask=mask)
+    q = inputs[0]
+    v = inputs[1]
+    k = inputs[2] if len(inputs) > 2 else v
+    q_mask = mask[0] if mask else None
+    v_mask = mask[1] if mask else None
+    # TODO(b/125916026): Support query_mask.
+    if q_mask is not None:
+      raise NotImplementedError('query_mask is not supported yet.')
+    scores = self._calculate_scores(query=q, key=k)
+    return self._apply_scores(scores=scores, value=v, value_mask=v_mask)
+
+  def _validate_call_args(self, inputs, mask):
+    """Validates arguments of the call method."""
+    class_name = self.__class__.__name__
+    if not isinstance(inputs, list):
+      raise ValueError(
+          '{} layer must be called on a list of inputs, namely [query, value] '
+          'or [query, value, key].'.format(class_name))
+    if len(inputs) < 2 or len(inputs) > 3:
+      raise ValueError(
+          '{} layer accepts inputs list of length 2 or 3, '
+          'namely [query, value] or [query, value, key]. '
+          'Given length: {}'.format(class_name, len(inputs)))
+    if mask:
+      if not isinstance(mask, list):
+        raise ValueError(
+            '{} layer mask must be a list, '
+            'namely [query_mask, value_mask].'.format(class_name))
+      if len(mask) != 2:
+        raise ValueError(
+            '{} layer mask must be a list of length 2, namely [query_mask, '
+            'value_mask]. Given length: {}'.format(class_name, len(mask)))
+
+
+class Attention(BaseDenseAttention):
+  """Dot-product attention layer, a.k.a. Luong-style attention.
+
+  Inputs are `query` tensor of shape `[batch_size, Tq]`, `value` tensor of shape
+  `[batch_size, Tv]` and `key` tensor of shape `[batch_size, Tv]`.
+  The calculation follows the steps:
+
+  1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
+     product: `scores = tf.matmul(query, key, transpose_b=True)`.
+  2. Use scores to calculate a distribution with shape
+     `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
+  3. Use `distribution` to create a linear combination of `value` with
+     shape `batch_size, Tq, dim]`:
+     `return tf.matmul(distribution, value)`.
+
+  Args:
+    scale: If `True`, will create a scalar variable to scale the attention
+      scores.
+
+  Call Arguments:
+
+    inputs: List of the following tensors:
+      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
+      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
+      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
+        given, will use `value` for both `key` and `value`, which is the
+        most common case.
+    mask: List of the following tensors:
+      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
+        If given, the output will be zero at the positions where
+        `mask==False`.
+      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+  Output shape:
+
+    Attention outputs of shape `[batch_size, Tq, dim]`.
+
+  The meaning of `query`, `value` and `key` depend on the application. In the
+  case of text similarity, for example, `query` is the sequence embeddings of
+  the first piece of text and `value` is the sequence embeddings of the second
+  piece of text. `key` is usually the same tensor as `value`.
+
+  Here is a code example for using `Attention` in a CNN+Attention network:
+
+  ```python
+  # Variable-length int sequences.
+  query_input = tf.keras.Input(shape=(None,), dtype='int32')
+  value_input = tf.keras.Input(shape=(None,), dtype='int32')
+
+  # Embedding lookup.
+  token_embedding = tf.keras.layers.Embedding(max_tokens, dimension)
+  # Query embeddings of shape [batch_size, Tq, dimension].
+  query_embeddings = token_embedding(query_input)
+  # Value embeddings of shape [batch_size, Tv, dimension].
+  value_embeddings = token_embedding(query_input)
+
+  # CNN layer.
+  cnn_layer = tf.keras.layers.Conv1D(
+      filters=100,
+      kernel_size=4,
+      # Use 'same' padding so outputs have the same shape as inputs.
+      padding='same')
+  # Query encoding of shape [batch_size, Tq, filters].
+  query_seq_encoding = cnn_layer(query_embeddings)
+  # Value encoding of shape [batch_size, Tv, filters].
+  value_seq_encoding = cnn_layer(value_embeddings)
+
+  # Query-value attention of shape [batch_size, Tq, filters].
+  query_value_attention_seq = tf.keras.layers.Attention()(
+      [query_seq_encoding, value_seq_encoding])
+
+  # Reduce over the sequence axis to produce encodings of shape
+  # [batch_size, filters].
+  query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
+      query_seq_encoding)
+  query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
+      query_value_attention_seq)
+
+  # Concatenate query and document encodings to produce a DNN input layer.
+  input_layer = tf.keras.layers.Concatenate()(
+      [query_encoding, query_value_attention])
+
+  # Add DNN layers, and create Model.
+  # ...
+  ```
+  """
+
+  def __init__(self, scale=False, **kwargs):
+    super(Attention, self).__init__(**kwargs)
+    # TODO(b/125916026): Support scale.
+    if scale:
+      raise NotImplementedError('scale=True is not supported yet.')
+    self.scale = scale
+
+  def build(self, input_shape):
+    """Creates scale variable if scale==True."""
+    # TODO(b/125916026): Create scale variable if self.scale is True.
+    self.scale_var = None
+    super(Attention, self).build(input_shape)
+
+  def _calculate_scores(self, query, key):
+    """Calculates attention scores as a query-key dot product.
+
+    Args:
+      query: Query tensor of shape `[batch_size, Tq, dim]`.
+      key: Key tensor of shape `[batch_size, Tv, dim]`.
+    Returns:
+      Tensor of shape `[batch_size, Tq, Tv]`.
+    """
+    scores = math_ops.matmul(query, key, transpose_b=True)
+    if self.scale_var is not None:
+      scores *= self.scale_var
+    return scores
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..adae877b72cd6a17915c89ff1c7587b27de10962
--- /dev/null
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -0,0 +1,329 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests dense attention layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.layers import dense_attention
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BaseDenseAttentionTest(test.TestCase):
+
+  def test_one_dim_with_mask(self):
+    # Scores tensor of shape [1, 1, 1]
+    scores = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 1, 1]
+    v = np.array([[[1.6]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 1]
+    v_mask = np.array([[True]], dtype=np.bool_)
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
+        scores=scores, value=v, value_mask=v_mask)
+
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = softmax(scores)[0, 0] * 1.6 = 1.6
+    expected = np.array([[[1.6]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_one_dim_no_mask(self):
+    # Scores tensor of shape [1, 1, 1]
+    scores = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 1, 1]
+    v = np.array([[[1.6]]], dtype=np.float32)
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
+        scores=scores, value=v)
+
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = softmax(scores)[0, 0] * 1.6 = 1.6
+    expected = np.array([[[1.6]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_with_mask(self):
+    # Scores tensor of shape [1, 1, 3]
+    scores = np.array([[[1., 0., 1.]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
+        scores=scores, value=v, value_mask=v_mask)
+
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000 = exp(1)/(exp(1) + exp(0)) = 0.73105857863
+    #    attention_distribution001 = exp(0)/(exp(1) + exp(0)) = 0.26894142137
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.73105857863 * 1.6 + 0.26894142137 * 0.7 - 0 * 0.8
+    #             = 1.35795272077
+    expected = np.array([[[1.35795272077]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_no_mask(self):
+    # Scores tensor of shape [1, 1, 3]
+    scores = np.array([[[1., 0., 1.]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
+        scores=scores, value=v)
+
+    # Expected attention distribution = softmax(scores).
+    # => attention_distribution000 = exp(1)/(exp(1) + exp(0) + exp(1))
+    #                              = 0.42231879825
+    #    attention_distribution001 = exp(0)/(exp(1) + exp(0) + exp(1))
+    #                              = 0.15536240349
+    #    attention_distribution002 = exp(1)/(exp(1) + exp(0) + exp(1))
+    #                              = 0.42231879825
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.42231879825 * 1.6 + 0.15536240349 * 0.7
+    #               - 0.42231879825 * 0.8
+    #             = 0.44660872104
+    expected = np.array([[[0.44660872104]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_one_dim_batch_size_two(self):
+    # Scores tensor of shape [2, 1, 1]
+    scores = np.array([[[1.1]], [[2.1]]], dtype=np.float32)
+    # Value tensor of shape [2, 1, 1]
+    v = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+    # Value mask tensor of shape [2, 1]
+    v_mask = np.array([[True], [True]], dtype=np.bool_)
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
+        scores=scores, value=v, value_mask=v_mask)
+
+    # Expected tensor of shape [2, 1, 1].
+    # expected000 = softmax(scores)[0, 0] * 1.6 = 1.6
+    # expected100 = softmax(scores)[1, 0] * 2.6 = 2.6
+    expected = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionTest(test.TestCase):
+
+  def test_calculate_scores_one_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Key tensor of shape [1, 1, 1]
+    k = np.array([[[1.6]]], dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 1.1*1.6 = 1.76
+    expected = np.array([[[1.76]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_multi_dim(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [1, 2, 3].
+    # expected000 = 1.*1.5+1.1*1.6+1.2*1.7+1.3*1.8 = 7.64
+    # expected001 = 1.*2.5+1.1*2.6+1.2*2.7+1.3*2.8 = 12.24
+    # expected002 = 1.*3.5+1.1*3.6+1.2*3.7+1.3*3.8 = 16.84
+    # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24
+    # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84
+    # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44
+    expected = np.array(
+        [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_one_dim_batch_size_two(self):
+    # Query tensor of shape [2, 1, 1]
+    q = np.array([[[1.1]], [[2.1]]], dtype=np.float32)
+    # Key tensor of shape [2, 1, 1]
+    k = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [2, 1, 1].
+    # expected000 = 1.1*1.6 = 1.76
+    # expected100 = 2.1*2.6 = 5.46
+    expected = np.array([[[1.76]], [[5.46]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_shape(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_shape_with_key(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_multi_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77))
+    #                              = 0.72908792234
+    #    attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77))
+    #                              = 0.27091207765
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8
+    #             = 1.3561791301
+    expected = np.array([[[1.3561791301]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_with_key(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 1]
+    k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77))
+    #                              = 0.72908792234
+    #    attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77))
+    #                              = 0.27091207765
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.72908792234 * 0.5 + 0.27091207765 * 0.8 - 0 * 0.3
+    #             = 0.58127362329
+    expected = np.array([[[0.58127362329]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_scale_not_implemented(self):
+    with self.assertRaisesRegexp(
+        NotImplementedError, 'scale=True is not supported yet'):
+      dense_attention.Attention(scale=True)
+
+  def test_query_mask_not_implemented(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        NotImplementedError, 'query_mask is not supported yet'):
+      attention_layer([q, q], mask=[mask, mask])
+
+  def test_inputs_not_list(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer must be called on a list of inputs'):
+      attention_layer(q)
+
+  def test_inputs_too_short(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Attention layer accepts inputs list of length 2 or 3'):
+      attention_layer([q])
+
+  def test_inputs_too_long(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Attention layer accepts inputs list of length 2 or 3'):
+      attention_layer([q, q, q, q])
+
+  def test_mask_not_list(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list'):
+      attention_layer([q, q], mask=mask)
+
+  def test_mask_too_short(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list of length 2'):
+      attention_layer([q, q], mask=[mask])
+
+  def test_mask_too_long(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list of length 2'):
+      attention_layer([q, q], mask=[mask, mask, mask])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index df5e82c2459b4c1beb1c5b74a7048be022144535..85285db20aaede453c4b98754eefd48e1ed112fd 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -35,58 +35,57 @@ from tensorflow.python.util.tf_export import keras_export
 class Embedding(Layer):
   """Turns positive integers (indexes) into dense vectors of fixed size.
 
-  eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
+  e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`
 
   This layer can only be used as the first layer in a model.
 
   Example:
 
   ```python
-    model = Sequential()
-    model.add(Embedding(1000, 64, input_length=10))
-    # the model will take as input an integer matrix of size (batch,
-    # input_length).
-    # the largest integer (i.e. word index) in the input should be no larger
-    # than 999 (vocabulary size).
-    # now model.output_shape == (None, 10, 64), where None is the batch
-    # dimension.
-
-    input_array = np.random.randint(1000, size=(32, 10))
-
-    model.compile('rmsprop', 'mse')
-    output_array = model.predict(input_array)
-    assert output_array.shape == (32, 10, 64)
+  model = Sequential()
+  model.add(Embedding(1000, 64, input_length=10))
+  # the model will take as input an integer matrix of size (batch,
+  # input_length).
+  # the largest integer (i.e. word index) in the input should be no larger
+  # than 999 (vocabulary size).
+  # now model.output_shape == (None, 10, 64), where None is the batch
+  # dimension.
+
+  input_array = np.random.randint(1000, size=(32, 10))
+
+  model.compile('rmsprop', 'mse')
+  output_array = model.predict(input_array)
+  assert output_array.shape == (32, 10, 64)
   ```
 
   Arguments:
     input_dim: int > 0. Size of the vocabulary,
-        i.e. maximum integer index + 1.
+      i.e. maximum integer index + 1.
     output_dim: int >= 0. Dimension of the dense embedding.
     embeddings_initializer: Initializer for the `embeddings` matrix.
     embeddings_regularizer: Regularizer function applied to
-        the `embeddings` matrix.
+      the `embeddings` matrix.
     embeddings_constraint: Constraint function applied to
-        the `embeddings` matrix.
+      the `embeddings` matrix.
     mask_zero: Whether or not the input value 0 is a special "padding"
-        value that should be masked out.
-        This is useful when using recurrent layers
-        which may take variable length input.
-        If this is `True` then all subsequent layers
-        in the model need to support masking or an exception will be raised.
-        If mask_zero is set to True, as a consequence, index 0 cannot be
-        used in the vocabulary (input_dim should equal size of
-        vocabulary + 1).
+      value that should be masked out.
+      This is useful when using recurrent layers
+      which may take variable length input.
+      If this is `True` then all subsequent layers
+      in the model need to support masking or an exception will be raised.
+      If mask_zero is set to True, as a consequence, index 0 cannot be
+      used in the vocabulary (input_dim should equal size of
+      vocabulary + 1).
     input_length: Length of input sequences, when it is constant.
-        This argument is required if you are going to connect
-        `Flatten` then `Dense` layers upstream
-        (without it, the shape of the dense outputs cannot be computed).
+      This argument is required if you are going to connect
+      `Flatten` then `Dense` layers upstream
+      (without it, the shape of the dense outputs cannot be computed).
 
   Input shape:
-      2D tensor with shape: `(batch_size, input_length)`.
+    2D tensor with shape: `(batch_size, input_length)`.
 
   Output shape:
-      3D tensor with shape: `(batch_size, input_length, output_dim)`.
-
+    3D tensor with shape: `(batch_size, input_length, output_dim)`.
   """
 
   def __init__(self,
@@ -181,10 +180,8 @@ class Embedding(Layer):
 
   def get_config(self):
     config = {
-        'input_dim':
-            self.input_dim,
-        'output_dim':
-            self.output_dim,
+        'input_dim': self.input_dim,
+        'output_dim': self.output_dim,
         'embeddings_initializer':
             initializers.serialize(self.embeddings_initializer),
         'embeddings_regularizer':
@@ -193,10 +190,8 @@ class Embedding(Layer):
             regularizers.serialize(self.activity_regularizer),
         'embeddings_constraint':
             constraints.serialize(self.embeddings_constraint),
-        'mask_zero':
-            self.mask_zero,
-        'input_length':
-            self.input_length
+        'mask_zero': self.mask_zero,
+        'input_length': self.input_length
     }
     base_config = super(Embedding, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/gru_v2_test.py
similarity index 84%
rename from tensorflow/python/keras/layers/unified_gru_test.py
rename to tensorflow/python/keras/layers/gru_v2_test.py
index 11322764ac2bd4028ba50667a9150b34dff659b9..8f241f4e19f0da3b1bcf13a1145673766c25a45e 100644
--- a/tensorflow/python/keras/layers/unified_gru_test.py
+++ b/tensorflow/python/keras/layers/gru_v2_test.py
@@ -34,6 +34,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -47,16 +49,14 @@ from tensorflow.python.training import gradient_descent
 
 # Global config for grappler setting that is used for graph mode test.
 _rewrites = rewriter_config_pb2.RewriterConfig()
-_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-_customer_optimizer = _rewrites.custom_optimizers.add()
-_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
 _rewrites.min_graph_nodes = -1
 _graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
 _config = config_pb2.ConfigProto(graph_options=_graph_options)
 
 
 @keras_parameterized.run_all_keras_modes(config=_config)
-class UnifiedGRUTest(keras_parameterized.TestCase):
+class GRUV2Test(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True),
@@ -69,13 +69,13 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
   def test_could_use_defun_backend(self, activation, recurrent_activation,
                                    recurrent_dropout, unroll, use_bias,
                                    reset_after):
-    layer = keras.layers.UnifiedGRU(1,
-                                    activation=activation,
-                                    recurrent_activation=recurrent_activation,
-                                    recurrent_dropout=recurrent_dropout,
-                                    unroll=unroll,
-                                    use_bias=use_bias,
-                                    reset_after=reset_after)
+    layer = rnn.GRU(1,
+                    activation=activation,
+                    recurrent_activation=recurrent_activation,
+                    recurrent_dropout=recurrent_dropout,
+                    unroll=unroll,
+                    use_bias=use_bias,
+                    reset_after=reset_after)
     self.assertFalse(layer.could_use_cudnn)
 
   def test_keras_model_with_gru(self):
@@ -93,7 +93,7 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
         num_classes=output_shape)
     y_train = keras.utils.to_categorical(y_train, output_shape)
 
-    layer = keras.layers.UnifiedGRU(rnn_state_size)
+    layer = rnn.GRU(rnn_state_size)
 
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
@@ -110,7 +110,7 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer = keras.layers.UnifiedGRU(units, input_shape=(None, embedding_dim))
+    layer = rnn.GRU(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
     model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
@@ -123,15 +123,15 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
-    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(10, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_GRU(self):
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     for stateful in (False, True):
       l1 = layer_class(units=1, stateful=stateful)
       l2 = layer_class.from_config(l1.get_config())
@@ -154,9 +154,9 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
 
       inputs = keras.layers.Input(
           shape=[timestep, input_shape], dtype=dtypes.float32)
-      gru_layer = keras.layers.GRU(rnn_state_size,
-                                   recurrent_activation='sigmoid',
-                                   reset_after=True)
+      gru_layer = rnn_v1.GRU(rnn_state_size,
+                             recurrent_activation='sigmoid',
+                             reset_after=True)
       output = gru_layer(inputs)
       gru_model = keras.models.Model(inputs, output)
       weights = gru_model.get_weights()
@@ -166,9 +166,9 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
       y_2 = gru_model.predict(x_train)
 
       with test_util.device(use_gpu=True):
-        cudnn_layer = keras.layers.UnifiedGRU(rnn_state_size,
-                                              recurrent_activation='sigmoid',
-                                              reset_after=True)
+        cudnn_layer = rnn.GRU(rnn_state_size,
+                              recurrent_activation='sigmoid',
+                              reset_after=True)
         cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
       cudnn_model.set_weights(weights)
       y_3 = cudnn_model.predict(x_train)
@@ -176,8 +176,8 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
       cudnn_model.fit(x_train, y_train)
       y_4 = cudnn_model.predict(x_train)
 
-      self.assertAllClose(y_1, y_3, rtol=1e-5, atol=1e-5)
-      self.assertAllClose(y_2, y_4, rtol=1e-5, atol=1e-5)
+      self.assertAllClose(y_1, y_3, rtol=2e-5, atol=2e-5)
+      self.assertAllClose(y_2, y_4, rtol=2e-5, atol=2e-5)
 
   @parameterized.named_parameters(
       # test_name, use_bias, bias_initializer, activation
@@ -200,7 +200,7 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     def build_model():
       inputs = keras.layers.Input(
           shape=[timestep, input_dim], dtype=dtypes.float32)
-      layer = keras.layers.UnifiedGRU(
+      layer = rnn.GRU(
           units,
           use_bias=use_bias,
           bias_initializer=bias_initializer)
@@ -229,14 +229,14 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
     with test_util.device(use_gpu=False):
-      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      layer = rnn.GRU(rnn_state_size)
       output = layer(inputs)
       cpu_model = keras.models.Model(inputs, output)
       weights = cpu_model.get_weights()
       y_1 = cpu_model.predict(x_train)
 
     with test_util.device(use_gpu=True):
-      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      layer = rnn.GRU(rnn_state_size)
       output = layer(inputs)
       gpu_model = keras.models.Model(inputs, output)
       gpu_model.set_weights(weights)
@@ -246,9 +246,9 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     # 'sigmoid' as default. Construct the canonical GRU with sigmoid to achieve
     # the same output.
     with test_util.device(use_gpu=True):
-      layer = keras.layers.GRU(rnn_state_size,
-                               recurrent_activation='sigmoid',
-                               reset_after=True)
+      layer = rnn_v1.GRU(rnn_state_size,
+                         recurrent_activation='sigmoid',
+                         reset_after=True)
       output = layer(inputs)
       canonical_model = keras.models.Model(inputs, output)
       canonical_model.set_weights(weights)
@@ -291,18 +291,18 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
         outputs = layer(inputs)
       return keras.models.Model(inputs, outputs)
 
-    gru_model = build_model(keras.layers.GRU)
+    gru_model = build_model(rnn_v1.GRU)
     y_ref = gru_model.predict(x_train)
     weights = gru_model.get_weights()
 
-    unified_gru_model = build_model(keras.layers.UnifiedGRU)
+    unified_gru_model = build_model(rnn.GRU)
     unified_gru_model.set_weights(weights)
     y = unified_gru_model.predict(x_train)
 
     self.assertAllClose(y, y_ref)
 
   def test_with_masking_layer_GRU(self):
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
@@ -319,8 +319,8 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(10, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
@@ -332,18 +332,33 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_return_states_GRU(self):
+    layer_class = rnn.GRU
+    x = np.random.random((2, 3, 4))
+    y = np.abs(np.random.random((2, 5)))
+    s = np.abs(np.random.random((2, 5)))
+    inputs = keras.layers.Input(
+        shape=[3, 4], dtype=dtypes.float32)
+    masked = keras.layers.Masking()(inputs)
+    outputs, states = layer_class(units=5, return_state=True)(masked)
+
+    model = keras.models.Model(inputs, [outputs, states])
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+    model.fit(x, [y, s], epochs=1, batch_size=2, verbose=1)
+
   def test_dropout_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'dropout': 0.1,
                 'recurrent_dropout': 0.1},
@@ -351,7 +366,7 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
 
   def test_constraints_GRU(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     k_constraint = keras.constraints.max_norm(0.01)
     r_constraint = keras.constraints.max_norm(0.01)
     b_constraint = keras.constraints.max_norm(0.01)
@@ -375,14 +390,14 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'implementation': implementation_mode},
         input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_regularizers_GRU(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     layer = layer_class(
         5,
         return_sequences=False,
@@ -407,7 +422,7 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     model = keras.models.Sequential()
     model.add(
         keras.layers.Embedding(
@@ -464,6 +479,27 @@ class UnifiedGRUTest(keras_parameterized.TestCase):
 
     np.testing.assert_allclose(out7, out6, atol=1e-5)
 
+  def test_stateful_GRU_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        rnn.GRU(units, return_sequences=True, stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
+
 
 class GRULayerGradientTapeTest(test.TestCase):
 
@@ -475,11 +511,11 @@ class GRULayerGradientTapeTest(test.TestCase):
     embedding_size = 11
     gru_unit_size = 12
 
-    gru = keras.layers.UnifiedGRU(gru_unit_size,
-                                  return_sequences=True,
-                                  return_state=True,
-                                  recurrent_activation='sigmoid',
-                                  recurrent_initializer='glorot_uniform')
+    gru = rnn.GRU(gru_unit_size,
+                  return_sequences=True,
+                  return_state=True,
+                  recurrent_activation='sigmoid',
+                  recurrent_initializer='glorot_uniform')
 
     x = random_ops.random_uniform([1, time_steps, embedding_size])
     y = random_ops.random_uniform([1, gru_unit_size])
@@ -513,7 +549,7 @@ class GRULayerGraphOnlyTest(test.TestCase):
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+      layer = rnn.GRU(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -562,7 +598,7 @@ class GRULayerGraphOnlyTest(test.TestCase):
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+      layer = rnn.GRU(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
diff --git a/tensorflow/python/keras/layers/kernelized.py b/tensorflow/python/keras/layers/kernelized.py
new file mode 100644
index 0000000000000000000000000000000000000000..9753fc66de9ad98b831b225974db180e6f5737d1
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized.py
@@ -0,0 +1,258 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras layers that implement explicit (approximate) kernel feature maps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import input_spec
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+
+_SUPPORTED_RBF_KERNEL_TYPES = ['gaussian', 'laplacian']
+
+
+class RandomFourierFeatures(base_layer.Layer):
+  r"""Layer that maps its inputs using random Fourier features.
+
+  This layer implements a feature map \\(\phi: \mathbb{R}^d \rightarrow
+  \mathbb{R}^D\\) which approximates shift-invariant kernels. A kernel function
+  K(x, y) defined over \\(\mathbb{R}^d x \mathbb{R}^d\\) is shift-invariant if
+  K(x, y) = k(x-y) for some function defined over \\(\mathbb{R}^d\\). Many
+  popular Radial Basis Functions (in short RBF), including gaussian and
+  laplacian kernels are shift-invariant.
+
+  The layer approximates a (shift invariant) kernel K in the following sense:
+    up to a scaling factor, for all inputs \\(x, y \in \mathbb{R}^d\\)
+        \\(\phi(x)^T \cdot \phi(y) \approx K(x, y)\\)
+
+  The implementation of this layer is based on the following paper:
+  "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
+  (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
+
+  The distribution from which the parameters of the random features map (layer)
+  are sampled, determines which shift-invariant kernel the layer approximates
+  (see paper for more details). The users can use the distribution of their
+  choice. Due to their popularity, the layer supports the out-of-the-box
+  approximation of the following RBF kernels:
+  - Gaussian: \\(K(x, y) = e^{-\frac{\|x-y\|_2^2}{2 \cdot scale^2}}\\)
+  - Laplacian: \\(K(x, y) = e^{-\frac{\|x-y\|_1}{scale}}\\)
+
+  NOTE: Unlike the map described in the paper and the scikit-learn
+  implementation, the output of this layer does not apply the sqrt(2/D)
+  normalization factor.
+
+  Usage for ML: Typically, this layer is used to "kernelize" linear models by
+  applying a non-linear transformation (this layer) to the input features and
+  then training a linear model on top of the transformed features. Depending on
+  the loss function of the linear model, the composition of this layer and the
+  linear model results to models that are equivalent (up to approximation) to
+  kernel SVMs (for hinge loss), kernel logistic regression (for logistic loss),
+  kernel linear regression (for squared loss) etc.
+
+  Example of building a kernel multinomial logistic regression model with
+  Gaussian kernel in keras:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer='gaussian',
+      scale=5.0,
+      ...)
+
+  model = tf.keras.models.Sequential()
+  model.add(random_features_layer)
+  model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax')
+
+  model.compile(elif isinstance(identifier, six.string_types):
+    loss=tf.keras.losses.categorical_crossentropy, optimizer=..., metrics=...)
+  ```
+
+  To use another kernel, replace the layer creation command with:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer=<my_initializer>,
+      scale=...,
+      ...)
+  ```
+
+  Arguments:
+    output_dim: Positive integer, the dimension of the layer's output, i.e., the
+      number of random features used to approximate the kernel.
+    kernel_initializer: Determines the distribution of the parameters of the
+      random features map (and therefore the kernel approximated by the layer).
+      It can be either a string or an instance of TensorFlow's Initializer
+      class. Currently only 'gaussian' and 'laplacian' are supported as string
+      initializers (case insensitive). Note that these parameters are not
+      trainable.
+    scale: For gaussian and laplacian kernels, this corresponds to a scaling
+      factor of the corresponding kernel approximated by the layer (see concrete
+      definitions above). When provided, it should be a positive float. If None,
+      the implementation chooses a default value (1.0 typically). Both the
+      approximation error of the kernel and the classification quality are
+      sensitive to this parameter. If trainable is set to True, this paramater
+      is learned end-to-end during training and the provided value serves as an
+      initialization value.
+      NOTE: When this layer is used to map the initial features and then the
+        transformed features are fed to a linear model, by making `scale`
+        trainable, the resulting optimization problem is no longer convex (even
+        if the loss function used by the linear model is convex).
+    trainable: Whether the scaling parameter of th layer is trainable. Defaults
+      to False.
+    name: name for the RandomFourierFeatures layer.
+
+  Raises:
+    ValueError: if output_dim or stddev are not positive or if the provided
+      kernel_initializer is not supported.
+  """
+
+  def __init__(self,
+               output_dim,
+               kernel_initializer='gaussian',
+               scale=None,
+               trainable=False,
+               name=None,
+               **kwargs):
+    if output_dim <= 0:
+      raise ValueError(
+          '`output_dim` should be a positive integer. Given: {}.'.format(
+              output_dim))
+    if isinstance(kernel_initializer, six.string_types):
+      if kernel_initializer.lower() not in _SUPPORTED_RBF_KERNEL_TYPES:
+        raise ValueError(
+            'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'
+            .format(kernel_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+    if scale is not None and scale <= 0.0:
+      raise ValueError('When provided, `scale` should be a positive float. '
+                       'Given: {}.'.format(scale))
+    super(RandomFourierFeatures, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self.output_dim = output_dim
+    self.kernel_initializer = kernel_initializer
+    self.scale = scale
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    # TODO(sibyl-vie3Poto): Allow higher dimension inputs. Currently the input is expected
+    # to have shape [batch_size, dimension].
+    if input_shape.rank != 2:
+      raise ValueError(
+          'The rank of the input tensor should be 2. Got {} instead.'.format(
+              input_shape.ndims))
+    if input_shape.dims[1].value is None:
+      raise ValueError(
+          'The last dimension of the inputs to `RandomFourierFeatures` '
+          'should be defined. Found `None`.')
+    self.input_spec = input_spec.InputSpec(
+        ndim=2, axes={1: input_shape.dims[1].value})
+    input_dim = input_shape.dims[1].value
+
+    kernel_initializer = _get_random_features_initializer(
+        self.kernel_initializer, shape=(input_dim, self.output_dim))
+
+    unscaled_kernel = self.add_weight(
+        name='unscaled_random_features',
+        shape=(input_dim, self.output_dim),
+        dtype=dtypes.float32,
+        initializer=kernel_initializer,
+        trainable=False)
+
+    self.bias = self.add_weight(
+        name='random_features_bias',
+        shape=(self.output_dim,),
+        dtype=dtypes.float32,
+        initializer=init_ops.random_uniform_initializer(
+            minval=0.0, maxval=2 * np.pi, dtype=dtypes.float32),
+        trainable=False)
+
+    if self.scale is None:
+      self.scale = _get_default_scale(self.kernel_initializer, input_dim)
+    scale = self.add_weight(
+        name='random_features_scale',
+        shape=(1,),
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(self.scale),
+        trainable=True,
+        constraint='NonNeg')
+    self.kernel = (1.0 / scale) * unscaled_kernel
+    super(RandomFourierFeatures, self).build(input_shape)
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    inputs = gen_math_ops.cast(inputs, dtypes.float32)
+    outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    outputs = nn.bias_add(outputs, self.bias)
+    return gen_math_ops.cos(outputs)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank(2)
+    if input_shape.dims[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input shape must be defined. Given: %s' %
+          input_shape)
+    return input_shape[:-1].concatenate(self.output_dim)
+
+  def get_config(self):
+    kernel_initializer = self.kernel_initializer
+    if isinstance(self.kernel_initializer, init_ops.Initializer):
+      kernel_initializer = initializers.serialize(self.kernel_initializer)
+    config = {
+        'output_dim': self.output_dim,
+        'kernel_initializer': kernel_initializer,
+        'scale': self.scale,
+    }
+    base_config = super(RandomFourierFeatures, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def _get_random_features_initializer(initializer, shape):
+  """Returns Initializer object for random features."""
+
+  def _get_cauchy_samples(loc, scale, shape):
+    probs = np.random.uniform(low=0., high=1., size=shape)
+    return loc + scale * np.tan(np.pi * (probs - 0.5))
+
+  random_features_initializer = initializer
+  if isinstance(initializer, six.string_types):
+    if initializer.lower() == 'gaussian':
+      random_features_initializer = init_ops.random_normal_initializer(
+          stddev=1.0)
+    elif initializer.lower() == 'laplacian':
+      random_features_initializer = init_ops.constant_initializer(
+          _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape))
+
+    else:
+      raise ValueError(
+          'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'.format(
+              random_features_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+  return random_features_initializer
+
+
+def _get_default_scale(initializer, input_dim):
+  if (isinstance(initializer, six.string_types) and
+      initializer.lower() == 'gaussian'):
+    return np.sqrt(input_dim / 2.0)
+  return 1.0
diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc6e8b144a6ab79ff6b0d0fe936683a5478b9e3
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized_test.py
@@ -0,0 +1,391 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as keras_backend
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.layers import kernelized as kernel_layers
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual, atol=0.001):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        self.assertAllClose(expected, actual, atol=atol)
+    else:
+      self.assertAllClose(expected, actual, atol=atol)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_output_dim(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`output_dim` should be a positive integer. Given: -3.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_unsupported_kernel_type(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'):
+      _ = kernel_layers.RandomFourierFeatures(
+          3, 'unsupported_kernel', stddev=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_scale(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'When provided, `scale` should be a positive float. Given: 0.0.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_input_shape(self):
+    inputs = random_ops.random_uniform((3, 2, 4), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'The rank of the input tensor should be 2. Got 3 instead.'):
+      _ = rff_layer.apply(inputs)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, False),
+      ('random', init_ops.random_uniform_initializer, 1.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_random_features_properties(self, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable)
+    self.assertEqual(rff_layer.output_dim, 10)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+  @parameterized.named_parameters(('gaussian', 'gaussian', False),
+                                  ('laplacian', 'laplacian', True),
+                                  ('other', init_ops.ones_initializer, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_call(self, initializer, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=1.0,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, 10], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def test_no_eager_Leak(self):
+    # Tests that repeatedly constructing and building a Layer does not leak
+    # Python objects.
+    inputs = random_ops.random_uniform((5, 4), seed=1)
+    kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
+    kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_output_shape(self):
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=7, name='random_fourier_features', trainable=True)
+    outputs = rff_layer(inputs)
+    self.assertEqual([3, 7], outputs.get_shape().as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
+      ('other', init_ops.random_uniform_initializer))
+  @test_util.run_deprecated_v1
+  def test_call_on_placeholder(self, initializer):
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5, name='random_fourier_features')
+    rff_layer(inputs)
+
+  @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0),
+                                  ('laplacian', 5, 'laplacian', None),
+                                  ('other', 10, init_ops.ones_initializer, 1.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_compute_output_shape(self, output_dim, initializer, scale):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim, initializer, scale=scale, name='rff')
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The innermost dimension of input shape must be defined.'):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))
+
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape((None, 3)).as_list())
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape(
+                         tensor_shape.TensorShape([None, 2])).as_list())
+    self.assertEqual([4, output_dim],
+                     rff_layer.compute_output_shape((4, 1)).as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, False),
+      ('laplacian', 5, 'laplacian', 5.5, True),
+      ('other', 7, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_get_config(self, output_dim, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim,
+        initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features',
+    )
+    expected_initializer = initializer
+    if isinstance(initializer, init_ops.Initializer):
+      expected_initializer = initializers.serialize(initializer)
+
+    expected_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': expected_initializer,
+        'scale': scale,
+        'name': 'random_fourier_features',
+        'trainable': trainable,
+        'dtype': None,
+    }
+    self.assertLen(expected_config, len(rff_layer.get_config()))
+    self.assertSameElements(
+        list(expected_config.items()), list(rff_layer.get_config().items()))
+
+  @parameterized.named_parameters(
+      ('gaussian', 5, 'gaussian', None, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 7, init_ops.ones_initializer(), 2.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_from_config(self, output_dim, initializer, scale, trainable):
+    model_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': initializer,
+        'scale': scale,
+        'trainable': trainable,
+        'name': 'random_fourier_features',
+    }
+    rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config)
+    self.assertEqual(rff_layer.output_dim, output_dim)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, output_dim], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
+    if trainable:
+      self.assertEqual('random_fourier_features/random_features_scale:0',
+                       rff_layer.trainable_variables[0].name)
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 10, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_same_random_features_params_reused(self, output_dim, initializer,
+                                              scale, trainable):
+    """Applying the layer on the same input twice gives the same output."""
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = constant_op.constant(
+        np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
+    output1 = rff_layer.apply(inputs)
+    output2 = rff_layer.apply(inputs)
+    self._assert_all_close(output1, output2)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0),
+      ('other', init_ops.random_uniform_initializer(), 5.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_different_params_similar_approximation(self, initializer, scale):
+    random_seed.set_random_seed(12345)
+    rff_layer1 = kernel_layers.RandomFourierFeatures(
+        output_dim=3000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff1')
+    rff_layer2 = kernel_layers.RandomFourierFeatures(
+        output_dim=2000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff2')
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    # Apply both layers to both inputs.
+    output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(x)
+    output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(y)
+    output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(x)
+    output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(y)
+
+    # Compute the inner products of the outputs (on inputs x and y) for both
+    # layers. For any fixed random features layer rff_layer, and inputs x, y,
+    # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
+    approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
+    approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
+    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
+      ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn):
+    """Approximation is bad when output dimension is small."""
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    small_output_dim = 10
+    random_seed.set_random_seed(1234)
+    # Initialize layer.
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=small_output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # Apply layer to both inputs.
+    output_x = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(y)
+
+    # The inner products of the outputs (on inputs x and y) approximates the
+    # real value of the RBF kernel but poorly since the output dimension of the
+    # layer is small.
+    exact_kernel_value = exact_kernel_fn(x, y)
+    approx_kernel_value = kernelized_utils.inner_product(output_x, output_y)
+    abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        abs_error_eval = sess.run([abs_error])
+        self.assertGreater(abs_error_eval[0][0], 0.05)
+        self.assertLess(abs_error_eval[0][0], 0.5)
+    else:
+      self.assertGreater(abs_error, 0.05)
+      self.assertLess(abs_error, 0.5)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, _exact_gaussian(stddev=10.0)),
+      ('laplacian', 'laplacian', 50.0, _exact_laplacian(stddev=50.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_good_kernel_approximation_multiple_inputs(self, initializer, scale,
+                                                     exact_kernel_fn):
+    # Parameters.
+    input_dim = 5
+    output_dim = 5000
+    x_rows = 20
+    y_rows = 30
+
+    random_seed.set_random_seed(1234)
+    x = random_ops.random_uniform(shape=(x_rows, input_dim), maxval=1.0)
+    y = random_ops.random_uniform(shape=(y_rows, input_dim), maxval=1.0)
+
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # The shapes of output_x and output_y are (x_rows, output_dim) and
+    # (y_rows, output_dim) respectively.
+    output_x = math.sqrt(2.0 / output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / output_dim) * rff_layer.apply(y)
+
+    approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y)
+    exact_kernel_matrix = exact_kernel_fn(x, y)
+    self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/lstm_v2_test.py
similarity index 90%
rename from tensorflow/python/keras/layers/unified_lstm_test.py
rename to tensorflow/python/keras/layers/lstm_v2_test.py
index 375894b166215ed7068767eed095fec2f60963ca..26f588e3d2ef583fed3201e40f6f54232d1e3a66 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_v2_test.py
@@ -35,6 +35,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -43,20 +45,19 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.util import nest
 
 
 # Global config for grappler setting that is used for graph mode test.
 _rewrites = rewriter_config_pb2.RewriterConfig()
-_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-_customer_optimizer = _rewrites.custom_optimizers.add()
-_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
 _rewrites.min_graph_nodes = -1
 _graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
 _config = config_pb2.ConfigProto(graph_options=_graph_options)
 
 
 @keras_parameterized.run_all_keras_modes(config=_config)
-class UnifiedLSTMTest(keras_parameterized.TestCase):
+class LSTMV2Test(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('non_tan_activation', 'relu', 'sigmoid', 0, False, True),
@@ -67,7 +68,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
   )
   def test_could_use_defun_backend(self, activation, recurrent_activation,
                                    recurrent_dropout, unroll, use_bias):
-    layer = keras.layers.UnifiedLSTM(
+    layer = rnn.LSTM(
         1,
         activation=activation,
         recurrent_activation=recurrent_activation,
@@ -86,7 +87,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     inputs = keras.layers.Dense(
         embedding_dim, input_shape=(timesteps, embedding_dim))
     model.add(inputs)
-    layer = keras.layers.UnifiedLSTM(units, return_sequences=True)
+    layer = rnn.LSTM(units, return_sequences=True)
     model.add(layer)
     outputs = model.layers[-1].output
     self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
@@ -96,7 +97,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer = keras.layers.UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    layer = rnn.LSTM(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
     model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
@@ -109,15 +110,15 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
-    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(10, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     for stateful in (False, True):
       l1 = layer_class(units=1, stateful=stateful)
       l2 = layer_class.from_config(l1.get_config())
@@ -133,7 +134,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     # Test with Keras tensor
     inputs = keras.Input((timesteps, embedding_dim))
     initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    layer = keras.layers.UnifiedLSTM(units)
+    layer = rnn.LSTM(units)
     if len(initial_state) == 1:
       output = layer(inputs, initial_state=initial_state[0])
     else:
@@ -165,7 +166,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
         keras.backend.random_normal_variable((num_samples, units), 0, 1)
         for _ in range(num_states)
     ]
-    layer = keras.layers.UnifiedLSTM(units)
+    layer = rnn.LSTM(units)
     output = layer(inputs, initial_state=initial_state)
 
     model = keras.models.Model(inputs, output)
@@ -184,8 +185,9 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     units = 3
     num_samples = 2
 
-    layer = keras.layers.UnifiedLSTM(units, stateful=True)
+    layer = rnn.LSTM(units, stateful=True)
     layer.build((num_samples, timesteps, embedding_dim))
+    initial_weight_count = len(layer.weights)
     layer.reset_states()
     assert len(layer.states) == num_states
     assert layer.states[0] is not None
@@ -207,6 +209,12 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     with self.assertRaises(ValueError):
       layer.reset_states([1] * (len(layer.states) + 1))
 
+    self.assertEqual(initial_weight_count, len(layer.weights))
+    # Variables in "states" shouldn't show up in .weights
+    layer.states = nest.map_structure(variables.Variable, values)
+    layer.reset_states()
+    self.assertEqual(initial_weight_count, len(layer.weights))
+
   def test_specify_state_with_masking(self):
     num_states = 2
     timesteps = 3
@@ -217,7 +225,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     inputs = keras.Input((timesteps, embedding_dim))
     _ = keras.layers.Masking()(inputs)
     initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    output = keras.layers.UnifiedLSTM(units)(
+    output = rnn.LSTM(units)(
         inputs, initial_state=initial_state)
 
     model = keras.models.Model([inputs] + initial_state, output)
@@ -240,8 +248,9 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     num_samples = 2
 
     inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-    layer = keras.layers.UnifiedLSTM(units, return_state=True, stateful=True)
-    outputs = layer(inputs)
+    masked = keras.layers.Masking()(inputs)
+    layer = rnn.LSTM(units, return_state=True, stateful=True)
+    outputs = layer(masked)
     state = outputs[1:]
     assert len(state) == num_states
     model = keras.models.Model(inputs, state[0])
@@ -257,11 +266,11 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     num_samples = 2
 
     inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-    layer = keras.layers.UnifiedLSTM(
+    layer = rnn.LSTM(
         units, return_state=True, return_sequences=True)
     outputs = layer(inputs)
     output, state = outputs[0], outputs[1:]
-    output = keras.layers.UnifiedLSTM(units)(output, initial_state=state)
+    output = rnn.LSTM(units)(output, initial_state=state)
     model = keras.models.Model(inputs, output)
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
@@ -273,7 +282,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     units = 3
     num_samples = 2
     num_states = 2
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
 
     # Test with Keras tensor
     main_inputs = keras.Input((timesteps, embedding_dim))
@@ -313,8 +322,8 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
 
       inputs = keras.layers.Input(
           shape=[timestep, input_shape], dtype=dtypes.float32)
-      lstm_layer = keras.layers.LSTM(rnn_state_size,
-                                     recurrent_activation='sigmoid')
+      lstm_layer = rnn_v1.LSTM(rnn_state_size,
+                               recurrent_activation='sigmoid')
       output = lstm_layer(inputs)
       lstm_model = keras.models.Model(inputs, output)
       weights = lstm_model.get_weights()
@@ -324,7 +333,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
       y_2 = lstm_model.predict(x_train)
 
       with test_util.device(use_gpu=True):
-        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size)
+        cudnn_layer = rnn.LSTM(rnn_state_size)
         cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
       cudnn_model.set_weights(weights)
       y_3 = cudnn_model.predict(x_train)
@@ -332,24 +341,24 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
       cudnn_model.fit(x_train, y_train)
       y_4 = cudnn_model.predict(x_train)
 
-      self.assertAllClose(y_1, y_3, rtol=1e-5, atol=1e-5)
-      self.assertAllClose(y_2, y_4, rtol=1e-5, atol=1e-5)
+      self.assertAllClose(y_1, y_3, rtol=1e-5, atol=2e-5)
+      self.assertAllClose(y_2, y_4, rtol=1e-5, atol=2e-5)
 
   @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2))
-  def test_implementation_mode_LSTM(self, implementation_mode):
+  def DISABLED_test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
+        rnn.LSTM,
         kwargs={
             'units': units,
             'implementation': implementation_mode
         },
         input_shape=(num_samples, timesteps, embedding_dim))
 
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     k_constraint = keras.constraints.max_norm(0.01)
     r_constraint = keras.constraints.max_norm(0.01)
     b_constraint = keras.constraints.max_norm(0.01)
@@ -366,7 +375,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
@@ -384,8 +393,8 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(10, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
@@ -424,11 +433,11 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
         outputs = layer(inputs)
       return keras.models.Model(inputs, outputs)
 
-    lstm_model = build_model(keras.layers.LSTM)
+    lstm_model = build_model(rnn_v1.LSTM)
     y_ref = lstm_model.predict(x_train)
     weights = lstm_model.get_weights()
 
-    unified_lstm_model = build_model(keras.layers.UnifiedLSTM)
+    unified_lstm_model = build_model(rnn.LSTM)
     unified_lstm_model.set_weights(weights)
     y = unified_lstm_model.predict(x_train)
 
@@ -448,7 +457,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
         num_classes=output_shape)
     y_train = keras.utils.to_categorical(y_train, output_shape)
 
-    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+    layer = rnn.LSTM(rnn_state_size)
 
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
@@ -481,7 +490,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     def build_model():
       inputs = keras.layers.Input(
           shape=[timestep, input_dim], dtype=dtypes.float32)
-      layer = keras.layers.UnifiedLSTM(
+      layer = rnn.LSTM(
           units,
           use_bias=use_bias,
           bias_initializer=bias_initializer)
@@ -510,14 +519,14 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
     with test_util.device(use_gpu=False):
-      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      layer = rnn.LSTM(rnn_state_size)
       output = layer(inputs)
       cpu_model = keras.models.Model(inputs, output)
       weights = cpu_model.get_weights()
     y_1 = cpu_model.predict(x_train)
 
     with test_util.device(use_gpu=True):
-      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      layer = rnn.LSTM(rnn_state_size)
       output = layer(inputs)
       gpu_model = keras.models.Model(inputs, output)
       gpu_model.set_weights(weights)
@@ -527,7 +536,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     # 'sigmoid' as default. Construct the canonical LSTM with sigmoid to achieve
     # the same output.
     with test_util.device(use_gpu=True):
-      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      layer = rnn_v1.LSTM(rnn_state_size, recurrent_activation='sigmoid')
       output = layer(inputs)
       canonical_model = keras.models.Model(inputs, output)
       # Remove the extra cudnn bias since canonical lstm will not use it.
@@ -537,13 +546,13 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     self.assertAllClose(y_1, y_2)
     self.assertAllClose(y_2, y_3)
 
-  def test_return_sequences_LSTM(self):
+  def DISABLED_test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
+        rnn.LSTM,
         kwargs={
             'units': units,
             'return_sequences': True
@@ -552,7 +561,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
 
   def test_regularizers_LSTM(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     layer = layer_class(
         5,
         return_sequences=False,
@@ -576,7 +585,7 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     model = keras.models.Sequential()
     model.add(
         keras.layers.Embedding(
@@ -633,6 +642,41 @@ class UnifiedLSTMTest(keras_parameterized.TestCase):
 
     self.assertAllClose(out7, out6, atol=1e-5)
 
+  def test_stateful_LSTM_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        rnn.LSTM(units, return_sequences=True, stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
+
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        rnn.LSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
 
 class LSTMLayerGraphOnlyTest(test.TestCase):
 
@@ -654,7 +698,7 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+      layer = rnn.LSTM(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -703,7 +747,7 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+      layer = rnn.LSTM(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -742,24 +786,6 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
         existing_loss = loss_value
 
 
-class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_dropout_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
-        kwargs={
-            'units': units,
-            'dropout': 0.1,
-            'recurrent_dropout': 0.1
-        },
-        input_shape=(num_samples, timesteps, embedding_dim))
-
-
 class UnifiedLSTMPerformanceTest(test.Benchmark):
 
   def _measure_performance(self, test_config, model, x_train, y_train):
@@ -801,7 +827,7 @@ class UnifiedLSTMPerformanceTest(test.Benchmark):
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+    layer = rnn.LSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
@@ -822,7 +848,7 @@ class UnifiedLSTMPerformanceTest(test.Benchmark):
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    layer = keras.layers.LSTM(rnn_state_size)
+    layer = rnn_v1.LSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index f962a75b32421860296476607a5dacdaaf5468cd..7432ad4af886f2cbe20574d2c264d81681b210a6 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.keras import backend as K
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
@@ -35,7 +36,8 @@ class MergeLayersTest(keras_parameterized.TestCase):
     i2 = keras.layers.Input(shape=(4, 5))
     i3 = keras.layers.Input(shape=(4, 5))
 
-    o = keras.layers.add([i1, i2, i3])
+    add_layer = keras.layers.Add()
+    o = add_layer([i1, i2, i3])
     self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
     model.run_eagerly = testing_utils.should_run_eagerly()
@@ -47,6 +49,56 @@ class MergeLayersTest(keras_parameterized.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
+    self.assertEqual(
+        add_layer.compute_mask([i1, i2, i3], [None, None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                add_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      add_layer.compute_mask([i1, i2, i3], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      add_layer.compute_mask(i1, [None, None, None])
+    with self.assertRaisesRegexp(ValueError, " should have the same length."):
+      add_layer.compute_mask([i1, i2, i3], [None, None])
+
+  def test_merge_subtract(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    i3 = keras.layers.Input(shape=(4, 5))
+
+    subtract_layer = keras.layers.Subtract()
+    o = subtract_layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
+
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, x1 - x2, atol=1e-4)
+
+    self.assertEqual(subtract_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                subtract_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      subtract_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      subtract_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1, i2, i3])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1])
+
   def test_merge_multiply(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -108,7 +160,8 @@ class MergeLayersTest(keras_parameterized.TestCase):
   def test_merge_concatenate(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
-    o = keras.layers.concatenate([i1, i2], axis=1)
+    concat_layer = keras.layers.Concatenate(axis=1)
+    o = concat_layer([i1, i2])
     self.assertListEqual(o.shape.as_list(), [None, 8, 5])
     model = keras.models.Model([i1, i2], o)
     model.run_eagerly = testing_utils.should_run_eagerly()
@@ -119,6 +172,23 @@ class MergeLayersTest(keras_parameterized.TestCase):
     self.assertEqual(out.shape, (2, 8, 5))
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
+    self.assertEqual(concat_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                concat_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      concat_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      concat_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError, "should have the same length"):
+      concat_layer.compute_mask([i1, i2], [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on a list of inputs"):
+      concat_layer(i1)
+
   def test_merge_dot(self):
     i1 = keras.layers.Input(shape=(4,))
     i2 = keras.layers.Input(shape=(4,))
diff --git a/tensorflow/python/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py
index 958ab7c0f616a94bd7b35b0575ac8bee91fa037b..f230d23c15a828563f3183490569682e1c89fc9f 100644
--- a/tensorflow/python/keras/layers/noise.py
+++ b/tensorflow/python/keras/layers/noise.py
@@ -40,15 +40,20 @@ class GaussianNoise(Layer):
   As it is a regularization layer, it is only active at training time.
 
   Arguments:
-      stddev: float, standard deviation of the noise distribution.
+    stddev: Float, standard deviation of the noise distribution.
+
+  Call arguments:
+    inputs: Input tensor (of any rank).
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding noise) or in inference mode (doing nothing).
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
+    Same shape as input.
   """
 
   def __init__(self, stddev, **kwargs):
@@ -81,18 +86,22 @@ class GaussianDropout(Layer):
   As it is a regularization layer, it is only active at training time.
 
   Arguments:
-      rate: float, drop probability (as with `Dropout`).
-          The multiplicative noise will have
-          standard deviation `sqrt(rate / (1 - rate))`.
+    rate: Float, drop probability (as with `Dropout`).
+      The multiplicative noise will have
+      standard deviation `sqrt(rate / (1 - rate))`.
+
+  Call arguments:
+    inputs: Input tensor (of any rank).
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
-
+    Same shape as input.
   """
 
   def __init__(self, rate, **kwargs):
@@ -132,19 +141,23 @@ class AlphaDropout(Layer):
   by randomly setting activations to the negative saturation value.
 
   Arguments:
-      rate: float, drop probability (as with `Dropout`).
-          The multiplicative noise will have
-          standard deviation `sqrt(rate / (1 - rate))`.
-      seed: A Python integer to use as random seed.
+    rate: float, drop probability (as with `Dropout`).
+      The multiplicative noise will have
+      standard deviation `sqrt(rate / (1 - rate))`.
+    seed: A Python integer to use as random seed.
+
+  Call arguments:
+    inputs: Input tensor (of any rank).
+    training: Python boolean indicating whether the layer should behave in
+      training mode (adding dropout) or in inference mode (doing nothing).
 
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
-
+    Same shape as input.
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 171e1e231e13d2e571820c5582d66e78ff385269..f7ce5e654e4feb45941e74e754b557b82085abfb 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -39,12 +38,10 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import keras_export
-from tensorflow.python.util.tf_export import tf_export
 
 
-@keras_export('keras.layers.BatchNormalization', v1=[])
-class BatchNormalizationV2(Layer):
-  """Batch normalization layer (Ioffe and Szegedy, 2014).
+class BatchNormalizationBase(Layer):
+  """Base class of Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
   i.e. applies a transformation that maintains the mean activation
@@ -52,19 +49,19 @@ class BatchNormalizationV2(Layer):
 
   Arguments:
     axis: Integer, the axis that should be normalized
-        (typically the features axis).
-        For instance, after a `Conv2D` layer with
-        `data_format="channels_first"`,
-        set `axis=1` in `BatchNormalization`.
+      (typically the features axis).
+      For instance, after a `Conv2D` layer with
+      `data_format="channels_first"`,
+      set `axis=1` in `BatchNormalization`.
     momentum: Momentum for the moving average.
     epsilon: Small float added to variance to avoid dividing by zero.
     center: If True, add offset of `beta` to normalized tensor.
-        If False, `beta` is ignored.
+      If False, `beta` is ignored.
     scale: If True, multiply by `gamma`.
-        If False, `gamma` is not used.
-        When the next layer is linear (also e.g. `nn.relu`),
-        this can be disabled since the scaling
-        will be done by the next layer.
+      If False, `gamma` is not used.
+      When the next layer is linear (also e.g. `nn.relu`),
+      this can be disabled since the scaling
+      will be done by the next layer.
     beta_initializer: Initializer for the beta weight.
     gamma_initializer: Initializer for the gamma weight.
     moving_mean_initializer: Initializer for the moving mean.
@@ -90,8 +87,7 @@ class BatchNormalizationV2(Layer):
       if the fused implementation cannot be used. If `None`, use the faster
       implementation if possible. If False, do not used the fused
       implementation.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    trainable: Boolean, if `True` the variables will be marked as trainable.
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
       which means batch normalization is performed across the whole batch. When
       `virtual_batch_size` is not `None`, instead perform "Ghost Batch
@@ -111,20 +107,30 @@ class BatchNormalizationV2(Layer):
       `None`, no adjustment is applied. Cannot be specified if
       virtual_batch_size is specified.
 
+  Call arguments:
+    inputs: Input tensor (of any rank).
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode.
+      - `training=True`: The layer will normalize its inputs using the
+        mean and variance of the current batch of inputs.
+      - `training=False`: The layer will normalize its inputs using the
+        mean and variance of its moving statistics, learned during training.
+
   Input shape:
-      Arbitrary. Use the keyword argument `input_shape`
-      (tuple of integers, does not include the samples axis)
-      when using this layer as the first layer in a model.
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
 
   Output shape:
-      Same shape as input.
+    Same shape as input.
 
   References:
-      - [Batch Normalization: Accelerating Deep Network Training by Reducing
-        Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
+    - [Batch Normalization: Accelerating Deep Network Training by Reducing
+      Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
   """
 
-  # The BatchNormalizationV1 subclass sets this to False to use the V1 behavior.
+  # By default, the base class uses V2 behavior. The BatchNormalization V1
+  # subclass sets this to False to use the V1 behavior.
   _USE_V2_BEHAVIOR = True
 
   def __init__(self,
@@ -150,7 +156,7 @@ class BatchNormalizationV2(Layer):
                adjustment=None,
                name=None,
                **kwargs):
-    super(BatchNormalizationV2, self).__init__(
+    super(BatchNormalizationBase, self).__init__(
         name=name, trainable=trainable, **kwargs)
     if isinstance(axis, list):
       self.axis = axis[:]
@@ -230,6 +236,14 @@ class BatchNormalizationV2(Layer):
     except ValueError:
       return False
 
+  @property
+  def _param_dtype(self):
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      return dtypes.float32
+    else:
+      return self.dtype or dtypes.float32
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if not input_shape.ndims:
@@ -292,12 +306,6 @@ class BatchNormalizationV2(Layer):
         raise ValueError('Unsupported axis, fused batch norm only supports '
                          'axis == [1] or axis == [3]')
 
-    # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
-      param_dtype = dtypes.float32
-    else:
-      param_dtype = self.dtype or dtypes.float32
-
     axis_to_dim = {x: input_shape.dims[x].value for x in self.axis}
     for x in axis_to_dim:
       if axis_to_dim[x] is None:
@@ -322,31 +330,33 @@ class BatchNormalizationV2(Layer):
       self.gamma = self.add_weight(
           name='gamma',
           shape=param_shape,
-          dtype=param_dtype,
+          dtype=self._param_dtype,
           initializer=self.gamma_initializer,
           regularizer=self.gamma_regularizer,
           constraint=self.gamma_constraint,
-          trainable=True)
+          trainable=True,
+          experimental_autocast=False)
     else:
       self.gamma = None
       if self.fused:
-        self._gamma_const = array_ops.constant(
-            1.0, dtype=param_dtype, shape=param_shape)
+        self._gamma_const = K.constant(
+            1.0, dtype=self._param_dtype, shape=param_shape)
 
     if self.center:
       self.beta = self.add_weight(
           name='beta',
           shape=param_shape,
-          dtype=param_dtype,
+          dtype=self._param_dtype,
           initializer=self.beta_initializer,
           regularizer=self.beta_regularizer,
           constraint=self.beta_constraint,
-          trainable=True)
+          trainable=True,
+          experimental_autocast=False)
     else:
       self.beta = None
       if self.fused:
-        self._beta_const = array_ops.constant(
-            0.0, dtype=param_dtype, shape=param_shape)
+        self._beta_const = K.constant(
+            0.0, dtype=self._param_dtype, shape=param_shape)
 
     try:
       # Disable variable partitioning when creating the moving mean and variance
@@ -358,20 +368,22 @@ class BatchNormalizationV2(Layer):
       self.moving_mean = self.add_weight(
           name='moving_mean',
           shape=param_shape,
-          dtype=param_dtype,
+          dtype=self._param_dtype,
           initializer=self.moving_mean_initializer,
           synchronization=tf_variables.VariableSynchronization.ON_READ,
           trainable=False,
-          aggregation=tf_variables.VariableAggregation.MEAN)
+          aggregation=tf_variables.VariableAggregation.MEAN,
+          experimental_autocast=False)
 
       self.moving_variance = self.add_weight(
           name='moving_variance',
           shape=param_shape,
-          dtype=param_dtype,
+          dtype=self._param_dtype,
           initializer=self.moving_variance_initializer,
           synchronization=tf_variables.VariableSynchronization.ON_READ,
           trainable=False,
-          aggregation=tf_variables.VariableAggregation.MEAN)
+          aggregation=tf_variables.VariableAggregation.MEAN,
+          experimental_autocast=False)
 
       if self.renorm:
         # Create variables to maintain the moving mean and standard deviation.
@@ -382,14 +394,16 @@ class BatchNormalizationV2(Layer):
         # stack to be cleared. The nested ones use a `lambda` to set the desired
         # device and ignore any devices that may be set by the custom getter.
         def _renorm_variable(name, shape):
+          """Create a renorm variable."""
           var = self.add_weight(
               name=name,
               shape=shape,
-              dtype=param_dtype,
+              dtype=self._param_dtype,
               initializer=init_ops.zeros_initializer(),
               synchronization=tf_variables.VariableSynchronization.ON_READ,
               trainable=False,
-              aggregation=tf_variables.VariableAggregation.MEAN)
+              aggregation=tf_variables.VariableAggregation.MEAN,
+              experimental_autocast=False)
           return var
 
         with distribution_strategy_context.get_strategy(
@@ -623,7 +637,9 @@ class BatchNormalizationV2(Layer):
       # but not a constant. However, this makes the code simpler.
       keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
       mean, variance = self._moments(
-          inputs, reduction_axes, keep_dims=keep_dims)
+          math_ops.cast(inputs, self._param_dtype),
+          reduction_axes,
+          keep_dims=keep_dims)
 
       moving_mean = self.moving_mean
       moving_variance = self.moving_variance
@@ -702,6 +718,10 @@ class BatchNormalizationV2(Layer):
     variance = math_ops.cast(variance, inputs.dtype)
     if offset is not None:
       offset = math_ops.cast(offset, inputs.dtype)
+    if scale is not None:
+      scale = math_ops.cast(scale, inputs.dtype)
+    # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
+    # math in float16 hurts validation accuracy of popular models like resnet.
     outputs = nn.batch_normalization(inputs,
                                      _broadcast(mean),
                                      _broadcast(variance),
@@ -750,22 +770,22 @@ class BatchNormalizationV2(Layer):
                       'layer cannot be serialized and has been omitted from '
                       'the layer config. It will not be included when '
                       're-creating the layer from the saved config.')
-    base_config = super(BatchNormalizationV2, self).get_config()
+    base_config = super(BatchNormalizationBase, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
-def _replace_in_v2_docstring(old, new):
-  string = BatchNormalizationV2.__doc__
+def _replace_in_base_docstring(old, new):
+  string = BatchNormalizationBase.__doc__
   if old not in string:
-    raise ValueError('Could not find following string in BatchNormalizationV2 '
-                     'docstring: "{}"'.format(old))
+    raise ValueError('Could not find following string in BatchNormalizationBase'
+                     ' docstring: "{}"'.format(old))
   return string.replace(old, new)
 
 
 @keras_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
-class BatchNormalizationV1(BatchNormalizationV2):
+class BatchNormalization(BatchNormalizationBase):
 
-  __doc__ = _replace_in_v2_docstring(
+  __doc__ = _replace_in_base_docstring(
       '''
     fused: if `True`, use a faster, fused implementation, or raise a ValueError
       if the fused implementation cannot be used. If `None`, use the faster
@@ -779,22 +799,224 @@ class BatchNormalizationV1(BatchNormalizationV2):
   _USE_V2_BEHAVIOR = False
 
 
-BatchNormalization = None  # pylint: disable=invalid-name
+@keras_export('keras.layers.experimental.LayerNormalization')
+class LayerNormalization(Layer):
+  """Layer normalization layer (Ba et al., 2016).
+
+  Normalize the activations of the previous layer for each given example in a
+  batch independently, rather than across a batch like Batch Normalization.
+  i.e. applies a transformation that maintains the mean activation within each
+  example close to 0 and the activation standard deviation close to 1.
+
+  Given a tensor `inputs` of rank `R`, moments are calculated and normalization
+  is performed over all axes in norm_axis.  Scaling and centering,
+  if requested, is performed over all axes in params_axis.
+
+  By default, normalization is performed over all but the first axis
+  (the `HWC` if `inputs` is `NHWC`), while the `beta` and `gamma` trainable
+  parameters are calculated for the rightmost axis (the `C` if `inputs` is
+  `NHWC`).  Scaling and recentering is performed via broadcast of the
+  `beta` and `gamma` parameters with the normalized tensor.
+
+  The shapes of `beta` and `gamma` are
+  `[inputs.shape[i] for i in (param axes)]`,
+  and this part of the inputs' shape must be fully defined.
+
+  Arguments:
+    norm_axis: Integer or List. normalization will be
+      performed along these dimensions. If unspecified (None), it will default
+      to the dimensions `begin_norm_axis : rank(inputs)`
+    params_axis: Integer or List. The (beta, gamma) dimensions: scale
+      and centering parameters will have take their shapes from these axes and
+      will be broadcast with the normalized inputs accordingly. If unspecified
+      (None), it will default to the last dimension
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+      If False, `gamma` is not used.
+      When the next layer is linear (also e.g. `nn.relu`),
+      this can be disabled since the scaling
+      will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    trainable: Boolean, if `True` the variables will be marked as trainable.
+
+  Input shape:
+    Arbitrary. Use the keyword argument `input_shape`
+    (tuple of integers, does not include the samples axis)
+    when using this layer as the first layer in a model.
+
+  Output shape:
+    Same shape as input.
+
+  References:
+    - [Layer Normalization](https://arxiv.org/abs/1607.06450)
+  """
+
+  def __init__(self,
+               norm_axis=None,
+               params_axis=-1,
+               epsilon=1e-12,
+               center=True,
+               scale=True,
+               beta_initializer='zeros',
+               gamma_initializer='ones',
+               beta_regularizer=None,
+               gamma_regularizer=None,
+               beta_constraint=None,
+               gamma_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(LayerNormalization, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(norm_axis, list):
+      self.norm_axis = norm_axis[:]
+    elif isinstance(norm_axis, int):
+      self.norm_axis = norm_axis
+    elif norm_axis is None:
+      self.norm_axis = None
+    else:
+      raise TypeError('norm_axis must be int or list or None, type given: %s'
+                      % type(norm_axis))
+
+    if isinstance(params_axis, list):
+      self.params_axis = params_axis[:]
+    elif isinstance(params_axis, int):
+      self.params_axis = params_axis
+    else:
+      raise TypeError('params_axis must be int or list, type given: %s'
+                      % type(params_axis))
+
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    ndims = len(input_shape)
+    if ndims is None:
+      raise ValueError('Input shape %s has undefined rank.' % input_shape)
+
+    # Handle an unspecified norm_axis
+    if self.norm_axis is None:
+      self.norm_axis = list(range(1, ndims))
+
+    # Convert axes to lists and resolve negatives
+    if isinstance(self.norm_axis, int):
+      self.norm_axis = [self.norm_axis]
+    for idx, x in enumerate(self.norm_axis):
+      if x < 0:
+        self.norm_axis[idx] = ndims + x
+
+    if isinstance(self.params_axis, int):
+      self.params_axis = [self.params_axis]
+    for idx, x in enumerate(self.params_axis):
+      if x < 0:
+        self.params_axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.norm_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.norm_axis) != len(set(self.norm_axis)):
+      raise ValueError('Duplicate axis: %s' % self.norm_axis)
+
+    for x in self.params_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.params_axis) != len(set(self.params_axis)):
+      raise ValueError('Duplicate axis: %s' % self.params_axis)
+
+    param_shape = [input_shape[dim] for dim in self.params_axis]
+
+    if self.scale:
+      self.gamma = self.add_weight(
+          name='gamma',
+          shape=param_shape,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True,
+          experimental_autocast=False)
+    else:
+      self.gamma = None
+
+    if self.center:
+      self.beta = self.add_weight(
+          name='beta',
+          shape=param_shape,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True,
+          experimental_autocast=False)
+    else:
+      self.beta = None
+
+  def call(self, inputs):
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+
+    # Calculate the moments on the last axis (layer activations).
+    mean, variance = nn.moments(inputs, self.norm_axis, keep_dims=True)
 
+    # Broadcasting only necessary for norm where the params axes aren't just
+    # the last dimension
+    broadcast_shape = [1] * ndims
+    for dim in self.params_axis:
+      broadcast_shape[dim] = input_shape.dims[dim].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          self.params_axis != [ndims - 1]):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
 
-@tf_export(v1=['enable_v2_batch_normalization'])
-def enable_v2_batch_normalization():
-  global BatchNormalization  # pylint: disable=invalid-name
-  BatchNormalization = BatchNormalizationV2
+    # Compute layer normalization using the batch_normalization function.
+    outputs = nn.batch_normalization(
+        inputs,
+        mean,
+        variance,
+        offset=offset,
+        scale=scale,
+        variance_epsilon=self.epsilon)
 
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
 
-@tf_export(v1=['disable_v2_batch_normalization'])
-def disable_v2_batch_normalization():
-  global BatchNormalization  # pylint: disable=invalid-name
-  BatchNormalization = BatchNormalizationV1
+    return outputs
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
-if tf2.enabled():
-  enable_v2_batch_normalization()
-else:
-  disable_v2_batch_normalization()
+  def get_config(self):
+    config = {
+        'norm_axis': self.norm_axis,
+        'params_axis': self.params_axis,
+        'epsilon': self.epsilon,
+        'center': self.center,
+        'scale': self.scale,
+        'beta_initializer': initializers.serialize(self.beta_initializer),
+        'gamma_initializer': initializers.serialize(self.gamma_initializer),
+        'beta_regularizer': regularizers.serialize(self.beta_regularizer),
+        'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
+        'beta_constraint': constraints.serialize(self.beta_constraint),
+        'gamma_constraint': constraints.serialize(self.gamma_constraint)
+    }
+    base_config = super(LayerNormalization, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index f81ddcecb42662c8cfa481808919c4382771467b..0a422e39f2e58cd622ade0bc0ff19cbaa7e0f186 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -18,13 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.layers import normalization
+from tensorflow.python.keras.layers import normalization_v2
+from tensorflow.python.keras.mixed_precision.experimental import policy
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
@@ -129,36 +133,45 @@ class BatchNormalizationTest(keras_parameterized.TestCase):
     _run_batchnorm_correctness_test(
         normalization.BatchNormalization, dtype='float32')
     _run_batchnorm_correctness_test(
-        normalization.BatchNormalization, dtype='float32', fused=True)
-    _run_batchnorm_correctness_test(
-        normalization.BatchNormalization, dtype='float32', fused=False)
+        normalization_v2.BatchNormalization, dtype='float32')
 
   @keras_parameterized.run_all_keras_modes
   def test_batchnorm_mixed_precision(self):
     _run_batchnorm_correctness_test(
         normalization.BatchNormalization, dtype='float16')
     _run_batchnorm_correctness_test(
-        normalization.BatchNormalization, dtype='float16', fused=True)
-    _run_batchnorm_correctness_test(
-        normalization.BatchNormalization, dtype='float16', fused=False)
+        normalization_v2.BatchNormalization, dtype='float16')
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_batchnorm_policy(self):
+    norm = keras.layers.BatchNormalization(
+        axis=-1,
+        input_shape=(4, 4, 3),
+        momentum=0.8,
+        dtype=policy.Policy('infer_float32_vars'))
+    x = np.random.normal(size=(10, 4, 4, 3)).astype('float16')
+    y = norm(x)
+    self.assertEqual(y.dtype, 'float16')
+    self.assertEqual(norm.beta.dtype.base_dtype, 'float32')
+    self.assertEqual(norm.gamma.dtype.base_dtype, 'float32')
 
 
 class BatchNormalizationV1Test(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_v1_fused_attribute(self):
-    norm = normalization.BatchNormalizationV1()
+    norm = normalization.BatchNormalization()
     inp = keras.layers.Input((4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, True)
 
-    norm = normalization.BatchNormalizationV1(fused=False)
+    norm = normalization.BatchNormalization(fused=False)
     self.assertEqual(norm.fused, False)
     inp = keras.layers.Input(shape=(4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, False)
 
-    norm = normalization.BatchNormalizationV1(virtual_batch_size=2)
+    norm = normalization.BatchNormalization(virtual_batch_size=2)
     self.assertEqual(norm.fused, True)
     inp = keras.layers.Input(shape=(2, 2, 2))
     norm(inp)
@@ -170,63 +183,63 @@ class BatchNormalizationV2Test(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_basic_batchnorm_v2(self):
     testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
+        normalization_v2.BatchNormalization,
         kwargs={'fused': True},
         input_shape=(3, 3, 3, 3))
     testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
+        normalization_v2.BatchNormalization,
         kwargs={'fused': None},
         input_shape=(3, 3, 3))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_v2_fused_attribute(self):
-    norm = normalization.BatchNormalizationV2()
+    norm = normalization_v2.BatchNormalization()
     self.assertEqual(norm.fused, None)
     inp = keras.layers.Input(shape=(4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, True)
 
-    norm = normalization.BatchNormalizationV2()
+    norm = normalization_v2.BatchNormalization()
     self.assertEqual(norm.fused, None)
     inp = keras.layers.Input(shape=(4, 4))
     norm(inp)
     self.assertEqual(norm.fused, False)
 
-    norm = normalization.BatchNormalizationV2(virtual_batch_size=2)
+    norm = normalization_v2.BatchNormalization(virtual_batch_size=2)
     self.assertEqual(norm.fused, False)
     inp = keras.layers.Input(shape=(4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, False)
 
-    norm = normalization.BatchNormalizationV2(fused=False)
+    norm = normalization_v2.BatchNormalization(fused=False)
     self.assertEqual(norm.fused, False)
     inp = keras.layers.Input(shape=(4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, False)
 
-    norm = normalization.BatchNormalizationV2(fused=True, axis=[3])
+    norm = normalization_v2.BatchNormalization(fused=True, axis=[3])
     self.assertEqual(norm.fused, True)
     inp = keras.layers.Input(shape=(4, 4, 4))
     norm(inp)
     self.assertEqual(norm.fused, True)
 
     with self.assertRaisesRegexp(ValueError, 'fused.*renorm'):
-      normalization.BatchNormalizationV2(fused=True, renorm=True)
+      normalization_v2.BatchNormalization(fused=True, renorm=True)
 
     with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
-      normalization.BatchNormalizationV2(fused=True, axis=2)
+      normalization_v2.BatchNormalization(fused=True, axis=2)
 
     with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
-      normalization.BatchNormalizationV2(fused=True, axis=[1, 3])
+      normalization_v2.BatchNormalization(fused=True, axis=[1, 3])
 
     with self.assertRaisesRegexp(ValueError, 'fused.*virtual_batch_size'):
-      normalization.BatchNormalizationV2(fused=True, virtual_batch_size=2)
+      normalization_v2.BatchNormalization(fused=True, virtual_batch_size=2)
 
     with self.assertRaisesRegexp(ValueError, 'fused.*adjustment'):
-      normalization.BatchNormalizationV2(fused=True,
-                                         adjustment=lambda _: (1, 0))
+      normalization_v2.BatchNormalization(fused=True,
+                                          adjustment=lambda _: (1, 0))
 
-    norm = normalization.BatchNormalizationV2(fused=True)
+    norm = normalization_v2.BatchNormalization(fused=True)
     self.assertEqual(norm.fused, True)
     inp = keras.layers.Input(shape=(4, 4))
     with self.assertRaisesRegexp(ValueError, '4D input tensors'):
@@ -235,8 +248,12 @@ class BatchNormalizationV2Test(keras_parameterized.TestCase):
 
 def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False):
   model = keras.models.Sequential()
-  norm = layer(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
+  model.add(keras.Input(shape=(2, 2, 2), dtype=dtype))
+  norm = layer(momentum=0.8, fused=fused)
   model.add(norm)
+  if dtype == 'float16':
+    # Keras models require float32 losses.
+    model.add(keras.layers.Lambda(lambda x: keras.backend.cast(x, 'float32')))
   model.compile(loss='mse',
                 optimizer=gradient_descent.GradientDescentOptimizer(0.01),
                 run_eagerly=testing_utils.should_run_eagerly())
@@ -253,14 +270,16 @@ def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False):
   np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
-class NormalizationLayersGraphModeOnlyTest(test.TestCase):
+@parameterized.parameters(
+    [normalization.BatchNormalization, normalization_v2.BatchNormalization])
+class NormalizationLayersGraphModeOnlyTest(
+    test.TestCase, parameterized.TestCase):
 
-  def test_shared_batchnorm(self):
-    """Test that a BN layer can be shared across different data streams.
-    """
+  def test_shared_batchnorm(self, layer):
+    """Test that a BN layer can be shared across different data streams."""
     with self.cached_session():
       # Test single layer reuse
-      bn = keras.layers.BatchNormalization()
+      bn = layer()
       x1 = keras.layers.Input(shape=(10,))
       _ = bn(x1)
 
@@ -275,7 +294,6 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
       self.assertEqual(len(bn.updates), 4)
       self.assertEqual(len(model.updates), 2)
-      self.assertEqual(len(model.get_updates_for(x1)), 0)
       self.assertEqual(len(model.get_updates_for(x2)), 2)
 
       # Test model-level reuse
@@ -289,13 +307,13 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
       new_model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       new_model.train_on_batch(x, x)
 
-  def test_that_trainable_disables_updates(self):
+  def test_that_trainable_disables_updates(self, layer):
     with self.cached_session():
       val_a = np.random.random((10, 4))
       val_out = np.random.random((10, 4))
 
       a = keras.layers.Input(shape=(4,))
-      layer = keras.layers.BatchNormalization(input_shape=(4,))
+      layer = layer(input_shape=(4,))
       b = layer(a)
       model = keras.models.Model(a, b)
 
@@ -328,11 +346,14 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
       self.assertAllClose(x1, x2, atol=1e-7)
 
   @tf_test_util.run_deprecated_v1
-  def test_batchnorm_trainable(self):
+  def test_batchnorm_trainable(self, layer):
     """Tests that batchnorm layer is trainable when learning phase is enabled.
 
     Computes mean and std for current inputs then
     applies batch normalization using them.
+
+    Args:
+      layer: Either V1 or V2 of BatchNormalization layer.
     """
     # TODO(fchollet): enable in all execution modes when issue with
     # learning phase setting is resolved.
@@ -343,7 +364,7 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
       def get_model(bn_mean, bn_std):
         inp = keras.layers.Input(shape=(1,))
-        x = keras.layers.BatchNormalization()(inp)
+        x = layer()(inp)
         model1 = keras.models.Model(inp, x)
         model1.set_weights([
             np.array([1.]),
@@ -355,12 +376,241 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
       # Simulates training-mode with trainable layer.
       # Should use mini-batch statistics.
-      keras.backend.set_learning_phase(1)
-      model = get_model(bn_mean, bn_std)
-      model.compile(loss='mse', optimizer='rmsprop')
-      out = model.predict(val_a)
-      self.assertAllClose(
-          (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+      with keras.backend.learning_phase_scope(1):
+        model = get_model(bn_mean, bn_std)
+        model.compile(loss='mse', optimizer='rmsprop')
+        out = model.predict(val_a)
+        self.assertAllClose(
+            (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+
+
+def _run_layernorm_correctness_test(layer, dtype='float32'):
+  model = keras.models.Sequential()
+  norm = layer(input_shape=(2, 2, 2))
+  model.add(norm)
+  model.compile(loss='mse',
+                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                run_eagerly=testing_utils.should_run_eagerly())
+
+  # centered on 5.0, variance 10.0
+  x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+       .astype(dtype))
+  model.fit(x, x, epochs=4, verbose=0)
+  out = model.predict(x)
+  out -= keras.backend.eval(norm.beta)
+  out /= keras.backend.eval(norm.gamma)
+
+  np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+  np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class LayerNormalizationTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_basic_layernorm(self):
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_regularizer': keras.regularizers.l2(0.01),
+            'beta_regularizer': keras.regularizers.l2(0.01)
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_initializer': 'ones',
+            'beta_initializer': 'ones',
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={'scale': False,
+                'center': False},
+        input_shape=(3, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_weights(self):
+    layer = keras.layers.LayerNormalization(scale=False, center=False)
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.weights), 0)
+
+    layer = keras.layers.LayerNormalization()
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.weights), 2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_regularization(self):
+    layer = keras.layers.LayerNormalization(
+        gamma_regularizer='l1', beta_regularizer='l1')
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.losses), 2)
+    max_norm = keras.constraints.max_norm
+    layer = keras.layers.LayerNormalization(
+        gamma_constraint=max_norm, beta_constraint=max_norm)
+    layer.build((None, 3, 4))
+    self.assertEqual(layer.gamma.constraint, max_norm)
+    self.assertEqual(layer.beta.constraint, max_norm)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        model = keras.models.Sequential()
+        norm = keras.layers.LayerNormalization(
+            input_shape=(3, 4, 4), params_axis=1)
+        model.add(norm)
+        model.compile(loss='mse',
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                      run_eagerly=testing_utils.should_run_eagerly())
+
+        # centered on 5.0, variance 10.0
+        x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+        model.fit(x, x, epochs=4, verbose=0)
+        out = model.predict(x)
+        out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
+        out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
+
+        np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+        np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet_channel_last(self):
+    model = keras.models.Sequential()
+    norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3))
+    model.add(norm)
+    model.compile(loss='mse',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+    out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+    np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+    np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_correctness(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float32')
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_mixed_precision(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float16')
+
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   norm_axis=None,
+                   params_axis=-1,
+                   dtype=None):
+    ndim = len(input_shape)
+    if norm_axis is None:
+      moments_axis = range(1, ndim)
+    elif isinstance(norm_axis, int):
+      if norm_axis < 0:
+        moments_axis = [norm_axis + ndim]
+      else:
+        moments_axis = [norm_axis]
+    else:
+      moments_axis = []
+      for dim in norm_axis:
+        if dim < 0:
+          dim = dim + ndim
+        moments_axis.append(dim)
+
+    moments_axis = tuple(moments_axis)
+    expected_shape = []
+    for i in range(ndim):
+      if i not in moments_axis:
+        expected_shape.append(input_shape[i])
+
+    expected_mean = np.zeros(expected_shape)
+    expected_var = np.ones(expected_shape)
+    for mu in [0.0, 1e2]:
+      for sigma in [1.0, 0.1]:
+        inputs = np.random.randn(*input_shape) * sigma + mu
+        inputs_t = constant_op.constant(inputs, shape=input_shape)
+        layer = normalization.LayerNormalization(
+            norm_axis=norm_axis, params_axis=params_axis, dtype=dtype)
+        outputs = layer(inputs_t)
+        beta = layer.beta
+        gamma = layer.gamma
+        for weight in layer.weights:
+          self.evaluate(weight.initializer)
+        outputs = self.evaluate(outputs)
+        beta = self.evaluate(beta)
+        gamma = self.evaluate(gamma)
+
+        # The mean and variance of the output should be close to 0 and 1
+        # respectively.
+
+        # Make sure that there are no NaNs
+        self.assertFalse(np.isnan(outputs).any())
+        mean = np.mean(outputs, axis=moments_axis)
+        var = np.var(outputs, axis=moments_axis)
+        # Layer-norm implemented in numpy
+        eps = 1e-12
+        expected_out = (
+            (gamma * (inputs - np.mean(
+                inputs, axis=moments_axis, keepdims=True)) /
+             np.sqrt(eps + np.var(
+                 inputs, axis=moments_axis, keepdims=True))) + beta)
+        self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
+        self.assertAllClose(expected_var, var, atol=tol)
+        # The full computation gets a bigger tolerance
+        self.assertAllClose(expected_out, outputs, atol=5 * tol)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInput(self):
+    self.doOutputTest((10, 300))
+    self.doOutputTest((10, 300), norm_axis=[0])
+    self.doOutputTest((10, 300), params_axis=[0, 1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInputDegenerateNormAxis(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid axis: 2'):
+      self.doOutputTest((10, 300), norm_axis=2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInput(self):
+    self.doOutputTest((100, 10, 10, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInputNormOnInnermostAxis(self):
+    # Equivalent tests
+    shape = (100, 10, 10, 3)
+    self.doOutputTest(
+        shape, norm_axis=list(range(3, len(shape))), tol=1e-4, dtype='float64')
+    self.doOutputTest(shape, norm_axis=-1, tol=1e-4, dtype='float64')
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInput(self):
+    self.doOutputTest((10, 10, 10, 30))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnInnermostAxis(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=3)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnMixedAxes(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3])
+    self.doOutputTest((10, 10, 10, 30), params_axis=[-2, -1])
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3],
+                      params_axis=[-3, -2, -1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputBigInput(self):
+    self.doOutputTest((1, 100, 100, 1))
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2])
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2],
+                      params_axis=[-2, -1])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/normalization_v2.py b/tensorflow/python/keras/layers/normalization_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..05501a7bf2ca3c80faf9497fa4c6089ad6ad52c4
--- /dev/null
+++ b/tensorflow/python/keras/layers/normalization_v2.py
@@ -0,0 +1,28 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The V2 implementation of Normalization layers.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.layers.normalization import BatchNormalizationBase
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.layers.BatchNormalization', v1=[])  # pylint: disable=missing-docstring
+class BatchNormalization(BatchNormalizationBase):
+
+  _USE_V2_BEHAVIOR = True
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index 6d76f962166fe123e6c46f5524a59ed742d7d0dc..a0096184f6e3b2144cfbb954045fd73bc36377b1 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -113,34 +113,30 @@ class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
   Arguments:
-      pool_size: Integer, size of the max pooling windows.
-      strides: Integer, or None. Factor by which to downscale.
-          E.g. 2 will halve the input.
-          If None, it will default to `pool_size`.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, steps, features)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, features, steps)`.
+    pool_size: Integer, size of the max pooling windows.
+    strides: Integer, or None. Factor by which to downscale.
+      E.g. 2 will halve the input.
+      If None, it will default to `pool_size`.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
 
   Input shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape `(batch_size, steps, features)`.
+    - If `data_format='channels_first'`:
+      3D tensor with shape `(batch_size, features, steps)`.
 
   Output shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, downsampled_steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, downsampled_steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape `(batch_size, downsampled_steps, features)`.
+    - If `data_format='channels_first'`:
+      3D tensor with shape `(batch_size, features, downsampled_steps)`.
   """
 
   def __init__(self, pool_size=2, strides=None,
@@ -160,34 +156,30 @@ class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
   Arguments:
-      pool_size: Integer, size of the max pooling windows.
-      strides: Integer, or None. Factor by which to downscale.
-          E.g. 2 will halve the input.
-          If None, it will default to `pool_size`.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, steps, features)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, features, steps)`.
+    pool_size: Integer, size of the max pooling windows.
+    strides: Integer, or None. Factor by which to downscale.
+      E.g. 2 will halve the input.
+      If None, it will default to `pool_size`.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
 
   Input shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape `(batch_size, steps, features)`.
+    - If `data_format='channels_first'`:
+      3D tensor with shape `(batch_size, features, steps)`.
 
   Output shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, downsampled_steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, downsampled_steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape `(batch_size, downsampled_steps, features)`.
+    - If `data_format='channels_first'`:
+      3D tensor with shape `(batch_size, features, downsampled_steps)`.
   """
 
   def __init__(self, pool_size=2, strides=None,
@@ -291,41 +283,37 @@ class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
   Arguments:
-      pool_size: integer or tuple of 2 integers,
-          factors by which to downscale (vertical, horizontal).
-          (2, 2) will halve the input in both spatial dimension.
-          If only one integer is specified, the same window length
-          will be used for both dimensions.
-      strides: Integer, tuple of 2 integers, or None.
-          Strides values.
-          If None, it will default to `pool_size`.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    pool_size: integer or tuple of 2 integers,
+      factors by which to downscale (vertical, horizontal).
+      `(2, 2)` will halve the input in both spatial dimension.
+      If only one integer is specified, the same window length
+      will be used for both dimensions.
+    strides: Integer, tuple of 2 integers, or None.
+      Strides values.
+      If None, it will default to `pool_size`.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, rows, cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, rows, cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, rows, cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, rows, cols)`.
 
   Output shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, pooled_rows, pooled_cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, pooled_rows, pooled_cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`.
   """
 
   def __init__(self,
@@ -345,41 +333,37 @@ class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
   Arguments:
-      pool_size: integer or tuple of 2 integers,
-          factors by which to downscale (vertical, horizontal).
-          (2, 2) will halve the input in both spatial dimension.
-          If only one integer is specified, the same window length
-          will be used for both dimensions.
-      strides: Integer, tuple of 2 integers, or None.
-          Strides values.
-          If None, it will default to `pool_size`.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    pool_size: integer or tuple of 2 integers,
+      factors by which to downscale (vertical, horizontal).
+      `(2, 2)` will halve the input in both spatial dimension.
+      If only one integer is specified, the same window length
+      will be used for both dimensions.
+    strides: Integer, tuple of 2 integers, or None.
+      Strides values.
+      If None, it will default to `pool_size`.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, rows, cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, rows, cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, rows, cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, rows, cols)`.
 
   Output shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, pooled_rows, pooled_cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, pooled_rows, pooled_cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`.
   """
 
   def __init__(self,
@@ -495,37 +479,37 @@ class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
-      pool_size: tuple of 3 integers,
-          factors by which to downscale (dim1, dim2, dim3).
-          (2, 2, 2) will halve the size of the 3D input in each dimension.
-      strides: tuple of 3 integers, or None. Strides values.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    pool_size: Tuple of 3 integers,
+      factors by which to downscale (dim1, dim2, dim3).
+      `(2, 2, 2)` will halve the size of the 3D input in each dimension.
+    strides: tuple of 3 integers, or None. Strides values.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
 
   Output shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
   """
 
   def __init__(self,
@@ -545,37 +529,37 @@ class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
-      pool_size: tuple of 3 integers,
-          factors by which to downscale (dim1, dim2, dim3).
-          (2, 2, 2) will halve the size of the 3D input in each dimension.
-      strides: tuple of 3 integers, or None. Strides values.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    pool_size: tuple of 3 integers,
+      factors by which to downscale (dim1, dim2, dim3).
+      `(2, 2, 2)` will halve the size of the 3D input in each dimension.
+    strides: tuple of 3 integers, or None. Strides values.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
 
   Output shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, pooled_dim1, pooled_dim2, pooled_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, pooled_dim1, pooled_dim2, pooled_dim3)`
   """
 
   def __init__(self,
@@ -591,8 +575,7 @@ class AveragePooling3D(Pooling3D):
 
 
 class GlobalPooling1D(Layer):
-  """Abstract class for different global pooling 1D layers.
-  """
+  """Abstract class for different global pooling 1D layers."""
 
   def __init__(self, data_format='channels_last', **kwargs):
     super(GlobalPooling1D, self).__init__(**kwargs)
@@ -622,24 +605,28 @@ class GlobalAveragePooling1D(GlobalPooling1D):
 
   Arguments:
     data_format: A string,
-        one of `channels_last` (default) or `channels_first`.
-        The ordering of the dimensions in the inputs.
-        `channels_last` corresponds to inputs with shape
-        `(batch, steps, features)` while `channels_first`
-        corresponds to inputs with shape
-        `(batch, features, steps)`.
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(batch_size, steps)` indicating whether
+      a given step should be masked (excluded from the average).
 
   Input shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape:
+      `(batch_size, steps, features)`
+    - If `data_format='channels_first'`:
+      3D tensor with shape:
+      `(batch_size, features, steps)`
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, features)`
+    2D tensor with shape `(batch_size, features)`.
   """
 
   def __init__(self, data_format='channels_last', **kwargs):
@@ -670,24 +657,23 @@ class GlobalMaxPooling1D(GlobalPooling1D):
 
   Arguments:
     data_format: A string,
-        one of `channels_last` (default) or `channels_first`.
-        The ordering of the dimensions in the inputs.
-        `channels_last` corresponds to inputs with shape
-        `(batch, steps, features)` while `channels_first`
-        corresponds to inputs with shape
-        `(batch, features, steps)`.
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
 
   Input shape:
-      - If `data_format='channels_last'`:
-          3D tensor with shape:
-          `(batch_size, steps, features)`
-      - If `data_format='channels_first'`:
-          3D tensor with shape:
-          `(batch_size, features, steps)`
+    - If `data_format='channels_last'`:
+      3D tensor with shape:
+      `(batch_size, steps, features)`
+    - If `data_format='channels_first'`:
+      3D tensor with shape:
+      `(batch_size, features, steps)`
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, features)`
+    2D tensor with shape `(batch_size, features)`.
   """
 
   def call(self, inputs):
@@ -727,27 +713,24 @@ class GlobalAveragePooling2D(GlobalPooling2D):
 
   Arguments:
       data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, rows, cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, rows, cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, rows, cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, rows, cols)`.
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, channels)`
+    2D tensor with shape `(batch_size, channels)`.
   """
 
   def call(self, inputs):
@@ -762,28 +745,25 @@ class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
   Arguments:
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, height, width)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          4D tensor with shape:
-          `(batch_size, rows, cols, channels)`
-      - If `data_format='channels_first'`:
-          4D tensor with shape:
-          `(batch_size, channels, rows, cols)`
+    - If `data_format='channels_last'`:
+      4D tensor with shape `(batch_size, rows, cols, channels)`.
+    - If `data_format='channels_first'`:
+      4D tensor with shape `(batch_size, channels, rows, cols)`.
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, channels)`
+    2D tensor with shape `(batch_size, channels)`.
   """
 
   def call(self, inputs):
@@ -794,8 +774,7 @@ class GlobalMaxPooling2D(GlobalPooling2D):
 
 
 class GlobalPooling3D(Layer):
-  """Abstract class for different global pooling 3D layers.
-  """
+  """Abstract class for different global pooling 3D layers."""
 
   def __init__(self, data_format=None, **kwargs):
     super(GlobalPooling3D, self).__init__(**kwargs)
@@ -824,28 +803,27 @@ class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
   Arguments:
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, channels)`
+    2D tensor with shape `(batch_size, channels)`.
   """
 
   def call(self, inputs):
@@ -860,28 +838,27 @@ class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
   Arguments:
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-          while `channels_first` corresponds to inputs with shape
-          `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+      while `channels_first` corresponds to inputs with shape
+      `(batch, channels, spatial_dim1, spatial_dim2, spatial_dim3)`.
+      It defaults to the `image_data_format` value found in your
+      Keras config file at `~/.keras/keras.json`.
+      If you never set it, then it will be "channels_last".
 
   Input shape:
-      - If `data_format='channels_last'`:
-          5D tensor with shape:
-          `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
-      - If `data_format='channels_first'`:
-          5D tensor with shape:
-          `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
+    - If `data_format='channels_last'`:
+      5D tensor with shape:
+      `(batch_size, spatial_dim1, spatial_dim2, spatial_dim3, channels)`
+    - If `data_format='channels_first'`:
+      5D tensor with shape:
+      `(batch_size, channels, spatial_dim1, spatial_dim2, spatial_dim3)`
 
   Output shape:
-      2D tensor with shape:
-      `(batch_size, channels)`
+    2D tensor with shape `(batch_size, channels)`.
   """
 
   def call(self, inputs):
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 9404543ed9c2a4a48eaf40eb4190ac21be0e0d9d..dccae5c6401e31a00820c352b367819c0f5d79db 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -19,15 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import uuid
+import collections
 
 import numpy as np
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
@@ -39,22 +35,13 @@ from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
-# The following string constants are used by Defun approach for unified backend
-# of LSTM and GRU.
-_DEFUN_API_NAME_ATTRIBUTE = 'experimental_api_implements'
-_DEFUN_DEVICE_ATTRIBUTE = 'experimental_api_preferred_device'
-_CPU_DEVICE_NAME = 'CPU'
-_GPU_DEVICE_NAME = 'GPU'
-
-
 @keras_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
@@ -62,23 +49,22 @@ class StackedRNNCells(Layer):
   Used to implement efficient stacked RNNs.
 
   Arguments:
-      cells: List of RNN cell instances.
+    cells: List of RNN cell instances.
 
   Examples:
 
   ```python
-      cells = [
-          keras.layers.LSTMCell(output_dim),
-          keras.layers.LSTMCell(output_dim),
-          keras.layers.LSTMCell(output_dim),
-      ]
-
-      inputs = keras.Input((timesteps, input_dim))
-      x = keras.layers.RNN(cells)(inputs)
+  cells = [
+      keras.layers.LSTMCell(output_dim),
+      keras.layers.LSTMCell(output_dim),
+      keras.layers.LSTMCell(output_dim),
+  ]
+
+  inputs = keras.Input((timesteps, input_dim))
+  x = keras.layers.RNN(cells)(inputs)
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, cells, **kwargs):
     for cell in cells:
       if not hasattr(cell, 'call'):
@@ -138,6 +124,9 @@ class StackedRNNCells(Layer):
     new_nested_states = []
     for cell, states in zip(self.cells, nested_states):
       states = states if nest.is_sequence(states) else [states]
+      # TF cell does not wrap the state into list when there is only one state.
+      is_tf_rnn_cell = getattr(cell, '_is_tf_rnn_cell', None) is not None
+      states = states[0] if len(states) == 1 and is_tf_rnn_cell else states
       if generic_utils.has_arg(cell.call, 'constants'):
         inputs, states = cell.call(inputs, states, constants=constants,
                                    **kwargs)
@@ -189,261 +178,190 @@ class StackedRNNCells(Layer):
           deserialize_layer(cell_config, custom_objects=custom_objects))
     return cls(cells, **config)
 
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    weights = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        weights += cell.trainable_weights
-    return weights
-
-  @property
-  def non_trainable_weights(self):
-    weights = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        weights += cell.non_trainable_weights
-    if not self.trainable:
-      trainable_weights = []
-      for cell in self.cells:
-        if isinstance(cell, Layer):
-          trainable_weights += cell.trainable_weights
-      return trainable_weights + weights
-    return weights
-
-  def get_weights(self):
-    """Retrieves the weights of the model.
-
-    Returns:
-        A flat list of Numpy arrays.
-    """
-    weights = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        weights += cell.weights
-    return K.batch_get_value(weights)
-
-  def set_weights(self, weights):
-    """Sets the weights of the model.
-
-    Arguments:
-        weights: A list of Numpy arrays with shapes and types matching
-            the output of `model.get_weights()`.
-    """
-    tuples = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        num_param = len(cell.weights)
-        weights = weights[:num_param]
-        for sw, w in zip(cell.weights, weights):
-          tuples.append((sw, w))
-        weights = weights[num_param:]
-    K.batch_set_value(tuples)
-
-  @property
-  def losses(self):
-    losses = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        losses += cell.losses
-    return losses + self._losses
-
-  @property
-  def updates(self):
-    updates = []
-    for cell in self.cells:
-      if isinstance(cell, Layer):
-        updates += cell.updates
-    return updates + self._updates
-
 
 @keras_export('keras.layers.RNN')
 class RNN(Layer):
   """Base class for recurrent layers.
 
   Arguments:
-      cell: A RNN cell instance or a list of RNN cell instances.
-          A RNN cell is a class that has:
-          - a `call(input_at_t, states_at_t)` method, returning
-              `(output_at_t, states_at_t_plus_1)`. The call method of the
-              cell can also take the optional argument `constants`, see
-              section "Note on passing external constants" below.
-          - a `state_size` attribute. This can be a single integer
-              (single state) in which case it is the size of the recurrent
-              state. This can also be a list/tuple of integers (one size per
-              state).
-              The `state_size` can also be TensorShape or tuple/list of
-              TensorShape, to represent high dimension state.
-          - a `output_size` attribute. This can be a single integer or a
-              TensorShape, which represent the shape of the output. For backward
-              compatible reason, if this attribute is not available for the
-              cell, the value will be inferred by the first element of the
-              `state_size`.
-          - a `get_initial_state(inputs=None, batch_size=None, dtype=None)`
-              method that creates a tensor meant to be fed to `call()` as the
-              initial state, if user didn't specify any initial state via other
-              means. The returned initial state should be in shape of
-              [batch, cell.state_size]. Cell might choose to create zero filled
-              tensor, or with other values based on the cell implementations.
-              `inputs` is the input tensor to the RNN layer, which should
-              contain the batch size as its shape[0], and also dtype. Note that
-              the shape[0] might be None during the graph construction. Either
-              the `inputs` or the pair of `batch` and `dtype `are provided.
-              `batch` is a scalar tensor that represent the batch size
-              of the input. `dtype` is `tf.dtype` that represent the dtype of
-              the input.
-              For backward compatible reason, if this method is not implemented
-              by the cell, RNN layer will create a zero filled tensors with the
-              size of [batch, cell.state_size].
-          In the case that `cell` is a list of RNN cell instances, the cells
-          will be stacked on after the other in the RNN, implementing an
-          efficient stacked RNN.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      return_state: Boolean. Whether to return the last state
-          in addition to the output.
-      go_backwards: Boolean (default False).
-          If True, process the input sequence backwards and return the
-          reversed sequence.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
-      unroll: Boolean (default False).
-          If True, the network will be unrolled,
-          else a symbolic loop will be used.
-          Unrolling can speed-up a RNN,
-          although it tends to be more memory-intensive.
-          Unrolling is only suitable for short sequences.
-      input_dim: dimensionality of the input (integer or tuple of integers).
-          This argument (or alternatively, the keyword argument `input_shape`)
-          is required when using this layer as the first layer in a model.
-      input_length: Length of input sequences, to be specified
-          when it is constant.
-          This argument is required if you are going to connect
-          `Flatten` then `Dense` layers upstream
-          (without it, the shape of the dense outputs cannot be computed).
-          Note that if the recurrent layer is not the first layer
-          in your model, you would need to specify the input length
-          at the level of the first layer
-          (e.g. via the `input_shape` argument)
-      time_major: The shape format of the `inputs` and `outputs` tensors.
-          If True, the inputs and outputs will be in shape
-          `(timesteps, batch, ...)`, whereas in the False case, it will be
-          `(batch, timesteps, ...)`. Using `time_major = True` is a bit more
-          efficient because it avoids transposes at the beginning and end of the
-          RNN calculation. However, most TensorFlow data is batch-major, so by
-          default this function accepts input and emits output in batch-major
-          form.
+    cell: A RNN cell instance or a list of RNN cell instances.
+      A RNN cell is a class that has:
+      - A `call(input_at_t, states_at_t)` method, returning
+        `(output_at_t, states_at_t_plus_1)`. The call method of the
+        cell can also take the optional argument `constants`, see
+        section "Note on passing external constants" below.
+      - A `state_size` attribute. This can be a single integer
+        (single state) in which case it is the size of the recurrent
+        state. This can also be a list/tuple of integers (one size per
+        state).
+        The `state_size` can also be TensorShape or tuple/list of
+        TensorShape, to represent high dimension state.
+      - A `output_size` attribute. This can be a single integer or a
+        TensorShape, which represent the shape of the output. For backward
+        compatible reason, if this attribute is not available for the
+        cell, the value will be inferred by the first element of the
+        `state_size`.
+      - A `get_initial_state(inputs=None, batch_size=None, dtype=None)`
+        method that creates a tensor meant to be fed to `call()` as the
+        initial state, if user didn't specify any initial state via other
+        means. The returned initial state should be in shape of
+        [batch, cell.state_size]. Cell might choose to create zero filled
+        tensor, or with other values based on the cell implementations.
+        `inputs` is the input tensor to the RNN layer, which should
+        contain the batch size as its shape[0], and also dtype. Note that
+        the shape[0] might be None during the graph construction. Either
+        the `inputs` or the pair of `batch` and `dtype `are provided.
+        `batch` is a scalar tensor that represent the batch size
+        of the input. `dtype` is `tf.dtype` that represent the dtype of
+        the input.
+        For backward compatible reason, if this method is not implemented
+        by the cell, RNN layer will create a zero filled tensors with the
+        size of [batch, cell.state_size].
+      In the case that `cell` is a list of RNN cell instances, the cells
+      will be stacked on after the other in the RNN, implementing an
+      efficient stacked RNN.
+    return_sequences: Boolean. Whether to return the last output
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled, else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+    time_major: The shape format of the `inputs` and `outputs` tensors.
+        If True, the inputs and outputs will be in shape
+        `(timesteps, batch, ...)`, whereas in the False case, it will be
+        `(batch, timesteps, ...)`. Using `time_major = True` is a bit more
+        efficient because it avoids transposes at the beginning and end of the
+        RNN calculation. However, most TensorFlow data is batch-major, so by
+        default this function accepts input and emits output in batch-major
+        form.
+
+  Call arguments:
+    inputs: Input tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is for use with cells that use dropout.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+    constants: List of constant tensors to be passed to the cell at each
+      timestep.
 
   Input shape:
-      N-D tensor with shape `(batch_size, timesteps, ...)` or
-      `(timesteps, batch_size, ...)` when time_major is True.
+    N-D tensor with shape `(batch_size, timesteps, ...)` or
+    `(timesteps, batch_size, ...)` when time_major is True.
 
   Output shape:
-      - if `return_state`: a list of tensors. The first tensor is
-          the output. The remaining tensors are the last states,
-          each with shape `(batch_size, state_size)`, where `state_size` could
-          be a high dimension tensor shape.
-      - if `return_sequences`: N-D tensor with shape
-          `(batch_size, timesteps, output_size)`, where `output_size` could
-          be a high dimension tensor shape, or
-          `(timesteps, batch_size, output_size)` when `time_major` is True.
-      - else, N-D tensor with shape `(batch_size, output_size)`, where
-          `output_size` could be a high dimension tensor shape.
-
-  # Masking
-      This layer supports masking for input data with a variable number
-      of timesteps. To introduce masks to your data,
-      use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
-      set to `True`.
-
-  # Note on using statefulness in RNNs
-      You can set RNN layers to be 'stateful', which means that the states
-      computed for the samples in one batch will be reused as initial states
-      for the samples in the next batch. This assumes a one-to-one mapping
-      between samples in different successive batches.
-
-      To enable statefulness:
-          - specify `stateful=True` in the layer constructor.
-          - specify a fixed batch size for your model, by passing
-              if sequential model:
-                `batch_input_shape=(...)` to the first layer in your model.
-              else for functional model with 1 or more Input layers:
-                `batch_shape=(...)` to all the first layers in your model.
-              This is the expected shape of your inputs
-              *including the batch size*.
-              It should be a tuple of integers, e.g. `(32, 10, 100)`.
-          - specify `shuffle=False` when calling fit().
-
-      To reset the states of your model, call `.reset_states()` on either
-      a specific layer, or on your entire model.
-
-  # Note on specifying the initial state of RNNs
-      You can specify the initial state of RNN layers symbolically by
-      calling them with the keyword argument `initial_state`. The value of
-      `initial_state` should be a tensor or list of tensors representing
-      the initial state of the RNN layer.
-
-      You can specify the initial state of RNN layers numerically by
-      calling `reset_states` with the keyword argument `states`. The value of
-      `states` should be a numpy array or list of numpy arrays representing
-      the initial state of the RNN layer.
-
-  # Note on passing external constants to RNNs
-      You can pass "external" constants to the cell using the `constants`
-      keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
-      requires that the `cell.call` method accepts the same keyword argument
-      `constants`. Such constants can be used to condition the cell
-      transformation on additional static inputs (not changing over time),
-      a.k.a. an attention mechanism.
+    - If `return_state`: a list of tensors. The first tensor is
+      the output. The remaining tensors are the last states,
+      each with shape `(batch_size, state_size)`, where `state_size` could
+      be a high dimension tensor shape.
+    - If `return_sequences`: N-D tensor with shape
+      `(batch_size, timesteps, output_size)`, where `output_size` could
+      be a high dimension tensor shape, or
+      `(timesteps, batch_size, output_size)` when `time_major` is True.
+    - Else, N-D tensor with shape `(batch_size, output_size)`, where
+      `output_size` could be a high dimension tensor shape.
+
+  Masking:
+    This layer supports masking for input data with a variable number
+    of timesteps. To introduce masks to your data,
+    use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
+    set to `True`.
+
+  Note on using statefulness in RNNs:
+    You can set RNN layers to be 'stateful', which means that the states
+    computed for the samples in one batch will be reused as initial states
+    for the samples in the next batch. This assumes a one-to-one mapping
+    between samples in different successive batches.
+
+    To enable statefulness:
+      - Specify `stateful=True` in the layer constructor.
+      - Specify a fixed batch size for your model, by passing
+        If sequential model:
+          `batch_input_shape=(...)` to the first layer in your model.
+        Else for functional model with 1 or more Input layers:
+          `batch_shape=(...)` to all the first layers in your model.
+        This is the expected shape of your inputs
+        *including the batch size*.
+        It should be a tuple of integers, e.g. `(32, 10, 100)`.
+      - Specify `shuffle=False` when calling fit().
+
+    To reset the states of your model, call `.reset_states()` on either
+    a specific layer, or on your entire model.
+
+  Note on specifying the initial state of RNNs:
+    You can specify the initial state of RNN layers symbolically by
+    calling them with the keyword argument `initial_state`. The value of
+    `initial_state` should be a tensor or list of tensors representing
+    the initial state of the RNN layer.
+
+    You can specify the initial state of RNN layers numerically by
+    calling `reset_states` with the keyword argument `states`. The value of
+    `states` should be a numpy array or list of numpy arrays representing
+    the initial state of the RNN layer.
+
+  Note on passing external constants to RNNs:
+    You can pass "external" constants to the cell using the `constants`
+    keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+    requires that the `cell.call` method accepts the same keyword argument
+    `constants`. Such constants can be used to condition the cell
+    transformation on additional static inputs (not changing over time),
+    a.k.a. an attention mechanism.
 
   Examples:
 
   ```python
-      # First, let's define a RNN Cell, as a layer subclass.
-
-      class MinimalRNNCell(keras.layers.Layer):
-
-          def __init__(self, units, **kwargs):
-              self.units = units
-              self.state_size = units
-              super(MinimalRNNCell, self).__init__(**kwargs)
-
-          def build(self, input_shape):
-              self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
-                                            initializer='uniform',
-                                            name='kernel')
-              self.recurrent_kernel = self.add_weight(
-                  shape=(self.units, self.units),
-                  initializer='uniform',
-                  name='recurrent_kernel')
-              self.built = True
-
-          def call(self, inputs, states):
-              prev_output = states[0]
-              h = K.dot(inputs, self.kernel)
-              output = h + K.dot(prev_output, self.recurrent_kernel)
-              return output, [output]
-
-      # Let's use this cell in a RNN layer:
-
-      cell = MinimalRNNCell(32)
-      x = keras.Input((None, 5))
-      layer = RNN(cell)
-      y = layer(x)
-
-      # Here's how to use the cell to build a stacked RNN:
-
-      cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
-      x = keras.Input((None, 5))
-      layer = RNN(cells)
-      y = layer(x)
+  # First, let's define a RNN Cell, as a layer subclass.
+
+  class MinimalRNNCell(keras.layers.Layer):
+
+      def __init__(self, units, **kwargs):
+          self.units = units
+          self.state_size = units
+          super(MinimalRNNCell, self).__init__(**kwargs)
+
+      def build(self, input_shape):
+          self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
+                                        initializer='uniform',
+                                        name='kernel')
+          self.recurrent_kernel = self.add_weight(
+              shape=(self.units, self.units),
+              initializer='uniform',
+              name='recurrent_kernel')
+          self.built = True
+
+      def call(self, inputs, states):
+          prev_output = states[0]
+          h = K.dot(inputs, self.kernel)
+          output = h + K.dot(prev_output, self.recurrent_kernel)
+          return output, [output]
+
+  # Let's use this cell in a RNN layer:
+
+  cell = MinimalRNNCell(32)
+  x = keras.Input((None, 5))
+  layer = RNN(cell)
+  y = layer(x)
+
+  # Here's how to use the cell to build a stacked RNN:
+
+  cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
+  x = keras.Input((None, 5))
+  layer = RNN(cells)
+  y = layer(x)
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
   def __init__(self,
                cell,
                return_sequences=False,
@@ -466,10 +384,15 @@ class RNN(Layer):
     # If True, the output for masked timestep will be zeros, whereas in the
     # False case, output from previous timestep is returned for masked timestep.
     self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
+
+    if 'input_shape' not in kwargs and (
+        'input_dim' in kwargs or 'input_length' in kwargs):
+      input_shape = (kwargs.pop('input_length', None),
+                     kwargs.pop('input_dim', None))
+      kwargs['input_shape'] = input_shape
+
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self.cell, name='cell')
     self.return_sequences = return_sequences
     self.return_state = return_state
     self.go_backwards = go_backwards
@@ -495,6 +418,9 @@ class RNN(Layer):
     return self._states
 
   @states.setter
+  # Automatic tracking catches "self._states" which adds an extra weight and
+  # breaks HDF5 checkpoints.
+  @trackable.no_automatic_dependency_tracking
   def states(self, states):
     self._states = states
 
@@ -655,7 +581,7 @@ class RNN(Layer):
     Args:
       cell_state_sizes: list, the `state_size` attribute from the cell.
       init_state_specs: list, the `state_spec` from the initial_state that is
-        passed in call()
+        passed in `call()`.
 
     Raises:
       ValueError: When initial state spec is not compatible with the state size.
@@ -749,10 +675,12 @@ class RNN(Layer):
       full_input_spec = [None for _ in range(len(nest.flatten(inputs)))
                         ] + additional_specs
       # Perform the call with temporarily replaced input_spec
-      original_input_spec = self.input_spec
       self.input_spec = full_input_spec
       output = super(RNN, self).__call__(full_input, **kwargs)
-      self.input_spec = original_input_spec
+      # Remove the additional_specs from input spec and keep the rest. It is
+      # important to keep since the input spec was populated by build(), and
+      # will be reused in the stateful=True.
+      self.input_spec = self.input_spec[:-len(additional_specs)]
       return output
     else:
       if initial_state is not None:
@@ -781,9 +709,9 @@ class RNN(Layer):
     else:
       input_shape = K.int_shape(inputs)
     timesteps = input_shape[0] if self.time_major else input_shape[1]
-    if self.unroll and timesteps in [None, 1]:
+    if self.unroll and timesteps is None:
       raise ValueError('Cannot unroll a RNN if the '
-                       'time dimension is undefined or equal to 1. \n'
+                       'time dimension is undefined. \n'
                        '- If using a Sequential model, '
                        'specify the time dimension by passing '
                        'an `input_shape` or `batch_input_shape` '
@@ -858,7 +786,8 @@ class RNN(Layer):
     # input shape: `(samples, time (padded with zeros), input_dim)`
     # note that the .build() method of subclasses MUST define
     # self.input_spec and self.state_spec with complete input shapes.
-    if isinstance(inputs, list):
+    if (isinstance(inputs, collections.Sequence)
+        and not isinstance(inputs, tuple)):
       # get initial_state from full input spec
       # as they could be copied to multiple GPU.
       if self._num_constants is None:
@@ -885,10 +814,14 @@ class RNN(Layer):
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
-    if self.time_major:
-      batch_size = self.input_spec[0].shape[1]
+    spec_shape = None if self.input_spec is None else self.input_spec[0].shape
+    if spec_shape is None:
+      # It is possible to have spec shape to be None, eg when construct a RNN
+      # with a custom cell, or standard RNN layers (LSTM/GRU) which we only know
+      # it has 3 dim input, but not its full shape spec before build().
+      batch_size = None
     else:
-      batch_size = self.input_spec[0].shape[0]
+      batch_size = spec_shape[1] if self.time_major else spec_shape[0]
     if not batch_size:
       raise ValueError('If a RNN is stateful, it needs to know '
                        'its batch size. Specify the batch size '
@@ -975,70 +908,262 @@ class RNN(Layer):
     layer._num_constants = num_constants
     return layer
 
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    if isinstance(self.cell, Layer):
-      return self.cell.trainable_weights
-    return []
 
-  @property
-  def non_trainable_weights(self):
-    if isinstance(self.cell, Layer):
-      if not self.trainable:
-        return self.cell.weights
-      return self.cell.non_trainable_weights
-    return []
+@keras_export('keras.layers.AbstractRNNCell')
+class AbstractRNNCell(Layer):
+  """Abstract object representing an RNN cell.
+
+  This is the base class for implementing RNN cells with custom behavior.
+
+  Every `RNNCell` must have the properties below and implement `call` with
+  the signature `(output, next_state) = call(input, state)`.
+
+  Examples:
+
+  ```python
+    class MinimalRNNCell(AbstractRNNCell):
+
+      def __init__(self, units, **kwargs):
+        self.units = units
+        super(MinimalRNNCell, self).__init__(**kwargs)
+
+      @property
+      def state_size(self):
+        return self.units
+
+      def build(self, input_shape):
+        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
+                                      initializer='uniform',
+                                      name='kernel')
+        self.recurrent_kernel = self.add_weight(
+            shape=(self.units, self.units),
+            initializer='uniform',
+            name='recurrent_kernel')
+        self.built = True
+
+      def call(self, inputs, states):
+        prev_output = states[0]
+        h = K.dot(inputs, self.kernel)
+        output = h + K.dot(prev_output, self.recurrent_kernel)
+        return output, output
+  ```
+
+  This definition of cell differs from the definition used in the literature.
+  In the literature, 'cell' refers to an object with a single scalar output.
+  This definition refers to a horizontal array of such units.
+
+  An RNN cell, in the most abstract setting, is anything that has
+  a state and performs some operation that takes a matrix of inputs.
+  This operation results in an output matrix with `self.output_size` columns.
+  If `self.state_size` is an integer, this operation also results in a new
+  state matrix with `self.state_size` columns.  If `self.state_size` is a
+  (possibly nested tuple of) TensorShape object(s), then it should return a
+  matching structure of Tensors having shape `[batch_size].concatenate(s)`
+  for each `s` in `self.batch_size`.
+  """
+
+  def call(self, inputs, states):
+    """The function that contains the logic for one RNN step calculation.
+
+    Args:
+      inputs: the input tensor, which is a slide from the overall RNN input by
+        the time dimension (usually the second dimension).
+      states: the state tensor from previous step, which has the same shape
+        as `(batch, state_size)`. In the case of timestep 0, it will be the
+        initial state user specified, or zero filled tensor otherwise.
+
+    Returns:
+      A tuple of two tensors:
+        1. output tensor for the current timestep, with size `output_size`.
+        2. state tensor for next step, which has the shape of `state_size`.
+    """
+    raise NotImplementedError('Abstract method')
 
   @property
-  def losses(self):
-    layer_losses = super(RNN, self).losses
-    if isinstance(self.cell, Layer):
-      return self.cell.losses + layer_losses
-    return layer_losses
+  def state_size(self):
+    """size(s) of state(s) used by this cell.
+
+    It can be represented by an Integer, a TensorShape or a tuple of Integers
+    or TensorShapes.
+    """
+    raise NotImplementedError('Abstract method')
 
   @property
-  def updates(self):
-    updates = []
-    if isinstance(self.cell, Layer):
-      updates += self.cell.updates
-    return updates + self._updates
+  def output_size(self):
+    """Integer or TensorShape: size of outputs produced by this cell."""
+    raise NotImplementedError('Abstract method')
+
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
+
+
+class DropoutRNNCellMixin(object):
+  """Object that hold dropout related fields for RNN Cell.
+
+  This class is not a standalone RNN cell. It suppose to be used with a RNN cell
+  by multiple inheritance. Any cell that mix with class should have following
+  fields:
+    dropout: a float number within range [0, 1). The ratio that the input
+      tensor need to dropout.
+    recurrent_dropout: a float number within range [0, 1). The ratio that the
+      recurrent state weights need to dropout.
+  This object will create and cache created dropout masks, and reuse them for
+  the incoming data, so that the same mask is used for every batch input.
+  """
+
+  def __init__(self, *args, **kwargs):
+    # Note that the following two masks will be used in "graph function" mode,
+    # e.g. these masks are symbolic tensors. In eager mode, the `eager_*_mask`
+    # tensors will be generated differently than in the "graph function" case,
+    # and they will be cached.
+    # Also note that in graph mode, we still cache those masks only because the
+    # RNN could be created with `unroll=True`. In that case, the `cell.call()`
+    # function will be invoked multiple times, and we want to ensure same mask
+    # is used every time.
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
+    self._eager_dropout_mask = None
+    self._eager_recurrent_dropout_mask = None
+    super(DropoutRNNCellMixin, self).__init__(*args, **kwargs)
+
+  def reset_dropout_mask(self):
+    """Reset the cached dropout masks if any.
+
+    This is important for the RNN layer to invoke this in it call() method so
+    that the cached mask is cleared before calling the cell.call(). The mask
+    should be cached across the timestep within the same batch, but shouldn't
+    be cached between batches. Otherwise it will introduce unreasonable bias
+    against certain index of data within the batch.
+    """
+    self._dropout_mask = None
+    self._eager_dropout_mask = None
+
+  def reset_recurrent_dropout_mask(self):
+    """Reset the cached recurrent dropout masks if any.
+
+    This is important for the RNN layer to invoke this in it call() method so
+    that the cached mask is cleared before calling the cell.call(). The mask
+    should be cached across the timestep within the same batch, but shouldn't
+    be cached between batches. Otherwise it will introduce unreasonable bias
+    against certain index of data within the batch.
+    """
+    self._recurrent_dropout_mask = None
+    self._eager_recurrent_dropout_mask = None
+
+  def get_dropout_mask_for_cell(self, inputs, training, count=1):
+    """Get the dropout mask for RNN cell's input.
+
+    It will create mask based on context if there isn't any existing cached
+    mask. If a new mask is generated, it will update the cache in the cell.
+
+    Args:
+      inputs: the input tensor whose shape will be used to generate dropout
+        mask.
+      training: boolean tensor, whether its in training mode, dropout will be
+        ignored in non-training mode.
+      count: int, how many dropout mask will be generated. It is useful for cell
+        that has internal weights fused together.
+    Returns:
+      List of mask tensor, generated or cached mask based on context.
+    """
+    if self.dropout == 0:
+      return None
+    if (not context.executing_eagerly() and self._dropout_mask is None
+        or context.executing_eagerly() and self._eager_dropout_mask is None):
+      # Generate new mask and cache it based on context.
+      dp_mask = _generate_dropout_mask(
+          array_ops.ones_like(inputs),
+          self.dropout,
+          training=training,
+          count=count)
+      if context.executing_eagerly():
+        self._eager_dropout_mask = dp_mask
+      else:
+        self._dropout_mask = dp_mask
+    else:
+      # Reuse the existing mask.
+      dp_mask = (self._eager_dropout_mask
+                 if context.executing_eagerly() else self._dropout_mask)
+    return dp_mask
+
+  def get_recurrent_dropout_mask_for_cell(self, inputs, training, count=1):
+    """Get the recurrent dropout mask for RNN cell.
+
+    It will create mask based on context if there isn't any existing cached
+    mask. If a new mask is generated, it will update the cache in the cell.
+
+    Args:
+      inputs: the input tensor whose shape will be used to generate dropout
+        mask.
+      training: boolean tensor, whether its in training mode, dropout will be
+        ignored in non-training mode.
+      count: int, how many dropout mask will be generated. It is useful for cell
+        that has internal weights fused together.
+    Returns:
+      List of mask tensor, generated or cached mask based on context.
+    """
+    if self.recurrent_dropout == 0:
+      return None
+    if (not context.executing_eagerly() and self._recurrent_dropout_mask is None
+        or context.executing_eagerly()
+        and self._eager_recurrent_dropout_mask is None):
+      # Generate new mask and cache it based on context.
+      rec_dp_mask = _generate_dropout_mask(
+          array_ops.ones_like(inputs),
+          self.recurrent_dropout,
+          training=training,
+          count=count)
+      if context.executing_eagerly():
+        self._eager_recurrent_dropout_mask = rec_dp_mask
+      else:
+        self._recurrent_dropout_mask = rec_dp_mask
+    else:
+      # Reuse the existing mask.
+      rec_dp_mask = (self._eager_recurrent_dropout_mask
+                     if context.executing_eagerly()
+                     else self._recurrent_dropout_mask)
+    return rec_dp_mask
 
 
 @keras_export('keras.layers.SimpleRNNCell')
-class SimpleRNNCell(Layer):
+class SimpleRNNCell(DropoutRNNCellMixin, Layer):
   """Cell class for SimpleRNN.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix, used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+
+  Call arguments:
+    inputs: A 2D tensor.
+    states: List of state tensors corresponding to the previous timestep.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. Only relevant when `dropout` or
+      `recurrent_dropout` is used.
   """
 
   def __init__(self,
@@ -1078,8 +1203,6 @@ class SimpleRNNCell(Layer):
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_size = self.units
     self.output_size = self.units
-    self._dropout_mask = None
-    self._recurrent_dropout_mask = None
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -1108,20 +1231,9 @@ class SimpleRNNCell(Layer):
 
   def call(self, inputs, states, training=None):
     prev_output = states[0]
-    if 0 < self.dropout < 1 and self._dropout_mask is None:
-      self._dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(inputs),
-          self.dropout,
-          training=training)
-    if (0 < self.recurrent_dropout < 1 and
-        self._recurrent_dropout_mask is None):
-      self._recurrent_dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(prev_output),
-          self.recurrent_dropout,
-          training=training)
-
-    dp_mask = self._dropout_mask
-    rec_dp_mask = self._recurrent_dropout_mask
+    dp_mask = self.get_dropout_mask_for_cell(inputs, training)
+    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
+        prev_output, training)
 
     if dp_mask is not None:
       h = K.dot(inputs * dp_mask, self.kernel)
@@ -1181,52 +1293,63 @@ class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass None, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      return_state: Boolean. Whether to return the last state
-          in addition to the output.
-      go_backwards: Boolean (default False).
-          If True, process the input sequence backwards and return the
-          reversed sequence.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
-      unroll: Boolean (default False).
-          If True, the network will be unrolled,
-          else a symbolic loop will be used.
-          Unrolling can speed-up a RNN,
-          although it tends to be more memory-intensive.
-          Unrolling is only suitable for short sequences.
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass None, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix,
+      used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    return_sequences: Boolean. Whether to return the last output
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled,
+      else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
   """
 
   def __init__(self,
@@ -1283,8 +1406,8 @@ class SimpleRNN(RNN):
     self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._dropout_mask = None
-    self.cell._recurrent_dropout_mask = None
+    self.cell.reset_dropout_mask()
+    self.cell.reset_recurrent_dropout_mask()
     return super(SimpleRNN, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1389,52 +1512,58 @@ class SimpleRNN(RNN):
 
 
 @keras_export('keras.layers.GRUCell')
-class GRUCell(Layer):
+class GRUCell(DropoutRNNCellMixin, Layer):
   """Cell class for the GRU layer.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass None, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      implementation: Implementation mode, either 1 or 2.
-          Mode 1 will structure its operations as a larger number of
-          smaller dot products and additions, whereas mode 2 will
-          batch them into fewer, larger operations. These modes will
-          have different performance profiles on different hardware and
-          for different applications.
-      reset_after: GRU convention (whether to apply reset gate after or
-          before matrix multiplication). False = "before" (default),
-          True = "after" (CuDNN compatible).
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass None, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: hard sigmoid (`hard_sigmoid`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix,
+      used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+    reset_after: GRU convention (whether to apply reset gate after or
+      before matrix multiplication). False = "before" (default),
+      True = "after" (CuDNN compatible).
+
+  Call arguments:
+    inputs: A 2D tensor.
+    states: List of state tensors corresponding to the previous timestep.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. Only relevant when `dropout` or
+      `recurrent_dropout` is used.
   """
 
   def __init__(self,
@@ -1480,8 +1609,6 @@ class GRUCell(Layer):
     self.reset_after = reset_after
     self.state_size = self.units
     self.output_size = self.units
-    self._dropout_mask = None
-    self._recurrent_dropout_mask = None
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -1520,24 +1647,9 @@ class GRUCell(Layer):
   def call(self, inputs, states, training=None):
     h_tm1 = states[0]  # previous memory
 
-    if 0 < self.dropout < 1 and self._dropout_mask is None:
-      self._dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(inputs),
-          self.dropout,
-          training=training,
-          count=3)
-    if (0 < self.recurrent_dropout < 1 and
-        self._recurrent_dropout_mask is None):
-      self._recurrent_dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(h_tm1),
-          self.recurrent_dropout,
-          training=training,
-          count=3)
-
-    # dropout matrices for input units
-    dp_mask = self._dropout_mask
-    # dropout matrices for recurrent units
-    rec_dp_mask = self._recurrent_dropout_mask
+    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
+    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
+        h_tm1, training, count=3)
 
     if self.use_bias:
       if not self.reset_after:
@@ -1683,67 +1795,76 @@ class GRU(RNN):
   `recurrent_activation='sigmoid'`.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      implementation: Implementation mode, either 1 or 2.
-          Mode 1 will structure its operations as a larger number of
-          smaller dot products and additions, whereas mode 2 will
-          batch them into fewer, larger operations. These modes will
-          have different performance profiles on different hardware and
-          for different applications.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      return_state: Boolean. Whether to return the last state
-          in addition to the output.
-      go_backwards: Boolean (default False).
-          If True, process the input sequence backwards and return the
-          reversed sequence.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
-      unroll: Boolean (default False).
-          If True, the network will be unrolled,
-          else a symbolic loop will be used.
-          Unrolling can speed-up a RNN,
-          although it tends to be more memory-intensive.
-          Unrolling is only suitable for short sequences.
-      reset_after: GRU convention (whether to apply reset gate after or
-          before matrix multiplication). False = "before" (default),
-          True = "after" (CuDNN compatible).
-
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: hard sigmoid (`hard_sigmoid`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix, used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+    return_sequences: Boolean. Whether to return the last output
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled,
+      else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+    reset_after: GRU convention (whether to apply reset gate after or
+      before matrix multiplication). False = "before" (default),
+      True = "after" (CuDNN compatible).
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
   """
 
   def __init__(self,
@@ -1805,8 +1926,8 @@ class GRU(RNN):
     self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._dropout_mask = None
-    self.cell._recurrent_dropout_mask = None
+    self.cell.reset_dropout_mask()
+    self.cell.reset_recurrent_dropout_mask()
     return super(GRU, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1928,440 +2049,62 @@ class GRU(RNN):
     return cls(**config)
 
 
-@keras_export('keras.layers.GRU', v1=[])
-class UnifiedGRU(GRU):
-  """Gated Recurrent Unit - Cho et al. 2014.
-
-  `UnifiedGRU` unifies the implementations between standard `GRU` layer and
-  `CuDNNGRU` layer. Based on available runtime hardware and constraints,
-  `UnifiedGRU` will choose different implementations to maximize the
-  performance. For instance, if GPU is available and all the parameters meet the
-  requirement of CuDNN kernel, `UnifiedGRU` will use CuDNN kernel for the
-  calculation. The requirements to use CuDNN kernel are:
-
-    1. `activation` == 'tanh'
-    2. `recurrent_activation` == 'sigmoid'
-    3. `recurrent_dropout` == 0
-    4. `unroll` is False
-    5. `use_bias` is True
-    6. `reset_after` is True
-    7. Use masking in previous layers.
-
-  There are two variants. The default one is based on
-  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
-  state before matrix multiplication. The other one is based on
-  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
-
-  The second variant is compatible with CuDNNGRU (GPU-only) and allows
-  inference on CPU. Thus it has separate biases for `kernel` and
-  `recurrent_kernel`. Use `'reset_after'=True` and
-  `recurrent_activation='sigmoid'`.
-
-  Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-          Default: sigmoid (`sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      implementation: Implementation mode, either 1 or 2.
-          Mode 1 will structure its operations as a larger number of
-          smaller dot products and additions, whereas mode 2 will
-          batch them into fewer, larger operations. These modes will
-          have different performance profiles on different hardware and
-          for different applications.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      return_state: Boolean. Whether to return the last state
-          in addition to the output.
-      go_backwards: Boolean (default False).
-          If True, process the input sequence backwards and return the
-          reversed sequence.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
-      unroll: Boolean (default False).
-          If True, the network will be unrolled,
-          else a symbolic loop will be used.
-          Unrolling can speed-up a RNN,
-          although it tends to be more memory-intensive.
-          Unrolling is only suitable for short sequences.
-      reset_after: GRU convention (whether to apply reset gate after or
-          before matrix multiplication). False = "before",
-          True = "after" (default and CuDNN compatible).
-  """
-
-  def __init__(self,
-               units,
-               activation='tanh',
-               recurrent_activation='sigmoid',
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               recurrent_initializer='orthogonal',
-               bias_initializer='zeros',
-               kernel_regularizer=None,
-               recurrent_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               recurrent_constraint=None,
-               bias_constraint=None,
-               dropout=0.,
-               recurrent_dropout=0.,
-               implementation=1,
-               return_sequences=False,
-               return_state=False,
-               go_backwards=False,
-               stateful=False,
-               unroll=False,
-               time_major=False,
-               reset_after=True,
-               **kwargs):
-    # return_runtime is a flag for testing, which shows the real backend
-    # implementation chosen by grappler in graph mode.
-    self._return_runtime = kwargs.pop('return_runtime', False)
-
-    super(UnifiedGRU, self).__init__(
-        units,
-        activation=activation,
-        recurrent_activation=recurrent_activation,
-        use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        recurrent_initializer=recurrent_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        recurrent_regularizer=recurrent_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        recurrent_constraint=recurrent_constraint,
-        bias_constraint=bias_constraint,
-        dropout=dropout,
-        recurrent_dropout=recurrent_dropout,
-        implementation=implementation,
-        return_sequences=return_sequences,
-        return_state=return_state,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        unroll=unroll,
-        time_major=time_major,
-        reset_after=reset_after,
-        **kwargs)
-    self._dropout_mask = None
-    # CuDNN uses following setting by default and not configurable.
-    self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_activation == 'sigmoid' and
-        recurrent_dropout == 0 and not unroll and use_bias and
-        reset_after is True)
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
-    # GRU does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
-
-    if isinstance(mask, list):
-      mask = mask[0]
-
-    input_shape = K.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if mask is not None or not self.could_use_cudnn:
-      # CuDNN does not support masking, fall back to use the normal GRU.
-      kwargs = {'training': training}
-      self.cell._dropout_mask = None
-      self.cell._recurrent_dropout_mask = None
-
-      def step(cell_inputs, cell_states):
-        return self.cell.call(cell_inputs, cell_states, **kwargs)
-
-      last_output, outputs, states = K.rnn(
-          step,
-          inputs,
-          initial_state,
-          constants=None,
-          go_backwards=self.go_backwards,
-          mask=mask,
-          unroll=self.unroll,
-          input_length=timesteps,
-          time_major=self.time_major,
-          zero_output_for_mask=self.zero_output_for_mask)
-      # This is a dummy tensor for testing purpose.
-      runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
-    else:
-      last_output, outputs, runtime, states = self._defun_gru_call(
-          inputs, initial_state, training)
-
-    if self.stateful:
-      updates = [state_ops.assign(self.states[0], states[0])]
-      self.add_update(updates, inputs)
-
-    if self.return_sequences:
-      output = outputs
-    else:
-      output = last_output
-
-    if self.return_state:
-      return [output] + states
-    elif self._return_runtime:
-      return output, runtime
-    else:
-      return output
-
-  def _defun_gru_call(self, inputs, initial_state, training):
-    # Use the new defun approach for backend implementation swap.
-    # Note that different implementations need to have same function
-    # signature, eg, the tensor parameters need to have same shape and dtypes.
-    if self.go_backwards:
-      # Reverse time axis.
-      inputs = K.reverse(inputs, 0 if self.time_major else 1)
-    if 0 < self.dropout < 1:
-      if self._dropout_mask is None:
-        self._dropout_mask = _generate_dropout_mask(
-            array_ops.ones_like(inputs),
-            self.dropout,
-            training=training,
-            count=3)
-
-      inputs *= self._dropout_mask[0]
-    experimental_api_name = 'gru_' + str(uuid.uuid4())
-    defun_standard_gru = _generate_defun_backend(
-        experimental_api_name, _CPU_DEVICE_NAME, standard_gru)
-    defun_cudnn_gru = _generate_defun_backend(
-        experimental_api_name, _GPU_DEVICE_NAME, cudnn_gru)
-    if ops.executing_eagerly_outside_functions():
-      # Under eager context, the device placement is already known. Prefer the
-      # GPU implementation when GPU is available.
-      if context.num_gpus() > 0:
-        last_output, outputs, new_h, runtime = defun_cudnn_gru(
-            inputs=inputs,
-            init_h=initial_state[0],
-            kernel=self.cell.kernel,
-            recurrent_kernel=self.cell.recurrent_kernel,
-            bias=self.cell.bias,
-            time_major=self.time_major)
-      else:
-        last_output, outputs, new_h, runtime = defun_standard_gru(
-            inputs=inputs,
-            init_h=initial_state[0],
-            kernel=self.cell.kernel,
-            recurrent_kernel=self.cell.recurrent_kernel,
-            bias=self.cell.bias,
-            activation=self.activation,
-            recurrent_activation=self.recurrent_activation,
-            time_major=self.time_major)
-    else:
-      # Call the normal GRU impl and register the CuDNN impl function. The
-      # grappler will kick in during session execution to optimize the graph.
-      last_output, outputs, new_h, runtime = defun_standard_gru(
-          inputs=inputs,
-          init_h=initial_state[0],
-          kernel=self.cell.kernel,
-          recurrent_kernel=self.cell.recurrent_kernel,
-          bias=self.cell.bias,
-          activation=self.activation,
-          recurrent_activation=self.recurrent_activation,
-          time_major=self.time_major)
-
-      function.register(defun_cudnn_gru, inputs, initial_state[0],
-                        self.cell.kernel, self.cell.recurrent_kernel,
-                        self.cell.bias, self.time_major)
-    states = [new_h]
-    return last_output, outputs, runtime, states
-
-
-def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
-                 recurrent_activation, time_major):
-  """GRU with standard kernel implementation.
-
-  This implementation can be run on all types of hardware.
-
-  This implementation lifts out all the layer weights and make them function
-  parameters. It has same number of tensor input params as the CuDNN
-  counterpart. The RNN step logic has been simplified, eg dropout and mask is
-  removed since CuDNN implementation does not support that.
-
-  Args:
-    inputs: input tensor of GRU layer.
-    init_h: initial state tensor for the cell output.
-    kernel: weights for cell kernel.
-    recurrent_kernel: weights for cell recurrent kernel.
-    bias: weights for cell kernel bias and recurrent bias. The bias contains the
-      combined input_bias and recurrent_bias.
-    activation: Activation function to use for output.
-    recurrent_activation: Activation function to use for hidden recurrent state.
-    time_major: boolean, whether the inputs are in the format of
-      [time, batch, feature] or [batch, time, feature].
-
-  Returns:
-    last_output: output tensor for the last timestep, which has shape
-      [batch, units].
-    outputs: output tensor for all timesteps, which has shape
-      [batch, time, units].
-    state_0: the cell output, which has same shape as init_h.
-    runtime: constant string tensor which indicate real runtime hardware. This
-      value is for testing purpose and should be used by user.
-  """
-  input_shape = K.int_shape(inputs)
-  timesteps = input_shape[0] if time_major else input_shape[1]
-
-  input_bias, recurrent_bias = array_ops.unstack(bias)
-
-  def step(cell_inputs, cell_states):
-    """Step function that will be used by Keras RNN backend."""
-    h_tm1 = cell_states[0]
-
-    # inputs projected by all gate matrices at once
-    matrix_x = K.dot(cell_inputs, kernel)
-    matrix_x = K.bias_add(matrix_x, input_bias)
-
-    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
-
-    # hidden state projected by all gate matrices at once
-    matrix_inner = K.dot(h_tm1, recurrent_kernel)
-    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
-
-    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
-                                                            axis=1)
-    z = recurrent_activation(x_z + recurrent_z)
-    r = recurrent_activation(x_r + recurrent_r)
-    hh = activation(x_h + r * recurrent_h)
-
-    # previous and candidate state mixed by update gate
-    h = z * h_tm1 + (1 - z) * hh
-    return h, [h]
-
-  last_output, outputs, new_states = K.rnn(
-      step,
-      inputs, [init_h],
-      constants=None,
-      unroll=False,
-      time_major=time_major,
-      input_length=timesteps)
-  return last_output, outputs, new_states[0], constant_op.constant(
-      'cpu', dtype=dtypes.string, name='runtime')
-
-
-def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
-  """GRU with CuDNN implementation which is only available for GPU."""
-  if not time_major:
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
-  init_h = array_ops.expand_dims(init_h, axis=0)
-
-  weights = array_ops.split(kernel, 3, axis=1)
-  weights += array_ops.split(recurrent_kernel, 3, axis=1)
-  # Note that the bias was initialized as shape (2, 3 * units), flat it into
-  # (6 * units)
-  bias = array_ops.split(K.flatten(bias), 6)
-  # Note that the gate order for CuDNN is different from the canonical format.
-  # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need to
-  # be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
-  # z is update gate weights.
-  # r is reset gate weights.
-  # h is output gate weights.
-  weights[0], weights[1] = weights[1], weights[0]
-  weights[3], weights[4] = weights[4], weights[3]
-  bias[0], bias[1] = bias[1], bias[0]
-  bias[3], bias[4] = bias[4], bias[3]
-
-  params = _canonical_to_params(
-      weights=weights,
-      biases=bias,
-      shape=constant_op.constant([-1]),
-      transpose_weights=True)
-
-  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      inputs,
-      input_h=init_h,
-      input_c=0,
-      params=params,
-      is_training=True,
-      rnn_mode='gru')
-  last_output = outputs[-1]
-  if not time_major:
-    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
-  h = h[0]
-  return last_output, outputs, h, constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
-
-
 @keras_export('keras.layers.LSTMCell')
-class LSTMCell(Layer):
+class LSTMCell(DropoutRNNCellMixin, Layer):
   """Cell class for the LSTM layer.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state.
-      bias_initializer: Initializer for the bias vector.
-      unit_forget_bias: Boolean.
-          If True, add 1 to the bias of the forget gate at initialization.
-          Setting it to true will also force `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      implementation: Implementation mode, either 1 or 2.
-          Mode 1 will structure its operations as a larger number of
-          smaller dot products and additions, whereas mode 2 will
-          batch them into fewer, larger operations. These modes will
-          have different performance profiles on different hardware and
-          for different applications.
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: hard sigmoid (`hard_sigmoid`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix,
+      used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+      If True, add 1 to the bias of the forget gate at initialization.
+      Setting it to true will also force `bias_initializer="zeros"`.
+      This is recommended in [Jozefowicz et
+        al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+
+  Call arguments:
+    inputs: A 2D tensor.
+    states: List of state tensors corresponding to the previous timestep.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. Only relevant when `dropout` or
+      `recurrent_dropout` is used.
   """
 
   def __init__(self,
@@ -2407,8 +2150,6 @@ class LSTMCell(Layer):
     self.implementation = implementation
     self.state_size = [self.units, self.units]
     self.output_size = self.units
-    self._dropout_mask = None
-    self._recurrent_dropout_mask = None
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -2471,28 +2212,13 @@ class LSTMCell(Layer):
     return c, o
 
   def call(self, inputs, states, training=None):
-    if 0 < self.dropout < 1 and self._dropout_mask is None:
-      self._dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(inputs),
-          self.dropout,
-          training=training,
-          count=4)
-    if (0 < self.recurrent_dropout < 1 and
-        self._recurrent_dropout_mask is None):
-      self._recurrent_dropout_mask = _generate_dropout_mask(
-          array_ops.ones_like(states[0]),
-          self.recurrent_dropout,
-          training=training,
-          count=4)
-
-    # dropout matrices for input units
-    dp_mask = self._dropout_mask
-    # dropout matrices for recurrent units
-    rec_dp_mask = self._recurrent_dropout_mask
-
     h_tm1 = states[0]  # previous memory state
     c_tm1 = states[1]  # previous carry state
 
+    dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
+    rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
+        h_tm1, training, count=4)
+
     if self.implementation == 1:
       if 0 < self.dropout < 1.:
         inputs_i = inputs * dp_mask[0]
@@ -2504,15 +2230,19 @@ class LSTMCell(Layer):
         inputs_f = inputs
         inputs_c = inputs
         inputs_o = inputs
-      x_i = K.dot(inputs_i, self.kernel[:, :self.units])
-      x_f = K.dot(inputs_f, self.kernel[:, self.units:self.units * 2])
-      x_c = K.dot(inputs_c, self.kernel[:, self.units * 2:self.units * 3])
-      x_o = K.dot(inputs_o, self.kernel[:, self.units * 3:])
+      k_i, k_f, k_c, k_o = array_ops.split(
+          self.kernel, num_or_size_splits=4, axis=1)
+      x_i = K.dot(inputs_i, k_i)
+      x_f = K.dot(inputs_f, k_f)
+      x_c = K.dot(inputs_c, k_c)
+      x_o = K.dot(inputs_o, k_o)
       if self.use_bias:
-        x_i = K.bias_add(x_i, self.bias[:self.units])
-        x_f = K.bias_add(x_f, self.bias[self.units:self.units * 2])
-        x_c = K.bias_add(x_c, self.bias[self.units * 2:self.units * 3])
-        x_o = K.bias_add(x_o, self.bias[self.units * 3:])
+        b_i, b_f, b_c, b_o = array_ops.split(
+            self.bias, num_or_size_splits=4, axis=0)
+        x_i = K.bias_add(x_i, b_i)
+        x_f = K.bias_add(x_f, b_f)
+        x_c = K.bias_add(x_c, b_c)
+        x_o = K.bias_add(x_o, b_o)
 
       if 0 < self.recurrent_dropout < 1.:
         h_tm1_i = h_tm1 * rec_dp_mask[0]
@@ -2537,12 +2267,7 @@ class LSTMCell(Layer):
       if self.use_bias:
         z = K.bias_add(z, self.bias)
 
-      z0 = z[:, :self.units]
-      z1 = z[:, self.units:2 * self.units]
-      z2 = z[:, 2 * self.units:3 * self.units]
-      z3 = z[:, 3 * self.units:]
-
-      z = (z0, z1, z2, z3)
+      z = array_ops.split(z, num_or_size_splits=4, axis=1)
       c, o = self._compute_carry_and_output_fused(z, c_tm1)
 
     h = o * self.activation(c)
@@ -2603,28 +2328,26 @@ class PeepholeLSTMCell(LSTMCell):
 
   From [Gers et al.](http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf):
 
-    "We find that LSTM augmented by 'peephole connections' from its internal
-    cells to its multiplicative gates can learn the fine distinction between
-    sequences of spikes spaced either 50 or 49 time steps apart without the help
-    of any short training exemplars."
+  "We find that LSTM augmented by 'peephole connections' from its internal
+  cells to its multiplicative gates can learn the fine distinction between
+  sequences of spikes spaced either 50 or 49 time steps apart without the help
+  of any short training exemplars."
 
   The peephole implementation is based on:
 
-    https://research.google.com/pubs/archive/43905.pdf
-
-  Hasim Sak, Andrew Senior, and Francoise Beaufays.
-  "Long short-term memory recurrent neural network architectures for
-   large scale acoustic modeling." INTERSPEECH, 2014.
+  [Long short-term memory recurrent neural network architectures for
+   large scale acoustic modeling.
+  ](https://research.google.com/pubs/archive/43905.pdf)
 
   Example:
 
   ```python
-      # Create 2 PeepholeLSTMCells
-      peephole_lstm_cells = [PeepholeLSTMCell(size) for size in [128, 256]]
-      # Create a layer composed sequentially of the peephole LSTM cells.
-      layer = RNN(peephole_lstm_cells)
-      input = keras.Input((timesteps, input_dim))
-      output = layer(input)
+  # Create 2 PeepholeLSTMCells
+  peephole_lstm_cells = [PeepholeLSTMCell(size) for size in [128, 256]]
+  # Create a layer composed sequentially of the peephole LSTM cells.
+  layer = RNN(peephole_lstm_cells)
+  input = keras.Input((timesteps, input_dim))
+  output = layer(input)
   ```
   """
 
@@ -2681,69 +2404,79 @@ class LSTM(RNN):
   `tf.keras.layers.CuDNNLSTM` for better performance on GPU.
 
   Arguments:
-      units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
-      recurrent_initializer: Initializer for the `recurrent_kernel`
-          weights matrix,
-          used for the linear transformation of the recurrent state..
-      bias_initializer: Initializer for the bias vector.
-      unit_forget_bias: Boolean.
-          If True, add 1 to the bias of the forget gate at initialization.
-          Setting it to true will also force `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
-      recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-      dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the inputs.
-      recurrent_dropout: Float between 0 and 1.
-          Fraction of the units to drop for
-          the linear transformation of the recurrent state.
-      implementation: Implementation mode, either 1 or 2.
-          Mode 1 will structure its operations as a larger number of
-          smaller dot products and additions, whereas mode 2 will
-          batch them into fewer, larger operations. These modes will
-          have different performance profiles on different hardware and
-          for different applications.
-      return_sequences: Boolean. Whether to return the last output.
-          in the output sequence, or the full sequence.
-      return_state: Boolean. Whether to return the last state
-          in addition to the output.
-      go_backwards: Boolean (default False).
-          If True, process the input sequence backwards and return the
-          reversed sequence.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
-      unroll: Boolean (default False).
-          If True, the network will be unrolled,
-          else a symbolic loop will be used.
-          Unrolling can speed-up a RNN,
-          although it tends to be more memory-intensive.
-          Unrolling is only suitable for short sequences.
-
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: hard sigmoid (`hard_sigmoid`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs..
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+      weights matrix,
+      used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+      If True, add 1 to the bias of the forget gate at initialization.
+      Setting it to true will also force `bias_initializer="zeros"`.
+      This is recommended in [Jozefowicz et
+        al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf).
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+    return_sequences: Boolean. Whether to return the last output.
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled,
+      else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
   """
 
   def __init__(self,
@@ -2775,10 +2508,6 @@ class LSTM(RNN):
       logging.warning('`implementation=0` has been deprecated, '
                       'and now defaults to `implementation=1`.'
                       'Please update your layer call.')
-    if context.executing_eagerly() and context.num_gpus() > 0:
-      logging.warn('%s: Note that this layer is not optimized for performance. '
-                   'Please use tf.keras.layers.CuDNNLSTM for better '
-                   'performance on GPU.', self)
     cell = LSTMCell(
         units,
         activation=activation,
@@ -2809,8 +2538,8 @@ class LSTM(RNN):
     self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._dropout_mask = None
-    self.cell._recurrent_dropout_mask = None
+    self.cell.reset_dropout_mask()
+    self.cell.reset_recurrent_dropout_mask()
     return super(LSTM, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -2932,376 +2661,6 @@ class LSTM(RNN):
     return cls(**config)
 
 
-@keras_export('keras.layers.LSTM', v1=[])
-class UnifiedLSTM(LSTM):
-  """Long Short-Term Memory layer - Hochreiter 1997.
-
-  `UnifiedLSTM` unifies the implementations between standard `LSTM` layer and
-  `CuDNNLSTM` layer. Based on available runtime hardware and constrains,
-  `UnifiedLSTM` will choose different implementations to maximize the
-  performance. For instance, if GPU is available and all the parameters meet the
-  requirement of CuDNN kernel, `UnifiedLSTM` will use CuDNN kernel for the
-  calculation.
-
-  Arguments:
-    units: Positive integer, dimensionality of the output space.
-    activation: Activation function to use.
-      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
-      is applied (ie. "linear" activation: `a(x) = x`).
-    recurrent_activation: Activation function to use for the recurrent step.
-      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
-      applied (ie. "linear" activation: `a(x) = x`).
-    use_bias: Boolean, whether the layer uses a bias vector.
-    kernel_initializer: Initializer for the `kernel` weights matrix, used for
-      the linear transformation of the inputs..
-    recurrent_initializer: Initializer for the `recurrent_kernel` weights
-      matrix, used for the linear transformation of the recurrent state..
-    bias_initializer: Initializer for the bias vector.
-    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
-      initialization. Setting it to true will also force
-      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
-          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
-    kernel_regularizer: Regularizer function applied to the `kernel` weights
-      matrix.
-    recurrent_regularizer: Regularizer function applied to the
-      `recurrent_kernel` weights matrix.
-    bias_regularizer: Regularizer function applied to the bias vector.
-    activity_regularizer: Regularizer function applied to the output of the
-      layer (its "activation")..
-    kernel_constraint: Constraint function applied to the `kernel` weights
-      matrix.
-    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
-      weights matrix.
-    bias_constraint: Constraint function applied to the bias vector.
-    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
-      transformation of the inputs.
-    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
-      the linear transformation of the recurrent state.
-    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
-      its operations as a larger number of smaller dot products and additions,
-      whereas mode 2 will batch them into fewer, larger operations. These modes
-      will have different performance profiles on different hardware and for
-      different applications.
-    return_sequences: Boolean. Whether to return the last output. in the output
-      sequence, or the full sequence.
-    return_state: Boolean. Whether to return the last state in addition to the
-      output.
-    go_backwards: Boolean (default False). If True, process the input sequence
-      backwards and return the reversed sequence.
-    stateful: Boolean (default False). If True, the last state for each sample
-      at index i in a batch will be used as initial state for the sample of
-      index i in the following batch.
-    unroll: Boolean (default False). If True, the network will be unrolled, else
-      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
-      tends to be more memory-intensive. Unrolling is only suitable for short
-      sequences.
-  """
-
-  def __init__(self,
-               units,
-               activation='tanh',
-               recurrent_activation='sigmoid',
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               recurrent_initializer='orthogonal',
-               bias_initializer='zeros',
-               unit_forget_bias=True,
-               kernel_regularizer=None,
-               recurrent_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               recurrent_constraint=None,
-               bias_constraint=None,
-               dropout=0.,
-               recurrent_dropout=0.,
-               implementation=1,
-               return_sequences=False,
-               return_state=False,
-               go_backwards=False,
-               stateful=False,
-               time_major=False,
-               unroll=False,
-               **kwargs):
-    # return_runtime is a flag for testing, which shows the real backend
-    # implementation chosen by grappler in graph mode.
-    self.return_runtime = kwargs.pop('return_runtime', False)
-
-    super(UnifiedLSTM, self).__init__(
-        units,
-        activation=activation,
-        recurrent_activation=recurrent_activation,
-        use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        recurrent_initializer=recurrent_initializer,
-        bias_initializer=bias_initializer,
-        unit_forget_bias=unit_forget_bias,
-        kernel_regularizer=kernel_regularizer,
-        recurrent_regularizer=recurrent_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        recurrent_constraint=recurrent_constraint,
-        bias_constraint=bias_constraint,
-        dropout=dropout,
-        recurrent_dropout=recurrent_dropout,
-        implementation=implementation,
-        return_sequences=return_sequences,
-        return_state=return_state,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        time_major=time_major,
-        unroll=unroll,
-        **kwargs)
-
-    self.state_spec = [
-        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
-    ]
-    self._dropout_mask = None
-    self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_activation == 'sigmoid' and
-        recurrent_dropout == 0 and not unroll and use_bias)
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
-    # LSTM does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
-
-    if isinstance(mask, list):
-      mask = mask[0]
-
-    input_shape = K.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if mask is not None or not self.could_use_cudnn:
-      # CuDNN does not support masking, fall back to use the normal LSTM.
-      kwargs = {'training': training}
-
-      def step(inputs, states):
-        return self.cell.call(inputs, states, **kwargs)
-
-      last_output, outputs, states = K.rnn(
-          step,
-          inputs,
-          initial_state,
-          constants=None,
-          go_backwards=self.go_backwards,
-          mask=mask,
-          unroll=self.unroll,
-          input_length=timesteps,
-          time_major=self.time_major,
-          zero_output_for_mask=self.zero_output_for_mask)
-      runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
-    else:
-      # Use the new defun approach for backend implementation swap.
-      # Note that different implementations need to have same function
-      # signature, eg, the tensor parameters need to have same shape and dtypes.
-      # Since the CuDNN has an extra set of bias, those bias will be passed to
-      # both normal and CuDNN implementations.
-      if self.go_backwards:
-        # Reverse time axis.
-        inputs = K.reverse(inputs, 0 if self.time_major else 1)
-
-      if 0 < self.dropout < 1:
-        if self._dropout_mask is None:
-          self._dropout_mask = _generate_dropout_mask(
-              array_ops.ones_like(inputs),
-              self.dropout,
-              training=training,
-              count=4)
-
-        inputs *= self._dropout_mask[0]
-
-      # Each time a defun function is called, we will give a unique identifiable
-      # API name, so that the grappler won't get confused when it sees multiple
-      # LSTM layer added into same graph, and it will be able to pair up the
-      # different implementations across them.
-      experimental_api_name = 'lstm_' + str(uuid.uuid4())
-      defun_standard_lstm = _generate_defun_backend(
-          experimental_api_name, _CPU_DEVICE_NAME, standard_lstm)
-      defun_cudnn_lstm = _generate_defun_backend(
-          experimental_api_name, _GPU_DEVICE_NAME, cudnn_lstm)
-
-      if ops.executing_eagerly_outside_functions():
-        # Under eager context, the device placement is already known. Prefer the
-        # GPU implementation here.
-        if context.num_gpus() > 0:
-          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
-              inputs, initial_state[0], initial_state[1], self.cell.kernel,
-              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
-        else:
-          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
-              inputs, initial_state[0], initial_state[1], self.cell.kernel,
-              self.cell.recurrent_kernel, self.cell.bias, self.activation,
-              self.recurrent_activation, self.time_major)
-      else:
-        # Call the normal LSTM impl and register the CuDNN impl function. The
-        # grappler will kick in during session execution to optimize the graph.
-        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
-            inputs, initial_state[0], initial_state[1], self.cell.kernel,
-            self.cell.recurrent_kernel, self.cell.bias, self.activation,
-            self.recurrent_activation, self.time_major)
-
-        function.register(defun_cudnn_lstm, inputs, initial_state[0],
-                          initial_state[1], self.cell.kernel,
-                          self.cell.recurrent_kernel, self.cell.bias,
-                          self.time_major)
-      states = [new_h, new_c]
-
-    if self.stateful:
-      updates = []
-      for i in range(len(states)):
-        updates.append(state_ops.assign(self.states[i], states[i]))
-      self.add_update(updates, inputs)
-
-    if self.return_sequences:
-      output = outputs
-    else:
-      output = last_output
-
-    if self.return_state:
-      return [output] + states
-    elif self.return_runtime:
-      return output, runtime
-    else:
-      return output
-
-
-def _canonical_to_params(weights, biases, shape, transpose_weights=False):
-  """Utility function convert variable to CuDNN compatible parameter.
-
-  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
-
-  ```
-    Keras                 CuDNN
-    [[0, 1, 2],  <--->  [[0, 2, 4],
-     [3, 4, 5]]          [1, 3, 5]]
-  ```
-
-  If the input weights need to be in a unified format, then set
-  `transpose_weights=True` to convert the weights.
-
-  Args:
-    weights: list of weights for the individual kernels and recurrent kernels.
-    biases: list of biases for individual gate.
-    shape: the shape for the converted variables that will be feed to CuDNN.
-    transpose_weights: boolean, whether to transpose the weights.
-
-  Returns:
-    The converted weights that can be feed to CuDNN ops as param.
-  """
-  def convert(w):
-    return array_ops.transpose(w) if transpose_weights else w
-
-  weights = [array_ops.reshape(convert(x), shape) for x in weights]
-  biases = [array_ops.reshape(x, shape) for x in biases]
-  return array_ops.concat(weights + biases, axis=0)
-
-
-def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
-                  activation, recurrent_activation, time_major):
-  """LSTM with standard kernel implementation.
-
-  This implementation can be run on all types for hardware.
-
-  This implementation lifts out all the layer weights and make them function
-  parameters. It has same number of tensor input params as the CuDNN
-  counterpart. The RNN step logic has been simplified, eg dropout and mask is
-  removed since CuDNN implementation does not support that.
-
-  Note that the first half of the bias tensor should be ignored by this impl.
-  The CuDNN impl need an extra set of input gate bias. In order to make the both
-  function take same shape of parameter, that extra set of bias is also feed
-  here.
-
-  Args:
-    inputs: input tensor of LSTM layer.
-    init_h: initial state tensor for the cell output.
-    init_c: initial state tensor for the cell hidden state.
-    kernel: weights for cell kernel.
-    recurrent_kernel: weights for cell recurrent kernel.
-    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
-      is used in this case.
-    activation: Activation function to use for output.
-    recurrent_activation: Activation function to use for hidden recurrent state.
-    time_major: boolean, whether the inputs are in the format of
-      [time, batch, feature] or [batch, time, feature].
-
-  Returns:
-    last_output: output tensor for the last timestep, which has shape
-      [batch, units].
-    outputs: output tensor for all timesteps, which has shape
-      [batch, time, units].
-    state_0: the cell output, which has same shape as init_h.
-    state_1: the cell hidden state, which has same shape as init_c.
-    runtime: constant string tensor which indicate real runtime hardware. This
-      value is for testing purpose and should be used by user.
-  """
-  input_shape = K.int_shape(inputs)
-  timesteps = input_shape[0] if time_major else input_shape[1]
-
-  def step(cell_inputs, cell_states):
-    """Step function that will be used by Keras RNN backend."""
-    h_tm1 = cell_states[0]  # previous memory state
-    c_tm1 = cell_states[1]  # previous carry state
-
-    z = K.dot(cell_inputs, kernel)
-    z += K.dot(h_tm1, recurrent_kernel)
-    z = K.bias_add(z, bias)
-
-    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
-
-    i = recurrent_activation(z0)
-    f = recurrent_activation(z1)
-    c = f * c_tm1 + i * activation(z2)
-    o = recurrent_activation(z3)
-
-    h = o * activation(c)
-    return h, [h, c]
-
-  last_output, outputs, new_states = K.rnn(
-      step,
-      inputs, [init_h, init_c],
-      constants=None,
-      unroll=False,
-      time_major=time_major,
-      input_length=timesteps)
-  return last_output, outputs, new_states[0], new_states[
-      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
-
-
-def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
-               time_major):
-  """LSTM with CuDNN implementation which is only available for GPU."""
-  if not time_major:
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
-  input_h = array_ops.expand_dims(input_h, axis=0)
-  input_c = array_ops.expand_dims(input_c, axis=0)
-
-  weights = array_ops.split(kernel, 4, axis=1)
-  weights += array_ops.split(recurrent_kernel, 4, axis=1)
-  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
-  # so that mathematically it is same as the canonical LSTM implementation.
-  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
-
-  params = _canonical_to_params(
-      weights=weights,
-      biases=array_ops.split(full_bias, 8),
-      shape=constant_op.constant([-1]),
-      transpose_weights=True)
-
-  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
-  last_output = outputs[-1]
-  if not time_major:
-    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
-  h = h[0]
-  c = c[0]
-
-  return last_output, outputs, h, c, constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
-
-
 def _generate_dropout_mask(ones, rate, training=None, count=1):
   def dropped_inputs():
     return K.dropout(ones, rate)
@@ -3325,19 +2684,19 @@ def _standardize_args(
   `constants` are lists of tensors (or None).
 
   Arguments:
-      inputs: Tensor or list/tuple of tensors. which may include constants
-        and initial states. In that case `num_constant` must be specified.
-      initial_state: Tensor or list of tensors or None, initial states.
-      constants: Tensor or list of tensors or None, constant tensors.
-      num_constants: Expected number of constants (if constants are passed as
-        part of the `inputs` list.
-      num_inputs: Expected number of real input tensors (exclude initial_states
-        and constants).
+    inputs: Tensor or list/tuple of tensors. which may include constants
+      and initial states. In that case `num_constant` must be specified.
+    initial_state: Tensor or list of tensors or None, initial states.
+    constants: Tensor or list of tensors or None, constant tensors.
+    num_constants: Expected number of constants (if constants are passed as
+      part of the `inputs` list.
+    num_inputs: Expected number of real input tensors (exclude initial_states
+      and constants).
 
   Returns:
-      inputs: Single tensor or tuple of tensors.
-      initial_state: List of tensors or None.
-      constants: List of tensors or None.
+    inputs: Single tensor or tuple of tensors.
+    initial_state: List of tensors or None.
+    constants: List of tensors or None.
   """
   if isinstance(inputs, list):
     # There are several situations here:
@@ -3409,12 +2768,3 @@ def _generate_zero_filled_state(batch_size_tensor, state_size, dtype):
     return nest.map_structure(create_zeros, state_size)
   else:
     return create_zeros(state_size)
-
-
-def _generate_defun_backend(unique_api_name, preferred_device, func):
-  function_attributes = {
-      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
-      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
-  }
-  return function.defun_with_attributes(func=func,
-                                        attributes=function_attributes)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index ddea2f4eae49e0a1948ca2de151eaa5f74f6a378..41bbdacec2a8f8ca4f27deae55a23df738b5f8d0 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -23,15 +23,20 @@ from __future__ import print_function
 
 import collections
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -40,7 +45,8 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import nest
 
 # Used for nested input/output/state RNN test.
@@ -217,6 +223,61 @@ class RNNTest(keras_parameterized.TestCase):
     y_np_2 = model.predict(x_np)
     self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
+  def test_minimal_rnn_cell_abstract_rnn_cell(self):
+
+    class MinimalRNNCell(keras.layers.AbstractRNNCell):
+
+      def __init__(self, units, **kwargs):
+        self.units = units
+        super(MinimalRNNCell, self).__init__(**kwargs)
+
+      @property
+      def state_size(self):
+        return self.units
+
+      def build(self, input_shape):
+        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
+                                      initializer='uniform',
+                                      name='kernel')
+        self.recurrent_kernel = self.add_weight(
+            shape=(self.units, self.units),
+            initializer='uniform',
+            name='recurrent_kernel')
+        self.built = True
+
+      def call(self, inputs, states):
+        prev_output = states[0]
+        h = keras.backend.dot(inputs, self.kernel)
+        output = h + keras.backend.dot(prev_output, self.recurrent_kernel)
+        return output, output
+
+      @property
+      def output_size(self):
+        return self.units
+
+    cell = MinimalRNNCell(32)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(
+        optimizer="rmsprop",
+        loss="mse",
+        run_eagerly=testing_utils.should_run_eagerly())
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+    # Test stacking.
+    cells = [MinimalRNNCell(8),
+             MinimalRNNCell(16),
+             MinimalRNNCell(32)]
+    layer = keras.layers.RNN(cells)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer='rmsprop',
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
   def test_rnn_with_time_major(self):
     batch = 10
     time_step = 5
@@ -669,14 +730,38 @@ class RNNTest(keras_parameterized.TestCase):
       y_np_2 = model.predict(x_np)
       self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
-  def DISABLED_test_stacked_rnn_dropout(self):
-    # Temporarily disabled test due an occasional Grappler segfault.
-    # See b/115523414
-    cells = [keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1),
-             keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
-    layer = keras.layers.RNN(cells)
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          layer=[rnn_v1.SimpleRNN, rnn_v1.GRU, rnn_v1.LSTM,
+                 rnn_v2.GRU, rnn_v2.LSTM],
+          unroll=[True, False]))
+  def test_rnn_dropout(self, layer, unroll):
+    rnn_layer = layer(3, dropout=0.1, recurrent_dropout=0.1, unroll=unroll)
+    if not unroll:
+      x = keras.Input((None, 5))
+    else:
+      x = keras.Input((5, 5))
+    y = rnn_layer(x)
+    model = keras.models.Model(x, y)
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    x_np = np.random.random((6, 5, 5))
+    y_np = np.random.random((6, 3))
+    model.train_on_batch(x_np, y_np)
 
-    x = keras.Input((None, 5))
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          cell=[keras.layers.SimpleRNNCell, keras.layers.GRUCell,
+                keras.layers.LSTMCell],
+          unroll=[True, False]))
+  def test_stacked_rnn_dropout(self, cell, unroll):
+    cells = [cell(3, dropout=0.1, recurrent_dropout=0.1),
+             cell(3, dropout=0.1, recurrent_dropout=0.1)]
+    layer = keras.layers.RNN(cells, unroll=unroll)
+
+    if not unroll:
+      x = keras.Input((None, 5))
+    else:
+      x = keras.Input((5, 5))
     y = layer(x)
     model = keras.models.Model(x, y)
     model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
@@ -684,6 +769,38 @@ class RNNTest(keras_parameterized.TestCase):
     y_np = np.random.random((6, 3))
     model.train_on_batch(x_np, y_np)
 
+  def test_dropout_mask_reuse(self):
+    # The layer is created with recurrent_initializer = zero, so that the
+    # the recurrent state won't affect the output. By doing this, we can verify
+    # the output and see if the same mask is applied to for each timestep.
+    rnn = keras.layers.SimpleRNN(3,
+                                 dropout=0.5,
+                                 kernel_initializer='ones',
+                                 recurrent_initializer='zeros',
+                                 return_sequences=True,
+                                 unroll=True)
+
+    inputs = constant_op.constant(1.0, shape=(6, 2, 5))
+    out = rnn(inputs, training=True)
+    if not context.executing_eagerly():
+      self.evaluate(variables_lib.global_variables_initializer())
+    batch_1 = self.evaluate(out)
+    batch_1_t0, batch_1_t1 = batch_1[:, 0, :], batch_1[:, 1, :]
+    self.assertAllClose(batch_1_t0, batch_1_t1)
+
+    # This simulate the layer called with multiple batches in eager mode
+    if context.executing_eagerly():
+      out2 = rnn(inputs, training=True)
+    else:
+      out2 = out
+    batch_2 = self.evaluate(out2)
+    batch_2_t0, batch_2_t1 = batch_2[:, 0, :], batch_2[:, 1, :]
+    self.assertAllClose(batch_2_t0, batch_2_t1)
+
+    # Also validate that different dropout is used by between batches.
+    self.assertNotAllClose(batch_1_t0, batch_2_t0)
+    self.assertNotAllClose(batch_1_t1, batch_2_t1)
+
   def test_stacked_rnn_compute_output_shape(self):
     cells = [keras.layers.LSTMCell(3),
              keras.layers.LSTMCell(6)]
@@ -715,7 +832,7 @@ class RNNTest(keras_parameterized.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-  def test_checkpointable_dependencies(self):
+  def test_trackable_dependencies(self):
     rnn = keras.layers.SimpleRNN
     x = np.random.random((2, 2, 2))
     y = np.random.random((2, 2))
@@ -728,8 +845,9 @@ class RNNTest(keras_parameterized.TestCase):
     model.fit(x, y, epochs=1, batch_size=1)
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = object_identity.ObjectIdentitySet(
+        trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -1162,6 +1280,64 @@ class RNNTest(keras_parameterized.TestCase):
       result_1[5, 3:] = 0
       self.assertAllClose(result_1, result_2)
 
+  def test_unroll_single_step(self):
+    """Even if the time dimension is only one, we should be able to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((1, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    np_x = np.ones((6, 1, 5))
+    result = model.predict(np_x)
+    self.assertEqual((6, 1, 5), result.shape)
+
+  def test_unroll_zero_step(self):
+    """If the time dimension is None, we should fail to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    with self.assertRaisesRegexp(ValueError, 'Cannot unroll a RNN.*'):
+      layer(x)
+
+  def test_full_input_spec(self):
+    # See https://github.com/tensorflow/tensorflow/issues/25985
+    inputs = keras.layers.Input(batch_shape=(1, 1, 1))
+    state_h = keras.layers.Input(batch_shape=(1, 1))
+    state_c = keras.layers.Input(batch_shape=(1, 1))
+    states = [state_h, state_c]
+    decoder_out = keras.layers.LSTM(1, stateful=True)(
+        inputs,
+        initial_state=states
+    )
+    model = keras.Model([inputs, state_h, state_c], decoder_out)
+    model.reset_states()
+
+  def test_reset_states(self):
+    # See https://github.com/tensorflow/tensorflow/issues/25852
+    with self.assertRaisesRegexp(ValueError, 'it needs to know its batch size'):
+      simple_rnn = keras.layers.SimpleRNN(1, stateful=True)
+      simple_rnn.reset_states()
+
+    with self.assertRaisesRegexp(ValueError, 'it needs to know its batch size'):
+      cell = Minimal2DRNNCell(1, 2)
+      custom_rnn = keras.layers.RNN(cell, stateful=True)
+      custom_rnn.reset_states()
+
+  def test_input_dim_length(self):
+    simple_rnn = keras.layers.SimpleRNN(5, input_length=10, input_dim=8)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, 10, 8))
+
+    simple_rnn = keras.layers.SimpleRNN(5, input_dim=8)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, None, 8))
+
+    simple_rnn = keras.layers.SimpleRNN(5, input_length=10)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, 10, None))
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
@@ -1269,3 +1445,4 @@ class NestedCell(keras.layers.Layer):
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7355baed3f580873a393235da6275bacaad24b3
--- /dev/null
+++ b/tensorflow/python/keras/layers/recurrent_v2.py
@@ -0,0 +1,836 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent layers for TF 2.0.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import uuid
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.util.tf_export import keras_export
+
+
+# The following string constants are used by Defun approach for unified backend
+# of LSTM and GRU.
+_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
+_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'
+_CPU_DEVICE_NAME = 'CPU'
+_GPU_DEVICE_NAME = 'GPU'
+
+
+@keras_export('keras.layers.GRU', v1=[])
+class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU):
+  """Gated Recurrent Unit - Cho et al. 2014.
+
+  Based on available runtime hardware and constraints, this layer
+  will choose different implementations (cuDNN-based or pure-TensorFlow)
+  to maximize the performance. If a GPU is available and all
+  the arguments to the layer meet the requirement of the CuDNN kernel
+  (see below for details), the layer will use a fast cuDNN implementation.
+
+  The requirements to use the cuDNN implementation are:
+
+  1. `activation` == 'tanh'
+  2. `recurrent_activation` == 'sigmoid'
+  3. `recurrent_dropout` == 0
+  4. `unroll` is False
+  5. `use_bias` is True
+  6. `reset_after` is True
+  7. No use of masking.
+
+  There are two variants of the GRU implementation. The default one is based on
+  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
+  state before matrix multiplication. The other one is based on
+  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. To use this variant, set `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: sigmoid (`sigmoid`).
+      If you pass `None`, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+       weights matrix,
+       used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation")..
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+    return_sequences: Boolean. Whether to return the last output
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled,
+      else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+    reset_after: GRU convention (whether to apply reset gate after or
+      before matrix multiplication). False = "before",
+      True = "after" (default and CuDNN compatible).
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               time_major=False,
+               reset_after=True,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self._return_runtime = kwargs.pop('return_runtime', False)
+
+    super(GRU, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        time_major=time_major,
+        reset_after=reset_after,
+        **kwargs)
+    # CuDNN uses following setting by default and not configurable.
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias and
+        reset_after)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # GRU does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal GRU.
+      kwargs = {'training': training}
+
+      def step(cell_inputs, cell_states):
+        return self.cell.call(cell_inputs, cell_states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      # This is a dummy tensor for testing purpose.
+      runtime = _runtime('unknown')
+    else:
+      last_output, outputs, runtime, states = self._defun_gru_call(
+          inputs, initial_state, training)
+
+    if self.stateful:
+      updates = [state_ops.assign(self.states[0], states[0])]
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + list(states)
+    elif self._return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  def _defun_gru_call(self, inputs, initial_state, training):
+    # Use the new defun approach for backend implementation swap.
+    # Note that different implementations need to have same function
+    # signature, eg, the tensor parameters need to have same shape and dtypes.
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+    self.reset_dropout_mask()
+    dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
+    if dropout_mask is not None:
+      inputs *= dropout_mask[0]
+    if context.executing_eagerly():
+      device_type = _get_context_device_type()
+      if device_type == _GPU_DEVICE_NAME or (
+          device_type is None and context.num_gpus() > 0):
+        # Under eager context, check the device placement and prefer the
+        # GPU implementation when GPU is available.
+        last_output, outputs, new_h, runtime = cudnn_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            time_major=self.time_major)
+      else:
+        last_output, outputs, new_h, runtime = standard_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            activation=self.activation,
+            recurrent_activation=self.recurrent_activation,
+            time_major=self.time_major)
+    else:
+      api_name = 'gru_' + str(uuid.uuid4())
+      defun_standard_gru = _generate_defun_backend(
+          api_name, _CPU_DEVICE_NAME, standard_gru)
+      defun_cudnn_gru = _generate_defun_backend(
+          api_name, _GPU_DEVICE_NAME, cudnn_gru)
+      # Call the normal GRU impl and register the CuDNN impl function. The
+      # grappler will kick in during session execution to optimize the graph.
+      last_output, outputs, new_h, runtime = defun_standard_gru(
+          inputs=inputs,
+          init_h=initial_state[0],
+          kernel=self.cell.kernel,
+          recurrent_kernel=self.cell.recurrent_kernel,
+          bias=self.cell.bias,
+          activation=self.activation,
+          recurrent_activation=self.recurrent_activation,
+          time_major=self.time_major)
+
+      function.register(defun_cudnn_gru, inputs, initial_state[0],
+                        self.cell.kernel, self.cell.recurrent_kernel,
+                        self.cell.bias, self.time_major)
+    states = [new_h]
+    return last_output, outputs, runtime, states
+
+
+def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
+                 recurrent_activation, time_major):
+  """GRU with standard kernel implementation.
+
+  This implementation can be run on all types of hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Arguments:
+    inputs: input tensor of GRU layer.
+    init_h: initial state tensor for the cell output.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. The bias contains the
+      combined input_bias and recurrent_bias.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  input_bias, recurrent_bias = array_ops.unstack(bias)
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]
+
+    # inputs projected by all gate matrices at once
+    matrix_x = K.dot(cell_inputs, kernel)
+    matrix_x = K.bias_add(matrix_x, input_bias)
+
+    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
+
+    # hidden state projected by all gate matrices at once
+    matrix_inner = K.dot(h_tm1, recurrent_kernel)
+    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
+
+    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
+                                                            axis=1)
+    z = recurrent_activation(x_z + recurrent_z)
+    r = recurrent_activation(x_r + recurrent_r)
+    hh = activation(x_h + r * recurrent_h)
+
+    # previous and candidate state mixed by update gate
+    h = z * h_tm1 + (1 - z) * hh
+    return h, [h]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], _runtime('cpu')
+
+
+def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
+  """GRU with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  init_h = array_ops.expand_dims(init_h, axis=0)
+
+  weights = array_ops.split(kernel, 3, axis=1)
+  weights += array_ops.split(recurrent_kernel, 3, axis=1)
+  # Note that the bias was initialized as shape (2, 3 * units), flat it into
+  # (6 * units)
+  bias = array_ops.split(K.flatten(bias), 6)
+  # Note that the gate order for CuDNN is different from the canonical format.
+  # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need to
+  # be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
+  # z is update gate weights.
+  # r is reset gate weights.
+  # h is output gate weights.
+  weights[0], weights[1] = weights[1], weights[0]
+  weights[3], weights[4] = weights[4], weights[3]
+  bias[0], bias[1] = bias[1], bias[0]
+  bias[3], bias[4] = bias[4], bias[3]
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=bias,
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs,
+      input_h=init_h,
+      input_c=0,
+      params=params,
+      is_training=True,
+      rnn_mode='gru')
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  return last_output, outputs, h, _runtime('cudnn')
+
+
+@keras_export('keras.layers.LSTM', v1=[])
+class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM):
+  """Long Short-Term Memory layer - Hochreiter 1997.
+
+  Based on available runtime hardware and constraints, this layer
+  will choose different implementations (cuDNN-based or pure-TensorFlow)
+  to maximize the performance. If a GPU is available and all
+  the arguments to the layer meet the requirement of the CuDNN kernel
+  (see below for details), the layer will use a fast cuDNN implementation.
+
+  The requirements to use the cuDNN implementation are:
+
+  1. `activation` == 'tanh'
+  2. `recurrent_activation` == 'sigmoid'
+  3. `recurrent_dropout` == 0
+  4. `unroll` is False
+  5. `use_bias` is True
+  7. No use of masking.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+      is applied (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use for the recurrent step.
+      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
+      applied (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix, used for
+      the linear transformation of the inputs..
+    recurrent_initializer: Initializer for the `recurrent_kernel` weights
+      matrix, used for the linear transformation of the recurrent state..
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
+      initialization. Setting it to true will also force
+      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf).
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation")..
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
+      weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
+      transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
+      its operations as a larger number of smaller dot products and additions,
+      whereas mode 2 will batch them into fewer, larger operations. These modes
+      will have different performance profiles on different hardware and for
+      different applications.
+    return_sequences: Boolean. Whether to return the last output. in the output
+      sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state in addition to the
+      output.
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards and return the reversed sequence.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    unroll: Boolean (default False). If True, the network will be unrolled, else
+      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
+      tends to be more memory-intensive. Unrolling is only suitable for short
+      sequences.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               unroll=False,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self.return_runtime = kwargs.pop('return_runtime', False)
+
+    super(LSTM, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        unit_forget_bias=unit_forget_bias,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        time_major=time_major,
+        unroll=unroll,
+        **kwargs)
+
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # LSTM does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal LSTM.
+      kwargs = {'training': training}
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      runtime = _runtime('unknown')
+    else:
+      # Use the new defun approach for backend implementation swap.
+      # Note that different implementations need to have same function
+      # signature, eg, the tensor parameters need to have same shape and dtypes.
+      # Since the CuDNN has an extra set of bias, those bias will be passed to
+      # both normal and CuDNN implementations.
+      if self.go_backwards:
+        # Reverse time axis.
+        inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+      self.reset_dropout_mask()
+      dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
+      if dropout_mask is not None:
+        inputs *= dropout_mask[0]
+
+      if context.executing_eagerly():
+        device_type = _get_context_device_type()
+        if device_type == _GPU_DEVICE_NAME or (
+            device_type is None and context.num_gpus() > 0):
+          # Under eager context, check the device placement and prefer the
+          # GPU implementation when GPU is available.
+          last_output, outputs, new_h, new_c, runtime = cudnn_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
+        else:
+          last_output, outputs, new_h, new_c, runtime = standard_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.activation,
+              self.recurrent_activation, self.time_major)
+      else:
+        # Each time a `tf.function` is called, we will give it a unique
+        # identifiable API name, so that Grappler won't get confused when it
+        # sees multiple LSTM layers added into same graph, and it will be able
+        # to pair up the different implementations across them.
+        api_name = 'lstm_' + str(uuid.uuid4())
+        defun_standard_lstm = _generate_defun_backend(
+            api_name, _CPU_DEVICE_NAME, standard_lstm)
+        defun_cudnn_lstm = _generate_defun_backend(
+            api_name, _GPU_DEVICE_NAME, cudnn_lstm)
+
+        # Call the normal LSTM impl and register the CuDNN impl function. The
+        # grappler will kick in during session execution to optimize the graph.
+        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+            inputs, initial_state[0], initial_state[1], self.cell.kernel,
+            self.cell.recurrent_kernel, self.cell.bias, self.activation,
+            self.recurrent_activation, self.time_major)
+
+        function.register(defun_cudnn_lstm, inputs, initial_state[0],
+                          initial_state[1], self.cell.kernel,
+                          self.cell.recurrent_kernel, self.cell.bias,
+                          self.time_major)
+      states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + list(states)
+    elif self.return_runtime:
+      return output, runtime
+    else:
+      return output
+
+
+def _canonical_to_params(weights, biases, shape, transpose_weights=False):
+  """Utility function convert variable to CuDNN compatible parameter.
+
+  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
+
+  ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+  ```
+
+  If the input weights need to be in a unified format, then set
+  `transpose_weights=True` to convert the weights.
+
+  Args:
+    weights: list of weights for the individual kernels and recurrent kernels.
+    biases: list of biases for individual gate.
+    shape: the shape for the converted variables that will be feed to CuDNN.
+    transpose_weights: boolean, whether to transpose the weights.
+
+  Returns:
+    The converted weights that can be feed to CuDNN ops as param.
+  """
+  def convert(w):
+    return array_ops.transpose(w) if transpose_weights else w
+
+  weights = [array_ops.reshape(convert(x), shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
+                  activation, recurrent_activation, time_major):
+  """LSTM with standard kernel implementation.
+
+  This implementation can be run on all types for hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Note that the first half of the bias tensor should be ignored by this impl.
+  The CuDNN impl need an extra set of input gate bias. In order to make the both
+  function take same shape of parameter, that extra set of bias is also feed
+  here.
+
+  Args:
+    inputs: input tensor of LSTM layer.
+    init_h: initial state tensor for the cell output.
+    init_c: initial state tensor for the cell hidden state.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
+      is used in this case.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    state_1: the cell hidden state, which has same shape as init_c.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, bias)
+
+    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
+
+
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
+               time_major):
+  """LSTM with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  weights = array_ops.split(kernel, 4, axis=1)
+  weights += array_ops.split(recurrent_kernel, 4, axis=1)
+  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
+  # so that mathematically it is same as the canonical LSTM implementation.
+  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=array_ops.split(full_bias, 8),
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+
+  return last_output, outputs, h, c, _runtime('cudnn')
+
+
+def _generate_defun_backend(unique_api_name, preferred_device, func):
+  function_attributes = {
+      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
+      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
+  }
+  return function.defun_with_attributes(func=func,
+                                        attributes=function_attributes)
+
+
+def _get_context_device_type():
+  """Parse the current context and return the device type, eg CPU/GPU."""
+  current_device = context.context().device_name
+  if current_device is None:
+    return None
+  return device.DeviceSpec.from_string(current_device).device_type
+
+
+def _runtime(runtime_name):
+  with ops.device('/cpu:0'):
+    return constant_op.constant(
+        runtime_name, dtype=dtypes.string, name='runtime')
diff --git a/tensorflow/python/keras/layers/recurrent_v2_test.py b/tensorflow/python/keras/layers/recurrent_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8610a5dd4e21fa2aa84f507ad57d4672a82725ac
--- /dev/null
+++ b/tensorflow/python/keras/layers/recurrent_v2_test.py
@@ -0,0 +1,67 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for recurrent v2 layers functionality other than GRU, LSTM.
+
+See also: lstm_v2_test.py, gru_v2_test.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class RNNV2Test(keras_parameterized.TestCase):
+
+  @parameterized.parameters([rnn_v2.LSTM, rnn_v2.GRU])
+  def test_device_placement(self, layer):
+    if not test.is_gpu_available():
+      self.skipTest('Need GPU for testing.')
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    # Test when GPU is available but not used, the graph should be properly
+    # created with CPU ops.
+    with test_util.device(use_gpu=False):
+      model = keras.Sequential([
+          keras.layers.Embedding(vocab_size, embedding_dim,
+                                 batch_input_shape=[batch_size, timestep]),
+          layer(units, return_sequences=True, stateful=True),
+          keras.layers.Dense(vocab_size)
+      ])
+      model.compile(optimizer='adam',
+                    loss='sparse_categorical_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, epochs=1, shuffle=False)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index e4c508b0f3bd114ef470353a75eb39bf74237253..35202617716810bc17ce2032878db8e9ade1abe3 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -16,10 +16,12 @@
 """
 # pylint: disable=wildcard-import
 # pylint: disable=unused-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import tf2
 from tensorflow.python.keras.engine.base_layer import TensorFlowOpLayer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
@@ -37,12 +39,19 @@ from tensorflow.python.keras.layers.pooling import *
 from tensorflow.python.keras.layers.recurrent import *
 from tensorflow.python.keras.layers.wrappers import *
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.util.tf_export import keras_export
+
+if tf2.enabled():
+  from tensorflow.python.keras.layers.normalization_v2 import *  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.layers.recurrent_v2 import *     # pylint: disable=g-import-not-at-top
 
 
+@keras_export('keras.layers.serialize')
 def serialize(layer):
   return {'class_name': layer.__class__.__name__, 'config': layer.get_config()}
 
 
+@keras_export('keras.layers.deserialize')
 def deserialize(config, custom_objects=None):
   """Instantiates a layer from a config dictionary.
 
@@ -59,6 +68,7 @@ def deserialize(config, custom_objects=None):
   globs['Network'] = models.Network
   globs['Model'] = models.Model
   globs['Sequential'] = models.Sequential
+
   return deserialize_keras_object(
       config,
       module_objects=globs,
diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
index 548c3ec1ac760a33d6eb998e7d601c843bd87779..5e9fa3cef8d19f428e49f15c39d3c7dd54318419 100644
--- a/tensorflow/python/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -18,13 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras.layers import normalization as batchnorm_v1
+from tensorflow.python.keras.layers import normalization_v2 as batchnorm_v2
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2
 from tensorflow.python.platform import test
 
 
 @tf_test_util.run_all_in_graph_and_eager_modes
-class LayerSerializationTest(test.TestCase):
+class LayerSerializationTest(parameterized.TestCase, test.TestCase):
 
   def test_serialize_deserialize(self):
     layer = keras.layers.Dense(
@@ -34,10 +41,61 @@ class LayerSerializationTest(test.TestCase):
     self.assertEqual(new_layer.activation, keras.activations.relu)
     self.assertEqual(new_layer.bias_regularizer.__class__,
                      keras.regularizers.L1L2)
-    self.assertEqual(new_layer.kernel_initializer.__class__,
-                     keras.initializers.Ones)
+    if tf2.enabled():
+      self.assertEqual(new_layer.kernel_initializer.__class__,
+                       keras.initializers.OnesV2)
+    else:
+      self.assertEqual(new_layer.kernel_initializer.__class__,
+                       keras.initializers.Ones)
     self.assertEqual(new_layer.units, 3)
 
+  @parameterized.parameters(
+      [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization])
+  def test_serialize_deserialize_batchnorm(self, batchnorm_layer):
+    layer = batchnorm_layer(
+        momentum=0.9, beta_initializer='zeros', gamma_regularizer='l2')
+    config = keras.layers.serialize(layer)
+    self.assertEqual(config['class_name'], 'BatchNormalization')
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.momentum, 0.9)
+    if tf2.enabled():
+      self.assertIsInstance(new_layer, batchnorm_v2.BatchNormalization)
+      self.assertEqual(new_layer.beta_initializer.__class__,
+                       keras.initializers.ZerosV2)
+    else:
+      self.assertIsInstance(new_layer, batchnorm_v1.BatchNormalization)
+      self.assertEqual(new_layer.beta_initializer.__class__,
+                       keras.initializers.Zeros)
+    self.assertEqual(new_layer.gamma_regularizer.__class__,
+                     keras.regularizers.L1L2)
+
+  @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM])
+  def test_serialize_deserialize_lstm(self, layer):
+    lstm = layer(5, return_sequences=True)
+    config = keras.layers.serialize(lstm)
+    self.assertEqual(config['class_name'], 'LSTM')
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.units, 5)
+    self.assertEqual(new_layer.return_sequences, True)
+    if tf2.enabled():
+      self.assertIsInstance(new_layer, rnn_v2.LSTM)
+    else:
+      self.assertIsInstance(new_layer, rnn_v1.LSTM)
+      self.assertNotIsInstance(new_layer, rnn_v2.LSTM)
+
+  @parameterized.parameters([rnn_v1.GRU, rnn_v2.GRU])
+  def test_serialize_deserialize_gru(self, layer):
+    gru = layer(5, return_sequences=True)
+    config = keras.layers.serialize(gru)
+    self.assertEqual(config['class_name'], 'GRU')
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.units, 5)
+    self.assertEqual(new_layer.return_sequences, True)
+    if tf2.enabled():
+      self.assertIsInstance(new_layer, rnn_v2.GRU)
+    else:
+      self.assertIsInstance(new_layer, rnn_v1.GRU)
+      self.assertNotIsInstance(new_layer, rnn_v2.GRU)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
index 3268bf8b6466c92f9c0fcda7a6beb24c8584ae96..d0c5f25c3d4fa87ce8784065834a949174378970 100644
--- a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
+++ b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
+
 from absl.testing import parameterized
 import numpy as np
 
@@ -25,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -34,23 +37,30 @@ from tensorflow.python.util import nest
 def _single_op_at_end():
   inputs = keras.Input(shape=(10,))
   x = keras.layers.Dense(10)(inputs)
-  outputs = gen_nn_ops.relu(x, name='hey')
+  outputs = gen_nn_ops.relu(x)
   return inputs, outputs
 
 
-def _multiple_ops_at_end():
+def _single_identity_op_at_end():
   inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  outputs = array_ops.identity(x)
+  assert 'Identity' in outputs.name
+  return inputs, outputs
+
+
+def _multiple_ops_at_end():
   inputs = keras.Input(shape=(10,))
   x = keras.layers.Dense(10)(inputs)
-  x = gen_nn_ops.relu(x, name='hey')
-  outputs = gen_nn_ops.relu(x, name='hey2')
+  x = gen_nn_ops.relu(x)
+  outputs = gen_nn_ops.relu(x)
   return inputs, outputs
 
 
 def _single_op_in_middle():
   inputs = keras.Input(shape=(10,))
   x = keras.layers.Dense(10)(inputs)
-  x = gen_nn_ops.relu(x, name='hey')
+  x = gen_nn_ops.relu(x)
   outputs = keras.layers.Dense(10)(x)
   return inputs, outputs
 
@@ -58,8 +68,8 @@ def _single_op_in_middle():
 def _multiple_ops_in_middle():
   inputs = keras.Input(shape=(10,))
   x = keras.layers.Dense(10)(inputs)
-  x = gen_nn_ops.relu(x, name='hey')
-  x = gen_nn_ops.relu(x, name='hey2')
+  x = gen_nn_ops.relu(x)
+  x = gen_nn_ops.relu(x)
   outputs = keras.layers.Dense(10)(x)
   return inputs, outputs
 
@@ -78,16 +88,71 @@ def _single_op_with_attrs():
   return inputs, outputs
 
 
+def _multiple_uses():
+  inputs = keras.Input(shape=(10,))
+  x = math_ops.reduce_mean(inputs, axis=1, keepdims=True)
+  x1 = keras.layers.Dense(10)(x)
+  x2 = keras.layers.Dense(10)(x)
+  outputs = x1 + x2
+  return inputs, outputs
+
+
+def _op_with_tensor_list():
+  inputs = keras.Input(shape=(10,))
+  x = array_ops.concat([inputs, inputs], axis=1)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _add_n():
+  inputs = keras.Input(shape=(10,))
+  outputs = math_ops.add_n([inputs, inputs, inputs])
+  return inputs, outputs
+
+
+def _reuse_op():
+  inputs = keras.Input(shape=(10,))
+  # This op needs to be checked multiple times.
+  x = gen_nn_ops.relu(inputs)
+  y = keras.layers.Dense(10)(x)
+  x2 = x * 2
+  y2 = keras.layers.Dense(10)(x2)
+  outputs = y + y2
+  return inputs, outputs
+
+
+class LayerWithLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.bias = self.add_weight(name='bias', dtype='float32')
+    self.layer = keras.layers.Dense(10)
+
+  def call(self, inputs):
+    inputs = inputs * self.bias
+    # Would throw an error if Keras History was created here.
+    return self.layer(inputs)
+
+
+def _inner_layer():
+  inputs = keras.Input(shape=(10,))
+  outputs = LayerWithLayer()(inputs)
+  return inputs, outputs
+
+
 @keras_parameterized.run_all_keras_modes
 class AutoLambdaTest(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('single_op_at_end', _single_op_at_end),
+      ('single_identity_op_at_end', _single_identity_op_at_end),
       ('multiple_ops_at_end', _multiple_ops_at_end),
       ('single_op_in_middle', _single_op_in_middle),
       ('multiple_ops_in_middle', _multiple_ops_in_middle),
       ('single_standalone_branch', _single_standalone_branch),
-      ('single_op_with_attrs', _single_op_with_attrs))
+      ('single_op_with_attrs', _single_op_with_attrs),
+      ('multiple_uses', _multiple_uses),
+      ('op_with_tensor_list', _op_with_tensor_list), ('add_n', _add_n),
+      ('_reuse_op', _reuse_op), ('_inner_layer', _inner_layer))
   def test_autolambda(self, model_fn):
     inputs, outputs = model_fn()
     model = keras.Model(inputs, outputs)
@@ -99,6 +164,14 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     np_outputs = nest.map_structure(lambda x: np.ones((10, 10), 'float32'),
                                     outputs)
     model.fit(np_inputs, np_outputs, batch_size=2)
+    model(np_inputs)  # Test calling the model directly on inputs.
+
+    new_model = keras.Model.from_config(
+        model.get_config(), custom_objects={'LayerWithLayer': LayerWithLayer})
+    new_model.compile(
+        adam.Adam(0.001), 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    new_model.fit(np_inputs, np_outputs, batch_size=2)
+    new_model(np_inputs)  # Test calling the new model directly on inputs.
 
   def test_numerical_correctness_simple(self):
     x = ops.convert_to_tensor([[-1., 0., -2., 1.]])
@@ -116,7 +189,7 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     y = self.evaluate(model(x))
     self.assertAllClose(y, [1.5, 3.])
 
-  def test_serialization(self):
+  def test_numerical_correctness_serialization(self):
     x = ops.convert_to_tensor([-1., 0., -2., 1.])
     inputs = keras.Input(shape=(4,))
     outputs = gen_nn_ops.relu(inputs)
@@ -126,6 +199,46 @@ class AutoLambdaTest(keras_parameterized.TestCase):
     y2 = self.evaluate(model2(x))
     self.assertAllClose(y1, y2)
 
+  def test_no_tracking(self):
+    x = keras.backend.placeholder((10, 10))
+    keras.layers.Dense(1)(x)
+    self.assertTrue(x._keras_history_checked)
+
+  def test_timing_scales_linearly(self):
+
+    def _construct_graph_of_size(size):
+      start = time.time()
+      x = keras.backend.placeholder(shape=(10, 4))
+
+      for _ in range(size):
+        x = keras.layers.Dense(4)(x)
+        x = gen_nn_ops.relu(x)
+
+      end = time.time()
+      return end - start
+
+    size_50 = _construct_graph_of_size(50)
+    size_500 = _construct_graph_of_size(500)
+
+    # Check construction time grows approx. linearly with size.
+    e = 3  # Fudge factor to prevent flakiness.
+    self.assertLess(size_500, (10 * e) * size_50)
+
+  def test_no_mask_tracking(self):
+    x = keras.backend.placeholder((10, 10))
+    y = keras.layers.Masking(0.)(x)
+    self.assertTrue(y._keras_mask._keras_history_checked)
+
+  def test_built(self):
+    inputs = keras.Input(shape=(10,))
+    outputs = gen_nn_ops.relu(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+    for layer in model.layers:
+      self.assertTrue(layer.built)
+    # Test something that requires Layers to be built.
+    model.summary()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/time_distributed_learning_phase_test.py b/tensorflow/python/keras/layers/time_distributed_learning_phase_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c38f25d331ddaeeb6718be1894c17164dcfb23e
--- /dev/null
+++ b/tensorflow/python/keras/layers/time_distributed_learning_phase_test.py
@@ -0,0 +1,43 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for layer wrappers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.platform import test
+
+
+# TODO(b/125513261): Move this back into wrappers_test.py.
+class TimeDistributedLearningPhaseTest(test.TestCase):
+
+  def test_TimeDistributed_learning_phase(self):
+    with self.cached_session():
+      # test layers that need learning_phase to be set
+      np.random.seed(1234)
+      x = keras.layers.Input(shape=(3, 2))
+      y = keras.layers.TimeDistributed(keras.layers.Dropout(.999))(
+          x, training=True)
+      model = keras.models.Model(x, y)
+      y = model.predict(np.random.random((10, 3, 2)))
+      self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index c9424c9f637706be18b95a7b5529dd121ea7377e..7fd375fbbe7c2763f52c415860f6b60d33df85fe 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -27,9 +27,9 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -43,10 +43,9 @@ class Wrapper(Layer):
   Two usable wrappers are the `TimeDistributed` and `Bidirectional` wrappers.
 
   Arguments:
-      layer: The layer to be wrapped.
+    layer: The layer to be wrapped.
   """
 
-  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, layer, **kwargs):
     assert isinstance(layer, Layer)
     self.layer = layer
@@ -66,36 +65,6 @@ class Wrapper(Layer):
     else:
       return None
 
-  @property
-  def trainable(self):
-    return self.layer.trainable
-
-  @trainable.setter
-  def trainable(self, value):
-    self.layer.trainable = value
-
-  @property
-  def trainable_weights(self):
-    return self.layer.trainable_weights
-
-  @property
-  def non_trainable_weights(self):
-    return self.layer.non_trainable_weights
-
-  @property
-  def updates(self):
-    return self.layer.updates + self._updates
-
-  @property
-  def losses(self):
-    return self.layer.losses + self._losses
-
-  def get_weights(self):
-    return self.layer.get_weights()
-
-  def set_weights(self, weights):
-    self.layer.set_weights(weights)
-
   def get_config(self):
     config = {
         'layer': {
@@ -130,10 +99,10 @@ class TimeDistributed(Wrapper):
   to each of the 10 timesteps, independently:
 
   ```python
-      # as the first layer in a model
-      model = Sequential()
-      model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
-      # now model.output_shape == (None, 10, 8)
+  # as the first layer in a model
+  model = Sequential()
+  model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
+  # now model.output_shape == (None, 10, 8)
   ```
 
   The output will then have shape `(32, 10, 8)`.
@@ -141,8 +110,8 @@ class TimeDistributed(Wrapper):
   In subsequent layers, there is no need for the `input_shape`:
 
   ```python
-      model.add(TimeDistributed(Dense(32)))
-      # now model.output_shape == (None, 10, 32)
+  model.add(TimeDistributed(Dense(32)))
+  # now model.output_shape == (None, 10, 32)
   ```
 
   The output will then have shape `(32, 10, 32)`.
@@ -151,16 +120,25 @@ class TimeDistributed(Wrapper):
   for instance with a `Conv2D` layer:
 
   ```python
-      model = Sequential()
-      model.add(TimeDistributed(Conv2D(64, (3, 3)),
-                                input_shape=(10, 299, 299, 3)))
+  model = Sequential()
+  model.add(TimeDistributed(Conv2D(64, (3, 3)),
+                            input_shape=(10, 299, 299, 3)))
   ```
 
   Arguments:
-      layer: a layer instance.
+    layer: a layer instance.
+
+  Call arguments:
+    inputs: Input tensor.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the
+      wrapped layer (only if the layer supports this argument).
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked. This argument is passed to the
+      wrapped layer (only if the layer supports this argument).
 
   Raises:
-      ValueError: If not initialized with a `Layer` instance.
+    ValueError: If not initialized with a `Layer` instance.
   """
 
   def __init__(self, layer, **kwargs):
@@ -170,7 +148,12 @@ class TimeDistributed(Wrapper):
           '`Layer` instance. You passed: {input}'.format(input=layer))
     super(TimeDistributed, self).__init__(layer, **kwargs)
     self.supports_masking = True
-    self._track_checkpointable(layer, name='layer')
+
+    # It is safe to use the fast, reshape-based approach with all of our
+    # built-in Layers.
+    self._always_use_reshape = (
+        layer_utils.is_builtin_layer(layer) and
+        not getattr(layer, 'stateful', False))
 
   def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None):
     """Finds non-specific dimensions in the static shapes.
@@ -179,18 +162,19 @@ class TimeDistributed(Wrapper):
     tensor.
 
     Arguments:
-        init_tuple: a tuple, the first part of the output shape
-        tensor: the tensor from which to get the (static and dynamic) shapes
-            as the last part of the output shape
-        start_idx: int, which indicate the first dimension to take from
-            the static shape of the tensor
-        int_shape: an alternative static shape to take as the last part
-            of the output shape
+      init_tuple: a tuple, the first part of the output shape
+      tensor: the tensor from which to get the (static and dynamic) shapes
+        as the last part of the output shape
+      start_idx: int, which indicate the first dimension to take from
+        the static shape of the tensor
+      int_shape: an alternative static shape to take as the last part
+        of the output shape
+
     Returns:
-        The new int_shape with the first part from init_tuple
-        and the last part from either `int_shape` (if provided)
-        or `tensor.shape`, where every `None` is replaced by
-        the corresponding dimension from `tf.shape(tensor)`.
+      The new int_shape with the first part from init_tuple
+      and the last part from either `int_shape` (if provided)
+      or `tensor.shape`, where every `None` is replaced by
+      the corresponding dimension from `tf.shape(tensor)`.
     """
     # replace all None in int_shape by K.shape
     if int_shape is None:
@@ -206,8 +190,12 @@ class TimeDistributed(Wrapper):
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    assert len(input_shape) >= 3
-    self.input_spec = InputSpec(shape=input_shape)
+    if len(input_shape) < 3:
+      raise ValueError(
+          '`TimeDistributed` Layer should be passed an `input_shape ` '
+          'with at least 3 dimensions, received: ' + str(input_shape))
+    # Don't enforce the batch or time dimension.
+    self.input_spec = InputSpec(shape=[None, None] + input_shape[2:])
     child_input_shape = [input_shape[0]] + input_shape[2:]
     if not self.layer.built:
       # The base layer class calls a conversion function on the input shape to
@@ -234,7 +222,7 @@ class TimeDistributed(Wrapper):
       kwargs['training'] = training
 
     input_shape = K.int_shape(inputs)
-    if input_shape[0]:
+    if input_shape[0] and not self._always_use_reshape:
       # batch size matters, use rnn-based implementation
       def step(x, _):
         output = self.layer.call(x, **kwargs)
@@ -298,10 +286,10 @@ class TimeDistributed(Wrapper):
 
     Arguments:
       inputs: Tensor with shape [batch size, timesteps, ...] indicating the
-          input to TimeDistributed. If static shape information is available for
-          "batch size", `mask` is returned unmodified.
+        input to TimeDistributed. If static shape information is available for
+        "batch size", `mask` is returned unmodified.
       mask: Either None (indicating no masking) or a Tensor indicating the
-          input mask for TimeDistributed. The shape can be static or dynamic.
+        input mask for TimeDistributed. The shape can be static or dynamic.
 
     Returns:
       Either None (no masking), or a [batch size, timesteps, ...] Tensor with
@@ -358,31 +346,34 @@ class Bidirectional(Wrapper):
   """Bidirectional wrapper for RNNs.
 
   Arguments:
-      layer: `Recurrent` instance.
-      merge_mode: Mode by which outputs of the
-          forward and backward RNNs will be combined.
-          One of {'sum', 'mul', 'concat', 'ave', None}.
-          If None, the outputs will not be combined,
-          they will be returned as a list.
+    layer: `Recurrent` instance.
+    merge_mode: Mode by which outputs of the
+      forward and backward RNNs will be combined.
+      One of {'sum', 'mul', 'concat', 'ave', None}.
+      If None, the outputs will not be combined,
+      they will be returned as a list.
+
+  Call arguments:
+    The call arguments for this layer are the same as those of the wrapped RNN
+      layer.
 
   Raises:
-      ValueError: If not initialized with a `Layer` instance or
-          In case of invalid `merge_mode` argument.
+    ValueError: If not initialized with a `Layer` instance or
+      In case of invalid `merge_mode` argument.
 
   Examples:
 
   ```python
-      model = Sequential()
-      model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
-      10)))
-      model.add(Bidirectional(LSTM(10)))
-      model.add(Dense(5))
-      model.add(Activation('softmax'))
-      model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
+  model = Sequential()
+  model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
+  10)))
+  model.add(Bidirectional(LSTM(10)))
+  model.add(Dense(5))
+  model.add(Activation('softmax'))
+  model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
   def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
     if not isinstance(layer, Layer):
       raise ValueError(
@@ -393,8 +384,8 @@ class Bidirectional(Wrapper):
                        'Merge mode should be one of '
                        '{"sum", "mul", "ave", "concat", None}')
     if getattr(layer, 'zero_output_for_mask', None) is not None:
-      # Force the zero_output_for_mask to be True if it presents.
-      layer.zero_output_for_mask = True
+      # Force the zero_output_for_mask to be True if returning sequences.
+      layer.zero_output_for_mask = layer.return_sequences
 
     self.forward_layer = copy.copy(layer)
     config = layer.get_config()
@@ -413,28 +404,12 @@ class Bidirectional(Wrapper):
     self.supports_masking = True
     self._trainable = True
     self._num_constants = None
+    # We don't want to track `layer` since we're already tracking the two copies
+    # of it we actually run.
+    self._setattr_tracking = False
     super(Bidirectional, self).__init__(layer, **kwargs)
+    self._setattr_tracking = True
     self.input_spec = layer.input_spec
-    self._track_checkpointable(self.forward_layer, name='forward_layer')
-    self._track_checkpointable(self.backward_layer, name='backward_layer')
-
-  @property
-  def trainable(self):
-    return self._trainable
-
-  @trainable.setter
-  def trainable(self, value):
-    self._trainable = value
-    self.forward_layer.trainable = value
-    self.backward_layer.trainable = value
-
-  def get_weights(self):
-    return self.forward_layer.get_weights() + self.backward_layer.get_weights()
-
-  def set_weights(self, weights):
-    nw = len(weights)
-    self.forward_layer.set_weights(weights[:nw // 2])
-    self.backward_layer.set_weights(weights[nw // 2:])
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
@@ -628,32 +603,6 @@ class Bidirectional(Wrapper):
       return [output_mask] + state_mask * 2
     return output_mask
 
-  @property
-  def trainable_weights(self):
-    if hasattr(self.forward_layer, 'trainable_weights'):
-      return (self.forward_layer.trainable_weights +
-              self.backward_layer.trainable_weights)
-    return []
-
-  @property
-  def non_trainable_weights(self):
-    if hasattr(self.forward_layer, 'non_trainable_weights'):
-      return (self.forward_layer.non_trainable_weights +
-              self.backward_layer.non_trainable_weights)
-    return []
-
-  @property
-  def updates(self):
-    if hasattr(self.forward_layer, 'updates'):
-      return self.forward_layer.updates + self.backward_layer.updates
-    return []
-
-  @property
-  def losses(self):
-    if hasattr(self.forward_layer, 'losses'):
-      return self.forward_layer.losses + self.backward_layer.losses
-    return []
-
   @property
   def constraints(self):
     constraints = {}
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index f3aa5c4d684592d1b276cac47d31f8d1ba06d600..bb54adf2c7629bd6c41368e1eeedb8efb63405f5 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -23,10 +23,12 @@ import copy
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 
 
 class _RNNCellWithConstants(keras.layers.Layer):
@@ -87,8 +89,9 @@ class TimeDistributedTest(test.TestCase):
     model.get_config()
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = object_identity.ObjectIdentitySet(
+        trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -153,17 +156,6 @@ class TimeDistributedTest(test.TestCase):
       model.compile(optimizer='rmsprop', loss='mse')
       self.assertEqual(len(model.losses), 1)
 
-  def test_TimeDistributed_learning_phase(self):
-    with self.cached_session():
-      # test layers that need learning_phase to be set
-      np.random.seed(1234)
-      x = keras.layers.Input(shape=(3, 2))
-      y = keras.layers.TimeDistributed(keras.layers.Dropout(.999))(
-          x, training=True)
-      model = keras.models.Model(x, y)
-      y = model.predict(np.random.random((10, 3, 2)))
-      self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
-
   def test_TimeDistributed_batchnorm(self):
     with self.cached_session():
       # test that wrapped BN updates still work.
@@ -256,6 +248,49 @@ class TimeDistributedTest(test.TestCase):
       self.assertEqual((mask_outputs_val[1]).all(),
                        model_input.all())
 
+  def test_TimeDistributed_with_different_time_shapes(self):
+    time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5))
+    ph_1 = keras.backend.placeholder(shape=(None, 10, 13))
+    out_1 = time_dist(ph_1)
+    self.assertEqual(out_1.shape.as_list(), [None, 10, 5])
+
+    ph_2 = keras.backend.placeholder(shape=(None, 1, 13))
+    out_2 = time_dist(ph_2)
+    self.assertEqual(out_2.shape.as_list(), [None, 1, 5])
+
+    ph_3 = keras.backend.placeholder(shape=(None, 1, 18))
+    with self.assertRaisesRegexp(ValueError, 'is incompatible with layer'):
+      time_dist(ph_3)
+
+  def test_TimeDistributed_with_invalid_dimensions(self):
+    time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5))
+    ph = keras.backend.placeholder(shape=(None, 10))
+    with self.assertRaisesRegexp(
+        ValueError,
+        '`TimeDistributed` Layer should be passed an `input_shape `'):
+      time_dist(ph)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_TimeDistributed_reshape(self):
+
+    class NoReshapeLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return inputs
+
+    # Built-in layers that aren't stateful use the reshape implementation.
+    td1 = keras.layers.TimeDistributed(keras.layers.Dense(5))
+    self.assertTrue(td1._always_use_reshape)
+
+    # Built-in layers that are stateful don't use the reshape implementation.
+    td2 = keras.layers.TimeDistributed(
+        keras.layers.RNN(keras.layers.SimpleRNNCell(10), stateful=True))
+    self.assertFalse(td2._always_use_reshape)
+
+    # Custom layers are not whitelisted for the fast reshape implementation.
+    td3 = keras.layers.TimeDistributed(NoReshapeLayer())
+    self.assertFalse(td3._always_use_reshape)
+
 
 class BidirectionalTest(test.TestCase):
 
@@ -280,8 +315,9 @@ class BidirectionalTest(test.TestCase):
         model.fit(x, y, epochs=1, batch_size=1)
 
         # check whether the model variables are present in the
-        # checkpointable list of objects
-        checkpointed_objects = set(checkpointable_util.list_objects(model))
+        # trackable list of objects
+        checkpointed_objects = object_identity.ObjectIdentitySet(
+            trackable_util.list_objects(model))
         for v in model.variables:
           self.assertIn(v, checkpointed_objects)
 
@@ -374,7 +410,6 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -402,10 +437,10 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_sequences=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
         f_forward = keras.backend.function([inputs],
-                                           [layer.forward_layer.call(inputs)])
+                                           [layer.forward_layer(inputs)])
         f_backward = keras.backend.function(
             [inputs],
-            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+            [keras.backend.reverse(layer.backward_layer(inputs), 1)])
 
         y_merged = f_merged(x)
         y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
@@ -419,9 +454,9 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_state=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], layer(inputs))
         f_forward = keras.backend.function([inputs],
-                                           layer.forward_layer.call(inputs))
+                                           layer.forward_layer(inputs))
         f_backward = keras.backend.function([inputs],
-                                            layer.backward_layer.call(inputs))
+                                            layer.backward_layer(inputs))
         n_states = len(layer.layer.states)
 
         y_merged = f_merged(x)
@@ -505,8 +540,10 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
+    if context.executing_eagerly():
+      self.skipTest('layer.updates is only available in graph mode.')
+
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       x_reachable_update = x * x
@@ -534,10 +571,15 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.losses) == 4
       assert len(layer.get_losses_for(None)) == 4
       assert not layer.get_losses_for(x)
+
+      # Create a random tensor that is not conditional on the inputs.
+      with keras.backend.get_graph().as_default():
+        const_tensor = constant_op.constant(1)
+
       layer.forward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.forward_layer.add_loss(1, inputs=None)
+      layer.forward_layer.add_loss(const_tensor, inputs=None)
       layer.backward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.backward_layer.add_loss(1, inputs=None)
+      layer.backward_layer.add_loss(const_tensor, inputs=None)
       assert len(layer.losses) == 8
       assert len(layer.get_losses_for(None)) == 6
       assert len(layer.get_losses_for(x)) == 2
@@ -636,7 +678,33 @@ class BidirectionalTest(test.TestCase):
       y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
-  def test_Bidirectional_with_masking(self):
+  def test_Bidirectional_last_output_with_masking(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'concat'
+    x = np.random.rand(samples, timesteps, dim)
+    # clear the first record's timestep 2. Last output should be same as state,
+    # not zeroed.
+    x[0, 2] = 0
+
+    with self.cached_session():
+      inputs = keras.Input((timesteps, dim))
+      masked_inputs = keras.layers.Masking()(inputs)
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, return_state=True), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(masked_inputs, training=True))
+      self.assertEqual(len(outputs), 5)
+      self.assertEqual(outputs[0].get_shape().as_list(), [None, units * 2])
+
+      model = keras.Model(inputs, outputs)
+      y = _to_list(model.predict(x))
+      self.assertEqual(len(y), 5)
+      self.assertAllClose(y[0], np.concatenate([y[1], y[3]], axis=1))
+
+  def test_Bidirectional_sequence_output_with_masking(self):
     rnn = keras.layers.LSTM
     samples = 2
     dim = 5
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 9144eb0bf9d3e3b90bd9f77c18851f65d3b7a9c8..62bfcc53889356263dffaa3e29ddb4675b8577d2 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=unused-import
 """Built-in loss functions.
 """
 from __future__ import absolute_import
@@ -26,16 +25,19 @@ import six
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
+from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import keras_export
+from tensorflow.tools.docs import doc_controls
 
 
+@keras_export('keras.losses.Loss')
 class Loss(object):
   """Loss base class.
 
@@ -52,13 +54,13 @@ class Loss(object):
   ```
 
   Args:
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None):
     self.reduction = reduction
     self.name = name
@@ -86,10 +88,13 @@ class Loss(object):
     Raises:
       ValueError: If the shape of `sample_weight` is invalid.
     """
-    with ops.name_scope(self.name, format(self.__class__.__name__),
+    # If we are wrapping a lambda function strip '<>' from the name as it is not
+    # accepted in scope name.
+    scope_name = 'lambda' if self.name == '<lambda>' else self.name
+    with ops.name_scope(scope_name, format(self.__class__.__name__),
                         (y_pred, y_true, sample_weight)):
       losses = self.call(y_true, y_pred)
-      return compute_weighted_loss(
+      return losses_utils.compute_weighted_loss(
           losses, sample_weight, reduction=self.reduction)
 
   @classmethod
@@ -108,6 +113,7 @@ class Loss(object):
     return {'reduction': self.reduction, 'name': self.name}
 
   @abc.abstractmethod
+  @doc_controls.for_subclass_implementers
   def call(self, y_true, y_pred):
     """Invokes the `Loss` instance.
 
@@ -118,8 +124,49 @@ class Loss(object):
     NotImplementedError('Must be implemented in subclasses.')
 
 
+class LossFunctionWrapper(Loss):
+  """Wraps a loss function in the `Loss` class.
+
+  Args:
+    fn: The loss function to wrap, with signature `fn(y_true, y_pred,
+      **kwargs)`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: (Optional) name for the loss.
+    **kwargs: The keyword arguments that are passed on to `fn`.
+  """
+
+  def __init__(self,
+               fn,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None,
+               **kwargs):
+    super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
+    self.fn = fn
+    self._fn_kwargs = kwargs
+
+  def call(self, y_true, y_pred):
+    """Invokes the `LossFunctionWrapper` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Loss values per sample.
+    """
+    return self.fn(y_true, y_pred, **self._fn_kwargs)
+
+  def get_config(self):
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+    base_config = super(LossFunctionWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @keras_export('keras.losses.MeanSquaredError')
-class MeanSquaredError(Loss):
+class MeanSquaredError(LossFunctionWrapper):
   """Computes the mean of squares of errors between labels and predictions.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -136,28 +183,20 @@ class MeanSquaredError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanSquaredError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean squared error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_squared_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_squared_error'):
+    super(MeanSquaredError, self).__init__(
+        mean_squared_error, name=name, reduction=reduction)
 
 
 @keras_export('keras.losses.MeanAbsoluteError')
-class MeanAbsoluteError(Loss):
+class MeanAbsoluteError(LossFunctionWrapper):
   """Computes the mean of absolute difference between labels and predictions.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -174,28 +213,20 @@ class MeanAbsoluteError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanAbsoluteError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean absolute error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_absolute_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_absolute_error'):
+    super(MeanAbsoluteError, self).__init__(
+        mean_absolute_error, name=name, reduction=reduction)
 
 
 @keras_export('keras.losses.MeanAbsolutePercentageError')
-class MeanAbsolutePercentageError(Loss):
+class MeanAbsolutePercentageError(LossFunctionWrapper):
   """Computes the mean absolute percentage error between `y_true` and `y_pred`.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -212,28 +243,20 @@ class MeanAbsolutePercentageError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanAbsolutePercentageError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean absolute percentage error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_absolute_percentage_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_absolute_percentage_error'):
+    super(MeanAbsolutePercentageError, self).__init__(
+        mean_absolute_percentage_error, name=name, reduction=reduction)
 
 
 @keras_export('keras.losses.MeanSquaredLogarithmicError')
-class MeanSquaredLogarithmicError(Loss):
+class MeanSquaredLogarithmicError(LossFunctionWrapper):
   """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -250,29 +273,28 @@ class MeanSquaredLogarithmicError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanSquaredLogarithmicError` instance.
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_squared_logarithmic_error'):
+    super(MeanSquaredLogarithmicError, self).__init__(
+        mean_squared_logarithmic_error, name=name, reduction=reduction)
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
 
-    Returns:
-      Mean squared logarithmic error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_squared_logarithmic_error(y_true, y_pred)
+@keras_export('keras.losses.BinaryCrossentropy')
+class BinaryCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
+  Use this crossentropy loss function when there are only two label classes
+  (assumed to be 0 and 1). There should be a single floating point value per
+  feature.
 
-@keras_export('keras.losses.BinaryCrossentropy')
-class BinaryCrossentropy(Loss):
-  """Computes the binary cross entropy loss between the labels and predictions.
+  In the snippet below, there is a single floating pointing value per example,
+  and the shape of both `y_pred` and `y_true` are `[batch_size]`.
 
   Usage:
 
@@ -285,51 +307,45 @@ class BinaryCrossentropy(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
     label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
-    super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='binary_crossentropy'):
+    super(BinaryCrossentropy, self).__init__(
+        binary_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
     self.from_logits = from_logits
-    self.label_smoothing = ops.convert_to_tensor(
-        label_smoothing, dtype=K.floatx())
-
-  def call(self, y_true, y_pred):
-    """Invokes the `BinaryCrossentropy` instance.
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
 
-    Returns:
-      Binary cross entropy losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return binary_crossentropy(
-        y_true,
-        y_pred,
-        from_logits=self.from_logits,
-        label_smoothing=self.label_smoothing)
+@keras_export('keras.losses.CategoricalCrossentropy')
+class CategoricalCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided in a `one_hot` representation. If you want to
+  provide labels as integers, please use `SparseCategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature.
 
-@keras_export('keras.losses.CategoricalCrossentropy')
-class CategoricalCrossentropy(Loss):
-  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+  In the snippet below, there is `# classes` floating pointing values per
+  example. The shape of both `y_pred` and `y_true` are
+  `[batch_size, num_classes]`.
 
   Usage:
 
@@ -344,199 +360,188 @@ class CategoricalCrossentropy(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
-    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. This
-      option is currently not supported when `y_pred` is a sparse input
-      (not one-hot).
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='categorical_crossentropy'):
     super(CategoricalCrossentropy, self).__init__(
-        reduction=reduction, name=name)
-    self.from_logits = from_logits
-    self.label_smoothing = ops.convert_to_tensor(
-        label_smoothing, dtype=K.floatx())
-
-  def call(self, y_true, y_pred):
-    """Invokes the `CategoricalCrossentropy` instance.
+        categorical_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
 
-    Returns:
-      Categorical cross entropy losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = ops.convert_to_tensor(y_true)
-    is_sparse = y_pred.shape != y_true.shape
-
-    if is_sparse:
-      return sparse_categorical_crossentropy(
-          y_true, y_pred, from_logits=self.from_logits)
-    else:
-      y_true = math_ops.cast(y_true, y_pred.dtype)
-      return categorical_crossentropy(
-          y_true,
-          y_pred,
-          from_logits=self.from_logits,
-          label_smoothing=self.label_smoothing)
+@keras_export('keras.losses.SparseCategoricalCrossentropy')
+class SparseCategoricalCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided as integers. If you want to provide labels
+  using `one-hot` representation, please use `CategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature for `y_pred`
+  and a single floating point value per feature for `y_true`.
 
-@keras_export('keras.losses.Hinge')
-class Hinge(Loss):
-  """Computes the hinge loss between `y_true` and `y_pred`.
+  In the snippet below, there is a single floating point value per example for
+  `y_true` and `# classes` floating pointing values per example for `y_pred`.
+  The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+  `[batch_size, num_classes]`.
 
   Usage:
 
   ```python
-  h = tf.losses.Hinge()
-  loss = h([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: 0.66
+  cce = tf.keras.losses.SparseCategoricalCrossentropy()
+  loss = cce(
+    [0, 1, 2],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Hinge())
-  ```
-  """
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SparseCategoricalCrossentropy())
+  ````
 
-  def call(self, y_true, y_pred):
-    """Calculates the hinge loss.
+  Args:
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
+  def __init__(self,
+               from_logits=False,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(SparseCategoricalCrossentropy, self).__init__(
+        sparse_categorical_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits)
 
-    Returns:
-      Hinge loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return hinge(y_true, y_pred)
 
+@keras_export('keras.losses.Hinge')
+class Hinge(LossFunctionWrapper):
+  """Computes the hinge loss between `y_true` and `y_pred`.
 
-@keras_export('keras.losses.SquaredHinge')
-class SquaredHinge(Loss):
-  """Computes the squared hinge loss between `y_true` and `y_pred`.
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
 
   Usage:
 
   ```python
-  sh = tf.losses.SquaredHinge()
-  loss = sh([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: 0.66
+  h = tf.keras.losses.Hinge()
+  loss = h([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # loss = max(0, 1 - y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3
+
+  print('Loss: ', loss.numpy())  # Loss: 1.6
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.SquaredHinge())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Hinge())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Calculates the squared hinge loss.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(Hinge, self).__init__(hinge, name=name, reduction=reduction)
 
-    Returns:
-      Squared hinge loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return squared_hinge(y_true, y_pred)
 
+@keras_export('keras.losses.SquaredHinge')
+class SquaredHinge(LossFunctionWrapper):
+  """Computes the squared hinge loss between `y_true` and `y_pred`.
 
-@keras_export('keras.losses.CategoricalHinge')
-class CategoricalHinge(Loss):
-  """Computes the categorical hinge loss between `y_true` and `y_pred`.
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
 
   Usage:
 
   ```python
-  ch = tf.losses.CategoricalHinge()
-  loss = ch([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: 1.0
+  sh = tf.keras.losses.SquaredHinge()
+  loss = sh([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # loss = (max(0, 1 - y_true * y_pred))^2 = [1.6^2 + 1.7^2 + 1.5^2] / 3
+
+  print('Loss: ', loss.numpy())  # Loss: 2.566666
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.CategoricalHinge())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SquaredHinge())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Calculates the categorical hinge loss.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Categorical hinge loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return categorical_hinge(y_true, y_pred)
-
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='squared_hinge'):
+    super(SquaredHinge, self).__init__(
+        squared_hinge, name=name, reduction=reduction)
 
-class LogLoss(Loss):
-  """Computes the log loss between `y_true` and `y_pred`.
 
-  logloss = - y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)
+@keras_export('keras.losses.CategoricalHinge')
+class CategoricalHinge(LossFunctionWrapper):
+  """Computes the categorical hinge loss between `y_true` and `y_pred`.
 
   Usage:
 
   ```python
-  l = tf.losses.LogLoss()
-  loss = l([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: 10.745
+  ch = tf.keras.losses.CategoricalHinge()
+  loss = ch([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 1.0
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.LogLoss())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalHinge())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return logloss(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='categorical_hinge'):
+    super(CategoricalHinge, self).__init__(
+        categorical_hinge, name=name, reduction=reduction)
 
 
-class Poisson(Loss):
-  """Computes the poisson loss between `y_true` and `y_pred`.
+@keras_export('keras.losses.Poisson')
+class Poisson(LossFunctionWrapper):
+  """Computes the Poisson loss between `y_true` and `y_pred`.
 
-  loss = y_pred - y_true * log(y_pred)
+  `loss = y_pred - y_true * log(y_pred)`
 
   Usage:
 
   ```python
-  p = tf.losses.Poisson()
+  p = tf.keras.losses.Poisson()
   loss = p([1, 9, 2], [4, 8, 12])
   print('Loss: ', loss.numpy())  # Loss: -4.63
   ```
@@ -544,26 +549,27 @@ class Poisson(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Poisson())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Poisson())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return poisson(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='poisson'):
+    super(Poisson, self).__init__(poisson, name=name, reduction=reduction)
 
 
-class Logcosh(Loss):
+@keras_export('keras.losses.LogCosh')
+class LogCosh(LossFunctionWrapper):
   """Computes the logarithm of the hyperbolic cosine of the prediction error.
 
-  logcosh = log((exp(x) + exp(-x))/2) where x is the error `y_pred` - `y_true`.
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
 
   Usage:
 
   ```python
-  l = tf.losses.Logcosh()
+  l = tf.keras.losses.LogCosh()
   loss = l([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 0.289
   ```
@@ -571,26 +577,27 @@ class Logcosh(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.Logcosh())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.LogCosh())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return logcosh(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='logcosh'):
+    super(LogCosh, self).__init__(logcosh, name=name, reduction=reduction)
 
 
-class KullbackLeiblerDivergence(Loss):
-  """Computes kullback leibler divergence loss between `y_true` and `y_pred`.
+@keras_export('keras.losses.KLDivergence')
+class KLDivergence(LossFunctionWrapper):
+  """Computes Kullback Leibler divergence loss between `y_true` and `y_pred`.
 
-  loss = y_true * log(y_true / y_pred)
+  `loss = y_true * log(y_true / y_pred)`
 
   Usage:
 
   ```python
-  k = tf.losses.KullbackLeiblerDivergence()
+  k = tf.keras.losses.KLDivergence()
   loss = k([.4, .9, .2], [.5, .8, .12])
   print('Loss: ', loss.numpy())  # Loss: -0.043
   ```
@@ -598,32 +605,34 @@ class KullbackLeiblerDivergence(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.KullbackLeiblerDivergence())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.KLDivergence())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return kullback_leibler_divergence(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='kullback_leibler_divergence'):
+    super(KLDivergence, self).__init__(
+        kullback_leibler_divergence, name=name, reduction=reduction)
 
 
-class HuberLoss(Loss):
-  """Computes the huber loss between `y_true` and `y_pred`.
+@keras_export('keras.losses.Huber')
+class Huber(LossFunctionWrapper):
+  """Computes the Huber loss between `y_true` and `y_pred`.
 
   For each value x in `error=y_true-y_pred`, the following is calculated:
 
-    ```
-    0.5 * x^2                  if |x| <= d
-    0.5 * d^2 + d * (|x| - d)  if |x| > d
-    ```
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
   where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
 
   Usage:
 
   ```python
-  l = tf.losses.HuberLoss()
+  l = tf.keras.losses.Huber()
   loss = l([0., 1., 1.], [1., 0., 1.])
   print('Loss: ', loss.numpy())  # Loss: 0.333
   ```
@@ -631,29 +640,24 @@ class HuberLoss(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.HuberLoss())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Huber())
   ```
 
   Args:
-    delta: A float, the point where the huber loss function changes from a
+    delta: A float, the point where the Huber loss function changes from a
       quadratic to linear.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                delta=1.0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
-    super(HuberLoss, self).__init__(reduction=reduction, name=name)
-    self.delta = delta
-
-  def call(self, y_true, y_pred):
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return huber_loss(y_true, y_pred, delta=self.delta)
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='huber_loss'):
+    super(Huber, self).__init__(
+        huber_loss, name=name, reduction=reduction, delta=delta)
 
 
 @keras_export('keras.metrics.mean_squared_error',
@@ -663,7 +667,9 @@ class HuberLoss(Loss):
               'keras.losses.mse',
               'keras.losses.MSE')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  return K.mean(math_ops.squared_difference(y_pred, y_true), axis=-1)
 
 
 @keras_export('keras.metrics.mean_absolute_error',
@@ -673,6 +679,8 @@ def mean_squared_error(y_true, y_pred):
               'keras.losses.mae',
               'keras.losses.MAE')
 def mean_absolute_error(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
@@ -682,7 +690,9 @@ def mean_absolute_error(y_true, y_pred):
               'keras.losses.mean_absolute_percentage_error',
               'keras.losses.mape',
               'keras.losses.MAPE')
-def mean_absolute_percentage_error(y_true, y_pred):
+def mean_absolute_percentage_error(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   diff = math_ops.abs(
       (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
@@ -694,52 +704,90 @@ def mean_absolute_percentage_error(y_true, y_pred):
               'keras.losses.mean_squared_logarithmic_error',
               'keras.losses.msle',
               'keras.losses.MSLE')
-def mean_squared_logarithmic_error(y_true, y_pred):
+def mean_squared_logarithmic_error(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(math_ops.square(first_log - second_log), axis=-1)
+  return K.mean(math_ops.squared_difference(first_log, second_log), axis=-1)
+
+
+def _maybe_convert_labels(y_true):
+  """Converts binary labels into -1/1."""
+  are_zeros = math_ops.equal(y_true, 0)
+  are_ones = math_ops.equal(y_true, 1)
+  is_binary = math_ops.reduce_all(math_ops.logical_or(are_zeros, are_ones))
+
+  def _convert_binary_labels():
+    # Convert the binary labels to -1 or 1.
+    return 2. * y_true - 1.
+
+  updated_y_true = smart_cond.smart_cond(is_binary,
+                                         _convert_binary_labels, lambda: y_true)
+  return updated_y_true
 
 
 @keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
+  """Computes the squared hinge loss between `y_true` and `y_pred`.
+
+  Args:
+    y_true: The ground truth values. `y_true` values are expected to be -1 or 1.
+      If binary (0 or 1) labels are provided we will convert them to -1 or 1.
+    y_pred: The predicted values.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  y_true = _maybe_convert_labels(y_true)
   return K.mean(
       math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
 @keras_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
+  """Computes the hinge loss between `y_true` and `y_pred`.
+
+  Args:
+    y_true: The ground truth values. `y_true` values are expected to be -1 or 1.
+      If binary (0 or 1) labels are provided we will convert them to -1 or 1.
+    y_pred: The predicted values.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  y_true = _maybe_convert_labels(y_true)
   return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
 @keras_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
   neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
   return math_ops.maximum(0., neg - pos + 1.)
 
 
-def logloss(y_true, y_pred):
-  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
-  losses += math_ops.multiply((1 - y_true),
-                              math_ops.log(1 - y_pred + K.epsilon()))
-  return K.mean(-losses, axis=-1)
-
-
 def huber_loss(y_true, y_pred, delta=1.0):
-  """Computes huber loss value.
+  """Computes Huber loss value.
 
   For each value x in `error=y_true-y_pred`, the following is calculated:
 
-    ```
-    0.5 * x^2                  if |x| <= d
-    0.5 * d^2 + d * (|x| - d)  if |x| > d
-    ```
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
   where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
 
   Args:
     y_true: tensor of true targets.
     y_pred: tensor of predicted targets.
-    delta: A float, the point where the huber loss function changes from a
+    delta: A float, the point where the Huber loss function changes from a
       quadratic to linear.
 
   Returns:
@@ -774,6 +822,8 @@ def logcosh(y_true, y_pred):
   Returns:
       Tensor with one scalar loss entry per sample.
   """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
 
   def _logcosh(x):
     return x + nn.softplus(-2. * x) - math_ops.log(2.)
@@ -793,12 +843,15 @@ def categorical_crossentropy(y_true,
     y_true: tensor of true targets.
     y_pred: tensor of predicted targets.
     from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
-      we consider that `y_pred` encodes a probability distribution.
+      we assume that `y_pred` encodes a probability distribution.
     label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
 
   Returns:
     Categorical crossentropy loss value.
   """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
 
   def _smooth_labels():
     num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
@@ -811,14 +864,17 @@ def categorical_crossentropy(y_true,
 
 @keras_export('keras.metrics.sparse_categorical_crossentropy',
               'keras.losses.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
+def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1):
   return K.sparse_categorical_crossentropy(
-      y_true, y_pred, from_logits=from_logits)
+      y_true, y_pred, from_logits=from_logits, axis=axis)
 
 
 @keras_export('keras.metrics.binary_crossentropy',
               'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):
+def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
 
   def _smooth_labels():
     return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing
@@ -835,7 +891,9 @@ def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):
               'keras.losses.kullback_leibler_divergence',
               'keras.losses.kld',
               'keras.losses.KLD')
-def kullback_leibler_divergence(y_true, y_pred):
+def kullback_leibler_divergence(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
   return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
@@ -843,66 +901,69 @@ def kullback_leibler_divergence(y_true, y_pred):
 
 @keras_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
-@keras_export('keras.metrics.cosine_proximity',
-              'keras.metrics.cosine',
-              'keras.losses.cosine_proximity',
-              'keras.losses.cosine')
+# Retaining the legacy namespaces: 'cosine_proximity' and 'cosine'.
+# TODO(psv): Change name of this function to `cosine_similarity` after fixing
+# estimator test.
+@keras_export(
+    'keras.losses.cosine_similarity',
+    v1=[
+        'keras.metrics.cosine_proximity',
+        'keras.metrics.cosine',
+        'keras.losses.cosine_proximity',
+        'keras.losses.cosine',
+        'keras.losses.cosine_similarity',
+    ])
 def cosine_proximity(y_true, y_pred, axis=-1):
+  """Computes the cosine similarity between labels and predictions."""
   y_true = nn.l2_normalize(y_true, axis=axis)
   y_pred = nn.l2_normalize(y_pred, axis=axis)
-  return -math_ops.reduce_sum(y_true * y_pred, axis=axis)
+  return math_ops.reduce_sum(y_true * y_pred, axis=axis)
 
 
-@keras_export('keras.losses.CosineProximity')
-class CosineProximity(Loss):
-  """Computes the cosine proximity between `y_true` and `y_pred`.
+@keras_export('keras.losses.CosineSimilarity')
+class CosineSimilarity(LossFunctionWrapper):
+  """Computes the cosine similarity between `y_true` and `y_pred`.
 
   Usage:
 
   ```python
-  cosine_loss = tf.losses.CosineProximity()
-  loss = cosine_loss([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: -0.5
+  cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
+  loss = cosine_loss([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Loss: ', loss.numpy())  # Loss: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.CosineProximity())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CosineSimilarity(axis=1))
   ```
 
   Args:
     axis: (Optional) Defaults to -1. The dimension along which the cosine
-      proximity is computed.
-    reduction: (Optional) Type of `tf.losses.Reduction` to apply to loss.
+      similarity is computed.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
       Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                axis=-1,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
-    super(CosineProximity, self).__init__(reduction=reduction, name=name)
-    self.axis = axis
-
-  def call(self, y_true, y_pred):
-    """Calculates the cosine proximity loss.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Cosine distance loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return cosine_proximity(y_true, y_pred, axis=self.axis)
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='cosine_similarity'):
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, reduction=reduction, name=name, axis=axis)
 
 
 # Aliases.
@@ -912,7 +973,17 @@ mae = MAE = mean_absolute_error
 mape = MAPE = mean_absolute_percentage_error
 msle = MSLE = mean_squared_logarithmic_error
 kld = KLD = kullback_leibler_divergence
-cosine = cosine_proximity
+cosine_similarity = cosine_proximity
+
+
+def is_categorical_crossentropy(loss):
+  result = ((isinstance(loss, CategoricalCrossentropy) or
+             (isinstance(loss, LossFunctionWrapper) and
+              loss.fn == categorical_crossentropy) or
+             (hasattr(loss, '__name__') and
+              loss.__name__ == 'categorical_crossentropy') or
+             (loss == 'categorical_crossentropy')))
+  return result
 
 
 @keras_export('keras.losses.serialize')
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index d276854e299ece55fbff3a1fb34f4cd57fbcbc5a..766a7e2e84f2ed25c4543f6a52262d9720be2dba 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -27,7 +27,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.platform import test
 
 try:
@@ -45,7 +45,7 @@ ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.binary_crossentropy,
               keras.losses.kullback_leibler_divergence,
               keras.losses.poisson,
-              keras.losses.cosine_proximity,
+              keras.losses.cosine_similarity,
               keras.losses.logcosh,
               keras.losses.categorical_hinge]
 
@@ -56,7 +56,7 @@ class _MSEMAELoss(object):
   def __init__(self, mse_fraction):
     self.mse_fraction = mse_fraction
 
-  def __call__(self, y_true, y_pred):
+  def __call__(self, y_true, y_pred, sample_weight=None):
     return (self.mse_fraction * keras.losses.mse(y_true, y_pred) +
             (1 - self.mse_fraction) * keras.losses.mae(y_true, y_pred))
 
@@ -180,15 +180,34 @@ class KerasLossesTest(test.TestCase):
         loaded_model = keras.models.load_model(model_filename)
         loaded_model.predict(np.random.rand(128, 2))
 
+  def test_loss_wrapper(self):
+    loss_fn = keras.losses.get('mse')
+    mse_obj = keras.losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
+
+    self.assertEqual(mse_obj.name, 'mean_squared_error')
+    self.assertEqual(mse_obj.reduction,
+                     losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
+
+    y_true = constant_op.constant([[1., 9.], [2., 5.]])
+    y_pred = constant_op.constant([[4., 8.], [12., 3.]])
+    sample_weight = constant_op.constant([1.2, 0.5])
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2]
+    # mse = [5, 52]
+    # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26]
+    # reduced_weighted_mse = (6 + 26) / 2 =
+    self.assertAllClose(self.evaluate(loss), 16, 1e-2)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class MeanSquaredErrorTest(test.TestCase):
 
   def test_config(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM, name='mse_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mse_1')
     self.assertEqual(mse_obj.name, 'mse_1')
-    self.assertEqual(mse_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mse_obj = keras.losses.MeanSquaredError()
@@ -254,7 +273,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -265,7 +284,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -279,9 +298,9 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_config(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM, name='mae_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mae_1')
     self.assertEqual(mae_obj.name, 'mae_1')
-    self.assertEqual(mae_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mae_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mae_obj = keras.losses.MeanAbsoluteError()
@@ -347,7 +366,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -358,7 +377,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -372,9 +391,9 @@ class MeanAbsolutePercentageErrorTest(test.TestCase):
 
   def test_config(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(mape_obj.name, 'mape_1')
-    self.assertEqual(mape_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mape_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError()
@@ -429,9 +448,9 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
   def test_config(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(msle_obj.name, 'mape_1')
-    self.assertEqual(msle_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(msle_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError()
@@ -482,7 +501,7 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
 
   def l2_norm(self, x, axis):
     epsilon = 1e-12
@@ -496,28 +515,27 @@ class CosineProximityTest(test.TestCase):
 
     y_true = self.l2_norm(self.np_y_true, axis)
     y_pred = self.l2_norm(self.np_y_pred, axis)
-    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
 
     self.y_true = constant_op.constant(self.np_y_true)
     self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    cosine_obj = keras.losses.CosineProximity(
-        axis=2, reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+    cosine_obj = keras.losses.CosineSimilarity(
+        axis=2, reduction=losses_utils.ReductionV2.SUM, name='cosine_loss')
     self.assertEqual(cosine_obj.name, 'cosine_loss')
-    self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
-    self.assertEqual(cosine_obj.axis, 2)
+    self.assertEqual(cosine_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
     self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_scalar_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     sample_weight = 2.3
     loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     expected_loss = np.mean(self.expected_loss * sample_weight)
@@ -525,7 +543,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_sample_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     sample_weight = np.asarray([1.2, 3.4])
     loss = cosine_obj(
         self.y_true,
@@ -536,14 +554,14 @@ class CosineProximityTest(test.TestCase):
 
   def test_timestep_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     np_y_true = self.np_y_true.reshape((2, 3, 1))
     np_y_pred = self.np_y_pred.reshape((2, 3, 1))
     sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3))
 
     y_true = self.l2_norm(np_y_true, 2)
     y_pred = self.l2_norm(np_y_pred, 2)
-    expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(2,))
+    expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(2,))
 
     y_true = constant_op.constant(np_y_true)
     y_pred = constant_op.constant(np_y_pred)
@@ -555,13 +573,13 @@ class CosineProximityTest(test.TestCase):
 
   def test_zero_weighted(self):
     self.setup()
-    cosine_obj = keras.losses.CosineProximity()
+    cosine_obj = keras.losses.CosineSimilarity()
     loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
   def test_axis(self):
     self.setup(axis=1)
-    cosine_obj = keras.losses.CosineProximity(axis=1)
+    cosine_obj = keras.losses.CosineSimilarity(axis=1)
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
     self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
@@ -572,9 +590,9 @@ class BinaryCrossentropyTest(test.TestCase):
 
   def test_config(self):
     bce_obj = keras.losses.BinaryCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(bce_obj.name, 'bce_1')
-    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(bce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -703,7 +721,7 @@ class BinaryCrossentropyTest(test.TestCase):
     logits = constant_op.constant([[100.0, -100.0, 100.0],
                                    [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = bce_obj(y_true, logits)
 
     # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
@@ -738,9 +756,9 @@ class CategoricalCrossentropyTest(test.TestCase):
 
   def test_config(self):
     cce_obj = keras.losses.CategoricalCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(cce_obj.name, 'bce_1')
-    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -804,7 +822,7 @@ class CategoricalCrossentropyTest(test.TestCase):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
     cce_obj = keras.losses.CategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
@@ -830,22 +848,26 @@ class CategoricalCrossentropyTest(test.TestCase):
     expected_value = 400.0 * label_smoothing / 3.0
     self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
 
-  def test_all_correct_unweighted_sparse(self):
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseCategoricalCrossentropyTest(test.TestCase):
+
+  def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
     y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
                                   dtype=dtypes.float32)
-    cce_obj = keras.losses.CategoricalCrossentropy()
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     loss = cce_obj(y_true, y_pred)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
     # Test with logits.
     logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
-  def test_unweighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_unweighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([0, 1, 2])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -854,12 +876,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
 
-  def test_scalar_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -868,12 +890,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=2.3)
     self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
 
-  def test_sample_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -883,15 +905,15 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=sample_weight)
     self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
 
-  def test_no_reduction_sparse(self):
+  def test_no_reduction(self):
     y_true = constant_op.constant([[0], [1], [2]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
@@ -901,60 +923,99 @@ class HingeTest(test.TestCase):
 
   def test_config(self):
     hinge_obj = keras.losses.Hinge(
-        reduction=losses_impl.ReductionV2.SUM, name='hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='hinge_loss')
     self.assertEqual(hinge_obj.name, 'hinge_loss')
-    self.assertEqual(hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     hinge_obj = keras.losses.Hinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # reduced loss = (0.6 + 0.4125) / 2
+
     loss = hinge_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), 7.3333, 3)
+    self.assertAllClose(0.506, self.evaluate(loss), atol=1e-3)
 
   def test_scalar_weighted(self):
     hinge_obj = keras.losses.Hinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # weighted_loss = [0.6 * 2.3, 0.4125 * 2.3]
+    # reduced loss = (0.6 + 0.4125) * 2.3 / 2
+
     loss = hinge_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 16.8666, 3)
+    self.assertAlmostEqual(self.evaluate(loss), 1.164, 3)
 
     # Verify we get the same output when the same input is given
     loss_2 = hinge_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+    self.assertAllClose(self.evaluate(loss), self.evaluate(loss_2), 1e-3)
 
   def test_sample_weighted(self):
     hinge_obj = keras.losses.Hinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # weighted loss = [0.6 * 1.2, 0.4125 * 3.4]
+    # reduced loss = (0.6 * 1.2 + 0.4125 * 3.4) / 2
+
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 24.9333, 3)
+    self.assertAllClose(self.evaluate(loss), 1.061, 1e-3)
 
   def test_timestep_weighted(self):
     hinge_obj = keras.losses.Hinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3, 1),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1))
+    y_pred = constant_op.constant(
+        [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], shape=(2, 4, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4))
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]]
+    # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]],
+    #                    [[0.25], [1], [0.5], [0.6]]]
+    # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]],
+    #                        [[0.75], [0], [0.5], [0.4]]]
+    # loss = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # weighted loss    = [[2.1, 4.8, 4.5, 0], [3, 0, 0.5, 1.2]]
+    # reduced loss = (2.1 + 4.8 + 4.5 + 0 + 3 + 0 + 0.5 + 1.2) / 8
+
     loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 2.0, 3)
+    self.assertAllClose(self.evaluate(loss), 2.012, 1e-3)
 
   def test_zero_weighted(self):
     hinge_obj = keras.losses.Hinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
     loss = hinge_obj(y_true, y_pred, sample_weight=0)
-    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+    self.assertAllClose(self.evaluate(loss), 0., 1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -962,32 +1023,52 @@ class SquaredHingeTest(test.TestCase):
 
   def test_config(self):
     sq_hinge_obj = keras.losses.SquaredHinge(
-        reduction=losses_impl.ReductionV2.SUM, name='sq_hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='sq_hinge_loss')
     self.assertEqual(sq_hinge_obj.name, 'sq_hinge_loss')
-    self.assertEqual(sq_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(sq_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
-    y_true = constant_op.constant([1, 9, 2, -5], shape=(2, 2))
-    y_pred = constant_op.constant([4, 8, 12, 8],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # reduced loss = (0.485 + 0.2431) / 2
 
-    # Sq hinge = mean(square(max(1. - y_true * y_pred, 0.)), axis=-1)
-    # (1. - y_true * y_pred) = [[1-4, 1-72], [1-24, 1+40]] = [0, 48]
-    # sq(max(above val, 0)) = sq([[0, 0], [0, 41]) = [[0, 0], [0, 1681]]
-    # Mean = [0, 840.5]. Reduced loss = (0 + 840.5)/2 = 420.25
     loss = sq_hinge_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), 420.25, 3)
+    self.assertAllClose(self.evaluate(loss), 0.364, 1e-3)
 
   def test_scalar_weighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # weighted loss = [0.485 * 2.3, 0.2431 * 2.3]
+    # reduced loss = (0.485 + 0.2431) * 2.3 / 2
+
     loss = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 647.833, 3)
+    self.assertAllClose(self.evaluate(loss), 0.837, 1e-3)
 
     # Verify we get the same output when the same input is given
     loss_2 = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
@@ -995,32 +1076,55 @@ class SquaredHingeTest(test.TestCase):
 
   def test_sample_weighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # weighted loss = [0.485 * 1.2, 0.2431 * 3.4]
+    # reduced loss = (0.485 * 1.2 + 0.2431 * 3.4) / 2
+
+    sample_weight = constant_op.constant([1.2, 3.4])
     loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 957.667, 3)
+    self.assertAllClose(self.evaluate(loss), 0.704, 1e-3)
 
   def test_timestep_weighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3, 1),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1))
+    y_pred = constant_op.constant(
+        [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], shape=(2, 4, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4))
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]]
+    # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]],
+    #                    [[0.25], [1], [0.5], [0.6]]]
+    # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]],
+    #                        [[0.75], [0], [0.5], [0.4]]]
+    # loss = [[0.49, 0.64, 0.81, 0], [0.5625, 0, 0.25, 0.16]]
+    # weighted loss    = [[1.47, 3.84, 4.05, 0], [2.25, 0, 0.25, 0.48]]
+    # reduced loss = (1.47 + 3.84 + 4.05 + 0 + 2.25 + 0 + 0.25 + 0.48) / 8
+
     loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 6.0, 3)
+    self.assertAllClose(self.evaluate(loss), 1.542, 1e-3)
 
   def test_zero_weighted(self):
     sq_hinge_obj = keras.losses.SquaredHinge()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
     loss = sq_hinge_obj(y_true, y_pred, sample_weight=0)
-    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+    self.assertAllClose(self.evaluate(loss), 0., 1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -1028,9 +1132,9 @@ class CategoricalHingeTest(test.TestCase):
 
   def test_config(self):
     cat_hinge_obj = keras.losses.CategoricalHinge(
-        reduction=losses_impl.ReductionV2.SUM, name='cat_hinge_loss')
+        reduction=losses_utils.ReductionV2.SUM, name='cat_hinge_loss')
     self.assertEqual(cat_hinge_obj.name, 'cat_hinge_loss')
-    self.assertEqual(cat_hinge_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cat_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     cat_hinge_obj = keras.losses.CategoricalHinge()
@@ -1090,98 +1194,7 @@ class CategoricalHingeTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class LogLossTest(test.TestCase):
-
-  def setup(self):
-    # TODO(psv): Change to setUp() after b/122319309 is fixed.
-    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
-    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
-    epsilon = 1e-7  # to avoid log 0
-
-    self.batch_size = 6
-    self.expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
-    self.expected_losses += np.multiply(1 - y_true,
-                                        np.log(1 - y_pred + epsilon))
-    self.expected_losses = -self.expected_losses
-
-    self.y_pred = constant_op.constant(y_pred)
-    self.y_true = constant_op.constant(y_true)
-
-  def test_config(self):
-    log_loss_obj = keras.losses.LogLoss(
-        reduction=losses_impl.ReductionV2.SUM, name='log')
-    self.assertEqual(log_loss_obj.name, 'log')
-    self.assertEqual(log_loss_obj.reduction, losses_impl.ReductionV2.SUM)
-
-  def test_all_correct(self):
-    self.setup()
-    log_loss_obj = keras.losses.LogLoss()
-    loss = log_loss_obj(self.y_true, self.y_true)
-    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
-
-  def test_unweighted(self):
-    self.setup()
-    log_loss_obj = keras.losses.LogLoss()
-    loss = log_loss_obj(self.y_true, self.y_pred)
-    actual_loss = np.sum(self.expected_losses) / self.batch_size
-    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
-
-  def test_scalar_weighted(self):
-    self.setup()
-    log_loss_obj = keras.losses.LogLoss()
-    sample_weight = 2.3
-    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
-    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
-    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
-
-    # Verify we get the same output when the same input is given
-    loss_2 = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
-
-  def test_sample_weighted(self):
-    self.setup()
-    log_loss_obj = keras.losses.LogLoss()
-    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
-
-    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
-    actual_loss = np.multiply(
-        self.expected_losses,
-        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
-    actual_loss = np.sum(actual_loss) / self.batch_size
-    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
-
-  def test_timestep_weighted(self):
-    log_loss_obj = keras.losses.LogLoss()
-
-    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3, 1))
-    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3, 1))
-    epsilon = 1e-7  # to avoid log 0
-    batch_size = 6
-
-    expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
-    expected_losses += np.multiply(1 - y_true, np.log(1 - y_pred + epsilon))
-
-    y_pred = constant_op.constant(y_pred)
-    y_true = constant_op.constant(y_true)
-    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
-    loss = log_loss_obj(
-        y_true,
-        y_pred,
-        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
-    actual_loss = np.multiply(-expected_losses, sample_weight)
-    actual_loss = np.sum(actual_loss) / batch_size
-    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
-
-  def test_zero_weighted(self):
-    self.setup()
-    log_loss_obj = keras.losses.LogLoss()
-    sample_weight = 0
-    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class LogcoshTest(test.TestCase):
+class LogCoshTest(test.TestCase):
 
   def setup(self):
     y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
@@ -1195,14 +1208,14 @@ class LogcoshTest(test.TestCase):
     self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    logcosh_obj = keras.losses.Logcosh(
-        reduction=losses_impl.ReductionV2.SUM, name='logcosh_loss')
+    logcosh_obj = keras.losses.LogCosh(
+        reduction=losses_utils.ReductionV2.SUM, name='logcosh_loss')
     self.assertEqual(logcosh_obj.name, 'logcosh_loss')
-    self.assertEqual(logcosh_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(logcosh_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
-    logcosh_obj = keras.losses.Logcosh()
+    logcosh_obj = keras.losses.LogCosh()
 
     loss = logcosh_obj(self.y_true, self.y_pred)
     expected_loss = np.sum(self.expected_losses) / self.batch_size
@@ -1210,7 +1223,7 @@ class LogcoshTest(test.TestCase):
 
   def test_scalar_weighted(self):
     self.setup()
-    logcosh_obj = keras.losses.Logcosh()
+    logcosh_obj = keras.losses.LogCosh()
     sample_weight = 2.3
 
     loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
@@ -1224,7 +1237,7 @@ class LogcoshTest(test.TestCase):
 
   def test_sample_weighted(self):
     self.setup()
-    logcosh_obj = keras.losses.Logcosh()
+    logcosh_obj = keras.losses.LogCosh()
 
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
@@ -1237,7 +1250,7 @@ class LogcoshTest(test.TestCase):
 
   def test_timestep_weighted(self):
     self.setup()
-    logcosh_obj = keras.losses.Logcosh()
+    logcosh_obj = keras.losses.LogCosh()
     y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1)
     y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1)
     error = y_pred - y_true
@@ -1255,7 +1268,7 @@ class LogcoshTest(test.TestCase):
 
   def test_zero_weighted(self):
     self.setup()
-    logcosh_obj = keras.losses.Logcosh()
+    logcosh_obj = keras.losses.LogCosh()
     sample_weight = 0
     loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
@@ -1277,9 +1290,9 @@ class PoissonTest(test.TestCase):
 
   def test_config(self):
     poisson_obj = keras.losses.Poisson(
-        reduction=losses_impl.ReductionV2.SUM, name='poisson')
+        reduction=losses_utils.ReductionV2.SUM, name='poisson')
     self.assertEqual(poisson_obj.name, 'poisson')
-    self.assertEqual(poisson_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(poisson_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
@@ -1343,7 +1356,7 @@ class PoissonTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class KullbackLeiblerDivergenceTest(test.TestCase):
+class KLDivergenceTest(test.TestCase):
 
   def setup(self):
     self.np_y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
@@ -1357,14 +1370,14 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
     self.y_true = constant_op.constant(self.np_y_true)
 
   def test_config(self):
-    k_obj = keras.losses.KullbackLeiblerDivergence(
-        reduction=losses_impl.ReductionV2.SUM, name='kld')
+    k_obj = keras.losses.KLDivergence(
+        reduction=losses_utils.ReductionV2.SUM, name='kld')
     self.assertEqual(k_obj.name, 'kld')
-    self.assertEqual(k_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(k_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     self.setup()
-    k_obj = keras.losses.KullbackLeiblerDivergence()
+    k_obj = keras.losses.KLDivergence()
 
     loss = k_obj(self.y_true, self.y_pred)
     expected_loss = np.sum(self.expected_losses) / self.batch_size
@@ -1372,7 +1385,7 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
 
   def test_scalar_weighted(self):
     self.setup()
-    k_obj = keras.losses.KullbackLeiblerDivergence()
+    k_obj = keras.losses.KLDivergence()
     sample_weight = 2.3
 
     loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
@@ -1386,7 +1399,7 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
 
   def test_sample_weighted(self):
     self.setup()
-    k_obj = keras.losses.KullbackLeiblerDivergence()
+    k_obj = keras.losses.KLDivergence()
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
 
@@ -1398,7 +1411,7 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
 
   def test_timestep_weighted(self):
     self.setup()
-    k_obj = keras.losses.KullbackLeiblerDivergence()
+    k_obj = keras.losses.KLDivergence()
     y_true = self.np_y_true.reshape(2, 3, 1)
     y_pred = self.np_y_pred.reshape(2, 3, 1)
     sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3)
@@ -1417,7 +1430,7 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
 
   def test_zero_weighted(self):
     self.setup()
-    k_obj = keras.losses.KullbackLeiblerDivergence()
+    k_obj = keras.losses.KLDivergence()
     loss = k_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
@@ -1447,27 +1460,27 @@ class HuberLossTest(test.TestCase):
     self.y_true = constant_op.constant(self.np_y_true)
 
   def test_config(self):
-    h_obj = keras.losses.HuberLoss(
-        reduction=losses_impl.ReductionV2.SUM, name='huber')
+    h_obj = keras.losses.Huber(
+        reduction=losses_utils.ReductionV2.SUM, name='huber')
     self.assertEqual(h_obj.name, 'huber')
-    self.assertEqual(h_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(h_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     loss = h_obj(self.y_true, self.y_true)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
   def test_unweighted(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     loss = h_obj(self.y_true, self.y_pred)
     actual_loss = np.sum(self.expected_losses) / self.batch_size
     self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
 
   def test_scalar_weighted(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     sample_weight = 2.3
     loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
@@ -1479,7 +1492,7 @@ class HuberLossTest(test.TestCase):
 
   def test_sample_weighted(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
 
     loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
@@ -1491,7 +1504,7 @@ class HuberLossTest(test.TestCase):
 
   def test_timestep_weighted(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     y_pred = self.np_y_pred.reshape((2, 3, 1))
     y_true = self.np_y_true.reshape((2, 3, 1))
     expected_losses = self.huber_loss(y_true, y_pred)
@@ -1509,14 +1522,14 @@ class HuberLossTest(test.TestCase):
 
   def test_zero_weighted(self):
     self.setup()
-    h_obj = keras.losses.HuberLoss()
+    h_obj = keras.losses.Huber()
     sample_weight = 0
     loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
   def test_non_default_delta(self):
     self.setup(delta=0.8)
-    h_obj = keras.losses.HuberLoss(delta=0.8)
+    h_obj = keras.losses.Huber(delta=0.8)
     sample_weight = 2.3
     loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
     actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index d3bd523d29955388842206d871b3be1bd690ccd9..59ceceaff6d3a60d488e185f6dde35eecca78b81 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -20,13 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import sys
 import types
 import numpy as np
 import six
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -35,7 +34,7 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import categorical_hinge
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
@@ -51,19 +50,19 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import to_list
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import confusion_matrix
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
 
+@keras_export('keras.metrics.Metric')
 @six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
@@ -110,33 +109,31 @@ class Metric(Layer):
   Example subclass implementation:
 
   ```
-  class BinaryTruePositives(Metric):
-    def __init__(self, name='binary_true_positives', dtype=None):
-      super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
-      self.true_positives = self.add_weight(
-          'true_positives', initializer=init_ops.zeros_initializer)
+  class BinaryTruePositives(tf.keras.metrics.Metric):
+
+    def __init__(self, name='binary_true_positives', **kwargs):
+      super(BinaryTruePositives, self).__init__(name=name, **kwargs)
+      self.true_positives = self.add_weight(name='tp', initializer='zeros')
 
     def update_state(self, y_true, y_pred, sample_weight=None):
-      y_true = math_ops.cast(y_true, dtypes.bool)
-      y_pred = math_ops.cast(y_pred, dtypes.bool)
-      y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-          y_pred, y_true, sample_weight)
-
-      values = math_ops.logical_and(
-          math_ops.equal(y_true, True), math_ops.equal(y_pred, True))
-      values = math_ops.cast(values, self._dtype)
+      y_true = tf.cast(y_true, tf.bool)
+      y_pred = tf.cast(y_pred, tf.bool)
+
+      values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
+      values = tf.cast(values, self.dtype)
       if sample_weight is not None:
-        sample_weight = math_ops.cast(sample_weight, self._dtype)
-        values = math_ops.multiply(values, sample_weight)
-      state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
+        sample_weight = tf.cast(sample_weight, self.dtype)
+        sample_weight = tf.broadcast_weights(sample_weight, values)
+        values = tf.multiply(values, sample_weight)
+      self.true_positives.assign_add(tf.reduce_sum(values))
 
     def result(self):
-      return array_ops.identity(self.true_positives)
+      return self.true_positives
   ```
   """
 
-  def __init__(self, name=None, dtype=None):
-    super(Metric, self).__init__(name=name, dtype=dtype)
+  def __init__(self, name=None, dtype=None, **kwargs):
+    super(Metric, self).__init__(name=name, dtype=dtype, **kwargs)
     self.stateful = True  # All metric layers are stateful.
     self.built = True
     self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
@@ -144,24 +141,19 @@ class Metric(Layer):
   def __new__(cls, *args, **kwargs):
     obj = super(Metric, cls).__new__(cls)
 
-    if sys.version_info < (3,):
-      # Wrap methods in `weakmethod` function to remove binding and create a
-      # weak reference. This is to remove reference cycle that is created here.
-      # This is not an issue in python versions > 3.
-      if context.executing_eagerly():
-        obj.update_state = metrics_utils.weakmethod(obj.update_state)
-      obj.update_state = metrics_utils.weakmethod(
-          types.MethodType(
-              metrics_utils.update_state_wrapper(obj.update_state), obj))
-      result = metrics_utils.weakmethod(obj.result)
-      obj.result = metrics_utils.weakmethod(
-          types.MethodType(metrics_utils.result_wrapper(result), obj))
+    # TODO(psv): We are excluding wrapping `update_state` of built-in metrics
+    # with function here because of b/121302287. With this, built-in metrics
+    # will continue to work with TPUs and custom metrics will not, however
+    # users writing custom metrics need not worry about control dependencies
+    # and returning ops.
+    if cls.__module__ == Metric.__module__:
+      update_state_fn = obj.update_state
     else:
-      obj.update_state = types.MethodType(
-          metrics_utils.update_state_wrapper(obj.update_state), obj)
-      obj.result = types.MethodType(
-          metrics_utils.result_wrapper(obj.result), obj)
+      update_state_fn = def_function.function(obj.update_state)
 
+    obj.update_state = types.MethodType(
+        metrics_utils.update_state_wrapper(update_state_fn), obj)
+    obj.result = types.MethodType(metrics_utils.result_wrapper(obj.result), obj)
     return obj
 
   def __call__(self, *args, **kwargs):
@@ -175,9 +167,9 @@ class Metric(Layer):
     Returns:
       The metric value tensor.
     """
-    update_op = self.update_state(*args, **kwargs)
+    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
     with ops.control_dependencies([update_op]):
-      result_t = self.result()
+      result_t = self.result()  # pylint: disable=not-callable
 
       # We are adding the metric object as metadata on the result tensor.
       # This is required when we want to use a metric with `add_metric` API on
@@ -185,19 +177,27 @@ class Metric(Layer):
       # to reset variable state after each epoch of training.
       # Example:
       #   model = Model()
-      #   model.add_metric(Mean()(values), name='mean')
+      #   mean = Mean()
+      #   model.add_metric(mean(values), name='mean')
       if not context.executing_eagerly():
         result_t._metric_obj = self  # pylint: disable=protected-access
       return result_t
 
+  @property
+  def dtype(self):
+    return self._dtype
+
+  def get_config(self):
+    """Returns the serializable config of the metric."""
+    return {'name': self.name, 'dtype': self.dtype}
+
   def reset_states(self):
     """Resets all of the metric state variables.
 
     This function is called between epochs/steps,
     when a metric is evaluated during training.
     """
-    for v in self.variables:
-      K.set_value(v, 0)
+    K.batch_set_value([(v, 0) for v in self.variables])
 
   @abc.abstractmethod
   def update_state(self, *args, **kwargs):
@@ -212,7 +212,9 @@ class Metric(Layer):
          All update ops added to the graph by this function will be executed.
       As a result, code should generally work the same way with graph or
       eager execution.
-    and adds the update op to the metric layer.
+
+    Please use `tf.config.experimental_run_functions_eagerly(True)` to execute
+    this function eagerly for debugging or profiling.
 
     Args:
       *args:
@@ -229,12 +231,6 @@ class Metric(Layer):
     """
     NotImplementedError('Must be implemented in subclasses.')
 
-  @classmethod
-  def from_config(cls, config):
-    if 'trainable' in config:
-      config.pop('trainable')
-    return cls(**config)
-
   ### For use by subclasses ###
   @doc_controls.for_subclass_implementers
   def add_weight(self,
@@ -258,56 +254,32 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
-@keras_export('keras.metrics.Mean')
-class Mean(Metric):
-  """Computes the (weighted) mean of the given values.
-
-  For example, if values is [1, 3, 5, 7] then the mean is 4.
-  If the weights were specified as [1, 1, 0, 0] then the mean would be 2.
-
-  This metric creates two variables, `total` and `count` that are used to
-  compute the average of `values`. This average is ultimately returned as `mean`
-  which is an idempotent operation that simply divides `total` by `count`.
-
-  If `sample_weight` is `None`, weights default to 1.
-  Use `sample_weight` of 0 to mask values.
-
-  Usage:
-
-  ```python
-  m = tf.keras.metrics.Mean()
-  m.update_state([1, 3, 5, 7])
-  print('Final result: ', m.result().numpy())  # Final result: 4.0
-  ```
-
-  Usage with tf.keras API:
-
-  ```python
-  model = keras.models.Model(inputs, outputs)
-  model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
-  model.compile('sgd', loss='mse')
-  ```
-  """
+class Reduce(Metric):
+  """Encapsulates metrics that perform a reduce operation on the values."""
 
-  def __init__(self, name='mean', dtype=None):
-    """Creates a `Mean` instance.
+  def __init__(self, reduction, name, dtype=None):
+    """Creates a `Reduce` instance.
 
     Args:
-      name: (Optional) string name of the metric instance.
+      reduction: a `tf.keras.metrics.Reduction` enum value.
+      name: string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
-    super(Mean, self).__init__(name=name, dtype=dtype)
-    # Create new state variables
+    super(Reduce, self).__init__(name=name, dtype=dtype)
+    self.reduction = reduction
     self.total = self.add_weight(
         'total', initializer=init_ops.zeros_initializer)
-    self.count = self.add_weight(
-        'count', initializer=init_ops.zeros_initializer)
+    if reduction in [metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+                     metrics_utils.Reduction.WEIGHTED_MEAN]:
+      self.count = self.add_weight(
+          'count', initializer=init_ops.zeros_initializer)
 
   def update_state(self, values, sample_weight=None):
-    """Accumulates statistics for computing the mean.
+    """Accumulates statistics for computing the reduction metric.
 
-    For example, if `values` is [1, 3, 5, 7] then the mean is 4. If
-    the `sample_weight` is specified as [1, 1, 0, 0] then the mean would be 2.
+    For example, if `values` is [1, 3, 5, 7] and reduction=SUM_OVER_BATCH_SIZE,
+    then the value of `result()` is 4. If the `sample_weight` is specified as
+    [1, 1, 0, 0] then value of `result()` would be 2.
 
     Args:
       values: Per-example value.
@@ -317,11 +289,8 @@ class Mean(Metric):
       Update op.
     """
     values = math_ops.cast(values, self._dtype)
-    if sample_weight is None:
-      num_values = math_ops.cast(array_ops.size(values), self._dtype)
-    else:
+    if sample_weight is not None:
       sample_weight = math_ops.cast(sample_weight, self._dtype)
-
       # Update dimensions of weights to match with values if possible.
       values, _, sample_weight = squeeze_or_expand_dimensions(
           values, None, sample_weight)
@@ -333,23 +302,134 @@ class Mean(Metric):
         # Reduce values to same ndim as weight array
         ndim = K.ndim(values)
         weight_ndim = K.ndim(sample_weight)
-        values = math_ops.reduce_mean(
-            values, axis=list(range(weight_ndim, ndim)))
-
-      num_values = math_ops.reduce_sum(sample_weight)
+        if self.reduction == metrics_utils.Reduction.SUM:
+          values = math_ops.reduce_sum(
+              values, axis=list(range(weight_ndim, ndim)))
+        else:
+          values = math_ops.reduce_mean(
+              values, axis=list(range(weight_ndim, ndim)))
       values = math_ops.multiply(values, sample_weight)
-    values = math_ops.reduce_sum(values)
 
-    # Update state variables. Count should be updated only when total is
-    # updated.
-    update_total_op = state_ops.assign_add(self.total, values)
+    value_sum = math_ops.reduce_sum(values)
+    with ops.control_dependencies([value_sum]):
+      update_total_op = self.total.assign_add(value_sum)
+
+    # Exit early if the reduction doesn't have a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return update_total_op
+
+    # Update `count` for reductions that require a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE:
+      num_values = math_ops.cast(array_ops.size(values), self._dtype)
+    elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN:
+      if sample_weight is None:
+        num_values = math_ops.cast(array_ops.size(values), self._dtype)
+      else:
+        num_values = math_ops.reduce_sum(sample_weight)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
     with ops.control_dependencies([update_total_op]):
-      return state_ops.assign_add(self.count, num_values)
+      return self.count.assign_add(num_values)
 
   def result(self):
-    return math_ops.div_no_nan(self.total, self.count)
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return array_ops.identity(self.total)
+    elif self.reduction in [
+        metrics_utils.Reduction.WEIGHTED_MEAN,
+        metrics_utils.Reduction.SUM_OVER_BATCH_SIZE
+    ]:
+      return math_ops.div_no_nan(self.total, self.count)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
 
+@keras_export('keras.metrics.Sum')
+class Sum(Reduce):
+  """Computes the (weighted) sum of the given values.
 
+  For example, if values is [1, 3, 5, 7] then the sum is 16.
+  If the weights were specified as [1, 1, 0, 0] then the sum would be 4.
+
+  This metric creates one variable, `total`, that is used to compute the sum of
+  `values`. This is ultimately returned as `sum`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Sum()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 16.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
+  """
+
+  def __init__(self, name='sum', dtype=None):
+    """Creates a `Sum` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Sum, self).__init__(reduction=metrics_utils.Reduction.SUM,
+                              name=name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Mean')
+class Mean(Reduce):
+  """Computes the (weighted) mean of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the mean is 4.
+  If the weights were specified as [1, 1, 0, 0] then the mean would be 2.
+
+  This metric creates two variables, `total` and `count` that are used to
+  compute the average of `values`. This average is ultimately returned as `mean`
+  which is an idempotent operation that simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Mean()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 4.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
+  """
+
+  def __init__(self, name='mean', dtype=None):
+    """Creates a `Mean` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Mean, self).__init__(
+        reduction=metrics_utils.Reduction.WEIGHTED_MEAN, name=name, dtype=dtype)
+
+
+@keras_export('keras.metrics.MeanRelativeError')
 class MeanRelativeError(Mean):
   """Computes the mean relative error by normalizing with the given values.
 
@@ -376,7 +456,7 @@ class MeanRelativeError(Mean):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -416,7 +496,7 @@ class MeanRelativeError(Mean):
 
     y_pred, self.normalizer = confusion_matrix.remove_squeezable_dimensions(
         y_pred, self.normalizer)
-    y_pred.shape.assert_is_compatible_with(y_pred.shape)
+    y_pred.shape.assert_is_compatible_with(y_true.shape)
     relative_errors = math_ops.div_no_nan(
         math_ops.abs(y_true - y_pred), self.normalizer)
 
@@ -424,7 +504,8 @@ class MeanRelativeError(Mean):
         relative_errors, sample_weight=sample_weight)
 
   def get_config(self):
-    config = {'normalizer': self.normalizer}
+    n = self.normalizer
+    config = {'normalizer': K.eval(n) if is_tensor_or_variable(n) else n}
     base_config = super(MeanRelativeError, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
@@ -471,8 +552,9 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = {'fn': self._fn}
-    config.update(self._fn_kwargs)
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
@@ -504,7 +586,7 @@ class Accuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
   ```
   """
@@ -512,12 +594,6 @@ class Accuracy(MeanMetricWrapper):
   def __init__(self, name='accuracy', dtype=None):
     super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Accuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
@@ -546,7 +622,7 @@ class BinaryAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
   ```
   """
@@ -563,12 +639,6 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(BinaryAccuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
@@ -577,7 +647,8 @@ class CategoricalAccuracy(MeanMetricWrapper):
   For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is
   [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
   If the weights were specified as [0.7, 0.3] then the categorical accuracy
-  would be .3.
+  would be .3. You can provide logits of classes as `y_pred`, since argmax of
+  logits and probabilities are same.
 
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
@@ -601,7 +672,7 @@ class CategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -619,12 +690,6 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CategoricalAccuracy, cls).from_config(config)
-
 
 @keras_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
@@ -633,7 +698,8 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   For example, if `y_true` is [[2], [1]] and `y_pred` is
   [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
   If the weights were specified as [0.7, 0.3] then the categorical accuracy
-  would be .3.
+  would be .3. You can provide logits of classes as `y_pred`, since argmax of
+  logits and probabilities are same.
 
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
@@ -654,7 +720,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -666,13 +732,8 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(SparseCategoricalAccuracy, cls).from_config(config)
-
 
+@keras_export('keras.metrics.TopKCategoricalAccuracy')
 class TopKCategoricalAccuracy(MeanMetricWrapper):
   """Computes how often targets are in the top `K` predictions.
 
@@ -687,7 +748,7 @@ class TopKCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
   ```
   """
@@ -704,13 +765,8 @@ class TopKCategoricalAccuracy(MeanMetricWrapper):
     super(TopKCategoricalAccuracy, self).__init__(
         top_k_categorical_accuracy, name, dtype=dtype, k=k)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(TopKCategoricalAccuracy, cls).from_config(config)
-
 
+@keras_export('keras.metrics.SparseTopKCategoricalAccuracy')
 class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
   """Computes how often integer targets are in the top `K` predictions.
 
@@ -725,7 +781,7 @@ class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
@@ -744,12 +800,6 @@ class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
     super(SparseTopKCategoricalAccuracy, self).__init__(
         sparse_top_k_categorical_accuracy, name, dtype=dtype, k=k)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(SparseTopKCategoricalAccuracy, cls).from_config(config)
-
 
 class _ConfusionMatrixConditionCount(Metric):
   """Calculates the number of the given confusion matrix condition."""
@@ -810,8 +860,8 @@ class _ConfusionMatrixConditionCount(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {'thresholds': self.init_thresholds}
@@ -845,7 +895,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
   ```
   """
@@ -895,7 +945,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
   ```
   """
@@ -945,7 +995,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
   ```
   """
@@ -995,7 +1045,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
   ```
   """
@@ -1055,7 +1105,7 @@ class Precision(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
   ```
   """
@@ -1133,8 +1183,8 @@ class Precision(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
@@ -1181,7 +1231,7 @@ class Recall(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
   ```
   """
@@ -1259,8 +1309,8 @@ class Recall(Metric):
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
@@ -1337,8 +1387,8 @@ class SensitivitySpecificityBase(Metric):
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
 
 @keras_export('keras.metrics.SensitivityAtSpecificity')
@@ -1372,7 +1422,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1453,7 +1503,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1503,6 +1553,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@keras_export('keras.metrics.AUC')
 class AUC(Metric):
   """Computes the approximate AUC (Area under the curve) via a Riemann sum.
 
@@ -1547,15 +1598,15 @@ class AUC(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.AUC()])
   ```
   """
 
   def __init__(self,
                num_thresholds=200,
-               curve=metrics_utils.AUCCurve.ROC,
-               summation_method=metrics_utils.AUCSummationMethod.INTERPOLATION,
+               curve='ROC',
+               summation_method='interpolation',
                name=None,
                dtype=None):
     """Creates an `AUC` instance.
@@ -1578,18 +1629,29 @@ class AUC(Metric):
     # Validate configurations.
     if num_thresholds <= 1:
       raise ValueError('`num_thresholds` must be > 1.')
-    if curve not in list(metrics_utils.AUCCurve):
+    if isinstance(curve, metrics_utils.AUCCurve) and curve not in list(
+        metrics_utils.AUCCurve):
       raise ValueError('Invalid curve: "{}". Valid options are: "{}"'.format(
           curve, list(metrics_utils.AUCCurve)))
-    if summation_method not in list(metrics_utils.AUCSummationMethod):
+    if isinstance(
+        summation_method,
+        metrics_utils.AUCSummationMethod) and summation_method not in list(
+            metrics_utils.AUCSummationMethod):
       raise ValueError(
           'Invalid summation method: "{}". Valid options are: "{}"'.format(
               summation_method, list(metrics_utils.AUCSummationMethod)))
 
     # Update properties.
     self.num_thresholds = num_thresholds
-    self.curve = curve
-    self.summation_method = summation_method
+    if isinstance(curve, metrics_utils.AUCCurve):
+      self.curve = curve
+    else:
+      self.curve = metrics_utils.AUCCurve.from_str(curve)
+    if isinstance(summation_method, metrics_utils.AUCSummationMethod):
+      self.summation_method = summation_method
+    else:
+      self.summation_method = metrics_utils.AUCSummationMethod.from_str(
+          summation_method)
     super(AUC, self).__init__(name=name, dtype=dtype)
 
     # Create metric variables
@@ -1743,63 +1805,67 @@ class AUC(Metric):
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
   def get_config(self):
     config = {
         'num_thresholds': self.num_thresholds,
-        'curve': self.curve,
-        'summation_method': self.summation_method,
+        'curve': self.curve.value,
+        'summation_method': self.summation_method.value,
     }
     base_config = super(AUC, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@keras_export('keras.metrics.CosineProximity')
-class CosineProximity(MeanMetricWrapper):
-  """Computes the cosine distance between the labels and predictions.
+@keras_export('keras.metrics.CosineSimilarity')
+class CosineSimilarity(MeanMetricWrapper):
+  """Computes the cosine similarity between the labels and predictions.
+
+  cosine similarity = (a . b) / ||a|| ||b||
+  (https://en.wikipedia.org/wiki/Cosine_similarity)
 
   For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine
-  proximity is -0.5.
+  similarity is 0.5.
 
-  This metric keeps the average cosine distance between `predictions` and
+  This metric keeps the average cosine similarity between `predictions` and
   `labels` over a stream of data.
 
   Usage:
   ```python
-  m = tf.metrics.CosineProximity()
-  m.update_state([0, 1, 1], [1, 0, 1])
-  print('Final result: ', m.result().numpy())  # Final result: -0.5
+  m = tf.keras.metrics.CosineSimilarity(axis=1)
+  m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.CosineProximity()])
+      metrics=[tf.keras.metrics.CosineSimilarity(axis=1)])
   ```
   """
 
-  def __init__(self, name='cosine_proximity', dtype=None, axis=-1):
-    """Creates a `CosineProximity` instance.
+  def __init__(self, name='cosine_similarity', dtype=None, axis=-1):
+    """Creates a `CosineSimilarity` instance.
 
     Args:
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
       axis: (Optional) Defaults to -1. The dimension along which the cosine
-        proximity is computed.
+        similarity is computed.
     """
-    super(CosineProximity, self).__init__(cosine, name, dtype=dtype, axis=axis)
-
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CosineProximity, cls).from_config(config)
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, name, dtype=dtype, axis=axis)
 
 
 @keras_export('keras.metrics.MeanAbsoluteError')
@@ -1811,7 +1877,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
 
   Usage:
   ```python
-  m = tf.metrics.MeanAbsoluteError()
+  m = tf.keras.metrics.MeanAbsoluteError()
   m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
   print('Final result: ', m.result().numpy())  # Final result: 0.75
   ```
@@ -1819,7 +1885,7 @@ class MeanAbsoluteError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()])
   ```
   """
@@ -1828,12 +1894,6 @@ class MeanAbsoluteError(MeanMetricWrapper):
     super(MeanAbsoluteError, self).__init__(
         mean_absolute_error, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(MeanAbsoluteError, cls).from_config(config)
-
 
 @keras_export('keras.metrics.MeanAbsolutePercentageError')
 class MeanAbsolutePercentageError(MeanMetricWrapper):
@@ -1853,7 +1913,7 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
   ```
   """
@@ -1862,12 +1922,6 @@ class MeanAbsolutePercentageError(MeanMetricWrapper):
     super(MeanAbsolutePercentageError, self).__init__(
         mean_absolute_percentage_error, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(MeanAbsolutePercentageError, cls).from_config(config)
-
 
 @keras_export('keras.metrics.MeanSquaredError')
 class MeanSquaredError(MeanMetricWrapper):
@@ -1887,7 +1941,7 @@ class MeanSquaredError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()])
   ```
   """
@@ -1896,12 +1950,6 @@ class MeanSquaredError(MeanMetricWrapper):
     super(MeanSquaredError, self).__init__(
         mean_squared_error, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(MeanSquaredError, cls).from_config(config)
-
 
 @keras_export('keras.metrics.MeanSquaredLogarithmicError')
 class MeanSquaredLogarithmicError(MeanMetricWrapper):
@@ -1921,7 +1969,7 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()])
   ```
   """
@@ -1930,32 +1978,32 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper):
     super(MeanSquaredLogarithmicError, self).__init__(
         mean_squared_logarithmic_error, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(MeanSquaredLogarithmicError, cls).from_config(config)
-
 
 @keras_export('keras.metrics.Hinge')
 class Hinge(MeanMetricWrapper):
   """Computes the hinge metric between `y_true` and `y_pred`.
 
-  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
-  the hinge metric value is 0.66.
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5]
+  the hinge metric value is 1.6.
 
   Usage:
 
   ```python
   m = tf.keras.metrics.Hinge()
-  m.update_state([0., 1., 1.], [1., 0., 1.])
-  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # result = max(0, 1-y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.6
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.Hinge()])
   ```
   """
@@ -1963,32 +2011,32 @@ class Hinge(MeanMetricWrapper):
   def __init__(self, name='hinge', dtype=None):
     super(Hinge, self).__init__(hinge, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Hinge, cls).from_config(config)
-
 
 @keras_export('keras.metrics.SquaredHinge')
 class SquaredHinge(MeanMetricWrapper):
   """Computes the squared hinge metric between `y_true` and `y_pred`.
 
-  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
-  the squared hinge metric value is 0.66.
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5]
+  the squared hinge metric value is 2.6.
 
   Usage:
 
   ```python
   m = tf.keras.metrics.SquaredHinge()
-  m.update_state([0., 1., 1.], [1., 0., 1.])
-  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # result = max(0, 1-y_true * y_pred) = [1.6^2 + 1.7^2 + 1.5^2] / 3
+
+  print('Final result: ', m.result().numpy())  # Final result: 2.6
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()])
   ```
   """
@@ -1996,12 +2044,6 @@ class SquaredHinge(MeanMetricWrapper):
   def __init__(self, name='squared_hinge', dtype=None):
     super(SquaredHinge, self).__init__(squared_hinge, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(SquaredHinge, cls).from_config(config)
-
 
 @keras_export('keras.metrics.CategoricalHinge')
 class CategoricalHinge(MeanMetricWrapper):
@@ -2021,7 +2063,7 @@ class CategoricalHinge(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()])
   ```
   """
@@ -2029,13 +2071,8 @@ class CategoricalHinge(MeanMetricWrapper):
   def __init__(self, name='categorical_hinge', dtype=None):
     super(CategoricalHinge, self).__init__(categorical_hinge, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CategoricalHinge, cls).from_config(config)
-
 
+@keras_export('keras.metrics.RootMeanSquaredError')
 class RootMeanSquaredError(Mean):
   """Computes root mean squared error metric between `y_true` and `y_pred`.
 
@@ -2050,7 +2087,7 @@ class RootMeanSquaredError(Mean):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()])
   ```
   """
@@ -2075,7 +2112,7 @@ class RootMeanSquaredError(Mean):
     y_pred = math_ops.cast(y_pred, self._dtype)
     y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
         y_pred, y_true, sample_weight)
-    error_sq = math_ops.square(y_pred - y_true)
+    error_sq = math_ops.squared_difference(y_pred, y_true)
     return super(RootMeanSquaredError, self).update_state(
         error_sq, sample_weight=sample_weight)
 
@@ -2083,15 +2120,16 @@ class RootMeanSquaredError(Mean):
     return math_ops.sqrt(math_ops.div_no_nan(self.total, self.count))
 
 
-class Logcosh(MeanMetricWrapper):
+@keras_export('keras.metrics.LogCoshError')
+class LogCoshError(MeanMetricWrapper):
   """Computes the logarithm of the hyperbolic cosine of the prediction error.
 
-  logcosh = log((exp(x) + exp(-x))/2) where x is the error `y_pred` - `y_true`.
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
 
   Usage:
 
   ```python
-  m = tf.keras.metrics.Logcosh()
+  m = tf.keras.metrics.LogCoshError()
   m.update_state([0., 1., 1.], [1., 0., 1.])
   print('Final result: ', m.result().numpy())  # Final result: 0.289
   ```
@@ -2099,25 +2137,20 @@ class Logcosh(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', metrics=[tf.keras.metrics.Logcosh()])
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.LogCoshError()])
   ```
   """
 
   def __init__(self, name='logcosh', dtype=None):
-    super(Logcosh, self).__init__(logcosh, name, dtype=dtype)
-
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Logcosh, cls).from_config(config)
+    super(LogCoshError, self).__init__(logcosh, name, dtype=dtype)
 
 
+@keras_export('keras.metrics.Poisson')
 class Poisson(MeanMetricWrapper):
-  """Computes the poisson metric between `y_true` and `y_pred`.
+  """Computes the Poisson metric between `y_true` and `y_pred`.
 
-  metric = y_pred - y_true * log(y_pred)
+  `metric = y_pred - y_true * log(y_pred)`
 
   Usage:
 
@@ -2130,7 +2163,7 @@ class Poisson(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', metrics=[tf.keras.metrics.Poisson()])
   ```
   """
@@ -2138,22 +2171,17 @@ class Poisson(MeanMetricWrapper):
   def __init__(self, name='poisson', dtype=None):
     super(Poisson, self).__init__(poisson, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Poisson, cls).from_config(config)
-
 
-class KullbackLeiblerDivergence(MeanMetricWrapper):
-  """Computes kullback leibler divergence metric between `y_true` and `y_pred`.
+@keras_export('keras.metrics.KLDivergence')
+class KLDivergence(MeanMetricWrapper):
+  """Computes Kullback Leibler divergence metric between `y_true` and `y_pred`.
 
-  metric = y_true * log(y_true / y_pred)
+  `metric = y_true * log(y_true / y_pred)`
 
   Usage:
 
   ```python
-  m = tf.keras.metrics.KullbackLeiblerDivergence()
+  m = tf.keras.metrics.KLDivergence()
   m.update_state([.4, .9, .2], [.5, .8, .12])
   print('Final result: ', m.result().numpy())  # Final result: -0.043
   ```
@@ -2161,22 +2189,17 @@ class KullbackLeiblerDivergence(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', metrics=[tf.keras.metrics.KullbackLeiblerDivergence()])
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.KLDivergence()])
   ```
   """
 
   def __init__(self, name='kullback_leibler_divergence', dtype=None):
-    super(KullbackLeiblerDivergence, self).__init__(
+    super(KLDivergence, self).__init__(
         kullback_leibler_divergence, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(KullbackLeiblerDivergence, cls).from_config(config)
-
 
+@keras_export('keras.metrics.MeanIoU')
 class MeanIoU(Metric):
   """Computes the mean Intersection-Over-Union metric.
 
@@ -2207,7 +2230,7 @@ class MeanIoU(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -2266,7 +2289,7 @@ class MeanIoU(Metric):
         self.num_classes,
         weights=sample_weight,
         dtype=dtypes.float64)
-    return state_ops.assign_add(self.total_cm, current_cm)
+    return self.total_cm.assign_add(current_cm)
 
   def result(self):
     """Compute the mean intersection-over-union via the confusion matrix."""
@@ -2301,6 +2324,7 @@ class MeanIoU(Metric):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@keras_export('keras.metrics.MeanTensor')
 class MeanTensor(Metric):
   """Computes the element-wise (weighted) mean of the given tensors.
 
@@ -2312,7 +2336,7 @@ class MeanTensor(Metric):
   Usage:
 
   ```python
-  m = tf.metrics.MeanTensor()
+  m = tf.keras.metrics.MeanTensor()
   m.update_state([0, 1, 2, 3])
   m.update_state([4, 5, 6, 7])
   print('Result: ', m.result().numpy())  # Result: [2, 3, 4, 5]
@@ -2393,9 +2417,9 @@ class MeanTensor(Metric):
       num_values = math_ops.multiply(num_values, sample_weight)
       values = math_ops.multiply(values, sample_weight)
 
-    update_total_op = state_ops.assign_add(self._total, values)
+    update_total_op = self._total.assign_add(values)
     with ops.control_dependencies([update_total_op]):
-      return state_ops.assign_add(self._count, num_values)
+      return self._count.assign_add(num_values)
 
   def result(self):
     if not self._built:
@@ -2407,12 +2431,16 @@ class MeanTensor(Metric):
 
   def reset_states(self):
     if self._built:
-      for v in self.variables:
-        K.set_value(v, np.zeros(self._shape.as_list()))
+      K.batch_set_value(
+          [(v, np.zeros(self._shape.as_list())) for v in self.variables])
 
 
+@keras_export('keras.metrics.BinaryCrossentropy')
 class BinaryCrossentropy(MeanMetricWrapper):
-  """Computes the binary crossentropy between `y_true` and `y_pred`.
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are only two
+  label classes (0 and 1).
 
   Usage:
 
@@ -2436,7 +2464,7 @@ class BinaryCrossentropy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -2461,7 +2489,6 @@ class BinaryCrossentropy(MeanMetricWrapper):
         e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for
         label `0` and `0.9` for label `1`"
     """
-    label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
 
     super(BinaryCrossentropy, self).__init__(
         binary_crossentropy,
@@ -2470,11 +2497,196 @@ class BinaryCrossentropy(MeanMetricWrapper):
         from_logits=from_logits,
         label_smoothing=label_smoothing)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(BinaryCrossentropy, cls).from_config(config)
+
+@keras_export('keras.metrics.CategoricalCrossentropy')
+class CategoricalCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are multiple
+  label classes (2 or more). Here we assume that labels are given as a `one_hot`
+  representation. eg., When labels values are [2, 0, 1],
+   `y_true` = [[0, 0, 1], [1, 0, 0], [0, 1, 0]].
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalCrossentropy()
+  m.update_state([[0, 1, 0], [0, 0, 1]],
+                 [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+
+  # EPSILON = 1e-7, y = y_true, y` = y_pred
+  # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+  # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+  # xent = -sum(y * log(y'), axis = -1)
+  #      = -((log 0.95), (log 0.1))
+  #      = [0.051, 2.302]
+  # Reduced xent = (0.051 + 2.302) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.176
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalCrossentropy()])
+  ```
+
+  Args:
+    name: (Optional) string name of the metric instance.
+    dtype: (Optional) data type of the metric result.
+    from_logits: (Optional ) Whether `y_pred` is expected to be a logits tensor.
+      By default, we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
+  """
+
+  def __init__(self,
+               name='categorical_crossentropy',
+               dtype=None,
+               from_logits=False,
+               label_smoothing=0):
+
+    super(CategoricalCrossentropy, self).__init__(
+        categorical_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
+
+
+@keras_export('keras.metrics.SparseCategoricalCrossentropy')
+class SparseCategoricalCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  Use this crossentropy metric when there are two or more label classes.
+  We expect labels to be provided as integers. If you want to provide labels
+  using `one-hot` representation, please use `CategoricalCrossentropy` metric.
+  There should be `# classes` floating point values per feature for `y_pred`
+  and a single floating point value per feature for `y_true`.
+
+  In the snippet below, there is a single floating point value per example for
+  `y_true` and `# classes` floating pointing values per example for `y_pred`.
+  The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+  `[batch_size, num_classes]`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseCategoricalCrossentropy()
+  m.update_state(
+    [1, 2],
+    [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+
+  # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+  # logits = log(y_pred)
+  # softmax = exp(logits) / sum(exp(logits), axis=-1)
+  # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+  # xent = -sum(y * log(softmax), 1)
+  # log(softmax) = [[-2.9957, -0.0513, -16.1181], [-2.3026, -0.2231, -2.3026]]
+  # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+
+  # xent = [0.0513, 2.3026]
+  # Reduced xent = (0.0513 + 2.3026) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.176
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.SparseCategoricalCrossentropy()])
+  ```
+
+  Args:
+    name: (Optional) string name of the metric instance.
+    dtype: (Optional) data type of the metric result.
+    from_logits: (Optional ) Whether `y_pred` is expected to be a logits tensor.
+      By default, we assume that `y_pred` encodes a probability distribution.
+    axis: (Optional) Defaults to -1. The dimension along which the metric is
+      computed.
+  """
+
+  def __init__(self,
+               name='sparse_categorical_crossentropy',
+               dtype=None,
+               from_logits=False,
+               axis=-1):
+
+    super(SparseCategoricalCrossentropy, self).__init__(
+        sparse_categorical_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        axis=axis)
+
+
+class SumOverBatchSize(Reduce):
+  """Computes the weighted sum over batch size of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the metric value is 4.
+  If the weights were specified as [1, 1, 0, 0] then the value would be 1.
+
+  This metric creates two variables, `total` and `count` that are used to
+  compute the average of `values`. This average is ultimately returned as sum
+  over batch size which is an idempotent operation that simply divides `total`
+  by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+  """
+
+  def __init__(self, name='sum_over_batch_size', dtype=None):
+    super(SumOverBatchSize, self).__init__(
+        reduction=metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+        name=name,
+        dtype=dtype)
+
+
+class SumOverBatchSizeMetricWrapper(SumOverBatchSize):
+  """Wraps a function with the `SumOverBatchSizeMetricWrapper` metric."""
+
+  def __init__(self, fn, name=None, dtype=None, **kwargs):
+    """Creates a `SumOverBatchSizeMetricWrapper` instance.
+
+    Args:
+      fn: The metric function to wrap, with signature `fn(y_true, y_pred,
+        **kwargs)`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super(SumOverBatchSizeMetricWrapper, self).__init__(name=name, dtype=dtype)
+    self._fn = fn
+    self._fn_kwargs = kwargs
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+    return super(SumOverBatchSizeMetricWrapper, self).update_state(
+        matches, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+    base_config = super(SumOverBatchSizeMetricWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 def accuracy(y_true, y_pred):
@@ -2540,7 +2752,7 @@ mse = MSE = mean_squared_error
 mae = MAE = mean_absolute_error
 mape = MAPE = mean_absolute_percentage_error
 msle = MSLE = mean_squared_logarithmic_error
-cosine = cosine_proximity
+cosine_proximity = cosine_similarity
 
 
 def clone_metric(metric):
diff --git a/tensorflow/python/keras/metrics_confusion_matrix_test.py b/tensorflow/python/keras/metrics_confusion_matrix_test.py
index 6a0fcfed07730e6d72a16638a8dc0f20860a97b6..972f7b6de7bd6a8b856737a57bf79ae58746e758 100644
--- a/tensorflow/python/keras/metrics_confusion_matrix_test.py
+++ b/tensorflow/python/keras/metrics_confusion_matrix_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import json
+
 from absl.testing import parameterized
 import numpy as np
 
@@ -949,8 +951,8 @@ class AUCTest(test.TestCase):
   def test_config(self):
     auc_obj = metrics.AUC(
         num_thresholds=100,
-        curve=metrics_utils.AUCCurve.PR,
-        summation_method=metrics_utils.AUCSummationMethod.MAJORING,
+        curve='PR',
+        summation_method='majoring',
         name='auc_1')
     self.assertEqual(auc_obj.name, 'auc_1')
     self.assertEqual(len(auc_obj.variables), 4)
@@ -958,6 +960,8 @@ class AUCTest(test.TestCase):
     self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR)
     self.assertEqual(auc_obj.summation_method,
                      metrics_utils.AUCSummationMethod.MAJORING)
+    old_config = auc_obj.get_config()
+    self.assertDictEqual(old_config, json.loads(json.dumps(old_config)))
 
     # Check save and restore config
     auc_obj2 = metrics.AUC.from_config(auc_obj.get_config())
@@ -967,6 +971,8 @@ class AUCTest(test.TestCase):
     self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR)
     self.assertEqual(auc_obj2.summation_method,
                      metrics_utils.AUCSummationMethod.MAJORING)
+    new_config = auc_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
 
   def test_value_is_idempotent(self):
     self.setup()
@@ -1021,8 +1027,7 @@ class AUCTest(test.TestCase):
   def test_weighted_roc_majoring(self):
     self.setup()
     auc_obj = metrics.AUC(
-        num_thresholds=self.num_thresholds,
-        summation_method=metrics_utils.AUCSummationMethod.MAJORING)
+        num_thresholds=self.num_thresholds, summation_method='majoring')
     self.evaluate(variables.variables_initializer(auc_obj.variables))
     result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
 
@@ -1037,8 +1042,7 @@ class AUCTest(test.TestCase):
   def test_weighted_roc_minoring(self):
     self.setup()
     auc_obj = metrics.AUC(
-        num_thresholds=self.num_thresholds,
-        summation_method=metrics_utils.AUCSummationMethod.MINORING)
+        num_thresholds=self.num_thresholds, summation_method='minoring')
     self.evaluate(variables.variables_initializer(auc_obj.variables))
     result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
 
@@ -1054,8 +1058,8 @@ class AUCTest(test.TestCase):
     self.setup()
     auc_obj = metrics.AUC(
         num_thresholds=self.num_thresholds,
-        curve=metrics_utils.AUCCurve.PR,
-        summation_method=metrics_utils.AUCSummationMethod.MAJORING)
+        curve='PR',
+        summation_method='majoring')
     self.evaluate(variables.variables_initializer(auc_obj.variables))
     result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
 
@@ -1071,8 +1075,8 @@ class AUCTest(test.TestCase):
     self.setup()
     auc_obj = metrics.AUC(
         num_thresholds=self.num_thresholds,
-        curve=metrics_utils.AUCCurve.PR,
-        summation_method=metrics_utils.AUCSummationMethod.MINORING)
+        curve='PR',
+        summation_method='minoring')
     self.evaluate(variables.variables_initializer(auc_obj.variables))
     result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
 
@@ -1086,10 +1090,7 @@ class AUCTest(test.TestCase):
 
   def test_weighted_pr_interpolation(self):
     self.setup()
-    auc_obj = metrics.AUC(
-        num_thresholds=self.num_thresholds,
-        curve=metrics_utils.AUCCurve.PR,
-        summation_method=metrics_utils.AUCSummationMethod.INTERPOLATION)
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve='PR')
     self.evaluate(variables.variables_initializer(auc_obj.variables))
     result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
 
@@ -1115,6 +1116,16 @@ class AUCTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 1.'):
       metrics.AUC(num_thresholds=1)
 
+  def test_invalid_curve(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid AUC curve value "Invalid".'):
+      metrics.AUC(curve='Invalid')
+
+  def test_invalid_summation_method(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Invalid AUC summation method value "Invalid".'):
+      metrics.AUC(summation_method='Invalid')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..abef3c4d3f1bd78ae70ea9662c3d49f473c0561c
--- /dev/null
+++ b/tensorflow/python/keras/metrics_correctness_test.py
@@ -0,0 +1,322 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests metrics correctness using Keras model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import tf2
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase):
+
+  def _get_multi_io_model(self):
+    inp_1 = layers.Input(shape=(1,), name='input_1')
+    inp_2 = layers.Input(shape=(1,), name='input_2')
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    inputs = np.asarray([[1.], [2.], [3.], [4.]])
+    targets = np.asarray([[2.], [4.], [6.], [8.]])
+    w1 = np.asarray([2., 3., 4., 5.])
+    w2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      x = [inputs[start:end], inputs[start:end]]
+      y = [targets[start:end], targets[start:end]]
+      w = [w1[start:end], w2[start:end]]
+      yield x, y, w
+
+  def setUp(self):
+    super(TestMetricsCorrectnessMultiIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights_1 = np.asarray([2., 3., 4., 5.])
+    self.weights_2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric `output_1`, `output_2`:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Weighted metric `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = (3.5 + 2.5) + (1.5 + 0.5)
+    #   Result = 4.375
+
+    # Loss `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    # Loss `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = 2 + 2
+    #   Result = 8.75
+
+    # Total loss = 32.5 + 8.75 = 41.25
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'output_1_mean_squared_error': [7.5, 7.5],
+        'output_2_mean_squared_error': [7.5, 7.5],
+        'output_1_' + wmse: [9.286, 9.286],
+        'output_2_' + wmse: [4.375, 4.375],
+        'loss': [41.25, 41.25],
+        'output_1_loss': [32.5, 32.5],
+        'output_2_loss': [8.75, 8.75],
+    }
+
+    # In the order: 'loss', 'output_1_loss', 'output_2_loss',
+    # 'output_1_mean_squared_error', 'output_1_mean_squared_error_2',
+    # 'output_2_mean_squared_error', 'output_2_mean_squared_error_2'
+    self.expected_batch_result = [41.25, 32.5, 8.75, 7.5, 9.286, 7.5, 4.375]
+
+  def test_fit(self):
+    model = self._get_multi_io_model()
+    history = model.fit([self.x, self.x], [self.y, self.y],
+                        sample_weight={
+                            'output_1': self.weights_1,
+                            'output_2': self.weights_2,
+                        },
+                        batch_size=2,
+                        epochs=2,
+                        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate([self.x, self.x], [self.y, self.y],
+                                 batch_size=2,
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate([x, x], [y, y], sample_weight=[w, w], batch_size=5)[3]
+    mse2 = model.evaluate([x, x], [y, y], sample_weight=[w, w],
+                          batch_size=10)[3]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.train_on_batch([self.x, self.x], [self.y, self.y],
+                                  sample_weight={
+                                      'output_1': self.weights_1,
+                                      'output_2': self.weights_2,
+                                  })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.test_on_batch([self.x, self.x], [self.y, self.y],
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_multi_io_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase):
+
+  def _get_model(self):
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out = layers.Dense(
+        1, kernel_initializer='ones', name='output', trainable=False)
+    model = testing_utils.get_model_from_layers([x, out], input_shape=(1,))
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    x = np.asarray([[1.], [2.], [3.], [4.]])
+    y = np.asarray([[2.], [4.], [6.], [8.]])
+    w = np.asarray([2., 3., 4., 5.])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      yield x[start:end], y[start:end], w[start:end]
+
+  def setUp(self):
+    super(TestMetricsCorrectnessSingleIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights = np.asarray([2., 3., 4., 5.])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Total loss:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130,
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'mean_squared_error': [7.5, 7.5],
+        wmse: [9.286, 9.286],
+        'loss': [32.5, 32.5]
+    }
+
+    # In the order: 'loss', 'mean_squared_error', 'mean_squared_error_2'
+    self.expected_batch_result = [32.5, 7.5, 9.286]
+
+  def test_fit(self):
+    model = self._get_model()
+    history = model.fit(
+        self.x,
+        self.y,
+        sample_weight=self.weights,
+        batch_size=2,
+        epochs=2,
+        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_model()
+    eval_result = model.evaluate(
+        self.x, self.y, batch_size=2, sample_weight=self.weights)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[1]
+    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[1]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_model()
+    result = model.train_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_model()
+    result = model.test_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 4e755dd3017403e242841be32d607203e15821dc..d210f0ebeea6eeaa80c27b407a08d539d38f849a 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import json
 import math
 import os
 import numpy as np
@@ -34,9 +35,127 @@ from tensorflow.python.keras import metrics
 from tensorflow.python.keras import Model
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasSumTest(test.TestCase):
+
+  def test_sum(self):
+    m = metrics.Sum(name='my_sum')
+
+    # check config
+    self.assertEqual(m.name, 'my_sum')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 1)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check initial state
+    self.assertEqual(self.evaluate(m.total), 0)
+
+    # check __call__()
+    self.assertEqual(self.evaluate(m(100)), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
+    self.evaluate(update_op)
+    self.assertAlmostEqual(self.evaluate(m.result()), 106)
+    self.assertEqual(self.evaluate(m.total), 106)  # 100 + 1 + 5
+
+    # check reset_states()
+    m.reset_states()
+    self.assertEqual(self.evaluate(m.total), 0)
+
+  def test_sum_with_sample_weight(self):
+    m = metrics.Sum(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check scalar weight
+    result_t = m(100, sample_weight=0.5)
+    self.assertEqual(self.evaluate(result_t), 50)
+    self.assertEqual(self.evaluate(m.total), 50)
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 52., 4)  # 50 + 1 + 5 * 0.2
+    self.assertAlmostEqual(self.evaluate(m.total), 52., 4)
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAlmostEqual(self.evaluate(result_t), 53.5, 1)  # 52 + 0.5 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 53.5, 1)
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAlmostEqual(self.evaluate(result_t), 55.5, 1)  # 53.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 55.5, 1)
+
+    # check weights expand
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAlmostEqual(self.evaluate(result_t), 57.5, 2)  # 55.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 57.5, 1)
+
+    # check values reduced to the dimensions of weight
+    result_t = m([[[1., 2.], [3., 2.], [0.5, 4.]]], sample_weight=[0.5])
+    result = np.round(self.evaluate(result_t), decimals=2)
+    # result = (prev: 57.5) + 0.5 + 1 + 1.5 + 1 + 0.25 + 2
+    self.assertAlmostEqual(result, 63.75, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 63.75, 2)
+
+  def test_sum_graph_with_placeholder(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      m = metrics.Sum()
+      v = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+      self.evaluate(variables.variables_initializer(m.variables))
+
+      # check __call__()
+      result_t = m(v, sample_weight=w)
+      result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
+      self.assertEqual(result, 50)
+      self.assertEqual(self.evaluate(m.total), 50)
+
+      # check update_state() and result()
+      result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
+      self.assertAlmostEqual(result, 52., 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(self.evaluate(m.total), 52., 2)
+
+  def test_save_restore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    m = metrics.Sum()
+    checkpoint = trackable_utils.Checkpoint(sum=m)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # update state
+    self.evaluate(m(100.))
+    self.evaluate(m(200.))
+
+    # save checkpoint and then add an update
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(m(1000.))
+
+    # restore to the same checkpoint sum object (= 300)
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.evaluate(m(300.))
+    self.assertEqual(600., self.evaluate(m.result()))
+
+    # restore to a different checkpoint sum object
+    restore_sum = metrics.Sum()
+    restore_checkpoint = trackable_utils.Checkpoint(sum=restore_sum)
+    status = restore_checkpoint.restore(save_path)
+    restore_update = restore_sum(300.)
+    status.assert_consumed().run_restore_ops()
+    self.evaluate(restore_update)
+    self.assertEqual(600., self.evaluate(restore_sum.result()))
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -149,7 +268,7 @@ class KerasMeanTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
     m = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=m)
+    checkpoint = trackable_utils.Checkpoint(mean=m)
     self.evaluate(variables.variables_initializer(m.variables))
 
     # update state
@@ -167,7 +286,7 @@ class KerasMeanTest(test.TestCase):
 
     # restore to a different checkpoint mean object
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
@@ -180,10 +299,10 @@ class KerasMeanTest(test.TestCase):
 class KerasAccuracyTest(test.TestCase):
 
   def test_accuracy(self):
-    acc_obj = metrics.Accuracy(name='my acc')
+    acc_obj = metrics.Accuracy(name='my_acc')
 
     # check config
-    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertEqual(acc_obj.name, 'my_acc')
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
@@ -197,7 +316,7 @@ class KerasAccuracyTest(test.TestCase):
 
     # Check save and restore config
     a2 = metrics.Accuracy.from_config(acc_obj.get_config())
-    self.assertEqual(a2.name, 'my acc')
+    self.assertEqual(a2.name, 'my_acc')
     self.assertTrue(a2.stateful)
     self.assertEqual(len(a2.variables), 2)
     self.assertEqual(a2.dtype, dtypes.float32)
@@ -208,10 +327,10 @@ class KerasAccuracyTest(test.TestCase):
     self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
 
   def test_binary_accuracy(self):
-    acc_obj = metrics.BinaryAccuracy(name='my acc')
+    acc_obj = metrics.BinaryAccuracy(name='my_acc')
 
     # check config
-    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertEqual(acc_obj.name, 'my_acc')
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
@@ -247,10 +366,10 @@ class KerasAccuracyTest(test.TestCase):
     self.assertAlmostEqual(result, 0.5, 2)
 
   def test_categorical_accuracy(self):
-    acc_obj = metrics.CategoricalAccuracy(name='my acc')
+    acc_obj = metrics.CategoricalAccuracy(name='my_acc')
 
     # check config
-    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertEqual(acc_obj.name, 'my_acc')
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
@@ -270,10 +389,10 @@ class KerasAccuracyTest(test.TestCase):
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
   def test_sparse_categorical_accuracy(self):
-    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc')
 
     # check config
-    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertEqual(acc_obj.name, 'my_acc')
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
@@ -293,10 +412,10 @@ class KerasAccuracyTest(test.TestCase):
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
   def test_sparse_categorical_accuracy_mismatched_dims(self):
-    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc')
 
     # check config
-    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertEqual(acc_obj.name, 'my_acc')
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
@@ -316,7 +435,7 @@ class KerasAccuracyTest(test.TestCase):
 
   def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self):
     with context.graph_mode(), self.cached_session() as sess:
-      acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+      acc_obj = metrics.SparseCategoricalAccuracy(name='my_acc')
       self.evaluate(variables.variables_initializer(acc_obj.variables))
 
       t = array_ops.placeholder(dtypes.float32)
@@ -335,7 +454,7 @@ class KerasAccuracyTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
 
   def l2_norm(self, x, axis):
     epsilon = 1e-12
@@ -349,25 +468,25 @@ class CosineProximityTest(test.TestCase):
 
     y_true = self.l2_norm(self.np_y_true, axis)
     y_pred = self.l2_norm(self.np_y_pred, axis)
-    self.expected_loss = -np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
 
     self.y_true = constant_op.constant(self.np_y_true)
     self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    cosine_obj = metrics.CosineProximity(
+    cosine_obj = metrics.CosineSimilarity(
         axis=2, name='my_cos', dtype=dtypes.int32)
     self.assertEqual(cosine_obj.name, 'my_cos')
     self.assertEqual(cosine_obj._dtype, dtypes.int32)
 
     # Check save and restore config
-    cosine_obj2 = metrics.CosineProximity.from_config(cosine_obj.get_config())
+    cosine_obj2 = metrics.CosineSimilarity.from_config(cosine_obj.get_config())
     self.assertEqual(cosine_obj2.name, 'my_cos')
     self.assertEqual(cosine_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
     self.setup()
-    cosine_obj = metrics.CosineProximity()
+    cosine_obj = metrics.CosineSimilarity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
@@ -375,7 +494,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_weighted(self):
     self.setup()
-    cosine_obj = metrics.CosineProximity()
+    cosine_obj = metrics.CosineSimilarity()
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     sample_weight = np.asarray([1.2, 3.4])
     loss = cosine_obj(
@@ -388,7 +507,7 @@ class CosineProximityTest(test.TestCase):
 
   def test_axis(self):
     self.setup(axis=1)
-    cosine_obj = metrics.CosineProximity(axis=1)
+    cosine_obj = metrics.CosineSimilarity(axis=1)
     self.evaluate(variables.variables_initializer(cosine_obj.variables))
     loss = cosine_obj(self.y_true, self.y_pred)
     expected_loss = np.mean(self.expected_loss)
@@ -567,26 +686,43 @@ class HingeTest(test.TestCase):
   def test_unweighted(self):
     hinge_obj = metrics.Hinge()
     self.evaluate(variables.variables_initializer(hinge_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #        = [0.6, 0.4125]
+    # reduced metric = (0.6 + 0.4125) / 2
 
     update_op = hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
     result = hinge_obj.result()
-    self.assertAllClose(0.65, result, atol=1e-5)
+    self.assertAllClose(0.506, result, atol=1e-3)
 
   def test_weighted(self):
     hinge_obj = metrics.Hinge()
     self.evaluate(variables.variables_initializer(hinge_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    y_true = constant_op.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    sample_weight = constant_op.constant([1.5, 2.])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #        = [0.6, 0.4125]
+    # weighted metric = [0.6 * 1.5, 0.4125 * 2]
+    # reduced metric = (0.6 * 1.5 + 0.4125 * 2) / (1.5 + 2)
+
     result = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(0.65714, self.evaluate(result), atol=1e-5)
+    self.assertAllClose(0.493, self.evaluate(result), atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -605,26 +741,49 @@ class SquaredHingeTest(test.TestCase):
   def test_unweighted(self):
     sq_hinge_obj = metrics.SquaredHinge()
     self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #        = [0.485, 0.2431]
+    # reduced metric = (0.485 + 0.2431) / 2
 
     update_op = sq_hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
     result = sq_hinge_obj.result()
-    self.assertAllClose(0.65, result, atol=1e-5)
+    self.assertAllClose(0.364, result, atol=1e-3)
 
   def test_weighted(self):
     sq_hinge_obj = metrics.SquaredHinge()
     self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    y_true = constant_op.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    sample_weight = constant_op.constant([1.5, 2.])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #        = [0.485, 0.2431]
+    # weighted metric = [0.485 * 1.5, 0.2431 * 2]
+    # reduced metric = (0.485 * 1.5 + 0.2431 * 2) / (1.5 + 2)
+
     result = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(0.65714, self.evaluate(result), atol=1e-5)
+    self.assertAllClose(0.347, self.evaluate(result), atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -778,7 +937,7 @@ class SparseTopKCategoricalAccuracyTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class LogcoshTest(test.TestCase):
+class LogCoshErrorTest(test.TestCase):
 
   def setup(self):
     y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
@@ -792,13 +951,13 @@ class LogcoshTest(test.TestCase):
     self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    logcosh_obj = metrics.Logcosh(name='logcosh', dtype=dtypes.int32)
+    logcosh_obj = metrics.LogCoshError(name='logcosh', dtype=dtypes.int32)
     self.assertEqual(logcosh_obj.name, 'logcosh')
     self.assertEqual(logcosh_obj._dtype, dtypes.int32)
 
   def test_unweighted(self):
     self.setup()
-    logcosh_obj = metrics.Logcosh()
+    logcosh_obj = metrics.LogCoshError()
     self.evaluate(variables.variables_initializer(logcosh_obj.variables))
 
     update_op = logcosh_obj.update_state(self.y_true, self.y_pred)
@@ -809,7 +968,7 @@ class LogcoshTest(test.TestCase):
 
   def test_weighted(self):
     self.setup()
-    logcosh_obj = metrics.Logcosh()
+    logcosh_obj = metrics.LogCoshError()
     self.evaluate(variables.variables_initializer(logcosh_obj.variables))
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     result = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
@@ -867,7 +1026,7 @@ class PoissonTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class KullbackLeiblerDivergenceTest(test.TestCase):
+class KLDivergenceTest(test.TestCase):
 
   def setup(self):
     y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
@@ -880,17 +1039,17 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
     self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    k_obj = metrics.KullbackLeiblerDivergence(name='kld', dtype=dtypes.int32)
+    k_obj = metrics.KLDivergence(name='kld', dtype=dtypes.int32)
     self.assertEqual(k_obj.name, 'kld')
     self.assertEqual(k_obj._dtype, dtypes.int32)
 
-    k_obj2 = metrics.KullbackLeiblerDivergence.from_config(k_obj.get_config())
+    k_obj2 = metrics.KLDivergence.from_config(k_obj.get_config())
     self.assertEqual(k_obj2.name, 'kld')
     self.assertEqual(k_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
     self.setup()
-    k_obj = metrics.KullbackLeiblerDivergence()
+    k_obj = metrics.KLDivergence()
     self.evaluate(variables.variables_initializer(k_obj.variables))
 
     update_op = k_obj.update_state(self.y_true, self.y_pred)
@@ -901,7 +1060,7 @@ class KullbackLeiblerDivergenceTest(test.TestCase):
 
   def test_weighted(self):
     self.setup()
-    k_obj = metrics.KullbackLeiblerDivergence()
+    k_obj = metrics.KLDivergence()
     self.evaluate(variables.variables_initializer(k_obj.variables))
 
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
@@ -1052,7 +1211,7 @@ class MeanIoUTest(test.TestCase):
     self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
-class MeanTensorTest(keras_parameterized.TestCase):
+class MeanTensorTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_config(self):
@@ -1128,7 +1287,7 @@ class MeanTensorTest(keras_parameterized.TestCase):
     self.assertAllClose(self.evaluate(m.count), [3, 1.4])
 
     # check weights expand
-    m = metrics.MeanTensor((2, 1), dtype=dtypes.float64)
+    m = metrics.MeanTensor(dtype=dtypes.float64)
     self.evaluate(variables.variables_initializer(m.variables))
     result_t = m([[1], [5]], sample_weight=[1, 0.2])
     self.assertAllClose(self.evaluate(result_t), [[1], [5]])
@@ -1210,14 +1369,14 @@ class BinaryCrossentropyTest(test.TestCase):
     self.assertEqual(bce_obj._dtype, dtypes.int32)
 
     old_config = bce_obj.get_config()
-    self.assertAllClose(self.evaluate(old_config['label_smoothing']), 0.2, 1e-3)
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
 
     # Check save and restore config
     bce_obj2 = metrics.BinaryCrossentropy.from_config(old_config)
     self.assertEqual(bce_obj2.name, 'bce')
     self.assertEqual(bce_obj2._dtype, dtypes.int32)
     new_config = bce_obj2.get_config()
-    self.assertAllClose(self.evaluate(new_config['label_smoothing']), 0.2, 1e-3)
+    self.assertDictEqual(old_config, new_config)
 
   def test_unweighted(self):
     bce_obj = metrics.BinaryCrossentropy()
@@ -1320,6 +1479,345 @@ class BinaryCrossentropyTest(test.TestCase):
     self.assertAllClose(expected_value, self.evaluate(result), atol=1e-3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = metrics.CategoricalCrossentropy(
+        name='cce', dtype=dtypes.int32, label_smoothing=0.2)
+    self.assertEqual(cce_obj.name, 'cce')
+    self.assertEqual(cce_obj._dtype, dtypes.int32)
+
+    old_config = cce_obj.get_config()
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
+
+    # Check save and restore config
+    cce_obj2 = metrics.CategoricalCrossentropy.from_config(old_config)
+    self.assertEqual(cce_obj2.name, 'cce')
+    self.assertEqual(cce_obj2._dtype, dtypes.int32)
+    new_config = cce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    result = cce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Reduced metric = (0.051 + 2.302) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+  def test_unweighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    result = cce_obj(y_true, logits)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+
+    # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]]
+    # sum(exp(logits), axis=-1) = [8106.802, 2986.394]
+    # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]]
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]]
+    # xent = [0.00045, 7.00182]
+    # Reduced xent = (0.00045 + 7.00182) / 2
+
+    self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3)
+
+  def test_weighted(self):
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Weighted metric = [0.051 * 1.5, 2.302 * 2.]
+    # Reduced metric = (0.051 * 1.5 + 2.302 * 2.) / 3.5
+
+    self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, logits, sample_weight=sample_weight)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # xent = [0.00045, 7.00182]
+    # weighted xent = [0.000675, 14.00364]
+    # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3)
+
+  def test_label_smoothing(self):
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    label_smoothing = 0.1
+
+    # Label smoothing: z' = z * (1 - L) + L/n,
+    #     where L = label smoothing value and n = num classes
+    # Label value 1 becomes: 1 - L + L/n
+    # Label value 0 becomes: L/n
+    # y_true with label_smoothing = [[0.0333, 0.9333, 0.0333],
+    #                               [0.0333, 0.0333, 0.9333]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[-0.26641, -0.00042, -0.29971],
+    #                          [-0.23316, -0.00006, -6.53479]]
+    # xent = [0.56654, 6.76801]
+    # Reduced xent = (0.56654 + 6.76801) / 2
+
+    cce_obj = metrics.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose(self.evaluate(loss), 3.667, atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseCategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(
+        name='scce', dtype=dtypes.int32)
+    self.assertEqual(scce_obj.name, 'scce')
+    self.assertEqual(scce_obj.dtype, dtypes.int32)
+    old_config = scce_obj.get_config()
+    self.assertDictEqual(old_config, json.loads(json.dumps(old_config)))
+
+    # Check save and restore config
+    scce_obj2 = metrics.SparseCategoricalCrossentropy.from_config(old_config)
+    self.assertEqual(scce_obj2.name, 'scce')
+    self.assertEqual(scce_obj2.dtype, dtypes.int32)
+    new_config = scce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    result = scce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -0.0513, -16.1181],
+    #                      [-2.3026, -0.2231, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # sum(exp(logits), axis=-1) = [1, 1]
+    # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # log(softmax) = [[-2.9957, -0.0513, -16.1181],
+    #                 [-2.3026, -0.2231, -2.3026]]
+    # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Reduced xent = (0.0513 + 2.3026) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+  def test_unweighted_from_logits(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    result = scce_obj(y_true, logits)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y_true * log(softmax), 1)
+
+    # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]]
+    # sum(exp(logits), axis=-1) = [8106.802, 2986.394]
+    # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]]
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # y_true * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]]
+    # xent = [0.00045, 7.00182]
+    # Reduced xent = (0.00045 + 7.00182) / 2
+
+    self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3)
+
+  def test_weighted(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = scce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -0.0513, -16.1181],
+    #                      [-2.3026, -0.2231, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # sum(exp(logits), axis=-1) = [1, 1]
+    # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # log(softmax) = [[-2.9957, -0.0513, -16.1181],
+    #                 [-2.3026, -0.2231, -2.3026]]
+    # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Weighted xent = [0.051 * 1.5, 2.302 * 2.]
+    # Reduced xent = (0.051 * 1.5 + 2.302 * 2.) / 3.5
+
+    self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = scce_obj(y_true, logits, sample_weight=sample_weight)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y_true * log(softmax), 1)
+    # xent = [0.00045, 7.00182]
+    # weighted xent = [0.000675, 14.00364]
+    # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3)
+
+  def test_axis(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(axis=0)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.1], [0.95, 0.8], [0, 0.1]])
+    result = scce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -2.3026],
+    #                      [-0.0513, -0.2231],
+    #                      [-16.1181, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 0], [1, 0], [0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # sum(exp(logits)) = [1, 1]
+    # softmax = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # log(softmax) = [[-2.9957, -2.3026],
+    #                 [-0.0513, -0.2231],
+    #                 [-16.1181, -2.3026]]
+    # y * log(softmax) = [[0, 0], [-0.0513, 0], [0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Reduced xent = (0.0513 + 2.3026) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+
+class BinaryTruePositives(metrics.Metric):
+
+  def __init__(self, name='binary_true_positives', **kwargs):
+    super(BinaryTruePositives, self).__init__(name=name, **kwargs)
+    self.true_positives = self.add_weight(name='tp', initializer='zeros')
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    y_true = math_ops.cast(y_true, dtypes.bool)
+    y_pred = math_ops.cast(y_pred, dtypes.bool)
+
+    values = math_ops.logical_and(
+        math_ops.equal(y_true, True), math_ops.equal(y_pred, True))
+    values = math_ops.cast(values, self.dtype)
+    if sample_weight is not None:
+      sample_weight = math_ops.cast(sample_weight, dtype=self.dtype)
+      sample_weight = weights_broadcast_ops.broadcast_weights(
+          sample_weight, values)
+      values = math_ops.multiply(values, sample_weight)
+    self.true_positives.assign_add(math_ops.reduce_sum(values))
+
+  def result(self):
+    return self.true_positives
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CustomMetricsTest(test.TestCase):
+
+  def test_config(self):
+    btp_obj = BinaryTruePositives(name='btp', dtype=dtypes.int32)
+    self.assertEqual(btp_obj.name, 'btp')
+    self.assertEqual(btp_obj.dtype, dtypes.int32)
+
+    # Check save and restore config
+    btp_obj2 = BinaryTruePositives.from_config(btp_obj.get_config())
+    self.assertEqual(btp_obj2.name, 'btp')
+    self.assertEqual(btp_obj2.dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    btp_obj = BinaryTruePositives()
+    self.evaluate(variables.variables_initializer(btp_obj.variables))
+    y_true = constant_op.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1],
+                                   [1, 1, 1, 1, 0], [0, 0, 0, 0, 1.5]])
+    y_pred = constant_op.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1],
+                                   [0, 1, 0, 1, 0], [1, 10, 1, 1, 1]])
+
+    update_op = btp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = btp_obj.result()
+    self.assertEqual(7, self.evaluate(result))
+
+  def test_weighted(self):
+    btp_obj = BinaryTruePositives()
+    self.evaluate(variables.variables_initializer(btp_obj.variables))
+    y_true = constant_op.constant([[0, 0.9, 0, 1, 0], [0, 0, 1, 1, 1],
+                                   [1, 1, 1, 1, 0], [0, 0, 0, 0, 1.5]])
+    y_pred = constant_op.constant([[0, 0, 1, 5, 0], [1, 1, 1, 1, 1],
+                                   [0, 1, 0, 1, 0], [1, 10, 1, 1, 1]])
+    sample_weight = constant_op.constant([[1.], [1.5], [2.], [2.5]])
+    result = btp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertEqual(12, self.evaluate(result))
+
+
 def _get_model(compile_metrics):
   model_layers = [
       layers.Dense(3, activation='relu', kernel_initializer='ones'),
diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4ff08d662f53c18a94abdde782919fa3d20c0b18
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD
@@ -0,0 +1,136 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Contains the Keras Mixed Precision API (TensorFlow version).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "mixed_precision_experimental",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loss_scale_optimizer",
+        ":policy",
+    ],
+)
+
+py_library(
+    name = "policy",
+    srcs = [
+        "policy.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+    ],
+)
+
+py_test(
+    name = "policy_test",
+    size = "medium",
+    srcs = [
+        "policy_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":policy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_library(
+    name = "autocast_variable",
+    srcs = [
+        "autocast_variable.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/distribute:values",
+    ],
+)
+
+py_test(
+    name = "autocast_variable_test",
+    size = "medium",
+    srcs = ["autocast_variable_test.py"],
+    deps = [
+        ":autocast_variable",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "loss_scale_optimizer",
+    srcs = ["loss_scale_optimizer.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/keras/optimizer_v2",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "loss_scale_optimizer_test",
+    size = "medium",
+    srcs = ["loss_scale_optimizer_test.py"],
+    deps = [
+        ":loss_scale_optimizer",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:one_device_strategy",
+        "//tensorflow/python/keras",
+    ],
+)
+
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+    ],
+)
+
+py_test(
+    name = "keras_test",
+    size = "medium",
+    srcs = ["keras_test.py"],
+    deps = [
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:one_device_strategy",
+        "//tensorflow/python/keras",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/__init__.py b/tensorflow/python/keras/mixed_precision/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0854602f362b0fad71d63b729a7ea7bc391ca050
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mixed precision API."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.mixed_precision.experimental.loss_scale_optimizer import LossScaleOptimizer
+from tensorflow.python.keras.mixed_precision.experimental.policy import global_policy
+from tensorflow.python.keras.mixed_precision.experimental.policy import Policy
+from tensorflow.python.keras.mixed_precision.experimental.policy import set_policy
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64b5178316009354c6adecc8213bf7681504e6f
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -0,0 +1,178 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains AutoCastVariable, a variable which automatically casts itself."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import values as distribute_values
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+
+
+# TODO(reedwm): Make checkpointable?
+class AutoCastVariable(object):
+  """Variable that will cast itself to a different dtype in applicable contexts.
+
+  This class wraps a floating-point tf.Variable. It emulates the variable
+  interface and delegates to the wrapped variable, but it additionally will cast
+  the wrapped variable under a `Graph._enable_variable_auto_cast(dtype)` context
+  manager.
+
+  For example:
+
+  ```
+  v = tf.Variable(1.0, dtype=tf.float32)
+  v = AutoCastVariable(v)
+  print(tf.identity(v).dtype)  # tf.float32
+  with ops.get_default_graph()._enable_variable_auto_cast(tf.float16):
+    print(tf.identity(v).dtype)  # tf.float16, as v will cast itself to float16
+    print(v.dtype)  # tf.float16, as v.dtype also changes under the ctx manager.
+  ```
+
+  The purpose of this class is to allow Keras layers to create variables in
+  float32, and automatically cast them to float16 or bfloat16 when the layer is
+  called.
+  """
+
+  def __init__(self, variable):
+    """Creates an AutoCastVariable instance.
+
+    Args:
+      variable: A floating-point resource variable to wrap.
+
+    Raises:
+      ValueError: If `variable` is not a floating-point resource variable
+    """
+    if not resource_variable_ops.is_resource_variable(variable):
+      raise ValueError('variable must be of type tf.ResourceVariable, but got: '
+                       '%s' % variable)
+    if not variable.dtype.is_floating:
+      raise ValueError('variable must be a floating point variable but has '
+                       'type: %s' % variable.dtype.name)
+    self._variable = variable
+
+  @property
+  def name(self):
+    return self._variable.name
+
+  def _should_cast(self):
+    """Returns True if this variable should be casted when accessed."""
+    g = ops.get_default_graph()
+    # pylint:disable=protected-access
+    return (g._auto_cast_variable_read_dtype is not None and
+            self.true_dtype != g._auto_cast_variable_read_dtype)
+    # pylint:enable=protected-access
+
+  @property
+  def dtype(self):
+    """The dtype this variable will be casted to when read."""
+    if self._should_cast():
+      return ops.get_default_graph()._auto_cast_variable_read_dtype  # pylint:disable=protected-access
+    else:
+      return self._variable.dtype
+
+  @property
+  def true_dtype(self):
+    """The dtype of the underlying variable, before any casts are done."""
+    return self._variable.dtype
+
+  def value(self):
+    val = self._variable.value()
+    if not self._should_cast():
+      return val
+    # We colocate_with(None) to ignore the existing device constraints, so that
+    # the cast is always done on the variable's device
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def read_value(self):
+    val = self._variable.read_value()
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def sparse_read(self, indices, name=None):
+    """Reads the value of this variable sparsely, using `gather`."""
+    val = self._variable.sparse_read(indices, name=name)
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def assign(self, value, use_locking=None, name=None, read_value=True):
+    return self._variable.assign(
+        value, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_add(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_add(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_sub(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_sub(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  # TODO(reedwm): Support assigning variables with tf.assign(), var.scatter_add,
+  # etc.
+
+  def __getattr__(self, name):
+    return getattr(self._variable, name)
+
+  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
+    """Converts this variable to a tensor."""
+    if not self._should_cast():
+      return ops.internal_convert_to_tensor(self._variable, dtype, name,
+                                            as_ref)
+    # TODO(reedwm): Support as_ref?
+    assert not as_ref
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          'Incompatible type conversion requested to type {!r} for variable '
+          'of type {!r}'.format(dtype.name, self.dtype.name))
+    val = ops.internal_convert_to_tensor(self._variable,
+                                         self._variable.dtype, name,
+                                         as_ref=False)
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+  # TODO(reedwm): Define operator overloads.
+
+
+ops.register_tensor_conversion_function(
+    AutoCastVariable, AutoCastVariable._dense_var_to_tensor)  # pylint:disable=protected-access
+ops.register_dense_tensor_like_type(AutoCastVariable)
+
+
+# We have DistributedVariable subclass to pass
+# isinstance(..., DistributedVariable) checks when wrapping a
+# DistributedVariable.
+# TODO(reedwm): We should not wrap DistributedVariable, but instead have
+# DistributedVariable wrap AutoCastVariable. Subclassing DistributedVariable is
+# messy, because we do not fully implement the interface of DistributedVariable.
+class AutoCastDistributedVariable(AutoCastVariable,
+                                  distribute_values.DistributedVariable):
+  """Version of AutoCastVariable that subclasses DistributedVariable."""
+
+  def __init__(self, variable):
+    if not isinstance(variable, distribute_values.DistributedValues):
+      raise ValueError('variable must be of type DistributedValues, '
+                       'but got: %s' % variable)
+    super(AutoCastDistributedVariable, self).__init__(variable)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caec6a738709768b35aeab9bd18fe67e90982a9
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
@@ -0,0 +1,245 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AutoCastVariable."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.mixed_precision.experimental import autocast_variable
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+TESTCASES = ({
+    'testcase_name': 'base',
+    'distribute': False
+}, {
+    'testcase_name': 'distribute',
+    'distribute': True
+})
+
+
+def get_distribute_scope(distribute):
+
+  class DummyContextManager(object):
+
+    def __enter__(self):
+      pass
+
+    def __exit__(self, *args):
+      pass
+
+  if distribute:
+    return mirrored_strategy.MirroredStrategy(['cpu:0']).scope()
+  else:
+    return DummyContextManager()
+
+
+def get_autocast_var(var, distribute):
+  if distribute:
+    return autocast_variable.AutoCastDistributedVariable(var)
+  else:
+    return autocast_variable.AutoCastVariable(var)
+
+
+def get_var(val, dtype):
+  return variables.VariableV1(val, use_resource=True, dtype=dtype)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      self.assertEqual(x.dtype, dtypes.float32)
+      self.assertEqual(x.value().dtype, dtypes.float32)
+      self.assertEqual(x.read_value().dtype, dtypes.float32)
+      self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+      # within auto cast scope of different dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.value().dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float16)
+
+      # within auto cast scope of same dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float32):
+        self.assertEqual(x.dtype, dtypes.float32)
+        self.assertEqual(x.value().dtype, dtypes.float32)
+        self.assertEqual(x.read_value().dtype, dtypes.float32)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read_nested_scopes(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+        with ops.get_default_graph()._enable_auto_casting_variables(
+            dtypes.float32):
+          self.assertEqual(x.dtype, dtypes.float32)
+          self.assertEqual(x.read_value().dtype, dtypes.float32)
+
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_operator_overloads(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+    v1 = constant_op.constant(2., dtype=dtypes.float32)
+    v2 = constant_op.constant(2., dtype=dtypes.float16)
+
+    # Because autocast variables do not yet define operator overloads, the
+    # operator is defined by the non-variable tensor
+
+    # Test variable as the LHS. Currently, this is not supported with
+    # distributed autocast variables
+    if not distribute:
+      self.assertEqual(self.evaluate(x + v1), 3.)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(self.evaluate(x + v2), 3.)
+
+    # Test variable as the RHS
+    self.assertEqual(self.evaluate(v1 + x), 3.)
+
+    with ops.get_default_graph()._enable_auto_casting_variables(
+        dtypes.float16):
+      self.assertEqual(self.evaluate(v2 + x), 3.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(0., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      v1 = constant_op.constant(3.14, dtype=dtypes.float32)
+      v2 = constant_op.constant(3.14, dtype=dtypes.float16)
+
+      def run_and_check():
+        # Assign float32 values
+        self.assertAllClose(3.14, self.evaluate(x.assign(v1)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(v1)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(v1)))
+
+        # Attempt to assign float16 values
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_add(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_sub(v2))
+
+        # Assign Python floats
+        self.assertAllClose(3.14, self.evaluate(x.assign(3.14)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(3.14)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(3.14)))
+
+      run_and_check()
+      # reset x
+      self.evaluate(x.assign(0.))
+      # within auto cast scope.
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # assign still expect float32 value even if in float16 scope
+        run_and_check()
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign_stays_in_true_dtype(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+      # small_val is a value such that 1.0 + small_val == 1.0 in fp16, but not
+      # in fp32
+      small_val = np.finfo('float16').eps / 2
+      small_tensor = constant_op.constant(small_val, dtype=dtypes.float32)
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # Variable should be increased, despite it appearing to be the same
+        # float16 value.
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign(1. + small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+      self.evaluate(x.assign(1.))
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign_add(small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_invalid_wrapped_variable(self, distribute):
+    with get_distribute_scope(distribute):
+      # Wrap a non-variable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = constant_op.constant([1.], dtype=dtypes.float32)
+        get_autocast_var(x, distribute)
+
+      # Wrap a non-floating point variable
+      with self.assertRaisesRegexp(ValueError,
+                                   'variable must be a floating point'):
+        x = get_var(1, dtypes.int32)
+        get_autocast_var(x, distribute)
+
+    if distribute:
+      # Wrap a non-distributed variable with AutoCastDistributedVariable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = get_var(1., dtypes.float32)
+        get_autocast_var(x, distribute)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..34f71f719c872f30f7f7ceb91cb244a25dedf09b
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py
@@ -0,0 +1,382 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests mixed precision works correctly with Keras layers and models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import models
+from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer
+from tensorflow.python.keras.mixed_precision.experimental import policy
+from tensorflow.python.keras.mixed_precision.experimental import test_util as mp_test_util
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class AssertTypeLayer(base_layer.Layer):
+  """A layer which asserts it's inputs are a certain type."""
+
+  def __init__(self, assert_type=None, **kwargs):
+    self._assert_type = assert_type
+    super(AssertTypeLayer, self).__init__(**kwargs)
+
+  def assert_input_types(self, inputs):
+    """Asserts `inputs` are of the correct type. Should be called in call()."""
+    if self._assert_type:
+      inputs_flattened = nest.flatten(inputs)
+      for inp in inputs_flattened:
+        assert inp.dtype.base_dtype == self._assert_type, (
+            'Input tensor has type %s which does not match assert type %s' %
+            (inp.dtype.name, self._assert_type.name))
+
+
+class AddLayer(AssertTypeLayer):
+  """A layer which adds it's input to a scalar variable."""
+
+  def __init__(self, regularizer=None, use_operator=False, **kwargs):
+    """Initializes the AddLayer.
+
+    Args:
+      regularizer: The regularizer on the scalar variable.
+      use_operator: If True, add using the + operator. If False, add using
+        tf.add.
+      **kwargs: Passed to AssertTypeLayer constructor.
+    """
+    self._regularizer = regularizer
+    self._use_operator = use_operator
+    super(AddLayer, self).__init__(**kwargs)
+
+  def build(self, _):
+    self.v = self.add_weight('v', (), initializer='ones',
+                             regularizer=self._regularizer)
+    self.built = True
+
+  def call(self, inputs):
+    self.assert_input_types(inputs)
+    assert inputs.dtype == self.v.dtype
+    return self._add(inputs, self.v)
+
+  def _add(self, x, y):
+    if self._use_operator:
+      return x + y
+    else:
+      return math_ops.add(x, y)
+
+
+class AddLayerWithoutAutoCast(AddLayer):
+  """Same as AddLayer, but does not use AutoCastVariables."""
+
+  def build(self, _):
+    dtype = self.dtype
+    if dtype in ('float16', 'bfloat16'):
+      dtype = 'float32'
+    self.v = self.add_weight('v', (), initializer='ones', dtype=dtype,
+                             experimental_autocast=False,
+                             regularizer=self._regularizer)
+    self.built = True
+
+  def call(self, inputs):
+    self.assert_input_types(inputs)
+    assert self.v.dtype in (dtypes.float32, dtypes.float64)
+    return self._add(inputs, math_ops.cast(self.v, inputs.dtype))
+
+
+class IdentityRegularizer(regularizers.Regularizer):
+
+  def __call__(self, x):
+    assert x.dtype == dtypes.float32
+    return array_ops.identity(x)
+
+
+def create_one_device_strategy():
+  return one_device_strategy.OneDeviceStrategy('cpu:0')
+
+
+def create_mirrored_strategy():
+  if context.num_gpus() >= 1:
+    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
+  else:
+    return mirrored_strategy.MirroredStrategy(['cpu:0'])
+
+
+TESTCASES = ({
+    'testcase_name': 'base',
+    'strategy_fn': create_one_device_strategy
+}, {
+    'testcase_name': 'distribute',
+    'strategy_fn': create_mirrored_strategy
+})
+
+
+class KerasLayerTest(test.TestCase, parameterized.TestCase):
+  """Test mixed precision with Keras layers."""
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_variables_in_float32(self, strategy_fn):
+    x = constant_op.constant([1.], dtype=dtypes.float16)
+    with strategy_fn().scope():
+      with policy.policy_scope('infer_float32_vars'):
+        layer = AddLayer(assert_type=dtypes.float16)
+        y = layer(x)
+        self.assertEqual(layer.v.dtype, dtypes.float32)
+        self.assertEqual(y.dtype, dtypes.float16)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(y), 2.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_layer_with_non_autocast_variable(self, strategy_fn):
+    x = constant_op.constant([1.], dtype=dtypes.float16)
+    with strategy_fn().scope():
+      with policy.policy_scope('infer_float32_vars'):
+        layer = AddLayerWithoutAutoCast(assert_type=dtypes.float16)
+        y = layer(x)
+        self.assertEqual(layer.v.dtype, dtypes.float32)
+        self.assertEqual(y.dtype, dtypes.float16)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(y), 2.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_layer_regularizer_runs_in_float32(self, strategy_fn):
+    x = constant_op.constant([1.], dtype=dtypes.float16)
+    with strategy_fn().scope():
+      with policy.policy_scope('infer_float32_vars'):
+        # Test on AddLayer
+        layer = AddLayer(assert_type=dtypes.float16,
+                         regularizer=IdentityRegularizer())
+        layer(x)
+        (regularizer_loss,) = layer.losses
+        self.assertEqual(regularizer_loss.dtype, dtypes.float32)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(regularizer_loss), 1.)
+
+        # Test on AddLayerWithoutAutoCast
+        layer = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
+                                        regularizer=IdentityRegularizer())
+        layer(x)
+        (regularizer_loss,) = layer.losses
+        self.assertEqual(regularizer_loss.dtype, dtypes.float32)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(regularizer_loss), 1.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_passing_policy_to_layer(self, strategy_fn):
+    x = constant_op.constant([1.], dtype=dtypes.float16)
+    with strategy_fn().scope():
+      # Passing a Policy to 'dtype' sets the policy for that layer.
+      layer = AddLayer(assert_type=dtypes.float16,
+                       dtype=policy.Policy('infer_float32_vars'))
+      # layer.dtype refers to the variable dtype
+      self.assertEqual(layer.dtype, dtypes.float32)
+      layer(x)
+      self.assertEqual(layer.v.dtype, dtypes.float32)
+      with policy.policy_scope('infer_float32_vars'):
+        # Passing a Policy to dtype overrides the global Policy
+        layer = AddLayer(assert_type=dtypes.float16,
+                         dtype=policy.Policy('infer'))
+        # layer dtype is not yet known
+        self.assertEqual(layer.dtype, None)
+        layer(x)
+        self.assertEqual(layer.v.dtype, dtypes.float16)
+        self.assertEqual(layer.dtype, dtypes.float16)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_gradient(self, strategy_fn):
+    x = constant_op.constant([1.], dtype=dtypes.float16)
+    with strategy_fn().scope() as strategy:
+      with policy.policy_scope('infer_float32_vars'):
+        layer = AddLayer(assert_type=dtypes.float16)
+        def run_fn():
+          with backprop.GradientTape() as tape:
+            y = layer(x)
+            # Divide by num_replicas_in_sync, as the effective total loss is the
+            # sum of each of the replica's losses.
+            y /= strategy.num_replicas_in_sync
+
+          # Learning rate is small enough that if applied to a float16 variable,
+          # the variable will not change. So this tests the learning rate is not
+          # applied to a float16 value, but instead the float32 variable.
+          opt = gradient_descent.SGD(2 ** -14)
+          grad = tape.gradient(y, layer.v)
+          return opt.apply_gradients([(grad, layer.v)])
+
+        op = strategy.experimental_run(run_fn)
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          self.evaluate(op)
+        # The gradient with respective to the variable is 1. Since the
+        # variable is initialized with 1 and the learning rate is 2**-14, the
+        # new variable value should be: init_val - gradient * learning_rate,
+        # which is  1 - 1 * 2**-14
+        self.assertEqual(self.evaluate(layer.v), 1 - 2 ** -14)
+
+
+class KerasModelTest(test.TestCase, parameterized.TestCase):
+  """Test mixed precision with Keras models."""
+
+  @parameterized.named_parameters({
+      'testcase_name': 'base',
+      'strategy_fn': create_one_device_strategy,
+  }, {
+      'testcase_name': 'distribute',
+      'strategy_fn': create_mirrored_strategy,
+  }, {
+      'testcase_name': 'operator',
+      'strategy_fn': create_mirrored_strategy,
+      'use_operator': True
+  }, {
+      'testcase_name': 'regularizer',
+      'strategy_fn': create_mirrored_strategy,
+      'use_regularizer': True
+  })
+  @test_util.run_in_graph_and_eager_modes
+  def test_model(self, strategy_fn, use_operator=False, use_regularizer=False):
+    regularizer = IdentityRegularizer() if use_regularizer else None
+    with strategy_fn().scope():
+      with policy.policy_scope('infer_float32_vars'):
+        x = layers.Input(shape=(), batch_size=2, dtype=dtypes.float16)
+        layer = AddLayer(assert_type=dtypes.float16, use_operator=use_operator,
+                         regularizer=regularizer)
+        y = layer(x)
+        y = math_ops.cast(y, dtypes.float32)
+        model = models.Model(inputs=x, outputs=y)
+
+        def loss_fn(y_true, y_pred):
+          del y_true
+          return math_ops.reduce_mean(y_pred)
+
+        # Learning rate is small enough that if applied to a float16 variable,
+        # the variable will not change. So this tests the learning rate not
+        # applied to a float16 value, but instead the float32 variable.
+        opt = gradient_descent.SGD(2 ** -14)
+        model.compile(opt, loss=loss_fn)
+
+      self.assertEqual(backend.eval(layer.v), 1)
+      x = np.ones((2, 1))
+      y = np.ones((2, 1))
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
+      model.fit(dataset)
+      # Variable starts at 1, and should have gradient of 2 ** -14 subtracted
+      # from it.
+      expected = 1 - 2 ** -14
+      if use_regularizer:
+        # Regularizer adds another 2 ** -14 to the gradient.
+        expected -= 2 ** -14
+      self.assertEqual(backend.eval(layer.v), expected)
+
+  @parameterized.named_parameters({
+      'testcase_name': 'base',
+      'strategy_fn': create_one_device_strategy,
+  }, {
+      'testcase_name': 'distribute',
+      'strategy_fn': create_mirrored_strategy,
+  }, {
+      'testcase_name': 'loss_scaling',
+      'strategy_fn': create_mirrored_strategy,
+      'use_loss_scaling': True
+  })
+  @test_util.run_in_graph_and_eager_modes
+  def test_advanced_model(self, strategy_fn, use_loss_scaling=False):
+
+    # The advanced model tests mixed-precision-related features that would occur
+    # in a resnet50 model. It tests a model that has:
+    #  * Multiple layers, some which use auto-cast variables and some which do
+    #    not
+    #  * Regularization on some variables and not others.
+    #  * Loss scaling (if use_loss_scaling is True)
+
+    strategy = strategy_fn()
+    if use_loss_scaling:
+      loss_scale = 8.
+    learning_rate = 2 ** -14
+
+    with strategy.scope():
+      with policy.policy_scope(policy.Policy('infer_float32_vars')):
+        x = layers.Input(shape=(), batch_size=2, dtype=dtypes.float16)
+        layer1 = AddLayer(assert_type=dtypes.float16,
+                          regularizer=IdentityRegularizer(), use_operator=True)
+        layer2 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
+                                         use_operator=True)
+        layer3 = AddLayer(assert_type=dtypes.float16, use_operator=False)
+        layer4 = AddLayerWithoutAutoCast(assert_type=dtypes.float16,
+                                         regularizer=IdentityRegularizer(),
+                                         use_operator=False)
+        y = layer1(x)
+        y = layer2(y)
+        y = layer3(y)
+        y = layer4(y)
+        if use_loss_scaling:
+          # The gradient of 'y' at this point is 1. With loss scaling, the
+          # gradient is 'loss_scale'. The DistributionStrategy additionally
+          # scales the gradient by 1/num_replicas in_sync. We divide by the
+          # batch size of 2 since the loss is averaged across batch elements.
+          expected_gradient = loss_scale / strategy.num_replicas_in_sync / 2
+          identity_with_grad_check_fn = (
+              mp_test_util.create_identity_with_grad_check_fn(
+                  expected_dtype=dtypes.float16,
+                  expected_gradient=[expected_gradient] * 2))
+          y = core.Lambda(identity_with_grad_check_fn)(y)
+        y = math_ops.cast(y, dtypes.float32)
+        model = models.Model(inputs=x, outputs=y)
+
+        def loss_fn(y_true, y_pred):
+          self.assertEqual(y_true.dtype, dtypes.float32)
+          self.assertEqual(y_pred.dtype, dtypes.float32)
+          return math_ops.reduce_mean(y_pred)
+
+        opt = gradient_descent.SGD(learning_rate)
+        if use_loss_scaling:
+          opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
+        model.compile(opt, loss=loss_fn)
+
+      x = np.ones((2, 1))
+      y = np.ones((2, 1))
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2)
+      model.fit(dataset)
+      for layer in (layer1, layer2, layer3, layer4):
+        if layer.losses:
+          # Layer has weight regularizer
+          self.assertEqual(backend.eval(layer.v), 1 - 2 * learning_rate)
+        else:
+          # Layer does not have weight regularizer
+          self.assertEqual(backend.eval(layer.v), 1 - learning_rate)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5f5259dbc3467e6692b02bc70bc3540bcd17d3e
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
@@ -0,0 +1,128 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the loss scaling optimizer class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.mixed_precision.experimental.LossScaleOptimizer')
+class LossScaleOptimizer(optimizer_v2.OptimizerV2):
+  """An optimizer that applies loss scaling.
+
+  Loss scaling is a process that multiplies the loss by a multiplier called the
+  loss scale, and divides each gradient by the same multiplier. The pseudocode
+  for this process is:
+
+  ```
+  loss = ...
+  loss *= loss_scale
+  grads = gradients(loss, vars)
+  grads /= loss_scale
+  ```
+
+  Mathematically, loss scaling has no effect, but can help avoid numerical
+  underflow in intermediate gradients when float16 tensors are used. By
+  multiplying the loss, each intermediate gradient will have the same multiplier
+  applied.
+
+  This optimizer wraps another optimizer and applies loss scaling to it. Loss
+  scaling is applied whenever gradients are computed, either through
+  `minimize()` or `get_gradients()`.
+  """
+
+  def __init__(self, opt, loss_scale):
+    """Initializes this loss scale optimizer.
+
+    Args:
+      opt: The Optimizer instance to wrap.
+      loss_scale: A float loss scale to scale loss and gradients by
+    """
+    if not isinstance(opt, optimizer_v2.OptimizerV2):
+      raise ValueError('"opt" must be an instance of OptimizerV2, but got: %s'
+                       % opt)
+    if hasattr(opt, 'clipnorm'):
+      raise ValueError('LossScaleOptimizer does not support wrapping '
+                       'optimizers with a clipnorm. Optimizer %s has clipnorm '
+                       '%s' % (opt, opt.clipnorm))
+
+    if hasattr(opt, 'clipvalue'):
+      raise ValueError('LossScaleOptimizer does not support wrapping '
+                       'optimizers with a clipvalue. Optimizer %s has '
+                       'clipvalue %s' % (opt, opt.clipvalue))
+
+    self._optimizer = opt
+    self._loss_scale = float(loss_scale)
+
+  def _compute_gradients(self, loss, var_list, grad_loss=None):
+    loss = self._scale_loss(loss)
+    grads_and_vars = self._optimizer._compute_gradients(loss, var_list,  # pylint: disable=protected-access
+                                                        grad_loss)
+    grads = [g for g, _ in grads_and_vars]
+    variables = [v for _, v in grads_and_vars]
+    scaled_grads = self._scale_grads(grads)
+    return list(zip(scaled_grads, variables))
+
+  def get_gradients(self, loss, params):
+    loss = self._scale_loss(loss)
+    grads = self._optimizer.get_gradients(loss, params)
+    return self._scale_grads(grads)
+
+  def _scale_loss(self, loss):
+    # The loss is callable for `_compute_gradients`, but not `get_gradients`.
+    if callable(loss):
+      return lambda: loss() * self._loss_scale
+    else:
+      return loss * self._loss_scale
+
+  def _scale_grads(self, grads):
+    loss_scale_reciprocal = 1 / self._loss_scale
+    return [None if g is None else g * loss_scale_reciprocal for g in grads]
+
+  def apply_gradients(self, grads_and_vars, name=None):
+    return self._optimizer.apply_gradients(grads_and_vars, name)
+
+  @property
+  def learning_rate(self):
+    return self._optimizer.learning_rate
+
+  @learning_rate.setter
+  def learning_rate(self, lr):
+    self._optimizer.learning_rate = lr
+
+  # TODO(reedwm): Support dynamic loss scaling.
+
+  # TODO(reedwm): Maybe merge this class's functionality into OptimizerV2.
+
+  # TODO(reedwm): Maybe throw an error if mixed precision is used without this
+  # optimizer being used.
+
+  # TODO(reedwm): Define __getattr__ to delegate all methods/attributes to
+  # self._optimizer. This is tricky because the super class overrides
+  # __getattribute__.
+
+  # TODO(reedwm): Implement get_config and from_config. This will first require
+  # implementing deserialization support for OptimizerV2.
+  def get_config(self):
+    raise NotImplementedError('get_config() is not yet implemented for '
+                              'LossScaleOptimizers')
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    raise NotImplementedError('from_config() is not yet implemented for '
+                              'LossScaleOptimizers')
diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..021a0d5a0a0c271658b47a9332d26e3ca15950d8
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py
@@ -0,0 +1,123 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LossScaleOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer
+from tensorflow.python.keras.mixed_precision.experimental import test_util as mp_test_util
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def create_one_device_strategy():
+  return one_device_strategy.OneDeviceStrategy('cpu:0')
+
+
+def create_mirrored_strategy():
+  if context.num_gpus() >= 1:
+    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
+  else:
+    return mirrored_strategy.MirroredStrategy(['cpu:0'])
+
+
+TESTCASES = ({
+    'testcase_name': 'Base',
+    'strategy_fn': create_one_device_strategy
+}, {
+    'testcase_name': 'Distribute',
+    'strategy_fn': create_mirrored_strategy
+})
+
+
+class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase):
+
+  def _run_if_in_graph_mode(self, val):
+    # Running only in graph mode is useful, because optimizers sometimes return
+    # a value that, in Graph mode, is runnable with self.evaluate. But in Eager
+    # mode, the optimizer already does the computations and the return value
+    # cannot be run.
+    if not context.executing_eagerly():
+      self.evaluate(val)
+
+  def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad):
+    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
+        expected_grad)
+    loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync
+    return lambda: opt.minimize(loss, var_list=[var])
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testLossScaleAppliedToLossWithMinimize(self, strategy_fn):
+    with strategy_fn().scope() as strategy:
+      var = variables.Variable([5.0])
+      opt = gradient_descent.SGD(2.0)
+      loss_scale = 10.
+      opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
+      # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale
+      # / strategy.num_replicas_in_sync will not be exact, which could lead to
+      # assertion failures due to rounding issues.
+      self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0)
+      run_fn = self._run_fn_with_grad_check(
+          strategy, var, opt, loss_scale / strategy.num_replicas_in_sync)
+      run_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self._run_if_in_graph_mode(run_op)
+      # The loss is the identity of the variable. Therefore the gradient is 1,
+      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
+      self.assertAllClose([3.], self.evaluate(var))
+
+  @test_util.deprecated_graph_mode_only
+  def testLossScaleAppliedToLossWithGetGradientsTest(self):
+    var = variables.Variable([2.0])
+    opt = gradient_descent.SGD(1.0)
+    loss_scale = 10.
+    opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)
+    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(loss_scale)
+    loss = grad_check_fn(var)
+    run_op = opt.get_gradients(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    # This will cause an assertion to run, as
+    # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
+    self.evaluate(run_op)
+
+  def testInvalidConstructorArguments(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'must be an instance of OptimizerV2'):
+      loss_scale_optimizer.LossScaleOptimizer(optimizers.SGD(), 10.)
+
+    with self.assertRaisesRegexp(ValueError, 'does not support wrapping '
+                                             'optimizers with a clipnorm'):
+      loss_scale_optimizer.LossScaleOptimizer(
+          gradient_descent.SGD(1.0, clipnorm=1.0), 10.)
+
+    with self.assertRaisesRegexp(ValueError, 'does not support wrapping '
+                                             'optimizers with a clipvalue'):
+      loss_scale_optimizer.LossScaleOptimizer(
+          gradient_descent.SGD(1.0, clipvalue=1.0), 10.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/mixed_precision/experimental/policy.py b/tensorflow/python/keras/mixed_precision/experimental/policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8e9b6219eb7758248c669a0892dec608367e9b8
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/policy.py
@@ -0,0 +1,163 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the Policy class for mixed precision training."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.mixed_precision.experimental.Policy')
+class Policy(object):
+  """A mixed precision policy for a Keras layer.
+
+  A mixed precision policy determines the floating-point dtype that Keras layers
+  should create variables in. For non-default policies, if the variable dtype
+  does not match the input dtype, variables will automatically be casted to the
+  input dtype to avoid type errors. Policies can be passed to the 'dtype'
+  argument of layer constructors, or a global policy can be set with
+  'set_policy'.
+
+  In the near future, policies will also determine the computation dtype of
+  layers, as well as the loss scaling algorithm.
+
+  Policies are intended to enable mixed precision training, which require using
+  float32 variables and [b]float16 computations for most layers. The term "mixed
+  precision" refers to the use of both float16 (or bfloat16) and float32 in a
+  model. See https://arxiv.org/abs/1710.03740 for more information on mixed
+  precision training.
+
+  Policies are constructed by passing a string to the `name` constructor
+  argument. `name` determines the behavior of the policy. Currently, `name` can
+  be one of the following values.
+
+    * 'infer': Infer the variable and computation dtypes from the input dtype.
+      This is the default behavior.
+    * 'infer_float32_vars': Infer the computation dtypes from the input
+      dtype, but create variables in float32. Variables will be casted to the
+      computation dtype. This is intended to enable mixed precision. Users can
+      cast tensors to float16 before passing them to a layer, which causes the
+      layer to run it's computation in float16 while keeping variables in
+      float32.
+
+  To use mixed precision in a model, the 'infer_float32_vars' policy can be used
+  alongside float16 input tensors, which results in float16 computations and
+  float32 variables. For example:
+
+  ```python
+  tf.keras.mixed_precision.experimental.set_policy('infer_float32_vars')
+  model = tf.keras.models.Sequential(
+      tf.keras.layers.Input((100,), dtype='float16'),
+      tf.keras.layers.Dense(10),
+      tf.keras.layers.Dense(10),
+      tf.keras.layers.Lambda(lambda x: tf.cast(x, 'float32')),
+      tf.keras.layers.Activation('Softmax')
+  )
+  ```
+
+  Alternatively, the policy can be passed to individual layers instead of
+  setting the global policy with `set_policy`:
+
+  ```python
+  policy = tf.keras.mixed_precision.experimental.Policy('infer_float32_vars')
+  model = tf.keras.models.Sequential(
+      tf.keras.layers.Input((100,), dtype='float16'),
+      tf.keras.layers.Dense(10, dtype=policy),
+      tf.keras.layers.Dense(10, dtype=policy),
+      tf.keras.layers.Lambda(lambda x: tf.cast(x, 'float32')),
+      tf.keras.layers.Activation('Softmax')
+  )
+  ```
+
+  Note that a LossScaleOptimizer should also be used for mixed precision models
+  to avoid numerical underflow. See `LossScaleOptimizer`.
+  """
+
+  def __init__(self, name):
+    self._name = name
+    if name == 'infer':
+      self._default_variable_dtype = None
+    elif name == 'infer_float32_vars':
+      self._default_variable_dtype = 'float32'
+    else:
+      raise ValueError('"name" argument to Policy constructor must be "infer" '
+                       'or "infer_float32_vars", but got: %s' % name)
+
+  @property
+  def name(self):
+    """Returns the name of the policy: "infer" or "infer_float32_vars."""
+    return self._name
+
+  @property
+  def default_variable_dtype(self):
+    """Returns the default variable dtype of this policy.
+
+    This is the dtype layers will create their variables in, unless a layer
+    explicit chooses a different dtype. Layers will cast variables to the
+    appropriate dtype to avoid type errors.
+
+    Returns:
+      The default variable dtype of this policy, or None if the default variable
+      dtype should be derived from the inputs.
+    """
+    return self._default_variable_dtype
+
+  @property
+  def should_cast_variables(self):
+    """Returns true if variables should be casted."""
+    return self.default_variable_dtype is not None
+
+  # TODO(reedwm): Implement get_config/from_config.
+
+
+# TODO(reedwm): Make this thread local?
+_global_policy = Policy('infer')
+
+
+@keras_export('keras.mixed_precision.experimental.global_policy')
+def global_policy():
+  """Returns the global Policy.
+
+  The global policy is the default policy used for layers, if no policy is
+  passed to the layer constructor. When TensorFlow starts, the global policy is
+  set to an "infer" policy, and can be changed with `set_policy`.
+
+  Returns:
+    The global Policy.
+  """
+  return _global_policy
+
+
+@keras_export('keras.mixed_precision.experimental.set_policy')
+def set_policy(policy):
+  """Sets the global Policy."""
+  global _global_policy
+  if not isinstance(policy, Policy):
+    policy = Policy(policy)
+  _global_policy = policy
+
+
+# TODO(reedwm): Make this thread local
+@contextlib.contextmanager
+def policy_scope(policy):
+  old_policy = _global_policy
+  try:
+    set_policy(policy)
+    yield
+  finally:
+    set_policy(old_policy)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/policy_test.py b/tensorflow/python/keras/mixed_precision/experimental/policy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..278f52110449cbaef95e5acdef8d19b1a955ca2e
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/policy_test.py
@@ -0,0 +1,69 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests Policies."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.mixed_precision.experimental import policy as mp_policy
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PolicyTest(test.TestCase):
+  """Tests Policies."""
+
+  def test_infer(self):
+    policy = mp_policy.Policy('infer')
+    self.assertEqual(policy.name, 'infer')
+    self.assertEqual(policy.default_variable_dtype, None)
+
+  def test_infer_float32_vars(self):
+    policy = mp_policy.Policy('infer_float32_vars')
+    self.assertEqual(policy.name, 'infer_float32_vars')
+    self.assertEqual(policy.default_variable_dtype, 'float32')
+
+  def test_global_policy(self):
+    self.assertEqual(mp_policy.global_policy().name, 'infer')
+    default_policy = mp_policy.global_policy()
+    try:
+      mp_policy.set_policy('infer_float32_vars')
+      self.assertEqual(mp_policy.global_policy().name, 'infer_float32_vars')
+      self.assertEqual(mp_policy.global_policy().default_variable_dtype,
+                       'float32')
+      with ops.Graph().as_default():  # Policies are not associated with a graph
+        self.assertEqual(mp_policy.global_policy().name, 'infer_float32_vars')
+      mp_policy.set_policy('infer')
+      self.assertEqual(mp_policy.global_policy().name, 'infer')
+      self.assertEqual(mp_policy.global_policy().default_variable_dtype, None)
+      policy = mp_policy.Policy('infer_float32_vars')
+      mp_policy.set_policy(policy)
+      self.assertIs(mp_policy.global_policy(), policy)
+    finally:
+      mp_policy.set_policy(default_policy)
+
+  def test_policy_scope(self):
+    with mp_policy.policy_scope('infer_float32_vars'):
+      self.assertEqual(mp_policy.global_policy().name, 'infer_float32_vars')
+      with mp_policy.policy_scope('infer'):
+        self.assertEqual(mp_policy.global_policy().name, 'infer')
+      self.assertEqual(mp_policy.global_policy().name, 'infer_float32_vars')
+    self.assertEqual(mp_policy.global_policy().name, 'infer')
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/mixed_precision/experimental/test_util.py b/tensorflow/python/keras/mixed_precision/experimental/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a949af94f81afe09bf1bff34b469b3afe95cb560
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/test_util.py
@@ -0,0 +1,58 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains testing utilities related to mixed precision."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import custom_gradient
+
+
+def create_identity_with_grad_check_fn(expected_gradient, expected_dtype=None):
+  """Returns a function that asserts it's gradient has a certain value.
+
+  This serves as a hook to assert intermediate gradients have a certain value.
+  This returns an identity function. The identity's gradient function is also
+  the identity function, except it asserts that the gradient equals
+  `expected_gradient` and has dtype `expected_dtype`.
+
+  Args:
+    expected_gradient: The gradient function asserts that the gradient is this
+      value.
+    expected_dtype: The gradient function asserts the gradient has this dtype.
+
+  Returns:
+    An identity function whose gradient function asserts the gradient has a
+    certain value.
+  """
+  @custom_gradient.custom_gradient
+  def identity_with_grad_check(x):
+    """Function that asserts it's gradient has a certain value."""
+    x = array_ops.identity(x)
+    def grad(dx):
+      if expected_dtype:
+        assert dx.dtype == expected_dtype, (
+            'dx.dtype should be %s but is: %s' % (expected_dtype, dx.dtype))
+      expected_tensor = ops.convert_to_tensor(expected_gradient, dtype=dx.dtype)
+      assert_op = check_ops.assert_equal(dx, expected_tensor, data=[dx])
+      with ops.control_dependencies([assert_op]):
+        dx = array_ops.identity(dx)
+      return dx
+    return x, grad
+  return identity_with_grad_check
+
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 6d8ff9d847bafe8a6632741dd8ccb09295db3057..768b8e4dd3bb0ff280b12e8764650ca5e32c642b 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import data_structures
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -187,8 +187,8 @@ def get_nested_model_3(input_dim, num_classes):
   return keras.Model(inputs, outputs, name='nested_model_3')
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class ModelSubclassingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class ModelSubclassingTest(keras_parameterized.TestCase):
 
   def test_custom_build(self):
     class DummyModel(keras.Model):
@@ -210,6 +210,26 @@ class ModelSubclassingTest(test.TestCase):
     self.assertTrue(test_model.uses_custom_build, 'Model should use user '
                                                   'defined build when called.')
 
+  def test_custom_build_with_fit(self):
+
+    class DummyModel(keras.Model):
+
+      def __init__(self):
+        super(DummyModel, self).__init__()
+        self.layer1 = keras.layers.Dense(10, activation='relu')
+
+      def build(self, input_shape):
+        self.layer2 = keras.layers.Dense(1, activation='relu')
+
+      def call(self, inputs):
+        return self.layer2(self.layer1(inputs))
+
+    model = DummyModel()
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2, epochs=2)
+    self.assertLen(model.layers, 2)
+    self.assertLen(model.trainable_variables, 4)
+
   def test_invalid_input_shape_build(self):
     num_classes = 2
     input_dim = 50
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 9bc5aa2be5628d05b97ac58058f0183b5375b7d3..e4371c2a93db2a992ed461a20c9a382726aa24ab 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -88,7 +88,6 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
   tensor_map = {}  # Map {reference_tensor: corresponding_tensor}
   if input_tensors is None:
     # Create placeholders to build the model on top of.
-    input_layers = []
     input_tensors = []
     for layer in model._input_layers:
       input_tensor = Input(
@@ -100,10 +99,6 @@ def _clone_functional_model(model, input_tensors=None, share_weights=False):
       # Cache newly created input layer.
       newly_created_input_layer = input_tensor._keras_history[0]
       layer_map[layer] = newly_created_input_layer
-
-    for original_input_layer, cloned_input_layer in zip(model._input_layers,
-                                                        input_layers):
-      layer_map[original_input_layer] = cloned_input_layer
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
@@ -313,15 +308,15 @@ def _in_place_subclassed_model_reset(model):
       continue
     if isinstance(value, Layer):
       attributes_cache[name] = value
-      assert value in model._layers
-      if hasattr(value, '_layers') and value._layers:
+      assert value in model.layers
+      if hasattr(value, 'layers') and value.layers:
         raise ValueError('We do not support the use of nested layers '
                          'in `model_to_estimator` at this time. Found nested '
                          'layer: %s' % value)
     elif isinstance(
-        value,
-        (list, tuple)) and name not in ('layers', '_layers', 'metrics',
-                                        '_compile_stateful_metric_functions'):
+        value, (list, tuple)) and name not in ('layers', '_layers', 'metrics',
+                                               '_compile_metric_functions',
+                                               '_output_loss_metrics'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -367,8 +362,6 @@ def _in_place_subclassed_model_reset(model):
           'total_loss',
           'sample_weights',
           '_feed_sample_weights',
-          '_fit_function',
-          '_eval_function',
           'train_function',
           'test_function',
           'predict_function',
@@ -461,7 +454,7 @@ def clone_and_build_model(
       or functions.
     compile_clone: Boolean, whether to compile model clone (default `True`).
     in_place_reset: Boolean, whether to reset the model in place. Only used if
-      the model is not a graph network. If the model is a subclassed model, then
+      the model is a subclassed model. In the case of a subclassed model,
       this argument must be set to `True` (default `False`). To restore the
       original model, use the function
       `in_place_subclassed_model_state_restoration(model)`.
@@ -504,8 +497,8 @@ def clone_and_build_model(
   else:
     if not in_place_reset:
       raise ValueError(
-          'Model is not a graph network (usually means that it is a subclassed '
-          'model). The model cannot be cloned, but there is a workaround where '
+          'This model is a subclassed model. '
+          'Such a model cannot be cloned, but there is a workaround where '
           'the model is reset in-place. To use this, please set the argument '
           '`in_place_reset` to `True`. This will reset the attributes in the '
           'original model. To restore the attributes, call '
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 3eab10f624a4c36ba423e817b373fccf35ceeda6..f429aba498d90b3afc9d18925543c88b48c5ffd9 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -291,7 +291,10 @@ class CheckpointingTests(keras_parameterized.TestCase):
         optimizer=opt, loss='mse',
         run_eagerly=testing_utils.should_run_eagerly())
 
-    model.fit(x=np.array([[1., 2., 3., 4.]]), y=np.array([1.]), epochs=2)
+    model.fit(
+        x=np.array([[1., 2., 3., 4.]]),
+        y=np.array([[1., 1., 1., 1.]]),
+        epochs=2)
     save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
     beta1_power, _ = opt._get_beta_accumulators()
     self.evaluate(beta1_power.assign(12.))
diff --git a/tensorflow/python/keras/ops.py b/tensorflow/python/keras/ops.py
index bc14eef505853723dc494e0f8c6b764bf5d297d0..b2d852054448b4887fe9f9f28ad4f99e12ce7680 100644
--- a/tensorflow/python/keras/ops.py
+++ b/tensorflow/python/keras/ops.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import init_ops_v2
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -56,22 +55,33 @@ keras_export(v1=["keras.initializers.he_uniform"])(
 
 keras_export("keras.initializers.Initializer", v1=[])(
     init_ops_v2.Initializer)
-keras_export("keras.initializers.Zeros", v1=[])(
-    init_ops_v2.Zeros)
-keras_export("keras.initializers.Ones", v1=[])(
-    init_ops_v2.Ones)
-keras_export("keras.initializers.Constant", v1=[])(
-    init_ops_v2.Constant)
+keras_export(
+    "keras.initializers.Zeros", "keras.initializers.zeros", v1=[])(
+        init_ops_v2.Zeros)
+keras_export(
+    "keras.initializers.Ones", "keras.initializers.ones", v1=[])(
+        init_ops_v2.Ones)
+keras_export(
+    "keras.initializers.Constant", "keras.initializers.constant", v1=[])(
+        init_ops_v2.Constant)
 keras_export("keras.initializers.VarianceScaling", v1=[])(
     init_ops_v2.VarianceScaling)
-keras_export("keras.initializers.Orthogonal", v1=[])(
-    init_ops_v2.Orthogonal)
-keras_export("keras.initializers.Identity", v1=[])(
-    init_ops_v2.Identity)
-keras_export("keras.initializers.GlorotUniform", v1=[])(
-    init_ops_v2.GlorotUniform)
-keras_export("keras.initializers.GlorotNormal", v1=[])(
-    init_ops_v2.GlorotNormal)
+keras_export(
+    "keras.initializers.Orthogonal", "keras.initializers.orthogonal", v1=[])(
+        init_ops_v2.Orthogonal)
+keras_export(
+    "keras.initializers.Identity", "keras.initializers.identity", v1=[])(
+        init_ops_v2.Identity)
+keras_export(
+    "keras.initializers.GlorotUniform",
+    "keras.initializers.glorot_uniform",
+    v1=[])(
+        init_ops_v2.GlorotUniform)
+keras_export(
+    "keras.initializers.GlorotNormal",
+    "keras.initializers.glorot_normal",
+    v1=[])(
+        init_ops_v2.GlorotNormal)
 keras_export("keras.initializers.lecun_normal", v1=[])(
     init_ops_v2.lecun_normal)
 keras_export("keras.initializers.lecun_uniform", v1=[])(
@@ -90,6 +100,3 @@ keras_export("keras.initializers.TruncatedNormal", v1=[])(
 
 
 keras_export("keras.backend.name_scope")(ops.name_scope)
-
-keras_export("keras.losses.Reduction", v1=[])(
-    losses_impl.ReductionV2)
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index 45afe2a134cdfb5a6bae1f0c5be760433602f65b..e15cf80466ba8a32c0cfaa85b68b26ded982aece 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -25,6 +25,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":learning_rate_schedule",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
@@ -39,6 +40,21 @@ py_library(
     ],
 )
 
+py_library(
+    name = "learning_rate_schedule",
+    srcs = [
+        "learning_rate_schedule.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python/keras:generic_utils",
+    ],
+)
+
 cuda_py_test(
     name = "adagrad_test",
     size = "medium",
@@ -56,6 +72,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -75,6 +92,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -94,6 +112,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -113,6 +132,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -132,6 +152,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -151,6 +172,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -170,6 +192,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 4,
+    xla_enable_strict_auto_jit = True,
 )
 
 py_test(
@@ -178,7 +201,10 @@ py_test(
     srcs = ["optimizer_v2_test.py"],
     shard_count = 8,
     tags = [
+        "no_gpu",  # b/127001953
         "no_windows",
+        # TODO(b/127092862): Re-enable this test in Kokoro.
+        "no_oss",
     ],
     deps = [
         ":optimizer_v2",
@@ -197,12 +223,27 @@ py_test(
     ],
 )
 
+py_test(
+    name = "learning_rate_schedule_test",
+    size = "medium",
+    srcs = ["learning_rate_schedule_test.py"],
+    shard_count = 4,
+    deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/keras",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 cuda_py_test(
     name = "rmsprop_test",
     size = "medium",
     srcs = ["rmsprop_test.py"],
     additional_deps = [
         ":optimizer_v2",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
@@ -215,4 +256,5 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 2,
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index a3d5538ea86a0e0ed86e5ee70df69248ec76ba48..1ceb93328246f44525a17098fa5e8fc3eaf98611 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -41,13 +41,14 @@ class Adadelta(optimizer_v2.OptimizerV2):
 
   Initialization:
 
-  $$accum_g_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
-  $$accum_x_0 := 0 \text{(Initialize variable update 2nd order moment vector)}$$
+  $$E[g^2]_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
+  $$E[\Delta x^2]_0 := 0 \text{(Initialize 2nd order variable update)}$$
 
   $$t := t + 1$$
-  $$accum_g_t := rho * accum_g_{t-1} + (1 - rho) * g * g$$
-  $$delta = -\sqrt{accum_x_{t-1}} / (\sqrt{accum_g_{t-1}} + \epsilon)$$
-  $$accum_x_t := rho * accum_x_{t-1} + (1 - rho) * delta * delta$$
+  $$E[g^2]_t := \rho * E[g^2]_{t-1} + (1 - \rho) * g^2$$
+  $$\Delta x_t = -RMS[\Delta x]_{t-1} * g_t / RMS[g]_t$$
+  $$E[\Delta x^2]_t := \rho * E[\Delta x^2]_{t-1} + (1 - \rho) * \Delta x_t^2$$
+  $$x_t := x_{t-1} + \Delta x_{t}
 
   References
     See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index 0840aa6fae5be0b698de69827f483ec55b9ea37a..c30c0661506cbd962c69d191a47cbd4021ecb484 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -41,18 +41,18 @@ class Adagrad(optimizer_v2.OptimizerV2):
   the smaller the updates.
 
   Initialization:
+  $$accum_{g_0} := \text{initial_accumulator_value}$$
 
-  $$accum_g_0 := initial_accumulator_value$$
-
+  Update step:
   $$t := t + 1$$
-  $$accum_g_t := accum_g_{t-1} + g * g$$
-  $$theta_t := theta_{t-1} - lr * g / (\sqrt{accum_g_t} + \epsilon)$$
-
-  References
-    See [paper]
-      (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-    or this
-      [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  $$accum_{g_t} := accum_{g_{t-1}} + g^2$$
+  $$\theta_t := \theta_{t-1} - lr * g / (\sqrt{accum_{g_t}} + \epsilon)$$
+
+  References:
+
+  * [Paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
+  * [Introduction]
+    (https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
index 864aefaf70def4249b8d93cfad34f0a594e03ba9..9c8d3ff8a4ef89a34fc1217f2a27d13d3e172d68 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -160,6 +161,52 @@ class AdagradOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        decay = 0.5
+        lr_schedule = learning_rate_schedule.InverseTimeDecay(
+            learning_rate, decay_steps=1.0, decay_rate=decay)
+
+        ada_opt = adagrad.Adagrad(lr_schedule)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          lr_np = learning_rate / (1 + decay * t)
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, lr_np)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, lr_np)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
   @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index 4fa7c7361543d5389872ebd70cb0df261325a9c4..29d3beea2b4acf76fe0b6393e7572aa5efd9cce3 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -37,8 +37,6 @@ class Adam(optimizer_v2.OptimizerV2):
   requirement, invariant to diagonal rescaling of gradients, and is well suited
   for problems that are large in terms of data/parameters'.
 
-  Note, amsgrad is currently not supported and the argument can only be False.
-
   # References
       See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
         ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
@@ -64,7 +62,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -82,7 +80,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
index 7918c09b7e04aa0a558b7ebc30ee0120eb358b9f..761b6a0854d761c22e1ea236bb29184992f892a9 100644
--- a/tensorflow/python/keras/optimizer_v2/adam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -399,6 +400,55 @@ class AdamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        decay = 0.5
+        lr_schedule = learning_rate_schedule.InverseTimeDecay(
+            learning_rate, decay_steps=1.0, decay_rate=decay)
+        beta_1 = 0.9
+        beta_2 = 0.999
+        epsilon = 1e-7
+
+        opt = adam.Adam(
+            learning_rate=lr_schedule,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=epsilon)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.evaluate(update)
+
+          lr_np = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, lr=lr_np)
+          var1_np, m1, v1 = adam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, lr=lr_np)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
   @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index a86fd8d89dbc824cc35a4a6585c85e1794a6aa5c..09e3e060bbbb87feb20852514bc02f4d5055a1ab 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -26,13 +26,31 @@ from tensorflow.python.util.tf_export import keras_export
 
 @keras_export('keras.optimizers.Ftrl')
 class Ftrl(optimizer_v2.OptimizerV2):
-  """Optimizer that implements the FTRL algorithm.
+  r"""Optimizer that implements the FTRL algorithm.
 
-  See this [paper](
+  See Algorithm 1 of this [paper](
   https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
   This version has support for both online L2 (the L2 penalty given in the paper
   above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
   loss function).
+
+  Initialization:
+  $t = 0$
+  $n_{0} = 0$
+  $\sigma_{0} = 0$
+  $z_{0} = 0$
+
+  Update ($i$ is variable index):
+  $t = t + 1$
+  $n_{t,i} = n_{t-1,i} + g_{t,i}^{2}$
+  $\sigma_{t,i} = (\sqrt{n_{t,i}} - \sqrt{n_{t-1,i}}) / \alpha$
+  $z_{t,i} = z_{t-1,i} + g_{t,i} - \sigma_{t,i} * w_{t,i}$
+  $w_{t,i} = - ((\beta+\sqrt{n+{t}}) / \alpha + \lambda_{2})^{-1} * (z_{i} -
+               sgn(z_{i}) * \lambda_{1}) if \abs{z_{i}} > \lambda_{i} else 0$
+
+  Check the documentation for the l2_shrinkage_regularization_strength
+  parameter for more details when shrinkage is enabled, where gradient is
+  replaced with gradient_with_shrinkage.
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index 333a6f288eaeda7313e07002d8fad229d372ebec..f579c2f657e332dbd3d18c15b8ea6306eb135927 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
@@ -41,205 +42,217 @@ class GradientDescentOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        sgd = gradient_descent.SGD(3.0)
-        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           self.evaluate(var1))
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      sgd = gradient_descent.SGD(3.0)
+      sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                         self.evaluate(var1))
+
+  def _test_basic_sgd_with_learning_rate_decay(self, sgd, dtype):
+    var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+    var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+    grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+    grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+    if not context.executing_eagerly():
+      sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+    # Run 2 steps of sgd
+    if not context.executing_eagerly():
+      self.evaluate(sgd_op)
+    else:
+      sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    # Validate updated params
+    self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                       self.evaluate(var0))
+    self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                       self.evaluate(var1))
+
+    if not context.executing_eagerly():
+      self.evaluate(sgd_op)
+    else:
+      sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    # Validate updated params
+    self.assertAllCloseAccordingToType(
+        [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
+        self.evaluate(var0))
+    self.assertAllCloseAccordingToType(
+        [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
+        self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testBasicWithLearningRateDecay(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        learning_rate = 3.0
-        decay = 0.5
-        sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
-        if not context.executing_eagerly():
-          sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 2 steps of sgd
-        if not context.executing_eagerly():
-          self.evaluate(sgd_op)
-        else:
-          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           self.evaluate(var1))
-
-        if not context.executing_eagerly():
-          self.evaluate(sgd_op)
-        else:
-          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
-            self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
-            self.evaluate(var1))
+      learning_rate = 3.0
+      decay = 0.5
+      sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      learning_rate = learning_rate_schedule.InverseTimeDecay(
+          3.0, decay_steps=1.0, decay_rate=0.5)
+      sgd = gradient_descent.SGD(learning_rate=learning_rate)
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      learning_rate = learning_rate_schedule.InverseTimeDecay(
+          3.0, decay_steps=1.0, decay_rate=0.5)
+      sgd = gradient_descent.SGD(learning_rate=learning_rate)
+      sgd = gradient_descent.SGD.from_config(sgd.get_config())
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
 
   @test_util.run_in_graph_and_eager_modes
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        lr = lambda: 3.0
-        sgd = gradient_descent.SGD(lr)
-        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           self.evaluate(var1))
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      lr = lambda: 3.0
+      sgd = gradient_descent.SGD(lr)
+      sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                         self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
-        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
-        sgd = gradient_descent.SGD(1.0)
-        sgd_op = sgd.minimize(loss, [var0, var1])
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1))
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
+      sgd = gradient_descent.SGD(1.0)
+      sgd_op = sgd.minimize(loss, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
-        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
 
-        def loss():
-          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
-          pred += var1  # pylint: disable=cell-var-from-loop
-          return pred * pred
+      def loss():
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+        pred += var1  # pylint: disable=cell-var-from-loop
+        return pred * pred
 
-        sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
-        np_grad = 2 * np_pred
-        self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
+      sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+      np_grad = 2 * np_pred
+      self.assertAllCloseAccordingToType(
+          [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+      self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        lrate = constant_op.constant(3.0)
-        sgd_op = gradient_descent.SGD(lrate).apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           self.evaluate(var1))
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      lrate = constant_op.constant(3.0)
+      sgd_op = gradient_descent.SGD(lrate).apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                         self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        opt = gradient_descent.SGD(3.0)
-        values = [1.0, 3.0]
-        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
-        loss = lambda: vars_[0] + vars_[1]  # pylint: disable=cell-var-from-loop
-        grads_and_vars = opt._compute_gradients(loss, vars_)
-        self.evaluate(variables.global_variables_initializer())
-        for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
+      opt = gradient_descent.SGD(3.0)
+      values = [1.0, 3.0]
+      vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+      loss = lambda: vars_[0] + vars_[1]  # pylint: disable=cell-var-from-loop
+      grads_and_vars = opt._compute_gradients(loss, vars_)
+      self.evaluate(variables.global_variables_initializer())
+      for grad, _ in grads_and_vars:
+        self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
 
+  @test_util.run_deprecated_v1
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
-        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
-        grads0 = ops.IndexedSlices(
-            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
-            constant_op.constant([0]), constant_op.constant([2, 1]))
-        grads1 = ops.IndexedSlices(
-            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
-            constant_op.constant([1]), constant_op.constant([2, 1]))
-        sgd_op = gradient_descent.SGD(3.0).apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 1 step of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           self.evaluate(var1))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+      var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+          constant_op.constant([0]), constant_op.constant([2, 1]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([2, 1]))
+      sgd_op = gradient_descent.SGD(3.0).apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                         self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testSparseBasicWithLearningRateDecay(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
-        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
-        grads0 = ops.IndexedSlices(
-            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
-            constant_op.constant([0]), constant_op.constant([2, 1]))
-        grads1 = ops.IndexedSlices(
-            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
-            constant_op.constant([1]), constant_op.constant([2, 1]))
-        sgd_op = gradient_descent.SGD(
-            3.0, decay=0.5).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 2 steps of sgd
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           self.evaluate(var1))
-
-        self.evaluate(sgd_op)
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+      var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+          constant_op.constant([0]), constant_op.constant([2, 1]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([2, 1]))
+      sgd_op = gradient_descent.SGD(
+          3.0, decay=0.5).apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Run 2 steps of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                         self.evaluate(var0))
+      self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                         self.evaluate(var1))
+
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType(
+          [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
 
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
@@ -285,100 +298,98 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testBasic(self):
     for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
-                                                      dtype=dtype,
-                                                      name="var0")
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
-                                                      dtype=dtype,
-                                                      name="var1")
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        learning_rate = 2.0
-        momentum = 0.9
-        mom_opt = gradient_descent.SGD(
-            learning_rate=learning_rate, momentum=momentum)
-        # self.assertFalse(mom_opt._initial_decay)
-        mom_update = mom_opt.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-
-        # Check we have slots
-        slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEqual(slot0.get_shape(), var0.get_shape())
-        slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEqual(slot1.get_shape(), var1.get_shape())
-
-        # Step 1: the momentum accumulators where 0. So we should see a normal
-        # update: v -= grad * learning_rate
-        self.evaluate(variables.global_variables_initializer())
-        self.evaluate(mom_update)
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([-0.2, -0.2]), self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([-0.02, -0.02]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
-            self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
-            self.evaluate(var1))
-        # Step 2: the momentum accumulators contain the previous update.
-        self.evaluate(mom_update)
-        if context.executing_eagerly():
-          mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
-            self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
-                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
-                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
-                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]), self.evaluate(var1))
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                    dtype=dtype,
+                                                    name="var0")
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                    dtype=dtype,
+                                                    name="var1")
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      learning_rate = 2.0
+      momentum = 0.9
+      mom_opt = gradient_descent.SGD(
+          learning_rate=learning_rate, momentum=momentum)
+      # self.assertFalse(mom_opt._initial_decay)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+
+      # Check we have slots
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEqual(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([-0.2, -0.2]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([-0.02, -0.02]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      self.evaluate(mom_update)
+      if context.executing_eagerly():
+        mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                    (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+              3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
-                                                      dtype=dtype,
-                                                      name="var0")
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
-                                                      dtype=dtype,
-                                                      name="var1")
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        loss = lambda: 5 * var0 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        mom_op = gradient_descent.SGD(
-            learning_rate=2.0, momentum=0.9, nesterov=True)
-        opt_op = mom_op.minimize(loss, [var0, var1])
-        variables.global_variables_initializer().run()
-        for _ in range(1, 5):
-          opt_op.run()
-          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
-              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
-          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
-              var1_np, accum1_np, 3, 2.0, 0.9)
-          self.assertAllClose(var0_np, self.evaluate(var0))
-          self.assertAllClose(var1_np, self.evaluate(var1))
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                    dtype=dtype,
+                                                    name="var0")
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                    dtype=dtype,
+                                                    name="var1")
+      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+      accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      loss = lambda: 5 * var0 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      mom_op = gradient_descent.SGD(
+          learning_rate=2.0, momentum=0.9, nesterov=True)
+      opt_op = mom_op.minimize(loss, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      for _ in range(1, 5):
+        self.evaluate(opt_op)
+        var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+            var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+        var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+            var1_np, accum1_np, 3, 2.0, 0.9)
+        self.assertAllClose(var0_np, self.evaluate(var0))
+        self.assertAllClose(var1_np, self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.cached_session():
+      with self.cached_session() as sess:
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -406,9 +417,9 @@ class MomentumOptimizerTest(test.TestCase):
         grads_and_vars = [(y_feed, var0),
                           (constant_op.constant([3.0, 3.0], dtype=dtype), var1)]
         opt_update = mom_op.apply_gradients(grads_and_vars)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         for t in range(1, 5):
-          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          sess.run(opt_update, feed_dict={x_feed: grads[t - 1]})
           var0_np, accum0_np = self._update_nesterov_momentum_numpy(
               var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
@@ -474,231 +485,227 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        mom_opt = gradient_descent.SGD(
-            learning_rate=constant_op.constant(2.0),
-            momentum=constant_op.constant(0.9))
-        mom_update = mom_opt.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Check we have slots
-        slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEqual(slot0.get_shape(), var0.get_shape())
-        slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEqual(slot1.get_shape(), var1.get_shape())
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-        # Step 1: the momentum accumulators where 0. So we should see a normal
-        # update: v -= grad * learning_rate
-        mom_update.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([-0.2, -0.2]), self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([-0.02, -0.02]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
-            self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
-            self.evaluate(var1))
-        # Step 2: the momentum accumulators contain the previous update.
-        mom_update.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
-            self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
-                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
-                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
-                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]), self.evaluate(var1))
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      mom_opt = gradient_descent.SGD(
+          learning_rate=constant_op.constant(2.0),
+          momentum=constant_op.constant(0.9))
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Check we have slots
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEqual(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([-0.2, -0.2]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([-0.02, -0.02]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                    (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+              3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
 
   @test_util.run_deprecated_v1
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
-        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
-        grads0 = ops.IndexedSlices(
-            constant_op.constant([[.1, .1]], dtype=dtype),
-            constant_op.constant([1]), constant_op.constant([4, 2]))
-        grads1 = ops.IndexedSlices(
-            constant_op.constant([[.01, .01], [.01, .01]], dtype=dtype),
-            constant_op.constant([2, 3]), constant_op.constant([4, 2]))
-        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
-        mom_update = mom_opt.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        # Check we have slots
-        slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEqual(slot0.get_shape(), var0.get_shape())
-        slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEqual(slot1.get_shape(), var1.get_shape())
-
-        # Fetch params to validate initial values
-        self.assertAllClose([0, 0], self.evaluate(var0)[0])
-        self.assertAllClose([0, 0], self.evaluate(var0)[1])
-        self.assertAllClose([1, 1], self.evaluate(var1)[2])
-
-        # Step 1: the momentum accumulators are 0. So we should see a normal
-        # update: v -= grad * learning_rate
-        mom_update.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([0, 0]),
-            self.evaluate(slot0)[0])
-        self.assertAllCloseAccordingToType(
-            np.array([-2.0 * .1, -2.0 * .1]),
-            self.evaluate(slot0)[1])
-        self.assertAllCloseAccordingToType(
-            np.array([-2.0 * .01, -2.0 * .01]),
-            self.evaluate(slot1)[2])
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([0, 0]),
-            self.evaluate(var0)[0])
-        self.assertAllCloseAccordingToType(
-            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
-            self.evaluate(var0)[1])
-        self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
-            self.evaluate(var1)[2])
-        # Step 2: the momentum accumulators contain the previous update.
-        mom_update.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
-            self.evaluate(slot0)[1])
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
-                      (0.9 * (-0.02) - 2.0 * 0.01)]),
-            self.evaluate(slot1)[2])
-        # Check that the parameters have been updated.
-        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
-        self.assertAllCloseAccordingToType(
-            np.array([
-                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
-                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]),
-            self.evaluate(var0)[1])
-        self.assertAllCloseAccordingToType(
-            np.array([
-                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
-                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]),
-            self.evaluate(var1)[2])
+      var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+      var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([[.1, .1]], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([4, 2]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([[.01, .01], [.01, .01]], dtype=dtype),
+          constant_op.constant([2, 3]), constant_op.constant([4, 2]))
+      mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+
+      # Check we have slots
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEqual(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+      # Fetch params to validate initial values
+      self.assertAllClose([0, 0], self.evaluate(var0)[0])
+      self.assertAllClose([0, 0], self.evaluate(var0)[1])
+      self.assertAllClose([1, 1], self.evaluate(var1)[2])
+
+      # Step 1: the momentum accumulators are 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([0, 0]),
+          self.evaluate(slot0)[0])
+      self.assertAllCloseAccordingToType(
+          np.array([-2.0 * .1, -2.0 * .1]),
+          self.evaluate(slot0)[1])
+      self.assertAllCloseAccordingToType(
+          np.array([-2.0 * .01, -2.0 * .01]),
+          self.evaluate(slot1)[2])
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([0, 0]),
+          self.evaluate(var0)[0])
+      self.assertAllCloseAccordingToType(
+          np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+          self.evaluate(var0)[1])
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+          self.evaluate(var1)[2])
+      # Step 2: the momentum accumulators contain the previous update.
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+          self.evaluate(slot0)[1])
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                    (0.9 * (-0.02) - 2.0 * 0.01)]),
+          self.evaluate(slot1)[2])
+      # Check that the parameters have been updated.
+      self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
+      self.assertAllCloseAccordingToType(
+          np.array([
+              -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]),
+          self.evaluate(var0)[1])
+      self.assertAllCloseAccordingToType(
+          np.array([
+              0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+              0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+          ]),
+          self.evaluate(var1)[2])
 
   @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
-        mom_update1 = mom_opt.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        mom_update2 = mom_opt.apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEqual(slot0.get_shape(), var0.get_shape())
-        slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEqual(slot1.get_shape(), var1.get_shape())
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-        # Step 1: the momentum accumulators where 0. So we should see a normal
-        # update: v -= grad * learning_rate
-        mom_update1.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([-0.2, -0.2]), self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([-0.02, -0.02]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
-            self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
-            self.evaluate(var1))
-        # Step 2: the second momentum accumulators contain the previous update.
-        mom_update2.run()
-        # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
-            self.evaluate(slot0))
-        self.assertAllCloseAccordingToType(
-            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
-                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
-        # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
-                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
-                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]), self.evaluate(var1))
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+      mom_update1 = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      mom_update2 = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEqual(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      self.evaluate(mom_update1)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([-0.2, -0.2]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([-0.02, -0.02]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the second momentum accumulators contain the previous update.
+      self.evaluate(mom_update2)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                    (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+              3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testConfig(self):
-    with self.cached_session():
-      opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
-      config = opt.get_config()
-      opt2 = gradient_descent.SGD.from_config(config)
-      lr = opt.lr
-      lr2 = opt2.lr
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
-      self.assertAllClose(
-          self.evaluate(opt._get_hyper("momentum")),
-          self.evaluate(opt2._get_hyper("momentum")))
-      self.assertAllClose(
-          self.evaluate(opt._get_hyper("decay")),
-          self.evaluate(opt2._get_hyper("decay")))
-      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
-      loss = lambda: 3 * var0
-      # learning rate variable created when calling minimize.
-      opt.minimize(loss, [var0])
-      self.evaluate(variables.global_variables_initializer())
-      config = opt.get_config()
-      opt3 = gradient_descent.SGD.from_config(config)
-      lr3 = opt3.lr
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
-      self.assertAllClose(
-          self.evaluate(opt._get_hyper("momentum")),
-          self.evaluate(opt3._get_hyper("momentum")))
-      self.assertAllClose(
-          self.evaluate(opt._get_hyper("decay")),
-          self.evaluate(opt3._get_hyper("decay")))
-      self.assertTrue(opt3.nesterov)
+    opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
+    config = opt.get_config()
+    opt2 = gradient_descent.SGD.from_config(config)
+    lr = opt.lr
+    lr2 = opt2.lr
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
+    self.assertAllClose(
+        self.evaluate(opt._get_hyper("momentum")),
+        self.evaluate(opt2._get_hyper("momentum")))
+    self.assertAllClose(
+        self.evaluate(opt._get_hyper("decay")),
+        self.evaluate(opt2._get_hyper("decay")))
+    var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
+    loss = lambda: 3 * var0
+    # learning rate variable created when calling minimize.
+    opt.minimize(loss, [var0])
+    self.evaluate(variables.global_variables_initializer())
+    config = opt.get_config()
+    opt3 = gradient_descent.SGD.from_config(config)
+    lr3 = opt3.lr
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
+    self.assertAllClose(
+        self.evaluate(opt._get_hyper("momentum")),
+        self.evaluate(opt3._get_hyper("momentum")))
+    self.assertAllClose(
+        self.evaluate(opt._get_hyper("decay")),
+        self.evaluate(opt3._get_hyper("decay")))
+    self.assertTrue(opt3.nesterov)
 
   def testNesterovWithoutMomentum(self):
     with self.assertRaisesRegexp(ValueError, "must be between"):
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..c44263bdcf2237ae998f7d796bd4086361a4146c
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
@@ -0,0 +1,1028 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various learning rate decay functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import math
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export("keras.optimizers.schedules.LearningRateSchedule")
+class LearningRateSchedule(object):
+  """A serializable learning rate decay schedule.
+
+  `LearningRateSchedule`s can be passed in as the learning rate of optimizers in
+  `tf.keras.optimizers`. They can be serialized and deserialized using
+  `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+  """
+
+  @abc.abstractmethod
+  def __call__(self, step):
+    raise NotImplementedError("Learning rate schedule must override __call__")
+
+  @abc.abstractmethod
+  def get_config(self):
+    raise NotImplementedError("Learning rate schedule must override get_config")
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates a `LearningRateSchedule` from its config.
+
+    Args:
+        config: Output of `get_config()`.
+
+    Returns:
+        A `LearningRateSchedule` instance.
+    """
+    return cls(**config)
+
+
+@keras_export("keras.optimizers.schedules.ExponentialDecay")
+class ExponentialDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses an exponential decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      decay_rate,
+      staircase=False,
+      name=None):
+    """Applies exponential decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies an exponential decay function
+    to an optimizer step, given a provided initial learning rate.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate * decay_rate ^ (step / decay_steps)
+    ```
+
+    If the argument `staircase` is `True`, then `step / decay_steps` is
+    an integer division and the decayed learning rate follows a
+    staircase function.
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: When fitting a Keras model, decay every 100000 steps with a base
+    of 0.96:
+
+    ```python
+    initial_learning_rate = 0.1
+    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate,
+        decay_steps=100000,
+        decay_rate=0.96,
+        staircase=True)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    The learning rate schedule is also serializable and deserializable using
+    `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Must be positive.  See the decay computation above.
+      decay_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The decay rate.
+      staircase: Boolean.  If `True` decay the learning rate at discrete
+        intervals
+      name: String.  Optional name of the operation.  Defaults to
+        'ExponentialDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(ExponentialDecay, self).__init__()
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.decay_rate = decay_rate
+    self.staircase = staircase
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(
+        self.name, "ExponentialDecay",
+        [self.initial_learning_rate, step, self.decay_steps, self.decay_rate]
+    ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      decay_rate = math_ops.cast(self.decay_rate, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      p = global_step_recomp / decay_steps
+      if self.staircase:
+        p = math_ops.floor(p)
+      return math_ops.multiply(
+          initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "decay_rate": self.decay_rate,
+        "staircase": self.staircase,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
+class PiecewiseConstantDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a piecewise constant decay schedule."""
+
+  def __init__(
+      self,
+      boundaries,
+      values,
+      name=None):
+    """Piecewise constant from boundaries and interval values.
+
+    The function returns a 1-arg callable to compute the piecewise constant
+    when passed the current optimizer step. This can be useful for changing the
+    learning rate value across different invocations of optimizer functions.
+
+    Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
+      for the next 10000 steps, and 0.1 for any additional steps.
+
+    ```python
+    step = tf.Variable(0, trainable=False)
+    boundaries = [100000, 110000]
+    values = [1.0, 0.5, 0.1]
+    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
+        boundaries, values)
+
+    # Later, whenever we perform an optimization step, we pass in the step.
+    learning_rate = learning_rate_fn(step)
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
+        increasing entries, and with all elements having the same type as the
+        optimizer step.
+      values: A list of `Tensor`s or `float`s or `int`s that specifies the
+        values for the intervals defined by `boundaries`. It should have one
+        more element than `boundaries`, and all elements should have the same
+        type.
+      name: A string. Optional name of the operation. Defaults to
+        'PiecewiseConstant'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as the boundary tensors.
+
+      The output of the 1-arg function that takes the `step`
+      is `values[0]` when `step <= boundaries[0]`,
+      `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
+      and values[-1] when `step > boundaries[-1]`.
+
+    Raises:
+      ValueError: if types of all `values` do not match or
+          the number of elements in the lists does not match.
+    """
+    super(PiecewiseConstantDecay, self).__init__()
+
+    if len(boundaries) != len(values) - 1:
+      raise ValueError(
+          "The length of boundaries should be 1 less than the length of values")
+
+    self.boundaries = boundaries
+    self.values = values
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "PiecewiseConstant",
+                        [step, self.boundaries, self.values, self.name]):
+      boundaries = ops.convert_n_to_tensor(self.boundaries)
+      values = ops.convert_n_to_tensor(self.values)
+      x_recomp = ops.convert_to_tensor(step)
+      # Avoid explicit conversion to x's dtype. This could result in faulty
+      # comparisons, for example if floats are converted to integers.
+      for i, b in enumerate(boundaries):
+        if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
+          # We can promote int32 boundaries to int64 without loss of precision.
+          # This covers the most common case where the user passes in boundaries
+          # as an array of Python integers.
+          if (b.dtype.base_dtype == dtypes.int32 and
+              x_recomp.dtype.base_dtype == dtypes.int64):
+            b = math_ops.cast(b, x_recomp.dtype.base_dtype)
+            boundaries[i] = b
+          else:
+            raise ValueError(
+                "Boundaries (%s) must have the same dtype as x (%s)." %
+                (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
+      # TODO(rdipietro): Ensure that boundaries' elements strictly increases.
+      for v in values[1:]:
+        if v.dtype.base_dtype != values[0].dtype.base_dtype:
+          raise ValueError(
+              "Values must have elements all with the same dtype (%s vs %s)." %
+              (values[0].dtype.base_dtype, v.dtype.base_dtype))
+      pred_fn_pairs = []
+      pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
+      pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
+      for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
+        # Need to bind v here; can do this with lambda v=v: ...
+        pred = (x_recomp > low) & (x_recomp <= high)
+        pred_fn_pairs.append((pred, lambda v=v: v))
+
+      # The default isn't needed here because our conditions are mutually
+      # exclusive and exhaustive, but tf.case requires it.
+      default = lambda: values[0]
+      return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
+
+  def get_config(self):
+    return {
+        "boundaries": self.boundaries,
+        "values": self.values,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.PolynomialDecay")
+class PolynomialDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a polynomial decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      end_learning_rate=0.0001,
+      power=1.0,
+      cycle=False,
+      name=None):
+    """Applies a polynomial decay to the learning rate.
+
+    It is commonly observed that a monotonically decreasing learning rate, whose
+    degree of change is carefully chosen, results in a better performing model.
+    This schedule applies a polynomial decay function to an optimizer step,
+    given a provided `initial_learning_rate`, to reach an `end_learning_rate`
+    in the given `decay_steps`.
+
+    It requires a `step` value to compute the decayed learning rate. You
+    can just pass a TensorFlow variable that you increment at each training
+    step.
+
+    The schedule is a 1-arg callable that produces a decayed learning rate
+    when passed the current optimizer step. This can be useful for changing the
+    learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      return ((initial_learning_rate - end_learning_rate) *
+              (1 - step / decay_steps) ^ (power)
+             ) + end_learning_rate
+    ```
+
+    If `cycle` is True then a multiple of `decay_steps` is used, the first one
+    that is bigger than `step`.
+
+    ```python
+    def decayed_learning_rate(step):
+      decay_steps = decay_steps * ceil(step / decay_steps)
+      return ((initial_learning_rate - end_learning_rate) *
+              (1 - step / decay_steps) ^ (power)
+             ) + end_learning_rate
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
+    sqrt (i.e. power=0.5):
+
+    ```python
+    ...
+    starter_learning_rate = 0.1
+    end_learning_rate = 0.01
+    decay_steps = 10000
+    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+        starter_learning_rate,
+        decay_steps,
+        end_learning_rate,
+        power=0.5)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(
+                      learning_rate=learning_rate_fn),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    The learning rate schedule is also serializable and deserializable using
+    `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Must be positive.  See the decay computation above.
+      end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The minimal end learning rate.
+      power: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The power of the polynomial. Defaults to linear, 1.0.
+      cycle: A boolean, whether or not it should cycle beyond decay_steps.
+      name: String.  Optional name of the operation. Defaults to
+        'PolynomialDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(PolynomialDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.end_learning_rate = end_learning_rate
+    self.power = power
+    self.cycle = cycle
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(
+        self.name, "PolynomialDecay",
+        [self.initial_learning_rate, step, self.decay_steps,
+         self.end_learning_rate, self.power]
+    ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
+      power = math_ops.cast(self.power, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
+      if self.cycle:
+        # Find the first multiple of decay_steps that is bigger than
+        # global_step. If global_step is zero set the multiplier to 1
+        multiplier = control_flow_ops.cond(
+            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
+            lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
+        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
+      else:
+        # Make sure that the global_step used is not bigger than decay_steps.
+        global_step_recomp = math_ops.minimum(global_step_recomp,
+                                              self.decay_steps)
+
+      p = math_ops.div(global_step_recomp, decay_steps_recomp)
+      return math_ops.add(
+          math_ops.multiply(initial_learning_rate - end_learning_rate,
+                            math_ops.pow(1 - p, power)),
+          end_learning_rate,
+          name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "end_learning_rate": self.end_learning_rate,
+        "power": self.power,
+        "cycle": self.cycle,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.InverseTimeDecay")
+class InverseTimeDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses an inverse time decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      decay_rate,
+      staircase=False,
+      name=None):
+    """Applies inverse time decay to the initial learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies the inverse decay function
+    to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate / (1 + decay_rate * step / decay_step)
+    ```
+
+    or, if `staircase` is `True`, as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate / (1 + decay_rate * floor(step / decay_step))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: Fit a Keras model when decaying 1/t with a rate of 0.5:
+
+    ```python
+    ...
+    initial_learning_rate = 0.1
+    decay_steps = 1.0
+    decay_rate = 0.5
+    learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
+      initial_learning_rate, global_step, decay_steps, decay_rate)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(
+                      learning_rate=learning_rate_fn),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: How often to apply decay.
+      decay_rate: A Python number.  The decay rate.
+      staircase: Whether to apply decay in a discrete staircase, as opposed to
+        continuous, fashion.
+      name: String.  Optional name of the operation.  Defaults to
+        'InverseTimeDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(InverseTimeDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.decay_rate = decay_rate
+    self.staircase = staircase
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "InverseTimeDecay",
+                        [self.initial_learning_rate, step, self.decay_rate]
+                       ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      decay_rate = math_ops.cast(self.decay_rate, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      p = global_step_recomp / decay_steps
+      if self.staircase:
+        p = math_ops.floor(p)
+      const = math_ops.cast(constant_op.constant(1), dtype)
+      denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
+      return math_ops.div(initial_learning_rate, denom, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "decay_rate": self.decay_rate,
+        "staircase": self.staircase,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.CosineDecay")
+class CosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      alpha=0.0,
+      name=None):
+    """Applies cosine decay to the learning rate.
+
+    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a cosine decay function
+    to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
+      decayed = (1 - alpha) * cosine_decay + alpha
+      return initial_learning_rate * decayed
+    ```
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = tf.keras.experimental.CosineDecay(
+        initial_learning_rate, global_step, decay_steps)
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a
+        Python number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      alpha: A scalar `float32` or `float64` Tensor or a Python number.
+        Minimum learning rate value as a fraction of initial_learning_rate.
+      name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(CosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.alpha = alpha
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "CosineDecay",
+                        [self.initial_learning_rate, step]):
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      completed_fraction = global_step_recomp / decay_steps
+      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
+          constant_op.constant(math.pi) * completed_fraction))
+
+      decayed = (1 - self.alpha) * cosine_decayed + self.alpha
+      return math_ops.multiply(initial_learning_rate, decayed)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "alpha": self.alpha,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.CosineDecayRestarts")
+class CosineDecayRestarts(LearningRateSchedule):
+  """A LearningRateSchedule that uses a cosine decay schedule with restarts."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      first_decay_steps,
+      t_mul=2.0,
+      m_mul=1.0,
+      alpha=0.0,
+      name=None):
+    """Applies cosine decay with restarts to the learning rate.
+
+    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a cosine decay function with
+    restarts to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+
+    The learning rate multiplier first decays
+    from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
+    restart is performed. Each new warm restart runs for `t_mul` times more
+    steps and with `m_mul` times smaller initial learning rate.
+
+    Example usage:
+    ```python
+    first_decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.CosineDecayRestarts(
+          initial_learning_rate,
+          global_step,
+          first_decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
+        number. Number of steps to decay over.
+      t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+        Used to derive the number of iterations in the i-th period
+      m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+        Used to derive the initial learning rate of the i-th period:
+      alpha: A scalar `float32` or `float64` Tensor or a Python number.
+        Minimum learning rate value as a fraction of the initial_learning_rate.
+      name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    Raises:
+      ValueError: if `global_step` is not supplied.
+    """
+    super(CosineDecayRestarts, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.first_decay_steps = first_decay_steps
+    self._t_mul = t_mul
+    self._m_mul = m_mul
+    self.alpha = alpha
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "SGDRDecay",
+                        [self.initial_learning_rate, step]
+                       ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      first_decay_steps = math_ops.cast(self.first_decay_steps, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      t_mul = math_ops.cast(self._t_mul, dtype)
+      m_mul = math_ops.cast(self._m_mul, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      completed_fraction = global_step_recomp / first_decay_steps
+
+      def compute_step(completed_fraction, geometric=False):
+        """Helper for `cond` operation."""
+        if geometric:
+          i_restart = math_ops.floor(
+              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+              math_ops.log(t_mul))
+
+          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
+
+        else:
+          i_restart = math_ops.floor(completed_fraction)
+          completed_fraction -= i_restart
+
+        return i_restart, completed_fraction
+
+      i_restart, completed_fraction = control_flow_ops.cond(
+          math_ops.equal(t_mul, 1.0),
+          lambda: compute_step(completed_fraction, geometric=False),
+          lambda: compute_step(completed_fraction, geometric=True))
+
+      m_fac = m_mul**i_restart
+      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
+          constant_op.constant(math.pi) * completed_fraction))
+      decayed = (1 - alpha) * cosine_decayed + alpha
+
+      return math_ops.multiply(initial_learning_rate, decayed, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "first_decay_steps": self.first_decay_steps,
+        "t_mul": self._t_mul,
+        "m_mul": self._m_mul,
+        "alpha": self.alpha,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.LinearCosineDecay")
+class LinearCosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a linear cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      num_periods=0.5,
+      alpha=0.0,
+      beta=0.001,
+      name=None):
+    """Applies linear cosine decay to the learning rate.
+
+    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+    https://arxiv.org/abs/1709.07417
+
+    For the idea of warm starts here controlled by `num_periods`,
+    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    Note that linear cosine decay is more aggressive than cosine decay and
+    larger initial learning rates can typically be used.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a linear cosine decay
+    function to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      linear_decay = (decay_steps - step) / decay_steps
+      cosine_decay = 0.5 * (
+          1 + cos(pi * 2 * num_periods * step / decay_steps))
+      decayed = (alpha + linear_decay) * cosine_decay + beta
+      return initial_learning_rate * decayed
+    ```
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.LinearCosineDecay(
+        initial_learning_rate, global_step, decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      num_periods: Number of periods in the cosine part of the decay.
+        See computation above.
+      alpha: See computation above.
+      beta: See computation above.
+      name: String.  Optional name of the operation.  Defaults to
+        'LinearCosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(LinearCosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.num_periods = num_periods
+    self.alpha = alpha
+    self.beta = beta
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "LinearCosineDecay",
+                        [self.initial_learning_rate, step]) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      num_periods = math_ops.cast(self.num_periods, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      beta = math_ops.cast(self.beta, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+      completed_fraction = global_step_recomp / decay_steps
+      fraction = 2.0 * num_periods * completed_fraction
+      cosine_decayed = 0.5 * (
+          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+
+      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
+      return math_ops.multiply(initial_learning_rate, linear_cosine_decayed,
+                               name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "num_periods": self.num_periods,
+        "alpha": self.alpha,
+        "beta": self.beta,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.NoisyLinearCosineDecay")
+class NoisyLinearCosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a noisy linear cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      initial_variance=1.0,
+      variance_decay=0.55,
+      num_periods=0.5,
+      alpha=0.0,
+      beta=0.001,
+      name=None):
+    """Applies noisy linear cosine decay to the learning rate.
+
+    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+    https://arxiv.org/abs/1709.07417
+
+    For the idea of warm starts here controlled by `num_periods`,
+    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    Note that linear cosine decay is more aggressive than cosine decay and
+    larger initial learning rates can typically be used.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a noisy linear cosine decay
+    function to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      linear_decay = (decay_steps - step) / decay_steps)
+      cosine_decay = 0.5 * (
+          1 + cos(pi * 2 * num_periods * step / decay_steps))
+      decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
+      return initial_learning_rate * decayed
+    ```
+    where eps_t is 0-centered gaussian noise with variance
+    initial_variance / (1 + global_step) ** variance_decay
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.NoisyLinearCosineDecay(
+        initial_learning_rate, global_step, decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      initial_variance: initial variance for the noise. See computation above.
+      variance_decay: decay for the noise's variance. See computation above.
+      num_periods: Number of periods in the cosine part of the decay.
+        See computation above.
+      alpha: See computation above.
+      beta: See computation above.
+      name: String.  Optional name of the operation.  Defaults to
+        'NoisyLinearCosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(NoisyLinearCosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.initial_variance = initial_variance
+    self.variance_decay = variance_decay
+    self.num_periods = num_periods
+    self.alpha = alpha
+    self.beta = beta
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
+                        [self.initial_learning_rate, step]) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      initial_variance = math_ops.cast(self.initial_variance, dtype)
+      variance_decay = math_ops.cast(self.variance_decay, dtype)
+      num_periods = math_ops.cast(self.num_periods, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      beta = math_ops.cast(self.beta, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+      variance = initial_variance / (
+          math_ops.pow(1.0 + global_step_recomp, variance_decay))
+      std = math_ops.sqrt(variance)
+      noisy_linear_decayed = (
+          linear_decayed + random_ops.random_normal(
+              linear_decayed.shape, stddev=std))
+
+      completed_fraction = global_step_recomp / decay_steps
+      fraction = 2.0 * num_periods * completed_fraction
+      cosine_decayed = 0.5 * (
+          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+      noisy_linear_cosine_decayed = (
+          (alpha + noisy_linear_decayed) * cosine_decayed + beta)
+
+      return math_ops.multiply(
+          initial_learning_rate, noisy_linear_cosine_decayed, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "initial_variance": self.initial_variance,
+        "variance_decay": self.variance_decay,
+        "num_periods": self.num_periods,
+        "alpha": self.alpha,
+        "beta": self.beta,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.serialize")
+def serialize(learning_rate_schedule):
+  return generic_utils.serialize_keras_object(learning_rate_schedule)
+
+
+@keras_export("keras.optimizers.schedules.deserialize")
+def deserialize(config, custom_objects=None):
+  return generic_utils.deserialize_keras_object(
+      config,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name="decay")
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..87b97fa76ca39850d111db75aeed3f991e46ddc6
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
@@ -0,0 +1,527 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional test for learning rate decay."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from absl.testing import parameterized
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+def _maybe_serialized(lr_decay, serialize_and_deserialize):
+  if serialize_and_deserialize:
+    serialized = learning_rate_schedule.serialize(lr_decay)
+    return learning_rate_schedule.deserialize(serialized)
+  else:
+    return lr_decay
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LRDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContinuous(self, serialize):
+    self.evaluate(variables.global_variables_initializer())
+    step = 5
+    decayed_lr = learning_rate_schedule.ExponentialDecay(0.05, 10, 0.96)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = .05 * 0.96**(5.0 / 10.0)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testStaircase(self, serialize):
+    if context.executing_eagerly():
+      step = resource_variable_ops.ResourceVariable(0)
+      self.evaluate(variables.global_variables_initializer())
+      decayed_lr = learning_rate_schedule.ExponentialDecay(
+          .1, 3, 0.96, staircase=True)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+      # No change to learning rate due to staircase
+      expected = .1
+      self.evaluate(step.assign(1))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+      expected = .1
+      self.evaluate(step.assign(2))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+      # Decayed learning rate
+      expected = .1 * 0.96 ** (100 // 3)
+      self.evaluate(step.assign(100))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_deprecated_v1
+  def testVariables(self, serialize):
+    step = variables.Variable(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_schedule.ExponentialDecay(
+        .1, 3, 0.96, staircase=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testPiecewiseConstant(self, serialize):
+    x = resource_variable_ops.ResourceVariable(-999)
+    decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+        [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6)
+    self.evaluate(x.assign(100))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6)
+    self.evaluate(x.assign(105))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6)
+    self.evaluate(x.assign(110))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6)
+    self.evaluate(x.assign(120))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.01, 1e-6)
+    self.evaluate(x.assign(999))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.001, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testPiecewiseConstantEdgeCases(self, serialize):
+    x_int = resource_variable_ops.ResourceVariable(
+        0, dtype=variables.dtypes.int32)
+    boundaries, values = [-1.0, 1.0], [1, 2, 3]
+    with self.assertRaises(ValueError):
+      decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+          boundaries, values)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      decayed_lr(x_int)
+
+    x = resource_variable_ops.ResourceVariable(0.0)
+    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
+    with self.assertRaises(ValueError):
+      decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+          boundaries, values)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      decayed_lr(x)
+
+    # Test casting boundaries from int32 to int64.
+    x_int64 = resource_variable_ops.ResourceVariable(
+        0, dtype=variables.dtypes.int64)
+    boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
+    decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+        boundaries, values)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6)
+    self.evaluate(x_int64.assign(1))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6)
+    self.evaluate(x_int64.assign(2))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.5, 1e-6)
+    self.evaluate(x_int64.assign(3))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.6, 1e-6)
+    self.evaluate(x_int64.assign(4))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.7, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LinearDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWay(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.0
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr * 0.5
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testEnd(self, serialize):
+    step = 10
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWayWithEnd(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr + end_lr) * 0.5
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEnd(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEndWithCycle(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.25 + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class SqrtDecayTestV2(test_util.TensorFlowTestCase,
+                      parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWay(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.0
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr * 0.5**power
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testEnd(self, serialize):
+    step = 10
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWayWithEnd(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.5**power + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEnd(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEndWithCycle(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.25**power + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class PolynomialDecayTestV2(test_util.TensorFlowTestCase,
+                            parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeginWithCycle(self, serialize):
+    lr = 0.001
+    decay_steps = 10
+    step = 0
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, decay_steps, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class InverseDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = resource_variable_ops.ResourceVariable(0)
+    decayed_lr = learning_rate_schedule.InverseTimeDecay(initial_lr, k,
+                                                         decay_rate)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    for i in range(k + 1):
+      expected = initial_lr / (1 + i / k * decay_rate)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+      self.evaluate(step.assign_add(1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testStaircase(self, serialize):
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = resource_variable_ops.ResourceVariable(0)
+    decayed_lr = learning_rate_schedule.InverseTimeDecay(
+        initial_lr, k, decay_rate, staircase=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    for i in range(k + 1):
+      expected = initial_lr / (1 + decay_rate * (i // k))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+      self.evaluate(step.assign_add(1))
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class CosineDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  def np_cosine_decay(self, step, decay_steps, alpha=0.0):
+    step = min(step, decay_steps)
+    completed_fraction = step / decay_steps
+    decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecay(initial_lr,
+                                                      num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAlpha(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecay(initial_lr,
+                                                      num_training_steps,
+                                                      alpha)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay(step, num_training_steps, alpha)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class CosineDecayRestartsTestV2(test_util.TensorFlowTestCase,
+                                parameterized.TestCase):
+
+  def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
+                               alpha=0.0):
+    fac = 1.0
+    while step >= decay_steps:
+      step -= decay_steps
+      decay_steps *= t_mul
+      fac *= m_mul
+
+    completed_fraction = step / decay_steps
+    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAlpha(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, alpha=alpha)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, alpha=alpha)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMMul(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    m_mul = 0.9
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, m_mul=m_mul)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, m_mul=m_mul)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testTMul(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    t_mul = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, t_mul=t_mul)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, t_mul=t_mul)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LinearCosineDecayTestV2(test_util.TensorFlowTestCase,
+                              parameterized.TestCase):
+
+  def np_linear_cosine_decay(self,
+                             step,
+                             decay_steps,
+                             alpha=0.0,
+                             beta=0.001,
+                             num_periods=0.5):
+    step = min(step, decay_steps)
+    linear_decayed = float(decay_steps - step) / decay_steps
+    fraction = 2.0 * num_periods * step / float(decay_steps)
+    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
+    return (alpha + linear_decayed) * cosine_decayed + beta
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDefaultDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.LinearCosineDecay(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_linear_cosine_decay(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonDefaultDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.LinearCosineDecay(
+          initial_lr,
+          num_training_steps,
+          alpha=0.1,
+          beta=1e-4,
+          num_periods=5)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_linear_cosine_decay(
+          step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class NoisyLinearCosineDecayTestV2(test_util.TensorFlowTestCase,
+                                   parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDefaultNoisyLinearCosine(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      # No numerical check because of noise
+      decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      # Cannot be deterministically tested
+      self.evaluate(decayed_lr(step))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonDefaultNoisyLinearCosine(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      # No numerical check because of noise
+      decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+          initial_lr,
+          num_training_steps,
+          initial_variance=0.5,
+          variance_decay=0.1,
+          alpha=0.1,
+          beta=1e-4,
+          num_periods=5)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      # Cannot be deterministically tested
+      self.evaluate(decayed_lr(step))
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
index d515f987251f26cd46c2358f068b325cb29fa5cc..77a897124be9620414a6c11b11d6b0b7636f6983 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -86,6 +87,12 @@ class Nadam(optimizer_v2.OptimizerV2):
 
     # Backwards compatiblity with keras NAdam optimizer.
     kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
+    learning_rate = kwargs.get('lr', learning_rate)
+    if isinstance(learning_rate, learning_rate_schedule.LearningRateSchedule):
+      raise ValueError('The Nadam optimizer does not support '
+                       'tf.keras.optimizers.LearningRateSchedules as the '
+                       'learning rate.')
+
     if epsilon is None:
       epsilon = backend_config.epsilon()
     super(Nadam, self).__init__(name, **kwargs)
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 6cd6cf0a8d934a04e04fab8ba7bd1810b304de12..ecdeafe1c7eccf61c056f8468d06f5e95dd0d712 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -25,7 +25,6 @@ import functools
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.distribute import values as distributed_values
@@ -36,6 +35,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
@@ -43,7 +44,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -70,43 +72,36 @@ def _deduplicate_indexed_slices(values, indices):
 
 @six.add_metaclass(abc.ABCMeta)
 @keras_export("keras.optimizers.Optimizer")
-class OptimizerV2(checkpointable.Checkpointable):
+class OptimizerV2(trackable.Trackable):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
   class directly, but instead instantiate one of its subclasses such as
-  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+  `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
 
   ### Usage
 
   ```python
   # Create an optimizer with the desired parameters.
-  opt = GradientDescentOptimizer(learning_rate=0.1)
-  # Add Ops to the graph to minimize a cost by updating a list of variables.
-  # "cost" is a Tensor, and the list of variables contains tf.Variable
-  # objects.
-  opt_op = opt.minimize(cost, var_list=<list of variables>)
-  ```
-
-  In the training program you will just have to run the returned Op.
-
-  ```python
-  # Execute opt_op to do one step of training:
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
+  # `loss` is a callable that takes no argument and returns the value
+  # to minimize.
+  loss = lambda: 3 * var1 * var1 + 2 * var2 * var2
+  # In graph mode, returns op that minimizes the loss by updating the listed
+  # variables.
+  opt_op = opt.minimize(loss, var_list=[var1, var2])
   opt_op.run()
+  # In eager mode, simply call minimize to update the list of variables.
+  opt.minimize(loss, var_list=[var1, var2])
   ```
 
-  ### Thread Compatibility
-
-  The entire optimizer is currently thread compatible, not thread-safe. The user
-  needs to perform synchronization if necessary.
-
   ### Processing gradients before applying them.
 
   Calling `minimize()` takes care of both computing the gradients and
   applying them to the variables.  If you want to process the gradients
   before applying them you can instead use the optimizer in three steps:
 
-  1.  Compute the gradients with `compute_gradients()`.
+  1.  Compute the gradients with `tf.GradientTape`.
   2.  Process the gradients as you wish.
   3.  Apply the processed gradients with `apply_gradients()`.
 
@@ -114,10 +109,15 @@ class OptimizerV2(checkpointable.Checkpointable):
 
   ```python
   # Create an optimizer.
-  opt = GradientDescentOptimizer(learning_rate=0.1)
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
 
   # Compute the gradients for a list of variables.
-  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+  with tf.GradientTape() as tape:
+    loss = <call_loss_function>
+  vars = <list_of_variables>
+  grads = tape.gradient(loss, vars)
+  processed_grads = [process_gradient(g) for g in grads]
+  grads_and_vars = zip(processed_grads, var_list)
 
   # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
   # need to the 'gradient' part, for example cap them, etc.
@@ -127,13 +127,43 @@ class OptimizerV2(checkpointable.Checkpointable):
   opt.apply_gradients(capped_grads_and_vars)
   ```
 
+  ### Use with `tf.distribute.Strategy`.
+
+  This optimizer class is `tf.distribute.Strategy` aware, which means it
+  automatically sums gradients across all replicas. To average gradients,
+  you divide your loss by the global batch size, which is done automatically
+  if you use a member of `tf.keras.losses` or `tf.losses`. See the
+  `reduction` argument of your loss which should be set to
+  `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or
+  `tf.keras.losses.Reduction.SUM` for not.
+
+  If you are not using these and you want to average gradients, you should use
+  `tf.math.reduce_sum` to add up your per-example losses and then divide by the
+  global batch size. Note that when using `tf.distribute.Strategy`, the first
+  component of a tensor's shape is the *replica-local* batch size, which is off
+  by a factor equal to the number of replicas being used to compute a single
+  step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
+  resulting in gradients that can be many times too big.
+
+  ### Variable Constraint
+
+  All Keras optimizers respect variable constraints. If constraint function is
+  passed to any variable, the constraint will be applied to the variable after
+  the gradient has been applied to the variable.
+  Important: If gradient is sparse tensor, variable constraint is not supported.
+
+  ### Thread Compatibility
+
+  The entire optimizer is currently thread compatible, not thread-safe. The user
+  needs to perform synchronization if necessary.
+
   ### Slots
 
-  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
-  allocate and manage additional variables associated with the variables to
-  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
-  optimizer for the names of the slots that it uses.  Once you have a slot name
-  you can ask the optimizer for the variable it created to hold the slot value.
+  Many optimizer subclasses, such as `Adam` and `Adagrad` allocate and manage
+  additional variables associated with the variables to train.  These are called
+  <i>Slots</i>.  Slots have names and you can ask the optimizer for the names of
+  the slots that it uses.  Once you have a slot name you can ask the optimizer
+  for the variable it created to hold the slot value.
 
   This can be useful if you want to log debug a training algorithm, report stats
   about the slots, etc.
@@ -146,6 +176,31 @@ class OptimizerV2(checkpointable.Checkpointable):
   callables. If they are callable, the callable will be called during
   `apply_gradients()` to get the value for the hyper parameter.
 
+  Hyper parameters can be overwritten through user code:
+
+  Example:
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
+  # `loss` is a callable that takes no argument and returns the value
+  # to minimize.
+  loss = lambda: 3 * var1 + 2 * var2
+  # In eager mode, simply call minimize to update the list of variables.
+  opt.minimize(loss, var_list=[var1, var2])
+  # update learning rate
+  opt.learning_rate = 0.05
+  opt.minimize(loss, var_list=[var1, var2])
+  ```
+
+  ### Write a customized optimizer.
+  If you intend to create your own optimization algorithm, simply inherit from
+  this class and override the following methods:
+
+    - resource_apply_dense (update variable given gradient tensor is dense)
+    - resource_apply_sparse (update variable given gradient tensor is sparse)
+    - create_slots (if your optimizer algorithm requires additional variables)
+    - get_config (serialization of the optimizer, include all hyper parameters)
   """
 
   def __init__(self, name, **kwargs):
@@ -191,9 +246,9 @@ class OptimizerV2(checkpointable.Checkpointable):
     self._weights = []
     self._iterations = None
 
-    # For implementing Checkpointable. Stores information about how to restore
+    # For implementing Trackable. Stores information about how to restore
     # slot variables which have not yet been created
-    # (checkpointable._CheckpointPosition objects).
+    # (trackable._CheckpointPosition objects).
     #  {slot_name :
     #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
     #   ... }
@@ -213,9 +268,9 @@ class OptimizerV2(checkpointable.Checkpointable):
   def minimize(self, loss, var_list, grad_loss=None, name=None):
     """Add operations to minimize `loss` by updating `var_list`.
 
-    This method simply combines calls `compute_gradients()` and
+    This method simply computes gradient using `tf.GradientTape` and calls
     `apply_gradients()`. If you want to process the gradient before applying
-    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    then call `tf.GradientTape` and `apply_gradients()` explicitly instead
     of using this function.
 
     Args:
@@ -273,7 +328,6 @@ class OptimizerV2(checkpointable.Checkpointable):
     with backprop.GradientTape() as tape:
       tape.watch(var_list)
       loss_value = loss()
-      loss_value = self._scale_loss(loss_value)
     grads = tape.gradient(loss_value, var_list, grad_loss)
 
     if hasattr(self, "clipnorm"):
@@ -292,14 +346,6 @@ class OptimizerV2(checkpointable.Checkpointable):
 
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
-
   def get_gradients(self, loss, params):
     """Returns gradients of `loss` with respect to `params`.
 
@@ -314,8 +360,8 @@ class OptimizerV2(checkpointable.Checkpointable):
       ValueError: In case any gradient cannot be computed (e.g. if gradient
         function not implemented).
     """
-    loss = self._scale_loss(loss)
-    grads = gradients.gradients(loss, params)
+    with backend.get_graph().as_default():
+      grads = gradients.gradients(loss, params)
     if None in grads:
       raise ValueError("An operation has `None` for gradient. "
                        "Please make sure that all of your ops have a "
@@ -338,8 +384,7 @@ class OptimizerV2(checkpointable.Checkpointable):
     applies gradients.
 
     Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        `compute_gradients()`.
+      grads_and_vars: List of (gradient, variable) pairs.
       name: Optional name for the returned operation.  Default to the name
         passed to the `Optimizer` constructor.
 
@@ -353,18 +398,26 @@ class OptimizerV2(checkpointable.Checkpointable):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribute_ctx.has_strategy():
-      reduced_grads = merge_grads(grads_and_vars)
-      grads_and_vars = zip(reduced_grads, var_list)
 
+    # Create iteration if necessary.
+    _ = self.iterations
     self._create_hypers()
     with ops.init_scope():
       self._create_slots(var_list)
-    update_ops = []
 
     self._prepare(var_list)
 
-    def update_grad_to_var(grad, var):
+    return distribute_ctx.get_replica_context().merge_call(
+        self._distributed_apply, args=(grads_and_vars,), kwargs={"name": name})
+
+  def _distributed_apply(self, distribution, grads_and_vars, name):
+    """`apply_gradients` using a `DistributionStrategy`."""
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    def apply_grad_to_update_var(var, grad):
       """Apply gradient to variable."""
       if isinstance(var, ops.Tensor):
         raise NotImplementedError("Trying to update a Tensor ", var)
@@ -381,18 +434,27 @@ class OptimizerV2(checkpointable.Checkpointable):
       else:
         return update_op
 
+    update_ops = []
     with ops.name_scope(name, self._name) as name:
       for grad, var in grads_and_vars:
         scope_name = ("" if ops.executing_eagerly_outside_functions() else
                       "_" + var.op.name)
         with ops.name_scope("update" + scope_name):
-          update_ops.append(update_grad_to_var(grad, var))
-      # control dependencies does not work in per replica mode, please change
-      # this once b/118841692 is fixed.
-      # with ops.control_dependencies(update_ops):
-      #   apply_updates = self._iterations.assign_add(1).op
-      apply_updates = merge_update_step(update_ops, self.iterations)
-      return apply_updates
+          update_ops.extend(
+              distribution.extended.update(
+                  var, apply_grad_to_update_var, args=(grad,), group=False))
+
+      any_symbolic = any(isinstance(i, ops.Operation) or
+                         tf_utils.is_symbolic_tensor(i) for i in update_ops)
+      if not context.executing_eagerly() or any_symbolic:
+        # If the current context is graph mode or any of the update ops are
+        # symbolic then the step update should be carried out under a graph
+        # context. (eager updates execute immediately)
+        with ops._get_graph_from_inputs(update_ops).as_default():  # pylint: disable=protected-access
+          with ops.control_dependencies(update_ops):
+            return self._iterations.assign_add(1).op
+
+      return self._iterations.assign_add(1)
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
@@ -405,12 +467,17 @@ class OptimizerV2(checkpointable.Checkpointable):
 
   def _set_hyper(self, name, value):
     """set hyper `name` to value. value can be callable, tensor, numeric."""
+    if isinstance(value, trackable.Trackable):
+      self._track_trackable(value, name, overwrite=True)
     if name not in self._hyper:
       self._hyper[name] = value
     else:
       prev_value = self._hyper[name]
-      if callable(prev_value) or isinstance(prev_value,
-                                            (ops.Tensor, int, float)):
+      if (callable(prev_value)
+          or isinstance(prev_value,
+                        (ops.Tensor, int, float,
+                         learning_rate_schedule.LearningRateSchedule))
+          or isinstance(value, learning_rate_schedule.LearningRateSchedule)):
         self._hyper[name] = value
       else:
         backend.set_value(self._hyper[name], value)
@@ -419,6 +486,8 @@ class OptimizerV2(checkpointable.Checkpointable):
     if not self._hypers_created:
       self._create_hypers()
     value = self._hyper[name]
+    if isinstance(value, learning_rate_schedule.LearningRateSchedule):
+      return value
     if callable(value):
       value = value()
     if dtype:
@@ -469,11 +538,13 @@ class OptimizerV2(checkpointable.Checkpointable):
             initializer, shape=var.shape, dtype=var.dtype)
       else:
         initial_value = initializer
-      weight = tf_variables.Variable(
-          name="%s/%s" % (var._shared_name, slot_name),  # pylint: disable=protected-access
-          dtype=var.dtype,
-          trainable=False,
-          initial_value=initial_value)
+      strategy = distribute_ctx.get_strategy()
+      with strategy.colocate_vars_with(var):
+        weight = tf_variables.Variable(
+            name="%s/%s" % (var._shared_name, slot_name),  # pylint: disable=protected-access
+            dtype=var.dtype,
+            trainable=False,
+            initial_value=initial_value)
       backend.track_variable(weight)
       slot_dict[slot_name] = weight
       self._restore_slot_variable(
@@ -493,18 +564,10 @@ class OptimizerV2(checkpointable.Checkpointable):
   def _create_hypers(self):
     if self._hypers_created:
       return
-    if self._iterations is None:
-      with ops.device("cpu:0"):
-        self._iterations = self.add_weight(
-            "iter",
-            shape=[],
-            dtype=dtypes.int64,
-            trainable=False,
-            aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-        self._weights.append(self._iterations)
-    for name, value in self._hyper.items():
+    # Iterate hyper values deterministically.
+    for name, value in sorted(self._hyper.items()):
       if isinstance(value, ops.Tensor) or callable(value):
-        pass
+        continue
       else:
         self._hyper[name] = self.add_weight(
             name,
@@ -517,13 +580,19 @@ class OptimizerV2(checkpointable.Checkpointable):
   @property
   def iterations(self):
     """Variable. The number of training steps this Optimizer has run."""
-    if not self._hypers_created:
-      self._create_hypers()
+    if self._iterations is None:
+      self._iterations = self.add_weight(
+          "iter",
+          shape=[],
+          dtype=dtypes.int64,
+          trainable=False,
+          aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self._weights.append(self._iterations)
     return self._iterations
 
   @iterations.setter
   def iterations(self, variable):
-    if self._hypers_created:
+    if self._iterations is not None:
       raise RuntimeError("Cannot set `iterations` to a new Variable after"
                          "the Optimizer weights have been created")
     self._iterations = variable
@@ -532,6 +601,9 @@ class OptimizerV2(checkpointable.Checkpointable):
   def _decayed_lr(self, var_dtype):
     """Get decayed learning rate as a Tensor with dtype=var_dtype."""
     lr_t = self._get_hyper("learning_rate", var_dtype)
+    if isinstance(lr_t, learning_rate_schedule.LearningRateSchedule):
+      local_step = math_ops.cast(self.iterations, var_dtype)
+      lr_t = math_ops.cast(lr_t(local_step), var_dtype)
     if self._initial_decay > 0.:
       local_step = math_ops.cast(self.iterations, var_dtype)
       decay_t = self._get_hyper("decay", var_dtype)
@@ -576,15 +648,22 @@ class OptimizerV2(checkpointable.Checkpointable):
     """
     if "lr" in config:
       config["learning_rate"] = config.pop("lr")
+    if "learning_rate" in config:
+      if isinstance(config["learning_rate"], dict):
+        config["learning_rate"] = learning_rate_schedule.deserialize(
+            config["learning_rate"])
     return cls(**config)
 
   def _serialize_hyperparameter(self, hyperparameter_name):
     """Serialize a hyperparameter that can be a float, callable, or Tensor."""
     value = self._hyper[hyperparameter_name]
+    if isinstance(value, learning_rate_schedule.LearningRateSchedule):
+      return learning_rate_schedule.serialize(value)
     if callable(value):
       return value()
     if isinstance(value, (ops.Tensor, tf_variables.Variable,
-                          distributed_values.TPUMirroredVariable)):
+                          distributed_values.TPUMirroredVariable,
+                          distributed_values.DistributedVariable)):
       return backend.get_value(value)
     return value
 
@@ -764,7 +843,7 @@ class OptimizerV2(checkpointable.Checkpointable):
       return x.value()
 
   # ---------------
-  # For implementing the checkpointable interface
+  # For implementing the trackable interface
   # ---------------
 
   def _restore_slot_variable(self, slot_name, variable, slot_variable):
@@ -795,8 +874,8 @@ class OptimizerV2(checkpointable.Checkpointable):
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
@@ -814,7 +893,7 @@ class OptimizerV2(checkpointable.Checkpointable):
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.add_slot(
           var=variable,
@@ -864,33 +943,6 @@ def _filter_grads(grads_and_vars):
   return filtered
 
 
-def merge_update_step(update_ops, local_step):
-  """Merge local step counter update from different replicas."""
-
-  def merge_update_step_fn(strategy, update_ops, local_step):
-    merged_ops = []
-    for update_op in update_ops:
-      merged_ops.append(strategy.group(update_op))
-    with ops.control_dependencies(merged_ops):
-      incre_op = local_step.assign_add(1).op
-    return incre_op
-
-  return distribute_ctx.get_replica_context().merge_call(
-      merge_update_step_fn, args=(update_ops, local_step))
-
-
-def merge_grads(grads_and_vars):
-  """Merge gradients from different replicas."""
-
-  def merge_grad_fn(strategy, grads_and_vars):
-    reduced_grads = strategy.extended.batch_reduce_to(
-        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
-    return reduced_grads
-
-  return distribute_ctx.get_replica_context().merge_call(
-      merge_grad_fn, args=(grads_and_vars,))
-
-
 def _var_key(var):
   """Key for representing a primary variable, for looking up slots.
 
@@ -906,9 +958,10 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribute_ctx.has_strategy() and hasattr(var, "_primary_var"):
-    var = var._primary_var
-  if hasattr(var, "op"):
+  # Get the distributed variable if it exists.
+  if getattr(var, "_distributed_container", None) is not None:
+    var = var._distributed_container()
+  if var._in_graph_mode:
     return var._shared_name
   return var._unique_id
 
@@ -918,3 +971,37 @@ def _get_slot_key_from_var(var, slot_name):
 
   name = _var_key(var)
   return name + "/" + slot_name
+
+
+class _RestoredOptimizer(OptimizerV2):
+  """A non-functional Optimizer implementation for checkpoint compatibility.
+
+  Holds slot variables and hyperparameters when an optimizer is restored from a
+  SavedModel. These variables may be referenced in functions along with ops
+  created by the original optimizer, but currently we do not support using the
+  optimizer object iself (e.g. through `apply_gradients`).
+  """
+  # TODO(allenl): Make the restored optimizer functional by tracing its apply
+  # methods.
+
+  def __init__(self):
+    super(_RestoredOptimizer, self).__init__("_RestoredOptimizer")
+    self._hypers_created = True
+
+  def get_config(self):
+    # TODO(allenl): Save and restore the Optimizer's config
+    raise NotImplementedError(
+        "Restoring functional Optimzers from SavedModels is not currently "
+        "supported. Please file a feature request if this limitation bothers "
+        "you.")
+
+revived_types.register_revived_type(
+    "optimizer",
+    lambda obj: isinstance(obj, OptimizerV2),
+    versions=[revived_types.VersionedTypeRegistration(
+        object_factory=lambda proto: _RestoredOptimizer(),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_RestoredOptimizer._set_hyper  # pylint: disable=protected-access
+    )])
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 8069703b7a2ba2fb94c319be5b64dbd98ece2da6..5b7b26f54431e32593ab554eb1666bfd81568fc1 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -41,7 +41,9 @@ from tensorflow.python.keras.optimizer_v2 import adagrad
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.keras.optimizer_v2 import adamax
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -112,6 +114,13 @@ class OptimizerTest(test.TestCase):
       # var1 = [0., 1.] - 0.5 * [3, 3]
       self.assertAllClose([-1.5, -0.5], self.evaluate(var1))
 
+      sgd.learning_rate = learning_rate_schedule.InverseTimeDecay(
+          0.5, decay_steps=1.0, decay_rate=0.5)
+      if context.executing_eagerly():
+        sgd.minimize(loss, [var0, var1])
+      else:
+        self.evaluate(opt_op)
+
   @test_util.run_in_graph_and_eager_modes
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -280,6 +289,33 @@ class OptimizerTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       self.assertEqual(self.evaluate(lr), self.evaluate(lr3))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testConfigWithLearningRateDecay(self):
+    with self.cached_session():
+      decay_schedule = learning_rate_schedule.InverseTimeDecay(
+          0.5, decay_steps=1.0, decay_rate=0.1)
+      step = 10
+      opt = gradient_descent.SGD(decay_schedule)
+      config = opt.get_config()
+      opt2 = gradient_descent.SGD.from_config(config)
+      # assert both are equal float values.
+      self.assertAllEqual(
+          decay_schedule(step),
+          opt._get_hyper('learning_rate')(step))
+      self.assertAllEqual(
+          decay_schedule(step),
+          opt2._get_hyper('learning_rate')(step))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
+      loss = lambda: 3 * var0
+      # learning rate variable created when calling minimize.
+      opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      config = opt.get_config()
+      opt3 = gradient_descent.SGD.from_config(config)
+      self.assertAllEqual(
+          self.evaluate(opt._get_hyper('learning_rate')(step)),
+          opt3._get_hyper('learning_rate')(step))
+
   @test_util.run_in_graph_and_eager_modes
   def testGradClipValue(self):
     with self.cached_session():
@@ -440,6 +476,7 @@ class OptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testOptimizerWithCallbacks(self):
+    np.random.seed(1331)
     input_np = np.random.random((10, 3))
     output_np = np.random.random((10, 4))
     a = input_layer.Input(shape=(3,), name='input_a')
@@ -460,7 +497,7 @@ class OptimizerTest(test.TestCase):
         batch_size=10,
         validation_data=(input_np, output_np),
         callbacks=cbks,
-        epochs=5,
+        epochs=2,
         verbose=0)
     self.assertAllClose(
         float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
@@ -480,7 +517,7 @@ class OptimizerTest(test.TestCase):
         batch_size=10,
         validation_data=(input_np, output_np),
         callbacks=cbks,
-        epochs=5,
+        epochs=2,
         verbose=2)
     self.assertAllClose(
         float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
@@ -500,6 +537,17 @@ class OptimizerTest(test.TestCase):
     new_step_value = self.evaluate(global_step)
     self.assertEqual(new_step_value, init_step_value + 1)
 
+  def testVarKey(self):
+    with context.graph_mode():
+      a = variables.Variable([1., 2.], name='var')
+      b = variables.Variable([1.], name='var')
+      self.assertTrue(a._in_graph_mode)
+      self.assertTrue(b._in_graph_mode)
+      var_key = optimizer_v2._var_key(a)
+      self.assertEqual('var', var_key)
+      var_key = optimizer_v2._var_key(b)
+      self.assertEqual('var_1', var_key)
+
 
 @keras_parameterized.run_with_all_model_types
 class OptimizersCompatibilityTest(keras_parameterized.TestCase):
@@ -673,6 +721,23 @@ class OptimizerWithFunctionTest(test.TestCase):
       self.assertAllClose([0., 1.], fn(), atol=1e-4)
       self.assertAllClose([-1, 0.], fn(), atol=1e-4)
 
+  def testVarKeyWithVarCreatedInEager(self):
+    with context.eager_mode():
+      a = variables.Variable([1., 2.], name='var')
+      b = variables.Variable([1.], name='var')
+
+      @test_util.also_run_as_tf_function
+      def var_key_test():
+        self.assertFalse(a._in_graph_mode)
+        self.assertFalse(b._in_graph_mode)
+        var_key_a = optimizer_v2._var_key(a)
+        self.assertStartsWith(var_key_a, 'var_')
+        var_key_b = optimizer_v2._var_key(b)
+        self.assertStartsWith(var_key_b, 'var_')
+        self.assertNotEquals(var_key_a, var_key_b)
+
+      var_key_test()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
index a9ddc2155a63e4030e56104293fc1a92b11de5d1..99789ffbea3153400417b314c1bc72cbf38fbb6d 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -22,6 +22,7 @@ import copy
 import itertools
 import math
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import context
@@ -29,6 +30,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
@@ -244,6 +246,78 @@ class RMSpropOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
       self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
+  def testDenseWithLearningRateInverseTimeDecay(self):
+    var0_np = np.array([1.0, 2.0])
+    grads0_np = np.array([0.1, 0.2])
+    var1_np = np.array([3.0, 4.0])
+    grads1_np = np.array([0.01, 0.2])
+
+    var0 = resource_variable_ops.ResourceVariable(var0_np)
+    var1 = resource_variable_ops.ResourceVariable(var1_np)
+    grads0 = constant_op.constant(grads0_np)
+    grads1 = constant_op.constant(grads1_np)
+    learning_rate = 0.01
+    rho = 0.9
+    momentum = 0.0
+    epsilon = 1e-7
+    centered = False
+    decay = 0.5
+    lr_schedule = learning_rate_schedule.InverseTimeDecay(
+        learning_rate, decay_steps=1.0, decay_rate=decay)
+    opt = rmsprop.RMSprop(
+        learning_rate=lr_schedule,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon,
+        centered=centered)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+
+    rms0 = opt.get_slot(var0, "rms")
+    self.assertTrue(rms0 is not None)
+    rms1 = opt.get_slot(var1, "rms")
+    self.assertTrue(rms1 is not None)
+    if momentum > 0.:
+      mom0 = opt.get_slot(var0, "momentum")
+      mom1 = opt.get_slot(var1, "momentum")
+    else:
+      mom0 = None
+      mom1 = None
+
+    mg0_np = np.array([0.0, 0.0])
+    mg1_np = np.array([0.0, 0.0])
+    rms0_np = np.array([0.0, 0.0])
+    rms1_np = np.array([0.0, 0.0])
+    mom0_np = np.array([0.0, 0.0])
+    mom1_np = np.array([0.0, 0.0])
+
+    # Fetch params to validate initial values
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 4 steps of RMSprop
+    for t in range(2):
+      self.evaluate(update)
+
+      lr = learning_rate / (1 + decay * t)
+      var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+          var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum,
+          epsilon, centered)
+      var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+          var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum,
+          epsilon, centered)
+
+      # Validate updated params
+      self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+      self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+      if momentum > 0.:
+        self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+        self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+      self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
   @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -481,5 +555,51 @@ class RMSpropOptimizerTest(test.TestCase):
     self.assertEqual(config["epsilon"], 1e-8)
 
 
+class SlotColocationTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters([True, False])
+  @test_util.run_in_graph_and_eager_modes
+  def testRunMinimizeOnGPUForCPUVariables(self, use_resource):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+
+    with ops.device("/device:CPU:0"):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtypes.float32)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtypes.float32)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+
+    def loss():
+      return 5 * var0 + 3 * var1
+
+    opt = rmsprop.RMSprop(
+        learning_rate=1.0, decay=0.9, momentum=0.5, epsilon=1.0)
+
+    # Fetch params to validate initial values
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 1 step through optimizer on GPU.
+    # Slot variables are created the first time optimizer is used on some
+    # variable. This tests that slot variables will be colocated with the base
+    # variable.
+    with ops.device("/device:GPU:0"):
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      opt_op = opt.minimize(loss, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+
+    # Validate updated params, All variables should have decreased.
+    self.assertTrue(all(v < 0.0 for v in self.evaluate(var0)),
+                    msg="updated variables: %s" % self.evaluate(var0))
+    self.assertTrue(all(v < 2.0 for v in self.evaluate(var1)),
+                    msg="updated variables: %s" % self.evaluate(var1))
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index b704b885cb967997a7a8735b31f08a1537cf4a1c..1fb8f8d2802d5e067a6c8ed79915147e086cf172 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
 from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import ftrl
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
@@ -40,7 +41,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -710,19 +711,19 @@ class Nadam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class TFOptimizer(Optimizer, checkpointable.Checkpointable):
+class TFOptimizer(Optimizer, trackable.Trackable):
   """Wrapper class for native TensorFlow optimizers.
   """
 
   def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
-    self._track_checkpointable(optimizer, name='optimizer')
+    self._track_trackable(optimizer, name='optimizer')
     if iterations is None:
       with K.name_scope(self.__class__.__name__):
         self.iterations = K.variable(0, dtype='int64', name='iterations')
     else:
       self.iterations = iterations
-    self._track_checkpointable(self.iterations, name='global_step')
+    self._track_trackable(self.iterations, name='global_step')
 
   def apply_gradients(self, grads):
     self.optimizer.apply_gradients(grads, global_step=self.iterations)
@@ -806,7 +807,8 @@ def deserialize(config, custom_objects=None):
       'adamax': adamax_v2.Adamax,
       'nadam': nadam_v2.Nadam,
       'rmsprop': rmsprop_v2.RMSprop,
-      'sgd': gradient_descent_v2.SGD
+      'sgd': gradient_descent_v2.SGD,
+      'ftrl': ftrl.Ftrl
   }
 
   # Make deserialization case-insensitive for built-in optimizers.
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 606e711483137e3e7eb5336029edcb52b3cfe916..03ce3ab8071cb45b61a86d1211aae9d65bc1db99 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -19,17 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
-import os
 import weakref
 
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.adam import AdamOptimizer
@@ -53,9 +49,7 @@ class KerasOptimizersTest(test.TestCase):
     y_train = keras.utils.to_categorical(y_train)
     model = _get_model(x_train.shape[1], 20, y_train.shape[1])
     model.compile(
-        loss='categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=['accuracy'])
+        loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
     np.testing.assert_equal(
         keras.backend.get_value(model.optimizer.iterations), 0)
     history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
@@ -224,40 +218,5 @@ class KerasOptimizersTest(test.TestCase):
       _ = keras.optimizers.Adam(clipnorm=-2.0)
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class KerasV2OptimizersTest(test.TestCase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('adadelta_tf2', 'adadelta', True), ('adadelta_tf1', 'adadelta', False),
-      ('adagrad_tf2', 'adagrad', True), ('adagrad_tf1', 'adagrad', False),
-      ('adam_tf2', 'adam', True), ('adam_tf1', 'adam', False),
-      ('adamax_tf2', 'adamax', True), ('adamax_tf1', 'adamax', False),
-      ('sgd_tf2', 'sgd', True), ('sgd_tf1', 'sgd', False),
-      ('nadam_tf2', 'nadam', True), ('nadam_tf1', 'nadam', False),
-      ('rmsprop_tf2', 'rmsprop', True), ('rmsprop_tf1', 'rmsprop', False))
-  def test_load_from_string(self, optimizer_string, tf2mode):
-    old_mode = os.environ.get('TF2_BEHAVIOR', None)
-    if tf2mode:
-      os.environ['TF2_BEHAVIOR'] = 'enabled'
-    else:
-      if 'TF2_BEHAVIOR' in os.environ:
-        del os.environ['TF2_BEHAVIOR']
-
-    # Sanity check.
-    self.assertEqual(tf2.enabled(), tf2mode)
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(1, input_shape=(10,)))
-    model.compile(optimizer_string, 'binary_crossentropy')
-
-    self.assertEqual(optimizer_string,
-                     model.optimizer.__class__.__name__.lower())
-
-    model.fit(np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'))
-
-    if old_mode is not None:
-      os.environ['TF2_BEHAVIOR'] = old_mode
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
index 4abaadfcd305f493b163ad710d11c977b3d1adac..f7cbb589dc9de63e4426a0a0338a67f78d7f07d3 100644
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -386,6 +386,8 @@ class TestImage(test.TestCase):
     _ = keras.preprocessing.image.random_shift(x, 0.2, 0.2)
     _ = keras.preprocessing.image.random_shear(x, 2.)
     _ = keras.preprocessing.image.random_zoom(x, (0.5, 0.5))
+    _ = keras.preprocessing.image.apply_channel_shift(x, 2, 2)
+    _ = keras.preprocessing.image.apply_affine_transform(x, 2)
     with self.assertRaises(ValueError):
       keras.preprocessing.image.random_zoom(x, (0, 0, 0))
     _ = keras.preprocessing.image.random_channel_shift(x, 2.)
diff --git a/tensorflow/python/keras/preprocessing/sequence_test.py b/tensorflow/python/keras/preprocessing/sequence_test.py
index ab6a09106b5f3c8bc340a25ebe3fc82be3f71cd2..ce26b2072269903ae46b8e57f79a12f15221c650 100644
--- a/tensorflow/python/keras/preprocessing/sequence_test.py
+++ b/tensorflow/python/keras/preprocessing/sequence_test.py
@@ -97,6 +97,14 @@ class TestSequence(test.TestCase):
     for l in labels:
       self.assertEqual(len(l), 2)
 
+  def test_remove_long_seq(self):
+    a = [[[1, 1]], [[2, 1], [2, 2]], [[3, 1], [3, 2], [3, 3]]]
+
+    new_seq, new_label = keras.preprocessing.sequence._remove_long_seq(
+        maxlen=3, seq=a, label=['a', 'b', ['c', 'd']])
+    self.assertEqual(new_seq, [[[1, 1]], [[2, 1], [2, 2]]])
+    self.assertEqual(new_label, ['a', 'b'])
+
   def test_TimeseriesGenerator(self):
     data = np.array([[i] for i in range(50)])
     targets = np.array([[i] for i in range(50)])
diff --git a/tensorflow/python/keras/regularizers.py b/tensorflow/python/keras/regularizers.py
index b828fa933bb56ca7cd79336bff6918648f4bf32c..2dabe504b0c5fcdb95223ace15a60c33bfa058e2 100644
--- a/tensorflow/python/keras/regularizers.py
+++ b/tensorflow/python/keras/regularizers.py
@@ -54,6 +54,8 @@ class L1L2(Regularizer):
     self.l2 = K.cast_to_floatx(l2)
 
   def __call__(self, x):
+    if not self.l1 and not self.l2:
+      return K.constant(0.)
     regularization = 0.
     if self.l1:
       regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index 3d6b259d87de8b6533d008a839f0df2226d71ed4..fb2439395bd94e781b9c4b7576c9b249ee44286f 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python import keras
-from tensorflow.python.keras import testing_utils
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
@@ -28,50 +30,54 @@ DATA_DIM = 5
 NUM_CLASSES = 2
 
 
-def get_data():
-  (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-      train_samples=10,
-      test_samples=10,
-      input_shape=(DATA_DIM,),
-      num_classes=NUM_CLASSES)
-  y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
-  y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
-  return (x_train, y_train), (x_test, y_test)
-
-
-def create_model(kernel_regularizer=None, activity_regularizer=None):
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(NUM_CLASSES,
-                               kernel_regularizer=kernel_regularizer,
-                               activity_regularizer=activity_regularizer,
-                               input_shape=(DATA_DIM,)))
-  return model
+class KerasRegularizersTest(test.TestCase, parameterized.TestCase):
 
+  def create_model(self, kernel_regularizer=None, activity_regularizer=None):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(NUM_CLASSES,
+                                 kernel_regularizer=kernel_regularizer,
+                                 activity_regularizer=activity_regularizer,
+                                 input_shape=(DATA_DIM,)))
+    return model
 
-class KerasRegularizersTest(test.TestCase):
+  def get_data(self):
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=10,
+        test_samples=10,
+        input_shape=(DATA_DIM,),
+        num_classes=NUM_CLASSES)
+    y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
+    return (x_train, y_train), (x_test, y_test)
 
-  def test_kernel_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l1_l2', keras.regularizers.l1_l2()),
+  ])
+  def test_kernel_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(),
-                  keras.regularizers.l2(),
-                  keras.regularizers.l1_l2()]:
-        model = create_model(kernel_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(kernel_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
-  @test_util.run_deprecated_v1
-  def test_activity_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l2_zero', keras.regularizers.l2(0.)),
+  ])
+  @test_util.deprecated_graph_mode_only
+  def test_activity_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(), keras.regularizers.l2()]:
-        model = create_model(activity_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(activity_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
   def test_zero_regularization(self):
     inputs = keras.backend.ones(shape=(10, 10))
diff --git a/tensorflow/python/keras/saving/__init__.py b/tensorflow/python/keras/saving/__init__.py
index bb4db681248e8f25672cacd2d80dc65ea43a4113..b32ae4041c6d5c6111de42c5401095607972281f 100644
--- a/tensorflow/python/keras/saving/__init__.py
+++ b/tensorflow/python/keras/saving/__init__.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.saving.hdf5_format import save_weights_to_hdf5_grou
 from tensorflow.python.keras.saving.model_config import model_from_config
 from tensorflow.python.keras.saving.model_config import model_from_json
 from tensorflow.python.keras.saving.model_config import model_from_yaml
-from tensorflow.python.keras.saving.saved_model import export
+from tensorflow.python.keras.saving.saved_model import export_saved_model
 from tensorflow.python.keras.saving.saved_model import load_from_saved_model
 from tensorflow.python.keras.saving.saving_utils import trace_model_call
 
diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py
index cb925c30a63c00a29bd03a2f541d0dd322c22e01..95b865320eace17929d5ccbb7487d1f0500d5ab6 100644
--- a/tensorflow/python/keras/saving/hdf5_format.py
+++ b/tensorflow/python/keras/saving/hdf5_format.py
@@ -25,7 +25,6 @@ import os
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.saving import model_config as model_config_lib
@@ -115,7 +114,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
             'optimizer attributes or optimizer state '
             'after instantiation. '
             'As a result, we cannot save the optimizer '
-            'as part of the model save file.'
+            'as part of the model save file. '
             'You will have to compile your model again after loading it. '
             'Prefer using a Keras optimizer instead '
             '(see keras.io/optimizers).')
@@ -135,23 +134,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
             default=serialization.get_json_type).encode('utf8')
 
         # Save optimizer weights.
-        symbolic_weights = getattr(model.optimizer, 'weights')
-        if symbolic_weights:
-          optimizer_weights_group = f.create_group('optimizer_weights')
-          weight_values = K.batch_get_value(symbolic_weights)
-          weight_names = []
-          for w, val in zip(symbolic_weights, weight_values):
-            name = str(w.name)
-            weight_names.append(name.encode('utf8'))
-          optimizer_weights_group.attrs['weight_names'] = weight_names
-          for name, val in zip(weight_names, weight_values):
-            param_dset = optimizer_weights_group.create_dataset(
-                name, val.shape, dtype=val.dtype)
-            if not val.shape:
-              # scalar
-              param_dset[()] = val
-            else:
-              param_dset[:] = val
+        save_optimizer_weights_to_hdf5_group(f, model.optimizer)
     f.flush()
   finally:
     if opened_new_file:
@@ -271,14 +254,7 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
         # weights.
         if model._is_graph_network:  # pylint: disable=protected-access
           model._make_train_function()
-          optimizer_weights_group = f['optimizer_weights']
-          optimizer_weight_names = [
-              n.decode('utf8')
-              for n in optimizer_weights_group.attrs['weight_names']
-          ]
-          optimizer_weight_values = [
-              optimizer_weights_group[n] for n in optimizer_weight_names
-          ]
+          optimizer_weight_values = load_optimizer_weights_from_hdf5_group(f)
           try:
             model.optimizer.set_weights(optimizer_weight_values)
           except ValueError:
@@ -655,6 +631,45 @@ def _convert_rnn_weights(layer, weights):
   return weights
 
 
+def save_optimizer_weights_to_hdf5_group(hdf5_group, optimizer):
+  """Saves optimizer weights of a optimizer to a HDF5 group.
+
+  Arguments:
+      hdf5_group: HDF5 group.
+      optimizer: optimizer instance.
+  """
+
+  symbolic_weights = getattr(optimizer, 'weights')
+  if symbolic_weights:
+    weights_group = hdf5_group.create_group('optimizer_weights')
+    weight_names = [str(w.name).encode('utf8') for w in symbolic_weights]
+    save_attributes_to_hdf5_group(weights_group, 'weight_names', weight_names)
+    weight_values = K.batch_get_value(symbolic_weights)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = weights_group.create_dataset(
+          name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_optimizer_weights_from_hdf5_group(hdf5_group):
+  """Load optimizer weights from a HDF5 group.
+
+  Arguments:
+      hdf5_group: A pointer to a HDF5 group.
+
+  Returns:
+      data: List of optimizer weight names.
+  """
+  weights_group = hdf5_group['optimizer_weights']
+  optimizer_weight_names = load_attributes_from_hdf5_group(
+      weights_group, 'weight_names')
+  return [weights_group[weight_name] for weight_name in optimizer_weight_names]
+
+
 def save_weights_to_hdf5_group(f, layers):
   """Saves the weights of a list of layers to a HDF5 group.
 
@@ -669,25 +684,10 @@ def save_weights_to_hdf5_group(f, layers):
   f.attrs['backend'] = K.backend().encode('utf8')
   f.attrs['keras_version'] = str(keras_version).encode('utf8')
 
-  # On TPUs, modifying the graph between session.runs() triggers some expensive
-  # recompilation overhead. To avoid this, we build up the full set of tensors
-  # to save before fetching weights, thus only modifying the graph once.
-  layer_weights_dict = {}
-  for layer in layers:
-    layer_weights_dict[layer.name] = [ops.convert_to_tensor(w)
-                                      for w in layer.weights]
-
   for layer in layers:
     g = f.create_group(layer.name)
-    symbolic_weights = layer_weights_dict[layer.name]
-    weight_values = K.batch_get_value(symbolic_weights)
-    weight_names = []
-    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
-      if hasattr(w, 'name') and w.name:
-        name = str(w.name)
-      else:
-        name = 'param_' + str(i)
-      weight_names.append(name.encode('utf8'))
+    weight_values = K.batch_get_value(layer.weights)
+    weight_names = [w.name.encode('utf8') for w in layer.weights]
     save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
     for name, val in zip(weight_names, weight_values):
       param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py
index 3b496e70c1e8106b5a3effa77bbf349197e73fe5..38d798c0a76ef0094b7fad4da7cf792d73a35d80 100644
--- a/tensorflow/python/keras/saving/hdf5_format_test.py
+++ b/tensorflow/python/keras/saving/hdf5_format_test.py
@@ -40,7 +40,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training as training_module
-from tensorflow.python.training.checkpointable import util as checkpointable
+from tensorflow.python.training.tracking import util as trackable
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -50,6 +50,7 @@ except ImportError:
 
 class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
   def test_weight_loading(self):
     with self.cached_session():
       a = keras.layers.Input(shape=(2,))
@@ -96,6 +97,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
       y = model.predict(x)
       self.assertAllClose(ref_y, y)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_weight_preprocessing(self):
     input_dim = 3
     output_dim = 3
@@ -222,6 +224,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
           for (x, y) in zip(weights1, weights2)
       ]
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequential_weight_loading(self):
     if h5py is None:
       return
@@ -253,6 +256,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
       self.assertAllClose(y, ref_y)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequential_weight_loading_group_name_with_incorrect_length(self):
     if h5py is None:
       return
@@ -348,13 +352,16 @@ class TestWholeModelSaving(test.TestCase):
           optimizer=keras.optimizers.RMSprop(lr=0.0001),
           metrics=[
               keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           weighted_metrics=[
-              keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.categorical_crossentropy,
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           sample_weight_mode='temporal')
+
       x = np.random.random((1, 3))
       y = np.random.random((1, 3, 3))
       model.train_on_batch(x, y)
@@ -640,7 +647,6 @@ class TestWholeModelSaving(test.TestCase):
       os.remove(fname)
 
   def test_saving_model_with_long_weights_names(self):
-    self.skipTest('b/120921503')
     if h5py is None:
       self.skipTest('h5py required to run this test')
 
@@ -992,7 +998,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_incompatible_checkpoint(self):
-    save_path = checkpointable.Checkpoint().save(
+    save_path = trackable.Checkpoint().save(
         os.path.join(self.get_temp_dir(), 'ckpt'))
     m = keras.Model()
     with self.assertRaisesRegexp(AssertionError, 'Nothing to load'):
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
index 06e02acf327ccb34b5fefdc5cbc548ee168625ef..ffaf02be7d992e1a126d9e131efe317acee5cd10 100644
--- a/tensorflow/python/keras/saving/saved_model.py
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -25,8 +25,10 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.keras.saving import model_from_json
 from tensorflow.python.keras.saving import saving_utils
+from tensorflow.python.keras.utils import mode_keys
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -35,46 +37,39 @@ from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import model_utils
 from tensorflow.python.saved_model import save as save_lib
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training import mode_keys
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import graph_view
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
 
-@keras_export('keras.experimental.export')
-def export(
-    model, saved_model_path, custom_objects=None, as_text=None,
-    input_signature=None, serving_only=False):
-  """Saves a `tf.keras.Model` into Tensorflow SavedModel format.
-
-  `save_model` generates new files/folders under the `saved_model_path` folder:
-  1) a checkpoint containing the model weights.
-  2) a saved_model.pb file containing the model's MetaGraphs. The prediction
-     graph is always exported. The evaluaton and training graphs are exported
-     if the following conditions are met:
-     - Evaluation: model loss is defined.
-     - Training: model is compiled with an optimizer defined under `tf.train`.
-       This is because `tf.keras.optimizers.Optimizer` instances cannot be
-       saved to checkpoints.
-  3) Model's json configuration, if model.get_config() has been implemented.
-     This file can be used to reload the model using
-     tf.keras.models.model_from_json(). Note that if any custom objects were
-     used, they should be passed to the `custom_object` argument when loading
-     the model.
-
-  Model limitations:
-  - Sequential and functional models can always be saved.
-  - Subclassed models can only be saved when `serving_only=True`. This is due to
-    the current implementation copying the model in order to export the training
-    and evaluation graphs. Because the topology of subclassed models cannot be
-    determined, the subclassed models cannot be cloned. Subclassed models will
-    be entirely exportable in the future.
-
-  Note that each mode is exported in separate graphs, so different modes do not
-  share variables. To use the train graph with evaluation or prediction graphs,
-  create a new checkpoint if variable values have been updated.
+@keras_export('keras.experimental.export_saved_model')
+def export_saved_model(model,
+                       saved_model_path,
+                       custom_objects=None,
+                       as_text=False,
+                       input_signature=None,
+                       serving_only=False):
+  """Exports a `tf.keras.Model` as a Tensorflow SavedModel.
+
+  Note that at this time, subclassed models can only be saved using
+  `serving_only=True`.
+
+  The exported `SavedModel` is a standalone serialization of Tensorflow objects,
+  and is supported by TF language APIs and the Tensorflow Serving system.
+  To load the model, use the function
+  `tf.keras.experimental.load_from_saved_model`.
+
+  The `SavedModel` contains:
+
+  1. a checkpoint containing the model weights.
+  2. a `SavedModel` proto containing the Tensorflow backend graph. Separate
+     graphs are saved for prediction (serving), train, and evaluation. If
+     the model has not been compiled, then only the graph computing predictions
+     will be exported.
+  3. the model's json config. If the model is subclassed, this will only be
+     included if the model's `get_config()` method is overwritten.
 
   Example:
 
@@ -87,62 +82,48 @@ def export(
   model.summary()
 
   # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.keras.experimental.export(
-        model, '/tmp/my_simple_tf_keras_saved_model')
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
 
   # Load the saved keras model back.
-  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
-  model_prime.summary()
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
   ```
 
   Args:
     model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
       `serving_only` must be set to True.
     saved_model_path: a string specifying the path to the SavedModel directory.
-      The SavedModel will be saved to a timestamped folder created within this
-      directory.
     custom_objects: Optional dictionary mapping string names to custom classes
       or functions (e.g. custom loss functions).
-    as_text: whether to write the `SavedModel` proto in text format. Currently
-      unavailable in serving-only mode.
+    as_text: bool, `False` by default. Whether to write the `SavedModel` proto
+      in text format. Currently unavailable in serving-only mode.
     input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
-      to specify the expected model inputs. `input_signature`'s nested structure
-      should match the expected nested structure of the inputs to the model. If
-      this is not set, this function will attempt to infer the input shapes and
-      dtypes from the model. Note that if the model is subclassed, the tensor
-      inputs to the call function should be nested in the first argument (this
-      is a general requirement for using subclassed models with Keras functions
-      .fit(), .predict(), etc.).
-    serving_only: Export only the outputs produced from calling the model in
-      predict mode. The losses, optimizer, and other training configurations are
-      not saved. If the SavedModel will only be used for serving (rather than
-      retraining), or if the model is subclassed, this can be set to True.
-
-  Returns:
-    String path to the SavedModel folder, a subdirectory of `saved_model_path`.
+      to specify the expected model inputs. See `tf.function` for more details.
+    serving_only: bool, `False` by default. When this is true, only the
+      prediction graph is saved.
 
   Raises:
     NotImplementedError: If the model is a subclassed model, and serving_only is
       False.
     ValueError: If the input signature cannot be inferred from the model.
+    AssertionError: If the SavedModel directory already exists and isn't empty.
   """
-  export_dir = model_utils.get_timestamped_export_dir(saved_model_path)
-
   if serving_only:
     save_lib.save(
-        model, export_dir,
+        model,
+        saved_model_path,
         signatures=saving_utils.trace_model_call(model, input_signature))
   else:
-    _save_v1_format(model, export_dir, custom_objects, as_text, input_signature)
+    _save_v1_format(model, saved_model_path, custom_objects, as_text,
+                    input_signature)
 
   try:
-    _export_model_json(model, export_dir)
+    _export_model_json(model, saved_model_path)
   except NotImplementedError:
     logging.warning('Skipped saving model JSON, subclassed model does not have '
                     'get_config() defined.')
 
-  return export_dir
-
 
 def _export_model_json(model, saved_model_path):
   """Saves model configuration as a json string under assets folder."""
@@ -202,8 +183,8 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
 
   has_saved_vars = False
   if model.optimizer:
-    # TODO(kathywu): Verify this works with v2 optimizer.
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
+    if isinstance(model.optimizer, (optimizers.TFOptimizer,
+                                    optimizer_v2.OptimizerV2)):
       _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
       has_saved_vars = True
       _export_mode(mode_keys.ModeKeys.TEST, has_saved_vars, **export_args)
@@ -220,7 +201,8 @@ def _save_v1_format(model, path, custom_objects, as_text, input_signature):
 
 def _get_var_list(model):
   """Returns list of all checkpointed saveable objects in the model."""
-  return checkpointable_utils.named_saveables(model)
+  var_list, _, _ = graph_view.ObjectGraphView(model).serialize_object_graph()
+  return var_list
 
 
 def create_placeholder(spec):
@@ -255,9 +237,8 @@ def _export_mode(
         'Model does not have an optimizer. Cannot export mode %s' % mode)
 
   model_graph = ops.get_default_graph()
-  with ops.Graph().as_default() as g:
-
-    K.set_learning_phase(mode == mode_keys.ModeKeys.TRAIN)
+  with ops.Graph().as_default() as g, K.learning_phase_scope(
+      mode == mode_keys.ModeKeys.TRAIN):
 
     if input_signature is None:
       input_tensors = None
@@ -288,9 +269,8 @@ def _export_mode(
       clone._make_predict_function()
     g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
 
-    clone_var_list = checkpointable_utils.named_saveables(clone)
-
     with session.Session().as_default():
+      clone_var_list = _get_var_list(clone)
       if has_saved_vars:
         # Confirm all variables in the clone have an entry in the checkpoint.
         status = clone.load_weights(checkpoint_path)
@@ -300,10 +280,10 @@ def _export_mode(
         # not counting optimizer objects. Optimizer objects are ignored because
         # if the model has not trained, the slot variables will not have been
         # created yet.
-        # TODO(b/113179535): Replace with checkpointable equivalence.
+        # TODO(b/113179535): Replace with trackable equivalence.
         _assert_same_non_optimizer_objects(model, model_graph, clone, g)
 
-        # TODO(b/113178242): Use value transfer for checkpointable objects.
+        # TODO(b/113178242): Use value transfer for trackable objects.
         clone.load_weights(checkpoint_path)
 
         # Add graph and variables to SavedModel.
@@ -311,13 +291,13 @@ def _export_mode(
         clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
         builder._has_saved_variables = True
 
-    # Add graph to the SavedModel builder.
-    builder.add_meta_graph(
-        model_utils.EXPORT_TAG_MAP[mode],
-        signature_def_map=_create_signature_def_map(clone, mode),
-        saver=saver_lib.Saver(clone_var_list),
-        init_op=variables.local_variables_initializer(),
-        train_op=train_op)
+      # Add graph to the SavedModel builder.
+      builder.add_meta_graph(
+          model_utils.EXPORT_TAG_MAP[mode],
+          signature_def_map=_create_signature_def_map(clone, mode),
+          saver=saver_lib.Saver(clone_var_list),
+          init_op=variables.local_variables_initializer(),
+          train_op=train_op)
     return None
 
 
@@ -361,15 +341,15 @@ def _create_signature_def_map(model, mode):
 
 
 def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
-  """Asserts model and clone contain the same checkpointable objects."""
+  """Asserts model and clone contain the same trackable objects."""
 
   # TODO(fchollet, kathywu): make sure this works in eager mode.
   return True
 
 
 @keras_export('keras.experimental.load_from_saved_model')
-def load_from_saved_model(saved_model_path):
-  """Loads a keras.Model from a SavedModel created by keras export().
+def load_from_saved_model(saved_model_path, custom_objects=None):
+  """Loads a keras Model from a SavedModel created by `export_saved_model()`.
 
   This function reinstantiates model state by:
   1) loading model topology from json (this will eventually come
@@ -387,16 +367,19 @@ def load_from_saved_model(saved_model_path):
   model.summary()
 
   # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.keras.experimental.export(
-        model, '/tmp/my_simple_tf_keras_saved_model')
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
 
   # Load the saved keras model back.
-  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
-  model_prime.summary()
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
   ```
 
   Args:
     saved_model_path: a string specifying the path to an existing SavedModel.
+    custom_objects: Optional dictionary mapping names
+        (strings) to custom classes or functions to be
+        considered during deserialization.
 
   Returns:
     a keras.Model instance.
@@ -407,7 +390,7 @@ def load_from_saved_model(saved_model_path):
       compat.as_bytes(constants.ASSETS_DIRECTORY),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
   model_json = file_io.read_file_to_string(model_json_filepath)
-  model = model_from_json(model_json)
+  model = model_from_json(model_json, custom_objects=custom_objects)
 
   # restore model weights
   checkpoint_prefix = os.path.join(
diff --git a/tensorflow/python/keras/saving/saved_model_test.py b/tensorflow/python/keras/saving/saved_model_test.py
index 8063b8af4de91f73fcc9a00bb626a88a204b44cc..50ddf1f24c7c3360702ea4d9222b2661886dffff 100644
--- a/tensorflow/python/keras/saving/saved_model_test.py
+++ b/tensorflow/python/keras/saving/saved_model_test.py
@@ -25,6 +25,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -32,14 +33,14 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.optimizer_v2 import adadelta
 from tensorflow.python.keras.saving import saved_model as keras_saved_model
+from tensorflow.python.keras.utils import mode_keys
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import model_utils
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import mode_keys
 from tensorflow.python.training import training as training_module
 
 
@@ -67,10 +68,10 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
@@ -85,9 +86,9 @@ class TestModelSavingandLoading(test.TestCase):
       x = np.random.random((1, 3))
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -109,9 +110,9 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -130,58 +131,56 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
   @test_util.run_in_graph_and_eager_modes
   def test_saving_with_tf_optimizer(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.export(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path)
-      loaded_model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test that new updates are the same with both models
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-
-      ref_loss = model.train_on_batch(x, y)
-      loss = loaded_model.train_on_batch(x, y)
-      self.assertAllClose(ref_loss, loss, atol=1e-05)
-
-      ref_y = model.predict(x)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test saving/loading again
-      temp_saved_model2 = self._save_model_dir('saved_model_2')
-      output_path2 = keras_saved_model.export(
-          loaded_model, temp_saved_model2)
-      loaded_model = keras_saved_model.load_from_saved_model(output_path2)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_shape=(3,)))
+    model.add(keras.layers.Dense(3))
+    model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+    model.train_on_batch(x, y)
+    ref_y = model.predict(x)
+
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(model, saved_model_dir)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
+    loaded_model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test that new updates are the same with both models
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+
+    ref_loss = model.train_on_batch(x, y)
+    loss = loaded_model.train_on_batch(x, y)
+    self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+    ref_y = model.predict(x)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test saving/loading again
+    saved_model_dir2 = self._save_model_dir('saved_model_2')
+    keras_saved_model.export_saved_model(loaded_model, saved_model_dir2)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir2)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
 
   def test_saving_subclassed_model_raise_error(self):
     # For now, saving subclassed model should raise an error. It should be
@@ -199,9 +198,9 @@ class TestModelSavingandLoading(test.TestCase):
 
     model = SubclassedModel()
 
-    temp_saved_model = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with self.assertRaises(NotImplementedError):
-      keras_saved_model.export(model, temp_saved_model)
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
 
 class LayerWithLearningPhase(keras.engine.base_layer.Layer):
@@ -264,10 +263,7 @@ def subclassed_model():
 
 def load_model(sess, path, mode):
   tags = model_utils.EXPORT_TAG_MAP[mode]
-  if mode == mode_keys.ModeKeys.PREDICT:
-    sig_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-  else:
-    sig_def_key = mode
+  sig_def_key = model_utils.SIGNATURE_KEY_MAP[mode]
 
   meta_graph_def = loader_impl.load(sess, tags, path)
   inputs = {
@@ -291,61 +287,64 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': True},
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False},
       {
           'model_builder': functional_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': True},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model_without_input_shape,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False})
   def testSaveAndLoadSavedModelExport(
-      self, model_builder, uses_learning_phase, optimizer, train_before_export):
-    saved_model_path = self._save_model_dir()
-    with self.session(graph=ops.Graph()):
-      np.random.seed(130)
-      input_arr = np.random.random((1, 3))
-      target_arr = np.random.random((1, 3))
-
-      model = model_builder(uses_learning_phase)
-      if optimizer is not None:
-        model.compile(
-            loss='mse',
-            optimizer=optimizer,
-            metrics=['mae'])
-        if train_before_export:
-          model.train_on_batch(input_arr, target_arr)
-
-        ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+      self, model_builder, uses_learning_phase, optimizer_cls,
+      train_before_export):
+    optimizer = None if optimizer_cls is None else optimizer_cls()
 
-      ref_predict = model.predict(input_arr)
+    saved_model_dir = self._save_model_dir()
 
-      # Export SavedModel
-      output_path = keras_saved_model.export(model, saved_model_path)
+    np.random.seed(130)
+    input_arr = np.random.random((1, 3))
+    target_arr = np.random.random((1, 3))
+
+    model = model_builder(uses_learning_phase)
+    if optimizer is not None:
+      model.compile(
+          loss='mse',
+          optimizer=optimizer,
+          metrics=['mae'])
+      if train_before_export:
+        model.train_on_batch(input_arr, target_arr)
+
+      ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+
+    ref_predict = model.predict(input_arr)
+
+    # Export SavedModel
+    keras_saved_model.export_saved_model(model, saved_model_dir)
 
     input_name = model.input_names[0]
     output_name = model.output_names[0]
@@ -353,7 +352,7 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
     # Load predict graph, and test predictions
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
 
       predictions = sess.run(outputs[output_name],
@@ -363,21 +362,25 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     if optimizer:
       # Load eval graph, and test predictions, loss and metric values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
                                         mode_keys.ModeKeys.TEST)
 
         # First obtain the loss and predictions, and run the metric update op by
         # feeding in the inputs and targets.
+        metrics_name = 'mae' if tf2.enabled() else 'mean_absolute_error'
+        metrics_update_op_key = 'metrics/' + metrics_name + '/update_op'
+        metrics_value_op_key = 'metrics/' + metrics_name + '/value'
+
         loss, predictions, _ = sess.run(
             (outputs['loss'], outputs['predictions/' + output_name],
-             outputs['metrics/mean_absolute_error/update_op']), {
+             outputs[metrics_update_op_key]), {
                  inputs[input_name]: input_arr,
                  inputs[target_name]: target_arr
              })
 
         # The metric value should be run after the update op, to ensure that it
         # reflects the correct value.
-        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
+        metric_value = sess.run(outputs[metrics_value_op_key])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
@@ -388,12 +391,12 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
         inputs, outputs, meta_graph_def = load_model(
-            sess, output_path, mode_keys.ModeKeys.TRAIN)
+            sess, saved_model_dir, mode_keys.ModeKeys.TRAIN)
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
-        self.assertIn('metrics/mean_absolute_error/value', outputs)
+        self.assertIn(metrics_update_op_key, outputs)
+        self.assertIn(metrics_value_op_key, outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
@@ -414,17 +417,17 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
               atol=1e-05)
 
   def testSaveAndLoadSavedModelWithCustomObject(self):
-    saved_model_path = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with session.Session(graph=ops.Graph()) as sess:
       def relu6(x):
         return keras.backend.relu(x, max_value=6)
       inputs = keras.layers.Input(shape=(1,))
       outputs = keras.layers.Activation(relu6)(inputs)
       model = keras.models.Model(inputs, outputs)
-      output_path = keras_saved_model.export(
-          model, saved_model_path, custom_objects={'relu6': relu6})
+      keras_saved_model.export_saved_model(
+          model, saved_model_dir, custom_objects={'relu6': relu6})
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
       input_name = model.input_names[0]
       output_name = model.output_names[0]
@@ -490,17 +493,18 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     model = sequential_model_without_input_shape(True)
     # A Sequential model that hasn't been built should raise an error.
     with self.assertRaisesRegexp(ValueError, 'Please build the model'):
-      keras_saved_model.export(model, '')
+      keras_saved_model.export_saved_model(model, '')
 
-    saved_model_path = self._save_model_dir()
-    output_path = keras_saved_model.export(
-        model, saved_model_path,
-        input_signature=tensor_spec.TensorSpec(shape=(10, 11, 12, 13, 14),
-                                               dtype=dtypes.float32,
-                                               name='spec_input'))
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(
+        model,
+        saved_model_dir,
+        input_signature=tensor_spec.TensorSpec(
+            shape=(10, 11, 12, 13, 14), dtype=dtypes.float32,
+            name='spec_input'))
 
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
                                       mode_keys.ModeKeys.PREDICT)
       self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
       self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
@@ -517,18 +521,20 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
                                                      dtype=dtypes.float32)]})
   def testServingOnly(self, model_builder, input_signature):
     if context.executing_eagerly():
-      saved_model_path = self._save_model_dir()
+      saved_model_dir = self._save_model_dir()
       input_arr = np.random.random((5, 3)).astype(np.float32)
       model = model_builder()
       ref_predict = model.predict(input_arr)
 
-      output_path = keras_saved_model.export(
-          model, saved_model_path, serving_only=True,
+      keras_saved_model.export_saved_model(
+          model,
+          saved_model_dir,
+          serving_only=True,
           input_signature=input_signature)
 
       # Load predict graph, and test predictions
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
                                         mode_keys.ModeKeys.PREDICT)
         predictions = sess.run(outputs[next(iter(outputs.keys()))],
                                {inputs[next(iter(inputs.keys()))]: input_arr})
diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py
index 95da169e82367c7e6ee7ef17fcb22295f8b0242b..55ddde5c7c5d21fea98288afca7a80a285b8b4e1 100644
--- a/tensorflow/python/keras/saving/saving_utils.py
+++ b/tensorflow/python/keras/saving/saving_utils.py
@@ -24,7 +24,7 @@ from tensorflow.python.util import nest
 
 
 def extract_model_metrics(model):
-  """Convert metrics from a Keras model to (value, update) ops.
+  """Convert metrics from a Keras model `compile` API to dictionary.
 
   This is used for converting Keras models to Estimators and SavedModels.
 
@@ -32,20 +32,16 @@ def extract_model_metrics(model):
     model: A `tf.keras.Model` object.
 
   Returns:
-    Dictionary mapping metric names to tuples of (value, update) ops. May return
-    `None` if the model does not contain any metrics.
+    Dictionary mapping metric names to metric instances. May return `None` if
+    the model does not contain any metrics.
   """
-  from tensorflow.python.keras import metrics  # pylint: disable=g-import-not-at-top
   if not getattr(model, '_compile_metrics', None):
     return None
 
   # TODO(psv/kathywu): use this implementation in model to estimator flow.
-  eval_metric_ops = {}
-  for metric_name in model.metrics_names[1:]:  # Index 0 is `loss`.
-    m = metrics.Mean()
-    m(model._compile_metrics_tensors[metric_name])
-    eval_metric_ops[metric_name] = m
-  return eval_metric_ops
+  # We are not using model.metrics here because we want to exclude the metrics
+  # added using `add_metric` API.
+  return {m.name: m for m in model._compile_metric_functions}
 
 
 def trace_model_call(model, input_signature=None):
@@ -76,11 +72,15 @@ def trace_model_call(model, input_signature=None):
           'set. Usually, input shapes are automatically determined from calling'
           ' .fit() or .predict(). To manually set the shapes, call '
           'model._set_inputs(inputs).'.format(model))
-    input_specs = []
-    for input_tensor, input_name in zip(inputs, input_names):
-      input_specs.append(tensor_spec.TensorSpec(
+    flat_inputs = nest.flatten(inputs)
+    flat_input_names = nest.flatten(input_names)
+    flat_input_specs = []
+    for input_tensor, input_name in zip(flat_inputs, flat_input_names):
+      flat_input_specs.append(tensor_spec.TensorSpec(
           shape=input_tensor.shape, dtype=input_tensor.dtype,
           name=input_name))
+    input_specs = nest.pack_sequence_as(structure=inputs,
+                                        flat_sequence=flat_input_specs)
     # The input signature of the call function is a list with one element, since
     # all tensor inputs must be passed in as the first argument.
     input_signature = [input_specs] if len(input_specs) > 1 else input_specs
diff --git a/tensorflow/python/keras/saving/saving_utils_test.py b/tensorflow/python/keras/saving/saving_utils_test.py
index ae267e283ce1f4e22b92eda73fb42f80b1abb5bf..5952a4d7638b18d46bdc7d5751a6009c632746c1 100644
--- a/tensorflow/python/keras/saving/saving_utils_test.py
+++ b/tensorflow/python/keras/saving/saving_utils_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -40,6 +41,7 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save as save_lib
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import rmsprop
 
 
 class TraceModelCallTest(keras_parameterized.TestCase):
@@ -205,5 +207,46 @@ class ModelSaveTest(keras_parameterized.TestCase):
         {model.output_names[0]: model.predict_on_batch(inputs)},
         _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))}))
 
+
+class ExtractModelMetricsTest(test.TestCase):
+
+  def test_extract_model_metrics(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(None, extract_metrics)
+
+    extract_metric_names = [
+        'dense_binary_accuracy', 'dropout_binary_accuracy',
+        'dense_mean_squared_error', 'dropout_mean_squared_error'
+    ]
+    if tf2.enabled():
+      extract_metric_names.extend(['dense_mae', 'dropout_mae'])
+    else:
+      extract_metric_names.extend(
+          ['dense_mean_absolute_error', 'dropout_mean_absolute_error'])
+
+    model_metric_names = ['loss', 'dense_loss', 'dropout_loss'
+                         ] + extract_metric_names
+    model.compile(
+        loss='mae',
+        metrics=[
+            keras.metrics.BinaryAccuracy(), 'mae',
+            keras.metrics.mean_squared_error
+        ],
+        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=None)
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(set(model_metric_names), set(model.metrics_names))
+    self.assertEqual(set(extract_metric_names), set(extract_metrics.keys()))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index eff0f39b6d006c60198a607e796e7619b968eaf3..dabfe1a79b3e6fc6ee5e7247ea5fd3644fc5f990 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
@@ -65,6 +66,7 @@ def get_test_data(train_samples,
           (x[train_samples:], y[train_samples:]))
 
 
+@test_util.use_deterministic_cudnn
 def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
                input_data=None, expected_output=None,
                expected_output_dtype=None):
@@ -160,7 +162,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # test training mode (e.g. useful for dropout tests)
   # Rebuild the model to avoid the graph being reused between predict() and
@@ -209,7 +211,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # for further checks in the caller function
   return actual_output
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 61940ad789c4009fca5462079014482fb8bfec1b..66d9817a6aecd28aafcf01896d089a342401fca7 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -34,10 +34,12 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
+from tensorflow.python.keras.utils.layer_utils import print_summary
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.utils.vis_utils import model_to_dot
 from tensorflow.python.keras.utils.vis_utils import plot_model
 
 del absolute_import
diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e7711ebd61079b625890b14520ef18b724ddbb
--- /dev/null
+++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras composite tensor support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.layers import Layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import test
+
+
+# Define test-only Layer classes to validate passing Sparse and Ragged tensors
+# between layers.
+class ToDense(Layer):
+  """Create a dense (standard) tensor from the given input tensor."""
+
+  def __init__(self, default_value, **kwargs):
+    super(ToDense, self).__init__(**kwargs)
+    self._default_value = default_value
+
+  def call(self, inputs):
+    if isinstance(inputs, ragged_tensor.RaggedTensor):
+      return inputs.to_tensor(default_value=self._default_value)
+    elif isinstance(inputs, sparse_tensor.SparseTensor):
+      return sparse_ops.sparse_tensor_to_dense(
+          inputs, default_value=self._default_value)
+    elif isinstance(inputs, ops.Tensor):
+      return inputs
+    else:
+      raise TypeError("Unexpected tensor type %s" % type(inputs).__name__)
+
+
+class ToRagged(Layer):
+  """Create a ragged tensor based on a given dense tensor."""
+
+  def __init__(self, padding, ragged_rank=1, **kwargs):
+    super(ToRagged, self).__init__(**kwargs)
+    self._padding = padding
+    self._ragged_rank = ragged_rank
+
+  def call(self, inputs):
+    return ragged_tensor.RaggedTensor.from_tensor(
+        inputs, padding=self._padding, ragged_rank=self._ragged_rank)
+
+
+class ToSparse(Layer):
+  """Create a sparse tensor based on a given dense tensor."""
+
+  def call(self, inputs):
+    indices = array_ops.where(math_ops.not_equal(inputs, 0))
+    values = array_ops.gather_nd(inputs, indices)
+    shape = array_ops.shape(inputs, out_type=dtypes.int64)
+    return sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class InternalCompositeTest(keras_parameterized.TestCase):
+
+  def test_model_with_internal_ragged_tensors(self):
+    # Create a model that accepts an input, converts it to Ragged, and
+    # converts the ragged tensor back to a dense tensor.
+    layers = [ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1], [2, 3]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_model_with_internal_sparse_tensors(self):
+    # Create a model that accepts an input, converts it to Sparse, and
+    # converts the sparse tensor back to a dense tensor.
+    layers = [ToSparse(), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1, -1], [2, 3, -1]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_training_model_with_internal_ragged_tensors(self):
+
+    # Create a model that implements y=Mx. This is easy to learn and will
+    # demonstrate appropriate gradient passing. (We have to use RaggedTensors
+    # for this test, as ToSparse() doesn't support gradient propagation through
+    # the layer.) TODO(b/124796939): Investigate this.
+    layers = [core.Dense(2), ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+
+    input_data = np.random.rand(1024, 1)
+    expected_data = np.concatenate((input_data * 3, input_data * .5), axis=-1)
+
+    model.compile(
+        loss="mse",
+        optimizer="adam",
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(input_data, expected_data, epochs=10, verbose=0)
+
+    # If the model trained, the loss stored at history[0] should be different
+    # than the one stored at history[-1].
+    self.assertNotEqual(history.history["loss"][-1], history.history["loss"][0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/utils/conv_utils_test.py b/tensorflow/python/keras/utils/conv_utils_test.py
index eb2a360bfdaf04d695a599b477c0d154bac062cd..ef7ad1b8c53edbc313d95382b248b159c6c2da1d 100644
--- a/tensorflow/python/keras/utils/conv_utils_test.py
+++ b/tensorflow/python/keras/utils/conv_utils_test.py
@@ -52,6 +52,114 @@ input_shapes = [
 ]
 
 
+class TestBasicConvUtilsTest(test.TestCase):
+
+  def test_convert_data_format(self):
+    self.assertEqual('NCDHW', conv_utils.convert_data_format(
+        'channels_first', 5))
+    self.assertEqual('NCHW', conv_utils.convert_data_format(
+        'channels_first', 4))
+    self.assertEqual('NCW', conv_utils.convert_data_format('channels_first', 3))
+    self.assertEqual('NHWC', conv_utils.convert_data_format('channels_last', 4))
+    self.assertEqual('NWC', conv_utils.convert_data_format('channels_last', 3))
+    self.assertEqual('NDHWC', conv_utils.convert_data_format(
+        'channels_last', 5))
+
+    with self.assertRaises(ValueError):
+      conv_utils.convert_data_format('invalid', 2)
+
+  def test_normalize_tuple(self):
+    self.assertEqual((2, 2, 2),
+                     conv_utils.normalize_tuple(2, n=3, name='strides'))
+    self.assertEqual((2, 1, 2),
+                     conv_utils.normalize_tuple((2, 1, 2), n=3, name='strides'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple((2, 1), n=3, name='strides')
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple(None, n=3, name='strides')
+
+  def test_normalize_data_format(self):
+    self.assertEqual('channels_last',
+                     conv_utils.normalize_data_format('Channels_Last'))
+    self.assertEqual('channels_first',
+                     conv_utils.normalize_data_format('CHANNELS_FIRST'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_data_format('invalid')
+
+  def test_normalize_padding(self):
+    self.assertEqual('same', conv_utils.normalize_padding('SAME'))
+    self.assertEqual('valid', conv_utils.normalize_padding('VALID'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_padding('invalid')
+
+  def test_conv_output_length(self):
+    self.assertEqual(4, conv_utils.conv_output_length(4, 2, 'same', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'same', 2, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'valid', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'valid', 2, 1))
+    self.assertEqual(5, conv_utils.conv_output_length(4, 2, 'full', 1, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'full', 2, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(5, 2, 'valid', 2, 2))
+
+  def test_conv_input_length(self):
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'same', 1))
+    self.assertEqual(2, conv_utils.conv_input_length(2, 2, 'same', 2))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'valid', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(2, 2, 'valid', 2))
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'full', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'full', 2))
+
+  def test_deconv_output_length(self):
+    self.assertEqual(4, conv_utils.deconv_output_length(4, 2, 'same', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(4, 2, 'same', stride=2))
+    self.assertEqual(5, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=2))
+    self.assertEqual(3, conv_utils.deconv_output_length(4, 2, 'full', stride=1))
+    self.assertEqual(6, conv_utils.deconv_output_length(4, 2, 'full', stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=2))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=1))
+    self.assertEqual(
+        9,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=1, dilation=2))
+    self.assertEqual(
+        12,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=2, dilation=3))
+    self.assertEqual(
+        6,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=2, dilation=3))
+
+
 @parameterized.parameters(input_shapes)
 class TestConvUtils(test.TestCase, parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py
index 9b4a50dd7e7546d04b1b31a256c9f8b1c4061be4..0f6e89b4d273ba37174cfa2f5c20a473ab6087ea 100644
--- a/tensorflow/python/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/utils/data_utils.py
@@ -246,10 +246,10 @@ def get_file(fname,
     try:
       try:
         urlretrieve(origin, fpath, dl_progress)
-      except URLError as e:
-        raise Exception(error_msg.format(origin, e.errno, e.reason))
       except HTTPError as e:
         raise Exception(error_msg.format(origin, e.code, e.msg))
+      except URLError as e:
+        raise Exception(error_msg.format(origin, e.errno, e.reason))
     except (Exception, KeyboardInterrupt) as e:
       if os.path.exists(fpath):
         os.remove(fpath)
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 8b5cdadd45ff89997af3ef4db6517e0b56a96e4d..f5a3203f3b3b229ebd00e57cae578ee984f3330f 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -392,7 +392,7 @@ class Progbar(object):
 
       if self.target is not None:
         numdigits = int(np.log10(self.target)) + 1
-        bar = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
+        bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target)
         prog = float(current) / self.target
         prog_width = int(self.width * prog)
         if prog_width > 0:
@@ -572,11 +572,8 @@ def to_snake_case(name):
   return 'private' + insecure
 
 
-def is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
+def is_all_none(structure):
+  iterable = nest.flatten(structure)
   # We cannot use Python's `any` because the iterable may return Tensors.
   for element in iterable:
     if element is not None:
diff --git a/tensorflow/python/keras/utils/kernelized_utils.py b/tensorflow/python/keras/utils/kernelized_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e73cb2d4c63df2f1098802deffbcc899039d0cb
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility methods related to kernelized layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _to_matrix(u):
+  """If input tensor is a vector (i.e., has rank 1), converts it to matrix."""
+  u_rank = len(u.shape)
+  if u_rank not in [1, 2]:
+    raise ValueError('The input tensor should have rank 1 or 2. Given rank: {}'
+                     .format(u_rank))
+  if u_rank == 1:
+    return array_ops.expand_dims(u, 0)
+  return u
+
+
+def _align_matrices(x, y):
+  """Aligns x and y tensors to allow computations over pairs of their rows."""
+  x_matrix = _to_matrix(x)
+  y_matrix = _to_matrix(y)
+  x_shape = x_matrix.shape
+  y_shape = y_matrix.shape
+  if y_shape[1] != x_shape[1]:  # dimensions do not match.
+    raise ValueError(
+        'The outermost dimensions of the input tensors should match. Given: {} '
+        'vs {}.'.format(y_shape[1], x_shape[1]))
+
+  x_tile = array_ops.tile(
+      array_ops.expand_dims(x_matrix, 1), [1, y_shape[0], 1])
+  y_tile = array_ops.tile(
+      array_ops.expand_dims(y_matrix, 0), [x_shape[0], 1, 1])
+  return x_tile, y_tile
+
+
+def inner_product(u, v):
+  u = _to_matrix(u)
+  v = _to_matrix(v)
+  return math_ops.matmul(u, v, transpose_b=True)
+
+
+def exact_gaussian_kernel(x, y, stddev):
+  """Computes exact Gaussian kernel value(s) for tensors x and y and stddev.
+
+  The Gaussian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v||^2 / (2* stddev^2))
+  where the norm is the l2-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1) (if x, y are vectors) or a matrix
+      of shape (m, n) with entries K(u, v) (where K is the Gaussian kernel) for
+      all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_squared_l2_norm = math_ops.reduce_sum(
+      math_ops.squared_difference(x_aligned, y_aligned), 2)
+  return math_ops.exp(-diff_squared_l2_norm / (2 * stddev * stddev))
+
+
+def exact_laplacian_kernel(x, y, stddev):
+  """Computes exact Laplacian kernel value(s) for tensors x and y using stddev.
+
+  The Laplacian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v|| / stddev)
+  where the norm is the l1-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1)  if x, y are vectors or a matrix
+    of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
+    all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_l1_norm = math_ops.reduce_sum(
+      math_ops.abs(math_ops.subtract(x_aligned, y_aligned)), 2)
+  return math_ops.exp(-diff_l1_norm / stddev)
diff --git a/tensorflow/python/keras/utils/kernelized_utils_test.py b/tensorflow/python/keras/utils/kernelized_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9a72493ddee5cf1d0f310c06d0fa1860b2a61f
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class KernelizedUtilsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_equal_vectors(self, exact_kernel_fn, expected_values):
+    """Identical vectors give exactly the identity kernel value."""
+    x = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    y = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are identical and therefore K(x, y) will be precisely equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-6)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_almost_identical_vectors(self, exact_kernel_fn, expected_values):
+    """Almost identical vectors give the identity kernel value."""
+    x = constant_op.constant([1.0, 0.4, -2.1, -1.1])
+    y = constant_op.constant([1.01, 0.39, -2.099, -1.101])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are almost identical and therefore K(x, y) will be almost equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-3)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.99], [0.977]]),
+      ('laplacian', _exact_laplacian(stddev=5.0), [[0.96], [0.94]]))
+  def test_similar_matrices(self, exact_kernel_fn, expected_values):
+    """Pairwise "close" vectors give high kernel values (similarity scores)."""
+    x = constant_op.constant([1.0, 3.4, -2.1, 0.9, 3.3, -2.0], shape=[2, 3])
+    y = constant_op.constant([1.1, 3.35, -2.05])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # The 2 rows of x are close to y. The pairwise kernel values (similarity
+    # scores) are somewhat close to the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=2.0), [[.997, .279], [.251, 1.],
+                                                 [.164, 0.019]]),
+      ('laplacian', _exact_laplacian(stddev=2.0), [[.904, .128], [.116, 1.],
+                                                   [.07, 0.027]]))
+  def test_matrices_varying_similarity(self, exact_kernel_fn, expected_values):
+    """Test matrices with row vectors of varying pairwise similarity."""
+    x = constant_op.constant([1.0, 2., -2., 0.9, 3.3, -1.0], shape=[3, 2])
+    y = constant_op.constant([1.1, 2.1, -2., 0.9], shape=[2, 2])
+    exact_kernel = exact_kernel_fn(x, y)
+
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.0]]),
+      ('laplacian', _exact_laplacian(stddev=1.0), [[0.0]]))
+  def test_completely_dissimilar_vectors(self, exact_kernel_fn,
+                                         expected_values):
+    """Very dissimilar vectors give very low similarity scores."""
+    x = constant_op.constant([1.0, 3.4, -2.1, -5.1])
+    y = constant_op.constant([0.5, 2.1, 1.0, 3.0])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are very "far" from each other and so the corresponding kernel
+    # value will be very low.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index 1d85e8a25fe8e39f0edc22012c4a62f9ef14a058..ffe4285802d100976e6198f548647936a24bc98a 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -158,6 +158,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       line += ' ' * (positions[i] - len(line))
     print_fn(line)
 
+  print_fn('Model: "{}"'.format(model.name))
   print_fn('_' * line_length)
   print_row(to_display, positions)
   print_fn('=' * line_length)
@@ -358,3 +359,13 @@ def convert_dense_weights_data_format(dense,
       ki = np.transpose(ki, (1, 2, 0))  # first -> last
     kernel[:, i] = np.reshape(ki, (np.prod(previous_feature_map_shape),))
   dense.set_weights([kernel, bias])
+
+
+def is_builtin_layer(layer):
+  if not getattr(layer, '_keras_api_names', None):
+    return False
+
+  # Subclasses of `Layer` that are not exported inherit the export name
+  # of the base layer class.
+  return (layer._keras_api_names != ('keras.layers.Layer',) and
+          layer._keras_api_names_v1 != ('keras.layers.Layer',))
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index d42b354fb140bc592ee1127c3789069365371bc4..4b37c741d1c53febed28252c4bb12b77c8c75722 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
@@ -26,7 +27,34 @@ from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.losses.Reduction', v1=[])
+class ReductionV2(object):
+  """Types of loss reduction.
+
+  Contains the following values:
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
+  """
+
+  NONE = 'none'
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
+
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError('Invalid Reduction Key %s.' % key)
 
 
 def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
@@ -140,21 +168,23 @@ def _num_elements(losses):
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
-def _reduce_weighted_loss(
-    weighted_losses, reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE):
+def reduce_weighted_loss(weighted_losses,
+                         reduction=ReductionV2.SUM_OVER_BATCH_SIZE):
   """Reduces the individual weighted loss measurements."""
-  if reduction == losses_impl.ReductionV2.NONE:
+  if reduction == ReductionV2.NONE:
     loss = weighted_losses
   else:
     loss = math_ops.reduce_sum(weighted_losses)
-    if reduction == losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE:
-      loss = _safe_mean(loss, _num_elements(weighted_losses))
+    if reduction == ReductionV2.SUM_OVER_BATCH_SIZE:
+      num_replicas = (  # Used to convert from local to global batch size.
+          distribution_strategy_context.get_strategy().num_replicas_in_sync)
+      loss = _safe_mean(loss, num_replicas * _num_elements(weighted_losses))
   return loss
 
 
 def compute_weighted_loss(losses,
                           sample_weight=None,
-                          reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+                          reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
                           name=None):
   """Computes the weighted loss.
 
@@ -162,8 +192,8 @@ def compute_weighted_loss(losses,
     losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
     sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
       `losses`, or be broadcastable to `losses`.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
 
   Raises:
@@ -173,15 +203,10 @@ def compute_weighted_loss(losses,
     Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
     `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
   """
-  losses_impl.ReductionV2.validate(reduction)
+  ReductionV2.validate(reduction)
   if sample_weight is None:
     sample_weight = 1.0
   with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     # Update dimensions of `sample_weight` to match with `losses` if possible.
     losses, _, sample_weight = squeeze_or_expand_dimensions(
         losses, None, sample_weight)
@@ -203,7 +228,16 @@ def compute_weighted_loss(losses,
     sample_weight.get_shape().assert_is_compatible_with(losses.get_shape())
     weighted_losses = math_ops.multiply(losses, sample_weight)
     # Apply reduction function to the individual weighted losses.
-    loss = _reduce_weighted_loss(weighted_losses, reduction)
+    loss = reduce_weighted_loss(weighted_losses, reduction)
     # Convert the result back to the input type.
     loss = math_ops.cast(loss, input_dtype)
     return loss
+
+
+def scale_loss_for_distribution(loss_value):
+  """Scales and returns the given loss value by the number of replicas."""
+  num_replicas = (
+      distribution_strategy_context.get_strategy().num_replicas_in_sync)
+  if num_replicas > 1:
+    loss_value *= (1. / num_replicas)
+  return loss_value
diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py
index 108d6ee642c5deaece161392c44ad981877c0c1c..027361c9ade0dc23eeb30758e50599436d588d4c 100644
--- a/tensorflow/python/keras/utils/metrics_utils.py
+++ b/tensorflow/python/keras/utils/metrics_utils.py
@@ -34,14 +34,27 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util import tf_decorator
 
-
 NEG_INF = -1e10
 
 
+class Reduction(Enum):
+  """Types of metrics reduction.
+
+  Contains the following values:
+
+  * `SUM`: Scalar sum of weighted values.
+  * `SUM_OVER_BATCH_SIZE`: Scalar sum of weighted values divided by
+        number of elements.
+  * `WEIGHTED_MEAN`: Scalar sum of weighted values divided by sum of weights.
+  """
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+  WEIGHTED_MEAN = 'weighted_mean'
+
+
 def update_state_wrapper(update_state_fn):
   """Decorator to wrap metric `update_state()` with `add_update()`.
 
@@ -86,7 +99,7 @@ def result_wrapper(result_fn):
     """Decorated function with merge_call."""
     replica_context = distribution_strategy_context.get_replica_context()
     if replica_context is None:  # if in cross replica context already
-      result_t = result_fn(*args)
+      result_t = array_ops.identity(result_fn(*args))
     else:
       # TODO(psv): Test distribution of metrics using different distribution
       # strategies.
@@ -97,7 +110,12 @@ def result_wrapper(result_fn):
       def merge_fn_wrapper(distribution, merge_fn, *args):
         # We will get `PerDevice` merge function. Taking the first one as all
         # are identical copies of the function that we had passed below.
-        return distribution.unwrap(merge_fn)[0](*args)
+        merged_result_fn = distribution.unwrap(merge_fn)[0](*args)
+
+        # Wrapping result in identity so that control dependency between
+        # update_op from `update_state` and result works in case result returns
+        # a tensor.
+        return array_ops.identity(merged_result_fn)
 
       # Wrapping result in merge_call. merge_call is used when we want to leave
       # replica mode and compute a value in cross replica mode.
@@ -147,15 +165,49 @@ class ConfusionMatrix(Enum):
 
 
 class AUCCurve(Enum):
+  """Type of AUC Curve (ROC or PR)."""
   ROC = 'ROC'
   PR = 'PR'
 
+  @staticmethod
+  def from_str(key):
+    if key in ('pr', 'PR'):
+      return AUCCurve.PR
+    elif key in ('roc', 'ROC'):
+      return AUCCurve.ROC
+    else:
+      raise ValueError('Invalid AUC curve value "%s".' % key)
+
 
 class AUCSummationMethod(Enum):
+  """Type of AUC summation method.
+
+  https://en.wikipedia.org/wiki/Riemann_sum)
+
+  Contains the following values:
+  * 'interpolation': Applies mid-point summation scheme for `ROC` curve. For
+    `PR` curve, interpolates (true/false) positives but not the ratio that is
+    precision (see Davis & Goadrich 2006 for details).
+  * 'minoring': Applies left summation for increasing intervals and right
+    summation for decreasing intervals.
+  * 'majoring': Applies right summation for increasing intervals and left
+    summation for decreasing intervals.
+  """
   INTERPOLATION = 'interpolation'
   MAJORING = 'majoring'
   MINORING = 'minoring'
 
+  @staticmethod
+  def from_str(key):
+    if key in ('interpolation', 'Interpolation'):
+      return AUCSummationMethod.INTERPOLATION
+    elif key in ('majoring', 'Majoring'):
+      return AUCSummationMethod.MAJORING
+    elif key in ('minoring', 'Minoring'):
+      return AUCSummationMethod.MINORING
+    else:
+      raise ValueError('Invalid AUC summation method value "%s".' % key)
+
 
 def update_confusion_matrix_variables(variables_to_update,
                                       y_true,
@@ -208,8 +260,8 @@ def update_confusion_matrix_variables(variables_to_update,
   """
   if variables_to_update is None:
     return
-  y_true = ops.convert_to_tensor(y_true)
-  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, dtype=dtypes.float32)
+  y_pred = math_ops.cast(y_pred, dtype=dtypes.float32)
   y_pred.shape.assert_is_compatible_with(y_true.shape)
 
   if not any(
@@ -239,8 +291,7 @@ def update_confusion_matrix_variables(variables_to_update,
           message='predictions must be <= 1')
   ]):
     y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-        math_ops.cast(y_pred, dtype=dtypes.float32),
-        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+        y_pred, y_true, sample_weight)
 
   if top_k is not None:
     y_pred = _filter_top_k(y_pred, top_k)
@@ -286,7 +337,7 @@ def update_confusion_matrix_variables(variables_to_update,
         math_ops.logical_and(label, pred), dtype=dtypes.float32)
     if weights is not None:
       label_and_pred *= weights
-    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+    return var.assign_add(math_ops.reduce_sum(label_and_pred, 1))
 
   loop_vars = {
       ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
diff --git a/tensorflow/contrib/lite/python/__init__.py b/tensorflow/python/keras/utils/mode_keys.py
similarity index 72%
rename from tensorflow/contrib/lite/python/__init__.py
rename to tensorflow/python/keras/utils/mode_keys.py
index 27b1ffb251e76469092eb613d3c381718d8dc4fd..fb6fc3eef7e8967b8a87707569770f7ec1495022 100644
--- a/tensorflow/contrib/lite/python/__init__.py
+++ b/tensorflow/python/keras/utils/mode_keys.py
@@ -12,15 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Keras model mode constants."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow import lite
-
-import warnings as _warnings
-
-WARNING = ("WARNING: TF Lite has moved from tf.contrib.lite to tf.lite. Please "
-           "update your imports. This will be a breaking error in TensorFlow "
-           "version 2.0.")
-_warnings.warn(WARNING, PendingDeprecationWarning)
+# pylint: disable=unused-import
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
+# pylint: enable=unused-import
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 8c1abd632484273a01fd99cbd72ee73b66e46f27..9c711bd2a28395279c1e8cd726084d6b9ab4e188 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -148,7 +148,6 @@ class TestMultiGPUModel(test.TestCase):
       input_shape = (num_samples,) + shape
       x_train = np.random.randint(0, 255, input_shape)
       y_train = np.random.randint(0, num_classes, (input_shape[0],))
-      keras.backend.set_learning_phase(True)
 
       y_train = keras.utils.to_categorical(y_train, num_classes)
 
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index dc5c4f1d905a5cd7f11e9f1b7a9ea4328207f479..460ac1a0126b5ca5afa2cf23a0894478f4483001 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -112,11 +112,19 @@ def get_reachable_from_inputs(inputs, targets=None):
 
   while queue:
     x = queue.pop()
+    if isinstance(x, tuple(_user_convertible_tensor_types)):
+      # Can't find consumers of user-specific types.
+      continue
+
     if isinstance(x, ops.Operation):
       outputs = x.outputs[:] or []
       outputs += x._control_outputs  # pylint: disable=protected-access
     elif isinstance(x, variables.Variable):
-      outputs = [x.op]
+      try:
+        outputs = [x.op]
+      except AttributeError:
+        # Variables can be created in an Eager context.
+        outputs = []
     elif tensor_util.is_tensor(x):
       outputs = x.consumers()
     else:
@@ -171,7 +179,7 @@ def map_structure_with_atomic(is_atomic_fn, map_fn, nested):
 
 
 def convert_shapes(input_shape, to_tuples=True):
-  """Converts nested shape representations  to desired format.
+  """Converts nested shape representations to desired format.
 
   Performs:
 
@@ -193,17 +201,16 @@ def convert_shapes(input_shape, to_tuples=True):
     Nested structure of shapes in desired format.
   """
 
-  def _is_shape_component(element):
-    value = tensor_shape.as_dimension(element).value
-    return value is None or isinstance(value, int)
+  def _is_shape_component(value):
+    return value is None or isinstance(value, (int, tensor_shape.Dimension))
 
   def _is_atomic_shape(input_shape):
     # Ex: TensorShape or (None, 10, 32) or 5 or `None`
-    if input_shape is None or isinstance(input_shape, int):
+    if _is_shape_component(input_shape):
       return True
     if isinstance(input_shape, tensor_shape.TensorShape):
       return True
-    if (isinstance(input_shape, tuple) and
+    if (isinstance(input_shape, (tuple, list)) and
         all(_is_shape_component(ele) for ele in input_shape)):
       return True
     return False
@@ -312,8 +319,16 @@ def is_symbolic_tensor(tensor):
     True for symbolic tensors, False for eager tensors.
   """
   if isinstance(tensor, variables.Variable):
-    return not context.executing_eagerly()
-  if isinstance(tensor, (ops.Tensor, sparse_tensor.SparseTensor)):
+    # Variables that are output of a Keras Layer in Functional API mode
+    # should be considered symbolic.
+    # TODO(omalleyt): We need a better way to check this in order to
+    # enable `run_eagerly=True` for Models containing Layers that
+    # return Variables as outputs.
+    return (getattr(tensor, '_keras_history', False) or
+            not context.executing_eagerly())
+  if isinstance(tensor, composite_tensor.CompositeTensor):
+    return tensor._is_graph_tensor  # pylint: disable=protected-access
+  if isinstance(tensor, ops.Tensor):
     return hasattr(tensor, 'graph')
   if isinstance(tensor, tuple(_user_convertible_tensor_types)):
     return hasattr(ops.convert_to_tensor(tensor), 'graph')
@@ -351,3 +366,7 @@ def register_symbolic_tensor_type(cls):
   """
   global _user_convertible_tensor_types
   _user_convertible_tensor_types.add(cls)
+
+
+def is_tensor_or_variable(x):
+  return tensor_util.is_tensor(x) or isinstance(x, variables.Variable)
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
index 9c478af4ecbbe1bb976c982e596f82ac56e2045d..902ecf91670d52ff6839f42d345944b9be009f85 100644
--- a/tensorflow/python/keras/utils/tf_utils_test.py
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -89,6 +89,10 @@ class TestIsSymbolicTensor(test.TestCase):
         self._input = input_
         self.value = ops.convert_to_tensor(42.)
 
+      @property
+      def dtype(self):
+        return self.value.dtype
+
     ops.register_tensor_conversion_function(
         Foo, lambda x, *args, **kwargs: x.value)
     tf_utils.register_symbolic_tensor_type(Foo)
@@ -128,6 +132,16 @@ class TestIsSymbolicTensor(test.TestCase):
     # `Tensor`.
     y = model(ops.convert_to_tensor(7.))
     self.assertIsInstance(y, Foo)
+    # Confirm that (custom) loss sees `Foo` instance, not Tensor.
+    obtained_prediction_box = [None]
+    def custom_loss(y_obs, y_pred):
+      del y_obs
+      obtained_prediction_box[0] = y_pred
+      return y_pred
+    # Apparently `compile` calls the loss function enough to trigger the
+    # side-effect.
+    model.compile('SGD', loss=custom_loss)
+    self.assertIsInstance(obtained_prediction_box[0], Foo)
 
 
 class ConvertInnerNodeDataTest(test.TestCase):
diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index c7c45f381e9650f3dd9867ee1b7622ca54898656..b34bed5bfe655386bf59445e1c17d99ef39553f3 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import sys
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -43,11 +44,11 @@ def _check_pydot():
     # Attempt to create an image of a blank graph
     # to check the pydot/graphviz installation.
     pydot.Dot.create(pydot.Dot())
-  except Exception:
+    return True
+  except Exception:  # pylint: disable=broad-except
     # pydot raises a generic Exception here,
     # so no specific class can be caught.
-    raise ImportError('Failed to import pydot. You must install pydot'
-                      ' and graphviz for `pydotprint` to work.')
+    return False
 
 
 def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
@@ -63,13 +64,28 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
           'LR' creates a horizontal plot.
 
   Returns:
-      A `pydot.Dot` instance representing the Keras model.
+      A `pydot.Dot` instance representing the Keras model (or None if the Dot
+      file could not be generated).
+
+  Raises:
+    ImportError: if graphviz or pydot are not available.
   """
   from tensorflow.python.keras.layers.wrappers import Wrapper
   from tensorflow.python.keras.models import Sequential
   from tensorflow.python.util import nest
 
-  _check_pydot()
+  check = _check_pydot()
+  if not check:
+    if 'IPython.core.magics.namespace' in sys.modules:
+      # We don't raise an exception here in order to avoid crashing notebook
+      # tests where graphviz is not available.
+      print('Failed to import pydot. You must install pydot'
+            ' and graphviz for `pydotprint` to work.')
+      return
+    else:
+      raise ImportError('Failed to import pydot. You must install pydot'
+                        ' and graphviz for `pydotprint` to work.')
+
   dot = pydot.Dot()
   dot.set('rankdir', rankdir)
   dot.set('concentrate', True)
@@ -78,7 +94,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   if isinstance(model, Sequential):
     if not model.built:
       model.build()
-  layers = model.layers
+  layers = model._layers
 
   # Create graph nodes.
   for layer in layers:
@@ -145,11 +161,26 @@ def plot_model(model,
           a string specifying the format of the plot:
           'TB' creates a vertical plot;
           'LR' creates a horizontal plot.
+
+  Returns:
+      A Jupyter notebook Image object if Jupyter is installed.
+      This enables in-line display of the model plots in notebooks.
   """
   dot = model_to_dot(model, show_shapes, show_layer_names, rankdir)
+  if dot is None:
+    return
   _, extension = os.path.splitext(to_file)
   if not extension:
     extension = 'png'
   else:
     extension = extension[1:]
+  # Save image to disk.
   dot.write(to_file, format=extension)
+  # Return the image as a Jupyter Image object, to be displayed in-line.
+  # Note that we cannot easily detect whether the code is running in a
+  # notebook, and thus we always return the Image if Jupyter is available.
+  try:
+    from IPython import display
+    return display.Image(filename=to_file)
+  except ImportError:
+    pass
diff --git a/tensorflow/python/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/wrappers/scikit_learn.py
index 566f9db5d4459b92b0e707df4bc8a5c391a2e9ae..149ad06f57c23990777a854836c7c8beb352799f 100644
--- a/tensorflow/python/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn.py
@@ -23,6 +23,7 @@ import types
 
 import numpy as np
 
+from tensorflow.python.keras import losses
 from tensorflow.python.keras.models import Sequential
 from tensorflow.python.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras.utils.np_utils import to_categorical
@@ -155,10 +156,8 @@ class BaseWrapper(object):
     else:
       self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
 
-    loss_name = self.model.loss
-    if hasattr(loss_name, '__name__'):
-      loss_name = loss_name.__name__
-    if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
+    if (losses.is_categorical_crossentropy(self.model.loss) and
+        len(y.shape) != 2):
       y = to_categorical(y)
 
     fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
@@ -304,7 +303,7 @@ class KerasClassifier(BaseWrapper):
     if not isinstance(outputs, list):
       outputs = [outputs]
     for name, output in zip(self.model.metrics_names, outputs):
-      if name == 'acc':
+      if name in ['accuracy', 'acc']:
         return output
     raise ValueError('The model is not configured to compute accuracy. '
                      'You should pass `metrics=["accuracy"]` to '
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 40bf4aedd093f5ea13d38bd45868043f39aa145a..e3b4ef3cc69bb531626dd2c77a3b5393fc0b12c4 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -82,6 +82,9 @@ tf_py_test(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
     ],
+    tags = [
+        "no_gpu",  # b/127001953
+    ],
 )
 
 tf_py_test(
@@ -131,6 +134,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -175,6 +179,7 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 tf_py_test(
@@ -241,7 +246,10 @@ tf_py_test(
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
-    tags = ["no_windows"],
+    tags = [
+        "no_gpu",  # b/127001953
+        "no_windows",
+    ],
 )
 
 tf_py_test(
@@ -289,6 +297,20 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
+cuda_py_test(
+    name = "cudnn_determinism_test",
+    size = "small",
+    srcs = ["cudnn_determinism_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:nn_ops",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
 tf_py_test(
     name = "decode_csv_op_test",
     size = "small",
@@ -798,6 +820,7 @@ cuda_py_test(
     size = "small",
     srcs = ["resource_variable_ops_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:client_testlib",
@@ -807,6 +830,8 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
     ],
+    # TODO(b/128347673): Re-enable.
+    tags = ["no_windows"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1082,7 +1107,7 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "summary_ops_test",
     size = "small",
     srcs = ["summary_ops_test.py"],
@@ -1090,15 +1115,24 @@ tf_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary_ops_v2",
+        "@six_archive//:six",
+        "//tensorflow/python:tensor_spec",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1148,6 +1182,35 @@ tf_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "template_mirrored_strategy_test",
+    size = "small",
+    srcs = ["template_mirrored_strategy_test.py"],
+    additional_deps = [
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:template",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+tf_py_test(
+    name = "tridiagonal_solve_op_test",
+    size = "medium",
+    srcs = ["tridiagonal_solve_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+    shard_count = 5,
+)
+
 tf_py_test(
     name = "unicode_script_op_test",
     size = "small",
@@ -1632,7 +1695,6 @@ cuda_py_test(
         "//tensorflow/python:while_v2",
     ],
     shard_count = 16,
-    tags = ["no_gpu"],  # TODO(b/117928656)
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1674,6 +1736,21 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "conv1d_transpose_test",
+    size = "small",
+    srcs = ["conv1d_transpose_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:nn_ops",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -1836,7 +1913,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "functional_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["functional_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2076,6 +2153,29 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
+cuda_py_test(
+    name = "map_fn_test",
+    size = "small",
+    srcs = ["map_fn_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    grpc_enabled = True,
+    shard_count = 2,
+    tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
+)
+
 cuda_py_test(
     name = "pad_op_test",
     size = "small",
@@ -2119,6 +2219,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
+    grpc_enabled = True,
     tags = ["no_windows"],
     xla_enable_strict_auto_jit = True,
 )
@@ -2148,8 +2249,11 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
-    shard_count = 4,
-    tags = ["no_windows_gpu"],
+    shard_count = 6,
+    tags = [
+        "no_oss",
+        "no_windows_gpu",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2164,12 +2268,12 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    shard_count = 3,
     tags = [
         "manual",
         "no_gpu",
         "nogpu",
         "noguitar",
-        "notap",
     ],
     xla_enable_strict_auto_jit = True,
 )
@@ -2283,6 +2387,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["shape_ops_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -2554,6 +2659,8 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     flaky = 1,  # create_local_cluster sometimes times out.
+    shard_count = 10,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2664,7 +2771,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = [
-        "no_gpu",  #  Flaky: b/80127739
+        "no_gpu",  #  Flaky: b/80127739, b/127001953
     ],
     xla_enable_strict_auto_jit = True,
 )
@@ -2713,6 +2820,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = [
+        "no_rocm",
         "optonly",  # flaky timeouts unless optimized
     ],
     xla_enable_strict_auto_jit = True,
@@ -2754,8 +2862,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
-        # TODO(b/118887316): Re-enable this test in Kokoro.
-        "no_oss",
+        "no_rocm",
         "optonly",  # times out
     ],
     xla_enable_strict_auto_jit = True,
@@ -2819,6 +2926,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2838,6 +2946,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops_gen",
     ],
     shard_count = 4,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2847,7 +2956,6 @@ cuda_py_test(
     srcs = ["rnn_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
@@ -2871,6 +2979,34 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
+cuda_py_test(
+    name = "rnn_cell_test",
+    size = "medium",
+    srcs = ["rnn_cell_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    shard_count = 10,
+    xla_enable_strict_auto_jit = True,
+)
+
 cuda_py_test(
     name = "scatter_ops_test",
     size = "medium",  # NOTE: This is not run by default.
@@ -2899,6 +3035,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    tags = ["no_windows"],  # b/126916429
     xla_enable_strict_auto_jit = True,
 )
 
@@ -2994,6 +3131,24 @@ cuda_py_test(
     xla_enable_strict_auto_jit = True,
 )
 
+cuda_py_test(
+    name = "extract_volume_patches_grad_test",
+    size = "medium",
+    srcs = ["extract_volume_patches_grad_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = [
+        "no_pip",
+        "notap",  # http://b/31080670
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
 cuda_py_test(
     name = "stage_op_test",
     size = "medium",
@@ -3039,6 +3194,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_windows"],  # b/126916429
     xla_enable_strict_auto_jit = True,
 )
 
@@ -3069,6 +3225,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     shard_count = 30,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -3207,8 +3364,11 @@ cuda_py_test(
     ],
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
-    tags = ["no_windows"],
-    xla_enable_strict_auto_jit = True,
+    tags = [
+        "no_rocm",  # flaky test
+        "no_windows",
+    ],
+    # b/127344411: xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3236,6 +3396,7 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
@@ -3244,7 +3405,7 @@ cuda_py_test(
         "no_oss",  # b/117185141.
         "nomsan",  # TODO(b/117236102): Re-enable in msan build.
     ],
-    xla_enable_strict_auto_jit = True,
+    # b/127344411: xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3264,7 +3425,7 @@ cuda_py_test(
         "no_windows_gpu",
         "nomsan",
     ],
-    xla_enable_strict_auto_jit = True,
+    # b/127344411: xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3391,6 +3552,7 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:sparse_ops",
     ],
+    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -3556,3 +3718,24 @@ cuda_py_test(
     grpc_enabled = True,
     xla_enable_strict_auto_jit = True,
 )
+
+cuda_py_test(
+    name = "critical_section_test",
+    size = "medium",
+    srcs = ["critical_section_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:critical_section_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index f4c442b7b1932c3ddab0d255f57c3fac5a23954a..cd581cc8351d2ec47e97770279aeaea929d3d47f 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -507,6 +507,22 @@ class StridedSliceChecker(object):
 
   def __getitem__(self, spec):
     op = self.x.__getitem__(spec)
+
+    def eval_if_tensor(x):
+      try:
+        return x.eval()
+      except AttributeError:
+        return x
+
+    if isinstance(spec, bool) or \
+      (isinstance(spec, ops.Tensor) and spec.dtype == dtypes.bool) or \
+      (isinstance(spec, np.ndarray) and spec.dtype == bool) or \
+      (isinstance(spec, (list, tuple)) and np.asarray(spec).dtype == bool):
+      tensor = op.eval()
+      np_spec = eval_if_tensor(spec)
+      self.test.assertAllEqual(self.x_np[np_spec], tensor)
+      return tensor
+
     if not isinstance(spec, (list, tuple)):
       spec = [spec]
 
@@ -515,12 +531,6 @@ class StridedSliceChecker(object):
     # Make a numpy spec that pre-evals the tensors
     np_specs = []
 
-    def eval_if_tensor(x):
-      try:
-        return x.eval()
-      except AttributeError:
-        return x
-
     for s in spec:
       if isinstance(s, slice):
         start = eval_if_tensor(s.start)
@@ -611,6 +621,10 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       _ = checker[-1:0, :, :]
       # empty interval in every dimension
       _ = checker[-1:0, 2:2, 2:3:-1]
+      # empty first dimension only (used to break for aligned tensors).
+      checker = StridedSliceChecker(self,
+                                    StridedSliceChecker.REF_TENSOR_ALIGNED)
+      _ = checker[1:0]
 
   @test_util.run_deprecated_v1
   def testEllipsis(self):
@@ -678,6 +692,10 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
         _ = checker[0.0]
       with self.assertRaisesRegexp(TypeError, expected):
         _ = checker[constant_op.constant(0.0)]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[constant_op.constant([1, 2, 3])]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[[2.1, -0.7, 1.5]]
 
   @test_util.run_deprecated_v1
   def testExpand(self):
@@ -722,6 +740,31 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       # First axis slice
       _ = checker[np.newaxis, 1:]
 
+  def testMasks(self):
+    with self.session(use_gpu=True):
+      scalar = np.array(0)
+      # Test tensor type mask
+      checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
+      _ = checker[checker.x > 2]
+      _ = checker[checker.x <= 5]
+      _ = checker[ops.convert_to_tensor(scalar)]
+
+      # Test numpy array type mask
+      raw = np.array([[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
+                       [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23,
+                                                              24]]]]])
+      checker1 = StridedSliceChecker(self, raw)
+      _ = checker1[raw >= 4]
+      _ = checker1[raw < 19]
+      _ = checker1[scalar]
+
+      # Test boolean and non boolean cases
+      mask = np.array([True, False, True])
+      raw1 = np.array([[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]])
+      checker2 = StridedSliceChecker(self, raw1)
+      _ = checker2[mask]
+      _ = checker2[ops.convert_to_tensor(mask)]
+
 
 class StridedSliceShapeChecker(object):
 
@@ -807,7 +850,7 @@ class GradSliceChecker(object):
     analytic_grad2 = 2 * slice_val
 
     dy = variables.Variable(
-        array_ops.ones(shape=slice_var.get_shape(), dtype=dtypes.float32))
+        array_ops.ones_like(slice_var, dtype=dtypes.float32))
     assign = dy.assign(slice_var)
     slice_val_grad, = gradients_impl.gradients(slice_val, self.var, grad_ys=dy)
     slice_val_grad2, = gradients_impl.gradients(
@@ -821,6 +864,8 @@ class GradSliceChecker(object):
     # compute analytic gradient for slice
     np_val_grad = (2 * self.varnp * self.varnp)
     np_sliceval_grad = np.zeros(self.var.get_shape())
+    if isinstance(spec, ops.Tensor):
+      spec = self.sess.run([spec])
     np_sliceval_grad[spec] = np_val_grad[spec]
     # verify gradient
     self.test.assertAllEqual(slice_val_grad_evaled, np_sliceval_grad)
@@ -838,8 +883,8 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       init = variables.global_variables_initializer()
       sess.run(init)
 
-      grad = GradSliceChecker(self, sess, var,
-                              np.array(range(1, 97, 1)).reshape((6, 4, 4)))
+      raw = np.array(range(1, 97, 1)).reshape((6, 4, 4))
+      grad = GradSliceChecker(self, sess, var, raw)
       _ = grad[2:6:2, 1:3, 1:3]
       _ = grad[3:0:-2, 1:3, 1:3]
       _ = grad[3:0:-2, array_ops.newaxis, 1:3, 2, array_ops.newaxis]
@@ -851,6 +896,11 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "out of bounds"):
         _ = grad[:, 200, :]
 
+      # Test numpy array type mask
+      _ = grad[raw > 51]
+      # Test tensor type mask
+      _ = grad[ops.convert_to_tensor(raw) <= 76]
+
   def testGradientZero(self):
     with self.session(use_gpu=True) as sess:
       var = variables.Variable(8.)
@@ -1050,10 +1100,12 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[None] = [6]  # new axis
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123559667")
   def testSliceAssign(self):
     self.doTestSliceAssign(use_resource=False)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123559667")
   def testSliceAssignResource(self):
     self.doTestSliceAssign(use_resource=True)
 
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index 5d7b2dd30f80e4bea3f8e4bd2782e787f9773008..b84e76472399943279c1f9b680332f69f8ed48d8 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -139,7 +139,6 @@ class AtrousConv2DTest(test.TestCase):
                   y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")  # larger error range
   def testGradient(self):
     with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
@@ -161,7 +160,7 @@ class AtrousConv2DTest(test.TestCase):
                                                       [x_shape, f_shape],
                                                       output, y_shape)
         print("atrous_conv2d gradient err = %g " % err)
-        err_tolerance = 1e-3
+        err_tolerance = 4e-3 if test_util.is_xla_enabled() else 1e-3
         self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index 00dba9996dd909786301d56da41fa037328ba3e5..0b557bda2e3436846df9a4a64c915c33b0d72c68 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
@@ -195,5 +196,46 @@ class ExtractGlimpseTest(test.TestCase):
         expected_rows=[None, None, None, 1, 2, 3, 4],
         expected_cols=[56, 57, 58, 59, 60])
 
+  def testGlimpseNoiseZero(self):
+    # Image:
+    # [  0.   1.   2.   3.   4.]
+    # [  5.   6.   7.   8.   9.]
+    # [ 10.  11.  12.  13.  14.]
+    # [ 15.  16.  17.  18.  19.]
+    # [ 20.  21.  22.  23.  24.]
+    img = constant_op.constant(
+        np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32)
+    with self.test_session():
+      # Result 1:
+      # [ 0.  0.  0.]
+      # [ 0.  0.  0.]
+      # [ 0.  0.  0.]
+      result1 = image_ops.extract_glimpse_v2(
+          img, [3, 3], [[-2, 2]],
+          centered=False,
+          normalized=False,
+          noise='zero')
+      self.assertAllEqual(
+          np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
+          self.evaluate(result1)[0, :, :, 0])
+
+      # Result 2:
+      # [  0.   0.   0.   0.   0.   0.   0.]
+      # [  0.   0.   1.   2.   3.   4.   0.]
+      # [  0.   5.   6.   7.   8.   9.   0.]
+      # [  0.  10.  11.  12.  13.  14.   0.]
+      # [  0.  15.  16.  17.  18.  19.   0.]
+      # [  0.  20.  21.  22.  23.  24.   0.]
+      # [  0.   0.   0.   0.   0.   0.   0.]]
+      result2 = image_ops.extract_glimpse_v2(
+          img, [7, 7], [[0, 0]], normalized=False, noise='zero')
+      self.assertAllEqual(
+          np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0],
+                      [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0],
+                      [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0],
+                      [0, 0, 0, 0, 0, 0, 0]]),
+          self.evaluate(result2)[0, :, :, 0])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 1a8513d022d43e3bd206bc0ab607012d05aef6a9..789f6e90c9f092279966b651a5aad7be1bf9d3f1 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -257,7 +257,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
           if len(results) != 1:
             break
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only('b/126596827 needs graph mode in multiple threads')
   def testConcurrentSessions(self):
     n_threads = 4
     threads = []
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
index 7e0b3e1b5eadc7fe5541612fc607aeb9a135ceb4..8a7d8669d0803c940ac8745627492db17c103311 100644
--- a/tensorflow/python/kernel_tests/batch_gather_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -88,13 +88,13 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
-  @test_util.run_deprecated_v1
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
     with self.cached_session():
       indices_tf = constant_op.constant([1])
-      self.assertAllEqual([[b"qwer", b"uiop"]],
-                          array_ops.batch_gather(params, indices_tf).eval())
+      self.assertAllEqual(
+          [[b"qwer", b"uiop"]],
+          self.evaluate(array_ops.batch_gather(params, indices_tf)))
 
   @test_util.run_deprecated_v1
   def testUnknownIndices(self):
@@ -107,17 +107,17 @@ class GatherTest(test.TestCase, parameterized.TestCase):
     with self.session(use_gpu=False):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0\] = 7 is not in \[0, 2\)"):
-        array_ops.batch_gather(params, [7]).eval()
+        self.evaluate(array_ops.batch_gather(params, [7]))
 
-  @test_util.run_deprecated_v1
   def testEmptySlices(self):
     with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         for itype in np.int32, np.int64:
           params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype)
           indices = np.array([3, 4], dtype=itype)
-          gather = array_ops.batch_gather(params, indices)
-          self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0)))
+          self.assertAllEqual(
+              self.evaluate(array_ops.batch_gather(params, indices)),
+              np.zeros((2, 0, 0)))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index a91f96cf952252bf162e9e708d7b8e5808aad38c..3fa2054847db635a96caedf4d596020ec2137003 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -126,7 +126,7 @@ class BenchmarkTest(test.TestCase):
     self.assertFalse(_ran_somebenchmark_2[0])
     self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123744455")  # GPU memory is incorrect
   def testReportingBenchmark(self):
     tempdir = test.get_temp_dir()
     try:
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 8d04da6dbd7ca6c548349c047c7c4980a04560c6..94e20d93017b07f8c3b5343744537cd7ce08896d 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -35,8 +35,6 @@ class BiasAddTest(test.TestCase):
 
   def _npBias(self, inputs, bias):
     assert len(bias.shape) == 1
-    print(inputs.shape)
-    print(bias.shape)
     assert inputs.shape[-1] == bias.shape[0]
     return inputs + bias.reshape(([1] * (len(inputs.shape) - 1)) +
                                  [bias.shape[0]])
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index d064d736cf253ddf6ebf3ef0f416f449fcf7f565..2168206c3a5d79147d74839858ea9153c579658a 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -30,44 +30,48 @@ from tensorflow.python.platform import googletest
 
 class BincountTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
   def test_empty(self):
     with self.session(use_gpu=True):
-      self.assertAllEqual(
-          math_ops.bincount([], minlength=5).eval(), [0, 0, 0, 0, 0])
-      self.assertAllEqual(math_ops.bincount([], minlength=1).eval(), [0])
-      self.assertAllEqual(math_ops.bincount([], minlength=0).eval(), [])
-      self.assertEqual(
-          math_ops.bincount([], minlength=0, dtype=np.float32).eval().dtype,
-          np.float32)
-      self.assertEqual(
-          math_ops.bincount([], minlength=3, dtype=np.float64).eval().dtype,
-          np.float64)
+      self.assertAllEqual(self.evaluate(math_ops.bincount([], minlength=5)),
+                          [0, 0, 0, 0, 0])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([], minlength=1)),
+                          [0])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([], minlength=0)),
+                          [])
+      self.assertEqual(self.evaluate(math_ops.bincount([], minlength=0,
+                                                       dtype=np.float32)).dtype,
+                       np.float32)
+      self.assertEqual(self.evaluate(math_ops.bincount([], minlength=3,
+                                                       dtype=np.float64)).dtype,
+                       np.float64)
 
-  @test_util.run_deprecated_v1
   def test_values(self):
     with self.session(use_gpu=True):
-      self.assertAllEqual(
-          math_ops.bincount([1, 1, 1, 2, 2, 3]).eval(), [0, 3, 2, 1])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([1, 1, 1, 2, 2, 3])),
+                          [0, 3, 2, 1])
       arr = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
-      self.assertAllEqual(math_ops.bincount(arr).eval(), [0, 5, 4, 3, 2, 1])
+      self.assertAllEqual(self.evaluate(math_ops.bincount(arr)),
+                          [0, 5, 4, 3, 2, 1])
       arr += [0, 0, 0, 0, 0, 0]
-      self.assertAllEqual(math_ops.bincount(arr).eval(), [6, 5, 4, 3, 2, 1])
+      self.assertAllEqual(self.evaluate(math_ops.bincount(arr)),
+                          [6, 5, 4, 3, 2, 1])
 
-      self.assertAllEqual(math_ops.bincount([]).eval(), [])
-      self.assertAllEqual(math_ops.bincount([0, 0, 0]).eval(), [3])
-      self.assertAllEqual(math_ops.bincount([5]).eval(), [0, 0, 0, 0, 0, 1])
-      self.assertAllEqual(
-          math_ops.bincount(np.arange(10000)).eval(), np.ones(10000))
+      self.assertAllEqual(self.evaluate(math_ops.bincount([])), [])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([0, 0, 0])), [3])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([5])),
+                          [0, 0, 0, 0, 0, 1])
+      self.assertAllEqual(self.evaluate(math_ops.bincount(np.arange(10000))),
+                          np.ones(10000))
 
-  @test_util.run_deprecated_v1
   def test_maxlength(self):
     with self.session(use_gpu=True):
-      self.assertAllEqual(math_ops.bincount([5], maxlength=3).eval(), [0, 0, 0])
-      self.assertAllEqual(math_ops.bincount([1], maxlength=3).eval(), [0, 1])
-      self.assertAllEqual(math_ops.bincount([], maxlength=3).eval(), [])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([5], maxlength=3)),
+                          [0, 0, 0])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([1], maxlength=3)),
+                          [0, 1])
+      self.assertAllEqual(self.evaluate(math_ops.bincount([], maxlength=3)),
+                          [])
 
-  @test_util.run_deprecated_v1
   def test_random_with_weights(self):
     num_samples = 10000
     with self.session(use_gpu=True):
@@ -79,9 +83,9 @@ class BincountTest(test_util.TensorFlowTestCase):
         else:
           weights = np.random.random(num_samples)
         self.assertAllClose(
-            math_ops.bincount(arr, weights).eval(), np.bincount(arr, weights))
+            self.evaluate(math_ops.bincount(arr, weights)),
+            np.bincount(arr, weights))
 
-  @test_util.run_deprecated_v1
   def test_random_without_weights(self):
     num_samples = 10000
     with self.session(use_gpu=True):
@@ -90,20 +94,20 @@ class BincountTest(test_util.TensorFlowTestCase):
         arr = np.random.randint(0, 1000, num_samples)
         weights = np.ones(num_samples).astype(dtype)
         self.assertAllClose(
-            math_ops.bincount(arr, None).eval(), np.bincount(arr, weights))
+            self.evaluate(math_ops.bincount(arr, None)),
+            np.bincount(arr, weights))
 
-  @test_util.run_deprecated_v1
   def test_zero_weights(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(
-          math_ops.bincount(np.arange(1000), np.zeros(1000)).eval(),
+          self.evaluate(math_ops.bincount(np.arange(1000), np.zeros(1000))),
           np.zeros(1000))
 
   def test_negative(self):
     # unsorted_segment_sum will only report InvalidArgumentError on CPU
     with self.cached_session():
       with self.assertRaises(errors.InvalidArgumentError):
-        math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
+        self.evaluate(math_ops.bincount([1, 2, 3, -1, 6, 8]))
 
   @test_util.run_deprecated_v1
   def test_shape_function(self):
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 6b04e8abf40dc6fc396581e82b59bc6c4dec2a41..e74193049b1df732e5d986340c16329a207cf2fe 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -896,12 +896,37 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionOnEmptyEnsembleMultiClass(self):
+    """Tests that prediction on empty ensemble does not fail for multiclass."""
+    with self.cached_session() as session:
+      # Create an empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+      logits_dimension = 2
+      expected_logits = [[0.0, 0.0], [0.0, 0.0]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.cached_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -1007,6 +1032,158 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionMultipleTreeMultiClass(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.cached_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.51
+              }
+              vector: {
+                value: 1.14
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 1.29
+              }
+              vector: {
+                value: 8.79
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.33
+              }
+              vector: {
+                value: 7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.2
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.1
+              }
+              vector: {
+                value: 6.0
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 2.0
+              }
+              vector: {
+                value: -7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 6.3
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: (0.51, 1.14), tree 1: (0.2, 5.0), tree 2: (6.3, 5.0)
+      #
+      #            logits = (0.1*0.51+0.2*0.2+1*6.3,
+      #                      0.1*1.14+0.2*5.0+1*5)
+      # Example 2: tree 0: (0.51, 1.14), tree 1: (-4.33, 7.0), tree 2: (2.0, -7)
+      #
+      #            logits = (0.1*0.51+0.2*-4.33+1*2.0,
+      #                      0.1*1.14+0.2*7.0+1*-7)
+      logits_dimension = 2
+      expected_logits = [[6.391, 6.114], [1.185, -5.486]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the predictions work for categorical splits."""
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index 37a60fa0e38c6d45a4ff40fcc3863226ca98e6be..0315456447dec43264e48d918b74ba3bf0e119c5 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -145,7 +145,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -164,7 +164,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
       save.save(sess, save_path)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
@@ -177,7 +177,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -195,7 +195,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
index b9eb2391b490f659bd20e26a2c5b290ab4bfea1b..3c5433cb8990539d28bac70df2e8d589ffd9bb7a 100644
--- a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -66,6 +66,36 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
           v_np = np.broadcast_to(x, output_shape)
           self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeInnerDim(self):
+    input_shape = [2, 1, 3]
+    output_shape = [2, 5, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2]
+    output_shape = [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 15, 3, 2, 2, 2]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim2(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
   @test_util.run_deprecated_v1
   def testBroadcastToScalar(self):
     with self.session(use_gpu=True):
@@ -78,8 +108,9 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
   def testBroadcastScalarToNonScalar(self):
     with self.session(use_gpu=True):
       x = np.array(1.0, dtype=np.float)
-      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4])
-      v_np = np.broadcast_to(x, [2, 3, 4])
+      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4,
+                                                                1, 1, 1])
+      v_np = np.broadcast_to(x, [2, 3, 4, 1, 1, 1])
       self.assertAllEqual(v_tf.eval(), v_np)
 
   @test_util.run_deprecated_v1
@@ -130,14 +161,26 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testGradientWithBroadcastAllDimensions(self):
-    x = constant_op.constant([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32)
-    v = array_ops.broadcast_to(x, [5, 4, 6])
+    x = constant_op.constant([1], dtype=dtypes.float32)
+    v = array_ops.broadcast_to(x, [5, 2, 3])
     out = 2 * v
     with self.cached_session():
       err = gradient_checker.compute_gradient_error(x, x.get_shape(),
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
+  def testGradientWithLargeDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    x = constant_op.constant(np.array(np.random.randn(*input_shape),
+                                      dtype=np.float32))
+    v = array_ops.broadcast_to(x, output_shape)
+    out = 2 * v
+    with self.cached_session():
+      err = gradient_checker.compute_gradient_error(x, x.get_shape(),
+                                                    out, out.get_shape())
+    self.assertLess(err, 1e-4)
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index b3187e1637193a8b34f7f3668220d94d783b6170..e9be8e7d5f73c9ea6f7a0fe15d84ecba7201156b 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -157,7 +157,7 @@ class CastOpTest(test.TestCase):
       # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
       # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
       # Tensorflow link to relevant discussion - https://github.com/tensorflow/tensorflow/issues/9360
-      if platform.machine() == "ppc64le":
+      if platform.machine() == "ppc64le" or platform.machine() == "aarch64":
         self._compare(-np.inf, np.int32, i4.min, False)
         self._compare(-np.inf, np.int64, i8.min, False)
       else:
@@ -169,8 +169,13 @@ class CastOpTest(test.TestCase):
     self._compare(-np.inf, np.int64, i8.min, False)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float32, False)), True)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float64, False)), True)
-    self._compare(np.nan, np.int32, i4.min, False)
-    self._compare(np.nan, np.int64, i8.min, False)
+    # np.float64(np.nan).astype(np.int32) is 0 on ARM
+    if platform.machine() == "aarch64":
+      self._compare(np.nan, np.int32, 0, False)
+      self._compare(np.nan, np.int64, 0, False)
+    else:
+      self._compare(np.nan, np.int32, i4.min, False)
+      self._compare(np.nan, np.int64, i8.min, False)
 
     self._compare(np.inf, np.float32, np.inf, True)
     self._compare(np.inf, np.float64, np.inf, True)
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index d5f3696a9dc8a86e8a6fb75a4c59f9accf279ba9..7d00919cc8a9927c3e8d05b1c92aa89c8fb54ad9 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -889,8 +889,8 @@ class EnsureShapeTest(test.TestCase):
 
   # Dynamic shape check
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Dynamic shapes not supported now with XLA
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -904,8 +904,8 @@ class EnsureShapeTest(test.TestCase):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Dynamic shapes not supported now with XLA
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index abb71a672c13dd62eda24f0b0e31c7625ea6727a..2305c0b568ee6220dab8dd9be8b7bda339b9f082 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -163,7 +163,9 @@ class CholeskyOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(tensor3)
 
-  @test_util.disable_xla("This test never passed for XLA")  # all nan on XLA
+  # The below invalid Cholesky call returns an error with TF Classic and just
+  # returns NaNs with XLA.
+  @test_util.disable_xla("b/123337890")
   def testNotInvertibleCPU(self):
     # The input should be invertible.
     with self.session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 45f1e6152a2a335a83dec1f385354df123a192bf..9dfe61c642a6aeefb6ae92b03a61002e68a0bcec 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -460,7 +460,7 @@ class ClipTest(test.TestCase):
       clip_norm = constant_op.constant(0.8)
       with_norm = clip_ops.clip_by_average_norm(x, clip_norm)
       without_norm = clip_ops.clip_by_norm(
-          x, clip_norm * math_ops.to_float(array_ops.size(x)))
+          x, clip_norm * math_ops.cast(array_ops.size(x), dtypes.float32))
       clip_by_average_norm_ans = self.evaluate(with_norm)
       clip_by_norm_ans = self.evaluate(without_norm)
       self.assertAllClose(clip_by_average_norm_ans, clip_by_norm_ans)
diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
index 215ea97f36d5fc72581f1ad96e7e68166e12e08c..1088e903109a2c80ae7c1ae7337e758a85911255 100644
--- a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -31,15 +30,14 @@ class CompareAndBitpackTest(test.TestCase):
                              x, threshold,
                              truth,
                              expected_err_re=None):
-    with test_util.use_gpu():
-      ans = math_ops.compare_and_bitpack(x, threshold)
-      if expected_err_re is None:
-        tf_ans = self.evaluate(ans)
-        self.assertShapeEqual(truth, ans)
-        self.assertAllEqual(tf_ans, truth)
-      else:
-        with self.assertRaisesOpError(expected_err_re):
-          self.evaluate(ans)
+    ans = math_ops.compare_and_bitpack(x, threshold)
+    if expected_err_re is None:
+      tf_ans = self.evaluate(ans)
+      self.assertShapeEqual(truth, ans)
+      self.assertAllEqual(tf_ans, truth)
+    else:
+      with self.assertRaisesOpError(expected_err_re):
+        self.evaluate(ans)
 
   def _testBasic(self, dtype):
     rows = 371
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index a968b061270ae00ddcb056f73cad3b215e413d1d..7e37785344391364b2e5d8ea54170e68659335dc 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -642,7 +641,6 @@ class ConcatOpTest(test.TestCase):
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
@@ -686,8 +684,7 @@ class ConcatOffsetTest(test.TestCase):
       self.evaluate(off)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla(
-      "This test never passed for XLA")  # Different error message on XLA
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testSizeMismatch(self):
     cdim = constant_op.constant(1, dtypes.int32)
     s0 = constant_op.constant([2, 3, 5], dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 356c6f0a16f96478fac746e1449e2b188d763201..e1c991d799935c71c60661cfcad2efe2cbe15c92 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -85,6 +85,21 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  def testExternalControlDependencies(self):
+    with ops.Graph().as_default(), self.test_session():
+      v = variables.Variable(1.0)
+      v.initializer.run()
+      op = v.assign_add(1.0)
+
+      def true_branch():
+        with ops.control_dependencies([op]):
+          return 1.0
+
+      cond_v2.cond_v2(array_ops.placeholder_with_default(False, None),
+                      true_branch,
+                      lambda: 2.0).eval()
+      self.assertAllEqual(self.evaluate(v), 2.0)
+
   @test_util.run_deprecated_v1
   def testMultipleOutputs(self):
     x = constant_op.constant(1.0, name="x")
@@ -785,8 +800,8 @@ class CondV2Test(test.TestCase):
       return ((x,), y * 3.0)
 
     with self.assertRaisesRegexp(
-        ValueError, "Outputs of true_fn and false_fn must"
-        " have the same structure"):
+        TypeError, "true_fn and false_fn arguments to tf.cond must have the "
+        "same number, type, and overall structure of return values."):
       control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
 
   @test_util.enable_control_flow_v2
@@ -814,6 +829,8 @@ class CondV2Test(test.TestCase):
         self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
 
   @test_util.enable_control_flow_v2
+  @test_util.disable_xla(
+      "b/127846988: No tf2xla kernel for IfOp taking DT_VARIANT")
   def testCondAndTensorArrayInDefun(self):
 
     @function.defun
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index 0ea5b1f5d8c35a1d5f7e883872475fdeb97688c6..cd6bd29e0de01aed9bc7a1954b3427923b283373 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
@@ -173,7 +172,7 @@ class ConfusionMatrixTest(test.TestCase):
   def testWeighted(self):
     labels = np.arange(5, dtype=np.int32)
     predictions = np.arange(5, dtype=np.int32)
-    weights = constant_op.constant(np.arange(5, dtype=np.int32))
+    weights = np.arange(5, dtype=np.int32)
 
     truth = np.asarray(
         [[0, 0, 0, 0, 0],
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 104e9e82eee929cd6941a11b1022375389c60d22..381372a585c2df9c396942413d2fc4b9d960efa4 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -53,16 +53,19 @@ from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
@@ -70,6 +73,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2  # pylint: disable=unused-import
 # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_factory_ops
 import tensorflow.python.ops.tensor_array_grad
 # pylint: enable=unused-import
 from tensorflow.python.platform import test
@@ -427,23 +431,49 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.run_v1_only("b/120545219")
+  def testCondMismatchedIndexedSlices(self):
+    @def_function.function
+    def foo():
+      values = constant_op.constant(10)
+      indices = constant_op.constant(0)
+      x = ops.IndexedSlices(values, indices)
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
+      with self.assertRaisesRegexp(
+          TypeError,
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
+        control_flow_ops.cond(
+            constant_op.constant(True),
+            lambda: ops.IndexedSlices(math_ops.add(x.values, 1), indices),
+            lambda: math_ops.add(x.values, 1), indices)
+    foo()
+
   def testCondSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-      pred = math_ops.less(1, 2)
-      fn1 = lambda: sparse_tensor.SparseTensor(
-          indices + 1, x.values + 1, dense_shape=shape)
-      fn2 = lambda: sparse_tensor.SparseTensor(
-          indices, x.values - 1, dense_shape=shape)
-      r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([3.0, 5.0], r.values)
-      self.assertAllEqual([[1], [4]], r.indices)
-      self.assertAllEqual(r.values.get_shape(), (2,))
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: sparse_tensor.SparseTensor(
+        indices + 1, x.values + 1, dense_shape=shape)
+    fn2 = lambda: sparse_tensor.SparseTensor(
+        indices, x.values - 1, dense_shape=shape)
+    r = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3.0, 5.0], r.values)
+    self.assertAllEqual([[1], [4]], r.indices)
+    self.assertAllEqual(r.values.get_shape(), (2,))
+
+  def testCondRaggedTensor(self):
+    rt = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: array_ops.concat([rt + 2, [[100]]], axis=0)
+    fn2 = lambda: rt[:2] - 2
+    result = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3, 4, 5, 6, 7, 8, 100], result.values)
+    self.assertAllEqual([0, 2, 3, 6, 7], result.row_splits)
 
   @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
@@ -468,7 +498,7 @@ class ControlFlowTest(test.TestCase):
         pred = array_ops.placeholder(dtypes.bool, [])
         x = constant_op.constant([1.0, 2.0, 3.0])
         y = control_flow_ops.cond(
-            pred, lambda: functional_ops.map_fn(lambda z: z * 2.0, x),
+            pred, lambda: map_fn.map_fn(lambda z: z * 2.0, x),
             lambda: constant_op.constant([1.0, 1.0, 1.0]))
         g = gradients_impl.gradients(y, x)[0]
 
@@ -550,6 +580,7 @@ class ControlFlowTest(test.TestCase):
       result = self.evaluate(r)
     self.assertAllEqual(12, result)
 
+  @test_util.disable_xla("b/128638446")
   @test_util.run_in_graph_and_eager_modes
   def testCondPruning(self):
     v1 = variables.Variable(7)
@@ -624,6 +655,98 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       self.assertAllEqual([11, 12], self.evaluate(r))
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testCond_Device(self):
+    x = constant_op.constant(-10.)
+
+    # True branch function defined outside of device scope
+    def true_fn():
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.cond(
+          constant_op.constant(True), true_fn, lambda: 0.)
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
+  def _count_matching_switch_nodes_on_device(self, run_metadata, device_str):
+    # Returns the number of Switch nodes with type float32 placed on
+    # `device_str`.
+    device_graphs = [
+        g for g in run_metadata.partition_graphs
+        if device_str in g.node[0].device
+    ]
+    self.assertLen(device_graphs, 1)
+    switch_nodes = [
+        n for n in device_graphs[0].node if n.op == "Switch" and
+        n.attr["T"].type == dtypes.float32.as_datatype_enum
+    ]
+    return len(switch_nodes)
+
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testCondSwitchColocatedWithInputWhenInputOnCPU(self):
+    x = array_ops.placeholder(dtypes.float32)
+
+    # `arg` is used in the cond then branch so a Switch node is created for it.
+    # We test that the Switch node gets placed on the same device as `arg`.
+    # We force `arg` to be on CPU here.
+    with ops.device("CPU:0"):
+      arg = x + 10.
+
+    def true_fn():
+      with ops.device("CPU:0"):
+        return arg + 1
+
+    r = control_flow_ops.cond(constant_op.constant(True), true_fn, lambda: 0.)
+
+    with session.Session() as sess:
+      run_metadata = config_pb2.RunMetadata()
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      sess.run(
+          r, feed_dict={x: -10.}, options=options, run_metadata=run_metadata)
+      self.assertEqual(len(run_metadata.partition_graphs), 2)
+      # Check that the Switch for `arg` gets placed on CPU.
+      self.assertEqual(
+          self._count_matching_switch_nodes_on_device(run_metadata, "CPU"), 1)
+      self.assertEqual(
+          self._count_matching_switch_nodes_on_device(run_metadata, "GPU"), 0)
+
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testCondSwitchColocatedWithInputWhenInputOnGPU(self):
+    x = array_ops.placeholder(dtypes.float32)
+
+    # `arg` is used in the cond then branch so a Switch node is created for it.
+    # We test that the Switch node gets placed on the same device as `arg`.
+    # Note: `arg` gets placed on GPU by default by the placer.
+    arg = x + 10.
+
+    def true_fn():
+      with ops.device("CPU:0"):
+        return arg + 1
+
+    r = control_flow_ops.cond(constant_op.constant(True), true_fn, lambda: 0.)
+
+    with session.Session() as sess:
+      run_metadata = config_pb2.RunMetadata()
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      sess.run(
+          r, feed_dict={x: -10.}, options=options, run_metadata=run_metadata)
+      self.assertEqual(len(run_metadata.partition_graphs), 2)
+      # Check that the Switch for `arg` gets placed on GPU.
+      self.assertEqual(
+          self._count_matching_switch_nodes_on_device(run_metadata, "CPU"), 0)
+      self.assertEqual(
+          self._count_matching_switch_nodes_on_device(run_metadata, "GPU"), 1)
+
   def testCondListOutput(self):
     with self.cached_session() as sess:
       x = constant_op.constant(10)
@@ -704,12 +827,12 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
       fn2 = lambda: {"c": y, "d": y}
       v1_msg = "The two structures don't have the same nested structure"
-      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
       with self.assertRaisesRegexp(
-          ValueError,
+          TypeError if control_flow_util.ENABLE_CONTROL_FLOW_V2 else ValueError,
           v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
-        r = control_flow_ops.cond(pred, fn1, fn2)
-        self.evaluate(r)
+        control_flow_ops.cond(pred, fn1, fn2)
 
   @test_util.run_deprecated_v1
   def testCondRef(self):
@@ -726,7 +849,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
       self.assertAllEqual([2.0], self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.disable_control_flow_v2("b/79881896 (placeholder)")
   @test_util.run_v1_only("b/120545219")
   def testCondWithControl(self):
     with self.cached_session():
@@ -914,6 +1037,69 @@ class ControlFlowTest(test.TestCase):
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_deprecated_v1
+  def testCondGrad_ResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x = constant_op.constant(1.0)
+    r = control_flow_ops.cond(
+        constant_op.constant(True),
+        lambda: x * math_ops.reduce_sum(var.sparse_read([1, 2])),
+        lambda: constant_op.constant(np.zeros((2, 3)),
+                                     dtype=dtypes.float32))
+    grad = gradients_impl.gradients(r, var)[0]
+
+    self.evaluate(variables.global_variables_initializer())
+    grad_val = self.evaluate(grad)
+    self.assertIsInstance(grad_val, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad_val), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0.]])
+
+  @test_util.disable_xla("b/128643464")
+  def testCondGrad_MultiGather(self):
+    # NOTE(skyewm): this test is interesting because the array_ops.gather and
+    # ResourceVariable.sparse_read gradient functions returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x1 = constant_op.constant(np.ones((3, 3), dtype=np.float32))
+    x2 = constant_op.constant(2.0)
+
+    def true_fn():
+      y1 = var.sparse_read([1, 2])
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = x2 * [1., 1., 1.]
+      return y1, y2, y3
+
+    def false_fn():
+      y1 = np.zeros((2, 2), dtype=np.float32)
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = array_ops.gather(x1, [2])
+      return y1, y2, y3
+
+    @def_function.function
+    def foo():
+      r = control_flow_ops.cond(constant_op.constant(True), true_fn, false_fn)
+      return gradients_impl.gradients(r, [var, x1, x2])
+
+    grad = foo()
+    self.evaluate(variables.global_variables_initializer())
+    var_grad, x1_grad, x2_grad = self.evaluate(grad)
+    self.assertIsInstance(var_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var_grad), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(x1_grad), [[0., 0., 0.],
+                                                                 [0., 0., 0.],
+                                                                 [2., 2., 2.]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertEqual(gradient_checker_v2._to_numpy(x2_grad), 6.)
+
   @test_util.run_v1_only("b/120545219")
   def testCondPredicateTensor(self):
     """Regression test for lowering predicate from non-first output of an op."""
@@ -925,6 +1111,32 @@ class ControlFlowTest(test.TestCase):
     r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
     self.assertEqual(self.evaluate(r), 1.0)
 
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedConstantPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = constant_op.constant(True)
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertEqual(0.0, sess.run(result))
+
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedPlaceholderWithDefaultPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = array_ops.placeholder_with_default(
+          constant_op.constant(True), [])
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertAllEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertAllEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertAllEqual(0.0, sess.run(result))
+
+  @test_util.disable_xla("b/128644469 PrintV2")
   @test_util.run_in_graph_and_eager_modes
   def testCondAutoControlDeps(self):
 
@@ -963,7 +1175,8 @@ class ControlFlowTest(test.TestCase):
 
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(cond()), 10)
-      self.assertEqual(printed.contents(), "A\nB\nC\n")
+      self.assertTrue(printed.contents().endswith("A\nB\nC\n"),
+                      printed.contents())
 
       @eager_function.defun
       def nested_cond():
@@ -971,7 +1184,8 @@ class ControlFlowTest(test.TestCase):
 
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(nested_cond()), 10)
-      self.assertEqual(printed.contents(), "A\nB\nC\n")
+      self.assertTrue(printed.contents().endswith("A\nB\nC\n"),
+                      printed.contents())
 
     # wrap_function should prune.
     def pruned_cond():
@@ -990,6 +1204,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(self.evaluate(pruned_nested_cond()), 10)
     self.assertEqual(printed.contents(), "C\n")
 
+  @test_util.disable_xla("b/128643646 PrintV2")
   @test_util.run_in_graph_and_eager_modes
   def testWhileAutoControlDeps(self):
     # Legacy while_loop fails this test because it produces deprecation notices
@@ -1020,11 +1235,13 @@ class ControlFlowTest(test.TestCase):
       with self.cached_session():
         with self.captureWritesToStream(sys.stderr) as printed:
           self.assertEqual(self.evaluate(build_while()[0]), 2)
-        self.assertEqual(printed.contents(), "D\nD\n")
+        self.assertTrue(printed.contents().endswith("D\nD\n"),
+                        printed.contents())
 
         with self.captureWritesToStream(sys.stderr) as printed:
           self.assertEqual(self.evaluate(build_nested_while()[0]), 2)
-        self.assertEqual(printed.contents(), "D\nD\n")
+        self.assertTrue(printed.contents().endswith("D\nD\n"),
+                        printed.contents())
 
     # In defuns, all prints should execute in program order.
     @eager_function.defun
@@ -1033,7 +1250,8 @@ class ControlFlowTest(test.TestCase):
 
     with self.captureWritesToStream(sys.stderr) as printed:
       self.assertEqual(self.evaluate(while_loop()), 2)
-    self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+    self.assertTrue(printed.contents().endswith("A\nB\nC\nD\nA\nB\nC\nD\nA\n"),
+                    printed.contents())
 
     @eager_function.defun
     def nested_while_loop():
@@ -1043,7 +1261,9 @@ class ControlFlowTest(test.TestCase):
     if not context.executing_eagerly():
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(nested_while_loop()), 2)
-      self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+      self.assertTrue(
+          printed.contents().endswith("A\nB\nC\nD\nA\nB\nC\nD\nA\n"),
+          printed.contents())
 
     # wrap_function should prune.
     def pruned_while():
@@ -1052,7 +1272,7 @@ class ControlFlowTest(test.TestCase):
 
     with self.captureWritesToStream(sys.stderr) as printed:
       self.assertEqual(self.evaluate(pruned_while()), 2)
-    self.assertEqual(printed.contents(), "D\nD\n")
+    self.assertTrue(printed.contents().endswith("D\nD\n"), printed.contents())
 
     def pruned_nested_while():
       return build_nested_while()[0]
@@ -1062,7 +1282,7 @@ class ControlFlowTest(test.TestCase):
     if not context.executing_eagerly():
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(pruned_nested_while()), 2)
-      self.assertEqual(printed.contents(), "D\nD\n")
+      self.assertTrue(printed.contents().endswith("D\nD\n"), printed.contents())
 
   # Microbenchmark: 256,000 iterations/s.
   def testWhile_1(self):
@@ -1073,7 +1293,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependencies(self):
     with self.cached_session():
@@ -1090,7 +1309,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(result, 2)
       self.assertAllEqual(v.read_value(), 1.0)
 
-  @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependenciesNoInput(self):
     with self.cached_session():
@@ -1266,9 +1484,11 @@ class ControlFlowTest(test.TestCase):
           r"while loop context '' \(currently defined in 'cond/.+'\)"):
         _ = gradients_impl.gradients(loop, v)
 
-  @test_util.disable_control_flow_v2("b/118457764")
   @test_util.run_v1_only("b/120545219")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/128646372, b/128645947 fails in opensource build")
+
     v = constant_op.constant(1.0)
 
     p = array_ops.placeholder(dtype=dtypes.int32)
@@ -1311,12 +1531,14 @@ class ControlFlowTest(test.TestCase):
 
     with self.session(use_gpu=False) as sess:
       opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata_without_xla_context = config_pb2.RunMetadata()
       run_metadata = config_pb2.RunMetadata()
 
       final_value_without_xla_context = sess.run(
-          final_without_xla_context, feed_dict={
-              p: [0, 0, 0]
-          })
+          final_without_xla_context,
+          feed_dict={p: [0, 0, 0]},
+          options=opts,
+          run_metadata=run_metadata_without_xla_context)
 
       final_value_with_xla_context = sess.run(
           final_with_xla_context,
@@ -1324,12 +1546,21 @@ class ControlFlowTest(test.TestCase):
           options=opts,
           run_metadata=run_metadata)
 
-      node_stats = run_metadata.step_stats.dev_stats[0].node_stats
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        # With while_v2 on xla, run_metadata only contains the unlowered While
+        # op so node_stats does not have statistics for the pushes. So as a
+        # loose check we check the pushes in the lowered version.
+        node_stats = run_metadata_without_xla_context.step_stats.dev_stats[
+            0].node_stats
+        stack_push_op = "TensorListPushBack"
+      else:
+        node_stats = run_metadata.step_stats.dev_stats[0].node_stats
+        stack_push_op = "StackPushV2"
       stack_push_count = len(
-          [x for x in node_stats if x.node_name.endswith("StackPushV2")])
+          [x for x in node_stats if x.node_name.endswith(stack_push_op)])
       # Pushes to the stack = product of maximum_iterations values;
       # the last two "3"s comes from size(p), when p == [0, 0, 0].
-      self.assertEqual(stack_push_count, 5 * 3 * 3)
+      self.assertEqual(stack_push_count, 5 * 3 * 3, str(node_stats))
 
       self.assertAllClose(final_value_with_xla_context,
                           final_value_without_xla_context)
@@ -1404,6 +1635,26 @@ class ControlFlowTest(test.TestCase):
       result = r[2]
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testWhile_Device(self):
+
+    # Body function defined outside of device scope
+    def body(x):
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.while_loop(
+          lambda x: x < 10, body, [constant_op.constant(-10.)])
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
   @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
@@ -1519,35 +1770,95 @@ class ControlFlowTest(test.TestCase):
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      i = constant_op.constant(0)
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-
-      def c(i, _):
-        return i < 10
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    i = constant_op.constant(0)
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+    def c(i, _):
+      return i < 10
+
+    def b1(i, x):  # modifies values.  (shape of components is not changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
+      ]
 
-      def b(i, x):
-        return [
-            i + 1,
-            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
-        ]
+    def b2(i, x):  # adds new values.  (shape of components is changed.)
+      return [
+          i + 1,
+          sparse_ops.sparse_add(
+              x,
+              sparse_tensor.SparseTensor(
+                  indices=math_ops.cast(
+                      array_ops.fill([1, 1], i), dtypes.int64),
+                  values=array_ops.fill([1], 1.0),
+                  dense_shape=x.dense_shape))
+      ]
 
-      _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0], 1)
+    def b3(i, x):  # modifies rank.  (shape of all components is changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(
+              array_ops.concat([x.indices, [[i], [i]]], axis=1), x.values * 2.0,
+              array_ops.concat([x.dense_shape, [10]], axis=0))
+      ]
 
+    # Default shape invariant; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b2 adds new values
+    _, r = control_flow_ops.while_loop(c, b2, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b3 modifies rank (which is not allowed).
+    with self.assertRaises(ValueError):
+      _, r = control_flow_ops.while_loop(c, b3, [i, x])
+
+    # Explicit shape invariant, allowing any rank; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, allowing any rank; b3 modifies rank.
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Shape invariant with ndims=None.  Technically, this isn't supported
+    # according to the docs, but we support it for backwards compatibility.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, with a specific (incompatible) rank.
+    with self.assertRaisesRegexp(ValueError, "is not compatible with"):
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x],
-          [i.get_shape(), tensor_shape.TensorShape([None])])
-      self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
-
-      with self.assertRaisesRegexp(ValueError, "is not compatible with"):
-        _, r = control_flow_ops.while_loop(
-            c, b, [i, x],
-            [i.get_shape(), tensor_shape.TensorShape([5])])
+          c, b1, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
   @test_util.run_v1_only("b/120545219")
@@ -1583,6 +1894,69 @@ class ControlFlowTest(test.TestCase):
             c, b, [i, x],
             [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensor(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    c = lambda i, _: i < 10
+
+    def b1(i, x):  # Adds new values to rows (but doesn't create new rows)
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=1)
+      ]
+
+    def b2(i, x):  # Adds new rows.
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=0)
+      ]
+
+    # Default shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [4])
+
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Default shape invariant; b2 adds new rows (not allowed).
+    if not context.executing_eagerly():
+      with self.assertRaises(ValueError):
+        _, r = control_flow_ops.while_loop(c, b2, [i, x])
+
+    # Explicit shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([4], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Explicit shape invariant; b2 adds new rows.
+    _, r = control_flow_ops.while_loop(
+        c, b2, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([3 * 2**10 + 1], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensorRaggedRank2(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[[1, 2], [3], [4, 5, 6]],
+                                     [[], [8, 9, 10]]])
+    c = lambda i, _: i < 10
+    def b(i, x):
+      return [
+          i + 1,
+          array_ops.concat([x, x[..., i:i+1]], axis=-1)
+      ]
+    _, r = control_flow_ops.while_loop(c, b, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [3])
+    self.assertTrue(r.values.row_splits.shape.as_list() in ([6], [None]))
+    self.assertTrue(r.values.values.shape.as_list() in ([49], [None]))
+
   def _testNestedWhile_1(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
@@ -1693,7 +2067,6 @@ class ControlFlowTest(test.TestCase):
             lambda x: x < 10, lambda x: x + array_ops.identity(c), [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
-  @test_util.disable_control_flow_v2("b/79881896 (control_deps)")
   @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_5(self):
     with self.cached_session() as sess:
@@ -2307,6 +2680,199 @@ class ControlFlowTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(216.0, g[0])
 
+  def testWhileGrad_EagerResourceVariable(self):
+    with context.eager_mode():
+      a = resource_variable_ops.ResourceVariable(
+          np.ones([2, 2], dtype=np.float32))
+      v = constant_op.constant(1.0)
+
+      @eager_function.defun
+      def fn():
+        r = control_flow_ops.while_loop(
+            lambda i, _: i < 2,
+            lambda i, x: (i + 1, x * math_ops.reduce_sum(a) * v),
+            [0, 1.0])[1]
+        return gradients_impl.gradients(r, [v])[0]
+
+      self.assertEqual(self.evaluate(fn()), 32.)
+
+  @test_util.disable_xla("b/128643381")
+  def testWhileGrad_ResourceVarInFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + math_ops.reduce_sum(var.sparse_read([1, 3]))
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 2., 3., 4.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 2., 0., 2.])
+
+  @test_util.disable_xla("b/128643461")
+  def testWhileGrad_ResourceVarInNestedFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + math_ops.reduce_sum(var.sparse_read([1, 3]))
+
+    @def_function.function
+    def foo2(x, var):
+      return foo(x, var)
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo2(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 1., 1., 1.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 2., 0., 2.])
+
+  def testWhileGrad_ResourceVarInLoopInFunctionCall(self):
+    if test.is_gpu_available():
+      self.skipTest("b/128635252")
+
+    @def_function.function
+    def foo(x, var):
+      return control_flow_ops.while_loop(
+          lambda j, _: j < 3,
+          lambda j, y: (j + 1,
+                        y + math_ops.reduce_sum(var.sparse_read([1, 2]))),
+          [0, x])[1]
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 1., 1., 1.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 6., 6., 0.])
+
+  @test_util.disable_xla("b/128639858")
+  def testWhileCondGrad_ResourceVarInFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + var.sparse_read([1])[0]
+
+    def body(i, x):
+      return (i + 1, control_flow_ops.cond(
+          math_ops.equal(i % 2, 0),
+          lambda: foo(x, var1),
+          lambda: foo(x, var2)))
+
+    @def_function.function
+    def bar(var1, var2):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 4, body, [0, 0.0])
+      return gradients_impl.gradients(r, [var1, var2])
+
+    var1 = resource_variable_ops.ResourceVariable([1., 2., 3.])
+    var2 = resource_variable_ops.ResourceVariable([4., 5.])
+    self.evaluate(variables.global_variables_initializer())
+    grads = self.evaluate(bar(var1, var2))
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grads[0]), [0., 2., 0.])
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grads[1]), [0., 2.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_ResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns an IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(np.ones(5),
+                                                 dtype=dtypes.float32)
+    r = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, x * math_ops.reduce_sum(var.sparse_read([1, 3]))),
+        [0, constant_op.constant(1.0)])[1]
+    grad = gradients_impl.gradients(r, var)[0]
+
+    self.evaluate(variables.global_variables_initializer())
+    grad_val = self.evaluate(grad)
+    self.assertIsInstance(grad_val, ops.IndexedSlicesValue)
+    arr = gradient_checker_v2._to_numpy(grad_val)
+    self.assertAllEqual(arr, [0., 12., 0., 12., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_MultiResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns an IndexedSlices.
+    var1 = resource_variable_ops.ResourceVariable(np.ones(5),
+                                                  dtype=dtypes.float32)
+    var2 = resource_variable_ops.ResourceVariable(np.ones(3),
+                                                  dtype=dtypes.float32)
+    x1_init = constant_op.constant([0., 0.])
+    x2_init = constant_op.constant(1.)
+    x3_init = constant_op.constant(1.)
+
+    def body(i, unused_x1, x2, x3):
+      y1 = var1.sparse_read([1, 3])
+      y2 = x2 * 2
+      y3 = x3 * math_ops.reduce_sum(var2.sparse_read([0]))
+      return i + 1, y1, y2, y3
+
+    r = control_flow_ops.while_loop(
+        lambda i, x1, x2, x3: i < 3, body,
+        [0, x1_init, x2_init, x3_init])[1:]
+    var1_grad, var2_grad = gradients_impl.gradients(r, [var1, var2])
+
+    self.evaluate(variables.global_variables_initializer())
+    var1_grad_val = self.evaluate(var1_grad)
+    var2_grad_val = self.evaluate(var2_grad)
+    self.assertIsInstance(var1_grad_val, ops.IndexedSlicesValue)
+    self.assertIsInstance(var2_grad_val, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var1_grad_val),
+                        [0., 1., 0., 1., 0.])
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var2_grad_val),
+                        [3., 0., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_Gather(self):
+    # NOTE(skyewm): this test is interesting because the gather gradient
+    # function returns an IndexedSlices.
+    x = constant_op.constant([1., 1., 1., 1., 1.])
+    y = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, x + array_ops.gather(x, [0])),
+        [0, x[:1]])[1]
+    z = y * 3.0
+    grad = gradients_impl.gradients(z, x)[0]
+    self.assertEqual(self.evaluate(y), 8.)
+    self.assertAllEqual(self.evaluate(grad), [24., 0., 0., 0., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_GatherNoFanOut(self):
+    # NOTE(skyewm): this test is interesting because the gather gradient
+    # function returns an IndexedSlices.
+    x = constant_op.constant([1., 1., 1., 1., 1.])
+    y = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, array_ops.gather(x, [0])),
+        [0, x[:1]])[1]
+    z = y * 3.0
+    grad = gradients_impl.gradients(z, x)[0]
+    self.assertEqual(self.evaluate(y), 1.)
+    self.assertAllEqual(self.evaluate(grad), [3., 0., 0., 0., 0.])
+
   @test_util.run_v1_only("b/120545219")
   def testWhileGradInCond(self):
 
@@ -2379,8 +2945,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
-  @test_util.disable_xla("This test never passed for XLA"
-                        )  # Resource variable issue for ControlFlowV2
   @test_util.run_gpu_only
   def testGpuResourceAccess(self):
     with ops.device(test.gpu_device_name()):
@@ -2399,6 +2963,7 @@ class ControlFlowTest(test.TestCase):
     self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(foo()), 9.0)
 
+  @test_util.disable_xla("b/128643398")
   def testNestedResourceAccess(self):
     var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
 
@@ -2723,10 +3288,10 @@ class ControlFlowTest(test.TestCase):
 
       def inner_loop(t):
         fn = lambda n: n + math_ops.square(var)
-        return functional_ops.map_fn(fn=fn, elems=t, parallel_iterations=10)
+        return map_fn.map_fn(fn=fn, elems=t, parallel_iterations=10)
 
       def outer_loop(inp):
-        return functional_ops.map_fn(
+        return map_fn.map_fn(
             fn=inner_loop, elems=inp, parallel_iterations=10)
 
       var = variables.Variable(constant_op.constant(3.0))
@@ -2912,7 +3477,7 @@ class ControlFlowTest(test.TestCase):
       def b(i, y):
         return [
             i + 1,
-            functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y)
+            map_fn.map_fn(lambda x: math_ops.multiply(x, param), y)
         ]
 
       r = control_flow_ops.while_loop(c, b, [n0, y0], parallel_iterations=1)
@@ -2923,25 +3488,24 @@ class ControlFlowTest(test.TestCase):
   def testNestedWhileAndTensorArray(self):
     n = constant_op.constant(3.0)
 
-    def Body(row, ta, n):
+    def Body(row, ta):
 
-      def InnerBody(row, col, ta, n):
+      def InnerBody(row, col, ta):
         # Note: row and col are 1-based.
         ta = ta.write(
             math_ops.cast(n * (row - 1.) + col - 1., dtypes.int32), row * col)
-        return row, col + 1., ta, n
+        return row, col + 1., ta
 
-      # TODO(b/118457764): Remove n from loop_vars from both loops once fixed.
       ta = control_flow_ops.while_loop(
-          lambda _, col, _1, n: col <= n,
-          InnerBody, [row, constant_op.constant(1.), ta, n],
+          lambda _, col, _1: col <= n,
+          InnerBody, [row, constant_op.constant(1.), ta],
           return_same_structure=False)[2]
-      return row + 1., ta, n
+      return row + 1., ta
 
     ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=9)
     ta = control_flow_ops.while_loop(
-        lambda row, _, _1: row <= n,
-        Body, [constant_op.constant(1.), ta, n],
+        lambda row, _: row <= n,
+        Body, [constant_op.constant(1.), ta],
         return_same_structure=False)[1]
 
     output = array_ops.reshape(ta.stack(), [3, 3])
@@ -3631,6 +4195,21 @@ class ControlFlowTest(test.TestCase):
       result = func(qint)
       self.evaluate(result)
 
+  def testSparseIdentity(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Identity(st1)
+    self.assertAllEqual(st1.indices, st2.indices)
+    self.assertAllEqual(st1.values, st2.values)
+    self.assertAllEqual(st1.dense_shape, st2.dense_shape)
+
+  def testSparseEnterExit(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Enter(st1, "foo_1")
+    st3 = control_flow_ops.exit(st2)
+    self.assertAllEqual(st1.indices, st3.indices)
+    self.assertAllEqual(st1.values, st3.values)
+    self.assertAllEqual(st1.dense_shape, st3.dense_shape)
+
 
 class ControlFlowContextCheckTest(test.TestCase):
 
@@ -3687,14 +4266,14 @@ class ControlFlowContextCheckTest(test.TestCase):
     while_tensor = self._getWhileTensor()
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'while_1/Add' as input to 'while/Const_1' because they are "
+        "Cannot use 'while/Const_1' as input to 'while_1/Add' because they are "
         "in different while loops. See info log for more details."):
       control_flow_ops.while_loop(lambda i: i < 10,
                                   lambda x: math_ops.add(1, while_tensor), [0])
 
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'while_2/NextIteration' as input to 'while/Const_1' "
+        "Cannot use 'while/Const_1' as input to 'while_2/NextIteration' "
         "because they are in different while loops. See info log for more "
         "details."):
       control_flow_ops.while_loop(lambda i: i < 10, lambda i: while_tensor, [0])
@@ -3751,7 +4330,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
+        "Cannot use 'cond/while/Const_1' as input to 'cond/while_1/add' because"
         " they are in different while loops. See info log for more details."):
       control_flow_ops.cond(
           math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
@@ -3848,6 +4427,9 @@ class AssertTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testGuardedAssertDoesNotCopyWhenTrue(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/128646478 fails in opensource")
+
     with self.session(use_gpu=True) as sess:
       with ops.device(test.gpu_device_name()):
         value = constant_op.constant(1.0)
@@ -3879,7 +4461,8 @@ class AssertTest(test.TestCase):
       ]
       if "GPU" in [d.device_type for d in device_lib.list_local_devices()]:
         # A copy was performed for the unguarded assert
-        self.assertLess(0, len(unguarded_memcpy_nodestat_names))
+        self.assertLess(0, len(unguarded_memcpy_nodestat_names),
+                        str(unguarded_nodestat_names))
       # No copy was performed for the guarded assert
       self.assertEqual([], guarded_memcpy_nodestat_names)
 
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index e8463323df90bd37d927f88bd41b09bef45de541..4b44bb6c913533b3025692b0eb06d7e2b77bfb9e 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -68,7 +68,7 @@ class Conv1DTest(test.TestCase):
       f = constant_op.constant(
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
-          x, f, y_shape, stride=stride, padding="VALID")
+          x, f, y_shape, strides=stride, padding="VALID")
       value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
diff --git a/tensorflow/python/kernel_tests/conv1d_transpose_test.py b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ac5af7aae80277d7a93ef0585c1ccb41286bae
--- /dev/null
+++ b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
@@ -0,0 +1,260 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolution related functionality in tensorflow.ops.nn."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import nn_ops
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+class Conv1DTransposeTest(test.TestCase):
+
+  def testConv1DTransposeSingleStride(self):
+    with self.cached_session():
+      strides = [1, 1, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 6, 3]
+      y_shape = [2, 6, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(y_shape[0]):
+        for w in xrange(y_shape[1]):
+          for c in xrange(y_shape[2]):
+            target = 2 * 3.0
+            w_in = w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, c])
+
+  def testConv1DTransposeSame(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 8, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(y_shape[1]):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, k])
+
+  def testConv1DTransposeValid(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 9, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="VALID")
+      value = self.evaluate(output)
+
+      cache_values = np.zeros(y_shape, dtype=np.float32)
+
+      # The amount of padding added
+      pad = 1
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(pad, y_shape[1] - pad):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > pad and w < y_shape[1] - 1 - pad
+            if w_in:
+              target += 3.0
+            cache_values[n, w, k] = target
+
+          # copy values in the border
+          cache_values[n, 0, k] = cache_values[n, 1, k]
+          cache_values[n, -1, k] = cache_values[n, -2, k]
+          cache_values[n, :, k] = cache_values[n, :, k]
+
+    self.assertAllClose(cache_values, value)
+
+  @test_util.run_deprecated_v1
+  def testGradient(self):
+    x_shape = [2, 4, 3]
+    f_shape = [3, 2, 3]
+    y_shape = [2, 8, 2]
+    strides = [1, 2, 1]
+    np.random.seed(1)  # Make it reproducible.
+    x_val = np.random.random_sample(x_shape).astype(np.float64)
+    f_val = np.random.random_sample(f_shape).astype(np.float64)
+    with self.cached_session():
+      x = constant_op.constant(x_val, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(f_val, name="f", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      err = gradient_checker.compute_gradient_error([x, f], [x_shape, f_shape],
+                                                    output, y_shape)
+    print("conv1d_transpose gradient err = %g " % err)
+    err_tolerance = 0.0005
+    self.assertLess(err, err_tolerance)
+
+  def testConv1DTransposeSingleStrideNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 1]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 4]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 2 * 3.0
+              w_in = w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeSameNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 8]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeValidNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 9]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="VALID", data_format="NCW")
+
+        value = self.evaluate(output)
+        cache_values = np.zeros(y_shape, dtype=np.float32)
+        # The amount of padding added
+        pad = 1
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(pad, y_shape[2] - pad):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > pad and \
+                     w < y_shape[2] - 1 - pad
+              if w_in:
+                target += 3.0
+              cache_values[n, k, w] = target
+
+            # copy values in the border
+            cache_values[n, k, 0] = cache_values[n, k, 1]
+            cache_values[n, k, -1] = cache_values[n, k, -2]
+            cache_values[n, k, :] = cache_values[n, k, :]
+
+        self.assertAllClose(cache_values, value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 4a689b3fdfa5f43c8b6a4c67b7ebb31104d83db7..48c1b7dff78141e92234aab524e88082d4f5a832 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -653,7 +653,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
-  # Test the fast path in gemm_pack_rhs/mkldnn_gemm_pack, when channel
+  # Test the fast path in gemm_pack_rhs/gemm_pack_colmajor_block, when channel
   # dimension is a multiple of packet size.
   @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideOneFastPath(self):
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 7ff1a61e472b0dae054804f8f014ead7782958b6..833f90c08a0d4d9e32e9655610057b262e357e1a 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -25,7 +25,6 @@ import time
 import numpy as np
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
-from tensorflow.contrib import layers
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
@@ -36,6 +35,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
@@ -256,7 +256,7 @@ class Conv2DTest(test.TestCase):
       tensors.append(_SetupVal(data_format, use_gpu))
     values = self.evaluate(tensors)
     for i in range(1, len(values)):
-      self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
+      self.assertAllClose(values[0], values[i], rtol=1e-3, atol=1e-3)
 
   def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
                                    stride, dilation, padding, data_format,
@@ -297,7 +297,7 @@ class Conv2DTest(test.TestCase):
     return expected, computed
 
   def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
-                               padding, dilations):
+                               padding, dilations, rtol=1e-4):
     expected_results = []
     computed_results = []
     for data_format, use_gpu in GetTestConfigs():
@@ -313,7 +313,7 @@ class Conv2DTest(test.TestCase):
         tf_logging.debug("expected = %s", e_value)
         tf_logging.debug("actual = %s", c_value)
         self.assertAllClose(
-            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=rtol)
 
   def _VerifyValues(self,
                     tensor_in_sizes,
@@ -1161,6 +1161,7 @@ class Conv2DTest(test.TestCase):
       tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1175,6 +1176,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1189,6 +1191,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2DEmptyBackpropFilterDilation1x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1203,6 +1206,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1217,6 +1221,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1231,6 +1236,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1245,6 +1251,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1259,6 +1266,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2DEmptyBackpropInputDilation1x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1273,6 +1281,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1289,6 +1298,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-4)
 
+  @test_util.deprecated_graph_mode_only
   def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
     if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled():
       for (data_format, use_gpu) in GetTestConfigs():
@@ -1703,6 +1713,7 @@ class Conv2DTest(test.TestCase):
         tf_logging.debug("conv_2d gradient error = %s", err)
         self.assertLess(err, max_err)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientValidPaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1720,6 +1731,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientValidPaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1737,6 +1749,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientValidPaddingStrideTwo(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1754,6 +1767,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientValidPaddingStrideTwo(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1771,6 +1785,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientValidPaddingStrideThree(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1788,6 +1803,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientValidPaddingStrideThree(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1805,6 +1821,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientSamePaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1822,6 +1839,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientSamePaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1839,6 +1857,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientSamePaddingStrideTwo(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1856,6 +1875,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientSamePaddingStrideTwo(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1873,6 +1893,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientSamePaddingStrideThree(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1890,6 +1911,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientSamePaddingStrideThree(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1907,6 +1929,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientSamePaddingStride2x1(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1924,6 +1947,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradientKernelSizeMatchesInputSize(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1941,6 +1965,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradientKernelSizeMatchesInputSize(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self.ConstructAndTestGradient(
@@ -1958,6 +1983,7 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient1x1PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1979,6 +2005,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             max_err=0.0025)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient1x1PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -1999,6 +2026,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient1x1PaddingStrideTwo(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2019,6 +2047,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient1x1PaddingStrideTwo(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2039,6 +2068,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient2x2PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2059,6 +2089,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient2x2PaddingStrideOne(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2080,6 +2111,7 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             max_err=0.003)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient1_2_3_4PaddingStride3x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2100,6 +2132,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient1_2_3_4PaddingStride3x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2120,6 +2153,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient4_3_2_1PaddingStride2x1(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2140,6 +2174,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient4_3_2_1PaddingStride2x1(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2160,6 +2195,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testInputGradient0_0_0_5PaddingStride1x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2180,6 +2216,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testFilterGradient0_0_0_5PaddingStride1x2(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -2200,6 +2237,7 @@ class Conv2DTest(test.TestCase):
             data_format=data_format,
             use_gpu=use_gpu)
 
+  @test_util.deprecated_graph_mode_only
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     c1 = nn_ops.conv2d(
@@ -2286,6 +2324,8 @@ class Conv2DTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding=[0, 0, 0, 0])
 
+  @test_util.deprecated_graph_mode_only
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.cached_session() as sess:
       # Illegal strides.
@@ -2588,6 +2628,7 @@ class SeparableConv2DTest(test.TestCase):
         expected=expected_output,
         data_format=data_format)
 
+  @test_util.deprecated_graph_mode_only
   def testSeparableConv2DEqualInputOutputDepth(self):
     self._testSeparableConv2DEqualInputOutputDepth("NHWC")
 
@@ -2663,7 +2704,7 @@ class Conv2DBenchmark(test.Benchmark):
       kernel_w = 3
       x = inputs
       for num_outputs in num_outputs_list:
-        x = layers.convolution2d(x, num_outputs, [1, kernel_w])
+        x = convolutional.conv2d(x, num_outputs, [1, kernel_w])
       outputs = x
 
       variables.global_variables_initializer().run()
@@ -2891,7 +2932,8 @@ def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
           filter_in_sizes=filter_size,
           strides=[stride, stride],
           dilations=[2, 2],
-          padding=padding)
+          padding=padding,
+          rtol=5e-4)
 
   return Test
 
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/python/kernel_tests/critical_section_test.py
similarity index 84%
rename from tensorflow/contrib/framework/python/ops/critical_section_test.py
rename to tensorflow/python/kernel_tests/critical_section_test.py
index 34fd5018af125335845540dedfdffc984ba02313..7b1519c5e3c77d4676e5084ab06ed49b1a3c42f9 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/python/kernel_tests/critical_section_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import critical_section_ops
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import critical_section_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -47,7 +49,7 @@ class CriticalSectionTest(test.TestCase):
           return array_ops.identity(c)
 
     num_concurrent = 100
-    r = [cs.execute(fn, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn(1.0, 2.0)) for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     r_value = self.evaluate(r)
     self.assertAllClose([2.0 * i for i in range(num_concurrent)],
@@ -73,7 +75,7 @@ class CriticalSectionTest(test.TestCase):
               array_ops.identity(inner_cond), true_fn, lambda: c)
 
         def execute():
-          return cs.execute(fn, 1.0, 2.0)
+          return cs.execute(lambda: fn(1.0, 2.0))
 
         r = [
             control_flow_ops.cond(array_ops.identity(outer_cond),
@@ -91,6 +93,7 @@ class CriticalSectionTest(test.TestCase):
         else:
           self.assertAllClose([0] * num_concurrent, r_value)
 
+  @test_util.run_v1_only("b/123990562 Sees CancelledError on some calls")
   def testCriticalSectionInParallelDoesntDeadlockOnError(self):
     # No eager mode execution of this test because eager does not
     # run fn() in parallel, which is where the deadlock could
@@ -102,12 +105,23 @@ class CriticalSectionTest(test.TestCase):
       error = control_flow_ops.Assert((i % 2) == 1, ["Error"])
       with ops.control_dependencies([error]):
         return v.read_value()
+
     num_concurrent = 2
-    r = [cs.execute(fn, i) for i in range(num_concurrent)]
+
+    @def_function.function(autograph=False)
+    def run_concurrently():
+      return [cs.execute(lambda: fn(i)) for i in range(num_concurrent)]
+
+    if not context.executing_eagerly():
+      run_concurrently = run_concurrently()
+
     self.evaluate(v.initializer)
     for _ in range(100):
       with self.assertRaisesOpError("Error"):
-        self.evaluate(r)
+        if context.executing_eagerly():
+          run_concurrently()
+        else:
+          self.evaluate(run_concurrently)
 
   @test_util.run_in_graph_and_eager_modes
   def testCreateCriticalSectionFnReturnsOp(self):
@@ -122,17 +136,20 @@ class CriticalSectionTest(test.TestCase):
           return control_flow_ops.no_op()
 
     num_concurrent = 100
-    r = [cs.execute(fn_return_op, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn_return_op(1.0, 2.0))
+         for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     self.evaluate(r)
     final_v = self.evaluate(v)
     self.assertAllClose(2.0 * num_concurrent, final_v)
 
+  @test_util.run_v1_only("Collections don't exist in TF2")
   def testCollection(self):
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     self.assertIn(
         cs, ops.get_collection(critical_section_ops.CRITICAL_SECTIONS))
-    execute = cs.execute(lambda x: x + 1, 1.0, name="my_execute")
+    add = lambda x: x + 1
+    execute = cs.execute(lambda: add(1.0), name="my_execute")
     execute_op = [
         x for x in execute.graph.get_operations()
         if "my_execute" in x.name and "MutexLock" in x.type
@@ -142,18 +159,21 @@ class CriticalSectionTest(test.TestCase):
         [signature.op for signature in
          ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegal(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection()
+    add = lambda y: y + 1
     def fn(x):
-      return cs.execute(lambda y: y + 1, x)
+      return cs.execute(lambda: add(x))
+
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
   def testRecursiveCriticalSectionAccessViaCapturedTensorIsProtected(self):
     # This one is subtle; and we're being overly cautious here.  The
@@ -173,24 +193,24 @@ class CriticalSectionTest(test.TestCase):
     # operations are finished before anything runs within the critical section.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     fn = array_ops.identity
-    to_capture = cs.execute(fn, 1.0)
+    to_capture = cs.execute(lambda: fn(1.0))
     fn_captures = lambda x: x + to_capture
     to_capture_too = array_ops.identity(to_capture)
 
-    ex_0 = cs.execute(fn_captures, 1.0)
+    ex_0 = cs.execute(lambda: fn_captures(1.0))
 
     with ops.control_dependencies([to_capture]):
       # This is OK because to_capture will execute before this next call
-      ex_1 = cs.execute(fn_captures, 1.0)
+      ex_1 = cs.execute(lambda: fn_captures(1.0))
 
     dependency = array_ops.identity(to_capture)
 
     fn_captures_dependency = lambda x: x + dependency
 
-    ex_2 = cs.execute(fn_captures_dependency, 1.0)
+    ex_2 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     with ops.control_dependencies([to_capture_too]):
-      ex_3 = cs.execute(fn_captures_dependency, 1.0)
+      ex_3 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     # Ensure there's no actual deadlock on to_execute.
     self.assertEquals(2.0, self.evaluate(ex_0))
@@ -216,6 +236,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -241,6 +263,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture_protected,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -257,13 +281,15 @@ class CriticalSectionTest(test.TestCase):
       # This version is ok because j is an argument to fn and we can
       # ensure there's a control dependency on j.
       fn = lambda x: x + 1
-      return (i + 1, cs.execute(fn, j))
+      return (i + 1, cs.execute(lambda: fn(j)))
 
     (i_n, j_n) = control_flow_ops.while_loop(
         lambda i, _: i < 1000,
         body_args_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -276,20 +302,23 @@ class CriticalSectionTest(test.TestCase):
         "body_args_capture'\n"
         "==============\n")
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegalSameSharedName(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     cs_same = critical_section_ops.CriticalSection(shared_name="cs")
+    add = lambda x: x + 1
     def fn(x):
-      return cs_same.execute(lambda x: x+1, x)
+      return cs_same.execute(lambda: add(x))
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testMultipleCSExecutionsRequestSameResource(self):
     cs0 = critical_section_ops.CriticalSection()
     cs1 = critical_section_ops.CriticalSection()
@@ -327,20 +356,32 @@ class CriticalSectionTest(test.TestCase):
     # Note, here v must be a resource variable (or something similar),
     # otherwise it gets hoisted into the while_loop by the time we add
     # control dependencies to the lock_op.
+    def body(i):
+      add_j = lambda j: v + j + 1
+      return cs.execute(lambda: add_j(i))
     out = control_flow_ops.while_loop(
-        lambda i: i < 10, lambda i: cs.execute(lambda j: v + j + 1, i), [0])
+        lambda i: i < 10, body, [0])
     self.evaluate(v.initializer)
     self.assertEqual(10, self.evaluate(out))
 
   @test_util.run_in_graph_and_eager_modes
   def testInsideFunction(self):
+    if test_util.is_gpu_available():
+      self.skipTest(
+          "b/123899495: Colocation errors for critical sections in map on GPU")
     cs = critical_section_ops.CriticalSection()
-    v = resource_variable_ops.ResourceVariable(1)
+    with ops.device("/gpu:0" if test_util.is_gpu_available() else "/cpu:0"):
+      v = resource_variable_ops.ResourceVariable(1)
     def fn():
       return v.read_value()
 
     # map() creates a TensorFlow function.
-    ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn))
+    ds = dataset_ops.Dataset.range(1)
+    if test_util.is_gpu_available():
+      ds = (ds.apply(prefetching_ops.copy_to_device("/gpu:0"))
+            .apply(prefetching_ops.map_on_gpu(lambda _: cs.execute(fn))))
+    else:
+      ds = ds.map(lambda _: cs.execute(fn))
 
     def get_first():
       if context.executing_eagerly():
diff --git a/tensorflow/python/kernel_tests/cudnn_determinism_test.py b/tensorflow/python/kernel_tests/cudnn_determinism_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ba33dcb7dea4440aa4faed07d52a9ec9f57bc48
--- /dev/null
+++ b/tensorflow/python/kernel_tests/cudnn_determinism_test.py
@@ -0,0 +1,103 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TF_CUDNN_DETERMINISTIC=true."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+# The TF_CUDNN_DETERMINISTIC flag disables autotuning of cuDNN algorithms and
+# causes deterministic cuDNN algorithms to be selected when both deterministic
+# and non-deterministic algorithms are available. These tests are intended to
+# confirm that deterministic algorithms are chosen when
+# TF_CUDNN_DETERMINISTIC=true. The configurations tested were confirmed to
+# produce non-deterministic results without setting TF_CUDNN_DETERMINISTIC=true
+
+_PADDING = 'SAME'
+_STRIDES = [1, 1, 1, 1]
+
+LayerShape = collections.namedtuple('LayerShape',
+                                    'batch, height, width, channels')
+FilterShape = collections.namedtuple(
+    'FilterShape', 'height, width, in_channels, out_channels')
+
+
+class ConvolutionTest(test.TestCase):
+
+  def _random_data_op(self, shape):
+    # np.random.random_sample can properly interpret either tf.TensorShape or
+    # namedtuple as a list.
+    return constant_op.constant(
+        2 * np.random.random_sample(shape) - 1, dtype=dtypes.float32)
+
+  def _random_out_op(self, in_shape, filter_shape):
+    # Choosing not to use array_op.zeros() to prevent possible removal by
+    # optimization
+    in_op = self._random_data_op(in_shape)
+    filter_op = self._random_data_op(filter_shape)
+    # Use the forward op's shape-inference
+    conv_op = nn_ops.conv2d(
+        in_op, filter_op, strides=_STRIDES, padding=_PADDING)
+    out_shape = conv_op.get_shape()
+    out_op = self._random_data_op(out_shape)
+    return out_op
+
+  def _assert_reproducible(self, operation):
+    with self.cached_session(force_gpu=True):
+      result_1 = self.evaluate(operation)
+      result_2 = self.evaluate(operation)
+    self.assertAllEqual(result_1, result_2)
+
+  @test_util.run_cuda_only
+  def testBackwardFilterGradient(self):
+    np.random.seed(1)
+    in_shape = LayerShape(batch=8, height=128, width=128, channels=8)
+    filter_shape = FilterShape(height=3, width=3, in_channels=8, out_channels=8)
+    in_op = self._random_data_op(in_shape)
+    out_op = self._random_out_op(in_shape, filter_shape)
+    filter_gradient_op = nn_ops.conv2d_backprop_filter(
+        in_op, filter_shape, out_op, strides=_STRIDES, padding=_PADDING)
+    self._assert_reproducible(filter_gradient_op)
+
+  @test_util.run_cuda_only
+  def testBackwardInputGradient(self):
+    np.random.seed(2)
+    in_shape = LayerShape(batch=8, height=32, width=32, channels=8)
+    filter_shape = FilterShape(
+        height=7, width=7, in_channels=8, out_channels=128)
+    filter_op = self._random_data_op(filter_shape)
+    out_op = self._random_out_op(in_shape, filter_shape)
+    input_gradient_op = nn_ops.conv2d_backprop_input(
+        in_shape, filter_op, out_op, strides=_STRIDES, padding=_PADDING)
+    self._assert_reproducible(input_gradient_op)
+
+  # TODO(duncanriach): (1) add test to confirm that forward autotuning is
+  #   disabled for cuDNN convolution; (2) add test for deterministic cuDNN
+  #   max-pooling
+
+
+if __name__ == '__main__':
+  os.environ['TF_CUDNN_DETERMINISTIC'] = 'true'
+  test.main()
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 70f19f9d2f9d9155f5cc5e3458cb8cad8fb18064..50d52e64ff59c1692c071274da0a4b5409dda79a 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -26,7 +26,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
@@ -595,7 +597,7 @@ class MinMaxOpTest(test.TestCase):
 
   def testScalar(self):
     x = np.random.rand(1, 3, 2) * 100.
-    y = np.asscalar(np.random.rand(1) * 100.)  # should broadcast
+    y = np.random.rand(1).item() * 100.  # should broadcast
     # dropped np.float64, int64 because TF automatically converts to 32 bit
     for t in [np.float32, np.int32]:
       self._compare(x.astype(t), t(y), use_gpu=False)
@@ -1109,5 +1111,54 @@ class PolyvalTest(test.TestCase):
       self.assertAllClose(np_val, self.evaluate(tf_val))
 
 
+class SingularGradientOpTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testGradientAtSingularity(self):
+    ops_and_singularity = [
+        (gen_math_ops.reciprocal, (0.,)),
+        (gen_math_ops.rsqrt, (0.,)),
+        (gen_math_ops.sqrt, (0.,)),
+        (gen_math_ops.sqrt_grad, (
+            0.,
+            0.,
+        )),
+        (gen_math_ops.reciprocal_grad, (
+            1.,
+            0.,
+        )),
+        (gen_math_ops.tan, (np.pi / 2,)),
+        (gen_math_ops.log, (0.,)),
+        (gen_math_ops.log1p, (-1.,)),
+        (gen_math_ops.acosh, (0.,)),
+        (gen_math_ops.asin, (1.,)),
+        (gen_math_ops.acos, (1.,)),
+        (gen_math_ops.atan2, (0., 0.)),
+        (gen_math_ops.div, (1., 0.)),
+        (math_ops.pow, (0., -1.)),
+    ]
+    for op, singularity in ops_and_singularity:
+      for dtype in (dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+                    dtypes_lib.complex64, dtypes_lib.complex128):
+        if dtype.is_complex and op in [
+            gen_math_ops.asin, gen_math_ops.acos, gen_math_ops.atan2
+        ]:
+          continue
+        if dtype == dtypes_lib.half and op in [
+            gen_math_ops.acosh, gen_math_ops.asin, gen_math_ops.acos,
+            gen_math_ops.atan2
+        ]:
+          continue
+        with self.cached_session():
+          print("op = ", op, ", singularity = ", singularity, ", type = ",
+                dtype)
+          args = [constant_op.constant(s, dtype=dtype) for s in singularity]
+          grad_y = constant_op.constant(0, dtype=dtype)
+          y = op(*args)
+          g = gradients_impl.gradients(y, args, grad_ys=grad_y)
+          g_val = self.evaluate(g)
+          self.assertAllEqual(g_val, np.zeros(len(singularity)))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index 008e59ba3e64915d8642243d335701e8adea19c0..bb8d2cf6a051867a28f984378d0db4779b06c0e0 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -89,6 +89,32 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  @test_util.run_deprecated_v1
+  def testToComplex64(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex64)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c8")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
+  @test_util.run_deprecated_v1
+  def testToComplex128(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex128)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c16")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
   @test_util.run_deprecated_v1
   def testEmptyStringInput(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 97d3645b617947c2ced88ac52207ced98c59c877..96c9b5258e2a4a103a3d981a3340f67a01bbec94 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -295,7 +295,6 @@ class DepthToSpaceTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 6f6d17b4eb540b109e9c1fd646bb568648bacc8f..5b1a47fb03563f3c104e0d0ca158a0918dcb39b6 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -481,7 +481,6 @@ class DepthwiseConv2DTest(test.TestCase):
             data_format="NCHW")
 
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla("This test never passed for XLA")
   def testDepthwiseConv2DFilterGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -613,7 +612,6 @@ class DepthwiseConv2DTest(test.TestCase):
     cpu_value = _GetVal(use_gpu=False)
     self.assertAllClose(cpu_value, gpu_value, rtol=1e-4, atol=1e-4)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testDepthwiseConv2DFilterGradCompare(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(ConfigsToTest()):
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index b8139918c597fa455ce9b726d165ec685c959fb3..0bf48fd228fda5640e203f74b4717a2cfffd2ba3 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -65,7 +65,7 @@ class MatrixDiagTest(test.TestCase):
       array_ops.matrix_diag(0)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -270,7 +270,7 @@ class MatrixDiagPartTest(test.TestCase):
       array_ops.matrix_diag_part(0)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index 7a10ed9852faab88dfaf3d43f63ea9b6c51b3d57..22c98201dd1847586af6a30eed8004757a21b335 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -230,9 +230,10 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = [
-        # TODO(b/121223043): Re-enable this test on mac after fixing "mean not
-        # defined" errors.
+        # TODO(b/121223043): Re-enable this test after fixing "mean not defined"
+        # errors.
         "no_mac",
+        "no_oss",
         # disable to avoid false positives from scipy.
         "nomsan",
     ],
diff --git a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
index 6aa757e293ef69040266d194aef85370b86e5b2b..1b1b77432a6734654906fb9d646d9aa65f451dd0 100644
--- a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
+++ b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
@@ -80,7 +80,7 @@ class DrawBoundingBoxOpTest(test.TestCase):
       test_drawn_image = self._fillBorder(image, color)
       bboxes = np.asarray([0, 0, 1, 1])
       bboxes = np.vstack([bboxes for _ in range(num_boxes)])
-      bboxes = math_ops.to_float(bboxes)
+      bboxes = math_ops.cast(bboxes, dtypes.float32)
       bboxes = array_ops.expand_dims(bboxes, 0)
       image = ops.convert_to_tensor(image)
       image = image_ops_impl.convert_image_dtype(image, dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index 4f338880aa3564c4bf37102c7d01c8768ef07d58..4d57c1b264a1919587dd5da5d619a74cde4e0f6e 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -63,104 +63,99 @@ class DynamicStitchTestBase(object):
         self.assertEqual([None], stitched_t.get_shape().as_list())
 
   def testSimpleOneDimensional(self):
-    with test_util.use_gpu():
-      # Test various datatypes in the simple case to ensure that the op was
-      # registered under those types.
-      dtypes_to_test = [
-          dtypes.float32, dtypes.qint8, dtypes.quint8, dtypes.qint32
-      ]
-      for dtype in dtypes_to_test:
-        indices = [
-            constant_op.constant([0, 4, 7]),
-            constant_op.constant([1, 6, 2, 3, 5])
-        ]
-        data = [
-            math_ops.cast(constant_op.constant([0, 40, 70]), dtype=dtype),
-            math_ops.cast(
-                constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
-        ]
-        stitched_t = self.stitch_op(indices, data)
-        stitched_val = self.evaluate(stitched_t)
-        self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
-        # Dimension 0 is max(flatten(indices))+1.
-        self.assertEqual([8], stitched_t.get_shape().as_list())
-
-  def testOneListOneDimensional(self):
-    with test_util.use_gpu():
-      indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
-      data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
-      # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8], stitched_t.get_shape().as_list())
-
-  def testSimpleTwoDimensional(self):
-    with test_util.use_gpu():
+    # Test various datatypes in the simple case to ensure that the op was
+    # registered under those types.
+    dtypes_to_test = [
+        dtypes.float32, dtypes.qint8, dtypes.quint8, dtypes.qint32
+    ]
+    for dtype in dtypes_to_test:
       indices = [
           constant_op.constant([0, 4, 7]),
-          constant_op.constant([1, 6]),
-          constant_op.constant([2, 3, 5])
+          constant_op.constant([1, 6, 2, 3, 5])
       ]
       data = [
-          constant_op.constant([[0, 1], [40, 41], [70, 71]]),
-          constant_op.constant([[10, 11], [60, 61]]),
-          constant_op.constant([[20, 21], [30, 31], [50, 51]])
+          math_ops.cast(constant_op.constant([0, 40, 70]), dtype=dtype),
+          math_ops.cast(
+              constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
       ]
       stitched_t = self.stitch_op(indices, data)
       stitched_val = self.evaluate(stitched_t)
-      self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
-                           [50, 51], [60, 61], [70, 71]], stitched_val)
+      self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8, 2], stitched_t.get_shape().as_list())
+      self.assertEqual([8], stitched_t.get_shape().as_list())
+
+  def testOneListOneDimensional(self):
+    indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
+    data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8], stitched_t.get_shape().as_list())
+
+  def testSimpleTwoDimensional(self):
+    indices = [
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6]),
+        constant_op.constant([2, 3, 5])
+    ]
+    data = [
+        constant_op.constant([[0, 1], [40, 41], [70, 71]]),
+        constant_op.constant([[10, 11], [60, 61]]),
+        constant_op.constant([[20, 21], [30, 31], [50, 51]])
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
+                         [50, 51], [60, 61], [70, 71]], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   def testZeroSizeTensor(self):
-    with test_util.use_gpu():
-      indices = [
-          constant_op.constant([0, 4, 7]),
-          constant_op.constant([1, 6]),
-          constant_op.constant([2, 3, 5]),
-          array_ops.zeros([0], dtype=dtypes.int32)
-      ]
-      data = [
-          constant_op.constant([[0, 1], [40, 41], [70, 71]]),
-          constant_op.constant([[10, 11], [60, 61]]),
-          constant_op.constant([[20, 21], [30, 31], [50, 51]]),
-          array_ops.zeros([0, 2], dtype=dtypes.int32)
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
-                           [50, 51], [60, 61], [70, 71]], stitched_val)
-      # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8, 2], stitched_t.get_shape().as_list())
+    indices = [
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6]),
+        constant_op.constant([2, 3, 5]),
+        array_ops.zeros([0], dtype=dtypes.int32)
+    ]
+    data = [
+        constant_op.constant([[0, 1], [40, 41], [70, 71]]),
+        constant_op.constant([[10, 11], [60, 61]]),
+        constant_op.constant([[20, 21], [30, 31], [50, 51]]),
+        array_ops.zeros([0, 2], dtype=dtypes.int32)
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
+                         [50, 51], [60, 61], [70, 71]], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=True) as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61., 62.]),
-          constant_op.constant([[41., 42.], [11., 12.]]),
-          constant_op.constant([[[51., 52.], [21., 22.]],
-                                [[1., 2.], [31., 32.]]])
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10. * np.arange(7)[:, None] + [1., 2.]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7. * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7. * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61., 62.]),
+        constant_op.constant([[41., 42.], [11., 12.]]),
+        constant_op.constant([[[51., 52.], [21., 22.]],
+                              [[1., 2.], [31., 32.]]])
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10. * np.arange(7)[:, None] + [1., 2.]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7. * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7. * self.evaluate(datum), grad)
 
   @test_util.run_deprecated_v1
   def testErrorIndicesMultiDimensional(self):
@@ -241,69 +236,66 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
 
   @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=True) as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61, 62], dtype=dtypes.float32),
-          constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant(
-              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
-      ]
-      stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7 * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61, 62], dtype=dtypes.float32),
+        constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
+        constant_op.constant(
+            [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+    ]
+    stitched_t = data_flow_ops.dynamic_stitch(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7 * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
   # GPU version unit tests
   def testScalarGPU(self):
-    with self.cached_session():
-      indices = [constant_op.constant(0), constant_op.constant(1)]
-      data = [constant_op.constant(40.0), constant_op.constant(60.0)]
-      for step in -1, 1:
-        stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = self.evaluate(stitched_t)
-        self.assertAllEqual([40.0, 60.0][::step], stitched_val)
-        # Dimension 0 is max(flatten(indices))+1.
-        self.assertEqual([2], stitched_t.get_shape().as_list())
+    indices = [constant_op.constant(0), constant_op.constant(1)]
+    data = [constant_op.constant(40.0), constant_op.constant(60.0)]
+    for step in -1, 1:
+      stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
+      stitched_val = self.evaluate(stitched_t)
+      self.assertAllEqual([40.0, 60.0][::step], stitched_val)
+      # Dimension 0 is max(flatten(indices))+1.
+      self.assertEqual([2], stitched_t.get_shape().as_list())
 
   @test_util.run_deprecated_v1
   def testHigherRankGPU(self):
-    with self.cached_session() as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61, 62], dtype=dtypes.float32),
-          constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant(
-              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
-      ]
-      stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7 * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61, 62], dtype=dtypes.float32),
+        constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
+        constant_op.constant(
+            [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+    ]
+    stitched_t = data_flow_ops.dynamic_stitch(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7 * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 3ea2071e13a24fb804924081add2f2b41f314716..fec6c310341c76e53556335df789fe48b90c06f4 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -584,7 +584,13 @@ class EmbeddingLookupTest(test.TestCase):
           # Compare nonsharded to gather
           simple = embedding_ops.embedding_lookup(
               params, ids, max_norm=1.0).eval()
-          self.assertAllEqual(simple, array_ops.gather(params_norm, ids).eval())
+          # assertAllClose is used here as different implementations of sqrt may
+          # be used to compute each of the values being compared.  For example,
+          # on AVX512 builds the embedding operation makes use of Eigen's fast
+          # vectorized square root algorithm for doubles.  These different
+          # implementations of sqrt are not guaranteed to produce exactly the
+          # same results. Therefore, an exact comparison cannot be made.
+          self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
           # Run a few different sharded versions.
           for procs in 1, 2, 3:
             stride = procs * math_ops.range(params.shape[0] // procs)
@@ -630,7 +636,13 @@ class EmbeddingLookupTest(test.TestCase):
           sharded = embedding_ops._embedding_lookup_and_transform(
               split_params, ids, max_norm=l2_norm,
               transform_fn=transform).eval()
-          self.assertAllEqual(simple, sharded)
+          # assertAllClose is used here as different implementations of sqrt may
+          # be used to compute each of the values being compared.  For example,
+          # on AVX512 builds the embedding operation makes use of Eigen's fast
+          # vectorized square root algorithm for doubles.  These different
+          # implementations of sqrt are not guaranteed to produce exactly the
+          # same results. Therefore, an exact comparison cannot be made.
+          self.assertAllClose(simple, sharded)
 
 
 class EmbeddingLookupSparseTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index bb3c0ae80694035dd362f5024ecdddeb0e364bb0..2e4244e94a24359693028eafedb4490963d2a798 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -44,15 +43,14 @@ class ExtractImagePatches(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with test_util.use_gpu():
-      out_tensor = array_ops.extract_image_patches(
-          constant_op.constant(image),
-          ksizes=ksizes,
-          strides=strides,
-          rates=rates,
-          padding=padding,
-          name="im2col")
-      self.assertAllClose(patches, self.evaluate(out_tensor))
+    out_tensor = array_ops.extract_image_patches(
+        constant_op.constant(image),
+        ksizes=ksizes,
+        strides=strides,
+        rates=rates,
+        padding=padding,
+        name="im2col")
+    self.assertAllClose(patches, self.evaluate(out_tensor))
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_grad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..381bd1fdcba962f366bff46cfd2475f78b175d26
--- /dev/null
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_grad_test.py
@@ -0,0 +1,106 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ExtractVolumePatches gradient."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed as random_seed_lib
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+class ExtractVolumePatchesGradTest(test.TestCase):
+  """Gradient-checking for ExtractVolumePatches op."""
+
+  _TEST_CASES = [
+      {
+          'in_shape': [2, 5, 5, 5, 3],
+          'ksizes': [1, 1, 1, 1, 1],
+          'strides': [1, 2, 3, 4, 1],
+      },
+      {
+          'in_shape': [2, 7, 7, 7, 3],
+          'ksizes': [1, 3, 3, 3, 1],
+          'strides': [1, 1, 1, 1, 1],
+      },
+      {
+          'in_shape': [2, 5, 7, 6, 3],
+          'ksizes': [1, 3, 2, 2, 1],
+          'strides': [1, 1, 1, 1, 1],
+      },
+      {
+          'in_shape': [2, 7, 8, 6, 3],
+          'ksizes': [1, 2, 3, 2, 1],
+          'strides': [1, 2, 4, 3, 1],
+      },
+  ]
+
+  @test_util.run_deprecated_v1
+  def testGradient(self):
+    # Set graph seed for determinism.
+    random_seed = 42
+    random_seed_lib.set_random_seed(random_seed)
+
+    with self.cached_session():
+      for test_case in self._TEST_CASES:
+        np.random.seed(random_seed)
+        in_shape = test_case['in_shape']
+        in_val = constant_op.constant(
+            np.random.random(in_shape), dtype=dtypes.float32)
+
+        for padding in ['VALID', 'SAME']:
+          out_val = array_ops.extract_volume_patches(
+              in_val, test_case['ksizes'], test_case['strides'], padding)
+          out_shape = out_val.get_shape().as_list()
+
+          err = gradient_checker.compute_gradient_error(in_val, in_shape,
+                                                        out_val, out_shape)
+
+          print('extract_volume_patches gradient err: %.4e' % err)
+          self.assertLess(err, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testConstructGradientWithLargeVolumess(self):
+    batch_size = 4
+    planes = 8
+    height = 32
+    width = 32
+    ksize = 5
+    volumes = variable_scope.get_variable(
+        'inputs', (batch_size, planes, height, width, 1))
+    patches = array_ops.extract_volume_patches(
+        volumes,
+        ksizes=[1, ksize, ksize, ksize, 1],
+        strides=[1, 1, 1, 1, 1],
+        padding='SAME')
+    # Github issue: #20146
+    # tf.extract_volume_patches() gradient very slow at graph construction time
+    gradients = gradients_impl.gradients(patches, volumes)
+    # Won't time out.
+    self.assertIsNotNone(gradients)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
index 88f7df8fbb64512c9ca362ec7c310a5805c9c728..7a63e590cf32d8789dc35a4500bf1c4d6d377b64 100644
--- a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -46,14 +45,13 @@ class ExtractVolumePatches(test.TestCase):
     ksizes = [1] + ksizes + [1]
     strides = [1] + strides + [1]
 
-    with test_util.use_gpu():
-      out_tensor = array_ops.extract_volume_patches(
-          constant_op.constant(image),
-          ksizes=ksizes,
-          strides=strides,
-          padding=padding,
-          name="im2col_3d")
-      self.assertAllClose(patches, self.evaluate(out_tensor))
+    out_tensor = array_ops.extract_volume_patches(
+        constant_op.constant(image),
+        ksizes=ksizes,
+        strides=strides,
+        padding=padding,
+        name="im2col_3d")
+    self.assertAllClose(patches, self.evaluate(out_tensor))
 
   # pylint: disable=bad-whitespace
   def testKsize1x1x1Stride1x1x1(self):
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 92f4e7b39e047a6a6b95a34f09161f4828535663..91683047a8f1ebac20949e218b4eadc7621ba52a 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -23,17 +23,17 @@ import numpy as np
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -58,7 +58,6 @@ def simple_scoped_fn(a, x):
 
 
 @test_util.with_control_flow_v2
-@test_util.disable_all_xla("This test never passed for XLA")
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -193,140 +192,6 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual(720.0, self.evaluate(r))
   # pylint: enable=unnecessary-lambda
 
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_Simple(self):
-    nums = [1, 2, 3, 4, 5, 6]
-    elems = constant_op.constant(nums, name="data")
-    r = functional_ops.map_fn(
-        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), elems)
-    self.assertAllEqual(
-        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
-
-  def testMapDtypeEager(self):
-    with context.eager_mode():
-      dtype = functional_ops.map_fn(lambda x: constant_op.constant(""),
-                                    constant_op.constant([]),
-                                    dtype=dtypes.string).dtype
-      self.assertEqual(dtype, dtypes.string)
-
-  def testMapSparseTensor(self):
-    with self.cached_session():
-      with self.assertRaises(TypeError):
-        functional_ops.map_fn(
-            lambda x: x,
-            sparse_tensor.SparseTensor(
-                indices=[[0, 0], [0, 1], [1, 0]],
-                values=constant_op.constant([0, 1, 2]),
-                dense_shape=[2, 2]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMapOverScalarErrors(self):
-    with self.assertRaisesRegexp(ValueError, "not scalars"):
-      functional_ops.map_fn(lambda x: x, [1, 2])
-    with self.assertRaisesRegexp(ValueError, "not a scalar"):
-      functional_ops.map_fn(lambda x: x, 1)
-
-  @test_util.run_deprecated_v1
-  def testMap_Scoped(self):
-    with self.cached_session() as sess:
-
-      def double_scoped(x):
-        """2x with a dummy 2 that is scoped."""
-        with variable_scope.variable_scope("body"):
-          # Dummy variable, just to check that scoping works as intended.
-          two = variable_scope.get_variable(
-              "two", [],
-              dtype=dtypes.int32,
-              initializer=init_ops.constant_initializer(2))
-          return math_ops.multiply(x, two)
-
-      with variable_scope.variable_scope("root") as varscope:
-        elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-        doubles = np.array([2 * x for x in [1, 2, 3, 4, 5, 6]])
-
-        r = functional_ops.map_fn(double_scoped, elems)
-        # Check that we have the one variable we asked for here.
-        self.assertEqual(len(variables.trainable_variables()), 1)
-        self.assertEqual(variables.trainable_variables()[0].name,
-                         "root/body/two:0")
-        sess.run([variables.global_variables_initializer()])
-        self.assertAllEqual(doubles, self.evaluate(r))
-
-        # Now let's reuse our single variable.
-        varscope.reuse_variables()
-        r = functional_ops.map_fn(double_scoped, elems)
-        self.assertEqual(len(variables.trainable_variables()), 1)
-        self.assertAllEqual(doubles, self.evaluate(r))
-
-  @test_util.run_deprecated_v1
-  def testMap_Grad(self):
-    with self.cached_session():
-      param = constant_op.constant(2.0)
-      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
-      y = functional_ops.map_fn(
-          lambda x: math_ops.multiply(math_ops.square(x), param), elems)
-      r = gradients_impl.gradients(y, param)[0]
-      self.assertAllEqual(91.0, self.evaluate(r))
-      r = gradients_impl.gradients(y, elems)[0]
-      self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0], self.evaluate(r))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_SimpleNotTensor(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), nums)
-    self.assertAllEqual(
-        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_SingleInputMultiOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: ((x + 3) * 2, -(x + 3) * 2),
-        nums,
-        dtype=(dtypes.int64, dtypes.int64))
-    self.assertEqual(2, len(r))
-    self.assertEqual((6,), r[0].get_shape())
-    self.assertEqual((6,), r[1].get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual((nums + 3) * 2, received[0])
-    self.assertAllEqual(-(nums + 3) * 2, received[1])
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiOutputMismatchedDtype(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    with self.assertRaisesRegexp(
-        TypeError, r"two structures don't have the same nested structure"):
-      # lambda emits tuple, but dtype is a list
-      functional_ops.map_fn(
-          lambda x: ((x + 3) * 2, -(x + 3) * 2),
-          nums,
-          dtype=[dtypes.int64, dtypes.int64])
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiInputSingleOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: x[0] * x[1][0] + x[1][1], (nums, (nums, -nums)),
-        dtype=dtypes.int64)
-    self.assertEqual((6,), r.get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual(nums * nums + (-nums), received)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiInputSameStructureOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(lambda x: (x[1][0], (x[1][1], x[0])),
-                              (nums, (2 * nums, -nums)))
-    r = [r[0], r[1][0], r[1][1]]
-    self.assertEqual((6,), r[0].get_shape())
-    self.assertEqual((6,), r[1].get_shape())
-    self.assertEqual((6,), r[2].get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual(2 * nums, received[0])
-    self.assertAllEqual(-nums, received[1])
-    self.assertAllEqual(nums, received[2])
-
   @test_util.run_in_graph_and_eager_modes
   def testScan_Simple(self):
     elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -489,37 +354,6 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.foldl(fn, x, initializer=initializer)
     self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testMapShape(self):
-    x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    y = functional_ops.map_fn(lambda e: e, x)
-    self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
-
-  @test_util.run_deprecated_v1
-  def testMapUnknownShape(self):
-    x = array_ops.placeholder(dtypes.float32)
-    y = functional_ops.map_fn(lambda e: e, x)
-    self.assertIs(None, y.get_shape().dims)
-
-  @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_in_graph_and_eager_modes
-  @test_util.run_v1_only("b/120545219")
-  def testMapEmptyScalar(self):
-    map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
-    self.assertAllEqual([0], map_return.get_shape().dims)
-    self.assertAllEqual([0], self.evaluate(map_return).shape)
-
-  # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
-  # so the body of the while loop never executes
-  @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_v1_only("b/120545219")
-  def testMapEmptyTensor(self):
-    with self.cached_session():
-      map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
-                                         constant_op.constant([]))
-      self.assertAllEqual([0, 3, 2], map_return.get_shape().dims)
-      self.assertAllEqual([0, 3, 2], self.evaluate(map_return).shape)
-
   @test_util.run_in_graph_and_eager_modes
   def testScanShape(self):
     x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
@@ -641,6 +475,7 @@ class FunctionalOpsTest(test.TestCase):
       mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionCPUGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -665,6 +500,7 @@ class FunctionalOpsTest(test.TestCase):
       mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionGPUCPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -689,6 +525,7 @@ class FunctionalOpsTest(test.TestCase):
       mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionGPUCPUStrings(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -827,8 +664,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(Run(100., True), 5050.)
 
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla(
-      "This test never passed for XLA")  # Different error message
+  @test_util.disable_xla("b/123337890")  # Different error message
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -918,7 +754,7 @@ class FunctionalOpsTest(test.TestCase):
 
           def TestCondCapture(n, *args):
             del args
-            return math_ops.to_float(n) + v < 10
+            return math_ops.cast(n, dtypes.float32) + v < 10
 
           with self.assertRaises(ValueError):
             _ = functional_ops.While(
@@ -934,7 +770,7 @@ class FunctionalOpsTest(test.TestCase):
 
         @function.Defun(dtypes.int32, dtypes.float32)
         def Body(n, x):
-          return x + math_ops.to_float(n)
+          return x + math_ops.cast(n, dtypes.float32)
 
         xs = [
             # 1 + 2  + ... + 20
@@ -963,7 +799,7 @@ class FunctionalOpsTest(test.TestCase):
 
       @function.Defun(dtypes.int32, dtypes.float32, func_name="TestBody")
       def TestBody(n, x):
-        return x + math_ops.to_float(n)
+        return x + math_ops.cast(n, dtypes.float32)
 
       _ = functional_ops.For(
           1, 21, 1, [0.], TestBody, rewrite_with_while=True)[0]
@@ -981,15 +817,15 @@ class FunctionalOpsTest(test.TestCase):
 
     @function.Defun(dtypes.int32)
     def TestNullary(n):
-      v + math_ops.to_float(n)  # pylint: disable=expression-not-assigned
+      v + math_ops.cast(n, dtypes.float32)  # pylint: disable=expression-not-assigned
 
     @function.Defun(dtypes.int32, dtypes.float32)
     def TestUnary(n, x):
-      return x + math_ops.to_float(n) + v
+      return x + math_ops.cast(n, dtypes.float32) + v
 
     @function.Defun(dtypes.int32, dtypes.float32, dtypes.float32)
     def TestBinary(n, x, x2):
-      return x + math_ops.to_float(n) + v, x2 + v
+      return x + math_ops.cast(n, dtypes.float32) + v, x2 + v
 
     for rewrite_with_while in (True, False):
       use_gpu = not rewrite_with_while
@@ -1063,7 +899,7 @@ class FunctionalOpsTest(test.TestCase):
 
     @function.Defun(dtypes.int32, dtypes.float32)
     def Foo(i, v):
-      return math_ops.to_float(i) + v
+      return math_ops.cast(i, dtypes.float32) + v
 
     @function.Defun(dtypes.int32, dtypes.float32)
     def ReturnsTooManyArgs(unused_i, v):
@@ -1105,7 +941,6 @@ class FunctionalOpsTest(test.TestCase):
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
 # below test cases.
-@test_util.disable_all_xla("This test never passed for XLA")
 class PartitionedCallTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -1149,6 +984,7 @@ class PartitionedCallTest(test.TestCase):
                 constant_op.constant(2.)], f=Body)
       self.assertEqual(output.eval(), 12.)
 
+  @test_util.run_deprecated_v1
   def testBasicMultiDeviceGPU(self):
     if not test_util.is_gpu_available():
       return
@@ -1229,6 +1065,7 @@ class PartitionedCallTest(test.TestCase):
     value = self.evaluate(v.read_value())
     self.assertEqual(value, 2.0)
 
+  @test_util.run_deprecated_v1
   def testFunctionWithResourcesOnDifferentDevices(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPUs available.")
@@ -1280,6 +1117,37 @@ class PartitionedCallTest(test.TestCase):
       self.evaluate(op)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
+class FunctionalOpsCaseTest(test.TestCase):
+
+  def testCase(self):
+    @eager_function.defun
+    def two(x):
+      return x * 2
+
+    @eager_function.defun
+    def three(x):
+      return x * 3
+
+    @eager_function.defun
+    def four(x):
+      return x * 4
+
+    def f(branch, x):
+      tmpl = array_ops.zeros_like(x)
+      return array_ops.identity(gen_functional_ops.case(
+          branch, input=[x], Tout=[dtypes.float32],
+          branches=[f.get_concrete_function(tmpl)
+                    for f in (two, three, four)])[0])
+    one = array_ops.ones([])
+    self.assertAllEqual(np.float32(2), self.evaluate(f(0, one)))
+    self.assertAllEqual(np.float32(3), self.evaluate(f(1, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(2, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(-1, one)))  # <0 default
+    self.assertAllEqual(np.float32(4), self.evaluate(f(6, one)))  # >=N default
+
+
 if __name__ == "__main__":
   test.main()
 
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 76ae2fcb72f606d95a6d4523f08ecad3514eb974..ad8376b48c8f05809b310a432a12a92786aba989 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -34,7 +34,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdTest(test.TestCase):
 
   def _testSimpleDtype(self, dtype):
@@ -57,7 +56,7 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
     with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
@@ -360,7 +359,6 @@ class GatherNdTest(test.TestCase):
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class GatherNdOpBenchmark(test.Benchmark):
 
   def benchmark_gather_nd_op(self):
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 3b8a9a38de8f882d93f2146b60c2571adcb28fb9..58106a56218c897e8fad49859ac15223ba027c25 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -193,12 +193,12 @@ class GatherTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(None, gather_t.shape)
 
   def testBadIndicesCPU(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
-        array_ops.gather(params, [[7]], axis=0).eval()
+        self.evaluate(array_ops.gather(params, [[7]], axis=0))
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
-        array_ops.gather(params, [[7]], axis=1).eval()
+        self.evaluate(array_ops.gather(params, [[7]], axis=1))
 
   def _disabledTestBadIndicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
@@ -213,8 +213,6 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         array_ops.gather(params, [[7]], axis=1).eval()
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla(
-      "This test never passed for XLA")  # Different error message.
   def testBadAxis(self):
     with self.session(use_gpu=True):
       params = [0, 1, 2]
@@ -469,12 +467,12 @@ class GatherTest(test.TestCase, parameterized.TestCase):
   def testErrors(self):
 
     with self.assertRaisesRegexp(
-        ValueError, r"batch_dims = 2 must be less than ndims\(indices\) = 2"):
+        ValueError, r"batch_dims = 2 must be less than rank\(indices\) = 2"):
       array_ops.gather(
           params=[[1, 2], [3, 4]], indices=[[1, 2], [3, 4]], batch_dims=2)
 
     with self.assertRaisesRegexp(
-        ValueError, r"batch_dims = 1 must be less than ndims\(params\) = 1"):
+        ValueError, r"batch_dims = 1 must be less than rank\(params\) = 1"):
       array_ops.gather(
           params=[1, 2, 3, 4], indices=[[1, 2], [3, 4]], batch_dims=1)
 
diff --git a/tensorflow/python/kernel_tests/gradient_correctness_test.py b/tensorflow/python/kernel_tests/gradient_correctness_test.py
index 0148de5047afe6144433d69beb03e066ae395865..682566742c205b87befea023a6b0ed82df7c02ae 100644
--- a/tensorflow/python/kernel_tests/gradient_correctness_test.py
+++ b/tensorflow/python/kernel_tests/gradient_correctness_test.py
@@ -59,7 +59,7 @@ class GradientCorrectnessTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testGradientWithIntegerPath(self):
     x = constant_op.constant([3.9, 4.1])
-    k = math_ops.to_float(math_ops.to_int32(x))
+    k = math_ops.cast(math_ops.cast(x, dtypes.int32), dtypes.float32)
     y = x * k
     dy_dx, = gradients_impl.gradients(y, x)
     with self.cached_session() as sess:
@@ -68,7 +68,7 @@ class GradientCorrectnessTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testNoIntegerGradient1(self):
     x = constant_op.constant([3.9, 4.1])
-    k = math_ops.to_float(math_ops.to_int32(x))
+    k = math_ops.cast(math_ops.cast(x, dtypes.int32), dtypes.float32)
     y = k * k
     dy_dx, = gradients_impl.gradients(y, x)
     self.assertIsNone(dy_dx)
@@ -76,7 +76,7 @@ class GradientCorrectnessTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testNoIntegerGradient2(self):
     k = constant_op.constant([3, 4])
-    x = math_ops.to_float(k)
+    x = math_ops.cast(k, dtypes.float32)
     y = x * x
     dy_dk, = gradients_impl.gradients(y, k)
     self.assertIsNone(dy_dk)
@@ -106,7 +106,7 @@ class GradientCorrectnessTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testNoIntegerGradient6(self):
     k = constant_op.constant(3)
-    x = math_ops.to_float(k)
+    x = math_ops.cast(k, dtypes.float32)
     grad_1, = gradients_impl.gradients(k * k, k)
     grad_2, = gradients_impl.gradients(x * x, k)
     grad_3, = gradients_impl.gradients(math_ops.square(k), k)
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index c1aa99cc3756dcea3489d2d134d758848c84965e..53815858e4c8fc9c9dad0246f9ff9933a47459bc 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -150,7 +150,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out, b/79171797
     ],
-    xla_enable_strict_auto_jit = True,
+    xla_enable_strict_auto_jit = False,
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
index 1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95..f70d8c4e1cd557c34f07a90a39b102830d82dd0f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -114,5 +114,29 @@ class LinearOperatorAdjointTest(
     self.assertEqual("my_operator_adjoint", operator.name)
 
 
+class LinearOperatorAdjointNonSquareTest(
+    linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
+  """Tests done in the base class NonSquareLinearOperatorDerivedClassTest."""
+
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+    shape_before_adjoint = list(build_info.shape)
+    # We need to swap the last two dimensions because we are taking the adjoint
+    # of this operator
+    shape_before_adjoint[-1], shape_before_adjoint[-2] = (
+        shape_before_adjoint[-2], shape_before_adjoint[-1])
+    matrix = linear_operator_test_util.random_normal(
+        shape_before_adjoint, dtype=dtype)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    operator = LinearOperatorAdjoint(
+        linalg.LinearOperatorFullMatrix(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index ec78a3ffe0b2ae1ff5c5f6c4d73480f2ad92fd26..12da8659caca2dcbd8e981dd7124b52737bff970 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -26,15 +26,59 @@ from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
+_ADJOINTS = linear_operator_algebra._ADJOINTS
+_registered_adjoint = linear_operator_algebra._registered_adjoint
 _CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
-_MATMUL = linear_operator_algebra._MATMUL
 _registered_cholesky = linear_operator_algebra._registered_cholesky
-_registered_matmul = linear_operator_algebra._registered_matmul
 _INVERSES = linear_operator_algebra._INVERSES
 _registered_inverse = linear_operator_algebra._registered_inverse
+_MATMUL = linear_operator_algebra._MATMUL
+_registered_matmul = linear_operator_algebra._registered_matmul
 # pylint: enable=protected-access
 
 
+class AdjointTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Adjoint to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterAdjoint(CustomLinOp)
+    def _adjoint(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    self.assertEqual("OK", CustomLinOp(dtype=None).adjoint())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+  def testExactAdjointRegistrationsAllMatch(self):
+    for (k, v) in _ADJOINTS.items():
+      self.assertEqual(v, _registered_adjoint(k[0]))
+
+
 class CholeskyTest(test.TestCase):
 
   def testRegistration(self):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 96e6e3c04c77e2a32d11d72feea02c177cfa3e61..28f8d20f61515328261771684d2571b80b686c64 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -136,6 +136,27 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_block_diag_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = block_diag.LinearOperatorBlockDiag(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 4d7a31be87cf5f51d952704ee585d140c3147a3f..5c3220e60f49e872bbf2b4f2f1bb63a2271f7b1d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -187,6 +187,11 @@ class LinearOperatorDiagTest(
         linalg_lib.LinearOperatorDiag))
     self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
 
+  def test_diag_adjoint_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.adjoint(), linalg.LinearOperatorDiag)
+
   def test_diag_cholesky_type(self):
     diag = [1., 3., 5., 8.]
     operator = linalg.LinearOperatorDiag(
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index aff0b1ae14ce5bfb62ba9984f60cf30f9b553ea7..0679bdacd203e231b02ecb27a5f1f7d9c82cabba 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -106,7 +107,7 @@ class SquareLinearOperatorFullMatrixTest(
     matrix = [[1., 1.], [1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=True)
     with self.cached_session():
-      with self.assertRaisesOpError("Cholesky decomposition was not success"):
+      with self.assertRaises(errors.InvalidArgumentError):
         operator.assert_positive_definite().run()
 
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index ea9ee99a582fee6441207a5d9710571bc5fd6804..55eff59e03e83d12e7019922758ef065fe2e2812 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -259,6 +259,12 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_adjoint_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.adjoint(), linalg_lib.LinearOperatorIdentity)
+
   def test_identity_cholesky_type(self):
     operator = linalg_lib.LinearOperatorIdentity(
         num_rows=2,
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 54ccc0c5f642ad98c04174d01d9fca0c0fc056d6..166188f6cecac1c472d0855069c61fe0e2937b02 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -192,6 +192,23 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_kronecker_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = kronecker.LinearOperatorKronecker(
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index ec6906f20c706277d3a019e0ea9e7caa3f5168e3..034a51524b83e877762aac71160bc4d480ded90c 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_list_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -89,6 +90,58 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       self.evaluate(l)
 
+  def testPopUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3)
+    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, np.zeros((2, 3)))
+
+  def testPopUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    _, e = gen_list_ops.tensor_list_pop_back(
+        l, element_dtype=dtypes.float32, element_shape=[4, 3])
+    self.assertAllEqual(e, np.zeros((4, 3)))
+
+  def testPopUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.evaluate(e)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      _, e = gen_list_ops.tensor_list_pop_back(
+          l, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e)
+
+  def testPushGetGrad(self):
+    with backprop.GradientTape() as tape:
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None)
+      c0 = constant_op.constant(5.0)
+      c1 = constant_op.constant([10.0, 20.0])
+      tape.watch(c0)
+      tape.watch(c1)
+      l = list_ops.tensor_list_push_back(l, c0)
+      l = list_ops.tensor_list_push_back(l, c1)
+      t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t1), [10.0, 20.0])
+      # t1 == c1 so the gradient should be [0., [1., 1.]]
+      # This tests that the gradient of push_back correctly converts DT_INVALID
+      # tensors to zeros. The list returned by the gradient of GetItem will
+      # have only have tensor at index 1 set and others set to DT_INVALID.
+      dt0, dt1 = tape.gradient(t1, [c0, c1])
+      self.assertAllEqual(self.evaluate(dt1), [1.0, 1.0])
+      self.assertEqual(self.evaluate(dt0), 0.0)
+
   def _testStack(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
@@ -130,7 +183,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -151,7 +205,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -193,7 +249,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     l = list_ops.tensor_list_reserve(
         element_dtype=dtypes.float32, element_shape=[], num_elements=3)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(t), [0., 0., 0.])
+    self.assertAllEqual(t, [0., 0., 0.])
 
   def testStackWithUninitializedTensors(self):
     self._testStackWithUninitializedTensors()
@@ -209,7 +265,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         element_dtype=dtypes.float32, element_shape=None, num_elements=3)
     l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(t), [[0., 0.], [1., 2.], [0., 0.]])
+    self.assertAllEqual(t, [[0., 0.], [1., 2.], [0., 0.]])
 
   def testStackWithUninitializedTensorsInferShape(self):
     self._testStackWithUninitializedTensorsInferShape()
@@ -230,6 +286,13 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testStackUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_stack(
+        l, element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
   def testGatherGrad(self, max_num_elements):
@@ -268,7 +331,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -292,7 +356,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -347,37 +413,116 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     dl_length = list_ops.tensor_list_length(dl)
     self.assertAllEqual(self.evaluate(dl_length), 3)
 
+  def _testGatherWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [0., 0.])
+
+  def testGatherWithUninitializedTensors(self):
+    self._testGatherWithUninitializedTensors()
+
+  def testGatherWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensors()
+
+  def _testGatherWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_gather(l, [1, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[1., 2.], [0., 0.]])
+
+  def testGatherWithUninitializedTensorsInferShape(self):
+    self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tried to gather uninitialized tensors from a"
+        " list with non-fully-defined element_shape"):
+      t = list_ops.tensor_list_gather(l, [0], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testGatherUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_gather(
+        l, [0, 1, 2], element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
   def testScatterOutputListSize(self):
     c0 = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_scatter(
-        c0, [1, 3], ops.convert_to_tensor([], dtype=dtypes.int32))
+    l = list_ops.tensor_list_scatter(c0, [1, 3], [])
     # TensorListScatter should return a list with size largest index + 1.
-    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 4)
+    self.assertAllEqual(list_ops.tensor_list_length(l), 4)
+
+  def testScatterOutputListSizeWithNumElementsSpecified(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = gen_list_ops.tensor_list_scatter_v2(
+        c0, [1, 3], list_ops._build_element_shape([]), num_elements=5)
+    # TensorListScatter should return a list with size num_elements.
+    self.assertAllEqual(list_ops.tensor_list_length(l), 5)
+
+  def testScatterFailsWhenIndexLargerThanNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter: Trying to scatter at index 3 in list with size 3"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=3)
+      self.evaluate(l)
+
+  def testScatterFailsWithInvalidNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter expects num_elements >= -1, found: -2"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=-2)
+      self.evaluate(l)
 
   def testScatterWithInvalidRowsInInputTensorFails(self):
     c0 = constant_op.constant([1.0, 2.0])
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         "Invalid number of rows in input tensor. Expected: 3 Actual: 2"):
-      l = list_ops.tensor_list_scatter(
-          c0, [1, 0, 2], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [1, 0, 2], [])
       self.evaluate(l)
 
   def testScatterWithNegativeIndicesFails(self):
     c0 = constant_op.constant([1.0, 2.0])
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
-        "Indices in TensorListScatter must all be positive."):
-      l = list_ops.tensor_list_scatter(
-          c0, [-1, -2], ops.convert_to_tensor([], dtype=dtypes.int32))
+        "Indices in TensorListScatter must all be non-negative."):
+      l = list_ops.tensor_list_scatter(c0, [-1, -2], element_shape=[])
       self.evaluate(l)
 
+  def testScatterIntoExistingList(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_scatter(tensor=[1.], indices=[0], element_shape=[])
+    l = list_ops.tensor_list_scatter(
+        tensor=[2., 3.], indices=[1, 2], element_shape=[], input_handle=l)
+    self.assertAllEqual(
+        list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
       tape.watch(c0)
-      l = list_ops.tensor_list_scatter(
-          c0, [1, 0], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
       t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
       t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       self.assertAllEqual(self.evaluate(t0), 2.0)
@@ -386,14 +531,27 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     dt = tape.gradient(loss, c0)
     self.assertAllEqual(self.evaluate(dt), [2., 4.])
 
+  def testScatterWithPartialReadGrad(self):
+    with backprop.GradientTape() as tape:
+      c0 = constant_op.constant([1.0, 2.0])
+      tape.watch(c0)
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
+      t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t0), 2.0)
+      loss = t0 * t0
+    dt = tape.gradient(loss, c0)
+    self.assertAllEqual(self.evaluate(dt), [0., 4.])
+
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 1.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 2.0)
+    self.assertAllEqual(e, 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 1.0)
-    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
+    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
 
   def testFromTensorGPU(self):
     if not context.num_gpus():
@@ -401,7 +559,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
-  def testGetSetItem(self):
+  def testGetSet(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
@@ -414,7 +572,22 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
-      self.testGetSetItem()
+      self.testGetSet()
+
+  def testGetSetReserved(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+    e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e0, 0.0)
+    l = list_ops.tensor_list_set_item(l, 0, 3.0)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [3.0, 0.0])
+
+  def testGetSetReservedGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testGetSetReserved()
 
   def testSetGetGrad(self):
     with backprop.GradientTape() as tape:
@@ -427,6 +600,50 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertAllEqual(self.evaluate(e), 10.0)
     self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
 
+  def testGetUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 0, 5.)
+    e1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+    e2 = list_ops.tensor_list_get_item(l, 2, element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e1), 0.)
+    self.assertEqual(self.evaluate(e2), 0.)
+
+  def testGetUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    e0 = gen_list_ops.tensor_list_get_item(
+        l, 0, element_shape=[], element_dtype=dtypes.float32)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e0), 0.)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+  def testGetUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.evaluate(e0)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      e0 = gen_list_ops.tensor_list_get_item(
+          l, 0, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e0)
+
   @test_util.run_deprecated_v1
   @test_util.enable_control_flow_v2
   def testSkipEagerSetItemIndexOutOfBounds(self):
@@ -762,16 +979,25 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
               list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "element shapes are not identical at index 0"):
+    if context.executing_eagerly():
+      expected_error = (
+          errors.InvalidArgumentError,
+          "element shapes are not identical at index 0")
+    else:
+      expected_error = (ValueError, "Shapes must be equal rank")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_vec_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
                                             element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 r"input_b\[0\].dtype != element_dtype."):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError,
+                        r"input_b\[0\].dtype != element_dtype.")
+    else:
+      expected_error = (ValueError, "input_b.type != element_dtype")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_int_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
@@ -816,8 +1042,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.evaluate(
           list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "Invalid data type at index 0"):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError, "Invalid data type")
+    else:
+      expected_error = (ValueError, "wrong element dtype")
+    with self.assertRaisesRegexp(*expected_error):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
   def testZerosLike(self):
@@ -976,6 +1205,43 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(self.evaluate(result_0), [6., 8.])
     self.assertAllEqual(self.evaluate(result_1), [10., 12.])
 
+  def testAddTensorListsFailsIfLeadingDimsMismatch(self):
+    l1 = list_ops.tensor_list_reserve(
+        element_shape=[], element_dtype=dtypes.float32, num_elements=2)
+    l2 = list_ops.tensor_list_reserve(
+        element_shape=[], element_dtype=dtypes.float32, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to add two lists of tensors with different lengths"):
+      l = math_ops.add_n([l1, l2])
+      self.evaluate(list_ops.tensor_list_stack(l, element_dtype=dtypes.float32))
+
+  @test_util.run_v1_only("Uses placeholders")
+  def testSkipEagerAddTensorListsFailsIfElementShapesMismatch(self):
+    with self.cached_session() as sess:
+      # Use placeholders instead of constant values for shapes to prevent TF's
+      # shape inference from catching this early.
+      l1_element_shape = array_ops.placeholder(dtype=dtypes.int32)
+      l2_element_shape = array_ops.placeholder(dtype=dtypes.int32)
+      l1 = list_ops.tensor_list_reserve(
+          element_shape=l1_element_shape,
+          element_dtype=dtypes.float32,
+          num_elements=3)
+      l2 = list_ops.tensor_list_reserve(
+          element_shape=l2_element_shape,
+          element_dtype=dtypes.float32,
+          num_elements=3)
+      l = math_ops.add_n([l1, l2])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "Trying to add two lists of tensors with incompatible element shapes"
+      ):
+        sess.run(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32), {
+                l1_element_shape: [],
+                l2_element_shape: [2]
+            })
+
   @test_util.run_deprecated_v1
   def testSkipEagerConcatShapeInference(self):
 
@@ -1013,9 +1279,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     l = list_ops.tensor_list_push_back(l, [[0., 1.]])
     l = list_ops.tensor_list_push_back(l, [[2.], [4.]])
     with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r"Tried to concat tensors with unequal shapes: "
-        r"\[2\] vs \[1\]"):
+        errors.InvalidArgumentError, r"Incompatible shapes during merge: "
+        r"\[2\] vs. \[1\]"):
       t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -1076,6 +1341,65 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testConcatWithUninitializedTensorsUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3)
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(np.zeros((6, 3)), t)
+
+  def testConcatWithUninitializedTensorsUseProvidedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = list_ops.tensor_list_concat(
+        l, element_dtype=dtypes.float32, element_shape=(2, 3))
+    self.assertAllEqual(np.zeros((6, 3)), t)
+
+  def testConcatWithUninitializedTensorsUseProvidedElementShapeAndLengths(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t, _ = gen_list_ops.tensor_list_concat_v2(
+        l,
+        element_dtype=dtypes.float32,
+        element_shape=list_ops._build_element_shape((None, 3)),
+        leading_dims=[2, 3, 5])
+    self.assertAllEqual(np.zeros((10, 3)), t)
+    l = list_ops.tensor_list_set_item(l, 1, [[2., 3.], [4., 5.], [6., 7.]])
+    t, _ = gen_list_ops.tensor_list_concat_v2(
+        l,
+        element_dtype=dtypes.float32,
+        element_shape=list_ops._build_element_shape((None, 2)),
+        leading_dims=[2, 3, 4])
+    self.assertAllEqual([[0., 0.], [0., 0.], [2., 3.], [4., 5.], [6., 7.],
+                         [0., 0.], [0., 0.], [0., 0.], [0., 0.]], t)
+
+  def testConcatWithUninitializedTensorsInferShapeFromElements(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [[2., 3.], [4., 5.], [6., 7.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual([[0., 0.], [0., 0.], [0., 0.], [2., 3.], [4., 5.],
+                         [6., 7.], [0., 0.], [0., 0.], [0., 0.]], t)
+
+  def testConcatWithUninitializedTensorsFailsIfNoElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Trying to concat list with only uninitialized tensors "
+        r"but element_shape_except_first_dim_ is not fully defined"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatWithUninitializedTensorsFailsIfNoInputLengths(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"List contains uninitialized tensor at index 0"
+        r" but leading_dims has only 0 elements."):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
   def testEvenSplit(self):
 
     def RunTest(input_tensor, lengths, expected_stacked_output):
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index 3dd1ee33d91764e42e074fea87a40ad1e786b260..8376ec3e7a7d21acd4f6ab51745a0a36d7600db0 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -18,10 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import tempfile
 import numpy as np
+import six
 
+from tensorflow.python import tf2
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -32,199 +38,206 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
+from tensorflow.python.training.tracking import util as trackable
 
 
-class HashTableOpTest(test.TestCase):
+class BaseLookupTableTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
-  def testHashTable(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+  def getHashTable(self):
+    if tf2.enabled():
+      return lookup_ops.StaticHashTable
+    else:
+      return lookup_ops.StaticHashTableV1
 
-      self.assertAllEqual(3, table.size().eval())
+  def getVocabularyTable(self):
+    if tf2.enabled():
+      return lookup_ops.StaticVocabularyTable
+    else:
+      return lookup_ops.StaticVocabularyTableV1
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
+  def initialize_table(self, table):
+    if not tf2.enabled():
+      self.evaluate(table.initializer)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([0, 1, -1], result)
 
-      exported_keys_tensor, exported_values_tensor = table.export()
+class StaticHashTableTest(BaseLookupTableTest):
 
-      self.assertItemsEqual([b"brain", b"salad", b"surgery"],
-                            self.evaluate(exported_keys_tensor))
-      self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
+  def testStaticHashTable(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def testHashTableFindHighRank(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+    self.assertAllEqual(3, self.evaluate(table.size()))
 
-      self.assertAllEqual(3, table.size().eval())
+    input_string = constant_op.constant(["brain", "salad", "tank"])
+    output = table.lookup(input_string)
+    self.assertAllEqual([3], output.get_shape())
 
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
-      output = table.lookup(input_string)
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([[0, 1], [-1, -1]], result)
+    exported_keys_tensor, exported_values_tensor = table.export()
 
-  @test_util.run_deprecated_v1
-  def testHashTableInitWithPythonArrays(self):
-    with self.cached_session():
-      default_val = -1
-      keys = ["brain", "salad", "surgery"]
-      values = [0, 1, 2]
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(
-              keys, values, value_dtype=dtypes.int64), default_val)
-      table.initializer.run()
+    self.assertItemsEqual([b"brain", b"salad", b"surgery"],
+                          self.evaluate(exported_keys_tensor))
+    self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
-      self.assertAllEqual(3, table.size().eval())
+  def testStaticHashTableFindHighRank(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
+    self.assertAllEqual(3, self.evaluate(table.size()))
 
-      result = self.evaluate(output)
-      self.assertAllEqual([0, 1, -1], result)
+    input_string = constant_op.constant([["brain", "salad"],
+                                         ["tank", "tarkus"]])
+    output = table.lookup(input_string)
 
-  @test_util.run_deprecated_v1
-  def testHashTableInitWithNumPyArrays(self):
-    with self.cached_session():
-      default_val = -1
-      keys = np.array(["brain", "salad", "surgery"], dtype=np.str)
-      values = np.array([0, 1, 2], dtype=np.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+    result = self.evaluate(output)
+    self.assertAllEqual([[0, 1], [-1, -1]], result)
 
-      self.assertAllEqual(3, table.size().eval())
+  def testStaticHashTableInitWithPythonArrays(self):
+    default_val = -1
+    keys = ["brain", "salad", "surgery"]
+    values = [0, 1, 2]
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(
+            keys, values, value_dtype=dtypes.int64), default_val)
+    self.initialize_table(table)
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
+    self.assertAllEqual(3, self.evaluate(table.size()))
 
-      result = self.evaluate(output)
-      self.assertAllEqual([0, 1, -1], result)
+    input_string = constant_op.constant(["brain", "salad", "tank"])
+    output = table.lookup(input_string)
 
-  @test_util.run_deprecated_v1
-  def testMultipleHashTables(self):
-    with self.cached_session() as sess:
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
 
-      table1 = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table2 = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table3 = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+  def testStaticHashTableInitWithNumPyArrays(self):
+    default_val = -1
+    keys = np.array(["brain", "salad", "surgery"], dtype=np.str)
+    values = np.array([0, 1, 2], dtype=np.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual(3, table1.size().eval())
-      self.assertAllEqual(3, table2.size().eval())
-      self.assertAllEqual(3, table3.size().eval())
+    self.assertAllEqual(3, self.evaluate(table.size()))
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output1 = table1.lookup(input_string)
-      output2 = table2.lookup(input_string)
-      output3 = table3.lookup(input_string)
+    input_string = constant_op.constant(["brain", "salad", "tank"])
+    output = table.lookup(input_string)
 
-      out1, out2, out3 = self.evaluate([output1, output2, output3])
-      self.assertAllEqual([0, 1, -1], out1)
-      self.assertAllEqual([0, 1, -1], out2)
-      self.assertAllEqual([0, 1, -1], out3)
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
-  def testHashTableWithTensorDefault(self):
-    with self.cached_session():
-      default_val = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+  def testMultipleStaticHashTables(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
+    table1 = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    table2 = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    table3 = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+    self.initialize_table(table1)
+    self.initialize_table(table2)
+    self.initialize_table(table3)
+    self.assertAllEqual(3, self.evaluate(table1.size()))
+    self.assertAllEqual(3, self.evaluate(table2.size()))
+    self.assertAllEqual(3, self.evaluate(table3.size()))
+
+    input_string = constant_op.constant(["brain", "salad", "tank"])
+    output1 = table1.lookup(input_string)
+    output2 = table2.lookup(input_string)
+    output3 = table3.lookup(input_string)
+
+    out1, out2, out3 = self.evaluate([output1, output2, output3])
+    self.assertAllEqual([0, 1, -1], out1)
+    self.assertAllEqual([0, 1, -1], out2)
+    self.assertAllEqual([0, 1, -1], out3)
+
+  def testStaticHashTableWithTensorDefault(self):
+    default_val = constant_op.constant(-1, dtypes.int64)
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([0, 1, -1], result)
+    input_string = constant_op.constant(["brain", "salad", "tank"])
+    output = table.lookup(input_string)
 
-  @test_util.run_deprecated_v1
-  def testHashTableWithSparseTensorInput(self):
-    with self.cached_session() as sess:
-      default_val = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
+
+  def testStaticHashTableWithSparseTensorInput(self):
+    default_val = constant_op.constant(-1, dtypes.int64)
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      sp_indices = [[0, 0], [0, 1], [1, 0]]
-      sp_shape = [2, 2]
-      input_tensor = sparse_tensor.SparseTensor(
-          constant_op.constant(sp_indices, dtypes.int64),
-          constant_op.constant(["brain", "salad", "tank"]),
-          constant_op.constant(sp_shape, dtypes.int64))
-      output = table.lookup(input_tensor)
+    sp_indices = [[0, 0], [0, 1], [1, 0]]
+    sp_shape = [2, 2]
+    input_tensor = sparse_tensor.SparseTensor(
+        constant_op.constant(sp_indices, dtypes.int64),
+        constant_op.constant(["brain", "salad", "tank"]),
+        constant_op.constant(sp_shape, dtypes.int64))
+    output = table.lookup(input_tensor)
 
-      out_indices, out_values, out_shape = self.evaluate(output)
+    out_indices, out_values, out_shape = self.evaluate(output)
 
-      self.assertAllEqual([0, 1, -1], out_values)
-      self.assertAllEqual(sp_indices, out_indices)
-      self.assertAllEqual(sp_shape, out_shape)
+    self.assertAllEqual([0, 1, -1], out_values)
+    self.assertAllEqual(sp_indices, out_indices)
+    self.assertAllEqual(sp_shape, out_shape)
 
-  @test_util.run_deprecated_v1
   def testSignatureMismatch(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      # Ref types do not produce a lookup signature mismatch.
-      input_string_ref = variables.Variable("brain")
-      variables.global_variables_initializer().run()
-      self.assertEqual(0, table.lookup(input_string_ref).eval())
+    # Ref types do not produce a lookup signature mismatch.
+    input_string_ref = variables.Variable("brain")
+    self.evaluate(input_string_ref.initializer)
+    self.assertEqual(0, self.evaluate(table.lookup(input_string_ref)))
 
-      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
-      with self.assertRaises(TypeError):
-        table.lookup(input_string)
+    input_string = constant_op.constant([1, 2, 3], dtypes.int64)
+    with self.assertRaises(TypeError):
+      table.lookup(input_string)
 
-      with self.assertRaises(TypeError):
-        lookup_ops.HashTable(
-            lookup_ops.KeyValueTensorInitializer(keys, values), "UNK")
+    with self.assertRaises(TypeError):
+      self.getHashTable()(
+          lookup_ops.KeyValueTensorInitializer(keys, values), "UNK")
 
   def testDTypes(self):
-    with self.cached_session():
-      default_val = -1
-      with self.assertRaises(TypeError):
-        lookup_ops.HashTable(
-            lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
-                                                 dtypes.int64), default_val)
+    default_val = -1
+    with self.assertRaises(TypeError):
+      self.getHashTable()(
+          lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
+                                               dtypes.int64), default_val)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only
   def testNotInitialized(self):
     with self.cached_session():
       default_val = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(
-              ["a"], [1], value_dtype=dtypes.int64), default_val)
+      table = self.getHashTable()(
+          lookup_ops.KeyValueTensorInitializer(["a"], [1],
+                                               value_dtype=dtypes.int64),
+          default_val)
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -232,37 +245,37 @@ class HashTableOpTest(test.TestCase):
       with self.assertRaisesOpError("Table not initialized"):
         self.evaluate(output)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only
   def testInitializeTwice(self):
     with self.cached_session():
       default_val = -1
       keys = constant_op.constant(["brain", "salad", "surgery"])
       values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup_ops.HashTable(
+      table = self.getHashTable()(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.initialize_table(table)
 
       with self.assertRaisesOpError("Table already initialized"):
-        table.initializer.run()
+        self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
   def testInitializationWithInvalidDimensions(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2, 3, 4], dtypes.int64)
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2, 3, 4], dtypes.int64)
 
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    raised_error = ValueError
+    if context.executing_eagerly():
+      raised_error = errors_impl.InvalidArgumentError
+    with self.assertRaises(raised_error):
+      self.getHashTable()(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only
   def testMultipleSessions(self):
     # Start a server
-    server = server_lib.Server(
-        {
-            "local0": ["localhost:0"]
-        }, protocol="grpc", start=True)
+    server = server_lib.Server({"local0": ["localhost:0"]},
+                               protocol="grpc",
+                               start=True)
     # Create two sessions sharing the same state
     session1 = session.Session(server.target)
     session2 = session.Session(server.target)
@@ -270,40 +283,111 @@ class HashTableOpTest(test.TestCase):
     default_val = -1
     keys = constant_op.constant(["brain", "salad", "surgery"])
     values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup_ops.HashTable(
+    table = self.getHashTable()(
         lookup_ops.KeyValueTensorInitializer(keys, values),
         default_val,
         name="t1")
 
     # Init the table in the first session.
     with session1:
-      table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.initialize_table(table)
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
     # Init the table in the second session and verify that we do not get a
     # "Table already initialized" error.
     with session2:
       table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
-  def testHashTableInt32String(self):
-    with self.cached_session():
-      default_val = "n/a"
-      keys = constant_op.constant([0, 1, 2], dtypes.int32)
-      values = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+  def testStaticHashTableInt32String(self):
+    default_val = "n/a"
+    keys = constant_op.constant([0, 1, 2], dtypes.int32)
+    values = constant_op.constant(["brain", "salad", "surgery"])
+    table = self.getHashTable()(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+    self.initialize_table(table)
 
-      input_tensor = constant_op.constant([0, 1, -1])
-      output = table.lookup(input_tensor)
+    input_tensor = constant_op.constant([0, 1, -1])
+    output = table.lookup(input_tensor)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
+    result = self.evaluate(output)
+    self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
+
+  def testTableUseInFunction(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only Eager mode test.")
+    keys = constant_op.constant([0, 1, 2], dtypes.int32)
+    values = constant_op.constant(["brain", "salad", "surgery"])
+    table = self.getHashTable()(lookup_ops.KeyValueTensorInitializer(
+        keys, values), "n/a")
+
+    @function.defun()
+    def lookup_table_func(k):
+      return table.lookup(k)
+
+    result = lookup_table_func(constant_op.constant([0, 1, -1]))
+    self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
+    result = lookup_table_func(constant_op.constant([2, -1, 1]))
+    self.assertAllEqual([b"surgery", b"n/a", b"salad"], result)
+
+  def testTableCreatedInFunction(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only Eager mode test.")
+    keys = constant_op.constant([0, 1, 2], dtypes.int32)
+    values = constant_op.constant(["brain", "salad", "surgery"])
 
+    @function.defun()
+    def lookup_table_func(k):
+      table = self.getHashTable()(lookup_ops.KeyValueTensorInitializer(
+          keys, values), "n/a")
+      return table.lookup(k)
 
-class IndexTableFromFile(test.TestCase):
+    result = lookup_table_func(constant_op.constant([0, 1, -1]))
+    self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
+    result = lookup_table_func(constant_op.constant([2, -1, 1]))
+    self.assertAllEqual([b"surgery", b"n/a", b"salad"], result)
+
+
+class KeyValueTensorInitializerTest(BaseLookupTableTest):
+
+  def test_string(self):
+    init = lookup_ops.KeyValueTensorInitializer(
+        ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+    table = self.getHashTable()(init, default_value=-1)
+    self.initialize_table(table)
+
+  def test_multiple_tables(self):
+    with ops.name_scope("table_scope"):
+      init1 = lookup_ops.KeyValueTensorInitializer(
+          ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      table1 = self.getHashTable()(init1, default_value=-1)
+      if not context.executing_eagerly():
+        self.assertEqual("hash_table", table1.name)
+        self.assertEqual("table_scope/hash_table",
+                         table1.resource_handle.op.name)
+      init2 = lookup_ops.KeyValueTensorInitializer(
+          ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      table2 = self.getHashTable()(init2, default_value=-1)
+      if not context.executing_eagerly():
+        self.assertEqual("hash_table_1", table2.name)
+        self.assertEqual("table_scope/hash_table_1",
+                         table2.resource_handle.op.name)
+
+  def test_int64(self):
+    init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                dtypes.int64, dtypes.int64)
+    table = self.getHashTable()(init, default_value=-1)
+    self.initialize_table(table)
+
+  def test_int32(self):
+    init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                dtypes.int32, dtypes.int64)
+    with self.assertRaises(errors_impl.OpError):
+      table = self.getHashTable()(init, default_value=-1)
+      self.initialize_table(table)
+
+
+class InitializeTableFromFileOpTest(BaseLookupTableTest):
 
   def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
     vocabulary_file = os.path.join(self.get_temp_dir(), basename)
@@ -311,410 +395,329 @@ class IndexTableFromFile(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
-  def test_string_index_table_from_file(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
-    with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
-
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+  def testInitializeStringTable(self):
+    vocabulary_file = self._createVocabFile("one_column_1.txt")
+    default_value = -1
+    init = lookup_ops.TextFileInitializer(
+        vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+        dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+    self.assertIn("one_column_1.txt_-2_-1", init._shared_name)
+    table = self.getHashTable()(init, default_value)
+    self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def test_string_index_table_from_multicolumn_file(self):
-    vocabulary_file = self._createVocabFile(
-        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
-    with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file,
-          num_oov_buckets=1,
-          key_column_index=0,
-          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+    output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
-  def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
+  def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
-        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
+        "one_column_int64.txt", values=("42", "1", "-1000"))
+
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file,
-          num_oov_buckets=1,
-          key_column_index=0,
-          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
-          delimiter=" ")
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      default_value = -1
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertIn("one_column_int64.txt_-2_-1", init._shared_name)
+      table = self.getHashTable()(init, default_value)
+      self.initialize_table(table)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      output = table.lookup(
+          constant_op.constant((42, 1, 11), dtype=dtypes.int64))
 
-  @test_util.run_deprecated_v1
-  def test_string_index_table_from_file_tensor_filename(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
-    with self.cached_session():
-      vocabulary_file = constant_op.constant(vocabulary_file)
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
-      self.assertEqual(1,
-                       len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
+  def testInitializeIndexTable(self):
+    vocabulary_file = self._createVocabFile("one_column_2.txt")
 
-  @test_util.run_deprecated_v1
-  def test_string_index_table_from_file_placeholder_filename(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
-      vocabulary_placeholder = array_ops.placeholder(dtypes.string, [])
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_placeholder, num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, key_index, dtypes.string, value_index)
+      self.assertIn("one_column_2.txt_-1_-2", init._shared_name)
+      table = self.getHashTable()(init, default_value)
+      self.initialize_table(table)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-
-      feed_dict = {vocabulary_placeholder.name: vocabulary_file}
-      lookup_ops.tables_initializer().run(feed_dict=feed_dict)
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
-      self.assertEqual(0,
-                       len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
-
-  @test_util.run_deprecated_v1
-  def test_int32_index_table_from_file(self):
-    vocabulary_file = self._createVocabFile(
-        "f2i_vocab2.txt", values=("42", "1", "-1000"))
-    with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file,
-          num_oov_buckets=1,
-          key_dtype=dtypes.int32)
-      ids = table.lookup(
-          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
-
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      output = table.lookup(input_values)
 
-  @test_util.run_deprecated_v1
-  def test_int64_index_table_from_file(self):
-    vocabulary_file = self._createVocabFile(
-        "f2i_vocab3.txt", values=("42", "1", "-1000"))
-    with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file,
-          num_oov_buckets=1,
-          key_dtype=dtypes.int64)
-      ids = table.lookup(
-          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+      result = self.evaluate(output)
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+  def testMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_file_with_default_value(self):
-    default_value = -42
-    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, default_value=default_value)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
-
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
+      default_value = -1
+      key_index = 1
+      value_index = 2
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_file_with_oov_buckets(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
-    with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, num_oov_buckets=1000)
-      ids = table.lookup(
-          constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertIn("three_columns.txt_1_2", init._shared_name)
+      table = self.getHashTable()(init, default_value)
+      self.initialize_table(table)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual(
-          (
-              1,  # From vocabulary file.
-              2,  # From vocabulary file.
-              867,  # 3 + fingerprint("tarkus") mod 300.
-              860),  # 3 + fingerprint("toccata") mod 300.
-          self.evaluate(ids))
+      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      output = table.lookup(input_string)
 
-  def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
-    self.assertRaises(
-        ValueError, lookup_ops.index_table_from_file, vocabulary_file="")
+      result = self.evaluate(output)
+      self.assertAllEqual([1, 5, 6], result)
 
-  def test_index_table_from_file_fails_with_empty_vocabulary(self):
-    self.assertRaises(
-        ValueError, lookup_ops.index_table_from_file, vocabulary_file=None)
+  def testInvalidDataTypeInMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
 
-  def test_index_table_from_file_str_fails_with_zero_size_vocabulary(self):
-    vocabulary_file = self._createVocabFile("zero_vocab_str.txt")
-    self.assertRaisesRegexp(
-        ValueError,
-        "vocab_size must be greater than 0, got 0. "
-        "vocabulary_file: .*zero_vocab_str.txt",
-        lookup_ops.index_table_from_file,
-        vocabulary_file=vocabulary_file,
-        vocab_size=0)
+    with self.cached_session():
+      default_value = -1
+      key_index = 2
+      value_index = 1
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertIn("three_columns.txt_2_1", init._shared_name)
+      with self.assertRaisesOpError("is not a valid"):
+        table = self.getHashTable()(init, default_value)
+        self.initialize_table(table)
 
-  def test_index_table_from_file_tensor_fails_with_zero_size_vocabulary(self):
-    vocabulary_file = constant_op.constant(
-        self._createVocabFile("zero_vocab_tensor.txt"))
-    self.assertRaisesRegexp(
-        ValueError,
-        "vocab_size must be greater than 0, got 0. "
-        "vocabulary_file: .*zero_vocab_tensor.txt",
-        lookup_ops.index_table_from_file,
-        vocabulary_file=vocabulary_file,
-        vocab_size=0)
+  def testInvalidDataType(self):
+    vocabulary_file = self._createVocabFile("one_column_3.txt")
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_file_with_vocab_size_too_small(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=2)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, -1, -1), self.evaluate(ids))
-      self.assertEqual(2, table.size().eval())
+      with self.assertRaises(ValueError):
+        init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                              key_index, dtypes.string,
+                                              value_index)
+        self.assertIn("one_column_3.txt_-2_-1", init._shared_name)
+        self.getHashTable()(init, default_value)
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_file_with_vocab_size_too_large(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+  def testInvalidIndex(self):
+    vocabulary_file = self._createVocabFile("one_column_4.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", table.initializer.run)
+      default_value = -1
+      key_index = 1  # second column of the line
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertIn("one_column_4.txt_1_-1", init._shared_name)
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_file_with_vocab_size(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
+      with self.assertRaisesOpError("Invalid number of columns"):
+        table = self.getHashTable()(init, default_value)
+        self.initialize_table(table)
 
-    self.assertRaises(
-        ValueError,
-        lookup_ops.index_table_from_file,
-        vocabulary_file=vocabulary_file,
-        vocab_size=0)
+  def testInitializeSameTableWithMultipleNodes(self):
+    vocabulary_file = self._createVocabFile("one_column_5.txt")
 
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=3)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      default_value = -1
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertIn("one_column_5.txt_-2_-1", init1._shared_name)
+      table1 = self.getHashTable()(init1, default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertIn("one_column_5.txt_-2_-1", init2._shared_name)
+      table2 = self.getHashTable()(init2, default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertIn("one_column_5.txt_-2_-1", init3._shared_name)
+      table3 = self.getHashTable()(init3, default_value)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, -1), self.evaluate(ids))
-      self.assertEqual(3, table.size().eval())
+      self.evaluate(lookup_ops.tables_initializer())
 
-  def test_index_table_from_file_with_invalid_hashers(self):
-    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
-    with self.cached_session():
-      with self.assertRaises(TypeError):
-        lookup_ops.index_table_from_file(
-            vocabulary_file=vocabulary_file,
-            vocab_size=3,
-            num_oov_buckets=1,
-            hasher_spec=1)
+      input_string = constant_op.constant(["brain", "salad", "tank"])
 
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file,
-          vocab_size=3,
-          num_oov_buckets=1,
-          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
 
-      self.assertRaises(ValueError, table.lookup,
-                        constant_op.constant(["salad", "surgery", "tarkus"]))
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
 
-  def test_index_table_from_file_table_ref_with_oov_buckets(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab9.txt")
+  def testInitializeTableWithNoFilename(self):
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, num_oov_buckets=1)
-      self.assertIsNotNone(table.resource_handle)
+      default_value = -1
+      with self.assertRaises(ValueError):
+        self.getHashTable()(lookup_ops.TextFileInitializer(
+            "", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+            dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
 
-  def test_index_table_from_file_table_ref_without_oov_buckets(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab10.txt")
+  def testInitializeWithVocabSize(self):
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, num_oov_buckets=0)
-      self.assertIsNotNone(table.resource_handle)
+      default_value = -1
+      vocab_size = 3
+      vocabulary_file1 = self._createVocabFile("one_column6.txt")
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file1,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertIn("one_column6.txt_3_-2_-1", init1._shared_name)
+      table1 = self.getHashTable()(init1, default_value)
 
+      # Initialize from file.
+      self.initialize_table(table1)
+      self.assertEqual(vocab_size, self.evaluate(table1.size()))
+
+      vocabulary_file2 = self._createVocabFile("one_column7.txt")
+      vocab_size = 5
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file2,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertIn("one_column7.txt_5_-2_-1", init2._shared_name)
+      with self.assertRaisesOpError("Invalid vocab_size"):
+        table2 = self.getHashTable()(init2, default_value)
+        self.initialize_table(table2)
 
-class KeyValueTensorInitializerTest(test.TestCase):
+      vocab_size = 1
+      vocabulary_file3 = self._createVocabFile("one_column3.txt")
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file3,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertIn("one_column3.txt_1_-2_-1", init3._shared_name)
+      table3 = self.getHashTable()(init3, default_value)
 
-  def test_string(self):
-    with ops.Graph().as_default(), self.cached_session():
-      init = lookup_ops.KeyValueTensorInitializer(
-          ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
-      self.assertEqual("", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value=-1)
-      table.initializer.run()
+      # Smaller vocab size reads only vocab_size records.
+      self.initialize_table(table3)
+      self.assertEqual(vocab_size, self.evaluate(table3.size()))
 
-  def test_multiple_tables(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with ops.name_scope("table_scope"):
-        init1 = lookup_ops.KeyValueTensorInitializer(
-            ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
-            dtypes.int64)
-        self.assertEqual("", init1._shared_name)
-        table1 = lookup_ops.HashTable(init1, default_value=-1)
-        self.assertEquals("hash_table", table1.name)
-        self.assertEquals("table_scope/hash_table",
-                          table1.resource_handle.op.name)
-        init2 = lookup_ops.KeyValueTensorInitializer(
-            ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
-            dtypes.int64)
-        self.assertEqual("", init2._shared_name)
-        table2 = lookup_ops.HashTable(init2, default_value=-1)
-        self.assertEquals("hash_table_1", table2.name)
-        self.assertEquals("table_scope/hash_table_1",
-                          table2.resource_handle.op.name)
+  @test_util.run_v1_only("placeholder usage")
+  def testFeedVocabularyName(self):
+    vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
 
-  def test_int64(self):
-    with ops.Graph().as_default(), self.cached_session():
-      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
-                                                  dtypes.int64, dtypes.int64)
-      self.assertEqual("", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value=-1)
-      table.initializer.run()
+    with self.cached_session():
+      default_value = -1
+      init = lookup_ops.TextFileInitializer(
+          "old_file.txt", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertIn("old_file.txt_-2_-1", init._shared_name)
+      table = self.getHashTable()(init, default_value)
 
-  @test_util.run_deprecated_v1
-  def test_int32(self):
-    with ops.Graph().as_default(), self.cached_session():
-      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
-                                                  dtypes.int32, dtypes.int64)
-      self.assertEqual("", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value=-1)
-      with self.assertRaisesRegexp(
-          errors_impl.OpError, "No OpKernel was registered"):
+      # Initialize with non existing file (old_file.txt) should fail.
+      # TODO(yleon): Update message, which might change per FileSystem.
+      with self.assertRaisesOpError("old_file.txt"):
         table.initializer.run()
 
+      # Initialize the model feeding the vocabulary file.
+      filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      table.initializer.run(feed_dict={filenames[0]: vocabulary_file})
 
-class IndexTableFromTensor(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
-  def test_index_table_from_tensor_with_tensor_init(self):
-    table = lookup_ops.index_table_from_tensor(
-        vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
 
-    if not context.executing_eagerly():
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(
-            table.lookup(constant_op.constant(("salad", "surgery", "tarkus"))))
-    else:
-      # Reinitializing a table in eager should work.
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
-    self.evaluate(lookup_ops.tables_initializer())
-    ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
-    self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInvalidFilenames(self):
+    vocabulary_file = self._createVocabFile("filename_shape.txt")
 
-  @test_util.run_deprecated_v1
-  def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32)
-      ids = table.lookup(
-          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+      default_value = -1
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      # Invalid data type
+      other_type = constant_op.constant(1)
+      with self.assertRaises(Exception) as cm:
+        self.getHashTable()(lookup_ops.TextFileInitializer(
+            other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+            dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      self.assertIsInstance(cm.exception, (ValueError, TypeError))
 
-  @test_util.run_deprecated_v1
-  def test_int64_index_table_from_tensor_with_tensor_init(self):
+      # Non-scalar filename
+      filenames = constant_op.constant([vocabulary_file, vocabulary_file])
+      if not context.executing_eagerly():
+        with self.assertRaises(Exception) as cm:
+          self.getHashTable()(lookup_ops.TextFileInitializer(
+              filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+                              default_value)
+        self.assertIsInstance(cm.exception, (ValueError, TypeError))
+      else:
+        with self.assertRaises(errors_impl.InvalidArgumentError):
+          self.getHashTable()(lookup_ops.TextFileInitializer(
+              filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+                              default_value)
+
+  def testIdToStringTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64)
-      ids = table.lookup(
-          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+      default_value = "UNK"
+      vocab_size = 3
+      init = lookup_ops.TextFileStringTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_1.txt_3_-1_-2", init._shared_name)
+      table = self.getHashTable()(init, default_value)
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_tensor_with_default_value(self):
-    default_value = -42
-    with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=["brain", "salad", "surgery"],
-          default_value=default_value)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
+      out = table.lookup(input_values)
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
+                          self.evaluate(out))
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
-  def test_index_table_from_tensor_missing_vocabulary_list(self):
+  def testStringToIdTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt")
     with self.cached_session():
-      with self.assertRaisesRegexp(ValueError,
-                                   "vocabulary_list must be specified"):
-        lookup_ops.index_table_from_tensor(
-            vocabulary_list=None, num_oov_buckets=1)
+      default_value = -1
+      vocab_size = 3
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_2.txt_3_-1_-2", init._shared_name)
+      table = self.getHashTable()(init, default_value)
+      self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def test_index_table_from_tensor_empty_vocabulary_list(self):
-    with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      with self.assertRaisesRegexp(
-          errors_impl.OpError, "keys and values cannot be empty"):
-        lookup_ops.tables_initializer().run()
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
-  def test_index_table_from_tensor_with_invalid_hashers(self):
-    with self.cached_session():
-      with self.assertRaises(TypeError):
-        lookup_ops.index_table_from_tensor(
-            vocabulary_list=["brain", "salad", "surgery"],
-            num_oov_buckets=1,
-            hasher_spec=1)
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=["brain", "salad", "surgery"],
-          num_oov_buckets=1,
-          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+  def testInt64ToIdTable(self):
+    vocab_file = self._createVocabFile(
+        "feat_to_id_3.txt", values=("42", "1", "-1000"))
+    with self.cached_session():
+      default_value = -1
+      vocab_size = 3
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64)
+      self.assertTrue("feat_to_id_3.txt_3_-1_-2", init._shared_name)
+      table = self.getHashTable()(init, default_value)
+      self.initialize_table(table)
 
-      self.assertRaises(ValueError, table.lookup,
-                        constant_op.constant(["salad", "surgery", "tarkus"]))
+      out = table.lookup(
+          constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
+      self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
 
-class IndexToStringTableFromFileTest(test.TestCase):
+class StaticVocabularyTableTest(BaseLookupTableTest):
 
   def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
     vocabulary_file = os.path.join(self.get_temp_dir(), basename)
@@ -722,497 +725,1420 @@ class IndexToStringTableFromFileTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table(self):
-    vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
-    # vocabulary_file supports string and tensor
-    type_funcs = [str, constant_op.constant]
-    for type_func in type_funcs:
-      vocabulary_file = type_func(vocabulary_path)
-      with self.cached_session():
-        table = lookup_ops.index_to_string_table_from_file(
-            vocabulary_file=vocabulary_file)
-        features = table.lookup(
-            constant_op.constant([0, 1, 2, 3], dtypes.int64))
-        with self.assertRaises(errors_impl.OpError):
-          self.evaluate(features)
-        lookup_ops.tables_initializer().run()
-        self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                            self.evaluate(features))
+  def testStringStaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    vocab_size = 3
+    oov_buckets = 1
+    table = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer(
+        vocab_file, vocab_size=vocab_size), oov_buckets)
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_from_multicolumn_file(self):
-    vocabulary_file = self._createVocabFile(
-        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file,
-          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
-          value_column_index=0)
-      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          self.evaluate(features))
+    self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
-    vocabulary_file = self._createVocabFile(
-        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file,
-          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
-          value_column_index=0,
-          delimiter=" ")
-      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          self.evaluate(features))
+    input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_with_default_value(self):
-    default_value = b"NONE"
-    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file, default_value=default_value)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", default_value),
-                          self.evaluate(features))
+    out = table.lookup(input_string)
+    self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+    self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_with_vocab_size_too_small(self):
-    default_value = b"NONE"
-    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file,
-          vocab_size=2,
-          default_value=default_value)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", default_value, default_value),
-                          self.evaluate(features))
+  def testInt32StaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
+    vocab_size = 3
+    oov_buckets = 1
+    table = self.getVocabularyTable()(
+        lookup_ops.TextFileIdTableInitializer(
+            vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+        oov_buckets,
+        lookup_key_dtype=dtypes.int32)
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_with_vocab_size_too_large(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+    self.initialize_table(table)
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      init = lookup_ops.tables_initializer()
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", init.run)
+    values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_with_vocab_size(self):
-    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
-    with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=3)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+    out = table.lookup(values)
+    self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+    self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", b"UNK"),
-                          self.evaluate(features))
+  def testInt64StaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
+    vocab_size = 3
+    oov_buckets = 1
+    table = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer(
+        vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64), oov_buckets)
 
+    self.initialize_table(table)
 
-class IndexToStringTableFromTensorTest(test.TestCase):
+    values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_table_from_tensor(self):
-    with self.cached_session():
-      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup_ops.index_to_string_table_from_tensor(
-          vocabulary_list=vocabulary_list)
+    out = table.lookup(values)
+    self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+    self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
 
-      indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+  def testStringStaticVocabularyTableNoInitializer(self):
+    oov_buckets = 5
 
-      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          self.evaluate(features))
+    # Set a table that only uses hash buckets, for each input value returns
+    # an id calculated by fingerprint("input") mod oov_buckets.
+    table = self.getVocabularyTable()(None, oov_buckets)
+    self.initialize_table(table)
 
-  @test_util.run_deprecated_v1
-  def test_duplicate_entries(self):
-    with self.cached_session():
-      vocabulary_list = constant_op.constant(["hello", "hello"])
-      table = lookup_ops.index_to_string_table_from_tensor(
-          vocabulary_list=vocabulary_list)
-      indices = constant_op.constant([0, 1, 4], dtypes.int64)
-      features = table.lookup(indices)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
+    values = constant_op.constant(("brain", "salad", "surgery"))
 
-  @test_util.run_deprecated_v1
-  def test_index_to_string_with_default_value(self):
-    default_value = b"NONE"
-    with self.cached_session():
-      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup_ops.index_to_string_table_from_tensor(
-          vocabulary_list=vocabulary_list, default_value=default_value)
-      indices = constant_op.constant([1, 2, 4], dtypes.int64)
-      features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
+    out = table.lookup(values)
+    self.assertAllEqual(
+        [
+            3,  # fingerprint("brain") mod 5.
+            1,  # fingerprint("salad") mod 5.
+            4  # fingerprint("surgery") mod 5
+        ],
+        self.evaluate(out))
+    self.assertEqual(oov_buckets, self.evaluate(table.size()))
 
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", default_value),
-                          self.evaluate(features))
+  def testStaticVocabularyTableWithMultipleInitializers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    vocab_size = 3
+    oov_buckets = 3
 
+    init = lookup_ops.TextFileIdTableInitializer(
+        vocab_file, vocab_size=vocab_size)
+    table1 = self.getVocabularyTable()(init, oov_buckets, name="table1")
 
-class InitializeTableFromFileOpTest(test.TestCase):
+    table2 = self.getVocabularyTable()(init, oov_buckets, name="table2")
 
-  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
-    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
-    with open(vocabulary_file, "w") as f:
-      f.write("\n".join(values) + "\n")
-    return vocabulary_file
+    self.evaluate(lookup_ops.tables_initializer())
 
-  @test_util.run_in_graph_and_eager_modes
-  def testInitializeStringTable(self):
-    vocabulary_file = self._createVocabFile("one_column_1.txt")
-    default_value = -1
-    init = lookup_ops.TextFileInitializer(
-        vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-        dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-    self.assertTrue("one_column_1.txt_-2_-1" in init._shared_name)
-    table = lookup_ops.HashTable(init, default_value)
-    self.evaluate(table.initializer)
+    input_string = constant_op.constant(
+        ["fruit", "brain", "salad", "surgery", "UNK"])
 
-    output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
-
-    result = self.evaluate(output)
-    self.assertAllEqual([0, 1, -1], result)
+    out1 = table1.lookup(input_string)
+    out2 = table2.lookup(input_string)
 
-  @test_util.run_deprecated_v1
-  def testInitializeInt64Table(self):
-    vocabulary_file = self._createVocabFile(
-        "one_column_int64.txt", values=("42", "1", "-1000"))
+    out1, out2 = self.evaluate([out1, out2])
+    self.assertAllEqual([5, 0, 1, 2, 5], out1)
+    self.assertAllEqual([5, 0, 1, 2, 5], out2)
+    self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size()))
+    self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size()))
 
+  def testStaticVocabularyTableInitializationAcrossSessions(self):
+    vocab_file = self._createVocabFile("feat_to_id_5.txt")
     with self.cached_session():
-      default_value = -1
-      init = lookup_ops.TextFileInitializer(
-          vocabulary_file, dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-      self.assertTrue("one_column_int64.txt_-2_-1" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      table.initializer.run()
+      vocab_size = 3
+      oov_buckets = 1
+      table1 = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size), oov_buckets)
 
-      output = table.lookup(
-          constant_op.constant((42, 1, 11), dtype=dtypes.int64))
+      self.initialize_table(table1)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([0, 1, -1], result)
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
 
-  @test_util.run_deprecated_v1
-  def testInitializeIndexTable(self):
-    vocabulary_file = self._createVocabFile("one_column_2.txt")
+      out1 = table1.lookup(input_string_1)
+
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size()))
 
     with self.cached_session():
-      default_value = "UNK"
-      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
-      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
-      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                            key_index, dtypes.string,
-                                            value_index)
-      self.assertTrue("one_column_2.txt_-1_-2" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      table.initializer.run()
+      vocab_size = 3
+      oov_buckets = 1
 
-      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      output = table.lookup(input_values)
+      # Underlying lookup table already initialized in previous session.
+      # No need to initialize table2
+      table2 = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size), oov_buckets)
 
-      result = self.evaluate(output)
-      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
 
-  @test_util.run_deprecated_v1
-  def testMultiColumn(self):
-    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
-    with open(vocabulary_file, "w") as f:
-      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+      out2 = table2.lookup(input_string_2)
+
+      self.assertAllEqual([3, 1, 3], self.evaluate(out2))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size()))
+
+  def testSparseTensor(self):
+    vocab_file = self._createVocabFile("feat_to_id_7.txt")
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    sp_features = sparse_tensor.SparseTensor(
+        constant_op.constant(input_indices, dtypes.int64),
+        constant_op.constant(["brain", "salad", "brain", "surgery", "tarkus"],
+                             dtypes.string),
+        constant_op.constant(input_shape, dtypes.int64))
+
+    table = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer(
+        vocab_file, vocab_size=3), 1)
+    self.initialize_table(table)
+
+    sp_ids = table.lookup(sp_features)
+
+    self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+    sp_ids_ind, sp_ids_val, sp_ids_shape = self.evaluate(
+        [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+    self.assertAllEqual(input_indices, sp_ids_ind)
+    self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+    self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt32SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    sp_features = sparse_tensor.SparseTensor(
+        constant_op.constant(input_indices, dtypes.int64),
+        constant_op.constant([42, 1, 42, -1000, 11], dtypes.int32),
+        constant_op.constant(input_shape, dtypes.int64))
+
+    table = self.getVocabularyTable()(
+        lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                             dtypes.int64, dtypes.int64),
+        1,
+        lookup_key_dtype=dtypes.int32)
+    self.initialize_table(table)
+
+    sp_ids = table.lookup(sp_features)
+
+    self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+    sp_ids_ind, sp_ids_val, sp_ids_shape = self.evaluate(
+        [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+    self.assertAllEqual(input_indices, sp_ids_ind)
+    self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+    self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt64SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    sp_features = sparse_tensor.SparseTensor(
+        constant_op.constant(input_indices, dtypes.int64),
+        constant_op.constant([42, 1, 42, -1000, 11], dtypes.int64),
+        constant_op.constant(input_shape, dtypes.int64))
+
+    table = self.getVocabularyTable()(lookup_ops.KeyValueTensorInitializer(
+        (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), 1)
+    self.initialize_table(table)
+
+    sp_ids = table.lookup(sp_features)
+
+    self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+    sp_ids_ind, sp_ids_val, sp_ids_shape = self.evaluate(
+        [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+    self.assertAllEqual(input_indices, sp_ids_ind)
+    self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+    self.assertAllEqual(input_shape, sp_ids_shape)
 
+  def testStaticVocabularyTableNoInnerTable(self):
+    table = self.getVocabularyTable()(None, num_oov_buckets=1)
+    self.assertIsNone(table.resource_handle)
+
+
+class DenseHashTableOpTest(test.TestCase):
+
+  def testBasic(self):
     with self.cached_session():
-      default_value = -1
-      key_index = 1
-      value_index = 2
 
-      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                            key_index, dtypes.int64,
-                                            value_index)
-      self.assertTrue("three_columns.txt_1_2" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      table.initializer.run()
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
 
-      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([12, 15], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
       output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
 
       result = self.evaluate(output)
-      self.assertAllEqual([1, 5, 6], result)
-
-  @test_util.run_deprecated_v1
-  def testInvalidDataTypeInMultiColumn(self):
-    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
-    with open(vocabulary_file, "w") as f:
-      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+      self.assertAllEqual([0, -1, -1], result)
 
+  def testBasicBool(self):
     with self.cached_session():
-      default_value = -1
-      key_index = 2
-      value_index = 1
-      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                            key_index, dtypes.int64,
-                                            value_index)
-      self.assertTrue("three_columns.txt_2_1" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      with self.assertRaisesOpError("is not a valid"):
-        table.initializer.run()
 
-  def testInvalidDataType(self):
-    vocabulary_file = self._createVocabFile("one_column_3.txt")
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([True, True, True, True], dtypes.bool)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.bool,
+          default_value=False,
+          empty_key=0,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([11, 15], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([False, True, False], result)
 
+  def testSameEmptyAndDeletedKey(self):
     with self.cached_session():
-      default_value = "UNK"
-      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
-      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys"):
+        table = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, self.evaluate(table.size()))
 
-      with self.assertRaises(ValueError):
-        init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                              key_index, dtypes.string,
-                                              value_index)
-        self.assertTrue("one_column_3.txt_-2_-1" in init._shared_name)
-        lookup_ops.HashTable(init, default_value)
+  @test_util.run_v1_only("uses placeholders")
+  def testLookupUnknownShape(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      placeholder_keys = array_ops.placeholder(dtypes.int64)
+      output = table.lookup(placeholder_keys)
+      self.assertAllEqual(None, output.get_shape())
+      result = output.eval({placeholder_keys: [11, 12, 15]})
+      self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
-  def testInvalidIndex(self):
-    vocabulary_file = self._createVocabFile("one_column_4.txt")
+  def testMapStringToFloat(self):
     with self.cached_session():
-      default_value = -1
-      key_index = 1  # second column of the line
-      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
-      init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                            key_index, dtypes.int64,
-                                            value_index)
-      self.assertTrue("one_column_4.txt_1_-1" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
 
-      with self.assertRaisesOpError("Invalid number of columns"):
-        table.initializer.run()
+      keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string)
+      values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32)
+      default_value = constant_op.constant(-1.5, dtypes.float32)
+      table = lookup_ops.DenseHashTable(
+          dtypes.string,
+          dtypes.float32,
+          default_value=default_value,
+          empty_key="",
+          deleted_key="$")
+      self.assertAllEqual(0, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
-  def testInitializeSameTableWithMultipleNodes(self):
-    vocabulary_file = self._createVocabFile("one_column_5.txt")
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
 
-    with self.cached_session():
-      shared_name = "shared-one-columm"
-      default_value = -1
-      init1 = lookup_ops.TextFileInitializer(
-          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-      self.assertTrue("one_column_5.txt_-2_-1" in init1._shared_name)
-      table1 = lookup_ops.HashTable(init1, default_value,
-                                    shared_name=shared_name)
-      init2 = lookup_ops.TextFileInitializer(
-          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-      self.assertTrue("one_column_5.txt_-2_-1" in init2._shared_name)
-      table2 = lookup_ops.HashTable(init2, default_value,
-                                    shared_name=shared_name)
-      init3 = lookup_ops.TextFileInitializer(
-          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-      self.assertTrue("one_column_5.txt_-2_-1" in init3._shared_name)
-      table3 = lookup_ops.HashTable(init3, default_value,
-                                    shared_name=shared_name)
+      remove_string = constant_op.constant(["b", "e"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-      lookup_ops.tables_initializer().run()
+      input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4], output.get_shape())
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
+      result = self.evaluate(output)
+      self.assertAllClose([0, -1.5, 3.3, -1.5], result)
 
-      output1 = table1.lookup(input_string)
-      output2 = table2.lookup(input_string)
-      output3 = table3.lookup(input_string)
+  def testMapInt64ToFloat(self):
+    for float_dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
 
-      out1, out2, out3 = self.evaluate([output1, output2, output3])
-      self.assertAllEqual([0, 1, -1], out1)
-      self.assertAllEqual([0, 1, -1], out2)
-      self.assertAllEqual([0, 1, -1], out3)
+        keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+        values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype)
+        default_value = constant_op.constant(-1.5, float_dtype)
+        table = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            float_dtype,
+            default_value=default_value,
+            empty_key=0,
+            deleted_key=-1)
+        self.assertAllEqual(0, self.evaluate(table.size()))
+
+        self.evaluate(table.insert(keys, values))
+        self.assertAllEqual(4, self.evaluate(table.size()))
+
+        remove_string = constant_op.constant([12, 15], dtypes.int64)
+        self.evaluate(table.remove(remove_string))
+        self.assertAllEqual(3, self.evaluate(table.size()))
+
+        input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
+        output = table.lookup(input_string)
+        self.assertAllEqual([4], output.get_shape())
+
+        result = self.evaluate(output)
+        self.assertAllClose([0, -1.5, 3.3, -1.5], result)
+
+  def testVectorValues(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]],
+                                    dtypes.int64)
+      default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=4)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      self.evaluate(
+          table.insert(
+              constant_op.constant([14], dtypes.int64),
+              constant_op.constant([[2, 3, 4, 5]], dtypes.int64)))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      remove_string = constant_op.constant([12, 16], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4, 4],
+                          output.shape,
+                          msg="Saw shape: %s" % output.shape)
 
-  def testInitializeTableWithNoFilename(self):
-    with self.cached_session():
-      default_value = -1
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(
-                "", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
-            default_value)
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]],
+          result)
+
+  def testVectorKeys(self):
+    with self.cached_session():
+      keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64)
+      values = constant_op.constant([10, 11, 12], dtypes.int64)
+      empty_key = constant_op.constant([0, 3], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      self.evaluate(
+          table.insert(
+              constant_op.constant([[0, 0]], dtypes.int64),
+              constant_op.constant([13], dtypes.int64)))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]],
+                                          dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4], output.get_shape())
 
-  @test_util.run_deprecated_v1
-  def testInitializeWithVocabSize(self):
+      result = self.evaluate(output)
+      self.assertAllEqual([10, -1, 12, -1], result)
+
+  def testResize(self):
     with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=4)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      keys2 = constant_op.constant([12, 99], dtypes.int64)
+      self.evaluate(table.remove(keys2))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
+      values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
+
+      self.evaluate(table.insert(keys3, values3))
+      self.assertAllEqual(6, self.evaluate(table.size()))
+      self.assertAllEqual(16, len(self.evaluate(table.export()[0])))
+
+      keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
+                                   dtypes.int64)
+      output = table.lookup(keys4)
+      self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], self.evaluate(output))
+
+  def testExport(self):
+    with self.cached_session():
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([1, 2, 3, 4], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=100,
+          deleted_key=200,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      self.evaluate(table.remove(keys2))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      exported_keys, exported_values = table.export()
+
+      np_keys = self.evaluate(exported_keys)
+      np_values = self.evaluate(exported_values)
+
+      self.assertAllEqual(8, len(np_keys))
+      self.assertAllEqual(8, len(np_values))
+
+      # pair up keys and values, drop extra added dimension
+      pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0]
+      # sort by key
+      pairs = pairs[pairs[:, 0].argsort()]
+      self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0],
+                           [100, 0], [100, 0], [200, 2]], pairs)
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
       default_value = -1
-      vocab_size = 3
-      vocabulary_file1 = self._createVocabFile("one_column6.txt")
-      init1 = lookup_ops.TextFileInitializer(
-          vocabulary_file1, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
-          vocab_size=vocab_size)
-      self.assertTrue("one_column6.txt_3_-2_-1" in init1._shared_name)
-      table1 = lookup_ops.HashTable(init1, default_value)
+      empty_key = 0
+      deleted_key = -1
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertIsInstance(val, six.string_types)
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([11, 14], dtypes.int64),
+          constant_op.constant([12, 24], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
 
-      # Initialize from file.
-      table1.initializer.run()
-      self.assertEquals(vocab_size, table1.size().eval())
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
 
-      vocabulary_file2 = self._createVocabFile("one_column7.txt")
-      vocab_size = 5
-      init2 = lookup_ops.TextFileInitializer(
-          vocabulary_file2, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
-          vocab_size=vocab_size)
-      self.assertTrue("one_column7.txt_5_-2_-1" in init2._shared_name)
-      table2 = lookup_ops.HashTable(init2, default_value)
-      with self.assertRaisesOpError("Invalid vocab_size"):
-        table2.initializer.run()
+      input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([-1, 0, -1, 2, 3], output.eval())
 
-      vocab_size = 1
-      vocabulary_file3 = self._createVocabFile("one_column3.txt")
-      init3 = lookup_ops.TextFileInitializer(
-          vocabulary_file3, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
-          vocab_size=vocab_size)
-      self.assertTrue("one_column3.txt_1_-2_-1" in init3._shared_name)
-      table3 = lookup_ops.HashTable(init3, default_value)
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-      # Smaller vocab size reads only vocab_size records.
-      table3.initializer.run()
-      self.assertEquals(vocab_size, table3.size().eval())
+    default_value = -1
+    empty_key = 0
+    deleted_key = -1
+    keys = constant_op.constant([11, 12, 13], dtypes.int64)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    save_table = lookup_ops.DenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        deleted_key=deleted_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=32)
+
+    save_checkpoint = trackable.Checkpoint(table=save_table)
+
+    self.assertAllEqual(0, self.evaluate(save_table.size()))
+    self.evaluate(save_table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(save_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(save_table.export()[0])))
+
+    save_path = save_checkpoint.save(save_prefix)
+    del save_table, save_checkpoint
+
+    load_table = lookup_ops.DenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        deleted_key=deleted_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=64)
+    self.evaluate(
+        load_table.insert(
+            constant_op.constant([11, 14], dtypes.int64),
+            constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(load_table.size()))
+    self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
+
+    restore_checkpoint = trackable.Checkpoint(table=load_table)
+
+    # Restore the saved values in the parameter nodes.
+    restore_checkpoint.restore(save_path).run_restore_ops()
+
+    self.assertAllEqual(3, self.evaluate(load_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(load_table.export()[0])))
+
+    input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
+    output = load_table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testVectorSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "vector_save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
+      default_value = constant_op.constant([-1, -2], dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([[0, 1], [2, 3], [2, 4], [4, 5]],
+                                    dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [16, 17]], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertIsInstance(val, six.string_types)
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
+      default_value = constant_op.constant([-1, -2], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
+          constant_op.constant([[21, 22], [23, 24]], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
 
-  @test_util.run_deprecated_v1
-  def testFeedVocabularyName(self):
-    vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
 
-    with self.cached_session():
-      default_value = -1
-      init = lookup_ops.TextFileInitializer(
-          "old_file.txt", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
-      self.assertTrue("old_file.txt_-2_-1" in init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
+      input_string = constant_op.constant(
+          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([[0, 1], [2, 3], [-1, -2], [4, 5], [-1, -2]],
+                          output.eval())
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testVectorScalarSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "vector_scalar_save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t2",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [15, 16]], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertIsInstance(val, six.string_types)
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t2",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
+          constant_op.constant([3, 4], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
 
-      # Initialize with non existing file (old_file.txt) should fail.
-      # TODO(yleon): Update message, which might change per FileSystem.
-      with self.assertRaisesOpError("old_file.txt"):
-        table.initializer.run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
 
-      # Initialize the model feeding the vocabulary file.
-      filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      table.initializer.run(feed_dict={filenames[0]: vocabulary_file})
+      input_string = constant_op.constant(
+          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([0, 1, -1, 3, -1], output.eval())
+
+  def testReprobe(self):
+    with self.cached_session():
+      # Insert 6 keys into a table with 8 buckets.
+      # The values are chosen to make sure collisions occur when using GCC STL
+      keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64)
+      values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(6, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22],
+                                          dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([9], output.get_shape())
 
-      input_string = constant_op.constant(["brain", "salad", "tank"])
+      result = self.evaluate(output)
+      self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result)
+
+  def testCustomEmptyKey(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 0, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=12,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 0, 15], dtypes.int64)
       output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
 
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
-  def testInvalidFilenames(self):
-    vocabulary_file = self._createVocabFile("filename_shape.txt")
+  def testErrors(self):
+    with self.cached_session():
+      table = lookup_ops.DenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+
+      # Inserting the empty key returns an error
+      keys1 = constant_op.constant([11, 0], dtypes.int64)
+      values1 = constant_op.constant([0, 1], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "empty_key"):
+        self.evaluate(table.insert(keys1, values1))
+
+      # Looking up the empty key returns an error
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "empty_key"):
+        self.evaluate(table.lookup(keys1))
+
+      # Inserting the deleted key returns an error
+      keys2 = constant_op.constant([11, -1], dtypes.int64)
+      values2 = constant_op.constant([0, 1], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        self.evaluate(table.insert(keys2, values2))
+
+      # Looking up the empty key returns an error
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        self.evaluate(table.lookup(keys2))
+
+      # Arbitrary tensors of keys are not supported
+      keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
+      values = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Expected key shape"):
+        self.evaluate(table.lookup(keys))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Expected key shape"):
+        self.evaluate(table.insert(keys, values))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Number of buckets must be"):
+        table2 = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=17,
+            deleted_key=-1,
+            initial_num_buckets=12)
+        self.assertAllEqual(0, self.evaluate(table2.size()))
 
-    with self.cached_session():
-      default_value = -1
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Empty and deleted keys must have same shape"):
+        table3 = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=[1, 2])
+        self.assertAllEqual(0, self.evaluate(table3.size()))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table4 = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, self.evaluate(table4.size()))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table5 = lookup_ops.DenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=[1, 2, 3],
+            deleted_key=[1, 2, 3])
+        self.assertAllEqual(0, self.evaluate(table5.size()))
 
-      # Invalid data type
-      other_type = constant_op.constant(1)
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(
-                other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
-            default_value)
 
-      # Non-scalar filename
-      filenames = constant_op.constant([vocabulary_file, vocabulary_file])
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(
-                filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
-            default_value)
+class IndexTableFromFile(test.TestCase):
 
-  @test_util.run_deprecated_v1
-  def testIdToStringTable(self):
-    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def test_string_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
-      default_value = "UNK"
-      vocab_size = 3
-      init = lookup_ops.TextFileStringTableInitializer(
-          vocab_file, vocab_size=vocab_size)
-      self.assertTrue("feat_to_id_1.txt_3_-1_-2", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      table.initializer.run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+  def test_string_index_table_from_multicolumn_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_column_index=0,
+          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      out = table.lookup(input_values)
-      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
-                          self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
-  def testStringToIdTable(self):
-    vocab_file = self._createVocabFile("feat_to_id_2.txt")
+  def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
     with self.cached_session():
-      default_value = -1
-      vocab_size = 3
-      init = lookup_ops.TextFileIdTableInitializer(
-          vocab_file, vocab_size=vocab_size)
-      self.assertTrue("feat_to_id_2.txt_3_-1_-2", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      table.initializer.run()
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_column_index=0,
+          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          delimiter=" ")
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_string_index_table_from_file_tensor_filename(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
+    with self.cached_session():
+      vocabulary_file = constant_op.constant(vocabulary_file)
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      if not context.executing_eagerly():
+        self.assertEqual(1,
+                         len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
+
+  @test_util.run_v1_only("placeholder usage")
+  def test_string_index_table_from_file_placeholder_filename(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
+    with self.cached_session():
+      vocabulary_placeholder = array_ops.placeholder(dtypes.string, [])
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_placeholder, num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
+
+      feed_dict = {vocabulary_placeholder.name: vocabulary_file}
+      lookup_ops.tables_initializer().run(feed_dict=feed_dict)
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+      self.assertEqual(0,
+                       len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
+
+  def test_int32_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab2.txt", values=("42", "1", "-1000"))
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_int64_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab3.txt", values=("42", "1", "-1000"))
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_index_table_from_file_with_default_value(self):
+    default_value = -42
+    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
+
+  def test_index_table_from_file_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1000)
+      ids = table.lookup(
+          constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual(
+          (
+              1,  # From vocabulary file.
+              2,  # From vocabulary file.
+              867,  # 3 + fingerprint("tarkus") mod 300.
+              860),  # 3 + fingerprint("toccata") mod 300.
+          self.evaluate(ids))
+
+  def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
+    self.assertRaises(
+        ValueError, lookup_ops.index_table_from_file, vocabulary_file="")
+
+  def test_index_table_from_file_fails_with_empty_vocabulary(self):
+    self.assertRaises(
+        ValueError, lookup_ops.index_table_from_file, vocabulary_file=None)
+
+  def test_index_table_from_file_str_fails_with_zero_size_vocabulary(self):
+    vocabulary_file = self._createVocabFile("zero_vocab_str.txt")
+    self.assertRaisesRegexp(
+        ValueError,
+        "vocab_size must be greater than 0, got 0. "
+        "vocabulary_file: .*zero_vocab_str.txt",
+        lookup_ops.index_table_from_file,
+        vocabulary_file=vocabulary_file,
+        vocab_size=0)
+
+  def test_index_table_from_file_tensor_fails_with_zero_size_vocabulary(self):
+    vocabulary_file = constant_op.constant(
+        self._createVocabFile("zero_vocab_tensor.txt"))
+    self.assertRaisesRegexp(
+        ValueError,
+        "vocab_size must be greater than 0, got 0. "
+        "vocabulary_file: .*zero_vocab_tensor.txt",
+        lookup_ops.index_table_from_file,
+        vocabulary_file=vocabulary_file,
+        vocab_size=0)
+
+  def test_index_table_from_file_with_vocab_size_too_small(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=2)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, -1, -1), self.evaluate(ids))
+      self.assertEqual(2, self.evaluate(table.size()))
+
+  def test_index_table_from_file_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.cached_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        table = lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(table.initializer)
+
+  def test_index_table_from_file_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
+
+    self.assertRaises(
+        ValueError,
+        lookup_ops.index_table_from_file,
+        vocabulary_file=vocabulary_file,
+        vocab_size=0)
+
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, -1), self.evaluate(ids))
+      self.assertEqual(3, self.evaluate(table.size()))
+
+  def test_index_table_from_file_with_invalid_hashers(self):
+    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file,
+            vocab_size=3,
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=3,
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+  def test_index_table_from_file_table_ref_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab9.txt")
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      self.assertIsNotNone(table.resource_handle)
+
+  def test_index_table_from_file_table_ref_without_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab10.txt")
+    with self.cached_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=0)
+      self.assertIsNotNone(table.resource_handle)
+
+
+class IndexTableFromTensor(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_index_table_from_tensor_with_tensor_init(self):
+    table = lookup_ops.index_table_from_tensor(
+        vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
+
+    if not context.executing_eagerly():
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(
+            table.lookup(constant_op.constant(("salad", "surgery", "tarkus"))))
+    else:
+      # Reinitializing a table in eager should work.
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
+    self.evaluate(lookup_ops.tables_initializer())
+    ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
+    self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_int32_index_table_from_tensor_with_tensor_init(self):
+    with self.cached_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_int64_index_table_from_tensor_with_tensor_init(self):
+    with self.cached_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
+
+  def test_index_table_from_tensor_with_default_value(self):
+    default_value = -42
+    with self.cached_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
+
+  def test_index_table_from_tensor_missing_vocabulary_list(self):
+    with self.cached_session():
+      with self.assertRaisesRegexp(ValueError,
+                                   "vocabulary_list must be specified"):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=None, num_oov_buckets=1)
+
+  def test_index_table_from_tensor_empty_vocabulary_list(self):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors_impl.OpError, "keys and values cannot be empty"):
+        _ = lookup_ops.index_table_from_tensor(
+            vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
+        self.evaluate(lookup_ops.tables_initializer())
+
+  def test_index_table_from_tensor_with_invalid_hashers(self):
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=["brain", "salad", "surgery"],
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+
+class IndexToStringTableFromFileTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def test_index_to_string_table(self):
+    vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
+    # vocabulary_file supports string and tensor
+    type_funcs = [str, constant_op.constant]
+    for type_func in type_funcs:
+      vocabulary_file = type_func(vocabulary_path)
+      with self.cached_session():
+        table = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file)
+        features = table.lookup(
+            constant_op.constant([0, 1, 2, 3], dtypes.int64))
+        if not context.executing_eagerly():
+          with self.assertRaises(errors_impl.OpError):
+            self.evaluate(features)
+        self.evaluate(lookup_ops.tables_initializer())
+        self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                            self.evaluate(features))
+
+  def test_index_to_string_table_from_multicolumn_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
+    with self.cached_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          value_column_index=0)
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
+
+  def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
+    with self.cached_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          value_column_index=0,
+          delimiter=" ")
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
+
+  def test_index_to_string_table_with_default_value(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.cached_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          self.evaluate(features))
+
+  def test_index_to_string_table_with_vocab_size_too_small(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.cached_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=2,
+          default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"salad", default_value, default_value),
+                          self.evaluate(features))
+
+  def test_index_to_string_table_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.cached_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        _ = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(lookup_ops.tables_initializer())
+
+  def test_index_to_string_table_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.cached_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
+
+
+class IndexToStringTableFromTensorTest(test.TestCase):
+
+  def test_index_to_string_table_from_tensor(self):
+    with self.cached_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
 
-      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+      indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      features = table.lookup(indices)
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
 
-      out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
 
-  @test_util.run_deprecated_v1
-  def testInt64ToIdTable(self):
-    vocab_file = self._createVocabFile(
-        "feat_to_id_3.txt", values=("42", "1", "-1000"))
+  def test_duplicate_entries(self):
     with self.cached_session():
-      default_value = -1
-      vocab_size = 3
-      init = lookup_ops.TextFileIdTableInitializer(
-          vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64)
-      self.assertTrue("feat_to_id_3.txt_3_-1_-2", init._shared_name)
-      table = lookup_ops.HashTable(init, default_value)
-      table.initializer.run()
+      vocabulary_list = constant_op.constant(["hello", "hello"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
+      indices = constant_op.constant([0, 1, 4], dtypes.int64)
+      features = table.lookup(indices)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
-      out = table.lookup(
-          constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
-      self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+  def test_index_to_string_with_default_value(self):
+    default_value = b"NONE"
+    with self.cached_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list, default_value=default_value)
+      indices = constant_op.constant([1, 2, 4], dtypes.int64)
+      features = table.lookup(indices)
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          self.evaluate(features))
 
 
 class IdTableWithHashBucketsTest(test.TestCase):
@@ -1231,7 +2157,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       vocab_size = 3
       oov_buckets = 1
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
                   vocab_file, vocab_size=vocab_size), default_value),
           oov_buckets)
@@ -1242,7 +2168,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(input_string)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt32IdTableWithHashBuckets(self):
@@ -1252,7 +2178,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       vocab_size = 3
       oov_buckets = 1
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
                   vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
               default_value),
@@ -1265,7 +2191,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(values)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt64IdTableWithHashBuckets(self):
@@ -1275,7 +2201,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       vocab_size = 3
       oov_buckets = 1
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
                   vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
               default_value), oov_buckets)
@@ -1286,7 +2212,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(values)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testStringIdTableWithOnlyHashBucket(self):
@@ -1308,7 +2234,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               4  # fingerprint("surgery") mod 5
           ],
           self.evaluate(out))
-      self.assertEquals(oov_buckets, table.size().eval())
+      self.assertEqual(oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt32IdTableWithOnlyHashBucket(self):
@@ -1331,7 +2257,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               2  # fingerprint("-1000") mod 5
           ],
           self.evaluate(out))
-      self.assertEquals(oov_buckets, table.size().eval())
+      self.assertEqual(oov_buckets, table.size().eval())
 
   def testFloat64IdTableWithOnlyHashBucket(self):
     with self.cached_session():
@@ -1353,7 +2279,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       vocab_size = 3
       oov_buckets = 3
 
-      vocab_table = lookup_ops.HashTable(
+      vocab_table = lookup_ops.StaticHashTable(
           lookup_ops.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size), default_value)
       table1 = lookup_ops.IdTableWithHashBuckets(
@@ -1379,8 +2305,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([5, 0, 1, 2, 5], out1)
       self.assertAllEqual([5, 0, 1, 2, 3], out2)
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
       test_util.assert_ops_in_graph({
           "table1_Lookup/hash_bucket": "StringToHashBucketFast",
           "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
@@ -1389,17 +2315,15 @@ class IdTableWithHashBucketsTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsInitializationAcrossSessions(self):
     vocab_file = self._createVocabFile("feat_to_id_5.txt")
-    shared_name = "across-sessions"
     with self.cached_session():
       default_value = -1
       vocab_size = 3
       oov_buckets = 1
       table1 = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       table1.initializer.run()
 
@@ -1409,7 +2333,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string_1)
 
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
 
     with self.cached_session():
       default_value = -1
@@ -1419,18 +2343,17 @@ class IdTableWithHashBucketsTest(test.TestCase):
       # Underlying lookup table already initialized in previous session.
       # No need to call table2.initializer.run()
       table2 = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
 
       out2 = table2.lookup(input_string_2)
 
       self.assertAllEqual([3, 1, 3], self.evaluate(out2))
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
 
   @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
@@ -1440,14 +2363,14 @@ class IdTableWithHashBucketsTest(test.TestCase):
       vocab_size = 3
       oov_buckets = 0
       table1 = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
                   vocab_file, vocab_size=vocab_size), default_value1),
           oov_buckets)
 
       default_value2 = -2
       table2 = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(
                   vocab_file, vocab_size=vocab_size), default_value2),
           oov_buckets)
@@ -1464,8 +2387,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([0, 1, 2, -1], out1)
       self.assertAllEqual([-2, 1, -2], out2)
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
 
   @test_util.run_deprecated_v1
   def testSparseTensor(self):
@@ -1480,7 +2403,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           constant_op.constant(input_shape, dtypes.int64))
 
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.TextFileIdTableInitializer(vocab_file, vocab_size=3),
               -1), 1)
       table.initializer.run()
@@ -1507,7 +2430,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           constant_op.constant(input_shape, dtypes.int64))
 
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.KeyValueTensorInitializer(
                   (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
           1,
@@ -1536,7 +2459,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           constant_op.constant(input_shape, dtypes.int64))
 
       table = lookup_ops.IdTableWithHashBuckets(
-          lookup_ops.HashTable(
+          lookup_ops.StaticHashTable(
               lookup_ops.KeyValueTensorInitializer(
                   (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
           1,
@@ -1560,7 +2483,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       default_value = -1
       vocab_size = 3
       oov_buckets = 1
-      lookup_table = lookup_ops.HashTable(
+      lookup_table = lookup_ops.StaticHashTable(
           lookup_ops.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size), default_value)
 
@@ -1602,5 +2525,607 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertIsNone(table.resource_handle)
 
 
+class MutableHashTableOpTest(test.TestCase):
+
+  def testMutableHashTable(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+      exported_keys, exported_values = table.export()
+
+      # exported data is in the order of the internal map, i.e. undefined
+      sorted_keys = np.sort(self.evaluate(exported_keys))
+      sorted_values = np.sort(self.evaluate(exported_values))
+      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
+      self.assertAllEqual([0, 1, 2], sorted_values)
+
+  @test_util.run_v1_only("SaverV1")
+  def testSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      v0 = variables.Variable(10.0, name="v0")
+      v1 = variables.Variable(20.0, name="v1")
+
+      default_val = -1
+      keys = constant_op.constant(["b", "c", "d"], dtypes.string)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(
+          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+
+      save = saver.Saver()
+      self.evaluate(variables.global_variables_initializer())
+
+      # Check that the parameter nodes have been initialized.
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+
+      self.assertAllEqual(0, self.evaluate(table.size()))
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      val = save.save(sess, save_path)
+      self.assertIsInstance(val, six.string_types)
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      v0 = variables.Variable(-1.0, name="v0")
+      v1 = variables.Variable(-1.0, name="v1")
+      default_val = -1
+      table = lookup_ops.MutableHashTable(
+          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+      self.evaluate(
+          table.insert(
+              constant_op.constant(["a", "c"], dtypes.string),
+              constant_op.constant([12, 24], dtypes.int64)))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+      # Check that the parameter nodes have been restored.
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["a", "b", "c", "d", "e"],
+                                          dtypes.string)
+      output = table.lookup(input_string)
+      self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    v0 = variables.Variable(10.0, name="v0")
+    v1 = variables.Variable(20.0, name="v1")
+
+    default_val = -1
+    keys = constant_op.constant(["b", "c", "d"], dtypes.string)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
+    self.evaluate([v0.initializer, v1.initializer])
+
+    # Check that the parameter nodes have been initialized.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(0, self.evaluate(table.size()))
+    self.evaluate(table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    save_path = checkpoint.save(save_prefix)
+    del table, checkpoint, v0, v1
+
+    v0 = variables.Variable(-1.0, name="v0")
+    v1 = variables.Variable(-1.0, name="v1")
+    default_val = -1
+    table = lookup_ops.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+    self.evaluate(
+        table.insert(
+            constant_op.constant(["a", "c"], dtypes.string),
+            constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(table.size()))
+
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
+
+    # Restore the saved values in the parameter nodes.
+    checkpoint.restore(save_path).run_restore_ops()
+    # Check that the parameter nodes have been restored.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    input_string = constant_op.constant(["a", "b", "c", "d", "e"],
+                                        dtypes.string)
+    output = table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_v1_only("Multiple sessions")
+  def testSharing(self):
+    # Start a server to store the table state
+    server = server_lib.Server({"local0": ["localhost:0"]},
+                               protocol="grpc",
+                               start=True)
+    # Create two sessions sharing the same state
+    session1 = session.Session(server.target)
+    session2 = session.Session(server.target)
+
+    table = lookup_ops.MutableHashTable(
+        dtypes.int64, dtypes.string, "-", name="t1")
+
+    # Populate the table in the first session
+    with session1:
+      self.assertAllEqual(0, table.size().eval())
+
+      keys = constant_op.constant([11, 12], dtypes.int64)
+      values = constant_op.constant(["a", "b"])
+      table.insert(keys, values).run()
+      self.assertAllEqual(2, table.size().eval())
+
+      output = table.lookup(constant_op.constant([11, 12, 13], dtypes.int64))
+      self.assertAllEqual([b"a", b"b", b"-"], output.eval())
+
+    # Verify that we can access the shared data from the second session
+    with session2:
+      self.assertAllEqual(2, table.size().eval())
+
+      output = table.lookup(constant_op.constant([10, 11, 12], dtypes.int64))
+      self.assertAllEqual([b"-", b"a", b"b"], output.eval())
+
+  def testMutableHashTableOfTensors(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3, 2], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result)
+
+      exported_keys, exported_values = table.export()
+      # exported data is in the order of the internal map, i.e. undefined
+      sorted_keys = np.sort(self.evaluate(exported_keys))
+      sorted_values = np.sort(self.evaluate(exported_values), axis=0)
+      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
+      sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]], axis=0)
+      self.assertAllEqual(sorted_expected_values, sorted_values)
+
+  def testMutableHashTableExportInsert(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.assertAllEqual(0, self.evaluate(table1.size()))
+      self.evaluate(table1.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      expected_output = [[0, 1], [2, 3], [-1, -1]]
+      output1 = table1.lookup(input_string)
+      self.assertAllEqual(expected_output, self.evaluate(output1))
+
+      exported_keys, exported_values = table1.export()
+      self.assertAllEqual(3, self.evaluate(exported_keys).size)
+      self.assertAllEqual(6, self.evaluate(exported_values).size)
+
+      # Populate a second table from the exported data
+      table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.assertAllEqual(0, self.evaluate(table2.size()))
+      self.evaluate(table2.insert(exported_keys, exported_values))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+
+      # Verify lookup result is still the same
+      output2 = table2.lookup(input_string)
+      self.assertAllEqual(expected_output, self.evaluate(output2))
+
+  def testMutableHashTableOfTensorsInvalidShape(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      # Shape [6] instead of [3, 2]
+      values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [2,3] instead of [3, 2]
+      values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [2, 2] instead of [3, 2]
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [3, 1] instead of [3, 2]
+      values = constant_op.constant([[0], [2], [4]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Valid Insert
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+  def testMutableHashTableInvalidDefaultValue(self):
+    with self.cached_session():
+      default_val = constant_op.constant([[-1, -1]], dtypes.int64)
+      with self.assertRaisesOpError("Default value must be a vector"):
+        table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                            default_val)
+        self.assertAllEqual(0, self.evaluate(table.size()))
+
+  def testMutableHashTableDuplicateInsert(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery", "brain"])
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([3, 1, -1], result)
+
+  def testMutableHashTableFindHighRank(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([[0, 1], [-1, -1]], result)
+
+  def testMutableHashTableInsertHighRank(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, 3, -1], result)
+
+  def testMutableHashTableRemoveHighRank(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["salad", "tarkus"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, -1, 3, -1], result)
+
+  def testMutableHashTableOfTensorsFindHighRank(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2, 3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)
+
+  def testMutableHashTableOfTensorsRemoveHighRank(self):
+    with self.test_session():
+      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([["brain", "tank"]])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["surgery", "tank"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2, 3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result)
+
+  def testMultipleMutableHashTables(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+
+      table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      table3 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.evaluate(table1.insert(keys, values))
+      self.evaluate(table2.insert(keys, values))
+      self.evaluate(table3.insert(keys, values))
+
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+      self.assertAllEqual(3, self.evaluate(table3.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testMutableHashTableWithTensorDefault(self):
+    with self.cached_session():
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testSignatureMismatch(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      # insert with keys of the wrong type
+      with self.assertRaises(ValueError):
+        self.evaluate(table.insert(constant_op.constant([4, 5, 6]), values))
+
+      # insert with values of the wrong type
+      with self.assertRaises(ValueError):
+        self.evaluate(table.insert(keys, constant_op.constant(["a", "b", "c"])))
+
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string_ref = variables.Variable("brain")
+      input_int64_ref = variables.Variable(-1, dtype=dtypes.int64)
+      self.evaluate(variables.global_variables_initializer())
+
+      # Ref types do not produce an insert signature mismatch.
+      self.evaluate(table.insert(input_string_ref, input_int64_ref))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      # Ref types do not produce a lookup signature mismatch.
+      self.assertEqual(-1, self.evaluate(table.lookup(input_string_ref)))
+
+      # lookup with keys of the wrong type
+      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
+      with self.assertRaises(ValueError):
+        self.evaluate(table.lookup(input_string))
+
+      # default value of the wrong type
+      with self.assertRaises(TypeError):
+        lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, "UNK")
+
+  def testMutableHashTableStringFloat(self):
+    with self.cached_session():
+      default_val = -1.5
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1.1, 2.2], dtypes.float32)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.float32,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllClose([0, 1.1, default_val], result)
+
+  def testMutableHashTableIntFloat(self):
+    with self.cached_session():
+      default_val = -1.0
+      keys = constant_op.constant([3, 7, 0], dtypes.int64)
+      values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32)
+      table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([7, 0, 11], dtypes.int64)
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllClose([-1.2, 9.9, default_val], result)
+
+  def testMutableHashTableInt64String(self):
+    with self.cached_session():
+      default_val = "n/a"
+      keys = constant_op.constant([0, 1, 2], dtypes.int64)
+      values = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.string,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([0, 1, 3], dtypes.int64)
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual((b"brain", b"salad", b"n/a"), result)
+
+
+class MutableHashTableBenchmark(test.Benchmark):
+
+  def _create_table(self):
+    return lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
+
+  def benchmark_single_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable(1.0)
+    insert = table.insert(0, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) == 1
+
+  def benchmark_many_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
+    value = variables.Variable(1.0)
+    insert = table.insert(c, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) >= 10000
+
+  def benchmark_single_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) == 32
+
+  def benchmark_many_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(32 * c + list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) >= 1000 * 32
+
+
+class DenseHashTableBenchmark(MutableHashTableBenchmark):
+
+  def _create_table(self):
+    return lookup_ops.DenseHashTable(
+        dtypes.int64,
+        dtypes.float32,
+        default_value=0.0,
+        empty_key=-1,
+        deleted_key=-2)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
index 06deb0e1c82175c33b028e017a5f54cc2549253b..1c0280c3ce6e60aeea9f1bd9542b3a69e75d70e4 100644
--- a/tensorflow/python/kernel_tests/lu_op_test.py
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -27,8 +27,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -66,64 +66,62 @@ class LuOpTest(test.TestCase):
 
   def _verifyLu(self, x, output_idx_type=dtypes.int64):
     # Verify that Px = LU.
-    with test_util.use_gpu():
-
-      lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
-
-      # Prepare the lower factor of shape num_rows x num_rows
-      lu_shape = np.array(lu.shape.as_list())
-      batch_shape = lu_shape[:-2]
-      num_rows = lu_shape[-2]
-      num_cols = lu_shape[-1]
-
-      lower = array_ops.matrix_band_part(lu, -1, 0)
-
-      if num_rows > num_cols:
-        eye = linalg_ops.eye(
-            num_rows, batch_shape=batch_shape, dtype=lower.dtype)
-        lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
-      elif num_rows < num_cols:
-        lower = lower[..., :num_rows]
-
-      # Fill the diagonal with ones.
-      ones_diag = array_ops.ones(
-          np.append(batch_shape, num_rows), dtype=lower.dtype)
-      lower = array_ops.matrix_set_diag(lower, ones_diag)
-
-      # Prepare the upper factor.
-      upper = array_ops.matrix_band_part(lu, 0, -1)
-
-      verification = math_ops.matmul(lower, upper)
-
-      # Permute the rows of product of the Cholesky factors.
-      if num_rows > 0:
-        # Reshape the product of the triangular factors and permutation indices
-        # to a single batch dimension. This makes it easy to apply
-        # invert_permutation and gather_nd ops.
-        perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
-        verification_reshaped = array_ops.reshape(verification,
-                                                  [-1, num_rows, num_cols])
-        # Invert the permutation in each batch.
-        inv_perm_reshaped = functional_ops.map_fn(array_ops.invert_permutation,
-                                                  perm_reshaped)
-        batch_size = perm_reshaped.shape.as_list()[0]
-        # Prepare the batch indices with the same shape as the permutation.
-        # The corresponding batch index is paired with each of the `num_rows`
-        # permutation indices.
-        batch_indices = math_ops.cast(
-            array_ops.broadcast_to(
-                math_ops.range(batch_size)[:, None], perm_reshaped.shape),
-            dtype=output_idx_type)
-        permuted_verification_reshaped = array_ops.gather_nd(
-            verification_reshaped,
-            array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
-
-        # Reshape the verification matrix back to the original shape.
-        verification = array_ops.reshape(permuted_verification_reshaped,
-                                         lu_shape)
-
-      self._verifyLuBase(x, lower, upper, perm, verification,
-                         output_idx_type)
+    lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
+
+    # Prepare the lower factor of shape num_rows x num_rows
+    lu_shape = np.array(lu.shape.as_list())
+    batch_shape = lu_shape[:-2]
+    num_rows = lu_shape[-2]
+    num_cols = lu_shape[-1]
+
+    lower = array_ops.matrix_band_part(lu, -1, 0)
+
+    if num_rows > num_cols:
+      eye = linalg_ops.eye(
+          num_rows, batch_shape=batch_shape, dtype=lower.dtype)
+      lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
+    elif num_rows < num_cols:
+      lower = lower[..., :num_rows]
+
+    # Fill the diagonal with ones.
+    ones_diag = array_ops.ones(
+        np.append(batch_shape, num_rows), dtype=lower.dtype)
+    lower = array_ops.matrix_set_diag(lower, ones_diag)
+
+    # Prepare the upper factor.
+    upper = array_ops.matrix_band_part(lu, 0, -1)
+
+    verification = math_ops.matmul(lower, upper)
+
+    # Permute the rows of product of the Cholesky factors.
+    if num_rows > 0:
+      # Reshape the product of the triangular factors and permutation indices
+      # to a single batch dimension. This makes it easy to apply
+      # invert_permutation and gather_nd ops.
+      perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
+      verification_reshaped = array_ops.reshape(verification,
+                                                [-1, num_rows, num_cols])
+      # Invert the permutation in each batch.
+      inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
+                                        perm_reshaped)
+      batch_size = perm_reshaped.shape.as_list()[0]
+      # Prepare the batch indices with the same shape as the permutation.
+      # The corresponding batch index is paired with each of the `num_rows`
+      # permutation indices.
+      batch_indices = math_ops.cast(
+          array_ops.broadcast_to(
+              math_ops.range(batch_size)[:, None], perm_reshaped.shape),
+          dtype=output_idx_type)
+      permuted_verification_reshaped = array_ops.gather_nd(
+          verification_reshaped,
+          array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
+
+      # Reshape the verification matrix back to the original shape.
+      verification = array_ops.reshape(permuted_verification_reshaped,
+                                       lu_shape)
+
+    self._verifyLuBase(x, lower, upper, perm, verification,
+                       output_idx_type)
 
   def testBasic(self):
     data = np.array([[4., -1., 2.], [-1., 6., 0], [10., 0., 5.]])
@@ -140,46 +138,44 @@ class LuOpTest(test.TestCase):
         self._verifyLu(complex_data, output_idx_type=output_idx_type)
 
   def testPivoting(self):
-    with test_util.use_gpu():
-      # This matrix triggers partial pivoting because the first diagonal entry
-      # is small.
-      data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
-      self._verifyLu(data.astype(np.float32))
-
-      for dtype in (np.float32, np.float64):
-        self._verifyLu(data.astype(dtype))
-        _, p = linalg_ops.lu(data)
-        p_val = self.evaluate([p])
-        # Make sure p_val is not the identity permutation.
-        self.assertNotAllClose(np.arange(3), p_val)
-
-      for dtype in (np.complex64, np.complex128):
-        complex_data = np.tril(1j * data, -1).astype(dtype)
-        complex_data += np.triu(-1j * data, 1).astype(dtype)
-        complex_data += data
-        self._verifyLu(complex_data)
-        _, p = linalg_ops.lu(data)
-        p_val = self.evaluate([p])
-        # Make sure p_val is not the identity permutation.
-        self.assertNotAllClose(np.arange(3), p_val)
+    # This matrix triggers partial pivoting because the first diagonal entry
+    # is small.
+    data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
+    self._verifyLu(data.astype(np.float32))
+
+    for dtype in (np.float32, np.float64):
+      self._verifyLu(data.astype(dtype))
+      _, p = linalg_ops.lu(data)
+      p_val = self.evaluate([p])
+      # Make sure p_val is not the identity permutation.
+      self.assertNotAllClose(np.arange(3), p_val)
+
+    for dtype in (np.complex64, np.complex128):
+      complex_data = np.tril(1j * data, -1).astype(dtype)
+      complex_data += np.triu(-1j * data, 1).astype(dtype)
+      complex_data += data
+      self._verifyLu(complex_data)
+      _, p = linalg_ops.lu(data)
+      p_val = self.evaluate([p])
+      # Make sure p_val is not the identity permutation.
+      self.assertNotAllClose(np.arange(3), p_val)
 
   def testInvalidMatrix(self):
     # LU factorization gives an error when the input is singular.
     # Note: A singular matrix may return without error but it won't be a valid
     # factorization.
-    with test_util.use_gpu():
-      for dtype in self.float_types:
-        with self.assertRaises(errors.InvalidArgumentError):
-          self.evaluate(
-              linalg_ops.lu(
-                  np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
-                           dtype=dtype)))
-        with self.assertRaises(errors.InvalidArgumentError):
-          self.evaluate(
-              linalg_ops.lu(
-                  np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
-                            [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
-                           dtype=dtype)))
+    for dtype in self.float_types:
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(
+            linalg_ops.lu(
+                np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
+                         dtype=dtype)))
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(
+            linalg_ops.lu(
+                np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
+                          [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
+                         dtype=dtype)))
 
   def testBatch(self):
     simple_array = np.array([[[1., -1.], [2., 5.]]])  # shape (1, 2, 2)
@@ -220,14 +216,13 @@ class LuOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
-    with test_util.use_gpu():
-      matrix1 = random_ops.random_normal([5, 5], seed=42)
-      matrix2 = random_ops.random_normal([5, 5], seed=42)
-      lu1, p1 = linalg_ops.lu(matrix1)
-      lu2, p2 = linalg_ops.lu(matrix2)
-      lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
-      self.assertAllEqual(lu1_val, lu2_val)
-      self.assertAllEqual(p1_val, p2_val)
+    matrix1 = random_ops.random_normal([5, 5], seed=42)
+    matrix2 = random_ops.random_normal([5, 5], seed=42)
+    lu1, p1 = linalg_ops.lu(matrix1)
+    lu2, p2 = linalg_ops.lu(matrix2)
+    lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
+    self.assertAllEqual(lu1_val, lu2_val)
+    self.assertAllEqual(p1_val, p2_val)
 
 
 class LuBenchmark(test.Benchmark):
diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b1d433c780a520fbb5a0168053f6708e74b95a
--- /dev/null
+++ b/tensorflow/python/kernel_tests/map_fn_test.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.kernels.functional_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+# pylint: disable=invalid-name
+def simple_scoped_fn(a, x):
+  """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
+  with variable_scope.variable_scope("body"):
+    # Dummy variable, just to check that scoping works as intended.
+    two = variable_scope.get_variable(
+        "two", [],
+        dtype=dtypes.int32,
+        initializer=init_ops.constant_initializer(2))
+    return math_ops.multiply(math_ops.add(a, x), two)
+
+
+@test_util.with_control_flow_v2
+class MapFnTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_Simple(self):
+    nums = [1, 2, 3, 4, 5, 6]
+    elems = constant_op.constant(nums, name="data")
+    r = map_fn.map_fn(
+        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), elems)
+    self.assertAllEqual(
+        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
+
+  def testMapDtypeEager(self):
+    with context.eager_mode():
+      dtype = map_fn.map_fn(lambda x: constant_op.constant(""),
+                            constant_op.constant([]),
+                            dtype=dtypes.string).dtype
+      self.assertEqual(dtype, dtypes.string)
+
+  def testMapSparseTensor(self):
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        map_fn.map_fn(
+            lambda x: x,
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1], [1, 0]],
+                values=constant_op.constant([0, 1, 2]),
+                dense_shape=[2, 2]))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      map_fn.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      map_fn.map_fn(lambda x: x, 1)
+
+  @test_util.run_deprecated_v1
+  def testMap_Scoped(self):
+    with self.cached_session() as sess:
+
+      def double_scoped(x):
+        """2x with a dummy 2 that is scoped."""
+        with variable_scope.variable_scope("body"):
+          # Dummy variable, just to check that scoping works as intended.
+          two = variable_scope.get_variable(
+              "two", [],
+              dtype=dtypes.int32,
+              initializer=init_ops.constant_initializer(2))
+          return math_ops.multiply(x, two)
+
+      with variable_scope.variable_scope("root") as varscope:
+        elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
+        doubles = np.array([2 * x for x in [1, 2, 3, 4, 5, 6]])
+
+        r = map_fn.map_fn(double_scoped, elems)
+        # Check that we have the one variable we asked for here.
+        self.assertEqual(len(variables.trainable_variables()), 1)
+        self.assertEqual(variables.trainable_variables()[0].name,
+                         "root/body/two:0")
+        sess.run([variables.global_variables_initializer()])
+        self.assertAllEqual(doubles, self.evaluate(r))
+
+        # Now let's reuse our single variable.
+        varscope.reuse_variables()
+        r = map_fn.map_fn(double_scoped, elems)
+        self.assertEqual(len(variables.trainable_variables()), 1)
+        self.assertAllEqual(doubles, self.evaluate(r))
+
+  @test_util.run_deprecated_v1
+  def testMap_Grad(self):
+    with self.cached_session():
+      param = constant_op.constant(2.0)
+      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
+      y = map_fn.map_fn(
+          lambda x: math_ops.multiply(math_ops.square(x), param), elems)
+      r = gradients_impl.gradients(y, param)[0]
+      self.assertAllEqual(91.0, self.evaluate(r))
+      r = gradients_impl.gradients(y, elems)[0]
+      self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0], self.evaluate(r))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_SimpleNotTensor(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), nums)
+    self.assertAllEqual(
+        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_SingleInputMultiOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: ((x + 3) * 2, -(x + 3) * 2),
+        nums,
+        dtype=(dtypes.int64, dtypes.int64))
+    self.assertEqual(2, len(r))
+    self.assertEqual((6,), r[0].get_shape())
+    self.assertEqual((6,), r[1].get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual((nums + 3) * 2, received[0])
+    self.assertAllEqual(-(nums + 3) * 2, received[1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiOutputMismatchedDtype(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    with self.assertRaisesRegexp(
+        TypeError, r"two structures don't have the same nested structure"):
+      # lambda emits tuple, but dtype is a list
+      map_fn.map_fn(
+          lambda x: ((x + 3) * 2, -(x + 3) * 2),
+          nums,
+          dtype=[dtypes.int64, dtypes.int64])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiInputSingleOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: x[0] * x[1][0] + x[1][1], (nums, (nums, -nums)),
+        dtype=dtypes.int64)
+    self.assertEqual((6,), r.get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual(nums * nums + (-nums), received)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiInputSameStructureOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(lambda x: (x[1][0], (x[1][1], x[0])),
+                      (nums, (2 * nums, -nums)))
+    r = [r[0], r[1][0], r[1][1]]
+    self.assertEqual((6,), r[0].get_shape())
+    self.assertEqual((6,), r[1].get_shape())
+    self.assertEqual((6,), r[2].get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual(2 * nums, received[0])
+    self.assertAllEqual(-nums, received[1])
+    self.assertAllEqual(nums, received[2])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMapShape(self):
+    x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+    y = map_fn.map_fn(lambda e: e, x)
+    self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
+
+  @test_util.run_deprecated_v1
+  def testMapUnknownShape(self):
+    x = array_ops.placeholder(dtypes.float32)
+    y = map_fn.map_fn(lambda e: e, x)
+    self.assertIs(None, y.get_shape().dims)
+
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
+  # so the body of the while loop never executes
+  @test_util.run_v1_only("b/120545219")
+  def testMapEmptyScalar(self):
+    map_return = map_fn.map_fn(lambda x: 1,
+                               constant_op.constant([], dtype=dtypes.int32))
+    self.assertAllEqual([0], map_return.get_shape().dims)
+    self.assertAllEqual([0], self.evaluate(map_return).shape)
+
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
+  # so the body of the while loop never executes
+  @test_util.run_v1_only("b/120545219")
+  def testMapEmptyTensor(self):
+    with self.cached_session():
+      map_return = map_fn.map_fn(lambda x: array_ops.zeros([3, 2]),
+                                 constant_op.constant([]))
+      self.assertAllEqual([0, 3, 2], map_return.get_shape().dims)
+      self.assertAllEqual([0, 3, 2], self.evaluate(map_return).shape)
+
+
+if __name__ == "__main__":
+  test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index d31ecbcd3f1d57386fa629cd533f5f698176ca76..66125c17f2117af427eade069574dbd8f1e82ca4 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -51,7 +51,7 @@ def _AddTest(test, op_name, testcase_name, fn):
   test_name = "_".join(["test", op_name, testcase_name])
   if hasattr(test, test_name):
     raise RuntimeError("Test %s defined more than once" % test_name)
-  setattr(test, test_name, fn)
+  setattr(test, test_name, test_util.deprecated_graph_mode_only(fn))
 
 
 def _GetTransposedMatrices(x, x_name, kwargs):
@@ -127,7 +127,7 @@ def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     epsilon = np.finfo(a_np_.dtype).eps
     delta = epsilon**(1.0 / 3.0)
     tol = 20 * delta
-    with self.session(), test_util.use_gpu():
+    with self.session():
       theoretical, numerical = gradient_checker_v2.compute_gradient(
           lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
           [effective_a_np],
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 372b6dc17f4d080f3a59705611e05f0f0865c50d..705f25b4fcdc242f852b8134014d4374630133cd 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -89,6 +89,7 @@ class ExponentialOpTest(test.TestCase):
     # A multidimensional batch of 2x2 matrices
     self._verifyExponentialReal(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_deprecated_v1
   def testNonsymmetricComplex(self):
     matrix1 = np.array([[1., 2.], [3., 4.]])
     matrix2 = np.array([[1., 3.], [3., 5.]])
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 5cef4b79a32b85e3366ce018d1d8634867c20a75..60603f62112acb77f781287bcabd0c72048b01ec 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -101,12 +102,14 @@ class InverseOpTest(test.TestCase):
     # Complex batch
     self._verifyInverseComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.deprecated_graph_mode_only
   def testNonSquareMatrix(self):
     # When the inverse of a non-square matrix is attempted we should return
     # an error
     with self.assertRaises(ValueError):
       linalg_ops.matrix_inverse(np.array([[1., 2., 3.], [3., 4., 5.]]))
 
+  @test_util.deprecated_graph_mode_only
   def testWrongDimensions(self):
     # The input to the inverse should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.])
@@ -137,6 +140,7 @@ class InverseOpTest(test.TestCase):
               size=np.prod(shape)).reshape(shape).astype(dtype)
           self._verifyInverseReal(matrix)
 
+  @test_util.deprecated_graph_mode_only
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       all_ops = []
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 682ac12adc6acef378ccbb256066cbd2b099e1b9..82f249a6444d45026d1af94562b3a058ade97981 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -58,6 +58,7 @@ class LogarithmOpTest(test.TestCase):
     matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
     return matrix_batch
 
+  @test_util.run_v1_only("b/120545219")
   def testNonsymmetric(self):
     # 2x2 matrices
     matrix1 = np.array([[1., 2.], [3., 4.]])
@@ -71,6 +72,7 @@ class LogarithmOpTest(test.TestCase):
     # Complex batch
     self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_v1_only("b/120545219")
   def testSymmetricPositiveDefinite(self):
     # 2x2 matrices
     matrix1 = np.array([[2., 1.], [1., 2.]])
@@ -99,10 +101,12 @@ class LogarithmOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       gen_linalg_ops.matrix_logarithm(tensor3)
 
+  @test_util.run_v1_only("b/120545219")
   def testEmpty(self):
     self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64))
     self._verifyLogarithmComplex(np.empty([2, 0, 0], dtype=np.complex64))
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomSmallAndLargeComplex64(self):
     np.random.seed(42)
     for batch_dims in [(), (1,), (3,), (2, 2)]:
@@ -113,6 +117,7 @@ class LogarithmOpTest(test.TestCase):
             size=np.prod(shape)).reshape(shape).astype(np.complex64)
         self._verifyLogarithmComplex(matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomSmallAndLargeComplex128(self):
     np.random.seed(42)
     for batch_dims in [(), (1,), (3,), (2, 2)]:
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index 3edb390c724b6c71cd8849efc2b22a579e87247f..51a90e8f33795a9fc8a4544448883c3df48170ef 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -32,12 +32,12 @@ class SquareRootOpTest(test.TestCase):
 
   def _verifySquareRoot(self, matrix, np_type):
     matrix = matrix.astype(np_type)
-    with test_util.use_gpu():
-      # Verify that matmul(sqrtm(A), sqrtm(A)) = A
-      sqrt = gen_linalg_ops.matrix_square_root(matrix)
-      square = math_ops.matmul(sqrt, sqrt)
-      self.assertShapeEqual(matrix, square)
-      self.assertAllClose(matrix, square, rtol=1e-4, atol=1e-3)
+
+    # Verify that matmul(sqrtm(A), sqrtm(A)) = A
+    sqrt = gen_linalg_ops.matrix_square_root(matrix)
+    square = math_ops.matmul(sqrt, sqrt)
+    self.assertShapeEqual(matrix, square)
+    self.assertAllClose(matrix, square, rtol=1e-4, atol=1e-3)
 
   def _verifySquareRootReal(self, x):
     for np_type in [np.float32, np.float64]:
@@ -114,7 +114,7 @@ class SquareRootOpTest(test.TestCase):
       sqrt2 = gen_linalg_ops.matrix_square_root(square2)
       all_ops = [sqrt1, sqrt2]
       sqrt = self.evaluate(all_ops)
-      self.assertAllEqual(sqrt[0], sqrt[1])
+      self.assertAllClose(sqrt[0], sqrt[1])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index dde83f12f3cee1882d921be292f6a33b8c7f1b48..2d0427cad94e913ee5238f483538d0afe79f687c 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -167,6 +167,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         self._verifySolve(matrix, rhs, batch_dims=[2, 3])
 
+  @test_util.run_deprecated_v1
   def testNotInvertible(self):
     # The input should be invertible.
     # The matrix is singular because it has a zero on the diagonal.
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 380d2860da4771faf1c22fe870e38b8c13edd896..e5ae9574e38bf0e042f8fa2df494ad85613ea540 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -165,6 +165,7 @@ class DepthwiseConv2DTest(test.TestCase):
       self._VerifyValues(
           input_size, filter_size, stride, padding, use_gpu=False)
 
+  @test_util.run_deprecated_v1
   def testDepthwiseConv2DFormat(self):
     if not test.is_gpu_available():
       return
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index 20b9ad95c8be7aa59a2a1b70d59341e2f3ec8fa4..bfea21344548275f67059bdf958018d61261c580 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -89,7 +89,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
     if ((not is_matrix_norm and ord_ == "fro") or
         (is_matrix_norm and is_fancy_p_norm)):
       self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm")
-    if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2):
+    if ord_ == "euclidean" or (axis_ is None and len(shape) > 2):
       self.skipTest("Not supported by numpy.linalg.norm")
     matrix = np.random.randn(*shape_).astype(dtype_)
     if dtype_ in (np.complex64, np.complex128):
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 7b1b054ae0656ef8ae988c1a3220a2a643afbcab..6fb8a4b5d8678e54623d194ef97ae65f2e494b15 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -223,7 +223,7 @@ class PadOpTest(test.TestCase):
   def testIntTypes(self):
     # TODO(touts): Figure out why the padding tests do not work on GPU
     # for int types and rank > 2.
-    for t in [np.int8, np.int32, np.int64]:
+    for t in [np.int8, np.uint8, np.int32, np.int64]:
       self._testAll(
           np.random.randint(-100, 100, (4, 4, 3)).astype(t),
           [[1, 0], [2, 3], [0, 2]], 0)
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 367c94dd1e689ade820a96322c786d416bd2b1b1..78e786f01ca9c167b5b175fcd833a83281c078de 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -303,7 +303,6 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")  # Much larger error
   def testGradient1D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 27a71904340a1058bec5b9f993f78c5766345f01..aa207ebbdd11e4aa7ad2d5d8b296edbe92ded108 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
@@ -37,6 +40,15 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+def GetDeviceScope(self, use_gpu=False):
+  if context.executing_eagerly():
+    if use_gpu and test.is_gpu_available():
+      return ops.device("GPU:0")
+    return ops.device("CPU:0")
+  else:
+    return self.session(use_gpu=use_gpu)
+
+
 def GetTestConfigs(include_nchw_vect_c=False):
   """Get all the valid tests configs to run.
 
@@ -730,7 +742,7 @@ class PoolingTest(test.TestCase):
         t = nn_ops.max_pool(
             t, ksize=ksize, strides=strides, padding="SAME").eval()
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testDepthwiseMaxPoolInvalidConfigs(self):
     self._testDepthwiseMaxPoolInvalidConfig(
         [1, 2, 2, 4], [1, 2, 2, 2], [1, 1, 1, 2],
@@ -743,7 +755,7 @@ class PoolingTest(test.TestCase):
     if test.is_gpu_available():
       with self.session(use_gpu=True):
         t = variables.Variable(np.ones([1, 2, 2, 4]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         with self.assertRaisesOpError("for CPU devices"):
           nn_ops.max_pool(
               t, ksize=[1, 1, 1, 2], strides=[1, 1, 1, 2],
@@ -800,7 +812,7 @@ class PoolingTest(test.TestCase):
       # Generate numbers in a narrow range, so that there are many duplicates
       # in the input.
       tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
-      with self.cached_session(use_gpu=True):
+      with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
         argmax = self.evaluate(argmax_op)
@@ -824,62 +836,110 @@ class PoolingTest(test.TestCase):
           cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
-    tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
-    with self.session(use_gpu=True) as sess:
-      t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
-      out_op, argmax_op = nn_ops.max_pool_with_argmax(
-          t,
-          ksize=[1, 2, 2, 1],
-          strides=[1, 1, 1, 1],
-          Targmax=dtypes.int64,
-          padding="VALID")
-      out, argmax = self.evaluate([out_op, argmax_op])
-      self.assertShapeEqual(out, out_op)
-      self.assertShapeEqual(argmax, argmax_op)
-      self.assertAllClose(out.ravel(), [1.0, 1.0, 1.0, 1.0])
-      self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
+    tensor_input = [
+        1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0,
+        0.0, 1.0, 0.0, 1.0
+    ]
+
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(False, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(False, True, [0, 1, 3, 5, 9, 11, 15, 17]),
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
+
+    for config in configs:
+      with GetDeviceScope(self, use_gpu=config.use_gpu):
+        t = constant_op.constant(tensor_input, shape=[2, 3, 3, 1])
+        out_op, argmax_op = nn_ops.max_pool_with_argmax(
+            t,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 1, 1, 1],
+            Targmax=dtypes.int64,
+            padding="VALID",
+            include_batch_in_index=config.include_batch_in_index)
+        out, argmax = self.evaluate([out_op, argmax_op])
+        self.assertShapeEqual(out, out_op)
+        self.assertShapeEqual(argmax, argmax_op)
+        self.assertAllClose(out.ravel(),
+                            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
+        self.assertAllEqual(argmax.ravel(), config.argmax)
 
   def testMaxPoolingGradWithArgmax(self):
-    orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
-    tensor_input = [11.0, 12.0, 13.0, 14.0]
-    tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.session(use_gpu=True):
-      orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
-      t = constant_op.constant(tensor_input, shape=[1, 2, 2, 1])
-      argmax = constant_op.constant(
-          tensor_argmax, shape=[1, 2, 2, 1], dtype=dtypes.int64)
-      out_op = gen_nn_ops.max_pool_grad_with_argmax(
-          orig_in,
-          t,
-          argmax,
-          ksize=[1, 2, 2, 1],
-          strides=[1, 1, 1, 1],
-          padding="VALID")
-      out = self.evaluate(out_op).flatten()
-      self.assertAllClose(out,
-                          [11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0])
+    orig_input = [
+        1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0,
+        0.0, 1.0, 0.0, 1.0
+    ]
+    tensor_input = [11.0, 12.0, 13.0, 14.0, 21.0, 22.0, 23.0, 24.0]
+
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(False, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(False, True, [0, 1, 3, 5, 9, 11, 15, 17]),
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
+
+    for config in configs:
+      with GetDeviceScope(self, config.use_gpu):
+        orig_in = constant_op.constant(orig_input, shape=[2, 3, 3, 1])
+        t = constant_op.constant(tensor_input, shape=[2, 2, 2, 1])
+        argmax_t = constant_op.constant(
+            config.argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
+        out_op = gen_nn_ops.max_pool_grad_with_argmax(
+            orig_in,
+            t,
+            argmax_t,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 1, 1, 1],
+            padding="VALID",
+            include_batch_in_index=config.include_batch_in_index)
+        out = self.evaluate(out_op).flatten()
+        self.assertAllClose(out, [
+            11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0, 21.0, 0.0, 22.0,
+            0.0, 0.0, 0.0, 23.0, 0.0, 24.0
+        ])
 
   def testMaxPoolingGradGradWithArgmax(self):
     # MaxPoolWithArgMax is implemented only on CUDA.
     if not test.is_gpu_available(cuda_only=True):
       return
-    orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
-    tensor_input = [11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]
-    tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.session(use_gpu=True):
-      orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
-      t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
-      argmax = constant_op.constant(
-          tensor_argmax, shape=[1, 2, 2, 1], dtype=dtypes.int64)
-      out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
-          orig_in,
-          t,
-          argmax,
-          ksize=[1, 2, 2, 1],
-          strides=[1, 1, 1, 1],
-          padding="VALID")
-      out = self.evaluate(out_op).flatten()
-      self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
+    orig_input = [
+        1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0,
+        0.0, 1.0, 0.0, 1.0
+    ]
+    tensor_input = [
+        11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 21.0, 22.0, 23.0,
+        24.0, 25.0, 26.0, 27.0, 28.0, 29.0
+    ]
+
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
+
+    for config in configs:
+      with GetDeviceScope(self, config.use_gpu):
+        orig_in = constant_op.constant(orig_input, shape=[2, 3, 3, 1])
+        t = constant_op.constant(tensor_input, shape=[2, 3, 3, 1])
+        argmax_t = constant_op.constant(
+            config.argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
+        out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
+            orig_in,
+            t,
+            argmax_t,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 1, 1, 1],
+            padding="VALID",
+            include_batch_in_index=config.include_batch_in_index)
+        out = self.evaluate(out_op).flatten()
+        self.assertAllClose(out,
+                            [11.0, 12.0, 14.0, 16.0, 21.0, 23.0, 27.0, 29.0])
 
   def _ConstructAndTestGradient(self,
                                 pool_func,
@@ -1175,7 +1235,6 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1212,7 +1271,6 @@ class PoolingTest(test.TestCase):
                      [1, window_rows, window_cols, 1],
                      [1, row_stride, col_stride, 1], padding)
 
-  @test_util.disable_xla("This test never passed for XLA")
   def _testMaxPoolGradDirect(self, input_data, output_backprop,
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
@@ -1221,7 +1279,7 @@ class PoolingTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       input_tensor = variables.Variable(
           np.array(input_data, dtype=np.float32).reshape(input_sizes))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       output_tensor = pool_func(input_tensor, [1, window_rows, window_cols, 1],
                                 [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
@@ -1354,6 +1412,7 @@ class PoolingTest(test.TestCase):
             use_gpu=use_gpu,
             v2=v2)
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_1(self):
     input_data = [float("nan")] * 16
     output_backprop = [11.0, 12.0, 13.0, 15.0, 16.0, 17.0, 19.0, 20.0, 21.0]
@@ -1428,6 +1487,7 @@ class PoolingTest(test.TestCase):
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
@@ -1628,7 +1688,6 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
@@ -1663,7 +1722,6 @@ class PoolingTest(test.TestCase):
         [1, row_stride, col_stride, 1], padding)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1823,7 +1881,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
@@ -1899,17 +1957,9 @@ if __name__ == "__main__":
        padding_) in GetShrunkInceptionMaxPoolShapes():
     setattr(PoolingTest, "testMaxPoolFwd_" + name_,
             GetMaxPoolFwdTest(input_size_, filter_size_, stride_, padding_))
-    if name_ == "maxpool5":
-      setattr(
-          PoolingTest, "testMaxPoolGrad_" + name_,
-          test_util.disable_xla("maxpool5 fails while all others pass")(
-              GetMaxPoolGradTest(input_size_, filter_size_, output_size_,
-                                 stride_, padding_)))
-    else:
-      setattr(
-          PoolingTest, "testMaxPoolGrad_" + name_,
-          GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
-                             padding_))
+    setattr(PoolingTest, "testMaxPoolGrad_" + name_,
+            GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
+                               padding_))
     setattr(PoolingTest, "testMaxPoolGradGrad_" + name_,
             GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_,
                                    stride_, padding_))
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 482633d539dfb0d1b0737846ba44ff3e0826ad43..f2e28218ea3c99a06b2a55d26826d0ddf8423eb2 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -486,6 +486,21 @@ class PyFuncTest(test.TestCase):
     ret = self.evaluate(output)
     self.assertAllEqual(ret, [[3], [3], [3]])
 
+  @test_util.run_in_graph_and_eager_modes
+  def testRenamedDeviceInTestClusterCorrectlyIdentifiedAsLocalhost(self):
+    if context.executing_eagerly():
+      self.skipTest("b/126565353: We don't test eager's remote execution.")
+
+    workers, _ = test_util.create_local_cluster(num_workers=1, num_ps=0)
+    worker = workers[0]
+    session = session_lib.Session(worker.target)
+    with ops.device("/job:worker/task:0/cpu:0"):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
+    ret = session.run(output)
+    self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
+
   @test_util.run_in_graph_and_eager_modes
   def testEagerSingleOutputFloat32(self):
     with test_util.device(use_gpu=True):
@@ -519,6 +534,7 @@ class PyFuncTest(test.TestCase):
         self.assertIsNone(ret)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_xla("XLA cannot compile functions containing py_func")
   def testEagerPyFuncInDefun(self):
     with test_util.device(use_gpu=True):
       def wrapper():
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index 5adb95c7d60e88e43f6f171f6594c8542ef53143..f9b221a365821265dfccce63f2e018779a14eb5d 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -67,8 +67,8 @@ class QrOpTest(test.TestCase):
       val = self.evaluate(all_ops)
       for i in range(8):
         q = 4 * i
-        self.assertAllEqual(val[q], val[q + 2])  # q1 == q2
-        self.assertAllEqual(val[q + 1], val[q + 3])  # r1 == r2
+        self.assertAllClose(val[q], val[q + 2])  # q1 == q2
+        self.assertAllClose(val[q + 1], val[q + 3])  # r1 == r2
 
 
 def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 445aaf9dc48b06048d74c0989c744495de4ec5ee..8452982a447ff5eaa1b4eaa11c5d6f8cbd6a7e8c 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -14,6 +14,14 @@ load("//tensorflow:tensorflow.bzl", "sycl_py_test")
 # Please avoid the py_tests and cuda_py_tests (plural) while we
 # fix the shared/overbroad dependencies.
 
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//third_party/py/numpy",
+    ],
+)
+
 tf_py_test(
     name = "random_shuffle_queue_test",
     size = "small",
@@ -115,6 +123,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_gamma_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -151,6 +160,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_poisson_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index a5952a21968c79c8bfbcbfef2b09852f24f29923..5cc13f67777aef07ab40e8926effc3a2a0d6430b 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -27,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -69,16 +68,6 @@ class RandomGammaTest(test.TestCase):
       tf_logging.warn("Cannot test moments: %s" % e)
       return
 
-    # Check the given array of samples matches the given theoretical moment
-    # function at different orders. The test is considered passing if the
-    # z-tests of all statistical moments are all below z_limit.
-    # Parameters:
-    #   max_moments: the largest moments of the distribution to be tested
-    #   stride: the distance between samples to check for statistical properties
-    #       0 means the n-th moment of each sample
-    #       any other strides tests for spatial correlation between samples;
-    #   z_limit: the maximum z-test we would consider the test to pass;
-
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -94,46 +83,13 @@ class RandomGammaTest(test.TestCase):
           max_moment = min(6, scale // 2)
           sampler = self._Sampler(
               20000, alpha, 1 / scale, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.gamma(alpha, scale=scale)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            # Calculate moment variance safely:
-            # This is just
-            #  (moments_i_squared - moments_i_mean**2) / moments_sample_count[i]
-            normalized_moments_i_var = (
-                moments_i_mean / moments_sample_count[i] *
-                (moments_i_squared / moments_i_mean - moments_i_mean))
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * np.finfo(dt.as_numpy_dtype).eps
-            total_variance = (normalized_moments_i_var + error_per_moment)
-            tiny = np.finfo(dt.as_numpy_dtype).tiny
-            self.assertGreaterEqual(total_variance, 0)
-            if total_variance < tiny:
-              total_variance = tiny
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / math.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.gamma(alpha, scale=scale),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   def _testZeroDensity(self, alpha):
     """Zero isn't in the support of the gamma distribution.
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index aac6eeac06abca3148947901b92b43058fe76e3c..38fa44f37152bbc1cb720594d171142ec7af9007 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -79,7 +79,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = [2, 3]
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -89,7 +89,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = []
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -99,7 +99,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = array_ops.placeholder(dtypes.int32)
     alpha = array_ops.placeholder(dtypes.float32)
     beta = array_ops.placeholder(dtypes.float32)
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
 
     alpha_val = np.ones([1, 2])
@@ -129,7 +129,8 @@ class RandomGammaGradTest(test.TestCase):
 
       alpha_val = np.logspace(-2, 3, dtype=np_dtype)
       alpha = constant_op.constant(alpha_val)
-      sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+      sample = random_ops.random_gamma(
+          [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
       actual = gradients_impl.gradients(sample, alpha)[0]
 
       (sample_val, actual_val) = self.evaluate((sample, actual))
@@ -175,7 +176,8 @@ class RandomGammaGradTest(test.TestCase):
     """
     np_dtype = dtype.as_numpy_dtype
     alpha = constant_op.constant(np.logspace(-2, 3, dtype=np_dtype))
-    sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+    sample = random_ops.random_gamma(
+        [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
     actual = gradients_impl.gradients(sample, alpha)[0]
 
     sample_sg = array_ops.stop_gradient(sample)
@@ -207,9 +209,9 @@ class RandomGammaGradTest(test.TestCase):
     Here we verify that the rhs is fairly close to one.
     The convergence speed is not great, so we use many samples and loose bounds.
     """
-    num_samples = 1000
+    num_samples = 10000
     alpha = constant_op.constant([0.8, 1e1, 1e3], dtype=dtypes.float32)
-    sample = random_ops.random_gamma([num_samples], alpha)
+    sample = random_ops.random_gamma([num_samples], alpha, seed=12345)
     # We need to average the gradients, which is equivalent to averaging the
     # samples and then doing backprop.
     mean_sample = math_ops.reduce_mean(sample, axis=0)
@@ -234,13 +236,13 @@ class RandomGammaGradTest(test.TestCase):
     We compare the Monte-Carlo estimate of the expectation with the
     true gradient.
     """
-    num_samples = 1000
+    num_samples = 10000
     t = 0.3
     alpha = 0.5
     expected = 1 + 2 * alpha - 2 * t
 
     alpha = constant_op.constant(alpha)
-    sample = random_ops.random_gamma([num_samples], alpha, 1.0)
+    sample = random_ops.random_gamma([num_samples], alpha, 1.0, seed=12345)
     loss = math_ops.reduce_mean(math_ops.square(sample - t))
     dloss_dalpha = gradients_impl.gradients(loss, alpha)[0]
     dloss_dalpha_val = self.evaluate(dloss_dalpha)
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 0a6b004d682e5d810a5a3e09ca6dce867e5f41f1..51dd4cb47ca8561dfd01e20031651047fb2b70b9 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -49,14 +50,13 @@ class RandomPoissonTest(test.TestCase):
 
     return func
 
-  # TODO(srvasude): Factor this out along with the corresponding moment testing
-  # method in random_gamma_test into a single library.
   def testMoments(self):
     try:
       from scipy import stats  # pylint: disable=g-import-not-at-top
     except ImportError as e:
       tf_logging.warn("Cannot test moments: %s", e)
       return
+
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -67,41 +67,13 @@ class RandomPoissonTest(test.TestCase):
         for lam in (3., 20):
           max_moment = 5
           sampler = self._Sampler(10000, lam, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.poisson(lam)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            moments_i_var = (
-                moments_i_squared - moments_i_mean * moments_i_mean)
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * 1e-6
-            total_variance = (
-                moments_i_var / moments_sample_count[i] + error_per_moment)
-            if not total_variance:
-              total_variance = 1e-10
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / np.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.poisson(lam),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
diff --git a/tensorflow/python/kernel_tests/random/util.py b/tensorflow/python/kernel_tests/random/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..67805c7f262480e18fd296e15fc4a436e70c0c58
--- /dev/null
+++ b/tensorflow/python/kernel_tests/random/util.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for testing random variables."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def test_moment_matching(
+    samples,
+    number_moments,
+    dist,
+    stride=0):
+  """Return z-test scores for sample moments to match analytic moments.
+
+  Given `samples`, check that the first sample `number_moments` match
+  the given  `dist` moments by doing a z-test.
+
+  Args:
+    samples: Samples from target distribution.
+    number_moments: Python `int` describing how many sample moments to check.
+    dist: SciPy distribution object that provides analytic moments.
+    stride: Distance between samples to check for statistical properties.
+      A stride of 0 means to use all samples, while other strides test for
+      spatial correlation.
+  Returns:
+    Array of z_test scores.
+  """
+
+  sample_moments = []
+  expected_moments = []
+  variance_sample_moments = []
+  x = samples.flat
+  for i in range(1, number_moments + 1):
+    strided_range = x[::(i - 1) * stride + 1]
+    sample_moments.append(np.mean(strided_range ** i))
+    expected_moments.append(dist.moment(i))
+    variance_sample_moments.append(
+        (dist.moment(2 * i) - dist.moment(i) ** 2) / len(strided_range))
+
+  z_test_scores = []
+  for i in range(1, number_moments + 1):
+    # Assume every operation has a small numerical error.
+    # It takes i multiplications to calculate one i-th moment.
+    total_variance = (
+        variance_sample_moments[i - 1] +
+        i * np.finfo(samples.dtype).eps)
+    tiny = np.finfo(samples.dtype).tiny
+    assert np.all(total_variance > 0)
+    if total_variance < tiny:
+      total_variance = tiny
+    # z_test is approximately a unit normal distribution.
+    z_test_scores.append(abs(
+        (sample_moments[i - 1] - expected_moments[i - 1]) / np.sqrt(
+            total_variance)))
+  return z_test_scores
+
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index ad8188b372fc5e4ac627098cbbfd8fac73359272..a37eca69add43d5df262c417f3c35406646bbbbe 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -125,6 +125,7 @@ class RecordInputOpTest(test.TestCase):
             self.assertTrue(r[0] not in epoch_set)
             epoch_set.add(r[0])
 
+  @test_util.run_deprecated_v1
   def testDoesNotDeadlock(self):
     # Iterate multiple times to cause deadlock if there is a chance it can occur
     for _ in range(30):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 67a89461f3a885056f47c62af40bf6cfccd60583..5ab8bc3a0089742cfad891e772bd3a4ee447a55e 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -104,7 +104,8 @@ class ReductionUnknownShape(test.TestCase):
       for dtype, reductions in [(dtypes.float32,
                                  (math_ops.reduce_sum, math_ops.reduce_mean,
                                   math_ops.reduce_prod, math_ops.reduce_max,
-                                  math_ops.reduce_min)),
+                                  math_ops.reduce_min,
+                                  math_ops.reduce_euclidean_norm)),
                                 (dtypes.bool, (math_ops.reduce_all,
                                                math_ops.reduce_any))]:
         for reduction in reductions:
@@ -487,6 +488,79 @@ class MeanReductionTest(BaseReductionTest):
         self.assertTrue(np.all(np.isnan(y)))
 
 
+class EuclideanNormReductionTest(BaseReductionTest):
+
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_euclidean_norm(x, reduction_axes, keepdims)
+
+  def _np_reduce(self, x, reduction_axes, keepdims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    if reduction_axes is None or reduction_axes != tuple():
+      np_fro = np.sqrt(
+          np.sum(x * np.conj(x), axis=reduction_axes, keepdims=keepdims))
+    else:
+      np_fro = x
+    if np.issubdtype(x.dtype, np.integer):
+      np_fro = np.floor(np_fro)
+    return np_fro
+
+  @test_util.run_deprecated_v1
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.cached_session(use_gpu=True):
+        v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = self.evaluate(v)
+      self.assertAllEqual(tf_v, 0)
+
+  @test_util.run_deprecated_v1
+  def testInfinity(self):
+    for dtype in [np.float32, np.float64]:
+      for special_value_x in [-np.inf, np.inf]:
+        for special_value_y in [-np.inf, np.inf]:
+          np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
+          self._compareAll(np_arr, None)
+
+  @test_util.run_deprecated_v1
+  def testInt32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
+
+    with self.session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_euclidean_norm(x, [0]).eval()
+        self.assertEqual(y.shape, (9938,))
+        self.assertAllEqual(y, np.zeros(9938))
+
+
 class ProdReductionTest(BaseReductionTest):
 
   def _tf_reduce(self, x, reduction_axes, keepdims):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
index 1e8524f72a9760af90695b3b24c6dda3e9ba8c4a..2d5cff383e46c3aac83eab6b830859a7614fd803 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test_big.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -46,6 +48,7 @@ class BigReductionTest(BaseReductionTest):
   def _tf_reduce_sum(self, x, reduction_axes, keepdims):
     return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
+  @test_util.run_deprecated_v1
   def testFloat32Sum(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -64,11 +67,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([], dtype=np.float32) * size_x * size_y
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_sum(arr, 1, False)
-          tf_col_sum = self._tf_reduce_sum(arr, 0, False)
-          tf_full_sum = self._tf_reduce_sum(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_sum(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_sum(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_sum(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -82,12 +87,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.float32)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_mean(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_mean(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_mean(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_mean(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y],
+                                                   {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testFloat32Max(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -107,11 +116,13 @@ class BigReductionTest(BaseReductionTest):
         full_max = np.max(col_max)
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_max = self._tf_reduce_max(arr, 1, False)
-          tf_col_max = self._tf_reduce_max(arr, 0, False)
-          tf_full_max = self._tf_reduce_max(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_max = self._tf_reduce_max(arr_placeholder, 1, False)
+          tf_col_max = self._tf_reduce_max(arr_placeholder, 0, False)
+          tf_full_max = self._tf_reduce_max(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_max, tf_col_max, tf_full_max])
+              [tf_row_max, tf_col_max, tf_full_max], {arr_placeholder: arr})
         self.assertAllClose(col_max, tf_out_col)
         self.assertAllClose(row_max, tf_out_row)
         self.assertAllClose(full_max, tf_out_full)
@@ -126,12 +137,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.max(arr, axis=(0, 2))
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_max(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_max(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_max(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_max(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testBooleanAll(self):
     # make sure we test all possible kernel invocations
     # test operation where T(0) is not the identity
@@ -150,11 +165,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([1], dtype=np.bool).reshape([])
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_all(arr, 1, False)
-          tf_col_sum = self._tf_reduce_all(arr, 0, False)
-          tf_full_sum = self._tf_reduce_all(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.bool,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_all(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_all(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_all(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -168,9 +185,12 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.bool)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_all(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_all(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.bool, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_all(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_all(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 3b8924904c5eb20670a2d61fe1f5a3af470eebde..2913d5b0a35ca6ef76e4ae0e4e1aeda73ad80a3e 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -86,7 +86,7 @@ class ReluTest(test.TestCase):
     self.assertAllClose(np_relu, tf_relu)
     self.assertShapeEqual(np_relu, tf_relu)
 
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testReluInt8x4BadShape(self):
     if not test.is_gpu_available(cuda_only=True):
       self.skipTest("No GPU available")
@@ -522,7 +522,7 @@ class SeluTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64]:
       self._testSelu(
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
-      # Force executed on CPU in case GPU kernels are avaiable.
+      # Force executed on CPU in case GPU kernels are available.
       with ops.device("/device:CPU:0"):
         self._testSelu(
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index d31652eb895e424281932fd424cb52bd234bc0f9..a6ce2cc90e19c2c0436179a2df30b3c085f2cfd3 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -22,13 +22,18 @@ import gc
 import os
 import pickle
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -49,7 +54,8 @@ from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
-class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
+class ResourceVariableOpsTest(test_util.TensorFlowTestCase,
+                              parameterized.TestCase):
 
   def tearDown(self):
     gc.collect()
@@ -741,7 +747,6 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla("This test never passed for XLA")
   def testDestroyResource(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
     self.evaluate(variables.global_variables_initializer())
@@ -1035,67 +1040,190 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           session.run(copied.initializer)
 
-
-class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
-
+  def create_variant_shape_and_type_data(self):
+    variant_shape_and_type_data = (
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData())
+    variant_shape_and_type_data.is_set = True
+    stored_shape = tensor_shape.TensorShape([None, 4]).as_proto()
+    stored_dtype = dtypes.float32.as_datatype_enum
+    # NOTE(ebrevdo): shape_and_type lacks append() in some versions of protobuf.
+    variant_shape_and_type_data.shape_and_type.extend([
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleShapeAndType(
+            shape=stored_shape, dtype=stored_dtype)])
+    return variant_shape_and_type_data
+
+  @def_function.function
+  def create_constant_variant(self, value):
+    value = constant_op.constant(
+        tensor_pb2.TensorProto(
+            dtype=dtypes.variant.as_datatype_enum,
+            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
+            variant_val=[
+                tensor_pb2.VariantTensorDataProto(
+                    # Match registration in variant_op_registry.cc
+                    type_name=b"int",
+                    metadata=np.array(value, dtype=np.int32).tobytes())
+            ]))
+    return value
+
+  # TODO(ebrevdo): Add run_in_graph_and_eager_modes once we can create
+  # EagerTensor constants with TensorProto inputs.
   @test_util.run_in_graph_and_eager_modes()
-  def test_dense_var_to_tensor_read_dtype_same_as_var_dtype(self):
-    # read_dtype is same as dtype
-    v = resource_variable_ops.ResourceVariable(1.0, dtype=dtypes.float32)
-    v = resource_variable_ops._MixedPrecisionVariable(v, dtypes.float32)
-    if not context.executing_eagerly():
-      v.initializer.run()
-
-    # dtype is not read_dtype, return NotImplemented
+  def testVariantInitializer(self):
+    variant_shape_and_type_data = self.create_variant_shape_and_type_data()
+    value = self.create_constant_variant(3)
+    initializer = array_ops.fill([3], value)
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        initializer, variant_shape_and_type_data,
+        graph_mode=not context.executing_eagerly())
+    v = resource_variable_ops.ResourceVariable(initializer)
+    read = array_ops.identity(v)
+    read_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(read))
     self.assertEqual(
-        NotImplemented, v._dense_var_to_tensor(dtype=dtypes.float16))
-    self.assertEqual(NotImplemented,
-                     v._dense_var_to_tensor(dtype=dtypes.float16, as_ref=True))
-
-    # as_ref is False
-    t = v._dense_var_to_tensor(as_ref=False)
-    self.assertTrue(isinstance(t, ops.Tensor))
-    self.assertEqual(t.dtype, dtypes.float32)
-    self.assertEqual(self.evaluate(t), 1.0)
-
-    t = v._dense_var_to_tensor(dtype=dtypes.float32, as_ref=False)
-    self.assertTrue(isinstance(t, ops.Tensor))
-    self.assertEqual(t.dtype, dtypes.float32)
-    self.assertEqual(self.evaluate(t), 1.0)
-
-    # as_ref is True
-    self.assertEqual(NotImplemented, v._dense_var_to_tensor(as_ref=True))
-    self.assertEqual(NotImplemented,
-                     v._dense_var_to_tensor(dtype=dtypes.float32, as_ref=True))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_dense_var_to_tensor_read_dtype_different_from_var_dtype(self):
-    # read_dtype is different from dtype
-    v = resource_variable_ops.ResourceVariable(1.0, dtype=dtypes.float32)
-    v = resource_variable_ops._MixedPrecisionVariable(v, dtypes.float16)
+        read_variant_shape_and_type, variant_shape_and_type_data)
+    gather = v.sparse_read([0])
+    gather_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(gather))
+    self.assertEqual(
+        gather_variant_shape_and_type, variant_shape_and_type_data)
+    # Make sure initializer runs.
     if not context.executing_eagerly():
-      v.initializer.run()
-
-    # as_ref is False
-    t = v._dense_var_to_tensor(as_ref=False)
-    self.assertTrue(isinstance(t, ops.Tensor))
-    self.assertEqual(t.dtype, dtypes.float16)
-    self.assertEqual(self.evaluate(t), 1.0)
-
-    t = v._dense_var_to_tensor(dtype=dtypes.float16, as_ref=False)
-    self.assertTrue(isinstance(t, ops.Tensor))
-    self.assertEqual(t.dtype, dtypes.float16)
-    self.assertEqual(self.evaluate(t), 1.0)
-
-    # as_ref is True
-    self.assertEqual(NotImplemented, v._dense_var_to_tensor(as_ref=True))
-    self.assertEqual(NotImplemented,
-                     v._dense_var_to_tensor(dtype=dtypes.float16, as_ref=True))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testDistributeStrategy(self):
-    v = resource_variable_ops.ResourceVariable(1, dtype=dtypes.int32)
-    self.assertIsNone(v._distribute_strategy)
+      self.evaluate(v.initializer)
+      self.evaluate(read.op)
+      self.evaluate(gather.op)
+
+  @parameterized.parameters([
+      # batch_dims=0 (equivalent to tf.gather)
+      dict(  # 2D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[2, 1], [0, 3]],
+          expected=[[8, 7], [6, 9]]),
+      dict(  # 3D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[9, 7], [8, 6]], [[6, 9], [8, 8]]]),
+      dict(  # 4D indices
+          batch_dims=0,
+          params=[8, 9],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[8, 9], [9, 8]], [[8, 8], [9, 9]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+
+      # batch_dims=indices.shape.ndims - 1 (equivalent to tf.batch_gather)
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+
+      # 0 < batch_dims < indices.shape.ndims - 1
+      dict(  # 3D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[13, 11], [12, 10]], [[20, 23], [22, 22]]]),
+      dict(  # 4D indices (1 batch dim)
+          batch_dims=1,
+          params=[[6, 7], [8, 9]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[6, 7], [7, 6]], [[6, 6], [7, 7]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+      dict(  # 4D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[2, 3], [4, 5]], [[6, 7], [8, 9]]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[2, 3], [3, 2]], [[4, 4], [5, 5]]],
+                    [[[7, 7], [6, 6]], [[8, 9], [9, 8]]]]),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testGatherWithBatchDims(self, params, indices, batch_dims, expected):
+    var = resource_variable_ops.ResourceVariable(params, name="var0")
+    with ops.control_dependencies([var.initializer]):
+      result = resource_variable_ops.resource_gather(
+          var.handle, indices, dtype=var.dtype, batch_dims=batch_dims)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=0,
+          output_shape=[2, 3, 8, 9, 10, 3, 4, 5, 6, 7]
+          # = indices.shape + params.shape[1:]
+      ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=1,
+          output_shape=[2, 3, 8, 9, 10, 4, 5, 6, 7]
+          # = params.shape[:1] + indices.shape[1:] + params.shape[2:]
+      ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          output_shape=[2, 3, 8, 9, 10, 5, 6, 7]
+          # = params.shape[:2] + indices.shape[2:] + params.shape[3:]
+      ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 4, 9, 10],
+          batch_dims=3,
+          output_shape=[2, 3, 4, 9, 10, 6, 7]
+          # = params.shape[:3] + indices.shape[3:] + params.shape[4:]
+      ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 4, 5, 10],
+          batch_dims=4,
+          output_shape=[2, 3, 4, 5, 10, 7]
+          # = params.shape[:4] + indices.shape[4:] + params.shape[5:]
+      ),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testGatherWithBatchDimsMatchesTensor(self, params_shape, indices_shape,
+                                           batch_dims, output_shape):
+    """Checks that gather with batch_dims returns the correct shape."""
+    # Generate a `params` tensor with the indicated shape.
+    params_size = np.prod(params_shape)
+    params = np.reshape(np.arange(params_size, dtype=np.int32), params_shape)
+
+    # Generate an `indices` tensor with the indicated shape, where each index
+    # is within the appropriate range.
+    indices_size = np.prod(indices_shape)
+    indices = np.reshape(np.arange(indices_size, dtype=np.int32), indices_shape)
+    indices = indices % params_shape[batch_dims]
+
+    var = resource_variable_ops.ResourceVariable(params, name="var0")
+    with ops.control_dependencies([var.initializer]):
+      expected = array_ops.gather(
+          var.read_value(), indices, batch_dims=batch_dims)
+      result = resource_variable_ops.resource_gather(
+          var.handle, indices, dtype=var.dtype, batch_dims=batch_dims)
+
+    self.assertAllEqual(output_shape, result.shape.as_list())
+    self.assertAllEqual(expected, result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/python/kernel_tests/rnn_cell_test.py
similarity index 67%
rename from tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
rename to tensorflow/python/kernel_tests/rnn_cell_test.py
index ef372b947cedf71e9d44423f10cc43375b467cd9..f838b1f92256bf0b307335470e7408a92c3e410e 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_cell_test.py
@@ -12,25 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for rnn module."""
+"""Tests for RNN cells."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import itertools
+import os
 
+from absl.testing import parameterized
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib import rnn as rnn_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers as keras_layers
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -38,16 +42,18 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
 
 
-class Plus1RNNCell(rnn_lib.RNNCell):
+class Plus1RNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
   @property
@@ -62,7 +68,7 @@ class Plus1RNNCell(rnn_lib.RNNCell):
     return (input_ + 1, state + 1)
 
 
-class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
+class DummyMultiDimensionalLSTM(rnn_cell.RNNCell):
   """LSTM Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input to this cell may have an arbitrary number of dimensions that follow
@@ -97,7 +103,7 @@ class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
     return (input_ + 1, (h + 1, c + 1))
 
 
-class NestedRNNCell(rnn_lib.RNNCell):
+class NestedRNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input, output and state of this cell is a tuple of two tensors.
@@ -161,18 +167,19 @@ class TestStateSaverWithCounters(TestStateSaver):
   inherits from the TestStateSaver and adds the counters for calls of functions.
   """
 
+  @test_util.run_v1_only("b/124229375")
   def __init__(self, batch_size, state_size):
     super(TestStateSaverWithCounters, self).__init__(batch_size, state_size)
     self._num_state_calls = variables_lib.VariableV1(0)
     self._num_save_state_calls = variables_lib.VariableV1(0)
 
   def state(self, name):
-    with ops_lib.control_dependencies(
+    with ops.control_dependencies(
         [state_ops.assign_add(self._num_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).state(name)
 
   def save_state(self, name, state):
-    with ops_lib.control_dependencies([state_ops.assign_add(
+    with ops.control_dependencies([state_ops.assign_add(
         self._num_save_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).save_state(name, state)
 
@@ -191,12 +198,14 @@ class RNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testInvalidSequenceLengthShape(self):
     cell = Plus1RNNCell()
     inputs = [array_ops.placeholder(dtypes.float32, shape=(3, 4))]
     with self.assertRaisesRegexp(ValueError, "must be a vector"):
       rnn.static_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=4)
 
+  @test_util.run_v1_only("b/124229375")
   def testRNN(self):
     cell = Plus1RNNCell()
     batch_size = 2
@@ -224,6 +233,7 @@ class RNNTest(test.TestCase):
                           max_length * np.ones(
                               (batch_size, input_size), dtype=np.float32))
 
+  @test_util.run_v1_only("b/124229375")
   def testDropout(self):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
@@ -260,6 +270,7 @@ class RNNTest(test.TestCase):
       for d_v in full_dropout_values[:-1]:  # Add 1.0 to dropped_out (all zeros)
         self.assertAllClose(d_v, np.ones_like(input_value))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicCalculation(self):
     cell = Plus1RNNCell()
     sequence_length = array_ops.placeholder(dtypes.int64)
@@ -310,7 +321,7 @@ class RNNTest(test.TestCase):
                                      1.0 * (2 + 1) * np.ones((input_size)))))
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -329,6 +340,7 @@ class RNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testScope(self):
 
     def factory(scope):
@@ -358,21 +370,22 @@ class LSTMTest(test.TestCase):
     lstm = rnn_cell.LSTMCell(10)
     input_tensor = array_ops.ones([10, 50])
     lstm.build(input_tensor.get_shape())
-    self.assertEqual(lstm._bias.dtype, dtypes.float32_ref)
+    self.assertEqual(lstm._bias.dtype.base_dtype, dtypes.float32)
 
     # Explicitly pass dtype in constructor
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       lstm = rnn_cell.LSTMCell(10, dtype=dtype)
       input_tensor = array_ops.ones([10, 50])
       lstm.build(input_tensor.get_shape())
-      self.assertEqual(lstm._bias.dtype, dtype._as_ref)
+      self.assertEqual(lstm._bias.dtype.base_dtype, dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -389,12 +402,13 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testCellClipping(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -419,12 +433,13 @@ class LSTMTest(test.TestCase):
       # if cell c is clipped to 0, tanh(c) = 0 => m==0
       self.assertAllEqual(value, np.zeros((batch_size, num_units)))
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingSimpleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, 2 * num_units)
@@ -452,12 +467,13 @@ class LSTMTest(test.TestCase):
           })
       self.assertAllEqual(last_state_value, saved_state_value)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, num_units)
@@ -486,12 +502,13 @@ class LSTMTest(test.TestCase):
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingNestedTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(
@@ -556,13 +573,14 @@ class LSTMTest(test.TestCase):
         self.assertAllEqual(last_states[i],
                             named_saved_states[flat_state_names[i]])
 
+  @test_util.run_v1_only("b/124229375")
   def testProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -588,7 +606,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -630,7 +648,7 @@ class LSTMTest(test.TestCase):
       self.assertEqual(len(outputs_notuple), len(inputs))
       self.assertEqual(len(outputs_tuple), len(inputs))
       self.assertTrue(isinstance(state_tuple, tuple))
-      self.assertTrue(isinstance(state_notuple, ops_lib.Tensor))
+      self.assertTrue(isinstance(state_notuple, ops.Tensor))
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
@@ -651,6 +669,7 @@ class LSTMTest(test.TestCase):
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
+  @test_util.run_v1_only("b/124229375")
   def testProjSharding(self):
     num_units = 3
     input_size = 5
@@ -659,7 +678,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
 
@@ -684,6 +703,7 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInput(self):
     num_units = 3
     input_size = 5
@@ -692,7 +712,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float64, shape=(None, input_size))
@@ -720,6 +740,7 @@ class LSTMTest(test.TestCase):
       values = sess.run(outputs, feed_dict={inputs[0]: input_value})
       self.assertEqual(values[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testShardNoShardEquivalentOutput(self):
     num_units = 3
     input_size = 5
@@ -728,7 +749,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
@@ -774,6 +795,7 @@ class LSTMTest(test.TestCase):
       for (s_noshard, s_shard) in zip(state_values_noshard, state_values_shard):
         self.assertAllClose(s_noshard, s_shard, atol=1e-3)
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInputWithDropoutAndDynamicCalculation(self):
     """Smoke test for using LSTM with doubles, dropout, dynamic calculation."""
 
@@ -784,7 +806,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       sequence_length = array_ops.placeholder(dtypes.int64)
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -828,13 +850,14 @@ class LSTMTest(test.TestCase):
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithReuse(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
@@ -878,13 +901,14 @@ class LSTMTest(test.TestCase):
         # Different weights used so outputs should be different.
         self.assertTrue(np.linalg.norm(o1 - o3) > 1e-6)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithDifferentNamescope(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
@@ -896,10 +920,10 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
 
-      with ops_lib.name_scope("scope0"):
+      with ops.name_scope("scope0"):
         with variable_scope.variable_scope("share_scope"):
           outputs0, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
-      with ops_lib.name_scope("scope1"):
+      with ops.name_scope("scope1"):
         with variable_scope.variable_scope("share_scope", reuse=True):
           outputs1, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
@@ -915,6 +939,7 @@ class LSTMTest(test.TestCase):
       for out0, out1 in zip(outputs0_values, outputs1_values):
         self.assertAllEqual(out0, out1)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicRNNAllowsUnknownTimeDimension(self):
     inputs = array_ops.placeholder(dtypes.float32, shape=[1, None, 20])
     cell = rnn_cell.GRUCell(30)
@@ -930,7 +955,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1006,7 +1031,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1117,7 +1142,7 @@ class LSTMTest(test.TestCase):
           state_is_tuple=False)
 
     ########### Step 1: Run static graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1154,8 +1179,8 @@ class LSTMTest(test.TestCase):
             for y in [outputs_static[0], outputs_static[-1], state_static]
         ])
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         # pylint: disable=bad-builtin
@@ -1177,7 +1202,7 @@ class LSTMTest(test.TestCase):
             static_individual_variable_gradients, feed_dict=feeds)
 
     ########## Step 2: Run dynamic graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1225,8 +1250,8 @@ class LSTMTest(test.TestCase):
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
@@ -1259,12 +1284,12 @@ class LSTMTest(test.TestCase):
 
     self.assertEqual(len(values_static), len(values_dynamic))
     for (value_static, value_dynamic) in zip(values_static, values_dynamic):
-      self.assertAllEqual(value_static, value_dynamic)
-    self.assertAllEqual(state_value_static, state_value_dynamic)
+      self.assertAllClose(value_static, value_dynamic)
+    self.assertAllClose(state_value_static, state_value_dynamic)
 
     if in_graph_mode:
 
-      self.assertAllEqual(static_grad_values, dynamic_grad_values)
+      self.assertAllClose(static_grad_values, dynamic_grad_values)
 
       self.assertEqual(
           len(static_individual_grad_values),
@@ -1276,14 +1301,14 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_grad_values, dynamic_individual_grad_values)):
         tf_logging.info("Comparing individual gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
         tf_logging.info(
             "Comparing individual variable gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
   @test_util.run_in_graph_and_eager_modes
   def testDynamicEquivalentToStaticRNN(self):
@@ -1337,7 +1362,7 @@ class BidirectionalRNNTest(test.TestCase):
     return input_value, inputs, outputs, state_fw, state_bw, sequence_length
 
   def _testBidirectionalRNN(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalRNN(use_shape, True))
       variables_lib.global_variables_initializer().run()
@@ -1358,33 +1383,33 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # First sequence in batch is length=2
       # Check that the time=0 forward output is equal to time=1 backward output
-      self.assertEqual(out[0][0][0], out[1][0][3])
-      self.assertEqual(out[0][0][1], out[1][0][4])
-      self.assertEqual(out[0][0][2], out[1][0][5])
+      self.assertAllClose(out[0][0][0], out[1][0][3])
+      self.assertAllClose(out[0][0][1], out[1][0][4])
+      self.assertAllClose(out[0][0][2], out[1][0][5])
       # Check that the time=1 forward output is equal to time=0 backward output
-      self.assertEqual(out[1][0][0], out[0][0][3])
-      self.assertEqual(out[1][0][1], out[0][0][4])
-      self.assertEqual(out[1][0][2], out[0][0][5])
+      self.assertAllClose(out[1][0][0], out[0][0][3])
+      self.assertAllClose(out[1][0][1], out[0][0][4])
+      self.assertAllClose(out[1][0][2], out[0][0][5])
 
       # Second sequence in batch is length=3
       # Check that the time=0 forward output is equal to time=2 backward output
-      self.assertEqual(out[0][1][0], out[2][1][3])
-      self.assertEqual(out[0][1][1], out[2][1][4])
-      self.assertEqual(out[0][1][2], out[2][1][5])
+      self.assertAllClose(out[0][1][0], out[2][1][3])
+      self.assertAllClose(out[0][1][1], out[2][1][4])
+      self.assertAllClose(out[0][1][2], out[2][1][5])
       # Check that the time=1 forward output is equal to time=1 backward output
-      self.assertEqual(out[1][1][0], out[1][1][3])
-      self.assertEqual(out[1][1][1], out[1][1][4])
-      self.assertEqual(out[1][1][2], out[1][1][5])
+      self.assertAllClose(out[1][1][0], out[1][1][3])
+      self.assertAllClose(out[1][1][1], out[1][1][4])
+      self.assertAllClose(out[1][1][2], out[1][1][5])
       # Check that the time=2 forward output is equal to time=0 backward output
-      self.assertEqual(out[2][1][0], out[0][1][3])
-      self.assertEqual(out[2][1][1], out[0][1][4])
-      self.assertEqual(out[2][1][2], out[0][1][5])
+      self.assertAllClose(out[2][1][0], out[0][1][3])
+      self.assertAllClose(out[2][1][1], out[0][1][4])
+      self.assertAllClose(out[2][1][2], out[0][1][5])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
   def _testBidirectionalRNNWithoutSequenceLength(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
@@ -1402,22 +1427,19 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # Both sequences in batch are length=8.  Check that the time=i
       # forward output is equal to time=8-1-i backward output
-      for i in xrange(8):
-        self.assertEqual(out[i][0][0], out[8 - 1 - i][0][3])
-        self.assertEqual(out[i][0][1], out[8 - 1 - i][0][4])
-        self.assertEqual(out[i][0][2], out[8 - 1 - i][0][5])
-      for i in xrange(8):
-        self.assertEqual(out[i][1][0], out[8 - 1 - i][1][3])
-        self.assertEqual(out[i][1][1], out[8 - 1 - i][1][4])
-        self.assertEqual(out[i][1][2], out[8 - 1 - i][1][5])
+      for i in range(8):
+        self.assertAllClose(out[i][0][0:3], out[8 - 1 - i][0][3:6])
+        self.assertAllClose(out[i][1][0:3], out[8 - 1 - i][1][3:6])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNN(self):
     self._testBidirectionalRNN(use_shape=False)
     self._testBidirectionalRNN(use_shape=True)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNWithoutSequenceLength(self):
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=False)
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=True)
@@ -1472,7 +1494,7 @@ class BidirectionalRNNTest(test.TestCase):
 
   def _testBidirectionalDynamicRNN(self, use_shape, use_state_tuple,
                                    use_time_major, use_sequence_length):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalDynamicRNN(
               use_shape, use_state_tuple, use_time_major, use_sequence_length))
@@ -1534,6 +1556,7 @@ class BidirectionalRNNTest(test.TestCase):
           self.assertAllEqual(out[t, :, 0:3], out[max_length - t - 1, :, 3:6])
         self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNN(self):
     # Generate 2^5 option values
     # from [True, True, True, True, True] to [False, False, False, False, False]
@@ -1549,7 +1572,7 @@ class BidirectionalRNNTest(test.TestCase):
     # REMARKS: factory(scope) is a function accepting a scope
     #          as an argument, such scope can be None, a string
     #          or a VariableScope instance.
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1568,6 +1591,7 @@ class BidirectionalRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNScope(self):
 
     def factory(scope):
@@ -1578,6 +1602,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._testScope(factory, use_outer_scope=False)
     self._testScope(factory, prefix=None, use_outer_scope=False)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNNScope(self):
 
     def get_factory(use_time_major):
@@ -1606,13 +1631,14 @@ class MultiDimensionalLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testMultiDimensionalLSTMAllRNNContainers(self):
     feature_dims = (3, 4, 5)
     input_size = feature_dims
     batch_size = 2
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
@@ -1717,13 +1743,14 @@ class NestedLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testNestedIOLSTMAllRNNContainers(self):
     input_size = 5
     batch_size = 2
     state_size = 6
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
           dtypes.float32, shape=(None, input_size)),
@@ -1868,7 +1895,7 @@ class StateSaverRNNTest(test.TestCase):
     batch_size = 2
     state_saver = TestStateSaver(batch_size, 2 * num_units)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           self._factory(scope=scope, state_saver=state_saver)
@@ -1900,6 +1927,7 @@ class StateSaverRNNTest(test.TestCase):
     have influence on number of calls to save_state and state methods of
     state_saver object (the number of calls should be same.)
     """
+    self.skipTest("b/124196246 Breakage for sess.run([out, ...]): 2 != 1")
 
     num_units = 3
     batch_size = 2
@@ -1935,6 +1963,7 @@ class GRUTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamic(self):
     time_steps = 8
     num_units = 3
@@ -1945,7 +1974,7 @@ class GRUTest(test.TestCase):
 
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       concat_inputs = array_ops.placeholder(
           dtypes.float32, shape=(time_steps, batch_size, input_size))
 
@@ -1967,7 +1996,7 @@ class GRUTest(test.TestCase):
       sess.run([outputs_dynamic, state_dynamic], feed_dict=feeds)
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1986,6 +2015,7 @@ class GRUTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicScope(self):
     time_steps = 8
     num_units = 3
@@ -2016,8 +2046,9 @@ class RawRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def _testRawRNN(self, max_time):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       batch_size = 16
       input_depth = 4
       num_units = 3
@@ -2115,6 +2146,7 @@ class RawRNNTest(test.TestCase):
         for i in range(1, len(gradients_val)):
           self.assertAllClose(gradients_dynamic_rnn_val[i], gradients_val[i])
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNZeroLength(self):
     # NOTE: Because with 0 time steps, raw_rnn does not have shape
     # information about the input, it is impossible to perform
@@ -2125,8 +2157,9 @@ class RawRNNTest(test.TestCase):
   def testRawRNN(self):
     self._testRawRNN(max_time=10)
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopState(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2161,8 +2194,9 @@ class RawRNNTest(test.TestCase):
       loop_state = r[-1]
       self.assertEqual([10], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopStateWithTensorArray(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 4
       batch_size = 16
       input_depth = 4
@@ -2204,8 +2238,9 @@ class RawRNNTest(test.TestCase):
       loop_state = loop_state.stack()
       self.assertAllEqual([1, 2, 2 + 2, 4 + 3, 7 + 4], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testEmitDifferentStructureThanCellOutput(self):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2253,7 +2288,7 @@ class RawRNNTest(test.TestCase):
           np.ones((max_time, batch_size, 1), np.int64), output_vals[1])
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -2272,6 +2307,7 @@ class RawRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNScope(self):
     max_time = 10
     batch_size = 16
@@ -2329,7 +2365,7 @@ class DeviceWrapperCell(rnn_cell.RNNCell):
 
   def __call__(self, input_, state, scope=None):
     if self._device is not None:
-      with ops_lib.device(self._device):
+      with ops.device(self._device):
         return self._cell(input_, state, scope=scope)
     else:
       return self._cell(input_, state, scope=scope)
@@ -2353,11 +2389,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
-      with ops_lib.device(input_device):
+      with ops.device(input_device):
         inputs = constant_op.constant(inputs)
 
     if rnn_device is not None:
-      with ops_lib.device(rnn_device):
+      with ops.device(rnn_device):
         outputs, _ = rnn.dynamic_rnn(
             gpu_cell,
             inputs,
@@ -2443,5 +2479,874 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     _assert_in("TensorArray", gpu_stats, cpu_stats)
 
 
+class RNNCellTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCellNotTrainable(self):
+    with self.cached_session() as sess:
+
+      def not_trainable_getter(getter, *args, **kwargs):
+        kwargs["trainable"] = False
+        return getter(*args, **kwargs)
+
+      with variable_scope.variable_scope(
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
+          custom_getter=not_trainable_getter):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertFalse(cell.trainable_variables)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.non_trainable_variables])
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test GRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.156736, 0.156736]])
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          m = array_ops.zeros([1, 8], dtype=dtype)
+          cell = rnn_cell_impl.MultiRNNCell(
+              [
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                  for _ in range(2)
+              ],
+              state_is_tuple=False)
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, out_m = cell(x, m)
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run([g, out_m], {
+              x: np.array([[1., 1.]]),
+              m: 0.1 * np.ones([1, 8])
+          })
+          self.assertEqual(len(res), 2)
+          variables = variables_lib.global_variables()
+          self.assertEqual(expected_variable_names, [v.name for v in variables])
+          # The numbers in results were not calculated, this is just a
+          # smoke test.
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          expected_mem = np.array(
+              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
+              dtype=np_dtype)
+          self.assertAllClose(res[1], expected_mem, 1e-2)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test BasicLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          m = array_ops.zeros([1, 4], dtype=dtype)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run(
+              [g, out_m], {
+                  x: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
+          self.assertEqual(len(res), 2)
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellDimension0Error(self):
+    """Tests that dimension 0 in both(x and m) shape must be equal."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size - 1, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size - 1, state_size])
+              })
+
+  def testBasicLSTMCellStateSizeError(self):
+    """Tests that state_size must be num_units * 2."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 3  # state_size must be num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size, state_size])
+              })
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellStateTupleType(self):
+    with self.cached_session():
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = (array_ops.zeros([1, 2]),) * 2
+        m1 = (array_ops.zeros([1, 2]),) * 2
+        cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
+            state_is_tuple=True)
+        self.assertTrue(isinstance(cell.state_size, tuple))
+        self.assertTrue(
+            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(
+            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in regular tuples
+        _, (out_m0, out_m1) = cell(x, (m0, m1))
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in LSTMStateTuples
+        variable_scope.get_variable_scope().reuse_variables()
+        zero_state = cell.zero_state(1, dtypes.float32)
+        self.assertTrue(isinstance(zero_state, tuple))
+        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
+        _, (out_m0, out_m1) = cell(x, zero_state)
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = array_ops.zeros([1, 4])
+        m1 = array_ops.zeros([1, 4])
+        cell = rnn_cell_impl.MultiRNNCell(
+            [
+                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                for _ in range(2)
+            ],
+            state_is_tuple=True)
+        g, (out_m0, out_m1) = cell(x, (m0, m1))
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x: np.array([[1., 1.]]),
+                m0: 0.1 * np.ones([1, 4]),
+                m1: 0.1 * np.ones([1, 4])
+            })
+        self.assertEqual(len(res), 3)
+        # The numbers in results were not calculated, this is just a smoke test.
+        # Note, however, these values should match the original
+        # version having state_is_tuple=False.
+        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
+        expected_mem0 = np.array(
+            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
+        expected_mem1 = np.array(
+            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
+        self.assertAllClose(res[1], expected_mem0)
+        self.assertAllClose(res[2], expected_mem1)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCell(self):
+    with self.cached_session() as sess:
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        output, state = cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [output, state], {
+                x: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m: 0.1 * np.ones((batch_size, state_size))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1].shape, (batch_size, state_size))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCellVariables(self):
+    with self.cached_session():
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        cell(x, m)  # Execute to create variables
+      variables = variables_lib.global_variables()
+      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
+      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
+      self.assertEquals(variables[2].op.name,
+                        "root/lstm_cell/projection/kernel")
+
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperCheckpointing(self):
+    for wrapper_type in [
+        rnn_cell_impl.DropoutWrapper,
+        rnn_cell_impl.ResidualWrapper,
+        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
+      cell = rnn_cell_impl.BasicRNNCell(1)
+      wrapper = wrapper_type(cell)
+      wrapper(array_ops.ones([1, 1]),
+              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
+      self.evaluate([v.initializer for v in cell.variables])
+      checkpoint = trackable_utils.Checkpoint(wrapper=wrapper)
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      self.evaluate(cell._bias.assign([40.]))
+      save_path = checkpoint.save(prefix)
+      self.evaluate(cell._bias.assign([0.]))
+      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+      self.assertAllEqual([40.], self.evaluate(cell._bias))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapper(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+    wrapper_object = wrapper_type(base_cell)
+    (name, dep), = wrapper_object._checkpoint_dependencies
+    wrapper_object.get_config()  # Should not throw an error
+    self.assertIs(dep, base_cell)
+    self.assertEqual("cell", name)
+
+    g_res, m_new_res = wrapper_object(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res[1], res[0] + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res[2], res[3])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapperWithSlice(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+
+    def residual_with_slice_fn(inp, out):
+      inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
+      return inp_sliced + out
+
+    g_res, m_new_res = wrapper_type(
+        base_cell, residual_with_slice_fn)(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate(
+        [g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res_m_new, res_m_new_res)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DeviceWrapper, rnn_cell_impl.DeviceWrapperV2])
+  def testDeviceWrapper(self, wrapper_type):
+    x = array_ops.zeros([1, 3])
+    m = array_ops.zeros([1, 3])
+    cell = rnn_cell_impl.GRUCell(3)
+    wrapped_cell = wrapper_type(cell, "/cpu:0")
+    (name, dep), = wrapped_cell._checkpoint_dependencies
+    wrapped_cell.get_config()  # Should not throw an error
+    self.assertIs(dep, cell)
+    self.assertEqual("cell", name)
+
+    outputs, _ = wrapped_cell(x, m)
+    self.assertIn("cpu:0", outputs.device.lower())
+
+  def _retrieve_cpu_gpu_stats(self, run_metadata):
+    cpu_stats = None
+    gpu_stats = None
+    step_stats = run_metadata.step_stats
+    for ds in step_stats.dev_stats:
+      if "cpu:0" in ds.device[-5:].lower():
+        cpu_stats = ds.node_stats
+      if "gpu:0" == ds.device[-5:].lower():
+        gpu_stats = ds.node_stats
+    return cpu_stats, gpu_stats
+
+  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
+    if not test.is_gpu_available():
+      # Can't perform this test w/o a GPU
+      return
+
+    gpu_dev = test.gpu_device_name()
+    with self.session(use_gpu=True) as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1, 3])
+        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
+        with ops.device("/cpu:0"):
+          outputs, _ = rnn.dynamic_rnn(
+              cell=cell, inputs=x, dtype=dtypes.float32)
+        run_metadata = config_pb2.RunMetadata()
+        opts = config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.FULL_TRACE)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
+
+      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
+      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
+      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 4])
+        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)
+        _, ml = multi_rnn_cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(ml, {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1, 0.1, 0.1]])
+        })
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
+        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
+        self.assertTrue(
+            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m_bad = array_ops.zeros([1, 4])
+        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
+
+        # Test incorrectness of state
+        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
+          rnn_cell_impl.MultiRNNCell(
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
+
+        _, ml = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            ml, {
+                x: np.array([[1., 1.]]),
+                m_good[0]: np.array([[0.1, 0.1]]),
+                m_good[1]: np.array([[0.1, 0.1]])
+            })
+
+        # The numbers in results were not calculated, this is just a
+        # smoke test.  However, these numbers should match those of
+        # the test testMultiRNNCell.
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+        self.assertAllClose(res[1], [[0.13248, 0.13248]])
+
+  @parameterized.parameters(
+      [[rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2],
+       [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2]])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperKerasStyle(self, wrapper, wrapper_v2):
+    """Tests if wrapper cell is instantiated in keras style scope."""
+    wrapped_cell_v2 = wrapper_v2(rnn_cell_impl.BasicRNNCell(1))
+    self.assertIsNone(getattr(wrapped_cell_v2, "_keras_style", None))
+
+    wrapped_cell = wrapper(rnn_cell_impl.BasicRNNCell(1))
+    self.assertFalse(wrapped_cell._keras_style)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2VariableNames(self, wrapper):
+    """Tests that variables names do not depend on wrapper in RNN layer."""
+
+    def _rnn_input(apply_wrapper, name):
+      """Creates a RNN layer with/without wrapper and returns built rnn cell."""
+      with base_layer.keras_style_scope():
+        base_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
+             for _ in range(2)])
+      if apply_wrapper:
+        rnn_cell = wrapper(base_cell)
+      else:
+        rnn_cell = base_cell
+      rnn_layer = keras_layers.RNN(rnn_cell, name=name)
+      inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+      _ = rnn_layer(inputs)
+      return base_cell._cells[0]
+
+    rnn_1 = _rnn_input(True, name="rnn_0")
+    rnn_2 = _rnn_input(False, name="rnn_1")
+
+    for i, cell in enumerate([rnn_1, rnn_2]):
+      var_prefix = "rnn_{}/cell_0/basic_rnn_cell/".format(i)
+      self.assertCountEqual([v.name for v in cell.weights],
+                            (var_prefix + "kernel:0", var_prefix + "bias:0"))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperWeights(self, wrapper):
+    """Tests that wrapper weights contain wrapped cells weights."""
+    base_cell = keras_layers.SimpleRNNCell(1, name="basic_rnn_cell")
+    rnn_cell = wrapper(base_cell)
+    rnn_layer = keras_layers.RNN(rnn_cell)
+    inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+    rnn_layer(inputs)
+
+    expected_weights = ["rnn/" + var for var in
+                        ("kernel:0", "recurrent_kernel:0", "bias:0")]
+    self.assertEqual(len(rnn_cell.weights), 3)
+    self.assertCountEqual([v.name for v in rnn_cell.weights], expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.trainable_variables],
+                          expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.non_trainable_variables],
+                          [])
+    self.assertCountEqual([v.name for v in rnn_cell.cell.weights],
+                          expected_weights)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Caller(self, wrapper):
+    """Tests that wrapper V2 is using the LayerRNNCell's caller."""
+
+    with base_layer.keras_style_scope():
+      base_cell = rnn_cell_impl.MultiRNNCell(
+          [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
+    rnn_cell = wrapper(base_cell)
+    inputs = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    state = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    _ = rnn_cell(inputs, [state, state])
+    weights = base_cell._cells[0].weights
+    self.assertLen(weights, expected_len=2)
+    self.assertTrue(all(["_wrapper" in v.name for v in weights]))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Build(self, wrapper):
+    cell = rnn_cell_impl.LSTMCell(10)
+    wrapper = wrapper(cell)
+    wrapper.build((1,))
+    self.assertTrue(cell.built)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DropoutWrapperTest(test.TestCase, parameterized.TestCase):
+
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          wrapper_type=None,
+                          scope="root",
+                          **kwargs):
+    if batch_size is None and time_steps is None:
+      # 2 time steps, batch size 1, depth 3
+      batch_size = 1
+      time_steps = 2
+      x = constant_op.constant(
+          [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
+      m = rnn_cell_impl.LSTMStateTuple(
+          *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)] * 2)
+    else:
+      x = constant_op.constant(
+          np.random.randn(time_steps, batch_size, 3).astype(np.float32))
+      m = rnn_cell_impl.LSTMStateTuple(*[
+          constant_op.
+          constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)] * 2)
+    outputs, final_state = rnn.dynamic_rnn(
+        cell=wrapper_type(
+            rnn_cell_impl.LSTMCell(
+                3, initializer=init_ops.constant_initializer(0.5)),
+            dtype=x.dtype, **kwargs),
+        time_major=True,
+        parallel_iterations=parallel_iterations,
+        inputs=x,
+        initial_state=m,
+        scope=scope)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([outputs, final_state])
+    self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
+    self.assertEqual(res[1].c.shape, (batch_size, 3))
+    self.assertEqual(res[1].h.shape, (batch_size, 3))
+    return res
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperProperties(self, wrapper_type):
+    cell = rnn_cell_impl.BasicRNNCell(10)
+    wrapper = wrapper_type(cell)
+    # Github issue 15810
+    self.assertEqual(wrapper.wrapped_cell, cell)
+    self.assertEqual(wrapper.state_size, 10)
+    self.assertEqual(wrapper.output_size, 10)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperZeroState(self, wrapper_type):
+    class _Cell(rnn_cell_impl.BasicRNNCell):
+
+      def zero_state(self, batch_size=None, dtype=None):
+        return "wrapped_cell_zero_state"
+    wrapper = wrapper_type(_Cell(10))
+    self.assertEqual(wrapper.zero_state(10, dtypes.float32),
+                     "wrapped_cell_zero_state")
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAllConstantInput(self, wrapper_type):
+    keep = array_ops.ones([])
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAll(self, wrapper_type):
+    keep = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperWithSeed(self, wrapper_type):
+    keep_some = 0.5
+    random_seed.set_random_seed(2)
+    ## Use parallel_iterations = 1 in both calls to
+    ## _testDropoutWrapper to ensure the (per-time step) dropout is
+    ## consistent across both calls.  Otherwise the seed may not end
+    ## up being munged consistently across both graphs.
+    res_standard_1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_1")
+    random_seed.set_random_seed(2)
+    res_standard_2 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_2")
+    self.assertAllClose(res_standard_1[0], res_standard_2[0])
+    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
+    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoOutput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(np.zeros(res[0].shape), res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    # Even though we dropout state, by default DropoutWrapper never
+    # drops out the memory ("c") term of an LSTMStateTuple.
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_none,
+        wrapper_type=wrapper_type)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    self.assertAllClose(true_full_output[0], res[0][0])
+    # Second output is modified by zero input state
+    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
+    # h state has been set to zero
+    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
+    # c state of an LSTMStateTuple is NEVER modified.
+    self.assertAllClose(true_c_state, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoInput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    # All outputs are different because inputs are zeroed out
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentOutput(self, wrapper_type):
+    keep_some = 0.8
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+    # Ensure the same dropout pattern for all time steps
+    output_mask = np.abs(res[0]) > 1e-6
+    for m in output_mask[1:]:
+      self.assertAllClose(output_mask[0], m)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutput(self, wrapper_type):
+    keep_some = 0.9
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+
+    # Smoke test for the state/input masks.
+    output_mask = np.abs(res[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res[1].c) > 1e-6
+    state_h_mask = np.abs(res[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(
+      self, wrapper_type):
+    keep_some = 0.9
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res0 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_0")
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_1")
+
+    output_mask = np.abs(res0[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res0[1].c) > 1e-6
+    state_h_mask = np.abs(res0[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+    # Ensure seeded calculation is identical.
+    self.assertAllClose(res0[0], res1[0])
+    self.assertAllClose(res0[1].c, res1[1].c)
+    self.assertAllClose(res0[1].h, res1[1].h)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 12b69da6c2e4806110b4af93042f94d5248d64e5..bdb01700582ec013b9e934a5ba127170e0f86bf3 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -25,7 +25,6 @@ import timeit
 import numpy as np
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
-from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
 from tensorflow.python.client import session
@@ -360,12 +359,6 @@ class RNNTest(test.TestCase):
     self._assert_cell_builds(rnn_cell_impl.GRUCell, f64, 5, 7, 3)
     self._assert_cell_builds(rnn_cell_impl.LSTMCell, f32, 5, 7, 3)
     self._assert_cell_builds(rnn_cell_impl.LSTMCell, f64, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndRNNCell, f32, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndRNNCell, f64, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndyGRUCell, f32, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndyGRUCell, f64, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f32, 5, 7, 3)
-    self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f64, 5, 7, 3)
 
   @test_util.run_deprecated_v1
   def testRNNWithKerasSimpleRNNCell(self):
@@ -655,13 +648,14 @@ class RNNTest(test.TestCase):
       save.restore(sess, save_path)
       self.assertAllEqual([10.] * 4, self.evaluate(lstm_cell._bias))
 
+  # TODO(scottzhu): Look into updating for V2 Intializers.
+  @test_util.run_deprecated_v1
   def testRNNCellSerialization(self):
     for cell in [
         rnn_cell_impl.LSTMCell(32, use_peepholes=True, cell_clip=True),
         rnn_cell_impl.BasicLSTMCell(32, dtype=dtypes.float32),
         rnn_cell_impl.BasicRNNCell(32, activation="relu", dtype=dtypes.float32),
-        rnn_cell_impl.GRUCell(
-            32, kernel_initializer="ones", dtype=dtypes.float32)
+        rnn_cell_impl.GRUCell(32, dtype=dtypes.float32)
     ]:
       with self.cached_session():
         x = keras.Input((None, 5))
@@ -724,12 +718,12 @@ class RNNTest(test.TestCase):
 def _static_vs_dynamic_rnn_benchmark_static(inputs_list_t, sequence_length):
   (_, input_size) = inputs_list_t[0].get_shape().as_list()
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-  cell = contrib_rnn.LSTMCell(
+  cell = rnn_cell_impl.LSTMCell(
       num_units=input_size,
       use_peepholes=True,
       initializer=initializer,
       state_is_tuple=False)
-  outputs, final_state = contrib_rnn.static_rnn(
+  outputs, final_state = rnn.static_rnn(
       cell,
       inputs_list_t,
       sequence_length=sequence_length,
@@ -746,7 +740,7 @@ def _static_vs_dynamic_rnn_benchmark_static(inputs_list_t, sequence_length):
 def _static_vs_dynamic_rnn_benchmark_dynamic(inputs_t, sequence_length):
   (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list()
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-  cell = contrib_rnn.LSTMCell(
+  cell = rnn_cell_impl.LSTMCell(
       num_units=input_size,
       use_peepholes=True,
       initializer=initializer,
@@ -857,12 +851,12 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
 def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length):
   (_, input_size) = inputs_list_t[0].get_shape().as_list()
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-  cell = contrib_rnn.LSTMCell(
+  cell = rnn_cell_impl.LSTMCell(
       num_units=input_size,
       use_peepholes=True,
       initializer=initializer,
       state_is_tuple=False)
-  outputs, final_state = contrib_rnn.static_rnn(
+  outputs, final_state = rnn.static_rnn(
       cell,
       inputs_list_t,
       sequence_length=sequence_length,
@@ -923,12 +917,12 @@ def _concat_state_vs_tuple_state_rnn_benchmark(inputs_list_t, sequence_length,
                                                state_is_tuple):
   (_, input_size) = inputs_list_t[0].get_shape().as_list()
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-  cell = contrib_rnn.LSTMCell(
+  cell = rnn_cell_impl.LSTMCell(
       num_units=input_size,
       use_peepholes=True,
       initializer=initializer,
       state_is_tuple=state_is_tuple)
-  outputs, final_state = contrib_rnn.static_rnn(
+  outputs, final_state = rnn.static_rnn(
       cell,
       inputs_list_t,
       sequence_length=sequence_length,
@@ -990,7 +984,7 @@ def concat_state_vs_tuple_state_rnn_benchmark(batch_size, max_time, num_units,
 def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length, swap_memory):
   (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list()
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-  cell = contrib_rnn.LSTMCell(
+  cell = rnn_cell_impl.LSTMCell(
       num_units=input_size,
       use_peepholes=True,
       initializer=initializer,
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 4e15894fb4aa8a90d8dd9914ba25dcfd27d5fe95..2a3021f982149f619eef2d32edd7929e0c8b7603 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -70,7 +70,6 @@ def handle_options(func, x, axis, exclusive, reverse):
   return x
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class CumsumTest(test.TestCase):
 
   valid_dtypes = [
@@ -135,6 +134,7 @@ class CumsumTest(test.TestCase):
         self._compareAll(x, axis)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123860949")  # The computation is constant folded
   def testLarge(self):
     for dtype in self.valid_dtypes:
       x = np.ones([1000000], dtype=dtype) / 1024
@@ -194,7 +194,6 @@ class CumsumTest(test.TestCase):
           self._compareGradient([5, 10], axis, exclusive, reverse)
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class CumprodTest(test.TestCase):
 
   valid_dtypes = [
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 88f7b27b77ee24c732b84a674587b63638b2c903..5bc301b61360584969e391b093a3f488dec06925 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -296,7 +296,7 @@ class StatefulScatterNdTest(test.TestCase):
                                     updates).get_shape().as_list(), shape)
 
   @test_util.run_v1_only("b/120545219")
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index ce7e0c04c861dcbeee85d496496b3e657b883e56..8844895fb295c55ed01718bd717743304e1e8d15 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -179,7 +179,7 @@ class ScatterTest(test.TestCase):
             np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
           np_scatter(new, indices, updates)
           # Scatter via tensorflow
-          ref = variables.VariableV1(old)
+          ref = variables.VariableV1(old, use_resource=False)
           ref.initializer.run()
           tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
@@ -327,7 +327,7 @@ class ScatterTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       if not test.is_gpu_available():
         with self.session(use_gpu=False):
-          ref = variables.VariableV1(params)
+          ref = variables.VariableV1(params, use_resource=False)
           ref.initializer.run()
 
           # Indices all in range, no problem.
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index c8e7c143ade2ca740833ea5f9bd18ab5c7b4a2e6..5bcce235547e8de499d8845623c2c15fd1b0dc26 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.core.framework import node_def_pb2
@@ -417,7 +418,7 @@ class ShapeOpsTest(test.TestCase):
       self.assertRaises(ValueError, array_ops.squeeze, a, [100])
 
 
-class TileTest(test.TestCase):
+class TileTest(test.TestCase, parameterized.TestCase):
 
   def testScalar(self):
     for use_gpu in False, True:
@@ -556,13 +557,15 @@ class TileTest(test.TestCase):
     for _ in range(5):
       self._RunAndVerifyResult(10, use_gpu=True)
 
+  @parameterized.parameters(dtypes.int32, dtypes.int64)
   @test_util.run_deprecated_v1
-  def testGradientSimpleReduction(self):
+  def testGradientSimpleReduction(self, multiples_dtype):
     with self.cached_session():
       inp = np.random.rand(4, 1).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.flatten()], shape=[4, 1], dtype=dtypes.float32)
-      tiled = array_ops.tile(a, [1, 4])
+      multiples = constant_op.constant([1, 4], dtype=multiples_dtype)
+      tiled = array_ops.tile(a, multiples)
       grad_shape = [4, 4]
       grad_inp = np.random.rand(*grad_shape).astype("f")
       grad_tensor = constant_op.constant(
@@ -660,11 +663,13 @@ class TileTest(test.TestCase):
       err = gradient_checker.compute_gradient_error(a, [4, 2], tiled, [4, 4])
     self.assertLess(err, 1e-3)
 
+  @parameterized.parameters(dtypes.int32, dtypes.int64)
   @test_util.run_deprecated_v1
-  def testGradientWithSparseGradWithRank1(self):
+  def testGradientWithSparseGradWithRank1(self, multiples_dtype):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0],
                                   dtype=dtypes.float32)
-    outputs = array_ops.gather(array_ops.tile(inputs, [3]),
+    multiples = constant_op.constant([3], dtype=dtypes.int64)
+    outputs = array_ops.gather(array_ops.tile(inputs, multiples),
                                [1, 5, 9, 3, 7, 2, 2, 2])
     with self.cached_session():
       error = gradient_checker.compute_gradient_error(
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 554bf38029473bb9ff204a09556a182b378dd549..4caecc85ca5a1ab72648e015666a5666cf5335ab 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -29,6 +29,7 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -45,7 +46,10 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
     ],
     shard_count = 4,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -73,6 +77,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:spectral_ops_test_util",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -130,7 +135,10 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
-    tags = ["nomac"],
+    tags = [
+        "no_rocm",
+        "nomac",
+    ],
     xla_enable_strict_auto_jit = True,
 )
 
diff --git a/tensorflow/python/kernel_tests/signal/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
index d98a568491fdb79b11f203863af746988eba10e0..c3a288730ac2572c991a78187d96442930d8913d 100644
--- a/tensorflow/python/kernel_tests/signal/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -110,13 +110,13 @@ class FFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFT(self, x, rank, fft_length=None, feed_dict=None):
     # fft_length unused for complex FFTs.
-    with self.cached_session(use_gpu=True):
-      return self._tfFFTForRank(rank)(x).eval(feed_dict=feed_dict)
+    with self.cached_session(use_gpu=True) as sess:
+      return sess.run(self._tfFFTForRank(rank)(x), feed_dict=feed_dict)
 
   def _tfIFFT(self, x, rank, fft_length=None, feed_dict=None):
     # fft_length unused for complex FFTs.
-    with self.cached_session(use_gpu=True):
-      return self._tfIFFTForRank(rank)(x).eval(feed_dict=feed_dict)
+    with self.cached_session(use_gpu=True) as sess:
+      return sess.run(self._tfIFFTForRank(rank)(x), feed_dict=feed_dict)
 
   def _npFFT(self, x, rank, fft_length=None):
     if rank == 1:
@@ -292,12 +292,14 @@ class RFFTOpsTest(BaseFFTOpsTest):
                                               use_placeholder)
 
   def _tfFFT(self, x, rank, fft_length=None, feed_dict=None):
-    with self.cached_session(use_gpu=True):
-      return self._tfFFTForRank(rank)(x, fft_length).eval(feed_dict=feed_dict)
+    with self.cached_session(use_gpu=True) as sess:
+      return sess.run(
+          self._tfFFTForRank(rank)(x, fft_length), feed_dict=feed_dict)
 
   def _tfIFFT(self, x, rank, fft_length=None, feed_dict=None):
-    with self.cached_session(use_gpu=True):
-      return self._tfIFFTForRank(rank)(x, fft_length).eval(feed_dict=feed_dict)
+    with self.cached_session(use_gpu=True) as sess:
+      return sess.run(
+          self._tfIFFTForRank(rank)(x, fft_length), feed_dict=feed_dict)
 
   def _npFFT(self, x, rank, fft_length=None):
     if rank == 1:
@@ -397,6 +399,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 rank, (size,) * rank,
                 use_placeholder=True)
 
+  @test_util.run_deprecated_v1
   def testFftLength(self):
     if test.is_gpu_available(cuda_only=True):
       with spectral_ops_test_util.fft_kernel_label_map():
@@ -465,7 +468,6 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -512,7 +514,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           x = np.zeros((5,) * rank).astype(np.float32)
           fft_length = [6] * rank
           with self.cached_session():
-            rfft_fn(x, fft_length).eval()
+            self.evaluate(rfft_fn(x, fft_length))
 
         with self.assertRaisesWithPredicateMatch(
             errors.InvalidArgumentError,
@@ -520,7 +522,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           x = np.zeros((3,) * rank).astype(np.complex64)
           fft_length = [6] * rank
           with self.cached_session():
-            irfft_fn(x, fft_length).eval()
+            self.evaluate(irfft_fn(x, fft_length))
 
   @test_util.run_deprecated_v1
   def testGrad_Simple(self):
diff --git a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
index e0ce06418a457eee9a45b172f9cc5887d1167153..a9bcbb8cd4b5445f9935d6640885b6dfab8dbfc1 100644
--- a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
@@ -232,7 +232,8 @@ class ReconstructionOpsTest(test.TestCase):
       # overlap, the gradient for this batch item will be 0-99 shaped as (10,
       # 10).
       reconstruction *= array_ops.stack(
-          [array_ops.zeros((100,)), math_ops.to_float(math_ops.range(100))])
+          [array_ops.zeros((100,)),
+           math_ops.cast(math_ops.range(100), dtypes.float32)])
       loss = math_ops.reduce_sum(reconstruction)
 
       # Verify that only the second batch item receives gradient.
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index a82492996a48448c3e5829ee6a8cede0bf20ad92..22a4d8b4d915d587144963ae196ae605b81fb773 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import tf_logging as logging
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
-    if dim is -1:
+    if dim == -1:
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index 8641156604c98e2737f8854db3a218905cfd9281..51a407bd33bc147f6d699f34b85dea331122a83c 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -104,11 +104,15 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
     with self.cached_session(use_gpu=True):
       # outputs = space_to_batch(inputs)
       x_tf = self.space_to_batch(
-          math_ops.to_float(inputs), paddings, block_size=block_size)
+          math_ops.cast(inputs, dtypes.float32),
+          paddings,
+          block_size=block_size)
       self.assertAllEqual(x_tf.eval(), outputs)
       # inputs = batch_to_space(outputs)
       x_tf = self.batch_to_space(
-          math_ops.to_float(outputs), paddings, block_size=block_size)
+          math_ops.cast(outputs, dtypes.float32),
+          paddings,
+          block_size=block_size)
       self.assertAllEqual(x_tf.eval(), inputs)
 
   def _testOne(self, inputs, block_size, outputs):
@@ -200,11 +204,11 @@ class SpaceToBatchNDTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(
-            math_ops.to_float(inputs), block_shape, paddings)
+            math_ops.cast(inputs, dtypes.float32), block_shape, paddings)
         self.assertAllEqual(x_tf.eval(), outputs)
         # inputs = batch_to_space(outputs)
         x_tf = array_ops.batch_to_space_nd(
-            math_ops.to_float(outputs), block_shape, paddings)
+            math_ops.cast(outputs, dtypes.float32), block_shape, paddings)
         self.assertAllEqual(x_tf.eval(), inputs)
 
   def _testDirect(self, input_shape, block_shape, paddings):
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index 7f3c381fa161bd59b1956e880d82e62d6b051b25..e96bc09f3652aaa4d41bddac6ad06daaff8bfbd6 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -285,7 +285,6 @@ class SpaceToDepthTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
-  @test_util.disable_xla("This test never passed for XLA")
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index 637cfaec9907a59f7559053792e513739aad293f..c8dc99c8ec00960775c83b9a312e37ad2935e78d 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -81,7 +81,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       self.assertEqual(tf_value_ans.get_shape()[1], np_ans.shape[1])
       self.assertEqual(tf_tensor_ans.get_shape()[1], np_ans.shape[1])
 
-      for out in (tf_value_ans.eval(), self.evaluate(tf_tensor_ans)):
+      for out in (self.evaluate(tf_value_ans), self.evaluate(tf_tensor_ans)):
         if x.dtype == np.float32:
           self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
         elif x.dtype == np.float64:
@@ -134,6 +134,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Dimensions must be equal"):
       sparse_ops.sparse_tensor_dense_matmul(x_st_shape_inconsistent, y)
 
+  @test_util.deprecated_graph_mode_only
   def testInvalidIndicesForSparseTensorDenseMatmul(self):
     # Note: use_gpu=False because nice errors are only returned from CPU kernel.
     with self.session(use_gpu=False):
@@ -147,23 +148,25 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       dense_t = np.matrix([[1] * 5, [2] * 5], dtype=np.float32)
       with self.assertRaisesOpError(
           "k .10. from index.0,1. out of bounds .>=2."):
-        sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t).eval()
+        self.evaluate(sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t))
       dense_t = np.matrix([[1] * 500, [2] * 500], dtype=np.float32)
       with self.assertRaisesOpError(
           "k .10. from index.0,1. out of bounds .>=2."):
-        sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t).eval()
+        self.evaluate(sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t))
 
       # Repeat with adjoint_a, to get a different error.
       dense_t = np.matrix([[1] * 5, [2] * 5, [3] * 5], dtype=np.float32)
       with self.assertRaisesOpError(
           "m .10. from index.0,1. out of bounds .>=2."):
-        sparse_ops.sparse_tensor_dense_matmul(
-            sparse_t, dense_t, adjoint_a=True).eval()
+        self.evaluate(
+            sparse_ops.sparse_tensor_dense_matmul(
+                sparse_t, dense_t, adjoint_a=True))
       dense_t = np.matrix([[1] * 500, [2] * 500, [3] * 500], dtype=np.float32)
       with self.assertRaisesOpError(
           "m .10. from index.0,1. out of bounds .>=2."):
-        sparse_ops.sparse_tensor_dense_matmul(
-            sparse_t, dense_t, adjoint_a=True).eval()
+        self.evaluate(
+            sparse_ops.sparse_tensor_dense_matmul(
+                sparse_t, dense_t, adjoint_a=True))
 
   def testInvalidIndicesForSparseTensorDenseMatmulOnGPU(self):
     # Note: use_gpu=False because nice errors are only returned from CPU kerne
@@ -181,13 +184,13 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       expected_t = np.array([[0] * 5, [np.nan] * 5, [0] * 5], dtype=np.float32)
       self.assertAllClose(expected_t,
                           sparse_ops.sparse_tensor_dense_matmul(
-                              sparse_t, dense_t).eval())
+                              sparse_t, dense_t))
       dense_t = np.matrix([[1] * 500, [2] * 500], dtype=np.float32)
       expected_t = np.array(
           [[0] * 500, [np.nan] * 500, [0] * 500], dtype=np.float32)
       self.assertAllClose(expected_t,
                           sparse_ops.sparse_tensor_dense_matmul(
-                              sparse_t, dense_t).eval())
+                              sparse_t, dense_t))
 
       # Repeat with adjoint_a, now the error is that the sparse index
       # is OOO w.r.t. the output.  The GPU kernel can't do much here,
@@ -197,13 +200,13 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       expected_t = np.array([[0] * 5, [0] * 5], dtype=np.float32)
       self.assertAllClose(expected_t,
                           sparse_ops.sparse_tensor_dense_matmul(
-                              sparse_t, dense_t, adjoint_a=True).eval())
+                              sparse_t, dense_t, adjoint_a=True))
 
       dense_t = np.matrix([[1] * 500, [2] * 500, [3] * 500], dtype=np.float32)
       expected_t = np.array([[0] * 500, [0] * 500], dtype=np.float32)
       self.assertAllClose(expected_t,
                           sparse_ops.sparse_tensor_dense_matmul(
-                              sparse_t, dense_t, adjoint_a=True).eval())
+                              sparse_t, dense_t, adjoint_a=True))
 
   # Tests setting one dimension to be a high value.
   def _testLarge(self, np_dtype):
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 80004db833c50ef460ed16237f4f775eb80b6877..14c5b53de922684ac6b64d618ee666ef2bcfd015 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -59,17 +59,17 @@ class SplitOpTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(array_ops.split(model_input, [4]), {model_input: inp})
 
-    # test that we can pass a scalar Tensor as num_splits
+    # scalar Tensors are not permitted as num_splits
     for axis in [0, -2]:
       with self.cached_session(use_gpu=True) as sess:
-        result = sess.run(
-            array_ops.split(
-                array_ops.ones([4, 4]),
-                num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
-                axis=axis))
-
-      self.assertEqual(result[0].shape, (2, 4))
-      self.assertEqual(result[1].shape, (2, 4))
+        with self.assertRaises(ValueError):
+          # pylint: disable=expression-not-assigned
+          sess.run(
+              array_ops.split(
+                  array_ops.ones([4, 4]),
+                  num_or_size_splits=constant_op.constant(2),
+                  axis=axis))
+          # pylint: enable=expression-not-assigned
 
     # test that none split dimensions remain, even if we don't know how
     # the split_dim will be split, but we do know the axis
@@ -113,8 +113,8 @@ class SplitOpTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testListOfScalarTensors(self):
-    a = math_ops.to_int32(5)
-    b = math_ops.to_int32(6)
+    a = math_ops.cast(5, dtypes.int32)
+    b = math_ops.cast(6, dtypes.int32)
 
     value = np.random.rand(11, 11)
 
@@ -373,7 +373,7 @@ class SplitOpTest(test.TestCase):
     assert s1.shape.as_list() == [1]
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index ca3357a0ed8f87cfcccd08a62c5b8526a898b664..04d635cdb1e6f35db10193c74623b5aa1013ee9d 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -81,7 +81,7 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.bool, np.float32, np.int32, np.int64]:
+        for dtype in [np.bool, np.float32, np.int16, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
           # Stack back into a single tensorflow tensor directly using np array
           c = array_ops.stack(data)
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 83e06ba48bdbbe3189eafde7d0f42c2e4ced68ab..29cd00b78923cf7413114f858fe4c23a379a5af5 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -166,6 +166,7 @@ class StageTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testCapacity(self):
+    self.skipTest('b/123423516 this test is flaky on gpu.')
     capacity = 3
 
     with ops.Graph().as_default() as G:
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
index cd446eb40eb9ff1931a3eb4555f9dd81a77b659f..c03360042765afe39b0c88be799aa09571b5e81b 100644
--- a/tensorflow/python/kernel_tests/summary_ops_test.py
+++ b/tensorflow/python/kernel_tests/summary_ops_test.py
@@ -19,27 +19,44 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import unittest
 
+import six
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine.sequential import Sequential
+from tensorflow.python.keras.engine.training import Model
+from tensorflow.python.keras.layers.core import Activation
+from tensorflow.python.keras.layers.core import Dense
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
-class SummaryOpsTest(test_util.TensorFlowTestCase):
+class SummaryOpsCoreTest(test_util.TensorFlowTestCase):
 
   def testWrite(self):
     logdir = self.get_temp_dir()
     with context.eager_mode():
-      with summary_ops.create_file_writer(logdir).as_default():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
         output = summary_ops.write('tag', 42, step=12)
         self.assertTrue(output.numpy())
     events = events_from_logdir(logdir)
@@ -51,11 +68,12 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
 
   def testWrite_fromFunction(self):
     logdir = self.get_temp_dir()
-    @def_function.function
-    def f():
-      with summary_ops.create_file_writer(logdir).as_default():
-        return summary_ops.write('tag', 42, step=12)
     with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      @def_function.function
+      def f():
+        with writer.as_default():
+          return summary_ops.write('tag', 42, step=12)
       output = f()
       self.assertTrue(output.numpy())
     events = events_from_logdir(logdir)
@@ -70,7 +88,7 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
     metadata = summary_pb2.SummaryMetadata()
     metadata.plugin_data.plugin_name = 'foo'
     with context.eager_mode():
-      with summary_ops.create_file_writer(logdir).as_default():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
         summary_ops.write('obj', 0, 0, metadata=metadata)
         summary_ops.write('bytes', 0, 0, metadata=metadata.SerializeToString())
         m = constant_op.constant(metadata.SerializeToString())
@@ -91,7 +109,7 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
   def testWrite_ndarray(self):
     logdir = self.get_temp_dir()
     with context.eager_mode():
-      with summary_ops.create_file_writer(logdir).as_default():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
         summary_ops.write('tag', [[1, 2], [3, 4]], step=12)
     events = events_from_logdir(logdir)
     value = events[1].summary.value[0]
@@ -101,7 +119,7 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
     logdir = self.get_temp_dir()
     with context.eager_mode():
       t = constant_op.constant([[1, 2], [3, 4]])
-      with summary_ops.create_file_writer(logdir).as_default():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
         summary_ops.write('tag', t, step=12)
       expected = t.numpy()
     events = events_from_logdir(logdir)
@@ -110,11 +128,12 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
 
   def testWrite_tensor_fromFunction(self):
     logdir = self.get_temp_dir()
-    @def_function.function
-    def f(t):
-      with summary_ops.create_file_writer(logdir).as_default():
-        summary_ops.write('tag', t, step=12)
     with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      @def_function.function
+      def f(t):
+        with writer.as_default():
+          summary_ops.write('tag', t, step=12)
       t = constant_op.constant([[1, 2], [3, 4]])
       f(t)
       expected = t.numpy()
@@ -125,55 +144,321 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
   def testWrite_stringTensor(self):
     logdir = self.get_temp_dir()
     with context.eager_mode():
-      with summary_ops.create_file_writer(logdir).as_default():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
         summary_ops.write('tag', [b'foo', b'bar'], step=12)
     events = events_from_logdir(logdir)
     value = events[1].summary.value[0]
     self.assertAllEqual([b'foo', b'bar'], to_numpy(value))
 
+  @test_util.run_gpu_only
+  def testWrite_gpuDeviceContext(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        with ops.device('/GPU:0'):
+          value = constant_op.constant(42.0)
+          step = constant_op.constant(12, dtype=dtypes.int64)
+          summary_ops.write('tag', value, step=step).numpy()
+    empty_metadata = summary_pb2.SummaryMetadata()
+    events = events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(12, events[1].step)
+    self.assertEqual(42, to_numpy(events[1].summary.value[0]))
+    self.assertEqual(empty_metadata, events[1].summary.value[0].metadata)
+
   @test_util.also_run_as_tf_function
   def testWrite_noDefaultWriter(self):
-    with context.eager_mode():
-      self.assertFalse(summary_ops.write('tag', 42, step=0))
+    # Use assertAllEqual instead of assertFalse since it works in a defun.
+    self.assertAllEqual(False, summary_ops.write('tag', 42, step=0))
+
+  @test_util.also_run_as_tf_function
+  def testWrite_noStep_okayIfAlsoNoDefaultWriter(self):
+    # Use assertAllEqual instead of assertFalse since it works in a defun.
+    self.assertAllEqual(False, summary_ops.write('tag', 42))
+
+  @test_util.also_run_as_tf_function
+  def testWrite_noStep(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.create_file_writer(logdir).as_default():
+      with self.assertRaisesRegex(ValueError, 'No step set'):
+        summary_ops.write('tag', 42)
+
+  def testWrite_usingDefaultStep(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.eager_mode():
+        with summary_ops.create_file_writer(logdir).as_default():
+          summary_ops.set_step(1)
+          summary_ops.write('tag', 1.0)
+          summary_ops.set_step(2)
+          summary_ops.write('tag', 1.0)
+          mystep = variables.Variable(10, dtype=dtypes.int64)
+          summary_ops.set_step(mystep)
+          summary_ops.write('tag', 1.0)
+          mystep.assign_add(1)
+          summary_ops.write('tag', 1.0)
+      events = events_from_logdir(logdir)
+      self.assertEqual(5, len(events))
+      self.assertEqual(1, events[1].step)
+      self.assertEqual(2, events[2].step)
+      self.assertEqual(10, events[3].step)
+      self.assertEqual(11, events[4].step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  def testWrite_usingDefaultStepConstant_fromFunction(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.eager_mode():
+        writer = summary_ops.create_file_writer(logdir)
+        @def_function.function
+        def f():
+          with writer.as_default():
+            summary_ops.write('tag', 1.0)
+        summary_ops.set_step(1)
+        f()
+        summary_ops.set_step(2)
+        f()
+      events = events_from_logdir(logdir)
+      self.assertEqual(3, len(events))
+      self.assertEqual(1, events[1].step)
+      # The step value will still be 1 because the value was captured at the
+      # time the function was first traced.
+      self.assertEqual(1, events[2].step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  def testWrite_usingDefaultStepVariable_fromFunction(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.eager_mode():
+        writer = summary_ops.create_file_writer(logdir)
+        @def_function.function
+        def f():
+          with writer.as_default():
+            summary_ops.write('tag', 1.0)
+        mystep = variables.Variable(0, dtype=dtypes.int64)
+        summary_ops.set_step(mystep)
+        f()
+        mystep.assign_add(1)
+        f()
+        mystep.assign(10)
+        f()
+      events = events_from_logdir(logdir)
+      self.assertEqual(4, len(events))
+      self.assertEqual(0, events[1].step)
+      self.assertEqual(1, events[2].step)
+      self.assertEqual(10, events[3].step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  def testWrite_usingDefaultStepConstant_fromLegacyGraph(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.graph_mode():
+        writer = summary_ops.create_file_writer(logdir)
+        summary_ops.set_step(1)
+        with writer.as_default():
+          write_op = summary_ops.write('tag', 1.0)
+        summary_ops.set_step(2)
+        with self.cached_session() as sess:
+          sess.run(writer.init())
+          sess.run(write_op)
+          sess.run(write_op)
+          sess.run(writer.flush())
+      events = events_from_logdir(logdir)
+      self.assertEqual(3, len(events))
+      self.assertEqual(1, events[1].step)
+      # The step value will still be 1 because the value was captured at the
+      # time the graph was constructed.
+      self.assertEqual(1, events[2].step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  def testWrite_usingDefaultStepVariable_fromLegacyGraph(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.graph_mode():
+        writer = summary_ops.create_file_writer(logdir)
+        mystep = variables.Variable(0, dtype=dtypes.int64)
+        summary_ops.set_step(mystep)
+        with writer.as_default():
+          write_op = summary_ops.write('tag', 1.0)
+        first_assign_op = mystep.assign_add(1)
+        second_assign_op = mystep.assign(10)
+        with self.cached_session() as sess:
+          sess.run(writer.init())
+          sess.run(mystep.initializer)
+          sess.run(write_op)
+          sess.run(first_assign_op)
+          sess.run(write_op)
+          sess.run(second_assign_op)
+          sess.run(write_op)
+          sess.run(writer.flush())
+      events = events_from_logdir(logdir)
+      self.assertEqual(4, len(events))
+      self.assertEqual(0, events[1].step)
+      self.assertEqual(1, events[2].step)
+      self.assertEqual(10, events[3].step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
 
-  def testWrite_shouldRecordSummaries(self):
+  def testWrite_recordIf_constant(self):
     logdir = self.get_temp_dir()
     with context.eager_mode():
-      with summary_ops.create_file_writer(logdir).as_default():
-        self.assertTrue(summary_ops.write('default_on', 1, step=0))
-        with summary_ops.always_record_summaries():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
+        self.assertTrue(summary_ops.write('default', 1, step=0))
+        with summary_ops.record_if(True):
           self.assertTrue(summary_ops.write('set_on', 1, step=0))
-        with summary_ops.never_record_summaries():
+        with summary_ops.record_if(False):
           self.assertFalse(summary_ops.write('set_off', 1, step=0))
     events = events_from_logdir(logdir)
     self.assertEqual(3, len(events))
-    self.assertEqual('default_on', events[1].summary.value[0].tag)
+    self.assertEqual('default', events[1].summary.value[0].tag)
     self.assertEqual('set_on', events[2].summary.value[0].tag)
 
-  def testWrite_shouldRecordSummaries_fromFunction(self):
+  def testWrite_recordIf_constant_fromFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      @def_function.function
+      def f():
+        with writer.as_default():
+          # Use assertAllEqual instead of assertTrue since it works in a defun.
+          self.assertAllEqual(summary_ops.write('default', 1, step=0), True)
+          with summary_ops.record_if(True):
+            self.assertAllEqual(summary_ops.write('set_on', 1, step=0), True)
+          with summary_ops.record_if(False):
+            self.assertAllEqual(summary_ops.write('set_off', 1, step=0), False)
+      f()
+    events = events_from_logdir(logdir)
+    self.assertEqual(3, len(events))
+    self.assertEqual('default', events[1].summary.value[0].tag)
+    self.assertEqual('set_on', events[2].summary.value[0].tag)
+
+  def testWrite_recordIf_callable(self):
     logdir = self.get_temp_dir()
-    @def_function.function
-    def f(tag_prefix):
-      with summary_ops.create_file_writer(logdir).as_default():
-        default_output = summary_ops.write(tag_prefix + '_default', 1, step=0)
-        with summary_ops.always_record_summaries():
-          on_output = summary_ops.write(tag_prefix + '_on', 1, step=0)
-        with summary_ops.never_record_summaries():
-          off_output = summary_ops.write(tag_prefix + '_off', 1, step=0)
-        return [default_output, on_output, off_output]
     with context.eager_mode():
-      self.assertAllEqual([True, True, False], f('default'))
-      with summary_ops.always_record_summaries():
-        self.assertAllEqual([True, True, False], f('on'))
-      with summary_ops.never_record_summaries():
-        self.assertAllEqual([False, True, False], f('off'))
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      def record_fn():
+        step.assign_add(1)
+        return int(step % 2) == 0
+      with summary_ops.create_file_writer_v2(logdir).as_default():
+        with summary_ops.record_if(record_fn):
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
     events = events_from_logdir(logdir)
-    self.assertEqual(6, len(events))
-    self.assertEqual('default_default', events[1].summary.value[0].tag)
-    self.assertEqual('default_on', events[2].summary.value[0].tag)
-    self.assertEqual('on_default', events[3].summary.value[0].tag)
-    self.assertEqual('on_on', events[4].summary.value[0].tag)
-    self.assertEqual('off_on', events[5].summary.value[0].tag)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_callable_fromFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      @def_function.function
+      def record_fn():
+        step.assign_add(1)
+        return math_ops.equal(step % 2, 0)
+      @def_function.function
+      def f():
+        with writer.as_default():
+          with summary_ops.record_if(record_fn):
+            return [
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step)]
+      self.assertAllEqual(f(), [True, False, True])
+      self.assertAllEqual(f(), [False, True, False])
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_tensorInput_fromFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      @def_function.function(input_signature=[
+          tensor_spec.TensorSpec(shape=[], dtype=dtypes.int64)])
+      def f(step):
+        with writer.as_default():
+          with summary_ops.record_if(math_ops.equal(step % 2, 0)):
+            return summary_ops.write('tag', 1, step=step)
+      self.assertTrue(f(0))
+      self.assertFalse(f(1))
+      self.assertTrue(f(2))
+      self.assertFalse(f(3))
+      self.assertTrue(f(4))
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  @test_util.also_run_as_tf_function
+  def testGetSetStep(self):
+    try:
+      self.assertIsNone(summary_ops.get_step())
+      summary_ops.set_step(1)
+      # Use assertAllEqual instead of assertEqual since it works in a defun.
+      self.assertAllEqual(1, summary_ops.get_step())
+      summary_ops.set_step(constant_op.constant(2))
+      self.assertAllEqual(2, summary_ops.get_step())
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  def testGetSetStep_variable(self):
+    with context.eager_mode():
+      try:
+        mystep = variables.Variable(0)
+        summary_ops.set_step(mystep)
+        self.assertAllEqual(0, summary_ops.get_step().read_value())
+        mystep.assign_add(1)
+        self.assertAllEqual(1, summary_ops.get_step().read_value())
+        # Check that set_step() properly maintains reference to variable.
+        del mystep
+        self.assertAllEqual(1, summary_ops.get_step().read_value())
+        summary_ops.get_step().assign_add(1)
+        self.assertAllEqual(2, summary_ops.get_step().read_value())
+      finally:
+        # Reset to default state for other tests.
+        summary_ops.set_step(None)
+
+  def testGetSetStep_variable_fromFunction(self):
+    with context.eager_mode():
+      try:
+        @def_function.function
+        def set_step(step):
+          summary_ops.set_step(step)
+          return summary_ops.get_step()
+        @def_function.function
+        def get_and_increment():
+          summary_ops.get_step().assign_add(1)
+          return summary_ops.get_step()
+        mystep = variables.Variable(0)
+        self.assertAllEqual(0, set_step(mystep))
+        self.assertAllEqual(0, summary_ops.get_step().read_value())
+        self.assertAllEqual(1, get_and_increment())
+        self.assertAllEqual(2, get_and_increment())
+        # Check that set_step() properly maintains reference to variable.
+        del mystep
+        self.assertAllEqual(3, get_and_increment())
+      finally:
+        # Reset to default state for other tests.
+        summary_ops.set_step(None)
 
   @test_util.also_run_as_tf_function
   def testSummaryScope(self):
@@ -223,6 +508,601 @@ class SummaryOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual('with/slash', tag)
 
 
+class SummaryWriterTest(test_util.TensorFlowTestCase):
+
+  def testCreate_withInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(
+          logdir, max_queue=1000, flush_millis=1000000)
+      get_total = lambda: len(events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, get_total())
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, get_total())
+
+  def testCreate_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      # Returned SummaryWriter must be stored in a non-local variable so it
+      # lives throughout the function execution.
+      if not hasattr(f, 'writer'):
+        f.writer = summary_ops.create_file_writer_v2(logdir)
+    with context.eager_mode():
+      f()
+    event_files = gfile.Glob(os.path.join(logdir, '*'))
+    self.assertEqual(1, len(event_files))
+
+  def testCreate_graphTensorArgument_raisesError(self):
+    logdir = self.get_temp_dir()
+    with context.graph_mode():
+      logdir_tensor = constant_op.constant(logdir)
+    with context.eager_mode():
+      with self.assertRaisesRegex(
+          ValueError, 'Invalid graph Tensor argument.*logdir'):
+        summary_ops.create_file_writer_v2(logdir_tensor)
+    self.assertEmpty(gfile.Glob(os.path.join(logdir, '*')))
+
+  def testCreate_fromFunction_graphTensorArgument_raisesError(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      summary_ops.create_file_writer_v2(constant_op.constant(logdir))
+    with context.eager_mode():
+      with self.assertRaisesRegex(
+          ValueError, 'Invalid graph Tensor argument.*logdir'):
+        f()
+    self.assertEmpty(gfile.Glob(os.path.join(logdir, '*')))
+
+  def testCreate_fromFunction_unpersistedResource_raisesError(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
+        pass  # Calling .as_default() is enough to indicate use.
+    with context.eager_mode():
+      # TODO(nickfelt): change this to a better error
+      with self.assertRaisesRegex(
+          errors.NotFoundError, 'Resource.*does not exist'):
+        f()
+    # Even though we didn't use it, an event file will have been created.
+    self.assertEqual(1, len(gfile.Glob(os.path.join(logdir, '*'))))
+
+  def testCreate_immediateSetAsDefault_retainsReference(self):
+    logdir = self.get_temp_dir()
+    try:
+      with context.eager_mode():
+        summary_ops.create_file_writer_v2(logdir).set_as_default()
+        summary_ops.flush()
+    finally:
+      # Ensure we clean up no matter how the test executes.
+      context.context().summary_writer_resource = None
+
+  def testCreate_immediateAsDefault_retainsReference(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer_v2(logdir).as_default():
+        summary_ops.flush()
+
+  def testNoSharing(self):
+    # Two writers with the same logdir should not share state.
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer1 = summary_ops.create_file_writer_v2(logdir)
+      with writer1.as_default():
+        summary_ops.write('tag', 1, step=1)
+      event_files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(event_files))
+      file1 = event_files[0]
+
+      writer2 = summary_ops.create_file_writer_v2(logdir)
+      with writer2.as_default():
+        summary_ops.write('tag', 1, step=2)
+      event_files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(2, len(event_files))
+      event_files.remove(file1)
+      file2 = event_files[0]
+
+      # Extra writes to ensure interleaved usage works.
+      with writer1.as_default():
+        summary_ops.write('tag', 1, step=1)
+      with writer2.as_default():
+        summary_ops.write('tag', 1, step=2)
+
+    events = iter(events_from_file(file1))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(1, next(events).step)
+    self.assertEqual(1, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+    events = iter(events_from_file(file2))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(2, next(events).step)
+    self.assertEqual(2, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+  def testNoSharing_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f1():
+      if not hasattr(f1, 'writer'):
+        f1.writer = summary_ops.create_file_writer_v2(logdir)
+      with f1.writer.as_default():
+        summary_ops.write('tag', 1, step=1)
+    @def_function.function
+    def f2():
+      if not hasattr(f2, 'writer'):
+        f2.writer = summary_ops.create_file_writer_v2(logdir)
+      with f2.writer.as_default():
+        summary_ops.write('tag', 1, step=2)
+    with context.eager_mode():
+      f1()
+      event_files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(event_files))
+      file1 = event_files[0]
+
+      f2()
+      event_files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(2, len(event_files))
+      event_files.remove(file1)
+      file2 = event_files[0]
+
+      # Extra writes to ensure interleaved usage works.
+      f1()
+      f2()
+
+    events = iter(events_from_file(file1))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(1, next(events).step)
+    self.assertEqual(1, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+    events = iter(events_from_file(file2))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(2, next(events).step)
+    self.assertEqual(2, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer_v2(
+          logdir, max_queue=1, flush_millis=999999).as_default():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        # Should flush after second summary since max_queue = 1
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(events_from_logdir(logdir))
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(
+          logdir, max_queue=1000, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(2, get_total())
+      # Exiting the "as_default()" should do an implicit flush
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(
+          logdir, max_queue=999999, flush_millis=999999)
+      with writer.as_default():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        summary_ops.flush()
+        self.assertEqual(3, get_total())
+        # Test "writer" parameter
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+        summary_ops.flush(writer=writer)
+        self.assertEqual(4, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(4, get_total())
+        summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+        self.assertEqual(5, get_total())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testEagerMemory(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.create_file_writer_v2(logdir).as_default():
+      summary_ops.write('tag', 1, step=0)
+
+  def testClose_preventsLaterUse(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      writer.close()
+      writer.close()  # redundant close() is a no-op
+      writer.flush()  # redundant flush() is a no-op
+      with self.assertRaisesRegex(RuntimeError, 'already closed'):
+        writer.init()
+      with self.assertRaisesRegex(RuntimeError, 'already closed'):
+        with writer.as_default():
+          self.fail('should not get here')
+      with self.assertRaisesRegex(RuntimeError, 'already closed'):
+        writer.set_as_default()
+
+  def testClose_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      writer.close()
+      self.assertNotIn(eventfile, get_open_filenames())
+
+  def testDereference_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer_v2(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      del writer
+      self.assertNotIn(eventfile, get_open_filenames())
+
+
+class SummaryOpsTest(test_util.TensorFlowTestCase):
+
+  def tearDown(self):
+    summary_ops.trace_off()
+
+  def run_metadata(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1]
+
+  def run_metadata_graphs(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata_graphs(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1]
+
+  def create_run_metadata(self):
+    step_stats = step_stats_pb2.StepStats(dev_stats=[
+        step_stats_pb2.DeviceStepStats(
+            device='cpu:0',
+            node_stats=[step_stats_pb2.NodeExecStats(node_name='hello')])
+    ])
+    return config_pb2.RunMetadata(
+        function_graphs=[
+            config_pb2.RunMetadata.FunctionGraphs(
+                pre_optimization_graph=graph_pb2.GraphDef(
+                    node=[node_def_pb2.NodeDef(name='foo')]))
+        ],
+        step_stats=step_stats)
+
+  def keras_model(self, *args, **kwargs):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.keras_model(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    # The first event contains no summary values. The written content goes to
+    # the second event.
+    return events[1]
+
+  def run_trace(self, f, step=1):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    summary_ops.trace_on(graph=True, profiler=False)
+    with writer.as_default():
+      f()
+      summary_ops.trace_export(name='foo', step=step)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1]
+
+  @test_util.run_v2_only
+  def testRunMetadata_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      event = self.run_metadata(name='my_name', data=meta, step=1)
+      first_val = event.summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadata_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    event = self.run_metadata(name='my_name', data=meta, step=1)
+    actual_summary_metadata = event.summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadata_wholeRunMetadata(self):
+    expected_run_metadata = """
+      step_stats {
+        dev_stats {
+          device: "cpu:0"
+          node_stats {
+            node_name: "hello"
+          }
+        }
+      }
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+    event = self.run_metadata(name='my_name', data=meta, step=1)
+    first_val = event.summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadata_usesDefaultStep(self):
+    meta = config_pb2.RunMetadata()
+    try:
+      summary_ops.set_step(42)
+      event = self.run_metadata(name='my_name', data=meta)
+      self.assertEqual(42, event.step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      event = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+      first_val = event.summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata_graph"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    event = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    actual_summary_metadata = event.summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_runMetadataFragment(self):
+    expected_run_metadata = """
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+
+    event = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    first_val = event.summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_usesDefaultStep(self):
+    meta = config_pb2.RunMetadata()
+    try:
+      summary_ops.set_step(42)
+      event = self.run_metadata_graphs(name='my_name', data=meta)
+      self.assertEqual(42, event.step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  @test_util.run_v2_only
+  def testKerasModel(self):
+    model = Sequential(
+        [Dense(10, input_shape=(100,)),
+         Activation('relu', name='my_relu')])
+    event = self.keras_model(name='my_name', data=model, step=1)
+    first_val = event.summary.value[0]
+    self.assertEqual(model.to_json(), first_val.tensor.string_val[0].decode())
+
+  @test_util.run_v2_only
+  def testKerasModel_usesDefaultStep(self):
+    model = Sequential(
+        [Dense(10, input_shape=(100,)),
+         Activation('relu', name='my_relu')])
+    try:
+      summary_ops.set_step(42)
+      event = self.keras_model(name='my_name', data=model)
+      self.assertEqual(42, event.step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+  @test_util.run_v2_only
+  def testKerasModel_subclass(self):
+
+    class SimpleSubclass(Model):
+
+      def __init__(self):
+        super(SimpleSubclass, self).__init__(name='subclass')
+        self.dense = Dense(10, input_shape=(100,))
+        self.activation = Activation('relu', name='my_relu')
+
+      def call(self, inputs):
+        x = self.dense(inputs)
+        return self.activation(x)
+
+    model = SimpleSubclass()
+    with test.mock.patch.object(logging, 'warn') as mock_log:
+      self.assertFalse(
+          summary_ops.keras_model(name='my_name', data=model, step=1))
+      self.assertRegexpMatches(
+          str(mock_log.call_args), 'Model failed to serialize as JSON.')
+
+  @test_util.run_v2_only
+  def testKerasModel_otherExceptions(self):
+    model = Sequential()
+
+    with test.mock.patch.object(model, 'to_json') as mock_to_json:
+      with test.mock.patch.object(logging, 'warn') as mock_log:
+        mock_to_json.side_effect = Exception('oops')
+        self.assertFalse(
+            summary_ops.keras_model(name='my_name', data=model, step=1))
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'Model failed to serialize as JSON. Ignoring... oops')
+
+  @test_util.run_v2_only
+  def testTrace(self):
+
+    @def_function.function
+    def f():
+      x = constant_op.constant(2)
+      y = constant_op.constant(3)
+      return x**y
+
+    event = self.run_trace(f)
+
+    first_val = event.summary.value[0]
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+
+    # Content of function_graphs is large and, for instance, device can change.
+    self.assertTrue(hasattr(actual_run_metadata, 'function_graphs'))
+
+  @test_util.run_v2_only
+  def testTrace_cannotEnableTraceInFunction(self):
+
+    @def_function.function
+    def f():
+      summary_ops.trace_on(graph=True, profiler=False)
+      x = constant_op.constant(2)
+      y = constant_op.constant(3)
+      return x**y
+
+    with test.mock.patch.object(logging, 'warn') as mock_log:
+      f()
+      self.assertRegexpMatches(
+          str(mock_log.call_args), 'Cannot enable trace inside a tf.function.')
+
+  @test_util.run_v2_only
+  def testTrace_cannotEnableTraceInGraphMode(self):
+    with test.mock.patch.object(logging, 'warn') as mock_log:
+      with context.graph_mode():
+        summary_ops.trace_on(graph=True, profiler=False)
+      self.assertRegexpMatches(
+          str(mock_log.call_args), 'Must enable trace in eager mode.')
+
+  @test_util.run_v2_only
+  def testTrace_cannotExportTraceWithoutTrace(self):
+    with six.assertRaisesRegex(self, ValueError,
+                               'Must enable trace before export.'):
+      summary_ops.trace_export(name='foo', step=1)
+
+  @test_util.run_v2_only
+  def testTrace_cannotExportTraceInFunction(self):
+    summary_ops.trace_on(graph=True, profiler=False)
+
+    @def_function.function
+    def f():
+      x = constant_op.constant(2)
+      y = constant_op.constant(3)
+      summary_ops.trace_export(name='foo', step=1)
+      return x**y
+
+    with test.mock.patch.object(logging, 'warn') as mock_log:
+      f()
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'Cannot export trace inside a tf.function.')
+
+  @test_util.run_v2_only
+  def testTrace_cannotExportTraceInGraphMode(self):
+    with test.mock.patch.object(logging, 'warn') as mock_log:
+      with context.graph_mode():
+        summary_ops.trace_export(name='foo', step=1)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'Can only export trace while executing eagerly.')
+
+  @test_util.run_v2_only
+  def testTrace_usesDefaultStep(self):
+
+    @def_function.function
+    def f():
+      x = constant_op.constant(2)
+      y = constant_op.constant(3)
+      return x**y
+
+    try:
+      summary_ops.set_step(42)
+      event = self.run_trace(f, step=None)
+      self.assertEqual(42, event.step)
+    finally:
+      # Reset to default state for other tests.
+      summary_ops.set_step(None)
+
+
 def events_from_file(filepath):
   """Returns all events in a single event file.
 
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index cfa9f122d1fcee1748cd30bdc4212d81a5709ae6..0f8043ca50e4452ab043a6e1e43a734631b7b51a 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -195,28 +196,29 @@ class SvdGradOpTest(test.TestCase):
   pass  # Filled in below
 
 
-def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
+def _NormalizingSvd(tf_a, full_matrices_):
+  tf_s, tf_u, tf_v = linalg_ops.svd(
+      tf_a, compute_uv=True, full_matrices=full_matrices_)
+  # Singular vectors are only unique up to an arbitrary phase. We normalize
+  # the vectors such that the first component of u (if m >=n) or v (if n > m)
+  # have phase 0.
+  m = tf_a.shape[-2]
+  n = tf_a.shape[-1]
+  if m >= n:
+    top_rows = tf_u[..., 0:1, :]
+  else:
+    top_rows = tf_v[..., 0:1, :]
+  if tf_u.dtype.is_complex:
+    angle = -math_ops.angle(top_rows)
+    phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
+  else:
+    phase = math_ops.sign(top_rows)
+  tf_u *= phase[..., :m]
+  tf_v *= phase[..., :n]
+  return tf_s, tf_u, tf_v
 
-  def _NormalizingSvd(tf_a):
-    tf_s, tf_u, tf_v = linalg_ops.svd(
-        tf_a, compute_uv=True, full_matrices=full_matrices_)
-    # Singular vectors are only unique up to an arbitrary phase. We normalize
-    # the vectors such that the first component of u (if m >=n) or v (if n > m)
-    # have phase 0.
-    m = tf_a.shape[-2]
-    n = tf_a.shape[-1]
-    if m >= n:
-      top_rows = tf_u[..., 0:1, :]
-    else:
-      top_rows = tf_v[..., 0:1, :]
-    if tf_u.dtype.is_complex:
-      angle = -math_ops.angle(top_rows)
-      phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
-    else:
-      phase = math_ops.sign(top_rows)
-    tf_u *= phase[..., :m]
-    tf_v *= phase[..., :n]
-    return tf_s, tf_u, tf_v
+
+def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
 
   @test_util.run_v1_only("b/120545219")
   def Test(self):
@@ -238,7 +240,7 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
     with self.session(use_gpu=True):
       tf_a = constant_op.constant(a)
       if compute_uv_:
-        tf_s, tf_u, tf_v = _NormalizingSvd(tf_a)
+        tf_s, tf_u, tf_v = _NormalizingSvd(tf_a, full_matrices_)
         outputs = [tf_s, tf_u, tf_v]
       else:
         tf_s = linalg_ops.svd(tf_a, compute_uv=False)
@@ -257,7 +259,53 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
             x_init_value=x_init,
             delta=delta)
         self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
+  return Test
+
+
+class SvdGradGradOpTest(test.TestCase):
+  pass  # Filled in below
 
+
+def _GetSvdGradGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
+
+  @test_util.run_v1_only("b/120545219")
+  def Test(self):
+    np.random.seed(42)
+    a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
+    if dtype_ in [np.complex64, np.complex128]:
+      a += 1j * np.random.uniform(
+          low=-1.0, high=1.0, size=shape_).astype(dtype_)
+    # Optimal stepsize for central difference is O(epsilon^{1/3}).
+    # See Equation (21) in:
+    # http://www.karenkopecky.net/Teaching/eco613614/Notes_NumericalDifferentiation.pdf
+    # TODO(rmlarsen): Move step size control to gradient checker.
+    epsilon = np.finfo(dtype_).eps
+    delta = 0.1 * epsilon**(1.0 / 3.0)
+    tol = 1e-5
+    with self.session(use_gpu=True):
+      tf_a = constant_op.constant(a)
+      if compute_uv_:
+        tf_s, tf_u, tf_v = _NormalizingSvd(tf_a, full_matrices_)
+        outputs = [tf_s, tf_u, tf_v]
+      else:
+        tf_s = linalg_ops.svd(tf_a, compute_uv=False)
+        outputs = [tf_s]
+      outputs_sums = [math_ops.reduce_sum(o) for o in outputs]
+      tf_func_outputs = math_ops.add_n(outputs_sums)
+      grad = gradients_impl.gradients(tf_func_outputs, tf_a)[0]
+      x_init = np.random.uniform(
+          low=-1.0, high=1.0, size=shape_).astype(dtype_)
+      if dtype_ in [np.complex64, np.complex128]:
+        x_init += 1j * np.random.uniform(
+            low=-1.0, high=1.0, size=shape_).astype(dtype_)
+      theoretical, numerical = gradient_checker.compute_gradient(
+          tf_a,
+          tf_a.get_shape().as_list(),
+          grad,
+          grad.get_shape().as_list(),
+          x_init_value=x_init,
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
   return Test
 
 
@@ -282,7 +330,7 @@ if __name__ == "__main__":
       dtypes = ([np.float32, np.float64]
                 + [np.complex64, np.complex128] * (not compute_uv))
       for dtype in dtypes:
-        mat_shapes = [(10, 11), (11, 10), (11, 11)]
+        mat_shapes = [(10, 11), (11, 10), (11, 11), (2, 2, 2, 3)]
         if not full_matrices or not compute_uv:
           mat_shapes += [(5, 11), (11, 5)]
         for mat_shape in mat_shapes:
@@ -293,5 +341,10 @@ if __name__ == "__main__":
                 full_matrices)
             _AddTest(SvdGradOpTest, "SvdGrad", name,
                      _GetSvdGradOpTest(dtype, shape, compute_uv, full_matrices))
-
+            # The results are too inacurate for float32.
+            if dtype == np.float64:
+              _AddTest(
+                  SvdGradGradOpTest, "SvdGradGrad", name,
+                  _GetSvdGradGradOpTest(dtype, shape, compute_uv,
+                                        full_matrices))
   test.main()
diff --git a/tensorflow/python/kernel_tests/template_mirrored_strategy_test.py b/tensorflow/python/kernel_tests/template_mirrored_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..de94212a9eb4992d9e6c6a31baa40595cb84925e
--- /dev/null
+++ b/tensorflow/python/kernel_tests/template_mirrored_strategy_test.py
@@ -0,0 +1,52 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for make_template used with MirroredStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class TemplateMirroredStrategyTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_merge_call(self):
+    def fn():
+      var1 = variable_scope.get_variable(
+          "var1", shape=[], initializer=init_ops.constant_initializer(21.))
+      ds_context.get_replica_context().merge_call(lambda _: ())
+      var2 = variable_scope.get_variable(
+          "var2", shape=[], initializer=init_ops.constant_initializer(2.))
+      return var1 * var2
+
+    temp = template.make_template("my_template", fn)
+
+    strategy = mirrored_strategy.MirroredStrategy(["/cpu:0", "/gpu:0"])
+    out = strategy.unwrap(strategy.experimental_run_v2(temp))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([42., 42.], self.evaluate(out))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index e8af998a7049c051f33d91a7a2a79bf8b92d9b33..4d7ae4f24ea04fe840d7f8e6f13e8d9ec95fffea 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -162,7 +162,7 @@ class TensorArrayTest(test.TestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0], [6.0, 7.0],
                    [106.0, 107.0], [8.0, 9.0]]), c0)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testTensorArrayWriteConcat(self):
     self._testTensorArrayWriteConcat(dtypes.float32)
     self._testTensorArrayWriteConcat(dtypes.float64)
@@ -185,7 +185,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/122324791")
   @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
@@ -202,11 +201,21 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/122324791")
   @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
+  @test_util.run_v1_only("Uses placeholders")
+  def testSkipEagerTensorArrayReadUninitializedInferShapeFillsZeros(self):
+    with self.cached_session(use_gpu=True) as sess:
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=3)
+      val = array_ops.placeholder(dtypes.float32)
+      self.assertAllEqual(
+          [[0.0, 0.0]], sess.run(ta.write(1, val).read(0), {val: [[4.0, 5.0]]}))
+
   def _testTensorArrayUnpackRead(self, tf_dtype):
     with self.cached_session(use_gpu=True):
       convert = _make_converter(tf_dtype)
@@ -299,7 +308,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(convert([]).reshape(0, 2), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testTensorArraySplitRead(self):
     self._testTensorArraySplitRead(dtypes.float32)
     self._testTensorArraySplitRead(dtypes.float64)
@@ -344,7 +353,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[2.0]], g_d1)
       self.assertAllEqual(-2.0, g_d2)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradGrad(self):
     if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
       self.skipTest("Legacy TensorArray does not support double derivatives.")
@@ -531,7 +540,10 @@ class TensorArrayTest(test.TestCase):
 
       # The exact error messages differ between eager execution and graph
       # construction as the former bubbles up the error from array_op.concat.
-      with self.assertRaisesOpError("shape"):
+      error_msg = ("Incompatible ranks"
+                   if control_flow_util.ENABLE_CONTROL_FLOW_V2 and
+                   not context.executing_eagerly() else "shape")
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, error_msg):
         self.evaluate(w3.concat())
 
   def testTensorArraySplitIncompatibleShapesFails(self):
@@ -733,7 +745,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(c([[3.0, 2.0]]), grad_vals[0])
       self.assertAllEqual(c(-2.0), grad_vals[1])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradientWriteRead(self):
     for dtype in (np.float32, np.float64, np.complex64, np.complex128):
       self._testTensorArrayGradientWriteReadType(dtype)
@@ -770,7 +782,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
@@ -829,11 +841,11 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradientSplitConcat(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -879,7 +891,7 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradientDynamicUnpackRead(self):
     self._testTensorArrayGradientDynamicUnpackRead()
 
@@ -1044,7 +1056,7 @@ class TensorArrayTest(test.TestCase):
         grad = gradients_impl.gradients(loop(x), [x])[0]
       self.assertAllClose(31.0, self.evaluate(grad))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.session(use_gpu=True) as session:
       a = array_ops.identity(
@@ -1080,7 +1092,7 @@ class TensorArrayTest(test.TestCase):
   def _grad_source_for_name(self, name):
     return tensor_array_grad._GetGradSource(constant_op.constant(0, name=name))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerGetGradSource_Invalid(self):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("")
@@ -1089,7 +1101,7 @@ class TensorArrayTest(test.TestCase):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("foo/bar")
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerGetGradSource_NoEnclosingScope(self):
     self.assertEqual("gradients:0", self._grad_source_for_name("gradients"))
     self.assertEqual("gradients_0:0", self._grad_source_for_name("gradients_0"))
@@ -1101,7 +1113,7 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("gradients_0",
                      self._grad_source_for_name("gradients_0/foo/bar"))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerGetGradSource_EnclosingScope(self):
     self.assertEqual("foo/gradients:0",
                      self._grad_source_for_name("foo/gradients"))
@@ -1116,13 +1128,13 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("foo/bar/gradients_0",
                      self._grad_source_for_name("foo/bar/gradients_0/baz"))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerGetGradSource_NestedUsesInnermost(self):
     self.assertEqual(
         "foo/gradients/bar/gradients_0",
         self._grad_source_for_name("foo/gradients/bar/gradients_0/baz"))
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerWriteShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1147,7 +1159,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w0.write(0, c2)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerPartlyUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1225,7 +1237,7 @@ class TensorArrayTest(test.TestCase):
   def testUnpackShape(self):
     self._testUnpackShape()
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSplitShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1257,7 +1269,7 @@ class TensorArrayTest(test.TestCase):
               tensor_shape.TensorShape(
                   ta1.handle.op.get_attr("element_shape")).ndims, None)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerWriteUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1281,13 +1293,23 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  # TODO(srbs): Figure out how to enable this. This is probably failing
-  # because we are trying to stack a TensorList with invalid tensors.
-  # That is because we do not receive gradients for all list indices.
-  # Figure out how TensorArray handles this.
-  def disabletestGradientWhenNotAllComponentsRead(self):
+  @test_util.deprecated_graph_mode_only
+  def testSkipEagerGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
+  @test_util.deprecated_graph_mode_only
+  def testSkipEagerWriteButNotAllComponentsReadGrad(self):
+    with self.cached_session(use_gpu=True) as session:
+      x0 = constant_op.constant(5.0)
+      x1 = constant_op.constant(10.0)
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, size=2).write(0, x0).write(1, x1)
+      r0 = ta.read(0)
+      # calculate (dr0/dx0, dr0/dx1).  since r0 = x0, gradients are (1, 0).
+      grad_r0_x1 = gradients_impl.gradients(ys=[r0], xs=[x0, x1], grad_ys=[1.0])
+      grad_r0_x1_vals = session.run(grad_r0_x1)
+      self.assertAllEqual(grad_r0_x1_vals, [1.0, 0.0])
+
   def _testTensorArrayUnpackDynamic(self):
     with self.cached_session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -1321,8 +1343,8 @@ class TensorArrayTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      v2_msg = ("Tried to stack elements of a empty list with non-fully-defined"
-                " element_shape")
+      v2_msg = ("Tried to stack elements of an empty list with "
+                "non-fully-defined element_shape")
       v1_msg = (
           "TensorArray has size zero, but element shape <unknown> is not "
           "fully defined. Currently only static shapes are supported when "
@@ -1385,6 +1407,42 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayScatterPartialReadAndGradients(self):
+    with self.session(use_gpu=True) as session:
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=0,
+          dynamic_size=True)
+
+      indices = constant_op.constant([1, 8])
+      value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
+
+      w = ta.scatter(indices, value)
+      r0 = w.read(1)
+
+      # Test combined gradients + aggregation of read(0)
+      grad = gradients_impl.gradients(
+          ys=[r0], xs=[value], grad_ys=[[2.0, 3.0]])[0]
+      read_val, grad_val = session.run([r0, grad])
+
+      self.assertAllEqual([1.0, -1.0], read_val)
+      self.assertAllEqual([[2.0, 3.0], [0.0, 0.0]], grad_val)
+
+  def testScatterIntoExistingList(self):
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.float32, tensor_array_name="foo", size=5)
+
+    ta = ta.scatter(indices=[3, 4], value=array_ops.ones([2]))
+    self.assertAllEqual(ta.stack(), [0., 0., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[1], value=array_ops.ones([1]))
+    self.assertAllEqual(ta.stack(), [0., 1., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[0, 2], value=[5., 6.])
+    self.assertAllEqual(ta.stack(), [5., 1., 6., 1., 1.])
+
   @test_util.run_v1_only("b/118890905")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
@@ -1468,7 +1526,7 @@ class TensorArrayTest(test.TestCase):
       if "/task:1/" in d:
         self.assertTrue(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
-      else:
+      elif "/host:CPU" not in d:
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
@@ -1587,7 +1645,7 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(size0_v, 2)
       self.assertEqual(size1_v, 4)
 
-  @test_util.run_deprecated_v1
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayGradYsInCorrectScope(self):
     n_time = 1
     n_dim = 1
@@ -1606,6 +1664,7 @@ class TensorArrayTest(test.TestCase):
         vdx, vdy = self.evaluate([dx, dy])
       self.assertAllClose(vdx, vdy)
 
+  @test_util.deprecated_graph_mode_only
   def testSkipEagerTensorArrayInt64GPU(self):
     if not test.is_gpu_available():
       return
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index 5d46176bce87a94ac6f2c2ce51739c0289b38b80..32ac9a415695d5bdaea530f82ffa8ddde1eb643d 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -182,6 +182,11 @@ class TopKTest(test.TestCase):
     k = constant_op.constant(3)
     self._validateTopK(inputs, k, [19, 18, 17], [11, 3, 7])
 
+  def testTop3ZeroRows(self):
+    inputs = np.zeros([0, 10], dtype=np.float32)
+    self._validateTopK(inputs, 3, np.zeros([0, 3], dtype=np.float32),
+                       np.zeros([0, 3], dtype=np.int32))
+
   @test_util.run_deprecated_v1
   def testKNegative(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
diff --git a/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd603a9547ee6f318f55f5b790953a51e591b0e
--- /dev/null
+++ b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
@@ -0,0 +1,456 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.linalg.linalg_impl.tridiagonal_solve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+_sample_diags = np.array([[2, 1, 4, 0], [1, 3, 2, 2], [0, 1, -1, 1]])
+_sample_rhs = np.array([1, 2, 3, 4])
+_sample_result = np.array([-9, 5, -4, 4])
+
+
+def _tfconst(array):
+  return constant_op.constant(array, dtypes.float64)
+
+
+def _tf_ones(shape):
+  return array_ops.ones(shape, dtype=dtypes.float64)
+
+
+class TridiagonalSolveOpTest(test.TestCase):
+
+  def _test(self,
+            diags,
+            rhs,
+            expected,
+            diags_format="compact",
+            transpose_rhs=False,
+            conjugate_rhs=False):
+    with self.cached_session(use_gpu=False):
+      result = linalg_impl.tridiagonal_solve(diags, rhs, diags_format,
+                                             transpose_rhs, conjugate_rhs)
+      self.assertAllClose(self.evaluate(result), expected)
+
+  def _testWithLists(self,
+                     diags,
+                     rhs,
+                     expected,
+                     diags_format="compact",
+                     transpose_rhs=False,
+                     conjugate_rhs=False):
+    self._test(
+        _tfconst(diags), _tfconst(rhs), _tfconst(expected), diags_format,
+        transpose_rhs, conjugate_rhs)
+
+  def _assertRaises(self, diags, rhs, diags_format="compact"):
+    with self.assertRaises(ValueError):
+      linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+
+  # Tests with various dtypes
+
+  def testReal(self):
+    for dtype in dtypes.float32, dtypes.float64:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype),
+          rhs=constant_op.constant(_sample_rhs, dtype),
+          expected=constant_op.constant(_sample_result, dtype))
+
+  def testComplex(self):
+    for dtype in dtypes.complex64, dtypes.complex128:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype) * (1 + 1j),
+          rhs=constant_op.constant(_sample_rhs, dtype) * (1 - 1j),
+          expected=constant_op.constant(_sample_result, dtype) * (1 - 1j) /
+          (1 + 1j))
+
+  # Tests with small matrix sizes
+
+  def test3x3(self):
+    self._testWithLists(
+        diags=[[2, -1, 0], [1, 3, 1], [0, -1, -2]],
+        rhs=[1, 2, 3],
+        expected=[-3, 2, 7])
+
+  def test2x2(self):
+    self._testWithLists(
+        diags=[[2, 0], [1, 3], [0, 1]], rhs=[1, 4], expected=[-5, 3])
+
+  def test1x1(self):
+    self._testWithLists(diags=[[0], [3], [0]], rhs=[6], expected=[2])
+
+  def test0x0(self):
+    self._test(
+        diags=constant_op.constant(0, shape=(3, 0), dtype=dtypes.float32),
+        rhs=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32),
+        expected=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32))
+
+  # Other edge cases
+
+  def testCaseRequiringPivoting(self):
+    # Without partial pivoting (e.g. Thomas algorithm) this would fail.
+    self._testWithLists(
+        diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, -2, 3]],
+        rhs=[1, 2, 3, 4],
+        expected=[8, -3.5, 0, -4])
+
+  def testCaseRequiringPivotingLastRows(self):
+    self._testWithLists(
+        diags=[[2, 1, -1, 0], [1, -1, 2, 1], [0, 1, -6, 1]],
+        rhs=[1, 2, -1, -2],
+        expected=[5, -2, -5, 3])
+
+  def testNotInvertible(self):
+    with self.assertRaises(errors_impl.InvalidArgumentError):
+      self._testWithLists(
+          diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, 0, 3]],
+          rhs=[1, 2, 3, 4],
+          expected=[8, -3.5, 0, -4])
+
+  def testDiagonal(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [1, 2, -1, -2], [0, 0, 0, 0]],
+        rhs=[1, 2, 3, 4],
+        expected=[1, 1, -3, -2])
+
+  def testUpperTriangular(self):
+    self._testWithLists(
+        diags=[[2, 4, -1, 0], [1, 3, 1, 2], [0, 0, 0, 0]],
+        rhs=[1, 6, 4, 4],
+        expected=[13, -6, 6, 2])
+
+  def testLowerTriangular(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [2, -1, 3, 1], [0, 1, 4, 2]],
+        rhs=[4, 5, 6, 1],
+        expected=[2, -3, 6, -11])
+
+  # Multiple right-hand sides and batching
+
+  def testWithTwoRightHandSides(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testBatchingAndTwoRightHandSides(self):
+    rhs = np.transpose([_sample_rhs, 2 * _sample_rhs])
+    expected_result = np.transpose([_sample_result, 2 * _sample_result])
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([rhs, 2 * rhs]),
+        expected=np.array([expected_result, -2 * expected_result]))
+
+  # Various input formats
+
+  def testSequenceFormat(self):
+    self._test(
+        diags=(_tfconst([2, 1, 4]), _tfconst([1, 3, 2, 2]), _tfconst([1, -1,
+                                                                      1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithDummyElements(self):
+    dummy = 20
+    self._test(
+        diags=(_tfconst([2, 1, 4, dummy]), _tfconst([1, 3, 2, 2]),
+               _tfconst([dummy, 1, -1, 1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithBatching(self):
+    self._test(
+        diags=(_tfconst([[2, 1, 4], [-2, -1, -4]]),
+               _tfconst([[1, 3, 2, 2], [-1, -3, -2, -2]]),
+               _tfconst([[1, -1, 1], [-1, 1, -1]])),
+        rhs=_tfconst([[1, 2, 3, 4], [1, 2, 3, 4]]),
+        expected=_tfconst([[-9, 5, -4, 4], [9, -5, 4, -4]]),
+        diags_format="sequence")
+
+  def testMatrixFormat(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[1, 2, 3, 4],
+        expected=[-9, 5, -4, 4],
+        diags_format="matrix")
+
+  def testMatrixFormatWithMultipleRightHandSides(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[[1, -1], [2, -2], [3, -3], [4, -4]],
+        expected=[[-9, 9], [5, -5], [-4, 4], [4, -4]],
+        diags_format="matrix")
+
+  def testMatrixFormatWithBatching(self):
+    self._testWithLists(
+        diags=[[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+               [[-1, -2, 0, 0], [-1, -3, -1, 0], [0, 1, -2, -4], [0, 0, -1,
+                                                                  -2]]],
+        rhs=[[1, 2, 3, 4], [1, 2, 3, 4]],
+        expected=[[-9, 5, -4, 4], [9, -5, 4, -4]],
+        diags_format="matrix")
+
+  def testRightHandSideAsColumn(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs]),
+        expected=np.transpose([_sample_result]),
+        diags_format="compact")
+
+  # Tests with transpose and adjoint
+
+  def testTransposeRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, 2 * _sample_result]),
+        transpose_rhs=True)
+
+  def testConjugateRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.transpose(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        conjugate_rhs=True)
+
+  def testAdjointRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.array(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        transpose_rhs=True,
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([[_sample_rhs, 2 * _sample_rhs],
+                      [3 * _sample_rhs, 4 * _sample_rhs]]),
+        expected=np.array([[_sample_result, 2 * _sample_result],
+                           [-3 * _sample_result, -4 * _sample_result]]),
+        transpose_rhs=True)
+
+  def testTransposeRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs,
+        expected=_sample_result,
+        transpose_rhs=True)
+
+  def testConjugateRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs * (1 + 1j),
+        expected=_sample_result * (1 - 1j),
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithRhsAsVectorAndBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]),
+        transpose_rhs=True)
+
+  # Invalid input shapes
+
+  def testInvalidShapesCompactFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "compact")
+
+    test_raises((5, 4, 4), (5, 4))
+    test_raises((5, 3, 4), (4, 5))
+    test_raises((5, 3, 4), (5))
+    test_raises((5), (5, 4))
+
+  def testInvalidShapesSequenceFormat(self):
+
+    def test_raises(diags_tuple_shapes, rhs_shape):
+      diagonals = tuple(_tf_ones(shape) for shape in diags_tuple_shapes)
+      self._assertRaises(diagonals, _tf_ones(rhs_shape), "sequence")
+
+    test_raises(((5, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 3), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 6), (5, 4), (5, 3)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (3, 4))
+
+  def testInvalidShapesMatrixFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "matrix")
+
+    test_raises((5, 4, 7), (5, 4))
+    test_raises((5, 4, 4), (3, 4))
+    test_raises((5, 4, 4), (5, 3))
+
+  # Tests with placeholders
+
+  def _testWithPlaceholders(self,
+                            diags_shape,
+                            rhs_shape,
+                            diags_feed,
+                            rhs_feed,
+                            expected,
+                            diags_format="compact"):
+    if context.executing_eagerly():
+      return
+    diags = array_ops.placeholder(dtypes.float64, shape=diags_shape)
+    rhs = array_ops.placeholder(dtypes.float64, shape=rhs_shape)
+    x = linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(x, feed_dict={diags: diags_feed, rhs: rhs_feed})
+      self.assertAllClose(result, expected)
+
+  def testCompactFormatAllDimsUnknown(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, None],
+        rhs_shape=[None],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownMatrixSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, None],
+        rhs_shape=[4],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownRhsCount(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, 4],
+        rhs_shape=[4, None],
+        diags_feed=_sample_diags,
+        rhs_feed=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testCompactFormatUnknownBatchSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, 3, 4],
+        rhs_shape=[None, 4],
+        diags_feed=np.array([_sample_diags, -_sample_diags]),
+        rhs_feed=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testMatrixFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+
+    def test_with_matrix_shapes(matrix_shape):
+      matrix = np.array([[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4],
+                         [0, 0, 1, 2]])
+      rhs = np.array([1, 2, 3, 4])
+      x = np.array([-9, 5, -4, 4])
+      self._testWithPlaceholders(
+          diags_shape=matrix_shape,
+          rhs_shape=[None, None],
+          diags_feed=matrix,
+          rhs_feed=np.transpose([rhs, 2 * rhs]),
+          expected=np.transpose([x, 2 * x]),
+          diags_format="matrix")
+
+    test_with_matrix_shapes(matrix_shape=[4, 4])
+    test_with_matrix_shapes(matrix_shape=[None, 4])
+    test_with_matrix_shapes(matrix_shape=[4, None])
+    with self.assertRaises(ValueError):
+      test_with_matrix_shapes(matrix_shape=[None, None])
+
+  def testSequenceFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+    superdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    diag = array_ops.placeholder(dtypes.float64, shape=[None])
+    subdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    rhs = array_ops.placeholder(dtypes.float64, shape=[None])
+
+    x = linalg_impl.tridiagonal_solve((superdiag, diag, subdiag),
+                                      rhs,
+                                      diagonals_format="sequence")
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(
+          x,
+          feed_dict={
+              subdiag: [20, 1, -1, 1],
+              diag: [1, 3, 2, 2],
+              superdiag: [2, 1, 4, 20],
+              rhs: [1, 2, 3, 4]
+          })
+      self.assertAllClose(result, [-9, 5, -4, 4])
+
+  # Benchmark
+
+  class TridiagonalSolveBenchmark(test.Benchmark):
+    sizes = [(100000, 1, 1), (1000000, 1, 1), (10000000, 1, 1), (100000, 10, 1),
+             (100000, 100, 1), (10000, 1, 100), (10000, 1, 1000),
+             (10000, 1, 10000)]
+
+    def _generateData(self, matrix_size, batch_size, num_rhs, seed=42):
+      data = random_ops.random_normal(
+          shape=(batch_size, 3 + num_rhs, matrix_size),
+          dtype=dtypes.float64,
+          seed=seed)
+      diags = array_ops.stack([data[:, 0], data[:, 1], data[:, 2]], axis=-2)
+      rhs = data[:, 3:, :]
+      return diags, rhs
+
+    def benchmarkTridiagonalSolveOp(self):
+      for matrix_size, batch_size, num_rhs in self.sizes:
+        with ops.Graph().as_default(), \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
+                ops.device("/cpu:0"):
+          diags, rhs = self._generateData(matrix_size, batch_size, num_rhs)
+          x = linalg_impl.tridiagonal_solve(diags, rhs, transpose_rhs=True)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(x),
+              min_iters=10,
+              store_memory_usage=False,
+              name=("tridiagonal_solve_matrix_size_{}_batch_size_{}_"
+                    "num_rhs_{}").format(matrix_size, batch_size, num_rhs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index f5ba475e7adabc9bb5b057504ad854f550395440..89885cf752bf51add97896325ef7c77bb2553b14 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -41,20 +41,20 @@ class UnstackOpTest(test.TestCase):
 
   def testSimple(self):
     np.random.seed(7)
-    with test_util.use_gpu():
-      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [
-            np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
-        ]:
-          data = np.random.randn(*shape).astype(dtype)
-          # Convert data to a single tensorflow tensor
-          x = constant_op.constant(data)
-          # Unstack into a list of tensors
-          cs = array_ops.unstack(x, num=shape[0])
-          self.assertEqual(type(cs), list)
-          self.assertEqual(len(cs), shape[0])
-          cs = [self.evaluate(c) for c in cs]
-          self.assertAllEqual(cs, data)
+    for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+      for dtype in [
+          np.bool, np.float16, np.float32, np.float64, np.uint8, np.int32,
+          np.int64
+      ]:
+        data = np.random.randn(*shape).astype(dtype)
+        # Convert data to a single tensorflow tensor
+        x = constant_op.constant(data)
+        # Unstack into a list of tensors
+        cs = array_ops.unstack(x, num=shape[0])
+        self.assertEqual(type(cs), list)
+        self.assertEqual(len(cs), shape[0])
+        cs = [self.evaluate(c) for c in cs]
+        self.assertAllEqual(cs, data)
 
   def testSimpleGpu(self):
     if not test_util.is_gpu_available():
@@ -63,7 +63,10 @@ class UnstackOpTest(test.TestCase):
     np.random.seed(7)
     with test_util.force_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
+        for dtype in [
+            np.bool, np.float16, np.float32, np.float64, np.uint8, np.int32,
+            np.int64
+        ]:
           data = np.random.randn(*shape).astype(dtype)
           # Convert data to a single tensorflow tensor
           x = constant_op.constant(data)
@@ -80,7 +83,7 @@ class UnstackOpTest(test.TestCase):
       data = np.random.randn(*shape)
       shapes = [shape[1:]] * shape[0]
       for i in xrange(shape[0]):
-        with self.cached_session(use_gpu=True):
+        with self.cached_session():
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[0])
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
@@ -94,7 +97,7 @@ class UnstackOpTest(test.TestCase):
       out_shape = list(shape)
       del out_shape[1]
       for i in xrange(shape[1]):
-        with self.cached_session(use_gpu=True):
+        with self.cached_session():
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[1], axis=1)
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
@@ -103,12 +106,11 @@ class UnstackOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testInferNum(self):
-    with self.cached_session():
-      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        x = array_ops.placeholder(np.float32, shape=shape)
-        cs = array_ops.unstack(x)
-        self.assertEqual(type(cs), list)
-        self.assertEqual(len(cs), shape[0])
+    for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+      x = array_ops.placeholder(np.float32, shape=shape)
+      cs = array_ops.unstack(x)
+      self.assertEqual(type(cs), list)
+      self.assertEqual(len(cs), shape[0])
 
   @test_util.run_deprecated_v1
   def testCannotInferNumFromUnknownShape(self):
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 028ef11fc496725fd6535dd28196e9fadcf2fee4..b3316b73ff6c6fccedced987cfa8428092df4014 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -828,5 +828,29 @@ class VariableContainerTest(test.TestCase):
     self.assertEqual(compat.as_bytes(""), v4.op.get_attr("container"))
 
 
+class AggregationModesTest(test.TestCase):
+
+  def testV1V2Equal(self):
+    v1 = variables.VariableAggregation
+    v2 = variables.VariableAggregationV2
+
+    self.assertEqual(v1.NONE, v2.NONE)
+    self.assertEqual(v1.SUM, v2.SUM)
+    self.assertEqual(v1.MEAN, v2.MEAN)
+    self.assertEqual(v1.ONLY_FIRST_REPLICA, v2.ONLY_FIRST_REPLICA)
+    self.assertEqual(v1.ONLY_FIRST_TOWER, v2.ONLY_FIRST_REPLICA)
+
+    self.assertEqual(v2.NONE, v1.NONE)
+    self.assertEqual(v2.SUM, v1.SUM)
+    self.assertEqual(v2.MEAN, v1.MEAN)
+    self.assertEqual(v2.ONLY_FIRST_REPLICA, v1.ONLY_FIRST_REPLICA)
+    self.assertEqual(v2.ONLY_FIRST_REPLICA, v1.ONLY_FIRST_TOWER)
+
+    self.assertEqual(hash(v1.NONE), hash(v2.NONE))
+    self.assertEqual(hash(v1.SUM), hash(v2.SUM))
+    self.assertEqual(hash(v1.MEAN), hash(v2.MEAN))
+    self.assertEqual(hash(v1.ONLY_FIRST_REPLICA), hash(v2.ONLY_FIRST_REPLICA))
+    self.assertEqual(hash(v1.ONLY_FIRST_TOWER), hash(v2.ONLY_FIRST_REPLICA))
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index 061d787760889cb344967fee861147d1d4674ad2..61cd1181ed7b3bda5fd29c84c33ad1fa3cea22d1 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -32,9 +32,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2
@@ -83,6 +83,20 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
 
       self.assertAllEqual(fnWithLoop(), 4.0)
 
+  def testExternalControlDependencies(self):
+    with ops.Graph().as_default(), self.test_session():
+      v = variables.Variable(1.)
+      v.initializer.run()
+      op = v.assign_add(1.)
+
+      def body_fn(i):  # pylint: disable=invalid-name
+        with ops.control_dependencies([op]):
+          return i + 1
+
+      loop = while_loop_v2(lambda i: i < 1, body_fn, [0])
+      loop[0].op.run()
+      self.assertAllEqual(self.evaluate(v), 2.0)
+
   @test_util.run_deprecated_v1
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
@@ -283,8 +297,8 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         while_op = op
 
     body_graph = while_v2._get_graph(while_op, "body")
-    # body_graph.inputs: [counter_arg, x_arg, tl_arg, *accumulators]
-    x_input_t = body_graph.inputs[1]
+    x_input_index = [i for i, inp in enumerate(while_op.inputs) if inp == x][0]
+    x_input_t = body_graph.inputs[x_input_index]
     accumulator_count = len(
         [c for c in x_input_t.consumers() if c.type == "TensorListPushBack"])
     self.assertEqual(accumulator_count, 1)
@@ -331,12 +345,13 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     while_op = ret[0].op.inputs[0].op
     # Gradient pass.
     grad = gradients_impl.gradients(ret[0], x)
+    # Note: There is an Identity b/w grad[0] and the While op.
     grad_while_op = grad[0].op.inputs[0].op
 
     # Get the TensorList output of While op containing the accumulated values
     # of y.
-    # while_op.inputs: [counter_arg, x_arg, y_arg, *accumulators]
-    output = GetAccumulatorForInputAtIndex(while_op, 2)
+    x_input_index = [i for i, inp in enumerate(while_op.inputs) if x == inp][0]
+    output = GetAccumulatorForInputAtIndex(while_op, x_input_index)
     _, val = list_ops.tensor_list_pop_back(output,
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
@@ -347,8 +362,9 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # Get the TensorList output of gradient While op containing the accumulated
     # values of grad_x (note that grad_x is needed by the second derivative).
     # grad_while_op.inputs:
-    # [counter_arg, total_iters_arg, grad_x_arg, grad_y_arg, *other_args]
-    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 2)
+    grad_output_index = grad_while_op.outputs.index(grad[0].op.inputs[0])
+    grad_output = GetAccumulatorForInputAtIndex(grad_while_op,
+                                                grad_output_index)
     _, val = list_ops.tensor_list_pop_back(grad_output,
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
@@ -394,7 +410,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     param = constant_op.constant(2.0)
     y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
     # map_fn uses TensorArray internally.
-    r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
+    r = map_fn.map_fn(lambda x: math_ops.multiply(x, param), y0)
     grad = gradients_impl.gradients(r, param)[0]
     self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
     self.assertAllClose(21.0, self.evaluate(grad))
@@ -447,17 +463,17 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
                                  [x])[0]
     while_op = output.op.inputs[0].op
     self.assertEqual(while_op.type, "While")
-    # outputs = [loop_counter, x]
-    self.assertLen(while_op.outputs, 2)
+    # outputs = [loop_counter, max_iters, x]
+    self.assertLen(while_op.outputs, 3)
 
     gradients_impl.gradients(output, x)
     # while_op should have been rewritten to output 2.0 intermediate.
-    # outputs = [loop_counter, x, 2.0_accumulator, x_accumulator]
-    self.assertLen(while_op.outputs, 4)
+    # outputs = [loop_counter, max_iters, x, 2.0_accumulator, x_accumulator]
+    self.assertLen(while_op.outputs, 5)
 
     gradients_impl.gradients(output, x)
     # Computing the gradient again shouldn't rewrite while_op again.
-    self.assertLen(while_op.outputs, 4)
+    self.assertLen(while_op.outputs, 5)
 
 
 def ScalarShape():
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index f5d03c2370186e39cad2ba9aa29d03c454de9168..f37b57c38572100611aada51c79988c16fb7b08e 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -41,7 +41,7 @@ from tensorflow.python.platform import test
 class XentTest(test.TestCase):
 
   def _npXent(self, features, labels, dim=-1):
-    if dim is -1:
+    if dim == -1:
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py
index 3dd9ec4ba9459b95f690a2146c7f94ad75043d6d..1220be45733e16759b5f6ed18e0c3c215855a1a7 100644
--- a/tensorflow/python/kernel_tests/zero_division_test.py
+++ b/tensorflow/python/kernel_tests/zero_division_test.py
@@ -51,8 +51,10 @@ class ZeroDivisionTest(test.TestCase):
             # means 32 bits set, so we allow 0xffffffff as well.  This isn't
             # very portable, so we may need to expand this list if other GPUs
             # do different things.
+            #
+            # XLA constant folds integer division by zero to 1.
             self.assertTrue(test.is_gpu_available())
-            self.assertIn(result, (-1, 0xff, 0xffffffff))
+            self.assertIn(result, (-1, 1, 0xff, 0xffffffff))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 1b84ec1f69ed55a5c86c7767e986c7bc542e1841..89652a0a4b87e16826c27a21fd09868049a216e0 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -26,7 +26,7 @@ from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
@@ -307,7 +307,8 @@ class Layer(base_layer.Layer):
                  use_resource=None,
                  synchronization=vs.VariableSynchronization.AUTO,
                  aggregation=vs.VariableAggregation.NONE,
-                 partitioner=None):
+                 partitioner=None,
+                 **kwargs):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -342,6 +343,7 @@ class Layer(base_layer.Layer):
         `tf.variable_axis_size_partitioner`.  For more details, see the
         documentation of `tf.get_variable` and the  "Variable Partitioners
         and Sharding" section of the API guide.
+      **kwargs: Additional keyword arguments.
 
     Returns:
       The created variable.  Usually either a `Variable` or `ResourceVariable`
@@ -354,6 +356,9 @@ class Layer(base_layer.Layer):
       ValueError: When trainable has been set to True with synchronization
         set as `ON_READ`.
     """
+    for kwarg in kwargs:
+      if kwarg != 'experimental_autocast':
+        raise TypeError('Unknown keyword argument:', kwarg)
     if self._keras_style:
       return super(Layer, self).add_weight(
           name=name,
@@ -366,7 +371,8 @@ class Layer(base_layer.Layer):
           use_resource=use_resource,
           synchronization=vs.VariableSynchronization.AUTO,
           aggregation=vs.VariableAggregation.NONE,
-          partitioner=partitioner)
+          partitioner=partitioner,
+          **kwargs)
 
     if synchronization == vs.VariableSynchronization.ON_READ:
       if trainable:
@@ -433,11 +439,12 @@ class Layer(base_layer.Layer):
             use_resource=use_resource,
             synchronization=synchronization,
             aggregation=aggregation,
-            getter=vs.get_variable)
+            getter=vs.get_variable,
+            **kwargs)
 
         if regularizer:
-          if context.executing_eagerly() or _should_add_regularizer(
-              variable, existing_variables):
+          if (ops.executing_eagerly_outside_functions()
+              or _should_add_regularizer(variable, existing_variables)):
             self._handle_weight_regularization(name, variable, regularizer)
 
         if init_graph is not None:
@@ -554,7 +561,7 @@ class Layer(base_layer.Layer):
 
   def __setattr__(self, value, name):
     # By-pass the automatic dependency tracking performed by the parent Layer.
-    super(checkpointable.Checkpointable, self).__setattr__(value, name)
+    super(trackable.Trackable, self).__setattr__(value, name)
 
 
 def _add_elements_to_collection(elements, collection_list):
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index d0ec4f4425f2ea92ba5699cf4ae2d81a86ea27dd..a55751f89f2dff4894741309b7ea9029f602cfed 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -109,17 +110,19 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.variables, [variable, variable_2])
     self.assertEqual(layer.trainable_variables, [variable])
     self.assertEqual(layer.non_trainable_variables, [variable_2])
+
     if not context.executing_eagerly():
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 1)
 
-      # regularizers only supported in GRAPH mode.
-      regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
-      _ = layer.add_variable(
-          'reg_var', [2, 2],
-          initializer=init_ops.zeros_initializer(),
-          regularizer=regularizer)
-      self.assertEqual(len(layer.losses), 1)
+    regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
+    _ = layer.add_variable(
+        'reg_var', [2, 2],
+        initializer=init_ops.zeros_initializer(),
+        regularizer=regularizer)
+    self.assertEqual(len(layer.losses), 1)
+
+    added_variable = [False]
 
     # Test that sync `ON_READ` variables are defaulted to be non-trainable.
     variable_3 = layer.add_variable(
@@ -129,6 +132,18 @@ class BaseLayerTest(test.TestCase):
         aggregation=variable_scope.VariableAggregation.SUM)
     self.assertEqual(layer.non_trainable_variables, [variable_2, variable_3])
 
+    @def_function.function
+    def function_adds_weight():
+      if not added_variable[0]:
+        layer.add_variable(
+            'reg_var_from_function', [2, 2],
+            initializer=init_ops.zeros_initializer(),
+            regularizer=regularizer)
+        added_variable[0] = True
+
+    function_adds_weight()
+    self.assertEqual(len(layer.losses), 2)
+
   def testInvalidTrainableSynchronizationCombination(self):
     layer = base_layers.Layer(name='my_layer')
 
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 03344c844d35aa74c09ccc9cc308fa921b4d1789..f88934122fc8f78f5763e08691db05054410bbe9 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -117,7 +117,7 @@ class Conv1D(keras_layers.Conv1D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.Conv1D instead.')
+    instructions='Use `tf.keras.layers.Conv1D` instead.')
 @tf_export(v1=['layers.conv1d'])
 def conv1d(inputs,
            filters,
@@ -316,7 +316,7 @@ class Conv2D(keras_layers.Conv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.Conv2D instead.')
+    instructions='Use `tf.keras.layers.Conv2D` instead.')
 @tf_export(v1=['layers.conv2d'])
 def conv2d(inputs,
            filters,
@@ -523,7 +523,7 @@ class Conv3D(keras_layers.Conv3D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.Conv3D instead.')
+    instructions='Use `tf.keras.layers.Conv3D` instead.')
 @tf_export(v1=['layers.conv3d'])
 def conv3d(inputs,
            filters,
@@ -853,7 +853,7 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.SeparableConv1D instead.')
+    instructions='Use `tf.keras.layers.SeparableConv1D` instead.')
 @tf_export(v1=['layers.separable_conv1d'])
 def separable_conv1d(inputs,
                      filters,
@@ -973,7 +973,7 @@ def separable_conv1d(inputs,
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.SeparableConv2D instead.')
+    instructions='Use `tf.keras.layers.SeparableConv2D` instead.')
 @tf_export(v1=['layers.separable_conv2d'])
 def separable_conv2d(inputs,
                      filters,
@@ -1183,7 +1183,7 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.Conv2DTranspose instead.')
+    instructions='Use `tf.keras.layers.Conv2DTranspose` instead.')
 @tf_export(v1=['layers.conv2d_transpose'])
 def conv2d_transpose(inputs,
                      filters,
@@ -1363,7 +1363,7 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use tf.keras.layers.Conv3DTranspose instead.')
+    instructions='Use `tf.keras.layers.Conv3DTranspose` instead.')
 @tf_export(v1=['layers.conv3d_transpose'])
 def conv3d_transpose(inputs,
                      filters,
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index b2d54a98272be53b69872e900901d9552177a172..7e12dcacd86a2f792743316f65a97806c7028fc0 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -64,7 +64,7 @@ class Dense(keras_layers.Dense, base.Layer):
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such cases.
-    reuse: Boolean, whether to reuse the weights of a previous layer
+    _reuse: Boolean, whether to reuse the weights of a previous layer
       by the same name.
 
   Properties:
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 7eefb294cd6f1f8c7194d68f5a76bfba220e0493..02eb57b1bb24a0ca53557b37b7dddd415d2a9f70 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -156,8 +156,7 @@ class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
 
 
 @deprecation.deprecated(
-    date=None,
-    instructions='Use keras.layers.batch_normalization instead.')
+    date=None, instructions='Use keras.layers.BatchNormalization instead.')
 @tf_export(v1=['layers.batch_normalization'])
 def batch_normalization(inputs,
                         axis=-1,
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 97bebe86177ee264ef00bc9b969b293389aa2122..71b1577471aed993a5064f635b8027882d554b82 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -185,19 +185,22 @@ tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
   if (actual_device == nullptr) {
     if (!IsCPUDevice(expected_device)) {
       return errors::Internal(
-          "expected the py_func to return a Tensor backed by memory in ",
+          "Expected the py_func to return a Tensor backed by memory in ",
           expected_device_name,
           ", but is actually backed by local host memory. This is a bug.");
     }
     return Status::OK();
   }
-  const string& actual_device_name = actual_device->attributes().name();
-  if (actual_device_name != expected_device_name) {
-    return errors::Internal(
-        "expected the py_func to return a Tensor backed by memory in ",
-        expected_device_name, ", but is actually in ", actual_device_name,
-        ". This is a bug.");
-  }
+  // NOTE(ebrevdo): Here we could try comparing "actual_device_name"
+  // (actual_device->attributes()->name()) to expected_device_name and ensure
+  // they're the same.  However, this comparison fails if we create a ClusterDef
+  // on localhost, mainly because the Device created by Eager code doesn't match
+  // the device created by a session.  In this case, expected_device_name may
+  // contain "worker" but the Eager device name contains "localhost".  Since we
+  // can't easily access the true underlying device of "worker" here, we are not
+  // able to perform a proper comparison.  Furthermore, we can't check
+  // IsCPUDevice(actual_device) because the kernel's device may indeed be a
+  // GPU device (the python interpreter doesn't use it, however).
   return Status::OK();
 }
 
@@ -485,6 +488,8 @@ class PyFuncOp : public OpKernel {
     eager_ = type_string() == "EagerPyFunc";
   }
 
+  bool IsExpensive() override { return true; }
+
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
@@ -494,8 +499,8 @@ class PyFuncOp : public OpKernel {
       // `DeviceBase`; attempt to downcast.
       call.device = dynamic_cast<Device*>(ctx->device());
       if (call.device == nullptr) {
-        ctx->CtxFailureWithWarning(
-            errors::Internal("Unrecognized device class"));
+        ctx->CtxFailureWithWarning(errors::Internal(
+            "Unrecognized device class: ", ctx->device()->name()));
         return;
       }
     }
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index f681cff6cff35bfd8ed0e3a880d26936a54fabee..77fbfd51bbbace5b043b719de45c474476f69fd4 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -64,6 +64,19 @@ bool IsPyFloat(PyObject* obj) {
          PyIsInstance(obj, &PyFloatingArrType_Type);  // NumPy float types
 }
 
+// If the input is a zero dimensional PyArray return it converted to a scalar.
+// Otherwise return the input and increment its reference count.
+// Users must Py_DECREF the output of this method.
+PyObject* ZeroDimArrayToScalar(PyObject* obj) {
+  if (PyArray_IsZeroDim(obj) && !PyArray_IsScalar(obj, Generic)) {
+    auto pyarray_obj = reinterpret_cast<PyArrayObject*>(obj);
+    obj = PyArray_ToScalar(PyArray_DATA(pyarray_obj), pyarray_obj);
+  } else {
+    Py_INCREF(obj);
+  }
+  return obj;
+}
+
 // Converts Python object `c` that should hold a Python string into a
 // C++ string in *out.  Returns nullptr on success, or a message on error.
 // Defined below, but forward declared here for use in PyRepr.
@@ -130,6 +143,10 @@ Status SampleElementFromSequence(PyObject* seq, PyObject** elem) {
 Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
   std::vector<Safe_PyObjectPtr> refs_to_clean;
   while (true) {
+    // Convert any zero dimensional numpy arrays to scalars first of all.
+    // We also have to make sure a reference to the safe_obj is kept.
+    obj = ZeroDimArrayToScalar(obj);
+    refs_to_clean.push_back(make_safe(obj));
     // We test strings first, in case a string is considered a sequence.
     if (IsPyString(obj)) {
       *dtype = DT_STRING;
@@ -240,7 +257,9 @@ const char ErrorFoundFloat[] =
       }                                                                   \
       PyObject** l = PySequence_Fast_ITEMS(seq.get());                    \
       for (int64 i = 0; i < s; ++i) {                                     \
-        const char* error = CONVERT(l[i], *buf);                          \
+        auto scalar = ZeroDimArrayToScalar(l[i]);                         \
+        const char* error = CONVERT(scalar, *buf);                        \
+        Py_DECREF(scalar);                                                \
         if (TF_PREDICT_FALSE(error != nullptr)) return error;             \
         ++*buf;                                                           \
       }                                                                   \
@@ -253,7 +272,9 @@ const char ErrorFoundFloat[] =
     Tensor result(TYPE_ENUM, shape);                                      \
     if (shape.dims() == 0) { /* Scalar case */                            \
       TYPE value;                                                         \
-      const char* error = CONVERT(obj, &value);                           \
+      auto scalar = ZeroDimArrayToScalar(obj);                            \
+      const char* error = CONVERT(scalar, &value);                        \
+      Py_DECREF(scalar);                                                  \
       if (error != nullptr) return error;                                 \
       result.scalar<TYPE>()() = value;                                    \
     } else {                                                              \
@@ -331,8 +352,25 @@ DEFINE_HELPER(ConvertInt32, int32, DT_INT32, ConvertOneInt32);
 
 template <class T>
 const char* ConvertOneFloat(PyObject* v, T* out) {
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
   if (TF_PREDICT_TRUE(PyFloat_Check(v))) {
-    *out = PyFloat_AS_DOUBLE(v);
+    double as_double = PyFloat_AsDouble(v);
+    // Handle infinity.
+    if (as_double == std::numeric_limits<double>::infinity()) {
+      *out = std::numeric_limits<T>::infinity();
+      return nullptr;
+    } else if (as_double == -1 * std::numeric_limits<double>::infinity()) {
+      *out = -1 * std::numeric_limits<T>::infinity();
+      return nullptr;
+    }
+    // Check for overflow.
+    if (as_double > std::numeric_limits<T>::max() ||
+        as_double < std::numeric_limits<T>::lowest()) {
+      return ErrorOutOfRangeDouble;
+    }
+    *out = static_cast<T>(as_double);
     return nullptr;
   }
 #if PY_MAJOR_VERSION < 3
@@ -348,6 +386,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
   }
   if (PyIsInstance(v, &PyFloatingArrType_Type)) {  // NumPy float types
     Safe_PyObjectPtr as_float = make_safe(PyNumber_Float(v));
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_float.get(), out);
   }
   if (PyIsInstance(v, &PyIntegerArrType_Type)) {  // NumPy integers
@@ -356,6 +397,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
 #else
     Safe_PyObjectPtr as_int = make_safe(PyNumber_Long(v));
 #endif
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_int.get(), out);
   }
   return ErrorMixedTypes;
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 2720962084b19a57ceefee64e604ce2376a53f78..e80178b2d90b3c6400c60f81ed2072068327596a 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -646,7 +646,10 @@ def list_directory_v2(path):
     errors.NotFoundError if directory doesn't exist
   """
   if not is_directory(path):
-    raise errors.NotFoundError(None, None, "Could not find directory")
+    raise errors.NotFoundError(
+        node_def=None,
+        op=None,
+        message="Could not find directory {}".format(path))
   with errors.raise_exception_on_not_ok_status() as status:
     # Convert each element to string, since the return values of the
     # vector of string should be interpreted as strings, not bytes.
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index faf20df8683ccd0a5498b0c620bbbd704e470995..03f24d0f8f4278db9595ef70827bea1894c834ce 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -68,6 +68,11 @@ void PyRecordWriter::Flush(TF_Status* out_status) {
     return;
   }
   Status s = writer_->Flush();
+  if (s.ok()) {
+    // Per the RecordWriter contract, flushing the RecordWriter does not
+    // flush the underlying file.  Here we need to do both.
+    s = file_->Flush();
+  }
   if (!s.ok()) {
     Set_TF_Status_from_Status(out_status, s);
     return;
diff --git a/tensorflow/python/module/BUILD b/tensorflow/python/module/BUILD
index 64c226e4c993a8e8348253c3b6f781263ddb5e41..55909cc2672b4e601f3a6e5607c1efe1b10e06cc 100644
--- a/tensorflow/python/module/BUILD
+++ b/tensorflow/python/module/BUILD
@@ -13,7 +13,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/training/tracking",
         "@six_archive//:six",
     ],
 )
@@ -23,6 +23,7 @@ tf_py_test(
     srcs = ["module_test.py"],
     additional_deps = [
         ":module",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/compat:v2_compat",
         "//tensorflow/python:variables",
diff --git a/tensorflow/python/module/module.py b/tensorflow/python/module/module.py
index 16db0b0e01c387fdd7f883961b334fa61a9eb3a2..6406bc8f146fec785597aa5a9af95a201174aee0 100644
--- a/tensorflow/python/module/module.py
+++ b/tensorflow/python/module/module.py
@@ -19,110 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 import re
-import sys
-
-import six
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-class ModuleMetaclass(type):
-  """Metaclass for `tf.Module`."""
-
-  def __new__(mcs, name, bases, clsdict):
-    for key, value in clsdict.items():
-      if key in ("__init__", "name_scope"):
-        continue
-
-      elif tf_inspect.isfunction(value):
-        if getattr(value, "_no_module_name_scope", False):
-          # The function has been annotated to say that no autoscoping should
-          # be applied, so do not patch it.
-          continue
-        clsdict[key] = with_name_scope(value)
-
-      elif isinstance(value, property):
-        clsdict[key] = property(
-            value.fget if not value.fget else with_name_scope(value.fget),
-            value.fset if not value.fset else with_name_scope(value.fset),
-            value.fdel if not value.fdel else with_name_scope(value.fdel),
-            doc=value.__doc__)
-
-    return type.__new__(mcs, name, bases, clsdict)
-
-  def __call__(cls, *args, **kwargs):
-    # Call new such that we have an un-initialized module instance that we can
-    # still reference even if there is an exception during __init__. This is
-    # needed such that we can make sure the name_scope constructed in __init__
-    # is closed even if there is an exception.
-    module = cls.__new__(cls, *args, **kwargs)
-
-    # Now attempt to initialize the object.
-    try:
-      module.__init__(*args, **kwargs)
-    except:
-      # We must explicitly catch so that in Python 2 sys.exc_info() is populated
-      # before entering the finally block.
-      raise
-
-    finally:
-      # The base Module constructor enters the modules name scope before
-      # returning such that other functionality in the ctor happens within the
-      # modules name scope.
-      scope = getattr(module, "_ctor_name_scope", None)
-      exc_info = sys.exc_info()
-      if scope is None:
-        if exc_info[0] is None:
-          raise ValueError(
-              "Constructing a tf.Module without calling the super constructor "
-              "is not supported. Add the following as the first line in your "
-              "__init__ method:\n\n"
-              "super(%s, self).__init__()" % cls.__name__)
-      else:
-        scope.__exit__(*exc_info)
-        del module._ctor_name_scope
-
-    return module
-
-
-def with_name_scope(unbound_method):
-  """Patches the given method so it enters the modules name scope."""
-  def enter_name_scope(self, *args, **kwargs):
-    """Decorator that calls the given function in the module name scope.
-
-    Args:
-      self: Module instance.
-      *args: Positional arguments to `unbound_method`.
-      **kwargs: Keyword arguments to `unbound_method`.
-
-    Returns:
-      `with self.name_scope: return unbound_method(self, *args, **kwargs)`
-    """
-    try:
-      module_name_scope = self.name_scope
-    except AttributeError as exc_value_from:
-      exc_value = AttributeError(
-          "The super constructor must be called before any other methods in "
-          "your constructor. If this is not possible then annotate all the "
-          "methods called with `@no_module_name_scope`.")
-      six.raise_from(exc_value, exc_value_from)
-
-    with module_name_scope:
-      # tf.Module enters the module name scope for all methods. To disable this
-      # for a particular method annotate it with `@no_module_name_scope`.
-      return unbound_method(self, *args, **kwargs)
-
-  return tf_decorator.make_decorator(unbound_method, enter_name_scope)
-
-
-@tf_export("experimental.Module")
-class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
+@tf_export("Module")
+class Module(tracking.AutoTrackable):
   """Base neural network module class.
 
   A module is a named container for `tf.Variable`s, other `tf.Module`s and
@@ -130,37 +37,54 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
   network might be implemented as a `tf.Module`:
 
   >>> class Dense(tf.Module):
-  ...   def __init__(self, in_features, output_features):
-  ...     super(Linear, self).__init__()
+  ...   def __init__(self, in_features, output_features, name=None):
+  ...     super(Dense, self).__init__(name=name)
   ...     self.w = tf.Variable(
   ...         tf.random_normal([input_features, output_features]), name='w')
   ...     self.b = tf.Variable(tf.zeros([output_features]), name='b')
   ...
   ...   def __call__(self, x):
-  ...     x = tf.convert_to_tensor(x, name='x')
   ...     y = tf.matmul(x, self.w) + self.b
   ...     return tf.nn.relu(y)
 
-  You can use the dense layer as you would expect:
+  You can use the Dense layer as you would expect:
 
   >>> d = Dense(input_features=64, output_features=10)
   >>> d(tf.ones([100, 64]))
   <tf.Tensor: ...>
 
-  By subclassing `tf.Module` instead of `object` any variables created inside
-  the module are automatically created within the modules name scope:
-
-  >> d.w.name
-  "dense/w:0"
-
-  In eager mode this is useful for debugging, and when used with `@tf.function`
-  the use of name scopes gives operations (e.g. matmul) useful names as well.
-
-  As well as automatic naming, the Dense module inherits methods for tracking
-  its variables:
+  By subclassing `tf.Module` instead of `object` any `tf.Variable` or
+  `tf.Module` instances assigned to object properties can be collected using
+  the `variables`, `trainable_variables` or `submodules` property:
 
   >>> d.variables
-  (<tf.Variable 'dense/b:0' ...>, <tf.Variable 'dense/w:0' ...>)
+  (<tf.Variable 'b:0' ...>, <tf.Variable 'w:0' ...>)
+
+  Subclasses of `tf.Module` can also take advantage of the `_flatten` method
+  which can be used to implement tracking of any other types.
+
+  All `tf.Module` classes have an associated `tf.name_scope` which can be used
+  to group operations in TensorBoard and create hierarchies for variable names
+  which can help with debugging. We suggest using the name scope when creating
+  nested submodules/parameters or for forward methods whose graph you might want
+  to inspect in TensorBoard. You can enter the name scope explicitly using
+  `with self.name_scope:` or you can annotate methods (apart from `__init__`)
+  with `@tf.Module.with_name_scope`.
+
+  >>> class MLP(tf.Module):
+  ...   def __init__(self, input_size, sizes, name=None):
+  ...     super(MLP, self).__init__(name=name)
+  ...     self.layers = []
+  ...     with self.name_scope:
+  ...       for size in sizes:
+  ...         self.layers.append(Dense(input_size=input_size, output_size=size))
+  ...         input_size = size
+  ...
+  ...   @tf.Module.with_name_scope
+  ...   def __call__(self, x):
+  ...     for layer in self.layers:
+  ...       x = layer(x)
+  ...     return x
   """
 
   def __init__(self, name=None):
@@ -176,12 +100,6 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
     with ops.name_scope(name) as scope_name:
       self._scope_name = scope_name
 
-    # Enter the name scope so subsequent code in the contructor (e.g. creating
-    # submodules) happens inside the modules name scope. This is exited when
-    # the subclass __init__ returns (this is implemented in ModuleMetaclass).
-    self._ctor_name_scope = self.name_scope
-    self._ctor_name_scope.__enter__()
-
   @property
   def name(self):
     """Returns the name of this module as passed or determined in the ctor.
@@ -199,84 +117,44 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
 
   @property
   def variables(self):
-    """Returns a tuple of variables owned by this module and it's submodules.
-
-    Returns:
-      A tuple of variables for the current module (sorted by attribute name)
-      followed by variables from all submodules recursively (depth first).
-    """
-    return tuple(walk(self, recurse_if=_IS_MODULE, predicate=_IS_VARIABLE))
+    """Sequence of variables owned by this module and it's submodules.
 
-  @property
-  def owned_variables(self):
-    """Returns a tuple of variables that are attributes of the current module.
-
-    See `variables` for a property which returns all variables from the current
-    module and all it's submodules recursively.
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
 
     Returns:
-      A tuple of variables which are attributes of the current module. Will
-      yield variables inside nested structures (lists etc) but not in other
-      modules.
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
     """
-    return tuple(walk(self, predicate=_IS_VARIABLE))
+    return tuple(self._flatten(predicate=_IS_VARIABLE))
 
   @property
   def trainable_variables(self):
-    """Returns a tuple of variables owned by this module and it's submodules.
-
-    Returns:
-      A tuple of variables for the current module (sorted by attribute name)
-      followed by variables from all submodules recursively (depth first).
-    """
-    return tuple(
-        walk(self, recurse_if=_IS_MODULE, predicate=_IS_TRAINABLE_VARIABLE))
-
-  @property
-  def owned_trainable_variables(self):
-    """Returns a tuple of variables that are attributes of the current module.
+    """Sequence of variables owned by this module and it's submodules.
 
-    See `variables` for a property which returns all variables from the current
-    module and all it's submodules recursively.
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
 
     Returns:
-      A tuple of variables which are attributes of the current module. Will
-      yield variables inside nested structures (lists etc) but not in other
-      modules.
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
     """
-    return tuple(walk(self, predicate=_IS_TRAINABLE_VARIABLE))
-
-  @property
-  def owned_submodules(self):
-    """Returns an iterator of immediate child modules.
-
-    Child modules are modules which are found as properties of the current
-    module.
-
-    >>> a = tf.experimental.Module()
-    >>> b = tf.experimental.Module()
-    >>> c = tf.experimental.Module()
-    >>> a.b = b
-    >>> b.c = c
-    >>> assert list(a.owned_submodules) == [b]
-    >>> assert list(b.owned_submodules) == [c]
-    >>> assert list(c.owned_submodules) == []
-
-    Returns:
-      A generator over all child modules.
-    """
-    return walk(self, predicate=_IS_MODULE)
+    return tuple(self._flatten(predicate=_IS_TRAINABLE_VARIABLE))
 
   @property
   def submodules(self):
-    """Returns an iterator of all sub-modules recursively.
+    """Sequence of all sub-modules.
 
     Submodules are modules which are properties of this module, or found as
     properties of modules which are properties of this module (and so on).
 
-    >>> a = tf.experimental.Module()
-    >>> b = tf.experimental.Module()
-    >>> c = tf.experimental.Module()
+    >>> a = tf.Module()
+    >>> b = tf.Module()
+    >>> c = tf.Module()
     >>> a.b = b
     >>> b.c = c
     >>> assert list(a.submodules) == [b, c]
@@ -284,32 +162,104 @@ class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoCheckpointable)):
     >>> assert list(c.submodules) == []
 
     Returns:
-      A generator over all submodules.
+      A sequence of all submodules.
     """
-    return walk(self, recurse_if=_IS_MODULE, predicate=_IS_MODULE)
+    return tuple(self._flatten(predicate=_IS_MODULE))
+
+  def _flatten(self,
+               recursive=True,
+               predicate=None,
+               attribute_traversal_key=None,
+               with_path=False):
+    """Flattened attribute values in sorted order by attribute name.
+
+    Modules are flattened by first walking their attributes in name order.
+    Each attribute value is then flattened to find leaf values. If flatten is
+    to be applied `recursive`ly then if the leaf is a `Module` it will also be
+    flattened to find leaves. Finally every leaf value is optionally tested
+    against the given `predicate` and finally yielded.
+
+    >>> class Foo(tf.Module):
+    ...   def __init__(self):
+    ...     super(Foo, self).__init__()
+    ...     self.x = [tf.constant('a'), tf.constant('b')]
+    ...     self.y = {'i': tf.constant('c'), 'j': tf.constant('d')}
+    ...     self.z = tf.constant('e')
+    ...
+    ...   @property
+    ...   def tensors(self):
+    ...     return tuple(self._flatten(predicate=is_tensor, with_path=True))
+
+    >>> foo = Foo()
+    >>> foo.tensors
+    ((('x', 0),   <tf.Tensor: ...'a'>),
+     (('x', 1),   <tf.Tensor: ...'b'>),
+     (('y', 'i'), <tf.Tensor: ...'c'>),
+     (('y', 'j'), <tf.Tensor: ...'d'>),
+     (('z',),     <tf.Tensor: ...'e'>))
+
+    `attribute_traversal_key` controls the order object properties are visited.
+    If not set objects are visited in ascending order by name.
+
+    Args:
+      recursive: Whether to recurse into child modules or not.
+      predicate: (Optional) If set then only values matching predicate are
+        yielded. A value of `None` (the default) means no items will be
+        filtered.
+      attribute_traversal_key: (Optional) Method to rekey object attributes
+        before they are sorted. Contract is the same as `key` argument to
+        builtin `sorted` and only applies to object properties.
+      with_path: (Optional) Whether to include the path to the object as well
+        as the object itself. If `with_path` is `True` then leaves will not be
+        de-duplicated (e.g. if the same leaf instance is reachable via multiple
+        modules then it will be yielded multiple times with different paths).
+
+    Returns:
+      Flat generator for leaves of the current module and optionally all
+      submodules.
+    """
+    if predicate is None:
+      predicate = lambda _: True
+
+    return _flatten_module(
+        self,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path)
 
   @classmethod
-  def no_name_scope(cls, method):
-    """Decorator to wrap a method, preventing automatic name scope wrapping.
+  def with_name_scope(cls, method):
+    """Decorator to automatically enter the module name scope.
 
-    By default, any method on a module is considered as a forwards function, and
-    so any variables / modules created by the method will be scoped as belonging
-    to the module. In some cases this is undesirable, for example when
-    implementing .clone() / .transpose(), as in those cases we want the new
-    module to have the scope of wherever the .transpose() call is made. To
-    allow this, decorate any methods with `no_module_name_scope`.
+    >>> class MyModule(tf.Module):
+    ...   @tf.Module.with_name_scope
+    ...   def __call__(self, x):
+    ...     if not hasattr(self, 'w'):
+    ...       self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+    ...     return tf.matmul(x, self.w)
 
-    This logic is tied to ModuleMetaclass.__new__, if anything is
-    changed here corresponding changes will be needed there.
+    Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+    names included the module name:
+
+    >>> mod = MyModule()
+    >>> mod(tf.ones([8, 32]))
+    <tf.Tensor: ...>
+    >>> mod.w
+    <tf.Variable ...'my_module/w:0'>
 
     Args:
-      method: the method to wrap.
+      method: The method to wrap.
 
     Returns:
-      The method, with a flag indicating no name scope wrapping should occur.
+      The original method wrapped such that it enters the module's name scope.
     """
-    setattr(method, "_no_module_name_scope", True)
-    return method
+    def method_with_name_scope(self, *args, **kwargs):
+      with self.name_scope:
+        return method(self, *args, **kwargs)
+
+    return tf_decorator.make_decorator(method, method_with_name_scope)
+
 
 _IS_VARIABLE = lambda o: isinstance(o, variables.Variable)
 _IS_TRAINABLE_VARIABLE = lambda o: (_IS_VARIABLE(o) and o.trainable)
@@ -326,77 +276,61 @@ def camel_to_snake(value):
   return _CAMEL_TO_SNAKE_R.sub(r"_\1", value).lower()
 
 
-def walk(o, recurse_if=None, predicate=None):
-  """Flattened attributes of `o` in sorted order by attribute name.
-
-  >>> class Foo(object):
-  ...   def __init__(self, prefix=''):
-  ...     self.z = prefix + 'c'
-  ...     self.a = [prefix + 'a', prefix + 'b']
-
-  >>> tuple(walk(Foo()))
-  ('a', 'b', 'c')
-
-  If `predicate` is not None, then only values matching predicate are returned:
-
-  >>> tuple(walk(Foo(), predicate=lambda v: v != 'a'))
-  ('b', 'c')
+# AutoTrackable adds object attributes that users will not expect us to
+# include when flattening (these reference dependencies reachable via other
+# object attributes).
+AUTO_CHECKPOINTABLE_ATTRS = ("_unconditional_checkpoint_dependencies",
+                             "_unconditional_dependency_names")
 
-  If `recurse_if` is not None then it should be a callable which tests if the
-  given leaf should be expanded:
 
-  >>> is_string = lambda v: isinstance(v, str)
-  >>> is_foo = lambda l: isinstance(l, Foo)
-  >>> o = Foo(prefix='root_')
-  >>> o.b = Foo(prefix='child_')
-  >>> tuple(walk(o, predicate=is_string))
-  ('root_a', 'root_b', 'root_c')
-  >>> tuple(walk(o, recurse_if=is_foo, predicate=is_string))
-  ('root_a', 'root_b', 'root_c', 'child_a', 'child_b', 'child_c')
-
-  Args:
-    o: An object who's attributes are walked.
-    recurse_if: (Optional) Visited items of this type will be walked to extract
-      more leaves. If `None`, it will not recurse into leaves.
-    predicate: (Optional) If set then only values matching predicate are
-      yielded.
+def _flatten_module(module,
+                    recursive,
+                    predicate,
+                    attribute_traversal_key,
+                    with_path,
+                    module_path=(),
+                    seen=None):
+  """Implementation of `flatten`."""
+  if seen is None:
+    seen = set([id(module)])
 
-  Returns:
-    Attributes of `o` in name order. If `recurse_if` is not `None` then
-    attributes for which `recurse_if(attribute) == True` will be walked
-    recursively. If `predicate` is not `None` then only attributes for which
-    `predicate(attribute) == True` will be yielded.
-  """
-  if predicate is None:
-    predicate = lambda _: True
-  return _walk_internal(
-      o, recurse_if=recurse_if, predicate=predicate, seen=set())
+  module_dict = vars(module)
+  submodules = []
 
+  for key in sorted(module_dict, key=attribute_traversal_key):
+    if key in AUTO_CHECKPOINTABLE_ATTRS:
+      continue
 
-def _walk_internal(o, recurse_if, predicate, seen):
-  """Implementation of `walk`."""
-  if seen is None:
-    seen = set([id(o)])
+    for leaf_path, leaf in nest.flatten_with_tuple_paths(module_dict[key]):
+      leaf_path = (key,) + leaf_path
 
-  o_dict = vars(o)
-  to_walk = []
-
-  for key in sorted(o_dict):
-    values = nest.flatten(o_dict[key])
-    for value in values:
-      value_id = id(value)
-      if value_id in seen:
-        continue
+      # TODO(tomhennigan) Handle cycles for `with_path=True` (e.g. `a.a = a`).
+      if not with_path:
+        leaf_id = id(leaf)
+        if leaf_id in seen:
+          continue
+        seen.add(leaf_id)
 
-      seen.add(value_id)
-      if predicate(value):
-        yield value
+      if predicate(leaf):
+        if with_path:
+          yield module_path + leaf_path, leaf
+        else:
+          yield leaf
 
-      if recurse_if is not None and recurse_if(value):
+      if recursive and isinstance(leaf, Module):
         # Walk direct properties first then recurse.
-        to_walk.append(value)
-
-  for value in to_walk:
-    for subvalue in _walk_internal(value, recurse_if, predicate, seen):
+        submodules.append((module_path + leaf_path, leaf))
+
+  for submodule_path, submodule in submodules:
+    subvalues = _flatten_module(
+        submodule,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path,
+        module_path=submodule_path,
+        seen=seen)
+
+    for subvalue in subvalues:
       # Predicate is already tested for these values.
       yield subvalue
diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py
index d5d36a7bbc17bc3cd3f39f9ac11a81f143e7ffb7..17c0ad8832a1ce8fc53f45b7db237a163faabd0e 100644
--- a/tensorflow/python/module/module_test.py
+++ b/tensorflow/python/module/module_test.py
@@ -18,7 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+import collections
+
+from absl.testing import parameterized
+import six
+
 from tensorflow.python.compat import v2_compat
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.module import module
 from tensorflow.python.ops import variables
@@ -80,6 +87,20 @@ class TestModuleNaming(test.TestCase):
     # `foo` is not a method so we do not re-enter the name scope.
     self.assertEqual(mod.foo(), "")
 
+  def test_property(self):
+    mod = PropertyModule()
+    mod.some_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.some_property
+    self.assertEqual(getter_scope_name, "property_module/")
+    self.assertEqual(setter_scope_name, "property_module/")
+
+  def test_property_no_name_scope(self):
+    mod = PropertyModule()
+    mod.no_name_scope_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.no_name_scope_property
+    self.assertEqual(getter_scope_name, "")
+    self.assertEqual(setter_scope_name, "")
+
   def test_invalid_name(self):
     msg = ".* is not a valid module name"
     with self.assertRaisesRegexp(ValueError, msg):
@@ -129,6 +150,34 @@ class TestModuleNaming(test.TestCase):
 
     self.assertEqual("", get_name_scope())
 
+  def test_get_attr_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttrModule(module.Module):
+
+      def __getattr__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttrModule, self).__getattr__(name)
+
+    mod = GetAttrModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
+  def test_get_attribute_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttributeModule(module.Module):
+
+      def __getattribute__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttributeModule, self).__getattribute__(name)
+
+    mod = GetAttributeModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
 
 class VariableNamingTest(test.TestCase):
 
@@ -143,82 +192,76 @@ class VariableTrackingTest(test.TestCase):
 
   def test_variables(self):
     m = RecursiveModule(3)
-    self.assertCountEqual(m.variables, (m.w, m.child.w, m.child.child.w))
-    self.assertCountEqual(m.child.variables, (m.child.w, m.child.child.w))
-    self.assertCountEqual(m.child.child.variables, (m.child.child.w,))
-
-  def test_owned_variables(self):
-    m = RecursiveModule(3)
-    self.assertCountEqual(m.owned_variables, (m.w,))
-    self.assertCountEqual(m.child.owned_variables, (m.child.w,))
-    self.assertCountEqual(m.child.child.owned_variables, (m.child.child.w,))
+    self.assertEqual(m.variables, (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.variables, (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.variables, (m.child.child.w,))
 
   def test_trainable_variables(self):
     m = RecursiveModule(3)
-    self.assertCountEqual(m.trainable_variables,
-                          (m.w, m.child.w, m.child.child.w))
-    self.assertCountEqual(m.child.trainable_variables,
-                          (m.child.w, m.child.child.w))
-    self.assertCountEqual(m.child.child.trainable_variables, (m.child.child.w,))
+    self.assertEqual(m.trainable_variables,
+                     (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.trainable_variables,
+                     (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.trainable_variables, (m.child.child.w,))
 
   def test_trainable_variables_ignores_non_trainable(self):
     m = RecursiveModule(3, trainable=False)
-    self.assertEmpty(m.trainable_variables)
-    self.assertEmpty(m.child.trainable_variables)
-    self.assertEmpty(m.child.child.trainable_variables)
-
-  def test_owned_trainable_variables(self):
-    m = RecursiveModule(3)
-    self.assertCountEqual(m.owned_trainable_variables, (m.w,))
-    self.assertCountEqual(m.child.owned_trainable_variables, (m.child.w,))
-    self.assertCountEqual(m.child.child.owned_trainable_variables,
-                          (m.child.child.w,))
-
-  def test_owned_trainable_variables_ignores_non_trainable(self):
-    m = RecursiveModule(3, trainable=False)
-    self.assertEmpty(m.owned_trainable_variables)
-    self.assertEmpty(m.child.owned_trainable_variables)
-    self.assertEmpty(m.child.child.owned_trainable_variables)
+    self.assertEqual(len(m.trainable_variables), 0)
+    self.assertEqual(len(m.child.trainable_variables), 0)
+    self.assertEqual(len(m.child.child.trainable_variables), 0)
 
 
 class ModuleTrackingTest(test.TestCase):
 
-  def test_owned_submodules(self):
-    m = RecursiveModule(3)
-    self.assertCountEqual(m.owned_submodules, [m.child])
-    self.assertCountEqual(m.child.owned_submodules, [m.child.child])
-    self.assertEmpty(list(m.child.child.owned_submodules))
-
   def test_submodules(self):
     m = RecursiveModule(3)
-    self.assertCountEqual(m.submodules, [m.child, m.child.child])
-    self.assertCountEqual(m.child.submodules, [m.child.child])
-    self.assertEmpty(list(m.child.child.submodules))
+    self.assertEqual(list(m.submodules), [m.child, m.child.child])
+    self.assertEqual(list(m.child.submodules), [m.child.child])
+    self.assertEqual(list(m.child.child.submodules), [])
 
   def test_non_ctor_submodule(self):
     m = TreeModule()
     leaf1 = m.new_leaf()
-    self.assertCountEqual(m.submodules, (leaf1,))
+    self.assertEqual(set(m.submodules), {leaf1})
     leaf2 = m.new_leaf()
-    self.assertCountEqual(m.submodules, (leaf1, leaf2))
+    self.assertEqual(set(m.submodules), {leaf1, leaf2})
 
 
-class CommonErrorsTest(test.TestCase):
+class ForwardMethodsTest(test.TestCase):
 
-  def test_not_calling_super_constructor(self):
-    msg = ("Constructing a tf.Module without calling the super constructor is "
-           "not supported")
-    with self.assertRaisesRegexp(ValueError, msg):
-      DoesNotCallSuperConstructorModule()
+  def testFunctionType(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertTrue(isinstance(mod.forward, def_function.Function))
+    self.assertTrue(isinstance(mod.forward_ag, def_function.Function))
 
-  def test_calls_method_before_super(self):
-    msg = "super constructor must be called before any other methods"
-    with self.assertRaisesRegexp(AttributeError, msg):
-      CallsMethodBeforeSuperConstructorModule(allowed_method=False)
+  def testEntersNameScope_call(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag().numpy(),
+                     b"module_with_function_annotated_call/")
 
-  def test_annotated_method_is_allowed(self):
-    self.assertIsNotNone(
-        CallsMethodBeforeSuperConstructorModule(allowed_method=True))
+  def testEntersNameScope_concreteFunction(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+
+
+class AbcTest(test.TestCase):
+
+  def testAbstract(self):
+    msg = "Can't instantiate .* abstract methods"
+    with self.assertRaisesRegexp(TypeError, msg):
+      AbstractModule()  # pylint: disable=abstract-class-instantiated
+
+  def testConcrete(self):
+    mod = ConcreteModule()
+    x, scope_name = mod(2.)
+    self.assertEqual(x, 4.)
+    self.assertEqual(scope_name, "concrete_module/")
+    self.assertEqual(get_name_scope(), "")
 
 
 def get_name_scope():
@@ -246,10 +289,26 @@ class RecursiveModule(module.Module):
 
   def __init__(self, depth, trainable=True):
     super(RecursiveModule, self).__init__(name="badger")
-    self.child = None
-    if depth > 1:
-      self.child = RecursiveModule(depth - 1, trainable=trainable)
-    self.w = variables.Variable(1.0, trainable=trainable, name="mushroom")
+    with self.name_scope:
+      self.child = None
+      if depth > 1:
+        self.child = RecursiveModule(depth - 1, trainable=trainable)
+      self.w = variables.Variable(1.0, trainable=trainable, name="mushroom")
+
+
+@six.add_metaclass(abc.ABCMeta)
+class AbstractModule(module.Module):
+
+  @abc.abstractmethod
+  def __call__(self, x):
+    pass
+
+
+class ConcreteModule(AbstractModule):
+
+  @module.Module.with_name_scope
+  def __call__(self, x):
+    return x ** 2, get_name_scope()
 
 
 class TreeModule(module.Module):
@@ -258,6 +317,7 @@ class TreeModule(module.Module):
     super(TreeModule, self).__init__(name=name)
     self._leaves = []
 
+  @module.Module.with_name_scope
   def new_leaf(self, name=None):
     leaf = TreeModule(name=name)
     self._leaves.append(leaf)
@@ -266,15 +326,18 @@ class TreeModule(module.Module):
 
 class ReturnsNameScopeModule(module.Module):
 
+  @module.Module.with_name_scope
   def alternative_forward(self):
     return get_name_scope()
 
+  @module.Module.with_name_scope
   def __call__(self):
     return get_name_scope()
 
 
 class SubclassedReturnsNameScopeModule(ReturnsNameScopeModule):
 
+  @module.Module.with_name_scope
   def alternative_alternative_forward(self):
     return get_name_scope()
 
@@ -293,44 +356,111 @@ class ModuleOverridingNameScope(ReturnsNameScopeModule):
     return ops.name_scope("yolo/")
 
 
-class DoesNotCallSuperConstructorModule(module.Module):
+class ModuleWithFunctionAnnotatedCall(module.Module):
+
+  @def_function.function(autograph=False)
+  @module.Module.with_name_scope
+  def forward(self):
+    return get_name_scope()
+
+  @def_function.function(autograph=True)
+  @module.Module.with_name_scope
+  def forward_ag(self):
+    return get_name_scope()
+
+
+class PropertyModule(module.Module):
 
   def __init__(self):
-    # NOTE: Intentionally does not call super constructor.
-    pass
+    super(PropertyModule, self).__init__()
+    self._setter_scope_name = None
 
+  @property
+  @module.Module.with_name_scope
+  def some_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
 
-class CallsMethodBeforeSuperConstructorModule(module.Module):
+  @some_property.setter
+  @module.Module.with_name_scope
+  def some_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
 
-  def __init__(self, allowed_method):
-    if allowed_method:
-      self.no_name_scope()
-    else:
-      self.with_name_scope()
-    super(CallsMethodBeforeSuperConstructorModule, self).__init__()
+  @property
+  def no_name_scope_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
 
-  @module.Module.no_name_scope
-  def no_name_scope(self):
-    pass
+  @no_name_scope_property.setter
+  def no_name_scope_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
 
-  def with_name_scope(self):
-    pass
+NamedPair = collections.namedtuple("NamedPair", ("first", "second"))
+mk_index_dict = lambda v: dict(enumerate(v))
 
 
-class WalkTest(test.TestCase):
+class FlattenTest(parameterized.TestCase, test.TestCase):
 
-  def test_walk(self):
-    parent = SimpleModule()
+  @parameterized.parameters(lambda v: NamedPair(*v), list, tuple, mk_index_dict)
+  def test_flatten(self, container_type):
+    parent = SimpleModule(container_type=container_type)
     child = parent.c
 
-    self.assertCountEqual(
-        module.walk(parent, predicate=IS_MEMBER),
+    self.assertEqual(
+        list(parent._flatten(recursive=False, predicate=IS_MEMBER)),
         [parent.a[0], parent.a[1], parent.z])
 
-    self.assertCountEqual(
-        module.walk(parent, recurse_if=IS_MODULE, predicate=IS_MEMBER),
+    self.assertEqual(
+        list(parent._flatten(predicate=IS_MEMBER)),
         [parent.a[0], parent.a[1], parent.z, child.a[0], child.a[1], child.z])
 
+  def test_attribute_traversal_key(self):
+    mod = LayerModule()
+    self.assertEqual(
+        mod.variables,
+        mod._trainable_variables + mod._non_trainable_variables + [mod._bonus])
+
+  def test_with_path(self):
+    mod = module.Module()
+    mod.w = variables.Variable(1.)
+    mod.encoder = module.Module()
+    mod.encoder.w = [({"k": mod.w}, {"k": mod.w})]
+    mod.decoder = mod.encoder
+
+    state_dict = dict(
+        mod._flatten(with_path=True, predicate=module._IS_VARIABLE))
+
+    self.assertEqual(state_dict,
+                     {("w",): mod.w,
+                      ("encoder", "w", 0, 0, "k"): mod.encoder.w[0][0]["k"],
+                      ("encoder", "w", 0, 1, "k"): mod.encoder.w[0][1]["k"],
+                      ("decoder", "w", 0, 0, "k"): mod.decoder.w[0][0]["k"],
+                      ("decoder", "w", 0, 1, "k"): mod.decoder.w[0][1]["k"]},)
+
+
+class LayerModule(module.Module):
+
+  def __init__(self):
+    super(LayerModule, self).__init__()
+    self._trainable_variables = [
+        variables.Variable(1., name="a"),
+        variables.Variable(2., name="b"),
+    ]
+    self._non_trainable_variables = [
+        variables.Variable(3., name="c"),
+        variables.Variable(4., name="d"),
+    ]
+    self._bonus = variables.Variable(5., name="e")
+
+  @property
+  def variables(self):
+    def key_function(name):
+      indexes = {"_trainable_variables": 0, "_non_trainable_variables": 1}
+      return indexes.get(name, 2), name
+
+    return list(self._flatten(predicate=module._IS_VARIABLE,
+                              attribute_traversal_key=key_function))
+
 
 class MemberType(object):
   """A simple type to search for."""
@@ -339,10 +469,10 @@ class MemberType(object):
 
 class SimpleModule(module.Module):
 
-  def __init__(self, create_child=True):
+  def __init__(self, create_child=True, container_type=list):
     super(SimpleModule, self).__init__()
     self.z = MemberType()
-    self.a = [MemberType(), MemberType()]
+    self.a = container_type([MemberType(), MemberType()])
     if create_child:
       self.c = SimpleModule(create_child=False)
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 45e741ef222b1dcde21b66ab6cdc3db9576a85ce..1dedd11b70755909766d7c0c1317c04cf586f01a 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
@@ -399,7 +400,7 @@ def _GatherGrad(op, grad):
   params = op.inputs[0]
   with ops.colocate_with(params):
     params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
-    params_shape = math_ops.to_int32(params_shape)
+    params_shape = math_ops.cast(params_shape, dtypes.int32)
 
   # Build appropriately shaped IndexedSlices
   indices = op.inputs[1]
@@ -422,7 +423,7 @@ def _GatherV2Grad(op, grad):
   params = op.inputs[0]
   with ops.colocate_with(params):
     params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
-    params_shape = math_ops.to_int32(params_shape)
+    params_shape = math_ops.cast(params_shape, dtypes.int32)
 
   indices = op.inputs[1]
   indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
@@ -568,7 +569,7 @@ ops.NotDifferentiable("Size")
 @ops.RegisterGradient("Tile")
 def _TileGrad(op, grad):
   """Sum reduces grad along the tiled dimensions."""
-  input_shape = array_ops.shape(op.inputs[0])
+  input_shape = array_ops.shape(op.inputs[0], out_type=op.inputs[1].dtype)
   # We interleave multiples and input_shape to get split_shape,
   # reshape grad to split_shape, and reduce along all even
   # dimensions (the tiled dimensions) to get the result
@@ -582,10 +583,11 @@ def _TileGrad(op, grad):
   axes = math_ops.range(0, array_ops.size(split_shape), 2)
   # Sum reduces grad along the first dimension for IndexedSlices
   if isinstance(grad, ops.IndexedSlices):
+    input_shape_0 = math_ops.cast(input_shape[0], grad.indices.dtype)
     grad = math_ops.unsorted_segment_sum(
         grad.values,
-        math_ops.mod(grad.indices, input_shape[0]),
-        input_shape[0])
+        math_ops.mod(grad.indices, input_shape_0),
+        input_shape_0)
     split_shape = array_ops.concat([[1], split_shape[1:]], axis=0)
   input_grad = math_ops.reduce_sum(array_ops.reshape(grad, split_shape), axes)
   # Fix shape inference
@@ -795,6 +797,68 @@ def _ExtractImagePatchesGrad(op, grad):
   return [grad_out]
 
 
+@ops.RegisterGradient("ExtractVolumePatches")
+def _ExtractVolumePatchesGrad(op, grad):
+  batch_size, planes_in, rows_in, cols_in, channels = [
+      dim.value for dim in op.inputs[0].shape.dims
+  ]
+  input_bphwc = array_ops.shape(op.inputs[0])
+  batch_size = input_bphwc[0]
+  channels = input_bphwc[4]
+
+  # Create indices matrix for input tensor.
+  # Note that 0 is preserved for padding location,
+  # so indices for input start from 1 to 1 + rows_in * cols_in.
+  input_indices_num = 1 + planes_in * rows_in * cols_in
+  input_idx = array_ops.reshape(
+      math_ops.range(1, input_indices_num, dtype=ops.dtypes.int64),
+      (1, planes_in, rows_in, cols_in, 1))
+  input_idx_patched = gen_array_ops.extract_volume_patches(
+      input_idx, op.get_attr("ksizes"), op.get_attr("strides"),
+      op.get_attr("padding"))
+
+  # Create indices matrix for output tensor.
+  _, planes_out, rows_out, cols_out, _ = [
+      dim.value for dim in op.outputs[0].shape.dims
+  ]
+  _, ksize_p, ksize_r, ksize_c, _ = op.get_attr("ksizes")
+  # Indices for output start from 0.
+  prc_indices_num = planes_out * rows_out * cols_out
+  output_indices_num = prc_indices_num * ksize_p * ksize_r * ksize_c
+  output_idx = array_ops.reshape(
+      math_ops.range(output_indices_num, dtype=ops.dtypes.int64),
+      (1, planes_out, rows_out, cols_out, ksize_p * ksize_r * ksize_c))
+
+  # Construct mapping table for indices: (input -> output).
+  idx_matrix = array_ops.concat([
+      array_ops.expand_dims(input_idx_patched, axis=-1),
+      array_ops.expand_dims(output_idx, axis=-1)
+  ],
+                                axis=-1)
+  idx_map = array_ops.reshape(idx_matrix, (-1, 2))
+
+  sp_shape = (input_indices_num, output_indices_num)
+  sp_mat_full = sparse_tensor.SparseTensor(
+      idx_map, array_ops.ones([output_indices_num], dtype=grad.dtype), sp_shape)
+  # Remove all padding locations [0, :].
+  sp_mat = sparse_ops.sparse_slice(sp_mat_full, (1, 0),
+                                   (input_indices_num - 1, output_indices_num))
+
+  grad_expanded = array_ops.transpose(
+      array_ops.reshape(grad, (batch_size, planes_out, rows_out, cols_out,
+                               ksize_p, ksize_r, ksize_c, channels)),
+      (1, 2, 3, 4, 5, 6, 0, 7))
+  grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels))
+
+  jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
+
+  grad_out = array_ops.reshape(
+      jac, (planes_in, rows_in, cols_in, batch_size, channels))
+  grad_out = array_ops.transpose(grad_out, (3, 0, 1, 2, 4))
+
+  return [grad_out]
+
+
 @ops.RegisterGradient("ScatterNd")
 def _ScatterNdGrad(op, grad):
   indices = op.inputs[0]
@@ -839,16 +903,11 @@ def _ScatterNdNonAliasingAddGrad(op, grad):
 def _BroadcastToGrad(op, grad):
   input_value = op.inputs[0]
   broadcast_shape = op.inputs[1]
-  # Assign ids for each position in input_value.
   input_value_shape = array_ops.shape(input_value)
-  input_value_size = array_ops.size(input_value)
-  ids = array_ops.reshape(math_ops.range(input_value_size), input_value_shape)
-  broadcast_ids = array_ops.broadcast_to(ids, broadcast_shape)
-  # Group by ids and sum its gradients.
-  grad_flatten = array_ops.reshape(grad, [-1])
-  broadcast_ids_flatten = array_ops.reshape(broadcast_ids, [-1])
-  updates_grad_flatten = math_ops.unsorted_segment_sum(grad_flatten,
-                                                       broadcast_ids_flatten,
-                                                       input_value_size)
-  updates_grad = array_ops.reshape(updates_grad_flatten, input_value_shape)
+  _, reduction_axes = gen_array_ops.broadcast_gradient_args(broadcast_shape,
+                                                            input_value_shape)
+  updates_grad_reshaped = math_ops.reduce_sum(grad,
+                                              axis=reduction_axes,
+                                              keepdims=True)
+  updates_grad = array_ops.reshape(updates_grad_reshaped, input_value_shape)
   return [updates_grad, None]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 014fdd25cc26a5ce789bfeb0c587c987b750c366..59980b80104d17fd5bb108110d923e27d3a5931d 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -67,16 +67,20 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not hasattr(input, "graph"):
     input = ops.convert_to_tensor(input)
-    in_device = input.device
+    in_device = input.backing_device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     context_device = context.context().device_name
     if not context_device:
       context_device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    if context_device != in_device:
-      return input._copy()  # pylint: disable=protected-access
-    return input
+    if context_device == in_device:
+      return input
+    else:
+      copied = input._copy()  # pylint: disable=protected-access
+      if hasattr(copied, "_handle_data"):
+        copied._handle_data = input._handle_data  # pylint: disable=protected-access
+      return copied
   else:
     ret = gen_array_ops.identity(input, name=name)
     # Propagate handle data for happier shape inference for resource variables.
@@ -553,6 +557,10 @@ def _slice_helper(tensor, slice_spec, var=None):
   print(foo[tf.newaxis, :, :].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
   print(foo[tf.newaxis, ...].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
   print(foo[tf.newaxis].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
+
+  # masks
+  foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
+  print(foo[foo > 2].eval())  # => [3, 4, 5, 6, 7, 8, 9]
   ```
 
   Notes:
@@ -575,6 +583,10 @@ def _slice_helper(tensor, slice_spec, var=None):
     TypeError: If the slice indices aren't int, slice, ellipsis,
       tf.newaxis or scalar int32/int64 tensors.
   """
+  if isinstance(slice_spec, bool) or \
+  (isinstance(slice_spec, ops.Tensor) and slice_spec.dtype == dtypes.bool) or \
+  (isinstance(slice_spec, np.ndarray) and slice_spec.dtype == bool):
+    return boolean_mask(tensor=tensor, mask=slice_spec)
 
   if not isinstance(slice_spec, (list, tuple)):
     slice_spec = [slice_spec]
@@ -1460,14 +1472,14 @@ unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__
 def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   """Splits a tensor into sub tensors.
 
-  If `num_or_size_splits` is an integer type, then `value` is split
-  along dimension `axis` into `num_split` smaller tensors.
-  Requires that `num_split` evenly divides `value.shape[axis]`.
+  If `num_or_size_splits` is an integer, then `value` is split along dimension
+  `axis` into `num_split` smaller tensors. This requires that `num_split` evenly
+  divides `value.shape[axis]`.
 
-  If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
-  `size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
-  of the `i`-th piece has the same size as the `value` except along dimension
-  `axis` where the size is `size_splits[i]`.
+  If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits`
+  and `value` is split into `len(size_splits)` elements. The shape of the `i`-th
+  element has the same size as the `value` except along dimension `axis` where
+  the size is `size_splits[i]`.
 
   For example:
 
@@ -1485,13 +1497,13 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
 
   Args:
     value: The `Tensor` to split.
-    num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
-      splits along split_dim or a 1-D integer `Tensor` containing
+    num_or_size_splits: Either an integer indicating the number of
+      splits along split_dim or a 1-D integer `Tensor` or Python list containing
       the sizes of each output tensor along split_dim. If a scalar then it must
       evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
       split dimension must match that of the `value`.
-    axis: A 0-D `int32` `Tensor`. The dimension along which to split.
-      Must be in the range `[-rank(value), rank(value))`. Defaults to 0.
+    axis: An integer or scalar `int32` `Tensor`. The dimension along which to
+    split. Must be in the range `[-rank(value), rank(value))`. Defaults to 0.
     num: Optional, used to specify the number of outputs when it cannot be
       inferred from the shape of `size_splits`.
     name: A name for the operation (optional).
@@ -1506,10 +1518,16 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
     ValueError: If `num` is unspecified and cannot be inferred.
   """
   size_splits = ops.convert_to_tensor(num_or_size_splits)
-  if size_splits._rank() == 0 and size_splits.dtype.is_integer:
+  if isinstance(num_or_size_splits,
+                six.integer_types + (tensor_shape.Dimension,)):
     return gen_array_ops.split(
         axis=axis, num_split=num_or_size_splits, value=value, name=name)
 
+  if size_splits._rank() == 0:
+    raise ValueError(
+        "Rank-0 tensors are not supported as the num_or_size_splits argument "
+        "to split. Argument provided: %s" % (num_or_size_splits,))
+
   if num is None:
     size_splits_shape = size_splits._shape_tuple()
     if size_splits_shape:
@@ -2662,7 +2680,10 @@ def required_space_to_batch_paddings(input_shape,
 
 @tf_export(v1=["nn.space_to_batch", "space_to_batch"])
 @deprecation.deprecated_endpoints("space_to_batch")
-def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
+def space_to_batch(  # pylint: disable=missing-docstring
+    input, paddings, block_size=None, name=None, block_shape=None):  # pylint: disable=redefined-builtin
+  block_size = deprecation.deprecated_argument_lookup(
+      "block_shape", block_shape, "block_size", block_size)
   result = space_to_batch_nd(
       input,
       paddings=paddings,
@@ -2718,7 +2739,9 @@ depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
 @tf_export(v1=["batch_to_space"])
-def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
+def batch_to_space(input, crops, block_size, name=None, block_shape=None):  # pylint: disable=redefined-builtin,missing-docstring
+  block_size = deprecation.deprecated_argument_lookup(
+      "block_shape", block_shape, "block_size", block_size)
   result = batch_to_space_nd(
       input,
       crops=crops,
@@ -3076,6 +3099,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
 
     if maxlen is None:
       maxlen = gen_math_ops._max(lengths, _all_dimensions(lengths))
+      maxlen = gen_math_ops.maximum(constant(0, maxlen.dtype), maxlen)
     else:
       maxlen = ops.convert_to_tensor(maxlen)
     if maxlen.get_shape().ndims is not None and maxlen.get_shape().ndims != 0:
@@ -3266,11 +3290,11 @@ def gather(params,
            validate_indices=None,
            name=None,
            axis=None,
-           batch_dims=0):  # pylint: disable=g-doc-args
+           batch_dims=0):
   r"""Gather slices from params axis axis according to indices.
 
-  Gather slices from params axis axis according to indices.  `indices` must be
-  an integer tensor of any dimension (usually 0-D or 1-D).
+  Gather slices from params axis `axis` according to `indices`.  `indices` must
+  be an integer tensor of any dimension (usually 0-D or 1-D).
 
   For 0-D (scalar) `indices`:
 
@@ -3320,23 +3344,24 @@ def gather(params,
       `axis + 1`.
     indices: The index `Tensor`.  Must be one of the following types: `int32`,
       `int64`. Must be in range `[0, params.shape[axis])`.
+    validate_indices: Deprecated, does nothing.
+    name: A name for the operation (optional).
     axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
       `axis` in `params` to gather `indices` from. Must be greater than or equal
       to `batch_dims`.  Defaults to the first non-batch dimension. Supports
       negative indexes.
     batch_dims: An `integer`.  The number of batch dimensions.  Must be less
-      than `ndims(inices)`.
-    name: A name for the operation (optional).
+      than `rank(indices)`.
 
   Returns:
     A `Tensor`. Has the same type as `params`.
   """
   del validate_indices
-  if axis is None:
-    axis = batch_dims
   if batch_dims != 0:
     with ops.name_scope(name, "Gather", [params, indices, axis]):
       return _batch_gather(params, indices, batch_dims, axis)
+  if axis is None:
+    axis = batch_dims
   if axis != 0:
     # Note that we do a sparse_read here to avoid snapshotting the entire
     # resource variable and doing a gather, which can be inefficient and lead to
@@ -3365,7 +3390,7 @@ gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
 @dispatch.add_dispatch_support
 @deprecation.deprecated(
     "2017-10-25", "`tf.batch_gather` is deprecated, please use `tf.gather` "
-    "with `batch_dims=-1` instead.")  # pylint: disable=missing-docstring
+    "with `batch_dims` instead.")  # pylint: disable=missing-docstring
 def batch_gather(params, indices, name=None):
   """Gather slices from params according to indices with leading batch dims."""
   with ops.name_scope(name, "BatchGather", [params, indices]):
@@ -3383,15 +3408,15 @@ def _batch_gather(params, indices, batch_dims, axis=None):
   This operation assumes that the leading `batch_dims` dimensions of `indices`
   and `params` are batch dimensions; and performs a `tf.gather` operation within
   each batch. (If `batch_dims` is not specified, then it defaults to
-  `ndims(indices) - 1`.)  In the case in which `batch_dims==0`, this operation
+  `rank(indices)-1`.)  In the case in which `batch_dims==0`, this operation
   is equivalent to `tf.gather`.
 
   Args:
     params: A Tensor. The tensor from which to gather values.
     indices: A Tensor. Must be one of the following types: int32, int64. Index
       tensor. Must be in range `[0, params.shape[batch_dims]]`.
-    batch_dims: An integer.  The number of batch dimensions.  Must be less than
-      ndims(inices).  Defaults to `ndims(indices) - 1` if not specified.
+    batch_dims: An integer or none.  The number of batch dimensions.  Must be
+      less than `rank(indices)`.  Defaults to `rank(indices) - 1` if None.
     axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
       `axis` in `params` to gather `indices` from. Must be greater than or equal
       to `batch_dims`.  Defaults to the first non-batch dimension. Supports
@@ -3417,10 +3442,10 @@ def _batch_gather(params, indices, batch_dims, axis=None):
   if batch_dims < 0:
     batch_dims += indices_ndims
   if batch_dims < 0 or batch_dims >= indices_ndims:
-    raise ValueError("batch_dims = %d must be less than ndims(indices) = %d" %
+    raise ValueError("batch_dims = %d must be less than rank(indices) = %d" %
                      (batch_dims, indices_ndims))
   if params.shape.ndims is not None and batch_dims >= params.shape.ndims:
-    raise ValueError("batch_dims = %d must be less than ndims(params) = %d" %
+    raise ValueError("batch_dims = %d must be less than rank(params) = %d" %
                      (batch_dims, params.shape.ndims))
 
   # Handle axis by transposing the axis dimension to be the first non-batch
@@ -3649,7 +3674,22 @@ def extract_image_patches_v2(
   return gen_array_ops.extract_image_patches(
       images, sizes, strides, rates, padding, name)
 
-extract_image_patches_deprecation = deprecation.deprecated_args(
+
+@tf_export(v1=["image.extract_image_patches", "extract_image_patches"])
+@deprecation.deprecated_args(
     None, "ksizes is deprecated, use sizes instead", "ksizes")
-tf_export(v1=["image.extract_image_patches", "extract_image_patches"])(
-    extract_image_patches_deprecation(gen_array_ops.extract_image_patches))
+def extract_image_patches(  # pylint: disable=missing-docstring
+    images,
+    ksizes=None,
+    strides=None,
+    rates=None,
+    padding=None,
+    name=None,
+    sizes=None):
+  ksizes = deprecation.deprecated_argument_lookup(
+      "sizes", sizes, "ksizes", ksizes)
+  return gen_array_ops.extract_image_patches(
+      images, ksizes, strides, rates, padding, name)
+
+
+extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index c182874c7f2d77b317f42a0cbfadb7435534f747..d154b6759bfbc50ad2e5ea34e4f04b945ef2d397 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -35,7 +35,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
     super(BitwiseOpTest, self).__init__(method_name)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -73,7 +72,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(truth, popcnt_result)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -99,7 +97,6 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(inverted, expected)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testShiftsWithPositiveLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64]
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 37d649acf00c6905ae7330169321e5a5f8f487be..9e0011194dd0f2b36810da3d5d16a38282175b8d 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -43,7 +43,7 @@ from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantil
 # pylint: enable=unused-import
 
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 class PruningMode(object):
@@ -61,7 +61,36 @@ class PruningMode(object):
           sorted(cls._map))))
 
 
-class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
+class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator."""
+
+  def __init__(self, resource_handle, create_op, num_streams, name):
+    self._resource_handle = resource_handle
+    self._num_streams = num_streams
+    self._create_op = create_op
+    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
+                                              self._num_streams)
+    slice_spec = ''
+    specs = []
+
+    def make_save_spec(tensor, suffix):
+      return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix)
+
+    for i in range(self._num_streams):
+      specs += [
+          make_save_spec(bucket_boundaries[i], '_bucket_boundaries_' + str(i))
+      ]
+    super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle,
+                                                      specs, name)
+
+  def restore(self, restored_tensors, unused_tensor_shapes):
+    bucket_boundaries = restored_tensors
+    with ops.control_dependencies([self._create_op]):
+      return quantile_resource_deserialize(
+          self._resource_handle, bucket_boundaries=bucket_boundaries)
+
+
+class QuantileAccumulator(tracking.TrackableResource):
   """SaveableObject implementation for QuantileAccumulator.
 
      The bucket boundaries are serialized and deserialized from checkpointing.
@@ -73,55 +102,58 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                num_quantiles,
                name=None,
                max_elements=None):
+    self._eps = epsilon
+    self._num_streams = num_streams
+    self._num_quantiles = num_quantiles
+    super(QuantileAccumulator, self).__init__()
+
     with ops.name_scope(name, 'QuantileAccumulator') as name:
-      self._eps = epsilon
-      self._num_streams = num_streams
-      self._num_quantiles = num_quantiles
-      self._resource_handle = quantile_resource_handle_op(
-          container='', shared_name=name, name=name)
-      self._create_op = create_quantile_stream_resource(self._resource_handle,
-                                                        epsilon, num_streams)
-      is_initialized_op = is_quantile_resource_initialized(
-          self._resource_handle)
-      resources.register_resource(self._resource_handle, self._create_op,
-                                  is_initialized_op)
-      self._make_saveable(name)
-
-  def _make_saveable(self, name):
-    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
-                                              self._num_streams)
-    slice_spec = ''
-    specs = []
-    for i in range(self._num_streams):
-      specs.append(
-          saver.BaseSaverBuilder.SaveSpec(
-              bucket_boundaries[i], slice_spec,
-              name + '_bucket_boundaries_' + str(i)))
-    super(QuantileAccumulator, self).__init__(self._resource_handle, specs,
-                                              name)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+      self._name = name
+      self._resource_handle = self._create_resource()
+      self._init_op = self._initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self._init_op,
+                                is_initialized_op)
+    self._saveable = QuantileAccumulatorSaveable(
+        self.resource_handle, self._init_op, self._num_streams,
+        self.resource_handle.name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
+
+  def _create_resource(self):
+    return quantile_resource_handle_op(
+        container='', shared_name=self._name, name=self._name)
 
-  def restore(self, restored_tensors, unused_tensor_shapes):
-    bucket_boundaries = restored_tensors
-    with ops.control_dependencies([self._create_op]):
-      return quantile_resource_deserialize(
-          self._resource_handle, bucket_boundaries=bucket_boundaries)
+  def _initialize(self):
+    return create_quantile_stream_resource(self.resource_handle, self._eps,
+                                           self._num_streams)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self._initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return is_quantile_resource_initialized(self.resource_handle)
+
+  @property
+  def saveable(self):
+    return self._saveable
+
+  def _gather_saveables_for_checkpoint(self):
+    return {'quantile_accumulator', self._saveable}
 
   def add_summaries(self, float_columns, example_weights):
     summaries = make_quantile_summaries(float_columns, example_weights,
                                         self._eps)
-    summary_op = quantile_add_summaries(self._resource_handle, summaries)
+    summary_op = quantile_add_summaries(self.resource_handle, summaries)
     return summary_op
 
   def flush(self):
-    return quantile_flush(self._resource_handle, self._num_quantiles)
+    return quantile_flush(self.resource_handle, self._num_quantiles)
 
   def get_bucket_boundaries(self):
-    return get_bucket_boundaries(self._resource_handle, self._num_streams)
-
-  @property
-  def resource(self):
-    return self._resource_handle
+    return get_bucket_boundaries(self.resource_handle, self._num_streams)
 
 
 class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
@@ -178,8 +210,8 @@ class TreeEnsemble(tracking.TrackableResource):
     self._is_local = is_local
     with ops.name_scope(name, 'TreeEnsemble') as name:
       self._name = name
-      self._resource_handle = self.create_resource()
-      self._init_op = self.initialize()
+      self._resource_handle = self._create_resource()
+      self._init_op = self._initialize()
       is_initialized_op = self.is_initialized()
       # Adds the variable to the savable list.
       if not is_local:
@@ -192,11 +224,11 @@ class TreeEnsemble(tracking.TrackableResource):
           is_initialized_op,
           is_shared=not is_local)
 
-  def create_resource(self):
+  def _create_resource(self):
     return gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
         container='', shared_name=self._name, name=self._name)
 
-  def initialize(self):
+  def _initialize(self):
     return gen_boosted_trees_ops.boosted_trees_create_ensemble(
         self.resource_handle,
         self._stamp_token,
@@ -205,7 +237,7 @@ class TreeEnsemble(tracking.TrackableResource):
   @property
   def initializer(self):
     if self._init_op is None:
-      self._init_op = self.initialize()
+      self._init_op = self._initialize()
     return self._init_op
 
   def is_initialized(self):
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index f1f36269cf2bd9bcd3d25638a82d776850bc6bb8..b452b4a0f341738aac1da7c7b78ba99a5a469e70 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -1526,6 +1526,25 @@ def _get_diff_for_monotonic_comparison(x):
     v1=['debugging.is_numeric_tensor', 'is_numeric_tensor'])
 @deprecation.deprecated_endpoints('is_numeric_tensor')
 def is_numeric_tensor(tensor):
+  """Returns `True` if the elements of `tensor` are numbers.
+
+  Specifically, returns `True` if the dtype of `tensor` is one of the following:
+
+  * `tf.float32`
+  * `tf.float64`
+  * `tf.int8`
+  * `tf.int16`
+  * `tf.int32`
+  * `tf.int64`
+  * `tf.uint8`
+  * `tf.qint8`
+  * `tf.qint32`
+  * `tf.quint8`
+  * `tf.complex64`
+
+  Returns `False` if `tensor` is of a non-numeric type or if `tensor` is not
+  a `tf.Tensor` object.
+  """
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
@@ -1702,7 +1721,7 @@ def assert_scalar_v2(tensor, message=None, name=None):
 @tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
 def assert_scalar(tensor, name=None, message=None):
-  """Asserts that the given `tensor` is a scalar.
+  """Asserts that the given `tensor` is a scalar (i.e. zero-dimensional).
 
   This function raises `ValueError` unless it can be certain that the given
   `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e..6b271ff42a0c4b99bddce26dc12b10f5b834c5b8 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -126,8 +126,8 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
   In this case, the L2-norm of the output tensor is `clip_norm`.
 
   As another example, if `t` is a matrix and `axes == [1]`, then each row
-  of the output will have L2-norm equal to `clip_norm`. If `axes == [0]`
-  instead, each column of the output will be clipped.
+  of the output will have L2-norm less than or equal to `clip_norm`. If
+  `axes == [0]` instead, each column of the output will be clipped.
 
   This operation is typically used to clip gradients before applying them with
   an optimizer.
@@ -304,9 +304,9 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
 
 @deprecation.deprecated(
     date=None,
-    instructions=
-    "clip_by_average_norm is deprecated in TensorFlow 2.0. Please use "
-    "clip_by_norm(t, clip_norm * tf.to_float(tf.size(t), name)) instead.")
+    instructions="clip_by_average_norm is deprecated in TensorFlow 2.0. Please "
+    "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) "
+    "instead.")
 @tf_export(v1=["clip_by_average_norm"])
 def clip_by_average_norm(t, clip_norm, name=None):
   """Clips tensor values to a maximum average L2-norm.
diff --git a/tensorflow/python/ops/clustering_ops.py b/tensorflow/python/ops/clustering_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..10423bf9ab6bae65d15159910c89af251e1aea5b
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops.py
@@ -0,0 +1,770 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Clustering Operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed as random_seed_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_clustering_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.embedding_ops import embedding_lookup
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_clustering_ops import *
+# pylint: enable=wildcard-import
+
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
+SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
+# Cosine distance between vectors U and V is defined as
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
+COSINE_DISTANCE = 'cosine'
+
+RANDOM_INIT = 'random'
+KMEANS_PLUS_PLUS_INIT = 'kmeans_plus_plus'
+KMC2_INIT = 'kmc2'
+
+# The name of the variable holding the cluster centers. Used by the Estimator.
+CLUSTERS_VAR_NAME = 'clusters'
+
+
+class KMeans(object):
+  """Creates the graph for k-means clustering."""
+
+  def __init__(self,
+               inputs,
+               num_clusters,
+               initial_clusters=RANDOM_INIT,
+               distance_metric=SQUARED_EUCLIDEAN_DISTANCE,
+               use_mini_batch=False,
+               mini_batch_steps_per_iteration=1,
+               random_seed=0,
+               kmeans_plus_plus_num_retries=2,
+               kmc2_chain_length=200):
+    """Creates an object for generating KMeans clustering graph.
+
+    This class implements the following variants of K-means algorithm:
+
+    If use_mini_batch is False, it runs standard full batch K-means. Each step
+    runs a single iteration of K-Means. This step can be run sharded across
+    multiple workers by passing a list of sharded inputs to this class. Note
+    however that a single step needs to process the full input at once.
+
+    If use_mini_batch is True, it runs a generalization of the mini-batch
+    K-means algorithm. It runs multiple iterations, where each iteration is
+    composed of mini_batch_steps_per_iteration steps. Two copies of cluster
+    centers are maintained: one that is updated at the end of each iteration,
+    and one that is updated every step. The first copy is used to compute
+    cluster allocations for each step, and for inference, while the second copy
+    is the one updated each step using the mini-batch update rule. After each
+    iteration is complete, this second copy is copied back the first copy.
+
+    Note that for use_mini_batch=True, when mini_batch_steps_per_iteration=1,
+    the algorithm reduces to the standard mini-batch algorithm. Also by setting
+    mini_batch_steps_per_iteration = num_inputs / batch_size, the algorithm
+    becomes an asynchronous version of the full-batch algorithm. Note however
+    that there is no guarantee by this implementation that each input is seen
+    exactly once per iteration. Also, different updates are applied
+    asynchronously without locking. So this asynchronous version may not behave
+    exactly like a full-batch version.
+
+    Args:
+      inputs: An input tensor or list of input tensors. It is assumed that the
+        data points have been previously randomly permuted.
+      num_clusters: An integer tensor specifying the number of clusters. This
+        argument is ignored if initial_clusters is a tensor or numpy array.
+      initial_clusters: Specifies the clusters used during initialization. One
+        of the following:
+        - a tensor or numpy array with the initial cluster centers.
+        - a function f(inputs, k) that returns up to k centers from `inputs`.
+        - "random": Choose centers randomly from `inputs`.
+        - "kmeans_plus_plus": Use kmeans++ to choose centers from `inputs`.
+        - "kmc2": Use the fast k-MC2 algorithm to choose centers from `inputs`.
+        In the last three cases, one batch of `inputs` may not yield
+        `num_clusters` centers, in which case initialization will require
+        multiple batches until enough centers are chosen. In the case of
+        "random" or "kmeans_plus_plus", if the input size is <= `num_clusters`
+        then the entire batch is chosen to be cluster centers.
+      distance_metric: Distance metric used for clustering. Supported options:
+        "squared_euclidean", "cosine".
+      use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
+        full batch.
+      mini_batch_steps_per_iteration: Number of steps after which the updated
+        cluster centers are synced back to a master copy.
+      random_seed: Seed for PRNG used to initialize seeds.
+      kmeans_plus_plus_num_retries: For each point that is sampled during
+        kmeans++ initialization, this parameter specifies the number of
+        additional points to draw from the current distribution before selecting
+        the best. If a negative value is specified, a heuristic is used to
+        sample O(log(num_to_sample)) additional points.
+      kmc2_chain_length: Determines how many candidate points are used by the
+        k-MC2 algorithm to produce one new cluster centers. If a (mini-)batch
+        contains less points, one new cluster center is generated from the
+        (mini-)batch.
+
+    Raises:
+      ValueError: An invalid argument was passed to initial_clusters or
+        distance_metric.
+    """
+    if isinstance(initial_clusters, str) and initial_clusters not in [
+        RANDOM_INIT, KMEANS_PLUS_PLUS_INIT, KMC2_INIT
+    ]:
+      raise ValueError(
+          "Unsupported initialization algorithm '%s'" % initial_clusters)
+    if distance_metric not in [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]:
+      raise ValueError("Unsupported distance metric '%s'" % distance_metric)
+    self._inputs = inputs if isinstance(inputs, list) else [inputs]
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._use_mini_batch = use_mini_batch
+    self._mini_batch_steps_per_iteration = int(mini_batch_steps_per_iteration)
+    self._seed = random_seed_ops.get_seed(random_seed)[0]
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+
+  @classmethod
+  def _distance_graph(cls, inputs, clusters, distance_metric):
+    """Computes distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+      distance_metric: distance metric used for clustering
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+      Currently only Euclidean distance and cosine distance are supported.
+    """
+    assert isinstance(inputs, list)
+    if distance_metric == SQUARED_EUCLIDEAN_DISTANCE:
+      return cls._compute_euclidean_distance(inputs, clusters)
+    elif distance_metric == COSINE_DISTANCE:
+      return cls._compute_cosine_distance(
+          inputs, clusters, inputs_normalized=True)
+    else:
+      assert False, str(distance_metric)
+
+  @classmethod
+  def _compute_euclidean_distance(cls, inputs, clusters):
+    """Computes Euclidean distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        # Computes Euclidean distance. Note the first and third terms are
+        # broadcast additions.
+        squared_distance = (
+            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
+            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
+            array_ops.transpose(
+                math_ops.reduce_sum(
+                    math_ops.square(clusters), 1, keepdims=True)))
+        output.append(squared_distance)
+
+    return output
+
+  @classmethod
+  def _compute_cosine_distance(cls, inputs, clusters, inputs_normalized=True):
+    """Computes cosine distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensor.
+      clusters: cluster Tensor
+      inputs_normalized: if True, it assumes that inp and clusters are
+      normalized and computes the dot product which is equivalent to the cosine
+      distance. Else it L2 normalizes the inputs first.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inp.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    if not inputs_normalized:
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        if not inputs_normalized:
+          inp = nn_impl.l2_normalize(inp, dim=1)
+        output.append(1 - math_ops.matmul(inp, clusters, transpose_b=True))
+    return output
+
+  def _infer_graph(self, inputs, clusters):
+    """Maps input to closest cluster and the score.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: Tensor of cluster centers.
+
+    Returns:
+      List of tuple, where each value in tuple corresponds to a value in inp.
+      The tuple has following three elements:
+      all_scores: distance of each input to each cluster center.
+      score: distance of each input to closest cluster center.
+      cluster_idx: index of cluster center closest to the corresponding input.
+    """
+    assert isinstance(inputs, list)
+    # Pairwise distances are used only by transform(). In all other cases, this
+    # sub-graph is not evaluated.
+    scores = self._distance_graph(inputs, clusters, self._distance_metric)
+    output = []
+    if (self._distance_metric == COSINE_DISTANCE and
+        not self._clusters_l2_normalized()):
+      # The cosine distance between normalized vectors x and y is the same as
+      # 2 * squared_euclidean_distance. We are using this fact and reusing the
+      # nearest_neighbors op.
+      # TODO(ands): Support COSINE distance in nearest_neighbors and remove
+      # this.
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp, score in zip(inputs, scores):
+      with ops.colocate_with(inp, ignore_existing=True):
+        (indices, distances) = gen_clustering_ops.nearest_neighbors(
+            inp, clusters, 1)
+        if self._distance_metric == COSINE_DISTANCE:
+          distances *= 0.5
+        output.append((score, array_ops.squeeze(distances, [-1]),
+                       array_ops.squeeze(indices, [-1])))
+    return zip(*output)
+
+  def _clusters_l2_normalized(self):
+    """Returns True if clusters centers are kept normalized."""
+    return (self._distance_metric == COSINE_DISTANCE and
+            (not self._use_mini_batch or
+             self._mini_batch_steps_per_iteration > 1))
+
+  def _create_variables(self, num_clusters):
+    """Creates variables.
+
+    Args:
+      num_clusters: an integer Tensor providing the number of clusters.
+
+    Returns:
+      Tuple with following elements:
+      - cluster_centers: a Tensor for storing cluster centers
+      - cluster_centers_initialized: bool Variable indicating whether clusters
+            are initialized.
+      - cluster_counts: a Tensor for storing counts of points assigned to this
+            cluster. This is used by mini-batch training.
+      - cluster_centers_updated: Tensor representing copy of cluster centers
+            that are updated every step.
+      - update_in_steps: numbers of steps left before we sync
+            cluster_centers_updated back to cluster_centers.
+    """
+    init_value = array_ops.placeholder_with_default([], shape=None)
+    cluster_centers = variable_scope.variable(
+        init_value, name=CLUSTERS_VAR_NAME, validate_shape=False)
+    cluster_centers_initialized = variable_scope.variable(
+        False, dtype=dtypes.bool, name='initialized')
+
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      # Copy of cluster centers actively updated each step according to
+      # mini-batch update rule.
+      cluster_centers_updated = variable_scope.variable(
+          init_value, name='clusters_updated', validate_shape=False)
+      # How many steps till we copy the updated clusters to cluster_centers.
+      update_in_steps = variable_scope.variable(
+          self._mini_batch_steps_per_iteration,
+          dtype=dtypes.int64,
+          name='update_in_steps')
+      # Count of points assigned to cluster_centers_updated.
+      cluster_counts = variable_scope.variable(
+          array_ops.zeros([num_clusters], dtype=dtypes.int64))
+    else:
+      cluster_centers_updated = cluster_centers
+      update_in_steps = None
+      cluster_counts = (
+          variable_scope.variable(
+              array_ops.ones([num_clusters], dtype=dtypes.int64))
+          if self._use_mini_batch else None)
+    return (cluster_centers, cluster_centers_initialized, cluster_counts,
+            cluster_centers_updated, update_in_steps)
+
+  @classmethod
+  def _l2_normalize_data(cls, inputs):
+    """Normalized the input data."""
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        output.append(nn_impl.l2_normalize(inp, dim=1))
+    return output
+
+  def training_graph(self):
+    """Generate a training graph for kmeans algorithm.
+
+    This returns, among other things, an op that chooses initial centers
+    (init_op), a boolean variable that is set to True when the initial centers
+    are chosen (cluster_centers_initialized), and an op to perform either an
+    entire Lloyd iteration or a mini-batch of a Lloyd iteration (training_op).
+    The caller should use these components as follows. A single worker should
+    execute init_op multiple times until cluster_centers_initialized becomes
+    True. Then multiple workers may execute training_op any number of times.
+
+    Returns:
+      A tuple consisting of:
+      all_scores: A matrix (or list of matrices) of dimensions (num_input,
+        num_clusters) where the value is the distance of an input vector and a
+        cluster center.
+      cluster_idx: A vector (or list of vectors). Each element in the vector
+        corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      scores: Similar to cluster_idx but specifies the distance to the
+        assigned cluster instead.
+      cluster_centers_initialized: scalar indicating whether clusters have been
+        initialized.
+      init_op: an op to initialize the clusters.
+      training_op: an op that runs an iteration of training.
+    """
+    # Implementation of kmeans.
+    if (isinstance(self._initial_clusters, str) or
+        callable(self._initial_clusters)):
+      initial_clusters = self._initial_clusters
+      num_clusters = ops.convert_to_tensor(self._num_clusters)
+    else:
+      initial_clusters = ops.convert_to_tensor(self._initial_clusters)
+      num_clusters = array_ops.shape(initial_clusters)[0]
+
+    inputs = self._inputs
+    (cluster_centers_var, cluster_centers_initialized, total_counts,
+     cluster_centers_updated,
+     update_in_steps) = self._create_variables(num_clusters)
+    init_op = _InitializeClustersOpFactory(
+        self._inputs, num_clusters, initial_clusters, self._distance_metric,
+        self._seed, self._kmeans_plus_plus_num_retries, self._kmc2_chain_length,
+        cluster_centers_var, cluster_centers_updated,
+        cluster_centers_initialized).op()
+    cluster_centers = cluster_centers_var
+
+    if self._distance_metric == COSINE_DISTANCE:
+      inputs = self._l2_normalize_data(inputs)
+      if not self._clusters_l2_normalized():
+        cluster_centers = nn_impl.l2_normalize(cluster_centers, dim=1)
+
+    all_scores, scores, cluster_idx = self._infer_graph(inputs, cluster_centers)
+    if self._use_mini_batch:
+      sync_updates_op = self._mini_batch_sync_updates_op(
+          update_in_steps, cluster_centers_var, cluster_centers_updated,
+          total_counts)
+      assert sync_updates_op is not None
+      with ops.control_dependencies([sync_updates_op]):
+        training_op = self._mini_batch_training_op(
+            inputs, cluster_idx, cluster_centers_updated, total_counts)
+    else:
+      assert cluster_centers == cluster_centers_var
+      training_op = self._full_batch_training_op(
+          inputs, num_clusters, cluster_idx, cluster_centers_var)
+
+    return (all_scores, cluster_idx, scores, cluster_centers_initialized,
+            init_op, training_op)
+
+  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
+                                  cluster_centers_updated, total_counts):
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      assert update_in_steps is not None
+      with ops.colocate_with(update_in_steps, ignore_existing=True):
+
+        def _f():
+          # Note that there is a race condition here, so we do a best effort
+          # updates here. We reset update_in_steps first so that other workers
+          # don't duplicate the updates. Also we update cluster_center_vars
+          # before resetting total_counts to avoid large updates to
+          # cluster_centers_updated based on partially updated
+          # cluster_center_vars.
+          with ops.control_dependencies([
+              state_ops.assign(update_in_steps,
+                               self._mini_batch_steps_per_iteration - 1)
+          ]):
+            with ops.colocate_with(
+                cluster_centers_updated, ignore_existing=True):
+              if self._distance_metric == COSINE_DISTANCE:
+                cluster_centers = nn_impl.l2_normalize(
+                    cluster_centers_updated, dim=1)
+              else:
+                cluster_centers = cluster_centers_updated
+            with ops.colocate_with(cluster_centers_var, ignore_existing=True):
+              with ops.control_dependencies(
+                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
+                with ops.colocate_with(None, ignore_existing=True):
+                  with ops.control_dependencies([
+                      state_ops.assign(total_counts,
+                                       array_ops.zeros_like(total_counts))
+                  ]):
+                    return array_ops.identity(update_in_steps)
+
+        return control_flow_ops.cond(
+            update_in_steps <= 0, _f,
+            lambda: state_ops.assign_sub(update_in_steps, 1))
+    else:
+      return control_flow_ops.no_op()
+
+  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
+                              total_counts):
+    """Creates an op for training for mini batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+      total_counts: Tensor Ref of cluster counts.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    update_ops = []
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        assert total_counts is not None
+        cluster_idx = array_ops.reshape(cluster_idx, [-1])
+        # Dedupe the unique ids of cluster_centers being updated so that updates
+        # can be locally aggregated.
+        unique_ids, unique_idx = array_ops.unique(cluster_idx)
+        num_unique_cluster_idx = array_ops.size(unique_ids)
+        # Fetch the old values of counts and cluster_centers.
+        with ops.colocate_with(total_counts, ignore_existing=True):
+          old_counts = array_ops.gather(total_counts, unique_ids)
+        # TODO(agarwal): This colocation seems to run into problems. Fix it.
+        with ops.colocate_with(cluster_centers, ignore_existing=True):
+          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
+        # Locally aggregate the increment to counts.
+        count_updates = math_ops.unsorted_segment_sum(
+            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
+            unique_idx, num_unique_cluster_idx)
+        # Locally compute the sum of inputs mapped to each id.
+        # For a cluster with old cluster value x, old count n, and with data
+        # d_1,...d_k newly assigned to it, we recompute the new value as
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
+        cluster_center_updates = math_ops.unsorted_segment_sum(
+            inp, unique_idx, num_unique_cluster_idx)
+        # Shape to enable broadcasting count_updates and learning_rate to inp.
+        # It extends the shape with 1's to match the rank of inp.
+        broadcast_shape = array_ops.concat([
+            array_ops.reshape(num_unique_cluster_idx, [1]),
+            array_ops.ones(
+                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
+                dtype=dtypes.int32)
+        ], 0)
+        # Subtract k * x, see comment above.
+        cluster_center_updates -= math_ops.cast(
+            array_ops.reshape(count_updates, broadcast_shape),
+            inp.dtype) * old_cluster_centers
+        learning_rate = math_ops.reciprocal(
+            math_ops.cast(old_counts + count_updates, inp.dtype))
+        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
+        # scale by 1 / (n + k), see comment above.
+        cluster_center_updates *= learning_rate
+        # Apply the updates.
+      update_counts = state_ops.scatter_add(total_counts, unique_ids,
+                                            count_updates)
+      update_cluster_centers = state_ops.scatter_add(
+          cluster_centers, unique_ids, cluster_center_updates)
+      update_ops.extend([update_counts, update_cluster_centers])
+    return control_flow_ops.group(*update_ops)
+
+  def _full_batch_training_op(self, inputs, num_clusters, cluster_idx_list,
+                              cluster_centers):
+    """Creates an op for training for full batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      num_clusters: an integer Tensor providing the number of clusters.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    cluster_sums = []
+    cluster_counts = []
+    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        cluster_sums.append(
+            math_ops.unsorted_segment_sum(inp, cluster_idx, num_clusters))
+        cluster_counts.append(
+            math_ops.unsorted_segment_sum(
+                array_ops.reshape(
+                    array_ops.ones(
+                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
+                    [-1, 1]), cluster_idx, num_clusters))
+    with ops.colocate_with(cluster_centers, ignore_existing=True):
+      new_clusters_centers = math_ops.add_n(cluster_sums) / (
+          math_ops.cast(math_ops.add_n(cluster_counts), cluster_sums[0].dtype) +
+          epsilon)
+      if self._clusters_l2_normalized():
+        new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
+    return state_ops.assign(cluster_centers, new_clusters_centers)
+
+
+class _InitializeClustersOpFactory(object):
+  """Internal class to create the op to initialize the clusters.
+
+    The op performs this algorithm (see constructor args):
+
+    num_remaining = num_clusters - length(cluster_centers)
+    if num_remaining == 0:
+      assert that cluster_centers_initialized is true
+    else:
+      assert that num_remaining > 0
+      new_centers = choose up to num_remaining initial centers
+      l2-normalize new_centers if using cosine distance
+      all_centers = concat(cluster_centers, new_centers)
+      cluster_centers := all_centers
+      if there is a cluster_centers_updated variable:
+        cluster_centers_updated := cluster_centers
+      num_now_remaining = num_clusters - length(cluster_centers)
+      if num_now_remaining == 0:
+        cluster_centers_initialized := true
+  """
+
+  # TODO(ccolby): Refactor this class so that kmc2 isn't so much a special case.
+
+  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
+               random_seed, kmeans_plus_plus_num_retries, kmc2_chain_length,
+               cluster_centers, cluster_centers_updated,
+               cluster_centers_initialized):
+    """Creates an op factory.
+
+    Args:
+      inputs: See KMeans constructor.
+      num_clusters: An integer Tensor providing the number of clusters.
+      initial_clusters: See KMeans constructor.
+      distance_metric: See KMeans constructor.
+      random_seed: See KMeans constructor.
+      kmeans_plus_plus_num_retries: See KMeans constructor.
+      kmc2_chain_length: See KMeans constructor.
+      cluster_centers: The TF variable holding the initial centers. It may
+          already contain some centers when the op is executed.
+      cluster_centers_updated: A second TF variable to hold a copy of the
+          initial centers, used for full-batch mode. In mini-batch mode,
+          cluster_centers_updated is the same variable as cluster_centers.
+      cluster_centers_initialized: A boolean TF variable that will be set
+          to true when all the initial centers have been chosen.
+    """
+    # All of these instance variables are constants.
+    self._inputs = inputs
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+    self._cluster_centers = cluster_centers
+    self._cluster_centers_updated = cluster_centers_updated
+    self._cluster_centers_initialized = cluster_centers_initialized
+
+    self._num_selected = array_ops.shape(self._cluster_centers)[0]
+    self._num_remaining = self._num_clusters - self._num_selected
+    self._num_data = math_ops.add_n(
+        [array_ops.shape(i)[0] for i in self._inputs])
+
+  def _random(self):
+    indices = random_ops.random_uniform(
+        array_ops.reshape(self._num_remaining, [-1]),
+        minval=0,
+        maxval=math_ops.cast(self._num_data, dtypes.int64),
+        seed=self._seed,
+        dtype=dtypes.int64)
+    return embedding_lookup(self._inputs, indices, partition_strategy='div')
+
+  def _kmeans_plus_plus(self):
+    # Points from only the first shard are used for initializing centers.
+    # TODO(ands): Use all points.
+    inp = self._inputs[0]
+    if self._distance_metric == COSINE_DISTANCE:
+      inp = nn_impl.l2_normalize(inp, dim=1)
+    return gen_clustering_ops.kmeans_plus_plus_initialization(
+        inp, math_ops.cast(self._num_remaining, dtypes.int64), self._seed,
+        self._kmeans_plus_plus_num_retries)
+
+  def _kmc2_multiple_centers(self):
+    """Adds new initial cluster centers using the k-MC2 algorithm.
+
+    In each call to the op, the provided batch is split into subsets based on
+    the specified `kmc2_chain_length`. On each subset, a single Markov chain of
+    the k-MC2 algorithm is used to add *one* new center cluster center. If there
+    are less than `kmc2_chain_length` points in the subset, a single center is
+    added using one Markov chain on the full input. It is assumed that the
+    provided batch has previously been randomly permuted. Otherwise, k-MC2 may
+    return suboptimal centers.
+
+    Returns:
+      An op that adds new cluster centers.
+    """
+    # The op only operates on the first shard of data.
+    first_shard = self._inputs[0]
+    # Number of points in the input that can be used.
+    batch_size = array_ops.shape(first_shard)[0]
+    # Maximum number of subsets such that the size of each subset is at least
+    # `kmc2_chain_length`. Final subsets may be larger.
+    max_to_sample = math_ops.cast(
+        batch_size / self._kmc2_chain_length, dtype=dtypes.int32)
+    # We sample at least one new center and at most all remaining centers.
+    num_to_sample = math_ops.maximum(
+        math_ops.minimum(self._num_remaining, max_to_sample), 1)
+
+    def _cond(i, _):
+      """Stopping condition for the while loop."""
+      return math_ops.less(i, num_to_sample)
+
+    def _body(i, _):
+      """Body that adds a single new center based on a subset."""
+
+      def _sample_random():
+        """Returns a random point as a cluster center."""
+        # By assumption the batch is reshuffled and _sample_random is always
+        # called for i=0. Hence, we simply return the first point.
+        new_center = array_ops.reshape(first_shard[0], [1, -1])
+        if self._distance_metric == COSINE_DISTANCE:
+          new_center = nn_impl.l2_normalize(new_center, dim=1)
+        return new_center
+
+      def _sample_kmc2_chain():
+        """Returns previous centers as well as a new center sampled using k-MC2.
+        """
+        # Extract the subset from the underlying batch.
+        start = i * self._kmc2_chain_length
+        end = start + self._kmc2_chain_length
+        subset = first_shard[start:end]
+        # Compute the distances from points in the subset to previous centers.
+        _, distances = gen_clustering_ops.nearest_neighbors(
+            subset, self._cluster_centers, 1)
+        # Sample index of new center using k-MC2 Markov chain.
+        new_center_index = gen_clustering_ops.kmc2_chain_initialization(
+            array_ops.squeeze(distances), self._seed)
+        # Extract actual new center.
+        newly_sampled_center = array_ops.reshape(subset[new_center_index],
+                                                 [1, -1])
+        # Return concatenation with previously sampled centers.
+        if self._distance_metric == COSINE_DISTANCE:
+          newly_sampled_center = nn_impl.l2_normalize(
+              newly_sampled_center, dim=1)
+        return array_ops.concat([self._cluster_centers, newly_sampled_center],
+                                0)
+
+      # Obtain a random point if there are no previously sampled centers.
+      # Otherwise, construct a k-MC2 Markov chain.
+      new_centers = control_flow_ops.cond(
+          math_ops.equal(self._num_selected, 0), _sample_random,
+          _sample_kmc2_chain)
+      # Assign new cluster centers to underlying variable.
+      assigned_centers = state_ops.assign(
+          self._cluster_centers, new_centers, validate_shape=False)
+      if self._cluster_centers_updated is not self._cluster_centers:
+        assigned_centers = state_ops.assign(
+            self._cluster_centers_updated,
+            assigned_centers,
+            validate_shape=False)
+      return i + 1, self._num_clusters - array_ops.shape(assigned_centers)[0]
+
+    # Add num_to_sample new data points.
+    _, num_remaining = control_flow_ops.while_loop(_cond, _body, [0, 0])
+    return num_remaining
+
+  def _greedy_batch_sampler(self, sampler):
+    # If the input dataset size is smaller than the number of centers
+    # remaining, choose the entire input dataset as centers. This can happen
+    # with mini-batch. Otherwise, sample the batch according to the provided
+    # sampler.
+    return control_flow_ops.cond(self._num_data <= self._num_remaining,
+                                 lambda: array_ops.concat(self._inputs, 0),
+                                 sampler)
+
+  def _single_batch_sampler(self, sampler):
+    # Enforce that there are at least as many data points as centers
+    # remaining. This gives the provided sampler the chance to select all
+    # remaining centers from a single batch.
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(self._num_data, self._num_remaining)]):
+      return sampler()
+
+  def _choose_initial_centers(self):
+    if isinstance(self._initial_clusters, str):
+      if self._initial_clusters == RANDOM_INIT:
+        return self._greedy_batch_sampler(self._random)
+      else:  # self._initial_clusters == KMEANS_PLUS_PLUS_INIT
+        return self._single_batch_sampler(self._kmeans_plus_plus)
+    elif callable(self._initial_clusters):
+      return self._initial_clusters(self._inputs, self._num_remaining)
+    else:
+      with ops.control_dependencies([
+          check_ops.assert_equal(self._num_remaining,
+                                 array_ops.shape(self._initial_clusters)[0])
+      ]):
+        return self._initial_clusters
+
+  def _add_new_centers(self):
+    """Adds some centers and returns the number of centers remaining."""
+    new_centers = self._choose_initial_centers()
+    if self._distance_metric == COSINE_DISTANCE:
+      new_centers = nn_impl.l2_normalize(new_centers, dim=1)
+    # If cluster_centers is empty, it doesn't have the right shape for concat.
+    all_centers = control_flow_ops.cond(
+        math_ops.equal(self._num_selected, 0), lambda: new_centers,
+        lambda: array_ops.concat([self._cluster_centers, new_centers], 0))
+    # TODO(ccolby): De-dupe all_centers?
+    a = state_ops.assign(
+        self._cluster_centers, all_centers, validate_shape=False)
+    if self._cluster_centers_updated is not self._cluster_centers:
+      a = state_ops.assign(
+          self._cluster_centers_updated, a, validate_shape=False)
+    return self._num_clusters - array_ops.shape(a)[0]
+
+  def _initialize(self):
+    with ops.control_dependencies([
+        check_ops.assert_positive(self._num_remaining),
+    ]):
+      if self._initial_clusters == KMC2_INIT:
+        num_now_remaining = self._kmc2_multiple_centers()
+      else:
+        num_now_remaining = self._add_new_centers()
+      return control_flow_ops.cond(
+          math_ops.equal(num_now_remaining, 0),
+          lambda: state_ops.assign(self._cluster_centers_initialized, True),
+          control_flow_ops.no_op)
+
+  def op(self):
+    """Returns the cluster initializer op."""
+    return control_flow_ops.cond(
+        math_ops.equal(self._num_remaining, 0),
+        lambda: check_ops.assert_equal(self._cluster_centers_initialized, True),
+        self._initialize)
diff --git a/tensorflow/python/ops/clustering_ops_test.py b/tensorflow/python/ops/clustering_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5804c660e67eedf09b0dec6e599d1cf644156a9d
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops_test.py
@@ -0,0 +1,212 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for clustering_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import clustering_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KmeansPlusPlusInitializationTest(test.TestCase):
+
+  # All but one input point are close to (101, 1). With uniform random sampling,
+  # it is highly improbable for (-1, -1) to be selected.
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [102., 0.],
+                             [100., 1.],
+                             [100., 2.],
+                             [101., 0.],
+                             [101., 0.],
+                             [101., 1.],
+                             [102., 0.],
+                             [-1., -1.]]).astype(np.float32)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_points = clustering_ops.kmeans_plus_plus_initialization(
+          self._points, 3, seed, (seed % 5) - 1)
+      self.assertAllClose(
+          sorted(self.evaluate(sampled_points).tolist()),
+          [[-1., -1.], [101., 1.], [101., 1.]],
+          atol=1.0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationTest(test.TestCase):
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      distances = np.zeros(1000).astype(np.float32)
+      distances[6] = 10e7
+      distances[4] = 10e3
+
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 6)
+      distances[6] = 0.0
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 4)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationLargeTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(1001)
+    self._distances[500] = 100.0
+    self._distances[1000] = 50.0
+
+  def testBasic(self):
+    with self.cached_session():
+      counts = {}
+      seed = 0
+      for i in range(50):
+        sample = self.evaluate(
+            clustering_ops.kmc2_chain_initialization(self._distances, seed + i))
+        counts[sample] = counts.get(sample, 0) + 1
+      self.assertEquals(len(counts), 2)
+      self.assertTrue(500 in counts)
+      self.assertTrue(1000 in counts)
+      self.assertGreaterEqual(counts[500], 5)
+      self.assertGreaterEqual(counts[1000], 5)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationCornercaseTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(10)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_point = clustering_ops.kmc2_chain_initialization(
+          self._distances, seed)
+      self.assertAllEqual(sampled_point, 0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A simple test that can be verified by hand.
+class NearestCentersTest(test.TestCase):
+
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [99., 2.],
+                             [1., 1.]]).astype(np.float32)
+
+    self._centers = np.array([[100., 0.],
+                              [99., 1.],
+                              [50., 50.],
+                              [0., 0.],
+                              [1., 1.]]).astype(np.float32)
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(indices, [[0], [0], [1], [4]])
+      self.assertAllClose(distances, [[0.], [5.], [1.], [0.]])
+
+  def testNearest2(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 2)
+      self.assertAllClose(indices, [[0, 1], [0, 1], [1, 0], [4, 3]])
+      self.assertAllClose(distances, [[0., 2.], [5., 5.], [1., 5.], [0., 2.]])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A test with large inputs.
+class NearestCentersLargeTest(test.TestCase):
+
+  def setUp(self):
+    num_points = 1000
+    num_centers = 2000
+    num_dim = 100
+    max_k = 5
+    # Construct a small number of random points and later tile them.
+    points_per_tile = 10
+    assert num_points % points_per_tile == 0
+    points = np.random.standard_normal(
+        [points_per_tile, num_dim]).astype(np.float32)
+    # Construct random centers.
+    self._centers = np.random.standard_normal(
+        [num_centers, num_dim]).astype(np.float32)
+
+    # Exhaustively compute expected nearest neighbors.
+    def squared_distance(x, y):
+      return np.linalg.norm(x - y, ord=2)**2
+
+    nearest_neighbors = [
+        sorted([(squared_distance(point, self._centers[j]), j)
+                for j in range(num_centers)])[:max_k] for point in points
+    ]
+    expected_nearest_neighbor_indices = np.array(
+        [[i for _, i in nn] for nn in nearest_neighbors])
+    expected_nearest_neighbor_squared_distances = np.array(
+        [[dist for dist, _ in nn] for nn in nearest_neighbors])
+    # Tile points and expected results to reach requested size (num_points)
+    (self._points, self._expected_nearest_neighbor_indices,
+     self._expected_nearest_neighbor_squared_distances) = (
+         np.tile(x, (int(num_points / points_per_tile), 1))
+         for x in (points, expected_nearest_neighbor_indices,
+                   expected_nearest_neighbor_squared_distances))
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, [0]])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, [0]])
+
+  def testNearest5(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 5)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, 0:5])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, 0:5])
+
+
+if __name__ == "__main__":
+  np.random.seed(0)
+  test.main()
diff --git a/tensorflow/python/ops/collective_ops.py b/tensorflow/python/ops/collective_ops.py
index 98668facd5bc56892fa00f258dfebcbe93c063da..32a71fc25d370f4e96ef4036f4fdee3c670502d2 100644
--- a/tensorflow/python/ops/collective_ops.py
+++ b/tensorflow/python/ops/collective_ops.py
@@ -48,7 +48,7 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
   if not device.canonical_name(t.device):
     raise ValueError('Device assignment required for collective ops')
   if group_size <= 1:
-    raise ValueError('Parameter group_size to add_reduce must be at least 2.')
+    raise ValueError('Parameter group_size to all_reduce must be at least 2.')
   return gen_collective_ops.collective_reduce(t,
                                               group_size=group_size,
                                               group_key=group_key,
@@ -58,6 +58,35 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
                                               subdiv_offsets=subdiv_offsets)
 
 
+def all_gather(t, group_size, group_key, instance_key):
+  """Accumulates tensors collectively, across devices, along first dimension.
+
+  Args:
+    t: the tensor to participate in the accumulation.
+    group_size: the total number of tensors to be collectively accumulated.
+      Each must reside on a different device.
+    group_key: an integer identifying the group of devices.
+    instance_key: an integer identifying the participating group of Ops.
+
+  Returns:
+    An Op implementing the distributed operation.
+
+  Raises:
+    ValueError: if any of the input parameter constraints are not met.
+  """
+  if not device.canonical_name(t.device):
+    raise ValueError('Device assignment required for collective ops')
+  if group_size <= 1:
+    raise ValueError('Parameter group_size to all_gather must be at least 2.')
+  dims = t.shape.as_list()
+  output_shape = [dims[0] * group_size] + dims[1:]
+  return gen_collective_ops.collective_gather(t,
+                                              shape=output_shape,
+                                              group_size=group_size,
+                                              group_key=group_key,
+                                              instance_key=instance_key)
+
+
 def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
   """Broadcasts one tensor to a group of others, across devices.
 
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 9c5a39b90e0e163f559524e33f7deb04794c1d0d..c9b376caf8f07236c4dde2bda2ba43c89a0ad8c2 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,8 +25,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import test
 
-# TODO(tucker): Make these ops work in eager mode. b/79776476
-
 
 class CollectiveOpTest(test.TestCase):
 
@@ -114,6 +112,42 @@ class CollectiveOpTest(test.TestCase):
   def testCollectiveBroadcast(self):
     self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
+  def _testCollectiveGather(self, t0, t1, expected, set_graph_key):
+    group_key = 1
+    instance_key = 1
+    with self.session(
+        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
+      with ops.device('/CPU:0'):
+        in0 = constant_op.constant(t0)
+        colred0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
+      with ops.device('/CPU:1'):
+        in1 = constant_op.constant(t1)
+        colred1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
+      run_options = config_pb2.RunOptions()
+      if set_graph_key:
+        run_options.experimental.collective_graph_key = 1
+      results = sess.run([colred0, colred1], options=run_options)
+    self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
+    self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
+
+  @test_util.run_deprecated_v1
+  def testCollectiveGather(self):
+    self._testCollectiveGather([0, 1, 2, 3, 4, 5, 6, 7],
+                               [10, 11, 12, 13, 14, 15, 16, 17],
+                               [0, 1, 2, 3, 4, 5, 6, 7,
+                                10, 11, 12, 13, 14, 15, 16, 17],
+                               True)
+    self._testCollectiveGather([[0, 1, 2, 3], [4, 5, 6, 7]],
+                               [[10, 11, 12, 13], [14, 15, 16, 17]],
+                               [[0, 1, 2, 3], [4, 5, 6, 7],
+                                [10, 11, 12, 13], [14, 15, 16, 17]],
+                               True)
+    self._testCollectiveGather([[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
+                               [[[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               [[[0, 1], [2, 3]], [[4, 5], [6, 7]],
+                                [[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               True)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 6e8f2df39b97969c8ea3fc8c42fd7fa4dab67b1d..98f641e540121246f0f318aeee2e1674f9923bb5 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -35,7 +35,8 @@ from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import nest
 
 
@@ -79,13 +80,10 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
 
-    outputs = _build_cond(pred, true_graph, false_graph,
-                          true_graph.external_captures,
-                          false_graph.external_captures,
-                          name=scope)
-
-    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
-                                              outputs)
+    return _build_cond(pred, true_graph, false_graph,
+                       true_graph.external_captures,
+                       false_graph.external_captures,
+                       name=scope)
 
 
 @ops.RegisterGradient("If")
@@ -107,9 +105,6 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   false_grad_graph = _create_grad_func(
       false_graph, grads, util.unique_grad_fn_name(false_graph.name))
 
-  assert ([t.dtype for t in true_grad_graph.outputs] ==
-          [t.dtype for t in false_grad_graph.outputs])
-
   if (true_grad_graph.if_op_needs_rewrite or
       false_grad_graph.if_op_needs_rewrite):
     # Modify 'op' to output the intermediates needed by the grad functions. Note
@@ -156,6 +151,9 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
   false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
+  # This modifies true_grad_graph and false_grad_graph.
+  _make_output_composite_tensors_match(true_grad_graph, false_grad_graph)
+
   outputs = _build_cond(if_op.inputs[0], true_grad_graph, false_grad_graph,
                         true_grad_inputs, false_grad_inputs)
 
@@ -194,14 +192,16 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
                                    true_inputs, false_inputs)
 
   # Create the If op.
-  tensors = gen_functional_ops._if(  # pylint: disable=protected-access
-      pred,
-      cond_inputs, [t.dtype for t in true_graph.outputs],
-      util.create_new_tf_function(true_graph),
-      util.create_new_tf_function(false_graph),
-      output_shapes=_get_output_shapes(true_graph.outputs,
-                                       false_graph.outputs),
-      name=name)
+  with ops.control_dependencies(
+      list(true_graph.control_captures) + list(false_graph.control_captures)):
+    tensors = gen_functional_ops._if(  # pylint: disable=protected-access
+        pred,
+        cond_inputs, [t.dtype for t in true_graph.outputs],
+        util.create_new_tf_function(true_graph),
+        util.create_new_tf_function(false_graph),
+        output_shapes=_get_output_shapes(true_graph.outputs,
+                                         false_graph.outputs),
+        name=name)
 
   # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
   if_op = tensors[0].op
@@ -219,7 +219,8 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
 
   # Prevent fetching since the variant outputs can't be fetched directly.
   if_op.graph.prevent_fetching(if_op)
-  return tensors
+  return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                            tensors)
 
 
 def _get_func_graphs(if_op):
@@ -278,7 +279,7 @@ def _grad_fn(func_graph, grads):
   ys = []
   grad_ys = []
   for y, grad_y in zip(func_graph.outputs, grads):
-    if not gradients_impl.IsTrainable(y):
+    if not gradients_util.IsTrainable(y):
       continue
     ys.append(y)
     grad_ys.append(grad_y)
@@ -287,7 +288,7 @@ def _grad_fn(func_graph, grads):
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
   # in _resolve_grad_inputs.
-  result = gradients_impl._GradientsHelper(
+  result = gradients_util._GradientsHelper(
       ys, func_graph.inputs, grad_ys=grad_ys,
       src_graph=func_graph)
 
@@ -475,6 +476,50 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   return new_inputs
 
 
+def _make_output_composite_tensors_match(true_graph, false_graph):
+  """Rewrites {true,false}_graph's outputs to use the same _TensorLike classes.
+
+  Currently the only transformation implemented is turning a Tensor into an
+  equivalent IndexedSlices if the other branch returns an IndexedSlices.
+  Updates {true,false}_graph.{outputs,structured_outputs}.
+
+  Args:
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+
+  Raises:
+    TypeError: if a pair of outputs cannot be rewritten.
+  """
+  # Note: since this is only used for gradient graphs, we do not expect the
+  # outputs to be structured (e.g. nested lists), and thus do not need to use
+  # nest.flatten, etc.
+  true_outputs = list(true_graph.structured_outputs)
+  false_outputs = list(false_graph.structured_outputs)
+  assert len(true_outputs) == len(false_outputs)
+
+  for idx, (true_out, false_out) in enumerate(zip(true_outputs, false_outputs)):
+    if type(true_out) == type(false_out):  # pylint: disable=unidiomatic-typecheck
+      continue
+    if (isinstance(true_out, ops.IndexedSlices) and
+        isinstance(false_out, ops.Tensor)):
+      with false_graph.as_default():
+        false_outputs[idx] = math_ops._as_indexed_slices(false_out)
+    elif (isinstance(true_out, ops.Tensor) and
+          isinstance(false_out, ops.IndexedSlices)):
+      with true_graph.as_default():
+        true_outputs[idx] = math_ops._as_indexed_slices(true_out)
+    else:
+      raise TypeError(
+          "Cannot reconcile tf.cond %i-th outputs:\n"
+          "  true_fn returned:  %s\n"
+          "  false_fn returned: %s" % (idx, true_out, false_out))
+
+  true_graph.structured_outputs = true_outputs
+  true_graph.outputs = func_graph_module.flatten(true_outputs)
+  false_graph.structured_outputs = false_outputs
+  false_graph.outputs = func_graph_module.flatten(false_outputs)
+
+
 def _wrap_intermediates(func_graph, intermediates):
   with func_graph.as_default():
     return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
@@ -518,23 +563,30 @@ def _create_fakeparams(func_graph, template_tensors):
 
 def _check_same_outputs(true_graph, false_graph):
   """Raises an error if true_graph and false_graph have different outputs."""
-  true_output_types = [t.dtype for t in true_graph.outputs]
-  false_output_types = [t.dtype for t in false_graph.outputs]
-  if (len(true_graph.outputs) != len(false_graph.outputs) or
-      true_output_types != false_output_types):
+
+  def error(error_detail):
     raise TypeError(
-        "true_fn() and false_fn() must return the same number and type of "
-        "arguments, got:\n"
-        "  true_fn: %s\n"
-        "  false_fn: %s" % (true_output_types, false_output_types))
+        "true_fn and false_fn arguments to tf.cond must have the same number, "
+        "type, and overall structure of return values.\n"
+        "\n"
+        "true_fn output:  %s\n"
+        "false_fn output: %s\n"
+        "\n"
+        "Error details:\n"
+        "%s" % (true_graph.structured_outputs, false_graph.structured_outputs,
+                error_detail))
 
-  # Make sure `structured_outputs` for both graphs have the same structure.
   try:
     nest.assert_same_structure(true_graph.structured_outputs,
-                               false_graph.structured_outputs)
+                               false_graph.structured_outputs,
+                               expand_composites=True)
   except (ValueError, TypeError) as e:
-    raise ValueError("Outputs of true_fn and false_fn must have the same "
-                     "structure: %s" % str(e))
+    error(str(e))
+
+  assert len(true_graph.outputs) == len(false_graph.outputs)
+  for true_out, false_out in zip(true_graph.outputs, false_graph.outputs):
+    if true_out.dtype != false_out.dtype:
+      error("%s and %s have different types" % (true_out, false_out))
 
 
 def _get_output_shapes(true_graph_outputs, false_graph_outputs):
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index ccfe3b65c2d90b37836e2e48c3235f399f77df49..f61d681b7bc7c9302939a4018676f088f7c72d65 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -182,6 +182,7 @@ def confusion_matrix(labels,
           predictions)
 
     if weights is not None:
+      weights = ops.convert_to_tensor(weights, name='weights')
       predictions.get_shape().assert_is_compatible_with(weights.get_shape())
       weights = math_ops.cast(weights, dtype)
 
@@ -190,8 +191,10 @@ def confusion_matrix(labels,
     values = (array_ops.ones_like(predictions, dtype)
               if weights is None else weights)
     cm_sparse = sparse_tensor.SparseTensor(
-        indices=indices, values=values, dense_shape=math_ops.to_int64(shape))
-    zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)
+        indices=indices,
+        values=values,
+        dense_shape=math_ops.cast(shape, dtypes.int64))
+    zero_matrix = array_ops.zeros(math_ops.cast(shape, dtypes.int32), dtype)
 
     return sparse_ops.sparse_add(zero_matrix, cm_sparse)
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 3b3bfbbfc2da256f2e795bf400c53c2163a070eb..32a5db2c1ae0687f3c9954e943735a7748a2b777 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -30,11 +30,11 @@ import six
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -59,13 +59,13 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 # This is to avoid a circular dependency:
-# cond_v2 -> gradients_impl -> control_flow_ops
+# cond_v2 -> gradients_util -> control_flow_ops
 cond_v2 = LazyLoader("cond_v2", globals(),
                      "tensorflow.python.ops.cond_v2")
 
 # This is to avoid circular dependencies:
 # while_v2 -> control_flow_ops
-# while_v2 -> gradients_impl -> control_flow_ops
+# while_v2 -> gradients_util -> control_flow_ops
 while_v2 = LazyLoader("while_v2", globals(),
                       "tensorflow.python.ops.while_v2")
 
@@ -181,47 +181,29 @@ def _Identity(data, name=None):
   Returns:
     A Tensor with the same type and value as the input Tensor.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_array_ops.ref_identity(data, name=name)
     else:
       return array_ops.identity(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_Identity, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Identity(data.values, name=name)
-    indices = array_ops.identity(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = array_ops.identity(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = array_ops.identity(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _NextIteration(data, name=None):
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_NextIteration, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _NextIteration(data.values, name=name)
-    indices = next_iteration(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = next_iteration(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = next_iteration(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _Enter(data,
@@ -244,12 +226,13 @@ def _Enter(data,
     is_constant: If true, the output is constant within the child frame.
     parallel_iterations: The number of iterations allowed to run in parallel.
     use_ref: If true, use ref_enter if data is of ref type.
+    use_input_shape: If true, set the result's shape based on data's shape.
     name: A name for this operation (optional).
 
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
       result = gen_control_flow_ops.ref_enter(
@@ -260,46 +243,13 @@ def _Enter(data,
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    def enter_component(t):
+      return _Enter(t, frame_name, is_constant, parallel_iterations,
+                    use_ref, use_input_shape)
+    return nest.map_structure(enter_component, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(
-        data.values,
-        frame_name,
-        is_constant,
-        parallel_iterations=parallel_iterations,
-        use_input_shape=use_input_shape,
-        name=name)
-    indices = gen_control_flow_ops.enter(
-        data.indices,
-        frame_name,
-        is_constant,
-        parallel_iterations,
-        name="indices")
-    if use_input_shape:
-      indices.set_shape(data.indices.get_shape())
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops.enter(
-            dense_shape,
-            frame_name,
-            is_constant,
-            parallel_iterations,
-            name="dense_shape")
-        if use_input_shape:
-          dense_shape.set_shape(data.dense_shape.get_shape())
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops.enter(
-          data.dense_shape,
-          frame_name,
-          is_constant,
-          parallel_iterations,
-          name="dense_shape")
-      if use_input_shape:
-        dense_shape.set_shape(data.dense_shape.get_shape())
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def exit(data, name=None):  # pylint: disable=redefined-builtin
@@ -314,25 +264,16 @@ def exit(data, name=None):  # pylint: disable=redefined-builtin
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_control_flow_ops.ref_exit(data, name)
     else:
       return gen_control_flow_ops._exit(data, name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(exit, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = exit(data.values, name=name)
-    indices = gen_control_flow_ops._exit(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops._exit(dense_shape, name)
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops._exit(data.dense_shape, name)
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def switch(data, pred, dtype=None, name=None):
@@ -355,32 +296,19 @@ def switch(data, pred, dtype=None, name=None):
     to `output_true`, otherwise it goes to `output_false`.
   """
   with ops.name_scope(name, "Switch", [data, pred]) as name:
-    data = ops.internal_convert_to_tensor_or_indexed_slices(
+    data = ops.internal_convert_to_tensor_or_composite(
         data, dtype=dtype, name="data", as_ref=True)
     pred = ops.convert_to_tensor(pred, name="pred")
     if isinstance(data, ops.Tensor):
       return gen_control_flow_ops.switch(data, pred, name=name)
     else:
-      if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
+      if not isinstance(data, composite_tensor.CompositeTensor):
         raise TypeError("Type %s not supported" % type(data))
-      val, ind = data.values, data.indices
-      val_f, val_t = gen_control_flow_ops.switch(val, pred, name=name)
-      ind_f, ind_t = gen_control_flow_ops.switch(ind, pred, name="indices")
-      if isinstance(data, ops.IndexedSlices):
-        dense_shape = data.dense_shape
-        if dense_shape is not None:
-          dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-              dense_shape, pred, name="dense_shape")
-        else:
-          dense_shape_f, dense_shape_t = None, None
-        return (ops.IndexedSlices(val_f, ind_f, dense_shape_f),
-                ops.IndexedSlices(val_t, ind_t, dense_shape_t))
-      else:
-        dense_shape = data.dense_shape
-        dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-            data.dense_shape, pred, name="dense_shape")
-        return (sparse_tensor.SparseTensor(ind_f, val_f, dense_shape_f),
-                sparse_tensor.SparseTensor(ind_t, val_t, dense_shape_t))
+      tensors = nest.flatten(data, expand_composites=True)
+      mapped = [gen_control_flow_ops.switch(tensor, pred) for tensor in tensors]
+      mapped_f, mapped_t = zip(*mapped)
+      return (nest.pack_sequence_as(data, mapped_f, expand_composites=True),
+              nest.pack_sequence_as(data, mapped_t, expand_composites=True))
 
 
 def _SwitchRefOrTensor(data, pred, name="Switch"):
@@ -403,7 +331,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"):
   Raises:
     TypeError: if data is not a Tensor or IndexedSlices
   """
-  data = ops.convert_to_tensor_or_indexed_slices(data, name="data")
+  data = ops.convert_to_tensor_or_composite(data, name="data")
   # NOTE(vrv): ops.colocate_with(data, ignore_existing=True) below
   # addresses the following scenario.
   #
@@ -456,7 +384,7 @@ def merge(inputs, name=None):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
     inputs = [
-        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        ops.internal_convert_to_tensor_or_composite(inp, as_ref=True)
         for inp in inputs
     ]
     if all(isinstance(v, ops.Tensor) for v in inputs):
@@ -464,30 +392,27 @@ def merge(inputs, name=None):
         return gen_control_flow_ops.ref_merge(inputs, name)
       else:
         return gen_control_flow_ops.merge(inputs, name)
-    elif all(isinstance(v, sparse_tensor.SparseTensor) for v in inputs):
-      # Only handle the case when all inputs are SparseTensor.
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      dense_shape, _ = gen_control_flow_ops.merge(
-          [inp.dense_shape for inp in inputs], name="dense_shape")
-      return (sparse_tensor.SparseTensor(indices, values, dense_shape),
-              chosen_index)
     else:
-      # For now convert all the inputs as IndexedSlices.
-      inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      if any(inp.dense_shape is not None for inp in inputs):
-        if any(inp.dense_shape is None for inp in inputs):
-          raise ValueError("Either all merged IndexedSlices must have a "
-                           "dense_shape, or none must have a dense_shape.")
-        dense_shape, _ = gen_control_flow_ops.merge(
-            [inp.dense_shape for inp in inputs], name="dense_shape")
-      else:
-        dense_shape = None
-      return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+      # If there is a mix of tensors and indexed slices, then convert the
+      # tensors to indexed slices.
+      if all(isinstance(v, (ops.IndexedSlices, ops.Tensor)) for v in inputs):
+        inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
+
+      for v in inputs:
+        if not isinstance(v, composite_tensor.CompositeTensor):
+          raise TypeError("Type %s not supported" % type(v))
+
+      for v in inputs[1:]:
+        nest.assert_same_structure(inputs[0], v, expand_composites=True)
+
+      flat_inputs = [nest.flatten(v, expand_composites=True) for v in inputs]
+      merged_results = [gen_control_flow_ops.merge(component)
+                        for component in zip(*flat_inputs)]
+      flat_merged = [tensor for (tensor, _) in merged_results]
+      chosen_index = merged_results[0][1]
+      merged_inputs = nest.pack_sequence_as(inputs[0], flat_merged,
+                                            expand_composites=True)
+      return (merged_inputs, chosen_index)
 
 
 # pylint: enable=protected-access
@@ -537,6 +462,30 @@ def _ShapeLessThanOrEqual(shape1, shape2):
   return True
 
 
+def _get_shape_invariant(var, shape=None):
+  """Returns a shape invariant for the given variable.
+
+  If `var` is a `CompositeTensor`, then this uses
+  `_shape_invariant_to_components()` to get shape invariants for the
+  component tensors.
+
+  Args:
+    var: The tensor whose shape is described.
+    shape: The shape invariant for the tensor.  If not specified, then a default
+      shape invariant for `var` is returned.
+
+  Returns:
+    The shape invariant for `var` (if it is a `Tensor`), or the shape invariants
+    for the components that comprise `var` (if it is a `CompositeTensor`).
+  """
+  if isinstance(var, composite_tensor.CompositeTensor):
+    return var._shape_invariant_to_components(shape)  # pylint: disable=protected-access
+  elif shape is None:
+    return var.shape
+  else:
+    return shape
+
+
 def _SetShapeInvariants(input_vars, enter_vars, shapes):
   """Set the shapes of the tensors in `enter_vars` to `shapes`.
 
@@ -566,31 +515,7 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
             (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
-      if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-        raise TypeError("Type %s not supported" % type(var))
-      if isinstance(var, ops.IndexedSlices):
-        if not _ShapeLessThanOrEqual(inp.values.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the values tensor of this IndexedSlices. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
-                                    shape))
-        var.values.set_shape(shape)
-        var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
-        if var.dense_shape is not None:
-          var.dense_shape.set_shape(tensor_shape.TensorShape([shape.ndims]))
-      else:
-        if not _ShapeLessThanOrEqual(inp.dense_shape.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the shape tensor of this SparseTensor. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.dense_shape.name,
-                                    inp.dense_shape.get_shape(), shape))
-        var.values.set_shape(tensor_shape.TensorShape([None]))
-        var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
-        var.dense_shape.set_shape(shape)
+      raise TypeError("Type %s not supported" % type(var))
 
 
 def _EnforceShapeInvariant(merge_var, next_var):
@@ -619,49 +544,7 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "use the `shape_invariants` argument of tf.while_loop to specify a "
           "less-specific shape." % (input_t.name, input_t.shape, n_shape))
   else:
-    if not isinstance(merge_var,
-                      (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(merge_var))
-    if isinstance(merge_var, ops.IndexedSlices):
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = tensor_shape.TensorShape(None)
-      if merge_var.dense_shape is not None:
-        m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = tensor_shape.TensorShape(None)
-      if next_var.dense_shape is not None:
-        n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape)):
-        if not _ShapeLessThanOrEqual(n_values_shape, m_values_shape):
-          raise ValueError(
-              "The shape for %s is not an invariant for the loop. It enters "
-              "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-              "after one iteration. Provide shape invariants using either the "
-              "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables." %
-              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-               n_values_shape, n_indices_shape, n_shape_shape))
-    else:
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
-          not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
-        raise ValueError(
-            "The shape for %s is not an invariant for the loop. It enters "
-            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-            "after one iteration. Provide shape invariants using either "
-            "the `shape_invariants` argument of tf.while_loop or set_shape() "
-            "on the loop variables." %
-            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-             n_values_shape, n_indices_shape, n_shape_shape))
+    raise TypeError("Type %s not supported" % type(merge_var))
 
 
 def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
@@ -676,26 +559,15 @@ def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
       # TODO(skyewm): call this for other cases below (needs testing)
       _EnforceShapeInvariant(m, v)
     m.op._update_input(1, v)  # pylint: disable=protected-access
-  elif isinstance(m, ops.IndexedSlices):
+  elif isinstance(m, composite_tensor.CompositeTensor):
     # pylint: disable=protected-access
-    v = math_ops._as_indexed_slices(v, optimize=False)
-    v = _NextIteration(v)
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
+    def update_component(m_component, v_component):
+      m_component.op._update_input(1, v_component)
+    if isinstance(m, ops.IndexedSlices):
+      v = math_ops._as_indexed_slices(v, optimize=False)
     # pylint: enable=protected-access
-    if m.dense_shape is not None:
-      if v.dense_shape is None:
-        raise ValueError("Must have dense shape: %s" % v.name)
-      m.dense_shape.op._update_input(1, v.dense_shape)
-  elif isinstance(m, sparse_tensor.SparseTensor):
-    if not isinstance(v, sparse_tensor.SparseTensor):
-      raise ValueError("Must be a sparse tensor: %s" % v.name)
     v = _NextIteration(v)
-    # pylint: disable=protected-access
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
-    m.dense_shape.op._update_input(1, v.dense_shape)
-    # pylint: enable=protected-access
+    return nest.map_structure(update_component, m, v, expand_composites=True)
   else:
     raise TypeError("Type %s not supported" % type(m))
   return v
@@ -1613,7 +1485,8 @@ class ControlFlowContext(object):
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
-      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result)
+      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result,
+                         expand_composites=True)
 
   def GetWhileContext(self):
     """Return the while context containing this context."""
@@ -1920,19 +1793,9 @@ class CondContext(ControlFlowContext):
     if isinstance(v, ops.Operation):
       # Use pivot as the proxy for this op.
       return with_dependencies([v], self._pivot)
-    elif isinstance(v, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      values = self._ProcessOutputTensor(v.values)
-      indices = self._ProcessOutputTensor(v.indices)
-      if isinstance(v, ops.IndexedSlices):
-        dense_shape = v.dense_shape
-        if dense_shape is not None:
-          dense_shape = self._ProcessOutputTensor(dense_shape)
-        return ops.IndexedSlices(values, indices, dense_shape)
-      else:
-        dense_shape = self._ProcessOutputTensor(v.dense_shape)
-        return sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
-      v = nest.map_structure(_convert_tensorarray_to_flow, v)
+      v = nest.map_structure(_convert_tensorarray_to_flow, v,
+                             expand_composites=True)
       return self._ProcessOutputTensor(ops.convert_to_tensor(v))
 
   def BuildCondBranch(self, fn):
@@ -1949,11 +1812,13 @@ class CondContext(ControlFlowContext):
           return no_op(), None
         else:
           original_result = nest.map_structure(array_ops.identity,
-                                               original_result)
+                                               original_result,
+                                               expand_composites=True)
     if original_result is None:
       return None, None
 
-    result = nest.map_structure(self._BuildCondTensor, original_result)
+    result = nest.map_structure(self._BuildCondTensor, original_result,
+                                expand_composites=True)
     if not isinstance(result, (list, _basetuple)):
       result = [result]
     return original_result, result
@@ -2120,7 +1985,8 @@ def cond(pred,
 
     # Check that the return values of the two branches have the same structure.
     try:
-      nest.assert_same_structure(orig_res_t, orig_res_f)
+      nest.assert_same_structure(orig_res_t, orig_res_f,
+                                 expand_composites=True)
     except TypeError as e:
       raise TypeError(
           "Incompatible return types of true_fn and false_fn: {}".format(e))
@@ -2132,24 +1998,21 @@ def cond(pred,
     if not res_t:
       raise ValueError("true_fn and false_fn must return at least one result.")
 
-    res_t_flat = nest.flatten(res_t)
-    res_f_flat = nest.flatten(res_f)
-
-    for x, y in zip(res_t_flat, res_f_flat):
-      assert ((isinstance(x, ops.IndexedSlices) and
-               isinstance(y, ops.IndexedSlices)) or
-              (isinstance(x, sparse_tensor.SparseTensor) and
-               isinstance(y, sparse_tensor.SparseTensor)) or
-              (isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)))
-      val_x = x if isinstance(x, ops.Tensor) else x.values
-      val_y = y if isinstance(y, ops.Tensor) else y.values
-      if val_x.dtype.base_dtype != val_y.dtype.base_dtype:
-        raise ValueError(
-            "Outputs of true_fn and false_fn must have the same type: %s, %s" %
-            (val_x.dtype.name, val_y.dtype.name))
+    res_t_flat = nest.flatten(res_t, expand_composites=True)
+    res_f_flat = nest.flatten(res_f, expand_composites=True)
+
+    for i, (x, y) in enumerate(zip(res_t_flat, res_f_flat)):
+      assert isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)
+      if x.dtype.base_dtype != y.dtype.base_dtype:
+        _cast_indexed_slice_indices(res_t, res_t_flat, res_f_flat)
+        if res_t_flat[i].dtype.base_dtype != res_f_flat[i].dtype.base_dtype:
+          raise ValueError(
+              "Outputs of true_fn and false_fn must have the same type: "
+              "%s, %s" % (x.dtype.name, y.dtype.name))
 
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
-    merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
+    merges = _convert_flows_to_tensorarrays(
+        nest.flatten(orig_res_t, expand_composites=True), merges)
 
     # Only add non-nested conds to the collection. Any nested control flow will
     # be encapsulated in the root context.
@@ -2158,7 +2021,8 @@ def cond(pred,
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
-    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
+    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges,
+                                   expand_composites=True)
 
     # Singleton lists and tuples are automatically unpacked if strict == False.
     if not strict:
@@ -2166,6 +2030,48 @@ def cond(pred,
     return merges
 
 
+def _cast_indexed_slice_indices(structure, flat_a, flat_b):
+  """Cast IndexedSlice.indices from int32 to int64 where necessary.
+
+  For each `IndexedSlices` in the nested structure `structure`, find its
+  indices `Tensor` in the corresponding flattened lists `flat_a` and `flat_b`
+  (where composites have been expanded); and if those indices tensors have
+  different dtypes (i.e., if one is int64 but the other is int32), then cast
+  them to both be int64.
+
+  Args:
+    structure: The nested structure that was flattened.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+  """
+  # Find the locations (in flat_a and flat_b) of the IndexedSlices'
+  # indices tensors.
+  indexed_slice_indices = []
+  current_index = 0
+  for item in nest.flatten(structure, expand_composites=False):
+    if isinstance(item, ops.IndexedSlices):
+      # indices is the second component of the composite tensor.
+      indexed_slice_indices.append(current_index + 1)
+    if nest.is_sequence_or_composite(item):
+      current_index += len(nest.flatten(item, expand_composites=True))
+    else:
+      current_index += 1
+  assert current_index == len(flat_a)
+
+  for index in indexed_slice_indices:
+    assert flat_a[index].dtype in (dtypes.int32, dtypes.int64)
+    assert flat_b[index].dtype in (dtypes.int32, dtypes.int64)
+    if flat_a[index].dtype != flat_b[index].dtype:
+      if flat_b[index].dtype == dtypes.int32:
+        flat_b[index] = math_ops.cast(flat_b[index], dtypes.int64)
+      else:
+        flat_a[index] = math_ops.cast(flat_a[index], dtypes.int64)
+
+
 # pylint: enable=g-doc-args
 # pylint: enable=redefined-outer-name
 
@@ -2939,21 +2845,12 @@ class WhileContext(ControlFlowContext):
       if isinstance(x, ops.Tensor):
         self._values.add(x.name)
       else:
-        self._values.add(x.values.name)
-        self._values.add(x.indices.name)
-        if isinstance(x, ops.IndexedSlices):
-          dense_shape = x.dense_shape
-        elif isinstance(x, sparse_tensor.SparseTensor):
-          dense_shape = x.dense_shape
-        else:
-          raise TypeError("Type %s not supported" % type(x))
-        if dense_shape is not None:
-          self._values.add(dense_shape.name)
+        raise TypeError("Type %s not supported" % type(x))
 
   def _BuildLoop(self, pred, body, original_loop_vars, loop_vars,
                  shape_invariants):
     """Core: Add the loop termination condition and body to the graph."""
-    flat_loop_vars = nest.flatten(original_loop_vars)
+    flat_loop_vars = nest.flatten(original_loop_vars, expand_composites=True)
 
     # Let the context know the loop variables so the loop variables
     # would be added in the outer contexts properly.
@@ -3005,7 +2902,8 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, merge_vars))
     packed_vars = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=merge_vars_with_tensor_arrays)
+        flat_sequence=merge_vars_with_tensor_arrays,
+        expand_composites=True)
     c = ops.convert_to_tensor(pred(*packed_vars))
     self._pivot = loop_cond(c, name="LoopCond")
     switch_vars = [_SwitchRefOrTensor(x, self._pivot) for x in merge_vars]
@@ -3019,11 +2917,12 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, vars_for_body))
     packed_vars_for_body = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=vars_for_body_with_tensor_arrays)
+        flat_sequence=vars_for_body_with_tensor_arrays,
+        expand_composites=True)
     pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     body_result = body(*packed_vars_for_body)
     post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
-    if not nest.is_sequence(body_result):
+    if not nest.is_sequence_or_composite(body_result):
       body_result = [body_result]
     if len(post_summaries) > len(pre_summaries):
       new_summaries = post_summaries[len(pre_summaries):]
@@ -3037,20 +2936,24 @@ class WhileContext(ControlFlowContext):
             return x
           return array_ops.identity(x)
 
-        body_result = nest.map_structure(map_fn, body_result)
+        body_result = nest.map_structure(map_fn, body_result,
+                                         expand_composites=True)
 
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
     # outputs of the body are typically tuples.
-    nest.assert_same_structure(list(packed_vars_for_body), list(body_result))
+    nest.assert_same_structure(list(packed_vars_for_body), list(body_result),
+                               expand_composites=True)
 
     # Store body_result to keep track of TensorArrays returned by body
     original_body_result = body_result
     # Convert TensorArrays returned by body into their flow variables
-    result = nest.map_structure(_convert_tensorarray_to_flow,
-                                nest.flatten(body_result))
-    result = ops.convert_n_to_tensor_or_indexed_slices(result)
+    result = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(body_result, expand_composites=True),
+        expand_composites=True)
+    result = ops.convert_n_to_tensor_or_composite(result)
 
     # Add NextIteration and the back edges to complete the loop.
     if len(merge_vars) != len(result):
@@ -3076,9 +2979,15 @@ class WhileContext(ControlFlowContext):
     # Keep original_loop_vars to identify which are TensorArrays
     original_loop_vars = loop_vars
     # Convert TensorArrays to their flow variables
-    loop_vars = nest.map_structure(_convert_tensorarray_to_flow,
-                                   nest.flatten(loop_vars))
-    loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
+    loop_vars = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(loop_vars, expand_composites=False),
+        expand_composites=True)
+    loop_vars = ops.convert_n_to_tensor_or_composite(loop_vars)
+    if shape_invariants is None:
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, expand_composites=False)
+    loop_vars = nest.flatten(loop_vars, expand_composites=True)
     try:
       self.Enter()
       # _BuildLoop calls _update_input in several places. _mutation_lock()
@@ -3090,14 +2999,15 @@ class WhileContext(ControlFlowContext):
     finally:
       self.Exit()
 
-    flat_result = nest.flatten(original_body_result)
+    flat_result = nest.flatten(original_body_result, expand_composites=True)
     # Convert TensorArray flow variables outside the context back into
     # their associated TensorArrays for returning to caller.
     exit_vars_with_tensor_arrays = (
         _convert_flows_to_tensorarrays(flat_result, exit_vars))
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
-        flat_sequence=exit_vars_with_tensor_arrays)
+        flat_sequence=exit_vars_with_tensor_arrays,
+        expand_composites=True)
 
     if return_same_structure:
       return packed_exit_vars
@@ -3111,12 +3021,7 @@ class WhileContext(ControlFlowContext):
       if isinstance(e, ops.Tensor):
         xs = [e]
       else:
-        if not isinstance(e, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-          raise TypeError("Type %s not supported" % type(e))
-        xs = [e.values, e.indices]
-        shape = e.dense_shape
-        if shape is not None:
-          xs.append(shape)
+        raise TypeError("Type %s not supported" % type(e))
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
@@ -3538,6 +3443,12 @@ def while_loop(cond,
         if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
           packed = True
           loop_vars = (loop_vars,)
+
+      def convert(x):
+        if isinstance(x, tensor_array_ops.TensorArray):
+          return x
+        return ops.convert_to_tensor(x)
+      loop_vars = nest.map_structure(convert, loop_vars)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
@@ -3546,7 +3457,12 @@ def while_loop(cond,
     if shape_invariants is not None:
       if maximum_iterations is not None:
         shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
-      nest.assert_same_structure(loop_vars, shape_invariants)
+
+      nest.assert_same_structure(loop_vars, shape_invariants,
+                                 expand_composites=False)
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, shape_invariants,
+          expand_composites=False)
 
     loop_context = WhileContext(
         maximum_iterations=maximum_iterations,
@@ -3588,7 +3504,7 @@ def _AsTensorList(x, p):
   for v in x:
     if isinstance(v, ops.Operation):
       v = with_dependencies([v], p)
-    v = ops.convert_to_tensor_or_indexed_slices(v)
+    v = ops.convert_to_tensor_or_composite(v)
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
@@ -3636,7 +3552,7 @@ def with_dependencies(dependencies, output_tensor, name=None):
                       list(dependencies) + [output_tensor]) as name:
     with ops.colocate_with(output_tensor):
       with ops.control_dependencies(dependencies):
-        output_tensor = ops.convert_to_tensor_or_indexed_slices(output_tensor)
+        output_tensor = ops.convert_to_tensor_or_composite(output_tensor)
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
@@ -3687,7 +3603,7 @@ def group(*inputs, **kwargs):
 
     # Sorts *inputs according to their devices.
     ops_on_device = {}  # device -> operations specified on the device.
-    for inp in nest.flatten(inputs):
+    for inp in nest.flatten(inputs, expand_composites=True):
       if not hasattr(inp, "device"):
         raise TypeError("Expected tf.group() expected Tensor arguments not "
                         "'%s' with type '%s'" % (inp, type(inp)))
@@ -3709,7 +3625,7 @@ def group(*inputs, **kwargs):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
 
-    for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
+    for dev in sorted(ops_on_device, key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
     with ops.control_dependencies(deps):
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index ff0dff0042e409cc12131ca4e97731a210c6203b..e6fdbe34ec5e0504db749e273810ce1e8820d9da 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -331,7 +331,7 @@ def CheckInputFromValidContext(op, input_op):
     if while_ctxt:
       error_msg = (
           "Cannot use '%s' as input to '%s' because they are in different while"
-          " loops." % (op.name, input_op.name))
+          " loops." % (input_op.name, op.name))
     else:
       error_msg = (
           "Cannot use '%s' as input to '%s' because '%s' is in a while loop."
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index 58917ad264a56578bb4c98ff9a3ef0b63a3cbf12..cd37419906b1dc46f851cad35b83e93500261cf7 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -115,8 +115,26 @@ def maybe_set_lowering_attr(op):
     op: An `If` or `While` Operation.
   """
   if (not control_flow_util.GraphOrParentsInXlaContext(op.graph) and
-      context.context().get_function_call_options().executor_type
-      != "SINGLE_THREADED_EXECUTOR"):
+      context.context().function_call_options.executor_type !=
+      "SINGLE_THREADED_EXECUTOR"):
     # pylint: disable=protected-access
     op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
     # pylint: enable=protected-access
+
+
+def maybe_propagate_compile_time_consts_in_xla(op):
+  """Tells XLA whether to propagate compile-time consts in the loop body.
+
+  This is needed to make compile time constants available to ops, for example
+  `max_num_elements` in `EmptyTensorList`, inside the loop body. Ideally this
+  would always be turned on, but that doesn't work with legacy functionalized
+  while_loops.
+
+  Args:
+    op: A `While` Operation.
+  """
+  if control_flow_util.GraphOrParentsInXlaContext(op.graph):
+    # pylint: disable=protected-access
+    op._set_attr("_xla_propagate_compile_time_consts",
+                 attr_value_pb2.AttrValue(b=True))
+    # pylint: enable=protected-access
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/python/ops/critical_section_ops.py
similarity index 92%
rename from tensorflow/contrib/framework/python/ops/critical_section_ops.py
rename to tensorflow/python/ops/critical_section_ops.py
index 71ab755aa2948c548db89b330bb93c9524412fa6..21872ffff139b3f5b74d044746a83f3ce5ab265b 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/python/ops/critical_section_ops.py
@@ -31,6 +31,10 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+__all__ = ["CriticalSection"]
 
 
 # Graph Keys
@@ -66,6 +70,7 @@ def _get_colocation(op):
     return None
 
 
+@tf_export("CriticalSection")
 class CriticalSection(object):
   """Critical section.
 
@@ -179,37 +184,36 @@ class CriticalSection(object):
   def name(self):
     return self._handle.op.name
 
-  def execute(self, fn, *args, **kwargs):
-    """Execute function `fn(*args, **kwargs)` inside the CriticalSection.
+  def execute(self, fn, exclusive_resource_access=True, name=None):
+    """Execute function `fn()` inside the critical section.
+
+    `fn` should not accept any arguments.  To add extra arguments to when
+    calling `fn` in the critical section, create a lambda:
+
+    ```python
+    critical_section.execute(lambda: fn(*my_args, **my_kwargs))
+    ```
 
     Args:
       fn: The function to execute.  Must return at least one tensor.
-      *args: Additional positional arguments to `fn`.
-      **kwargs: Additional keyword arguments to `fn`.
-        Several keywords are reserved for `execute`.  These are:
-
-        - name; The name to use when creating the execute operation.
-        - exclusive_resource_access; Whether the resources required by
-          `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
-          You may want to set this to `False` if you will be accessing a
-          resource in read-only mode in two different CriticalSections.
+      exclusive_resource_access: Whether the resources required by
+        `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
+        You may want to set this to `False` if you will be accessing a
+        resource in read-only mode in two different CriticalSections.
+      name: The name to use when creating the execute operation.
 
     Returns:
-      The tensors returned from `fn(*args, **kwargs)`.
+      The tensors returned from `fn()`.
 
     Raises:
       ValueError: If `fn` attempts to lock this `CriticalSection` in any nested
         or lazy way that may cause a deadlock.
-      ValueError: If `exclusive_resource_access` is not provided (is `True`) and
+      ValueError: If `exclusive_resource_access == True` and
         another `CriticalSection` has an execution requesting the same
-        resources as in `*args`, `**kwargs`, and any additionally captured
-        inputs in `fn`.  Note, even if `exclusive_resource_access` is `True`,
-        if another execution in another `CriticalSection` was created without
-        `exclusive_resource_access=True`, a `ValueError` will be raised.
+        resources as `fn``.  Note, even if `exclusive_resource_access` is
+        `True`, if another execution in another `CriticalSection` was created
+        without `exclusive_resource_access=True`, a `ValueError` will be raised.
     """
-    name = kwargs.pop("name", None)
-    exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
-
     with ops.name_scope(name, "critical_section_execute", []):
 
       # Ensure that mutex locking only happens *after* all args and
@@ -222,7 +226,7 @@ class CriticalSection(object):
         with ops.get_default_graph()._lock:  # pylint: disable=protected-access
           existing_ops = ops.get_default_graph().get_operations()
           with ops.control_dependencies([lock]):
-            r = fn(*args, **kwargs)
+            r = fn()
           # TODO(ebrevdo): If creating critical sections in a python loop, this
           # makes graph creation time quadratic.  Revisit if this
           # becomes a problem.
@@ -230,7 +234,7 @@ class CriticalSection(object):
                          .difference(existing_ops))
       else:
         with ops.control_dependencies([lock]):
-          r = fn(*args, **kwargs)
+          r = fn()
 
       if not context.executing_eagerly():
         self._add_control_dependencies_to_lock(created_ops, lock.op)
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 45286f7c188a3e891b5bf3f332f546bed627e102..80502daaac3b0daba1e207c7ccd76f6ec6eb2f72 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -31,20 +31,23 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops import inplace_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
 @tf_export(v1=["nn.ctc_loss"])
-def ctc_loss(labels, inputs, sequence_length,
+def ctc_loss(labels, inputs=None, sequence_length=None,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
-             ignore_longer_outputs_than_inputs=False, time_major=True):
+             ignore_longer_outputs_than_inputs=False, time_major=True,
+             logits=None):
   """Computes the CTC (Connectionist Temporal Classification) Loss.
 
   This op implements the CTC loss as presented in the article:
@@ -142,6 +145,7 @@ def ctc_loss(labels, inputs, sequence_length,
       avoids transposes at the beginning of the ctc_loss calculation.  However,
       most TensorFlow data is batch-major, so by this function also accepts
       inputs in batch-major form.
+    logits: Alias for inputs.
 
   Returns:
     A 1-D `float` `Tensor`, size `[batch]`, containing the negative log
@@ -156,6 +160,8 @@ def ctc_loss(labels, inputs, sequence_length,
     raise TypeError("Expected labels (first argument) to be a SparseTensor")
 
   # For internal calculations, we transpose to [time, batch, num_classes]
+  inputs = deprecation.deprecated_argument_lookup(
+      "logits", logits, "inputs", inputs)
   if not time_major:
     inputs = array_ops.transpose(inputs, [1, 0, 2])  # (B,T,N) => (T,B,N)
 
@@ -904,7 +910,7 @@ def ctc_unique_labels(labels, name=None):
           u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
       y = math_ops.cast(y, dtypes.int64)
       return [y, u.idx]
-    return functional_ops.map_fn(
+    return map_fn.map_fn(
         _unique, labels, dtype=[dtypes.int64, dtypes.int32])
 
 
diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py
index d4c182a802ad52dc431dde5b184ebb79cb733dc5..9ce906121f2b39d7c9370e57bb8293cfca4fa9c0 100644
--- a/tensorflow/python/ops/cudnn_rnn_grad.py
+++ b/tensorflow/python/ops/cudnn_rnn_grad.py
@@ -97,6 +97,7 @@ def _cudnn_rnn_backwardv3(op, *grads):
       dropout=op.get_attr("dropout"),
       seed=op.get_attr("seed"),
       seed2=op.get_attr("seed2"),
+      time_major=op.get_attr("time_major"),
       rnn_mode=op.get_attr("rnn_mode"),
       input_mode=op.get_attr("input_mode"),
       direction=op.get_attr("direction")) + (None,)
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d96601ac21c7d7d62423b65a2e43d08449e23129..33b1651a0408af83475282ffc98c92d15e446ce2 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -56,7 +56,9 @@ def copy_handle_data(source_t, target_t):
       handle_data = source_t._handle_data  # pylint: disable=protected-access
     else:
       handle_data = resource_variable_ops.get_resource_handle_data(source_t)
-    if handle_data is not None and handle_data.is_set:
+    if (handle_data is not None
+        and handle_data.is_set
+        and handle_data.shape_and_type):
       # pylint: disable=protected-access
       pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
                                               target_t._as_tf_output(),
@@ -183,7 +185,7 @@ def _graph_mode_decorator(f, *args, **kwargs):
                    current_var_scope.local_variables())
   new_vars = after_vars - before_vars
   for v in new_vars:
-    if not isinstance(v, resource_variable_ops.ResourceVariable):
+    if not resource_variable_ops.is_resource_variable(v):
       raise TypeError(
           "All variables used by a function wrapped with @custom_gradient must "
           "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
@@ -236,6 +238,9 @@ def _graph_mode_decorator(f, *args, **kwargs):
   original_tensors = all_tensors
   with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
     all_tensors = array_ops.identity_n(all_tensors)
+
+  original_tensors = [ops.convert_to_tensor(x) for x in original_tensors]
+
   # Propagate handle data for happier shape inference for resource variables.
   for i, t in enumerate(original_tensors):
     if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
@@ -266,6 +271,9 @@ def _eager_mode_decorator(f, *args, **kwargs):
   # TODO(apassos) consider removing the identity below.
   flat_result = [gen_array_ops.identity(x) for x in flat_result]
 
+  input_tensors = [ops.convert_to_tensor(x) for x
+                   in list(args) + list(variables)]
+  arg_count = len(args)
   def actual_grad_fn(*result_grads):
     """Custom grad fn wrapper."""
     if variables:
@@ -276,10 +284,13 @@ def _eager_mode_decorator(f, *args, **kwargs):
     else:
       input_grads = grad_fn(*result_grads)
       variable_grads = []
+    flat_grads = nest.flatten(input_grads)
+    if len(flat_grads) != arg_count:
+      raise ValueError(
+          "custom_gradient function expected to return", arg_count,
+          "gradients but returned", len(flat_grads), "instead.")
     return nest.flatten(input_grads) + variable_grads
 
-  input_tensors = [ops.convert_to_tensor(x) for x
-                   in list(args) + list(variables)]
   tape_lib.record_operation(f.__name__, flat_result, input_tensors,
                             actual_grad_fn)
   flat_result = list(flat_result)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 1557bdf0eda90c26a97ce83239190dd6f9023a58..332e03244ce7c895bf992dc9e00a3aefd21ba97b 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1233,7 +1233,7 @@ class ConditionalAccumulatorBase(object):
     """
     return gen_data_flow_ops.accumulator_set_global_step(
         self._accumulator_ref,
-        math_ops.to_int64(ops.convert_to_tensor(new_global_step)),
+        math_ops.cast(ops.convert_to_tensor(new_global_step), _dtypes.int64),
         name=name)
 
 
@@ -1291,7 +1291,7 @@ class ConditionalAccumulator(ConditionalAccumulatorBase):
     """
     grad = ops.convert_to_tensor(grad, self._dtype)
     grad.get_shape().assert_is_compatible_with(self._shape)
-    local_step = math_ops.to_int64(ops.convert_to_tensor(local_step))
+    local_step = math_ops.cast(ops.convert_to_tensor(local_step), _dtypes.int64)
     return gen_data_flow_ops.accumulator_apply_gradient(
         self._accumulator_ref, local_step=local_step, gradient=grad, name=name)
 
@@ -1423,14 +1423,14 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
     Raises:
       InvalidArgumentError: If grad is of the wrong shape
     """
-    local_step = math_ops.to_int64(ops.convert_to_tensor(local_step))
+    local_step = math_ops.cast(ops.convert_to_tensor(local_step), _dtypes.int64)
     return gen_data_flow_ops.sparse_accumulator_apply_gradient(
         self._accumulator_ref,
         local_step=local_step,
-        gradient_indices=math_ops.to_int64(grad_indices),
+        gradient_indices=math_ops.cast(grad_indices, _dtypes.int64),
         gradient_values=grad_values,
-        gradient_shape=math_ops.to_int64([]
-                                         if grad_shape is None else grad_shape),
+        gradient_shape=math_ops.cast(
+            [] if grad_shape is None else grad_shape, _dtypes.int64),
         has_known_shape=(grad_shape is not None),
         name=name)
 
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 9c63385dd0152aae48b1f92fd8d350fc910fe564..a347cfdec1585f87ba0bf5e2e6fa604367604c7b 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -462,7 +462,7 @@ class Bijector(object):
 
 
   ```python
-  abs = tf.contrib.distributions.bijectors.AbsoluteValue()
+  abs = tfp.distributions.bijectors.AbsoluteValue()
 
   abs.forward(-1.)
   ==> 1.
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 97d2b1b26c68dc53f0a77120c9d3820c1d0f017b..1b2dbfaf9fe4de3ca8f439737f36b71e26e8e368 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -23,7 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -266,7 +266,7 @@ class Multinomial(distribution.Distribution):
       x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)  # [n, k]
       return x
 
-    x = functional_ops.map_fn(
+    x = map_fn.map_fn(
         _sample_single, [flat_logits, flat_ndraws],
         dtype=self.dtype)  # [B1B2...Bm, n, k]
 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 9acc0469885c2463e84f875314f07d1f3d55481a..0b36054db2f15538037c2f5f64a2b762c58e5105 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -291,5 +291,5 @@ def _kl_normal_normal(n_a, n_b, name=None):
     s_a_squared = math_ops.square(n_a.scale)
     s_b_squared = math_ops.square(n_b.scale)
     ratio = s_a_squared / s_b_squared
-    return (math_ops.square(n_a.loc - n_b.loc) / (two * s_b_squared) +
-            half * (ratio - one - math_ops.log(ratio)))
+    return (math_ops.squared_difference(n_a.loc, n_b.loc) / (two * s_b_squared)
+            + half * (ratio - one - math_ops.log(ratio)))
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1becfc18778e998d1a84594273e1637e580f2aad..3c6476864a0bb05feec828d69de8fb8bc138a74b 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -167,7 +167,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   distribution:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Exp(),
@@ -177,7 +177,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   A `LogNormal` made from callables:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Inline(
@@ -191,7 +191,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   Another example constructing a Normal from a StandardNormal:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
@@ -209,7 +209,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   multivariate Normal as a `TransformedDistribution`.
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   # We will create two MVNs with batch_shape = event_shape = 2.
   mean = [[-1., 0],      # batch:0
           [0., 1]]       # batch:1
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index d0291e2095bdb6574c707c7458e4cc335fc4b825..f533a0e73f121bb8a40198fbfa7a7b058d6f3b8e 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -320,7 +320,6 @@ def embedding_lookup(
 def embedding_lookup_v2(
     params,
     ids,
-    partition_strategy="mod",
     max_norm=None,
     name=None):
   """Looks up `ids` in a list of embedding tensors.
@@ -338,13 +337,9 @@ def embedding_lookup_v2(
   partitions, each of the first `(max_id + 1) % len(params)` partitions will
   be assigned one more id.
 
-  If `partition_strategy` is `"mod"`, we assign each id to partition
-  `p = id % len(params)`. For instance,
-  13 ids are split across 5 partitions as:
-  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
-
-  If `partition_strategy` is `"div"`, we assign ids to partitions in a
-  contiguous manner. In this case, 13 ids are split across 5 partitions as:
+  The `partition_strategy` is always `"div"` currently. This means that we
+  assign ids to partitions in a contiguous manner. For instance, 13 ids are
+  split across 5 partitions as:
   `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
 
   The results of the lookup are concatenated into a dense
@@ -355,12 +350,9 @@ def embedding_lookup_v2(
       or a list of P tensors all of same shape except for the first dimension,
       representing sharded embedding tensors.  Alternatively, a
       `PartitionedVariable`, created by partitioning along dimension 0. Each
-      element must be appropriately sized for the given `partition_strategy`.
+      element must be appropriately sized for the 'div' `partition_strategy`.
     ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
       up in `params`.
-    partition_strategy: A string specifying the partitioning strategy, relevant
-      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
-      is `"mod"`.
     max_norm: If not `None`, each embedding is clipped if its l2-norm is
       larger than this value.
     name: A name for the operation (optional).
@@ -371,7 +363,7 @@ def embedding_lookup_v2(
   Raises:
     ValueError: If `params` is empty.
   """
-  return embedding_lookup(params, ids, partition_strategy, name,
+  return embedding_lookup(params, ids, "div", name,
                           max_norm=max_norm)
 
 
@@ -493,7 +485,7 @@ def embedding_lookup_sparse(params,
     embeddings = embedding_lookup(
         params, ids, partition_strategy=partition_strategy, max_norm=max_norm)
     if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
-      embeddings = math_ops.to_float(embeddings)
+      embeddings = math_ops.cast(embeddings, dtypes.float32)
     if not ignore_weights:
       weights = sp_weights.values
       if weights.dtype != embeddings.dtype:
@@ -554,15 +546,81 @@ def embedding_lookup_sparse(params,
 def embedding_lookup_sparse_v2(params,
                                sp_ids,
                                sp_weights,
-                               partition_strategy="mod",
                                combiner=None,
                                max_norm=None,
                                name=None):
-  return embedding_lookup_sparse_v2(
-      params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
+  """Computes embeddings for the given ids and weights.
+
+  This op assumes that there is at least one id for each row in the dense tensor
+  represented by sp_ids (i.e. there are no rows with empty features), and that
+  all the indices of sp_ids are in canonical row-major order.
+
+  It also assumes that all id values lie in the range [0, p0), where p0
+  is the sum of the size of params along dimension 0.
 
+  Args:
+    params: A single tensor representing the complete embedding tensor,
+      or a list of P tensors all of same shape except for the first dimension,
+      representing sharded embedding tensors.  Alternatively, a
+      `PartitionedVariable`, created by partitioning along dimension 0. Each
+      element must be appropriately sized for ``"div"`` `partition_strategy`.
+    sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
+      and M is arbitrary.
+    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
+      indicate all weights should be taken to be 1. If specified, `sp_weights`
+      must have exactly the same shape and indices as `sp_ids`.
+    combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
+      and "sum" are supported.
+      "sum" computes the weighted sum of the embedding results for each row.
+      "mean" is the weighted sum divided by the total weight.
+      "sqrtn" is the weighted sum divided by the square root of the sum of the
+      squares of the weights.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is
+      larger than this value, before combining.
+    name: Optional name for the op.
 
-embedding_lookup_sparse_v2.__doc__ = embedding_lookup_sparse.__doc__
+  Returns:
+    A dense tensor representing the combined embeddings for the
+    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
+    looks up the embeddings for all ids in that row, multiplies them by the
+    corresponding weight, and combines these embeddings as specified.
+
+    In other words, if
+
+      `shape(combined params) = [p0, p1, ..., pm]`
+
+    and
+
+      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`
+
+    then
+
+      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.
+
+    For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are
+
+      ```python
+      [0, 0]: id 1, weight 2.0
+      [0, 1]: id 3, weight 0.5
+      [1, 0]: id 0, weight 1.0
+      [2, 3]: id 1, weight 3.0
+      ```
+
+    with `combiner`="mean", then the output will be a 3x20 matrix where
+
+      ```python
+      output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
+      output[1, :] = (params[0, :] * 1.0) / 1.0
+      output[2, :] = (params[1, :] * 3.0) / 3.0
+      ```
+
+  Raises:
+    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
+      neither `None` nor `SparseTensor`.
+    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
+  """
+  return embedding_lookup_sparse(
+      params, sp_ids, sp_weights, "div", name, combiner, max_norm)
 
 
 @tf_export("nn.safe_embedding_lookup_sparse", v1=[])
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index b240d1e465c21450fba5ead6ca957e7d4482ea1d..448e45cae37be372bc7742d047a24d6017735c3e 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -37,7 +36,6 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
 from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
@@ -265,255 +263,6 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     return r_a
 
 
-@tf_export("map_fn")
-def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
-           swap_memory=False, infer_shape=True, name=None):
-  """map on the list of tensors unpacked from `elems` on dimension 0.
-
-  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
-  sequence of elements from first to last. The elements are made of the
-  tensors unpacked from `elems`. `dtype` is the data type of the return
-  value of `fn`. Users must provide `dtype` if it is different from
-  the data type of `elems`.
-
-  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
-  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
-
-  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
-  is a (possibly nested) list or tuple of tensors, then each of these tensors
-  must have a matching first (unpack) dimension.  The signature of `fn` may
-  match the structure of `elems`.  That is, if `elems` is
-  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
-  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
-
-  Furthermore, `fn` may emit a different structure than its input.  For example,
-  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
-  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
-  nested) tuple of types matching the output of `fn`.
-
-  To apply a functional operation to the nonzero elements of a SparseTensor
-  one of the following methods is recommended. First, if the function is
-  expressible as TensorFlow ops, use
-
-  ```python
-    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
-  ```
-
-  If, however, the function is not expressible as a TensorFlow op, then use
-
-  ```python
-  result = SparseTensor(
-    input.indices, map_fn(fn, input.values), input.dense_shape)
-  ```
-
-  instead.
-
-  When executing eagerly, map_fn does not execute in parallel even if
-  `parallel_iterations` is set to a value > 1. You can still get the
-  performance benefits of running a function in parallel by using the
-  `tf.contrib.eager.defun` decorator,
-
-  ```python
-  # Assume the function being used in map_fn is fn.
-  # To ensure map_fn calls fn in parallel, use the defun decorator.
-  @tf.contrib.eager.defun
-  def func(tensor):
-    return tf.map_fn(fn, tensor)
-  ```
-
-  Note that if you use the defun decorator, any non-TensorFlow Python code
-  that you may have written in your function won't get executed. See
-  `tf.contrib.eager.defun` for more details. The recommendation would be to
-  debug without defun but switch to defun to get performance benefits of
-  running map_fn in parallel.
-
-  Args:
-    fn: The callable to be performed.  It accepts one argument, which will
-      have the same (possibly nested) structure as `elems`.  Its output
-      must have the same structure as `dtype` if one is provided, otherwise
-      it must have the same structure as `elems`.
-    elems: A tensor or (possibly nested) sequence of tensors, each of which
-      will be unpacked along their first dimension.  The nested sequence
-      of the resulting slices will be applied to `fn`.
-    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
-      of Tensors differing from the structure of `elems`, then `dtype` is not
-      optional and must have the same structure as the output of `fn`.
-    parallel_iterations: (optional) The number of iterations allowed to run
-      in parallel. When graph building, the default value is 10. While executing
-      eagerly, the default value is set to 1.
-    back_prop: (optional) True enables support for back propagation.
-    swap_memory: (optional) True enables GPU-CPU memory swapping.
-    infer_shape: (optional) False disables tests for consistent output shapes.
-    name: (optional) Name prefix for the returned tensors.
-
-  Returns:
-    A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
-    results of applying `fn` to tensors unpacked from `elems` along the first
-    dimension, from first to last.
-
-  Raises:
-    TypeError: if `fn` is not callable or the structure of the output of
-      `fn` and `dtype` do not match, or if elems is a SparseTensor.
-    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
-
-  Examples:
-    ```python
-    elems = np.array([1, 2, 3, 4, 5, 6])
-    squares = map_fn(lambda x: x * x, elems)
-    # squares == [1, 4, 9, 16, 25, 36]
-    ```
-
-    ```python
-    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
-    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
-    # alternate == [-1, 2, -3]
-    ```
-
-    ```python
-    elems = np.array([1, 2, 3])
-    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
-    # alternates[0] == [1, 2, 3]
-    # alternates[1] == [-1, -2, -3]
-    ```
-  """
-  if not callable(fn):
-    raise TypeError("fn must be callable.")
-
-  if isinstance(elems, sparse_tensor.SparseTensor):
-    raise TypeError(
-        "To perform a map on the values of a sparse tensor use either "
-        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
-        " SparseTensor(input.indices, map_fn(fn, input.values), "
-        "input.dense_shape)")
-
-  in_graph_mode = not context.executing_eagerly()
-  # Set the default number of parallel_iterations depending on graph/eager mode.
-  if in_graph_mode and not parallel_iterations:
-    parallel_iterations = 10
-  elif not in_graph_mode and not parallel_iterations:
-    parallel_iterations = 1
-
-  if not in_graph_mode and parallel_iterations > 1:
-    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
-                        "effect when executing eagerly. Consider calling map_fn"
-                        " with tf.contrib.eager.defun to execute fn in "
-                        "parallel.", 1)
-    parallel_iterations = 1
-
-  input_is_sequence = nest.is_sequence(elems)
-  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
-  def input_pack(x):
-    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
-
-  if dtype is None:
-    output_is_sequence = input_is_sequence
-    output_flatten = input_flatten
-    output_pack = input_pack
-  else:
-    output_is_sequence = nest.is_sequence(dtype)
-    output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
-    def output_pack(x):
-      return (nest.pack_sequence_as(dtype, x)
-              if output_is_sequence else x[0])
-
-  elems_flat = input_flatten(elems)
-
-  with ops.name_scope(name, "map", elems_flat):
-    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
-    # supported in Eager
-    if in_graph_mode:
-      # Any get_variable calls in fn will cache the first call locally
-      # and not issue repeated network I/O requests for each iteration.
-      varscope = vs.get_variable_scope()
-      varscope_caching_device_was_none = False
-      if varscope.caching_device is None:
-        # TODO(ebrevdo): Change to using colocate_with here and in other
-        # methods.
-        varscope.set_caching_device(lambda op: op.device)
-        varscope_caching_device_was_none = True
-
-    elems_flat = [
-        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
-
-    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
-    dtype_flat = output_flatten(dtype)
-
-    # Convert elems to tensor array. n may be known statically.
-    static_shape = elems_flat[0].shape
-    if static_shape.ndims is not None and static_shape.ndims < 1:
-      if len(elems_flat) == 1:
-        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
-      else:
-        raise ValueError(
-            "elements in elems must be 1+ dimensional Tensors, not scalars"
-        )
-    n = (tensor_shape.dimension_value(static_shape[0])
-         or array_ops.shape(elems_flat[0])[0])
-
-    # TensorArrays are always flat
-    elems_ta = [
-        tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
-                                     dynamic_size=False,
-                                     infer_shape=True)
-        for elem in elems_flat]
-    # Unpack elements
-    elems_ta = [
-        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
-
-    i = constant_op.constant(0)
-
-    accs_ta = [
-        tensor_array_ops.TensorArray(dtype=dt, size=n,
-                                     dynamic_size=False,
-                                     infer_shape=infer_shape)
-        for dt in dtype_flat]
-
-    def compute(i, tas):
-      """The loop body of map_fn.
-
-      Args:
-        i: the loop counter
-        tas: the flat TensorArray accumulator list
-
-      Returns:
-        (i + 1, tas): the updated counter + updated TensorArrays
-
-      Raises:
-        TypeError: if dtype and packed_fn_values structure do not match
-        ValueType: if dtype and packed_fn_values lengths do not match
-      """
-      packed_values = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
-      packed_fn_values = fn(packed_values)
-      nest.assert_same_structure(dtype or elems, packed_fn_values)
-      flat_fn_values = output_flatten(packed_fn_values)
-      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values)]
-      return (i + 1, tas)
-
-    _, r_a = control_flow_ops.while_loop(
-        lambda i, _: i < n, compute, (i, accs_ta),
-        parallel_iterations=parallel_iterations,
-        back_prop=back_prop,
-        swap_memory=swap_memory,
-        maximum_iterations=n)
-    results_flat = [r.stack() for r in r_a]
-
-    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
-        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
-    for elem in elems_flat[1:]:
-      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
-          elem.get_shape().with_rank_at_least(1)[0])))
-    for r in results_flat:
-      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
-          r.get_shape()[1:]))
-
-    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
-    # supported in Eager
-    if in_graph_mode and varscope_caching_device_was_none:
-      varscope.set_caching_device(None)
-
-    return output_pack(results_flat)
-
-
 @tf_export("scan")
 def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
          swap_memory=False, infer_shape=True, reverse=False, name=None):
@@ -648,13 +397,15 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
 
     # Convert elems to tensor array. n may be known statically.
-    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
-         or array_ops.shape(elems_flat[0])[0])
+    n = tensor_shape.dimension_value(elems_flat[0].shape[0])
+    if n is None:
+      n = array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
         tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
                                      dynamic_size=False,
+                                     element_shape=elem.shape[1:],
                                      infer_shape=True)
         for elem in elems_flat]
     # Unpack elements
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
index 5d473eeb5f4f00087672da53c5fef3ab63bdbd08..41fcaaca6824611fb4212df1f444e72bffdf0ea4 100644
--- a/tensorflow/python/ops/gradient_checker_v2.py
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -66,20 +66,31 @@ def _eval_indexed_slices(a):
 
 
 def _to_numpy(a):
-  """Converts Tensors and EagerTensors to numpy arrays.
+  """Converts Tensors, EagerTensors, and IndexedSlicesValue to numpy arrays.
 
   Args:
     a: any value.
 
   Returns:
     If a is EagerTensor or Tensor, returns the evaluation of a by calling
-    numpy() or run(). Otherwise returns a unchanged.
+    numpy() or run(). If a is IndexedSlicesValue, constructs the corresponding
+    dense numpy array. Otherwise returns a unchanged.
   """
   if isinstance(a, ops.EagerTensor):
     return a.numpy()
   if isinstance(a, ops.Tensor):
     sess = ops.get_default_session()
     return sess.run(a)
+  if isinstance(a, ops.IndexedSlicesValue):
+    arr = np.zeros(a.dense_shape)
+    assert len(a.values) == len(a.indices), (
+        "IndexedSlicesValue has %s value slices but %s indices\n%s" %
+        (a.values, a.indices, a))
+    for values_slice, index in zip(a.values, a.indices):
+      assert 0 <= index < len(arr), (
+          "IndexedSlicesValue has invalid index %s\n%s" % (index, a))
+      arr[index] += values_slice
+    return arr
   return a
 
 
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index cd11447e1f963a62d79855cfd8af42a35e978c79..96389abded3acf3c58f90faa601d4cf1e5eb8619 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.ops.custom_gradient import custom_gradient
-from tensorflow.python.ops.gradients_impl import AggregationMethod
+from tensorflow.python.ops.gradients_util import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 0a70d6ee61e64f94c41c1f1d0a5b6c3610b45c04..6fecaa927dd07c52bd91ff249c2d9fa7305c876f 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -18,30 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import contextlib
-import warnings
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function as framework_function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework.func_graph import FuncGraph
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
@@ -51,503 +35,11 @@ from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-# This is to avoid a circular dependency (eager.function depends on
-# gradients_impl). This is set in eager/function.py.
-_function = None
-
-# Warn the user if we convert a sparse representation to dense with at
-# least this number of elements.
-_LARGE_SPARSE_NUM_ELEMENTS = 100000000
-
-
-def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False):
-  """Converts an IndexedSlices object `value` to a Tensor.
-
-  NOTE(mrry): This function is potentially expensive.
-
-  Args:
-    value: An ops.IndexedSlices object.
-    dtype: The dtype of the Tensor to be returned.
-    name: Optional name to use for the returned Tensor.
-    as_ref: True if a ref is requested.
-
-  Returns:
-    A dense Tensor representing the values in the given IndexedSlices.
-
-  Raises:
-    ValueError: If the IndexedSlices does not have the same dtype.
-  """
-  _ = as_ref
-  if dtype and not dtype.is_compatible_with(value.dtype):
-    raise ValueError(
-        "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" %
-        (dtype.name, value.dtype.name))
-  if value.dense_shape is None:
-    raise ValueError(
-        "Tensor conversion requested for IndexedSlices without dense_shape: %s"
-        % str(value))
-  # TODO(mrry): Consider adding static shape information to
-  # IndexedSlices, to avoid using numpy here.
-  if not context.executing_eagerly():
-    dense_shape_value = tensor_util.constant_value(value.dense_shape)
-    if dense_shape_value is not None:
-      num_elements = np.prod(dense_shape_value)
-      if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
-        warnings.warn(
-            "Converting sparse IndexedSlices to a dense Tensor with %d "
-            "elements. This may consume a large amount of memory." %
-            num_elements)
-    else:
-      warnings.warn(
-          "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
-          "This may consume a large amount of memory.")
-  return math_ops.unsorted_segment_sum(
-      value.values, value.indices, value.dense_shape[0], name=name)
-
-
-ops.register_tensor_conversion_function(ops.IndexedSlices,
-                                        _IndexedSlicesToTensor)
-
-
-def _MarkReachedOps(from_ops, reached_ops, func_graphs):
-  """Mark all ops reached from "from_ops".
-
-  Args:
-    from_ops: list of Operations.
-    reached_ops: set of Operations.
-    func_graphs: list of FuncGraphs. This method will traverse through
-      these functions if they capture from_ops or any reachable ops.
-  """
-  queue = collections.deque()
-  queue.extend(from_ops)
-  while queue:
-    op = queue.popleft()
-    if op not in reached_ops:
-      reached_ops.add(op)
-      for output in op.outputs:
-        if _IsBackpropagatable(output):
-          queue.extend(_Consumers(output, func_graphs))
-
-
-def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops, func_graphs,
-                  xs):
-  """Initialize the pending count for ops between two lists of Operations.
-
-  'pending_count[op]' indicates the number of backprop inputs
-  to this operation.
-
-  Args:
-    to_ops: list of Operations.
-    from_ops: list of Operations.
-    colocate_gradients_with_ops: Python bool.  See docstring of gradients().
-    func_graphs: list of FuncGraphs. This method will traverse through
-      these functions if they capture from_ops or any reachable ops. This is
-      useful if to_ops occur in a function and from_ops are in an outer function
-      or graph.
-    xs: list of Tensors.
-
-  Returns:
-    A tuple containing: (1) the subset of to_ops reachable from from_ops by a
-    path of zero or more backpropagatable tensors, (2) a mapping from operation
-    to the number of backprop inputs to that op, and (3) a ControlFlowState
-    object which is not None if the ops between from_ops and to_ops contain
-    control flow loops.
-  """
-  # Mark reachable ops from from_ops.
-  reached_ops = set()
-  _MarkReachedOps(from_ops, reached_ops, func_graphs)
-  # X in reached_ops iff X is reachable from from_ops by a path of zero or more
-  # backpropagatable tensors.
-
-  reachable_to_ops = set(op for op in to_ops if op in reached_ops)
-
-  # Mark between ops.
-  between_ops = set()
-  between_op_list = []
-  queue = collections.deque()
-  queue.extend(to_ops)
-  while queue:
-    op = queue.popleft()
-    # We are interested in this op.
-    if op in reached_ops:
-      between_ops.add(op)
-      between_op_list.append(op)
-      # Clear the boolean so we won't add the inputs again.
-      reached_ops.remove(op)
-      for inp in _NonEagerInputs(op, xs):
-        queue.append(inp.op)
-  # X in between_ops iff X is on a path of zero or more backpropagatable tensors
-  # between from_ops and to_ops
-
-  # 'loop_state' is None if there are no while loops.
-  loop_state = control_flow_ops.MaybeCreateControlFlowState(
-      between_op_list, between_ops, colocate_gradients_with_ops)
-
-  # Initialize pending count for between ops.
-  pending_count = collections.defaultdict(int)
-  for op in between_op_list:
-    for x in _NonEagerInputs(op, xs):
-      if x.op in between_ops:
-        pending_count[x.op] += 1
-
-  return reachable_to_ops, pending_count, loop_state
-
-
-def _AsList(x):
-  return x if isinstance(x, (list, tuple)) else [x]
-
-
-def _DefaultGradYs(grad_ys,
-                   ys,
-                   colocate_gradients_with_ops,
-                   gradient_uid="__unsupported__"):
-  """Fill in default values for grad_ys.
-
-  Args:
-    grad_ys: List of gradients, can contain None.
-    ys: List of tensors.
-    colocate_gradients_with_ops: If True, try colocating gradients with
-      the corresponding op.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
-
-  Returns:
-    A list of gradients to use, without None.
-
-  Raises:
-    ValueError: If sizes of gradients and inputs don't match
-    TypeError: If type of any gradient is not valid for its input.
-  """
-  if len(grad_ys) != len(ys):
-    raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
-  grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
-  new_grad_ys = []
-  for i in xrange(len(grad_ys)):
-    grad_y = grad_ys[i]
-    y = ys[i]
-    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
-      if grad_y is None:
-        if y.dtype.is_complex:
-          raise TypeError(
-              "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
-              y.dtype)
-        new_grad_ys.append(
-            array_ops.fill(
-                array_ops.shape(y),
-                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
-        continue
-      if y.dtype.is_floating or y.dtype.is_integer:
-        if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
-          raise TypeError(
-              "Gradient type %s generated for real or "
-              "integer-valued tensor %s with type %s must be "
-              "real or integer" % (dtypes.as_dtype(grad_y.dtype).name, y,
-                                   dtypes.as_dtype(y.dtype).name))
-      elif y.dtype.is_complex:
-        if not grad_y.dtype.is_complex:
-          raise TypeError(
-              "Gradient type %s generated for complex-valued "
-              "tensor %s with type %s must be real" % (dtypes.as_dtype(
-                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
-      elif y.dtype == dtypes.variant:
-        if grad_y.dtype != dtypes.variant:
-          raise TypeError(
-              "Gradient type %s generated for variant "
-              "tensor %s with type %s must be variant" % (dtypes.as_dtype(
-                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
-      elif y.dtype == dtypes.resource:
-        # We assume y is the handle of a ResourceVariable. The gradient of a
-        # ResourceVariable should be a numeric value, not another resource.
-        if grad_y.dtype == dtypes.resource:
-          raise TypeError("Input gradient %s for resource tensor %s should not "
-                          "be a resource" % (grad_y, y))
-      else:
-        raise TypeError(
-            "Tensor %s with type %s must be numeric "
-            "to obtain a default gradient" % (y, dtypes.as_dtype(y.dtype).name))
-      # Create a grad_y tensor in the name scope of the gradient.
-      # Required for TensorArrays to identify which gradient call a
-      # grad_y value is coming from.
-      if isinstance(grad_y, ops.IndexedSlices):
-        new_grad_ys.append(
-            ops.IndexedSlices(
-                indices=(array_ops.identity(
-                    grad_y.indices, name="grad_ys_%d_indices" % i)
-                         if isinstance(grad_y.indices, ops.Tensor) else
-                         grad_y.indices),
-                values=(array_ops.identity(
-                    grad_y.values, name="grad_ys_%d_values" % i) if isinstance(
-                        grad_y.values, ops.Tensor) else grad_y.values),
-                dense_shape=(array_ops.identity(
-                    grad_y.dense_shape, name="grad_ys_%d_shape" % i)
-                             if isinstance(grad_y.dense_shape, ops.Tensor) else
-                             grad_y.dense_shape)))
-      else:
-        new_grad_ys.append(array_ops.identity(grad_y, name="grad_ys_%d" % i))
-
-  return new_grad_ys
-
-
-def IsTrainable(tensor_or_dtype):
-  if isinstance(tensor_or_dtype, ops.Tensor):
-    dtype = tensor_or_dtype.dtype
-  else:
-    dtype = tensor_or_dtype
-  dtype = dtypes.as_dtype(dtype)
-  return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
-                              dtypes.complex64, dtypes.complex128,
-                              dtypes.resource, dtypes.variant)
-
-
-def _IsBackpropagatable(tensor):
-  if IsTrainable(tensor):
-    return True
-  dtype = dtypes.as_dtype(tensor.dtype)
-  return dtype.base_dtype == dtypes.bfloat16
-
-
-def _VerifyGeneratedGradients(grads, op):
-  """Verify that gradients are valid in number and type.
-
-  Args:
-    grads: List of generated gradients.
-    op: Operation for which the gradients where generated.
-
-  Raises:
-    ValueError: if sizes of gradients and inputs don't match.
-    TypeError: if type of any gradient is not valid for its input.
-  """
-  # While ops have inputs added to them during the gradient computation, so we
-  # skip the below check. See while_v2 for details.
-  if op.type == "While": return
-
-  if len(grads) != len(op.inputs):
-    raise ValueError("Num gradients %d generated for op %s do not match num "
-                     "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
-
-
-def _StopOps(from_ops, stop_gradient_ops, pending_count, xs):
-  """The set of ops that terminate the gradient computation.
-
-  This computes the frontier of the forward graph *before* which backprop
-  should stop. Operations in the returned set will not be differentiated.
-  This set is defined as the subset of `from_ops` containing ops that have
-  no predecessor in `from_ops`. `pending_count` is the result of
-  `_PendingCount(xs, from_ops)`. An 'op' has predecessors in `from_ops`
-  iff pending_count[op] > 0.
-
-  In addition, none of `stop_gradient_ops` will be differentiated.
-
-  Args:
-    from_ops: list of Operations.
-    stop_gradient_ops: list of Operations never to backprop through.
-    pending_count: mapping from operation to number of backprop inputs.
-    xs: list of Tensors.
-
-  Returns:
-    The set of operations.
-  """
-  stop_ops = set()
-  for op in from_ops:
-    is_stop_op = True
-    for inp in _NonEagerInputs(op, xs):
-      if pending_count[inp.op] > 0:
-        is_stop_op = False
-        break
-    if is_stop_op:
-      stop_ops.add(op)
-  stop_ops.update(op for op in stop_gradient_ops)
-  return stop_ops
-
-
-@contextlib.contextmanager
-def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
-  """Context to colocate with `op` if `colocate_gradients_with_ops`."""
-  if colocate_gradients_with_ops:
-    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
-      yield
-  else:
-    yield
-
-
-def _IsPartitionedCall(op):
-  return op.type == "PartitionedCall" or op.type == "StatefulPartitionedCall"
-
-
-def _SymGrad(op, out_grads):
-  """Backprop through a function call node op given its outputs' gradients."""
-  f_in = [x for x in op.inputs] + out_grads
-  f_types = [x.dtype for x in op.inputs]
-  f = attr_value_pb2.NameAttrList()
-  if _IsPartitionedCall(op):
-    f.name = op.get_attr("f").name
-  else:
-    f.name = op.type
-  for k in op.node_def.attr:
-    f.attr[k].CopyFrom(op.node_def.attr[k])
-  # TODO(apassos) use a better dtype here
-  in_grads = functional_ops.symbolic_gradient(
-      input=f_in,
-      Tout=[x if x != dtypes.resource else dtypes.float32 for x in f_types],
-      f=f)
-  return in_grads
-
-
-def _MaybeCompile(scope, op, func, grad_fn):
-  """Compile the calculation in grad_fn if op was marked as compiled."""
-  scope = scope.rstrip("/").replace("/", "_")
-  if func is not None:
-    xla_compile = func.definition.attr["_XlaCompile"].b
-    xla_separate_compiled_gradients = func.definition.attr[
-        "_XlaSeparateCompiledGradients"].b
-    xla_scope = func.definition.attr["_XlaScope"].s.decode()
-  else:
-    try:
-      xla_compile = op.get_attr("_XlaCompile")
-      xla_separate_compiled_gradients = op.get_attr(
-          "_XlaSeparateCompiledGradients")
-      xla_scope = op.get_attr("_XlaScope").decode()
-    except ValueError:
-      return grad_fn()  # Exit early
-
-  if not xla_compile:
-    return grad_fn()  # Exit early
-
-  # If the gradients are supposed to be compiled separately, we give them a
-  # _XlaScope name that is based on the name_scope of the gradients.  Otherwise
-  # they just inherit the existing _XlaScope name, which lets them be merged
-  # together with the non-gradient computation.
-  if xla_separate_compiled_gradients:
-    xla_grad_scope = "%s_grad_%s" % (xla_scope, scope)
-  else:
-    xla_grad_scope = xla_scope
-
-  attrs = {
-      "_XlaCompile": attr_value_pb2.AttrValue(b=xla_compile),
-      "_XlaScope": attr_value_pb2.AttrValue(s=xla_grad_scope.encode())
-  }
-  with ops.get_default_graph()._attr_scope(attrs):  # pylint: disable=protected-access
-    return grad_fn()
-
-
-def _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs):
-  """Raises an error if we backprop through a loop var."""
-  # Find the nearest 'to_op' reachable from 'op' to provide a more helpful error
-  # message.
-  target_op = None
-  queue = collections.deque([op])
-  visited = set()
-  while queue:
-    curr_op = queue.popleft()
-    if curr_op in visited: continue
-    visited.add(curr_op)
-    if curr_op in from_ops:
-      target_op = curr_op
-      break
-    queue.extend(t.op for t in _NonEagerInputs(curr_op, xs))
-  assert target_op
-  raise ValueError(
-      "Cannot compute gradient inside while loop with respect to op '%s'. "
-      "We do not support taking the gradient wrt or through the initial value "
-      "of a loop variable. Gradients can be computed through loop invariants "
-      "or wrt the input parameters to the loop body."
-      % target_op.name)
-
-
-def _IsFunction(graph):
-  return (isinstance(graph, FuncGraph) or
-          isinstance(graph, framework_function._FuncGraph))  # pylint: disable=protected-access
-
-
-def _Captures(func_graph):
-  if isinstance(func_graph, FuncGraph):
-    return func_graph.captures
-  else:
-    assert isinstance(func_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
-    return func_graph._captured  # pylint: disable=protected-access
-
-
-def _MaybeCaptured(t):
-  """If t is a captured value placeholder, returns the original captured value.
-
-  Args:
-    t: Tensor
-
-  Returns:
-    A tensor, potentially from a different Graph/FuncGraph.
-  """
-  # pylint: disable=protected-access
-  if (not isinstance(t, ops.EagerTensor) and
-      _IsFunction(t.op.graph) and t.op.type == "Placeholder"):
-    for input_t, placeholder_t in _Captures(t.op.graph).items():
-      if t == placeholder_t:
-        return _MaybeCaptured(input_t)
-  # pylint: enable=protected-access
-  return t
-
-
-# TODO(skyewm): plumbing xs through everywhere is ugly, consider making
-# _GradientsHelper a class with xs as a member variable.
-def _NonEagerInputs(op, xs):
-  """Returns the inputs of op, crossing closure boundaries where necessary.
-
-  Does not return any captured EagerTensors, i.e., the number of tensors
-  returned may be less than than the actual number of inputs.
-
-  Args:
-    op: Operation
-    xs: list of Tensors we are differentiating w.r.t.
-
-  Returns:
-    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
-    is in a FuncGraph and has captured inputs.
-  """
-  if _IsFunction(op.graph):  # pylint: disable=protected-access
-    inputs = []
-    for t in op.inputs:
-      # If we're differentiating w.r.t. `t`, do not attempt to traverse through
-      # it to a captured value. The algorithm needs to "see" `t` in this case,
-      # even if it's a function input for a captured value, whereas usually we'd
-      # like to traverse through these closures as if the captured value was the
-      # direct input to op.
-      if t not in xs:
-        t = _MaybeCaptured(t)
-        # Skip captured eager inputs.
-        if isinstance(t, ops.EagerTensor): continue
-      inputs.append(t)
-    return inputs
-  else:
-    return op.inputs
-
-
-def _Consumers(t, func_graphs):
-  """Returns the consumers of t, crossing closure boundaries where necessary.
-
-  Args:
-    t: Tensor
-    func_graphs: a list of FuncGraphs that may have captured t.
-
-  Returns:
-    A list of tensors. The tensors will be from the current graph and/or
-    func_graphs.
-  """
-  consumers = t.consumers()
-  for func in func_graphs:
-    for input_t, placeholder in _Captures(func).items():
-      if input_t == t:
-        consumers.extend(_Consumers(placeholder, func_graphs))
-  return consumers
-
-
 @tf_export(v1=["gradients"])
 def gradients(ys,
               xs,
@@ -658,10 +150,13 @@ def gradients(ys,
   # Creating the gradient graph for control flow mutates Operations.
   # _mutation_lock ensures a Session.run call cannot occur between creating and
   # mutating new ops.
-  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
-    return _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
-                            gate_gradients, aggregation_method, stop_gradients,
-                            unconnected_gradients)
+  # pylint: disable=protected-access
+  with ops.get_default_graph()._mutation_lock():
+    return gradients_util._GradientsHelper(
+        ys, xs, grad_ys, name, colocate_gradients_with_ops,
+        gate_gradients, aggregation_method, stop_gradients,
+        unconnected_gradients)
+  # pylint: enable=protected-access
 
 
 @tf_export("gradients", v1=[])
@@ -725,7 +220,7 @@ def gradients_v2(ys,  # pylint: disable=invalid-name
 
   `unconnected_gradients` determines the value returned for each x in xs if it
   is unconnected in the graph to ys. By default this is None to safeguard
-  against errors. MAthematically these gradients are zero which can be requested
+  against errors. Mathematically these gradients are zero which can be requested
   using the `'zero'` option. `tf.UnconnectedGradients` provides the
   following options and behaviors:
 
@@ -771,540 +266,13 @@ def gradients_v2(ys,  # pylint: disable=invalid-name
   # Creating the gradient graph for control flow mutates Operations.
   # _mutation_lock ensures a Session.run call cannot occur between creating and
   # mutating new ops.
-  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
-    return _GradientsHelper(ys, xs, grad_ys, name, True, gate_gradients,
-                            aggregation_method, stop_gradients,
-                            unconnected_gradients)
-
-
-def _GradientsHelper(ys,
-                     xs,
-                     grad_ys=None,
-                     name="gradients",
-                     colocate_gradients_with_ops=False,
-                     gate_gradients=False,
-                     aggregation_method=None,
-                     stop_gradients=None,
-                     unconnected_gradients=UnconnectedGradients.NONE,
-                     src_graph=None):
-  """Implementation of gradients()."""
-  if context.executing_eagerly():
-    raise RuntimeError("tf.gradients is not supported when eager execution "
-                       "is enabled. Use tf.GradientTape instead.")
-  if src_graph is None:
-    src_graph = ops.get_default_graph()
-  try:
-    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
-  except ValueError:
-    raise ValueError(
-        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
-
-  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
-  # ancestor graphs. This is necessary for correctly handling captured values.
-  func_graphs = []
-  curr_graph = src_graph
-  while _IsFunction(curr_graph):
-    func_graphs.append(curr_graph)
-    if isinstance(curr_graph, FuncGraph):
-      curr_graph = curr_graph.outer_graph
-    else:
-      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
-      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access
-
-  ys = _AsList(ys)
-  xs = _AsList(xs)
-  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
-  if grad_ys is None:
-    grad_ys = [None] * len(ys)
-  else:
-    grad_ys = _AsList(grad_ys)
-
-  with ops.name_scope(
-      name, "gradients",
-      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
-    # Get a uid for this call to gradients that can be used to help
-    # cluster ops for compilation.
-    gradient_uid = ops.get_default_graph().unique_name("uid")
-    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
-    xs = [
-        x.handle if resource_variable_ops.is_resource_variable(x) else x
-        for x in xs
-    ]
-    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
-        xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
-                             gradient_uid)
-
-    # The approach we take here is as follows: Create a list of all ops in the
-    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
-    # to ensure that when we visit an op the gradients w.r.t its outputs have
-    # been collected.  Then aggregate these gradients if needed, call the op's
-    # gradient function, and add the generated gradients to the gradients for
-    # its input.
-
-    # Initialize the pending count for ops in the connected subgraph from ys
-    # to the xs.
-    to_ops = [t.op for t in ys]
-    from_ops = [t.op for t in xs]
-    stop_gradient_ops = [t.op for t in stop_gradients]
-    reachable_to_ops, pending_count, loop_state = _PendingCount(
-        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)
-
-    # Iterate over the collected ops.
-    #
-    # grads: op => list of gradients received on each output endpoint of the
-    # op.  The gradients for each endpoint are initially collected as a list.
-    # When it is time to call the op's gradient function, for each endpoint we
-    # aggregate the list of received gradients into a Add() Operation if there
-    # is more than one.
-    grads = {}
-
-    # Add the initial gradients for the ys.
-    for y, grad_y in zip(ys, grad_ys):
-      _SetGrad(grads, y, grad_y)
-
-    # Initialize queue with to_ops.
-    queue = collections.deque()
-    # Add the ops in 'to_ops' into the queue.
-    to_ops_set = set()
-    for op in to_ops:
-      # 'ready' handles the case where one output gradient relies on
-      # another output's gradient.
-      ready = (pending_count[op] == 0)
-      if ready and op not in to_ops_set and op in reachable_to_ops:
-        to_ops_set.add(op)
-        queue.append(op)
-
-    if loop_state:
-      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
-      for y in loop_exits:
-        if IsTrainable(y):
-          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
-          queue.append(y.op)
-
-    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
-    while queue:
-      # generate gradient subgraph for op.
-      op = queue.popleft()
-      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
-        if loop_state:
-          loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
-                                     aggregation_method)
-        if loop_state:
-          loop_state.ExitGradWhileContext(op, before=True)
-
-        grad_fn = None
-        func_call = None
-        is_partitioned_call = _IsPartitionedCall(op)
-        # pylint: disable=protected-access
-        is_func_call = (
-            src_graph._is_function(op.type) or is_partitioned_call)
-        # pylint: enable=protected-access
-        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
-        if has_out_grads and (op not in stop_ops):
-          try:
-            grad_fn = ops.get_gradient_function(op)
-          except LookupError:
-            if is_func_call:
-              if is_partitioned_call:
-                func_call = src_graph._get_function(  # pylint: disable=protected-access
-                    compat.as_bytes(op.get_attr("f").name))
-              else:
-                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
-              # Note that __defun is not set if the graph is
-              # imported. If it's set, we prefer to access the original
-              # defun.
-              func_call = getattr(op, "__defun", func_call)
-              grad_fn = func_call.python_grad_func
-            else:
-              raise LookupError(
-                  "No gradient defined for operation '%s' (op type: %s)" %
-                  (op.name, op.type))
-        if loop_state:
-          loop_state.EnterGradWhileContext(op, before=False)
-
-        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
-        # unless it's within the context of a single iteration (i.e. the
-        # gradient is wrt to the loop parameter in the body function, not wrt or
-        # through the initial value). This means if we're in a while loop
-        # context, we should never see a switch node from this context.
-        # pylint: disable=protected-access
-        if (control_flow_util.IsSwitch(op) and
-            op._control_flow_context is not None and
-            op._control_flow_context.IsWhileContext() and
-            op._control_flow_context ==
-            ops.get_default_graph()._get_control_flow_context()):
-          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
-        # pylint: enable=protected-access
-
-        if (grad_fn or is_func_call) and has_out_grads:
-          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
-          # output, it means that the cost does not depend on output[i],
-          # therefore dC/doutput[i] is 0.
-          for i, out_grad in enumerate(out_grads):
-            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
-                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
-              # Only trainable outputs or outputs for a function call that
-              # will use SymbolicGradient get a zero gradient. Gradient
-              # functions should ignore the gradient for other outputs.
-              # TODO(apassos) gradients of resource handles might be an
-              # issue here because of zeros.
-              if loop_state:
-                out_grads[i] = loop_state.ZerosLike(op, i)
-              else:
-                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
-          with ops.name_scope(op.name + "_grad"):
-            # pylint: disable=protected-access
-            with src_graph._original_op(op):
-              # pylint: enable=protected-access
-              if grad_fn:
-                # If grad_fn was found, do not use SymbolicGradient even for
-                # functions.
-                in_grads = _MaybeCompile(grad_scope, op, func_call,
-                                         lambda: grad_fn(op, *out_grads))
-              else:
-                # For function call ops, we add a 'SymbolicGradient'
-                # node to the graph to compute gradients.
-                in_grads = _MaybeCompile(grad_scope, op, func_call,
-                                         lambda: _SymGrad(op, out_grads))
-              in_grads = _AsList(in_grads)
-              _VerifyGeneratedGradients(in_grads, op)
-              if gate_gradients and len([x for x in in_grads
-                                         if x is not None]) > 1:
-                with ops.device(None):
-                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-                      None,
-                      gradient_uid,
-                      ignore_existing=True):
-                    in_grads = control_flow_ops.tuple(in_grads)
-          _LogOpGradients(op, out_grads, in_grads)
-        else:
-          # If no grad_fn is defined or none of out_grads is available,
-          # just propagate a list of None backwards.
-          in_grads = [None] * len(_NonEagerInputs(op, xs))
-        for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs),
-                                                in_grads)):
-          if in_grad is not None:
-            if (isinstance(in_grad, ops.Tensor) and
-                t_in.dtype != dtypes.resource):
-              try:
-                in_grad.set_shape(t_in.get_shape())
-              except ValueError:
-                raise ValueError(
-                    "Incompatible shapes between op input and calculated "
-                    "input gradient.  Forward operation: %s.  Input index: %d. "
-                    "Original input shape: %s.  "
-                    "Calculated input gradient shape: %s" %
-                    (op.name, i, t_in.shape, in_grad.shape))
-            _SetGrad(grads, t_in, in_grad)
-        if loop_state:
-          loop_state.ExitGradWhileContext(op, before=False)
-
-      # Update pending count for the inputs of op and enqueue ready ops.
-      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
-                                    xs)
-
-  if loop_state:
-    loop_state.PostProcessing()
-  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
-
-
-def _HasAnyNotNoneGrads(grads, op):
-  """Return true iff op has real gradient."""
-  out_grads = _GetGrads(grads, op)
-  for out_grad in out_grads:
-    if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-      return True
-    if out_grad and isinstance(out_grad, collections.Sequence):
-      if any(g is not None for g in out_grad):
-        return True
-  return False
-
-
-def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
-                                  xs):
-  """Update pending count for the inputs of op and enqueue ready ops."""
-  for x in _NonEagerInputs(op, xs):
-    pending_count[x.op] -= 1
-    ready = (pending_count[x.op] == 0)
-    if loop_state and not ready:
-      ready = pending_count[x.op] > 0 and control_flow_util.IsLoopSwitch(x.op)
-    if ready:
-      if control_flow_util.IsLoopExit(x.op):
-        # if x is an exit without real gradient, defer processing them.
-        grad_state = loop_state.GetGradState(x.op, before=False)
-        grad_state.deferred_exits.append(x)
-        grad_state.pending_exits_count -= 1
-        if grad_state.pending_exits_count == 0:
-          # We now have all the exits so process them.
-          has_not_none_grad = False
-          for y in grad_state.deferred_exits:
-            if _HasAnyNotNoneGrads(grads, y.op):
-              has_not_none_grad = True
-              queue.append(y.op)
-            else:
-              grad_state.unused_exits.append(y)
-          if has_not_none_grad:
-            # For an unused exit, if it has trainable outputs, backprop
-            # a zero gradient. Otherwise, just ignore it.
-            for y in grad_state.unused_exits:
-              if IsTrainable(y):
-                _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
-              queue.append(y.op)
-          else:
-            # All exits are "unused" so use None as gradient.
-            for y in grad_state.unused_exits:
-              queue.append(y.op)
-      else:
-        queue.append(x.op)
-
-
-def _SetGrad(grads, t, grad):
-  """Sets gradient "grad" in "grads" for tensor "t"."""
-  op = t.op
-  op_grads = grads.get(op)
-  if not op_grads:
-    op_grads = [[] for _ in xrange(len(op.outputs))]
-    grads[op] = op_grads
-  t_grads = op_grads[t.value_index]
-  if isinstance(t_grads, list):
-    t_grads.append(grad)
-  else:
-    assert control_flow_util.IsLoopSwitch(op)
-    op_grads[t.value_index] = grad
-
-
-def _GetGrad(grads, t, unconnected_gradients):
-  """Gets gradient for tensor "t"."""
-  op = t.op
-  op_grads = grads.get(op)
-  if not op_grads:
-    if unconnected_gradients == UnconnectedGradients.ZERO:
-      t_dtype = t.dtype if t.dtype != dtypes.resource else dtypes.float32
-      return array_ops.zeros_like(t, dtype=t_dtype)
-    elif unconnected_gradients == UnconnectedGradients.NONE:
-      return None
-    else:
-      raise ValueError(
-          "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
-
-  t_grad = op_grads[t.value_index]
-  assert not isinstance(
-      t_grad, list), ("gradients list should have been aggregated by now.")
-  return t_grad
-
-
-def _GetGrads(grads, op):
-  """Gets all gradients for op."""
-  if op in grads:
-    return grads[op]
-  else:
-    return [[] for _ in xrange(len(op.outputs))]
-
-
-def _HandleNestedIndexedSlices(grad):
-  assert isinstance(grad, ops.IndexedSlices)
-  if isinstance(grad.values, ops.Tensor):
-    return grad
-  else:
-    assert isinstance(grad.values, ops.IndexedSlices)
-    g = _HandleNestedIndexedSlices(grad.values)
-    return ops.IndexedSlices(g.values, array_ops.gather(
-        grad.indices, g.indices), g.dense_shape)
-
-
-def _AccumulatorShape(inputs):
-  shape = tensor_shape.unknown_shape()
-  for i in inputs:
-    if isinstance(i, ops.Tensor):
-      shape = shape.merge_with(i.get_shape())
-  return shape
-
-
-def _LogOpGradients(op, out_grads, in_grads):
-  """Log the in and out grads of an op."""
-  logging.vlog(1, "Gradient for '" + op.name + "'")
-
-  def _FilterGrad(x):
-    if x is None:
-      return False
-    if isinstance(x, (list, tuple)):
-      return bool(x)
-    else:
-      return True
-
-  logging.vlog(1, "  in  --> %s",
-               ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
-  logging.vlog(1, "  out --> %s",
-               ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
-
-
-def _MultiDeviceAddN(tensor_list, gradient_uid):
-  """Adds tensors from potentially multiple devices."""
-  # Basic function structure comes from control_flow_ops.group().
-  # Sort tensors according to their devices.
-  tensors_on_device = collections.defaultdict(lambda: [])
-  for tensor in tensor_list:
-    tensors_on_device[tensor.device].append(tensor)
-
-  # For each device, add the tensors on that device first.
-  # Then gather the partial sums from multiple devices.
-  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
-  # E.g., aggregate per GPU, then per task, and so on.
-  summands = []
-
-  def DeviceKey(dev):
-    return "" if dev is None else dev
-
-  for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
-    tensors = tensors_on_device[dev]
-    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-        tensors[0].op,
-        gradient_uid,
-        ignore_existing=True):
-      summands.append(math_ops.add_n(tensors))
-
-  return math_ops.add_n(summands)
-
-
-@tf_export("AggregationMethod")
-class AggregationMethod(object):
-  """A class listing aggregation methods used to combine gradients.
-
-  Computing partial derivatives can require aggregating gradient
-  contributions. This class lists the various methods that can
-  be used to combine gradients in the graph:
-
-  *  `ADD_N`: All of the gradient terms are summed as part of one
-     operation using the "AddN" op. It has the property that all
-     gradients must be ready before any aggregation is performed.
-  *  `DEFAULT`: The system-chosen default aggregation method.
-  """
-  ADD_N = 0
-  DEFAULT = ADD_N
-  # The following are experimental and may not be supported in future releases.
-  EXPERIMENTAL_TREE = 1
-  EXPERIMENTAL_ACCUMULATE_N = 2
-
-
-def _AggregatedGrads(grads,
-                     op,
-                     gradient_uid,
-                     loop_state,
-                     aggregation_method=None):
-  """Get the aggregated gradients for op.
-
-  Args:
-    grads: The map of memoized gradients.
-    op: The op to get gradients for.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
-    loop_state: An object for maintaining the state of the while loops in the
-                graph. It is of type ControlFlowState. None if the graph
-                contains no while loops.
-    aggregation_method: Specifies the method used to combine gradient terms.
-      Accepted values are constants defined in the class `AggregationMethod`.
-
-  Returns:
-    A list of gradients, one per each output of `op`. If the gradients
-      for a particular output is a list, this function aggregates it
-      before returning.
-
-  Raises:
-    TypeError: if the incoming grads are not Tensors or IndexedSlices.
-    ValueError: if the arguments are invalid.
-
-  """
-  if aggregation_method is None:
-    aggregation_method = AggregationMethod.DEFAULT
-  if aggregation_method not in [
-      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
-      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-  ]:
-    raise ValueError(
-        "Invalid aggregation_method specified %s." % aggregation_method)
-  out_grads = _GetGrads(grads, op)
-  for i, out_grad in enumerate(out_grads):
-    if loop_state:
-      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-        assert control_flow_util.IsLoopSwitch(op)
-        continue
-    # Grads have to be Tensors or IndexedSlices
-    if (isinstance(out_grad, collections.Sequence) and not all(
-        isinstance(g, (ops.Tensor, ops.IndexedSlices))
-        for g in out_grad
-        if g is not None
-    )):
-      raise TypeError("gradients have to be either all Tensors "
-                      "or all IndexedSlices")
-    # Aggregate multiple gradients, and convert [] to None.
-    if out_grad:
-      if len(out_grad) < 2:
-        used = "nop"
-        out_grads[i] = out_grad[0]
-      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
-        tensor_shape = _AccumulatorShape(out_grad)
-        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
-          # The benefit of using AccumulateN is that its inputs can be combined
-          # in any order and this can allow the expression to be evaluated with
-          # a smaller memory footprint.  When used with gpu_allocator_retry,
-          # it is possible to compute a sum of terms which are much larger than
-          # total GPU memory.
-          # AccumulateN can currently only be used if we know the shape for
-          # an accumulator variable.  If this is not known, or if we only have
-          # 2 grads then we fall through to the "tree" case below.
-          used = "accumulate_n"
-          out_grads[i] = math_ops.accumulate_n(out_grad)
-        elif aggregation_method in [
-            AggregationMethod.EXPERIMENTAL_TREE,
-            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-        ]:
-          # Aggregate all gradients by doing pairwise sums: this may
-          # reduce performance, but it can improve memory because the
-          # gradients can be released earlier.
-          #
-          # TODO(vrv): Consider replacing this with a version of
-          # tf.AddN() that eagerly frees its inputs as soon as they are
-          # ready, so the order of this tree does not become a problem.
-          used = "tree"
-          with ops.name_scope(op.name + "_gradient_sum"):
-            running_sum = out_grad[0]
-            for grad in out_grad[1:]:
-              running_sum = math_ops.add_n([running_sum, grad])
-            out_grads[i] = running_sum
-        else:
-          used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
-        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
-                     tensor_shape, used)
-      else:
-        out_grads[i] = _AggregateIndexedSlicesGradients(out_grad)
-    else:  # not out_grad
-      # out_grads[i] is [], thus its aggregation is simply None.
-      out_grads[i] = None
-  return out_grads
-
-
-def _AggregateIndexedSlicesGradients(grads):
-  """Aggregates gradients of type `IndexedSlices` by concatenation."""
-  if len(grads) < 1:
-    return None
-  elif len(grads) == 1:
-    return grads[0]
-  else:
-    grads = math_ops._as_indexed_slices_list(  # pylint: disable=protected-access
-        [g for g in grads if g is not None])
-    grads = [_HandleNestedIndexedSlices(x) for x in grads]  # pylint: disable=protected-access
-    # Form IndexedSlices out of the concatenated values and indices.
-    concat_grad = ops.IndexedSlices(
-        array_ops.concat([x.values for x in grads], axis=0),
-        array_ops.concat([x.indices for x in grads], axis=0),
-        grads[0].dense_shape)
-
-    return concat_grad
+  # pylint: disable=protected-access
+  with ops.get_default_graph()._mutation_lock():
+    return gradients_util._GradientsHelper(
+        ys, xs, grad_ys, name, True, gate_gradients,
+        aggregation_method, stop_gradients,
+        unconnected_gradients)
+  # pylint: enable=protected-access
 
 
 # TODO(vrv): Make this available when we want to make it public.
@@ -1393,7 +361,7 @@ def hessians(ys,
     LookupError: if one of the operations between `xs` and `ys` does not
       have a registered gradient function.
   """
-  xs = _AsList(xs)
+  xs = gradients_util._AsList(xs)  # pylint: disable=protected-access
   kwargs = {
       "colocate_gradients_with_ops": colocate_gradients_with_ops,
       "gate_gradients": gate_gradients,
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index c53afef63bc1d2fc1ba1927c687f7ecad4eb46a4..9d6ac46c049157210ecc764c20354f61e12f9ca0 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import data_flow_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import functional_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
@@ -586,11 +587,12 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       def Foo():
         x = constant_op.constant(10.0, name="x")
         y = math_ops.multiply(x, c, name="y")
-        z = math_ops.multiply(y, 3.0, name="z")
+        # Regression test for b/122564611.
+        z = math_ops.multiply(c, y, name="z")
         g = gradients_impl.gradients(z, x)
         return g[0]
 
-      self.assertEqual(Foo().numpy(), 6.0)
+      self.assertEqual(Foo().numpy(), 4.0)
 
 
 class StopGradientTest(test_util.TensorFlowTestCase):
@@ -1031,6 +1033,42 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(g.eval(), [2.0])
       self.assertAllEqual(g.eval(feed_dict={conditional: False}), [3.0])
 
+  def testRecursiveCustomGradient(self):
+    @custom_gradient.custom_gradient
+    def F(x):
+      out = core_layers.dense(x, 3, use_bias=False)
+
+      def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
+        self.assertEqual(1, len(variables))
+        grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad)
+        return grads[0], [array_ops.ones((4, 3))]
+
+      return out, Grad
+
+    @custom_gradient.custom_gradient
+    def DoubleF(x):
+      out = F(x)
+
+      def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
+        self.assertEqual(1, len(variables))
+        grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad)
+        return grads[0], [array_ops.ones((4, 3))]
+
+      return out, Grad
+    with ops.Graph().as_default():
+      x = array_ops.ones((2, 4))
+      with variable_scope.variable_scope("f", use_resource=True) as vs:
+        y = DoubleF(x)
+        all_vars = vs.global_variables()
+        assert len(all_vars) == 1
+      grads = gradients.gradients(y, [x, all_vars[0]])
+      for g in grads:
+        self.assertIsNotNone(g)
+      with session.Session() as sess:
+        self.evaluate(variables.global_variables_initializer())
+        dw = sess.run(math_ops.reduce_sum(grads[1]))
+        self.assertEqual(12., dw)
+
 
 class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
 
@@ -1040,12 +1078,12 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         self.evaluate(ops.convert_to_tensor(right)))
 
   def testNoGradients(self):
-    self.assertIsNone(gradients_impl._AggregateIndexedSlicesGradients([]))
+    self.assertIsNone(gradients_util._AggregateIndexedSlicesGradients([]))
 
   def testOneGradient(self):
     t = math_ops._as_indexed_slices(constant_op.constant(
         [[1., 2.], [0, 0], [3., 4.]]))
-    result = gradients_impl._AggregateIndexedSlicesGradients([t])
+    result = gradients_util._AggregateIndexedSlicesGradients([t])
     self._assert_indexed_slices_equal(t, result)
 
   def testMultipleGradients(self):
@@ -1055,7 +1093,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         [[0., 0.], [5, 6], [7., 8.]]))
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1])
     self._assert_indexed_slices_equal(total, result)
 
   def testMultipleGradientsWithNones(self):
@@ -1066,7 +1104,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
     t3 = None
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1, t3])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1, t3])
     self._assert_indexed_slices_equal(total, result)
 
   def testMixedTensorAndIndexedSlices(self):
@@ -1076,7 +1114,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         [[0., 0.], [5, 6], [7., 8.]])
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1])
     self._assert_indexed_slices_equal(total, result)
 
 
diff --git a/tensorflow/python/ops/gradients_util.py b/tensorflow/python/ops/gradients_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..af46101726d15cd1c738cd11c88a10c5a0d0c842
--- /dev/null
+++ b/tensorflow/python/ops/gradients_util.py
@@ -0,0 +1,1075 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements the graph generation for computation of gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import contextlib
+import warnings
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function as framework_function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework.func_graph import FuncGraph
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
+
+
+# Warn the user if we convert a sparse representation to dense with at
+# least this number of elements.
+_LARGE_SPARSE_NUM_ELEMENTS = 100000000
+
+
+def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False):
+  """Converts an IndexedSlices object `value` to a Tensor.
+
+  NOTE(mrry): This function is potentially expensive.
+
+  Args:
+    value: An ops.IndexedSlices object.
+    dtype: The dtype of the Tensor to be returned.
+    name: Optional name to use for the returned Tensor.
+    as_ref: True if a ref is requested.
+
+  Returns:
+    A dense Tensor representing the values in the given IndexedSlices.
+
+  Raises:
+    ValueError: If the IndexedSlices does not have the same dtype.
+  """
+  _ = as_ref
+  if dtype and not dtype.is_compatible_with(value.dtype):
+    raise ValueError(
+        "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" %
+        (dtype.name, value.dtype.name))
+  if value.dense_shape is None:
+    raise ValueError(
+        "Tensor conversion requested for IndexedSlices without dense_shape: %s"
+        % str(value))
+  # TODO(mrry): Consider adding static shape information to
+  # IndexedSlices, to avoid using numpy here.
+  if not context.executing_eagerly():
+    dense_shape_value = tensor_util.constant_value(value.dense_shape)
+    if dense_shape_value is not None:
+      num_elements = np.prod(dense_shape_value)
+      if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
+        warnings.warn(
+            "Converting sparse IndexedSlices to a dense Tensor with %d "
+            "elements. This may consume a large amount of memory." %
+            num_elements)
+    else:
+      warnings.warn(
+          "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
+          "This may consume a large amount of memory.")
+  return math_ops.unsorted_segment_sum(
+      value.values, value.indices, value.dense_shape[0], name=name)
+
+
+ops.register_tensor_conversion_function(ops.IndexedSlices,
+                                        _IndexedSlicesToTensor)
+
+
+def _MarkReachedOps(from_ops, reached_ops, func_graphs):
+  """Mark all ops reached from "from_ops".
+
+  Args:
+    from_ops: list of Operations.
+    reached_ops: set of Operations.
+    func_graphs: list of FuncGraphs. This method will traverse through
+      these functions if they capture from_ops or any reachable ops.
+  """
+  queue = collections.deque()
+  queue.extend(from_ops)
+  while queue:
+    op = queue.popleft()
+    if op not in reached_ops:
+      reached_ops.add(op)
+      for output in op.outputs:
+        if _IsBackpropagatable(output):
+          queue.extend(_Consumers(output, func_graphs))
+
+
+def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops, func_graphs,
+                  xs):
+  """Initialize the pending count for ops between two lists of Operations.
+
+  'pending_count[op]' indicates the number of backprop inputs
+  to this operation.
+
+  Args:
+    to_ops: list of Operations.
+    from_ops: list of Operations.
+    colocate_gradients_with_ops: Python bool.  See docstring of gradients().
+    func_graphs: list of FuncGraphs. This method will traverse through
+      these functions if they capture from_ops or any reachable ops. This is
+      useful if to_ops occur in a function and from_ops are in an outer function
+      or graph.
+    xs: list of Tensors.
+
+  Returns:
+    A tuple containing: (1) the subset of to_ops reachable from from_ops by a
+    path of zero or more backpropagatable tensors, (2) a mapping from operation
+    to the number of backprop inputs to that op, and (3) a ControlFlowState
+    object which is not None if the ops between from_ops and to_ops contain
+    control flow loops.
+  """
+  # Mark reachable ops from from_ops.
+  reached_ops = set()
+  _MarkReachedOps(from_ops, reached_ops, func_graphs)
+  # X in reached_ops iff X is reachable from from_ops by a path of zero or more
+  # backpropagatable tensors.
+
+  reachable_to_ops = set(op for op in to_ops if op in reached_ops)
+
+  # Mark between ops.
+  between_ops = set()
+  between_op_list = []
+  queue = collections.deque()
+  queue.extend(to_ops)
+  while queue:
+    op = queue.popleft()
+    # We are interested in this op.
+    if op in reached_ops:
+      between_ops.add(op)
+      between_op_list.append(op)
+      # Clear the boolean so we won't add the inputs again.
+      reached_ops.remove(op)
+      for inp in _NonEagerInputs(op, xs):
+        queue.append(inp.op)
+  # X in between_ops iff X is on a path of zero or more backpropagatable tensors
+  # between from_ops and to_ops
+
+  # 'loop_state' is None if there are no while loops.
+  loop_state = control_flow_ops.MaybeCreateControlFlowState(
+      between_op_list, between_ops, colocate_gradients_with_ops)
+
+  # Initialize pending count for between ops.
+  pending_count = collections.defaultdict(int)
+  for op in between_op_list:
+    for x in _NonEagerInputs(op, xs):
+      if x.op in between_ops:
+        pending_count[x.op] += 1
+
+  return reachable_to_ops, pending_count, loop_state
+
+
+def _AsList(x):
+  return x if isinstance(x, (list, tuple)) else [x]
+
+
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
+  """Fill in default values for grad_ys.
+
+  Args:
+    grad_ys: List of gradients, can contain None.
+    ys: List of tensors.
+    colocate_gradients_with_ops: If True, try colocating gradients with
+      the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
+
+  Returns:
+    A list of gradients to use, without None.
+
+  Raises:
+    ValueError: If sizes of gradients and inputs don't match
+    TypeError: If type of any gradient is not valid for its input.
+  """
+  if len(grad_ys) != len(ys):
+    raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
+  grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
+  new_grad_ys = []
+  for i in xrange(len(grad_ys)):
+    grad_y = grad_ys[i]
+    y = ys[i]
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
+      if grad_y is None:
+        if y.dtype.is_complex:
+          raise TypeError(
+              "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
+              y.dtype)
+        new_grad_ys.append(
+            array_ops.fill(
+                array_ops.shape(y),
+                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
+        continue
+      if y.dtype.is_floating or y.dtype.is_integer:
+        if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
+          raise TypeError(
+              "Gradient type %s generated for real or "
+              "integer-valued tensor %s with type %s must be "
+              "real or integer" % (dtypes.as_dtype(grad_y.dtype).name, y,
+                                   dtypes.as_dtype(y.dtype).name))
+      elif y.dtype.is_complex:
+        if not grad_y.dtype.is_complex:
+          raise TypeError(
+              "Gradient type %s generated for complex-valued "
+              "tensor %s with type %s must be real" % (dtypes.as_dtype(
+                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
+      elif y.dtype == dtypes.variant:
+        if grad_y.dtype != dtypes.variant:
+          raise TypeError(
+              "Gradient type %s generated for variant "
+              "tensor %s with type %s must be variant" % (dtypes.as_dtype(
+                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
+      elif y.dtype == dtypes.resource:
+        # We assume y is the handle of a ResourceVariable. The gradient of a
+        # ResourceVariable should be a numeric value, not another resource.
+        if grad_y.dtype == dtypes.resource:
+          raise TypeError("Input gradient %s for resource tensor %s should not "
+                          "be a resource" % (grad_y, y))
+      else:
+        raise TypeError(
+            "Tensor %s with type %s must be numeric "
+            "to obtain a default gradient" % (y, dtypes.as_dtype(y.dtype).name))
+      # Create a grad_y tensor in the name scope of the gradient.
+      # Required for TensorArrays to identify which gradient call a
+      # grad_y value is coming from.
+      if isinstance(grad_y, ops.IndexedSlices):
+        new_grad_ys.append(
+            ops.IndexedSlices(
+                indices=(array_ops.identity(
+                    grad_y.indices, name="grad_ys_%d_indices" % i)
+                         if isinstance(grad_y.indices, ops.Tensor) else
+                         grad_y.indices),
+                values=(array_ops.identity(
+                    grad_y.values, name="grad_ys_%d_values" % i) if isinstance(
+                        grad_y.values, ops.Tensor) else grad_y.values),
+                dense_shape=(array_ops.identity(
+                    grad_y.dense_shape, name="grad_ys_%d_shape" % i)
+                             if isinstance(grad_y.dense_shape, ops.Tensor) else
+                             grad_y.dense_shape)))
+      else:
+        new_grad_ys.append(array_ops.identity(grad_y, name="grad_ys_%d" % i))
+
+  return new_grad_ys
+
+
+def IsTrainable(tensor_or_dtype):
+  if isinstance(tensor_or_dtype, ops.Tensor):
+    dtype = tensor_or_dtype.dtype
+  else:
+    dtype = tensor_or_dtype
+  dtype = dtypes.as_dtype(dtype)
+  return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
+                              dtypes.complex64, dtypes.complex128,
+                              dtypes.resource, dtypes.variant)
+
+
+def _IsBackpropagatable(tensor):
+  if IsTrainable(tensor):
+    return True
+  dtype = dtypes.as_dtype(tensor.dtype)
+  return dtype.base_dtype == dtypes.bfloat16
+
+
+def _VerifyGeneratedGradients(grads, op):
+  """Verify that gradients are valid in number and type.
+
+  Args:
+    grads: List of generated gradients.
+    op: Operation for which the gradients where generated.
+
+  Raises:
+    ValueError: if sizes of gradients and inputs don't match.
+    TypeError: if type of any gradient is not valid for its input.
+  """
+  # While ops have inputs added to them during the gradient computation, so we
+  # skip the below check. See while_v2 for details.
+  if op.type == "While": return
+
+  if len(grads) != len(op.inputs):
+    raise ValueError("Num gradients %d generated for op %s do not match num "
+                     "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
+
+
+def _StopOps(from_ops, stop_gradient_ops, pending_count, xs):
+  """The set of ops that terminate the gradient computation.
+
+  This computes the frontier of the forward graph *before* which backprop
+  should stop. Operations in the returned set will not be differentiated.
+  This set is defined as the subset of `from_ops` containing ops that have
+  no predecessor in `from_ops`. `pending_count` is the result of
+  `_PendingCount(xs, from_ops)`. An 'op' has predecessors in `from_ops`
+  iff pending_count[op] > 0.
+
+  In addition, none of `stop_gradient_ops` will be differentiated.
+
+  Args:
+    from_ops: list of Operations.
+    stop_gradient_ops: list of Operations never to backprop through.
+    pending_count: mapping from operation to number of backprop inputs.
+    xs: list of Tensors.
+
+  Returns:
+    The set of operations.
+  """
+  stop_ops = set()
+  for op in from_ops:
+    is_stop_op = True
+    for inp in _NonEagerInputs(op, xs):
+      if pending_count[inp.op] > 0:
+        is_stop_op = False
+        break
+    if is_stop_op:
+      stop_ops.add(op)
+  stop_ops.update(op for op in stop_gradient_ops)
+  return stop_ops
+
+
+@contextlib.contextmanager
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
+  """Context to colocate with `op` if `colocate_gradients_with_ops`."""
+  if colocate_gradients_with_ops:
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
+      yield
+  else:
+    yield
+
+
+def _IsPartitionedCall(op):
+  return op.type == "PartitionedCall" or op.type == "StatefulPartitionedCall"
+
+
+def _SymGrad(op, out_grads):
+  """Backprop through a function call node op given its outputs' gradients."""
+  f_in = [x for x in op.inputs] + out_grads
+  f_types = [x.dtype for x in op.inputs]
+  f = attr_value_pb2.NameAttrList()
+  if _IsPartitionedCall(op):
+    f.name = op.get_attr("f").name
+  else:
+    f.name = op.type
+  for k in op.node_def.attr:
+    f.attr[k].CopyFrom(op.node_def.attr[k])
+  # TODO(apassos) use a better dtype here
+  in_grads = functional_ops.symbolic_gradient(
+      input=f_in,
+      Tout=[x if x != dtypes.resource else dtypes.float32 for x in f_types],
+      f=f)
+  return in_grads
+
+
+def _MaybeCompile(scope, op, func, grad_fn):
+  """Compile the calculation in grad_fn if op was marked as compiled."""
+  scope = scope.rstrip("/").replace("/", "_")
+  if func is not None:
+    xla_compile = func.definition.attr["_XlaCompile"].b
+    xla_separate_compiled_gradients = func.definition.attr[
+        "_XlaSeparateCompiledGradients"].b
+    xla_scope = func.definition.attr["_XlaScope"].s.decode()
+  else:
+    try:
+      xla_compile = op.get_attr("_XlaCompile")
+      xla_separate_compiled_gradients = op.get_attr(
+          "_XlaSeparateCompiledGradients")
+      xla_scope = op.get_attr("_XlaScope").decode()
+    except ValueError:
+      return grad_fn()  # Exit early
+
+  if not xla_compile:
+    return grad_fn()  # Exit early
+
+  # If the gradients are supposed to be compiled separately, we give them a
+  # _XlaScope name that is based on the name_scope of the gradients.  Otherwise
+  # they just inherit the existing _XlaScope name, which lets them be merged
+  # together with the non-gradient computation.
+  if xla_separate_compiled_gradients:
+    xla_grad_scope = "%s_grad_%s" % (xla_scope, scope)
+  else:
+    xla_grad_scope = xla_scope
+
+  attrs = {
+      "_XlaCompile": attr_value_pb2.AttrValue(b=xla_compile),
+      "_XlaScope": attr_value_pb2.AttrValue(s=xla_grad_scope.encode())
+  }
+  with ops.get_default_graph()._attr_scope(attrs):  # pylint: disable=protected-access
+    return grad_fn()
+
+
+def _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs):
+  """Raises an error if we backprop through a loop var."""
+  # Find the nearest 'to_op' reachable from 'op' to provide a more helpful error
+  # message.
+  target_op = None
+  queue = collections.deque([op])
+  visited = set()
+  while queue:
+    curr_op = queue.popleft()
+    if curr_op in visited: continue
+    visited.add(curr_op)
+    if curr_op in from_ops:
+      target_op = curr_op
+      break
+    queue.extend(t.op for t in _NonEagerInputs(curr_op, xs))
+  assert target_op
+  raise ValueError(
+      "Cannot compute gradient inside while loop with respect to op '%s'. "
+      "We do not support taking the gradient wrt or through the initial value "
+      "of a loop variable. Gradients can be computed through loop invariants "
+      "or wrt the input parameters to the loop body."
+      % target_op.name)
+
+
+def _IsFunction(graph):
+  return (isinstance(graph, FuncGraph) or
+          isinstance(graph, framework_function._FuncGraph))  # pylint: disable=protected-access
+
+
+def _Captures(func_graph):
+  if isinstance(func_graph, FuncGraph):
+    return func_graph.captures
+  else:
+    assert isinstance(func_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
+    return func_graph._captured  # pylint: disable=protected-access
+
+
+def _MaybeCaptured(t):
+  """If t is a captured value placeholder, returns the original captured value.
+
+  Args:
+    t: Tensor
+
+  Returns:
+    A tensor, potentially from a different Graph/FuncGraph.
+  """
+  # pylint: disable=protected-access
+  if (not isinstance(t, ops.EagerTensor) and
+      _IsFunction(t.op.graph) and t.op.type == "Placeholder"):
+    for input_t, placeholder_t in _Captures(t.op.graph).items():
+      if t == placeholder_t:
+        return _MaybeCaptured(input_t)
+  # pylint: enable=protected-access
+  return t
+
+
+def _NonEagerInputs(op, xs):
+  """Returns the inputs of op, crossing closure boundaries where necessary.
+
+  Does not return any captured EagerTensors, i.e., the number of tensors
+  returned may be less than than the actual number of inputs.
+
+  Args:
+    op: Operation
+    xs: list of Tensors we are differentiating w.r.t.
+
+  Returns:
+    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
+    is in a FuncGraph and has captured inputs.
+  """
+  return [t for t in _Inputs(op, xs) if not isinstance(t, ops.EagerTensor)]
+
+
+# TODO(skyewm): plumbing xs through everywhere is ugly, consider making
+# _GradientsHelper a class with xs as a member variable.
+def _Inputs(op, xs):
+  """Returns the inputs of op, crossing closure boundaries where necessary.
+
+  Args:
+    op: Operation
+    xs: list of Tensors we are differentiating w.r.t.
+
+  Returns:
+    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
+    is in a FuncGraph and has captured inputs.
+  """
+  if _IsFunction(op.graph):  # pylint: disable=protected-access
+    inputs = []
+    for t in op.inputs:
+      # If we're differentiating w.r.t. `t`, do not attempt to traverse through
+      # it to a captured value. The algorithm needs to "see" `t` in this case,
+      # even if it's a function input for a captured value, whereas usually we'd
+      # like to traverse through these closures as if the captured value was the
+      # direct input to op.
+      if t not in xs:
+        t = _MaybeCaptured(t)
+      inputs.append(t)
+    return inputs
+  else:
+    return op.inputs
+
+
+def _Consumers(t, func_graphs):
+  """Returns the consumers of t, crossing closure boundaries where necessary.
+
+  Args:
+    t: Tensor
+    func_graphs: a list of FuncGraphs that may have captured t.
+
+  Returns:
+    A list of tensors. The tensors will be from the current graph and/or
+    func_graphs.
+  """
+  consumers = t.consumers()
+  for func in func_graphs:
+    for input_t, placeholder in _Captures(func).items():
+      if input_t == t:
+        consumers.extend(_Consumers(placeholder, func_graphs))
+  return consumers
+
+
+def _GradientsHelper(ys,
+                     xs,
+                     grad_ys=None,
+                     name="gradients",
+                     colocate_gradients_with_ops=False,
+                     gate_gradients=False,
+                     aggregation_method=None,
+                     stop_gradients=None,
+                     unconnected_gradients=UnconnectedGradients.NONE,
+                     src_graph=None):
+  """Implementation of gradients()."""
+  if context.executing_eagerly():
+    raise RuntimeError("tf.gradients is not supported when eager execution "
+                       "is enabled. Use tf.GradientTape instead.")
+  if src_graph is None:
+    src_graph = ops.get_default_graph()
+  try:
+    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
+  except ValueError:
+    raise ValueError(
+        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
+  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
+  # ancestor graphs. This is necessary for correctly handling captured values.
+  func_graphs = []
+  curr_graph = src_graph
+  while _IsFunction(curr_graph):
+    func_graphs.append(curr_graph)
+    if isinstance(curr_graph, FuncGraph):
+      curr_graph = curr_graph.outer_graph
+    else:
+      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
+      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access
+
+  ys = _AsList(ys)
+  xs = _AsList(xs)
+  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
+  if grad_ys is None:
+    grad_ys = [None] * len(ys)
+  else:
+    grad_ys = _AsList(grad_ys)
+
+  with ops.name_scope(
+      name, "gradients",
+      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
+    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
+    xs = [
+        x.handle if resource_variable_ops.is_resource_variable(x) else x
+        for x in xs
+    ]
+    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
+        xs, name="x", as_ref=True)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
+
+    # The approach we take here is as follows: Create a list of all ops in the
+    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
+    # to ensure that when we visit an op the gradients w.r.t its outputs have
+    # been collected.  Then aggregate these gradients if needed, call the op's
+    # gradient function, and add the generated gradients to the gradients for
+    # its input.
+
+    # Initialize the pending count for ops in the connected subgraph from ys
+    # to the xs.
+    to_ops = [t.op for t in ys]
+    from_ops = [t.op for t in xs]
+    stop_gradient_ops = [t.op for t in stop_gradients]
+    reachable_to_ops, pending_count, loop_state = _PendingCount(
+        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)
+
+    # Iterate over the collected ops.
+    #
+    # grads: op => list of gradients received on each output endpoint of the
+    # op.  The gradients for each endpoint are initially collected as a list.
+    # When it is time to call the op's gradient function, for each endpoint we
+    # aggregate the list of received gradients into a Add() Operation if there
+    # is more than one.
+    grads = {}
+
+    # Add the initial gradients for the ys.
+    for y, grad_y in zip(ys, grad_ys):
+      _SetGrad(grads, y, grad_y)
+
+    # Initialize queue with to_ops.
+    queue = collections.deque()
+    # Add the ops in 'to_ops' into the queue.
+    to_ops_set = set()
+    for op in to_ops:
+      # 'ready' handles the case where one output gradient relies on
+      # another output's gradient.
+      ready = (pending_count[op] == 0)
+      if ready and op not in to_ops_set and op in reachable_to_ops:
+        to_ops_set.add(op)
+        queue.append(op)
+
+    if loop_state:
+      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
+      for y in loop_exits:
+        if IsTrainable(y):
+          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
+          queue.append(y.op)
+
+    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
+    while queue:
+      # generate gradient subgraph for op.
+      op = queue.popleft()
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
+        if loop_state:
+          loop_state.EnterGradWhileContext(op, before=True)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
+        if loop_state:
+          loop_state.ExitGradWhileContext(op, before=True)
+
+        grad_fn = None
+        func_call = None
+        is_partitioned_call = _IsPartitionedCall(op)
+        # pylint: disable=protected-access
+        is_func_call = (
+            src_graph._is_function(op.type) or is_partitioned_call)
+        # pylint: enable=protected-access
+        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
+        if has_out_grads and (op not in stop_ops):
+          try:
+            grad_fn = ops.get_gradient_function(op)
+          except LookupError:
+            if is_func_call:
+              if is_partitioned_call:
+                func_call = src_graph._get_function(  # pylint: disable=protected-access
+                    compat.as_bytes(op.get_attr("f").name))
+              else:
+                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
+              # Note that __defun is not set if the graph is
+              # imported. If it's set, we prefer to access the original
+              # defun.
+              func_call = getattr(op, "__defun", func_call)
+              grad_fn = func_call.python_grad_func
+            else:
+              raise LookupError(
+                  "No gradient defined for operation '%s' (op type: %s)" %
+                  (op.name, op.type))
+        if loop_state:
+          loop_state.EnterGradWhileContext(op, before=False)
+
+        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
+        # unless it's within the context of a single iteration (i.e. the
+        # gradient is wrt to the loop parameter in the body function, not wrt or
+        # through the initial value). This means if we're in a while loop
+        # context, we should never see a switch node from this context.
+        # pylint: disable=protected-access
+        if (control_flow_util.IsSwitch(op) and
+            op._control_flow_context is not None and
+            op._control_flow_context.IsWhileContext() and
+            op._control_flow_context ==
+            ops.get_default_graph()._get_control_flow_context()):
+          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
+        # pylint: enable=protected-access
+
+        if (grad_fn or is_func_call) and has_out_grads:
+          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
+          # output, it means that the cost does not depend on output[i],
+          # therefore dC/doutput[i] is 0.
+          for i, out_grad in enumerate(out_grads):
+            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
+                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
+              # Only trainable outputs or outputs for a function call that
+              # will use SymbolicGradient get a zero gradient. Gradient
+              # functions should ignore the gradient for other outputs.
+              # TODO(apassos) gradients of resource handles might be an
+              # issue here because of zeros.
+              if loop_state:
+                out_grads[i] = loop_state.ZerosLike(op, i)
+              else:
+                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
+          with ops.name_scope(op.name + "_grad"):
+            # pylint: disable=protected-access
+            with src_graph._original_op(op):
+              # pylint: enable=protected-access
+              if grad_fn:
+                # If grad_fn was found, do not use SymbolicGradient even for
+                # functions.
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: grad_fn(op, *out_grads))
+              else:
+                # For function call ops, we add a 'SymbolicGradient'
+                # node to the graph to compute gradients.
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: _SymGrad(op, out_grads))
+              in_grads = _AsList(in_grads)
+              _VerifyGeneratedGradients(in_grads, op)
+              if gate_gradients and len([x for x in in_grads
+                                         if x is not None]) > 1:
+                with ops.device(None):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
+                    in_grads = control_flow_ops.tuple(in_grads)
+          _LogOpGradients(op, out_grads, in_grads)
+        else:
+          # If no grad_fn is defined or none of out_grads is available,
+          # just propagate a list of None backwards.
+          in_grads = [None] * len(_Inputs(op, xs))
+        # Note: we don't filter out eager inputs here because the inputs need to
+        # line up with in_grads.
+        for i, (t_in, in_grad) in enumerate(zip(_Inputs(op, xs), in_grads)):
+          if in_grad is not None:
+            if (isinstance(in_grad, ops.Tensor) and
+                t_in.dtype != dtypes.resource):
+              try:
+                in_grad.set_shape(t_in.get_shape())
+              except ValueError:
+                raise ValueError(
+                    "Incompatible shapes between op input and calculated "
+                    "input gradient.  Forward operation: %s.  Input index: %d. "
+                    "Original input shape: %s.  "
+                    "Calculated input gradient shape: %s" %
+                    (op.name, i, t_in.shape, in_grad.shape))
+            if not isinstance(t_in, ops.EagerTensor):
+              _SetGrad(grads, t_in, in_grad)
+        if loop_state:
+          loop_state.ExitGradWhileContext(op, before=False)
+
+      # Update pending count for the inputs of op and enqueue ready ops.
+      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
+                                    xs)
+
+  if loop_state:
+    loop_state.PostProcessing()
+  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
+
+
+def _HasAnyNotNoneGrads(grads, op):
+  """Return true iff op has real gradient."""
+  out_grads = _GetGrads(grads, op)
+  for out_grad in out_grads:
+    if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
+      return True
+    if out_grad and isinstance(out_grad, collections.Sequence):
+      if any(g is not None for g in out_grad):
+        return True
+  return False
+
+
+def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
+                                  xs):
+  """Update pending count for the inputs of op and enqueue ready ops."""
+  for x in _NonEagerInputs(op, xs):
+    pending_count[x.op] -= 1
+    ready = (pending_count[x.op] == 0)
+    if loop_state and not ready:
+      ready = pending_count[x.op] > 0 and control_flow_util.IsLoopSwitch(x.op)
+    if ready:
+      if control_flow_util.IsLoopExit(x.op):
+        # if x is an exit without real gradient, defer processing them.
+        grad_state = loop_state.GetGradState(x.op, before=False)
+        grad_state.deferred_exits.append(x)
+        grad_state.pending_exits_count -= 1
+        if grad_state.pending_exits_count == 0:
+          # We now have all the exits so process them.
+          has_not_none_grad = False
+          for y in grad_state.deferred_exits:
+            if _HasAnyNotNoneGrads(grads, y.op):
+              has_not_none_grad = True
+              queue.append(y.op)
+            else:
+              grad_state.unused_exits.append(y)
+          if has_not_none_grad:
+            # For an unused exit, if it has trainable outputs, backprop
+            # a zero gradient. Otherwise, just ignore it.
+            for y in grad_state.unused_exits:
+              if IsTrainable(y):
+                _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
+              queue.append(y.op)
+          else:
+            # All exits are "unused" so use None as gradient.
+            for y in grad_state.unused_exits:
+              queue.append(y.op)
+      else:
+        queue.append(x.op)
+
+
+def _SetGrad(grads, t, grad):
+  """Sets gradient "grad" in "grads" for tensor "t"."""
+  op = t.op
+  op_grads = grads.get(op)
+  if not op_grads:
+    op_grads = [[] for _ in xrange(len(op.outputs))]
+    grads[op] = op_grads
+  t_grads = op_grads[t.value_index]
+  if isinstance(t_grads, list):
+    t_grads.append(grad)
+  else:
+    assert control_flow_util.IsLoopSwitch(op)
+    op_grads[t.value_index] = grad
+
+
+def _GetGrad(grads, t, unconnected_gradients):
+  """Gets gradient for tensor "t"."""
+  op = t.op
+  op_grads = grads.get(op)
+  if not op_grads:
+    if unconnected_gradients == UnconnectedGradients.ZERO:
+      t_dtype = t.dtype if t.dtype != dtypes.resource else dtypes.float32
+      return array_ops.zeros_like(t, dtype=t_dtype)
+    elif unconnected_gradients == UnconnectedGradients.NONE:
+      return None
+    else:
+      raise ValueError(
+          "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
+  t_grad = op_grads[t.value_index]
+  assert not isinstance(
+      t_grad, list), ("gradients list should have been aggregated by now.")
+  return t_grad
+
+
+def _GetGrads(grads, op):
+  """Gets all gradients for op."""
+  if op in grads:
+    return grads[op]
+  else:
+    return [[] for _ in xrange(len(op.outputs))]
+
+
+def _HandleNestedIndexedSlices(grad):
+  assert isinstance(grad, ops.IndexedSlices)
+  if isinstance(grad.values, ops.Tensor):
+    return grad
+  else:
+    assert isinstance(grad.values, ops.IndexedSlices)
+    g = _HandleNestedIndexedSlices(grad.values)
+    return ops.IndexedSlices(g.values, array_ops.gather(
+        grad.indices, g.indices), g.dense_shape)
+
+
+def _AccumulatorShape(inputs):
+  shape = tensor_shape.unknown_shape()
+  for i in inputs:
+    if isinstance(i, ops.Tensor):
+      shape = shape.merge_with(i.get_shape())
+  return shape
+
+
+def _LogOpGradients(op, out_grads, in_grads):
+  """Log the in and out grads of an op."""
+  logging.vlog(1, "Gradient for '" + op.name + "'")
+
+  def _FilterGrad(x):
+    if x is None:
+      return False
+    if isinstance(x, (list, tuple)):
+      return bool(x)
+    else:
+      return True
+
+  logging.vlog(1, "  in  --> %s",
+               ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
+  logging.vlog(1, "  out --> %s",
+               ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
+
+
+def _MultiDeviceAddN(tensor_list, gradient_uid):
+  """Adds tensors from potentially multiple devices."""
+  # Basic function structure comes from control_flow_ops.group().
+  # Sort tensors according to their devices.
+  tensors_on_device = collections.defaultdict(lambda: [])
+  for tensor in tensor_list:
+    tensors_on_device[tensor.device].append(tensor)
+
+  # For each device, add the tensors on that device first.
+  # Then gather the partial sums from multiple devices.
+  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
+  # E.g., aggregate per GPU, then per task, and so on.
+  summands = []
+
+  def DeviceKey(dev):
+    return "" if dev is None else dev
+
+  for dev in sorted(tensors_on_device, key=DeviceKey):
+    tensors = tensors_on_device[dev]
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
+      summands.append(math_ops.add_n(tensors))
+
+  return math_ops.add_n(summands)
+
+
+@tf_export("AggregationMethod")
+class AggregationMethod(object):
+  """A class listing aggregation methods used to combine gradients.
+
+  Computing partial derivatives can require aggregating gradient
+  contributions. This class lists the various methods that can
+  be used to combine gradients in the graph:
+
+  *  `ADD_N`: All of the gradient terms are summed as part of one
+     operation using the "AddN" op. It has the property that all
+     gradients must be ready before any aggregation is performed.
+  *  `DEFAULT`: The system-chosen default aggregation method.
+  """
+  ADD_N = 0
+  DEFAULT = ADD_N
+  # The following are experimental and may not be supported in future releases.
+  EXPERIMENTAL_TREE = 1
+  EXPERIMENTAL_ACCUMULATE_N = 2
+
+
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
+  """Get the aggregated gradients for op.
+
+  Args:
+    grads: The map of memoized gradients.
+    op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
+    loop_state: An object for maintaining the state of the while loops in the
+                graph. It is of type ControlFlowState. None if the graph
+                contains no while loops.
+    aggregation_method: Specifies the method used to combine gradient terms.
+      Accepted values are constants defined in the class `AggregationMethod`.
+
+  Returns:
+    A list of gradients, one per each output of `op`. If the gradients
+      for a particular output is a list, this function aggregates it
+      before returning.
+
+  Raises:
+    TypeError: if the incoming grads are not Tensors or IndexedSlices.
+    ValueError: if the arguments are invalid.
+
+  """
+  if aggregation_method is None:
+    aggregation_method = AggregationMethod.DEFAULT
+  if aggregation_method not in [
+      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
+      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+  ]:
+    raise ValueError(
+        "Invalid aggregation_method specified %s." % aggregation_method)
+  out_grads = _GetGrads(grads, op)
+  for i, out_grad in enumerate(out_grads):
+    if loop_state:
+      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
+        assert control_flow_util.IsLoopSwitch(op)
+        continue
+    # Grads have to be Tensors or IndexedSlices
+    if (isinstance(out_grad, collections.Sequence) and not all(
+        isinstance(g, (ops.Tensor, ops.IndexedSlices))
+        for g in out_grad
+        if g is not None
+    )):
+      raise TypeError("gradients have to be either all Tensors "
+                      "or all IndexedSlices")
+    # Aggregate multiple gradients, and convert [] to None.
+    if out_grad:
+      if len(out_grad) < 2:
+        used = "nop"
+        out_grads[i] = out_grad[0]
+      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
+        tensor_shape = _AccumulatorShape(out_grad)
+        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
+          # The benefit of using AccumulateN is that its inputs can be combined
+          # in any order and this can allow the expression to be evaluated with
+          # a smaller memory footprint.  When used with gpu_allocator_retry,
+          # it is possible to compute a sum of terms which are much larger than
+          # total GPU memory.
+          # AccumulateN can currently only be used if we know the shape for
+          # an accumulator variable.  If this is not known, or if we only have
+          # 2 grads then we fall through to the "tree" case below.
+          used = "accumulate_n"
+          out_grads[i] = math_ops.accumulate_n(out_grad)
+        elif aggregation_method in [
+            AggregationMethod.EXPERIMENTAL_TREE,
+            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+        ]:
+          # Aggregate all gradients by doing pairwise sums: this may
+          # reduce performance, but it can improve memory because the
+          # gradients can be released earlier.
+          #
+          # TODO(vrv): Consider replacing this with a version of
+          # tf.AddN() that eagerly frees its inputs as soon as they are
+          # ready, so the order of this tree does not become a problem.
+          used = "tree"
+          with ops.name_scope(op.name + "_gradient_sum"):
+            running_sum = out_grad[0]
+            for grad in out_grad[1:]:
+              running_sum = math_ops.add_n([running_sum, grad])
+            out_grads[i] = running_sum
+        else:
+          used = "add_n"
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
+        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
+                     tensor_shape, used)
+      else:
+        out_grads[i] = _AggregateIndexedSlicesGradients(out_grad)
+    else:  # not out_grad
+      # out_grads[i] is [], thus its aggregation is simply None.
+      out_grads[i] = None
+  return out_grads
+
+
+def _AggregateIndexedSlicesGradients(grads):
+  """Aggregates gradients of type `IndexedSlices` by concatenation."""
+  if len(grads) < 1:
+    return None
+  elif len(grads) == 1:
+    return grads[0]
+  else:
+    grads = math_ops._as_indexed_slices_list(  # pylint: disable=protected-access
+        [g for g in grads if g is not None])
+    grads = [_HandleNestedIndexedSlices(x) for x in grads]  # pylint: disable=protected-access
+    # Form IndexedSlices out of the concatenated values and indices.
+    concat_grad = ops.IndexedSlices(
+        array_ops.concat([x.values for x in grads], axis=0),
+        array_ops.concat([x.indices for x in grads], axis=0),
+        grads[0].dense_shape)
+
+    return concat_grad
diff --git a/tensorflow/python/ops/image_grad.py b/tensorflow/python/ops/image_grad.py
index 102181e68b4d091872f8562e9912c1b517e39044..7d240dc6b63382155603da3b68f77a884aed7026 100644
--- a/tensorflow/python/ops/image_grad.py
+++ b/tensorflow/python/ops/image_grad.py
@@ -44,7 +44,8 @@ def _ResizeNearestNeighborGrad(op, grad):
   grads = gen_image_ops.resize_nearest_neighbor_grad(
       grad,
       image_shape,
-      align_corners=op.get_attr("align_corners"))
+      align_corners=op.get_attr("align_corners"),
+      half_pixel_centers=op.get_attr("half_pixel_centers"))
   return [grads, None]
 
 
@@ -60,10 +61,35 @@ def _ResizeBilinearGrad(op, grad):
     The gradients w.r.t. the input.
   """
   grad0 = gen_image_ops.resize_bilinear_grad(
-      grad, op.inputs[0], align_corners=op.get_attr("align_corners"))
+      grad,
+      op.inputs[0],
+      align_corners=op.get_attr("align_corners"),
+      half_pixel_centers=op.get_attr("half_pixel_centers"))
   return [grad0, None]
 
 
+@ops.RegisterGradient("ScaleAndTranslate")
+def _ScaleAndTranslateGrad(op, grad):
+  """The derivatives for ScaleAndTranslate transformation op.
+
+  Args:
+    op: The ScaleAndTranslate op.
+    grad: The tensor representing the gradient w.r.t. the output.
+
+  Returns:
+    The gradients w.r.t. the input.
+  """
+
+  grad0 = gen_image_ops.scale_and_translate_grad(
+      grad,
+      op.inputs[0],
+      op.inputs[2],
+      op.inputs[3],
+      kernel_type=op.get_attr("kernel_type"),
+      antialias=op.get_attr("antialias"))
+  return [grad0, None, None, None]
+
+
 @ops.RegisterGradient("ResizeBicubic")
 def _ResizeBicubicGrad(op, grad):
   """The derivatives for bicubic resizing.
@@ -79,7 +105,10 @@ def _ResizeBicubicGrad(op, grad):
   grad0 = None
   if op.inputs[0].dtype in allowed_types:
     grad0 = gen_image_ops.resize_bicubic_grad(
-        grad, op.inputs[0], align_corners=op.get_attr("align_corners"))
+        grad,
+        op.inputs[0],
+        align_corners=op.get_attr("align_corners"),
+        half_pixel_centers=op.get_attr("half_pixel_centers"))
   return [grad0, None]
 
 
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index c481266dd71c1300612dbc384d240d34b98b3599..ea41ea39f98fcc505c358efd8e2e656ac55c01b6 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
@@ -28,6 +29,7 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+@test_util.disable_all_xla('align_corners=False not supported by XLA')
 class ResizeNearestNeighborOpTest(test.TestCase):
 
   TYPES = [np.float32, np.float64]
@@ -39,7 +41,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
     for nptype in self.TYPES:
       x = np.arange(0, 4).reshape(in_shape).astype(nptype)
 
-      with self.cached_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True):
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_nearest_neighbor(input_tensor,
                                                        out_shape[1:3])
@@ -112,7 +114,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
     x = np.arange(0, 4).reshape(in_shape).astype(np.float32)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       input_tensor = constant_op.constant(x, shape=in_shape)
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       self.assertEqual(out_shape, list(resize_out.get_shape()))
@@ -155,7 +157,12 @@ class ResizeBilinearOpTest(test.TestCase):
 
     size = np.prod(in_shape)
     x = 1.0 / size * np.arange(0, size).reshape(in_shape).astype(np.float32)
-    for align_corners in [True, False]:
+
+    # Align corners will be deprecated for tf2.0 and the false version is not
+    # supported by XLA.
+    align_corner_options = [True
+                           ] if test_util.is_xla_enabled() else [True, False]
+    for align_corners in align_corner_options:
       grad = {}
       for use_gpu in [False, True]:
         with self.cached_session(use_gpu=use_gpu):
@@ -198,7 +205,7 @@ class ResizeBicubicOpTest(test.TestCase):
     x = np.arange(0, 4).reshape(in_shape).astype(np.float32)
 
     for align_corners in [True, False]:
-      with self.cached_session() as sess:
+      with self.cached_session():
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_bicubic(input_tensor, out_shape[1:3],
                                               align_corners=align_corners)
@@ -253,6 +260,70 @@ class ResizeBicubicOpTest(test.TestCase):
       self.assertEqual([None], grad)
 
 
+class ScaleAndTranslateOpTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testGrads(self):
+    in_shape = [1, 2, 3, 1]
+    out_shape = [1, 4, 6, 1]
+
+    x = np.arange(0, 6).reshape(in_shape).astype(np.float32)
+
+    kernel_types = [
+        'lanczos1', 'lanczos3', 'lanczos5', 'gaussian', 'box', 'triangle',
+        'keyscubic', 'mitchellcubic'
+    ]
+    scales = [(1.0, 1.0), (0.37, 0.47), (2.1, 2.1)]
+    translations = [(0.0, 0.0), (3.14, 1.19), (2.1, 3.1), (100.0, 200.0)]
+    for scale in scales:
+      for translation in translations:
+        for kernel_type in kernel_types:
+          for antialias in [True, False]:
+            with self.cached_session():
+              input_tensor = constant_op.constant(x, shape=in_shape)
+              scale_and_translate_out = image_ops.scale_and_translate(
+                  input_tensor,
+                  out_shape[1:3],
+                  scale=constant_op.constant(scale),
+                  translation=constant_op.constant(translation),
+                  kernel_type=kernel_type,
+                  antialias=antialias)
+              err = gradient_checker.compute_gradient_error(
+                  input_tensor,
+                  in_shape,
+                  scale_and_translate_out,
+                  out_shape,
+                  x_init_value=x)
+            self.assertLess(err, 1e-3)
+
+  def testIdentityGrads(self):
+    """Tests that Gradients for 1.0 scale should be ones for some kernels."""
+    in_shape = [1, 2, 3, 1]
+    out_shape = [1, 4, 6, 1]
+
+    x = np.arange(0, 6).reshape(in_shape).astype(np.float32)
+
+    kernel_types = ['lanczos1', 'lanczos3', 'lanczos5', 'triangle', 'keyscubic']
+    scale = (1.0, 1.0)
+    translation = (0.0, 0.0)
+    antialias = True
+    for kernel_type in kernel_types:
+      with self.cached_session():
+        input_tensor = constant_op.constant(x, shape=in_shape)
+        with backprop.GradientTape() as tape:
+          tape.watch(input_tensor)
+          scale_and_translate_out = image_ops.scale_and_translate(
+              input_tensor,
+              out_shape[1:3],
+              scale=constant_op.constant(scale),
+              translation=constant_op.constant(translation),
+              kernel_type=kernel_type,
+              antialias=antialias)
+        grad = tape.gradient(scale_and_translate_out, input_tensor)[0]
+        grad_v = self.evaluate(grad)
+        self.assertAllClose(np.ones_like(grad_v), grad_v)
+
+
 class CropAndResizeOpTest(test.TestCase):
 
   def testShapeIsCorrectAfterOp(self):
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 8047743cfa2ef719461e85b904d38beeb040979b..fc78a3e26f6c726fc92c08da108a918bb6145922 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -615,15 +615,17 @@ def central_crop(image, central_fraction):
     # bounding boxes depend on the `image` tensor's rank and whether / not the
     # dimensions are statically defined.
     if dynamic_h:
-      img_hd = math_ops.to_double(img_h)
-      bbox_h_start = math_ops.to_int32((img_hd - img_hd * central_fraction) / 2)
+      img_hd = math_ops.cast(img_h, dtypes.float64)
+      bbox_h_start = math_ops.cast(
+          (img_hd - img_hd * central_fraction) / 2, dtypes.int32)
     else:
       img_hd = float(img_h)
       bbox_h_start = int((img_hd - img_hd * central_fraction) / 2)
 
     if dynamic_w:
-      img_wd = math_ops.to_double(img_w)
-      bbox_w_start = math_ops.to_int32((img_wd - img_wd * central_fraction) / 2)
+      img_wd = math_ops.cast(img_w, dtypes.float64)
+      bbox_w_start = math_ops.cast(
+          (img_wd - img_wd * central_fraction) / 2, dtypes.int32)
     else:
       img_wd = float(img_w)
       bbox_w_start = int((img_wd - img_wd * central_fraction) / 2)
@@ -937,85 +939,29 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     return resized
 
 
-@tf_export('image.ResizeMethod')
-class ResizeMethod(object):
+@tf_export(v1=['image.ResizeMethod'])
+class ResizeMethodV1(object):
   BILINEAR = 0
   NEAREST_NEIGHBOR = 1
   BICUBIC = 2
   AREA = 3
 
 
-@tf_export(v1=['image.resize_images', 'image.resize'])
-def resize_images(images,
-                  size,
-                  method=ResizeMethod.BILINEAR,
-                  align_corners=False,
-                  preserve_aspect_ratio=False):
-  return resize_images_v2(
-      images=images,
-      size=size,
-      method=method,
-      align_corners=align_corners,
-      preserve_aspect_ratio=preserve_aspect_ratio,
-      name=None)
-
-
-@tf_export('image.resize', v1=[])
-def resize_images_v2(images,
-                     size,
-                     method=ResizeMethod.BILINEAR,
-                     align_corners=False,
-                     preserve_aspect_ratio=False,
-                     name=None):
-  """Resize `images` to `size` using the specified `method`.
-
-  Resized images will be distorted if their original aspect ratio is not
-  the same as `size`.  To avoid distortions see
-  `tf.image.resize_image_with_pad`.
-
-  `method` can be one of:
-
-  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](
-    https://en.wikipedia.org/wiki/Bilinear_interpolation)
-  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](
-    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](
-    https://en.wikipedia.org/wiki/Bicubic_interpolation)
-  *   <b>`ResizeMethod.AREA`</b>: Area interpolation.
-
-  The return value has the same type as `images` if `method` is
-  `ResizeMethod.NEAREST_NEIGHBOR`. It will also have the same type as `images`
-  if the size of `images` can be statically determined to be the same as `size`,
-  because `images` is returned in this case. Otherwise, the return value has
-  type `float32`.
-
-  Args:
-    images: 4-D Tensor of shape `[batch, height, width, channels]` or
-            3-D Tensor of shape `[height, width, channels]`.
-    size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-          new size for the images.
-    method: ResizeMethod.  Defaults to `ResizeMethod.BILINEAR`.
-    align_corners: bool.  If True, the centers of the 4 corner pixels of the
-        input and output tensors are aligned, preserving the values at the
-        corner pixels. Defaults to `False`.
-    preserve_aspect_ratio: Whether to preserve the aspect ratio. If this is set,
-      then `images` will be resized to a size that fits in `size` while
-      preserving the aspect ratio of the original image. Scales up the image if
-      `size` is bigger than the current size of the `image`. Defaults to False.
-    name: A name for this operation (optional).
-
-  Raises:
-    ValueError: if the shape of `images` is incompatible with the
-      shape arguments to this function
-    ValueError: if `size` has invalid shape or type.
-    ValueError: if an unsupported resize method is specified.
-
-  Returns:
-    If `images` was 4-D, a 4-D float Tensor of shape
-    `[batch, new_height, new_width, channels]`.
-    If `images` was 3-D, a 3-D float Tensor of shape
-    `[new_height, new_width, channels]`.
-  """
+@tf_export('image.ResizeMethod', v1=[])
+class ResizeMethod(object):
+  BILINEAR = 'bilinear'
+  NEAREST_NEIGHBOR = 'nearest'
+  BICUBIC = 'bicubic'
+  AREA = 'area'
+  LANCZOS3 = 'lanczos3'
+  LANCZOS5 = 'lanczos5'
+  GAUSSIAN = 'gaussian'
+  MITCHELLCUBIC = 'mitchellcubic'
+
+
+def _resize_images_common(images, resizer_fn, size, preserve_aspect_ratio, name,
+                          skip_resize_if_same):
+  """Core functionality for v1 and v2 resize functions."""
   with ops.name_scope(name, 'resize', [images, size]):
     images = ops.convert_to_tensor(images, name='images')
     if images.get_shape().ndims is None:
@@ -1046,15 +992,21 @@ def resize_images_v2(images,
       _, current_height, current_width, _ = _ImageDimensions(images, rank=4)
 
       # do the computation to find the right scale and height/width.
-      scale_factor_height = (math_ops.to_float(new_height_const) /
-                             math_ops.to_float(current_height))
-      scale_factor_width = (math_ops.to_float(new_width_const) /
-                            math_ops.to_float(current_width))
+      scale_factor_height = (
+          math_ops.cast(new_height_const, dtypes.float32) /
+          math_ops.cast(current_height, dtypes.float32))
+      scale_factor_width = (
+          math_ops.cast(new_width_const, dtypes.float32) /
+          math_ops.cast(current_width, dtypes.float32))
       scale_factor = math_ops.minimum(scale_factor_height, scale_factor_width)
-      scaled_height_const = math_ops.to_int32(
-          math_ops.round(scale_factor * math_ops.to_float(current_height)))
-      scaled_width_const = math_ops.to_int32(
-          math_ops.round(scale_factor * math_ops.to_float(current_width)))
+      scaled_height_const = math_ops.cast(
+          math_ops.round(
+              scale_factor * math_ops.cast(current_height, dtypes.float32)),
+          dtypes.int32)
+      scaled_width_const = math_ops.cast(
+          math_ops.round(
+              scale_factor * math_ops.cast(current_width, dtypes.float32)),
+          dtypes.int32)
 
       # NOTE: Reset the size and other constants used later.
       size = ops.convert_to_tensor([scaled_height_const, scaled_width_const],
@@ -1065,27 +1017,15 @@ def resize_images_v2(images,
 
     # If we can determine that the height and width will be unmodified by this
     # transformation, we avoid performing the resize.
-    if all(x is not None
-           for x in [new_width_const, width, new_height_const, height]) and (
-               width == new_width_const and height == new_height_const):
+    if skip_resize_if_same and all(
+        x is not None
+        for x in [new_width_const, width, new_height_const, height]) and (
+            width == new_width_const and height == new_height_const):
       if not is_batch:
         images = array_ops.squeeze(images, axis=[0])
       return images
 
-    if method == ResizeMethod.BILINEAR:
-      images = gen_image_ops.resize_bilinear(
-          images, size, align_corners=align_corners)
-    elif method == ResizeMethod.NEAREST_NEIGHBOR:
-      images = gen_image_ops.resize_nearest_neighbor(
-          images, size, align_corners=align_corners)
-    elif method == ResizeMethod.BICUBIC:
-      images = gen_image_ops.resize_bicubic(
-          images, size, align_corners=align_corners)
-    elif method == ResizeMethod.AREA:
-      images = gen_image_ops.resize_area(
-          images, size, align_corners=align_corners)
-    else:
-      raise ValueError('Resize method is not implemented.')
+    images = resizer_fn(images, size)
 
     # NOTE(mrry): The shape functions for the resize ops cannot unpack
     # the packed values in `new_size`, so set the shape here.
@@ -1096,36 +1036,225 @@ def resize_images_v2(images,
     return images
 
 
-@tf_export('image.resize_image_with_pad')
-def resize_image_with_pad(image,
-                          target_height,
-                          target_width,
-                          method=ResizeMethod.BILINEAR):
-  """Resizes and pads an image to a target width and height.
+@tf_export(v1=['image.resize_images', 'image.resize'])
+def resize_images(images,
+                  size,
+                  method=ResizeMethodV1.BILINEAR,
+                  align_corners=False,
+                  preserve_aspect_ratio=False,
+                  name=None):
+  """Resize `images` to `size` using the specified `method`.
 
-  Resizes an image to a target width and height by keeping
-  the aspect ratio the same without distortion. If the target
-  dimensions don't match the image dimensions, the image
-  is resized and then padded with zeroes to match requested
-  dimensions.
+  Resized images will be distorted if their original aspect ratio is not
+  the same as `size`.  To avoid distortions see
+  `tf.image.resize_image_with_pad`.
+
+  `method` can be one of:
+
+  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](
+    https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](
+    https://en.wikipedia.org/wiki/Bicubic_interpolation)
+  *   <b>`ResizeMethod.AREA`</b>: Area interpolation.
+
+  The return value has the same type as `images` if `method` is
+  `ResizeMethod.NEAREST_NEIGHBOR`. It will also have the same type as `images`
+  if the size of `images` can be statically determined to be the same as `size`,
+  because `images` is returned in this case. Otherwise, the return value has
+  type `float32`.
 
   Args:
-    image: 4-D Tensor of shape `[batch, height, width, channels]` or
-           3-D Tensor of shape `[height, width, channels]`.
-    target_height: Target height.
-    target_width: Target width.
-    method: Method to use for resizing image. See `resize_images()`
+    images: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
+    size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The new
+      size for the images.
+    method: ResizeMethod.  Defaults to `ResizeMethod.BILINEAR`.
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+      input and output tensors are aligned, preserving the values at the corner
+      pixels. Defaults to `False`.
+    preserve_aspect_ratio: Whether to preserve the aspect ratio. If this is set,
+      then `images` will be resized to a size that fits in `size` while
+      preserving the aspect ratio of the original image. Scales up the image if
+      `size` is bigger than the current size of the `image`. Defaults to False.
+    name: A name for this operation (optional).
 
   Raises:
-    ValueError: if `target_height` or `target_width` are zero or negative.
+    ValueError: if the shape of `images` is incompatible with the
+      shape arguments to this function
+    ValueError: if `size` has invalid shape or type.
+    ValueError: if an unsupported resize method is specified.
 
   Returns:
-    Resized and padded image.
     If `images` was 4-D, a 4-D float Tensor of shape
     `[batch, new_height, new_width, channels]`.
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
+
+  def resize_fn(images_t, new_size):
+    """Legacy resize core function, passed to _resize_images_common."""
+    if method == ResizeMethodV1.BILINEAR or method == ResizeMethod.BILINEAR:
+      return gen_image_ops.resize_bilinear(
+          images_t, new_size, align_corners=align_corners)
+    elif (method == ResizeMethodV1.NEAREST_NEIGHBOR or
+          method == ResizeMethod.NEAREST_NEIGHBOR):
+      return gen_image_ops.resize_nearest_neighbor(
+          images_t, new_size, align_corners=align_corners)
+    elif method == ResizeMethodV1.BICUBIC or method == ResizeMethod.BICUBIC:
+      return gen_image_ops.resize_bicubic(
+          images_t, new_size, align_corners=align_corners)
+    elif method == ResizeMethodV1.AREA or method == ResizeMethod.AREA:
+      return gen_image_ops.resize_area(
+          images_t, new_size, align_corners=align_corners)
+    else:
+      raise ValueError('Resize method is not implemented.')
+
+  return _resize_images_common(
+      images,
+      resize_fn,
+      size,
+      preserve_aspect_ratio=preserve_aspect_ratio,
+      name=name,
+      skip_resize_if_same=True)
+
+
+@tf_export('image.resize', v1=[])
+def resize_images_v2(images,
+                     size,
+                     method=ResizeMethod.BILINEAR,
+                     preserve_aspect_ratio=False,
+                     antialias=False,
+                     name=None):
+  """Resize `images` to `size` using the specified `method`.
+
+  Resized images will be distorted if their original aspect ratio is not
+  the same as `size`.  To avoid distortions see
+  `tf.image.resize_with_pad`.
+
+  When 'antialias' is true, the sampling filter will anti-alias the input image
+  as well as interpolate.   When downsampling an image with [anti-aliasing](
+  https://en.wikipedia.org/wiki/Spatial_anti-aliasing) the sampling filter
+  kernel is scaled in order to properly anti-alias the input image signal.
+  'antialias' has no effect when upsampling an image.
+
+  *   <b>`bilinear`</b>: [Bilinear interpolation.](
+    https://en.wikipedia.org/wiki/Bilinear_interpolation) If 'antialias' is
+    true, becomes a hat/tent filter function with radius 1 when downsampling.
+  *   <b>`lanczos3`</b>:  [Lanczos kernel](
+    https://en.wikipedia.org/wiki/Lanczos_resampling) with radius 3.
+    High-quality practical filter but may have some ringing especially on
+    synthetic images.
+  *   <b>`lanczos5`</b>: [Lanczos kernel] (
+    https://en.wikipedia.org/wiki/Lanczos_resampling) with radius 5.
+    Very-high-quality filter but may have stronger ringing.
+  *   <b>`bicubic`</b>: [Cubic interpolant](
+    https://en.wikipedia.org/wiki/Bicubic_interpolation) of Keys. Equivalent to
+    Catmull-Rom kernel. Reasonably good quality and faster than Lanczos3Kernel,
+    particularly when upsampling.
+  *   <b>`gaussian`</b>: [Gaussian kernel](
+    https://en.wikipedia.org/wiki/Gaussian_filter) with radius 3,
+    sigma = 1.5 / 3.]
+  *   <b>`nearest`</b>: [Nearest neighbor interpolation.](
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+    'antialias' has no effect when used with nearest neighbor interpolation.
+  *   <b>`area`</b>: Anti-aliased resampling with area interpolation.
+    'antialias' has no effect when used with area interpolation; it
+    always anti-aliases.
+  *   <b>`mitchellcubic`</b>: Mitchell-Netravali Cubic non-interpolating filter.
+    For synthetic images (especially those lacking proper prefiltering), less
+    ringing than Keys cubic kernel but less sharp.
+
+  Note that near image edges the filtering kernel may be partially outside the
+  image boundaries. For these pixels, only input pixels inside the image will be
+  included in the filter sum, and the output value will be appropriately
+  normalized.
+
+  The return value has the same type as `images` if `method` is
+  `ResizeMethod.NEAREST_NEIGHBOR`. Otherwise, the return value has type
+  `float32`.
+
+  Args:
+    images: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
+    size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The new
+      size for the images.
+    method: ResizeMethod.  Defaults to `bilinear`.
+    preserve_aspect_ratio: Whether to preserve the aspect ratio. If this is set,
+      then `images` will be resized to a size that fits in `size` while
+      preserving the aspect ratio of the original image. Scales up the image if
+      `size` is bigger than the current size of the `image`. Defaults to False.
+    antialias: Whether to use an anti-aliasing filter when downsampling an
+      image.
+    name: A name for this operation (optional).
+
+  Raises:
+    ValueError: if the shape of `images` is incompatible with the
+      shape arguments to this function
+    ValueError: if `size` has invalid shape or type.
+    ValueError: if an unsupported resize method is specified.
+
+  Returns:
+    If `images` was 4-D, a 4-D float Tensor of shape
+    `[batch, new_height, new_width, channels]`.
+    If `images` was 3-D, a 3-D float Tensor of shape
+    `[new_height, new_width, channels]`.
+  """
+
+  def resize_fn(images_t, new_size):
+    """Resize core function, passed to _resize_images_common."""
+    scale_and_translate_methods = [
+        ResizeMethod.LANCZOS3, ResizeMethod.LANCZOS5, ResizeMethod.GAUSSIAN,
+        ResizeMethod.MITCHELLCUBIC
+    ]
+
+    def resize_with_scale_and_translate(method):
+      scale = (
+          math_ops.cast(new_size, dtype=dtypes.float32) /
+          math_ops.cast(array_ops.shape(images_t)[1:3], dtype=dtypes.float32))
+      return gen_image_ops.scale_and_translate(
+          images_t,
+          new_size,
+          scale,
+          array_ops.zeros([2]),
+          kernel_type=method,
+          antialias=antialias)
+
+    if method == ResizeMethod.BILINEAR:
+      if antialias:
+        return resize_with_scale_and_translate('triangle')
+      else:
+        return gen_image_ops.resize_bilinear(
+            images_t, new_size, half_pixel_centers=True)
+    elif method == ResizeMethod.NEAREST_NEIGHBOR:
+      return gen_image_ops.resize_nearest_neighbor(
+          images_t, new_size, half_pixel_centers=True)
+    elif method == ResizeMethod.BICUBIC:
+      if antialias:
+        return resize_with_scale_and_translate('keyscubic')
+      else:
+        return gen_image_ops.resize_bicubic(
+            images_t, new_size, half_pixel_centers=True)
+    elif method == ResizeMethod.AREA:
+      return gen_image_ops.resize_area(images_t, new_size)
+    elif method in scale_and_translate_methods:
+      return resize_with_scale_and_translate(method)
+    else:
+      raise ValueError('Resize method is not implemented.')
+
+  return _resize_images_common(
+      images,
+      resize_fn,
+      size,
+      preserve_aspect_ratio=preserve_aspect_ratio,
+      name=name,
+      skip_resize_if_same=False)
+
+
+def _resize_image_with_pad_common(image, target_height, target_width,
+                                  resize_fn):
+  """Core functionality for v1 and v2 resize_image_with_pad functions."""
   with ops.name_scope(None, 'resize_image_with_pad', [image]):
     image = ops.convert_to_tensor(image, name='image')
     image_shape = image.get_shape()
@@ -1180,7 +1309,7 @@ def resize_image_with_pad(image,
     p_width = max_(0, math_ops.cast(f_padding_width, dtype=dtypes.int32))
 
     # Resize first, then pad to meet requested dimensions
-    resized = resize_images(image, [resized_height, resized_width], method)
+    resized = resize_fn(image, [resized_height, resized_width])
 
     padded = pad_to_bounding_box(resized, p_height, p_width, target_height,
                                  target_width)
@@ -1196,6 +1325,88 @@ def resize_image_with_pad(image,
     return padded
 
 
+@tf_export(v1=['image.resize_image_with_pad'])
+def resize_image_with_pad_v1(image,
+                             target_height,
+                             target_width,
+                             method=ResizeMethodV1.BILINEAR,
+                             align_corners=False):
+  """Resizes and pads an image to a target width and height.
+
+  Resizes an image to a target width and height by keeping
+  the aspect ratio the same without distortion. If the target
+  dimensions don't match the image dimensions, the image
+  is resized and then padded with zeroes to match requested
+  dimensions.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
+    target_height: Target height.
+    target_width: Target width.
+    method: Method to use for resizing image. See `resize_images()`
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+      input and output tensors are aligned, preserving the values at the corner
+      pixels. Defaults to `False`.
+
+  Raises:
+    ValueError: if `target_height` or `target_width` are zero or negative.
+
+  Returns:
+    Resized and padded image.
+    If `images` was 4-D, a 4-D float Tensor of shape
+    `[batch, new_height, new_width, channels]`.
+    If `images` was 3-D, a 3-D float Tensor of shape
+    `[new_height, new_width, channels]`.
+  """
+
+  def _resize_fn(im, new_size):
+    return resize_images(im, new_size, method, align_corners=align_corners)
+
+  return _resize_image_with_pad_common(image, target_height, target_width,
+                                       _resize_fn)
+
+
+@tf_export('image.resize_with_pad', v1=[])
+def resize_image_with_pad_v2(image,
+                             target_height,
+                             target_width,
+                             method=ResizeMethod.BILINEAR,
+                             antialias=False):
+  """Resizes and pads an image to a target width and height.
+
+  Resizes an image to a target width and height by keeping
+  the aspect ratio the same without distortion. If the target
+  dimensions don't match the image dimensions, the image
+  is resized and then padded with zeroes to match requested
+  dimensions.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor
+      of shape `[height, width, channels]`.
+    target_height: Target height.
+    target_width: Target width.
+    method: Method to use for resizing image. See `image.resize()`
+    antialias: Whether to use anti-aliasing when resizing. See 'image.resize()'.
+
+  Raises:
+    ValueError: if `target_height` or `target_width` are zero or negative.
+
+  Returns:
+    Resized and padded image.
+    If `images` was 4-D, a 4-D float Tensor of shape
+    `[batch, new_height, new_width, channels]`.
+    If `images` was 3-D, a 3-D float Tensor of shape
+    `[new_height, new_width, channels]`.
+  """
+
+  def _resize_fn(im, new_size):
+    return resize_images_v2(im, new_size, method, antialias=antialias)
+
+  return _resize_image_with_pad_common(image, target_height, target_width,
+                                       _resize_fn)
+
+
 @tf_export('image.per_image_standardization')
 def per_image_standardization(image):
   """Linearly scales `image` to have zero mean and unit variance.
@@ -1208,8 +1419,8 @@ def per_image_standardization(image):
   away from zero to protect against division by 0 when handling uniform images.
 
   Args:
-    image: An n-D Tensor where the last 3 dimensions are
-           `[height, width, channels]`.
+    image: An n-D Tensor where the last 3 dimensions are `[height, width,
+      channels]`.
 
   Returns:
     The standardized image with same shape as `image`.
@@ -2237,9 +2448,9 @@ def non_max_suppression(boxes,
 
   Prunes away boxes that have high intersection-over-union (IOU) overlap
   with previously selected boxes.  Bounding boxes are supplied as
-  [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+  `[y1, x1, y2, x2]`, where `(y1, x1)` and `(y2, x2)` are the coordinates of any
   diagonal pair of box corners and the coordinates can be provided as normalized
-  (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+  (i.e., lying in the interval `[0, 1]`) or absolute.  Note that this algorithm
   is agnostic to where the origin is in the coordinate system.  Note that this
   algorithm is invariant to orthogonal transformations and translations
   of the coordinate system; thus translating or reflections of the coordinate
@@ -2247,10 +2458,12 @@ def non_max_suppression(boxes,
   The output of this operation is a set of integers indexing into the input
   collection of bounding boxes representing the selected boxes.  The bounding
   box coordinates corresponding to the selected indices can then be obtained
-  using the `tf.gather operation`.  For example:
+  using the `tf.gather` operation.  For example:
+    ```python
     selected_indices = tf.image.non_max_suppression(
         boxes, scores, max_output_size, iou_threshold)
     selected_boxes = tf.gather(boxes, selected_indices)
+    ```
 
   Args:
     boxes: A 2-D float `Tensor` of shape `[num_boxes, 4]`.
@@ -2294,12 +2507,14 @@ def non_max_suppression_padded(boxes,
   boxes and the number of valid indices in the index set.  The bounding box
   coordinates corresponding to the selected indices can then be obtained using
   the `tf.slice` and `tf.gather` operations.  For example:
+    ```python
     selected_indices_padded, num_valid = tf.image.non_max_suppression_padded(
         boxes, scores, max_output_size, iou_threshold,
         score_threshold, pad_to_max_output_size=True)
     selected_indices = tf.slice(
         selected_indices_padded, tf.constant([0]), num_valid)
     selected_boxes = tf.gather(boxes, selected_indices)
+    ```
 
   Args:
     boxes: A 2-D float `Tensor` of shape `[num_boxes, 4]`.
@@ -2348,10 +2563,12 @@ def non_max_suppression_with_overlaps(overlaps,
   The output of this operation is a set of integers indexing into the input
   collection of bounding boxes representing the selected boxes.  The bounding
   box coordinates corresponding to the selected indices can then be obtained
-  using the `tf.gather operation`.  For example:
+  using the `tf.gather` operation.  For example:
+    ```python
     selected_indices = tf.image.non_max_suppression_overlaps(
         overlaps, scores, max_output_size, iou_threshold)
     selected_boxes = tf.gather(boxes, selected_indices)
+    ```
 
   Args:
     overlaps: A 2-D float `Tensor` of shape `[num_boxes, num_boxes]`.
@@ -2973,6 +3190,45 @@ def sobel_edges(image):
   return output
 
 
+def resize_bicubic(images,
+                   size,
+                   align_corners=False,
+                   name=None,
+                   half_pixel_centers=False):
+  return gen_image_ops.resize_bicubic(
+      images=images,
+      size=size,
+      align_corners=align_corners,
+      half_pixel_centers=half_pixel_centers,
+      name=name)
+
+
+def resize_bilinear(images,
+                    size,
+                    align_corners=False,
+                    name=None,
+                    half_pixel_centers=False):
+  return gen_image_ops.resize_bilinear(
+      images=images,
+      size=size,
+      align_corners=align_corners,
+      half_pixel_centers=half_pixel_centers,
+      name=name)
+
+
+def resize_nearest_neighbor(images,
+                            size,
+                            align_corners=False,
+                            name=None,
+                            half_pixel_centers=False):
+  return gen_image_ops.resize_nearest_neighbor(
+      images=images,
+      size=size,
+      align_corners=align_corners,
+      half_pixel_centers=half_pixel_centers,
+      name=name)
+
+
 resize_area_deprecation = deprecation.deprecated(
     date=None,
     instructions=(
@@ -2985,14 +3241,14 @@ resize_bicubic_deprecation = deprecation.deprecated(
     instructions=(
         'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.'))
 tf_export(v1=['image.resize_bicubic'])(
-    resize_bicubic_deprecation(gen_image_ops.resize_bicubic))
+    resize_bicubic_deprecation(resize_bicubic))
 
 resize_bilinear_deprecation = deprecation.deprecated(
     date=None,
     instructions=(
         'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.'))
 tf_export(v1=['image.resize_bilinear'])(
-    resize_bilinear_deprecation(gen_image_ops.resize_bilinear))
+    resize_bilinear_deprecation(resize_bilinear))
 
 resize_nearest_neighbor_deprecation = deprecation.deprecated(
     date=None,
@@ -3000,7 +3256,7 @@ resize_nearest_neighbor_deprecation = deprecation.deprecated(
         'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` '
         'instead.'))
 tf_export(v1=['image.resize_nearest_neighbor'])(
-    resize_nearest_neighbor_deprecation(gen_image_ops.resize_nearest_neighbor))
+    resize_nearest_neighbor_deprecation(resize_nearest_neighbor))
 
 
 @tf_export('image.crop_and_resize', v1=[])
@@ -3066,7 +3322,222 @@ def crop_and_resize_v2(
       image, boxes, box_indices, crop_size, method, extrapolation_value, name)
 
 
-crop_and_resize_deprecation = deprecation.deprecated_args(
+@tf_export(v1=['image.crop_and_resize'])
+@deprecation.deprecated_args(
     None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
-tf_export(v1=['image.crop_and_resize'])(
-    crop_and_resize_deprecation(gen_image_ops.crop_and_resize))
+def crop_and_resize_v1(   # pylint: disable=missing-docstring
+    image,
+    boxes,
+    box_ind=None,
+    crop_size=None,
+    method='bilinear',
+    extrapolation_value=0,
+    name=None,
+    box_indices=None):
+  box_ind = deprecation.deprecated_argument_lookup(
+      "box_indices", box_indices, "box_ind", box_ind)
+  return gen_image_ops.crop_and_resize(
+      image, boxes, box_ind, crop_size, method, extrapolation_value, name)
+
+crop_and_resize_v1.__doc__ = gen_image_ops.crop_and_resize.__doc__
+
+
+@tf_export(v1=['image.extract_glimpse'])
+def extract_glimpse(
+    input,  # pylint: disable=redefined-builtin
+    size,
+    offsets,
+    centered=True,
+    normalized=True,
+    uniform_noise=True,
+    name=None):
+  """Extracts a glimpse from the input tensor.
+
+  Returns a set of windows called glimpses extracted at location
+  `offsets` from the input tensor. If the windows only partially
+  overlaps the inputs, the non overlapping areas will be filled with
+  random noise.
+
+  The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+  glimpse_width, channels]`. The channels and batch dimensions are the
+  same as that of the input tensor. The height and width of the output
+  windows are specified in the `size` parameter.
+
+  The argument `normalized` and `centered` controls how the windows are built:
+
+  * If the coordinates are normalized but not centered, 0.0 and 1.0
+    correspond to the minimum and maximum of each height and width
+    dimension.
+  * If the coordinates are both normalized and centered, they range from
+    -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+    left corner, the lower right corner is located at (1.0, 1.0) and the
+    center is at (0, 0).
+  * If the coordinates are not normalized they are interpreted as
+    numbers of pixels.
+
+  Args:
+    input: A `Tensor` of type `float32`. A 4-D float tensor of shape
+      `[batch_size, height, width, channels]`.
+    size: A `Tensor` of type `int32`. A 1-D tensor of 2 elements containing the
+      size of the glimpses to extract.  The glimpse height must be specified
+      first, following by the glimpse width.
+    offsets: A `Tensor` of type `float32`. A 2-D integer tensor of shape
+      `[batch_size, 2]` containing the y, x locations of the center of each
+      window.
+    centered: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are centered relative to the image, in which case the (0, 0)
+      offset is relative to the center of the input images. If false, the (0,0)
+      offset corresponds to the upper left corner of the input images.
+    normalized: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are normalized.
+    uniform_noise: An optional `bool`. Defaults to `True`. indicates if the
+      noise should be generated using a uniform distribution or a Gaussian
+      distribution.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `float32`.
+  """
+  return gen_image_ops.extract_glimpse(
+      input=input,
+      size=size,
+      offsets=offsets,
+      centered=centered,
+      normalized=normalized,
+      uniform_noise=uniform_noise,
+      name=name)
+
+
+@tf_export('image.extract_glimpse', v1=[])
+def extract_glimpse_v2(
+    input,  # pylint: disable=redefined-builtin
+    size,
+    offsets,
+    centered=True,
+    normalized=True,
+    noise='uniform',
+    name=None):
+  """Extracts a glimpse from the input tensor.
+
+  Returns a set of windows called glimpses extracted at location
+  `offsets` from the input tensor. If the windows only partially
+  overlaps the inputs, the non overlapping areas will be filled with
+  random noise.
+
+  The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+  glimpse_width, channels]`. The channels and batch dimensions are the
+  same as that of the input tensor. The height and width of the output
+  windows are specified in the `size` parameter.
+
+  The argument `normalized` and `centered` controls how the windows are built:
+
+  * If the coordinates are normalized but not centered, 0.0 and 1.0
+    correspond to the minimum and maximum of each height and width
+    dimension.
+  * If the coordinates are both normalized and centered, they range from
+    -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+    left corner, the lower right corner is located at (1.0, 1.0) and the
+    center is at (0, 0).
+  * If the coordinates are not normalized they are interpreted as
+    numbers of pixels.
+
+  Args:
+    input: A `Tensor` of type `float32`. A 4-D float tensor of shape
+      `[batch_size, height, width, channels]`.
+    size: A `Tensor` of type `int32`. A 1-D tensor of 2 elements containing the
+      size of the glimpses to extract.  The glimpse height must be specified
+      first, following by the glimpse width.
+    offsets: A `Tensor` of type `float32`. A 2-D integer tensor of shape
+      `[batch_size, 2]` containing the y, x locations of the center of each
+      window.
+    centered: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are centered relative to the image, in which case the (0, 0)
+      offset is relative to the center of the input images. If false, the (0,0)
+      offset corresponds to the upper left corner of the input images.
+    normalized: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are normalized.
+    noise: An optional `string`. Defaults to `uniform`. indicates if the noise
+      should be `uniform` (uniform distribution), `gaussian` (gaussian
+      distribution), or `zero` (zero padding).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `float32`.
+  """
+  return gen_image_ops.extract_glimpse(
+      input=input,
+      size=size,
+      offsets=offsets,
+      centered=centered,
+      normalized=normalized,
+      noise=noise,
+      uniform_noise=False,
+      name=name)
+
+
+@tf_export('image.combined_non_max_suppression')
+def combined_non_max_suppression(boxes,
+                                 scores,
+                                 max_output_size_per_class,
+                                 max_total_size,
+                                 iou_threshold=0.5,
+                                 score_threshold=float('-inf'),
+                                 pad_per_class=False,
+                                 name=None):
+  """Greedily selects a subset of bounding boxes in descending order of score.
+
+  This operation performs non_max_suppression on the inputs per batch, across
+  all classes.
+  Prunes away boxes that have high intersection-over-union (IOU) overlap
+  with previously selected boxes.  Bounding boxes are supplied as
+  [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+  diagonal pair of box corners and the coordinates can be provided as normalized
+  (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+  is agnostic to where the origin is in the coordinate system. Also note that
+  this algorithm is invariant to orthogonal transformations and translations
+  of the coordinate system; thus translating or reflections of the coordinate
+  system result in the same boxes being selected by the algorithm.
+  The output of this operation is the final boxes, scores and classes tensor
+  returned after performing non_max_suppression.
+
+  Args:
+    boxes: A 4-D float `Tensor` of shape `[batch_size, num_boxes, q, 4]`. If `q`
+      is 1 then same boxes are used for all classes otherwise, if `q` is equal
+      to number of classes, class-specific boxes are used.
+    scores: A 3-D float `Tensor` of shape `[batch_size, num_boxes, num_classes]`
+      representing a single score corresponding to each box (each row of boxes).
+    max_output_size_per_class: A scalar integer `Tensor` representing the
+      maximum number of boxes to be selected by non max suppression per class
+    max_total_size: A scalar representing maximum number of boxes retained over
+      all classes.
+    iou_threshold: A float representing the threshold for deciding whether boxes
+      overlap too much with respect to IOU.
+    score_threshold: A float representing the threshold for deciding when to
+      remove boxes based on score.
+    pad_per_class: If false, the output nmsed boxes, scores and classes are
+      padded/clipped to `max_total_size`. If true, the output nmsed boxes,
+      scores and classes are padded to be of length
+      `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+      which case it is clipped to `max_total_size`. Defaults to false.
+    name: A name for the operation (optional).
+
+  Returns:
+    'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor
+      containing the non-max suppressed boxes.
+    'nmsed_scores': A [batch_size, max_detections] float32 tensor containing
+      the scores for the boxes.
+    'nmsed_classes': A [batch_size, max_detections] float32 tensor
+      containing the class for boxes.
+    'valid_detections': A [batch_size] int32 tensor indicating the number of
+      valid detections per batch item. Only the top valid_detections[i] entries
+      in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+      entries are zero paddings.
+  """
+  with ops.name_scope(name, 'combined_non_max_suppression'):
+    iou_threshold = ops.convert_to_tensor(
+        iou_threshold, dtype=dtypes.float32, name='iou_threshold')
+    score_threshold = ops.convert_to_tensor(
+        score_threshold, dtype=dtypes.float32, name='score_threshold')
+    return gen_image_ops.combined_non_max_suppression(
+        boxes, scores, max_output_size_per_class, max_total_size, iou_threshold,
+        score_threshold, pad_per_class)
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 361befabce7725f44f44dcda1c6d2c487f704030..71e310edc8569f08aa2481ebaae1c6f37e728e90 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -61,7 +61,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_hsv(batch0)
         batch2 = image_ops.hsv_to_rgb(batch1)
@@ -82,7 +82,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     for nptype in [np.float32, np.float64]:
       rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         hsv = image_ops.rgb_to_hsv(rgb_np)
         rgb = image_ops.hsv_to_rgb(hsv)
         rgb_tf = self.evaluate(rgb)
@@ -101,7 +101,7 @@ class RGBToYIQTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to YIQ and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_yiq(batch0)
         batch2 = image_ops.yiq_to_rgb(batch1)
@@ -131,7 +131,7 @@ class RGBToYUVTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to YUV and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_yuv(batch0)
         batch2 = image_ops.yuv_to_rgb(batch1)
@@ -173,7 +173,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
   def _TestRGBToGrayscale(self, x_np):
     y_np = self._RGBToGrayscale(x_np)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.rgb_to_grayscale(x_tf)
       y_tf = self.evaluate(y)
@@ -195,7 +195,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     y_np = np.array(
         [[1, 1, 1], [2, 2, 2]], dtype=np.uint8).reshape([1, 1, 2, 3])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
       y_tf = self.evaluate(y)
@@ -205,7 +205,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2]], dtype=np.uint8).reshape([1, 2, 1])
     y_np = np.array([[1, 1, 1], [2, 2, 2]], dtype=np.uint8).reshape([1, 2, 3])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
       y_tf = self.evaluate(y)
@@ -216,23 +216,23 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     # Shape inference works and produces expected output where possible
     rgb_shape = [7, None, 19, 3]
     gray_shape = rgb_shape[:-1] + [1]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rgb_tf = array_ops.placeholder(dtypes.uint8, shape=rgb_shape)
       gray = image_ops.rgb_to_grayscale(rgb_tf)
       self.assertEqual(gray_shape, gray.get_shape().as_list())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       gray_tf = array_ops.placeholder(dtypes.uint8, shape=gray_shape)
       rgb = image_ops.grayscale_to_rgb(gray_tf)
       self.assertEqual(rgb_shape, rgb.get_shape().as_list())
 
     # Shape inference does not break for unknown shapes
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rgb_tf_unknown = array_ops.placeholder(dtypes.uint8)
       gray_unknown = image_ops.rgb_to_grayscale(rgb_tf_unknown)
       self.assertFalse(gray_unknown.get_shape())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       gray_tf_unknown = array_ops.placeholder(dtypes.uint8)
       rgb_unknown = image_ops.grayscale_to_rgb(gray_tf_unknown)
       self.assertFalse(rgb_unknown.get_shape())
@@ -364,7 +364,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [0, 13, 1, 54, 226, 59, 8, 234, 150, 255, 39, 1]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -379,7 +379,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -394,7 +394,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -419,7 +419,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     return y_v.reshape(x_np.shape)
 
   def _adjustHueTf(self, x_np, delta_h):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
       y_tf = self.evaluate(y)
@@ -850,7 +850,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
@@ -865,7 +865,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
@@ -880,7 +880,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
@@ -920,7 +920,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
         "gb_same",
         "rgb_same",
     ]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       for x_shape in x_shapes:
         for test_style in test_styles:
           x_np = np.random.rand(*x_shape) * 255.
@@ -947,7 +947,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testInvolutionLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
       y_tf = self.evaluate(y)
@@ -957,7 +957,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array(
         [[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
       y_tf = self.evaluate(y)
@@ -968,7 +968,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("flip_left_right"))
@@ -983,7 +983,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       y_tf = self.evaluate(y)
@@ -995,7 +995,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
     seed = 42
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
@@ -1035,7 +1035,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.vstack([x_np_raw for _ in range(batch_size)])
     y_np = np.vstack([y_np_raw for _ in range(batch_size)])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
@@ -1066,7 +1066,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
       y_tf = self.evaluate(y)
@@ -1077,7 +1077,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
       y_tf = self.evaluate(y)
@@ -1088,7 +1088,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       self.assertTrue(y.op.name.startswith("flip_up_down"))
@@ -1103,7 +1103,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       y_tf = self.evaluate(y)
@@ -1116,7 +1116,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
     seed = 42
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
@@ -1155,7 +1155,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.vstack([x_np_raw for _ in range(batch_size)])
     y_np = np.vstack([y_np_raw for _ in range(batch_size)])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
@@ -1186,7 +1186,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
       y_tf = self.evaluate(y)
@@ -1197,7 +1197,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
       y_tf = self.evaluate(y)
@@ -1208,7 +1208,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
       self.assertTrue(y.op.name.startswith("transpose"))
@@ -1224,7 +1224,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
         dtype=np.uint8).reshape([2, 3, 2, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
       y_tf = self.evaluate(y)
@@ -1275,7 +1275,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
@@ -1283,7 +1283,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testRot90GroupOrderWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
@@ -1292,7 +1292,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
       y_tf = image_ops.rot90(image, k_placeholder)
       for k in xrange(4):
@@ -1302,7 +1302,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testRot90NumpyEquivalenceWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
       y_tf = image_ops.rot90(image, k_placeholder)
       for k in xrange(4):
@@ -1312,7 +1312,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 class AdjustContrastTest(test_util.TensorFlowTestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_contrast(x, contrast_factor)
       y_tf = self.evaluate(y)
@@ -1367,7 +1367,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     return y_np
 
   def _adjustContrastTf(self, x_np, contrast_factor):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_contrast(x, contrast_factor)
       y_tf = self.evaluate(y)
@@ -1401,7 +1401,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
 class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
   def _testBrightness(self, x_np, y_np, delta, tol=1e-6):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
       y_tf = self.evaluate(y)
@@ -1468,7 +1468,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     x_np = np.arange(0, np.prod(x_shape), dtype=np.int32).reshape(x_shape)
     y_np = self._NumpyPerImageWhitening(x_np)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
       self.assertTrue(y.op.name.startswith("per_image_standardization"))
@@ -1479,14 +1479,14 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     im_np = np.ones([19, 19, 3]).astype(np.float32) * 249
     im = constant_op.constant(im_np)
     whiten = image_ops.per_image_standardization(im)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       whiten_np = self.evaluate(whiten)
       self.assertFalse(np.any(np.isnan(whiten_np)))
 
   def testBatchWhitening(self):
     imgs_np = np.random.uniform(0., 255., [4, 24, 24, 3])
     whiten_np = [self._NumpyPerImageWhitening(img) for img in imgs_np]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       imgs = constant_op.constant(imgs_np)
       whiten = image_ops.per_image_standardization(imgs)
       whiten_tf = self.evaluate(whiten)
@@ -1514,7 +1514,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -1693,7 +1693,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     for x_shape in x_shapes:
       x_np = np.ones(x_shape, dtype=np.float32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           y = image_ops.central_crop(x, 1.0)
           y_tf = self.evaluate(y)
@@ -1708,7 +1708,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         dtype=np.int32).reshape(x_shape)
     y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1])
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         y = image_ops.central_crop(x, 0.5)
         y_tf = self.evaluate(y)
@@ -1724,7 +1724,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         dtype=np.int32).reshape(x_shape)
     y_np = np.array([[[3, 4, 5, 6], [3, 4, 5, 6]],
                      [[6, 5, 4, 3], [6, 5, 4, 3]]]).reshape([2, 2, 4, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.central_crop(x, 0.5)
       y_tf = self.evaluate(y)
@@ -1741,7 +1741,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       x_np = np.zeros(x_shape, dtype=np.int32)
       y_np = np.zeros(y_shape, dtype=np.int32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
           y = image_ops.central_crop(x, 0.33)
           y_tf = y.eval(feed_dict={x: x_np})
@@ -1792,7 +1792,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         with self.assertRaises(ValueError):
           _ = image_ops.central_crop(x, 0.0)
@@ -1804,7 +1804,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     for x_shape in x_shapes:
       x_np = np.ones(x_shape, dtype=np.float32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           with self.assertRaises(ValueError):
             _ = image_ops.central_crop(x, 0.5)
@@ -1814,7 +1814,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         y = image_ops.central_crop(x_np, 1.0)
         self.assertTrue(y.op.name.startswith("central_crop"))
 
@@ -1839,7 +1839,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -1899,7 +1899,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
 
     i = constant_op.constant([1, 0, 4, 3], dtype=dtypes.int64)
     y_tf = image_ops.pad_to_bounding_box(x, i[0], i[1], i[2], i[3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(y, self.evaluate(y_tf))
 
   @test_util.run_deprecated_v1
@@ -2034,7 +2034,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
     fraction_object_covered = []
 
     num_iter = 1000
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_tf = constant_op.constant(image, shape=image.shape)
       image_size_tf = constant_op.constant(
           image_size_np, shape=image_size_np.shape)
@@ -2164,7 +2164,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testSampleDistortedBoundingBoxShape(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
@@ -2202,7 +2202,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
   def testDefaultMinObjectCovered(self):
     # By default min_object_covered=0.1 if not provided
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
@@ -2225,11 +2225,521 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
 
-class ResizeImagesTest(test_util.TensorFlowTestCase):
+class ResizeImagesV2Test(test_util.TensorFlowTestCase):
+
+  METHODS = [
+      image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA,
+      image_ops.ResizeMethod.LANCZOS3, image_ops.ResizeMethod.LANCZOS5,
+      image_ops.ResizeMethod.GAUSSIAN, image_ops.ResizeMethod.MITCHELLCUBIC
+  ]
 
-  OPTIONS = [
+  # Some resize methods, such as Gaussian, are non-interpolating in that they
+  # change the image even if there is no scale change, for some test, we only
+  # check the value on the value preserving methods.
+  INTERPOLATING_METHODS = [
       image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA
+      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA,
+      image_ops.ResizeMethod.LANCZOS3, image_ops.ResizeMethod.LANCZOS5
+  ]
+
+  TYPES = [
+      np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16,
+      np.float32, np.float64
+  ]
+
+  def _assertShapeInference(self, pre_shape, size, post_shape):
+    # Try single image resize
+    single_image = array_ops.placeholder(dtypes.float32, shape=pre_shape)
+    y = image_ops.resize_images_v2(single_image, size)
+    self.assertEqual(y.get_shape().as_list(), post_shape)
+    # Try batch images resize with known batch size
+    images = array_ops.placeholder(dtypes.float32, shape=[99] + pre_shape)
+    y = image_ops.resize_images_v2(images, size)
+    self.assertEqual(y.get_shape().as_list(), [99] + post_shape)
+    # Try batch images resize with unknown batch size
+    images = array_ops.placeholder(dtypes.float32, shape=[None] + pre_shape)
+    y = image_ops.resize_images_v2(images, size)
+    self.assertEqual(y.get_shape().as_list(), [None] + post_shape)
+
+  def shouldRunOnGPU(self, method, nptype):
+    if (method == image_ops.ResizeMethod.NEAREST_NEIGHBOR and
+        nptype in [np.float32, np.float64]):
+      return True
+    else:
+      return False
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  @test_util.run_deprecated_v1
+  def testNoOp(self):
+    img_shape = [1, 6, 4, 1]
+    single_shape = [6, 4, 1]
+    # This test is also conducted with int8, so 127 is the maximum
+    # value that can be used.
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    target_height = 6
+    target_width = 4
+
+    for nptype in self.TYPES:
+      img_np = np.array(data, dtype=nptype).reshape(img_shape)
+
+      for method in self.METHODS:
+        with self.cached_session(use_gpu=True):
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images_v2(image, [target_height, target_width],
+                                         method)
+          yshape = array_ops.shape(y)
+          resized, newshape = self.evaluate([y, yshape])
+          self.assertAllEqual(img_shape, newshape)
+          if method in self.INTERPOLATING_METHODS:
+            self.assertAllClose(resized, img_np, atol=1e-5)
+
+      # Resizing with a single image must leave the shape unchanged also.
+      with self.cached_session(use_gpu=True):
+        img_single = img_np.reshape(single_shape)
+        image = constant_op.constant(img_single, shape=single_shape)
+        y = image_ops.resize_images_v2(image, [target_height, target_width],
+                                       self.METHODS[0])
+        yshape = array_ops.shape(y)
+        newshape = self.evaluate(yshape)
+        self.assertAllEqual(single_shape, newshape)
+
+  @test_util.run_deprecated_v1
+  def testTensorArguments(self):
+    img_shape = [1, 6, 4, 1]
+    single_shape = [6, 4, 1]
+    # This test is also conducted with int8, so 127 is the maximum
+    # value that can be used.
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    new_size = array_ops.placeholder(dtypes.int32, shape=(2))
+
+    img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
+
+    for method in self.METHODS:
+      with self.cached_session(use_gpu=True) as sess:
+        image = constant_op.constant(img_np, shape=img_shape)
+        y = image_ops.resize_images_v2(image, new_size, method)
+        yshape = array_ops.shape(y)
+        resized, newshape = sess.run([y, yshape], {new_size: [6, 4]})
+        self.assertAllEqual(img_shape, newshape)
+        if method in self.INTERPOLATING_METHODS:
+          self.assertAllClose(resized, img_np, atol=1e-5)
+
+      # Resizing with a single image must leave the shape unchanged also.
+      with self.cached_session(use_gpu=True):
+        img_single = img_np.reshape(single_shape)
+        image = constant_op.constant(img_single, shape=single_shape)
+        y = image_ops.resize_images_v2(image, new_size, self.METHODS[0])
+        yshape = array_ops.shape(y)
+        resized, newshape = sess.run([y, yshape], {new_size: [6, 4]})
+        self.assertAllEqual(single_shape, newshape)
+        if method in self.INTERPOLATING_METHODS:
+          self.assertAllClose(resized, img_single, atol=1e-5)
+
+    # Incorrect shape.
+    with self.assertRaises(ValueError):
+      new_size = constant_op.constant(4)
+      _ = image_ops.resize_images_v2(image, new_size,
+                                     image_ops.ResizeMethod.BILINEAR)
+    with self.assertRaises(ValueError):
+      new_size = constant_op.constant([4])
+      _ = image_ops.resize_images_v2(image, new_size,
+                                     image_ops.ResizeMethod.BILINEAR)
+    with self.assertRaises(ValueError):
+      new_size = constant_op.constant([1, 2, 3])
+      _ = image_ops.resize_images_v2(image, new_size,
+                                     image_ops.ResizeMethod.BILINEAR)
+
+    # Incorrect dtypes.
+    with self.assertRaises(ValueError):
+      new_size = constant_op.constant([6.0, 4])
+      _ = image_ops.resize_images_v2(image, new_size,
+                                     image_ops.ResizeMethod.BILINEAR)
+    with self.assertRaises(ValueError):
+      _ = image_ops.resize_images_v2(image, [6, 4.0],
+                                     image_ops.ResizeMethod.BILINEAR)
+    with self.assertRaises(ValueError):
+      _ = image_ops.resize_images_v2(image, [None, 4],
+                                     image_ops.ResizeMethod.BILINEAR)
+    with self.assertRaises(ValueError):
+      _ = image_ops.resize_images_v2(image, [6, None],
+                                     image_ops.ResizeMethod.BILINEAR)
+
+  @test_util.run_deprecated_v1
+  def testReturnDtype(self):
+    target_shapes = [[6, 4], [3, 2],
+                     [
+                         array_ops.placeholder(dtypes.int32),
+                         array_ops.placeholder(dtypes.int32)
+                     ]]
+    for nptype in self.TYPES:
+      image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1])
+      for method in self.METHODS:
+        for target_shape in target_shapes:
+          y = image_ops.resize_images_v2(image, target_shape, method)
+          if method == image_ops.ResizeMethod.NEAREST_NEIGHBOR:
+            expected_dtype = image.dtype
+          else:
+            expected_dtype = dtypes.float32
+          self.assertEqual(y.dtype, expected_dtype)
+
+  def testSumTensor(self):
+    img_shape = [1, 6, 4, 1]
+    # This test is also conducted with int8, so 127 is the maximum
+    # value that can be used.
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    # Test size where width is specified as a tensor which is a sum
+    # of two tensors.
+    width_1 = constant_op.constant(1)
+    width_2 = constant_op.constant(3)
+    width = math_ops.add(width_1, width_2)
+    height = constant_op.constant(6)
+
+    img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
+
+    for method in self.METHODS:
+      with self.cached_session():
+        image = constant_op.constant(img_np, shape=img_shape)
+        y = image_ops.resize_images_v2(image, [height, width], method)
+        yshape = array_ops.shape(y)
+        resized, newshape = self.evaluate([y, yshape])
+        self.assertAllEqual(img_shape, newshape)
+        if method in self.INTERPOLATING_METHODS:
+          self.assertAllClose(resized, img_np, atol=1e-5)
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  def testResizeDown(self):
+    # This test is also conducted with int8, so 127 is the maximum
+    # value that can be used.
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    expected_data = [127, 64, 64, 127, 50, 100]
+    target_height = 3
+    target_width = 2
+
+    # Test out 3-D and 4-D image shapes.
+    img_shapes = [[1, 6, 4, 1], [6, 4, 1]]
+    target_shapes = [[1, target_height, target_width, 1],
+                     [target_height, target_width, 1]]
+
+    for target_shape, img_shape in zip(target_shapes, img_shapes):
+
+      for nptype in self.TYPES:
+        img_np = np.array(data, dtype=nptype).reshape(img_shape)
+
+        for method in self.METHODS:
+          if test.is_gpu_available() and self.shouldRunOnGPU(method, nptype):
+            with self.cached_session(use_gpu=True):
+              image = constant_op.constant(img_np, shape=img_shape)
+              y = image_ops.resize_images_v2(
+                  image, [target_height, target_width], method)
+              expected = np.array(expected_data).reshape(target_shape)
+              resized = self.evaluate(y)
+              self.assertAllClose(resized, expected, atol=1e-5)
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  def testResizeUp(self):
+    img_shape = [1, 3, 2, 1]
+    data = [64, 32, 32, 64, 50, 100]
+    target_height = 6
+    target_width = 4
+    expected_data = {}
+    expected_data[image_ops.ResizeMethod.BILINEAR] = [
+        64.0, 56.0, 40.0, 32.0, 56.0, 52.0, 44.0, 40.0, 40.0, 44.0, 52.0, 56.0,
+        36.5, 45.625, 63.875, 73.0, 45.5, 56.875, 79.625, 91.0, 50.0, 62.5,
+        87.5, 100.0
+    ]
+    expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
+    expected_data[image_ops.ResizeMethod.AREA] = [
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
+    expected_data[image_ops.ResizeMethod.LANCZOS3] = [
+        75.8294, 59.6281, 38.4313, 22.23, 60.6851, 52.0037, 40.6454, 31.964,
+        35.8344, 41.0779, 47.9383, 53.1818, 24.6968, 43.0769, 67.1244, 85.5045,
+        35.7939, 56.4713, 83.5243, 104.2017, 44.8138, 65.1949, 91.8603, 112.2413
+    ]
+    expected_data[image_ops.ResizeMethod.LANCZOS5] = [
+        77.5699, 60.0223, 40.6694, 23.1219, 61.8253, 51.2369, 39.5593, 28.9709,
+        35.7438, 40.8875, 46.5604, 51.7041, 21.5942, 43.5299, 67.7223, 89.658,
+        32.1213, 56.784, 83.984, 108.6467, 44.5802, 66.183, 90.0082, 111.6109
+    ]
+    expected_data[image_ops.ResizeMethod.GAUSSIAN] = [
+        61.1087, 54.6926, 41.3074, 34.8913, 54.6926, 51.4168, 44.5832, 41.3074,
+        41.696, 45.2456, 52.6508, 56.2004, 39.4273, 47.0526, 62.9602, 70.5855,
+        47.3008, 57.3042, 78.173, 88.1764, 51.4771, 62.3638, 85.0752, 95.9619
+    ]
+    expected_data[image_ops.ResizeMethod.BICUBIC] = [
+        70.1453, 59.0252, 36.9748, 25.8547, 59.3195, 53.3386, 41.4789, 35.4981,
+        36.383, 41.285, 51.0051, 55.9071, 30.2232, 42.151, 65.8032, 77.731,
+        41.6492, 55.823, 83.9288, 98.1026, 47.0363, 62.2744, 92.4903, 107.7284
+    ]
+    expected_data[image_ops.ResizeMethod.MITCHELLCUBIC] = [
+        66.0382, 56.6079, 39.3921, 29.9618, 56.7255, 51.9603, 43.2611, 38.4959,
+        39.1828, 43.4664, 51.2864, 55.57, 34.6287, 45.1812, 64.4458, 74.9983,
+        43.8523, 56.8078, 80.4594, 93.4149, 48.9943, 63.026, 88.6422, 102.6739
+    ]
+    for nptype in self.TYPES:
+      for method in expected_data:
+        with self.cached_session(use_gpu=True):
+          img_np = np.array(data, dtype=nptype).reshape(img_shape)
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images_v2(image, [target_height, target_width],
+                                         method)
+          resized = self.evaluate(y)
+          expected = np.array(expected_data[method]).reshape(
+              [1, target_height, target_width, 1])
+          self.assertAllClose(resized, expected, atol=1e-04)
+
+  # XLA doesn't implement half_pixel_centers
+  @test_util.disable_xla("b/127616992")
+  def testLegacyBicubicMethodsMatchNewMethods(self):
+    img_shape = [1, 3, 2, 1]
+    data = [64, 32, 32, 64, 50, 100]
+    target_height = 6
+    target_width = 4
+    methods_to_test = ((gen_image_ops.resize_bilinear, "triangle"),
+                       (gen_image_ops.resize_bicubic, "keyscubic"))
+    for legacy_method, new_method in methods_to_test:
+      with self.cached_session(use_gpu=True):
+        img_np = np.array(data, dtype=np.float32).reshape(img_shape)
+        image = constant_op.constant(img_np, shape=img_shape)
+        legacy_result = legacy_method(
+            image,
+            constant_op.constant([target_height, target_width],
+                                 dtype=dtypes.int32),
+            half_pixel_centers=True)
+        scale = (
+            constant_op.constant([target_height, target_width],
+                                 dtype=dtypes.float32) /
+            math_ops.cast(array_ops.shape(image)[1:3], dtype=dtypes.float32))
+        new_result = gen_image_ops.scale_and_translate(
+            image,
+            constant_op.constant([target_height, target_width],
+                                 dtype=dtypes.int32),
+            scale,
+            array_ops.zeros([2]),
+            kernel_type=new_method,
+            antialias=False)
+        self.assertAllClose(
+            self.evaluate(legacy_result), self.evaluate(new_result), atol=1e-04)
+
+  def testResizeDownArea(self):
+    img_shape = [1, 6, 6, 1]
+    data = [
+        128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5,
+        10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30
+    ]
+    img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
+
+    target_height = 4
+    target_width = 4
+    expected_data = [
+        73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
+    ]
+
+    with self.cached_session(use_gpu=True):
+      image = constant_op.constant(img_np, shape=img_shape)
+      y = image_ops.resize_images_v2(image, [target_height, target_width],
+                                     image_ops.ResizeMethod.AREA)
+      expected = np.array(expected_data).reshape(
+          [1, target_height, target_width, 1])
+      resized = self.evaluate(y)
+      self.assertAllClose(resized, expected, atol=1)
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  def testCompareNearestNeighbor(self):
+    if test.is_gpu_available():
+      input_shape = [1, 5, 6, 3]
+      target_height = 8
+      target_width = 12
+      for nptype in [np.float32, np.float64]:
+        img_np = np.arange(
+            0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
+        with self.cached_session(use_gpu=True):
+          image = constant_op.constant(img_np, shape=input_shape)
+          new_size = constant_op.constant([target_height, target_width])
+          out_op = image_ops.resize_images_v2(
+              image, new_size, image_ops.ResizeMethod.NEAREST_NEIGHBOR)
+          gpu_val = self.evaluate(out_op)
+        with self.cached_session(use_gpu=False):
+          image = constant_op.constant(img_np, shape=input_shape)
+          new_size = constant_op.constant([target_height, target_width])
+          out_op = image_ops.resize_images_v2(
+              image, new_size, image_ops.ResizeMethod.NEAREST_NEIGHBOR)
+          cpu_val = self.evaluate(out_op)
+        self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
+
+  def testCompareBilinear(self):
+    if test.is_gpu_available():
+      input_shape = [1, 5, 6, 3]
+      target_height = 8
+      target_width = 12
+      for nptype in [np.float32, np.float64]:
+        img_np = np.arange(
+            0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
+        value = {}
+        for use_gpu in [True, False]:
+          with self.cached_session(use_gpu=use_gpu):
+            image = constant_op.constant(img_np, shape=input_shape)
+            new_size = constant_op.constant([target_height, target_width])
+            out_op = image_ops.resize_images(image, new_size,
+                                             image_ops.ResizeMethod.BILINEAR)
+            value[use_gpu] = self.evaluate(out_op)
+        self.assertAllClose(value[True], value[False], rtol=1e-5, atol=1e-5)
+
+  @test_util.run_deprecated_v1
+  def testShapeInference(self):
+    self._assertShapeInference([50, 60, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([55, 66, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([59, 69, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([50, 69, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([59, 60, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([None, 60, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([None, 66, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([None, 69, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([50, None, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([55, None, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([59, None, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([None, None, 3], [55, 66], [55, 66, 3])
+    self._assertShapeInference([50, 60, None], [55, 66], [55, 66, None])
+    self._assertShapeInference([55, 66, None], [55, 66], [55, 66, None])
+    self._assertShapeInference([59, 69, None], [55, 66], [55, 66, None])
+    self._assertShapeInference([50, 69, None], [55, 66], [55, 66, None])
+    self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
+    self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
+
+  @test_util.run_deprecated_v1
+  def testNameScope(self):
+    with self.cached_session(use_gpu=True):
+      single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
+      y = image_ops.resize_images(single_image, [55, 66])
+      self.assertTrue(y.op.name.startswith("resize"))
+
+  def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
+                       use_tensor_inputs):
+    if use_tensor_inputs:
+      target_max = ops.convert_to_tensor([max_h, max_w])
+      x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim)
+      feed_dict = {x_tensor: x}
+    else:
+      target_max = [max_h, max_w]
+      x_tensor = x
+      feed_dict = {}
+
+    y = image_ops.resize_images(
+        x_tensor, target_max, preserve_aspect_ratio=preserve_aspect_ratio)
+
+    with self.cached_session(use_gpu=True):
+      return y.eval(feed_dict=feed_dict)
+
+  def _assertResizeEqual(self,
+                         x,
+                         x_shape,
+                         y,
+                         y_shape,
+                         preserve_aspect_ratio=True,
+                         use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    target_height, target_width, _ = y_shape
+    x = np.array(x).reshape(x_shape)
+    y = np.array(y).reshape(y_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      y_tf = self._ResizeImageCall(x, target_height, target_width,
+                                   preserve_aspect_ratio, use_tensor_inputs)
+      self.assertAllClose(y, y_tf)
+
+  def _assertResizeCheckShape(self,
+                              x,
+                              x_shape,
+                              target_shape,
+                              y_shape,
+                              preserve_aspect_ratio=True,
+                              use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    target_height, target_width = target_shape
+    x = np.array(x).reshape(x_shape)
+    y = np.zeros(y_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      y_tf = self._ResizeImageCall(x, target_height, target_width,
+                                   preserve_aspect_ratio, use_tensor_inputs)
+      self.assertShapeEqual(y, ops.convert_to_tensor(y_tf))
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioMultipleImages(self):
+    x_shape = [10, 100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(
+        x, x_shape, [250, 250], [10, 250, 250, 10], preserve_aspect_ratio=False)
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioNoOp(self):
+    x_shape = [10, 10, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeEqual(x, x_shape, x, x_shape)
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioSmaller(self):
+    x_shape = [100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [75, 50], [50, 50, 10])
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioSmallerMultipleImages(self):
+    x_shape = [10, 100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [75, 50], [10, 50, 50, 10])
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioLarger(self):
+    x_shape = [100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [150, 200], [150, 150, 10])
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioSameRatio(self):
+    x_shape = [1920, 1080, 3]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
+
+  @test_util.run_deprecated_v1
+  def testPreserveAspectRatioSquare(self):
+    x_shape = [299, 299, 3]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [320, 320], [320, 320, 3])
+
+
+class ResizeImagesTest(test_util.TensorFlowTestCase):
+
+  METHODS = [
+      image_ops.ResizeMethodV1.BILINEAR,
+      image_ops.ResizeMethodV1.NEAREST_NEIGHBOR,
+      image_ops.ResizeMethodV1.BICUBIC, image_ops.ResizeMethodV1.AREA
   ]
 
   TYPES = [
@@ -2251,13 +2761,14 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_images(images, size)
     self.assertEqual(y.get_shape().as_list(), [None] + post_shape)
 
-  def shouldRunOnGPU(self, opt, nptype):
-    if (opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR and
+  def shouldRunOnGPU(self, method, nptype):
+    if (method == image_ops.ResizeMethodV1.NEAREST_NEIGHBOR and
         nptype in [np.float32, np.float64]):
       return True
     else:
       return False
 
+  @test_util.disable_xla("align_corners=False not supported by XLA")
   @test_util.run_deprecated_v1
   def testNoOp(self):
     img_shape = [1, 6, 4, 1]
@@ -2274,21 +2785,22 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     for nptype in self.TYPES:
       img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
-      for opt in self.OPTIONS:
-        with self.test_session(use_gpu=True) as sess:
+      for method in self.METHODS:
+        with self.cached_session(use_gpu=True) as sess:
           image = constant_op.constant(img_np, shape=img_shape)
-          y = image_ops.resize_images(image, [target_height, target_width], opt)
+          y = image_ops.resize_images(image, [target_height, target_width],
+                                      method)
           yshape = array_ops.shape(y)
           resized, newshape = self.evaluate([y, yshape])
           self.assertAllEqual(img_shape, newshape)
           self.assertAllClose(resized, img_np, atol=1e-5)
 
       # Resizing with a single image must leave the shape unchanged also.
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         img_single = img_np.reshape(single_shape)
         image = constant_op.constant(img_single, shape=single_shape)
         y = image_ops.resize_images(image, [target_height, target_width],
-                                    self.OPTIONS[0])
+                                    self.METHODS[0])
         yshape = array_ops.shape(y)
         newshape = self.evaluate(yshape)
         self.assertAllEqual(single_shape, newshape)
@@ -2307,20 +2819,20 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
-    for opt in self.OPTIONS:
-      with self.test_session(use_gpu=True) as sess:
+    for method in self.METHODS:
+      with self.cached_session(use_gpu=True) as sess:
         image = constant_op.constant(img_np, shape=img_shape)
-        y = image_ops.resize_images(image, new_size, opt)
+        y = image_ops.resize_images(image, new_size, method)
         yshape = array_ops.shape(y)
         resized, newshape = sess.run([y, yshape], {new_size: [6, 4]})
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
     # Resizing with a single image must leave the shape unchanged also.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       img_single = img_np.reshape(single_shape)
       image = constant_op.constant(img_single, shape=single_shape)
-      y = image_ops.resize_images(image, new_size, self.OPTIONS[0])
+      y = image_ops.resize_images(image, new_size, self.METHODS[0])
       yshape = array_ops.shape(y)
       resized, newshape = sess.run([y, yshape], {new_size: [6, 4]})
       self.assertAllEqual(single_shape, newshape)
@@ -2330,30 +2842,30 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       new_size = constant_op.constant(4)
       _ = image_ops.resize_images(image, new_size,
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
     with self.assertRaises(ValueError):
       new_size = constant_op.constant([4])
       _ = image_ops.resize_images(image, new_size,
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
     with self.assertRaises(ValueError):
       new_size = constant_op.constant([1, 2, 3])
       _ = image_ops.resize_images(image, new_size,
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
 
     # Incorrect dtypes.
     with self.assertRaises(ValueError):
       new_size = constant_op.constant([6.0, 4])
       _ = image_ops.resize_images(image, new_size,
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
     with self.assertRaises(ValueError):
       _ = image_ops.resize_images(image, [6, 4.0],
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
     with self.assertRaises(ValueError):
       _ = image_ops.resize_images(image, [None, 4],
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
     with self.assertRaises(ValueError):
       _ = image_ops.resize_images(image, [6, None],
-                                  image_ops.ResizeMethod.BILINEAR)
+                                  image_ops.ResizeMethodV1.BILINEAR)
 
   @test_util.run_deprecated_v1
   def testReturnDtype(self):
@@ -2363,16 +2875,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     ]]
     for nptype in self.TYPES:
       image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1])
-      for opt in self.OPTIONS:
+      for method in self.METHODS:
         for target_shape in target_shapes:
-          y = image_ops.resize_images(image, target_shape, opt)
-          if (opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR or
+          y = image_ops.resize_images(image, target_shape, method)
+          if (method == image_ops.ResizeMethodV1.NEAREST_NEIGHBOR or
               target_shape == image.shape[1:3]):
             expected_dtype = image.dtype
           else:
             expected_dtype = dtypes.float32
           self.assertEqual(y.dtype, expected_dtype)
 
+  @test_util.disable_xla("align_corners=False not supported by XLA")
   def testSumTensor(self):
     img_shape = [1, 6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
@@ -2390,15 +2903,16 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
-    for opt in self.OPTIONS:
+    for method in self.METHODS:
       with self.cached_session() as sess:
         image = constant_op.constant(img_np, shape=img_shape)
-        y = image_ops.resize_images(image, [height, width], opt)
+        y = image_ops.resize_images(image, [height, width], method)
         yshape = array_ops.shape(y)
         resized, newshape = self.evaluate([y, yshape])
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
+  @test_util.disable_xla("align_corners=False not supported by XLA")
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
@@ -2420,50 +2934,52 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       for nptype in self.TYPES:
         img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
-        for opt in self.OPTIONS:
-          if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-            with self.test_session(use_gpu=True):
+        for method in self.METHODS:
+          if test.is_gpu_available() and self.shouldRunOnGPU(method, nptype):
+            with self.cached_session(use_gpu=True):
               image = constant_op.constant(img_np, shape=img_shape)
               y = image_ops.resize_images(image, [target_height, target_width],
-                                          opt)
+                                          method)
               expected = np.array(expected_data).reshape(target_shape)
               resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
+  @test_util.disable_xla("align_corners=False not supported by XLA")
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
     data = [64, 32, 32, 64, 50, 100]
     target_height = 6
     target_width = 4
     expected_data = {}
-    expected_data[image_ops.ResizeMethod.BILINEAR] = [
+    expected_data[image_ops.ResizeMethodV1.BILINEAR] = [
         64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0,
         41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0,
         100.0
     ]
-    expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
+    expected_data[image_ops.ResizeMethodV1.NEAREST_NEIGHBOR] = [
         64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
         32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
         100.0
     ]
-    expected_data[image_ops.ResizeMethod.AREA] = [
+    expected_data[image_ops.ResizeMethodV1.AREA] = [
         64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
         32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
         100.0
     ]
 
     for nptype in self.TYPES:
-      for opt in [
-          image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
+      for method in [
+          image_ops.ResizeMethodV1.BILINEAR,
+          image_ops.ResizeMethodV1.NEAREST_NEIGHBOR,
+          image_ops.ResizeMethodV1.AREA
       ]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
-              image, [target_height, target_width], opt, align_corners=False)
+              image, [target_height, target_width], method, align_corners=False)
           resized = self.evaluate(y)
-          expected = np.array(expected_data[opt]).reshape(
+          expected = np.array(expected_data[method]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
 
@@ -2473,33 +2989,34 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     target_height = 5
     target_width = 4
     expected_data = {}
-    expected_data[image_ops.ResizeMethod.BILINEAR] = [
+    expected_data[image_ops.ResizeMethodV1.BILINEAR] = [
         6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5,
         6.5, 7.5, 6.0, 7.0, 8.0, 9.0
     ]
-    expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
+    expected_data[image_ops.ResizeMethodV1.NEAREST_NEIGHBOR] = [
         6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0,
         9.0, 9.0, 6.0, 6.0, 9.0, 9.0
     ]
-    # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
+    # TODO(b/37749740): Improve alignment of ResizeMethodV1.AREA when
     # align_corners=True.
-    expected_data[image_ops.ResizeMethod.AREA] = [
+    expected_data[image_ops.ResizeMethodV1.AREA] = [
         6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0,
         3.0, 6.0, 6.0, 6.0, 6.0, 9.0
     ]
 
     for nptype in self.TYPES:
-      for opt in [
-          image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
+      for method in [
+          image_ops.ResizeMethodV1.BILINEAR,
+          image_ops.ResizeMethodV1.NEAREST_NEIGHBOR,
+          image_ops.ResizeMethodV1.AREA
       ]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
-              image, [target_height, target_width], opt, align_corners=True)
+              image, [target_height, target_width], method, align_corners=True)
           resized = self.evaluate(y)
-          expected = np.array(expected_data[opt]).reshape(
+          expected = np.array(expected_data[method]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
 
@@ -2521,10 +3038,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105
     ]
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
-                                  image_ops.ResizeMethod.BICUBIC)
+                                  image_ops.ResizeMethodV1.BICUBIC)
       resized = self.evaluate(y)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
@@ -2544,15 +3061,16 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
     ]
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
-                                  image_ops.ResizeMethod.AREA)
+                                  image_ops.ResizeMethodV1.AREA)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
       resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
+  @test_util.disable_xla("align_corners=False not supported by XLA")
   def testCompareNearestNeighbor(self):
     if test.is_gpu_available():
       input_shape = [1, 5, 6, 3]
@@ -2562,22 +3080,22 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         for align_corners in [True, False]:
           img_np = np.arange(
               0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
-          with self.test_session(use_gpu=True):
+          with self.cached_session(use_gpu=True):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
             out_op = image_ops.resize_images(
                 image,
                 new_size,
-                image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+                image_ops.ResizeMethodV1.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
             gpu_val = self.evaluate(out_op)
-          with self.test_session(use_gpu=False):
+          with self.cached_session(use_gpu=False):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
             out_op = image_ops.resize_images(
                 image,
                 new_size,
-                image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+                image_ops.ResizeMethodV1.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
             cpu_val = self.evaluate(out_op)
           self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
@@ -2593,13 +3111,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
           value = {}
           for use_gpu in [True, False]:
-            with self.test_session(use_gpu=use_gpu):
+            with self.cached_session(use_gpu=use_gpu):
               image = constant_op.constant(img_np, shape=input_shape)
               new_size = constant_op.constant([target_height, target_width])
               out_op = image_ops.resize_images(
                   image,
                   new_size,
-                  image_ops.ResizeMethod.BILINEAR,
+                  image_ops.ResizeMethodV1.BILINEAR,
                   align_corners=align_corners)
               value[use_gpu] = self.evaluate(out_op)
           self.assertAllClose(value[True], value[False], rtol=1e-5, atol=1e-5)
@@ -2628,7 +3146,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testNameScope(self):
     img_shape = [1, 3, 2, 1]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
       self.assertTrue(y.op.name.startswith("resize"))
@@ -2647,7 +3165,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_images(x_tensor, target_max,
                                 preserve_aspect_ratio=preserve_aspect_ratio)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertResizeEqual(self, x, x_shape, y, y_shape,
@@ -2727,7 +3245,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertResizeCheckShape(x, x_shape, [320, 320], [320, 320, 3])
 
 
-class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
+class ResizeImageWithPadV1Test(test_util.TensorFlowTestCase):
 
   def _ResizeImageWithPad(self, x, target_height, target_width,
                           use_tensor_inputs):
@@ -2740,12 +3258,12 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
       x_tensor = x
       feed_dict = {}
 
-    y = image_ops.resize_image_with_pad(x_tensor, target_height,
-                                        target_width)
+    y = image_ops.resize_image_with_pad_v1(x_tensor, target_height,
+                                           target_width)
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -2786,7 +3304,7 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
 
   def _assertShapeInference(self, pre_shape, height, width, post_shape):
     image = array_ops.placeholder(dtypes.float32, shape=pre_shape)
-    y = image_ops.resize_image_with_pad(image, height, width)
+    y = image_ops.resize_image_with_pad_v1(image, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
   @test_util.run_deprecated_v1
@@ -2825,6 +3343,107 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
     self._assertReturns(x, x_shape, y, y_shape)
 
 
+class ResizeImageWithPadV2Test(test_util.TensorFlowTestCase):
+
+  def _ResizeImageWithPad(self, x, target_height, target_width,
+                          use_tensor_inputs):
+    if use_tensor_inputs:
+      target_height = ops.convert_to_tensor(target_height)
+      target_width = ops.convert_to_tensor(target_width)
+      x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim)
+      feed_dict = {x_tensor: x}
+    else:
+      x_tensor = x
+      feed_dict = {}
+
+    y = image_ops.resize_image_with_pad_v2(x_tensor, target_height,
+                                           target_width)
+    if not use_tensor_inputs:
+      self.assertTrue(y.get_shape().is_fully_defined())
+
+    with self.cached_session(use_gpu=True):
+      return y.eval(feed_dict=feed_dict)
+
+  def _assertReturns(self,
+                     x,
+                     x_shape,
+                     y,
+                     y_shape,
+                     use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    target_height, target_width, _ = y_shape
+    x = np.array(x).reshape(x_shape)
+    y = np.array(y).reshape(y_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      y_tf = self._ResizeImageWithPad(x, target_height, target_width,
+                                      use_tensor_inputs)
+      self.assertAllClose(y, y_tf)
+
+  def _assertRaises(self,
+                    x,
+                    x_shape,
+                    target_height,
+                    target_width,
+                    err_msg,
+                    use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    x = np.array(x).reshape(x_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      try:
+        self._ResizeImageWithPad(x, target_height, target_width,
+                                 use_tensor_inputs)
+      except Exception as e:  # pylint: disable=broad-except
+        if err_msg not in str(e):
+          raise
+      else:
+        raise AssertionError("Exception not raised: %s" % err_msg)
+
+  def _assertShapeInference(self, pre_shape, height, width, post_shape):
+    image = array_ops.placeholder(dtypes.float32, shape=pre_shape)
+    y = image_ops.resize_image_with_pad_v1(image, height, width)
+    self.assertEqual(y.get_shape().as_list(), post_shape)
+
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  @test_util.run_deprecated_v1
+  def testNoOp(self):
+    x_shape = [10, 10, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertReturns(x, x_shape, x, x_shape)
+
+  @test_util.disable_xla("align_corners=False not supported by XLA")
+  @test_util.run_deprecated_v1
+  def testPad(self):
+    # Reduce vertical dimension
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
+    x_shape = [2, 4, 1]
+
+    y = [0, 3.5, 5.5, 0]
+    y_shape = [1, 4, 1]
+
+    self._assertReturns(x, x_shape, y, y_shape)
+
+    # Reduce horizontal dimension
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
+    x_shape = [2, 4, 1]
+
+    y = [3.5, 5.5, 0, 0]
+    y_shape = [2, 2, 1]
+
+    self._assertReturns(x, x_shape, y, y_shape)
+
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
+    x_shape = [2, 4, 1]
+
+    y = [3.5, 5.5]
+    y_shape = [1, 2, 1]
+
+    self._assertReturns(x, x_shape, y, y_shape)
+
+
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
   def _ResizeImageWithCropOrPad(self, x, target_height, target_width,
@@ -2843,7 +3462,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -3098,7 +3717,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a real jpeg and verify shape
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_jpeg(jpeg0)
       image1 = image_ops.decode_jpeg(image_ops.encode_jpeg(image0))
@@ -3114,7 +3733,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     cmyk_path = os.path.join(base, "jpeg_merge_test1_cmyk.jpg")
     shape = 256, 128, 3
     for channels in 3, 0:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         rgb = image_ops.decode_jpeg(
             io_ops.read_file(rgb_path), channels=channels)
         cmyk = image_ops.decode_jpeg(
@@ -3171,7 +3790,7 @@ class JpegTest(test_util.TensorFlowTestCase):
           self.evaluate(result)
 
   def testSynthetic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it, then encode it
       image0 = constant_op.constant(_SimpleColorRamp())
       jpeg0 = image_ops.encode_jpeg(image0)
@@ -3192,7 +3811,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(jpeg0), 6000)
 
   def testSyntheticFasterAlgorithm(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it, then encode it
       image0 = constant_op.constant(_SimpleColorRamp())
       jpeg0 = image_ops.encode_jpeg(image0)
@@ -3216,7 +3835,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(jpeg0), 6000)
 
   def testDefaultDCTMethodIsIntegerFast(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Compare decoding with both dct_option=INTEGER_FAST and
       # default.  They should be the same.
       image0 = constant_op.constant(_SimpleColorRamp())
@@ -3230,7 +3849,7 @@ class JpegTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = constant_op.constant("nonsense")
       for channels in 0, 1, 3:
         image = image_ops.decode_jpeg(jpeg, channels=channels)
@@ -3242,7 +3861,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a real jpeg and verify shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = io_ops.read_file(path)
       # Extract shape without decoding.
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
@@ -3253,7 +3872,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a cmyk jpeg image, and verify its shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1_cmyk.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = io_ops.read_file(path)
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
       # Cmyk jpeg image has 4 channels.
@@ -3269,7 +3888,7 @@ class PngTest(test_util.TensorFlowTestCase):
               (3, "lena_palette.png"), (4, "lena_palette_trns.png"))
     for channels_in, filename in inputs:
       for channels in 0, 1, 3, 4:
-        with self.test_session(use_gpu=True) as sess:
+        with self.cached_session(use_gpu=True) as sess:
           png0 = io_ops.read_file(prefix + filename)
           image0 = image_ops.decode_png(png0, channels=channels)
           png0, image0 = self.evaluate([png0, image0])
@@ -3279,7 +3898,7 @@ class PngTest(test_util.TensorFlowTestCase):
             self.assertAllEqual(image0, self.evaluate(image1))
 
   def testSynthetic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it
       image0 = constant_op.constant(_SimpleColorRamp())
       png0 = image_ops.encode_png(image0, compression=7)
@@ -3294,7 +3913,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(png0), 750)
 
   def testSyntheticUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it
       image0 = constant_op.constant(_SimpleColorRamp(), dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
@@ -3309,7 +3928,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(png0), 1500)
 
   def testSyntheticTwoChannel(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Strip the b channel from an rgb image to get a two-channel image.
       gray_alpha = _SimpleColorRamp()[:, :, 0:2]
       image0 = constant_op.constant(gray_alpha)
@@ -3320,7 +3939,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testSyntheticTwoChannelUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Strip the b channel from an rgb image to get a two-channel image.
       gray_alpha = _SimpleColorRamp()[:, :, 0:2]
       image0 = constant_op.constant(gray_alpha, dtype=dtypes.uint16)
@@ -3332,7 +3951,7 @@ class PngTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       png = constant_op.constant("nonsense")
       for channels in 0, 1, 3:
         image = image_ops.decode_png(png, channels=channels)
@@ -3350,7 +3969,7 @@ class GifTest(test_util.TensorFlowTestCase):
     STRIDE = 5
     shape = (12, HEIGHT, WIDTH, 3)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(prefix + filename)
       image0 = image_ops.decode_gif(gif0)
       gif0, image0 = self.evaluate([gif0, image0])
@@ -3377,7 +3996,7 @@ class GifTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       gif = constant_op.constant("nonsense")
       image = image_ops.decode_gif(gif)
       self.assertEqual(image.get_shape().as_list(), [None, None, None, 3])
@@ -3389,7 +4008,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
     x_np = np.array(original, dtype=original_dtype.as_numpy_dtype())
     y_np = np.array(expected, dtype=output_dtype.as_numpy_dtype())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(x_np)
       y = image_ops.convert_image_dtype(image, output_dtype)
       self.assertTrue(y.dtype == output_dtype)
@@ -3405,7 +4024,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Make sure converting to the same data type creates only an identity op
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant([1], dtype=dtypes.uint8)
       image_ops.convert_image_dtype(image, dtypes.uint8)
       y = image_ops.convert_image_dtype(image, dtypes.uint8)
@@ -3415,7 +4034,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenInteger(self):
     # Make sure converting to between integer types scales appropriately
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128])
       self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255])
       self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
@@ -3424,7 +4043,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float32, dtypes.float64,
                     [-1.0, 0, 1.0, 200000])
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float64, dtypes.float32,
@@ -3433,7 +4052,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenIntegerAndFloat(self):
     # Make sure converting from and to a float type scales appropriately
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([0, 1, 255], dtypes.uint8, dtypes.float32,
                     [0, 1.0 / 255.0, 1])
       self._convert([0, 1.1 / 255.0, 1], dtypes.float32, dtypes.uint8,
@@ -3441,7 +4060,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testConvertBetweenInt16AndInt8(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # uint8, uint16
       self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255])
       self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256])
@@ -3472,7 +4091,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     """
 
     # Create a TensorFlow session.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # Add a constant to the TensorFlow graph that holds the input.
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
 
@@ -3612,7 +4231,8 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # If we negate all pixel-values then the total variation is unchanged.
     self._test(-a, tot_var)
 
-    # Scale the pixel-values by a float. This scales the total variation as well.
+    # Scale the pixel-values by a float. This scales the total variation as
+    # well.
     b = 1.1 * a
     self._test(b, 1.1 * tot_var)
 
@@ -3860,7 +4480,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
     img = array_ops.placeholder(dtype=dtypes.float32)
     img_np = np.array((2, 2))
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       _, _, checks = image_ops_impl._verify_compatible_image_shapes(img, img)
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img: img_np})
@@ -3873,7 +4493,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
     img1_np = np.array([1, 2, 2, 1])
     img2_np = np.array([1, 3, 3, 1])
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       _, _, checks = image_ops_impl._verify_compatible_image_shapes(img1, img2)
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img1: img1_np, img2: img2_np})
@@ -3891,7 +4511,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       q20 = self._LoadTestImage(sess, "cat_q20.jpg")
       q72 = self._LoadTestImage(sess, "cat_q72.jpg")
       q95 = self._LoadTestImage(sess, "cat_q95.jpg")
@@ -3912,7 +4532,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     image2 = self._RandomImage((8, 8, 1), 1)
     psnr = self._PSNR_NumPy(image1, image2, 1)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_image1 = constant_op.constant(image1, shape=image1.shape,
                                        dtype=dtypes.float32)
       tf_image2 = constant_op.constant(image2, shape=image2.shape,
@@ -3926,7 +4546,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     image2 = self._RandomImage((10, 8, 8, 1), 1)
     psnr = self._PSNR_NumPy(image1, image2, 1)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_image1 = constant_op.constant(image1, shape=image1.shape,
                                        dtype=dtypes.float32)
       tf_image2 = constant_op.constant(image2, shape=image2.shape,
@@ -3948,7 +4568,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     self.assertNear(35.302, psnr3, 0.001)
 
     # Test TensorFlow implementation.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
       tf_q72 = constant_op.constant(q72, shape=q72.shape, dtype=dtypes.float32)
       tf_q95 = constant_op.constant(q95, shape=q95.shape, dtype=dtypes.float32)
@@ -3963,7 +4583,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
   def testInfinity(self):
     q20, _, _ = self._LoadTestImages()
     psnr = self._PSNR_NumPy(q20, q20, 1)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
       tf_psnr = image_ops.psnr(tf_q20, tf_q20, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
@@ -3978,7 +4598,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     psnr_float32 = image_ops.psnr(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           psnr_uint8.eval(), self.evaluate(psnr_float32), atol=0.001)
 
@@ -4003,7 +4623,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       return [self._LoadTestImage(sess, f) for f in self._filenames]
 
   def _RandomImage(self, shape, max_val):
@@ -4018,7 +4638,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
     ssim = image_ops.ssim(*ph, max_val=1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       scores = [ssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
     self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
@@ -4033,7 +4653,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testBroadcast(self):
@@ -4045,7 +4665,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
     ssim = image_ops.ssim(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   @test_util.run_deprecated_v1
@@ -4060,7 +4680,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 255)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertLess(ssim.eval(), 0)
 
   @test_util.run_deprecated_v1
@@ -4073,7 +4693,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
@@ -4098,7 +4718,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       return [self._LoadTestImage(sess, f) for f in self._filenames]
 
   def _RandomImage(self, shape, max_val):
@@ -4116,7 +4736,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
     msssim = image_ops.ssim_multiscale(*ph, max_val=1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       scores = [msssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
 
@@ -4131,7 +4751,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     msssim = image_ops.ssim_multiscale(*scaled_ph, max_val=1.0,
                                        power_factors=(1, 1, 1, 1, 1))
     grads = gradients.gradients(msssim, scalar)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       np_grads = sess.run(grads, feed_dict={ph[0]: img[0], ph[1]: img[1]})
     self.assertTrue(np.isfinite(np_grads).all())
 
@@ -4146,7 +4766,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
                                        constant_op.constant(img2), 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
   def testBroadcast(self):
@@ -4159,7 +4779,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
     score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
   def testRange(self):
@@ -4169,7 +4789,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     If any of the value is negative so that the geometric mean is not
     well-defined, then treat the MS-SSIM score as zero.
     """
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       img1 = self._LoadTestImage(sess, "checkerboard1.png")
       img2 = self._LoadTestImage(sess, "checkerboard3.png")
       images = [img1, img2, np.zeros_like(img1),
@@ -4194,7 +4814,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
@@ -4235,7 +4855,7 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     batch = constant_op.constant(batch)
     assert batch.get_shape().as_list() == [2, 2, 3, 2]
     dy, dx = image_ops.image_gradients(batch)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_dy = self.evaluate(dy)
       actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
@@ -4256,7 +4876,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
     expected = np.reshape([[[0, 0], [0, 12], [0, 0]],
                            [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
     sobel = image_ops.sobel_edges(img)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected, actual_sobel)
 
@@ -4278,7 +4898,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
     expected_batch = np.concatenate([expected_two_channel] * batch_size, axis=0)
 
     sobel = image_ops.sobel_edges(img)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected_batch, actual_sobel)
 
@@ -4286,7 +4906,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
 class DecodeImageTest(test_util.TensorFlowTestCase):
 
   def testJpegUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
@@ -4296,7 +4916,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testPngUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/png/testdata"
       png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
       image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
@@ -4306,7 +4926,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testGifUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/gif/testdata"
       gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
       image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
@@ -4316,7 +4936,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testBmpUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/bmp/testdata"
       bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
@@ -4326,7 +4946,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testJpegFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
@@ -4336,7 +4956,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testPngFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/png/testdata"
       png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
       image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
@@ -4346,7 +4966,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testGifFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/gif/testdata"
       gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
       image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
@@ -4356,7 +4976,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testBmpFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/bmp/testdata"
       bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index a4cebc8d5891da23e9c1042b478dcabe9b7994a0..035534ef49cc4c715b2101beb98e1d1aa6a72071 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -38,6 +38,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
@@ -531,7 +532,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -605,7 +606,8 @@ class Orthogonal(Initializer):
     num_rows = 1
     for dim in shape[:-1]:
       num_rows *= dim
-    num_cols = shape[-1]
+    num_rows = int(num_rows)
+    num_cols = int(shape[-1])
     if num_rows < num_cols:
       flat_shape = (num_cols, num_rows)
     else:
@@ -1207,6 +1209,8 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
+    if isinstance(full_shape, tensor_shape.TensorShape):
+      full_shape = full_shape.as_list()
     initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
@@ -1263,9 +1267,10 @@ class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / (fan_in + fan_out))`
-  where `fan_in` is the number of input units in the weight tensor
-  and `fan_out` is the number of output units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number
+  of input units in the weight tensor and `fan_out` is the number of
+  output units in the weight tensor.
 
   Args:
     seed: A Python integer. Used to create random seeds. See
@@ -1321,8 +1326,9 @@ def lecun_normal(seed=None):
   """LeCun normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(1 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(1 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1371,8 +1377,9 @@ def he_normal(seed=None):
   """He normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1422,7 +1429,7 @@ def _compute_fans(shape):
     shape: Integer shape tuple or TF tensor shape.
 
   Returns:
-    A tuple of scalars (fan_in, fan_out).
+    A tuple of integer scalars (fan_in, fan_out).
   """
   if len(shape) < 1:  # Just to avoid errors for constants.
     fan_in = fan_out = 1
@@ -1434,12 +1441,12 @@ def _compute_fans(shape):
   else:
     # Assuming convolution kernels (2D, 3D, or more).
     # kernel shape: (..., input_depth, depth)
-    receptive_field_size = 1.
+    receptive_field_size = 1
     for dim in shape[:-2]:
       receptive_field_size *= dim
     fan_in = shape[-2] * receptive_field_size
     fan_out = shape[-1] * receptive_field_size
-  return fan_in, fan_out
+  return int(fan_in), int(fan_out)
 
 
 def _assert_float_dtype(dtype):
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index b3cdec9dd407c26277ed2d710397a0a831d75e16..1205f367bc99c8e07b97d45b6e4ae7089a089e13 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape as tensor_shape_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
@@ -54,114 +55,126 @@ class InitializersTest(test.TestCase):
       self.assertGreater(lim, abs(output.min() - target_min))
 
   def test_uniform(self):
-    tensor_shape = (9, 6, 7)
+    shape = (9, 6, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
-          tensor_shape,
-          target_mean=0.,
-          target_max=1,
-          target_min=-1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
+            tensor_shape,
+            target_mean=0.,
+            target_max=1,
+            target_min=-1)
 
   def test_normal(self):
-    tensor_shape = (8, 12, 99)
+    shape = (8, 12, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomNormal(mean=0, stddev=1, seed=153),
-          tensor_shape,
-          target_mean=0.,
-          target_std=1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomNormal(mean=0, stddev=1, seed=153),
+            tensor_shape,
+            target_mean=0.,
+            target_std=1)
 
   def test_truncated_normal(self):
-    tensor_shape = (12, 99, 7)
+    shape = (12, 99, 7)
     with self.cached_session():
-      self._runner(
-          init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
-          tensor_shape,
-          target_mean=0.,
-          target_max=2,
-          target_min=-2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
+            tensor_shape,
+            target_mean=0.,
+            target_max=2,
+            target_min=-2)
 
   def test_constant(self):
-    tensor_shape = (5, 6, 4)
+    shape = (5, 6, 4)
     with self.cached_session():
-      self._runner(
-          init_ops.Constant(2),
-          tensor_shape,
-          target_mean=2,
-          target_max=2,
-          target_min=2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Constant(2),
+            tensor_shape,
+            target_mean=2,
+            target_max=2,
+            target_min=2)
 
   def test_lecun_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_uniform_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_uniform_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_uniform_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_lecun_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_normal_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_normal_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_normal_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_Orthogonal(self):
-    tensor_shape = (20, 20)
+    shape = (20, 20)
     with self.cached_session():
-      self._runner(init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
 
   def testVariablePlacementWithOrthogonalInitializer(self):
     if not context.context().num_gpus():
@@ -199,31 +212,36 @@ class InitializersTest(test.TestCase):
 
   def test_Identity(self):
     with self.cached_session():
-      tensor_shape = (3, 4, 5)
-      with self.assertRaises(ValueError):
+      shape = (3, 4, 5)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        with self.assertRaises(ValueError):
+          self._runner(
+              init_ops.Identity(),
+              tensor_shape,
+              target_mean=1. / int(tensor_shape[0]),
+              target_max=1.)
+
+      shape = (3, 3)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
         self._runner(
             init_ops.Identity(),
             tensor_shape,
-            target_mean=1. / tensor_shape[0],
+            target_mean=1. / int(tensor_shape[0]),
             target_max=1.)
 
-      tensor_shape = (3, 3)
-      self._runner(
-          init_ops.Identity(),
-          tensor_shape,
-          target_mean=1. / tensor_shape[0],
-          target_max=1.)
-
   def test_Zeros(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(
-          init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
 
   def test_Ones(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py
index 4e614e0012a279a2c4257a850579bc63577207b7..f7a867870cf60c6a9f0d8bbf25371fc3a1784194 100644
--- a/tensorflow/python/ops/init_ops_v2.py
+++ b/tensorflow/python/ops/init_ops_v2.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -232,6 +233,7 @@ class RandomUniform(Initializer):
     self.minval = minval
     self.maxval = maxval
     self.seed = seed
+    self._random_generator = _RandomGenerator(seed)
 
   def __call__(self, shape, dtype=dtypes.float32):
     """Returns a tensor object initialized as specified by the initializer.
@@ -247,8 +249,8 @@ class RandomUniform(Initializer):
     dtype = dtypes.as_dtype(dtype)
     if not dtype.is_floating and not dtype.is_integer:
       raise ValueError("Expected float or integer dtype, got %s." % dtype)
-    return random_ops.random_uniform(
-        shape, self.minval, self.maxval, dtype, seed=self.seed)
+    return self._random_generator.random_uniform(shape, self.minval,
+                                                 self.maxval, dtype)
 
   def get_config(self):
     return {
@@ -276,6 +278,7 @@ class RandomNormal(Initializer):
     self.mean = mean
     self.stddev = stddev
     self.seed = seed
+    self._random_generator = _RandomGenerator(seed)
 
   def __call__(self, shape, dtype=dtypes.float32):
     """Returns a tensor object initialized as specified by the initializer.
@@ -289,8 +292,8 @@ class RandomNormal(Initializer):
       ValueError: If the dtype is not floating point
     """
     dtype = _assert_float_dtype(dtype)
-    return random_ops.random_normal(
-        shape, self.mean, self.stddev, dtype, seed=self.seed)
+    return self._random_generator.random_normal(shape, self.mean, self.stddev,
+                                                dtype)
 
   def get_config(self):
     return {
@@ -322,6 +325,7 @@ class TruncatedNormal(Initializer):
     self.mean = mean
     self.stddev = stddev
     self.seed = seed
+    self._random_generator = _RandomGenerator(seed)
 
   def __call__(self, shape, dtype=dtypes.float32):
     """Returns a tensor object initialized as specified by the initializer.
@@ -335,8 +339,8 @@ class TruncatedNormal(Initializer):
       ValueError: If the dtype is not floating point
     """
     dtype = _assert_float_dtype(dtype)
-    return random_ops.truncated_normal(
-        shape, self.mean, self.stddev, dtype, seed=self.seed)
+    return self._random_generator.truncated_normal(shape, self.mean,
+                                                   self.stddev, dtype)
 
   def get_config(self):
     return {
@@ -392,6 +396,7 @@ class VarianceScaling(Initializer):
     self.mode = mode
     self.distribution = distribution
     self.seed = seed
+    self._random_generator = _RandomGenerator(seed)
 
   def __call__(self, shape, dtype=dtypes.float32):
     """Returns a tensor object initialized as specified by the initializer.
@@ -420,16 +425,13 @@ class VarianceScaling(Initializer):
     if self.distribution == "truncated_normal":
       # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
-      return random_ops.truncated_normal(
-          shape, 0.0, stddev, dtype, seed=self.seed)
+      return self._random_generator.truncated_normal(shape, 0.0, stddev, dtype)
     elif self.distribution == "untruncated_normal":
       stddev = math.sqrt(scale)
-      return random_ops.random_normal(
-          shape, 0.0, stddev, dtype, seed=self.seed)
+      return self._random_generator.random_normal(shape, 0.0, stddev, dtype)
     else:
       limit = math.sqrt(3.0 * scale)
-      return random_ops.random_uniform(
-          shape, -limit, limit, dtype, seed=self.seed)
+      return self._random_generator.random_uniform(shape, -limit, limit, dtype)
 
   def get_config(self):
     return {
@@ -468,6 +470,7 @@ class Orthogonal(Initializer):
   def __init__(self, gain=1.0, seed=None):
     self.gain = gain
     self.seed = seed
+    self._random_generator = _RandomGenerator(seed)
 
   def __call__(self, shape, dtype=dtypes.float32):
     """Returns a tensor object initialized as specified by the initializer.
@@ -475,7 +478,7 @@ class Orthogonal(Initializer):
     Args:
       shape: Shape of the tensor.
       dtype: Optional dtype of the tensor. Only floating point types are
-       supported.
+        supported.
 
     Raises:
       ValueError: If the dtype is not floating point or the input shape is not
@@ -495,7 +498,7 @@ class Orthogonal(Initializer):
     flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows))
 
     # Generate a random matrix
-    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
+    a = self._random_generator.random_normal(flat_shape, dtype=dtype)
     # Compute the qr factorization
     q, r = gen_linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
@@ -602,7 +605,7 @@ class GlorotNormal(VarianceScaling):
         seed=seed)
 
   def get_config(self):
-    return {"seed": self.seed, "dtype": self.dtype.name}
+    return {"seed": self.seed}
 
 
 # Aliases.
@@ -762,3 +765,56 @@ def _assert_float_dtype(dtype):
   if not dtype.is_floating:
     raise ValueError("Expected floating point type, got %s." % dtype)
   return dtype
+
+
+class _RandomGenerator(object):
+  """Random generator that selects appropriate random ops."""
+
+  def __init__(self, seed=None):
+    super(_RandomGenerator, self).__init__()
+    if seed is not None:
+      # Stateless random ops requires 2-int seed.
+      self.seed = [seed, 0]
+    else:
+      self.seed = None
+
+  def random_normal(self, shape, mean=0.0, stddev=1, dtype=dtypes.float32):
+    """A deterministic random normal if seed is passed."""
+    if self.seed:
+      op = stateless_random_ops.stateless_random_normal
+    else:
+      op = random_ops.random_normal
+    return op(
+        shape=shape, mean=mean, stddev=stddev, dtype=dtype, seed=self.seed)
+
+  def random_uniform(self, shape, minval, maxval, dtype):
+    """A deterministic random uniform if seed is passed."""
+    if self.seed:
+      op = stateless_random_ops.stateless_random_uniform
+    else:
+      op = random_ops.random_uniform
+    return op(
+        shape=shape, minval=minval, maxval=maxval, dtype=dtype, seed=self.seed)
+
+  def truncated_normal(self, shape, mean, stddev, dtype):
+    """A deterministic truncated normal if seed is passed."""
+    if self.seed:
+      op = stateless_random_ops.stateless_truncated_normal
+    else:
+      op = random_ops.truncated_normal
+    return op(
+        shape=shape, mean=mean, stddev=stddev, dtype=dtype, seed=self.seed)
+
+# Compatibility aliases
+
+# pylint: disable=invalid-name
+zero = zeros = Zeros
+one = ones = Ones
+constant = Constant
+uniform = random_uniform = RandomUniform
+normal = random_normal = RandomNormal
+truncated_normal = TruncatedNormal
+identity = Identity
+orthogonal = Orthogonal
+glorot_normal = GlorotNormal
+glorot_uniform = GlorotUniform
diff --git a/tensorflow/python/ops/inplace_ops.py b/tensorflow/python/ops/inplace_ops.py
index e5b000086b05219d23cd88935948f88f2cc718bf..a6b6f7a28b43f998a0e0f599fa6a41e5640592b1 100644
--- a/tensorflow/python/ops/inplace_ops.py
+++ b/tensorflow/python/ops/inplace_ops.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
@@ -55,7 +56,7 @@ def _inplace_helper(x, i, v, op):
     return array_ops.reshape(
         op(array_ops.reshape(x, [1, -1]), [0], array_ops.reshape(v, [1, -1])),
         array_ops.shape(x))
-  i = math_ops.to_int32(i)
+  i = math_ops.cast(i, dtypes.int32)
   if i.get_shape().ndims == 0:
     # Single 0-dim update.
     return op(x, array_ops.reshape(i, [1]), array_ops.expand_dims(v, 0))
diff --git a/tensorflow/python/ops/linalg/adjoint_registrations.py b/tensorflow/python/ops/linalg/adjoint_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ec97d2758f80aaa90c52646430b0d9c5e642bd
--- /dev/null
+++ b/tensorflow/python/ops/linalg/adjoint_registrations.py
@@ -0,0 +1,127 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.adjoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorAdjoint which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterAdjoint(linear_operator.LinearOperator)
+def _adjoint_linear_operator(linop):
+  return linear_operator_adjoint.LinearOperatorAdjoint(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_adjoint.LinearOperatorAdjoint)
+def _adjoint_adjoint_linear_operator(linop):
+  return linop.operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorIdentity)
+def _adjoint_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _adjoint_scaled_identity(identity_operator):
+  multiplier = identity_operator.multiplier
+  if multiplier.dtype.is_complex:
+    multiplier = math_ops.conj(multiplier)
+
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=identity_operator.is_self_adjoint,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_diag.LinearOperatorDiag)
+def _adjoint_diag(diag_operator):
+  diag = diag_operator.diag
+  if diag.dtype.is_complex:
+    diag = math_ops.conj(diag)
+
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _adjoint_block_diag(block_diag_operator):
+    # We take the adjoint of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.adjoint() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _adjoint_kronecker(kronecker_operator):
+    # Adjoint of a Kronecker product is the Kronecker product
+    # of adjoints.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.adjoint() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _adjoint_circulant(circulant_operator):
+  spectrum = circulant_operator.spectrum
+  if spectrum.dtype.is_complex:
+    spectrum = math_ops.conj(spectrum)
+
+  # Conjugating the spectrum is sufficient to get the adjoint.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index eebe741337d8eefae44e5206ce990edbf261bdd9..b9f8411c934aabfa30de2d684d5afcb354401509 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import adjoint_registrations as _adjoint_registrations
 from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
 from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 2259eaa65cd1a857e369ee8673165c76c882df7e..fec2b2713e3709d9104204412a8c52fd062e9336 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
@@ -329,3 +330,189 @@ def matrix_exponential(input, name=None):  # pylint: disable=redefined-builtin
           result,
           array_ops.concat((batch_shape, array_ops.shape(result)[-2:]), axis=0))
     return array_ops.reshape(result, batch_shape.concatenate(result.shape[-2:]))
+
+
+@tf_export('linalg.tridiagonal_solve')
+def tridiagonal_solve(diagonals,
+                      rhs,
+                      diagonals_format='compact',
+                      transpose_rhs=False,
+                      conjugate_rhs=False,
+                      name=None):
+  r"""Solves tridiagonal systems of equations.
+
+  Solution is computed via Gaussian elemination with partial pivoting.
+
+  The input can be supplied in various formats: `matrix`, `tuple` and `compact`,
+  specified by the `diagonals_format` arg.
+
+  In `matrix` format, `diagonals` must be a tensor of shape `[..., M, M]`, with
+  two inner-most dimensions representing the square tridiagonal matrices.
+  Elements outside of the three diagonals will be ignored.
+
+  In `sequence` format, `diagonals` are supplied as a tuple or list of three
+  tensors of shapes `[..., N]`, `[..., M]`, `[..., N]` representing
+  superdiagonals, diagonals, and subdiagonals, respectively. `N` can be either
+  `M-1` or `M`; in the latter case, the last element of superdiagonal and the
+  first element of subdiagonal will be ignored.
+
+  In `compact` format the three diagonals are brought together into one tensor
+  of shape `[..., 3, M]`, with last two dimensions containing superdiagonals,
+  diagonals, and subdiagonals, in order. Similarly to `sequence` format,
+  elements `diagonals[..., 0, M-1]` and `diagonals[..., 2, 0]` are ignored.
+
+  The `compact` format is recommended as the one with best performance. In case
+  you need to cast a tensor into a compact format manually, use `tf.gather_nd`.
+  An example for a tensor of shape [m, m]:
+
+  ```python
+  rhs = tf.constant([...])
+  matrix = tf.constant([[...]])
+  m = matrix.shape[0]
+  dummy_idx = [0, 0]  # An arbitrary element to use as a dummy
+  indices = [[[i, i + 1] for i in range(m - 1)] + [dummy_idx],  # Superdiagonal
+           [[i, i] for i in range(m)],                          # Diagonal
+           [dummy_idx] + [[i + 1, i] for i in range(m - 1)]]    # Subdiagonal
+  diagonals=tf.gather_nd(matrix, indices)
+  x = tf.linalg.tridiagonal_solve(diagonals, rhs)
+  ```
+
+  Regardless of the `diagonals_format`, `rhs` is a tensor of shape `[..., M]` or
+  `[..., M, K]`. The latter allows to simultaneously solve K systems with the
+  same left-hand sides and K different right-hand sides. If `transpose_rhs`
+  is set to `True` the expected shape is `[..., M]` or `[..., K, M]`.
+
+  The batch dimensions, denoted as `...`, must be the same in `diagonals` and
+  `rhs`.
+
+  The output is a tensor of the same shape as `rhs`: either `[..., M]` or
+  `[..., M, K]`.
+
+  Args:
+    diagonals: A `Tensor` or tuple of `Tensor`s describing left-hand sides. The
+      shape depends of `diagonals_format`, see description above. Must be
+      `float32`, `float64`, `complex64`, or `complex128`.
+    rhs: A `Tensor` of shape [..., M] or [..., M, K] and with the same dtype as
+      `diagonals`.
+    diagonals_format: one of `matrix`, `sequence`, or `compact`. Default is
+      `compact`.
+    transpose_rhs: If `True`, `rhs` is transposed before solving (has no effect
+      if the shape of rhs is [..., M]).
+    conjugate_rhs: If `True`, `rhs` is conjugated before solving.
+    name:  A name to give this `Op` (optional).
+
+  Returns:
+    A `Tensor` of shape [..., M] or [..., M, K] containing the solutions.
+
+  Raises:
+    ValueError: An unsupported type is provided as input, or when the input
+    tensors have incorrect shapes.
+
+  """
+  if diagonals_format == 'compact':
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'sequence':
+    if not isinstance(diagonals, (tuple, list)) or len(diagonals) != 3:
+      raise ValueError('Expected diagonals to be a sequence of length 3.')
+
+    superdiag, maindiag, subdiag = diagonals
+    if (not subdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1]) or
+        not superdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1])):
+      raise ValueError(
+          'Tensors representing the three diagonals must have the same shape,'
+          'except for the last dimension, got {}, {}, {}'.format(
+              subdiag.shape, maindiag.shape, superdiag.shape))
+
+    m = tensor_shape.dimension_value(maindiag.shape[-1])
+
+    def pad_if_necessary(t, name, last_dim_padding):
+      n = tensor_shape.dimension_value(t.shape[-1])
+      if not n or n == m:
+        return t
+      if n == m - 1:
+        paddings = (
+            [[0, 0] for _ in range(len(t.shape) - 1)] + [last_dim_padding])
+        return array_ops.pad(t, paddings)
+      raise ValueError('Expected {} to be have length {} or {}, got {}.'.format(
+          name, m, m - 1, n))
+
+    subdiag = pad_if_necessary(subdiag, 'subdiagonal', [1, 0])
+    superdiag = pad_if_necessary(superdiag, 'superdiagonal', [0, 1])
+
+    diagonals = array_ops.stack((superdiag, maindiag, subdiag), axis=-2)
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'matrix':
+    m1 = tensor_shape.dimension_value(diagonals.shape[-1])
+    m2 = tensor_shape.dimension_value(diagonals.shape[-2])
+    if m1 and m2 and m1 != m2:
+      raise ValueError(
+          'Expected last two dimensions of diagonals to be same, got {} and {}'
+          .format(m1, m2))
+    m = m1 or m2
+    if not m:
+      raise ValueError('The size of the matrix needs to be known for '
+                       'diagonals_format="matrix"')
+
+    # Extract diagonals; use input[..., 0, 0] as "dummy" m-th elements of sub-
+    # and superdiagonal.
+    # gather_nd slices into first indices, whereas we need to slice into the
+    # last two, so transposing back and forth is necessary.
+    dummy_idx = [0, 0]
+    indices = ([[[1, 0], [0, 0], dummy_idx]] + [
+        [[i + 1, i], [i, i], [i - 1, i]] for i in range(1, m - 1)
+    ] + [[dummy_idx, [m - 1, m - 1], [m - 2, m - 1]]])
+    diagonals = array_ops.transpose(
+        array_ops.gather_nd(array_ops.transpose(diagonals), indices))
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  raise ValueError('Unrecognized diagonals_format: {}'.format(diagonals_format))
+
+
+def _tridiagonal_solve_compact_format(diagonals,
+                                      rhs,
+                                      transpose_rhs=False,
+                                      conjugate_rhs=False,
+                                      name=None):
+  """Helper function used after the input has been cast to compact form."""
+  diags_rank, rhs_rank = len(diagonals.shape), len(rhs.shape)
+
+  if diags_rank < 2:
+    raise ValueError(
+        'Expected diagonals to have rank at least 2, got {}'.format(diags_rank))
+  if rhs_rank != diags_rank and rhs_rank != diags_rank - 1:
+    raise ValueError('Expected the rank of rhs to be {} or {}, got {}'.format(
+        diags_rank - 1, diags_rank, rhs_rank))
+  if diagonals.shape[-2] != 3:
+    raise ValueError('Expected 3 diagonals got {}'.format(diagonals.shape[-2]))
+  if not diagonals.shape[:-2].is_compatible_with(rhs.shape[:diags_rank - 2]):
+    raise ValueError('Batch shapes {} and {} are incompatible'.format(
+        diagonals.shape[:-2], rhs.shape[:diags_rank - 2]))
+
+  def check_num_lhs_matches_num_rhs():
+    if diagonals.shape[-1] != rhs.shape[-2]:
+      raise ValueError('Expected number of left-hand sided and right-hand '
+                       'sides to be equal, got {} and {}'.format(
+                           diagonals.shape[-1], rhs.shape[-2]))
+
+  if rhs_rank == diags_rank - 1:
+    # Rhs provided as a vector, ignoring transpose_rhs
+    if conjugate_rhs:
+      rhs = math_ops.conj(rhs)
+    rhs = array_ops.expand_dims(rhs, -1)
+    check_num_lhs_matches_num_rhs()
+    return array_ops.squeeze(
+        linalg_ops.tridiagonal_solve(diagonals, rhs, name), -1)
+
+  if transpose_rhs:
+    rhs = array_ops.matrix_transpose(rhs, conjugate=conjugate_rhs)
+  elif conjugate_rhs:
+    rhs = math_ops.conj(rhs)
+
+  check_num_lhs_matches_num_rhs()
+  result = linalg_ops.tridiagonal_solve(diagonals, rhs, name)
+  return array_ops.matrix_transpose(result) if transpose_rhs else result
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 4c99e86dc59a8c39abb57494ae84bcfdc13faa1b..8fa9f63e043a59da5b3ea425883cb954a065c1ee 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -847,6 +847,26 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def adjoint(self, name="adjoint"):
+    """Returns the adjoint of the current `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return `A*`.
+    Note that calling `self.adjoint()` and `self.H` are equivalent.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the adjoint of this `LinearOperator`.
+    """
+    if self.is_self_adjoint is True:  # pylint: disable=g-bool-id-comparison
+      return self
+    with self._name_scope(name):
+      return linear_operator_algebra.adjoint(self)
+
+  # self.H is equivalent to self.adjoint().
+  H = property(adjoint, None)
+
   def inverse(self, name="inverse"):
     """Returns the Inverse of this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
index 858e224b9adda57b4d472ae2f61b2b6cda74c243..7ee4752d264b73689c172240f10c89e1a52c5b68 100644
--- a/tensorflow/python/ops/linalg/linear_operator_adjoint.py
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -83,7 +84,7 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     r"""Initialize a `LinearOperatorAdjoint`.
 
     `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
-    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+    and `matmul` methods  effectively flip the `adjoint` argument.  E.g.
 
     ```
     A = MyLinearOperator(...)
@@ -175,15 +176,24 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.assert_self_adjoint()
 
   def _shape(self):
-    return self.operator.shape
+    # Rotate last dimension
+    shape = self.operator.shape
+    return shape[:-2].concatenate([shape[-1], shape[-2]])
 
   def _shape_tensor(self):
-    return self.operator.shape_tensor()
+    # Rotate last dimension
+    shape = self.operator.shape_tensor()
+    return array_ops.concat([
+        shape[:-2], [shape[-1], shape[-2]]], axis=-1)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return self.operator.matmul(
         x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _matvec(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matvec(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _determinant(self):
     if self.is_self_adjoint:
       return self.operator.determinant()
@@ -201,7 +211,14 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.solve(
         rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _solvevec(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solvevec(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _to_dense(self):
     if self.is_self_adjoint:
       return self.operator.to_dense()
     return linalg.adjoint(self.operator.to_dense())
+
+  def _add_to_tensor(self, x):
+    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index c1513fdb38c6005c89f6994141797f7df5c65350..0d1eab4b735d64ad988507d6c52fc52202389fd0 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util import tf_inspect
 
 
+_ADJOINTS = {}
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
 _INVERSES = {}
@@ -46,6 +47,11 @@ def _registered_function(type_list, registry):
   return registry.get(tuple(r[1] for r in registered_combination), None)
 
 
+def _registered_adjoint(type_a):
+  """Get the Adjoint function registered for class a."""
+  return _registered_function([type_a], _ADJOINTS)
+
+
 def _registered_cholesky(type_a):
   """Get the Cholesky function registered for class a."""
   return _registered_function([type_a], _CHOLESKY_DECOMPS)
@@ -61,6 +67,29 @@ def _registered_inverse(type_a):
   return _registered_function([type_a], _INVERSES)
 
 
+def adjoint(lin_op_a, name=None):
+  """Get the adjoint associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to take the adjoint of.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the adjoint of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Adjoint method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  adjoint_fn = _registered_adjoint(type(lin_op_a))
+  if adjoint_fn is None:
+    raise ValueError("No adjoint registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Adjoint"):
+    return adjoint_fn(lin_op_a)
+
+
 def cholesky(lin_op_a, name=None):
   """Get the Cholesky factor associated to lin_op_a.
 
@@ -132,6 +161,48 @@ def inverse(lin_op_a, name=None):
     return inverse_fn(lin_op_a)
 
 
+class RegisterAdjoint(object):
+  """Decorator to register an Adjoint implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterAdjoint(lin_op.LinearOperatorIdentity)
+  def _adjoint_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, adjoint_fn):
+    """Perform the Adjoint registration.
+
+    Args:
+      adjoint_fn: The function to use for the Adjoint.
+
+    Returns:
+      adjoint_fn
+
+    Raises:
+      TypeError: if adjoint_fn is not a callable.
+      ValueError: if a Adjoint function has already been registered for
+        the given argument classes.
+    """
+    if not callable(adjoint_fn):
+      raise TypeError(
+          "adjoint_fn must be callable, received: {}".format(adjoint_fn))
+    if self._key in _ADJOINTS:
+      raise ValueError("Adjoint({}) has already been registered to: {}".format(
+          self._key[0].__name__, _ADJOINTS[self._key]))
+    _ADJOINTS[self._key] = adjoint_fn
+    return adjoint_fn
+
+
 class RegisterCholesky(object):
   """Decorator to register a Cholesky implementation function.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index a957c84dc1ca6f26927ae3c39c6cb49caa2b19be..854c4deaeb874e547c256f9029e6559b42831b08 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -186,6 +187,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
           "{} skipped because it was added to self._tests_to_skip.".format(
               test_name))
 
+  @test_util.run_deprecated_v1
   def test_to_dense(self):
     self._skip_if_tests_to_skip_contains("to_dense")
     for use_placeholder in self._use_placeholder_options:
@@ -201,6 +203,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
             op_dense_v, mat_v = sess.run([op_dense, mat])
             self.assertAC(op_dense_v, mat_v)
 
+  @test_util.run_deprecated_v1
   def test_det(self):
     self._skip_if_tests_to_skip_contains("det")
     for use_placeholder in self._use_placeholder_options:
@@ -217,6 +220,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 [op_det, linalg_ops.matrix_determinant(mat)])
             self.assertAC(op_det_v, mat_det_v)
 
+  @test_util.run_deprecated_v1
   def test_log_abs_det(self):
     self._skip_if_tests_to_skip_contains("log_abs_det")
     for use_placeholder in self._use_placeholder_options:
@@ -270,14 +274,35 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                     [op_matmul, mat_matmul])
                 self.assertAC(op_matmul_v, mat_matmul_v)
 
+  @test_util.run_deprecated_v1
   def test_matmul(self):
     self._skip_if_tests_to_skip_contains("matmul")
     self._test_matmul(with_batch=True)
 
+  @test_util.run_deprecated_v1
   def test_matmul_with_broadcast(self):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  @test_util.run_deprecated_v1
+  def test_adjoint(self):
+    self._skip_if_tests_to_skip_contains("adjoint")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_adjoint = operator.adjoint().to_dense()
+            op_adjoint_h = operator.H.to_dense()
+            mat_adjoint = linalg.adjoint(mat)
+            op_adjoint_v, op_adjoint_h_v, mat_adjoint_v = sess.run(
+                [op_adjoint, op_adjoint_h, mat_adjoint])
+            self.assertAC(mat_adjoint_v, op_adjoint_v)
+            self.assertAC(mat_adjoint_v, op_adjoint_h_v)
+
+  @test_util.run_deprecated_v1
   def test_cholesky(self):
     self._skip_if_tests_to_skip_contains("cholesky")
     for use_placeholder in self._use_placeholder_options:
@@ -293,6 +318,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
             op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
             self.assertAC(mat_chol_v, op_chol_v)
 
+  @test_util.run_deprecated_v1
   def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
@@ -328,10 +354,12 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 op_solve_v, mat_solve_v = sess.run([op_solve, mat_solve])
                 self.assertAC(op_solve_v, mat_solve_v)
 
+  @test_util.run_deprecated_v1
   def test_solve(self):
     self._skip_if_tests_to_skip_contains("solve")
     self._test_solve(with_batch=True)
 
+  @test_util.run_deprecated_v1
   def test_solve_with_broadcast(self):
     self._skip_if_tests_to_skip_contains("solve_with_broadcast")
     self._test_solve(with_batch=False)
@@ -348,10 +376,12 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 operator.inverse().to_dense(), linalg.inv(mat)])
             self.assertAC(op_inverse_v, mat_inverse_v)
 
+  @test_util.run_deprecated_v1
   def test_inverse(self):
     self._skip_if_tests_to_skip_contains("inverse")
     self._test_inverse()
 
+  @test_util.run_deprecated_v1
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -368,6 +398,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
             op_trace_v, mat_trace_v = sess.run([op_trace, mat_trace])
             self.assertAC(op_trace_v, mat_trace_v)
 
+  @test_util.run_deprecated_v1
   def test_add_to_tensor(self):
     self._skip_if_tests_to_skip_contains("add_to_tensor")
     for use_placeholder in self._use_placeholder_options:
@@ -386,6 +417,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
             self.assertAC(op_plus_2mat_v, 3 * mat_v)
 
+  @test_util.run_deprecated_v1
   def test_diag_part(self):
     self._skip_if_tests_to_skip_contains("diag_part")
     for use_placeholder in self._use_placeholder_options:
@@ -557,7 +589,7 @@ def random_positive_definite_matrix(shape, dtype, force_well_conditioned=False):
   if not tensor_util.is_tensor(shape):
     shape = tensor_shape.TensorShape(shape)
     # Matrix must be square.
-    shape[-1].assert_is_compatible_with(shape[-2])
+    shape.dims[-1].assert_is_compatible_with(shape.dims[-2])
 
   with ops.name_scope("random_positive_definite_matrix"):
     tril = random_tril_matrix(
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 290d1fc5030023c426d45116f57b263576833fc3..f6c9d2c6a6db092b2074086337758321e57f2e1c 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -412,10 +412,16 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
     # only defined up a (k-dimensional) subspace. In practice, this can
     # lead to numerical instability when singular values are close but not
     # exactly equal.
+    # Also, even with distinct singular values, the diagonal of f can have Inf
+    # values before setting to zero, which hurt when differentiating through
+    # this op. To avoid that, we add eye to the matrix before taking
+    # the reciprocal.
+    s_shape = array_ops.shape(s)
+    eye = _linalg.eye(s_shape[-1], batch_shape=s_shape[:-1], dtype=s.dtype)
     f = array_ops.matrix_set_diag(
         math_ops.reciprocal(
-            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
-        array_ops.zeros_like(s))
+            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1) +
+            eye), array_ops.zeros_like(s))
     s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))
 
     v1 = v[..., :, :m]
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 1a9e7112b45cacb711ac176b92cb3bef0dc72f00..b67e18979465001815a74abe9a6f46ad1e0f31c8 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -24,9 +24,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
@@ -335,7 +335,7 @@ def self_adjoint_eigvals(tensor, name=None):
   """Computes the eigenvalues of one or more self-adjoint matrices.
 
   Note: If your program backpropagates through this function, you should replace
-  it with a call to tf.linalg.eigvalsh (possibly ignoring the second output) to
+  it with a call to tf.linalg.eigh (possibly ignoring the second output) to
   avoid computing the eigen decomposition twice. This is because the
   eigenvectors are used to compute the gradient w.r.t. the eigenvalues. See
   _SelfAdjointEigV2Grad in linalg_grad.py.
@@ -437,13 +437,13 @@ def norm_v2(tensor,
 
   Args:
     tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
-    ord: Order of the norm. Supported values are 'fro', 'euclidean',
+    ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`,
       `1`, `2`, `np.inf` and any positive real number yielding the corresponding
-      p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if
+      p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
       `tensor` is a matrix and equivalent to 2-norm for vectors.
       Some restrictions apply:
-        a) The Frobenius norm `fro` is not defined for vectors,
-        b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
+        a) The Frobenius norm `'fro'` is not defined for vectors,
+        b) If axis is a 2-tuple (matrix norm), only `'euclidean'`, '`fro'`, `1`,
            `2`, `np.inf` are supported.
       See the description of `axis` on how to compute norms for a batch of
       vectors or matrices stored in a tensor.
@@ -595,14 +595,14 @@ def norm(tensor,
     if ord in ['fro', 'euclidean', 2, 2.0]:
       if is_matrix_norm and ord in [2, 2.0]:
         rank = array_ops.rank(tensor)
-        positive_axis = functional_ops.map_fn(
+        positive_axis = map_fn.map_fn(
             lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank),
             ops.convert_to_tensor(axis))
         axes = math_ops.range(rank)
         perm_before = array_ops.concat(
             [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis],
             axis=0)
-        perm_after = functional_ops.map_fn(
+        perm_after = map_fn.map_fn(
             lambda i: math_ops.cast(
                 array_ops.squeeze(
                     array_ops.where(math_ops.equal(perm_before, i))),
@@ -619,6 +619,8 @@ def norm(tensor,
         result = math_ops.sqrt(
             math_ops.reduce_sum(
                 tensor * math_ops.conj(tensor), axis, keepdims=True))
+        # TODO(rmlarsen): Replace with the following, once gradients are defined
+        # result = math_ops.reduce_euclidean_norm(tensor, axis, keepdims=True)
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index df928ea85df895ffd3439f5b8a2a430f8c517cd5..ee01ff7cf687c6ebfb2e7069534f52047fc0d9f7 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -71,13 +71,77 @@ def tensor_list_from_tensor(tensor, element_shape, name=None):
       name=name)
 
 
+def tensor_list_get_item(input_handle, index, element_dtype, element_shape=None,
+                         name=None):
+  return gen_list_ops.tensor_list_get_item(
+      input_handle=input_handle,
+      index=index,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_pop_back(input_handle, element_dtype, name=None):
+  return gen_list_ops.tensor_list_pop_back(
+      input_handle=input_handle,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_gather(input_handle,
+                       indices,
+                       element_dtype,
+                       element_shape=None,
+                       name=None):
+  return gen_list_ops.tensor_list_gather(
+      input_handle=input_handle,
+      indices=indices,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_scatter(tensor,
+                        indices,
+                        element_shape=None,
+                        input_handle=None,
+                        name=None):
+  if input_handle is not None:
+    return gen_list_ops.tensor_list_scatter_into_existing_list(
+        input_handle=input_handle, tensor=tensor, indices=indices, name=name)
+  else:
+    return gen_list_ops.tensor_list_scatter_v2(
+        tensor=tensor,
+        indices=indices,
+        element_shape=_build_element_shape(element_shape),
+        num_elements=-1,
+        name=name)
+
+
+def tensor_list_stack(input_handle,
+                      element_dtype,
+                      num_elements=-1,
+                      element_shape=None,
+                      name=None):
+  return gen_list_ops.tensor_list_stack(
+      input_handle=input_handle,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      num_elements=num_elements,
+      name=name)
+
+
 def tensor_list_concat(input_handle, element_dtype, element_shape=None,
                        name=None):
   # Ignore the lengths output of TensorListConcat. It is only used during
   # gradient computation.
-  return gen_list_ops.tensor_list_concat(
-      input_handle=input_handle, element_dtype=element_dtype,
-      element_shape=element_shape, name=name)[0]
+  return gen_list_ops.tensor_list_concat_v2(
+      input_handle=input_handle,
+      element_dtype=element_dtype,
+      element_shape=_build_element_shape(element_shape),
+      leading_dims=ops.convert_to_tensor([], dtype=dtypes.int64),
+      name=name)[0]
 
 
 def tensor_list_split(tensor, element_shape, lengths, name=None):
@@ -110,7 +174,9 @@ def tensor_list_set_item(input_handle,
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
-      dresult, element_dtype=op.get_attr("element_dtype"))
+      dresult,
+      element_shape=array_ops.shape(op.inputs[1]),
+      element_dtype=op.get_attr("element_dtype"))
 
 
 @ops.RegisterGradient("TensorListPopBack")
@@ -120,47 +186,59 @@ def _PopBackGrad(op, dlist, delement):
         element_dtype=delement.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_push_back(dlist, delement)
+  return gen_list_ops.tensor_list_push_back(dlist, delement), None
 
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:]), None
 
 
 @ops.RegisterGradient("TensorListConcat")
+@ops.RegisterGradient("TensorListConcatV2")
 def _TensorListConcatGrad(op, dtensor, unused_dlengths):
-  # TODO(srbs): We lose the element_shape information in tensor_list_concat.
-  # Consider providing that as an output of TensorListConcat?
-  if dtensor.shape.rank is None:
-    element_shape = None
-  else:
-    element_shape = [None] + dtensor.shape.as_list()[1:]
-  return tensor_list_split(
+  """Gradient function for TensorListConcat."""
+  dlist = tensor_list_split(
       dtensor,
-      element_shape=_build_element_shape(element_shape),
+      element_shape=gen_list_ops.tensor_list_element_shape(
+          op.inputs[0], shape_type=dtypes.int32),
       lengths=op.outputs[1])
+  if op.type == "TensorListConcatV2":
+    return dlist, None, None
+  else:
+    return dlist
 
 
 @ops.RegisterGradient("TensorListSplit")
 def _TensorListSplitGrad(op, dlist):
-  return tensor_list_concat(dlist, element_dtype=op.inputs[0].dtype), None, None
+  tensor, _, lengths = op.inputs
+  element_shape = array_ops.slice(array_ops.shape(tensor), [1], [-1])
+  element_shape = array_ops.concat([[-1], element_shape], axis=0)
+  return gen_list_ops.tensor_list_concat_v2(
+      dlist,
+      element_shape=element_shape,
+      leading_dims=lengths,
+      element_dtype=op.inputs[0].dtype)[0], None, None
 
 
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
-    num_elements = op.inputs[0].shape.dims[0].value
+  t = op.inputs[0]
+  if t.shape.dims and t.shape.dims[0].value is not None:
+    num_elements = t.shape.dims[0].value
   else:
     num_elements = None
   if dlist is None:
     dlist = empty_tensor_list(
-        element_dtype=op.inputs[0].dtype,
+        element_dtype=t.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
   tensor_grad = gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+      dlist,
+      element_shape=array_ops.slice(array_ops.shape(t), [1], [-1]),
+      element_dtype=t.dtype,
+      num_elements=num_elements)
   shape_grad = None
   return tensor_grad, shape_grad
 
@@ -177,17 +255,22 @@ def _TensorListGetItemGrad(op, ditem):
       index=op.inputs[1],
       item=ditem)
   index_grad = None
-  return list_grad, index_grad
+  element_shape_grad = None
+  return list_grad, index_grad, element_shape_grad
 
 
 @ops.RegisterGradient("TensorListSetItem")
 def _TensorListSetItemGrad(op, dlist):
+  """Gradient function for TensorListSetItem."""
   _, index, item = op.inputs
   list_grad = gen_list_ops.tensor_list_set_item(
       dlist, index=index, item=array_ops.zeros_like(item))
   index_grad = None
-  element_grad = gen_list_ops.tensor_list_get_item(
-      dlist, index, element_dtype=item.dtype)
+  element_grad = tensor_list_get_item(
+      dlist,
+      index,
+      element_shape=array_ops.shape(item),
+      element_dtype=item.dtype)
   return list_grad, index_grad, element_grad
 
 
@@ -200,23 +283,46 @@ def _TensorListResizeGrad(op, dlist):
 
 @ops.RegisterGradient("TensorListGather")
 def _TensorListGatherGrad(op, dtensor):
-  input_list, indices = op.inputs
-  dlist = gen_list_ops.tensor_list_scatter(
-      tensor=dtensor,
-      indices=indices,
-      element_shape=ops.convert_to_tensor(-1, dtype=dtypes.int32))
-  # TensorListScatter returns a list with size `max(indices) + 1`
-  # so we manually resize it to match the size of the input list.
-  input_list_size = gen_list_ops.tensor_list_length(input_list)
-  dlist = gen_list_ops.tensor_list_resize(dlist, input_list_size)
-  return dlist, None
+  """Gradient function for TensorListGather."""
+  input_list, indices, _ = op.inputs
+  element_shape = gen_list_ops.tensor_list_element_shape(
+      input_list, shape_type=dtypes.int32)
+  num_elements = gen_list_ops.tensor_list_length(input_list)
+  dlist = tensor_list_reserve(element_shape, num_elements, dtensor.dtype)
+  dlist = tensor_list_scatter(
+      tensor=dtensor, indices=indices, input_handle=dlist)
+  return dlist, None, None
 
 
 @ops.RegisterGradient("TensorListScatter")
+@ops.RegisterGradient("TensorListScatterV2")
 def _TensorListScatterGrad(op, dlist):
-  t, indices, _ = op.inputs
-  return gen_list_ops.tensor_list_gather(
-      dlist, indices, element_dtype=t.dtype), None, None
+  """Gradient function for TensorListScatter."""
+  tensor = op.inputs[0]
+  indices = op.inputs[1]
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  if op.type == "TensorListScatterV2":
+    return dtensor, None, None, None
+  else:
+    return dtensor, None, None
+
+
+@ops.RegisterGradient("TensorListScatterIntoExistingList")
+def _TensorListScatterIntoExistingListGrad(op, dlist):
+  """Gradient function for TensorListScatterIntoExistingList."""
+  _, tensor, indices = op.inputs
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  zeros = array_ops.zeros_like(tensor)
+  dlist = tensor_list_scatter(zeros, indices, indices, input_handle=dlist)
+  return dlist, dtensor, None
 
 
 def _build_element_shape(shape):
@@ -253,4 +359,13 @@ def _build_element_shape(shape):
   if not shape:
     return ops.convert_to_tensor(shape, dtype=dtypes.int32)
   # Shape is a sequence of dimensions. Convert None dims to -1.
-  return [d if d is not None else -1 for d in shape]
+  def convert(val):
+    if val is None:
+      return -1
+    if isinstance(val, ops.Tensor):
+      return val
+    if isinstance(val, tensor_shape.Dimension):
+      return val.value if val.value is not None else -1
+    return val
+
+  return [convert(d) for d in shape]
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 3cb16eb81e8c0796e199edb9c97acd1c269c832b..f05fbf4dd5665596b2a03d5b580b13877d14218b 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -25,6 +25,7 @@ import sys
 
 import six
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -40,6 +41,14 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
+# Register printing to the cell output if we are in a Colab or Jupyter Notebook.
+try:
+  get_ipython()  # Exists in an ipython env like Jupyter or Colab
+  pywrap_tensorflow.TFE_Py_EnableInteractivePythonLogging()
+except NameError:
+  pass
+
+
 # The python wrapper for Assert is in control_flow_ops, as the Assert
 # call relies on certain conditionals for its dependencies.  Use
 # control_flow_ops.Assert.
@@ -193,9 +202,8 @@ def print_v2(*inputs, **kwargs):
     (This prints "tensors: [0 1 2 ... 7 8 9] {2: [0 2 4 ... 14 16 18]}" to
     sys.stdout)
 
-  Note: This op is only partially compatible with Jupyter notebooks and colabs.
-    Because it prints to the C++ standard out / standard error, this will go
-    in the notebook kernel's console output, not in the notebook cell output.
+  Note: In Jupyter notebooks and colabs, this operator prints to the notebook
+    cell outputs. It will not write to the notebook kernel's console logs.
 
   Args:
     *inputs: Positional arguments that are the inputs to print. Inputs in the
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 6a7057fafd4e64477874b14b2078f49b2cc96c7c..c16cb79aeb7b8997fc08aa6c365381884bc364e3 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -38,9 +38,10 @@ from tensorflow.python.ops import string_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
+from tensorflow.python.training.saver import BaseSaverBuilder
 # pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -64,7 +65,7 @@ def initialize_all_tables(name="init_all_tables"):
 @tf_export(v1=["initializers.tables_initializer", "tables_initializer"])
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
-  
+
   See the [Low Level Intro](https://www.tensorflow.org/guide/low_level_intro#feature_columns)
   guide, for an example of usage.
 
@@ -101,7 +102,7 @@ def _check_table_dtypes(table, key_dtype, value_dtype):
                     (table.value_dtype, value_dtype))
 
 
-class LookupInterface(checkpointable.TrackableResource):
+class LookupInterface(trackable.TrackableResource):
   """Represent a lookup table that persists across different steps."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -115,7 +116,7 @@ class LookupInterface(checkpointable.TrackableResource):
     self._value_dtype = dtypes.as_dtype(value_dtype)
     super(LookupInterface, self).__init__()
 
-  def create_resource(self):
+  def _create_resource(self):
     raise NotImplementedError
 
   @property
@@ -164,24 +165,16 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    if isinstance(initializer, checkpointable_base.Checkpointable):
-      self._initializer = self._track_checkpointable(
+    if isinstance(initializer, trackable_base.Trackable):
+      self._initializer = self._track_trackable(
           initializer, "_initializer")
-    self._resource_handle = self.create_resource()
-    self._init_op = self.initialize()
+    with ops.init_scope():
+      self._resource_handle = self._create_resource()
+      self._init_op = self._initialize()
 
-  def initialize(self):
+  def _initialize(self):
     return self._initializer.initialize(self)
 
-  @property
-  def initializer(self):
-    return self._init_op
-
-  @property
-  @deprecated("2018-12-15", "Use `initializer` instead.")
-  def init(self):
-    return self.initializer
-
   @property
   def default_value(self):
     """The default value of the table."""
@@ -196,10 +189,8 @@ class InitializableLookupTableBase(LookupInterface):
     Returns:
       A scalar tensor containing the number of elements in this table.
     """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as scope:
-      return gen_lookup_ops.lookup_table_size_v2(
-          self.resource_handle, name=scope)
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -227,9 +218,9 @@ class InitializableLookupTableBase(LookupInterface):
 
     with ops.name_scope(
         name, "%s_Lookup" % self.name,
-        (self.resource_handle, key_tensor, self._default_value)) as scope:
+        (self.resource_handle, key_tensor, self._default_value)):
       values = gen_lookup_ops.lookup_table_find_v2(
-          self.resource_handle, key_tensor, self._default_value, name=scope)
+          self.resource_handle, key_tensor, self._default_value)
 
     values.set_shape(key_tensor.get_shape())
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -238,13 +229,21 @@ class InitializableLookupTableBase(LookupInterface):
       return values
 
 
-class HashTable(InitializableLookupTableBase):
+class InitializableLookupTableBaseV1(InitializableLookupTableBase):
+
+  @property
+  def initializer(self):
+    return self._init_op
+
+
+@tf_export("lookup.StaticHashTable", v1=[])
+class StaticHashTable(InitializableLookupTableBase):
   """A generic hash table implementation.
 
   Example usage:
 
   ```python
-  table = tf.HashTable(
+  table = tf.lookup.StaticHashTable(
       tf.KeyValueTensorInitializer(keys, values), -1)
   out = table.lookup(input_tensor)
   table.init.run()
@@ -252,7 +251,7 @@ class HashTable(InitializableLookupTableBase):
   ```
   """
 
-  def __init__(self, initializer, default_value, shared_name=None, name=None):
+  def __init__(self, initializer, default_value, name=None):
     """Creates a non-initialized `HashTable` object.
 
     Creates a table, the type of its keys and values are specified by the
@@ -264,8 +263,6 @@ class HashTable(InitializableLookupTableBase):
       initializer: The table initializer to use. See `HashTable` kernel for
         supported key and value types.
       default_value: The value to use if a key is missing in the table.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
       name: A name for the operation (optional).
 
     Returns:
@@ -273,21 +270,22 @@ class HashTable(InitializableLookupTableBase):
     """
     self._initializer = initializer
     self._default_value = default_value
-    self._shared_name = shared_name
-    self._name = name
-    self._table_name = ""
-    super(HashTable, self).__init__(default_value, initializer)
+    self._shared_name = self._initializer._shared_name  # pylint: disable=protected-access
+    self._name = name or "hash_table"
+    self._table_name = None
+    super(StaticHashTable, self).__init__(default_value, initializer)
     self._value_shape = self._default_value.get_shape()
 
-  def create_resource(self):
-    with ops.name_scope(self._name, "hash_table",
-                        (self._initializer, self._default_value)) as scope:
-      table_ref = gen_lookup_ops.hash_table_v2(
-          shared_name=self._shared_name,
-          key_dtype=self._initializer.key_dtype,
-          value_dtype=self._initializer.value_dtype,
-          name=scope)
-      self._table_name = scope.split("/")[-2]
+  def _create_resource(self):
+    table_ref = gen_lookup_ops.hash_table_v2(
+        shared_name=self._shared_name,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
     return table_ref
 
   @property
@@ -304,18 +302,32 @@ class HashTable(InitializableLookupTableBase):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_Export" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+    with ops.name_scope(name, "%s_Export" % self.name, [self.resource_handle]):
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
     return exported_keys, exported_values
 
 
-class TableInitializerBase(checkpointable_base.Checkpointable):
+@tf_export(v1=["lookup.StaticHashTable"])
+class StaticHashTableV1(StaticHashTable):
+
+  @property
+  def initializer(self):
+    return self._init_op
+
+
+# For backwards compatibility. This will be removed in TF 2.0.
+class HashTable(StaticHashTableV1):
+
+  @property
+  def init(self):
+    return self.initializer
+
+
+class TableInitializerBase(trackable_base.Trackable):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -349,10 +361,12 @@ class TableInitializerBase(checkpointable_base.Checkpointable):
     if context.executing_eagerly():
       # Ensure a unique name when eager execution is enabled to avoid spurious
       # sharing issues.
+      # TODO(rohanj): Use context.shared_name() instead.
       shared_name += str(ops.uid())
     return shared_name
 
 
+@tf_export("lookup.KeyValueTensorInitializer")
 class KeyValueTensorInitializer(TableInitializerBase):
   """Table initializers given `keys` and `values` tensors."""
 
@@ -366,11 +380,16 @@ class KeyValueTensorInitializer(TableInitializerBase):
       value_dtype: The `values` data type. Used when `values` is a python array.
       name: A name for the operation (optional).
     """
-    with ops.name_scope(name, "key_value_init", [keys, values]) as scope:
+    with ops.init_scope():
       self._keys = ops.convert_to_tensor(keys, dtype=key_dtype, name="keys")
       self._values = ops.convert_to_tensor(
           values, dtype=value_dtype, name="values")
-      self._name = scope
+    self._name = name if name is not None else "key_value_init"
+    if context.executing_eagerly():
+      # Ensure a unique name when eager execution is enabled to avoid spurious
+      # sharing issues.
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._name += str(ops.uid())
 
     super(KeyValueTensorInitializer, self).__init__(self._keys.dtype,
                                                     self._values.dtype)
@@ -390,19 +409,14 @@ class KeyValueTensorInitializer(TableInitializerBase):
     """
     _check_table_dtypes(table, self._keys.dtype, self._values.dtype)
     with ops.name_scope(
-        self._name, values=(table.resource_handle, self._keys,
-                            self._values)) as scope:
-      if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        scope += str(ops.uid())
+        self._name, values=(table.resource_handle, self._keys, self._values)):
       if fwd_compat.forward_compatible(2018, 9, 19):
         init_op = gen_lookup_ops.lookup_table_import_v2(
-            table.resource_handle, self._keys, self._values, name=scope)
+            table.resource_handle, self._keys, self._values)
       else:
         # To maintain forward compatibiltiy, use the old implementation.
-        init_op = gen_lookup_ops.initialize_table_v2(
-            table.resource_handle, self._keys, self._values, name=scope)
+        init_op = gen_lookup_ops.initialize_table_v2(table.resource_handle,
+                                                     self._keys, self._values)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
 
@@ -412,6 +426,7 @@ class TextFileIndex(object):
   LINE_NUMBER = -1
 
 
+@tf_export("lookup.TextFileInitializer")
 class TextFileInitializer(TableInitializerBase):
   """Table initializers from a text file.
 
@@ -446,7 +461,7 @@ class TextFileInitializer(TableInitializerBase):
   * `palmer -> 30`
 
   ```python
-  table = tf.lookup.HashTable(tf.lookup.TextFileInitializer(
+  table = tf.lookup.StaticHashTable(tf.lookup.TextFileInitializer(
       "test.txt", tf.string, 0, tf.int64, 1, delimiter=" "), -1)
   ...
   table.init.run()
@@ -459,7 +474,7 @@ class TextFileInitializer(TableInitializerBase):
   * `palmer 30 -> 2`
 
   ```python
-  table = tf.lookup.HashTable(tf.lookup.TextFileInitializer(
+  table = tf.lookup.StaticHashTable(tf.lookup.TextFileInitializer(
       "test.txt", tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
       tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER, delimiter=" "), -1)
   ...
@@ -544,8 +559,8 @@ class TextFileInitializer(TableInitializerBase):
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
-    self._filename = self._track_checkpointable(
-        checkpointable.TrackableAsset(filename),
+    self._filename = self._track_trackable(
+        trackable.TrackableAsset(filename),
         "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
@@ -564,21 +579,16 @@ class TextFileInitializer(TableInitializerBase):
       key and value data types.
     """
     _check_table_dtypes(table, self.key_dtype, self.value_dtype)
-    with ops.name_scope(self._name, "text_file_init",
-                        (table.resource_handle,)) as scope:
+    with ops.name_scope(self._name, "text_file_init", (table.resource_handle,)):
       filename = ops.convert_to_tensor(
           self._filename, dtypes.string, name="asset_filepath")
       init_op = gen_lookup_ops.initialize_table_from_text_file_v2(
-          table.resource_handle,
-          filename,
-          self._key_index,
-          self._value_index,
-          -1 if self._vocab_size is None else self._vocab_size,
-          self._delimiter,
-          name=scope)
+          table.resource_handle, filename, self._key_index, self._value_index,
+          -1 if self._vocab_size is None else self._vocab_size, self._delimiter)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
-    # If the filename tensor is anything other than a string constant (e.g., if
-    # it is a placeholder) then it does not make sense to track it as an asset.
+    # If the filename tensor is anything other than a string constant (e.g.,
+    # if it is a placeholder) then it does not make sense to track it as an
+    # asset.
     if not context.executing_eagerly() and constant_op.is_constant(filename):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
@@ -588,16 +598,14 @@ class TextFileInitializer(TableInitializerBase):
     if self._vocab_size:
       # Keep the shared_name:
       # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%d_%s_%s" % (self._filename_arg,
-                                                self._vocab_size,
-                                                self._key_index,
-                                                self._value_index)
+      shared_name = "hash_table_%s_%d_%s_%s" % (
+          self._filename_arg, self._vocab_size, self._key_index,
+          self._value_index)
     else:
       # Keep the shared_name
       # <table_type>_<filename>_<key_index>_<value_index>
       shared_name = "hash_table_%s_%s_%s" % (self._filename_arg,
-                                             self._key_index,
-                                             self._value_index)
+                                             self._key_index, self._value_index)
     return shared_name
 
 
@@ -782,7 +790,8 @@ class IdTableWithHashBuckets(LookupInterface):
   num_oov_buckets = 3
   input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"])
   table = tf.IdTableWithHashBuckets(
-      tf.HashTable(tf.TextFileIdTableInitializer(filename), default_value),
+      tf.StaticHashTable(tf.TextFileIdTableInitializer(filename),
+                         default_value),
       num_oov_buckets)
   out = table.lookup(input_tensor).
   table.init.run()
@@ -850,17 +859,20 @@ class IdTableWithHashBuckets(LookupInterface):
       raise TypeError(
           "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
     self._hasher_spec = hasher_spec
-    self._table_name = name.split("/")[-1]
+    if name:
+      self._table_name = name.split("/")[-1]
+    else:
+      self._table_name = None
     super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64)
 
-  def create_resource(self):
+  def _create_resource(self):
     if self._table is not None:
-      return self._table.create_resource()
+      return self._table._create_resource()  # pylint: disable=protected-access
     return None
 
-  def initialize(self):
+  def _initialize(self):
     if self._table is not None:
-      return self._table.initialize()
+      return self._table._initialize()  # pylint: disable=protected-access
     with ops.name_scope(None, "init"):
       return control_flow_ops.no_op()
 
@@ -888,9 +900,9 @@ class IdTableWithHashBuckets(LookupInterface):
 
   def size(self, name=None):
     """Compute the number of elements in this table."""
-    with ops.name_scope(name, "%s_Size" % self.name) as scope:
+    with ops.name_scope(name, "%s_Size" % self.name):
       if self._table:
-        tsize = self._table.size(scope)
+        tsize = self._table.size()
       else:
         tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
       return tsize + self._num_oov_buckets
@@ -930,13 +942,13 @@ class IdTableWithHashBuckets(LookupInterface):
     if isinstance(keys, sparse_tensor.SparseTensor):
       values = keys.values
     if self._table and (self._table.key_dtype.base_dtype == dtypes.int64):
-      values = math_ops.to_int64(values)
+      values = math_ops.cast(values, dtypes.int64)
 
     if self._num_oov_buckets == 0:
       ids = self._table.lookup(values, name=name)
     else:
       # TODO(yleon): Consider moving this functionality to its own kernel.
-      with ops.name_scope(name, "%s_Lookup" % self.name) as scope:
+      with ops.name_scope(name, "%s_Lookup" % self.name):
         str_to_hash_bucket = self._get_string_to_hash_bucket_fn(
             self._hasher_spec)
         buckets = str_to_hash_bucket(
@@ -947,7 +959,7 @@ class IdTableWithHashBuckets(LookupInterface):
           ids = self._table.lookup(values)
           buckets = math_ops.add(buckets, self._table.size())
           is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
-          ids = array_ops.where(is_id_non_default, ids, buckets, name=scope)
+          ids = array_ops.where(is_id_non_default, ids, buckets)
         else:
           ids = buckets
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -955,6 +967,191 @@ class IdTableWithHashBuckets(LookupInterface):
     return ids
 
 
+@tf_export("lookup.StaticVocabularyTable", v1=[])
+class StaticVocabularyTable(LookupInterface):
+  """String to Id table wrapper that assigns out-of-vocabulary keys to buckets.
+
+  For example, if an instance of `StaticVocabularyTable` is initialized with a
+  string-to-id initializer that maps:
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+
+  The `Vocabulary` object will performs the following mapping:
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+  * `<other term> -> bucket_id`, where bucket_id will be between `3` and
+  `3 + num_oov_buckets - 1`, calculated by:
+  `hash(<term>) % num_oov_buckets + vocab_size`
+
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`.
+
+  If `initializer` is None, only out-of-vocabulary buckets are used.
+
+  Example usage:
+
+  ```python
+  num_oov_buckets = 3
+  input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"])
+  table = tf.lookup.StaticVocabularyTable(
+      tf.TextFileIdTableInitializer(filename), num_oov_buckets)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print(out.eval())
+  ```
+
+  The hash function used for generating out-of-vocabulary buckets ID is
+  Fingerprint64.
+  """
+
+  def __init__(self,
+               initializer,
+               num_oov_buckets,
+               lookup_key_dtype=None,
+               name=None):
+    """Construct a `StaticVocabularyTable` object.
+
+    Args:
+      initializer: A TableInitializerBase object that contains the data used to
+        initialize the table. If None, then we only use out-of-vocab buckets.
+      num_oov_buckets: Number of buckets to use for out-of-vocabulary keys. Must
+        be greater than zero.
+      lookup_key_dtype: Data type of keys passed to `lookup`. Defaults to
+        `initializer.key_dtype` if `initializer` is specified, otherwise
+        `tf.string`. Must be string or integer, and must be castable to
+        `initializer.key_dtype`.
+      name: A name for the operation (optional).
+
+    Raises:
+      ValueError: when `num_oov_buckets` is not positive.
+      TypeError: when lookup_key_dtype or initializer.key_dtype are not
+        integer or string. Also when initializer.value_dtype != int64.
+    """
+    if num_oov_buckets <= 0:
+      raise ValueError("oov_buckets must be > 0.")
+    # If a name ends with a '/' it is a "name scope", remove all trailing '/'
+    # characters to use as table name.
+    if name:
+      name = name.rstrip("/")
+    if initializer:
+      if lookup_key_dtype is None:
+        lookup_key_dtype = initializer.key_dtype
+      supported_table_key_dtypes = (dtypes.int64, dtypes.string)
+      if initializer.key_dtype not in supported_table_key_dtypes:
+        raise TypeError("Invalid key dtype, expected one of %s, but got %s." %
+                        (supported_table_key_dtypes, initializer.key_dtype))
+      if initializer.key_dtype.is_integer != lookup_key_dtype.is_integer:
+        raise TypeError(
+            "Invalid key dtype, expected %s but got %s." %
+            ("integer" if lookup_key_dtype.is_integer else "non-integer",
+             initializer.key_dtype))
+      if initializer.value_dtype != dtypes.int64:
+        raise TypeError("Invalid value dtype, expected %s but got %s." %
+                        (dtypes.int64, initializer.value_dtype))
+      self._table = HashTable(initializer, default_value=-1)
+      name = name or self._table.name
+    else:
+      lookup_key_dtype = dtypes.string
+      self._table = None
+      name = name or "hash_bucket"
+    if (not lookup_key_dtype.is_integer) and (dtypes.string !=
+                                              lookup_key_dtype):
+      raise TypeError("Invalid key_dtype, expected integer or string, got %s." %
+                      lookup_key_dtype)
+    self._num_oov_buckets = num_oov_buckets
+
+    self._table_name = None
+    if name is not None:
+      self._table_name = name.split("/")[-1]
+    super(StaticVocabularyTable, self).__init__(lookup_key_dtype, dtypes.int64)
+
+  def _create_resource(self):
+    if self._table is not None:
+      return self._table._create_resource()  # pylint: disable=protected-access
+    return None
+
+  def _initialize(self):
+    if self._table is not None:
+      return self._table._initialize()  # pylint: disable=protected-access
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+  @property
+  def resource_handle(self):
+    if self._table is not None:
+      return self._table.resource_handle
+    return None
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    with ops.name_scope(name, "%s_Size" % self.name):
+      if self._table:
+        tsize = self._table.size()
+      else:
+        tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
+      return tsize + self._num_oov_buckets
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in the table, outputs the corresponding values.
+
+    It assigns out-of-vocabulary keys to buckets based in their hashes.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: Optional name for the op.
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` doesn't match the table key data type.
+    """
+    if keys.dtype.base_dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+    values = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      values = keys.values
+    if self._table and (self._table.key_dtype.base_dtype == dtypes.int64):
+      values = math_ops.cast(values, dtypes.int64)
+
+    # TODO(yleon): Consider moving this functionality to its own kernel.
+    with ops.name_scope(name, "%s_Lookup" % self.name):
+      buckets = string_ops.string_to_hash_bucket_fast(
+          _as_string(values),
+          num_buckets=self._num_oov_buckets,
+          name="hash_bucket")
+      if self._table:
+        ids = self._table.lookup(values)
+        buckets = math_ops.add(buckets, self._table.size())
+        is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
+        ids = array_ops.where(is_id_non_default, ids, buckets)
+      else:
+        ids = buckets
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, ids, keys.dense_shape)
+    return ids
+
+
+@tf_export(v1=["lookup.StaticVocabularyTable"])
+class StaticVocabularyTableV1(StaticVocabularyTable):
+
+  @property
+  def initializer(self):
+    if self._table is not None:
+      return self._table._init_op  # pylint: disable=protected-access
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+
 def index_table_from_file(vocabulary_file=None,
                           num_oov_buckets=0,
                           vocab_size=None,
@@ -1051,22 +1248,9 @@ def index_table_from_file(vocabulary_file=None,
   if (not key_dtype.is_integer) and (dtypes.string != key_dtype.base_dtype):
     raise TypeError("Only integer and string keys are supported.")
 
-  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+  with ops.name_scope(name, "string_to_index"):
     table = None
-    shared_name = ""
-    with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if vocab_size:
-        # Keep the shared_name:
-        # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                  key_column_index,
-                                                  value_column_index)
-      else:
-        # Keep the shared_name
-        # <table_type>_<filename>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
-                                               key_column_index,
-                                               value_column_index)
+    with ops.name_scope(None, "hash_table"):
       init = TextFileIdTableInitializer(
           vocabulary_file,
           vocab_size=vocab_size,
@@ -1076,14 +1260,12 @@ def index_table_from_file(vocabulary_file=None,
           value_column_index=value_column_index,
           delimiter=delimiter)
 
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = StaticHashTableV1(init, default_value)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
           num_oov_buckets=num_oov_buckets,
           hasher_spec=hasher_spec,
-          name=feat_to_id_scope,
           key_dtype=key_dtype)
 
     return table
@@ -1156,7 +1338,7 @@ def index_table_from_tensor(vocabulary_list,
   if (not dtype.is_integer) and (dtypes.string != dtype.base_dtype):
     raise TypeError("Only integer and string keys are supported.")
 
-  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+  with ops.name_scope(name, "string_to_index"):
     keys = ops.convert_to_tensor(vocabulary_list)
     if keys.dtype.is_integer != dtype.is_integer:
       raise ValueError("Expected %s, got %s." %
@@ -1165,29 +1347,23 @@ def index_table_from_tensor(vocabulary_list,
     if (not dtype.is_integer) and (keys.dtype.base_dtype != dtype):
       raise ValueError("Expected %s, got %s." % (dtype, keys.dtype))
     num_elements = array_ops.size(keys)
-    values = math_ops.to_int64(math_ops.range(num_elements))
+    values = math_ops.cast(math_ops.range(num_elements), dtypes.int64)
 
-    shared_name = ""
-    with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        shared_name += str(ops.uid())
-      table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
+    with ops.name_scope(None, "hash_table"):
+      table_keys = math_ops.cast(
+          keys, dtypes.int64) if keys.dtype.is_integer else keys
       init = KeyValueTensorInitializer(
           table_keys,
           values,
           table_keys.dtype.base_dtype,
           dtypes.int64,
           name="table_init")
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = StaticHashTableV1(init, default_value)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
           num_oov_buckets=num_oov_buckets,
           hasher_spec=hasher_spec,
-          name=feat_to_id_scope,
           key_dtype=dtype)
     return table
 
@@ -1269,18 +1445,7 @@ def index_to_string_table_from_file(vocabulary_file,
   if vocab_size is not None and vocab_size < 1:
     raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
 
-  with ops.name_scope(name, "index_to_string") as scope:
-    shared_name = ""
-    if vocab_size:
-      # Keep a shared_name
-      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                key_column_index,
-                                                value_column_index)
-    else:
-      # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file, key_column_index,
-                                             value_column_index)
+  with ops.name_scope(name, "index_to_string"):
     init = TextFileStringTableInitializer(
         vocabulary_file,
         vocab_size=vocab_size,
@@ -1290,7 +1455,7 @@ def index_to_string_table_from_file(vocabulary_file,
         delimiter=delimiter)
 
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return StaticHashTableV1(init, default_value)
 
 
 def index_to_string_table_from_tensor(vocabulary_list,
@@ -1343,16 +1508,537 @@ def index_to_string_table_from_tensor(vocabulary_list,
   if vocabulary_list is None:
     raise ValueError("vocabulary_list must be specified.")
 
-  with ops.name_scope(name, "index_to_string") as scope:
+  with ops.name_scope(name, "index_to_string"):
     vocabulary_list = ops.convert_to_tensor(vocabulary_list, dtypes.string)
     num_elements = array_ops.size(vocabulary_list)
-    keys = math_ops.to_int64(math_ops.range(num_elements))
+    keys = math_ops.cast(math_ops.range(num_elements), dtypes.int64)
 
-    shared_name = ""
     init = KeyValueTensorInitializer(
         keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return StaticHashTableV1(init, default_value)
+
+
+class MutableHashTable(LookupInterface):
+  """A generic mutable hash table implementation.
+
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
+
+  Example usage:
+
+  ```python
+  table = tf.lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
+                                     default_value=-1)
+  sess.run(table.insert(keys, values))
+  out = table.lookup(query_keys)
+  print(out.eval())
+  ```
+  """
+
+  def __init__(self,
+               key_dtype,
+               value_dtype,
+               default_value,
+               name="MutableHashTable",
+               checkpoint=True):
+    """Creates an empty `MutableHashTable` object.
+
+    Creates a table, the type of its keys and values are specified by key_dtype
+    and value_dtype, respectively.
+
+    Args:
+      key_dtype: the type of the key tensors.
+      value_dtype: the type of the value tensors.
+      default_value: The value to use if a key is missing in the table.
+      name: A name for the operation (optional).
+      checkpoint: if True, the contents of the table are saved to and restored
+        from checkpoints. If `shared_name` is empty for a checkpointed table, it
+        is shared using the table node name.
+
+    Returns:
+      A `MutableHashTable` object.
+
+    Raises:
+      ValueError: If checkpoint is True and no name was specified.
+    """
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=value_dtype)
+    self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._name = name
+
+    self._shared_name = None
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._shared_name = "table_%d" % (ops.uid(),)
+    super(MutableHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self._create_resource()
+    if checkpoint:
+      saveable = MutableHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def _create_resource(self):
+    # The table must be shared if checkpointing is requested for multi-worker
+    # training to work correctly. Use the node name if no shared_name has been
+    # explicitly specified.
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
+    if self._default_value.get_shape().ndims == 0:
+      table_ref = gen_lookup_ops.mutable_hash_table_v2(
+          shared_name=self._shared_name,
+          use_node_name_sharing=use_node_name_sharing,
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
+          name=self._name)
+    else:
+      table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
+          shared_name=self._shared_name,
+          use_node_name_sharing=use_node_name_sharing,
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
+          value_shape=self._default_value.get_shape(),
+          name=self._name)
+
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
+
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_lookup_table_remove" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      op = gen_lookup_ops.lookup_table_remove_v2(self.resource_handle, keys)
+
+    return op
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. Can be a tensor of any shape. Must match the
+        table's key_dtype.
+      name: A name for the operation (optional).
+
+    Returns:
+      A tensor containing the values in the same shape as `keys` using the
+        table's value type.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      with ops.colocate_with(self.resource_handle):
+        values = gen_lookup_ops.lookup_table_find_v2(self.resource_handle, keys,
+                                                     self._default_value)
+    return values
+
+  def insert(self, keys, values, name=None):
+    """Associates `keys` with `values`.
+
+    Args:
+      keys: Keys to insert. Can be a tensor of any shape. Must match the table's
+        key type.
+      values: Values to be associated with keys. Must be a tensor of the same
+        shape as `keys` and match the table's value type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` or `values` doesn't match the table data
+        types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]):
+      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
+      with ops.colocate_with(self.resource_handle):
+        # pylint: disable=protected-access
+        op = gen_lookup_ops.lookup_table_insert_v2(self.resource_handle, keys,
+                                                   values)
+    return op
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self.resource_handle, self._key_dtype, self._value_dtype)
+    return exported_keys, exported_values
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {"table": functools.partial(MutableHashTable._Saveable, table=self)}
+
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject implementation for MutableHashTable."""
+
+    def __init__(self, table, name):
+      tensors = table.export()
+      specs = [
+          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
+          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
+      ]
+      # pylint: disable=protected-access
+      super(MutableHashTable._Saveable, self).__init__(table, specs, name)
+
+    def restore(self, restored_tensors, restored_shapes, name=None):
+      del restored_shapes  # unused
+      # pylint: disable=protected-access
+      with ops.name_scope(name, "%s_table_restore" % self.name):
+        with ops.colocate_with(self.op.resource_handle):
+          return gen_lookup_ops.lookup_table_import_v2(
+              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+
+
+@tf_export("lookup.experimental.DenseHashTable")
+class DenseHashTable(LookupInterface):
+  """A generic mutable hash table implementation using tensors as backing store.
+
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
+
+  It uses "open addressing" with quadratic reprobing to resolve collisions.
+  Compared to `MutableHashTable` the insert, remove and lookup operations in a
+  `DenseHashTable` are typically faster, but memory usage can be higher.
+  However, `DenseHashTable` does not require additional memory for
+  temporary tensors created during checkpointing and restore operations.
+
+  Example usage:
+
+  ```python
+  table = tf.lookup.DenseHashTable(key_dtype=tf.int64,
+                                   value_dtype=tf.int64,
+                                   default_value=-1,
+                                   empty_key=0,
+                                   deleted_key=-1)
+
+  sess.run(table.insert(keys, values))
+  out = table.lookup(query_keys)
+  print(out.eval())
+  ```
+  """
+
+  # TODO(andreasst): consider extracting common code with MutableHashTable into
+  # a common superclass.
+  def __init__(self,
+               key_dtype,
+               value_dtype,
+               default_value,
+               empty_key,
+               deleted_key,
+               initial_num_buckets=None,
+               name="MutableDenseHashTable",
+               checkpoint=True):
+    """Creates an empty `DenseHashTable` object.
+
+    Creates a table, the type of its keys and values are specified by key_dtype
+    and value_dtype, respectively.
+
+    Args:
+      key_dtype: the type of the key tensors.
+      value_dtype: the type of the value tensors.
+      default_value: The value to use if a key is missing in the table.
+      empty_key: the key to use to represent empty buckets internally. Must not
+        be used in insert, remove or lookup operations.
+      deleted_key: the key to use to represent deleted buckets internally. Must
+        not be used in insert, remove or lookup operations and be different from
+        the empty_key.
+      initial_num_buckets: the initial number of buckets.
+      name: A name for the operation (optional).
+      checkpoint: if True, the contents of the table are saved to and restored
+        from checkpoints. If `shared_name` is empty for a checkpointed table, it
+        is shared using the table node name.
+
+    Returns:
+      A `DenseHashTable` object.
+
+    Raises:
+      ValueError: If checkpoint is True and no name was specified.
+    """
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=value_dtype, name="default_value")
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._initial_num_buckets = initial_num_buckets
+    self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._name = name
+
+    self._empty_key = ops.convert_to_tensor(
+        empty_key, dtype=key_dtype, name="empty_key")
+    self._deleted_key = ops.convert_to_tensor(
+        deleted_key, dtype=key_dtype, name="deleted_key")
+    self._shared_name = None
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._shared_name = "table_%d" % (ops.uid(),)
+    super(DenseHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self._create_resource()
+    if checkpoint:
+      saveable = DenseHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def _create_resource(self):
+    # The table must be shared if checkpointing is requested for multi-worker
+    # training to work correctly. Use the node name if no shared_name has been
+    # explicitly specified.
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
+    table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=self._empty_key,
+        deleted_key=self._deleted_key,
+        shared_name=self._shared_name,
+        use_node_name_sharing=use_node_name_sharing,
+        value_dtype=self._value_dtype,
+        value_shape=self._value_shape,
+        initial_num_buckets=self._initial_num_buckets,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. Can be a tensor of any shape. Must match the
+        table's key_dtype.
+      name: A name for the operation (optional).
+
+    Returns:
+      A tensor containing the values in the same shape as `keys` using the
+        table's value type.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
+                        [self.resource_handle, keys]):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      with ops.colocate_with(self.resource_handle):
+        values = gen_lookup_ops.lookup_table_find_v2(self.resource_handle, keys,
+                                                     self._default_value)
+
+    return values
+
+  def insert_or_assign(self, keys, values, name=None):
+    """Associates `keys` with `values`.
+
+    Args:
+      keys: Keys to insert. Can be a tensor of any shape. Must match the table's
+        key type.
+      values: Values to be associated with keys. Must be a tensor of the same
+        shape as `keys` and match the table's value type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` or `values` doesn't match the table data
+        types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(
+          values, dtype=self._value_dtype, name="values")
+      with ops.colocate_with(self.resource_handle):
+        op = gen_lookup_ops.lookup_table_insert_v2(self.resource_handle, keys,
+                                                   values)
+      return op
+
+  def insert(self, keys, values, name=None):
+    """Associates `keys` with `values`.
+
+    Args:
+      keys: Keys to insert. Can be a tensor of any shape. Must match the table's
+        key type.
+      values: Values to be associated with keys. Must be a tensor of the same
+        shape as `keys` and match the table's value type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` or `values` doesn't match the table data
+        types.
+    """
+    return self.insert_or_assign(keys, values, name)
+
+  def erase(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_lookup_table_remove" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      # pylint: disable=protected-access
+      op = gen_lookup_ops.lookup_table_remove_v2(self.resource_handle, keys)
+
+    return op
+
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    return self.erase(keys, name)
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self.resource_handle, self._key_dtype, self._value_dtype)
+
+    return exported_keys, exported_values
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {"table": functools.partial(DenseHashTable._Saveable, table=self)}
+
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject implementation for DenseHashTable."""
+
+    def __init__(self, table, name):
+      tensors = table.export()
+      specs = [
+          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
+          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
+      ]
+      # pylint: disable=protected-access
+      super(DenseHashTable._Saveable, self).__init__(table, specs, name)
+
+    def restore(self, restored_tensors, restored_shapes, name=None):
+      del restored_shapes  # unused
+      # pylint: disable=protected-access
+      with ops.name_scope(name, "%s_table_restore" % self.name):
+        with ops.colocate_with(self.op.resource_handle):
+          return gen_lookup_ops.lookup_table_import_v2(
+              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
 
 
 ops.NotDifferentiable("LookupTableFind")
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 4aea0265a72dcd2b2358f063fb0a51a5877076e7..9155d890ded7782ef7d64e631540e98e07f34a80 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//tensorflow/python:weights_broadcast_ops",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 7f88ccd879d09d57dc32c29dd4f28fa4389937a1..6cd1d8e5f8baf39f9051f95ead2f47d30826c945 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -34,31 +35,6 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction", v1=[])
-class ReductionV2(object):
-  """Types of loss reduction.
-
-  Contains the following values:
-
-  * `NONE`: Un-reduced weighted losses with the same shape as input.
-  * `SUM`: Scalar sum of weighted losses.
-  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  """
-
-  NONE = "none"
-  SUM = "sum"
-  SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
-
-  @classmethod
-  def all(cls):
-    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
-
-  @classmethod
-  def validate(cls, key):
-    if key not in cls.all():
-      raise ValueError("Invalid Reduction Key %s." % key)
-
-
 @tf_export(v1=["losses.Reduction"])
 class Reduction(object):
   """Types of loss reduction.
@@ -69,8 +45,13 @@ class Reduction(object):
   * `SUM`: Scalar sum of weighted losses.
   * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
   * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
   * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
      weights. DEPRECATED.
+     Note that when using `tf.distribute.Strategy`, this is scaled by the
+     number of replicas that are contributing to a single step to get an
+     approximation to the global batch size.
   * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
   """
 
@@ -198,11 +179,6 @@ def compute_weighted_loss(
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
       losses = ops.convert_to_tensor(losses)
@@ -214,15 +190,17 @@ def compute_weighted_loss(
         loss = weighted_losses
       else:
         loss = math_ops.reduce_sum(weighted_losses)
+        num_replicas = (  # Used to convert from local to global batch size.
+            distribution_strategy_context.get_strategy().num_replicas_in_sync)
         if reduction == Reduction.MEAN:
-          loss = _safe_mean(
-              loss,
-              math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          denom = (num_replicas *
+                   math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          loss = _safe_mean(loss, denom)
         elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
               reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
-          loss = _safe_mean(loss, _num_present(losses, weights))
+          loss = _safe_mean(loss, num_replicas * _num_present(losses, weights))
         elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
-          loss = _safe_mean(loss, _num_elements(losses))
+          loss = _safe_mean(loss, num_replicas * _num_elements(losses))
 
       # Convert the result back to the input type.
       loss = math_ops.cast(loss, input_dtype)
@@ -794,7 +772,7 @@ def softmax_cross_entropy(
 
     if label_smoothing > 0:
       num_classes = math_ops.cast(
-          array_ops.shape(onehot_labels)[1], logits.dtype)
+          array_ops.shape(onehot_labels)[-1], logits.dtype)
       smooth_positives = 1.0 - label_smoothing
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6767e3eb228d9f934379db7acef2d935af71ff80
--- /dev/null
+++ b/tensorflow/python/ops/map_fn.py
@@ -0,0 +1,285 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Functional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("map_fn")
+def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
+           swap_memory=False, infer_shape=True, name=None):
+  """map on the list of tensors unpacked from `elems` on dimension 0.
+
+  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
+  sequence of elements from first to last. The elements are made of the
+  tensors unpacked from `elems`. `dtype` is the data type of the return
+  value of `fn`. Users must provide `dtype` if it is different from
+  the data type of `elems`.
+
+  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
+  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
+
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
+  Furthermore, `fn` may emit a different structure than its input.  For example,
+  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
+  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
+  nested) tuple of types matching the output of `fn`.
+
+  To apply a functional operation to the nonzero elements of a SparseTensor
+  one of the following methods is recommended. First, if the function is
+  expressible as TensorFlow ops, use
+
+  ```python
+    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
+  ```
+
+  If, however, the function is not expressible as a TensorFlow op, then use
+
+  ```python
+  result = SparseTensor(
+    input.indices, map_fn(fn, input.values), input.dense_shape)
+  ```
+
+  instead.
+
+  When executing eagerly, map_fn does not execute in parallel even if
+  `parallel_iterations` is set to a value > 1. You can still get the
+  performance benefits of running a function in parallel by using the
+  `tf.contrib.eager.defun` decorator,
+
+  ```python
+  # Assume the function being used in map_fn is fn.
+  # To ensure map_fn calls fn in parallel, use the defun decorator.
+  @tf.contrib.eager.defun
+  def func(tensor):
+    return tf.map_fn(fn, tensor)
+  ```
+
+  Note that if you use the defun decorator, any non-TensorFlow Python code
+  that you may have written in your function won't get executed. See
+  `tf.contrib.eager.defun` for more details. The recommendation would be to
+  debug without defun but switch to defun to get performance benefits of
+  running map_fn in parallel.
+
+  Args:
+    fn: The callable to be performed.  It accepts one argument, which will
+      have the same (possibly nested) structure as `elems`.  Its output
+      must have the same structure as `dtype` if one is provided, otherwise
+      it must have the same structure as `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be applied to `fn`.
+    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
+      of Tensors differing from the structure of `elems`, then `dtype` is not
+      optional and must have the same structure as the output of `fn`.
+    parallel_iterations: (optional) The number of iterations allowed to run
+      in parallel. When graph building, the default value is 10. While executing
+      eagerly, the default value is set to 1.
+    back_prop: (optional) True enables support for back propagation.
+    swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
+    name: (optional) Name prefix for the returned tensors.
+
+  Returns:
+    A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
+    results of applying `fn` to tensors unpacked from `elems` along the first
+    dimension, from first to last.
+
+  Raises:
+    TypeError: if `fn` is not callable or the structure of the output of
+      `fn` and `dtype` do not match, or if elems is a SparseTensor.
+    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
+
+  Examples:
+    ```python
+    elems = np.array([1, 2, 3, 4, 5, 6])
+    squares = map_fn(lambda x: x * x, elems)
+    # squares == [1, 4, 9, 16, 25, 36]
+    ```
+
+    ```python
+    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
+    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
+    # alternate == [-1, 2, -3]
+    ```
+
+    ```python
+    elems = np.array([1, 2, 3])
+    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
+    # alternates[0] == [1, 2, 3]
+    # alternates[1] == [-1, -2, -3]
+    ```
+  """
+  if not callable(fn):
+    raise TypeError("fn must be callable.")
+
+  if isinstance(elems, sparse_tensor.SparseTensor):
+    raise TypeError(
+        "To perform a map on the values of a sparse tensor use either "
+        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
+        " SparseTensor(input.indices, map_fn(fn, input.values), "
+        "input.dense_shape)")
+
+  in_graph_mode = not context.executing_eagerly()
+  # Set the default number of parallel_iterations depending on graph/eager mode.
+  if in_graph_mode and not parallel_iterations:
+    parallel_iterations = 10
+  elif not in_graph_mode and not parallel_iterations:
+    parallel_iterations = 1
+
+  if not in_graph_mode and parallel_iterations > 1:
+    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
+                        "effect when executing eagerly. Consider calling map_fn"
+                        " with tf.contrib.eager.defun to execute fn in "
+                        "parallel.", 1)
+    parallel_iterations = 1
+
+  input_is_sequence = nest.is_sequence(elems)
+  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
+  def input_pack(x):
+    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
+
+  if dtype is None:
+    output_is_sequence = input_is_sequence
+    output_flatten = input_flatten
+    output_pack = input_pack
+  else:
+    output_is_sequence = nest.is_sequence(dtype)
+    output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
+    def output_pack(x):
+      return (nest.pack_sequence_as(dtype, x)
+              if output_is_sequence else x[0])
+
+  elems_flat = input_flatten(elems)
+
+  with ops.name_scope(name, "map", elems_flat):
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode:
+      # Any get_variable calls in fn will cache the first call locally
+      # and not issue repeated network I/O requests for each iteration.
+      varscope = vs.get_variable_scope()
+      varscope_caching_device_was_none = False
+      if varscope.caching_device is None:
+        # TODO(ebrevdo): Change to using colocate_with here and in other
+        # methods.
+        varscope.set_caching_device(lambda op: op.device)
+        varscope_caching_device_was_none = True
+
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
+
+    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
+    dtype_flat = output_flatten(dtype)
+
+    # Convert elems to tensor array. n may be known statically.
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = (tensor_shape.dimension_value(static_shape[0])
+         or array_ops.shape(elems_flat[0])[0])
+
+    # TensorArrays are always flat
+    elems_ta = [
+        tensor_array_ops.TensorArray(dtype=elem.dtype,
+                                     size=n,
+                                     dynamic_size=False,
+                                     infer_shape=True)
+        for elem in elems_flat]
+    # Unpack elements
+    elems_ta = [
+        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
+
+    i = constant_op.constant(0)
+
+    accs_ta = [
+        tensor_array_ops.TensorArray(dtype=dt,
+                                     size=n,
+                                     dynamic_size=False,
+                                     infer_shape=infer_shape)
+        for dt in dtype_flat]
+
+    def compute(i, tas):
+      """The loop body of map_fn.
+
+      Args:
+        i: the loop counter
+        tas: the flat TensorArray accumulator list
+
+      Returns:
+        (i + 1, tas): the updated counter + updated TensorArrays
+
+      Raises:
+        TypeError: if dtype and packed_fn_values structure do not match
+        ValueType: if dtype and packed_fn_values lengths do not match
+      """
+      packed_values = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
+      packed_fn_values = fn(packed_values)
+      nest.assert_same_structure(dtype or elems, packed_fn_values)
+      flat_fn_values = output_flatten(packed_fn_values)
+      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values)]
+      return (i + 1, tas)
+
+    _, r_a = control_flow_ops.while_loop(
+        lambda i, _: i < n, compute, (i, accs_ta),
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory,
+        maximum_iterations=n)
+    results_flat = [r.stack() for r in r_a]
+
+    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
+        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
+    for elem in elems_flat[1:]:
+      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
+          elem.get_shape().with_rank_at_least(1)[0])))
+    for r in results_flat:
+      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
+          r.get_shape()[1:]))
+
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode and varscope_caching_device_was_none:
+      varscope.set_caching_device(None)
+
+    return output_pack(results_flat)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index dc2340983afa21f9236708a77f50875fafd0699b..5e4901cb75931f58804bb3bf741bc2ba7962659d 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -47,6 +47,10 @@ def _ArgMinGrad(op, grad):
   return [None, None]
 
 
+# TODO(rmlarsen): Implement gradient.
+ops.NotDifferentiable("EuclideanNorm")
+
+
 @ops.RegisterGradient("Sum")
 def _SumGrad(op, grad):
   """Gradient for Sum."""
@@ -99,7 +103,7 @@ def _MinOrMaxGrad(op, grad):
   num_selected = array_ops.reshape(
       math_ops.reduce_sum(indicators, op.inputs[1]), output_shape_kept_dims)
 
-  return [math_ops.div(indicators, num_selected) * grad, None]
+  return [math_ops.divide(indicators, num_selected) * grad, None]
 
 
 @ops.RegisterGradient("Max")
@@ -196,7 +200,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
   ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
-  scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
+  scaled_grad = math_ops.divide(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
 
@@ -260,7 +264,7 @@ def _SegmentMinOrMaxGrad(op, grad):
                                       op.inputs[1])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
   return array_ops.where(is_selected, gathered_grads, zeros), None
 
@@ -314,7 +318,7 @@ def _UnsortedSegmentMinOrMaxGrad(op, grad):
       math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
                                               zero_clipped_indices,
                                               is_positive)
@@ -455,8 +459,8 @@ def _SqrtGradGrad(op, grad):
   a = op.inputs[0]
   y = op.outputs[0]  # y = 0.5 * b / conj(a)
   with ops.control_dependencies([grad]):
-    ga = grad / a
-    return -math_ops.conj(ga) * y, 0.5 * ga
+    ga = gen_math_ops.xdivy(grad, a)
+    return -gen_math_ops.mul_no_nan(y, math_ops.conj(ga)), 0.5 * ga
 
 
 @ops.RegisterGradient("Rsqrt")
@@ -504,7 +508,7 @@ def _LogGrad(op, grad):
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return grad * math_ops.reciprocal(x)
+    return gen_math_ops.xdivy(grad, x)
 
 
 @ops.RegisterGradient("Log1p")
@@ -513,7 +517,7 @@ def _Log1pGrad(op, grad):
   x = op.inputs[0]
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return grad * math_ops.reciprocal(1 + x)
+    return gen_math_ops.xdivy(grad, 1 + x)
 
 
 @ops.RegisterGradient("Xlogy")
@@ -592,7 +596,7 @@ def _AcoshGrad(op, grad):
   y = op.outputs[0]
   with ops.control_dependencies([grad]):
     y = math_ops.conj(y)
-    return grad / math_ops.sinh(y)
+    return math_ops.xdivy(grad, math_ops.sinh(y))
 
 
 @ops.RegisterGradient("Atanh")
@@ -827,7 +831,7 @@ def _TanGrad(op, grad):
     x = math_ops.conj(x)
     secx = math_ops.reciprocal(math_ops.cos(x))
     secx2 = math_ops.square(secx)
-    return grad * secx2
+    return math_ops.mul_no_nan(secx2, grad)
 
 
 @ops.RegisterGradient("Asin")
@@ -839,8 +843,7 @@ def _AsinGrad(op, grad):
     x2 = math_ops.square(x)
     one = constant_op.constant(1, dtype=grad.dtype)
     den = math_ops.sqrt(math_ops.subtract(one, x2))
-    inv = math_ops.reciprocal(den)
-    return grad * inv
+    return math_ops.xdivy(grad, den)
 
 
 @ops.RegisterGradient("Acos")
@@ -852,8 +855,7 @@ def _AcosGrad(op, grad):
     x2 = math_ops.square(x)
     one = constant_op.constant(1, dtype=grad.dtype)
     den = math_ops.sqrt(math_ops.subtract(one, x2))
-    inv = math_ops.reciprocal(den)
-    return -grad * inv
+    return -math_ops.xdivy(grad, den)
 
 
 @ops.RegisterGradient("Atan")
@@ -874,7 +876,7 @@ def _Atan2Grad(op, grad):
   y = op.inputs[0]
   x = op.inputs[1]
   with ops.control_dependencies([grad]):
-    grad_inv = grad / (math_ops.square(x) + math_ops.square(y))
+    grad_inv = math_ops.xdivy(grad, (math_ops.square(x) + math_ops.square(y)))
     return x * grad_inv, -y * grad_inv
 
 
@@ -946,6 +948,26 @@ def _MulGrad(op, grad):
               math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
 
 
+@ops.RegisterGradient("MulNoNan")
+def _MulNoNanGrad(op, grad):
+  """The gradient of scalar multiplication with NaN-suppression."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return gen_math_ops.mul_no_nan(grad, y), gen_math_ops.mul_no_nan(
+        x, grad)
+  assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  return (array_ops.reshape(
+      math_ops.reduce_sum(gen_math_ops.mul_no_nan(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(gen_math_ops.mul_no_nan(x, grad), ry),
+              sy))
+
+
 @ops.RegisterGradient("Div")
 def _DivGrad(op, grad):
   """The gradient for the Div operator."""
@@ -956,10 +978,12 @@ def _DivGrad(op, grad):
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  return (array_ops.reshape(math_ops.reduce_sum(math_ops.div(grad, y), rx), sx),
+  return (array_ops.reshape(
+      math_ops.reduce_sum(math_ops.div_no_nan(grad, y), rx), sx),
           array_ops.reshape(
-              math_ops.reduce_sum(grad * math_ops.div(math_ops.div(-x, y), y),
-                                  ry), sy))
+              math_ops.reduce_sum(
+                  grad * math_ops.div_no_nan(math_ops.divide(-x, y), y), ry),
+              sy))
 
 
 @ops.RegisterGradient("FloorDiv")
@@ -1037,7 +1061,8 @@ def _PowGrad(op, grad):
   y = math_ops.conj(y)
   z = math_ops.conj(z)
   gx = array_ops.reshape(
-      math_ops.reduce_sum(grad * y * math_ops.pow(x, y - 1), rx), sx)
+      math_ops.reduce_sum(
+          math_ops.mul_no_nan(y * math_ops.pow(x, y - 1), grad), rx), sx)
   # Avoid false singularity at x = 0
   if x.dtype.is_complex:
     # real(x) < 0 is fine for the complex case
@@ -1047,7 +1072,8 @@ def _PowGrad(op, grad):
     mask = x > 0
   safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
   log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
-  gy = array_ops.reshape(math_ops.reduce_sum(grad * z * log_x, ry), sy)
+  gy = array_ops.reshape(
+      math_ops.reduce_sum(gen_math_ops.mul_no_nan(z * log_x, grad), ry), sy)
   return gx, gy
 
 
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 36b54b62cb44dae96e951279195282678f0dc637..96c24c3c98f2ec6f52317f0f9c46380eb7fe35c5 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -280,6 +280,31 @@ class DivNoNanGradientTest(test.TestCase):
       self.assertAllClose(dy.eval(), np.zeros(y.shape.as_list()))
 
 
+class MulNoNanGradientTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasicGradient(self):
+    inputs = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(inputs, 1 + math_ops.abs(inputs))
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.get_shape().as_list(), outputs,
+          outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testGradientWithRhsIsZero(self):
+    x_vals = [0, 1.0, np.nan, np.inf, np.NINF]
+    x = constant_op.constant(x_vals, dtype=dtypes.float32)
+    y = array_ops.zeros_like(x, dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(x, y)
+    with self.cached_session():
+      dx, dy = gradients.gradients(outputs, [x, y])
+      self.assertAllClose(dx.eval(), np.zeros(x.shape.as_list()))
+      self.assertAllClose(dy.eval(), x_vals)
+
+
 class XlogyTest(test.TestCase):
 
   def _xlogy_gradients(self, x, y):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index cb7fecf7c7b261c320f3fe3d5579642176b39514..a5c0055a535decdd906dac29e2b321a9739f888e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -12,9 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Basic arithmetic operators.
+"""Math Operations.
+
+Note: Functions taking `Tensor` arguments can also take anything accepted by
+`tf.convert_to_tensor`.
+
+Note: Elementwise binary operations in TensorFlow follow [numpy-style
+broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+TensorFlow provides a variety of math functions including:
+
+* Basic arithmetic operators and trigonometric functions.
+* Special math functions (like: `tf.math.igamma` and `tf.math.zeta`)
+* Complex number functions (like: `tf.math.imag` and `tf.math.angle`)
+* Reductions and scans (like: `tf.math.reduce_mean` and `tf.math.cumsum`)
+* Segment functions (like: `tf.math.segment_sum`)
+
+See: `tf.linalg` for matrix and tensor functions.
+
+<a id=Segmentation></a>
+
+## About Segmentation
+
+TensorFlow provides several operations that you can use to perform common
+math computations on tensor segments.
+Here a segmentation is a partitioning of a tensor along
+the first dimension, i.e. it  defines a mapping from the first dimension onto
+`segment_ids`. The `segment_ids` tensor should be the size of
+the first dimension, `d0`, with consecutive IDs in the range `0` to `k`,
+where `k<d0`.
+In particular, a segmentation of a matrix tensor is a mapping of rows to
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+#  ==>  [[0 0 0 0]
+#        [5 6 7 8]]
+```
+
+The standard `segment_*` functions assert that the segment indices are sorted.
+If you have unsorted indices use the equivalent `unsorted_segment_` function.
+Thses functions take an additional argument `num_segments` so that the output
+tensor can be efficiently allocated.
+
+``` python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 6,  8, 10, 12],
+#       [-1, -2, -3, -4]]
+```
 
-See the [python/math_ops](python/math_ops) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -654,7 +704,7 @@ def saturate_cast(value, dtype, name=None):
                                        name="max"))
     return cast(value, dtype, name=name)
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_float"])
 def to_float(x, name="ToFloat"):
   """Casts a tensor to type `float32`.
@@ -673,7 +723,7 @@ def to_float(x, name="ToFloat"):
   return cast(x, dtypes.float32, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_double"])
 def to_double(x, name="ToDouble"):
   """Casts a tensor to type `float64`.
@@ -692,7 +742,7 @@ def to_double(x, name="ToDouble"):
   return cast(x, dtypes.float64, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_int32"])
 def to_int32(x, name="ToInt32"):
   """Casts a tensor to type `int32`.
@@ -711,7 +761,7 @@ def to_int32(x, name="ToInt32"):
   return cast(x, dtypes.int32, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_int64"])
 def to_int64(x, name="ToInt64"):
   """Casts a tensor to type `int64`.
@@ -730,7 +780,7 @@ def to_int64(x, name="ToInt64"):
   return cast(x, dtypes.int64, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_bfloat16"])
 def to_bfloat16(x, name="ToBFloat16"):
   """Casts a tensor to type `bfloat16`.
@@ -749,7 +799,7 @@ def to_bfloat16(x, name="ToBFloat16"):
   return cast(x, dtypes.bfloat16, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_complex64"])
 def to_complex64(x, name="ToComplex64"):
   """Casts a tensor to type `complex64`.
@@ -768,7 +818,7 @@ def to_complex64(x, name="ToComplex64"):
   return cast(x, dtypes.complex64, name=name)
 
 
-@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.")
 @tf_export(v1=["to_complex128"])
 def to_complex128(x, name="ToComplex128"):
   """Casts a tensor to type `complex128`.
@@ -1005,7 +1055,8 @@ def div(x, y, name=None):
   return _div_python2(x, y, name)
 
 
-@tf_export("div_no_nan")
+@tf_export("math.divide_no_nan", v1=["math.divide_no_nan", "div_no_nan"])
+@deprecation.deprecated_endpoints("div_no_nan")
 @dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
@@ -1029,6 +1080,31 @@ def div_no_nan(x, y, name=None):
     return gen_math_ops.div_no_nan(x, y, name=name)
 
 
+@tf_export("math.multiply_no_nan")
+@dispatch.add_dispatch_support
+def multiply_no_nan(x, y, name=None):
+  """Computes the product of x and y and returns 0 if the y is zero, even if x is NaN or infinite.
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`.
+    y: A `Tensor` whose dtype is compatible with `x`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The element-wise value of the x times y.
+  """
+
+  with ops.name_scope(name, "multiply_no_nan", [x, y]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    y = ops.convert_to_tensor(y, name="y", dtype=x.dtype.base_dtype)
+    x_dtype = x.dtype.base_dtype
+    y_dtype = y.dtype.base_dtype
+    if x_dtype != y_dtype:
+      raise TypeError(
+          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
+    return gen_math_ops.mul_no_nan(x, y, name=name)
+
+
 # TODO(aselle): This should be removed
 mod = gen_math_ops.floor_mod
 
@@ -1336,18 +1412,60 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
           name=name))
 
 
+@tf_export("math.reduce_euclidean_norm")
+def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the Euclidean norm of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [1, 1, 1]])
+  tf.reduce_euclidean_norm(x)  # sqrt(17)
+  tf.reduce_euclidean_norm(x, 0)  # [sqrt(2), sqrt(5), sqrt(10)]
+  tf.reduce_euclidean_norm(x, 1)  # [sqrt(14), sqrt(3)]
+  tf.reduce_euclidean_norm(x, 1, keepdims=True)  # [[sqrt(14)], [sqrt(3)]]
+  tf.reduce_euclidean_norm(x, [0, 1])  # sqrt(17)
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+  """
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.euclidean_norm(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
 @tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 @deprecation.deprecated_args(
     None, "reduction_indices is deprecated, use axis instead", "axis")
-def count_nonzero(input_tensor,
+def count_nonzero(input_tensor=None,
                   axis=None,
                   keepdims=None,
                   dtype=dtypes.int64,
                   name=None,
                   reduction_indices=None,
-                  keep_dims=None):
+                  keep_dims=None,
+                  input=None):  # pylint: disable=redefined-builtin
   """Computes number of nonzero elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1393,12 +1511,15 @@ def count_nonzero(input_tensor,
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
     keep_dims: Deprecated alias for `keepdims`.
+    input: Overrides input_tensor. For compatibility.
 
   Returns:
     The reduced tensor (number of nonzero values).
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  input_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "input_tensor", input_tensor)
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis,
       "reduction_indices", reduction_indices
@@ -2659,11 +2780,11 @@ def add_n(inputs, name=None):
     cannot be inferred.
   """
   if not inputs or not isinstance(inputs, (list, tuple)):
-    raise ValueError("inputs must be a list of at least one"
+    raise ValueError("inputs must be a list of at least one "
                      "Tensor/IndexedSlices with the same dtype and shape")
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
   if not all(isinstance(x, (ops.Tensor, ops.IndexedSlices)) for x in inputs):
-    raise ValueError("inputs must be a list of at least one"
+    raise ValueError("inputs must be a list of at least one "
                      "Tensor/IndexedSlices with the same dtype and shape")
 
   if len(inputs) == 1:
@@ -3108,7 +3229,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3154,7 +3275,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3201,7 +3322,7 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
   r"""Computes the sum along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -3288,7 +3409,7 @@ def sparse_segment_mean(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
@@ -3333,7 +3454,7 @@ def sparse_segment_mean_v2(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index b4832e09c084e7165143f4e918b9ba76842e2311..8f66022b506f589b7916d3b6dbdf98e3a09402a4 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -558,6 +558,22 @@ class DivNoNanTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(tf_result, np_result)
 
 
+class MultiplyNoNanTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasic(self):
+    for dtype in [np.float32, np.float64]:
+      values = [0, 1, np.nan, np.inf, np.NINF]
+      x = constant_op.constant(values, dtype=dtype)
+      zeros = constant_op.constant(np.zeros((5,)), dtype=dtype)
+      ones = constant_op.constant(np.ones((5,)), dtype=dtype)
+      with self.cached_session(use_gpu=True):
+        tf_result_zeros = math_ops.multiply_no_nan(x, zeros).eval()
+        self.assertAllEqual(tf_result_zeros, zeros)
+        tf_result_ones = math_ops.multiply_no_nan(x, ones).eval()
+        self.assertAllEqual(tf_result_ones, x)
+
+
 class XlogyTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -672,8 +688,8 @@ class BinaryOpsTest(test_util.TensorFlowTestCase):
     if context.executing_eagerly():
       error = errors_impl.InvalidArgumentError
       error_message = (
-          r"cannot compute Add as input #0\(zero-based\) was expected to be a "
-          r"float tensor but is a int32 tensor \[Op:Add\] name: add/")
+          r"cannot compute Add as input #1\(zero-based\) was expected to be a "
+          r"int32 tensor but is a float tensor \[Op:Add\] name: add/")
     else:
       error = TypeError
       error_message = ("Input 'y' of 'Add' Op has type float32 that does not "
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index 138149e63dc693aa18adbbce2c6ff677df0a6d57..d597298c8d73e971925dce9b1585bc94dd8c1763 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -49,13 +49,17 @@ def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
   """
   with ops.device('%s' % device):
     if not transpose_a:
-      x = variables.VariableV1(random_ops.random_uniform([n, m], dtype=dtype))
+      x = variables.VariableV1(random_ops.random_uniform([n, m], dtype=dtype),
+                               use_resource=False)
     else:
-      x = variables.VariableV1(random_ops.random_uniform([m, n], dtype=dtype))
+      x = variables.VariableV1(random_ops.random_uniform([m, n], dtype=dtype),
+                               use_resource=False)
     if not transpose_b:
-      y = variables.VariableV1(random_ops.random_uniform([m, k], dtype=dtype))
+      y = variables.VariableV1(random_ops.random_uniform([m, k], dtype=dtype),
+                               use_resource=False)
     else:
-      y = variables.VariableV1(random_ops.random_uniform([k, m], dtype=dtype))
+      y = variables.VariableV1(random_ops.random_uniform([k, m], dtype=dtype),
+                               use_resource=False)
 
     z = math_ops.matmul(x, y, transpose_a=transpose_a, transpose_b=transpose_b)
     return control_flow_ops.group(z)
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index df9ca86ec77cfb99d083248783bed2209d281257..71f05746d14fac6a553c0fdf94d9c1916c6dd8ca 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -43,7 +43,7 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
   """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES)` collections.
 
   If running in a `DistributionStrategy` context, the variable will be
-  "replica local". This means:
+  "sync on read". This means:
 
   *   The returned object will be a container with separate variables
       per replica of the model.
@@ -59,7 +59,7 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
       of the final result value inside
       `distribution_strategy_context.get_replica_context().merge_call(fn)`.
       Inside the `merge_call()`, ops are only added to the graph once
-      and access to a replica-local variable in a computation returns
+      and access to a sync on read variable in a computation returns
       the sum across all replicas.
 
   Args:
@@ -71,7 +71,7 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
 
   Returns:
     A (non-trainable) variable initialized to zero, or if inside a
-    `DistributionStrategy` scope a replica-local variable container.
+    `DistributionStrategy` scope a sync on read variable container.
   """
   # Note that synchronization "ON_READ" implies trainable=False.
   return variable_scope.variable(
@@ -256,9 +256,9 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
       [num_classes, num_classes], dtypes.float64, name='total_confusion_matrix')
 
   # Cast the type to int64 required by confusion_matrix_ops.
-  predictions = math_ops.to_int64(predictions)
-  labels = math_ops.to_int64(labels)
-  num_classes = math_ops.to_int64(num_classes)
+  predictions = math_ops.cast(predictions, dtypes.int64)
+  labels = math_ops.cast(labels, dtypes.int64)
+  num_classes = math_ops.cast(num_classes, dtypes.int64)
 
   # Flatten the input if its rank > 1.
   if predictions.get_shape().ndims > 1:
@@ -360,18 +360,18 @@ def mean(values,
                        'is enabled.')
 
   with variable_scope.variable_scope(name, 'mean', (values, weights)):
-    values = math_ops.to_float(values)
+    values = math_ops.cast(values, dtypes.float32)
 
     total = metric_variable([], dtypes.float32, name='total')
     count = metric_variable([], dtypes.float32, name='count')
 
     if weights is None:
-      num_values = math_ops.to_float(array_ops.size(values))
+      num_values = math_ops.cast(array_ops.size(values), dtypes.float32)
     else:
       values, _, weights = _remove_squeezable_dimensions(
           predictions=values, labels=None, weights=weights)
       weights = weights_broadcast_ops.broadcast_weights(
-          math_ops.to_float(weights), values)
+          math_ops.cast(weights, dtypes.float32), values)
       values = math_ops.multiply(values, weights)
       num_values = math_ops.reduce_sum(weights)
 
@@ -452,7 +452,8 @@ def accuracy(labels,
   predictions.get_shape().assert_is_compatible_with(labels.get_shape())
   if labels.dtype != predictions.dtype:
     predictions = math_ops.cast(predictions, labels.dtype)
-  is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
+  is_correct = math_ops.cast(
+      math_ops.equal(predictions, labels), dtypes.float32)
   return mean(is_correct, weights, metrics_collections, updates_collections,
               name or 'accuracy')
 
@@ -523,7 +524,7 @@ def _confusion_matrix_at_thresholds(labels,
           message='predictions must be in [0, 1]')
   ]):
     predictions, labels, weights = _remove_squeezable_dimensions(
-        predictions=math_ops.to_float(predictions),
+        predictions=math_ops.cast(predictions, dtypes.float32),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
 
@@ -558,7 +559,7 @@ def _confusion_matrix_at_thresholds(labels,
 
   if weights is not None:
     weights = weights_broadcast_ops.broadcast_weights(
-        math_ops.to_float(weights), predictions)
+        math_ops.cast(weights, dtypes.float32), predictions)
     weights_tiled = array_ops.tile(
         array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
     thresh_tiled.get_shape().assert_is_compatible_with(
@@ -572,8 +573,8 @@ def _confusion_matrix_at_thresholds(labels,
   if 'tp' in includes:
     true_p = metric_variable(
         [num_thresholds], dtypes.float32, name='true_positives')
-    is_true_positive = math_ops.to_float(
-        math_ops.logical_and(label_is_pos, pred_is_pos))
+    is_true_positive = math_ops.cast(
+        math_ops.logical_and(label_is_pos, pred_is_pos), dtypes.float32)
     if weights_tiled is not None:
       is_true_positive *= weights_tiled
     update_ops['tp'] = state_ops.assign_add(true_p,
@@ -584,8 +585,8 @@ def _confusion_matrix_at_thresholds(labels,
   if 'fn' in includes:
     false_n = metric_variable(
         [num_thresholds], dtypes.float32, name='false_negatives')
-    is_false_negative = math_ops.to_float(
-        math_ops.logical_and(label_is_pos, pred_is_neg))
+    is_false_negative = math_ops.cast(
+        math_ops.logical_and(label_is_pos, pred_is_neg), dtypes.float32)
     if weights_tiled is not None:
       is_false_negative *= weights_tiled
     update_ops['fn'] = state_ops.assign_add(false_n,
@@ -596,8 +597,8 @@ def _confusion_matrix_at_thresholds(labels,
   if 'tn' in includes:
     true_n = metric_variable(
         [num_thresholds], dtypes.float32, name='true_negatives')
-    is_true_negative = math_ops.to_float(
-        math_ops.logical_and(label_is_neg, pred_is_neg))
+    is_true_negative = math_ops.cast(
+        math_ops.logical_and(label_is_neg, pred_is_neg), dtypes.float32)
     if weights_tiled is not None:
       is_true_negative *= weights_tiled
     update_ops['tn'] = state_ops.assign_add(true_n,
@@ -608,8 +609,8 @@ def _confusion_matrix_at_thresholds(labels,
   if 'fp' in includes:
     false_p = metric_variable(
         [num_thresholds], dtypes.float32, name='false_positives')
-    is_false_positive = math_ops.to_float(
-        math_ops.logical_and(label_is_neg, pred_is_pos))
+    is_false_positive = math_ops.cast(
+        math_ops.logical_and(label_is_neg, pred_is_pos), dtypes.float32)
     if weights_tiled is not None:
       is_false_positive *= weights_tiled
     update_ops['fp'] = state_ops.assign_add(false_p,
@@ -1019,7 +1020,7 @@ def mean_per_class_accuracy(labels,
 
   with variable_scope.variable_scope(name, 'mean_accuracy',
                                      (predictions, labels, weights)):
-    labels = math_ops.to_int64(labels)
+    labels = math_ops.cast(labels, dtypes.int64)
 
     # Flatten the input if its rank > 1.
     if labels.get_shape().ndims > 1:
@@ -1038,12 +1039,13 @@ def mean_per_class_accuracy(labels,
 
     if labels.dtype != predictions.dtype:
       predictions = math_ops.cast(predictions, labels.dtype)
-    is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
+    is_correct = math_ops.cast(
+        math_ops.equal(predictions, labels), dtypes.float32)
 
     if weights is not None:
       if weights.get_shape().ndims > 1:
         weights = array_ops.reshape(weights, [-1])
-      weights = math_ops.to_float(weights)
+      weights = math_ops.cast(weights, dtypes.float32)
 
       is_correct *= weights
       ones *= weights
@@ -1135,9 +1137,11 @@ def mean_iou(labels,
 
     def compute_mean_iou(_, total_cm):
       """Compute the mean intersection-over-union via the confusion matrix."""
-      sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
-      sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
-      cm_diag = math_ops.to_float(array_ops.diag_part(total_cm))
+      sum_over_row = math_ops.cast(
+          math_ops.reduce_sum(total_cm, 0), dtypes.float32)
+      sum_over_col = math_ops.cast(
+          math_ops.reduce_sum(total_cm, 1), dtypes.float32)
+      cm_diag = math_ops.cast(array_ops.diag_part(total_cm), dtypes.float32)
       denominator = sum_over_row + sum_over_col - cm_diag
 
       # The mean is only computed over classes that appear in the
@@ -1295,7 +1299,7 @@ def mean_squared_error(labels,
 
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
-  squared_error = math_ops.square(labels - predictions)
+  squared_error = math_ops.squared_difference(labels, predictions)
   return mean(squared_error, weights, metrics_collections, updates_collections,
               name or 'mean_squared_error')
 
@@ -1352,7 +1356,7 @@ def mean_tensor(values,
                        'eager execution is enabled.')
 
   with variable_scope.variable_scope(name, 'mean', (values, weights)):
-    values = math_ops.to_float(values)
+    values = math_ops.cast(values, dtypes.float32)
     total = metric_variable(
         values.get_shape(), dtypes.float32, name='total_tensor')
     count = metric_variable(
@@ -1363,7 +1367,7 @@ def mean_tensor(values,
       values, _, weights = _remove_squeezable_dimensions(
           predictions=values, labels=None, weights=weights)
       weights = weights_broadcast_ops.broadcast_weights(
-          math_ops.to_float(weights), values)
+          math_ops.cast(weights, dtypes.float32), values)
       values = math_ops.multiply(values, weights)
       num_values = math_ops.multiply(num_values, weights)
 
@@ -1434,7 +1438,8 @@ def percentage_below(values,
     raise RuntimeError('tf.metrics.percentage_below is not supported when '
                        'eager execution is enabled.')
 
-  is_below_threshold = math_ops.to_float(math_ops.less(values, threshold))
+  is_below_threshold = math_ops.cast(
+      math_ops.less(values, threshold), dtypes.float32)
   return mean(is_below_threshold, weights, metrics_collections,
               updates_collections, name or 'percentage_below_threshold')
 
@@ -1469,11 +1474,11 @@ def _count_condition(values,
   check_ops.assert_type(values, dtypes.bool)
   count = metric_variable([], dtypes.float32, name='count')
 
-  values = math_ops.to_float(values)
+  values = math_ops.cast(values, dtypes.float32)
   if weights is not None:
     with ops.control_dependencies((check_ops.assert_rank_in(
         weights, (0, array_ops.rank(values))),)):
-      weights = math_ops.to_float(weights)
+      weights = math_ops.cast(weights, dtypes.float32)
       values = math_ops.multiply(values, weights)
 
   value_tensor = _aggregate_variable(count, metrics_collections)
@@ -2227,7 +2232,7 @@ def _select_class_id(ids, selected_id):
 
   # Intersect `ids` with the selected ID.
   filled_selected_id = array_ops.fill(filled_selected_id_shape,
-                                      math_ops.to_int64(selected_id))
+                                      math_ops.cast(selected_id, dtypes.int64))
   result = sets.set_intersection(filled_selected_id, ids)
   return sparse_tensor.SparseTensor(
       indices=result.indices, values=result.values, dense_shape=ids_shape)
@@ -2292,11 +2297,11 @@ def _sparse_true_positive_at_k(labels,
     labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
                                                      class_id)
     tp = sets.set_size(sets.set_intersection(predictions_idx, labels))
-    tp = math_ops.to_double(tp)
+    tp = math_ops.cast(tp, dtypes.float64)
     if weights is not None:
       with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
           weights, tp),)):
-        weights = math_ops.to_double(weights)
+        weights = math_ops.cast(weights, dtypes.float64)
         tp = math_ops.multiply(tp, weights)
     return tp
 
@@ -2346,7 +2351,7 @@ def _streaming_sparse_true_positive_at_k(labels,
         labels=labels,
         class_id=class_id,
         weights=weights)
-    batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp))
+    batch_total_tp = math_ops.cast(math_ops.reduce_sum(tp), dtypes.float64)
 
     var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_tp, name='update')
@@ -2387,11 +2392,11 @@ def _sparse_false_negative_at_k(labels,
                                                      class_id)
     fn = sets.set_size(
         sets.set_difference(predictions_idx, labels, aminusb=False))
-    fn = math_ops.to_double(fn)
+    fn = math_ops.cast(fn, dtypes.float64)
     if weights is not None:
       with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
           weights, fn),)):
-        weights = math_ops.to_double(weights)
+        weights = math_ops.cast(weights, dtypes.float64)
         fn = math_ops.multiply(fn, weights)
     return fn
 
@@ -2441,7 +2446,7 @@ def _streaming_sparse_false_negative_at_k(labels,
         labels=labels,
         class_id=class_id,
         weights=weights)
-    batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn))
+    batch_total_fn = math_ops.cast(math_ops.reduce_sum(fn), dtypes.float64)
 
     var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_fn, name='update')
@@ -2597,7 +2602,7 @@ def recall_at_top_k(labels,
   with ops.name_scope(name, _at_k_name('recall', k, class_id=class_id),
                       (predictions_idx, labels, weights)) as scope:
     labels = _maybe_expand_labels(labels, predictions_idx)
-    top_k_idx = math_ops.to_int64(predictions_idx)
+    top_k_idx = math_ops.cast(predictions_idx, dtypes.int64)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
         predictions_idx=top_k_idx,
         labels=labels,
@@ -2999,7 +3004,8 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
   """
   with ops.name_scope(None, 'average_precision',
                       (predictions_idx, labels)) as scope:
-    predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx')
+    predictions_idx = math_ops.cast(
+        predictions_idx, dtypes.int64, name='predictions_idx')
     if predictions_idx.get_shape().ndims == 0:
       raise ValueError('The rank of predictions_idx must be at least 1.')
     k = predictions_idx.get_shape().as_list()[-1]
@@ -3035,12 +3041,12 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
     retrieved_per_k = math_ops.cumsum(
         array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k')
     precision_per_k = math_ops.div(
-        math_ops.to_double(tp_per_k),
-        math_ops.to_double(retrieved_per_k),
+        math_ops.cast(tp_per_k, dtypes.float64),
+        math_ops.cast(retrieved_per_k, dtypes.float64),
         name='precision_per_k')
     relevant_precision_per_k = math_ops.multiply(
         precision_per_k,
-        math_ops.to_double(relevant_per_k),
+        math_ops.cast(relevant_per_k, dtypes.float64),
         name='relevant_precision_per_k')
 
     # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
@@ -3049,7 +3055,7 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
 
     # Divide by number of relevant items to get average precision. These are
     # the "num_relevant_items" and "AveP" terms from the formula above.
-    num_relevant_items = math_ops.to_double(_num_relevant(labels, k))
+    num_relevant_items = math_ops.cast(_num_relevant(labels, k), dtypes.float64)
     return math_ops.div(precision_sum, num_relevant_items, name=scope)
 
 
@@ -3110,7 +3116,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
         predictions_idx=predictions_idx, labels=labels)
     if weights is not None:
       weights = weights_broadcast_ops.broadcast_weights(
-          math_ops.to_double(weights), average_precision)
+          math_ops.cast(weights, dtypes.float64), average_precision)
       average_precision = math_ops.multiply(average_precision, weights)
 
     # Create accumulation variables and update ops for max average precision and
@@ -3122,8 +3128,8 @@ def _streaming_sparse_average_precision_at_top_k(labels,
       #   `average_precision` rows.
       max_var = metric_variable([], dtypes.float64, name=max_scope)
       if weights is None:
-        batch_max = math_ops.to_double(
-            array_ops.size(average_precision, name='batch_max'))
+        batch_max = math_ops.cast(
+            array_ops.size(average_precision, name='batch_max'), dtypes.float64)
       else:
         batch_max = math_ops.reduce_sum(weights, name='batch_max')
       max_update = state_ops.assign_add(max_var, batch_max, name='update')
@@ -3280,11 +3286,11 @@ def _sparse_false_positive_at_k(labels,
                                                      class_id)
     fp = sets.set_size(
         sets.set_difference(predictions_idx, labels, aminusb=True))
-    fp = math_ops.to_double(fp)
+    fp = math_ops.cast(fp, dtypes.float64)
     if weights is not None:
       with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
           weights, fp),)):
-        weights = math_ops.to_double(weights)
+        weights = math_ops.cast(weights, dtypes.float64)
         fp = math_ops.multiply(fp, weights)
     return fp
 
@@ -3334,7 +3340,7 @@ def _streaming_sparse_false_positive_at_k(labels,
         labels=labels,
         class_id=class_id,
         weights=weights)
-    batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
+    batch_total_fp = math_ops.cast(math_ops.reduce_sum(fp), dtypes.float64)
 
     var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
@@ -3402,7 +3408,7 @@ def precision_at_top_k(labels,
   with ops.name_scope(name, _at_k_name('precision', k, class_id=class_id),
                       (predictions_idx, labels, weights)) as scope:
     labels = _maybe_expand_labels(labels, predictions_idx)
-    top_k_idx = math_ops.to_int64(predictions_idx)
+    top_k_idx = math_ops.cast(predictions_idx, dtypes.int64)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
         predictions_idx=top_k_idx,
         labels=labels,
@@ -3642,7 +3648,7 @@ def specificity_at_sensitivity(labels,
       min_val = math_ops.reduce_min(math_ops.abs(sensitivities - sensitivity))
       indices_at_minval = math_ops.equal(
           math_ops.abs(sensitivities - sensitivity), min_val)
-      indices_at_minval = math_ops.to_int64(indices_at_minval)
+      indices_at_minval = math_ops.cast(indices_at_minval, dtypes.int64)
       indices_at_minval = math_ops.cumsum(indices_at_minval)
       tf_index = math_ops.argmax(indices_at_minval, 0)
       tf_index = math_ops.cast(tf_index, dtypes.int32)
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index fedf8e44c3ddfdac9739b88e019ed6d1e4485ab2..e978f1d32601890f8eb9b54fdd5738f626b7f863 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -206,7 +206,6 @@ class BatchNormalizationTest(test.TestCase):
                                   2)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 6ca2b2aafe3145978e6610cded32719173368eb8..a1fdddc4e4434b9eccb8e98edea45e4a6df4b8fd 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -50,7 +50,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")),
+          data_format=op.get_attr("data_format").decode()),
       nn_ops.conv2d(
           grad,
           op.inputs[1],
@@ -58,7 +58,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format"))
+          data_format=op.get_attr("data_format").decode())
   ]
 
 
@@ -73,7 +73,7 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")), None,
+          data_format=op.get_attr("data_format").decode()), None,
       nn_ops.conv2d(
           op.inputs[0],
           grad,
@@ -81,13 +81,65 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format").decode())
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
+def _DepthwiseConv2dNativeBackpropInputGrad(op, grad):
+  """The derivatives for deconvolution.
+
+  Args:
+    op: the Deconvolution op.
+    grad: the tensor representing the gradient w.r.t. the output
+
+  Returns:
+    the gradients w.r.t. the input and the filter
+  """
+  return [
+      None,
+      nn_ops.depthwise_conv2d_native_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.depthwise_conv2d_native(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format"))
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
+def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad):
+  return [
+      nn_ops.depthwise_conv2d_native_backprop_input(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")), None,
+      nn_ops.depthwise_conv2d_native(
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
           data_format=op.get_attr("data_format"))
   ]
 
 
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -110,7 +162,7 @@ def _Conv3DGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       None,
       nn_ops.conv3d_backprop_filter_v2(
@@ -133,7 +185,7 @@ def _Conv3DBackpropInputGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -161,7 +213,7 @@ def _AvgPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("AvgPool3DGrad")
@@ -172,7 +224,7 @@ def _AvgPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -184,7 +236,7 @@ def _MaxPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("MaxPool3DGrad")
@@ -200,7 +252,7 @@ def _MaxPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3DGradGrad")
@@ -216,7 +268,7 @@ def _MaxPool3DGradGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("Softmax")
@@ -232,8 +284,8 @@ def _SoftmaxGrad(op, grad_softmax):
 
   Args:
      op: the Softmax op.
-     grad_softmax:  the tensor representing the gradient w.r.t. the
-       softmax output.
+     grad_softmax:  the tensor representing the gradient w.r.t. the softmax
+       output.
 
   Returns:
      gradient w.r.t the input to the softmax
@@ -309,7 +361,6 @@ def _BiasAddGradGrad(op, received_grad):
     data_format = None
 
   shape = array_ops.shape(op.inputs[0])
-  rank = array_ops.rank(op.inputs[0])
   bias_shape = array_ops.shape(received_grad)
 
   if data_format == b"NCHW":
@@ -360,9 +411,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0, grad * op.inputs[0],
-                          array_ops.zeros(
-                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
+          array_ops.where(
+              elu_x < 0, grad * op.inputs[0],
+              array_ops.zeros(shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -370,11 +421,9 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(x < 0.,
-                          gen_nn_ops.elu_grad(grad,
-                                              op.outputs[0] + scale_alpha),
-                          array_ops.zeros(
-                              shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(
+              x < 0., gen_nn_ops.elu_grad(grad, op.outputs[0] + scale_alpha),
+              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -485,10 +534,10 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(array_ops.expand_dims(grad_grad, 1),
-                        array_ops.expand_dims(softmax, 2)),
-        axis=1)) *
-             softmax)
+        math_ops.matmul(
+            array_ops.expand_dims(grad_grad, 1),
+            array_ops.expand_dims(softmax, 2)),
+        axis=1)) * softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -649,13 +698,15 @@ def _MaxPoolGradV2(op, grad):
 
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
+  del unused_argmax_grad
   return gen_nn_ops.max_pool_grad_with_argmax(
       op.inputs[0],
       grad,
       op.outputs[1],
       op.get_attr("ksize"),
       op.get_attr("strides"),
-      padding=op.get_attr("padding"))
+      padding=op.get_attr("padding"),
+      include_batch_in_index=op.get_attr("include_batch_in_index"))
 
 
 @ops.RegisterGradient("MaxPoolGrad")
@@ -785,9 +836,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   Args:
     op: The BatchNormOp for which we need to compute gradients.
     use_v2: Boolean indicating whether to use the V2 version of the fused batch
-            norm gradient.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_y.
+      norm gradient.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_y.
 
   Returns:
     grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
@@ -874,8 +925,7 @@ def _BatchNormGrad(grad_y,
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
     is_training: A bool value to indicate the operation is for training
-      (default)
-        or inference.
+      (default) or inference.
 
   Returns:
     A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
@@ -939,9 +989,9 @@ def _FusedBatchNormGradGrad(op, *grad):
 
   Args:
     op: The FusedBatchNormGradOp for which we need to compute gradients.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_grad_x, grad[1] as grad_grad_scale,
-          grad[2] as grad_grad_offset.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
+      grad_grad_offset.
 
   Returns:
     A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
@@ -1007,29 +1057,31 @@ def _TopKGrad(op, grad, _):
   ind_shape = array_ops.shape(op.outputs[1])
 
   # int32 is not supported on GPU hence up-casting
-  ind_lastdim = array_ops.gather(math_ops.cast(
-      ind_shape, dtypes.int64), array_ops.size(ind_shape) - 1)
+  ind_lastdim = array_ops.gather(
+      math_ops.cast(ind_shape, dtypes.int64),
+      array_ops.size(ind_shape) - 1)
   # Flatten indices to 2D.
   ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim]))
 
-  in_lastdim = array_ops.gather(math_ops.cast(
-      in_shape, dtypes.int64), array_ops.size(in_shape) - 1)
+  in_lastdim = array_ops.gather(
+      math_ops.cast(in_shape, dtypes.int64),
+      array_ops.size(in_shape) - 1)
   outerdim = array_ops.shape(ind_2d)[0]
   # Compute linear indices (flattened to 1D).
-  ind = array_ops.reshape(ind_2d + math_ops.cast(array_ops.expand_dims(
-      math_ops.range(0, math_ops.cast(outerdim, dtypes.int64)
-                     * in_lastdim, in_lastdim), -1), dtypes.int32), [-1])
+  ind = array_ops.reshape(
+      ind_2d + math_ops.cast(
+          array_ops.expand_dims(
+              math_ops.range(0,
+                             math_ops.cast(outerdim, dtypes.int64) * in_lastdim,
+                             in_lastdim), -1), dtypes.int32), [-1])
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
   return [
       array_ops.reshape(
           array_ops.scatter_nd(
-              array_ops.expand_dims(ind, -1),
-              array_ops.reshape(grad, [-1]),
-              [math_ops.reduce_prod(in_shape)]
-          ),
-          in_shape),
+              array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]),
+              [math_ops.reduce_prod(in_shape)]), in_shape),
       array_ops.zeros([], dtype=dtypes.int32)
   ]
 
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 95e05a977b856505f0b608442e85fda8468ead1f..783656a86932019e373e42b236acfacf96245faf 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -23,9 +23,11 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
@@ -49,5 +51,111 @@ class Relu6OpTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
 
+class Conv2dOpTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
+class DepthwiseConv2dTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 7abfde5149acfb3da6b27e03f5ddd95fec746db6..930226efea89d08a7f23b2f8eafa7dcb64eed34e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -32,7 +32,7 @@ from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.deprecation import deprecated_argument_lookup
@@ -104,7 +104,7 @@ def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
     return result
 
 
-@tf_export("nn.sigmoid_cross_entropy_with_logits")
+@tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
 def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
     _sentinel=None,
     labels=None,
@@ -184,6 +184,57 @@ def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
         name=name)
 
 
+# Note: intentionally calling this v2 to not allow existing code with indirect
+# imports to ignore the sentinel behavior.
+@tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
+def sigmoid_cross_entropy_with_logits_v2(  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
+  """Computes sigmoid cross entropy given `logits`.
+
+  Measures the probability error in discrete classification tasks in which each
+  class is independent and not mutually exclusive.  For instance, one could
+  perform multilabel classification where a picture can contain both an elephant
+  and a dog at the same time.
+
+  For brevity, let `x = logits`, `z = labels`.  The logistic loss is
+
+        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
+      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
+      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
+      = (1 - z) * x + log(1 + exp(-x))
+      = x - x * z + log(1 + exp(-x))
+
+  For x < 0, to avoid overflow in exp(-x), we reformulate the above
+
+        x - x * z + log(1 + exp(-x))
+      = log(exp(x)) - x * z + log(1 + exp(-x))
+      = - x * z + log(1 + exp(x))
+
+  Hence, to ensure stability and avoid overflow, the implementation uses this
+  equivalent formulation
+
+      max(x, 0) - x * z + log(1 + exp(-abs(x)))
+
+  `logits` and `labels` must have the same type and shape.
+
+  Args:
+    labels: A `Tensor` of the same type and shape as `logits`.
+    logits: A `Tensor` of type `float32` or `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of the same shape as `logits` with the componentwise
+    logistic losses.
+
+  Raises:
+    ValueError: If `logits` and `labels` do not have the same shape.
+  """
+  return sigmoid_cross_entropy_with_logits(
+      logits=logits, labels=labels, name=name)
+
 @tf_export("nn.weighted_cross_entropy_with_logits")
 def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
   """Computes a weighted cross entropy.
@@ -456,7 +507,8 @@ def depthwise_conv2d(input,
                      padding,
                      rate=None,
                      name=None,
-                     data_format=None):
+                     data_format=None,
+                     dilations=None):
   """Depthwise 2-D convolution.
 
   Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
@@ -492,12 +544,14 @@ def depthwise_conv2d(input,
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: Alias of rate.
 
   Returns:
     A 4-D `Tensor` with shape according to `data_format`.  E.g., for
     "NHWC" format, shape is
     `[batch, out_height, out_width, in_channels * channel_multiplier].`
   """
+  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "depthwise", [input, filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
     filter = ops.convert_to_tensor(filter, name="filter_in")
@@ -591,7 +645,8 @@ def separable_conv2d(input,
                      padding,
                      rate=None,
                      name=None,
-                     data_format=None):
+                     data_format=None,
+                     dilations=None):
   """2-D convolution with separable filters.
 
   Performs a depthwise convolution that acts separately on channels followed by
@@ -631,12 +686,14 @@ def separable_conv2d(input,
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: Alias of rate.
 
   Returns:
     A 4-D `Tensor` with shape according to 'data_format'. For
       example, with data_format="NHWC", shape is [batch, out_height,
       out_width, out_channels].
   """
+  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "separable_conv2d",
                       [input, depthwise_filter, pointwise_filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
@@ -751,7 +808,8 @@ def separable_conv2d_v2(
 
 
 @tf_export(v1=["nn.sufficient_statistics"])
-def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
+def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None,
+                          keepdims=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
   These sufficient statistics are computed using the one pass algorithm on
@@ -766,6 +824,7 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
       close to the true mean provides the most numerically stable results.
     keep_dims: produce statistics with the same dimensionality as the input.
     name: Name used to scope the operations that compute the sufficient stats.
+    keepdims: Alias for keep_dims.
 
   Returns:
     Four `Tensor` objects of the same type as `x`:
@@ -776,6 +835,10 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     * the shift by which the mean must be corrected or None if `shift` is None.
   """
   axes = list(set(axes))
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "sufficient_statistics", [x, shift]):
     x = ops.convert_to_tensor(x, name="x")
     x_shape = x.get_shape()
@@ -867,7 +930,8 @@ def moments(
     axes,
     shift=None,  # pylint: disable=unused-argument
     name=None,
-    keep_dims=False):
+    keep_dims=None,
+    keepdims=None):
   """Calculate the mean and variance of `x`.
 
   The mean and variance are calculated by aggregating the contents of `x`
@@ -890,10 +954,15 @@ def moments(
     shift: Not used in the current implementation
     name: Name used to scope the operations that compute the moments.
     keep_dims: produce moments with the same dimensionality as the input.
+    keepdims: Alias to keep_dims.
 
   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "moments", [x, axes]):
     # The dynamic range of fp16 is too limited to support the collection of
     # sufficient statistics. As a workaround we simply perform the operations
@@ -957,7 +1026,8 @@ def moments_v2(
 
 
 @tf_export(v1=["nn.weighted_moments"])
-def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
+def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None,
+                     keepdims=None):
   """Returns the frequency-weighted mean and variance of `x`.
 
   Args:
@@ -968,10 +1038,15 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
       broadcast with x.
     name: Name used to scope the operation.
     keep_dims: Produce moments with the same dimensionality as the input.
+    keepdims: Alias of keep_dims.
 
   Returns:
     Two tensors: `weighted_mean` and `weighted_variance`.
   """
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
     x = ops.convert_to_tensor(x, name="x")
     frequency_weights = ops.convert_to_tensor(
@@ -1184,14 +1259,17 @@ def fused_batch_norm(
 
 
 @tf_export(v1=["nn.batch_norm_with_global_normalization"])
-def batch_norm_with_global_normalization(t,
-                                         m,
-                                         v,
-                                         beta,
-                                         gamma,
-                                         variance_epsilon,
-                                         scale_after_normalization,
-                                         name=None):
+def batch_norm_with_global_normalization(t=None,
+                                         m=None,
+                                         v=None,
+                                         beta=None,
+                                         gamma=None,
+                                         variance_epsilon=None,
+                                         scale_after_normalization=None,
+                                         name=None,
+                                         input=None,  # pylint: disable=redefined-builtin
+                                         mean=None,
+                                         variance=None):
   """Batch normalization.
 
   This op is deprecated. See `tf.nn.batch_normalization`.
@@ -1213,10 +1291,16 @@ def batch_norm_with_global_normalization(t,
     scale_after_normalization: A bool indicating whether the resulted tensor
       needs to be multiplied with gamma.
     name: A name for this operation (optional).
+    input: Alias for t.
+    mean: Alias for m.
+    variance: Alias for v.
 
   Returns:
      A batch-normalized `t`.
   """
+  t = deprecated_argument_lookup("input", input, "t", t)
+  m = deprecated_argument_lookup("mean", mean, "m", m)
+  v = deprecated_argument_lookup("variance", variance, "v", v)
   return batch_normalization(t, m, v, beta, gamma if scale_after_normalization
                              else None, variance_epsilon, name)
 
@@ -1440,7 +1524,7 @@ def _compute_sampled_logits(weights,
            array_ops.expand_dims(num_sampled, 0)], 0)
       if sampled_logits.dtype != acc_weights.dtype:
         acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
-      sampled_logits += sparse_ops.sparse_to_dense(
+      sampled_logits += gen_sparse_ops.sparse_to_dense(
           sparse_indices,
           sampled_logits_shape,
           acc_weights,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index e0ef9e5e34ad3666540daf91552e9ccb16f1b46c..eb0f8897a889ed28030139ede5f03901b0dd916d 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numbers
 
 import numpy as np
@@ -52,6 +53,30 @@ local_response_normalization = gen_nn_ops.lrn
 # pylint: disable=protected-access
 
 
+def _get_sequence(value, n, channel_index, name):
+  """Formats a value input for gen_nn_ops."""
+  if value is None:
+    value = [1]
+  elif not isinstance(value, collections.Sized):
+    value = [value]
+
+  current_n = len(value)
+  if current_n == n + 2:
+    return value
+  elif current_n == 1:
+    value = list((value[0],) * n)
+  elif current_n == n:
+    value = list(value)
+  else:
+    raise ValueError("{} should be of length 1, {} or {} but was {}".format(
+        name, n, n + 2, current_n))
+
+  if channel_index == 1:
+    return [1, 1] + value
+  else:
+    return [1] + value + [1]
+
+
 def _non_atrous_convolution(
     input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
@@ -275,6 +300,24 @@ def dilation2d_v2(
                                name=name)
 
 
+@tf_export(v1=["nn.dilation2d"])
+def dilation2d_v1(  # pylint: disable=missing-docstring
+    input,  # pylint: disable=redefined-builtin
+    filter=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    rates=None,
+    padding=None,
+    name=None,
+    filters=None,
+    dilations=None):
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  rates = deprecated_argument_lookup("dilations", dilations, "rates", rates)
+  return gen_nn_ops.dilation2d(input, filter, strides, rates, padding, name)
+
+
+dilation2d_v1.__doc__ = gen_nn_ops.dilation2d.__doc__
+
+
 @tf_export("nn.with_space_to_batch")
 def with_space_to_batch(
     input,  # pylint: disable=redefined-builtin
@@ -487,7 +530,7 @@ class _WithSpaceToBatch(object):
     if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims):
       raise ValueError(
           "spatial_dims must be a montonically increasing sequence of positive "
-          "integers")  # pylint: disable=line-too-long
+          "integers")
 
     if data_format is not None and data_format.startswith("NC"):
       expected_input_rank = spatial_dims[-1]
@@ -721,8 +764,9 @@ def convolution(
     strides=None,
     dilation_rate=None,
     name=None,
-    data_format=None):
-  # pylint: disable=line-too-long
+    data_format=None,
+    filters=None,
+    dilations=None):
   """Computes sums of N-D convolutions (actually cross-correlation).
 
   This also supports either output striding via the optional `strides` parameter
@@ -807,6 +851,8 @@ def convolution(
       starts with "NC").  For N=1, the valid values are "NWC" (default) and
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    filters: Alias of filter.
+    dilations: Alias of dilation_rate.
 
   Returns:
     A `Tensor` with the same type as `input` of shape
@@ -834,21 +880,17 @@ def convolution(
       is other than `"VALID"` or `"SAME"`, or if data_format is invalid.
 
   """
-  # pylint: enable=line-too-long
-  with ops.name_scope(name, "convolution", [input, filter]) as name:
-    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
-    input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    filter_shape = filter.get_shape()
-    op = Convolution(
-        input_shape,
-        filter_shape,
-        padding,
-        strides=strides,
-        dilation_rate=dilation_rate,
-        name=name,
-        data_format=data_format)
-    return op(input, filter)
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  dilation_rate = deprecated_argument_lookup(
+      "dilations", dilations, "dilation_rate", dilation_rate)
+  return convolution_internal(
+      input,
+      filter,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilation_rate,
+      name=name)
 
 
 @tf_export("nn.convolution", v1=[])
@@ -860,14 +902,15 @@ def convolution_v2(
     data_format=None,
     dilations=None,
     name=None):
-  return convolution(
+  return convolution_internal(
       input,  # pylint: disable=redefined-builtin
       filters,
-      padding=padding,
       strides=strides,
-      dilation_rate=dilations,
-      name=name,
-      data_format=data_format)
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
+
 
 convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
@@ -875,6 +918,75 @@ convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     "filter", "filters")
 
 
+def convolution_internal(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  """Internal function which performs rank agnostic convolution."""
+  with ops.name_scope(name, "convolution", [input, filters]) as name:
+    if isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape.rank is not None:
+      n = len(input.shape) - 2
+    elif not isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape is not None:
+      n = len(input.shape) - 2
+    elif isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape.rank is not None:
+      n = len(filters.shape) - 2
+    elif not isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape is not None:
+      n = len(filters.shape) - 2
+    else:
+      raise ValueError("rank of input or filter must be known")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+    if data_format is None:
+      channel_index = n + 1
+    else:
+      channel_index = 1 if data_format.startswith("NC") else n + 1
+
+    strides = _get_sequence(strides, n, channel_index, "strides")
+    dilations = _get_sequence(dilations, n, channel_index, "dilations")
+
+    conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d}
+
+    if all(i == 1 for i in dilations):
+      # fast path if no dilation as gradient only supported on GPU for dilations
+      op = conv_ops[n]
+      return op(
+          input,
+          filters,
+          strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations,
+          name=name)
+    else:
+      if channel_index == 1:
+        strides = strides[2:]
+        dilations = dilations[2:]
+      else:
+        strides = strides[1:-1]
+        dilations = dilations[1:-1]
+
+      op = Convolution(
+          tensor_shape.as_shape(input.shape),
+          tensor_shape.as_shape(filters.shape),
+          padding,
+          strides=strides,
+          dilation_rate=dilations,
+          name=name,
+          data_format=data_format)
+      return op(input, filters)
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -975,8 +1087,8 @@ def pool(
     dilation_rate=None,
     strides=None,
     name=None,
-    data_format=None):
-  # pylint: disable=line-too-long
+    data_format=None,
+    dilations=None):
   """Performs an N-D pooling operation.
 
   In the case that `data_format` does not start with "NC", computes for
@@ -1032,6 +1144,7 @@ def pool(
       starts with "NC").  For N=1, the valid values are "NWC" (default) and
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: Alias for dilation_rate
 
   Returns:
     Tensor of rank N+2, of shape
@@ -1056,6 +1169,8 @@ def pool(
     ValueError: if arguments are invalid.
 
   """
+  dilation_rate = deprecated_argument_lookup(
+      "dilations", dilations, "dilation_rate", dilation_rate)
   # pylint: enable=line-too-long
   with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
                       [input]) as scope:
@@ -1411,6 +1526,250 @@ def _convert_padding(padding):
   return padding, explicit_paddings
 
 
+@tf_export(v1=["nn.conv1d"])
+@deprecation.deprecated_arg_values(
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
+@deprecation.deprecated_arg_values(
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(
+    value=None,
+    filters=None,
+    stride=None,
+    padding=None,
+    use_cudnn_on_gpu=None,
+    data_format=None,
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    dilations=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `value`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+    input: Alias for value.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
+  with ops.name_scope(name, "conv1d", [value, filters]) as name:
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    if data_format is None or data_format == "NHWC" or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCHW" or data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+    strides = [1] + _get_sequence(stride, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    value = array_ops.expand_dims(value, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, [spatial_start_dim])
+
+
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    stride,
+    padding,
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      stride,
+      padding,
+      use_cudnn_on_gpu=True,
+      data_format=data_format,
+      name=name,
+      dilations=dilations)
+
+
+@tf_export("nn.conv1d_transpose")
+def conv1d_transpose(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  """The transpose of `conv1d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv1d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 3-D `Tensor` of type `float` and shape
+      `[batch, in_width, in_channels]` for `NWC` data format or
+      `[batch, in_channels, in_width]` for `NCW` data format.
+    filters: A 3-D `Tensor` with the same type as `value` and shape
+      `[filter_width, output_channels, in_channels]`.  `filter`'s
+      `in_channels` dimension must match that of `value`.
+    output_shape: A 1-D `Tensor`, containing three elements, representing the
+      output shape of the deconvolution op.
+    strides: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. `'NWC'` and `'NCW'` are supported.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, if
+      `output_shape` is not at 3-element vector, if `padding` is other than
+      `'VALID'` or `'SAME'`, or if `data_format` is invalid.
+  """
+  with ops.name_scope(name, "conv1d_transpose",
+                      [input, filters, output_shape]) as name:
+    # The format could be either NWC or NCW, map to NHWC or NCHW
+    if data_format is None or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    strides = [1] + _get_sequence(strides, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    input = array_ops.expand_dims(input, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    output_shape = list(output_shape)
+    output_shape = output_shape[: spatial_start_dim] + [1] + \
+                   output_shape[spatial_start_dim:]
+
+    result = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, spatial_start_dim)
+
+
 @tf_export("nn.conv2d", v1=[])
 def conv2d_v2(input,  # pylint: disable=redefined-builtin
               filters,
@@ -1452,10 +1811,11 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
     filters: A `Tensor`. Must have the same type as `input`.
       A 4-D tensor of shape
       `[filter_height, filter_width, in_channels, out_channels]`
-    strides: A list of `ints`.
-      1-D tensor of length 4.  The stride of the sliding window for each
-      dimension of `input`. The dimension order is determined by the value of
-      `data_format`, see below for details.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
       padding algorithm to use, or a list indicating the explicit paddings at
       the start and end of each dimension. When explicit padding is used and
@@ -1470,20 +1830,20 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
           [batch, height, width, channels].
       Alternatively, the format could be "NCHW", the data storage order of:
           [batch, channels, height, width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by the
-      value of `data_format`, see above for details. Dilations in the batch and
-      depth dimensions must be 1.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
   # pylint: enable=line-too-long
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
   return conv2d(input,  # pylint: disable=redefined-builtin
                 filters,
                 strides,
@@ -1497,13 +1857,14 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
 @tf_export(v1=["nn.conv2d"])
 def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
     input,
-    filter,
-    strides,
-    padding,
+    filter=None,
+    strides=None,
+    padding=None,
     use_cudnn_on_gpu=True,
     data_format="NHWC",
     dilations=[1, 1, 1, 1],
-    name=None):
+    name=None,
+    filters=None):
   r"""Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 
   Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
@@ -1536,10 +1897,11 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
     filter: A `Tensor`. Must have the same type as `input`.
       A 4-D tensor of shape
       `[filter_height, filter_width, in_channels, out_channels]`
-    strides: A list of `ints`.
-      1-D tensor of length 4.  The stride of the sliding window for each
-      dimension of `input`. The dimension order is determined by the value of
-      `data_format`, see below for details.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
       padding algorithm to use, or a list indicating the explicit paddings at
       the start and end of each dimension. When explicit padding is used and
@@ -1555,18 +1917,29 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
           [batch, height, width, channels].
       Alternatively, the format could be "NCHW", the data storage order of:
           [batch, channels, height, width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by the
-      value of `data_format`, see above for details. Dilations in the batch and
-      depth dimensions must be 1.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
     name: A name for the operation (optional).
+    filters: Alias for filter.
 
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
+  filter = deprecation.deprecated_argument_lookup(
+      "filters", filters, "filter", filter)
   padding, explicit_paddings = _convert_padding(padding)
+  if data_format is None:
+    data_format = "NHWC"
+  channel_index = 1 if data_format.startswith("NC") else 3
+
+  strides = _get_sequence(strides, 2, channel_index, "strides")
+  dilations = _get_sequence(dilations, 2, channel_index, "dilations")
   return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
                            filter,
                            strides,
@@ -1578,70 +1951,6 @@ def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
                            name=name)
 
 
-@tf_export("nn.conv2d_backprop_filter", v1=[])
-def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
-                              filter_sizes,
-                              out_backprop,
-                              strides,
-                              padding,
-                              data_format="NHWC",
-                              dilations=None,
-                              name=None):
-  r"""Computes the gradients of convolution with respect to the filter.
-
-  Args:
-    input: A `Tensor`. Must be one of the following types:
-      `half`, `bfloat16`, `float32`, `float64`.
-      4-D with shape `[batch, in_height, in_width, in_channels]`.
-    filter_sizes: A `Tensor` of type `int32`.
-      An integer vector representing the tensor shape of `filter`,
-      where `filter` is a 4-D
-      `[filter_height, filter_width, in_channels, out_channels]` tensor.
-    out_backprop: A `Tensor`. Must have the same type as `input`.
-      4-D with shape `[batch, out_height, out_width, out_channels]`.
-      Gradients w.r.t. the output of the convolution.
-    strides: A list of `ints`.
-      The stride of the sliding window for each dimension of the input
-      of the convolution. Must be in the same order as the dimension specified
-      with format.
-    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
-      padding algorithm to use, or a list indicating the explicit paddings at
-      the start and end of each dimension. When explicit padding is used and
-      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
-      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
-      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
-      [pad_top, pad_bottom], [pad_left, pad_right]]`.
-    data_format: An optional `string` from: `"NHWC", "NCHW"`.
-      Defaults to `"NHWC"`.
-      Specify the data format of the input and output data. With the
-      default format "NHWC", the data is stored in the order of:
-          [batch, in_height, in_width, in_channels].
-      Alternatively, the format could be "NCHW", the data storage order of:
-          [batch, in_channels, in_height, in_width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by
-      the value of `data_format`, see above for details. Dilations in the batch
-      and depth dimensions must be 1.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`. Has the same type as `input`.
-  """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
-                                filter_sizes,
-                                out_backprop,
-                                strides,
-                                padding,
-                                use_cudnn_on_gpu=True,
-                                data_format=data_format,
-                                dilations=dilations,
-                                name=name)
-
-
 @tf_export(v1=["nn.conv2d_backprop_filter"])
 def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-default-value
     input,
@@ -1702,81 +2011,18 @@ def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-defau
       explicit_paddings, data_format, dilations, name)
 
 
-@tf_export("nn.conv2d_backprop_input", v1=[])
-def conv2d_backprop_input_v2(input_sizes,
-                             filters,
-                             out_backprop,
-                             strides,
-                             padding,
-                             data_format="NHWC",
-                             dilations=None,
-                             name=None):
-  r"""Computes the gradients of convolution with respect to the input.
-
-  Args:
-    input_sizes: A `Tensor` of type `int32`.
-      An integer vector representing the shape of `input`,
-      where `input` is a 4-D `[batch, height, width, channels]` tensor.
-    filters: A `Tensor`. Must be one of the following types:
-      `half`, `bfloat16`, `float32`, `float64`.
-      4-D with shape
-      `[filter_height, filter_width, in_channels, out_channels]`.
-    out_backprop: A `Tensor`. Must have the same type as `filters`.
-      4-D with shape `[batch, out_height, out_width, out_channels]`.
-      Gradients w.r.t. the output of the convolution.
-    strides: A list of `ints`.
-      The stride of the sliding window for each dimension of the input
-      of the convolution. Must be in the same order as the dimension specified
-      with format.
-    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
-      padding algorithm to use, or a list indicating the explicit paddings at
-      the start and end of each dimension. When explicit padding is used and
-      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
-      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
-      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
-      [pad_top, pad_bottom], [pad_left, pad_right]]`.
-    data_format: An optional `string` from: `"NHWC", "NCHW"`.
-      Defaults to `"NHWC"`.
-      Specify the data format of the input and output data. With the
-      default format "NHWC", the data is stored in the order of:
-          [batch, in_height, in_width, in_channels].
-      Alternatively, the format could be "NCHW", the data storage order of:
-          [batch, in_channels, in_height, in_width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by
-      the value of `data_format`, see above for details. Dilations in the batch
-      and depth dimensions must be 1.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`. Has the same type as `filters`.
-  """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return conv2d_backprop_input(input_sizes,
-                               filters,
-                               out_backprop,
-                               strides,
-                               padding,
-                               use_cudnn_on_gpu=True,
-                               data_format=data_format,
-                               dilations=dilations,
-                               name=name)
-
-
 @tf_export(v1=["nn.conv2d_backprop_input"])
 def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-default-value
     input_sizes,
-    filter,
-    out_backprop,
-    strides,
-    padding,
+    filter=None,
+    out_backprop=None,
+    strides=None,
+    padding=None,
     use_cudnn_on_gpu=True,
     data_format="NHWC",
     dilations=[1, 1, 1, 1],
-    name=None):
+    name=None,
+    filters=None):
   r"""Computes the gradients of convolution with respect to the input.
 
   Args:
@@ -1816,10 +2062,13 @@ def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-defaul
       the value of `data_format`, see above for details. Dilations in the batch
       and depth dimensions must be 1.
     name: A name for the operation (optional).
+    filters: Alias for filter.
 
   Returns:
     A `Tensor`. Has the same type as `filter`.
   """
+  filter = deprecation.deprecated_argument_lookup(
+      "filters", filters, "filter", filter)
   padding, explicit_paddings = _convert_padding(padding)
   return gen_nn_ops.conv2d_backprop_input(
       input_sizes, filter, out_backprop, strides, padding, use_cudnn_on_gpu,
@@ -1828,18 +2077,21 @@ def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-defaul
 
 @tf_export(v1=["nn.conv2d_transpose"])
 def conv2d_transpose(
-    value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    strides,
+    value=None,
+    filter=None,  # pylint: disable=redefined-builtin
+    output_shape=None,
+    strides=None,
     padding="SAME",
     data_format="NHWC",
-    name=None):
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    filters=None,
+    dilations=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv2d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv2d` rather than an actual
   deconvolution.
 
   Args:
@@ -1851,12 +2103,25 @@ def conv2d_transpose(
       `in_channels` dimension must match that of `value`.
     output_shape: A 1-D `Tensor` representing the output shape of the
       deconvolution op.
-    strides: A list of ints. The stride of the sliding window for each
-      dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
+    input: Alias for value.
+    filters: Alias for filter.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -1865,70 +2130,90 @@ def conv2d_transpose(
     ValueError: If input/output depth does not match `filter`'s shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
+  value = deprecated_argument_lookup("input", input, "value", value)
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
   with ops.name_scope(name, "conv2d_transpose",
                       [value, filter, output_shape]) as name:
-    if data_format not in ("NCHW", "NHWC"):
-      raise ValueError("data_format has to be either NCHW or NHWC.")
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 3 if data_format == "NHWC" else 1
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[3]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[3]))
-
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}".format(
-          output_shape_.get_shape()))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [4] if reached this point.
-      if not filter.get_shape().dims[2].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[2]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    return gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
+    return conv2d_transpose_v2(
+        value,
+        filter,
+        output_shape,
+        strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
 @tf_export("nn.conv2d_transpose", v1=[])
 def conv2d_transpose_v2(
-    input,
+    input,  # pylint: disable=redefined-builtin
     filters,  # pylint: disable=redefined-builtin
     output_shape,
     strides,
     padding="SAME",
     data_format="NHWC",
+    dilations=None,
     name=None):
-  return conv2d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv2d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+  """The transpose of `conv2d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 4-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 4-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  with ops.name_scope(name, "conv2d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+    dilations = _get_sequence(dilations, 2, channel_index, "dilations")
+
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.atrous_conv2d_transpose")
@@ -1941,9 +2226,9 @@ def atrous_conv2d_transpose(value,
   """The transpose of `atrous_conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `atrous_conv2d` rather than an actual
-  deconvolution.
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `atrous_conv2d` rather than an
+  actual deconvolution.
 
   Args:
     value: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC`
@@ -2000,6 +2285,9 @@ def atrous_conv2d_transpose(value,
       raise ValueError("output_shape must have shape (4,), got {}".format(
           output_shape_.get_shape()))
 
+    if isinstance(output_shape, tuple):
+      output_shape = list(output_shape)
+
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filters.get_shape().dims[2].is_compatible_with(output_shape[3]):
@@ -2089,32 +2377,52 @@ def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
               name=None):
   if dilations is None:
     dilations = [1, 1, 1, 1, 1]
-  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+  return gen_nn_ops.conv3d(input,
                            filters,
                            strides,
                            padding,
                            data_format=data_format,
                            dilations=dilations,
                            name=name)
-tf_export(v1=["nn.conv3d"])(gen_nn_ops.conv3d)
+
+
+@tf_export(v1=["nn.conv3d"])
+def conv3d_v1(  # pylint: disable=missing-docstring,dangerous-default-value
+    input,  # pylint: disable=redefined-builtin
+    filter=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    padding=None,
+    data_format="NDHWC",
+    dilations=[1, 1, 1, 1, 1],
+    name=None,
+    filters=None):
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  return gen_nn_ops.conv3d(
+      input, filter, strides, padding, data_format, dilations, name)
+
+
 conv3d_v2.__doc__ = deprecation.rewrite_argument_docstring(
     gen_nn_ops.conv3d.__doc__, "filter", "filters")
+conv3d_v1.__doc__ = gen_nn_ops.conv3d.__doc__
 
 
 @tf_export(v1=["nn.conv3d_transpose"])
 def conv3d_transpose(
     value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    strides,
+    filter=None,  # pylint: disable=redefined-builtin
+    output_shape=None,
+    strides=None,
     padding="SAME",
     data_format="NDHWC",
-    name=None):
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    filters=None,
+    dilations=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv3d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv3d` rather than an actual
   deconvolution.
 
   Args:
@@ -2132,6 +2440,16 @@ def conv3d_transpose(
     data_format: A string, either `'NDHWC'` or `'NCDHW`' specifying the layout
       of the input and output tensors. Defaults to `'NDHWC'`.
     name: Optional name for the returned tensor.
+    input: Alias of value.
+    filters: Alias of filter.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -2140,68 +2458,166 @@ def conv3d_transpose(
     ValueError: If input/output depth does not match `filter`'s shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
-  with ops.name_scope(name, "conv3d_transpose",
-                      [value, filter, output_shape]) as name:
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 1 if data_format == "NCDHW" else 4
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[4]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[4]))
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  value = deprecated_argument_lookup("input", input, "value", value)
+  return conv3d_transpose_v2(
+      value,
+      filter,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
 
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}".format(
-          output_shape_.get_shape()))
 
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape().dims[3].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[3]))
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(input,  # pylint: disable=redefined-builtin
+                        filters,
+                        output_shape,
+                        strides,
+                        padding="SAME",
+                        data_format="NDHWC",
+                        dilations=None,
+                        name=None):
+  """The transpose of `conv3d`.
 
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 5-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 5-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `D`, `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 0. The dimension order is
+      determined by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv3d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 4
+
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+    dilations = _get_sequence(dilations, 3, channel_index, "dilations")
 
     return gen_nn_ops.conv3d_backprop_input_v2(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
-@tf_export("nn.conv3d_transpose", v1=[])
-def conv3d_transpose_v2(
-    input,
-    filters,
-    output_shape,
-    strides,
-    padding="SAME",
-    data_format="NDHWC",
-    name=None):
-  return conv3d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv3d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+CONV_TRANSPOSE_OPS = (
+    conv1d_transpose,
+    conv2d_transpose_v2,
+    conv3d_transpose_v2,
+)
+
+
+@tf_export("nn.conv_transpose")
+def conv_transpose(input,  # pylint: disable=redefined-builtin
+                   filters,
+                   output_shape,
+                   strides,
+                   padding="SAME",
+                   data_format=None,
+                   dilations=None,
+                   name=None):
+  """The transpose of `convolution`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `convolution` rather than an actual
+  deconvolution.
+
+  Args:
+    input: An N+2 dimensional `Tensor` of shape
+      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
+      not start with "NC" (default), or
+      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
+      with "NC". It must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+    filters: An N+2 dimensional `Tensor` with the same type as `input` and
+      shape `spatial_filter_shape + [in_channels, out_channels]`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the spatial dimensions. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
+      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: An int or list of `ints` that has length `1`, `N` or `N+2`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the spatial dimensions. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details.
+    name: A name for the operation (optional). If not specified "conv_transpose"
+      is used.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv_transpose",
+                      [input, filter, output_shape]) as name:
+    if output_shape is not None:
+      n = len(output_shape) - 2
+    else:
+      raise ValueError("output_shape cannot be None")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "output_shape must be of length 3, 4 or 5 but was {}.".format(n + 2))
+
+    op = CONV_TRANSPOSE_OPS[n-1]
+    return op(
+        input,
+        filters,
+        output_shape,
+        strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.bias_add")
@@ -2219,13 +2635,21 @@ def bias_add(value, bias, data_format=None, name=None):
     bias: A 1-D `Tensor` with size matching the last dimension of `value`.
       Must be the same type as `value` unless `value` is a quantized type,
       in which case a different quantized type may be used.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    data_format: A string. 'N...C' and 'NC...' are supported.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor` with the same type as `value`.
   """
   with ops.name_scope(name, "BiasAdd", [value, bias]) as name:
+    if data_format is not None:
+      if data_format.startswith("NC"):
+        data_format = "NCHW"
+      elif data_format.startswith("N") and data_format.endswith("C"):
+        data_format = "NHWC"
+      else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+
     if not context.executing_eagerly():
       value = ops.convert_to_tensor(value, name="input")
       bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
@@ -2331,10 +2755,10 @@ def leaky_relu(features, alpha=0.2, name=None):
   with ops.name_scope(name, "LeakyRelu", [features, alpha]) as name:
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
-      features = math_ops.to_float(features)
+      features = math_ops.cast(features, dtypes.float32)
     if compat.forward_compatible(2018, 11, 1):
       if isinstance(alpha, np.ndarray):
-        alpha = np.asscalar(alpha)
+        alpha = alpha.item()
       return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features, name=name)
@@ -2403,7 +2827,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
   # We need its original shape for shape inference.
   shape = logits.get_shape()
-  is_last_dim = (dim is -1) or (dim == shape.ndims - 1)
+  is_last_dim = (dim == -1) or (dim == shape.ndims - 1)
 
   if is_last_dim:
     return compute_op(logits, name=name)
@@ -2411,7 +2835,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   dim_val = dim
   if isinstance(dim, ops.Tensor):
     dim_val = tensor_util.constant_value(dim)
-  if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims):
+  if dim_val is not None and not -shape.ndims <= dim_val < shape.ndims:
     raise errors_impl.InvalidArgumentError(
         None, None,
         "Dimension (%d) must be in the range [%d, %d) where %d is the number of"
@@ -2756,7 +3180,8 @@ def softmax_cross_entropy_with_logits(
     labels=None,
     logits=None,
     dim=-1,
-    name=None):
+    name=None,
+    axis=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -2796,12 +3221,14 @@ def softmax_cross_entropy_with_logits(
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
+    axis: Alias for dim.
 
   Returns:
     A `Tensor` that contains the softmax cross entropy loss. Its type is the
     same as `logits` and its shape is the same as `labels` except that it does
     not have the last dimension of `labels`.
   """
+  dim = deprecated_argument_lookup("axis", axis, "dim", dim)
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
 
@@ -2937,83 +3364,482 @@ def sparse_softmax_cross_entropy_with_logits(
         return cost
 
 
-@tf_export("nn.avg_pool")
-def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
+@tf_export("nn.avg_pool", v1=["nn.avg_pool_v2"])
+def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None):  # pylint: disable=redefined-builtin
+  """Performs the avg pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The average pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  avg_pooling_ops = {
+      1: avg_pool1d,
+      2: gen_nn_ops.avg_pool,
+      3: gen_nn_ops.avg_pool3d
+  }
+
+  op = avg_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+
+
+@tf_export(v1=["nn.avg_pool", "nn.avg_pool2d"])
+def avg_pool(value, ksize, strides, padding, data_format="NHWC",
+             name=None, input=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the operation.
+    input: Alias for value.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool", [value]) as name:
+    value = deprecation.deprecated_argument_lookup(
+        "input", input, "value", value)
+
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+@tf_export("nn.avg_pool2d", v1=[])
+def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
   """Performs the average pooling on the input.
 
   Each entry in `output` is the mean of the corresponding size `ksize`
   window in `value`.
 
   Args:
-    value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
-      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
-    ksize: A list or tuple of 4 ints. The size of the window for each dimension
-      of the input tensor.
-    strides: A list or tuple of 4 ints. The stride of the sliding window for
-      each dimension of the input tensor.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    input: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool2D", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+@tf_export("nn.avg_pool1d")
+def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Note internally this op reshapes and uses the underlying 2d operation.
+
+  Args:
+    input: A 3-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1` or `3`. The size of the
+      window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1` or `3`. The stride of
+      the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NWC", "NCW". Defaults to "NWC".
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool1D", [input]) as name:
+    if data_format is None:
+      data_format = "NWC"
+    channel_index = 1 if data_format.startswith("NC") else 2
+    ksize = [1] + _get_sequence(ksize, 1, channel_index, "ksize")
+    strides = [1] + _get_sequence(strides, 1, channel_index, "strides")
+
+    data_format = "NHWC" if data_format == "NWC" else "NCHW"
+    expanding_dim = 1 if data_format == "NWC" else 2
+
+    input = array_ops.expand_dims_v2(input, expanding_dim)
+    result = gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+    return array_ops.squeeze(result, expanding_dim)
+
+
+@tf_export("nn.avg_pool3d")
+def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input: A 5-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `3` or `5`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool3D", [input]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 3, channel_index, "ksize")
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool3d(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool", v1=["nn.max_pool_v2"])
+def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  max_pooling_ops = {
+      1: max_pool1d,
+      2: gen_nn_ops.max_pool,
+      3: gen_nn_ops.max_pool3d
+  }
+
+  op = max_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+
+
+@tf_export(v1=["nn.max_pool"])
+def max_pool(value,
+             ksize,
+             strides,
+             padding,
+             data_format="NHWC",
+             name=None,
+             input=None):  # pylint: disable=redefined-builtin
+  """Performs the max pooling on the input.
+
+  Args:
+    value: A 4-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`.
+      The size of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.
+      The stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
+    name: Optional name for the operation.
+    input: Alias for value.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
+  with ops.name_scope(name, "MaxPool", [value]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.max_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool1d")
+def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):
+  """Performs the max pooling on the input.
+
+  Note internally this op reshapes and uses the underlying 2d operation.
+
+  Args:
+    input: A 3-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1` or `3`. The size of the
+      window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1` or `3`. The stride of
+      the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NWC", "NCW". Defaults to "NWC".
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "MaxPool1d", [input]) as name:
+    if data_format is None:
+      data_format = "NWC"
+    channel_index = 1 if data_format.startswith("NC") else 2
+    ksize = [1] + _get_sequence(ksize, 1, channel_index, "ksize")
+    strides = [1] + _get_sequence(strides, 1, channel_index, "strides")
+
+    data_format = "NHWC" if data_format == "NWC" else "NCHW"
+    expanding_dim = 1 if data_format == "NWC" else 2
+
+    input = array_ops.expand_dims_v2(input, expanding_dim)
+    result = gen_nn_ops.max_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+    return array_ops.squeeze(result, expanding_dim)
+# pylint: enable=redefined-builtin
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool2d")
+def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input: A 4-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
     name: Optional name for the operation.
 
   Returns:
-    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
   """
-  with ops.name_scope(name, "AvgPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops.avg_pool(
-        value,
+  with ops.name_scope(name, "MaxPool2d", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.max_pool(
+        input,
         ksize=ksize,
         strides=strides,
         padding=padding,
         data_format=data_format,
         name=name)
+# pylint: enable=redefined-builtin
 
 
-@tf_export("nn.max_pool")
-def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool3d")
+def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):
   """Performs the max pooling on the input.
 
   Args:
-    value: A 4-D `Tensor` of the format specified by `data_format`.
-    ksize: A list or tuple of 4 ints. The size of the window for each dimension
-      of the input tensor.
-    strides: A list or tuple of 4 ints. The stride of the sliding window for
-      each dimension of the input tensor.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
-    name: Optional name for the operation.
+    input: A 5-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `3` or `5`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC".
+      The data format of the input and output data. With the default format
+      "NDHWC", the data is stored in the order of: [batch, in_depth, in_height,
+        in_width, in_channels]. Alternatively, the format could be "NCDHW", the
+      data storage order is: [batch, in_channels, in_depth, in_height,
+        in_width].
+    name: A name for the operation (optional).
 
   Returns:
     A `Tensor` of format specified by `data_format`.
     The max pooled output tensor.
   """
-  with ops.name_scope(name, "MaxPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops.max_pool(
-        value,
+  with ops.name_scope(name, "MaxPool3D", [input]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 4
+
+    ksize = _get_sequence(ksize, 3, channel_index, "ksize")
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+
+    return gen_nn_ops.max_pool3d(
+        input,
         ksize=ksize,
         strides=strides,
         padding=padding,
         data_format=data_format,
         name=name)
+# pylint: enable=redefined-builtin
 
 
-# pylint: disable=redefined-builtin
 @tf_export("nn.max_pool_with_argmax", v1=[])
-def max_pool_with_argmax_v2(input,
-                            ksize,
-                            strides,
-                            padding,
-                            data_format="NHWC",
-                            output_dtype=dtypes.int64,
-                            name=None):
+def max_pool_with_argmax_v2(
+    input,  # pylint: disable=redefined-builtin
+    ksize,
+    strides,
+    padding,
+    data_format="NHWC",
+    output_dtype=dtypes.int64,
+    include_batch_in_index=False,
+    name=None):
   """Performs max pooling on the input and outputs both max values and indices.
 
   The indices in `argmax` are flattened, so that a maximum value at position
-  `[b, y, x, c]` becomes flattened index
-  `((b * height + y) * width + x) * channels + c`.
+  `[b, y, x, c]` becomes flattened index: `(y * width + x) * channels + c` if
+  `include_batch_in_index` is False;
+  `((b * height + y) * width + x) * channels + c`
+  if `include_batch_in_index` is True.
 
   The indices returned are always in `[0, height) x [0, width)` before
   flattening, even if padding is involved and the mathematically correct answer
@@ -3026,9 +3852,9 @@ def max_pool_with_argmax_v2(input,
       `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
       `uint32`, `uint64`.
       4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-    ksize: A list of `ints` that has length `>= 4`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`.
       The size of the window for each dimension of the input tensor.
-    strides: A list of `ints` that has length `>= 4`.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.
       The stride of the sliding window for each dimension of the
       input tensor.
     padding: A `string` from: `"SAME", "VALID"`.
@@ -3039,6 +3865,8 @@ def max_pool_with_argmax_v2(input,
     output_dtype: An optional `tf.DType` from: `tf.int32, tf.int64`.
       Defaults to `tf.int64`.
       The dtype of the returned argmax tensor.
+    include_batch_in_index: An optional `boolean`. Defaults to `False`.
+      Whether to include batch dimension in flattened index of `argmax`.
     name: A name for the operation (optional).
 
   Returns:
@@ -3051,14 +3879,48 @@ def max_pool_with_argmax_v2(input,
   if data_format != "NHWC":
     raise ValueError("Data formats other than 'NHWC' are not yet supported")
 
-  return gen_nn_ops.max_pool_with_argmax(input=input,
-                                         ksize=ksize,
-                                         strides=strides,
-                                         padding=padding,
-                                         Targmax=output_dtype,
-                                         name=name)
+  ksize = _get_sequence(ksize, 2, 3, "ksize")
+  strides = _get_sequence(strides, 2, 3, "strides")
+
+  return gen_nn_ops.max_pool_with_argmax(
+      input=input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      Targmax=output_dtype,
+      include_batch_in_index=include_batch_in_index,
+      name=name)
+
+
+@tf_export(v1=["nn.max_pool_with_argmax"])
+def max_pool_with_argmax_v1(  # pylint: disable=missing-docstring,invalid-name
+    input,  # pylint: disable=redefined-builtin
+    ksize,
+    strides,
+    padding,
+    data_format="NHWC",
+    Targmax=None,
+    name=None,
+    output_dtype=None,
+    include_batch_in_index=False):
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than 'NHWC' are not yet supported")
+
+  Targmax = deprecated_argument_lookup(
+      "output_dtype", output_dtype, "Targmax", Targmax)
+  if Targmax is None:
+    Targmax = dtypes.int64
+  return gen_nn_ops.max_pool_with_argmax(
+      input=input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      Targmax=Targmax,
+      include_batch_in_index=include_batch_in_index,
+      name=name)
 
-# pylint: enable=redefined-builtin
+
+max_pool_with_argmax_v1.__doc__ = gen_nn_ops.max_pool_with_argmax.__doc__
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
@@ -3128,7 +3990,7 @@ def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
     return bias_add(mm, biases, name=name)
 
 
-def xw_plus_b_v1(x, weights, biases, name=None):  # pylint: disable=invalid-name
+def xw_plus_b_v1(x, weights, biases, name=None):
   """Computes matmul(x, weights) + biases.
 
   This is a deprecated version of that will soon be removed.
@@ -3182,7 +4044,7 @@ def _get_noise_shape(x, noise_shape):
                              "Rate should be set to `rate = 1 - keep_prob`.",
                              "keep_prob")
 def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
-            rate=None):  # pylint: disable=invalid-name
+            rate=None):
   """Computes dropout.
 
   For each element of `x`, with probability `rate`, outputs `0`, and otherwise
@@ -3232,7 +4094,7 @@ def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
 
 
 @tf_export("nn.dropout", v1=[])
-def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):
   """Computes dropout.
 
   With probability `rate`, drops elements of `x`. Input that are kept are
@@ -3493,12 +4355,12 @@ def fractional_max_pool_v2(value,
 
   Args:
     value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
-    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
-      each dimension of `value`, currently only supports row and col dimension
-      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
-      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
-      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
-      ratio on height and width dimensions respectively.
+    pooling_ratio: An int or list of `ints` that has length `1`, `2` or `4`.
+      Pooling ratio for each dimension of `value`, currently only supports row
+      and col dimension and should be >= 1.0. For example, a valid pooling ratio
+      looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements must be 1.0
+      because we don't allow pooling on batch and channels dimensions.  1.44 and
+      1.73 are pooling ratio on height and width dimensions respectively.
     pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
       generates the pooling sequence in a pseudorandom fashion, otherwise, in a
       random fashion. Check paper [Benjamin Graham, Fractional
@@ -3524,6 +4386,8 @@ def fractional_max_pool_v2(value,
     row_pooling_sequence: A `Tensor` of type `int64`.
     col_pooling_sequence: A `Tensor` of type `int64`.
   """
+  pooling_ratio = _get_sequence(pooling_ratio, 2, 3, "pooling_ratio")
+
   if seed == 0:
     return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
                                           overlapping, deterministic=False,
@@ -3654,248 +4518,6 @@ def fractional_avg_pool_v2(value,
                                           seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export(v1=["nn.conv1d"])
-@deprecation.deprecated_arg_values(
-    None,
-    "`NCHW` for data_format is deprecated, use `NCW` instead",
-    warn_once=True,
-    data_format="NCHW")
-@deprecation.deprecated_arg_values(
-    None,
-    "`NHWC` for data_format is deprecated, use `NWC` instead",
-    warn_once=True,
-    data_format="NHWC")
-def conv1d(value,
-           filters,
-           stride,
-           padding,
-           use_cudnn_on_gpu=None,
-           data_format=None,
-           name=None):
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `value`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  with ops.name_scope(name, "conv1d", [value, filters]) as name:
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format is None or data_format == "NHWC" or data_format == "NWC":
-      data_format = "NHWC"
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    elif data_format == "NCHW" or data_format == "NCW":
-      data_format = "NCHW"
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(
-        value,
-        filters,
-        strides,
-        padding,
-        use_cudnn_on_gpu=use_cudnn_on_gpu,
-        data_format=data_format)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
-@tf_export("nn.conv1d", v1=[])
-def conv1d_v2(input,  # pylint: disable=redefined-builtin
-              filters,
-              stride,
-              padding,
-              data_format=None,
-              name=None):
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `input`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  return conv1d(input,  # pylint: disable=redefined-builtin
-                filters,
-                stride,
-                padding,
-                use_cudnn_on_gpu=True,
-                data_format=data_format,
-                name=name)
-
-
-def conv1d_transpose(
-    value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    stride,
-    padding="SAME",
-    data_format="NWC",
-    name=None):
-  """The transpose of `conv1d`.
-
-  This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv1d` rather than an actual
-  deconvolution.
-
-  Args:
-    value: A 3-D `Tensor` of type `float` and shape
-      `[batch, in_width, in_channels]` for `NWC` data format or
-      `[batch, in_channels, in_width]` for `NCW` data format.
-    filter: A 3-D `Tensor` with the same type as `value` and shape
-      `[filter_width, output_channels, in_channels]`.  `filter`'s
-      `in_channels` dimension must match that of `value`.
-    output_shape: A 1-D `Tensor` representing the output shape of the
-      deconvolution op.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
-    name: Optional name for the returned tensor.
-
-  Returns:
-    A `Tensor` with the same type as `value`.
-
-  Raises:
-    ValueError: If input/output depth does not match `filter`'s shape, or if
-      padding is other than `'VALID'` or `'SAME'`.
-  """
-  with ops.name_scope(name, "conv1d_transpose",
-                      [value, filter, output_shape]) as name:
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
-      raise ValueError("output_shape must have shape (3,), got {}".format(
-          output_shape_.get_shape()))
-
-    # The format could be either NWC or NCW, map to NHWC or NCHW
-    if data_format is None or data_format == "NWC":
-      data_format_2d = "NHWC"
-      axis = 2
-    elif data_format == "NCW":
-      data_format_2d = "NCHW"
-      axis = 1
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[2]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[2]))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [3] if reached this point.
-      if not filter.get_shape().dims[1].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[1]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format_2d == "NHWC":
-      output_shape_ = array_ops.concat(
-          [output_shape_[:1], [1], output_shape_[1:]], axis=0)
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    else:
-      output_shape_ = array_ops.concat(
-          [output_shape_[:2], [1], output_shape_[2:]], axis=0)
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filter = array_ops.expand_dims(filter, 0)  # pylint: disable=redefined-builtin
-
-    result = gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
-        padding=padding,
-        data_format=data_format_2d,
-        name=name)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
 @ops.RegisterStatistics("Dilation2D", "flops")
 def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 74561349ed1de72037e2e2f3c5d16e4a7cb03ce5..d79e420589f7c8346a30281a88637ea5d8fc16d2 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -24,9 +24,11 @@ from absl.testing import parameterized
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -41,7 +43,6 @@ from tensorflow.python.ops.nn_impl import _compute_sampled_logits
 from tensorflow.python.platform import test as test_lib
 
 
-@test_util.disable_all_xla("This test never passed for XLA")
 class ZeroFractionTest(test_lib.TestCase):
 
   def _ZeroFraction(self, x):
@@ -1018,11 +1019,10 @@ class LeakyReluTest(test_lib.TestCase):
 class SwishTest(test_lib.TestCase):
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testValues(self):
     np_values = np.array(
-        [np.linspace(-10.0, 0.0, 100),
-         np.linspace(0.0, 10.0, 100)],
+        [np.linspace(-7.0, 0.0, 100),
+         np.linspace(0.0, 7.0, 100)],
         dtype=np.float32)
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
@@ -1241,5 +1241,206 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class AvgPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MaxPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    x = array_ops.ones([3, 4])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 2."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+  def testIncorrectSizeInput(self):
+    x = array_ops.ones([3, 4, 1, 2, 1, 2])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 6."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ConvolutionTest(test_lib.TestCase):
+
+  def testUnknownSize(self):
+    x = tensor_spec.TensorSpec(None, dtypes.float32, name="x")
+    k = np.ones([3, 6, 6, 5])
+
+    @def_function.function
+    def F(value):
+      return nn_ops.convolution(value, k, "SAME")
+
+    F.get_concrete_function(x)
+
+
+class ConvTransposeTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    t = array_ops.ones([2, 4, 3])
+    v = array_ops.ones([2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv1d_transpose(t, v, [2, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    t = array_ops.ones([2, 4, 4, 3])
+    v = array_ops.ones([2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv2d_transpose_v2(t, v, [2, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    t = array_ops.ones([2, 4, 4, 4, 3])
+    v = array_ops.ones([2, 2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv3d_transpose_v2(t, v, [2, 8, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 2."):
+      nn_ops.conv_transpose(None, 2, [2, 3], "SAME")
+
+  def testIncorrectSizeInput(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 6."):
+      nn_ops.conv_transpose(None, 2, [2, 3, 4, 2, 5, 1], "SAME")
+
+  def testTensorsNoShape(self):
+    with self.assertRaisesRegex(ValueError, "output_shape cannot be None"):
+      nn_ops.conv_transpose(None, None, None, None)
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 0ab39ad0a8edd60c78a6bea3ae31e4f025c9e0bd..2aba42ef8951d58be595dbe1208eba3a9fceb663 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -30,18 +30,23 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
 @deprecation.deprecated_endpoints("verify_tensor_all_finite")
-def verify_tensor_all_finite(t, msg, name=None):
+def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
 
   Args:
     t: Tensor to check.
     msg: Message to log on failure.
     name: A name for this operation (optional).
+    x: Alias for t.
+    message: Alias for msg.
 
   Returns:
     Same tensor as `t`.
   """
-  return verify_tensor_all_finite_v2(t, msg, name)
+  x = deprecation.deprecated_argument_lookup("x", x, "t", t)
+  message = deprecation.deprecated_argument_lookup(
+      "message", message, "msg", msg)
+  return verify_tensor_all_finite_v2(x, message, name)
 
 
 @tf_export("debugging.assert_all_finite", v1=[])
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index f54ca558f4fddd3efc66badd800e8b0e82b2708c..0dd13a420f697cb295168dceb54d12455f24aac8 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -31,6 +31,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
@@ -56,7 +57,7 @@ py_library(
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
@@ -114,6 +115,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:util",
     ],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = True,
 )
 
@@ -129,6 +131,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python/eager:backprop",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -143,6 +146,7 @@ cuda_py_test(
         "//tensorflow/python:util",
     ],
     tags = ["optonly"],  # Too slow in non-opt mode
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index 8f652e9c5097db318a77c3cec8c6597c6bb1d87c..83bf86a5635b9a81e78956903918f3cb7adad93c 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -13,10 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 """for_loop and pfor ops."""
+# pylint: disable=g-direct-tensorflow-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -27,7 +31,10 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.parallel_for.pfor import PFor
+from tensorflow.python.ops.parallel_for.pfor import PForConfig
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None):
@@ -98,6 +105,9 @@ def _flatten_first_two_dims(x):
   return array_ops.reshape(x, new_shape)
 
 
+PFOR_CONFIG_ARG = "pfor_config"
+
+
 def pfor(loop_fn, iters, parallel_iterations=None):
   """Equivalent to running `loop_fn` `iters` times and stacking the outputs.
 
@@ -127,10 +137,11 @@ def pfor(loop_fn, iters, parallel_iterations=None):
 
   Args:
     loop_fn: A function that takes an int32 scalar tf.Tensor object representing
-      the iteration number, and returns a possibly nested structure of Tensor or
-      Operation objects. Note that if setting `parallel_iterations` argument to
-      something other than None, `loop_fn` may be called more than once during
-      graph construction. So it may need to avoid mutating global state.
+      the iteration number, and optionally a keyword argument `pfor_config` set
+      to a PForConfig object. It returns a possibly nested structure of Tensor
+      or Operation objects. Note that if setting `parallel_iterations` argument
+      to something other than None, `loop_fn` may be called more than once
+      during graph construction. So it may need to avoid mutating global state.
     iters: Number of iterations for which to run loop_fn.
     parallel_iterations: A knob to control how many iterations are vectorized
       and dispatched in parallel. The default value of None corresponds to
@@ -151,12 +162,38 @@ def pfor(loop_fn, iters, parallel_iterations=None):
   return f()
 
 
-def _pfor_impl(loop_fn, iters, parallel_iterations=None):
+def _loop_fn_has_config(loop_fn):
+  """Test if `loop_fn` has a `pfor_config` argument."""
+  if tf_inspect.isfunction(loop_fn):
+    argspec = tf_inspect.getargspec(loop_fn)
+    return PFOR_CONFIG_ARG in argspec.args
+  elif isinstance(loop_fn, functools.partial):
+    fn = loop_fn.func
+    argspec = tf_inspect.getargspec(fn)
+    return (PFOR_CONFIG_ARG in argspec.args and
+            PFOR_CONFIG_ARG not in loop_fn.keywords)
+  else:
+    loop_class = tf_decorator.unwrap(loop_fn)[1]
+    if not hasattr(loop_class, "__call__"):
+      raise ValueError("loop_fn object did not have a __call__ method")
+    argspec = tf_inspect.getargspec(loop_class.__call__)
+    return PFOR_CONFIG_ARG in argspec.args
+
+
+def _pfor_impl(loop_fn, iters, parallel_iterations=None, pfor_config=None):
   """Implementation of pfor."""
+  loop_fn_has_config = _loop_fn_has_config(loop_fn)
   existing_ops = set(ops.get_default_graph().get_operations())
   with ops.name_scope("loop_body"):
     loop_var = array_ops.placeholder(dtypes.int32, shape=[])
-    loop_fn_outputs = loop_fn(loop_var)
+    if loop_fn_has_config:
+      if pfor_config is None:
+        pfor_config = PForConfig()
+        pfor_config._set_iters(iters)  # pylint: disable=protected-access
+      loop_fn_outputs = loop_fn(loop_var, **{PFOR_CONFIG_ARG: pfor_config})
+    else:
+      assert pfor_config is None
+      loop_fn_outputs = loop_fn(loop_var)
   new_ops = set(ops.get_default_graph().get_operations()) - existing_ops
   iters = ops.convert_to_tensor(iters)
   if parallel_iterations is not None:
@@ -169,18 +206,22 @@ def _pfor_impl(loop_fn, iters, parallel_iterations=None):
       parallel_iterations = None
   if parallel_iterations is None:
     with ops.name_scope("pfor"):
-      converter = PFor(loop_var, iters, new_ops)
+      converter = PFor(loop_var, iters, new_ops, pfor_config=pfor_config)
       outputs = []
       for loop_fn_output in nest.flatten(loop_fn_outputs):
         outputs.append(converter.convert(loop_fn_output))
       return nest.pack_sequence_as(loop_fn_outputs, outputs)
   else:
+    if pfor_config is not None and pfor_config._has_reductions():  # pylint: disable=protected-access
+      raise ValueError("Setting parallel_iterations currently unsupported if"
+                       " reductions across iterations are performed.")
     num_tiled_iterations = iters // parallel_iterations
     num_remaining_iterations = iters % parallel_iterations
     # TODO(agarwal): Avoid calling loop_fn twice. Generate the loop body inside
     # a tf.function and extract the graph from there to vectorize it.
     with ops.name_scope("pfor_untiled"):
-      converter = PFor(loop_var, num_remaining_iterations, new_ops)
+      converter = PFor(loop_var, num_remaining_iterations, new_ops,
+                       pfor_config=pfor_config)
       remaining_outputs = []
       flattened_loop_fn_outputs = nest.flatten(loop_fn_outputs)
       for loop_fn_output in flattened_loop_fn_outputs:
@@ -193,10 +234,14 @@ def _pfor_impl(loop_fn, iters, parallel_iterations=None):
       def tiled_loop_body(j):
         offset = j * parallel_iterations + num_remaining_iterations
 
-        def tiled_loop_fn(i):
-          return nest.flatten(loop_fn(i + offset))
+        def tiled_loop_fn(i, pfor_config=None):
+          if loop_fn_has_config:
+            return nest.flatten(loop_fn(i + offset, pfor_config=pfor_config))
+          else:
+            return nest.flatten(loop_fn(i + offset))
 
-        return pfor(tiled_loop_fn, parallel_iterations)
+        return _pfor_impl(
+            tiled_loop_fn, parallel_iterations, pfor_config=pfor_config)
 
       tiled_outputs = for_loop(tiled_loop_body, loop_fn_dtypes,
                                num_tiled_iterations, parallel_iterations=1)
@@ -213,7 +258,3 @@ def _pfor_impl(loop_fn, iters, parallel_iterations=None):
       else:
         outputs = tiled_outputs
       return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs))
-
-
-
-
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 8a5830e28f34fe35258262241db2330b1f592614..ef877c3544686d537133938a8cdd4af4ac31336f 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -13,11 +13,13 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for pfor and for_loop."""
+# pylint: disable=g-direct-tensorflow-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import time
 
 from absl import flags
@@ -36,9 +38,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients as gradient_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import parsing_ops
@@ -100,6 +102,90 @@ class PForTest(PForTestCase):
       pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class ReductionTest(PForTestCase):
+
+  def test_reduce_concat(self):
+    x = random_ops.random_uniform([8, 3])
+
+    def loop_fn(i, pfor_config):
+      x_i = array_ops.gather(x, i)
+      vectorized_value = pfor_config.reduce_concat(x_i)
+      mean_value = math_ops.reduce_mean(vectorized_value, axis=0)
+      return x_i - mean_value
+
+    output = pfor_control_flow_ops.pfor(loop_fn, 8)
+    ans = x - math_ops.reduce_mean(x, axis=0)
+    output_val, ans_val = self.evaluate([output, ans])
+    self.assertAllClose(ans_val, output_val)
+
+  def test_reduce_mean(self):
+    x = random_ops.random_uniform([8, 3])
+
+    def loop_fn(i, pfor_config):
+      x_i = array_ops.gather(x, i)
+      return x_i - pfor_config.reduce_mean(x_i)
+
+    output = pfor_control_flow_ops.pfor(loop_fn, 8)
+    ans = x - math_ops.reduce_mean(x, axis=0)
+    output_val, ans_val = self.evaluate([output, ans])
+    self.assertAllClose(ans_val, output_val)
+
+  def test_reduce_sum(self):
+    x = random_ops.random_uniform([8, 3])
+
+    def loop_fn(i, pfor_config):
+      x_i = array_ops.gather(x, i)
+      return x_i - pfor_config.reduce_sum(x_i)
+
+    output = pfor_control_flow_ops.pfor(loop_fn, 8)
+    ans = x - math_ops.reduce_sum(x, axis=0)
+    output_val, ans_val = self.evaluate([output, ans])
+    self.assertAllClose(ans_val, output_val)
+
+  def test_reduce_class(self):
+    x = random_ops.random_uniform([8, 3])
+
+    class LoopFn(object):
+
+      def __init__(self):
+        pass
+
+      def __call__(self, i, pfor_config):
+        x_i = array_ops.gather(x, i)
+        return x_i - pfor_config.reduce_mean(x_i)
+
+    output = pfor_control_flow_ops.pfor(LoopFn(), 8)
+    ans = x - math_ops.reduce_mean(x, axis=0)
+    output_val, ans_val = self.evaluate([output, ans])
+    self.assertAllClose(ans_val, output_val)
+
+  def test_reduce_functools_partial(self):
+    x = random_ops.random_uniform([8, 3])
+
+    def fn(i, pfor_config, dummy=None):
+      del dummy
+      x_i = array_ops.gather(x, i)
+      return x_i - pfor_config.reduce_mean(x_i)
+
+    loop_fn = functools.partial(fn, dummy=1)
+    output = pfor_control_flow_ops.pfor(loop_fn, 8)
+    ans = x - math_ops.reduce_mean(x, axis=0)
+    output_val, ans_val = self.evaluate([output, ans])
+    self.assertAllClose(ans_val, output_val)
+
+  def test_parallel_iterations(self):
+    x = random_ops.random_uniform([8, 3])
+
+    def loop_fn(i, pfor_config):
+      x_i = array_ops.gather(x, i)
+      return pfor_config.reduce_sum(x_i)
+
+    with self.assertRaisesRegexp(
+        ValueError, "parallel_iterations currently unsupported"):
+      pfor_control_flow_ops.pfor(loop_fn, 8, parallel_iterations=2)
+
+
 @test_util.run_all_in_graph_and_eager_modes
 class BitwiseTest(PForTestCase):
 
@@ -928,15 +1014,15 @@ class Benchmarks(test.Benchmark):
       b = 256
       params = 1000
       inp = random_ops.random_normal((b, params))
-      map_fn = lambda x: x * x
+      fn = lambda x: x * x
 
       def pfor_map_fn(f, x):
         return pfor_control_flow_ops.pfor(
             lambda i: f(array_ops.gather(x, i)),
             array_ops.shape(x)[0])
 
-      map_output = functional_ops.map_fn(map_fn, inp)
-      pfor_output = pfor_map_fn(map_fn, inp)
+      map_output = map_fn.map_fn(fn, inp)
+      pfor_output = pfor_map_fn(fn, inp)
 
       self._run(map_output, 100, name="tf_map_fn")
       self._run(pfor_output, 100, name="pfor_map_fn")
@@ -965,6 +1051,26 @@ class Benchmarks(test.Benchmark):
       self._run(pfor_outputs, 100, name="pfor_rnn")
       self._run(tf_outputs, 100, name="tf_rnn")
 
+  def benchmark_reduction(self):
+    n = 1024
+    with ops.Graph().as_default():
+      x = random_ops.random_uniform([n, n])
+      w = random_ops.random_uniform([n, n])
+
+      def loop_fn(i, pfor_config):
+        x_i = array_ops.gather(x, i)
+        return math_ops.reduce_sum(
+            math_ops.matmul(pfor_config.reduce_concat(x_i), w))
+
+      # Note that output_reduction will be tiled, so there may be some minor
+      # overheads compared to output_no_reduction.
+      output_reduction = pfor_control_flow_ops.pfor(loop_fn, n)
+      output_no_reduction = math_ops.reduce_sum(math_ops.matmul(x, w))
+      # Benchmark to test that reduction does not add overhead and its output is
+      # treated as loop invariant.
+      self._run(output_reduction, 30, name="matmul_reduction")
+      self._run(output_no_reduction, 30, name="matmul_no_reduction")
+
 
 class SparseTest(PForTestCase):
 
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 69635c5a79c032514cdcd83af7e52b6953b2dc0b..b2946576053e10a428fb2dd237950d44ec687bc4 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -497,7 +497,7 @@ class GradientsTest(test.TestCase):
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
 
   @test_util.disable_xla("This test never passed for XLA")
-  def test_dynamic_lstm_batch_jacobian(self):
+  def DISABLED_test_dynamic_lstm_batch_jacobian(self):
     pfor_jacobian, while_gradients = create_dynamic_lstm_batch_jacobian(8, 4, 3)
     with session.Session() as sess:
       init = variables.global_variables_initializer()
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
index db88f4fe0332afe8de312da65b9643a24a056bcb..8a081e194f14ddc1eed4aed846a02706c051a71a 100644
--- a/tensorflow/python/ops/parallel_for/math_test.py
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -161,7 +161,6 @@ class MathTest(PForTestCase):
         math_ops.divide,
         math_ops.div_no_nan,
         math_ops.equal,
-        math_ops.floor_div,
         math_ops.floor_mod,
         math_ops.greater,
         math_ops.greater_equal,
@@ -182,6 +181,10 @@ class MathTest(PForTestCase):
         safe_polygamma,
         safe_zeta,
     ]
+    # FloorDiv fails on XLA due floor's discontinuities exacerbating small
+    # division differences.
+    if not test_util.is_xla_enabled():
+      float_ops += [math_ops.floor_div]
     for op in logical_ops + float_ops:
       x = random_ops.random_uniform([7, 3, 5])
       y = random_ops.random_uniform([3, 5])
@@ -278,7 +281,7 @@ class MathTest(PForTestCase):
     x = random_ops.random_uniform([2, 3, 4, 5])
     for op in [
         math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
-        math_ops.reduce_min
+        math_ops.reduce_min, math_ops.reduce_mean,
     ]:
       for axis in ([1], None, [0, 2]):
         for keepdims in (True, False):
@@ -325,26 +328,46 @@ class MathTest(PForTestCase):
           self._test_loop_fn(loop_fn, 2)
 
   def test_bias_add(self):
-    x_shape = [2, 3, 4, 5, 6]
-    x = random_ops.random_uniform(x_shape)
     for data_format in ("NCHW", "NHWC"):
-      with backprop.GradientTape(persistent=True) as g:
-        bias_dim = 2 if data_format == "NCHW" else -1
-        bias_shape = x_shape[bias_dim]
-        bias = random_ops.random_uniform([bias_shape])
-        g.watch(bias)
+      for stacked_value in (True, False):
+        x_shape = [3, 4, 5, 6]
+        if stacked_value:
+          x_shape = [2] + x_shape
+        x = random_ops.random_uniform(x_shape)
+        for stacked_bias in (True, False):
+          if not (stacked_value or stacked_bias):
+            continue
+          with backprop.GradientTape(persistent=True) as g:
+            bias_dim = -1
+            if data_format == "NCHW":
+              bias_dim = 2 if stacked_value else 1
+            bias_shape = [x_shape[bias_dim]]
+            if stacked_bias:
+              bias_shape = [2] + bias_shape
+            bias = random_ops.random_uniform(bias_shape)
+            g.watch(bias)
 
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          a = array_ops.gather(x, i)
-          y = nn.bias_add(a, bias, data_format=data_format)
-          loss = math_ops.reduce_sum(y * y)
-        return y, g.gradient(loss, bias)
-      # pylint: enable=cell-var-from-loop
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            with g:
+              a = array_ops.gather(x, i) if stacked_value else x
+              b = array_ops.gather(bias, i) if stacked_bias else bias
+              y = nn.bias_add(a, b, data_format=data_format)
+              loss = math_ops.reduce_sum(y * y)
+            grad = g.gradient(loss, bias)
+            if stacked_bias:
+              # If we gather over bias in loop_fn, the gradient will be an
+              # instance of `IndexedSlices` with attrs `values` and `indices`.
+              return y, grad.values, grad.indices
+            else:
+              return y, grad
+          # pylint: enable=cell-var-from-loop
 
-      self._test_loop_fn(
-          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
+          out_dtypes = [dtypes.float32, dtypes.float32]
+          if stacked_bias:
+            out_dtypes = out_dtypes + [dtypes.int32]
+          self._test_loop_fn(
+              loop_fn, 2, loop_fn_dtypes=out_dtypes)
 
   def test_unsorted_segment_sum(self):
     t = random_ops.random_uniform([3, 3, 2])
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index a22c1126c93915da7acc5221594567f855557b84..dc1ed88d475eba6130a89d1b9121a1f3bc3dad8e 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Compiled parallel-for loop."""
-# pylint: disable=missing-docstring
+# pylint: disable=missing-docstring,g-direct-tensorflow-import
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,9 +33,9 @@ from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_parsing_ops
 from tensorflow.python.ops import gen_sparse_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
@@ -42,6 +43,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 
 flags.DEFINE_bool(
@@ -93,13 +95,15 @@ def _is_stateful_pfor_op(op):
 class WhileOp(object):
   """Object for storing state for converting the outputs of a while_loop."""
 
-  def __init__(self, exit_node, pfor_ops):
+  def __init__(self, exit_node, pfor_ops, pfor_config):
     """Initializer.
 
     Args:
       exit_node: A tensor output from the while_loop.
       pfor_ops: list of ops inside the current pfor loop.
+      pfor_config: PForConfig object used while constructing loop body.
     """
+    self._pfor_config = pfor_config
     self._pfor_ops = set(pfor_ops)
     self._pfor_op_ids = set([x._id for x in pfor_ops])
     assert isinstance(exit_node, ops.Tensor)
@@ -280,7 +284,8 @@ class WhileOp(object):
         loop_len,
         pfor_ops=self._pfor_ops,
         all_indices=indices,
-        all_indices_partitioned=cond_stacked)
+        all_indices_partitioned=cond_stacked,
+        pfor_config=self._pfor_config)
     # Map all inputs of Enter nodes in self._direct_enters to their converted
     # values.
     for enter in self._direct_enters:
@@ -902,6 +907,86 @@ def _fallback_converter(pfor_input):
   return tuple([wrap(ta.concat(), True) for ta in ta_list])
 
 
+class PForConfig(object):
+  """A configuration object used to communicate with loop body function."""
+
+  def __init__(self):
+    # This may be set to the number of iterations.
+    self._maybe_iters = None
+    # Map from output placeholder to the unvectorized tensor.
+    self._reduce_concat_map = {}
+    # Reverse map of `self._reduce_concat_map`.
+    self._reverse_reduce_concat_map = {}
+
+  def _has_reductions(self):
+    """True if some reductions where performed by loop body."""
+    return len(self._reduce_concat_map)
+
+  def _set_iters(self, iters):
+    """Set number of pfor iterations."""
+    self._maybe_iters = iters
+
+  # TODO(agarwal): handle reductions inside control flow constructs.
+  def reduce_concat(self, x):
+    """Performs a concat reduction on `x` across pfor iterations.
+
+    Note that this currently may not work inside a control flow construct.
+    Args:
+      x: an unvectorized Tensor.
+
+    Returns:
+      A Tensor that has rank one higher than `x`. The value is the vectorized
+      version of `x`, i.e. stacking the value of `x` across different pfor
+      iterations.
+    """
+    assert not context.executing_eagerly()
+    assert isinstance(x, ops.Tensor)
+    if x not in self._reduce_concat_map:
+      out_shape = tensor_shape.TensorShape([self._maybe_iters]).concatenate(
+          x.shape)
+      with ops.control_dependencies([x]):
+        # Control dependency to make sure out is converted after x.
+        out = array_ops.placeholder(x.dtype, out_shape)
+      self._reduce_concat_map[out] = x
+      self._reverse_reduce_concat_map[x] = out
+      return out
+    else:
+      return self._reverse_reduce_concat_map[x]
+
+  def reduce_mean(self, x):
+    """Performs a mean reduction on `x` across pfor iterations.
+
+    Note that this currently may not work inside a control flow construct.
+    Args:
+      x: an unvectorized Tensor.
+
+    Returns:
+      A Tensor that has same rank as `x`. The value is the mean of the values
+      of `x` across the pfor iterations.
+    """
+    y = self.reduce_concat(x)
+    return math_ops.reduce_mean(y, axis=0)
+
+  def reduce_sum(self, x):
+    """Performs a sum reduction on `x` across pfor iterations.
+
+    Note that this currently may not work inside a control flow construct.
+    Args:
+      x: an unvectorized Tensor.
+
+    Returns:
+      A Tensor that has same rank as `x`. The value is the sum of the values
+      of `x` across the pfor iterations.
+    """
+    y = self.reduce_concat(x)
+    return math_ops.reduce_sum(y, axis=0)
+
+  def _lookup_reduction(self, pl):
+    """Lookups Placeholder `pl` in the reduction map."""
+    assert isinstance(pl, ops.Tensor)
+    return self._reduce_concat_map.get(pl, None)
+
+
 class PFor(object):
   """Implementation of rewrite of parallel-for loops.
 
@@ -940,7 +1025,8 @@ class PFor(object):
                loop_len,
                pfor_ops,
                all_indices=None,
-               all_indices_partitioned=False):
+               all_indices_partitioned=False,
+               pfor_config=None):
     """Creates an object to rewrite a parallel-for loop.
 
     Args:
@@ -957,6 +1043,7 @@ class PFor(object):
       all_indices_partitioned: If True, this object is being constructed from a
        control flow construct where not all the pfor iterations are guaranteed
        to be active.
+      pfor_config: PForConfig object used while constructing the loop body.
     """
     assert isinstance(loop_var, ops.Tensor)
     assert loop_var.op.type == "Placeholder"
@@ -975,6 +1062,7 @@ class PFor(object):
     self._conversion_map[loop_var] = wrap(self.all_indices, True)
     self._pfor_ops = set(pfor_ops)
     self._pfor_op_ids = set([x._id for x in pfor_ops])
+    self._pfor_config = pfor_config
 
   def op_is_inside_loop(self, op):
     """True if op was created inside the pfor loop body."""
@@ -1037,7 +1125,7 @@ class PFor(object):
     if sparse_tensor_rank is not None:
       sparse_tensor_rank += 1
 
-    def map_fn(args):
+    def fn(args):
       res = gen_sparse_ops.serialize_sparse(
           args[0], args[1], args[2], out_type=dtypes.variant)
       return res
@@ -1046,8 +1134,8 @@ class PFor(object):
     # sparse tensor element and batch them all, then deserializes the batch.
     # TODO(rachelim): Try to do this without map_fn -- add the right offsets
     # to shape and indices tensors instead.
-    result = functional_ops.map_fn(
-        map_fn, [indices, values, shape], dtype=dtypes.variant)
+    result = map_fn.map_fn(
+        fn, [indices, values, shape], dtype=dtypes.variant)
     return sparse_ops.deserialize_sparse(
         result, dtype=values.dtype, rank=sparse_tensor_rank)
 
@@ -1112,7 +1200,8 @@ class PFor(object):
 
       is_while_loop = y_op.type == "Exit"
       if is_while_loop:
-        while_op = WhileOp(y, pfor_ops=self._pfor_ops)
+        while_op = WhileOp(
+            y, pfor_ops=self._pfor_ops, pfor_config=self._pfor_config)
         is_inside_loop = while_op.is_inside_loop
         # If all nodes in the while_loop graph were created inside the pfor, we
         # treat the whole loop subgraph as a single op (y_op) and try to convert
@@ -1184,10 +1273,32 @@ class PFor(object):
       control_dependencies = [] if is_while_loop else converted_control_ops
       with ops.control_dependencies(control_dependencies), ops.name_scope(
           y_op.name + "/pfor/"):
+        # Op is a placeholder for a reduction.
+        if (self._pfor_config is not None and
+            self._pfor_config._lookup_reduction(y) is not None):
+          # Handle reductions. Map the placeholder to the unvectorized input
+          # that is being reduced.
+          reduction_input = self._pfor_config._lookup_reduction(y)
+          assert isinstance(reduction_input, ops.Tensor), reduction_input
+          # Tensor being reduced should already be converted due to a control
+          # dependency on the created placeholder.
+          # Note that in cases where reduction_input is in an outer context, one
+          # needs to locate the corresponding Enter node and use that to lookup
+          # the conversion.
+          # TODO(agarwal): handle reductions inside control flow constructs.
+          assert reduction_input in self._conversion_map, (
+              "Unable to handle reduction of %s, possibly as it was used "
+              "inside a control flow construct. Note that reductions across "
+              "pfor iterations are currently not supported inside control flow "
+              "constructs." % reduction_input)
+          output = self._conversion_map[reduction_input]
+          # If original input is not stacked, we tile it. Also we always mark
+          # output as unstacked.
+          new_outputs = [wrap(self._unwrap_or_tile(output), False)]
         # None of the inputs and control inputs were converted.
-        if (not is_inside_loop or
-            (not is_stateful and not some_input_converted and
-             not some_control_input_converted)):
+        elif (not is_inside_loop or
+              (not is_stateful and not some_input_converted and
+               not some_control_input_converted)):
           if y == y_op:
             assert not isinstance(y_op, WhileOp)
             new_outputs = y_op
@@ -1876,6 +1987,7 @@ def _convert_batch_mat_mul(pfor_input):
 @RegisterPForWithArgs("Prod", math_ops.reduce_prod)
 @RegisterPForWithArgs("Max", math_ops.reduce_max)
 @RegisterPForWithArgs("Min", math_ops.reduce_min)
+@RegisterPForWithArgs("Mean", math_ops.reduce_mean)
 def _convert_reduction(pfor_input, _, op_func):
   t = pfor_input.stacked_input(0)
   indices = pfor_input.unstacked_input(1)
@@ -1899,17 +2011,30 @@ def _convert_cumfoo(pfor_input, _, op_func):
 
 @RegisterPFor("BiasAdd")
 def _convert_biasadd(pfor_input):
-  t = pfor_input.stacked_input(0)
-  bias = pfor_input.unstacked_input(1)
-  data_format = pfor_input.get_attr("data_format")
-  if data_format != b"NCHW":
+  t, t_stacked, _ = pfor_input.input(0)
+  bias, bias_stacked, _ = pfor_input.input(1)
+  data_format = pfor_input.get_attr("data_format").decode()
+  if bias_stacked:
+    # BiasAdd only supports 1-D biases, so cast bias to match value and use Add.
+    pfor_input.expanddim_inputs_for_broadcast()
+    t, _, _ = pfor_input.input(0)
+    bias = math_ops.cast(pfor_input.stacked_input(1), t.dtype)
+    if compat.as_bytes(data_format) == b"NCHW":
+      b_shape = array_ops.shape(bias)
+      new_b_shape = array_ops.concat(
+          [b_shape[:-3], b_shape[-1:], b_shape[-3:-1]], axis=0)
+      bias = array_ops.reshape(bias, new_b_shape)
+    return wrap(math_ops.add(t, bias), True)
+  else:
+    assert t_stacked, "At least one input to BiasAdd should be loop variant."
+    if compat.as_bytes(data_format) == b"NCHW":
+      shape = array_ops.shape(t)
+      flattened_shape = array_ops.concat([[-1], shape[2:]], axis=0)
+      t = array_ops.reshape(t, flattened_shape)
+      t = nn_ops.bias_add(t, bias, data_format="NCHW")
+      t = array_ops.reshape(t, shape)
+      return wrap(t, True)
     return wrap(nn_ops.bias_add(t, bias, data_format=data_format), True)
-  shape = array_ops.shape(t)
-  flattened_shape = array_ops.concat([[-1], shape[2:]], axis=0)
-  t = array_ops.reshape(t, flattened_shape)
-  t = nn_ops.bias_add(t, bias, data_format=b"NCHW")
-  t = array_ops.reshape(t, shape)
-  return wrap(t, True)
 
 
 @RegisterPFor("UnsortedSegmentSum")
@@ -1945,72 +2070,73 @@ def _convert_cast(pfor_input):
 
 
 @RegisterPForWithArgs("Abs", math_ops.abs)
-@RegisterPForWithArgs("Acosh", math_ops.acosh)
 @RegisterPForWithArgs("Acos", math_ops.acos)
+@RegisterPForWithArgs("Acosh", math_ops.acosh)
 @RegisterPForWithArgs("Add", math_ops.add)
 @RegisterPForWithArgs("AddV2", math_ops.add_v2)
 @RegisterPForWithArgs("Angle", math_ops.angle)
-@RegisterPForWithArgs("Asinh", math_ops.asinh)
 @RegisterPForWithArgs("Asin", math_ops.asin)
+@RegisterPForWithArgs("Asinh", math_ops.asinh)
+@RegisterPForWithArgs("Atan", math_ops.atan)
 @RegisterPForWithArgs("Atan2", math_ops.atan2)
 @RegisterPForWithArgs("Atanh", math_ops.atanh)
-@RegisterPForWithArgs("Atan", math_ops.atan)
 @RegisterPForWithArgs("BesselI0e", math_ops.bessel_i0e)
 @RegisterPForWithArgs("BesselI1e", math_ops.bessel_i1e)
 @RegisterPForWithArgs("BitwiseAnd", bitwise_ops.bitwise_and)
 @RegisterPForWithArgs("BitwiseOr", bitwise_ops.bitwise_or)
 @RegisterPForWithArgs("BitwiseXor", bitwise_ops.bitwise_xor)
 @RegisterPForWithArgs("Ceil", math_ops.ceil)
-@RegisterPForWithArgs("ComplexAbs", math_ops.complex_abs)
 @RegisterPForWithArgs("Complex", math_ops.complex)
+@RegisterPForWithArgs("ComplexAbs", math_ops.complex_abs)
 @RegisterPForWithArgs("Conj", math_ops.conj)
-@RegisterPForWithArgs("Cosh", math_ops.cosh)
 @RegisterPForWithArgs("Cos", math_ops.cos)
+@RegisterPForWithArgs("Cosh", math_ops.cosh)
 @RegisterPForWithArgs("Digamma", math_ops.digamma)
 @RegisterPForWithArgs("Div", math_ops.div)
 @RegisterPForWithArgs("DivNoNan", math_ops.div_no_nan)
 @RegisterPForWithArgs("Elu", nn_ops.elu)
 @RegisterPForWithArgs("Equal", math_ops.equal)
-@RegisterPForWithArgs("Erfc", math_ops.erfc)
 @RegisterPForWithArgs("Erf", math_ops.erf)
-@RegisterPForWithArgs("Expm1", math_ops.expm1)
+@RegisterPForWithArgs("Erfc", math_ops.erfc)
 @RegisterPForWithArgs("Exp", math_ops.exp)
-@RegisterPForWithArgs("FloorDiv", math_ops.floor_div)
+@RegisterPForWithArgs("Expm1", math_ops.expm1)
 @RegisterPForWithArgs("Floor", math_ops.floor)
+@RegisterPForWithArgs("FloorDiv", math_ops.floor_div)
 @RegisterPForWithArgs("FloorMod", math_ops.floor_mod)
-@RegisterPForWithArgs("GreaterEqual", math_ops.greater_equal)
 @RegisterPForWithArgs("Greater", math_ops.greater)
-@RegisterPForWithArgs("Igammac", math_ops.igammac)
-@RegisterPForWithArgs("IgammaGradA", math_ops.igamma_grad_a)
+@RegisterPForWithArgs("GreaterEqual", math_ops.greater_equal)
 @RegisterPForWithArgs("Igamma", math_ops.igamma)
+@RegisterPForWithArgs("IgammaGradA", math_ops.igamma_grad_a)
+@RegisterPForWithArgs("Igammac", math_ops.igammac)
 @RegisterPForWithArgs("Imag", math_ops.imag)
-@RegisterPForWithArgs("Invert", bitwise_ops.invert)
 @RegisterPForWithArgs("Inv", math_ops.inv)
+@RegisterPForWithArgs("Invert", bitwise_ops.invert)
 @RegisterPForWithArgs("IsFinite", math_ops.is_finite)
 @RegisterPForWithArgs("IsInf", math_ops.is_inf)
 @RegisterPForWithArgs("LeftShift", bitwise_ops.left_shift)
-@RegisterPForWithArgs("LessEqual", math_ops.less_equal)
 @RegisterPForWithArgs("Less", math_ops.less)
+@RegisterPForWithArgs("LessEqual", math_ops.less_equal)
 @RegisterPForWithArgs("Lgamma", math_ops.lgamma)
+@RegisterPForWithArgs("Log", math_ops.log)
 @RegisterPForWithArgs("Log1p", math_ops.log1p)
 @RegisterPForWithArgs("LogicalAnd", math_ops.logical_and)
 @RegisterPForWithArgs("LogicalNot", math_ops.logical_not)
 @RegisterPForWithArgs("LogicalOr", math_ops.logical_or)
 @RegisterPForWithArgs("LogicalXor", math_ops.logical_xor)
-@RegisterPForWithArgs("Log", math_ops.log)
 @RegisterPForWithArgs("Maximum", math_ops.maximum)
 @RegisterPForWithArgs("Minimum", math_ops.minimum)
 @RegisterPForWithArgs("Mod", math_ops.mod)
 @RegisterPForWithArgs("Mul", math_ops.multiply)
+@RegisterPForWithArgs("MulNoNan", math_ops.mul_no_nan)
 @RegisterPForWithArgs("Neg", math_ops.negative)
 @RegisterPForWithArgs("NotEqual", math_ops.not_equal)
 @RegisterPForWithArgs("Polygamma", math_ops.polygamma)
 @RegisterPForWithArgs("Pow", math_ops.pow)
-@RegisterPForWithArgs("RealDiv", math_ops.divide)
 @RegisterPForWithArgs("Real", math_ops.real)
+@RegisterPForWithArgs("RealDiv", math_ops.divide)
 @RegisterPForWithArgs("Reciprocal", math_ops.reciprocal)
-@RegisterPForWithArgs("Relu6", nn_ops.relu6)
 @RegisterPForWithArgs("Relu", nn_ops.relu)
+@RegisterPForWithArgs("Relu6", nn_ops.relu6)
 @RegisterPForWithArgs("RightShift", bitwise_ops.right_shift)
 @RegisterPForWithArgs("Rint", math_ops.rint)
 @RegisterPForWithArgs("Round", math_ops.round)
@@ -2018,18 +2144,20 @@ def _convert_cast(pfor_input):
 @RegisterPForWithArgs("Selu", nn_ops.selu)
 @RegisterPForWithArgs("Sigmoid", math_ops.sigmoid)
 @RegisterPForWithArgs("Sign", math_ops.sign)
-@RegisterPForWithArgs("Sinh", math_ops.sinh)
 @RegisterPForWithArgs("Sin", math_ops.sin)
+@RegisterPForWithArgs("Sinh", math_ops.sinh)
 @RegisterPForWithArgs("Softplus", nn_ops.softplus)
 @RegisterPForWithArgs("Softsign", nn_ops.softsign)
 @RegisterPForWithArgs("Sqrt", math_ops.sqrt)
-@RegisterPForWithArgs("SquaredDifference", math_ops.squared_difference)
 @RegisterPForWithArgs("Square", math_ops.square)
+@RegisterPForWithArgs("SquaredDifference", math_ops.squared_difference)
 @RegisterPForWithArgs("Sub", math_ops.subtract)
-@RegisterPForWithArgs("Tanh", math_ops.tanh)
 @RegisterPForWithArgs("Tan", math_ops.tan)
+@RegisterPForWithArgs("Tanh", math_ops.tanh)
 @RegisterPForWithArgs("TruncateDiv", math_ops.truncate_div)
 @RegisterPForWithArgs("TruncateMod", math_ops.truncate_mod)
+@RegisterPForWithArgs("Xdivy", math_ops.xdivy)
+@RegisterPForWithArgs("Xlogy", math_ops.xlogy)
 @RegisterPForWithArgs("Zeta", math_ops.zeta)
 def _convert_cwise(pfor_input, op_type, op_func):
   # Note that ops handled here do not have attributes except "T" and "Tout", and
diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py
index c1084c25592045734ae016c9d5a84b5264a38032..50e89e7220a5347019aa9e8d127fe24b0d5ee76e 100644
--- a/tensorflow/python/ops/partitioned_variables.py
+++ b/tensorflow/python/ops/partitioned_variables.py
@@ -240,7 +240,7 @@ def fixed_size_partitioner(num_shards, axis=0):
 @tf_export(v1=["create_partitioned_variables"])
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.get_variable with a partitioner set.")
+    instructions="Use `tf.get_variable` with a partitioner set.")
 def create_partitioned_variables(
     shape, slicing, initializer, dtype=dtypes.float32,
     trainable=True, collections=None, name=None, reuse=None):
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 972b14955ff98804fc8a57bd53f0d62bcde23a55..b980efe77c222425bd7abbac7c284a585dc75ff2 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -24,10 +24,14 @@ py_library(
     tags = ["nofixdeps"],
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_batch_gather_with_default_op",
+        ":ragged_concat_ops",
         ":ragged_conversion_ops",
         ":ragged_dispatch",
         ":ragged_factory_ops",
         ":ragged_functional_ops",
+        ":ragged_gather_ops",
         ":ragged_getitem",
         ":ragged_map_ops",
         ":ragged_math_ops",
@@ -37,6 +41,7 @@ py_library(
         ":ragged_tensor_shape",
         ":ragged_tensor_value",
         ":ragged_util",
+        ":ragged_where_op",
         ":segment_id_ops",
         "//tensorflow/python:util",
     ],
@@ -55,6 +60,7 @@ py_library(
         ":segment_id_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -64,6 +70,68 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_batch_gather_ops",
+    srcs = ["ragged_batch_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_concat_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        ":ragged_where_op",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_batch_gather_with_default_op",
+    srcs = [
+        "ragged_batch_gather_with_default_op.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_concat_ops",
+        ":ragged_dispatch",
+        ":ragged_operators",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_where_op",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_concat_ops",
+    srcs = ["ragged_concat_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 py_library(
     name = "ragged_conversion_ops",
     srcs = ["ragged_conversion_ops.py"],
@@ -115,13 +183,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_gather_ops",
+    srcs = ["ragged_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_array_ops_gen",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 py_library(
     name = "ragged_getitem",
     srcs = ["ragged_getitem.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
@@ -244,6 +329,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_where_op",
+    srcs = ["ragged_where_op.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_concat_ops",
+        ":ragged_functional_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "segment_id_ops",
     srcs = ["segment_id_ops.py"],
@@ -288,11 +389,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
         ":ragged_tensor_shape",
         ":ragged_util",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:bitwise_ops",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -412,6 +515,7 @@ py_test(
     deps = [
         ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -430,6 +534,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_batch_gather_with_default_op",
         ":ragged_factory_ops",
         ":ragged_tensor",
         ":ragged_test_util",
@@ -448,8 +554,8 @@ py_test(
     srcs = ["ragged_gather_nd_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -493,12 +599,15 @@ py_test(
     srcs = ["ragged_from_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_string_ops",
         ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -691,7 +800,7 @@ py_test(
     srcs = ["ragged_concat_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
+        ":ragged_concat_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
@@ -710,7 +819,7 @@ py_test(
     srcs = ["ragged_stack_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
+        ":ragged_concat_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:constant_op",
@@ -785,9 +894,9 @@ py_test(
     srcs = ["ragged_where_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
         ":ragged_test_util",
+        ":ragged_where_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
index 30ed9c53abbbe5402fd7318e256e342fad2c3d73..e9232a1c641c251ed61259ca6251f76fea785626 100644
--- a/tensorflow/python/ops/ragged/__init__.py
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -29,10 +29,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_with_default_op
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_dispatch
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_getitem
 from tensorflow.python.ops.ragged import ragged_map_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
@@ -41,6 +45,7 @@ from tensorflow.python.ops.ragged import ragged_string_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.ops.ragged import segment_id_ops
 
 # Add a list of the ops that support Ragged Tensors.
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index f1e61fd7f67f42ad5bf7b61ec54297f91fb30ee6..8c62cc4a7286c13d9c6aaa0da2e5a70d2abf1d32 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -20,11 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
@@ -34,326 +31,6 @@ from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
 
 
-#===============================================================================
-# ragged_gather
-#===============================================================================
-# TODO(edloper): Add an `axis` argument
-def gather(params, indices, validate_indices=None, axis=0, batch_dims=0,
-           name=None):
-  """Gathers ragged slices from `params` axis `0` according to `indices`.
-
-  Returns `RaggedTensor` output, such that:
-
-  ```python
-  output.shape = indices.shape + params.shape[1:]
-  output.ragged_rank = indices.shape.ndims + params.ragged_rank
-  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-  ```
-
-  `params` may be ragged.  `indices` may be ragged.
-  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
-  then an error is returned.
-
-  Examples:
-
-  ```python
-  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
-  >>> indices = tf.constant([3, 1, 2, 1, 0])
-  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
-
-  >>> print ragged.gather(params, ragged_indices)
-  [['d', 'b', 'c'], ['b'], [], ['a']]
-
-  >>> print ragged.gather(ragged_params, indices)
-  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
-
-  >>> print ragged.gather(ragged_params, ragged_indices)
-  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
-  ```
-
-  Args:
-    params: The potentially ragged tensor from which to gather values. Must be
-      at least rank 1.
-    indices: The potentially ragged tensor indicating which values to gather.
-      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
-      params.shape[0]]`.
-    validate_indices: Ignored.
-    axis: Must be zero.
-    batch_dims: Must be zero.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `RaggedTensor`, where `output.dtype=params.dtype` and
-    `output.shape=indices.shape + params.shape[1:]` and
-    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
-
-  Raises:
-    ValueError: If indices.shape.ndims is not known statically.
-  """
-  del validate_indices
-  if not isinstance(axis, int) or axis != 0:
-    raise ValueError('axis != 0 is not supported for ragged gather yet.')
-  if not isinstance(batch_dims, int) or batch_dims != 0:
-    raise ValueError('batch_dims != 0 is not supported for ragged gather yet.')
-  with ops.name_scope(name, 'RaggedGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-
-    if ragged_tensor.is_ragged(indices):
-      return indices.with_values(gather(params, indices.values))
-
-    if not ragged_tensor.is_ragged(params):
-      return array_ops.gather(params, indices)
-
-    indices = ops.convert_to_tensor(indices)
-    if indices.shape.ndims is None:
-      raise ValueError('indices.shape.ndims must be known statically')
-
-    result = gen_ragged_array_ops.ragged_gather(
-        indices=indices,
-        params_dense_values=params.flat_values,
-        params_nested_splits=params.nested_row_splits,
-        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
-        1)
-
-    # Compose the RaggedTensor from splits & values.
-    return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        result.output_dense_values, result.output_nested_splits)
-
-
-#===============================================================================
-# ragged.batch_gather
-#===============================================================================
-def batch_gather(params, indices, name=None):
-  """Gathers slices from `params` according to `indices` with batch dims.
-
-  This operation is similar to `gather`, but it assumes that the leading `N`
-  dimensions of `indices` and `params` are batch dimensions, and performs a
-  gather within each batch.  In particular, when using this operation with `N`
-  batch dimensions `B1...BN`:
-
-  * `indices` has shape `[B1...BN, I]`
-  * `params` has shape `[B1...BN, P1...PM]`.
-  * `result` has shape `[B1...BN, I, P2...PM]`.
-  * `result[b1...bN, i, p2...pM] =
-    params[b1...bN, indices[b1...bN, i], p2...pM]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
-      `M>0`).
-    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
-    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
-
-  #### Example:
-    ```python
-    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
-    >>> ragged.batch_gather(params, indices)
-    [['b', 'c', 'a'], [], [], ['e', 'e']]
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.batch_gather(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_ndims = indices.shape.ndims
-    if indices_ndims is None:
-      raise ValueError(
-          'batch_gather does not allow indices with unknown shape.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-
-    if ragged_tensor.is_ragged(indices):
-      # If the outermost ragged dimension is a batch dimension, recurse.
-      if indices_ndims > 2:
-        if not ragged_tensor.is_ragged(params):
-          raise ValueError('batch shape from indices does '
-                           'not match params shape')
-        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
-        with ops.control_dependencies(checks):
-          return ragged_tensor.RaggedTensor.from_row_splits(
-              batch_gather(params.values, indices.values), indices.row_splits)
-
-      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
-      else:
-        # Ensure that `params` is ragged and has at least 2 dimensions.
-        if not ragged_tensor.is_ragged(params):
-          if params.shape.ndims is not None and params.shape.ndims < 2:
-            raise ValueError('batch shape from indices does '
-                             'not match params shape')
-          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
-
-        # Adjust indices from within-batch to global (in params.values), and
-        # then use ragged.gather to gather them.
-        num_indices = indices.row_lengths()
-        params_starts = params.row_starts()
-        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
-        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
-        return ragged_tensor.RaggedTensor.from_row_splits(
-            gather(params.values, adjusted_index_values), indices.row_splits)
-
-    else:  # params is a RaggedTensor and indices is a Tensor.
-      if indices_ndims == 1:
-        return gather(params, indices)
-      elif indices_ndims == 2:
-        # Adjust indices from batch-local to global (in params.values)
-        adjustments = array_ops.expand_dims(params.row_starts(), 1)
-        adjusted_indices = math_ops.to_int64(indices) + adjustments
-        return gather(params.values, adjusted_indices)
-      else:
-        raise ValueError('batch shape from indices does not match params shape')
-
-
-#===============================================================================
-# ragged.gather_nd
-#===============================================================================
-def gather_nd(params, indices, name=None):
-  """Gather slices from `params` using `n`-dimensional indices.
-
-  This operation is similar to `gather`, but it uses the innermost dimension
-  of `indices` to define a slice into `params`.  In particular, if:
-
-  * `indices` has shape `[A1...AN, I]`
-  * `params` has shape `[B1...BM]`
-
-  Then:
-
-  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
-  * `result[a1...aN] = params[indices[a1...aN, :]]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[A1...AN, I]`.
-    indices: A potentially ragged tensor with shape `[B1...BM]`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
-
-  #### Examples:
-    ```python
-    >>> params = tf.ragged.constant_value(
-    ...     [ [ ['000', '001'], ['010'              ]          ],
-    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
-    ...       [ [            ], ['210'              ]          ] ])
-
-    >>> # Gather 2D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2], [0]])
-    [ [ [            ], ['210'] ]
-      [ ['000', '001'], ['010'] ] ]
-
-    >>> # Gather 1D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
-    [['210'], ['000', '001']]
-
-    >>> # Gather scalars from a 3D tensor
-    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
-    ['001', '112']
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.gather_nd(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
-
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_shape = indices.shape
-    indices_ndims = indices_shape.ndims
-    if indices_ndims is None:
-      raise ValueError('indices.rank be statically known.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-    if (ragged_tensor.is_ragged(indices) and
-        indices_ndims == indices.ragged_rank + 1):
-      raise ValueError('The innermost dimension of indices may not be ragged')
-
-    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
-    # that each index slices into.
-    index_size = tensor_shape.dimension_value(indices_shape[-1])
-    if index_size is None:
-      raise ValueError('indices.shape[-1] must be statically known.')
-
-    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
-    # dense, then we convert it to ragged before recursing, and then convert
-    # the result back to `dense` if appropriate.
-    if indices_ndims > 2:
-      indices_is_dense = not ragged_tensor.is_ragged(indices)
-      if indices_is_dense:
-        indices = ragged_conversion_ops.from_tensor(
-            indices, ragged_rank=indices_ndims - 2)
-      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
-      if (indices_is_dense and ragged_tensor.is_ragged(result) and
-          result.ragged_rank == indices_ndims - 2):
-        result = ragged_conversion_ops.to_tensor(result)
-      return result
-
-    # indices_ndims <= 2, and the innermost dimension of indices may not be
-    # ragged, so `indices` must not be ragged.
-    assert not ragged_tensor.is_ragged(indices)
-    assert ragged_tensor.is_ragged(params)
-
-    # Handle corner case: An empty index tuple selects the entire `params`
-    # value.  So if `index_size` is zero, then tile `params`.
-    if index_size == 0:
-      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
-      for dim in range(indices_ndims - 1):
-        params = expand_dims(params, axis=0)
-      multiples = array_ops.concat([
-          array_ops.shape(indices)[:-1],
-          array_ops.ones([params_ndims], dtypes.int32)
-      ],
-                                   axis=0)
-      return tile(params, multiples)
-
-    # When index_size=1, we can just flatten the index tuples and use gather.
-    elif index_size == 1:
-      flattened_index_tuples = array_ops.reshape(indices, [-1])
-      return gather(params, flattened_index_tuples)
-
-    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
-    # Flatten both the index tuples and the params, such that the flattened
-    # index tuples point to the correct values in the flattened params; and
-    # then use ragged.gather on the flattened index tuples & params.
-    else:
-      indices = math_ops.to_int64(indices)
-
-      # Flatten the outermost 2 dimensions of the index tuples & params.
-      flattened_index_tuples = array_ops.gather(params.row_splits,
-                                                indices[..., 0])
-      flattened_index_tuples += indices[..., 1]
-      flattened_params = params.values
-
-      # Flatten any remaining dimensions.
-      for dim in range(2, index_size):
-        if not ragged_tensor.is_ragged(flattened_params):
-          flattened_index_tuples = array_ops.expand_dims(
-              flattened_index_tuples, axis=1)
-          flattened_index_tuples = array_ops.concat(
-              [flattened_index_tuples, indices[..., dim:]], axis=1)
-          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
-
-        flattened_index_tuples = array_ops.gather(
-            flattened_params.row_starts(), flattened_index_tuples)
-        flattened_index_tuples += indices[..., dim]
-        flattened_params = flattened_params.values
-
-      # Gather using the flattened index tuples and params.
-      return gather(flattened_params, flattened_index_tuples)
-
-
 #===============================================================================
 # Masking
 #===============================================================================
@@ -547,260 +224,6 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       return masked_values
 
 
-#===============================================================================
-# Concatenation and Stacking
-#===============================================================================
-def concat(values, axis, name=None):
-  """Concatenates potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  concatenation of `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to concatenate.
-      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.concat([t1, t2], axis=0)
-    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
-    >>> ragged.concat([t1, t2], axis=1)
-    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=False)
-
-
-def stack(values, axis=0, name=None):
-  """Stacks potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  list `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to stack.
-      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K+1`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.stack([t1, t2], axis=0)
-    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
-    >>> ragged.stack([t1, t2], axis=1)
-    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=True)
-
-
-def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
-  """Helper function to concatenate or stack ragged tensors.
-
-  Args:
-    rt_inputs: A list of RaggedTensors or Tensors to combine.
-    axis: The axis along which to concatenate or stack.
-    stack_values: A boolean -- if true, then stack values; otherwise,
-      concatenate them.
-
-  Returns:
-    A RaggedTensor.
-  Raises:
-    ValueError: If rt_inputs is empty, or if axis is out of range.
-  """
-  # Validate parameters.
-  if not rt_inputs:
-    raise ValueError('rt_inputs may not be empty.')
-
-  # Convert input tensors.
-  rt_inputs = [
-      ragged_tensor.convert_to_tensor_or_ragged_tensor(
-          rt_input, name='rt_input') for rt_input in rt_inputs
-  ]
-
-  # Special case: if there's only one input, then return it as-is.
-  if len(rt_inputs) == 1:
-    if stack_values:
-      return expand_dims(rt_inputs[0], axis=axis)
-    else:
-      return rt_inputs[0]
-
-  # Check the rank (number of dimensions) of the input tensors.
-  ndims = None
-  for rt in rt_inputs:
-    if ndims is None:
-      ndims = rt.shape.ndims
-    else:
-      rt.shape.assert_has_rank(ndims)
-
-  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
-  axis = ragged_util.get_positive_axis(axis, out_ndims)
-
-  # If all the inputs are Tensors, and we're combining the final dimension,
-  # then we can delegate to the tf.stack/tf.concat operation, and return a
-  # Tensor.
-  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
-    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
-      if stack_values:
-        return array_ops.stack(rt_inputs, axis)
-      else:
-        return array_ops.concat(rt_inputs, axis)
-
-  # Convert any Tensor inputs to RaggedTensors.  This makes it
-  # possible to concatenate Tensors and RaggedTensors together.
-  for i in range(len(rt_inputs)):
-    if not ragged_tensor.is_ragged(rt_inputs[i]):
-      rt_inputs[i] = ragged_conversion_ops.from_tensor(
-          rt_inputs[i], ragged_rank=1)
-
-  # Convert the input tensors to all have the same ragged_rank.
-  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
-  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
-
-  if axis == 0:
-    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
-  elif axis == 1:
-    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
-  else:  # axis > 1: recurse.
-    values = [rt.values for rt in rt_inputs]
-    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
-    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
-      return ragged_tensor.RaggedTensor.from_row_splits(
-          _ragged_stack_concat_helper(values, axis - 1, stack_values),
-          splits[0][0])
-
-
-def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 0.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  # Concatenate the inner values together.
-  flat_values = [rt.flat_values for rt in rt_inputs]
-  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
-
-  # Concatenate the splits together for each ragged dimension (adjusting
-  # split offsets as necessary).
-  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
-  ragged_rank = rt_inputs[0].ragged_rank
-  concatenated_nested_splits = [
-      _concat_ragged_splits([ns[dim]
-                             for ns in nested_splits])
-      for dim in range(ragged_rank)
-  ]
-
-  # If we are performing a stack operation, then add another splits.
-  if stack_values:
-    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
-    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
-    concatenated_nested_splits.insert(0, stack_splits)
-
-  return ragged_tensor.RaggedTensor.from_nested_row_splits(
-      concatenated_flat_values, concatenated_nested_splits)
-
-
-def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 1.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  num_inputs = len(rt_inputs)
-
-  rt_nrows = _nrows(rt_inputs[0])
-  nrows_msg = 'Input tensors have incompatible shapes.'
-  nrows_checks = [
-      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
-      for rt in rt_inputs[1:]
-  ]
-
-  with ops.control_dependencies(nrows_checks):
-    # Concatentate the inputs together to put them in a single ragged tensor.
-    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
-
-    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
-    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
-    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
-    #                      ...,
-    #                  rt_inputs[0][M], ..., rt_input[N][M]]
-    # where `N=num_inputs-1` and `M=rt_nrows-1`.
-    row_indices = math_ops.range(rt_nrows * num_inputs)
-    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
-    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
-    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
-    permuted_rt = gather(concatenated_rt, row_permutation)
-
-    if stack_values:
-      # Add a new splits tensor to group together the values.
-      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
-      _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
-                                                        stack_splits)
-    else:
-      # Merge together adjacent rows by dropping the row-split indices that
-      # separate them.
-      concat_splits = permuted_rt.row_splits[::num_inputs]
-      _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
-                                                        concat_splits)
-
-
-def _copy_row_shape(rt_inputs, splits):
-  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
-  for rt in rt_inputs:
-    if rt.shape[0] is not None:
-      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
-
-
 #===============================================================================
 # Tiling
 #===============================================================================
@@ -1061,139 +484,6 @@ def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
     return ragged_tensor.RaggedTensor.from_row_splits(values, splits)
 
 
-#===============================================================================
-# ragged.where
-#===============================================================================
-def where(condition, x=None, y=None, name=None):
-  """Return the elements, either from `x` or `y`, depending on the `condition`.
-
-  : If both `x` and `y` are `None`:
-    Returns the coordinates of true elements of `condition`. The coordinates
-    are returned in a 2-D tensor with shape
-    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
-    coordinates of the `i`th true value (in row-major order).
-
-  : If both `x` and `y` are non-`None`:
-    Returns a tensor formed by selecting values from `x` where condition is
-    true, and from `y` when condition is false.  In particular:
-
-    : If `condition`, `x`, and `y` all have the same shape:
-
-      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
-      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
-
-    : Otherwise:
-
-      * `condition` must be a vector.
-      * `x` and `y` must have the same number of dimensions.
-      * The outermost dimensions of `condition`, `x`, and `y` must all have the
-        same size.
-      * `result[i] = x[i]` if `condition[i]` is true.
-      * `result[i] = y[i]` if `condition[i]` is false.
-
-  Args:
-    condition: A potentially ragged tensor of type `bool`
-    x: A potentially ragged tensor (optional).
-    y: A potentially ragged tensor (optional).  Must be specified if `x` is
-      specified.  Must have the same rank and type as `x`.
-    name: A name of the operation (optional)
-
-  Returns:
-    : If both `x` and `y` are `None`:
-      A `Tensor` with shape `(num_true, dim_size(condition))`.
-    : Otherwise:
-      A potentially ragged tensor with the same type, rank, and outermost
-      dimension size as `x` and `y`.
-      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
-
-  Raises:
-    ValueError: When exactly one of `x` or `y` is non-`None`; or when
-      `condition`, `x`, and `y` have incompatible shapes.
-
-  #### Examples:
-    ```python
-    >>> # Coordinates where condition is true.
-    >>> condition = tf.ragged.constant_value(
-    ...     [[True, False, True], [False, True]])
-    >>> ragged.where(condition)
-    [[0, 0], [0, 2], [1, 1]]
-
-    >>> # Elementwise selection between x and y, based on condition.
-    >>> condition = tf.ragged.constant_value(
-    ...     [[True, False, True], [False, True]])
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'b', 'C'], ['d', 'E']]
-
-    >>> # Row selection between x and y, based on condition.
-    >>> condition = [True, False]
-    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'B', 'C'], ['d', 'e']]
-    ```
-  """
-  if (x is None) != (y is None):
-    raise ValueError('x and y must be either both None or both non-None')
-  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
-    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        condition, name='condition')
-    if x is None:
-      return _coordinate_where(condition)
-    else:
-      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
-      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
-      return _elementwise_where(condition, x, y)
-
-
-def _elementwise_where(condition, x, y):
-  """Ragged version of tf.where(condition, x, y)."""
-  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
-  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
-  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
-
-  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
-    return array_ops.where(condition, x, y)
-
-  elif condition_is_ragged and x_is_ragged and y_is_ragged:
-    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
-                                                 y)
-  elif not condition_is_ragged:
-    # Concatenate x and y, and then use `gather` to assemble the selected rows.
-    condition.shape.assert_has_rank(1)
-    x_nrows = _nrows(x)
-    x_and_y = concat([x, y], axis=0)
-    indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(_nrows(y)))
-    return gather(x_and_y, indices)
-
-  else:
-    raise ValueError('Input shapes do not match.')
-
-
-def _coordinate_where(condition):
-  """Ragged version of tf.where(condition)."""
-  if not isinstance(condition, ragged_tensor.RaggedTensor):
-    return array_ops.where(condition)
-
-  # The coordinate for each `true` value in condition.values.
-  selected_coords = _coordinate_where(condition.values)
-
-  # Convert the first index in each coordinate to a row index and column index.
-  first_index = selected_coords[:, 0]
-  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
-  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
-  selected_cols = first_index - selected_row_starts
-
-  # Assemble the row & column index with the indices for inner dimensions.
-  return array_ops.concat([
-      array_ops.expand_dims(selected_rows, 1),
-      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
-  ],
-                          axis=1)
-
-
 #===============================================================================
 # RaggedTensor Size
 #===============================================================================
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
index 431d350db8a5a266113df9a03e39a90643893d79..17c55eb810ab3b9718a5d5f94af3dba67fb673e8 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.batch_gather."""
+"""Tests for ragged_batch_gather_ops.batch_gather."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,11 +21,13 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_with_default_op
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
@@ -146,9 +148,327 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
               [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
   ])
   def testRaggedBatchGather(self, descr, params, indices, expected):
-    result = ragged_array_ops.batch_gather(params, indices)
+    result = ragged_batch_gather_ops.batch_gather(params, indices)
     self.assertRaggedEqual(result, expected)
 
+  @parameterized.parameters([
+      # Docstring example:
+      dict(
+          descr='Docstring example',
+          params=[['a', 'b', 'c'], ['d'], [], ['e']],
+          indices=[[1, 2, -1], [], [], [0, 10]],
+          expected=[['b', 'c', 'FOO'], [], [], ['e', 'FOO']],
+          default_value='FOO',
+      ),
+      # Dimensions:
+      # indices: [4]
+      # params: [2, (d1), (d2)]
+      dict(
+          descr='params: [2, (d1), (d2), indices: [4]',
+          indices=[1, 100, 0, -1],
+          params=[[['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                    'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                    'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+                   ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall']],
+                  [["It's", 'always', 'darkest', 'before', 'the', 'dawn']]],
+          expected=[[["It's", 'always', 'darkest', 'before', 'the', 'dawn']],
+                    [['$NONE^']],
+                    [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                      'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion',
+                      '-', 'dollar', 'takeover', 'offer', 'from', 'Microsoft',
+                      '.'],
+                     ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall']],
+                    [['$NONE^']]],
+      ),
+      # Dimensions:
+      # params: [1, (d1)]
+      # indices: [3]
+      dict(
+          descr='params: rank 2, indices: rank 1',
+          params=[
+              ['Bruce', 'Wayne'],
+          ],
+          indices=[-1, 0, 1000],
+          expected=[['$NONE^'], ['Bruce', 'Wayne'], ['$NONE^']]
+      ),
+      # Dimensions:
+      # params: [1, (d1)]
+      # indices: [1, (d2)]
+      dict(
+          descr='Test underbound indices of shape [1, (d2)]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+          ],
+          indices=[[8, -1]],
+          expected=[['!', '$NONE^']],
+      ),
+      dict(
+          descr='Test underbound indices of shape [2, (d2)]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['Who', 'let', 'the', 'dogs', 'out', '?'],
+          ],
+          indices=[[8, -1], [1, 100]],
+          expected=[['!', '$NONE^'], ['let', '$NONE^']],
+      ),
+      # Dimensions:
+      # params: [2, (d1)]
+      # indices: [2, (d2)]
+      dict(
+          descr='Test underbound indices of rank 2',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['He', 'left', 'us', '.', 'Little', 'boys', 'crowded', 'together',
+               'on', 'long', 'wooden', 'benches', ',', 'and', 'in', 'the',
+               'center', 'of', 'the', 'room', 'sat', 'the', 'teacher', '.',
+               'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+               'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+               'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+               'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+               'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',', 'then',
+               'shouted', 'in', 'Yiddish', ',', '``', 'One', ',', 'two', ',',
+               'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+               'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+               'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+               'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+               'had', 'previously', 'chanted', 'in', 'Hebrew', '.']],
+          indices=[[8, -1], [3, 23, 35, 45, 75, 83, -121]],
+          expected=[['!', '$NONE^'], ['.', '.', '.', '.', '!', '.', '$NONE^']],
+      ),
+      dict(
+          descr='Test overbound indices of rank 2',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['He', 'left', 'us', '.', 'Little', 'boys', 'crowded', 'together',
+               'on', 'long', 'wooden', 'benches', ',', 'and', 'in', 'the',
+               'center', 'of', 'the', 'room', 'sat', 'the', 'teacher', '.',
+               'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+               'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+               'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+               'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+               'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',', 'then',
+               'shouted', 'in', 'Yiddish', ',', '``', 'One', ',', 'two', ',',
+               'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+               'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+               'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+               'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+               'had', 'previously', 'chanted', 'in', 'Hebrew', '.']],
+          indices=[[8, 8823], [3, 23, 35, 45, 75, 83, 1234]],
+          expected=[['!', '$NONE^'], ['.', '.', '.', '.', '!', '.', '$NONE^']],
+      ),
+      # Dimensions:
+      # params: [2, (d1), 2]
+      # indices: [2, (d2)]
+      dict(
+          descr='params: rank 3, indices: rank 2',
+          params=[
+              [['The', 'deal'], ['takeover', 'offer'], ['from', 'Microsoft']],
+              [['Who', 'let'], ['the', 'dogs'], ['out', '?']],
+          ],
+          ragged_rank=1,
+          indices=[[1, -1, 2, 30], [1, 100]],
+          indices_ragged_rank=1,
+          expected=[[['takeover', 'offer'],
+                     ['$NONE^', '$NONE^'],
+                     ['from', 'Microsoft'],
+                     ['$NONE^', '$NONE^']],
+                    [['the', 'dogs'],
+                     ['$NONE^', '$NONE^']]],
+          expected_ragged_rank=1,
+          default_value=['$NONE^', '$NONE^'],
+      ),
+      # Dimensions:
+      # params: [2, (d1), (d2)]
+      # indices: [2, (d3)]
+      dict(
+          descr='params: [2, (d1), (d2)], indices: [2, (d3)]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+               ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall'],
+              ],
+              [['It\'s', 'always', 'darkest', 'before', 'the', 'dawn']]
+          ],
+          indices=[[1, 100], [0, -1]],
+          expected=[[['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall'],
+                     ['$NONE^']],
+                    [["It's", 'always', 'darkest', 'before', 'the', 'dawn'],
+                     ['$NONE^']]]
+      ),
+      # Dimensions:
+      # params: [2, (d1), (d2)]
+      # indices: [2, (d1), (d3)]
+      dict(
+          descr='Test overbound indices of rank 3',
+          params=[
+              [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+               ['Foo', 'bar', 'mar']],
+              [['He', 'left', 'us', '.', 'Little', 'boys', 'crowded',
+                'together', 'on', 'long', 'wooden', 'benches', ',', 'and', 'in',
+                'the', 'center', 'of', 'the', 'room', 'sat', 'the', 'teacher',
+                '.', 'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+                'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+                'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+                'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+                'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',',
+                'then', 'shouted', 'in', 'Yiddish', ',', '``', 'One', ',',
+                'two', ',',
+                'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+                'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+                'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+                'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+                'had', 'previously', 'chanted', 'in', 'Hebrew', '.'],
+               ['I', 'too', 'was', 'hustled', 'scammed', 'bamboozled', 'hood',
+                'winked', 'lead', 'astray']]
+          ],
+          indices=[[[8, 8823], [0, 100]], [[3, 23, 35, 45, 75, 83, 1234], [5]]],
+          expected=[[['!', '$NONE^'], ['Foo', '$NONE^']],
+                    [['.', '.', '.', '.', '!', '.', '$NONE^'],
+                     ['bamboozled']]],
+      ),
+      # params.shape = [2, (d1), 8]
+      # indices.shape = [2, (d1), 3]
+      dict(
+          descr='params = [2, (2, 1), 8], indices = [2, (2, 1), 3]',
+          params=[[['h'] * 8, ['w'] * 8], [['b'] * 8]],
+          ragged_rank=1,
+          indices=[[[0, 100, 1], [0, 1, 0]], [[1, 0, 0]]],
+          indices_ragged_rank=1,
+          expected=[[['h', '$NONE^', 'h'], ['w', 'w', 'w']], [['b', 'b', 'b']]],
+          expected_ragged_rank=1,
+      ),
+  ])
+  def testRaggedBatchGatherWithDefault(
+      self, descr, params, indices, expected, indices_ragged_rank=None,
+      expected_ragged_rank=None, ragged_rank=None, default_value='$NONE^'):
+    params = ragged_factory_ops.constant(params, ragged_rank=ragged_rank)
+    indices = ragged_factory_ops.constant(
+        indices, ragged_rank=indices_ragged_rank or ragged_rank)
+    expected = ragged_factory_ops.constant(
+        expected, ragged_rank=expected_ragged_rank or ragged_rank)
+    result = ragged_batch_gather_with_default_op.batch_gather_with_default(
+        params, indices, default_value)
+    self.assertRaggedEqual(result, expected)
+
+  @parameterized.parameters([
+      # Dimensions:
+      #  params: dims [2, 5], indices: [2, 2]
+      dict(
+          descr='params: dims [2, 5], indices: [2, 2]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18'],
+              ['He', 'left', 'us', '.', 'Little']],
+          indices=[[0, -1], [3, 121]],
+          expected=[['The', '$NONE^'], ['.', '$NONE^']],
+          default_value='$NONE^',
+      ),
+      # Dimensions:
+      #  params: dims [2, 2, 5], indices: [2, 2]
+      dict(
+          descr='params: dims [2, 2, 5], indices: [2, 2]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18'],
+               ['The', 'deal', 'came', 'about', '19'],
+              ],
+              [['He', 'left', 'us', '.', 'Little'],
+               ['The', 'deal', 'came', 'about', '20'],
+              ]
+          ],
+          indices=[[0, -1], [0, 121]],
+          expected=[[['The', 'deal', 'came', 'about', '18'],
+                     ['$NONE^', '$NONE^', '$NONE^', '$NONE^', '$NONE^']],
+                    [['He', 'left', 'us', '.', 'Little'],
+                     ['$NONE^', '$NONE^', '$NONE^', '$NONE^', '$NONE^']]],
+          default_value='$NONE^',
+      ),
+      # Test default_value with shape [5]
+      dict(
+          descr='params: dims [2, 2, 5], indices: [2, 2]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18'],
+               ['The', 'deal', 'came', 'about', '19'],
+              ],
+              [['He', 'left', 'us', '.', 'Little'],
+               ['The', 'deal', 'came', 'about', '20'],
+              ]
+          ],
+          indices=[[0, -1], [0, 121]],
+          expected=[[['The', 'deal', 'came', 'about', '18'],
+                     [':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:']],
+                    [['He', 'left', 'us', '.', 'Little'],
+                     [':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:']]],
+          default_value=[':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:'],
+      ),
+  ])
+  def testRaggedBatchGatherWithDefaultOnTensors(
+      self, descr, params, indices, expected, default_value):
+    params = constant_op.constant(params)
+    indices = constant_op.constant(indices)
+    expected = constant_op.constant(expected)
+    result = ragged_batch_gather_with_default_op.batch_gather_with_default(
+        params, indices, default_value)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          params=[['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                   'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                   'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.']],
+          indices=[[[8, -1]]],
+          # Exception here because different errors are thrown in eager vs
+          # graph mode.
+          error=Exception,
+          default_value='$NONE^',
+      ),
+  ])
+  def testRankMismatch(
+      self, params, indices, default_value, error):
+    params = ragged_factory_ops.constant(params)
+    indices = ragged_factory_ops.constant(indices)
+    with self.assertRaises(error):
+      _ = ragged_batch_gather_with_default_op.batch_gather_with_default(
+          params, indices, default_value)
+
+  @parameterized.parameters([
+      # Dimensions:
+      # params: [2, (d1), 2]
+      # indices: [2, (d2)]
+      # default_value: []
+      dict(
+          descr='params: rank 3, indices: rank 2, default: rank = [], but'
+          ' should be [2]',
+          params=[
+              [['The', 'deal'], ['takeover', 'offer'], ['from', 'Microsoft']],
+              [['Who', 'let'], ['the', 'dogs'], ['out', '?']],
+          ],
+          ragged_rank=1,
+          indices=[[1, -1, 2, 30], [1, 100]],
+          indices_ragged_rank=1,
+          default_value='$NONE^',
+          error=Exception,
+      )
+  ])
+  def testInvalidDefaultValueRank(
+      self, descr, params, indices, default_value, error, ragged_rank=None,
+      indices_ragged_rank=None):
+    params = ragged_factory_ops.constant(params, ragged_rank=ragged_rank)
+    indices = ragged_factory_ops.constant(
+        indices, ragged_rank=indices_ragged_rank)
+    with self.assertRaises(error):
+      _ = ragged_batch_gather_with_default_op.batch_gather_with_default(
+          params, indices, default_value)
+
   def testRaggedBatchGatherUnknownRankError(self):
     if context.executing_eagerly():
       return
@@ -159,11 +479,11 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged_array_ops.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged_array_ops.batch_gather(params, ragged_indices)
+      ragged_batch_gather_ops.batch_gather(params, ragged_indices)
 
   @parameterized.parameters(
       [
@@ -208,7 +528,7 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
                                        message=None,
                                        error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4f6d3710669d7a964a531ca944223c2dc933bad
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
@@ -0,0 +1,122 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batch gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+#===============================================================================
+# ragged.batch_gather
+#===============================================================================
+def batch_gather(params, indices, name=None):
+  """Gathers slices from `params` according to `indices` with batch dims.
+
+  This operation is similar to `gather`, but it assumes that the leading `N`
+  dimensions of `indices` and `params` are batch dimensions, and performs a
+  gather within each batch.  In particular, when using this operation with `N`
+  batch dimensions `B1...BN`:
+
+  * `indices` has shape `[B1...BN, I]`
+  * `params` has shape `[B1...BN, P1...PM]`.
+  * `result` has shape `[B1...BN, I, P2...PM]`.
+  * `result[b1...bN, i, p2...pM] =
+    params[b1...bN, indices[b1...bN, i], p2...pM]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> tf.batch_gather(params, indices)
+    [['b', 'c', 'a'], [], [], ['e', 'e']]
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.batch_gather(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_ndims = indices.shape.ndims
+    if indices_ndims is None:
+      raise ValueError(
+          'batch_gather does not allow indices with unknown shape.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+
+    if ragged_tensor.is_ragged(indices):
+      # If the outermost ragged dimension is a batch dimension, recurse.
+      if indices_ndims > 2:
+        if not ragged_tensor.is_ragged(params):
+          raise ValueError('batch shape from indices does '
+                           'not match params shape')
+        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
+        with ops.control_dependencies(checks):
+          return ragged_tensor.RaggedTensor.from_row_splits(
+              batch_gather(params.values, indices.values), indices.row_splits)
+
+      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
+      else:
+        # Ensure that `params` is ragged and has at least 2 dimensions.
+        if not ragged_tensor.is_ragged(params):
+          if params.shape.ndims is not None and params.shape.ndims < 2:
+            raise ValueError('batch shape from indices does '
+                             'not match params shape')
+          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+
+        # Adjust indices from within-batch to global (in params.values), and
+        # then use ragged.gather to gather them.
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
+        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
+        adjusted_index_values = math_ops.cast(
+            indices.values, dtypes.int64) + adjustments
+        return ragged_tensor.RaggedTensor.from_row_splits(
+            ragged_gather_ops.gather(params.values, adjusted_index_values),
+            indices.row_splits)
+
+    else:  # params is a RaggedTensor and indices is a Tensor.
+      if indices_ndims == 1:
+        return ragged_gather_ops.gather(params, indices)
+      elif indices_ndims == 2:
+        # Adjust indices from batch-local to global (in params.values)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
+        adjusted_indices = math_ops.cast(indices, dtypes.int64) + adjustments
+        return ragged_gather_ops.gather(params.values, adjusted_indices)
+      else:
+        raise ValueError('batch shape from indices does not match params shape')
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..049829d88b8324fafc2b4e386bc06c83d8bdc712
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
@@ -0,0 +1,186 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Array operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_dispatch  # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_operators  # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_where_op
+
+
+#===============================================================================
+# ragged.batch_gather_with_default
+#===============================================================================
+def batch_gather_with_default(params,
+                              indices,
+                              default_value='',
+                              name=None):
+  """Same as `batch_gather` but inserts `default_value` for invalid indices.
+
+  This operation is similar to `batch_gather` except that it will substitute
+  the value for invalid indices with `default_value` as the contents.
+  See `batch_gather` for more details.
+
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    default_value: A value to be inserted in places where `indices` are out of
+      bounds. Must be the same dtype as params and either a scalar or rank 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = tf.ragged.constant([
+          ['a', 'b', 'c'],
+          ['d'],
+          [],
+          ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]])
+    >>> batch_gather_with_default(params, indices, 'FOO')
+    [['b', 'c', 'FOO'], [], [], ['e', 'FOO']]
+  ```
+  """
+  with ops.name_scope(name, 'RaggedBatchGatherWithDefault'):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params',
+    )
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices',
+    )
+    default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        default_value, name='default_value',
+    )
+    # TODO(hterry): lift this restriction and support default_values of
+    #               of rank > 1
+    if (default_value.shape.ndims is not 0
+        and default_value.shape.ndims is not 1):
+      raise ValueError('"default_value" must be a scalar or vector')
+    upper_bounds = None
+    if indices.shape.ndims is None:
+      raise ValueError('Indices must have a known rank.')
+    if params.shape.ndims is None:
+      raise ValueError('Params must have a known rank.')
+
+    num_batch_dimensions = indices.shape.ndims - 1
+    pad = None
+    # The logic for this works as follows:
+    # - create a padded params, where:
+    #    padded_params[b1...bn, 0] = default_value
+    #    padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0)
+    # - create an `upper_bounds` Tensor that contains the number of elements
+    #   in each innermost rank. Broadcast `upper_bounds` to be the same shape
+    #   as `indices`.
+    # - check to see which index in `indices` are out of bounds and substitute
+    #   it with the index containing `default_value` (the first).
+    # - call batch_gather with the indices adjusted.
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(array_ops.rank(params),
+                                       array_ops.rank(indices))]):
+      if ragged_tensor.is_ragged(params):
+        row_lengths = ragged_array_ops.expand_dims(
+            params.row_lengths(axis=num_batch_dimensions),
+            axis=-1)
+        upper_bounds = math_ops.cast(row_lengths, indices.dtype)
+
+        pad_shape = _get_pad_shape(params, indices)
+
+        pad = ragged_tensor_shape.broadcast_to(
+            default_value, pad_shape)
+      else:
+        params_shape = array_ops.shape(params)
+        pad_shape = array_ops.concat([
+            params_shape[:num_batch_dimensions],
+            [1],
+            params_shape[num_batch_dimensions + 1:params.shape.ndims]
+        ], 0)
+        upper_bounds = params_shape[num_batch_dimensions]
+        pad = array_ops.broadcast_to(default_value, pad_shape)
+
+      # Add `default_value` as the first value in the innermost (ragged) rank.
+      pad = math_ops.cast(pad, params.dtype)
+      padded_params = array_ops.concat(
+          [pad, params], axis=num_batch_dimensions)
+
+      # Adjust the indices by substituting out-of-bound indices to the
+      # default-value index (which is the first element)
+      shifted_indices = indices + 1
+      is_out_of_bounds = (indices < 0) | (indices > upper_bounds)
+      adjusted_indices = ragged_where_op.where(
+          is_out_of_bounds,
+          x=array_ops.zeros_like(indices), y=shifted_indices,
+      )
+      return array_ops.batch_gather(
+          params=padded_params, indices=adjusted_indices, name=name)
+
+
+def _get_pad_shape(params, indices):
+  """Gets the RaggedTensorDynamicShape for the pad tensor."""
+  num_batch_dimensions = indices.shape.ndims - 1
+  params_shape = ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(
+      params)
+
+  # We want to create a pad tensor that can be concatenated with the params.
+  if params.shape.ndims == indices.shape.ndims:
+    # When params and indices are the same rank, the shape of the pad tensor is
+    # almost identical to params, except the last dimension which has size = 1.
+    if params_shape.num_inner_dimensions == 0:
+      pad_dims = params_shape.partitioned_dim_sizes[:-1] + (
+          array_ops.ones_like(params_shape.partitioned_dim_sizes[-1]),)
+      return ragged_tensor_shape.RaggedTensorDynamicShape(
+          pad_dims, [])
+    else:
+      return ragged_tensor_shape.RaggedTensorDynamicShape(
+          params_shape.partitioned_dim_sizes,
+          array_ops.concat([params_shape.inner_dim_sizes[:-1], [1]], axis=0))
+  else:
+    # When the rank of indices < params, the pad has the same dimension as
+    # params up to the 'num_batch_dimensions' rank. Every dimension after that
+    # has size 1.
+    pad_dims = None
+    if num_batch_dimensions == 0:
+      pad_dims = (constant_op.constant(1, dtype=dtypes.int64),) + (
+          constant_op.constant([1], dtype=dtypes.int64),) * (
+              params_shape.num_partitioned_dimensions -
+              num_batch_dimensions - 1)
+    else:
+      batch_dimensions = params_shape.partitioned_dim_sizes[
+          :num_batch_dimensions]
+      gather_dimension = params_shape.partitioned_dim_sizes[
+          num_batch_dimensions]
+      pad_dims = batch_dimensions + (
+          array_ops.ones_like(gather_dimension),) * (
+              params_shape.num_partitioned_dimensions - num_batch_dimensions)
+
+    return ragged_tensor_shape.RaggedTensorDynamicShape(
+        pad_dims, params_shape.inner_dim_sizes)
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
index 254afdaa21b489f0c3ea4191b0b02990fd7334cf..62989d3025562db9af4b19d5a2922988591fe521 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
@@ -235,7 +235,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                        expected_ragged_rank=None,
                        expected_shape=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    concatenated = ragged_array_ops.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -276,7 +276,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                       message=None,
                       ragged_ranks=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    self.assertRaisesRegexp(error, message, ragged_array_ops.concat, rt_inputs,
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.concat, rt_inputs,
                             axis)
 
   @parameterized.parameters([
@@ -294,7 +294,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     rt_inputs = [
         array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
     ]
-    concatenated = ragged_array_ops.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     with self.assertRaisesRegexp(error, message):
       self.evaluate(concatenated)
 
@@ -307,7 +307,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     ]
     self.assertRaisesRegexp(
         ValueError, r'axis may only be negative if ndims is statically known.',
-        ragged_array_ops.concat, rt_inputs, -1)
+        ragged_concat_ops.concat, rt_inputs, -1)
 
   def testSingleTensorInput(self):
     """Tests ragged_concat with a single tensor input.
@@ -317,7 +317,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     returns that tensor.  This test exercises that path.
     """
     rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
-    concatenated = ragged_array_ops.concat(rt_inputs, 0)
+    concatenated = ragged_concat_ops.concat(rt_inputs, 0)
     self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f86b05e178a98f5c0afa9c201f83bb652ad8deb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py
@@ -0,0 +1,302 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Concat and stack operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+def concat(values, axis, name=None):
+  """Concatenates potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to concatenate.
+      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.concat([t1, t2], axis=0)
+    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
+    >>> ragged.concat([t1, t2], axis=1)
+    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
+
+
+def stack(values, axis=0, name=None):
+  """Stacks potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  list `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to stack.
+      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K+1`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.stack([t1, t2], axis=0)
+    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
+    >>> ragged.stack([t1, t2], axis=1)
+    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
+
+
+def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
+  """Helper function to concatenate or stack ragged tensors.
+
+  Args:
+    rt_inputs: A list of RaggedTensors or Tensors to combine.
+    axis: The axis along which to concatenate or stack.
+    stack_values: A boolean -- if true, then stack values; otherwise,
+      concatenate them.
+
+  Returns:
+    A RaggedTensor.
+  Raises:
+    ValueError: If rt_inputs is empty, or if axis is out of range.
+  """
+  # Validate parameters.
+  if not rt_inputs:
+    raise ValueError('rt_inputs may not be empty.')
+
+  # Convert input tensors.
+  rt_inputs = [
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
+          rt_input, name='rt_input') for rt_input in rt_inputs
+  ]
+
+  # Special case: if there's only one input, then return it as-is.
+  if len(rt_inputs) == 1:
+    if stack_values:
+      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
+    else:
+      return rt_inputs[0]
+
+  # Check the rank (number of dimensions) of the input tensors.
+  ndims = None
+  for rt in rt_inputs:
+    if ndims is None:
+      ndims = rt.shape.ndims
+    else:
+      rt.shape.assert_has_rank(ndims)
+
+  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
+  axis = ragged_util.get_positive_axis(axis, out_ndims)
+
+  # If all the inputs are Tensors, and we're combining the final dimension,
+  # then we can delegate to the tf.stack/tf.concat operation, and return a
+  # Tensor.
+  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
+    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
+      if stack_values:
+        return array_ops.stack(rt_inputs, axis)
+      else:
+        return array_ops.concat(rt_inputs, axis)
+
+  # Convert any Tensor inputs to RaggedTensors.  This makes it
+  # possible to concatenate Tensors and RaggedTensors together.
+  for i in range(len(rt_inputs)):
+    if not ragged_tensor.is_ragged(rt_inputs[i]):
+      rt_inputs[i] = ragged_conversion_ops.from_tensor(
+          rt_inputs[i], ragged_rank=1)
+
+  # Convert the input tensors to all have the same ragged_rank.
+  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+
+  if axis == 0:
+    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
+  elif axis == 1:
+    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
+  else:  # axis > 1: recurse.
+    values = [rt.values for rt in rt_inputs]
+    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
+    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          _ragged_stack_concat_helper(values, axis - 1, stack_values),
+          splits[0][0])
+
+
+def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 0.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  # Concatenate the inner values together.
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
+
+  # Concatenate the splits together for each ragged dimension (adjusting
+  # split offsets as necessary).
+  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
+  ragged_rank = rt_inputs[0].ragged_rank
+  concatenated_nested_splits = [
+      _concat_ragged_splits([ns[dim]
+                             for ns in nested_splits])
+      for dim in range(ragged_rank)
+  ]
+
+  # If we are performing a stack operation, then add another splits.
+  if stack_values:
+    stack_lengths = array_ops.stack([rt.nrows() for rt in rt_inputs])
+    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
+    concatenated_nested_splits.insert(0, stack_splits)
+
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
+
+
+def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 1.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  num_inputs = len(rt_inputs)
+
+  rt_nrows = rt_inputs[0].nrows()
+  nrows_msg = 'Input tensors have incompatible shapes.'
+  nrows_checks = [
+      check_ops.assert_equal(rt.nrows(), rt_nrows, message=nrows_msg)
+      for rt in rt_inputs[1:]
+  ]
+
+  with ops.control_dependencies(nrows_checks):
+    # Concatentate the inputs together to put them in a single ragged tensor.
+    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
+
+    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
+    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
+    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
+    #                      ...,
+    #                  rt_inputs[0][M], ..., rt_input[N][M]]
+    # where `N=num_inputs-1` and `M=rt_nrows-1`.
+    row_indices = math_ops.range(rt_nrows * num_inputs)
+    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
+    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
+    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
+    permuted_rt = ragged_gather_ops.gather(concatenated_rt, row_permutation)
+
+    if stack_values:
+      # Add a new splits tensor to group together the values.
+      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
+      _copy_row_shape(rt_inputs, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
+    else:
+      # Merge together adjacent rows by dropping the row-split indices that
+      # separate them.
+      concat_splits = permuted_rt.row_splits[::num_inputs]
+      _copy_row_shape(rt_inputs, concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
+
+
+def _copy_row_shape(rt_inputs, splits):
+  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
+  for rt in rt_inputs:
+    if rt.shape[0] is not None:
+      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
+
+
+def _increase_ragged_rank_to(rt_input, ragged_rank):
+  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
+  if ragged_rank > 0:
+    if not ragged_tensor.is_ragged(rt_input):
+      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+    if rt_input.ragged_rank < ragged_rank:
+      rt_input = rt_input.with_values(
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+  return rt_input
+
+
+def _concat_ragged_splits(splits_list):
+  """Concatenates a list of RaggedTensor splits to form a single splits."""
+  pieces = [splits_list[0]]
+  splits_offset = splits_list[0][-1]
+  for splits in splits_list[1:]:
+    pieces.append(splits[1:] + splits_offset)
+    splits_offset += splits[-1]
+  return array_ops.concat(pieces, axis=0)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index 5284750198caf8699b894db6ccf4342824d033ce..3bda777482bf0965939c0a6a6d1a82c95d669aaf 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -32,10 +32,14 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.util import dispatch
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
@@ -401,7 +405,7 @@ _V1_OPS_THAT_DELEGATE_TO_V2_OPS = [
 
 def _ragged_gather_v1(params, indices, validate_indices=None, name=None,
                       axis=0, batch_dims=0):
-  return ragged_array_ops.gather(
+  return ragged_gather_ops.gather(
       params=params,
       indices=indices,
       validate_indices=validate_indices,
@@ -422,20 +426,20 @@ def _ragged_size_v1(input, name=None, out_type=dtypes.int32):  # pylint: disable
 
 # (original_op, ragged_op, ragged_args)
 _RAGGED_DISPATCH_OPS = [
-    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+    (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather,
      ['params', 'indices']),
-    (array_ops.concat, ragged_array_ops.concat, ['[values]']),
+    (array_ops.concat, ragged_concat_ops.concat, ['[values]']),
     (array_ops.expand_dims, _ragged_expand_dims_v1, ['input']),
     (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
     (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
-    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
-    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
+    (array_ops.gather_v2, ragged_gather_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_gather_ops.gather_nd, ['params', 'indices']),
     (array_ops.rank, ragged_array_ops.rank, ['input']),
     (array_ops.size, _ragged_size_v1, ['input']),
     (array_ops.size_v2, ragged_array_ops.size, ['input']),
-    (array_ops.stack, ragged_array_ops.stack, ['[values]']),
+    (array_ops.stack, ragged_concat_ops.stack, ['[values]']),
     (array_ops.tile, ragged_array_ops.tile, ['input']),
-    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (array_ops.where, ragged_where_op.where, ['condition', 'x', 'y']),
     (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
      ['data', 'segment_ids']),
     (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
@@ -466,7 +470,8 @@ def register_dispatchers():
       _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
   for op in op_list:
     _, undecorated_op = tf_decorator.unwrap(op)
-    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+    if not hasattr(undecorated_op,
+                   tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names):
       raise AssertionError('Expected %s to be an exported symbol '
                            '(while adding a RaggedTensor dispatcher)')
 
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
index 68d3953f4cdf31458fc75397522b3f9fc8960098..d4e1faf888fe0912e6501d7c5a61732359a3b39f 100644
--- a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -45,6 +45,13 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaggedEqual(
         RaggedTensor.from_tensor(dt, padding=0), [[5, 7], [0, 3], [6]])
 
+    dt_3d = constant_op.constant([[[5, 0], [7, 0], [0, 0]],
+                                  [[0, 0], [3, 0], [0, 0]],
+                                  [[6, 0], [0, 0], [0, 0]]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt_3d, lengths=([2, 0, 3], [1, 1, 2, 0, 1])),
+        [[[5], [7]], [], [[6, 0], [], [0]]])
+
   @parameterized.parameters(
       # 2D test cases, no length or padding.
       {
@@ -251,6 +258,24 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
           'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
                        [[[5, 6], [7, 0]], [[0, 8]]]]
       },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'lengths': ([2, 2], [1, 2, 2, 1]),
+          'expected': [[[[1, 0]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8]]]],
+          'ragged_rank': 2,
+          'use_ragged_rank': False
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'lengths': [[2, 2], [1, 2, 2, 1]],
+          'expected': [[[[1, 0]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8]]]],
+          'ragged_rank': 2,
+          'use_ragged_rank': False
+      },
       {
           'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
                      [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
@@ -259,15 +284,37 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
           'expected': [[[[1], [2, 3]], [[], [4]]],
                        [[[5, 6], [7]], [[0, 8], []]]]
       },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'lengths': ([2, 2], [2, 2, 2, 2], [1, 2, 0, 1, 2, 1, 2, 0]),
+          'expected': [[[[1], [2, 3]], [[], [4]]],
+                       [[[5, 6], [7]], [[0, 8], []]]],
+          'ragged_rank': 3,
+          'use_ragged_rank': False
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'lengths': [[2, 2], [2, 2, 2, 2], [1, 2, 0, 1, 2, 1, 2, 0]],
+          'expected': [[[[1], [2, 3]], [[], [4]]],
+                       [[[5, 6], [7]], [[0, 8], []]]],
+          'ragged_rank': 3,
+          'use_ragged_rank': False
+      },
   )  # pyformat: disable
   def testRaggedFromTensor(self,
                            tensor,
                            expected,
                            lengths=None,
                            padding=None,
-                           ragged_rank=1):
+                           ragged_rank=1,
+                           use_ragged_rank=True):
     dt = constant_op.constant(tensor)
-    rt = RaggedTensor.from_tensor(dt, lengths, padding, ragged_rank)
+    if use_ragged_rank:
+      rt = RaggedTensor.from_tensor(dt, lengths, padding, ragged_rank)
+    else:
+      rt = RaggedTensor.from_tensor(dt, lengths, padding)
     self.assertEqual(type(rt), RaggedTensor)
     self.assertEqual(rt.ragged_rank, ragged_rank)
     self.assertTrue(
@@ -411,6 +458,11 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
           'lengths': [0.5],
           'error': (TypeError, 'lengths must be an integer tensor')
       },
+      {
+          'tensor': [[1, 2, 3]],
+          'lengths': [[1], [1]],
+          'error': (ValueError, r'Shape \(1, 3\) must have rank at least 3')
+      },
       {
           'tensor': [[1]],
           'padding': 'a',
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
index d4bffeb401656b02a48a36eb0383850656506fc4..8e44368d4752ed01410de762b7cbda134ebfaa60 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.gather_nd."""
+"""Tests for ragged_gather_ops.gather_nd."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,8 +26,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -201,7 +201,7 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
           expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
   ])  # pyformat: disable
   def testRaggedGatherNd(self, descr, params, indices, expected):
-    result = ragged_array_ops.gather_nd(params, indices)
+    result = ragged_gather_ops.gather_nd(params, indices)
     self.assertRaggedEqual(result, expected)
 
   def testRaggedGatherNdUnknownRankError(self):
@@ -213,10 +213,10 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
 
     with self.assertRaisesRegexp(ValueError,
                                  'indices.rank be statically known.'):
-      ragged_array_ops.gather_nd(params, indices1)
+      ragged_gather_ops.gather_nd(params, indices1)
     with self.assertRaisesRegexp(
         ValueError, r'indices.shape\[-1\] must be statically known.'):
-      ragged_array_ops.gather_nd(params, indices2)
+      ragged_gather_ops.gather_nd(params, indices2)
 
   @parameterized.parameters([
       dict(
@@ -238,7 +238,7 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
                                     message=None,
                                     error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.gather_nd(params, indices)
+      ragged_gather_ops.gather_nd(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
index 9914b56448868b21058cdb50cda17d63676c4f23..eb64bb4ad1685dc1c9c850c4a9c9ef36e9ffa23f 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -17,7 +17,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -25,8 +24,8 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -41,35 +40,35 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                                  ['e']])
     ragged_indices = ragged_factory_ops.constant([[3, 1, 2], [1], [], [0]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, ragged_indices),
+        ragged_gather_ops.gather(params, ragged_indices),
         [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(ragged_params, indices),
+        ragged_gather_ops.gather(ragged_params, indices),
         [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(ragged_params, ragged_indices),
+        ragged_gather_ops.gather(ragged_params, ragged_indices),
         [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
 
   def testTensorParamsAndTensorIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
-    self.assertIsInstance(ragged_array_ops.gather(params, indices), ops.Tensor)
+        ragged_gather_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged_gather_ops.gather(params, indices), ops.Tensor)
 
   def testRaggedParamsAndTensorIndices(self):
     params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
                                           [], ['g']])
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
 
   def testTensorParamsAndRaggedIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
 
   def testRaggedParamsAndRaggedIndices(self):
@@ -77,7 +76,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                           [], ['g']])
     indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
          [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
          [[]]]                                        #  [p[3]            ]]
@@ -88,14 +87,14 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
                                           [], ['g']])
     indices = 1
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices), [b'c', b'd', b'e'])
+        ragged_gather_ops.gather(params, indices), [b'c', b'd', b'e'])
 
   def test3DRaggedParamsAnd2DTensorIndices(self):
     params = ragged_factory_ops.constant([[['a', 'b'], []],
                                           [['c', 'd'], ['e'], ['f']], [['g']]])
     indices = [[1, 2], [0, 1], [2, 2]]
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
          [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
          [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
@@ -109,7 +108,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
         inner_shape=(2,))
     params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
     self.assertRaggedEqual(
-        ragged_array_ops.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'd', b'e'], [b'a', b'g']], []],
          [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
          [[[b'b', b'a']]]])  # pyformat: disable
@@ -121,13 +120,13 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
     ragged_indices = ragged_factory_ops.constant([[0, 3]])
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 3\)'):
-      self.evaluate(ragged_array_ops.gather(tensor_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(tensor_params, ragged_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[2\] = 2 is not in \[0, 2\)'):
-      self.evaluate(ragged_array_ops.gather(ragged_params, tensor_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, tensor_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 2\)'):
-      self.evaluate(ragged_array_ops.gather(ragged_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, ragged_indices))
 
   def testUnknownIndicesRankError(self):
     if context.executing_eagerly():
@@ -137,7 +136,7 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
     indices = array_ops.placeholder_with_default(indices, None)
     self.assertRaisesRegexp(ValueError,
                             r'indices\.shape\.ndims must be known statically',
-                            ragged_array_ops.gather, params, indices)
+                            ragged_gather_ops.gather, params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_ops.py b/tensorflow/python/ops/ragged/ragged_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b259df30e8a30796c9decd4db1a2b81db710173f
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_ops.py
@@ -0,0 +1,258 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_ragged_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+#===============================================================================
+# ragged_gather
+#===============================================================================
+# TODO(edloper): Add an `axis` argument
+def gather(params, indices, validate_indices=None, axis=0, batch_dims=0,
+           name=None):
+  """Gathers ragged slices from `params` axis `0` according to `indices`.
+
+  Returns `RaggedTensor` output, such that:
+
+  ```python
+  output.shape = indices.shape + params.shape[1:]
+  output.ragged_rank = indices.shape.ndims + params.ragged_rank
+  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+  ```
+
+  `params` may be ragged.  `indices` may be ragged.
+  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
+  then an error is returned.
+
+  Examples:
+
+  ```python
+  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
+  >>> indices = tf.constant([3, 1, 2, 1, 0])
+  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
+
+  >>> print ragged.gather(params, ragged_indices)
+  [['d', 'b', 'c'], ['b'], [], ['a']]
+
+  >>> print ragged.gather(ragged_params, indices)
+  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
+
+  >>> print ragged.gather(ragged_params, ragged_indices)
+  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
+  ```
+
+  Args:
+    params: The potentially ragged tensor from which to gather values. Must be
+      at least rank 1.
+    indices: The potentially ragged tensor indicating which values to gather.
+      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
+      params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
+    batch_dims: Must be zero.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor`, where `output.dtype=params.dtype` and
+    `output.shape=indices.shape + params.shape[1:]` and
+    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
+
+  Raises:
+    ValueError: If indices.shape.ndims is not known statically.
+  """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis != 0 is not supported for ragged gather yet.')
+  if not isinstance(batch_dims, int) or batch_dims != 0:
+    raise ValueError('batch_dims != 0 is not supported for ragged gather yet.')
+  with ops.name_scope(name, 'RaggedGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+
+    if ragged_tensor.is_ragged(indices):
+      return indices.with_values(gather(params, indices.values))
+
+    if not ragged_tensor.is_ragged(params):
+      return array_ops.gather(params, indices)
+
+    indices = ops.convert_to_tensor(indices)
+    if indices.shape.ndims is None:
+      raise ValueError('indices.shape.ndims must be known statically')
+
+    result = gen_ragged_array_ops.ragged_gather(
+        indices=indices,
+        params_dense_values=params.flat_values,
+        params_nested_splits=params.nested_row_splits,
+        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
+        1)
+
+    # Compose the RaggedTensor from splits & values.
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        result.output_dense_values, result.output_nested_splits)
+
+
+#===============================================================================
+# ragged.gather_nd
+#===============================================================================
+def gather_nd(params, indices, name=None):
+  """Gather slices from `params` using `n`-dimensional indices.
+
+  This operation is similar to `gather`, but it uses the innermost dimension
+  of `indices` to define a slice into `params`.  In particular, if:
+
+  * `indices` has shape `[A1...AN, I]`
+  * `params` has shape `[B1...BM]`
+
+  Then:
+
+  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
+  * `result[a1...aN] = params[indices[a1...aN, :]]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[A1...AN, I]`.
+    indices: A potentially ragged tensor with shape `[B1...BM]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
+
+  #### Examples:
+    ```python
+    >>> params = tf.ragged.constant_value(
+    ...     [ [ ['000', '001'], ['010'              ]          ],
+    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
+    ...       [ [            ], ['210'              ]          ] ])
+
+    >>> # Gather 2D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2], [0]])
+    [ [ [            ], ['210'] ]
+      [ ['000', '001'], ['010'] ] ]
+
+    >>> # Gather 1D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
+    [['210'], ['000', '001']]
+
+    >>> # Gather scalars from a 3D tensor
+    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
+    ['001', '112']
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.gather_nd(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
+
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_shape = indices.shape
+    indices_ndims = indices_shape.ndims
+    if indices_ndims is None:
+      raise ValueError('indices.rank be statically known.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+    if (ragged_tensor.is_ragged(indices) and
+        indices_ndims == indices.ragged_rank + 1):
+      raise ValueError('The innermost dimension of indices may not be ragged')
+
+    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
+    # that each index slices into.
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
+    if index_size is None:
+      raise ValueError('indices.shape[-1] must be statically known.')
+
+    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
+    # dense, then we convert it to ragged before recursing, and then convert
+    # the result back to `dense` if appropriate.
+    if indices_ndims > 2:
+      indices_is_dense = not ragged_tensor.is_ragged(indices)
+      if indices_is_dense:
+        indices = ragged_conversion_ops.from_tensor(
+            indices, ragged_rank=indices_ndims - 2)
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
+      if (indices_is_dense and ragged_tensor.is_ragged(result) and
+          result.ragged_rank == indices_ndims - 2):
+        result = ragged_conversion_ops.to_tensor(result)
+      return result
+
+    # indices_ndims <= 2, and the innermost dimension of indices may not be
+    # ragged, so `indices` must not be ragged.
+    assert not ragged_tensor.is_ragged(indices)
+    assert ragged_tensor.is_ragged(params)
+
+    # Handle corner case: An empty index tuple selects the entire `params`
+    # value.  So if `index_size` is zero, then tile `params`.
+    if index_size == 0:
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
+      for dim in range(indices_ndims - 1):
+        params = ragged_array_ops.expand_dims(params, axis=0)
+      multiples = array_ops.concat([
+          array_ops.shape(indices)[:-1],
+          array_ops.ones([params_ndims], dtypes.int32)
+      ],
+                                   axis=0)
+      return ragged_array_ops.tile(params, multiples)
+
+    # When index_size=1, we can just flatten the index tuples and use gather.
+    elif index_size == 1:
+      flattened_index_tuples = array_ops.reshape(indices, [-1])
+      return gather(params, flattened_index_tuples)
+
+    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
+    # Flatten both the index tuples and the params, such that the flattened
+    # index tuples point to the correct values in the flattened params; and
+    # then use ragged.gather on the flattened index tuples & params.
+    else:
+      indices = math_ops.cast(indices, dtypes.int64)
+
+      # Flatten the outermost 2 dimensions of the index tuples & params.
+      flattened_index_tuples = array_ops.gather(params.row_splits,
+                                                indices[..., 0])
+      flattened_index_tuples += indices[..., 1]
+      flattened_params = params.values
+
+      # Flatten any remaining dimensions.
+      for dim in range(2, index_size):
+        if not ragged_tensor.is_ragged(flattened_params):
+          flattened_index_tuples = array_ops.expand_dims(
+              flattened_index_tuples, axis=1)
+          flattened_index_tuples = array_ops.concat(
+              [flattened_index_tuples, indices[..., dim:]], axis=1)
+          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
+
+        flattened_index_tuples = array_ops.gather(
+            flattened_params.row_starts(), flattened_index_tuples)
+        flattened_index_tuples += indices[..., dim]
+        flattened_params = flattened_params.values
+
+      # Gather using the flattened index tuples and params.
+      return gather(flattened_params, flattened_index_tuples)
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index 001a400596597bb0efb9b847184abd54e757f1d5..d01cf67139b397977c30817fa515f5e30050b25b 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -18,12 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
@@ -150,6 +151,27 @@ def _ragged_getitem(rt_input, key_list):
   else:
     starts = rt_input.row_splits[:-1]
     limits = rt_input.row_splits[1:]
+    if context.executing_eagerly():
+      # In python, __getitem__ should throw IndexError for out of bound
+      # indices. This will allow iteration run correctly as python will
+      # translate IndexError into StopIteration for next()/__next__().
+      # Below is an example:
+      #    import tensorflow as tf
+      #    r = tf.ragged.constant([[1., 2.], [3., 4., 5.], [6.]])
+      #    for elem in r:
+      #      print(elem)
+      # In non eager mode, the exception is thrown when session runs
+      # so we don't know if out of bound happens before.
+      # In eager mode, however, it is possible to find out when to
+      # throw out of bound IndexError.
+      # In the following row_key >= len(starts) is checked. In case of
+      # TypeError which happens when row_key is not an integer, the exception
+      # will simply be ignored as it will be processed later anyway.
+      try:
+        if int(row_key) >= len(starts):
+          raise IndexError("Row key {} out of bounds".format(row_key))
+      except (TypeError, ValueError):
+        pass
     row = rt_input.values[starts[row_key]:limits[row_key]]
     return row.__getitem__(inner_keys)
 
@@ -344,7 +366,7 @@ def _build_ragged_tensor_from_value_ranges(starts, limits, step, values):
 
   # Use `ragged_gather` or `array_ops.gather` to collect the values.
   if isinstance(values, ragged_tensor.RaggedTensor):
-    gathered_values = ragged_array_ops.gather(
+    gathered_values = ragged_gather_ops.gather(
         params=values, indices=value_indices.values)
   else:
     gathered_values = array_ops.gather(
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index fbe188bd1a305c1b366461528139bfcbb85b6367..1d342512c0206c8877f8e669c9e3df78d784fd8f 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional operations.
-
-See the [Higher Order
-Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
-"""
+"""Functional operations for RaggedTensors."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
index b2b96a946593ce46b1843d7283baae344ff060da..e52ad4de20cd8697c7772123627f32d2b980b720 100644
--- a/tensorflow/python/ops/ragged/ragged_stack_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged_array_ops.stack."""
+"""Tests for ragged_concat_ops.stack."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,7 +22,7 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
@@ -330,7 +330,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
-    stacked = ragged_array_ops.stack(rt_inputs, axis)
+    stacked = ragged_concat_ops.stack(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -360,7 +360,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
           message='axis=3 out of bounds: expected -3<=axis<3'),
   )
   def testError(self, rt_inputs, axis, error, message):
-    self.assertRaisesRegexp(error, message, ragged_array_ops.stack, rt_inputs,
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.stack, rt_inputs,
                             axis)
 
   def testSingleTensorInput(self):
@@ -371,7 +371,7 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
     equivalent to expand_dims(axis=0).  This test exercises that path.
     """
     rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
-    stacked = ragged_array_ops.stack(rt_inputs, 0)
+    stacked = ragged_concat_ops.stack(rt_inputs, 0)
     self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index fd334e6cc713d3cc3e94a84e9f7f7bdc813e0a7b..27438ff6bd13e93e3bf9e114e14b2eb3697fe437 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.client import session
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -45,7 +46,7 @@ _eval_using_default_session = ops._eval_using_default_session
 
 
 @tf_export("RaggedTensor")
-class RaggedTensor(object):
+class RaggedTensor(composite_tensor.CompositeTensor):
   """Represents a ragged tensor.
 
   A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
@@ -1052,11 +1053,12 @@ class RaggedTensor(object):
 
     The set of absent/default values may be specified using a vector of lengths
     or a padding value (but not both).  If `lengths` is specified, then the
-    output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
-    If `padding` is specified, then any row *suffix* consisting entirely of
-    `padding` will be excluded from the returned `RaggedTensor`.  If neither
-    `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
-    have no absent/default values.
+    output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`. If
+    'lengths' is a list of lists or tuple of lists, those lists will be used
+    as nested row lengths. If `padding` is specified, then any row *suffix*
+    consisting entirely of `padding` will be excluded from the returned
+    `RaggedTensor`.  If neither `lengths` nor `padding` is specified, then the
+    returned `RaggedTensor` will have no absent/default values.
 
     Examples:
 
@@ -1064,10 +1066,17 @@ class RaggedTensor(object):
     >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
     >>> tf.RaggedTensor.from_tensor(dt)
     <tf.RaggedTensor [[5, 7, 0], [0, 3, 0], [6, 0, 0]]>
-    >>> tf.RaggedTensor.from_tensor(dt, lengths=[2, 0, 3])
-    <tf.RaggedTensor [[5, 7], [], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, lengths=[1, 0, 3])
+    <tf.RaggedTensor [[5], [], [6, 0, 0]]>
+
     >>> tf.RaggedTensor.from_tensor(dt, padding=0)
     <tf.RaggedTensor [[5, 7], [0, 3], [6]]>
+
+    >>> dt = tf.constant([[[5, 0], [7, 0], [0, 0]],
+                          [[0, 0], [3, 0], [0, 0]],
+                          [[6, 0], [0, 0], [0, 0]]])
+    >>> tf.RaggedTensor.from_tensor(dt, lengths=([2, 0, 3], [1, 1, 2, 0, 1]))
+    <tf.RaggedTensor [[[5], [7]], [], [[6, 0], [], [0]]]>
     ```
 
     Args:
@@ -1076,7 +1085,10 @@ class RaggedTensor(object):
       lengths: An optional set of row lengths, specified using a 1-D integer
         `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows
         in `tensor`).  If specified, then `output[row]` will contain
-        `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
+        `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero. You
+        may optionally pass a list or tuple of lengths to this argument, which
+        will be used as nested row lengths to construct a ragged tensor with
+        multiple ragged dimensions.
       padding: An optional padding value.  If specified, then any row suffix
         consisting entirely of `padding` will be excluded from the returned
         RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
@@ -1166,21 +1178,36 @@ class RaggedTensor(object):
                 math_ops.range(1, ncols + 1), 0))
         lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
 
-      # If we have lengths (either directly supplied, or computed from
-      # paddings), then use those to construct splits; and then use masking
-      # to get the corresponding values.
       if lengths is not None:
-        lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
-                                                    dtypes.int64)
-        lengths.shape.assert_has_rank(1)
-        lengths = math_ops.minimum(lengths, ncols)
-        lengths = math_ops.maximum(lengths, 0)
-        limits = math_ops.cumsum(lengths)
-        splits = array_ops.concat([array_ops.zeros([1], dtypes.int64), limits],
-                                  axis=0)
-        mask = array_ops.sequence_mask(lengths, maxlen=ncols)
-        values = array_ops.boolean_mask(tensor, mask)
-        return cls.from_row_splits(values, splits)
+        if isinstance(lengths,
+                      (list, tuple)) and len(lengths) and not isinstance(
+                          lengths[0], (int, float)):
+          # In this case, we've been given nested row lengths. Rather than
+          # reconstructing the tensor mask directly, we can recreate it as
+          # a boolean RaggedTensor, then densify that and use that as the
+          # mask to clear out the unused data in the passed tensor.
+          tensor.shape.with_rank_at_least(len(lengths) + 1)
+          num_tokens = math_ops.reduce_sum(lengths[-1])
+          ones_mask = array_ops.ones([num_tokens], dtype=dtypes.bool)
+          ragged_mask = cls.from_nested_row_lengths(ones_mask, lengths)
+          dense_ragged_mask = ragged_mask.to_tensor(default_value=False)
+          masked_data = array_ops.boolean_mask(tensor, dense_ragged_mask)
+          return cls.from_nested_row_lengths(masked_data, lengths)
+        else:
+          # If we have lengths (either directly supplied, or computed from
+          # paddings), then use those to construct splits; and then use masking
+          # to get the corresponding values.
+          lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
+                                                      dtypes.int64)
+          lengths.shape.assert_has_rank(1)
+          lengths = math_ops.minimum(lengths, ncols)
+          lengths = math_ops.maximum(lengths, 0)
+          limits = math_ops.cumsum(lengths)
+          splits = array_ops.concat(
+              [array_ops.zeros([1], dtypes.int64), limits], axis=0)
+          mask = array_ops.sequence_mask(lengths, maxlen=ncols)
+          values = array_ops.boolean_mask(tensor, mask)
+          return cls.from_row_splits(values, splits)
 
       # If neither padding nor lengths were specified, then create a splits
       # vector that contains no default values, and reshape the input tensor
@@ -1437,6 +1464,53 @@ class RaggedTensor(object):
       values = values.values
     return values
 
+  #=============================================================================
+  # Composite Tensor
+  #=============================================================================
+
+  def _to_components(self):
+    return (self.flat_values,) + self.nested_row_splits
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls.from_nested_row_splits(components[0], components[1:])
+
+  def _shape_invariant_to_components(self, shape=None):
+    ragged_rank = self.ragged_rank
+    flat_values = self.flat_values
+
+    if shape is None:
+      # Default shape invariant
+      value_shape = flat_values.shape[1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      return ((values_shape, self._row_splits.shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+    else:
+      # Explicitly specified shape invariant
+      if shape.ndims is not None and shape.ndims <= ragged_rank:
+        raise ValueError("Shape invariant %s does not have sufficient rank "
+                         "for a RaggedTensor with %d ragged dimensions." %
+                         (shape, self.ragged_rank))
+      if any(tensor_shape.dimension_value(shape[dim]) is not None
+             for dim in range(1, self.ragged_rank + 1)):
+        raise ValueError("Shape invariant dimension size must be None for "
+                         "ragged dimenions.")
+      nrows = tensor_shape.dimension_value(shape[0])
+      value_shape = shape[self.ragged_rank + 1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      if nrows is None:
+        outer_splits_shape = tensor_shape.TensorShape([None])
+      else:
+        outer_splits_shape = tensor_shape.TensorShape([nrows + 1])
+      return ((values_shape, outer_splits_shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 def is_ragged(value):
   """Returns true if `value` is a ragged tensor or ragged tensor value."""
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index 89691b015d76dbd35d0a9f5db2f2a0ab431147b1..5d55afcc6b3a8720896e8c24c66d78586441debb 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -828,14 +828,14 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
   # pylint: disable=invalid-slice-index
   @parameterized.parameters(
       # Tests for out-of-bound errors
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-6],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 2],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[3, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-6], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
 
       # Indexing into an inner ragged dimension
       (SLICE_BUILDER[:, 3], ValueError,
@@ -953,14 +953,15 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
        'Cannot index into an inner ragged dimension.'),
 
       # Test for out-of-bounds errors.
-      (SLICE_BUILDER[1, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[1, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
       (SLICE_BUILDER[0, 0, 3],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+       (IndexError, ValueError,
+        errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
                                                    message):
@@ -982,10 +983,10 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
-      (SLICE_BUILDER[0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-1],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-1], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
                                                    message):
@@ -1207,5 +1208,28 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
       res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
       self.assertAllEqual(res2, [15, 7])
 
+  # Test case for GitHub issue 24679.
+  def testEagerForLoop(self):
+    if not context.executing_eagerly():
+      return
+
+    values = [[1., 2.], [3., 4., 5.], [6.]]
+    r = ragged_factory_ops.constant(values)
+    i = 0
+    for elem in r:
+      self.assertAllEqual(elem, values[i])
+      i += 1
+
+  def testConsumers(self):
+    if context.executing_eagerly():
+      return
+
+    a = RaggedTensor.from_row_splits(
+        array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
+        array_ops.placeholder(dtypes.int64, name='a.row_splits'))
+    ragged_math_ops.reduce_sum(a)
+    self.assertLen(a.consumers(), 1)
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_where_op.py b/tensorflow/python/ops/ragged/ragged_where_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d60ee49ee8adb2e4b117f9009bd602ab36f84046
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_where_op.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""where operation for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def where(condition, x=None, y=None, name=None):
+  """Return the elements, either from `x` or `y`, depending on the `condition`.
+
+  : If both `x` and `y` are `None`:
+    Returns the coordinates of true elements of `condition`. The coordinates
+    are returned in a 2-D tensor with shape
+    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
+    coordinates of the `i`th true value (in row-major order).
+
+  : If both `x` and `y` are non-`None`:
+    Returns a tensor formed by selecting values from `x` where condition is
+    true, and from `y` when condition is false.  In particular:
+
+    : If `condition`, `x`, and `y` all have the same shape:
+
+      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
+      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+
+    : Otherwise:
+
+      * `condition` must be a vector.
+      * `x` and `y` must have the same number of dimensions.
+      * The outermost dimensions of `condition`, `x`, and `y` must all have the
+        same size.
+      * `result[i] = x[i]` if `condition[i]` is true.
+      * `result[i] = y[i]` if `condition[i]` is false.
+
+  Args:
+    condition: A potentially ragged tensor of type `bool`
+    x: A potentially ragged tensor (optional).
+    y: A potentially ragged tensor (optional).  Must be specified if `x` is
+      specified.  Must have the same rank and type as `x`.
+    name: A name of the operation (optional)
+
+  Returns:
+    : If both `x` and `y` are `None`:
+      A `Tensor` with shape `(num_true, dim_size(condition))`.
+    : Otherwise:
+      A potentially ragged tensor with the same type, rank, and outermost
+      dimension size as `x` and `y`.
+      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+
+  Raises:
+    ValueError: When exactly one of `x` or `y` is non-`None`; or when
+      `condition`, `x`, and `y` have incompatible shapes.
+
+  #### Examples:
+    ```python
+    >>> # Coordinates where condition is true.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> ragged.where(condition)
+    [[0, 0], [0, 2], [1, 1]]
+
+    >>> # Elementwise selection between x and y, based on condition.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'b', 'C'], ['d', 'E']]
+
+    >>> # Row selection between x and y, based on condition.
+    >>> condition = [True, False]
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'B', 'C'], ['d', 'e']]
+    ```
+  """
+  if (x is None) != (y is None):
+    raise ValueError('x and y must be either both None or both non-None')
+  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        condition, name='condition')
+    if x is None:
+      return _coordinate_where(condition)
+    else:
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
+      return _elementwise_where(condition, x, y)
+
+
+def _elementwise_where(condition, x, y):
+  """Ragged version of tf.where(condition, x, y)."""
+  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
+  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
+  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
+
+  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
+    return array_ops.where(condition, x, y)
+
+  elif condition_is_ragged and x_is_ragged and y_is_ragged:
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
+  elif not condition_is_ragged:
+    # Concatenate x and y, and then use `gather` to assemble the selected rows.
+    condition.shape.assert_has_rank(1)
+    x_nrows = _nrows(x)
+    x_and_y = ragged_concat_ops.concat([x, y], axis=0)
+    indices = array_ops.where(condition, math_ops.range(x_nrows),
+                              x_nrows + math_ops.range(_nrows(y)))
+    return ragged_gather_ops.gather(x_and_y, indices)
+
+  else:
+    raise ValueError('Input shapes do not match.')
+
+
+def _coordinate_where(condition):
+  """Ragged version of tf.where(condition)."""
+  if not isinstance(condition, ragged_tensor.RaggedTensor):
+    return array_ops.where(condition)
+
+  # The coordinate for each `true` value in condition.values.
+  selected_coords = _coordinate_where(condition.values)
+
+  # Convert the first index in each coordinate to a row index and column index.
+  first_index = selected_coords[:, 0]
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
+  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
+  selected_cols = first_index - selected_row_starts
+
+  # Assemble the row & column index with the indices for inner dimensions.
+  return array_ops.concat([
+      array_ops.expand_dims(selected_rows, 1),
+      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
+  ],
+                          axis=1)
+
+
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index 3dd95658265de90a71f59ab4ae7c38ad80579cec..e76a04072a5ae0f593a9897105962305a38c39bf 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -17,13 +17,11 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from absl.testing import parameterized
-
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.platform import googletest
 
 
@@ -182,7 +180,7 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
               [[[[], [b'A']]], [[[b'b']]]])),
   ])   # pyformat: disable
   def testRaggedWhere(self, condition, expected, x=None, y=None):
-    result = ragged_array_ops.where(condition, x, y)
+    result = ragged_where_op.where(condition, x, y)
     self.assertRaggedEqual(result, expected)
 
   @parameterized.parameters([
@@ -201,7 +199,7 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
   ])
   def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
     with self.assertRaisesRegexp(error, message):
-      ragged_array_ops.where(condition, x, y)
+      ragged_where_op.where(condition, x, y)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
index 42dc13223b67e2505578baefb783bc81182ec150..31e26e7c9d8b913e538b284654aa80d80c55150b 100644
--- a/tensorflow/python/ops/ragged/segment_id_ops.py
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -29,7 +29,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.row_splits_to_segment_ids")
 def row_splits_to_segment_ids(splits, name=None):
   """Generates the segmentation corresponding to a RaggedTensor `row_splits`.
@@ -64,7 +64,7 @@ def row_splits_to_segment_ids(splits, name=None):
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
 @tf_export("ragged.segment_ids_to_row_splits")
 def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
   """Generates the RaggedTensor `row_splits` corresponding to a segmentation.
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 62e2f6d1025bb9802a5b2a09a4dbffbe15921ace..cd332ede178c8704a3e77e69b4bac970d3728d19 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -329,7 +329,7 @@ def random_crop(value, size, seed=None, name=None):
 
 @tf_export(v1=["random.multinomial", "multinomial"])
 @deprecation.deprecated(
-    date=None, instructions="Use tf.random.categorical instead.")
+    date=None, instructions="Use `tf.random.categorical` instead.")
 def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
diff --git a/tensorflow/python/ops/raw_ops_test.py b/tensorflow/python/ops/raw_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff94f5c25ae8c9b0c398f629d2accff88a56be5
--- /dev/null
+++ b/tensorflow/python/ops/raw_ops_test.py
@@ -0,0 +1,64 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Raw ops tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RawOpsTest(test.TestCase):
+
+  def testSimple(self):
+    x = constant_op.constant(1)
+    self.assertEqual([2], self.evaluate(gen_math_ops.Add(x=x, y=x)))
+
+  def testRequiresKwargs(self):
+    with self.assertRaisesRegexp(TypeError, "only takes keyword args"):
+      gen_math_ops.Add(1., 1.)
+
+  def testRequiresKwargs_providesSuggestion(self):
+    msg = "possible keys: \\['x', 'y', 'name'\\]"
+    with self.assertRaisesRegexp(TypeError, msg):
+      gen_math_ops.Add(1., y=2.)
+
+  def testName(self):
+    x = constant_op.constant(1)
+    op = gen_math_ops.Add(x=x, y=x, name="double")
+    if not context.executing_eagerly():
+      # `Tensor.name` is not available in eager.
+      self.assertEqual(op.name, "double:0")
+
+  def testDoc(self):
+    self.assertEqual(gen_math_ops.add.__doc__, gen_math_ops.Add.__doc__)
+
+  def testDefaults(self):
+    x = constant_op.constant([[True]])
+    self.assertAllClose(
+        gen_math_ops.Any(input=x, axis=0),
+        gen_math_ops.Any(input=x, axis=0, keep_dims=False))
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index c8f20029acb64b16ecf2cab151929857b41ddc22..a77afc9e6b0882e6d881292813f96c2a718fcf6e 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -43,7 +43,7 @@ from tensorflow.python.ops import variables
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_resource_variable_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 
@@ -58,8 +58,118 @@ def get_resource_handle_data(graph_op):
       compat.as_bytes(handle_data))
 
 
-def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
-  """Creates a variable handle with information to do shape inference."""
+def get_eager_safe_handle_data(handle):
+  """Get the data handle from the Tensor `handle`."""
+  assert isinstance(handle, ops.Tensor)
+
+  if isinstance(handle, ops.EagerTensor):
+    return handle._handle_data  # pylint: disable=protected-access
+  else:
+    return get_resource_handle_data(handle)
+
+
+def _set_handle_shapes_and_types(tensor, handle_data, graph_mode):
+  """Sets the shape inference result HandleData on tensor.
+
+  Args:
+    tensor: A `Tensor` or `EagerTensor`.
+    handle_data: A `CppShapeInferenceResult.HandleData`.
+    graph_mode: A python bool.
+  """
+  tensor._handle_data = handle_data  # pylint: disable=protected-access
+  if not graph_mode:
+    return
+
+  # Not an EagerTensor, so a graph tensor.
+  shapes, types = zip(*[(pair.shape, pair.dtype)
+                        for pair in handle_data.shape_and_type])
+  ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+  shapes = [[d.size for d in s.dim]
+            if not s.unknown_rank else None for s in shapes]
+  pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+      tensor._op._graph._c_graph,  # pylint: disable=protected-access
+      tensor._as_tf_output(),  # pylint: disable=protected-access
+      shapes, ranks, types)
+
+
+def _combine_handle_data(handle, initial_value):
+  """Concats HandleData from tensors `handle` and `initial_value`.
+
+  Args:
+    handle: A `Tensor` of dtype `resource`.
+    initial_value: A `Tensor`.
+
+  Returns:
+    A `CppShapeInferenceResult.HandleData`.  If `initial_value` has dtype
+    `variant`, the `HandleData` contains the concatenation of the shape_and_type
+    from both `handle` and `initial_value`.
+
+  Raises:
+    RuntimeError: If handle, which was returned by VarHandleOp, either has
+      no handle data, or its len(handle_data.shape_and_type) != 1.
+  """
+  assert handle.dtype == dtypes.resource
+
+  variable_handle_data = get_eager_safe_handle_data(handle)
+
+  if initial_value.dtype != dtypes.variant:
+    return variable_handle_data
+
+  extra_handle_data = get_eager_safe_handle_data(initial_value)
+  if extra_handle_data is not None and extra_handle_data.is_set:
+    if (variable_handle_data is None
+        or not variable_handle_data.is_set
+        or len(variable_handle_data.shape_and_type) != 1):
+      raise RuntimeError(
+          "Expected VarHandleOp to return a length==1 shape_and_type, "
+          "but saw: '%s'" % (variable_handle_data,))
+    variable_handle_data.shape_and_type.extend(
+        extra_handle_data.shape_and_type)
+  return variable_handle_data
+
+
+def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode):
+  """Creates a variable handle with information to do shape inference.
+
+  The shape and dtype are read from `initial_value` and stored in the returned
+  resource tensor's handle data.
+
+  If `initial_value.dtype == tf.variant`, we additionally extract the handle
+  data (if any) from `initial_value` and append it to the `handle_data`.
+  In this case, the returned tensor's handle data is in the form
+
+  ```
+  is_set: true
+  shape_and_type {
+    shape {
+      // initial_value.shape
+    }
+    dtype: DT_VARIANT
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[0]
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[1]
+  }
+  ...
+  ```
+
+  Ops that read from this tensor, such as `ReadVariableOp` and
+  `AssignVariableOp`, know that `handle_data(handle).shape_and_type[1:]`
+  correspond to the handle data of the variant(s) stored in the Variable.
+
+  Args:
+    initial_value: A `Tensor`.
+    shared_name: A string.
+    name: A string.
+    graph_mode: A python bool.
+
+  Returns:
+    The handle, a `Tensor` of type `resource`.
+  """
+  shape = initial_value.get_shape()
+  dtype = initial_value.dtype.base_dtype
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
   if container is None:
     container = ""
@@ -67,35 +177,38 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
                                                    shared_name=shared_name,
                                                    name=name,
                                                    container=container)
+
   if graph_mode:
-    handle._handle_data = get_resource_handle_data(handle)  # pylint: disable=protected-access
+    full_handle_data = _combine_handle_data(handle, initial_value)
+    _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+    return handle
+  else:
+    # We do not want two distinct ResourceVariable objects for the same
+    # underlying resource in the runtime.
+    # When in eager mode, explicitly ensure so here. When in graph mode, it's
+    # ensured by always generating different variable names.
+    exists = gen_resource_variable_ops.var_is_initialized_op(handle)
+    if exists:
+      raise ValueError("variable object with name '%s' already created. Use "
+                       "get_variable() if reuse is desired." %
+                       shared_name)
+    with context.graph_mode(), ops.Graph().as_default() as graph:
+      h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
+                                                  shared_name=shared_name,
+                                                  name=name,
+                                                  container=container)
+
+      # Tensor._handle_data contains information for the shape-inference code to
+      # know the shape and dtype of the variable pointed to by a handle. Since
+      # shape inference doesn't run in eager mode we copy this data here for
+      # when the handle is captured by an eager mode function.
+      # pylint: disable=protected-access
+      full_handle_data = _combine_handle_data(h, initial_value)
+      _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+      # pylint: enable=protected-access
+    # Clean up op->graph->op reference cycles.
+    ops.dismantle_graph(graph)
     return handle
-
-  # We do not want two distinct ResourceVariable objects for the same
-  # underlying resource in the runtime.
-  # When in eager mode, explicitly ensure so here. When in graph mode, it's
-  # ensured by always generating different variable names.
-  exists = gen_resource_variable_ops.var_is_initialized_op(handle)
-  if exists:
-    raise ValueError("variable object with name '%s' already created. Use "
-                     "get_variable() if reuse is desired." %
-                     shared_name)
-  with context.graph_mode(), ops.Graph().as_default() as graph:
-    h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
-                                                shared_name=shared_name,
-                                                name=name,
-                                                container=container)
-
-    # Tensor._handle_data contains information for the shape-inference code to
-    # know the shape and dtype of the variable pointed to by a handle. Since
-    # shape inference doesn't run in eager mode we copy this data here for when
-    # the handle is captured by an eager mode function.
-    # pylint: disable=protected-access
-    handle._handle_data = get_resource_handle_data(h)
-    # pylint: enable=protected-access
-  # Clean up op->graph->op reference cycles.
-  ops.dismantle_graph(graph)
-  return handle
 
 
 @contextlib.contextmanager
@@ -162,6 +275,18 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
                                                       name=name)
 
 
+def _maybe_set_handle_data(dtype, handle, tensor):
+  if dtype == dtypes.variant:
+    # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+    # variant's handle data.  Extract it.
+    handle_data = get_eager_safe_handle_data(handle)
+    if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+      tensor._handle_data = (  # pylint: disable=protected-access
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+              is_set=True,
+              shape_and_type=handle_data.shape_and_type[1:]))
+
+
 class ResourceVariable(variables.VariableV1):
   """Variable based on resource handles.
 
@@ -380,8 +505,8 @@ class ResourceVariable(variables.VariableV1):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -420,8 +545,7 @@ class ResourceVariable(variables.VariableV1):
                 initial_value() if init_from_fn else initial_value,
                 name="initial_value", dtype=dtype)
           self._handle = eager_safe_variable_handle(
-              shape=initial_value.get_shape(),
-              dtype=initial_value.dtype.base_dtype,
+              initial_value=initial_value,
               shared_name=shared_name,
               name=name,
               graph_mode=self._in_graph_mode)
@@ -732,6 +856,8 @@ class ResourceVariable(variables.VariableV1):
       tape.variable_accessed(self)
     result = gen_resource_variable_ops.read_variable_op(self._handle,
                                                         self._dtype)
+    _maybe_set_handle_data(self._dtype, self._handle, result)
+
     if not context.executing_eagerly():
       # Note that if a control flow context is active the input of the read op
       # might not actually be the handle. This line bypasses it.
@@ -763,6 +889,17 @@ class ResourceVariable(variables.VariableV1):
         tape.variable_accessed(self)
       value = gen_resource_variable_ops.resource_gather(
           self._handle, indices, dtype=self._dtype, name=name)
+
+      if self._dtype == dtypes.variant:
+        # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+        # variant's handle data.  Extract it.
+        handle_data = get_eager_safe_handle_data(self._handle)
+        if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+          value._handle_data = (  # pylint: disable=protected-access
+              cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+                  is_set=True,
+                  shape_and_type=handle_data.shape_and_type[1:]))
+
     return array_ops.identity(value)
 
   def to_proto(self, export_scope=None):
@@ -1337,8 +1474,11 @@ class _UnreadVariable(ResourceVariable):
 
   def _read_variable_op(self):
     with ops.control_dependencies([self._parent_op]):
-      return gen_resource_variable_ops.read_variable_op(self._handle,
-                                                        self._dtype)
+      result = gen_resource_variable_ops.read_variable_op(self._handle,
+                                                          self._dtype)
+      _maybe_set_handle_data(self._dtype, self._handle, result)
+      return result
+
 
   @property
   def op(self):
@@ -1349,107 +1489,6 @@ class _UnreadVariable(ResourceVariable):
 ops.register_dense_tensor_like_type(_UnreadVariable)
 
 
-class _MixedPrecisionVariable(ResourceVariable):
-  """Represents a variable that can return in desired dtype when read.
-
-  In mixed precision training, it is usually desirable to use different dtypes
-  for variables and computation. This class will be used to wrap created
-  ResourceVariable when mixed precision training is enabled. It allows layers to
-  perform computation in a different dtype than their variable dtypes, in order
-  to achieve higher performance without causing quality loss.
-  """
-
-  def __init__(self, var, read_dtype):
-    """Creates a MixedPrecisionVariable.
-
-    Args:
-      var: A ResourceVariable instance.
-      read_dtype: A tf.DType, the returned dtype when read, default to None.
-        Casting is performed if read_dtype is not None and differs from
-        var.dtype.
-    Returns:
-      An MixedPrecisionVariable instance.
-    Raises:
-      ValueError: if var is not a ResourceVariable instance, or read_dtype is
-        not a tf.DType instance.
-    """
-    # pylint: disable=super-init-not-called
-    # We do not call super init on purpose.
-    if not isinstance(var, ResourceVariable):
-      raise ValueError("InvalidArgument: var must be a ResourceVariable type.")
-    if not isinstance(read_dtype, dtypes.DType):
-      raise ValueError("InvalidArgument: read_dtype must be a tf.DType type.")
-
-    self._var = var
-    self._trainable = var.trainable
-    self._save_slice_info = None
-    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-    self._in_graph_mode = var._in_graph_mode  # pylint: disable=protected-access
-    self._handle = var.handle
-    self._shape = var.shape
-    self._initial_value = None
-    if isinstance(self.handle, ops.EagerTensor):
-      self._handle_name = ""
-    else:
-      self._handle_name = self.handle.name
-    self._unique_id = var._unique_id  # pylint: disable=protected-access
-    self._dtype = var.dtype
-    self._constraint = None
-    self._cached_value = None
-    self._is_initialized_op = var._is_initialized_op  # pylint: disable=protected-access
-    self._initializer_op = var._initializer_op  # pylint: disable=protected-access
-    # This needs to be set before read_value() is called.
-    self._read_dtype = read_dtype
-    if context.executing_eagerly():
-      self._graph_element = None
-    else:
-      self._graph_element = self.read_value()
-    self._handle_deleter = (
-        var._handle_deleter if not self._in_graph_mode  # pylint: disable=protected-access
-        else None)
-    # pylint: enable=super-init-not-called
-
-  @property
-  def name(self):
-    return self._var.name
-
-  def value(self):
-    return self._read_variable_op()
-
-  def read_value(self):
-    return self._read_variable_op()
-
-  def _read_variable_op(self):
-    with ops.colocate_with(self._handle):
-      res = gen_resource_variable_ops.read_variable_op(self._handle,
-                                                       self._dtype)
-      if self._read_dtype != self._dtype:
-        return math_ops.cast(res, self._read_dtype)
-      else:
-        return res
-
-  @property
-  def op(self):
-    """The op for this variable."""
-    return self._var.op
-
-  @property
-  def read_dtype(self):
-    """The dtype of the returned tensor when reading the var."""
-    return self._read_dtype
-
-  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
-    del name
-    if (dtype is not None and
-        not dtype.is_compatible_with(self.read_dtype) or as_ref):
-      return NotImplemented
-    return self.value()
-
-  def _should_act_as_resource_variable(self):
-    """To pass resource_variable_ops.is_resource_variable check."""
-    pass
-
-
 @ops.RegisterGradient("ReadVariableOp")
 def _ReadGrad(_, grad):
   """Gradient for read op."""
@@ -1543,7 +1582,7 @@ def copy_to_graph_uninitialized(var):
       constraint=var._constraint,
       dtype=var.dtype,
       name=var._shared_name)
-  new_variable._maybe_initialize_checkpointable()
+  new_variable._maybe_initialize_trackable()
   # pylint: enable=protected-access
   return new_variable
 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index ec48cab91d172c54b2f927d946312f086e808c9c..bd5abd53db982d9d04ac883191c55bff9ef67cbf 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -623,7 +623,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
 
     parallel_iterations = parallel_iterations or 32
     if sequence_length is not None:
-      sequence_length = math_ops.to_int32(sequence_length)
+      sequence_length = math_ops.cast(sequence_length, dtypes.int32)
       if sequence_length.get_shape().rank not in (None, 1):
         raise ValueError(
             "sequence_length must be a vector of length batch_size, "
@@ -1367,7 +1367,7 @@ def static_rnn(cell,
       zero_output = nest.pack_sequence_as(
           structure=output_size, flat_sequence=flat_zero_output)
 
-      sequence_length = math_ops.to_int32(sequence_length)
+      sequence_length = math_ops.cast(sequence_length, dtypes.int32)
       min_sequence_length = math_ops.reduce_min(sequence_length)
       max_sequence_length = math_ops.reduce_max(sequence_length)
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 40c3771f4e3bd961bd9728855be319fd3df817c5..9d9511a73e879846954aa1587ad0b1d7969657d1 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras import layers as keras_layer
 from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.layers import base as base_layer
@@ -50,7 +51,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -180,7 +181,7 @@ def _zero_state_tensors(state_size, batch_size, dtype):
   return nest.map_structure(get_state_shape, state_size)
 
 
-@tf_export("nn.rnn_cell.RNNCell")
+@tf_export(v1=["nn.rnn_cell.RNNCell"])
 class RNNCell(base_layer.Layer):
   """Abstract object representing an RNN cell.
 
@@ -207,7 +208,9 @@ class RNNCell(base_layer.Layer):
     super(RNNCell, self).__init__(
         trainable=trainable, name=name, dtype=dtype, **kwargs)
     # Attribute that indicates whether the cell is a TF RNN cell, due the slight
-    # difference between TF and Keras RNN cell.
+    # difference between TF and Keras RNN cell. Notably the state is not wrapped
+    # in a list for TF cell where they are single tensor state, whereas keras
+    # cell will wrap the state into a list, and call() will have to unwrap them.
     self._is_tf_rnn_cell = True
 
   def __call__(self, inputs, state, scope=None):
@@ -606,7 +609,7 @@ class GRUCell(LayerRNNCell):
 _LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h"))
 
 
-@tf_export("nn.rnn_cell.LSTMStateTuple")
+@tf_export(v1=["nn.rnn_cell.LSTMStateTuple"])
 class LSTMStateTuple(_LSTMStateTuple):
   """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state.
 
@@ -1085,8 +1088,106 @@ def _default_dropout_state_filter_visitor(substate):
   return True
 
 
-@tf_export(v1=["nn.rnn_cell.DropoutWrapper"])
-class DropoutWrapper(RNNCell):
+class _RNNCellWrapperV1(RNNCell):
+  """Base class for cells wrappers V1 compatibility.
+
+  This class along with `_RNNCellWrapperV2` allows to define cells wrappers that
+  are compatible with V1 and V2, and defines helper methods for this purpose.
+  """
+
+  def __init__(self, cell):
+    super(_RNNCellWrapperV1, self).__init__()
+    self.cell = cell
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self.cell, name="cell")
+
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
+    """Calls the wrapped cell and performs the wrapping logic.
+
+    This method is called from the wrapper's `call` or `__call__` methods.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
+        `__call__` or 'call' method).
+      **kwargs: Additional arguments.
+
+    Returns:
+      A pair containing:
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    raise NotImplementedError
+
+  def __call__(self, inputs, state, scope=None):
+    """Runs the RNN cell step computation.
+
+    We assume that the wrapped RNNCell is being built within its `__call__`
+    method. We directly use the wrapped cell's `__call__` in the overridden
+    wrapper `__call__` method.
+
+    This allows to use the wrapped cell and the non-wrapped cell equivalently
+    when using `__call__`.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      scope: VariableScope for the subgraph created in the wrapped cells'
+        `__call__`.
+
+    Returns:
+      A pair containing:
+
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    return self._call_wrapped_cell(
+        inputs, state, cell_call_fn=self.cell.__call__, scope=scope)
+
+
+class _RNNCellWrapperV2(keras_layer.AbstractRNNCell):
+  """Base class for cells wrappers V2 compatibility.
+
+  This class along with `_RNNCellWrapperV1` allows to define cells wrappers that
+  are compatible with V1 and V2, and defines helper methods for this purpose.
+  """
+
+  def __init__(self, cell, *args, **kwargs):
+    super(_RNNCellWrapperV2, self).__init__(*args, **kwargs)
+    self.cell = cell
+
+  def call(self, inputs, state, **kwargs):
+    """Runs the RNN cell step computation.
+
+    When `call` is being used, we assume that the wrapper object has been built,
+    and therefore the wrapped cells has been built via its `build` method and
+    its `call` method can be used directly.
+
+    This allows to use the wrapped cell and the non-wrapped cell equivalently
+    when using `call` and `build`.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      **kwargs: Additional arguments passed to the wrapped cell's `call`.
+
+    Returns:
+      A pair containing:
+
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    return self._call_wrapped_cell(
+        inputs, state, cell_call_fn=self.cell.call, **kwargs)
+
+  def build(self, inputs_shape):
+    """Builds the wrapped cell."""
+    self.cell.build(inputs_shape)
+    self.built = True
+
+
+class DropoutWrapperBase(object):
   """Operator adding dropout to inputs and outputs of the given cell."""
 
   def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
@@ -1156,7 +1257,7 @@ class DropoutWrapper(RNNCell):
         but not `callable`.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
-    super(DropoutWrapper, self).__init__()
+    super(DropoutWrapperBase, self).__init__(cell)
     assert_like_rnncell("cell", cell)
 
     if (dropout_state_filter_visitor is not None
@@ -1181,10 +1282,7 @@ class DropoutWrapper(RNNCell):
         else:
           setattr(self, "_%s" % attr, tensor_prob)
 
-    # Set cell, variational_recurrent, seed before running the code below
-    self._cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self._cell, name="cell")
+    # Set variational_recurrent, seed before running the code below
     self._variational_recurrent = variational_recurrent
     self._seed = seed
 
@@ -1236,19 +1334,19 @@ class DropoutWrapper(RNNCell):
 
   @property
   def wrapped_cell(self):
-    return self._cell
+    return self.cell
 
   @property
   def state_size(self):
-    return self._cell.state_size
+    return self.cell.state_size
 
   @property
   def output_size(self):
-    return self._cell.output_size
+    return self.cell.output_size
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
-      return self._cell.zero_state(batch_size, dtype)
+      return self.cell.zero_state(batch_size, dtype)
 
   def _variational_recurrent_dropout_value(
       self, index, value, noise, keep_prob):
@@ -1291,16 +1389,13 @@ class DropoutWrapper(RNNCell):
           shallow_filtered_substructure, dropout,
           *[shallow_filtered_substructure, values, recurrent_noise])
 
-  def _call(self, inputs, state, call_fn, **kwargs):
-    """Defines a helper method that runs the wrapped cell and applies dropout.
-
-    This helper is called from the DropoutWrapper's `call` or `__call__`
-    methods.
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
+    """Runs the wrapped cell and applies dropout.
 
     Args:
       inputs: A tensor with wrapped cell's input.
       state: A tensor or tuple of tensors with wrapped cell's state.
-      call_fn: Wrapped cell's method to use for step computation (cell's
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
         `__call__` or 'call' method).
       **kwargs: Additional arguments.
 
@@ -1317,7 +1412,7 @@ class DropoutWrapper(RNNCell):
       inputs = self._dropout(inputs, "input",
                              self._recurrent_input_noise,
                              self._input_keep_prob)
-    output, new_state = call_fn(inputs, state, **kwargs)
+    output, new_state = cell_call_fn(inputs, state, **kwargs)
     if _should_dropout(self._state_keep_prob):
       # Identify which subsets of the state to perform dropout on and
       # which ones to keep.
@@ -1333,83 +1428,28 @@ class DropoutWrapper(RNNCell):
                              self._output_keep_prob)
     return output, new_state
 
-  def __call__(self, inputs, state, scope=None):
-    """Runs the cell with the declared dropouts.
-
-    We assume that the wrapped RNNCell is being built within its `__call__`
-    method. We directly use the wrapped cell's `__call__` in the overridden
-    DropoutWrapper `__call__` method.
-
-    This should allow to use the wrapped cell and the non-wrapped cell
-    equivalently when using `__call__`.
-
-    Args:
-      inputs: A tensor with wrapped cell's input.
-      state: A tensor or tuple of tensors with wrapped cell's state.
-      scope: VariableScope for the subgraph created in the wrapped cells'
-        `__call__`.
-
-    Returns:
-      A pair containing:
-
-      - Output: A tensor with cell's output.
-      - New state: A tensor or tuple of tensors with new wrapped cell's state.
-    """
-    return self._call(inputs, state, call_fn=self._cell.__call__, scope=scope)
 
-
-@tf_export("rnn.DropoutWrapper", v1=[])
-class DropoutWrapperV2(LayerRNNCell, DropoutWrapper):
+@tf_export(v1=["nn.rnn_cell.DropoutWrapper"])
+class DropoutWrapper(DropoutWrapperBase, _RNNCellWrapperV1):
   """Operator adding dropout to inputs and outputs of the given cell."""
 
-  def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
-               state_keep_prob=1.0, variational_recurrent=False,
-               input_size=None, dtype=None, seed=None,
-               dropout_state_filter_visitor=None):
-    """Runs init in Keras style scope to use Keras-style variable management."""
-
-    with base_layer.keras_style_scope():
-      super(DropoutWrapperV2, self).__init__(
-          cell=cell,
-          input_keep_prob=input_keep_prob,
-          output_keep_prob=output_keep_prob,
-          state_keep_prob=state_keep_prob,
-          variational_recurrent=variational_recurrent,
-          input_size=input_size,
-          dtype=dtype,
-          seed=seed,
-          dropout_state_filter_visitor=dropout_state_filter_visitor)
-
-  def build(self, inputs_shape):
-    self._cell.build(inputs_shape)
-    self.built = True
-
-  def call(self, inputs, state, **kwargs):
-    """Runs the cell with the declared dropouts.
+  def __init__(self, *args, **kwargs):
+    super(DropoutWrapper, self).__init__(*args, **kwargs)
 
-    When `call` is being used, we assume that the DropoutWrapper object has
-    been built and therefore the wrapped cells has been built via its `build`
-    method and its `call` method can be used directly.
+  __init__.__doc__ = DropoutWrapperBase.__init__.__doc__
 
-    This should allow to use the wrapped cell and the non-wrapped cell
-    equivalently when using `call` and `build`.
 
-    Args:
-      inputs: A tensor with wrapped cell's input.
-      state: A tensor or tuple of tensors with wrapped cell's state.
-      **kwargs: Additional arguments passed to the wrapped cell's `call`.
+@tf_export("nn.RNNCellDropoutWrapper", v1=[])
+class DropoutWrapperV2(DropoutWrapperBase, _RNNCellWrapperV2):
+  """Operator adding dropout to inputs and outputs of the given cell."""
 
-    Returns:
-      A pair containing:
+  def __init__(self, *args, **kwargs):
+    super(DropoutWrapperV2, self).__init__(*args, **kwargs)
 
-      - Output: A tensor with cell's output.
-      - New state: A tensor or tuple of tensors with new wrapped cell's state.
-    """
-    return self._call(inputs, state, call_fn=self._cell.call, **kwargs)
+  __init__.__doc__ = DropoutWrapperBase.__init__.__doc__
 
 
-@tf_export("nn.rnn_cell.ResidualWrapper")
-class ResidualWrapper(RNNCell):
+class ResidualWrapperBase(object):
   """RNNCell wrapper that ensures cell inputs are added to the outputs."""
 
   def __init__(self, cell, residual_fn=None):
@@ -1422,31 +1462,30 @@ class ResidualWrapper(RNNCell):
         Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
         and outputs.
     """
-    super(ResidualWrapper, self).__init__()
-    self._cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self._cell, name="cell")
+    super(ResidualWrapperBase, self).__init__(cell)
     self._residual_fn = residual_fn
 
   @property
   def state_size(self):
-    return self._cell.state_size
+    return self.cell.state_size
 
   @property
   def output_size(self):
-    return self._cell.output_size
+    return self.cell.output_size
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
-      return self._cell.zero_state(batch_size, dtype)
+      return self.cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
     """Run the cell and then apply the residual_fn on its inputs to its outputs.
 
     Args:
       inputs: cell inputs.
       state: cell state.
-      scope: optional cell scope.
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
+        `__call__` or 'call' method).
+      **kwargs: Additional arguments passed to the wrapped cell's `call`.
 
     Returns:
       Tuple of cell outputs and new state.
@@ -1455,7 +1494,7 @@ class ResidualWrapper(RNNCell):
       TypeError: If cell inputs and outputs have different structure (type).
       ValueError: If cell inputs and outputs have different structure (value).
     """
-    outputs, new_state = self._cell(inputs, state, scope=scope)
+    outputs, new_state = cell_call_fn(inputs, state, **kwargs)
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
@@ -1467,8 +1506,27 @@ class ResidualWrapper(RNNCell):
     return (res_outputs, new_state)
 
 
-@tf_export("nn.rnn_cell.DeviceWrapper")
-class DeviceWrapper(RNNCell):
+@tf_export(v1=["nn.rnn_cell.ResidualWrapper"])
+class ResidualWrapper(ResidualWrapperBase, _RNNCellWrapperV1):
+  """RNNCell wrapper that ensures cell inputs are added to the outputs."""
+
+  def __init__(self, *args, **kwargs):
+    super(ResidualWrapper, self).__init__(*args, **kwargs)
+
+  __init__.__doc__ = ResidualWrapperBase.__init__.__doc__
+
+
+@tf_export("nn.RNNCellResidualWrapper", v1=[])
+class ResidualWrapperV2(ResidualWrapperBase, _RNNCellWrapperV2):
+  """RNNCell wrapper that ensures cell inputs are added to the outputs."""
+
+  def __init__(self, *args, **kwargs):
+    super(ResidualWrapperV2, self).__init__(*args, **kwargs)
+
+  __init__.__doc__ = ResidualWrapperBase.__init__.__doc__
+
+
+class DeviceWrapperBase(object):
   """Operator that ensures an RNNCell runs on a particular device."""
 
   def __init__(self, cell, device):
@@ -1480,29 +1538,45 @@ class DeviceWrapper(RNNCell):
       cell: An instance of `RNNCell`.
       device: A device string or function, for passing to `tf.device`.
     """
-    super(DeviceWrapper, self).__init__()
-    self._cell = cell
-    if isinstance(cell, checkpointable.Checkpointable):
-      self._track_checkpointable(self._cell, name="cell")
+    super(DeviceWrapperBase, self).__init__(cell)
     self._device = device
 
   @property
   def state_size(self):
-    return self._cell.state_size
+    return self.cell.state_size
 
   @property
   def output_size(self):
-    return self._cell.output_size
+    return self.cell.output_size
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       with ops.device(self._device):
-        return self._cell.zero_state(batch_size, dtype)
+        return self.cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
     """Run the cell on specified device."""
     with ops.device(self._device):
-      return self._cell(inputs, state, scope=scope)
+      return cell_call_fn(inputs, state, **kwargs)
+
+
+@tf_export(v1=["nn.rnn_cell.DeviceWrapper"])
+class DeviceWrapper(DeviceWrapperBase, _RNNCellWrapperV1):
+
+  def __init__(self, *args, **kwargs):  # pylint: disable=useless-super-delegation
+    super(DeviceWrapper, self).__init__(*args, **kwargs)
+
+  __init__.__doc__ = DeviceWrapperBase.__init__.__doc__
+
+
+@tf_export("nn.RNNCellDeviceWrapper", v1=[])
+class DeviceWrapperV2(DeviceWrapperBase, _RNNCellWrapperV2):
+  """Operator that ensures an RNNCell runs on a particular device."""
+
+  def __init__(self, *args, **kwargs):  # pylint: disable=useless-super-delegation
+    super(DeviceWrapperV2, self).__init__(*args, **kwargs)
+
+  __init__.__doc__ = DeviceWrapperBase.__init__.__doc__
 
 
 @tf_export(v1=["nn.rnn_cell.MultiRNNCell"])
@@ -1549,11 +1623,11 @@ class MultiRNNCell(RNNCell):
 
     self._cells = cells
     for cell_number, cell in enumerate(self._cells):
-      # Add Checkpointable dependencies on these cells so their variables get
+      # Add Trackable dependencies on these cells so their variables get
       # saved with this object when using object-based saving.
-      if isinstance(cell, checkpointable.Checkpointable):
-        # TODO(allenl): Track down non-Checkpointable callers.
-        self._track_checkpointable(cell, name="cell-%d" % (cell_number,))
+      if isinstance(cell, trackable.Trackable):
+        # TODO(allenl): Track down non-Trackable callers.
+        self._track_trackable(cell, name="cell-%d" % (cell_number,))
     self._state_is_tuple = state_is_tuple
     if not state_is_tuple:
       if any(nest.is_sequence(c.state_size) for c in self._cells):
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index a5b31aff91660a6ac79c980dffb543e87fd40dfa..63b5eab56a3a6dc434ef03c3477945a9860bef65 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -391,12 +391,16 @@ def eager_py_func(func, inp, Tout, name=None):
 
 @deprecation.deprecated(
     date=None,
-    instructions="""tf.py_func is deprecated in TF V2. Instead, use
-    tf.py_function, which takes a python function which manipulates tf eager
+    instructions="""tf.py_func is deprecated in TF V2. Instead, there are two
+    options available in V2.
+    - tf.py_function takes a python function which manipulates tf eager
     tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
     an ndarray (just call tensor.numpy()) but having access to eager tensors
     means `tf.py_function`s can use accelerators such as GPUs as well as
     being differentiable using a gradient tape.
+    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
+    (it is not differentiable, and manipulates numpy arrays). It drops the
+    stateful argument making all functions stateful.
     """)
 @tf_export(v1=["py_func"])
 def py_func(func, inp, Tout, stateful=True, name=None):
@@ -467,6 +471,13 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
+@tf_export("numpy_function", v1=[])
+def numpy_function(func, inp, Tout, name=None):
+  return py_func(func, inp, Tout, stateful=True, name=name)
+
+numpy_function.__doc__ = py_func.__doc__.replace(
+    "py_func", "numpy_function")
+
 
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py
index d042c95c049538354836ef83f0b21d8babccedc8..0d080c5977ce2741f9243abec2a6e7efd513ad6e 100644
--- a/tensorflow/python/ops/signal/dct_ops.py
+++ b/tensorflow/python/ops/signal/dct_ops.py
@@ -94,7 +94,7 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disabl
 
     axis_dim = (tensor_shape.dimension_value(input.shape[-1])
                 or _array_ops.shape(input)[-1])
-    axis_dim_float = _math_ops.to_float(axis_dim)
+    axis_dim_float = _math_ops.cast(axis_dim, _dtypes.float32)
 
     if type == 1:
       dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1)
diff --git a/tensorflow/python/ops/signal/fft_ops.py b/tensorflow/python/ops/signal/fft_ops.py
index 2d14b2bbd75864b6477bccc5cef562b617674c08..0cc29a343d2e78c217ad0f367938e1c70ca895df 100644
--- a/tensorflow/python/ops/signal/fft_ops.py
+++ b/tensorflow/python/ops/signal/fft_ops.py
@@ -286,7 +286,8 @@ def _rfft_grad_helper(rank, irfft_fn):
     # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
     # factor, plus some additional terms to make up for the components dropped
     # due to Hermitian symmetry.
-    input_size = _math_ops.to_float(_fft_size_for_grad(op.inputs[0], rank))
+    input_size = _math_ops.cast(
+        _fft_size_for_grad(op.inputs[0], rank), _dtypes.float32)
     the_irfft = irfft_fn(grad, fft_length)
     return 0.5 * (the_irfft * input_size + _math_ops.real(extra_terms)), None
 
@@ -311,8 +312,8 @@ def _irfft_grad_helper(rank, rfft_fn):
         [[1.0], 2.0 * _array_ops.ones([input_last_dimension - 2 + is_odd]),
          _array_ops.ones([1 - is_odd])], 0)
 
-    rsize = _math_ops.reciprocal(_math_ops.to_float(
-        _fft_size_for_grad(grad, rank)))
+    rsize = _math_ops.reciprocal(_math_ops.cast(
+        _fft_size_for_grad(grad, rank), _dtypes.float32))
 
     # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
     # factor and a mask. The mask scales the gradient for the Hermitian
diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py
index 601409dea901f34cca02861971850c3238378163..675d60ee94df98fdb813b77a992f10952fa82f24 100644
--- a/tensorflow/python/ops/signal/mfcc_ops.py
+++ b/tensorflow/python/ops/signal/mfcc_ops.py
@@ -107,4 +107,5 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
     dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
-    return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
+    return dct2 * math_ops.rsqrt(
+        math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py
index f029e0a8b59777b50e38ab4d8f801e811467c561..ba1709b4cfc1d383cef98304c7d300920f50bdae 100644
--- a/tensorflow/python/ops/signal/spectral_ops.py
+++ b/tensorflow/python/ops/signal/spectral_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -282,6 +283,8 @@ def _enclosing_power_of_two(value):
     return constant_op.constant(
         int(2**np.ceil(np.log(value_static) / np.log(2.0))), value.dtype)
   return math_ops.cast(
-      math_ops.pow(2.0, math_ops.ceil(
-          math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
-      value.dtype)
+      math_ops.pow(
+          2.0,
+          math_ops.ceil(
+              math_ops.log(math_ops.cast(value, dtypes.float32)) /
+              math_ops.log(2.0))), value.dtype)
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index bef0a8ea4eda3bc3a7d79b275fccf7fbfb1fc3af..222ab347e06a2b4fc803cfa35ca6bab3fc8b0b93 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -30,7 +31,6 @@ from tensorflow.python.ops import sparse_ops
 # latent bugs here.
 ops.NotDifferentiable("SparseAddGrad")
 ops.NotDifferentiable("SparseConcat")
-ops.NotDifferentiable("SparseToDense")
 
 
 @ops.RegisterGradient("SparseReorder")
@@ -110,7 +110,7 @@ def _SparseReduceSumGrad(op, out_grad):
   sp_shape = op.inputs[2]
   output_shape_kept_dims = math_ops.reduced_shape(sp_shape, op.inputs[3])
   out_grad_reshaped = array_ops.reshape(out_grad, output_shape_kept_dims)
-  scale = sp_shape // math_ops.to_int64(output_shape_kept_dims)
+  scale = sp_shape // math_ops.cast(output_shape_kept_dims, dtypes.int64)
   # (sparse_indices, sparse_values, sparse_shape, reduction_axes)
   return (None, array_ops.gather_nd(out_grad_reshaped, sp_indices // scale),
           None, None)
@@ -213,7 +213,7 @@ def _SparseDenseCwiseMulOrDivGrad(op, grad, is_mul):
   x_shape = op.inputs[2]
   y = op.inputs[3]
 
-  y_shape = math_ops.to_int64(array_ops.shape(y))
+  y_shape = math_ops.cast(array_ops.shape(y), dtypes.int64)
   num_added_dims = array_ops.expand_dims(
       array_ops.size(x_shape) - array_ops.size(y_shape), 0)
   augmented_y_shape = array_ops.concat(
@@ -310,3 +310,16 @@ def _SparseFillEmptyRowsGrad(op, unused_grad_output_indices, output_grad_values,
 
   # d_indices, d_values, d_dense_shape, d_default_value.
   return [None, d_values, None, d_default_value]
+
+
+@ops.RegisterGradient("SparseToDense")
+def _SparseToDenseGrad(op, grad):
+  sparse_indices, output_shape, _, _ = op.inputs
+
+  sparse_values_grad = array_ops.gather_nd(grad, sparse_indices)
+  default_value_grad = math_ops.reduce_sum(grad) - math_ops.reduce_sum(
+      sparse_values_grad)
+  return [
+      array_ops.zeros_like(sparse_indices),
+      array_ops.zeros_like(output_shape), sparse_values_grad, default_value_grad
+  ]
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index a149d9873016e52164d072ee4cabd98167bfa3dd..8b7de428b1a8fe18cd00f165917b144b224b797d 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -197,7 +197,8 @@ def sparse_concat(axis,
                   sp_inputs,
                   name=None,
                   expand_nonconcat_dim=False,
-                  concat_dim=None):
+                  concat_dim=None,
+                  expand_nonconcat_dims=None):
   """Concatenates a list of `SparseTensor` along the specified dimension.
 
   Concatenation is with respect to the dense versions of each sparse input.
@@ -286,6 +287,7 @@ def sparse_concat(axis,
     expand_nonconcat_dim: Whether to allow the expansion in the non-concat
       dimensions. Defaulted to False.
     concat_dim: The old (deprecated) name for axis.
+    expand_nonconcat_dims: alias for expand_nonconcat_dim
 
   Returns:
     A `SparseTensor` with the concatenated output.
@@ -293,6 +295,11 @@ def sparse_concat(axis,
   Raises:
     TypeError: If `sp_inputs` is not a list of `SparseTensor`.
   """
+  expand_nonconcat_dim = deprecation.deprecated_argument_lookup(
+      "expand_nonconcat_dims", expand_nonconcat_dims,
+      "expand_nonconcat_dim", expand_nonconcat_dim)
+  if expand_nonconcat_dims is not None:
+    expand_nonconcat_dim = expand_nonconcat_dims
   axis = deprecation.deprecated_argument_lookup("axis", axis, "concat_dim",
                                                 concat_dim)
   return sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim, name)
@@ -590,11 +597,11 @@ def _sparse_cross_internal(inputs,
   internal_type = dtypes.string
   for i in range(len(values)):
     if values[i].dtype != dtypes.string:
-      values[i] = math_ops.to_int64(values[i])
+      values[i] = math_ops.cast(values[i], dtypes.int64)
       internal_type = dtypes.int64
   for i in range(len(dense_inputs)):
     if dense_inputs[i].dtype != dtypes.string:
-      dense_inputs[i] = math_ops.to_int64(dense_inputs[i])
+      dense_inputs[i] = math_ops.cast(dense_inputs[i], dtypes.int64)
       internal_type = dtypes.int64
 
   indices_out, values_out, shape_out = gen_sparse_ops.sparse_cross(
diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py
index 031069a0f017c5d7e80999d2aa6a3e5fd2cf10e6..8fa0e5896874970b479e8a5ec9fc2a9c195dfdb8 100644
--- a/tensorflow/python/ops/sparse_ops_test.py
+++ b/tensorflow/python/ops/sparse_ops_test.py
@@ -25,7 +25,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+# Need array_grad to register gradient for Identity.
+from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import gradient_checker_v2 as gradient_checker
 from tensorflow.python.ops import math_ops
+# Need sparse_grad to register gradient for SparseToDense.
+from tensorflow.python.ops import sparse_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import googletest
 
@@ -96,6 +101,21 @@ class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(result_value.values, expected)
     self.assertAllEqual(result_value.dense_shape, st.dense_shape)
 
+  def testSparseToDenseGradient(self):
+
+    def f(sparse_values, default_value):
+      st = sparse_tensor.SparseTensor(
+          indices=[[0, 3, 6], [1, 4, 7], [2, 5, 8]],
+          values=sparse_values,
+          dense_shape=[3, 6, 9])
+      return sparse_ops.sparse_tensor_to_dense(st, default_value)
+
+    grads = gradient_checker.compute_gradient(
+        f, [constant_op.constant([1.0, 2.0, 3.0]),
+            constant_op.constant(0.0)])
+    epsilon = 1e-4
+    self.assertLess(gradient_checker.max_error(*grads), epsilon)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 21f4996798eda29c8c9090c12b096d888c0b12d8..34242872fa9107bd60e49f3ebc99ff28c894e33e 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -172,6 +172,9 @@ def einsum(equation, *inputs, **kwargs):
   # Transpose
   >>> einsum('ij->ji', m)  # output[j,i] = m[i,j]
 
+  # Trace
+  >>> einsum('ii', m)  # output[j,i] = trace(m) = sum_i m[i, i]
+
   # Batch matrix multiplication
   >>> einsum('aij,ajk->aik', s, t)  # out[a,i,k] = sum_j s[a,i,j] * t[a, j, k]
   ```
@@ -180,7 +183,7 @@ def einsum(equation, *inputs, **kwargs):
 
   * Ellipses (subscripts like `ij...,jk...->ik...`)
   * Subscripts where an axis appears more than once for a single input
-    (e.g. `ijj,k->ik`).
+    (e.g. `ijj,k->ik`) unless it is a trace (e.g. `ijji`).
 
   Args:
     equation: a `str` describing the contraction, in the same format as
@@ -217,7 +220,6 @@ def einsum(equation, *inputs, **kwargs):
 
     inputs = list(inputs)
     input_axis_labels = match.group(1).split(',')
-
     if len(inputs) != len(input_axis_labels):
       raise ValueError('Got %d arguments for equation "%s", expecting %d' %
                        (len(inputs), equation, len(input_axis_labels)))
@@ -235,14 +237,15 @@ def einsum(equation, *inputs, **kwargs):
 
       output_axis_labels = ''.join(
           sorted(ax for ax in indices if counts[ax] == 1))
-
     for a in axis_labels:
       for input_labels in input_axis_labels:
+        if (len(input_axis_labels) == 1 and input_labels.count(a) == 2 and
+            input_labels == input_labels[::-1] and '->' not in equation):
+          return math_ops.trace(inputs[0])
         if input_labels.count(a) > 1:
           raise ValueError(
               'Subscript not supported: an axis appears more than once: %s' %
               input_labels)
-
     for a in axis_labels:
       input_count = sum(1 for s in input_axis_labels if a in s)
       if input_count > 2 and a not in output_axis_labels:
@@ -261,6 +264,7 @@ def einsum(equation, *inputs, **kwargs):
           temp, temp_axis_labels, inputs[i + 1], input_axis_labels[i + 1],
           axes_to_sum)
 
+
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
       axis = [
@@ -270,7 +274,6 @@ def einsum(equation, *inputs, **kwargs):
       temp = math_ops.reduce_sum(temp, axis=axis)
       temp_axis_labels = ''.join(
           a for a in temp_axis_labels if a in output_axis_labels)
-
     if sorted(temp_axis_labels) != sorted(output_axis_labels):
       raise ValueError('Invalid equation: %s' % equation)
 
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 0224e7ebb8aa6d35d20ab43a303887d325e00441..26219db3d952616812bfb354f52858ed7c5d8ca4 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -119,7 +119,6 @@ class LBetaTest(test.TestCase):
           special_math_ops.lbeta(x).get_shape())
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.disable_xla('This test never passed for XLA')
   def test_length_1_last_dimension_results_in_one(self):
     # If there is only one coefficient, the formula still works, and we get one
     # as the answer, always.
@@ -127,7 +126,9 @@ class LBetaTest(test.TestCase):
     x_b = [0.1]
     with self.session(use_gpu=True):
       self.assertAllClose(
-          1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))))
+          1,
+          self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))),
+          rtol=3e-6)
       self.assertAllClose(
           1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_b))))
       self.assertEqual((), special_math_ops.lbeta(x_a).get_shape())
@@ -203,54 +204,18 @@ class BesselTest(test.TestCase):
 class EinsumTest(test.TestCase):
 
   simple_cases = [
-      'ij,jk->ik',
-      'ijk,jklm->il',
-      'ij,jk,kl->il',
-      'ijk->i',
-      'ijk->kji',
-      'ji,kj->ik',
-      'ikl,kji->kl',
-      'klj,lki->ij',
-      'ijk,ilj->kli',
-      'kij,mkb->ijmb',
-      'ijk,ijl,ikl->i',
-      'i,ijk,j->k',
-      'ij,ij,jk,kl->il',
-      'ij,kj,il,jm->ml',
-      'a,ab,abc->abc',
-      'a,b,ab->ab',
-      'ab,ab,c->',
-      'ab,ab,c->c',
-      'ab,ab,cd,cd->',
-      'ab,ab,cd,cd->ac',
-      'ab,ab,cd,cd->cd',
-      'ab,ab,cd,cd,ef,ef->',
-      'ab,cd,ef->abcdef',
-      'ab,cd,ef->acdf',
-      'ab,cd,de->abcde',
-      'ab,cd,de->be',
-      'ab,bcd,cd->abcd',
-      'ab,bcd,cd->abd',
-      'eb,cb,fb->cef',
-      'abcd,ad',
-      'bd,db,eac->ace',
-      'ba,ac,da->bcd',
-      'ab,ab',
-      'ab,ba',
-      'abc,abc',
-      'abc,bac',
-      'abc,cba',
-      'dba,ead,cad->bce',
-      'aef,fbc,dca->bde',
-      'iJ,Jk->ik',
-      'iJ,Ki->JK',
-      'iJk,Jklm->Jk',
-      'ij, jk, kl -> il',
-      'a, ab, abc -> abc',
-      'ab, ab, cd, cd, ef, ef -> ',
-      'abc, bac',
-      'iJ, Ki -> JK',
-      'iJk, Jklm -> Jk'
+      'ij,jk->ik', 'ijk,jklm->il', 'ij,jk,kl->il', 'ijk->i', 'ijk->kji',
+      'ji,kj->ik', 'ikl,kji->kl', 'klj,lki->ij', 'ijk,ilj->kli',
+      'kij,mkb->ijmb', 'ijk,ijl,ikl->i', 'i,ijk,j->k', 'ij,ij,jk,kl->il',
+      'ij,kj,il,jm->ml', 'a,ab,abc->abc', 'a,b,ab->ab', 'ab,ab,c->',
+      'ab,ab,c->c', 'ab,ab,cd,cd->', 'ab,ab,cd,cd->ac', 'ab,ab,cd,cd->cd',
+      'ab,ab,cd,cd,ef,ef->', 'ab,cd,ef->abcdef', 'ab,cd,ef->acdf',
+      'ab,cd,de->abcde', 'ab,cd,de->be', 'ab,bcd,cd->abcd', 'ab,bcd,cd->abd',
+      'eb,cb,fb->cef', 'abcd,ad', 'bd,db,eac->ace', 'ba,ac,da->bcd', 'ab,ab',
+      'ab,ba', 'abc,abc', 'abc,bac', 'abc,cba', 'dba,ead,cad->bce',
+      'aef,fbc,dca->bde', 'iJ,Jk->ik', 'iJ,Ki->JK', 'iJk,Jklm->Jk',
+      'ij, jk, kl -> il', 'a, ab, abc -> abc', 'ab, ab, cd, cd, ef, ef -> ',
+      'abc, bac', 'iJ, Ki -> JK', 'iJk, Jklm -> Jk', 'ii', 'ijji'
   ]
 
   long_cases = [
@@ -349,10 +314,13 @@ class EinsumTest(test.TestCase):
     with self.session(use_gpu=True):
       output_value = self.evaluate(output_tensor)
 
-    correct_value = np.einsum(axes, *input_vals)
-
+    correct_value = 0
+    if axes == 'ijji':
+      output = math_ops.trace(*input_tensors)
+      correct_value = self.evaluate(output)
+    else:
+      correct_value = np.einsum(axes, *input_vals)
     err = np.abs(correct_value - output_value).max()
-    # print(axes, err)
     self.assertLess(err, 1e-8)
 
   def test_input_is_placeholder(self):
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index ba3bd094923abf2929d9e64e9f9bdb7d60cf4c80..5e217d8ed2f3bbe427c144700e485d1be339545f 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -54,6 +54,7 @@ from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=rede
 # pylint: enable=redefined-builtin
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
+from tensorflow.python.ops.critical_section_ops import *
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
 from tensorflow.python.ops.gradients import *
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index be21263f4cbdbdd4a38b0e849e1fec15ba033712..10d41f363f8d0472e2eed921d6ef0fd84b68c21d 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -597,6 +597,220 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
+@tf_export(v1=["scatter_mul"])
+def scatter_mul(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Multiplies sparse updates into a variable reference.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] *= updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] *= updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the reset value.
+
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions multiply.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape =
+  []`.
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`,
+      `qint8`, `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`,
+      `uint32`, `uint64`. Should be from a `Variable` node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`. A
+      tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`. A tensor of updated
+      values to multiply to `ref`.
+    use_locking: An optional `bool`. Defaults to `False`. If True, the operation
+      will be protected by a lock; otherwise the behavior is undefined, but may
+      exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  return gen_state_ops.scatter_mul(
+      ref=ref,
+      indices=indices,
+      updates=updates,
+      use_locking=use_locking,
+      name=name)
+
+
+@tf_export(v1=["scatter_div"])
+def scatter_div(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Divides a variable reference by sparse updates.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] /= updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] /= updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the reset value.
+
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions divide.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape =
+  []`.
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`,
+      `qint8`, `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`,
+      `uint32`, `uint64`. Should be from a `Variable` node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`. A
+      tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`. A tensor of values
+      that `ref` is divided by.
+    use_locking: An optional `bool`. Defaults to `False`. If True, the operation
+      will be protected by a lock; otherwise the behavior is undefined, but may
+      exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  return gen_state_ops.scatter_div(
+      ref=ref,
+      indices=indices,
+      updates=updates,
+      use_locking=use_locking,
+      name=name)
+
+
+@tf_export(v1=["scatter_max"])
+def scatter_max(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Reduces sparse updates into a variable reference using the `max` operation.
+
+  This operation computes
+
+      # Scalar indices
+      ref[indices, ...] = max(ref[indices, ...], updates[...])
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...],
+      updates[i, ..., j, ...])
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the reset value.
+
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions combine.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape =
+  []`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png"
+  alt>
+  </div>
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `half`,
+      `bfloat16`, `float32`, `float64`, `int32`, `int64`. Should be from a
+      `Variable` node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`. A
+      tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`. A tensor of updated
+      values to reduce into `ref`.
+    use_locking: An optional `bool`. Defaults to `False`. If True, the update
+      will be protected by a lock; otherwise the behavior is undefined, but may
+      exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  return gen_state_ops.scatter_max(
+      ref=ref,
+      indices=indices,
+      updates=updates,
+      use_locking=use_locking,
+      name=name)
+
+
+@tf_export(v1=["scatter_min"])
+def scatter_min(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Reduces sparse updates into a variable reference using the `min` operation.
+
+  This operation computes
+
+      # Scalar indices
+      ref[indices, ...] = min(ref[indices, ...], updates[...])
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...],
+      updates[i, ..., j, ...])
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the reset value.
+
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions combine.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape =
+  []`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png"
+  alt>
+  </div>
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `half`,
+      `bfloat16`, `float32`, `float64`, `int32`, `int64`. Should be from a
+      `Variable` node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`. A
+      tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`. A tensor of updated
+      values to reduce into `ref`.
+    use_locking: An optional `bool`. Defaults to `False`. If True, the update
+      will be protected by a lock; otherwise the behavior is undefined, but may
+      exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  return gen_state_ops.scatter_min(
+      ref=ref,
+      indices=indices,
+      updates=updates,
+      use_locking=use_locking,
+      name=name)
+
+
 @tf_export(v1=["batch_scatter_update"])
 @deprecation.deprecated(
     "2018-11-29", "Use the batch_scatter_update method of Variable instead.")
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
index 155ad969f67073f151fbdc295570e85af5dc22b1..91625ff968e3301926bbef0f94807e584d00ef37 100644
--- a/tensorflow/python/ops/stateful_random_ops.py
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -27,7 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_stateful_random_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import \
+from tensorflow.python.training.tracking import \
 tracking
 from tensorflow.python.util.tf_export import tf_export
 
@@ -51,21 +51,31 @@ SEED_SIZE = 16  # in units of SEED_TYPE
 
 
 STATE_TYPE = SEED_TYPE
+ALGORITHM_TYPE = STATE_TYPE
 RNG_ALG_PHILOX = 1
+RNG_ALG_THREEFRY = 2
 DEFAULT_ALGORITHM = RNG_ALG_PHILOX
 
 
-def non_deterministic_seed():
-  """Makes a non-deterministic seed.
+PHILOX_STATE_SIZE = 3
+THREEFRY_STATE_SIZE = 2
+
+
+def non_deterministic_ints(shape, dtype=dtypes.int64):
+  """Non-deterministically generates some integers.
 
-  The implementation will be changed soon from pure Python to an op.
+  This op may use some OS-provided source of non-determinism (e.g. an RNG), so
+  each execution will give different results.
+
+  Args:
+    shape: the shape of the result.
+    dtype: (optional) the dtype of the result.
 
   Returns:
-    a 1-D tensor.
+    a tensor whose element values are non-deterministically chosen.
   """
-  return np.random.randint(
-      low=SEED_MIN, high=SEED_MAX + 1, size=SEED_SIZE,
-      dtype=SEED_TYPE)
+  return gen_stateful_random_ops.non_deterministic_ints(
+      shape=shape, dtype=dtype)
 
 
 def _uint_to_int(n):
@@ -74,23 +84,21 @@ def _uint_to_int(n):
   return n
 
 
-PHILOX_STATE_SIZE = 3
-
-
-def _make_philox_state(seed):
-  """Makes a RNG state for Philox algorithm.
+def _make_1d_state(state_size, seed):
+  """Makes a 1-D RNG state.
 
   Args:
+    state_size: an integer.
     seed: an integer or 1-D tensor.
 
   Returns:
-    a 1-D tensor.
+    a 1-D tensor of shape [state_size] and dtype STATE_TYPE.
   """
   int_types = (int,) if sys.version_info >= (3, 0) else (int, long)
   if isinstance(seed, int_types):
     # chop the Python integer (infinite precision) into chunks of SEED_TYPE
     ls = []
-    for _ in range(PHILOX_STATE_SIZE):
+    for _ in range(state_size):
       ls.append(seed & SEED_BIT_MASK)
       seed >>= SEED_TYPE_BITS
     seed = ls
@@ -100,45 +108,43 @@ def _make_philox_state(seed):
   if len(seed.shape) != 1:
     raise ValueError(
         "seed should only have one dimension; got shape: %s" % seed.shape)
-  seed = seed[0:PHILOX_STATE_SIZE]
+  seed = seed[0:state_size]
   # Padding with zeros on the right if too short
   seed_size = seed.shape[0]
-  if seed_size < PHILOX_STATE_SIZE:
+  if seed_size < state_size:
     seed = np.pad(
-        seed, [(0, PHILOX_STATE_SIZE - seed_size)],
+        seed, [(0, state_size - seed_size)],
         mode="constant",
         constant_values=0)
-  assert seed.shape == (PHILOX_STATE_SIZE,), "Wrong seed.shape: %s" % seed.shape
+  assert seed.shape == (state_size,), "Wrong seed.shape: %s" % seed.shape
   return seed
 
 
-def _make_state_from_seed(seed, algorithm):
-  if algorithm == RNG_ALG_PHILOX:
-    return _make_philox_state(seed)
+def _get_state_size(alg):
+  if alg == RNG_ALG_PHILOX:
+    return PHILOX_STATE_SIZE
+  elif alg == RNG_ALG_THREEFRY:
+    return THREEFRY_STATE_SIZE
   else:
-    raise ValueError("Unsupported algorithm id: %s" % algorithm)
+    raise ValueError("Unsupported algorithm id: %s" % alg)
+
 
+def _make_state_from_seed(seed, alg):
+  return _make_1d_state(_get_state_size(alg), seed)
 
-def create_rng_state(seed, algorithm=None):
+
+@tf_export("random.experimental.create_rng_state")
+def create_rng_state(seed, algorithm):
   """Creates a RNG state.
 
   Args:
     seed: an integer or 1-D tensor.
-    algorithm: (optional) an integer representing the RNG algorithm. If None, an
-      algorithm will be auto-selected.
+    algorithm: an integer representing the RNG algorithm.
 
   Returns:
-    a 1-D tensor "rng_state" with:
-    * rng_state[0] is a value that identifies the RNG algorithm;
-    * rng_state[1:] holds the RNG state itself (size dependent on the
-        algorithm).
+    a 1-D tensor whose size depends on the algorithm.
   """
-  if algorithm is None:
-    # TODO(wangpeng): more sophisticated algorithm selection
-    algorithm = DEFAULT_ALGORITHM
-  state = _make_state_from_seed(seed, algorithm)
-  return np.concatenate((np.array([algorithm], dtype=STATE_TYPE), state),
-                        axis=None)
+  return _make_state_from_seed(seed, algorithm)
 
 
 def _shape_tensor(shape):
@@ -151,26 +157,45 @@ def _shape_tensor(shape):
 
 
 @tf_export("random.experimental.Generator")
-class Generator(tracking.AutoCheckpointable):
+class Generator(tracking.AutoTrackable):
   """Random-number generator.
 
   It uses Variable to manage its internal state.
   """
 
   def __init__(self, copy_from=None, seed=None, algorithm=None):
+    """Creates a generator.
+
+    Args:
+      copy_from: (optional) a generator to be copied from.
+      seed: (optional) the seed for the RNG. If None, it will be chosen
+            nondeterministically
+      algorithm: (optional) the RNG algorithm. If None, it will be
+                 auto-selected.
+    """
     if copy_from is None:
+      if algorithm is None:
+        # TODO(wangpeng): more sophisticated algorithm selection
+        algorithm = DEFAULT_ALGORITHM
       if seed is None:
-        seed = non_deterministic_seed()
-      state = create_rng_state(seed, algorithm)
+        state = non_deterministic_ints(shape=[_get_state_size(algorithm)],
+                                       dtype=SEED_TYPE)
+      else:
+        state = create_rng_state(seed, algorithm)
       self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._alg_var = algorithm
     else:
       assert seed is None
-      state = copy_from.state
-      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._state_var = variables.Variable(copy_from.state, dtype=STATE_TYPE)
+      self._alg_var = copy_from.algorithm
 
   def reset(self, seed):
-    algorithm = int(self.algorithm)
-    state = create_rng_state(seed, algorithm)
+    """Resets the generator.
+
+    Args:
+      seed: the seed to reset the RNG to.
+    """
+    state = create_rng_state(seed, self.algorithm)
     self._state_var.assign(state)
 
   @property
@@ -179,24 +204,66 @@ class Generator(tracking.AutoCheckpointable):
 
   @property
   def algorithm(self):
-    return self._state_var[0]
+    return self._alg_var
+
+  def _standard_normal(self, shape, dtype):
+    return gen_stateful_random_ops.stateful_standard_normal_v2(
+        self.state.handle, self.algorithm, shape, dtype=dtype)
 
   # The following functions return a tensor and as a side effect update
   # self._state_var.
-  def standard_normal(self, shape, dtype=dtypes.float32):
-    return gen_stateful_random_ops.stateful_standard_normal(
-        self.state.handle, shape, dtype)
-
   def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
              name=None):
     with ops.name_scope(name, "stateful_normal", [shape, mean, stddev]) as name:
       shape = _shape_tensor(shape)
       mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
       stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
-      rnd = self.standard_normal(shape, dtype)
+      rnd = self._standard_normal(shape, dtype=dtype)
       return math_ops.add(rnd * stddev, mean, name=name)
 
-  # TODO(wangpeng): implement other distributions (`uniform`,
+  def uniform(self, shape, minval=0, maxval=None,
+              dtype=dtypes.float32, name=None):
+    dtype = dtypes.as_dtype(dtype)
+    if maxval is None:
+      if dtype.is_integer:
+        raise ValueError("Must specify maxval for integer dtype %r" % dtype)
+      maxval = 1
+    with ops.name_scope(name, "stateful_uniform",
+                        [shape, minval, maxval]) as name:
+      shape = _shape_tensor(shape)
+      minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
+      maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max")
+      if dtype.is_integer:
+        return gen_stateful_random_ops.stateful_uniform_int(
+            self.state.handle, self.algorithm, shape=shape,
+            minval=minval, maxval=maxval, name=name)
+      else:
+        # TODO(wangpeng): implement uniform for floats
+        raise ValueError("uniform for floats not implemented yet")
+
+  def uniform_full_int(self, shape, dtype=dtypes.uint64, name=None):
+    """Uniform distribution on an integer type's entire range.
+
+    The other method `uniform` only covers the range [minval, maxval), which
+    cannot be `dtype`'s full range because `maxval` is of type `dtype`.
+
+    Args:
+      shape: the shape of the output.
+      dtype: (optional) the integer type, default to uint64.
+      name: (optional) the name of the node.
+
+    Returns:
+      A tensor of random numbers of the required shape.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    with ops.name_scope(name, "stateful_uniform_full_int",
+                        [shape]) as name:
+      shape = _shape_tensor(shape)
+      return gen_stateful_random_ops.stateful_uniform_full_int(
+          self.state.handle, self.algorithm, shape=shape,
+          dtype=dtype, name=name)
+
+  # TODO(wangpeng): implement other distributions (
   #   `truncated_normal`, etc.)
   # TODO(wangpeng): implement `make_seeds`
   # TODO(wangpeng): implement `make_generators`
@@ -237,5 +304,6 @@ def set_global_generator(generator):
 def reset_global_generator(seed, algorithm=None):
   global global_generator
   if algorithm is None:
-    algorithm = int(global_generator.algorithm)  # preserve the old algorithm
+    # preserve the old algorithm
+    algorithm = int(get_global_generator().algorithm)
   global_generator = Generator(seed=seed, algorithm=algorithm)
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
index 92419a0770e302acce205cf9c2c37023da72434c..ea1cebd18fe3a22d88cebc2ee94d423204ad1abe 100644
--- a/tensorflow/python/ops/stateful_random_ops_test.py
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -18,18 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_random_ops
+from tensorflow.python.ops import gen_stateful_random_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import stateful_random_ops as \
 random
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+g_seeded = None
+g_unseeded = None
+
+
 class StatefulRandomOpsTest(test.TestCase):
 
   def testCreateRNGStateIntSeed(self):
@@ -39,10 +48,79 @@ class StatefulRandomOpsTest(test.TestCase):
                                     random.RNG_ALG_PHILOX)
     self.assertAllEqual(
         list(map(random._uint_to_int,
-                 [random.RNG_ALG_PHILOX, 0xFFAA666677778888,
-                  0xFFFF222233334444] + [0] * (random.PHILOX_STATE_SIZE - 2))),
+                 [0xFFAA666677778888, 0xFFFF222233334444] +
+                 [0] * (random.PHILOX_STATE_SIZE - 2))),
         state)
 
+  @test_util.run_v2_only
+  def testNonDeterministicInts(self):
+    """Tests that non_deterministic_ints returns different results every time.
+
+    This test is flaky, but with very low probability of failing.
+    """
+    shape = [2, 3]
+    dtype = dtypes.uint64
+    a = random.non_deterministic_ints(shape=shape, dtype=dtype)
+    self.assertAllEqual(shape, a.shape)
+    self.assertEqual(dtype, a.dtype)
+    b = random.non_deterministic_ints(shape, dtype=dtype)
+    self.assertNotAllClose(a, b)
+
+  @test_util.run_v2_only
+  def testGeneratorCreationInDefun(self):
+    """Tests creating a Generator in defun.
+
+    The interaction between Generator creation and defun should be the same as
+    tf.Variable.
+    """
+    seed = 1234
+    shape = [2, 3]
+    with ops.device("/device:CPU:0"):
+      gen = random.Generator(seed=seed)
+      expected_normal1 = gen.normal(shape)
+      expected_normal2 = gen.normal(shape)
+      @def_function.function
+      def f():
+        global g_seeded
+        global g_unseeded
+        # defun'ed function should only create variables once
+        if g_seeded is None:
+          g_seeded = random.Generator(seed=seed)
+        if g_unseeded is None:
+          g_unseeded = random.Generator()
+        r = g_seeded.normal(shape)
+        r = (r, g_unseeded.normal(shape))
+        return r
+      def check_results(expected_normal, v1, v2):
+        self.assertAllEqual(expected_normal, v1)
+        self.assertAllEqual(shape, v2.shape)
+      check_results(expected_normal1, *f())
+      check_results(expected_normal2, *f())
+
+  @test_util.run_v1_only
+  def testTF1(self):
+    seed = 1234
+    shape = [2, 3]
+    expected_normal1 = constant_op.constant(
+        [[0.9356609, 1.0854305, -0.93788373],
+         [-0.50615472, 1.31697023, 0.71375787]], dtype=dtypes.float32)
+    expected_normal2 = constant_op.constant(
+        [[-0.3964749, 0.8369565, -0.30946946],
+         [1.1206646, 1.00852597, -0.10185789]], dtype=dtypes.float32)
+    with self.cached_session() as sess:
+      gen1 = random.Generator(seed=seed)
+      gen2 = random.Generator()
+      sess.run((gen1._state_var.initializer, gen2._state_var.initializer))
+      r1 = gen1.normal(shape)
+      r2 = gen2.normal(shape)
+      def f():
+        return sess.run((r1, r2))
+      def check_results(expected_normal, v1, v2):
+        self.assertAllEqual(expected_normal, v1)
+        self.assertAllEqual(shape, v2.shape)
+      check_results(expected_normal1, *f())
+      check_results(expected_normal2, *f())
+
   @test_util.run_v2_only
   @test_util.also_run_as_tf_function
   def testEagerAndDefun(self):
@@ -112,27 +190,81 @@ class StatefulRandomOpsTest(test.TestCase):
     compare(True, False)
 
   @test_util.run_v2_only
-  def testSameAsOldRandomOps(self):
-    """Tests that the generated numbers are the same as the old random_ops.py .
+  def testCPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The CPU version.
     """
-    seed1, seed2 = 50, 60
+    seed1, seed2 = 79, 25
     # note how the two seeds for the old op correspond to the seed for the new
     # op
-    random.get_global_generator().reset([0, seed2, seed1])
-    shape = constant_op.constant([2, 3])
-    dtype = dtypes.float32
+    with ops.device("/device:CPU:0"):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
     # create a graph for the old op in order to call it many times
     @def_function.function
     def old():
-      return gen_random_ops.random_standard_normal(
-          shape, dtype=dtype, seed=seed1, seed2=seed2)
+      with ops.device("/device:CPU:0"):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
 
     def new():
-      return random.get_global_generator().standard_normal(shape, dtype=dtype)
+      with ops.device("/device:CPU:0"):
+        return random.get_global_generator().normal(shape, dtype=dtype)
 
     for _ in range(100):
       self.assertAllEqual(old(), new())
 
+  @test_util.run_v2_only
+  @test_util.run_cuda_only
+  def testGPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The GPU version.
+    """
+    seed1, seed2 = 79, 25
+    with ops.device(test_util.gpu_device_name()):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
+    @def_function.function
+    def old():
+      with ops.device(test_util.gpu_device_name()):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    def new():
+      with ops.device(test_util.gpu_device_name()):
+        return random.get_global_generator().normal(shape, dtype=dtype)
+
+    for _ in range(100):
+      self.assertAllEqual(old(), new())
+
+  @test_util.run_v2_only
+  def testStatefulStandardNormal(self):
+    """Tests that op 'StatefulStandardNormal' still works.
+    """
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+    seed = 1234
+    algorithm = random.RNG_ALG_PHILOX
+    state = random._make_state_from_seed(seed, algorithm)
+    with ops.device("/device:CPU:0"):
+      var1 = variables.Variable(
+          np.concatenate((np.array([algorithm], dtype=random.STATE_TYPE),
+                          state), axis=None),
+          dtype=random.STATE_TYPE)
+      var2 = variables.Variable(state, dtype=random.STATE_TYPE)
+      for _ in range(100):
+        t1 = gen_stateful_random_ops.stateful_standard_normal(
+            var1.handle, shape, dtype)
+        t2 = gen_stateful_random_ops.stateful_standard_normal_v2(
+            var2.handle, algorithm, shape, dtype)
+        self.assertAllEqual(t1, t2)
+
   @test_util.run_v2_only
   def testResetGlobalGeneratorBadWithDefun(self):
     """Demonstrates that reset_global_generator don't work properly with defun.
diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py
index b119049b163dd57aee08f078e5ab5ca913f61706..0576f6e933c6628681150ed623d85a69f2f745a2 100644
--- a/tensorflow/python/ops/stateless_random_ops.py
+++ b/tensorflow/python/ops/stateless_random_ops.py
@@ -182,7 +182,7 @@ def stateless_truncated_normal(shape,
 
 @tf_export(v1=["random.stateless_multinomial"])
 @deprecation.deprecated(
-    date=None, instructions="Use tf.random.stateless_categorical instead.")
+    date=None, instructions="Use `tf.random.stateless_categorical` instead.")
 def stateless_multinomial(logits,
                           num_samples,
                           seed,
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 9967f48060c2aefd7c5fe789c82f935751efc45c..ebd0d9b440003ef666e2bc518ac04010d2884caa 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -192,15 +192,18 @@ def string_format(template, inputs, placeholder="{}", summarize=3, name=None):
                                       name=name)
 
 
-@tf_export("string_split")
-def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=invalid-name
+@tf_export(v1=["string_split"])
+@deprecation.deprecated_args(None,
+                             "delimiter is deprecated, please use sep instead.",
+                             "delimiter")
+def string_split(source, sep=None, skip_empty=True, delimiter=None):  # pylint: disable=invalid-name
   """Split elements of `source` based on `delimiter` into a `SparseTensor`.
 
   Let N be the size of source (typically N will be the batch size). Split each
   element of `source` based on `delimiter` and return a `SparseTensor`
   containing the split tokens. Empty tokens are ignored.
 
-  If `delimiter` is an empty string, each element of the `source` is split
+  If `sep` is an empty string, each element of the `source` is split
   into individual strings, each containing one byte. (This includes splitting
   multibyte sequences of UTF-8.) If delimiter contains multiple bytes, it is
   treated as a set of delimiters with each considered a potential split point.
@@ -219,9 +222,10 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
 
   Args:
     source: `1-D` string `Tensor`, the strings to split.
-    delimiter: `0-D` string `Tensor`, the delimiter character, the string should
-      be length 0 or 1.
+    sep: `0-D` string `Tensor`, the delimiter character, the string should
+      be length 0 or 1. Default is ' '.
     skip_empty: A `bool`. If `True`, skip the empty strings from the result.
+    delimiter: deprecated alias for `sep`.
 
   Raises:
     ValueError: If delimiter is not a string.
@@ -231,6 +235,11 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
     The first column of the indices corresponds to the row in `source` and the
     second column corresponds to the index of the split component in this row.
   """
+  delimiter = deprecation.deprecated_argument_lookup(
+      "sep", sep, "delimiter", delimiter)
+
+  if delimiter is None:
+    delimiter = " "
   delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string)
   source = ops.convert_to_tensor(source, dtype=dtypes.string)
 
@@ -265,7 +274,7 @@ def string_split_v2(source, sep=None, maxsplit=-1):
   deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
   sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
   string, consecutive whitespace are regarded as a single separator, and the
-  result will contain no empty strings at the startor end if the string has
+  result will contain no empty strings at the start or end if the string has
   leading or trailing whitespace.
 
   Note that the above mentioned behavior matches python's str.split.
@@ -321,7 +330,10 @@ def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
                 keep_dims=False,
                 separator="",
                 name=None,
-                reduction_indices=None):
+                reduction_indices=None,
+                keepdims=None):
+  keep_dims = deprecation.deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
   inputs_t = ops.convert_to_tensor(inputs)
   reduction_indices = _reduce_join_reduction_dims(
       inputs_t, axis, reduction_indices)
@@ -422,9 +434,19 @@ def string_to_number(input, out_type=dtypes.float32, name=None):
     A `Tensor` of type `out_type`.
   """
   return gen_parsing_ops.string_to_number(input, out_type, name)
-tf_export(v1=["strings.to_number", "string_to_number"])(
-    gen_parsing_ops.string_to_number
-    )
+
+
+@tf_export(v1=["strings.to_number", "string_to_number"])
+def string_to_number_v1(
+    string_tensor=None,
+    out_type=dtypes.float32,
+    name=None,
+    input=None):
+  string_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "string_tensor", string_tensor)
+  return gen_parsing_ops.string_to_number(string_tensor, out_type, name)
+
+string_to_number_v1.__doc__ = gen_parsing_ops.string_to_number.__doc__
 
 
 @tf_export("strings.to_hash_bucket", v1=[])
@@ -450,6 +472,16 @@ def string_to_hash_bucket(input, num_buckets, name=None):
   """
   # pylint: enable=line-too-long
   return gen_string_ops.string_to_hash_bucket(input, num_buckets, name)
-tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])(
-    gen_string_ops.string_to_hash_bucket
-    )
+
+
+@tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])
+def string_to_hash_bucket_v1(
+    string_tensor=None,
+    num_buckets=None,
+    name=None,
+    input=None):
+  string_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "string_tensor", string_tensor)
+  return gen_string_ops.string_to_hash_bucket(string_tensor, num_buckets, name)
+
+string_to_hash_bucket_v1.__doc__ = gen_string_ops.string_to_hash_bucket.__doc__
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 93d8d50842ba681688e6d42890445ab4e6879124..37b80d5e20bf06c041a669c14ac6d88201af2180 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -21,9 +21,7 @@ from __future__ import print_function
 import contextlib
 import re
 
-from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging
 
 
@@ -44,30 +42,6 @@ def collect(val, collections, default_collections):
 _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
-def skip_summary():
-  """Determines if summary should be skipped.
-
-  If using multiple replicas in distributed strategy, skip summaries on all
-  replicas except the first one (replica_id=0).
-
-  Returns:
-    True if the summary is skipped; False otherwise.
-  """
-
-  # TODO(priyag): Add a new optional argument that will provide multiple
-  # alternatives to override default behavior. (e.g. run on last replica,
-  # compute sum or mean across replicas).
-  replica_context = distribution_strategy_context.get_replica_context()
-  if not replica_context:
-    return False
-  # TODO(b/118385803): when replica_id of _TPUReplicaContext is properly
-  # initialized, remember to change here as well.
-  replica_id = replica_context.replica_id_in_sync_group
-  if isinstance(replica_id, ops.Tensor):
-    replica_id = tensor_util.constant_value(replica_id)
-  return replica_id and replica_id > 0
-
-
 def clean_tag(name):
   """Cleans a tag. Removes illegal characters for instance.
 
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 168cb975548095be4648a9e705deb797241363c7..e64a7d8de7fa9d6b7c121093495db448c847b0db 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -19,19 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+import collections
+import functools
 import getpass
 import os
 import re
+import threading
 import time
 
 import six
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import profiler as _profiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_summary_ops
@@ -44,12 +52,6 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-
-# Dictionary mapping graph keys to a boolean Tensor (or callable returning
-# a boolean Tensor) indicating whether we should record summaries for the
-# graph identified by the key of the dictionary.
-_SHOULD_RECORD_SUMMARIES = {}
-
 # A global dictionary mapping graph keys to a list of summary writer init ops.
 _SUMMARY_WRITER_INIT_OP = {}
 
@@ -58,13 +60,26 @@ _RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
 _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 
 
-def _should_record_summaries_internal():
-  """Returns boolean Tensor if summaries should/shouldn't be recorded, or None.
+def _should_record_summaries_internal(default_state):
+  """Returns boolean Tensor if summaries should/shouldn't be recorded.
+
+  Now the summary condition is decided by logical "and" of two conditions:
+  ctx.summary_recording and ctx.summary_recording_distribution_strategy. The
+  former one is usually set by user, and the latter one is controlled by
+  DistributionStrategy (tf.distribute.ReplicaContext).
+
+  Args:
+    default_state: can be True or False. The default summary behavior when user
+      does not specify ctx.summary_recording and
+      ctx.summary_recording_distribution_strategy is True.
   """
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  should = _SHOULD_RECORD_SUMMARIES.get(key)
-  return should() if callable(should) else should
+  ctx = context.context()
+  resolve = lambda x: x() if callable(x) else x
+  cond_distributed = resolve(ctx.summary_recording_distribution_strategy)
+  cond = resolve(ctx.summary_recording)
+  if cond is None:
+    cond = default_state
+  return math_ops.logical_and(cond_distributed, cond)
 
 
 def _should_record_summaries_v2():
@@ -73,42 +88,36 @@ def _should_record_summaries_v2():
   If no recording status has been set, this defaults to True, unlike the public
   should_record_summaries().
   """
-  result = _should_record_summaries_internal()
-  return True if result is None else result
+  return _should_record_summaries_internal(default_state=True)
 
 
 def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
-  result = _should_record_summaries_internal()
-  return False if result is None else result
+  return _should_record_summaries_internal(default_state=False)
 
 
+@tf_export("summary.record_if", v1=[])
 @tf_contextlib.contextmanager
-def _record_summaries(boolean=True):
+def record_if(condition):
   """Sets summary recording on or off per the provided boolean value.
 
   The provided value can be a python boolean, a scalar boolean Tensor, or
   or a callable providing such a value; if a callable is passed it will be
-  invoked each time should_record_summaries() is called to determine whether
-  summary writing should be enabled.
+  invoked on-demand to determine whether summary writing will occur.
 
   Args:
-    boolean: can be True, False, a bool Tensor, or a callable providing such.
-      Defaults to True.
+    condition: can be True, False, a bool Tensor, or a callable providing such.
 
   Yields:
     Returns a context manager that sets this value on enter and restores the
     previous value on exit.
   """
-  # TODO(nickfelt): make this threadlocal
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, None)
+  old = context.context().summary_recording
   try:
-    _SHOULD_RECORD_SUMMARIES[key] = boolean
+    context.context().summary_recording = condition
     yield
   finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+    context.context().summary_recording = old
 
 
 # TODO(apassos) consider how to handle local step here.
@@ -120,91 +129,179 @@ def record_summaries_every_n_global_steps(n, global_step=None):
     should = lambda: math_ops.equal(global_step % n, 0)
     if not context.executing_eagerly():
       should = should()
-  return _record_summaries(should)
+  return record_if(should)
 
 
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  return _record_summaries(True)
+  return record_if(True)
 
 
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  return _record_summaries(False)
+  return record_if(False)
+
+
+@tf_export("summary.experimental.get_step", v1=[])
+def get_step():
+  """Returns the default summary step for the current thread.
+
+  Returns:
+    The step set by `tf.summary.experimental.set_step()` if one has been set,
+    otherwise None.
+  """
+  return context.context().summary_step
+
+
+@tf_export("summary.experimental.set_step", v1=[])
+def set_step(step):
+  """Sets the default summary step for the current thread.
+
+  For convenience, this function sets a default value for the `step` parameter
+  used in summary-writing functions elsewhere in the API so that it need not
+  be explicitly passed in every such invocation. The value can be a constant
+  or a variable, and can be retrieved via `tf.summary.experimental.get_step()`.
+
+  Note: when using this with @tf.functions, the step value will be captured at
+  the time the function is traced, so changes to the step outside the function
+  will not be reflected inside the function unless using a `tf.Variable` step.
+
+  Args:
+    step: An `int64`-castable default step value, or None to unset.
+  """
+  context.context().summary_step = step
 
 
 @tf_export("summary.SummaryWriter", v1=[])
+@six.add_metaclass(abc.ABCMeta)
 class SummaryWriter(object):
-  """Encapsulates a stateful summary writer resource.
+  """Interface representing a stateful summary writer object."""
 
-  See also:
-  - `tf.summary.create_file_writer`
-  - `tf.summary.create_db_writer`
-  """
+  @abc.abstractmethod
+  def set_as_default(self):
+    """Enables this summary writer for the current thread."""
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  @tf_contextlib.contextmanager
+  def as_default(self):
+    """Returns a context manager that enables summary writing."""
+    raise NotImplementedError()
+
+  def init(self):
+    """Initializes the summary writer."""
+    raise NotImplementedError()
+
+  def flush(self):
+    """Flushes any buffered data."""
+    raise NotImplementedError()
+
+  def close(self):
+    """Flushes and closes the summary writer."""
+    raise NotImplementedError()
+
+
+class ResourceSummaryWriter(SummaryWriter):
+  """Implementation of SummaryWriter using a SummaryWriterInterface resource."""
 
-  def  __init__(self, resource, init_op_fn):
-    self._resource = resource
-    # TODO(nickfelt): cache constructed ops in graph mode
+  def  __init__(self, shared_name, init_op_fn, name=None, v2=False):
+    self._resource = gen_summary_ops.summary_writer(
+        shared_name=shared_name, name=name)
+    # TODO(nickfelt): cache other constructed ops in graph mode
     self._init_op_fn = init_op_fn
-    if context.executing_eagerly() and self._resource is not None:
+    self._init_op = init_op_fn(self._resource)
+    self._v2 = v2
+    self._closed = False
+    if context.executing_eagerly():
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
+    else:
+      global _SUMMARY_WRITER_INIT_OP
+      key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+      _SUMMARY_WRITER_INIT_OP.setdefault(key, []).append(self._init_op)
 
   def set_as_default(self):
     """Enables this summary writer for the current thread."""
-    context.context().summary_writer_resource = self._resource
+    if self._v2 and context.executing_eagerly() and self._closed:
+      raise RuntimeError("SummaryWriter is already closed")
+    context.context().summary_writer = self
 
   @tf_contextlib.contextmanager
   def as_default(self):
-    """Enables summary writing within a `with` block."""
-    if self._resource is None:
+    """Returns a context manager that enables summary writing."""
+    if self._v2 and context.executing_eagerly() and self._closed:
+      raise RuntimeError("SummaryWriter is already closed")
+    old = context.context().summary_writer
+    try:
+      context.context().summary_writer = self
       yield self
-    else:
-      old = context.context().summary_writer_resource
-      try:
-        context.context().summary_writer_resource = self._resource
-        yield self
-        # Flushes the summary writer in eager mode or in graph functions, but
-        # not in legacy graph mode (you're on your own there).
-        with ops.device("cpu:0"):
-          gen_summary_ops.flush_summary_writer(self._resource)
-      finally:
-        context.context().summary_writer_resource = old
+      # Flushes the summary writer in eager mode or in graph functions, but
+      # not in legacy graph mode (you're on your own there).
+      self.flush()
+    finally:
+      context.context().summary_writer = old
 
   def init(self):
-    """Operation to initialize the summary writer resource."""
-    if self._resource is not None:
-      return self._init_op_fn()
+    """Initializes the summary writer."""
+    if self._v2:
+      if context.executing_eagerly() and self._closed:
+        raise RuntimeError("SummaryWriter is already closed")
+      return self._init_op
+    # Legacy behavior allows re-initializing the resource.
+    return self._init_op_fn(self._resource)
 
-  def _flush(self):
+  def flush(self):
+    """Flushes any buffered data."""
+    if self._v2 and context.executing_eagerly() and self._closed:
+      return
     return _flush_fn(writer=self)
 
-  def flush(self):
-    """Operation to force the summary writer to flush any buffered data."""
-    if self._resource is not None:
-      return self._flush()
+  def close(self):
+    """Flushes and closes the summary writer."""
+    if self._v2 and context.executing_eagerly() and self._closed:
+      return
+    try:
+      with ops.control_dependencies([self.flush()]):
+        with ops.device("cpu:0"):
+          return gen_summary_ops.close_summary_writer(self._resource)
+    finally:
+      if self._v2 and context.executing_eagerly():
+        self._closed = True
 
-  def _close(self):
-    with ops.control_dependencies([self.flush()]):
-      with ops.device("cpu:0"):
-        return gen_summary_ops.close_summary_writer(self._resource)
+
+class NoopSummaryWriter(SummaryWriter):
+  """A summary writer that does nothing, for create_noop_writer()."""
+
+  def set_as_default(self):
+    pass
+
+  @tf_contextlib.contextmanager
+  def as_default(self):
+    yield
+
+  def init(self):
+    pass
+
+  def flush(self):
+    pass
 
   def close(self):
-    """Operation to flush and close the summary writer resource."""
-    if self._resource is not None:
-      return self._close()
+    pass
 
 
+@tf_export(v1=["summary.initialize"])
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
     session=None):
   """Initializes summary writing for graph execution mode.
 
+  This operation is a no-op when executing eagerly.
+
   This helper method provides a higher-level alternative to using
   `tf.contrib.summary.summary_writer_initializer_op` and
   `tf.contrib.summary.graph`.
 
-  Most users will also want to call `tf.train.create_global_step`
+  Most users will also want to call `tf.compat.v1.train.create_global_step`
   which can happen before or after this function is called.
 
   Args:
@@ -221,7 +318,7 @@ def initialize(
   """
   if context.executing_eagerly():
     return
-  if context.context().summary_writer_resource is None:
+  if context.context().summary_writer is None:
     raise RuntimeError("No default tf.contrib.summary.SummaryWriter found")
   if session is None:
     session = ops.get_default_session()
@@ -235,6 +332,66 @@ def initialize(
 
 
 @tf_export("summary.create_file_writer", v1=[])
+def create_file_writer_v2(logdir,
+                          max_queue=None,
+                          flush_millis=None,
+                          filename_suffix=None,
+                          name=None):
+  """Creates a summary file writer for the given log directory.
+
+  Args:
+    logdir: a string specifying the directory in which to write an event file.
+    max_queue: the largest number of summaries to keep in a queue; will
+     flush once the queue gets bigger than this. Defaults to 10.
+    flush_millis: the largest interval between flushes. Defaults to 120,000.
+    filename_suffix: optional suffix for the event file name. Defaults to `.v2`.
+    name: a name for the op that creates the writer.
+
+  Returns:
+    A SummaryWriter object.
+  """
+  if logdir is None:
+    raise ValueError("logdir cannot be None")
+  inside_function = ops.inside_function()
+  with ops.name_scope(name, "create_file_writer") as scope, ops.device("cpu:0"):
+    # Run init inside an init_scope() to hoist it out of tf.functions.
+    with ops.init_scope():
+      if context.executing_eagerly():
+        _check_create_file_writer_args(
+            inside_function,
+            logdir=logdir,
+            max_queue=max_queue,
+            flush_millis=flush_millis,
+            filename_suffix=filename_suffix)
+      logdir = ops.convert_to_tensor(logdir, dtype=dtypes.string)
+      if max_queue is None:
+        max_queue = constant_op.constant(10)
+      if flush_millis is None:
+        flush_millis = constant_op.constant(2 * 60 * 1000)
+      if filename_suffix is None:
+        filename_suffix = constant_op.constant(".v2")
+      # Prepend the PID and a process-local UID to the filename suffix to avoid
+      # filename collisions within the machine (the filename already contains
+      # the hostname to avoid cross-machine collisions).
+      unique_prefix = constant_op.constant(".%s.%s" % (os.getpid(), ops.uid()))
+      filename_suffix = unique_prefix + filename_suffix
+      # Use a unique shared_name to prevent resource sharing.
+      if context.executing_eagerly():
+        shared_name = context.shared_name()
+      else:
+        shared_name = ops._name_from_scope_name(scope)  # pylint: disable=protected-access
+      return ResourceSummaryWriter(
+          shared_name=shared_name,
+          init_op_fn=functools.partial(
+              gen_summary_ops.create_summary_file_writer,
+              logdir=logdir,
+              max_queue=max_queue,
+              flush_millis=flush_millis,
+              filename_suffix=filename_suffix),
+          name=name,
+          v2=True)
+
+
 def create_file_writer(logdir,
                        max_queue=None,
                        flush_millis=None,
@@ -261,7 +418,7 @@ def create_file_writer(logdir,
     summary writer.
   """
   if logdir is None:
-    return SummaryWriter(None, None)
+    return NoopSummaryWriter()
   logdir = str(logdir)
   with ops.device("cpu:0"):
     if max_queue is None:
@@ -272,13 +429,14 @@ def create_file_writer(logdir,
       filename_suffix = constant_op.constant(".v2")
     if name is None:
       name = "logdir:" + logdir
-    return _make_summary_writer(
-        name,
-        gen_summary_ops.create_summary_file_writer,
-        logdir=logdir,
-        max_queue=max_queue,
-        flush_millis=flush_millis,
-        filename_suffix=filename_suffix)
+    return ResourceSummaryWriter(
+        shared_name=name,
+        init_op_fn=functools.partial(
+            gen_summary_ops.create_summary_file_writer,
+            logdir=logdir,
+            max_queue=max_queue,
+            flush_millis=flush_millis,
+            filename_suffix=filename_suffix))
 
 
 def create_db_writer(db_uri,
@@ -323,26 +481,23 @@ def create_db_writer(db_uri,
         "experiment_name", _EXPERIMENT_NAME_PATTERNS, experiment_name)
     run_name = _cleanse_string("run_name", _RUN_NAME_PATTERNS, run_name)
     user_name = _cleanse_string("user_name", _USER_NAME_PATTERNS, user_name)
-    return _make_summary_writer(
-        name,
-        gen_summary_ops.create_summary_db_writer,
-        db_uri=db_uri,
-        experiment_name=experiment_name,
-        run_name=run_name,
-        user_name=user_name)
-
-
-def _make_summary_writer(name, factory, **kwargs):
-  resource = gen_summary_ops.summary_writer(shared_name=name)
-  init_op_fn = lambda: factory(resource, **kwargs)
-  init_op = init_op_fn()
-  if not context.executing_eagerly():
-    # TODO(apassos): Consider doing this instead.
-    #   ops.get_default_session().run(init_op)
-    global _SUMMARY_WRITER_INIT_OP
-    key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-    _SUMMARY_WRITER_INIT_OP.setdefault(key, []).append(init_op)
-  return SummaryWriter(resource, init_op_fn)
+    return ResourceSummaryWriter(
+        shared_name=name,
+        init_op_fn=functools.partial(
+            gen_summary_ops.create_summary_db_writer,
+            db_uri=db_uri,
+            experiment_name=experiment_name,
+            run_name=run_name,
+            user_name=user_name))
+
+
+@tf_export("summary.create_noop_writer", v1=[])
+def create_noop_writer():
+  """Returns a summary writer that does nothing.
+
+  This is useful as a placeholder in code that expects a context manager.
+  """
+  return NoopSummaryWriter()
 
 
 def _cleanse_string(name, pattern, value):
@@ -432,7 +587,7 @@ def summary_scope(name, default_name="summary", values=None):
 
 
 @tf_export("summary.write", v1=[])
-def write(tag, tensor, step, metadata=None, name=None):
+def write(tag, tensor, step=None, metadata=None, name=None):
   """Writes a generic summary to the default SummaryWriter if one exists.
 
   This exists primarily to support the definition of type-specific summary ops
@@ -443,21 +598,32 @@ def write(tag, tensor, step, metadata=None, name=None):
     tag: string tag used to identify the summary (e.g. in TensorBoard), usually
       generated with `tf.summary.summary_scope`
     tensor: the Tensor holding the summary data to write
-    step: `int64`-castable monotic step value for this summary
+    step: Explicit `int64`-castable monotonic step value for this summary. If
+      omitted, this defaults to `tf.summary.experimental.get_step()`, which must
+      not be None.
     metadata: Optional SummaryMetadata, as a proto or serialized bytes
     name: Optional string name for this op.
 
   Returns:
     True on success, or false if no summary was written because no default
     summary writer was available.
+
+  Raises:
+    ValueError: if a default writer exists, but no step was provided and
+      `tf.summary.experimental.get_step()` is None.
   """
   with ops.name_scope(name, "write_summary") as scope:
-    if context.context().summary_writer_resource is None:
+    if context.context().summary_writer is None:
       return constant_op.constant(False)
+    if step is None:
+      step = get_step()
+      if step is None:
+        raise ValueError("No step set via 'step' argument or "
+                         "tf.summary.experimental.set_step()")
     if metadata is None:
-      serialized_metadata = constant_op.constant(b"")
+      serialized_metadata = b""
     elif hasattr(metadata, "SerializeToString"):
-      serialized_metadata = constant_op.constant(metadata.SerializeToString())
+      serialized_metadata = metadata.SerializeToString()
     else:
       serialized_metadata = metadata
 
@@ -466,7 +632,7 @@ def write(tag, tensor, step, metadata=None, name=None):
       # Note the identity to move the tensor to the CPU.
       with ops.device("cpu:0"):
         write_summary_op = gen_summary_ops.write_summary(
-            context.context().summary_writer_resource,
+            context.context().summary_writer._resource,  # pylint: disable=protected-access
             step,
             array_ops.identity(tensor),
             tag,
@@ -501,7 +667,7 @@ def summary_writer_function(name, tensor, function, family=None):
       with ops.control_dependencies([function(tag, scope)]):
         return constant_op.constant(True)
 
-  if context.context().summary_writer_resource is None:
+  if context.context().summary_writer is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
     op = smart_cond.smart_cond(
@@ -523,7 +689,7 @@ def generic(name, tensor, metadata=None, family=None, step=None):
       serialized_metadata = metadata
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_summary(
-        context.context().summary_writer_resource,
+        context.context().summary_writer._resource,  # pylint: disable=protected-access
         _choose_step(step),
         array_ops.identity(tensor),
         tag,
@@ -555,7 +721,7 @@ def scalar(name, tensor, family=None, step=None):
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_scalar_summary(
-        context.context().summary_writer_resource,
+        context.context().summary_writer._resource,  # pylint: disable=protected-access
         _choose_step(step),
         tag,
         array_ops.identity(tensor),
@@ -570,7 +736,7 @@ def histogram(name, tensor, family=None, step=None):
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_histogram_summary(
-        context.context().summary_writer_resource,
+        context.context().summary_writer._resource,  # pylint: disable=protected-access
         _choose_step(step),
         tag,
         array_ops.identity(tensor),
@@ -587,7 +753,7 @@ def image(name, tensor, bad_color=None, max_images=3, family=None, step=None):
                   if bad_color is None else bad_color)
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_image_summary(
-        context.context().summary_writer_resource,
+        context.context().summary_writer._resource,  # pylint: disable=protected-access
         _choose_step(step),
         tag,
         array_ops.identity(tensor),
@@ -604,7 +770,7 @@ def audio(name, tensor, sample_rate, max_outputs, family=None, step=None):
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_audio_summary(
-        context.context().summary_writer_resource,
+        context.context().summary_writer._resource,  # pylint: disable=protected-access
         _choose_step(step),
         tag,
         array_ops.identity(tensor),
@@ -648,7 +814,7 @@ def graph(param, step=None, name=None):
   if not context.executing_eagerly() and not isinstance(param, ops.Tensor):
     raise TypeError("graph() needs a tf.Tensor (e.g. tf.placeholder) in graph "
                     "mode, but was: %s" % type(param))
-  writer = context.context().summary_writer_resource
+  writer = context.context().summary_writer
   if writer is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
@@ -657,7 +823,7 @@ def graph(param, step=None, name=None):
     else:
       tensor = array_ops.identity(param)
     return gen_summary_ops.write_graph_summary(
-        writer, _choose_step(step), tensor, name=name)
+        writer._resource, _choose_step(step), tensor, name=name)  # pylint: disable=protected-access
 
 
 _graph = graph  # for functions with a graph parameter
@@ -680,7 +846,7 @@ def import_event(tensor, name=None):
     The created `tf.Operation`.
   """
   return gen_summary_ops.import_event(
-      context.context().summary_writer_resource, tensor, name=name)
+      context.context().summary_writer._resource, tensor, name=name)  # pylint: disable=protected-access
 
 
 @tf_export("summary.flush", v1=[])
@@ -699,14 +865,16 @@ def flush(writer=None, name=None):
     The created `tf.Operation`.
   """
   if writer is None:
-    writer = context.context().summary_writer_resource
+    writer = context.context().summary_writer
     if writer is None:
       return control_flow_ops.no_op()
+  if isinstance(writer, ResourceSummaryWriter):
+    resource = writer._resource  # pylint: disable=protected-access
   else:
-    if isinstance(writer, SummaryWriter):
-      writer = writer._resource  # pylint: disable=protected-access
+    # Assume we were passed a raw resource tensor.
+    resource = writer
   with ops.device("cpu:0"):
-    return gen_summary_ops.flush_summary_writer(writer, name=name)
+    return gen_summary_ops.flush_summary_writer(resource, name=name)
 
 
 _flush_fn = flush  # for within SummaryWriter.flush()
@@ -739,3 +907,271 @@ def _choose_step(step):
   if not isinstance(step, ops.Tensor):
     return ops.convert_to_tensor(step, dtypes.int64)
   return step
+
+
+def _check_create_file_writer_args(inside_function, **kwargs):
+  """Helper to check the validity of arguments to a create_file_writer() call.
+
+  Args:
+    inside_function: whether the create_file_writer() call is in a tf.function
+    **kwargs: the arguments to check, as kwargs to give them names.
+
+  Raises:
+    ValueError: if the arguments are graph tensors.
+  """
+  for arg_name, arg in kwargs.items():
+    if not isinstance(arg, ops.EagerTensor) and tensor_util.is_tensor(arg):
+      if inside_function:
+        raise ValueError(
+            "Invalid graph Tensor argument \"%s=%s\" to create_file_writer() "
+            "inside an @tf.function. The create call will be lifted into the "
+            "outer eager execution context, so it cannot consume graph tensors "
+            "defined inside the function body." % (arg_name, arg))
+      else:
+        raise ValueError(
+            "Invalid graph Tensor argument \"%s=%s\" to eagerly executed "
+            "create_file_writer()." % (arg_name, arg))
+
+
+def run_metadata(name, data, step=None):
+  """Writes entire RunMetadata summary.
+
+  A RunMetadata can contain DeviceStats, partition graphs, and function graphs.
+  Please refer to the proto for definition of each field.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Explicit `int64`-castable monotonic step value for this summary. If
+      omitted, this defaults to `tf.summary.experimental.get_step()`, which must
+      not be None.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+
+  Raises:
+    ValueError: if a default writer exists, but no step was provided and
+      `tf.summary.experimental.get_step()` is None.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  with summary_scope(name,
+                     "graph_run_metadata_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+def run_metadata_graphs(name, data, step=None):
+  """Writes graphs from a RunMetadata summary.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Explicit `int64`-castable monotonic step value for this summary. If
+      omitted, this defaults to `tf.summary.experimental.get_step()`, which must
+      not be None.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+
+  Raises:
+    ValueError: if a default writer exists, but no step was provided and
+      `tf.summary.experimental.get_step()` is None.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata_graph"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  data = config_pb2.RunMetadata(
+      function_graphs=data.function_graphs,
+      partition_graphs=data.partition_graphs)
+
+  with summary_scope(name,
+                     "graph_run_metadata_graph_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+def keras_model(name, data, step=None):
+  """Writes a Keras model as JSON to as a Summary.
+
+  Writing the Keras model configuration allows the TensorBoard graph plugin to
+  render a conceptual graph, as opposed to graph of ops. In case the model fails
+  to serialze as JSON, it ignores and returns False.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A Keras Model to write.
+    step: Explicit `int64`-castable monotonic step value for this summary. If
+      omitted, this defaults to `tf.summary.experimental.get_step()`, which must
+      not be None.
+
+  Returns:
+    True on success, or False if no summary was written because no default
+    summary writer was available.
+
+  Raises:
+    ValueError: if a default writer exists, but no step was provided and
+      `tf.summary.experimental.get_step()` is None.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_keras_model"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  try:
+    json_string = data.to_json()
+  except Exception as exc:  # pylint: disable=broad-except
+    # An exception should not break a model code.
+    logging.warn("Model failed to serialize as JSON. Ignoring... %s" % exc)
+    return False
+
+  with summary_scope(name, "graph_keras_model", [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(json_string, dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+_TraceContext = collections.namedtuple("TraceContext", ("graph", "profiler"))
+_current_trace_context_lock = threading.Lock()
+_current_trace_context = None
+
+
+@tf_export("summary.trace_on", v1=[])
+def trace_on(graph=True, profiler=False):  # pylint: disable=redefined-outer-name
+  """Starts a trace to record computation graphs and profiling information.
+
+  Must be invoked in eager mode.
+
+  When enabled, TensorFlow runtime will collection information that can later be
+  exported and consumed by TensorBoard. The trace is activated across the entire
+  TensorFlow runtime and affects all threads of execution.
+
+  To stop the trace and export the collected information, use
+  `tf.summary.trace_export`. To stop the trace without exporting, use
+  `tf.summary.trace_off`.
+
+  Args:
+    graph: If True, enables collection of executed graphs. It includes ones from
+        tf.function invocation and ones from the legacy graph mode. The default
+        is True.
+    profiler: If True, enables the advanced profiler. Enabling profiler
+        implicitly enables the graph collection. The profiler may incur a high
+        memory overhead. The default is False.
+
+  """
+  if ops.inside_function():
+    logging.warn("Cannot enable trace inside a tf.function.")
+    return
+  if not context.context().executing_eagerly():
+    logging.warn("Must enable trace in eager mode.")
+    return
+
+  global _current_trace_context
+  with _current_trace_context_lock:
+    if _current_trace_context:
+      logging.warn("Trace already enabled")
+      return
+
+    if graph and not profiler:
+      context.context().enable_graph_collection()
+    if profiler:
+      context.context().enable_run_metadata()
+      _profiler.start()
+
+    _current_trace_context = _TraceContext(graph=graph, profiler=profiler)
+
+
+@tf_export("summary.trace_export", v1=[])
+def trace_export(name, step=None, profiler_outdir=None):
+  """Stops and exports the active trace as a Summary and/or profile file.
+
+  Stops the trace and exports all metadata collected during the trace to the
+  default SummaryWriter, if one has been set.
+
+  Args:
+    name: A name for the summary to be written.
+    step: Explicit `int64`-castable monotonic step value for this summary. If
+      omitted, this defaults to `tf.summary.experimental.get_step()`, which must
+      not be None.
+    profiler_outdir: Output directory for profiler. It is required when profiler
+      is enabled when trace was started. Otherwise, it is ignored.
+
+  Raises:
+    ValueError: if a default writer exists, but no step was provided and
+      `tf.summary.experimental.get_step()` is None.
+  """
+  # TODO(stephanlee): See if we can remove profiler_outdir and infer it from
+  # the SummaryWriter's logdir.
+  global _current_trace_context
+
+  if ops.inside_function():
+    logging.warn("Cannot export trace inside a tf.function.")
+    return
+  if not context.context().executing_eagerly():
+    logging.warn("Can only export trace while executing eagerly.")
+    return
+
+  with _current_trace_context_lock:
+    if _current_trace_context is None:
+      raise ValueError("Must enable trace before export.")
+    graph, profiler = _current_trace_context  # pylint: disable=redefined-outer-name
+    if profiler and profiler_outdir is None:
+      raise ValueError("Required profiler_outdir is not specified")
+
+  run_meta = context.context().export_run_metadata()
+
+  if graph and not profiler:
+    run_metadata_graphs(name, run_meta, step)
+  else:
+    run_metadata(name, run_meta, step)
+
+  if profiler:
+    _profiler.save(profiler_outdir, _profiler.stop())
+
+  trace_off()
+
+
+@tf_export("summary.trace_off", v1=[])
+def trace_off():
+  """Stops the current trace and discards any collected information."""
+  global _current_trace_context
+  with _current_trace_context_lock:
+    _current_trace_context = None
+
+  # Disabling run_metadata disables graph collection as well.
+  context.context().disable_run_metadata()
+
+  # profiler only has start and stop. One needs to stop in order to export
+  # and stopping when it is not running will raise an error.
+  try:
+    _profiler.stop()
+  except _profiler.ProfilerNotRunningError:
+    pass
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index e02175d6feaf1dd74c560bec9e7815cbe20eddc5..3ca9799e4312d956c1b80d18597e8f886a6d50a9 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -26,8 +26,8 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
@@ -232,7 +232,7 @@ def _skip_common_stack_elements(stacktrace, base_case):
   return stacktrace[-1:]
 
 
-class Template(checkpointable.Checkpointable):
+class Template(trackable.Trackable):
   """Wrap a function to aid in variable sharing.
 
   Templates are functions that create variables the first time they are called
@@ -292,30 +292,31 @@ class Template(checkpointable.Checkpointable):
         self._variable_scope = vs
     else:
       self._variable_scope = None
-    # This variable keeps track of whether the template has been called yet,
-    # which is not the same as whether the scope has been created.
+    # This variable keeps track of whether the template has been called to
+    # completion, which is not the same as whether the scope has been created.
     self._variables_created = False
+    # `MirroredStrategy` builds the graph with multiple threads. If a
+    # `merge_call` happens within a template, multiple calls may be in progress
+    # simultaneously. This variable keeps track of whether any call of the
+    # template has started.
+    self._first_call = True
 
   def _call_func(self, args, kwargs):
     try:
-      vars_at_start = len(
-          ops.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES))
-      trainable_at_start = len(
-          ops.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES))
       if self._variables_created:
+        vars_at_start = len(
+            ops.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES))
+        trainable_at_start = len(
+            ops.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES))
+
         result = self._func(*args, **kwargs)
-      else:
-        # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
-          result = self._func(*args, **kwargs)
 
-      if self._variables_created:
         # Variables were previously created, implying this is not the first
         # time the template has been called. Check to make sure that no new
         # trainable variables were created this time around.
         trainable_variables = ops.get_collection_ref(
             ops.GraphKeys.TRAINABLE_VARIABLES)
+
         # If a variable that we intend to train is created as a side effect
         # of creating a template, then that is almost certainly an error.
         if trainable_at_start != len(trainable_variables):
@@ -333,8 +334,19 @@ class Template(checkpointable.Checkpointable):
                        "the first time, perhaps you used tf.Variable when you "
                        "meant tf.get_variable: %s",
                        variables[vars_at_start:])
-      else:
+      elif self._first_call:
+        self._first_call = False
+        try:
+          # The first time we run, restore variables if necessary (via
+          # Trackable).
+          with trackable_util.capture_dependencies(template=self):
+            result = self._func(*args, **kwargs)
+        except:
+          self._first_call = True
+          raise
         self._variables_created = True
+      else:  # We are calling the template in parallel from another thread.
+        result = self._func(*args, **kwargs)
       return result
     except Exception as exc:
       # Reraise the exception, but append the original definition to the
@@ -354,9 +366,9 @@ class Template(checkpointable.Checkpointable):
 
   def __call__(self, *args, **kwargs):
     if self._variable_scope:
-      # Only reuse variables if they were already created.
+      # Only reuse variables if not on first call.
       with variable_scope.variable_scope(
-          self._variable_scope, reuse=self._variables_created):
+          self._variable_scope, reuse=not self._first_call):
         return self._call_func(args, kwargs)
     else:
       # The scope was not created at construction time, so create it here.
@@ -577,8 +589,8 @@ class EagerTemplate(Template):
         result = self._func(*args, **kwargs)
       else:
         # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
+        # Trackable).
+        with trackable_util.capture_dependencies(template=self):
           result = self._func(*args, **kwargs)
 
       if self._variables_created:
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 90a8b0af469b1be36340244c0dfdf43e013c75a2..59fee70583e8b29f482628803e1c0133fbfd98b2 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -370,7 +370,7 @@ class _GraphTensorArray(object):
                         [self._handle, value, lengths]):
       value = ops.convert_to_tensor(value, name="value")
       with self._maybe_colocate_with(value):
-        lengths_64 = math_ops.to_int64(lengths)
+        lengths_64 = math_ops.cast(lengths, dtypes.int64)
         if self._infer_shape and not context.executing_eagerly():
           clengths = tensor_util.constant_value(lengths_64)
           if value.shape.dims is not None:
@@ -542,14 +542,20 @@ class _GraphTensorArrayV2(object):
 
   def read(self, index, name=None):
     """See TensorArray."""
-    value = list_ops.tensor_list_get_item(
-        input_handle=self._flow,
-        index=index,
-        element_dtype=self._dtype,
-        name=name)
-    if self._element_shape:
-      value.set_shape(self._element_shape[0].dims)
-    return value
+    with ops.name_scope(name, "TensorArrayV2Read", [self._flow, index]):
+      if self._element_shape:
+        element_shape = self._element_shape[0]
+      else:
+        element_shape = tensor_shape.TensorShape(None)
+      value = list_ops.tensor_list_get_item(
+          input_handle=self._flow,
+          index=index,
+          element_dtype=self._dtype,
+          element_shape=element_shape,
+          name=name)
+      if self._element_shape:
+        value.set_shape(self._element_shape[0].dims)
+      return value
 
   @tf_should_use.should_use_result
   def write(self, index, value, name=None):
@@ -569,18 +575,29 @@ class _GraphTensorArrayV2(object):
   def stack(self, name=None):
     """See TensorArray."""
     with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      if self._element_shape:
+        element_shape = self._element_shape[0]
+      else:
+        element_shape = tensor_shape.TensorShape(None)
       value = list_ops.tensor_list_stack(
-          input_handle=self._flow, element_dtype=self._dtype)
+          input_handle=self._flow,
+          element_dtype=self._dtype,
+          element_shape=element_shape)
       if self._element_shape and self._element_shape[0].dims is not None:
         value.set_shape([None] + self._element_shape[0].dims)
       return value
 
   def gather(self, indices, name=None):
     """See TensorArray."""
+    if self._element_shape:
+      element_shape = self._element_shape[0]
+    else:
+      element_shape = tensor_shape.TensorShape(None)
     value = list_ops.tensor_list_gather(
         input_handle=self._flow,
         indices=indices,
         element_dtype=self._dtype,
+        element_shape=element_shape,
         name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
       value.set_shape([None] + self._element_shape[0].dims)
@@ -619,8 +636,9 @@ class _GraphTensorArrayV2(object):
       value = ops.convert_to_tensor(value, name="value")
       if self._infer_shape and not context.executing_eagerly():
         self._merge_element_shape(value.shape[1:])
+      element_shape = self._element_shape[0] if self._element_shape else None
       flow_out = list_ops.tensor_list_scatter(
-          tensor=value, indices=indices, element_shape=-1)
+          tensor=value, indices=indices, input_handle=self._flow)
       return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
@@ -628,7 +646,7 @@ class _GraphTensorArrayV2(object):
     """See TensorArray."""
     with ops.name_scope(name, "TensorArraySplit", [self._flow, value, lengths]):
       value = ops.convert_to_tensor(value, name="value")
-      lengths_64 = math_ops.to_int64(lengths)
+      lengths_64 = math_ops.cast(lengths, dtypes.int64)
       if self._infer_shape and not context.executing_eagerly():
         clengths = tensor_util.constant_value(lengths_64)
         if value.shape.dims is not None:
@@ -818,7 +836,7 @@ class _EagerTensorArray(object):
     if self._infer_shape:
       if self._element_shape is None:
         self._element_shape = value.shape
-      elif self._element_shape != value.shape:
+      elif not self._element_shape.is_compatible_with(value.shape):
         raise ValueError("Incompatible shape for value (%s), expected (%s)" %
                          (value.shape.as_list(), self._element_shape.as_list()))
 
@@ -853,7 +871,9 @@ class _EagerTensorArray(object):
   def gather(self, indices, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    return array_ops.stack([self._maybe_zero(i) for i in indices.numpy()])
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    return array_ops.stack([self._maybe_zero(i) for i in indices])
 
   def concat(self, name=None):
     """See TensorArray."""
@@ -886,7 +906,9 @@ class _EagerTensorArray(object):
   def scatter(self, indices, value, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    for index, val in zip(indices.numpy(), array_ops.unstack(value)):
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    for index, val in zip(indices, array_ops.unstack(value)):
       self._write(index, val)  # pylint: disable=protected-access
     return self.parent()
 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 35c00778ae5c99cb5688c9ff1fa97b26c72dc855..bcef6e60e33f1274f5504c2fb23330d24f10bbe4 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -230,6 +230,26 @@ def enable_resource_variables():
   _DEFAULT_USE_RESOURCE = True
 
 
+@tf_export(v1=["resource_variables_enabled"])
+def resource_variables_enabled():
+  """Returns `True` if resource variables are enabled.
+
+  Resource variables are improved versions of TensorFlow variables with a
+  well-defined memory model. Accessing a resource variable reads its value, and
+  all ops which access a specific read value of the variable are guaranteed to
+  see the same value for that tensor. Writes which happen after a read (by
+  having a control or data dependency on the read) are guaranteed not to affect
+  the value of the read tensor, and similarly writes which happen before a read
+  are guaranteed to affect the value. No guarantees are made about unordered
+  read/write pairs.
+
+  Calling tf.enable_resource_variables() lets you opt-in to this TensorFlow 2.0
+  feature.
+  """
+  global _DEFAULT_USE_RESOURCE
+  return _DEFAULT_USE_RESOURCE
+
+
 @deprecation.deprecated(
     None, "non-resource variables are not supported in the long term")
 @tf_export(v1=["disable_resource_variables"])
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 3392b1aad9a532c71ee00ccadba39f77c79a6d80..412300772b5783bf8d337029a5e61efda23c5cfc 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
@@ -103,6 +103,17 @@ class VariableAggregationV2(enum.Enum):
   MEAN = 2
   ONLY_FIRST_REPLICA = 3
 
+  def __hash__(self):
+    return hash(self.value)
+
+  def __eq__(self, other):
+    if self is other:
+      return True
+    elif isinstance(other, VariableAggregation):
+      return int(self.value) == int(other.value)
+    else:
+      return False
+
 
 @tf_export(v1=["VariableAggregation"])
 class VariableAggregation(enum.Enum):
@@ -112,6 +123,9 @@ class VariableAggregation(enum.Enum):
   ONLY_FIRST_REPLICA = 3
   ONLY_FIRST_TOWER = 3  # DEPRECATED
 
+  def __hash__(self):
+    return hash(self.value)
+
 
 VariableAggregation.__doc__ = (
     VariableAggregationV2.__doc__ +
@@ -138,7 +152,7 @@ class VariableMetaclass(type):
                         aggregation=VariableAggregation.NONE):
     """Call on Variable class. Useful to force the signature."""
     previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
-    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+    for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
       previous_getter = _make_getter(getter, previous_getter)
 
     # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
@@ -174,7 +188,7 @@ class VariableMetaclass(type):
                         aggregation=VariableAggregation.NONE):
     """Call on Variable class. Useful to force the signature."""
     previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
-    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+    for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
       previous_getter = _make_getter(getter, previous_getter)
 
     # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
@@ -204,7 +218,7 @@ class VariableMetaclass(type):
 
 @tf_export("Variable", v1=[])
 class Variable(six.with_metaclass(VariableMetaclass,
-                                  checkpointable.Checkpointable)):
+                                  trackable.Trackable)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
@@ -304,7 +318,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
 
   ```
@@ -1018,8 +1032,8 @@ class Variable(six.with_metaclass(VariableMetaclass,
     return self.shape
 
   def _gather_saveables_for_checkpoint(self):
-    """For implementing `Checkpointable`. This object is saveable on its own."""
-    return {checkpointable.VARIABLE_VALUE_KEY: self}
+    """For implementing `Trackable`. This object is saveable on its own."""
+    return {trackable.VARIABLE_VALUE_KEY: self}
 
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
@@ -1239,7 +1253,7 @@ class VariableV1(Variable):
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
   ```
   v = tf.Variable(True, use_resource=True)
@@ -1506,8 +1520,8 @@ class RefVariable(VariableV1):
     # Store the graph key so optimizers know how to only retrieve variables from
     # this graph.
     self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 0e427d3c6ab10517524429957e70fcfa332991af..122f275b98e529c75933bbdad46c8c1d7daca11c 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -23,7 +23,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph as func_graph_module
@@ -31,7 +30,6 @@ from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -39,10 +37,11 @@ from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import while_v2_indexed_slices_rewriter
 from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
@@ -63,7 +62,6 @@ def while_loop(cond,
                name=None,
                return_same_structure=True):
   """Like tf.while_loop, except emits a single While op."""
-  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
   # Keep the original loop_vars around to know which args were TensorArrays.
   orig_loop_vars = loop_vars
   # Cache its length since we use it at multiple places below.
@@ -87,24 +85,25 @@ def while_loop(cond,
     with ops.name_scope(None):
       cond_name = util.unique_fn_name(scope, "cond")
       body_name = util.unique_fn_name(scope, "body")
-
+    maximum_iterations_loop_var = _build_maximum_iterations_loop_var(
+        maximum_iterations)
     loop_counter = constant_op.constant(
         0,
-        dtype=maximum_iterations.dtype
+        dtype=maximum_iterations_loop_var.dtype
         if maximum_iterations is not None else None,
         name="loop_counter")
     # Add loop counter needed for computing gradients.
-    loop_vars = [loop_counter] + loop_vars
+    loop_vars = [loop_counter, maximum_iterations_loop_var] + loop_vars
 
-    shape_invariants = type(shape_invariants)([tensor_shape.scalar()
-                                              ]) + shape_invariants
+    shape_invariants = type(shape_invariants)(
+        [tensor_shape.scalar(), tensor_shape.scalar()]) + shape_invariants
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
     add_control_dependencies = ops.get_default_graph()._add_control_dependencies
 
     # Build a `cond` wrapper that can handle the extra counter loop_var.
-    def wrapped_cond(loop_counter, *args):
+    def wrapped_cond(loop_counter, maximum_iterations_arg, *args):
       # Convert the flow variables in `args` to TensorArrays. `args` should
       # already have the same structure as `orig_loop_vars` but currently there
       # is no nest.zip so we call `_pack_sequence_as` which flattens both
@@ -114,7 +113,7 @@ def while_loop(cond,
         return cond(*_pack_sequence_as(orig_loop_vars, args))
       else:
         return math_ops.logical_and(
-            loop_counter < maximum_iterations,
+            loop_counter < maximum_iterations_arg,
             cond(*_pack_sequence_as(orig_loop_vars, args)))
 
     # NOTE(skyewm): we set collections to the outer graph's collections for
@@ -122,40 +121,35 @@ def while_loop(cond,
     cond_graph = func_graph_module.func_graph_from_py_func(
         cond_name,
         wrapped_cond,
-        loop_vars, {},
+        [],  # We provide signature instead of args.
+        {},
         signature=_build_signature(loop_vars, shape_invariants),
         func_graph=util.WhileCondFuncGraph(
             cond_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
 
-    # Add external_captures of cond to the list of loop vars.
-    # Note that external tensors will be treated as loop invariants, i.e.,
-    # the value of that tensor in each iteration is the same as it was at the
-    # beginning of the loop execution.
-    loop_vars = loop_vars + cond_graph.external_captures
-    shape_invariants = shape_invariants + type(shape_invariants)(
-        [t.shape for t in cond_graph.external_captures])
-
-    def wrapped_body(loop_counter, *args):
+    def wrapped_body(loop_counter, maximum_iterations_arg, *args):
       """Loop body augmented with counter update.
 
       Args:
         loop_counter: Loop counter which needs to be incremented in the body.
+        maximum_iterations_arg: Maximum iterations of the loop.
         *args: List of args
-          args[:len_orig_loop_vars] - Args for the original loop body.
-          args[len_orig_loop_vars:] - External captures of cond. These get
-            passed through as is.
 
       Returns:
         A list of tensors the same length as args.
       """
+      # Capture the tensors already captured in cond_graph so that they appear
+      # in the same order in body_graph.external_captures.
+      for t in cond_graph.external_captures:
+        ops.get_default_graph().capture(t)
+
       # Convert the flow variables in `args` to TensorArrays. `args` should
       # already have the same structure as `orig_loop_vars` but currently there
       # is no nest.zip so we call `_pack_sequence_as` which flattens both
       # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays
       # and packs it into the structure of `orig_loop_vars`.
-      outputs = body(
-          *_pack_sequence_as(orig_loop_vars, args[:len_orig_loop_vars]))
+      outputs = body(*_pack_sequence_as(orig_loop_vars, args))
       if not nest.is_sequence(outputs):
         outputs = [outputs]
       # Compare the structure of input and output of body converting the
@@ -164,17 +158,15 @@ def while_loop(cond,
 
       outputs = _tensor_array_to_flow(outputs)
 
-      # Return the external_captures of cond_graph as is, i.e., treat them as
-      # loop invariants.
       # TODO(srbs): Update lowering code to create _Enter nodes with
       # is_constant=True for inputs that are directly passed to outputs.
-      return [loop_counter + 1] + list(outputs) + list(
-          args[len_orig_loop_vars:])
+      return [loop_counter + 1, maximum_iterations_arg] + list(outputs)
 
     body_graph = func_graph_module.func_graph_from_py_func(
         body_name,
         wrapped_body,
-        loop_vars, {},
+        [],  # We provide signature instead of args.
+        {},
         signature=_build_signature(loop_vars, shape_invariants),
         func_graph=util.WhileBodyFuncGraph(
             body_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
@@ -188,41 +180,47 @@ def while_loop(cond,
     # is_constant=True for inputs that are directly passed to outputs.
     body_graph.outputs.extend(body_graph.internal_captures)
 
-    # Capture `external_captures` of `body_graph` in `cond_graph` so that it
-    # expects to receive those as arguments.
-    # TODO(b/118457764): Dedup tensors that are captured in both the cond and
-    # body. This logic already exists in cond_v2.
+    # Capture the extra `external_captures` of `body_graph` in `cond_graph` so
+    # that it expects to receive those as arguments.
     with cond_graph.as_default():
-      for external_capture in body_graph.external_captures:
-        assert external_capture not in cond_graph.captures, (
-            "Looks like both cond and body are capturing the same tensor %s. "
-            "This is not supported yet. For now consider passing,"
-            " this as a loop variable." % str(external_capture))
-        cond_graph.capture(external_capture)
+      num_cond_captures = len(cond_graph.external_captures)
+      assert (cond_graph.external_captures ==
+              body_graph.external_captures[:num_cond_captures])
+      for body_capture in body_graph.external_captures[num_cond_captures:]:
+        assert body_capture not in cond_graph.captures
+        cond_graph.capture(body_capture)
 
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
     # specified.
     num_flattened_outputs = len(nest.flatten(orig_loop_vars))
+    # First var is loop counter and second var is maximum_iterations.
+    first_loop_var_index = 2
     _check_shapes_compat(
-        body_graph.outputs[1:1 + num_flattened_outputs],
-        nest.flatten(shape_invariants[1:1 + len_orig_loop_vars]),
-        nest.flatten(loop_vars[1:1 + len_orig_loop_vars]))
+        body_graph.outputs[first_loop_var_index:first_loop_var_index +
+                           num_flattened_outputs],
+        nest.flatten(
+            shape_invariants[first_loop_var_index:first_loop_var_index +
+                             len_orig_loop_vars]),
+        nest.flatten(loop_vars[first_loop_var_index:first_loop_var_index +
+                               len_orig_loop_vars]))
     flattened_loop_vars = nest.flatten(loop_vars)
     _check_num_inputs_outputs(cond_graph, body_graph,
                               len(flattened_loop_vars))
 
-    outputs = gen_functional_ops._while(
-        flattened_loop_vars,
-        util.create_new_tf_function(cond_graph),
-        util.create_new_tf_function(body_graph),
-        output_shapes=[t.shape for t in body_graph.outputs],
-        parallel_iterations=parallel_iterations,
-        name=scope)
+    with ops.control_dependencies(
+        list(cond_graph.control_captures) + list(body_graph.control_captures)):
+      outputs = gen_functional_ops._while(
+          flattened_loop_vars,
+          util.create_new_tf_function(cond_graph),
+          util.create_new_tf_function(body_graph),
+          output_shapes=[t.shape for t in body_graph.outputs],
+          parallel_iterations=parallel_iterations,
+          name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
     util.maybe_set_lowering_attr(outputs[0].op)
-    _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+    util.maybe_propagate_compile_time_consts_in_xla(outputs[0].op)
 
     # Return identities for each output of the While op, rather than the output
     # of the While op directly. This makes pruning work if the output of
@@ -232,9 +230,9 @@ def while_loop(cond,
     # identity op will end up with only the appropriate exit op as input.
     outputs = tuple(array_ops.identity(t) for t in outputs)
 
-  # First var is loop counter.
-  outputs = _pack_sequence_as(orig_loop_vars,
-                              outputs[1:1 + num_flattened_outputs])
+  outputs = _pack_sequence_as(
+      orig_loop_vars, outputs[first_loop_var_index:first_loop_var_index +
+                              num_flattened_outputs])
 
   if return_same_structure:
     return outputs
@@ -258,11 +256,8 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   body_graph = _get_graph(while_op, "body")
   orig_num_params = len(body_graph.outputs)
 
-  maximum_iterations = op.get_attr(
-      "_maximum_iterations") if _is_in_xla_context() else None
+  maximum_iterations = op.inputs[1]
   parallel_iterations = op.get_attr("parallel_iterations")
-  assert not _is_in_xla_context() or maximum_iterations is not None
-  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
 
   grads = [_preprocess_grad(grad, body_out, while_out)
            for grad, body_out, while_out
@@ -303,8 +298,13 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
                                            while_op)
   loop_vars = args + captured_inputs
 
-  def grad_cond(counter, max_iters, *unused_args):
-    return counter < max_iters
+  # This modifies body_grad_graph.
+  loop_vars = while_v2_indexed_slices_rewriter.rewrite_grad_indexed_slices(
+      grads, body_grad_graph, loop_vars, while_op.inputs)
+
+  def grad_cond(counter, unused_maximum_iterations_arg, forward_loop_iters,
+                *unused_args):
+    return counter < forward_loop_iters
 
   grad_cond_name = util.unique_grad_fn_name(op.get_attr("cond").name)
   cond_grad_graph = func_graph_module.func_graph_from_py_func(
@@ -320,26 +320,15 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       output_shapes=[t.shape for t in body_grad_graph.outputs],
       parallel_iterations=parallel_iterations,
       name="%s_grad" % while_op.name)
+  grad_op = outputs[0].op
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
-  util.maybe_set_lowering_attr(outputs[0].op)
-  _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+  util.maybe_set_lowering_attr(grad_op)
+  util.maybe_propagate_compile_time_consts_in_xla(grad_op)
 
   # See comment in while_loop.
   outputs = [array_ops.identity(t) for t in outputs]
-
-  # Set None as the output gradient for tensors with None input gradient.
-  # outputs[0] is the loop counter.
-  # outputs[1] is the total number of loop iterations.
-  index = 2
-  none_padded_outputs = []
-  for g in grads:
-    if g is None:
-      none_padded_outputs.append(None)
-    else:
-      none_padded_outputs.append(outputs[index])
-      index += 1
-  return none_padded_outputs
+  return _get_structured_grad_output(outputs, grads, body_grad_graph)
 
 
 def _preprocess_grad(grad, body_graph_output, while_op_output):
@@ -375,6 +364,8 @@ def _preprocess_grad(grad, body_graph_output, while_op_output):
   return grad
 
 
+# TODO(skyewm): make this return constants if op_output's shape is fully
+# defined (this can be done by checking the "shape" attr of resource vars).
 def _zeros_like(op_output):
   """Like array_ops.zeros_like() but also accepts resource var handles."""
   if op_output.dtype == dtypes.resource:
@@ -385,7 +376,7 @@ def _zeros_like(op_output):
 
 def _is_trainable(tensor):
   """Returns whether the given tensor is trainable."""
-  if not gradients_impl.IsTrainable(tensor):
+  if not gradients_util.IsTrainable(tensor):
     return False
 
   # Special case: untrainable accumulator output. The gradients algorithm
@@ -396,53 +387,11 @@ def _is_trainable(tensor):
   if tensor.op.type == "TensorListPopBack" and tensor.value_index == 0:
     assert tensor.dtype == dtypes.variant
     element_type = tensor.op.get_attr("element_dtype")
-    return gradients_impl.IsTrainable(element_type)
+    return gradients_util.IsTrainable(element_type)
 
   return True
 
 
-def _validate_and_convert_to_tensor(maximum_iterations):
-  """Checks that `maximum_iterations` is valid.
-
-  In XLA context, `maximum_iterations` is required and must be statically
-  inferable, e.g. output tensor of a Const node.
-
-  Args:
-    maximum_iterations: The maximum_iterations passed to while_loop.
-
-  Returns:
-    A scalar valued tensor of type int32 or None.
-
-  Raises:
-    ValueError: If `maximum_iterations` is invalid.
-  """
-  if maximum_iterations is None:
-    return None
-
-  if _is_in_xla_context() and isinstance(maximum_iterations, ops.Tensor):
-    # Get the constant value from the `maximum_iterations` tensor to avoid
-    # capturing a Const tensor from outside this graph.
-    value = tensor_util.constant_value(maximum_iterations)
-    if value is None:
-      # XLA requires maximum_iterations to be statically known (e.g. a
-      # constant value or known shape dimension) when intermediate values
-      # from the forward pass are needed in the gradients pass. However,
-      # maximum_iterations may not be required if the gradient isn't built
-      # or no intermediates are required, thus we return the tensor as is.
-      return maximum_iterations
-
-    maximum_iterations = value
-
-  # EmptyTensorList expects `max_num_elements` to be of type int32.
-  maximum_iterations = ops.convert_to_tensor(
-      maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
-  if maximum_iterations.shape.ndims != 0:
-    raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
-                     maximum_iterations.shape)
-
-  return maximum_iterations
-
-
 # TODO(srbs): Pull this into common utils for cond_v2 and while_v2.
 def _get_graph(while_op, func_attr_name):
   """Returns `FuncGraph` for the given function attribute.
@@ -472,7 +421,7 @@ def _get_graph(while_op, func_attr_name):
 
 
 def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
-                      max_iters):
+                      maximum_iterations):
   """Builds and returns the gradient FuncGraph of `func_graph` and its args.
 
   The returned grad_func_graph must be called with the returned
@@ -486,7 +435,7 @@ def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
     body_graph: FuncGraph for the forward body function.
     name: Name of the returned gradient function.
     while_op: The forward While op.
-    max_iters: the maximum number of iterations, or None if no limit.
+    maximum_iterations: Tensor. The maximum number of iterations.
 
   Returns:
     2-tuple of (grad_func_graph, args).
@@ -497,7 +446,7 @@ def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
   counter = constant_op.constant(
       0, dtype=total_iters.dtype, name="grad_counter")
 
-  args = [counter, total_iters] + list(grads)
+  args = [counter, maximum_iterations, total_iters] + list(grads)
   # Note: The returned function does not have `args` in the list of
   # `external_captures`.
   grad_func_graph = func_graph_module.func_graph_from_py_func(
@@ -505,19 +454,20 @@ def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
       lambda *args: _grad_fn(ys, xs, args, body_graph),
       args, {},
       func_graph=_WhileBodyGradFuncGraph(name, cond_graph, body_graph,
-                                         max_iters))
+                                         maximum_iterations))
 
   # Add the popped accumulators to the list of outputs.
   for internal_capture in grad_func_graph.internal_captures:
     if internal_capture in grad_func_graph.popped_tensor_lists:
-      grad_func_graph.outputs.append(
-          grad_func_graph.popped_tensor_lists[internal_capture])
+      new_output = grad_func_graph.popped_tensor_lists[internal_capture]
     elif internal_capture.dtype == dtypes.resource:
-      grad_func_graph.outputs.append(internal_capture)
+      new_output = internal_capture
     else:
       raise ValueError("Tensor %s is in list of internal_captures but is"
                        " neither a resource nor is in popped_tensor_lists." %
                        str(internal_capture))
+    grad_func_graph.outputs.append(new_output)
+    grad_func_graph.structured_outputs.append(new_output)
 
   return grad_func_graph, args
 
@@ -534,20 +484,21 @@ def _grad_fn(ys, xs, args, func_graph):
     args: The input arguments.
       args[0] - Loop counter
       args[1] - Total number of iterations.
-      args[2:] - Incoming gradients for `ys`.
+      args[2] - maximum_iterations.
+      args[3:] - Incoming gradients for `ys`.
     func_graph: function.FuncGraph. The corresponding forward-pass function.
 
   Returns:
     The output gradient Tensors.
   """
-  grad_ys = args[2:]
+  grad_ys = args[3:]
 
   # Build the gradient graph. Note that this builds the gradient computation of
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
   # after the forward While op has been rewritten in _resolve_grad_captures.
   # TODO(srbs): Mark GradientsHelper as public?
-  grad_outs = gradients_impl._GradientsHelper(
+  grad_outs = gradients_util._GradientsHelper(
       ys, xs, grad_ys=grad_ys, src_graph=func_graph,
       unconnected_gradients="zero")
 
@@ -555,8 +506,9 @@ def _grad_fn(ys, xs, args, func_graph):
   # is a tf.StopGradient in the loop body.
   assert all(g is not None for g in grad_outs)
   counter = args[0]
-  total_iters = args[1]
-  return [counter + 1, total_iters] + grad_outs
+  maximum_iterations = args[1]
+  total_iters = args[2]
+  return [counter + 1, maximum_iterations, total_iters] + grad_outs
 
 
 def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
@@ -600,6 +552,46 @@ def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
   return new_capture_inputs
 
 
+def _get_structured_grad_output(outputs, grads, body_grad_graph):
+  """Returns the values that should be returned from the while grad function.
+
+  Args:
+    outputs: the raw Tensor outputs of the grad While op.
+    grads: the input gradients to the gradient function.
+    body_grad_graph: _WhileBodyGradFuncGraph.
+
+  Returns:
+    A list of gradient values. May include Nones.
+  """
+  result = []
+  # outputs[0] is the loop counter.
+  # outputs[1] is maximum_iterations.
+  # outputs[2] is the total number of loop iterations.
+  outputs_idx = 3
+  structured_outputs_idx = 3
+  for g in grads:
+    # Set None as the output gradient for tensors with None input gradient.
+    if g is None:
+      result.append(None)
+      continue
+    output = body_grad_graph.structured_outputs[structured_outputs_idx]
+    structured_outputs_idx += 1
+    if isinstance(output, ops.IndexedSlices):
+      # TODO(skyewm): is there a more robust way to determine the order of
+      # flattened IndexedSlices components?
+      result.append(ops.IndexedSlices(
+          values=outputs[outputs_idx],
+          indices=outputs[outputs_idx + 1],
+          dense_shape=outputs[outputs_idx + 2]))
+      outputs_idx += 3
+    else:
+      assert isinstance(output, ops.Tensor)
+      result.append(outputs[outputs_idx])
+      outputs_idx += 1
+
+  return result
+
+
 def _get_accumulator(tensor):
   r"""Returns TensorList if any containing accumulated values of tensor.
 
@@ -698,7 +690,8 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
       values of this dict need to be added to the list of outputs.
   """
 
-  def __init__(self, name, forward_cond_graph, forward_body_graph, max_iters):
+  def __init__(self, name, forward_cond_graph, forward_body_graph,
+               maximum_iterations):
     super(_WhileBodyGradFuncGraph, self).__init__(name)
     self.empty_tensor_lists = []
     self.popped_tensor_lists = {}
@@ -706,7 +699,7 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     self._forward_graph = forward_body_graph
     # FuncGraph for the cond of the forward While op.
     self._forward_cond_graph = forward_cond_graph
-    self._maximum_iterations = max_iters
+    self._maximum_iterations = maximum_iterations
     # Dict from forward intermediate tensor to its indirectly captured tensor
     # in this graph. Indirect capturing happens in two ways:
     # 1. For non-resource tensors we capture their accumulators from the forward
@@ -741,9 +734,9 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     """
     if (not whitelisted and tensor.graph is not self and
         tensor.graph != self._forward_graph):
-      raise ValueError("Attempting to capture tensor", str(tensor),
-                       " which is not in the forward graph but in ",
-                       _graph_name(tensor.graph), ".")
+      raise ValueError("Attempting to capture tensor %s which is not in the "
+                       "forward graph but in %s." %
+                       (str(tensor), _graph_name(tensor.graph)))
     return super(_WhileBodyGradFuncGraph, self).capture(tensor, name)
 
   def _capture_helper(self, tensor, name):
@@ -816,27 +809,97 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
       Tensor in this graph.
     """
     assert tensor.dtype == dtypes.resource
-    if tensor in self._forward_graph.inputs:
-      index = self._forward_graph.inputs.index(tensor)
-    elif tensor.op.type == "While":
-      # Captured resources occur at the same index in the lists of inputs and
-      # outputs of a while op. So we lookup the input of `tensor.op` at the
-      # same index as the index of `tensor` in the `tensor.op.outputs`.
-      index = self._forward_graph.inputs.index(
-          tensor.op.inputs[tensor.value_index])
-    else:
-      raise ValueError(
-          "Taking gradient of a while loop which creates "
-          "a resource in its body is not supported: %s" % tensor)
-    # This must be a loop invariant.
-    assert self._forward_graph.inputs[index] == self._forward_graph.outputs[
-        index], ("Resource tensors must be loop invariants %s." %
-                 self._forward_graph._while.inputs[index])
+
+    index = self._resource_input_index(
+        tensor.name,
+        [t.name for t in self._forward_graph.inputs],
+        {op.name: op.node_def for op in self._forward_graph.get_operations()},
+        self._forward_graph._functions)
+
+    input_placeholder = self._forward_graph.inputs[index]
     tensor_in_outer_graph = self._forward_graph._while.inputs[index]
+
+    assert input_placeholder.dtype == dtypes.resource
+    assert tensor_in_outer_graph.dtype == dtypes.resource
+    # This must be a loop invariant.
+    assert input_placeholder == self._forward_graph.outputs[index], (
+        "Resource tensors must be loop invariants %s." %
+        tensor_in_outer_graph)
+
     self._indirect_captures[tensor] = self.capture(
         tensor_in_outer_graph, whitelisted=True)
     return self._indirect_captures[tensor]
 
+  def _resource_input_index(self, tensor_name, input_names, node_defs,
+                            functions):
+    """Returns the index of the input corresponding to `tensor_name`.
+
+    This method is used to find the corresponding index of an arbitrary resource
+    tensor in a function (the function could be a loop body). We assume that
+    resource handles are never created in functions, so that every resource
+    tensor can be traced back to a function input.
+
+    The awkward signature of this method is to make it work with both FuncGraphs
+    and FunctionDefs. This is so we can recurse on function call ops without
+    building the corresponding FuncGraph (note that even if a FuncGraph for a
+    FunctionDef already exists, the input/output/node names may have been
+    changed when the FuncGraph was serialized to the FunctionDef, which makes it
+    unusable with this algorithm).
+
+    Args:
+      tensor_name: the name of the resource tensor to be resolved to an input.
+      input_names: a list of the names of all inputs to the function.
+      node_defs: a dict mapping op name -> NodeDef for every op in the function.
+      functions: a dict mapping function name -> _EagerDefinedFunction.
+
+    Returns:
+      The index into input_names corresponding to `tensor_name`.
+    """
+    while tensor_name not in input_names:
+      # FunctionDefs and graphs use different tensor naming conventions.
+      parts = tensor_name.split(":")
+      if len(parts) == 3:
+        op_name, _, output_idx = parts
+      elif len(parts) == 2:
+        op_name, output_idx = parts
+      else:
+        assert len(parts) == 1
+        op_name = parts[0]
+        output_idx = 0
+      output_idx = int(output_idx)
+      node_def = node_defs[op_name]
+
+      if node_def.op == "While":
+        # Captured resources occur at the same index in the lists of inputs and
+        # outputs of a while op. So we lookup the input of `tensor.op` at the
+        # same index as the index of `tensor` in the `tensor.op.outputs`.
+        tensor_name = node_def.input[output_idx]
+      elif node_def.op in ("PartitionedCall", "StatefulPartitionedCall"):
+        # Functions output any captured resource tensors used by their
+        # gradients.  `tensor_name` is one of these outputs from a nested
+        # function call, so recursively find the corresponding input in the
+        # nested FunctionDef.
+        func_name = node_def.attr["f"].func.name
+        fdef = functions[func_name].definition
+        output_arg_name = fdef.signature.output_arg[output_idx].name
+        output_tensor_name = fdef.ret[output_arg_name]
+        input_index = self._resource_input_index(
+            output_tensor_name,
+            [arg.name for arg in fdef.signature.input_arg],
+            {ndef.name: ndef for ndef in fdef.node_def},
+            functions)
+        tensor_name = node_def.input[input_index]
+      else:
+        # We assume there are no other ops types that will "forward" resource
+        # handles like this, so all other handles must have been created by the
+        # op. (Note that cond_v2 wraps resource handle outputs in optionals,
+        # which we'll end up accumulating).
+        raise ValueError(
+            "Taking gradient of a while loop which creates "
+            "a resource in its body is not supported: %s" % op_name)
+
+    return input_names.index(tensor_name)
+
 
 def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
   for (t, shape, input_t) in zip(output_tensors, shape_invariants,
@@ -857,7 +920,7 @@ def _check_num_inputs_outputs(cond_graph, body_graph, num_flattened_loop_vars):
   assert len(cond_graph.outputs) == 1, (
       "cond_graph has %d outputs; Expected: 1" % len(cond_graph.outputs))
   assert len(body_graph.inputs) == num_flattened_loop_vars, (
-      "body_graph takes %d inputs; Expected: %d" % (len(cond_graph.inputs),
+      "body_graph takes %d inputs; Expected: %d" % (len(body_graph.inputs),
                                                     num_flattened_loop_vars))
   assert len(body_graph.outputs) == num_flattened_loop_vars, (
       "body_graph has %d outputs; Expected: %d" % (len(body_graph.outputs),
@@ -869,15 +932,6 @@ def _copy_handle_data(src_tensors, tgt_tensors):
     custom_gradient.copy_handle_data(src_t, tgt_t)
 
 
-def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
-  if maximum_iterations is not None and control_flow_util.IsInXLAContext(op):
-    # Store the maximum_iterations to use in the gradient pass.
-    op._set_attr(  # pylint: disable=protected-access
-        "_maximum_iterations",
-        attr_value_pb2.AttrValue(
-            i=tensor_util.constant_value(maximum_iterations)))
-
-
 # TODO(srbs): This method should be in control_flow_util but that introduces
 # a circular dependency ops -> control_flow_util -> ops.
 def _is_in_xla_context():
@@ -928,4 +982,13 @@ def _build_signature(loop_vars, shape_invariants):
   ])
 
 
+def _build_maximum_iterations_loop_var(maximum_iterations):
+  if maximum_iterations is None:
+    # Default value for max_num_elements to EmptyTensorList meaning that the
+    # list size is unbounded.
+    maximum_iterations = -1
+  # EmptyTensorList expects `max_num_elements` to be of type int32.
+  return ops.convert_to_tensor(
+      maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+
 # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py b/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9951cf74cc39ac2f8cdaef9309946cc7d7187ee5
--- /dev/null
+++ b/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py
@@ -0,0 +1,279 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Methods for rewriting while_v2 grad functions with IndexedSlices output."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.util import nest
+
+
+def rewrite_grad_indexed_slices(grads, body_grad_graph, loop_vars,
+                                forward_inputs):
+  """Handles special case of IndexedSlices returned from while gradient.
+
+  Some gradient functions return IndexedSlices instead of a Tensor (e.g. the
+  gradient of Gather ops). When this happens in the gradient of a while body,
+  the resulting gradient body function will have mismatched inputs and outputs,
+  since the input is a single Tensor, but the IndexedSlices gets unnested into
+  three output Tensors.
+
+  This function fixes this by rewriting the gradient body to have three inputs
+  to match the three outputs, i.e., it effectively converts the input Tensor
+  into an input IndexedSlices. It also returns new `loop_vars` to reflect the
+  new inputs.
+
+  Args:
+    grads: the input gradient Tensors to the while gradient computation.
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    loop_vars: list of Tensors. The inputs to body_grad_graph.
+    forward_inputs: list of Tensors. The (flat) inputs to the forward-pass
+      While op.
+
+  Returns:
+    The new loop_vars to pass to body_grad_graph.
+  """
+  # Match up body_grad_graph.structured_outputs with the corresponding
+  # forward_inputs.
+  #
+  # Note that we don't expect a gradient computation to have structured output
+  # (e.g. no nested lists), so no need to flatten
+  # body_grad_graph.structured_outputs. However, structured_outputs may still
+  # contain composite tensors such as IndexedSlices, unlike
+  # body_grad_graph.outputs, which contains flattened composite tensors.
+  inputs_with_grads = [t for g, t in zip(grads, forward_inputs)
+                       if g is not None]
+  # Skip loop counter, maximum_iterations and total number of loop iterations.
+  structured_outputs = body_grad_graph.structured_outputs[3:]
+
+  for forward_input, output in zip(inputs_with_grads, structured_outputs):
+    if not isinstance(output, ops.IndexedSlices): continue
+
+    if forward_input.dtype == dtypes.resource:
+      # TODO(skyewm): In theory we should use this for all captured inputs, not
+      # just resource handles (which can only be captured). We can do this by
+      # checking that forward_input is passed straight through to its output.
+      loop_vars = _rewrite_input_as_indexed_slices(body_grad_graph, output,
+                                                   forward_input, loop_vars)
+    else:
+      _rewrite_output_as_tensor(body_grad_graph, output)
+
+  return loop_vars
+
+
+def _rewrite_output_as_tensor(body_grad_graph, grad_output_slices):
+  """Rewrites grad_output_slices to be a Tensor output.
+
+  Args:
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    grad_output_slices: IndexedSlices output of body_grad_graph.
+  """
+  with body_grad_graph.as_default():
+    new_output = ops.convert_to_tensor_v2(grad_output_slices)
+
+  idx = body_grad_graph.structured_outputs.index(grad_output_slices)
+  body_grad_graph.structured_outputs[idx] = new_output
+  body_grad_graph.outputs = func_graph.flatten(
+      body_grad_graph.structured_outputs)
+
+
+def _rewrite_input_as_indexed_slices(body_grad_graph, grad_output_slices,
+                                     forward_input, loop_vars):
+  """Rewrites grad_output_slices's corresponding input to be an IndexedSlices.
+
+  This rewrite requires that forward_input was captured in the forward loop,
+  i.e. is not a user-specified loop variable. This is important because the
+  rewrite assumes that forward_input is passed through to its corresponding
+  output unchanged. This assumption is used in _rewrite_input_as_indexed_slices,
+  which depends on the exact gradient structure produced by the input's fanout.
+
+  This can yield a more efficient computation than using
+  _rewrite_output_as_tensor, since it preserves the IndexedSlices structure
+  instead of converting the IndexedSlices to a dense Tensor.
+
+  Args:
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    grad_output_slices: IndexedSlices output of body_grad_graph.
+    forward_input: the corresonding Tensor input to the forward loop.
+    loop_vars: list of Tensors. The inputs to body_grad_graph.
+
+  Returns:
+    The new loop_vars to pass to body_grad_graph.
+  """
+  # Create initial IndexedSlices that will be the input to the grad While
+  # op. This will start as zeros, and accumulate the IndexedSlices grad output.
+  # Note that because forward_input is captured and not a loop var, its incoming
+  # gradient should always be zero.
+  init_slices = _create_grad_indexed_slices_init(grad_output_slices,
+                                                 forward_input)
+
+  # Create a new version of grad_output_slices's gradient computation that uses
+  # the new IndexedSlices input instead of the original Tensor input. We'll
+  # return the new computation and leave the old computation as dead code.
+  # TODO(skyewm): considering pruning body_grad_graph to remove the old
+  # computation.
+  with body_grad_graph.as_default():
+    input_slices = ops.IndexedSlices(
+        values=body_grad_graph.capture(init_slices.values, whitelisted=True),
+        indices=body_grad_graph.capture(init_slices.indices, whitelisted=True),
+        dense_shape=body_grad_graph.capture(init_slices.dense_shape,
+                                            whitelisted=True))
+
+    # Remove the captured tensors from the function inputs. We'll add them back
+    # at the correct index in _update_indexed_slices_param.
+    for t in _flatten(init_slices):
+      captured_t = body_grad_graph.captures.pop(t)
+      body_grad_graph.inputs.remove(captured_t)
+
+    new_output_slices = _rewrite_grad_indexed_slices_output(grad_output_slices,
+                                                            input_slices)
+
+  # Update body_grad_graph's inputs and outputs to reflect the new
+  # IndexedSlices computation.
+  return _update_indexed_slices_param(
+      body_grad_graph, loop_vars, init_slices, input_slices, new_output_slices,
+      grad_output_slices)
+
+
+def _create_grad_indexed_slices_init(grad_output_slices, forward_input):
+  """Creates an IndexedSlices to pass as input to the while grad function.
+
+  Args:
+    grad_output_slices: IndexedSlices. The corresponding while grad function
+      output.
+    forward_input: Tensor. The corresonding input to the forward while op.
+
+  Returns:
+    Zeros IndexedSlices, created in current Graph.
+  """
+  assert isinstance(grad_output_slices, ops.IndexedSlices)
+  assert isinstance(forward_input, ops.Tensor)
+  values_out = grad_output_slices.values
+  indices_out = grad_output_slices.indices
+
+  # Create the initial values tensor.
+  if values_out.shape.is_fully_defined():
+    values_shape = tensor_shape.TensorShape([0] +
+                                            values_out.shape.as_list()[1:])
+    values = array_ops.zeros(values_shape, dtype=values_out.dtype,
+                             name="values_init")
+  else:
+    if forward_input.dtype == dtypes.resource:
+      forward_shape = gen_resource_variable_ops.variable_shape(forward_input)
+    else:
+      forward_shape = array_ops.shape(forward_input)
+    values_shape = array_ops.concat([[0], forward_shape[1:]], 0)
+    values = array_ops.zeros(values_shape, dtype=values_out.dtype,
+                             name="values_init")
+
+  # Create the initial indices tensor.
+  indices = constant_op.constant([], indices_out.dtype, name="indices_init")
+
+  # Create the initial dense_shape tensor. We assume is the same shape as
+  # forward_input, since captured tensors don't change shape across loop
+  # iterations.
+  if forward_input.dtype == dtypes.resource:
+    shape = gen_resource_variable_ops.variable_shape(forward_input,
+                                                     name="shape_init")
+  else:
+    shape = array_ops.shape(forward_input, name="shape_init")
+
+  return ops.IndexedSlices(values=values, indices=indices, dense_shape=shape)
+
+
+def _rewrite_grad_indexed_slices_output(old_output_slices, new_input_slices):
+  """Creates a new verson of old_output_slices with new_input_slices as input.
+
+  This method assumes that old_output_slices.{values,indices} are produced by
+  concatenating the incoming gradient Tensor input with the IndexedSlices
+  produced by the gradient computation of the while body. See
+  gradients_impl._AggregateIndexedSlicesGradients for where these concats are
+  constructed. We build new concats that use new_input_slices instead of the
+  original Tensor input.
+
+  Args:
+    old_output_slices: original IndexedSlices output of while gradient.
+    new_input_slices: new IndexedSlices to use as input to while gradient.
+
+  Returns:
+    A new IndexedSlices to replace old_output_slices.
+  """
+
+  def rewrite(old_output, new_input):
+    assert old_output.type == "Identity"
+    concat_op = old_output.inputs[0].op
+    assert concat_op.type == "ConcatV2"
+    # Don't include axis arg
+    old_concat_args = concat_op.inputs[:-1]
+    # We assume that the original gradient input was the first argument to the
+    # concat op.
+    # TODO(skyewm): do this in a more robust way.
+    return array_ops.concat([new_input] + old_concat_args[1:], 0)
+
+  values = rewrite(old_output_slices.values.op, new_input_slices.values)
+  indices = rewrite(old_output_slices.indices.op, new_input_slices.indices)
+  return ops.IndexedSlices(values=values, indices=indices,
+                           dense_shape=new_input_slices.dense_shape)
+
+
+def _update_indexed_slices_param(graph, loop_vars, init_slices, input_slices,
+                                 output_slices, old_output_slices):
+  """Updates graph with new IndexedSlices input/output.
+
+  Updates graph's metadata to output the gradient computation defined by
+  init_slices, input_slices, and output_slices, instead of outputting
+  old_output_slices. Also returns a new version of loop_vars with init_slices
+  replacing the old input.
+
+  Args:
+    graph: _WhileBodyGradFuncGraph.
+    loop_vars: the inputs to graph.
+    init_slices: the new IndexedSlices to use as input to graph.
+    input_slices: the new IndexedSlices in graph that should be fed by
+      init_slices.
+    output_slices: the new IndexedSlices in graph that should be the
+      corresonding output to input_slices.
+    old_output_slices: the IndexedSlices in graph that are currently
+      being output.
+
+  Returns:
+    New loop_vars to pass to graph.
+  """
+  structured_idx = graph.structured_outputs.index(old_output_slices)
+  # We assume that the component tensors of old_output_slices appear
+  # sequentially in graph.outputs. We use the first of these tensors
+  # as the reference index.
+  flat_idx = graph.outputs.index(func_graph.flatten(old_output_slices)[0])
+
+  graph.structured_outputs[structured_idx] = output_slices
+  graph.outputs = func_graph.flatten(
+      graph.structured_outputs)
+
+  graph.inputs = (graph.inputs[:flat_idx] + _flatten(input_slices) +
+                  graph.inputs[flat_idx + 1:])
+
+  return loop_vars[:flat_idx] + _flatten(init_slices) + loop_vars[flat_idx + 1:]
+
+
+def _flatten(arg):
+  return nest.flatten(arg, expand_composites=True)
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 7b917235c0a73421552b7aebaa3192de969e5f3a..303b70ff57e4eba5d1338e4ea30fbe5a0c8b652e 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -18,109 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import errno as _errno
 import sys as _sys
 
+from absl.app import run as _run
+
 from tensorflow.python.platform import flags
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _usage(shorthelp):
-  """Writes __main__'s docstring to stdout with some help text.
-
-  Args:
-    shorthelp: bool, if True, prints only flags from the main module,
-        rather than all flags.
-  """
-  doc = _sys.modules['__main__'].__doc__
-  if not doc:
-    doc = '\nUSAGE: %s [flags]\n' % _sys.argv[0]
-    doc = flags.text_wrap(doc, indent='       ', firstline_indent='')
-  else:
-    # Replace all '%s' with sys.argv[0], and all '%%' with '%'.
-    num_specifiers = doc.count('%') - 2 * doc.count('%%')
-    try:
-      doc %= (_sys.argv[0],) * num_specifiers
-    except (OverflowError, TypeError, ValueError):
-      # Just display the docstring as-is.
-      pass
-  if shorthelp:
-    flag_str = flags.FLAGS.main_module_help()
-  else:
-    flag_str = str(flags.FLAGS)
-  try:
-    _sys.stdout.write(doc)
-    if flag_str:
-      _sys.stdout.write('\nflags:\n')
-      _sys.stdout.write(flag_str)
-    _sys.stdout.write('\n')
-  except IOError as e:
-    # We avoid printing a huge backtrace if we get EPIPE, because
-    # "foo.par --help | less" is a frequent use case.
-    if e.errno != _errno.EPIPE:
-      raise
-
-
-class _HelpFlag(flags.BooleanFlag):
-  """Special boolean flag that displays usage and raises SystemExit."""
-  NAME = 'help'
-  SHORT_NAME = 'h'
-
-  def __init__(self):
-    super(_HelpFlag, self).__init__(
-        self.NAME, False, 'show this help', short_name=self.SHORT_NAME)
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=True)
-      print()
-      print('Try --helpfull to get a list of all flags.')
-      _sys.exit(1)
-
-
-class _HelpshortFlag(_HelpFlag):
-  """--helpshort is an alias for --help."""
-  NAME = 'helpshort'
-  SHORT_NAME = None
-
-
-class _HelpfullFlag(flags.BooleanFlag):
-  """Display help for flags in main module and all dependent modules."""
-
-  def __init__(self):
-    super(_HelpfullFlag, self).__init__('helpfull', False, 'show full help')
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=False)
-      _sys.exit(1)
-
-
-_define_help_flags_called = False
-
-
-def _define_help_flags():
-  global _define_help_flags_called
-  if not _define_help_flags_called:
-    flags.DEFINE_flag(_HelpFlag())
-    flags.DEFINE_flag(_HelpfullFlag())
-    flags.DEFINE_flag(_HelpshortFlag())
-    _define_help_flags_called = True
+def _parse_flags_tolerate_undef(argv):
+  """Parse args, returning any unknown flags (ABSL defaults to crashing)."""
+  return flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
 
 @tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
-  # Define help flags.
-  _define_help_flags()
-
-  # Parse known flags.
-  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
-
   main = main or _sys.modules['__main__'].main
 
-  # Call the main function, passing through any arguments
-  # to the final program.
-  _sys.exit(main(argv))
-
+  _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index d6773d7b8136f93080b122f52b77513305aecdb6..428505a7375b9c19064344378bae6df5bd6c7dd6 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -46,6 +46,10 @@ GLOBAL_BENCHMARK_REGISTRY = set()
 # See also tensorflow/core/util/reporter.h TestReporter::kTestReporterEnv.
 TEST_REPORTER_TEST_ENV = "TEST_REPORT_FILE_PREFIX"
 
+# Environment variable that lets the TensorFlow runtime allocate a new
+# threadpool for each benchmark.
+OVERRIDE_GLOBAL_THREADPOOL = "TF_OVERRIDE_GLOBAL_THREADPOOL"
+
 
 def _global_report_benchmark(
     name, iters=None, cpu_time=None, wall_time=None,
@@ -201,6 +205,12 @@ def benchmark_config():
 class TensorFlowBenchmark(Benchmark):
   """Abstract class that provides helpers for TensorFlow benchmarks."""
 
+  def __init__(self):
+    # Allow TensorFlow runtime to allocate a new threadpool with different
+    # number of threads for each new benchmark.
+    os.environ[OVERRIDE_GLOBAL_THREADPOOL] = "1"
+    super(TensorFlowBenchmark, self).__init__()
+
   @classmethod
   def is_abstract(cls):
     # mro: (_BenchmarkRegistrar, Benchmark, TensorFlowBenchmark) means
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index fe4b0d0d3767346f4300450f01d56a62e625cca4..f146d7517447828ace3c8420cec764ffde1b88a8 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -13,20 +13,19 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Imports unittest as a replacement for testing.pybase.googletest."""
+"""Imports absltest as a replacement for testing.pybase.googletest."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import atexit
-import itertools
 import os
 import sys
 import tempfile
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from unittest import *
+from absl.testing.absltest import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.framework import errors
@@ -41,7 +40,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 Benchmark = benchmark.TensorFlowBenchmark  # pylint: disable=invalid-name
 
-unittest_main = main
+absltest_main = main
 
 # We keep a global variable in this module to make sure we create the temporary
 # directory only once per test binary invocation.
@@ -51,43 +50,9 @@ _googletest_temp_dir = ''
 # pylint: disable=invalid-name
 # pylint: disable=undefined-variable
 def g_main(argv):
-  """Delegate to unittest.main after redefining testLoader."""
-  if 'TEST_SHARD_STATUS_FILE' in os.environ:
-    try:
-      f = None
-      try:
-        f = open(os.environ['TEST_SHARD_STATUS_FILE'], 'w')
-        f.write('')
-      except IOError:
-        sys.stderr.write('Error opening TEST_SHARD_STATUS_FILE (%s). Exiting.'
-                         % os.environ['TEST_SHARD_STATUS_FILE'])
-        sys.exit(1)
-    finally:
-      if f is not None: f.close()
-
-  if ('TEST_TOTAL_SHARDS' not in os.environ or
-      'TEST_SHARD_INDEX' not in os.environ):
-    return unittest_main(argv=argv)
-
-  total_shards = int(os.environ['TEST_TOTAL_SHARDS'])
-  shard_index = int(os.environ['TEST_SHARD_INDEX'])
-  base_loader = TestLoader()
-
-  delegate_get_names = base_loader.getTestCaseNames
-  bucket_iterator = itertools.cycle(range(total_shards))
-
-  def getShardedTestCaseNames(testCaseClass):
-    filtered_names = []
-    for testcase in sorted(delegate_get_names(testCaseClass)):
-      bucket = next(bucket_iterator)
-      if bucket == shard_index:
-        filtered_names.append(testcase)
-    return filtered_names
-
-  # Override getTestCaseNames
-  base_loader.getTestCaseNames = getShardedTestCaseNames
-
-  unittest_main(argv=argv, testLoader=base_loader)
+  """Delegate to absltest.main."""
+
+  absltest_main(argv=argv)
 
 
 # Redefine main to allow running benchmarks
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 0654104a3436366bb5fe88e2c3415cc957cbfde8..fcab57c12c95cd18fd5e32279a1a42b296a4d130 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -61,6 +61,7 @@ cuda_py_test(
         "no_pip",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -76,6 +77,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
@@ -130,6 +132,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 994206cd63a915de93bc109e7b217ad997c787a7..0a6ba12094b5e2d4374acbe0d23e8355c3b309c2 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -69,4 +69,5 @@ cuda_py_test(
     tags = [
         "no_pip",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index f96d721f46e162ee6753377569aacb439cd591d5..88392ff3f08a9c35d426236289563de1b131b0f4 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -50,7 +50,7 @@ def _extract_node(run_meta, node_name):
       dev = dev[dev.find('cpu:'):]
     elif dev.find('gpu:') > 0:
       dev = dev[dev.find('gpu:'):]
-    else:
+    elif '/host:cpu' not in dev:
       assert False, 'Unrecognized device name: %s' % dev
 
     for node_stat in dev_stat.node_stats:
@@ -111,6 +111,7 @@ def _run_loop_model():
 
 class RunMetadataTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGPU(self):
     if not test.is_gpu_available(cuda_only=True):
       return
@@ -126,6 +127,7 @@ class RunMetadataTest(test.TestCase):
     self.assertEqual(len(ret['gpu:0']), 1)
     self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
 
+  @test_util.run_deprecated_v1
   def testAllocationHistory(self):
     if not test.is_gpu_available(cuda_only=True):
       return
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index d0ba24485328282bdb13db3b173eb1426020084d..fe7a41afb421015a0b57335989a5aad441fbf91d 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -22,8 +22,11 @@ limitations under the License.
 %rename("%s") TFE_ContextListDevices;
 %rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
+%rename("%s") TFE_ContextHasFunction;
 %rename("%s") TFE_ContextEnableRunMetadata;
 %rename("%s") TFE_ContextDisableRunMetadata;
+%rename("%s") TFE_ContextEnableGraphCollection;
+%rename("%s") TFE_ContextDisableGraphCollection;
 %rename("%s") TFE_ContextExportRunMetadata;
 %rename("%s") TFE_ContextClearCaches;
 %rename("%s") TFE_ContextGetDevicePlacementPolicy;
@@ -33,8 +36,14 @@ limitations under the License.
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
 %rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_ProfilerIsOk;
 %rename("%s") TFE_DeleteProfiler;
 %rename("%s") TFE_ProfilerSerializeToString;
+%rename("%s") TFE_NewProfilerContext;
+%rename("%s") TFE_ProfilerContextSetEagerContext;
+%rename("%s") TFE_DeleteProfilerContext;
+%rename("%s") TFE_StartProfilerServer;
+%rename("%s") TFE_ProfilerClientStartTracing;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_SetEagerTensorProfiler;
@@ -67,6 +76,7 @@ limitations under the License.
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 %rename("%s") TFE_Py_TensorShapeOnDevice;
+%rename("%s") TFE_Py_EnableInteractivePythonLogging;
 %rename("%s") TFE_ContextStartStep;
 %rename("%s") TFE_ContextEndStep;
 %rename("%s") TFE_Py_RegisterVSpace;
@@ -140,6 +150,34 @@ limitations under the License.
   $1 = const_cast<char*>(TFE_GetPythonString($input));
 }
 
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* name {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* service_addr {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* logdir {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* worker_list {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
 %typemap(in) (TFE_Context*) {
   $1 = (TFE_Context*)PyCapsule_GetPointer($input, nullptr);
 
@@ -176,6 +214,25 @@ limitations under the License.
       }
       if (EagerTensor_CheckExact(elem)) {
         (*$1)[i] = EagerTensor_Handle(elem);
+      } else if (tensorflow::swig::IsTensor(elem)) {
+        // If it isnt an EagerTensor, but is still a Tensor, it must be a graph
+        // tensor.
+        SWIG_exception_fail(
+            SWIG_TypeError,
+            tensorflow::strings::StrCat(
+                "An op outside of the function building code is being passed\n"
+                "a \"Graph\" tensor. It is possible to have Graph tensors\n"
+                "leak out of the function building context by including a\n"
+                "tf.init_scope in your function building code.\n"
+                "For example, the following function will fail:\n",
+                "  @tf.function\n",
+                "  def has_init_scope():\n",
+                "    my_constant = tf.constant(1.)\n",
+                "    with tf.init_scope():\n",
+                "      added = my_constant * 2\n",
+                "The graph tensor has name: ",
+                TFE_GetPythonString(PyObject_GetAttrString(elem, "name")))
+                .c_str());
       } else {
         SWIG_exception_fail(
             SWIG_TypeError,
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index ebbefd7cde7a31931cf2cd33147c88489e77bda2..0b5c22c7a53f6287b3bb3c54412f5e811bb87af1 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -261,6 +261,24 @@ tf_py_test(
     ],
 )
 
+py_library(
+    name = "signature_serialization",
+    srcs = [
+        "signature_serialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":revived_types",
+        ":signature_constants",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/tracking:base",
+    ],
+)
+
 py_library(
     name = "save",
     srcs = [
@@ -271,10 +289,11 @@ py_library(
         ":builder",
         ":constants",
         ":function_serialization",
+        ":nested_structure_coder",
         ":revived_types",
-        ":saved_object_graph_py",
         ":signature_constants",
         ":signature_def_utils",
+        ":signature_serialization",
         ":tag_constants",
         ":utils",
         "//tensorflow/core:protos_all_py",
@@ -286,14 +305,16 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_spec",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:tracking",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:object_identity",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -320,14 +341,40 @@ py_library(
     deps = [
         ":constants",
         ":function_deserialization",
+        ":load_v1_in_v2",
         ":loader",
+        ":nested_structure_coder",
         ":revived_types",
-        ":saved_object_graph_py",
         ":utils",
-        "//tensorflow/python:function",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:util",
+    ],
+)
+
+py_library(
+    name = "load_v1_in_v2",
+    srcs = [
+        "load_v1_in_v2.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":signature_serialization",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:saver",
+        "//tensorflow/python/eager:wrap_function",
+        "//tensorflow/python/training/tracking",
     ],
 )
 
@@ -337,13 +384,42 @@ tf_py_test(
     additional_deps = [
         ":load",
         ":save",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:lib",
         "//tensorflow/python:tensor_spec",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/module",
+        "//tensorflow/python/training/tracking:tracking",
+    ],
+    tags = ["no_mac"],  # TODO(b/124822121): Re-enable this test.
+)
+
+tf_py_test(
+    name = "load_v1_in_v2_test",
+    srcs = ["load_v1_in_v2_test.py"],
+    additional_deps = [
+        ":builder",
+        ":load",
+        ":save",
+        ":signature_def_utils",
+        ":simple_save",
+        ":utils",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/tracking:tracking",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -354,7 +430,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -363,7 +439,7 @@ tf_py_test(
     srcs = ["revived_types_test.py"],
     additional_deps = [
         ":revived_types",
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -376,7 +452,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":nested_structure_coder",
-        ":saved_object_graph_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:function",
     ],
@@ -394,27 +470,11 @@ py_library(
     ],
 )
 
-tf_proto_library(
-    name = "struct",
-    srcs = ["struct.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos(),
-    visibility = ["//tensorflow:internal"],
-)
-
-tf_proto_library(
-    name = "saved_object_graph",
-    srcs = ["saved_object_graph.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos() + [":struct"],
-    visibility = ["//tensorflow:internal"],
-)
-
 py_library(
     name = "nested_structure_coder",
     srcs = ["nested_structure_coder.py"],
     deps = [
-        ":struct_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "@six_archive//:six",
     ],
@@ -425,7 +485,7 @@ tf_py_test(
     srcs = ["nested_structure_coder_test.py"],
     additional_deps = [
         ":nested_structure_coder",
-        ":struct_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python/eager:test",
     ],
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74..37af428dcb97d77f85e0555edcc1ca959a479943 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -95,11 +95,13 @@ class _SavedModelBuilder(object):
 
     self._export_dir = export_dir
     if file_io.file_exists(export_dir):
-      raise AssertionError(
-          "Export directory already exists. Please specify a different export "
-          "directory: %s" % export_dir)
-
-    file_io.recursive_create_dir(self._export_dir)
+      if file_io.list_directory(export_dir):
+        raise AssertionError(
+            "Export directory already exists, and isn't empty. Please choose "
+            "a different export directory, or delete all the contents of the "
+            "specified directory: %s" % export_dir)
+    else:
+      file_io.recursive_create_dir(self._export_dir)
 
     # Boolean to track whether variables and assets corresponding to the
     # SavedModel have been saved. Specifically, the first meta graph to be added
diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py
index ec5214824ec9c04902919290719b67eb2951aef9..5924b2e0e16a4e871c49ab11724fce258d16e386 100644
--- a/tensorflow/python/saved_model/function_deserialization.py
+++ b/tensorflow/python/saved_model/function_deserialization.py
@@ -24,45 +24,98 @@ import re
 from tensorflow.core.framework import function_pb2
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as function_lib
+from tensorflow.python.framework import func_graph as func_graph_lib
 from tensorflow.python.framework import function_def_to_graph as function_def_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 def _is_tensor(t):
   return isinstance(t, (ops.Tensor, resource_variable_ops.ResourceVariable))
 
 
-def _inputs_compatible(args, stored_inputs):
-  """Checks whether function arguments are compatible with parameters."""
-  if len(args) != len(stored_inputs):
-    return False
+def _call_concrete_function(function, inputs):
+  """Calls a restored Function with structured inputs.
 
-  for arg, stored_input in zip(args, stored_inputs):
-    if not function_lib.is_same_structure(arg, stored_input):
-      return False
+  This differs from `function.__call__` in that inputs and outputs are
+  structured and that it casts inputs to tensors if needed.
 
-    flattened_arg = nest.flatten(arg)
-    flattened_stored_input = nest.flatten(stored_input)
+  Note: this does not checks that non-tensor inputs match. That should be
+  done before via `_concrete_function_callable_with`.
 
-    for a, b in zip(flattened_arg, flattened_stored_input):
-      if _is_tensor(a):
-        if not isinstance(b, tensor_spec.TensorSpec):
-          return False
-        if a.dtype != b.dtype or not b.shape.is_compatible_with(a.shape):
-          return False
-      else:
-        if a != b:
-          return False
+  Args:
+    function: ConcreteFunction to call.
+    inputs: Structured inputs compatible with
+        `function.graph.structured_input_signature`.
+
+  Returns:
+    The structured function output.
+  """
+  expected_structure = function.graph.structured_input_signature
+  flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  tensor_inputs = []
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      tensor_inputs.append(
+          ops.convert_to_tensor(arg, dtype_hint=expected.dtype))
+  result = function._call_flat(tensor_inputs)  # pylint: disable=protected-access
+  if isinstance(result, ops.Operation):
+    return None
+  return result
+
+
+def _try_convert_to_tensor_spec(arg, dtype_hint):
+  """Returns None or TensorSpec obtained if `arg` is converted to tensor."""
+  try:
+    # Note: try conversion in a FuncGraph to avoid poluting current context.
+    with func_graph_lib.FuncGraph(name="guess_conversion").as_default():
+      result = ops.convert_to_tensor(arg, dtype_hint=dtype_hint)
+      return tensor_spec.TensorSpec(shape=result.shape, dtype=result.dtype)
+  except (TypeError, ValueError):
+    return None
+
+
+def _concrete_function_callable_with(function, inputs, allow_conversion):
+  """Returns whether concrete `function` can be called with `inputs`."""
+  expected_structure = function.graph.structured_input_signature
+  try:
+    flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  except (TypeError, ValueError):
+    return False
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      if allow_conversion:
+        arg = _try_convert_to_tensor_spec(arg, dtype_hint=expected.dtype)
+      if not _is_tensor(arg) and not isinstance(arg, tensor_spec.TensorSpec):
+        return False
+      if arg.dtype != expected.dtype:
+        return False
+      if not expected.shape.is_compatible_with(arg.shape):
+        return False
+    else:
+      if arg != expected:
+        return False
   return True
 
 
 def _deserialize_function_spec(function_spec_proto, coder):
   """Deserialize a FunctionSpec object from its proto representation."""
-  fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  typeless_fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  fullargspec = tf_inspect.FullArgSpec(
+      args=typeless_fullargspec.args,
+      varargs=typeless_fullargspec.varargs,
+      varkw=typeless_fullargspec.varkw,
+      defaults=typeless_fullargspec.defaults,
+      kwonlyargs=typeless_fullargspec.kwonlyargs,
+      kwonlydefaults=typeless_fullargspec.kwonlydefaults,
+      annotations=typeless_fullargspec.annotations)
   is_method = function_spec_proto.is_method
   args_to_prepend = coder.decode_proto(function_spec_proto.args_to_prepend)
   kwargs_to_include = coder.decode_proto(function_spec_proto.kwargs_to_include)
@@ -71,30 +124,44 @@ def _deserialize_function_spec(function_spec_proto, coder):
                                    kwargs_to_include, input_signature)
 
 
-def recreate_concrete_function(saved_concrete_function, concrete_functions):
-  """Recreates a user-facing concrete function."""
-  coder = nested_structure_coder.StructureCoder()
-
-  concrete_function = concrete_functions[saved_concrete_function.name]
-  input_signature = coder.decode_proto(
-      saved_concrete_function.canonicalized_input_signature)
-  input_signature_args, input_signature_kwargs = input_signature
-  if input_signature_kwargs:
-    raise ValueError("Restoring concrete function with non-empty kwargs (%s)." %
-                     input_signature_kwargs)
-
+# TODO(allenl): The fact that we can't derive ConcreteFunction calling
+# conventions from the serialized input spec right now is unfortunate. Merging
+# these would be good, maybe by adding TensorSpec names to cache keys so renamed
+# keyword arguments would yield different ConcreteFunctions.
+def setup_bare_concrete_function(saved_bare_concrete_function,
+                                 concrete_functions):
+  """Makes a restored bare concrete function callable."""
+  # Bare concrete functions accept only flat lists of Tensors with unique
+  # names.
+  concrete_function = concrete_functions[
+      saved_bare_concrete_function.concrete_function_name]
   # pylint: disable=protected-access
-  # Set metadata required for the concrete function to accept keyword and
-  # positional arguments in __call__. Normally this is set in
-  # get_concrete_function.
-  concrete_function._arg_keywords = [spec.name for spec in input_signature_args]
-  # TODO(allenl): Should we preserve the number of allowed positional arguments?
-  concrete_function._num_positional_args = len(input_signature_args)
+  concrete_function._arg_keywords = (
+      saved_bare_concrete_function.argument_keywords)
+  concrete_function._num_positional_args = (
+      saved_bare_concrete_function.allowed_positional_arguments)
   # pylint: enable=protected-access
   concrete_function.add_to_graph()
   return concrete_function
 
 
+class RestoredFunction(def_function.Function):
+  """Wrapper class for a function that has been restored from saved state.
+
+  See `def_function.Function`.
+  """
+
+  def __init__(self, python_function, name, function_spec, concrete_functions):
+    # TODO(mdan): We may enable autograph once exceptions are supported.
+    super(RestoredFunction, self).__init__(
+        python_function, name, autograph=False)
+    self._concrete_functions = concrete_functions
+    self._function_spec = function_spec
+
+  def _list_all_concrete_functions_for_serialization(self):
+    return self._concrete_functions
+
+
 def recreate_function(saved_function, concrete_functions):
   """Creates a `Function` from a `SavedFunction`.
 
@@ -114,34 +181,44 @@ def recreate_function(saved_function, concrete_functions):
   function_spec = _deserialize_function_spec(saved_function.function_spec,
                                              coder)
 
-  # TODO(mdan): We may enable autograph once exceptions are supported.
-  @def_function.function(autograph=False)
-  def restored_function(*args, **kwargs):
+  def restored_function_body(*args, **kwargs):
     """Calls a restored function."""
-    # TODO(allenl): Functions saved with input_signatures should revive with
-    # input_signatures.
-    for concrete_function in saved_function.concrete_function:
-      function_obj = concrete_functions[concrete_function.name]
-      canonicalized_original_inputs = coder.decode_proto(
-          concrete_function.canonicalized_input_signature)
-
-      try:
-        canonicalized_inputs = function_spec.canonicalize_function_inputs(
-            *args, **kwargs)
-      except ValueError:
-        continue
-
-      if _inputs_compatible(canonicalized_inputs,
-                            canonicalized_original_inputs):
-        flattened_inputs = nest.flatten(canonicalized_inputs)
-        filtered_inputs = [t for t in flattened_inputs if _is_tensor(t)]
-        return function_obj._call_flat(filtered_inputs)  # pylint: disable=protected-access
-
-    raise AssertionError(
-        "Could not find matching function to call for args %r and kwargs %r" %
-        (args, kwargs))
-
-  return restored_function
+    # This is the format of function.graph.structured_input_signature. At this
+    # point, the args and kwargs have already been canonicalized.
+    inputs = (args, kwargs)
+
+    # First try to find a concrete function that can be called without input
+    # conversions. This allows one to pick a more specific trace in case there
+    # was also a more expensive one that supported tensors.
+    for allow_conversion in [False, True]:
+      for function_name in saved_function.concrete_functions:
+        function = concrete_functions[function_name]
+        if _concrete_function_callable_with(function, inputs, allow_conversion):
+          return _call_concrete_function(function, inputs)
+
+    available_signatures = [
+        concrete_functions[function_name].graph.structured_input_signature
+        for function_name in saved_function.concrete_functions
+    ]
+    raise ValueError(
+        "Could not find matching function to call for inputs %r. "
+        "Only existing signatures are %r."
+        % (inputs, available_signatures))
+
+  concrete_function_objects = []
+  for concrete_function_name in saved_function.concrete_functions:
+    concrete_function_objects.append(concrete_functions[concrete_function_name])
+
+  restored_function = RestoredFunction(
+      restored_function_body,
+      restored_function_body.__name__,
+      function_spec,
+      concrete_function_objects)
+
+  return tf_decorator.make_decorator(
+      restored_function_body,
+      restored_function,
+      decorator_argspec=function_spec.fullargspec)
 
 
 def load_function_def_library(library):
@@ -160,24 +237,24 @@ def load_function_def_library(library):
   Raises:
     ValueError: if functions dependencies have a cycle.
   """
-  # TODO(andresp): Look into restoring gradient function information.
   functions = {}
-  name_mapping = {}
-  # Note: Use a new graph to allow function_def_to_graph to help validating
-  # that the functions are loaded correctly. This is not possible to do
-  # just in eager mode as there is no python API to find if a function has
-  # been registered in eager. Note also that despite this the created
-  # func_graphs can still be used in eager or in other graphs.
-  with ops.Graph().as_default() as import_graph:
-    for fdef in _sort_function_defs(library):
-      copy = _fix_fdef(fdef, name_mapping)
-
-      func_graph = function_def_lib.function_def_to_graph(copy)
-      func = function_lib.ConcreteFunction(func_graph)
-      func.add_to_graph(import_graph)
-
-      name_mapping[fdef.signature.name] = func.name
-      functions[fdef.signature.name] = func
+
+  load_shared_name_suffix = "_load_{}".format(ops.uid())
+  for fdef in _sort_function_defs(library):
+    copy = _fix_fdef(fdef, functions, load_shared_name_suffix)
+
+    func_graph = function_def_lib.function_def_to_graph(copy)
+    for dep in _list_function_deps(fdef):
+      functions[dep].add_to_graph(func_graph)
+    func = function_lib.ConcreteFunction(func_graph)
+    func.add_to_graph()
+
+    functions[fdef.signature.name] = func
+
+    # Also register the gradients in the current root context.
+    with ops.init_scope():
+      func._register_gradient()  # pylint: disable=protected-access
+
   return functions
 
 
@@ -206,8 +283,7 @@ def _sort_function_defs(library):
         ready.append(dest)
 
   if len(output) != len(library.function):
-    loaded = set([x.signature.name for x in output])
-    failed_to_resolve = sorted(set(in_count.keys()) - loaded)
+    failed_to_resolve = sorted(set(in_count.keys()) - set(output))
     raise ValueError("There is a cyclic-dependency between functions. ",
                      "Could not resolve %r." % (failed_to_resolve,))
 
@@ -215,14 +291,49 @@ def _sort_function_defs(library):
   return [reverse[x] for x in output]
 
 
-def _fix_fdef(orig_fdef, name_map):
+def _fix_fdef(orig_fdef, functions, shared_name_suffix):
+  """Fixes a FunctionDef proto to be loaded in current context.
+
+  In particular, when loading a function library into an eager context, one
+  must rename the functions to avoid conflicts with existent functions.
+
+  Args:
+    orig_fdef: FunctionDef proto to fix. It is not modified.
+    functions: map from function name to a ConcreteFunction instance.
+    shared_name_suffix: A unique string for this load which helps to avoid
+      `shared_name` collisions across loads. Two functions from the same load
+      using the same `shared_name` still need to share, but functions from
+      different loads with the same `shared_name` should not.
+
+  Returns:
+    A fixed copy of the original FunctionDef.
+  """
   fdef = function_pb2.FunctionDef()
   fdef.CopyFrom(orig_fdef)
-  fdef.signature.name = _clean_function_name(fdef.signature.name)
   for node_def in fdef.node_def:
+    if "_gradient_op_type" in node_def.attr:
+      if node_def.op in ["StatefulPartitionedCall", "PartitionedCall"]:
+        # TODO(andresp): This code assumes that the gradient registered for this
+        # function call is the default gradient for the function and not a
+        # custom one.
+        fname = node_def.attr["f"].func.name
+        node_def.attr["_gradient_op_type"].s = compat.as_bytes(
+            functions[fname]._gradient_name)  # pylint: disable=protected-access
+      else:
+        logging.warning("Importing a function (%s) with ops with custom "
+                        "gradients. Will likely fail if a gradient is "
+                        "requested.", fdef.signature.name)
     for _, attr_value in node_def.attr.items():
       if attr_value.func.name:
-        attr_value.func.name = name_map[attr_value.func.name]
+        attr_value.func.name = functions[attr_value.func.name].name
+
+    # TODO(b/124205571): Avoid accidental sharing and destruction of restored
+    # resources. For now uniquify "shared_name" when loading functions to avoid
+    # sharing.
+    if "shared_name" in node_def.attr:
+      node_def.attr["shared_name"].s += compat.as_bytes(shared_name_suffix)
+
+  fdef.signature.name = _clean_function_name(fdef.signature.name)
   return fdef
 
 
diff --git a/tensorflow/python/saved_model/function_serialization.py b/tensorflow/python/saved_model/function_serialization.py
index 172ab41dc8c885e51ca3ef7ad60049163d2ccefb..e876eef8b349ac17a42cb284a861784b4d941998 100644
--- a/tensorflow/python/saved_model/function_serialization.py
+++ b/tensorflow/python/saved_model/function_serialization.py
@@ -18,10 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.framework import func_graph as func_graph_module
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import nested_structure_coder
-from tensorflow.python.saved_model import saved_object_graph_pb2
 
 
 def _serialize_function_spec(function_spec, coder):
@@ -38,33 +37,43 @@ def _serialize_function_spec(function_spec, coder):
   return proto
 
 
-def serialize_concrete_function(concrete_function, node_ids):
+def serialize_concrete_function(concrete_function, node_ids, coder):
   """Build a SavedConcreteFunction."""
   bound_inputs = []
   try:
     for capture in concrete_function.captured_inputs:
       bound_inputs.append(node_ids[capture])
   except KeyError:
-    # TODO(andresp): Would it better to throw an exception?
-    logging.warning(
-        "Concrete function %s not added to object based saved model as it "
-        "captures tensor %s which is unsupported or not reachable from root.",
-        concrete_function.name, capture)
-    return None
-  coder = nested_structure_coder.StructureCoder()
+    raise KeyError(
+        "Failed to add concrete function %s to object based saved model as it "
+        "captures tensor %s which is unsupported or not reachable from root. "
+        "One reason could be that a stateful object or a variable that the "
+        "function depends on is not assigned to an attribute of the serialized "
+        "trackable object "
+        "(see SaveTest.test_captures_unreachable_variable)."
+        % (concrete_function.name, capture))
   concrete_function_proto = saved_object_graph_pb2.SavedConcreteFunction()
-  concrete_function_proto.name = concrete_function.name
-  concrete_function_proto.canonicalized_input_signature.CopyFrom(
-      coder.encode_structure(concrete_function.structured_input_signature))
   structured_outputs = func_graph_module.convert_structure_to_signature(
       concrete_function.structured_outputs)
+  concrete_function_proto.canonicalized_input_signature.CopyFrom(
+      coder.encode_structure(concrete_function.structured_input_signature))
   concrete_function_proto.output_signature.CopyFrom(
       coder.encode_structure(structured_outputs))
   concrete_function_proto.bound_inputs.extend(bound_inputs)
   return concrete_function_proto
 
 
-def serialize_function(function, node_ids):
+def serialize_bare_concrete_function(concrete_function):
+  """Build a SavedBareConcreteFunction."""
+  # pylint: disable=protected-access
+  return saved_object_graph_pb2.SavedBareConcreteFunction(
+      concrete_function_name=concrete_function.name,
+      allowed_positional_arguments=concrete_function._num_positional_args,
+      argument_keywords=concrete_function._arg_keywords)
+  # pylint: enable=protected-access
+
+
+def serialize_function(function):
   """Build a SavedFunction proto."""
   coder = nested_structure_coder.StructureCoder()
   proto = saved_object_graph_pb2.SavedFunction()
@@ -74,8 +83,5 @@ def serialize_function(function, node_ids):
   all_concrete_functions = \
       function._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
   for concrete_function in all_concrete_functions:
-    concrete_function_proto = serialize_concrete_function(
-        concrete_function, node_ids)
-    if concrete_function_proto is not None:
-      proto.concrete_function.add().CopyFrom(concrete_function_proto)
+    proto.concrete_functions.append(concrete_function.name)
   return proto
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index 2aa4924b2911b9a8824ca8a88461007c2b0f7e85..326f54c719433c9a56de28c1b7f05c4a6fbb1ab7 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Import a checkpointable object from a SavedModel."""
+"""Import a trackable object from a SavedModel."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,20 +21,24 @@ from __future__ import print_function
 import functools
 import os
 
-from tensorflow.python.lib.io import file_io
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_deserialization
+from tensorflow.python.saved_model import load_v1_in_v2
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
-from tensorflow.python.util import compat
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Loader(object):
@@ -43,90 +47,131 @@ class _Loader(object):
   def __init__(self, object_graph_proto, saved_model_proto, export_dir):
     meta_graph = saved_model_proto.meta_graphs[0]
     self._asset_file_def = meta_graph.asset_file_def
+    self._operation_attributes = {
+        node.name: node.attr for node in meta_graph.graph_def.node}
     self._proto = object_graph_proto
     self._export_dir = export_dir
     self._concrete_functions = (
         function_deserialization.load_function_def_library(
             meta_graph.graph_def.library))
     self._load_all()
-    self._setup_functions()
+    # TODO(b/124045874): There are limitations with functions whose captures
+    # trigger other functions to be executed. For now it is only guaranteed to
+    # work if the captures of a function only trigger functions without
+    # captures.
+    self._setup_functions_structures()
+    self._setup_functions_captures()
     self._restore_checkpoint()
 
-  def _setup_concrete_function(
-      self, concrete_function, seen_functions, coder):
-    """Setup captured tensors and outputs for a single concrete function."""
-    name = concrete_function.name
-    bound_inputs = [
-        self._get_tensor_from_node(node_id)
-        for node_id in concrete_function.bound_inputs]
-    bound_variables = [
-        self._nodes[node_id]
-        for node_id in concrete_function.bound_inputs
-        if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
-    ]
-    if name in seen_functions:
-      raise RuntimeError(
-          "Concrete function with a duplicate name: %s." % name)
-    else:
-      seen_functions.add(name)
-      # TODO(andresp): This is only injecting the captured inputs into the
-      # concrete function, note that we did not modify the FuncGraph
-      # itself.
-      function = self._concrete_functions[name]
-      function._captured_inputs = bound_inputs  # pylint: disable=protected-access
-      function._func_graph.variables = bound_variables  # pylint: disable=protected-access
+    for node in self._nodes:
+      if isinstance(node, tracking.TrackableResource):
+        init_op = node._initialize()  # pylint: disable=protected-access
+        ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+
+  def _setup_functions_structures(self):
+    """Setup structure for inputs and outputs of restored functions."""
+    coder = nested_structure_coder.StructureCoder()
+    for name, proto in sorted(self._proto.concrete_functions.items()):
+      concrete_function = self._concrete_functions[name]
       # By setting the structured_outputs directly, we can rely on this
       # function_lib.ConcreteFunction object to perform the output repacking
       # logic. The only limitation of that logic is that it only works
       # with output that is convertible to Tensors and the conversion
       # always happens. For example tf.TensorShape([2, 3]) will be
       # converted to Tensor representing [2, 3].
-      original_outputs = coder.decode_proto(
-          concrete_function.output_signature)
+      original_outputs = coder.decode_proto(proto.output_signature)
       # The original_outputs here had Tensors converted to TensorSpecs, so
       # the restored function's structured_outputs field will not be
       # exactly the same. Fortunately the repacking logic cares only about
       # the structure.
       # TODO(vbardiovsky): Should we just replicate the structures, with
-      # Nones instead of real objects? Decide when we start solving
-      # idempotency.
-      function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
-
-  def _setup_functions(self):
+      # Nones instead of real objects?
+      concrete_function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
+      concrete_function._func_graph.structured_input_signature = (  # pylint: disable=protected-access
+          coder.decode_proto(proto.canonicalized_input_signature))
 
-    """Setup captures and output structure in restored concrete functions."""
-    seen_concrete_functions = set()
-    coder = nested_structure_coder.StructureCoder()
-    for object_proto in self._proto.nodes:
-      if object_proto.WhichOneof("kind") == "concrete_function":
-        self._setup_concrete_function(
-            object_proto.concrete_function, seen_concrete_functions, coder)
-      elif object_proto.WhichOneof("kind") == "function":
-        for concrete_function in object_proto.function.concrete_function:
-          self._setup_concrete_function(
-              concrete_function, seen_concrete_functions, coder)
+  def _setup_functions_captures(self):
+    """Setup captures and variables in restored functions."""
+    concrete_functions = sorted(self._proto.concrete_functions.items())
+    for name, proto in concrete_functions:
+      concrete_function = self._concrete_functions[name]
+      bound_inputs = [
+          self._get_tensor_from_node(node_id)
+          for node_id in proto.bound_inputs]
+      bound_variables = [
+          self._nodes[node_id]
+          for node_id in proto.bound_inputs
+          if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+      ]
+      # TODO(andresp): This is only injecting the captured inputs into the
+      # concrete function, note that we did not modify the FuncGraph
+      # itself.
+      concrete_function._captured_inputs = bound_inputs  # pylint: disable=protected-access
+      concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
 
   def _get_tensor_from_node(self, node_id):
-    obj = self._nodes[node_id]
-    if resource_variable_ops.is_resource_variable(obj):
-      return obj.handle
-    elif isinstance(obj, tracking.TrackableAsset):
-      return obj.asset_path.handle
-    raise ValueError("Can't convert node %s to tensor" % (type(obj)))
+    """Resolves a node id into a tensor to be captured for a function."""
+    with ops.init_scope():
+      obj = self._nodes[node_id]
+      if resource_variable_ops.is_resource_variable(obj):
+        return obj.handle
+      elif isinstance(obj, tracking.TrackableAsset):
+        return obj.asset_path
+      elif tensor_util.is_tensor(obj):
+        return obj
+      elif isinstance(obj, tracking.TrackableResource):
+        # Note: this executes restored functions in the TrackableResource.
+        return obj.resource_handle
+      raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
   def _load_all(self):
     """Load all saved objects and wire their properties."""
-    self._nodes = []
-    node_setters = []
+    # Maps from node ids to recreated objects
+    nodes = {}
+    # Maps from node ids to setter functions (same signature as setattr) for
+    # setting dependencies.
+    node_setters = {}
+
+    # Figure out which objects are slot variables. These objects are created
+    # with Optimizer.add_slot rather than _recreate_variable.
+    slot_variable_node_ids = set()
     for proto in self._proto.nodes:
+      for slot_variable_proto in proto.slot_variables:
+        slot_variable_node_ids.add(slot_variable_proto.slot_variable_node_id)
+
+    # Re-create everything except slot variables.
+    for node_id, proto in enumerate(self._proto.nodes):
+      if node_id in slot_variable_node_ids:
+        # Defer recreating slot variables so we can use the public Optimizer
+        # interface.
+        continue
       node, setter = self._recreate(proto)
-      self._nodes.append(node)
-      node_setters.append(setter)
+      nodes[node_id] = node
+      node_setters[node_id] = setter
+
+    # Now that we have created the variables being optimized, we have enough
+    # information to re-create slot variables for them.
+    for node_id, proto in enumerate(self._proto.nodes):
+      optimizer_object = nodes[node_id]
+      for slot_variable_proto in proto.slot_variables:
+        optimized_variable = nodes[
+            slot_variable_proto.original_variable_node_id]
+        slot_variable = optimizer_object.add_slot(
+            var=optimized_variable,
+            slot_name=slot_variable_proto.slot_name)
+        nodes[slot_variable_proto.slot_variable_node_id] = slot_variable
+        node_setters[slot_variable_proto.slot_variable_node_id] = setattr
+
+    self._nodes = []
+
     # After creating the objects, construct the edges between the objects.
-    for obj, object_proto, setter in zip(self._nodes, self._proto.nodes,
-                                         node_setters):
+    for node_id, object_proto in enumerate(self._proto.nodes):
+      obj = nodes[node_id]
+      setter = node_setters[node_id]
+      self._nodes.append(obj)
+
       for reference in object_proto.children:
-        setter(obj, reference.local_name, self._nodes[reference.node_id])
+        setter(obj, reference.local_name, nodes[reference.node_id])
         # Note: if an object has an attribute `__call__` add a class method
         # that allows `obj()` syntax to work. This is done per-instance to
         # allow `callable` to be used to find out if an object is callable.
@@ -134,9 +179,34 @@ class _Loader(object):
           setattr(type(obj), "__call__", _call_attribute)
 
   def _restore_checkpoint(self):
+    """Load state from checkpoint into the deserialized objects."""
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
-    saver = util.CheckpointableSaver(self.get(0))
-    saver.restore(variables_path).assert_consumed()
+    # TODO(andresp): Clean use of private methods of TrackableSaver.
+    # pylint: disable=protected-access
+    saver = util.TrackableSaver(graph_view.ObjectGraphView(self.get(0)))
+    saver._file_prefix_placeholder = constant_op.constant(variables_path)
+    load_status = saver.restore(variables_path)
+    load_status.assert_existing_objects_matched()
+    checkpoint = load_status._checkpoint
+
+    # When running in eager mode, the `restore` call above has already run and
+    # restored the state of trackables, call `position.restore_ops()` will
+    # return an empty list as there is nothing left to do. In graph mode, that
+    # will return the list of ops that must run to restore the object on that
+    # position. We have to wire them in the initializers of the objects so that
+    # they get initialized properly when using common practices (e.g. the ones
+    # used by ManagedSession) without further user action.
+    for object_id, obj in dict(checkpoint.object_by_proto_id).items():
+      position = base.CheckpointPosition(checkpoint=checkpoint,
+                                         proto_id=object_id)
+      restore_ops = position.restore_ops()
+      if restore_ops:
+        if resource_variable_ops.is_resource_variable(obj):
+          obj._initializer_op = restore_ops
+        else:
+          raise NotImplementedError(
+              ("Missing functionality to restore state of object "
+               "%r from the checkpoint." % obj))
 
   def get(self, node_id):
     return self._nodes[node_id]
@@ -147,9 +217,12 @@ class _Loader(object):
         "user_object": lambda: self._recreate_user_object(proto.user_object),
         "asset": lambda: self._recreate_asset(proto.asset),
         "function": lambda: self._recreate_function(proto.function),
-        "concrete_function": functools.partial(
-            self._recreate_concrete_function, proto.concrete_function),
+        "bare_concrete_function": functools.partial(
+            self._recreate_bare_concrete_function,
+            proto.bare_concrete_function),
         "variable": lambda: self._recreate_variable(proto.variable),
+        "constant": lambda: self._recreate_constant(proto.constant),
+        "resource": lambda: self._recreate_resource(proto.resource),
     }
     kind = proto.WhichOneof("kind")
     if kind not in factory:
@@ -164,7 +237,7 @@ class _Loader(object):
       # individually callable by adding a `__call__` method to the classes of
       # the objects instances that have a `__call__` property.
 
-      class _UserObject(tracking.AutoCheckpointable):
+      class _UserObject(tracking.AutoTrackable):
         pass
 
       return _UserObject(), setattr
@@ -180,8 +253,8 @@ class _Loader(object):
     return function_deserialization.recreate_function(
         proto, self._concrete_functions), setattr
 
-  def _recreate_concrete_function(self, proto):
-    return function_deserialization.recreate_concrete_function(
+  def _recreate_bare_concrete_function(self, proto):
+    return function_deserialization.setup_bare_concrete_function(
         proto, self._concrete_functions), setattr
 
   def _recreate_variable(self, proto):
@@ -189,32 +262,104 @@ class _Loader(object):
     dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape)
     return variables.Variable(dummy_value, trainable=proto.trainable), setattr
 
+  def _recreate_constant(self, proto):
+    tensor_proto = self._operation_attributes[proto.operation]["value"].tensor
+    imported_constant = constant_op.constant(
+        tensor_util.MakeNdarray(tensor_proto))
+    return imported_constant, setattr
+
+  def _recreate_resource(self, proto):
+    del proto
+    return _RestoredResource(), setattr
+
+
+# TODO(b/124205571,b/124092991): Solve destruction of resources.
+class _RestoredResource(tracking.TrackableResource):
+  """Restored SavedResource."""
+
+  def _create_resource(self):
+    raise RuntimeError()
+
+  def _initialize(self):
+    raise RuntimeError()
+
+  def _list_functions_for_serialization(self):
+    # Overwrite this method to avoid the implementation of
+    # base class to re-wrap the polymorphic functions into
+    # another layer of `tf.function`.
+    return {
+        "_create_resource": self._create_resource,
+        "_initialize": self._initialize,
+    }
+
 
 def _call_attribute(instance, *args, **kwargs):
   return instance.__call__(*args, **kwargs)
 
 
-def _load_saved_object_graph_proto(filename):
-  with file_io.FileIO(filename, "rb") as f:
-    contents = f.read()
-    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+@tf_export("saved_model.load", v1=["saved_model.load_v2"])
+def load(export_dir, tags=None):
+  """Load a SavedModel from `export_dir`.
+
+  Signatures associated with the SavedModel are available as functions:
+
+  ```python
+  imported = tf.saved_model.load(path)
+  f = imported.signatures["serving_default"]
+  print(f(x=tf.constant([[1.]])))
+  ```
+
+  Objects exported with `tf.saved_model.save` additionally have trackable
+  objects and functions assigned to attributes:
+
+  ```python
+  exported = tf.train.Checkpoint(v=tf.Variable(3.))
+  exported.f = tf.function(
+      lambda x: exported.v * x,
+      input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+  tf.saved_model.save(exported, path)
+  imported = tf.saved_model.load(path)
+  assert 3. == imported.v.numpy()
+  assert 6. == imported.f(x=tf.constant(2.)).numpy()
+  ```
+
+  Args:
+    export_dir: The SavedModel directory to load from.
+    tags: A tag or sequence of tags identifying the MetaGraph to load. Optional
+      if the SavedModel contains a single MetaGraph, as for those exported from
+      `tf.saved_model.load`.
 
+  Returns:
+    A trackable object with a `signatures` attribute mapping from signature
+    keys to functions. If the SavedModel was exported by `tf.saved_model.load`,
+    it also points to trackable objects and functions which were attached
+    to the exported object.
 
-def load(export_dir):
-  """Load a SavedModel from `export_dir`."""
+  Raises:
+    ValueError: If `tags` don't match a MetaGraph in the SavedModel.
+  """
+  if tags is not None and not isinstance(tags, set):
+    # Supports e.g. tags=SERVING and tags=[SERVING]. Sets aren't considered
+    # sequences for nest.flatten, so we put those through as-is.
+    tags = nest.flatten(tags)
   saved_model_proto = loader_impl.parse_saved_model(export_dir)
-  object_graph_filename = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
-      compat.as_bytes("object_graph.pb"))
-  if file_io.file_exists(object_graph_filename):
-    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
-    loader = _Loader(object_graph_proto,
-                     saved_model_proto,
-                     export_dir)
-    root = loader.get(0)
+  if (len(saved_model_proto.meta_graphs) == 1
+      and saved_model_proto.meta_graphs[0].HasField("object_graph_def")):
+    meta_graph_def = saved_model_proto.meta_graphs[0]
+    if (tags is not None
+        and set(tags) != set(meta_graph_def.meta_info_def.tags)):
+      raise ValueError(
+          ("The SavedModel at {} has one MetaGraph with tags {}, but got an "
+           "incompatible argument tags={} to tf.saved_model.load. You may omit "
+           "it, pass 'None', or pass matching tags.")
+          .format(export_dir, meta_graph_def.meta_info_def.tags, tags))
+    object_graph_proto = meta_graph_def.object_graph_def
+    with ops.init_scope():
+      loader = _Loader(object_graph_proto,
+                       saved_model_proto,
+                       export_dir)
+      root = loader.get(0)
   else:
-    raise NotImplementedError(
-        "Currently only SavedModels exported with `tf.saved_model.save` may be "
-        "imported. Other SavedModels may eventually be supported via load().")
+    with ops.init_scope():
+      root = load_v1_in_v2.load(export_dir, tags)
   return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 280a26a0a572528e25b0dfb140a6a542b08ca36e..d80de3c87e3b34948cf629edbe7a49cc112e5323 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -12,83 +12,129 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel loading."""
+"""Tests for trackable object SavedModel loading."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import os
 import tempfile
 
+from absl.testing import parameterized
+
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.module import module
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
-from tensorflow.python.training.checkpointable import tracking
-
-
-class LoadTest(test.TestCase):
-
-  def cycle(self, obj, signatures=None):
-    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
-    save.save(obj, path, signatures=signatures or {})
-    return load.load(path)
-
-  def test_structure_import(self):
-    root = tracking.AutoCheckpointable()
-    root.dep_one = tracking.AutoCheckpointable()
-    root.dep_two = tracking.AutoCheckpointable()
-    root.dep_two.dep = tracking.AutoCheckpointable()
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import tf_inspect
+
+
+@parameterized.named_parameters(
+    dict(testcase_name="ReloadOnce", cycles=1),
+    dict(testcase_name="ReloadTwice", cycles=2),
+    dict(testcase_name="ReloadThrice", cycles=3))
+class LoadTest(test.TestCase, parameterized.TestCase):
+
+  def cycle(self, obj, cycles=1, signatures=None):
+    to_save = obj
+    # TODO(vbardiovsky): It would be nice if exported protos reached a fixed
+    # point w.r.t. saving/restoring, ideally after 2nd saving.
+    for _ in range(cycles):
+      path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+      save.save(to_save, path, signatures)
+      loaded = load.load(path)
+      to_save = loaded
+    return loaded
+
+  def test_structure_import(self, cycles):
+    root = tracking.AutoTrackable()
+    root.dep_one = tracking.AutoTrackable()
+    root.dep_two = tracking.AutoTrackable()
+    root.dep_two.dep = tracking.AutoTrackable()
     root.dep_three = root.dep_two.dep
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertIs(imported.dep_three, imported.dep_two.dep)
     self.assertIsNot(imported.dep_one, imported.dep_two)
 
-  def test_variables(self):
-    root = tracking.AutoCheckpointable()
+  def test_variables(self, cycles):
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(1., trainable=True)
     root.v2 = variables.Variable(2., trainable=False)
-    imported = self.cycle(root)
-    self.assertEquals(imported.v1.numpy(), 1.0)
+    imported = self.cycle(root, cycles)
+    self.assertEqual(imported.v1.numpy(), 1.0)
     self.assertTrue(imported.v1.trainable)
-    self.assertEquals(imported.v2.numpy(), 2.0)
+    self.assertEqual(imported.v2.numpy(), 2.0)
     self.assertFalse(imported.v2.trainable)
 
-  def test_capture_variables(self):
-    root = tracking.AutoCheckpointable()
+  def test_capture_variables(self, cycles):
+    root = tracking.AutoTrackable()
     root.weights = variables.Variable(2.)
     root.f = def_function.function(
         lambda x: root.weights * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
     imported.weights.assign(4.0)
     self.assertEqual(8., imported.f(constant_op.constant(2.)).numpy())
 
+  def test_control_outputs(self, cycles):
+    exported = tracking.AutoTrackable()
+    exported.v = variables.Variable(1.)
+    exported.f = def_function.function(
+        lambda: exported.v.assign(2., name="should_be_control_output"))
+    exported_graph = exported.f.get_concrete_function().graph
+    self.assertIn(
+        exported_graph.get_operation_by_name("should_be_control_output"),
+        exported_graph.control_outputs)
+
+    imported = self.cycle(exported, cycles)
+    # Calling get_concrete_function wraps in a second call operation; we want to
+    # inspect the original function body for the control output; digging into
+    # graph.as_graph_def() and its FunctionDefLibrary is another option.
+    imported_concrete, = imported.f._concrete_functions
+    imported_graph = imported_concrete.graph
+    self.assertIn(
+        imported_graph.get_operation_by_name("should_be_control_output"),
+        imported_graph.control_outputs)
+
   def _make_asset(self, contents):
     filename = tempfile.mktemp(prefix=self.get_temp_dir())
     with open(filename, "w") as f:
       f.write(contents)
     return filename
 
-  def test_assets(self):
+  def test_assets(self, cycles):
     file1 = self._make_asset("contents 1")
     file2 = self._make_asset("contents 2")
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.asset1 = tracking.TrackableAsset(file1)
     root.asset2 = tracking.TrackableAsset(file2)
 
     save_dir = os.path.join(self.get_temp_dir(), "save_dir")
-    save.save(root, save_dir, signatures={})
+    save.save(root, save_dir)
 
     file_io.delete_file(file1)
     file_io.delete_file(file2)
@@ -97,75 +143,101 @@ class LoadTest(test.TestCase):
 
     imported = load.load(load_dir)
     with open(imported.asset1.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 1", f.read())
+      self.assertEqual("contents 1", f.read())
     with open(imported.asset2.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 2", f.read())
+      self.assertEqual("contents 2", f.read())
 
-  def test_capture_assets(self):
-    root = tracking.AutoCheckpointable()
+  def test_capture_assets(self, cycles):
+    root = tracking.AutoTrackable()
     root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
     root.f = def_function.function(
         lambda: root.vocab.asset_path,
         input_signature=[])
-    imported = self.cycle(root)
-    origin_output = root.f().numpy()
+    imported = self.cycle(root, cycles)
+    original_output = root.f().numpy()
     imported_output = imported.f().numpy()
-    self.assertNotEqual(origin_output, imported_output)
+    self.assertNotEqual(original_output, imported_output)
     with open(imported_output, "r") as f:
-      self.assertEquals("contents", f.read())
+      self.assertEqual("contents", f.read())
+
+  def test_capture_assets_in_graph(self, cycles):
+    root = tracking.AutoTrackable()
+    root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
+    root.f = def_function.function(
+        lambda: root.vocab.asset_path,
+        input_signature=[])
+
+    original_output = root.f().numpy()
 
-  def test_dedup_assets(self):
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      imported_tensor = imported.f()
+      with monitored_session.MonitoredSession() as sess:
+        imported_output = sess.run(imported_tensor)
+        self.assertNotEqual(original_output, imported_output)
+        with open(imported_output, "r") as f:
+          self.assertEqual("contents", f.read())
+
+  def test_dedup_assets(self, cycles):
     vocab = self._make_asset("contents")
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.asset1 = tracking.TrackableAsset(vocab)
     root.asset2 = tracking.TrackableAsset(vocab)
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertEqual(imported.asset1.asset_path.numpy(),
                      imported.asset2.asset_path.numpy())
 
-  def test_implicit_input_signature(self):
+  def test_implicit_input_signature(self, cycles):
     @def_function.function
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     # Add two traces.
     root.f(constant_op.constant(1.))
     root.f(constant_op.constant(1))
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
     self.assertEqual(14, imported.f(constant_op.constant(7)).numpy())
 
-  def test_explicit_input_signature(self):
+  def test_explicit_input_signature(self, cycles):
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
 
-  def test_explicit_save_signature(self):
+  def test_explicit_save_signature(self, cycles):
     @def_function.function
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     imported = self.cycle(
-        root, {"f": root.f.get_concrete_function(
-            tensor_spec.TensorSpec(None, dtypes.float32))})
+        root, cycles, {
+            "f":
+                root.f.get_concrete_function(
+                    tensor_spec.TensorSpec(None, dtypes.float32))
+        })
     self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
 
-  def test_nested_functions(self):
+  def test_nested_functions(self, cycles):
     f = def_function.function(
         lambda x: x*2.0,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -173,12 +245,12 @@ class LoadTest(test.TestCase):
         lambda x: f(x) + 1.0,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.g = g
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     imported.g(constant_op.constant([1.0]))
 
-  def test_function_with_default_bool_input(self):
+  def test_function_with_default_bool_input(self, cycles):
 
     def func(x, training=False):
       if training:
@@ -186,19 +258,85 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
     self.assertEqual(7, root.f(constant_op.constant(1)).numpy())
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
 
-  def test_structured_inputs(self):
+  def test_function_with_default_none_input(self, cycles):
+
+    def func(x, dtype=None):
+      if dtype:
+        return array_ops.zeros(shape=x.shape, dtype=dtype)
+      else:
+        return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1.0, 2.0, 3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        root.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+    concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+    self.assertEqual(4, len(concrete_functions))
+
+    imported = self.cycle(root, cycles)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3]),
+                                   None).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1.0, 2.0,
+                                                         3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        imported.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+  def test_function_no_return(self, cycles):
+
+    class TrackableWithOneVariable(tracking.AutoTrackable):
+
+      def __init__(self, initial_value=0.0):
+        super(TrackableWithOneVariable, self).__init__()
+        self.variable = variables.Variable(initial_value)
+
+      @def_function.function
+      def increase(self, by=1.0):
+        self.variable.assign_add(by)
+
+    obj = TrackableWithOneVariable(5.0)
+
+    obj.increase(constant_op.constant(10.0))
+    self.assertEqual(15.0, obj.variable.numpy())
+    obj.increase()
+    self.assertEqual(16.0, obj.variable.numpy())
+
+    imported = self.cycle(obj, cycles)
+
+    imported.increase(constant_op.constant(10.0))
+    self.assertEqual(26.0, imported.variable.numpy())
+    imported.increase(constant_op.constant(1.0))
+    self.assertEqual(27.0, imported.variable.numpy())
+
+  def test_structured_inputs(self, cycles):
 
     def func(x, training=True):
       # x is a nested structure, we care about one particular tensor.
@@ -208,7 +346,7 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     x = constant_op.constant(10)
@@ -222,16 +360,16 @@ class LoadTest(test.TestCase):
     # matching signature will be valid on the loaded model.
     self.assertEqual(31, root.f(input1).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
-    with self.assertRaisesRegexp(AssertionError,
-                                 "Could not find matching function to call.*"):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
       imported.f(input2)
 
     self.assertEqual(31, imported.f(input1).numpy())
     self.assertEqual(32, imported.f(input3).numpy())
 
-  def test_structured_output(self):
+  def test_structured_output(self, cycles):
 
     # Use fields with non-alphabetical order
     named_tuple_type = collections.namedtuple("NamedTupleHello", ["b", "a"])
@@ -240,7 +378,7 @@ class LoadTest(test.TestCase):
       named_tuple = named_tuple_type(a=input1 + input2, b=input1 * input2)
       return [named_tuple, input2, {"x": 0.5}]
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     result = root.f(constant_op.constant(2), constant_op.constant(3))
@@ -251,7 +389,7 @@ class LoadTest(test.TestCase):
     self.assertEqual(3, result[1].numpy())
     self.assertEqual(0.5, result[2]["x"].numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     result = imported.f(constant_op.constant(2), constant_op.constant(5))
     self.assertEqual(7, result[0].a.numpy())
@@ -260,7 +398,43 @@ class LoadTest(test.TestCase):
     self.assertEqual(5, result[1].numpy())
     self.assertEqual(0.5, result[2]["x"].numpy())
 
-  def test_positional_arguments(self):
+  def test_optimizer(self, cycles):
+
+    class _HasOptimizer(module.Module):
+
+      def __init__(self):
+        super(_HasOptimizer, self).__init__()
+        self.layer = core.Dense(1)
+        self.optimizer = adam.Adam(0.01)
+
+      @def_function.function
+      def __call__(self, x):
+        return self.layer(x)
+
+      @def_function.function
+      def train(self, x, y):
+        with backprop.GradientTape() as tape:
+          predicted = self(x)
+          loss = math_ops.reduce_sum(math_ops.abs(y - predicted))
+        train_vars = self.layer.trainable_variables
+        grads = tape.gradient(loss, train_vars)
+        self.optimizer.apply_gradients(zip(grads, train_vars))
+
+    root = _HasOptimizer()
+    train_input = dict(x=constant_op.constant([[1.]]),
+                       y=constant_op.constant([[2.]]))
+    root.train(**train_input)
+    imported = self.cycle(root, cycles)
+    self.assertAllClose(root.optimizer.learning_rate.numpy(),
+                        imported.optimizer.learning_rate.numpy())
+    self.assertAllClose(root(constant_op.constant([[-0.5]])),
+                        imported(constant_op.constant([[-0.5]])))
+    root.train(**train_input)
+    imported.train(**train_input)
+    self.assertAllClose(root(constant_op.constant([[-0.5]])),
+                        imported(constant_op.constant([[-0.5]])))
+
+  def test_positional_arguments(self, cycles):
     def func(x, training=False, abc=7.1, defg=7.7):
       del abc
       if training:
@@ -270,7 +444,7 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
@@ -278,13 +452,13 @@ class LoadTest(test.TestCase):
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
     self.assertEqual(6, root.f(constant_op.constant(1), defg=7.0).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
     self.assertEqual(6, imported.f(constant_op.constant(1), defg=7.0).numpy())
 
-  def test_additional_kwargs(self):
+  def test_additional_kwargs(self, cycles):
     def func(x, training=False, **options):
       del options
       if training:
@@ -292,25 +466,25 @@ class LoadTest(test.TestCase):
       else:
         return 7
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(func)
 
     x = constant_op.constant(10)
     self.assertEqual(7, root.f(x, learning_rate=0.5, epochs=3).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
-    with self.assertRaisesRegexp(AssertionError,
+    with self.assertRaisesRegexp(ValueError,
                                  "Could not find matching function to call.*"):
       imported.f(x, learning_rate=0.5, epochs=4)
 
     self.assertEqual(7, imported.f(x, learning_rate=0.5, epochs=3).numpy())
 
-  def test_member_function(self):
-    class CheckpointableWithMember(tracking.AutoCheckpointable):
+  def test_member_function(self, cycles):
+    class TrackableWithMember(tracking.AutoTrackable):
 
       def __init__(self):
-        super(CheckpointableWithMember, self).__init__()
+        super(TrackableWithMember, self).__init__()
         self._some_value = 20
 
       @def_function.function
@@ -320,19 +494,19 @@ class LoadTest(test.TestCase):
         else:
           return 7 + self._some_value
 
-    root = CheckpointableWithMember()
+    root = TrackableWithMember()
 
     self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
     self.assertEqual(27, root.f(constant_op.constant(1)).numpy())
     self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
     self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
     self.assertEqual(27, imported.f(constant_op.constant(2)).numpy())
 
-  def test_side_effect_listing(self):
-    class M(tracking.AutoCheckpointable):
+  def test_side_effect_listing(self, cycles):
+    class M(tracking.AutoTrackable):
 
       def __init__(self):
         super(M, self).__init__()
@@ -347,41 +521,82 @@ class LoadTest(test.TestCase):
 
     m = M()
     self.cycle(m)
-    self.assertEquals(4.0, m.f(constant_op.constant(2.0)).numpy())
+    self.assertEqual(4.0, m.f(constant_op.constant(2.0)).numpy())
 
-  def test_basic_backprop(self):
+  def test_basic_backprop(self, cycles):
     weight = variables.Variable(1., trainable=True)
     bias = variables.Variable(0., trainable=True)
     g = def_function.function(
         lambda x: x*weight + bias,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.weight = weight
     root.bias = bias
     root.g = g
-    imported = self.cycle(root)
-    with backprop.GradientTape(watch_accessed_variables=True) as t:
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
       x = constant_op.constant([3.5])
       loss = imported.g(x)
       grad = t.gradient(loss, [imported.weight, imported.bias])
       self.assertAllClose(grad, [3.5, 1.0])
 
-  def test_callable(self):
-    class M1(tracking.AutoCheckpointable):
+  def test_nested_backprop(self, cycles):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+
+    # Note: this function gets called from other function defs via a
+    # "PartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32),
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def mul(x, y):
+      return x * y
+
+    # Note: this function gets called from other function defs via a
+    # "StatefulPartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def f(x):
+      return mul(weight.read_value(), x)
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def g(x):
+      return f(x) + bias,
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def h(x):
+      return g(x) + bias,
+
+    root = tracking.AutoTrackable()
+    root.weight = weight
+    root.bias = bias
+    root.g = h
+
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+    grad = t.gradient(loss, [imported.weight, imported.bias])
+    self.assertAllClose(grad, [3.5, 2.0])
+
+  def test_callable(self, cycles):
+    class M1(tracking.AutoTrackable):
 
       @def_function.function(
           input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
       def __call__(self, x):
         return x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.m1 = M1()
-    root.m2 = tracking.AutoCheckpointable()
+    root.m2 = tracking.AutoTrackable()
     root.m2.__call__ = def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
             lambda x: x*3.0)
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     x = constant_op.constant(1.0)
 
     self.assertTrue(callable(imported.m1))
@@ -396,28 +611,72 @@ class LoadTest(test.TestCase):
     # Verify that user objects without `__call__` attribute are not callable.
     self.assertFalse(callable(imported))
 
-  def test_chain_callable(self):
+  def test_chain_callable(self, cycles):
     func = def_function.function(
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
             lambda x: x*3.0)
-    root = tracking.AutoCheckpointable()
-    root.__call__ = tracking.AutoCheckpointable()
-    root.__call__.__call__ = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.__call__ = tracking.AutoTrackable()
+    root.__call__.__call__ = tracking.AutoTrackable()
     root.__call__.__call__.__call__ = func
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertTrue(callable(imported))
     x = constant_op.constant(1.0)
     self.assertAllEqual(imported(x).numpy(), 3.0)
 
-  def test_soft_matching(self):
+  def test_load_in_graph_mode(self, cycles):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(1.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.v2 * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      var_v1 = imported.v1
+      output = imported.f(constant_op.constant(2.))
+      with monitored_session.MonitoredSession() as sess:
+        self.assertEqual(1.0, sess.run(var_v1))
+        self.assertEqual(4.0, sess.run(output))
+
+  def test_load_in_func_graph(self, cycles):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(1.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.v2 * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    closure = tracking.AutoTrackable()
+    @def_function.function
+    def func(x):
+      if not hasattr(closure, "model"):
+        closure.model = load.load(path)
+      return closure.model.f(x)
+
+    inputs = constant_op.constant(2.)
+    self.assertEqual(4.0, func(inputs).numpy())
+
+  def test_soft_matching(self, cycles):
 
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
@@ -426,11 +685,11 @@ class LoadTest(test.TestCase):
     concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
     self.assertEqual(1, len(concrete_functions))
 
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
 
-    with self.assertRaises(AssertionError):
+    with self.assertRaisesRegexp(ValueError, "Python inputs incompatible"):
       # We cannot call the function with a constant of shape ().
-      self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+      imported.f(constant_op.constant(2)).numpy()
 
     # TODO(vbardiovsky): When classes are revived with input_signatures, we
     # should also check that the calls below are not generating any more
@@ -440,61 +699,94 @@ class LoadTest(test.TestCase):
     self.assertAllEqual([2, 4, 6],
                         imported.f(constant_op.constant([1, 2, 3])).numpy())
 
-  def test_concrete_function(self):
+  def test_get_concrete_function(self, cycles):
+
+    @def_function.function
+    def func(x, training=False):
+      if training:
+        return 2 * x
+      else:
+        return 3 * x
+
+    func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+    func.get_concrete_function(tensor_spec.TensorSpec([None], dtypes.float32))
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    imported = self.cycle(root, cycles)
+
+    concrete = imported.f.get_concrete_function(
+        training=True, x=tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertAllEqual([2, 4, 6, 8],
+                        concrete(x=constant_op.constant([1, 2, 3, 4])).numpy())
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
+      imported.f.get_concrete_function(
+          tensor_spec.TensorSpec([None], dtypes.int32))
+    imported.f.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+
+  def test_concrete_function(self, cycles):
 
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
     self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
 
-    imported = self.cycle(root)
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
 
     self.assertAllEqual([2, 4, 6, 8],
                         imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
     self.assertAllEqual([2, 4, 6],
                         imported.f(constant_op.constant([1, 2, 3])).numpy())
 
-  def test_concrete_function_arg_names(self):
+  def test_concrete_function_arg_names(self, cycles):
 
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
 
-    imported = self.cycle(root)
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
 
     self.assertAllEqual([2, 4, 6],
                         imported.f(x=constant_op.constant([1, 2, 3])).numpy())
 
-  def test_concrete_function_no_signature(self):
+  def test_concrete_function_no_signature(self, cycles):
     @def_function.function
     def func(x):
       return 2 * x
 
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function(constant_op.constant([1]))
     self.assertAllEqual([4], root.f(constant_op.constant([2])).numpy())
-    imported = self.cycle(root)
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
     self.assertAllEqual([6],
                         imported.f(constant_op.constant([3])).numpy())
 
-  def test_concrete_function_backprop(self):
+  def test_concrete_function_backprop(self, cycles):
     @def_function.function(
         input_signature=[tensor_spec.TensorSpec([None], dtypes.float32)])
     def func(x):
       return x ** 2.
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = func.get_concrete_function()
 
     def _compute_gradient(function):
@@ -505,30 +797,427 @@ class LoadTest(test.TestCase):
       return tape.gradient(output, inp)
 
     self.assertEqual(2., _compute_gradient(root.f).numpy())
-    imported = self.cycle(root)
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
     self.assertEqual(2., _compute_gradient(imported.f).numpy())
 
-  def test_dict(self):
-    root = tracking.AutoCheckpointable()
+  def test_revived_concrete_function_kwargs(self, cycles):
+
+    @def_function.function
+    def func(x, y):
+      return x * (y + 1.)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.float32))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_revived_concrete_function_tensorspec_kwargs(self, cycles):
+
+    @def_function.function
+    def func(*args):
+      x, y = args
+      return x * (y + 1.)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32, name="x"),
+        tensor_spec.TensorSpec([], dtypes.float32, name="y"))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_concrete_function_variable_argument(self, cycles):
+    # TODO(allenl): Fix variables in input signatures.
+    self.skipTest("Need to fix encoding of variables in inputs signatures")
+    capture = variables.Variable(0)
+
+    @def_function.function
+    def func(v):
+      v.assign_add(1)
+      capture.assign_sub(1)
+
+    vsave = variables.Variable(1)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(vsave)
+    root.capture = capture
+    self.assertEqual(1, vsave.numpy())
+    root.f(vsave)
+    self.assertEqual(2, vsave.numpy())
+    self.assertEqual(-1, capture.numpy())
+    imported = self.cycle(root, cycles)
+
+    vload = variables.Variable(1)
+    imported.f(vload)
+    self.assertEqual(2, vload.numpy())
+    imported.f(v=vload)
+    self.assertEqual(3, vload.numpy())
+    self.assertEqual(-3, imported.capture.numpy())
+    self.assertEqual(-1, capture.numpy())
+
+  def test_function_and_component(self, cycles):
+
+    @def_function.function
+    def func(v):
+      return v + 1
+
+    root = tracking.AutoTrackable()
+    root.func = func
+    root.concrete_func = func.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.int32))
+    one = constant_op.constant(1)
+    self.assertEqual(2, root.func(one).numpy())
+    self.assertEqual(2, root.concrete_func(one).numpy())
+    imported = self.cycle(root, cycles)
+    self.assertEqual(2, imported.func(one).numpy())
+    self.assertEqual(2, imported.concrete_func(one).numpy())
+
+  def test_dict(self, cycles):
+    root = tracking.AutoTrackable()
     root.variables = dict(a=variables.Variable(1.))
     root.variables["b"] = variables.Variable(2.)
     root.variables["c"] = 1
-    imported = self.cycle(root)
+    root.funcs = dict(
+        a=def_function.function(lambda: constant_op.constant(100.)))
+    root.funcs["conc"] = root.funcs["a"].get_concrete_function()
+    imported = self.cycle(root, cycles)
     self.assertEqual(1., imported.variables["a"].numpy())
     self.assertEqual(2., imported.variables["b"].numpy())
     self.assertEqual(set(["a", "b"]), set(imported.variables.keys()))
+    self.assertEqual(100., imported.funcs["a"]().numpy())
+    self.assertEqual(100., imported.funcs["conc"]().numpy())
 
-  def test_list(self):
-    root = tracking.AutoCheckpointable()
+  def test_list(self, cycles):
+    root = tracking.AutoTrackable()
     root.variables = [variables.Variable(1.)]
     root.variables.append(1)
     root.variables.append(variables.Variable(3.))
-    imported = self.cycle(root)
+    imported = self.cycle(root, cycles)
     self.assertEqual(1., imported.variables[0].numpy())
     self.assertEqual(3., imported.variables[2].numpy())
     self.assertIs(None, imported.variables[1])
     self.assertEqual(3, len(imported.variables))
 
+  def test_functions_list(self, cycles):
+    root = tracking.AutoTrackable()
+    v1 = variables.Variable(1.)
+    root.losses = [def_function.function(lambda: math_ops.reduce_sum(v1 ** 2))]
+    root.variables = [v1]
+
+    @def_function.function
+    def _v2_loss():
+      if len(root.variables) == 1:
+        v2 = variables.Variable(2.)
+        root.variables.append(v2)
+      return math_ops.reduce_sum(root.variables[1] ** 2)
+
+    root.losses.append(_v2_loss)
+    self.assertAllClose([1., 4.], [loss() for loss in root.losses])
+    imported = self.cycle(root, cycles)
+    self.assertAllClose([1., 4.], [loss() for loss in imported.losses])
+    imported.variables[0].assign(3.)
+    imported.variables[1].assign(4.)
+    self.assertAllClose([9., 16.], [loss() for loss in imported.losses])
+
+  def test_captured_constant(self, cycles):
+    const = array_ops.zeros([100])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda: const + 1.)
+    root.g = def_function.function(lambda: const + 2.)
+    self.assertAllClose(array_ops.ones([100]), root.f())
+    self.assertAllClose(2. * array_ops.ones([100]), root.g())
+    imported = self.cycle(root, cycles)
+    self.assertAllClose(array_ops.ones([100]), imported.f())
+    self.assertAllClose(2. * array_ops.ones([100]), imported.g())
+    # TODO(b/123408994): Use the public get_concrete_function.
+    f_concrete = imported.f._list_all_concrete_functions_for_serialization()[0]
+    g_concrete = imported.g._list_all_concrete_functions_for_serialization()[0]
+    self.assertLen(f_concrete.captured_inputs, 1)
+    self.assertLen(g_concrete.captured_inputs, 1)
+    # We should be using the same captured EagerTensor in both functions, not
+    # duplicating the constant.
+    self.assertIs(f_concrete.captured_inputs[0],
+                  g_concrete.captured_inputs[0])
+
+  def test_functions_accessed_once(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self._counter = 0
+
+      @property
+      def make_func(self):
+        @def_function.function
+        def f():
+          return constant_op.constant(self._counter)
+        f.get_concrete_function()  # force a trace
+        self._counter += 1
+        return f
+
+    exported = Exported()
+    imported = self.cycle(exported, cycles)
+    self.assertEqual(0, imported.make_func().numpy())
+    self.assertEqual(1, exported.make_func().numpy())
+
+  def test_overwritten_signatures_error(self, cycles):
+    exported = tracking.AutoTrackable()
+    exported.f = def_function.function(lambda: constant_op.constant(1.))
+    imported = self.cycle(
+        exported, cycles,
+        signatures={"key": exported.f.get_concrete_function()})
+    self.assertEqual(1., imported.signatures["key"]()["output_0"].numpy())
+    imported.signatures = {"key1": imported.signatures["key"]}
+    with self.assertRaisesRegexp(ValueError, "signatures"):
+      save.save(imported, tempfile.mkdtemp(prefix=self.get_temp_dir()))
+
+  def test_signature_loading(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.v = variables.Variable(3.)
+
+      @def_function.function
+      def do(self, x):
+        return self.v * x
+
+    exported = Exported()
+    imported = self.cycle(
+        exported,
+        signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    self.assertEqual(["serving_default"], list(imported.signatures.keys()))
+    imported_function = imported.signatures["serving_default"]
+    two = constant_op.constant(2.)
+    self.assertEqual(6., imported_function(x=two)["output_0"].numpy())
+    imported.v.assign(4.)
+    self.assertEqual(8., imported_function(x=two)["output_0"].numpy())
+    self.assertEqual(8., imported_function(two)["output_0"].numpy())
+    with self.assertRaises(TypeError):
+      # The signatures mapping is immutable
+      imported.signatures["random_key"] = 3
+
+  def test_multiple_argument_signatures_no_positional(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      @def_function.function
+      def do(self, x, y):
+        return x + y
+
+    exported = Exported()
+    imported = self.cycle(
+        exported, signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32),
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    with self.assertRaises(TypeError):
+      imported.signatures["serving_default"](
+          constant_op.constant(1.),
+          y=constant_op.constant(2.))
+    self.assertEqual(
+        {"output_0": 3.},
+        self.evaluate(imported.signatures["serving_default"](
+            x=constant_op.constant(1.),
+            y=constant_op.constant(2.))))
+
+  def _make_model_with_tables(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table1_initializer = lookup_ops.KeyValueTensorInitializer(keys, values)
+    table1 = lookup_ops.HashTable(table1_initializer, default_val)
+
+    table2_file = self._make_asset("test\nfoo\nbrain\n")
+    table2_initializer = lookup_ops.TextFileIdTableInitializer(table2_file)
+    table2 = lookup_ops.HashTable(table2_initializer, default_val)
+
+    def _make_lookup_function(table):
+      signature = [tensor_spec.TensorSpec(None, dtypes.string)]
+      return def_function.function(input_signature=signature)(
+          lambda x: table.lookup(x))  # pylint: disable=unnecessary-lambda
+
+    root = tracking.AutoTrackable()
+    root.table1 = table1
+    root.lookup1 = _make_lookup_function(table1)
+    root.table2 = table2
+    root.lookup2 = _make_lookup_function(table2)
+    return root
+
+  def test_table(self, cycles):
+    root = self._make_model_with_tables()
+    imported = self.cycle(root, cycles, signatures={})
+    keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+    self.assertAllEqual([0, -1, -1, 2], imported.lookup1(keys).numpy())
+    self.assertAllEqual([2, 0, 1, -1], imported.lookup2(keys).numpy())
+
+  def test_table_in_graph(self, cycles):
+    root = self._make_model_with_tables()
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    imported = self.cycle(root, 1)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+      output1 = imported.lookup1(keys)
+      output2 = imported.lookup2(keys)
+      with monitored_session.MonitoredSession() as sess:
+        self.assertAllEqual([0, -1, -1, 2], sess.run(output1))
+        self.assertAllEqual([2, 0, 1, -1], sess.run(output2))
+
+  def test_perserve_argspec(self, cycles):
+    def f(a, b, c):  # pylint: disable=unused-argument
+      return None
+
+    original_fullargspec = tf_inspect.getfullargspec(f)
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(f)
+    imported = self.cycle(root, cycles)
+
+    restored_fullargspec = tf_inspect.getfullargspec(imported.f)
+    self.assertEqual(original_fullargspec, restored_fullargspec)
+
+  def test_canonicalize_inputs(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a=1, b=2, c=3, training=True):
+      if training:
+        return [a, b, c, training]
+      else:
+        return [c, b, a, training]
+
+    # TODO(b/123501567): Work-around to trigger generic traces of a function
+    # with extra non tensor args.
+    signature = 3*[tensor_spec.TensorSpec(None, dtypes.float32)]
+    @def_function.function(input_signature=signature)
+    def trigger(a, b, c):
+      func(a, b, c, True)
+      func(a, b, c, False)
+
+    trigger.get_concrete_function()
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(root.f(), [1.0, 2.0, 3.0, True])
+    self.assertAllEqual(root.f(-1.0, training=False), [3.0, 2.0, -1.0, False])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function"):
+      root.f(["hello", 1.0])
+
+  def test_prefer_specific_trace(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a):
+      if isinstance(a, int):
+        return a
+      else:
+        return a + 1
+
+    self.assertAllEqual(2, func(2).numpy())
+    self.assertAllEqual(3, func(constant_op.constant(2)).numpy())
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(2, root.f(2).numpy())
+    self.assertAllEqual(4, root.f(3).numpy())
+    self.assertAllEqual(3, root.f(constant_op.constant(2)).numpy())
+    self.assertAllEqual(4, root.f(constant_op.constant(3)).numpy())
+
+  def test_partial(self, cycles):
+    # TODO(vbardiovsky): Figure out the story for FunctionSpec vs partial vs
+    # input_signature.
+    self.skipTest("Partial does not work for serialization.")
+
+    def f(x, y):
+      return x + y
+
+    func = def_function.function(
+        functools.partial(f, x=array_ops.zeros([1]), y=array_ops.zeros([1])))
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    self.assertAllEqual(root.f(), [0.0])
+
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(root.f(), [0.0])
+
+  def test_convert_to_input_signature(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return x
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    root = self.cycle(root, cycles)
+
+    self.assertEqual([2], root.f([2]).numpy())
+
+  def test_dense_features_layer(self, cycles):
+    columns = [feature_column_v2.numeric_column("x"),
+               feature_column_v2.numeric_column("y")]
+    layer = feature_column_v2.DenseFeatures(columns)
+    model = sequential.Sequential([layer])
+    model_input = {"x": constant_op.constant([[1.]]),
+                   "y": constant_op.constant([[2.]])}
+    self.assertAllClose([[1., 2.]], model.predict(model_input))
+    loaded = self.cycle(model, cycles)
+    output, = loaded._default_save_signature(model_input).values()
+    self.assertAllClose([[1., 2.]], output)
+    signature_output, = loaded.signatures["serving_default"](
+        **model_input).values()
+    self.assertAllClose([[1., 2.]], signature_output)
+
+
+class SingleCycleTests(test.TestCase, parameterized.TestCase):
+
+  def test_load_with_tags(self):
+    root = tracking.AutoTrackable()
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    with self.assertRaises(ValueError):
+      load.load(path, tags=[tag_constants.EVAL])
+    load.load(path, tags=[tag_constants.SERVING])
+    load.load(path, tags=tag_constants.SERVING)
+    load.load(path, tags=set([tag_constants.SERVING]))
+
+  def test_docstring_examples(self):
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    exported = util.Checkpoint(v=variables.Variable(3.))
+    exported.f = def_function.function(
+        lambda x: exported.v * x,
+        input_signature=[
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)])
+    save.save(exported, path)
+    imported = load.load(path)
+    self.assertEqual(3., imported.v.numpy())
+    self.assertEqual(6., imported.f(x=constant_op.constant(2.)).numpy())
+
+    save.save(exported, path, exported.f.get_concrete_function())
+    imported = load.load(path)
+    f = imported.signatures["serving_default"]
+    self.assertAllEqual(
+        [[-3.]],
+        f(x=constant_op.constant([[-1.]]))["output_0"].numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/load_v1_in_v2.py b/tensorflow/python/saved_model/load_v1_in_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..54971c0718f3fd3140025760c58e5a73d1c84711
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2.py
@@ -0,0 +1,159 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import signature_serialization
+from tensorflow.python.training import saver as tf_saver
+from tensorflow.python.training.tracking import tracking
+
+
+class _Initializer(tracking.TrackableResource):
+  """Represents an initialization operation restored from a SavedModel.
+
+  Without this object re-export of imported 1.x SavedModels would omit the
+  original SavedModel's initialization procedure.
+
+  Created when `tf.saved_model.load` loads a TF 1.x-style SavedModel with an
+  initialization op. This object holds a function which runs the
+  initialization. It does not require any manual user intervention;
+  `tf.saved_model.save` will see this object and automatically add it to the
+  exported SavedModel, and `tf.saved_model.load` runs the initialization
+  function automatically.
+  """
+
+  def __init__(self, init_fn, asset_paths):
+    super(_Initializer, self).__init__()
+    self._asset_paths = asset_paths
+    self._init_fn = init_fn
+
+  def _create_resource(self):
+    return array_ops.placeholder(
+        dtype=dtypes.resource, shape=[], name="unused_resource")
+
+  def _initialize(self):
+    self._init_fn(*[path.asset_path for path in self._asset_paths])
+
+
+class _EagerSavedModelLoader(loader_impl.SavedModelLoader):
+  """Loads a SavedModel without using Sessions."""
+
+  def get_meta_graph_def_from_tags(self, tags):
+    """Override to support implicit one-MetaGraph loading with tags=None."""
+    if tags is None:
+      if len(self._saved_model.meta_graphs) != 1:
+        tag_sets = [mg.meta_info_def.tags
+                    for mg in self._saved_model.meta_graphs]
+        raise ValueError(
+            ("Importing a SavedModel with tf.saved_model.load requires a "
+             "'tags=' argument if there is more than one MetaGraph. Got "
+             "'tags=None', but there are {} MetaGraphs in the SavedModel with "
+             "tag sets {}. Pass a 'tags=' argument to load this SavedModel.")
+            .format(len(self._saved_model.meta_graphs), tag_sets))
+      return self._saved_model.meta_graphs[0]
+    return super(_EagerSavedModelLoader, self).get_meta_graph_def_from_tags(
+        tags)
+
+  def load_graph(self, returns, meta_graph_def):
+    """Called from wrap_function to import `meta_graph_def`."""
+    # pylint: disable=protected-access
+    saver, _ = tf_saver._import_meta_graph_with_return_elements(
+        meta_graph_def)
+    # pylint: enable=protected-access
+    returns[0] = saver
+
+  def restore_variables(self, wrapped, saver):
+    """Restores variables from the checkpoint."""
+    if saver is not None:
+      saver_def = saver.saver_def
+      restore_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(
+              saver_def.filename_tensor_name)],
+          fetches=[wrapped.graph.as_graph_element(saver_def.restore_op_name)])
+      restore_fn(constant_op.constant(self._variables_path))
+
+  def _extract_signatures(self, wrapped, meta_graph_def):
+    """Creates ConcreteFunctions for signatures in `meta_graph_def`."""
+    signature_functions = {}
+    for signature_key, signature_def in meta_graph_def.signature_def.items():
+      input_names, input_specs = zip(*signature_def.inputs.items())
+      # TODO(allenl): Support optional arguments
+      signature_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(inp.name)
+                 for inp in input_specs],
+          fetches={name: wrapped.graph.as_graph_element(out.name)
+                   for name, out in signature_def.outputs.items()})
+      # pylint: disable=protected-access
+      signature_fn._arg_keywords = input_names
+      if len(input_names) == 1:
+        # Allowing positional arguments does not create any ambiguity if there's
+        # only one.
+        signature_fn._num_positional_args = 1
+      else:
+        signature_fn._num_positional_args = 0
+      # pylint: enable=protected-access
+      signature_functions[signature_key] = signature_fn
+    return signature_functions
+
+  def load(self, tags):
+    """Creates an object from the MetaGraph identified by `tags`."""
+    meta_graph_def = self.get_meta_graph_def_from_tags(tags)
+    load_graph_returns = [None]
+    wrapped = wrap_function.wrap_function(
+        functools.partial(self.load_graph, load_graph_returns, meta_graph_def),
+        signature=[])
+    saver, = load_graph_returns
+    self.restore_variables(wrapped, saver)
+    with wrapped.graph.as_default():
+      init_op = loader_impl.get_init_op(meta_graph_def)
+    root = tracking.AutoTrackable()
+    if init_op is not None:
+      asset_feed_tensors = []
+      asset_paths = []
+      for tensor_name, value in loader_impl.get_asset_tensors(
+          self._export_dir, meta_graph_def).items():
+        asset_feed_tensors.append(wrapped.graph.as_graph_element(tensor_name))
+        asset_paths.append(tracking.TrackableAsset(value))
+      init_fn = wrapped.prune(
+          feeds=asset_feed_tensors,
+          fetches=[wrapped.graph.as_graph_element(init_op)])
+      initializer = _Initializer(init_fn, asset_paths)
+      initializer._initialize()  # pylint: disable=protected-access
+      root.initializer = initializer
+      root.asset_paths = asset_paths
+    else:
+      root.asset_paths = []
+    signature_functions = self._extract_signatures(wrapped, meta_graph_def)
+
+    root.signatures = signature_serialization.create_signature_map(
+        signature_functions)
+    root.variables = list(wrapped.graph.variables)
+    return root
+
+
+def load(export_dir, tags):
+  """Load a v1-style SavedModel as an object."""
+  loader = _EagerSavedModelLoader(export_dir)
+  return loader.load(tags=tags)
diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..78ecccad9244eeac789c8db9985f57ee13f2a5aa
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py
@@ -0,0 +1,208 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for importing a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import builder_impl
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import simple_save
+from tensorflow.python.saved_model import utils_impl
+
+
+class LoadTest(test.TestCase):
+
+  def _v1_single_metagraph_saved_model(self, use_resource):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      if use_resource:
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = resource_variable_ops.ResourceVariable(3., name="v")
+      else:
+        # "distractor" gets saved in the checkpoint and so used in the restore
+        # function, but not in the pruned function for the signature. This tests
+        # node naming: it needs to be consistent (and ideally always the same as
+        # the node in the original GraphDef) for the resource manager to find
+        # the right variable.
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = variables.RefVariable(3., name="v")
+      local_variable = variables.VariableV1(
+          1.,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          trainable=False,
+          use_resource=True)
+      output = array_ops.identity(start * v * local_variable, name="output")
+      with session_lib.Session() as session:
+        session.run([v.initializer, distractor.initializer,
+                     local_variable.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=local_variable.initializer)
+    return path
+
+  def test_resource_variable_import(self):
+    imported = load.load(self._v1_single_metagraph_saved_model(
+        use_resource=True))
+    fn = imported.signatures["serving_default"]
+    self.assertEqual({"output": 6.},
+                     self.evaluate(fn(constant_op.constant(2.))))
+    self.assertAllEqual([3., 1.], self.evaluate(imported.variables))
+    imported.variables[0].assign(4.)
+    self.assertEqual({"output": 8.},
+                     self.evaluate(fn(start=constant_op.constant(2.))))
+    imported.variables[1].assign(2.)
+    self.assertEqual({"output": 24.},
+                     self.evaluate(fn(start=constant_op.constant(3.))))
+    self.assertTrue(imported.variables[0].trainable)
+    self.assertFalse(imported.variables[1].trainable)
+    with backprop.GradientTape() as tape:
+      output = fn(start=constant_op.constant(4.))
+    self.assertEqual(imported.variables[:1], list(tape.watched_variables()))
+    self.assertEqual(8., tape.gradient(output, imported.variables[0]).numpy())
+
+  def test_ref_variable_import(self):
+    saved = self._v1_single_metagraph_saved_model(use_resource=False)
+    imported = load.load(saved)
+    fn = imported.signatures["serving_default"]
+    self.assertEqual(6., fn(start=constant_op.constant(2.))["output"].numpy())
+
+  def _v1_multi_metagraph_saved_model(self):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      v = resource_variable_ops.ResourceVariable(21.)
+      first_output = array_ops.identity(start * v, name="first_output")
+      second_output = array_ops.identity(v, name="second_output")
+      with session_lib.Session() as session:
+        session.run(v.initializer)
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        builder = builder_impl.SavedModelBuilder(path)
+        builder.add_meta_graph_and_variables(
+            session, tags=["first"],
+            signature_def_map={
+                "first_key": signature_def_utils.build_signature_def(
+                    {"first_start": utils_impl.build_tensor_info(start)},
+                    {"first_output": utils_impl.build_tensor_info(
+                        first_output)})})
+        builder.add_meta_graph(
+            tags=["second"],
+            signature_def_map={
+                "second_key": signature_def_utils.build_signature_def(
+                    {"second_start": utils_impl.build_tensor_info(start)},
+                    {"second_output": utils_impl.build_tensor_info(
+                        second_output)})})
+        builder.save()
+    return path
+
+  def test_multi_meta_graph_loading(self):
+    with self.assertRaisesRegexp(ValueError, "2 MetaGraphs"):
+      load.load(self._v1_multi_metagraph_saved_model())
+    first_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                               tags=["first"])
+    self.assertEqual({"first_output": 42.},
+                     self.evaluate(first_imported.signatures["first_key"](
+                         first_start=constant_op.constant(2.))))
+    second_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                                tags=set(["second"]))
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](x=constant_op.constant(2.))
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](
+          second_start=constant_op.constant(2.),
+          x=constant_op.constant(2.))
+    self.assertEqual({"second_output": 21.},
+                     self.evaluate(second_imported.signatures["second_key"](
+                         second_start=constant_op.constant(2.))))
+
+  def _v1_asset_saved_model(self):
+    export_graph = ops.Graph()
+    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+    with export_graph.as_default():
+      initializer = lookup_ops.TextFileInitializer(
+          vocab_path,
+          key_dtype=dtypes.string,
+          key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+          value_dtype=dtypes.int64,
+          value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+      table = lookup_ops.HashTable(
+          initializer, default_value=-1)
+      start = array_ops.placeholder(
+          shape=None, dtype=dtypes.string, name="in")
+      output = table.lookup(start, name="out")
+      with session_lib.Session() as session:
+        session.run([table.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=table.initializer)
+    file_io.delete_file(vocab_path)
+    return path
+
+  def test_asset_loading(self):
+    first_path = self._v1_asset_saved_model()
+    imported = load.load(first_path)
+    fn = imported.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+    second_path = os.path.join(self.get_temp_dir(), "saved_model",
+                               str(ops.uid()))
+    save.save(imported, second_path, signatures=imported.signatures)
+    shutil.rmtree(first_path)
+    second_import = load.load(second_path)
+    fn = second_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+    third_path = os.path.join(self.get_temp_dir(), "saved_model",
+                              str(ops.uid()))
+    save.save(second_import, third_path, signatures=second_import.signatures)
+    shutil.rmtree(second_path)
+    third_import = load.load(third_path)
+    fn = third_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e5be03aae4905f4465ac87590da610a7d46e2ae4..bfabef9174de2b7ae7a330785d735c7193569683 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -88,7 +88,7 @@ def parse_saved_model(export_dir):
 _parse_saved_model = parse_saved_model
 
 
-def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
+def get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
   Args:
@@ -393,7 +393,7 @@ class SavedModelLoader(object):
     meta_graph_def = self.get_meta_graph_def_from_tags(tags)
     with sess.graph.as_default():
       # Get asset tensors, if any.
-      asset_tensors_dictionary = _get_asset_tensors(
+      asset_tensors_dictionary = get_asset_tensors(
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
       init_op = get_init_op(meta_graph_def, import_scope)
diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD
index 192a610fd244c0d8950764cdfbf53fb62bd32698..493574a225d16fdada0ce08b569e06bf0aa06e16 100644
--- a/tensorflow/python/saved_model/model_utils/BUILD
+++ b/tensorflow/python/saved_model/model_utils/BUILD
@@ -30,6 +30,7 @@ py_library(
     deps = [
         ":export_output",
         ":export_utils",
+        ":mode_keys",
     ],
 )
 
@@ -70,7 +71,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":export_output",
-        "//tensorflow/python:mode_keys",
+        ":mode_keys",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//tensorflow/python/saved_model:signature_constants",
@@ -98,3 +99,19 @@ py_test(
         "//tensorflow/python/saved_model:signature_def_utils",
     ],
 )
+
+py_library(
+    name = "mode_keys",
+    srcs = ["mode_keys.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "mode_keys_test",
+    srcs = ["mode_keys_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":mode_keys",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/saved_model/model_utils/__init__.py b/tensorflow/python/saved_model/model_utils/__init__.py
index 84540badb4b100ab649b4653d9d84b5ebe922cf1..3f54c96def1bc10d334b62b9c4b0f201b2850a07 100644
--- a/tensorflow/python/saved_model/model_utils/__init__.py
+++ b/tensorflow/python/saved_model/model_utils/__init__.py
@@ -25,4 +25,5 @@ from tensorflow.python.saved_model.model_utils.export_utils import EXPORT_TAG_MA
 from tensorflow.python.saved_model.model_utils.export_utils import get_export_outputs
 from tensorflow.python.saved_model.model_utils.export_utils import get_temp_export_dir
 from tensorflow.python.saved_model.model_utils.export_utils import get_timestamped_export_dir
+from tensorflow.python.saved_model.model_utils.export_utils import SIGNATURE_KEY_MAP
 # pylint: enable=wildcard-import
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
index ef512150a259514fcc4c801eaa06a99441f1f7a2..c87d2ee6ae703d50c916dbedc7fcc03936880f71 100644
--- a/tensorflow/python/saved_model/model_utils/export_test.py
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -24,7 +24,6 @@ import time
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -32,21 +31,7 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model.model_utils import export_output
 from tensorflow.python.saved_model.model_utils import export_utils
-
-
-class LabeledTensorMock(object):
-  """Mock class emulating LabeledTensor."""
-
-  def __init__(self):
-    self.tensor = constant_op.constant([1])
-
-
-def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
-  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
-
-
-ops.register_tensor_conversion_function(LabeledTensorMock,
-                                        _convert_labeled_tensor_mock_to_tensor)
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys
 
 
 class ExportTest(test_util.TensorFlowTestCase):
@@ -251,6 +236,53 @@ class ExportTest(test_util.TensorFlowTestCase):
 
     self.assertDictEqual(expected_signature_defs, signature_defs)
 
+  @test_util.deprecated_graph_mode_only
+  def test_export_outputs_for_mode(self):
+    predictions = {"predictions": constant_op.constant([1.])}
+    loss = {"loss": constant_op.constant([2.])}
+    metrics = {
+        "metrics": (constant_op.constant([3.]), constant_op.constant([4.]))}
+    expected_metrics = {
+        "metrics/value": metrics["metrics"][0],
+        "metrics/update_op": metrics["metrics"][1]
+    }
+
+    def _build_export_output(mode):
+      return export_utils.export_outputs_for_mode(
+          mode, None, predictions, loss, metrics)
+
+    ret = _build_export_output(KerasModeKeys.TRAIN)
+    self.assertIn(signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.TrainOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.TEST)
+    self.assertIn(signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.EvalOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.PREDICT)
+    self.assertIn(signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.PredictOutput)
+    self.assertEqual(export_out.outputs, predictions)
+
+    classes = constant_op.constant(["class5"])
+    ret = export_utils.export_outputs_for_mode(
+        KerasModeKeys.PREDICT,
+        {"classify": export_output.ClassificationOutput(
+            classes=classes)})
+    self.assertIn("classify", ret)
+    export_out = ret["classify"]
+    self.assertIsInstance(export_out, export_output.ClassificationOutput)
+    self.assertEqual(export_out.classes, classes)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_utils.py b/tensorflow/python/saved_model/model_utils/export_utils.py
index 4f8933758d92199ca1cbdd4a6f046a90e5a97f21..adb6bf26677e0ff0e465291cdfc08e92a27ee85d 100644
--- a/tensorflow/python/saved_model/model_utils/export_utils.py
+++ b/tensorflow/python/saved_model/model_utils/export_utils.py
@@ -30,21 +30,32 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
-from tensorflow.python.training import mode_keys
+from tensorflow.python.saved_model.model_utils import mode_keys
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
 from tensorflow.python.util import compat
 
 
 # Mapping of the modes to appropriate MetaGraph tags in the SavedModel.
-EXPORT_TAG_MAP = {
-    mode_keys.ModeKeys.PREDICT: [tag_constants.SERVING],
-    mode_keys.ModeKeys.TRAIN: [tag_constants.TRAINING],
-    mode_keys.ModeKeys.TEST: [tag_constants.EVAL],
-}
-
-
-_SINGLE_FEATURE_DEFAULT_NAME = 'feature'
-_SINGLE_RECEIVER_DEFAULT_NAME = 'input'
-_SINGLE_LABEL_DEFAULT_NAME = 'label'
+EXPORT_TAG_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: [tag_constants.SERVING],
+    ModeKeys.TRAIN: [tag_constants.TRAINING],
+    ModeKeys.TEST: [tag_constants.EVAL]})
+
+# For every exported mode, a SignatureDef map should be created using the
+# functions `export_outputs_for_mode` and `build_all_signature_defs`. By
+# default, this map will contain a single Signature that defines the input
+# tensors and output predictions, losses, and/or metrics (depending on the mode)
+# The default keys used in the SignatureDef map are defined below.
+SIGNATURE_KEY_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    ModeKeys.TRAIN: signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY,
+    ModeKeys.TEST: signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY})
+
+# Default names used in the SignatureDef input map, which maps strings to
+# TensorInfo protos.
+SINGLE_FEATURE_DEFAULT_NAME = 'feature'
+SINGLE_RECEIVER_DEFAULT_NAME = 'input'
+SINGLE_LABEL_DEFAULT_NAME = 'label'
 
 ### Below utilities are specific to SavedModel exports.
 
@@ -80,7 +91,7 @@ def build_all_signature_defs(receiver_tensors,
     ValueError: if export_outputs is not a dict
   """
   if not isinstance(receiver_tensors, dict):
-    receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
+    receiver_tensors = {SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
   if export_outputs is None or not isinstance(export_outputs, dict):
     raise ValueError('export_outputs must be a dict and not'
                      '{}'.format(type(export_outputs)))
@@ -100,7 +111,7 @@ def build_all_signature_defs(receiver_tensors,
         six.iteritems(receiver_tensors_alternatives)):
       if not isinstance(receiver_tensors_alt, dict):
         receiver_tensors_alt = {
-            _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+            SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
         }
       for output_key, export_output in export_outputs.items():
         signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
@@ -262,18 +273,21 @@ def export_outputs_for_mode(
   Raises:
     ValueError: if an appropriate ExportOutput cannot be found for the mode.
   """
-  # TODO(b/113185250): move all model export helper functions into an util file.
-  if mode == mode_keys.ModeKeys.PREDICT:
+  if mode not in SIGNATURE_KEY_MAP:
+    raise ValueError(
+        'Export output type not found for mode: {}. Expected one of: {}.\n'
+        'One likely error is that V1 Estimator Modekeys were somehow passed to '
+        'this function. Please ensure that you are using the new ModeKeys.'
+        .format(mode, SIGNATURE_KEY_MAP.keys()))
+  signature_key = SIGNATURE_KEY_MAP[mode]
+  if mode_keys.is_predict(mode):
     return get_export_outputs(serving_export_outputs, predictions)
-  elif mode == mode_keys.ModeKeys.TRAIN:
-    return {mode: export_output_lib.TrainOutput(
-        loss=loss, predictions=predictions, metrics=metrics)}
-  elif mode == mode_keys.ModeKeys.TEST:
-    return {mode: export_output_lib.EvalOutput(
+  elif mode_keys.is_train(mode):
+    return {signature_key: export_output_lib.TrainOutput(
         loss=loss, predictions=predictions, metrics=metrics)}
   else:
-    raise ValueError(
-        'Export output type not found for mode: {}'.format(mode))
+    return {signature_key: export_output_lib.EvalOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
 
 
 def get_export_outputs(export_outputs, predictions):
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys.py b/tensorflow/python/saved_model/model_utils/mode_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..2912de7210f8b8900f7383b537d13bc664f15158
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys.py
@@ -0,0 +1,109 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for managing different mode strings used by Keras and Estimator models.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+class KerasModeKeys(object):
+  """Standard names for model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `TEST`: testing/evaluation mode.
+  * `PREDICT`: prediction/inference mode.
+  """
+
+  TRAIN = 'train'
+  TEST = 'test'
+  PREDICT = 'predict'
+
+
+# TODO(kathywu): Remove copy in Estimator after nightlies
+class EstimatorModeKeys(object):
+  """Standard names for Estimator model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `EVAL`: testing/evaluation mode.
+  * `PREDICT`: predication/inference mode.
+  """
+
+  TRAIN = 'train'
+  EVAL = 'eval'
+  PREDICT = 'infer'
+
+
+def is_predict(mode):
+  return mode in [KerasModeKeys.PREDICT, EstimatorModeKeys.PREDICT]
+
+
+def is_eval(mode):
+  return mode in [KerasModeKeys.TEST, EstimatorModeKeys.EVAL]
+
+
+def is_train(mode):
+  return mode in [KerasModeKeys.TRAIN, EstimatorModeKeys.TRAIN]
+
+
+class ModeKeyMap(collections.Mapping):
+  """Map using ModeKeys as keys.
+
+  This class creates an immutable mapping from modes to values. For example,
+  SavedModel export of Keras and Estimator models use this to map modes to their
+  corresponding MetaGraph tags/SignatureDef keys.
+
+  Since this class uses modes, rather than strings, as keys, both "predict"
+  (Keras's PREDICT ModeKey) and "infer" (Estimator's PREDICT ModeKey) map to the
+  same value.
+  """
+
+  def __init__(self, **kwargs):
+    self._internal_dict = {}
+    self._keys = []
+    for key in kwargs:
+      self._keys.append(key)
+      dict_key = self._get_internal_key(key)
+      if dict_key in self._internal_dict:
+        raise ValueError(
+            'Error creating ModeKeyMap. Multiple keys/values found for {} mode.'
+            .format(dict_key))
+      self._internal_dict[dict_key] = kwargs[key]
+
+  def _get_internal_key(self, key):
+    """Return keys used for the internal dictionary."""
+    if is_train(key):
+      return KerasModeKeys.TRAIN
+    if is_eval(key):
+      return KerasModeKeys.TEST
+    if is_predict(key):
+      return KerasModeKeys.PREDICT
+    raise ValueError('Invalid mode key: {}.'.format(key))
+
+  def __getitem__(self, key):
+    return self._internal_dict[self._get_internal_key(key)]
+
+  def __iter__(self):
+    return iter(self._keys)
+
+  def __len__(self):
+    return len(self._keys)
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys_test.py b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26795ef8b16a08e6426fa8399a38135dc8a4ac7c
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModeKey Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.model_utils import mode_keys
+
+
+class ModeKeyMapTest(test.TestCase):
+
+  def test_map(self):
+    mode_map = mode_keys.ModeKeyMap(**{
+        mode_keys.KerasModeKeys.PREDICT: 3,
+        mode_keys.KerasModeKeys.TEST: 1
+    })
+
+    # Test dictionary __getitem__
+    self.assertEqual(3, mode_map[mode_keys.KerasModeKeys.PREDICT])
+    self.assertEqual(3, mode_map[mode_keys.EstimatorModeKeys.PREDICT])
+    self.assertEqual(1, mode_map[mode_keys.KerasModeKeys.TEST])
+    self.assertEqual(1, mode_map[mode_keys.EstimatorModeKeys.EVAL])
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.KerasModeKeys.TRAIN]
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.EstimatorModeKeys.TRAIN]
+    with self.assertRaisesRegexp(ValueError, 'Invalid mode'):
+      _ = mode_map['serve']
+
+    # Test common dictionary methods
+    self.assertLen(mode_map, 2)
+    self.assertEqual({1, 3}, set(mode_map.values()))
+    self.assertEqual(
+        {mode_keys.KerasModeKeys.TEST, mode_keys.KerasModeKeys.PREDICT},
+        set(mode_map.keys()))
+
+    # Map is immutable
+    with self.assertRaises(TypeError):
+      mode_map[mode_keys.KerasModeKeys.TEST] = 1
+
+  def test_invalid_init(self):
+    with self.assertRaisesRegexp(ValueError, 'Multiple keys/values found'):
+      _ = mode_keys.ModeKeyMap(**{
+          mode_keys.KerasModeKeys.PREDICT: 3,
+          mode_keys.EstimatorModeKeys.PREDICT: 1
+      })
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/saved_model/nested_structure_coder.py b/tensorflow/python/saved_model/nested_structure_coder.py
index 31ee239f13955a1fdf11be7063683668f90069ac..59a2687edafdf7f8b98a6a00670ad0d975bbf1d7 100644
--- a/tensorflow/python/saved_model/nested_structure_coder.py
+++ b/tensorflow/python/saved_model/nested_structure_coder.py
@@ -34,10 +34,11 @@ import collections
 import functools
 import six
 
+from tensorflow.core.protobuf import struct_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.saved_model import struct_pb2
+from tensorflow.python.util import compat
 
 
 class NotEncodableError(Exception):
@@ -83,7 +84,6 @@ class StructureCoder(object):
     """
     return self._map_structure(nested_structure, self._get_encoders())
 
-
   def can_encode(self, nested_structure):
     """Determines whether a nested structure can be encoded into a proto.
 
@@ -305,7 +305,7 @@ class _StringCodec(object):
 
   def do_decode(self, value, decode_fn):
     del decode_fn
-    return value.string_value
+    return compat.as_str(value.string_value)
 
 
 StructureCoder.register_codec(_StringCodec())
diff --git a/tensorflow/python/saved_model/nested_structure_coder_test.py b/tensorflow/python/saved_model/nested_structure_coder_test.py
index 863630171965f57806d87c02d0e9cdb1ead7e8a2..1538fbf1271dadaa9bb7d82359f8ea38fcb95a01 100644
--- a/tensorflow/python/saved_model/nested_structure_coder_test.py
+++ b/tensorflow/python/saved_model/nested_structure_coder_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.core.protobuf import struct_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import nested_structure_coder
-from tensorflow.python.saved_model import struct_pb2
 
 
 class NestedStructureTest(test.TestCase):
diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py
index ae063202bb47c8afa46d015fa3b7874458819c95..3140d3d617d458dccb1bb6107b679b927d0d151e 100644
--- a/tensorflow/python/saved_model/revived_types.py
+++ b/tensorflow/python/saved_model/revived_types.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import versions_pb2
-from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 
 
 class VersionedTypeRegistration(object):
@@ -31,7 +31,7 @@ class VersionedTypeRegistration(object):
 
     Args:
       object_factory: A callable which takes a SavedUserObject proto and returns
-        a checkpointable object. Dependencies are added later via `setter`.
+        a trackable object. Dependencies are added later via `setter`.
       version: An integer, the producer version of this wrapper type. When
         making incompatible changes to a wrapper, add a new
         `VersionedTypeRegistration` with an incremented `version`. The most
@@ -45,11 +45,11 @@ class VersionedTypeRegistration(object):
         with this object. `min_consumer_version` should be set to the lowest
         version number which can successfully load protos saved by this
         object. If no matching registration is available on load, the object
-        will be revived with a generic checkpointable type.
+        will be revived with a generic trackable type.
 
         `min_consumer_version` and `bad_consumers` are a blunt tool, and using
         them will generally break forward compatibility: previous versions of
-        TensorFlow will revive newly saved objects as opaque checkpointable
+        TensorFlow will revive newly saved objects as opaque trackable
         objects rather than wrapped objects. When updating wrappers, prefer
         saving new information but preserving compatibility with previous
         wrapper versions. They are, however, useful for ensuring that
@@ -83,7 +83,7 @@ class VersionedTypeRegistration(object):
             bad_consumers=self._bad_consumers))
 
   def from_proto(self, proto):
-    """Recreate a checkpointable object from a SavedUserObject proto."""
+    """Recreate a trackable object from a SavedUserObject proto."""
     return self._object_factory(proto)
 
   def should_load(self, proto):
@@ -111,7 +111,7 @@ def register_revived_type(identifier, predicate, versions):
   Args:
     identifier: A unique string identifying this class of objects.
     predicate: A Boolean predicate for this registration. Takes a
-      checkpointable object as an argument. If True, `type_registration` may be
+      trackable object as an argument. If True, `type_registration` may be
       used to save and restore the object.
     versions: A list of `VersionedTypeRegistration` objects.
   """
@@ -138,7 +138,7 @@ def register_revived_type(identifier, predicate, versions):
 
 
 def serialize(obj):
-  """Create a SavedUserObject from a checkpointable object."""
+  """Create a SavedUserObject from a trackable object."""
   for identifier in _TYPE_IDENTIFIERS:
     predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
     if predicate(obj):
@@ -148,15 +148,15 @@ def serialize(obj):
 
 
 def deserialize(proto):
-  """Create a checkpointable object from a SavedUserObject proto.
+  """Create a trackable object from a SavedUserObject proto.
 
   Args:
     proto: A SavedUserObject to deserialize.
 
   Returns:
-    A tuple of (checkpointable, assignment_fn) where assignment_fn has the same
+    A tuple of (trackable, assignment_fn) where assignment_fn has the same
     signature as setattr and should be used to add dependencies to
-    `checkpointable` when they are available.
+    `trackable` when they are available.
   """
   _, type_registrations = _REVIVED_TYPE_REGISTRY.get(
       proto.identifier, (None, None))
diff --git a/tensorflow/python/saved_model/revived_types_test.py b/tensorflow/python/saved_model/revived_types_test.py
index ede5922b801e1d6606d3d86059a03eee60433ad8..8bd806f315aa7899d7e816739191a6c1af111912 100644
--- a/tensorflow/python/saved_model/revived_types_test.py
+++ b/tensorflow/python/saved_model/revived_types_test.py
@@ -19,13 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
-class CustomTestClass(tracking.AutoCheckpointable):
+class CustomTestClass(tracking.AutoTrackable):
 
   def __init__(self, version):
     self.version = version
@@ -56,7 +56,7 @@ revived_types.register_revived_type(
 class RegistrationMatchingTest(test.TestCase):
 
   def test_save_typecheck(self):
-    self.assertIs(revived_types.serialize(tracking.AutoCheckpointable()), None)
+    self.assertIs(revived_types.serialize(tracking.AutoTrackable()), None)
 
   def test_load_identifier_not_found(self):
     nothing_matches = revived_types.deserialize(
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index 9c3ace7325417549bba8497b2ef89fd5efa89f44..c2c0795f4e5201845371fa8f2931a67a49ef9943 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Exports a SavedModel from a Checkpointable Python object."""
+"""Exports a SavedModel from a Trackable Python object."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections
-import functools
 import os
 
 from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as defun
@@ -39,58 +39,127 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.saved_model import builder_impl
 from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import function_serialization
+from tensorflow.python.saved_model import nested_structure_coder
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import signature_serialization
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import compat
-from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
-DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
+_UNCOPIABLE_DTYPES = frozenset((dtypes.resource, dtypes.variant))
+
+
+# A container for an EagerTensor constant which has been copied to the exported
+# Graph.
+_CapturedConstant = collections.namedtuple(
+    "_CapturedConstant", ["eager_tensor", "graph_tensor"])
+
+
+class _AugmentedGraphView(graph_view.ObjectGraphView):
+  """An extendable graph which also tracks functions attached to objects.
+
+  Extensions through `add_object` appear in the object graph and any checkpoints
+  generated from it, even if they are not dependencies of the node they were
+  attached to in the saving program. For example a `.signatures` attribute is
+  added to exported SavedModel root objects without modifying the root object
+  itself.
+
+  Also tracks functions attached to objects in the graph, through the caching
+  `list_functions` method. Enumerating functions only through this method
+  ensures that we get a consistent view of functions, even if object attributes
+  create new functions every time they are accessed.
+  """
+
+  def __init__(self, root):
+    super(_AugmentedGraphView, self).__init__(root)
+    # Object -> (name -> dep)
+    self._extra_dependencies = object_identity.ObjectIdentityDictionary()
+    self._functions = object_identity.ObjectIdentityDictionary()
+
+  def add_object(self, parent_node, name_in_parent, subgraph_root):
+    """Attach an object to `parent_node`, overriding any existing dependency."""
+    self._extra_dependencies.setdefault(
+        parent_node, {})[name_in_parent] = subgraph_root
+
+  def list_dependencies(self, obj):
+    """Overrides a parent method to include `add_object` objects."""
+    extra_dependencies = self._extra_dependencies.get(obj, {})
+    used_names = set()
+    for name, dep in super(_AugmentedGraphView, self).list_dependencies(obj):
+      used_names.add(name)
+      if name in extra_dependencies:
+        yield base.TrackableReference(name, extra_dependencies[name])
+      else:
+        yield base.TrackableReference(name, dep)
+    for name, dep in extra_dependencies.items():
+      if name in used_names:
+        continue
+      yield base.TrackableReference(name, dep)
+
+  def list_functions(self, obj):
+    obj_functions = self._functions.get(obj, None)
+    if obj_functions is None:
+      obj_functions = obj._list_functions_for_serialization()  # pylint: disable=protected-access
+      self._functions[obj] = obj_functions
+    return obj_functions
 
 
 class _SaveableView(object):
-  """Provides a stable view over a checkpointable root.
+  """Provides a frozen view over a trackable root.
 
   This class helps creating a single stable view over an object to save. The
   saving code should access properties and functions via this class and not via
   the original object as there are cases where an object construct their
-  checkpointable attributes and functions dynamically per call and will yield
+  trackable attributes and functions dynamically per call and will yield
   different objects if invoked more than once.
+
+  Changes to the graph, for example adding objects, must happen in
+  `checkpoint_view` (an `_AugmentedGraphView`) before the `_SaveableView` is
+  constructed. Changes after the `_SaveableView` has been constructed will be
+  ignored.
   """
 
-  def __init__(self, root):
-    checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
-    self.nodes = checkpointable_objects
+  def __init__(self, checkpoint_view):
+    self.checkpoint_view = checkpoint_view
+    trackable_objects, node_ids, slot_variables = (
+        self.checkpoint_view.objects_ids_and_slot_variables())
+    self.nodes = trackable_objects
     self.node_ids = node_ids
+    self.captured_tensor_node_ids = object_identity.ObjectIdentityDictionary()
     self.slot_variables = slot_variables
-    self.functions = util.ObjectIdentityDictionary()
+    self.concrete_functions = []
 
     # Also add `Function`s as nodes.
     nodes_without_functions = list(self.nodes)
-    for obj in nodes_without_functions:
-      self.functions[obj] = self._list_functions(obj)
-      for function in self.functions[obj].values():
+    seen_function_names = set()
+    for node in nodes_without_functions:
+      for function in checkpoint_view.list_functions(node).values():
         if function not in self.node_ids:
           self.node_ids[function] = len(self.nodes)
           self.nodes.append(function)
-          # Avoids recursing into functions to see if other functions are
-          # assigned to attributes. This is sometimes true for concrete
-          # functions but not helpful.
-          self.functions[function] = {}
         if isinstance(function, def_function.Function):
           # Force listing the concrete functions for the side effects:
           #  - populate the cache for functions that have an input_signature
           #  and have not been called.
           #  - force side effects of creation of concrete functions, e.g. create
           #  variables on first run.
-          function._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+          concrete_functions = (
+              function._list_all_concrete_functions_for_serialization())  # pylint: disable=protected-access
+        else:
+          concrete_functions = [function]
+        for concrete_function in concrete_functions:
+          if concrete_function.name not in seen_function_names:
+            seen_function_names.add(concrete_function.name)
+            self.concrete_functions.append(concrete_function)
 
   @property
   def root(self):
@@ -102,131 +171,77 @@ class _SaveableView(object):
       assert self.node_ids[node] == node_id
       object_proto = proto.nodes.add()
       object_proto.slot_variables.extend(self.slot_variables.get(node, ()))
-      if isinstance(node, (def_function.Function, defun.ConcreteFunction)):
+      if isinstance(node, (def_function.Function, defun.ConcreteFunction,
+                           _CapturedConstant)):
         continue
-      for child in node._checkpoint_dependencies:  # pylint: disable=protected-access
+      for child in self.checkpoint_view.list_dependencies(node):
         child_proto = object_proto.children.add()
         child_proto.node_id = self.node_ids[child.ref]
         child_proto.local_name = child.name
-      for local_name, ref_function in self.functions[node].items():
+      for local_name, ref_function in (
+          self.checkpoint_view.list_functions(node).items()):
         child_proto = object_proto.children.add()
         child_proto.node_id = self.node_ids[ref_function]
         child_proto.local_name = local_name
 
-  def _list_functions(self, checkpointable_object):
-    """Return a dict of `Function`s of a checkpointable."""
-    functions = dict()
-    for attribute_name in dir(checkpointable_object):
-      try:
-        attribute_value = getattr(checkpointable_object, attribute_name, None)
-      except Exception:  # pylint: disable=broad-except
-        # We really don't want to throw an exception just because some object's
-        # attribute accessor is broken.
-        attribute_value = None
-      if isinstance(attribute_value, (def_function.Function,
-                                      defun.ConcreteFunction)):
-        functions[attribute_name] = attribute_value
-    return functions
-
-
-def _find_function_to_export(saveable_view):
-  """Iterate over `root`'s attributes, finding traced functions."""
-  exported_function = None
-  previous_attribute_name = None
-  functions = saveable_view.functions[saveable_view.root]
-  for name, value in sorted(functions.items()):
-    if exported_function is not None:
-      raise ValueError(
-          ("Exporting an object with no "
-           "tf.saved_model.save(..., signatures=...) "
-           "argument specified, and with more than one "
-           "@tf.function-decorated method attached to it: {}. The signature "
-           "keys for these functions are ambiguous. Specify signature "
-           "functions explicitly.").format(
-               [previous_attribute_name, name]))
-    exported_function = value
-    previous_attribute_name = name
-  if exported_function is None:
-    exported_function = functions.get(DEFAULT_SIGNATURE_ATTR, None)
-  if exported_function is None:
-    raise ValueError(
-        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
-         "argument specified, and with no @tf.function-decorated methods "
-         "attached to it. In the future this will be a supported use-case for "
-         "Python re-import, but at the moment saving a SavedModel without "
-         "signatures does not make sense, as the only consumers will expect "
-         "signatures. Either decorate a method or specify a signature function "
-         "explicitly."))
-  return exported_function
-
-
-def _canonicalize_signatures(signatures):
-  """Converts `signatures` into a dictionary of concrete functions."""
-  if not isinstance(signatures, collections.Mapping):
-    signatures = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
-  concrete_signatures = {}
-  for serving_key, signature_function in signatures.items():
-    if isinstance(signature_function, (defun.Function, def_function.Function)):
-      input_signature = signature_function._input_signature  # pylint: disable=protected-access
-      if input_signature is None:
-        raise ValueError(
-            ("Unable to use the function {} as a signature directly. Functions "
-             "used to generate serving signatures must either have an "
-             "`input_signature=` specified when constructed, or must be "
-             "converted to concrete functions using "
-             "`f.get_concrete_function(...)`.").format(signature_function))
-      signature_function = signature_function.get_concrete_function()
-    elif not isinstance(signature_function, defun.ConcreteFunction):
-      raise ValueError(
-          ("Expected a TensorFlow function to generate a signature for, but "
-           "got {}. Python functions may be decorated with "
-           "`@tf.function(input_signature=...)` and passed as signatures "
-           "directly, or created without a signature using `@tf.function` "
-           "and then converted to a concrete TensorFlow function using "
-           "`f.get_concrete_function(...)`.").format(signature_function))
-    concrete_signatures[serving_key] = signature_function
-  return concrete_signatures
-
-
-def _is_flat(sequence):
-  sequence_flat = nest.flatten(sequence)
-  try:
-    nest.assert_same_structure(sequence_flat, sequence)
-    return True
-  except ValueError:
-    return False
-  except TypeError:
-    return False
-
-
-def _normalize_outputs(outputs, function_name, signature_key):
-  """Construct an output dictionary from unnormalized function outputs."""
-  if isinstance(outputs, collections.Mapping):
-    for key, value in outputs.items():
-      if not isinstance(value, ops.Tensor):
-        raise ValueError(
-            ("Got a dictionary containing non-Tensor value {} for key {} "
-             "in the output of the function {} used to generate a SavedModel "
-             "signature. Dictionaries outputs for functions used as signatures "
-             "should have one Tensor output per string key.")
-            .format(value, key, compat.as_str_any(function_name)))
-    return outputs
-  else:
-    original_outputs = outputs
-    if not isinstance(outputs, collections.Sequence):
-      outputs = [outputs]
-    if not _is_flat(outputs):
-      raise ValueError(
-          ("Got non-flat outputs '{}' from '{}' for SavedModel "
-           "signature '{}'. Signatures have one Tensor per output, so "
-           "to have predictable names Python functions used to generate "
-           "these signatures should avoid outputting Tensors in nested "
-           "structures.")
-          .format(original_outputs, function_name, signature_key))
-    return {("output_{}".format(output_index)): output
-            for output_index, output
-            in enumerate(outputs)}
+  def map_resources(self):
+    """Makes new resource handle ops corresponding to existing resource tensors.
+
+    Creates resource handle ops in the current default graph, whereas
+    `accessible_objects` will be from an eager context. Resource mapping adds
+    resource handle ops to the main GraphDef of a SavedModel, which allows the
+    C++ loader API to interact with variables.
+
+    Returns:
+      A tuple of (object_map, resource_map, asset_info):
+        object_map: A dictionary mapping from object in `accessible_objects` to
+          replacement objects created to hold the new resource tensors.
+        resource_map: A dictionary mapping from resource tensors extracted from
+          `accessible_objects` to newly created resource tensors.
+        asset_info: An _AssetInfo tuple describing external assets referenced
+          from accessible_objects.
+    """
+    # Only makes sense when adding to the export Graph
+    assert not context.executing_eagerly()
+    # TODO(allenl): Handle MirroredVariables and other types of variables which
+    # may need special casing.
+    object_map = object_identity.ObjectIdentityDictionary()
+    resource_map = {}
+    asset_info = _AssetInfo(
+        asset_defs=[],
+        asset_initializers_by_resource={},
+        asset_filename_map={},
+        asset_index={})
+    for node_id, obj in enumerate(self.nodes):
+      if isinstance(obj, tracking.TrackableResource):
+        new_resource = obj._create_resource()  # pylint: disable=protected-access
+        resource_map[obj.resource_handle] = new_resource
+        self.captured_tensor_node_ids[obj.resource_handle] = node_id
+      elif resource_variable_ops.is_resource_variable(obj):
+        new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
+        object_map[obj] = new_variable
+        resource_map[obj.handle] = new_variable.handle
+        self.captured_tensor_node_ids[obj.handle] = node_id
+      elif isinstance(obj, tracking.TrackableAsset):
+        _process_asset(obj, asset_info, resource_map)
+        self.captured_tensor_node_ids[obj.asset_path] = node_id
+
+    for concrete_function in self.concrete_functions:
+      for capture in concrete_function.captured_inputs:
+        if (isinstance(capture, ops.EagerTensor)
+            and capture.dtype not in _UNCOPIABLE_DTYPES
+            and capture not in self.captured_tensor_node_ids):
+          copied_tensor = constant_op.constant(capture.numpy())
+          node_id = len(self.nodes)
+          node = _CapturedConstant(
+              eager_tensor=capture, graph_tensor=copied_tensor)
+          self.nodes.append(node)
+          self.node_ids[capture] = node_id
+          self.node_ids[node] = node_id
+          self.captured_tensor_node_ids[capture] = node_id
+          resource_map[capture] = copied_tensor
+
+    return object_map, resource_map, asset_info
 
 
 def _tensor_dict_to_tensorinfo(tensor_dict):
@@ -257,18 +272,12 @@ def _map_captures_to_created_tensors(
   for exterior, interior in original_captures.items():
     mapped_resource = resource_map.get(exterior, None)
     if mapped_resource is None:
-      if exterior.dtype == dtypes.resource:
-        raise AssertionError(
-            ("Tried to export a function which references untracked stateful "
-             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
-             "be tracked by the main object. Objects may be tracked by "
-             "assigning them to an attribute of another tracked object, or to "
-             "an attribute of the main object directly.")
-            .format(interior))
-      else:
-        # This is a captured Tensor, but it's not a resource. We'll just add it
-        # to the graph as a constant.
-        mapped_resource = constant_op.constant(exterior.numpy())
+      raise AssertionError(
+          ("Tried to export a function which references untracked object {}."
+           "TensorFlow objects (e.g. tf.Variable) captured by functions must "
+           "be tracked by assigning them to an attribute of a tracked object "
+           "or assigned to an attribute of the main object directly.")
+          .format(interior))
     export_captures.append(mapped_resource)
   return export_captures
 
@@ -362,8 +371,8 @@ def _generate_signatures(signature_functions, resource_map):
 
   Args:
     signature_functions: A dictionary mapping string keys to concrete TensorFlow
-      functions (e.g. from `_canonicalize_signatures`) which will be used to
-      generate SignatureDefs.
+      functions (e.g. from `signature_serialization.canonicalize_signatures`)
+      which will be used to generate SignatureDefs.
     resource_map: A dictionary mapping from resource tensors in the eager
       context to resource tensors in the Graph being exported. This dictionary
       is used to re-bind resources captured by functions to tensors which will
@@ -394,13 +403,12 @@ def _generate_signatures(signature_functions, resource_map):
     mapped_inputs, exterior_argument_placeholders = (
         _map_function_arguments_to_created_inputs(
             argument_inputs, signature_key, function.name))
-    outputs = _normalize_outputs(
-        _call_function_with_mapped_captures(
-            function, mapped_inputs, resource_map),
-        function.name, signature_key)
+    outputs = _call_function_with_mapped_captures(
+        function, mapped_inputs, resource_map)
     signatures[signature_key] = signature_def_utils.build_signature_def(
         _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
-        _tensor_dict_to_tensorinfo(outputs))
+        _tensor_dict_to_tensorinfo(outputs),
+        method_name=signature_constants.PREDICT_METHOD_NAME)
   return signatures
 
 
@@ -409,13 +417,16 @@ def _trace_resource_initializers(accessible_objects):
   resource_initializers = []
 
   def _wrap_initializer(obj):
-    obj.initialize()
+    obj._initialize()  # pylint: disable=protected-access
     return constant_op.constant(1.)  # Dummy control output
 
+  def _wrap_obj_initializer(obj):
+    return lambda: _wrap_initializer(obj)
+
   for obj in accessible_objects:
     if isinstance(obj, tracking.TrackableResource):
       resource_initializers.append(def_function.function(
-          functools.partial(_wrap_initializer, obj),
+          _wrap_obj_initializer(obj),
           # All inputs are captures.
           input_signature=[]).get_concrete_function())
   return resource_initializers
@@ -455,57 +466,13 @@ def _process_asset(trackable_asset, asset_info, resource_map):
   asset_def.filename = path
   asset_def.tensor_info.name = asset_path_initializer.name
   asset_info.asset_defs.append(asset_def)
-  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+  asset_info.asset_initializers_by_resource[original_variable] = (
       asset_variable.initializer)
   asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
-  resource_map[original_variable.handle] = asset_variable.handle
+  resource_map[original_variable] = asset_variable
 
 
-def _map_resources(accessible_objects):
-  """Makes new resource handle ops corresponding to existing resource tensors.
-
-  Creates resource handle ops in the current default graph, whereas
-  `accessible_objects` will be from an eager context. Resource mapping adds
-  resource handle ops to the main GraphDef of a SavedModel, which allows the C++
-  loader API to interact with variables.
-
-  Args:
-    accessible_objects: A list of objects, some of which may contain resources,
-      to create replacements for.
-
-  Returns:
-    A tuple of (object_map, resource_map, asset_info):
-      object_map: A dictionary mapping from object in `accessible_objects` to
-        replacement objects created to hold the new resource tensors.
-      resource_map: A dictionary mapping from resource tensors extracted from
-        `accessible_objects` to newly created resource tensors.
-      asset_info: An _AssetInfo tuple describing external assets referenced from
-        accessible_objects.
-  """
-  # TODO(allenl): Handle MirroredVariables and other types of variables which
-  # may need special casing.
-  object_map = util.ObjectIdentityDictionary()
-  resource_map = {}
-  asset_info = _AssetInfo(
-      asset_defs=[],
-      asset_initializers_by_resource={},
-      asset_filename_map={},
-      asset_index={})
-  for obj in accessible_objects:
-    if isinstance(obj, tracking.TrackableResource):
-      new_resource = obj.create_resource()
-      resource_map[obj.resource_handle] = new_resource
-    elif resource_variable_ops.is_resource_variable(obj):
-      new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
-      object_map[obj] = new_variable
-      resource_map[obj.handle] = new_variable.handle
-    elif isinstance(obj, tracking.TrackableAsset):
-      _process_asset(obj, asset_info, resource_map)
-  return object_map, resource_map, asset_info
-
-
-def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
-                         object_saver):
+def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions):
   """Generates a MetaGraph which calls `signature_functions`.
 
   Args:
@@ -513,7 +480,6 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
     saveable_view: The _SaveableView being exported.
     signature_functions: A dictionary mapping signature keys to concrete
       functions containing signatures to add to the MetaGraph.
-    object_saver: A CheckpointableSaver to add to the MetaGraph.
 
   Returns:
     An _AssetInfo, which contains information to help creating the SavedModel.
@@ -526,7 +492,7 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
   exported_graph = ops.Graph()
   resource_initializer_ops = []
   with exported_graph.as_default():
-    object_map, resource_map, asset_info = _map_resources(accessible_objects)
+    object_map, resource_map, asset_info = saveable_view.map_resources()
     for resource_initializer_function in resource_initializer_functions:
       asset_dependencies = []
       for capture in resource_initializer_function.graph.external_captures:
@@ -538,6 +504,8 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
         resource_initializer_ops.append(
             _call_function_with_mapped_captures(
                 resource_initializer_function, [], resource_map))
+    resource_initializer_ops.extend(
+        asset_info.asset_initializers_by_resource.values())
     with ops.control_dependencies(resource_initializer_ops):
       init_op = control_flow_ops.no_op()
     # Add the same op to the main_op collection and to the init_op
@@ -553,30 +521,17 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
   # gathering from the eager context so Optimizers save the right set of
   # variables, but want any operations associated with the save/restore to be in
   # the exported graph (thus the `to_graph` argument).
-  saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
-
-  # We must instantiate and list all concrete functions of `Function`s while in
-  # eager mode so they end up added to the graph and can later be used by the
-  # object based saved model.
-  concrete_functions = []
-  for obj in accessible_objects:
-    for function in saveable_view.functions[obj].values():
-      if isinstance(function, defun.ConcreteFunction):
-        concrete_functions.append(function)
-      else:
-        concrete_functions.extend(
-            function._list_all_concrete_functions_for_serialization())  # pylint: disable=protected-access
+  saver = functional_saver.Saver(
+      saveable_view.checkpoint_view.frozen_saveable_objects(
+          object_map=object_map, to_graph=exported_graph))
 
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
-    for concrete_function in concrete_functions:
+    for concrete_function in saveable_view.concrete_functions:
       concrete_function.add_to_graph()
     saver_def = saver.to_proto()
     meta_graph_def.saver_def.CopyFrom(saver_def)
   graph_def = exported_graph.as_graph_def(add_shapes=True)
-  # Clean reference cycles so repeated export()s don't make work for the garbage
-  # collector.
-  ops.dismantle_graph(exported_graph)
 
   meta_graph_def.graph_def.CopyFrom(graph_def)
   meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
@@ -584,37 +539,30 @@ def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions,
   for signature_key, signature in signatures.items():
     meta_graph_def.signature_def[signature_key].CopyFrom(signature)
   meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
-  return asset_info
+  return asset_info, exported_graph
 
 
-def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
+def _serialize_object_graph(saveable_view, asset_file_def_index):
   """Save a SavedObjectGraph proto for `root`."""
-  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # SavedObjectGraph is similar to the TrackableObjectGraph proto in the
   # checkpoint. It will eventually go into the SavedModel.
   proto = saved_object_graph_pb2.SavedObjectGraph()
   saveable_view.fill_object_graph_proto(proto)
 
-  node_ids = util.ObjectIdentityDictionary()
-  for i, obj in enumerate(saveable_view.nodes):
-    node_ids[obj] = i
-    if resource_variable_ops.is_resource_variable(obj):
-      node_ids[obj.handle] = i
-    elif isinstance(obj, tracking.TrackableAsset):
-      node_ids[obj.asset_path.handle] = i
+  coder = nested_structure_coder.StructureCoder()
+  for concrete_function in saveable_view.concrete_functions:
+    serialized = function_serialization.serialize_concrete_function(
+        concrete_function, saveable_view.captured_tensor_node_ids, coder)
+    if serialized is not None:
+      proto.concrete_functions[concrete_function.name].CopyFrom(
+          serialized)
 
   for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
-    _write_object_proto(obj, obj_proto, asset_file_def_index, node_ids)
+    _write_object_proto(obj, obj_proto, asset_file_def_index)
+  return proto
 
-  extra_asset_dir = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
-  file_io.recursive_create_dir(extra_asset_dir)
-  object_graph_filename = os.path.join(
-      extra_asset_dir, compat.as_bytes("object_graph.pb"))
-  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
 
-
-def _write_object_proto(obj, proto, asset_file_def_index, node_ids):
+def _write_object_proto(obj, proto, asset_file_def_index):
   """Saves an object into SavedObject proto."""
   if isinstance(obj, tracking.TrackableAsset):
     proto.asset.SetInParent()
@@ -626,10 +574,14 @@ def _write_object_proto(obj, proto, asset_file_def_index, node_ids):
     proto.variable.shape.CopyFrom(obj.shape.as_proto())
   elif isinstance(obj, def_function.Function):
     proto.function.CopyFrom(
-        function_serialization.serialize_function(obj, node_ids))
+        function_serialization.serialize_function(obj))
   elif isinstance(obj, defun.ConcreteFunction):
-    proto.concrete_function.CopyFrom(
-        function_serialization.serialize_concrete_function(obj, node_ids))
+    proto.bare_concrete_function.CopyFrom(
+        function_serialization.serialize_bare_concrete_function(obj))
+  elif isinstance(obj, _CapturedConstant):
+    proto.constant.operation = obj.graph_tensor.op.name
+  elif isinstance(obj, tracking.TrackableResource):
+    proto.resource.SetInParent()
   else:
     registered_type_proto = revived_types.serialize(obj)
     if registered_type_proto is None:
@@ -641,10 +593,11 @@ def _write_object_proto(obj, proto, asset_file_def_index, node_ids):
     proto.user_object.CopyFrom(registered_type_proto)
 
 
-@tf_export("saved_model.save", v1=["saved_model.experimental.save"])
+@tf_export("saved_model.save",
+           v1=["saved_model.save", "saved_model.experimental.save"])
 def save(obj, export_dir, signatures=None):
   # pylint: disable=line-too-long
-  """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+  """Exports the Trackable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
 
   Example usage:
 
@@ -688,7 +641,11 @@ def save(obj, export_dir, signatures=None):
   which case outputs will be numbered, or a dictionary mapping string keys to
   `Tensor`, in which case the keys will be used to name outputs.
 
-  Since `tf.keras.Model` objects are also Checkpointable, this function can be
+  Signatures are available in objects returned by `tf.saved_model.load` as a
+  `.signatures` attribute. This is a reserved attribute: `tf.saved_model.save`
+  on an object with a custom `.signatures` attribute will raise an exception.
+
+  Since `tf.keras.Model` objects are also Trackable, this function can be
   used to export Keras models. For example, exporting with a signature
   specified:
 
@@ -774,7 +731,7 @@ def save(obj, export_dir, signatures=None):
   prior to the TensorFlow 2.0 release.
 
   Args:
-    obj: A checkpointable object to export.
+    obj: A trackable object to export.
     export_dir: A directory in which to write the SavedModel.
     signatures: Optional, either a `tf.function` with an input signature
       specified or the result of `f.get_concrete_function` on a
@@ -787,7 +744,7 @@ def save(obj, export_dir, signatures=None):
       `tf.saved_model.signature_constants` module.
 
   Raises:
-    ValueError: If `obj` is not checkpointable.
+    ValueError: If `obj` is not trackable.
 
   @compatibility(eager)
   Not supported when graph building. From TensorFlow 1.x,
@@ -808,29 +765,37 @@ def save(obj, export_dir, signatures=None):
             "tf.enable_eager_execution() must run first when calling it from "
             "TensorFlow 1.x.")
   # pylint: enable=line-too-long
-  if not isinstance(obj, base.Checkpointable):
+  if not isinstance(obj, base.Trackable):
     raise ValueError(
-        "Expected a Checkpointable object for export, got {}.".format(obj))
+        "Expected a Trackable object for export, got {}.".format(obj))
 
-  # Use _SaveableView to provide a stable listing of properties and functions.
+  checkpoint_graph_view = _AugmentedGraphView(obj)
+  if signatures is None:
+    signatures = signature_serialization.find_function_to_export(
+        checkpoint_graph_view)
+
+  signatures = signature_serialization.canonicalize_signatures(signatures)
+  signature_serialization.validate_saveable_view(checkpoint_graph_view)
+  signature_map = signature_serialization.create_signature_map(signatures)
+  checkpoint_graph_view.add_object(
+      parent_node=checkpoint_graph_view.root,
+      name_in_parent=signature_serialization.SIGNATURE_ATTRIBUTE_NAME,
+      subgraph_root=signature_map)
+
+  # Use _SaveableView to provide a frozen listing of properties and functions.
   # Note we run this twice since, while constructing the view the first time
   # there can be side effects of creating variables.
-  _ = _SaveableView(obj)
-  saveable_view = _SaveableView(obj)
-
-  if signatures is None:
-    signatures = _find_function_to_export(saveable_view)
-  signatures = _canonicalize_signatures(signatures)
+  _ = _SaveableView(checkpoint_graph_view)
+  saveable_view = _SaveableView(checkpoint_graph_view)
 
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
   saved_model = saved_model_pb2.SavedModel()
   meta_graph_def = saved_model.meta_graphs.add()
-  # TODO(andresp): Should this be using saveable_view?
-  object_saver = util.CheckpointableSaver(obj)
-  asset_info = _fill_meta_graph_def(
-      meta_graph_def, saveable_view, signatures, object_saver)
+  object_saver = util.TrackableSaver(checkpoint_graph_view)
+  asset_info, exported_graph = _fill_meta_graph_def(
+      meta_graph_def, saveable_view, signatures)
   saved_model.saved_model_schema_version = (
       constants.SAVED_MODEL_SCHEMA_VERSION)
   # So far we've just been generating protocol buffers with no I/O. Now we write
@@ -843,5 +808,11 @@ def save(obj, export_dir, signatures=None):
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+  object_graph_proto = _serialize_object_graph(
+      saveable_view, asset_info.asset_index)
+  meta_graph_def.object_graph_def.CopyFrom(object_graph_proto)
   file_io.write_string_to_file(path, saved_model.SerializeToString())
-  _write_object_graph(saveable_view, export_dir, asset_info.asset_index)
+  # Clean reference cycles so repeated export()s don't make work for the garbage
+  # collector. Before this point we need to keep references to captured
+  # constants in the saved graph.
+  ops.dismantle_graph(exported_graph)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index 5f9dbe2c2550c5a806e1a645cc1cd2a340ac64c5..ca1d5738ed7a7b0d0bf8ee2488fbedba258e2c3c 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel save."""
+"""Tests for trackable object SavedModel save."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,16 +23,16 @@ import sys
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import function
 from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
@@ -41,9 +41,9 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.keras.optimizer_v2 import adam
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import compat
 
 
 class _ModelWithOptimizer(util.Checkpoint):
@@ -87,7 +87,7 @@ def _import_and_infer(
 class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -99,7 +99,7 @@ class SaveTest(test.TestCase):
         _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda z: {"out": 2. * z})
     root.f(constant_op.constant(1.))
@@ -115,16 +115,35 @@ class SaveTest(test.TestCase):
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: 2. * x)
     root.f(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
-        ValueError, "must be converted to concrete functions"):
+        ValueError, "Expected a TensorFlow function"):
       save.save(root, save_dir, root.f)
 
+  def test_captures_unreachable_variable(self):
+    root = tracking.AutoTrackable()
+    unreachable_variable = variables.Variable([5.0, 2.0])
+    root.reachable_variable = variables.Variable([1.0, 3.0])
+
+    @def_function.function
+    def increase_variable(x):
+      return 2 * unreachable_variable * x + root.reachable_variable
+
+    root.f = increase_variable
+
+    self.assertAllEqual([101.0, 83.0],
+                        root.f(constant_op.constant([10.0, 20.0])).numpy())
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+
+    with self.assertRaisesRegexp(KeyError, "not reachable from root"):
+      save.save(root, save_dir)
+
   def test_nested_inputs(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x[0],
         input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
@@ -137,7 +156,7 @@ class SaveTest(test.TestCase):
       root.f.get_concrete_function()
 
   def test_nested_outputs(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
@@ -158,7 +177,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_variable(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(3.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -183,11 +202,6 @@ class SaveTest(test.TestCase):
         second_loss,
         _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
 
-  def test_trivial_save_exception(self):
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "signature"):
-      save.save(tracking.AutoCheckpointable(), save_dir)
-
   def test_single_method_default_signature(self):
     model = _ModelWithOptimizer()
     x = constant_op.constant([[3., 4.]])
@@ -200,7 +214,7 @@ class SaveTest(test.TestCase):
                                     {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
-    model = tracking.AutoCheckpointable()
+    model = tracking.AutoTrackable()
     model.f = def_function.function(lambda: 3., input_signature=())
     model.f()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -208,28 +222,11 @@ class SaveTest(test.TestCase):
     self.assertAllClose({"output_0": 3.},
                         _import_and_infer(save_dir, {}))
 
-  def test_ambiguous_signatures(self):
-    model = _ModelWithOptimizer()
-    x = constant_op.constant([[3., 4.]])
-    y = constant_op.constant([2.])
-    model.call(x, y)
-    model.second_function = def_function.function(lambda: 1.)
+  def test_single_function_no_signature(self):
+    model = tracking.AutoTrackable()
+    model.f = def_function.function(lambda: 3.)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "call.*second_function"):
-      save.save(model, save_dir)
-
-  def test_no_signature(self):
-
-    class Model(util.Checkpoint):
-
-      def call(self, inputs):
-        return inputs * 2.
-
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    model = Model()
-    with self.assertRaisesRegexp(
-        ValueError, "no @tf.function-decorated methods"):
-      save.save(model, save_dir)
+    save.save(model, save_dir)
 
   def test_find_default_save_function(self):
 
@@ -307,29 +304,13 @@ class SaveTest(test.TestCase):
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
-  def test_subclassed_no_signature(self):
-
-    class Subclassed(training.Model):
-
-      def call(self, inputs):
-        return inputs * 2.
-
+  def test_signature_attribute_reserved(self):
+    root = util.Checkpoint(signatures=variables.Variable(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    model = Subclassed()
-    with self.assertRaisesRegexp(
-        ValueError, "no @tf.function-decorated methods"):
-      save.save(model, save_dir)
-
-    traced_call = def_function.function(
-        model.call,
-        input_signature=(tensor_spec.TensorSpec(
-            (None, None),
-            dtype=dtypes.float32),))
-    save.save(model, save_dir, traced_call)
-    self.assertAllClose({"output_0": [[8., 10.], [10., 12.]]},
-                        _import_and_infer(
-                            save_dir,
-                            {"inputs": [[4., 5.], [5., 6.]]}))
+    with self.assertRaisesRegexp(ValueError, "del obj.signatures"):
+      save.save(root, save_dir)
+    del root.signatures
+    save.save(root, save_dir)
 
 
 class AssetTests(test.TestCase):
@@ -340,6 +321,18 @@ class AssetTests(test.TestCase):
     with open(self._vocab_path, "w") as f:
       f.write("alpha\nbeta\ngamma\n")
 
+  def test_asset_path_returned(self):
+    root = tracking.AutoTrackable()
+    root.path = tracking.TrackableAsset(self._vocab_path)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    root.get_asset = def_function.function(lambda: root.path.asset_path)
+    save.save(root, save_dir, signatures=root.get_asset.get_concrete_function())
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    file_io.rename(save_dir, second_dir)
+    imported_path = _import_and_infer(second_dir, {})["output_0"]
+    self.assertIn(compat.as_str_any(second_dir),
+                  compat.as_str_any(imported_path))
+
   def test_table(self):
     initializer = lookup_ops.TextFileInitializer(
         self._vocab_path,
@@ -369,7 +362,7 @@ class AssetTests(test.TestCase):
         _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
 
   def test_unused_asset(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index fcde6b47e4ff10dbd84801e08597591a10818d51..9c926d789f4199666e2ffb68bdc9134751ba17e8 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -29,6 +29,7 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model.load import load
 from tensorflow.python.saved_model.save import save
 # pylint: enable=unused-import
 # pylint: disable=wildcard-import
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 0efe1763430eade223801b63f958405212eebe34..525d18d18e186c3a9bc551150a7fe5fcd60f0356 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -136,6 +136,9 @@ tf_export(
 ################################################################################
 # Train/Eval API constants.
 # Not exported while export_all_saved_models is experimental.
+DEFAULT_TRAIN_SIGNATURE_DEF_KEY = "train"
+
+DEFAULT_EVAL_SIGNATURE_DEF_KEY = "eval"
 
 SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index f6e6e1d13ecdea684f14dcaaa39f1c66f72ac352..2e0a0afeec630eb97467d6967d989dd9bf5ce898 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -30,7 +30,6 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export(
-    'saved_model.build_signature_def',
     v1=[
         'saved_model.build_signature_def',
         'saved_model.signature_def_utils.build_signature_def'
@@ -63,7 +62,6 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
 
 
 @tf_export(
-    'saved_model.regression_signature_def',
     v1=[
         'saved_model.regression_signature_def',
         'saved_model.signature_def_utils.regression_signature_def'
@@ -112,7 +110,6 @@ def regression_signature_def(examples, predictions):
 
 
 @tf_export(
-    'saved_model.classification_signature_def',
     v1=[
         'saved_model.classification_signature_def',
         'saved_model.signature_def_utils.classification_signature_def'
@@ -172,7 +169,6 @@ def classification_signature_def(examples, classes, scores):
 
 
 @tf_export(
-    'saved_model.predict_signature_def',
     v1=[
         'saved_model.predict_signature_def',
         'saved_model.signature_def_utils.predict_signature_def'
@@ -270,7 +266,6 @@ def _supervised_signature_def(
 
 
 @tf_export(
-    'saved_model.is_valid_signature',
     v1=[
         'saved_model.is_valid_signature',
         'saved_model.signature_def_utils.is_valid_signature'
diff --git a/tensorflow/python/saved_model/signature_serialization.py b/tensorflow/python/saved_model/signature_serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cd64ee5cadabeced0bc22cf648a5ce03e8f3aee
--- /dev/null
+++ b/tensorflow/python/saved_model/signature_serialization.py
@@ -0,0 +1,263 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers for working with signatures in tf.saved_model.save."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training.tracking import base
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
+SIGNATURE_ATTRIBUTE_NAME = "signatures"
+
+
+def _get_signature(function):
+  if (isinstance(function, (defun.Function, def_function.Function)) and
+      function.input_signature is not None):
+    function = function.get_concrete_function()
+  if not isinstance(function, defun.ConcreteFunction):
+    return None
+  return function
+
+
+def _valid_signature(concrete_function):
+  """Returns whether concrete function can be converted to a signature."""
+  if not concrete_function.outputs:
+    # Functions without outputs don't make sense as signatures. We just don't
+    # have any way to run an Operation with no outputs as a SignatureDef in the
+    # 1.x style.
+    return False
+  try:
+    _normalize_outputs(concrete_function.structured_outputs, "unused", "unused")
+  except ValueError:
+    return False
+  return True
+
+
+def find_function_to_export(saveable_view):
+  """Function to export, None if no suitable function was found."""
+  # If the user did not specify signatures, check the root object for a function
+  # that can be made into a signature.
+  functions = saveable_view.list_functions(saveable_view.root)
+  signature = functions.get(DEFAULT_SIGNATURE_ATTR, None)
+  if signature is not None:
+    return signature
+
+  # TODO(andresp): Discuss removing this behaviour. It can lead to WTFs when a
+  # user decides to annotate more functions with tf.function and suddenly
+  # serving that model way later in the process stops working.
+  possible_signatures = []
+  for function in functions.values():
+    concrete = _get_signature(function)
+    if concrete is not None and _valid_signature(concrete):
+      possible_signatures.append(concrete)
+  if len(possible_signatures) == 1:
+    single_function = possible_signatures[0]
+    signature = _get_signature(single_function)
+    if signature and  _valid_signature(signature):
+      return signature
+  return None
+
+
+def canonicalize_signatures(signatures):
+  """Converts `signatures` into a dictionary of concrete functions."""
+  if signatures is None:
+    return {}
+  if not isinstance(signatures, collections.Mapping):
+    signatures = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
+  concrete_signatures = {}
+  for signature_key, function in signatures.items():
+    signature_function = _get_signature(function)
+    if signature_function is None:
+      raise ValueError(
+          ("Expected a TensorFlow function to generate a signature for, but "
+           "got {}. Only `tf.functions` with an input signature or "
+           "concrete functions can be used as a signature.").format(function))
+
+    # Re-wrap the function so that it returns a dictionary of Tensors. This
+    # matches the format of 1.x-style signatures.
+    # pylint: disable=cell-var-from-loop
+    @def_function.function
+    def signature_wrapper(**kwargs):
+      structured_outputs = signature_function(**kwargs)
+      return _normalize_outputs(
+          structured_outputs, signature_function.name, signature_key)
+    # TODO(b/123902469): Use ConcreteFunction.structured_inputs once their names
+    # always match keyword arguments.
+    tensor_spec_signature = {}
+    for keyword, tensor in zip(
+        signature_function._arg_keywords,  # pylint: disable=protected-access
+        signature_function.inputs):
+      keyword = compat.as_str(keyword)
+      tensor_spec_signature[keyword] = tensor_spec.TensorSpec.from_tensor(
+          tensor, name=keyword)
+    final_concrete = signature_wrapper.get_concrete_function(
+        **tensor_spec_signature)
+    # pylint: disable=protected-access
+    if len(final_concrete._arg_keywords) == 1:
+      # If there is only one input to the signature, a very common case, then
+      # ordering is unambiguous and we can let people pass a positional
+      # argument. Since SignatureDefs are unordered (protobuf "map") multiple
+      # arguments means we need to be keyword-only.
+      final_concrete._num_positional_args = 1
+    else:
+      final_concrete._num_positional_args = 0
+    # pylint: enable=protected-access
+    concrete_signatures[signature_key] = final_concrete
+    # pylint: enable=cell-var-from-loop
+  return concrete_signatures
+
+
+def _is_flat(sequence):
+  sequence_flat = nest.flatten(sequence)
+  try:
+    nest.assert_same_structure(sequence_flat, sequence)
+    return True
+  except ValueError:
+    return False
+  except TypeError:
+    return False
+
+
+def _normalize_outputs(outputs, function_name, signature_key):
+  """Construct an output dictionary from unnormalized function outputs."""
+  if isinstance(outputs, collections.Mapping):
+    for key, value in outputs.items():
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            ("Got a dictionary containing non-Tensor value {} for key {} "
+             "in the output of the function {} used to generate a SavedModel "
+             "signature. Dictionaries outputs for functions used as signatures "
+             "should have one Tensor output per string key.")
+            .format(value, key, compat.as_str_any(function_name)))
+    return outputs
+  else:
+    original_outputs = outputs
+    if not isinstance(outputs, collections.Sequence):
+      outputs = [outputs]
+    if not _is_flat(outputs):
+      raise ValueError(
+          ("Got non-flat outputs '{}' from '{}' for SavedModel "
+           "signature '{}'. Signatures have one Tensor per output, so "
+           "to have predictable names Python functions used to generate "
+           "these signatures should avoid outputting Tensors in nested "
+           "structures.")
+          .format(original_outputs, function_name, signature_key))
+    return {("output_{}".format(output_index)): output
+            for output_index, output
+            in enumerate(outputs)}
+
+
+# _SignatureMap is immutable to ensure that users do not expect changes to be
+# reflected in the SavedModel. Using public APIs, tf.saved_model.load() is the
+# only way to create a _SignatureMap and there is no way to modify it. So we can
+# safely ignore/overwrite ".signatures" attributes attached to objects being
+# saved if they contain a _SignatureMap. A ".signatures" attribute containing
+# any other type (e.g. a regular dict) will raise an exception asking the user
+# to first "del obj.signatures" if they want it overwritten.
+class _SignatureMap(collections.Mapping, base.Trackable):
+  """A collection of SavedModel signatures."""
+
+  def __init__(self):
+    self._signatures = {}
+
+  def _add_signature(self, name, concrete_function):
+    """Adds a signature to the _SignatureMap."""
+    # Ideally this object would be immutable, but restore is streaming so we do
+    # need a private API for adding new signatures to an existing object.
+    self._signatures[name] = concrete_function
+
+  def __getitem__(self, key):
+    return self._signatures[key]
+
+  def __iter__(self):
+    return iter(self._signatures)
+
+  def __len__(self):
+    return len(self._signatures)
+
+  def __repr__(self):
+    return "_SignatureMap({})".format(self._signatures)
+
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if isinstance(value, (def_function.Function, defun.ConcreteFunction))
+    }
+
+
+revived_types.register_revived_type(
+    "signature_map",
+    lambda obj: isinstance(obj, _SignatureMap),
+    versions=[revived_types.VersionedTypeRegistration(
+        # Standard dependencies are enough to reconstruct the trackable
+        # items in dictionaries, so we don't need to save any extra information.
+        object_factory=lambda proto: _SignatureMap(),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_SignatureMap._add_signature  # pylint: disable=protected-access
+    )])
+
+
+def create_signature_map(signatures):
+  """Creates an object containing `signatures`."""
+  signature_map = _SignatureMap()
+  for name, func in signatures.items():
+    # This true of any signature that came from canonicalize_signatures. Here as
+    # a sanity check on saving; crashing on load (e.g. in _add_signature) would
+    # be more problematic in case future export changes violated these
+    # assertions.
+    assert isinstance(func, defun.ConcreteFunction)
+    assert isinstance(func.structured_outputs, collections.Mapping)
+    # pylint: disable=protected-access
+    if len(func._arg_keywords) == 1:
+      assert 1 == func._num_positional_args
+    else:
+      assert 0 == func._num_positional_args
+    signature_map._add_signature(name, func)
+    # pylint: enable=protected-access
+  return signature_map
+
+
+def validate_saveable_view(saveable_view):
+  """Performs signature-related sanity checks on `saveable_view`."""
+  for name, dep in saveable_view.list_dependencies(
+      saveable_view.root):
+    if name == SIGNATURE_ATTRIBUTE_NAME:
+      if not isinstance(dep, _SignatureMap):
+        raise ValueError(
+            ("Exporting an object {} which has an attribute named "
+             "'{signatures}'. This is a reserved attribute used to store "
+             "SavedModel signatures in objects which come from "
+             "`tf.saved_model.load`. Delete this attribute "
+             "(e.g. 'del obj.{signatures}') before saving if this shadowing is "
+             "acceptable.").format(
+                 saveable_view.root,
+                 signatures=SIGNATURE_ATTRIBUTE_NAME))
+      break
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 8c84c9fbe4d8e65273433dc98f9da34a2183f90e..7793d4921444de45966023aff087e7865068e251 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -33,7 +33,7 @@ tf_export(
 # Tag for the `training` graph.
 TRAINING = "train"
 tf_export(
-    "saved_model.TRANING",
+    "saved_model.TRAINING",
     v1=["saved_model.TRAINING",
         "saved_model.tag_constants.TRAINING"]).export_constant(
             __name__, "TRAINING")
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index a82007fd545ca9e088411bcd5234477b8801e995..2e7b2080574e875233181a1476eb328a07e718c5 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -50,7 +50,7 @@ def build_tensor_info(tensor):
   Args:
     tensor: Tensor or SparseTensor whose name, dtype and shape are used to
         build the TensorInfo. For SparseTensors, the names of the three
-        constitutent Tensors are used.
+        constituent Tensors are used.
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
diff --git a/tensorflow/python/summary/plugin_asset.py b/tensorflow/python/summary/plugin_asset.py
index 82d3a618304fb914f81c72c452e57a7d553ff186..252fa2a307096dafe8871239d5983fde506d0b75 100644
--- a/tensorflow/python/summary/plugin_asset.py
+++ b/tensorflow/python/summary/plugin_asset.py
@@ -67,7 +67,7 @@ def get_plugin_asset(plugin_asset_cls, graph=None):
   name = _PLUGIN_ASSET_PREFIX + plugin_asset_cls.plugin_name
   container = graph.get_collection(name)
   if container:
-    if len(container) is not 1:
+    if len(container) != 1:
       raise ValueError("Collection for %s had %d items, expected 1" %
                        (name, len(container)))
     instance = container[0]
@@ -102,7 +102,7 @@ def get_all_plugin_assets(graph=None):
   out = []
   for name in graph.get_collection(_PLUGIN_ASSET_PREFIX):
     collection = graph.get_collection(_PLUGIN_ASSET_PREFIX + name)
-    if len(collection) is not 1:
+    if len(collection) != 1:
       raise ValueError("Collection for %s had %d items, expected 1" %
                        (name, len(collection)))
     out.append(collection[0])
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index a01feb3dde041de2ca33f5f4d9fea6a1b6869d41..4802dbb657275ef1d1f9aaf11222c0a9cd2853ef 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -35,6 +35,7 @@ from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 # pylint: enable=unused-import
 
+from tensorflow.python.distribute import summary_op_util as _distribute_summary_op_util
 from tensorflow.python.eager import context as _context
 from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
@@ -74,7 +75,7 @@ def scalar(name, tensor, collections=None, family=None):
   Raises:
     ValueError: If tensor has the wrong shape or type.
   """
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
@@ -129,7 +130,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
@@ -169,7 +170,7 @@ def histogram(name, values, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[values],
@@ -216,7 +217,7 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family=family, values=[tensor]) as (tag, scope):
@@ -313,7 +314,7 @@ def tensor_summary(name,
 
   serialized_summary_metadata = summary_metadata.SerializeToString()
 
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
@@ -363,7 +364,7 @@ def merge(inputs, collections=None, name=None):
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
-  if _summary_op_util.skip_summary():
+  if _distribute_summary_op_util.skip_summary():
     return _constant_op.constant('')
   name = _summary_op_util.clean_tag(name)
   with _ops.name_scope(name, 'Merge', inputs):
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index f1a911eb489970cb6a594258e5fcf69e70f91fcd..e483155dcfbc9e93c8b8aa28e83b6122ec99822e 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -13,15 +13,19 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 # Transitive dependencies of this target will be included in the pip package.
 py_library(
     name = "tools_pip",
-    deps = [
+    data = [
         ":freeze_graph",
         ":import_pb_to_tensorboard",
         ":inspect_checkpoint",
         ":optimize_for_inference",
         ":print_selective_registration_header",
         ":saved_model_cli",
-        ":saved_model_utils",
         ":strip_unused",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
+    ],
+    deps = [
+        ":saved_model_utils",
         # The following py_library are needed because
         # py_binary may not depend on them when --define=no_tensorflow_py_deps=true
         # is specified. See https://github.com/tensorflow/tensorflow/issues/22390
@@ -29,8 +33,6 @@ py_library(
         ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
         ":strip_unused_lib",
-        # Include the TF upgrade script to users can run it directly after install TF
-        "//tensorflow/tools/compatibility:tf_upgrade_v2",
     ],
 )
 
@@ -77,6 +79,13 @@ py_binary(
     name = "freeze_graph",
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
+    deps = [":freeze_graph_main_lib"],
+)
+
+py_library(
+    name = "freeze_graph_main_lib",
+    srcs = ["freeze_graph.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":freeze_graph_lib",
     ],
@@ -86,6 +95,13 @@ py_binary(
     name = "import_pb_to_tensorboard",
     srcs = ["import_pb_to_tensorboard.py"],
     srcs_version = "PY2AND3",
+    deps = [":import_pb_to_tensorboard_lib"],
+)
+
+py_library(
+    name = "import_pb_to_tensorboard_lib",
+    srcs = ["import_pb_to_tensorboard.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python",
@@ -103,7 +119,7 @@ py_test(
     srcs = ["freeze_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":freeze_graph",
+        ":freeze_graph_lib",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -120,6 +136,13 @@ py_binary(
     name = "inspect_checkpoint",
     srcs = ["inspect_checkpoint.py"],
     srcs_version = "PY2AND3",
+    deps = [":inspect_checkpoint_lib"],
+)
+
+py_library(
+    name = "inspect_checkpoint_lib",
+    srcs = ["inspect_checkpoint.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:platform",
@@ -193,6 +216,13 @@ py_binary(
     name = "optimize_for_inference",
     srcs = ["optimize_for_inference.py"],
     srcs_version = "PY2AND3",
+    deps = [":optimize_for_inference_main_lib"],
+)
+
+py_library(
+    name = "optimize_for_inference_main_lib",
+    srcs = ["optimize_for_inference.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":optimize_for_inference_lib",
         "//tensorflow/core:protos_all_py",
@@ -240,6 +270,14 @@ py_binary(
     srcs = ["print_selective_registration_header.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":print_selective_registration_header_lib"],
+)
+
+py_library(
+    name = "print_selective_registration_header_lib",
+    srcs = ["print_selective_registration_header.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":selective_registration_header_lib",
         "//tensorflow/python:platform",
@@ -261,6 +299,13 @@ py_binary(
     name = "saved_model_cli",
     srcs = ["saved_model_cli.py"],
     srcs_version = "PY2AND3",
+    deps = [":saved_model_cli_lib"],
+)
+
+py_library(
+    name = "saved_model_cli_lib",
+    srcs = ["saved_model_cli.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":saved_model_utils",
         "//tensorflow/python",
@@ -280,7 +325,7 @@ py_test(
         "no-internal-py3",
     ],
     deps = [
-        ":saved_model_cli",
+        ":saved_model_cli_lib",
         "//tensorflow/core:protos_all_py",
     ],
 )
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 5182f53beb6756aba6826b80411a9ff192294620..e4289a3951d1a2f52222c4ef498be0cc6a3de811 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -10,10 +10,15 @@ TENSORFLOW_API_INIT_FILES = [
     "bitwise/__init__.py",
     "compat/__init__.py",
     "config/__init__.py",
+    "config/experimental/__init__.py",
+    "config/gpu/__init__.py",
+    "config/threading/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distribute/__init__.py",
+    "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
     "experimental/__init__.py",
@@ -26,23 +31,31 @@ TENSORFLOW_API_INIT_FILES = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
-    "losses/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
+    "lookup/__init__.py",
+    "lookup/experimental/__init__.py",
     "math/__init__.py",
+    "nest/__init__.py",
     "nn/__init__.py",
-    "nn/rnn_cell/__init__.py",
     "quantization/__init__.py",
     "ragged/__init__.py",
     "random/__init__.py",
-    "rnn/__init__.py",
+    "random/experimental/__init__.py",
+    "raw_ops/__init__.py",
     "saved_model/__init__.py",
     "sets/__init__.py",
     "signal/__init__.py",
     "sparse/__init__.py",
     "strings/__init__.py",
     "summary/__init__.py",
+    "summary/experimental/__init__.py",
     "sysconfig/__init__.py",
     "test/__init__.py",
+    "tpu/experimental/__init__.py",
+    "tpu/__init__.py",
     "train/__init__.py",
+    "train/experimental/__init__.py",
     "version/__init__.py",
     # END GENERATED FILES
 ]
@@ -77,10 +90,14 @@ KERAS_API_INIT_FILES = [
     "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
     "keras/losses/__init__.py",
     "keras/metrics/__init__.py",
+    "keras/mixed_precision/__init__.py",
+    "keras/mixed_precision/experimental/__init__.py",
     "keras/models/__init__.py",
     "keras/optimizers/__init__.py",
+    "keras/optimizers/schedules/__init__.py",
     "keras/preprocessing/__init__.py",
     "keras/preprocessing/image/__init__.py",
     "keras/preprocessing/sequence/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 3e2c00c87fdf9ba9fa6928c8f0221af6324cabdb..e65043532d948ffa4b224274f958a9f43d1d303c 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -11,10 +11,15 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "bitwise/__init__.py",
     "compat/__init__.py",
     "config/__init__.py",
+    "config/experimental/__init__.py",
+    "config/gpu/__init__.py",
+    "config/threading/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distribute/__init__.py",
+    "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
@@ -32,11 +37,16 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
     "logging/__init__.py",
+    "lookup/__init__.py",
+    "lookup/experimental/__init__.py",
     "losses/__init__.py",
     "manip/__init__.py",
     "math/__init__.py",
     "metrics/__init__.py",
+    "nest/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
     "profiler/__init__.py",
@@ -44,6 +54,8 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "quantization/__init__.py",
     "ragged/__init__.py",
     "random/__init__.py",
+    "random/experimental/__init__.py",
+    "raw_ops/__init__.py",
     "resource_loader/__init__.py",
     "strings/__init__.py",
     "saved_model/__init__.py",
@@ -63,7 +75,10 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "summary/__init__.py",
     "sysconfig/__init__.py",
     "test/__init__.py",
+    "tpu/experimental/__init__.py",
+    "tpu/__init__.py",
     "train/__init__.py",
+    "train/experimental/__init__.py",
     "train/queue_runner/__init__.py",
     "user_ops/__init__.py",
     "version/__init__.py",
@@ -100,10 +115,14 @@ KERAS_API_INIT_FILES_V1 = [
     "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
     "keras/losses/__init__.py",
     "keras/metrics/__init__.py",
+    "keras/mixed_precision/__init__.py",
+    "keras/mixed_precision/experimental/__init__.py",
     "keras/models/__init__.py",
     "keras/optimizers/__init__.py",
+    "keras/optimizers/schedules/__init__.py",
     "keras/preprocessing/__init__.py",
     "keras/preprocessing/image/__init__.py",
     "keras/preprocessing/sequence/__init__.py",
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 06a6e7dc086f27344bc24184eab13aa20208fbbb..ab82ee9fd410e646c0c1f9b302d47bb3021bb514 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -240,7 +240,7 @@ def freeze_graph_with_def_protos(input_graph_def,
 
 
 def _parse_input_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into GraphDef proto."""
+  """Parses input tensorflow graph into GraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input graph file '" + input_graph + "' does not exist!")
     return -1
@@ -255,7 +255,7 @@ def _parse_input_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_meta_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into MetaGraphDef proto."""
+  """Parses input tensorflow graph into MetaGraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input meta graph file '" + input_graph + "' does not exist!")
     return -1
@@ -271,7 +271,7 @@ def _parse_input_meta_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_saver_proto(input_saver, input_binary):
-  """Parser input tensorflow Saver into SaverDef proto."""
+  """Parses input tensorflow Saver into SaverDef proto."""
   if not gfile.Exists(input_saver):
     print("Input saver file '" + input_saver + "' does not exist!")
     return -1
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index de2672db3c4c4e6b94d3803767a749a943910d2c..d7edf4ec65d18bf067930d63e6e2b129dbc77b56 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -51,8 +51,6 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
   def _testFreezeGraph(self, saver_write_version):
 
     checkpoint_prefix = os.path.join(self.get_temp_dir(), "saved_checkpoint")
-    checkpoint_meta_graph_file = os.path.join(self.get_temp_dir(),
-                                              "saved_checkpoint.meta")
     checkpoint_state_name = "checkpoint_state"
     input_graph_name = "input_graph.pb"
     output_graph_name = "output_graph.pb"
@@ -85,7 +83,6 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     filename_tensor_name = "save/Const:0"
     output_graph_path = os.path.join(self.get_temp_dir(), output_graph_name)
     clear_devices = False
-    input_meta_graph = checkpoint_meta_graph_file
 
     freeze_graph.freeze_graph(
         input_graph_path,
@@ -99,7 +96,7 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
         clear_devices,
         "",
         "",
-        input_meta_graph,
+        "",
         checkpoint_version=saver_write_version)
 
     # Now we make sure the variable is now a constant, and that the graph still
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 108f2b593cf5b84af74306fef7365b83ecdc270c..2918ba737c6ae79da7bdab4823425d54d69f3259 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -211,8 +211,9 @@ def fold_batch_norms(input_graph_def):
   scaling into the convolution weights. This function identifies the typical
   pattern of batch normalization subgraphs, and performs the transformation to
   fold the computations down into a simpler form. It currently only spots batch
-  normalization that's performed by the BatchNormWithGlobalNormalization op, and
-  will need to be extended in the future to handle the newer style.
+  normalization that's performed by the BatchNormWithGlobalNormalization and
+  FusedBatchNorm ops, and will need to be extended in the future to handle the
+  newer style.
 
   Args:
     input_graph_def: A GraphDef containing a model.
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cdd151b425452a34b47fae45bfd408efaba6a063
--- /dev/null
+++ b/tensorflow/python/tpu/BUILD
@@ -0,0 +1,351 @@
+# Description: Operations defined for Cloud TPUs
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_py_test",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//knowledge/cerebra/sense/im2query:__subpackages__",
+        "//learning/brain:__subpackages__",
+        "//learning/deepmind:__subpackages__",
+        "//medical/pathology:__subpackages__",
+        "//research/graph:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//vr/perception:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "tpu_py",
+    srcs = ["ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "async_checkpoint",
+    srcs = ["async_checkpoint.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "tpu_estimator",
+    srcs = [
+        "_tpu_estimator_embedding.py",
+        "error_handling.py",
+        "tpu_config.py",
+        "tpu_context.py",
+        "tpu_estimator.py",
+        "util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":async_checkpoint",
+        ":feature_column",
+        ":functional",
+        ":tpu_embedding",
+        ":tpu_lib",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:function",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:util",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "functional",
+    srcs = ["functional.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "tpu",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column",
+        ":tpu_embedding",
+        ":tpu_estimator",
+        ":tpu_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_lib",
+    srcs = [
+        "__init__.py",
+        "bfloat16.py",
+        "device_assignment.py",
+        "session_support.py",
+        "tensor_tracer.py",
+        "topology.py",
+        "tpu.py",
+        "tpu_feed.py",
+        "tpu_function.py",
+        "tpu_optimizer.py",
+        "tpu_sharding.py",
+        "tpu_strategy_util.py",
+        "tpu_system_metadata.py",
+        "training_loop.py",
+        "xla.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":datasets",
+        ":functional",
+        ":tpu_py",
+        "//tensorflow/compiler/xla/experimental/xla_sharding",
+        "//tensorflow/compiler/xla/python_api:xla_shape",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
+        "//tensorflow/core/protobuf/tpu:dynamic_padding_proto_py",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_py",
+        "//tensorflow/core/protobuf/tpu:topology_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu/profiler",
+    ],
+)
+
+py_library(
+    name = "datasets",
+    srcs = [
+        "datasets.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "datasets_test",
+    size = "medium",
+    srcs = ["datasets_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        ":datasets",
+    ],
+    grpc_enabled = True,
+    shard_count = 4,
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "tpu_test",
+    size = "small",
+    srcs = ["tpu_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:layers",
+    ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+)
+
+tf_py_test(
+    name = "tpu_sharding_test",
+    size = "small",
+    srcs = ["tpu_sharding_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_infeed_test",
+    size = "small",
+    srcs = ["tpu_infeed_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_config_test",
+    size = "small",
+    srcs = ["tpu_config_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_estimator_signals_test",
+    size = "small",
+    srcs = ["tpu_estimator_signals_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    # TODO(jhseu): Remove. Fails in OSS on Python 3.
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "topology_test",
+    size = "medium",
+    srcs = ["topology_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_embedding",
+    srcs = [
+        "tpu_embedding.py",
+        "tpu_embedding_gradient.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "tpu_strategy_util",
+    srcs = ["tpu_strategy_util.py"],
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+    ],
+)
+
+py_library(
+    name = "feature_column",
+    srcs = ["feature_column.py"],
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+)
+
+tf_py_test(
+    name = "feature_column_test",
+    srcs = [
+        "feature_column_test.py",
+    ],
+    additional_deps = [
+        ":feature_column",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+    main = "feature_column_test.py",
+)
diff --git a/tensorflow/python/tpu/__init__.py b/tensorflow/python/tpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dffd7064b19f353aed6afa3ad383564643a4a90
--- /dev/null
+++ b/tensorflow/python/tpu/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Ops related to Tensor Processing Units."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/python/tpu/_tpu_estimator_embedding.py b/tensorflow/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc885c675945b9264988ac83e4b52298809d5ed
--- /dev/null
+++ b/tensorflow/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,284 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Tooling for support TPU embedding in TPUEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.feature_column import feature_column as core_fc
+from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
+from tensorflow.python.tpu import feature_column as tpu_fc
+from tensorflow.python.tpu import tpu_embedding
+from tensorflow.python.tpu.tpu_embedding import AdagradParameters
+from tensorflow.python.tpu.tpu_embedding import AdamParameters
+from tensorflow.python.tpu.tpu_embedding import StochasticGradientDescentParameters
+
+# pylint: disable=protected-access
+_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
+                                 tpu_fc._TPUSharedEmbeddingColumn)
+_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
+                             core_fc_lib.EmbeddingColumn,
+                             core_fc._SharedEmbeddingColumn)
+_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
+_SUPPORTED_OPTIMIZERS = (AdagradParameters, AdamParameters,
+                         StochasticGradientDescentParameters)
+
+# pylint: enable=protected-access
+
+_TABLE_NAME_PREFIX = 'tbl_'
+_LEN_TABLE_NAME_PREFIX = len(_TABLE_NAME_PREFIX)
+
+
+def _get_table_name_from_embedding_var_name(embedding_var_name):
+  return '{}{}'.format(_TABLE_NAME_PREFIX, embedding_var_name)
+
+
+def _get_embedding_var_name_from_table_name(table_name):
+  return table_name[_LEN_TABLE_NAME_PREFIX:]
+
+
+def _get_embedding_variable_name(scope_name, var_name):
+  return '{}/{}'.format(scope_name, var_name)
+
+
+def _get_slot_variable_names(scope_name, var_name, optimization_parameters):
+  """Return embedding variable names which are consistent with CPU runs."""
+  if isinstance(optimization_parameters, tpu_embedding.AdagradParameters):
+    return tpu_embedding.AdagradSlotVariableName(
+        '{}/{}/Adagrad'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters, tpu_embedding.AdamParameters):
+    return tpu_embedding.AdamSlotVariableNames(
+        '{}/{}/Adam/m'.format(scope_name, var_name),
+        '{}/{}/Adam/v'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters,
+                  tpu_embedding.StochasticGradientDescentParameters):
+    return None
+  else:
+    raise ValueError('Support to infer full variable name '
+                     'for optimization_parameter {} has not been added.'
+                     .format(optimization_parameters))
+
+
+def get_full_variable_names(
+    graph, table_to_config_dict, optimization_parameters=None):
+  """Return embedding variable names and slot variables which are consistent with CPU runs."""
+  collection = graph.get_collection_ref(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  if not collection:
+    raise RuntimeError(
+        'Embedding feature column did not capture any thing. Make sure the '
+        'feature columns passed to TPUEstimator constructor is properly '
+        'used in model_fn.')
+
+  embedding_variable_name_by_table = {}
+  slot_variable_names_by_table = {}
+  for table_name in table_to_config_dict:
+    embedding_var_name = _get_embedding_var_name_from_table_name(table_name)
+    (scope_name, var_name) = collection[0][embedding_var_name]
+    embedding_variable_name_by_table[table_name] = (
+        _get_embedding_variable_name(scope_name, var_name))
+    if optimization_parameters:
+      slot_variable_names_by_table[table_name] = _get_slot_variable_names(
+          scope_name, var_name, optimization_parameters)
+
+  graph.clear_collection(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  return embedding_variable_name_by_table, slot_variable_names_by_table
+
+
+def get_tpu_embedding_config_from_feature_columns(feature_columns):
+  """Create configs for TPUEmbedding from a list of feature columns.
+
+  This function will place one embedding tensor per table and the return is
+  intended to be used as input to TPUEmbedding.
+
+  Args:
+    feature_columns: a list of supported feature columns.
+
+  Returns:
+    A pair of dicts, the first maps tables to their config, the second maps
+    features to tables.
+  """
+
+  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
+
+  for column in feature_columns:
+    if not isinstance(column, allowed):
+      raise TypeError(
+          'Unsupported feature column {}. Supported types are {}.'.format(
+              type(column), allowed))
+
+  table_to_config = {}
+  feature_to_table = {}
+  for column in feature_columns:
+    feature_name = column.get_feature_key_name()
+    table_name = _get_table_name_from_embedding_var_name(
+        column.get_embedding_var_name())
+    if feature_name in feature_to_table:
+      raise ValueError(
+          'Feature column {} is used with multiple embeddings and this is '
+          'not supported.'.format(feature_name))
+    feature_to_table[feature_name] = table_name
+    vocabulary_size, dimension = column.get_embedding_table_size()
+    table_to_config[table_name] = tpu_embedding.TableConfig(
+        vocabulary_size=vocabulary_size,
+        dimension=dimension,
+        initializer=column.get_initializer(),
+        combiner=column.get_combiner())
+
+  return table_to_config, feature_to_table
+
+
+class EmbeddingConfigSpec(
+    collections.namedtuple('EmbeddingConfigSpec', [
+        'feature_columns', 'optimization_parameters', 'clipping_limit',
+    ])):
+  """Class to keep track of embedding config specification."""
+
+  def __new__(cls,
+              feature_columns,
+              optimization_parameters,
+              clipping_limit=None):
+    """Creates an EmbeddingConfigSpec instance.
+
+    Args:
+      feature_columns: All `FeatureColumn`s used by model.
+      optimization_parameters: An instance of `AdagradParameters`,
+        `AdamParameters` or `StochasticGradientDescentParameters`. This
+        optimizer will be applied to all embedding variables specified by
+        `feature_columns`.
+      clipping_limit: (Optional) Clipping limit (absolute value).
+
+    Returns:
+      An EmbeddingConfigSpec instance.
+
+    Raises:
+      ValueError: If the feature_columns are not specified.
+      TypeError: If the feature columns are not of ths correct type (one of
+        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
+        _EMBEDDING_COLUMN_CLASSES).
+      ValueError: If `optimization_parameters` is not one of the required types.
+    """
+    if not feature_columns:
+      raise ValueError('`feature_columns` cannot be `None` or empty.')
+
+    # It is unknown at this moment, whether the TPUEstimator is running in CPU
+    # or TPU mode. So allow non-TPU embedding columns also.
+    supported_classes = tuple(
+        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
+        list(_EMBEDDING_COLUMN_CLASSES))
+
+    for column in feature_columns:
+      if not isinstance(column, supported_classes):
+        raise TypeError(
+            'All feature columns must be supported types in {}. Got {}'.format(
+                supported_classes, type(column)))
+
+    if not isinstance(optimization_parameters, _SUPPORTED_OPTIMIZERS):
+      raise ValueError('optimization_parameters must be an instance of type '
+                       '{}. Got {}.'.format(_SUPPORTED_OPTIMIZERS,
+                                            type(optimization_parameters)))
+
+    return super(EmbeddingConfigSpec, cls).__new__(
+        cls,
+        feature_columns=feature_columns,
+        optimization_parameters=optimization_parameters,
+        clipping_limit=clipping_limit)
+
+
+class EmbeddingConfig(object):
+  """This is the internal immutable object for embedding config.
+
+  `_EmbeddingConfig` is responsible to _translate_ user provided
+  `EmbeddingConfigSpec` to internal data structures, mostly constructor
+  arguments of `TPUEmbedding`.
+  """
+
+  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
+               num_hosts, num_cores, run_config):
+    self._embedding_config_spec = embedding_config_spec
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._num_hosts = num_hosts
+    self._num_cores = num_cores
+    self._run_config = run_config
+
+    self._table_to_config_dict, self._feature_to_table_dict = (
+        get_tpu_embedding_config_from_feature_columns(
+            embedding_config_spec.feature_columns))
+    self._mode_to_tpu_embedding_dict = {}
+    self.dummy_table_variables = None
+
+  def has_embedding_tables(self):
+    return bool(self._table_to_config_dict)
+
+  def _create_tpu_embedding(self, mode):
+    """Create tpu_embedding.TPUEmbedding based on mode."""
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      batch_size = self._train_batch_size
+    else:
+      batch_size = self._eval_batch_size
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      tpu_embedding_mode = tpu_embedding.TRAINING
+      optimization_parameters = (
+          self._embedding_config_spec.optimization_parameters)
+    elif (mode == model_fn_lib.ModeKeys.EVAL or
+          mode == model_fn_lib.ModeKeys.PREDICT):
+      tpu_embedding_mode = tpu_embedding.INFERENCE
+      optimization_parameters = None
+    else:
+      raise ValueError('Mode {} is not supported.'.format(mode))
+
+    if self._run_config.cluster:
+      master = self._run_config.cluster.master()
+      cluster_spec = self._run_config.cluster.cluster_spec()
+      cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
+    else:
+      master = (
+          self._run_config.evaluation_master
+          if mode == model_fn_lib.ModeKeys.EVAL else self._run_config.master)
+      cluster_def = None
+    tpu_embedding_ = tpu_embedding.TPUEmbedding(
+        self._table_to_config_dict,
+        self._feature_to_table_dict,
+        batch_size,
+        tpu_embedding_mode,
+        master,
+        optimization_parameters,
+        cluster_def,
+    )
+    return tpu_embedding_
+
+  def get_tpu_embedding(self, mode):
+    if mode not in self._mode_to_tpu_embedding_dict:
+      self._mode_to_tpu_embedding_dict[mode] = (
+          self._create_tpu_embedding(mode))
+    return self._mode_to_tpu_embedding_dict[mode]
+
+
+def split_inputs(ctx, features, labels):
+  """Splits the dense and sparse tensors inside the features and labels."""
+  sparse_features = collections.OrderedDict()
+  if ctx.embedding_config:
+    tpu_embedding_ = ctx.embedding_config.tpu_embedding
+    for feature_key in tpu_embedding_.feature_to_table_dict:
+      sparse_features[feature_key] = features.pop(feature_key)
+
+  return features, labels, sparse_features
diff --git a/tensorflow/python/tpu/async_checkpoint.py b/tensorflow/python/tpu/async_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42c2438fa66d0bdb8df4ee2319f64b3bf78061a
--- /dev/null
+++ b/tensorflow/python/tpu/async_checkpoint.py
@@ -0,0 +1,208 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Hook for asynchronous checkpointing.
+
+This hook dispatches checkpoint writing operations in a separate thread to
+allow execution to continue on the main thread.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+import time
+
+from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import training_util
+from tensorflow.python.training.session_run_hook import SessionRunArgs
+from tensorflow.python.training.summary_io import SummaryWriterCache
+
+
+class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
+  """Saves checkpoints every N steps or seconds."""
+
+  def __init__(self,
+               checkpoint_dir,
+               save_secs=None,
+               save_steps=None,
+               saver=None,
+               checkpoint_basename="model.ckpt",
+               scaffold=None,
+               listeners=None):
+    """Initializes a `CheckpointSaverHook`.
+
+    Args:
+      checkpoint_dir: `str`, base directory for the checkpoint files.
+      save_secs: `int`, save every N secs.
+      save_steps: `int`, save every N steps.
+      saver: `Saver` object, used for saving.
+      checkpoint_basename: `str`, base name for the checkpoint files.
+      scaffold: `Scaffold`, use to get saver object.
+      listeners: List of `CheckpointSaverListener` subclass instances. Used for
+        callbacks that run immediately before or after this hook saves the
+        checkpoint.
+
+    Raises:
+      ValueError: One of `save_steps` or `save_secs` should be set.
+      ValueError: At most one of `saver` or `scaffold` should be set.
+    """
+    logging.info("Create AsyncCheckpointSaverHook.")
+    if saver is not None and scaffold is not None:
+      raise ValueError("You cannot provide both saver and scaffold.")
+    self._saver = saver
+    self._save_thread = None
+    self._write_graph_thread = None
+    self._checkpoint_dir = checkpoint_dir
+    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
+    self._scaffold = scaffold
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_secs=save_secs, every_steps=save_steps)
+    self._listeners = listeners or []
+    self._steps_per_run = 1
+    self._summary_writer = None
+    self._global_step_tensor = None
+
+    self._last_checkpoint_step = None
+
+  def _set_steps_per_run(self, steps_per_run):
+    self._steps_per_run = steps_per_run
+
+  def begin(self):
+    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
+    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use CheckpointSaverHook.")
+    for l in self._listeners:
+      l.begin()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+
+    # We do write graph and saver_def at the first call of before_run.
+    # We cannot do this in begin, since we let other hooks to change graph and
+    # add variables in begin. Graph is finalized after all begin calls.
+    def _write_graph_fn(self):
+      training_util.write_graph(
+          ops.get_default_graph().as_graph_def(add_shapes=True),
+          self._checkpoint_dir, "graph.pbtxt")
+    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
+                                                args=[self])
+    self._write_graph_thread.start()
+
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    global_step = run_context.session.run(self._global_step_tensor)
+    if self._timer.should_trigger_for_step(global_step):
+      self._timer.update_last_triggered_step(global_step)
+      logging.info("Triggering checkpoint. %s", global_step)
+      if self._save(run_context.session, global_step):
+        run_context.request_stop()
+
+  def end(self, session):
+    if self._save_thread:
+      logging.info("Waiting for any pending checkpoints to finish.")
+      self._save_thread.join()
+    if self._write_graph_thread:
+      logging.info("Waiting for any pending write_graph to finish.")
+      self._write_graph_thread.join()
+
+    last_step = session.run(self._global_step_tensor)
+
+    if self._last_checkpoint_step != last_step:
+      self._save(session, last_step, asynchronous=False)
+
+    for l in self._listeners:
+      l.end(session, last_step)
+
+  def _save(self, session, step, asynchronous=True):
+    """Saves the latest checkpoint, returns should_stop."""
+
+    def _save_fn():
+      """Run the saver process."""
+      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
+
+      start_time = time.time()
+      for l in self._listeners:
+        l.before_save(session, step)
+
+      self._get_saver().save(session, self._save_path, global_step=step)
+      self._summary_writer.add_session_log(
+          SessionLog(
+              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
+          step)
+
+      for l in self._listeners:
+        l.after_save(session, step)
+
+      end_time = time.time()
+      logging.info("Checkpoint actual writing time: (%.3f sec)",
+                   end_time - start_time)
+      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
+
+    if not asynchronous:
+      self._last_checkpoint_step = step
+      _save_fn()
+      return
+
+    if self._save_thread is not None:
+      self._save_thread.join(timeout=0.1)
+      if self._save_thread.is_alive():
+        logging.info("Saver thread still in progress, skipping checkpoint.")
+        return
+
+    self._last_checkpoint_step = step
+    self._save_thread = threading.Thread(target=_save_fn)
+    self._save_thread.start()
+
+  def _get_saver(self):
+    if self._saver is not None:
+      return self._saver
+    elif self._scaffold is not None:
+      return self._scaffold.saver
+
+    # Get saver from the SAVERS collection if present.
+    collection_key = ops.GraphKeys.SAVERS
+    savers = ops.get_collection(collection_key)
+    if not savers:
+      raise RuntimeError(
+          "No items in collection {}. Please add a saver to the collection "
+          "or provide a saver or scaffold.".format(collection_key))
+    elif len(savers) > 1:
+      raise RuntimeError(
+          "More than one item in collection {}. "
+          "Please indicate which one to use by passing it to the constructor."
+          .format(collection_key))
+
+    self._saver = savers[0]
+    return savers[0]
diff --git a/tensorflow/python/tpu/bfloat16.py b/tensorflow/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa74f651aa63c72d14eb78c8af479263810e9b7d
--- /dev/null
+++ b/tensorflow/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      '', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/python/tpu/bfloat16_test.py
similarity index 93%
rename from tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
rename to tensorflow/python/tpu/bfloat16_test.py
index 26fd3768278cacd076e5fee8bdad75d0486678d0..e087dda3799014d8adac8aae7924da3e14b9f8e5 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
+++ b/tensorflow/python/tpu/bfloat16_test.py
@@ -19,11 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import bfloat16
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import bfloat16
 
 
 class BFloat16ScopeTest(test.TestCase):
@@ -34,6 +34,7 @@ class BFloat16ScopeTest(test.TestCase):
     with bfloat16.bfloat16_scope() as bf:
       self.assertEqual(bf.name, "")
 
+  @test_util.run_deprecated_v1
   def testRequestedDType(self):
     """Test if requested dtype is honored in the getter.
     """
diff --git a/tensorflow/python/tpu/datasets.py b/tensorflow/python/tpu/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dd7fb774ee9791f8715ebae46563a6691b1611c
--- /dev/null
+++ b/tensorflow/python/tpu/datasets.py
@@ -0,0 +1,194 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of Cloud TPU helper functions for data loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import functional_ops
+
+
+def _TextLineDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+def _TFRecordDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+_FILETYPE_MAP = {
+    'tfrecord': _TFRecordDataset,
+    'textline': _TextLineDataset,
+    'text': _TextLineDataset,
+}
+
+
+def StreamingFilesDataset(files,
+                          filetype=None,
+                          file_reader_job=None,
+                          worker_job=None,
+                          num_epochs=None,
+                          filename_shuffle_buffer_size=None,
+                          num_parallel_reads=None,
+                          batch_transfer_size=None,
+                          sloppy=None):
+  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
+
+  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
+  files local to your GCE VM. In order to train using files stored on your local
+  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
+  helper to generate a dataset to feed your Cloud TPU with files from your GCE
+  VM.
+
+  The resulting dataset may return an OutOfRangeError if there are no files
+  found as a result of the fileglob expansion.
+
+  Note: StreamingFilesDataset assumes that the session is using a
+  TPUClusterResolver and has therefore a worker and a coordinator job. File
+  loading will be done on the coordinator job.
+
+  Args:
+    files: A string glob to match files, or a `tf.data.Dataset` generating file
+      names.
+    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
+      TensorFlow function that when given a filename returns a dataset.
+    file_reader_job: An optional string that corresponds to the job that should
+      perform the file reads.
+    worker_job: An optional string that corresponds to the job that should
+      process the tensors (i.e. your GPU or TPU worker).
+    num_epochs: The number of epochs through the training set that should be
+      generated. By default, it will repeat infinitely.
+    filename_shuffle_buffer_size: An optional integer whose value controls the
+      shuffling of the file names. If you would like to read from the files in
+      the same order, set to 0 or False.
+    num_parallel_reads: An optional integer controlling the number of files to
+      read from concurrently. (Set to 1 for no parallelism.)
+    batch_transfer_size: An optional integer controlling the batching used to
+      amortize the remote function invocation overhead. Set to a very large
+      number to increase throughput. Set to a very small number to reduce memory
+      consumption. Set to False to skip batching.
+    sloppy: (Optional.) If `False`, read input data while maintaining a
+      deterministic order. (This may have significant performance impacts.)
+      sloppy defaults to: True.
+  Returns:
+    A `tf.data.Dataset` with an infinite stream of elements generated by a
+    parallel interleaving of the set of files matched (or generated) by `files`
+    with a type is the output of the dataset specified by `filetype`.
+
+  Raises:
+    ValueError: if any argument is not of the expected type.
+  """
+  if filetype is None:
+    filetype = 'tfrecord'
+
+  if isinstance(filetype, str):
+    if filetype not in _FILETYPE_MAP:
+      raise ValueError('Unexpected filetype: %s' % filetype)
+    reader_fn = _FILETYPE_MAP[filetype]
+  elif callable(filetype):
+    reader_fn = filetype
+  else:
+    raise ValueError('filetype should be a string or a callable')
+
+  file_reader_job = file_reader_job or 'coordinator'
+
+  worker_job = worker_job or 'worker'
+
+  if filename_shuffle_buffer_size is None:
+    filename_shuffle_buffer_size = 4096
+
+  num_parallel_reads = num_parallel_reads or 8
+
+  if batch_transfer_size is None:
+    batch_transfer_size = 256
+
+  if sloppy is None:
+    sloppy = True
+
+  with ops.device('/job:%s' % file_reader_job):
+    if isinstance(files, str):
+      source_dataset = dataset_ops.Dataset.list_files(files)
+    elif isinstance(files, dataset_ops.DatasetV2):
+      source_dataset = files
+    else:
+      raise ValueError('files was not a string or a dataset: %s' % files)
+
+    if filename_shuffle_buffer_size:
+      source_dataset = source_dataset.shuffle(
+          buffer_size=filename_shuffle_buffer_size)
+
+    source_dataset = source_dataset.apply(
+        interleave_ops.parallel_interleave(
+            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+    source_dataset = source_dataset.repeat(num_epochs)
+
+    if batch_transfer_size:
+      source_dataset = source_dataset.batch(batch_transfer_size)
+
+    source_dataset = source_dataset.prefetch(1)
+
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
+    source_handle = source_iterator.string_handle()
+
+  @function.Defun(dtypes.string)
+  def LoadingFunc(h):
+    remote_iterator = iterator_ops.Iterator.from_string_handle(
+        h, dataset_ops.get_legacy_output_types(source_dataset),
+        dataset_ops.get_legacy_output_shapes(source_dataset))
+    return remote_iterator.get_next()
+
+  def MapFn(unused_input):
+    source_dataset_output_types = dataset_ops.get_legacy_output_types(
+        source_dataset)
+    if isinstance(source_dataset_output_types, dtypes.DType):
+      output_types = [source_dataset_output_types]
+    elif isinstance(source_dataset_output_types, (list, tuple)):
+      output_types = source_dataset_output_types
+    else:
+      raise ValueError('source dataset has invalid output types')
+    remote_calls = functional_ops.remote_call(
+        args=[source_handle],
+        Tout=output_types,
+        f=LoadingFunc,
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+    if len(remote_calls) == 1:
+      return remote_calls[0]
+    else:
+      return remote_calls
+
+  with ops.device('/job:%s' % worker_job):
+    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
+        MapFn, num_parallel_calls=4 if sloppy else None)
+    output_dataset = output_dataset.prefetch(1)
+
+    if batch_transfer_size:
+      # Undo the batching used during the transfer.
+      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
+
+  return output_dataset
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/python/tpu/datasets_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/datasets_test.py
rename to tensorflow/python/tpu/datasets_test.py
index 8a94f527bb6dffa48e71e6500ae5e9e9589fbf5c..416dd9496cc18af8354d4d961b54a50f3db99a24 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/python/tpu/datasets_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.tpu.python.tpu import datasets
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -31,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import datasets
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
diff --git a/tensorflow/python/tpu/device_assignment.py b/tensorflow/python/tpu/device_assignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a301babfb31e90835e7782186e83e566cd040d
--- /dev/null
+++ b/tensorflow/python/tpu/device_assignment.py
@@ -0,0 +1,323 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.tpu.topology import Topology
+from tensorflow.python.util.tf_export import tf_export
+
+
+SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
+
+
+def _compute_task_and_cores_to_replicas(core_assignment, topology):
+  """Computes a nested dict which maps task and logical core to replicas."""
+  task_and_cores_to_replicas = {}
+  for replica in xrange(core_assignment.shape[0]):
+    for logical_core in xrange(core_assignment.shape[1]):
+      coordinates = core_assignment[replica, logical_core, :]
+      task_id = topology.task_ordinal_at_coordinates(coordinates)
+      if task_id not in task_and_cores_to_replicas:
+        task_and_cores_to_replicas[task_id] = {}
+      if logical_core not in task_and_cores_to_replicas[task_id]:
+        task_and_cores_to_replicas[task_id][logical_core] = set()
+
+      task_and_cores_to_replicas[task_id][logical_core].add(replica)
+
+  task_to_sorted_replica_id = {}
+
+  for task, core_to_replicas in task_and_cores_to_replicas.items():
+    core_to_sorted_replicas = {}
+    for core, replicas in core_to_replicas.items():
+      core_to_sorted_replicas[core] = sorted(replicas)
+
+    task_to_sorted_replica_id[task] = core_to_sorted_replicas
+  return task_to_sorted_replica_id
+
+
+@tf_export("tpu.experimental.DeviceAssignment")
+class DeviceAssignment(object):
+  """Mapping from logical cores in a computation to the physical TPU topology.
+
+  Prefer to use the `DeviceAssignment.build()` helper to construct a
+  `DeviceAssignment`; it is easier if less flexible than constructing a
+  `DeviceAssignment` directly.
+  """
+
+  def __init__(self, topology, core_assignment):
+    """Constructs a `DeviceAssignment` object.
+
+    Args:
+      topology: A `Topology` object that describes the physical TPU topology.
+      core_assignment: A logical to physical core mapping, represented as a
+        rank 3 numpy array. See the description of the `core_assignment`
+        property for more details.
+
+    Raises:
+      ValueError: If `topology` is not `Topology` object.
+      ValueError: If `core_assignment` is not a rank 3 numpy array.
+    """
+    if not isinstance(topology, Topology):
+      raise ValueError("topology must be a Topology object, got {}".format(
+          type(topology)))
+    core_assignment = np.asarray(core_assignment, dtype=np.int32)
+
+    self._topology = topology
+
+    if core_assignment.ndim != 3:
+      raise ValueError("core_assignment must be a rank 3 numpy array, "
+                       "got shape {}".format(core_assignment.shape))
+
+    self._num_replicas = core_assignment.shape[0]
+    self._num_cores_per_replica = core_assignment.shape[1]
+
+    if core_assignment.shape[-1] != topology.mesh_rank:
+      raise ValueError(
+          "minor dimension of core_assignment must have size equal to topology "
+          "rank ({}), got shape {}".format(topology.mesh_rank,
+                                           core_assignment.shape))
+
+    self._core_assignment = core_assignment
+    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
+        self._core_assignment, topology)
+
+  @property
+  def topology(self):
+    """A `Topology` that describes the TPU topology."""
+    return self._topology
+
+  @property
+  def num_cores_per_replica(self):
+    """The number of cores per replica."""
+    return self._num_cores_per_replica
+
+  @property
+  def num_replicas(self):
+    """The number of replicas of the computation."""
+    return self._num_replicas
+
+  @property
+  def core_assignment(self):
+    """The logical to physical core mapping.
+
+    Returns:
+      An integer numpy array of rank 3, with shape
+      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
+      (replica, logical core) pairs to physical topology coordinates.
+    """
+    return self._core_assignment
+
+  def coordinates(self, replica, logical_core):
+    """Returns the physical topology coordinates of a logical core."""
+    return tuple(self.core_assignment[replica, logical_core, :])
+
+  def lookup_replicas(self, task_id, logical_core):
+    """Lookup replica ids by task number and logical core.
+
+    Args:
+      task_id: TensorFlow task number.
+      logical_core: An integer, identifying a logical core.
+    Returns:
+      A sorted list of the replicas that are attached to that task and
+      logical_core.
+    Raises:
+      ValueError: If no replica exists in the task which contains the logical
+      core.
+    """
+    try:
+      return self._task_and_cores_to_replicas[task_id][logical_core]
+    except KeyError:
+      raise ValueError(
+          "Can not find any replica in task: {} contains logical_core: {} ".
+          format(task_id, logical_core))
+
+  def tpu_ordinal(self, replica=0, logical_core=0):
+    """Returns the ordinal of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
+
+  def host_device(self, replica=0, logical_core=0, job=None):
+    """Returns the CPU device attached to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
+
+  def tpu_device(self, replica=0, logical_core=0, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
+
+  @staticmethod
+  def build(topology,
+            computation_shape=None,
+            computation_stride=None,
+            num_replicas=1):
+    return device_assignment(topology, computation_shape, computation_stride,
+                             num_replicas)
+
+
+def device_assignment(topology,
+                      computation_shape=None,
+                      computation_stride=None,
+                      num_replicas=1):
+  """Computes a device_assignment of a computation across a TPU topology.
+
+  Attempts to choose a compact grid of cores for locality.
+
+  Returns a `DeviceAssignment` that describes the cores in the topology assigned
+  to each core of each replica.
+
+  `computation_shape` and `computation_stride` values should be powers of 2 for
+  optimal packing.
+
+  Args:
+    topology: A `Topology` object that describes the TPU cluster topology.
+      To obtain a TPU topology, evaluate the `Tensor` returned by
+      `initialize_system` using `Session.run`. Either a serialized
+      `TopologyProto` or a `Topology` object may be passed. Note: you must
+      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
+    computation_shape: A rank 1 int32 numpy array with size equal to the
+      topology rank, describing the shape of the computation's block of cores.
+      If None, the `computation_shape` is `[1] * topology_rank`.
+    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
+      describing the inter-core spacing of the `computation_shape` cores in the
+      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
+    num_replicas: The number of computation replicas to run. The replicas will
+      be packed into the free spaces of the topology.
+
+  Returns:
+    A DeviceAssignment object, which describes the mapping between the logical
+    cores in each computation replica and the physical cores in the TPU
+    topology.
+
+  Raises:
+    ValueError: If `topology` is not a valid `Topology` object.
+    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
+      numpy arrays with shape [3] where all values are positive.
+    ValueError: If computation's replicas cannot fit into the TPU topology.
+  """
+  # Deserialize the Topology proto, if it is a string.
+  if isinstance(topology, bytes):
+    topology = Topology(serialized=topology)
+
+  if not isinstance(topology, Topology):
+    raise ValueError("`topology` is not a Topology object; got {}".format(
+        type(topology)))
+
+  topology_rank = len(topology.mesh_shape)
+  mesh_shape = topology.mesh_shape
+  if computation_shape is None:
+    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_shape = np.asarray(computation_shape, dtype=np.int32)
+
+  if computation_stride is None:
+    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_stride = np.asarray(computation_stride, dtype=np.int32)
+
+  if computation_shape.shape != (topology_rank,):
+    raise ValueError("computation_shape must have shape [{}]; got {}".format(
+        topology_rank, computation_shape.shape))
+  if computation_stride.shape != (topology_rank,):
+    raise ValueError("computation_stride must have shape [{}]; got {}".format(
+        topology_rank, computation_stride.shape))
+
+  if any(computation_shape < 1):
+    raise ValueError(
+        "computation_shape must be positive; got computation_shape={}".format(
+            computation_shape))
+  if any(computation_stride < 1):
+    raise ValueError(
+        "computation_stride must be positive; got computation_stride={}".format(
+            computation_stride))
+
+  # Computes the physical size of one computation instance.
+  computation_footprint = computation_shape * computation_stride
+  if any(computation_footprint > mesh_shape):
+    raise ValueError(
+        "computation footprint {} does not fit in TPU topology shape {}".format(
+            computation_footprint, mesh_shape))
+
+  # Computes how many copies of the computation footprint fit in the mesh.
+  block_counts = mesh_shape // computation_footprint
+
+  replica_counts = block_counts * computation_stride
+  max_replicas = np.prod(replica_counts)
+  if num_replicas > max_replicas:
+    raise ValueError(
+        "requested {} replicas but only {} replicas with shape {} and "
+        "computation_stride {} fit in a TPU mesh of shape {}".format(
+            num_replicas, max_replicas, computation_shape, computation_stride,
+            mesh_shape))
+
+  def ceil_of_ratio(n, m):
+    return (n + m - 1) // m
+
+  replica_shape = [0] * topology_rank
+  if num_replicas > 0:
+    remaining_replicas = num_replicas
+    remaining_dims = topology_rank
+
+    # Choose dimensions as close to an equal cube as possible, in order of
+    # increasing dimension size. By visiting dimensions in increasing size, we
+    # assign the most constrained dimension first, so we won't make infeasible
+    # choices.
+    #
+    # As a secondary sort order, visit the dimensions in reverse order. This
+    # means we try to use both cores on the same chip in preference to two cores
+    # on different chips.
+    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
+      i = -ni
+      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
+      replica_shape[i] = min(target_size, x)
+      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
+      remaining_dims -= 1
+
+    assert remaining_replicas == 1 and remaining_dims == 0
+
+  # Assigns an offset to each replica such that no two replicas overlap.
+  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
+  for replica in xrange(num_replicas):
+    # Chooses a replica number in each axis.
+    t = replica
+    pos = []
+    for dim in replica_shape[::-1]:
+      pos.append(t % dim)
+      t //= dim
+    replica_pos = np.array(pos[::-1], dtype=np.int32)
+
+    # Determines where that replica starts in each axis.
+    outer = replica_pos // computation_stride
+    inner = replica_pos % computation_stride
+    replica_offsets[replica, :] = outer * computation_footprint + inner
+
+  # Computes a complete logical core -> physical core mapping for each replica.
+  indices = [
+      np.arange(0, computation_shape[i] * computation_stride[i],
+                computation_stride[i]) for i in xrange(topology_rank)
+  ]
+  indices = np.concatenate(
+      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+      axis=-1)
+  indices = indices.reshape((-1, topology_rank))
+  assignment = indices + replica_offsets[:, np.newaxis, :]
+  return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e1ea42370d653d1de7c12eee4b456ec7ce921c
--- /dev/null
+++ b/tensorflow/python/tpu/error_handling.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""ErrorRendezvous handler for collecting errors from multiple threads."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+import threading
+import time
+
+import six
+
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
+
+_UNINTERESTING_ERRORS = (errors.CancelledError,)
+
+
+class ErrorRendezvous(object):
+  """Resolve errors from multiple threads during TPU execution.
+
+  TPU errors can occur on the infeed or outfeed threads as well as the main
+  training thread.
+
+  Depending on which thread "wins" and receives the session error first, we may
+  end up showing users a confusing and non-actionable error message (session
+  cancelled) instead of a root cause (e.g. a bad filename).
+
+  The rendezvous object provides a location to capture these errors until all
+  threads terminate.  At that point we can choose the most informative error
+  to report.
+  """
+
+  def __init__(self, num_sources):
+    # string -> (message, traceback)
+    self._errors = {}
+    self._num_sources = num_sources
+    self._session_cancel_timer = None
+
+  def record_error(self, source, exc_info, session=None):
+    """Report an exception from the given source.
+
+    If a session is passed, a timer will be registered to close it after a few
+    seconds.  This is necessary to ensure the main training loop does not hang
+    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
+    interesting error from another thread to propagate.
+
+    Args:
+      source: string, source of the error
+      exc_info: Output from `sys.exc_info` (type, value, traceback)
+      session: Session to close after delay.
+    """
+    _, value, _ = exc_info
+    self._errors[source] = exc_info
+    logging.info('Error recorded from %s: %s', source, value)
+
+    if session is not None and self._session_cancel_timer is None:
+
+      def _cancel_session():
+        time.sleep(5)
+        try:
+          session.close()
+        except:  # pylint: disable=bare-except
+          pass
+
+      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
+      self._session_cancel_timer.daemon = True
+      self._session_cancel_timer.start()
+
+  def record_done(self, source):
+    """Mark execution source `source` as done.
+
+    If an error was originally reported from `source` it is left intact.
+
+    Args:
+      source: `str`, source being recorded
+    """
+    logging.info('%s marked as finished', source)
+    if source not in self._errors:
+      self._errors[source] = None
+
+  @contextlib.contextmanager
+  def catch_errors(self, source, session=None):
+    """Context manager to report any errors within a block."""
+    try:
+      yield
+    except Exception:  # pylint: disable=broad-except
+      self.record_error(source, sys.exc_info(), session)
+
+  def raise_errors(self, timeout_sec=0):
+    """Wait for up to `timeout` seconds for all error sources to finish.
+
+    Preferentially raise "interesting" errors (errors not in the
+    _UNINTERESTING_ERRORS) set.
+
+    Args:
+      timeout_sec: Seconds to wait for other error sources.
+    """
+    for _ in range(timeout_sec):
+      if len(self._errors) == self._num_sources:
+        break
+      time.sleep(1)
+
+    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
+
+    # First check for any interesting errors, then fall back on the session
+    # cancelled errors etc.
+    for k, (typ, value, traceback) in kept_errors:
+      if isinstance(value, _UNINTERESTING_ERRORS):
+        continue
+      else:
+        logging.warn('Reraising captured error')
+        six.reraise(typ, value, traceback)
+
+    for k, (typ, value, traceback) in kept_errors:
+      logging.warn('Reraising captured error')
+      six.reraise(typ, value, traceback)
diff --git a/tensorflow/python/tpu/experimental/BUILD b/tensorflow/python/tpu/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..407596284907f0cf08f6d1c6c4112c3e09e8baf9
--- /dev/null
+++ b/tensorflow/python/tpu/experimental/BUILD
@@ -0,0 +1,18 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "experimental",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/tpu:tpu_strategy_util",
+    ],
+)
diff --git a/tensorflow/python/tpu/experimental/__init__.py b/tensorflow/python/tpu/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb4e08270a4d249db849bda63f99a43c75b87dd
--- /dev/null
+++ b/tensorflow/python/tpu/experimental/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental TPU library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.tpu import tpu_strategy_util
+# pylint: enable=unused-import
diff --git a/tensorflow/python/tpu/feature_column.py b/tensorflow/python/tpu/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7e93910c834831b5aa79440de3660f974a2310
--- /dev/null
+++ b/tensorflow/python/tpu/feature_column.py
@@ -0,0 +1,435 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU Feature Column Library."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_function
+# pylint: disable=protected-access
+
+
+_TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
+_SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
+                                  fc._VocabularyFileCategoricalColumn,
+                                  fc._VocabularyListCategoricalColumn,
+                                  fc._WeightedCategoricalColumn,
+                                  fc_lib.IdentityCategoricalColumn,
+                                  fc_lib.VocabularyFileCategoricalColumn,
+                                  fc_lib.VocabularyListCategoricalColumn,
+                                  fc_lib.WeightedCategoricalColumn)
+
+
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None):
+  """TPU embedding_column for `tf.feature_column.embedding_column`.
+
+  Note that the interface for TPU embedding_column is different from the non-TPU
+  version. The following args available for the non-TPU version are NOT
+  supported: ckpt_to_load_from, tensor_name_in_ckp, max_norm and trainable.
+
+  Args:
+    categorical_column: A categorical_column returned from
+        categorical_column_with_identity,  weighted_categorical_column,
+        categorical_column_with_vocabulary_list or
+        categorical_column_with_vocabulary_file.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. For more information, see
+      `tf.feature_column.embedding_column`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+
+  Returns:
+    A  _TPUEmbeddingColumn.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if `initializer` is specified but not callable.
+  """
+  if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+    raise TypeError(
+        'categorical_column for tpu '
+        ' embedding_column must be type %s, got %s.' % (' or '.join([
+            cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+        ]), type(categorical_column)))
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = fc._EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=True,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
+  column = _TPUEmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      layer_creator=_creator,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+  # For Embedding column, the initializer is hidden inside the creator Fn, which
+  # is not accessiable later. So, we attach it to a speicial field. Also note
+  # that non-TPU Embedding column and non-TPU shared Embedding column handle the
+  # initializer differently. See shared_embedding_columns for details.
+  column._tpu_initializer = initializer
+  return column
+
+
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None):
+  """List of dense columns that convert from sparse, categorical input."""
+  for categorical_column in categorical_columns:
+    if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+      raise TypeError(
+          'categorical_column for tpu '
+          ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
+              cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+          ]), type(categorical_column)))
+  columns = fc_lib.shared_embedding_columns(
+      categorical_columns,
+      dimension,
+      combiner=combiner,
+      initializer=initializer,
+      shared_embedding_collection_name=shared_embedding_collection_name,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+
+  # Use the initializer and shared_embedding_collection_name to create TPU
+  # version
+  initializer = columns[0].initializer
+  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
+  tpu_columns = []
+
+  # Create the state (_SharedEmbeddingColumnLayer) here.
+  for categorical_column in categorical_columns:
+    column = _TPUSharedEmbeddingColumn(
+        categorical_column=categorical_column,
+        dimension=dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=None,
+        tensor_name_in_ckpt=None,
+        max_norm=None,
+        trainable=True)
+    tpu_columns.append(column)
+
+  return tpu_columns
+
+
+class _TPUBaseEmbeddingColumn(object):
+  """Base class for TPU Embedding Column."""
+
+  def __init__(self, categorical_column):
+    self._tpu_categorical_column = categorical_column
+
+  def get_combiner(self):
+    """Returns the embedding combiner."""
+    raise NotImplementedError('not implemented')
+
+  def get_embedding_table_size(self):
+    """Returns the embedding table size, tuple of vocab size and dimension."""
+    raise NotImplementedError('not implemented')
+
+  def get_feature_key_name(self):
+    """Returns the feature key name in the features dict."""
+    raise NotImplementedError('not impl')
+
+  def get_weight_key_name(self):
+    """Return the key name for weights."""
+    raise NotImplementedError('not impl')
+
+  def get_embedding_var_name(self):
+    """Returns the embedding variable name.
+
+    Feature key name and embedding variable name are usually one-to-one mapping.
+    But for shared embedding columns, it is many-to-one mapping.
+    """
+    raise NotImplementedError('not impl')
+
+  def get_initializer(self):
+    """Returns the initializer."""
+    raise NotImplementedError('not impl')
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    raise NotImplementedError('not impl')
+
+
+class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
+  """Core Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              layer_creator=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
+    # are not supported on TPU. They are solely for matching the signature of
+    # __new__ of parent class fc._EmbeddingColumn.
+    return fc._EmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        layer_creator=layer_creator,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               layer_creator=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.categorical_column.name
+
+  def get_initializer(self):
+    return self._tpu_initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._EmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._EmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(self.get_embedding_var_name(),
+                                    'embedding_weights')
+
+    return tensor
+
+
+class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
+                                fc._SharedEmbeddingColumn):
+  """Core Shared Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              initializer=None,
+              shared_embedding_collection_name=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    return fc._SharedEmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               initializer=None,
+               shared_embedding_collection_name=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.shared_embedding_collection_name
+
+  def get_initializer(self):
+    return self.initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._SharedEmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._SharedEmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(
+        self.get_embedding_var_name(),
+        'embedding_weights',
+        is_shared_embedding=True)
+    return tensor
+
+
+def _record_variable_scope_and_name(embedding_var_name,
+                                    embedding_var_name_in_fc,
+                                    is_shared_embedding=False):
+  """Add embedding variable name and scope to collection."""
+  g = ops.get_default_graph()
+  collection = g.get_collection_ref(_TPU_FC_TO_SCOPE)
+  if not collection:
+    collection.append({})
+
+  var_def_dict = collection[0]
+
+  captured_scope = variable_scope.get_variable_scope()
+  captured_scope_name = captured_scope.name
+
+  if embedding_var_name in var_def_dict:
+    if (var_def_dict[embedding_var_name][0] != captured_scope_name
+        and not is_shared_embedding):
+      raise ValueError(
+          'For embedding var name {}, the variable scope name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       captured_scope_name,
+                                       var_def_dict[embedding_var_name][0]))
+    if var_def_dict[embedding_var_name][1] != embedding_var_name_in_fc:
+      raise ValueError(
+          'For embedding var name {}, the embedding name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       embedding_var_name_in_fc,
+                                       var_def_dict[embedding_var_name][1]))
+  else:
+    var_def_dict[embedding_var_name] = (captured_scope_name,
+                                        embedding_var_name_in_fc)
+
+
+def _is_running_on_cpu():
+  """Returns True if the current context is CPU model."""
+  return tpu_function.get_tpu_context().number_of_shards is None
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column_test.py b/tensorflow/python/tpu/feature_column_test.py
similarity index 97%
rename from tensorflow/contrib/tpu/python/tpu/feature_column_test.py
rename to tensorflow/python/tpu/feature_column_test.py
index 75164cce4c261cc541dd6b01ee22699d286d9621..99e66de2ba7ae0aa09c0e2221bead0ca52df66b1 100644
--- a/tensorflow/contrib/tpu/python/tpu/feature_column_test.py
+++ b/tensorflow/python/tpu/feature_column_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-"""Tests for contrib.tpu.python.tpu.feature_column."""
+"""Tests for python.tpu.feature_column."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,17 +20,18 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tpu.python.tpu import feature_column as tpu_fc
 from tensorflow.python.client import session
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.feature_column import feature_column_lib as fc_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import feature_column as tpu_fc
 
 
 def _initialized_session():
@@ -77,6 +78,7 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column._parse_example_spec)
 
+  @test_util.deprecated_graph_mode_only
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -140,6 +142,7 @@ class EmbeddingColumnTest(test.TestCase):
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.deprecated_graph_mode_only
   def test_defaults(self):
     categorical_column_a = fc_lib.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -176,6 +179,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.deprecated_graph_mode_only
   def test_all_constructor_args(self):
     categorical_column_a = fc_lib.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -213,6 +217,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.deprecated_graph_mode_only
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
diff --git a/tensorflow/python/tpu/functional.py b/tensorflow/python/tpu/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..045ec523bbbd1a16f616fbf2c3b11b66053968f5
--- /dev/null
+++ b/tensorflow/python/tpu/functional.py
@@ -0,0 +1,23 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Functional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.tpu.ops import tpu_ops
+
+TPUPartitionedCall = tpu_ops.tpu_partitioned_call  # pylint: disable=invalid-name
diff --git a/tensorflow/python/tpu/ops/tpu_ops.py b/tensorflow/python/tpu/ops/tpu_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bce994af73e137b2c73c590ca9b5fe94e41775e8
--- /dev/null
+++ b/tensorflow/python/tpu/ops/tpu_ops.py
@@ -0,0 +1,436 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops import gen_tpu_ops
+from tensorflow.python.ops.gen_tpu_ops import *
+# pylint: enable=wildcard-import,unused-import
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+
+
+def _create_default_group_assignment():
+  num_shards = tpu_function.get_tpu_context().number_of_shards
+  if num_shards is None:
+    logging.warning(
+        "cross_replica_sum should be used within a tpu_shard_context, but "
+        "got unset number_of_shards. Assuming 1.")
+    num_shards = 1
+  group_assignment = [list(range(num_shards))]
+  return group_assignment
+
+
+def all_to_all(x,
+               concat_dimension,
+               split_dimension,
+               split_count,
+               group_assignment=None,
+               name=None):
+  """Exchange data across TPU replicas.
+
+  Args:
+    x: The local tensor.
+    concat_dimension: The dimension number to concatenate.
+    split_dimension: The dimension number to split.
+    split_count: The number of splits, this number must equal to the sub-group
+      size(group_assignment.get_shape()[1])
+    group_assignment: Optional 2d int32 lists with shape [num_groups,
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is concatenated by data from different replicas.
+  """
+  if group_assignment is None:
+    group_assignment = _create_default_group_assignment()
+  return gen_tpu_ops.all_to_all(
+      x,
+      group_assignment,
+      concat_dimension=concat_dimension,
+      split_dimension=split_dimension,
+      split_count=split_count,
+      name=name)
+
+
+@ops.RegisterGradient("AllToAll")
+def _all_to_all_grad(op, grad):
+  # The gradient of a all-to-all is also a all-to-all but the
+  # split_dimension and concat_dimension is swapped.
+  # The graident with respect to group_assignment is None.
+  return [
+      gen_tpu_ops.all_to_all(
+          grad,
+          op.inputs[1],
+          concat_dimension=op.get_attr("split_dimension"),
+          split_dimension=op.get_attr("concat_dimension"),
+          split_count=op.get_attr("split_count")), None
+  ]
+
+
+def cross_replica_sum(x, group_assignment=None, name=None):
+  """Sum the input tensor across replicas according to group_assignment.
+
+  Args:
+    x: The local tensor to the sum.
+    group_assignment: Optional 2d int32 lists with shape [num_groups,
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is summed across replicas.
+  """
+  if group_assignment is None:
+    group_assignment = _create_default_group_assignment()
+
+  return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
+
+
+def collective_permute(x, source_target_pairs, name=None):
+  """Permute the input tensor across replicas given source_target_pairs.
+
+  For each source_target_pair <a, b>, we send replica a's input to replica b.
+  Each replica id must only appear once in the source column. Also it must
+  only appear once in the target column.
+  For the replica id not in the target column, this op returns a zero tensor
+  with the same shape and dtype of the input x.
+
+  For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+  source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
+  `[0, A, B, C]`.
+
+  Args:
+    x: The local tensor to be permuted.
+    source_target_pairs: 2d int lists with shape [num_pairs, 2].
+      source_target_pairs[i][0] represents the source replica id and
+      source_target_pairs[i][1] represents the target replica id.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is permuted.
+  """
+  return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
+
+
+@ops.RegisterGradient("CollectivePermute")
+def _collective_permute_grad(op, grad):
+  # The gradient of a collective permute operation is also a collective
+  # permute, but with source/target pairs reversed. The gradient with respect
+  # to input argument `source_target_pairs` is `None`.
+  source_target_pairs = op.inputs[1][:, ::-1]
+  return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
+
+
+@ops.RegisterGradient("CrossReplicaSum")
+def _cross_replica_sum_grad(op, grad):
+  # The gradient of a cross replica sum is also a cross-replica sum.
+  # The gradient with respect to group_assignment is None.
+  return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
+
+
+# This extra type checking exists to give a more helpful error message in
+# the common case that uint8 and int64 values are infed. Remove when both
+# types are supported.
+
+_SUPPORTED_INFEED_DTYPES = set([
+    dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
+    dtypes.complex64, dtypes.uint32
+])
+
+
+@ops.RegisterGradient("TPUEmbeddingActivations")
+def _embedding_activations_grad(activations_op, grad_wrt_activations):
+  """Saves the gradient of embedding activations ops in a graph collection."""
+  g = ops.get_default_graph()
+  table_id = activations_op.get_attr("table_id")
+  lookup_id = activations_op.get_attr("lookup_id")
+  table_gradients = g.get_collection_ref(
+      "tpu_embedding_gradients_table_%d" % table_id)
+
+  if not table_gradients:
+    raise RuntimeError(
+        "Gradients for TPUEmbedding have been generated in non-training mode."
+        "This is not expected. Consider putting your Optimizer.minimize code "
+        "behind the training mode condition check. For Estimator, you can "
+        "do \n\n"
+        "    if mode == tf.estimator.ModeKeys.TRAIN:\n"
+        "        train_op = opt.minimize(loss)\n"
+        "\n")
+
+  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
+  return [
+      # RegisterGradient requires that value be returned for all inputs. Since
+      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
+      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
+      # embedding activations (grad_wrt_activations) has the same shape as the
+      # activations returned by  embedding_activations.
+      array_ops.zeros(arg.shape, dtype=dtypes.float32)
+      for arg in activations_op.inputs
+  ]
+
+
+def infeed_dequeue(dtype, shape, name=None):
+  """A placeholder op for a value that will be fed into the computation.
+
+  Args:
+    dtype: A `tf.DType`. The type of elements in the tensor.
+    shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `dtype`.
+    A tensor that will be provided using the infeed mechanism.
+
+  Raises:
+    TypeError: If 'dtype` is not a supported infeed type.
+  """
+  if dtype not in _SUPPORTED_INFEED_DTYPES:
+    raise TypeError(
+        "{} is not a supported TPU infeed type. Supported types are: "
+        "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+
+  return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
+
+
+# pylint: disable=redefined-outer-name
+def infeed_dequeue_tuple(dtypes, shapes, name=None):
+  """A placeholder op for values fed into the TPU simultaneously as a tuple.
+
+  Args:
+    dtypes: A list of `tf.DType`s that has length `>= 1`.
+      The element types of each element in `outputs`.
+    shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+      The shapes of each tensor in `outputs`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` objects of type `dtypes`.
+    A list of tensors that will be provided using the infeed mechanism.
+
+  Raises:
+    TypeError: If a type in 'dtypes` is not a supported infeed type.
+  """
+  for dtype in dtypes:
+    if dtype not in _SUPPORTED_INFEED_DTYPES:
+      raise TypeError(
+          "{} is not a supported TPU infeed type. Supported types are: "
+          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+  return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
+# pylint: enable=redefined-outer-name
+
+
+# pylint: disable=protected-access
+def send_tpu_embedding_gradients(inputs,
+                                 config,
+                                 learning_rates=None,
+                                 name=None):
+  """A placeholder op for feeding per-sample gradients to the embedding layer.
+
+  Args:
+    inputs: A TensorList of gradients with which to update embedding tables.
+        This argument has the same length and shapes as the return value of
+        RecvTPUEmbeddingActivations, but contains gradients of the model's
+        loss with respect to the embedding activations. The embedding tables
+        are updated from these gradients via the optimizers specified in the
+        TPU embedding configuration given to tpu.initialize_system.
+    config: Serialized TPUEmbeddingConfiguration proto.
+    learning_rates: A TensorList of float32 scalars, one for each dynamic
+        learning rate tag: see the comments in
+        //third_party/tensorflow/core/protobuf/tpu/
+                                             optimization_parameters.proto.
+        Multiple tables can share the same dynamic learning rate tag as
+        specified in the configuration. If the learning rates for all tables
+        are constant, this list should be empty.
+    name: A name for the operation (optional).
+
+  Returns:
+    A SendTPUEmbeddingGradients operation.
+  """
+  if learning_rates is None:
+    learning_rates = []
+  return gen_tpu_ops.send_tpu_embedding_gradients(
+      inputs=inputs, learning_rates=learning_rates, config=config, name=name)
+
+
+send_tpu_embedding_gradients.__doc__ = (
+    gen_tpu_ops.send_tpu_embedding_gradients.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_integer_batch(batch,
+                                        device_ordinal,
+                                        mode_override=None,
+                                        name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    batch: A list of 1D tensors, one for each embedding table, containing the
+      indices into the tables.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingIntegerBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_integer_batch(
+      batch=batch,
+      device_ordinal=device_ordinal,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_integer_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_integer_batch.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_sparse_batch(sample_indices,
+                                       embedding_indices,
+                                       aggregation_weights,
+                                       device_ordinal,
+                                       combiners=None,
+                                       mode_override=None,
+                                       name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    sample_indices: A list of rank 1 Tensors specifying the training example
+      and feature to which the corresponding embedding_indices and
+      aggregation_weights values belong. sample_indices[i] must equal b * nf +
+      f, where nf is the number of features from the corresponding table, f is
+      in [0, nf), and b is in [0, batch size). Both int32 and int64 are allowed,
+      and will be converted to int32 internally.
+    embedding_indices: A list of rank 1 Tensors, indices into the embedding
+      tables. Both int32 and int64 are allowed and will be converted to int32
+      internally.
+    aggregation_weights: A list of rank 1 Tensors containing per sample --
+      i.e. per (training example, feature) -- aggregation weights. Both float32
+      and float64 are allowed and will be converted to float32 internally.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    combiners: A list of string scalars, one for each embedding table that
+      specify how to normalize the embedding activations after weighted
+      summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+      invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+      squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+      is to use 'sum' for all tables (optional).
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingSparseBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_sparse_batch(
+      sample_indices=sample_indices,
+      embedding_indices=embedding_indices,
+      aggregation_weights=aggregation_weights,
+      device_ordinal=device_ordinal,
+      combiners=combiners,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_sparse_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_sparse_batch.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
+                                              embedding_indices,
+                                              aggregation_weights,
+                                              table_ids,
+                                              device_ordinal,
+                                              combiners=None,
+                                              mode_override=None,
+                                              name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    sample_indices: A list of rank 2 Tensors specifying the training example
+      to which the corresponding embedding_indices and aggregation_weights
+      values belong. It corresponds to sp_ids.indices in
+      embedding_lookup_sparse(). If the size of its first dimension is 0, we
+      assume each embedding_indices belongs to a different sample. Both int32
+      and int64 are allowed and will be converted to int32 internally.
+    embedding_indices: A list of rank 1 Tensors, indices into the embedding
+      tables. It corresponds to sp_ids.values in embedding_lookup_sparse(). Both
+      int32 and int64 are allowed and will be converted to int32 internally.
+    aggregation_weights: A list of rank 1 Tensors containing per training
+      example aggregation weights. It corresponds to sp_weights.values in
+      embedding_lookup_sparse(). If the size of its first dimension is 0, we
+      assume all weights are 1. Both float32 and float64 are allowed and will
+      be converted to float32 internally.
+    table_ids: A list of integers specifying the identifier of the embedding
+      table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
+      lookup the corresponding input. The ith input is looked up using
+      table_ids[i]. The size of the table_ids list must be equal to that of
+      sample_indices, embedding_indices and aggregation_weights.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    combiners: A list of string scalars, one for each embedding table that
+      specify how to normalize the embedding activations after weighted
+      summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+      invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+      squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+      is to use 'sum' for all tables (optional).
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingSparseTensorBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch(
+      sample_indices=sample_indices,
+      embedding_indices=embedding_indices,
+      aggregation_weights=aggregation_weights,
+      table_ids=table_ids,
+      device_ordinal=device_ordinal,
+      combiners=combiners,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
diff --git a/tensorflow/tools/dist_test/server/__init__.py b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
similarity index 85%
rename from tensorflow/tools/dist_test/server/__init__.py
rename to tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
index 29c7bf77319774582514c929bb10fbf28c0770d5..1f2dce26cd5dd1183d51491715186f57fbe95fab 100644
--- a/tensorflow/tools/dist_test/server/__init__.py
+++ b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
+# =============================================================================
+
+"""Operations to select TPU core to run."""
 
-"""TensorFlow GRPC server."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/tpu/profiler/BUILD b/tensorflow/python/tpu/profiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bfe79454cc16db7dedadcddb1aad20c3d7792528
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/BUILD
@@ -0,0 +1,27 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "profiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/core/profiler:profiler_analysis_proto_py",
+        "//tensorflow/core/profiler:protos_all_py",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "tpu_profiler_analysis_pb2_grpc",
+    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/core/profiler:profiler_analysis_proto_py"],
+)
diff --git a/tensorflow/python/tpu/profiler/__init__.py b/tensorflow/python/tpu/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c183aaf53543f7bb38475525d1777048925ff62
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Classes for TPU trace events."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.core.profiler.trace_events_pb2 import *
+from tensorflow.core.profiler.profiler_analysis_pb2 import *
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
similarity index 100%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
rename to tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
diff --git a/tensorflow/python/tpu/session_support.py b/tensorflow/python/tpu/session_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cca8aeb55b96adfcdaf1257a9c209ebf8ce57ad
--- /dev/null
+++ b/tensorflow/python/tpu/session_support.py
@@ -0,0 +1,421 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Operations for handling session logging and shutdown notifications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import time
+from google.protobuf import text_format
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+
+_WATCHDOG = None
+
+
+class CoordinatorShutdownException(Exception):
+  """Raised when the coordinator needs to shutdown."""
+  pass
+
+
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
+class WorkerHeartbeatManager(object):
+  """Manages the status/heartbeat monitor for a set of workers."""
+
+  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
+    """Construct a new WorkerHeartbeatManager.
+
+    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
+
+    Args:
+      session: `tf.Session`, session to use for heartbeat operations.
+      devices: `list[string]` Set of devices to connect to.
+      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
+      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
+        the WorkerHeartbeatRequest protocol buffer.
+    """
+    self._session = session
+    self._devices = devices
+    self._ops = heartbeat_ops
+    self._request_placeholder = request_placeholder
+
+  @staticmethod
+  def from_devices(session, devices):
+    """Construct a heartbeat manager for the given devices."""
+    if not devices:
+      logging.error('Trying to create heartbeat manager with no devices?')
+
+    logging.info('Creating heartbeat manager for %s', devices)
+    request_placeholder = array_ops.placeholder(
+        name='worker_heartbeat_request', dtype=dtypes.string)
+
+    heartbeat_ops = []
+    for device in devices:
+      with ops.device(device):
+        heartbeat_ops.append(tpu_ops.worker_heartbeat(request_placeholder))
+
+    return WorkerHeartbeatManager(session, devices, heartbeat_ops,
+                                  request_placeholder)
+
+  def num_workers(self):
+    return len(self._devices)
+
+  def configure(self, message):
+    """Configure heartbeat manager for all devices.
+
+    Args:
+      message: `event_pb2.WorkerHeartbeatRequest`
+    Returns: `None`
+    """
+    logging.info('Configuring worker heartbeat: %s',
+                 text_format.MessageToString(message))
+    self._session.run(self._ops,
+                      {self._request_placeholder: message.SerializeToString()})
+
+  def ping(self, request=None, timeout_in_ms=5000):
+    """Ping all workers, returning the parsed status results."""
+    if request is None:
+      request = event_pb2.WorkerHeartbeatRequest()
+
+    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
+    results = self._session.run(
+        self._ops,
+        feed_dict={self._request_placeholder: request.SerializeToString()},
+        options=options)
+    parsed_results = [
+        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
+        for res_pb in results
+    ]
+    logging.debug('Ping results: %s', parsed_results)
+    return parsed_results
+
+  def lame_workers(self):
+    """Ping all workers, returning manager containing lame workers (or None)."""
+    ping_results = self.ping()
+    lame_workers = []
+
+    for ping_response, device, op in zip(ping_results, self._devices,
+                                         self._ops):
+      if ping_response.health_status != event_pb2.OK:
+        lame_workers.append((device, op))
+
+    if not lame_workers:
+      return None
+
+    bad_devices, bad_ops = zip(*lame_workers)
+    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
+                                  self._request_placeholder)
+
+  def __repr__(self):
+    return 'HeartbeatManager(%s)' % ','.join(self._devices)
+
+  def shutdown(self, timeout_ms=10000):
+    """Shutdown all workers after `shutdown_timeout_secs`."""
+    logging.info('Shutting down %s.', self)
+    req = event_pb2.WorkerHeartbeatRequest(
+        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
+        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
+    self.configure(req)
+
+    # Wait for workers to shutdown.  This isn't strictly required
+    # but it avoids triggering multiple checkpoints with the same lame worker.
+    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
+    time.sleep(timeout_ms / 1000)
+
+
+def all_worker_devices(session):
+  """Return a list of devices for each worker in the system."""
+  devices = session.list_devices()
+
+  devices_that_support_heartbeats = []
+
+  for device in devices:
+    name = device.name
+    # Pick devices that have a TPU but target the attached CPU
+    if ':TPU:0' in name and 'coordinator' not in name:
+      devices_that_support_heartbeats.append(name.replace('TPU', 'CPU'))
+
+  return devices_that_support_heartbeats
+
+
+class WatchdogManager(threading.Thread):
+  """Configures worker watchdog timer and handles periodic pings.
+
+  Usage:
+    # Ping workers every minute, shutting down workers if they haven't received
+    # a ping after 1 hour.
+    watchdog_manager = WatchdogManager(
+      ping_interval=60, shutdown_timeout=3600
+    )
+
+    # Use as a context manager, resetting watchdog on context exit:
+    with watchdog_manager:
+      session.run(...)
+
+    # Or setup globally; watchdog will remain active until program exit.
+    watchdog_manager.configure_and_run()
+  """
+
+  def __init__(self,
+               session,
+               devices=None,
+               ping_interval=60,
+               shutdown_timeout=3600):
+    """Initialize a watchdog manager.
+
+    Args:
+      session: Session connected to worker devices.  A cloned session and graph
+        will be created for managing worker pings.
+      devices: Set of devices to monitor.  If none, all workers will be
+        monitored.
+      ping_interval: Time, in seconds, between watchdog pings.
+      shutdown_timeout: Time, in seconds, before watchdog timeout.
+    """
+    threading.Thread.__init__(self)
+    self.ping_interval = ping_interval
+    self.shutdown_timeout = shutdown_timeout
+    self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
+    self._target = session.sess_str
+    self._running = False
+    self._devices = devices
+
+    self._graph = None
+    self._session = None
+    self._worker_manager = None
+
+  def _reset_manager(self):
+    """Reset the graph, session and worker manager."""
+    self._graph = ops.Graph()
+    self._session = session_lib.Session(
+        target=self._target,
+        graph=self._graph,
+        config=self._config,
+    )
+
+    if self._devices is None:
+      self._devices = all_worker_devices(self._session)
+
+    with self._graph.as_default():
+      self._worker_manager = WorkerHeartbeatManager.from_devices(
+          self._session, self._devices)
+
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(
+                timeout_ms=self.shutdown_timeout * 1000,),
+            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+
+  def configure_and_run(self):
+    logging.info(
+        'Enabling watchdog timer with %d second timeout '
+        'and %d second ping interval.', self.shutdown_timeout,
+        self.ping_interval)
+    self._reset_manager()
+    self._running = True
+    self.start()
+
+  def stop(self):
+    logging.info('Stopping worker watchdog.')
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
+            shutdown_mode=event_pb2.NOT_CONFIGURED))
+    self._running = False
+    self.join()
+
+  def __enter__(self):
+    self.configure_and_run()
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self.stop()
+
+  def run(self):
+    # Don't fetch logs or adjust timing: just ping the watchdog.
+    #
+    # If we hit an exception, reset our session as it is likely broken.
+    while self._running:
+      try:
+        self._worker_manager.ping(request=None)
+        time.sleep(self.ping_interval)
+      except errors.OpError as e:
+        # Catch any TF errors that occur so we don't stop sending heartbeats
+        logging.debug('Caught error while sending heartbeat: %s', e)
+        self._reset_manager()
+
+
+def start_worker_watchdog(session,
+                          devices=None,
+                          ping_interval=60,
+                          shutdown_timeout=3600):
+  """Start global worker watchdog to shutdown workers on coordinator exit."""
+  global _WATCHDOG
+  if _WATCHDOG is None:
+    # Ensure we can send a few pings before we timeout!
+    ping_interval = min(shutdown_timeout / 10., ping_interval)
+    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
+                                shutdown_timeout)
+    _WATCHDOG.configure_and_run()
+
+
+class GracefulShutdownHook(session_run_hook.SessionRunHook):
+  """Session hook that watches for shutdown events.
+
+  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
+  SystemShutdown exception is raised to terminate the main session.  If `saver`
+  is None the `SAVERS` collection will be read to find a saver.
+
+  `on_shutdown_hooks` is an optional list of functions that should be called
+  after checkpointing.  The function is called with (`run_context`,
+  `all_workers`, `lame_workers`).
+
+  If `heartbeat_group` is not specified, it will default to all CPU workers
+  in the system.
+  """
+
+  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
+    self._saver = saver
+    self._checkpoint_prefix = checkpoint_prefix
+    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
+
+    # Worker heartbeats are managed independently of the main training graph.
+    self._graph = ops.Graph()
+    self._workers = None
+    self._session = None
+    self._heartbeat_supported = False
+
+  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
+    # N.B. We have to pull the global step here to avoid it being unavailable
+    # at checkpoint time; the graph has been frozen at that point.
+    if training_util.get_global_step() is None and self.saver() is not None:
+      raise ValueError(
+          'Saver defined but no global step.  Run `get_or_create_global_step()`'
+          ' in your model definition to allow checkpointing.')
+
+    with self._graph.as_default():
+      logging.info('Installing graceful shutdown hook.')
+      self._session = _clone_session(training_session, self._graph)
+      self._workers = WorkerHeartbeatManager.from_devices(
+          self._session, all_worker_devices(self._session))
+      self._heartbeat_supported = self._workers.num_workers() > 0
+      if self._heartbeat_supported:
+        try:
+          self._workers.configure(
+              event_pb2.WorkerHeartbeatRequest(
+                  shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+        except errors.InvalidArgumentError:
+          logging.warn(
+              'TPU device does not support heartbeats. Failure '
+              'handling will be disabled.')
+          self._heartbeat_supported = False
+      else:
+        logging.warn(
+            'No workers support hearbeats. Failure handling will be disabled.')
+
+  def saver(self):
+    if self._saver:
+      return self._saver
+
+    savers = ops.get_collection(ops.GraphKeys.SAVERS)
+    if not savers:
+      return None
+
+    if not isinstance(savers, list):
+      return savers
+
+    if len(savers) > 1:
+      logging.error(
+          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
+          'will be disabled. Pass an explicit `saver` to the constructor to '
+          'override this behavior.')
+      return None
+
+    return savers[0]
+
+  def after_run(self, run_context, run_values):
+    del run_values
+
+    if not self._heartbeat_supported:
+      return
+
+    lame_workers = self._workers.lame_workers()
+    if lame_workers:
+      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
+
+      if self.saver():
+        logging.info('ShutdownHook: saving checkpoint to %s',
+                     self._checkpoint_prefix)
+        self.saver().save(
+            run_context.session,
+            self._checkpoint_prefix,
+            global_step=training_util.get_global_step(),
+            write_state=True,
+        )
+      else:
+        logging.info('ShutdownHook: no Saver defined.')
+
+      for fn in self._on_shutdown_hooks:
+        fn(run_context, self._workers, lame_workers)
+
+
+class RestartComputation(object):
+  """Restart the entire computation.
+
+  This hook shuts down all workers and returns control to the top-level by
+  throwing a CoordinatorShutdownException.
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    del run_context, lame_workers
+    all_workers.shutdown(timeout_ms=self.timeout_ms)
+
+    logging.info('Terminating coordinator.')
+    raise CoordinatorShutdownException()
+
+
+class ShutdownLameWorkers(object):
+  """Shutdown lamed workers.
+
+  Processing will continue normally (typically by waiting for the down
+  workers to be restarted).
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_in_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d37f1b64b8504a96f8abbb5aa399a619d7a8796
--- /dev/null
+++ b/tensorflow/python/tpu/tensor_tracer.py
@@ -0,0 +1,1640 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A utility to trace tensor values on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+import re
+import sys
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_io
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu.ops import tpu_ops
+
+_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
+_DEVICE_TYPE_TPU = 'tpu'
+_DEVICE_TYPE_CPU = 'cpu'
+_TRACE_MODE_NAN_INF = 'nan-inf'
+_TRACE_MODE_PART_TENSOR = 'part-tensor'
+_TRACE_MODE_PART_TENSOR_SIZE = 3
+_TRACE_MODE_FULL_TENSOR = 'full-tensor'
+_TRACE_MODE_NORM = 'norm'
+_TRACE_MODE_MAX_ABS = 'max-abs'
+_SUBMODE_BRIEF = 'brief'
+_SUBMODE_DETAILED = 'detailed'
+_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
+_REASON_WHILELOOP_OP = 'not-traced-special-whileloop-op'
+_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
+_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
+_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
+_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_REASON_SCALAR_GET_TRACED = 'traced-scalar'
+_REASON_TENSOR_GET_TRACED = 'traced-tensor'
+_REASON_USER_INCLUDED = 'traced-user-included'
+_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
+_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
+_REASON_FEEDS_WHILELOOP_OP = 'not-traced-feeds-special-whileloop-op'
+_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
+_MARKER_SECTION_END = '!!!!!!! section-end:'
+_SECTION_NAME_CONFIG = 'configuration'
+_SECTION_NAME_REASON = 'reason'
+_SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_TENSOR_LIST = 'tensor-list'
+_SECTION_NAME_CACHE_INDEX_MAP = 'cache-index-map'
+_SECTION_NAME_GRAPH = 'graph'
+_FIELD_NAME_VERSION = 'version:'
+_FIELD_NAME_DEVICE = 'device:'
+_FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_SUBMODE = 'submode:'
+_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
+_FIELD_NAME_NUM_REPLICAS_PER_HOST = 'num-replicas-per-host:'
+_FIELD_NAME_NUM_HOSTS = 'num-hosts:'
+_FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
+_FIELD_NAME_NUM_CACHE_INDICES = 'number-of-indices:'
+_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
+_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
+_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
+_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
+_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
+_FLAG_NAME_ENABLE = 'enable'
+_FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
+_FLAG_NAME_SUBMODE = 'submode'
+_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
+_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
+_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
+_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
+_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
+_FLAG_NAME_TRACE_DIR = 'trace_dir'
+_FLAG_NAME_REPORT_FILE = 'report_file'
+_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
+_FLAG_NAME_OP_RANGE = 'op_range'
+# Folder to dump the pre (before tensor tracer updates) and post graphs (after
+# tensor tracer updates).
+_FLAG_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs'
+_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
+_OUTPUT_STREAM_ESCAPE = 'file://'
+_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
+_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
+_TRACE_FILE_NAME = 'trace.all'
+_COMPACT_TRACE_FILE_PREFIX = 'compact_trace.'
+_COMPACT_TRACE_ENTRY_INIT_VALUE = -1.0
+_TENSOR_TRACER_STORAGE = 'tensor_tracer_storage'
+_TENSOR_VALUES_CACHE = 'tensor_values_cache'
+_REPLICA_ID_TAG = '#replica-id: '
+
+
+def tensor_tracepoint(tensor, checkpoint_name):
+  """Adds a checkpoint with the given checkpoint name for the given tensor.
+
+  The tensor will be added to the list of tensors that will be traced by the
+  tensor tracer.
+
+  Args:
+     tensor: the tensor object for which the tracing is requested.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+  Returns:
+    The provided tensor.
+  """
+
+  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
+  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
+                                 (tensor, checkpoint_name))
+  return tensor
+
+
+def keras_layer_tracepoint(layer, checkpoint_name):
+  """An interface for adding the tensor outputs of a keras layer.
+
+  Encapsulates tensor_tracepoint.
+
+  Args:
+     layer: A keras layer.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+
+  Returns:
+    The provided layer.
+  """
+  try:
+    outputs = layer.output
+    if tensor_util.is_tensor(outputs):
+      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
+    else:
+      idx = 0
+      for output_tensor in outputs:
+        if tensor_util.is_tensor(outputs):
+          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
+        idx += 1
+  except AttributeError:
+    pass
+  except RuntimeError:
+    pass
+  return layer
+
+
+def _trace_files_need_precreated(output_dir):
+  """Return True if trace files must be pre-created by users."""
+
+  if not output_dir.startswith('/'):
+    return False
+  if len(output_dir) < 5:
+    return False
+  if output_dir[2] != 'n':
+    return False
+  if output_dir[3] != 's':
+    return False
+  if output_dir[1] != 'c':
+    return False
+  if output_dir[4] != '/':
+    return False
+  return True
+
+
+def _get_tensor_values_cache(graph=None):
+  """Returns the variable that implements tensor-value caching."""
+
+  graph = graph or ops.get_default_graph()
+  collection = graph.get_collection(_TENSOR_TRACER_STORAGE)
+  if len(collection) == 1:
+    return collection[0]
+  elif not collection:
+    raise RuntimeError('%s has not been created'%_TENSOR_VALUES_CACHE)
+  else:
+    raise RuntimeError('Multiple %s created'%_TENSOR_VALUES_CACHE)
+  return None
+
+
+def _create_tensor_values_cache(graph, num_tensors):
+  """Creates a variable as the cache to store intermediate tensor values."""
+  graph = graph or ops.get_default_graph()
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        _TENSOR_VALUES_CACHE,
+        shape=[num_tensors],
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(
+            _COMPACT_TRACE_ENTRY_INIT_VALUE),
+        trainable=False,
+        use_resource=True,
+        collections=[_TENSOR_TRACER_STORAGE, ops.GraphKeys.GLOBAL_VARIABLES])
+
+
+class TensorTracer(object):
+  """A software construct for tracing tensor values in a TF graph on TPU.
+
+  This utility is disabled by default. It can be enabled by setting
+  the TENSOR_TRACER_FLAGS env variable as:
+    export TENSOR_TRACER_FLAGS="--enable=1"
+  If it is enabled, it will trace the output tensor values of
+  selected Ops in the graph. It has two outputs: (1) the traces and (2)
+  a report. The traces are dumped to a specified local file on the TPU
+  host. The report is printed to the log.info of the TPU job.
+  By passing options via the env variable, users can change:
+     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
+         full tensor values)
+     (2) which Ops to be traced (via op.name or op.type)
+     (3) output trace file path.
+  """
+  # The set of graphs that are rewritten by tensor tracer.
+  _traced_graphs = set()
+  @staticmethod
+  def _match_next_flag(flags, pos):
+    """Returns the match for the next TensorTracer flag.
+
+    Args:
+       flags: a string that contains the flags.
+       pos: where in flags to start the search.
+
+    Returns:
+       A pair where the first element is the regular-expression
+       match found and the second element indicates if the match
+       has a value.
+    """
+
+    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
+    if match:
+      # The flag is found but is not given a value.
+      return match, False
+    # The flag is not found.
+    return None, False
+
+  @staticmethod
+  def validate_flag_names():
+    """Validates if the TensorTrace flags passed are valid."""
+    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
+                        _FLAG_NAME_USE_COMPACT_TRACE,
+                        _FLAG_NAME_SUBMODE,
+                        _FLAG_NAME_EXCLUDED_OPNAMES,
+                        _FLAG_NAME_EXCLUDED_OPTYPES,
+                        _FLAG_NAME_INCLUDED_OPNAMES,
+                        _FLAG_NAME_INCLUDED_OPTYPES,
+                        _FLAG_NAME_TRACE_DIR,
+                        _FLAG_NAME_REPORT_FILE,
+                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
+                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
+                        _FLAG_NAME_OP_RANGE,
+                        _FLAG_DUMP_BEFORE_AFTER_GRAPHS]
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return
+    pos = 0
+    while True:
+      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if flag_name not in valid_flag_names:
+        raise ValueError(
+            'The flag name "%s" passed via the environment variable "%s" '
+            'is invalid. Valid flag names are:'
+            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
+      pos = match.end()
+
+  @staticmethod
+  def print_flag_values():
+    """Prints all TensorTracer flags passed via environment variables."""
+
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
+    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
+                                                   tensor_tracer_flags)
+    result += 'Individual flag value:\n'
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      result += '  %s: %s\n'%(flag_name, flag_value)
+      pos = match.end()
+    result += '\n'
+    return result
+
+  @staticmethod
+  def get_flag_value(wanted_flag_name):
+    """Returns the value of a TensorTracer flags.
+
+    Args:
+      wanted_flag_name: the name the the flag we are looking for.
+
+    Returns:
+      A pair where the first element indicates if the flag is
+      found and the second element is the value of the flag.
+
+    Raises:
+      RuntimeError: If supposedly deadcode is reached.
+    """
+
+    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return False, None
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        return False, None
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      if flag_name == wanted_flag_name:
+        return True, flag_value
+      pos = match.end()
+    raise RuntimeError('Should not reach here.')
+
+  @staticmethod
+  def flag_value_to_re_list(flag_name):
+    """Converts list of strings to compiled RE."""
+
+    re_list = []
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found or not flag_value:
+      return re_list
+    list_of_values = flag_value.split()
+    for v in list_of_values:
+      r = re.compile(v)
+      re_list.append(r)
+    return re_list
+
+  @staticmethod
+  def _is_flag_on(flag_name):
+    """Returns True if the given flag is on."""
+
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found:
+      return False
+    if flag_value is None:
+      return True
+    # Depends on the flag value.
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
+
+  @staticmethod
+  def use_test_undeclared_outputs_dir():
+    """Decides the output directory of the report and trace files.
+
+    Args:
+       None.
+
+    Returns:
+       True if the output files should be written to the
+       test-undeclared-outputs-directory defined via an
+       env variable.
+    """
+
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
+
+  @staticmethod
+  def use_compact_trace():
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_COMPACT_TRACE)
+
+  @staticmethod
+  def check_device_type(device_type):
+    """Checks if the given device type is valid."""
+
+    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
+      raise ValueError('Invalid device_type "%s"'%device_type)
+
+  @staticmethod
+  def check_trace_mode(trace_mode):
+    """Checks if the given trace mode is valid."""
+
+    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
+                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
+                         _TRACE_MODE_MAX_ABS]
+    if trace_mode not in valid_trace_modes:
+      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
+                       'Valid trace modes are: %s'%(trace_mode,
+                                                    valid_trace_modes))
+
+  @staticmethod
+  def check_submode(submode):
+    """Checks if the given submode is valid."""
+
+    if not submode:
+      return
+    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
+    if submode not in valid_submodes:
+      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
+                       'Valid submodes are: %s'%(submode,
+                                                 valid_submodes))
+
+  @staticmethod
+  def loop_cond_op(op):
+    return op.type in ('LoopCond', 'RefLoopCond')
+
+  @staticmethod
+  def while_loop_op(op):
+    """Returns true if op is one of the special ops of in a while loop.
+
+    Args:
+       op: A tf.Operation.
+
+    Returns:
+       True if the given op is one of [Switch, Merge, Enter, Exit,
+       NextIteration, LoopCond], which are all building blocks for TF while
+       loops.
+    """
+    return  (control_flow_util.IsLoopSwitch(op) or
+             control_flow_util.IsLoopMerge(op) or
+             control_flow_util.IsLoopEnter(op) or
+             control_flow_util.IsLoopExit(op) or
+             TensorTracer.loop_cond_op(op) or
+             op.type in ('RefNextIteration', 'NextIteration'))
+
+  @staticmethod
+  def unsafe_op(op):
+    """Returns True if this op is not safe to be traced."""
+
+    if control_flow_util.IsInCond(op):
+      return True
+    # Reasons for not including following op types:
+    #    Assign: cause incorrect result with CPU tracing.
+    if op.type in ['Assign']:
+      return True
+    return False
+
+  @staticmethod
+  def device_mismatch(device_type, op):
+    if device_type == _DEVICE_TYPE_TPU:
+      # pylint: disable=protected-access
+      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
+      # pylint: enable=protected-access
+    return False
+
+  @staticmethod
+  def unsafe_scalar_trace(op):
+    """Return true if scalar output tensor from Op is not safe to be traced."""
+
+    # Tracing the following causes cycle in the graph on TPU.
+    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
+                   'Switch', 'Less', 'ReadVariableOp']:
+      return True
+    # Tracing the following will cause casting-issue
+    # with the norm tracing mode or other compilation issues on CPU.
+    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
+                   'IteratorGetNext', 'OneShotIterator',
+                   'IteratorV2', 'MakeIterator',
+                   'BatchDatasetV2', 'MapDataset',
+                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
+                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
+      return True
+    return False
+
+  @staticmethod
+  def less_interesting_op(op):
+    """Returns True if the given Op is not an interesting one to be traced."""
+
+    found, _ = TensorTracer.get_flag_value(
+        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
+    if found:
+      # users force to include all ops.
+      return False
+    # Following ops are highly unlikey to cause bugs.
+    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
+
+  @staticmethod
+  def reason(op_idx, details):
+    """Returns reason why the Op at op_idx is traced or not."""
+
+    return '%d %s'%(op_idx, details)
+
+  @staticmethod
+  def topological_sort(g):
+    """Performs topological sort on the given graph.
+
+    Args:
+       g: the graph.
+
+    Returns:
+       A pair where the first element indicates if the topological
+       sort succeeded (True if there is no cycle found; False if a
+       cycle is found) and the second element is either the sorted
+       list of nodes or the cycle of nodes found.
+    """
+    def _is_loop_edge(op):
+      """Returns true if the op is the end of a while-loop creating a cycle."""
+      return op.type in ['NextIteration']
+
+    def _in_op_degree(op):
+      """Returns the number of incoming edges to the given op.
+
+      The edge calculation skips the edges that come from 'NextIteration' ops.
+      NextIteration creates a cycle in the graph. We break cycles by treating
+      this op as 'sink' and ignoring all outgoing edges from it.
+      Args:
+        op: Tf.Operation
+      Returns:
+        the number of incoming edges.
+      """
+      count = 0
+      for op in op.control_inputs + [in_tensor.op for in_tensor in op.inputs]:
+        if not _is_loop_edge(op):
+          count += 1
+      return count
+
+    sorted_ops = []
+    op_in_degree = {op: _in_op_degree(op) for op in g.get_operations()}
+
+    frontier = [op for (op, degree) in op_in_degree.items() if degree == 0]
+    while frontier:
+      op = frontier.pop()
+      # Remove the op from graph, and remove its outgoing edges.
+      sorted_ops.append(op)
+      if _is_loop_edge(op):
+        continue
+      # pylint: disable=protected-access
+      consumers = list(op._control_outputs)
+      # pylint: enable=protected-access
+      for out_tensor in op.outputs:
+        consumers += [consumer_op for consumer_op in out_tensor.consumers()]
+
+      for consumer in consumers:
+        # For each deleted edge shift the bucket of the vertex.
+        op_in_degree[consumer] -= 1
+        if op_in_degree[consumer] == 0:
+          frontier.append(consumer)
+        if op_in_degree[consumer] < 0:
+          raise ValueError('consumer:%s degree mismatch'%consumer.name)
+
+    left_ops = set([op for (op, degree) in op_in_degree.items() if degree > 0])
+    if left_ops:
+      return (False, left_ops)
+    else:
+      assert len(g.get_operations()) == len(sorted_ops)
+      return (True, sorted_ops)
+
+  @staticmethod
+  def _make_op_and_tensor_maps(op_list):
+    """Creates various maps and lists from op_list.
+
+    Args:
+       op_list: a list of Ops
+
+    Returns:
+       opname_idx_map: a map from Op's name to its index in op_list.
+       tensor_list: a list of output tensors of the Ops in op_list.
+       tensorname_idx_map: a map from output tensor name to its index
+                           in tensor_list.
+    """
+
+    opname_idx_map = {}
+    tensor_list = []
+    tensorname_idx_map = {}
+    for op_id, op in enumerate(op_list):
+      if op.name in opname_idx_map:
+        raise ValueError('Duplicated Op name: %s'%op.name)
+      opname_idx_map[op.name] = op_id
+      for output_tensor in op.outputs:
+        if output_tensor.name not in tensorname_idx_map:
+          tensor_list.append(output_tensor)
+          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
+    return (opname_idx_map, tensor_list, tensorname_idx_map)
+
+  def __init__(self):
+    """Initializes a TensorTracer.
+
+    Sets the various member fields from the flags (if given) or the defaults.
+    """
+    self._version = 'use-outside-compilation'
+    self._device_type = None
+    TensorTracer.validate_flag_names()
+    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not found or not self._trace_mode:
+      self._trace_mode = _TRACE_MODE_NAN_INF
+    TensorTracer.check_trace_mode(self._trace_mode)
+    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
+    if not found or not self._submode:
+      self._submode = _SUBMODE_DETAILED
+    TensorTracer.check_submode(self._submode)
+    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
+    self._instrument_records = {}
+    self._set_trace_dir()
+    self._set_report_file()
+    self._set_op_range()
+    self._set_excluded_opnames()
+    self._set_excluded_optypes()
+    self._set_included_opnames()
+    self._set_included_optypes()
+    self._num_replicas = None
+    self._num_replicas_per_host = None
+    self._num_hosts = None
+    self._replica_id = None
+    _, self._graph_dump_path = TensorTracer.get_flag_value(
+        _FLAG_DUMP_BEFORE_AFTER_GRAPHS)
+
+  def _add_replica_id_to_graph(self):
+    """Adds nodes for computing the replica ID to the graph."""
+
+    if self._num_replicas:
+      with ops.control_dependencies(None):
+        # Uses None as dependency to run outside of TPU graph rewrites.
+        self._replica_id = tpu_ops.tpu_replicated_input(
+            list(range(self._num_replicas)),
+            name='tt_replica_id')
+    else:
+      self._replica_id = 'unknown'
+
+  def _set_trace_dir(self):
+    found, self._trace_dir = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_DIR)
+    if found and self._trace_dir \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      raise ValueError('Cannot not use --%s and --%s at the same time'
+                       %(_FLAG_NAME_TRACE_DIR,
+                         _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR))
+    if TensorTracer.use_test_undeclared_outputs_dir():
+      self._trace_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+
+  def _set_report_file(self):
+    """Sets the path of the output report file."""
+
+    found, self._report_file_path = TensorTracer.get_flag_value(
+        _FLAG_NAME_REPORT_FILE)
+    if found and self._report_file_path \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._report_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'report_file_path cannot be an absolute path (%s)'
+                         %self._report_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._report_file_path = os.path.join(outputs_dir,
+                                            self._report_file_path)
+    if not self._report_file_path:
+      self._report_file = None
+      return
+    try:
+      self._report_file = gfile.Open(self._report_file_path, 'w')
+    except IOError as e:
+      raise e
+
+  def _close_report_file(self):
+    if self._report_file:
+      self._report_file.close()
+
+  def _set_op_range(self):
+    """Sets the index range of the Ops that we will consider tracing."""
+
+    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not found or not op_range:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    match = _OP_RANGE_PAT.match(op_range)
+    if not match:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    self._op_range = (int(match.group(1)), int(match.group(2)))
+
+  def _inside_op_range(self, idx):
+    """Return True if the given index is inside the selected range."""
+
+    if idx < self._op_range[0]:
+      return False
+    return self._op_range[1] < 0 or idx <= self._op_range[1]
+
+  def _set_excluded_opnames(self):
+    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPNAMES)
+
+  def _set_excluded_optypes(self):
+    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPTYPES)
+
+  def _set_included_opnames(self):
+    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPNAMES)
+
+  def _set_included_optypes(self):
+    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPTYPES)
+
+  def _is_user_included_op(self, op):
+    for opname_re in self._included_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._included_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _is_user_excluded_op(self, op):
+    for opname_re in self._excluded_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._excluded_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _use_tensor_values_cache(self):
+    """Returns True if immediate tensors should be first saved to a cache."""
+
+    if self._trace_mode not in set([_TRACE_MODE_NAN_INF,
+                                    _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS]):
+      return False
+    if self._trace_dir and _trace_files_need_precreated(self._trace_dir):
+      return True
+    if TensorTracer.use_compact_trace():
+      return True
+    return False
+
+  def _save_tensor_value_to_cache_op(self, graph, cache_idx, updates):
+    """Returns an Op that will save the given updates to an entry in the cache."""
+
+    cache = _get_tensor_values_cache(graph)
+    indices = constant_op.constant([cache_idx])
+    return state_ops.scatter_update(cache, indices, updates).op
+
+  def _write_report(self, content):
+    """Writes the given content to the report."""
+
+    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
+    if self._report_file:
+      self._report_file.write(line)
+    else:
+      logging.info(line)
+
+  def _write_config_section(self):
+    """Writes the config section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
+    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
+    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
+                                  self._num_replicas_per_host))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, self._num_hosts))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
+
+  def _write_reason_section(self):
+    """Writes the reason section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
+    for key in sorted(self._instrument_records):
+      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
+
+  def _write_op_list_section(self, op_list):
+    """Writes the Op-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
+    for i in range(0, len(op_list)):
+      op = op_list[i]
+      line = '%d "%s" %s'%(i, op.name, op.type)
+      for out_tensor in op.outputs:
+        if out_tensor.name not in self._tensorname_idx_map:
+          raise ValueError(
+              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
+        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
+
+  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
+    """Writes the tensor-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_TENSOR_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
+    for i in range(0, len(tensor_list)):
+      tensor = tensor_list[i]
+      line = '%d "%s"'%(i, tensor.name)
+      for consumer_op in tensor.consumers():
+        if consumer_op.name not in opname_idx_map:
+          raise ValueError(
+              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
+        line += ' %d'%opname_idx_map[consumer_op.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_TENSOR_LIST))
+
+  def _write_cache_index_map_section(self):
+    """Writes the mapping from cache index to tensor index to the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_CACHE_INDICES,
+                                  len(self._cache_idx_to_tensor_idx)))
+    for cache_idx in range(0, len(self._cache_idx_to_tensor_idx)):
+      tensor_idx = self._cache_idx_to_tensor_idx[cache_idx]
+      line = '%d %d\n'%(cache_idx, tensor_idx)
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+
+  def _write_graph_section(self, succeed, sorted_or_cycle):
+    """Writes the graph section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
+                                  succeed))
+    l = list(sorted_or_cycle)
+    for i in range(0, len(l)):
+      self._write_report('%d "%s"\n'%(i, l[i].name))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
+
+  def _preprocess_traced_tensor(self, tensor):
+    """Computes NAN/Norm/Max on TPUs before sending to CPU.
+
+    Args:
+      tensor: The tensor to be traced.
+    Returns:
+      A tensor that should be input to the trace_function.
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        mask = math_ops.reduce_any(
+            gen_math_ops.logical_or(
+                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
+        output_tensor = control_flow_ops.cond(mask,
+                                              lambda: constant_op.constant(1.0),
+                                              lambda: constant_op.constant(0.0))
+      else:
+        output_tensor = constant_op.constant(0.0)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = linalg_ops.norm(tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_max_abs(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
+      zero = constant_op.constant(0, dtypes.float32)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf(tensor)
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm(tensor)
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs(tensor)
+    raise RuntimeError(
+        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
+
+  def _make_tensor_trace_fun(self, tensor_name):
+    """Makes the tensor tracing function called by outside compilation.
+
+    Args:
+      tensor_name: name of the tensor being traced.
+
+    Returns:
+      A function to be passed as the first argument to outside compilation.
+
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
+      """Prints a tensor value to a file.
+
+      Args:
+        tensor_name: name of the tensor being traced.
+        num_elements: number of elements to print (-1 means print all).
+        tensor: the tensor needs to be returned.
+        output_tensor: the tensor needs to be printed.
+
+      Returns:
+        The same tensor passed via the "tensor" argument.
+
+      Raises:
+        ValueError: If tensor_name is not already in
+                    self._tensorname_idx_map.
+      """
+
+      if self._submode == _SUBMODE_BRIEF:
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        msg = '%d'%self._tensorname_idx_map[tensor_name]
+      else:
+        msg = '"%s"'%tensor_name
+
+      if self._trace_dir:
+        output_path = os.path.join(self._trace_dir, _TRACE_FILE_NAME)
+        output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+      else:
+        output_stream = sys.stderr
+      return logging_ops.print_v2(msg, array_ops.shape(output_tensor),
+                                  '@', self._replica_id,
+                                  '\n', output_tensor, '\n',
+                                  summarize=num_elements,
+                                  output_stream=output_stream)
+
+    def _show_part_tensor(tensor):
+      """Trace function for printing part of the tensor."""
+
+      return _print_tensor(tensor_name, self._part_tensor_size,
+                           tensor, tensor)
+
+    def _show_full_tensor(tensor):
+      """Trace function for printing the entire tensor."""
+
+      return _print_tensor(tensor_name, -1, tensor, tensor)
+
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return _show_part_tensor
+    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
+    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
+    # performed within TPUs and only their results are transferred to CPU.
+    # Simply, print the full tensor for these trace modes.
+    if self._trace_mode in [
+        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
+        _TRACE_MODE_MAX_ABS
+    ]:
+      return _show_full_tensor
+
+    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
+                       %self._trace_mode)
+
+  def _skip_op(self, op_id, op, user_included, user_excluded,
+               in_exec_path=True):
+    """Returns True if we should not trace Op."""
+
+    if TensorTracer.while_loop_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_WHILELOOP_OP)
+      return True
+    if TensorTracer.unsafe_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_UNSAFE_OP)
+      return True
+    if TensorTracer.device_mismatch(self._device_type, op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_DEVICE_MISMATCH)
+      return True
+    if not in_exec_path:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_NOT_EXECUTED)
+      return True
+
+    if not self._inside_op_range(op_id):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_OUTSIDE_OP_RANGE)
+      return True
+    if TensorTracer.less_interesting_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_LESS_INTERESTING_OP)
+      return True
+    if user_included:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    return False
+
+  def _skip_tensor(self, op_id, out_tensor, user_included,
+                   user_excluded):
+    """Returns True if we should not trace out_tensor."""
+
+    # Skips a tensor if the tensor has a non-numeric type.
+    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
+    #         because it also excludes tensors with dtypes, bool, and
+    #         float32_ref, which we actually want to trace.
+    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
+                                    dtypes.string])
+    if out_tensor.dtype in non_numeric_tensor_types:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_NON_NUMERIC_TENSOR)
+      return True
+    # Skip a tensor if it feeds a special while loop op.
+    if [consumer for consumer in out_tensor.consumers() if
+        TensorTracer.while_loop_op(consumer)]:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_FEEDS_WHILELOOP_OP)
+      return True
+    if user_included:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    if not out_tensor.get_shape().is_fully_defined():
+      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
+      # to a scalar before the outside compilation call.
+      if self._trace_mode in [
+          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
+      ]:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_TENSOR_GET_TRACED)
+        return False
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_DYNAMIC_SHAPE)
+        return True
+    rank = len(out_tensor.shape)
+    if rank < 1:
+      # scalar
+      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_UNSAFE_SCALAR)
+        return True
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_SCALAR_GET_TRACED)
+        return False
+    else:
+      # tensor
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_TENSOR_GET_TRACED)
+      return False
+
+  def _filter_execution_path_operations(self, operations, fetches):
+    """Returns the set of ops in the execution path to compute given fetches."""
+
+    # If no fetch provided, then return all operations.
+    if fetches is None:
+      return set(operations)
+    # Convert to list, if a single element is provided.
+    if not isinstance(fetches, (list, tuple)):
+      fetches = [fetches]
+    # If a tensor is given as fetch, convert it to op.
+    op_fetches = []
+    for fetch in fetches:
+      if isinstance(fetch, ops.Operation):
+        op_fetches.append(fetch)
+      elif isinstance(fetch, ops.Tensor):
+        op_fetches.append(fetch.op)
+      else:
+        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
+                           %fetch)
+
+    execution_path_operations = set(op_fetches)
+    traverse_stack = list(op_fetches)
+    while True:
+      if not traverse_stack:
+        break
+      head_op = traverse_stack.pop()
+      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
+      input_ops.extend(head_op.control_inputs)
+
+      for input_op in input_ops:
+        if input_op not in execution_path_operations:
+          # Filter out loop condition operations, tracing them causes a cycle.
+          # Trace only the loop-body.
+          if TensorTracer.loop_cond_op(input_op):
+            continue
+          execution_path_operations.add(input_op)
+          traverse_stack.append(input_op)
+    return execution_path_operations
+
+  def _determine_traced_tensors(self, graph, ops_in_exec_path):
+    """Determines the tensors that will be traced."""
+
+    self._traced_tensorname_to_cache_idx_map = {}
+    self._cache_idx_to_tensor_idx = []
+    operations = graph.get_operations()
+    checkpoint_operations = self._get_checkpoints(graph)
+    for op_id, op in enumerate(operations):
+      if checkpoint_operations and op.name not in checkpoint_operations:
+        continue
+      user_included = self._is_user_included_op(op)
+      user_excluded = self._is_user_excluded_op(op)
+      in_exec_path = op in ops_in_exec_path
+      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if self._skip_tensor(op_id, out_tensor, user_included,
+                             user_excluded):
+          continue
+        tensor_name = out_tensor.name
+        if tensor_name in self._traced_tensorname_to_cache_idx_map:
+          raise ValueError(
+              'Tensor name %s should not be already in '
+              'traced_tensorname_to_cache_idx_map'%tensor_name)
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        tensor_idx = self._tensorname_idx_map[tensor_name]
+        cache_idx = len(self._traced_tensorname_to_cache_idx_map)
+        self._traced_tensorname_to_cache_idx_map[tensor_name] = cache_idx
+        self._cache_idx_to_tensor_idx.append(tensor_idx)
+        if len(self._traced_tensorname_to_cache_idx_map) != len(
+            self._cache_idx_to_tensor_idx):
+          raise RuntimeError('len(self._traced_tensorname_to_cache_idx_map) != '
+                             'len(self._cache_idx_to_tensor_idx')
+
+  def _check_trace_files(self):
+    """Checks if any requirements for trace files are satisfied."""
+
+    if not self._trace_dir:
+      # traces will be written to stderr. No need to check trace files.
+      return
+    if _trace_files_need_precreated(self._trace_dir):
+      for replica_id in range(0, self._num_replicas):
+        trace_file_path = os.path.join(
+            self._trace_dir,
+            _COMPACT_TRACE_FILE_PREFIX) + '%d'%replica_id
+        if not gfile.Exists(trace_file_path):
+          raise RuntimeError(
+              '%s must be pre-created with the '
+              'appropriate properties.'%trace_file_path)
+    else:
+      if not gfile.Exists(self._trace_dir):
+        gfile.MkDir(self._trace_dir)
+        if not gfile.Exists(self._trace_dir):
+          raise RuntimeError('Failed to create %s'%self._trace_dir)
+
+  def _pre_tracing(self, graph, fetches):
+    """Work needs to be done prior to TPU or CPU tracing."""
+
+    self._check_trace_files()
+    operations = graph.get_operations()
+    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
+        TensorTracer._make_op_and_tensor_maps(operations))
+    self._write_config_section()
+    self._write_op_list_section(operations)
+    self._write_tensor_list_section(tensor_list, opname_idx_map)
+    # Filter out the operations that won't be executed.
+    # if fetches=None, then ops_in_exec_path = set(operations)
+    ops_in_exec_path = self._filter_execution_path_operations(operations,
+                                                              fetches)
+    self._determine_traced_tensors(graph, ops_in_exec_path)
+    self._write_cache_index_map_section()
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    if self._use_tensor_values_cache():
+      _create_tensor_values_cache(graph,
+                                  len(self._cache_idx_to_tensor_idx))
+    return (ops_in_exec_path, succeed, sorted_or_cycle)
+
+  def _post_tracing(self, succeed, sorted_or_cycle):
+    """Work needs to be done after TPU or CPU tracing."""
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+    self._close_report_file()
+
+  def _get_checkpoints(self, graph):
+    """Returns the list of Ops that produce the tensors traced with API.
+
+    Args:
+      graph: the graph of Ops.
+
+    Returns:
+      A set of operation names which should be traced.
+    """
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    checkpoint_operations = set()
+    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
+    for (tensor, checkpoint_name) in tensor_tracer_variables:
+      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
+      checkpoint_operations.add(tensor.op.name)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    return checkpoint_operations
+
+  def _generate_flush_cache_op(self, graph, start_replica, on_tpu):
+    """Generates an Op that will flush the cache to file.
+
+    Args:
+      graph: the graph of Ops
+      start_replica: the ID of the first replica being flushed by this Op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      The Op to flush the cache to file.
+    """
+    def _make_flush_fun(replica_id):
+      """Makes a function for flushing the cache for the given replica."""
+
+      def _fun():
+        """A function that flushes the cache to a file."""
+
+        def _flush_fun(cache):
+          """Flushes the cache to a file."""
+
+          if isinstance(replica_id, str):
+            replica_id_str = replica_id
+          else:
+            replica_id_str = '%d'%replica_id
+          if self._trace_dir:
+            output_path = os.path.join(self._trace_dir,
+                                       _COMPACT_TRACE_FILE_PREFIX) \
+                                       + replica_id_str
+            output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+          else:
+            output_stream = sys.stderr
+          new_step_line = _REPLICA_ID_TAG + replica_id_str
+          print_op = logging_ops.print_v2(
+              new_step_line, '\n',
+              cache, '\n',
+              summarize=-1,
+              output_stream=output_stream)
+          with ops.control_dependencies([print_op]):
+            return constant_op.constant(0).op
+
+        cache = _get_tensor_values_cache(graph)
+        if on_tpu:
+          flush_op = tpu.outside_compilation(_flush_fun, cache.value())
+        else:
+          flush_op = _flush_fun(cache.value())
+        with ops.control_dependencies([flush_op]):
+          reset_value = constant_op.constant(_COMPACT_TRACE_ENTRY_INIT_VALUE,
+                                             dtype=cache.dtype,
+                                             shape=cache.shape)
+          assign_op = state_ops.assign(cache, reset_value).op
+          with ops.control_dependencies([assign_op]):
+            return flush_op.outputs[0]
+
+      return _fun
+
+    def _f(replica_id):
+      return _make_flush_fun(replica_id)
+    def _eq(x):
+      return math_ops.equal(x, self._replica_id)
+    def _do_nothing():
+      return constant_op.constant(0)
+
+    return control_flow_ops.case({\
+                                  _eq(start_replica): _f(start_replica), \
+                                  _eq(start_replica+1): _f(start_replica+1), \
+                                  _eq(start_replica+2): _f(start_replica+2), \
+                                  _eq(start_replica+3): _f(start_replica+3), \
+                                  _eq(start_replica+4): _f(start_replica+4), \
+                                  _eq(start_replica+5): _f(start_replica+5), \
+                                  _eq(start_replica+6): _f(start_replica+6), \
+                                  _eq(start_replica+7): _f(start_replica+7), \
+    },
+                                 default=_do_nothing,
+                                 exclusive=True).op
+
+  def _flush_tensor_values_cache(self, graph, tensor_fetches, op_fetches,
+                                 on_tpu):
+    """Flushes the intermediate tensor values in the graph to the cache.
+
+    Args:
+      graph: the graph of Ops
+      tensor_fetches: list of tensor results returned by the model_fn.
+      op_fetches: list of ops that are returned by the model_fn, e.g., train_op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      An identical copy of tensor_fetches.
+    """
+    # Add a dependency to op and tensor fetches to make sure that all tracing
+    # ops are executed before flushing trace results.
+    with ops.control_dependencies(op_fetches +
+                                  [tensor.op for tensor in tensor_fetches]):
+      flush_cache_op_list = []
+      for host in range(self._num_hosts):
+        start_replica = host * 8
+        flush_op = self._generate_flush_cache_op(graph, start_replica, on_tpu)
+        flush_cache_op_list.append(flush_op)
+      return control_flow_ops.tuple(tensor_fetches,
+                                    control_inputs=flush_cache_op_list)
+
+  def _process_tensor_fetches(self, tensor_fetches):
+    """Check that tensor_fetches is not empty and have valid tensors."""
+    # If none or empty list.
+    if tensor_fetches is None:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'None.')
+    if not isinstance(tensor_fetches, (list, tuple)):
+      tensor_fetches = [tensor_fetches]
+    elif not tensor_fetches:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'empty list.')
+    fetches = []
+    for fetch in tensor_fetches:
+      if isinstance(fetch, ops.Tensor):
+        fetches.append(fetch)
+      else:
+        raise RuntimeError('Given tensor_fetch:%s is not a tensor.' % fetch)
+    return fetches
+
+  def _process_op_fetches(self, op_fetches):
+    """Check that op_fetches have valid ops."""
+    if op_fetches is None:
+      return []
+
+    if not isinstance(op_fetches, (list, tuple)):
+      op_fetches = [op_fetches]
+
+    fetches = []
+    for fetch in op_fetches:
+      if isinstance(fetch, ops.Operation):
+        fetches.append(fetch)
+      else:
+        logging.warning('Ignoring the given op_fetch:%s, which is not an op.' %
+                        fetch)
+    return fetches
+
+  def _convert_fetches_to_input_format(self, input_fetches, current_fetches):
+    """Changes current_fetches' format, so that it matches input_fetches."""
+    if isinstance(input_fetches, ops.Tensor):
+      if len(current_fetches) != 1:
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      return current_fetches[0]
+    else:
+      if len(current_fetches) != len(current_fetches):
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      elif isinstance(input_fetches, tuple):
+        return tuple(current_fetches)
+      else:
+        return current_fetches
+
+  def _get_op_control_flow_context(self, op):
+    """Returns the control flow of the given op.
+
+    Args:
+      op: tf.Operation for which the control flow context is requested.
+    Returns:
+      op_control_flow_context: which the is control flow context of the given
+      op. If the operation type is LoopExit, returns the outer control flow
+      context.
+    """
+    # pylint: disable=protected-access
+    op_control_flow_context = op._control_flow_context
+    # pylint: enable=protected-access
+    if control_flow_util.IsLoopExit(op):
+      op_control_flow_context = op_control_flow_context.outer_context
+    return op_control_flow_context
+
+  def _trace_execution(self, graph,
+                       tensor_fetches,
+                       op_fetches=None,
+                       on_tpu=True):
+    """Commong tracing function for both CPU and TPUs.
+
+    The caller function should set _device_type, _num_replicas,
+    _num_replicas_per_host, _num_hosts and _replica_id before calling
+    _trace_execution.
+
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      on_tpu: True if executing on TPU.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+    def _cast_unsupported_dtypes(tensor):
+      """Casts tensor to a supported type."""
+
+      if tensor.dtype.__eq__(dtypes.int64):
+        # outside-compilation doesn't support int64 input yet.
+        return math_ops.cast(tensor, dtypes.int32)
+      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
+          dtypes.float16):
+        # Since host can't handle bf16, convert tensor to f32.
+        return math_ops.cast(tensor, dtypes.float32)
+      return tensor
+
+    TensorTracer.check_device_type(self._device_type)
+    # Check in_tensor_fetches, and op_fetches and convert them to lists.
+    processed_t_fetches = self._process_tensor_fetches(tensor_fetches)
+    op_fetches = self._process_op_fetches(op_fetches)
+    all_fetches = op_fetches + [tensor.op for tensor in processed_t_fetches]
+
+    # Filter the set of ops that will be executed, and topological sort.
+    (exec_op_set, succeed, sorted_or_cycle) = self._pre_tracing(graph,
+                                                                all_fetches)
+
+    tensor_fetch_set = set(processed_t_fetches)
+    tracing_ops = []
+
+    # pylint: disable=protected-access
+    current_control_flow_context = graph._get_control_flow_context()
+    # pylint: enable=protected-access
+
+    # Trace ops only if they are in the execution path.
+    for op in exec_op_set:
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        tensor_name = out_tensor.name
+        if tensor_name not in self._traced_tensorname_to_cache_idx_map:
+          continue
+        # Create the list of consumers before calling _preprocess_traced_tensor.
+        # Otherwise, adding control input below, will introduce a cycle in the
+        # graph.
+        consumers = out_tensor.consumers()
+        # Not all consumers may be in the exec path. Filter out the consumers
+        # to keep the graph simpler.
+        consumers = [cop for cop in consumers if cop in exec_op_set]
+
+        # If there is no consumer of the tensor, there is no need to trace it;
+        # unless the tensor itself is one of the fetches.
+        is_a_fetched_tensor = out_tensor in tensor_fetch_set
+        if (not consumers) and (not is_a_fetched_tensor):
+          continue
+
+        op_control_flow_context = self._get_op_control_flow_context(op)
+        # pylint: disable=protected-access
+        graph._set_control_flow_context(op_control_flow_context)
+        # pylint: enable=protected-access
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+
+        if on_tpu:
+          processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
+
+        if self._use_tensor_values_cache():
+          cache_idx = self._traced_tensorname_to_cache_idx_map[tensor_name]
+          trace_op = self._save_tensor_value_to_cache_op(graph,
+                                                         cache_idx,
+                                                         processed_out_tensor)
+        elif on_tpu:
+          trace_op = tpu.outside_compilation(
+              self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
+        else:
+          trace_fun = self._make_tensor_trace_fun(tensor_name)
+          trace_op = trace_fun(processed_out_tensor)
+
+        if is_a_fetched_tensor:
+          tracing_ops.append(trace_op)
+          continue
+        # Add it to all consumers, as some consumers may not be executed if they
+        # are in a control flow.
+        for consumer_op in consumers:
+          # pylint: disable=protected-access
+          consumer_op._add_control_input(trace_op)
+          # pylint: enable=protected-access
+
+    # pylint: disable=protected-access
+    graph._set_control_flow_context(current_control_flow_context)
+    # pylint: enable=protected-access
+    if tracing_ops:
+      # If we are tracing a fetched tensor, their dependency is stored in
+      # tracing_ops.
+      processed_t_fetches = control_flow_ops.tuple(processed_t_fetches,
+                                                   control_inputs=tracing_ops)
+    if self._use_tensor_values_cache():
+      processed_t_fetches = self._flush_tensor_values_cache(graph,
+                                                            processed_t_fetches,
+                                                            op_fetches,
+                                                            on_tpu=on_tpu)
+    self._post_tracing(succeed, sorted_or_cycle)
+    # processed_t_fetches is a list at this point. Convert it to the same
+    # format as given in tensor_fetches.
+    return self._convert_fetches_to_input_format(tensor_fetches,
+                                                 processed_t_fetches)
+
+  def trace_tpu(self, graph,
+                tensor_fetches,
+                op_fetches=None,
+                num_replicas=None,
+                num_replicas_per_host=None,
+                num_hosts=None):
+    """Traces the tensors generated by TPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      num_replicas: number of replicas used on the TPU.
+      num_replicas_per_host: number of replicas per TPU host.
+      num_hosts: total number of TPU hosts.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If num_replicas_per_host > 8.
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+    self._device_type = _DEVICE_TYPE_TPU
+    self._num_replicas = num_replicas
+    self._num_replicas_per_host = num_replicas_per_host
+    self._num_hosts = num_hosts
+    if self._num_replicas is not None:
+      if self._num_replicas_per_host is None:
+        self._num_replicas_per_host = 8
+      if self._num_hosts is None:
+        self._num_hosts = num_replicas // self._num_replicas_per_host + \
+            (num_replicas % self._num_replicas_per_host > 0)
+
+    if self._num_replicas_per_host > 8:
+      # Checks for the assumption in _generate_flush_cache_op().
+      raise RuntimeError('num_replicas_per_host (%d) is '
+                         'greater than 8'%self._num_replicas_per_host)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      self._add_replica_id_to_graph()
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=True)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+  def trace_cpu(self, graph, tensor_fetches, op_fetches=None):
+    """Traces the tensors generated by CPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the CPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+
+    self._device_type = _DEVICE_TYPE_CPU
+    self._num_replicas = 1
+    self._num_replicas_per_host = 1
+    self._num_hosts = 1
+    self._replica_id = 0
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=False)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+
diff --git a/tensorflow/python/tpu/topology.py b/tensorflow/python/tpu/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ee21e694d15d2e795b0b35289e1e116b9e76cf
--- /dev/null
+++ b/tensorflow/python/tpu/topology.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Defines the `Topology` class, that describes a TPU fabric topology."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.protobuf.tpu import topology_pb2
+
+
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
+class Topology(object):
+  """Describes a set of TPU devices.
+
+  Represents both the shape of the physical mesh, and the mapping between
+  TensorFlow TPU devices to physical mesh coordinates.
+  """
+
+  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
+    """Builds a Topology object.
+
+    If `serialized` is not `None`, the topology is parsed from `serialized` and
+    the other arguments are ignored. Otherwise, the topology is computed from
+    `mesh_shape` and `device_coordinates`.
+
+    Args:
+      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
+        serialized proto is parsed to discover the topology.
+      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
+        the shape of the TPU topology, in number of cores. Ignored if
+        `serialized` is not `None`.
+      device_coordinates: A rank 3 numpy array that describes the mapping from
+        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
+        if `serialized is not `None`.
+
+    Raises:
+      ValueError: If `serialized` does not describe a well-formed topology.
+      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
+        of 3 positive integers.
+      ValueError: If `serialized` is `None` and `device_coordinates` is not a
+        rank 3 numpy int32 array that describes a valid coordinate mapping.
+    """
+
+    self._serialized = serialized
+
+    if serialized:
+      self._parse_topology(serialized)
+    else:
+      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
+      self._device_coordinates = np.asarray(device_coordinates, np.int32)
+      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
+                         "entries; got {}".format(self._mesh_shape))
+
+      if (len(self._device_coordinates.shape) != 3 or
+          self._device_coordinates.shape[2] != len(self._mesh_shape)):
+        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
+                         "with minor dimension equal to the mesh shape rank")
+
+    self._topology_tasks, self._topology_devices = self._invert_topology()
+
+  def _parse_topology(self, serialized):
+    """Parses a serialized `TopologyProto` into `self`."""
+    proto = topology_pb2.TopologyProto()
+    proto.ParseFromString(serialized)
+
+    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
+    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
+                       "entries; got {}".format(self._mesh_shape))
+
+    if proto.num_tasks < 0:
+      raise ValueError("`num_tasks` must be >= 0; got {}".format(
+          proto.num_tasks))
+    if proto.num_tpu_devices_per_task < 0:
+      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
+          proto.num_tpu_devices_per_task))
+
+    expected_coordinates_size = (
+        proto.num_tasks * proto.num_tpu_devices_per_task * len(
+            proto.mesh_shape))
+    if len(proto.device_coordinates) != expected_coordinates_size:
+      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
+                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
+                       "got shape {}".format(proto.num_tasks,
+                                             proto.num_tpu_devices_per_task,
+                                             proto.mesh_shape,
+                                             len(proto.device_coordinates)))
+
+    coords = np.array(proto.device_coordinates, dtype=np.int32)
+    if any(coords < 0):
+      raise ValueError("`device_coordinates` must be >= 0")
+    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
+                             len(proto.mesh_shape)))
+    self._device_coordinates = coords
+
+  def _invert_topology(self):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    for task in xrange(self.device_coordinates.shape[0]):
+      for device in xrange(self.device_coordinates.shape[1]):
+        x, y, z = self.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
+  @property
+  def mesh_shape(self):
+    """A rank 1 int32 array describing the shape of the TPU topology."""
+    return self._mesh_shape
+
+  @property
+  def mesh_rank(self):
+    """Returns the number of dimensions in the mesh."""
+    return len(self._mesh_shape)
+
+  @property
+  def device_coordinates(self):
+    """Describes the mapping from TPU devices to topology coordinates.
+
+    Returns:
+      A rank 3 int32 array with shape `[tasks, devices, axis]`.
+      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+      of TPU devices per task, and `axis` is the number of axes in the TPU
+      cluster topology. Each entry gives the `axis`-th coordinate in the
+      topology of a task/device pair. TPU topologies are 3-dimensional, with
+      dimensions `(x, y, core number)`.
+    """
+    return self._device_coordinates
+
+  def task_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow task number attached to `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow task number that contains the TPU device with those
+      physical coordinates.
+    """
+    return self._topology_tasks[tuple(device_coordinates)]
+
+  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow device number at `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow device number within the task corresponding to
+      attached to the device with those physical coordinates.
+    """
+    return self._topology_devices[tuple(device_coordinates)]
+
+  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the CPU device attached to a logical core."""
+    return _tpu_host_device_name(
+        job, self._topology_tasks[tuple(device_coordinates)])
+
+  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    return _tpu_device_name(job,
+                            self._topology_tasks[tuple(device_coordinates)],
+                            self._topology_devices[tuple(device_coordinates)])
+
+  @property
+  def num_tasks(self):
+    """Returns the number of TensorFlow tasks in the TPU slice."""
+    return self._device_coordinates.shape[0]
+
+  @property
+  def num_tpus_per_task(self):
+    """Returns the number of TPU devices per task in the TPU slice."""
+    return self._device_coordinates.shape[1]
+
+  def serialized(self):
+    """Returns the serialized form of the topology."""
+    if self._serialized is None:
+      proto = topology_pb2.TopologyProto()
+      proto.mesh_shape[:] = list(self._mesh_shape)
+      proto.num_tasks = self._device_coordinates.shape[0]
+      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
+      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
+      self._serialized = proto.SerializeToString()
+
+    return self._serialized
diff --git a/tensorflow/contrib/tpu/python/tpu/topology_test.py b/tensorflow/python/tpu/topology_test.py
similarity index 96%
rename from tensorflow/contrib/tpu/python/tpu/topology_test.py
rename to tensorflow/python/tpu/topology_test.py
index fafe3254d84551d3d7ed8a9d3346849411714f97..9e1b7de859703f6859017d57cd73b4fbda7237b4 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology_test.py
+++ b/tensorflow/python/tpu/topology_test.py
@@ -19,9 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import topology
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import topology
 
 
 class TopologyTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..55273a5203e38941a08e87943bdcc53ef38a29c9
--- /dev/null
+++ b/tensorflow/python/tpu/tpu.py
@@ -0,0 +1,1608 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.protobuf.tpu import dynamic_padding_pb2 as dynamic_padding
+from tensorflow.python.compat import compat as api_compat
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+# Operations that indicate some error in the users graph, e.g. a placeholder
+# that's introduced outside of the infeed.
+_BLACKLISTED_OPS = set([
+    "Placeholder",
+])
+
+# XLA doesn't currently support reading of intermediate tensors, thus some ops
+# are not supported.
+_UNSUPPORTED_OPS = set([
+    "AudioSummary",
+    "AudioSummaryV2",
+    "HistogramSummary",
+    "ImageSummary",
+    "MergeSummary",
+    "Print",
+    "ScalarSummary",
+    "TensorSummary",
+    "TensorSummaryV2",
+    ])
+
+# Ops which can be safely pruned from XLA compile if they have no consumers.
+#  These ops should also have no inputs.
+_UNCONNECTED_OPS_TO_PRUNE = set(["Placeholder", "VarHandleOp"])
+
+_MAX_WARNING_LINES = 5
+
+_TPU_REPLICATE_ATTR = "_tpu_replicate"
+_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
+
+
+def _tpu_system_device_name(job):
+  """Returns the device name for the TPU_SYSTEM device of `job`."""
+  if job is None:
+    return "/device:TPU_SYSTEM:0"
+  else:
+    return "/job:%s/device:TPU_SYSTEM:0" % job
+
+
+def initialize_system(embedding_config=None, job=None):
+  """Initializes a distributed TPU system for use with TensorFlow.
+
+  Args:
+    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
+      describing the desired configuration of the hardware embedding lookup
+      tables. If embedding_config is None, no hardware embeddings can be used.
+    job: The job (the XXX in TensorFlow device specification /job:XXX) that
+      contains the TPU devices that will be initialized. If job=None it is
+      assumed there is only one job in the TensorFlow flock, and an error will
+      be returned if this assumption does not hold.
+  Returns:
+    A serialized `TopologyProto` that describes the TPU system. Note:
+      the topology must be evaluated using `Session.run` before it can be used.
+  """
+  config_string = ("" if embedding_config is None else
+                   embedding_config.SerializeToString())
+  with ops.device(_tpu_system_device_name(job)):
+    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
+
+
+def shutdown_system(job=None):
+  """Shuts down a running a distributed TPU system."""
+  with ops.device(_tpu_system_device_name(job)):
+    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
+  return shutdown_distributed_tpu
+
+
+def core(num):
+  """Returns the device name for a core in a replicated TPU computation.
+
+  Args:
+    num: the virtual core number within each replica to which operators should
+    be assigned.
+  Returns:
+    A device name, suitable for passing to `tf.device()`.
+  """
+  return "device:TPU_REPLICATED_CORE:{}".format(num)
+
+
+class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU computation.
+
+  The primary role of `TPUReplicateContext` is to mark operators inside a
+  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
+  is a unique name.
+
+  We use a `ControlFlowContext` to perform the annotation since it integrates
+  with Tensorflow constructs like ResourceVariables. For example, if a
+  `ResourceVariable` is constructed inside a tpu.replicate() block, the
+  `ResourceVariable` implementation can use
+  `with ops.control_dependencies(None)` to build the variable's definition
+  outside the replicated computation.
+  """
+
+  def __init__(self, name, num_replicas, pivot):
+    """Builds a new TPUReplicateContext.
+
+    Args:
+      name: a unique name for the context, used to populate the `_tpu_replicate`
+        attribute.
+      num_replicas: an integer that gives the number of replicas for the
+        computation.
+      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
+        inputs will have a control dependency on the pivot node. This ensures
+        that nodes are correctly included in any enclosing control flow
+        contexts.
+    """
+    super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
+    self._name = name
+    self._name_as_bytes = compat.as_bytes(name)
+    self._unsupported_ops = []
+    self._pivot = pivot
+    self._replicated_vars = {}
+
+  def get_replicated_var_handle(self, name, vars_):
+    """Returns a variable handle for replicated TPU variable 'var'.
+
+    This is a method used by an experimental replicated variable implementation
+    and is not intended as a public API.
+
+    Args:
+      name: The common name of the variable.
+      vars_: The replicated TPU variables.
+
+    Returns:
+      The handle of the TPU replicated input node.
+    """
+    handle = self._replicated_vars.get(name)
+    if handle is not None:
+      return handle
+
+    # Builds a TPUReplicatedInput node for the variable, if one does not already
+    # exist. The TPUReplicatedInput node must belong to the enclosing
+    # control-flow scope of the TPUReplicateContext.
+    # TODO(phawkins): consider changing the contract of the TPU encapsulation
+    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
+    # instead.
+
+    # pylint: disable=protected-access
+    graph = ops.get_default_graph()
+    saved_context = graph._get_control_flow_context()
+    graph._set_control_flow_context(self.outer_context)
+    handle = tpu_ops.tpu_replicated_input(
+        [v.handle for v in vars_], name=name + "/handle")
+    graph._set_control_flow_context(saved_context)
+    # pylint: enable=protected-access
+    self._replicated_vars[name] = handle
+    return handle
+
+  def report_unsupported_operations(self):
+    if self._unsupported_ops:
+      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
+                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
+      logging.warning("%d unsupported operations found: \n%s",
+                      len(self._unsupported_ops), op_str)
+      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
+        logging.warning("... and %d more" %
+                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
+
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an outside_compilation
+          # cluster C in a forward computation we would like to put the ops
+          # corresponding to the gradient of X into a new outside_compilation
+          # cluster C'. However, if we take the gradient of X twice, the second
+          # one should get yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is the
+          # cluster that X was in before we took gradients, and a 'gradient_uid'
+          # which is different for every invocation of gradients, and put the
+          # gradient of X in cluster 'root_cluster.gradient_uid'.
+          #
+          # When taking a gradient of a gradient, some ops will be colocated
+          # with Op in the forward pass (e.g., cluster root_cluster) and some in
+          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
+          # We need all of the grad-of-grad ops to be in the same cluster to
+          # avoid cyclic dependencies between clusters. We adopt a heuristic
+          # that puts any op clustered with root_cluster.<xxx> in
+          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          cluster = parts[0] + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the type and device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def type(self):
+        return "FakeOp"
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        if isinstance(device, pydev.DeviceSpec):
+          self._device = device.to_string()
+        else:
+          self._device = device
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      # pylint: disable=protected-access
+      self._outer_device_function_stack = graph._device_function_stack.copy()
+      # pylint: enable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
+  def _RemoveExternalControlEdges(self, op):
+    """Remove any external control dependency on this op."""
+    internal_control_inputs = []
+    external_control_inputs = []
+    for x in op.control_inputs:
+      # pylint: disable=protected-access
+      is_internal_op = False
+      ctxt = x._get_control_flow_context()
+      while ctxt is not None:
+        if ctxt == self:
+          is_internal_op = True
+          break
+        ctxt = ctxt._outer_context
+      if is_internal_op:
+        internal_control_inputs.append(x)
+      else:
+        external_control_inputs.append(x)
+      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    op._remove_all_control_inputs()
+    op._add_control_inputs(internal_control_inputs)
+    # pylint: enable=protected-access
+    return internal_control_inputs, external_control_inputs
+
+  def AddOp(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_OPS:
+      logging.error("Operation of type %s (%s) is not supported on the TPU. "
+                    "Execution will fail if this op is used in the graph. " %
+                    (op.type, op.name))
+
+    if op.type in _UNSUPPORTED_OPS:
+      self._unsupported_ops.append(op)
+
+    if any(x.dtype._is_ref_dtype for x in op.inputs):
+      raise NotImplementedError(
+          "Non-resource Variables are not supported inside TPU computations "
+          "(operator name: %s)" % op.name)
+    if _TPU_REPLICATE_ATTR in op.node_def.attr:
+      raise ValueError("TPU computations cannot be nested")
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
+
+    # Remove any control edges from outer control flow contexts. These may cause
+    # mismatched frame errors.
+    (internal_control_inputs,
+     external_control_inputs) = self._RemoveExternalControlEdges(op)
+
+    if not op.inputs:
+      # Add a control edge from the control pivot to this op.
+      if not internal_control_inputs:
+        # pylint: disable=protected-access
+        op._add_control_input(self.GetControlPivot())
+        # pylint: enable=protected-access
+    else:
+      for index in xrange(len(op.inputs)):
+        x = op.inputs[index]
+        real_x = self.AddValue(x)
+        if real_x != x:
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+
+    if external_control_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(phawkins): fix that.
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_control_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_control_inputs
+            if x.outputs
+        ]
+        self.Exit()
+      # pylint: disable=protected-access
+      op._add_control_inputs(external_control_inputs)
+      # pylint: enable=protected-access
+
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    context = self
+    while context is not None:
+      # pylint: disable=protected-access
+      context._values.update(output_names)
+      context = context._outer_context
+      # pylint: enable=protected-access
+
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    """Add `val` to the current context and its outer context recursively."""
+    if val.name in self._values:
+      # Use the real value if it comes from outer context.
+      result = self._external_values.get(val.name)
+      return val if result is None else result
+
+    result = val
+    self._values.add(val.name)
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+      self._values.add(result.name)
+
+    self._external_values[val.name] = result
+
+    return result
+
+  def AddInnerOp(self, op):
+    self.AddOp(op)
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  @property
+  def grad_state(self):
+    # Define the gradient loop state associated with the TPUReplicateContext to
+    # be None as the TPUReplicateContext does not get nested nor does the
+    # grad_state outside the TPUReplicateContext affect the graph inside so the
+    # grad_state should be as if this is the top-level gradient state.
+    return None
+
+  @property
+  def back_prop(self):
+    """Forwards to the enclosing while context, if any."""
+    if self.GetWhileContext():
+      return self.GetWhileContext().back_prop
+    return False
+
+  def GetControlPivot(self):
+    return self._pivot
+
+
+def outside_compilation(computation, *args, **kwargs):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    *args: the positional arguments for the computation.
+    **kwargs: the keyword arguments for the computation.
+
+  Returns:
+    The Tensors returned by computation.
+  """
+  args = [] if args is None else args
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args, **kwargs)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
+def replicate(computation,
+              inputs=None,
+              infeed_queue=None,
+              device_assignment=None,
+              name=None,
+              maximum_shapes=None):
+  """Builds a graph operator that runs a replicated TPU computation.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+  Returns:
+    A list of outputs, indexed by `[replica_num]` each output can be a nested
+    structure same as what computation() returns with a few exceptions.
+
+    Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  return split_compile_and_replicate(
+      computation,
+      inputs,
+      infeed_queue,
+      device_assignment,
+      name,
+      maximum_shapes=maximum_shapes)[1]
+
+
+def _pad_all_input(inputs, padded_shapes):
+  """Pad all input tensors given padded_shapes.
+
+  The real shape tensors will be concatenated with the padded original inputs.
+
+  Args:
+    inputs: The original inputs.
+    padded_shapes: A list of padded shapes for each input.
+
+  Returns:
+    The padded inputs and a PaddingMap list which maps the padded input
+    dimension to the real shape argument index.
+  """
+  input_shape_tensors = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    for idx, input_tensor in enumerate(inputs_per_core):
+      if core_idx == 0:
+        input_shape_tensors.append([])
+      input_shape_tensors[idx].append(array_ops.shape(input_tensor))
+
+  maximum_shapes = []
+  for shapes_per_input in input_shape_tensors:
+    maximum_shapes.append(
+        math_ops.reduce_max(array_ops.stack(shapes_per_input), axis=0))
+
+  padded_inputs = []
+  real_shapes = []
+  padding_maps = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    padded_inputs.append([])
+    real_shapes.append([])
+    real_shape_idx = len(inputs_per_core) - 1
+    for idx, input_tensor in enumerate(inputs_per_core):
+      input_shape_tensor = input_shape_tensors[idx][core_idx]
+      input_shape = input_tensor.get_shape()
+      padded_shape = padded_shapes[idx]
+
+      # The static shape of inputs should be compatible with the given padded
+      # shapes.
+      input_shape.assert_is_compatible_with(padded_shape)
+
+      if input_shape.is_fully_defined():
+        # Do nothing if the shape of the whole tensor is already static.
+        padded_inputs[core_idx].append(input_tensor)
+      else:
+        # Only pad the non static shape dimension.
+        for i, s in enumerate(input_shape):
+          if s.value is None:
+            if core_idx == 0:
+              real_shape_idx += 1
+              padding_map = dynamic_padding.PaddingMap()
+              padding_map.arg_index = idx
+              padding_map.shape_index = i
+              padding_map.padding_arg_index = real_shape_idx
+              padding_maps.append(padding_map)
+            real_shapes[core_idx].append(
+                math_ops.cast(input_shape_tensor[i], dtypes.uint32))
+
+        paddings = []
+        for i, s in enumerate(padded_shape):
+          if input_shape[i].value:
+            # Don't pad if input shape is already static.
+            padding = [0, 0]
+          else:
+            if s.value:
+              # Pad to the given maximum value.
+              padding = [0, s.value - input_shape_tensor[i]]
+            else:
+              # If maximum value is not given, then pad to the maximum dimension
+              # among all the cores.
+              padding = [0, maximum_shapes[idx][i] - input_shape_tensor[i]]
+          paddings.append(padding)
+
+        padded_input = array_ops.pad(input_tensor, paddings)
+        padded_inputs[core_idx].append(padded_input)
+
+  num_replicas = len(padded_inputs)
+  for i in range(num_replicas):
+    padded_inputs[i].extend(real_shapes[i])
+
+  return padded_inputs, padding_maps
+
+
+def split_compile_and_replicate(computation,
+                                inputs=None,
+                                infeed_queue=None,
+                                device_assignment=None,
+                                name=None,
+                                use_tpu=True,
+                                maximum_shapes=None):
+  """Builds graph operators that runs compilation and replicated computation.
+
+  This is a lower level interface than replicate that returns a separate compile
+  and execute output tensor. In the generated graph the compile op feeds into
+  the execute op and no additional compilation is incurred when running the
+  compile op before the execute op. The compile op returns additional
+  information about the compilation but does not return the compiled program.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
+      backends. Currently, only supports a default placement (computation is
+      placed on GPU if one is available, and on CPU if not).
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+
+  Returns:
+    A list of lists with the first list corresponding to the compile op and the
+    second a list of output tensors, indexed by `[replica_num][output_num]`.
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  del name
+  inputs = [[]] if inputs is None else inputs
+
+  metadata_kwargs = {}
+  if device_assignment is not None:
+    # Turn the Numpy array into a flattened list so we can pass it as an
+    # operator attribute.
+    metadata_kwargs = {
+        "topology":
+            device_assignment.topology.serialized(),
+        "device_assignment":
+            device_assignment.core_assignment.flatten().tolist()
+    }
+    # TODO(phawkins): remove this case after the forward compatibility window
+    # expires on 2018-10-5.
+    if api_compat.forward_compatible(2018, 10, 5):
+      metadata_kwargs["num_cores_per_replica"] = (
+          device_assignment.num_cores_per_replica)
+    else:
+      metadata_kwargs["computation_shape"] = [
+          device_assignment.num_cores_per_replica
+      ]
+
+  if ((not isinstance(inputs, list)) or
+      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
+    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
+
+  num_replicas = len(inputs)
+
+  # No replicas? Nothing to do.
+  if num_replicas == 0:
+    return []
+
+  # Checks all replicas have the same structure.
+  for i in xrange(1, num_replicas):
+    nest.assert_same_structure(inputs[0], inputs[i])
+
+  # Flatten inputs.
+  flat_inputs = [
+      nest.flatten(per_replica_input) for per_replica_input in inputs
+  ]
+  # Converts inputs to Tensors.
+  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
+
+  # Verifies that all replicas have matching numbers and types of inputs
+  flat_input_types = [x.dtype for x in flat_inputs[0]]
+  input_arity = len(inputs[0])
+  flat_input_arity = len(flat_input_types)
+  for i in range(num_replicas):
+    if len(inputs[i]) != input_arity:
+      raise ValueError("Replicas must have the same number of inputs. "
+                       "Replica 0 had {} inputs, replica {} had {} "
+                       "inputs.".format(input_arity, i, len(inputs[i])))
+
+    types = [x.dtype for x in flat_inputs[i]]
+    if types != flat_input_types:
+      raise ValueError("Replicas must have matching input types. Replica 0 had "
+                       "input types {}, replica {} had input types {}".format(
+                           flat_input_types, i, types))
+
+  arg_error = xla.check_function_argument_count(
+      computation, input_arity, infeed_queue)
+  if arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s, but the computation needs %s" % (
+              input_arity, str([i.name for i in inputs[0]]), arg_error))
+    else:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s and %d additional inputs from infeed,"
+          " but the computation needs %s" % (input_arity, str(
+              [i.name
+               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
+                                             arg_error))
+
+  if maximum_shapes:
+    if infeed_queue:
+      raise ValueError(
+          "Dynamic input shapes are not supported with infeed queues")
+
+    # Make sure maximum_shapes has the same structure as inputs.
+    nest.assert_same_structure(inputs[0], maximum_shapes, check_types=False)
+
+    # Flatten padded shapes.
+    flat_maximum_shapes = nest.flatten(maximum_shapes)
+    flat_maximum_shapes = [
+        tensor_shape.TensorShape(s) for s in flat_maximum_shapes
+    ]
+
+    flat_inputs, padding_maps = _pad_all_input(flat_inputs, flat_maximum_shapes)
+
+    serialized_padding_maps = []
+    for padding_map in padding_maps:
+      serialized_padding_maps.append(padding_map.SerializeToString())
+    metadata_kwargs["padding_map"] = serialized_padding_maps
+
+  metadata_kwargs["step_marker_location"] = getattr(
+      computation, "step_marker_location", "STEP_MARK_AT_ENTRY")
+
+  graph = ops.get_default_graph()
+
+  # Fan-in: Builds a TPUReplicatedInput node for each input.
+  flat_replicated_inputs = []
+  for i in range(0, len(flat_inputs[0])):
+    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
+    flat_replicated_inputs.append(
+        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+  cluster_name = graph.unique_name("cluster")
+  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
+  context = TPUReplicateContext(
+      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
+  try:
+    context.Enter()
+
+    metadata = tpu_ops.tpu_replicate_metadata(
+        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
+
+    with tpu_function.tpu_shard_context(
+        num_replicas), ops.control_dependencies([metadata]):
+
+      # Add identity ops so even unused inputs are "consumed" by the
+      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
+      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
+      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
+      flat_replicated_inputs = [
+          array_ops.identity(x, name="replicated_input_{}".format(i))
+          for i, x in enumerate(flat_replicated_inputs)
+      ]
+      for i in flat_replicated_inputs:
+        # pylint: disable=protected-access
+        # Add an attribute to the identity node so that they could be removed in
+        # encapsulate TPU computation pass if unused. However we don't remove
+        # inputs when dynamic padding is enabled.
+        # TODO(rxsang): Use other ways except argument index in padding_map so
+        # outside compilation can work with dynamic padding correctly.
+        if maximum_shapes is None:
+          i.op._set_attr("_tpu_input_identity",
+                         attr_value_pb2.AttrValue(b=True))
+        # pylint: enable=protected-access
+
+      # Unflatten the computation inputs to match original input structure.
+      computation_inputs = nest.pack_sequence_as(
+          structure=inputs[0],
+          flat_sequence=flat_replicated_inputs[:flat_input_arity])
+
+      # If there is an infeed queue, adds the dequeued values to the
+      # computation's inputs.
+      if infeed_queue is not None:
+        infeed_queue.set_number_of_shards(num_replicas)
+        for t in infeed_queue.generate_dequeue_op():
+          computation_inputs.append(t)
+
+      # Only resource variables work inside a TPU computation, so turn on
+      # resource variables for the computation.
+      # TODO(phawkins): consider removing this code. It will
+      # be less confusing to clients if they knowingly choose to use resource
+      # variables.
+      # Partitioned variables is not supported (b/112311320).
+      vscope = variable_scope.get_variable_scope()
+      saved_use_resource = vscope.use_resource
+      saved_custom_getter = vscope.custom_getter
+
+      def custom_getter(getter, name, *args, **kwargs):
+        """Variables on TPU have a few restrictions."""
+        partitioner = kwargs["partitioner"]
+        if partitioner is not None:
+          kwargs["partitioner"] = None
+          logging.warning(
+              "Partitioned variables are not supported on TPU. Got "
+              "`partitioner` that is {} for variable {}. "
+              "Setting `partitioner` to `None`."
+              .format(partitioner, name))
+        if saved_custom_getter is None:
+          return getter(name, *args, **kwargs)
+        else:
+          return saved_custom_getter(getter, name, *args, **kwargs)
+
+      vscope.set_use_resource(True)
+      vscope.set_custom_getter(custom_getter)
+
+      outputs = computation(*computation_inputs)
+
+      vscope.set_use_resource(saved_use_resource)
+      vscope.set_custom_getter(saved_custom_getter)
+
+    outputs_is_flat = xla.is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
+
+    # tensor_tracer imports tpu.py. Local import to tensor_tracer to avoid
+    # import-cycle
+    # pylint: disable=g-import-not-at-top
+    from tensorflow.python.tpu import tensor_tracer
+    # pylint: enable=g-import-not-at-top
+    if tensor_tracer.TensorTracer.is_enabled():
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, control_deps,
+                                    num_replicas)
+
+    context.ExitResult(output_tensors)
+  finally:
+    context.report_unsupported_operations()
+    context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
+
+  with ops.control_dependencies([metadata]):
+    if use_tpu:
+      compile_status = tpu_ops.tpu_compilation_result()
+      op = compile_status.op
+      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
+      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
+    else:
+      compile_status = control_flow_ops.no_op(name="compilation_status")
+
+  if not output_tensors:
+    # Returns a list of NoOps dependent on the replication Op, indexed by
+    # [replica_num].
+    return [
+        compile_status,
+        [
+            control_flow_ops.group(control_deps, name="shard_%d" % i)
+            for i in range(num_replicas)
+        ]
+    ]
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  replicated_outputs = [[] for i in xrange(num_replicas)]
+  for i, t in enumerate(output_tensors):
+    # Fan-out: Builds a TPUReplicatedOutput node for each output.
+    ys = tpu_ops.tpu_replicated_output(
+        t, num_replicas, name="output{}".format(i))
+
+    # Wraps the outputs in identity operators so the names of any possible
+    # `fetch` nodes are preserved by the replication rewrite.
+    with ops.control_dependencies(control_deps):
+      for replica in xrange(num_replicas):
+        replicated_outputs[replica].append(
+            array_ops.identity(
+                ys[replica], name="output_%d_shard_%d" % (i, replica)))
+
+  if not outputs_is_flat:
+    replicated_outputs = [
+        nest.pack_sequence_as(outputs, replica_outs)
+        for replica_outs in replicated_outputs
+    ]
+
+  return [compile_status, replicated_outputs]
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, makes it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that fetching any return value of this function
+  # will trigger TPUExecute node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    with ops.device(core(0)):
+      outputs = [
+          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+          for o in outputs
+      ]
+  except Exception as e:
+    raise ValueError(
+        "TPU function return values must all either be Operations or "
+        "convertible to Tensors. Got '%s'" % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        "TPU functions must return zero-or more Tensor values followed by "
+        "zero or more Operations.")
+
+  # Wraps outputs in Identity ops. Otherwise a replicated input copied
+  # straight to an output would bypass the replicate(). This would be bad
+  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+  # be rewritten away, leading to a runtime error.
+  # TODO(phawkins): extend the rewrite to elide these nodes instead.
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else core(0)):
+      o = array_ops.identity(t)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      new_output_tensors.append(o)
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+
+  # Flatten output items.
+  flat_outputs = nest.flatten(outputs)
+
+  # Convert all non-Operation outputs to Tensors.
+  for i, o in enumerate(flat_outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          "tpu.rewrite does not support Operation as return value in non-flat "
+          "output structure. You can set returned Operations as control "
+          "dependencies of returned Tensors so Operations are triggered when "
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          'convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    with ops.device(core(0)):
+      o = array_ops.identity(o)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      flat_outputs[i] = array_ops.identity(o)
+
+  # All flat_outputs are Tensors, and no Operations.
+  return flat_outputs, []
+
+
+def split_compile_and_shard(computation,
+                            inputs=None,
+                            num_shards=1,
+                            input_shard_axes=None,
+                            outputs_from_all_shards=True,
+                            output_shard_axes=None,
+                            infeed_queue=None,
+                            device_assignment=None,
+                            name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A tuple of (compile op, [output tensors]).
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  # TODO(phawkins): consider adding support for broadcasting Tensors passed as
+  # inputs.
+
+  if num_shards <= 0:
+    raise ValueError("num_shards must be a positive integer.")
+
+  inputs = [] if inputs is None else inputs
+  if not isinstance(inputs, list):
+    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
+
+  # Converts inputs to Tensors.
+  inputs = [ops.convert_to_tensor(x) for x in inputs]
+
+  if input_shard_axes is None:
+    input_shard_axes = [0] * len(inputs)
+  if len(inputs) != len(input_shard_axes):
+    raise ValueError("Length of input_shard_axes must be equal to the number "
+                     "of inputs.")
+
+  if inputs:
+    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
+    # lists with layout [input][shard]
+    split_inputs = [
+        array_ops.split(x, num_shards, axis=axis)
+        for (axis, x) in zip(input_shard_axes, inputs)]
+
+    # Transposes the input lists to have layout [shard][input]
+    transposed_inputs = [list(i) for i in zip(*split_inputs)]
+  else:
+    transposed_inputs = [[]] * num_shards
+
+  compile_op, outputs = split_compile_and_replicate(
+      computation,
+      transposed_inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+  # There must be at least one shard since num_shards > 0.
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  if isinstance(outputs[0], ops.Operation):
+    # pylint: enable=indexing-exception
+    # There were no outputs from the computation and replicate returned a list
+    # of NoOps with control dependencies on the computation. Return the first
+    # one so it can be used as a control dependency or fetch node.
+    # TODO(b/36647078) remove disable when pylint bug is fixed.
+    # pylint: disable=indexing-exception
+    return compile_op, [outputs[0]]
+    # pylint: enable=indexing-exception
+
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  num_outputs = len(outputs[0])
+  # pylint: enable=indexing-exception
+
+  if output_shard_axes is None:
+    output_shard_axes = [0] * num_outputs
+  if num_outputs != len(output_shard_axes):
+    raise ValueError("Length of output_shard_axes must be equal to the number "
+                     "of outputs.")
+
+  if isinstance(outputs_from_all_shards, bool):
+    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
+
+  if num_outputs != len(outputs_from_all_shards):
+    raise ValueError("Length of outputs_from_all_shards must be equal to the "
+                     "number of outputs.")
+
+  results = []
+  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
+                                   zip(*outputs)):
+    if all_shards:
+      # Concatenate all of the outputs together (use stack for scalars).
+      shape = x[0].shape
+      is_scalar = shape is not None and (shape.ndims == 0)
+      results.append((array_ops.stack(list(x)) if is_scalar
+                      else array_ops.concat(list(x), axis=axis)))
+    else:
+      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
+      results.append(x[0])
+
+  return compile_op, results
+
+
+def shard(computation,
+          inputs=None,
+          num_shards=1,
+          input_shard_axes=None,
+          outputs_from_all_shards=True,
+          output_shard_axes=None,
+          infeed_queue=None,
+          device_assignment=None,
+          name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  TODO(phawkins): consider adding support for broadcasting Tensors passed
+  as inputs.
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  return split_compile_and_shard(
+      computation,
+      inputs=inputs,
+      num_shards=num_shards,
+      input_shard_axes=input_shard_axes,
+      outputs_from_all_shards=outputs_from_all_shards,
+      output_shard_axes=output_shard_axes,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[1]
+
+
+def batch_parallel(computation,
+                   inputs=None,
+                   num_shards=1,
+                   infeed_queue=None,
+                   device_assignment=None,
+                   name=None):
+  """Shards `computation` along the batch dimension for parallel execution.
+
+  Convenience wrapper around shard().
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list).
+  Each input is split into `num_shards` pieces along the 0-th dimension, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  The outputs from all shards are concatenated back together along their 0-th
+  dimension.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). The
+      0-th dimension of each Tensor must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If `num_shards <= 0`
+  """
+  return shard(
+      computation,
+      inputs,
+      num_shards=num_shards,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+
+def rewrite(computation,
+            inputs=None,
+            infeed_queue=None,
+            device_assignment=None,
+            name=None):
+  """Rewrites `computation` for execution on a TPU system.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors.
+
+      `computation` may return a list of operations and tensors. Tensors must
+      come before operations in the returned list.  The return value of
+      `rewrite` is a list of tensors corresponding to the tensors from the
+      output of `computation`.
+
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+      Each input can be a nested structure containing values that are
+      convertible to tensors. Note that passing an N-dimension list of
+      compatible values will result in a N-dimention list of scalar tensors
+      rather than a single Rank-N tensors. If you need different behavior,
+      convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: (Deprecated) Does nothing.
+  Returns:
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+  """
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  return replicate(
+      computation,
+      None if inputs is None else [inputs],
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[0]
+  # pylint: enable=indexing-exception
+
+  # Operations that indicate some error in the user's inference graph.
+_BLACKLISTED_INFERENCE_OPS = set([
+    "ReadVariableOp",
+    "AssignVariableOp",
+    "AssignAddVariableOp",
+    "AssignSubVariableOp",
+    "VarHandleOp",
+    "Variable",
+    "VariableV2",
+])
+
+
+def under_tpu_inference_context():
+  """Check if it is currently under `tpu.rewrite_for_inference()`."""
+  graph = ops.get_default_graph()
+
+  context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while context:
+    if isinstance(context, _TPUInferenceContext):
+      return True
+    context = context.outer_context
+
+  return False
+
+
+class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU inference computation.
+
+  The primary role of `TPUReplicateContext` is to sanity check operators inside
+  a tpu.rewrite_for_inference() computation.
+  """
+
+  def __init__(self, name):
+    super(_TPUInferenceContext, self).__init__()
+    self._name = name
+
+  def AddOp(self, op):
+    self._AddOpInternal(op)
+
+  def _AddOpInternal(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_INFERENCE_OPS:
+      raise NotImplementedError(
+          "Operation of type %s (%s) is not supported on the TPU for inference."
+          " Execution will fail if this op is used in the graph. Make sure your"
+          " variables are using variable_scope." % (op.type, op.name))
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    result = val
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+    return result
+
+  def AddInnerOp(self, op):
+    self._AddOpInternal(op)
+
+  @property
+  def grad_state(self):
+    return None
+
+
+def validate_inference_rewrite_for_variables(graph):
+  """Validates whether rewrite_for_inference() 'worked' for variables.
+
+     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
+     after ReadVariableOps, but this mechanism works only if you are using
+     tf.get_variable() to create and access variables in your tpu computation.
+     This validation method can be called immediately after calling
+     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
+     to the graph.
+
+     Typical usages:
+       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
+
+       tpu.validate_inference_rewrite_for_variables(sess.graph)
+
+  Args:
+    graph: The graph which needs to be validated.
+  Raises:
+    RuntimeError: if validation failed.
+  """
+  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
+    raise RuntimeError(
+        "No GuaranteeConst ops found in the graph after running "
+        "tpu.rewrite_for_inference(...). Please check that you are using "
+        "tf.get_variable() to create and access variables in your tpu "
+        "computation.")
+
+
+def rewrite_for_inference(computation,
+                          inputs=None,
+                          infeed_queue=None,
+                          device_assignment=None,
+                          name=None):
+  """Rewrites `computation` for inference on a TPU system.
+
+     Other than 'rewriting' the computation to run on a TPU, if using variables
+     in your computation, it moves the ReadVariableOps outside the TPU
+     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
+     This mechanism works only if you are using tf.get_variable() to create and
+     access variables in your tpu computation. You can validate whether this
+     worked, by calling validate_inference_rewrite_for_variables() method
+     immediately after this method to check whether GuaranteeConstOps where
+     added to the graph.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors. If the function returns m outputs, rewrite will return a list of
+      m tensors.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: The name of the operator.
+  Returns:
+    A list of output tensors.
+  """
+
+  def guarantee_const_getter(getter, name, *args, **kwargs):
+    with ops.control_dependencies(None):
+      return array_ops.guarantee_const(
+          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
+
+  def wrapped_computation(*args, **kwargs):
+    """Execute computation under `_TPUInferenceContext`."""
+    context = _TPUInferenceContext(
+        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
+    try:
+      context.Enter()
+
+      vscope = variable_scope.get_variable_scope()
+      prev_custom_getter = vscope.custom_getter
+      prev_caching_device = vscope.caching_device
+      vscope.set_custom_getter(guarantee_const_getter)
+      vscope.set_caching_device(lambda op: op.device)
+
+      result = computation(*args, **kwargs)
+
+      vscope.set_custom_getter(prev_custom_getter)
+      vscope.set_caching_device(prev_caching_device)
+    finally:
+      context.Exit()
+    return result
+
+  # pylint: disable=undefined-variable
+  return rewrite(
+      wrapped_computation,
+      inputs=inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+  # pylint: enable=undefined-variable
+
+
+def prune_unconnected_ops_from_xla(prune_graph):
+  """Prunes unconnected ops as listed in _UNCONNECTED_OPS_TO_PRUNE.
+
+  Args:
+    prune_graph: A tensorflow graph from which we wish to prune unconnected ops
+      as listed in _UNCONNECTED_OPS_TO_PRUNE.  In general, these ops should have
+      no inputs and no consumers. These can often be left behind due to graph
+      construction rewiring (for instance TF-Hub). While they never execute,
+      they will cause XLA compile to fail so we strip them from XLA compile by
+      removing the tpu_replicate attribute.
+  """
+  # Scan over the top level graph and all function graphs.
+  for graph in [prune_graph] + list(prune_graph._functions.values()):  # pylint: disable=protected-access
+    for op in graph.get_operations():
+      if op.type not in _UNCONNECTED_OPS_TO_PRUNE:
+        continue
+      outputs_consumed = False
+      for output in op.outputs:
+        if output.consumers():
+          outputs_consumed = True
+          break
+      if not outputs_consumed:
+        logging.info(
+            "Pruning OP %s of type %s from XLA Compile due to "
+            "it being disconnected.", op.name, op.type)
+        op._clear_attr(_TPU_REPLICATE_ATTR)  # pylint: disable=protected-access
diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/tpu/tpu_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..20946d7aa666dd1818d841157624d53bb09f1bb8
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_config.py
@@ -0,0 +1,293 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""A RunConfig subclass with TPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import os
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import util as util_lib
+
+# pylint: disable=protected-access
+_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
+_SERVICE_KEY = run_config_lib._SERVICE_KEY
+_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
+# pylint: enable=protected-access
+
+
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+  BROADCAST = 4
+  SLICED = 5
+
+
+class TPUConfig(
+    collections.namedtuple('TPUConfig', [
+        'iterations_per_loop',
+        'num_shards',
+        'num_cores_per_replica',
+        'per_host_input_for_training',
+        'tpu_job_name',
+        'initial_infeed_sleep_secs',
+        'input_partition_dims',
+        'eval_training_input_configuration',
+    ])):
+  r"""TPU related configuration required by `TPUEstimator`.
+
+  Args:
+    iterations_per_loop: This is the number of train steps running in TPU
+      system before returning to CPU host for each `Session.run`. This means
+      global step is increased `iterations_per_loop` times in one `Session.run`.
+      It is recommended to be set as number of global steps for next checkpoint.
+    num_shards: (Deprecated, ignored by TPUEstimator).
+      The number of model replicas in the system. For non-model-parallelism
+      case, this number equals the total number of TPU cores. For
+      model-parallelism, the total number of TPU cores equals
+      num_cores_per_replica * num_shards.
+    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
+      An integer which describes the number of TPU cores per model replica. This
+      is required by model-parallelism which enables partitioning
+      the model to multiple cores. Currently num_cores_per_replica must be
+      1, 2, 4, or 8.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked once on each host. With the per-core input pipeline
+      configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
+      invoked once on host 0 and the tensors are broadcasted to all other
+      replicas. The batch size equals to train_batch_size`. With the per-core
+      input pipeline configuration, the shard batch size is also
+      `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
+    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
+      within TPUEstimator, however when using ClusterSpec propagation in more
+      esoteric cluster configurations, you may need to specify the job name as a
+      string.
+    initial_infeed_sleep_secs: The number of seconds the infeed thread should
+      wait before enqueueing the first batch. This helps avoid timeouts for
+      models that require a long compilation time.
+    input_partition_dims: A nested list to describe the partition dims
+      for all the tensors from input_fn(). The structure of
+      input_partition_dims must match the structure of `features` and
+      `labels` from input_fn(). The total number of partitions must match
+      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
+      images with shape [N, H, W, C] and labels [N].
+      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
+      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
+      to all the TPU cores since the partition dims is `None`.
+      Current limitations: This feature is only supported with the PER_HOST_V2
+      input mode.
+    eval_training_input_configuration: If `SLICED`, `input_fn` is only
+      invoked once on host 0 and the tensors are broadcasted to all other
+      replicas. Unlike per_host_input_for_training=BROADCAST, each replica will
+      only get a slice of the data instead of a whole copy. If `PER_HOST_V1`,
+      the behaviour is determined by per_host_input_for_training.
+
+    Raises:
+      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
+  """
+
+  def __new__(
+      cls,
+      iterations_per_loop=2,
+      num_shards=None,
+      num_cores_per_replica=None,
+      per_host_input_for_training=True,
+      tpu_job_name=None,
+      initial_infeed_sleep_secs=None,
+      input_partition_dims=None,
+      eval_training_input_configuration=InputPipelineConfig.PER_HOST_V1):
+
+    # Check iterations_per_loop.
+    util_lib.check_positive_integer(iterations_per_loop,
+                                    'TPUConfig iterations_per_loop')
+
+    # Check num_shards.
+    if num_shards is not None:
+      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+
+    if input_partition_dims is not None:
+      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
+        raise ValueError(
+            'input_partition_dims must be a list/tuple with one or two'
+            ' elements.')
+
+      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
+        raise ValueError(
+            'input_partition_dims is only supported in PER_HOST_V2 mode.')
+
+      if num_cores_per_replica is None:
+        raise ValueError(
+            'input_partition_dims requires setting num_cores_per_replica.')
+
+    # Check num_cores_per_replica
+    if num_cores_per_replica is not None:
+      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
+        raise ValueError(
+            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
+                str(num_cores_per_replica)))
+
+    if eval_training_input_configuration not in [
+        InputPipelineConfig.PER_HOST_V1, InputPipelineConfig.SLICED
+    ]:
+      raise ValueError(
+          'eval_training_input_configuration must be PER_HOST_V1 or SLICED;'
+          ' got {}'.format(str(eval_training_input_configuration)))
+
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
+    # Check initial_infeed_sleep_secs.
+    if initial_infeed_sleep_secs:
+      util_lib.check_positive_integer(initial_infeed_sleep_secs,
+                                      'TPUConfig initial_infeed_sleep_secs')
+
+    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
+
+    return super(TPUConfig, cls).__new__(
+        cls,
+        iterations_per_loop=iterations_per_loop,
+        num_shards=num_shards,
+        num_cores_per_replica=num_cores_per_replica,
+        per_host_input_for_training=per_host_input_for_training,
+        tpu_job_name=tpu_job_name,
+        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
+        input_partition_dims=input_partition_dims,
+        eval_training_input_configuration=eval_training_input_configuration)
+
+
+class RunConfig(run_config_lib.RunConfig):
+  """RunConfig with TPU support."""
+
+  def __init__(self,
+               tpu_config=None,
+               evaluation_master=None,
+               master=None,
+               cluster=None,
+               **kwargs):
+    """Constructs a RunConfig.
+
+    Args:
+      tpu_config: the TPUConfig that specifies TPU-specific configuration.
+      evaluation_master: a string. The address of the master to use for eval.
+        Defaults to master if not set.
+      master: a string. The address of the master to use for training.
+      cluster: a ClusterResolver
+      **kwargs: keyword config parameters.
+
+    Raises:
+      ValueError: if cluster is not None and the provided session_config has a
+        cluster_def already.
+    """
+    super(RunConfig, self).__init__(**kwargs)
+    self._tpu_config = tpu_config or TPUConfig()
+    self._cluster = cluster
+
+    # If user sets master and/or evaluation_master explicitly, including empty
+    # string '', take it. Otherwise, take the values set by parent class.
+    if master is not None:
+      if cluster is not None:
+        raise ValueError('Both master and cluster are set.')
+      self._master = master
+    else:
+      if cluster:
+        self._master = cluster.master()
+
+    if evaluation_master is not None:
+      self._evaluation_master = evaluation_master
+    elif (not self._evaluation_master and
+          self.task_type != run_config_lib.TaskType.EVALUATOR):
+      # If the task type is EVALUATOR, it means some cluster manager sets the
+      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
+      #
+      # Otherwise, it means user executes the code without external cluster
+      # manager. For that, we optimize the user experience by setting
+      # evaluation_master to master, unless user overwrites it.
+      self._evaluation_master = self._master
+
+    # Set the ClusterSpec to use
+    if cluster:
+      self._cluster_spec = cluster.cluster_spec()
+
+      # Merge the cluster_def into the ConfigProto.
+      if self._session_config is None:  # pylint: disable=access-member-before-definition
+        self._session_config = config_pb2.ConfigProto(
+            allow_soft_placement=True, isolate_session_state=True)
+      if self._session_config.HasField('cluster_def'):
+        raise ValueError(
+            'You cannot provide a ClusterResolver and '
+            'session_config.cluster_def.')
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
+
+  def _maybe_overwrite_session_config_for_distributed_training(self):
+    # Overrides the parent class session_config overwrite for between-graph. TPU
+    # runs with in-graph, which should not have device filter. Doing nothing
+    # ("pass") basically disables it.
+    pass
+
+  @property
+  def evaluation_master(self):
+    return self._evaluation_master
+
+  @property
+  def master(self):
+    return self._master
+
+  @property
+  def tpu_config(self):
+    return self._tpu_config
+
+  @property
+  def cluster(self):
+    return self._cluster
+
+  def replace(self, **kwargs):
+    if 'tpu_config' not in kwargs:
+      return super(RunConfig, self).replace(**kwargs)
+
+    tpu_config = kwargs.pop('tpu_config')
+    new_instance = super(RunConfig, self).replace(**kwargs)
+    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
+    return new_instance
+
+
+def _get_tpu_job_name_from_tf_config():
+  """Extracts the TPU job name from TF_CONFIG env variable."""
+  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
+  # spec propagation.
+  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
+  if tpu_job_name:
+    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
+  return tpu_job_name
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/python/tpu/tpu_config_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
rename to tensorflow/python/tpu/tpu_config_test.py
index b2fe0a688861503ae0bc55208f5dfc4d664419fd..22fb3032169851e5ee58d6b40bef52ece8593ba1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
+++ b/tensorflow/python/tpu/tpu_config_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import json
 
-from tensorflow.contrib.tpu.python.tpu import tpu_config as tpu_config_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_config as tpu_config_lib
 
 
 def _set_tf_config_env_variable(tf_config):
diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d25048842386576c5b5efb62178a86b5a99b25
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_context.py
@@ -0,0 +1,749 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from contextlib import contextmanager
+import copy
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import device_assignment as tpu_device_assignment
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+_NUM_CORES_TO_COMPUTATION_SHAPE = {
+    1: [1, 1, 1],
+    2: [1, 1, 2],
+    4: [1, 2, 2],
+    8: [2, 2, 2],
+    16: [4, 2, 2],
+}
+
+
+class TPUContext(object):
+  """A context that holds the current configuration of the TPU computation."""
+
+  def __init__(self,
+               internal_ctx,
+               input_device=None,
+               invocation_index=None,
+               call_from_input_fn=True):
+    self._internal_ctx = internal_ctx
+    self._input_device = input_device
+    self._invocation_index = invocation_index
+    self._call_from_input_fn = call_from_input_fn
+
+  def current_input_fn_deployment(self):
+    """The configuration of the current input_fn invocation.
+
+    The configuration depends on `TPUConfig.per_host_input_for_training`. See
+    `TPUConfig` for details.
+
+    Only set in params dict of input_fn
+
+    Returns:
+      A tuple of
+        1. Device spec string: String, is the current CPU host where the
+           input_fn is invoked.
+        2. Current invocation index: Int, 0-based index of the input_fn
+           invocation. See next item for details.
+        3. Total invocation count: Int, the total number of times to invoke the
+           input_fn on all CPU hosts. Each invocation will be passed with a new
+           `TPUContext` instance with current invocation index set properly.
+        4. Total number of replicas consumed by current_invocation: Int, the
+           number of replicas fed by the data returned by current input_fn. For
+           example, for per_core input pipeline deployment
+           and non-model-parallelism, total invocation count is equal to
+           the number of cores in the system and num replicas consumed by
+           current invocation is 1. For per-host v2 input pipeline deployment,
+           total invocation count is equal to the number of hosts in the system
+           and num replicas consumed by current invocation is equal to number of
+           cores per host.
+
+    Raises:
+      RuntimeError: If this method must not be called from input_fn.
+    """
+    if not self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' model_fn.')
+
+    if self._internal_ctx.is_input_sharded_per_core():
+      total_invocation_count = (self._internal_ctx.num_hosts
+                                * self._internal_ctx.num_of_replicas_per_host)
+      replicas_consumed = 1
+    elif self._internal_ctx.is_input_broadcast_with_iterators():
+      total_invocation_count = 1
+      replicas_consumed = self._internal_ctx.num_replicas
+    else:
+      total_invocation_count = self._internal_ctx.num_hosts
+      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
+    return (self._input_device, self._invocation_index,
+            total_invocation_count, replicas_consumed)
+
+  @property
+  def num_replicas(self):
+    """The total number of replicas.
+
+    For non-model-parallelism, num_replicas should be the total num of TPU
+    cores in the system.
+
+    Returns:
+      The number of replicas.
+    """
+    return self._internal_ctx.num_replicas
+
+  @property
+  def num_hosts(self):
+    """The number of hosts for the TPU system."""
+    return self._internal_ctx.num_hosts
+
+  @property
+  def current_host(self):
+    """The current host index for the TPU system."""
+    return self._invocation_index
+
+  @property
+  def num_of_replicas_per_host(self):
+    """The number of replicas for each host."""
+    if self._internal_ctx.model_parallelism_enabled:
+      raise ValueError(
+          'num_of_replicas_per_host is not supported for model_parallelism')
+    return self._internal_ctx.num_of_replicas_per_host
+
+  @property
+  def device_assignment(self):
+    """Returns device_assignment object."""
+    if self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' input_fn.')
+    return self._internal_ctx.device_assignment
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    # Note that: For the non-model parallelism, the mapping could be
+    # a random permutation. The order should not matter in most cases
+    # as far as model is replicated to all cores in the system.
+    return self._internal_ctx.device_for_replica(replica_id)
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function.
+
+    The place function takes host_id as the input and returns the TF device
+    for the correspoding host.
+    """
+
+    def _placement_function(host_id):
+      """Return the host device given host_id."""
+      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
+
+    return _placement_function
+
+
+class _InternalTPUContext(object):
+  """A context holds immutable states of TPU computation.
+
+  This immutable object holds TPUEstimator config, train/eval batch size, and
+  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
+  provides utility functions, based on the current state, to determine other
+  information commonly required by TPU computation, such as TPU device names,
+  TPU hosts, shard batch size, etc.
+
+  if eval_on_tpu is False, then execution of eval on TPU is disabled.
+  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
+  and TPU execution is disabled for all modes.
+
+  N.B. As `mode` is not immutable state in Estimator, but essential to
+  distinguish between TPU training and evaluation, a common usage for
+  _InternalTPUContext with `mode` is as follows:
+  ```
+  with _ctx.with_mode(mode) as ctx:
+    if ctx.is_running_on_cpu():
+       ...
+  ```
+  """
+
+  def __init__(self,
+               config,
+               train_batch_size,
+               eval_batch_size,
+               predict_batch_size,
+               use_tpu,
+               eval_on_tpu=True,
+               embedding_config_spec=None):
+    self._config = config
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
+    self._use_tpu = use_tpu
+    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
+    if not use_tpu and eval_on_tpu:
+      logging.warning('eval_on_tpu ignored because use_tpu is False.')
+
+    self._eval_on_tpu = eval_on_tpu
+    self._model_parallelism_enabled = (
+        use_tpu and config.tpu_config.num_cores_per_replica)
+    self._mode = None
+    num_cores_per_replica = config.tpu_config.num_cores_per_replica
+    if self._model_parallelism_enabled:
+      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
+          num_cores_per_replica]
+    else:
+      self._computation_shape = None
+    self._lazy_tpu_system_metadata_dict = {}  # key by master address
+    self._lazy_device_assignment_dict = {}  # key by master address
+    self._lazy_validation_dict = {}  # key by ModeKeys
+    self._embedding_config_spec = embedding_config_spec
+    self._lazy_embedding_config_dict = {}  # key by master address
+
+  def _assert_mode(self):
+    if self._mode is None:
+      raise RuntimeError(
+          '`mode` needs to be set via contextmanager `with_mode`.')
+    return self._mode
+
+  @contextmanager
+  def with_mode(self, mode):
+    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
+    # such as _lazy_tpu_system_metadata_dict between new copy and the original
+    # one. Note that all lazy states stored in properties _lazy_foo are sort of
+    # immutable as they should be same for the process lifetime.
+    new_ctx = copy.copy(self)
+    new_ctx._mode = mode  # pylint: disable=protected-access
+    yield new_ctx
+
+  @property
+  def mode(self):
+    return self._assert_mode()
+
+  def _get_master_address(self):
+    mode = self._assert_mode()
+    config = self._config
+    master = (
+        config.master
+        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
+    return master
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    cluster_def = None
+    if (self._config.session_config and
+        self._config.session_config.cluster_def.job):
+      cluster_def = self._config.session_config.cluster_def
+
+    # pylint: disable=protected-access
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(
+            master,
+            cluster_def=cluster_def,
+            query_topology=self.model_parallelism_enabled))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+  def _get_device_assignment(self):
+    """Gets the (maybe cached) TPU device assignment."""
+    master = self._get_master_address()
+    device_assignment = self._lazy_device_assignment_dict.get(master)
+    if device_assignment is not None:
+      return device_assignment
+
+    tpu_system_metadata = self._get_tpu_system_metadata()
+
+    device_assignment = tpu_device_assignment.device_assignment(
+        tpu_system_metadata.topology,
+        computation_shape=self._computation_shape,
+        num_replicas=self.num_replicas)
+
+    logging.info('num_cores_per_replica: %s',
+                 str(self._config.tpu_config.num_cores_per_replica))
+    logging.info('computation_shape: %s', str(self._computation_shape))
+    logging.info('num_replicas: %d', self.num_replicas)
+    logging.info('device_assignment.topology.device_coordinates: %s',
+                 str(device_assignment.topology.device_coordinates))
+    logging.info('device_assignment.core_assignment: %s',
+                 str(device_assignment.core_assignment))
+
+    self._lazy_device_assignment_dict[master] = device_assignment
+    return device_assignment
+
+  @property
+  def embedding_config(self):
+    """Returns the embedding config based on current mode."""
+    master = self._get_master_address()
+    if master in self._lazy_embedding_config_dict:
+      embedding_config = self._lazy_embedding_config_dict[master]
+    else:
+      embedding_config = None
+      if self._use_tpu and self._embedding_config_spec:
+        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
+            self._embedding_config_spec, self._train_batch_size,
+            self._eval_batch_size, self.num_hosts, self.num_cores, self.config)
+        if not embedding_config.has_embedding_tables():
+          embedding_config = None
+      self._lazy_embedding_config_dict[master] = embedding_config
+
+    if embedding_config is not None:
+      mode = self._assert_mode()
+      # Dynamically attach tpu_embedding based on mode. With
+      # this, we could keep embedding_config immutable but call site always
+      # accesses the unified API '.tpu_embedding'.
+      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
+    return embedding_config
+
+  @property
+  def model_parallelism_enabled(self):
+    return self._model_parallelism_enabled
+
+  @property
+  def input_partition_dims(self):
+    return self._config.tpu_config.input_partition_dims
+
+  @property
+  def device_assignment(self):
+    return (self._get_device_assignment()
+            if self._model_parallelism_enabled else None)
+
+  @property
+  def num_of_cores_per_host(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_of_cores_per_host
+
+  @property
+  def num_cores(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_cores
+
+  @property
+  def num_of_replicas_per_host(self):
+    """Return the number of replicas per host."""
+    if self.model_parallelism_enabled:
+      return self.num_replicas // self.num_hosts
+    else:
+      return self.num_of_cores_per_host
+
+  @property
+  def num_replicas(self):
+    num_cores_in_system = self.num_cores
+
+    if self.model_parallelism_enabled:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      if num_cores_per_replica > num_cores_in_system:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the total num of '
+            'TPU cores in the system. num_cores_per_replica: {}, num cores '
+            'in the system: {}'.format(num_cores_per_replica,
+                                       num_cores_in_system))
+
+      if num_cores_in_system % num_cores_per_replica != 0:
+        raise RuntimeError(
+            'The num of cores in the system ({}) is not divisible by the num '
+            'of cores ({}) required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
+                num_cores_in_system, num_cores_per_replica))
+
+      return num_cores_in_system // num_cores_per_replica
+    else:
+      return num_cores_in_system
+
+  @property
+  def num_hosts(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_hosts
+
+  @property
+  def config(self):
+    return self._config
+
+  def is_input_sharded_per_core(self):
+    """Return true if input_fn is invoked per-core (other than per-host)."""
+    mode = self._assert_mode()
+    return (mode == model_fn_lib.ModeKeys.TRAIN and
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
+
+  def is_input_broadcast_with_iterators(self):
+    """Return true if input_fn should be run in the full_replicae config."""
+    mode = self._assert_mode()
+    return ((self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.BROADCAST) or
+            (mode != model_fn_lib.ModeKeys.TRAIN and
+             self._config.tpu_config.eval_training_input_configuration is
+             tpu_config.InputPipelineConfig.SLICED))
+
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    This API also validates user provided configuration, such as batch size,
+    according the lazy initialized TPU system metadata.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distinguish these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
+
+    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
+    if not is_running_on_cpu:
+      self._validate_tpu_configuration()
+    return is_running_on_cpu
+
+  def _is_running_on_cpu(self, is_export_mode):
+    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+    mode = self._assert_mode()
+
+    if not self._use_tpu:
+      return True
+
+    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
+      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
+      return True
+
+    if is_export_mode:
+      return True
+
+    return False
+
+  @property
+  def global_batch_size(self):
+    mode = self._assert_mode()
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
+
+  @property
+  def batch_size_for_input_fn(self):
+    """Returns the shard batch size for `input_fn`."""
+    global_batch_size = self.global_batch_size
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
+      return global_batch_size // self.num_replicas
+    else:
+      return global_batch_size // self.num_hosts
+
+  @property
+  def batch_size_for_model_fn(self):
+    """Returns the shard batch size for `model_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU. always sharded per shard.
+    return global_batch_size // self.num_replicas
+
+  @property
+  def master_job(self):
+    """Returns the job name to use to place TPU computations on.
+
+    Returns:
+      A string containing the job name, or None if no job should be specified.
+
+    Raises:
+      ValueError: If the user needs to specify a tpu_job_name, because we are
+        unable to infer the job name automatically, or if the user-specified job
+        names are inappropriate.
+    """
+    run_config = self._config
+    # If the user specifies the tpu_job_name, use that.
+    if run_config.tpu_config.tpu_job_name:
+      return run_config.tpu_config.tpu_job_name
+
+    # The tpu job is determined by the run_config. Right now, this method is
+    # required as tpu_config is not part of the RunConfig.
+    mode = self._assert_mode()
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
+    cluster_def = (run_config.session_config.cluster_def
+                   if run_config.session_config else None)
+
+    return tpu_system_metadata_lib.master_job(master, cluster_def)
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function."""
+
+    master = self.master_job
+
+    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
+      """Return the host device given replica_id or host_id."""
+      assert _sentinal is None
+      if replica_id is not None and host_id is not None:
+        raise RuntimeError(
+            'replica_id and host_id can have only one non-None value.')
+
+      if master is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        if replica_id is not None:
+          if self.model_parallelism_enabled:
+            return self.device_assignment.host_device(
+                replica=replica_id, job=master)
+          else:
+            host_id = replica_id / self.num_of_cores_per_host
+
+        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
+    return _placement_function
+
+  @property
+  def tpu_device_placement_function(self):
+    """Returns a TPU device placement Fn."""
+    master = self.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    def _placement_function(i):
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_device(replica=i, job=master)
+      else:
+        num_of_cores_per_host = self.num_of_cores_per_host
+        host_id = i / num_of_cores_per_host
+        ordinal_id = i % num_of_cores_per_host
+        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
+
+    return _placement_function
+
+  def tpu_ordinal_function(self, host_id):
+    """Returns the TPU ordinal fn."""
+
+    def _tpu_ordinal_function(shard_index_in_host):
+      """Return the TPU ordinal associated with a shard.
+
+      Required because the enqueue ops are placed on CPU.
+
+      Args:
+        shard_index_in_host: the shard index
+
+      Returns:
+        The ordinal of the TPU device the shard's infeed should be placed on.
+      """
+      if self.model_parallelism_enabled:
+        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
+        replica = self.device_assignment.lookup_replicas(host_id,
+                                                         0)[shard_index_in_host]
+        return self.device_assignment.tpu_ordinal(replica=replica)
+      else:
+        return shard_index_in_host % self.num_of_cores_per_host
+
+    return _tpu_ordinal_function
+
+  def _validate_tpu_configuration(self):
+    """Validates the configuration based on the TPU system metadata."""
+    mode = self._assert_mode()
+    if self._lazy_validation_dict.get(mode):
+      return
+
+    # All following information is obtained from TPU system metadata.
+    num_cores = self.num_cores
+    num_replicas = self.num_replicas
+    num_hosts = self.num_hosts
+
+    if not num_cores:
+      tpu_system_metadata = self._get_tpu_system_metadata()
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system. Please double check '
+          'Tensorflow master address and TPU worker(s). Available devices '
+          'are {}.'.format(tpu_system_metadata.devices))
+
+    if self._config.tpu_config.num_shards:
+      user_provided_num_replicas = self._config.tpu_config.num_shards
+      if user_provided_num_replicas != num_replicas:
+        message = (
+            'TPUConfig.num_shards is not set correctly. According to TPU '
+            'system metadata for Tensorflow master ({}): num_replicas should '
+            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
+            'be the total num of TPU cores in the system. For '
+            'model-parallelism, the total number of TPU cores should be '
+            'num_cores_per_replica * num_replicas. Please set it '
+            'accordingly or leave it as `None`'.format(
+                self._get_master_address(), num_replicas,
+                user_provided_num_replicas))
+
+        raise ValueError(message)
+
+    if self._config.tpu_config.num_cores_per_replica:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
+      if num_cores_per_replica > num_cores_per_host:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the '
+            'num_cores_per_host. num_cores_per_replica: {}, '
+            'num_cores_per_host: {}'.format(num_cores_per_replica,
+                                            num_cores_per_host))
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      if (self._train_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'train batch size {} must be divisible by number of replicas {}'
+            .format(self._train_batch_size, num_replicas))
+
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      if self._eval_batch_size is None:
+        raise ValueError(
+            'eval_batch_size in TPUEstimator constructor cannot be `None`'
+            'if .evaluate is running on TPU.')
+      if (self._eval_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'eval batch size {} must be divisible by number of replicas {}'
+            .format(self._eval_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.evaluate should be running on single TPU'
+            ' instead of a Pod.')
+    else:
+      assert mode == model_fn_lib.ModeKeys.PREDICT
+      if self._predict_batch_size is None:
+        raise ValueError(
+            'predict_batch_size in TPUEstimator constructor should not be '
+            '`None` if .predict is running on TPU.')
+      if (self._predict_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'predict batch size {} must be divisible by number of replicas {}'
+            .format(self._predict_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.predict should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+
+    # Record the state "validated" into lazy dictionary.
+    self._lazy_validation_dict[mode] = True
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    master = self.master_job
+
+    if self.model_parallelism_enabled:
+      return (self.device_assignment.host_device(
+          replica=replica_id, job=master),
+              self.device_assignment.tpu_ordinal(replica=replica_id))
+
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    num_of_replicas_per_host = self.num_of_replicas_per_host
+    host_id = replica_id / num_of_replicas_per_host
+    ordinal_id = replica_id % num_of_replicas_per_host
+
+    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+    return (host_device, ordinal_id)
+
+
+class _OneCoreTPUContext(_InternalTPUContext):
+  """Special _InternalTPUContext for one core usage."""
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+
+    super(_OneCoreTPUContext, self).__init__(
+        config, train_batch_size, eval_batch_size,
+        predict_batch_size, use_tpu)
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
+            num_cores=1,
+            num_hosts=1,
+            num_of_cores_per_host=1,
+            topology=None,
+            devices=[]))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+
+def _get_tpu_context(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu, eval_on_tpu,
+                     embedding_config_spec):
+  """Returns an instance of `_InternalTPUContext`."""
+
+  if (config.tpu_config.num_shards == 1 and
+      config.tpu_config.num_cores_per_replica is None):
+    if embedding_config_spec is not None:
+      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
+                       'when embedding_config_spec is not None.')
+    logging.warning(
+        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
+        'Please fix as soon as possible (leaving num_shards as None.)')
+    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
+                              predict_batch_size, use_tpu)
+
+  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
+                             predict_batch_size, use_tpu, eval_on_tpu,
+                             embedding_config_spec)
diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d55bf10a0331108e607d3cd9fb29a8d31182a51
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding.py
@@ -0,0 +1,1105 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU embedding APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import math
+import re
+import six
+
+from tensorflow.core.protobuf.tpu import optimization_parameters_pb2
+from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2 as elc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.tpu.ops import tpu_ops
+
+TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
+INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
+
+
+class TableConfig(
+    collections.namedtuple(
+        'TableConfig',
+        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
+  """Embedding table configuration."""
+
+  def __new__(cls,
+              vocabulary_size,
+              dimension,
+              initializer=None,
+              combiner='mean'):
+    """Embedding table configuration.
+
+    Args:
+      vocabulary_size: Number of vocabulary (/rows) in the table.
+      dimension: The embedding dimension.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      combiner: A string specifying how to reduce if there are multiple entries
+        in a single row. Currently 'mean', 'sqrtn', 'sum' and None are
+        supported, with 'mean' the default. 'sqrtn' often achieves good
+        accuracy, in particular with bag-of-words columns. For more information,
+        see `tf.nn.embedding_lookup_sparse`. None is only valid for dense rather
+        than sparse tensors.
+
+    Returns:
+      `TableConfig`.
+
+    Raises:
+      ValueError: if `vocabulary_size` is not positive integer.
+      ValueError: if `dimension` is not positive integer.
+      ValueError: if `initializer` is specified and is not callable.
+      ValueError: if `combiner` is not supported.
+    """
+    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
+      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
+
+    if not isinstance(dimension, int) or dimension < 1:
+      raise ValueError('Invalid dimension {}.'.format(dimension))
+
+    if (initializer is not None) and (not callable(initializer)):
+      raise ValueError('initializer must be callable if specified.')
+    if initializer is None:
+      initializer = init_ops.truncated_normal_initializer(
+          mean=0.0, stddev=1 / math.sqrt(dimension))
+
+    if combiner not in ('mean', 'sum', 'sqrtn', None):
+      raise ValueError('Invalid combiner {}'.format(combiner))
+
+    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
+                                           initializer, combiner)
+
+
+AdamSlotVariableNames = collections.namedtuple(
+    'AdamSlotVariableNames', ['m', 'v'])
+
+AdagradSlotVariableName = collections.namedtuple(
+    'AdagradSlotVariableName', ['accumulator'])
+
+AdamSlotVariables = collections.namedtuple(
+    'AdamSlotVariables', ['m', 'v'])
+
+AdagradSlotVariable = collections.namedtuple(
+    'AdagradSlotVariable', ['accumulator'])
+
+VariablesAndOps = collections.namedtuple(
+    'VariablesAndOps',
+    ['embedding_variables_by_table', 'slot_variables_by_table',
+     'load_ops', 'retrieve_ops']
+)
+
+
+class _OptimizationParameters(object):
+  """Parameters common to all optimizations."""
+
+  def __init__(self, learning_rate, use_gradient_accumulation):
+    self.learning_rate = learning_rate
+    self.use_gradient_accumulation = use_gradient_accumulation
+
+
+class AdagradParameters(_OptimizationParameters):
+  """Optimization parameters for Adagrad."""
+
+  def __init__(self, learning_rate, initial_accumulator=0.1,
+               use_gradient_accumulation=True):
+    """Optimization parameters for Adagrad.
+
+    Args:
+      learning_rate: used for updating embedding table.
+      initial_accumulator: initial accumulator for Adagrad.
+      use_gradient_accumulation: setting this to `False` makes embedding
+        gradients calculation less accurate but faster. Please see
+        `optimization_parameters.proto` for details.
+        for details.
+    """
+    super(AdagradParameters, self).__init__(learning_rate,
+                                            use_gradient_accumulation)
+    if initial_accumulator <= 0:
+      raise ValueError('Adagrad initial_accumulator must be positive')
+    self.initial_accumulator = initial_accumulator
+
+
+class AdamParameters(_OptimizationParameters):
+  """Optimization parameters for Adam."""
+
+  def __init__(self, learning_rate,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-08,
+               lazy_adam=True,
+               sum_inside_sqrt=True,
+               use_gradient_accumulation=True):
+    """Optimization parameters for Adam.
+
+    Args:
+      learning_rate: a floating point value. The learning rate.
+      beta1: A float value.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability.
+      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
+        Please see `optimization_parameters.proto` for details.
+      sum_inside_sqrt: This improves training speed. Please see
+        `optimization_parameters.proto` for details.
+      use_gradient_accumulation: setting this to `False` makes embedding
+        gradients calculation less accurate but faster. Please see
+        `optimization_parameters.proto` for details.
+        for details.
+    """
+    super(AdamParameters, self).__init__(learning_rate,
+                                         use_gradient_accumulation)
+    if beta1 < 0. or beta1 >= 1.:
+      raise ValueError('beta1 must be between 0. and 1; got {}.'.format(beta1))
+    if beta2 < 0. or beta2 >= 1.:
+      raise ValueError('beta2 must be between 0. and 1; got {}.'.format(beta2))
+    if epsilon <= 0.:
+      raise ValueError('epsilon must be positive; got {}.'.format(epsilon))
+    if not use_gradient_accumulation and not lazy_adam:
+      raise ValueError(
+          'When disabling Lazy Adam, gradient accumulation must be used.')
+
+    self.beta1 = beta1
+    self.beta2 = beta2
+    self.epsilon = epsilon
+    self.lazy_adam = lazy_adam
+    self.sum_inside_sqrt = sum_inside_sqrt
+
+
+class StochasticGradientDescentParameters(_OptimizationParameters):
+  """Optimization parameters for stochastic gradient descent.
+
+  Args:
+    learning_rate: a floating point value. The learning rate.
+  """
+
+  def __init__(self, learning_rate):
+    super(StochasticGradientDescentParameters, self).__init__(
+        learning_rate, False)
+
+
+class TPUEmbedding(object):
+  """API for using TPU for embedding.
+
+    Example:
+    ```
+    table_config_user = tpu_embedding.TableConfig(
+        vocabulary_size=4, dimension=2,
+        initializer=initializer, combiner='mean')
+    table_to_config_dict = {'video': table_config_video,
+                          'user': table_config_user}
+    feature_to_table_dict = {'watched': 'video',
+                             'favorited': 'video',
+                             'friends': 'user'}
+    batch_size = 4
+    num_hosts = 1
+    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
+    mode = tpu_embedding.TRAINING
+    embedding = tpu_embedding.TPUEmbedding(
+        table_to_config_dict, feature_to_table_dict,
+        batch_size, num_hosts, mode, optimization_parameters)
+
+    batch_size_per_core = embedding.batch_size_per_core
+    sparse_features_list = []
+    for host in hosts:
+      with ops.device(host):
+        for _ in range(embedding.num_cores_per_host):
+          sparse_features = {}
+          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
+          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
+          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
+          sparse_features_list.append(sparse_features)
+
+    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
+    embedding_variables_and_ops = embedding.create_variables_and_ops()
+
+    def computation():
+      activations = embedding.get_activations()
+      loss = compute_loss(activations)
+
+      base_optimizer = gradient_descent.GradientDescentOptimizer(
+          learning_rate=1)
+      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
+          base_optimizer)
+
+      train_op = cross_shard_optimizer.minimize(loss)
+      gradients = (
+          tpu_embedding_gradient.get_gradients_through_compute_gradients(
+              cross_shard_optimizer, loss, activations)
+      send_gradients_op = embedding.generate_send_gradients_op(gradients)
+      with ops.control_dependencies([train_op, send_gradients_op]):
+        loss = array_ops.identity(loss)
+
+    loss = tpu.shard(computation,
+                     num_shards=embedding.num_cores)
+
+    with self.test_session() as sess:
+      sess.run(tpu.initialize_system(embedding_config=
+                                     embedding.config_proto))
+      sess.run(variables.global_variables_initializer())
+      sess.run(embedding_variables_and_ops.load_ops())
+      sess.run(enqueue_ops)
+      loss_val = sess.run(loss)
+    ```
+  """
+
+  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
+  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
+  # `FeatureConfig` could have fields other than table name. For example, it
+  # could have a field to indicate that the feature should not be used to
+  # update embedding table (cr/204852758, cr/204940540). Also, this can support
+  # different combiners for different features within the same table.
+  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
+  # to `FeatureConfig`?
+
+  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
+  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
+
+  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
+  # for-loops around construction of inputs.
+
+  # `optimization_parameter` applies to all tables. If the need arises,
+  # we can add `optimization_parameters` to `TableConfig` to override this
+  # global setting.
+  def __init__(self,
+               table_to_config_dict,
+               feature_to_table_dict,
+               batch_size,
+               mode,
+               master,
+               optimization_parameters=None,
+               cluster_def=None,
+               pipeline_execution_with_tensor_core=True):
+    """API for using TPU for embedding lookups.
+
+    Args:
+      table_to_config_dict: A dictionary mapping from string of table name to
+        `TableConfig`. Table refers to an embedding table, e.g. `params`
+        argument to `tf.nn.embedding_lookup_sparse()`.
+      feature_to_table_dict: A dictionary mapping from string of feature name
+        to string of table name. Feature refers to ids to lookup in embedding
+        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
+      batch_size: An `int` representing the global batch size.
+      mode: `TRAINING` or `INFERENCE`.
+      master: A `string` representing the TensorFlow master to use.
+      optimization_parameters: `AdagradParameters`, `AdamParameters`,
+        `Stochasticgradientdescentparameters`. Must be set in training and must
+        be `None` in inference.
+      cluster_def: A ClusterDef object describing the TPU cluster.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+
+    Raises:
+      ValueError: if any input is invalid.
+    """
+    _validate_table_to_config_dict(table_to_config_dict)
+    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
+    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
+
+    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
+    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
+    self._table_to_features_dict = _create_table_to_features_dict(
+        self._feature_to_table_dict)
+    self._combiners = _create_combiners(self._table_to_config_dict,
+                                        self._table_to_features_dict)
+
+    self._batch_size = batch_size
+
+    self._master = master
+    self._cluster_def = cluster_def
+    self._tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
+            self._master, cluster_def=self._cluster_def))
+    if self._tpu_system_metadata.num_cores == 0:
+      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
+                       'TPUs.'.format(self._master))
+    self._num_hosts = self._tpu_system_metadata.num_hosts
+    master_job_name = tpu_system_metadata_lib.master_job(self._master,
+                                                         self._cluster_def)
+    self._hosts = sorted([
+        device.name for device in self._tpu_system_metadata.devices
+        if 'device:CPU:' in device.name and (master_job_name is None or
+                                             master_job_name in device.name)])
+    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
+    self._num_cores = self._tpu_system_metadata.num_cores
+
+    _validate_batch_size(self._batch_size, self._num_cores)
+    self._batch_size_per_core = self._batch_size // self._num_cores
+
+    # TODO(shizhiw): remove `mode`?
+    if mode == TRAINING:
+      _validate_optimization_parameters(optimization_parameters)
+      self._optimization_parameters = optimization_parameters
+    elif mode == INFERENCE:
+      if optimization_parameters is not None:
+        raise ValueError('`optimization_parameters` should be `None` '
+                         'for inference mode.')
+      self._optimization_parameters = (
+          StochasticGradientDescentParameters(1.))
+    else:
+      raise ValueError('`mode` only supports {} and {}; got {}.'
+                       .format(TRAINING, INFERENCE, mode))
+    self._mode = mode
+
+    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
+    # and create special handler for inference that inherits from
+    # StochasticGradientDescentHandler with more user-friendly error message
+    # on get_slot().
+    self._optimizer_handler = _get_optimization_handler(
+        self._optimization_parameters)
+    self._pipeline_execution_with_tensor_core = (
+        pipeline_execution_with_tensor_core)
+
+    self._config_proto = self._create_config_proto()
+
+  @property
+  def hosts(self):
+    """A list of device names for CPU hosts.
+
+    Returns:
+      A list of device names for CPU hosts.
+    """
+    return copy.copy(self._hosts)
+
+  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
+  # to be consistent with `tpu_embedding_configuration.proto`.
+  @property
+  def num_cores_per_host(self):
+    """Number of TPU cores on a CPU host.
+
+    Returns:
+      Number of TPU cores on a CPU host.
+    """
+    return self._num_cores_per_host
+
+  @property
+  def num_cores(self):
+    """Total number of TPU cores on all hosts.
+
+    Returns:
+      Total number of TPU cores on all hosts.
+    """
+    return self._num_cores
+
+  @property
+  def batch_size_per_core(self):
+    """Batch size for each TPU core.
+
+    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
+       must have batch dimension equal to this.
+
+    Returns:
+      Batch size for each TPU core.
+    """
+    return self._batch_size_per_core
+
+  @property
+  def config_proto(self):
+    """Create embedding config proto for `tpu.initialize_system()`.
+
+    Returns:
+      an `TPUEmbeddingConfiguration` proto describing the desired
+         configuration of the hardware embedding lookup tables, which
+         is passed to `tpu.initialize_system()`.
+    """
+    return self._config_proto
+
+  @property
+  def table_to_config_dict(self):
+    return copy.copy(self._table_to_config_dict)
+
+  @property
+  def feature_to_table_dict(self):
+    return copy.copy(self._feature_to_table_dict)
+
+  @property
+  def table_to_features_dict(self):
+    return copy.copy(self._table_to_features_dict)
+
+  @property
+  def optimization_parameters(self):
+    return self._optimization_parameters
+
+  def _create_config_proto(self):
+    """Create `TPUEmbeddingConfiguration`."""
+    config_proto = elc.TPUEmbeddingConfiguration()
+    for table in self._table_to_config_dict:
+      table_descriptor = config_proto.table_descriptor.add()
+      table_descriptor.name = table
+
+      table_config = self._table_to_config_dict[table]
+      table_descriptor.vocabulary_size = table_config.vocabulary_size
+      table_descriptor.dimension = table_config.dimension
+
+      features_for_table = self._table_to_features_dict[table]
+      table_descriptor.num_features = len(features_for_table)
+
+      table_descriptor.optimization_parameters.learning_rate.constant = (
+          self._optimization_parameters.learning_rate)
+      table_descriptor.optimization_parameters.gradient_accumulation_status = (
+          optimization_parameters_pb2.GradientAccumulationStatus.ENABLED
+          if self._optimization_parameters.use_gradient_accumulation else
+          optimization_parameters_pb2.GradientAccumulationStatus.DISABLED)
+      self._optimizer_handler.set_optimization_parameters(table_descriptor)
+
+    config_proto.mode = self._mode
+    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
+    config_proto.num_hosts = self._num_hosts
+    config_proto.num_tensor_cores = self._num_cores
+    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+    config_proto.pipeline_execution_with_tensor_core = (
+        self._pipeline_execution_with_tensor_core)
+
+    return config_proto
+
+  def create_variables_and_ops(self, embedding_variable_name_by_table=None,
+                               slot_variable_names_by_table=None):
+    """Create embedding and slot variables, with ops to load and retrieve them.
+
+    Args:
+      embedding_variable_name_by_table: A dictionary mapping from string of
+        table name to string of embedding variable name. If `None`,
+        defaults from `get_default_slot_variable_names()` will be used.
+      slot_variable_names_by_table: A dictionary mapping from string of table
+        name to `AdamSlotVariableNames`, `AdagradSlotVariableNames` etc. If
+        `None`, defaults from `get_default_slot_variable_names()` will be used.
+
+    Returns:
+      `tpu_embedding.VariablesAndOps` with:
+        A dictionary mapping from string of table name to embedding variables,
+        A dictionary mapping from string of table name to AdagradSlotVariable,
+         AdamSlotVariables etc with slot variables,
+        A function which returns a list of ops to load embedding and slot
+         variables from TPU to CPU.
+        A function which returns a list of ops to retrieve embedding and slot
+         variables from TPU to CPU.
+    """
+    embedding_variables_by_table = {}
+    slot_variables_by_table = {}
+    load_op_fns = []
+    retrieve_op_fns = []
+    for table in self._table_to_config_dict:
+      if embedding_variable_name_by_table:
+        embedding_variable_name = embedding_variable_name_by_table[table]
+      else:
+        embedding_variable_name = table
+      if slot_variable_names_by_table:
+        slot_variable_names = slot_variable_names_by_table[table]
+      else:
+        slot_variable_names = (
+            self._optimizer_handler.get_default_slot_variable_names(table))
+
+      device_fn = _create_device_fn(self._hosts)
+      with ops.device(device_fn):
+        table_variables = _create_partitioned_variables(
+            name=embedding_variable_name,
+            num_hosts=self._num_hosts,
+            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
+            embedding_dimension=self._table_to_config_dict[table].dimension,
+            initializer=self._table_to_config_dict[table].initializer,
+            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+        embedding_variables_by_table[table] = table_variables
+
+        slot_variables_for_table, load_ops_fn, retrieve_ops_fn = (
+            self._optimizer_handler.create_variables_and_ops(
+                table, slot_variable_names, self._num_hosts,
+                self._table_to_config_dict[table], table_variables)
+        )
+        slot_variables_by_table[table] = slot_variables_for_table
+        load_op_fns.append(load_ops_fn)
+        retrieve_op_fns.append(retrieve_ops_fn)
+
+    def load_ops():
+      """Calls and returns the load ops for each embedding table.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_ops_list = []
+      for load_op_fn in load_op_fns:
+        load_ops_list.extend(load_op_fn())
+      return load_ops_list
+
+    def retrieve_ops():
+      """Calls and returns the retrieve ops for each embedding table.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_ops_list = []
+      for retrieve_op_fn in retrieve_op_fns:
+        retrieve_ops_list.extend(retrieve_op_fn())
+      return retrieve_ops_list
+
+    return VariablesAndOps(embedding_variables_by_table,
+                           slot_variables_by_table,
+                           load_ops, retrieve_ops)
+
+  def generate_enqueue_ops(self, sparse_features_list):
+    """Generate enqueue ops.
+
+    Args:
+      sparse_features_list: a list of dictionary mapping from string
+        of feature names to sparse tensor. Each dictionary is for one
+        TPU core. Dictionaries for the same host should be contiguous
+        on the list.
+
+    Returns:
+      Ops to enqueue to TPU for embedding.
+    """
+    self._validate_generate_enqueue_ops_sparse_features_list(
+        sparse_features_list)
+    return [
+        self._generate_enqueue_op(
+            sparse_features, device_ordinal=i % self._num_cores_per_host)
+        for i, sparse_features in enumerate(sparse_features_list)
+    ]
+
+  def _validate_generate_enqueue_ops_sparse_features_list(
+      self, sparse_features_list):
+    """Validate `sparse_features_list`."""
+    if len(sparse_features_list) != self._num_cores:
+      raise ValueError('Length of `sparse_features_list` should match the '
+                       'number of cores; '
+                       '`len(sparse_features_list)` is {}, '
+                       'number of cores is {}.'.format(
+                           len(sparse_features_list), self._num_cores))
+
+    feature_set = set(self._feature_to_table_dict.keys())
+    contiguous_device = None
+    for i, sparse_features in enumerate(sparse_features_list):
+      used_feature_set = set(sparse_features.keys())
+
+      # Check features are valid.
+      missing_feature_set = feature_set - used_feature_set
+      if missing_feature_set:
+        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, missing_feature_set))
+
+      extra_feature_set = used_feature_set - feature_set
+      if extra_feature_set:
+        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, extra_feature_set))
+
+      device = None
+      device_feature = None
+      for feature, tensor in six.iteritems(sparse_features):
+        combiner = self._table_to_config_dict[
+            self._feature_to_table_dict[feature]].combiner
+        if not isinstance(tensor, sparse_tensor.SparseTensor) and combiner:
+          raise ValueError('`sparse_features_list[{}]` has a feature that is '
+                           'not mapped to `SparseTensor` and has a combiner. '
+                           '`feature`: {}, combiner: {}'.format(
+                               i, feature, combiner))
+
+        # Check all features are on the same device.
+        if device is None:
+          device = tensor.op.device
+          device_feature = feature
+        else:
+          if device != tensor.op.device:
+            raise ValueError('Devices are different between features in '
+                             '`sparse_features_list[{}]`; '
+                             'devices: {}, {}; features: {}, {}.'.format(
+                                 i, device, tensor.op.device, feature,
+                                 device_feature))
+
+      if i % self._num_cores_per_host:
+        if device != contiguous_device:
+          raise ValueError('We expect the `sparse_features` which are on the '
+                           'same host to be contiguous in '
+                           '`sparse_features_list`, '
+                           '`sparse_features_list[{}]` is on device {}, '
+                           'but is expected to be on device {}.'.format(
+                               i, device, contiguous_device))
+      else:
+        contiguous_device = device
+
+  def _generate_enqueue_op(self, sparse_features, device_ordinal):
+    with ops.colocate_with(list(sparse_features.values())[0]):
+      sample_idcs, embedding_idcs, aggregation_weights, table_ids = (
+          self._format_for_tpu_embedding_sparse_tensor_batch(sparse_features))
+      return tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch(
+          sample_idcs,
+          embedding_idcs,
+          aggregation_weights,
+          table_ids,
+          device_ordinal=device_ordinal,
+          combiners=self._combiners)
+
+  def _format_for_tpu_embedding_sparse_tensor_batch(self, sparse_features):
+    """Format sparse features for `enqueue_tpu_embedding_sparse_tensor_batch()`.
+
+    Args:
+      sparse_features: a `Dict` of tensors for embedding. Can be sparse or
+      dense.
+
+    Returns:
+      Arguments for `enqueue_tpu_embedding_sparse_tensor_batch()`.
+    """
+
+    sample_idcs, embedding_idcs, aggregation_weights, table_ids = (
+        list(), list(), list(), list())
+    for table_id, table in enumerate(self._table_to_features_dict):
+      features = self._table_to_features_dict[table]
+      for feature in features:
+        tensor = sparse_features[feature]
+        if not isinstance(tensor, sparse_tensor.SparseTensor):
+          sample_idcs.append(array_ops.zeros([0], dtype=dtypes.int32))
+          embedding_idcs.append(tensor)
+        else:
+          sample_idcs.append(tensor.indices)
+          embedding_idcs.append(tensor.values)
+        aggregation_weights.append(array_ops.zeros([0]))
+        table_ids.append(table_id)
+
+    return sample_idcs, embedding_idcs, aggregation_weights, table_ids
+
+  def get_activations(self):
+    """Get activations for features.
+
+    This should be called within `computation` that is passed to
+      `tpu.replicate` and friends.
+
+    Returns:
+      A dictionary mapping from `String` of feature name to `Tensor`
+        of activation.
+    """
+    recv_activations = tpu_ops.recv_tpu_embedding_activations(
+        num_outputs=len(self._table_to_config_dict),
+        config=self._config_proto.SerializeToString())
+
+    activations = collections.OrderedDict()
+    for table_id, table in enumerate(self._table_to_features_dict):
+      features = self._table_to_features_dict[table]
+      for lookup_id, feature in enumerate(features):
+        stride = len(self._table_to_features_dict[table])
+        activations[feature] = recv_activations[table_id][lookup_id::stride, :]
+    return activations
+
+  def generate_send_gradients_op(self, feature_to_gradient_dict):
+    """Send gradient to TPU embedding.
+
+    Args:
+      feature_to_gradient_dict: dict mapping feature names to gradient wrt
+        activations.
+
+    Returns:
+      SendTPUEmbeddingGradients Op.
+
+    Raises:
+      RuntimeError: If `mode` is not `TRAINING`.
+    """
+    if self._mode != TRAINING:
+      raise RuntimeError('Only in training mode gradients need to '
+                         'be sent to TPU embedding; got mode {}.'
+                         .format(self._mode))
+    gradients = []
+    for table in self._table_to_features_dict:
+      features = self._table_to_features_dict[table]
+      table_gradients = [
+          feature_to_gradient_dict[feature] for feature in features
+      ]
+      interleaved_table_grads = array_ops.reshape(
+          array_ops.stack(table_gradients, axis=1),
+          [-1, table_gradients[0].shape[1]])
+      gradients.append(interleaved_table_grads)
+    return tpu_ops.send_tpu_embedding_gradients(
+        inputs=gradients, config=self.config_proto.SerializeToString())
+
+
+def _validate_table_to_config_dict(table_to_config_dict):
+  """Validate `table_to_config_dict`."""
+  for k, v in six.iteritems(table_to_config_dict):
+    if not isinstance(v, TableConfig):
+      raise ValueError('Value of `table_to_config_dict` must be of type '
+                       '`TableConfig`, got {} for {}.'.format(type(v), k))
+
+
+def _validate_feature_to_table_dict(table_to_config_dict,
+                                    feature_to_table_dict):
+  """Validate `feature_to_table_dict`."""
+  used_table_set = set(feature_to_table_dict.values())
+  table_set = set(table_to_config_dict.keys())
+
+  unused_table_set = table_set - used_table_set
+  if unused_table_set:
+    raise ValueError('`table_to_config_dict` specifies table that is not '
+                     'used in `feature_to_table_dict`: {}.'
+                     .format(unused_table_set))
+
+  extra_table_set = used_table_set - table_set
+  if extra_table_set:
+    raise ValueError('`feature_to_table_dict` refers to a table that is not '
+                     'specified in `table_to_config_dict`: {}.'
+                     .format(extra_table_set))
+
+
+def _validate_batch_size(batch_size, num_cores):
+  if batch_size % num_cores:
+    raise ValueError('`batch_size` is not a multiple of number of '
+                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
+                         batch_size, num_cores))
+
+
+def _validate_optimization_parameters(optimization_parameters):
+  if not isinstance(optimization_parameters, _OptimizationParameters):
+    raise ValueError('`optimization_parameters` must inherit from '
+                     '`_OptimizationPramaters`. '
+                     '`type(optimization_parameters)`={}'.format(
+                         type(optimization_parameters)))
+
+
+class _OptimizerHandler(object):
+  """Interface class for handling optimizer specific logic."""
+
+  def __init__(self, optimization_parameters):
+    self._optimization_parameters = optimization_parameters
+
+  def set_optimization_parameters(self, table_descriptor):
+    raise NotImplementedError()
+
+  def get_default_slot_variable_names(self, table):
+    raise NotImplementedError()
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    raise NotImplementedError()
+
+
+class _AdagradHandler(_OptimizerHandler):
+  """Handles Adagrad specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdagradHandler, self).__init__(optimization_parameters)
+    self._table_to_accumulator_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adagrad.SetInParent()
+
+  def get_default_slot_variable_names(self, table):
+    return AdagradSlotVariableName('{}/{}'.format(table, 'Adagrad'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    accumulator_initializer = init_ops.constant_initializer(
+        self._optimization_parameters.initial_accumulator)
+    accumulator_variables = _create_partitioned_variables(
+        name=slot_variable_names.accumulator,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=accumulator_initializer)
+    slot_variables = AdagradSlotVariable(accumulator_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adagrad_parameters(
+                  parameters=table_variable,
+                  accumulators=accumulator_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_accumulator = (
+              tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(accumulator_variable, retrieved_accumulator))
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _AdamHandler(_OptimizerHandler):
+  """Handles Adam specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdamHandler, self).__init__(optimization_parameters)
+    self._table_to_m_variables_dict = {}
+    self._table_to_v_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adam.beta1 = (
+        self._optimization_parameters.beta1)
+    table_descriptor.optimization_parameters.adam.beta2 = (
+        self._optimization_parameters.beta2)
+    table_descriptor.optimization_parameters.adam.epsilon = (
+        self._optimization_parameters.epsilon)
+    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
+        not self._optimization_parameters.lazy_adam)
+    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
+        self._optimization_parameters.sum_inside_sqrt)
+
+  def get_default_slot_variable_names(self, table):
+    return AdamSlotVariableNames('{}/{}/m'.format(table, 'Adam'),
+                                 '{}/{}/v'.format(table, 'Adam'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    m_initializer = init_ops.zeros_initializer()
+    m_variables = _create_partitioned_variables(
+        name=slot_variable_names.m,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=m_initializer)
+    v_initializer = init_ops.zeros_initializer()
+    v_variables = _create_partitioned_variables(
+        name=slot_variable_names.v,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=v_initializer)
+    slot_variables = AdamSlotVariables(m_variables, v_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adam_parameters(
+                  parameters=table_variable,
+                  momenta=m_variable,
+                  velocities=v_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for Adam embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_m, retrieved_v = (
+              tpu_ops.retrieve_tpu_embedding_adam_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(m_variable, retrieved_m),
+              state_ops.assign(v_variable, retrieved_v))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _StochasticGradientDescentHandler(_OptimizerHandler):
+  """Handles stochastic gradient descent specific logic."""
+
+  def set_optimization_parameters(self, table_descriptor):
+    (table_descriptor.optimization_parameters.stochastic_gradient_descent
+     .SetInParent())
+
+  def get_default_slot_variable_names(self, table):
+    return None
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    del table_config
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops
+              .load_tpu_embedding_stochastic_gradient_descent_parameters(
+                  parameters=table_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for SGD embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table = (
+              tpu_ops
+              .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return None, load_ops_fn, retrieve_ops_fn
+
+
+def _get_optimization_handler(optimization_parameters):
+  if isinstance(optimization_parameters, AdagradParameters):
+    return _AdagradHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, AdamParameters):
+    return _AdamHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
+    return _StochasticGradientDescentHandler(optimization_parameters)
+  else:
+    return NotImplementedError()
+
+
+def _create_ordered_dict(d):
+  """Create an OrderedDict from Dict."""
+  return collections.OrderedDict((k, d[k]) for k in sorted(d))
+
+
+def _create_combiners(table_to_config_dict, table_to_features_dict):
+  """Create a per feature list of combiners, ordered by table."""
+  combiners = []
+  for table in table_to_config_dict:
+    combiner = table_to_config_dict[table].combiner or 'sum'
+    combiners.extend([combiner] * len(table_to_features_dict[table]))
+  return combiners
+
+
+def _create_table_to_features_dict(feature_to_table_dict):
+  """Create mapping from table to a list of its features."""
+  table_to_features_dict_tmp = {}
+  for feature, table in six.iteritems(feature_to_table_dict):
+    if table in table_to_features_dict_tmp:
+      table_to_features_dict_tmp[table].append(feature)
+    else:
+      table_to_features_dict_tmp[table] = [feature]
+
+  table_to_features_dict = collections.OrderedDict()
+  for table in sorted(table_to_features_dict_tmp):
+    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
+  return table_to_features_dict
+
+
+def _create_device_fn(hosts):
+  """Create device_fn() to use with _create_partitioned_variables()."""
+
+  def device_fn(op):
+    """Returns the `device` for `op`."""
+    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
+
+    if part_match:
+      idx = int(part_match.group(1))
+    else:
+      raise RuntimeError('Internal Error: '
+                         'Expected %s to contain /part_*.' % op.name)
+
+    device = hosts[idx]
+    return device
+
+  return device_fn
+
+
+def _create_partitioned_variables(name,
+                                  num_hosts,
+                                  vocabulary_size,
+                                  embedding_dimension,
+                                  initializer,
+                                  collections=None):  # pylint: disable=redefined-outer-name
+  """Creates ParitionedVariables based on `num_hosts` for `table`."""
+  # TODO(shizhiw): automatically place embedding lookup elsewhere?
+  if vocabulary_size < num_hosts:
+    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
+                     'As TPU embedding is not optimized for small tables, '
+                     'please consider other ways for this embedding lookup.')
+
+  return list(variable_scope.get_variable(
+      name,
+      shape=(vocabulary_size, embedding_dimension),
+      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
+      dtype=dtypes.float32,
+      initializer=initializer,
+      collections=collections,
+      trainable=False))
diff --git a/tensorflow/python/tpu/tpu_embedding_gradient.py b/tensorflow/python/tpu/tpu_embedding_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7de661cc35ff5439f9ce5a88fc5642cdeb07daf
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding_gradient.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Optional helper for gradient handling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.tpu.ops import tpu_ops
+
+
+def get_gradients_through_compute_gradients(optimizer, loss, activations):
+  """Compute gradients to send to TPU embedding.
+
+  Args:
+    optimizer: a subclass of optimizer.Optimizer, usually CrossShardOptimizer.
+      Used to call compute_gradients().
+    loss: a Tensor to call optimizer.compute_gradients() on.
+    activations: an OrderedDict mapping feature_name to Tensors of activations.
+
+  Returns:
+    An OrderedDict mapping from feature name Strings to Tensors of gradients of
+      the loss wrt the activations of the features.
+  """
+  activation_list = activations.values()
+  grads_and_vars = optimizer.compute_gradients(loss, activation_list)
+  grads = [grad for grad, _ in grads_and_vars]
+  feature_to_gradient_dict = collections.OrderedDict(
+      zip(activations.keys(), grads))
+  return feature_to_gradient_dict
+
+
+def create_dummy_table_variables(tpu_embedding):
+  """Create dummy embedding table variables.
+
+  The sole purpose of these dummy variables are to trigger gradient
+  calcuation wrt them so that the gradients wrt activation can be captured
+  and later sent to TPU embedding.
+
+  Args:
+    tpu_embedding: TPUEmbedding, dummy table variables will be created for use
+      with tpu_embedding.
+
+  Returns:
+    A tuple of dummy variables and their initializer.
+
+  Raises:
+    RuntimeError: if collection to store gradients already exists and is not
+    empty.
+  """
+  dummy_table_variables = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_features_dict):
+    dummy_table_variables[table] = (
+        # Explicitly specifying collections prevents this variable from
+        # being added to the GLOBAL_VARIABLES collection, so that Saver()
+        # ignores it.
+        # But Tensorflow optimizer creates slot variable for these dummy
+        # variable, e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1},
+        # which will be in GLOBAL_VARIABLES collection,
+        variable_scope.get_variable(
+            'tpu_embedding_dummy_table_variable_{}'.format(table),
+            dtype=dtypes.float32,
+            shape=[1],
+            use_resource=True,
+            trainable=True,
+            collections=['tpu_embedding_dummy_table_variables']))
+
+    g = ops.get_default_graph()
+    table_gradients = g.get_collection_ref(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if table_gradients:
+      raise RuntimeError(
+          'tpu_embedding_gradients_table_{} is not empty.'.format(table_id))
+    table_gradients.extend(
+        [None] * len(tpu_embedding.table_to_features_dict[table]))
+
+  return (dummy_table_variables,
+          variables.variables_initializer(
+              dummy_table_variables.values(),
+              name='tpu_embedding_dummy_table_variables_init'))
+
+
+def hook_dummy_table_variables_to_activations(tpu_embedding, activations,
+                                              dummy_table_variables):
+  """Have activations depend on dummy table variables for gradient intercept.
+
+  Args:
+    tpu_embedding: TPUEmbedding, activations and dummy_table_variables are from
+      tpu_embedding.
+    activations: An OrderedDict of feature name String to activation tensors.
+    dummy_table_variables: An OrderedDict of table name String to dummy table
+      variables.
+
+  Returns:
+    An OrderedDict of feature name String to activation tensors, which can be
+      used just as the activations input.
+  """
+  new_activations = collections.OrderedDict()
+  for feature in activations:
+    table = tpu_embedding.feature_to_table_dict[feature]
+    new_activations[feature] = tpu_ops.tpu_embedding_activations(
+        dummy_table_variables[table],
+        activations[feature],
+        table_id=tpu_embedding.table_to_config_dict.keys().index(table),
+        lookup_id=tpu_embedding.table_to_features_dict[table].index(feature))
+  return new_activations
+
+
+def get_gradients_through_dummy_table_variables(tpu_embedding):
+  """Get gradients wrt the activations of each feature.
+
+  Args:
+    tpu_embedding: TPUEmbedding, create dummy table variable to be used with
+      tpu_embedding.
+
+  Returns:
+    An OrderedDict mapping feature name to gradient.
+
+  Raises:
+    ValueError: if some gradients are not defined.
+  """
+  g = ops.get_default_graph()
+  feature_to_gradient_dict = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_config_dict):
+    table_gradients = g.get_collection(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if any(gradient is None for gradient in table_gradients):
+      raise ValueError(
+          'Table {} with id {} has undefined gradients: this is probably '
+          'because the model asked TPUEmbedding to compute activations that '
+          'were not used.'.format(table, table_id))
+    for feature, gradient in zip(tpu_embedding.table_to_features_dict[table],
+                                 table_gradients):
+      feature_to_gradient_dict[feature] = gradient
+  return feature_to_gradient_dict
diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb323653ae8e1917be4fa4562f977e6ef37a7992
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_estimator.py
@@ -0,0 +1,3801 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPUEstimator class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import os
+import signal
+import sys
+import threading
+import time
+
+import numpy as np
+import six
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import variable_pb2
+from tensorflow.core.framework.summary_pb2 import Summary
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary import summary
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import error_handling
+from tensorflow.python.tpu import functional as tpu_functional
+from tensorflow.python.tpu import session_support
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_context
+from tensorflow.python.tpu import tpu_embedding_gradient
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import training_loop
+from tensorflow.python.tpu import util as util_lib
+from tensorflow.python.tpu._tpu_estimator_embedding import AdagradParameters  # pylint: disable=unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import StochasticGradientDescentParameters  # pylint: disable=unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import evaluation
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
+
+_INITIAL_LOSS = 1e7
+_ZERO_LOSS = 0.
+_TPU_ESTIMATOR = 'tpu_estimator'
+_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
+_BATCH_SIZE_KEY = 'batch_size'
+_CTX_KEY = 'context'
+_USE_TPU_KEY = 'use_tpu'
+_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
+_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
+_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
+
+# Ideally _USE_TPU_KEY should be reserved as well. However there are already
+# models that make use of this key, thus it can not be reserved now to prevent
+# breakage. In the long run, we would like to mitigate this by migrating models
+# off of using _USE_TPU_KEY.
+_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
+_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
+
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
+def _is_iterable(obj):
+  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
+  try:
+    iter(obj)
+    return True
+  except TypeError:
+    return False
+
+
+class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
+
+  def AddOp(self, op):
+    if op.type in [
+        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
+        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
+    ]:
+      raise ValueError('Use tf.contrib.summary inside of host_calls.')
+
+
+def _create_global_step(graph):
+  graph = graph or ops.get_default_graph()
+  if training.get_global_step(graph) is not None:
+    raise ValueError('"global_step" already exists.')
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        ops.GraphKeys.GLOBAL_STEP,
+        shape=[],
+        dtype=dtypes.int64,
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        use_resource=True,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
+
+
+def _create_or_get_iterations_per_loop():
+  """Creates or gets the iterations_per_loop variable.
+
+  In TPUEstimator, the user provided computation, the model_fn, is wrapped
+  inside a tf.while_loop for peak performance. The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each TPU
+  program execution and before the next TPU execution.
+
+  The purpose of using a variable, rather then a constant, is to allow
+  TPUEstimator adapt the TPU training iterations according to the final steps
+  specified by users. For example, if the user sets the iterations_per_loop as 4
+  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
+  variable will have the following value before each TPU training.
+
+      - 1-th TPU execution: iterations_per_loop = 4
+      - 2-th TPU execution: iterations_per_loop = 4
+      - 3-th TPU execution: iterations_per_loop = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multi iterations_per_loop variables were found.
+  """
+  graph = ops.get_default_graph()
+  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
+  iter_vars = graph.get_collection(collection_name)
+  if len(iter_vars) == 1:
+    return iter_vars[0]
+  elif len(iter_vars) > 1:
+    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
+
+  with ops.colocate_with(training_util.get_global_step()):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
+      return variable_scope.get_variable(
+          _ITERATIONS_PER_LOOP_VAR,
+          initializer=init_ops.zeros_initializer(),
+          shape=[],
+          dtype=dtypes.int32,
+          trainable=False,
+          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
+          use_resource=True)
+
+
+def _sync_variables_ops(ctx):
+  """Create varriables synchronization ops.
+
+  Gets the variables back from TPU nodes. This means the variables updated
+  by TPU will now be *synced* to host memory.
+  In BROADCAST mode, we skip this sync since the variables are ususally too
+  big to transmit via RPC.
+
+  Args:
+    ctx: A `_InternalTPUContext` instance with mode.
+
+  Returns:
+    A list of sync ops.
+  """
+
+  if not ctx.is_input_broadcast_with_iterators():
+    return [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+  else:
+    return [control_flow_ops.no_op()]
+
+
+def _increase_eval_step_op(iterations_per_loop):
+  """Returns an op to increase the eval step for TPU evaluation.
+
+  Args:
+    iterations_per_loop: Tensor. The number of eval steps running in TPU system
+      before returning to CPU host for each `Session.run`.
+
+  Returns:
+    An operation
+  """
+  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
+  # Estimator evaluate increases 1 by default. So, we increase the difference.
+  return state_ops.assign_add(
+      eval_step,
+      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
+      use_locking=True)
+
+
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
+class _SIGNAL(object):
+  """Signal used to control the thread of infeed/outfeed.
+
+  All preserved signals must be negative numbers. Positive numbers are used to
+  indicate the number of iterations for next training/evaluation loop.
+  """
+  NEXT_BATCH = -1
+  STOP = -2
+
+
+class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
+
+  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
+  `export_outputs`.
+
+  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
+  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
+  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
+  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
+  `tensors` usually specify the model logits, which are transferred back from
+  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
+  size is the first dimension. Once all tensors are available at CPU host from
+  all shards, they are concatenated (on CPU) and passed as positional arguments
+  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
+  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
+  name to the result of calling a metric function, namely a `(metric_tensor,
+  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
+  `eval_metrics`.
+
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with `tf.contrib.summary.create_file_writer`.
+  """
+
+  def __new__(cls,
+              mode,
+              predictions=None,
+              loss=None,
+              train_op=None,
+              eval_metrics=None,
+              export_outputs=None,
+              scaffold_fn=None,
+              host_call=None,
+              training_hooks=None,
+              evaluation_hooks=None,
+              prediction_hooks=None):
+    """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
+    if eval_metrics is not None:
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
+
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in training_hooks + evaluation_hooks + prediction_hooks:
+      if not isinstance(hook, session_run_hook.SessionRunHook):
+        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
+                        .format(hook))
+
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call,
+        training_hooks=training_hooks,
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
+
+  def as_estimator_spec(self):
+    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    loss = self.loss
+    if tensor_tracer.TensorTracer.is_enabled() \
+       and self.train_op is not None:
+      tt = tensor_tracer.TensorTracer()
+      loss = tt.trace_cpu(ops.get_default_graph(), loss, self.train_op)
+
+    hooks = tuple(hooks or [])
+    scaffold = self.scaffold_fn() if self.scaffold_fn else None
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold,
+        training_hooks=self.training_hooks + hooks,
+        evaluation_hooks=self.evaluation_hooks + hooks,
+        prediction_hooks=self.prediction_hooks + hooks)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
+        return
+      yield iterations
+
+  def join(self):
+    logging.info('Shutting down %s thread.', self._name)
+    self.stop()
+    self._thread.join()
+
+
+class _OpSignalOnceQueueContext(_OpQueueContext):
+  """Manages work queue and thread for a infeed/outfeed thread.
+
+  This subclass only signals once.
+  """
+
+  def __init__(self, name, target, args):
+    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
+    self._has_signaled = False
+
+  def send_next_batch_signal(self, iterations):
+    if not self._has_signaled:
+      self._queue.put(iterations)
+      self._has_signaled = True
+
+
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
+
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
+
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               tpu_compile_op,
+               run_infeed_loop_on_coordinator=True,
+               rendezvous=None,
+               master=None,
+               session_config=None,
+               tpu_init_ops=None):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
+    self._rendezvous = rendezvous
+    self._master = master
+    self._session_config = session_config
+    self._init_ops = list(tpu_init_ops or [])
+    if ctx.embedding_config is None:
+      self._embedding_layer_config = None
+    else:
+      self._embedding_layer_config = (
+          ctx.embedding_config.tpu_embedding.config_proto)
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
+
+    self._feed_error = None
+    self._finished = False
+    # When using model parallelism, the TPU is pre-initialized at startup to
+    # fetch mesh information.  We skip re-initializing it here to avoid
+    # suspected issues due to the mesh layout changing on the second
+    # initialization.
+    self._should_initialize_tpu = not ctx.model_parallelism_enabled
+    self._tpu_compile_op = tpu_compile_op
+
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    if self._should_initialize_tpu:
+      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+    else:
+      self._finalize_ops = []
+
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('Infeed thread sleeping for %d seconds.',
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('Infeed thread starting after sleep')
+
+    with self._rendezvous.catch_errors(source='infeed', session=session):
+      if self._run_infeed_loop_on_coordinator:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
+
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    with self._rendezvous.catch_errors(source='outfeed', session=session):
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpQueueContext(name=name, target=target, args=args)
+
+  def _assertCompilationSucceeded(self, result, coord):
+    proto = tpu_compilation_result.CompilationResultProto()
+    proto.ParseFromString(result)
+    if proto.status_error_message:
+      logging.error('Compilation failed: {}'.format(proto.status_error_message))
+      coord.request_stop()
+    else:
+      logging.info('Compilation succeeded')
+
+  def after_create_session(self, session, coord):
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(
+              tpu.initialize_system(
+                  job=self._master_job,
+                  embedding_config=self._embedding_layer_config))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
+
+    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
+      logging.info('Compiling user program: this may take a while...')
+      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
+
+    self._infeed_controller = self._create_infeed_controller(
+        name='InfeedController', target=self._run_infeed, args=(session,))
+
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
+
+    # Enable the worker watchdog to terminate workers on coordinator exit.
+    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
+    if watchdog_timeout > 0:
+      session_support.start_worker_watchdog(session,
+                                            shutdown_timeout=watchdog_timeout)
+
+  def before_run(self, run_context):
+    self._feed_error = None
+
+    iterations = run_context.session.run(self._iterations_per_loop_var)
+
+    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
+
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
+
+  def end(self, session):
+    self._finished = True
+    logging.info('Stop infeed thread controller')
+    self._infeed_controller.join()
+    self._rendezvous.record_done('infeed')
+
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
+    self._rendezvous.record_done('outfeed')
+
+    logging.info('Shutdown TPU system.')
+    session.run(self._finalize_ops)
+
+
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx,
+        enqueue_ops,
+        dequeue_ops,
+        tpu_compile_op=tpu_compile_op,
+        run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
+
+
+class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step.
+
+  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
+  following differences for TPU training:
+
+  1. This hook sets the variable for iterations_per_loop, which is used by
+     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
+     As the hook execution order is not guaranteed, the variable update is
+     handled in `after_create_session` and `after_run` as
+     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
+
+  2. For each training loop (session.run), the global step could be increased
+     multiple times on TPU. The global step tensor value will be explicitly read
+     again in `after_run` to ensure the latest value is retrieved to avoid race
+     condition.
+  """
+
+  def __init__(self, iterations, num_steps=None, last_step=None):
+    """Initializes a `StopAtStepHook`.
+
+    Args:
+      iterations: The number of iterations to run optimizer per training loop.
+      num_steps: Number of steps to execute.
+      last_step: Step after which to stop.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if num_steps is None and last_step is None:
+      raise ValueError('One of num_steps or last_step must be specified.')
+    if num_steps is not None and last_step is not None:
+      raise ValueError('Only one of num_steps or last_step can be specified.')
+    self._num_steps = num_steps
+    self._last_step = last_step
+    self._iterations = iterations
+
+  def _next_iterations(self, global_step, last_step):
+    gap = last_step - global_step
+    return min(gap, self._iterations)
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError('Global step should be created.')
+
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    if self._last_step is None:
+      self._last_step = global_step + self._num_steps
+
+    iterations = self._next_iterations(global_step, self._last_step)
+
+    self._iterations_per_loop_var.load(iterations, session=session)
+
+  def after_run(self, run_context, run_values):
+    # Global step cannot be retrieved via SessionRunArgs and before_run due to
+    # race condition.
+    global_step = run_context.session.run(self._global_step_tensor)
+    if global_step >= self._last_step:
+      run_context.request_stop()
+    else:
+      iterations = self._next_iterations(global_step, self._last_step)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
+
+
+class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step."""
+
+  def __init__(self, num_steps):
+    """Initializes a `_SetEvalIterationsHook`.
+
+    Args:
+      num_steps: Number of steps to execute.
+    """
+    self._num_steps = num_steps
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    self._iterations_per_loop_var.load(self._num_steps, session=session)
+
+
+class _StoppingPredictHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop according to the stopping signal in prediction."""
+
+  def __init__(self, scalar_stopping_signal):
+    self._scalar_stopping_signal = scalar_stopping_signal
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
+    # in side threads for prediction model. But it makes the
+    # TPUInfeedOutfeedSessionHook prints nice message.
+    self._iterations_per_loop_var.load(1, session=session)
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    scalar_stopping_signal = run_values.results
+    if _StopSignals.should_stop(scalar_stopping_signal):
+      # NOTE(xiejw): In prediction, stopping signals are inserted for each
+      # batch. And we append one more batch to signal the system it should stop.
+      # The data flow might look like
+      #
+      #  batch   0: images, labels, stop = 0  (user provided)
+      #  batch   1: images, labels, stop = 0  (user provided)
+      #  ...
+      #  batch  99: images, labels, stop = 0  (user provided)
+      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
+      #
+      # where the final batch (id = 100) is appended by TPUEstimator, so we
+      # should drop it before returning the predictions to user.
+      # To achieve that, we throw the OutOfRangeError in after_run. Once
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediction, i.e., batch with id=100, will be discarded
+      # immediately
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
+def generate_per_core_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
+  """Generates infeed enqueue ops for per-core input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A fn returns enqueue_ops."""
+    num_cores_per_host = ctx.num_of_cores_per_host
+    per_host_sharded_inputs = []
+    for core_ordinal in range(num_cores_per_host):
+      with ops.name_scope('ordinal_%d' % (core_ordinal)):
+        user_context = tpu_context.TPUContext(
+            internal_ctx=ctx,
+            input_device=host_device,
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
+        inputs = _Inputs.from_input_fn(input_fn(user_context))
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue
+
+
+def generate_per_host_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A Fn returning the TPU infeed enqueue ops.
+
+    By providing as a Fn, it can be invoked inside the tf.while_loop such that
+    the input pipeline for multiple iterations can be executed by one
+    Session.run call.
+
+    Returns:
+      list of dict of ops.
+    """
+    with ops.device(device):
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
+      unsharded_tensor_list = (
+          inputs_structure_recorder.flatten_features_and_labels(
+              features, labels, signals))
+
+      infeed_queue = tpu_feed.InfeedQueue(
+          tuple_types=[t.dtype for t in unsharded_tensor_list],
+          tuple_shapes=[t.shape for t in unsharded_tensor_list],
+          shard_dimensions=batch_axis)
+      captured_infeed_queue.capture(infeed_queue)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
+      per_host_enqueue_ops = (
+          infeed_queue.split_inputs_and_generate_enqueue_ops(
+              unsharded_tensor_list,
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function_impl))
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True,
+          num_invocations_per_step=ctx.num_of_replicas_per_host)
+
+    dataset_initializer = inputs.dataset_initializer()
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    sparse_features_list = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    cached_signals = None
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+          signals = inputs.signals()
+
+          # All the replicas share the replica 0's stopping singal.
+          # This avoids inconsistent state among different model replcias.
+          if cached_signals:
+            signals['stopping'] = cached_signals['stopping']
+          else:
+            cached_signals = signals
+
+        features, labels, sparse_features = (
+            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
+        sparse_features_list.append(sparse_features)
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels, signals))
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+      if inputs_structure_recorder.flattened_input_dims:
+        input_partition_dims = inputs_structure_recorder.flattened_input_dims
+        if signals:
+          input_partition_dims += [None] * len(signals)
+        # pylint: disable=protected-access
+        infeed_queue = tpu_feed._PartitionedInfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
+            host_id=host_id,
+            input_partition_dims=input_partition_dims,
+            device_assignment=ctx.device_assignment)
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs)
+      else:
+        infeed_queue = tpu_feed.InfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs,
+            tpu_ordinal_function=tpu_ordinal_function_impl)
+      captured_infeed_queue.capture(infeed_queue)
+
+    if ctx.embedding_config:
+      per_host_enqueue_ops.extend(
+          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
+              sparse_features_list))
+
+    if signals is None:
+      return per_host_enqueue_ops
+    else:
+      return {
+          'ops': per_host_enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
+                                      num_hosts):
+  """Generates infeed enqueue ops for one input_fn on all the hosts."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+  device_0 = ctx.tpu_host_placement_function(host_id=0)
+  with ops.device(device_0):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device_0, invocation_index=0)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+
+  def tpu_ordinal_function_impl(replica_id):
+    if ctx.device_assignment:
+      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
+    else:
+      return replica_id % num_replicas_per_host
+
+  def device_function_impl(replica_id):
+    return ctx.tpu_host_placement_function(replica_id=replica_id)
+
+  def enqueue_ops_fn():
+    """Generates enqueue ops for all the hosts."""
+    broadcasted_inputs = []
+    flattened_inputs = None  # Cache result from input_fn.
+    signals = None
+    num_replicas = ctx.num_replicas
+    core_id = 0
+    for host_id in xrange(num_hosts):
+      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+        for _ in xrange(ctx.num_of_replicas_per_host):
+          # Note: input_fn is only called once at host 0 for the first replica.
+          # The features and labels returned from that invocation are
+          # broadcasted to other replicas(including the replicas on other
+          # hosts).
+          if flattened_inputs is None:
+            features, labels = inputs.features_and_labels()  # Calls get_next()
+            signals = inputs.signals()
+
+            inputs_structure_recorder.validate_and_record_structure(
+                features, labels)
+            flattened_inputs = (
+                inputs_structure_recorder.flatten_features_and_labels(
+                    features, labels, signals))
+            if (ctx.config.tpu_config.eval_training_input_configuration is
+                tpu_config.InputPipelineConfig.SLICED):
+              input_slices = [
+                  array_ops.split(x, num_replicas) for x in flattened_inputs
+              ]
+          if (ctx.config.tpu_config.eval_training_input_configuration is
+              tpu_config.InputPipelineConfig.SLICED):
+            # for each core, slice out the flattened_inputs for each core.
+            broadcasted_inputs.append([x[core_id] for x in input_slices])
+            core_id += 1
+          else:
+            broadcasted_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(broadcasted_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    enqueue_ops = infeed_queue.generate_enqueue_ops(
+        broadcasted_inputs,
+        tpu_ordinal_function=tpu_ordinal_function_impl,
+        placement_function=device_function_impl)
+
+    if signals is None:
+      return enqueue_ops
+    else:
+      return {
+          'ops': enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+class _InputPipeline(object):
+  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
+
+  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
+  call site.  To be precise, based on the configuration in
+  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
+  multi-host TPU training) or for one host (usually for single-host TPU
+  evaluation), and sends all `features` and `labels` returned by `input_fn` to
+  TPU infeed. For per-core invocation, `features` and `labels` are piped to
+  infeed directly, one tuple for each core. For per-host invocation,  `features`
+  and `labels` are split at host (with respect to `batch_axis`) and piped to all
+  cores accordingly.
+
+  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
+  inputs returned by the `input_fn` can have one of the following forms:
+  1. features
+  2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
+
+  Internally, form 1 is reformed to `(features, None)` as features and labels
+  are passed separately to underlying methods. For TPU training, TPUEstimator
+  may expect multiple `features` and `labels` tuples one for each core.
+
+  TPUEstimator allows various different structures for inputs (namely `features`
+  and `labels`).  Both `features` and `labels` can be any nested sturcture
+  supported by TF nest (namely, dict, tuples, namedtuples or any nested
+  structure of such of Tensors).  `labels` could be `None` as well.
+
+  These are flattened before they are passed to the infeed/outfeed library
+  as that expectes flattend lists.
+  """
+
+  class InputsStructureRecorder(object):
+    """The recorder to record inputs structure."""
+
+    def __init__(self, input_partition_dims=None):
+      # Holds the structure of inputs
+      self._feature_structure = {}
+      self._flattened_input_dims = None
+
+      if input_partition_dims:
+        # This should have been validated in TPUConfig.
+        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
+        if len(input_partition_dims) == 2:
+          self._feature_dims, self._label_dims = input_partition_dims
+        else:
+          self._feature_dims = input_partition_dims[0]
+          self._label_dims = None
+
+        assert self._feature_dims is not None, ('input_partition_dims[0] must '
+                                                'not be None')
+      else:
+        self._feature_dims = None
+        self._label_dims = None
+
+      # Internal state.
+      self._initialized = False
+
+    @property
+    def flattened_input_dims(self):
+      assert self._initialized, 'InputsStructureRecorder is not initialized.'
+      return self._flattened_input_dims
+
+    def has_labels(self):
+      return 'labels' in self._feature_structure
+
+    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
+                            label_dims_names, label_names, has_labels):
+      """Flatten input dims with the same order as flattened input tensors."""
+      flattened_input_dims = []
+      if feature_dims_names:
+        # We need a fixed ordering for matching the tensors in features.
+        flattened_input_dims.extend(
+            [feature_dims[name] for name in feature_dims_names])
+      else:
+        flattened_input_dims.append(feature_dims)
+
+      if label_dims_names:
+        # We need a fixed ordering for matching the tensors in labels.
+        flattened_input_dims.extend(
+            [label_dims[name] for name in label_dims_names])
+      else:
+        if label_names:
+          num_tensors_in_label = len(label_names)
+        else:
+          num_tensors_in_label = int(has_labels)
+        # Setting `None` in input_partition_dims[1] will apply `None` to
+        # all the tensors in labels, regardless of internal structure.
+        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
+
+      return flattened_input_dims
+
+    def validate_and_record_structure(self, features, labels):
+      """Validates and records the structure of `features` and `labels`."""
+      # Extract structure.
+      has_labels = labels is not None
+      feature_names = _extract_key_names(features)
+      label_names = _extract_key_names(labels)
+
+      if not self._initialized:
+        # Record structure.
+        self._initialized = True
+        if self._feature_dims is not None:
+          feature_dims_names = _extract_key_names(self._feature_dims)
+          if feature_dims_names != feature_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[0] mismatched feature'
+                ' keys. Expected {}, got {}'.format(feature_names,
+                                                    feature_dims_names))
+
+          label_dims_names = _extract_key_names(self._label_dims)
+          if self._label_dims is not None and label_dims_names != label_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[1] mismatched label'
+                ' keys. Expected {}, got {}'.format(label_names,
+                                                    label_dims_names))
+
+          self._flattened_input_dims = self._flatten_input_dims(
+              self._feature_dims, feature_dims_names, self._label_dims,
+              label_dims_names, label_names, has_labels)
+
+    def flatten_features_and_labels(self, features, labels, signals=None):
+      """Flattens the `features` and `labels` to a single tensor list."""
+      self._feature_structure['features'] = features
+      if labels is not None:
+        self._feature_structure['labels'] = labels
+      if signals is not None:
+        self._feature_structure['signals'] = signals
+      return data_nest.flatten(self._feature_structure)
+
+    def unflatten_features_and_labels(self, flattened_inputs):
+      """Restores the flattened inputs to original features and labels form.
+
+      Args:
+        flattened_inputs: Flattened inputs for each shard.
+
+      Returns:
+        A tuple of (`features`, `labels`), where `labels` could be None.
+        Each one, if present, should have identical structure (single tensor vs
+        dict) as the one returned by input_fn.
+
+      Raises:
+        ValueError: If the number of expected tensors from `flattened_inputs`
+          mismatches the recorded structure.
+      """
+
+      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
+                                                      flattened_inputs)
+      return _Inputs(
+          unflattened_inputs['features'],
+          unflattened_inputs.get('labels'),
+          signals=unflattened_inputs.get('signals'))
+
+  def __init__(self, input_fn, batch_axis, ctx):
+    """Constructor.
+
+    Args:
+      input_fn: input fn for train or eval.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards.
+      ctx: A `_InternalTPUContext` instance with mode.
+
+    Raises:
+      ValueError: If both `sharded_features` and `num_cores` are `None`.
+    """
+    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
+        ctx.input_partition_dims)
+
+    self._sharded_per_core = ctx.is_input_sharded_per_core()
+    self._input_fn = input_fn
+    self._infeed_queue = None
+    self._ctx = ctx
+    self._batch_axis = batch_axis
+
+  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
+    """Generates infeed enqueue ops and dequeue_fn."""
+    # While tf.while_loop is called, the body function, which invokes
+    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
+    # structure is recorded.
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
+
+    self._validate_input_pipeline()
+
+    def dequeue_fn():
+      """dequeue_fn is used by TPU to retrieve the tensors."""
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
+      # The unflatten process uses the structure information recorded above.
+      return self._inputs_structure_recorder.unflatten_features_and_labels(
+          values)
+
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
+
+  def _invoke_input_fn_and_record_structure(self):
+    """Deploys the input pipeline and record input structure."""
+    enqueue_ops = []
+    infeed_queues = []
+    all_dataset_initializers = []
+    num_hosts = self._ctx.num_hosts
+    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
+    if self._sharded_per_core:
+      # Per-Core input pipeline deployment.
+      # Invoke input pipeline for each core and placed on the corresponding
+      # host.
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            enqueue_ops_fn, captured_infeed_queue = (
+                generate_per_core_enqueue_ops_fn_for_host(
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    host_device, host_id))
+
+            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
+            infeed_queues.append(captured_infeed_queue.get())
+
+    elif self._ctx.is_input_broadcast_with_iterators():
+      # Only calls input_fn in host 0.
+      host_device = tpu_host_placement_fn(host_id=0)
+      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
+                                            self._inputs_structure_recorder,
+                                            num_hosts))
+      if dataset_initializer:
+        all_dataset_initializers.append(dataset_initializer)
+        run_infeed_loop_on_coordinator = False
+        wrap_fn = (
+            _wrap_computation_in_while_loop
+            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+            _wrap_computation_in_while_loop_with_stopping_signals)
+        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+      else:
+        enqueue_ops.append(enqueue_ops_fn())
+      infeed_queues.append(captured_infeed_queue.get())
+    else:
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if dataset_initializer:
+              all_dataset_initializers.append(dataset_initializer)
+              run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
+              enqueue_ops.append(
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            infeed_queues.append(captured_infeed_queue.get())
+    # infeed_queue is used to generate dequeue ops. The only thing it uses for
+    # dequeue is dtypes and types. So, any one can be used. Here, grab the
+    # first one.
+    self._infeed_queue = infeed_queues[0]
+    return enqueue_ops, [
+        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
+    ], run_infeed_loop_on_coordinator
+
+  def _validate_input_pipeline(self):
+    """Validates the input pipeline.
+
+    Perform some sanity checks to log user friendly information. We should
+    error out to give users better error message. But, if
+    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    user code, so, log a warning.
+
+    Raises:
+      RuntimeError: If the validation failed.
+    """
+    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
+      err_msg = ('Input pipeline contains one or more QueueRunners. '
+                 'It could be slow and not scalable. Please consider '
+                 'converting your input pipeline to use `tf.data` instead (see '
+                 'https://www.tensorflow.org/guide/datasets for '
+                 'instructions.')
+      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+        raise RuntimeError(err_msg)
+      else:
+        logging.warn(err_msg)
+
+
+def call_computation(computation,
+                     experimental_exported_model_uses_all_cores=True):
+  """Call computation.
+
+  computation uses a single-core for TPU inference. If
+  `experimental_exported_model_uses_all_cores` is `True`, this function will
+  round-robin
+  computation among all TPU cores visible to the host; otherwise, it will use
+  a single core.
+
+  Args:
+    computation: A Python function that takes no inputs and builds computation
+      graph. If `computation` returns m outputs, this function will return a
+      list of m Tensors.
+    experimental_exported_model_uses_all_cores: Whether to round-robin among all
+      cores visible to the host, or to use a single core.
+
+  Returns:
+    A list of output tensors.
+  """
+  if experimental_exported_model_uses_all_cores:
+    # Using `TPUPartitionedCall` makes it possible to target a different
+    # TPU core with every `Session.run()` call. Note that the entire inference
+    # graph executes on a single core, and that invocations of this graph
+    # will round-robin among the cores attached to a host.
+    @function.Defun(capture_resource_var_by_value=False)
+    def tpu_subgraph():
+      return computation()
+
+    return tpu_functional.TPUPartitionedCall(
+        args=tpu_subgraph.captured_inputs,
+        device_ordinal=tpu_ops.tpu_ordinal_selector(),
+        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
+        f=tpu_subgraph)
+  else:
+    return computation()
+
+
+class _ModelFnWrapper(object):
+  """A `model_fn` wrapper.
+
+  This makes calling model_fn on CPU and TPU easier and more consistent and
+  performs necessary check and mutation required by TPU training and evaluation.
+
+  In addition, this wrapper manages converting the `model_fn` to a single TPU
+  train and eval step.
+  """
+
+  def __init__(self, model_fn, config, params, ctx):
+    self._model_fn = model_fn
+    self._config = config
+    self._params = params
+    self._ctx = ctx
+
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
+
+  def _add_embedding_features(self, features, hook_dummy_table_variables):
+    """Add embedding features, optionally add hook to intercept gradient."""
+    if self._ctx.embedding_config:
+      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+      embedding_activations = tpu_embedding_.get_activations()
+      if hook_dummy_table_variables:
+        new_embedding_activations = (
+            tpu_embedding_gradient.hook_dummy_table_variables_to_activations(
+                tpu_embedding_, embedding_activations,
+                self._ctx.embedding_config.dummy_table_variables))
+        features.update(new_embedding_activations)
+      else:
+        features.update(embedding_activations)
+
+  def convert_to_single_tpu_train_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single train step on TPU.
+
+    The user provided `model_fn` takes input tuple
+    (features, labels) and produces the EstimatorSpec with train_op and loss for
+    train `mode`. This usually represents a single train computation on CPU.
+
+    For TPU training, a train (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input should be taken from TPU infeed rather
+    than input pipeline (input_fn) directly. To fit TPU loop and replicate
+    pattern, the original train computation should be reformed, which is the
+    returned `train_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      representing the train step for TPU.
+    """
+
+    host_call = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_training_hooks = _CapturedObject()
+
+    def train_step(loss):
+      """Training step function for use inside a while loop."""
+      del loss  # unused; required in function signature.
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, True)
+
+      estimator_spec = self._verify_estimator_spec(
+          self._call_model_fn(features, labels))
+      loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
+      captured_training_hooks.capture(estimator_spec.training_hooks)
+
+      if self._ctx.embedding_config is None:
+        apply_sparse_grads = []
+      else:
+        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+        gradients = (
+            tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
+                tpu_embedding_)
+        )
+        apply_sparse_grads = [
+            tpu_embedding_.generate_send_gradients_op(gradients)
+        ]
+
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
+      with ops.control_dependencies([train_op] + apply_sparse_grads):
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
+            and estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return array_ops.identity(loss)
+
+    return (train_step, host_call, captured_scaffold_fn,
+            captured_training_hooks)
+
+  def convert_to_single_tpu_eval_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single eval step on TPU.
+
+    Similar to training, the user provided `model_fn` takes input tuple
+    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
+    eval `mode`. This usually represents a single evaluation computation on CPU.
+
+    For TPU evaluation, a eval (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input and output are slightly different. Input,
+    features and labels, should be taken from TPU infeed rather than input
+    pipeline (input_fn) directly. Output is managed in two stages.  First, the
+    model outputs as the result of evaluation computation, usually model logits,
+    should be transferred from TPU system to CPU. Then, all model outputs are
+    concatenated first on CPU and sent to the metric_fn for metrics computation.
+    To fit TPU evaluation pattern, the original eval computation should be
+    reformed, which is the returned `eval_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      representing the eval step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_eval_hooks = _CapturedObject()
+
+    def eval_step(total_loss):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, False)
+
+      tpu_estimator_spec = self._call_model_fn(features, labels)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU evaluation must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      loss = tpu_estimator_spec.loss
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
+      to_record = {}
+      if tpu_estimator_spec.eval_metrics:
+        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update global step, so we don't wrap
+        # this host_call.
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return math_ops.add(total_loss, loss)
+
+    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn representing the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_predict_hooks = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return (predict_step, host_calls, captured_scaffold_fn,
+            captured_predict_hooks)
+
+  def _verify_tpu_spec_predictions(self, predictions):
+    """Validates TPUEstimatorSpec.predictions dict."""
+    # TODO(xiejw): Adds validation for prediction dictionrary.
+    # TODO(xiejw): Adds support for single tensor as predictions.
+    if not isinstance(predictions, dict):
+      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+
+    for (key, tensor) in predictions.items():
+      if tensor.shape.dims[0].value is None:
+        raise ValueError(
+            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
+            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
+    return predictions
+
+  def _validate_model_features_and_labels(self, features, labels,
+                                          is_export_mode):
+    """Validates that the features and labels for the model function are valid.
+
+    A valid features/labels object is the one with:
+    - Type: A tensor or any nested structure of tensors supported by TF nest,
+        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
+    - Static shape if is_export_mode is False.
+
+    Args:
+      features: the features that would be input to the model function.
+      labels: the labels that would be input to the model function.
+      is_export_mode: boolean value specifying if in export mode.
+
+    Raises:
+      TypeError: If features/labels are not of the correct type.
+      ValueError: If features/labels have dynamic shape.
+    """
+
+    def validate(obj, obj_name):
+      """Helper validate function."""
+      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
+        return
+      if isinstance(obj, ops.Tensor):
+        if not obj.get_shape().is_fully_defined():
+          raise ValueError(
+              'The {} to the model returned by input_fn must have static shape.'
+              ' Tensor: {}'.format(obj_name, obj))
+      else:
+        for tensor in data_nest.flatten(obj):
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                ('The {} to the model returned by input_fn must have static '
+                 'shape. Tensor: {}').format(obj_name, tensor))
+
+    validate(features, 'features')
+    if labels is not None:
+      validate(labels, 'labels')
+
+  def _call_model_fn(self, features, labels, is_export_mode=False):
+    """Calls the model_fn with required parameters."""
+    self._validate_model_features_and_labels(features, labels, is_export_mode)
+    model_fn_args = function_utils.fn_args(self._model_fn)
+    kwargs = {}
+
+    # Makes deep copy with `config` and params` in case user mutates them.
+    config = copy.deepcopy(self._config)
+    params = copy.deepcopy(self._params)
+
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    elif labels is not None:
+      raise ValueError(
+          'model_fn does not take labels, but input_fn returns labels.')
+    if 'mode' in model_fn_args:
+      kwargs['mode'] = self._ctx.mode
+    if 'config' in model_fn_args:
+      kwargs['config'] = config
+    if 'params' in model_fn_args:
+      kwargs['params'] = params
+
+    if 'params' not in model_fn_args:
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
+
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
+    if batch_size_for_model_fn is not None:
+      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
+
+    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
+    # In export mode, params['use_tpu'] has already been set based on mode
+    # (i.e. True for _REWRITE_FOR_INFERENCE_MODE, False otherwise).
+    if not is_export_mode:
+      _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
+
+    if not running_on_cpu:
+      user_context = tpu_context.TPUContext(
+          internal_ctx=self._ctx, call_from_input_fn=False)
+      _add_item_to_params(params, _CTX_KEY, user_context)
+
+    estimator_spec = self._model_fn(features=features, **kwargs)
+    if (running_on_cpu and
+        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
+      # The estimator_spec will be passed to `Estimator` directly, which expects
+      # type `EstimatorSpec`.
+      return estimator_spec.as_estimator_spec()
+    else:
+      return estimator_spec
+
+  def _verify_estimator_spec(self, estimator_spec):
+    """Validates the estimator_spec."""
+    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+      return estimator_spec
+
+    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
+    if estimator_spec.training_chief_hooks:
+      raise ValueError(
+          err_msg.format('training_chief_hooks') + 'If you want' +
+          ' to pass training hooks, please pass via training_hooks.')
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
+    return estimator_spec
+
+
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
+
+  def __init__(self, ctx):
+    self._ctx = ctx
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
+
+  @staticmethod
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
+        fn_args = function_utils.fn_args(host_call[0])
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
+
+  @staticmethod
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
+
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
+
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
+
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
+
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
+
+    Returns:
+      A dictionary mapping name to the return type of the host_call by that
+      name.
+
+    Raises:
+      RuntimeError: If outfeed tensor is scalar.
+    """
+    if not self._names:
+      return {}
+
+    ret = {}
+    # For each i, dequeue_ops[i] is a list containing the tensors from all
+    # shards. This list is concatenated later.
+    dequeue_ops = []
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each replica's first logical core. Note: we must
+    # constraint it such that we have at most one outfeed dequeue and enqueue
+    # per replica.
+    for i in xrange(self._ctx.num_replicas):
+      host_device, ordinal_id = self._ctx.device_for_replica(i)
+      with ops.device(host_device):
+        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
+            dtypes=tensor_dtypes,
+            shapes=tensor_shapes,
+            device_ordinal=ordinal_id)
+        for j, item in enumerate(outfeed_tensors):
+          dequeue_ops[j].append(item)
+
+    # Deconstruct dequeue ops.
+    flat_dequeue_ops = []
+    for l in dequeue_ops:
+      flat_dequeue_ops.extend(l)
+
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
+                                              len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    def _call_host_fn(fn, *args, **kw):
+      context = CatchInvalidHostcallFunctions()
+      context.Enter()
+      result = fn(*args, **kw)
+      context.Exit()
+      context.ExitResult(result)
+      return result
+
+    # It is assumed evaluation always happens on single host TPU system. So,
+    # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
+    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Make the specification of the outfeed combinaton
+          # function more explicit and well-documented.  We may want to give the
+          # user the option of concatenating along any axis.
+          if (self._ctx.config.tpu_config.per_host_input_for_training is
+              tpu_config.InputPipelineConfig.BROADCAST):
+            # If the infeed is in BROADCAST mode (each core recieving the same
+            # input), then we assume that the cores also produce identical
+            # copies of the same output, and we simply take the output from
+            # the first core.  This mode is used by Mesh-TensorFlow.
+            with ops.control_dependencies(dequeue_ops[i]):
+              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
+          else:
+            # Assume that the input has been batch-split and that axis 0 of the
+            # output tensors represents the batch size.  Concatenate along
+            # the axis 0 to re-combine the batch.
+            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise
+        else:
+          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
+
+    # force all dequeue operations to be run if not consumed by the host calls
+    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
+    return ret
+
+
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
+
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
+
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
+
+
+class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
+  """Calculate and report global_step/sec and examples/sec during runtime."""
+
+  def __init__(self,
+               batch_size,
+               every_n_steps=100,
+               every_n_secs=None,
+               output_dir=None,
+               summary_writer=None):
+    self._batch_size = batch_size
+    super(ExamplesPerSecondHook, self).__init__(
+        every_n_steps=every_n_steps,
+        every_n_secs=every_n_secs,
+        output_dir=output_dir,
+        summary_writer=summary_writer)
+
+  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
+    global_step_per_sec = elapsed_steps / elapsed_time
+    examples_per_sec = self._batch_size * global_step_per_sec
+    if self._summary_writer is not None:
+      global_step_summary = Summary(value=[
+          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
+      ])
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
+      ])
+      self._summary_writer.add_summary(global_step_summary, global_step)
+      self._summary_writer.add_summary(example_summary, global_step)
+    logging.info('global_step/sec: %g', global_step_per_sec)
+    logging.info('examples/sec: %g', examples_per_sec)
+
+
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
+
+
+class TPUEstimator(estimator_lib.Estimator):
+  """Estimator with TPU support.
+
+  TPUEstimator also supports training on CPU and GPU. You don't need to define
+  a separate `tf.estimator.Estimator`.
+
+  TPUEstimator handles many of the details of running on TPU devices, such as
+  replicating inputs and models for each core, and returning to host
+  periodically to run hooks.
+
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+
+  - For training, `model_fn` gets per-core batch size; `input_fn` may get
+    per-core or per-host batch size depending on `per_host_input_for_training`
+    in `TPUConfig` (See docstring for TPUConfig for details).
+
+  - For evaluation and prediction, `model_fn` gets per-core batch size and
+    `input_fn` get per-host batch size.
+
+  Evaluation
+  ==========
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation. If eval_on_tpu is False, the evaluation will execute on
+  CPU or GPU; in this case the following discussion on TPU evaluation does not
+  apply.
+
+  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
+  `tensors` could be a list of any nested structure of `Tensor`s (See
+  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
+  a dict from metric string name to the result of calling a metric function,
+  namely a `(metric_tensor, update_op)` tuple.
+
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
+  Current limitations:
+  --------------------
+
+  1. TPU evaluation only works on a single host (one TPU worker) except
+     BROADCAST mode.
+
+  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
+     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
+     batches should have the same size.
+
+  Example (MNIST):
+  ----------------
+
+  ```
+  # The metric Fn which runs on CPU.
+  def metric_fn(labels, logits):
+    predictions = tf.argmax(logits, 1)
+    return {
+      'accuracy': tf.metrics.precision(
+          labels=labels, predictions=predictions),
+    }
+
+  # Your model Fn which runs on TPU (eval_metrics is list in this example)
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, [labels, logits]))
+
+  # or specify the eval_metrics tensors as dict.
+  def model_fn(features, labels, mode, config, params):
+    ...
+    final_layer_output = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, {
+              'labels': labels,
+              'logits': final_layer_output,
+          }))
+  ```
+
+  Prediction
+  ==========
+
+  Prediction on TPU is an experimental feature to support large batch inference.
+  It is not designed for latency-critical system. In addition, due to some
+  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
+  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
+  convenient.
+
+  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
+  *should* raise an end-of-input exception (`OutOfRangeError` or
+  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
+  precise, the ops created by `input_fn` produce one batch of the data.
+  The `predict()` API processes one batch at a time. When reaching the end of
+  the data source, an end-of-input exception should be raised by one of these
+  operations. The user usually does not need to do this manually. As long as the
+  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
+  exception automatically after the last batch has been produced.
+
+  Note: Estimator.predict returns a Python generator. Please consume all the
+  data from the generator so that TPUEstimator can shutdown the TPU system
+  properly for user.
+
+  Current limitations:
+  --------------------
+  1. TPU prediction only works on a single host (one TPU worker).
+
+  2. `input_fn` must return a `Dataset` instance rather than `features`. In
+  fact, .train() and .evaluate() also support Dataset as return value.
+
+  Example (MNIST):
+  ----------------
+  ```
+  height = 32
+  width = 32
+  total_examples = 100
+
+  def predict_input_fn(params):
+    batch_size = params['batch_size']
+
+    images = tf.random_uniform(
+        [total_examples, height, width, 3], minval=-1, maxval=1)
+
+    dataset = tf.data.Dataset.from_tensor_slices(images)
+    dataset = dataset.map(lambda images: {'image': images})
+
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def model_fn(features, labels, params, mode):
+     # Generate predictions, called 'output', from features['image']
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          predictions={
+              'predictions': output,
+              'is_padding': features['is_padding']
+          })
+
+  tpu_est = TPUEstimator(
+      model_fn=model_fn,
+      ...,
+      predict_batch_size=16)
+
+  # Fully consume the generator so that TPUEstimator can shutdown the TPU
+  # system.
+  for item in tpu_est.predict(input_fn=input_fn):
+    # Filter out item if the `is_padding` is 1.
+    # Process the 'predictions'
+  ```
+
+  Exporting
+  =========
+
+  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
+  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
+  At serving time, these tags are used to select metagraph to load.
+
+  Before running the graph on TPU, TPU system needs to be initialized. If
+  TensorFlow Serving model-server is used, this is done automatically. If
+  not, please call `session.run(tpu.initialize_system())`.
+
+  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
+  `model_fn`.
+
+  Example:
+  ----------------
+
+  ```
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+    export_outputs = {
+      'logits': export_output_lib.PredictOutput(
+        {'logits': logits})
+    }
+
+    def host_call(logits):
+      class_ids = math_ops.argmax(logits)
+      classes = string_ops.as_string(class_ids)
+      export_outputs['classes'] =
+        export_output_lib.ClassificationOutput(classes=classes)
+
+    tpu.outside_compilation(host_call, logits)
+
+    ...
+  ```
+
+  """
+
+  def __init__(self,
+               model_fn=None,
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               export_to_cpu=True,
+               warm_start_from=None,
+               experimental_exported_model_uses_all_cores=False,
+               experimental_export_device_assignment=False,
+               experimental_embedding_config_spec=None):
+    """Constructs an `TPUEstimator` instance.
+
+    Args:
+      model_fn: Model function as required by `Estimator` which returns
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
+        and `prediction_hooks` must not capure any TPU Tensor inside the
+        model_fn.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model. If `None`, the model_dir in
+        `config` will be used if set. If both are set, they must be same. If
+        both are `None`, a temporary directory will be used.
+      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
+      params: An optional `dict` of hyper parameters that will be passed into
+        `input_fn` and `model_fn`.  Keys are names of parameters, values are
+        basic python types. There are reserved keys for `TPUEstimator`,
+        including 'batch_size'.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
+        TPU training and evaluation respect this bit, but eval_on_tpu can
+        override execution of eval. See below. - Predict still happens on CPU.
+      train_batch_size: An int representing the global training batch size.
+        TPUEstimator transforms this global batch size to a per-shard batch
+        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
+        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
+        number of replicas.
+      eval_batch_size: An int representing evaluation batch size. Must be
+        divisible by total number of replicas.
+      predict_batch_size: An int representing the prediction batch size. Must be
+        divisible by total number of replicas.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards. For example, if your input_fn produced (images, labels)
+        where the images tensor is in `HWCN` format, your shard dimensions would
+        be [3, 0], where 3 corresponds to the `N` dimension of your images
+        Tensor, and 0 corresponds to the dimension along which to split the
+        labels to match up with the corresponding images. If None is supplied,
+        and per_host_input_for_training is True, batches will be sharded based
+        on the major dimension. If tpu_config.per_host_input_for_training is
+        False or `PER_HOST_V2`, batch_axis is ignored.
+      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
+        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
+      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on TPU. Note that unsupported export modes such as EVAL will be
+        ignored. For those modes, only a CPU model will be exported.
+        Currently, export_to_tpu only supports PREDICT.
+      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on CPU.
+      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
+        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
+        configure warm-starting.  If the string filepath is provided instead of
+        a `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
+      experimental_exported_model_uses_all_cores: Whether to round-robin among
+        all cores visible to the host which is serving the saved model, or to
+        use a single core. This is a temporary flag to enable using all TPU
+        cores for inference with TPUPartitionedCall(). Once outside compilation
+        is supported in TPUPartitionedCall(), this flag will be enabled by
+        default.
+      experimental_export_device_assignment: Whether to include the device
+        assignment in the exported model. Doing so is useful in case of model
+        parallel inference but will tie the exported model to the TPU topology
+        used to export the model.
+      experimental_embedding_config_spec: Optional EmbeddingConfigSpec instance
+        to support using TPU embedding. IT IS STILL WORK IN PROGRESS, SO PLEASE
+        DO NOT USE.
+
+    Raises:
+      ValueError: `params` has reserved keys already.
+    """
+    if config is None or not isinstance(config, tpu_config.RunConfig):
+      raise ValueError(
+          '`config` must be provided with type `tpu_config.RunConfig`')
+
+    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
+
+    if use_tpu:
+      # Perform some very basic validations. More validations will be found in
+      # _InternalTPUContext.
+      if train_batch_size is None:
+        raise ValueError('`train_batch_size` cannot be `None`')
+      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
+
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
+          config.tpu_config.num_cores_per_replica):
+        raise ValueError(
+            'Model parallelism only supports per host input for training. '
+            'Please adjust TPURunconfig.per_host_input_for_training.')
+
+      if eval_batch_size is not None:
+        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
+
+      if predict_batch_size is not None:
+        util_lib.check_positive_integer(predict_batch_size,
+                                        'predict_batch_size')
+
+    # Verifies the model_fn signature according to Estimator framework.
+    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
+    # We cannot store config and params in this constructor as parent
+    # constructor might change them, such as assigning a temp dir for
+    # config.model_dir.
+    model_function = self._augment_model_fn(model_fn, batch_axis)
+
+    # Overwrite log_step_count_steps to disable TensorLoggingHook and
+    # StepCounterHook from being created in Estimator. TPUEstimator already
+    # added equivalent hooks in _augment_model_fn above.
+    self._log_every_n_steps = config.log_step_count_steps
+    config = config.replace(log_step_count_steps=None)
+
+    # Passing non-None params as wrapped model_fn has it.
+    params = params or {}
+    super(TPUEstimator, self).__init__(
+        model_fn=model_function,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        warm_start_from=warm_start_from)
+    self._iterations_per_training_loop = (
+        self._config.tpu_config.iterations_per_loop)
+
+    # All properties passed to _InternalTPUContext are immutable.
+    # pylint: disable=protected-access
+    self._ctx = tpu_context._get_tpu_context(
+        self._config, train_batch_size, eval_batch_size, predict_batch_size,
+        use_tpu, eval_on_tpu, experimental_embedding_config_spec)
+
+    self._export_to_cpu = export_to_cpu
+    self._export_to_tpu = export_to_tpu
+    self._experimental_exported_model_uses_all_cores = (
+        experimental_exported_model_uses_all_cores)
+    self._experimental_export_device_assignment = (
+        experimental_export_device_assignment)
+    if (experimental_exported_model_uses_all_cores and
+        experimental_export_device_assignment):
+      raise ValueError('experimental_exported_model_uses_all_cores and '
+                       'experimental_export_device_assignment is not supported '
+                       'at the same time.')
+
+    self._is_input_fn_invoked = None
+    self._rendezvous = {}
+
+  def _add_meta_graph_for_mode(self,
+                               builder,
+                               input_receiver_fn_map,
+                               checkpoint_path,
+                               save_variables=True,
+                               mode=model_fn_lib.ModeKeys.PREDICT,
+                               export_tags=None,
+                               check_variables=True):
+    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
+      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
+                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
+                      'for TPU.'.format(mode))
+
+    if not self._export_to_cpu and not self._export_to_tpu:
+      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
+
+    if self._export_to_cpu:
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
+      input_receiver_fn_map = {
+          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
+      }
+      export_tags = [tag_constants.SERVING, tag_constants.TPU]
+      mode = _REWRITE_FOR_INFERENCE_MODE
+
+      # See b/110052256 for why `check_variables` is `False`.
+      if not self._export_to_cpu:
+        check_variables = save_variables = True
+      else:
+        check_variables = save_variables = False
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables=save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+  def _call_model_fn(self, features, labels, mode, config):
+    if mode == _REWRITE_FOR_INFERENCE_MODE:
+      return self._call_model_fn_for_inference(features, labels, mode, config)
+    else:
+      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
+                                                      config)
+
+  def _call_model_fn_for_inference(self, features, labels, mode, config):
+    """Wraps `_call_model_fn` for `export_savedmodel`."""
+    if mode != _REWRITE_FOR_INFERENCE_MODE:
+      raise ValueError('mode must be {}; '
+                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
+
+    computation, capture = self._build_computation_for_inference(
+        features, labels, mode, config)
+    tensors = call_computation(
+        computation,
+        experimental_exported_model_uses_all_cores=self
+        ._experimental_exported_model_uses_all_cores)
+    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
+        capture.get())
+    predictions_list = tensors[:len(predictions_dict)]
+    export_outputs_list_without_none = tensors[len(predictions_dict):]
+
+    # Reinsert `None`s which we've taken out in
+    # `_build_computation_for_inference()`.
+    export_outputs_list = []
+    while none_indices or export_outputs_list_without_none:
+      if none_indices and none_indices[0] == len(export_outputs_list):
+        export_outputs_list.append(None)
+        none_indices.pop(0)
+      else:
+        export_outputs_list.append(export_outputs_list_without_none.pop(0))
+
+    # Reconstruct `export_outputs` with updated tensors.
+    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
+                                                    export_outputs_list)
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_export_outputs_dict))
+    # Reconstruct `predictions` with updated tensors.
+    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
+    if (len(new_predictions) == 1 and
+        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
+      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
+
+    return estimator_spec._replace(
+        export_outputs=new_export_outputs, predictions=new_predictions)
+
+  def _build_computation_for_inference(self, features, labels, mode, config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Computation to be passed to `TPUPartitionedCall()`."""
+      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
+          features, labels, mode, config)
+
+      if self._experimental_export_device_assignment:
+        # Export the device assignment as part of the model. This is useful for
+        # model parallel usecases where the model relies on the mapping between
+        # logical and physical devices.
+        with self._ctx.with_mode(mode) as ctx:
+          device_assignment = ctx.device_assignment
+      else:
+        device_assignment = None
+
+      if self._experimental_exported_model_uses_all_cores:
+        tensors_on_cpu = tpu.rewrite(
+            tpu_computation, device_assignment=device_assignment)
+        tpu.prune_unconnected_ops_from_xla(ops.get_default_graph())
+      else:
+        tensors_on_cpu = tpu.rewrite_for_inference(
+            tpu_computation, device_assignment=device_assignment)
+
+      (estimator_spec, export_outputs_dict, export_outputs_list,
+       predictions_dict) = (
+           tpu_capture.get())
+      predictions_list = tensors_on_cpu[:len(predictions_dict)]
+      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
+
+      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
+      # with their CPU counterpart returned from `rewrite_for_inference()`.
+      # `function.Defun()` does not like `None`s in return values, so we leave
+      # `None`s out but record their positions for later reconstruction.
+      export_outputs_list_without_none = []
+      none_indices = []
+      for i, t in enumerate(export_outputs_list):
+        if t is None:
+          none_indices.append(i)
+        else:
+          export_outputs_list_without_none.append(
+              export_outputs_tpu_on_cpu_list.pop(0))
+
+      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
+                       none_indices))
+      return predictions_list + export_outputs_list_without_none
+
+    return computation, capture
+
+  def _build_tpu_computation_for_inference(self, features, labels, mode,
+                                           config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Compute tpu tensors used in export_outputs.
+
+      Passed to rewrite_for_inference so that model_fn will be called under
+      the rewriting contexts. Only tpu tensors are returned, but export_outputs
+      and scaffold are captured.
+
+      Returns:
+         A list of Tensors used in export_outputs and not marked for
+         outside_compilation.
+      """
+      # We should only call model fn once and it should be inside `computation`
+      # so that building the graph will happen under `rewrite_for_inference`.
+      estimator_spec = super(TPUEstimator, self)._call_model_fn(
+          features, labels, mode, config)
+
+      # We pick the TPU tensors out from `export_output` and later return them
+      # from `computation` for rewriting.
+      export_outputs_dict = collections.OrderedDict(
+          (k, _export_output_to_tensors(v))
+          for k, v in six.iteritems(estimator_spec.export_outputs))
+      export_outputs_list = nest.flatten(export_outputs_dict)
+      export_outputs_tpu_list = [
+          t for t in export_outputs_list if t is not None
+      ]
+
+      if isinstance(estimator_spec.predictions, dict):
+        predictions_dict = collections.OrderedDict(
+            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
+      else:
+        predictions_dict = {
+            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
+        }
+      predictions_list = nest.flatten(predictions_dict)
+
+      # We cannot return everything we want through the return values, so
+      # capture the rest here for later use.
+      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
+                       predictions_dict))
+      return predictions_list + export_outputs_tpu_list
+
+    return computation, capture
+
+  def _create_global_step(self, graph):
+    """Creates a global step suitable for TPUs.
+
+    Args:
+      graph: The graph in which to create the global step.
+
+    Returns:
+      A global step `Tensor`.
+
+    Raises:
+      ValueError: if the global step tensor is already defined.
+    """
+    return _create_global_step(graph)
+
+  def _convert_train_steps_to_hooks(self, steps, max_steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
+            steps, max_steps)
+
+    # On TPU.
+    if steps is None and max_steps is None:
+      raise ValueError(
+          'For TPU training, one of `steps` or `max_steps` must be set. '
+          'Cannot be both `None`.')
+
+    # Estimator.train has explicit positiveness check.
+    if steps is not None:
+      util_lib.check_positive_integer(steps, 'Train steps')
+    if max_steps is not None:
+      util_lib.check_positive_integer(max_steps, 'Train max_steps')
+
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
+
+  def _convert_eval_steps_to_hooks(self, steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
+
+    if steps is None:
+      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
+
+    util_lib.check_positive_integer(steps, 'Eval steps')
+
+    return [
+        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
+            num_evals=steps),
+        _SetEvalIterationsHook(steps)
+    ]
+
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      In TPU mode, returns an input_fn to be called later in model_fn.
+      Otherwise, calls the input_fn and returns either fatures or
+        (features, labels).
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments or does not have `params`.
+    """
+    input_fn_args = function_utils.fn_args(input_fn)
+    config = self.config  # a deep copy.
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params  # a deep copy.
+    else:
+      raise ValueError('input_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params["batch_size"]'.format(input_fn))
+    if 'config' in input_fn_args:
+      kwargs['config'] = config
+
+    if 'mode' in input_fn_args:
+      kwargs['mode'] = mode
+
+    # Records the fact input_fn has been invoked.
+    self._is_input_fn_invoked = True
+
+    with self._ctx.with_mode(mode) as ctx:
+      # Setting the batch size in params first. This helps user to have same
+      # input_fn for use_tpu=True/False.
+      batch_size_for_input_fn = ctx.batch_size_for_input_fn
+      if batch_size_for_input_fn is not None:
+        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
+                            batch_size_for_input_fn)
+
+      # For export_savedmodel, input_fn is never passed to Estimator. So,
+      # `is_export_mode` must be False.
+      if ctx.is_running_on_cpu(is_export_mode=False):
+        with ops.device('/device:CPU:0'):
+          return input_fn(**kwargs)
+
+      # For TPU computation, input_fn should be invoked in a tf.while_loop for
+      # performance. While constructing the tf.while_loop, the structure of
+      # inputs returned by the `input_fn` needs to be recorded. The structure
+      # includes whether features or labels is dict or single Tensor, dict keys,
+      # tensor shapes, and dtypes. The recorded structure is used to create the
+      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
+      # inside the TPU computation, as the TPU computation is wrapped inside a
+      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
+      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
+      # `features` in `model_fn` signature.
+      def _input_fn(ctx):
+        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
+        return input_fn(**kwargs)
+
+      return _input_fn
+
+  def _validate_features_in_predict_input(self, result):
+    """Skip the validation.
+
+    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
+    has stronger check. Parent class's check generates confusing warning msg.
+
+    Args:
+      result: `features` returned by input_fn.
+    """
+    pass
+
+  def train(self,
+            input_fn,
+            hooks=None,
+            steps=None,
+            max_steps=None,
+            saving_listeners=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
+    try:
+      return super(TPUEstimator, self).train(
+          input_fn=input_fn,
+          hooks=hooks,
+          steps=steps,
+          max_steps=max_steps,
+          saving_listeners=saving_listeners)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('training_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('training_loop')
+      rendezvous.raise_errors()
+
+  def evaluate(self,
+               input_fn,
+               steps=None,
+               hooks=None,
+               checkpoint_path=None,
+               name=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
+    try:
+      return super(TPUEstimator, self).evaluate(
+          input_fn,
+          steps=steps,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('evaluation_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('evaluation_loop')
+      rendezvous.raise_errors()
+
+  def predict(self,
+              input_fn,
+              predict_keys=None,
+              hooks=None,
+              checkpoint_path=None,
+              yield_single_examples=True):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
+    try:
+      for result in super(TPUEstimator, self).predict(
+          input_fn=input_fn,
+          predict_keys=predict_keys,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          yield_single_examples=yield_single_examples):
+        yield result
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('prediction_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('prediction_loop')
+      rendezvous.raise_errors()
+
+    rendezvous.record_done('prediction_loop')
+    rendezvous.raise_errors()
+
+  def _augment_model_fn(self, model_fn, batch_axis):
+    """Returns a new model_fn, which wraps the TPU support."""
+
+    def _model_fn(features, labels, mode, config, params):
+      """A Estimator `model_fn` for TPUEstimator."""
+
+      # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
+      # but not in `export_savedmodel()`.
+      if self._is_input_fn_invoked:
+        is_export_mode = False
+      else:
+        is_export_mode = True
+
+      # Clear the bit.
+      self._is_input_fn_invoked = None
+
+      if is_export_mode:
+        if mode == _REWRITE_FOR_INFERENCE_MODE:
+          _add_item_to_params(params, _USE_TPU_KEY, True)
+          mode = model_fn_lib.ModeKeys.PREDICT
+        else:
+          _add_item_to_params(params, _USE_TPU_KEY, False)
+
+      with self._ctx.with_mode(mode) as ctx:
+        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
+
+        # examples_hook is added to training_hooks for both CPU and TPU
+        # execution.
+        if self._log_every_n_steps is not None:
+          examples_hook = ExamplesPerSecondHook(
+              ctx.global_batch_size,
+              # pylint:disable=g-long-ternary
+              output_dir=(self.model_dir
+                          if not config or config.save_summary_steps
+                          else None),
+              # pylint:enable=g-long-ternary
+              every_n_steps=self._log_every_n_steps)
+
+        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
+          logging.info('Running %s on CPU', mode)
+          estimator_spec = model_fn_wrapper.call_without_tpu(
+              features, labels, is_export_mode=is_export_mode)
+          if self._log_every_n_steps is not None:
+            estimator_spec = estimator_spec._replace(
+                training_hooks=estimator_spec.training_hooks + (examples_hook,))
+          return estimator_spec
+
+        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
+        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
+        assert callable(features), '`input_fn` is not callable.'
+        input_fn = features
+
+        tpu_init_ops = []
+        if ctx.embedding_config and mode == model_fn_lib.ModeKeys.TRAIN:
+          dummy_table_variables, dummy_table_variables_init = (
+              tpu_embedding_gradient.create_dummy_table_variables(
+                  ctx.embedding_config.tpu_embedding))
+          ctx.embedding_config.dummy_table_variables = dummy_table_variables
+          tpu_init_ops.append(dummy_table_variables_init)
+
+        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
+            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
+
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
+        if mode == model_fn_lib.ModeKeys.TRAIN:
+          compile_op, loss, host_call, scaffold, training_hooks = (
+              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          if ctx.embedding_config:
+            g = ops.get_default_graph()
+            table_to_config_dict = (
+                ctx.embedding_config.tpu_embedding.table_to_config_dict)
+            optimization_parameters = (
+                ctx.embedding_config.tpu_embedding.optimization_parameters)
+            embedding_variable_name_by_table, slot_variable_names_by_table = (
+                _tpu_estimator_embedding.get_full_variable_names(
+                    g, table_to_config_dict, optimization_parameters
+                )
+            )
+            embedding_variables_and_ops = (
+                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
+                    embedding_variable_name_by_table,
+                    slot_variable_names_by_table
+                ))
+            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
+
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
+
+          shutdown_hooks = []
+          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
+                                         'shutdown_worker')
+          if shutdown_mode:
+            if shutdown_mode == 'shutdown_worker':
+              finalizer_hooks = [
+                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
+              ]
+            elif shutdown_mode == 'shutdown_computation':
+              finalizer_hooks = [
+                  session_support.RestartComputation(timeout_ms=60 * 1000),
+              ]
+            else:
+              raise ValueError(
+                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
+
+            shutdown_hooks.append(
+                session_support.GracefulShutdownHook(
+                    checkpoint_prefix=self.model_dir + '/model.ckpt',
+                    on_shutdown_hooks=finalizer_hooks))
+
+          with ops.control_dependencies([loss]):
+            global_step = array_ops.identity(training.get_global_step())
+          hooks = input_hooks + shutdown_hooks
+          hooks.extend([
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops),
+              InstallSignalHandlerHook()
+          ])
+          if self._log_every_n_steps is not None:
+            logging_hook_frequency = (  # Divide and round up
+                (self._log_every_n_steps +
+                 self._config.tpu_config.iterations_per_loop - 1) //
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(
+                training.LoggingTensorHook({
+                    'loss': array_ops.identity(loss),
+                    'step': global_step,
+                },
+                                           every_n_iter=logging_hook_frequency))
+            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(examples_hook)
+
+          if training_hooks:
+            hooks.extend(training_hooks)
+
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            checkpoint_hook = training.CheckpointSaverHook(
+                self.model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=scaffold)
+            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            chief_hooks.append(checkpoint_hook)
+
+          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
+          with ops.control_dependencies([loss]):
+            update_ops = _sync_variables_ops(ctx)
+            if ctx.embedding_config:
+              update_ops.extend(embedding_variables_and_ops.retrieve_ops())
+
+          # Validate the TPU training graph to catch basic errors
+          _validate_tpu_training_graph()
+
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=loss,
+              training_chief_hooks=chief_hooks,
+              training_hooks=hooks,
+              train_op=train_op,
+              scaffold=scaffold)
+
+        if mode == model_fn_lib.ModeKeys.EVAL:
+          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
+              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          if ctx.embedding_config:
+            g = ops.get_default_graph()
+            table_to_config_dict = (
+                ctx.embedding_config.tpu_embedding.table_to_config_dict)
+            embedding_variable_name_by_table, _ = (
+                _tpu_estimator_embedding.get_full_variable_names(
+                    g, table_to_config_dict)
+            )
+            embedding_variables_and_ops = (
+                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
+                    embedding_variable_name_by_table
+                ))
+            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
+          iterations_per_loop_var = _create_or_get_iterations_per_loop()
+          mean_loss = math_ops.div(
+              total_loss,
+              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+
+          with ops.control_dependencies([mean_loss]):
+            # After TPU evaluation computation is done (the mean_loss tensor),
+            # reads all variables back from TPU and updates the eval step
+            # counter properly
+            internal_ops_to_run = _sync_variables_ops(ctx)
+            internal_ops_to_run.append(
+                _increase_eval_step_op(iterations_per_loop_var))
+
+          host_call_ret = host_calls.create_tpu_hostcall()
+          eval_metric_ops = {}
+          eval_update_ops = []
+
+          eval_metrics = host_call_ret.get('eval_metrics', {})
+          if eval_metrics:
+            # Creates a dummy metric update_op for all metrics. Estimator
+            # expects all metrics in `eval_metric_ops` have update_op and calls
+            # them one by one. The real metric update_ops are invoked in a
+            # separated thread. So, here give Estimator the dummy op for all
+            # metrics.
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+            for k, v in eval_metrics.items():
+              eval_metric_ops[k] = (v[0], dummy_update_op)
+              eval_update_ops.append(v[1])
+          else:
+            # If no eval metrics are passed, create an identity node for the
+            # loss and add `internal_ops_to_run` to its dependencies. So
+            # `internal_ops_to_run` can be executed.
+            with ops.control_dependencies(internal_ops_to_run):
+              mean_loss = array_ops.identity(mean_loss)
+
+          if 'host_call' not in host_call_ret:
+            host_ops = []
+          else:
+            host_ops = host_call_ret['host_call']
+          hooks = [
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  eval_update_ops + host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops)
+          ] + input_hooks
+
+          if eval_hooks:
+            hooks.extend(eval_hooks)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=mean_loss,
+              evaluation_hooks=hooks,
+              eval_metric_ops=eval_metric_ops,
+              scaffold=scaffold)
+
+        # Predict
+        assert mode == model_fn_lib.ModeKeys.PREDICT
+
+        (compile_op, dummy_predict_op, host_calls,
+         scaffold, prediction_hooks) = _predict_on_tpu_system(
+             ctx, model_fn_wrapper, dequeue_fn)
+        with ops.control_dependencies([dummy_predict_op]):
+          internal_ops_to_run = _sync_variables_ops(ctx)
+          with ops.control_dependencies(internal_ops_to_run):
+            dummy_predict_op = control_flow_ops.no_op()
+
+        # In train and evaluation, the main TPU program is passed to monitored
+        # training session to run. Infeed enqueue and outfeed dequeue are
+        # executed in side threads. This is not the configuration for
+        # prediction mode.
+        #
+        # For prediction, the Estimator executes the EstimatorSpec.predictions
+        # directly and yield the element (via generator) to call site. So, the
+        # outfeed based prediction must be passed to MonitoredSession directly.
+        # Other parts of the TPU execution are organized as follows.
+        #
+        # 1. All outfeed based Tensors must be grouped with predictions Tensors
+        #    to form a single invocation. This avoid the issue we might trigger
+        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
+        #    placed in control_dependencies of `stopping_signals`, and
+        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
+        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
+        #    all SessionRunArgs with the fetch in session.run together.
+        #
+        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
+        #    are grouped together. They will be launched once and only once in
+        #    side threads and they quit naturally according to the SAME stopping
+        #    condition.
+        enqueue_ops.append(dummy_predict_op)
+
+        host_call_ret = host_calls.create_tpu_hostcall()
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
+
+        predictions = host_call_ret['predictions']
+        _verify_cross_hosts_transfer_size(
+            predictions,
+            message=(
+                'The estimated size for TPUEstimatorSpec.predictions is too '
+                'large.'))
+        signals = host_call_ret['signals']
+
+        with ops.control_dependencies(host_ops):
+          host_ops = []  # Empty, we do do not need it anymore.
+          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
+              signals)
+          predictions = _PaddingSignals.slice_tensor_or_dict(
+              predictions, signals)
+
+        hooks = [
+            _StoppingPredictHook(scalar_stopping_signal),
+            TPUInfeedOutfeedSessionHookForPrediction(
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                tpu_compile_op=compile_op,
+                master=self._config.master,
+                session_config=self._session_config),
+        ] + input_hooks
+
+        if prediction_hooks:
+          hooks.extend(prediction_hooks)
+
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            prediction_hooks=hooks,
+            predictions=predictions,
+            scaffold=scaffold)
+
+    return _model_fn
+
+
+def _export_output_to_tensors(export_output):
+  """Get a list of `Tensors` used in `export_output`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+
+  Returns:
+    a list of tensors used in export_output.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    return [export_output.scores, export_output.classes]
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    return [export_output.value]
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return list(export_output.outputs.values())
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _clone_export_output_with_tensors(export_output, tensors):
+  """Clones `export_output` but with new `tensors`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+    tensors: a list of `Tensors` used to construct a new `export_output`.
+
+  Returns:
+    A dict similar to `export_output` but with `tensors`.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    if len(tensors) != 2:
+      raise ValueError('tensors must be of length 2; '
+                       'got {}.'.format(len(tensors)))
+    return export_output_lib.ClassificationOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    if len(tensors) != 1:
+      raise ValueError('tensors must be of length 1; '
+                       'got {}'.format(len(tensors)))
+    return export_output_lib.RegressionOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return export_output_lib.PredictOutput(
+        dict(zip(export_output.outputs.keys(), tensors)))
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_eval_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
+                                [_ZERO_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
+
+
+def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_train_step, host_call, captured_scaffold_fn,
+   captured_training_hooks) = (
+       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_train_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
+                                [_INITIAL_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
+
+
+def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
+   captured_predict_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_predict_steps_on_single_shard():
+
+    def cond(scalar_stopping_signal):
+      return math_ops.logical_not(
+          _StopSignals.should_stop(scalar_stopping_signal))
+
+    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
+    outputs = training_loop.while_loop(
+        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
+    return outputs
+
+  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
+      multi_tpu_predict_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  dummy_predict_op = dummy_predict_op[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return (compile_op, dummy_predict_op, host_calls, scaffold,
+          captured_predict_hooks.get())
+
+
+def _wrap_computation_in_while_loop(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def computation(i):
+    with ops.control_dependencies(op_fn()):
+      return i + 1
+
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    iterations = array_ops.identity(iterations_per_loop_var)
+    return control_flow_ops.while_loop(
+        lambda i: i < iterations,
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
+
+
+def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def cond(scalar_stopping_signal):
+    return math_ops.logical_not(
+        _StopSignals.should_stop(scalar_stopping_signal))
+
+  def computation(unused_scalar_stopping_signal):
+    return_value = op_fn()
+    execute_ops = return_value['ops']
+    signals = return_value['signals']
+    with ops.control_dependencies(execute_ops):
+      return _StopSignals.as_scalar_stopping_signal(signals)
+
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    return control_flow_ops.while_loop(
+        cond,
+        computation, [_StopSignals.NON_STOPPING_SIGNAL],
+        parallel_iterations=1)
+
+
+def _validate_tpu_training_graph():
+  """Validate graph before running distributed training.
+
+  Raises:
+    ValueError: If the graph seems invalid for running on device
+  """
+  operations = ops.get_default_graph().get_operations()
+
+  # Check if there is atleast one CrossReplicaSum operation in the graph
+  # This should be introduced by using the CrossShardOptimizer wrapper
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
+  if not cross_replica_sum_ops:
+    raise ValueError(
+        'CrossShardOptimizer must be used for model training on TPUs.')
+
+
+class _CapturedObject(object):
+  """A placeholder to capture an object.
+
+  This is useful when we need to capture a Python object in the Tensorflow
+  control flow body function and use it outside the control flow.
+  """
+
+  def __init__(self):
+    self._object = None
+    self._captured = False
+
+  def capture(self, o):
+    if self._captured:
+      raise RuntimeError(
+          'InternalError: Object can capture only once. Please file bug.')
+
+    self._captured = True
+    self._object = o
+
+  def get(self):
+    if not self._captured:
+      raise RuntimeError(
+          'InternalError: Object is not captured properly before `get`. '
+          'Please file bug.')
+    return self._object
+
+
+def _get_scaffold(captured_scaffold_fn):
+  """Retrieves the Scaffold from `captured_scaffold_fn`."""
+  with _CapturingContext(message='Inside scaffold_fn'):
+    scaffold_fn = captured_scaffold_fn.get()
+    if scaffold_fn:
+      scaffold = scaffold_fn()
+      if scaffold is None:
+        raise ValueError(
+            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
+    else:
+      scaffold = None
+
+  if scaffold:
+    wrapped_finalize = scaffold.finalize
+
+    def _finalize():
+      with _CapturingContext('Inside Scaffold.finalize'):
+        wrapped_finalize()
+
+    scaffold.finalize = _finalize
+  return scaffold
+
+
+class _CapturingContext(control_flow_ops.ControlFlowContext):
+  """Tracks references to Tensors defined in TPU replication."""
+
+  def __init__(self, message):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._message = message
+
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    # pylint: disable=useless-super-delegation
+    # NOTE(slebedev): the method is required by `ControlFlowContext`.
+    super(_CapturingContext, self).to_control_flow_context_def(
+        context_def, export_scope)
+
+  def AddOp(self, op):  # pylint: disable=invalid-name
+    for c in op.inputs:
+      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
+
+  def __enter__(self):
+    # pylint: disable=protected-access
+    self._g = ops.get_default_graph()
+    self._old = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
+    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None, signals=None):
+    if dataset is not None and (features is not None or labels is not None or
+                                signals is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+    self._signals = signals
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.DatasetV2):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    features, labels = _Inputs._parse_inputs(return_values)
+    return _Inputs(features, labels)
+
+  @staticmethod
+  def _parse_inputs(return_values):
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return features, labels
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer(self):
+    """Returns the dataset's initializer.
+
+    The initializer must be run before calling `features_and_labels`.
+    """
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return self._iterator.initializer
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must run dataset_initializer '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
+      return _Inputs._parse_inputs(self._iterator.get_next())
+
+    return (self._features, self._labels)
+
+  def signals(self):
+    return self._signals
+
+  @property
+  def dataset(self):
+    return self._dataset
+
+
+class _InputsWithStoppingSignals(_Inputs):
+  """Inputs with `_StopSignals` inserted into the dataset."""
+
+  def __init__(self,
+               dataset,
+               batch_size,
+               add_padding=False,
+               num_invocations_per_step=1):
+
+    assert dataset is not None
+    user_provided_dataset = dataset.map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=False, batch_size=batch_size, add_padding=add_padding))
+    if num_invocations_per_step == 1:
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+    else:
+      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
+      # user_provided_dataset and stop properly.
+      # For example, if num_invocations_per_step is 2, we append 3 additional
+      # padding batches: b1, b2, b3.
+      # If user_provided_dataset contains two batches: a1, a2
+      # Step 1: [a1, a2]
+      # Step 2: [b1, b2] -> STOP
+      # If user_provided_dataset contains three batches: a1, a2, a3.
+      # The training loops:
+      # Step 1: [a1, a2]
+      # Step 2: [a3, b1]
+      # Step 3: [b2, b3] -> STOP.
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+      final_batch_dataset = final_batch_dataset.repeat(
+          2 * num_invocations_per_step - 1)
+
+      def _set_mask(data_dict):
+        signals = data_dict['signals']
+        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
+        data_dict['signals'] = signals
+        return data_dict
+
+      # Mask out the extra batch.
+      final_batch_dataset = final_batch_dataset.map(_set_mask)
+
+    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
+
+    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
+    self._current_inputs = None
+
+  def features_and_labels(self):
+    if self._current_inputs is not None:
+      raise RuntimeError(
+          'Internal Error: The previous inputs have not been properly '
+          'consumed. First call features_and_labels, then call signals.')
+
+    inputs_with_signals = self._iterator.get_next()
+    features = inputs_with_signals['features']
+    labels = inputs_with_signals.get('labels')
+
+    self._current_inputs = inputs_with_signals
+    return features, labels
+
+  def signals(self):
+    """Returns the `Signals` from `_Inputs`."""
+    if self._current_inputs is None:
+      raise RuntimeError(
+          'Internal Error: The current inputs have not been properly '
+          'generated. First call features_and_labels, then call signals.')
+    signals = self._current_inputs['signals']
+    self._current_inputs = None
+    return signals
+
+  @staticmethod
+  def insert_stopping_signal(stop, batch_size, add_padding=False):
+    """Inserts stopping_signal into dataset via _map_fn.
+
+    Here we change the data structure in the dataset, such that the return value
+    is a dictionary now and `features`, `labels`, and `signals` are three
+    distinguished keys in that dict. This provides a better structure, which
+    eases the process to decompose the inputs (see `features_and_labels`).
+
+    Args:
+      stop: bool, state of current stopping signals.
+      batch_size: int, batch size.
+      add_padding: bool, whether to pad the tensor to full batch size.
+
+    Returns:
+      A map_fn passed to dataset.map API.
+    """
+
+    def _map_fn(*args):
+      """The map fn to insert signals."""
+      if len(args) == 1:
+        # Unpack the single Tensor/dict argument as features. This is required
+        # for the input_fn returns no labels.
+        args = args[0]
+      features, labels = _Inputs._parse_inputs(args)
+      new_input_dict = {}
+
+      if add_padding:
+        padding_mask, features, labels = (
+            _PaddingSignals.pad_features_and_labels(features, labels,
+                                                    batch_size))
+
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+
+      else:
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+        padding_mask = None
+
+      new_input_dict['signals'] = _StopSignals(
+          stop=stop, batch_size=batch_size,
+          padding_mask=padding_mask).as_dict()
+
+      return new_input_dict
+
+    return _map_fn
+
+
+class _StopSignals(object):
+  """Signals class holding all logic to handle TPU stopping condition."""
+
+  NON_STOPPING_SIGNAL = False
+  STOPPING_SIGNAL = True
+
+  def __init__(self, stop, batch_size, padding_mask=None):
+    self._stop = stop
+    self._batch_size = batch_size
+    self._padding_mask = padding_mask
+
+  def as_dict(self):
+    """Returns the signals as Python dict."""
+    shape = [self._batch_size, 1]
+    dtype = dtypes.bool
+
+    if self._stop:
+      stopping = array_ops.ones(shape=shape, dtype=dtype)
+    else:
+      stopping = array_ops.zeros(shape=shape, dtype=dtype)
+
+    signals = {'stopping': stopping}
+    if self._padding_mask is not None:
+      signals['padding_mask'] = self._padding_mask
+    return signals
+
+  @staticmethod
+  def as_scalar_stopping_signal(signals):
+    return array_ops.identity(signals['stopping'][0][0])
+
+  @staticmethod
+  def should_stop(scalar_stopping_signal):
+    """Detects whether scalar_stopping_signal indicates stopping."""
+    if isinstance(scalar_stopping_signal, ops.Tensor):
+      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
+      # way to express the bool check whether scalar_stopping_signal is True.
+      return math_ops.logical_and(scalar_stopping_signal,
+                                  _StopSignals.STOPPING_SIGNAL)
+    else:
+      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
+      # the graph anymore. Here, we use pure Python.
+      return bool(scalar_stopping_signal)
+
+
+class _PaddingSignals(object):
+  """Signals class holding all logic to handle padding."""
+
+  @staticmethod
+  def pad_features_and_labels(features, labels, batch_size):
+    """Pads out the batch dimension of features and labels."""
+    real_batch_size = array_ops.shape(
+        _PaddingSignals._find_any_tensor(features))[0]
+
+    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
+
+    check_greater = check_ops.assert_greater_equal(
+        batch_size_tensor,
+        real_batch_size,
+        data=(batch_size_tensor, real_batch_size),
+        message='The real batch size should not be greater than batch_size.')
+
+    with ops.control_dependencies([check_greater]):
+      missing_count = batch_size_tensor - real_batch_size
+
+    def pad_single_tensor(tensor):
+      """Pads out the batch dimension of a tensor to the complete batch_size."""
+      rank = len(tensor.shape)
+      assert rank > 0
+      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
+      padded_tensor = array_ops.pad(tensor, padding)
+      padded_tensor.set_shape(padded_shape)
+      return padded_tensor
+
+    def nest_pad(tensor_or_dict):
+      return nest.map_structure(pad_single_tensor, tensor_or_dict)
+
+    features = nest_pad(features)
+    if labels is not None:
+      labels = nest_pad(labels)
+
+    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
+                                                 batch_size)
+
+    return padding_mask, features, labels
+
+  @staticmethod
+  def slice_tensor_or_dict(tensor_or_dict, signals):
+    """Slice the real Tensors according to padding mask in signals."""
+
+    padding_mask = signals['padding_mask']
+    batch_size = array_ops.shape(padding_mask)[0]
+
+    def verify_batch_size(tensor):
+      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
+      with ops.control_dependencies([check_batch_size]):
+        return array_ops.identity(tensor)
+
+    def slice_single_tensor(tensor):
+      rank = len(tensor.shape)
+      assert rank > 0
+      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
+      return verify_batch_size(tensor)[0:real_batch_size]
+
+    # As we split the Tensors to all TPU cores and concat them back, it is
+    # important to ensure the real data is placed before padded ones, i.e.,
+    # order is preserved. By that, the sliced padding mask should have all 0's.
+    # If this assertion failed, # the slice logic here would not hold.
+    sliced_padding_mask = slice_single_tensor(padding_mask)
+    assert_padding_mask = math_ops.equal(
+        math_ops.reduce_sum(sliced_padding_mask), 0)
+
+    with ops.control_dependencies([assert_padding_mask]):
+      should_stop = _StopSignals.should_stop(
+          _StopSignals.as_scalar_stopping_signal(signals))
+
+    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
+
+    def slice_fn(tensor):
+      # If the current batch is full batch or part of stopping signals, we do
+      # not need to slice to save performance.
+      return control_flow_ops.cond(
+          math_ops.logical_or(should_stop, is_full_batch),
+          (lambda: verify_batch_size(tensor)),
+          (lambda: slice_single_tensor(tensor)))
+
+    return nest.map_structure(slice_fn, tensor_or_dict)
+
+  @staticmethod
+  def _find_any_tensor(batch_features):
+    tensors = [
+        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
+    ]
+    if not tensors:
+      raise ValueError('Cannot find any Tensor in features dict.')
+    return tensors[0]
+
+  @staticmethod
+  def _padding_mask(real_batch_size, missing_count, batch_size):
+    padding_mask = array_ops.concat([
+        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+        array_ops.ones((missing_count,), dtype=dtypes.int32)
+    ],
+                                    axis=0)
+    padding_mask.set_shape((batch_size,))
+    return padding_mask
+
+
+def _verify_cross_hosts_transfer_size(tensor_dict, message):
+  total_size = 0
+  tensor_structure = {}
+  for key, tensor in tensor_dict.items():
+    shape = tensor.shape
+    size = np.product(shape) * tensor.dtype.size
+    tensor_structure[key] = shape
+    total_size += size
+  if total_size >= _ONE_GIGABYTE:
+    raise ValueError(
+        '{} The transfer size is larger than the protobuf limit. Please '
+        'consider to use Tensors with smaller shapes or reduce batch '
+        'size. Given:\n'
+        '{}'.format(
+            message, '\n'.join([
+                ' -- Key: {}, Shape: {}'.format(k, v)
+                for k, v in tensor_structure.items()
+            ])))
+
+
+def _add_item_to_params(params, key, value):
+  """Adds a new item into `params`."""
+  if hasattr(params, 'set_hparam'):
+    # For HParams, we need to use special API.
+    if key in params:
+      params.set_hparam(key, value)
+    else:
+      params.add_hparam(key, value)
+  else:
+    # Now params is Python dict.
+    params[key] = value
+
+
+def export_estimator_savedmodel(estimator,
+                                export_dir_base,
+                                serving_input_receiver_fn,
+                                assets_extra=None,
+                                as_text=False,
+                                checkpoint_path=None,
+                                strip_default_attrs=False):
+  """Export `Estimator` trained model for TPU inference.
+
+  Args:
+    estimator: `Estimator` with which model has been trained.
+    export_dir_base: A string containing a directory in which to create
+      timestamped subdirectories containing exported SavedModels.
+    serving_input_receiver_fn: A function that takes no argument and returns a
+      `ServingInputReceiver` or `TensorServingInputReceiver`.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel, or `None` if no extra assets are needed.
+    as_text: whether to write the SavedModel proto in text format.
+    checkpoint_path: The checkpoint path to export.  If `None` (the default),
+      the most recent checkpoint found within the model directory is chosen.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs.
+
+  Returns:
+    The string path to the exported directory.
+  """
+  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
+  # `estimator.config`.
+  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
+  est = TPUEstimator(
+      estimator._model_fn,  # pylint: disable=protected-access
+      config=config,
+      params=estimator.params,
+      use_tpu=True,
+      train_batch_size=2048,  # Does not matter.
+      eval_batch_size=2048,  # Does not matter.
+  )
+  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                               assets_extra, as_text, checkpoint_path,
+                               strip_default_attrs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/python/tpu/tpu_estimator_signals_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
rename to tensorflow/python/tpu/tpu_estimator_signals_test.py
index e3ea983abfd24d03c964fbc647b56262e15e0a96..ca3eeaa9c9ace9bdbf6a3c6efa8b84eeecc7a60f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/python/tpu/tpu_estimator_signals_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_estimator
 
 
 def make_input_fn(num_samples):
diff --git a/tensorflow/python/tpu/tpu_feed.py b/tensorflow/python/tpu/tpu_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1adc80e6015a8418bd3ea470d1a17561eb542b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_feed.py
@@ -0,0 +1,919 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Helper library for handling infeed between hosts and TPUs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_sharding
+from tensorflow.python.tpu.ops import tpu_ops
+
+from tensorflow.python.util import nest
+
+
+def partition_or_replicate_on_host(tensor, dims):
+  """Partitions or replicates the input tensor.
+
+    The ops inside this function are placed on the host side.
+
+  Args:
+    tensor: The input tensor which will be partioned or replicated.
+    dims: A list of integer describes how to partition the input tensor.
+
+  Returns:
+    An iterator of `Tensor`s or a list of partioned tensors.
+  """
+  if dims is None:
+    return itertools.repeat(tensor)
+  dims = np.array(dims)
+  output = [tensor]
+  shape_list = np.array(tensor.shape.as_list())
+  quotients, remainders = np.divmod(shape_list, dims)
+  for axis, (quotient, remainder, dim, original_size) in enumerate(
+      zip(quotients, remainders, dims, shape_list)):
+    if dim <= 1:
+      continue
+    if remainder > 0:
+      # For each dimension, when it cannot be evenly partitioned, XLA assumes
+      # tensors are partitioned in a greedy manner by using
+      # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
+      # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
+      # [[(3, 4), (3, 4), (2, 4), (2, 2)],
+      # [(2, 4), (2, 4), (2, 4), (2, 2)]]
+      ceil_ratio = quotient + 1
+      num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
+      num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
+      if len(num_or_size_splits) < dim:
+        num_or_size_splits += [0] * (dim - len(num_or_size_splits))
+      new_output = []
+      for x in output:
+        new_output.append(
+            array_ops.split(
+                x, num_or_size_splits=num_or_size_splits, axis=axis))
+      output = new_output
+    else:
+      output = [array_ops.split(x, dim, axis=axis) for x in output]
+    output = nest.flatten(output)
+  return output
+
+
+def _tag_sharding_attribute_for_dequeued_tensor(tensor, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensor.
+
+  Args:
+    tensor: The dequeued tensor on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same tensor with the xla_sharding attribute.
+  """
+  if dims is None:
+    return xla_sharding.replicate(tensor)
+  elif np.prod(dims) == 1:
+    return xla_sharding.assign_device(tensor, 0)
+  else:
+    tile_assignment = np.arange(np.prod(dims)).reshape(dims)
+    return xla_sharding.tile(tensor=tensor, tile_assignment=tile_assignment)
+
+
+def tag_sharding_attribute_for_dequeued_tensors(dequeues, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensors.
+
+  Args:
+    dequeues: A list of dequeued tensors on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same dequeues with appropriate xla_sharding attribute.
+  """
+  nest.assert_shallow_structure(dequeues, dims)
+  return nest.map_structure_up_to(
+      dequeues, _tag_sharding_attribute_for_dequeued_tensor, dequeues, dims)
+
+
+class InfeedQueue(object):
+  """A helper object to build a device infeed queue.
+
+  The InfeedQueue builds the host-side and device-side Ops to enqueue and
+  dequeue elements, respectively, and ensures that their types and
+  shapes match.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               shard_dimensions=None,
+               name=None):
+    """Creates a new InfeedQueue with the given configuration.
+
+    The configuration need not be fully specified at creation since it
+    can be modified subsequently by methods that set the values
+    explicitly or infer them from the shapes of inputs.
+
+    Args:
+      number_of_tuple_elements: the number of Tensors fed atomically through the
+        queue, must be present unless it can be inferred from other arguments.
+      tuple_types: if not None, a list of types of the elements of the queue.
+      tuple_shapes: if not None, a list of shapes of the elements of the queue.
+      shard_dimensions: if not None, a list of dimensions on which the
+        elements of the queue should be sharded during automatic
+        parallelization.
+      name: the name of the queue.
+
+    Raises:
+      ValueError: if number_of_tuple_elements <= 0; or
+        number_of_tuple_arguments, tuple_types, tuple_shapes, and
+        shard_dimensions are all None; or the length of tuple_types,
+        tuple_shapes, or shard_dimensions is not equal to
+        number_of_tuple_elements; or any element of shard_dimensions
+        can't be converted to a Dimension.
+      TypeError: if any element of tuple_types or tuple_shapes can't
+        be converted to a dtype or TensorShape, respectively.
+    """
+    self._frozen = False
+    self._generated_enqueue_ops = False
+    self._generated_dequeue_op = False
+    self._name = "InfeedQueue" if name is None else name
+    if number_of_tuple_elements is None:
+      if tuple_types is not None:
+        number_of_tuple_elements = len(tuple_types)
+      elif tuple_shapes is not None:
+        number_of_tuple_elements = len(tuple_shapes)
+      elif shard_dimensions is not None:
+        number_of_tuple_elements = len(shard_dimensions)
+      else:
+        raise ValueError(
+            "number of tuple elements cannot be inferred from InfeedQueue "
+            "constructor")
+    if number_of_tuple_elements <= 0:
+      raise ValueError("number_of_tuple_elements %d must be > 0" %
+                       number_of_tuple_elements)
+    # Make an empty sharding policy for each tuple element.
+    self._sharding_policies = [
+        tpu_sharding.ShardingPolicy()
+        for _ in xrange(number_of_tuple_elements)
+    ]
+    if tuple_types is not None:
+      self.set_tuple_types(tuple_types)
+    else:
+      self._tuple_types = None
+    if tuple_shapes is not None:
+      self.set_tuple_shapes(tuple_shapes)
+    else:
+      self._tuple_shapes = None
+    if shard_dimensions is not None:
+      self.set_shard_dimensions(shard_dimensions)
+    self._validate()
+
+  def _validate(self):
+    """Checks that the configuration is self-consistent.
+
+    Raises:
+      ValueError: if the shapes and sharding policies don't match.
+    """
+    if self.tuple_shapes is not None:
+      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
+        # Raise an error if the policy is incompatible with the shape.
+        _ = policy.get_sharded_shape(shape)
+
+  @property
+  def number_of_tuple_elements(self):
+    """Returns the number of InfeedQueue tuple elements."""
+    return len(self._sharding_policies)
+
+  @property
+  def tuple_types(self):
+    """Returns the types of the InfeedQueue tuple elements."""
+    return self._tuple_types
+
+  def set_tuple_types(self, tuple_types):
+    """Sets the type of each element of the queue.
+
+    tuple_types must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a dtype.
+
+    Args:
+      tuple_types: the types of each queue element.
+
+    Raises:
+      ValueError: if tuple_types is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_types cannot be converted to a
+        dtype.
+    """
+    if len(tuple_types) != self.number_of_tuple_elements:
+      raise ValueError("tuple_types is %s, but must be a list of length %d" %
+                       (str(tuple_types), self.number_of_tuple_elements))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_types, tuple_types):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible type. Frozen types are %s, updated types are %s" % (
+                  str(self._tuple_types), str(tuple_types)))
+    else:
+      try:
+        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
+      except (TypeError) as e:
+        raise TypeError(
+            "tuple_types is %s, but must be a list of elements each "
+            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
+
+  @property
+  def tuple_shapes(self):
+    """Returns the shapes of the InfeedQueue tuple elements."""
+    return self._tuple_shapes
+
+  def set_tuple_shapes(self, tuple_shapes):
+    """Sets the shape of each element of the queue.
+
+    tuple_shapes must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a TensorShape.
+
+    Args:
+      tuple_shapes: the shapes of each queue element.
+
+    Raises:
+      ValueError: if tuple_shapes is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_shapes cannot be converted to
+        a TensorShape.
+    """
+    if len(tuple_shapes) != self.number_of_tuple_elements:
+      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
+                       (str(tuple_shapes), self.number_of_tuple_elements))
+    try:
+      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
+    except (ValueError, TypeError) as e:
+      raise TypeError(
+          "tuple_shapes is %s, but must be a list of elements each "
+          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
+                                                        str(e)))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
+              % (str(self._tuple_shapes), str(tuple_shapes)))
+    else:
+      self._tuple_shapes = tuple_shapes
+    self._validate()
+
+  @property
+  def sharding_policies(self):
+    """Returns the sharding policies of the InfeedQueue tuple elements."""
+    return self._sharding_policies
+
+  @property
+  def shard_dimensions(self):
+    """Gets the shard dimension of each tuple element.
+
+    Returns:
+      A list of length number_of_tuple_elements, where each list entry
+      is the shard dimension of that tuple element or None if the
+      shard dimension has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return [policy.shard_dimension for policy in self._sharding_policies]
+
+  def set_shard_dimensions(self, shard_dimensions):
+    """Sets the shard_dimension of each element of the queue.
+
+    shard_dimensions must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a Dimension compatible with self.tuple_shapes.
+
+    Args:
+      shard_dimensions: the dimensions of each queue element.
+
+    Raises:
+      ValueError: if shard_dimensions is not of length
+        self.number_of_tuple_elements; or an element of
+        shard_dimensions cannot be converted to a Dimension; or an
+        element of shard_dimensions is a Dimension that is out of
+        range for the corresponding tuple element shape.
+    """
+    if len(shard_dimensions) != self.number_of_tuple_elements:
+      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
+                       % (str(shard_dimensions),
+                          self.number_of_tuple_elements))
+    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
+      policy.set_shard_dimension(dimension)
+    self._validate()
+
+  @property
+  def number_of_shards(self):
+    """Gets the number of shards to use for the InfeedQueue.
+
+    Returns:
+      Number of shards or None if the number of shards has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return self._sharding_policies[0].number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards to use for the InfeedQueue.
+
+    Args:
+      number_of_shards: number of ways to shard the InfeedQueue.
+
+    Raises:
+      ValueError: if number_of_shards is not > 0; or the policies have
+        been frozen and number_of_shards was already set to something
+        else.
+    """
+    for policy in self._sharding_policies:
+      policy.set_number_of_shards(number_of_shards)
+    self._validate()
+
+  def set_configuration_from_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of Tensors whose types and shapes are used
+    to set the queue configuration.
+
+    Args:
+      input_tensors: list of Tensors of the same types and shapes as
+        the desired queue Tuple.
+
+    Raises:
+      ValueError: if input_tensors is not a list of length
+        self.number_of_tuple_elements
+    """
+    if len(input_tensors) != self.number_of_tuple_elements:
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
+    self.set_tuple_shapes([t.shape for t in input_tensors])
+    self.set_tuple_types([t.dtype for t in input_tensors])
+
+  def set_configuration_from_sharded_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of lists of Tensors whose types and shapes are used
+    to set the queue configuration. The length of the outer list is the number
+    of shards required, and each inner list is the tuple of Tensors to use to
+    determine the types and shapes of the corresponding shard. This method
+    depends on the shard dimension, and calling it freezes the shard policy.
+
+    Args:
+      input_tensors: list of lists of Tensors. The outer list length corresponds
+        to the desired number of shards, and each inner list is the size
+        and shape of the desired configuration of the corresponding shard.
+
+    Raises:
+      ValueError: if any inner list is not a list of length
+        self.number_of_tuple_elements; or the inner lists do not combine to
+        form a consistent unsharded shape.
+      TypeError: if the types of the Tensors in the inner lists do not match.
+    """
+    if not self._frozen:
+      # Unset the tuple shapes in case the configuration becomes
+      # transiently inconsistent.
+      self._tuple_shapes = None
+    number_of_shards = len(input_tensors)
+    self.set_number_of_shards(number_of_shards)
+    for t in input_tensors:
+      if len(t) != self.number_of_tuple_elements:
+        raise ValueError(
+            "input_tensors is %s but must be a list of lists, where each inner"
+            " list has length number_of_tuple_elements=%d" % (
+                str(input_tensors), self.number_of_tuple_elements))
+    # Transpose the inputs to make a list of shard shapes for each tuple
+    # element.
+    sharded_shapes = [[t[i].shape for t in input_tensors]
+                      for i in xrange(self.number_of_tuple_elements)]
+    # For each tuple, get the unsharded shape using that tuple's policy.
+    unsharded_shapes = [
+        policy.get_unsharded_shape(s)
+        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
+    ]
+    self.set_tuple_shapes(unsharded_shapes)
+    for i in xrange(1, self.number_of_shards):
+      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
+        if t1.dtype != t2.dtype:
+          raise TypeError(
+              "types of the tuple elements of input_tensors %s are not "
+              "consistent" % str(input_tensors))
+    self.set_tuple_types([t.dtype for t in input_tensors[0]])
+
+  def freeze(self):
+    """Freezes the InfeedQueue so it can no longer be modified.
+
+    The configuration is implicitly frozen before any host-side or
+    device-side Ops are generated. The configuration cannot be frozen
+    until the types and shapes of the tuple elements have been set.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set.
+    """
+    self._frozen = True
+    if self._tuple_types is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple types.")
+    if self._tuple_shapes is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for shape in self._tuple_shapes:
+      if shape.dims is None:
+        raise ValueError(
+            "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for policy in self._sharding_policies:
+      policy.freeze()
+    self._validate()
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generates the device-side Op to dequeue a tuple from the queue.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen, which will raise errors if the shapes and types have not
+    been fully specified.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed. If None, no explicit placement will be performed, and it is up
+        to the user to call this API from within a proper TPU device scope.
+        The XLA code will fail if the TPU dequeue instruction is not bound to
+        any device.
+
+    Returns:
+      A list of Outputs corresponding to a shard of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    if tpu_device is not None:
+      with ops.device(tpu.core(tpu_device)):
+        return tpu_ops.infeed_dequeue_tuple(
+            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    else:
+      return tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+
+  def _generate_enqueue_op(self,
+                           inputs,
+                           name_prefix,
+                           index,
+                           device=None,
+                           tpu_ordinal=-1):
+    """Generate a host-side Op to enqueue a tuple to the queue.
+
+    If device is None the inputs are all required to have the same
+    device specification, and the enqueue Op is colocated with
+    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
+
+    Args:
+      inputs: a list of Tensors with the types and shapes of the tuple elements.
+      name_prefix: the base name for the Op.
+      index: the shard index, used to uniquify the Op name.
+      device: device to place the Op on, or None if it should be
+        colocated with the inputs.
+      tpu_ordinal: ordinal of the TPU device on the host to use for
+      infeed if device is a CPU device. Should be set to -1 if device
+      is a TPU device.
+
+    Returns:
+      An Op corresponding to a shard of infeed enqueued at the host,
+      suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if device is None and inputs do not all have the
+        same device specification.
+    """
+    full_name = "%s/%d" % (name_prefix, index)
+    shapes = [t.shape for t in inputs]
+    if device is None:
+      devices = [t.device for t in inputs]
+      for i in xrange(1, self.number_of_tuple_elements):
+        if devices[0] != devices[i]:
+          raise ValueError(
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
+      with ops.colocate_with(inputs[0]):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+    else:
+      with ops.device(device):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+
+  def generate_enqueue_ops(self,
+                           sharded_inputs,
+                           tpu_ordinal_function=None,
+                           placement_function=None):
+    """Generates the host-side Ops to enqueue the shards of a tuple.
+
+    sharded_inputs is a list, one for each shard, of lists of
+    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
+    shard 0 if the queue. Returns the host-side Ops that must be run to
+    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
+    for shard i.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of sharded_inputs, an error
+    will be raised.
+
+    Args:
+      sharded_inputs: a list of lists of Tensors. The length of the outer list
+        determines the number of shards. Each inner list indicates the types
+        and shapes of the tuples in the corresponding shard.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. tpu_ordinal_function must be
+        set if the inputs are placed on CPU devices.
+      placement_function: if not None, a function that takes the shard index as
+        input and returns the host device where the enqueue op should be placed
+        on.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    if tpu_ordinal_function is None:
+      tpu_ordinal_function = lambda index: -1
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            tpu_ordinal=tpu_ordinal_function(index),
+            device=placement_function(index) if placement_function else None)
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+  # TODO(misard) Generalize this to the case of systems that don't
+  # have 8 devices per host, and figure out what to do with
+  # model-parallelism.
+  def _default_placement_function(self, index):
+    return "/task:%d/device:CPU:0" % (index / 8)
+
+  def _default_ordinal_function(self, index):
+    return index % 8
+
+  # TODO(b/36470756) remove this from tutorials once we have a better story
+  # for automatic placement of input pipelines.
+  def split_inputs_and_generate_enqueue_ops(self,
+                                            inputs,
+                                            device_assignment=None,
+                                            placement_function=None,
+                                            tpu_ordinal_function=None):
+    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
+
+    Generates the host-side Ops to enqueue a tuple.
+
+    This method performs poorly because it takes an entire input on a single
+    host, splits it, and distributes it to all of the cores. It is present only
+    to simplify tutorial examples.
+
+    inputs is a list of Tensors to use to feed the queue. Each input is split
+    into self.number_of_shards shards. Returns an Op for each shard to enqueue
+    the shard. The Op for shard i is placed on device placement_function(i).
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of inputs, an error
+    will be raised.
+
+    Args:
+      inputs: a list of Tensors which indicates the types and shapes of the
+        queue tuple.
+     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
+        device_assignment is not `None`, but `placement_function` and
+        `ordinal_function` are None, then `device_assignment` will be used to
+        place infeeds on the first k TPU shards, where k is the number of shards
+        in the queue. If all three are `None`, then default placement and
+        ordinal functions are used.
+      placement_function: if not None, a function that takes the shard
+        index as input and returns a device string indicating which
+        device the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of inputs are not compatible with the frozen
+        configuration.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of inputs are not compatible with the frozen
+        configuration.
+    """
+    if device_assignment is None:
+      if placement_function is None:
+        placement_function = self._default_placement_function
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = self._default_ordinal_function
+    else:
+
+      def _placement_function_from_map(index):
+        return device_assignment.host_device(replica=index)
+
+      def _ordinal_function_from_map(index):
+        return device_assignment.tpu_ordinal(replica=index)
+
+      if placement_function is None:
+        placement_function = _placement_function_from_map
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = _ordinal_function_from_map
+    self.set_configuration_from_input_tensors(inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    split_name_prefix = "%s/split" % self._name
+    if self.number_of_shards == 1:
+      transposed_sharded_inputs = [[inp] for inp in inputs]
+    else:
+
+      def split_fn(inp, num_shards, axis, name):
+        with ops.colocate_with(inp):
+          return array_ops.split(inp, num_shards, axis=axis, name=name)
+
+      transposed_sharded_inputs = [
+          split_fn(
+              inp,
+              self.number_of_shards,
+              axis=policy.shard_dimension,
+              name="%s/%d" % (split_name_prefix, index))
+          for (inp, policy, index) in zip(inputs, self._sharding_policies,
+                                          xrange(self.number_of_tuple_elements))
+      ]
+    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
+                      for i in xrange(self.number_of_shards)]
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            device=placement_function(index),
+            tpu_ordinal=tpu_ordinal_function(index))
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+
+class _PartitionedInfeedQueue(InfeedQueue):
+  """A helper object to build a device infeed queue with input partition.
+
+  Args:
+    number_of_tuple_elements: the number of Tensors fed atomically through the
+      queue, must be present unless it can be inferred from other arguments.
+    device_assignment: A TPU `DeviceAssignment` which is used to place all the
+      partitions to different TPU infeed queues.
+    host_id: The id of the host machine.
+    input_partition_dims: A nested list/tuple of integers. Each inner
+      list/tuple describes how to partition the corresponding input tensor.
+    tuple_types: If not None, a list of types of the elements of the queue.
+    tuple_shapes: If not None, a list of shapes of the elements of the queue.
+    name: The name of the queue.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements,
+               device_assignment,
+               host_id,
+               input_partition_dims=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               name=None):
+    super(_PartitionedInfeedQueue, self).__init__(
+        number_of_tuple_elements=number_of_tuple_elements,
+        tuple_types=tuple_types,
+        tuple_shapes=None,
+        shard_dimensions=None,
+        name="PartitionedInfeedQueue" if name is None else name)
+    self._input_partition_dims = input_partition_dims
+    self._host_id = host_id
+    self._device_assignment = device_assignment
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generate TPU dequeue ops.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed.
+
+    Returns:
+      A list of Outputs corresponding to a partition of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    with ops.device(tpu.core(tpu_device)):
+      values = tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    return tag_sharding_attribute_for_dequeued_tensors(
+        values, self._input_partition_dims)
+
+  def generate_enqueue_ops(self, per_host_sharded_inputs):
+    """Generates the host-side Ops to enqueue the partitioned inputs.
+
+    per_host_sharded_inputs is a list, one for each replica, of lists of
+    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
+    replica i.
+    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
+
+    For example, if sharded_inputs[i][j] is a 2-D Tensor:
+    [[A, B, C, D],
+     [E ,F, G, H]]
+    self._input_partition_dims[j] is [2, 4].
+
+    sharded_inputs[i][j] will be partitioned and flattened into:
+    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
+    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
+
+    Args:
+      per_host_sharded_inputs: a list of lists of Tensors. The length of the
+        outer list determines the number of shards. Each inner list indicates
+        the types and shapes of the tuples in the corresponding shard.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints; or if the partition dims are invalid.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
+    number_of_replicas_per_host = len(per_host_sharded_inputs)
+    number_of_tuple_elements = len(per_host_sharded_inputs[0])
+
+    assert len(self._input_partition_dims) == number_of_tuple_elements
+    per_host_enqueue_ops = []
+
+    for replica_index in range(number_of_replicas_per_host):
+      flattened_inputs = per_host_sharded_inputs[replica_index]
+      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
+                                                 self._input_partition_dims)
+      inputs_parted_iters = [
+          iter(self._check_dims_and_partition_or_replicate_on_host(x, dims))
+          for x, dims in zip(per_host_sharded_inputs[replica_index],
+                             inputs_part_dims_flat)
+      ]
+
+      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
+        # Places different partitions to different logic cores.
+        replica_id = self._device_assignment.lookup_replicas(
+            self._host_id, logical_core)[replica_index]
+        ordinal = self._device_assignment.tpu_ordinal(
+            replica=replica_id, logical_core=logical_core)
+        infeed_inputs = []
+        for it in inputs_parted_iters:
+          input_for_device = next(it, None)
+          if input_for_device is not None:
+            infeed_inputs.append(input_for_device)
+
+        if infeed_inputs:
+          per_host_enqueue_ops.append(
+              tpu_ops.infeed_enqueue_tuple(
+                  inputs=infeed_inputs,
+                  shapes=[x.shape for x in infeed_inputs],
+                  name="enqueue/replica_{0}/input_{1}".format(
+                      replica_index, logical_core),
+                  device_ordinal=ordinal))
+    return per_host_enqueue_ops
+
+  def _check_input_partition_dims(self, tensor, dims):
+    """Checks that input partition dims are valid for the `Tensor`.
+
+    Args:
+      tensor: Input tensor for partitioning.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Raises:
+      ValueError: If the tensor can't be partitioned by dims or the
+        num_cores_per_replica doesn't match the number of
+        partitions(dims.prod()).
+    """
+    # No partitioning specified, so don't perform further checks.
+    if dims is None:
+      return
+
+    dims = np.array(dims)
+
+    if (dims < 1).any():
+      raise ValueError("All input partition dims must be >= 1.")
+
+    # No partitioning, so don't perform further checks.
+    if dims.prod() == 1:
+      return
+
+    if dims.prod() != self._device_assignment.num_cores_per_replica:
+      raise ValueError(
+          "The product of each input parition dim should equal to "
+          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
+          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
+    if dims.shape[0] != tensor.shape.ndims:
+      raise ValueError(
+          "Input partition dims must have the same number of dimensions "
+          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
+          "partition dims = {}).".format(tensor.shape.as_list(), dims))
+
+    tensor.shape.assert_is_fully_defined()
+
+  def _check_dims_and_partition_or_replicate_on_host(self, tensor, dims):
+    """Checks dims and partitions or replicates the input tensor.
+
+      The ops inside this function are placed on the host side.
+
+    Args:
+      tensor: The input tensor which will be partioned or replicated.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Returns:
+      An iterator of `Tensor`s or a list of partioned tensors.
+    """
+    self._check_input_partition_dims(tensor, dims)
+    return partition_or_replicate_on_host(tensor, dims)
diff --git a/tensorflow/python/tpu/tpu_function.py b/tensorflow/python/tpu/tpu_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..422c7d3b26ffb4ad1b72450c4803ac2eb87cea3b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_function.py
@@ -0,0 +1,66 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for functions used during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+
+class TpuContext(object):
+  """A context object holding state about the TPU computation being built."""
+
+  def __init__(self):
+    """Creates a new TpuContext."""
+    self._number_of_shards = None
+
+  @property
+  def number_of_shards(self):
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    self._number_of_shards = number_of_shards
+
+
+# The Tpu context holds the number of shards when a sharded computation is
+# being built, or None if no computation is being built.
+_current_tpu_context = TpuContext()
+
+
+@contextlib.contextmanager
+def tpu_shard_context(number_of_shards):
+  if _current_tpu_context.number_of_shards is not None:
+    raise NotImplementedError("tpu_shard_context cannot be nested.")
+  try:
+    _current_tpu_context.set_number_of_shards(number_of_shards)
+    yield
+  finally:
+    _current_tpu_context.set_number_of_shards(None)
+
+
+def get_tpu_context():
+  return _current_tpu_context
+
+
+# Decorator function for tpu computation func that was passed to tpu.rewrite()
+# if there is an embedded training loop in this func, trace tools will generate
+# step markers for each iteration.
+def on_device_training_loop(func):
+  # Value for this attribute is from xla.DebugOptions.StepMarkerLocation.
+  setattr(func, "step_marker_location", "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP")
+  return func
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py b/tensorflow/python/tpu/tpu_infeed_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
rename to tensorflow/python/tpu/tpu_infeed_test.py
index a41ff60d0af6c89fa9825d557aceefc9f6b8098d..3e90979157f891a989209fea4e56ff7090dde837 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
+++ b/tensorflow/python/tpu/tpu_infeed_test.py
@@ -19,11 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_feed
 
 
 class InfeedTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_optimizer.py b/tensorflow/python/tpu/tpu_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..22c409eaa1cd4d499b72dbfbf429324d5f641e7c
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_optimizer.py
@@ -0,0 +1,203 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Optimizer that implements cross-shard gradient reduction for TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import optimizer
+
+
+class CrossShardOptimizer(optimizer.Optimizer):
+  """An optimizer that averages gradients across TPU shards."""
+
+  def __init__(self,
+               opt,
+               reduction=losses.Reduction.MEAN,
+               name="CrossShardOptimizer",
+               group_assignment=None):
+    """Construct a new cross-shard optimizer.
+
+    Args:
+      opt: An existing `Optimizer` to encapsulate.
+      reduction: The reduction to apply to the shard losses.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "CrossShardOptimizer".
+      group_assignment: Optional 2d int32 lists with shape
+        [num_groups, num_replicas_per_group] which describles how to apply
+        optimizer to subgroups.
+
+    Raises:
+      ValueError: If reduction is not a valid cross-shard reduction.
+    """
+    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
+      raise ValueError("Unsupported reduction: %s." % reduction)
+
+    super(CrossShardOptimizer, self).__init__(False, name)
+    self._opt = opt
+    self._reduction = reduction
+    self._group_assignment = group_assignment
+
+  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
+    """Verify group_assignment and get the subgroup size".
+
+    Args:
+      group_assignment: list of group ids for applying the optimizer
+        to subgroups.
+      num_shards: The number of TPU shards.
+
+    Returns:
+      The size of one subgroup in group_assignment.
+
+    Raises:
+      ValueError: If group_assignment is invalid.
+    """
+    if not group_assignment:
+      return None
+    if not (isinstance(group_assignment, list) and
+            all(isinstance(i, list) for i in group_assignment)):
+      raise ValueError("group_assignment must be a list of list. Got {}".format(
+          group_assignment))
+
+    replica_ids = set()
+    for g in group_assignment:
+      for i in g:
+        replica_ids.add(i)
+
+    if set(range(num_shards)) != replica_ids:
+      raise ValueError("group_assignment must be a permutation of range({0})."
+                       " Got group_assignment={1}".format(
+                           num_shards, group_assignment))
+
+    subgroup_size_list = [len(group) for group in group_assignment]
+    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
+      return subgroup_size_list[0]
+    else:
+      raise ValueError("The size of each subgroup in group_assignment must "
+                       "be equal. Got group_assignment={}".format(
+                           self._group_assignment))
+
+  def compute_gradients(self, loss, var_list=None, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer. The
+    gradients will be aggregated in the apply_gradients() so that user can
+    modify the gradients like clipping with per replica global norm if needed.
+    The global norm with aggregated gradients can be bad as one replica's huge
+    gradients can hurt the gradients from other replicas.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+
+    Raises:
+      ValueError: If not within a tpu_shard_context or group_assignment is
+        invalid.
+    """
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    if num_shards is None:
+      logging.warning(
+          "CrossShardOptimizer should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
+
+    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
+                                                       num_shards)
+
+    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
+      if self._group_assignment:
+        scale = 1.0 / subgroup_size
+      else:
+        scale = 1.0 / num_shards
+      loss *= scale
+
+    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
+    replicas, and then applies the real optimizer.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      An `Operation` that applies the gradients. If `global_step` was not None,
+      that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If the grads_and_vars is malformed.
+    """
+    summed_grads_and_vars = []
+    for (grad, var) in grads_and_vars:
+      if grad is None:
+        summed_grads_and_vars.append((grad, var))
+      else:
+        with ops.colocate_with(grad):
+          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
+              grad, self._group_assignment), var))
+    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
+
+  def get_slot(self, *args, **kwargs):
+    """Return a slot named "name" created for "var" by the Optimizer.
+
+    This simply wraps the get_slot() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    This simply wraps the get_slot_names() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      A list of strings.
+    """
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def variables(self):
+    """Forwarding the variables from the underlying optimizer."""
+    return self._opt.variables()
diff --git a/tensorflow/python/tpu/tpu_sharding.py b/tensorflow/python/tpu/tpu_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5af03f33ca8f13af517007672e9ce0e12be6205
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_sharding.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for sharding during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import tensor_shape
+
+_DEFAULT_NUMBER_OF_SHARDS = 1
+_DEFAULT_SHARD_DIMENSION = 0
+
+
+# TODO(b/36777903) change other parts of tpu.py to use this class.
+class ShardingPolicy(object):
+  """An object use to hold the sharding policy for a Tensor.
+  """
+
+  def __init__(self):
+    self._number_of_shards = None
+    self._shard_dimension = None
+    self._frozen = False
+
+  def __str__(self):
+    if self.number_of_shards is None or self.shard_dimension is None:
+      return "ShardingPolicy(unset)"
+    else:
+      return ("ShardingPolicy(%d shards dimension %d)" %
+              (self.number_of_shards, self.shard_dimension))
+
+  def _fill_default_values(self):
+    if self._number_of_shards is None:
+      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
+    if self._shard_dimension is None:
+      self._shard_dimension = tensor_shape.as_dimension(
+          _DEFAULT_SHARD_DIMENSION)
+
+  def freeze(self):
+    """Prevents further modification to the sharding policy.
+
+    Any values that have not been set when freeze is called are set to
+    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
+    """
+    if not self._frozen:
+      self._fill_default_values()
+      self._frozen = True
+
+  @property
+  def number_of_shards(self):
+    """Returns the number of shards in the policy or None if unspecified."""
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards for the current policy.
+
+    If the policy has been frozen then number_of_shards must match the
+    existing setting.
+
+    Args:
+      number_of_shards: The number of shards to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and number_of_shards
+        differs from the frozen value; or number_of_shards <= 0.
+    """
+    if self._frozen:
+      if self._number_of_shards != number_of_shards:
+        raise ValueError(
+            "Can't set sharding policy to use %d shards since it has been "
+            "frozen to use %d." % (number_of_shards, self._number_of_shards))
+    else:
+      if number_of_shards > 0:
+        self._number_of_shards = number_of_shards
+      else:
+        raise ValueError(
+            "Can't set sharding policy to use %s shards; value must be >0",
+            str(number_of_shards))
+
+  @property
+  def shard_dimension(self):
+    """Returns the shard dimension of the policy or None if unspecified."""
+    return self._shard_dimension
+
+  def set_shard_dimension(self, shard_dimension):
+    """Sets the shard dimension for the current policy.
+
+    If the policy has been frozen then shard_dimension must match the
+    existing setting.
+
+    Args:
+      shard_dimension: The shard dimension to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and shard_dimension
+        differs from the frozen value, or shard_dimension can't be
+        interpreted as a Dimension.
+    """
+    if self._frozen:
+      if self._shard_dimension != shard_dimension:
+        raise ValueError(
+            "Can't set shard dimension to %d since it has been frozen to "
+            "use %d." % (shard_dimension, self._shard_dimension))
+    else:
+      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
+
+  def merge(self, other):
+    """Merges the policy of another policy into the current policy.
+
+    Args:
+      other: The policy to merge into this one.
+
+    Raises:
+      ValueError: If this policy has been frozen and the merge conflicts with
+      the frozen policy.
+    """
+    if other.number_of_shards is not None:
+      self.set_number_of_shards(other.number_of_shards)
+    if other.shard_dimension is not None:
+      self.set_shard_dimension(other.shard_dimension)
+
+  def get_sharded_shape(self, shape, shard_index=None):
+    """Returns the shape of a shard of a full Tensor.
+
+    When given the shape of a 'full-size' Tensor, returns the shape of
+    the sub-Tensor after it has been sharded. Freezes the policy if it
+    has not yet been frozen.
+
+    Args:
+      shape: The shape of the full-size Tensor to be sharded.
+      shard_index: The index of the shard whose shape should be returned.
+        shard_index can be None for sharding policies that use the same
+        shape for every shard.
+      freeze_config:
+
+    Returns:
+      The shape of the sharded version of the Tensor.
+
+    Raises:
+      ValueError: If shard_index is None when shards are of different
+        shapes; or shard_index is not None and
+        !(0<=shard_index<number_of_shards); or shape does not have at
+        least self.shard_dimension+1 dimensions; or the value of
+        shape's shard dimension is not a multiple of
+        self.number_of_shards
+    """
+    if self._shard_dimension is None or self._number_of_shards is None:
+      # Don't raise an error if the config is unset.
+      return None
+    if shard_index is not None:
+      if shard_index < 0 or shard_index >= self.number_of_shards:
+        raise ValueError("shard_index %d, but must be in [0,%d)." %
+                         (shard_index, self._number_of_shards))
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    if dims[self._shard_dimension] is None:
+      raise ValueError("shape %s must have a fixed size for dimension %d "
+                       "that is known at graph construction time." %
+                       (shape.as_list(), self._shard_dimension))
+    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
+      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
+                       (shape.as_list(), self._number_of_shards,
+                        self._shard_dimension))
+    dims[self._shard_dimension] /= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def _unshard_shape(self, shape):
+    """Return the unsharded shape that would generate a given sharded shape.
+
+    Args:
+      shape: the sharded shape to unshard
+
+    Returns:
+      The unsharded shape.
+
+    Raises:
+      ValueError: if shape is unknown or does not contain
+        self.shard_dimension
+      TypeError: if shape is not convertible to a TensorShape
+    """
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    dims[self._shard_dimension] *= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def get_unsharded_shape(self, shapes):
+    """Returns the shape of an unsharded Tensor given a list of shards.
+
+    When given a list of shapes of shards, returns the shape of the
+    unsharded Tensor that would generate the shards. Sets defaults for the
+    policy if number_of_shards or shard_dimension is None.
+
+    Args:
+      shapes: The shapes of the Tensor shards to be combined.
+
+    Returns:
+      The shape of the unsharded version of the Tensor.
+
+    Raises:
+      ValueError: if shapes is not a list of length
+        self.number_of_shards; or any element of shapes is not a valid
+        shape consistent with the sharding policy; or the list of
+        shapes is not a valid sharding of a full shape.
+      TypeError: if an element of shapes is not convertible to a
+        TensorShape
+    """
+    self._fill_default_values()
+    if len(shapes) != self.number_of_shards:
+      raise ValueError(
+          "shapes is %s but must be a list of length number_of_shards=%d" % (
+              str(shapes), self.number_of_shards))
+    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
+    for i in xrange(self.number_of_shards - 1):
+      if not unsharded_shapes[i].is_compatible_with(
+          unsharded_shapes[self.number_of_shards - 1]):
+        raise ValueError(
+            "sharded shapes %s are not consistent shards of a full shape "
+            "sharded %d ways along dimension %d" % (
+                str(shapes), self.number_of_shards, self.shard_dimension))
+    return unsharded_shapes[0]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py b/tensorflow/python/tpu/tpu_sharding_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
rename to tensorflow/python/tpu/tpu_sharding_test.py
index b0a5511d2d7683a5e0f527e49651df236c7a68d4..21d2a0897a0ff938359a4ca29e077678778ddc56 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
+++ b/tensorflow/python/tpu/tpu_sharding_test.py
@@ -19,10 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_sharding
 
 
 class ShardingTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_strategy_util.py b/tensorflow/python/tpu/tpu_strategy_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..47e0b7b62fa8900b9387a92e12a804d4c7289a1f
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_strategy_util.py
@@ -0,0 +1,103 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU specific APIs to be used in conjunction with TPU Strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import functional as tpu_functional_ops
+from tensorflow.python.tpu import topology
+from tensorflow.python.tpu import tpu
+from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
+
+
+def get_first_tpu_host_device(cluster_resolver):
+  """Get the device spec for the first TPU host."""
+  if context.executing_eagerly():
+    tpu_devices = sorted(
+        [x for x in context.list_devices() if "device:TPU:" in x])
+    if not tpu_devices:
+      raise RuntimeError("Could not find any TPU devices")
+    spec = tf_device.DeviceSpec.from_string(tpu_devices[0])
+    task_id = spec.task
+  else:
+    # Session master needs to be configured and the coordinator is not part
+    # of the cluster.
+    task_id = 0
+  if cluster_resolver.get_master() in ("", "local"):
+    return "/replica:0/task:0/device:CPU:0"
+  job_name = cluster_resolver.get_job_name() or "tpu_worker"
+  return "/job:%s/task:%d/device:CPU:0" % (job_name, task_id)
+
+
+@tf_export("tpu.experimental.initialize_tpu_system")
+def initialize_tpu_system(cluster_resolver=None):
+  """Initialize the TPU devices.
+
+  Args:
+    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
+        which provides information about the TPU cluster.
+  Returns:
+    The tf.tpu.Topology object for the topology of the TPU cluster.
+  """
+  if cluster_resolver is None:
+    cluster_resolver = TPUClusterResolver("")
+
+  logging.info("Initializing the TPU system.")
+
+  if context.executing_eagerly():
+    # This function looks as it is for the following non-intuitive reasons.
+    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
+    # DistributedTPURewritePass. This pass actually adds real ops that
+    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
+    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
+    # The easiest way to trigger a rewrite is to run the function with
+    # TPUPartitionedCallOp.
+    @function.defun
+    def _tpu_init_fn():
+      return tpu.initialize_system()
+
+    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
+    # see above) but need to define it to get it added to eager context
+    # and get its assigned name.
+    # pylint: disable=protected-access
+    graph_func = _tpu_init_fn._get_concrete_function_internal()
+    func_name = compat.as_str(graph_func._inference_function.name)
+    # pylint: enable=protected-access
+
+    with ops.device(get_first_tpu_host_device(cluster_resolver)):
+      output = tpu_functional_ops.TPUPartitionedCall(
+          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
+    serialized_topology = output[0].numpy()
+  else:
+    master = cluster_resolver.master()
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        serialized_topology = sess.run(tpu.initialize_system())
+
+  logging.info("Finished initializing TPU system.")
+  return topology.Topology(serialized=serialized_topology)
diff --git a/tensorflow/python/tpu/tpu_system_metadata.py b/tensorflow/python/tpu/tpu_system_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..57518df00449552d826756d5305308806d81caa2
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_system_metadata.py
@@ -0,0 +1,223 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+
+_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
+_RETRY_TIMES = 120
+_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
+
+_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
+_DEVICE_TYPE_REGEX = re.compile('.*device:([^:]+).*')
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+
+# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
+# including num_cores and num_hosts.
+_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
+    'num_cores',
+    'num_hosts',
+    'num_of_cores_per_host',
+    'topology',
+    'devices',
+])
+
+
+def _query_tpu_system_metadata(master_address, cluster_def=None,
+                               query_topology=False):
+  """Automatically detects the TPU system metadata in the system."""
+  tpu_core_count = 0
+  devices = []
+  device_dict = collections.defaultdict(list)
+
+  if context.executing_eagerly():
+    device_names = context.list_devices()
+    devices = []
+
+    # We want the output type to match in both eager and session mode
+    for name in device_names:
+      device_match = _DEVICE_TYPE_REGEX.match(name)
+      device_type = 'CPU'
+      if device_match:
+        device_type = device_match.group(1)
+      devices.append(session_lib._DeviceAttributes(name, device_type, 0, 0))  # pylint: disable=protected-access
+  else:
+    # TODO(b/120564445): Replace with standard library for retries.
+    retry_count = 1
+    while True:
+      logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
+                   master_address)
+      try:
+        with ops.Graph().as_default():
+          with session_lib.Session(
+              master_address,
+              config=get_session_config_with_timeout(
+                  _PINGING_MASTER_TIMEOUT_IN_MS,
+                  cluster_def)) as sess:
+            devices = sess.list_devices()
+            break
+      except errors.DeadlineExceededError:
+        msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+               'not be ready (still scheduling) or the Tensorflow master '
+               'address is incorrect: got (%s).' %
+               (master_address))
+
+        # TODO(xiejw): For local or grpc master we might not need retry logic
+        # here.
+        if retry_count <= _RETRY_TIMES:
+          logging.warning('%s', msg)
+          logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
+          retry_count += 1
+        else:
+          raise ValueError(msg)
+
+  for device in devices:
+    match = _TPU_DEVICE_REG.match(device.name)
+    if match:
+      host_id = match.group(1)
+      core_id = match.group(2)
+      device_dict[host_id].append(core_id)
+      tpu_core_count += 1
+
+  num_of_cores_per_host = 0
+  if tpu_core_count:
+    num_cores_per_host_set = set(
+        [len(core_ids) for core_ids in device_dict.values()])
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError(
+          'TPU cores on each host is not same. This should not happen!. '
+          'devices: {}'.format(devices))
+    num_of_cores_per_host = num_cores_per_host_set.pop()
+
+  topology = None
+  if query_topology:
+    if not tpu_core_count:
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system (master address {}). '
+          'This usually means the master address is incorrect or the '
+          'TPU worker has some problems. Available devices: {}'.format(
+              master_address, devices))
+
+    topology = _obtain_topology(master_address, cluster_def)
+
+  # We sort the metadata devices so that downstream users get a sorted list
+  # for creating mirrored variables correctly.
+  def _sort_key(device):
+    spec = tf_device.DeviceSpec.from_string(device.name)
+    return (spec.job, spec.replica, spec.task, spec.device_type,
+            spec.device_index)
+  devices = tuple(sorted(devices, key=_sort_key))
+
+  metadata = _TPUSystemMetadata(
+      num_cores=tpu_core_count,
+      num_hosts=len(device_dict),
+      num_of_cores_per_host=num_of_cores_per_host,
+      topology=topology,
+      devices=devices)
+
+  if tpu_core_count:
+    logging.info('Found TPU system:')
+    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
+    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
+    logging.info('*** Num TPU Cores Per Worker: %d',
+                 metadata.num_of_cores_per_host)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
+  else:
+    logging.info('Failed to find TPU: %s', metadata)
+  return metadata
+
+
+def _obtain_topology(master_address, cluster_def):
+  """Obtains TPU fabric topology."""
+  try:
+    logging.info('Initializing TPU system (master: %s) to fetch topology '
+                 'for model parallelism. This might take a while.',
+                 master_address)
+    with ops.Graph().as_default():
+      session_config = get_session_config_with_timeout(
+          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
+      with session_lib.Session(
+          master_address, config=session_config) as sess:
+        topology = sess.run(tpu.initialize_system())
+        return topology
+  except errors.DeadlineExceededError:
+    raise ValueError(
+        'Fail to initialize TPU system with master (%s). '
+        'Please double check the TPU system is functional.' % (
+            master_address))
+
+
+def get_session_config_with_timeout(timeout_in_secs, cluster_def):
+  """Returns a session given a timeout and a cluster configuration."""
+  config = config_pb2.ConfigProto(
+      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
+  return config
+
+
+def master_job(master, cluster_def):
+  """Returns the canonnical job name to use to place TPU computations on.
+
+  Args:
+    master: A `string` representing the TensorFlow master to use.
+    cluster_def: A ClusterDef object describing the TPU cluster.
+
+
+  Returns:
+    A string containing the job name, or None if no job should be specified.
+
+  Raises:
+    ValueError: If the user needs to specify a tpu_job_name, because we are
+      unable to infer the job name automatically, or if the user-specified job
+      names are inappropriate.
+  """
+  # If the user specifies the tpu_job_name, use that.
+
+  if master in _LOCAL_MASTERS:
+    return None
+
+  if (not cluster_def or not cluster_def.job):
+    return _DEFAULT_JOB_NAME
+  job_names = set([job.name for job in cluster_def.job])
+  if _DEFAULT_JOB_NAME in job_names:
+    # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
+    raise ValueError('Currently, tpu_worker is not an allowed job name.')
+  if len(job_names) == 1:
+    return cluster_def.job[0].name
+  if len(job_names) == 2:
+    if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
+      job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
+      return job_names.pop()
+    # TODO(b/67716447): Include more sophisticated heuristics.
+  raise ValueError(
+      'Could not infer TPU job name. Please specify a tpu_job_name as part '
+      'of your TPUConfig.')
diff --git a/tensorflow/python/tpu/tpu_test.py b/tensorflow/python/tpu/tpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cf40a9d449e155a9eda9e9cc0e3f2975e179ac5
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_test.py
@@ -0,0 +1,146 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for tpu_function helpers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import convolutional
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import training_loop
+
+
+class TPUContextTest(test.TestCase):
+
+  @test_util.deprecated_graph_mode_only
+  def testIsInContext(self):
+    """Test that control_flow_util can check that we're in a TPU context."""
+    z1 = array_ops.identity(1)
+    pivot = control_flow_ops.no_op()
+    context = tpu.TPUReplicateContext(b"context", 1, pivot=pivot)
+    context.Enter()
+    z2 = array_ops.identity(1)
+    context.Exit()
+    self.assertFalse(control_flow_util.IsInXLAContext(z1.op))
+    self.assertTrue(control_flow_util.IsInXLAContext(z2.op))
+
+
+class TPULayerRewriteTest(test.TestCase):
+
+  @test_util.deprecated_graph_mode_only
+  def testUsingInfeedQueueWithRegularizer(self):
+    """Test that Layer regularizers can reference data created in loops."""
+
+    def make_regularizer(scale):
+      return lambda inputs: scale * math_ops.reduce_sum(math_ops.square(inputs))
+
+    def training_step(inputs, scale):
+      outputs = convolutional.conv2d(
+          inputs,
+          filters=16,
+          kernel_size=(3, 3),
+          data_format="channels_first",
+          kernel_regularizer=make_regularizer(scale))
+      loss = math_ops.reduce_mean(math_ops.square(outputs))
+      return loss.op
+
+    inputs = array_ops.zeros(shape=(128, 32, 32, 16))
+    scale = array_ops.ones(shape=())
+    infeed = tpu_feed.InfeedQueue(
+        tuple_types=[dtypes.float32, dtypes.float32],
+        tuple_shapes=[inputs.shape, scale.shape])
+
+    def loop():
+      return training_loop.repeat(5, training_step, infeed_queue=infeed)
+
+    # This should not throw an error.
+    tpu.rewrite(loop)
+
+
+class TPUGraphPruneTest(test.TestCase):
+
+  def test_prune_unconnected_ops(self):
+    with ops.Graph().as_default():
+      a = array_ops.placeholder(dtype=dtypes.float32, name="a")
+      b = array_ops.placeholder(dtype=dtypes.float32, name="b")
+      constant_op.constant(1.0, name="constant")
+      x = variable_scope.get_variable(
+          name="x",
+          dtype=dtypes.float32,
+          shape=[],
+          use_resource=True,
+          initializer=init_ops.constant_initializer(2.0))
+      y = variable_scope.get_variable(
+          name="y",
+          dtype=dtypes.float32,
+          shape=[],
+          use_resource=True,
+          initializer=init_ops.constant_initializer(3.0))
+      math_ops.add(a, b)
+      math_ops.add(x, y)
+      graph_def = ops.get_default_graph().as_graph_def()
+
+      for node in graph_def.node:
+        # Attach a TPU_REPLICATE_ATTR to each node.
+        node.attr[tpu._TPU_REPLICATE_ATTR].s = b"0"
+        # Rewire placeholder "a" and variable "y" leaving them unconnected.
+        for (input_index, node_input) in enumerate(node.input):
+          if node_input == "b":
+            node.input[input_index] = "constant"
+          if node_input == "y":
+            node.input[input_index] = "x"
+
+    with ops.Graph().as_default() as graph:
+      # Reimport the graph and prune unconnected ops.
+      importer.import_graph_def(graph_def)
+      tpu.prune_unconnected_ops_from_xla(ops.get_default_graph())
+
+      # Verify that ops "a" and "x" still have TPU_REPLICATE_ATTR.
+      a = graph.get_operation_by_name("import/a").get_attr(
+          tpu._TPU_REPLICATE_ATTR)
+      self.assertEqual(b"0", a)
+      x = graph.get_operation_by_name("import/x").get_attr(
+          tpu._TPU_REPLICATE_ATTR)
+      self.assertEqual(b"0", x)
+      # Verify that ops "b" and "y" have TPU_REPLICATE_ATTR removed.
+      with self.assertRaisesRegexp(
+          ValueError,
+          "Operation \'import/b\' has no attr named \'_tpu_replicate\'"):
+        graph.get_operation_by_name("import/b").get_attr(
+            tpu._TPU_REPLICATE_ATTR)
+      with self.assertRaisesRegexp(
+          ValueError,
+          "Operation \'import/y\' has no attr named \'_tpu_replicate\'"):
+        graph.get_operation_by_name("import/y").get_attr(
+            tpu._TPU_REPLICATE_ATTR)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/tpu/training_loop.py b/tensorflow/python/tpu/training_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffeb7e915a06a513e96e1ed60beabf6b79b6518
--- /dev/null
+++ b/tensorflow/python/tpu/training_loop.py
@@ -0,0 +1,222 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Library for constructing a training loop, suitable for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+
+
+def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop for TPUs.
+
+  The set of loop-carried tensors corresponds to `inputs`.  Both
+  `condition` and `body` take the current value of the loop-carried
+  tensors. 'body' additionally takes a tuple of infeed from
+  infeed_queue if infeed_queue is not None. `condition` must return a
+  single boolean value that determines whether iteration
+  continues. `body` must return an updated list of values for the
+  loop-carried tensors.
+
+  Args:
+    condition: a Python function that builds the loop condition.
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop, or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+
+  Returns:
+    The final values of the loop-carried tensors.
+
+  Raises:
+    TypeError: if body or condition has the wrong signature.
+  """
+  del name
+  # Converts inputs to Tensors.
+  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
+                                      x in inputs]
+  input_types = [x.dtype for x in inputs]
+  input_arity = len(inputs)
+
+  body_arg_error = xla.check_function_argument_count(
+      body, input_arity, infeed_queue)
+  if body_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
+              input_arity, str([i.name for i in inputs]), body_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s and %d additional inputs from "
+          "infeed, but the computation needs %s" % (input_arity, str(
+              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
+                                                    body_arg_error))
+  condition_arg_error = xla.check_function_argument_count(
+      condition, input_arity, None)
+  if condition_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
+                                  condition_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s. Note that infeed is not passed to the loop "
+          "condition." % (input_arity, str([i.name for i in inputs]),
+                          condition_arg_error))
+
+  def condition_wrapper(*inputs):
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+    return condition(*inputs)
+
+  def body_wrapper(*inputs):
+    """Wrapper around `body` that handles infeed queues and control deps."""
+    inputs = list(inputs)
+
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+
+    # Runs `body` with the dequeue_ops appended.
+    if infeed_queue:
+      number_of_shards = tpu_function.get_tpu_context().number_of_shards
+      if number_of_shards is None:
+        raise ValueError("Can't build training loop with infeed when there is "
+                         "no tpu_shard_context. Are you building a loop or "
+                         "graph directly rather than from inside tpu.rewrite, "
+                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
+      infeed_queue.set_number_of_shards(number_of_shards)
+      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
+    else:
+      dequeue_ops = []
+    outputs = body(*(inputs + dequeue_ops))
+
+    # If the computation only returned one value, make it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
+
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU training loop body must return zero or more Tensor values "
+          "followed by zero or more Operations.")
+
+    output_types = [op.dtype for op in output_tensors]
+    if input_types != output_types:
+      raise TypeError(
+          "Mismatch between input types and output types for training loop "
+          "body: {} vs {}".format(input_types, output_types))
+
+    # Add the dequeue operations to output_operations to ensure they are run
+    # by the loop, even if the programmer's loop body does not use them.
+    output_operations += dequeue_ops
+
+    # Add a dummy output, if needed.
+    if not output_tensors:
+      output_tensors = array_ops.constant(0)
+
+    if output_operations:
+      # TODO(phawkins): in principle this is too restrictive since it serializes
+      # the training loop steps. In practice it does not matter since this loop
+      # will be compiled by XLA.
+      output_tensors = control_flow_ops.tuple(output_tensors,
+                                              control_inputs=output_operations)
+
+    if tensor_tracer.TensorTracer.is_enabled():
+      num_replicas = tpu_function.get_tpu_context().number_of_shards
+      if num_replicas is None:
+        num_replicas = 1
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, None,
+                                    num_replicas)
+    return output_tensors
+
+  # If the body has arity 0, add a dummy loop-carried value to which we can add
+  # control dependencies from any side-effecting operations.
+  if input_arity == 0:
+    inputs = [array_ops.constant(0)]
+  return control_flow_ops.while_loop(
+      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
+
+
+def repeat(n, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop that executes a fixed number of iterations.
+
+  The set of loop-carried tensors correspond to `inputs`.
+  `body` must be a function that takes and returns the values of the
+  loop-carried tensors.
+
+  Args:
+    n: the number of loop iterations
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+  Returns:
+    The final values of the loop-carried tensors.
+  Raises:
+    ValueError: if there is a type error.
+  """
+  def _convert_to_list(xs):
+    if not isinstance(xs, (list, tuple)):
+      return [xs]
+    else:
+      return list(xs)
+
+  def cond(i, *args):
+    del args
+    return i < n
+
+  def body_wrapper(i, *args):
+    return [i + 1] + _convert_to_list(body(*args))
+
+  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
+  outputs = while_loop(
+      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
+  outputs = _convert_to_list(outputs)
+  if len(outputs) == 1:
+    # Returns the Op rather than an empty list.
+    return outputs[0].op
+  else:
+    return outputs[1:]
diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b
--- /dev/null
+++ b/tensorflow/python/tpu/util.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Utilities for the functionalities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import six
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training
+
+def check_positive_integer(value, name):
+  """Checks whether `value` is a positive integer."""
+  if not isinstance(value, six.integer_types):
+    raise TypeError('{} must be int, got {}'.format(name, type(value)))
+
+  if value <= 0:
+    raise ValueError('{} must be positive, got {}'.format(name, value))
+
+
+# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
+# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
+# python/estimator/util.py.
+class MultiHostDatasetInitializerHook(training.SessionRunHook):
+  """Creates a SessionRunHook that initializes all passed iterators."""
+
+  def __init__(self, dataset_initializers):
+    self._initializers = dataset_initializers
+
+  def after_create_session(self, session, coord):
+    del coord
+    start = time.time()
+    session.run(self._initializers)
+    logging.info('Initialized dataset iterators in %d seconds',
+                 time.time() - start)
diff --git a/tensorflow/python/tpu/xla.py b/tensorflow/python/tpu/xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..58476fae3d132aeeac7c23f176e23ea609478b15
--- /dev/null
+++ b/tensorflow/python/tpu/xla.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""XLA utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.util import tf_inspect
+
+
+def is_flat(outputs):
+  """Checks if outputs is a flat structure.
+
+    Following structures and values are considered flat:
+    1) None
+    2) A single object
+    3) A list or tuple of Tensors/Operations
+
+    The only structures that this function understands are sequences and
+    dictionaries.  E.g. this means that if outputs contains a single
+    user-defined Object, it is considered to be flat. Errors are raised later on
+    if that Object cannot be converted to a Tensor.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    A boolean indicates whether outputs is flat.
+  """
+  # If outputs is a list or tuple, check if it has any nested structure. If
+  # there is, then outputs is non-flat.
+  if isinstance(outputs, collections.Sequence):
+    for o in outputs:
+      if isinstance(o, collections.Sequence) or isinstance(o, dict):
+        return False
+
+  # If outputs is a dict, it is non-flat.
+  if isinstance(outputs, dict):
+    return False
+
+  # Getting here means either outputs itself is a single non-structured value
+  # or it is a flat list of single non-structured values.
+  return True
+
+
+def check_function_argument_count(func, input_arity, infeed_queue):
+  """Validate the number of input arguments to an XLA function.
+
+  Args:
+    func: the Python function that will be called to generate the body of an XLA
+      computation graph.
+    input_arity: the number of explicit arguments supplied by the caller.
+    infeed_queue: if not None, the infeed queue that will supply
+      additional arguments to the function.
+
+  Returns:
+    None if function can be called with the supplied number of
+      arguments, or an error string if it cannot.
+  """
+  def format_error(complaint, quantity):
+    return '%s %d argument%s' % (complaint, quantity, ''
+                                 if quantity == 1 else 's')
+
+  num_args_supplied = input_arity
+  if infeed_queue is not None:
+    num_args_supplied += infeed_queue.number_of_tuple_elements
+  arg_spec = tf_inspect.getargspec(func)
+  num_func_args = len(arg_spec.args)
+  if arg_spec.defaults is None:
+    num_func_defaults = 0
+  else:
+    num_func_defaults = len(arg_spec.defaults)
+  min_func_args = num_func_args - num_func_defaults
+  if num_args_supplied < min_func_args:
+    # The required number of arguments is not enough to call the function.
+    if num_func_defaults == 0 and arg_spec.varargs is None:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at least', min_func_args)
+  if arg_spec.varargs is None and num_args_supplied > num_func_args:
+    # The required number of arguments is too many to call the function.
+    if num_func_defaults == 0:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at most', num_func_args)
+  # Reaching here means either
+  # 1) There are varargs, func can accept any number of arguments greater than
+  # the minimum.
+  # 2) Number of supplied arguments falls in range of acceptable argument count
+  # of func.
+  return None
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index b80fb03111d8257b34ae8f4d795fb9fded96ed00..46ec3be54ec6851bd096d59f7298b6202608c53b 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Adam for TensorFlow."""
 from __future__ import absolute_import
 from __future__ import division
@@ -37,8 +36,13 @@ class AdamOptimizer(optimizer.Optimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
-               use_locking=False, name="Adam"):
+  def __init__(self,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
     r"""Construct a new Adam optimizer.
 
     Initialization:
@@ -48,7 +52,7 @@ class AdamOptimizer(optimizer.Optimizer):
     $$t := 0 \text{(Initialize timestep)}$$
 
     The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section2 of the paper:
+    described at the end of section 2 of the paper:
 
     $$t := t + 1$$
     $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -102,9 +106,6 @@ class AdamOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Created in SparseApply if needed.
-    self._updated_lr = None
-
   def _get_beta_accumulators(self):
     with ops.init_scope():
       if context.executing_eagerly():
@@ -120,12 +121,10 @@ class AdamOptimizer(optimizer.Optimizer):
     # workers (these need to go on the same PS, otherwise some updates are
     # silently ignored).
     first_var = min(var_list, key=lambda x: x.name)
-    self._create_non_slot_variable(initial_value=self._beta1,
-                                   name="beta1_power",
-                                   colocate_with=first_var)
-    self._create_non_slot_variable(initial_value=self._beta2,
-                                   name="beta2_power",
-                                   colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)
 
     # Create slots for the first and second moments.
     for v in var_list:
@@ -148,28 +147,34 @@ class AdamOptimizer(optimizer.Optimizer):
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power, beta2_power = self._get_beta_accumulators()
@@ -183,8 +188,7 @@ class AdamOptimizer(optimizer.Optimizer):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -194,26 +198,29 @@ class AdamOptimizer(optimizer.Optimizer):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
+        grad.values,
+        var,
+        grad.indices,
         lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking))
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
 
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
@@ -224,5 +231,5 @@ class AdamOptimizer(optimizer.Optimizer):
             beta1_power * self._beta1_t, use_locking=self._use_locking)
         update_beta2 = beta2_power.assign(
             beta2_power * self._beta2_t, use_locking=self._use_locking)
-    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
-                                  name=name_scope)
+    return control_flow_ops.group(
+        *update_ops + [update_beta1, update_beta2], name=name_scope)
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index 21fa6b3b5d3f8c306f0116f4d21940164c28b104..131ecf71ba56c4c683bf1748c4681dc28507d829 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -124,8 +124,8 @@ def generate_checkpoint_state_proto(save_dir,
 
 @deprecation.deprecated(
     date=None,
-    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
-                  "than manually editing the Checkpoint proto."))
+    instructions=("Use `tf.train.CheckpointManager` to manage checkpoints "
+                  "rather than manually editing the Checkpoint proto."))
 @tf_export(v1=["train.update_checkpoint_state"])
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 8606ec4a206ffbce85cf4071934deeb5a545b055..053298d1a592df821cd56e15d9026f6386f0e502 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -38,7 +38,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class LatestCheckpointWithRelativePaths(test.TestCase):
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 5e18f4b722b402a892125903ac82bf5991c385cd..b6c5d304eddc176f824729b1919324018a4257e2 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -183,16 +183,17 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
     ValueError: If missing variables in current graph, or if missing
       checkpoints or tensors in checkpoints.
   """
+  init_from_checkpoint_fn = lambda _: _init_from_checkpoint(
+      ckpt_dir_or_file, assignment_map)
   if distribution_strategy_context.get_cross_replica_context():
-    _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
+    init_from_checkpoint_fn(None)
   else:
     distribution_strategy_context.get_replica_context().merge_call(
-        _init_from_checkpoint, args=(ckpt_dir_or_file, assignment_map))
+        init_from_checkpoint_fn)
 
 
-def _init_from_checkpoint(_, ckpt_dir_or_file, assignment_map):
+def _init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """See `init_from_checkpoint` for documentation."""
-
   ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
   reader = load_checkpoint(ckpt_dir_or_file)
   variable_map = reader.get_variable_to_shape_map()
@@ -362,7 +363,6 @@ def _is_variable(x):
   return (isinstance(x, variables.Variable) or
           resource_variable_ops.is_resource_variable(x))
 
-
 def _collect_partitioned_variable(name, all_vars):
   """Returns list of `tf.Variable` that comprise the partitioned variable."""
   if name + "/part_0" in all_vars:
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index 37d46795b16cb4b4ed5ce2b4f5cf9b17cdcafab3..35f0b6e26492912c2c80799de2a99d29bf737793 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -253,8 +253,8 @@ def _evaluate_once(checkpoint_path,
       if isinstance(h, (_StopAfterNEvalsHook, _MultiStepStopAfterNEvalsHook)):
         h._set_evals_completed_tensor(eval_step_value)  # pylint: disable=protected-access
 
-  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%dT%H:%M:%SZ',
-                                                         time.gmtime()))
+  logging.info('Starting evaluation at ' +
+               time.strftime('%Y-%m-%dT%H:%M:%SZ', time.localtime()))
 
   # Prepare the session creator.
   session_creator = monitored_session.ChiefSessionCreator(
@@ -273,6 +273,6 @@ def _evaluate_once(checkpoint_path,
       while not session.should_stop():
         session.run(eval_ops, feed_dict)
 
-  logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
-                                                         time.gmtime()))
+  logging.info('Finished evaluation at ' +
+               time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime()))
   return final_ops_hook.final_ops_values
diff --git a/tensorflow/python/training/evaluation_test.py b/tensorflow/python/training/evaluation_test.py
index 3de4ceda759d927aaf743a0aa0159c50b0dbefb7..690c97e3db196ddeb5a212e3b254cf6c01907789 100644
--- a/tensorflow/python/training/evaluation_test.py
+++ b/tensorflow/python/training/evaluation_test.py
@@ -26,10 +26,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
@@ -117,16 +117,18 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy},
-        hooks=[evaluation._StopAfterNEvalsHook(1),])
+        final_ops={'accuracy': (accuracy.result(), update_op)},
+        hooks=[
+            evaluation._StopAfterNEvalsHook(1),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
 
   def testEvaluateWithFiniteInputs(self):
@@ -148,17 +150,21 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy,
-                   'eval_steps': evaluation._get_or_create_eval_step()},
-        hooks=[evaluation._StopAfterNEvalsHook(None),])
+        final_ops={
+            'accuracy': (accuracy.result(), update_op),
+            'eval_steps': evaluation._get_or_create_eval_step()
+        },
+        hooks=[
+            evaluation._StopAfterNEvalsHook(None),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
     # Runs evaluation for 4 iterations. First 2 evaluate full batch of 6 inputs
     # each; the 3rd iter evaluates the remaining 4 inputs, and the last one
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index d3441a2f872681c4563a58c0f4808ec8be708bbc..5095efa6ca92ccf7dad0514d30222235fbc62b5e 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -199,7 +199,7 @@ def input_producer(input_tensor,
             q, [enq], cancel_op=cancel_op))
     if summary_name is not None:
       summary.scalar(summary_name,
-                     math_ops.to_float(q.size()) * (1. / capacity))
+                     math_ops.cast(q.size(), dtypes.float32) * (1. / capacity))
     return q
 
 
@@ -712,7 +712,7 @@ def _shapes(tensor_list_list, shapes, enqueue_many):
 
 def _select_which_to_enqueue(tensor_list, keep_input):
   """Select which examples to enqueue based on vector `keep_input`."""
-  select_i = math_ops.to_int32(keep_input)
+  select_i = math_ops.cast(keep_input, dtypes.int32)
   tensor_list = [
       data_flow_ops.dynamic_partition(x, select_i, num_partitions=2)[1]
       for x in tensor_list]
@@ -780,8 +780,9 @@ def _batch(tensors, batch_size, keep_input, num_threads=1, capacity=32,
     queue = _which_queue(dynamic_pad)(
         capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input)
-    summary.scalar("fraction_of_%d_full" % capacity,
-                   math_ops.to_float(queue.size()) * (1. / capacity))
+    summary.scalar(
+        "fraction_of_%d_full" % capacity,
+        math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
 
     if allow_smaller_final_batch:
       dequeued = queue.dequeue_up_to(batch_size, name=name)
@@ -819,8 +820,9 @@ def _batch_join(tensors_list, batch_size, keep_input, capacity=32,
     queue = _which_queue(dynamic_pad)(
         capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input)
-    summary.scalar("fraction_of_%d_full" % capacity,
-                   math_ops.to_float(queue.size()) * (1. / capacity))
+    summary.scalar(
+        "fraction_of_%d_full" % capacity,
+        math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
 
     if allow_smaller_final_batch:
       dequeued = queue.dequeue_up_to(batch_size, name=name)
@@ -857,8 +859,8 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
         capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
         dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input)
-    full = (math_ops.to_float(
-        math_ops.maximum(0, queue.size() - min_after_dequeue)) *
+    full = (math_ops.cast(
+        math_ops.maximum(0, queue.size() - min_after_dequeue), dtypes.float32) *
             (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
@@ -899,8 +901,8 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
         capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
         dtypes=types, shapes=shapes, shared_name=shared_name)
     _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input)
-    full = (math_ops.to_float(
-        math_ops.maximum(0, queue.size() - min_after_dequeue)) *
+    full = (math_ops.cast(
+        math_ops.maximum(0, queue.size() - min_after_dequeue), dtypes.float32) *
             (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 5efc15d56f9530569b98a9cde975d74de1f110ef..b82a2611d542e5483a5e1fde32ef939865cce17d 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -554,7 +554,8 @@ class BatchTest(test_lib.TestCase):
       examples = variables.Variable(zero64)
       counter = examples.count_up_to(num_batches * batch_size)
       string = array_ops.tile(["string"],
-                              math_ops.to_int32(array_ops.stack([counter])))
+                              math_ops.cast(array_ops.stack([counter]),
+                                            dtypes.int32))
       self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       batched = inp.batch(
@@ -1143,10 +1144,12 @@ class BatchJoinTest(test_lib.TestCase):
 
       # These get joined together and grouped into batches of 5.
       batch_size = 5
-      a = array_ops.tile(["a"],
-                         math_ops.to_int32(array_ops.stack([counter + 1])))
-      b = array_ops.tile(["b"],
-                         math_ops.to_int32(array_ops.stack([ninety_nine])))
+      a = array_ops.tile(
+          ["a"],
+          math_ops.cast(array_ops.stack([counter + 1]), dtypes.int32))
+      b = array_ops.tile(
+          ["b"],
+          math_ops.cast(array_ops.stack([ninety_nine]), dtypes.int32))
       batched = inp.batch_join(
           [[counter, a], [ninety_nine, b]],
           batch_size=batch_size,
@@ -1324,10 +1327,12 @@ class BatchJoinTest(test_lib.TestCase):
 
       # These get joined together and grouped into batches of 5.
       batch_size = 5
-      a = array_ops.tile(["a"],
-                         math_ops.to_int32(array_ops.stack([counter + 1])))
-      b = array_ops.tile(["b"],
-                         math_ops.to_int32(array_ops.stack([ninety_nine])))
+      a = array_ops.tile(
+          ["a"],
+          math_ops.cast(array_ops.stack([counter + 1]), dtypes.int32))
+      b = array_ops.tile(
+          ["b"],
+          math_ops.cast(array_ops.stack([ninety_nine]), dtypes.int32))
       batched = inp.batch_join(
           [[counter, a], [ninety_nine, b]],
           batch_size=batch_size,
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index c52e89db1f47eb303b7160cef77c01bcb46aebba..ab9d923bedc721413a2120fc5be3ce302fef4e0f 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -17,8 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.eager import context
-from tensorflow.python.training import learning_rate_decay_v2
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -88,15 +91,15 @@ def exponential_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.exponential_decay(learning_rate,
-                                                        global_step,
-                                                        decay_steps,
-                                                        decay_rate,
-                                                        staircase=staircase,
-                                                        name=name)
+  decayed_lr = learning_rate_schedule.ExponentialDecay(learning_rate,
+                                                       decay_steps,
+                                                       decay_rate,
+                                                       staircase=staircase,
+                                                       name=name)
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -143,11 +146,12 @@ def piecewise_constant(x, boundaries, values, name=None):
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.piecewise_constant(x, boundaries, values,
-                                                         name=name)
+  decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+      boundaries, values, name=name)
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(x)
+  else:
+    decayed_lr = functools.partial(decayed_lr, x)
   return decayed_lr
 
 
@@ -236,9 +240,8 @@ def polynomial_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.polynomial_decay(
+  decayed_lr = learning_rate_schedule.PolynomialDecay(
       learning_rate,
-      global_step,
       decay_steps,
       end_learning_rate=end_learning_rate,
       power=power,
@@ -246,8 +249,9 @@ def polynomial_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -323,13 +327,15 @@ def natural_exp_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.natural_exp_decay(
-      learning_rate, global_step, decay_steps, decay_rate, staircase=staircase,
+  natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate))
+  decayed_lr = learning_rate_schedule.ExponentialDecay(
+      learning_rate, decay_steps, natural_exp_rate, staircase=staircase,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -405,17 +411,17 @@ def inverse_time_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.inverse_time_decay(
+  decayed_lr = learning_rate_schedule.InverseTimeDecay(
       learning_rate,
-      global_step,
       decay_steps,
       decay_rate,
       staircase=staircase,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -468,12 +474,13 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.cosine_decay(
-      learning_rate, global_step, decay_steps, alpha=alpha, name=name)
+  decayed_lr = learning_rate_schedule.CosineDecay(
+      learning_rate, decay_steps, alpha=alpha, name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -535,9 +542,8 @@ def cosine_decay_restarts(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+  decayed_lr = learning_rate_schedule.CosineDecayRestarts(
       learning_rate,
-      global_step,
       first_decay_steps,
       t_mul=t_mul,
       m_mul=m_mul,
@@ -545,8 +551,9 @@ def cosine_decay_restarts(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -617,9 +624,8 @@ def linear_cosine_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
+  decayed_lr = learning_rate_schedule.LinearCosineDecay(
       learning_rate,
-      global_step,
       decay_steps,
       num_periods=num_periods,
       alpha=alpha,
@@ -627,8 +633,9 @@ def linear_cosine_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -707,8 +714,8 @@ def noisy_linear_cosine_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-      learning_rate, global_step,
+  decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+      learning_rate,
       decay_steps,
       initial_variance=initial_variance,
       variance_decay=variance_decay,
@@ -718,6 +725,7 @@ def noisy_linear_cosine_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
diff --git a/tensorflow/python/training/learning_rate_decay_v2.py b/tensorflow/python/training/learning_rate_decay_v2.py
deleted file mode 100644
index eb69feb17d3983ddb494cdf63ae30edee7062915..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/learning_rate_decay_v2.py
+++ /dev/null
@@ -1,898 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Various learning rate decay functions."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import math
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export("train.exponential_decay", v1=[])
-def exponential_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False,
-                      name=None):
-  """Applies exponential decay to the learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an exponential decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg function that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions.
-  It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate *
-                          decay_rate ^ (global_step / decay_steps)
-  ```
-
-  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
-  integer division and the decayed learning rate follows a staircase function.
-
-  Example: decay every 100000 steps with a base of 0.96:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  learning_rate_fn = tf.train.exponential_decay(starter_learning_rate,
-                                                global_step, 100000, 0.96,
-                                                staircase=True)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    decay_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The decay rate.
-    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
-    name: String.  Optional name of the operation.  Defaults to
-      'ExponentialDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for exponential_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate,
-                 staircase, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(
-        name, "ExponentialDecay",
-        [learning_rate, global_step, decay_steps, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      return math_ops.multiply(
-          learning_rate, math_ops.pow(decay_rate, p), name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.piecewise_constant_decay", v1=[])
-def piecewise_constant(x, boundaries, values, name=None):
-  """Piecewise constant from boundaries and interval values.
-
-  This function returns a no-arg callable to compute the piecewise constant.
-  This can be useful for changing the learning rate value across
-  different invocations of optimizer functions.
-
-  Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
-    for the next 10000 steps, and 0.1 for any additional steps.
-
-  ```python
-  global_step = tf.Variable(0, trainable=False)
-  boundaries = [100000, 110000]
-  values = [1.0, 0.5, 0.1]
-  learning_rate_fn = tf.train.piecewise_constant(global_step, boundaries,
-    values)
-  learning_rate = learning_rate_fn()
-
-  # Later, whenever we perform an optimization step, we increment global_step.
-  ```
-
-  Args:
-    x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
-      `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
-    boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
-      increasing entries, and with all elements having the same type as `x`.
-    values: A list of `Tensor`s or `float`s or `int`s that specifies the values
-      for the intervals defined by `boundaries`. It should have one more element
-      than `boundaries`, and all elements should have the same type.
-    name: A string. Optional name of the operation. Defaults to
-      'PiecewiseConstant'.
-
-  Returns:
-    A no-arg function that outputs a 0-D Tensor. The output of the no-arg
-    function is `values[0]` when `x <= boundaries[0]`,
-    `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
-    and values[-1] when `x > boundaries[-1]`.
-
-  Raises:
-    ValueError: if types of `x` and `boundaries` do not match, or types of all
-        `values` do not match or
-        the number of elements in the lists does not match.
-  """
-  if len(boundaries) != len(values) - 1:
-    raise ValueError(
-        "The length of boundaries should be 1 less than the length of values")
-  def decayed_lr(x, boundaries, values, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "PiecewiseConstant",
-                        [x, boundaries, values, name]) as name:
-      boundaries = ops.convert_n_to_tensor(boundaries)
-      values = ops.convert_n_to_tensor(values)
-      x_recomp = ops.convert_to_tensor(x)
-      # Avoid explicit conversion to x's dtype. This could result in faulty
-      # comparisons, for example if floats are converted to integers.
-      for i, b in enumerate(boundaries):
-        if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
-          # We can promote int32 boundaries to int64 without loss of precision.
-          # This covers the most common case where the user passes in boundaries
-          # as an array of Python integers.
-          if (b.dtype.base_dtype == dtypes.int32 and
-              x_recomp.dtype.base_dtype == dtypes.int64):
-            b = math_ops.cast(b, x_recomp.dtype.base_dtype)
-            boundaries[i] = b
-          else:
-            raise ValueError(
-                "Boundaries (%s) must have the same dtype as x (%s)." %
-                (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
-      # TODO(rdipietro): Ensure that boundaries' elements strictly increases.
-      for v in values[1:]:
-        if v.dtype.base_dtype != values[0].dtype.base_dtype:
-          raise ValueError(
-              "Values must have elements all with the same dtype (%s vs %s)." %
-              (values[0].dtype.base_dtype, v.dtype.base_dtype))
-      pred_fn_pairs = []
-      pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
-      pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
-      for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
-        # Need to bind v here; can do this with lambda v=v: ...
-        pred = (x_recomp > low) & (x_recomp <= high)
-        pred_fn_pairs.append((pred, lambda v=v: v))
-
-      # The default isn't needed here because our conditions are mutually
-      # exclusive and exhaustive, but tf.case requires it.
-      default = lambda: values[0]
-      return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
-
-  return functools.partial(decayed_lr, x, boundaries, values, name)
-
-
-@tf_export("train.polynomial_decay", v1=[])
-def polynomial_decay(learning_rate,
-                     global_step,
-                     decay_steps,
-                     end_learning_rate=0.0001,
-                     power=1.0,
-                     cycle=False,
-                     name=None):
-  """Applies a polynomial decay to the learning rate.
-
-  It is commonly observed that a monotonically decreasing learning rate, whose
-  degree of change is carefully chosen, results in a better performing model.
-  This function applies a polynomial decay function to a provided initial
-  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
-
-  It requires a `global_step` value to compute the decayed learning rate.  You
-  can just pass a TensorFlow variable that you increment at each training step.
-
-  The function returns a no-arg callable that outputs the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  decayed_learning_rate = (learning_rate - end_learning_rate) *
-                          (1 - global_step / decay_steps) ^ (power) +
-                          end_learning_rate
-
-  ```
-
-  If `cycle` is True then a multiple of `decay_steps` is used, the first one
-  that is bigger than `global_steps`.
-
-  ```python
-  decay_steps = decay_steps * ceil(global_step / decay_steps)
-  decayed_learning_rate_fn = (learning_rate - end_learning_rate) *
-                          (1 - global_step / decay_steps) ^ (power) +
-                          end_learning_rate
-  decayed_learning_rate = decayed_learning_rate_fn()
-
-  ```
-
-  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  end_learning_rate = 0.01
-  decay_steps = 10000
-  learning_rate_fn = tf.train.polynomial_decay(starter_learning_rate,
-                                               global_step, decay_steps,
-                                               end_learning_rate,
-                                               power=0.5)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The minimal end learning rate.
-    power: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The power of the polynomial. Defaults to linear, 1.0.
-    cycle: A boolean, whether or not it should cycle beyond decay_steps.
-    name: String.  Optional name of the operation. Defaults to
-      'PolynomialDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for polynomial_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
-                 power, cycle, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(
-        name, "PolynomialDecay",
-        [learning_rate, global_step, decay_steps, end_learning_rate, power]
-    ) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      end_learning_rate = math_ops.cast(end_learning_rate, dtype)
-      power = math_ops.cast(power, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
-      if cycle:
-        # Find the first multiple of decay_steps that is bigger than
-        # global_step. If global_step is zero set the multiplier to 1
-        multiplier = control_flow_ops.cond(
-            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
-            lambda: math_ops.ceil(global_step_recomp / decay_steps))
-        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
-      else:
-        # Make sure that the global_step used is not bigger than decay_steps.
-        global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-
-      p = math_ops.div(global_step_recomp, decay_steps_recomp)
-      return math_ops.add(
-          math_ops.multiply(learning_rate - end_learning_rate,
-                            math_ops.pow(1 - p, power)),
-          end_learning_rate,
-          name=name)
-
-  return functools.partial(
-      decayed_lr, learning_rate, global_step, decay_steps, end_learning_rate,
-      power, cycle, name)
-
-
-@tf_export("train.natural_exp_decay", v1=[])
-def natural_exp_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False,
-                      name=None):
-  """Applies natural exponential decay to the initial learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an exponential decay function
-  to a provided initial learning rate.  It requires an `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
-  decay_step)
-  ```
-
-  or, if `staircase` is `True`, as:
-
-  ```python
-  decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
-  decay_step))
-  ```
-
-  Example: decay exponentially with a base of 0.96:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  learning_rate = 0.1
-  decay_steps = 5
-  k = 0.5
-  learning_rate_fn = tf.train.natural_exp_decay(learning_rate, global_step,
-                                                decay_steps, k)
-
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: How often to apply decay.
-    decay_rate: A Python number.  The decay rate.
-    staircase: Whether to apply decay in a discrete staircase, as opposed to
-      continuous, fashion.
-    name: String.  Optional name of the operation.  Defaults to
-      'ExponentialTimeDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for natural_exp_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
-                 name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "NaturalExpDecay",
-                        [learning_rate, global_step, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      exponent = math_ops.exp(
-          math_ops.multiply(math_ops.negative(decay_rate), p))
-      return math_ops.multiply(learning_rate, exponent, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.inverse_time_decay", v1=[])
-def inverse_time_decay(learning_rate,
-                       global_step,
-                       decay_steps,
-                       decay_rate,
-                       staircase=False,
-                       name=None):
-  """Applies inverse time decay to the initial learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an inverse decay function
-  to a provided initial learning rate.  It requires an `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
-  decay_step)
-  ```
-
-  or, if `staircase` is `True`, as:
-
-  ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
-  decay_step))
-  ```
-
-  Example: decay 1/t with a rate of 0.5:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  learning_rate = 0.1
-  decay_steps = 1.0
-  decay_rate = 0.5
-  learning_rate_fn = tf.train.inverse_time_decay(learning_rate, global_step,
-  decay_steps, decay_rate)
-
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: How often to apply decay.
-    decay_rate: A Python number.  The decay rate.
-    staircase: Whether to apply decay in a discrete staircase, as opposed to
-      continuous, fashion.
-    name: String.  Optional name of the operation.  Defaults to
-      'InverseTimeDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for inverse_time_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
-                 name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "InverseTimeDecay",
-                        [learning_rate, global_step, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      const = math_ops.cast(constant_op.constant(1), dtype)
-      denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
-      return math_ops.div(learning_rate, denom, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.cosine_decay", v1=[])
-def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
-                 name=None):
-  """Applies cosine decay to the learning rate.
-
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a cosine decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
-  decayed = (1 - alpha) * cosine_decay + alpha
-  decayed_learning_rate = learning_rate * decayed
-  ```
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of learning_rate.
-    name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, alpha, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "CosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      completed_fraction = global_step_recomp / decay_steps
-      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
-          constant_op.constant(math.pi) * completed_fraction))
-
-      decayed = (1 - alpha) * cosine_decayed + alpha
-      return math_ops.multiply(learning_rate, decayed)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           alpha, name)
-
-
-@tf_export("train.cosine_decay_restarts", v1=[])
-def cosine_decay_restarts(learning_rate,
-                          global_step,
-                          first_decay_steps,
-                          t_mul=2.0,
-                          m_mul=1.0,
-                          alpha=0.0,
-                          name=None):
-  """Applies cosine decay with restarts to the learning rate.
-
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a cosine decay function with
-  restarts to a provided initial learning rate.  It requires a `global_step`
-  value to compute the decayed learning rate.  You can just pass a TensorFlow
-  variable that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate while taking into account possible warm restarts. This can be useful for
-  changing the learning rate value across different invocations of optimizer
-  functions.
-
-  The learning rate multiplier first decays
-  from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
-  restart is performed. Each new warm restart runs for `t_mul` times more steps
-  and with `m_mul` times smaller initial learning rate.
-
-  Example usage:
-  ```python
-  first_decay_steps = 1000
-  lr_decayed_fn = tf.train.cosine_decay_restarts(learning_rate, global_step,
-                                     first_decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Used to derive the number of iterations in the i-th period
-    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Used to derive the initial learning rate of the i-th period:
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of the learning_rate.
-    name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("cosine decay restarts requires global_step")
-  def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul,
-                 alpha, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]
-                       ) as name:
-      learning_rate = ops.convert_to_tensor(
-          learning_rate, name="initial_learning_rate")
-      dtype = learning_rate.dtype
-      first_decay_steps = math_ops.cast(first_decay_steps, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      t_mul = math_ops.cast(t_mul, dtype)
-      m_mul = math_ops.cast(m_mul, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      completed_fraction = global_step_recomp / first_decay_steps
-
-      def compute_step(completed_fraction, geometric=False):
-        """Helper for `cond` operation."""
-        if geometric:
-          i_restart = math_ops.floor(
-              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
-              math_ops.log(t_mul))
-
-          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
-          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
-
-        else:
-          i_restart = math_ops.floor(completed_fraction)
-          completed_fraction -= i_restart
-
-        return i_restart, completed_fraction
-
-      i_restart, completed_fraction = control_flow_ops.cond(
-          math_ops.equal(t_mul, 1.0),
-          lambda: compute_step(completed_fraction, geometric=False),
-          lambda: compute_step(completed_fraction, geometric=True))
-
-      m_fac = m_mul**i_restart
-      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
-          constant_op.constant(math.pi) * completed_fraction))
-      decayed = (1 - alpha) * cosine_decayed + alpha
-
-      return math_ops.multiply(learning_rate, decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step,
-                           first_decay_steps, t_mul, m_mul, alpha, name)
-
-
-@tf_export("train.linear_cosine_decay", v1=[])
-def linear_cosine_decay(learning_rate,
-                        global_step,
-                        decay_steps,
-                        num_periods=0.5,
-                        alpha=0.0,
-                        beta=0.001,
-                        name=None):
-  """Applies linear cosine decay to the learning rate.
-
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  Note that linear cosine decay is more aggressive than cosine decay and
-  larger initial learning rates can typically be used.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a linear cosine decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  linear_decay = (decay_steps - global_step) / decay_steps)
-  cosine_decay = 0.5 * (
-      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
-  decayed = (alpha + linear_decay) * cosine_decay + beta
-  decayed_learning_rate = learning_rate * decayed
-  ```
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.linear_cosine_decay(learning_rate, global_step,
-                                               decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
-    alpha: See computation above.
-    beta: See computation above.
-    name: String.  Optional name of the operation.  Defaults to
-      'LinearCosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("linear cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, num_periods, alpha,
-                 beta, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "LinearCosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      num_periods = math_ops.cast(num_periods, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      beta = math_ops.cast(beta, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
-      completed_fraction = global_step_recomp / decay_steps
-      fraction = 2.0 * num_periods * completed_fraction
-      cosine_decayed = 0.5 * (
-          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
-
-      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
-      return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           num_periods, alpha, beta, name)
-
-
-@tf_export("train.noisy_linear_cosine_decay", v1=[])
-def noisy_linear_cosine_decay(learning_rate,
-                              global_step,
-                              decay_steps,
-                              initial_variance=1.0,
-                              variance_decay=0.55,
-                              num_periods=0.5,
-                              alpha=0.0,
-                              beta=0.001,
-                              name=None):
-  """Applies noisy linear cosine decay to the learning rate.
-
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  Note that linear cosine decay is more aggressive than cosine decay and
-  larger initial learning rates can typically be used.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a noisy linear
-  cosine decay function to a provided initial learning rate.
-  It requires a `global_step` value to compute the decayed learning rate.
-  You can just pass a TensorFlow variable that you increment at each
-  training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  linear_decay = (decay_steps - global_step) / decay_steps)
-  cosine_decay = 0.5 * (
-      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
-  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
-  decayed_learning_rate = learning_rate * decayed
-  ```
-  where eps_t is 0-centered gaussian noise with variance
-  initial_variance / (1 + global_step) ** variance_decay
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.noisy_linear_cosine_decay(learning_rate, global_step,
-                                                     decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    initial_variance: initial variance for the noise. See computation above.
-    variance_decay: decay for the noise's variance. See computation above.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
-    alpha: See computation above.
-    beta: See computation above.
-    name: String.  Optional name of the operation.  Defaults to
-      'NoisyLinearCosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("noisy linear cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, initial_variance,
-                 variance_decay, num_periods, alpha, beta, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "NoisyLinearCosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      initial_variance = math_ops.cast(initial_variance, dtype)
-      variance_decay = math_ops.cast(variance_decay, dtype)
-      num_periods = math_ops.cast(num_periods, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      beta = math_ops.cast(beta, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
-      variance = initial_variance / (
-          math_ops.pow(1.0 + global_step_recomp, variance_decay))
-      std = math_ops.sqrt(variance)
-      noisy_linear_decayed = (
-          linear_decayed + random_ops.random_normal(
-              linear_decayed.shape, stddev=std))
-
-      completed_fraction = global_step_recomp / decay_steps
-      fraction = 2.0 * num_periods * completed_fraction
-      cosine_decayed = 0.5 * (
-          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
-      noisy_linear_cosine_decayed = (
-          (alpha + noisy_linear_decayed) * cosine_decayed + beta)
-
-      return math_ops.multiply(
-          learning_rate, noisy_linear_cosine_decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           initial_variance, variance_decay, num_periods, alpha,
-                           beta, name)
diff --git a/tensorflow/python/training/learning_rate_decay_v2_test.py b/tensorflow/python/training/learning_rate_decay_v2_test.py
deleted file mode 100644
index cb96773e299a37db1d5792c84d6a837147e09d04..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/learning_rate_decay_v2_test.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functional test for learning rate decay."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util
-# Import resource_variable_ops for the variables-to-tensor implicit conversion.
-from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import learning_rate_decay_v2
-
-
-class LRDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContinuous(self):
-    self.evaluate(variables.global_variables_initializer())
-    step = 5
-    decayed_lr = learning_rate_decay_v2.exponential_decay(0.05, step, 10, 0.96)
-    expected = .05 * 0.96**(5.0 / 10.0)
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    if context.executing_eagerly():
-      step = resource_variable_ops.ResourceVariable(0)
-      self.evaluate(variables.global_variables_initializer())
-      decayed_lr = learning_rate_decay_v2.exponential_decay(
-          .1, step, 3, 0.96, staircase=True)
-
-      # No change to learning rate due to staircase
-      expected = .1
-      self.evaluate(step.assign(1))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-      expected = .1
-      self.evaluate(step.assign(2))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-      # Decayed learning rate
-      expected = .1 * 0.96 ** (100 // 3)
-      self.evaluate(step.assign(100))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_deprecated_v1
-  def testVariables(self):
-    step = variables.Variable(1)
-    assign_1 = step.assign(1)
-    assign_2 = step.assign(2)
-    assign_100 = step.assign(100)
-    decayed_lr = learning_rate_decay_v2.exponential_decay(
-        .1, step, 3, 0.96, staircase=True)
-    self.evaluate(variables.global_variables_initializer())
-    # No change to learning rate
-    self.evaluate(assign_1.op)
-    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
-    self.evaluate(assign_2.op)
-    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
-    # Decayed learning rate
-    self.evaluate(assign_100.op)
-    expected = .1 * 0.96**(100 // 3)
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPiecewiseConstant(self):
-    x = resource_variable_ops.ResourceVariable(-999)
-    decayed_lr = learning_rate_decay_v2.piecewise_constant(
-        x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
-
-    self.evaluate(variables.global_variables_initializer())
-
-    self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
-    self.evaluate(x.assign(100))
-    self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
-    self.evaluate(x.assign(105))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
-    self.evaluate(x.assign(110))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
-    self.evaluate(x.assign(120))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.01, 1e-6)
-    self.evaluate(x.assign(999))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.001, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPiecewiseConstantEdgeCases(self):
-    x_int = resource_variable_ops.ResourceVariable(
-        0, dtype=variables.dtypes.int32)
-    boundaries, values = [-1.0, 1.0], [1, 2, 3]
-    with self.assertRaises(ValueError):
-      decayed_lr = learning_rate_decay_v2.piecewise_constant(
-          x_int, boundaries, values)
-      decayed_lr()
-
-    x = resource_variable_ops.ResourceVariable(0.0)
-    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
-    with self.assertRaises(ValueError):
-      decayed_lr = learning_rate_decay_v2.piecewise_constant(
-          x, boundaries, values)()
-      decayed_lr()
-
-    # Test that ref types are valid.
-    if not context.executing_eagerly():
-      x = variables.Variable(0.0)
-      x_ref = x.op.outputs[0]   # float32_ref tensor should be accepted
-      boundaries, values = [1.0, 2.0], [1, 2, 3]
-      learning_rate_decay_v2.piecewise_constant(x_ref, boundaries, values)
-
-    # Test casting boundaries from int32 to int64.
-    x_int64 = resource_variable_ops.ResourceVariable(
-        0, dtype=variables.dtypes.int64)
-    boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
-    decayed_lr = learning_rate_decay_v2.piecewise_constant(
-        x_int64, boundaries, values)
-
-    self.evaluate(variables.global_variables_initializer())
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
-    self.evaluate(x_int64.assign(1))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
-    self.evaluate(x_int64.assign(2))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.5, 1e-6)
-    self.evaluate(x_int64.assign(3))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.6, 1e-6)
-    self.evaluate(x_int64.assign(4))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.7, 1e-6)
-
-
-class LinearDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWay(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.0
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = lr * 0.5
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testEnd(self):
-    step = 10
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWayWithEnd(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = (lr + end_lr) * 0.5
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEnd(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEndWithCycle(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, cycle=True)
-    expected = (lr - end_lr) * 0.25 + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class SqrtDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWay(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.0
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = lr * 0.5**power
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testEnd(self):
-    step = 10
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWayWithEnd(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = (lr - end_lr) * 0.5**power + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEnd(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEndWithCycle(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power, cycle=True)
-    expected = (lr - end_lr) * 0.25**power + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class PolynomialDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeginWithCycle(self):
-    lr = 0.001
-    decay_steps = 10
-    step = 0
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, decay_steps, cycle=True)
-    expected = lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class ExponentialDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.natural_exp_decay(initial_lr, step, k,
-                                                          decay_rate)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr * math.exp(-i / k * decay_rate)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.natural_exp_decay(
-        initial_lr, step, k, decay_rate, staircase=True)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr * math.exp(-decay_rate * (i // k))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-
-class InverseDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.inverse_time_decay(initial_lr, step, k,
-                                                           decay_rate)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr / (1 + i / k * decay_rate)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.inverse_time_decay(
-        initial_lr, step, k, decay_rate, staircase=True)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr / (1 + decay_rate * (i // k))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-
-class CosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  def np_cosine_decay(self, step, decay_steps, alpha=0.0):
-    step = min(step, decay_steps)
-    completed_fraction = step / decay_steps
-    decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
-    return (1.0 - alpha) * decay + alpha
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
-                                                       num_training_steps)
-      expected = self.np_cosine_decay(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAlpha(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    alpha = 0.1
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
-                                                       num_training_steps,
-                                                       alpha)
-      expected = self.np_cosine_decay(step, num_training_steps, alpha)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class CosineDecayRestartsTestV2(test_util.TensorFlowTestCase):
-
-  def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
-                               alpha=0.0):
-    fac = 1.0
-    while step >= decay_steps:
-      step -= decay_steps
-      decay_steps *= t_mul
-      fac *= m_mul
-
-    completed_fraction = step / decay_steps
-    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
-    return (1.0 - alpha) * decay + alpha
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps)
-      expected = self.np_cosine_decay_restarts(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAlpha(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    alpha = 0.1
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, alpha=alpha)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, alpha=alpha)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMMul(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    m_mul = 0.9
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, m_mul=m_mul)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, m_mul=m_mul)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testTMul(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    t_mul = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, t_mul=t_mul)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, t_mul=t_mul)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class LinearCosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  def np_linear_cosine_decay(self,
-                             step,
-                             decay_steps,
-                             alpha=0.0,
-                             beta=0.001,
-                             num_periods=0.5):
-    step = min(step, decay_steps)
-    linear_decayed = float(decay_steps - step) / decay_steps
-    fraction = 2.0 * num_periods * step / float(decay_steps)
-    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
-    return (alpha + linear_decayed) * cosine_decayed + beta
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDefaultDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
-          initial_lr, step, num_training_steps)
-      expected = self.np_linear_cosine_decay(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNonDefaultDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
-          initial_lr,
-          step,
-          num_training_steps,
-          alpha=0.1,
-          beta=1e-4,
-          num_periods=5)
-      expected = self.np_linear_cosine_decay(
-          step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class NoisyLinearCosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDefaultNoisyLinearCosine(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      # No numerical check because of noise
-      decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-          initial_lr, step, num_training_steps)
-      # Cannot be deterministically tested
-      self.evaluate(decayed_lr())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNonDefaultNoisyLinearCosine(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      # No numerical check because of noise
-      decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-          initial_lr,
-          step,
-          num_training_steps,
-          initial_variance=0.5,
-          variance_decay=0.1,
-          alpha=0.1,
-          beta=1e-4,
-          num_periods=5)
-      # Cannot be deterministically tested
-      self.evaluate(decayed_lr())
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 41a42bd2695eaecaee271409afa03653ca6b4014..6f86d1ce8e83d0d2c32d27aec3ff9e2a162bf979 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import os
 import sys
 
 import six
@@ -41,16 +42,15 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
-
 # The list of exceptions that we should recover from. Exceptions not in this
 # list may terminate the job.
 _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 
-
 # Value that indicates no value was provided.
 USE_DEFAULT = object()
 
@@ -92,8 +92,6 @@ class Scaffold(object):
     from and stored into the `LOCAL_INIT_OP` collection in the graph by default.
   * `summary_op`: An op to run and merge the summaries in the graph.  Picked
     from and stored into the `SUMMARY_OP` collection in the graph by default.
-  * `global_step`: A tensor containing the global step counter.  Picked
-    from and stored into the `GLOBAL_STEP` collection in the graph by default.
 
   You can also pass the following additional pieces to the constructor:
 
@@ -128,25 +126,23 @@ class Scaffold(object):
         a non-empty 1D string tensor listing the names of the non-initialized
         variables.
       ready_for_local_init_op: Optional op to verify that the global variables
-        are initialized and `local_init_op` can be run. Must return an empty
-        1D string tensor when the global variables are initialized, or a
-        non-empty 1D string tensor listing the names of the non-initialized
-        global variables.
+        are initialized and `local_init_op` can be run. Must return an empty 1D
+        string tensor when the global variables are initialized, or a non-empty
+        1D string tensor listing the names of the non-initialized global
+        variables.
       local_init_op: Optional op to initialize local variables.
       summary_op: Optional op to gather all summaries.  Must return a scalar
         string tensor containing a serialized `Summary` proto.
       saver: Optional `tf.train.Saver` object to use to save and restore
-        variables.
-
-        May also be a `tf.train.Checkpoint` object, in which case object-based
-        checkpoints are saved. This will also load some object-based checkpoints
-        saved from elsewhere, but that loading may be fragile since it uses
-        fixed keys rather than performing a full graph-based match. For example
-        if a variable has two paths from the `Checkpoint` object because two
-        `Model` objects share the `Layer` object that owns it, removing one
-        `Model` may change the keys and break checkpoint loading through this
-        API, whereas a graph-based match would match the variable through the
-        other `Model`.
+        variables.  May also be a `tf.train.Checkpoint` object, in which case
+        object-based checkpoints are saved. This will also load some
+        object-based checkpoints saved from elsewhere, but that loading may be
+        fragile since it uses fixed keys rather than performing a full
+        graph-based match. For example if a variable has two paths from the
+        `Checkpoint` object because two `Model` objects share the `Layer` object
+        that owns it, removing one `Model` may change the keys and break
+        checkpoint loading through this API, whereas a graph-based match would
+        match the variable through the other `Model`.
       copy_from_scaffold: Optional scaffold object to copy fields from. Its
         fields will be overwritten by the provided fields in this function.
     """
@@ -187,24 +183,26 @@ class Scaffold(object):
   def finalize(self):
     """Creates operations if needed and finalizes the graph."""
     if self._init_op is None:
+
       def default_init_op():
         return control_flow_ops.group(
             variables.global_variables_initializer(),
             resources.initialize_resources(resources.shared_resources()))
-      self._init_op = Scaffold.get_or_default(
-          'init_op',
-          ops.GraphKeys.INIT_OP,
-          default_init_op)
+
+      self._init_op = Scaffold.get_or_default('init_op', ops.GraphKeys.INIT_OP,
+                                              default_init_op)
     if self._ready_op is None:
+
       def default_ready_op():
         return array_ops.concat([
             variables.report_uninitialized_variables(),
             resources.report_uninitialized_resources()
         ], 0)
+
       self._ready_op = Scaffold.get_or_default(
-          'ready_op', ops.GraphKeys.READY_OP,
-          default_ready_op)
+          'ready_op', ops.GraphKeys.READY_OP, default_ready_op)
     if self._ready_for_local_init_op is None:
+
       def default_ready_for_local_init_op():
         return array_ops.concat([
             variables.report_uninitialized_variables(
@@ -212,6 +210,7 @@ class Scaffold(object):
             resources.report_uninitialized_resources(
                 resources.shared_resources())
         ], 0)
+
       self._ready_for_local_init_op = Scaffold.get_or_default(
           'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
           default_ready_for_local_init_op)
@@ -220,17 +219,16 @@ class Scaffold(object):
           'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
           Scaffold.default_local_init_op)
     if self._summary_op is None:
-      self._summary_op = Scaffold.get_or_default('summary_op',
-                                                 ops.GraphKeys.SUMMARY_OP,
-                                                 summary.merge_all)
+      self._summary_op = Scaffold.get_or_default(
+          'summary_op', ops.GraphKeys.SUMMARY_OP, summary.merge_all)
     # pylint: disable=g-long-lambda
     if self._saver is None:
       self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
     # pylint: enable=g-long-lambda
-    if isinstance(self._saver, checkpointable_util.Checkpoint):
+    if isinstance(self._saver, trackable_util.Checkpoint):
       self._saver = training_saver.Saver(
-          var_list=checkpointable_util.CheckpointableSaver(
-              self._saver).gather_objects(),
+          var_list=graph_view.ObjectGraphView(
+              self._saver).frozen_saveable_objects(),
           sharded=True)
     else:
       self._saver.build()
@@ -277,11 +275,11 @@ class Scaffold(object):
     elements = ops.get_collection(collection_key)
     if elements:
       if len(elements) > 1:
-        raise RuntimeError('More than one item in the collection "%s". '
-                           'Please indicate which one to use by passing it to '
-                           'the tf.Scaffold constructor as:  '
-                           'tf.Scaffold(%s=item to use)', collection_key,
-                           arg_name)
+        raise RuntimeError(
+            'More than one item in the collection "%s". '
+            'Please indicate which one to use by passing it to '
+            'the tf.Scaffold constructor as:  '
+            'tf.Scaffold(%s=item to use)', collection_key, arg_name)
       return elements[0]
     op = default_constructor()
     if op is not None:
@@ -306,52 +304,102 @@ class Scaffold(object):
         resources.initialize_resources(resources.local_resources()))
 
 
-def _create_monitored_session_with_worker_context(worker_context,  # pylint: disable=missing-docstring
-                                                  scaffold,
-                                                  checkpoint_dir=None,
-                                                  hooks=None,
-                                                  chief_only_hooks=None,
-                                                  save_checkpoint_secs=None,
-                                                  save_summaries_steps=None,
-                                                  save_summaries_secs=None,
-                                                  config=None,
-                                                  stop_grace_period_secs=120,
-                                                  log_step_count_steps=100,
-                                                  max_wait_secs=7200,
-                                                  save_checkpoint_steps=None,
-                                                  summary_dir=None):
+def _create_monitored_session_with_worker_context(
+    worker_context,  # pylint: disable=missing-docstring
+    scaffold,
+    checkpoint_dir=None,
+    hooks=None,
+    chief_only_hooks=None,
+    save_checkpoint_secs=None,
+    save_summaries_steps=None,
+    save_summaries_secs=None,
+    config=None,
+    stop_grace_period_secs=120,
+    log_step_count_steps=100,
+    max_wait_secs=7200,
+    save_checkpoint_steps=None,
+    summary_dir=None):
   all_hooks = []
   if hooks:
     all_hooks.extend(hooks)
   if chief_only_hooks and worker_context.is_chief:
     all_hooks.extend(chief_only_hooks)
 
+  # We need to call save or summary ops on all workers since these ops may
+  # contain collective ops, only running save ops on some workers would make
+  # collective ops hang. Therefore on those workers that don't need to actually
+  # write checkpoints or summaries, we let them write to a temp directory.
+  # pylint: disable=protected-access
+  if type(worker_context._strategy).__name__ in ('CollectiveAllReduceStrategy',
+                                                 'MultiWorkerMirroredStrategy'):
+    if worker_context.task_type:
+      tmpdir = 'tmp_%s_%d' % (worker_context.task_type, worker_context.task_id)
+    else:
+      tmpdir = 'tmp'
+
+    if save_checkpoint_secs:
+      logging.warning('Collective ops may deadlock with '
+                      '`save_checkpoints_secs` please use '
+                      '`save_checkpoint_steps` instead. Clearing '
+                      '`save_checkpoint_secs` and setting '
+                      '`save_checkpoint_steps` to 1000 now.')
+      save_checkpoint_secs = None
+      save_checkpoint_steps = 1000
+    if save_summaries_secs:
+      logging.warning('Collective ops may run out of sync with'
+                      '`save_summaries_secs`, please use '
+                      '`save_summaries_steps` instead.')
+  else:
+    tmpdir = None
+
   summary_dir = summary_dir or checkpoint_dir
-  if summary_dir and worker_context.should_save_summary:
-    if log_step_count_steps and log_step_count_steps > 0:
+  if summary_dir and log_step_count_steps and log_step_count_steps > 0:
+    if worker_context.should_save_summary:
       all_hooks.append(
           basic_session_run_hooks.StepCounterHook(
               output_dir=summary_dir, every_n_steps=log_step_count_steps))
+    elif tmpdir:
+      all_hooks.append(
+          basic_session_run_hooks.StepCounterHook(
+              output_dir=os.path.join(summary_dir, tmpdir),
+              every_n_steps=log_step_count_steps))
 
-    if (save_summaries_steps and save_summaries_steps > 0) or (
-        save_summaries_secs and save_summaries_secs > 0):
+  if (((save_summaries_steps and save_summaries_steps > 0) or
+       (save_summaries_secs and save_summaries_secs > 0)) and summary_dir):
+    if worker_context.should_save_summary:
       all_hooks.append(
           basic_session_run_hooks.SummarySaverHook(
               scaffold=scaffold,
               save_steps=save_summaries_steps,
               save_secs=save_summaries_secs,
               output_dir=summary_dir))
-
-  if checkpoint_dir and worker_context.should_checkpoint:
-    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
-        save_checkpoint_steps and save_checkpoint_steps > 0):
+    elif tmpdir:
       all_hooks.append(
-          basic_session_run_hooks.CheckpointSaverHook(
-              checkpoint_dir,
-              save_steps=save_checkpoint_steps,
-              save_secs=save_checkpoint_secs,
-              scaffold=scaffold))
-
+          basic_session_run_hooks.SummarySaverHook(
+              scaffold=scaffold,
+              save_steps=save_summaries_steps,
+              save_secs=save_summaries_secs,
+              output_dir=os.path.join(summary_dir, tmpdir)))
+
+    if (((save_checkpoint_secs and save_checkpoint_secs > 0) or
+         (save_checkpoint_steps and save_checkpoint_steps > 0)) and
+        checkpoint_dir):
+      if worker_context.should_checkpoint:
+        all_hooks.append(
+            basic_session_run_hooks.CheckpointSaverHook(
+                checkpoint_dir,
+                save_steps=save_checkpoint_steps,
+                save_secs=save_checkpoint_secs,
+                scaffold=scaffold))
+      elif tmpdir:
+        all_hooks.append(
+            basic_session_run_hooks.CheckpointSaverHook(
+                os.path.join(checkpoint_dir, tmpdir),
+                save_steps=save_checkpoint_steps,
+                save_secs=save_checkpoint_secs,
+                scaffold=scaffold))
+
+  logging.info('all_hooks %r', all_hooks)
   session_creator = worker_context.session_creator(
       scaffold,
       config=config,
@@ -364,21 +412,22 @@ def _create_monitored_session_with_worker_context(worker_context,  # pylint: dis
 
 
 @tf_export(v1=['train.MonitoredTrainingSession'])
-def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
-                             is_chief=True,
-                             checkpoint_dir=None,
-                             scaffold=None,
-                             hooks=None,
-                             chief_only_hooks=None,
-                             save_checkpoint_secs=USE_DEFAULT,
-                             save_summaries_steps=USE_DEFAULT,
-                             save_summaries_secs=USE_DEFAULT,
-                             config=None,
-                             stop_grace_period_secs=120,
-                             log_step_count_steps=100,
-                             max_wait_secs=7200,
-                             save_checkpoint_steps=USE_DEFAULT,
-                             summary_dir=None):
+def MonitoredTrainingSession(
+    master='',  # pylint: disable=invalid-name
+    is_chief=True,
+    checkpoint_dir=None,
+    scaffold=None,
+    hooks=None,
+    chief_only_hooks=None,
+    save_checkpoint_secs=USE_DEFAULT,
+    save_summaries_steps=USE_DEFAULT,
+    save_summaries_secs=USE_DEFAULT,
+    config=None,
+    stop_grace_period_secs=120,
+    log_step_count_steps=100,
+    max_wait_secs=7200,
+    save_checkpoint_steps=USE_DEFAULT,
+    summary_dir=None):
   """Creates a `MonitoredSession` for training.
 
   For a chief, this utility sets proper session initializer/restorer. It also
@@ -395,8 +444,8 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       initialize or recover the TensorFlow session.
     checkpoint_dir: A string.  Optional path to a directory where to restore
       variables.
-    scaffold: A `Scaffold` used for gathering or building supportive ops. If
-      not specified, a default one is created. It's used to finalize the graph.
+    scaffold: A `Scaffold` used for gathering or building supportive ops. If not
+      specified, a default one is created. It's used to finalize the graph.
     hooks: Optional list of `SessionRunHook` objects.
     chief_only_hooks: list of `SessionRunHook` objects. Activate these hooks if
       `is_chief==True`, ignore otherwise.
@@ -419,17 +468,17 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       `close()` has been called.
     log_step_count_steps: The frequency, in number of global steps, that the
       global step/sec is logged.
-    max_wait_secs: Maximum time workers should wait for the session to
-      become available. This should be kept relatively short to help detect
-      incorrect code, but sometimes may need to be increased if the chief takes
-      a while to start up.
+    max_wait_secs: Maximum time workers should wait for the session to become
+      available. This should be kept relatively short to help detect incorrect
+      code, but sometimes may need to be increased if the chief takes a while to
+      start up.
     save_checkpoint_steps: The frequency, in number of global steps, that a
       checkpoint is saved using a default checkpoint saver. If both
       `save_checkpoint_steps` and `save_checkpoint_secs` are set to `None`, then
       the default checkpoint saver isn't used. If both are provided, then only
       `save_checkpoint_secs` is used. Default not enabled.
-    summary_dir: A string.  Optional path to a directory where to
-      save summaries. If None, checkpoint_dir is used instead.
+    summary_dir: A string.  Optional path to a directory where to save
+      summaries. If None, checkpoint_dir is used instead.
 
   Returns:
     A `MonitoredSession` object.
@@ -498,8 +547,9 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
           basic_session_run_hooks.StepCounterHook(
               output_dir=summary_dir, every_n_steps=log_step_count_steps))
 
-    if (save_summaries_steps and save_summaries_steps > 0) or (
-        save_summaries_secs and save_summaries_secs > 0):
+    if (save_summaries_steps and
+        save_summaries_steps > 0) or (save_summaries_secs and
+                                      save_summaries_secs > 0):
       all_hooks.append(
           basic_session_run_hooks.SummarySaverHook(
               scaffold=scaffold,
@@ -508,8 +558,9 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
               output_dir=summary_dir))
 
   if checkpoint_dir:
-    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
-        save_checkpoint_steps and save_checkpoint_steps > 0):
+    if (save_checkpoint_secs and
+        save_checkpoint_secs > 0) or (save_checkpoint_steps and
+                                      save_checkpoint_steps > 0):
       all_hooks.append(
           basic_session_run_hooks.CheckpointSaverHook(
               checkpoint_dir,
@@ -626,15 +677,16 @@ class WorkerSessionCreator(SessionCreator):
   def create_session(self):
     self._scaffold.finalize()
     return self._get_session_manager().wait_for_session(
-        self._master, config=self._config,
-        max_wait_secs=self._max_wait_secs
-    )
+        self._master, config=self._config, max_wait_secs=self._max_wait_secs)
 
 
 class _MonitoredSession(object):
   """See `MonitoredSession` or `SingularMonitoredSession`."""
 
-  def __init__(self, session_creator, hooks, should_recover,
+  def __init__(self,
+               session_creator,
+               hooks,
+               should_recover,
                stop_grace_period_secs=120):
     """Sets up a Monitored or Hooked Session.
 
@@ -687,44 +739,32 @@ class _MonitoredSession(object):
     Returns:
       Same as `tf.Session.run()`.
     """
-    return self._sess.run(fetches,
-                          feed_dict=feed_dict,
-                          options=options,
-                          run_metadata=run_metadata)
+    return self._sess.run(
+        fetches,
+        feed_dict=feed_dict,
+        options=options,
+        run_metadata=run_metadata)
 
   def run_step_fn(self, step_fn):
     """Run ops using a step function.
 
     Args:
       step_fn: A function or a method with a single argument of type
-        `StepContext`.  The function may use methods of the argument to
-        perform computations with access to a raw session.
-
-        The returned value of the `step_fn` will be returned from `run_step_fn`,
-        unless a stop is requested.  In that case, the next `should_stop` call
-        will return True.
-
-        Example usage:
-
-        ```python
-           with tf.Graph().as_default():
-             c = tf.placeholder(dtypes.float32)
-             v = tf.add(c, 4.0)
-             w = tf.add(c, 0.5)
-
+        `StepContext`.  The function may use methods of the argument to perform
+        computations with access to a raw session.  The returned value of the
+        `step_fn` will be returned from `run_step_fn`, unless a stop is
+        requested.  In that case, the next `should_stop` call will return True.
+        Example usage:  ```python
+           with tf.Graph().as_default(): c = tf.placeholder(dtypes.float32) v =
+             tf.add(c, 4.0) w = tf.add(c, 0.5)
              def step_fn(step_context):
                a = step_context.session.run(fetches=v, feed_dict={c: 0.5})
-               if a <= 4.5:
-                 step_context.request_stop()
+               if a <= 4.5: step_context.request_stop()
                return step_context.run_with_hooks(fetches=w, feed_dict={c: 0.1})
-
              with tf.MonitoredSession() as session:
-               while not session.should_stop():
-                 a = session.run_step_fn(step_fn)
-        ```
-
-        Hooks interact with the `run_with_hooks()` call inside the `step_fn`
-        as they do with a `MonitoredSession.run` call.
+               while not session.should_stop(): a = session.run_step_fn(step_fn)
+                 ```  Hooks interact with the `run_with_hooks()` call inside the
+                 `step_fn` as they do with a `MonitoredSession.run` call.
 
     Returns:
       Returns the returned value of `step_fn`.
@@ -849,7 +889,9 @@ class _MonitoredSession(object):
           ops.get_default_graph()._unsafe_unfinalize()  # pylint: disable=protected-access
 
   def _is_closed(self):
-    """Return True if the monitored session is closed.  For tests only.
+    """Return True if the monitored session is closed.
+
+    For tests only.
 
     Returns:
       A boolean.
@@ -944,10 +986,14 @@ class MonitoredSession(_MonitoredSession):
     A MonitoredSession object.
   """
 
-  def __init__(self, session_creator=None, hooks=None,
+  def __init__(self,
+               session_creator=None,
+               hooks=None,
                stop_grace_period_secs=120):
     super(MonitoredSession, self).__init__(
-        session_creator, hooks, should_recover=True,
+        session_creator,
+        hooks,
+        should_recover=True,
         stop_grace_period_secs=stop_grace_period_secs)
 
 
@@ -1034,7 +1080,9 @@ class SingularMonitoredSession(_MonitoredSession):
         checkpoint_dir=checkpoint_dir,
         checkpoint_filename_with_path=checkpoint_filename_with_path)
     super(SingularMonitoredSession, self).__init__(
-        session_creator, hooks, should_recover=False,
+        session_creator,
+        hooks,
+        should_recover=False,
         stop_grace_period_secs=stop_grace_period_secs)
 
   def raw_session(self):
@@ -1098,9 +1146,10 @@ class _WrappedSession(object):
       try:
         self._sess.close()
       except _PREEMPTION_ERRORS as e:
-        logging.warning('An error occurred when attempting to close the '
-                        'session. This may be due to a preemption in a '
-                        'connected worker or parameter server. Error: %s', e)
+        logging.warning(
+            'An error occurred when attempting to close the '
+            'session. This may be due to a preemption in a '
+            'connected worker or parameter server. Error: %s', e)
       finally:
         self._sess = None
 
@@ -1143,14 +1192,15 @@ class _RecoverableSession(_WrappedSession):
       try:
         return self._sess_creator.create_session()
       except _PREEMPTION_ERRORS as e:
-        logging.info('An error was raised while a session was being created. '
-                     'This may be due to a preemption of a connected worker '
-                     'or parameter server. A new session will be created. '
-                     'This error may also occur due to a gRPC failure caused '
-                     'by high memory or network bandwidth usage in the '
-                     'parameter servers. If this error occurs repeatedly, try '
-                     'increasing the number of parameter servers assigned to '
-                     'the job. Error: %s', e)
+        logging.info(
+            'An error was raised while a session was being created. '
+            'This may be due to a preemption of a connected worker '
+            'or parameter server. A new session will be created. '
+            'This error may also occur due to a gRPC failure caused '
+            'by high memory or network bandwidth usage in the '
+            'parameter servers. If this error occurs repeatedly, try '
+            'increasing the number of parameter servers assigned to '
+            'the job. Error: %s', e)
 
   def _check_stop(self):
     try:
@@ -1159,15 +1209,16 @@ class _RecoverableSession(_WrappedSession):
       else:
         return True
     except _PREEMPTION_ERRORS as e:
-      logging.info('An error was raised while considering whether the '
-                   'session is complete. This may be due to a preemption in '
-                   'a connected worker or parameter server. The current '
-                   'session will be closed and a new session will be '
-                   'created. This error may also occur due to a gRPC failure '
-                   'caused by high memory or network bandwidth usage in the '
-                   'parameter servers. If this error occurs repeatedly, try '
-                   'increasing the number of parameter servers assigned to '
-                   'the job. Error: %s', e)
+      logging.info(
+          'An error was raised while considering whether the '
+          'session is complete. This may be due to a preemption in '
+          'a connected worker or parameter server. The current '
+          'session will be closed and a new session will be '
+          'created. This error may also occur due to a gRPC failure '
+          'caused by high memory or network bandwidth usage in the '
+          'parameter servers. If this error occurs repeatedly, try '
+          'increasing the number of parameter servers assigned to '
+          'the job. Error: %s', e)
       self.close()
       self._sess = self._create_session()
       # Since we have just recreated the session, the overall computation should
@@ -1182,19 +1233,21 @@ class _RecoverableSession(_WrappedSession):
       try:
         if not self._sess:
           self._sess = self._create_session()
-        return self._sess.run(fetches,
-                              feed_dict=feed_dict,
-                              options=options,
-                              run_metadata=run_metadata)
+        return self._sess.run(
+            fetches,
+            feed_dict=feed_dict,
+            options=options,
+            run_metadata=run_metadata)
       except _PREEMPTION_ERRORS as e:
-        logging.info('An error was raised. This may be due to a preemption in '
-                     'a connected worker or parameter server. The current '
-                     'session will be closed and a new session will be '
-                     'created. This error may also occur due to a gRPC failure '
-                     'caused by high memory or network bandwidth usage in the '
-                     'parameter servers. If this error occurs repeatedly, try '
-                     'increasing the number of parameter servers assigned to '
-                     'the job. Error: %s', e)
+        logging.info(
+            'An error was raised. This may be due to a preemption in '
+            'a connected worker or parameter server. The current '
+            'session will be closed and a new session will be '
+            'created. This error may also occur due to a gRPC failure '
+            'caused by high memory or network bandwidth usage in the '
+            'parameter servers. If this error occurs repeatedly, try '
+            'increasing the number of parameter servers assigned to '
+            'the job. Error: %s', e)
         self.close()
         self._sess = None
 
@@ -1207,14 +1260,15 @@ class _RecoverableSession(_WrappedSession):
         run_with_hooks = self._sess.run
         return self._sess.run_step_fn(step_fn, raw_session, run_with_hooks)
       except _PREEMPTION_ERRORS as e:
-        logging.info('An error was raised. This may be due to a preemption in '
-                     'a connected worker or parameter server. The current '
-                     'session will be closed and a new session will be '
-                     'created. This error may also occur due to a gRPC failure '
-                     'caused by high memory or network bandwidth usage in the '
-                     'parameter servers. If this error occurs repeatedly, try '
-                     'increasing the number of parameter servers assigned to '
-                     'the job. Error: %s', e)
+        logging.info(
+            'An error was raised. This may be due to a preemption in '
+            'a connected worker or parameter server. The current '
+            'session will be closed and a new session will be '
+            'created. This error may also occur due to a gRPC failure '
+            'caused by high memory or network bandwidth usage in the '
+            'parameter servers. If this error occurs repeatedly, try '
+            'increasing the number of parameter servers assigned to '
+            'the job. Error: %s', e)
         self.close()
         self._sess = None
 
@@ -1337,11 +1391,12 @@ class _HookedSession(_WrappedSession):
 
     # Do session run.
     run_metadata = run_metadata or config_pb2.RunMetadata()
-    outputs = _WrappedSession.run(self,
-                                  fetches=actual_fetches,
-                                  feed_dict=feed_dict,
-                                  options=options,
-                                  run_metadata=run_metadata)
+    outputs = _WrappedSession.run(
+        self,
+        fetches=actual_fetches,
+        feed_dict=feed_dict,
+        options=options,
+        run_metadata=run_metadata)
 
     for hook in self._hooks:
       hook.after_run(
@@ -1364,9 +1419,8 @@ class _HookedSession(_WrappedSession):
         if request.fetches is not None:
           fetch_dict[hook] = request.fetches
         if request.feed_dict:
-          self._raise_if_feeds_intersects(
-              hook_feeds, request.feed_dict,
-              'Same tensor is fed by two hooks.')
+          self._raise_if_feeds_intersects(hook_feeds, request.feed_dict,
+                                          'Same tensor is fed by two hooks.')
           hook_feeds.update(request.feed_dict)
         if request.options:
           self._merge_run_options(options, request.options)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 6d24f8e17e797cc7e525d7a359010be45ca7b71c..3084dbe300b5bb16c74ee13be5461bba65e9fcaf 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -32,6 +32,7 @@ from tensorflow.contrib.testing.python.framework import util_test
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import debug_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import collective_all_reduce_strategy
 from tensorflow.python.distribute import distribute_coordinator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -487,6 +488,32 @@ class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
     checkpoint = checkpoint_management.latest_checkpoint(logdir)
     self.assertIsNone(checkpoint)
 
+  def test_checkpoint_hook_enable_on_non_chief_with_collective_ops(self):
+    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
+    strategy.extended._is_chief = False
+
+    context = distribute_coordinator._WorkerContext(strategy, None, 'worker', 1)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_disabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+
+    # No checkpoint is saved.
+    checkpoint = checkpoint_management.latest_checkpoint(logdir)
+    self.assertIsNone(checkpoint)
+
+    # But saved to a temporary directory.
+    checkpoint = checkpoint_management.latest_checkpoint(
+        os.path.join(logdir, 'tmp_worker_1'))
+    self.assertIsNotNone(checkpoint)
+
 
 class StopAtNSession(monitored_session._WrappedSession):
   """A wrapped session that stops at the N-th call to _check_stop."""
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 6efcab28c5249fe943f6d4a1b0b6b7866271571f..1bae24f67aaf69951cbb2946fd90ca19362d8669 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -203,7 +203,8 @@ def _zero_debias(unbiased_var, value, decay):
     tensor will also update the shadow variables appropriately.
   """
   with variable_scope.variable_scope(
-      unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
+      unbiased_var.name[:-len(":0")], values=[unbiased_var,
+                                              value, decay]) as scope:
     with ops.colocate_with(unbiased_var):
       with ops.init_scope():
         biased_initializer = init_ops.zeros_initializer(
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 0a7cff4f56207dcfadf095da5e03371730417ad2..889d1119555894f7c73eae51c779ecf030fb1c90 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -35,12 +35,19 @@ from tensorflow.python.training import saver as saver_lib
 
 class MovingAveragesTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
+  @test_util.run_in_graph_and_eager_modes
   def testAssignMovingAverageWithoutZeroDebias(self):
-    with self.cached_session():
-      var = variables.Variable([10.0, 11.0])
-      val = constant_op.constant([1.0, 2.0], dtypes.float32)
-      decay = 0.25
+    var = variables.Variable([10.0, 11.0])
+    val = constant_op.constant([1.0, 2.0], dtypes.float32)
+    decay = 0.25
+    if context.executing_eagerly():
+      self.assertAllClose([10.0, 11.0], self.evaluate(var))
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      self.assertAllClose(
+          [10.0 * 0.25 + 1.0 * (1.0 - 0.25), 11.0 * 0.25 + 2.0 * (1.0 - 0.25)],
+          self.evaluate(var))
+    else:
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
       self.evaluate(variables.global_variables_initializer())
@@ -50,19 +57,26 @@ class MovingAveragesTest(test.TestCase):
           [10.0 * 0.25 + 1.0 * (1.0 - 0.25), 11.0 * 0.25 + 2.0 * (1.0 - 0.25)],
           self.evaluate(var))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_in_graph_and_eager_modes
   def testAssignMovingAverage(self):
     with self.cached_session():
       var = variables.Variable([0.0, 0.0])
       val = constant_op.constant([1.0, 2.0], dtypes.float32)
       decay = 0.25
-      assign = moving_averages.assign_moving_average(var, val, decay)
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllClose([0.0, 0.0], self.evaluate(var))
-      assign.op.run()
-      self.assertAllClose(
-          [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
-          self.evaluate(var))
+      if context.executing_eagerly():
+        self.assertAllClose([0.0, 0.0], self.evaluate(var))
+        assign = moving_averages.assign_moving_average(var, val, decay)
+        self.assertAllClose(
+            [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
+            self.evaluate(var))
+      else:
+        assign = moving_averages.assign_moving_average(var, val, decay)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var))
+        assign.op.run()
+        self.assertAllClose(
+            [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
+            self.evaluate(var))
 
   @test_util.run_deprecated_v1
   def testAssignMovingAverageNewNamingMultipleCalls(self):
@@ -143,7 +157,6 @@ class MovingAveragesTest(test.TestCase):
       denominator_2 = denominator_1 * decay + weight_2 * (1.0 - decay)
       self.assertAllClose(bfloat16(numerator_2 / denominator_2), wma_array)
 
-
 def _Repeat(value, dim):
   if dim == 1:
     return value
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 3742ebb807f4c245aef956144f7888d7b0560375..4361f07e196050c87338d0f7102f530d2c2c9be7 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -40,7 +39,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -215,10 +214,10 @@ def _get_processor(v):
 
 @tf_export(v1=["train.Optimizer"])
 class Optimizer(
-    # Optimizers inherit from CheckpointableBase rather than Checkpointable
+    # Optimizers inherit from Trackable rather than AutoTrackable
     # since they do most of their dependency management themselves (slot
     # variables are special-cased, and non-slot variables are keyed to graphs).
-    checkpointable.Checkpointable):
+    trackable.Trackable):
   """Base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -334,9 +333,9 @@ class Optimizer(
     #   ... }
     self._slots = {}
     self._non_slot_dict = {}
-    # For implementing Checkpointable. Stores information about how to restore
+    # For implementing Trackable. Stores information about how to restore
     # slot variables which have not yet been created
-    # (checkpointable._CheckpointPosition objects).
+    # (trackable._CheckpointPosition objects).
     #  {slot_name :
     #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
     #   ... }
@@ -461,12 +460,6 @@ class Optimizer(
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss if using a "mean" loss reduction and multiple replicas.
-        # Have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
-        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-        loss_value = self._scale_loss(loss_value)
-
       if var_list is None:
         var_list = tape.watched_variables()
       # TODO(jhseu): Figure out why GradientTape's gradients don't require loss
@@ -481,9 +474,6 @@ class Optimizer(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
 
-    # Scale loss if using a "mean" loss reduction and multiple replicas.
-    loss = self._scale_loss(loss)
-
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -518,14 +508,6 @@ class Optimizer(
          if g is not None and v.dtype != dtypes.resource])
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
-
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -814,7 +796,7 @@ class Optimizer(
     key = (name, graph)
     v = self._non_slot_dict.get(key, None)
     if v is None:
-      self._maybe_initialize_checkpointable()
+      self._maybe_initialize_trackable()
       distribution_strategy = distribute_ctx.get_strategy()
       with distribution_strategy.extended.colocate_vars_with(colocate_with):
         if eager:
@@ -827,19 +809,19 @@ class Optimizer(
             use_resource=resource_variable_ops.is_resource_variable(
                 colocate_with))
       # Restore this variable by name if necessary, but don't add a
-      # Checkpointable dependency. Optimizers return the current graph's
+      # Trackable dependency. Optimizers return the current graph's
       # non-slot variables from _checkpoint_dependencies explicitly rather
       # than unconditionally adding dependencies (since there may be multiple
       # non-slot variables with the same name in different graphs, trying to
       # save all of them would result in errors).
-      self._handle_deferred_dependencies(name=name, checkpointable=v)
+      self._handle_deferred_dependencies(name=name, trackable=v)
       self._non_slot_dict[key] = v
 
     return v
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     current_graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     for (name, _), variable_object in sorted(self._non_slot_dict.items(),
@@ -847,13 +829,13 @@ class Optimizer(
                                              key=lambda item: item[0][0]):
       if variable_object._graph_key == current_graph_key:  # pylint: disable=protected-access
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     return (super(Optimizer, self)._checkpoint_dependencies
             + current_graph_non_slot_variables)
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     unconditional = super(Optimizer, self)._lookup_dependency(name)
     if unconditional is not None:
       return unconditional
@@ -1158,7 +1140,7 @@ class Optimizer(
     return named_slots[_var_key(var)]
 
   # --------------
-  # For implementing the Checkpointable interface.
+  # For implementing the Trackable interface.
   # --------------
 
   def _restore_slot_variable(self, slot_name, variable, slot_variable):
@@ -1189,8 +1171,8 @@ class Optimizer(
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
@@ -1208,7 +1190,7 @@ class Optimizer(
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self._get_or_make_slot(
           var=variable,
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index e175b5a79989e4c7b6b4c736eefe0250e9ebbcc9..ac831cb6422f8995b81c81e86f038041e4fb2567 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -24,7 +24,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -75,7 +75,7 @@ class OptimizerTest(test.TestCase):
         opt_op = sgd_op.minimize(
             cost,
             global_step, [var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod.
+            aggregation_method=gradients_util.AggregationMethod.
             EXPERIMENTAL_ACCUMULATE_N)
 
         variables.global_variables_initializer().run()
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 369b6cbb50e5c621737c095a24eeb473f3870534..6eca0e6cb5f32a34b178c14c9fe86d00fdd0fdfe 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -27,7 +27,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.ProximalGradientDescentOptimizer")
+@tf_export(v1=["train.ProximalGradientDescentOptimizer"])
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 348b8bf1ef0a89a971eb26c9cb7e5f9d01c51a4b..215fc3965248c6582993bae3844cc0e3067acce3 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -17,7 +17,7 @@
 """Save and restore variables.
 
 Symbols in this file are deprecated. See replacements in
-tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+tensorflow/python/training/trackable and tensorflow/python/training/saving.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,10 +29,9 @@ import time
 import uuid
 
 import numpy as np
-
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
@@ -51,9 +50,9 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -1078,7 +1077,8 @@ class Saver(object):
            meta_graph_suffix="meta",
            write_meta_graph=True,
            write_state=True,
-           strip_default_attrs=False):
+           strip_default_attrs=False,
+           save_debug_info=False):
     # pylint: disable=line-too-long
     """Saves variables.
 
@@ -1108,6 +1108,10 @@ class Saver(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+        which in the same directory of save_path and with `_debug` added before
+        the file extension. This is only enabled when `write_meta_graph` is
+        `True`
 
     Returns:
       A string: path prefix used for the checkpoint files.  If the saver is
@@ -1193,7 +1197,8 @@ class Saver(object):
       if not context.executing_eagerly():
         with sess.graph.as_default():
           self.export_meta_graph(
-              meta_graph_filename, strip_default_attrs=strip_default_attrs)
+              meta_graph_filename, strip_default_attrs=strip_default_attrs,
+              save_debug_info=save_debug_info)
 
     if self._is_empty:
       return None
@@ -1207,7 +1212,8 @@ class Saver(object):
                         export_scope=None,
                         clear_devices=False,
                         clear_extraneous_savers=False,
-                        strip_default_attrs=False):
+                        strip_default_attrs=False,
+                        save_debug_info=False):
     # pylint: disable=line-too-long
     """Writes `MetaGraphDef` to save_path/filename.
 
@@ -1224,6 +1230,9 @@ class Saver(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+        which in the same directory of filename and with `_debug` added before
+        the file extension.
 
     Returns:
       A `MetaGraphDef` proto.
@@ -1238,7 +1247,8 @@ class Saver(object):
         export_scope=export_scope,
         clear_devices=clear_devices,
         clear_extraneous_savers=clear_extraneous_savers,
-        strip_default_attrs=strip_default_attrs)
+        strip_default_attrs=strip_default_attrs,
+        save_debug_info=save_debug_info)
 
   def restore(self, sess, save_path):
     """Restores previously saved variables.
@@ -1497,6 +1507,7 @@ def export_meta_graph(filename=None,
                       clear_devices=False,
                       clear_extraneous_savers=False,
                       strip_default_attrs=False,
+                      save_debug_info=False,
                       **kwargs):
   # pylint: disable=line-too-long
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
@@ -1527,6 +1538,9 @@ def export_meta_graph(filename=None,
     strip_default_attrs: Boolean. If `True`, default-valued attributes will be
       removed from the NodeDefs. For a detailed guide, see
       [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+    save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+      which in the same directory of filename and with `_debug` added before
+      the file extend.
     **kwargs: Optional keyed arguments.
 
   Returns:
@@ -1537,12 +1551,13 @@ def export_meta_graph(filename=None,
     RuntimeError: If called with eager execution enabled.
 
   @compatibility(eager)
-  Exporting/importing meta graphs is not supported. No graph exists when eager
-  execution is enabled.
+  Exporting/importing meta graphs is not supported unless both `graph_def` and
+  `graph` are provided. No graph exists when eager execution is enabled.
   @end_compatibility
   """
   # pylint: enable=line-too-long
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
                        "execution is enabled.")
@@ -1558,6 +1573,7 @@ def export_meta_graph(filename=None,
       clear_devices=clear_devices,
       clear_extraneous_savers=clear_extraneous_savers,
       strip_default_attrs=strip_default_attrs,
+      save_debug_info=save_debug_info,
       **kwargs)
   return meta_graph_def
 
@@ -1588,9 +1604,9 @@ def object_graph_key_mapping(checkpoint_path):
   """
   reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
   object_graph_string = reader.get_tensor(
-      checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      trackable.OBJECT_GRAPH_PROTO_KEY)
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   names_to_keys = {}
   for node in object_graph_proto.nodes:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index dec23c50e8c069d4f2dd18c49ecdabb447f4872b..9b2a1da7c29723b589b67484bd2e1d880ef1363d 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -73,9 +73,9 @@ from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable_tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable_tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import compat
 
 
@@ -2775,15 +2775,15 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(2.0, self.evaluate(var_dict2["variable2:0"]))
 
 
-class _OwnsAVariableSimple(checkpointable_base.Checkpointable):
-  """A Checkpointable object which can be saved using a tf.train.Saver."""
+class _OwnsAVariableSimple(trackable_base.Trackable):
+  """A Trackable object which can be saved using a tf.train.Saver."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
         name="non_dep_variable", initializer=6., use_resource=True)
 
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
+    return {trackable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2808,8 +2808,8 @@ class _MirroringSaveable(
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(trackable_base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -2823,7 +2823,7 @@ class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
           primary_variable=self.non_dep_variable,
           mirrored_variable=self.mirrored,
           name=name)
-    return {checkpointable_base.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable_base.VARIABLE_VALUE_KEY: _saveable_factory}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2831,11 +2831,11 @@ class _OwnsMirroredVariables(checkpointable_base.Checkpointable):
     return self.non_dep_variable.name
 
 
-class NonLayerCheckpointable(checkpointable_tracking.AutoCheckpointable):
+class NonLayerTrackable(trackable_tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -2846,19 +2846,19 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
     return ret
 
 
-class CheckpointableCompatibilityTests(test.TestCase):
+class TrackableCompatibilityTests(test.TestCase):
 
   # TODO(allenl): Track down python3 reference cycles in these tests.
   @test_util.run_in_graph_and_eager_modes
-  def testNotSaveableButIsCheckpointable(self):
+  def testNotSaveableButIsTrackable(self):
     v = _OwnsAVariableSimple()
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
@@ -2923,13 +2923,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -2938,24 +2938,24 @@ class CheckpointableCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta1_power))
 
   def testVariableNotFoundErrorRaised(self):
@@ -3012,13 +3012,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
       # An incompatible object-based checkpoint to check error messages
       var = resource_variable_ops.ResourceVariable(1., name="a")
       self.evaluate(var.initializer)
-      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_saver = trackable_utils.Checkpoint(v=var)
       second_path = second_saver.save(file_prefix=os.path.join(
           checkpoint_directory, "second"))
 
@@ -3046,7 +3046,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph):
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
     with context.eager_mode():
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
index 67ccd59b88c289a11791c9098a2014c48e6c33fb..5577a2b4ae80325cffcb5879764dd7f953b045a1 100644
--- a/tensorflow/python/training/saving/BUILD
+++ b/tensorflow/python/training/saving/BUILD
@@ -34,6 +34,7 @@ cuda_py_test(
         ":functional_saver",
         "//tensorflow/python/eager:test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -49,7 +50,7 @@ py_library(
     deps = [
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
index b8cc66249bffd7c5c21280969a5d27b8c3b89da7..eeec19e5886d48828b85ab7aa6931db38a561613 100644
--- a/tensorflow/python/training/saving/saveable_object_util.py
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -26,8 +26,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.tracking import base as trackable
 
 
 # Op names which identify variable reads which should be saved.
@@ -137,7 +137,7 @@ def saveable_objects_for_op(op, name):
   if not isinstance(name, six.string_types):
     raise TypeError(
         "names_to_saveables must be a dict mapping string names to "
-        "checkpointable operations. Name is not a string: %s" % name)
+        "trackable operations. Name is not a string: %s" % name)
   if isinstance(op, saveable_object.SaveableObject):
     yield op
   elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
@@ -165,11 +165,11 @@ def saveable_objects_for_op(op, name):
         yield ResourceVariableSaveable(
             variable, variable._save_slice_info.spec, name)
     # pylint: enable=protected-access
-  elif isinstance(op, checkpointable.Checkpointable) and not isinstance(
+  elif isinstance(op, trackable.Trackable) and not isinstance(
       op, variables.Variable):
     # pylint: disable=protected-access
     for attr, factory in op._gather_saveables_for_checkpoint().items():
-      if attr == checkpointable.VARIABLE_VALUE_KEY:
+      if attr == trackable.VARIABLE_VALUE_KEY:
         # Keep original name for classes masquerading as variables.
         full_name = name
       else:
@@ -250,15 +250,18 @@ def op_list_to_dict(op_list, convert_variable_to_tensor=True):
         names_to_saveables[name].append(var)
       else:
         names_to_saveables[name] = [var]
-    elif (isinstance(var, checkpointable.Checkpointable)
+    elif (isinstance(var, trackable.Trackable)
           and not isinstance(var, variables.Variable)):
-      checkpointable_saveables = [
+      trackable_saveables = [
           (factory() if callable(factory) else factory)
           for factory in var._gather_saveables_for_checkpoint().values()]
       names_to_saveables.update(
-          op_list_to_dict(checkpointable_saveables))
+          op_list_to_dict(trackable_saveables))
     else:
-      if context.executing_eagerly():
+      # Variables (reference and resource) have an _in_graph_mode property
+      # indicating whether they were created in a graph building context. We
+      # also get Tensors when graph building, which do not have this property.
+      if not getattr(var, "_in_graph_mode", True):
         if not isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(
               "Can only save/restore ResourceVariables when eager execution "
@@ -323,7 +326,7 @@ def validate_and_slice_inputs(names_to_saveables):
 
   Raises:
     TypeError: If any of the keys are not strings or any of the
-      values are not one of Tensor or Variable or a checkpointable operation.
+      values are not one of Tensor or Variable or a trackable operation.
     ValueError: If the same operation is given in more than one value
       (this also applies to slices of SlicedVariables).
   """
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/tracking/BUILD
similarity index 82%
rename from tensorflow/python/training/checkpointable/BUILD
rename to tensorflow/python/training/tracking/BUILD
index a39462732f591cb49bb4ee07a45a9efe732f589e..2c6623e33192e86b9f336be857bb78984289dbe7 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/tracking/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/saving:saveable_object",
+        "@six_archive//:six",
     ],
 )
 
@@ -92,6 +93,33 @@ tf_py_test(
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
     ],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+)
+
+py_library(
+    name = "object_identity",
+    srcs = ["object_identity.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "graph_view",
+    srcs = ["graph_view.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+        ":object_identity",
+        ":tracking",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
+    ],
 )
 
 py_library(
@@ -101,17 +129,19 @@ py_library(
     deps = [
         ":base",
         ":data_structures",
+        ":graph_view",
+        ":object_identity",
         ":tracking",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
+        "//tensorflow/python:lib",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
@@ -122,7 +152,6 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/training/saving:functional_saver",
-        "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
@@ -132,10 +161,12 @@ tf_py_test(
     srcs = ["util_test.py"],
     additional_deps = [
         ":base",
+        ":graph_view",
         ":tracking",
         ":util",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
+        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -149,8 +180,8 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:template",
-        "//tensorflow/python:training",
         "//tensorflow/python:training_util",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
@@ -158,6 +189,7 @@ tf_py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
+        "//tensorflow/python:variables",
     ],
     tags = ["notsan"],  # b/74395663
 )
@@ -167,6 +199,7 @@ tf_xla_py_test(
     srcs = ["util_xla_test.py"],
     tags = [
         "no_pip",
+        "no_rocm",
         "nomac",
         "notsan",  # b/74395663
     ],
@@ -190,6 +223,7 @@ tf_py_test(
     srcs = ["util_with_v1_optimizers_test.py"],
     additional_deps = [
         ":base",
+        ":graph_view",
         ":tracking",
         ":util",
         "@absl_py//absl/testing:parameterized",
@@ -218,5 +252,29 @@ tf_py_test(
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
     ],
-    tags = ["notsan"],  # b/74395663
+    tags = [
+        "no_windows",  # b/124401331
+        "notsan",  # b/74395663
+    ],
+)
+
+py_library(
+    name = "python_state",
+    srcs = ["python_state.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+    ],
+)
+
+tf_py_test(
+    name = "python_state_test",
+    srcs = ["python_state_test.py"],
+    additional_deps = [
+        ":base",
+        ":util",
+        "//tensorflow/python/module",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
 )
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/tracking/base.py
similarity index 81%
rename from tensorflow/python/training/checkpointable/base.py
rename to tensorflow/python/training/tracking/base.py
index 8257693055d0508c223eab8aeb6ff2e291515d4d..f1f1fcba7c15056dc2435e67da1adcae5d1822a6 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/tracking/base.py
@@ -44,18 +44,18 @@ OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 
 
 # A key indicating a variable's value in an object's checkpointed Tensors
-# (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
+# (Trackable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
 # creation (avoiding double assignment when executing eagerly).
 VARIABLE_VALUE_KEY = "VARIABLE_VALUE"
 OBJECT_CONFIG_JSON_KEY = "OBJECT_CONFIG_JSON"
 
-CheckpointableReference = collections.namedtuple(
-    "CheckpointableReference",
+TrackableReference = collections.namedtuple(
+    "TrackableReference",
     [
         # The local name for this dependency.
         "name",
-        # The Checkpointable object being referenced.
+        # The Trackable object being referenced.
         "ref"
     ])
 
@@ -187,34 +187,34 @@ class PythonStringStateSaveable(PythonStateSaveable):
     return control_flow_ops.no_op()
 
 
-class _CheckpointPosition(object):
-  """Indicates a position within a `_Checkpoint`."""
+class CheckpointPosition(object):
+  """Indicates a position within a `_CheckpointRestoreCoordinator`."""
 
   def __init__(self, checkpoint, proto_id):
     """Specify an object within a checkpoint.
 
     Args:
-      checkpoint: A _Checkpoint object.
-      proto_id: The index of this object in CheckpointableObjectGraph.nodes.
+      checkpoint: A _CheckpointRestoreCoordinator object.
+      proto_id: The index of this object in TrackableObjectGraph.nodes.
     """
     self._checkpoint = checkpoint
     self._proto_id = proto_id
 
-  def restore(self, checkpointable):
-    """Restore this value into `checkpointable`."""
+  def restore(self, trackable):
+    """Restore this value into `trackable`."""
     with ops.init_scope():
-      if self.bind_object(checkpointable):
+      if self.bind_object(trackable):
         # This object's correspondence with a checkpointed object is new, so
         # process deferred restorations for it and its dependencies.
-        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        restore_ops = trackable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
         if restore_ops:
           self._checkpoint.new_restore_ops(restore_ops)
 
-  def bind_object(self, checkpointable):
+  def bind_object(self, trackable):
     """Set a checkpoint<->object correspondence and process slot variables.
 
     Args:
-      checkpointable: The object to record a correspondence for.
+      trackable: The object to record a correspondence for.
     Returns:
       True if this is a new assignment, False if this object has already been
       mapped to a checkpointed `Object` proto.
@@ -222,14 +222,14 @@ class _CheckpointPosition(object):
       AssertionError: If another object is already bound to the `Object` proto.
     """
     checkpoint = self.checkpoint
-    checkpoint.all_python_objects.add(checkpointable)
+    checkpoint.all_python_objects.add(trackable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
     if current_assignment is None:
-      checkpoint.object_by_proto_id[self._proto_id] = checkpointable
+      checkpoint.object_by_proto_id[self._proto_id] = trackable
       for deferred_slot_restoration in (
           checkpoint.deferred_slot_restorations.pop(self._proto_id, ())):
-        checkpointable._create_or_restore_slot_variable(  # pylint: disable=protected-access
-            slot_variable_position=_CheckpointPosition(
+        trackable._create_or_restore_slot_variable(  # pylint: disable=protected-access
+            slot_variable_position=CheckpointPosition(
                 checkpoint=checkpoint,
                 proto_id=deferred_slot_restoration.slot_variable_id),
             variable=deferred_slot_restoration.original_variable,
@@ -244,15 +244,15 @@ class _CheckpointPosition(object):
           checkpoint.deferred_slot_restorations.setdefault(
               slot_restoration.optimizer_id, []).append(
                   _DeferredSlotVariableRestoration(
-                      original_variable=checkpointable,
+                      original_variable=trackable,
                       slot_variable_id=slot_restoration.slot_variable_id,
                       slot_name=slot_restoration.slot_name))
         else:
           optimizer_object._create_or_restore_slot_variable(  # pylint: disable=protected-access
-              slot_variable_position=_CheckpointPosition(
+              slot_variable_position=CheckpointPosition(
                   checkpoint=checkpoint,
                   proto_id=slot_restoration.slot_variable_id),
-              variable=checkpointable,
+              variable=trackable,
               slot_name=slot_restoration.slot_name)
       return True  # New assignment
     else:
@@ -260,14 +260,14 @@ class _CheckpointPosition(object):
       # we don't need to do anything besides check that the mapping is
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
-      if current_assignment is not checkpointable:
+      if current_assignment is not trackable:
         logging.warning(
             ("Inconsistent references when loading the checkpoint into this "
-             "object graph. Either the Checkpointable object references in the "
+             "object graph. Either the Trackable object references in the "
              "Python program have changed in an incompatible way, or the "
              "checkpoint was generated in an incompatible program.\n\nTwo "
              "checkpoint references resolved to different objects (%s and %s).")
-            % (current_assignment, checkpointable))
+            % (current_assignment, trackable))
       return False  # Not a new assignment
 
   def is_simple_variable(self):
@@ -306,7 +306,7 @@ class _CheckpointPosition(object):
 
   def _gather_ops_or_named_saveables(self):
     """Looks up or creates SaveableObjects which don't have cached ops."""
-    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
+    saveables = self.trackable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
     # Name saveables based on the name this object had when it was checkpointed.
     named_saveables = {}
     python_saveables = []
@@ -325,15 +325,16 @@ class _CheckpointPosition(object):
       # the SaveableObject itself has been cached. If not, we'll make it, and
       # either way we'll extract new ops from it (or if it has Python state to
       # restore, we'll run that).
-      if self._checkpoint.saveable_object_cache is None:
+      saveables_cache = self._checkpoint.graph_view.saveables_cache
+      if saveables_cache is None:
         # No SaveableObject caching when executing eagerly.
         saveable = None
       else:
         # If we've already created and cached a SaveableObject for this
         # attribute, we can re-use it to avoid re-creating some ops when graph
         # building.
-        saveable_list = self._checkpoint.saveable_object_cache.get(
-            self.checkpointable, {}).get(serialized_tensor.name, (None,))
+        saveable_list = saveables_cache.get(
+            self.trackable, {}).get(serialized_tensor.name, (None,))
         if len(saveable_list) == 1:
           # Almost every attribute will have exactly one SaveableObject.
           saveable, = saveable_list
@@ -347,7 +348,7 @@ class _CheckpointPosition(object):
         # the SaveableObject.
         if serialized_tensor.checkpoint_key not in saveable.name:
           saveable = None
-          del self._checkpoint.saveable_object_cache[self.checkpointable]
+          del saveables_cache[self.trackable]
           break
       if saveable is None:
         # If there was no cached SaveableObject, we should check if the Python
@@ -360,15 +361,15 @@ class _CheckpointPosition(object):
           # checkpoint was loaded.
           if not serialized_tensor.optional_restore:
             self._checkpoint.unused_attributes.setdefault(
-                self.checkpointable, []).append(serialized_tensor.name)
+                self.trackable, []).append(serialized_tensor.name)
           continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
         else:
           saveable = saveable_factory
-        if self._checkpoint.saveable_object_cache is not None:
-          self._checkpoint.saveable_object_cache.setdefault(
-              self.checkpointable, {})[serialized_tensor.name] = [saveable]
+        if saveables_cache is not None:
+          saveables_cache.setdefault(
+              self.trackable, {})[serialized_tensor.name] = [saveable]
       if isinstance(saveable, PythonStateSaveable):
         python_saveables.append(saveable)
       else:
@@ -378,7 +379,7 @@ class _CheckpointPosition(object):
   def restore_ops(self):
     """Create or fetch restore ops for this object's attributes.
 
-    Requires that the `Checkpointable` Python object has been bound to an object
+    Requires that the `Trackable` Python object has been bound to an object
     ID in the checkpoint.
 
     Returns:
@@ -397,7 +398,7 @@ class _CheckpointPosition(object):
     return self._checkpoint
 
   @property
-  def checkpointable(self):
+  def trackable(self):
     return self._checkpoint.object_by_proto_id[self._proto_id]
 
   @property
@@ -435,11 +436,11 @@ _SlotVariableRestoration = collections.namedtuple(
 def no_automatic_dependency_tracking(method):
   """Disables automatic dependency tracking on attribute assignment.
 
-  Use to decorate any method of a Checkpointable object. Attribute assignment in
+  Use to decorate any method of a Trackable object. Attribute assignment in
   that method will not add dependencies (also respected in Model). Harmless if
   used in a class which does not do automatic dependency tracking (which means
   it's safe to use in base classes which may have subclasses which also inherit
-  from Checkpointable).
+  from Trackable).
 
   Args:
     method: The method to decorate.
@@ -452,46 +453,47 @@ def no_automatic_dependency_tracking(method):
     previous_value = getattr(self, "_setattr_tracking", True)
     self._setattr_tracking = False  # pylint: disable=protected-access
     try:
-      method(self, *args, **kwargs)
+      result = method(self, *args, **kwargs)
     finally:
       self._setattr_tracking = previous_value  # pylint: disable=protected-access
+    return result
 
   return tf_decorator.make_decorator(
       target=method, decorator_func=_method_wrapper)
 
 
-class Checkpointable(object):
-  """Base class for `Checkpointable` objects without automatic dependencies.
+class Trackable(object):
+  """Base class for `Trackable` objects without automatic dependencies.
 
   This class has no __setattr__ override for performance reasons. Dependencies
   must be added explicitly. Unless attribute assignment is performance-critical,
-  use `AutoCheckpointable` instead. Use `Checkpointable` for `isinstance`
+  use `AutoTrackable` instead. Use `Trackable` for `isinstance`
   checks.
   """
 
-  # Checkpointable does not do automatic dependency tracking, but uses the
+  # Trackable does not do automatic dependency tracking, but uses the
   # no_automatic_dependency_tracking decorator so it can avoid adding
-  # dependencies if a subclass is Checkpointable / inherits from Model (both of
+  # dependencies if a subclass is Trackable / inherits from Model (both of
   # which have __setattr__ overrides).
   @no_automatic_dependency_tracking
-  def _maybe_initialize_checkpointable(self):
+  def _maybe_initialize_trackable(self):
     """Initialize dependency management.
 
     Not __init__, since most objects will forget to call it.
     """
     if hasattr(self, "_unconditional_checkpoint_dependencies"):
       # __init__ already called. This check means that we don't need
-      # Checkpointable.__init__() in the constructor of every TensorFlow object.
+      # Trackable.__init__() in the constructor of every TensorFlow object.
       return
-    # A list of CheckpointableReference objects. Some classes implementing
-    # `Checkpointable`, notably `Optimizer`s, may override the
+    # A list of TrackableReference objects. Some classes implementing
+    # `Trackable`, notably `Optimizer`s, may override the
     # _checkpoint_dependencies property with conditional dependencies
     # (e.g. based on the current graph when saving).
     self._unconditional_checkpoint_dependencies = []
-    # Maps names -> Checkpointable objects
+    # Maps names -> Trackable objects
     self._unconditional_dependency_names = {}
-    # Restorations for other Checkpointable objects on which this object may
-    # eventually depend. Maps local name -> _CheckpointPosition list. Optimizers
+    # Restorations for other Trackable objects on which this object may
+    # eventually depend. Maps local name -> CheckpointPosition list. Optimizers
     # tack on conditional dependencies, and so need separate management of
     # deferred dependencies too.
     self._unconditional_deferred_dependencies = {}
@@ -529,8 +531,8 @@ class Checkpointable(object):
     May be overridden to include conditional dependencies.
 
     Returns:
-      A list of `CheckpointableReference` objects indicating named
-      `Checkpointable` dependencies which should be saved along with this
+      A list of `TrackableReference` objects indicating named
+      `Trackable` dependencies which should be saved along with this
       object.
     """
     return self._unconditional_checkpoint_dependencies
@@ -539,13 +541,13 @@ class Checkpointable(object):
   def _deferred_dependencies(self):
     """A dictionary with deferred dependencies.
 
-    Stores restorations for other Checkpointable objects on which this object
+    Stores restorations for other Trackable objects on which this object
     may eventually depend. May be overridden by sub-classes (e.g. Optimizers use
     conditional dependencies based the current graph, and so need separate
     management of deferred dependencies too).
 
     Returns:
-      A dictionary mapping from local name to a list of _CheckpointPosition
+      A dictionary mapping from local name to a list of CheckpointPosition
       objects.
     """
     return self._unconditional_deferred_dependencies
@@ -558,7 +560,7 @@ class Checkpointable(object):
     Args:
       name: The local name of the dependency.
     Returns:
-      A `Checkpointable` object, or `None` if no dependency by this name was
+      A `Trackable` object, or `None` if no dependency by this name was
       found.
     """
     return self._unconditional_dependency_names.get(name, None)
@@ -567,9 +569,9 @@ class Checkpointable(object):
       self, name, shape=None, dtype=dtypes.float32,
       initializer=None, getter=None, overwrite=False,
       **kwargs_for_getter):
-    """Restore-on-create for a variable be saved with this `Checkpointable`.
+    """Restore-on-create for a variable be saved with this `Trackable`.
 
-    If the user has requested that this object or another `Checkpointable` which
+    If the user has requested that this object or another `Trackable` which
     depends on this object be restored from a checkpoint (deferred loading
     before variable object creation), `initializer` may be ignored and the value
     from the checkpoint used instead.
@@ -591,7 +593,7 @@ class Checkpointable(object):
     Raises:
       ValueError: If the variable name is not unique.
     """
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     with ops.init_scope():
       if context.executing_eagerly():
         # If this is a variable with a single Tensor stored in the checkpoint,
@@ -607,11 +609,11 @@ class Checkpointable(object):
               isinstance(initializer, CheckpointInitialValue)
               and (initializer.restore_uid
                    > checkpoint_initializer.restore_uid))):
-        # If multiple Checkpointable objects are "creating" the same variable
+        # If multiple Trackable objects are "creating" the same variable
         # via the magic of custom getters, the one with the highest restore UID
         # (the one called last) has to make the final initializer. If another
         # custom getter interrupts this process by overwriting the initializer,
-        # then we'll catch that when we call _track_checkpointable. So this is
+        # then we'll catch that when we call _track_trackable. So this is
         # "best effort" to set the initializer with the highest restore UID.
         initializer = checkpoint_initializer
         shape = None
@@ -623,12 +625,12 @@ class Checkpointable(object):
     # assign again. It will add this variable to our dependencies, and if there
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
-    if not overwrite or isinstance(new_variable, Checkpointable):
-      return self._track_checkpointable(new_variable, name=name,
-                                        overwrite=overwrite)
+    if not overwrite or isinstance(new_variable, Trackable):
+      return self._track_trackable(new_variable, name=name,
+                                   overwrite=overwrite)
     else:
       # TODO(allenl): Some variable types are not yet supported. Remove this
-      # fallback once all get_variable() return types are Checkpointable.
+      # fallback once all get_variable() return types are Trackable.
       return new_variable
 
   def _preload_simple_restoration(self, name, shape):
@@ -667,46 +669,46 @@ class Checkpointable(object):
     return CheckpointInitialValue(
         checkpoint_position=checkpoint_position, shape=shape)
 
-  def _track_checkpointable(self, checkpointable, name, overwrite=False):
-    """Declare a dependency on another `Checkpointable` object.
+  def _track_trackable(self, trackable, name, overwrite=False):
+    """Declare a dependency on another `Trackable` object.
 
     Indicates that checkpoints for this object should include variables from
-    `checkpointable`.
+    `trackable`.
 
-    Variables in a checkpoint are mapped to `Checkpointable`s based on the names
+    Variables in a checkpoint are mapped to `Trackable`s based on the names
     provided when the checkpoint was written. To avoid breaking existing
     checkpoints when modifying a class, neither variable names nor dependency
-    names (the names passed to `_track_checkpointable`) may change.
+    names (the names passed to `_track_trackable`) may change.
 
     Args:
-      checkpointable: A `Checkpointable` which this object depends on.
-      name: A local name for `checkpointable`, used for loading checkpoints into
+      trackable: A `Trackable` which this object depends on.
+      name: A local name for `trackable`, used for loading checkpoints into
         the correct objects.
       overwrite: Boolean, whether silently replacing dependencies is OK. Used
         for __setattr__, where throwing an error on attribute reassignment would
         be inappropriate.
 
     Returns:
-      `checkpointable`, for convenience when declaring a dependency and
+      `trackable`, for convenience when declaring a dependency and
       assigning to a member variable in one statement.
 
     Raises:
-      TypeError: If `checkpointable` does not inherit from `Checkpointable`.
+      TypeError: If `trackable` does not inherit from `Trackable`.
       ValueError: If another object is already tracked by this name.
     """
-    self._maybe_initialize_checkpointable()
-    if not isinstance(checkpointable, Checkpointable):
+    self._maybe_initialize_trackable()
+    if not isinstance(trackable, Trackable):
       raise TypeError(
-          ("Checkpointable._track_checkpointable() passed type %s, not a "
-           "Checkpointable.") % (type(checkpointable),))
-    new_reference = CheckpointableReference(name=name, ref=checkpointable)
+          ("Trackable._track_trackable() passed type %s, not a "
+           "Trackable.") % (type(trackable),))
+    new_reference = TrackableReference(name=name, ref=trackable)
     current_object = self._lookup_dependency(name)
     if (current_object is not None
-        and current_object is not checkpointable):
+        and current_object is not trackable):
       if not overwrite:
         raise ValueError(
-            ("Called Checkpointable._track_checkpointable() with name='%s', "
-             "but a Checkpointable with this name is already declared as a "
+            ("Called Trackable._track_trackable() with name='%s', "
+             "but a Trackable with this name is already declared as a "
              "dependency. Names must be unique (or overwrite=True).") % (name,))
       # This is a weird thing to do, but we're not going to stop people from
       # using __setattr__.
@@ -717,20 +719,20 @@ class Checkpointable(object):
     elif current_object is None:
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._handle_deferred_dependencies(
-          name=name, checkpointable=checkpointable)
-    self._unconditional_dependency_names[name] = checkpointable
-    return checkpointable
+          name=name, trackable=trackable)
+    self._unconditional_dependency_names[name] = trackable
+    return trackable
 
-  def _handle_deferred_dependencies(self, name, checkpointable):
-    """Pop and load any deferred checkpoint restores into `checkpointable`.
+  def _handle_deferred_dependencies(self, name, trackable):
+    """Pop and load any deferred checkpoint restores into `trackable`.
 
-    This method does not add a new dependency on `checkpointable`, but it does
+    This method does not add a new dependency on `trackable`, but it does
     check if any outstanding/deferred dependencies have been queued waiting for
     this dependency to be added (matched based on `name`). If so,
-    `checkpointable` and its dependencies are restored. The restorations are
+    `trackable` and its dependencies are restored. The restorations are
     considered fulfilled and so are deleted.
 
-    `_track_checkpointable` is more appropriate for adding a
+    `_track_trackable` is more appropriate for adding a
     normal/unconditional dependency, and includes handling for deferred
     restorations. This method allows objects such as `Optimizer` to use the same
     restoration logic while managing conditional dependencies themselves, by
@@ -740,25 +742,25 @@ class Checkpointable(object):
 
     Args:
       name: The name of the dependency within this object (`self`), used to
-        match `checkpointable` with values saved in a checkpoint.
-      checkpointable: The Checkpointable object to restore (inheriting from
-        `Checkpointable`).
+        match `trackable` with values saved in a checkpoint.
+      trackable: The Trackable object to restore (inheriting from
+        `Trackable`).
     """
-    self._maybe_initialize_checkpointable()
-    checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    self._maybe_initialize_trackable()
+    trackable._maybe_initialize_trackable()  # pylint: disable=protected-access
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
         key=lambda restore: restore.checkpoint.restore_uid,
         reverse=True):
-      checkpoint_position.restore(checkpointable)
+      checkpoint_position.restore(trackable)
 
     # Pass on any name-based restores queued in this object.
     for name_based_restore in sorted(
         self._name_based_restores,
         key=lambda checkpoint: checkpoint.restore_uid,
         reverse=True):
-      checkpointable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
+      trackable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
 
   def _restore_from_checkpoint_position(self, checkpoint_position):
     """Restore this object and its dependencies (may be deferred)."""
@@ -771,7 +773,7 @@ class Checkpointable(object):
     while visit_queue:
       current_position = visit_queue.popleft()
       restore_ops.extend(nest.flatten(
-          current_position.checkpointable  # pylint: disable=protected-access
+          current_position.trackable  # pylint: disable=protected-access
           ._single_restoration_from_checkpoint_position(
               checkpoint_position=current_position,
               visit_queue=visit_queue)))
@@ -780,7 +782,7 @@ class Checkpointable(object):
   def _single_restoration_from_checkpoint_position(
       self, checkpoint_position, visit_queue):
     """Restore this object, and either queue its dependencies or defer them."""
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     checkpoint = checkpoint_position.checkpoint
     # If the UID of this restore is lower than our current update UID, we don't
     # need to actually restore the object. However, we should pass the
@@ -791,7 +793,7 @@ class Checkpointable(object):
     else:
       restore_ops = ()
     for child in checkpoint_position.object_proto.children:
-      child_position = _CheckpointPosition(
+      child_position = CheckpointPosition(
           checkpoint=checkpoint,
           proto_id=child.node_id)
       local_object = self._lookup_dependency(child.local_name)
@@ -801,7 +803,7 @@ class Checkpointable(object):
         self._deferred_dependencies.setdefault(child.local_name, []).append(
             child_position)
       else:
-        if child_position.bind_object(checkpointable=local_object):
+        if child_position.bind_object(trackable=local_object):
           # This object's correspondence is new, so dependencies need to be
           # visited. Delay doing it so that we get a breadth-first dependency
           # resolution order (shallowest paths first). The caller is responsible
@@ -817,7 +819,7 @@ class Checkpointable(object):
     or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
     `var_list` constructor argument).
 
-    `SaveableObjects` have a name set, which Checkpointable needs to generate
+    `SaveableObjects` have a name set, which Trackable needs to generate
     itself. So rather than returning `SaveableObjects` directly, this method
     should return a dictionary of callables which take `name` arguments and
     return `SaveableObjects` with that name.
@@ -846,13 +848,32 @@ class Checkpointable(object):
       return {}
     weak_self = weakref.ref(self)
     def _state_callback():
+      """Serializes `self.get_config()` for saving."""
       dereferenced_self = weak_self()
       if dereferenced_self:
-        return json.dumps(dereferenced_self,
-                          default=serialization.get_json_type,
-                          sort_keys=True).encode("utf8")
+        try:
+          return json.dumps(
+              dereferenced_self,
+              default=serialization.get_json_type,
+              sort_keys=True).encode("utf8")
+        except TypeError:
+          # Even if get_config worked objects may have produced garbage.
+          return ""
       else:
         return ""
     return {OBJECT_CONFIG_JSON_KEY: functools.partial(
         PythonStringStateSaveable,
         state_callback=_state_callback)}
+
+  def _list_functions_for_serialization(self):
+    """Lists the functions of this trackable to serialize.
+
+    Internal sub-classes can override this with specific logic. E.g.
+    `AutoTrackable` provides an implementation that returns the `attr`
+    that return functions.
+
+    Returns:
+        A dictionary mapping attribute names to `Function` or
+        `ConcreteFunction`.
+    """
+    return dict()
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/tracking/base_test.py
similarity index 77%
rename from tensorflow/python/training/checkpointable/base_test.py
rename to tensorflow/python/training/tracking/base_test.py
index 750799f03036bfddc188796210c7c3c29aa0e986..d76e20edf7e8dcca11c588a05cc7514625083086 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/tracking/base_test.py
@@ -22,29 +22,29 @@ import os
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import util
 
 
 class InterfaceTests(test.TestCase):
 
   def testOverwrite(self):
-    root = base.Checkpointable()
-    leaf = base.Checkpointable()
-    root._track_checkpointable(leaf, name="leaf")
+    root = base.Trackable()
+    leaf = base.Trackable()
+    root._track_trackable(leaf, name="leaf")
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(leaf, current_dependency)
     self.assertEqual("leaf", current_name)
-    duplicate_name_dep = base.Checkpointable()
+    duplicate_name_dep = base.Trackable()
     with self.assertRaises(ValueError):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+      root._track_trackable(duplicate_name_dep, name="leaf")
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, current_dependency)
     self.assertEqual("leaf", current_name)
 
   def testAddVariableOverwrite(self):
-    root = base.Checkpointable()
+    root = base.Trackable()
     a = root._add_variable_with_custom_getter(
         name="v", shape=[], getter=variable_scope.get_variable)
     self.assertEqual([root, a], util.list_objects(root))
@@ -61,15 +61,15 @@ class InterfaceTests(test.TestCase):
             getter=variable_scope.get_variable)
 
   def testAssertConsumedWithUnusedPythonState(self):
-    has_config = base.Checkpointable()
+    has_config = base.Trackable()
     has_config.get_config = lambda: {}
     saved = util.Checkpoint(obj=has_config)
     save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
-    restored = util.Checkpoint(obj=base.Checkpointable())
+    restored = util.Checkpoint(obj=base.Trackable())
     restored.restore(save_path).assert_consumed()
 
   def testAssertConsumedFailsWithUsedPythonState(self):
-    has_config = base.Checkpointable()
+    has_config = base.Trackable()
     attributes = {
         "foo_attr": functools.partial(
             base.PythonStringStateSaveable,
@@ -78,11 +78,24 @@ class InterfaceTests(test.TestCase):
     has_config._gather_saveables_for_checkpoint = lambda: attributes
     saved = util.Checkpoint(obj=has_config)
     save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
-    restored = util.Checkpoint(obj=base.Checkpointable())
+    restored = util.Checkpoint(obj=base.Trackable())
     status = restored.restore(save_path)
     with self.assertRaisesRegexp(AssertionError, "foo_attr"):
       status.assert_consumed()
 
+  def testBuggyGetConfig(self):
+
+    class NotSerializable(object):
+      pass
+
+    class GetConfigRaisesError(base.Trackable):
+
+      def get_config(self):
+        return NotSerializable()
+
+    util.Checkpoint(obj=GetConfigRaisesError()).save(
+        os.path.join(self.get_temp_dir(), "ckpt"))
+
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/tracking/data_structures.py
similarity index 76%
rename from tensorflow/python/training/checkpointable/data_structures.py
rename to tensorflow/python/training/tracking/data_structures.py
index 5a5b444a6c22b498c3dd76acdd33cef29b46e206..1689b31378e00cd5b8f14c6cd7ec4af29661dd5d 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/tracking/data_structures.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,21 +20,24 @@ from __future__ import print_function
 import collections
 import copy
 import operator
+import sys
 
 import six
 
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import revived_types
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import layer_utils
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import layer_utils
 
 
 class NoDependency(object):
-  """Allows attribute assignment to `Checkpointable` objects with no dependency.
+  """Allows attribute assignment to `Trackable` objects with no dependency.
 
   Example usage:
   ```python
-  obj = Checkpointable()
+  obj = Trackable()
   obj.has_dependency = tf.Variable(0., name="dep")
   obj.no_dependency = NoDependency(tf.Variable(1., name="nodep"))
   assert obj.no_dependency.name == "nodep:0"
@@ -56,16 +59,22 @@ class NoDependency(object):
 
 def _wrap_or_unwrap(value):
   """Wraps basic data structures, unwraps NoDependency objects."""
+  # pylint: disable=unidiomatic-typecheck
+  # Exact type checking to avoid mucking up custom logic in list/dict
+  # subclasses, e.g. collections.Counter.
   if isinstance(value, NoDependency):
     return value.value
-  if isinstance(value, base.Checkpointable):
-    return value  # Skip conversion for already checkpointable objects.
-  elif isinstance(value, dict):
+  if isinstance(value, base.Trackable):
+    return value  # Skip conversion for already trackable objects.
+  elif type(value) == dict:
     return _DictWrapper(value)
-  elif isinstance(value, list):
+  elif type(value) == collections.OrderedDict:
+    return _DictWrapper(value)
+  elif type(value) == list:
     return _ListWrapper(value)
   else:
     return value
+  # pylint: enable=unidiomatic-typecheck
   # TODO(allenl): Handle other common data structures. Tuples will require
   # special casing (tuple subclasses are not weak referenceable, so replacement
   # with a wrapper that subclasses tuple on attribute assignment works poorly,
@@ -74,19 +83,19 @@ def _wrap_or_unwrap(value):
   # come up with names. Dictionaries should look like lists.
 
 
-def sticky_attribute_assignment(checkpointable, name, value):
+def sticky_attribute_assignment(trackable, name, value):
   """Adds dependencies, generally called from __setattr__.
 
-  This behavior is shared between Checkpointable and Model.
+  This behavior is shared between Trackable and Model.
 
-  Respects NoDependency indicators, but otherwise makes checkpointable objects
+  Respects NoDependency indicators, but otherwise makes trackable objects
   out of common data structures and tracks objects by their attribute names.
 
   Args:
-    checkpointable: The object to add dependencies to (generally the one having
+    trackable: The object to add dependencies to (generally the one having
       an attribute assigned).
     name: The attribute name being assigned.
-    value: The value being assigned. Not necessarily a checkpointable object.
+    value: The value being assigned. Not necessarily a trackable object.
 
   Returns:
     The value which should be stored in the attribute (unwrapped from a
@@ -99,18 +108,29 @@ def sticky_attribute_assignment(checkpointable, name, value):
   value = _wrap_or_unwrap(value)
   if not add_dependency:
     return value
-  if isinstance(value, base.Checkpointable):
-    checkpointable._track_checkpointable(  # pylint: disable=protected-access
+  if isinstance(value, base.Trackable):
+    trackable._track_trackable(  # pylint: disable=protected-access
         value, name=name,
-        # Allow the user to switch the Checkpointable which is tracked by this
+        # Allow the user to switch the Trackable which is tracked by this
         # name, since assigning a new variable to an attribute has
         # historically been fine (e.g. Adam did this).
         overwrite=True)
   return value
 
 
-class CheckpointableDataStructure(base.Checkpointable):
-  """Base class for data structures which contain checkpointable objects."""
+class _UntrackableError(ValueError):
+
+  def __init__(self, value):  # pylint: disable=super-init-not-called
+    self._value = value
+
+  def __str__(self):
+    return (("Only trackable objects (such as Layers or Optimizers) may be "
+             "stored in a List object. Got %s, which does not inherit from "
+             "Trackable.") % (self._value,))
+
+
+class TrackableDataStructure(base.Trackable):
+  """Base class for data structures which contain trackable objects."""
 
   def __init__(self):
     self.trainable = True
@@ -119,14 +139,11 @@ class CheckpointableDataStructure(base.Checkpointable):
   def _track_value(self, value, name):
     """Add a dependency on `value`."""
     value = sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
+        trackable=self, value=value, name=name)
     if isinstance(value, variables.Variable):
       self._extra_variables.append(value)
-    if not isinstance(value, base.Checkpointable):
-      raise ValueError(
-          ("Only checkpointable objects (such as Layers or Optimizers) may be "
-           "stored in a List object. Got %s, which does not inherit from "
-           "Checkpointable.") % (value,))
+    if not isinstance(value, base.Trackable):
+      raise _UntrackableError(value)
     if hasattr(value, "_use_resource_variables"):
       # In subclassed models, legacy layers (tf.layers) must always use
       # resource variables.
@@ -135,7 +152,7 @@ class CheckpointableDataStructure(base.Checkpointable):
 
   @property
   def _values(self):
-    """An iterable/sequence which may contain checkpointable objects."""
+    """An iterable/sequence which may contain trackable objects."""
     raise NotImplementedError("Abstract method")
 
   @property
@@ -145,7 +162,7 @@ class CheckpointableDataStructure(base.Checkpointable):
     # they're wrapping if out of sync.
     collected = []
     for obj in self._values:
-      if (isinstance(obj, CheckpointableDataStructure)
+      if (isinstance(obj, TrackableDataStructure)
           or layer_utils.is_layer(obj)
           or layer_utils.has_weights(obj)):
         collected.append(obj)
@@ -212,19 +229,19 @@ class CheckpointableDataStructure(base.Checkpointable):
     return id(self)
 
   def __eq__(self, other):
-    # Similar to Tensors, checkpointable data structures use object-identity
+    # Similar to Tensors, trackable data structures use object-identity
     # equality to support set/dict membership.
     return self is other
 
 
-class List(CheckpointableDataStructure, collections.Sequence):
-  """An append-only sequence type which is checkpointable.
+class List(TrackableDataStructure, collections.Sequence):
+  """An append-only sequence type which is trackable.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), and forwards any `Layer` metadata such as updates and losses.
+  trackable), and forwards any `Layer` metadata such as updates and losses.
 
   Note that `List` is purely a container. It lets a `tf.keras.Model` or
-  other checkpointable object know about its contents, but does not call any
+  other trackable object know about its contents, but does not call any
   `Layer` instances which are added to it. To indicate a sequence of `Layer`
   instances which should be called sequentially, use `tf.keras.Sequential`.
 
@@ -245,7 +262,7 @@ class List(CheckpointableDataStructure, collections.Sequence):
       return aggregation
   ```
 
-  This kind of wrapping is necessary because `Checkpointable` objects do not
+  This kind of wrapping is necessary because `Trackable` objects do not
   (yet) deeply inspect regular Python data structures, so for example assigning
   a regular list (`self.layer_list = [layers.Dense(3)]`) does not create a
   checkpoint dependency and does not add the `Layer` instance's weights to its
@@ -260,9 +277,12 @@ class List(CheckpointableDataStructure, collections.Sequence):
       self._storage[index] = self._track_value(
           element, name=self._name_element(index))
 
-  def __copy__(self):
+  def copy(self):
     return type(self)(copy.copy(self._storage))
 
+  def __copy__(self):
+    return self.copy()
+
   def __deepcopy__(self, memo):
     return type(self)(copy.deepcopy(self._storage, memo))
 
@@ -278,28 +298,43 @@ class List(CheckpointableDataStructure, collections.Sequence):
     return self
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     value = self._track_value(value, self._name_element(len(self._storage)))
     self._storage.append(value)
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     for value in values:
-      self._storage.append(self._track_value(
-          value, name=self._name_element(len(self._storage))))
+      self.append(value)
 
   def __iadd__(self, values):
     self.extend(values)
     return self
 
   def __add__(self, other):
-    if isinstance(other, List):
-      return self.__class__(self._storage + other._storage)  # pylint: disable=protected-access
-    else:
-      return self.__class__(self._storage + other)
+    return self.__class__(self._storage + getattr(other, "_storage", other))
+
+  def __imul__(self, y):
+    if y <= 0:
+      raise ValueError(
+          "List only supports append, multiplying in place by %d removes "
+          "elements." % y)
+
+    n = len(self._storage)
+    for _ in range(y - 1):
+      for i in range(n):
+        self.append(self._storage[i])
+
+    return self
+
+  def __mul__(self, n):
+    return self.__class__(self._storage * n)
+
+  def __rmul__(self, n):
+    return self * n
 
   def __radd__(self, other):
-    return self + other
+    return self.__class__(other) + self
 
   def __getitem__(self, key):
     return self._storage[key]
@@ -313,7 +348,11 @@ class List(CheckpointableDataStructure, collections.Sequence):
   def __repr__(self):
     return "List(%s)" % (repr(self._storage),)
 
+  def __sizeof__(self):
+    return super(List, self).__sizeof__() + sys.getsizeof(self._storage)
+
 
+# TODO(tomhennigan) Update to collections.UserList?
 class _ListWrapper(List, collections.MutableSequence,
                    # Shadowed, but there for isinstance checks.
                    list):
@@ -325,7 +364,7 @@ class _ListWrapper(List, collections.MutableSequence,
   occupied, meaning both elements get the same names at different times) and
   refuses to save.
 
-  On assignment to an attribute of a Model or Checkpointable object, Python
+  On assignment to an attribute of a Model or Trackable object, Python
   lists are replaced with _ListWrapper. Wrapping a list in a
   `tf.contrib.checkpoint.NoDependency` object prevents this.
   """
@@ -385,17 +424,17 @@ class _ListWrapper(List, collections.MutableSequence,
     if self._non_append_mutation:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). A list element was replaced "
-           "(__setitem__), deleted, or inserted. In order to support "
-           "restoration on object creation, tracking is exclusively for "
-           "append-only data structures.\n\nIf you don't need this list "
-           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
-           "object; it will be automatically un-wrapped and subsequently "
-           "ignored." % (self,)))
+           "trackable TensorFlow objects). A list element was replaced "
+           "(__setitem__, __setslice__), deleted (__delitem__, __delslice__), "
+           "or moved (sort). In order to support restoration on object "
+           "creation, tracking is exclusively for append-only data structures."
+           "\n\nIf you don't need this list checkpointed, wrap it in a "
+           "tf.contrib.checkpoint.NoDependency object; it will be "
+           "automatically un-wrapped and subsequently ignored." % (self,)))
     if self._external_modification:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). The wrapped list was modified "
+           "trackable TensorFlow objects). The wrapped list was modified "
            "outside the wrapper (its final value was %s, its value when a "
            "checkpoint dependency was added was %s), which breaks restoration "
            "on object creation.\n\nIf you don't need this list checkpointed, "
@@ -410,19 +449,42 @@ class _ListWrapper(List, collections.MutableSequence,
 
   def __setitem__(self, key, value):
     self._check_external_modification()
-    if isinstance(self._storage[key], base.Checkpointable):
-      self._non_append_mutation = True
-    self._storage[key] = self._track_value(value, self._name_element(key))
+
+    if isinstance(key, slice):
+      # Note: this is quite inefficient, but the list API supports a broad range
+      # of slice setters (e.g. truncate, extend, replace) and immitating this
+      # for a range of Python versions is non-trivial.
+      storage_copy = list(self._storage)
+      self._storage[key] = value
+
+      len_before = len(storage_copy)
+      len_now = len(self._storage)
+      for i in range(max(len_before, len_now)):
+        value_now = self._storage[i] if i < len_now else None
+        value_before = storage_copy[i] if i < len_before else None
+
+        if isinstance(value_before, base.Trackable):
+          self._non_append_mutation = True
+
+        if value_now is not None and value_now != value_before:
+          self._storage[i] = self._track_value(self._storage[i],
+                                               self._name_element(i))
+
+    else:
+      if isinstance(self._storage[key], base.Trackable):
+        self._non_append_mutation = True
+      self._storage[key] = self._track_value(value, self._name_element(key))
+
     self._update_snapshot()
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     self._check_external_modification()
     super(_ListWrapper, self).append(value)
     self._update_snapshot()
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     self._check_external_modification()
     super(_ListWrapper, self).extend(values)
     self._update_snapshot()
@@ -454,26 +516,43 @@ class _ListWrapper(List, collections.MutableSequence,
     self._non_append_mutation = True
     self._storage.insert(index, obj)
 
+  def sort(self):
+    self._non_append_mutation = True
+    self._storage.sort()
+
+  def __setslice__(self, i, j, y):
+    self.__setitem__(slice(i, j), y)
+
+  def __delslice__(self, i, j):
+    self._non_append_mutation = True
+    del self._storage[slice(i, j)]
+
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     try:
       value = super(_ListWrapper, self)._track_value(value=value, name=name)
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       value = sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
     return value
 
   def __repr__(self):
     return "ListWrapper(%s)" % (repr(self._storage),)
 
+  def _list_functions_for_serialization(self):
+    return {
+        str(key): value for key, value in enumerate(self)
+        if _is_function(value)
+    }
+
 
-class Mapping(CheckpointableDataStructure, collections.Mapping):
-  """An append-only checkpointable mapping data structure with string keys.
+class Mapping(TrackableDataStructure, collections.Mapping):
+  """An append-only trackable mapping data structure with string keys.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), named based on its keys.
+  trackable), named based on its keys.
 
   Note that once a key has been added, it may not be deleted or replaced. If
   names may not be unique, see `tf.contrib.checkpoint.UniqueNameTracker`.
@@ -550,7 +629,7 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
 # patching all of the "wrapped" dict's methods instead of creating a wrapper
 # object is an option, but not a very attractive one (replacing methods without
 # creating reference cycles is difficult, and then dicts would need to be
-# special cased everywhere as being checkpointable).
+# special cased everywhere as being trackable).
 class _DictWrapper(Mapping, collections.MutableMapping):
   """Wraps built-in dicts to support restore-on-create for variables.
 
@@ -606,7 +685,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). The wrapped dictionary "
-          "contains a non-string key which maps to a checkpointable object or "
+          "contains a non-string key which maps to a trackable object or "
           "mutable data structure.\n\nIf you don't need this dictionary "
           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
           "object; it will be automatically un-wrapped and subsequently "
@@ -615,7 +694,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). A key mapping to a "
-          "checkpointable object was overwritten or deleted, which would "
+          "trackable object was overwritten or deleted, which would "
           "cause problems for restoration.\n\nIf you don't need this "
           "dictionary checkpointed, wrap it in a "
           "tf.contrib.checkpoint.NoDependency object; it will be automatically "
@@ -656,7 +735,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     self._last_wrapped_dict_snapshot = dict(self)
 
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     if isinstance(name, six.string_types):
       string_key = True
     else:
@@ -666,15 +745,15 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       no_dependency = isinstance(value, NoDependency)
       value = super(_DictWrapper, self)._track_value(value=value, name=name)
       if not (string_key or no_dependency):
-        # A non-string key maps to a checkpointable value. This data structure
+        # A non-string key maps to a trackable value. This data structure
         # is not saveable.
         self._non_string_key = True
       return value
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       return sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
 
   def _name_element(self, key):
     """Don't throw errors for non-string keys."""
@@ -693,31 +772,32 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     else:
       value = _wrap_or_unwrap(value)
       existing_dependency = None
-      if not no_dep and isinstance(value, base.Checkpointable):
+      if not no_dep and isinstance(value, base.Trackable):
         # Non-string keys are OK as long as we have no reason to add a
         # dependency on the value (either because the value is not
-        # checkpointable, or because it was wrapped in a NoDependency object).
+        # trackable, or because it was wrapped in a NoDependency object).
         self._non_string_key = True
-    current_value = self._storage.setdefault(key, value)
-    if current_value is not value:
-      if ((not no_dep and isinstance(value, base.Checkpointable))
-          # We don't want to just check that the existing object is
-          # checkpointable, since it may have been wrapped in a NoDependency
-          # object.
-          or existing_dependency is not None):
-        # A checkpointable object was replaced under the same key; this means
-        # that restoring would be error-prone, so we'll throw an exception on
-        # save.
-        self._non_append_mutation = True
-      self._storage[key] = value
+    if key in self._storage:
+      previous_value = self._storage[key]
+      if previous_value is not value:
+        if ((not no_dep and isinstance(value, base.Trackable))
+            # We don't want to just check that the existing object is
+            # trackable, since it may have been wrapped in a NoDependency
+            # object.
+            or existing_dependency is not None):
+          # A trackable object was replaced under the same key; this means
+          # that restoring would be error-prone, so we'll throw an exception on
+          # save.
+          self._non_append_mutation = True
+    self._storage[key] = value
 
     self._update_snapshot()
 
   def __delitem__(self, key):
     self._check_external_modification()
     existing_value = self[key]
-    if isinstance(existing_value, base.Checkpointable):
-      # Deleting tracked checkpointable values means restoring is problematic,
+    if isinstance(existing_value, base.Trackable):
+      # Deleting tracked trackable values means restoring is problematic,
       # so we'll throw an exception on save.
       self._non_append_mutation = True
     del self._storage[key]
@@ -736,11 +816,21 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     for key, value in dict(*args, **kwargs).items():
       self[key] = value
 
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if _is_function(value)
+    }
+
+
+def _is_function(x):
+  return isinstance(x, (def_function.Function, defun.ConcreteFunction))
+
 revived_types.register_revived_type(
-    "checkpointable_dict_wrapper",
+    "trackable_dict_wrapper",
     lambda obj: isinstance(obj, _DictWrapper),
     versions=[revived_types.VersionedTypeRegistration(
-        # Standard dependencies are enough to reconstruct the checkpointable
+        # Standard dependencies are enough to reconstruct the trackable
         # items in dictionaries, so we don't need to save any extra information.
         object_factory=lambda proto: _DictWrapper({}),
         version=1,
@@ -757,7 +847,7 @@ def _set_list_item(list_object, index_string, value):
 
 
 revived_types.register_revived_type(
-    "checkpointable_list_wrapper",
+    "trackable_list_wrapper",
     lambda obj: isinstance(obj, _ListWrapper),
     versions=[revived_types.VersionedTypeRegistration(
         object_factory=lambda proto: _ListWrapper([]),
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py
similarity index 76%
rename from tensorflow/python/training/checkpointable/data_structures_test.py
rename to tensorflow/python/training/tracking/data_structures_test.py
index 53cbd6648215f9f1e02720a000936fb211e0a25d..c2d9b833153cc7e902707656121d2d3fe80db12a 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/tracking/data_structures_test.py
@@ -34,9 +34,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class HasList(training.Model):
@@ -57,9 +57,9 @@ class HasList(training.Model):
             [core.Dense(10)]))
     self.layer_list.extend(
         data_structures.List(
-            list(sequence=[core.Dense(11)]) + [core.Dense(12)]))
+            list([core.Dense(11)]) + [core.Dense(12)]))
     self.layers_with_updates = data_structures.List(
-        sequence=(normalization.BatchNormalization(),))
+        (normalization.BatchNormalization(),))
 
   def call(self, x):
     aggregation = 0.
@@ -145,12 +145,12 @@ class ListTests(test.TestCase):
     model.l2.append(second_layer)
     self.assertEqual([first_layer, second_layer], model.layers)
 
-  def testNotCheckpointable(self):
-    class NotCheckpointable(object):
+  def testNotTrackable(self):
+    class NotTrackable(object):
       pass
 
     with self.assertRaises(ValueError):
-      data_structures.List([NotCheckpointable()])
+      data_structures.List([NotTrackable()])
 
   def testCallNotImplemented(self):
     with self.assertRaisesRegexp(TypeError, "not callable"):
@@ -207,11 +207,88 @@ class ListTests(test.TestCase):
     self.assertEqual([v], l.trainable_weights)
     self.assertEqual([v2], l.non_trainable_weights)
 
+  def testCopy(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+
+    l1 = data_structures.List([v1, v2])
+    l2 = l1.copy()
+    l2.append(v3)
+    self.assertEqual(list(l1), [v1, v2])
+    self.assertEqual(list(l2), [v1, v2, v3])
+
+  def testSlicing(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+    v4 = resource_variable_ops.ResourceVariable(1.)
+
+    l = data_structures.List([v1, v2, v3, v4])
+    self.assertEqual(l[1:], [v2, v3, v4])
+    self.assertEqual(l[1:-1], [v2, v3])
+    self.assertEqual(l[:-1], [v1, v2, v3])
+
+  def testHash(self):
+    has_sequences = set([data_structures.List(),
+                         data_structures.List()])
+    self.assertEqual(2, len(has_sequences))
+    self.assertNotIn(data_structures.List(), has_sequences)
+
+  def testIMul_zero(self):
+    l = data_structures.List([])
+    with self.assertRaisesRegexp(ValueError, "List only supports append"):
+      l *= 0
+
+  def testIMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v])
+    l *= 2
+    self.assertEqual(list(l), [v] * 2)
+
+  def testMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(l * 2), [v, v, v] * 2)
+
+  def testRMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(2 * l), [v, v, v] * 2)
+
+
+class ListWrapperTest(test.TestCase):
+
+  IGNORED = ("__new__", "__init__", "__subclasshook__", "__getattribute__")
+
+  def test_overrides_all_list_methods(self):
+    not_overridden = []
+
+    for name in dir(list):
+      if name in ListWrapperTest.IGNORED:
+        continue
+
+      list_method = getattr(list, name)
+
+      if not callable(list_method):
+        continue
+
+      object_method = getattr(object, name, None)
+      if object_method is not None and object_method == list_method:
+        # Skip methods that aren't overridden from object.
+        continue
+
+      if list_method == getattr(data_structures._ListWrapper, name):
+        not_overridden.append(name)
+
+    if not_overridden:
+      self.fail("_ListWrapper does not override %s" % (not_overridden))
+
   def testListWrapperBasic(self):
     # _ListWrapper, unlike List, compares like the built-in list type (since it
     # is used to automatically replace lists).
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     self.assertEqual([a, a],
                      [a, a])
     self.assertEqual(data_structures._ListWrapper([a, a]),
@@ -244,6 +321,10 @@ class ListTests(test.TestCase):
     self.assertEqual([a, a], [a] + data_structures._ListWrapper([a]))
     self.assertIsInstance(data_structures._ListWrapper([a]), list)
 
+  def testAcceptsNonTrackableContent(self):
+    l = data_structures._ListWrapper([1, 2, 3])
+    self.assertEqual(l, [1, 2, 3])
+
   def testWrapperChangesList(self):
     l = []
     l_wrapper = data_structures._ListWrapper(l)
@@ -263,19 +344,61 @@ class ListTests(test.TestCase):
     l.append(layer)
     self.assertEqual([layer], l_wrapper.layers)
 
-  def testHashing(self):
-    has_sequences = set([data_structures.List(),
-                         data_structures.List()])
-    self.assertEqual(2, len(has_sequences))
-    self.assertNotIn(data_structures.List(), has_sequences)
+  def testNotHashable(self):
     with self.assertRaises(TypeError):
-      has_sequences.add(data_structures._ListWrapper([]))
+      hash(data_structures._ListWrapper())
 
-  def testSlicing(self):
+  def testDelItem(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[0]
+    self.assertEqual(l, [2, 3, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delitem__")
+
+  def testDelSlice(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[2:3]
+    self.assertEqual(l, [1, 2, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delslice__")
+
+  def testSetSlice_canSaveForNonTrackableItems(self):
     l = data_structures._ListWrapper([1, 2, 3, 4])
-    self.assertEqual(l[1:], [2, 3, 4])
-    self.assertEqual(l[1:-1], [2, 3])
-    self.assertEqual(l[:-1], [1, 2, 3])
+    l[:] = 2, 8, 9, 0
+    self.assertEqual(l, [2, 8, 9, 0])
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
+    self.assertEqual(len(l._checkpoint_dependencies), 0)  # pylint: disable=protected-access
+
+  def testSetSlice_cannotSaveIfTrackableModified(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures._ListWrapper([1, 2, v1, v2])
+    l[:] = 2, 8, 9, v2
+    self.assertEqual(l, [2, 8, 9, v2])
+    self.assertUnableToSave(l, "Unable to save .*__setslice__")
+
+  def testSetSlice_truncate(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[:] = []
+    self.assertEqual(l, [])
+
+  def testSetSlice_extend(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[2:] = 1, 2, 3, 4
+    self.assertEqual(l, [1, 2, 1, 2, 3, 4])
+
+  def testSort(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l.sort()
+    self.assertEqual(l, [1, 2, 3, 4])
+    # Regardless of being a no-op for the input list, we still refuse to save.
+    # This is intentional since otherwise we would end up with a hard to debug
+    # case for users (e.g. sometimes sort on a ListWrapper is trackable and
+    # other times it is not).
+    self.assertUnableToSave(l, "Unable to save .*sort")
+
+  def assertUnableToSave(self, l, msg):
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
+    with self.assertRaisesRegexp(ValueError, msg):
+      return l._checkpoint_dependencies  # pylint: disable=protected-access
 
 
 class HasMapping(training.Model):
@@ -343,7 +466,7 @@ class MappingTests(test.TestCase):
 
   def testLayerCollectionWithExternalMutation(self):
     d = {}
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     root.wrapper = d
     self.assertEqual([], root.wrapper.layers)
     self.assertEqual([], root.wrapper.trainable_weights)
@@ -361,7 +484,7 @@ class MappingTests(test.TestCase):
     self.assertEqual(2, len(has_mappings))
     self.assertNotIn(data_structures.Mapping(), has_mappings)
     # In contrast to Mapping, dict wrappers are not hashable
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     self.assertEqual({}, a.d)
     self.assertFalse({} != a.d)  # pylint: disable=g-explicit-bool-comparison
@@ -370,7 +493,7 @@ class MappingTests(test.TestCase):
       set([a.d])
 
   def testDictWrapperBadKeys(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d[1] = data_structures.List()
     model = training.Model()
@@ -380,7 +503,7 @@ class MappingTests(test.TestCase):
       model.save_weights(save_path)
 
   def testDictWrapperNoDependency(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = data_structures.NoDependency({})
     a.d[1] = [3]
     self.assertEqual([a], util.list_objects(a))
@@ -390,8 +513,8 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonStringKeyNotCheckpointableValue(self):
-    a = tracking.AutoCheckpointable()
+  def testNonStringKeyNotTrackableValue(self):
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = data_structures.NoDependency([3])
@@ -402,18 +525,18 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonAppendNotCheckpointable(self):
+  def testNonAppendNotTrackable(self):
     # Non-append mutations (deleting or overwriting values) are OK when the
     # values aren't tracked.
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = 3
     a.d[1] = 2
     self.assertEqual(2, a.d[1])
     del a.d[1]
-    a.d[2] = data_structures.NoDependency(tracking.AutoCheckpointable())
-    second = tracking.AutoCheckpointable()
+    a.d[2] = data_structures.NoDependency(tracking.AutoTrackable())
+    second = tracking.AutoTrackable()
     a.d[2] = data_structures.NoDependency(second)
     self.assertIs(second, a.d[2])
     self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
@@ -475,7 +598,7 @@ class MappingTests(test.TestCase):
     self.assertEqual({1: 3}, new_dict)
 
   def testListShallowCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.copy(root.a)
@@ -492,7 +615,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testListDeepCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.deepcopy(root.a)
@@ -509,7 +632,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.deepcopy(root.a))
 
   def testDictShallowCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.copy(root.a)
@@ -526,7 +649,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testDictDeepCopy(self):
-    root = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.deepcopy(root.a)
@@ -542,9 +665,9 @@ class MappingTests(test.TestCase):
     with self.assertRaises(ValueError):
       util.list_objects(copy.deepcopy(root.a))
 
-  def testShallowCopyCheckpointable(self):
-    original = tracking.AutoCheckpointable()
-    original_sub = tracking.AutoCheckpointable()
+  def testShallowCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     shallow_copied = copy.copy(original)
@@ -556,16 +679,16 @@ class MappingTests(test.TestCase):
     self.assertIn(shallow_copied.b, shallow_deps)
     self.assertIn(shallow_copied.b["a"], shallow_deps)
 
-  def testDeepCopyCheckpointable(self):
-    original = tracking.AutoCheckpointable()
-    original_sub = tracking.AutoCheckpointable()
+  def testDeepCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     deep_copied = copy.deepcopy(original)
     self.assertIsNot(original, deep_copied)
     self.assertIsNot(original_sub, deep_copied.b["a"])
     self.assertEqual([[1.]], deep_copied.a)
-    self.assertIsInstance(deep_copied.b["a"], tracking.AutoCheckpointable)
+    self.assertIsInstance(deep_copied.b["a"], tracking.AutoTrackable)
     deps = util.list_objects(deep_copied)
     self.assertIn(deep_copied.a, deps)
     self.assertIn(deep_copied.b, deps)
@@ -577,5 +700,17 @@ class MappingTests(test.TestCase):
     self.assertIsInstance(result, dict)
     self.assertEqual({1: 2, 3: 4}, result)
 
+  def testListAddOrder(self):
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + data_structures._ListWrapper([2.]))
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + [2.])
+    self.assertEqual([1., 2.],
+                     [1.]
+                     + data_structures._ListWrapper([2.]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/tracking/graph_view.py b/tensorflow/python/training/tracking/graph_view.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba2387870182cef6e578c7b947f07a4957fdf22c
--- /dev/null
+++ b/tensorflow/python/training/tracking/graph_view.py
@@ -0,0 +1,431 @@
+"""Manages a graph of Trackable objects."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+from tensorflow.core.protobuf import trackable_object_graph_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+
+
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(trackable.name)
+       for trackable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(trackable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(trackable_objects)
+  slot_variables = object_identity.ObjectIdentityDictionary()
+  for trackable in non_slot_objects:
+    if (isinstance(trackable, optimizer_v1.Optimizer)
+        # TODO(b/110718070): Fix Keras imports.
+        or hasattr(trackable, "_create_or_restore_slot_variable")):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[trackable])
+      slot_names = trackable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = trackable.get_slot(
+                original_variable, slot_name)
+          except (AttributeError, KeyError):
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_trackable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Trackable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(trackable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          trackable_objects.append(slot_variable)
+          slot_variable_proto = (
+              trackable_object_graph_pb2.TrackableObjectGraph
+              .TrackableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(trackable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+class ObjectGraphView(object):
+  """Gathers and serializes an object graph."""
+
+  def __init__(self, root, saveables_cache=None):
+    """Configure the graph view.
+
+    Args:
+      root: A `Trackable` object whose variables (including the variables
+        of dependencies, recursively) should be saved. May be a weak reference.
+      saveables_cache: A dictionary mapping `Trackable` objects ->
+        attribute names -> SaveableObjects, used to avoid re-creating
+        SaveableObjects when graph building.
+    """
+    self._root_ref = root
+    self._saveables_cache = saveables_cache
+
+  def list_dependencies(self, obj):
+    # pylint: disable=protected-access
+    obj._maybe_initialize_trackable()
+    return obj._checkpoint_dependencies
+    # pylint: enable=protected-access
+
+  @property
+  def saveables_cache(self):
+    """Maps Trackable objects -> attribute names -> list(SaveableObjects).
+
+    Used to avoid re-creating SaveableObjects when graph building. None when
+    executing eagerly.
+
+    Returns:
+      The cache (an object-identity dictionary), or None if caching is disabled.
+    """
+    return self._saveables_cache
+
+  @property
+  def root(self):
+    if isinstance(self._root_ref, weakref.ref):
+      derefed = self._root_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_ref
+
+  def _breadth_first_traversal(self):
+    """Find shortest paths to all dependencies of self.root."""
+    bfs_sorted = []
+    to_visit = collections.deque([self.root])
+    path_to_root = object_identity.ObjectIdentityDictionary()
+    path_to_root[self.root] = ()
+    while to_visit:
+      current_trackable = to_visit.popleft()
+      if isinstance(current_trackable, tracking.NotTrackable):
+        raise NotImplementedError(
+            ("The object %s does not support object-based saving. File a "
+             "feature request if this limitation bothers you. In the meantime, "
+             "you can remove the dependency on this object and save everything "
+             "else.")
+            % (current_trackable,))
+      bfs_sorted.append(current_trackable)
+      for name, dependency in self.list_dependencies(current_trackable):
+        if dependency not in path_to_root:
+          path_to_root[dependency] = (
+              path_to_root[current_trackable] + (
+                  base.TrackableReference(name, dependency),))
+          to_visit.append(dependency)
+    return bfs_sorted, path_to_root
+
+  def _add_attributes_to_object_graph(
+      self, trackable_objects, object_graph_proto, node_ids, object_names,
+      object_map):
+    """Create SaveableObjects and corresponding SerializedTensor protos."""
+    named_saveable_objects = []
+    if self._saveables_cache is None:
+      # No SaveableObject caching. Either we're executing eagerly, or building a
+      # static save which is specialized to the current Python state.
+      feed_additions = None
+    else:
+      # If we are caching SaveableObjects, we need to build up a feed_dict with
+      # functions computing volatile Python state to be saved with the
+      # checkpoint.
+      feed_additions = {}
+    for checkpoint_id, (trackable, object_proto) in enumerate(
+        zip(trackable_objects, object_graph_proto.nodes)):
+      assert node_ids[trackable] == checkpoint_id
+      object_name = object_names[trackable]
+      if object_map is None:
+        object_to_save = trackable
+      else:
+        object_to_save = object_map.get(trackable, trackable)
+      if self._saveables_cache is not None:
+        cached_attributes = self._saveables_cache.setdefault(object_to_save, {})
+      else:
+        cached_attributes = None
+
+      for name, saveable_factory in (
+          object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        attribute = object_proto.attributes.add()
+        attribute.name = name
+        attribute.checkpoint_key = "%s/%s/%s" % (
+            object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+        if cached_attributes is None:
+          saveables = None
+        else:
+          saveables = cached_attributes.get(name, None)
+          if saveables is not None:
+            for saveable in saveables:
+              if attribute.checkpoint_key not in saveable.name:
+                # The checkpoint key for this SaveableObject is different. We
+                # need to re-create it.
+                saveables = None
+                del cached_attributes[name]
+                break
+        if saveables is None:
+          if callable(saveable_factory):
+            maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
+          else:
+            maybe_saveable = saveable_factory
+          if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
+            saveables = (maybe_saveable,)
+          else:
+            # Figure out the name-based Saver's name for this variable. If it's
+            # already a SaveableObject we'd just get the checkpoint key back, so
+            # we leave full_name blank.
+            saver_dict = saveable_object_util.op_list_to_dict(
+                [maybe_saveable], convert_variable_to_tensor=False)
+            full_name, = saver_dict.keys()
+            saveables = tuple(saveable_object_util.saveable_objects_for_op(
+                op=maybe_saveable, name=attribute.checkpoint_key))
+            for saveable in saveables:
+              saveable.full_name = full_name
+          for saveable in saveables:
+            if attribute.checkpoint_key not in saveable.name:
+              raise AssertionError(
+                  ("The object %s produced a SaveableObject with name '%s' for "
+                   "attribute '%s'. Expected a name containing '%s'.")
+                  % (trackable, name, saveable.name,
+                     attribute.checkpoint_key))
+          if cached_attributes is not None:
+            cached_attributes[name] = saveables
+
+        optional_restore = None
+        for saveable in saveables:
+          if optional_restore is None:
+            optional_restore = saveable.optional_restore
+          else:
+            optional_restore = optional_restore and saveable.optional_restore
+
+          if hasattr(saveable, "full_name"):
+            attribute.full_name = saveable.full_name
+          if isinstance(saveable, base.PythonStateSaveable):
+            if feed_additions is None:
+              assert self._saveables_cache is None
+              # If we're not caching saveables, then we're either executing
+              # eagerly or building a static save/restore (e.g. for a
+              # SavedModel). In either case, we should embed the current Python
+              # state in the graph rather than relying on a feed dict.
+              saveable = saveable.freeze()
+            else:
+              saveable_feed_dict = saveable.feed_dict_additions()
+              for new_feed_key in saveable_feed_dict.keys():
+                if new_feed_key in feed_additions:
+                  raise AssertionError(
+                      ("The object %s tried to feed a value for the Tensor %s "
+                       "when saving, but another object is already feeding a "
+                       "value.")
+                      % (trackable, new_feed_key))
+              feed_additions.update(saveable_feed_dict)
+          named_saveable_objects.append(saveable)
+        if optional_restore is None:
+          optional_restore = False
+        attribute.optional_restore = optional_restore
+
+    return named_saveable_objects, feed_additions
+
+  def _fill_object_graph_proto(self, trackable_objects,
+                               node_ids,
+                               slot_variables,
+                               object_graph_proto=None):
+    """Name non-slot `Trackable`s and add them to `object_graph_proto`."""
+    if object_graph_proto is None:
+      object_graph_proto = (
+          trackable_object_graph_pb2.TrackableObjectGraph())
+    for checkpoint_id, trackable in enumerate(trackable_objects):
+      assert node_ids[trackable] == checkpoint_id
+      object_proto = object_graph_proto.nodes.add()
+      object_proto.slot_variables.extend(slot_variables.get(trackable, ()))
+      for child in self.list_dependencies(trackable):
+        child_proto = object_proto.children.add()
+        child_proto.node_id = node_ids[child.ref]
+        child_proto.local_name = child.name
+    return object_graph_proto
+
+  def _serialize_gathered_objects(self, trackable_objects, path_to_root,
+                                  object_map=None):
+    """Create SaveableObjects and protos for gathered objects."""
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    object_graph_proto = self._fill_object_graph_proto(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        slot_variables=slot_variables)
+    named_saveable_objects, feed_additions = (
+        self._add_attributes_to_object_graph(
+            trackable_objects=trackable_objects,
+            object_graph_proto=object_graph_proto,
+            node_ids=node_ids,
+            object_names=object_names,
+            object_map=object_map))
+    return named_saveable_objects, object_graph_proto, feed_additions
+
+  def serialize_object_graph(self):
+    """Determine checkpoint keys for variables and build a serialized graph.
+
+    Non-slot variables are keyed based on a shortest path from the root saveable
+    to the object which owns the variable (i.e. the one which called
+    `Trackable._add_variable` to create it).
+
+    Slot variables are keyed based on a shortest path to the variable being
+    slotted for, a shortest path to their optimizer, and the slot name.
+
+    Returns:
+      A tuple of (named_variables, object_graph_proto, feed_additions):
+        named_variables: A dictionary mapping names to variable objects.
+        object_graph_proto: A TrackableObjectGraph protocol buffer
+          containing the serialized object graph and variable references.
+        feed_additions: A dictionary mapping from Tensors to values which should
+          be fed when saving.
+
+    Raises:
+      ValueError: If there are invalid characters in an optimizer's slot names.
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    return self._serialize_gathered_objects(
+        trackable_objects, path_to_root)
+
+  def frozen_saveable_objects(self, object_map=None, to_graph=None):
+    """Creates SaveableObjects with the current object graph frozen."""
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    if to_graph:
+      target_context = to_graph.as_default
+    else:
+      target_context = ops.NullContextmanager
+    with target_context():
+      named_saveable_objects, graph_proto, _ = self._serialize_gathered_objects(
+          trackable_objects,
+          path_to_root,
+          object_map)
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      named_saveable_objects.append(
+          base.NoRestoreSaveable(
+              tensor=object_graph_tensor,
+              name=base.OBJECT_GRAPH_PROTO_KEY))
+    return named_saveable_objects
+
+  def objects_ids_and_slot_variables(self):
+    """Traverse the object graph and list all accessible objects.
+
+    Looks for `Trackable` objects which are dependencies of
+    `root_trackable`. Includes slot variables only if the variable they are
+    slotting for and the optimizer are dependencies of `root_trackable`
+    (i.e. if they would be saved with a checkpoint).
+
+    Returns:
+      A tuple of (trackable objects, object -> node id, slot variables)
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    return trackable_objects, node_ids, slot_variables
+
+  def list_objects(self):
+    """Traverse the object graph and list all accessible objects."""
+    trackable_objects, _, _ = self.objects_ids_and_slot_variables()
+    return trackable_objects
diff --git a/tensorflow/python/training/checkpointable/layer_utils.py b/tensorflow/python/training/tracking/layer_utils.py
similarity index 89%
rename from tensorflow/python/training/checkpointable/layer_utils.py
rename to tensorflow/python/training/tracking/layer_utils.py
index 9d45c4883e70d140650660971656478c216530c9..818563c32fa6ed726156781704b869978409652c 100644
--- a/tensorflow/python/training/checkpointable/layer_utils.py
+++ b/tensorflow/python/training/tracking/layer_utils.py
@@ -21,6 +21,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.training.tracking import object_identity
+
 
 def is_layer(obj):
   """Implicit check for Layer-like objects."""
@@ -36,15 +38,21 @@ def has_weights(obj):
 
 
 def filter_empty_layer_containers(layer_list):
-  """Filter out empty Layer-like containers."""
+  """Filter out empty Layer-like containers and uniquify."""
+  existing = object_identity.ObjectIdentitySet()
+  to_visit = layer_list[::-1]
   filtered = []
-  for obj in layer_list:
+  while to_visit:
+    obj = to_visit.pop()
+    if obj in existing:
+      continue
+    existing.add(obj)
     if is_layer(obj):
       filtered.append(obj)
     elif hasattr(obj, "layers"):
-      # Checkpointable data structures will not show up in ".layers" lists, but
+      # Trackable data structures will not show up in ".layers" lists, but
       # the layers they contain will.
-      filtered.extend(obj.layers)
+      to_visit.extend(obj.layers[::-1])
   return filtered
 
 
diff --git a/tensorflow/python/training/tracking/object_identity.py b/tensorflow/python/training/tracking/object_identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4eef5b34b55dbf41bea09c5ac6ec7aadaac70ad
--- /dev/null
+++ b/tensorflow/python/training/tracking/object_identity.py
@@ -0,0 +1,156 @@
+"""Utilities for collecting objects based on "is" comparison."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+
+class _ObjectIdentityWrapper(object):
+  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
+
+  Since __eq__ is based on object identity, it's safe to also define __hash__
+  based on object ids. This lets us add unhashable types like trackable
+  _ListWrapper objects to object-identity collections.
+  """
+
+  def __init__(self, wrapped):
+    self._wrapped = wrapped
+
+  @property
+  def unwrapped(self):
+    return self._wrapped
+
+  def __eq__(self, other):
+    if isinstance(other, _ObjectIdentityWrapper):
+      return self._wrapped is other._wrapped  # pylint: disable=protected-access
+    return self._wrapped is other
+
+  def __hash__(self):
+    # Wrapper id() is also fine for weakrefs. In fact, we rely on
+    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
+    # weakref.ref(a) in _WeakObjectIdentityWrapper.
+    return id(self._wrapped)
+
+
+class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
+
+  def __init__(self, wrapped):
+    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
+
+  @property
+  def unwrapped(self):
+    return self._wrapped()
+
+
+class ObjectIdentityDictionary(collections.MutableMapping):
+  """A mutable mapping data structure which compares using "is".
+
+  This is necessary because we have trackable objects (_ListWrapper) which
+  have behavior identical to built-in Python lists (including being unhashable
+  and comparing based on the equality of their contents by default).
+  """
+
+  def __init__(self):
+    self._storage = {}
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __getitem__(self, key):
+    return self._storage[self._wrap_key(key)]
+
+  def __setitem__(self, key, value):
+    self._storage[self._wrap_key(key)] = value
+
+  def __delitem__(self, key):
+    del self._storage[self._wrap_key(key)]
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    for key in self._storage:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary):
+  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len(list(self._storage))
+
+  def __iter__(self):
+    keys = self._storage.keys()
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        del self[key]
+      else:
+        yield unwrapped
+
+
+class ObjectIdentitySet(collections.MutableSet):
+  """Like the built-in set, but compares objects with "is"."""
+
+  def __init__(self, *args):
+    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __contains__(self, key):
+    return self._wrap_key(key) in self._storage
+
+  def discard(self, key):
+    self._storage.discard(self._wrap_key(key))
+
+  def add(self, key):
+    self._storage.add(self._wrap_key(key))
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakSet(ObjectIdentitySet):
+  """Like weakref.WeakSet, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len([_ for _ in self])
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        self.discard(key)
+      else:
+        yield unwrapped
diff --git a/tensorflow/python/training/tracking/python_state.py b/tensorflow/python/training/tracking/python_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..f05ca91a21c084bc6b37e34c3e60baa3791cca26
--- /dev/null
+++ b/tensorflow/python/training/tracking/python_state.py
@@ -0,0 +1,92 @@
+"""Utilities for including Python state in TensorFlow checkpoints."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import functools
+
+import six
+
+from tensorflow.python.training.tracking import base
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("train.experimental.PythonState")
+@six.add_metaclass(abc.ABCMeta)
+class PythonState(base.Trackable):
+  """A mixin for putting Python state in an object-based checkpoint.
+
+  This is an abstract class which allows extensions to TensorFlow's object-based
+  checkpointing (see `tf.train.Checkpoint`). For example a wrapper for NumPy
+  arrays:
+
+  ```python
+  import io
+  import numpy
+
+  class NumpyWrapper(tf.train.experimental.PythonState):
+
+    def __init__(self, array):
+      self.array = array
+
+    def serialize(self):
+      string_file = io.BytesIO()
+      try:
+        numpy.save(string_file, self.array, allow_pickle=False)
+        serialized = string_file.getvalue()
+      finally:
+        string_file.close()
+      return serialized
+
+    def deserialize(self, string_value):
+      string_file = io.BytesIO(string_value)
+      try:
+        self.array = numpy.load(string_file, allow_pickle=False)
+      finally:
+        string_file.close()
+  ```
+
+  Instances of `NumpyWrapper` are checkpointable objects, and will be saved and
+  restored from checkpoints along with TensorFlow state like variables.
+
+  ```python
+  root = tf.train.Checkpoint(numpy=NumpyWrapper(numpy.array([1.])))
+  save_path = root.save(prefix)
+  root.numpy.array *= 2.
+  assert [2.] == root.numpy.array
+  root.restore(save_path)
+  assert [1.] == root.numpy.array
+  ```
+  """
+
+  @abc.abstractmethod
+  def serialize(self):
+    """Callback to serialize the object. Returns a string."""
+
+  @abc.abstractmethod
+  def deserialize(self, string_value):
+    """Callback to deserialize the object."""
+
+  def _gather_saveables_for_checkpoint(self):
+    """Specify callbacks for saving and restoring `array`."""
+    return {
+        "py_state": functools.partial(
+            base.PythonStringStateSaveable,
+            state_callback=self.serialize,
+            restore_callback=self.deserialize)
+        }
diff --git a/tensorflow/python/training/tracking/python_state_test.py b/tensorflow/python/training/tracking/python_state_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c27a2714f42c22a853073e79597fd4c1367f1d5
--- /dev/null
+++ b/tensorflow/python/training/tracking/python_state_test.py
@@ -0,0 +1,214 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+
+import numpy
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.module import module
+from tensorflow.python.platform import test
+from tensorflow.python.training.tracking import python_state
+from tensorflow.python.training.tracking import util
+
+
+class _NumpyState(module.Module):
+  """A checkpointable object whose NumPy array attributes are saved/restored.
+
+  Example usage:
+
+  ```python
+  arrays = _NumpyState()
+  checkpoint = tf.train.Checkpoint(numpy_arrays=arrays)
+  arrays.x = numpy.zeros([3, 4])
+  save_path = checkpoint.save("/tmp/ckpt")
+  arrays.x[1, 1] = 4.
+  checkpoint.restore(save_path)
+  assert (arrays.x == numpy.zeros([3, 4])).all()
+
+  second_checkpoint = tf.train.Checkpoint(
+      numpy_arrays=_NumpyState())
+  # Attributes of NumpyState objects are created automatically by restore()
+  second_checkpoint.restore(save_path)
+  assert (second_checkpoint.numpy_arrays.x == numpy.zeros([3, 4])).all()
+  ```
+
+  Note that `NumpyState` objects re-create the attributes of the previously
+  saved object on `restore()`. This is in contrast to TensorFlow variables, for
+  which a `Variable` object must be created and assigned to an attribute.
+
+  This snippet works both when graph building and when executing eagerly. On
+  save, the NumPy array(s) are fed as strings to be saved in the checkpoint (via
+  a placeholder when graph building, or as a string constant when executing
+  eagerly). When restoring they skip the TensorFlow graph entirely, and so no
+  restore ops need be run. This means that restoration always happens eagerly,
+  rather than waiting for `checkpoint.restore(...).run_restore_ops()` like
+  TensorFlow variables when graph building.
+  """
+
+  def __init__(self):
+    super(_NumpyState, self).__setattr__("_arrays", module.Module())
+
+  def __getattribute__(self, name):
+    """Un-wrap `_NumpyWrapper` objects when accessing attributes."""
+    try:
+      arrays = super(_NumpyState, self).__getattribute__("_arrays")
+    except AttributeError:
+      # _arrays hasn't been assigned yet
+      return super(_NumpyState, self).__getattribute__(name)
+    try:
+      value = getattr(arrays, name)
+    except AttributeError:
+      dummy_array = numpy.array([])
+      setattr(arrays, name, _NumpyWrapper(dummy_array))
+      value = getattr(arrays, name)
+      if value.array is dummy_array:
+        # No set or restored attribute with this name
+        delattr(arrays, name)
+        return super(_NumpyState, self).__getattribute__(name)
+
+    if isinstance(value, _NumpyWrapper):
+      return value.array
+    return super(_NumpyState, self).__getattribute__(name)
+
+  def __setattr__(self, name, value):
+    """Automatically wrap NumPy arrays assigned to attributes."""
+    if isinstance(value, (numpy.ndarray, numpy.generic)):
+      try:
+        existing = getattr(self._arrays, name)
+        existing.array = value
+        return
+      except AttributeError:
+        value = _NumpyWrapper(value)
+      setattr(self._arrays, name, value)
+      return
+    super(_NumpyState, self).__setattr__(name, value)
+
+
+class _NumpyWrapper(python_state.PythonState):
+  """Wraps a NumPy array for storage in an object-based checkpoint."""
+
+  def __init__(self, array):
+    """Specify a NumPy array to wrap.
+
+    Args:
+      array: The NumPy array to save and restore (may be overwritten).
+    """
+    self.array = array
+
+  def serialize(self):
+    """Callback to serialize the array."""
+    string_file = io.BytesIO()
+    try:
+      numpy.save(string_file, self.array, allow_pickle=False)
+      serialized = string_file.getvalue()
+    finally:
+      string_file.close()
+    return serialized
+
+  def deserialize(self, string_value):
+    """Callback to deserialize the array."""
+    string_file = io.BytesIO(string_value)
+    try:
+      self.array = numpy.load(string_file, allow_pickle=False)
+    finally:
+      string_file.close()
+
+
+class NumpyStateTests(test.TestCase):
+
+  def testWrapper(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "ckpt")
+    root = util.Checkpoint(numpy=_NumpyWrapper(numpy.array([1.])))
+    save_path = root.save(prefix)
+    root.numpy.array *= 2.
+    self.assertEqual([2.], root.numpy.array)
+    root.restore(save_path)
+    self.assertEqual([1.], root.numpy.array)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestoreNumpyState(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "ckpt")
+    save_state = _NumpyState()
+    saver = util.Checkpoint(numpy=save_state)
+    save_state.a = numpy.ones([2, 2])
+    save_state.b = numpy.ones([2, 2])
+    save_state.b = numpy.zeros([2, 2])
+    save_state.c = numpy.int64(3)
+    self.assertAllEqual(numpy.ones([2, 2]), save_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), save_state.b)
+    self.assertEqual(3, save_state.c)
+    first_save_path = saver.save(prefix)
+    save_state.a[1, 1] = 2.
+    save_state.c = numpy.int64(4)
+    second_save_path = saver.save(prefix)
+
+    load_state = _NumpyState()
+    loader = util.Checkpoint(numpy=load_state)
+    loader.restore(first_save_path).initialize_or_restore()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+    self.assertEqual(3, load_state.c)
+    load_state.a[0, 0] = 42.
+    self.assertAllEqual([[42., 1.], [1., 1.]], load_state.a)
+    loader.restore(first_save_path).run_restore_ops()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    loader.restore(second_save_path).run_restore_ops()
+    self.assertAllEqual([[1., 1.], [1., 2.]], load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+    self.assertEqual(4, load_state.c)
+
+  def testNoGraphPollution(self):
+    graph = ops.Graph()
+    with graph.as_default(), session.Session():
+      directory = self.get_temp_dir()
+      prefix = os.path.join(directory, "ckpt")
+      save_state = _NumpyState()
+      saver = util.Checkpoint(numpy=save_state)
+      save_state.a = numpy.ones([2, 2])
+      save_path = saver.save(prefix)
+      saver.restore(save_path)
+      graph.finalize()
+      saver.save(prefix)
+      save_state.a = numpy.zeros([2, 2])
+      saver.save(prefix)
+      saver.restore(save_path)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDocstringExample(self):
+    arrays = _NumpyState()
+    checkpoint = util.Checkpoint(numpy_arrays=arrays)
+    arrays.x = numpy.zeros([3, 4])
+    save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    arrays.x[1, 1] = 4.
+    checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), arrays.x)
+
+    second_checkpoint = util.Checkpoint(numpy_arrays=_NumpyState())
+    second_checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), second_checkpoint.numpy_arrays.x)
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/tracking/tracking.py
similarity index 56%
rename from tensorflow/python/training/checkpointable/tracking.py
rename to tensorflow/python/training/tracking/tracking.py
index 04fd5547e1002c559b43c241dc25919588167e8b..877b300ff3735da974e10762b5b5b7d8c372a438 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/tracking/tracking.py
@@ -1,4 +1,4 @@
-"""Dependency tracking for checkpointable objects."""
+"""Dependency tracking for trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
 from tensorflow.python.util import tf_contextlib
 
 
@@ -30,21 +30,21 @@ from tensorflow.python.util import tf_contextlib
 _RESOURCE_TRACKER_STACK = []
 
 
-class NotCheckpointable(object):
+class NotTrackable(object):
   """Marks instances of child classes as unsaveable using an object-based API.
 
-  Useful for marking objects which would otherwise look checkpointable because
-  of inheritance (e.g. through `Layer`) as not checkpointable. Inheriting from
-  `NotCheckpointable` does not prevent an object from being assigned to any
+  Useful for marking objects which would otherwise look trackable because
+  of inheritance (e.g. through `Layer`) as not trackable. Inheriting from
+  `NotTrackable` does not prevent an object from being assigned to any
   attributes, but will throw an error on save/restore.
   """
   pass
 
 
-class AutoCheckpointable(base.Checkpointable):
+class AutoTrackable(base.Trackable):
   """Manages dependencies on other objects.
 
-  `Checkpointable` objects may have dependencies: other `Checkpointable` objects
+  `Trackable` objects may have dependencies: other `Trackable` objects
   which should be saved if the object declaring the dependency is saved. A
   correctly saveable program has a dependency graph such that if changing a
   global variable affects an object (e.g. changes the behavior of any of its
@@ -52,34 +52,60 @@ class AutoCheckpointable(base.Checkpointable):
   the variable.
 
   Dependency edges have names, and are created implicitly when a
-  `Checkpointable` object is assigned to an attribute of another
-  `Checkpointable` object. For example:
+  `Trackable` object is assigned to an attribute of another
+  `Trackable` object. For example:
 
   ```
-  obj = Checkpointable()
+  obj = Trackable()
   obj.v = ResourceVariable(0.)
   ```
 
-  The `Checkpointable` object `obj` now has a dependency named "v" on a
+  The `Trackable` object `obj` now has a dependency named "v" on a
   variable.
 
-  `Checkpointable` objects may specify `Tensor`s to be saved and restored
+  `Trackable` objects may specify `Tensor`s to be saved and restored
   directly (e.g. a `Variable` indicating how to save itself) rather than through
   dependencies on other objects. See
-  `Checkpointable._gather_saveables_for_checkpoint` for details.
+  `Trackable._gather_saveables_for_checkpoint` for details.
   """
 
   def __setattr__(self, name, value):
-    """Support self.foo = checkpointable syntax."""
+    """Support self.foo = trackable syntax."""
     if getattr(self, "_setattr_tracking", True):
       value = data_structures.sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
-    super(AutoCheckpointable, self).__setattr__(name, value)
+          trackable=self, value=value, name=name)
+    super(AutoTrackable, self).__setattr__(name, value)
+
+  def __delattr__(self, name):
+    self._maybe_initialize_trackable()
+    if name in self._unconditional_dependency_names:
+      del self._unconditional_dependency_names[name]
+      for index, (dep_name, _) in enumerate(
+          self._unconditional_checkpoint_dependencies):
+        if dep_name == name:
+          del self._unconditional_checkpoint_dependencies[index]
+          break
+    super(AutoTrackable, self).__delattr__(name)
 
   def _no_dependency(self, value):
-    """Override to allow CheckpointableBase to disable dependency tracking."""
+    """Override to allow TrackableBase to disable dependency tracking."""
     return data_structures.NoDependency(value)
 
+  def _list_functions_for_serialization(self):
+    """Return a dict of `Function`s of a trackable."""
+    functions = dict()
+    for attribute_name in dir(self):
+      try:
+        attribute_value = getattr(self, attribute_name, None)
+      except Exception:  # pylint: disable=broad-except
+        # We really don't want to throw an exception just because some object's
+        # attribute accessor is broken.
+        attribute_value = None
+      if isinstance(attribute_value, (def_function.Function,
+                                      defun.ConcreteFunction)):
+        functions[attribute_name] = attribute_value
+    return functions
+
 
 class ResourceTracker(object):
   """An object that tracks a list of resources."""
@@ -124,7 +150,7 @@ def resource_tracker_scope(resource_tracker):
     _RESOURCE_TRACKER_STACK = old
 
 
-class TrackableResource(base.Checkpointable):
+class TrackableResource(base.Trackable):
   """Base class for all resources that need to be tracked."""
 
   def __init__(self):
@@ -134,12 +160,12 @@ class TrackableResource(base.Checkpointable):
 
     self._resource_handle = None
 
-  def create_resource(self):
+  def _create_resource(self):
     """A function that creates a resource handle."""
-    raise NotImplementedError("TrackableResource.create_resource not "
+    raise NotImplementedError("TrackableResource._create_resource not "
                               "implemented.")
 
-  def initialize(self):
+  def _initialize(self):
     """A function that initializes the resource. Optional."""
     pass
 
@@ -147,32 +173,37 @@ class TrackableResource(base.Checkpointable):
   def resource_handle(self):
     """Returns the resource handle associated with this Resource."""
     if self._resource_handle is None:
-      self._resource_handle = self.create_resource()
+      self._resource_handle = self._create_resource()
     return self._resource_handle
 
+  def _list_functions_for_serialization(self):
+    @def_function.function(input_signature=[], autograph=False)
+    def _creator():
+      resource = self._create_resource()
+      return resource
+
+    @def_function.function(input_signature=[], autograph=False)
+    def _initializer():
+      self._initialize()
+      return 1  # Dummy return
+
+    return {
+        "_create_resource": _creator,
+        "_initialize": _initializer,
+    }
+
 
-class TrackableAsset(base.Checkpointable):
+class TrackableAsset(base.Trackable):
   """Base class for asset files which need to be tracked."""
 
   def __init__(self, path):
     """Record the full path to the asset."""
-    # We use a variable here so that @tf.functions do not capture a literal
-    # value. The init_scope prevents functions from capturing `path` in an
+    # The init_scope prevents functions from capturing `path` in an
     # initialization graph, since it is transient and should not end up in a
-    # serialized function body. When serialized in a SavedModel, the variable
-    # will be set during the loading process to its location in the assets/
-    # directory.
+    # serialized function body.
     with ops.init_scope():
-      if context.executing_eagerly():
-        self._path = self._no_dependency(
-            resource_variable_ops.ResourceVariable(
-                path, dtype=dtypes.string,
-                name="asset_path"))
-      else:
-        # Adding a variable is too disruptive when v1-style graph building,
-        # since things may get fed and local variable initializers would then
-        # need to be run.
-        self._path = path
+      self._path = ops.internal_convert_to_tensor(path, dtype=dtypes.string,
+                                                  name="asset_path")
 
   @property
   def asset_path(self):
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/tracking/tracking_test.py
similarity index 81%
rename from tensorflow/python/training/checkpointable/tracking_test.py
rename to tensorflow/python/training/tracking/tracking_test.py
index eb70919b9c99d7c00326e0d2233ad204a10288a1..adef69f45bdad1631fbe988cbbdf0f018f1da288 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/tracking/tracking_test.py
@@ -25,35 +25,35 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import nest
 
 
 class InterfaceTests(test.TestCase):
 
   def testMultipleAssignment(self):
-    root = tracking.AutoCheckpointable()
-    root.leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    root.leaf = tracking.AutoTrackable()
     root.leaf = root.leaf
-    duplicate_name_dep = tracking.AutoCheckpointable()
+    duplicate_name_dep = tracking.AutoTrackable()
     with self.assertRaisesRegexp(ValueError, "already declared"):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
+      root._track_trackable(duplicate_name_dep, name="leaf")
     # No error; we're overriding __setattr__, so we can't really stop people
     # from doing this while maintaining backward compatibility.
     root.leaf = duplicate_name_dep
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     self.assertIs(duplicate_name_dep, root._lookup_dependency("leaf"))
     (_, dep_object), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, dep_object)
 
   def testNoDependency(self):
-    root = tracking.AutoCheckpointable()
-    hasdep = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    hasdep = tracking.AutoTrackable()
     root.hasdep = hasdep
-    nodep = tracking.AutoCheckpointable()
+    nodep = tracking.AutoTrackable()
     root.nodep = data_structures.NoDependency(nodep)
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep)
@@ -66,16 +66,31 @@ class InterfaceTests(test.TestCase):
       def __init__(self):
         super(NoDependencyModel, self).__init__()
         self.a = []
-        self.b = tracking.AutoCheckpointable()
+        self.b = tracking.AutoTrackable()
 
     nodeps = NoDependencyModel()
     self.assertEqual([nodeps], util.list_objects(nodeps))
 
+  def testRemoveDependency(self):
+    root = tracking.AutoTrackable()
+    root.a = tracking.AutoTrackable()
+    self.assertEqual(1, len(root._checkpoint_dependencies))
+    self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
+    self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
+    del root.a
+    self.assertFalse(hasattr(root, "a"))
+    self.assertEqual(0, len(root._checkpoint_dependencies))
+    self.assertEqual(0, len(root._unconditional_checkpoint_dependencies))
+    root.a = tracking.AutoTrackable()
+    self.assertEqual(1, len(root._checkpoint_dependencies))
+    self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
+    self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
+
   def testListBasic(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l.append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
@@ -87,10 +102,10 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testMutationDirtiesList(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l.insert(0, c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "A list element was replaced"):
@@ -98,11 +113,11 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testOutOfBandEditDirtiesList(self):
-    a = tracking.AutoCheckpointable()
-    b = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     held_reference = [b]
     a.l = held_reference
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     held_reference.append(c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "The wrapped list was modified"):
@@ -110,25 +125,25 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testNestedLists(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.l = []
-    b = tracking.AutoCheckpointable()
+    b = tracking.AutoTrackable()
     a.l.append([b])
-    c = tracking.AutoCheckpointable()
+    c = tracking.AutoTrackable()
     a.l[0].append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     a.l[0].append(1)
-    d = tracking.AutoCheckpointable()
+    d = tracking.AutoTrackable()
     a.l[0].append(d)
     a_deps = util.list_objects(a)
     self.assertIn(d, a_deps)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     self.assertNotIn(1, a_deps)
-    e = tracking.AutoCheckpointable()
-    f = tracking.AutoCheckpointable()
+    e = tracking.AutoTrackable()
+    f = tracking.AutoTrackable()
     a.l1 = [[], [e]]
     a.l1[0].append(f)
     a_deps = util.list_objects(a)
@@ -183,7 +198,7 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
-    a = tracking.AutoCheckpointable()
+    a = tracking.AutoTrackable()
     a.l = {"k": [numpy.zeros([2, 2])]}
     self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
                         nest.flatten(a.l))
@@ -200,7 +215,7 @@ class _DummyResource(tracking.TrackableResource):
     self._handle_name = handle_name
     super(_DummyResource, self).__init__()
 
-  def create_resource(self):
+  def _create_resource(self):
     return self._handle_name
 
 
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/tracking/util.py
similarity index 62%
rename from tensorflow/python/training/checkpointable/util.py
rename to tensorflow/python/training/tracking/util.py
index 129ad55f961794b387a38ecc28e539fca5826477..c229fade83c1d88a85c23c801ab22aee4000196f 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/tracking/util.py
@@ -1,4 +1,4 @@
-"""Utilities for saving/loading Checkpointable objects."""
+"""Utilities for saving/loading Trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import collections
 import os
 import weakref
 
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
@@ -32,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
@@ -39,45 +39,29 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import saver as v1_saver_lib
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.saving import functional_saver
-from tensorflow.python.training.saving import saveable_object as saveable_object_lib
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import graph_view as graph_view_lib
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-
-
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
   def __init__(self, object_graph_proto, save_path, save_path_tensor,
-               restore_op_cache, saveable_object_cache):
+               restore_op_cache, graph_view):
     """Specify the checkpoint being loaded.
 
     Args:
-      object_graph_proto: The CheckpointableObjectGraph protocol buffer
+      object_graph_proto: The TrackableObjectGraph protocol buffer
         associated with this checkpoint.
       save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
@@ -87,10 +71,8 @@ class _CheckpointRestoreCoordinator(object):
         `_CheckpointRestoreCoordinator`s for the same Python objects, used to
         look up restore ops by name to avoid re-creating them across multiple
         `restore()` calls.
-      saveable_object_cache: A mapping of checkpointable objects -> attribute
-        names -> list(`SaveableObject`s), used when `SaveableObjects` must be
-        referenced every restore (e.g. for Python state); otherwise they would
-        create their own ops every restore.
+      graph_view: A graph_view_lib.ObjectGraphView object for the restored
+        objects.
     """
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
@@ -98,7 +80,7 @@ class _CheckpointRestoreCoordinator(object):
     # not loaded into any object, for error checking.
     self.unused_attributes = weakref.WeakKeyDictionary()
     # Dictionary mapping from an id in the protocol buffer flat array to
-    # Checkpointable Python objects. This mapping may be deferred if a
+    # Trackable Python objects. This mapping may be deferred if a
     # checkpoint is restored before all dependencies have been tracked. Uses
     # weak references so that partial restorations don't create reference cycles
     # (as objects with deferred dependencies will generally have references to
@@ -108,7 +90,7 @@ class _CheckpointRestoreCoordinator(object):
     # use them (for example because of inconsistent references when
     # loading). Used to make status assertions fail when loading checkpoints
     # that don't quite match.
-    self.all_python_objects = _ObjectIdentityWeakSet()
+    self.all_python_objects = object_identity.ObjectIdentityWeakSet()
     self.save_path_tensor = save_path_tensor
     self.save_path_string = save_path
     self.dtype_map = pywrap_tensorflow.NewCheckpointReader(
@@ -119,7 +101,7 @@ class _CheckpointRestoreCoordinator(object):
     # this checkpoint.
     self.restore_ops = []
     self.restore_ops_by_name = restore_op_cache
-    self.saveable_object_cache = saveable_object_cache
+    self.graph_view = graph_view
     self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
@@ -195,7 +177,7 @@ class _NameBasedRestoreCoordinator(object):
     self.unused_attributes = weakref.WeakKeyDictionary()
     self.restore_uid = ops.uid()
 
-  def globally_named_object_attributes(self, checkpointable):
+  def globally_named_object_attributes(self, trackable):
     """Create globally named SaveableObjects from attributes.
 
     If an object's attribute has no global name specified (default construction
@@ -204,13 +186,13 @@ class _NameBasedRestoreCoordinator(object):
     fail; see `NameBasedSaverStatus`).
 
     Args:
-      checkpointable: An object to save.
+      trackable: An object to save.
 
     Yields:
-      SaveableObjects for `checkpointable`'s attributes.
+      SaveableObjects for `trackable`'s attributes.
     """
     for attribute_name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        trackable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       if callable(saveable_factory):
         try:
           # This saveable object factory does not have a default name= argument,
@@ -219,8 +201,12 @@ class _NameBasedRestoreCoordinator(object):
           # fails.
           saveable = saveable_factory()
         except TypeError:
-          self.unused_attributes.setdefault(checkpointable, []).append(
-              attribute_name)
+          # Even if we can't name this object, we should construct it and check
+          # whether it's optional to restore it. If it's optional we don't need
+          # to make assertions fail.
+          if not saveable_factory("").optional_restore:
+            self.unused_attributes.setdefault(trackable, []).append(
+                attribute_name)
           continue
       else:
         saveable = saveable_factory
@@ -232,14 +218,14 @@ class _NameBasedRestoreCoordinator(object):
             op=op, name=name):
           yield saveable_object
 
-  def eager_restore(self, checkpointable):
-    """Runs restore ops for `checkpointable`'s attributes."""
+  def eager_restore(self, trackable):
+    """Runs restore ops for `trackable`'s attributes."""
     # When graph building, we don't add any restore ops to the graph until
     # run_restore_ops/initialize_or_restore on the status object for name-based
     # checkpoints.
     assert context.executing_eagerly()
     for saveable in self.globally_named_object_attributes(
-        checkpointable):
+        trackable):
       restored_tensors = []
       tensor_missing = False
       for spec in saveable.specs:
@@ -255,7 +241,10 @@ class _NameBasedRestoreCoordinator(object):
         else:
           tensor_missing = True
 
-      if not tensor_missing:
+      if tensor_missing:
+        # Record that this variable didn't match so assertions will fail.
+        self.unused_attributes.setdefault(trackable, []).append(saveable.name)
+      else:
         # Ignores values missing from the checkpoint, as with object-based
         # restore. Status assertions can be used to check exact matches,
         # although it's unlikely to ever happen for name-based checkpoints.
@@ -299,10 +288,10 @@ def _default_getter(name, shape, dtype, initializer=None,
     )
 
 
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+def add_variable(trackable, name, shape=None, dtype=dtypes.float32,
                  initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+  """Add a variable to a Trackable with no scope influence."""
+  return trackable._add_variable_with_custom_getter(  # pylint: disable=protected-access
       name=name, shape=shape, dtype=dtype,
       initializer=initializer, getter=_default_getter)
 
@@ -325,7 +314,7 @@ def object_metadata(save_path):
     save_path: The path to the checkpoint, as returned by `save` or
       `tf.train.latest_checkpoint`.
   Returns:
-    A parsed `tf.contrib.checkpoint.CheckpointableObjectGraph` protocol buffer.
+    A parsed `tf.contrib.checkpoint.TrackableObjectGraph` protocol buffer.
   Raises:
     ValueError: If an object graph was not found in the checkpoint.
   """
@@ -340,503 +329,44 @@ def object_metadata(save_path):
          'saver and does not contain an object dependency graph.') % (
              save_path, base.OBJECT_GRAPH_PROTO_KEY))
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   return object_graph_proto
 
 
-class _ObjectIdentityWrapper(object):
-  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
-
-  Since __eq__ is based on object identity, it's safe to also define __hash__
-  based on object ids. This lets us add unhashable types like checkpointable
-  _ListWrapper objects to object-identity collections.
-  """
-
-  def __init__(self, wrapped):
-    self._wrapped = wrapped
-
-  @property
-  def unwrapped(self):
-    return self._wrapped
-
-  def __eq__(self, other):
-    if isinstance(other, _ObjectIdentityWrapper):
-      return self._wrapped is other._wrapped  # pylint: disable=protected-access
-    return self._wrapped is other
-
-  def __hash__(self):
-    # Wrapper id() is also fine for weakrefs. In fact, we rely on
-    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
-    # weakref.ref(a) in _WeakObjectIdentityWrapper.
-    return id(self._wrapped)
-
-
-class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
-
-  def __init__(self, wrapped):
-    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
-
-  @property
-  def unwrapped(self):
-    return self._wrapped()
-
-
-class ObjectIdentityDictionary(collections.MutableMapping):
-  """A mutable mapping data structure which compares using "is".
-
-  This is necessary because we have checkpointable objects (_ListWrapper) which
-  have behavior identical to built-in Python lists (including being unhashable
-  and comparing based on the equality of their contents by default).
-  """
-
-  def __init__(self):
-    self._storage = {}
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __getitem__(self, key):
-    return self._storage[self._wrap_key(key)]
-
-  def __setitem__(self, key, value):
-    self._storage[self._wrap_key(key)] = value
-
-  def __delitem__(self, key):
-    del self._storage[self._wrap_key(key)]
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    for key in self._storage:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary):
-  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len(list(self._storage))
-
-  def __iter__(self):
-    keys = self._storage.keys()
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        del self[key]
-      else:
-        yield unwrapped
-
-
-class _ObjectIdentitySet(collections.MutableSet):
-  """Like the built-in set, but compares objects with "is"."""
-
-  def __init__(self, *args):
-    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __contains__(self, key):
-    return self._wrap_key(key) in self._storage
-
-  def discard(self, key):
-    self._storage.discard(self._wrap_key(key))
-
-  def add(self, key):
-    self._storage.add(self._wrap_key(key))
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakSet(_ObjectIdentitySet):
-  """Like weakref.WeakSet, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len([_ for _ in self])
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        self.discard(key)
-      else:
-        yield unwrapped
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = ObjectIdentityDictionary()
-  path_to_root[root_checkpointable] = ()
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    if isinstance(current_checkpointable, tracking.NotCheckpointable):
-      raise NotImplementedError(
-          ("The object %s does not support object-based saving. File a feature "
-           "request if this limitation bothers you. In the meantime, you can "
-           "remove the dependency on this object and save everything else.")
-          % (current_checkpointable,))
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = ObjectIdentityDictionary()
-  for checkpointable in non_slot_objects:
-    if (isinstance(checkpointable, optimizer_v1.Optimizer)
-        # TODO(b/110718070): Fix Keras imports.
-        or hasattr(checkpointable, "_create_or_restore_slot_variable")):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except (AttributeError, KeyError):
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .CheckpointableObject.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _add_attributes_to_object_graph(
-    checkpointable_objects, object_graph_proto, node_ids, object_names,
-    saveables_cache, object_map):
-  """Create SaveableObjects and corresponding SerializedTensor protos."""
-  named_saveable_objects = []
-  if saveables_cache is None:
-    # No SaveableObject caching. Either we're executing eagerly, or building a
-    # static save which is specialized to the current Python state.
-    feed_additions = None
-  else:
-    # If we are caching SaveableObjects, we need to build up a feed_dict with
-    # functions computing volatile Python state to be saved with the checkpoint.
-    feed_additions = {}
-  for checkpoint_id, (checkpointable, object_proto) in enumerate(
-      zip(checkpointable_objects, object_graph_proto.nodes)):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_name = object_names[checkpointable]
-    if object_map:
-      object_to_save = object_map.get(checkpointable, checkpointable)
-    else:
-      object_to_save = checkpointable
-    if saveables_cache is not None:
-      cached_attributes = saveables_cache.setdefault(object_to_save, {})
-    else:
-      cached_attributes = None
-
-    for name, saveable_factory in (
-        object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if cached_attributes is None:
-        saveables = None
-      else:
-        saveables = cached_attributes.get(name, None)
-        if saveables is not None:
-          for saveable in saveables:
-            if attribute.checkpoint_key not in saveable.name:
-              # The checkpoint key for this SaveableObject is different. We need
-              # to re-create it.
-              saveables = None
-              del cached_attributes[name]
-              break
-      if saveables is None:
-        if callable(saveable_factory):
-          maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
-        else:
-          maybe_saveable = saveable_factory
-        if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
-          saveables = (maybe_saveable,)
-        else:
-          # Figure out the name-based Saver's name for this variable. If it's
-          # already a SaveableObject we'd just get the checkpoint key back, so
-          # we leave full_name blank.
-          saver_dict = saveable_object_util.op_list_to_dict(
-              [maybe_saveable], convert_variable_to_tensor=False)
-          full_name, = saver_dict.keys()
-          saveables = tuple(saveable_object_util.saveable_objects_for_op(
-              op=maybe_saveable, name=attribute.checkpoint_key))
-          for saveable in saveables:
-            saveable.full_name = full_name
-        for saveable in saveables:
-          if attribute.checkpoint_key not in saveable.name:
-            raise AssertionError(
-                ("The object %s produced a SaveableObject with name '%s' for "
-                 "attribute '%s'. Expected a name containing '%s'.")
-                % (checkpointable, name, saveable.name,
-                   attribute.checkpoint_key))
-        if cached_attributes is not None:
-          cached_attributes[name] = saveables
-
-      optional_restore = None
-      for saveable in saveables:
-        if optional_restore is None:
-          optional_restore = saveable.optional_restore
-        else:
-          optional_restore = optional_restore and saveable.optional_restore
-
-        if hasattr(saveable, "full_name"):
-          attribute.full_name = saveable.full_name
-        if isinstance(saveable, base.PythonStateSaveable):
-          if feed_additions is None:
-            assert saveables_cache is None
-            # If we're not caching saveables, then we're either executing
-            # eagerly or building a static save/restore (e.g. for a
-            # SavedModel). In either case, we should embed the current Python
-            # state in the graph rather than relying on a feed dict.
-            saveable = saveable.freeze()
-          else:
-            saveable_feed_dict = saveable.feed_dict_additions()
-            for new_feed_key in saveable_feed_dict.keys():
-              if new_feed_key in feed_additions:
-                raise AssertionError(
-                    ("The object %s tried to feed a value for the Tensor %s "
-                     "when saving, but another object is already feeding a "
-                     "value.")
-                    % (checkpointable, new_feed_key))
-            feed_additions.update(saveable_feed_dict)
-        named_saveable_objects.append(saveable)
-      if optional_restore is None:
-        optional_restore = False
-      attribute.optional_restore = optional_restore
-
-  return named_saveable_objects, feed_additions
-
-
-def fill_object_graph_proto(checkpointable_objects,
-                            node_ids,
-                            slot_variables,
-                            object_graph_proto=None):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  if object_graph_proto is None:
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-  return object_graph_proto
-
-
-def _serialize_gathered_objects(
-    checkpointable_objects, path_to_root, saveables_cache, object_map):
-  """Create SaveableObjects and protos for gathered objects."""
-  object_names = ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  object_graph_proto = fill_object_graph_proto(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      slot_variables=slot_variables)
-  named_saveable_objects, feed_additions = _add_attributes_to_object_graph(
-      checkpointable_objects=checkpointable_objects,
-      object_graph_proto=object_graph_proto,
-      node_ids=node_ids,
-      object_names=object_names,
-      saveables_cache=saveables_cache,
-      object_map=object_map)
-  return named_saveable_objects, object_graph_proto, feed_additions
-
-
-def _serialize_object_graph(root_checkpointable, saveables_cache):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-    saveables_cache: A dictionary mapping `Checkpointable` objects -> attribute
-      names -> SaveableObjects, used to avoid re-creating SaveableObjects when
-      graph building.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto, feed_additions):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-      feed_additions: A dictionary mapping from Tensors to values which should
-        be fed when saving.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  return _serialize_gathered_objects(
-      checkpointable_objects, path_to_root, saveables_cache, object_map=None)
-
-
-def named_saveables(root_checkpointable):
-  """Gather list of all SaveableObjects in the Checkpointable object."""
-  return _serialize_object_graph(root_checkpointable, None)[0]
-
-
-def find_objects(root_checkpointable):
-  """Find and number objects which are dependencies of `root_checkpointable`."""
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return checkpointable_objects, node_ids, slot_variables
-
-
-def list_objects(root_checkpointable):
+def list_objects(root_trackable):
   """Traverse the object graph and list all accessible objects.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable`. Includes slot variables only if the variable they are
-  slotting for and the optimizer are dependencies of `root_checkpointable`
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable`. Includes slot variables only if the variable they are
+  slotting for and the optimizer are dependencies of `root_trackable`
   (i.e. if they would be saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object whose dependencies should be
+    root_trackable: A `Trackable` object whose dependencies should be
       flattened.
   Returns:
     A flat list of objects.
   """
-  checkpointable_objects, _, _ = find_objects(root_checkpointable)
-  return checkpointable_objects
+  return graph_view_lib.ObjectGraphView(root_trackable).list_objects()
 
 
-def gather_initializers(root_checkpointable):
+def gather_initializers(root_trackable):
   """Traverse the object graph and find initialization ops.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable` and which have an `initializer` property. Includes
   initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  the optimizer are dependencies of `root_trackable` (i.e. if they would be
   saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
+    root_trackable: A `Trackable` object to gather initializers for.
   Returns:
     A list of initialization ops.
   """
-  checkpointable_objects = list_objects(root_checkpointable)
-  return [c.initializer for c in checkpointable_objects
+  trackable_objects = list_objects(root_trackable)
+  return [c.initializer for c in trackable_objects
           if hasattr(c, "initializer") and c.initializer is not None]
 
 
@@ -846,7 +376,7 @@ def capture_dependencies(template):
 
   Requires that `template.variable_scope` is active.
 
-  This scope is intended as a compatibility measure, allowing a checkpointable
+  This scope is intended as a compatibility measure, allowing a trackable
   object to add dependencies on variables created in a block of code which is
   not aware of object-based saving (and instead uses variable names
   heavily). This is how `Template` objects add dependencies on variables and
@@ -860,17 +390,17 @@ def capture_dependencies(template):
   """
   name_prefix = template.variable_scope.name
 
-  def _checkpointable_custom_creator(next_creator, name, initial_value,
-                                     checkpointable_parent=None, **kwargs):
-    """A variable creation hook which adds Checkpointable dependencies.
+  def _trackable_custom_creator(next_creator, name, initial_value,
+                                trackable_parent=None, **kwargs):
+    """A variable creation hook which adds Trackable dependencies.
 
     Set for example during a `Template`'s first wrapped function
-    execution. Ensures that (a) `template` depends on any checkpointable
+    execution. Ensures that (a) `template` depends on any trackable
     objects using their own `capture_dependencies` scope inside this scope which
     create variables, and (b) that any variables not in a more deeply nested
     scope are added as dependencies directly.
 
-    The `checkpointable_parent` argument is passed between custom creators but
+    The `trackable_parent` argument is passed between custom creators but
     ignored when the variable object itself is created. This argument indicates
     (if not `None`) that a more deeply nested scope has already added the
     variable as a dependency, and that parent scopes should add a dependency on
@@ -884,8 +414,8 @@ def capture_dependencies(template):
         but scopes opened within this scope are respected.
       initial_value: See `variable_scope.variable_creator_scope`. Taken
         explicitly so the argument can be re-named and used with
-        `Checkpointable._add_variable_with_custom_getter`.
-      checkpointable_parent: If not None, a more deeply nested checkpointable
+        `Trackable._add_variable_with_custom_getter`.
+      trackable_parent: If not None, a more deeply nested trackable
         object and its name prefix which were passed to `capture_dependencies`
         to add a dependency on (rather than depending on the variable directly).
       **kwargs: Passed through to the next creator.
@@ -902,28 +432,28 @@ def capture_dependencies(template):
           **inner_kwargs)
     if name is not None and name.startswith(name_prefix):
       scope_stripped_name = name[len(name_prefix) + 1:]
-      if not checkpointable_parent:
+      if not trackable_parent:
         return template._add_variable_with_custom_getter(  # pylint: disable=protected-access
             initializer=initial_value,
             name=scope_stripped_name,
             getter=_call_next_creator_renaming_initializer,
-            # Disable error checking for Checkpointable. Exceptions are instead
+            # Disable error checking for Trackable. Exceptions are instead
             # raised if necessary when the object-based saver tries to
             # save/restore the object.
             overwrite=True,
-            checkpointable_parent=(template, name_prefix),
+            trackable_parent=(template, name_prefix),
             **kwargs)
       else:
-        parent_object, parent_name_prefix = checkpointable_parent
-        template._track_checkpointable(  # pylint: disable=protected-access
+        parent_object, parent_name_prefix = trackable_parent
+        template._track_trackable(  # pylint: disable=protected-access
             parent_object,
             name=parent_name_prefix[len(name_prefix) + 1:],
             overwrite=True)
     return next_creator(
         name=name, initial_value=initial_value,
-        checkpointable_parent=(template, name_prefix), **kwargs)
+        trackable_parent=(template, name_prefix), **kwargs)
 
-  with variable_scope.variable_creator_scope(_checkpointable_custom_creator):
+  with variable_scope.variable_creator_scope(_trackable_custom_creator):
     yield
 
 
@@ -999,10 +529,10 @@ class CheckpointLoadStatus(_LoadStatus):
   See `Saver.restore` for usage examples.
   """
 
-  def __init__(self, checkpoint, feed_dict, root_checkpointable):
+  def __init__(self, checkpoint, feed_dict, graph_view):
     self._checkpoint = checkpoint
     self._feed_dict = feed_dict
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Asserts that all objects in the checkpoint have been created/matched.
@@ -1017,8 +547,8 @@ class CheckpointLoadStatus(_LoadStatus):
     """
     self.assert_existing_objects_matched()
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if trackable is None:
         raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
     if self._checkpoint.slot_restorations:
       # Sanity check; this collection should be clear if everything has been
@@ -1033,7 +563,7 @@ class CheckpointLoadStatus(_LoadStatus):
     return self
 
   def assert_existing_objects_matched(self):
-    """Asserts that checkpointable Python objects have been matched.
+    """Asserts that trackable Python objects have been matched.
 
     Note that this is a weaker assertion than `assert_consumed`. It will only
     fail for existing Python objects which are (transitive) dependencies of the
@@ -1050,22 +580,23 @@ class CheckpointLoadStatus(_LoadStatus):
         of the root object but does not have a value in the checkpoint.
     """
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if (checkpointable is not None
-          and checkpointable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if (trackable is not None
+          and trackable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
         raise AssertionError(
             "Object not assigned a value from checkpoint: %s" % (node,))
-    for checkpointable_object in list_objects(self._root_checkpointable):
+    for trackable_object in self._graph_view.list_objects():
       # Remove data structures that do not contain any variables from
       # restoration checks.
-      if (isinstance(checkpointable_object,
-                     data_structures.CheckpointableDataStructure) and
-          not checkpointable_object._checkpoint_dependencies):
+      if (isinstance(trackable_object,
+                     data_structures.TrackableDataStructure) and
+          not trackable_object._checkpoint_dependencies):
         continue
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+      self._checkpoint.all_python_objects.add(trackable_object)
     unused_python_objects = (
-        _ObjectIdentitySet(self._checkpoint.all_python_objects)
-        - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+        object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects)
+        - object_identity.ObjectIdentitySet(
+            self._checkpoint.object_by_proto_id.values()))
     if unused_python_objects:
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
@@ -1075,12 +606,14 @@ class CheckpointLoadStatus(_LoadStatus):
 
   def assert_nontrivial_match(self):
     """Raises an exception if only the root object matched."""
-    for checkpointable_object in list_objects(self._root_checkpointable):
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+    for trackable_object in self._graph_view.list_objects():
+      self._checkpoint.all_python_objects.add(trackable_object)
     if len(self._checkpoint.object_by_proto_id) <= 1:
       unused_python_objects = (
-          _ObjectIdentitySet(self._checkpoint.all_python_objects)
-          - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+          object_identity.ObjectIdentitySet(
+              self._checkpoint.all_python_objects)
+          - object_identity.ObjectIdentitySet(
+              self._checkpoint.object_by_proto_id.values()))
       if unused_python_objects:
         raise AssertionError(
             ("Nothing except the root object matched a checkpointed value. "
@@ -1090,7 +623,7 @@ class CheckpointLoadStatus(_LoadStatus):
       else:
         raise AssertionError(
             "Nothing to load. No dependencies have been added to %s yet." % (
-                self._root_checkpointable,))
+                self._graph_view.root,))
     return self
 
   def run_restore_ops(self, session=None):
@@ -1120,8 +653,8 @@ class CheckpointLoadStatus(_LoadStatus):
       return  # Initialization and restoration ops are run eagerly
     if session is None:
       session = ops.get_default_session()
-    all_objects = list_objects(self._root_checkpointable)
-    already_initialized_objects = _ObjectIdentitySet(
+    all_objects = self._graph_view.list_objects()
+    already_initialized_objects = object_identity.ObjectIdentitySet(
         self._checkpoint.object_by_proto_id.values())
     initializers_for_non_restored_variables = [
         c.initializer for c in all_objects
@@ -1143,9 +676,9 @@ class InitializationOnlyStatus(_LoadStatus):
   otherwise.
   """
 
-  def __init__(self, root_checkpointable, restore_uid):
+  def __init__(self, graph_view, restore_uid):
     self._restore_uid = restore_uid
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
@@ -1193,9 +726,9 @@ class InitializationOnlyStatus(_LoadStatus):
       return  # run eagerly
     if session is None:
       session = ops.get_default_session()
-    checkpointable_objects = list_objects(self._root_checkpointable)
+    trackable_objects = self._graph_view.list_objects()
     initializers = [
-        c.initializer for c in checkpointable_objects
+        c.initializer for c in trackable_objects
         if hasattr(c, "initializer") and c.initializer is not None
         and (getattr(c, "_update_uid", self._restore_uid - 1)
              < self._restore_uid)]
@@ -1218,9 +751,9 @@ class NameBasedSaverStatus(_LoadStatus):
   # interferes with isinstance checks.
   @deprecation.deprecated(
       date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
-  def __init__(self, checkpoint, root_checkpointable):
+  def __init__(self, checkpoint, graph_view):
     self._checkpoint = checkpoint
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Raises an exception if any variables/objects are unmatched."""
@@ -1229,11 +762,11 @@ class NameBasedSaverStatus(_LoadStatus):
       raise AssertionError(
           "Some objects had attributes which were not restored: %s"
           % (unused_attributes,))
-    for checkpointable in list_objects(self._root_checkpointable):
+    for trackable in self._graph_view.list_objects():
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        raise AssertionError("Object not restored: %s" % (checkpointable,))
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        raise AssertionError("Object not restored: %s" % (trackable,))
       # pylint: enable=protected-access
     return self
 
@@ -1255,19 +788,19 @@ class NameBasedSaverStatus(_LoadStatus):
 
   def _gather_saveable_objects(self):
     """Walk the object graph, using global names for SaveableObjects."""
-    objects = list_objects(self._root_checkpointable)
+    objects = self._graph_view.list_objects()
     saveable_objects = []
-    for checkpointable in objects:
+    for trackable in objects:
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        checkpointable._update_uid = self._checkpoint.restore_uid
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        trackable._update_uid = self._checkpoint.restore_uid
       else:
         continue
       # pylint: enable=protected-access
       saveable_objects.extend(
           self._checkpoint.globally_named_object_attributes(
-              checkpointable))
+              trackable))
     return saveable_objects
 
   def run_restore_ops(self, session=None):
@@ -1303,36 +836,32 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-class CheckpointableSaver(object):
-  """Saves and restores a `Checkpointable` object and its dependencies.
+class TrackableSaver(object):
+  """Saves and restores a `Trackable` object and its dependencies.
 
-  See `Checkpointable` for details of dependency management. `Saver` wraps
+  See `Trackable` for details of dependency management. `Saver` wraps
   `tf.train.Saver` for saving, including extra information about the graph of
   dependencies between Python objects. When restoring, it uses this information
   about the save-time dependency graph to more robustly match objects with their
   checkpointed values. When executing eagerly, it supports restoring variables
   on object creation (see `Saver.restore`).
 
-  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  Values in a checkpoint are mapped to `Trackable` Python objects
   (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
   checkpoint was written. To avoid breaking existing checkpoints when modifying
-  a class, dependency names (the names of attributes to which `Checkpointable`
+  a class, dependency names (the names of attributes to which `Trackable`
   objects are assigned) may not change. These names are local to objects, in
   contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
   so allow additional program transformations.
   """
 
-  def __init__(self, root_checkpointable):
+  def __init__(self, graph_view):
     """Configure saving.
 
     Args:
-      root_checkpointable: The root of the object graph to save/restore. This
-        object and all of its dependencies are saved in the checkpoint. When
-        restoring, objects are matched and restored starting from this root.
+      graph_view: A `GraphView` object containing a description of the object
+        graph to save.
     """
-    # Allow passing in a weak reference to avoid reference cycles when
-    # `Checkpointable` objects save themselves.
-    self._root_checkpointable_ref = root_checkpointable
     # The file prefix placeholder is created lazily when graph building (and not
     # at all when executing eagerly) to avoid creating ops in the constructor
     # (when they may never be necessary).
@@ -1346,34 +875,13 @@ class CheckpointableSaver(object):
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
-
-    if context.executing_eagerly():
-      # SaveableObjects are always recreated when executing eagerly.
-      self._saveable_object_cache = None
-    else:
-      # Maps Checkpointable objects -> attribute names -> list(SaveableObjects),
-      # to avoid re-creating SaveableObjects when graph building.
-      self._saveable_object_cache = _ObjectIdentityWeakKeyDictionary()
-
-  @property
-  def _root_checkpointable(self):
-    if isinstance(self._root_checkpointable_ref, weakref.ref):
-      derefed = self._root_checkpointable_ref()
-      assert derefed is not None
-      return derefed
-    else:
-      return self._root_checkpointable_ref
+    self._graph_view = graph_view
 
   def _gather_saveables(
-      self, object_graph_tensor=None, saveable_object_cache=None):
+      self, object_graph_tensor=None):
     """Wraps _serialize_object_graph to include the object graph proto."""
-    assert ((object_graph_tensor is None and saveable_object_cache is None)
-            or (object_graph_tensor is not None
-                and saveable_object_cache is not None))
     (named_saveable_objects, graph_proto,
-     feed_additions) = _serialize_object_graph(
-         self._root_checkpointable,
-         saveables_cache=saveable_object_cache)
+     feed_additions) = self._graph_view.serialize_object_graph()
     if object_graph_tensor is None:
       with ops.device("/cpu:0"):
         object_graph_tensor = constant_op.constant(
@@ -1388,52 +896,16 @@ class CheckpointableSaver(object):
             name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def gather_objects(self, object_map=None, to_graph=None):
-    """Creates SaveableObjects with the current object graph frozen."""
-    checkpointable_objects, path_to_root = (
-        _breadth_first_checkpointable_traversal(self._root_checkpointable))
-    if to_graph:
-      target_context = to_graph.as_default
-    else:
-      target_context = ops.NullContextmanager
-    with target_context():
-      named_saveable_objects, graph_proto, _ = _serialize_gathered_objects(
-          checkpointable_objects,
-          path_to_root,
-          saveables_cache=None,
-          object_map=object_map)
-      with ops.device("/cpu:0"):
-        object_graph_tensor = constant_op.constant(
-            graph_proto.SerializeToString(), dtype=dtypes.string)
-      named_saveable_objects.append(
-          base.NoRestoreSaveable(
-              tensor=object_graph_tensor,
-              name=base.OBJECT_GRAPH_PROTO_KEY))
-    return named_saveable_objects
-
-  def freeze(self, object_map=None, to_graph=None):
-    named_saveable_objects = self.gather_objects(
-        object_map=object_map, to_graph=to_graph)
-    return functional_saver.Saver(named_saveable_objects)
-
   def _save_cached_when_graph_building(
       self,
       file_prefix,
-      object_graph_tensor=None,
-      saveable_object_cache=None):
+      object_graph_tensor=None):
     """Create or retrieve save ops.
 
-    When graph building, `saveable_object_cache` will typically be non-`None`,
-    meaning that existing `SaveableObject`s are re-used across calls to
-    `_prepare_save` even if the object graph has grown. This avoids
-    unnecessarily re-creating save ops.
-
     Args:
       file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
-      saveable_object_cache: A dictionary; if specified, used to cache
-        `SaveableObject`s.
 
     Returns:
       A two-element tuple with a filename tensor and a feed_dict of tensors to
@@ -1443,14 +915,14 @@ class CheckpointableSaver(object):
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
-         object_graph_tensor=object_graph_tensor,
-         saveable_object_cache=saveable_object_cache)
+         object_graph_tensor=object_graph_tensor)
     if (self._last_save_object_graph != graph_proto
         # When executing eagerly, we need to re-create SaveableObjects each time
         # save() is called so they pick up new Tensors passed to their
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
-        or context.executing_eagerly()):
+        or context.executing_eagerly()
+        or ops.inside_function()):
       saver = functional_saver.Saver(named_saveable_objects)
       with ops.device("/cpu:0"):
         self._cached_save_operation = saver.save(file_prefix)
@@ -1461,7 +933,7 @@ class CheckpointableSaver(object):
     """Save a training checkpoint.
 
     The saved checkpoint includes variables created by this object and any
-    Checkpointable objects it depends on at the time `Saver.save()` is called.
+    Trackable objects it depends on at the time `Saver.save()` is called.
 
     Args:
       file_prefix: A prefix to use for the checkpoint filenames
@@ -1470,8 +942,8 @@ class CheckpointableSaver(object):
       checkpoint_number: An integer variable or Tensor, used to number
         checkpoints. Typically this value is saved along with other variables in
         training checkpoints, which will happen automatically if it was created
-        by `root_checkpointable` or one of its dependencies (via
-        `Checkpointable._add_variable`).
+        by `root_trackable` or one of its dependencies (via
+        `Trackable._add_variable`).
       session: The session to evaluate variables in. Ignored when executing
         eagerly. If not provided when graph building, the default session is
         used.
@@ -1480,10 +952,11 @@ class CheckpointableSaver(object):
       The full path to the checkpoint.
     """
     feed_dict = {}
-    graph_building = not context.executing_eagerly()
+    use_session = (not context.executing_eagerly()
+                   and not ops.inside_function())
     if checkpoint_number:
       file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
-    if graph_building:
+    if use_session:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
@@ -1502,28 +975,26 @@ class CheckpointableSaver(object):
     file_io.recursive_create_dir(os.path.dirname(file_prefix))
     save_path, new_feed_additions = self._save_cached_when_graph_building(
         file_prefix=file_prefix_tensor,
-        object_graph_tensor=object_graph_tensor,
-        saveable_object_cache=self._saveable_object_cache)
+        object_graph_tensor=object_graph_tensor)
     if new_feed_additions:
       feed_dict.update(new_feed_additions)
-    if not graph_building:
+    if not use_session:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
     if session:
-      save_path = session.run(save_path, feed_dict=feed_dict)
+      return session.run(save_path, feed_dict=feed_dict)
     else:
-      save_path = save_path.numpy()
-    return save_path
+      return save_path
 
   def restore(self, save_path):
     """Restore a training checkpoint.
 
-    Restores `root_checkpointable` and any objects that it tracks
+    Restores `root_trackable` and any objects that it tracks
     (transitive). Either assigns values immediately if variables to restore have
     been created already, or defers restoration until the variables are
-    created. Dependencies added to the `root_checkpointable` passed to the
+    created. Dependencies added to the `root_trackable` passed to the
     constructor after this call will be matched if they have a corresponding
     object in the checkpoint.
 
@@ -1576,7 +1047,7 @@ class CheckpointableSaver(object):
       object is returned which runs restore ops from a name-based saver.
     """
     if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
+      return InitializationOnlyStatus(self._graph_view, ops.uid())
     reader = pywrap_tensorflow.NewCheckpointReader(save_path)
     graph_building = not context.executing_eagerly()
     if graph_building:
@@ -1592,15 +1063,15 @@ class CheckpointableSaver(object):
       restore_coordinator = _NameBasedRestoreCoordinator(
           save_path=save_path, dtype_map=dtype_map)
       if not graph_building:
-        for existing_checkpointable in list_objects(self._root_checkpointable):
+        for existing_trackable in self._graph_view.list_objects():
           # pylint: disable=protected-access
-          existing_checkpointable._maybe_initialize_checkpointable()
-          existing_checkpointable._name_based_restores.add(restore_coordinator)
-          existing_checkpointable._name_based_attribute_restore(
+          existing_trackable._maybe_initialize_trackable()
+          existing_trackable._name_based_restores.add(restore_coordinator)
+          existing_trackable._name_based_attribute_restore(
               restore_coordinator)
           # pylint: enable=protected-access
       return NameBasedSaverStatus(
-          restore_coordinator, root_checkpointable=self._root_checkpointable)
+          restore_coordinator, graph_view=self._graph_view)
 
     if graph_building:
       if self._file_prefix_placeholder is None:
@@ -1613,25 +1084,25 @@ class CheckpointableSaver(object):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
     object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+        trackable_object_graph_pb2.TrackableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
     checkpoint = _CheckpointRestoreCoordinator(
         object_graph_proto=object_graph_proto,
         save_path=save_path,
         save_path_tensor=file_prefix_tensor,
         restore_op_cache=self._restore_op_cache,
-        saveable_object_cache=self._saveable_object_cache)
-    base._CheckpointPosition(  # pylint: disable=protected-access
-        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+        graph_view=self._graph_view)
+    base.CheckpointPosition(checkpoint=checkpoint, proto_id=0).restore(
+        self._graph_view.root)
     load_status = CheckpointLoadStatus(
         checkpoint,
-        root_checkpointable=self._root_checkpointable,
+        graph_view=self._graph_view,
         feed_dict=file_prefix_feed_dict)
     return load_status
 
 
-def frozen_saver(root_checkpointable):
-  """Creates a static `tf.train.Saver` from a checkpointable object.
+def frozen_saver(root_trackable):
+  """Creates a static `tf.train.Saver` from a trackable object.
 
   The returned `Saver` saves object-based checkpoints, but these checkpoints
   will no longer reflect structural changes to the object graph, only changes to
@@ -1645,21 +1116,35 @@ def frozen_saver(root_checkpointable):
   object graph and the current Python object graph.
 
   Args:
-    root_checkpointable: A checkpointable object to save.
+    root_trackable: A trackable object to save.
 
   Returns:
-    A `tf.train.Saver` which saves object-based checkpoints for the object graph
-    frozen at the time `frozen_saver` was called.
+    A saver which saves object-based checkpoints for the object graph frozen at
+    the time `frozen_saver` was called.
   """
-  return CheckpointableSaver(root_checkpointable).freeze()
+  named_saveable_objects = graph_view_lib.ObjectGraphView(
+      root_trackable).frozen_saveable_objects()
+  return functional_saver.Saver(named_saveable_objects)
 
 
-@tf_export("train.Checkpoint")
-class Checkpoint(tracking.AutoCheckpointable):
-  """Groups checkpointable objects, saving and restoring them.
+def saver_with_op_caching(obj):
+  """A TrackableSaver with a SaveableObject cache when graph building."""
+  if context.executing_eagerly():
+    saveables_cache = None
+  else:
+    saveables_cache = object_identity.ObjectIdentityWeakKeyDictionary()
+  return TrackableSaver(graph_view_lib.ObjectGraphView(
+      weakref.ref(obj),
+      saveables_cache=saveables_cache))
+
+
+# Mentions graph building / Sessions. The v2 version is below.
+@tf_export(v1=["train.Checkpoint"])
+class CheckpointV1(tracking.AutoTrackable):
+  """Groups trackable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
-  that contain checkpointable state, such as `tf.train.Optimizer`
+  that contain trackable state, such as `tf.train.Optimizer`
   implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
   `tf.keras.Model` implementations. It saves these values with a checkpoint, and
   maintains a `save_counter` for numbering checkpoints.
@@ -1751,23 +1236,23 @@ class Checkpoint(tracking.AutoCheckpointable):
 
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Values must be checkpointable objects.
+        saved with the checkpoint. Values must be trackable objects.
     Raises:
-      ValueError: If objects in `kwargs` are not checkpointable.
+      ValueError: If objects in `kwargs` are not trackable.
     """
-    super(Checkpoint, self).__init__()
+    super(CheckpointV1, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, (base.Checkpointable, def_function.Function)):
+      if not isinstance(v, (base.Trackable, def_function.Function)):
         raise ValueError(
-            ("`Checkpoint` was expecting a checkpointable object (an object "
-             "derived from `CheckpointableBase`), got %s. If you believe this "
-             "object should be checkpointable (i.e. it is part of the "
+            ("`Checkpoint` was expecting a trackable object (an object "
+             "derived from `TrackableBase`), got %s. If you believe this "
+             "object should be trackable (i.e. it is part of the "
              "TensorFlow Python API and manages state), please open an issue.")
             % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
     self._save_assign_op = None
-    self._saver = CheckpointableSaver(weakref.ref(self))
+    self._saver = saver_with_op_caching(self)
 
   def _maybe_create_save_counter(self):
     """Create a save counter if it does not yet exist."""
@@ -1784,7 +1269,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     """Writes a training checkpoint.
 
     The checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.write()` is
+    trackable objects it depends on at the time `Checkpoint.write()` is
     called.
 
     `write` does not number checkpoints, increment `save_counter`, or update the
@@ -1802,9 +1287,18 @@ class Checkpoint(tracking.AutoCheckpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return compat.as_str(self._saver.save(
+    output = self._saver.save(
         file_prefix=file_prefix,
-        session=session))
+        session=session)
+    if tensor_util.is_tensor(output):
+      if context.executing_eagerly():
+        return compat.as_str(output.numpy())
+      else:
+        # Function building
+        return output
+    else:
+      # Graph + Session, so we already session.ran it.
+      return compat.as_str(output)
 
   @property
   def save_counter(self):
@@ -1822,7 +1316,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     """Saves a training checkpoint and provides basic checkpoint management.
 
     The saved checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    trackable objects it depends on at the time `Checkpoint.save()` is
     called.
 
     `save` is a basic convenience wrapper around the `write` method,
@@ -1845,6 +1339,14 @@ class Checkpoint(tracking.AutoCheckpointable):
     """
     graph_building = not context.executing_eagerly()
     if graph_building:
+      if ops.inside_function():
+        raise NotImplementedError(
+            "Calling tf.train.Checkpoint.save() from a function is not "
+            "supported, as save() modifies saving metadata in ways not "
+            "supported by TensorFlow Operations. Consider using "
+            "tf.train.Checkpoint.write(), a lower-level API which does not "
+            "update metadata. tf.train.latest_checkpoint and related APIs will "
+            "not see this checkpoint.")
       if session is None:
         session = ops.get_default_session()
       if self._save_counter is None:
@@ -1866,7 +1368,8 @@ class Checkpoint(tracking.AutoCheckpointable):
     checkpoint_management.update_checkpoint_state_internal(
         save_dir=os.path.dirname(file_prefix),
         model_checkpoint_path=file_path,
-        all_model_checkpoint_paths=[file_path])
+        all_model_checkpoint_paths=[file_path],
+        save_relative_paths=True)
     return file_path
 
   def restore(self, save_path):
@@ -1878,7 +1381,7 @@ class Checkpoint(tracking.AutoCheckpointable):
     restore have been created already, or defers restoration until the variables
     are created. Dependencies added after this call will be matched if they have
     a corresponding object in the checkpoint (the restore request will queue in
-    any checkpointable object waiting for the expected dependency to be added).
+    any trackable object waiting for the expected dependency to be added).
 
     When graph building, restoration ops are added to the graph but not run
     immediately.
@@ -1971,3 +1474,277 @@ class Checkpoint(tracking.AutoCheckpointable):
     # initialization when executing eagerly.
     self._maybe_create_save_counter()
     return status
+
+
+@tf_export("train.Checkpoint", v1=[])
+class Checkpoint(tracking.AutoTrackable):
+  """Groups trackable objects, saving and restoring them.
+
+  `Checkpoint`'s constructor accepts keyword arguments whose values are types
+  that contain trackable state, such as `tf.train.Optimizer`
+  implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
+  `tf.keras.Model` implementations. It saves these values with a checkpoint, and
+  maintains a `save_counter` for numbering checkpoints.
+
+  Example usage:
+
+  ```python
+  import tensorflow as tf
+  import os
+
+  checkpoint_directory = "/tmp/training_checkpoints"
+  checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
+  status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
+  for _ in range(num_training_steps):
+    optimizer.minimize( ... )  # Variables will be restored on creation.
+  status.assert_consumed()  # Optional sanity checks.
+  checkpoint.save(file_prefix=checkpoint_prefix)
+  ```
+
+  `Checkpoint.save` and `Checkpoint.restore` write and read object-based
+  checkpoints, in contrast to TensorFlow 1.x's `tf.train.Saver` which writes and
+  reads `variable.name` based checkpoints. Object-based checkpointing saves a
+  graph of dependencies between Python objects (`Layer`s, `Optimizer`s,
+  `Variable`s, etc.) with named edges, and this graph is used to match variables
+  when restoring a checkpoint. It can be more robust to changes in the Python
+  program, and helps to support restore-on-create for variables.
+
+  `Checkpoint` objects have dependencies on the objects passed as keyword
+  arguments to their constructors, and each dependency is given a name that is
+  identical to the name of the keyword argument for which it was created.
+  TensorFlow classes like `Layer`s and `Optimizer`s will automatically add
+  dependencies on their variables (e.g. "kernel" and "bias" for
+  `tf.keras.layers.Dense`). Inheriting from `tf.keras.Model` makes managing
+  dependencies easy in user-defined classes, since `Model` hooks into attribute
+  assignment. For example:
+
+  ```python
+  class Regress(tf.keras.Model):
+
+    def __init__(self):
+      super(Regress, self).__init__()
+      self.input_transform = tf.keras.layers.Dense(10)
+      # ...
+
+    def call(self, inputs):
+      x = self.input_transform(inputs)
+      # ...
+  ```
+
+  This `Model` has a dependency named "input_transform" on its `Dense` layer,
+  which in turn depends on its variables. As a result, saving an instance of
+  `Regress` using `tf.train.Checkpoint` will also save all the variables created
+  by the `Dense` layer.
+
+  Attributes:
+    save_counter: Incremented when `save()` is called. Used to number
+      checkpoints.
+  """
+
+  def __init__(self, **kwargs):
+    """Group objects into a training checkpoint.
+
+    Args:
+      **kwargs: Keyword arguments are set as attributes of this object, and are
+        saved with the checkpoint. Values must be trackable objects.
+    Raises:
+      ValueError: If objects in `kwargs` are not trackable.
+    """
+    super(Checkpoint, self).__init__()
+    for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
+      if not isinstance(v, (base.Trackable, def_function.Function)):
+        raise ValueError(
+            ("`Checkpoint` was expecting a trackable object (an object "
+             "derived from `TrackableBase`), got %s. If you believe this "
+             "object should be trackable (i.e. it is part of the "
+             "TensorFlow Python API and manages state), please open an issue.")
+            % (v,))
+      setattr(self, k, v)
+    self._save_counter = None  # Created lazily for restore-on-create.
+    self._save_assign_op = None
+    self._saver = saver_with_op_caching(self)
+
+  def _maybe_create_save_counter(self):
+    """Create a save counter if it does not yet exist."""
+    if self._save_counter is None:
+      # Initialized to 0 and incremented before saving.
+      with ops.device("/cpu:0"):
+        # add_variable creates a dependency named "save_counter"; NoDependency
+        # prevents creating a second dependency named "_save_counter".
+        self._save_counter = data_structures.NoDependency(
+            add_variable(self, name="save_counter", initializer=0,
+                         dtype=dtypes.int64))
+
+  def write(self, file_prefix):
+    """Writes a training checkpoint.
+
+    The checkpoint includes variables created by this object and any
+    trackable objects it depends on at the time `Checkpoint.write()` is
+    called.
+
+    `write` does not number checkpoints, increment `save_counter`, or update the
+    metadata used by `tf.train.latest_checkpoint`. It is primarily intended for
+    use by higher level checkpoint management utilities. `save` provides a very
+    basic implementation of these features.
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix).
+
+    Returns:
+      The full path to the checkpoint (i.e. `file_prefix`).
+    """
+    output = self._saver.save(
+        file_prefix=file_prefix)
+    if tensor_util.is_tensor(output):
+      if context.executing_eagerly():
+        return compat.as_str(output.numpy())
+      else:
+        # Function building
+        return output
+    else:
+      # Graph + Session, so we already session.ran it.
+      return compat.as_str(output)
+
+  @property
+  def save_counter(self):
+    """An integer variable which starts at zero and is incremented on save.
+
+    Used to number checkpoints.
+
+    Returns:
+      The save counter variable.
+    """
+    self._maybe_create_save_counter()
+    return self._save_counter
+
+  def save(self, file_prefix):
+    """Saves a training checkpoint and provides basic checkpoint management.
+
+    The saved checkpoint includes variables created by this object and any
+    trackable objects it depends on at the time `Checkpoint.save()` is
+    called.
+
+    `save` is a basic convenience wrapper around the `write` method,
+    sequentially numbering checkpoints using `save_counter` and updating the
+    metadata used by `tf.train.latest_checkpoint`. More advanced checkpoint
+    management, for example garbage collection and custom numbering, may be
+    provided by other utilities which also wrap `write`
+    (`tf.contrib.checkpoint.CheckpointManager` for example).
+
+    Args:
+      file_prefix: A prefix to use for the checkpoint filenames
+        (/path/to/directory/and_a_prefix). Names are generated based on this
+        prefix and `Checkpoint.save_counter`.
+
+    Returns:
+      The full path to the checkpoint.
+    """
+    graph_building = not context.executing_eagerly()
+    if graph_building:
+      if ops.inside_function():
+        raise NotImplementedError(
+            "Calling tf.train.Checkpoint.save() from a function is not "
+            "supported, as save() modifies saving metadata in ways not "
+            "supported by TensorFlow Operations. Consider using "
+            "tf.train.Checkpoint.write(), a lower-level API which does not "
+            "update metadata. tf.train.latest_checkpoint and related APIs will "
+            "not see this checkpoint.")
+      session = ops.get_default_session()
+      if self._save_counter is None:
+        # When graph building, if this is a new save counter variable then it
+        # needs to be initialized before assign_add. This is only an issue if
+        # restore() has not been called first.
+        session.run(self.save_counter.initializer)
+    if not graph_building or self._save_assign_op is None:
+      with ops.colocate_with(self.save_counter):
+        assign_op = self.save_counter.assign_add(1, read_value=True)
+      if graph_building:
+        self._save_assign_op = data_structures.NoDependency(assign_op)
+    if graph_building:
+      checkpoint_number = session.run(self._save_assign_op)
+    else:
+      checkpoint_number = assign_op.numpy()
+    file_path = self.write("%s-%d" % (file_prefix, checkpoint_number))
+    checkpoint_management.update_checkpoint_state_internal(
+        save_dir=os.path.dirname(file_prefix),
+        model_checkpoint_path=file_path,
+        all_model_checkpoint_paths=[file_path],
+        save_relative_paths=True)
+    return file_path
+
+  def restore(self, save_path):
+    """Restore a training checkpoint.
+
+    Restores this `Checkpoint` and any objects it depends on.
+
+    Either assigns values immediately if variables to restore have been created
+    already, or defers restoration until the variables are created. Dependencies
+    added after this call will be matched if they have a corresponding object in
+    the checkpoint (the restore request will queue in any trackable object
+    waiting for the expected dependency to be added).
+
+    To ensure that loading is complete and no more assignments will take place,
+    use the `assert_consumed()` method of the status object returned by
+    `restore`:
+
+    ```python
+    checkpoint = tf.train.Checkpoint( ... )
+    checkpoint.restore(path).assert_consumed()
+    ```
+
+    An exception will be raised if any Python objects in the dependency graph
+    were not found in the checkpoint, or if any checkpointed values do not have
+    a matching Python object.
+
+    Name-based `tf.train.Saver` checkpoints from TensorFlow 1.x can be loaded
+    using this method. Names are used to match variables. Re-encode name-based
+    checkpoints using `tf.train.Checkpoint.save` as soon as possible.
+
+    Args:
+      save_path: The path to the checkpoint, as returned by `save` or
+        `tf.train.latest_checkpoint`. If None (as when there is no latest
+        checkpoint for `tf.train.latest_checkpoint` to return), returns an
+        object which may run initializers for objects in the dependency
+        graph. If the checkpoint was written by the name-based `tf.train.Saver`,
+        names are used to match variables.
+
+    Returns:
+      A load status object, which can be used to make assertions about the
+      status of a checkpoint restoration.
+
+      The returned status object has the following methods:
+
+      * `assert_consumed()`:
+          Raises an exception if any variables/objects are unmatched: either
+          checkpointed values which don't have a matching Python object or
+          Python objects in the dependency graph with no values in the
+          checkpoint. This method returns the status object, and so may be
+          chained with other assertions.
+
+      * `assert_existing_objects_matched()`:
+          Raises an exception if any existing Python objects in the dependency
+          graph are unmatched. Unlike `assert_consumed`, this assertion will
+          pass if values in the checkpoint have no corresponding Python
+          objects. For example a `tf.keras.Layer` object which has not yet been
+          built, and so has not created any variables, will pass this assertion
+          but fail `assert_consumed`. Useful when loading part of a larger
+          checkpoint into a new Python program, e.g. a training checkpoint with
+          a `tf.train.Optimizer` was saved but only the state required for
+          inference is being loaded. This method returns the status object, and
+          so may be chained with other assertions.
+
+      * `assert_nontrivial_match()`: Asserts that something aside from the root
+          object was matched. This is a very weak assertion, but is useful for
+          sanity checking in library code where objects may exist in the
+          checkpoint which haven't been created in Python and some Python
+          objects may not have a checkpointed value.
+    """
+    status = self._saver.restore(save_path=save_path)
+    # Create the save counter now so it gets initialized with other variables
+    # when graph building. Creating it earlier would lead to double
+    # initialization when executing eagerly.
+    self._maybe_create_save_counter()
+    return status
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/tracking/util_test.py
similarity index 80%
rename from tensorflow/python/training/checkpointable/util_test.py
rename to tensorflow/python/training/tracking/util_test.py
index cef1075e93ca8a2aecfb5af9362308dabb82e47c..e2e8fa0552822ad2190c8a5255791b9cb0cb94b5 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/tracking/util_test.py
@@ -46,16 +46,17 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -67,8 +68,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -77,21 +78,30 @@ class MyModel(training.Model):
 
 class InterfaceTests(test.TestCase):
 
+  def testLayerDeduplication(self):
+    model = training.Model()
+    layer_one = core.Dense(1)
+    layer_two = core.Dense(1)
+    model.other_path = [layer_one, layer_two]
+    model.l2 = layer_two
+    model.l1 = layer_one
+    self.assertEqual([layer_one, layer_two], model.layers)
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testAddVariable(self):
-    obj = NonLayerCheckpointable()
+    obj = NonLayerTrackable()
     with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
+      trackable_utils.add_variable(
           obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
+    constant_initializer = trackable_utils.add_variable(
         obj, name="constant_initializer", initializer=1)
     with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
+      ones_initializer = trackable_utils.add_variable(
           obj,
           name="ones_initializer",
           shape=[2],
           initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
+    bare_initializer = trackable_utils.add_variable(
         obj,
         name="bare_initializer",
         shape=[2, 2],
@@ -102,12 +112,12 @@ class InterfaceTests(test.TestCase):
     # naming conflicts within an object.
     other_duplicate = resource_variable_ops.ResourceVariable(
         name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
+    duplicate = trackable_utils.add_variable(
         obj, name="duplicate", shape=[])
     with self.assertRaisesRegexp(ValueError, "'duplicate'.*already declared"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+      trackable_utils.add_variable(obj, name="duplicate", shape=[])
 
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.evaluate(trackable_utils.gather_initializers(obj))
     self.assertEqual("constant_initializer:0", constant_initializer.name)
     self.assertEqual(1, self.evaluate(constant_initializer))
     self.assertEqual("some_variable_scope/ones_initializer:0",
@@ -125,8 +135,8 @@ class InterfaceTests(test.TestCase):
       # The .name attribute may be globally influenced, but the checkpoint name
       # won't be (tested below).
       self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _, _ = checkpointable_utils._serialize_object_graph(
-        obj, saveables_cache=None)
+    named_variables, _, _ = (
+        graph_view.ObjectGraphView(obj).serialize_object_graph())
     expected_checkpoint_names = (
         "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
         "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
@@ -139,20 +149,20 @@ class InterfaceTests(test.TestCase):
 
   def testInitNotCalled(self):
 
-    class NoInit(tracking.AutoCheckpointable):
+    class NoInit(tracking.AutoTrackable):
 
       def __init__(self):
         pass
 
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+    # __init__ for Trackable will be called implicitly.
+    trackable_utils.add_variable(NoInit(), "var", shape=[])
 
   def testShapeDtype(self):
-    root = tracking.AutoCheckpointable()
-    v1 = checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    v1 = trackable_utils.add_variable(
         root, name="v1", initializer=3., dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         root,
         name="v2",
         shape=[3],
@@ -166,34 +176,34 @@ class InterfaceTests(test.TestCase):
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       dense = core.Dense(1)
-      checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+      checkpoint = trackable_utils.Checkpoint(dense=dense)
       dense(constant_op.constant([[1.]]))
       save_path = checkpoint.save(checkpoint_prefix)
 
-    objects = checkpointable_utils.object_metadata(save_path)
+    objects = trackable_utils.object_metadata(save_path)
     all_variable_names = []
     for obj in objects.nodes:
       for attribute in obj.attributes:
         all_variable_names.append(attribute.full_name)
     self.assertIn("dense/kernel", all_variable_names)
 
-  def testNotCheckpointable(self):
+  def testNotTrackable(self):
 
     class CallsFunctionalStuff(
-        tracking.NotCheckpointable, tracking.AutoCheckpointable):
+        tracking.NotTrackable, tracking.AutoTrackable):
       pass
 
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
-    checkpoint = checkpointable_utils.Checkpoint(x=CallsFunctionalStuff())
+    checkpoint = trackable_utils.Checkpoint(x=CallsFunctionalStuff())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(prefix)
 
     class CallsFunctionalStuffOtherMRO(
-        tracking.AutoCheckpointable, tracking.NotCheckpointable):
+        tracking.AutoTrackable, tracking.NotTrackable):
       pass
 
-    checkpoint_reversed = checkpointable_utils.Checkpoint(
+    checkpoint_reversed = trackable_utils.Checkpoint(
         x=CallsFunctionalStuffOtherMRO())
     with self.assertRaises(NotImplementedError):
       checkpoint_reversed.save(prefix)
@@ -220,8 +230,8 @@ class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(base.Checkpointable):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -254,7 +264,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     other_model = MyModel()
     optimizer = adam.Adam(0.001)
     step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, step=step)
 
     with backprop.GradientTape() as tape:
@@ -271,12 +281,11 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     gradients = tape.gradient(loss, variables)
     optimizer.apply_gradients(zip(gradients, variables))
 
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        checkpointable_utils._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_slot_keys = (
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
@@ -352,7 +361,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testMoreComplexSaveableReturned(self):
     v = _OwnsMirroredVariables()
-    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    checkpoint = trackable_utils.Checkpoint(v=v)
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
     self.evaluate(v.non_dep_variable.assign(42.))
@@ -389,7 +398,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.Adam(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     with backprop.GradientTape() as tape:
@@ -397,31 +406,31 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     variables = model.trainable_variables
     gradients = tape.gradient(loss, variables)
     train_op = optimizer.apply_gradients(zip(gradients, variables))
-    root_checkpointable.save_counter  # pylint: disable=pointless-statement
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    root_trackable.save_counter  # pylint: disable=pointless-statement
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(
         sorted(optimizer.variables(), key=lambda v: v.name))
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
     on_create_model = MyModel()
     on_create_optimizer = adam.Adam(0.001)
-    on_create_root = checkpointable_utils.Checkpoint(
+    on_create_root = trackable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
@@ -462,7 +471,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     for training_continuation in range(3):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       root.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
@@ -488,7 +497,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         with ops.Graph().as_default():
           model = MyModel()
           optimizer = adam.Adam(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.CheckpointV1(
               optimizer=optimizer, model=model)
           input_value = constant_op.constant([[3.]])
           with backprop.GradientTape() as tape:
@@ -535,7 +544,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         optimizer = adam.Adam(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=1)
@@ -560,12 +569,12 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       directory = self.get_temp_dir()
       prefix = os.path.join(directory, "ckpt")
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
+      checkpoint = trackable_utils.Checkpoint(v=v)
       self.evaluate(v.assign(3))
       # Create the save counter so assert_consumed doesn't complain about it not
       # existing in the checkpoint on restore.
       self.evaluate(checkpoint.save_counter.assign(12))
-      saver = checkpointable_utils.frozen_saver(checkpoint)
+      saver = trackable_utils.frozen_saver(checkpoint)
       with ops.device("cpu:0"):
         prefix_tensor = constant_op.constant(prefix)
       save_path = self.evaluate(saver.save(prefix_tensor))
@@ -577,14 +586,14 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       # Restore using another frozen saver on an identical object graph
       del v, checkpoint, saver
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
-      saver = checkpointable_utils.frozen_saver(checkpoint)
+      checkpoint = trackable_utils.Checkpoint(v=v)
+      saver = trackable_utils.frozen_saver(checkpoint)
       self.evaluate(saver.restore(prefix_tensor))
       self.assertEqual(3, self.evaluate(v))
 
       # Restore as an object-based checkpoint
       del v, checkpoint, saver
-      checkpoint = checkpointable_utils.Checkpoint()
+      checkpoint = trackable_utils.Checkpoint()
       status = checkpoint.restore(save_path)
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
       if context.executing_eagerly():
@@ -600,7 +609,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     directory = self.get_temp_dir()
     prefix = os.path.join(directory, "ckpt")
     step = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-    checkpoint = checkpointable_utils.Checkpoint(step=step)
+    checkpoint = trackable_utils.Checkpoint(step=step)
     self.evaluate(step.initializer)
     for i in range(5):
       path = checkpoint.write("%s-%d" % (prefix, self.evaluate(step)))
@@ -621,7 +630,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
         model = MyModel()
         # Don't actually train so we can test variable values
         optimizer = adam.Adam(0.)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         checkpoint_path = checkpoint_management.latest_checkpoint(
             checkpoint_directory)
@@ -653,11 +662,11 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
     return named_variable.name
@@ -674,23 +683,23 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNumberedPath(self):
-    root = tracking.AutoCheckpointable()
-    leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    trackable_utils.add_variable(leaf, name="v", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", named_variable.name)
 
   @test_util.run_in_graph_and_eager_modes
   def testLocalNameValidation(self):
-    root = tracking.AutoCheckpointable()
-    leaf = tracking.AutoCheckpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    root._track_trackable(leaf, name=".ATTRIBUTES")
+    trackable_utils.add_variable(trackable=leaf, name="a", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual("..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE",
                      named_variable.name)
 
@@ -712,7 +721,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       optimizer = adam.Adam(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
+      checkpoint = trackable_utils.Checkpoint(
           model=model, optimizer=optimizer)
       for _ in range(2):
         checkpoint.save(checkpoint_prefix)
@@ -726,13 +735,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testLateDependencyTracking(self):
 
-    class Dependency(tracking.AutoCheckpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class LateDependencies(tracking.AutoCheckpointable):
+    class LateDependencies(trackable_utils.Checkpoint):
 
       def add_dep(self):
         self.dep = Dependency()
@@ -743,11 +752,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(state_ops.assign(original.dep.var, 123.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(
-        original).save(checkpoint_prefix)
+    save_path = original.save(checkpoint_prefix)
     load_into = LateDependencies()
-    status = checkpointable_utils.CheckpointableSaver(
-        load_into).restore(save_path)
+    status = load_into.restore(save_path)
     status.assert_existing_objects_matched()
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -759,13 +766,13 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testDepAfterVar(self):
 
-    class Dependency(tracking.AutoCheckpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class DepAfterVar(tracking.AutoCheckpointable):
+    class DepAfterVar(trackable_utils.Checkpoint):
 
       def add_dep(self):
         dep = Dependency()
@@ -777,12 +784,10 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
-        checkpoint_prefix)
+    save_path = dep_after_var.save(checkpoint_prefix)
 
     loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.CheckpointableSaver(
-        loaded_dep_after_var).restore(save_path)
+    status = loaded_dep_after_var.restore(save_path)
     loaded_dep_after_var.add_dep()
     status.assert_consumed()
     status.run_restore_ops()
@@ -792,8 +797,8 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
-    root.var = checkpointable_utils.add_variable(
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.Adam(0.1)
     variables = [root.var]
@@ -802,29 +807,25 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     # Note that `optimizer` has not been added as a dependency of
     # `root`. Create a one-off grouping so that slot variables for `root.var`
     # get initialized too.
-    self.evaluate(checkpointable_utils.gather_initializers(
-        checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+    self.evaluate(trackable_utils.gather_initializers(
+        trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
     self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(
         optimizer.get_slot(slot_name="m", var=root.var),
         14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
+    new_root.var = trackable_utils.add_variable(
         new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
@@ -861,24 +862,21 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep = tracking.AutoCheckpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep = tracking.AutoTrackable()
+    save_root.dep.var = trackable_utils.add_variable(
         save_root.dep, name="var", initializer=0.)
     self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    saver = checkpointable_utils.CheckpointableSaver(save_root)
-    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    first_path = save_root.save(os.path.join(checkpoint_directory, "first"))
     self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
-
-    first_root = tracking.AutoCheckpointable()
-    second_root = tracking.AutoCheckpointable()
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    load_dep = tracking.AutoCheckpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    second_path = save_root.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    first_status = first_root.restore(first_path)
+    second_status = second_root.restore(second_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -891,14 +889,12 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
     # Try again with the order of the restore() reversed. The last restore
     # determines the final value.
-    first_root = tracking.AutoCheckpointable()
-    second_root = tracking.AutoCheckpointable()
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    load_dep = tracking.AutoCheckpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    second_status = second_root.restore(second_path)
+    first_status = first_root.restore(first_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -913,24 +909,22 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep_one = tracking.AutoCheckpointable()
-    save_root.dep_two = tracking.AutoCheckpointable()
-    dep_three = tracking.AutoCheckpointable()
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    dep_three = tracking.AutoTrackable()
     save_root.dep_one.dep_three = dep_three
     save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.AutoCheckpointable()
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path)
-    load_root.dep_one = tracking.AutoCheckpointable()
-    load_root.dep_two = tracking.AutoCheckpointable()
-    load_root.dep_one.dep_three = tracking.AutoCheckpointable()
-    load_root.dep_two.dep_three = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    status = load_root.restore(save_path)
+    load_root.dep_one = tracking.AutoTrackable()
+    load_root.dep_two = tracking.AutoTrackable()
+    load_root.dep_one.dep_three = tracking.AutoTrackable()
+    load_root.dep_two.dep_three = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         load_root.dep_one.dep_three, name="var", initializer=0.)
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -941,24 +935,23 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.AutoCheckpointable()
-    save_root.dep_one = tracking.AutoCheckpointable()
-    save_root.dep_two = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(
         save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.AutoCheckpointable()
-    load_root.dep_one = tracking.AutoCheckpointable()
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    load_root.dep_one = tracking.AutoTrackable()
     load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
+    v1 = trackable_utils.add_variable(
         load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+    status = load_root.restore(
         save_path).assert_consumed().assert_existing_objects_matched()
     status.run_restore_ops()
     self.assertEqual(32., self.evaluate(v1))
@@ -968,31 +961,29 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testDependencyLoop(self):
     # Note: this test creates garbage during eager execution because it
     # purposefully creates a reference cycle.
-    first = tracking.AutoCheckpointable()
-    second = tracking.AutoCheckpointable()
+    first = trackable_utils.Checkpoint()
+    second = trackable_utils.Checkpoint()
     first.second = second
     second.first = first
-    first.v = checkpointable_utils.add_variable(
+    first.v = trackable_utils.add_variable(
         first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
+    second.v = trackable_utils.add_variable(
         second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(checkpointable_utils.gather_initializers(first))
+    self.evaluate(trackable_utils.gather_initializers(first))
     checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        os.path.join(checkpoint_directory, "ckpt"))
+    save_path = first.save(os.path.join(checkpoint_directory, "ckpt"))
 
     # Test deferred loading
-    first_load = tracking.AutoCheckpointable()
-    status = checkpointable_utils.CheckpointableSaver(
-        first_load).restore(save_path)
-    second_load = tracking.AutoCheckpointable()
+    first_load = trackable_utils.Checkpoint()
+    status = first_load.restore(save_path)
+    second_load = tracking.AutoTrackable()
     first_load.second = second_load
     second_load.first = first_load
     with self.assertRaises(AssertionError):
       status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
+    first_load.v = trackable_utils.add_variable(
         first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
+    second_load.v = trackable_utils.add_variable(
         second_load, "v2", shape=[4])
     status.assert_consumed()
     status.run_restore_ops()
@@ -1004,8 +995,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
     self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
     self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
-        save_path).assert_consumed()
+    status = first_load.restore(save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
     self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
@@ -1014,18 +1004,16 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testRestoreOnAssign(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    first = tracking.AutoCheckpointable()
+    first = trackable_utils.Checkpoint()
     first.var1 = variables_lib.Variable(0., name="outside_var")
     first.var2 = variables_lib.Variable(0., name="blah")
     self.evaluate(first.var1.assign(4.))
     self.evaluate(first.var2.assign(8.))
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        checkpoint_prefix)
+    save_path = first.save(checkpoint_prefix)
 
-    second = tracking.AutoCheckpointable()
+    second = trackable_utils.Checkpoint()
     second.var2 = variables_lib.Variable(0., name="blah")
-    status = checkpointable_utils.CheckpointableSaver(
-        second).restore(save_path)
+    status = second.restore(save_path)
     recreated_var1 = variables_lib.Variable(0., name="outside_var")
     status.run_restore_ops()
     self.assertEqual(8., self.evaluate(second.var2))
@@ -1042,27 +1030,26 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
         gradients = [1.]
         obj.opt.apply_gradients(zip(gradients, variables))
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
         graph.finalize()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
 
   @test_util.run_in_graph_and_eager_modes
   def testCheckpointState(self):
     # No checkpoints are deleted by default
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.AutoCheckpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    saver = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    saver = trackable_utils.Checkpoint(obj=obj)
     for _ in range(10):
       saver.save(checkpoint_prefix)
     expected_filenames = ["checkpoint"]
@@ -1079,10 +1066,10 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
   def testCheckpointStateChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.AutoCheckpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    checkpoint = trackable_utils.Checkpoint(obj=obj)
     looped_variables = []
     for iteration in range(10):
       new_variable = resource_variable_ops.ResourceVariable(iteration)
@@ -1132,23 +1119,22 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variables_lib.Variable(0., name="v")
         obj.opt = adam.Adam(0.1)
         variables = [obj.var]
         gradients = [1.]
         obj.opt.apply_gradients(zip(gradients, variables))
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         graph.finalize()
-        saver.restore(save_path)
+        obj.restore(save_path)
 
   @test_util.run_in_graph_and_eager_modes
   def test_sequential(self):
     model = sequential.Sequential()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     model.add(core.Dense(4))
     second_dense = core.Dense(5)
     model.add(second_dense)
@@ -1165,7 +1151,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
 
     deferred_sequential = sequential.Sequential()
-    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+    deferred_sequential_checkpoint = trackable_utils.Checkpoint(
         model=deferred_sequential)
     status = deferred_sequential_checkpoint.restore(save_path)
     deferred_sequential.add(core.Dense(4))
@@ -1185,9 +1171,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           model=model)  # Do not save the optimizer with the checkpoint.
-      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+      optimizer_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer)
 
       checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -1221,7 +1207,7 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
       status = root.restore(save_path=model_save_path)
       input_value = constant_op.constant([[3.]])
@@ -1245,9 +1231,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.Adam(0.001, beta_1=1.0)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model)
-      opt_root = checkpointable_utils.Checkpoint(
+      opt_root = trackable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
@@ -1268,9 +1254,9 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
       self.assertEqual(42., self.evaluate(optimizer.beta_1))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_restore_after_adding_empty_checkpointable_data_structure(self):
-    model = NonLayerCheckpointable()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+  def test_restore_after_adding_empty_trackable_data_structure(self):
+    model = NonLayerTrackable()
+    checkpoint = trackable_utils.Checkpoint(model=model)
     checkpoint.restore(None).initialize_or_restore()
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -1278,20 +1264,43 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
     del model, checkpoint
 
-    model = NonLayerCheckpointable()
+    model = NonLayerTrackable()
     model.dict = {"a": 1}
     model.list = {"b": 1}
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     load_status = checkpoint.restore(save_path)
     load_status.assert_existing_objects_matched().run_restore_ops()
 
-
-class _ManualScope(tracking.AutoCheckpointable):
+  @test_util.run_in_graph_and_eager_modes
+  def test_write_checkpoint_from_function(self):
+    checkpoint_prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(1.))
+
+    @def_function.function
+    def _write_checkpoint():
+      save_path = save_checkpoint.write(checkpoint_prefix)
+      return save_path
+
+    self.evaluate([save_checkpoint.v.initializer])
+    self.evaluate(_write_checkpoint())
+    load_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(1., self.evaluate(load_checkpoint.v))
+    self.evaluate(save_checkpoint.v.assign(3.))
+    self.evaluate(_write_checkpoint())
+    self.evaluate(save_checkpoint.v.assign(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(3., self.evaluate(load_checkpoint.v))
+
+
+class _ManualScope(tracking.AutoTrackable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
       self.variable_scope = vs
-      with checkpointable_utils.capture_dependencies(template=self):
+      with trackable_utils.capture_dependencies(template=self):
         return self._build()
 
   def _build(self):
@@ -1301,7 +1310,7 @@ class _ManualScope(tracking.AutoCheckpointable):
 class TemplateTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -1318,12 +1327,12 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     six.assertCountEqual(
         self,
         [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
-        checkpointable_utils.list_objects(save_template))
+        trackable_utils.list_objects(save_template))
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
     optimizer = adam.Adam(0.0)
-    save_root = checkpointable_utils.Checkpoint(
+    save_root = trackable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
     optimizer.minimize(v1_save.read_value,
                        var_list=[v1_save])
@@ -1337,7 +1346,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
 
     load_template = template.make_template("s2", _templated)
     load_optimizer = adam.Adam(0.0)
-    load_root = checkpointable_utils.Checkpoint(
+    load_root = trackable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
@@ -1353,7 +1362,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     self.assertAllEqual([14.], self.evaluate(var2))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore_nested(self):
+  def test_trackable_save_restore_nested(self):
 
     def _inner_template():
       v = variable_scope.get_variable(
@@ -1370,7 +1379,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
 
     with variable_scope.variable_scope("ignored"):
       save_template = template.make_template("s1", _outer_template)
-      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      save_root = trackable_utils.Checkpoint(my_template=save_template)
       (inner_template_one, inner_template_two), _ = save_template()
     self.evaluate(inner_template_one.variables[0].assign([20.]))
     self.evaluate(inner_template_two.variables[0].assign([25.]))
@@ -1379,7 +1388,7 @@ class TemplateTests(parameterized.TestCase, test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _outer_template)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    load_root = trackable_utils.Checkpoint(my_template=load_template)
     status = load_root.restore(save_path)
     (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
     outer_template_dependencies = load_root.my_template._checkpoint_dependencies
@@ -1404,15 +1413,15 @@ class CheckpointCompatibilityTests(test.TestCase):
     input_value = constant_op.constant([[3.]])
     model = MyModel()
     optimizer = adam.Adam(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     with backprop.GradientTape() as tape:
       loss = model(input_value)
     variables = model.trainable_variables
     gradients = tape.gradient(loss, variables)
     train_op = optimizer.apply_gradients(zip(gradients, variables))
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -1420,24 +1429,24 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.evaluate(optimizer.get_slot(
         var=model._named_dense.bias, slot_name="m").assign([2.]))
     self.evaluate(optimizer.beta_1.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, slot_name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")
         .assign([102.]))
-    self.evaluate(root_checkpointable.optimizer.beta_1.assign(103.))
+    self.evaluate(root_trackable.optimizer.beta_1.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, slot_name="m")))
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")))
     self.assertAllEqual(3.,
-                        self.evaluate(root_checkpointable.optimizer.beta_1))
+                        self.evaluate(root_trackable.optimizer.beta_1))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
@@ -1461,18 +1470,16 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
         self._check_sentinels(root)
       if context.executing_eagerly():
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_consumed()
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_existing_objects_matched()
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_nontrivial_match()
+        status.assert_consumed()
+        status.assert_existing_objects_matched()
+        status.assert_nontrivial_match()
       else:
         # When graph building, we haven't read any keys, so we don't know
         # whether the restore will be complete.
@@ -1487,6 +1494,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       status.initialize_or_restore()
+      status.assert_nontrivial_match()
       self._check_sentinels(root)
       # Check that there is no error when keys are missing from the name-based
       # checkpoint.
@@ -1501,9 +1509,9 @@ class CheckpointCompatibilityTests(test.TestCase):
     with context.graph_mode():
       save_graph = ops.Graph()
       with save_graph.as_default(), self.session(
-          graph=save_graph) as session:
+          graph=save_graph):
         root = self._initialized_model()
-        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
+        save_path = root.save(file_prefix=checkpoint_prefix)
     with context.eager_mode():
       root = self._initialized_model()
       self._set_sentinels(root)
@@ -1533,7 +1541,7 @@ class PythonMetadataTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     dense = core.Dense(1)
-    checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+    checkpoint = trackable_utils.Checkpoint(dense=dense)
     dense(constant_op.constant([[1.]]))
     checkpoint.restore(None).initialize_or_restore()
     save_path = checkpoint.save(checkpoint_prefix)
@@ -1553,7 +1561,7 @@ class PythonMetadataTests(test.TestCase):
       return json.loads(layer_json.decode("utf-8"))
 
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(1, layer_data["config"]["units"])
 
@@ -1563,7 +1571,7 @@ class PythonMetadataTests(test.TestCase):
     dense.units = 42
     save_path = checkpoint.save(checkpoint_prefix)
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(42, layer_data["config"]["units"])
 
diff --git a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
similarity index 86%
rename from tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
rename to tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
index bd80fa60f0b27f16da01002ab5088495f0a43edb..e00131a8e468f3d95a5e1b714362354462fdc5bb 100644
--- a/tensorflow/python/training/checkpointable/util_with_v1_optimizers_test.py
+++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
@@ -43,15 +43,16 @@ from tensorflow.python.training import adam
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -63,8 +64,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -82,7 +83,7 @@ class CheckpointingTests(test.TestCase):
     other_model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     if context.executing_eagerly():
       optimizer.minimize(
@@ -97,12 +98,11 @@ class CheckpointingTests(test.TestCase):
       optimizer.minimize(
           other_model(input_value),
           global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
       self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        checkpointable_utils._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -190,7 +190,7 @@ class CheckpointingTests(test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     if context.executing_eagerly():
@@ -199,24 +199,24 @@ class CheckpointingTests(test.TestCase):
     else:
       train_op = optimizer.minimize(model(input_value))
       # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
+      root_trackable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
@@ -226,7 +226,7 @@ class CheckpointingTests(test.TestCase):
         # Preserve beta1_power and beta2_power when appying gradients so we can
         # test that they've been restored correctly.
         beta1=1.0, beta2=1.0)
-    on_create_root = checkpointable_utils.Checkpoint(
+    on_create_root = trackable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
@@ -266,7 +266,7 @@ class CheckpointingTests(test.TestCase):
     for training_continuation in range(3):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           optimizer_step=training_util.get_or_create_global_step())
       root.restore(checkpoint_management.latest_checkpoint(
@@ -297,7 +297,7 @@ class CheckpointingTests(test.TestCase):
       with strategy.scope():
         model = MyModel()
         optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             optimizer_step=training_util.get_or_create_global_step())
         root.restore(checkpoint_management.latest_checkpoint(
@@ -328,7 +328,7 @@ class CheckpointingTests(test.TestCase):
         with strategy.scope():
           model = MyModel()
           optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.Checkpoint(
               optimizer=optimizer, model=model,
               optimizer_step=training_util.get_or_create_global_step())
           status = root.restore(checkpoint_management.latest_checkpoint(
@@ -355,7 +355,7 @@ class CheckpointingTests(test.TestCase):
         with ops.Graph().as_default():
           model = MyModel()
           optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
+          root = trackable_utils.CheckpointV1(
               optimizer=optimizer, model=model,
               global_step=training_util.get_or_create_global_step())
           input_value = constant_op.constant([[3.]])
@@ -394,7 +394,7 @@ class CheckpointingTests(test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             global_step=training_util.get_or_create_global_step())
         manager = checkpoint_management.CheckpointManager(
@@ -427,7 +427,7 @@ class CheckpointingTests(test.TestCase):
         model = MyModel()
         # Don't actually train so we can test variable values
         optimizer = adam.AdamOptimizer(0.)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model,
             global_step=training_util.get_or_create_global_step())
         checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -461,10 +461,10 @@ class CheckpointingTests(test.TestCase):
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.AutoCheckpointable()
-    checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
+    (named_variable,), _, _ = trackable_utils._serialize_object_graph(
         root, saveables_cache=None)
     with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
@@ -488,7 +488,7 @@ class CheckpointingTests(test.TestCase):
       optimizer = adam.AdamOptimizer(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
+      checkpoint = trackable_utils.Checkpoint(
           model=model, optimizer=optimizer)
       for _ in range(2):
         checkpoint.save(checkpoint_prefix)
@@ -503,8 +503,8 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.AutoCheckpointable()
-    root.var = checkpointable_utils.add_variable(
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
     if context.executing_eagerly():
@@ -514,28 +514,24 @@ class CheckpointingTests(test.TestCase):
       # Note that `optimizer` has not been added as a dependency of
       # `root`. Create a one-off grouping so that slot variables for `root.var`
       # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(trackable_utils.gather_initializers(
+          trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
       self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
                                    14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.AutoCheckpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
+    new_root.var = trackable_utils.add_variable(
         new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
@@ -572,15 +568,14 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
         before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testManyRestoresGraph(self):
@@ -590,16 +585,15 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.AutoCheckpointable()
+        obj = trackable_utils.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         before_ops = graph.get_operations()
-        saver.restore(save_path)
+        obj.restore(save_path)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testMultipleGraphsNonSlotVariables(self):
@@ -612,11 +606,11 @@ class CheckpointingTests(test.TestCase):
       first_session = session_lib.Session(graph=first_graph)
       with first_graph.as_default(), first_session.as_default():
         first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
+        first_root_trackable = trackable_utils.Checkpoint(
             optimizer=optimizer, variable=first_variable)
         train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
+        self.evaluate(trackable_utils.gather_initializers(
+            first_root_trackable))
         self.evaluate(train_op)
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
@@ -628,23 +622,23 @@ class CheckpointingTests(test.TestCase):
       second_graph = ops.Graph()
       with second_graph.as_default(), session_lib.Session(graph=second_graph):
         second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
+        second_root_trackable = trackable_utils.Checkpoint(
             optimizer=optimizer, variable=second_variable)
         train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
+        second_root_trackable.restore(None).initialize_or_restore()
         self.evaluate(train_op)
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        save_path = second_root_trackable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
+        status = second_root_trackable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
@@ -668,10 +662,10 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           model=model,  # Do not save the optimizer with the checkpoint.
           global_step=training_util.get_or_create_global_step())
-      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+      optimizer_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer)
 
       checkpoint_path = checkpoint_management.latest_checkpoint(
@@ -695,7 +689,7 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           global_step=training_util.get_or_create_global_step())
       status = root.restore(save_path=model_save_path)
@@ -717,10 +711,10 @@ class CheckpointingTests(test.TestCase):
     with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
-      root = checkpointable_utils.Checkpoint(
+      root = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model,
           global_step=training_util.get_or_create_global_step())
-      opt_root = checkpointable_utils.Checkpoint(
+      opt_root = trackable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
@@ -739,12 +733,12 @@ class CheckpointingTests(test.TestCase):
       self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
 
 
-class _ManualScope(tracking.AutoCheckpointable):
+class _ManualScope(tracking.AutoTrackable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
       self.variable_scope = vs
-      with checkpointable_utils.capture_dependencies(template=self):
+      with trackable_utils.capture_dependencies(template=self):
         return self._build()
 
   def _build(self):
@@ -754,7 +748,7 @@ class _ManualScope(tracking.AutoCheckpointable):
 class TemplateTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -771,12 +765,12 @@ class TemplateTests(test.TestCase):
     six.assertCountEqual(
         self,
         [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
-        checkpointable_utils.list_objects(save_template))
+        trackable_utils.list_objects(save_template))
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
     optimizer = adam.AdamOptimizer(0.0)
-    save_root = checkpointable_utils.Checkpoint(
+    save_root = trackable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
     optimizer.minimize(v1_save.read_value)
     self.evaluate([v.initializer for v in save_template.variables])
@@ -789,7 +783,7 @@ class TemplateTests(test.TestCase):
 
     load_template = template.make_template("s2", _templated)
     load_optimizer = adam.AdamOptimizer(0.0)
-    load_root = checkpointable_utils.Checkpoint(
+    load_root = trackable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
@@ -812,13 +806,13 @@ class CheckpointCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -827,24 +821,24 @@ class CheckpointCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta1_power))
 
   def _write_name_based_checkpoint(self):
@@ -869,18 +863,16 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
         self._check_sentinels(root)
       if context.executing_eagerly():
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_consumed()
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_existing_objects_matched()
-        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
-          status.assert_nontrivial_match()
+        status.assert_consumed()
+        status.assert_existing_objects_matched()
+        status.assert_nontrivial_match()
       else:
         # When graph building, we haven't read any keys, so we don't know
         # whether the restore will be complete.
@@ -909,9 +901,9 @@ class CheckpointCompatibilityTests(test.TestCase):
     with context.graph_mode():
       save_graph = ops.Graph()
       with save_graph.as_default(), self.session(
-          graph=save_graph) as session:
+          graph=save_graph):
         root = self._initialized_model()
-        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
+        save_path = root.save(file_prefix=checkpoint_prefix)
     with context.eager_mode():
       root = self._initialized_model()
       self._set_sentinels(root)
diff --git a/tensorflow/python/training/checkpointable/util_xla_test.py b/tensorflow/python/training/tracking/util_xla_test.py
similarity index 85%
rename from tensorflow/python/training/checkpointable/util_xla_test.py
rename to tensorflow/python/training/tracking/util_xla_test.py
index 4e96a7514a24be19b857eab7032846e7578cc55c..4e8dd0a6fd3231b335758a7e0fb05c7db37ac95c 100644
--- a/tensorflow/python/training/checkpointable/util_xla_test.py
+++ b/tensorflow/python/training/tracking/util_xla_test.py
@@ -25,15 +25,15 @@ from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.AutoCheckpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -44,8 +44,8 @@ class Subclassed(training.Model):
     super(Subclassed, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -62,7 +62,7 @@ class CheckpointingTests(xla_test.XLATestCase):
       with self.test_scope():
         model = Subclassed()
         optimizer = adam.Adam(0.001)
-        root = checkpointable_utils.Checkpoint(
+        root = trackable_utils.Checkpoint(
             optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=2)
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index ae71a628c1f9e1e7e86a25cbcacab0bd400ed279..9f509ae0a38be04fd0ae28b5d1127ea28afb5be4 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -68,7 +68,8 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.python_state import PythonState
+from tensorflow.python.training.tracking.util import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
@@ -142,3 +143,4 @@ tf_export(v1=["train.SaverDef"])(SaverDef)
 tf_export("train.SequenceExample")(SequenceExample)
 tf_export("train.ServerDef")(ServerDef)
 # pylint: enable=undefined-variable
+
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 9aaf0c2de9756718645e77de416c653182994019..f8e8d4c28a50629f108abeb0700d82fba311666c 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -100,7 +100,13 @@ def _validate_deprecation_args(date, instructions):
 def _call_location(outer=False):
   """Returns call location given level up from current call."""
   stack = tf_stack.extract_stack()
-  frame = stack[-4 if outer else -3]
+  length = len(stack)
+  if length == 0:  # should never happen as we're in a function
+    return 'UNKNOWN'
+  index = length-4 if outer else length-3
+  if index < 0:
+    index = 0
+  frame = stack[index]
   return '{filename}:{lineno}'.format(filename=frame[0], lineno=frame[1])
 
 
diff --git a/tensorflow/python/util/example_parser_configuration.py b/tensorflow/python/util/example_parser_configuration.py
index e3fdcf956e543c516335762a7c47e5547256a2a7..dc8937a31995c1752ea49638ff23ff805a39753f 100644
--- a/tensorflow/python/util/example_parser_configuration.py
+++ b/tensorflow/python/util/example_parser_configuration.py
@@ -101,7 +101,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     fixed_config.shape.CopyFrom(
         tensor_shape.TensorShape(dense_shapes[i]).as_proto())
 
-    fixed_config.dtype = int(dense_types[i])
+    fixed_config.dtype = dense_types[i].as_datatype_enum
     # Get the output tensor name.
     fixed_config.values_output_tensor_name = parse_example_op.outputs[
         dense_values_start + i].name
@@ -111,7 +111,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     key = fetched[sparse_keys_start + i]
     feature_config = config.feature_map[key]
     var_len_feature = feature_config.var_len_feature
-    var_len_feature.dtype = int(sparse_types[i])
+    var_len_feature.dtype = sparse_types[i].as_datatype_enum
     var_len_feature.indices_output_tensor_name = parse_example_op.outputs[
         sparse_indices_start + i].name
     var_len_feature.values_output_tensor_name = parse_example_op.outputs[
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index dbae97a373291ba97ef59f87394e1f31eaeebb79..8fb187fb5b6169a680ca6ea11d48751688817e0d 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -39,18 +39,47 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
 
 
-def _get_attrs_values(obj):
-  """Returns the list of values from an attrs instance."""
+_SHALLOW_TREE_HAS_INVALID_KEYS = (
+    "The shallow_tree's keys are not a subset of the input_tree's keys. The "
+    "shallow_tree has the following keys that are not in the input_tree: {}.")
+
+_STRUCTURES_HAVE_MISMATCHING_TYPES = (
+    "The two structures don't have the same sequence type. Input structure has "
+    "type {shallow_type}, while shallow structure has type {input_type}.")
+
+_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE = (
+    "The input_tree has fewer elements than the input_tree. Input structure "
+    "has length {input_size}, while shallow structure has length "
+    "{shallow_size}.")
+
+_IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ = (
+    "If shallow structure is a sequence, input must also be a sequence. "
+    "Input has type: {}.")
+
+
+def _get_attrs_items(obj):
+  """Returns a list of (name, value) pairs from an attrs instance.
+
+  The list will be sorted by name.
+
+  Args:
+    obj: an object.
+
+  Returns:
+    A list of (attr_name, attr_value) pairs, sorted by attr_name.
+  """
   attrs = getattr(obj.__class__, "__attrs_attrs__")
-  return [getattr(obj, a.name) for a in attrs]
+  attr_names = sorted([a.name for a in attrs])
+  return [(attr_name, getattr(obj, attr_name)) for attr_name in attr_names]
 
 
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(dict_)
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -74,6 +103,7 @@ def _is_namedtuple(instance, strict=False):
 # See the swig file (util.i) for documentation.
 _is_mapping = _pywrap_tensorflow.IsMapping
 _is_attrs = _pywrap_tensorflow.IsAttrs
+_is_composite_tensor = _pywrap_tensorflow.IsCompositeTensor
 
 
 def _sequence_like(instance, args):
@@ -94,30 +124,56 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif _is_namedtuple(instance) or _is_attrs(instance):
     return type(instance)(*args)
+  elif _is_composite_tensor(instance):
+    return instance._from_components(args)  # pylint: disable=protected-access
   else:
     # Not a namedtuple
     return type(instance)(args)
 
 
 def _yield_value(iterable):
-  """Yields the next value from the given iterable."""
-  if _is_mapping(iterable):
+  for _, v in _yield_sorted_items(iterable):
+    yield v
+
+
+def _yield_sorted_items(iterable):
+  """Yield (key, value) pairs for `iterable` in a deterministic order.
+
+  For Sequences, the key will be an int, the array index of a value.
+  For Mappings, the key will be the dictionary key.
+  For objects (e.g. namedtuples), the key will be the attribute name.
+
+  In all cases, the keys will be iterated in sorted order.
+
+  Args:
+    iterable: an iterable.
+
+  Yields:
+    The iterable's (key, value) pairs, in order of sorted keys.
+  """
+  if isinstance(iterable, _collections.Mapping):
     # Iterate through dictionaries in a deterministic order by sorting the
     # keys. Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     for key in _sorted(iterable):
-      yield iterable[key]
+      yield key, iterable[key]
   elif _is_attrs(iterable):
-    for value in _get_attrs_values(iterable):
-      yield value
+    for item in _get_attrs_items(iterable):
+      yield item
+  elif _is_namedtuple(iterable):
+    for field in iterable._fields:
+      yield field, getattr(iterable, field)
+  elif _is_composite_tensor(iterable):
+    for item in enumerate(iterable._to_components()):  # pylint: disable=protected-access
+      yield item
   else:
-    for value in iterable:
-      yield value
+    for item in enumerate(iterable):
+      yield item
 
 
 # See the swig file (util.i) for documentation.
@@ -125,7 +181,55 @@ is_sequence = _pywrap_tensorflow.IsSequence
 
 
 # See the swig file (util.i) for documentation.
-flatten = _pywrap_tensorflow.Flatten
+is_sequence_or_composite = _pywrap_tensorflow.IsSequenceOrComposite
+
+
+@tf_export("nest.is_nested")
+def is_nested(seq):
+  """Returns true if its input is a collections.Sequence (except strings).
+
+  Args:
+    seq: an input sequence.
+
+  Returns:
+    True if the sequence is a not a string and is a collections.Sequence or a
+    dict.
+  """
+  return is_sequence(seq)
+
+
+@tf_export("nest.flatten")
+def flatten(structure, expand_composites=False):
+  """Returns a flat list from a given nested structure.
+
+  If nest is not a sequence, tuple, or dict, then returns a single-element list:
+  [nest].
+
+  In the case of dict instances, the sequence consists of the values, sorted by
+  key to ensure deterministic behavior. This is true also for OrderedDict
+  instances: their sequence order is ignored, the sorting order of keys is used
+  instead. The same convention is followed in pack_sequence_as. This correctly
+  repacks dicts and OrderedDicts after they have been flattened, and also allows
+  flattening an OrderedDict and then repacking it back using a corresponding
+  plain dict, or vice-versa. Dictionaries with non-sortable keys cannot be
+  flattened.
+
+  Users must not modify any collections used in nest while this function is
+  running.
+
+  Args:
+    structure: an arbitrarily nested structure or a scalar object. Note, numpy
+      arrays are considered scalars.
+    expand_composites: If true, then composite tensors such as tf.SparseTensor
+       and tf.RaggedTensor are expanded into their component tensors.
+
+  Returns:
+    A Python list, the flattened version of the input.
+
+  Raises:
+    TypeError: The nest is or contains a dict with non-sortable keys.
+  """
+  return _pywrap_tensorflow.Flatten(structure, expand_composites)
 
 
 # See the swig file (util.i) for documentation.
@@ -144,7 +248,9 @@ class _DotString(object):
 _DOT = _DotString()
 
 
-def assert_same_structure(nest1, nest2, check_types=True):
+@tf_export("nest.assert_same_structure")
+def assert_same_structure(nest1, nest2, check_types=True,
+                          expand_composites=False):
   """Asserts that two structures are nested in the same way.
 
   Note that namedtuples with identical name and fields are always considered
@@ -166,8 +272,10 @@ def assert_same_structure(nest1, nest2, check_types=True):
         size. Note that namedtuples with identical name and fields are always
         considered to have the same shallow structure. Two types will also be
         considered the same if they are both list subtypes (which allows "list"
-        and "_ListWrapper" from checkpointable dependency tracking to compare
+        and "_ListWrapper" from trackable dependency tracking to compare
         equal).
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -176,7 +284,8 @@ def assert_same_structure(nest1, nest2, check_types=True):
       their substructures. Only possible if `check_types` is `True`.
   """
   try:
-    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
+    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types,
+                                           expand_composites)
   except (ValueError, TypeError) as e:
     str1 = str(map_structure(lambda _: _DOT, nest1))
     str2 = str(map_structure(lambda _: _DOT, nest2))
@@ -213,8 +322,8 @@ def flatten_dict_items(dictionary):
 
   Raises:
     TypeError: If the input is not a dictionary.
-    ValueError: If any key and value have not the same structure, or if keys are
-      not unique.
+    ValueError: If any key and value do not have the same structure layout, or
+    if keys are not unique.
   """
   if not isinstance(dictionary, (dict, _collections.Mapping)):
     raise TypeError("input must be a dictionary")
@@ -242,13 +351,14 @@ def flatten_dict_items(dictionary):
   return flat_dictionary
 
 
-def _packed_nest_with_indices(structure, flat, index):
+def _packed_nest_with_indices(structure, flat, index, is_seq):
   """Helper function for pack_sequence_as.
 
   Args:
     structure: Substructure (list / tuple / dict) to mimic.
     flat: Flattened values to output substructure for.
     index: Index at which to start reading from flat.
+    is_seq: Function used to test if a value should be treated as a sequence.
 
   Returns:
     The tuple (new_index, child), where:
@@ -263,8 +373,8 @@ def _packed_nest_with_indices(structure, flat, index):
   """
   packed = []
   for s in _yield_value(structure):
-    if is_sequence(s):
-      new_index, child = _packed_nest_with_indices(s, flat, index)
+    if is_seq(s):
+      new_index, child = _packed_nest_with_indices(s, flat, index, is_seq)
       packed.append(_sequence_like(s, child))
       index = new_index
     else:
@@ -273,7 +383,8 @@ def _packed_nest_with_indices(structure, flat, index):
   return index, packed
 
 
-def pack_sequence_as(structure, flat_sequence):
+@tf_export("nest.pack_sequence_as")
+def pack_sequence_as(structure, flat_sequence, expand_composites=False):
   """Returns a given flattened sequence packed into a given structure.
 
   If `structure` is a scalar, `flat_sequence` must be a single-element list;
@@ -293,6 +404,8 @@ def pack_sequence_as(structure, flat_sequence):
         tuples, and dicts. Note: numpy arrays and strings are considered
         scalars.
     flat_sequence: flat sequence to pack.
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Returns:
     packed: `flat_sequence` converted to have the same recursive structure as
@@ -303,17 +416,19 @@ def pack_sequence_as(structure, flat_sequence):
       element counts.
     TypeError: `structure` is or contains a dict with non-sortable keys.
   """
-  if not is_sequence(flat_sequence):
+  is_seq = is_sequence_or_composite if expand_composites else is_sequence
+  if not is_seq(flat_sequence):
     raise TypeError("flat_sequence must be a sequence")
 
-  if not is_sequence(structure):
+  if not is_seq(structure):
     if len(flat_sequence) != 1:
       raise ValueError("Structure is a scalar but len(flat_sequence) == %d > 1"
                        % len(flat_sequence))
     return flat_sequence[0]
 
   try:
-    final_index, packed = _packed_nest_with_indices(structure, flat_sequence, 0)
+    final_index, packed = _packed_nest_with_indices(structure, flat_sequence,
+                                                    0, is_seq)
     if final_index < len(flat_sequence):
       raise IndexError
   except IndexError:
@@ -326,23 +441,30 @@ def pack_sequence_as(structure, flat_sequence):
   return _sequence_like(structure, packed)
 
 
-def map_structure(func, *structure, **check_types_dict):
+@tf_export("nest.map_structure")
+def map_structure(func, *structure, **kwargs):
   """Applies `func` to each entry in `structure` and returns a new structure.
 
   Applies `func(x[0], x[1], ...)` where x[i] is an entry in
   `structure[i]`.  All structures in `structure` must have the same arity,
-  and the return value will contain the results in the same structure.
+  and the return value will contain results with the same structure layout.
 
   Args:
     func: A callable that accepts as many arguments as there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered as scalars.
-    **check_types_dict: only valid keyword argument is `check_types`. If set to
-      `True` (default) the types of iterables within the structures have to be
-      same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
-      exception). To allow this set this argument to `False`.
-      Note that namedtuples with identical name and fields are always
-      considered to have the same shallow structure.
+    **kwargs: Valid keyword args are:
+
+      * `check_types`: If set to `True` (default) the types of
+        iterables within the structures have to be same (e.g.
+        `map_structure(func, [1], (1,))` raises a `TypeError`
+        exception). To allow this set this argument to `False`.
+        Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
+      * `expand_composites`: If set to `True`, then composite tensors such
+        as `tf.SparseTensor` and `tf.RaggedTensor` are expanded into their
+        component tensors.  If `False` (the default), then composite tensors
+        are not expanded.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
@@ -364,21 +486,25 @@ def map_structure(func, *structure, **check_types_dict):
   if not structure:
     raise ValueError("Must provide at least one structure")
 
-  if check_types_dict:
-    if "check_types" not in check_types_dict or len(check_types_dict) > 1:
-      raise ValueError("Only valid keyword argument is check_types")
-    check_types = check_types_dict["check_types"]
-  else:
-    check_types = True
+  check_types = True
+  expand_composites = False
+  if kwargs:
+    check_types = kwargs.pop("check_types", check_types)
+    expand_composites = kwargs.pop("expand_composites", expand_composites)
+    if kwargs:
+      raise ValueError("Only valid keyword arguments are check_types "
+                       "and expand_composites")
 
   for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
+    assert_same_structure(structure[0], other, check_types=check_types,
+                          expand_composites=expand_composites)
 
-  flat_structure = [flatten(s) for s in structure]
+  flat_structure = [flatten(s, expand_composites) for s in structure]
   entries = zip(*flat_structure)
 
   return pack_sequence_as(
-      structure[0], [func(*x) for x in entries])
+      structure[0], [func(*x) for x in entries],
+      expand_composites=expand_composites)
 
 
 def map_structure_with_paths(func, *structure, **kwargs):
@@ -387,9 +513,9 @@ def map_structure_with_paths(func, *structure, **kwargs):
   Applies `func(path, x[0], x[1], ..., **kwargs)` where x[i] is an entry in
   `structure[i]` and `path` is the common path to x[i] in the structures.  All
   structures in `structure` must have the same arity, and the return value will
-  contain the results in the same structure. Special kwarg `check_types`
-  determines whether the types of iterables within the structure must be the
-  same-- see **kwargs definition below.
+  contain the results with the same structure layout. Special kwarg
+  `check_types` determines whether the types of iterables within the structure
+  must be the same-- see **kwargs definition below.
 
   Args:
     func: A callable with the signature func(path, *values, **kwargs) that is
@@ -413,8 +539,14 @@ def map_structure_with_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
-  return _map_structure_with_tuple_or_string_paths(
-      use_string_paths=True, func=func, structure=structure, kwargs=kwargs)
+  def wrapper_func(tuple_path, *inputs, **kwargs):
+    string_path = "/".join(str(s) for s in tuple_path)
+    return func(string_path, *inputs, **kwargs)
+
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              wrapper_func,
+                                              *structure,
+                                              **kwargs)
 
 
 def map_structure_with_tuple_paths(func, *structure, **kwargs):
@@ -450,52 +582,43 @@ def map_structure_with_tuple_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
-  return _map_structure_with_tuple_or_string_paths(
-      use_string_paths=False, func=func, structure=structure, kwargs=kwargs)
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              func,
+                                              *structure,
+                                              **kwargs)
 
 
-def _map_structure_with_tuple_or_string_paths(
-    use_string_paths, func, structure, kwargs):
-  """Implements `map_structure` with either tuple or string paths."""
-
-  if not callable(func):
-    raise TypeError("func must be callable, got: %s" % func)
-  if not structure:
-    raise ValueError("Must provide at least one structure")
-
-  check_types = kwargs.pop("check_types", True)
-  for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
-
-  if use_string_paths:
-    flatten_func = flatten_with_joined_string_paths
-  else:
-    flatten_func = flatten_with_tuple_paths
-
-  # First set paths_and_values to:
-  # [[(p11, v11), ... (p1n, v1n)], ... [(pm1, vm1), ... (pmn, vmn)]]
-  paths_and_values = [flatten_func(s) for s in structure]
-
-  # Now zip(*paths_and_values) would be:
-  # [((p11, v11), ... (pm1, vm1)), ... ((p1n, v1n), ... (pmn, vmn))]
-  # so grouped_by_path is set to:
-  # [[(p11, ... pm1), (v11, ... vm1)], ... [(p1n, ... pmn), (v1n, ... vmn)]]
-  # Note that p1i, ... pmi must all be equal since the structures are the same.
-  grouped_by_path = [zip(*p_v) for p_v in zip(*paths_and_values)]
-
-  return pack_sequence_as(structure[0], [
-      func(paths[0], *values, **kwargs) for paths, values in grouped_by_path])
+def _yield_flat_up_to(shallow_tree, input_tree, path=()):
+  """Yields (path, value) pairs of input_tree flattened up to shallow_tree.
 
+  Args:
+    shallow_tree: Nested structure. Traverse no further than its leaf nodes.
+    input_tree: Nested structure. Return the paths and values from this tree.
+      Must have the same upper structure as shallow_tree.
+    path: Tuple. Optional argument, only used when recursing. The path from the
+      root of the original shallow_tree, down to the root of the shallow_tree
+      arg of this recursive call.
 
-def _yield_flat_up_to(shallow_tree, input_tree):
-  """Yields elements `input_tree` partially flattened up to `shallow_tree`."""
-  if is_sequence(shallow_tree):
-    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
-                                            _yield_value(input_tree)):
-      for input_leaf in _yield_flat_up_to(shallow_branch, input_branch):
-        yield input_leaf
+  Yields:
+    Pairs of (path, value), where path the tuple path of a leaf node in
+    shallow_tree, and value is the value of the corresponding node in
+    input_tree.
+  """
+  if (isinstance(shallow_tree, _six.string_types) or
+      not any([isinstance(shallow_tree, _collections.Sequence),
+               isinstance(shallow_tree, _collections.Mapping),
+               _is_namedtuple(shallow_tree),
+               _is_attrs(shallow_tree)])):
+    yield (path, input_tree)
   else:
-    yield input_tree
+    input_tree = dict(_yield_sorted_items(input_tree))
+    for shallow_key, shallow_subtree in _yield_sorted_items(shallow_tree):
+      subpath = path + (shallow_key,)
+      input_subtree = input_tree[shallow_key]
+      for leaf_path, leaf_value in _yield_flat_up_to(shallow_subtree,
+                                                     input_subtree,
+                                                     path=subpath):
+        yield (leaf_path, leaf_value)
 
 
 def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
@@ -509,15 +632,15 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
 
   The following code will raise an exception:
   ```python
-    shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"], "f"]
+    shallow_tree = {"a": "A", "b": "B"}
+    input_tree = {"a": 1, "c": 2}
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
   The following code will not raise an exception:
   ```python
     shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"]]
+    input_tree = ["c", ["d", "e"], "f"]
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
@@ -549,40 +672,34 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
       input_is_namedtuple = _is_namedtuple(input_tree, False)
       if shallow_is_namedtuple and input_is_namedtuple:
         if not _same_namedtuples(shallow_tree, input_tree):
-          raise TypeError(
-              "The two namedtuples don't have the same sequence type. Input "
-              "structure has type %s, while shallow structure has type %s."
-              % (type(input_tree), type(shallow_tree)))
+          raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+              input_type=type(input_tree),
+              shallow_type=type(shallow_tree)))
+
       elif not (isinstance(shallow_tree, _collections.Mapping)
                 and isinstance(input_tree, _collections.Mapping)):
-        raise TypeError(
-            "The two structures don't have the same sequence type. Input "
-            "structure has type %s, while shallow structure has type %s."
-            % (type(input_tree), type(shallow_tree)))
+        raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            input_type=type(input_tree),
+            shallow_type=type(shallow_tree)))
 
-    if len(input_tree) != len(shallow_tree):
-      raise ValueError(
-          "The two structures don't have the same sequence length. Input "
-          "structure has length %s, while shallow structure has length %s."
-          % (len(input_tree), len(shallow_tree)))
-
-    if check_types and isinstance(shallow_tree, (dict, _collections.Mapping)):
-      if set(input_tree) != set(shallow_tree):
-        raise ValueError(
-            "The two structures don't have the same keys. Input "
-            "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+    if len(input_tree) < len(shallow_tree):
+      raise ValueError(_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+          input_size=len(input_tree),
+          shallow_size=len(shallow_tree)))
 
-      input_tree = list(sorted(_six.iteritems(input_tree)))
-      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
+    if isinstance(shallow_tree, _collections.Mapping):
+      absent_keys = set(shallow_tree) - set(input_tree)
+      if absent_keys:
+        raise ValueError(_SHALLOW_TREE_HAS_INVALID_KEYS
+                         .format(sorted(absent_keys)))
 
-    for shallow_branch, input_branch in zip(shallow_tree, input_tree):
+    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
+                                            _yield_value(input_tree)):
       assert_shallow_structure(shallow_branch, input_branch,
                                check_types=check_types)
 
 
-def flatten_up_to(shallow_tree, input_tree):
+def flatten_up_to(shallow_tree, input_tree, check_types=True):
   """Flattens `input_tree` up to `shallow_tree`.
 
   Any further depth in structure in `input_tree` is retained as elements in the
@@ -597,8 +714,8 @@ def flatten_up_to(shallow_tree, input_tree):
   of the nested structure. We achieve this by specifying a shallow structure,
   `shallow_tree`, we wish to flatten up to.
 
-  The input, `input_tree`, can be thought of as having the same structure as
-  `shallow_tree`, but with leaf nodes that are themselves tree structures.
+  The input, `input_tree`, can be thought of as having the same structure layout
+  as `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
   Examples:
 
@@ -639,6 +756,103 @@ def flatten_up_to(shallow_tree, input_tree):
     shallow_tree: a possibly pruned structure of input_tree.
     input_tree: an arbitrarily nested structure or a scalar object.
       Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
+
+  Returns:
+    A Python list, the partially flattened version of `input_tree` according to
+    the structure of `shallow_tree`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+  """
+  assert_shallow_structure(shallow_tree, input_tree, check_types)
+  # Discard paths returned by _yield_flat_up_to.
+  return list(v for _, v in _yield_flat_up_to(shallow_tree, input_tree))
+
+
+def flatten_with_tuple_paths_up_to(shallow_tree, input_tree, check_types=True):
+  """Flattens `input_tree` up to `shallow_tree`.
+
+  Any further depth in structure in `input_tree` is retained as elements in the
+  partially flattened output.
+
+  Returns a list of (path, value) pairs, where value a leaf node in the
+  flattened tree, and path is the tuple path of that leaf in input_tree.
+
+  If `shallow_tree` and `input_tree` are not sequences, this returns a
+  single-element list: `[((), input_tree)]`.
+
+  Use Case:
+
+  Sometimes we may wish to partially flatten a nested sequence, retaining some
+  of the nested structure. We achieve this by specifying a shallow structure,
+  `shallow_tree`, we wish to flatten up to.
+
+  The input, `input_tree`, can be thought of as having the same structure layout
+  as `shallow_tree`, but with leaf nodes that are themselves tree structures.
+
+  Examples:
+
+  ```python
+  input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+  shallow_tree = [[True, True], [False, True]]
+
+  flattened_input_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                        input_tree)
+  flattened_shallow_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                          shallow_tree)
+
+  # Output is:
+  # [((0, 0), [2, 2]),
+  #  ((0, 1), [3, 3]),
+  #  ((1, 0), [4, 9]),
+  #  ((1, 1), [5, 5])]
+  #
+  # [((0, 0), True),
+  #  ((0, 1), True),
+  #  ((1, 0), False),
+  #  ((1, 1), True)]
+  ```
+
+  ```python
+  input_tree = [[('a', 1), [('b', 2), [('c', 3), [('d', 4)]]]]]
+  shallow_tree = [['level_1', ['level_2', ['level_3', ['level_4']]]]]
+
+  input_tree_flattened_as_shallow_tree = flatten_up_to(shallow_tree, input_tree)
+  input_tree_flattened = flatten(input_tree)
+
+  # Output is:
+  # [((0, 0), ('a', 1)),
+  #  ((0, 1, 0), ('b', 2)),
+  #  ((0, 1, 1, 0), ('c', 3)),
+  #  ((0, 1, 1, 1), ('d', 4))]
+  # ['a', 1, 'b', 2, 'c', 3, 'd', 4]
+  ```
+
+  Non-Sequence Edge Cases:
+
+  ```python
+  flatten_with_tuple_paths_up_to(0, 0)  # Output: [(), 0]
+
+  flatten_with_tuple_paths_up_to(0, [0, 1, 2])  # Output: [(), [0, 1, 2]]
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], 0)  # Output: TypeError
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], [0, 1, 2])
+  # Output: [((0,) 0), ((1,), 1), ((2,), 2)]
+  ```
+
+  Args:
+    shallow_tree: a possibly pruned structure of input_tree.
+    input_tree: an arbitrarily nested structure or a scalar object.
+      Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
 
   Returns:
     A Python list, the partially flattened version of `input_tree` according to
@@ -651,11 +865,11 @@ def flatten_up_to(shallow_tree, input_tree):
     ValueError: If the sequence lengths of `shallow_tree` are different from
       `input_tree`.
   """
-  assert_shallow_structure(shallow_tree, input_tree)
+  assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
   return list(_yield_flat_up_to(shallow_tree, input_tree))
 
 
-def map_structure_up_to(shallow_tree, func, *inputs):
+def map_structure_up_to(shallow_tree, func, *inputs, **kwargs):
   """Applies a function or op to a number of partially flattened inputs.
 
   The `inputs` are flattened up to `shallow_tree` before being mapped.
@@ -667,7 +881,7 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   achieve this by specifying a shallow structure, `shallow_tree` we wish to
   flatten up to.
 
-  The `inputs`, can be thought of as having the same structure as
+  The `inputs`, can be thought of as having the same structure layout as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
   This function therefore will return something with the same base structure as
@@ -675,6 +889,14 @@ def map_structure_up_to(shallow_tree, func, *inputs):
 
   Examples:
 
+  ```python
+  shallow_tree = [None, None]
+  inp_val = [1, 2, 3]
+  out = map_structure_up_to(shallow_tree, lambda x: 2 * x, inp_val)
+
+  # Output is: [2, 4]
+  ```
+
   ```python
   ab_tuple = collections.namedtuple("ab_tuple", "a, b")
   op_tuple = collections.namedtuple("op_tuple", "add, mul")
@@ -704,6 +926,11 @@ def map_structure_up_to(shallow_tree, func, *inputs):
         shallow_tree. The function `func` is applied to corresponding
         partially flattened elements of each input, so the function must support
         arity of `len(inputs)`.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
@@ -713,19 +940,97 @@ def map_structure_up_to(shallow_tree, func, *inputs):
       `input_tree`.
 
   Returns:
-    result of repeatedly applying `func`, with same structure as
+    result of repeatedly applying `func`, with the same structure layout as
+    `shallow_tree`.
+  """
+  return map_structure_with_tuple_paths_up_to(
+      shallow_tree,
+      lambda _, *values: func(*values),  # Discards the path arg.
+      *inputs,
+      **kwargs)
+
+
+def map_structure_with_tuple_paths_up_to(shallow_tree, func, *inputs, **kwargs):
+  """Applies a function or op to a number of partially flattened inputs.
+
+  Like map_structure_up_to(), except that the 'func' argument takes a path
+  tuple as its first argument, followed by the corresponding values from
+  *inputs.
+
+  Example:
+
+  lowercase = {'a': 'a', 'b': ('b0', 'b1')}
+  uppercase = {'a': 'A', 'b': ('B0', 'B1')}
+
+  def print_path_and_values(path, *values):
+    print("path: {}, values: {}".format(path, values))
+
+  shallow_tree = {'a': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 0), values: ('b0', 'B0')
+  >>> path: ('b', 1), values: ('b1', 'B1')
+
+  shallow_tree = {'b': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('b', 1), values: (('bo', 'b1'), ('B0', 'B1'))
+
+  shallow_tree = {'a': None, 'b': {1: None}}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 1), values: ('b1', B1')
+
+  Args:
+    shallow_tree: a shallow tree, common to all the inputs.
+    func: callable that takes args (path, inputs_0_value, ... , inputs_N_value),
+      where path is a tuple path to a leaf node in shallow_tree, and
+      inputs_i_value is the corresponding value from inputs[i].
+    *inputs: nested structures that are all structurally compatible with
+        shallow_tree.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but one of `*inputs` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+
+  Returns:
+    Result of repeatedly applying `func`. Has the same structure layout as
     `shallow_tree`.
   """
   if not inputs:
     raise ValueError("Cannot map over no sequences")
+
+  check_types = kwargs.pop("check_types", True)
+
   for input_tree in inputs:
-    assert_shallow_structure(shallow_tree, input_tree)
+    assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
 
   # Flatten each input separately, apply the function to corresponding elements,
   # then repack based on the structure of the first input.
-  all_flattened_up_to = [flatten_up_to(shallow_tree, input_tree)
-                         for input_tree in inputs]
-  results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
+  flat_value_lists = [flatten_up_to(shallow_tree, input_tree, check_types)
+                      for input_tree in inputs]
+  flat_path_list = [path for path, _
+                    in _yield_flat_up_to(shallow_tree, inputs[0])]
+  results = [func(*args, **kwargs) for args in zip(flat_path_list,
+                                                   *flat_value_lists)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
 
@@ -824,27 +1129,8 @@ def yield_flat_paths(nest):
     Tuples containing index or key values which form the path to a specific
       leaf value in the nested structure.
   """
-
-  # The _maybe_add_final_path_element function is used below in order to avoid
-  # adding trailing slashes when the sub-element recursed into is a leaf.
-  if isinstance(nest, (dict, _collections.Mapping)):
-    for key in _sorted(nest):
-      value = nest[key]
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif _is_namedtuple(nest):
-    for key in nest._fields:
-      value = getattr(nest, key)
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif isinstance(nest, _six.string_types):
-    yield ()
-  elif isinstance(nest, _collections.Sequence):
-    for idx, value in enumerate(nest):
-      for sub_path in yield_flat_paths(value):
-        yield (idx,) + sub_path
-  else:
-    yield ()
+  for k, _ in _yield_flat_up_to(nest, nest):
+    yield k
 
 
 def flatten_with_joined_string_paths(structure, separator="/"):
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 83fa5dd66084e7d6710505bc638cdc7ae4f9bbe3..0540f71f7a98b3fd574c98ae5d0406a4b5d94ff5 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -231,17 +231,17 @@ class NestTest(parameterized.TestCase, test.TestCase):
                             ["and", "goodbye", "again"])
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
-  def testIsSequence(self):
-    self.assertFalse(nest.is_sequence("1234"))
-    self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
-    self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
-    self.assertTrue(nest.is_sequence([]))
-    self.assertTrue(nest.is_sequence({"a": 1, "b": 2}))
-    self.assertFalse(nest.is_sequence(set([1, 2])))
+  def testIsNested(self):
+    self.assertFalse(nest.is_nested("1234"))
+    self.assertTrue(nest.is_nested([1, 3, [4, 5]]))
+    self.assertTrue(nest.is_nested(((7, 8), (5, 6))))
+    self.assertTrue(nest.is_nested([]))
+    self.assertTrue(nest.is_nested({"a": 1, "b": 2}))
+    self.assertFalse(nest.is_nested(set([1, 2])))
     ones = array_ops.ones([2, 3])
-    self.assertFalse(nest.is_sequence(ones))
-    self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
-    self.assertFalse(nest.is_sequence(np.ones((4, 5))))
+    self.assertFalse(nest.is_nested(ones))
+    self.assertFalse(nest.is_nested(math_ops.tanh(ones)))
+    self.assertFalse(nest.is_nested(np.ones((4, 5))))
 
   @parameterized.parameters({"mapping_type": _CustomMapping},
                             {"mapping_type": dict})
@@ -510,30 +510,28 @@ class NestTest(parameterized.TestCase, test.TestCase):
   def testAssertShallowStructure(self):
     inp_ab = ["a", "b"]
     inp_abc = ["a", "b", "c"]
-    expected_message = (
-        "The two structures don't have the same sequence length. Input "
-        "structure has length 2, while shallow structure has length 3.")
-    with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_shallow_structure(inp_abc, inp_ab)
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+            shallow_size=len(inp_abc),
+            input_size=len(inp_ab))):
+      nest.assert_shallow_structure(shallow_tree=inp_abc, input_tree=inp_ab)
 
     inp_ab1 = [(1, 1), (2, 2)]
     inp_ab2 = [[1, 1], [2, 2]]
-    expected_message = (
-        "The two structures don't have the same sequence type. Input structure "
-        "has type <(type|class) 'tuple'>, while shallow structure has type "
-        "<(type|class) 'list'>.")
-    with self.assertRaisesRegexp(TypeError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            shallow_type=type(inp_ab2[0]),
+            input_type=type(inp_ab1[0]))):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
     nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
-    expected_message = (
-        r"The two structures don't have the same keys. Input "
-        r"structure has keys \['c'\], while shallow structure has "
-        r"keys \['d'\].")
-
-    with self.assertRaisesRegexp(ValueError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["d"])):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
     inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
@@ -688,6 +686,244 @@ class NestTest(parameterized.TestCase, test.TestCase):
     flattened_shallow_tree = nest.flatten_up_to(shallow_tree, shallow_tree)
     self.assertEqual(flattened_shallow_tree, shallow_tree)
 
+  def testFlattenWithTuplePathsUpTo(self):
+    def get_paths_and_values(shallow_tree, input_tree):
+      path_value_pairs = nest.flatten_with_tuple_paths_up_to(shallow_tree,
+                                                             input_tree)
+      paths = [p for p, _ in path_value_pairs]
+      values = [v for _, v in path_value_pairs]
+      return paths, values
+
+    # Shallow tree ends at scalar.
+    input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+    shallow_tree = [[True, True], [False, True]]
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_input_tree, [[2, 2], [3, 3], [4, 9], [5, 5]])
+    self.assertEqual(flattened_shallow_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_shallow_tree, [True, True, False, True])
+
+    # Shallow tree ends at string.
+    input_tree = [[("a", 1), [("b", 2), [("c", 3), [("d", 4)]]]]]
+    shallow_tree = [["level_1", ["level_2", ["level_3", ["level_4"]]]]]
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    input_tree_flattened_paths = [p for p, _ in
+                                  nest.flatten_with_tuple_paths(input_tree)]
+    input_tree_flattened = nest.flatten(input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [(0, 0), (0, 1, 0), (0, 1, 1, 0), (0, 1, 1, 1, 0)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [("a", 1), ("b", 2), ("c", 3), ("d", 4)])
+
+    self.assertEqual(input_tree_flattened_paths,
+                     [(0, 0, 0), (0, 0, 1),
+                      (0, 1, 0, 0), (0, 1, 0, 1),
+                      (0, 1, 1, 0, 0), (0, 1, 1, 0, 1),
+                      (0, 1, 1, 1, 0, 0), (0, 1, 1, 1, 0, 1)])
+    self.assertEqual(input_tree_flattened, ["a", 1, "b", 2, "c", 3, "d", 4])
+
+    # Make sure dicts are correctly flattened, yielding values, not keys.
+    input_tree = {"a": 1, "b": {"c": 2}, "d": [3, (4, 5)]}
+    shallow_tree = {"a": 0, "b": 0, "d": [0, 0]}
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",), ("d", 0), ("d", 1)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [1, {"c": 2}, 3, (4, 5)])
+
+    # Namedtuples.
+    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    input_tree = ab_tuple(a=[0, 1], b=2)
+    shallow_tree = ab_tuple(a=0, b=1)
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [[0, 1], 2])
+
+    # Nested dicts, OrderedDicts and namedtuples.
+    input_tree = collections.OrderedDict(
+        [("a", ab_tuple(a=[0, {"b": 1}], b=2)),
+         ("c", {"d": 3, "e": collections.OrderedDict([("f", 4)])})])
+    shallow_tree = input_tree
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a", "a", 0),
+                      ("a", "a", 1, "b"),
+                      ("a", "b"),
+                      ("c", "d"),
+                      ("c", "e", "f")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",),
+                      ("c", "d"),
+                      ("c", "e")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      3,
+                      collections.OrderedDict([("f", 4)])])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", 0)])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("c",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      {"d": 3, "e": collections.OrderedDict([("f", 4)])}])
+
+    ## Shallow non-list edge-case.
+    # Using iterable elements.
+    input_tree = ["input_tree"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = ["input_tree_0", "input_tree_1"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Test case where len(shallow_tree) < len(input_tree)
+    input_tree = {"a": "A", "b": "B", "c": "C"}
+    shallow_tree = {"a": 1, "c": 2}
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_input_tree, ["A", "C"])
+    self.assertEqual(flattened_shallow_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_shallow_tree, [1, 2])
+
+    # Using non-iterable elements.
+    input_tree = [0]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = [0, 1]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Both non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = 0
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Input non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree_9", "shallow_tree_8"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = [9]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = 0
+    shallow_tree = [9, 8]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
   def testMapStructureUpTo(self):
     # Named tuples.
     ab_tuple = collections.namedtuple("ab_tuple", "a, b")
@@ -719,7 +955,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dicts.
     inp_val = dict(a=2, b=3)
     inp_ops = dict(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -736,7 +974,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dict/mapping.
     inp_val = dict(a=2, b=3)
     inp_ops = _CustomMapping(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -849,12 +1089,12 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(expected, result)
 
   @parameterized.named_parameters(
-      ("tuples", (1, 2), (3, 4, 5), ValueError),
+      ("tuples", (1, 2, 3), (4, 5), ValueError),
       ("dicts", {"a": 1}, {"b": 2}, ValueError),
       ("mixed", (1, 2), [3, 4], TypeError),
       ("nested",
-       {"a": [2, 3], "b": [1, 3]},
-       {"b": [5, 6, 7], "a": [8, 9]},
+       {"a": [2, 3, 4], "b": [1, 3]},
+       {"b": [5, 6], "a": [8, 9]},
        ValueError
       ))
   def testMapWithPathsIncompatibleStructures(self, s1, s2, error_type):
@@ -884,13 +1124,14 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(expected, result)
 
   @parameterized.named_parameters([
-      dict(testcase_name="Tuples", s1=(1, 2), s2=(3, 4, 5),
+      dict(testcase_name="Tuples", s1=(1, 2, 3), s2=(4, 5),
            error_type=ValueError),
       dict(testcase_name="Dicts", s1={"a": 1}, s2={"b": 2},
            error_type=ValueError),
       dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4], error_type=TypeError),
       dict(testcase_name="Nested",
-           s1={"a": [2, 3], "b": [1, 3]}, s2={"b": [5, 6, 7], "a": [8, 9]},
+           s1={"a": [2, 3, 4], "b": [1, 3]},
+           s2={"b": [5, 6], "a": [8, 9]},
            error_type=ValueError)
   ])
   def testMapWithTuplePathsIncompatibleStructures(self, s1, s2, error_type):
diff --git a/tensorflow/python/util/serialization.py b/tensorflow/python/util/serialization.py
index cff864c0304b02aaa6339efb403388c65ab6fec4..2164ba4dbf22b46e7fad3ac45a164ddbdd2f01c0 100644
--- a/tensorflow/python/util/serialization.py
+++ b/tensorflow/python/util/serialization.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
@@ -61,4 +63,7 @@ def get_json_type(obj):
   if isinstance(obj, tensor_shape.TensorShape):
     return obj.as_list()
 
+  if isinstance(obj, collections.Mapping):
+    return dict(obj)
+
   raise TypeError('Not JSON Serializable:', obj)
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index f018e1a1bd35f0111cacc20e678c0466bfd5f2e3..cf380509e9f7c547f6a518971578105bf69e4b9a 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -59,7 +59,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools as _functools
+import inspect
 import traceback as _traceback
 
 
@@ -95,6 +95,11 @@ def make_decorator(target,
     decorator_func.__name__ = target.__name__
   if hasattr(target, '__module__'):
     decorator_func.__module__ = target.__module__
+  if hasattr(target, '__dict__'):
+    # Copy dict entries from target which are not overridden by decorator_func.
+    for name in target.__dict__:
+      if name not in decorator_func.__dict__:
+        decorator_func.__dict__[name] = target.__dict__[name]
   if hasattr(target, '__doc__'):
     decorator_func.__doc__ = decorator.__doc__
   decorator_func.__wrapped__ = target
@@ -133,6 +138,10 @@ def rewrap(decorator_func, previous_target, new_target):
     decorator_func: Callable returned by `wrap`.
     previous_target: Callable that needs to be replaced.
     new_target: Callable to replace previous_target with.
+
+  Returns:
+    The updated decorator. If decorator_func is not a tf_decorator, new_target
+    is returned.
   """
   # Because the process mutates the decorator, we only need to alter the
   # innermost function that wraps previous_target.
@@ -145,12 +154,31 @@ def rewrap(decorator_func, previous_target, new_target):
     if target.decorated_target is previous_target:
       break
     cur = target.decorated_target
+    assert cur is not None
 
+  # If decorator_func is not a decorator, new_target replaces it directly.
   if innermost_decorator is None:
-    return
+    # Consistency check. The caller should always pass the result of
+    # tf_decorator.unwrap as previous_target. If decorator_func is not a
+    # decorator, that will have returned decorator_func itself.
+    assert decorator_func is previous_target
+    return new_target
 
   target.decorated_target = new_target
-  innermost_decorator.__wrapped__ = new_target
+
+  if inspect.ismethod(innermost_decorator):
+    # Bound methods can't be assigned attributes. Thankfully, they seem to
+    # be just proxies for their unbound counterpart, and we can modify that.
+    if hasattr(innermost_decorator, '__func__'):
+      innermost_decorator.__func__.__wrapped__ = new_target
+    elif hasattr(innermost_decorator, 'im_func'):
+      innermost_decorator.im_func.__wrapped__ = new_target
+    else:
+      innermost_decorator.__wrapped__ = new_target
+  else:
+    innermost_decorator.__wrapped__ = new_target
+
+  return decorator_func
 
 
 def unwrap(maybe_tf_decorator):
@@ -207,8 +235,8 @@ class TFDecorator(object):
     else:
       self.__doc__ = ''
 
-  def __get__(self, obj, objtype):
-    return _functools.partial(self.__call__, obj)
+  def __get__(self, instance, owner):
+    return self._decorated_target.__get__(instance, owner)
 
   def __call__(self, *args, **kwargs):
     return self._decorated_target(*args, **kwargs)
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 9198f0b3fad1590bedac71b30cf332e35cb489fe..48d735189cdb0acb394747aa3a99864393ccda7b 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -170,6 +170,17 @@ class TfDecoratorTest(test.TestCase):
     self.assertEqual('Return parameters.',
                      TestDecoratedClass().return_params.__doc__)
 
+  def testTarget__get__IsProxied(self):
+    class Descr(object):
+
+      def __get__(self, instance, owner):
+        return self
+
+    class Foo(object):
+      foo = tf_decorator.TFDecorator('Descr', Descr())
+
+    self.assertIsInstance(Foo.foo, Descr)
+
 
 def test_wrapper(*args, **kwargs):
   return test_function(*args, **kwargs)
@@ -199,6 +210,20 @@ class TfMakeDecoratorTest(test.TestCase):
     decorator = getattr(decorated, '_tf_decorator')
     self.assertEqual('test decorator doc', decorator.decorator_doc)
 
+  def testUpdatesDictWithMissingEntries(self):
+    test_function.foobar = True
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertTrue(decorated.foobar)
+    del test_function.foobar
+
+  def testUpdatesDict_doesNotOverridePresentEntries(self):
+    test_function.foobar = True
+    test_wrapper.foobar = False
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertFalse(decorated.foobar)
+    del test_function.foobar
+    del test_wrapper.foobar
+
   def testSetsTFDecoratorArgSpec(self):
     argspec = tf_inspect.ArgSpec(
         args=['a', 'b', 'c'],
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index 90c9c4b5b38dad824f4132513cc71a82fafcbf92..04c96d03617ebf6d9884b6b70ee4b952470c2a01 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -46,6 +46,7 @@ import functools
 import sys
 
 from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 ESTIMATOR_API_NAME = 'estimator'
 KERAS_API_NAME = 'keras'
@@ -169,7 +170,7 @@ def get_v1_names(symbol):
   estimator_api_attr_v1 = API_ATTRS_V1[ESTIMATOR_API_NAME].names
   keras_api_attr_v1 = API_ATTRS_V1[KERAS_API_NAME].names
 
-  if not hasattr(symbol, tensorflow_api_attr_v1):
+  if not hasattr(symbol, '__dict__'):
     return names_v1
   if tensorflow_api_attr_v1 in symbol.__dict__:
     names_v1.extend(getattr(symbol, tensorflow_api_attr_v1))
@@ -195,7 +196,7 @@ def get_v2_names(symbol):
   estimator_api_attr = API_ATTRS[ESTIMATOR_API_NAME].names
   keras_api_attr = API_ATTRS[KERAS_API_NAME].names
 
-  if not hasattr(symbol, tensorflow_api_attr):
+  if not hasattr(symbol, '__dict__'):
     return names_v2
   if tensorflow_api_attr in symbol.__dict__:
     names_v2.extend(getattr(symbol, tensorflow_api_attr))
@@ -251,7 +252,7 @@ def get_v2_constants(module):
 class api_export(object):  # pylint: disable=invalid-name
   """Provides ways to export symbols to the TensorFlow API."""
 
-  def __init__(self, *args, **kwargs):
+  def __init__(self, *args, **kwargs):  # pylint: disable=g-doc-args
     """Export under the names *args (first one is considered canonical).
 
     Args:
@@ -269,6 +270,10 @@ class api_export(object):  # pylint: disable=invalid-name
     """
     self._names = args
     self._names_v1 = kwargs.get('v1', args)
+    if 'v2' in kwargs:
+      raise ValueError('You passed a "v2" argument to tf_export. This is not '
+                       'what you want. Pass v2 names directly as positional '
+                       'arguments instead.')
     self._api_name = kwargs.get('api_name', TENSORFLOW_API_NAME)
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
@@ -375,6 +380,21 @@ class api_export(object):  # pylint: disable=invalid-name
         (self._names_v1, name))
 
 
+def kwarg_only(f):
+  """A wrapper that throws away all non-kwarg arguments."""
+  f_argspec = tf_inspect.getargspec(f)
+
+  def wrapper(*args, **kwargs):
+    if args:
+      raise TypeError(
+          '{f} only takes keyword args (possible keys: {kwargs}). '
+          'Please pass these args as kwargs instead.'
+          .format(f=f.__name__, kwargs=f_argspec.args))
+    return f(**kwargs)
+
+  return tf_decorator.make_decorator(f, wrapper, decorator_argspec=f_argspec)
+
+
 tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
 estimator_export = functools.partial(api_export, api_name=ESTIMATOR_API_NAME)
 keras_export = functools.partial(api_export, api_name=KERAS_API_NAME)
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 5f1e776640df3e2b75e6a0b8accfce40098cf36c..906776d5dbae7356d68cf0ff39fd29e10c96a40e 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -36,6 +36,19 @@ else:
       'annotations'
   ])
 
+
+def _convert_maybe_argspec_to_fullargspec(argspec):
+  if isinstance(argspec, FullArgSpec):
+    return argspec
+  return FullArgSpec(
+      args=argspec.args,
+      varargs=argspec.varargs,
+      varkw=argspec.keywords,
+      defaults=argspec.defaults,
+      kwonlyargs=[],
+      kwonlydefaults=None,
+      annotations={})
+
 if hasattr(_inspect, 'getfullargspec'):
   _getfullargspec = _inspect.getfullargspec  # pylint: disable=invalid-name
 
@@ -74,16 +87,7 @@ else:
     Returns:
       A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations.
     """
-    argspecs = getargspec(target)
-    fullargspecs = FullArgSpec(
-        args=argspecs.args,
-        varargs=argspecs.varargs,
-        varkw=argspecs.keywords,
-        defaults=argspecs.defaults,
-        kwonlyargs=[],
-        kwonlydefaults=None,
-        annotations={})
-    return fullargspecs
+    return _convert_maybe_argspec_to_fullargspec(getargspec(target))
 
 
 def currentframe():
@@ -194,7 +198,9 @@ def _get_argspec_for_partial(obj):
   # Partial function may give default value to any argument, therefore length
   # of default value list must be len(args) to allow each argument to
   # potentially be given a default value.
-  all_defaults = [None] * len(args)
+  no_default = object()
+  all_defaults = [no_default] * len(args)
+
   if defaults:
     all_defaults[-len(defaults):] = defaults
 
@@ -204,7 +210,8 @@ def _get_argspec_for_partial(obj):
     all_defaults[idx] = default
 
   # Find first argument with default value set.
-  first_default = next((idx for idx, x in enumerate(all_defaults) if x), None)
+  first_default = next(
+      (idx for idx, x in enumerate(all_defaults) if x is not no_default), None)
 
   # If no default values are found, return ArgSpec with defaults=None.
   if first_default is None:
@@ -212,7 +219,8 @@ def _get_argspec_for_partial(obj):
 
   # Checks if all arguments have default value set after first one.
   invalid_default_values = [
-      args[i] for i, j in enumerate(all_defaults) if not j and i > first_default
+      args[i] for i, j in enumerate(all_defaults)
+      if j is no_default and i > first_default
   ]
 
   if invalid_default_values:
@@ -238,7 +246,7 @@ def getfullargspec(obj):
     directly on the callable.
   """
   decorators, target = tf_decorator.unwrap(obj)
-  return next((d.decorator_argspec
+  return next((_convert_maybe_argspec_to_fullargspec(d.decorator_argspec)
                for d in decorators
                if d.decorator_argspec is not None), _getfullargspec(target))
 
@@ -380,3 +388,22 @@ def isroutine(object):  # pylint: disable=redefined-builtin
 def stack(context=1):
   """TFDecorator-aware replacement for inspect.stack."""
   return _inspect.stack(context)[1:]
+
+
+def getsource_no_unwrap(obj):
+  """Return source code for an object. Does not unwrap TFDecorators.
+
+  The source code is returned literally, including indentation for functions not
+  at the top level. This function is analogous to inspect.getsource, with one
+  key difference - it doesn't unwrap decorators. For simplicity, support for
+  some Python object types is dropped (tracebacks, frames, code objects).
+
+  Args:
+      obj: a class, method, or function object.
+
+  Returns:
+      source code as a string
+
+  """
+  lines, lnum = _inspect.findsource(obj)
+  return ''.join(_inspect.getblock(lines[lnum:]))
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 02d075cdff97fc11274186b42e10d71744234364..7c030d692160a3dbcc6259961cd024cea35f64de 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -122,6 +122,20 @@ class TfInspectTest(test.TestCase):
 
     self.assertEqual(argspec, tf_inspect.getargspec(partial_func))
 
+  def testGetArgSpecOnPartialArgumentWithConvertibleToFalse(self):
+    """Tests getargspec on partial function with args that convert to False."""
+
+    def func(m, n):
+      return 2 * m + n
+
+    partial_func = functools.partial(func, m=0)
+
+    exception_message = (r"Some arguments \['n'\] do not have default value, "
+                         "but they are positioned after those with default "
+                         "values. This can not be expressed with ArgSpec.")
+    with self.assertRaisesRegexp(ValueError, exception_message):
+      tf_inspect.getargspec(partial_func)
+
   def testGetArgSpecOnPartialInvalidArgspec(self):
     """Tests getargspec on partial function that doesn't have valid argspec."""
 
@@ -727,6 +741,73 @@ class TfInspectGetCallArgsTest(test.TestCase):
         'c': 'goodbye'
     }, tf_inspect.getcallargs(decorated, 4, c='goodbye'))
 
+  def testGetSourceNoUnwrapHandlesPlainDecorator(self):
+    def dec(f):
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+      return wrapper
+
+    @dec
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesFunctoolsDecorator(self):
+    def dec(f):
+      @functools.wraps(f)
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+      return wrapper
+
+    @dec
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesPlainDecoratorFactory(self):
+    def dec_factory():
+      def dec(f):
+        def wrapper(*args, **kwargs):
+          return f(*args, **kwargs)
+        return wrapper
+      return dec
+
+    @dec_factory()
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('factory', source)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesFunctoolsDecoratorFactory(self):
+    def dec_factory():
+      def dec(f):
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+          return f(*args, **kwargs)
+        return wrapper
+      return dec
+
+    @dec_factory()
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('factory', source)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39..bda0cba82fa31528337cd35d26f5daa577a43d55 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -455,6 +455,14 @@ class SparseTensorValueIterator : public ValueIterator {
   Safe_PyObjectPtr tensor_;
 };
 
+// Returns nullptr (to raise an exception) when next() is called.  Caller
+// should have already called PyErr_SetString.
+class ErrorValueIterator : public ValueIterator {
+ public:
+  ErrorValueIterator() {}
+  Safe_PyObjectPtr next() override { return nullptr; }
+};
+
 class AttrsValueIterator : public ValueIterator {
  public:
   explicit AttrsValueIterator(PyObject* nested) : nested_(nested) {
@@ -497,6 +505,35 @@ bool IsSparseTensorValueType(PyObject* o) {
              o, reinterpret_cast<PyTypeObject*>(sparse_tensor_value_type)) == 1;
 }
 
+// Returns 1 if `o` is an instance of CompositeTensor.
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+bool IsCompositeTensorHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    PyObject* composite_tensor_type = GetRegisteredType("CompositeTensor");
+    if (TF_PREDICT_FALSE(composite_tensor_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "CompositeTensor type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"CompositeTensor\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    int is_instance = PyObject_IsInstance(to_check, composite_tensor_type);
+
+    // Don't cache a failed is_instance check.
+    if (is_instance == -1) return -1;
+
+    return static_cast<int>(is_instance != 0);
+  });
+  return check_cache->CachedLookup(o);
+}
+
+int IsSequenceOrCompositeHelper(PyObject* o) {
+  return IsSequence(o) || IsCompositeTensor(o);
+}
+
 int IsSequenceForDataHelper(PyObject* o) {
   return IsSequenceHelper(o) == 1 && !PyList_Check(o) &&
          !IsSparseTensorValueType(o);
@@ -529,6 +566,18 @@ ValueIteratorPtr GetValueIteratorForData(PyObject* nested) {
   }
 }
 
+// Similar to GetValueIterator above, but expands CompositeTensors.
+ValueIteratorPtr GetValueIteratorForComposite(PyObject* nested) {
+  if (IsCompositeTensor(nested)) {
+    static char expand_method_name[] = "_to_components";
+    nested = PyObject_CallMethod(nested, expand_method_name, nullptr);
+    if (PyErr_Occurred() || nested == nullptr) {
+      return absl::make_unique<ErrorValueIterator>();
+    }
+  }
+  return GetValueIterator(nested);
+}
+
 bool FlattenHelper(
     PyObject* nested, PyObject* list,
     const std::function<int(PyObject*)>& is_sequence_helper,
@@ -596,7 +645,8 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
 bool AssertSameStructureHelper(
     PyObject* o1, PyObject* o2, bool check_types, string* error_msg,
     bool* is_type_error,
-    const std::function<int(PyObject*)>& is_sequence_helper) {
+    const std::function<int(PyObject*)>& is_sequence_helper,
+    const std::function<ValueIteratorPtr(PyObject*)>& value_iterator_getter) {
   DCHECK(error_msg);
   DCHECK(is_type_error);
   const bool is_seq1 = is_sequence_helper(o1);
@@ -702,8 +752,8 @@ bool AssertSameStructureHelper(
     }
   }
 
-  ValueIteratorPtr iter1 = GetValueIterator(o1);
-  ValueIteratorPtr iter2 = GetValueIterator(o2);
+  ValueIteratorPtr iter1 = value_iterator_getter(o1);
+  ValueIteratorPtr iter2 = value_iterator_getter(o2);
 
   if (!iter1->valid() || !iter2->valid()) return false;
 
@@ -714,9 +764,9 @@ bool AssertSameStructureHelper(
       if (Py_EnterRecursiveCall(" in assert_same_structure")) {
         return false;
       }
-      bool no_internal_errors =
-          AssertSameStructureHelper(v1.get(), v2.get(), check_types, error_msg,
-                                    is_type_error, is_sequence_helper);
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1.get(), v2.get(), check_types, error_msg, is_type_error,
+          is_sequence_helper, value_iterator_getter);
       Py_LeaveRecursiveCall();
       if (!no_internal_errors) return false;
       if (!error_msg->empty()) return true;
@@ -742,9 +792,13 @@ bool IsAttrs(PyObject* o) { return IsAttrsHelper(o) == 1; }
 bool IsTensor(PyObject* o) { return IsTensorHelper(o) == 1; }
 bool IsIndexedSlices(PyObject* o) { return IsIndexedSlicesHelper(o) == 1; }
 
-PyObject* Flatten(PyObject* nested) {
+PyObject* Flatten(PyObject* nested, bool expand_composites) {
   PyObject* list = PyList_New(0);
-  if (FlattenHelper(nested, list, IsSequenceHelper, GetValueIterator)) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
+  if (FlattenHelper(nested, list, is_sequence_helper, get_value_iterator)) {
     return list;
   } else {
     Py_DECREF(list);
@@ -752,6 +806,12 @@ PyObject* Flatten(PyObject* nested) {
   }
 }
 
+bool IsSequenceOrComposite(PyObject* o) {
+  return IsSequenceOrCompositeHelper(o) == 1;
+}
+
+bool IsCompositeTensor(PyObject* o) { return IsCompositeTensorHelper(o) == 1; }
+
 bool IsSequenceForData(PyObject* o) { return IsSequenceForDataHelper(o) == 1; }
 
 PyObject* FlattenForData(PyObject* nested) {
@@ -850,11 +910,16 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
   }
 }
 
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceHelper);
+                            is_sequence_helper, get_value_iterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
@@ -878,7 +943,7 @@ PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceForDataHelper);
+                            IsSequenceForDataHelper, GetValueIterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index f37cd527d819fad36bcac7b914e416bf788c8cb3..4a5db93401c328c056d80f678dd47d66306d53b3 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,30 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsSequenceOrComposite(PyObject* o);
+
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsCompositeTensor(PyObject* o);
+
 // Implements the same interface as tensorflow.util.nest._is_namedtuple
 // Returns Py_True iff `instance` should be considered a `namedtuple`.
 //
@@ -118,7 +142,8 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
 //
 // Returns:
 //  Py_None on success, nullptr on error.
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites);
 
 // Implements the same interface as tensorflow.util.nest.flatten
 //
@@ -139,6 +164,9 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 // Args:
 //   nest: an arbitrarily nested structure or a scalar object. Note, numpy
 //       arrays are considered scalars.
+//   expand_composites: If true, then composite tensors (such as
+//       `tf.SparseTensor` and `tf.RaggedTensor` are flattened into their
+//       component tensors.
 //
 // Returns:
 //   A Python list, the flattened version of the input.
@@ -146,7 +174,7 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 //
 // Raises:
 //   TypeError: The nest is or contains a dict with non-sortable keys.
-PyObject* Flatten(PyObject* nested);
+PyObject* Flatten(PyObject* nested, bool expand_composites = false);
 
 // The tensorflow.python.data package has its own nest utility that follows very
 // slightly different semantics for its functions than the tensorflow.python
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 4d34d61eee65ea48ad4fbb2894699695110fc76c..6e2a3d8ccfc48bd9234e0c42229fb37dd9fa1ce4 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -35,7 +35,7 @@ limitations under the License.
 %noexception tensorflow::swig::IsTensor;
 
 %feature("docstring") tensorflow::swig::IsSequence
-"""Returns a true if its input is a collections.Sequence (except strings).
+"""Returns true if its input is a collections.Sequence (except strings).
 
 Args:
   seq: an input sequence.
@@ -47,6 +47,31 @@ Returns:
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%feature("docstring") tensorflow::swig::IsSequenceOrComposite
+"""Returns true if its input is a sequence or a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a not a string and is a collections.Sequence or a
+  dict or a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsSequenceOrComposite;
+%noexception tensorflow::swig::IsSequenceOrComposite;
+
+%feature("docstring") tensorflow::swig::IsCompositeTensor
+"""Returns true if its input is a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsCompositeTensor;
+%noexception tensorflow::swig::IsCompositeTensor;
+
 %unignore tensorflow::swig::IsNamedtuple;
 %noexception tensorflow::swig::IsNamedtuple;
 
@@ -103,6 +128,8 @@ running.
 Args:
   nest: an arbitrarily nested structure or a scalar object. Note, numpy
       arrays are considered scalars.
+  expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+      and `tf.RaggedTensor` are expanded into their component tensors.
 
 Returns:
   A Python list, the flattened version of the input.
@@ -112,6 +139,7 @@ Raises:
 """
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
+%feature("kwargs") tensorflow::swig::Flatten;
 
 %feature("docstring") tensorflow::swig::IsSequenceForData
 """Returns a true if `seq` is a Sequence or dict (except strings/lists).
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index fca28e016afeef8adbe6718ceda1bb76c8849172..b1f5675cf942d930e175f4d3258d9be68a40fdb3 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -84,6 +84,7 @@ cc_library(
         "stream_executor_internal.h",
     ],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -128,6 +129,7 @@ cc_library(
         "trace_listener.h",
     ],
     deps = [
+        ":allocator_stats",
         ":device_memory",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
@@ -190,6 +192,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -386,6 +389,7 @@ cc_library(
         "trace_listener.h",
     ],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         ":stream_executor_headers",
@@ -460,15 +464,6 @@ tf_proto_library(
     provide_cc_alias = True,
 )
 
-tf_proto_library(
-    name = "logging_proto",
-    srcs = ["logging.proto"],
-    cc_api_version = 2,
-    protodeps = [":dnn_proto"],
-    provide_cc_alias = True,
-    visibility = [":friends"],
-)
-
 cc_library(
     name = "dnn",
     srcs = ["dnn.cc"],
@@ -497,6 +492,7 @@ cc_library(
         "stream_executor_internal.h",
     ],
     deps = [
+        ":allocator_stats",
         ":device_description",
         ":device_memory",
         ":device_options",
@@ -510,6 +506,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
@@ -527,6 +524,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":platform",
         ":stream_executor_headers",
@@ -561,6 +559,7 @@ cc_library(
         "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -601,6 +600,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
         ":host_or_device_scalar",
         "//tensorflow/core:lib",
@@ -638,6 +638,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "allocator_stats",
+    srcs = [
+        "allocator_stats.cc",
+    ],
+    hdrs = ["allocator_stats.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 tf_cc_test(
     name = "stream_test",
     size = "small",
@@ -654,3 +667,8 @@ alias(
     name = "cuda_platform",
     actual = "//tensorflow/stream_executor/cuda:all_runtime",
 )
+
+alias(
+    name = "rocm_platform",
+    actual = "//tensorflow/stream_executor/rocm:all_runtime",
+)
diff --git a/tensorflow/stream_executor/allocator_stats.cc b/tensorflow/stream_executor/allocator_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..440d6f46a3cbd8740c55a239865ce5f89b96b4f3
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/allocator_stats.h"
+#include "absl/strings/str_format.h"
+
+namespace stream_executor {
+
+string AllocatorStats::DebugString() const {
+  return absl::StrFormat(
+      "Limit:        %20lld\n"
+      "InUse:        %20lld\n"
+      "MaxInUse:     %20lld\n"
+      "NumAllocs:    %20lld\n"
+      "MaxAllocSize: %20lld\n",
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
+}
+
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/allocator_stats.h b/tensorflow/stream_executor/allocator_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..786ceb0fdd6fdea829d095923dc774d63a5de625
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+
+#include <string>
+
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Runtime statistics collected by an allocator. Exactly the same as
+// tensorflow::AllocatorStats, but independently defined to preserve the mutual
+// independence of StreamExecutor and TensorFlow.
+struct AllocatorStats {
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
+
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
+
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
+
+  string DebugString() const;
+};
+
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl
index 717c13d113a05c5150768692ef6526cc2ce27817..575ff639e7566b801dffc8e452b26a2cea0895fb 100644
--- a/tensorflow/stream_executor/build_defs.bzl
+++ b/tensorflow/stream_executor/build_defs.bzl
@@ -1,12 +1,20 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "rocm_is_configured")
+
 def stream_executor_friends():
     return ["//tensorflow/..."]
 
 def tf_additional_cuda_platform_deps():
-  return []
+    return []
 
-# Use dynamic loading, therefore should be empty.
 def tf_additional_cuda_driver_deps():
-  return []
+    return [":cuda_stub"]
 
 def tf_additional_cudnn_plugin_deps():
-  return []
+    return []
+
+# Returns whether any GPU backend is configuered.
+def if_gpu_is_configured(x):
+    if cuda_is_configured() or rocm_is_configured():
+        return x
+    return []
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index 77a185104709b2e2396aa4d53495ef61cf634ac4..42d37424b2526f807c66c351291469e5da1db51b 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -45,17 +45,19 @@ cc_library(
     srcs = if_cuda_is_configured(["cuda_platform.cc"]),
     hdrs = if_cuda_is_configured(["cuda_platform.h"]),
     visibility = ["//visibility:public"],
-    deps = if_cuda_is_configured([
-        ":cuda_driver",
-        ":cuda_gpu_executor",
-        ":cuda_platform_id",
-        "//tensorflow/stream_executor",  # buildcleaner: keep
-        "//tensorflow/stream_executor:executor_cache",
-        "//tensorflow/stream_executor:multi_platform_manager",
-        "//tensorflow/stream_executor:stream_executor_pimpl_header",
-        "//tensorflow/stream_executor/lib",
-        "//tensorflow/stream_executor/platform",
-    ] + tf_additional_cuda_platform_deps()),
+    deps = if_cuda_is_configured(
+        [
+            ":cuda_driver",
+            ":cuda_gpu_executor",
+            ":cuda_platform_id",
+            "//tensorflow/stream_executor",  # buildcleaner: keep
+            "//tensorflow/stream_executor:executor_cache",
+            "//tensorflow/stream_executor:multi_platform_manager",
+            "//tensorflow/stream_executor:stream_executor_pimpl_header",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform",
+        ],
+    ) + tf_additional_cuda_platform_deps(),
     alwayslink = True,  # Registers itself with the MultiPlatformManager.
 )
 
@@ -66,49 +68,95 @@ cc_library(
     deps = if_cuda_is_configured([
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
     ]),
 )
 
+cc_library(
+    name = "cuda_stub",
+    srcs = if_cuda_is_configured(["cuda_stub.cc"]),
+    textual_hdrs = ["cuda_10_0.inc"],
+    deps = if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+)
+
 cc_library(
     name = "cuda_driver",
     srcs = if_cuda_is_configured(["cuda_driver.cc"]),
-    hdrs = if_cuda_is_configured([
-        "cuda_driver.h",
-        "cuda_driver_wrapper.h",
-    ]),
+    hdrs = if_cuda_is_configured(["cuda_driver.h"]),
     deps = if_cuda_is_configured([
         ":cuda_diagnostics",
+        "@com_google_absl//absl/debugging:leak_check",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + tf_additional_cuda_driver_deps()),
+    ] + tf_additional_cuda_driver_deps()) + select({
+        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub"],
+        "//conditions:default": ["//tensorflow/core:cuda"],
+    }),
+)
+
+cc_library(
+    name = "cudart_stub",
+    srcs = select({
+        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub.cc"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub.cc"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [
+            ":cuda_stub",
+            "@local_config_cuda//cuda:cuda_headers",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform:dso_loader",
+        ],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": [
+            ":cuda_stub",
+            "@local_config_cuda//cuda:cuda_headers",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform:dso_loader",
+        ],
+        "//conditions:default": [],
+    }),
 )
 
 # The activation library is tightly coupled to the executor library.
 # TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
 cc_library(
     name = "cuda_activation_header",
-    hdrs = if_cuda_is_configured(["cuda_activation.h"]),
+    hdrs = ["cuda_activation.h"],
     visibility = ["//visibility:public"],
-    deps = if_cuda_is_configured(["//tensorflow/stream_executor/platform"]),
+    deps = [
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/platform",
+    ],
 )
 
 cc_library(
     name = "cuda_activation",
-    srcs = if_cuda_is_configured(["cuda_activation.cc"]),
+    srcs = [],
     hdrs = if_cuda_is_configured(["cuda_activation.h"]),
     deps = if_cuda_is_configured([
         ":cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
         "//tensorflow/stream_executor/platform",
     ]),
 )
@@ -120,11 +168,23 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_kernel",
         "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
     ]),
 )
 
+cc_library(
+    name = "cublas_stub",
+    srcs = if_cuda_is_configured(["cublas_stub.cc"]),
+    textual_hdrs = glob(["cublas_*.inc"]),
+    deps = if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+)
+
 cc_library(
     name = "cublas_plugin",
     srcs = if_cuda_is_configured(["cuda_blas.cc"]),
@@ -133,10 +193,10 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_activation",
         ":cuda_gpu_executor",
-        ":cuda_helpers",
         ":cuda_platform_id",
         ":cuda_stream",
         ":cuda_timer",
+        ":cuda_helpers",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "@local_config_cuda//cuda:cuda_headers",
@@ -147,13 +207,27 @@ cc_library(
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
         "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-        "//tensorflow/stream_executor/platform:dso_loader",
-    ] + if_static(["@local_config_cuda//cuda:cublas"])),
+    ] + if_static(
+        ["@local_config_cuda//cuda:cublas"],
+        [":cublas_stub"],
+    )),
     alwayslink = True,
 )
 
+cc_library(
+    name = "cufft_stub",
+    srcs = if_cuda_is_configured(["cufft_stub.cc"]),
+    textual_hdrs = ["cufft_10_0.inc"],
+    deps = if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+)
+
 cc_library(
     name = "cufft_plugin",
     srcs = if_cuda_is_configured(["cuda_fft.cc"]),
@@ -162,30 +236,40 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_activation_header",
         ":cuda_gpu_executor_header",
-        ":cuda_helpers",
         ":cuda_platform_id",
         ":cuda_stream",
+        ":cuda_helpers",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:fft",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + if_static(["@local_config_cuda//cuda:cufft"])),
+    ] + if_static(
+        ["@local_config_cuda//cuda:cufft"],
+        [":cufft_stub"],
+    )),
     alwayslink = True,
 )
 
+cc_library(
+    name = "cudnn_stub",
+    srcs = if_cuda_is_configured(["cudnn_stub.cc"]),
+    textual_hdrs = glob(["cudnn_*.inc"]),
+    deps = if_cuda_is_configured([
+        "@local_config_cuda//cuda:cudnn_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+)
+
 cc_library(
     name = "cudnn_plugin",
     srcs = if_cuda_is_configured(["cuda_dnn.cc"]),
     hdrs = if_cuda_is_configured(["cuda_dnn.h"]),
-    copts = [
-        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
-        # setting of template depth 256
-        "-ftemplate-depth-512",
-    ],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation",
@@ -201,21 +285,32 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:logger",
         "//tensorflow/stream_executor:dnn",
         "//tensorflow/stream_executor:event",
-        "//tensorflow/stream_executor:logging_proto_cc",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
         "//tensorflow/stream_executor:stream_executor_pimpl_header",
         "//tensorflow/stream_executor:temporary_device_memory",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
-        "//tensorflow/stream_executor/platform:dso_loader",
-    ] + tf_additional_cudnn_plugin_deps() + if_static(["@local_config_cuda//cuda:cudnn"])),
+    ]) + tf_additional_cudnn_plugin_deps() + if_cuda_is_configured(if_static(
+        ["@local_config_cuda//cuda:cudnn"],
+        [":cudnn_stub"],
+    )),
     alwayslink = True,
 )
 
+cc_library(
+    name = "curand_stub",
+    srcs = if_cuda_is_configured(["curand_stub.cc"]),
+    textual_hdrs = ["curand_10_0.inc"],
+    deps = if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+)
+
 cc_library(
     name = "curand_plugin",
     srcs = if_cuda_is_configured(["cuda_rng.cc"]),
@@ -223,28 +318,35 @@ cc_library(
     deps = if_cuda_is_configured([
         ":cuda_activation",
         ":cuda_gpu_executor",
-        ":cuda_helpers",
         ":cuda_platform_id",
         ":cuda_stream",
+        ":cuda_helpers",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
-    ] + if_static(["@local_config_cuda//cuda:curand"])),
+    ] + if_static(
+        ["@local_config_cuda//cuda:curand"],
+        [":curand_stub"],
+    )),
     alwayslink = True,
 )
 
 cc_library(
     name = "cuda_kernel",
+    srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
     hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
     deps = if_cuda_is_configured([
         ":cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
     ]),
@@ -254,6 +356,9 @@ cc_library(
 cc_library(
     name = "cuda_helpers",
     textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
+    deps = if_cuda_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+    ]),
 )
 
 cc_library(
@@ -265,19 +370,22 @@ cc_library(
         ":cuda_gpu_executor_header",
         ":cuda_stream",
         "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
         "//tensorflow/stream_executor/lib",
     ]),
 )
 
 cc_library(
     name = "cuda_stream",
-    srcs = if_cuda_is_configured(["cuda_stream.cc"]),
+    srcs = [],
     hdrs = if_cuda_is_configured(["cuda_stream.h"]),
     deps = if_cuda_is_configured([
         ":cuda_driver",
         ":cuda_gpu_executor_header",
         "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
     ]),
@@ -285,18 +393,18 @@ cc_library(
 
 cc_library(
     name = "cuda_timer",
-    srcs = if_cuda_is_configured(["cuda_timer.cc"]),
+    srcs = [],
     hdrs = if_cuda_is_configured(["cuda_timer.h"]),
     deps = if_cuda_is_configured([
         ":cuda_driver",
         ":cuda_gpu_executor_header",
         ":cuda_stream",
         "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
         "//tensorflow/stream_executor/lib",
     ]),
 )
 
-# It implements :cuda_gpu_executor_header
 cc_library(
     name = "cuda_gpu_executor",
     srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@@ -316,6 +424,7 @@ cc_library(
         "//tensorflow/stream_executor:stream_executor_internal",
         "//tensorflow/stream_executor:stream_executor_pimpl_header",
         "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
diff --git a/tensorflow/stream_executor/cuda/cublas_10_0.inc b/tensorflow/stream_executor/cuda/cublas_10_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..854545f4f7718b56bf5661874fecf65897e399e7
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cublas_10_0.inc
@@ -0,0 +1,5200 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, version);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdErr, const char* logFileName) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasLoggerConfigure");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLogCallback);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetLoggerCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(userCallback);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLogCallback *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetLoggerCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(userCallback);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, 
+                                             int incx, void *devicePtr, int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, 
+                                             int incx, void *y, int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, 
+                                             const void *A, int lda, void *B, 
+                                             int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, 
+                                             const void *A, int lda, void *B,
+                                             int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, 
+                                                  const void *hostPtr, int incx, 
+                                                  void *devicePtr, int incy,
+                                                  cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize,
+                                                  const void *devicePtr, int incx,
+                                                  void *hostPtr, int incy,
+                                                  cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize,
+                                                  const void *A, int lda, void *B,
+                                                  int ldb, cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize,
+                                                  const void *A, int lda, void *B,
+                                                  int ldb, cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
+}
+
+void CUBLASWINAPI cublasXerbla (const char *srName, int info) {
+  using FuncPtr = void (CUBLASWINAPI *)(const char *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
+  return func_ptr(srName, info);
+}
+
+cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const void *x, 
+                                                     cudaDataType xType,
+                                                     int incx, 
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle,
+                                                     int n, 
+                                                     const void *x,
+                                                     cudaDataType xType, 
+                                                     int incx, 
+                                                     const void *y, 
+                                                     cudaDataType yType,
+                                                     int incy,
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle,
+                                                     int n, 
+                                                     const void *x,
+                                                     cudaDataType xType, 
+                                                     int incx, 
+                                                     const void *y, 
+                                                     cudaDataType yType,
+                                                     int incy,
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle,
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     const float *y, 
+                                                     int incy,
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle,
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     const double *y,
+                                                     int incy,
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      const cuComplex *y, 
+                                                      int incy,
+                                                      cuComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      const cuComplex *y, 
+                                                      int incy,
+                                                      cuComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      const cuDoubleComplex *y, 
+                                                      int incy,
+                                                      cuDoubleComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *y, 
+                                                      int incy,
+                                                      cuDoubleComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const void *alpha,  /* host or device pointer */
+                                                     cudaDataType alphaType,
+                                                     void *x, 
+                                                     cudaDataType xType,
+                                                     int incx,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, int, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *alpha,  /* host or device pointer */
+                                                     float *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *alpha,  /* host or device pointer */
+                                                     double *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const cuComplex *alpha, /* host or device pointer */
+                                                     cuComplex *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */
+                                                     cuDoubleComplex *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle,
+                                                      int n,
+                                                      const void *alpha, /* host or device pointer */
+                                                      cudaDataType alphaType,
+                                                      const void *x,
+                                                      cudaDataType xType,
+                                                      int incx,
+                                                      void *y,
+                                                      cudaDataType yType,
+                                                      int incy,
+                                                      cudaDataType executiontype) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, const void *, cudaDataType, int, void *, cudaDataType, int, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, executiontype);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      const float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */
+                                                      const double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *alpha, /* host or device pointer */
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     float *x, 
+                                                     int incx, 
+                                                     float *y, 
+                                                     int incy, 
+                                                     const float *c,  /* host or device pointer */
+                                                     const float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     double *x, 
+                                                     int incx, 
+                                                     double *y, 
+                                                     int incy, 
+                                                     const double *c,  /* host or device pointer */
+                                                     const double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuComplex *x, 
+                                                     int incx, 
+                                                     cuComplex *y, 
+                                                     int incy, 
+                                                     const float *c,      /* host or device pointer */
+                                                     const cuComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuComplex *x, 
+                                                     int incx, 
+                                                     cuComplex *y, 
+                                                     int incy, 
+                                                     const float *c,  /* host or device pointer */
+                                                     const float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuDoubleComplex *x, 
+                                                     int incx, 
+                                                     cuDoubleComplex *y, 
+                                                     int incy, 
+                                                     const double *c,            /* host or device pointer */
+                                                     const cuDoubleComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuDoubleComplex *x, 
+                                                     int incx, 
+                                                     cuDoubleComplex *y, 
+                                                     int incy, 
+                                                     const double *c,  /* host or device pointer */
+                                                     const double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, 
+                                                     float *a,   /* host or device pointer */
+                                                     float *b,   /* host or device pointer */
+                                                     float *c,   /* host or device pointer */
+                                                     float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, 
+                                                     double *a,  /* host or device pointer */
+                                                     double *b,  /* host or device pointer */
+                                                     double *c,  /* host or device pointer */
+                                                     double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, 
+                                                     cuComplex *a,  /* host or device pointer */
+                                                     cuComplex *b,  /* host or device pointer */
+                                                     float *c,      /* host or device pointer */
+                                                     cuComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, 
+                                                     cuDoubleComplex *a,  /* host or device pointer */
+                                                     cuDoubleComplex *b,  /* host or device pointer */
+                                                     double *c,           /* host or device pointer */
+                                                     cuDoubleComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     float *x, 
+                                                     int incx, 
+                                                     float *y, 
+                                                     int incy, 
+                                                     const float* param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     double *x, 
+                                                     int incx, 
+                                                     double *y, 
+                                                     int incy, 
+                                                     const double* param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, 
+                                                      float *d1,        /* host or device pointer */
+                                                      float *d2,        /* host or device pointer */
+                                                      float *x1,        /* host or device pointer */
+                                                      const float *y1,  /* host or device pointer */
+                                                      float *param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, const float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, d1, d2, x1, y1, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, 
+                                                      double *d1,        /* host or device pointer */  
+                                                      double *d2,        /* host or device pointer */  
+                                                      double *x1,        /* host or device pointer */  
+                                                      const double *y1,  /* host or device pointer */  
+                                                      double *param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, const double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, d1, d2, x1, y1, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m, 
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      const float *A, 
+                                                      int lda, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      const float *beta,  /* host or device pointer */
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x, 
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A, 
+                                                      int lda, 
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda, 
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda, 
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda, 
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *AP, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *AP, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *AP, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *AP, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *AP, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *AP, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *AP, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *AP, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */ 
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta, /* host or device pointer */ 
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta,   /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta,   /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,   /* host or device pointer */ 
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *x, 
+                                                      int incx,
+                                                      const float *beta,  /* host or device pointer */ 
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,   /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x, 
+                                                      int incx,
+                                                      const double *beta,   /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x, 
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const float *alpha,  /* host or device pointer */                                           
+                                                      const float *AP,
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *AP,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *AP,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *AP,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     const float *y,
+                                                     int incy,
+                                                     float *A,
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, 
+                                                     int m,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */   
+                                                     const double *x,
+                                                     int incx,
+                                                     const double *y,
+                                                     int incy,
+                                                     double *A,
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, 
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, 
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     float *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const double *x,
+                                                     int incx,
+                                                     double *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     float *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const double *x,
+                                                     int incx,
+                                                     double *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *y,
+                                                      int incy,
+                                                      float *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *y,
+                                                      int incy,
+                                                      double *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, int n, 
+                                                      const cuComplex *alpha,  /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx, 
+                                                      const cuComplex *y,
+                                                      int incy, 
+                                                      cuComplex *A, 
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, int n, 
+                                                      const cuComplex *alpha,  /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx, 
+                                                      const cuComplex *y,
+                                                      int incy, 
+                                                      cuComplex *A, 
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *y,
+                                                      int incy,
+                                                      float *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const double *x,
+                                                      int incx, 
+                                                      const double *y,
+                                                      int incy,
+                                                      double *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A, 
+                                                      int lda,
+                                                      const float *B,
+                                                      int ldb, 
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A, 
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb, 
+                                                      const double *beta, /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A, 
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb, 
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3m  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A, 
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb, 
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, 
+                                                     cublasOperation_t transa, cublasOperation_t transb,  
+                                                     int m, int n, int k, 
+                                                     const cuComplex *alpha, 
+                                                     const void *A, 
+                                                     cudaDataType Atype, 
+                                                     int lda, 
+                                                     const void *B, 
+                                                     cudaDataType Btype, 
+                                                     int ldb,
+                                                     const cuComplex *beta, 
+                                                     void *C, 
+                                                     cudaDataType Ctype, 
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A, 
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb, 
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemm3m  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A, 
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb, 
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmEx  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void *B,
+                                                      cudaDataType Btype,
+                                                      int ldb, 
+                                                      const float *beta, /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const void *, cudaDataType, int, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGemmEx  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const void *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void *B,
+                                                      cudaDataType Btype,
+                                                      int ldb, 
+                                                      const void *beta, /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc,
+                                                      cudaDataType computeType,
+                                                      cublasGemmAlgo_t algo) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, const void *, cudaDataType, int, const void *, void *, cudaDataType, int, cudaDataType, cublasGemmAlgo_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, 
+                                                     cublasOperation_t transa, cublasOperation_t transb,  
+                                                     int m, int n, int k, 
+                                                     const cuComplex *alpha, 
+                                                     const void *A, 
+                                                     cudaDataType Atype, 
+                                                     int lda, 
+                                                     const void *B, 
+                                                     cudaDataType Btype, 
+                                                     int ldb,
+                                                     const cuComplex *beta, 
+                                                     void *C, 
+                                                     cudaDataType Ctype, 
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, 
+                                                           cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc,  
+                                                           int m, int n, int k, 
+                                                           const unsigned char *A, int A_bias, int lda, 
+                                                           const unsigned char *B, int B_bias, int ldb,
+                                                                 unsigned char *C, int C_bias, int ldc,
+                                                           int C_mult, int C_shift) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, int, int, int, const unsigned char *, int, int, const unsigned char *, int, int, unsigned char *, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype, 
+                                                      int lda,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      void *C, 
+                                                      cudaDataType Ctype, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      int n, 
+                                                      int k,
+                                                      const cuComplex *alpha, 
+                                                      const void *A, 
+                                                      cudaDataType Atype, 
+                                                      int lda,
+                                                      const cuComplex *beta, 
+                                                      void *C, 
+                                                      cudaDataType Ctype, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherkEx  (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo, 
+                                                       cublasOperation_t trans, 
+                                                       int n, 
+                                                       int k,
+                                                       const float *alpha, 
+                                                       const void *A, cudaDataType Atype, 
+                                                       int lda,
+                                                       const float *beta, 
+                                                       void *C, 
+                                                       cudaDataType Ctype, 
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const float *alpha, /* host or device pointer */  
+                                                       const float *A,
+                                                       int lda,
+                                                       const float *B,
+                                                       int ldb,
+                                                       const float *beta, /* host or device pointer */  
+                                                       float *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const double *alpha, /* host or device pointer */  
+                                                       const double *A,
+                                                       int lda,
+                                                       const double *B,
+                                                       int ldb,
+                                                       const double *beta, /* host or device pointer */  
+                                                       double *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuComplex *alpha, /* host or device pointer */  
+                                                       const cuComplex *A,
+                                                       int lda,
+                                                       const cuComplex *B,
+                                                       int ldb,
+                                                       const cuComplex *beta, /* host or device pointer */  
+                                                       cuComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                       const cuDoubleComplex *A,
+                                                       int lda,
+                                                       const cuDoubleComplex *B,
+                                                       int ldb,
+                                                       const cuDoubleComplex *beta,  /* host or device pointer */  
+                                                       cuDoubleComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuComplex *alpha, /* host or device pointer */  
+                                                       const cuComplex *A,
+                                                       int lda,
+                                                       const cuComplex *B,
+                                                       int ldb,
+                                                       const float *beta,   /* host or device pointer */  
+                                                       cuComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans, 
+                                                       int n,
+                                                       int k,
+                                                       const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                       const cuDoubleComplex *A, 
+                                                       int lda,
+                                                       const cuDoubleComplex *B,
+                                                       int ldb,
+                                                       const double *beta, /* host or device pointer */  
+                                                       cuDoubleComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const float *alpha, /* host or device pointer */ 
+                                                    const float *A,
+                                                    int lda,
+                                                    const float *B,
+                                                    int ldb,
+                                                    const float *beta, /* host or device pointer */ 
+                                                    float *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const double *alpha, /* host or device pointer */ 
+                                                    const double *A,
+                                                    int lda,
+                                                    const double *B,
+                                                    int ldb,
+                                                    const double *beta, /* host or device pointer */ 
+                                                    double *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex *alpha, /* host or device pointer */ 
+                                                    const cuComplex *A,
+                                                    int lda,
+                                                    const cuComplex *B,
+                                                    int ldb,
+                                                    const cuComplex *beta, /* host or device pointer */ 
+                                                    cuComplex *C, 
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo, 
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                    const cuDoubleComplex *A,
+                                                    int lda,
+                                                    const cuDoubleComplex *B,
+                                                    int ldb,
+                                                    const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                    cuDoubleComplex *C, 
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex *alpha, /* host or device pointer */ 
+                                                    const cuComplex *A,
+                                                    int lda,
+                                                    const cuComplex *B,
+                                                    int ldb,
+                                                    const float *beta, /* host or device pointer */ 
+                                                    cuComplex *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                    const cuDoubleComplex *A,
+                                                    int lda,
+                                                    const cuDoubleComplex *B,
+                                                    int ldb,
+                                                    const double *beta, /* host or device pointer */ 
+                                                    cuDoubleComplex *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *B,
+                                                      int ldb,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m, 
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb,
+                                                      const double *beta, /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, 
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      float *B,
+                                                      int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *B,
+                                                      int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     cuComplex *B,
+                                                     int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, 
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *A,                                        
+                                                     int lda,
+                                                     cuDoubleComplex *B,
+                                                     int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda, 
+                                                      const float *B,
+                                                      int ldb,
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb,
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     const cuComplex *B,
+                                                     int ldb,
+                                                     cuComplex *C,
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, 
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *A,
+                                                     int lda,
+                                                     const cuDoubleComplex *B,
+                                                     int ldb,
+                                                     cuDoubleComplex *C,
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const float *alpha,  /* host or device pointer */  
+                                                          const float *const Aarray[], 
+                                                          int lda,
+                                                          const float *const Barray[],
+                                                          int ldb, 
+                                                          const float *beta,   /* host or device pointer */  
+                                                          float *const Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *const [], int, const float *const [], int, const float *, float *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const double *alpha,  /* host or device pointer */ 
+                                                          const double *const Aarray[], 
+                                                          int lda,
+                                                          const double *const Barray[],
+                                                          int ldb, 
+                                                          const double *beta,  /* host or device pointer */ 
+                                                          double *const Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *const [], int, const double *const [], int, const double *, double *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuComplex *alpha, /* host or device pointer */ 
+                                                          const cuComplex *const Aarray[], 
+                                                          int lda,
+                                                          const cuComplex *const Barray[],
+                                                          int ldb, 
+                                                          const cuComplex *beta, /* host or device pointer */ 
+                                                          cuComplex *const Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *const [], int, const cuComplex *const [], int, const cuComplex *, cuComplex *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuComplex *alpha, /* host or device pointer */ 
+                                                          const cuComplex *const Aarray[], 
+                                                          int lda,
+                                                          const cuComplex *const Barray[],
+                                                          int ldb, 
+                                                          const cuComplex *beta, /* host or device pointer */ 
+                                                          cuComplex *const Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *const [], int, const cuComplex *const [], int, const cuComplex *, cuComplex *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                          const cuDoubleComplex *const Aarray[], 
+                                                          int lda,
+                                                          const cuDoubleComplex *const Barray[],
+                                                          int ldb, 
+                                                          const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                          cuDoubleComplex *const Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *const [], int, const cuDoubleComplex *const [], int, const cuDoubleComplex *, cuDoubleComplex *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const void *alpha, /* host or device pointer */  
+                                                      const void *const Aarray[], 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void *const Barray[],
+                                                      cudaDataType Btype,
+                                                      int ldb, 
+                                                      const void *beta, /* host or device pointer */  
+                                                      void *const Carray[],
+                                                      cudaDataType Ctype,
+                                                      int ldc,
+                                                      int batchCount,
+                                                      cudaDataType computeType,
+                                                      cublasGemmAlgo_t algo) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *const [], cudaDataType, int, const void *const [], cudaDataType, int, const void *, void *const [], cudaDataType, int, int, cudaDataType, cublasGemmAlgo_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmBatchedEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda, Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount, computeType, algo);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const void *alpha,  /* host or device pointer */
+                                                                 const void *A,
+                                                                 cudaDataType Atype,
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const void *B,
+                                                                 cudaDataType Btype,
+                                                                 int ldb,
+                                                                 long long int strideB,
+                                                                 const void *beta,   /* host or device pointer */
+                                                                 void *C,
+                                                                 cudaDataType Ctype,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount,
+                                                                 cudaDataType computeType,
+                                                                 cublasGemmAlgo_t algo) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, long long, const void *, cudaDataType, int, long long, const void *, void *, cudaDataType, int, long long, int, cudaDataType, cublasGemmAlgo_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmStridedBatchedEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC, batchCount, computeType, algo);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const float *alpha,  /* host or device pointer */
+                                                                 const float *A,
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const float *B,
+                                                                 int ldb,
+                                                                 long long int strideB,
+                                                                 const float *beta,   /* host or device pointer */
+                                                                 float *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, long long, const float *, int, long long, const float *, float *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const double *alpha,  /* host or device pointer */
+                                                                 const double *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const double *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const double *beta,   /* host or device pointer */
+                                                                 double *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, long long, const double *, int, long long, const double *, double *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuComplex *alpha,  /* host or device pointer */
+                                                                 const cuComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuComplex *beta,   /* host or device pointer */
+                                                                 cuComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuComplex *alpha,  /* host or device pointer */
+                                                                 const cuComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuComplex *beta,   /* host or device pointer */
+                                                                 cuComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuDoubleComplex *alpha,  /* host or device pointer */
+                                                                 const cuDoubleComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuDoubleComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuDoubleComplex *beta,   /* host or device poi */
+                                                                 cuDoubleComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const float *alpha, /* host or device pointer */ 
+                                                  const float *A, 
+                                                  int lda,
+                                                  const float *beta , /* host or device pointer */ 
+                                                  const float *B, 
+                                                  int ldb,
+                                                  float *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const double *alpha, /* host or device pointer */ 
+                                                  const double *A, 
+                                                  int lda,
+                                                  const double *beta, /* host or device pointer */ 
+                                                  const double *B, 
+                                                  int ldb,
+                                                  double *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const cuComplex *alpha, /* host or device pointer */ 
+                                                  const cuComplex *A, 
+                                                  int lda,
+                                                  const cuComplex *beta, /* host or device pointer */  
+                                                  const cuComplex *B, 
+                                                  int ldb,
+                                                  cuComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                  const cuDoubleComplex *A, 
+                                                  int lda,
+                                                  const cuDoubleComplex *beta, /* host or device pointer */  
+                                                  const cuDoubleComplex *B, 
+                                                  int ldb,
+                                                  cuDoubleComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  float *const A[],                /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                          /*Device Pointer*/
+                                                  int *info,                       /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  double *const A[],               /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                          /*Device Pointer*/
+                                                  int *info,                       /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  cuComplex *const A[],           /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                         /*Device Pointer*/
+                                                  int *info,                      /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  cuDoubleComplex *const A[],     /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                         /*Device Pointer*/
+                                                  int *info,                      /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const float *const A[],         /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  float *const C[],               /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *const [], int, const int *, float *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const double *const A[],        /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  double *const C[],              /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *const [], int, const int *, double *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const cuComplex *const A[],     /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  cuComplex *const C[],           /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *const [], int, const int *, cuComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const cuDoubleComplex *const A[], /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                     /*Device pointer*/
+                                                  cuDoubleComplex *const C[],       /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *const [], int, const int *, cuDoubleComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const float *const Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            float *const Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *const [], int, const int *, float *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int n, 
+                                                           int nrhs, 
+                                                           const double *const Aarray[], 
+                                                           int lda, 
+                                                           const int *devIpiv, 
+                                                           double *const Barray[], 
+                                                           int ldb, 
+                                                           int *info,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *const [], int, const int *, double *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const cuComplex *const Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            cuComplex *const Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const [], int, const int *, cuComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const cuDoubleComplex *const Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            cuDoubleComplex *const Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *const [], int, const int *, cuDoubleComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const float *alpha,           /*Host or Device Pointer*/
+                                                          const float *const A[], 
+                                                          int lda,
+                                                          float *const B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *const [], int, float *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const double *alpha,          /*Host or Device Pointer*/
+                                                          const double *const A[], 
+                                                          int lda,
+                                                          double *const B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *const [], int, double *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const cuComplex *alpha,       /*Host or Device Pointer*/
+                                                          const cuComplex *const A[], 
+                                                          int lda,
+                                                          cuComplex *const B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const [], int, cuComplex *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const cuDoubleComplex *alpha, /*Host or Device Pointer*/
+                                                          const cuDoubleComplex *const A[], 
+                                                          int lda,
+                                                          cuDoubleComplex *const B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const float *const A[],      /*Device pointer*/
+                                                          int lda, 
+                                                          float *const Ainv[],         /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *const [], int, float *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const double *const A[],     /*Device pointer*/
+                                                          int lda, 
+                                                          double *const Ainv[],        /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *const [], int, double *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const cuComplex *const A[],  /*Device pointer*/
+                                                          int lda, 
+                                                          cuComplex *const Ainv[],     /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *const [], int, cuComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const cuDoubleComplex *const A[], /*Device pointer*/
+                                                          int lda, 
+                                                          cuDoubleComplex *const Ainv[],    /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                        /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, 
+                                                           int m, 
+                                                           int n,
+                                                           float *const Aarray[],      /*Device pointer*/
+                                                           int lda,
+                                                           float *const TauArray[],    /*Device pointer*/                                                           
+                                                           int *info,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, float *const [], int, float *const [], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasDgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            double *const Aarray[],     /*Device pointer*/
+                                                            int lda, 
+                                                            double *const TauArray[],   /*Device pointer*/                                                            
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, double *const [], int, double *const [], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasCgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            cuComplex *const Aarray[],          /*Device pointer*/
+                                                            int lda, 
+                                                            cuComplex *const TauArray[],        /*Device pointer*/                                                            
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuComplex *const [], int, cuComplex *const [], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasZgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            cuDoubleComplex *const Aarray[],    /*Device pointer*/
+                                                            int lda,
+                                                            cuDoubleComplex *const TauArray[],  /*Device pointer*/
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuDoubleComplex *const [], int, cuDoubleComplex *const [], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasSgelsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int m,  
+                                                           int n,
+                                                           int nrhs,
+                                                           float *const Aarray[],      /*Device pointer*/
+                                                           int lda, 
+                                                           float *const Carray[],      /*Device pointer*/
+                                                           int ldc,
+                                                           int *info, 
+                                                           int *devInfoArray,          /*Device pointer*/
+                                                           int batchSize ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, float *const [], int, float *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasDgelsBatched( cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           int nrhs,
+                                                           double *const Aarray[],     /*Device pointer*/
+                                                           int lda, 
+                                                           double *const Carray[],     /*Device pointer*/
+                                                           int ldc,
+                                                           int *info, 
+                                                           int *devInfoArray,          /*Device pointer*/
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, double *const [], int, double *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasCgelsBatched( cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           int nrhs,
+                                                           cuComplex *const Aarray[],  /*Device pointer*/
+                                                           int lda,
+                                                           cuComplex *const Carray[],  /*Device pointer*/
+                                                           int ldc,
+                                                           int *info,
+                                                           int *devInfoArray,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const [], int, cuComplex *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasZgelsBatched( cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           int nrhs,
+                                                           cuDoubleComplex *const Aarray[],  /*Device pointer*/
+                                                           int lda,
+                                                           cuDoubleComplex *const Carray[],  /*Device pointer*/
+                                                           int ldc,
+                                                           int *info,
+                                                           int *devInfoArray,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const float *A, 
+                                                  int lda,
+                                                  const float *x, 
+                                                  int incx,
+                                                  float *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const double *A, 
+                                                  int lda,
+                                                  const double *x, 
+                                                  int incx,
+                                                  double *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const cuComplex *A, 
+                                                  int lda,
+                                                  const cuComplex *x, 
+                                                  int incx,
+                                                  cuComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const cuDoubleComplex *A, 
+                                                  int lda,
+                                                  const cuDoubleComplex *x, 
+                                                  int incx,
+                                                  cuDoubleComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const float *AP,
+                                                     float *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const double *AP,
+                                                     double *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuComplex *AP,
+                                                     cuComplex *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuDoubleComplex *AP,
+                                                     cuDoubleComplex *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const float *A,
+                                                     int lda,
+                                                     float *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const double *A,
+                                                     int lda,
+                                                     double *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     cuComplex *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuDoubleComplex *A,
+                                                     int lda,
+                                                     cuDoubleComplex *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus CUBLASWINAPI cublasInit (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasShutdown (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasGetError (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(version);
+}
+
+cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, devicePtr);
+}
+
+cublasStatus CUBLASWINAPI cublasFree (void *devicePtr) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devicePtr);
+}
+
+cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream);
+}
+
+float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, 
+                               int incy) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, 
+                               int incy) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy) {
+  using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy) {
+  using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, 
+                               float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, 
+                               int incx, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasSasum (int n, const float *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDasum (int n, const double *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
+  return func_ptr(n, x, incx);
+}
+
+void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, 
+                              float sc, float ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
+  return func_ptr(n, x, incx, y, incy, sc, ss);
+}
+
+void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, 
+                              double sc, double ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
+  return func_ptr(n, x, incx, y, incy, sc, ss);
+}
+
+void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, 
+                              int incy, float c, cuComplex s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, cuComplex);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, 
+                              cuDoubleComplex *y, int incy, double sc, 
+                              cuDoubleComplex cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, cuDoubleComplex);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
+  return func_ptr(n, x, incx, y, incy, sc, cs);
+}
+
+void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
+                               int incy, float c, float s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex *y, int incy, double c, double s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
+  return func_ptr(sa, sb, sc, ss);
+}
+
+void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
+  return func_ptr(sa, sb, sc, ss);
+}
+
+void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
+                               cuComplex *cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
+  return func_ptr(ca, cb, sc, cs);
+}
+
+void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
+                               cuDoubleComplex *cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, double *, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
+  return func_ptr(ca, cb, sc, cs);
+}
+
+void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, 
+                              const float* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
+  return func_ptr(n, x, incx, y, incy, sparam);
+}
+
+void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, 
+                              const double* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
+  return func_ptr(n, x, incx, y, incy, sparam);
+}
+
+void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, 
+                                const float *sy1, float* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
+  return func_ptr(sd1, sd2, sx1, sy1, sparam);
+}
+
+void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, 
+                                const double *sy1, double* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, const double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
+  return func_ptr(sd1, sd2, sx1, sy1, sparam);
+}
+
+void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
+                               const float *A, int lda, const float *x, int incx,
+                               float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
+                               const double *A, int lda, const double *x, int incx,
+                               double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
+                               const cuComplex *A, int lda, const cuComplex *x, int incx,
+                               cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, 
+                               float alpha, const float *A, int lda, 
+                               const float *x, int incx, float beta, float *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, 
+                               double alpha, const double *A, int lda, 
+                               const double *x, int incx, double beta, double *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuComplex alpha, const cuComplex *A, int lda, 
+                               const cuComplex *x, int incx, cuComplex beta, cuComplex *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, 
+                               const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, 
+                               const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, 
+                               const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, 
+                               const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, 
+                              cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, 
+                              float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, 
+                              cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasStbsv(char uplo, char trans, 
+                              char diag, int n, int k, const float *A, 
+                              int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const double *A, 
+                              int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuComplex *A, 
+                              int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuDoubleComplex *A, 
+                              int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
+                               int lda, const float *x, int incx, float beta, 
+                               float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
+                               int lda, const double *x, int incx, double beta, 
+                               double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, const cuComplex *x, int incx, cuComplex beta, 
+                               cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
+                               int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, 
+                               cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, 
+                               const float *A, int lda, const float *x, int incx, 
+                               float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, 
+                               const double *A, int lda, const double *x, int incx, 
+                               double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *x, int incx, 
+                               cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
+                              const float *AP, const float *x,
+                              int incx, float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
+                              const double *AP, const double *x,
+                              int incx, double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
+                              const cuComplex *AP, const cuComplex *x,
+                              int incx, cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
+                              const cuDoubleComplex *AP, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
+                              const float *y, int incy, float *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, float, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
+                              const double *y, int incy, double *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, double, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, 
+                              const cuComplex *x, int incx, cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, 
+                              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
+                              int incx, cuComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, 
+                               int incx, const double *y, int incy, double *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, const cuComplex *y, int incy, cuComplex *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
+                               const double *x, int incx, const double *y,
+                               int incy, double *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
+                               const cuComplex *x, int incx, const cuComplex *y,
+                               int incy, cuComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
+                               int incy, cuDoubleComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, 
+                               float alpha, const float *A, int lda, 
+                               const float *B, int ldb, float beta, float *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
+                               double alpha, const double *A, int lda, 
+                               const double *B, int ldb, double beta, double *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, 
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
+                               int k, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C,
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, 
+                               const float *A, int lda, float beta, float *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
+                               double alpha, const double *A, int lda,
+                               double beta, double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               cuComplex beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
+                               float alpha, const cuComplex *A, int lda,
+                               float beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
+                               double alpha,
+                               const cuDoubleComplex *A, int lda,
+                               double beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, 
+                                const float *A, int lda, const float *B, int ldb, 
+                                float beta, float *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
+                                double alpha, const double *A, int lda,
+                                const double *B, int ldb, double beta,
+                                double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, cuComplex beta,
+                                cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                                cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, float beta,
+                                cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, double beta,
+                                cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, 
+                               const float *A, int lda, const float *B, int ldb,
+                               float beta, float *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, 
+                               const double *A, int lda, const double *B, int ldb,
+                               double beta, double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *B, int ldb,
+                               cuComplex beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cublas_9_0.inc b/tensorflow/stream_executor/cuda/cublas_9_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ba46426878fcac7788cda188952fe1e82e509eb9
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cublas_9_0.inc
@@ -0,0 +1,5124 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDestroy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, version);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetStream_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetStream_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetPointerMode_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetPointerMode_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetAtomicsMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetAtomicsMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMathMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMathMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, 
+                                             int incx, void *devicePtr, int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVector");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, x, incx, devicePtr, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, 
+                                             int incx, void *y, int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVector");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, 
+                                             const void *A, int lda, void *B, 
+                                             int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrix");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, 
+                                             const void *A, int lda, void *B,
+                                             int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrix");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, 
+                                                  const void *hostPtr, int incx, 
+                                                  void *devicePtr, int incy,
+                                                  cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetVectorAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize,
+                                                  const void *devicePtr, int incx,
+                                                  void *hostPtr, int incy,
+                                                  cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVectorAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize,
+                                                  const void *A, int lda, void *B,
+                                                  int ldb, cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetMatrixAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize,
+                                                  const void *A, int lda, void *B,
+                                                  int ldb, cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetMatrixAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream);
+}
+
+void CUBLASWINAPI cublasXerbla (const char *srName, int info) {
+  using FuncPtr = void (CUBLASWINAPI *)(const char *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasXerbla");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla");
+  return func_ptr(srName, info);
+}
+
+cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const void *x, 
+                                                     cudaDataType xType,
+                                                     int incx, 
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasNrm2Ex");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle,
+                                                     int n, 
+                                                     const void *x,
+                                                     cudaDataType xType, 
+                                                     int incx, 
+                                                     const void *y, 
+                                                     cudaDataType yType,
+                                                     int incy,
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle,
+                                                     int n, 
+                                                     const void *x,
+                                                     cudaDataType xType, 
+                                                     int incx, 
+                                                     const void *y, 
+                                                     cudaDataType yType,
+                                                     int incy,
+                                                     void *result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDotcEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle,
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     const float *y, 
+                                                     int incy,
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle,
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     const double *y,
+                                                     int incy,
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      const cuComplex *y, 
+                                                      int incy,
+                                                      cuComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      const cuComplex *y, 
+                                                      int incy,
+                                                      cuComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      const cuDoubleComplex *y, 
+                                                      int incy,
+                                                      cuDoubleComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *y, 
+                                                      int incy,
+                                                      cuDoubleComplex *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const void *alpha,  /* host or device pointer */
+                                                     cudaDataType alphaType,
+                                                     void *x, 
+                                                     cudaDataType xType,
+                                                     int incx,
+                                                     cudaDataType executionType) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, int, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScalEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *alpha,  /* host or device pointer */
+                                                     float *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *alpha,  /* host or device pointer */
+                                                     double *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const cuComplex *alpha, /* host or device pointer */
+                                                     cuComplex *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */
+                                                     cuDoubleComplex *x, 
+                                                     int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle,
+                                                      int n,
+                                                      const void *alpha, /* host or device pointer */
+                                                      cudaDataType alphaType,
+                                                      const void *x,
+                                                      cudaDataType xType,
+                                                      int incx,
+                                                      void *y,
+                                                      cudaDataType yType,
+                                                      int incy,
+                                                      cudaDataType executiontype) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, const void *, cudaDataType, int, void *, cudaDataType, int, cudaDataType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAxpyEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, executiontype);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      const float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */
+                                                      const double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *alpha, /* host or device pointer */
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, alpha, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      float *x, 
+                                                      int incx, 
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      double *x, 
+                                                      int incx, 
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      cuComplex *x, 
+                                                      int incx, 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle,
+                                                      int n, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const double *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      int *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const float *x, 
+                                                     int incx, 
+                                                     float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     const double *x, 
+                                                     int incx, 
+                                                     double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuComplex *x, 
+                                                      int incx, 
+                                                      float *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, 
+                                                      int n, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx, 
+                                                      double *result) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, result);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     float *x, 
+                                                     int incx, 
+                                                     float *y, 
+                                                     int incy, 
+                                                     const float *c,  /* host or device pointer */
+                                                     const float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     double *x, 
+                                                     int incx, 
+                                                     double *y, 
+                                                     int incy, 
+                                                     const double *c,  /* host or device pointer */
+                                                     const double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuComplex *x, 
+                                                     int incx, 
+                                                     cuComplex *y, 
+                                                     int incy, 
+                                                     const float *c,      /* host or device pointer */
+                                                     const cuComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuComplex *x, 
+                                                     int incx, 
+                                                     cuComplex *y, 
+                                                     int incy, 
+                                                     const float *c,  /* host or device pointer */
+                                                     const float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuDoubleComplex *x, 
+                                                     int incx, 
+                                                     cuDoubleComplex *y, 
+                                                     int incy, 
+                                                     const double *c,            /* host or device pointer */
+                                                     const cuDoubleComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     cuDoubleComplex *x, 
+                                                     int incx, 
+                                                     cuDoubleComplex *y, 
+                                                     int incy, 
+                                                     const double *c,  /* host or device pointer */
+                                                     const double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, 
+                                                     float *a,   /* host or device pointer */
+                                                     float *b,   /* host or device pointer */
+                                                     float *c,   /* host or device pointer */
+                                                     float *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, 
+                                                     double *a,  /* host or device pointer */
+                                                     double *b,  /* host or device pointer */
+                                                     double *c,  /* host or device pointer */
+                                                     double *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, 
+                                                     cuComplex *a,  /* host or device pointer */
+                                                     cuComplex *b,  /* host or device pointer */
+                                                     float *c,      /* host or device pointer */
+                                                     cuComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, 
+                                                     cuDoubleComplex *a,  /* host or device pointer */
+                                                     cuDoubleComplex *b,  /* host or device pointer */
+                                                     double *c,           /* host or device pointer */
+                                                     cuDoubleComplex *s) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, a, b, c, s);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     float *x, 
+                                                     int incx, 
+                                                     float *y, 
+                                                     int incy, 
+                                                     const float* param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, 
+                                                     int n, 
+                                                     double *x, 
+                                                     int incx, 
+                                                     double *y, 
+                                                     int incy, 
+                                                     const double* param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, x, incx, y, incy, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, 
+                                                      float *d1,        /* host or device pointer */
+                                                      float *d2,        /* host or device pointer */
+                                                      float *x1,        /* host or device pointer */
+                                                      const float *y1,  /* host or device pointer */
+                                                      float *param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, const float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, d1, d2, x1, y1, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, 
+                                                      double *d1,        /* host or device pointer */  
+                                                      double *d2,        /* host or device pointer */  
+                                                      double *x1,        /* host or device pointer */  
+                                                      const double *y1,  /* host or device pointer */  
+                                                      double *param) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, const double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, d1, d2, x1, y1, param);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m, 
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */
+                                                      const float *A, 
+                                                      int lda, 
+                                                      const float *x, 
+                                                      int incx, 
+                                                      const float *beta,  /* host or device pointer */
+                                                      float *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */
+                                                      double *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x, 
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda, 
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A, 
+                                                      int lda, 
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda, 
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda, 
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle,
+                                                      cublasOperation_t trans, 
+                                                      int m,
+                                                      int n,
+                                                      int kl,
+                                                      int ku, 
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda, 
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *AP, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *AP, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *AP, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *AP, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const float *AP, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const double *AP, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuComplex *AP, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      const cuDoubleComplex *AP, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, AP, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const float *A, 
+                                                      int lda, 
+                                                      float *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuComplex *A, 
+                                                      int lda, 
+                                                      cuComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      cublasDiagType_t diag, 
+                                                      int n, 
+                                                      int k, 
+                                                      const cuDoubleComplex *A, 
+                                                      int lda, 
+                                                      cuDoubleComplex *x, 
+                                                      int incx) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */ 
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta, /* host or device pointer */ 
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta, /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta,   /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */ 
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta,   /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,   /* host or device pointer */ 
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *x, 
+                                                      int incx,
+                                                      const float *beta,  /* host or device pointer */ 
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,   /* host or device pointer */ 
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *x, 
+                                                      int incx,
+                                                      const double *beta,   /* host or device pointer */ 
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */ 
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *x, 
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */ 
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *x, 
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                      cuDoubleComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const float *alpha,  /* host or device pointer */                                           
+                                                      const float *AP,
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      float *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *AP,
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      double *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, 
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *AP,
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *y,
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *AP,
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *y, 
+                                                      int incy) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     const float *y,
+                                                     int incy,
+                                                     float *A,
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, 
+                                                     int m,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */   
+                                                     const double *x,
+                                                     int incx,
+                                                     const double *y,
+                                                     int incy,
+                                                     double *A,
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, 
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, 
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     float *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const double *x,
+                                                     int incx,
+                                                     double *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *A, 
+                                                     int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const float *x,
+                                                     int incx,
+                                                     float *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const double *x,
+                                                     int incx,
+                                                     double *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float *alpha, /* host or device pointer */  
+                                                     const cuComplex *x,
+                                                     int incx,
+                                                     cuComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *x,
+                                                     int incx,
+                                                     cuDoubleComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *y,
+                                                      int incy,
+                                                      float *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *x,
+                                                      int incx,
+                                                      const double *y,
+                                                      int incy,
+                                                      double *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, int n, 
+                                                      const cuComplex *alpha,  /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx, 
+                                                      const cuComplex *y,
+                                                      int incy, 
+                                                      cuComplex *A, 
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, int n, 
+                                                      const cuComplex *alpha,  /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx, 
+                                                      const cuComplex *y,
+                                                      int incy, 
+                                                      cuComplex *A, 
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n, 
+                                                      const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *A,
+                                                      int lda) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const float *x,
+                                                      int incx,
+                                                      const float *y,
+                                                      int incy,
+                                                      float *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const double *x,
+                                                      int incx, 
+                                                      const double *y,
+                                                      int incy,
+                                                      double *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *x,
+                                                      int incx,
+                                                      const cuComplex *y,
+                                                      int incy,
+                                                      cuComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *x,
+                                                      int incx,
+                                                      const cuDoubleComplex *y,
+                                                      int incy,
+                                                      cuDoubleComplex *AP) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A, 
+                                                      int lda,
+                                                      const float *B,
+                                                      int ldb, 
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A, 
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb, 
+                                                      const double *beta, /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A, 
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb, 
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3m  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A, 
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb, 
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3m");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, 
+                                                     cublasOperation_t transa, cublasOperation_t transb,  
+                                                     int m, int n, int k, 
+                                                     const cuComplex *alpha, 
+                                                     const void *A, 
+                                                     cudaDataType Atype, 
+                                                     int lda, 
+                                                     const void *B, 
+                                                     cudaDataType Btype, 
+                                                     int ldb,
+                                                     const cuComplex *beta, 
+                                                     void *C, 
+                                                     cudaDataType Ctype, 
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A, 
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb, 
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemm3m  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A, 
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb, 
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm3m");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmEx  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void *B,
+                                                      cudaDataType Btype,
+                                                      int ldb, 
+                                                      const float *beta, /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const void *, cudaDataType, int, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasGemmEx  (cublasHandle_t handle, 
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb, 
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const void *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void *B,
+                                                      cudaDataType Btype,
+                                                      int ldb, 
+                                                      const void *beta, /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc,
+                                                      cudaDataType computeType,
+                                                      cublasGemmAlgo_t algo) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, const void *, cudaDataType, int, const void *, void *, cudaDataType, int, cudaDataType, cublasGemmAlgo_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, 
+                                                     cublasOperation_t transa, cublasOperation_t transb,  
+                                                     int m, int n, int k, 
+                                                     const cuComplex *alpha, 
+                                                     const void *A, 
+                                                     cudaDataType Atype, 
+                                                     int lda, 
+                                                     const void *B, 
+                                                     cudaDataType Btype, 
+                                                     int ldb,
+                                                     const cuComplex *beta, 
+                                                     void *C, 
+                                                     cudaDataType Ctype, 
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, 
+                                                           cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc,  
+                                                           int m, int n, int k, 
+                                                           const unsigned char *A, int A_bias, int lda, 
+                                                           const unsigned char *B, int B_bias, int ldb,
+                                                                 unsigned char *C, int C_bias, int ldc,
+                                                           int C_mult, int C_shift) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, int, int, int, const unsigned char *, int, int, const unsigned char *, int, int, unsigned char *, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasUint8gemmBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, B_bias, ldb, C, C_bias, ldc, C_mult, C_shift);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype, 
+                                                      int lda,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      void *C, 
+                                                      cudaDataType Ctype, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle,
+                                                      cublasFillMode_t uplo, 
+                                                      cublasOperation_t trans, 
+                                                      int n, 
+                                                      int k,
+                                                      const cuComplex *alpha, 
+                                                      const void *A, 
+                                                      cudaDataType Atype, 
+                                                      int lda,
+                                                      const cuComplex *beta, 
+                                                      void *C, 
+                                                      cudaDataType Ctype, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const double *alpha,  /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const double *beta,  /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherkEx  (cublasHandle_t handle,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n,
+                                                      int k,
+                                                      const float *alpha,  /* host or device pointer */  
+                                                      const void *A, 
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const float *beta,   /* host or device pointer */  
+                                                      void *C,
+                                                      cudaDataType Ctype,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo, 
+                                                       cublasOperation_t trans, 
+                                                       int n, 
+                                                       int k,
+                                                       const float *alpha, 
+                                                       const void *A, cudaDataType Atype, 
+                                                       int lda,
+                                                       const float *beta, 
+                                                       void *C, 
+                                                       cudaDataType Ctype, 
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk3mEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const float *alpha, /* host or device pointer */  
+                                                       const float *A,
+                                                       int lda,
+                                                       const float *B,
+                                                       int ldb,
+                                                       const float *beta, /* host or device pointer */  
+                                                       float *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const double *alpha, /* host or device pointer */  
+                                                       const double *A,
+                                                       int lda,
+                                                       const double *B,
+                                                       int ldb,
+                                                       const double *beta, /* host or device pointer */  
+                                                       double *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuComplex *alpha, /* host or device pointer */  
+                                                       const cuComplex *A,
+                                                       int lda,
+                                                       const cuComplex *B,
+                                                       int ldb,
+                                                       const cuComplex *beta, /* host or device pointer */  
+                                                       cuComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuDoubleComplex *alpha,  /* host or device pointer */  
+                                                       const cuDoubleComplex *A,
+                                                       int lda,
+                                                       const cuDoubleComplex *B,
+                                                       int ldb,
+                                                       const cuDoubleComplex *beta,  /* host or device pointer */  
+                                                       cuDoubleComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int n,
+                                                       int k,
+                                                       const cuComplex *alpha, /* host or device pointer */  
+                                                       const cuComplex *A,
+                                                       int lda,
+                                                       const cuComplex *B,
+                                                       int ldb,
+                                                       const float *beta,   /* host or device pointer */  
+                                                       cuComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans, 
+                                                       int n,
+                                                       int k,
+                                                       const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                       const cuDoubleComplex *A, 
+                                                       int lda,
+                                                       const cuDoubleComplex *B,
+                                                       int ldb,
+                                                       const double *beta, /* host or device pointer */  
+                                                       cuDoubleComplex *C,
+                                                       int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const float *alpha, /* host or device pointer */ 
+                                                    const float *A,
+                                                    int lda,
+                                                    const float *B,
+                                                    int ldb,
+                                                    const float *beta, /* host or device pointer */ 
+                                                    float *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const double *alpha, /* host or device pointer */ 
+                                                    const double *A,
+                                                    int lda,
+                                                    const double *B,
+                                                    int ldb,
+                                                    const double *beta, /* host or device pointer */ 
+                                                    double *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex *alpha, /* host or device pointer */ 
+                                                    const cuComplex *A,
+                                                    int lda,
+                                                    const cuComplex *B,
+                                                    int ldb,
+                                                    const cuComplex *beta, /* host or device pointer */ 
+                                                    cuComplex *C, 
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo, 
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                    const cuDoubleComplex *A,
+                                                    int lda,
+                                                    const cuDoubleComplex *B,
+                                                    int ldb,
+                                                    const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                    cuDoubleComplex *C, 
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex *alpha, /* host or device pointer */ 
+                                                    const cuComplex *A,
+                                                    int lda,
+                                                    const cuComplex *B,
+                                                    int ldb,
+                                                    const float *beta, /* host or device pointer */ 
+                                                    cuComplex *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                    const cuDoubleComplex *A,
+                                                    int lda,
+                                                    const cuDoubleComplex *B,
+                                                    int ldb,
+                                                    const double *beta, /* host or device pointer */ 
+                                                    cuDoubleComplex *C,
+                                                    int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherkx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      const float *B,
+                                                      int ldb,
+                                                      const float *beta, /* host or device pointer */  
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m, 
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb,
+                                                      const double *beta, /* host or device pointer */  
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuComplex *alpha, /* host or device pointer */  
+                                                      const cuComplex *A,
+                                                      int lda,
+                                                      const cuComplex *B,
+                                                      int ldb,
+                                                      const cuComplex *beta, /* host or device pointer */  
+                                                      cuComplex *C, 
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      int m,
+                                                      int n,
+                                                      const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                      const cuDoubleComplex *A,
+                                                      int lda,
+                                                      const cuDoubleComplex *B,
+                                                      int ldb,
+                                                      const cuDoubleComplex *beta, /* host or device pointer */  
+                                                      cuDoubleComplex *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, 
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda,
+                                                      float *B,
+                                                      int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A, 
+                                                      int lda, 
+                                                      double *B,
+                                                      int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     cuComplex *B,
+                                                     int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, 
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *A,                                        
+                                                     int lda,
+                                                     cuDoubleComplex *B,
+                                                     int ldb) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const float *alpha, /* host or device pointer */  
+                                                      const float *A,
+                                                      int lda, 
+                                                      const float *B,
+                                                      int ldb,
+                                                      float *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle,
+                                                      cublasSideMode_t side,
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      cublasDiagType_t diag,
+                                                      int m,
+                                                      int n,
+                                                      const double *alpha, /* host or device pointer */  
+                                                      const double *A,
+                                                      int lda,
+                                                      const double *B,
+                                                      int ldb,
+                                                      double *C,
+                                                      int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle,
+                                                     cublasSideMode_t side,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex *alpha, /* host or device pointer */  
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     const cuComplex *B,
+                                                     int ldb,
+                                                     cuComplex *C,
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, 
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex *alpha, /* host or device pointer */  
+                                                     const cuDoubleComplex *A,
+                                                     int lda,
+                                                     const cuDoubleComplex *B,
+                                                     int ldb,
+                                                     cuDoubleComplex *C,
+                                                     int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const float *alpha,  /* host or device pointer */  
+                                                          const float *Aarray[], 
+                                                          int lda,
+                                                          const float *Barray[],
+                                                          int ldb, 
+                                                          const float *beta,   /* host or device pointer */  
+                                                          float *Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *[], int, const float *[], int, const float *, float *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const double *alpha,  /* host or device pointer */ 
+                                                          const double *Aarray[], 
+                                                          int lda,
+                                                          const double *Barray[],
+                                                          int ldb, 
+                                                          const double *beta,  /* host or device pointer */ 
+                                                          double *Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *[], int, const double *[], int, const double *, double *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuComplex *alpha, /* host or device pointer */ 
+                                                          const cuComplex *Aarray[], 
+                                                          int lda,
+                                                          const cuComplex *Barray[],
+                                                          int ldb, 
+                                                          const cuComplex *beta, /* host or device pointer */ 
+                                                          cuComplex *Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *[], int, const cuComplex *[], int, const cuComplex *, cuComplex *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuComplex *alpha, /* host or device pointer */ 
+                                                          const cuComplex *Aarray[], 
+                                                          int lda,
+                                                          const cuComplex *Barray[],
+                                                          int ldb, 
+                                                          const cuComplex *beta, /* host or device pointer */ 
+                                                          cuComplex *Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *[], int, const cuComplex *[], int, const cuComplex *, cuComplex *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb, 
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                          const cuDoubleComplex *Aarray[], 
+                                                          int lda,
+                                                          const cuDoubleComplex *Barray[],
+                                                          int ldb, 
+                                                          const cuDoubleComplex *beta, /* host or device pointer */ 
+                                                          cuDoubleComplex *Carray[],
+                                                          int ldc,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *[], int, const cuDoubleComplex *[], int, const cuDoubleComplex *, cuDoubleComplex *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const float *alpha,  /* host or device pointer */
+                                                                 const float *A,
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const float *B,
+                                                                 int ldb,
+                                                                 long long int strideB,
+                                                                 const float *beta,   /* host or device pointer */
+                                                                 float *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, long long, const float *, int, long long, const float *, float *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const double *alpha,  /* host or device pointer */
+                                                                 const double *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const double *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const double *beta,   /* host or device pointer */
+                                                                 double *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, long long, const double *, int, long long, const double *, double *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuComplex *alpha,  /* host or device pointer */
+                                                                 const cuComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuComplex *beta,   /* host or device pointer */
+                                                                 cuComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuComplex *alpha,  /* host or device pointer */
+                                                                 const cuComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuComplex *beta,   /* host or device pointer */
+                                                                 cuComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm3mStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle,
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb, 
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const cuDoubleComplex *alpha,  /* host or device pointer */
+                                                                 const cuDoubleComplex *A, 
+                                                                 int lda,
+                                                                 long long int strideA,   /* purposely signed */
+                                                                 const cuDoubleComplex *B,
+                                                                 int ldb, 
+                                                                 long long int strideB,
+                                                                 const cuDoubleComplex *beta,   /* host or device poi */
+                                                                 cuDoubleComplex *C,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemmStridedBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const float *alpha, /* host or device pointer */ 
+                                                  const float *A, 
+                                                  int lda,
+                                                  const float *beta , /* host or device pointer */ 
+                                                  const float *B, 
+                                                  int ldb,
+                                                  float *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const double *alpha, /* host or device pointer */ 
+                                                  const double *A, 
+                                                  int lda,
+                                                  const double *beta, /* host or device pointer */ 
+                                                  const double *B, 
+                                                  int ldb,
+                                                  double *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const cuComplex *alpha, /* host or device pointer */ 
+                                                  const cuComplex *A, 
+                                                  int lda,
+                                                  const cuComplex *beta, /* host or device pointer */  
+                                                  const cuComplex *B, 
+                                                  int ldb,
+                                                  cuComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle,
+                                                  cublasOperation_t transa, 
+                                                  cublasOperation_t transb,
+                                                  int m, 
+                                                  int n,
+                                                  const cuDoubleComplex *alpha, /* host or device pointer */ 
+                                                  const cuDoubleComplex *A, 
+                                                  int lda,
+                                                  const cuDoubleComplex *beta, /* host or device pointer */  
+                                                  const cuDoubleComplex *B, 
+                                                  int ldb,
+                                                  cuDoubleComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeam");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  float *A[],                      /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                          /*Device Pointer*/
+                                                  int *info,                       /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  double *A[],                     /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                          /*Device Pointer*/
+                                                  int *info,                       /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  cuComplex *A[],                 /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                         /*Device Pointer*/
+                                                  int *info,                      /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle,
+                                                  int n, 
+                                                  cuDoubleComplex *A[],           /*Device pointer*/
+                                                  int lda, 
+                                                  int *P,                         /*Device Pointer*/
+                                                  int *info,                      /*Device Pointer*/
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const float *A[],               /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  float *C[],                     /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *[], int, const int *, float *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const double *A[],              /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  double *C[],                    /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *[], int, const int *, double *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const cuComplex *A[],            /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  cuComplex *C[],                 /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *[], int, const int *, cuComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
+                                                  int n,
+                                                  const cuDoubleComplex *A[],     /*Device pointer*/
+                                                  int lda,
+                                                  const int *P,                   /*Device pointer*/
+                                                  cuDoubleComplex *C[],           /*Device pointer*/
+                                                  int ldc,
+                                                  int *info,
+                                                  int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *[], int, const int *, cuDoubleComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetriBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const float *Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            float *Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *[], int, const int *, float *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int n, 
+                                                           int nrhs, 
+                                                           const double *Aarray[], 
+                                                           int lda, 
+                                                           const int *devIpiv, 
+                                                           double *Barray[], 
+                                                           int ldb, 
+                                                           int *info,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *[], int, const int *, double *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const cuComplex *Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            cuComplex *Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *[], int, const int *, cuComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t  CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle, 
+                                                            cublasOperation_t trans, 
+                                                            int n, 
+                                                            int nrhs, 
+                                                            const cuDoubleComplex *Aarray[], 
+                                                            int lda, 
+                                                            const int *devIpiv, 
+                                                            cuDoubleComplex *Barray[], 
+                                                            int ldb, 
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *[], int, const int *, cuDoubleComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgetrsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const float *alpha,           /*Host or Device Pointer*/
+                                                          const float *A[], 
+                                                          int lda,
+                                                          float *B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *[], int, float *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const double *alpha,          /*Host or Device Pointer*/
+                                                          const double *A[], 
+                                                          int lda,
+                                                          double *B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *[], int, double *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const cuComplex *alpha,       /*Host or Device Pointer*/
+                                                          const cuComplex *A[], 
+                                                          int lda,
+                                                          cuComplex *B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *[], int, cuComplex *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t    handle, 
+                                                          cublasSideMode_t  side, 
+                                                          cublasFillMode_t  uplo,
+                                                          cublasOperation_t trans, 
+                                                          cublasDiagType_t  diag,
+                                                          int m, 
+                                                          int n, 
+                                                          const cuDoubleComplex *alpha, /*Host or Device Pointer*/
+                                                          const cuDoubleComplex *A[], 
+                                                          int lda,
+                                                          cuDoubleComplex *B[], 
+                                                          int ldb,
+                                                          int batchCount) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *[], int, cuDoubleComplex *[], int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsmBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const float *A[],                  /*Device pointer*/
+                                                          int lda, 
+                                                          float *Ainv[],               /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *[], int, float *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const double *A[],                 /*Device pointer*/
+                                                          int lda, 
+                                                          double *Ainv[],              /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *[], int, double *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const cuComplex *A[],              /*Device pointer*/
+                                                          int lda, 
+                                                          cuComplex *Ainv[],           /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *[], int, cuComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
+                                                          int n, 
+                                                          const cuDoubleComplex *A[],        /*Device pointer*/
+                                                          int lda, 
+                                                          cuDoubleComplex *Ainv[],     /*Device pointer*/
+                                                          int lda_inv, 
+                                                          int *info,                   /*Device Pointer*/
+                                                          int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *[], int, cuDoubleComplex *[], int, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZmatinvBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, 
+                                                           int m, 
+                                                           int n,
+                                                           float *Aarray[],           /*Device pointer*/
+                                                           int lda, 
+                                                           float *TauArray[],        /* Device pointer*/                                                           
+                                                           int *info,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, float *[], int, float *[], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasDgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            double *Aarray[],           /*Device pointer*/
+                                                            int lda, 
+                                                            double *TauArray[],        /* Device pointer*/                                                            
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, double *[], int, double *[], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasCgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            cuComplex *Aarray[],           /*Device pointer*/
+                                                            int lda, 
+                                                            cuComplex *TauArray[],        /* Device pointer*/                                                            
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuComplex *[], int, cuComplex *[], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasZgeqrfBatched( cublasHandle_t handle, 
+                                                            int m, 
+                                                            int n,
+                                                            cuDoubleComplex *Aarray[],           /*Device pointer*/
+                                                            int lda, 
+                                                            cuDoubleComplex *TauArray[],        /* Device pointer*/                                                          
+                                                            int *info,
+                                                            int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuDoubleComplex *[], int, cuDoubleComplex *[], int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeqrfBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasSgelsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int m,  
+                                                           int n,
+                                                           int nrhs,
+                                                           float *Aarray[], /*Device pointer*/
+                                                           int lda, 
+                                                           float *Carray[], /* Device pointer*/
+                                                           int ldc,                                                                 
+                                                           int *info, 
+                                                           int *devInfoArray, /* Device pointer*/
+                                                           int batchSize ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, float *[], int, float *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasDgelsBatched( cublasHandle_t handle,
+                                                           cublasOperation_t trans,  
+                                                           int m,  
+                                                           int n,
+                                                           int nrhs,
+                                                           double *Aarray[], /*Device pointer*/
+                                                           int lda, 
+                                                           double *Carray[], /* Device pointer*/
+                                                           int ldc,                                                                 
+                                                           int *info, 
+                                                           int *devInfoArray, /* Device pointer*/
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, double *[], int, double *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasCgelsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int m,  
+                                                           int n,
+                                                           int nrhs,
+                                                           cuComplex *Aarray[], /*Device pointer*/
+                                                           int lda, 
+                                                           cuComplex *Carray[], /* Device pointer*/
+                                                           int ldc,                                                                 
+                                                           int *info, 
+                                                           int *devInfoArray,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *[], int, cuComplex *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI  cublasZgelsBatched( cublasHandle_t handle, 
+                                                           cublasOperation_t trans, 
+                                                           int m,  
+                                                           int n,
+                                                           int nrhs,
+                                                           cuDoubleComplex *Aarray[], /*Device pointer*/
+                                                           int lda, 
+                                                           cuDoubleComplex *Carray[], /* Device pointer*/
+                                                           int ldc,                                                                 
+                                                           int *info, 
+                                                           int *devInfoArray,
+                                                           int batchSize) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuDoubleComplex *[], int, cuDoubleComplex *[], int, int *, int *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgelsBatched");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+}
+
+cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const float *A, 
+                                                  int lda,
+                                                  const float *x, 
+                                                  int incx,
+                                                  float *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const double *A, 
+                                                  int lda,
+                                                  const double *x, 
+                                                  int incx,
+                                                  double *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const cuComplex *A, 
+                                                  int lda,
+                                                  const cuComplex *x, 
+                                                  int incx,
+                                                  cuComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode, 
+                                                  int m, 
+                                                  int n,
+                                                  const cuDoubleComplex *A, 
+                                                  int lda,
+                                                  const cuDoubleComplex *x, 
+                                                  int incx,
+                                                  cuDoubleComplex *C, 
+                                                  int ldc) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdgmm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const float *AP,
+                                                     float *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const double *AP,
+                                                     double *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuComplex *AP,
+                                                     cuComplex *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuDoubleComplex *AP,
+                                                     cuDoubleComplex *A,  
+                                                     int lda ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpttr");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, AP, A, lda);
+}
+
+cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const float *A,
+                                                     int lda,
+                                                     float *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const double *A,
+                                                     int lda,
+                                                     double *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuComplex *A,
+                                                     int lda,
+                                                     cuComplex *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, 
+                                                     cublasFillMode_t uplo, 
+                                                     int n,                                     
+                                                     const cuDoubleComplex *A,
+                                                     int lda,
+                                                     cuDoubleComplex *AP ) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrttp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, uplo, n, A, lda, AP);
+}
+
+cublasStatus CUBLASWINAPI cublasInit (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasInit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasShutdown (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasShutdown");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasGetError (void) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+cublasStatus CUBLASWINAPI cublasGetVersion(int *version) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(version);
+}
+
+cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasAlloc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(n, elemSize, devicePtr);
+}
+
+cublasStatus CUBLASWINAPI cublasFree (void *devicePtr) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasFree");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devicePtr);
+}
+
+cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream) {
+  using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSetKernelStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream);
+}
+
+float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScnrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDznrm2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, 
+                               int incy) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSdot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSdot");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, 
+                               int incy) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDdot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDdot");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy) {
+  using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotu");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy) {
+  using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCdotc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotu");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdotc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdscal");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal");
+  return func_ptr(n, alpha, x, incx);
+}
+
+void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, 
+                               float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, 
+                               int incx, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZaxpy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy");
+  return func_ptr(n, alpha, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZcopy");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZswap");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZswap");
+  return func_ptr(n, x, incx, y, incy);
+}
+
+int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamax");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIsamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIdamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIcamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin");
+  return func_ptr(n, x, incx);
+}
+
+int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasIzamin");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasSasum (int n, const float *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSasum");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDasum (int n, const double *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDasum");
+  return func_ptr(n, x, incx);
+}
+
+float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx) {
+  using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasScasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasScasum");
+  return func_ptr(n, x, incx);
+}
+
+double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx) {
+  using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDzasum");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum");
+  return func_ptr(n, x, incx);
+}
+
+void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, 
+                              float sc, float ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrot");
+  return func_ptr(n, x, incx, y, incy, sc, ss);
+}
+
+void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, 
+                              double sc, double ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrot");
+  return func_ptr(n, x, incx, y, incy, sc, ss);
+}
+
+void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, 
+                              int incy, float c, cuComplex s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, cuComplex);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, 
+                              cuDoubleComplex *y, int incy, double sc, 
+                              cuDoubleComplex cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, cuDoubleComplex);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZrot");
+  return func_ptr(n, x, incx, y, incy, sc, cs);
+}
+
+void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
+                               int incy, float c, float s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex *y, int incy, double c, double s) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZdrot");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot");
+  return func_ptr(n, x, incx, y, incy, c, s);
+}
+
+void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg");
+  return func_ptr(sa, sb, sc, ss);
+}
+
+void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) {
+  using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg");
+  return func_ptr(sa, sb, sc, ss);
+}
+
+void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
+                               cuComplex *cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg");
+  return func_ptr(ca, cb, sc, cs);
+}
+
+void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
+                               cuDoubleComplex *cs) {
+  using FuncPtr = void (CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, double *, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZrotg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg");
+  return func_ptr(ca, cb, sc, cs);
+}
+
+void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, 
+                              const float* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, const float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm");
+  return func_ptr(n, x, incx, y, incy, sparam);
+}
+
+void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, 
+                              const double* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, const double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm");
+  return func_ptr(n, x, incx, y, incy, sparam);
+}
+
+void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, 
+                                const float *sy1, float* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, const float *, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSrotmg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg");
+  return func_ptr(sd1, sd2, sx1, sy1, sparam);
+}
+
+void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, 
+                                const double *sy1, double* sparam) {
+  using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, const double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDrotmg");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg");
+  return func_ptr(sd1, sd2, sx1, sy1, sparam);
+}
+
+void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
+                               const float *A, int lda, const float *x, int incx,
+                               float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
+                               const double *A, int lda, const double *x, int incx,
+                               double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
+                               const cuComplex *A, int lda, const cuComplex *x, int incx,
+                               cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv");
+  return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, 
+                               float alpha, const float *A, int lda, 
+                               const float *x, int incx, float beta, float *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, 
+                               double alpha, const double *A, int lda, 
+                               const double *x, int incx, double beta, double *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuComplex alpha, const cuComplex *A, int lda, 
+                               const cuComplex *x, int incx, cuComplex beta, cuComplex *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, 
+                               const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, 
+                               int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv");
+  return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, 
+                               const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, 
+                               const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, 
+                               const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, 
+                              cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv");
+  return func_ptr(uplo, trans, diag, n, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, 
+                              float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, 
+                              cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtpsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv");
+  return func_ptr(uplo, trans, diag, n, AP, x, incx);
+}
+
+void CUBLASWINAPI cublasStbsv(char uplo, char trans, 
+                              char diag, int n, int k, const float *A, 
+                              int lda, float *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasDtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const double *A, 
+                              int lda, double *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasCtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuComplex *A, 
+                              int lda, cuComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasZtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuDoubleComplex *A, 
+                              int lda, cuDoubleComplex *x, int incx) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtbsv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv");
+  return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx);
+}
+
+void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
+                               int lda, const float *x, int incx, float beta, 
+                               float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
+                               int lda, const double *x, int incx, double beta, 
+                               double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, const cuComplex *x, int incx, cuComplex beta, 
+                               cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChemv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
+                               int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, 
+                               cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv");
+  return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, 
+                               const float *A, int lda, const float *x, int incx, 
+                               float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, 
+                               const double *A, int lda, const double *x, int incx, 
+                               double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *x, int incx, 
+                               cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhbmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv");
+  return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
+                              const float *AP, const float *x,
+                              int incx, float beta, float *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
+                              const double *AP, const double *x,
+                              int incx, double beta, double *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
+                              const cuComplex *AP, const cuComplex *x,
+                              int incx, cuComplex beta, cuComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
+                              const cuDoubleComplex *AP, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpmv");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv");
+  return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy);
+}
+
+void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
+                              const float *y, int incy, float *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, float, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSger");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSger");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
+                              const double *y, int incy, double *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, double, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDger");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDger");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgeru");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgerc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgeru");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgerc");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc");
+  return func_ptr(m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, 
+                              const cuComplex *x, int incx, cuComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, 
+                              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher");
+  return func_ptr(uplo, n, alpha, x, incx, A, lda);
+}
+
+void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
+                              int incx, cuComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr");
+  return func_ptr(uplo, n, alpha, x, incx, AP);
+}
+
+void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, 
+                               int incx, const double *y, int incy, double *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, const cuComplex *y, int incy, cuComplex *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, 
+                               int lda) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSspr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
+                               const double *x, int incx, const double *y,
+                               int incy, double *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDspr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
+                               const cuComplex *x, int incx, const cuComplex *y,
+                               int incy, cuComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChpr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
+                               int incy, cuDoubleComplex *AP) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhpr2");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2");
+  return func_ptr(uplo, n, alpha, x, incx, y, incy, AP);
+}
+
+void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, 
+                               float alpha, const float *A, int lda, 
+                               const float *B, int ldb, float beta, float *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
+                               double alpha, const double *A, int lda, 
+                               const double *B, int ldb, double beta, double *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, 
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
+                               int k, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C,
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZgemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm");
+  return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, 
+                               const float *A, int lda, float beta, float *C, 
+                               int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
+                               double alpha, const double *A, int lda,
+                               double beta, double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               cuComplex beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyrk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
+                               float alpha, const cuComplex *A, int lda,
+                               float beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCherk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCherk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
+                               double alpha,
+                               const cuDoubleComplex *A, int lda,
+                               double beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZherk");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZherk");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, 
+                                const float *A, int lda, const float *B, int ldb, 
+                                float beta, float *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
+                                double alpha, const double *A, int lda,
+                                const double *B, int ldb, double beta,
+                                double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, cuComplex beta,
+                                cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                                cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsyr2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, float beta,
+                                cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, float, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCher2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, double beta,
+                                cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, double, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k");
+  return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, 
+                               const float *A, int lda, const float *B, int ldb,
+                               float beta, float *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, 
+                               const double *A, int lda, const double *B, int ldb,
+                               double beta, double *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *B, int ldb,
+                               cuComplex beta, cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasChemm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm");
+  return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
+                               int ldb) {
+  using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm");
+  if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm");
+  return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cublas_stub.cc b/tensorflow/stream_executor/cuda/cublas_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..beca8c1a46fe131816b21b4764df21d4cbd6bd23
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cublas_stub.cc
@@ -0,0 +1,65 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "cuda/include/cublas.h"
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+// Implements the cuBLAS API by forwarding to cuBLAS loaded from the DSO.
+// Note that it does not implement the v1 interface.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or = stream_executor::internal::DsoLoader::GetCublasDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+#endif
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    stream_executor::port::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return reinterpret_cast<T>(symbol);
+}
+
+void LogFatalSymbolNotFound(const char* symbol_name) {
+  LOG(FATAL) << symbol_name << " symbol not found.";
+}
+
+cublasStatus_t GetSymbolNotFoundError() { return CUBLAS_STATUS_INTERNAL_ERROR; }
+}  // namespace
+
+#if CUDA_VERSION < 9000
+typedef enum {} cublasMath_t;
+#endif
+
+// Parameter constness changed in cuBLAS 9.2
+#if CUDA_VERSION < 9020
+#include "tensorflow/stream_executor/cuda/cublas_9_0.inc"
+#else
+#include "tensorflow/stream_executor/cuda/cublas_10_0.inc"
+#endif
diff --git a/tensorflow/stream_executor/cuda/cuda_10_0.inc b/tensorflow/stream_executor/cuda/cuda_10_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..f096b99aa1ca988da7f90a321acd68f163420aeb
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_10_0.inc
@@ -0,0 +1,1822 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUresult, const char **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(error, pStr);
+}
+
+CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUresult, const char **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(error, pStr);
+}
+
+CUresult CUDAAPI cuInit(unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuInit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(Flags);
+}
+
+CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(driverVersion);
+}
+
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device, ordinal);
+}
+
+CUresult CUDAAPI cuDeviceGetCount(int *count) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count);
+}
+
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(char *, int, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(name, len, dev);
+}
+
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUuuid *, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(uuid, dev);
+}
+
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(bytes, dev);
+}
+
+CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice_attribute, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pi, attrib, dev);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevprop *, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetProperties");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(prop, dev);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceComputeCapability");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(major, minor, dev);
+}
+
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRetain");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pctx, dev);
+}
+
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxRelease");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dev);
+}
+
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxSetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dev, flags);
+}
+
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice, unsigned int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxGetState");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dev, flags, active);
+}
+
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDevicePrimaryCtxReset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dev);
+}
+
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, unsigned int, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pctx, flags, dev);
+}
+
+CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDestroy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctx);
+}
+
+CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPushCurrent_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctx);
+}
+
+CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxPopCurrent_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pctx);
+}
+
+CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCurrent");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctx);
+}
+
+CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCurrent");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pctx);
+}
+
+CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device);
+}
+
+CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(flags);
+}
+
+CUresult CUDAAPI cuCtxSynchronize(void) {
+  using FuncPtr = CUresult (CUDAAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSynchronize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUlimit, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(limit, value);
+}
+
+CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) {
+  using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUlimit);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pvalue, limit);
+}
+
+CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunc_cache *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pconfig);
+}
+
+CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunc_cache);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(config);
+}
+
+CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsharedconfig *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetSharedMemConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pConfig);
+}
+
+CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsharedconfig);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxSetSharedMemConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(config);
+}
+
+CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext, unsigned int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetApiVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctx, version);
+}
+
+CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxGetStreamPriorityRange");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(leastPriority, greatestPriority);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxAttach");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pctx, flags);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDetach");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctx);
+}
+
+CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoad");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(module, fname);
+}
+
+CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(module, image);
+}
+
+CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *, unsigned int, CUjit_option *, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadDataEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(module, image, numOptions, options, optionValues);
+}
+
+CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleLoadFatBinary");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(module, fatCubin);
+}
+
+CUresult CUDAAPI cuModuleUnload(CUmodule hmod) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleUnload");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hmod);
+}
+
+CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction *, CUmodule, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetFunction");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, hmod, name);
+}
+
+CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetGlobal_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr, bytes, hmod, name);
+}
+
+CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref *, CUmodule, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetTexRef");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexRef, hmod, name);
+}
+
+CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsurfref *, CUmodule, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuModuleGetSurfRef");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pSurfRef, hmod, name);
+}
+
+CUresult CUDAAPI
+cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(numOptions, options, optionValues, stateOut);
+}
+
+CUresult CUDAAPI
+cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+    unsigned int numOptions, CUjit_option *options, void **optionValues) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddData_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(state, type, data, size, name, numOptions, options, optionValues);
+}
+
+CUresult CUDAAPI
+cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+    unsigned int numOptions, CUjit_option *options, void **optionValues) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, CUjitInputType, const char *, unsigned int, CUjit_option *, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkAddFile_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(state, type, path, numOptions, options, optionValues);
+}
+
+CUresult CUDAAPI
+cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, void **, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkComplete");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(state, cubinOut, sizeOut);
+}
+
+CUresult CUDAAPI
+cuLinkDestroy(CUlinkState state) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUlinkState);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLinkDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(state);
+}
+
+CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) {
+  using FuncPtr = CUresult (CUDAAPI *)(size_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetInfo_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(free, total);
+}
+
+CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAlloc_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr, bytesize);
+}
+
+CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocPitch_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes);
+}
+
+CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFree_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr);
+}
+
+CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemGetAddressRange_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pbase, psize, dptr);
+}
+
+CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) {
+  using FuncPtr = CUresult (CUDAAPI *)(void **, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocHost_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pp, bytesize);
+}
+
+CUresult CUDAAPI cuMemFreeHost(void *p) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemFreeHost");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p);
+}
+
+CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(void **, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostAlloc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pp, bytesize, Flags);
+}
+
+CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, void *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetDevicePointer_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pdptr, p, Flags);
+}
+
+CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostGetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pFlags, p);
+}
+
+CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAllocManaged");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr, bytesize, flags);
+}
+
+CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdevice *, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetByPCIBusId");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dev, pciBusId);
+}
+
+CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) {
+  using FuncPtr = CUresult (CUDAAPI *)(char *, int, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetPCIBusId");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pciBusId, len, dev);
+}
+
+CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUipcEventHandle *, CUevent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetEventHandle");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHandle, event);
+}
+
+CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent *, CUipcEventHandle);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenEventHandle");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phEvent, handle);
+}
+
+CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUipcMemHandle *, CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcGetMemHandle");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHandle, dptr);
+}
+
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcOpenMemHandle");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pdptr, handle, Flags);
+}
+
+CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuIpcCloseMemHandle");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dptr);
+}
+
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostRegister_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p, bytesize, Flags);
+}
+
+CUresult CUDAAPI cuMemHostUnregister(void *p) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemHostUnregister");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(p);
+}
+
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, src, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoD_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, srcHost, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoH_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstHost, srcDevice, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoD_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, srcDevice, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoA_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstArray, dstOffset, srcDevice, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoD_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, srcArray, srcOffset, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoA_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstArray, dstOffset, srcHost, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, CUarray, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoH_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstHost, srcArray, srcOffset, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoA_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount);
+}
+
+CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY2D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2D_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy);
+}
+
+CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY2D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DUnaligned_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy);
+}
+
+CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3D_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy);
+}
+
+CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D_PEER *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy);
+}
+
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dst, src, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyPeerAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoDAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, srcHost, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoHAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstHost, srcDevice, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyDtoDAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, srcDevice, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyHtoAAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpyAtoHAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream);
+}
+
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy2DAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy, hStream);
+}
+
+CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DAsync_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy, hStream);
+}
+
+CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemcpy3DPeerAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCopy, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned char, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, uc, N);
+}
+
+CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned short, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, us, N);
+}
+
+CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned int, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, ui, N);
+}
+
+CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, uc, Width, Height);
+}
+
+CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, us, Width, Height);
+}
+
+CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, ui, Width, Height);
+}
+
+CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD8Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, uc, N, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD16Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, us, N, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD32Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, ui, N, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D8Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D16Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream);
+}
+
+CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemsetD2D32Async");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream);
+}
+
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHandle, pAllocateArray);
+}
+
+CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayGetDescriptor_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pArrayDescriptor, hArray);
+}
+
+CUresult CUDAAPI cuArrayDestroy(CUarray hArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuArrayDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hArray);
+}
+
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DCreate_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHandle, pAllocateArray);
+}
+
+CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuArray3DGetDescriptor_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pArrayDescriptor, hArray);
+}
+
+CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels);
+}
+
+CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayGetLevel");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pLevelArray, hMipmappedArray, level);
+}
+
+CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMipmappedArrayDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hMipmappedArray);
+}
+
+CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(data, attribute, ptr);
+}
+
+CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemPrefetchAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, dstDevice, hStream);
+}
+
+CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemAdvise");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, count, advice, device);
+}
+
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count) {
+  using FuncPtr = CUresult (CUDAAPI *)(void *, size_t, CUmem_range_attribute, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(data, dataSize, attribute, devPtr, count);
+}
+
+CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count) {
+  using FuncPtr = CUresult (CUDAAPI *)(void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuMemRangeGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count);
+}
+
+CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerSetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(value, attribute, ptr);
+}
+
+CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuPointerGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(numAttributes, attributes, data, ptr);
+}
+
+CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phStream, Flags);
+}
+
+CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream *, unsigned int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamCreateWithPriority");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phStream, flags, priority);
+}
+
+CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetPriority");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, priority);
+}
+
+CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, unsigned int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, flags);
+}
+
+CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUcontext *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamGetCtx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, pctx);
+}
+
+CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUevent, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitEvent");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, hEvent, Flags);
+}
+
+CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAddCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, callback, userData, flags);
+}
+
+CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBeginCapture");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream);
+}
+
+CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUgraph *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamEndCapture");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, phGraph);
+}
+
+CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamCaptureStatus *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamIsCapturing");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, captureStatus);
+}
+
+CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamAttachMemAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, dptr, length, flags);
+}
+
+CUresult CUDAAPI cuStreamQuery(CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamQuery");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream);
+}
+
+CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamSynchronize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream);
+}
+
+CUresult CUDAAPI cuStreamDestroy(CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamDestroy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream);
+}
+
+CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phEvent, Flags);
+}
+
+CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventRecord");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hEvent, hStream);
+}
+
+CUresult CUDAAPI cuEventQuery(CUevent hEvent) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventQuery");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hEvent);
+}
+
+CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventSynchronize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hEvent);
+}
+
+CUresult CUDAAPI cuEventDestroy(CUevent hEvent) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUevent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventDestroy_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hEvent);
+}
+
+CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd) {
+  using FuncPtr = CUresult (CUDAAPI *)(float *, CUevent, CUevent);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuEventElapsedTime");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pMilliseconds, hStart, hEnd);
+}
+
+CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUexternalMemory *, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalMemory");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extMem_out, memHandleDesc);
+}
+
+CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUexternalMemory, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedBuffer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr, extMem, bufferDesc);
+}
+
+CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUexternalMemory, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuExternalMemoryGetMappedMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mipmap, extMem, mipmapDesc);
+}
+
+CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUexternalMemory);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalMemory");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extMem);
+}
+
+CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuImportExternalSemaphore");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extSem_out, semHandleDesc);
+}
+
+CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSignalExternalSemaphoresAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
+}
+
+CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) {
+  using FuncPtr = CUresult (CUDAAPI *)(const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *, unsigned int, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuWaitExternalSemaphoresAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extSemArray, paramsArray, numExtSems, stream);
+}
+
+CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUexternalSemaphore);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDestroyExternalSemaphore");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(extSem);
+}
+
+CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue32");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, addr, value, flags);
+}
+
+CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWaitValue64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, addr, value, flags);
+}
+
+CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue32");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, addr, value, flags);
+}
+
+CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamWriteValue64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, addr, value, flags);
+}
+
+CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuStreamBatchMemOp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, count, paramArray, flags);
+}
+
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction_attribute, CUfunction);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pi, attrib, hfunc);
+}
+
+CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUfunction_attribute, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, attrib, value);
+}
+
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUfunc_cache);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, config);
+}
+
+CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUsharedconfig);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedMemConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, config);
+}
+
+CUresult CUDAAPI cuLaunchKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams,
+                                void **extra) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchKernel");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
+}
+
+CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernel");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams);
+}
+
+CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchCooperativeKernelMultiDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(launchParamsList, numDevices, flags);
+}
+
+CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUhostFn, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchHostFunc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hStream, fn, userData);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetBlockShape");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, x, y, z);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, bytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, numbytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, value);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, value);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, void *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, ptr, numbytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, grid_width, grid_height);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, grid_width, grid_height, hStream);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, texunit, hTexRef);
+}
+
+CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraph, flags);
+}
+
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t, const CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t, const CUDA_MEMCPY3D *, CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, copyParams, ctx);
+}
+
+CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, memsetParams, ctx);
+}
+
+CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t, const CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, childGraph);
+}
+
+CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraph *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, phGraph);
+}
+
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, CUgraphNode *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
+}
+
+CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph *, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphClone, originalGraph);
+}
+
+CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phNode, hOriginalNode, hClonedGraph);
+}
+
+CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, type);
+}
+
+CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, nodes, numNodes);
+}
+
+CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, rootNodes, numRootNodes);
+}
+
+CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetEdges");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, from, to, numEdges);
+}
+
+CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependencies");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, dependencies, numDependencies);
+}
+
+CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetDependentNodes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, dependentNodes, numDependentNodes);
+}
+
+CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddDependencies");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, from, to, numDependencies);
+}
+
+CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphRemoveDependencies");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, from, to, numDependencies);
+}
+
+CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroyNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode);
+}
+
+CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *, char *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphInstantiate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize);
+}
+
+CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphLaunch");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraphExec, hStream);
+}
+
+CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphExecDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraphExec);
+}
+
+CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph);
+}
+
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction, int, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize);
+}
+
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags);
+}
+
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit);
+}
+
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuOccupancyMaxPotentialBlockSizeWithFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags);
+}
+
+CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUarray, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, hArray, Flags);
+}
+
+CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, hMipmappedArray, Flags);
+}
+
+CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ByteOffset, hTexRef, dptr, bytes);
+}
+
+CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddress2D_v3");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, desc, dptr, Pitch);
+}
+
+CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUarray_format, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFormat");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, fmt, NumPackedComponents);
+}
+
+CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, int, CUaddress_mode);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetAddressMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, dim, am);
+}
+
+CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUfilter_mode);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFilterMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, fm);
+}
+
+CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUfilter_mode);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapFilterMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, fm);
+}
+
+CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, bias);
+}
+
+CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMipmapLevelClamp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp);
+}
+
+CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetMaxAnisotropy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, maxAniso);
+}
+
+CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetBorderColor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, pBorderColor);
+}
+
+CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefSetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef, Flags);
+}
+
+CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddress_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pdptr, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phArray, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phMipmappedArray, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUaddress_mode *, CUtexref, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetAddressMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pam, hTexRef, dim);
+}
+
+CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfilter_mode *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFilterMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pfm, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray_format *, int *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFormat");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pFormat, pNumChannels, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfilter_mode *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapFilterMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pfm, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(float *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pbias, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(float *, float *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMipmapLevelClamp");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetMaxAnisotropy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pmaxAniso, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(float *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetBorderColor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pBorderColor, hTexRef);
+}
+
+CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int *, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefGetFlags");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pFlags, hTexRef);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexRef);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexRefDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hTexRef);
+}
+
+CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsurfref, CUarray, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefSetArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hSurfRef, hArray, Flags);
+}
+
+CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUsurfref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfRefGetArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phArray, hSurfRef);
+}
+
+CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *, const CUDA_TEXTURE_DESC *, const CUDA_RESOURCE_VIEW_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc);
+}
+
+CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUtexObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(texObject);
+}
+
+CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResDesc, texObject);
+}
+
+CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetTextureDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pTexDesc, texObject);
+}
+
+CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuTexObjectGetResourceViewDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResViewDesc, texObject);
+}
+
+CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pSurfObject, pResDesc);
+}
+
+CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUsurfObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(surfObject);
+}
+
+CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuSurfObjectGetResourceDesc");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pResDesc, surfObject);
+}
+
+CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceCanAccessPeer");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(canAccessPeer, dev, peerDev);
+}
+
+CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxEnablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerContext, Flags);
+}
+
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuCtxDisablePeerAccess");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(peerContext);
+}
+
+CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) {
+  using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetP2PAttribute");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(value, attrib, srcDevice, dstDevice);
+}
+
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphicsResource);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnregisterResource");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource);
+}
+
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUgraphicsResource, unsigned int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsSubResourceGetMappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pArray, resource, arrayIndex, mipLevel);
+}
+
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pMipmappedArray, resource);
+}
+
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pDevPtr, pSize, resource);
+}
+
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphicsResource, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource, flags);
+}
+
+CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, hStream);
+}
+
+CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, hStream);
+}
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId) {
+  using FuncPtr = CUresult (CUDAAPI *)(const void **, const CUuuid *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ppExportTable, pExportTableId);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index ef9807820fda493a9ab926ae0509beaafeebdf2e..2b80ae094d17bc8ad957044545ff46daf4aeb103 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -17,13 +17,13 @@ limitations under the License.
 // It reaches into the CUDA implementation to activate an underlying CUDA
 // context.
 //
-// Having this file separate from cuda_gpu_executor.h means that dependent
+// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
 // code does not also have to depend on cuda.h.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
 namespace stream_executor {
 
@@ -31,29 +31,7 @@ class StreamExecutor;
 
 namespace cuda {
 
-class CUDAExecutor;
-class ScopedActivateContext;
-
-// Activates a CUDA context within an enclosing scope.
-class ScopedActivateExecutorContext {
- public:
-  // Form that takes a CUDA executor implementation.
-  explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);
-
-  // Form that takes a pImpl executor and extracts a CUDA implementation --
-  // fatal failure if it is not CUDA inside.
-  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
-
-  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
-
-  ~ScopedActivateExecutorContext();
-
- private:
-  // The cuda.h-using datatype that we wrap.
-  ScopedActivateContext* driver_scoped_activate_context_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
-};
+using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 7325476ef16799e70b01234ef79e009ca9194c8f..b4e8beedccce56093c53099e021199fedc9e6b1a 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -63,7 +63,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
-#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -71,272 +70,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
-namespace wrap {
-
-// clang-format off
-#define CUBLAS_ROUTINE_EACH(__macro)      \
-  __macro(cublasSnrm2)                    \
-  __macro(cublasDnrm2)                    \
-  __macro(cublasScnrm2)                   \
-  __macro(cublasDznrm2)                   \
-  __macro(cublasSdot)                     \
-  __macro(cublasDdot)                     \
-  __macro(cublasCdotu)                    \
-  __macro(cublasCdotc)                    \
-  __macro(cublasZdotu)                    \
-  __macro(cublasZdotc)                    \
-  __macro(cublasSscal)                    \
-  __macro(cublasDscal)                    \
-  __macro(cublasCscal)                    \
-  __macro(cublasCsscal)                   \
-  __macro(cublasZscal)                    \
-  __macro(cublasZdscal)                   \
-  __macro(cublasSaxpy)                    \
-  __macro(cublasDaxpy)                    \
-  __macro(cublasCaxpy)                    \
-  __macro(cublasZaxpy)                    \
-  __macro(cublasScopy)                    \
-  __macro(cublasDcopy)                    \
-  __macro(cublasCcopy)                    \
-  __macro(cublasZcopy)                    \
-  __macro(cublasSswap)                    \
-  __macro(cublasDswap)                    \
-  __macro(cublasCswap)                    \
-  __macro(cublasZswap)                    \
-  __macro(cublasIsamax)                   \
-  __macro(cublasIdamax)                   \
-  __macro(cublasIcamax)                   \
-  __macro(cublasIzamax)                   \
-  __macro(cublasIsamin)                   \
-  __macro(cublasIdamin)                   \
-  __macro(cublasIcamin)                   \
-  __macro(cublasIzamin)                   \
-  __macro(cublasSasum)                    \
-  __macro(cublasDasum)                    \
-  __macro(cublasScasum)                   \
-  __macro(cublasDzasum)                   \
-  __macro(cublasSrot)                     \
-  __macro(cublasDrot)                     \
-  __macro(cublasCrot)                     \
-  __macro(cublasCsrot)                    \
-  __macro(cublasZrot)                     \
-  __macro(cublasZdrot)                    \
-  __macro(cublasSrotg)                    \
-  __macro(cublasDrotg)                    \
-  __macro(cublasCrotg)                    \
-  __macro(cublasZrotg)                    \
-  __macro(cublasSrotm)                    \
-  __macro(cublasDrotm)                    \
-  __macro(cublasSrotmg)                   \
-  __macro(cublasDrotmg)                   \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasCgemv)                    \
-  __macro(cublasZgemv)                    \
-  __macro(cublasSgbmv)                    \
-  __macro(cublasDgbmv)                    \
-  __macro(cublasCgbmv)                    \
-  __macro(cublasZgbmv)                    \
-  __macro(cublasStrmv)                    \
-  __macro(cublasDtrmv)                    \
-  __macro(cublasCtrmv)                    \
-  __macro(cublasZtrmv)                    \
-  __macro(cublasStbmv)                    \
-  __macro(cublasDtbmv)                    \
-  __macro(cublasCtbmv)                    \
-  __macro(cublasZtbmv)                    \
-  __macro(cublasStpmv)                    \
-  __macro(cublasDtpmv)                    \
-  __macro(cublasCtpmv)                    \
-  __macro(cublasZtpmv)                    \
-  __macro(cublasStrsv)                    \
-  __macro(cublasDtrsv)                    \
-  __macro(cublasCtrsv)                    \
-  __macro(cublasZtrsv)                    \
-  __macro(cublasStpsv)                    \
-  __macro(cublasDtpsv)                    \
-  __macro(cublasCtpsv)                    \
-  __macro(cublasZtpsv)                    \
-  __macro(cublasStbsv)                    \
-  __macro(cublasDtbsv)                    \
-  __macro(cublasCtbsv)                    \
-  __macro(cublasZtbsv)                    \
-  __macro(cublasSsymv)                    \
-  __macro(cublasDsymv)                    \
-  __macro(cublasCsymv)                    \
-  __macro(cublasZsymv)                    \
-  __macro(cublasChemv)                    \
-  __macro(cublasZhemv)                    \
-  __macro(cublasSsbmv)                    \
-  __macro(cublasDsbmv)                    \
-  __macro(cublasChbmv)                    \
-  __macro(cublasZhbmv)                    \
-  __macro(cublasSspmv)                    \
-  __macro(cublasDspmv)                    \
-  __macro(cublasChpmv)                    \
-  __macro(cublasZhpmv)                    \
-  __macro(cublasSger)                     \
-  __macro(cublasDger)                     \
-  __macro(cublasCgeru)                    \
-  __macro(cublasCgerc)                    \
-  __macro(cublasZgeru)                    \
-  __macro(cublasZgerc)                    \
-  __macro(cublasSsyr)                     \
-  __macro(cublasDsyr)                     \
-  __macro(cublasCsyr)                     \
-  __macro(cublasZsyr)                     \
-  __macro(cublasCher)                     \
-  __macro(cublasZher)                     \
-  __macro(cublasSspr)                     \
-  __macro(cublasDspr)                     \
-  __macro(cublasChpr)                     \
-  __macro(cublasZhpr)                     \
-  __macro(cublasSsyr2)                    \
-  __macro(cublasDsyr2)                    \
-  __macro(cublasCsyr2)                    \
-  __macro(cublasZsyr2)                    \
-  __macro(cublasCher2)                    \
-  __macro(cublasZher2)                    \
-  __macro(cublasSspr2)                    \
-  __macro(cublasDspr2)                    \
-  __macro(cublasChpr2)                    \
-  __macro(cublasZhpr2)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasCgemm)                    \
-  __macro(cublasZgemm)                    \
-  __macro(cublasSsyrk)                    \
-  __macro(cublasDsyrk)                    \
-  __macro(cublasCsyrk)                    \
-  __macro(cublasZsyrk)                    \
-  __macro(cublasCherk)                    \
-  __macro(cublasZherk)                    \
-  __macro(cublasSsyr2k)                   \
-  __macro(cublasDsyr2k)                   \
-  __macro(cublasCsyr2k)                   \
-  __macro(cublasZsyr2k)                   \
-  __macro(cublasCher2k)                   \
-  __macro(cublasZher2k)                   \
-  __macro(cublasSsyrkx)                   \
-  __macro(cublasDsyrkx)                   \
-  __macro(cublasCsyrkx)                   \
-  __macro(cublasZsyrkx)                   \
-  __macro(cublasCherkx)                   \
-  __macro(cublasZherkx)                   \
-  __macro(cublasSsymm)                    \
-  __macro(cublasDsymm)                    \
-  __macro(cublasCsymm)                    \
-  __macro(cublasZsymm)                    \
-  __macro(cublasChemm)                    \
-  __macro(cublasZhemm)                    \
-  __macro(cublasStrsm)                    \
-  __macro(cublasDtrsm)                    \
-  __macro(cublasCtrsm)                    \
-  __macro(cublasZtrsm)                    \
-  __macro(cublasStrmm)                    \
-  __macro(cublasDtrmm)                    \
-  __macro(cublasCtrmm)                    \
-  __macro(cublasZtrmm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
-  __macro(cublasCgeam)                    \
-  __macro(cublasZgeam)                    \
-  __macro(cublasSdgmm)                    \
-  __macro(cublasDdgmm)                    \
-  __macro(cublasCdgmm)                    \
-  __macro(cublasZdgmm)
-
-// clang-format off
-
-#ifdef PLATFORM_GOOGLE
-#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    static const char *kName;                                       \
-    template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
-  } __name;                                                         \
-  const char *WrapperShim__##__name::kName = #__name;
-
-#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
-  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
-
-#else
-
-#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                               \
-  struct DynLoadShim__##__name {                                          \
-    static const char* kName;                                             \
-    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void* GetDsoHandle() {                                         \
-      auto s = internal::CachedDsoLoader::GetCublasDsoHandle();           \
-      return s.ValueOrDie();                                              \
-    }                                                                     \
-    static FuncPtrT LoadOrDie() {                                         \
-      void* f;                                                            \
-      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
-                                                          kName, &f);     \
-      CHECK(s.ok()) << "could not find " << kName                         \
-                    << " in cublas DSO; dlerror: " << s.error_message();  \
-      return reinterpret_cast<FuncPtrT>(f);                               \
-    }                                                                     \
-    static FuncPtrT DynLoad() {                                           \
-      static FuncPtrT f = LoadOrDie();                                    \
-      return f;                                                           \
-    }                                                                     \
-    template <typename... Args>                                           \
-    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
-      return DynLoad()(args...);                                          \
-    }                                                                     \
-  } __name;                                                               \
-  const char* DynLoadShim__##__name::kName = #__name;
-
-#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
-  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
-
-#endif
-
-STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
-STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
-STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
-STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetPointerMode)
-STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasGetPointerMode)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
-
-#if CUDA_VERSION >= 7050
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
-#endif
-
-#if CUDA_VERSION >= 8000
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmEx)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmStridedBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmStridedBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmStridedBatched)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmStridedBatched)
-#endif
-
-#if CUDA_VERSION >= 9000
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
-#endif
-
-#if CUDA_VERSION >= 9010
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmBatchedEx)
-STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmStridedBatchedEx)
-#endif
-
-}  // namespace wrap
-
 static string ToString(cublasStatus_t status) {
   switch (status) {
     case CUBLAS_STATUS_SUCCESS:
@@ -395,22 +132,21 @@ class ScopedCublasPointerMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the pointer mode.
-  explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle)
-      : parent_(parent), handle_(handle), ok_(false) {}
+  explicit ScopedCublasPointerMode(cublasHandle_t handle)
+      : handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped pointer mode, new_mode.
   //
   // Note that when false is returned, an appropriate error has already been
   // logged.
   bool Init(cublasPointerMode_t new_mode) {
-    cublasStatus_t ret =
-        wrap::cublasGetPointerMode(parent_, handle_, &old_mode_);
+    cublasStatus_t ret = cublasGetPointerMode(handle_, &old_mode_);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret);
       return ok_ = false;
     }
 
-    ret = wrap::cublasSetPointerMode(parent_, handle_, new_mode);
+    ret = cublasSetPointerMode(handle_, new_mode);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret);
       return ok_ = false;
@@ -423,8 +159,7 @@ class ScopedCublasPointerMode {
   // successful in the first place.
   ~ScopedCublasPointerMode() {
     if (ok_) {
-      cublasStatus_t ret =
-          wrap::cublasSetPointerMode(parent_, handle_, old_mode_);
+      cublasStatus_t ret = cublasSetPointerMode(handle_, old_mode_);
       if (ret != CUBLAS_STATUS_SUCCESS) {
         LOG(ERROR) << "failed to set former cublas pointer mode: "
                    << ToString(ret);
@@ -433,7 +168,6 @@ class ScopedCublasPointerMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this pointer mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
   bool ok_;                       // Whether the change was successful.
@@ -456,22 +190,21 @@ class ScopedCublasMathMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the math mode.
-  explicit ScopedCublasMathMode(CUDAExecutor *parent, cublasHandle_t handle)
-      : parent_(parent), handle_(handle), ok_(false) {}
+  explicit ScopedCublasMathMode(cublasHandle_t handle)
+      : handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped math mode, new_mode.
   //
   // Note that when false is returned, an appropriate error has already been
   // logged.
   bool Init(cublasMath_t new_mode) {
-    cublasStatus_t ret =
-        wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
+    cublasStatus_t ret = cublasGetMathMode(handle_, &old_mode_);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
       return ok_ = false;
     }
 
-    ret = wrap::cublasSetMathMode(parent_, handle_, new_mode);
+    ret = cublasSetMathMode(handle_, new_mode);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to set new cublas math mode: " << ToString(ret);
       return ok_ = false;
@@ -483,8 +216,7 @@ class ScopedCublasMathMode {
   // successful in the first place.
   ~ScopedCublasMathMode() {
     if (ok_) {
-      cublasStatus_t ret =
-          wrap::cublasSetMathMode(parent_, handle_, old_mode_);
+      cublasStatus_t ret = cublasSetMathMode(handle_, old_mode_);
       if (ret != CUBLAS_STATUS_SUCCESS) {
         LOG(ERROR) << "failed to set former cublas math mode: "
                    << ToString(ret);
@@ -493,7 +225,6 @@ class ScopedCublasMathMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this math mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
   bool ok_;                // Whether the change was successful.
@@ -501,7 +232,8 @@ class ScopedCublasMathMode {
 #endif  // CUDA_VERSION >= 9000
 
 bool CUDABlas::Init() {
-  cublasStatus_t ret = wrap::cublasCreate(parent_, &blas_);
+  gpu::ScopedActivateExecutorContext sac{parent_};
+  cublasStatus_t ret = cublasCreate(&blas_);
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
     return false;
@@ -510,21 +242,22 @@ bool CUDABlas::Init() {
   return true;
 }
 
-CUDABlas::CUDABlas(cuda::CUDAExecutor *parent)
+CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
     : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
 
 CUDABlas::~CUDABlas() {
   if (blas_ != nullptr) {
-    wrap::cublasDestroy(parent_, blas_);
+    gpu::ScopedActivateExecutorContext sac{parent_};
+    cublasDestroy(blas_);
   }
 }
 
 bool CUDABlas::SetStream(Stream *stream) {
   CHECK(stream != nullptr);
-  CHECK(AsCUDAStreamValue(stream) != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
   CHECK(blas_ != nullptr);
-  cublasStatus_t ret =
-      wrap::cublasSetStream(parent_, blas_, AsCUDAStreamValue(stream));
+  gpu::ScopedActivateExecutorContext sac{parent_};
+  cublasStatus_t ret = cublasSetStream(blas_, AsGpuStreamValue(stream));
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
     return false;
@@ -676,23 +409,23 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
     return false;
   }
 
-  ScopedCublasPointerMode pointer_mode{parent_, blas_};
+  gpu::ScopedActivateExecutorContext sac{parent_};
+  ScopedCublasPointerMode pointer_mode{blas_};
   if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
                                            : CUBLAS_POINTER_MODE_DEVICE)) {
     return false;
   }
 #if CUDA_VERSION >= 9000
-  ScopedCublasMathMode math_mode{parent_, blas_};
+  ScopedCublasMathMode math_mode{blas_};
   if (use_tensor_op_math) {
     if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
       return false;
     }
   }
 #endif
-  cublasStatus_t ret = cublas_func(parent_, blas_, args...);
+  cublasStatus_t ret = cublas_func(blas_, args...);
   if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": "
-               << ToString(ret);
+    LOG(ERROR) << "failed to run cuBLAS routine: " << ToString(ret);
   }
   return ret == CUBLAS_STATUS_SUCCESS;
 }
@@ -700,248 +433,242 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<float> *result) {
-  return DoBlasInternal(wrap::cublasSasum, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasSasum, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<double> *result) {
-  return DoBlasInternal(wrap::cublasDasum, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasDasum, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
-  return DoBlasInternal(
-      wrap::cublasScasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasScasum, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
-  return DoBlasInternal(
-      wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasDzasum, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(wrap::cublasSaxpy, stream,
-                        true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSaxpy, stream, true /* = pointer_mode_host */,
+                        elem_count, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(wrap::cublasDaxpy, stream,
-                        true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDaxpy, stream, true /* = pointer_mode_host */,
+                        elem_count, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasCaxpy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasCaxpy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasZaxpy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZaxpy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(wrap::cublasScopy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasScopy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(wrap::cublasDcopy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDcopy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasCcopy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasCcopy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasZcopy, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZcopy, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *result) {
-  return DoBlasInternal(
-      wrap::cublasSdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasSdot, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *result) {
-  return DoBlasInternal(
-      wrap::cublasDdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasDdot, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *result) {
-  return DoBlasInternal(
-      wrap::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+  return DoBlasInternal(cublasCdotc, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *result) {
-  return DoBlasInternal(
-      wrap::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+  return DoBlasInternal(cublasZdotc, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *result) {
-  return DoBlasInternal(
-      wrap::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+  return DoBlasInternal(cublasCdotu, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *result) {
-  return DoBlasInternal(
-      wrap::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+  return DoBlasInternal(cublasZdotu, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &x, int incx,
                           DeviceMemory<float> *result) {
-  return DoBlasInternal(wrap::cublasSnrm2, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasSnrm2, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &x, int incx,
                           DeviceMemory<double> *result) {
-  return DoBlasInternal(wrap::cublasDnrm2, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasDnrm2, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
-  return DoBlasInternal(
-      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasScnrm2, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
-  return DoBlasInternal(
-      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasDznrm2, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *x, int incx,
                          DeviceMemory<float> *y, int incy, float c, float s) {
-  return DoBlasInternal(
-      wrap::cublasSrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+  return DoBlasInternal(cublasSrot, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *x, int incx,
                          DeviceMemory<double> *y, int incy, double c,
                          double s) {
-  return DoBlasInternal(
-      wrap::cublasDrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+  return DoBlasInternal(cublasDrot, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<float>> *x, int incx,
                          DeviceMemory<std::complex<float>> *y, int incy,
                          float c, float s) {
-  return DoBlasInternal(wrap::cublasCsrot, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+  return DoBlasInternal(cublasCsrot, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<std::complex<double>> *x, int incx,
                          DeviceMemory<std::complex<double>> *y, int incy,
                          double c, double s) {
-  return DoBlasInternal(wrap::cublasZdrot, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+  return DoBlasInternal(cublasZdrot, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
                           DeviceMemory<float> *b, DeviceMemory<float> *c,
                           DeviceMemory<float> *s) {
-  return DoBlasInternal(wrap::cublasSrotg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(a),
-                        CUDAMemoryMutable(b), CUDAMemoryMutable(c),
-                        CUDAMemoryMutable(s));
+  return DoBlasInternal(cublasSrotg, stream, false /* = pointer_mode_host */,
+                        GpuMemoryMutable(a), GpuMemoryMutable(b),
+                        GpuMemoryMutable(c), GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
                           DeviceMemory<double> *b, DeviceMemory<double> *c,
                           DeviceMemory<double> *s) {
-  return DoBlasInternal(wrap::cublasDrotg, stream,
-                        false /* = pointer_mode_host */,
-                        CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b),
-                        CUDAMemoryMutable(c), CUDAMemoryMutable(s));
+  return DoBlasInternal(cublasDrotg, stream, false /* = pointer_mode_host */,
+                        GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
+                        GpuMemoryMutable(c), GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
@@ -949,9 +676,9 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                           DeviceMemory<float> *c,
                           DeviceMemory<std::complex<float>> *s) {
   return DoBlasInternal(
-      wrap::cublasCrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      cublasCrotg, stream, false /* = pointer_mode_host */,
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
@@ -959,191 +686,185 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
                           DeviceMemory<double> *c,
                           DeviceMemory<std::complex<double>> *s) {
   return DoBlasInternal(
-      wrap::cublasZrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      cublasZrotg, stream, false /* = pointer_mode_host */,
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *x, int incx,
                           DeviceMemory<float> *y, int incy,
                           const DeviceMemory<float> &param) {
-  return DoBlasInternal(wrap::cublasSrotm, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+  return DoBlasInternal(cublasSrotm, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy, GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *x, int incx,
                           DeviceMemory<double> *y, int incy,
                           const DeviceMemory<double> &param) {
-  return DoBlasInternal(wrap::cublasDrotm, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+  return DoBlasInternal(cublasDrotm, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy, GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                            DeviceMemory<float> *d2, DeviceMemory<float> *x1,
                            const DeviceMemory<float> &y1,
                            DeviceMemory<float> *param) {
-  return DoBlasInternal(wrap::cublasSrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+  return DoBlasInternal(cublasSrotmg, stream, false /* = pointer_mode_host */,
+                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
+                        GpuMemoryMutable(x1), GpuMemory(y1),
+                        GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                            DeviceMemory<double> *d2, DeviceMemory<double> *x1,
                            const DeviceMemory<double> &y1,
                            DeviceMemory<double> *param) {
-  return DoBlasInternal(wrap::cublasDrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+  return DoBlasInternal(cublasDrotmg, stream, false /* = pointer_mode_host */,
+                        GpuMemoryMutable(d1), GpuMemoryMutable(d2),
+                        GpuMemoryMutable(x1), GpuMemory(y1),
+                        GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<float> *x, int incx) {
-  return DoBlasInternal(wrap::cublasSscal, stream,
-                        true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasSscal, stream, true /* = pointer_mode_host */,
+                        elem_count, &alpha, GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(wrap::cublasDscal, stream,
-                        true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasDscal, stream, true /* = pointer_mode_host */,
+                        elem_count, &alpha, GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasCsscal, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasZdscal, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           std::complex<float> alpha,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasCscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasCscal, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           std::complex<double> alpha,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasZscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasZscal, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(&alpha),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *x, int incx,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(wrap::cublasSswap, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSswap, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *x, int incx,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(wrap::cublasDswap, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDswap, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuMemoryMutable(x), incx,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *x, int incx,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasCswap, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasCswap, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *x, int incx,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(wrap::cublasZswap, stream,
-                        true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZswap, stream, true /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<float> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(wrap::cublasIsamax, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIsamax, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<double> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(wrap::cublasIdamax, stream,
-                        false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIdamax, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuMemory(x), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIcamax, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIzamax, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<float> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIsamin, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<double> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIdamin, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIcamin, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
-  return DoBlasInternal(
-      wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+  return DoBlasInternal(cublasIzamin, stream, false /* = pointer_mode_host */,
+                        elem_count, GpuComplex(GpuMemory(x)), incx,
+                        GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1151,10 +872,10 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &x, int incx, float beta,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasSgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSgbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
+                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1162,10 +883,10 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<double> &a, int lda,
                           const DeviceMemory<double> &x, int incx, double beta,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasDgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDgbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
+                        GpuMemory(a), lda, GpuMemory(x), incx, &beta,
+                        GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1175,11 +896,11 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasCgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasCgbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, kl, ku,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1189,31 +910,31 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasZgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZgbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, kl, ku,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(x)), incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           uint64 n, float alpha, const DeviceMemory<float> &a,
                           int lda, const DeviceMemory<float> &x, int incx,
                           float beta, DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasSgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSgemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           uint64 n, double alpha, const DeviceMemory<double> &a,
                           int lda, const DeviceMemory<double> &x, int incx,
                           double beta, DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasDgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDgemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1222,11 +943,11 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasCgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasCgemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1235,29 +956,29 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasZgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZgemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                          const DeviceMemory<float> &x, int incx,
                          const DeviceMemory<float> &y, int incy,
                          DeviceMemory<float> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasSger, stream, true /* = pointer_mode_host */, m,
+                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
+                        GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                          const DeviceMemory<double> &x, int incx,
                          const DeviceMemory<double> &y, int incy,
                          DeviceMemory<double> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasDger, stream, true /* = pointer_mode_host */, m,
+                        n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
+                        GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1265,10 +986,10 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasCgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasCgerc, stream, true /* = pointer_mode_host */, m,
+                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1276,10 +997,10 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasZgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasZgerc, stream, true /* = pointer_mode_host */, m,
+                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1287,10 +1008,10 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasCgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasCgeru, stream, true /* = pointer_mode_host */, m,
+                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1298,10 +1019,10 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasZgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasZgeru, stream, true /* = pointer_mode_host */, m,
+                        n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1310,11 +1031,11 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasChbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasChbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1323,11 +1044,11 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasZhbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZhbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1336,11 +1057,11 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasChemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasChemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1349,31 +1070,31 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasZhemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZhemv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<std::complex<float>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasCher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasCher, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha,
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<std::complex<double>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasZher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasZher, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha,
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1381,11 +1102,11 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasCher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasCher2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1393,11 +1114,11 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *a, int lda) {
-  return DoBlasInternal(
-      wrap::cublasZher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+  return DoBlasInternal(cublasZher2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemory(y)), incy,
+                        GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1406,11 +1127,11 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasChpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasChpmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1419,31 +1140,31 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasZhpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+  return DoBlasInternal(cublasZhpmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
+                        incx, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha,
                          const DeviceMemory<std::complex<float>> &x, int incx,
                          DeviceMemory<std::complex<float>> *ap) {
-  return DoBlasInternal(
-      wrap::cublasChpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+  return DoBlasInternal(cublasChpr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha,
                          const DeviceMemory<std::complex<double>> &x, int incx,
                          DeviceMemory<std::complex<double>> *ap) {
-  return DoBlasInternal(
-      wrap::cublasZhpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+  return DoBlasInternal(cublasZhpr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1452,10 +1173,9 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<float>> &y, int incy,
                           DeviceMemory<std::complex<float>> *ap) {
   return DoBlasInternal(
-      wrap::cublasChpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      cublasChpr2, stream, true /* = pointer_mode_host */,
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
+      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1464,168 +1184,153 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           const DeviceMemory<std::complex<double>> &y, int incy,
                           DeviceMemory<std::complex<double>> *ap) {
   return DoBlasInternal(
-      wrap::cublasZhpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      cublasZhpr2, stream, true /* = pointer_mode_host */,
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha), GpuComplex(GpuMemory(x)),
+      incx, GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           uint64 k, float alpha, const DeviceMemory<float> &a,
                           int lda, const DeviceMemory<float> &x, int incx,
                           float beta, DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasSsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSsbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
+                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           uint64 k, double alpha, const DeviceMemory<double> &a,
                           int lda, const DeviceMemory<double> &x, int incx,
                           double beta, DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(
-      wrap::cublasDsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDsbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
+                        lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
+                        incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float alpha, const DeviceMemory<float> &ap,
                           const DeviceMemory<float> &x, int incx, float beta,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(wrap::cublasSspmv, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSspmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double alpha, const DeviceMemory<double> &ap,
                           const DeviceMemory<double> &x, int incx, double beta,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(wrap::cublasDspmv, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDspmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *ap) {
-  return DoBlasInternal(wrap::cublasSspr, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+  return DoBlasInternal(cublasSspr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *ap) {
-  return DoBlasInternal(wrap::cublasDspr, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+  return DoBlasInternal(cublasDspr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float alpha, const DeviceMemory<float> &x, int incx,
                           const DeviceMemory<float> &y, int incy,
                           DeviceMemory<float> *ap) {
-  return DoBlasInternal(wrap::cublasSspr2, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+  return DoBlasInternal(cublasSspr2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double alpha, const DeviceMemory<double> &x, int incx,
                           const DeviceMemory<double> &y, int incy,
                           DeviceMemory<double> *ap) {
-  return DoBlasInternal(wrap::cublasDspr2, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+  return DoBlasInternal(cublasDspr2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &x, int incx, float beta,
                           DeviceMemory<float> *y, int incy) {
-  return DoBlasInternal(wrap::cublasSsymv, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasSsymv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double alpha, const DeviceMemory<double> &a, int lda,
                           const DeviceMemory<double> &x, int incx, double beta,
                           DeviceMemory<double> *y, int incy) {
-  return DoBlasInternal(wrap::cublasDsymv, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+  return DoBlasInternal(cublasDsymv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          float alpha, const DeviceMemory<float> &x, int incx,
                          DeviceMemory<float> *a, int lda) {
-  return DoBlasInternal(wrap::cublasSsyr, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasSsyr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          double alpha, const DeviceMemory<double> &x, int incx,
                          DeviceMemory<double> *a, int lda) {
-  return DoBlasInternal(wrap::cublasDsyr, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasDsyr, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float alpha, const DeviceMemory<float> &x, int incx,
                           const DeviceMemory<float> &y, int incy,
                           DeviceMemory<float> *a, int lda) {
-  return DoBlasInternal(wrap::cublasSsyr2, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasSsyr2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double alpha, const DeviceMemory<double> &x, int incx,
                           const DeviceMemory<double> &y, int incy,
                           DeviceMemory<double> *a, int lda) {
-  return DoBlasInternal(wrap::cublasDsyr2, stream,
-                        true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+  return DoBlasInternal(cublasDsyr2, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           uint64 k, const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *x, int incx) {
-  return DoBlasInternal(wrap::cublasStbmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasStbmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           uint64 k, const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(wrap::cublasDtbmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasDtbmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1633,11 +1338,10 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                           uint64 k, const DeviceMemory<std::complex<float>> &a,
                           int lda, DeviceMemory<std::complex<float>> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasCtbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasCtbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1645,33 +1349,30 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
                           uint64 k, const DeviceMemory<std::complex<double>> &a,
                           int lda, DeviceMemory<std::complex<double>> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasZtbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasZtbmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           uint64 k, const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *x, int incx) {
-  return DoBlasInternal(wrap::cublasStbsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasStbsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           uint64 k, const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(wrap::cublasDtbsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasDtbsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1679,11 +1380,10 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                           uint64 k, const DeviceMemory<std::complex<float>> &a,
                           int lda, DeviceMemory<std::complex<float>> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasCtbsv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasCtbsv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1691,183 +1391,170 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
                           uint64 k, const DeviceMemory<std::complex<double>> &a,
                           int lda, DeviceMemory<std::complex<double>> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasZtbsv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+  return DoBlasInternal(cublasZtbsv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasStpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasStpmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<double> &ap,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasDtpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasDtpmv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<float>> &ap,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasCtpmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCtpmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<double>> &ap,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasZtpmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZtpmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<float> &ap, DeviceMemory<float> *x,
                           int incx) {
-  return DoBlasInternal(
-      wrap::cublasStpsv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasStpsv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<double> &ap,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(
-      wrap::cublasDtpsv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+  return DoBlasInternal(cublasDtpsv, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
+                        CUDABlasDiagonal(diag), n, GpuMemory(ap),
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<float>> &ap,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasCtpsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<double>> &ap,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasZtpsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZtpsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *x, int incx) {
-  return DoBlasInternal(wrap::cublasStrmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasStrmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(wrap::cublasDtrmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasDtrmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasCtrmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCtrmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasZtrmv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZtrmv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *x, int incx) {
-  return DoBlasInternal(wrap::cublasStrsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasStrsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *x, int incx) {
-  return DoBlasInternal(wrap::cublasDtrsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasDtrsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           DeviceMemory<std::complex<float>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasCtrsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCtrsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, blas::Diagonal diag, uint64 n,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           DeviceMemory<std::complex<double>> *x, int incx) {
-  return DoBlasInternal(wrap::cublasZtrsv, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZtrsv, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasGemm(
@@ -1919,11 +1606,11 @@ bool CUDABlas::DoBlasGemm(
 #endif
 
   return DoBlasInternalImpl(
-      wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
+      cublasSgemmEx, stream, true /* = pointer_mode_host */,
       true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
-      CUDABlasTranspose(transb), m, n, k, &alpha, CUDAMemory(a),
-      SE_CUDA_DATA_HALF, lda, CUDAMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
-      CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
+      CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
+      SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
+      GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
 
 #else
   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
@@ -1965,10 +1652,10 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                       "precondition violation";
     }
   }
-  return DoBlasInternal(
-      wrap::cublasSgemm, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasSgemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
+                        n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
+                        &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1976,10 +1663,10 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                           double alpha, const DeviceMemory<double> &a, int lda,
                           const DeviceMemory<double> &b, int ldb, double beta,
                           DeviceMemory<double> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasDgemm, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasDgemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
+                        n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
+                        &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1989,12 +1676,11 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasCgemm, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
+                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -2004,12 +1690,11 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
                           const DeviceMemory<std::complex<double>> &b, int ldb,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasZgemm, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasZgemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
+                        n, k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemvWithProfiling(
@@ -2116,10 +1801,10 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2129,9 +1814,9 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
       DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2148,10 +1833,10 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2161,9 +1846,9 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
                            ldb, beta, c, ldc);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2238,13 +1923,13 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
                  "output_profile_result was given, but we were unable to "
-                 "create a CUDATimer.";
+                 "create a GpuTimer.";
       return false;
     }
   }
@@ -2268,21 +1953,21 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   // essentially reinterpet_cast to __half, which is safe because Eigen::half
   // inherits from __half.
   bool result = DoBlasInternalFailureOK(
-      wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
+      cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(),
-      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb,
-      beta.is_pointer() ? CUDAMemory(beta.pointer()) : &beta.value(),
-      CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
+      alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
+      GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
+      beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
+      GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
-                 "CUDATimer.";
+                 "GpuTimer.";
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2329,7 +2014,7 @@ bool CUDABlas::GetBlasGemmAlgorithms(
     CUBLAS_GEMM_ALGO3_TENSOR_OP,
     CUBLAS_GEMM_ALGO4_TENSOR_OP,
 #endif
-#if CUDA_VERSION >= 9200
+#if CUDA_VERSION >= 9020
     CUBLAS_GEMM_ALGO18,
     CUBLAS_GEMM_ALGO19,
     CUBLAS_GEMM_ALGO20,
@@ -2470,7 +2155,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
   }
 
-  typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
+  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;
 
   const size_t size = batch_count * sizeof(CUDA_T *);
 
@@ -2535,14 +2220,14 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     cudaDataType_t compute_type =
         (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
     const void **a_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(a)));
+        const_cast<const CUDA_T **>(GpuMemory(a)));
     const void **b_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(b)));
+        const_cast<const CUDA_T **>(GpuMemory(b)));
     void **c_void_ptrs =
-        reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
     bool ok;
     ok = DoBlasInternalImpl(
-        wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
+        cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
         true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
         b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
@@ -2559,9 +2244,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     bool ok = DoBlasInternal(
         cublas_func, stream, true /* = pointer_mode_host */,
         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-        CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
-        const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-        const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
+        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
+        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
     if (ok) {
       return port::Status::OK();
     }
@@ -2594,8 +2279,8 @@ bool CUDABlas::DoBlasGemmBatched(
   // Note: The func passed here (cublasSgemmBatched) is not actually called,
   // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
   port::Status status = DoBlasGemmBatchedInternal(
-      wrap::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
-      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
+      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
@@ -2610,8 +2295,8 @@ bool CUDABlas::DoBlasGemmBatched(
     const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
     int batch_count, ScratchAllocator *scratch_allocator) {
   port::Status status = DoBlasGemmBatchedInternal(
-      wrap::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
-      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+      cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
+      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
@@ -2626,8 +2311,8 @@ bool CUDABlas::DoBlasGemmBatched(
     double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   port::Status status = DoBlasGemmBatchedInternal(
-      wrap::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
-      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+      cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
+      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
@@ -2644,8 +2329,8 @@ bool CUDABlas::DoBlasGemmBatched(
     const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   port::Status status = DoBlasGemmBatchedInternal(
-      wrap::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
-      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+      cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
+      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
@@ -2662,8 +2347,8 @@ bool CUDABlas::DoBlasGemmBatched(
     const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
     int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
   port::Status status = DoBlasGemmBatchedInternal(
-      wrap::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
-      lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+      cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
+      b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
@@ -2690,12 +2375,12 @@ bool CUDABlas::DoBlasGemmStridedBatched(
       cublasGemmAlgo_t algo =
           (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
       bool ok = DoBlasInternalImpl(
-          wrap::cublasGemmStridedBatchedEx, stream,
-          true /* = pointer_mode_host */, true /* = err_on_failure */,
-          use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
-          m, n, k, &alpha, CUDAMemory(a), CUDA_R_16F, lda, stride_a,
-          CUDAMemory(b), CUDA_R_16F, ldb, stride_b, &beta, CUDAMemoryMutable(c),
-          CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
+          cublasGemmStridedBatchedEx, stream, true /* = pointer_mode_host */,
+          true /* = err_on_failure */, use_tensor_ops,
+          CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
+          GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F,
+          ldb, stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
+          batch_count, CUDA_R_32F, algo);
       if (ok) {
         return true;
       }
@@ -2708,13 +2393,13 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
   for (int batch = 0; batch < batch_count; ++batch) {
     const auto *a_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(a) + batch * stride_a);
+        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
     const auto *b_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(b) + batch * stride_b);
+        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
     auto *c_matrix =
-        reinterpret_cast<__half *>(CUDAMemoryMutable(c) + batch * stride_c);
+        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
     bool ok = DoBlasInternalImpl(
-        wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
+        cublasSgemmEx, stream, true /* = pointer_mode_host */,
         true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
         lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
@@ -2734,10 +2419,10 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
     int batch_count) {
   return DoBlasInternal(
-      wrap::cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
+      cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2747,10 +2432,10 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
     int batch_count) {
   return DoBlasInternal(
-      wrap::cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
+      cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2761,11 +2446,11 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
     int64 stride_c, int batch_count) {
   return DoBlasInternal(
-      wrap::cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
+      cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2776,11 +2461,11 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
     int64 stride_c, int batch_count) {
   return DoBlasInternal(
-      wrap::cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
+      cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2790,11 +2475,11 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasChemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasChemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2804,11 +2489,11 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           const DeviceMemory<std::complex<double>> &b, int ldb,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasZhemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasZhemm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2817,11 +2502,10 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           float beta, DeviceMemory<std::complex<float>> *c,
                           int ldc) {
-  return DoBlasInternal(wrap::cublasCherk, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCherk, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2830,11 +2514,10 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           double beta, DeviceMemory<std::complex<double>> *c,
                           int ldc) {
-  return DoBlasInternal(wrap::cublasZherk, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZherk, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2844,12 +2527,11 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                            const DeviceMemory<std::complex<float>> &b, int ldb,
                            float beta, DeviceMemory<std::complex<float>> *c,
                            int ldc) {
-  return DoBlasInternal(wrap::cublasCher2k, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCher2k, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2859,12 +2541,11 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
                            const DeviceMemory<std::complex<double>> &b, int ldb,
                            double beta, DeviceMemory<std::complex<double>> *c,
                            int ldc) {
-  return DoBlasInternal(wrap::cublasZher2k, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZher2k, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2872,10 +2553,10 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           float alpha, const DeviceMemory<float> &a, int lda,
                           const DeviceMemory<float> &b, int ldb, float beta,
                           DeviceMemory<float> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasSsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasSsymm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2883,10 +2564,10 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           double alpha, const DeviceMemory<double> &a, int lda,
                           const DeviceMemory<double> &b, int ldb, double beta,
                           DeviceMemory<double> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasDsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasDsymm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2896,11 +2577,11 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           const DeviceMemory<std::complex<float>> &b, int ldb,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasCsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasCsymm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2910,31 +2591,31 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           const DeviceMemory<std::complex<double>> &b, int ldb,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasZsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasZsymm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           float alpha, const DeviceMemory<float> &a, int lda,
                           float beta, DeviceMemory<float> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasSsyrk, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasSsyrk, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, &alpha, GpuMemory(a), lda, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                           blas::Transpose trans, uint64 n, uint64 k,
                           double alpha, const DeviceMemory<double> &a, int lda,
                           double beta, DeviceMemory<double> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasDsyrk, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasDsyrk, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, &alpha, GpuMemory(a), lda, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2943,11 +2624,11 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           std::complex<float> beta,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasCsyrk, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasCsyrk, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
+                        ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2956,11 +2637,11 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           std::complex<double> beta,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasZsyrk, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+  return DoBlasInternal(cublasZsyrk, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)),
+                        ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2968,10 +2649,10 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                            float alpha, const DeviceMemory<float> &a, int lda,
                            const DeviceMemory<float> &b, int ldb, float beta,
                            DeviceMemory<float> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasSsyr2k, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasSsyr2k, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2979,10 +2660,10 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                            double alpha, const DeviceMemory<double> &a, int lda,
                            const DeviceMemory<double> &b, int ldb, double beta,
                            DeviceMemory<double> *c, int ldc) {
-  return DoBlasInternal(
-      wrap::cublasDsyr2k, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+  return DoBlasInternal(cublasDsyr2k, stream, true /* = pointer_mode_host */,
+                        CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
+                        k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
+                        GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2992,12 +2673,11 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                            const DeviceMemory<std::complex<float>> &b, int ldb,
                            std::complex<float> beta,
                            DeviceMemory<std::complex<float>> *c, int ldc) {
-  return DoBlasInternal(wrap::cublasCsyr2k, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasCsyr2k, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -3007,12 +2687,11 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
                            const DeviceMemory<std::complex<double>> &b, int ldb,
                            std::complex<double> beta,
                            DeviceMemory<std::complex<double>> *c, int ldc) {
-  return DoBlasInternal(wrap::cublasZsyr2k, stream,
-                        true /* = pointer_mode_host */,
+  return DoBlasInternal(cublasZsyr2k, stream, true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3020,11 +2699,11 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                           const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasStrmm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+  return DoBlasInternal(cublasStrmm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
+                        GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3032,11 +2711,11 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                           const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasDtrmm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+  return DoBlasInternal(cublasDtrmm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
+                        GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3045,12 +2724,12 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           DeviceMemory<std::complex<float>> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasCtrmm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+  return DoBlasInternal(cublasCtrmm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemoryMutable(b)), ldb,
+                        GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3059,12 +2738,12 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           DeviceMemory<std::complex<double>> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasZtrmm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+  return DoBlasInternal(cublasZtrmm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemoryMutable(b)), ldb,
+                        GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3072,11 +2751,10 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                           blas::Diagonal diag, uint64 m, uint64 n, float alpha,
                           const DeviceMemory<float> &a, int lda,
                           DeviceMemory<float> *b, int ldb) {
-  return DoBlasInternal(wrap::cublasStrsm, stream,
-                        true /* = pointer_mode_host */, CUDABlasSide(side),
-                        CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+  return DoBlasInternal(cublasStrsm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3084,11 +2762,10 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                           blas::Diagonal diag, uint64 m, uint64 n, double alpha,
                           const DeviceMemory<double> &a, int lda,
                           DeviceMemory<double> *b, int ldb) {
-  return DoBlasInternal(wrap::cublasDtrsm, stream,
-                        true /* = pointer_mode_host */, CUDABlasSide(side),
-                        CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+  return DoBlasInternal(cublasDtrsm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        &alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3097,11 +2774,11 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                           std::complex<float> alpha,
                           const DeviceMemory<std::complex<float>> &a, int lda,
                           DeviceMemory<std::complex<float>> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasCtrsm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+  return DoBlasInternal(cublasCtrsm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3110,22 +2787,22 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
                           std::complex<double> alpha,
                           const DeviceMemory<std::complex<double>> &a, int lda,
                           DeviceMemory<std::complex<double>> *b, int ldb) {
-  return DoBlasInternal(
-      wrap::cublasZtrsm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+  return DoBlasInternal(cublasZtrsm, stream, true /* = pointer_mode_host */,
+                        CUDABlasSide(side), CUDABlasUpperLower(uplo),
+                        CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cublas() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
-          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
           [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuBLAS "
@@ -3133,7 +2810,7 @@ void initialize_cublas() {
               return nullptr;
             }
 
-            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
             if (!blas->Init()) {
               // Note: Init() will log a more specific error.
               delete blas;
@@ -3148,7 +2825,7 @@ void initialize_cublas() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
+      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 0fb05089d7530aa298a332e4e6c714eddd7799e9..63d03056d911fe807617f0987e751825248ae607 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -33,26 +33,26 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
 // Opaque and unique identifier for the cuBLAS plugin.
 extern const PluginId kCuBlasPlugin;
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // BLAS plugin for CUDA platform via cuBLAS library.
 //
 // This satisfies the platform-agnostic BlasSupport interface.
 //
 // Note that the cuBLAS handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuBLAS handle when a
 // CUDA context is active.
 //
 // Thread-safe post-initialization.
 class CUDABlas : public blas::BlasSupport {
  public:
-  explicit CUDABlas(CUDAExecutor *parent);
+  explicit CUDABlas(GpuExecutor *parent);
 
   // Allocates a cuBLAS handle.
   bool Init();
@@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
   // mutex that guards the cuBLAS handle for this device.
   mutex mu_;
 
-  // CUDAExecutor which instantiated this CUDABlas.
+  // GpuExecutor which instantiated this CUDABlas.
   // Immutable post-initialization.
-  CUDAExecutor *parent_;
+  GpuExecutor *parent_;
 
   // cuBLAS library handle on the device.
   cublasHandle_t blas_ GUARDED_BY(mu_);
@@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 6af71b6c9d194182e79decd3f1beeb96d8141974..e58ebee80da613a63e00d7627abf4e8f8c99bc5b 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -52,13 +52,6 @@ limitations under the License.
 namespace stream_executor {
 namespace cuda {
 
-#ifdef __APPLE__
-static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#elif !defined(PLATFORM_WINDOWS)
-static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
-#endif
-
-
 string DriverVersionToString(DriverVersion version) {
   return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
 }
@@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
   return result;
 }
 
+}  // namespace cuda
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
+#ifdef __APPLE__
+static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
+#elif !defined(PLATFORM_WINDOWS)
+static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
+#endif
+
 // -- class Diagnostician
 
 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
   }
   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
   LOG(INFO) << "libcuda reported version is: "
-            << DriverVersionStatusToString(dso_version);
+            << cuda::DriverVersionStatusToString(dso_version);
 
   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
   LOG(INFO) << "kernel reported version is: "
-	  << DriverVersionStatusToString(kernel_version);
+            << cuda::DriverVersionStatusToString(kernel_version);
 #endif
 
   // OS X kernel driver does not report version accurately
@@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
     }
     const size_t length = suffix_pos - start;
     const string version = path.substr(start, length);
-    result = StringToDriverVersion(version);
+    result = cuda::StringToDriverVersion(version);
   }
 #else
 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
-      *result = StringToDriverVersion(stripped_dso_version);
+      *result = cuda::StringToDriverVersion(stripped_dso_version);
       return 1;
     }
     return 0;
@@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
   auto stripped_kernel_version =
       port::StripSuffixString(kernel_version, ".ld64");
-  return StringToDriverVersion(stripped_kernel_version);
+  return cuda::StringToDriverVersion(stripped_kernel_version);
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
@@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
   if (kernel_version.ok() && dso_version.ok() &&
       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
     LOG(INFO) << "kernel version seems to match DSO: "
-              << DriverVersionToString(kernel_version.ValueOrDie());
+              << cuda::DriverVersionToString(kernel_version.ValueOrDie());
   } else {
     LOG(ERROR) << "kernel version "
-               << DriverVersionStatusToString(kernel_version)
+               << cuda::DriverVersionStatusToString(kernel_version)
                << " does not match DSO version "
-               << DriverVersionStatusToString(dso_version)
+               << cuda::DriverVersionStatusToString(dso_version)
                << " -- cannot find working devices in this configuration";
   }
 }
@@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
     // see
     // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
     if (version == NULL) {
-      return StringToDriverVersion("");
+      return cuda::StringToDriverVersion("");
     }
-    return StringToDriverVersion(version);
+    return cuda::StringToDriverVersion(version);
   }
   CFRelease(kext_infos);
   auto status = port::Status(
@@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 #endif
 }
 
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index f2db2eb20a18c671e055b910809dfde940a5e3f8..0837e136fd428570cb0d4ebddc85bedf66375f1a 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -16,17 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-#include <tuple>
-
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 
 namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
-using DriverVersion = std::tuple<int, int, int>;
+using DriverVersion = gpu::DriverVersion;
 
 // Converts a parsed driver version to string form.
 string DriverVersionToString(DriverVersion version);
@@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
 
 // Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
-port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
-
-class Diagnostician {
- public:
-  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
-  // not initializing).
-  //
-  // Note: if we're running on a machine that has no GPUs, we don't want to
-  // produce very much log spew beyond saying, "looks like there's no CUDA
-  // kernel
-  // module running".
-  //
-  // Note: we use non-Google-File:: API here because we may be called before
-  // InitGoogle has completed.
-  static void LogDiagnosticInformation();
-
-  // Given the driver version file contents, finds the kernel module version and
-  // returns it as a string.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
-      const string &driver_version_file_contents);
-
-  // Extracts the kernel driver version from the current host.
-  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
-
-  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
-  // driver-interfacing DSO version number. Returns it as a string.
-  static port::StatusOr<DriverVersion> FindDsoVersion();
-
-  // Logs information about the kernel driver version and userspace driver
-  // library version.
-  static void LogDriverVersionInformation();
-
- private:
-
-  // Given the DSO version number and the driver version file contents, extracts
-  // the driver version and compares, warning the user in the case of
-  // incompatibility.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static void WarnOnDsoKernelMismatch(
-      port::StatusOr<DriverVersion> dso_version,
-      port::StatusOr<DriverVersion> kernel_version);
-
-  // Logs information about the dev nodes present on this machine: their
-  // existence, permissions, accessibility from this uid/gid.
-  static void LogDevNodeDiagnosticInformation();
-
-  static string GetDevNodePath(int dev_node_ordinal);
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
-};
+using Diagnostician = gpu::Diagnostician;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 62df7e6eff83bc007aa061b372cb3dcf41876975..c0cc00c7208c4ba24be017145a9309caf464af4c 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -39,8 +38,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
-#include "tensorflow/stream_executor/logging.pb.h"
-#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
@@ -58,7 +55,7 @@ limitations under the License.
 #pragma clang diagnostic warning "-Wmismatched-tags"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
 
@@ -82,17 +79,6 @@ static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
     }                                                                    \
   } while (false)
 
-// Returns whether status is 'ok', and potentially logs the error.
-bool IsStatusOk(const port::Status& status, bool report_error) {
-  if (status.ok()) {
-    return true;
-  }
-  if (report_error) {
-    LOG(ERROR) << status.error_message();
-  }
-  return false;
-}
-
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -148,7 +134,7 @@ class CudnnHandle {
  public:
   // Takes ownership of the executor context and the lock to access cuDNN
   // using handle.
-  CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
+  CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
               cudnnHandle_t handle)
       : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
@@ -157,163 +143,13 @@ class CudnnHandle {
   cudnnHandle_t handle() const { return handle_; }
 
  private:
-  cuda::ScopedActivateExecutorContext context_;
+  gpu::ScopedActivateExecutorContext context_;
   mutex_lock lock_;
   cudnnHandle_t handle_;  // Not owned.
 };
 
 }  // namespace
 
-#ifdef PLATFORM_GOOGLE
-// This macro wraps a global identifier, given by __name, in a callable
-// structure that loads the DLL symbol out of the DSO handle in a thread-safe
-// manner on first use. This dynamic loading technique is used to avoid DSO
-// dependencies on vendor libraries which may or may not be available in the
-// deployed binary environment.
-#define STREAM_EXECUTOR_CUDNN_WRAP(__name)   \
-  struct WrapperShim__##__name {             \
-    template <typename... Args>              \
-    cudnnStatus_t operator()(Args... args) { \
-      return ::__name(args...);              \
-    }                                        \
-  } __name;
-
-#else
-#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                                \
-  struct DynLoadShim__##__name {                                          \
-    static const char* kName;                                             \
-    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void* GetDsoHandle() {                                         \
-      auto s = internal::CachedDsoLoader::GetCudnnDsoHandle();            \
-      return s.ValueOrDie();                                              \
-    }                                                                     \
-    static FuncPtrT LoadOrDie() {                                         \
-      void* f;                                                            \
-      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
-                                                          kName, &f);     \
-      CHECK(s.ok()) << "could not find " << kName                         \
-                    << " in cudnn DSO; dlerror: " << s.error_message();   \
-      return reinterpret_cast<FuncPtrT>(f);                               \
-    }                                                                     \
-    static FuncPtrT DynLoad() {                                           \
-      static FuncPtrT f = LoadOrDie();                                    \
-      return f;                                                           \
-    }                                                                     \
-    template <typename... Args>                                           \
-    cudnnStatus_t operator()(Args... args) {                              \
-      return DynLoad()(args...);                                          \
-    }                                                                     \
-  } __name;                                                               \
-  const char* DynLoadShim__##__name::kName = #__name;
-#endif
-
-// clang-format off
-#define CUDNN_ROUTINE_EACH_V7000_UNDER(__macro)               \
-  __macro(cudnnActivationForward)                             \
-  __macro(cudnnAddTensor)                                     \
-  __macro(cudnnBatchNormalizationBackward)                    \
-  __macro(cudnnBatchNormalizationForwardInference)            \
-  __macro(cudnnBatchNormalizationForwardTraining)             \
-  __macro(cudnnConvolutionBackwardBias)                       \
-  __macro(cudnnConvolutionBackwardData)                       \
-  __macro(cudnnConvolutionBackwardFilter)                     \
-  __macro(cudnnConvolutionBiasActivationForward)              \
-  __macro(cudnnConvolutionForward)                            \
-  __macro(cudnnCreate)                                        \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnCreateConvolutionDescriptor)                   \
-  __macro(cudnnCreateDropoutDescriptor)                       \
-  __macro(cudnnCreateFilterDescriptor)                        \
-  __macro(cudnnCreateLRNDescriptor)                           \
-  __macro(cudnnCreatePersistentRNNPlan)                       \
-  __macro(cudnnCreatePoolingDescriptor)                       \
-  __macro(cudnnCreateRNNDescriptor)                           \
-  __macro(cudnnCreateTensorDescriptor)                        \
-  __macro(cudnnDestroy)                                       \
-  __macro(cudnnDestroyActivationDescriptor)                   \
-  __macro(cudnnDestroyConvolutionDescriptor)                  \
-  __macro(cudnnDestroyDropoutDescriptor)                      \
-  __macro(cudnnDestroyFilterDescriptor)                       \
-  __macro(cudnnDestroyLRNDescriptor)                          \
-  __macro(cudnnDestroyPersistentRNNPlan)                      \
-  __macro(cudnnDestroyPoolingDescriptor)                      \
-  __macro(cudnnDestroyRNNDescriptor)                          \
-  __macro(cudnnDestroyTensorDescriptor)                       \
-  __macro(cudnnDropoutGetStatesSize)                          \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)       \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionForwardAlgorithm)                \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)            \
-  __macro(cudnnGetConvolutionNdDescriptor)                    \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)              \
-  __macro(cudnnGetFilterNdDescriptor)                         \
-  __macro(cudnnGetProperty)                                   \
-  __macro(cudnnGetRNNLinLayerBiasParams)                      \
-  __macro(cudnnGetRNNLinLayerMatrixParams)                    \
-  __macro(cudnnGetRNNParamsSize)                              \
-  __macro(cudnnGetRNNTrainingReserveSize)                     \
-  __macro(cudnnGetRNNWorkspaceSize)                           \
-  __macro(cudnnLRNCrossChannelBackward)                       \
-  __macro(cudnnLRNCrossChannelForward)                        \
-  __macro(cudnnPoolingBackward)                               \
-  __macro(cudnnPoolingForward)                                \
-  __macro(cudnnRNNBackwardData)                               \
-  __macro(cudnnRNNBackwardWeights)                            \
-  __macro(cudnnRNNForwardInference)                           \
-  __macro(cudnnRNNForwardTraining)                            \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnSetConvolutionNdDescriptor)                    \
-  __macro(cudnnSetDropoutDescriptor)                          \
-  __macro(cudnnSetFilterNdDescriptor)                         \
-  __macro(cudnnSetLRNDescriptor)                              \
-  __macro(cudnnSetPersistentRNNPlan)                          \
-  __macro(cudnnSetPoolingNdDescriptor)                        \
-  __macro(cudnnSetRNNDescriptor)                              \
-  __macro(cudnnSetRNNDescriptor_v6)                           \
-  __macro(cudnnSetStream)                                     \
-  __macro(cudnnSetTensor4dDescriptor)                         \
-  __macro(cudnnSetTensorNdDescriptor)                         \
-  __macro(cudnnTransformTensor)
-
-// clang-format on
-
-CUDNN_ROUTINE_EACH_V7000_UNDER(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_ROUTINE_EACH_V7000_UNDER
-
-#if CUDNN_VERSION >= 7000
-// clang-format off
-#define CUDNN_ROUTINE_EACH_V7000(__macro)                    \
-  __macro(cudnnSetRNNMatrixMathType)                         \
-  __macro(cudnnSetConvolutionMathType)                       \
-  __macro(cudnnSetConvolutionGroupCount)
-
-// clang-format on
-
-CUDNN_ROUTINE_EACH_V7000(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_ROUTINE_EACH_V7000
-#endif
-
-#if CUDNN_VERSION >= 7201
-// clang-format off
-#define CUDNN_ROUTINE_EACH_V7210(__macro)                     \
-  __macro(cudnnCreateRNNDataDescriptor)                       \
-  __macro(cudnnDestroyRNNDataDescriptor)                      \
-  __macro(cudnnRNNBackwardDataEx)                             \
-  __macro(cudnnRNNBackwardWeightsEx)                          \
-  __macro(cudnnRNNForwardInferenceEx)                         \
-  __macro(cudnnRNNForwardTrainingEx)                          \
-  __macro(cudnnSetRNNDataDescriptor)                          \
-  __macro(cudnnSetRNNPaddingMode)
-
-// clang-format on
-
-CUDNN_ROUTINE_EACH_V7210(STREAM_EXECUTOR_CUDNN_WRAP)
-#undef CUDNN_ROUTINE_EACH_V7210
-#endif
-
 // Wraps a cuDNN handle and provides access to it through CudnnHandle
 // instances, which also locks a mutex, acquires the CUDA context, and sets
 // the stream that cuDNN should use to enqueue any work.
@@ -345,11 +181,11 @@ class CudnnAccess {
   // The legacy default stream synchronizes with all other streams and it is
   // therefore a bad idea (performance wise) to call any cuDNN APIs that
   // enqueue work in the stream.
-  CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
+  CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
     mutex_lock lock(mutex_);
-    cuda::ScopedActivateExecutorContext context(executor);
-    CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
-    auto status = cudnnSetStream(handle_, cu_stream);
+    gpu::ScopedActivateExecutorContext context(executor);
+    CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
+    const auto status = cudnnSetStream(handle_, cu_stream);
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
     return CudnnHandle(std::move(context), std::move(lock), handle_);
   }
@@ -459,12 +295,12 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 
 }  // namespace
 
-CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
+CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}
 
 port::Status CudnnSupport::Init() {
   ScopedActivateExecutorContext context(parent_);
   cudnnHandle_t cudnn_handle = nullptr;
-  auto status = cudnnCreate(&cudnn_handle);
+  const auto status = cudnnCreate(&cudnn_handle);
   if (status == CUDNN_STATUS_SUCCESS) {
     CudnnVersion source_version(CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
 
@@ -492,14 +328,14 @@ port::Status CudnnSupport::Init() {
   CHECK_EQ(cudnn_handle, nullptr);
   LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
   if (status == CUDNN_STATUS_NOT_INITIALIZED) {
-    auto result = cuda::Diagnostician::FindKernelDriverVersion();
+    auto result = gpu::Diagnostician::FindKernelDriverVersion();
     if (!result.ok()) {
       LOG(ERROR) << "Error retrieving driver version: "
-                 << DriverVersionStatusToString(result);
+                 << cuda::DriverVersionStatusToString(result);
     } else {
       const auto& version = result.ValueOrDie();
       LOG(ERROR) << "Possibly insufficient driver version: "
-                 << DriverVersionToString(version);
+                 << cuda::DriverVersionToString(version);
     }
   }
 
@@ -795,7 +631,6 @@ bool BatchnormSpatialPersistentEnabled() {
 }
 
 // A helper function to decide whether to enable deterministic functionality.
-// TODO(pr/24355): Support all cuDNN functionality (currently only convolution).
 bool RequireDeterminism() {
   static bool is_enabled = [] {
     bool is_enabled = false;
@@ -898,10 +733,13 @@ class CudnnPoolingDescriptor {
     std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                    &CheckedNarrowing<int64, int>);
     bool propagate_nans = pooling_descriptor.propagate_nans();
+    const auto cudnn_max_pooling_mode = RequireDeterminism()
+                                            ? CUDNN_POOLING_MAX_DETERMINISTIC
+                                            : CUDNN_POOLING_MAX;
     CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
         handle_.get(),
         (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
-             ? CUDNN_POOLING_MAX
+             ? cudnn_max_pooling_mode
              : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
         propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN, nd,
         shape.data(), padding.data(), strides.data()));
@@ -1162,7 +1000,7 @@ class CudnnRnnParamsDescriptor {
 }  // namespace
 
 class CudnnRnnDescriptor : public dnn::RnnDescriptor {
-  CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
+  CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
                      PersistentRnnPlan rnn_plan, int num_layers,
                      int hidden_size, int input_size, int batch_size,
                      cudnnRNNInputMode_t input_mode,
@@ -1202,7 +1040,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
         CudnnDropoutDescriptor dropout_desc,
         CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
 
-    cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
+    gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
     cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
 
     // TODO: allow the user to choose an algorithm.
@@ -1293,7 +1131,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
   }
 
  private:
-  cuda::RnnDescriptor rnn_desc_;
+  gpu::RnnDescriptor rnn_desc_;
   PersistentRnnPlan rnn_plan_;
   int num_layers_;
   int hidden_size_;
@@ -1412,15 +1250,14 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
 
 class CudnnRnnSequenceTensorDescriptor
     : public dnn::RnnSequenceTensorDescriptor {
-  CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int max_seq_length,
+  CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
                                    int batch_size, int data_size,
                                    cudnnDataType_t data_type,
 #if CUDNN_VERSION >= 7201
                                    RNNDataDescriptor data_handle,
 #endif
                                    TensorDescriptor handle)
-      : parent_(parent),
-        max_seq_length_(max_seq_length),
+      : max_seq_length_(max_seq_length),
         batch_size_(batch_size),
         data_size_(data_size),
         data_type_(data_type),
@@ -1436,7 +1273,7 @@ class CudnnRnnSequenceTensorDescriptor
       default;
 
   static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
-      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
       cudnnDataType_t data_type) {
     CHECK_GT(max_seq_length, 0);
     int dims[] = {batch_size, data_size, 1};
@@ -1455,8 +1292,9 @@ class CudnnRnnSequenceTensorDescriptor
   }
 
   static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
-      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
-      const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
+      const absl::Span<const int>& seq_lengths, bool time_major,
+      cudnnDataType_t data_type) {
 #if CUDNN_VERSION >= 7201
     CHECK_GT(max_seq_length, 0);
     int dims[] = {batch_size, data_size, 1};
@@ -1469,9 +1307,15 @@ class CudnnRnnSequenceTensorDescriptor
     const int* seq_lengths_array = seq_lengths.data();
     RNNDataDescriptor data_desc = CreateRNNDataDescriptor();
     float padding_fill = 0.0f;
+    cudnnRNNDataLayout_t layout;
+    if (time_major) {
+      layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED;
+    } else {
+      layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED;
+    }
     RETURN_IF_CUDNN_ERROR(cudnnSetRNNDataDescriptor(
         /*RNNDataDesc=*/data_desc.get(), /*dataType*/ data_type,
-        /*layout=*/CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
+        /*layout=*/layout,
         /*maxSeqLength=*/max_seq_length,
         /*batchSize=*/batch_size, /*vectorSize=*/data_size,
         /*seqLengthArray=*/seq_lengths_array,
@@ -1507,7 +1351,6 @@ class CudnnRnnSequenceTensorDescriptor
   }
 
  private:
-  CUDAExecutor* parent_;
   int max_seq_length_;
   int batch_size_;
   int data_size_;
@@ -1522,11 +1365,10 @@ class CudnnRnnSequenceTensorDescriptor
 
 class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
  public:
-  CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
+  CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
                                 int batch_size, int data_size,
                                 cudnnDataType_t data_type)
-      : parent_(parent),
-        handle_(CreateTensorDescriptor()),
+      : handle_(CreateTensorDescriptor()),
         num_layers_(num_layers),
         batch_size_(batch_size),
         data_size_(data_size),
@@ -1546,7 +1388,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
   int data_size() const { return data_size_; }
 
  private:
-  CUDAExecutor* parent_;
   TensorDescriptor handle_;
   int num_layers_;
   int batch_size_;
@@ -1710,14 +1551,14 @@ port::Status CudnnSupport::DoRnnForwardImpl(
     }
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -1802,7 +1643,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
   }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -1853,14 +1694,14 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
                       CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
                                          workspace_allocator));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -1959,7 +1800,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
   }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -2008,11 +1849,12 @@ CudnnSupport::createRnnSequenceTensorDescriptor(int max_seq_length,
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
 CudnnSupport::createRnnSequenceTensorDescriptor(
     int max_seq_length, int batch_size, int data_size,
-    const absl::Span<const int>& seq_lengths, dnn::DataType data_type) {
+    const absl::Span<const int>& seq_lengths, bool time_major,
+    dnn::DataType data_type) {
   SE_ASSIGN_OR_RETURN(CudnnRnnSequenceTensorDescriptor descriptor,
                       CudnnRnnSequenceTensorDescriptor::Create(
                           parent_, max_seq_length, batch_size, data_size,
-                          seq_lengths, ToCudnnDataType(data_type)));
+                          seq_lengths, time_major, ToCudnnDataType(data_type)));
   return std::unique_ptr<dnn::RnnSequenceTensorDescriptor>(
       new CudnnRnnSequenceTensorDescriptor(std::move(descriptor)));
 }
@@ -2519,7 +2361,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
     algo_desc = dnn::AlgorithmDesc(algo, /*use_tensor_ops=*/true);
   }
 
-  auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
+  const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
       stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc,
       scratch_allocator);
 
@@ -2568,7 +2410,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
     algo_desc = dnn::AlgorithmDesc(algo, /*use_tensor_ops=*/true);
   }
 
-  auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
+  const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
       stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc,
       scratch_allocator);
 
@@ -2778,65 +2620,6 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 }
 #endif
 
-template <class ElementType>
-dnn::ConvolutionProto GenerateConvProto(
-    dnn::ConvolutionKind kind, const dnn::BatchDescriptor& input_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const dnn::BatchDescriptor& output_descriptor, dnn::AlgorithmDesc algorithm,
-    const dnn::ConvolutionDescriptor& convolution_descriptor, double conv_scale,
-    double side_value_scale, dnn::DataType acc_type,
-    dnn::ActivationMode activation) {
-  dnn::ConvolutionProto conv_config;
-  auto element_type = dnn::ToDataType<ElementType>::value;
-
-  conv_config.set_kind(kind);
-  *conv_config.mutable_input() = input_descriptor.ToProto(element_type);
-  *conv_config.mutable_filter() = filter_descriptor.ToProto(element_type);
-  *conv_config.mutable_output() = output_descriptor.ToProto(element_type);
-  *conv_config.mutable_algorithm() = algorithm.ToProto();
-  *conv_config.mutable_conv_desc() = convolution_descriptor.ToProto();
-  conv_config.mutable_conv_desc()->set_compute_mode(acc_type);
-  conv_config.set_conv_scale(conv_scale);
-  conv_config.set_side_value_scale(side_value_scale);
-  conv_config.set_activation(activation);
-  return conv_config;
-}
-
-void LogCudaProto(const dnn::ConvolutionProto& conv, float profile_time_ms,
-                  StreamExecutor* stream_executor) {
-  {
-    // For rolling-out, temporarily cap the number of logs per process.
-    // TODO(timshen): remove it.
-    static int count_down = 200;
-    if (count_down == 0) {
-      return;
-    }
-    count_down--;
-  }
-
-  ConvLogEntry conv_log;
-  *conv_log.mutable_convolution() = conv;
-  conv_log.set_profile_time_ms(profile_time_ms);
-
-  auto info = conv_log.mutable_cuda_info();
-  int cc_major, cc_minor;
-  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
-                                                                  &cc_minor);
-  info->mutable_compute_capability()->set_major(cc_major);
-  info->mutable_compute_capability()->set_minor(cc_minor);
-
-  if (auto* dnn = stream_executor->AsDnn()) {
-    port::StatusOr<dnn::VersionInfo> version_or = dnn->GetVersion();
-    if (version_or.ok()) {
-      const auto& version = version_or.ValueOrDie();
-      info->mutable_cudnn_version()->set_major(version.major_version());
-      info->mutable_cudnn_version()->set_minor(version.minor_version());
-      info->mutable_cudnn_version()->set_patch(version.patch());
-    }
-  }
-  tensorflow::Logger::Singleton()->LogProto(conv_log);
-}
-
 }  // namespace
 
 port::Status CudnnSupport::DoPrepareForConvolution(
@@ -2897,21 +2680,20 @@ port::Status CudnnSupport::DoPrepareForConvolution(
   return port::Status::OK();
 }
 
-template <class T>
-port::Status CudnnSupport::DoConvolveImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
+port::Status CudnnSupport::DoConvolve(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
     const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    dnn::DataType accumulator_type, const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
+    dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
     dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  cudnnDataType_t cudnn_type = ToCudnnDataType(element_type);
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter_nd(filter_descriptor, cudnn_type);
+  auto accumulator_type = GetConvAccumulatorType(element_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
                                   ToCudnnDataType(accumulator_type));
 
@@ -2929,21 +2711,21 @@ port::Status CudnnSupport::DoConvolveImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
 
-  // Report an error if we might be hitting a cuDNN bug that accesses illegal
-  // memory. See nvbugs/2138754, b/80018418.
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
+  const auto get_fwd_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that accesses illegal
+    // memory. See nvbugs/2138754, b/80018418.
+    if (CUDNN_VERSION < 7300) {
       if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
@@ -2951,7 +2733,7 @@ port::Status CudnnSupport::DoConvolveImpl(
         return port::Status::OK();
       }
       // Checks that a*b is within the valid range (as provided by NVIDIA).
-      auto check_sizes = [](size_t a, size_t b) {
+      const auto check_sizes = [](size_t a, size_t b) {
         if ((a * b * 4608 - 1) >> 31 == 0) {
           return port::Status::OK();
         }
@@ -2966,42 +2748,171 @@ port::Status CudnnSupport::DoConvolveImpl(
       SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
                                      output_descriptor.feature_map_count()));
       return port::Status::OK();
-    }());
-  }
+    }
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+    return port::Status::OK();
+  };
 
-  if (algorithm_desc.algo_id() ==
-          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
+  auto get_bwd_data_bugs = [&]() -> port::Status {
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
 
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
-      cudnn.handle(),
-      /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
-      /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvForwardAlgo(algorithm_desc),
-      /*workSpace=*/scratch_memory->opaque(),
-      /*workSpaceSizeInBytes=*/scratch_memory->size(), /*beta=*/beta,
-      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
+    // Cudnn 7.1.4 has a bug if the workspace of the following convolution is
+    // not zero-initialized, nvbugs/2254619.
+    if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&scratch_memory, scratch_memory.size());
+    }
+    return port::Status::OK();
+  };
+
+  const auto get_bwd_filter_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that produces
+    // incorrect results. See nvbugs/2072856
+    if (CUDNN_VERSION < 7300) {
+      SE_RETURN_IF_ERROR([&] {
+        if (algorithm_desc.algo_id() !=
+            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+          return port::Status::OK();
+        }
+        if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
+          return port::Status::OK();
+        }
+        int convolution_size = output_descriptor.height() > 1
+                                   ? filter_descriptor.input_filter_height()
+                                   : filter_descriptor.input_filter_width();
+        if (convolution_size <= 32) {
+          return port::Status::OK();
+        }
+        cudnnConvolutionMode_t convolution_mode;
+        cudnnDataType_t compute_type;
+        RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
+            conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
+            &convolution_mode, &compute_type));
+        if (convolution_mode != CUDNN_CONVOLUTION) {
+          return port::Status::OK();
+        }
+        return port::Status(
+            port::error::FAILED_PRECONDITION,
+            "This configuration potentially produces incorrect results.");
+      }());
+    }
+
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+
+    // Zero out the result buffer for strided conv backward filter for NHWC
+    // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is
+    // not zeroed.
+    //
+    // This wrong result caused by the bug is very flaky. It needs to be run for
+    // up to 20 times to produce a mismatch.
+    //
+    // See nvbugs/2379553.
+    if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&filter_data, filter_data.size());
+    }
+    return port::Status::OK();
+  };
+
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      SE_RETURN_IF_ERROR(get_fwd_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
+          cudnn.handle(),
+          /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(), /*filterDesc=*/filter_nd.handle(),
+          /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvForwardAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(), /*beta=*/beta,
+          /*yDesc=*/output_nd.handle(), /*y=*/output_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      SE_RETURN_IF_ERROR(get_bwd_data_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*wDesc=*/filter_nd.handle(),
+          /*w=*/filter_data.opaque(),
+          /*dyDesc=*/output_nd.handle(),
+          /*dy=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*dxDesc=*/input_nd.handle(),
+          /*dx=*/input_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      SE_RETURN_IF_ERROR(get_bwd_filter_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(),
+          /*diffDesc=*/output_nd.handle(),
+          /*diffData=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*gradDesc=*/filter_nd.handle(),
+          /*dw=*/filter_data.opaque()));
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch_memory->size());
-
-    LogCudaProto(
-        GenerateConvProto<T>(
-            dnn::ConvolutionKind::FORWARD, input_descriptor, filter_descriptor,
-            output_descriptor, algorithm_desc, convolution_descriptor, dalpha,
-            dbeta, accumulator_type, dnn::ActivationMode::kNone),
-        output_profile_result->elapsed_time_in_ms(), stream->parent());
+    output_profile_result->set_scratch_size(scratch_memory.size());
   }
 
   return port::Status::OK();
@@ -3053,13 +2964,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
           stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
           output_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -3112,20 +3023,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
     output_profile_result->set_scratch_size(scratch.size());
-
-    LogCudaProto(GenerateConvProto<ElementType>(
-                     dnn::ConvolutionKind::FORWARD, conv_input_descriptor,
-                     filter_descriptor, output_descriptor, algo_desc,
-                     convolution_descriptor, conv_input_scale, side_input_scale,
-                     accumulator_type, activation_mode),
-                 output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
@@ -3436,62 +3340,6 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<float>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float>* output_data, const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, GetConvAccumulatorType(dnn::DataType::kFloat),
-                     algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data,
-                     GetConvAccumulatorType(dnn::DataType::kDouble),
-                     algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half>* output_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, GetConvAccumulatorType(dnn::DataType::kHalf),
-                     algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
 bool CudnnSupport::DoFusedConvolve(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<double>& conv_input_data, double conv_input_scale,
@@ -3610,7 +3458,7 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   CudnnTensorDescriptor output_tensor_desc(
       output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnTransformTensor(
         cudnn.handle(), &scale, input_tensor_desc.handle(), input_data.opaque(),
         &beta, output_tensor_desc.handle(), output_data->opaque()));
@@ -3619,363 +3467,6 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardDataImpl(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  if (algorithm_desc.algo_id() ==
-          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
-  // zero-initialized, nvbugs/2254619.
-  if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
-      algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(scratch_memory, scratch_memory->size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData(
-      cudnn.handle(),
-      /*alpha=*/alpha,
-      /*wDesc=*/filter.handle(),
-      /*w=*/filter_data.opaque(),
-      /*dyDesc=*/out_back_nd.handle(),
-      /*dy=*/backward_output_data.opaque(),
-      /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
-      /*workSpace=*/scratch_memory->opaque(),
-      /*workSpaceSizeInBytes=*/scratch_memory->size(),
-      /*beta=*/beta,
-      /*dxDesc=*/in_back_nd.handle(),
-      /*dx=*/backward_input_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algorithm_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch_memory->size());
-
-    LogCudaProto(GenerateConvProto<T>(
-                     dnn::ConvolutionKind::BACKWARD_DATA, input_descriptor,
-                     filter_descriptor, output_descriptor, algorithm_desc,
-                     convolution_descriptor, dalpha, dbeta, accumulator_type,
-                     dnn::ActivationMode::kNone),
-                 output_profile_result->elapsed_time_in_ms(), stream->parent());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<double>* backward_input_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, GetConvAccumulatorType(dnn::DataType::kDouble),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<float>* backward_input_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, GetConvAccumulatorType(dnn::DataType::kFloat),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<Eigen::half>* backward_input_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, GetConvAccumulatorType(dnn::DataType::kHalf),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  // Report an error if we might be hitting a cuDNN bug that produces incorrect
-  // results. See nvbugs/2072856
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
-      if (algorithm_desc.algo_id() !=
-          CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
-        return port::Status::OK();
-      }
-      if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
-        return port::Status::OK();
-      }
-      int convolution_size = output_descriptor.height() > 1
-                                 ? filter_descriptor.input_filter_height()
-                                 : filter_descriptor.input_filter_width();
-      if (convolution_size <= 32) {
-        return port::Status::OK();
-      }
-      cudnnConvolutionMode_t convolution_mode;
-      cudnnDataType_t compute_type;
-      RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
-          conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
-          &convolution_mode, &compute_type));
-      if (convolution_mode != CUDNN_CONVOLUTION) {
-        return port::Status::OK();
-      }
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          "This configuration potentially produces incorrect results.");
-    }());
-  }
-
-  if (algorithm_desc.algo_id() ==
-          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Zero out the result buffer for strided conv backward filter for NHWC
-  // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is not
-  // zeroed.
-  //
-  // This wrong result caused by the bug is very flaky. It needs to be run for
-  // up to 20 times to produce a mismatch.
-  //
-  // See nvbugs/2379553.
-  if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
-      algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(backward_filter_data, backward_filter_data->size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
-      cudnn.handle(),
-      /*alpha=*/alpha,
-      /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(),
-      /*diffDesc=*/out_back_nd.handle(),
-      /*diffData=*/backward_output_data.opaque(),
-      /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
-      /*workSpace=*/scratch_memory->opaque(),
-      /*workSpaceSizeInBytes=*/scratch_memory->size(),
-      /*beta=*/beta,
-      /*gradDesc=*/filter.handle(),
-      /*dw=*/backward_filter_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algorithm_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch_memory->size());
-
-    LogCudaProto(GenerateConvProto<T>(
-                     dnn::ConvolutionKind::BACKWARD_FILTER, input_descriptor,
-                     filter_descriptor, output_descriptor, algorithm_desc,
-                     convolution_descriptor, dalpha, dbeta, accumulator_type,
-                     dnn::ActivationMode::kNone),
-                 output_profile_result->elapsed_time_in_ms(), stream->parent());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<double>* backward_filter_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, GetConvAccumulatorType(dnn::DataType::kDouble),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<float>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<float>* backward_filter_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, GetConvAccumulatorType(dnn::DataType::kFloat),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<Eigen::half>* backward_filter_data,
-    const dnn::AlgorithmDesc& algorithm_desc,
-    DeviceMemory<uint8>* scratch_memory,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, GetConvAccumulatorType(dnn::DataType::kHalf),
-          algorithm_desc, scratch_memory, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
 template <class T>
 port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -4200,7 +3691,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnAddTensor(
         cudnn.handle(), &alpha, bias_descriptor.handle(), biases.opaque(),
         &beta, input_descriptor.handle(), output_data->opaque()));
@@ -4225,7 +3716,7 @@ bool CudnnSupport::DoActivate(Stream* stream,
   float beta = 0.0;
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnActivationForward(
         cudnn.handle(), activation_desc.handle(), &alpha, input_nd.handle(),
         input_data.opaque(), &beta, input_nd.handle(), output_data->opaque()));
@@ -4250,7 +3741,7 @@ bool CudnnSupport::DoPoolForward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
         cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
         input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
@@ -4275,7 +3766,7 @@ bool CudnnSupport::DoPoolForward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
         cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
         input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
@@ -4300,7 +3791,7 @@ bool CudnnSupport::DoPoolForward(
   CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
         cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
         input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
@@ -4325,7 +3816,7 @@ bool CudnnSupport::DoPoolForward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
         cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
         input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
@@ -4353,7 +3844,7 @@ bool CudnnSupport::DoPoolBackward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
         cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
         output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
@@ -4383,7 +3874,7 @@ bool CudnnSupport::DoPoolBackward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
         cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
         output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
@@ -4413,7 +3904,7 @@ bool CudnnSupport::DoPoolBackward(
   CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
         cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
         output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
@@ -4424,13 +3915,6 @@ bool CudnnSupport::DoPoolBackward(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-bool CudnnSupport::DoNormalize(
-    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
-    const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
-  LOG(FATAL) << "not yet implemented";  // TODO(leary)
-  return false;
-}
-
 bool CudnnSupport::DoNormalizeWithDimensions(
     Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
     const dnn::BatchDescriptor& dimensions,
@@ -4456,7 +3940,7 @@ bool CudnnSupport::DoNormalizeWithDimensions(
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
   // Launch the normalization.
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnLRNCrossChannelForward(
         cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
         &alpha, dims.handle(), input_data.opaque(), &beta, dims.handle(),
@@ -4490,7 +3974,7 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   float beta = 0.0f;
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnLRNCrossChannelBackward(
         cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
         &alpha, dims.handle(), normalized_data.opaque(), dims.handle(),
@@ -4613,7 +4097,7 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
 
   int dn = batch_descriptor.ndims() + 2;
   std::vector<int> dims(dn);  // in BDYX
-  auto status = [&] {
+  const auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdForwardOutputDim(
         conv.handle(), input_nd.handle(), filter.handle(), dn, dims.data()));
     output_batch_descriptor->set_count(dims[0])
@@ -4629,22 +4113,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cudnn() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
-          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
           [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
-            cuda::CUDAExecutor* cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
             if (!dnn->Init().ok()) {
               // Note: Init() will log a more specific error.
               delete dnn;
@@ -4659,7 +4143,7 @@ void initialize_cudnn() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
+      cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 383a99e118b3ce7cb3dd66fea9e95ee387b78fc5..3a49469651c6532f50ae82822c21723b1a341c4a 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -28,9 +28,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 class CudnnRnnDescriptor;
 class CudnnRnnSequenceTensorDescriptor;
 class CudnnRnnStateTensorDescriptor;
@@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
 // functions, see dnn.h.
 class CudnnSupport : public dnn::DnnSupport {
  public:
-  explicit CudnnSupport(CUDAExecutor* parent);
+  explicit CudnnSupport(GpuExecutor* parent);
 
   port::Status Init() override;
   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@@ -63,6 +63,7 @@ class CudnnSupport : public dnn::DnnSupport {
   createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size,
                                     const absl::Span<const int>& seq_lengths,
+                                    bool time_major,
                                     dnn::DataType data_type) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
@@ -258,38 +259,16 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<float>* scale_backprop,
       DeviceMemory<float>* offset_backprop) override;
 
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<float>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<float>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<float>* output_data,
-                  const dnn::AlgorithmDesc& algorithm_desc,
-                  DeviceMemory<uint8>* scratch_memory,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<double>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<double>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<double>* output_data,
-                  const dnn::AlgorithmDesc& algorithm_desc,
-                  DeviceMemory<uint8>* scratch_memory,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<Eigen::half>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<Eigen::half>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<Eigen::half>* output_data,
-                  const dnn::AlgorithmDesc& algorithm_desc,
-                  DeviceMemory<uint8>* scratch_memory,
-                  dnn::ProfileResult* output_profile_result) override;
+  port::Status DoConvolve(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
 
   bool DoFusedConvolve(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -390,78 +369,6 @@ class CudnnSupport : public dnn::DnnSupport {
     return false;
   }
 
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) override;
-
   bool DoConvolveBackwardBias(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<double>& input_data,
@@ -578,11 +485,6 @@ class CudnnSupport : public dnn::DnnSupport {
                       DeviceMemory<Eigen::half>* output_diff_data,
                       ScratchAllocator* workspace_allocator) override;
 
-  bool DoNormalize(Stream* stream,
-                   const dnn::NormalizeDescriptor& normalize_descriptor,
-                   const DeviceMemory<float>& input_data,
-                   DeviceMemory<float>* output_data) override;
-
   bool DoNormalizeWithDimensions(
       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
       const dnn::BatchDescriptor& dimensions,
@@ -646,7 +548,7 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemoryBase* output_data) override;
 
  private:
-  CUDAExecutor* parent_;  // Parent executor object. Not owned.
+  GpuExecutor* parent_;  // Parent executor object. Not owned.
 
   // Provides access to the cuDNN handle.
   std::unique_ptr<class CudnnAccess> cudnn_;
@@ -676,19 +578,6 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
       DeviceMemory<U>* offset_backprop);
 
-  template <class T>
-  port::Status DoConvolveImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result);
-
   template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -706,32 +595,6 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <class T>
-  port::Status DoConvolveBackwardDataImpl(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result);
-
-  template <class T>
-  port::Status DoConvolveBackwardFilterImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result);
-
   template <class T>
   port::Status DoConvolveBackwardBiasImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -800,7 +663,7 @@ class CudnnSupport : public dnn::DnnSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index c39e4f59eb0ffec805a5339113101d500414db2f..6aca1687c53a7a16ddb4605516913f33e1e25ce2 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -23,9 +23,10 @@ limitations under the License.
 
 #include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/debugging/leak_check.h"
 #include "absl/strings/str_cat.h"
+#include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
@@ -45,21 +46,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 
 // Debugging: on each push and pop of a cuda context, verify the current context
 // matches the expected one.
-constexpr bool kVerifyCudaContext = false;
+constexpr bool kVerifyGpuContext = false;
 
 namespace stream_executor {
-namespace cuda {
-
+namespace gpu {
 namespace {
 
 // Manages the singleton map of contexts that we've created, mapping
-// from the CUcontext to the CudaContext* that we pass around internally.
-// This also manages assignment of unique ids to CudaContexts, to allow
+// from the CUcontext to the GpuContext* that we pass around internally.
+// This also manages assignment of unique ids to GpuContexts, to allow
 // for fast comparison of a context against the current context.
 //
 // CUDA-runtime-created contexts are avoided, if triple angle
 // brace launches are required, by using the scoped activations in
-// cuda_activation.h.
+// gpu/gpu_activation.h.
 class CreatedContexts {
  public:
   // Returns whether context is a member of the live set.
@@ -69,14 +69,14 @@ class CreatedContexts {
   }
 
   // Adds context to the live set, or returns it if it's already present.
-  static CudaContext* Add(CUcontext context) {
+  static GpuContext* Add(CUcontext context) {
     CHECK(context != nullptr);
     mutex_lock lock(mu_);
     auto insert_result = Live()->insert(std::make_pair(context, nullptr));
     auto it = insert_result.first;
     if (insert_result.second) {
       // context was not present in the map.  Add it.
-      it->second = MakeUnique<CudaContext>(context, next_id_++);
+      it->second = MakeUnique<GpuContext>(context, next_id_++);
     }
     return it->second.get();
   }
@@ -92,9 +92,9 @@ class CreatedContexts {
 
  private:
   // Returns the live map singleton.
-  static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
+  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
     static auto singleton =
-        new std::map<CUcontext, std::unique_ptr<CudaContext>>;
+        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
     return singleton;
   }
 
@@ -108,12 +108,12 @@ class CreatedContexts {
 
 // Formats CUresult to output prettified values into a log stream.
 string ToString(CUresult result) {
-  const char *error_name;
-  if (tensorflow::wrap::cuGetErrorName(result, &error_name)) {
+  const char* error_name;
+  if (cuGetErrorName(result, &error_name)) {
     return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
   }
-  const char *error_string;
-  if (tensorflow::wrap::cuGetErrorString(result, &error_string)) {
+  const char* error_string;
+  if (cuGetErrorString(result, &error_string)) {
     return error_name;
   }
   return absl::StrCat(error_name, ": ", error_string);
@@ -123,7 +123,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current = CUDADriver::CurrentContextOrDie();
+  CUcontext current = cuda::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -140,14 +140,14 @@ CUcontext CurrentContext() {
 // thread::ThreadPool on some platforms), we run certain routines in this pool
 // and wait for completion.
 static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
-static port::ThreadPool *InitializeDriverExecutor() {
+static port::ThreadPool* InitializeDriverExecutor() {
   return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
                               "cuda_driver", 1);
 }
 
-port::ThreadPool *GetDriverExecutor() {
+port::ThreadPool* GetDriverExecutor() {
   mutex_lock lock(driver_executor_threadpool_mu);
-  static port::ThreadPool *thread_pool = InitializeDriverExecutor();
+  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
   return thread_pool;
 }
 
@@ -166,18 +166,36 @@ string MemorySpaceString(MemorySpace memory_space) {
 
 namespace {
 
+template <typename PtrT>
+bool PointerIsValid(const PtrT ptr) {
+  // Checks that the pointer is to a location on the device it purports to be.
+  // PtrT is one of CUdeviceptr or void*.  If it's a CUdeviceptr, then
+  // cudaPointerGetAttributes should not fail, and return a memoryType of
+  // cudaMemoryTypeDevice.
+
+  bool is_host_ptr = !std::is_same<PtrT, CUdeviceptr>::value;
+  cudaPointerAttributes attributes;
+  cudaError_t err =
+      cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
+  // If we failed, reset cuda error status to avoid poisoning cuda streams.
+  if (err != cudaSuccess) cudaGetLastError();
+  bool points_to_host_memory = (err == cudaErrorInvalidValue ||
+                                attributes.memoryType != cudaMemoryTypeDevice);
+  return (is_host_ptr == points_to_host_memory);
+}
+
 // Call cuCtxtSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = tensorflow::wrap::cuCtxSynchronize();
+  auto res = cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
-    LOG(FATAL) << "Synchronize found "
-               << ToString(res) << " :: " << port::CurrentStackTrace();
+    LOG(FATAL) << "Synchronize found " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
   }
 }
 
 struct ThreadLocalData {
   int64 id;
-  CudaContext* context;  // Only valid if id == a known good context.
+  GpuContext* context;  // Only valid if id == a known good context.
   int depth;
 };
 
@@ -185,13 +203,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
 
 }  // namespace
 
-ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
+ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
   if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
 
   auto* tls = &tls_data.get();
   tls->depth++;
   if (tls->id == cuda_context->id()) {
-    if (kVerifyCudaContext) {
+    if (kVerifyGpuContext) {
       CHECK_EQ(CurrentContext(), cuda_context->context());
     }
     DCHECK_EQ(CurrentContext(), cuda_context->context());
@@ -204,8 +222,7 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
   to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
 
   // Set the context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS,
-           tensorflow::wrap::cuCtxSetCurrent(cuda_context->context()));
+  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
   tls->id = cuda_context->id();
   tls->context = cuda_context;
 }
@@ -215,8 +232,8 @@ ScopedActivateContext::~ScopedActivateContext() {
 
   auto* tls = &tls_data.get();
 
-  if (kVerifyCudaContext) {
-    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
+  if (kVerifyGpuContext) {
+    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
     // possible this could fail in the CurrentContext() call.
     CHECK_EQ(CurrentContext(),
              tls->context == nullptr ? nullptr : tls->context->context());
@@ -230,8 +247,7 @@ ScopedActivateContext::~ScopedActivateContext() {
   }
 
   // Set context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS,
-           tensorflow::wrap::cuCtxSetCurrent(to_restore_->context()));
+  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
   tls->id = to_restore_->id();
   tls->context = to_restore_;
 }
@@ -242,7 +258,7 @@ namespace {
 // logging purposes. Returns "?" if the device could not be successfully
 // queried.
 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerDevice(pointer);
+  auto value = GpuDriver::GetPointerDevice(pointer);
   if (value.ok()) {
     return absl::StrCat(value.ValueOrDie());
   }
@@ -254,7 +270,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
 // logging purposes. Returns "?" if the memory space could not be successfully
 // queried.
 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerMemorySpace(pointer);
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
   if (value.ok()) {
     return MemorySpaceString(value.ValueOrDie());
   }
@@ -267,25 +283,24 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
 // primarily for logging purposes. Returns "error" if an error is encountered
 // in the process of querying.
 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
-  auto from_context = CUDADriver::GetPointerContext(from);
+  auto from_context = GpuDriver::GetPointerContext(from);
   if (!from_context.ok()) {
     LOG(ERROR) << "could not retrieve source pointer's context: "
                << from_context.status();
     return "error";
   }
-  auto to_context = CUDADriver::GetPointerContext(to);
+  auto to_context = GpuDriver::GetPointerContext(to);
   if (!to_context.ok()) {
     LOG(ERROR) << "could not retrieve destination pointer's context: "
                << to_context.status();
     return "error";
   }
-  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
-                                         to_context.ValueOrDie())
+  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
+                                        to_context.ValueOrDie())
              ? "true"
              : "false";
 }
 
-
 // Actually performs the work of CUDA initialization. Wrapped up in one-time
 // execution guard.
 static port::Status InternalInit() {
@@ -293,7 +308,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
     LOG(ERROR) << "injecting CUDA init error; initialization will fail";
   } else {
-    res = tensorflow::wrap::cuInit(0 /* = flags */);
+    res = cuInit(0 /* = flags */);
   }
 
   if (res == CUDA_SUCCESS) {
@@ -308,12 +323,12 @@ static port::Status InternalInit() {
 
 }  // namespace
 
-/* static */ port::Status CUDADriver::Init() {
+/* static */ port::Status GpuDriver::Init() {
   // Cached return value from calling InternalInit(), as cuInit need only be
-  // called once, but CUDADriver::Init may be called many times.
+  // called once, but GpuDriver::Init may be called many times.
   static port::Status init_retval;
   static bool set = false;
-  static mutex *init_mu = new mutex;
+  static mutex* init_mu = new mutex;
 
   mutex_lock lock(*init_mu);
   if (!set) {
@@ -324,9 +339,9 @@ static port::Status InternalInit() {
   return init_retval;
 }
 
-/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
-                                                CUdevice *device) {
-  CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               CUdevice* device) {
+  CUresult res = cuDeviceGet(device, device_ordinal);
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
   }
@@ -336,12 +351,11 @@ static port::Status InternalInit() {
       absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
 }
 
-/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
-                                            string *device_name) {
+/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
+                                           string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  CUresult res =
-      tensorflow::wrap::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -352,8 +366,8 @@ static port::Status InternalInit() {
   return true;
 }
 
-bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
-                                 int *flags) {
+bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
+                                 int* flags) {
   static_assert(DeviceOptions::kMask == 0xf,
                 "needs update for new device options");
 
@@ -376,9 +390,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateContext(
-    CUdevice device, const DeviceOptions &device_options,
-    CudaContext **context) {
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
+    GpuContext** context) {
   *context = nullptr;
 
   int flags = 0;
@@ -392,9 +406,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 
   unsigned int former_primary_context_flags;
   int former_primary_context_is_active;
-  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuDevicePrimaryCtxGetState(
-                             device, &former_primary_context_flags,
-                             &former_primary_context_is_active));
+  CHECK_EQ(CUDA_SUCCESS,
+           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+                                      &former_primary_context_is_active));
   if (former_primary_context_flags != flags) {
     if (former_primary_context_is_active) {
       LOG(ERROR)
@@ -402,16 +416,15 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
           << former_primary_context_flags << ") than the desired flag set ("
           << flags << ").";
     } else {
-      CHECK_EQ(CUDA_SUCCESS,
-               tensorflow::wrap::cuDevicePrimaryCtxSetFlags(device, flags));
+      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
     }
   }
 
-  former_context = CUDADriver::CurrentContextOrDie();
-  res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
+  former_context = cuda::CurrentContextOrDie();
+  res = cuDevicePrimaryCtxRetain(&new_context, device);
   if (former_context != nullptr) {
     CUdevice former_device;
-    if (tensorflow::wrap::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
       if (former_device == device) {
         if (former_context == new_context) {
           VLOG(2) << "The primary context " << former_context << " for device "
@@ -430,7 +443,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
                  << former_context;
     }
   }
-  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuCtxSetCurrent(former_context));
+  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
 
   if (res == CUDA_SUCCESS) {
     *context = CreatedContexts::Add(new_context);
@@ -454,17 +467,17 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return port::Status(port::error::INTERNAL, message);
 }
 
-/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
   if (context == nullptr) {
     return;
   }
   CUcontext former_context = CurrentContext();
-  CUresult res = tensorflow::wrap::cuCtxSetCurrent(context->context());
+  CUresult res = cuCtxSetCurrent(context->context());
   CUdevice device;
-  tensorflow::wrap::cuCtxGetDevice(&device);
-  tensorflow::wrap::cuCtxSetCurrent(former_context);
+  cuCtxGetDevice(&device);
+  cuCtxSetCurrent(former_context);
 
-  res = tensorflow::wrap::cuDevicePrimaryCtxRelease(device);
+  res = cuDevicePrimaryCtxRelease(device);
 
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
@@ -473,11 +486,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   CreatedContexts::Remove(context->context());
 }
 
-/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
-                                               CUfunction func,
-                                               int *attribute_value) {
-  CUresult res =
-      tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
+/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
+                                              CUfunction func,
+                                              int* attribute_value) {
+  CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
                << ", attribute: " << attribute;
@@ -486,9 +498,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
-                                                 CUfunc_cache cache_config) {
-  CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
+/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
+                                                CUfunc_cache cache_config) {
+  CUresult res = cuFuncSetCacheConfig(function, cache_config);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -499,14 +511,13 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 }
 
 /* static */ port::StatusOr<CUsharedconfig>
-CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   CUsharedconfig shared_mem_config;
   ScopedActivateContext activation(context);
-  CUresult result =
-      tensorflow::wrap::cuCtxGetSharedMemConfig(&shared_mem_config);
+  CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    tensorflow::wrap::cuCtxGetDevice(&device);
+    cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to get CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", result: " << ToString(result);
@@ -517,14 +528,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return shared_mem_config;
 }
 
-/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
-    CudaContext* context, CUsharedconfig shared_mem_config) {
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, CUsharedconfig shared_mem_config) {
   ScopedActivateContext activation(context);
-  CUresult result =
-      tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
+  CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    tensorflow::wrap::cuCtxGetDevice(&device);
+    cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to set CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", config: " << shared_mem_config
@@ -536,20 +546,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LaunchKernel(
-    CudaContext* context, CUfunction function, unsigned int grid_dim_x,
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, CUfunction function, unsigned int grid_dim_x,
     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
     unsigned int block_dim_y, unsigned int block_dim_z,
-    unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
-    void **extra) {
+    unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
+    void** extra) {
   ScopedActivateContext activation(context);
   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
           << " bdz: " << block_dim_z;
-  CUresult res = tensorflow::wrap::cuLaunchKernel(
-      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
-      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
+  CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
+                                block_dim_x, block_dim_y, block_dim_z,
+                                shared_mem_bytes, stream, kernel_params, extra);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to launch CUDA kernel: " << function
                << "; result: " << ToString(res);
@@ -559,12 +569,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
-                                                const char *cubin_bytes,
-                                                CUmodule *module) {
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               CUmodule* module) {
   ScopedActivateContext activation(context);
-  CUresult result =
-      tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
+  CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(port::error::INTERNAL,
                         "failed to load in-memory CUBIN: " + ToString(result));
@@ -573,15 +582,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
-                                      const char *ptx_contents,
-                                      CUmodule *module) {
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     CUmodule* module) {
   port::Notification notification;
   bool ret = true;
   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                  &notification]() {
     ScopedActivateContext activation(context);
-    void *ptx_data = const_cast<char *>(ptx_contents);
+    void* ptx_data = const_cast<char*>(ptx_contents);
     static const unsigned int kLogBufferBytesLimit = 1024;
     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
@@ -594,21 +603,21 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
     // Note that the driver API wants the contents of this values to be stored
     // in an array of void*s, so we coerce them accordingly.
-    void *option_values[] = {
-        absl::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
-        absl::bit_cast<void *>(error_log_buffer.data()),
-        absl::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
-        absl::bit_cast<void *>(info_log_buffer.data()),
-        absl::bit_cast<void *>(uintptr_t(log_verbose))};
+    void* option_values[] = {
+        absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
+        absl::bit_cast<void*>(error_log_buffer.data()),
+        absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
+        absl::bit_cast<void*>(info_log_buffer.data()),
+        absl::bit_cast<void*>(uintptr_t(log_verbose))};
     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
     {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
-
-      res = tensorflow::wrap::cuModuleLoadDataEx(
-          module, ptx_data, TF_ARRAYSIZE(options), options, option_values);
+      absl::LeakCheckDisabler disabler;
+      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), options,
+                               option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
@@ -623,8 +632,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
       // As a precaution for null termination of the API-provided value, ensure
       // that at least the last byte is null.
-      error_log_buffer[error_log_buffer_bytes ?
-                       error_log_buffer_bytes - 1 : 0] = '\0';
+      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
+                                              : 0] = '\0';
       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
                  << " bytes): " << error_log_buffer.data();
       ret = false;
@@ -643,11 +652,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return ret;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
-                                                     CUdeviceptr location,
-                                                     uint8 value, size_t size) {
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
+  return false;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    CUdeviceptr location,
+                                                    uint8 value, size_t size) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
+  CUresult res = cuMemsetD8(location, value, size);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -655,12 +671,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint32 value,
-                                                      size_t uint32_count) {
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
+  CUresult res = cuMemsetD32(location, value, uint32_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -668,14 +684,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint8 value,
-                                                      size_t uint32_count,
-                                                      CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res =
-      tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
+  CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -684,14 +699,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
-                                                       CUdeviceptr location,
-                                                       uint32 value,
-                                                       size_t uint32_count,
-                                                       CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res =
-      tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
+  CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -700,13 +714,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
-                                                CUstream stream,
-                                                StreamCallback callback,
-                                                void *data) {
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               CUstream stream,
+                                               StreamCallback callback,
+                                               void* data) {
   // Note: flags param is required to be zero according to CUDA 6.0.
-  CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
-                                                       0 /* = flags */);
+  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
     return false;
@@ -714,14 +727,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
-                                                CUmodule module,
-                                                const char *kernel_name,
-                                                CUfunction *function) {
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               CUmodule module,
+                                               const char* kernel_name,
+                                               CUfunction* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  CUresult res =
-      tensorflow::wrap::cuModuleGetFunction(function, module, kernel_name);
+  CUresult res = cuModuleGetFunction(function, module, kernel_name);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -731,16 +743,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
-                                              CUmodule module,
-                                              const char *symbol_name,
-                                              CUdeviceptr *dptr,
-                                              size_t *bytes) {
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             CUmodule module,
+                                             const char* symbol_name,
+                                             CUdeviceptr* dptr, size_t* bytes) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  CUresult res =
-      tensorflow::wrap::cuModuleGetGlobal(dptr, bytes, module, symbol_name);
+  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != CUDA_SUCCESS) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -752,21 +762,21 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ void CUDADriver::UnloadModule(CudaContext *context,
-                                           CUmodule module) {
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          CUmodule module) {
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuModuleUnload(module);
+  CUresult res = cuModuleUnload(module);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
   }
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
-    CudaContext* context) {
+/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
+    GpuContext* context) {
   ScopedActivateContext activated{context};
   CUdevice device = -1;
-  CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
+  CUresult result = cuCtxGetDevice(&device);
   if (result == CUDA_SUCCESS) {
     return device;
   }
@@ -776,32 +786,32 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       absl::StrCat("failed to get device for context: ", ToString(result)));
 }
 
-/* static */ bool CUDADriver::CreateStream(CudaContext *context,
-                                           CUstream *out) {
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          CUstream* stream) {
   // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
   // up synchronization with respect to memsets and any other things that have
   // to occur on the default stream?
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuStreamCreate(out, 0);
+  CUresult res = cuStreamCreate(stream, 0);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not allocate CUDA stream for context "
                << context->context() << ": " << ToString(res);
     return false;
   }
 
-  VLOG(2) << "successfully created stream " << *out << " for context "
+  VLOG(2) << "successfully created stream " << *stream << " for context "
           << context->context() << " on thread";
   return true;
 }
 
-/* static */ void CUDADriver::DestroyStream(CudaContext* context,
-                                            CUstream *stream) {
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           CUstream* stream) {
   if (*stream == nullptr) {
     return;
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuStreamDestroy(*stream);
+  CUresult res = cuStreamDestroy(*stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to destroy CUDA stream for context "
                << context->context() << ": " << ToString(res);
@@ -812,28 +822,32 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
-                                              uint64 bytes) {
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
+  if (bytes == 0) {
+    return nullptr;
+  }
+
   ScopedActivateContext activated{context};
   CUdeviceptr result = 0;
-  CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
+  CUresult res = cuMemAlloc(&result, bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
                << " bytes) from device: " << ToString(res);
     return nullptr;
   }
-  void *ptr = reinterpret_cast<void *>(result);
+  void* ptr = reinterpret_cast<void*>(result);
   VLOG(2) << "allocated " << ptr << " for context " << context->context()
           << " of " << bytes << " bytes";
   return ptr;
 }
 
-/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
-                                               void *location) {
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = tensorflow::wrap::cuMemFree(pointer);
+  CUresult res = cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
@@ -843,29 +857,28 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
-                                                     uint64 bytes) {
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
   ScopedActivateContext activation(context);
   CUdeviceptr result = 0;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res =
-      tensorflow::wrap::cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes unified memory; result: " << ToString(res);
     return nullptr;
   }
-  void *ptr = reinterpret_cast<void *>(result);
+  void* ptr = reinterpret_cast<void*>(result);
   VLOG(2) << "allocated " << ptr << " for context " << context->context()
           << " of " << bytes << " bytes in unified memory";
   return ptr;
 }
 
-/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
-                                                      void *location) {
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = tensorflow::wrap::cuMemFree(pointer);
+  CUresult res = cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free unified memory at " << location
                << "; result: " << ToString(res);
@@ -875,13 +888,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
-                                            uint64 bytes) {
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
   ScopedActivateContext activation(context);
-  void *host_mem = nullptr;
+  void* host_mem = nullptr;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = tensorflow::wrap::cuMemHostAlloc(&host_mem, bytes,
-                                                  CU_MEMHOSTALLOC_PORTABLE);
+  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -889,22 +900,22 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return host_mem;
 }
 
-/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
-                                             void *location) {
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemFreeHost(location);
+  CUresult res = cuMemFreeHost(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
   }
 }
 
-/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
-                                           uint64 bytes) {
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
   ScopedActivateContext activation(context);
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = tensorflow::wrap::cuMemHostRegister(
-      location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+  CUresult res =
+      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -913,10 +924,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
-                                             void *location) {
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
+  CUresult res = cuMemHostUnregister(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -925,15 +936,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
-                                                   CUevent *event) {
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  CUevent* event) {
   if (*event == nullptr) {
     return port::Status(port::error::INVALID_ARGUMENT,
                         "input event cannot be null");
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuEventDestroy(*event);
+  CUresult res = cuEventDestroy(*event);
   *event = nullptr;
 
   switch (res) {
@@ -953,11 +964,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
-                                                  CUevent event,
-                                                  CUstream stream) {
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 CUevent event,
+                                                 CUstream stream) {
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
+  CUresult res = cuEventRecord(event, stream);
   switch (res) {
     case CUDA_SUCCESS:
       return port::Status::OK();
@@ -975,10 +986,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
-    CudaContext *context, CUevent event) {
+/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
+                                                            CUevent event) {
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuEventQuery(event);
+  CUresult res = cuEventQuery(event);
   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
     return port::Status(
         port::error::INTERNAL,
@@ -988,18 +999,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return res;
 }
 
-/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
-                                                  float *elapsed_milliseconds,
-                                                  CUevent start, CUevent stop) {
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 CUevent start, CUevent stop) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for cuEventElapsedTime to
   // work.
-  CUresult res = tensorflow::wrap::cuEventSynchronize(stop);
+  CUresult res = cuEventSynchronize(stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = tensorflow::wrap::cuEventElapsedTime(elapsed_milliseconds, start, stop);
+  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -1009,12 +1020,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
-                                                CUstream stream,
-                                                CUevent event) {
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               CUstream stream, CUevent event) {
   ScopedActivateContext activation(context);
-  CUresult res =
-      tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
+  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -1023,9 +1032,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuCtxSynchronize();
+  CUresult res = cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -1035,11 +1044,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
-                                                        CUstream stream) {
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
+  CUresult res = cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
@@ -1051,11 +1060,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
-                                           CUstream stream) {
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = tensorflow::wrap::cuStreamQuery(stream);
+  CUresult res = cuStreamQuery(stream);
   if (res == CUDA_SUCCESS) {
     return true;
   }
@@ -1066,93 +1075,121 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return false;
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
-                                                           void *host_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
+                                                          void* host_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(host_dst))
+        << "Destination pointer is not actually on CPU: " << host_dst;
+  }
+  CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(
         port::Printf("failed to synchronous memcpy from device to host: %s; "
                      "host dst: %p; GPU src: %p; size: %llu=0x%llx",
                      ToString(res).c_str(), host_dst,
-                     absl::bit_cast<void *>(gpu_src), size, size));
+                     absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
           << host_dst;
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           const void *host_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          const void* host_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(host_src))
+        << "Source pointer is not actually on CPU: " << host_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
         " host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
         size));
   }
   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst),
-        absl::bit_cast<void *>(gpu_src), size, size));
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
-                                                    void *host_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res =
-      tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(host_dst))
+        << "Destination pointer is not actually on CPU: " << host_dst;
+  }
+  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), host_dst, absl::bit_cast<void *>(gpu_src), size,
+        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
         size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
-          << " bytes from " << absl::bit_cast<void *>(gpu_src) << " to "
+          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
           << host_dst << " on stream " << stream;
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    const void *host_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res =
-      tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(host_src))
+        << "Source pointer is not actually on CPU: " << host_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
         "host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
         size);
     return false;
   }
@@ -1161,24 +1198,29 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult result =
-      tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to device: %s"
         "; GPU dst: %p on %s %s"
         "; GPU src: %p on %s %s"
         "; can access? %s; size: %llu=0x%llx",
-        ToString(result).c_str(), absl::bit_cast<void *>(gpu_dst),
+        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
         CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
         CUDAPointerToDeviceString(gpu_dst).c_str(),
-        absl::bit_cast<void *>(gpu_src),
+        absl::bit_cast<void*>(gpu_src),
         CUDAPointerToMemorySpaceString(gpu_src).c_str(),
         CUDAPointerToDeviceString(gpu_src).c_str(),
         CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
@@ -1189,9 +1231,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
-                                                  CUevent *result,
-                                                  EventFlags flags) {
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 CUevent* result,
+                                                 EventFlags flags) {
   int cuflags;
   switch (flags) {
     case EventFlags::kDefault:
@@ -1205,7 +1247,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = tensorflow::wrap::cuEventCreate(result, cuflags);
+  CUresult res = cuEventCreate(result, cuflags);
 
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
@@ -1219,9 +1261,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ int CUDADriver::GetDeviceCount() {
+/* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
+  CUresult res = cuDeviceGetCount(&device_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
     return 0;
@@ -1233,11 +1275,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return device_count;
 }
 
-/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
+/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
     CUdeviceptr pointer) {
-  CudaContext* context = nullptr;
-  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
-      &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+  GpuContext* context = nullptr;
+  CUresult result =
+      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
   if (result == CUDA_SUCCESS) {
     CHECK(context != nullptr) << "success should entail non-null context";
     return context;
@@ -1249,11 +1291,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
     CUdeviceptr pointer) {
   unsigned int value;
-  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
-      &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+  CUresult result =
+      cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
   if (result == CUDA_SUCCESS) {
     switch (value) {
       case CU_MEMORYTYPE_DEVICE:
@@ -1273,10 +1315,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
-                                                             CUdeviceptr *base,
-                                                             size_t *size) {
-  CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
+/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
+                                                            CUdeviceptr* base,
+                                                            size_t* size) {
+  CUresult result = cuMemGetAddressRange(base, size, dptr);
   if (result == CUDA_SUCCESS) {
     return port::Status::OK();
   } else if (result == CUDA_ERROR_NOT_FOUND) {
@@ -1286,16 +1328,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return port::Status(
         port::error::NOT_FOUND,
         port::Printf("not a device pointer %p; %s",
-                     reinterpret_cast<void *>(dptr), ToString(result).c_str()));
+                     reinterpret_cast<void*>(dptr), ToString(result).c_str()));
   }
 
   return port::Status(
       port::error::INTERNAL,
       port::Printf("failed to get pointer into for device pointer %p; %s",
-                   reinterpret_cast<void *>(dptr), ToString(result).c_str()));
+                   reinterpret_cast<void*>(dptr), ToString(result).c_str()));
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
+/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
     CUdeviceptr pointer) {
   auto result = GetPointerContext(pointer);
   if (!result.ok()) {
@@ -1305,13 +1347,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return DeviceFromContext(result.ValueOrDie());
 }
 
-/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
-                                                           int *cc_minor,
-                                                           CUdevice device) {
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          CUdevice device) {
   *cc_major = 0;
   *cc_minor = 0;
 
-  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+  CUresult res = cuDeviceGetAttribute(
       cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
   if (res != CUDA_SUCCESS) {
     return port::Status(
@@ -1321,7 +1363,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
             ToString(res).c_str(), device));
   }
 
-  res = tensorflow::wrap::cuDeviceGetAttribute(
+  res = cuDeviceGetAttribute(
       cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
   if (res != CUDA_SUCCESS) {
     return port::Status(
@@ -1334,14 +1376,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      CUdevice device) {
+  return port::Status{
+      port::error::INTERNAL,
+      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
+}
+
 // Helper function that turns the integer output of cuDeviceGetAttribute to type
 // T and wraps it in a StatusOr.
 template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                                             CUdevice_attribute attribute) {
   int value = -1;
-  CUresult result =
-      tensorflow::wrap::cuDeviceGetAttribute(&value, attribute, device);
+  CUresult result = cuDeviceGetAttribute(&value, attribute, device);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::NOT_FOUND,
@@ -1352,68 +1400,68 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return converted;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
     CUdevice device) {
   return GetSimpleAttribute<int>(device,
                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
 }
 
-/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
-                                            CUdevice device) {
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           CUdevice device) {
   int value;
-  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
-      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+  CUresult res =
+      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;
 
-  res = tensorflow::wrap::cuDeviceGetAttribute(
-      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+  res =
+      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;
 
-  res = tensorflow::wrap::cuDeviceGetAttribute(
-      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+  res =
+      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1422,8 +1470,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
-  CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  CUresult res = cuDriverGetVersion(driver_version);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1432,11 +1480,21 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
+                                                 int device_ordinal) {
+  CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to query device properties: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
     CUdevice_attribute attribute, CUdevice device) {
   int val;
-  CUresult res =
-      tensorflow::wrap::cuDeviceGetAttribute(&val, attribute, device);
+  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1446,10 +1504,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return val;
 }
 
-/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
+/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
   int value = -1;
-  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
-      &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+  CUresult res =
+      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
     return false;
@@ -1459,13 +1517,13 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
-                                                  int64 *free_out,
-                                                  int64 *total_out) {
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
   ScopedActivateContext activation(context);
   size_t free = 0;
   size_t total = 0;
-  CUresult res = tensorflow::wrap::cuMemGetInfo(&free, &total);
+  CUresult res = cuMemGetInfo(&free, &total);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1476,10 +1534,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
-                                                   uint64 *result) {
+/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
+                                                  uint64* result) {
   size_t value = -1;
-  CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
+  CUresult res = cuDeviceTotalMem(&value, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1489,13 +1547,12 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
+/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
   string pci_bus_id;
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  CUresult res = tensorflow::wrap::cuDeviceGetPCIBusId(chars.begin(),
-                                                       kBufferSize - 1, device);
+  CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1504,8 +1561,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return pci_bus_id;
 }
 
-/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
-                                                  CudaContext* to) {
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
   if (from == to) {
     return true;  // A context can always access its own memory.
   }
@@ -1523,7 +1580,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = tensorflow::wrap::cuDeviceCanAccessPeer(
+  CUresult res = cuDeviceCanAccessPeer(
       &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1533,15 +1590,14 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return can_access_peer;
 }
 
-/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
-                                                       CudaContext* to) {
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
   if (from == to) {
     return port::Status::OK();  // A context can always access its own memory.
   }
 
   ScopedActivateContext activated{from};
-  CUresult result =
-      tensorflow::wrap::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
+  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
   if (result != CUDA_SUCCESS &&
       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
     return port::Status(
@@ -1553,15 +1609,14 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return port::Status::OK();
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
-    CudaContext* context, CUfunction kernel, int threads_per_block,
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, CUfunction kernel, int threads_per_block,
     size_t dynamic_shared_memory_bytes) {
   ScopedActivateContext activation(context);
 
   int max_blocks;
-  CUresult result =
-      tensorflow::wrap::cuOccupancyMaxActiveBlocksPerMultiprocessor(
-          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
+  CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1572,11 +1627,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return max_blocks;
 }
 
-/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
+}  // namespace gpu
+
+namespace cuda {
+
+CUcontext CurrentContextOrDie() {
   CUcontext current = nullptr;
-  CUresult result = tensorflow::wrap::cuCtxGetCurrent(&current);
+  CUresult result = cuCtxGetCurrent(&current);
   if (result != CUDA_SUCCESS) {
-    LOG(FATAL) << "failed to query current context: " << ToString(result);
+    LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
   }
   return current;
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index bee4c8af27c410a19906806b561bb16eb2c6190a..5bbe6f6e627e8b4e217345b0e014e95c08df2fb0 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -18,495 +18,45 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 
-#include <stddef.h>
-#include "tensorflow/stream_executor/platform/port.h"
-
-#include "cuda/include/cuda.h"
-#include "tensorflow/stream_executor/device_options.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 
 namespace stream_executor {
-namespace cuda {
-
-// Identifies the memory space where an allocation resides. See
-// CUDADriver::GetPointerMemorySpace().
-enum class MemorySpace { kHost, kDevice };
-
-// Returns a casual string, such as "host" for the provided memory space.
-string MemorySpaceString(MemorySpace memory_space);
-
-class CudaContext;
-
-// CUDADriver contains wrappers for calls to the userspace library driver. It's
-// useful to isolate these calls and put basic wrappers around them to separate
-// userspace library driver behaviors from the rest of the program.
-//
-// At the moment it's simply used as a namespace.
-//
-// The calls log any specific errors internally and return whether the operation
-// was successful to the caller.
-//
-// The order of parameters is generally kept symmetric with the underlying CUDA
-// driver API.
-//
-// Links on functions are to specific documentation under
-// http://docs.nvidia.com/cuda/cuda-driver-api/
-//
-// Thread safety: these functions should not be used from signal handlers.
-class CUDADriver {
- public:
-  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
-  // the case of failure. Safe to call multiple times; will be fast on all calls
-  // after the first.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
-  static port::Status Init();
-
-  // Returns the device associated with the given context.
-  // device is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
-  static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
-
-  // Creates a new CUDA stream associated with the given context via
-  // cuStreamCreate.
-  // stream is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
-  static bool CreateStream(CudaContext* context, CUstream* stream);
-
-  // Destroys a CUDA stream associated with the given context.
-  // stream is owned by the caller, must not be null, and *stream is set to null
-  // if the stream is successfully destroyed.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
-  static void DestroyStream(CudaContext* context, CUstream* stream);
-
-  // CUDA events can explicitly disable event TSC retrieval for some presumed
-  // performance improvement if timing is unnecessary.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  enum class EventFlags { kDefault, kDisableTiming };
-
-  // Creates a new event associated with the given context.
-  // result is an outparam owned by the caller and must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  static port::Status CreateEvent(CudaContext* context, CUevent* result,
-                                  EventFlags flags);
-
-  // Destroys *event and turns it into a nullptr. event may not be null, but
-  // *event may be, via cuEventDestroy
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
-  static port::Status DestroyEvent(CudaContext* context, CUevent* event);
-
-  // Allocates a GPU memory space of size bytes associated with the given
-  // context via cuMemAlloc.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
-  static void* DeviceAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a GPU memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void DeviceDeallocate(CudaContext* context, void* location);
-
-  // Allocates a unified memory space of size bytes associated with the given
-  // context via cuMemAllocManaged.
-  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
-  static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a unified memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
-
-  // Allocates page-locked and CUDA-registered memory on the host via
-  // cuMemAllocHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
-  static void* HostAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
-  static void HostDeallocate(CudaContext* context, void* location);
-
-  // Registers a memory region at location of size bytes via cuMemHostRegister.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
-  static bool HostRegister(CudaContext* context, void* location, uint64 bytes);
-
-  // Unregisters a memory region that was previously registered at location via
-  // cuMemHostUnregister.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
-  //
-  // TODO(leary) verify an error will be returned if the location wasn't
-  // previously registered.
-  static bool HostUnregister(CudaContext* context, void* location);
-
-  // Given a device ordinal, returns a device handle into the device outparam,
-  // which must not be null.
-  //
-  // N.B. these device handles do not have a corresponding destroy function in
-  // the CUDA driver API.
-  static port::Status GetDevice(int device_ordinal, CUdevice* device);
-
-  // Given a device handle, returns the name reported by the driver for the
-  // device.
-  static bool GetDeviceName(CUdevice device, string* name_out);
-
-  // Given a device to create a context for, returns a context handle into the
-  // context outparam, which must not be null.
-  //
-  // N.B. CUDA contexts are weird. They are implicitly associated with the
-  // calling thread. Current documentation on contexts and their influence on
-  // userspace processes is given here:
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
-  static port::Status CreateContext(CUdevice device,
-                                    const DeviceOptions& device_options,
-                                    CudaContext** context);
-
-  // Destroys the provided context via cuCtxDestroy.
-  // Don't do this while clients could still be using the context, per the docs
-  // bad things will happen.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
-  static void DestroyContext(CudaContext* context);
-
-  // Queries the runtime for the specified attribute of the specified function.
-  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
-  // in terms of integer-sized values, so there's no potential for overrun (as
-  // of CUDA 5.5).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
-  static bool FuncGetAttribute(CUfunction_attribute attribute,
-                               CUfunction function, int* attribute_value);
-
-  // Sets the preferred cache configuration for the specified function.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
-  static bool FuncSetCacheConfig(CUfunction function,
-                                 CUfunc_cache cache_config);
-
-  // Gets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
-  static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
-      CudaContext* context);
-
-  // Sets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
-  static port::Status ContextSetSharedMemConfig(
-      CudaContext* context, CUsharedconfig shared_mem_config);
-
-  // Launches a CUDA kernel via cuLaunchKernel.
-  // TODO(leary) describe the structure of kernel_params and extra in a readable
-  // way.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
-  static bool LaunchKernel(CudaContext* context, CUfunction function,
-                           unsigned int grid_dim_x, unsigned int grid_dim_y,
-                           unsigned int grid_dim_z, unsigned int block_dim_x,
-                           unsigned int block_dim_y, unsigned int block_dim_z,
-                           unsigned int shared_mem_bytes, CUstream stream,
-                           void** kernel_params, void** extra);
-
-  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
-  // handle in "module". Any error logs that are produced are logged internally.
-  static bool LoadPtx(CudaContext* context, const char* ptx_contents,
-                      CUmodule* module);
-
-  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
-  // the resulting handle in "module".
-  static port::Status LoadCubin(CudaContext* context, const char* cubin_bytes,
-                                CUmodule* module);
-
-  // Retrieves a named kernel from a loaded module, and places the resulting
-  // handle into function (outparam) on success. Neither kernel_name nor
-  // function may be null. No ownership is taken of kernel_name.
-  static bool GetModuleFunction(CudaContext* context, CUmodule module,
-                                const char* kernel_name, CUfunction* function);
-
-  // Retrieves a named global/constant symbol from a loaded module, and returns
-  // a device pointer and size of the symbol on success. symbol_name may not be
-  // null. At least one of dptr or bytes should not be null. No ownership is
-  // taken of symbol_name.
-  static bool GetModuleSymbol(CudaContext* context, CUmodule module,
-                              const char* symbol_name, CUdeviceptr* dptr,
-                              size_t* bytes);
-
-  // Unloads module from the current context via cuModuleUnload.
-  // TODO(leary) the documentation doesn't say what kind of disasters happen
-  // if you try to unload a module while its CUfunctions are in use.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
-  static void UnloadModule(CudaContext* context, CUmodule module);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
-  static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
-                                     uint8 value, size_t size);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
-  static bool SynchronousMemsetUint32(CudaContext* context,
-                                      CUdeviceptr location, uint32 value,
-                                      size_t uint32_count);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD8Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
-  static bool AsynchronousMemsetUint8(CudaContext* context,
-                                      CUdeviceptr location, uint8 value,
-                                      size_t uint32_count, CUstream stream);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD32Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
-  static bool AsynchronousMemsetUint32(CudaContext* context,
-                                       CUdeviceptr location, uint32 value,
-                                       size_t uint32_count, CUstream stream);
-
-  // -- Synchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
-
-  static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-  static port::Status SynchronousMemcpyH2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           const void* host_src, uint64 size);
-  static port::Status SynchronousMemcpyD2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-
-  // -- Asynchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
-
-  static bool AsynchronousMemcpyD2H(CudaContext* context, void* host_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    const void* host_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-
-  // The CUDA stream callback type signature.
-  // The data passed to AddStreamCallback is subsequently passed to this
-  // callback when it fires.
-  //
-  // Some notable things:
-  // * Callbacks must not make any CUDA API calls.
-  // * Callbacks from independent streams execute in an undefined order and may
-  //   be serialized.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
-  typedef void (*StreamCallback)(CUstream stream, CUresult status, void* data);
-
-  // Enqueues a callback operation into stream.
-  // See StreamCallback above and the NVIDIA documentation for additional
-  // details.
-  static bool AddStreamCallback(CudaContext* context, CUstream stream,
-                                StreamCallback callback, void* data);
-
-  // Causes stream to wait for event to trigger before proceeding via
-  // cuStreamWaitEvent.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
-  static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
-                                CUevent event);
-
-  // Blocks the calling thread until the operations enqueued onto stream have
-  // been completed, via cuStreamSynchronize.
-  //
-  // TODO(leary) if a pathological thread enqueues operations onto the stream
-  // while another thread blocks like this, can you wind up waiting an unbounded
-  // amount of time?
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
-
-  // Blocks the calling thread until the operations associated with the context
-  // have been completed, via cuCtxSynchronize.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
-  static bool SynchronizeContext(CudaContext* context);
-
-  // Returns true if all stream tasks have completed at time of the call. Note
-  // the potential for races around this call (if another thread adds work to
-  // the stream immediately after this returns).
-  static bool IsStreamIdle(CudaContext* context, CUstream stream);
-
-  // Returns whether code in the from context can access memory in the to
-  // context via cuDeviceCanAccessPeer.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
-  static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
-  static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Returns the elapsed milliseconds between start and stop via
-  // cuEventElapsedTime.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
-  static bool GetEventElapsedTime(CudaContext* context,
-                                  float* elapsed_milliseconds, CUevent start,
-                                  CUevent stop);
-
-  // Records that an event occurred when execution reaches the current point in
-  // thestream via cuEventRecord.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
-  static port::Status RecordEvent(CudaContext* context, CUevent event,
-                                  CUstream stream);
-
-  // Polls (without blocking) to determine the status of an event - pending or
-  // complete (or an error status).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
-  static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
-                                             CUevent event);
-
-  // -- Pointer-specific calls.
-
-  // Returns the context in which pointer was allocated or registered.
-  static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
-
-  // Returns the device associated with the context from GetPointerContext().
-  static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
-
-  // Returns the memory space addressed by pointer.
-  static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
-
-  // Returns the base address and size of the device pointer dptr.
-  static port::Status GetPointerAddressRange(CUdeviceptr dptr,
-                                             CUdeviceptr* base, size_t* size);
-
-  // -- Device-specific calls.
-
-  // Returns the compute capability for the device; i.e (3, 5).
-  // This is currently done via the deprecated device API.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
-  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
-                                           CUdevice device);
-
-  // Returns the number of multiprocessors on the device (note that the device
-  // may be multi-GPU-per-board).
-  static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
-
-  // Returns the limit on number of threads that can be resident in a single
-  // multiprocessor.
-  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
-
-  // Returns the limit on number of threads which may be resident for a single
-  // block (cooperative thread array).
-  static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
-
-  // Returns the amount of shared memory available on a single GPU core (i.e.
-  // SM on NVIDIA devices).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
-
-  // Returns the amount of shared memory available for a single block
-  // (cooperative thread array).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
-
-  // Returns the maximum supported number of registers per block.
-  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
-
-  // Returns the number of threads per warp.
-  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
-
-  // Queries the grid limits for device with cuDeviceGetAttribute calls.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool GetGridLimits(int* x, int* y, int* z, CUdevice device);
-
-  // Gets a specific integer-valued property about the given device.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
-                                                CUdevice device);
-
-  // Returns whether ECC is enabled for the given CUdevice via
-  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool IsEccEnabled(CUdevice device, bool* result);
-
-  // Returns the total amount of memory available for allocation by the CUDA
-  // context, in bytes, via cuDeviceTotalMem.
-  static bool GetDeviceTotalMemory(CUdevice device, uint64* result);
-
-  // Returns the free amount of memory and total amount of memory, as reported
-  // by cuMemGetInfo.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
-  static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
-                                  int64* total);
-
-  // Returns a PCI bus id string for the device.
-  // [domain]:[bus]:[device].[function]
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
-  static string GetPCIBusID(CUdevice device);
-
-  // -- Context- and device-independent calls.
-
-  // Returns the number of visible CUDA device via cuDeviceGetCount.
-  // This should correspond to the set of device ordinals available.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
-  static int GetDeviceCount();
-
-  // Returns the driver version number via cuDriverGetVersion.
-  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
-  // instead, the CUDA toolkit release number that this driver is compatible
-  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
-  // compatible driver).
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
-  static bool GetDriverVersion(int* driver_version);
-
-  // -- Other calls
-
-  // Returns the maximum number of blocks (per multiprocessor) occupied by the
-  // specified kernel/CUfunction when launched with the specified parameters.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
-  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
-      CudaContext* context, CUfunction kernel, int threads_per_block,
-      size_t dynamic_shared_memory_bytes);
-
-  // Returns the current context set in CUDA. This is done by calling the cuda
-  // driver (e.g., this value is not our cached view of the current context).
-  static CUcontext CurrentContextOrDie();
-
-  // Seam for injecting an error at CUDA initialization time for testing
-  // purposes.
-  static bool driver_inject_init_error_;
-};
-
-// Ensures a context is activated within a scope.
-class ScopedActivateContext {
- public:
-  // Activates the context via cuCtxSetCurrent, if it is not the currently
-  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
-  // mechanism is said by NVIDIA to be relatively slow and deprecated.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
-  explicit ScopedActivateContext(CudaContext* context);
-
-  // Checks that the context has remained activated for the duration of the
-  // scope.
-  ~ScopedActivateContext();
-
- private:
-  CudaContext* to_restore_ = nullptr;
-};
-
-// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
+namespace gpu {
+// CUDAContext wraps a cuda CUcontext handle, and includes a unique id. The
 // unique id is positive, and ids are not repeated within the process.
-class CudaContext {
+class GpuContext {
  public:
-  CudaContext(CUcontext context, int64 id) : context_(context), id_(id) {}
+  GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}
 
   CUcontext context() const { return context_; }
   int64 id() const { return id_; }
 
   // Disallow copying and moving.
-  CudaContext(CudaContext&&) = delete;
-  CudaContext(const CudaContext&) = delete;
-  CudaContext& operator=(CudaContext&&) = delete;
-  CudaContext& operator=(const CudaContext&) = delete;
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
 
  private:
   CUcontext const context_;
   const int64 id_;
 };
 
-inline CUcontext CurrentContextOrDie() {
-  return CUDADriver::CurrentContextOrDie();
-}
+}  // namespace gpu
+
+namespace cuda {
+
+using MemorySpace = gpu::MemorySpace;
+
+using CUDADriver = gpu::GpuDriver;
+
+using ScopedActivateContext = gpu::ScopedActivateContext;
+
+using CudaContext = gpu::GpuContext;
+
+// Returns the current context set in CUDA. This is done by calling the cuda
+// driver (e.g., this value is not our cached view of the current context).
+CUcontext CurrentContextOrDie();
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
deleted file mode 100644
index ee99908bd5437da35c93cebd48a3e64a0012ae06..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This file wraps cuda driver calls with dso loader so that we don't need to
-// have explicit linking to libcuda. All TF cuda driver usage should route
-// through this wrapper.
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
-#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
-
-#include "cuda/include/cuda.h"
-#include "tensorflow/stream_executor/lib/env.h"
-#include "tensorflow/stream_executor/platform/dso_loader.h"
-#include "tensorflow/stream_executor/platform/port.h"
-
-namespace tensorflow {
-namespace wrap {
-#ifdef PLATFORM_GOOGLE
-// Use static linked library
-#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                       \
-  template <typename... Args>                                              \
-  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) { \
-    return ::cudaSymbolName(args...);                                      \
-  }
-
-// This macro wraps a global identifier, given by cudaSymbolName, in a callable
-// structure that loads the DLL symbol out of the DSO handle in a thread-safe
-// manner on first use. This dynamic loading technique is used to avoid DSO
-// dependencies on vendor libraries which may or may not be available in the
-// deployed binary environment.
-#else
-#define TO_STR_(x) #x
-#define TO_STR(x) TO_STR_(x)
-
-#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                        \
-  template <typename... Args>                                               \
-  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) {  \
-    using FuncPtrT = std::add_pointer<decltype(::cudaSymbolName)>::type;    \
-    static FuncPtrT loaded = []() -> FuncPtrT {                             \
-      static const char *kName = TO_STR(cudaSymbolName);                    \
-      void *f;                                                              \
-      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
-          stream_executor::internal::CachedDsoLoader::GetLibcudaDsoHandle() \
-              .ValueOrDie(),                                                \
-          kName, &f);                                                       \
-      CHECK(s.ok()) << "could not find " << kName                           \
-                    << " in libcuda DSO; dlerror: " << s.error_message();   \
-      return reinterpret_cast<FuncPtrT>(f);                                 \
-    }();                                                                    \
-    return loaded(args...);                                                 \
-  }
-#endif
-
-// clang-format off
-#define LIBCUDA_ROUTINE_EACH(__macro)                   \
-  __macro(cuCtxEnablePeerAccess)                        \
-  __macro(cuCtxGetCurrent)                              \
-  __macro(cuCtxGetDevice)                               \
-  __macro(cuCtxGetSharedMemConfig)                      \
-  __macro(cuCtxSetCurrent)                              \
-  __macro(cuCtxSetSharedMemConfig)                      \
-  __macro(cuCtxSynchronize)                             \
-  __macro(cuDeviceCanAccessPeer)                        \
-  __macro(cuDeviceGet)                                  \
-  __macro(cuDeviceGetAttribute)                         \
-  __macro(cuDeviceGetCount)                             \
-  __macro(cuDeviceGetName)                              \
-  __macro(cuDeviceGetPCIBusId)                          \
-  __macro(cuDevicePrimaryCtxGetState)                   \
-  __macro(cuDevicePrimaryCtxRelease)                    \
-  __macro(cuDevicePrimaryCtxRetain)                     \
-  __macro(cuDevicePrimaryCtxSetFlags)                   \
-  __macro(cuDeviceTotalMem)                             \
-  __macro(cuDriverGetVersion)                           \
-  __macro(cuEventCreate)                                \
-  __macro(cuEventDestroy)                               \
-  __macro(cuEventElapsedTime)                           \
-  __macro(cuEventQuery)                                 \
-  __macro(cuEventRecord)                                \
-  __macro(cuEventSynchronize)                           \
-  __macro(cuFuncGetAttribute)                           \
-  __macro(cuFuncSetCacheConfig)                         \
-  __macro(cuGetErrorName)                               \
-  __macro(cuGetErrorString)                             \
-  __macro(cuInit)                                       \
-  __macro(cuLaunchKernel)                               \
-  __macro(cuMemAlloc)                                   \
-  __macro(cuMemAllocManaged)                            \
-  __macro(cuMemFree)                                    \
-  __macro(cuMemFreeHost)                                \
-  __macro(cuMemGetAddressRange)                         \
-  __macro(cuMemGetInfo)                                 \
-  __macro(cuMemHostAlloc)                               \
-  __macro(cuMemHostRegister)                            \
-  __macro(cuMemHostUnregister)                          \
-  __macro(cuMemcpyDtoD)                                 \
-  __macro(cuMemcpyDtoDAsync)                            \
-  __macro(cuMemcpyDtoH)                                 \
-  __macro(cuMemcpyDtoHAsync)                            \
-  __macro(cuMemcpyHtoD)                                 \
-  __macro(cuMemcpyHtoDAsync)                            \
-  __macro(cuMemsetD32)                                  \
-  __macro(cuMemsetD32Async)                             \
-  __macro(cuMemsetD8)                                   \
-  __macro(cuMemsetD8Async)                              \
-  __macro(cuModuleGetFunction)                          \
-  __macro(cuModuleGetGlobal)                            \
-  __macro(cuModuleLoadDataEx)                           \
-  __macro(cuModuleLoadFatBinary)                        \
-  __macro(cuModuleUnload)                               \
-  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor)  \
-  __macro(cuOccupancyMaxPotentialBlockSize)             \
-  __macro(cuPointerGetAttribute)                        \
-  __macro(cuStreamAddCallback)                          \
-  __macro(cuStreamCreate)                               \
-  __macro(cuStreamDestroy)                              \
-  __macro(cuStreamQuery)                                \
-  __macro(cuStreamSynchronize)                          \
-  __macro(cuStreamWaitEvent)
-
-// clang-format on
-
-LIBCUDA_ROUTINE_EACH(STREAM_EXECUTOR_LIBCUDA_WRAP)
-#undef LIBCUDA_ROUTINE_EACH
-#undef STREAM_EXECUTOR_LIBCUDA_WRAP
-#undef TO_STR
-#undef TO_STR_
-}  // namespace wrap
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 96dcf173566087db475e3b237591d19f06128d92..fd9d4741e01082ee46c9f1ba77a089ee2cc8fad5 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -20,30 +20,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CUDAEvent::CUDAEvent(CUDAExecutor* parent)
-    : parent_(parent), cuda_event_(nullptr) {}
-
-CUDAEvent::~CUDAEvent() {}
-
-port::Status CUDAEvent::Init() {
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
-                                 CUDADriver::EventFlags::kDisableTiming);
-}
-
-port::Status CUDAEvent::Destroy() {
-  return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
-}
-
-port::Status CUDAEvent::Record(CUDAStream* stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
-                                 stream->cuda_stream());
-}
-
-Event::Status CUDAEvent::PollForStatus() {
+Event::Status GpuEvent::PollForStatus() {
   port::StatusOr<CUresult> status =
-      CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
   if (!status.ok()) {
     LOG(ERROR) << "Error polling for event status: "
                << status.status().error_message();
@@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
   }
 }
 
-const CUevent& CUDAEvent::cuda_event() {
-  return cuda_event_;
-}
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index f62344672ed624f1ed60b5452d33b6f8273f2b47..e3596e0261acc1f6225c610db33dbbcdc38fd7e4 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -16,45 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDAEvent wraps a CUevent in the platform-independent EventInterface
-// interface.
-class CUDAEvent : public internal::EventInterface {
- public:
-  explicit CUDAEvent(CUDAExecutor* parent);
-
-  ~CUDAEvent() override;
-
-  // Populates the CUDA-platform-specific elements of this object.
-  port::Status Init();
-
-  // Deallocates any platform-specific elements of this object. This is broken
-  // out (not part of the destructor) to allow for error reporting.
-  port::Status Destroy();
-
-  // Inserts the event at the current position into the specified stream.
-  port::Status Record(CUDAStream* stream);
-
-  // Polls the CUDA platform for the event's current status.
-  Event::Status PollForStatus();
-
-  // The underlying CUDA event element.
-  const CUevent& cuda_event();
-
- private:
-  // The Executor used to which this object and CUevent are bound.
-  CUDAExecutor* parent_;
-
-  // The underlying CUDA event element.
-  CUevent cuda_event_;
-};
+using CUDAEvent = gpu::GpuEvent;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 776719bc35c3e43bf0e7dc38790640f0e8cc301f..3bf2f5b9742d943a049d6e799bf0166f9ec0d610 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -26,100 +26,16 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
-namespace wrap {
-
-#ifdef PLATFORM_GOOGLE
-// This macro wraps a global identifier, given by __name, in a callable
-// structure that loads the DLL symbol out of the DSO handle in a thread-safe
-// manner on first use. This dynamic loading technique is used to avoid DSO
-// dependencies on vendor libraries which may or may not be available in the
-// deployed binary environment.
-#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
-  struct WrapperShim__##__name {                                 \
-    template <typename... Args>                                  \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};           \
-      return ::__name(args...);                                  \
-    }                                                            \
-  } __name;
-
-#else
-
-#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                                \
-  struct DynLoadShim__##__name {                                          \
-    static const char *kName;                                             \
-    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void *GetDsoHandle() {                                         \
-      auto s = internal::CachedDsoLoader::GetCufftDsoHandle();            \
-      return s.ValueOrDie();                                              \
-    }                                                                     \
-    static FuncPtrT LoadOrDie() {                                         \
-      void *f;                                                            \
-      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
-                                                          kName, &f);     \
-      CHECK(s.ok()) << "could not find " << kName                         \
-                    << " in cufft DSO; dlerror: " << s.error_message();   \
-      return reinterpret_cast<FuncPtrT>(f);                               \
-    }                                                                     \
-    static FuncPtrT DynLoad() {                                           \
-      static FuncPtrT f = LoadOrDie();                                    \
-      return f;                                                           \
-    }                                                                     \
-    template <typename... Args>                                           \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
-      return DynLoad()(args...);                                          \
-    }                                                                     \
-  } __name;                                                               \
-  const char *DynLoadShim__##__name::kName = #__name;
-
-#endif
-
-// clang-format off
-
-#define CUFFT_ROUTINE_EACH(__macro)                                     \
-  __macro(cufftDestroy)                                                 \
-  __macro(cufftSetStream)                                               \
-  __macro(cufftPlan1d)                                                  \
-  __macro(cufftPlan2d)                                                  \
-  __macro(cufftPlan3d)                                                  \
-  __macro(cufftPlanMany)                                                \
-  __macro(cufftExecD2Z)                                                 \
-  __macro(cufftExecZ2D)                                                 \
-  __macro(cufftExecC2C)                                                 \
-  __macro(cufftExecC2R)                                                 \
-  __macro(cufftExecZ2Z)                                                 \
-  __macro(cufftExecR2C)                                                 \
-  __macro(cufftCreate)                                                  \
-  __macro(cufftSetAutoAllocation)                                       \
-  __macro(cufftSetWorkArea)                                             \
-  __macro(cufftGetSize1d)                                               \
-  __macro(cufftMakePlan1d)                                              \
-  __macro(cufftGetSize2d)                                               \
-  __macro(cufftMakePlan2d)                                              \
-  __macro(cufftGetSize3d)                                               \
-  __macro(cufftMakePlan3d)                                              \
-  __macro(cufftGetSizeMany)                                             \
-  __macro(cufftMakePlanMany)
-
-// clang-format on
-
-CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
-#undef CUFFT_ROUTINE_EACH
-
-}  // namespace wrap
-
 namespace {
 
 // A helper function transforming gpu_fft arguments into cuFFT arguments.
@@ -145,8 +61,9 @@ cufftType CUDAFftType(fft::Type type) {
 }
 
 // Associates the given stream with the given cuFFT plan.
-bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
-  auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
+bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
+  cuda::ScopedActivateExecutorContext sac(parent);
+  auto ret = cufftSetStream(plan, AsGpuStreamValue(stream));
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
     return false;
@@ -157,7 +74,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
 }  // namespace
 
 port::Status CUDAFftPlan::Initialize(
-    CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
     uint64 *input_embed, uint64 input_stride, uint64 input_distance,
     uint64 *output_embed, uint64 output_stride, uint64 output_distance,
     fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
@@ -165,6 +82,7 @@ port::Status CUDAFftPlan::Initialize(
     LOG(FATAL) << "Try to repeatedly initialize.";
   }
   is_initialized_ = true;
+  cuda::ScopedActivateExecutorContext sac(parent);
   int elem_count_[3], input_embed_[3], output_embed_[3];
   for (int i = 0; i < rank; ++i) {
     elem_count_[i] = elem_count[i];
@@ -183,8 +101,8 @@ port::Status CUDAFftPlan::Initialize(
       switch (rank) {
         case 1:
           // cufftPlan1d
-          ret = wrap::cufftPlan1d(parent, &plan_, elem_count_[0],
-                                  CUDAFftType(type), 1 /* = batch */);
+          ret = cufftPlan1d(&plan_, elem_count_[0], CUDAFftType(type),
+                            1 /* = batch */);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 1d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -193,8 +111,8 @@ port::Status CUDAFftPlan::Initialize(
           return port::Status::OK();
         case 2:
           // cufftPlan2d
-          ret = wrap::cufftPlan2d(parent, &plan_, elem_count_[0],
-                                  elem_count_[1], CUDAFftType(type));
+          ret = cufftPlan2d(&plan_, elem_count_[0], elem_count_[1],
+                            CUDAFftType(type));
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 2d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -203,9 +121,8 @@ port::Status CUDAFftPlan::Initialize(
           return port::Status::OK();
         case 3:
           // cufftPlan3d
-          ret =
-              wrap::cufftPlan3d(parent, &plan_, elem_count_[0], elem_count_[1],
-                                elem_count_[2], CUDAFftType(type));
+          ret = cufftPlan3d(&plan_, elem_count_[0], elem_count_[1],
+                            elem_count_[2], CUDAFftType(type));
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to create cuFFT 3d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -220,13 +137,13 @@ port::Status CUDAFftPlan::Initialize(
                               "cufftPlan only takes rank 1, 2, or 3.");
       }
     } else {
-      ret = wrap::cufftCreate(parent, &plan_);
+      ret = cufftCreate(&plan_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT plan:" << ret;
         return port::Status(port::error::INTERNAL,
                             "Failed to create cuFFT plan.");
       }
-      ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
+      ret = cufftSetAutoAllocation(plan_, 0);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to set auto allocation for cuFFT plan:" << ret;
         return port::Status(port::error::INTERNAL,
@@ -234,9 +151,8 @@ port::Status CUDAFftPlan::Initialize(
       }
       switch (rank) {
         case 1:
-          ret = wrap::cufftMakePlan1d(parent, plan_, elem_count_[0],
-                                      CUDAFftType(type), /*batch=*/1,
-                                      &scratch_size_bytes_);
+          ret = cufftMakePlan1d(plan_, elem_count_[0], CUDAFftType(type),
+                                /*batch=*/1, &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 1d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -244,9 +160,8 @@ port::Status CUDAFftPlan::Initialize(
           }
           break;
         case 2:
-          ret = wrap::cufftMakePlan2d(parent, plan_, elem_count_[0],
-                                      elem_count_[1], CUDAFftType(type),
-                                      &scratch_size_bytes_);
+          ret = cufftMakePlan2d(plan_, elem_count_[0], elem_count_[1],
+                                CUDAFftType(type), &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 2d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -254,9 +169,9 @@ port::Status CUDAFftPlan::Initialize(
           }
           break;
         case 3:
-          ret = wrap::cufftMakePlan3d(parent, plan_, elem_count_[0],
-                                      elem_count_[1], elem_count_[2],
-                                      CUDAFftType(type), &scratch_size_bytes_);
+          ret = cufftMakePlan3d(plan_, elem_count_[0], elem_count_[1],
+                                elem_count_[2], CUDAFftType(type),
+                                &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 3d plan:" << ret;
             return port::Status(port::error::INTERNAL,
@@ -275,24 +190,23 @@ port::Status CUDAFftPlan::Initialize(
   } else {
     // For either multiple batches or rank higher than 3, use cufftPlanMany().
     if (scratch_allocator == nullptr) {
-      auto ret = wrap::cufftPlanMany(
-          parent, &plan_, rank, elem_count_,
-          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
-          output_embed ? output_embed_ : nullptr, output_stride,
-          output_distance, CUDAFftType(type), batch_count);
+      auto ret = cufftPlanMany(
+          &plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr,
+          input_stride, input_distance, output_embed ? output_embed_ : nullptr,
+          output_stride, output_distance, CUDAFftType(type), batch_count);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
         return port::Status(port::error::INTERNAL,
                             "Failed to create cuFFT batched plan.");
       }
     } else {
-      auto ret = wrap::cufftCreate(parent, &plan_);
+      auto ret = cufftCreate(&plan_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to create cuFFT batched plan:" << ret;
         return port::Status(port::error::INTERNAL,
                             "Failed to create cuFFT batched plan.");
       }
-      ret = wrap::cufftSetAutoAllocation(parent, plan_, 0);
+      ret = cufftSetAutoAllocation(plan_, 0);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to set auto allocation for cuFFT batched plan:"
                    << ret;
@@ -300,11 +214,10 @@ port::Status CUDAFftPlan::Initialize(
             port::error::INTERNAL,
             "Failed to set auto allocation for cuFFT batched plan.");
       }
-      ret = wrap::cufftMakePlanMany(
-          parent, plan_, rank, elem_count_,
-          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
-          output_embed ? output_embed_ : nullptr, output_stride,
-          output_distance, CUDAFftType(type), batch_count,
+      ret = cufftMakePlanMany(
+          plan_, rank, elem_count_, input_embed ? input_embed_ : nullptr,
+          input_stride, input_distance, output_embed ? output_embed_ : nullptr,
+          output_stride, output_distance, CUDAFftType(type), batch_count,
           &scratch_size_bytes_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to make cuFFT batched plan:" << ret;
@@ -317,7 +230,7 @@ port::Status CUDAFftPlan::Initialize(
   return port::Status::OK();
 }
 
-port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
+port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
                                      int rank, uint64 *elem_count,
                                      fft::Type type,
                                      ScratchAllocator *scratch_allocator) {
@@ -339,7 +252,8 @@ port::Status CUDAFftPlan::UpdateScratchAllocator(
     }
   }
   // Connect work area with allocated space.
-  cufftResult_t ret = wrap::cufftSetWorkArea(parent_, plan_, scratch_.opaque());
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  cufftResult_t ret = cufftSetWorkArea(plan_, scratch_.opaque());
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to set work area for cuFFT plan:" << ret;
     return port::Status(port::error::INTERNAL,
@@ -348,7 +262,10 @@ port::Status CUDAFftPlan::UpdateScratchAllocator(
   return port::Status::OK();
 }
 
-CUDAFftPlan::~CUDAFftPlan() { wrap::cufftDestroy(parent_, plan_); }
+CUDAFftPlan::~CUDAFftPlan() {
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  cufftDestroy(plan_);
+}
 
 int CUDAFftPlan::GetFftDirection() const {
   if (!IsInitialized()) {
@@ -548,9 +465,10 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
     return false;
   }
 
-  auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)));
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  auto ret = cufftExec(cuda_fft_plan->GetPlan(),
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)));
 
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine: " << ret;
@@ -575,9 +493,10 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
     return false;
   }
 
-  auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)),
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  auto ret = cufftExec(cuda_fft_plan->GetPlan(),
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)),
                        cuda_fft_plan->GetFftDirection());
 
   if (ret != CUFFT_SUCCESS) {
@@ -588,25 +507,23 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   return true;
 }
 
-#define STREAM_EXECUTOR_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,   \
-                                        __fft_type3)                        \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
-                      const DeviceMemory<std::complex<__type>> &input,      \
-                      DeviceMemory<std::complex<__type>> *output) {         \
-    return DoFftWithDirectionInternal(                                      \
-        stream, plan, wrap::cufftExec##__fft_type1, input, output);         \
-  }                                                                         \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
-                      const DeviceMemory<__type> &input,                    \
-                      DeviceMemory<std::complex<__type>> *output) {         \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type2, input, \
-                         output);                                           \
-  }                                                                         \
-  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                      \
-                      const DeviceMemory<std::complex<__type>> &input,      \
-                      DeviceMemory<__type> *output) {                       \
-    return DoFftInternal(stream, plan, wrap::cufftExec##__fft_type3, input, \
-                         output);                                           \
+#define STREAM_EXECUTOR_CUDA_DEFINE_FFT(__type, __fft_type1, __fft_type2,      \
+                                        __fft_type3)                           \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftWithDirectionInternal(stream, plan, cufftExec##__fft_type1,    \
+                                      input, output);                          \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<__type> &input,                       \
+                      DeviceMemory<std::complex<__type>> *output) {            \
+    return DoFftInternal(stream, plan, cufftExec##__fft_type2, input, output); \
+  }                                                                            \
+  bool CUDAFft::DoFft(Stream *stream, fft::Plan *plan,                         \
+                      const DeviceMemory<std::complex<__type>> &input,         \
+                      DeviceMemory<__type> *output) {                          \
+    return DoFftInternal(stream, plan, cufftExec##__fft_type3, input, output); \
   }
 
 STREAM_EXECUTOR_CUDA_DEFINE_FFT(float, C2C, R2C, C2R)
@@ -614,22 +531,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
 #undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cufft() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
-          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
           [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            return new cuda::CUDAFft(cuda_executor);
+            return new gpu::CUDAFft(cuda_executor);
           });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
@@ -637,7 +554,7 @@ void initialize_cufft() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+      cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 8171e61418a3185455e50ee76315eb2493c36c01..0f3baeab6fa8b26b18c22854e8c95aadbb02f1ba 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -30,9 +30,9 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // Opaque and unique indentifier for the cuFFT plugin.
 extern const PluginId kCuFftPlugin;
@@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
   }
 
   // Initialize function for batched plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, uint64 *input_embed,
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, uint64* input_embed,
                           uint64 input_stride, uint64 input_distance,
-                          uint64 *output_embed, uint64 output_stride,
+                          uint64* output_embed, uint64 output_stride,
                           uint64 output_distance, fft::Type type,
-                          int batch_count, ScratchAllocator *scratch_allocator);
+                          int batch_count, ScratchAllocator* scratch_allocator);
 
   // Initialize function for 1d,2d, and 3d plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, fft::Type type,
-                          ScratchAllocator *scratch_allocator);
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, fft::Type type,
+                          ScratchAllocator* scratch_allocator);
 
   port::Status UpdateScratchAllocator(Stream *stream,
                                       ScratchAllocator *scratch_allocator);
@@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
   bool IsInitialized() const { return is_initialized_; }
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
   cufftHandle plan_;
   fft::Type fft_type_;
   DeviceMemory<uint8> scratch_;
@@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
 // This satisfies the platform-agnostic FftSupport interface.
 //
 // Note that the cuFFT handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuFFT handle when a
 // CUDA context is active.
 //
@@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
 // context of parent_, so all context is explicit.
 class CUDAFft : public fft::FftSupport {
  public:
-  explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
+  explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
   ~CUDAFft() override {}
 
   TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
 
   // Two helper functions that execute dynload::cufftExec?2?.
 
@@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 51dec6b2465142c7947cb84803eacb98898e6032..cacaf360c52f54a6049f1d7d01723ad569ddef40 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/cuda/cuda_event.h"
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
@@ -72,7 +71,7 @@ extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
 // It has been observed that loading both PTX and cubins into the driver library
@@ -84,17 +83,16 @@ namespace cuda {
 // variable with extern linkage and populate it from another translation unit.
 std::function<string(const string &)> g_cubinate;
 
-static CUDAEvent *AsCUDAEvent(Event *event) {
+static GpuEvent* AsGpuEvent(Event* event) {
   DCHECK(event != nullptr);
-  return static_cast<CUDAEvent *>(event->implementation());
+  return static_cast<GpuEvent*>(event->implementation());
 }
 
-
 // Given a platform-independent timer datatype, returns the internal CUDA
 // platform implementation pointer.
-static CUDATimer *AsCUDATimer(Timer *timer) {
+static GpuTimer* AsGpuTimer(Timer* timer) {
   DCHECK(timer != nullptr);
-  return static_cast<CUDATimer *>(timer->implementation());
+  return static_cast<GpuTimer*>(timer->implementation());
 }
 
 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -112,48 +110,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
+GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
   CHECK(cuda_exec != nullptr);
-  return cuda_exec->cuda_context();
+  return cuda_exec->gpu_context();
 }
 
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
-  return static_cast<CUDAExecutor *>(stream_exec->implementation());
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
 }
 
-CUDAExecutor::~CUDAExecutor() {
-  CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
-  CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
+GpuExecutor::~GpuExecutor() {
+  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
   if (context_ != nullptr) {
-    CUDADriver::DestroyContext(context_);
+    GpuDriver::DestroyContext(context_);
   }
 }
 
-port::Status CUDAExecutor::Init(int device_ordinal,
-                                DeviceOptions device_options) {
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
 
-  auto status = CUDADriver::Init();
+  auto status = GpuDriver::Init();
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::GetDevice(device_ordinal_, &device_);
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::CreateContext(device_, device_options, &context_);
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
   if (!status.ok()) {
     return status;
   }
 
-  return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
+  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
 }
 
-bool CUDAExecutor::FindOnDiskForComputeCapability(
+bool GpuExecutor::FindOnDiskForComputeCapability(
     absl::string_view filename, absl::string_view canonical_suffix,
-    string *found_filename) const {
+    string* found_filename) const {
   if (cc_major_ == 0 && cc_minor_ == 0) {
     return false;
   }
@@ -177,6 +176,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
   return false;
 }
 
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  LOG(ERROR)
+      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
+  return false;
+}
 // Returns the path to the running executable.
 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
 // Arg: strip_exe: if true, remove the name of the executable itself from the
@@ -211,12 +217,12 @@ static string GetBinaryDir(bool strip_exe) {
   return exe_path;
 }
 
-bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
 
   if (*module == nullptr) {
-    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
     if (!load_status.ok()) {
       LOG(ERROR) << "failed to load CUBIN: " << load_status;
       return false;
@@ -233,12 +239,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
 
   if (*module == nullptr) {
-    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+    if (!GpuDriver::LoadPtx(context_, ptx, module)) {
       return false;
     }
     VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
@@ -253,9 +259,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
-                             KernelBase *kernel) {
-  CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
+  return false;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
   CUmodule module;
   const string *kernelname;
 
@@ -295,8 +306,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
     return false;
   }
   VLOG(2) << "getting function " << *kernelname << " from module " << module;
-  if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
-                                     cuda_kernel->cuda_function_ptr())) {
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    cuda_kernel->gpu_function_ptr())) {
     return false;
   }
 
@@ -313,7 +324,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return true;
 }
 
-bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
   auto module_it = gpu_binary_to_module_.find(gpu_binary);
   if (gpu_binary_to_module_.end() == module_it) {
     VLOG(3) << "No loaded CUDA module for " << gpu_binary;
@@ -324,13 +335,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
   VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
   if (--refcount == 0) {
     VLOG(3) << "Unloading CUDA module " << module;
-    CUDADriver::UnloadModule(context_, module);
+    GpuDriver::UnloadModule(context_, module);
     gpu_binary_to_module_.erase(module_it);
   }
   return true;
 }
 
-void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
   mutex_lock lock{in_memory_modules_mu_};
@@ -346,9 +357,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   kernel_to_gpu_binary_.erase(gpu_binary_it);
 }
 
-bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
-                              ModuleHandle *module_handle) {
-  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
   // ModuleHandle::id().
   CUmodule cu_module;
   if (spec.has_cuda_cubin_in_memory()) {
@@ -382,25 +393,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
   return false;
 }
 
-bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
   const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
   mutex_lock lock{in_memory_modules_mu_};
   return UnloadGpuBinary(gpu_binary);
 }
 
-bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
-                                     KernelMetadata *kernel_metadata) {
+bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
+                                    KernelMetadata* kernel_metadata) {
   int value;
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_registers_per_thread(value);
 
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_shared_memory_bytes(value);
@@ -408,13 +417,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
   return true;
 }
 
-bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
-                          const BlockDim &block_dims, const KernelBase &kernel,
-                          const KernelArgsArrayBase &args) {
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
   CHECK_EQ(kernel.Arity(), args.number_of_arguments());
-  CUstream custream = AsCUDAStreamValue(stream);
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  CUstream custream = AsGpuStreamValue(stream);
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   // Only perform/print the occupancy check once.  Even just checking to see
   // whether we've done an occupancy check on this kernel before isn't free
@@ -431,16 +440,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 
   if (cuda_kernel->GetPreferredCacheConfig() !=
       KernelCacheConfig::kNoPreference) {
-    CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
+    GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
   }
 
   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
 
-  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
-                                block_dims.z, thread_dims.x, thread_dims.y,
-                                thread_dims.z, args.number_of_shared_bytes(),
-                                custream, kernel_params,
-                                nullptr /* = extra */)) {
+  if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
+                               block_dims.z, thread_dims.x, thread_dims.y,
+                               thread_dims.z, args.number_of_shared_bytes(),
+                               custream, kernel_params,
+                               nullptr /* = extra */)) {
     LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
                << " args; thread dim: " << thread_dims.ToString()
@@ -454,9 +463,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 // This is a non-essential operation; if there's a failure, proceed without
 // logging an error. It's nearly certain that in case of failures, we'd never
 // get here in the first place; these are very low-impact routines.
-void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
-                                     const ThreadDim &thread_dims,
-                                     const BlockDim &block_dims) {
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
   VLOG(2) << "Computing kernel occupancy for kernel "
           << kernel.demangled_name();
   VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
@@ -475,8 +484,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   const DeviceDescription &device_description =
       kernel.parent()->GetDeviceDescription();
 
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                          smem_per_block, thread_dims, cufunc);
@@ -496,13 +505,14 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-int CUDAExecutor::CalculateOccupancy(
-    const DeviceDescription &device_description, uint64 registers_per_thread,
-    uint64 shared_memory_per_block, const ThreadDim &thread_dims,
-    CUfunction func) {
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -511,15 +521,15 @@ int CUDAExecutor::CalculateOccupancy(
 
 // Compute and return the suggested thread count to achieve ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
-int CUDAExecutor::CompareOccupancy(int *initial_blocks,
-                                   const DeviceDescription &device_description,
-                                   uint64 registers_per_thread,
-                                   uint64 shared_memory_per_block,
-                                   const ThreadDim &thread_dims,
-                                   CUfunction func) {
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
+  CUresult err = cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -531,88 +541,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
   }
 }
 
-void *CUDAExecutor::Allocate(uint64 size) {
-  return CUDADriver::DeviceAllocate(context_, size);
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
 }
 
-void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
-                                      uint64 offset_bytes, uint64 size_bytes) {
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
   // offset and size are in bytes, so char* works as the pointer type.
   return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
 }
 
-void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
   // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
   if (!mem->is_sub_buffer()) {
-    CUDADriver::DeviceDeallocate(context_, mem->opaque());
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
   }
 }
 
-bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
   if (location == nullptr || size == 0) {
     LOG(WARNING) << "attempting to register null or zero-sized memory: "
                  << location << "; size " << size;
   }
   VLOG(2) << "registering " << location << " size " << size;
-  return CUDADriver::HostRegister(context_, location, size);
+  return GpuDriver::HostRegister(context_, location, size);
 }
 
-bool CUDAExecutor::HostMemoryUnregister(void *location) {
+bool GpuExecutor::HostMemoryUnregister(void* location) {
   VLOG(2) << "unregistering " << location;
-  return CUDADriver::HostUnregister(context_, location);
+  return GpuDriver::HostUnregister(context_, location);
 }
 
-bool CUDAExecutor::SynchronizeAllActivity() {
-  return CUDADriver::SynchronizeContext(context_);
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
 }
 
-bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), 0x0, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            0x0, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           0x0, size);
 }
 
-bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
-                                     uint64 size) {
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     // cudaMemset reinterprets "value" as a uint8.
     uint8 byte_value = static_cast<uint8>(value);
     uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                      (byte_value << 8) | byte_value;
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), pattern, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            value, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           value, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                             const void *host_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          host_src, size);
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         host_src, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
-                                             const DeviceMemoryBase &gpu_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
-    DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
-                           uint64 size) {
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     return Memset32(stream, location, 0x0, size);
@@ -621,88 +630,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
   }
 }
 
-bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
-                           uint8 pattern, uint64 size) {
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset8 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
-  return CUDADriver::AsynchronousMemsetUint8(
-      context_, AsCudaDevicePtr(location), pattern, size,
-      AsCUDAStreamValue(stream));
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
-                            uint32 pattern, uint64 size) {
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset32 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
   CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
         size % 4 == 0);
-  return CUDADriver::AsynchronousMemsetUint32(
+  return GpuDriver::AsynchronousMemsetUint32(
       context_, AsCudaDevicePtr(location), pattern, size / 4,
-      AsCUDAStreamValue(stream));
+      AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
-                          const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
-                          const void *host_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           host_src, size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
-                                        DeviceMemoryBase *gpu_dst,
-                                        const DeviceMemoryBase &gpu_src,
-                                        uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::HostCallback(Stream *stream,
-                                std::function<port::Status()> callback) {
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
   auto callback_ptr = new std::function<void()>([callback]() {
     port::Status s = callback();
     if (!s.ok()) {
       LOG(WARNING) << "Host callback failed: " << s;
     }
   });
-  return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
-                                       InternalHostCallback, callback_ptr);
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
 }
 
-/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
-                                                     CUresult status,
-                                                     void *data) {
+/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
+                                                    CUresult status,
+                                                    void* data) {
   std::function<void()> *callback =
       reinterpret_cast<std::function<void()> *>(data);
   (*callback)();
   delete callback;
 }
 
-port::Status CUDAExecutor::AllocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Init();
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
 }
 
-port::Status CUDAExecutor::DeallocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Destroy();
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
 }
 
-port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
-  return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
-  if (CUDADriver::WaitStreamOnEvent(context_,
-                                    AsCUDAStream(stream)->cuda_stream(),
-                                    AsCUDAEvent(event)->cuda_event())) {
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
     return port::Status::OK();
   } else {
     return port::Status(
@@ -712,61 +720,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
   }
 }
 
-Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
-  return AsCUDAEvent(event)->PollForStatus();
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
 }
 
-bool CUDAExecutor::AllocateStream(Stream *stream) {
-  return AsCUDAStream(stream)->Init();
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
 }
 
-void CUDAExecutor::DeallocateStream(Stream *stream) {
-  CUDAStream *cuda_stream = AsCUDAStream(stream);
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* cuda_stream = AsGpuStream(stream);
   if (!cuda_stream->IsIdle()) {
     LOG(ERROR) << "Deallocating stream with pending work";
   }
   cuda_stream->Destroy();
 }
 
-bool CUDAExecutor::AllocateTimer(Timer *timer) {
-  return AsCUDATimer(timer)->Init();
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
 }
 
-void CUDAExecutor::DeallocateTimer(Timer *timer) {
-  AsCUDATimer(timer)->Destroy();
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
 }
 
-bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
-  CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
-  bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
-                                    AsCUDAStreamValue(other))
-      .ok();
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
   if (!ok) {
     LOG(ERROR) << "failed to record completion event; "
                   "therefore, failed to create inter-stream dependency";
     return false;
   }
 
-  return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
-                                       other_completed_event);
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
 }
 
-bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
 }
 
-bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
-  return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
 }
 
-blas::BlasSupport *CUDAExecutor::CreateBlas() {
+blas::BlasSupport* GpuExecutor::CreateBlas() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::BlasFactory> status =
-      registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                         plugin_config_.blas());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve BLAS factory: "
@@ -777,10 +785,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
   return status.ValueOrDie()(this);
 }
 
-dnn::DnnSupport *CUDAExecutor::CreateDnn() {
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::DnnFactory> status =
-      registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.dnn());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve DNN factory: "
@@ -791,10 +799,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
   return status.ValueOrDie()(this);
 }
 
-fft::FftSupport *CUDAExecutor::CreateFft() {
+fft::FftSupport* GpuExecutor::CreateFft() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::FftFactory> status =
-      registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.fft());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve FFT factory: "
@@ -805,10 +813,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
   return status.ValueOrDie()(this);
 }
 
-rng::RngSupport *CUDAExecutor::CreateRng() {
+rng::RngSupport* GpuExecutor::CreateRng() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::RngFactory> status =
-      registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.rng());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve RNG factory: "
@@ -820,23 +828,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
 }
 
 // TODO(rspringer): Remove in b/18544742.
-bool CUDAExecutor::SupportsDnn() const {
-  return true;
-}
+bool GpuExecutor::SupportsDnn() const { return true; }
 
-bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
 }
 
-port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
 }
 
-SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
   port::StatusOr<CUsharedconfig> cuda_config =
-      CUDADriver::ContextGetSharedMemConfig(context_);
+      GpuDriver::ContextGetSharedMemConfig(context_);
   if (!cuda_config.ok()) {
     // Don't log; the failed call will log necessary output.
     return SharedMemoryConfig::kDefault;
@@ -855,7 +861,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
   }
 }
 
-port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
     SharedMemoryConfig config) {
   CUsharedconfig cuda_config;
   switch (config) {
@@ -872,21 +878,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
       LOG(FATAL) << "Invalid shared memory configuration specified: "
                  << static_cast<int>(config);
   }
-  return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
+  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
 }
 
-bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
-  return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
 }
 
-bool CUDAExecutor::GetSymbol(const string &symbol_name,
-                             ModuleHandle module_handle, void **mem,
-                             size_t *bytes) {
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
   auto lookup_in_module = [&](CUmodule module) {
     CHECK(module != nullptr);
-    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
-                                       reinterpret_cast<CUdeviceptr *>(mem),
-                                       bytes);
+    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+                                      reinterpret_cast<CUdeviceptr*>(mem),
+                                      bytes);
   };
 
   {  // give limited scope to mutex_lock
@@ -908,13 +914,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
   return false;
 }
 
-bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
   // we use BlockDims to express the dimensions of blocks within a grid
   // (as opposed to ThreadDim which expresses the dimensions of threads
   // within a block).
   int x, y, z;
-  if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
     return false;
   }
 
@@ -924,35 +930,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-bool CUDAExecutor::SupportsBlas() const { return true; }
+bool GpuExecutor::SupportsBlas() const { return true; }
 
-bool CUDAExecutor::SupportsFft() const { return true; }
+bool GpuExecutor::SupportsFft() const { return true; }
 
-bool CUDAExecutor::SupportsRng() const { return true; }
+bool GpuExecutor::SupportsRng() const { return true; }
 
 std::unique_ptr<internal::EventInterface>
-CUDAExecutor::CreateEventImplementation() {
-  return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
 }
 
 std::unique_ptr<internal::KernelInterface>
-CUDAExecutor::CreateKernelImplementation() {
-  return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
 }
 
 std::unique_ptr<internal::StreamInterface>
-CUDAExecutor::GetStreamImplementation() {
-  return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
 }
 
 std::unique_ptr<internal::TimerInterface>
-CUDAExecutor::GetTimerImplementation() {
-  return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
 }
 
-void *CUDAExecutor::GpuContextHack() { return context_; }
+void* GpuExecutor::GpuContextHack() { return context_; }
 
-CudaContext* CUDAExecutor::cuda_context() { return context_; }
+GpuContext* GpuExecutor::gpu_context() { return context_; }
 
 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
 // of SysFS. Returns -1 if it cannot.
@@ -1019,21 +1025,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
 #endif
 }
 
-
-DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   {
     int driver_version = 0;
-    (void)CUDADriver::GetDriverVersion(&driver_version);
+    (void)GpuDriver::GetDriverVersion(&driver_version);
     string augmented_driver_version = port::Printf(
         "%d (%s)", driver_version,
-        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
     builder.set_driver_version(augmented_driver_version);
   }
 
   {
-    string pci_bus_id = CUDADriver::GetPCIBusID(device_);
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
 
     // Lower the hex characters to match sysfs.
     pci_bus_id = port::Lowercase(pci_bus_id);
@@ -1046,43 +1052,43 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
   {
     builder.set_threads_per_block_limit(
-        CUDADriver::GetDeviceAttribute(
-            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device_)
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                      device_)
             .ValueOrDie());
 
     ThreadDim thread_dim_limit;
-    thread_dim_limit.x = CUDADriver::GetDeviceAttribute(
+    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
                              .ValueOrDie();
-    thread_dim_limit.y = CUDADriver::GetDeviceAttribute(
+    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
                              .ValueOrDie();
-    thread_dim_limit.z = CUDADriver::GetDeviceAttribute(
+    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                              CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
                              .ValueOrDie();
     builder.set_thread_dim_limit(thread_dim_limit);
 
     int clock_rate =
-        CUDADriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
             .ValueOrDie();
     builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
   }
 
   {
     bool ecc_enabled = false;
-    (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
     builder.set_ecc_enabled(ecc_enabled);
   }
 
   {
     uint64 device_memory_size = -1;
-    (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
     builder.set_device_memory_size(device_memory_size);
   }
 
-  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
-  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
   if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
     // Times 2 because HBM is DDR memory; it gets two data bits per each data
@@ -1100,7 +1106,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
   {
     string device_name;
-    (void)CUDADriver::GetDeviceName(device_, &device_name);
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
     builder.set_name(device_name);
   }
 
@@ -1114,31 +1120,69 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   builder.set_device_vendor("NVIDIA Corporation");
   builder.set_cuda_compute_capability(cc_major_, cc_minor_);
   builder.set_shared_memory_per_core(
-      CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
   builder.set_shared_memory_per_block(
-      CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
   builder.set_core_count(
-      CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
   builder.set_threads_per_core_limit(
-      CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
   builder.set_registers_per_block_limit(
-      CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
   builder.set_threads_per_warp(
-      CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
   builder.set_registers_per_core_limit(
-      CUDADriver::GetDeviceAttribute(
+      GpuDriver::GetDeviceAttribute(
           CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
           .ValueOrDie());
 
+  // We are loading a dummy ptx kernel to set the device description's
+  // blocks_per_core_limit by calling the CUDA occupancy calculator.  This
+  // value is currently required XLA GPU's CalculateLaunchDimensions()
+  const char* blank_ptx = R"(
+.version 6.0
+.target sm_30
+.address_size 64
+
+        // .globl       testkernel
+.visible .entry testkernel()
+{
+        ret;
+})";
+  const char* kernel_name = "testkernel";
+
+  CUmodule blank_module;
+  CUfunction blank_function;
+  int bpc = -1;
+  bool ptx_success =
+      cuda::CUDADriver::LoadPtx(context_, blank_ptx, &blank_module);
+  if (ptx_success) {
+    ptx_success = cuda::CUDADriver::GetModuleFunction(
+        context_, blank_module, kernel_name, &blank_function);
+    if (ptx_success) {
+      CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+          &bpc, blank_function, 1, 1);
+      if (result != CUDA_SUCCESS) {
+        bpc = -1;
+        ptx_success = false;
+      }
+    }
+    cuda::CUDADriver::UnloadModule(context_, blank_module);
+  }
+  if (!ptx_success) {
+    LOG(ERROR) << "Failed to calculate max blocks per SM using dummy kernel.";
+  }
+  builder.set_blocks_per_core_limit(bpc);
+
   auto built = builder.Build();
   return built.release();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cuda_gpu_executor() {
-  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
-    return new cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
   };
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index ae8e4abf92024626bf3d2bd3d334244708f55737..9d02c7516cfd9aa1e86a7e534e41d54f8d8e5de3 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -22,289 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 
-#include <set>
-#include <unordered_map>
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDA-platform implementation of the platform-agnostic
-// StreamExecutorInferface.
-class CUDAExecutor : public internal::StreamExecutorInterface {
- public:
-  // sub_platform indicates the subplatform used in this executor; it must
-  // be a CUDA type.
-  explicit CUDAExecutor(const PluginConfig &plugin_config)
-      : device_(0),
-        context_(nullptr),
-        device_ordinal_(0),
-        cc_major_(0),
-        cc_minor_(0),
-        plugin_config_(plugin_config) {}
-
-  // See the corresponding StreamExecutor methods for method comments on the
-  // following overrides.
-
-  ~CUDAExecutor() override;
-
-  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
-
-  bool GetKernel(const MultiKernelLoaderSpec &spec,
-                 KernelBase *kernel) override;
-  void UnloadKernel(const KernelBase *kernel) override;
-  bool LoadModule(const MultiModuleLoaderSpec &spec,
-                  ModuleHandle *module_handle) override;
-  bool UnloadModule(ModuleHandle module_handle) override;
-
-  bool Launch(Stream *stream, const ThreadDim &thread_dims,
-              const BlockDim &block_dims, const KernelBase &k,
-              const KernelArgsArrayBase &args) override;
-
-  int CalculateOccupancy(const DeviceDescription &device_description,
-                         uint64 registers_per_thread,
-                         uint64 shared_memory_per_block,
-                         const ThreadDim &thread_dims, CUfunction func);
-
-  int CompareOccupancy(int *initial_blocks,
-                       const DeviceDescription &device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim &thread_dims, CUfunction func);
-
-  void *Allocate(uint64 size) override;
-
-  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
-                          uint64 size_bytes) override;
-
-  void Deallocate(DeviceMemoryBase *mem) override;
-
-  void *UnifiedMemoryAllocate(uint64 size) override {
-    return CUDADriver::UnifiedMemoryAllocate(context_, size);
-  }
-
-  void UnifiedMemoryDeallocate(void *location) override {
-    return CUDADriver::UnifiedMemoryDeallocate(context_, location);
-  }
-
-  // CUDA allocation/registration functions are necessary because the driver
-  // internally sets up buffers for DMA operations (and page locks them).
-  // There's no external interface for us to otherwise control these DMA
-  // settings.
-  void *HostMemoryAllocate(uint64 size) override {
-    return CUDADriver::HostAllocate(context_, size);
-  }
-
-  void HostMemoryDeallocate(void *location) override {
-    return CUDADriver::HostDeallocate(context_, location);
-  }
-
-  bool HostMemoryRegister(void *location, uint64 size) override;
-
-  bool HostMemoryUnregister(void *location) override;
-
-  bool SynchronizeAllActivity() override;
-
-  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
-
-  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
-                         uint64 size) override;
-
-  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                 const void *host_src, uint64 size) override;
-
-  port::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &gpu_src,
-                                 uint64 size) override;
-
-  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                               const DeviceMemoryBase &gpu_src,
-                                               uint64 size) override;
-
-  bool MemZero(Stream *stream, DeviceMemoryBase *location,
-               uint64 size) override;
-  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
-              uint64 size) override;
-  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
-                uint64 size) override;
-
-  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
-              uint64 size) override;
-
-  bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
-              uint64 size) override;
-
-  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
-                            const DeviceMemoryBase &gpu_src,
-                            uint64 size) override;
-
-  bool HostCallback(Stream *stream,
-                    std::function<port::Status()> callback) override;
-
-  bool AllocateStream(Stream *stream) override;
-
-  void DeallocateStream(Stream *stream) override;
-
-  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
-
-  bool AllocateTimer(Timer *timer) override;
-
-  void DeallocateTimer(Timer *timer) override;
-
-  bool StartTimer(Stream *stream, Timer *timer) override;
-
-  bool StopTimer(Stream *stream, Timer *timer) override;
-
-  port::Status AllocateEvent(Event *event) override;
-
-  port::Status DeallocateEvent(Event *event) override;
-
-  port::Status RecordEvent(Stream *stream, Event *event) override;
-
-  port::Status WaitForEvent(Stream *stream, Event *event) override;
-
-  Event::Status PollForEventStatus(Event *event) override;
-
-  port::Status BlockHostUntilDone(Stream *stream) override;
-
-  int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
-
-  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
-
-  bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
-
-  // Search for the symbol and returns a device pointer and size.
-  // Returns false if symbol does not exist.
-  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
-                 void **mem, size_t *bytes) override;
-
-  DeviceDescription *PopulateDeviceDescription() const override;
-
-  // Populates the block_dim_limit by querying the device driver API. If an
-  // error occurs at any point while asking the driver for block dim limits, it
-  // will be only partially populated as a result, and an error will be logged.
-  bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
-
-  bool SupportsBlas() const override;
-
-  blas::BlasSupport *CreateBlas() override;
-
-  bool SupportsFft() const override;
-
-  fft::FftSupport *CreateFft() override;
-
-  bool SupportsRng() const override;
-
-  rng::RngSupport *CreateRng() override;
-
-  bool SupportsDnn() const override;
-
-  dnn::DnnSupport *CreateDnn() override;
-
-  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
-      override;
-
-  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
-      override;
-
-  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
-
-  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
-
-  void *GpuContextHack() override;
-
-  CudaContext* cuda_context();
-
- private:
-  // Attempts to find a more specific version of the file indicated by
-  // filename by looking for compute-capability-specific suffixed versions; i.e.
-  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
-  // we're on a compute capability 3.0 machine.
-  bool FindOnDiskForComputeCapability(absl::string_view filename,
-                                      absl::string_view canonical_suffix,
-                                      string *found_filename) const;
-
-  // Host callback landing routine invoked by CUDA.
-  // data: User-provided callback provided to HostCallback() above, captured
-  //       as a std::function<void()>. Allocated/initialized inside
-  //       HostCallback() and owned and deleted by this call.
-  static void InternalHostCallback(CUstream stream, CUresult status,
-                                   void *data);
-
-  // Collects metadata for the specified kernel.
-  bool GetKernelMetadata(CUDAKernel *cuda_kernel,
-                         KernelMetadata *kernel_metadata);
-
-  // Prints to VLOG(2) information about the kernel's occupancy and how it might
-  // be improved.
-  void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
-                         const BlockDim &block_dims);
-
-  bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
-  bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  bool UnloadGpuBinary(const void *gpu_binary)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Guards the in-memory-module mapping.
-  mutex in_memory_modules_mu_;
-
-  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
-  std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
-      GUARDED_BY(in_memory_modules_mu_);
-  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
-  std::unordered_map<const void *, std::pair<CUmodule, uint64>>
-      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
-
-  // Guards the launched kernel set.
-  mutex launched_kernels_mu_;
-
-  // Keeps track of the set of launched kernels. Currently used to suppress the
-  // occupancy check on subsequent launches.
-  std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
-
-  // Handle for the CUDA device being operated on. Immutable
-  // post-initialization.
-  CUdevice device_;
-
-  // Handle for session with the library/driver. Immutable post-initialization.
-  CudaContext* context_;
-
-  // The device ordinal value that this executor was initialized with; recorded
-  // for use in getting device metadata. Immutable post-initialization.
-  int device_ordinal_;
-
-  // The major verion of the compute capability for device_.
-  int cc_major_;
-
-  // The minor verion of the compute capability for device_.
-  int cc_minor_;
-
-  // The plugin configuration associated with this instance.
-  PluginConfig plugin_config_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
-};
+using CUDAExecutor = gpu::GpuExecutor;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index dc0dc694cdc6001341514c02cef38178b25338aa..af6dcf3549748ef74674b5362c86dc284c6712c8 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -17,88 +17,9 @@ limitations under the License.
 //
 // These are typically placed here for use by multiple source components (for
 // example, BLAS and executor components).
-
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 
-#include <stddef.h>
-#include <complex>
-
-#include "cuda/include/cuComplex.h"
-
-namespace stream_executor {
-
-template <typename ElemT>
-class DeviceMemory;
-
-namespace cuda {
-
-// Converts a const DeviceMemory reference to its underlying typed pointer in
-// CUDA
-// device memory.
-template <typename T>
-const T *CUDAMemory(const DeviceMemory<T> &mem) {
-  return static_cast<const T *>(mem.opaque());
-}
-
-// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
-// pointer in CUDA device memory.
-template <typename T>
-T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
-  return static_cast<T *>(mem->opaque());
-}
-
-static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
-              "std::complex<float> and cuComplex should have the same size");
-static_assert(offsetof(cuComplex, x) == 0,
-              "The real part of cuComplex should appear first.");
-static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
-              "std::complex<double> and cuDoubleComplex should have the same "
-              "size");
-static_assert(offsetof(cuDoubleComplex, x) == 0,
-              "The real part of cuDoubleComplex should appear first.");
-
-// Type traits to get CUDA complex types from std::complex<>.
-
-template <typename T>
-struct CUDAComplexT {
-  typedef T type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<float>> {
-  typedef cuComplex type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<double>> {
-  typedef cuDoubleComplex type;
-};
-
-// Converts pointers of std::complex<> to pointers of
-// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
-
-template <typename T>
-inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
-  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
-}
-
-template <typename T>
-inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
-  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
-}
-
-// Converts values of std::complex<float/double> to values of
-// cuComplex/cuDoubleComplex.
-inline cuComplex CUDAComplexValue(std::complex<float> val) {
-  return {val.real(), val.imag()};
-}
-
-inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
-  return {val.real(), val.imag()};
-}
-
-}  // namespace cuda
-}  // namespace stream_executor
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.cc b/tensorflow/stream_executor/cuda/cuda_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b4e9a178fbcab63adb0a14bc806ac3ee3a60416
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return CU_FUNC_CACHE_PREFER_NONE;
+    case KernelCacheConfig::kPreferShared:
+      return CU_FUNC_CACHE_PREFER_SHARED;
+    case KernelCacheConfig::kPreferL1:
+      return CU_FUNC_CACHE_PREFER_L1;
+    case KernelCacheConfig::kPreferEqual:
+      return CU_FUNC_CACHE_PREFER_EQUAL;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index ec1dc51e57f5a928d54cb86b1cbcc217100df6d4..a8a18d200d93168660d70746db442aeaed146290 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -22,104 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 
-#include "tensorflow/stream_executor/kernel_cache_config.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "cuda/include/cuda.h"
-
-#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
-#error \
-    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
-#endif
-
-#ifdef __CUDA_RUNTIME_H__
-#error \
-    "CUDA runtime being included into CUDA GPU executor; should be driver only."
-#endif
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// Wraps a CUfunction to implement the platform-independent KernelInterface.
-class CUDAKernel : public internal::KernelInterface {
- public:
-  CUDAKernel() : cuda_function_(nullptr), arity_(0),
-                 preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
-
-  // Note that the function is unloaded when the module is unloaded, and the
-  // module that the function is contained in is owned by the CUDAExecutor.
-  ~CUDAKernel() override {}
-
-  // As arity cannot be reflected upon using the CUDA API, the arity is
-  // explicitly set during the CUDAExecutor::GetKernel initialization process.
-  void set_arity(unsigned arity) { arity_ = arity; }
-  unsigned Arity() const override { return arity_; }
-
-  // Returns the CUfunction value for passing to the CUDA API.
-  CUfunction AsCUDAFunctionValue() const {
-    DCHECK(cuda_function_ != nullptr);
-    return const_cast<CUfunction>(cuda_function_);
-  }
-
-  // Returns the slot that the CUfunction is stored within for this object,
-  // for the CUDA API which wants to load into a CUfunction*.
-  CUfunction *cuda_function_ptr() { return &cuda_function_; }
-
-  // CUDA supports setting the preferred cache configuration of a CUfunction
-  // (more-or-less equivalent to a CUDAKernel). We support this via the below
-  // functions; users can set a preference, and that is applied when the kernel
-  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
-  // load the kernel & set the preference when the user calls the setter below;
-  // either approach is valid.
-  // Sets the current kernel cache configuration preference.
-  void SetPreferredCacheConfig(KernelCacheConfig config) override {
-    preferred_cache_config_ = config;
-  }
-
-  // Returns the current kernel cache configuration preference.
-  KernelCacheConfig GetPreferredCacheConfig() const override {
-    return preferred_cache_config_;
-  }
-
-  // Returns the current kernel cache configuration preference as a
-  // CUfunc_cache.
-  CUfunc_cache GetCUDACacheConfig() const {
-    switch (preferred_cache_config_) {
-      case KernelCacheConfig::kNoPreference:
-        return CU_FUNC_CACHE_PREFER_NONE;
-      case KernelCacheConfig::kPreferShared:
-        return CU_FUNC_CACHE_PREFER_SHARED;
-      case KernelCacheConfig::kPreferL1:
-        return CU_FUNC_CACHE_PREFER_L1;
-      case KernelCacheConfig::kPreferEqual:
-        return CU_FUNC_CACHE_PREFER_EQUAL;
-      default:
-        LOG(FATAL) << "Unknown KernelCacheConfig"
-                   << static_cast<int32>(preferred_cache_config_);
-    }
-  }
-
- private:
-  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
-  unsigned arity_;            // Number of formal parameters the kernel takes.
-
-  // Preferred (but not required) cache configuration for this kernel.
-  KernelCacheConfig preferred_cache_config_;
-};
-
-// Given a platform-independent kernel datatype, returns the (const) internal
-// CUDA platform implementation pointer.
-inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
-  return static_cast<const CUDAKernel *>(kernel->implementation());
-}
-
-// Given a platform-independent kernel datatype, returns the (non-const)
-// internal CUDA platform implementation pointer.
-inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
-  return static_cast<CUDAKernel *>(kernel->implementation());
-}
+using CUDAKernel = gpu::GpuKernel;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index b342e71bdd94f6112d500d86f6ed4051821d2d54..54aba01278d17505a33d190fba85eb543dd624e1 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 // Synchronize with spinlocks.
@@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
       port::Printf("Executor for bus %d not found.", bus_ordinal));
 }
 
-Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
+Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
 
 int CudaPlatform::VisibleDeviceCount() const {
   // Throw away the result - it logs internally, and this [containing] function
   // isn't in the path of user control. It's safe to call this > 1x.
-  if (!cuda::CUDADriver::Init().ok()) {
+  if (!gpu::GpuDriver::Init().ok()) {
     return -1;
   }
 
-  return CUDADriver::GetDeviceCount();
+  return GpuDriver::GetDeviceCount();
 }
 
 const string& CudaPlatform::Name() const { return name_; }
@@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<CUDAExecutor>(config.plugin_config));
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status(
@@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 static void InitializeCudaPlatform() {
   // Disabling leak checking, MultiPlatformManager does not destroy its
   // registered platforms.
 
-  std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
+  std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index fc0e15d5a6a9142f064085d34fcfaedfb25f433a..b21e9797be719fe9fe9ce4ebd75c36a485efb69b 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -33,12 +33,13 @@ limitations under the License.
 
 namespace stream_executor {
 namespace cuda {
-
 // Opaque and unique identifier for the CUDA platform plugin.
 // This is needed so that plugins can refer to/identify this platform without
 // instantiating a CudaPlatform object.
 extern const Platform::Id kCudaPlatformId;
+}  // namespace cuda
 
+namespace gpu {
 // Cuda-specific platform plugin, registered as a singleton value via module
 // initializer.
 class CudaPlatform : public Platform {
@@ -102,6 +103,12 @@ class CudaPlatform : public Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
 };
 
+}  // namespace gpu
+
+namespace cuda {
+
+using CudaPlatform = gpu::CudaPlatform;
+
 }  // namespace cuda
 }  // namespace stream_executor
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 022ee17ff7226a50326fe89ca77863177b28d0a5..36eef0a07ace62253b01e1ae3fd750420bc1ecca 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
 // clang-format off
@@ -58,101 +57,25 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
 }
 
 namespace stream_executor {
-namespace cuda {
-
-PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
-
-namespace wrap {
-
-#ifdef PLATFORM_GOOGLE
-#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    template <typename... Args>                                     \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
-  } __name;
-
-#else
-#define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
-  struct DynLoadShim__##__name {                                          \
-    static const char *kName;                                             \
-    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void *GetDsoHandle() {                                         \
-      auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
-      return s.ValueOrDie();                                              \
-    }                                                                     \
-    static FuncPtrT LoadOrDie() {                                         \
-      void *f;                                                            \
-      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
-                                                          kName, &f);     \
-      CHECK(s.ok()) << "could not find " << kName                         \
-                    << " in curand DSO; dlerror: " << s.error_message();  \
-      return reinterpret_cast<FuncPtrT>(f);                               \
-    }                                                                     \
-    static FuncPtrT DynLoad() {                                           \
-      static FuncPtrT f = LoadOrDie();                                    \
-      return f;                                                           \
-    }                                                                     \
-    template <typename... Args>                                           \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
-      return DynLoad()(args...);                                          \
-    }                                                                     \
-  } __name;                                                               \
-  const char *DynLoadShim__##__name::kName = #__name;
-#endif
-
-STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
-STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
-STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
-STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniform);
-STREAM_EXECUTOR_CURAND_WRAP(curandGenerateUniformDouble);
-STREAM_EXECUTOR_CURAND_WRAP(curandSetPseudoRandomGeneratorSeed);
-STREAM_EXECUTOR_CURAND_WRAP(curandSetGeneratorOffset);
-STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormal);
-STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
-
-}  // namespace wrap
+namespace gpu {
 
-template <typename T>
-string TypeString();
-
-template <>
-string TypeString<float>() {
-  return "float";
-}
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
 
-template <>
-string TypeString<double>() {
-  return "double";
-}
-
-template <>
-string TypeString<std::complex<float>>() {
-  return "std::complex<float>";
-}
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
 
-template <>
-string TypeString<std::complex<double>>() {
-  return "std::complex<double>";
-}
-
-CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}
-
-CUDARng::~CUDARng() {
+GpuRng::~GpuRng() {
   if (rng_ != nullptr) {
-    wrap::curandDestroyGenerator(parent_, rng_);
+    cuda::ScopedActivateExecutorContext sac(parent_);
+    curandDestroyGenerator(rng_);
   }
 }
 
-bool CUDARng::Init() {
+bool GpuRng::Init() {
   mutex_lock lock(mu_);
   CHECK(rng_ == nullptr);
 
-  curandStatus_t ret =
-      wrap::curandCreateGenerator(parent_, &rng_, CURAND_RNG_PSEUDO_DEFAULT);
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  curandStatus_t ret = curandCreateGenerator(&rng_, CURAND_RNG_PSEUDO_DEFAULT);
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to create random number generator: " << ret;
     return false;
@@ -162,9 +85,9 @@ bool CUDARng::Init() {
   return true;
 }
 
-bool CUDARng::SetStream(Stream *stream) {
-  curandStatus_t ret =
-      wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
+bool GpuRng::SetStream(Stream* stream) {
+  cuda::ScopedActivateExecutorContext sac(parent_);
+  curandStatus_t ret = curandSetStream(rng_, AsGpuStreamValue(stream));
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for random generation: " << ret;
     return false;
@@ -182,8 +105,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
 }
 
 template <typename T>
-bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
-                                            DeviceMemory<T> *v) {
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
   mutex_lock lock(mu_);
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
@@ -199,16 +121,15 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
     element_count *= 2;
   }
 
+  cuda::ScopedActivateExecutorContext sac(parent_);
   curandStatus_t ret;
   if (std::is_same<T, float>::value ||
       std::is_same<T, std::complex<float>>::value) {
-    ret = wrap::curandGenerateUniform(
-        parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
-        element_count);
+    ret = curandGenerateUniform(
+        rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)), element_count);
   } else {
-    ret = wrap::curandGenerateUniformDouble(
-        parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
-        element_count);
+    ret = curandGenerateUniformDouble(
+        rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)), element_count);
   }
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
@@ -220,38 +141,39 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
   return true;
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<float>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<double>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
 template <typename ElemT, typename FuncT>
-bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
-                                             ElemT stddev,
-                                             DeviceMemory<ElemT> *v,
-                                             FuncT func) {
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
   mutex_lock lock(mu_);
 
   if (!SetStream(stream)) {
     return false;
   }
 
+  cuda::ScopedActivateExecutorContext sac(parent_);
   uint64 element_count = v->ElementCount();
   curandStatus_t ret =
-      func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
+      func(rng_, GpuMemoryMutable(v), element_count, mean, stddev);
 
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@@ -262,19 +184,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
   return true;
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                                     DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
-                                        wrap::curandGenerateNormal);
+                                        curandGenerateNormal);
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                                     DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
-                                        wrap::curandGenerateNormalDouble);
+                                        curandGenerateNormalDouble);
 }
 
-bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
   mutex_lock lock(mu_);
   CHECK(rng_ != nullptr);
 
@@ -286,16 +208,17 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
     return false;
   }
 
+  cuda::ScopedActivateExecutorContext sac(parent_);
   // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
   // (which itself requires 16 for API consistency with host RNG fallbacks).
-  curandStatus_t ret = wrap::curandSetPseudoRandomGeneratorSeed(
-      parent_, rng_, *(reinterpret_cast<const uint64 *>(seed)));
+  curandStatus_t ret = curandSetPseudoRandomGeneratorSeed(
+      rng_, *(reinterpret_cast<const uint64*>(seed)));
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set rng seed: " << ret;
     return false;
   }
 
-  ret = wrap::curandSetGeneratorOffset(parent_, rng_, 0);
+  ret = curandSetGeneratorOffset(rng_, 0);
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to reset rng position: " << ret;
     return false;
@@ -303,15 +226,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
   return true;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_curand() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
-          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
-          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+          cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuRAND "
@@ -319,7 +242,7 @@ void initialize_curand() {
               return nullptr;
             }
 
-            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
             if (!rng->Init()) {
               // Note: Init() will log a more specific error.
               delete rng;
@@ -334,7 +257,7 @@ void initialize_curand() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+      cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 57ef398aaa88da7de769c49820325c6c9feb4d70..d7f6b0e8e034967ed2919332aafca9c7a8081eba 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -16,85 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/plugin_registry.h"
-#include "tensorflow/stream_executor/rng.h"
-
-typedef struct curandGenerator_st *curandGenerator_t;
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
 
 namespace stream_executor {
 
-class Stream;
-template <typename ElemT>
-class DeviceMemory;
-
 namespace cuda {
 
-// Opaque and unique identifier for the cuRAND plugin.
-extern const PluginId kCuRandPlugin;
-
-class CUDAExecutor;
-
-// CUDA-platform implementation of the random number generation support
-// interface.
-//
-// Thread-safe post-initialization.
-class CUDARng : public rng::RngSupport {
- public:
-  explicit CUDARng(CUDAExecutor *parent);
-
-  // Retrieves a curand library generator handle. This is necessary for
-  // enqueuing random number generation work onto the device.
-  // TODO(leary) provide a way for users to select the RNG algorithm.
-  bool Init();
-
-  // Releases a curand library generator handle, if one was acquired.
-  ~CUDARng() override;
-
-  // See rng::RngSupport for details on the following overrides.
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<float>> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<double>> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                              DeviceMemory<float> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                              DeviceMemory<double> *v) override;
-
-  bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
-
- private:
-  // Actually performs the work of generating random numbers - the public
-  // methods are thin wrappers to this interface.
-  template <typename T>
-  bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
-  template <typename ElemT, typename FuncT>
-  bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
-                                      DeviceMemory<ElemT> *v, FuncT func);
-
-  // Sets the stream for the internal curand generator.
-  //
-  // This is a stateful operation, as the handle can only have one stream set at
-  // a given time, so it is usually performed right before enqueuing work to do
-  // with random number generation.
-  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // mutex that guards the cuRAND handle for this device.
-  mutex mu_;
-
-  // CUDAExecutor which instantiated this CUDARng.
-  // Immutable post-initialization.
-  CUDAExecutor *parent_;
-
-  // cuRANDalibrary handle on the device.
-  curandGenerator_t rng_ GUARDED_BY(mu_);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
-};
+using CUDARng = gpu::GpuRng;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index bb8bda4755344d859668425f89614cc87d7e2d3e..4460351368894a009eaa4d7186e809ddf3fa3aed 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDAStream type - the CUDA-specific implementation of the generic
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
 // StreamExecutor Stream interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
+using CUDAStream = gpu::GpuStream;
 
-// Wraps a CUstream in order to satisfy the platform-independent
-// StreamInterface.
-//
-// Thread-safe post-initialization.
-class CUDAStream : public internal::StreamInterface {
- public:
-  explicit CUDAStream(CUDAExecutor *parent)
-      : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}
-
-  // Note: teardown is handled by a parent's call to DeallocateStream.
-  ~CUDAStream() override {}
-
-  void *GpuStreamHack() override { return cuda_stream_; }
-  void **GpuStreamMemberHack() override {
-    return reinterpret_cast<void **>(&cuda_stream_);
-  }
-
-  // Explicitly initialize the CUDA resources associated with this stream, used
-  // by StreamExecutor::AllocateStream().
-  bool Init();
-
-  // Explicitly destroy the CUDA resources associated with this stream, used by
-  // StreamExecutor::DeallocateStream().
-  void Destroy();
-
-  // Returns true if no work is pending or executing on the stream.
-  bool IsIdle() const;
-
-  // Retrieves an event which indicates that all work enqueued into the stream
-  // has completed. Ownership of the event is not transferred to the caller, the
-  // event is owned by this stream.
-  CUevent* completed_event() { return &completed_event_; }
-
-  // Returns the CUstream value for passing to the CUDA API.
-  //
-  // Precond: this CUDAStream has been allocated (otherwise passing a nullptr
-  // into the NVIDIA library causes difficult-to-understand faults).
-  CUstream cuda_stream() const {
-    DCHECK(cuda_stream_ != nullptr);
-    return const_cast<CUstream>(cuda_stream_);
-  }
-
-  CUDAExecutor *parent() const { return parent_; }
-
- private:
-  CUDAExecutor *parent_;  // Executor that spawned this stream.
-  CUstream cuda_stream_;  // Wrapped CUDA stream handle.
-
-  // Event that indicates this stream has completed.
-  CUevent completed_event_ = nullptr;
-};
-
-// Helper functions to simplify extremely common flows.
-// Converts a Stream to the underlying CUDAStream implementation.
-CUDAStream *AsCUDAStream(Stream *stream);
-
-// Extracts a CUstream from a CUDAStream-backed Stream object.
-CUstream AsCUDAStreamValue(Stream *stream);
+inline CUDAStream* AsCUDAStream(Stream* stream) {
+  return gpu::AsGpuStream(stream);
+}
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stub.cc b/tensorflow/stream_executor/cuda/cuda_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f7777cae5bb958a6247599af699114e281c7e79a
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_stub.cc
@@ -0,0 +1,99 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+// Implements the CUDA driver API by forwarding to CUDA loaded from the DSO.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or =
+        stream_executor::internal::DsoLoader::GetCudaDriverDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+#endif
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    stream_executor::port::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return reinterpret_cast<T>(symbol);
+}
+
+CUresult GetSymbolNotFoundError() {
+  return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
+}
+}  // namespace
+
+#if CUDA_VERSION < 8000
+#error CUDA version earlier than 8 is not supported.
+#endif
+
+// Forward-declare types introduced in CUDA 9.0.
+typedef struct CUDA_LAUNCH_PARAMS_st CUDA_LAUNCH_PARAMS;
+
+#ifndef __CUDA_DEPRECATED
+#define __CUDA_DEPRECATED
+#endif
+
+#if CUDA_VERSION < 10000
+// Define fake enums introduced in CUDA 10.0.
+typedef enum CUgraphNodeType_enum {} CUgraphNodeType;
+typedef enum CUstreamCaptureStatus_enum {} CUstreamCaptureStatus;
+typedef enum CUexternalMemoryHandleType_enum {} CUexternalMemoryHandleType;
+typedef enum CUexternalSemaphoreHandleType_enum {
+} CUexternalSemaphoreHandleType;
+#endif
+
+// Forward-declare types introduced in CUDA 10.0.
+typedef struct CUextMemory_st* CUexternalMemory;
+typedef struct CUextSemaphore_st* CUexternalSemaphore;
+typedef struct CUgraph_st* CUgraph;
+typedef struct CUgraphNode_st* CUgraphNode;
+typedef struct CUgraphExec_st* CUgraphExec;
+typedef struct CUDA_KERNEL_NODE_PARAMS_st CUDA_KERNEL_NODE_PARAMS;
+typedef struct CUDA_MEMSET_NODE_PARAMS_st CUDA_MEMSET_NODE_PARAMS;
+typedef struct CUDA_HOST_NODE_PARAMS_st CUDA_HOST_NODE_PARAMS;
+typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st
+    CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st
+    CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st
+    CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st
+    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
+typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st
+    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
+typedef void(CUDA_CB* CUhostFn)(void* userData);
+
+// We only need one stub implementation. Calling a function that is not yet
+// available in the given CUDA version will return
+// CUDA_ERROR_SHARED_OBJECT_INIT_FAILED.
+#include "tensorflow/stream_executor/cuda/cuda_10_0.inc"
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index e040cf86fad1f40a708ad4ca28693e31908393f0..01b722e888687c0e199d7fe8ace92aec407f3a4b 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDATimer type - the CUDA-specific implementation of the generic
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
 // StreamExecutor Timer interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
-class CUDAStream;
-
-// Wraps a pair of CUevents in order to satisfy the platform-independent
-// TimerInferface -- both a start and a stop event are present which may be
-// recorded in a stream.
-class CUDATimer : public internal::TimerInterface {
- public:
-  explicit CUDATimer(CUDAExecutor *parent)
-      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
-
-  // Note: teardown needs to be explicitly handled in this API by a call to
-  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
-  // TODO(csigg): Change to RAII.
-  ~CUDATimer() override {}
-
-  // Allocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::AllocateTimer().
-  bool Init();
-
-  // Deallocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::DeallocateTimer().
-  void Destroy();
-
-  // Records the "timer start" event at the current point in the stream.
-  bool Start(CUDAStream *stream);
-
-  // Records the "timer stop" event at the current point in the stream.
-  bool Stop(CUDAStream *stream);
-
-  // Returns the elapsed time, in milliseconds, between the start and stop
-  // events.
-  float GetElapsedMilliseconds() const;
-
-  // See Timer::Microseconds().
-  // TODO(leary) make this into an error code interface...
-  uint64 Microseconds() const override {
-    return GetElapsedMilliseconds() * 1e3;
-  }
-
-  // See Timer::Nanoseconds().
-  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
-
- private:
-  CUDAExecutor *parent_;
-  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
-                         // executing in a stream.
-  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
-                         // executing in a stream.
-};
-
-struct TimerDeleter {
-  void operator()(CUDATimer *t) {
-    t->Destroy();
-    delete t;
-  }
-};
+using CUDATimer = gpu::GpuTimer;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8878700c5ea9b48f0bad2038d803e61c71313dad
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudart_stub.cc
@@ -0,0 +1,121 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps cuda runtime calls with dso loader so that we don't need to
+// have explicit linking to libcuda.
+
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+namespace {
+void* GetDsoHandle() {
+  static auto handle = []() -> void* {
+    auto handle_or =
+        stream_executor::internal::DsoLoader::GetCudaRuntimeDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  auto env = stream_executor::port::Env::Default();
+  env->GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol).IgnoreError();
+  return reinterpret_cast<T>(symbol);
+}
+cudaError_t GetSymbolNotFoundError() {
+  return cudaErrorSharedObjectSymbolNotFound;
+}
+const char* GetSymbolNotFoundStrError() {
+  return "cudaErrorSharedObjectSymbolNotFound";
+}
+}  // namespace
+
+// Code below is auto-generated.
+extern "C" {
+cudaError_t CUDART_CB cudaFree(void* devPtr) {
+  using FuncPtr = cudaError_t (*)(void* devPtr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr);
+}
+
+cudaError_t CUDART_CB cudaGetDevice(int* device) {
+  using FuncPtr = cudaError_t (*)(int* device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device);
+}
+
+cudaError_t CUDART_CB cudaGetDeviceProperties(cudaDeviceProp* prop,
+                                              int device) {
+  using FuncPtr = cudaError_t (*)(cudaDeviceProp * prop, int device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(prop, device);
+}
+
+const char* CUDART_CB cudaGetErrorString(cudaError_t error) {
+  using FuncPtr = const char* (*)(cudaError_t error);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
+  if (!func_ptr) return GetSymbolNotFoundStrError();
+  return func_ptr(error);
+}
+
+cudaError_t CUDART_CB cudaSetDevice(int device) {
+  using FuncPtr = cudaError_t (*)(int device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device);
+}
+
+cudaError_t CUDART_CB cudaStreamAddCallback(cudaStream_t stream,
+                                            cudaStreamCallback_t callback,
+                                            void* userData,
+                                            unsigned int flags) {
+  using FuncPtr =
+      cudaError_t (*)(cudaStream_t stream, cudaStreamCallback_t callback,
+                      void* userData, unsigned int flags);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, callback, userData, flags);
+}
+
+cudaError_t CUDART_CB cudaGetDeviceCount(int* count) {
+  using FuncPtr = cudaError_t (*)(int* count);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count);
+}
+
+cudaError_t CUDART_CB cudaPointerGetAttributes(
+    struct cudaPointerAttributes* attributes, const void* ptr) {
+  using FuncPtr = cudaError_t (*)(struct cudaPointerAttributes * attributes,
+                                  const void* ptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(attributes, ptr);
+}
+
+cudaError_t CUDART_CB cudaGetLastError() {
+  using FuncPtr = cudaError_t (*)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_6_0.inc b/tensorflow/stream_executor/cuda/cudnn_6_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..e9c51d60570614f1919e14b7650c6902edb92b3b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_6_0.inc
@@ -0,0 +1,1773 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+size_t CUDNNWINAPI cudnnGetVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+const char *  CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
+  using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
+  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
+  return func_ptr(status);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreate        (cudnnHandle_t *handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroy       (cudnnHandle_t handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetStream     (cudnnHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetStream     (cudnnHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(
+                                cudnnTensorDescriptor_t            *tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType, // image data type
+                                int                                 n,        // number of inputs (batch size)
+                                int                                 c,        // number of input feature maps
+                                int                                 h,        // height of input section
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType, // image data type
+                                int                                 n,        // number of inputs (batch size)
+                                int                                 c,        // number of input feature maps
+                                int                                 h,        // height of input section
+                                int                                 w,        // width of input section
+                                int                                 nStride,
+                                int                                 cStride,
+                                int                                 hStride,
+                                int                                 wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                cudnnDataType_t                    *dataType, // image data type
+                                int                                *n,        // number of inputs (batch size)
+                                int                                *c,        // number of input feature maps
+                                int                                *h,        // height of input section
+                                int                                *w,        // width of input section
+                                int                                *nStride,
+                                int                                *cStride,
+                                int                                *hStride,
+                                int                                *wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType,
+                                int                                *nbDims,
+                                int                                 dimA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                size_t                              *size) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, size);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnAddTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t          *opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc,
+                                cudnnOpTensorOp_t                   opTensorOp,
+                                cudnnDataType_t                     opTensorCompType,
+                                cudnnNanPropagation_t               opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                cudnnOpTensorOp_t                  *opTensorOp,
+                                cudnnDataType_t                    *opTensorCompType,
+                                cudnnNanPropagation_t              *opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       bDesc,
+                                const void                         *B,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t          *reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc,
+                                cudnnReduceTensorOp_t                   reduceTensorOp,
+                                cudnnDataType_t                     reduceTensorCompType,
+                                cudnnNanPropagation_t               reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t          reduceTensorIndices,
+                                cudnnIndicesType_t                  reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
+                                const cudnnReduceTensorDescriptor_t     reduceTensorDesc,
+                                cudnnReduceTensorOp_t                  *reduceTensorOp,
+                                cudnnDataType_t                    *reduceTensorCompType,
+                                cudnnNanPropagation_t              *reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t         *reduceTensorIndices,
+                                cudnnIndicesType_t                 *reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                void                               *indices,
+                                size_t                              indicesSizeInBytes,
+                                void                               *workspace,
+                                size_t                              workspaceSizeInBytes,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *valuePtr ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, valuePtr);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *alpha ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, alpha);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(
+                                cudnnFilterDescriptor_t            *filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, // image data type
+                                cudnnTensorFormat_t                 format,
+                                int                                 k,        // number of output feature maps
+                                int                                 c,        // number of input feature maps
+                                int                                 h,        // height of each input filter
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                cudnnDataType_t                    *dataType, // image data type
+                                cudnnTensorFormat_t                *format,
+                                int                                *k,        // number of output feature maps
+                                int                                *c,        // number of input feature maps
+                                int                                *h,        // height of each input filter
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, // image data type
+                                cudnnTensorFormat_t                 format,
+                                int                                 nbDims,
+                                const int                           filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType, // image data type
+                                cudnnTensorFormat_t                *format,
+                                int                                *nbDims,
+                                int                                 filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t       *convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc,
+                                                             int pad_h,    // zero-padding height
+                                                             int pad_w,    // zero-padding width
+                                                             int u,   // vertical filter stride
+                                                             int v,   // horizontal filter stride
+                                                             int dilation_h, // filter dilation in the vertical dimension
+                                                             int dilation_w, // filter dilation in the horizontal dimension
+                                                             cudnnConvolutionMode_t mode,
+                                                             cudnnDataType_t computeType
+                                                           ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(  const cudnnConvolutionDescriptor_t convDesc,
+                                                            int* pad_h,    // zero-padding height
+                                                            int* pad_w,    // zero-padding width
+                                                            int* u,        // vertical filter stride
+                                                            int* v,        // horizontal filter stride
+                                                            int* dilation_h, // filter dilation in the vertical dimension
+                                                            int* dilation_w, // filter dilation in the horizontal dimension
+                                                            cudnnConvolutionMode_t* mode,
+                                                            cudnnDataType_t *computeType
+                                                         ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc,
+                                int                                 arrayLength,             /* nbDims-2 size */
+                                const int                           padA[],
+                                const int                           filterStrideA[],
+                                const int                           dilationA[],
+                                cudnnConvolutionMode_t              mode,
+                                cudnnDataType_t                     computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                int                                 arrayLengthRequested,
+                                int                                *arrayLength,
+                                int                                 padA[],
+                                int                                 strideA[],
+                                int                                 dilationA[],
+                                cudnnConvolutionMode_t             *mode,
+                                cudnnDataType_t                    *computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDims,
+                                int                                 tensorOuputDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdPreference_t     preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionFwdAlgo_t          *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       zDesc,
+                                const void                         *z,
+                                const cudnnTensorDescriptor_t       biasDesc,
+                                const void                         *bias,
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dbDesc,
+                                void                               *db ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                const int                           requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                                cudnnHandle_t                        handle,
+                                const cudnnTensorDescriptor_t        xDesc,
+                                const void                          *x,
+                                const cudnnTensorDescriptor_t        dyDesc,
+                                const void                          *y,
+                                const cudnnConvolutionDescriptor_t   convDesc,
+                                const cudnnFilterDescriptor_t        dwDesc,
+                                void                                *dw,
+                                const int                            requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
+                                void                                *workSpace,
+                                size_t                               workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                         handle,
+                                const cudnnTensorDescriptor_t         xDesc,
+                                const cudnnTensorDescriptor_t         dyDesc,
+                                const cudnnConvolutionDescriptor_t    convDesc,
+                                const cudnnFilterDescriptor_t         dwDesc,
+                                cudnnConvolutionBwdFilterPreference_t preference,
+                                size_t                                memoryLimitInBytes,
+                                cudnnConvolutionBwdFilterAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       gradDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                void                               *dw ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataPreference_t preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionBwdDataAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnIm2Col(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                void                               *colBuffer ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(
+                                cudnnPoolingDescriptor_t           *poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                cudnnPoolingMode_t                  mode,
+                                cudnnNanPropagation_t               maxpoolingNanOpt,
+                                int                                 windowHeight,
+                                int                                 windowWidth,
+                                int                                 verticalPadding,
+                                int                                 horizontalPadding,
+                                int                                 verticalStride,
+                                int                                 horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *windowHeight,
+                                int                                *windowWidth,
+                                int                                *verticalPadding,
+                                int                                *horizontalPadding,
+                                int                                *verticalStride,
+                                int                                *horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                const cudnnPoolingMode_t            mode,
+                                const cudnnNanPropagation_t         maxpoolingNanOpt,
+                                int                                 nbDims,
+                                const int                           windowDimA[],
+                                const int                           paddingA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                int                                 nbDimsRequested,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *nbDims,
+                                int                                 windowDimA[],
+                                int                                 paddingA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                 nbDims,
+                                int                                 outputTensorDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                          *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(
+                                cudnnActivationDescriptor_t        *activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
+                                cudnnActivationDescriptor_t         activationDesc,
+                                cudnnActivationMode_t               mode,
+                                cudnnNanPropagation_t               reluNanOpt,
+                                double                              coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                cudnnActivationMode_t              *mode,
+                                cudnnNanPropagation_t              *reluNanOpt,
+                                double*                             coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(
+                                cudnnActivationDescriptor_t activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(
+                                cudnnLRNDescriptor_t               *normDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned                            lrnN,
+                                double                              lrnAlpha,
+                                double                              lrnBeta,
+                                double                              lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned*                           lrnN,
+                                double*                             lrnAlpha,
+                                double*                             lrnBeta,
+                                double*                             lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lrnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, // same desc for means, temp, temp2
+                                const void                         *x,
+                                const void                         *means, // if NULL, means are assumed to be zero
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, // same desc for x, means, dy, temp, temp2
+                                const void                         *x,
+                                const void                         *means, // if NULL, means are assumed to be zero
+                                const void                         *dy,
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dXdMeansDesc, // same desc for dx, dMeans
+                                void                               *dx, // output x differential
+                                void                               *dMeans ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
+                                cudnnTensorDescriptor_t             derivedBnDesc,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                cudnnBatchNormMode_t                mode ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(derivedBnDesc, xDesc, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+
+                                const void                         *alpha, // alpha[0] = result blend factor
+                                const void                         *beta,  // beta[0] = dest layer blend factor
+
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     // NxCxHxW
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     // NxCxHxW
+
+                                /* Shared desc for the next 6 tensors in the argument list.
+                                   Data type to be set as follows:
+                                   type = (typeOf(x) == double) ? double : float
+                                   Dimensions for this descriptor depend on normalization mode
+                                   - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+                                    (normalization is performed across NxHxW)
+                                   - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW 
+                                    (normalization is performed across N) */
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+
+                                // 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+
+                                /* MUST use factor=1 in the very first call of a complete training cycle.
+                                   Use a factor=1/(1+n) at N-th call to the function to get
+                                   Cumulative Moving Average (CMA) behavior
+                                   CMA[n] = (x[1]+...+x[n])/n
+                                   Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+                                   ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+                                   CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+                                double                              exponentialAverageFactor,
+
+                                /* Used in Training phase only. 
+                                   runningMean = newMean*factor + runningMean*(1-factor) */
+                                void                               *resultRunningMean,
+                                /* Output in training mode, input in inference. Is the moving average
+                                   of  variance[x] (factor is applied in the same way as for runningMean) */
+                                void                               *resultRunningVariance,
+
+                                /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+                                double                              epsilon,
+
+                                /* Optionally save intermediate results from the forward pass here
+                                   - can be reused to speed up backward pass. NULL if unused */
+                                void                               *resultSaveMean,
+                                void                               *resultSaveInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alpha, // alpha[0] = result blend factor
+                                const void                         *beta,  // beta[0] = dest layer blend factor
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     // NxCxHxW
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     // NxCxHxW
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+                                const void                         *estimatedMean,
+                                const void                         *estimatedVariance,
+                                double                              epsilon ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alphaDataDiff,
+                                const void                         *betaDataDiff,
+                                const void                         *alphaParamDiff,
+                                const void                         *betaParamDiff,
+                                const cudnnTensorDescriptor_t       xDesc, // same desc for x, dx, dy
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t       dBnScaleBiasDesc,
+                                const void                         *bnScale, // bnBias doesn't affect backpropagation
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void                               *dBnScaleResult,
+                                void                               *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double                              epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void                         *savedMean,
+                                const void                         *savedInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( 
+
+                               cudnnSpatialTransformerDescriptor_t        *stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
+                                cudnnSpatialTransformerDescriptor_t         stDesc,
+                                cudnnSamplerType_t                          samplerType, 
+                                cudnnDataType_t                             dataType,
+                                const int                                   nbDims,
+                                const int                                   dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
+                                 cudnnSpatialTransformerDescriptor_t        stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *theta,
+                                 void                                      *grid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, theta, grid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *dgrid,
+                                 void                                      *dtheta) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, dgrid, dtheta);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,                                    
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *grid,
+                                 const void                                *beta,
+                                 cudnnTensorDescriptor_t                    yDesc,
+                                 void                                      *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *beta,
+                                 const cudnnTensorDescriptor_t              dxDesc,
+                                 void                                      *dx,
+                                 const void                                *alphaDgrid,
+                                 const cudnnTensorDescriptor_t              dyDesc,
+                                 const void                                *dy,
+                                 const void                                *grid,
+                                 const void                                *betaDgrid,
+                                 void                                      *dgrid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(xdesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                    cudnnHandle_t handle,
+                                                    float dropout, 
+                                                    void * states, 
+                                                    size_t stateSizeInBytes, 
+                                                    unsigned long long seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, 
+                                                      const cudnnDropoutDescriptor_t dropoutDesc,
+                                                      const cudnnTensorDescriptor_t xdesc, 
+                                                      const void * x,
+                                                      const cudnnTensorDescriptor_t ydesc,
+                                                      void * y,
+                                                      void * reserveSpace,
+                                                      size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, 
+                                               const cudnnDropoutDescriptor_t dropoutDesc,
+                                               const cudnnTensorDescriptor_t dydesc, 
+                                               const void * dy,
+                                               const cudnnTensorDescriptor_t dxdesc,
+                                               void * dx,
+                                               void * reserveSpace,
+                                               size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                                             const int minibatch,
+                                             const cudnnDataType_t dataType,
+                                             cudnnPersistentRNNPlan_t * plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, minibatch, dataType, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                                          cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, 
+                                                cudnnRNNDescriptor_t rnnDesc,
+                                                const int hiddenSize, 
+                                                const int numLayers, 
+                                                cudnnDropoutDescriptor_t dropoutDesc, // Between layers, not between recurrent steps.
+                                                cudnnRNNInputMode_t inputMode,                                                 
+                                                cudnnDirectionMode_t direction, 
+                                                cudnnRNNMode_t mode, 
+                                                cudnnRNNAlgo_t algo, 
+                                                cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnRNNDescriptor_t rnnDesc,
+                                                int hiddenSize, 
+                                                int numLayers, 
+                                                cudnnDropoutDescriptor_t dropoutDesc, // Between layers, not between recurrent steps.
+                                                cudnnRNNInputMode_t inputMode,                                                 
+                                                cudnnDirectionMode_t direction, 
+                                                cudnnRNNMode_t mode, 
+                                                cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t              handle,
+                                                    const cudnnRNNDescriptor_t rnnDesc,  
+                                                    const int seqLength, 
+                                                    const cudnnTensorDescriptor_t    *xDesc,
+                                                    size_t                     *sizeInBytes
+                                                    ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t              handle,
+                                                          const cudnnRNNDescriptor_t rnnDesc,  
+                                                          const int seqLength, 
+                                                          const cudnnTensorDescriptor_t    *xDesc,
+                                                          size_t                     *sizeInBytes
+                                                    ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t              handle,
+                                                 const cudnnRNNDescriptor_t rnnDesc,  
+                                                 const cudnnTensorDescriptor_t    xDesc,                                                    
+                                                 size_t                     *sizeInBytes,
+                                                 cudnnDataType_t dataType
+                                                    ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t              handle,
+                             const cudnnRNNDescriptor_t rnnDesc,  
+                             const int layer,
+                             const cudnnTensorDescriptor_t xDesc, 
+                             const cudnnFilterDescriptor_t wDesc, 
+                             const void * w, 
+                             const int linLayerID,  
+                             cudnnFilterDescriptor_t linLayerMatDesc, 
+                             void ** linLayerMat
+                             ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t              handle,
+                             const cudnnRNNDescriptor_t rnnDesc,  
+                             const int layer,
+                             const cudnnTensorDescriptor_t xDesc, 
+                             const cudnnFilterDescriptor_t wDesc, 
+                             const void * w, 
+                             const int linLayerID, 
+                             cudnnFilterDescriptor_t linLayerBiasDesc, 
+                             void ** linLayerBias                       
+                             ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, 
+                                                    const cudnnRNNDescriptor_t rnnDesc, 
+                                                    const int seqLength, 
+                                                    const cudnnTensorDescriptor_t * xDesc, 
+                                                    const void * x, 
+                                                    const cudnnTensorDescriptor_t hxDesc, 
+                                                    const void * hx, 
+                                                    const cudnnTensorDescriptor_t cxDesc, 
+                                                    const void * cx, 
+                                                    const cudnnFilterDescriptor_t wDesc, 
+                                                    const void * w, 
+                                                    const cudnnTensorDescriptor_t *yDesc,  
+                                                    void * y, 
+                                                    const cudnnTensorDescriptor_t hyDesc, 
+                                                    void * hy, 
+                                                    const cudnnTensorDescriptor_t cyDesc, 
+                                                    void * cy, 
+                                                    void * workspace, 
+                                                    size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, 
+                                                   const cudnnRNNDescriptor_t rnnDesc, 
+                                                   const int seqLength, 
+                                                   const cudnnTensorDescriptor_t *xDesc, 
+                                                   const void * x, 
+                                                   const cudnnTensorDescriptor_t hxDesc, 
+                                                   const void * hx, 
+                                                   const cudnnTensorDescriptor_t cxDesc, 
+                                                   const void * cx, 
+                                                   const cudnnFilterDescriptor_t wDesc, 
+                                                   const void * w, 
+                                                   const cudnnTensorDescriptor_t *yDesc,  
+                                                   void * y, 
+                                                   const cudnnTensorDescriptor_t hyDesc, 
+                                                   void * hy, 
+                                                   const cudnnTensorDescriptor_t cyDesc, 
+                                                   void * cy, 
+                                                   void * workspace, 
+                                                   size_t workSpaceSizeInBytes,
+                                                   void * reserveSpace, 
+                                                   size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, 
+                                                const cudnnRNNDescriptor_t rnnDesc, 
+                                                const int seqLength, 
+                                                const cudnnTensorDescriptor_t * yDesc, 
+                                                const void * y,                                                
+                                                const cudnnTensorDescriptor_t * dyDesc, 
+                                                const void * dy, 
+                                                const cudnnTensorDescriptor_t dhyDesc, 
+                                                const void * dhy, 
+                                                const cudnnTensorDescriptor_t dcyDesc, 
+                                                const void * dcy, 
+                                                const cudnnFilterDescriptor_t wDesc, 
+                                                const void * w, 
+                                                const cudnnTensorDescriptor_t hxDesc, 
+                                                const void * hx,                                                                  
+                                                const cudnnTensorDescriptor_t cxDesc, 
+                                                const void * cx,                                                 
+                                                const cudnnTensorDescriptor_t * dxDesc, 
+                                                void * dx, 
+                                                const cudnnTensorDescriptor_t dhxDesc,
+                                                void * dhx,
+                                                const cudnnTensorDescriptor_t dcxDesc,
+                                                void * dcx,
+                                                void * workspace,
+                                                size_t workSpaceSizeInBytes,
+                                                void * reserveSpace, 
+                                                size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, 
+                                                   const cudnnRNNDescriptor_t rnnDesc, 
+                                                   const int seqLength, 
+                                                   const cudnnTensorDescriptor_t * xDesc, 
+                                                   const void * x, 
+                                                   const cudnnTensorDescriptor_t hxDesc, 
+                                                   const void * hx,                                                   
+                                                   const cudnnTensorDescriptor_t * yDesc, 
+                                                   const void * y,
+                                                   const void * workspace, 
+                                                   size_t workSpaceSizeInBytes, 
+                                                   const cudnnFilterDescriptor_t dwDesc, 
+                                                   void * dw,
+                                                   const void * reserveSpace, 
+                                                   size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v4(
+                                cudnnConvolutionDescriptor_t        convDesc,
+                                int                                 pad_h,      // zero-padding height
+                                int                                 pad_w,      // zero-padding width
+                                int                                 u,          // vertical filter stride
+                                int                                 v,          // horizontal filter stride
+                                int                                 dilation_h, // filter dilation in the vertical dimension
+                                int                                 dilation_w, // filter dilation in the horizontal dimension
+                                cudnnConvolutionMode_t              mode ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor_v4");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v5( cudnnConvolutionDescriptor_t convDesc,
+                                                             int pad_h,    // zero-padding height
+                                                             int pad_w,    // zero-padding width
+                                                             int u,   // vertical filter stride
+                                                             int v,   // horizontal filter stride
+                                                             int dilation_h, // filter dilation in the vertical dimension
+                                                             int dilation_w, // filter dilation in the horizontal dimension
+                                                             cudnnConvolutionMode_t mode,
+                                                             cudnnDataType_t computeType
+                                                           ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v4(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                int                                *pad_h,    // zero-padding height
+                                int                                *pad_w,    // zero-padding width
+                                int                                *u,        // vertical filter stride
+                                int                                *v,        // horizontal filter stride
+                                int                                *dilation_h, // filter dilation in the vertical dimension
+                                int                                *dilation_w, // filter dilation in the horizontal dimension
+                                cudnnConvolutionMode_t             *mode ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor_v4");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v5(  const cudnnConvolutionDescriptor_t convDesc,
+                                                            int* pad_h,    // zero-padding height
+                                                            int* pad_w,    // zero-padding width
+                                                            int* u,        // vertical filter stride
+                                                            int* v,        // horizontal filter stride
+                                                            int* dilation_h, // filter dilation in the vertical dimension
+                                                            int* dilation_w, // filter dilation in the horizontal dimension
+                                                            cudnnConvolutionMode_t* mode,
+                                                            cudnnDataType_t *computeType
+                                                         ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_7_0.inc b/tensorflow/stream_executor/cuda/cudnn_7_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ac6b0dd823e71c84a08e41aa11509973fa5b60d2
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_7_0.inc
@@ -0,0 +1,1946 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+size_t CUDNNWINAPI cudnnGetVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+const char *  CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
+  using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
+  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
+  return func_ptr(status);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(
+                                cudnnHandle_t                       handle,
+                                cudnnStatus_t                      *rstatus,
+                                cudnnErrQueryMode_t                 mode,
+                                cudnnRuntimeTag_t                  *tag ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rstatus, mode, tag);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreate        (cudnnHandle_t *handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroy       (cudnnHandle_t handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetStream     (cudnnHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetStream     (cudnnHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(
+                                cudnnTensorDescriptor_t            *tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                int                                 n,        /* number of inputs (batch size) */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of input section */
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                int                                 n,        /* number of inputs (batch size) */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of input section */
+                                int                                 w,        /* width of input section */
+                                int                                 nStride,
+                                int                                 cStride,
+                                int                                 hStride,
+                                int                                 wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                int                                *n,        /* number of inputs (batch size) */
+                                int                                *c,        /* number of input feature maps  */
+                                int                                *h,        /* height of input section */
+                                int                                *w,        /* width of input section */
+                                int                                *nStride,
+                                int                                *cStride,
+                                int                                *hStride,
+                                int                                *wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType,
+                                int                                *nbDims,
+                                int                                 dimA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                size_t                              *size) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, size);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnAddTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t          *opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc,
+                                cudnnOpTensorOp_t                   opTensorOp,
+                                cudnnDataType_t                     opTensorCompType,
+                                cudnnNanPropagation_t               opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                cudnnOpTensorOp_t                  *opTensorOp,
+                                cudnnDataType_t                    *opTensorCompType,
+                                cudnnNanPropagation_t              *opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       bDesc,
+                                const void                         *B,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t          *reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc,
+                                cudnnReduceTensorOp_t                   reduceTensorOp,
+                                cudnnDataType_t                     reduceTensorCompType,
+                                cudnnNanPropagation_t               reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t          reduceTensorIndices,
+                                cudnnIndicesType_t                  reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
+                                const cudnnReduceTensorDescriptor_t     reduceTensorDesc,
+                                cudnnReduceTensorOp_t                  *reduceTensorOp,
+                                cudnnDataType_t                    *reduceTensorCompType,
+                                cudnnNanPropagation_t              *reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t         *reduceTensorIndices,
+                                cudnnIndicesType_t                 *reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                void                               *indices,
+                                size_t                              indicesSizeInBytes,
+                                void                               *workspace,
+                                size_t                              workspaceSizeInBytes,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *valuePtr ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, valuePtr);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *alpha ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, alpha);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(
+                                cudnnFilterDescriptor_t            *filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                cudnnTensorFormat_t                 format,
+                                int                                 k,        /* number of output feature maps */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of each input filter */
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                cudnnTensorFormat_t                *format,
+                                int                                *k,        /* number of output feature maps */
+                                int                                *c,        /* number of input feature maps */
+                                int                                *h,        /* height of each input filter */
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                cudnnTensorFormat_t                 format,
+                                int                                 nbDims,
+                                const int                           filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                cudnnTensorFormat_t                *format,
+                                int                                *nbDims,
+                                int                                 filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t       *convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc,
+                                                       cudnnMathType_t mathType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc,
+                                                       cudnnMathType_t *mathType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc,
+                                                         int groupCount ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc,
+                                                         int *groupCount ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc,
+                                                             int pad_h,    /* zero-padding height */
+                                                             int pad_w,    /* zero-padding width */
+                                                             int u,   /* vertical filter stride */
+                                                             int v,   /* horizontal filter stride */
+                                                             int dilation_h, /* filter dilation in the vertical dimension */
+                                                             int dilation_w, /* filter dilation in the horizontal dimension */
+                                                             cudnnConvolutionMode_t mode,
+                                                             cudnnDataType_t computeType
+                                                           ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(  const cudnnConvolutionDescriptor_t convDesc,
+                                                            int* pad_h,    /* zero-padding height */
+                                                            int* pad_w,    /* zero-padding width */
+                                                            int* u,        /* vertical filter stride */
+                                                            int* v,        /* horizontal filter stride */
+                                                            int* dilation_h, /* filter dilation in the vertical dimension */
+                                                            int* dilation_w, /* filter dilation in the horizontal dimension */
+                                                            cudnnConvolutionMode_t* mode,
+                                                            cudnnDataType_t *computeType
+                                                         ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc,
+                                int                                 arrayLength,             /* nbDims-2 size */
+                                const int                           padA[],
+                                const int                           filterStrideA[],
+                                const int                           dilationA[],
+                                cudnnConvolutionMode_t              mode,
+                                cudnnDataType_t                     computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                int                                 arrayLengthRequested,
+                                int                                *arrayLength,
+                                int                                 padA[],
+                                int                                 strideA[],
+                                int                                 dilationA[],
+                                cudnnConvolutionMode_t             *mode,
+                                cudnnDataType_t                    *computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDims,
+                                int                                 tensorOuputDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                       int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdPreference_t     preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionFwdAlgo_t          *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
+                                cudnnHandle_t                      handle,
+                                const cudnnTensorDescriptor_t      srcDesc,
+                                const cudnnFilterDescriptor_t      filterDesc,
+                                const cudnnConvolutionDescriptor_t convDesc,
+                                const cudnnTensorDescriptor_t      destDesc,
+                                const int                          requestedAlgoCount,
+                                int                               *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t     *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       zDesc,
+                                const void                         *z,
+                                const cudnnTensorDescriptor_t       biasDesc,
+                                const void                         *bias,
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dbDesc,
+                                void                               *db ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                              int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                const int                           requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                                cudnnHandle_t                        handle,
+                                const cudnnTensorDescriptor_t        xDesc,
+                                const void                          *x,
+                                const cudnnTensorDescriptor_t        dyDesc,
+                                const void                          *y,
+                                const cudnnConvolutionDescriptor_t   convDesc,
+                                const cudnnFilterDescriptor_t        dwDesc,
+                                void                                *dw,
+                                const int                            requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
+                                void                                *workSpace,
+                                size_t                               workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                         handle,
+                                const cudnnTensorDescriptor_t         xDesc,
+                                const cudnnTensorDescriptor_t         dyDesc,
+                                const cudnnConvolutionDescriptor_t    convDesc,
+                                const cudnnFilterDescriptor_t         dwDesc,
+                                cudnnConvolutionBwdFilterPreference_t preference,
+                                size_t                                memoryLimitInBytes,
+                                cudnnConvolutionBwdFilterAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+                                cudnnHandle_t                         handle,
+                                const cudnnTensorDescriptor_t         srcDesc,
+                                const cudnnTensorDescriptor_t         diffDesc,
+                                const cudnnConvolutionDescriptor_t    convDesc,
+                                const cudnnFilterDescriptor_t         gradDesc,
+                                const int                             requestedAlgoCount,
+                                int                                  *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t  *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       gradDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                void                               *dw ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                            int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataPreference_t preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionBwdDataAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                const cudnnTensorDescriptor_t       diffDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       gradDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnIm2Col(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                void                               *colBuffer ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(
+                                cudnnPoolingDescriptor_t           *poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                cudnnPoolingMode_t                  mode,
+                                cudnnNanPropagation_t               maxpoolingNanOpt,
+                                int                                 windowHeight,
+                                int                                 windowWidth,
+                                int                                 verticalPadding,
+                                int                                 horizontalPadding,
+                                int                                 verticalStride,
+                                int                                 horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *windowHeight,
+                                int                                *windowWidth,
+                                int                                *verticalPadding,
+                                int                                *horizontalPadding,
+                                int                                *verticalStride,
+                                int                                *horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                const cudnnPoolingMode_t            mode,
+                                const cudnnNanPropagation_t         maxpoolingNanOpt,
+                                int                                 nbDims,
+                                const int                           windowDimA[],
+                                const int                           paddingA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                int                                 nbDimsRequested,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *nbDims,
+                                int                                 windowDimA[],
+                                int                                 paddingA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                 nbDims,
+                                int                                 outputTensorDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                          *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(
+                                cudnnActivationDescriptor_t        *activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
+                                cudnnActivationDescriptor_t         activationDesc,
+                                cudnnActivationMode_t               mode,
+                                cudnnNanPropagation_t               reluNanOpt,
+                                double                              coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                cudnnActivationMode_t              *mode,
+                                cudnnNanPropagation_t              *reluNanOpt,
+                                double*                             coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(
+                                cudnnActivationDescriptor_t activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(
+                                cudnnLRNDescriptor_t               *normDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned                            lrnN,
+                                double                              lrnAlpha,
+                                double                              lrnBeta,
+                                double                              lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned*                           lrnN,
+                                double*                             lrnAlpha,
+                                double*                             lrnBeta,
+                                double*                             lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lrnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for means, temp, temp2 */
+                                const void                         *x,
+                                const void                         *means, /* if NULL, means are assumed to be zero */
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                const void                         *x,
+                                const void                         *means, /* if NULL, means are assumed to be zero */
+                                const void                         *dy,
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dXdMeansDesc, /* same desc for dx, dMeans */
+                                void                               *dx, /* output x differential */
+                                void                               *dMeans ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
+                                cudnnTensorDescriptor_t             derivedBnDesc,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                cudnnBatchNormMode_t                mode ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(derivedBnDesc, xDesc, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+
+                                const void                         *alpha, /* alpha[0] = result blend factor */
+                                const void                         *beta,  /* beta[0] = dest layer blend factor */
+
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     /* NxCxHxW */
+
+                                /* Shared desc for the next 6 tensors in the argument list.
+                                   Data type to be set as follows:
+                                   type = (typeOf(x) == double) ? double : float
+                                   Dimensions for this descriptor depend on normalization mode
+                                   - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+                                    (normalization is performed across NxHxW)
+                                   - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW 
+                                    (normalization is performed across N) */
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+
+                                /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+
+                                /* MUST use factor=1 in the very first call of a complete training cycle.
+                                   Use a factor=1/(1+n) at N-th call to the function to get
+                                   Cumulative Moving Average (CMA) behavior
+                                   CMA[n] = (x[1]+...+x[n])/n
+                                   Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+                                   ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+                                   CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+                                double                              exponentialAverageFactor,
+
+                                /* Used in Training phase only. 
+                                   runningMean = newMean*factor + runningMean*(1-factor) */
+                                void                               *resultRunningMean,
+                                /* Output in training mode, input in inference. Is the moving average
+                                   of  variance[x] (factor is applied in the same way as for runningMean) */
+                                void                               *resultRunningVariance,
+
+                                /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+                                double                              epsilon,
+
+                                /* Optionally save intermediate results from the forward pass here
+                                   - can be reused to speed up backward pass. NULL if unused */
+                                void                               *resultSaveMean,
+                                void                               *resultSaveInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alpha, /* alpha[0] = result blend factor */
+                                const void                         *beta,  /* beta[0] = dest layer blend factor */
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+                                const void                         *estimatedMean,
+                                const void                         *estimatedVariance,
+                                double                              epsilon ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alphaDataDiff,
+                                const void                         *betaDataDiff,
+                                const void                         *alphaParamDiff,
+                                const void                         *betaParamDiff,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for x, dx, dy */
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t       dBnScaleBiasDesc,
+                                const void                         *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void                               *dBnScaleResult,
+                                void                               *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double                              epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void                         *savedMean,
+                                const void                         *savedInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
+                               cudnnSpatialTransformerDescriptor_t        *stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
+                                cudnnSpatialTransformerDescriptor_t         stDesc,
+                                cudnnSamplerType_t                          samplerType, 
+                                cudnnDataType_t                             dataType,
+                                const int                                   nbDims,
+                                const int                                   dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
+                                 cudnnSpatialTransformerDescriptor_t        stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *theta,
+                                 void                                      *grid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, theta, grid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *dgrid,
+                                 void                                      *dtheta) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, dgrid, dtheta);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,                                    
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *grid,
+                                 const void                                *beta,
+                                 cudnnTensorDescriptor_t                    yDesc,
+                                 void                                      *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *beta,
+                                 const cudnnTensorDescriptor_t              dxDesc,
+                                 void                                      *dx,
+                                 const void                                *alphaDgrid,
+                                 const cudnnTensorDescriptor_t              dyDesc,
+                                 const void                                *dy,
+                                 const void                                *grid,
+                                 const void                                *betaDgrid,
+                                 void                                      *dgrid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(xdesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                    cudnnHandle_t            handle,
+                                                    float                    dropout, 
+                                                    void *                   states, 
+                                                    size_t                   stateSizeInBytes, 
+                                                    unsigned long long       seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                        cudnnHandle_t            handle,
+                                                        float                    dropout, 
+                                                        void *                   states, 
+                                                        size_t                   stateSizeInBytes, 
+                                                        unsigned long long       seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                    cudnnHandle_t            handle,
+                                                    float *                  dropout, 
+                                                    void **                  states,
+                                                    unsigned long long *     seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t                  handle, 
+                                              const cudnnDropoutDescriptor_t dropoutDesc,
+                                              const cudnnTensorDescriptor_t  xdesc, 
+                                              const void *                   x,
+                                              const cudnnTensorDescriptor_t  ydesc,
+                                              void *                         y,
+                                              void *                         reserveSpace,
+                                              size_t                         reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t                  handle, 
+                                               const cudnnDropoutDescriptor_t dropoutDesc,
+                                               const cudnnTensorDescriptor_t  dydesc, 
+                                               const void *                   dy,
+                                               const cudnnTensorDescriptor_t  dxdesc,
+                                               void *                         dx,
+                                               void *                         reserveSpace,
+                                               size_t                         reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t       rnnDesc,
+                                                       const int                  minibatch,
+                                                       const cudnnDataType_t      dataType,
+                                                       cudnnPersistentRNNPlan_t * plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, minibatch, dataType, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                                                    cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t            handle,
+                                                   cudnnRNNDescriptor_t     rnnDesc,
+                                                   const int                hiddenSize,
+                                                   const int                numLayers,
+                                                   cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */
+                                                   cudnnRNNInputMode_t      inputMode,          
+                                                   cudnnDirectionMode_t     direction,
+                                                   cudnnRNNMode_t           mode,
+                                                   cudnnRNNAlgo_t           algo,
+                                                   cudnnDataType_t          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t              cudnnHandle,
+                                                cudnnRNNDescriptor_t       rnnDesc,
+                                                int *                      hiddenSize, 
+                                                int *                      numLayers, 
+                                                cudnnDropoutDescriptor_t * dropoutDesc,
+                                                cudnnRNNInputMode_t *      inputMode, 
+                                                cudnnDirectionMode_t *     direction, 
+                                                cudnnRNNMode_t *           mode, 
+                                                cudnnRNNAlgo_t *           algo, 
+                                                cudnnDataType_t *          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(cudnnHandle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType (cudnnRNNDescriptor_t desc, cudnnMathType_t math) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(desc, math);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t              handle,
+                                                    const cudnnRNNDescriptor_t rnnDesc,  
+                                                    const int seqLength, 
+                                                    const cudnnTensorDescriptor_t    *xDesc,
+                                                    size_t                     *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t              handle,
+                                                          const cudnnRNNDescriptor_t rnnDesc,  
+                                                          const int                  seqLength,
+                                                          const cudnnTensorDescriptor_t    *xDesc,
+                                                          size_t                   *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t                    handle,
+                                                 const cudnnRNNDescriptor_t       rnnDesc,  
+                                                 const cudnnTensorDescriptor_t    xDesc,
+                                                 size_t                          *sizeInBytes,
+                                                 cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t              handle,
+                                                           const cudnnRNNDescriptor_t rnnDesc, 
+                                                           const int layer,
+                                                           const cudnnTensorDescriptor_t xDesc,
+                                                           const cudnnFilterDescriptor_t wDesc,
+                                                           const void * w, 
+                                                           const int linLayerID,  
+                                                           cudnnFilterDescriptor_t linLayerMatDesc,
+                                                           void ** linLayerMat) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t              handle,
+                                                         const cudnnRNNDescriptor_t rnnDesc, 
+                                                         const int layer,
+                                                         const cudnnTensorDescriptor_t xDesc, 
+                                                         const cudnnFilterDescriptor_t wDesc,
+                                                         const void * w,
+                                                         const int linLayerID,
+                                                         cudnnFilterDescriptor_t linLayerBiasDesc,
+                                                         void ** linLayerBias) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle,
+                                                    const cudnnRNNDescriptor_t rnnDesc,
+                                                    const int seqLength,
+                                                    const cudnnTensorDescriptor_t * xDesc,
+                                                    const void * x,
+                                                    const cudnnTensorDescriptor_t hxDesc,
+                                                    const void * hx,
+                                                    const cudnnTensorDescriptor_t cxDesc,
+                                                    const void * cx,
+                                                    const cudnnFilterDescriptor_t wDesc,
+                                                    const void * w,
+                                                    const cudnnTensorDescriptor_t *yDesc,
+                                                    void * y,
+                                                    const cudnnTensorDescriptor_t hyDesc,
+                                                    void * hy,
+                                                    const cudnnTensorDescriptor_t cyDesc,
+                                                    void * cy,
+                                                    void * workspace,
+                                                    size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle,
+                                                   const cudnnRNNDescriptor_t rnnDesc,
+                                                   const int seqLength,
+                                                   const cudnnTensorDescriptor_t *xDesc,
+                                                   const void * x,
+                                                   const cudnnTensorDescriptor_t hxDesc,
+                                                   const void * hx,
+                                                   const cudnnTensorDescriptor_t cxDesc,
+                                                   const void * cx,
+                                                   const cudnnFilterDescriptor_t wDesc,
+                                                   const void * w,
+                                                   const cudnnTensorDescriptor_t *yDesc,
+                                                   void * y,
+                                                   const cudnnTensorDescriptor_t hyDesc,
+                                                   void * hy,
+                                                   const cudnnTensorDescriptor_t cyDesc,
+                                                   void * cy,
+                                                   void * workspace,
+                                                   size_t workSpaceSizeInBytes,
+                                                   void * reserveSpace,
+                                                   size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle,
+                                                const cudnnRNNDescriptor_t rnnDesc,
+                                                const int seqLength,
+                                                const cudnnTensorDescriptor_t * yDesc,
+                                                const void * y,
+                                                const cudnnTensorDescriptor_t * dyDesc,
+                                                const void * dy,
+                                                const cudnnTensorDescriptor_t dhyDesc,
+                                                const void * dhy,
+                                                const cudnnTensorDescriptor_t dcyDesc,
+                                                const void * dcy,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void * w,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void * hx,
+                                                const cudnnTensorDescriptor_t cxDesc,
+                                                const void * cx,
+                                                const cudnnTensorDescriptor_t * dxDesc,
+                                                void * dx,
+                                                const cudnnTensorDescriptor_t dhxDesc,
+                                                void * dhx,
+                                                const cudnnTensorDescriptor_t dcxDesc,
+                                                void * dcx,
+                                                void * workspace,
+                                                size_t workSpaceSizeInBytes,
+                                                void * reserveSpace,
+                                                size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle,
+                                                   const cudnnRNNDescriptor_t rnnDesc,
+                                                   const int seqLength,
+                                                   const cudnnTensorDescriptor_t * xDesc,
+                                                   const void * x,
+                                                   const cudnnTensorDescriptor_t hxDesc,
+                                                   const void * hx,
+                                                   const cudnnTensorDescriptor_t * yDesc, 
+                                                   const void * y,
+                                                   const void * workspace, 
+                                                   size_t workSpaceSizeInBytes, 
+                                                   const cudnnFilterDescriptor_t dwDesc, 
+                                                   void * dw,
+                                                   const void * reserveSpace, 
+                                                   size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
+                                cudnnCTCLossDescriptor_t         ctcLossDesc,
+                                cudnnDataType_t                  compType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
+                                cudnnCTCLossDescriptor_t         ctcLossDesc,
+                                cudnnDataType_t*                 compType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, 
+                                        const cudnnTensorDescriptor_t probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size)  */
+                                        const void * probs,                          /* probabilities after softmax, in GPU memory */
+                                        const int * labels,                          /* labels, in CPU memory */
+                                        const int * labelLengths,                    /* the length of each label, in CPU memory */
+                                        const int * inputLengths,                    /* the lengths of timing steps in each batch, in CPU memory */
+                                        void * costs,                                /* the returned costs of CTC, in GPU memory */
+                                        const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+                                        const void * gradients,                      /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+                                        cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+                                        cudnnCTCLossDescriptor_t ctcLossDesc,
+                                        void * workspace,                            /* pointer to the workspace, in GPU memory */
+                                        size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       probsDesc,       /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */
+                                const cudnnTensorDescriptor_t       gradientsDesc,   /* Tensor descriptor for gradients, the dimensions are T,N,A. To compute costs only, set it to NULL */
+                                const int                          * labels,         /* labels, in CPU memory */
+                                const int                          * labelLengths,   /* the length of each label, in CPU memory */
+                                const int                          * inputLengths,   /* the lengths of timing steps in each batch, in CPU memory */
+                                cudnnCTCLossAlgo_t                  algo,            /* algorithm selected, supported now 0 and 1 */
+                                cudnnCTCLossDescriptor_t            ctcLossDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t            handle,
+                                                   cudnnRNNDescriptor_t     rnnDesc,
+                                                   const int                hiddenSize,
+                                                   const int                numLayers,
+                                                   cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */
+                                                   cudnnRNNInputMode_t      inputMode,          
+                                                   cudnnDirectionMode_t     direction,
+                                                   cudnnRNNMode_t           mode,
+                                                   cudnnRNNAlgo_t           algo,
+                                                   cudnnDataType_t          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t     rnnDesc,
+                                                int                      hiddenSize,
+                                                int                      numLayers,
+                                                cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */
+                                                cudnnRNNInputMode_t      inputMode,
+                                                cudnnDirectionMode_t     direction,
+                                                cudnnRNNMode_t           mode,
+                                                cudnnDataType_t          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_7_1.inc b/tensorflow/stream_executor/cuda/cudnn_7_1.inc
new file mode 100644
index 0000000000000000000000000000000000000000..21abd7fdb16ad467f698970c8cd64de8017eff65
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_7_1.inc
@@ -0,0 +1,2281 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+size_t CUDNNWINAPI cudnnGetVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+size_t CUDNNWINAPI cudnnGetCudartVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+const char *  CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) {
+  using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
+  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
+  return func_ptr(status);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(
+                                cudnnHandle_t                       handle,
+                                cudnnStatus_t                      *rstatus,
+                                cudnnErrQueryMode_t                 mode,
+                                cudnnRuntimeTag_t                  *tag ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rstatus, mode, tag);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreate        (cudnnHandle_t *handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroy       (cudnnHandle_t handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetStream     (cudnnHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetStream     (cudnnHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(
+                                cudnnTensorDescriptor_t            *tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                int                                 n,        /* number of inputs (batch size) */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of input section */
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                int                                 n,        /* number of inputs (batch size) */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of input section */
+                                int                                 w,        /* width of input section */
+                                int                                 nStride,
+                                int                                 cStride,
+                                int                                 hStride,
+                                int                                 wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                int                                *n,        /* number of inputs (batch size) */
+                                int                                *c,        /* number of input feature maps  */
+                                int                                *h,        /* height of input section */
+                                int                                *w,        /* width of input section */
+                                int                                *nStride,
+                                int                                *cStride,
+                                int                                *hStride,
+                                int                                *wStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx(
+                                cudnnTensorDescriptor_t             tensorDesc,
+                                cudnnTensorFormat_t                 format,
+                                cudnnDataType_t                     dataType,
+                                int                                 nbDims,
+                                const int                           dimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType,
+                                int                                *nbDims,
+                                int                                 dimA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes(
+                                const cudnnTensorDescriptor_t       tensorDesc,
+                                size_t                              *size) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, size);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(
+                                cudnnTensorDescriptor_t             tensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnTransformTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnAddTensor(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t          *opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc,
+                                cudnnOpTensorOp_t                   opTensorOp,
+                                cudnnDataType_t                     opTensorCompType,
+                                cudnnNanPropagation_t               opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor(
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                cudnnOpTensorOp_t                  *opTensorOp,
+                                cudnnDataType_t                    *opTensorCompType,
+                                cudnnNanPropagation_t              *opTensorNanOpt ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(
+                                cudnnOpTensorDescriptor_t           opTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnOpTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnOpTensorDescriptor_t     opTensorDesc,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       bDesc,
+                                const void                         *B,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t          *reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc,
+                                cudnnReduceTensorOp_t                   reduceTensorOp,
+                                cudnnDataType_t                     reduceTensorCompType,
+                                cudnnNanPropagation_t               reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t          reduceTensorIndices,
+                                cudnnIndicesType_t                  reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor(
+                                const cudnnReduceTensorDescriptor_t     reduceTensorDesc,
+                                cudnnReduceTensorOp_t                  *reduceTensorOp,
+                                cudnnDataType_t                    *reduceTensorCompType,
+                                cudnnNanPropagation_t              *reduceTensorNanOpt,
+                                cudnnReduceTensorIndices_t         *reduceTensorIndices,
+                                cudnnIndicesType_t                 *reduceTensorIndicesType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor(
+                                cudnnReduceTensorDescriptor_t           reduceTensorDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnReduceTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                                void                               *indices,
+                                size_t                              indicesSizeInBytes,
+                                void                               *workspace,
+                                size_t                              workspaceSizeInBytes,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       aDesc,
+                                const void                         *A,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       cDesc,
+                                void                               *C ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *valuePtr ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, valuePtr);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const void                         *alpha ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, alpha);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(
+                                cudnnFilterDescriptor_t            *filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                cudnnTensorFormat_t                 format,
+                                int                                 k,        /* number of output feature maps */
+                                int                                 c,        /* number of input feature maps */
+                                int                                 h,        /* height of each input filter */
+                                int                                 w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                cudnnTensorFormat_t                *format,
+                                int                                *k,        /* number of output feature maps */
+                                int                                *c,        /* number of input feature maps */
+                                int                                *h,        /* height of each input filter */
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc,
+                                cudnnDataType_t                     dataType, /* image data type */
+                                cudnnTensorFormat_t                 format,
+                                int                                 nbDims,
+                                const int                           filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor(
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDimsRequested,
+                                cudnnDataType_t                    *dataType, /* image data type */
+                                cudnnTensorFormat_t                *format,
+                                int                                *nbDims,
+                                int                                 filterDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(
+                                cudnnFilterDescriptor_t             filterDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t       *convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc,
+                                                       cudnnMathType_t mathType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc,
+                                                       cudnnMathType_t *mathType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc,
+                                                         int groupCount ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc,
+                                                         int *groupCount ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc,
+                                                             int pad_h,    /* zero-padding height */
+                                                             int pad_w,    /* zero-padding width */
+                                                             int u,   /* vertical filter stride */
+                                                             int v,   /* horizontal filter stride */
+                                                             int dilation_h, /* filter dilation in the vertical dimension */
+                                                             int dilation_w, /* filter dilation in the horizontal dimension */
+                                                             cudnnConvolutionMode_t mode,
+                                                             cudnnDataType_t computeType
+                                                           ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor(  const cudnnConvolutionDescriptor_t convDesc,
+                                                            int* pad_h,    /* zero-padding height */
+                                                            int* pad_w,    /* zero-padding width */
+                                                            int* u,        /* vertical filter stride */
+                                                            int* v,        /* horizontal filter stride */
+                                                            int* dilation_h, /* filter dilation in the vertical dimension */
+                                                            int* dilation_w, /* filter dilation in the horizontal dimension */
+                                                            cudnnConvolutionMode_t* mode,
+                                                            cudnnDataType_t *computeType
+                                                         ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc,
+                                int                                 arrayLength,             /* nbDims-2 size */
+                                const int                           padA[],
+                                const int                           filterStrideA[],
+                                const int                           dilationA[],
+                                cudnnConvolutionMode_t              mode,
+                                cudnnDataType_t                     computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                int                                 arrayLengthRequested,
+                                int                                *arrayLength,
+                                int                                 padA[],
+                                int                                 strideA[],
+                                int                                 dilationA[],
+                                cudnnConvolutionMode_t             *mode,
+                                cudnnDataType_t                    *computeType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim(
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                int                                 nbDims,
+                                int                                 tensorOuputDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(
+                                cudnnConvolutionDescriptor_t        convDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                       int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t      *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdPreference_t     preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionFwdAlgo_t          *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7(
+                                cudnnHandle_t                      handle,
+                                const cudnnTensorDescriptor_t      srcDesc,
+                                const cudnnFilterDescriptor_t      filterDesc,
+                                const cudnnConvolutionDescriptor_t convDesc,
+                                const cudnnTensorDescriptor_t      destDesc,
+                                const int                          requestedAlgoCount,
+                                int                               *returnedAlgoCount,
+                                cudnnConvolutionFwdAlgoPerf_t     *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha1,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionFwdAlgo_t           algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *alpha2,
+                                const cudnnTensorDescriptor_t       zDesc,
+                                const void                         *z,
+                                const cudnnTensorDescriptor_t       biasDesc,
+                                const void                         *bias,
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dbDesc,
+                                void                               *db ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                              int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                const int                           requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                                cudnnHandle_t                        handle,
+                                const cudnnTensorDescriptor_t        xDesc,
+                                const void                          *x,
+                                const cudnnTensorDescriptor_t        dyDesc,
+                                const void                          *y,
+                                const cudnnConvolutionDescriptor_t   convDesc,
+                                const cudnnFilterDescriptor_t        dwDesc,
+                                void                                *dw,
+                                const int                            requestedAlgoCount,
+                                int                                 *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
+                                void                                *workSpace,
+                                size_t                               workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm(
+                                cudnnHandle_t                         handle,
+                                const cudnnTensorDescriptor_t         xDesc,
+                                const cudnnTensorDescriptor_t         dyDesc,
+                                const cudnnConvolutionDescriptor_t    convDesc,
+                                const cudnnFilterDescriptor_t         dwDesc,
+                                cudnnConvolutionBwdFilterPreference_t preference,
+                                size_t                                memoryLimitInBytes,
+                                cudnnConvolutionBwdFilterAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+                                cudnnHandle_t                         handle,
+                                const cudnnTensorDescriptor_t         srcDesc,
+                                const cudnnTensorDescriptor_t         diffDesc,
+                                const cudnnConvolutionDescriptor_t    convDesc,
+                                const cudnnFilterDescriptor_t         gradDesc,
+                                const int                             requestedAlgoCount,
+                                int                                  *returnedAlgoCount,
+                                cudnnConvolutionBwdFilterAlgoPerf_t  *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnFilterDescriptor_t       gradDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdFilterAlgo_t     algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnFilterDescriptor_t       dwDesc,
+                                void                               *dw ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t     handle,
+                                                                            int              *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataPreference_t preference,
+                                size_t                              memoryLimitInBytes,
+                                cudnnConvolutionBwdDataAlgo_t      *algo ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       filterDesc,
+                                const cudnnTensorDescriptor_t       diffDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       gradDesc,
+                                const int                           requestedAlgoCount,
+                                int                                *returnedAlgoCount,
+                                cudnnConvolutionBwdDataAlgoPerf_t  *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData(
+                                cudnnHandle_t                       handle,
+                                const void                         *alpha,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const void                         *w,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                cudnnConvolutionBwdDataAlgo_t       algo,
+                                void                               *workSpace,
+                                size_t                              workSpaceSizeInBytes,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnIm2Col(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const cudnnFilterDescriptor_t       wDesc,
+                                const cudnnConvolutionDescriptor_t  convDesc,
+                                void                               *colBuffer ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnSoftmaxAlgorithm_t             algo,
+                                cudnnSoftmaxMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(
+                                cudnnPoolingDescriptor_t           *poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                cudnnPoolingMode_t                  mode,
+                                cudnnNanPropagation_t               maxpoolingNanOpt,
+                                int                                 windowHeight,
+                                int                                 windowWidth,
+                                int                                 verticalPadding,
+                                int                                 horizontalPadding,
+                                int                                 verticalStride,
+                                int                                 horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *windowHeight,
+                                int                                *windowWidth,
+                                int                                *verticalPadding,
+                                int                                *horizontalPadding,
+                                int                                *verticalStride,
+                                int                                *horizontalStride ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc,
+                                const cudnnPoolingMode_t            mode,
+                                const cudnnNanPropagation_t         maxpoolingNanOpt,
+                                int                                 nbDims,
+                                const int                           windowDimA[],
+                                const int                           paddingA[],
+                                const int                           strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                int                                 nbDimsRequested,
+                                cudnnPoolingMode_t                 *mode,
+                                cudnnNanPropagation_t              *maxpoolingNanOpt,
+                                int                                *nbDims,
+                                int                                 windowDimA[],
+                                int                                 paddingA[],
+                                int                                 strideA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                 nbDims,
+                                int                                 outputTensorDimA[] ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const cudnnTensorDescriptor_t       inputTensorDesc,
+                                int                                *n,
+                                int                                *c,
+                                int                                *h,
+                                int                                *w ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(
+                                cudnnPoolingDescriptor_t            poolingDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingForward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward(
+                                cudnnHandle_t                       handle,
+                                const cudnnPoolingDescriptor_t      poolingDesc,
+                                const void                          *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(
+                                cudnnActivationDescriptor_t        *activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor(
+                                cudnnActivationDescriptor_t         activationDesc,
+                                cudnnActivationMode_t               mode,
+                                cudnnNanPropagation_t               reluNanOpt,
+                                double                              coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(
+                                const cudnnActivationDescriptor_t   activationDesc,
+                                cudnnActivationMode_t              *mode,
+                                cudnnNanPropagation_t              *reluNanOpt,
+                                double*                             coef ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(
+                                cudnnActivationDescriptor_t activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnActivationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnActivationDescriptor_t         activationDesc,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(
+                                cudnnLRNDescriptor_t               *normDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned                            lrnN,
+                                double                              lrnAlpha,
+                                double                              lrnBeta,
+                                double                              lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(
+                                cudnnLRNDescriptor_t                normDesc,
+                                unsigned*                           lrnN,
+                                double*                             lrnAlpha,
+                                double*                             lrnBeta,
+                                double*                             lrnK ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lrnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnLRNMode_t                      lrnMode,
+                                const void*                         alpha,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                const void                         *y,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for means, temp, temp2 */
+                                const void                         *x,
+                                const void                         *means, /* if NULL, means are assumed to be zero */
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnLRNDescriptor_t                normDesc,
+                                cudnnDivNormMode_t                  mode,
+                                const void                         *alpha,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                const void                         *x,
+                                const void                         *means, /* if NULL, means are assumed to be zero */
+                                const void                         *dy,
+                                void                               *temp,
+                                void                               *temp2,
+                                const void                         *beta,
+                                const cudnnTensorDescriptor_t       dXdMeansDesc, /* same desc for dx, dMeans */
+                                void                               *dx, /* output x differential */
+                                void                               *dMeans ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor(
+                                cudnnTensorDescriptor_t             derivedBnDesc,
+                                const cudnnTensorDescriptor_t       xDesc,
+                                cudnnBatchNormMode_t                mode ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(derivedBnDesc, xDesc, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+
+                                const void                         *alpha, /* alpha[0] = result blend factor */
+                                const void                         *beta,  /* beta[0] = dest layer blend factor */
+
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     /* NxCxHxW */
+
+                                /* Shared desc for the next 6 tensors in the argument list.
+                                   Data type to be set as follows:
+                                   type = (typeOf(x) == double) ? double : float
+                                   Dimensions for this descriptor depend on normalization mode
+                                   - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+                                    (normalization is performed across NxHxW)
+                                   - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW 
+                                    (normalization is performed across N) */
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+
+                                /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+
+                                /* MUST use factor=1 in the very first call of a complete training cycle.
+                                   Use a factor=1/(1+n) at N-th call to the function to get
+                                   Cumulative Moving Average (CMA) behavior
+                                   CMA[n] = (x[1]+...+x[n])/n
+                                   Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+                                   ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+                                   CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+                                double                              exponentialAverageFactor,
+
+                                /* Used in Training phase only. 
+                                   runningMean = newMean*factor + runningMean*(1-factor) */
+                                void                               *resultRunningMean,
+                                /* Output in training mode, input in inference. Is the moving average
+                                   of  variance[x] (factor is applied in the same way as for runningMean) */
+                                void                               *resultRunningVariance,
+
+                                /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+                                double                              epsilon,
+
+                                /* Optionally save intermediate results from the forward pass here
+                                   - can be reused to speed up backward pass. NULL if unused */
+                                void                               *resultSaveMean,
+                                void                               *resultSaveInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alpha, /* alpha[0] = result blend factor */
+                                const void                         *beta,  /* beta[0] = dest layer blend factor */
+                                const cudnnTensorDescriptor_t       xDesc,
+                                const void                         *x,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       yDesc,
+                                void                               *y,     /* NxCxHxW */
+                                const cudnnTensorDescriptor_t       bnScaleBiasMeanVarDesc,
+                                const void                         *bnScale,
+                                const void                         *bnBias,
+                                const void                         *estimatedMean,
+                                const void                         *estimatedVariance,
+                                double                              epsilon ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward(
+                                cudnnHandle_t                       handle,
+                                cudnnBatchNormMode_t                mode,
+                                const void                         *alphaDataDiff,
+                                const void                         *betaDataDiff,
+                                const void                         *alphaParamDiff,
+                                const void                         *betaParamDiff,
+                                const cudnnTensorDescriptor_t       xDesc, /* same desc for x, dx, dy */
+                                const void                         *x,
+                                const cudnnTensorDescriptor_t       dyDesc,
+                                const void                         *dy,
+                                const cudnnTensorDescriptor_t       dxDesc,
+                                void                               *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t       dBnScaleBiasDesc,
+                                const void                         *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void                               *dBnScaleResult,
+                                void                               *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double                              epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void                         *savedMean,
+                                const void                         *savedInvVariance ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor(
+                               cudnnSpatialTransformerDescriptor_t        *stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor(
+                                cudnnSpatialTransformerDescriptor_t         stDesc,
+                                cudnnSamplerType_t                          samplerType, 
+                                cudnnDataType_t                             dataType,
+                                const int                                   nbDims,
+                                const int                                   dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor(
+                                 cudnnSpatialTransformerDescriptor_t        stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *theta,
+                                 void                                      *grid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, theta, grid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward(
+                                 cudnnHandle_t                              handle,
+                                 const cudnnSpatialTransformerDescriptor_t  stDesc,
+                                 const void                                *dgrid,
+                                 void                                      *dtheta) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, dgrid, dtheta);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,                                    
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *grid,
+                                 const void                                *beta,
+                                 cudnnTensorDescriptor_t                    yDesc,
+                                 void                                      *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward(
+                                 cudnnHandle_t                              handle,
+                                 cudnnSpatialTransformerDescriptor_t        stDesc,
+                                 const void                                *alpha,
+                                 const cudnnTensorDescriptor_t              xDesc,
+                                 const void                                *x,
+                                 const void                                *beta,
+                                 const cudnnTensorDescriptor_t              dxDesc,
+                                 void                                      *dx,
+                                 const void                                *alphaDgrid,
+                                 const cudnnTensorDescriptor_t              dyDesc,
+                                 const void                                *dy,
+                                 const void                                *grid,
+                                 const void                                *betaDgrid,
+                                 void                                      *dgrid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(xdesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                    cudnnHandle_t            handle,
+                                                    float                    dropout, 
+                                                    void *                   states, 
+                                                    size_t                   stateSizeInBytes, 
+                                                    unsigned long long       seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                        cudnnHandle_t            handle,
+                                                        float                    dropout, 
+                                                        void *                   states, 
+                                                        size_t                   stateSizeInBytes, 
+                                                        unsigned long long       seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, 
+                                                    cudnnHandle_t            handle,
+                                                    float *                  dropout, 
+                                                    void **                  states,
+                                                    unsigned long long *     seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t                  handle, 
+                                              const cudnnDropoutDescriptor_t dropoutDesc,
+                                              const cudnnTensorDescriptor_t  xdesc, 
+                                              const void *                   x,
+                                              const cudnnTensorDescriptor_t  ydesc,
+                                              void *                         y,
+                                              void *                         reserveSpace,
+                                              size_t                         reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t                  handle, 
+                                               const cudnnDropoutDescriptor_t dropoutDesc,
+                                               const cudnnTensorDescriptor_t  dydesc, 
+                                               const void *                   dy,
+                                               const cudnnTensorDescriptor_t  dxdesc,
+                                               void *                         dx,
+                                               void *                         reserveSpace,
+                                               size_t                         reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount(
+                                cudnnHandle_t              handle,
+                          const cudnnRNNDescriptor_t       rnnDesc,
+                                int                        *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( cudnnHandle_t handle,
+                                                    const cudnnRNNDescriptor_t rnnDesc,
+                                                    const int seqLength,
+                                                    const cudnnTensorDescriptor_t * xDesc,
+                                                    const void * x,
+                                                    const cudnnTensorDescriptor_t hxDesc,
+                                                    const void * hx,
+                                                    const cudnnTensorDescriptor_t cxDesc,
+                                                    const void * cx,
+                                                    const cudnnFilterDescriptor_t wDesc,
+                                                    const void * w,
+                                                    const cudnnTensorDescriptor_t *yDesc,
+                                                    void * y,
+                                                    const cudnnTensorDescriptor_t hyDesc,
+                                                    void * hy,
+                                                    const cudnnTensorDescriptor_t cyDesc,
+                                                    void * cy,
+                                                    const float findIntensity,
+                                                    const int requestedAlgoCount,
+                                                    int *returnedAlgoCount,
+                                                    cudnnAlgorithmPerformance_t *perfResults,
+                                                    void * workspace,
+                                                    size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount(
+                                cudnnHandle_t              handle,
+                          const cudnnRNNDescriptor_t       rnnDesc,
+                                int                        *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( cudnnHandle_t handle,
+                                                    const cudnnRNNDescriptor_t rnnDesc,
+                                                    const int seqLength,
+                                                    const cudnnTensorDescriptor_t * xDesc,
+                                                    const void * x,
+                                                    const cudnnTensorDescriptor_t hxDesc,
+                                                    const void * hx,
+                                                    const cudnnTensorDescriptor_t cxDesc,
+                                                    const void * cx,
+                                                    const cudnnFilterDescriptor_t wDesc,
+                                                    const void * w,
+                                                    const cudnnTensorDescriptor_t *yDesc,
+                                                    void * y,
+                                                    const cudnnTensorDescriptor_t hyDesc,
+                                                    void * hy,
+                                                    const cudnnTensorDescriptor_t cyDesc,
+                                                    void * cy,
+                                                    const float findIntensity,
+                                                    const int requestedAlgoCount,
+                                                    int *returnedAlgoCount,
+                                                    cudnnAlgorithmPerformance_t *perfResults,
+                                                    void * workspace,
+                                                    size_t workSpaceSizeInBytes,
+                                                    void * reserveSpace,
+                                                    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount(
+                                cudnnHandle_t              handle,
+                          const cudnnRNNDescriptor_t       rnnDesc,
+                                int                        *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( cudnnHandle_t handle,
+                                                const cudnnRNNDescriptor_t rnnDesc,
+                                                const int seqLength,
+                                                const cudnnTensorDescriptor_t * yDesc,
+                                                const void * y,
+                                                const cudnnTensorDescriptor_t * dyDesc,
+                                                const void * dy,
+                                                const cudnnTensorDescriptor_t dhyDesc,
+                                                const void * dhy,
+                                                const cudnnTensorDescriptor_t dcyDesc,
+                                                const void * dcy,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void * w,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void * hx,
+                                                const cudnnTensorDescriptor_t cxDesc,
+                                                const void * cx,
+                                                const cudnnTensorDescriptor_t * dxDesc,
+                                                void * dx,
+                                                const cudnnTensorDescriptor_t dhxDesc,
+                                                void * dhx,
+                                                const cudnnTensorDescriptor_t dcxDesc,
+                                                void * dcx,
+                                                const float findIntensity,
+                                                const int requestedAlgoCount,
+                                                int *returnedAlgoCount,
+                                                cudnnAlgorithmPerformance_t *perfResults,
+                                                void * workspace,
+                                                size_t workSpaceSizeInBytes,
+                                                void * reserveSpace,
+                                                size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
+                                cudnnHandle_t              handle,
+                          const cudnnRNNDescriptor_t       rnnDesc,
+                                int                        *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( cudnnHandle_t handle,
+                                                   const cudnnRNNDescriptor_t rnnDesc,
+                                                   const int seqLength,
+                                                   const cudnnTensorDescriptor_t * xDesc,
+                                                   const void * x,
+                                                   const cudnnTensorDescriptor_t hxDesc,
+                                                   const void * hx,
+                                                   const cudnnTensorDescriptor_t * yDesc, 
+                                                   const void * y,
+                                                   const float findIntensity,
+                                                   const int requestedAlgoCount,
+                                                   int *returnedAlgoCount,
+                                                   cudnnAlgorithmPerformance_t *perfResults,
+                                                   const void * workspace, 
+                                                   size_t workSpaceSizeInBytes, 
+                                                   const cudnnFilterDescriptor_t dwDesc, 
+                                                   void * dw,
+                                                   const void * reserveSpace, 
+                                                   size_t reserveSpaceSizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t       rnnDesc,
+                                                       const int                  minibatch,
+                                                       const cudnnDataType_t      dataType,
+                                                       cudnnPersistentRNNPlan_t * plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, minibatch, dataType, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                                                    cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t              handle,
+                                                cudnnRNNDescriptor_t       rnnDesc,
+                                                const int                  hiddenSize,
+                                                const int                  numLayers,
+                                                cudnnDropoutDescriptor_t   dropoutDesc, /* Between layers, not between recurrent steps. */
+                                                cudnnRNNInputMode_t        inputMode,          
+                                                cudnnDirectionMode_t       direction,
+                                                cudnnRNNMode_t             mode,
+                                                cudnnRNNAlgo_t             algo,
+                                                cudnnDataType_t            dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNProjectionLayers(cudnnHandle_t        handle,
+                                                cudnnRNNDescriptor_t       rnnDesc,
+                                                const int                  recProjSize,
+                                                const int                  outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(cudnnHandle_t        handle,
+                                                const cudnnRNNDescriptor_t rnnDesc,
+                                                int                        *recProjSize,
+                                                int                        *outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t     handle,
+                                                cudnnRNNDescriptor_t       rnnDesc,
+                                                cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t              handle,
+                                                cudnnRNNDescriptor_t       rnnDesc,
+                                                int *                      hiddenSize, 
+                                                int *                      numLayers, 
+                                                cudnnDropoutDescriptor_t * dropoutDesc,
+                                                cudnnRNNInputMode_t *      inputMode, 
+                                                cudnnDirectionMode_t *     direction, 
+                                                cudnnRNNMode_t *           mode, 
+                                                cudnnRNNAlgo_t *           algo, 
+                                                cudnnDataType_t *          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t* mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t             handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,  
+                                                const int                     seqLength, 
+                                                const cudnnTensorDescriptor_t *xDesc,
+                                                size_t                        *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t       handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,  
+                                                const int                     seqLength,
+                                                const cudnnTensorDescriptor_t *xDesc,
+                                                size_t                        *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize(cudnnHandle_t                 handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,  
+                                                const cudnnTensorDescriptor_t xDesc,
+                                                size_t                        *sizeInBytes,
+                                                cudnnDataType_t               dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t      handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc, 
+                                                const int                     pseudoLayer,
+                                                const cudnnTensorDescriptor_t xDesc,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void                    *w, 
+                                                const int                     linLayerID,  
+                                                cudnnFilterDescriptor_t       linLayerMatDesc,
+                                                void                          **linLayerMat) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t        handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc, 
+                                                const int                     pseudoLayer,
+                                                const cudnnTensorDescriptor_t xDesc, 
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void                    *w,
+                                                const int                     linLayerID,
+                                                cudnnFilterDescriptor_t       linLayerBiasDesc,
+                                                void                          **linLayerBias) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t             handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,
+                                                const int                     seqLength,
+                                                const cudnnTensorDescriptor_t *xDesc,
+                                                const void                    *x,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void                    *hx,
+                                                const cudnnTensorDescriptor_t cxDesc,
+                                                const void                    *cx,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void                    *w,
+                                                const cudnnTensorDescriptor_t *yDesc,
+                                                void                          *y,
+                                                const cudnnTensorDescriptor_t hyDesc,
+                                                void                          *hy,
+                                                const cudnnTensorDescriptor_t cyDesc,
+                                                void                          *cy,
+                                                void                          *workspace,
+                                                size_t                        workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t              handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,
+                                                const int                     seqLength,
+                                                const cudnnTensorDescriptor_t *xDesc,
+                                                const void                    *x,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void                    *hx,
+                                                const cudnnTensorDescriptor_t cxDesc,
+                                                const void                    *cx,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void                    *w,
+                                                const cudnnTensorDescriptor_t *yDesc,
+                                                void                          *y,
+                                                const cudnnTensorDescriptor_t hyDesc,
+                                                void                          *hy,
+                                                const cudnnTensorDescriptor_t cyDesc,
+                                                void                          *cy,
+                                                void                          *workspace,
+                                                size_t                        workSpaceSizeInBytes,
+                                                void *                        reserveSpace,
+                                                size_t                        reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t                 handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,
+                                                const int                     seqLength,
+                                                const cudnnTensorDescriptor_t *yDesc,
+                                                const void                    *y,
+                                                const cudnnTensorDescriptor_t *dyDesc,
+                                                const void                    *dy,
+                                                const cudnnTensorDescriptor_t dhyDesc,
+                                                const void                    *dhy,
+                                                const cudnnTensorDescriptor_t dcyDesc,
+                                                const void                    *dcy,
+                                                const cudnnFilterDescriptor_t wDesc,
+                                                const void                    *w,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void                    *hx,
+                                                const cudnnTensorDescriptor_t cxDesc,
+                                                const void                    *cx,
+                                                const cudnnTensorDescriptor_t *dxDesc,
+                                                void                          *dx,
+                                                const cudnnTensorDescriptor_t dhxDesc,
+                                                void                          *dhx,
+                                                const cudnnTensorDescriptor_t dcxDesc,
+                                                void                          *dcx,
+                                                void                          *workspace,
+                                                size_t                        workSpaceSizeInBytes,
+                                                void *                        reserveSpace,
+                                                size_t                        reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t              handle,
+                                                const cudnnRNNDescriptor_t    rnnDesc,
+                                                const int                     seqLength,
+                                                const cudnnTensorDescriptor_t *xDesc,
+                                                const void                    *x,
+                                                const cudnnTensorDescriptor_t hxDesc,
+                                                const void                    *hx,
+                                                const cudnnTensorDescriptor_t *yDesc, 
+                                                const void                    *y,
+                                                const void                    *workspace, 
+                                                size_t                        workSpaceSizeInBytes, 
+                                                const cudnnFilterDescriptor_t dwDesc, 
+                                                void                          *dw,
+                                                const void                    *reserveSpace, 
+                                                size_t                        reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor(
+                                cudnnCTCLossDescriptor_t         ctcLossDesc,
+                                cudnnDataType_t                  compType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor(
+                                cudnnCTCLossDescriptor_t         ctcLossDesc,
+                                cudnnDataType_t*                 compType ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, 
+                                        const cudnnTensorDescriptor_t probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size)  */
+                                        const void * probs,                          /* probabilities after softmax, in GPU memory */
+                                        const int * labels,                          /* labels, in CPU memory */
+                                        const int * labelLengths,                    /* the length of each label, in CPU memory */
+                                        const int * inputLengths,                    /* the lengths of timing steps in each batch, in CPU memory */
+                                        void * costs,                                /* the returned costs of CTC, in GPU memory */
+                                        const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+                                        const void * gradients,                      /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+                                        cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+                                        cudnnCTCLossDescriptor_t ctcLossDesc,
+                                        void * workspace,                            /* pointer to the workspace, in GPU memory */
+                                        size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize(
+                                cudnnHandle_t                       handle,
+                                const cudnnTensorDescriptor_t       probsDesc,       /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */
+                                const cudnnTensorDescriptor_t       gradientsDesc,   /* Tensor descriptor for gradients, the dimensions are T,N,A. To compute costs only, set it to NULL */
+                                const int                          * labels,         /* labels, in CPU memory */
+                                const int                          * labelLengths,   /* the length of each label, in CPU memory */
+                                const int                          * inputLengths,   /* the lengths of timing steps in each batch, in CPU memory */
+                                cudnnCTCLossAlgo_t                  algo,            /* algorithm selected, supported now 0 and 1 */
+                                cudnnCTCLossDescriptor_t            ctcLossDesc,
+                                size_t                             *sizeInBytes ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmDescriptor(
+                                cudnnAlgorithmDescriptor_t *algoDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor(
+                                cudnnAlgorithmDescriptor_t algoDesc,
+                                cudnnAlgorithm_t algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor(
+                                const cudnnAlgorithmDescriptor_t algoDesc,
+                                cudnnAlgorithm_t* algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor(
+                                const cudnnAlgorithmDescriptor_t src,
+                                cudnnAlgorithmDescriptor_t dest) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(src, dest);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmDescriptor(
+                                cudnnAlgorithmDescriptor_t algoDesc ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance(
+                                cudnnAlgorithmPerformance_t* algoPerf,
+                                int numberToCreate ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToCreate);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance(
+                                cudnnAlgorithmPerformance_t algoPerf,
+                                cudnnAlgorithmDescriptor_t algoDesc,
+                                cudnnStatus_t status,
+                                float time,
+                                size_t memory ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance(
+                                const cudnnAlgorithmPerformance_t algoPerf,
+                                cudnnAlgorithmDescriptor_t* algoDesc,
+                                cudnnStatus_t* status,
+                                float* time,
+                                size_t* memory ) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance(
+                                cudnnAlgorithmPerformance_t* algoPerf,
+                                int numberToDestroy) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToDestroy);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize(
+                                cudnnHandle_t              handle,
+                                cudnnAlgorithmDescriptor_t algoDesc,
+                                size_t*                    algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSaveAlgorithm(
+                                cudnnHandle_t              handle,
+                                cudnnAlgorithmDescriptor_t algoDesc,
+                                void*                      algoSpace,
+                                size_t                     algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm(
+                                cudnnHandle_t              handle,
+                                void*                      algoSpace,
+                                size_t                     algoSpaceSizeInBytes,
+                                cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetCallback(
+                                unsigned            mask,
+                                void                *udata,
+                                cudnnCallback_t     fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnGetCallback(
+                                unsigned            *mask,
+                                void                **udata,
+                                cudnnCallback_t     *fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t         handle,
+                                                cudnnRNNDescriptor_t     rnnDesc,
+                                                const int                hiddenSize,
+                                                const int                numLayers,
+                                                cudnnDropoutDescriptor_t dropoutDesc,
+                                                cudnnRNNInputMode_t      inputMode,          
+                                                cudnnDirectionMode_t     direction,
+                                                cudnnRNNMode_t           mode,
+                                                cudnnRNNAlgo_t           algo,
+                                                cudnnDataType_t          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t  rnnDesc,
+                                                int                      hiddenSize,
+                                                int                      numLayers,
+                                                cudnnDropoutDescriptor_t dropoutDesc,
+                                                cudnnRNNInputMode_t      inputMode,
+                                                cudnnDirectionMode_t     direction,
+                                                cudnnRNNMode_t           mode,
+                                                cudnnDataType_t          dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_7_3.inc b/tensorflow/stream_executor/cuda/cudnn_7_3.inc
new file mode 100644
index 0000000000000000000000000000000000000000..1f8e997ab9d34e47ce9382b5af1ef352086560ed
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_7_3.inc
@@ -0,0 +1,2510 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+size_t CUDNNWINAPI
+cudnnGetVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+size_t CUDNNWINAPI
+cudnnGetCudartVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+const char *CUDNNWINAPI
+cudnnGetErrorString(cudnnStatus_t status) {
+  using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
+  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
+  return func_ptr(status);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rstatus, mode, tag);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreate(cudnnHandle_t *handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroy(cudnnHandle_t handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnTensorFormat_t format,
+                           cudnnDataType_t dataType, /* image data type */
+                           int n,                    /* number of inputs (batch size) */
+                           int c,                    /* number of input feature maps */
+                           int h,                    /* height of input section */
+                           int w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnDataType_t dataType, /* image data type */
+                             int n,                    /* number of inputs (batch size) */
+                             int c,                    /* number of input feature maps */
+                             int h,                    /* height of input section */
+                             int w,                    /* width of input section */
+                             int nStride,
+                             int cStride,
+                             int hStride,
+                             int wStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           int *n,                    /* number of inputs (batch size) */
+                           int *c,                    /* number of input feature maps  */
+                           int *h,                    /* height of input section */
+                           int *w,                    /* width of input section */
+                           int *nStride,
+                           int *cStride,
+                           int *hStride,
+                           int *wStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnTensorFormat_t format,
+                             cudnnDataType_t dataType,
+                             int nbDims,
+                             const int dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int dimA[],
+                           int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, size);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensor(cudnnHandle_t handle,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t yDesc,
+                     void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnAddTensor(cudnnHandle_t handle,
+               const void *alpha,
+               const cudnnTensorDescriptor_t aDesc,
+               const void *A,
+               const void *beta,
+               const cudnnTensorDescriptor_t cDesc,
+               void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t opTensorOp,
+                           cudnnDataType_t opTensorCompType,
+                           cudnnNanPropagation_t opTensorNanOpt) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t *opTensorOp,
+                           cudnnDataType_t *opTensorCompType,
+                           cudnnNanPropagation_t *opTensorNanOpt) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnOpTensor(cudnnHandle_t handle,
+              const cudnnOpTensorDescriptor_t opTensorDesc,
+              const void *alpha1,
+              const cudnnTensorDescriptor_t aDesc,
+              const void *A,
+              const void *alpha2,
+              const cudnnTensorDescriptor_t bDesc,
+              const void *B,
+              const void *beta,
+              const cudnnTensorDescriptor_t cDesc,
+              void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t reduceTensorOp,
+                               cudnnDataType_t reduceTensorCompType,
+                               cudnnNanPropagation_t reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t reduceTensorIndices,
+                               cudnnIndicesType_t reduceTensorIndicesType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t *reduceTensorOp,
+                               cudnnDataType_t *reduceTensorCompType,
+                               cudnnNanPropagation_t *reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t *reduceTensorIndices,
+                               cudnnIndicesType_t *reduceTensorIndicesType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionIndicesSize(cudnnHandle_t handle,
+                             const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                             const cudnnTensorDescriptor_t aDesc,
+                             const cudnnTensorDescriptor_t cDesc,
+                             size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
+                               const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               const cudnnTensorDescriptor_t aDesc,
+                               const cudnnTensorDescriptor_t cDesc,
+                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnReduceTensor(cudnnHandle_t handle,
+                  const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                  void *indices,
+                  size_t indicesSizeInBytes,
+                  void *workspace,
+                  size_t workspaceSizeInBytes,
+                  const void *alpha,
+                  const cudnnTensorDescriptor_t aDesc,
+                  const void *A,
+                  const void *beta,
+                  const cudnnTensorDescriptor_t cDesc,
+                  void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, valuePtr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, alpha);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int k,  /* number of output feature maps */
+                           int c,  /* number of input feature maps */
+                           int h,  /* height of each input filter */
+                           int w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *k,  /* number of output feature maps */
+                           int *c,  /* number of input feature maps */
+                           int *h,  /* height of each input filter */
+                           int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int nbDims,
+                           const int filterDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *nbDims,
+                           int filterDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int pad_h,      /* zero-padding height */
+                                int pad_w,      /* zero-padding width */
+                                int u,          /* vertical filter stride */
+                                int v,          /* horizontal filter stride */
+                                int dilation_h, /* filter dilation in the vertical dimension */
+                                int dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int *pad_h,      /* zero-padding height */
+                                int *pad_w,      /* zero-padding width */
+                                int *u,          /* vertical filter stride */
+                                int *v,          /* horizontal filter stride */
+                                int *dilation_h, /* filter dilation in the vertical dimension */
+                                int *dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int *n,
+                                      int *c,
+                                      int *h,
+                                      int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLength, /* nbDims-2 size */
+                                const int padA[],
+                                const int filterStrideA[],
+                                const int dilationA[],
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLengthRequested,
+                                int *arrayLength,
+                                int padA[],
+                                int strideA[],
+                                int dilationA[],
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int nbDims,
+                                      int tensorOuputDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                     const cudnnTensorDescriptor_t xDesc,
+                                     const cudnnFilterDescriptor_t wDesc,
+                                     const cudnnConvolutionDescriptor_t convDesc,
+                                     const cudnnTensorDescriptor_t yDesc,
+                                     const int requestedAlgoCount,
+                                     int *returnedAlgoCount,
+                                     cudnnConvolutionFwdAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t xDesc,
+                                       const void *x,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t yDesc,
+                                       void *y,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults,
+                                       void *workSpace,
+                                       size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                    const cudnnTensorDescriptor_t xDesc,
+                                    const cudnnFilterDescriptor_t wDesc,
+                                    const cudnnConvolutionDescriptor_t convDesc,
+                                    const cudnnTensorDescriptor_t yDesc,
+                                    cudnnConvolutionFwdPreference_t preference,
+                                    size_t memoryLimitInBytes,
+                                    cudnnConvolutionFwdAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t srcDesc,
+                                       const cudnnFilterDescriptor_t filterDesc,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t destDesc,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const cudnnConvolutionDescriptor_t convDesc,
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        cudnnConvolutionFwdAlgo_t algo,
+                                        size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionForward(cudnnHandle_t handle,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnConvolutionDescriptor_t convDesc,
+                        cudnnConvolutionFwdAlgo_t algo,
+                        void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t yDesc,
+                        void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
+                                      const void *alpha1,
+                                      const cudnnTensorDescriptor_t xDesc,
+                                      const void *x,
+                                      const cudnnFilterDescriptor_t wDesc,
+                                      const void *w,
+                                      const cudnnConvolutionDescriptor_t convDesc,
+                                      cudnnConvolutionFwdAlgo_t algo,
+                                      void *workSpace,
+                                      size_t workSpaceSizeInBytes,
+                                      const void *alpha2,
+                                      const cudnnTensorDescriptor_t zDesc,
+                                      const void *z,
+                                      const cudnnTensorDescriptor_t biasDesc,
+                                      const void *bias,
+                                      const cudnnActivationDescriptor_t activationDesc,
+                                      const cudnnTensorDescriptor_t yDesc,
+                                      void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardBias(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dbDesc,
+                             void *db) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
+                                            const cudnnTensorDescriptor_t xDesc,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnFilterDescriptor_t dwDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
+                                              const cudnnTensorDescriptor_t xDesc,
+                                              const void *x,
+                                              const cudnnTensorDescriptor_t dyDesc,
+                                              const void *y,
+                                              const cudnnConvolutionDescriptor_t convDesc,
+                                              const cudnnFilterDescriptor_t dwDesc,
+                                              void *dw,
+                                              const int requestedAlgoCount,
+                                              int *returnedAlgoCount,
+                                              cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
+                                              void *workSpace,
+                                              size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
+                                           const cudnnTensorDescriptor_t xDesc,
+                                           const cudnnTensorDescriptor_t dyDesc,
+                                           const cudnnConvolutionDescriptor_t convDesc,
+                                           const cudnnFilterDescriptor_t dwDesc,
+                                           cudnnConvolutionBwdFilterPreference_t preference,
+                                           size_t memoryLimitInBytes,
+                                           cudnnConvolutionBwdFilterAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
+                                              const cudnnTensorDescriptor_t srcDesc,
+                                              const cudnnTensorDescriptor_t diffDesc,
+                                              const cudnnConvolutionDescriptor_t convDesc,
+                                              const cudnnFilterDescriptor_t gradDesc,
+                                              const int requestedAlgoCount,
+                                              int *returnedAlgoCount,
+                                              cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
+                                               const cudnnTensorDescriptor_t xDesc,
+                                               const cudnnTensorDescriptor_t dyDesc,
+                                               const cudnnConvolutionDescriptor_t convDesc,
+                                               const cudnnFilterDescriptor_t gradDesc,
+                                               cudnnConvolutionBwdFilterAlgo_t algo,
+                                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
+                               const void *alpha,
+                               const cudnnTensorDescriptor_t xDesc,
+                               const void *x,
+                               const cudnnTensorDescriptor_t dyDesc,
+                               const void *dy,
+                               const cudnnConvolutionDescriptor_t convDesc,
+                               cudnnConvolutionBwdFilterAlgo_t algo,
+                               void *workSpace,
+                               size_t workSpaceSizeInBytes,
+                               const void *beta,
+                               const cudnnFilterDescriptor_t dwDesc,
+                               void *dw) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t wDesc,
+                                          const cudnnTensorDescriptor_t dyDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t dxDesc,
+                                          const int requestedAlgoCount,
+                                          int *returnedAlgoCount,
+                                          cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t wDesc,
+                                            const void *w,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const void *dy,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t dxDesc,
+                                            void *dx,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
+                                            void *workSpace,
+                                            size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                         const cudnnFilterDescriptor_t wDesc,
+                                         const cudnnTensorDescriptor_t dyDesc,
+                                         const cudnnConvolutionDescriptor_t convDesc,
+                                         const cudnnTensorDescriptor_t dxDesc,
+                                         cudnnConvolutionBwdDataPreference_t preference,
+                                         size_t memoryLimitInBytes,
+                                         cudnnConvolutionBwdDataAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t filterDesc,
+                                            const cudnnTensorDescriptor_t diffDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t gradDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
+                                             const cudnnFilterDescriptor_t wDesc,
+                                             const cudnnTensorDescriptor_t dyDesc,
+                                             const cudnnConvolutionDescriptor_t convDesc,
+                                             const cudnnTensorDescriptor_t dxDesc,
+                                             cudnnConvolutionBwdDataAlgo_t algo,
+                                             size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardData(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnFilterDescriptor_t wDesc,
+                             const void *w,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnConvolutionDescriptor_t convDesc,
+                             cudnnConvolutionBwdDataAlgo_t algo,
+                             void *workSpace,
+                             size_t workSpaceSizeInBytes,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnIm2Col(cudnnHandle_t handle,
+            const cudnnTensorDescriptor_t xDesc,
+            const void *x,
+            const cudnnFilterDescriptor_t wDesc,
+            const cudnnConvolutionDescriptor_t convDesc,
+            void *colBuffer) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxForward(cudnnHandle_t handle,
+                    cudnnSoftmaxAlgorithm_t algo,
+                    cudnnSoftmaxMode_t mode,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxBackward(cudnnHandle_t handle,
+                     cudnnSoftmaxAlgorithm_t algo,
+                     cudnnSoftmaxMode_t mode,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t mode,
+                            cudnnNanPropagation_t maxpoolingNanOpt,
+                            int windowHeight,
+                            int windowWidth,
+                            int verticalPadding,
+                            int horizontalPadding,
+                            int verticalStride,
+                            int horizontalStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *windowHeight,
+                            int *windowWidth,
+                            int *verticalPadding,
+                            int *horizontalPadding,
+                            int *verticalStride,
+                            int *horizontalStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            const cudnnPoolingMode_t mode,
+                            const cudnnNanPropagation_t maxpoolingNanOpt,
+                            int nbDims,
+                            const int windowDimA[],
+                            const int paddingA[],
+                            const int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            int nbDimsRequested,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *nbDims,
+                            int windowDimA[],
+                            int paddingA[],
+                            int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int nbDims,
+                                  int outputTensorDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int *n,
+                                  int *c,
+                                  int *h,
+                                  int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingForward(cudnnHandle_t handle,
+                    const cudnnPoolingDescriptor_t poolingDesc,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingBackward(cudnnHandle_t handle,
+                     const cudnnPoolingDescriptor_t poolingDesc,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t mode,
+                             cudnnNanPropagation_t reluNanOpt,
+                             double coef) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t *mode,
+                             cudnnNanPropagation_t *reluNanOpt,
+                             double *coef) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationForward(cudnnHandle_t handle,
+                       cudnnActivationDescriptor_t activationDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t xDesc,
+                       const void *x,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t yDesc,
+                       void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationBackward(cudnnHandle_t handle,
+                        cudnnActivationDescriptor_t activationDesc,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t yDesc,
+                        const void *y,
+                        const cudnnTensorDescriptor_t dyDesc,
+                        const void *dy,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t dxDesc,
+                        void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lrnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelForward(cudnnHandle_t handle,
+                            cudnnLRNDescriptor_t normDesc,
+                            cudnnLRNMode_t lrnMode,
+                            const void *alpha,
+                            const cudnnTensorDescriptor_t xDesc,
+                            const void *x,
+                            const void *beta,
+                            const cudnnTensorDescriptor_t yDesc,
+                            void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
+                             cudnnLRNDescriptor_t normDesc,
+                             cudnnLRNMode_t lrnMode,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t yDesc,
+                             const void *y,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
+                                  cudnnLRNDescriptor_t normDesc,
+                                  cudnnDivNormMode_t mode,
+                                  const void *alpha,
+                                  const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
+                                  const void *x,
+                                  const void *means, /* if NULL, means are assumed to be zero */
+                                  void *temp,
+                                  void *temp2,
+                                  const void *beta,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
+                                   cudnnLRNDescriptor_t normDesc,
+                                   cudnnDivNormMode_t mode,
+                                   const void *alpha,
+                                   const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                   const void *x,
+                                   const void *means, /* if NULL, means are assumed to be zero */
+                                   const void *dy,
+                                   void *temp,
+                                   void *temp2,
+                                   const void *beta,
+                                   const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
+                                   void *dx,                                   /* output x differential */
+                                   void *dMeans) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
+                              const cudnnTensorDescriptor_t xDesc,
+                              cudnnBatchNormMode_t mode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(derivedBnDesc, xDesc, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTraining(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *x, /* NxCxHxW */
+    const cudnnTensorDescriptor_t yDesc,
+    void *y, /* NxCxHxW */
+
+    /* Shared desc for the next 6 tensors in the argument list.
+       Data type to be set as follows:
+       type = (typeOf(x) == double) ? double : float
+       Dimensions for this descriptor depend on normalization mode
+       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+        (normalization is performed across NxHxW)
+       - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
+        (normalization is performed across N) */
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+
+    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+    const void *bnScale,
+    const void *bnBias,
+
+    /* MUST use factor=1 in the very first call of a complete training cycle.
+       Use a factor=1/(1+n) at N-th call to the function to get
+       Cumulative Moving Average (CMA) behavior
+       CMA[n] = (x[1]+...+x[n])/n
+       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+    double exponentialAverageFactor,
+
+    /* Used in Training phase only.
+       runningMean = newMean*factor + runningMean*(1-factor) */
+    void *resultRunningMean,
+    /* Output in training mode, input in inference. Is the moving average
+       of  variance[x] (factor is applied in the same way as for runningMean) */
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
+                                        cudnnBatchNormMode_t mode,
+                                        const void *alpha, /* alpha[0] = result blend factor */
+                                        const void *beta,  /* beta[0] = dest layer blend factor */
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const void *x, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        void *y, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                        const void *bnScale,
+                                        const void *bnBias,
+                                        const void *estimatedMean,
+                                        const void *estimatedVariance,
+                                        double epsilon) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackward(cudnnHandle_t handle,
+                                cudnnBatchNormMode_t mode,
+                                const void *alphaDataDiff,
+                                const void *betaDataDiff,
+                                const void *alphaParamDiff,
+                                const void *betaParamDiff,
+                                const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
+                                const void *x,
+                                const cudnnTensorDescriptor_t dyDesc,
+                                const void *dy,
+                                const cudnnTensorDescriptor_t dxDesc,
+                                void *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                const void *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void *dBnScaleResult,
+                                void *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void *savedMean,
+                                const void *savedInvVariance) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
+                                       cudnnSamplerType_t samplerType,
+                                       cudnnDataType_t dataType,
+                                       const int nbDims,
+                                       const int dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
+                                   const cudnnSpatialTransformerDescriptor_t stDesc,
+                                   const void *theta,
+                                   void *grid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, theta, grid);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
+                                    const cudnnSpatialTransformerDescriptor_t stDesc,
+                                    const void *dgrid,
+                                    void *dtheta) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, dgrid, dtheta);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
+                             cudnnSpatialTransformerDescriptor_t stDesc,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *grid,
+                             const void *beta,
+                             cudnnTensorDescriptor_t yDesc,
+                             void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
+                              cudnnSpatialTransformerDescriptor_t stDesc,
+                              const void *alpha,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const void *x,
+                              const void *beta,
+                              const cudnnTensorDescriptor_t dxDesc,
+                              void *dx,
+                              const void *alphaDgrid,
+                              const cudnnTensorDescriptor_t dyDesc,
+                              const void *dy,
+                              const void *grid,
+                              const void *betaDgrid,
+                              void *dgrid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(xdesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float dropout,
+                          void *states,
+                          size_t stateSizeInBytes,
+                          unsigned long long seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                              cudnnHandle_t handle,
+                              float dropout,
+                              void *states,
+                              size_t stateSizeInBytes,
+                              unsigned long long seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float *dropout,
+                          void **states,
+                          unsigned long long *seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutForward(cudnnHandle_t handle,
+                    const cudnnDropoutDescriptor_t dropoutDesc,
+                    const cudnnTensorDescriptor_t xdesc,
+                    const void *x,
+                    const cudnnTensorDescriptor_t ydesc,
+                    void *y,
+                    void *reserveSpace,
+                    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutBackward(cudnnHandle_t handle,
+                     const cudnnDropoutDescriptor_t dropoutDesc,
+                     const cudnnTensorDescriptor_t dydesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dxdesc,
+                     void *dx,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
+                                        const cudnnRNNDescriptor_t rnnDesc,
+                                        const int seqLength,
+                                        const cudnnTensorDescriptor_t *xDesc,
+                                        const void *x,
+                                        const cudnnTensorDescriptor_t hxDesc,
+                                        const void *hx,
+                                        const cudnnTensorDescriptor_t cxDesc,
+                                        const void *cx,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const void *w,
+                                        const cudnnTensorDescriptor_t *yDesc,
+                                        void *y,
+                                        const cudnnTensorDescriptor_t hyDesc,
+                                        void *hy,
+                                        const cudnnTensorDescriptor_t cyDesc,
+                                        void *cy,
+                                        const float findIntensity,
+                                        const int requestedAlgoCount,
+                                        int *returnedAlgoCount,
+                                        cudnnAlgorithmPerformance_t *perfResults,
+                                        void *workspace,
+                                        size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t cxDesc,
+                                       const void *cx,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       void *y,
+                                       const cudnnTensorDescriptor_t hyDesc,
+                                       void *hy,
+                                       const cudnnTensorDescriptor_t cyDesc,
+                                       void *cy,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                    const cudnnRNNDescriptor_t rnnDesc,
+                                    const int seqLength,
+                                    const cudnnTensorDescriptor_t *yDesc,
+                                    const void *y,
+                                    const cudnnTensorDescriptor_t *dyDesc,
+                                    const void *dy,
+                                    const cudnnTensorDescriptor_t dhyDesc,
+                                    const void *dhy,
+                                    const cudnnTensorDescriptor_t dcyDesc,
+                                    const void *dcy,
+                                    const cudnnFilterDescriptor_t wDesc,
+                                    const void *w,
+                                    const cudnnTensorDescriptor_t hxDesc,
+                                    const void *hx,
+                                    const cudnnTensorDescriptor_t cxDesc,
+                                    const void *cx,
+                                    const cudnnTensorDescriptor_t *dxDesc,
+                                    void *dx,
+                                    const cudnnTensorDescriptor_t dhxDesc,
+                                    void *dhx,
+                                    const cudnnTensorDescriptor_t dcxDesc,
+                                    void *dcx,
+                                    const float findIntensity,
+                                    const int requestedAlgoCount,
+                                    int *returnedAlgoCount,
+                                    cudnnAlgorithmPerformance_t *perfResults,
+                                    void *workspace,
+                                    size_t workSpaceSizeInBytes,
+                                    void *reserveSpace,
+                                    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       const void *y,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       const void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       const cudnnFilterDescriptor_t dwDesc,
+                                       void *dw,
+                                       const void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                             const int minibatch,
+                             const cudnnDataType_t dataType,
+                             cudnnPersistentRNNPlan_t *plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, minibatch, dataType, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor(cudnnHandle_t handle,
+                      cudnnRNNDescriptor_t rnnDesc,
+                      const int hiddenSize,
+                      const int numLayers,
+                      cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */
+                      cudnnRNNInputMode_t inputMode,
+                      cudnnDirectionMode_t direction,
+                      cudnnRNNMode_t mode,
+                      cudnnRNNAlgo_t algo,
+                      cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
+                            cudnnRNNDescriptor_t rnnDesc,
+                            const int recProjSize,
+                            const int outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
+                            const cudnnRNNDescriptor_t rnnDesc,
+                            int *recProjSize,
+                            int *outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDescriptor(cudnnHandle_t handle,
+                      cudnnRNNDescriptor_t rnnDesc,
+                      int *hiddenSize,
+                      int *numLayers,
+                      cudnnDropoutDescriptor_t *dropoutDesc,
+                      cudnnRNNInputMode_t *inputMode,
+                      cudnnDirectionMode_t *direction,
+                      cudnnRNNMode_t *mode,
+                      cudnnRNNAlgo_t *algo,
+                      cudnnDataType_t *dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
+                               const cudnnRNNDescriptor_t rnnDesc,
+                               const int seqLength,
+                               const cudnnTensorDescriptor_t *xDesc,
+                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNParamsSize(cudnnHandle_t handle,
+                      const cudnnRNNDescriptor_t rnnDesc,
+                      const cudnnTensorDescriptor_t xDesc,
+                      size_t *sizeInBytes,
+                      cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
+                                const cudnnRNNDescriptor_t rnnDesc,
+                                const int pseudoLayer,
+                                const cudnnTensorDescriptor_t xDesc,
+                                const cudnnFilterDescriptor_t wDesc,
+                                const void *w,
+                                const int linLayerID,
+                                cudnnFilterDescriptor_t linLayerMatDesc,
+                                void **linLayerMat) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
+                              const cudnnRNNDescriptor_t rnnDesc,
+                              const int pseudoLayer,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const cudnnFilterDescriptor_t wDesc,
+                              const void *w,
+                              const int linLayerID,
+                              cudnnFilterDescriptor_t linLayerBiasDesc,
+                              void **linLayerBias) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInference(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         const void *x,
+                         const cudnnTensorDescriptor_t hxDesc,
+                         const void *hx,
+                         const cudnnTensorDescriptor_t cxDesc,
+                         const void *cx,
+                         const cudnnFilterDescriptor_t wDesc,
+                         const void *w,
+                         const cudnnTensorDescriptor_t *yDesc,
+                         void *y,
+                         const cudnnTensorDescriptor_t hyDesc,
+                         void *hy,
+                         const cudnnTensorDescriptor_t cyDesc,
+                         void *cy,
+                         void *workspace,
+                         size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTraining(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t cxDesc,
+                        const void *cx,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        void *y,
+                        const cudnnTensorDescriptor_t hyDesc,
+                        void *hy,
+                        const cudnnTensorDescriptor_t cyDesc,
+                        void *cy,
+                        void *workspace,
+                        size_t workSpaceSizeInBytes,
+                        void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardData(cudnnHandle_t handle,
+                     const cudnnRNNDescriptor_t rnnDesc,
+                     const int seqLength,
+                     const cudnnTensorDescriptor_t *yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t *dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dhyDesc,
+                     const void *dhy,
+                     const cudnnTensorDescriptor_t dcyDesc,
+                     const void *dcy,
+                     const cudnnFilterDescriptor_t wDesc,
+                     const void *w,
+                     const cudnnTensorDescriptor_t hxDesc,
+                     const void *hx,
+                     const cudnnTensorDescriptor_t cxDesc,
+                     const void *cx,
+                     const cudnnTensorDescriptor_t *dxDesc,
+                     void *dx,
+                     const cudnnTensorDescriptor_t dhxDesc,
+                     void *dhx,
+                     const cudnnTensorDescriptor_t dcxDesc,
+                     void *dcx,
+                     void *workspace,
+                     size_t workSpaceSizeInBytes,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeights(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        const void *y,
+                        const void *workspace,
+                        size_t workSpaceSizeInBytes,
+                        const cudnnFilterDescriptor_t dwDesc,
+                        void *dw,
+                        const void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int *labels, /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    const void *gradients,   /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    void *workspace,              /* pointer to the workspace, in GPU memory */
+    size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    const int *labels,                           /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(src, dest);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToCreate);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t algoDesc,
+                             cudnnStatus_t status,
+                             float time,
+                             size_t memory) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t *algoDesc,
+                             cudnnStatus_t *status,
+                             float *time,
+                             size_t *memory) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToDestroy);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSaveAlgorithm(cudnnHandle_t handle,
+                   cudnnAlgorithmDescriptor_t algoDesc,
+                   void *algoSpace,
+                   size_t algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreAlgorithm(cudnnHandle_t handle,
+                      void *algoSpace,
+                      size_t algoSpaceSizeInBytes,
+                      cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t clipMode,
+                cudnnNanPropagation_t clipNanOpt,
+                double lclip,
+                double rclip) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t *clipMode,
+                cudnnNanPropagation_t *clipNanOpt,
+                double *lclip,
+                double *rclip) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, paddingMode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, paddingMode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc,
+                          cudnnDataType_t dataType,
+                          cudnnRNNDataLayout_t layout,
+                          int maxSeqLength,
+                          int batchSize,
+                          int vectorSize,
+                          const int seqLengthArray[], /* length of each sequence in the batch */
+                          void *paddingFill) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc,
+                          cudnnDataType_t *dataType,
+                          cudnnRNNDataLayout_t *layout,
+                          int *maxSeqLength,
+                          int *batchSize,
+                          int *vectorSize,
+                          int arrayLengthRequested,
+                          int seqLengthArray[],
+                          void *paddingFill) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnTensorDescriptor_t cxDesc,
+                          const void *cx,
+                          const cudnnFilterDescriptor_t wDesc,
+                          const void *w,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          void *y,
+                          const cudnnTensorDescriptor_t hyDesc,
+                          void *hy,
+                          const cudnnTensorDescriptor_t cyDesc,
+                          void *cy,
+                          const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                          const void *keys,                     /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                          void *cAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                          void *iAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                          void *queries,                        /* reserved, should pass NULL */
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
+                           const cudnnRNNDescriptor_t rnnDesc,
+                           const cudnnRNNDataDescriptor_t xDesc,
+                           const void *x,
+                           const cudnnTensorDescriptor_t hxDesc,
+                           const void *hx,
+                           const cudnnTensorDescriptor_t cxDesc,
+                           const void *cx,
+                           const cudnnFilterDescriptor_t wDesc,
+                           const void *w,
+                           const cudnnRNNDataDescriptor_t yDesc,
+                           void *y,
+                           const cudnnTensorDescriptor_t hyDesc,
+                           void *hy,
+                           const cudnnTensorDescriptor_t cyDesc,
+                           void *cy,
+                           const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                           const void *keys,                     /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                           void *cAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                           void *iAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                           void *queries,                        /* reserved, should pass NULL */
+                           void *workSpace,
+                           size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardDataEx(cudnnHandle_t handle,
+                       const cudnnRNNDescriptor_t rnnDesc,
+                       const cudnnRNNDataDescriptor_t yDesc,
+                       const void *y,
+                       const cudnnRNNDataDescriptor_t dyDesc,
+                       const void *dy,
+                       const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
+                       const void *dcAttn,                    /* reserved, should pass NULL */
+                       const cudnnTensorDescriptor_t dhyDesc,
+                       const void *dhy,
+                       const cudnnTensorDescriptor_t dcyDesc,
+                       const void *dcy,
+                       const cudnnFilterDescriptor_t wDesc,
+                       const void *w,
+                       const cudnnTensorDescriptor_t hxDesc,
+                       const void *hx,
+                       const cudnnTensorDescriptor_t cxDesc,
+                       const void *cx,
+                       const cudnnRNNDataDescriptor_t dxDesc,
+                       void *dx,
+                       const cudnnTensorDescriptor_t dhxDesc,
+                       void *dhx,
+                       const cudnnTensorDescriptor_t dcxDesc,
+                       void *dcx,
+                       const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
+                       void *dkeys,                           /* reserved, should pass NULL */
+                       void *workSpace,
+                       size_t workSpaceSizeInBytes,
+                       void *reserveSpace,
+                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          const void *y,
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          const cudnnFilterDescriptor_t dwDesc,
+                          void *dw,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         const int hiddenSize,
+                         const int numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDirectionMode_t direction,
+                         cudnnRNNMode_t mode,
+                         cudnnRNNAlgo_t algo,
+                         cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc,
+                         int hiddenSize,
+                         int numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDirectionMode_t direction,
+                         cudnnRNNMode_t mode,
+                         cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_7_4.inc b/tensorflow/stream_executor/cuda/cudnn_7_4.inc
new file mode 100644
index 0000000000000000000000000000000000000000..cd35c1fbb7494f5fdc457fff03b768372e0aaf57
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_7_4.inc
@@ -0,0 +1,2643 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+size_t CUDNNWINAPI
+cudnnGetVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+size_t CUDNNWINAPI
+cudnnGetCudartVersion(void) {
+  using FuncPtr = size_t (CUDNNWINAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCudartVersion");
+  if (!func_ptr) return 0;
+  return func_ptr();
+}
+
+const char *CUDNNWINAPI
+cudnnGetErrorString(cudnnStatus_t status) {
+  using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetErrorString");
+  if (!func_ptr) return "cudnnGetErrorString symbol not found.";
+  return func_ptr(status);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnQueryRuntimeError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rstatus, mode, tag);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreate(cudnnHandle_t *handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroy(cudnnHandle_t handle) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, streamId);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnTensorFormat_t format,
+                           cudnnDataType_t dataType, /* image data type */
+                           int n,                    /* number of inputs (batch size) */
+                           int c,                    /* number of input feature maps */
+                           int h,                    /* height of input section */
+                           int w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnDataType_t dataType, /* image data type */
+                             int n,                    /* number of inputs (batch size) */
+                             int c,                    /* number of input feature maps */
+                             int h,                    /* height of input section */
+                             int w,                    /* width of input section */
+                             int nStride,
+                             int cStride,
+                             int hStride,
+                             int wStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor4dDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           int *n,                    /* number of inputs (batch size) */
+                           int *c,                    /* number of input feature maps  */
+                           int *h,                    /* height of input section */
+                           int *w,                    /* width of input section */
+                           int *nStride,
+                           int *cStride,
+                           int *hStride,
+                           int *wStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensor4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnTensorFormat_t format,
+                             cudnnDataType_t dataType,
+                             int nbDims,
+                             const int dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensorNdDescriptorEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, format, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int dimA[],
+                           int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetTensorSizeInBytes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc, size);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(tensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensor(cudnnHandle_t handle,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t yDesc,
+                     void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnTransformTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnAddTensor(cudnnHandle_t handle,
+               const void *alpha,
+               const cudnnTensorDescriptor_t aDesc,
+               const void *A,
+               const void *beta,
+               const cudnnTensorDescriptor_t cDesc,
+               void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnAddTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t opTensorOp,
+                           cudnnDataType_t opTensorCompType,
+                           cudnnNanPropagation_t opTensorNanOpt) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t *opTensorOp,
+                           cudnnDataType_t *opTensorCompType,
+                           cudnnNanPropagation_t *opTensorNanOpt) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyOpTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(opTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnOpTensor(cudnnHandle_t handle,
+              const cudnnOpTensorDescriptor_t opTensorDesc,
+              const void *alpha1,
+              const cudnnTensorDescriptor_t aDesc,
+              const void *A,
+              const void *alpha2,
+              const cudnnTensorDescriptor_t bDesc,
+              const void *B,
+              const void *beta,
+              const cudnnTensorDescriptor_t cDesc,
+              void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnOpTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t reduceTensorOp,
+                               cudnnDataType_t reduceTensorCompType,
+                               cudnnNanPropagation_t reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t reduceTensorIndices,
+                               cudnnIndicesType_t reduceTensorIndicesType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t *reduceTensorOp,
+                               cudnnDataType_t *reduceTensorCompType,
+                               cudnnNanPropagation_t *reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t *reduceTensorIndices,
+                               cudnnIndicesType_t *reduceTensorIndicesType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyReduceTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(reduceTensorDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionIndicesSize(cudnnHandle_t handle,
+                             const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                             const cudnnTensorDescriptor_t aDesc,
+                             const cudnnTensorDescriptor_t cDesc,
+                             size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionIndicesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
+                               const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               const cudnnTensorDescriptor_t aDesc,
+                               const cudnnTensorDescriptor_t cDesc,
+                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetReductionWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnReduceTensor(cudnnHandle_t handle,
+                  const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                  void *indices,
+                  size_t indicesSizeInBytes,
+                  void *workspace,
+                  size_t workspaceSizeInBytes,
+                  const void *alpha,
+                  const cudnnTensorDescriptor_t aDesc,
+                  const void *A,
+                  const void *beta,
+                  const cudnnTensorDescriptor_t cDesc,
+                  void *C) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnReduceTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, valuePtr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnScaleTensor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, yDesc, y, alpha);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int k,  /* number of output feature maps */
+                           int c,  /* number of input feature maps */
+                           int h,  /* height of each input filter */
+                           int w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *k,  /* number of output feature maps */
+                           int *c,  /* number of input feature maps */
+                           int *h,  /* height of each input filter */
+                           int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilter4dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, k, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int nbDims,
+                           const int filterDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *nbDims,
+                           int filterDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetFilterNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyFilterDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(filterDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, mathType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionGroupCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, groupCount);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int pad_h,      /* zero-padding height */
+                                int pad_w,      /* zero-padding width */
+                                int u,          /* vertical filter stride */
+                                int v,          /* horizontal filter stride */
+                                int dilation_h, /* filter dilation in the vertical dimension */
+                                int dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int *pad_h,      /* zero-padding height */
+                                int *pad_w,      /* zero-padding width */
+                                int *u,          /* vertical filter stride */
+                                int *v,          /* horizontal filter stride */
+                                int *dilation_h, /* filter dilation in the vertical dimension */
+                                int *dilation_w, /* filter dilation in the horizontal dimension */
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int *n,
+                                      int *c,
+                                      int *h,
+                                      int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolution2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLength, /* nbDims-2 size */
+                                const int padA[],
+                                const int filterStrideA[],
+                                const int dilationA[],
+                                cudnnConvolutionMode_t mode,
+                                cudnnDataType_t computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
+                                int arrayLengthRequested,
+                                int *arrayLength,
+                                int padA[],
+                                int strideA[],
+                                int dilationA[],
+                                cudnnConvolutionMode_t *mode,
+                                cudnnDataType_t *computeType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
+                                      const cudnnTensorDescriptor_t inputTensorDesc,
+                                      const cudnnFilterDescriptor_t filterDesc,
+                                      int nbDims,
+                                      int tensorOuputDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyConvolutionDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(convDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                     const cudnnTensorDescriptor_t xDesc,
+                                     const cudnnFilterDescriptor_t wDesc,
+                                     const cudnnConvolutionDescriptor_t convDesc,
+                                     const cudnnTensorDescriptor_t yDesc,
+                                     const int requestedAlgoCount,
+                                     int *returnedAlgoCount,
+                                     cudnnConvolutionFwdAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t xDesc,
+                                       const void *x,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t yDesc,
+                                       void *y,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults,
+                                       void *workSpace,
+                                       size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionForwardAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle,
+                                    const cudnnTensorDescriptor_t xDesc,
+                                    const cudnnFilterDescriptor_t wDesc,
+                                    const cudnnConvolutionDescriptor_t convDesc,
+                                    const cudnnTensorDescriptor_t yDesc,
+                                    cudnnConvolutionFwdPreference_t preference,
+                                    size_t memoryLimitInBytes,
+                                    cudnnConvolutionFwdAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
+                                       const cudnnTensorDescriptor_t srcDesc,
+                                       const cudnnFilterDescriptor_t filterDesc,
+                                       const cudnnConvolutionDescriptor_t convDesc,
+                                       const cudnnTensorDescriptor_t destDesc,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnConvolutionFwdAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const cudnnConvolutionDescriptor_t convDesc,
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        cudnnConvolutionFwdAlgo_t algo,
+                                        size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionForwardWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionForward(cudnnHandle_t handle,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnConvolutionDescriptor_t convDesc,
+                        cudnnConvolutionFwdAlgo_t algo,
+                        void *workSpace,
+                        size_t workSpaceSizeInBytes,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t yDesc,
+                        void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
+                                      const void *alpha1,
+                                      const cudnnTensorDescriptor_t xDesc,
+                                      const void *x,
+                                      const cudnnFilterDescriptor_t wDesc,
+                                      const void *w,
+                                      const cudnnConvolutionDescriptor_t convDesc,
+                                      cudnnConvolutionFwdAlgo_t algo,
+                                      void *workSpace,
+                                      size_t workSpaceSizeInBytes,
+                                      const void *alpha2,
+                                      const cudnnTensorDescriptor_t zDesc,
+                                      const void *z,
+                                      const cudnnTensorDescriptor_t biasDesc,
+                                      const void *bias,
+                                      const cudnnActivationDescriptor_t activationDesc,
+                                      const cudnnTensorDescriptor_t yDesc,
+                                      void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBiasActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardBias(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dbDesc,
+                             void *db) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardBias");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
+                                            const cudnnTensorDescriptor_t xDesc,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnFilterDescriptor_t dwDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
+                                              const cudnnTensorDescriptor_t xDesc,
+                                              const void *x,
+                                              const cudnnTensorDescriptor_t dyDesc,
+                                              const void *y,
+                                              const cudnnConvolutionDescriptor_t convDesc,
+                                              const cudnnFilterDescriptor_t dwDesc,
+                                              void *dw,
+                                              const int requestedAlgoCount,
+                                              int *returnedAlgoCount,
+                                              cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
+                                              void *workSpace,
+                                              size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardFilterAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
+                                           const cudnnTensorDescriptor_t xDesc,
+                                           const cudnnTensorDescriptor_t dyDesc,
+                                           const cudnnConvolutionDescriptor_t convDesc,
+                                           const cudnnFilterDescriptor_t dwDesc,
+                                           cudnnConvolutionBwdFilterPreference_t preference,
+                                           size_t memoryLimitInBytes,
+                                           cudnnConvolutionBwdFilterAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
+                                              const cudnnTensorDescriptor_t srcDesc,
+                                              const cudnnTensorDescriptor_t diffDesc,
+                                              const cudnnConvolutionDescriptor_t convDesc,
+                                              const cudnnFilterDescriptor_t gradDesc,
+                                              const int requestedAlgoCount,
+                                              int *returnedAlgoCount,
+                                              cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
+                                               const cudnnTensorDescriptor_t xDesc,
+                                               const cudnnTensorDescriptor_t dyDesc,
+                                               const cudnnConvolutionDescriptor_t convDesc,
+                                               const cudnnFilterDescriptor_t gradDesc,
+                                               cudnnConvolutionBwdFilterAlgo_t algo,
+                                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardFilterWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
+                               const void *alpha,
+                               const cudnnTensorDescriptor_t xDesc,
+                               const void *x,
+                               const cudnnTensorDescriptor_t dyDesc,
+                               const void *dy,
+                               const cudnnConvolutionDescriptor_t convDesc,
+                               cudnnConvolutionBwdFilterAlgo_t algo,
+                               void *workSpace,
+                               size_t workSpaceSizeInBytes,
+                               const void *beta,
+                               const cudnnFilterDescriptor_t dwDesc,
+                               void *dw) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardFilter");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                          const cudnnFilterDescriptor_t wDesc,
+                                          const cudnnTensorDescriptor_t dyDesc,
+                                          const cudnnConvolutionDescriptor_t convDesc,
+                                          const cudnnTensorDescriptor_t dxDesc,
+                                          const int requestedAlgoCount,
+                                          int *returnedAlgoCount,
+                                          cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t wDesc,
+                                            const void *w,
+                                            const cudnnTensorDescriptor_t dyDesc,
+                                            const void *dy,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t dxDesc,
+                                            void *dx,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
+                                            void *workSpace,
+                                            size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindConvolutionBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
+                                         const cudnnFilterDescriptor_t wDesc,
+                                         const cudnnTensorDescriptor_t dyDesc,
+                                         const cudnnConvolutionDescriptor_t convDesc,
+                                         const cudnnTensorDescriptor_t dxDesc,
+                                         cudnnConvolutionBwdDataPreference_t preference,
+                                         size_t memoryLimitInBytes,
+                                         cudnnConvolutionBwdDataAlgo_t *algo) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
+                                            const cudnnFilterDescriptor_t filterDesc,
+                                            const cudnnTensorDescriptor_t diffDesc,
+                                            const cudnnConvolutionDescriptor_t convDesc,
+                                            const cudnnTensorDescriptor_t gradDesc,
+                                            const int requestedAlgoCount,
+                                            int *returnedAlgoCount,
+                                            cudnnConvolutionBwdDataAlgoPerf_t *perfResults) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataAlgorithm_v7");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
+                                             const cudnnFilterDescriptor_t wDesc,
+                                             const cudnnTensorDescriptor_t dyDesc,
+                                             const cudnnConvolutionDescriptor_t convDesc,
+                                             const cudnnTensorDescriptor_t dxDesc,
+                                             cudnnConvolutionBwdDataAlgo_t algo,
+                                             size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetConvolutionBackwardDataWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnConvolutionBackwardData(cudnnHandle_t handle,
+                             const void *alpha,
+                             const cudnnFilterDescriptor_t wDesc,
+                             const void *w,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnConvolutionDescriptor_t convDesc,
+                             cudnnConvolutionBwdDataAlgo_t algo,
+                             void *workSpace,
+                             size_t workSpaceSizeInBytes,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnConvolutionBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnIm2Col(cudnnHandle_t handle,
+            const cudnnTensorDescriptor_t xDesc,
+            const void *x,
+            const cudnnFilterDescriptor_t wDesc,
+            const cudnnConvolutionDescriptor_t convDesc,
+            void *colBuffer) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnIm2Col");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxForward(cudnnHandle_t handle,
+                    cudnnSoftmaxAlgorithm_t algo,
+                    cudnnSoftmaxMode_t mode,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxBackward(cudnnHandle_t handle,
+                     cudnnSoftmaxAlgorithm_t algo,
+                     cudnnSoftmaxMode_t mode,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSoftmaxBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t mode,
+                            cudnnNanPropagation_t maxpoolingNanOpt,
+                            int windowHeight,
+                            int windowWidth,
+                            int verticalPadding,
+                            int horizontalPadding,
+                            int verticalStride,
+                            int horizontalStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *windowHeight,
+                            int *windowWidth,
+                            int *verticalPadding,
+                            int *horizontalPadding,
+                            int *verticalStride,
+                            int *horizontalStride) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            const cudnnPoolingMode_t mode,
+                            const cudnnNanPropagation_t maxpoolingNanOpt,
+                            int nbDims,
+                            const int windowDimA[],
+                            const int paddingA[],
+                            const int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            int nbDimsRequested,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *nbDims,
+                            int windowDimA[],
+                            int paddingA[],
+                            int strideA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int nbDims,
+                                  int outputTensorDimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPoolingNdForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int *n,
+                                  int *c,
+                                  int *h,
+                                  int *w) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetPooling2dForwardOutputDim");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPoolingDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(poolingDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingForward(cudnnHandle_t handle,
+                    const cudnnPoolingDescriptor_t poolingDesc,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnPoolingBackward(cudnnHandle_t handle,
+                     const cudnnPoolingDescriptor_t poolingDesc,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnPoolingBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t mode,
+                             cudnnNanPropagation_t reluNanOpt,
+                             double coef) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t *mode,
+                             cudnnNanPropagation_t *reluNanOpt,
+                             double *coef) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc, mode, reluNanOpt, coef);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyActivationDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(activationDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationForward(cudnnHandle_t handle,
+                       cudnnActivationDescriptor_t activationDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t xDesc,
+                       const void *x,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t yDesc,
+                       void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnActivationBackward(cudnnHandle_t handle,
+                        cudnnActivationDescriptor_t activationDesc,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t yDesc,
+                        const void *y,
+                        const cudnnTensorDescriptor_t dyDesc,
+                        const void *dy,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t dxDesc,
+                        void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnActivationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyLRNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lrnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelForward(cudnnHandle_t handle,
+                            cudnnLRNDescriptor_t normDesc,
+                            cudnnLRNMode_t lrnMode,
+                            const void *alpha,
+                            const cudnnTensorDescriptor_t xDesc,
+                            const void *x,
+                            const void *beta,
+                            const cudnnTensorDescriptor_t yDesc,
+                            void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
+                             cudnnLRNDescriptor_t normDesc,
+                             cudnnLRNMode_t lrnMode,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t yDesc,
+                             const void *y,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnLRNCrossChannelBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
+                                  cudnnLRNDescriptor_t normDesc,
+                                  cudnnDivNormMode_t mode,
+                                  const void *alpha,
+                                  const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
+                                  const void *x,
+                                  const void *means, /* if NULL, means are assumed to be zero */
+                                  void *temp,
+                                  void *temp2,
+                                  const void *beta,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
+                                   cudnnLRNDescriptor_t normDesc,
+                                   cudnnDivNormMode_t mode,
+                                   const void *alpha,
+                                   const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                   const void *x,
+                                   const void *means, /* if NULL, means are assumed to be zero */
+                                   const void *dy,
+                                   void *temp,
+                                   void *temp2,
+                                   const void *beta,
+                                   const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
+                                   void *dx,                                   /* output x differential */
+                                   void *dMeans) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDivisiveNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
+                              const cudnnTensorDescriptor_t xDesc,
+                              cudnnBatchNormMode_t mode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDeriveBNTensorDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(derivedBnDesc, xDesc, mode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
+                                                         cudnnBatchNormMode_t mode,
+                                                         cudnnBatchNormOps_t bnOps,
+                                                         const cudnnTensorDescriptor_t xDesc,
+                                                         const cudnnTensorDescriptor_t zDesc,
+                                                         const cudnnTensorDescriptor_t yDesc,
+                                                         const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                                         const cudnnActivationDescriptor_t activationDesc,
+                                                         size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
+                                                  cudnnBatchNormMode_t mode,
+                                                  cudnnBatchNormOps_t bnOps,
+                                                  const cudnnTensorDescriptor_t xDesc,
+                                                  const cudnnTensorDescriptor_t yDesc,
+                                                  const cudnnTensorDescriptor_t dyDesc,
+                                                  const cudnnTensorDescriptor_t dzDesc,
+                                                  const cudnnTensorDescriptor_t dxDesc,
+                                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                                  const cudnnActivationDescriptor_t activationDesc,
+                                                  size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationBackwardExWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, dBnScaleBiasDesc, activationDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
+                                                     cudnnBatchNormMode_t mode,
+                                                     cudnnBatchNormOps_t bnOps,
+                                                     const cudnnActivationDescriptor_t activationDesc,
+                                                     const cudnnTensorDescriptor_t xDesc,
+                                                     size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetBatchNormalizationTrainingExReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTraining(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *x, /* NxCxHxW */
+    const cudnnTensorDescriptor_t yDesc,
+    void *y, /* NxCxHxW */
+
+    /* Shared desc for the next 6 tensors in the argument list.
+       Data type to be set as follows:
+       type = (typeOf(x) == double) ? double : float
+       Dimensions for this descriptor depend on normalization mode
+       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+        (normalization is performed across NxHxW)
+       - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
+        (normalization is performed across N) */
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+
+    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+    const void *bnScale,
+    const void *bnBias,
+
+    /* MUST use factor=1 in the very first call of a complete training cycle.
+       Use a factor=1/(1+n) at N-th call to the function to get
+       Cumulative Moving Average (CMA) behavior
+       CMA[n] = (x[1]+...+x[n])/n
+       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+    double exponentialAverageFactor,
+
+    /* Used in Training phase only.
+       runningMean = newMean*factor + runningMean*(1-factor) */
+    void *resultRunningMean,
+    /* Output in training mode, input in inference. Is the moving average
+       of  variance[x] (factor is applied in the same way as for runningMean) */
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTrainingEx(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+    cudnnBatchNormOps_t bnOps,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *xData,
+    const cudnnTensorDescriptor_t zDesc,
+    const void *zData,
+    const cudnnTensorDescriptor_t yDesc,
+    void *yData,
+
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+    const void *bnScale,
+    const void *bnBias,
+
+    double exponentialAverageFactor,
+    void *resultRunningMean,
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance,
+
+    cudnnActivationDescriptor_t activationDesc,
+    void *workspace,
+    size_t workSpaceSizeInBytes,
+    void *reserveSpace,
+    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardTrainingEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
+                                        cudnnBatchNormMode_t mode,
+                                        const void *alpha, /* alpha[0] = result blend factor */
+                                        const void *beta,  /* beta[0] = dest layer blend factor */
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const void *x, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        void *y, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                        const void *bnScale,
+                                        const void *bnBias,
+                                        const void *estimatedMean,
+                                        const void *estimatedVariance,
+                                        double epsilon) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackward(cudnnHandle_t handle,
+                                cudnnBatchNormMode_t mode,
+                                const void *alphaDataDiff,
+                                const void *betaDataDiff,
+                                const void *alphaParamDiff,
+                                const void *betaParamDiff,
+                                const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
+                                const void *x,
+                                const cudnnTensorDescriptor_t dyDesc,
+                                const void *dy,
+                                const cudnnTensorDescriptor_t dxDesc,
+                                void *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                const void *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void *dBnScaleResult,
+                                void *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void *savedMean,
+                                const void *savedInvVariance) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
+                                  cudnnBatchNormMode_t mode,
+                                  cudnnBatchNormOps_t bnOps,
+
+                                  const void *alphaDataDiff,
+                                  const void *betaDataDiff,
+                                  const void *alphaParamDiff,
+                                  const void *betaParamDiff,
+                                  const cudnnTensorDescriptor_t xDesc,
+                                  const void *xData,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  const void *yData,
+                                  const cudnnTensorDescriptor_t dyDesc,
+                                  const void *dyData,
+                                  const cudnnTensorDescriptor_t dzDesc,
+                                  void *dzData,
+                                  const cudnnTensorDescriptor_t dxDesc,
+                                  void *dxData,
+
+                                  /* Shared tensor desc for the 4 tensors below */
+                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                  const void *bnScaleData,
+                                  const void *bnBiasData, /* needed if there is activation */
+                                  void *dBnScaleData,
+                                  void *dBnBiasData,
+                                  double epsilon, /* Same epsilon as forward pass */
+
+                                  /* Optionally cached intermediate results from
+                                     forward pass */
+                                  const void *savedMean,
+                                  const void *savedInvVariance,
+                                  cudnnActivationDescriptor_t activationDesc,
+                                  void *workSpace,
+                                  size_t workSpaceSizeInBytes,
+                                  void *reserveSpace,
+                                  size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, double, const void *, const void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnBatchNormalizationBackwardEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateSpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
+                                       cudnnSamplerType_t samplerType,
+                                       cudnnDataType_t dataType,
+                                       const int nbDims,
+                                       const int dimA[]) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetSpatialTransformerNdDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc, samplerType, dataType, nbDims, dimA);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroySpatialTransformerDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
+                                   const cudnnSpatialTransformerDescriptor_t stDesc,
+                                   const void *theta,
+                                   void *grid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, theta, grid);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
+                                    const cudnnSpatialTransformerDescriptor_t stDesc,
+                                    const void *dgrid,
+                                    void *dtheta) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfGridGeneratorBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, dgrid, dtheta);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
+                             cudnnSpatialTransformerDescriptor_t stDesc,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *grid,
+                             const void *beta,
+                             cudnnTensorDescriptor_t yDesc,
+                             void *y) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
+                              cudnnSpatialTransformerDescriptor_t stDesc,
+                              const void *alpha,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const void *x,
+                              const void *beta,
+                              const cudnnTensorDescriptor_t dxDesc,
+                              void *dx,
+                              const void *alphaDgrid,
+                              const cudnnTensorDescriptor_t dyDesc,
+                              const void *dy,
+                              const void *grid,
+                              const void *betaDgrid,
+                              void *dgrid) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSpatialTfSamplerBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetStatesSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutGetReserveSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(xdesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float dropout,
+                          void *states,
+                          size_t stateSizeInBytes,
+                          unsigned long long seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                              cudnnHandle_t handle,
+                              float dropout,
+                              void *states,
+                              size_t stateSizeInBytes,
+                              unsigned long long seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float *dropout,
+                          void **states,
+                          unsigned long long *seed) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetDropoutDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(dropoutDesc, handle, dropout, states, seed);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutForward(cudnnHandle_t handle,
+                    const cudnnDropoutDescriptor_t dropoutDesc,
+                    const cudnnTensorDescriptor_t xdesc,
+                    const void *x,
+                    const cudnnTensorDescriptor_t ydesc,
+                    void *y,
+                    void *reserveSpace,
+                    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutForward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutBackward(cudnnHandle_t handle,
+                     const cudnnDropoutDescriptor_t dropoutDesc,
+                     const cudnnTensorDescriptor_t dydesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dxdesc,
+                     void *dx,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDropoutBackward");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardInferenceAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
+                                        const cudnnRNNDescriptor_t rnnDesc,
+                                        const int seqLength,
+                                        const cudnnTensorDescriptor_t *xDesc,
+                                        const void *x,
+                                        const cudnnTensorDescriptor_t hxDesc,
+                                        const void *hx,
+                                        const cudnnTensorDescriptor_t cxDesc,
+                                        const void *cx,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const void *w,
+                                        const cudnnTensorDescriptor_t *yDesc,
+                                        void *y,
+                                        const cudnnTensorDescriptor_t hyDesc,
+                                        void *hy,
+                                        const cudnnTensorDescriptor_t cyDesc,
+                                        void *cy,
+                                        const float findIntensity,
+                                        const int requestedAlgoCount,
+                                        int *returnedAlgoCount,
+                                        cudnnAlgorithmPerformance_t *perfResults,
+                                        void *workspace,
+                                        size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardInferenceAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNForwardTrainingAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t cxDesc,
+                                       const void *cx,
+                                       const cudnnFilterDescriptor_t wDesc,
+                                       const void *w,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       void *y,
+                                       const cudnnTensorDescriptor_t hyDesc,
+                                       void *hy,
+                                       const cudnnTensorDescriptor_t cyDesc,
+                                       void *cy,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNForwardTrainingAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardDataAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
+                                    const cudnnRNNDescriptor_t rnnDesc,
+                                    const int seqLength,
+                                    const cudnnTensorDescriptor_t *yDesc,
+                                    const void *y,
+                                    const cudnnTensorDescriptor_t *dyDesc,
+                                    const void *dy,
+                                    const cudnnTensorDescriptor_t dhyDesc,
+                                    const void *dhy,
+                                    const cudnnTensorDescriptor_t dcyDesc,
+                                    const void *dcy,
+                                    const cudnnFilterDescriptor_t wDesc,
+                                    const void *w,
+                                    const cudnnTensorDescriptor_t hxDesc,
+                                    const void *hx,
+                                    const cudnnTensorDescriptor_t cxDesc,
+                                    const void *cx,
+                                    const cudnnTensorDescriptor_t *dxDesc,
+                                    void *dx,
+                                    const cudnnTensorDescriptor_t dhxDesc,
+                                    void *dhx,
+                                    const cudnnTensorDescriptor_t dcxDesc,
+                                    void *dcx,
+                                    const float findIntensity,
+                                    const int requestedAlgoCount,
+                                    int *returnedAlgoCount,
+                                    cudnnAlgorithmPerformance_t *perfResults,
+                                    void *workspace,
+                                    size_t workSpaceSizeInBytes,
+                                    void *reserveSpace,
+                                    size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardDataAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNBackwardWeightsAlgorithmMaxCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, count);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
+                                       const cudnnRNNDescriptor_t rnnDesc,
+                                       const int seqLength,
+                                       const cudnnTensorDescriptor_t *xDesc,
+                                       const void *x,
+                                       const cudnnTensorDescriptor_t hxDesc,
+                                       const void *hx,
+                                       const cudnnTensorDescriptor_t *yDesc,
+                                       const void *y,
+                                       const float findIntensity,
+                                       const int requestedAlgoCount,
+                                       int *returnedAlgoCount,
+                                       cudnnAlgorithmPerformance_t *perfResults,
+                                       const void *workspace,
+                                       size_t workSpaceSizeInBytes,
+                                       const cudnnFilterDescriptor_t dwDesc,
+                                       void *dw,
+                                       const void *reserveSpace,
+                                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnFindRNNBackwardWeightsAlgorithmEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                             const int minibatch,
+                             const cudnnDataType_t dataType,
+                             cudnnPersistentRNNPlan_t *plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreatePersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, minibatch, dataType, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyPersistentRNNPlan");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor(cudnnHandle_t handle,
+                      cudnnRNNDescriptor_t rnnDesc,
+                      const int hiddenSize,
+                      const int numLayers,
+                      cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */
+                      cudnnRNNInputMode_t inputMode,
+                      cudnnDirectionMode_t direction,
+                      cudnnRNNMode_t mode,
+                      cudnnRNNAlgo_t algo,
+                      cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
+                            cudnnRNNDescriptor_t rnnDesc,
+                            const int recProjSize,
+                            const int outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
+                            const cudnnRNNDescriptor_t rnnDesc,
+                            int *recProjSize,
+                            int *outProjSize) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNProjectionLayers");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, recProjSize, outProjSize);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDescriptor(cudnnHandle_t handle,
+                      cudnnRNNDescriptor_t rnnDesc,
+                      int *hiddenSize,
+                      int *numLayers,
+                      cudnnDropoutDescriptor_t *dropoutDesc,
+                      cudnnRNNInputMode_t *inputMode,
+                      cudnnDirectionMode_t *direction,
+                      cudnnRNNMode_t *mode,
+                      cudnnRNNAlgo_t *algo,
+                      cudnnDataType_t *dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNMatrixMathType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, mType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
+                               const cudnnRNNDescriptor_t rnnDesc,
+                               const int seqLength,
+                               const cudnnTensorDescriptor_t *xDesc,
+                               size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNTrainingReserveSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNParamsSize(cudnnHandle_t handle,
+                      const cudnnRNNDescriptor_t rnnDesc,
+                      const cudnnTensorDescriptor_t xDesc,
+                      size_t *sizeInBytes,
+                      cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNParamsSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
+                                const cudnnRNNDescriptor_t rnnDesc,
+                                const int pseudoLayer,
+                                const cudnnTensorDescriptor_t xDesc,
+                                const cudnnFilterDescriptor_t wDesc,
+                                const void *w,
+                                const int linLayerID,
+                                cudnnFilterDescriptor_t linLayerMatDesc,
+                                void **linLayerMat) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerMatrixParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
+                              const cudnnRNNDescriptor_t rnnDesc,
+                              const int pseudoLayer,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const cudnnFilterDescriptor_t wDesc,
+                              const void *w,
+                              const int linLayerID,
+                              cudnnFilterDescriptor_t linLayerBiasDesc,
+                              void **linLayerBias) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNLinLayerBiasParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInference(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         const void *x,
+                         const cudnnTensorDescriptor_t hxDesc,
+                         const void *hx,
+                         const cudnnTensorDescriptor_t cxDesc,
+                         const void *cx,
+                         const cudnnFilterDescriptor_t wDesc,
+                         const void *w,
+                         const cudnnTensorDescriptor_t *yDesc,
+                         void *y,
+                         const cudnnTensorDescriptor_t hyDesc,
+                         void *hy,
+                         const cudnnTensorDescriptor_t cyDesc,
+                         void *cy,
+                         void *workspace,
+                         size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInference");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTraining(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t cxDesc,
+                        const void *cx,
+                        const cudnnFilterDescriptor_t wDesc,
+                        const void *w,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        void *y,
+                        const cudnnTensorDescriptor_t hyDesc,
+                        void *hy,
+                        const cudnnTensorDescriptor_t cyDesc,
+                        void *cy,
+                        void *workspace,
+                        size_t workSpaceSizeInBytes,
+                        void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTraining");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardData(cudnnHandle_t handle,
+                     const cudnnRNNDescriptor_t rnnDesc,
+                     const int seqLength,
+                     const cudnnTensorDescriptor_t *yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t *dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dhyDesc,
+                     const void *dhy,
+                     const cudnnTensorDescriptor_t dcyDesc,
+                     const void *dcy,
+                     const cudnnFilterDescriptor_t wDesc,
+                     const void *w,
+                     const cudnnTensorDescriptor_t hxDesc,
+                     const void *hx,
+                     const cudnnTensorDescriptor_t cxDesc,
+                     const void *cx,
+                     const cudnnTensorDescriptor_t *dxDesc,
+                     void *dx,
+                     const cudnnTensorDescriptor_t dhxDesc,
+                     void *dhx,
+                     const cudnnTensorDescriptor_t dcxDesc,
+                     void *dcx,
+                     void *workspace,
+                     size_t workSpaceSizeInBytes,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardData");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeights(cudnnHandle_t handle,
+                        const cudnnRNNDescriptor_t rnnDesc,
+                        const int seqLength,
+                        const cudnnTensorDescriptor_t *xDesc,
+                        const void *x,
+                        const cudnnTensorDescriptor_t hxDesc,
+                        const void *hx,
+                        const cudnnTensorDescriptor_t *yDesc,
+                        const void *y,
+                        const void *workspace,
+                        size_t workSpaceSizeInBytes,
+                        const cudnnFilterDescriptor_t dwDesc,
+                        void *dw,
+                        const void *reserveSpace,
+                        size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeights");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc, compType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyCTCLossDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ctcLossDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int *labels, /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    const void *gradients,   /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    void *workspace,              /* pointer to the workspace, in GPU memory */
+    size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCTCLoss");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    const int *labels,                           /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    size_t *sizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCTCLossWorkspaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc, algorithm);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCopyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(src, dest);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToCreate);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t algoDesc,
+                             cudnnStatus_t status,
+                             float time,
+                             size_t memory) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
+                             cudnnAlgorithmDescriptor_t *algoDesc,
+                             cudnnStatus_t *status,
+                             float *time,
+                             size_t *memory) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, algoDesc, status, time, memory);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyAlgorithmPerformance");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(algoPerf, numberToDestroy);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetAlgorithmSpaceSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSaveAlgorithm(cudnnHandle_t handle,
+                   cudnnAlgorithmDescriptor_t algoDesc,
+                   void *algoSpace,
+                   size_t algoSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSaveAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreAlgorithm(cudnnHandle_t handle,
+                      void *algoSpace,
+                      size_t algoSpaceSizeInBytes,
+                      cudnnAlgorithmDescriptor_t algoDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRestoreAlgorithm");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t clipMode,
+                cudnnNanPropagation_t clipNanOpt,
+                double lclip,
+                double rclip) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNSetClip");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t *clipMode,
+                cudnnNanPropagation_t *clipNanOpt,
+                double *lclip,
+                double *rclip) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNGetClip");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(mask, udata, fptr);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNPaddingMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, paddingMode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNPaddingMode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, paddingMode);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnCreateRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnDestroyRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc,
+                          cudnnDataType_t dataType,
+                          cudnnRNNDataLayout_t layout,
+                          int maxSeqLength,
+                          int batchSize,
+                          int vectorSize,
+                          const int seqLengthArray[], /* length of each sequence in the batch */
+                          void *paddingFill) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc,
+                          cudnnDataType_t *dataType,
+                          cudnnRNNDataLayout_t *layout,
+                          int *maxSeqLength,
+                          int *batchSize,
+                          int *vectorSize,
+                          int arrayLengthRequested,
+                          int seqLengthArray[],
+                          void *paddingFill) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnGetRNNDataDescriptor");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnTensorDescriptor_t cxDesc,
+                          const void *cx,
+                          const cudnnFilterDescriptor_t wDesc,
+                          const void *w,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          void *y,
+                          const cudnnTensorDescriptor_t hyDesc,
+                          void *hy,
+                          const cudnnTensorDescriptor_t cyDesc,
+                          void *cy,
+                          const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                          const void *keys,                     /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                          void *cAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                          void *iAttn,                          /* reserved, should pass NULL */
+                          const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                          void *queries,                        /* reserved, should pass NULL */
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardTrainingEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
+                           const cudnnRNNDescriptor_t rnnDesc,
+                           const cudnnRNNDataDescriptor_t xDesc,
+                           const void *x,
+                           const cudnnTensorDescriptor_t hxDesc,
+                           const void *hx,
+                           const cudnnTensorDescriptor_t cxDesc,
+                           const void *cx,
+                           const cudnnFilterDescriptor_t wDesc,
+                           const void *w,
+                           const cudnnRNNDataDescriptor_t yDesc,
+                           void *y,
+                           const cudnnTensorDescriptor_t hyDesc,
+                           void *hy,
+                           const cudnnTensorDescriptor_t cyDesc,
+                           void *cy,
+                           const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                           const void *keys,                     /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                           void *cAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                           void *iAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                           void *queries,                        /* reserved, should pass NULL */
+                           void *workSpace,
+                           size_t workSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNForwardInferenceEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardDataEx(cudnnHandle_t handle,
+                       const cudnnRNNDescriptor_t rnnDesc,
+                       const cudnnRNNDataDescriptor_t yDesc,
+                       const void *y,
+                       const cudnnRNNDataDescriptor_t dyDesc,
+                       const void *dy,
+                       const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
+                       const void *dcAttn,                    /* reserved, should pass NULL */
+                       const cudnnTensorDescriptor_t dhyDesc,
+                       const void *dhy,
+                       const cudnnTensorDescriptor_t dcyDesc,
+                       const void *dcy,
+                       const cudnnFilterDescriptor_t wDesc,
+                       const void *w,
+                       const cudnnTensorDescriptor_t hxDesc,
+                       const void *hx,
+                       const cudnnTensorDescriptor_t cxDesc,
+                       const void *cx,
+                       const cudnnRNNDataDescriptor_t dxDesc,
+                       void *dx,
+                       const cudnnTensorDescriptor_t dhxDesc,
+                       void *dhx,
+                       const cudnnTensorDescriptor_t dcxDesc,
+                       void *dcx,
+                       const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
+                       void *dkeys,                           /* reserved, should pass NULL */
+                       void *workSpace,
+                       size_t workSpaceSizeInBytes,
+                       void *reserveSpace,
+                       size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardDataEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
+                          const cudnnRNNDescriptor_t rnnDesc,
+                          const cudnnRNNDataDescriptor_t xDesc,
+                          const void *x,
+                          const cudnnTensorDescriptor_t hxDesc,
+                          const void *hx,
+                          const cudnnRNNDataDescriptor_t yDesc,
+                          const void *y,
+                          void *workSpace,
+                          size_t workSpaceSizeInBytes,
+                          const cudnnFilterDescriptor_t dwDesc,
+                          void *dw,
+                          void *reserveSpace,
+                          size_t reserveSpaceSizeInBytes) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnRNNBackwardWeightsEx");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         const int hiddenSize,
+                         const int numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDirectionMode_t direction,
+                         cudnnRNNMode_t mode,
+                         cudnnRNNAlgo_t algo,
+                         cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v6");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType);
+}
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc,
+                         int hiddenSize,
+                         int numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDirectionMode_t direction,
+                         cudnnRNNMode_t mode,
+                         cudnnDataType_t dataType) {
+  using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudnnSetRNNDescriptor_v5");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_stub.cc b/tensorflow/stream_executor/cuda/cudnn_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2727c215e8cd0b112300102f70b58ba832505c8f
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_stub.cc
@@ -0,0 +1,62 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "cuda/include/cudnn.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+// Implements the cuDNN API by forwarding to cuDNN loaded from the DSO.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or = stream_executor::internal::DsoLoader::GetCudnnDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+#endif
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    stream_executor::port::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return reinterpret_cast<T>(symbol);
+}
+
+cudnnStatus_t GetSymbolNotFoundError() { return CUDNN_STATUS_INTERNAL_ERROR; }
+}  // namespace
+
+#if CUDNN_MAJOR < 6
+#error cuDNN version earlier than 6 is not supported.
+#elif CUDNN_MAJOR < 7
+#include "tensorflow/stream_executor/cuda/cudnn_6_0.inc"
+#elif CUDNN_MINOR < 1
+#include "tensorflow/stream_executor/cuda/cudnn_7_0.inc"
+#elif CUDNN_MINOR < 3
+#include "tensorflow/stream_executor/cuda/cudnn_7_1.inc"
+#elif CUDNN_MINOR < 4
+#include "tensorflow/stream_executor/cuda/cudnn_7_3.inc"
+#else
+#include "tensorflow/stream_executor/cuda/cudnn_7_4.inc"
+#endif
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
index e8fcc0361850a561928d09f29f78fb57071c24b2..9ef8bc95e5644ed060d88335de4f9d1abd5f719d 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version) {
@@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
            loaded_version.minor_version >= source_version.minor_version));
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 6464e7f8e8755b5b46b90a4b35d50509eb0cfde7..4607a9bff87bf29a00a9f5e0f112f9389fa12972 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 struct CudnnVersion {
   CudnnVersion() = default;
@@ -44,7 +44,7 @@ struct CudnnVersion {
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version);
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 7d4c6399d040e9bcddff5d98d202ab00fdeffa58..cfe114662d4515c68ffdab46918db09f631e9343 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 TEST(CuDNNVersion, ToString) {
@@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
 }
 
 }  // namespace
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cufft_10_0.inc b/tensorflow/stream_executor/cuda/cufft_10_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..19ae08815f2dc7d26dfe527addeeb89a249530f1
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cufft_10_0.inc
@@ -0,0 +1,320 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, 
+                                 int nx, 
+                                 cufftType type, 
+                                 int batch) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle *, int, cufftType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan1d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, type, batch);
+}
+
+cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, 
+                                 int nx, int ny,
+                                 cufftType type) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle *, int, int, cufftType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan2d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, ny, type);
+}
+
+cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, 
+                                 int nx, int ny, int nz, 
+                                 cufftType type) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle *, int, int, int, cufftType);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlan3d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, ny, nz, type);
+}
+
+cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
+                                   int rank,
+                                   int *n,
+                                   int *inembed, int istride, int idist,
+                                   int *onembed, int ostride, int odist,
+                                   cufftType type,
+                                   int batch) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle *, int, int *, int *, int, int, int *, int, int, cufftType, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftPlanMany");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch);
+}
+
+cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, 
+                                     int nx, 
+                                     cufftType type, 
+                                     int batch,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan1d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan, 
+                                     int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan2d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, ny, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan, 
+                                     int nx, int ny, int nz, 
+                                     cufftType type,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlan3d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, nx, ny, nz, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
+                                       int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *, int, int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan, 
+                                         int rank, 
+                                         long long int *n,
+                                         long long int *inembed, 
+                                         long long int istride, 
+                                         long long int idist,
+                                         long long int *onembed, 
+                                         long long int ostride, long long int odist,
+                                         cufftType type, 
+                                         long long int batch,
+                                         size_t * workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, long long *, long long *, long long, long long, long long *, long long, long long, cufftType, long long, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftMakePlanMany64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
+                                        int rank,
+                                        long long int *n,
+                                        long long int *inembed, 
+                                        long long int istride, long long int idist,
+                                        long long int *onembed, 
+                                        long long int ostride, long long int odist,
+                                        cufftType type,
+                                        long long int batch,
+                                        size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, long long *, long long *, long long, long long, long long *, long long, long long, cufftType, long long, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftEstimate1d(int nx, 
+                                     cufftType type, 
+                                     int batch,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate1d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(nx, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate2d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(nx, ny, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz, 
+                                     cufftType type,
+                                     size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(int, int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimate3d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(nx, ny, nz, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftEstimateMany(int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(int, int *, int *, int, int, int *, int, int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftEstimateMany");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftCreate(cufftHandle * handle) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle);
+}
+
+cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle, 
+                                    int nx, 
+                                    cufftType type, 
+                                    int batch,
+                                    size_t *workSize ) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize1d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, nx, type, batch, workSize);
+}
+
+cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle, 
+                                    int nx, int ny,
+                                    cufftType type,
+                                    size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize2d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, nx, ny, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
+                                    int nx, int ny, int nz, 
+                                    cufftType type,
+                                    size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int, int, cufftType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize3d");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, nx, ny, nz, type, workSize);
+}
+
+cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle, 
+                                      int rank, int *n,
+                                      int *inembed, int istride, int idist,
+                                      int *onembed, int ostride, int odist,
+                                      cufftType type, int batch, size_t *workArea) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int, int *, int *, int, int, int *, int, int, cufftType, int, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSizeMany");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch, workArea);
+}
+
+cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, workSize);
+}
+
+cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetWorkArea");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, workArea);
+}
+
+cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetAutoAllocation");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, autoAllocate);
+}
+
+cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, 
+                                  cufftComplex *idata,
+                                  cufftComplex *odata,
+                                  int direction) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftComplex *, cufftComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2C");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata, direction);
+}
+
+cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, 
+                                  cufftReal *idata,
+                                  cufftComplex *odata) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftReal *, cufftComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecR2C");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata);
+}
+
+cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, 
+                                  cufftComplex *idata,
+                                  cufftReal *odata) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftComplex *, cufftReal *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecC2R");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata);
+}
+
+cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan, 
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleComplex *odata,
+                                  int direction) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftDoubleComplex *, cufftDoubleComplex *, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2Z");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata, direction);
+}
+
+cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, 
+                                  cufftDoubleReal *idata,
+                                  cufftDoubleComplex *odata) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftDoubleReal *, cufftDoubleComplex *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecD2Z");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata);
+}
+
+cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, 
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleReal *odata) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cufftDoubleComplex *, cufftDoubleReal *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftExecZ2D");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, idata, odata);
+}
+
+cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
+                                    cudaStream_t stream) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan, stream);
+}
+
+cufftResult CUFFTAPI cufftDestroy(cufftHandle plan) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(cufftHandle);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftDestroy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(plan);
+}
+
+cufftResult CUFFTAPI cufftGetVersion(int *version) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(version);
+}
+
+cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
+                                      int *value) {
+  using FuncPtr = cufftResult (CUFFTAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cufftGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cufft_stub.cc b/tensorflow/stream_executor/cuda/cufft_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c15d98730eb81244415b0744df2832a275435bdd
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cufft_stub.cc
@@ -0,0 +1,50 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "cuda/include/cufft.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+// Implements the cuFFT API by forwarding to cuFFT loaded from the DSO.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or = stream_executor::internal::DsoLoader::GetCufftDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+#endif
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    stream_executor::port::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return reinterpret_cast<T>(symbol);
+}
+
+cufftResult GetSymbolNotFoundError() { return CUFFT_INTERNAL_ERROR; }
+}  // namespace
+
+#include "tensorflow/stream_executor/cuda/cufft_10_0.inc"
diff --git a/tensorflow/stream_executor/cuda/curand_10_0.inc b/tensorflow/stream_executor/cuda/curand_10_0.inc
new file mode 100644
index 0000000000000000000000000000000000000000..e6024e2bb3b6173cb36504500be61ad5d05baef0
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/curand_10_0.inc
@@ -0,0 +1,246 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+curandStatus_t CURANDAPI 
+curandCreateGenerator(curandGenerator_t *generator, curandRngType_t rng_type) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t *, curandRngType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGenerator");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, rng_type);
+}
+
+curandStatus_t CURANDAPI 
+curandCreateGeneratorHost(curandGenerator_t *generator, curandRngType_t rng_type) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t *, curandRngType_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreateGeneratorHost");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, rng_type);
+}
+
+curandStatus_t CURANDAPI 
+curandDestroyGenerator(curandGenerator_t generator) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyGenerator");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator);
+}
+
+curandStatus_t CURANDAPI
+curandGetVersion(int *version) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetVersion");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(version);
+}
+
+curandStatus_t CURANDAPI
+curandGetProperty(libraryPropertyType type, int *value) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(libraryPropertyType, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetProperty");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(type, value);
+}
+
+curandStatus_t CURANDAPI
+curandSetStream(curandGenerator_t generator, cudaStream_t stream) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, cudaStream_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetStream");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, stream);
+}
+
+curandStatus_t CURANDAPI 
+curandSetPseudoRandomGeneratorSeed(curandGenerator_t generator, unsigned long long seed) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetPseudoRandomGeneratorSeed");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, seed);
+}
+
+curandStatus_t CURANDAPI 
+curandSetGeneratorOffset(curandGenerator_t generator, unsigned long long offset) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned long long);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOffset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, offset);
+}
+
+curandStatus_t CURANDAPI 
+curandSetGeneratorOrdering(curandGenerator_t generator, curandOrdering_t order) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, curandOrdering_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetGeneratorOrdering");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, order);
+}
+
+curandStatus_t CURANDAPI
+curandSetQuasiRandomGeneratorDimensions(curandGenerator_t generator, unsigned int num_dimensions) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandSetQuasiRandomGeneratorDimensions");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, num_dimensions);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerate(curandGenerator_t generator, unsigned int *outputPtr, size_t num) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateLongLong(curandGenerator_t generator, unsigned long long *outputPtr, size_t num) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned long long *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLongLong");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateUniform(curandGenerator_t generator, float *outputPtr, size_t num) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, float *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniform");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateUniformDouble(curandGenerator_t generator, double *outputPtr, size_t num) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, double *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateUniformDouble");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateNormal(curandGenerator_t generator, float *outputPtr, 
+                     size_t n, float mean, float stddev) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, float *, size_t, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormal");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, mean, stddev);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateNormalDouble(curandGenerator_t generator, double *outputPtr, 
+                     size_t n, double mean, double stddev) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, double *, size_t, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateNormalDouble");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, mean, stddev);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateLogNormal(curandGenerator_t generator, float *outputPtr, 
+                     size_t n, float mean, float stddev) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, float *, size_t, float, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormal");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, mean, stddev);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr, 
+                     size_t n, double mean, double stddev) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, double *, size_t, double, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateLogNormalDouble");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, mean, stddev);
+}
+
+curandStatus_t CURANDAPI
+curandCreatePoissonDistribution(double lambda, curandDiscreteDistribution_t *discrete_distribution) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(double, curandDiscreteDistribution_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandCreatePoissonDistribution");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(lambda, discrete_distribution);
+}
+
+curandStatus_t CURANDAPI
+curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandDiscreteDistribution_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandDestroyDistribution");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(discrete_distribution);
+}
+
+curandStatus_t CURANDAPI
+curandGeneratePoisson(curandGenerator_t generator, unsigned int *outputPtr,
+                     size_t n, double lambda) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int *, size_t, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoisson");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, lambda);
+}
+
+curandStatus_t CURANDAPI
+curandGeneratePoissonMethod(curandGenerator_t generator, unsigned int *outputPtr,
+                     size_t n, double lambda, curandMethod_t method) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int *, size_t, double, curandMethod_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGeneratePoissonMethod");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, n, lambda, method);
+}
+
+curandStatus_t CURANDAPI
+curandGenerateBinomial(curandGenerator_t generator, unsigned int *outputPtr,
+                       size_t num, unsigned int n, double p) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int *, size_t, unsigned int, double);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomial");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num, n, p);
+}
+
+curandStatus_t CURANDAPI
+curandGenerateBinomialMethod(curandGenerator_t generator,
+                             unsigned int *outputPtr,
+                             size_t num, unsigned int n, double p,
+                             curandMethod_t method) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t, unsigned int *, size_t, unsigned int, double, curandMethod_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateBinomialMethod");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator, outputPtr, num, n, p, method);
+}
+
+curandStatus_t CURANDAPI 
+curandGenerateSeeds(curandGenerator_t generator) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandGenerator_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGenerateSeeds");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(generator);
+}
+
+curandStatus_t CURANDAPI
+curandGetDirectionVectors32(curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandDirectionVectors32_t *[], curandDirectionVectorSet_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors32");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(vectors, set);
+}
+
+curandStatus_t CURANDAPI
+curandGetScrambleConstants32(unsigned int * * constants) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(unsigned int **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants32");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(constants);
+}
+
+curandStatus_t CURANDAPI
+curandGetDirectionVectors64(curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(curandDirectionVectors64_t *[], curandDirectionVectorSet_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetDirectionVectors64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(vectors, set);
+}
+
+curandStatus_t CURANDAPI
+curandGetScrambleConstants64(unsigned long long * * constants) {
+  using FuncPtr = curandStatus_t (CURANDAPI *)(unsigned long long **);
+  static auto func_ptr = LoadSymbol<FuncPtr>("curandGetScrambleConstants64");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(constants);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/curand_stub.cc b/tensorflow/stream_executor/cuda/curand_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd70384d643c60e410e504a12b8b125fd347352b
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/curand_stub.cc
@@ -0,0 +1,50 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "cuda/include/curand.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+// Implements the cuRAND API by forwarding to cuRAND loaded from the DSO.
+
+namespace {
+// Returns DSO handle or null if loading the DSO fails.
+void* GetDsoHandle() {
+#ifdef PLATFORM_GOOGLE
+  return nullptr;
+#else
+  static auto handle = []() -> void* {
+    auto handle_or = stream_executor::internal::DsoLoader::GetCurandDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+#endif
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  if (auto handle = GetDsoHandle()) {
+    stream_executor::port::Env::Default()
+        ->GetSymbolFromLibrary(handle, symbol_name, &symbol)
+        .IgnoreError();
+  }
+  return reinterpret_cast<T>(symbol);
+}
+
+curandStatus_t GetSymbolNotFoundError() { return CURAND_STATUS_INTERNAL_ERROR; }
+}  // namespace
+
+#include "tensorflow/stream_executor/cuda/curand_10_0.inc"
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 0b991b7ba8cdad7f342adc6c8ff25b88d91e2bd2..3247665485eda92e409ff6ee7c20261256fee2ed 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -37,6 +37,7 @@ DeviceDescription::DeviceDescription()
                         kUninitializedUint64),
       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
+      blocks_per_core_limit_(kUninitializedUint64),
       threads_per_core_limit_(kUninitializedUint64),
       threads_per_block_limit_(kUninitializedUint64),
       threads_per_warp_(kUninitializedUint64),
@@ -50,6 +51,7 @@ DeviceDescription::DeviceDescription()
       clock_rate_ghz_(-1.0),
       cuda_compute_capability_major_(-1),
       cuda_compute_capability_minor_(-1),
+      rocm_amdgpu_isa_version_(-1),
       numa_node_(-1),
       core_count_(-1),
       ecc_enabled_(false) {}
@@ -112,6 +114,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
   return cuda_compute_capability_major_ != 0;
 }
 
+bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
+  bool status = false;
+  if (rocm_amdgpu_isa_version_ > 0) {
+    *version = rocm_amdgpu_isa_version_;
+    status = true;
+  }
+  return status;
+}
+
 bool ThreadDimOk(const DeviceDescription &device_description,
                  const ThreadDim &thread_dim) {
   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
@@ -137,8 +148,8 @@ uint64 DivideCeil(uint64 x, uint64 y) {
 }
 
 void CalculateDimensionality(const DeviceDescription &device_description,
-                             uint64 element_count, uint64 *threads_per_block,
-                             uint64 *block_count) {
+                             int64 element_count, int64 *threads_per_block,
+                             int64 *block_count) {
   *threads_per_block = device_description.threads_per_block_limit();
   *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
   if (*block_count == 1) {
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 8ddf18629d554112631c3d9c09dbb7afd8505c76..356b605a00cc7f57fe37f805e3da11e6acce5e49 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -78,45 +78,49 @@ class DeviceDescription {
   // legitimate kernel launch request.
   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
 
+  // Returns the maximum number of simultaneously resident blocks
+  // on a multiprocessor.
+  int64 blocks_per_core_limit() const { return blocks_per_core_limit_; }
+
   // Returns the limit on the total number of threads that can be launched in a
   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
   // This limit affects what constitutes a legitimate kernel launch request.
-  const uint64 &threads_per_block_limit() const {
+  const int64 &threads_per_block_limit() const {
     return threads_per_block_limit_;
   }
 
   // Returns the limit on the total number of threads that can be simultaneously
   // launched on a given multiprocessor.
-  const uint64 &threads_per_core_limit() const {
+  const int64 &threads_per_core_limit() const {
     return threads_per_core_limit_;
   }
 
   // Returns the number of threads per warp/wavefront.
-  const uint64 &threads_per_warp() const { return threads_per_warp_; }
+  const int64 &threads_per_warp() const { return threads_per_warp_; }
 
   // Returns the limit on the total number of registers per core.
-  const uint64 &registers_per_core_limit() const {
+  const int64 &registers_per_core_limit() const {
     return registers_per_core_limit_;
   }
 
   // Returns the limit on the total number of registers that can be
   // simultaneously used by a block.
-  const uint64 &registers_per_block_limit() const {
+  const int64 &registers_per_block_limit() const {
     return registers_per_block_limit_;
   }
 
   // Returns the number of address bits available to kernel code running on the
   // platform. This affects things like the maximum allocation size and perhaps
   // types used in kernel code such as size_t.
-  const uint64 &device_address_bits() const { return device_address_bits_; }
+  const int64 &device_address_bits() const { return device_address_bits_; }
 
   // Returns the device memory size in bytes.
-  uint64 device_memory_size() const { return device_memory_size_; }
+  int64 device_memory_size() const { return device_memory_size_; }
 
   // Returns the device's memory bandwidth in bytes/sec.  (This is for
   // reads/writes to/from the device's own memory, not for transfers between the
   // host and device.)
-  uint64 memory_bandwidth() const { return memory_bandwidth_; }
+  int64 memory_bandwidth() const { return memory_bandwidth_; }
 
   // Returns the device's core clock rate in GHz.
   float clock_rate_ghz() const { return clock_rate_ghz_; }
@@ -133,14 +137,19 @@ class DeviceDescription {
   // zero, and the return value will be false.
   bool cuda_compute_capability(int *major, int *minor) const;
 
+  // Returns the AMDGPU ISA version if we're running on the ROCm platform.
+  // If the information is not available, the version is not modified,
+  // and the return value will be false.
+  bool rocm_amdgpu_isa_version(int *version) const;
+
   // Returns the maximum amount of shared memory present on a single core
   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
   // devices). Note that some devices, such as NVIDIA's have a configurable
   // partitioning between shared memory and L1 cache.
-  uint64 shared_memory_per_core() const { return shared_memory_per_core_; }
+  int64 shared_memory_per_core() const { return shared_memory_per_core_; }
 
   // Returns the maximum amount of shared memory available for a single block.
-  uint64 shared_memory_per_block() const { return shared_memory_per_block_; }
+  int64 shared_memory_per_block() const { return shared_memory_per_block_; }
 
   // TODO(leary): resident blocks per core will be useful.
 
@@ -174,20 +183,22 @@ class DeviceDescription {
   ThreadDim thread_dim_limit_;
   BlockDim block_dim_limit_;
 
-  uint64 threads_per_core_limit_;
-  uint64 threads_per_block_limit_;
-  uint64 threads_per_warp_;
+  int64 blocks_per_core_limit_;
 
-  uint64 registers_per_core_limit_;
-  uint64 registers_per_block_limit_;
+  int64 threads_per_core_limit_;
+  int64 threads_per_block_limit_;
+  int64 threads_per_warp_;
 
-  uint64 device_address_bits_;
-  uint64 device_memory_size_;
-  uint64 memory_bandwidth_;
+  int64 registers_per_core_limit_;
+  int64 registers_per_block_limit_;
+
+  int64 device_address_bits_;
+  int64 device_memory_size_;
+  int64 memory_bandwidth_;
 
   // Shared memory limits on a given device.
-  uint64 shared_memory_per_core_;
-  uint64 shared_memory_per_block_;
+  int64 shared_memory_per_core_;
+  int64 shared_memory_per_block_;
 
   float clock_rate_ghz_;
 
@@ -195,6 +206,9 @@ class DeviceDescription {
   int cuda_compute_capability_major_;
   int cuda_compute_capability_minor_;
 
+  // ROCM AMDGPU ISA version, 0 if not available.
+  int rocm_amdgpu_isa_version_;
+
   int numa_node_;
   int core_count_;
   bool ecc_enabled_;
@@ -237,30 +251,34 @@ class DeviceDescriptionBuilder {
     device_description_->block_dim_limit_ = value;
   }
 
-  void set_threads_per_core_limit(uint64 value) {
+  void set_blocks_per_core_limit(int64 value) {
+    device_description_->blocks_per_core_limit_ = value;
+  }
+
+  void set_threads_per_core_limit(int64 value) {
     device_description_->threads_per_core_limit_ = value;
   }
-  void set_threads_per_block_limit(uint64 value) {
+  void set_threads_per_block_limit(int64 value) {
     device_description_->threads_per_block_limit_ = value;
   }
-  void set_threads_per_warp(uint64 value) {
+  void set_threads_per_warp(int64 value) {
     device_description_->threads_per_warp_ = value;
   }
 
-  void set_registers_per_core_limit(uint64 value) {
+  void set_registers_per_core_limit(int64 value) {
     device_description_->registers_per_core_limit_ = value;
   }
-  void set_registers_per_block_limit(uint64 value) {
+  void set_registers_per_block_limit(int64 value) {
     device_description_->registers_per_block_limit_ = value;
   }
 
-  void set_device_address_bits(uint64 value) {
+  void set_device_address_bits(int64 value) {
     device_description_->device_address_bits_ = value;
   }
-  void set_device_memory_size(uint64 value) {
+  void set_device_memory_size(int64 value) {
     device_description_->device_memory_size_ = value;
   }
-  void set_memory_bandwidth(uint64 value) {
+  void set_memory_bandwidth(int64 value) {
     device_description_->memory_bandwidth_ = value;
   }
 
@@ -280,6 +298,10 @@ class DeviceDescriptionBuilder {
     device_description_->cuda_compute_capability_minor_ = minor;
   }
 
+  void set_rocm_amdgpu_isa_version(int version) {
+    device_description_->rocm_amdgpu_isa_version_ = version;
+  }
+
   void set_numa_node(int value) { device_description_->numa_node_ = value; }
   void set_core_count(int value) { device_description_->core_count_ = value; }
   void set_ecc_enabled(bool value) {
@@ -311,15 +333,15 @@ bool ThreadDimOk(const DeviceDescription &device_description,
 
 // Equivalent to ceil(double(element_count) / threads_per_block).
 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.")
-uint64 DivideCeil(uint64 x, uint64 y);
+int64 DivideCeil(int64 x, int64 y);
 
 // Calculate the number of threads/blocks required to process element_count
 // elements. Note that you can still end up with more threads than
 // element_count due to rounding, so kernels often start with an "is this
 // thread id in the element_count range?" test.
 void CalculateDimensionality(const DeviceDescription &device_description,
-                             uint64 element_count, uint64 *threads_per_block,
-                             uint64 *block_count);
+                             int64 element_count, int64 *threads_per_block,
+                             int64 *block_count);
 
 }  // namespace stream_executor
 
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index d91afaa638b2686ef6e39dc06ad61d3b31d377a8..fcc3db928b1daaca33bef2e518aa6a4c1d8e5373 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -581,5 +581,15 @@ string NormalizeDescriptor::ToShortString() const {
                       "_size:", segment_size_);
 }
 
+bool DnnSupport::IsStatusOk(const port::Status& status, bool report_error) {
+  if (status.ok()) {
+    return true;
+  }
+  if (report_error) {
+    LOG(ERROR) << status.error_message();
+  }
+  return false;
+}
+
 }  // namespace dnn
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index f60d1ada241fcbcd75d46ccef13781b77a489bda..37879f46be5bbf233ea8a88514e2b788f0143c96 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1237,45 +1237,34 @@ class DnnSupport {
   //   that if the inverse of the filter is applied to the output in VALID mode
   //   the result is the same size as the input - this requires even more
   //   padding of the input.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
+  virtual port::Status DoConvolve(
+      ConvolutionKind kind, DataType element_type, Stream* stream,
+      const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
+      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
-  // Enqueues a double-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      dnn::ProfileResult* output_profile_result) = 0;
-
-  // Enqueues a half-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half>* output_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
+  template <typename ElementType>
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+                  const DeviceMemory<ElementType>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<ElementType>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<ElementType>* output_data,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
+                  ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
+                   stream, input_descriptor, input_data, filter_descriptor,
+                   filter_data, output_descriptor, *output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the forward convolution pass.
   // cc_major and cc_minor are the compute capabilities of the device.
@@ -1348,17 +1337,27 @@ class DnnSupport {
   //    backprop of the input.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardData(
+  template <typename ElementType>
+  bool DoConvolveBackwardData(
       Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
+      const DeviceMemory<ElementType>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
+      const DeviceMemory<ElementType>& backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
+      DeviceMemory<ElementType>* backward_input_data,
       const dnn::AlgorithmDesc& algorithm_desc,
       DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_DATA,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   *backward_input_data, filter_descriptor, filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // data.
@@ -1366,30 +1365,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for filter) operation
   // onto the stream.
   //
@@ -1409,17 +1384,27 @@ class DnnSupport {
   //    backprop of the filter.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardFilter(
+  template <typename ElementType>
+  bool DoConvolveBackwardFilter(
       Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
+      const DeviceMemory<ElementType>& input_data,
       const BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
+      const DeviceMemory<ElementType>& backward_output_data,
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
+      DeviceMemory<ElementType>* backward_filter_data,
       const dnn::AlgorithmDesc& algorithm_desc,
       DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_FILTER,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   input_data, filter_descriptor, *backward_filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // filters.
@@ -1427,30 +1412,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      const dnn::AlgorithmDesc& algorithm_desc,
-      DeviceMemory<uint8>* scratch_memory,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for bias) operation onto
   // the stream.
   //
@@ -1687,21 +1648,9 @@ class DnnSupport {
     return false;
   }
 
-  // Applies local response normalization to the values from
-  // input_data and writes the result to output_data. See comments on
-  // NormalizeDescriptor for a description of local response
-  // normalization.
-  virtual bool DoNormalize(Stream* stream,
-                           const dnn::NormalizeDescriptor& normalize_descriptor,
-                           const DeviceMemory<float>& input_data,
-                           DeviceMemory<float>* output_data) = 0;
-
   // Applies local response normalization to the values from input_data and
   // writes the result to output_data.
   //
-  // Similar to DoNormalize, but normalizes across feature maps and allows for
-  // specifying the dimensions of the tensor.
-  //
   // See comments on NormalizeDescriptor for a description of local response
   // normalization.
   virtual bool DoNormalizeWithDimensions(
@@ -2121,7 +2070,7 @@ class DnnSupport {
   createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size,
                                     const absl::Span<const int>& seq_lengths,
-                                    dnn::DataType data_type) {
+                                    bool time_major, dnn::DataType data_type) {
     return port::Status(port::error::UNIMPLEMENTED,
                         "createRnnSequenceTensorDescriptor is unimplemented");
   }
@@ -2373,6 +2322,224 @@ class DnnSupport {
     return false;
   }
 
+  // Enqueues a fused convolution+bias+activation operation onto the stream.
+  //
+  // Arguments (all borrowed):
+  //
+  //  stream: borrowed pointer to the stream that the 'fusion' operation should
+  //  be enqueued onto.
+  //
+  //  conv_input_descriptor: dimensions of the convolution input layer.
+  //  conv_input_data: device memory which contains the convolution input.
+  //
+  //  filter_descriptor: dimensions of the convolution filter.
+  //  filter_data: device memory which contains the convolution filter weights.
+  //
+  //  convolution_descriptor: stride of the convolution filter.
+  //
+  //  bias_descriptor: dimensions of the bias layer
+  //  biases: device memory region containing biases to add to the convolution
+  //  output
+  //
+  //  activation_mode: Type of activation to perform.
+  //
+  //  output_descriptor: dimensions of the output layer.
+  //  output_data: device memory region in which to place the fusion result.
+  //
+  //  output_profile_result: the output profile result for this call.
+  //         The profiling is only enabled when this is not nullptr.
+  //
+  virtual bool DoFusedConvolutionBiasActivation(
+      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<float>& conv_input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  // Enqueues a fused batchnorm+activation (inference) operation onto the
+  // stream.
+  //
+  // Arguments (all borrowed):
+  //
+  //  stream: borrowed pointer to the stream that the 'fusion' operation should
+  //  be enqueued onto.
+  //
+  //  x_descriptor: dimensions of the batchnorm input layer.
+  //  x_data: device memory which contains the batchnorm input.
+  //
+  //  scale_offset_mean_variance_descriptor:
+  //      dimensions of the scale/offset/mean/variance tensor.
+  //  scale_data: device memory which contains the scale input.
+  //  offset_data: device memory which contains the offset input.
+  //  mean_data: device memory which contains the mean input.
+  //  variance_data: device memory which contains the variance input.
+  //  epsilon : the epsilon value to use in batchnorm calculation
+  //
+  //  activation_mode: Type of activation to perform.
+  //
+  //  y_data: device memory region in which to place the fusion result.
+  //
+  //  output_profile_result: the output profile result for this call.
+  //         The profiling is only enabled when this is not nullptr.
+  //
+  virtual bool DoFusedBatchNormActivationInference(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<float>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& mean_data,
+      const DeviceMemory<float>& variance_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  virtual bool DoFusedBatchNormActivationInference(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<Eigen::half>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& mean_data,
+      const DeviceMemory<float>& variance_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  // Enqueues a fused batchnorm+activation (training-fwd) operation onto the
+  // stream.
+  //
+  // Arguments (all borrowed):
+  //
+  //  stream: borrowed pointer to the stream that the 'fusion' operation should
+  //  be enqueued onto.
+  //
+  //  x_descriptor: dimensions of the batchnorm input layer.
+  //  x_data: device memory which contains the batchnorm input.
+  //
+  //  scale_offset_mean_variance_descriptor:
+  //      dimensions of the scale/offset/mean/variance tensor.
+  //  scale_data: device memory which contains the scale input.
+  //  offset_data: device memory which contains the offset input.
+  //  epsilon : the epsilon value to use in batchnorm calculation
+  //
+  //  activation_mode: Type of activation to perform.
+  //
+  //  y_data: device memory region in which to place the fusion result.
+  //  batch_mean_data: device memory in which to place the batch mean output.
+  //  batch_var_data: device memory in which to place the batch variance output.
+  //  saved_mean_data: device memory in which to save the mean for bwd pass.
+  //  saved_var_data: device memory in which to save the variance for bwd pass.
+  //
+  //  output_profile_result: the output profile result for this call.
+  //         The profiling is only enabled when this is not nullptr.
+  //
+  virtual bool DoFusedBatchNormActivationForward(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<float>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  virtual bool DoFusedBatchNormActivationForward(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<Eigen::half>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
+  // stream.
+  //
+  // Arguments (all borrowed):
+  //
+  //  stream: borrowed pointer to the stream that the 'fusion' operation should
+  //  be enqueued onto.
+  //
+  //  y_act_backprop_descriptor: dimensions of the backprop input from the
+  //  previous layer. y_act_backprop_data: device memory which contains the
+  //  backprop input.
+  //
+  //  y_act_data: device memory which contains the actv-fwd output data.
+  //
+  //  activation_mode: actv-fwd type.
+  //
+  //  scale_offset_mean_variance_descriptor:
+  //      dimensions of the scale/offset/mean/variance tensor.
+  //  scale_data: device memory which contains the scale input.
+  //  offset_data: device memory which contains the offset input.
+  //  saved_mean_data: device memory which contains the saved mean from fwd
+  //  pass. saved_var_data: device memory which contains the saved variance from
+  //  fwd pass.
+  //
+  //  x_bn_backprop_data: device memory region in which to place the backprop
+  //  data from this layer scale_backprop_data: device memory in which to place
+  //  the scale backprop output. offset_backprop_data: device memory in which to
+  //  place the offset backprop output.
+  //
+  //  output_profile_result: the output profile result for this call.
+  //         The profiling is only enabled when this is not nullptr.
+  //
+  virtual bool DoFusedBatchNormActivationBackward(
+      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+      const DeviceMemory<float>& y_act_backprop_data,
+      const DeviceMemory<float>& y_act_data,
+      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& saved_mean_data,
+      const DeviceMemory<float>& saved_var_data,
+      DeviceMemory<float>* x_bn_backprop_data,
+      DeviceMemory<float>* scale_backprop_data,
+      DeviceMemory<float>* offset_backprop_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+  virtual bool DoFusedBatchNormActivationBackward(
+      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+      const DeviceMemory<Eigen::half>& y_act_backprop_data,
+      const DeviceMemory<Eigen::half>& y_act_data,
+      dnn::ActivationMode activation_mode,
+      const DeviceMemory<Eigen::half>& x_bn_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& saved_mean_data,
+      const DeviceMemory<float>& saved_var_data,
+      DeviceMemory<Eigen::half>* x_bn_backprop_data,
+      DeviceMemory<float>* scale_backprop_data,
+      DeviceMemory<float>* offset_backprop_data,
+      dnn::ProfileResult* output_profile_result) {
+    return false;
+  }
+
+ protected:
+  // Returns whether status is 'ok', and potentially logs the error.
+  static bool IsStatusOk(const port::Status& status, bool report_error);
+
  private:
   virtual port::Status DoPrepareForConvolution(
       ConvolutionKind kind, DataType element_type, Stream* stream,
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index 11fb5d0f6a02a32fd3c958133136b078ac848ac3..817cb59662a72bae2c12cf41fc5e0cf21228d733 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -108,22 +108,3 @@ message ConvolutionDescriptorProto {
   int32 group_count = 5;
   ConvolutionMode convolution_mode = 6;
 }
-
-// A convolution. Currently it's only used for logging. In the future, we may
-// want to use it in the API as well.
-message ConvolutionProto {
-  ConvolutionKind kind = 1;
-  TensorDescriptorProto input = 2;
-  TensorDescriptorProto filter = 3;
-  TensorDescriptorProto output = 4;
-  AlgorithmProto algorithm = 5;
-  ConvolutionDescriptorProto conv_desc = 6;
-
-  // result = conv_scale * conv(...) + side_value_scale * side_value.
-  // side_value is an arbitrary buffer if activation is not none. Otherwise, it
-  // has to be the result buffer (using its old values).
-  double conv_scale = 7;
-  double side_value_scale = 8;
-
-  ActivationMode activation = 9;
-}
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e6812389e3eafc40365861722be3edb414dd05c7
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -0,0 +1,189 @@
+# Description:
+#   GPU-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+
+package(
+    default_visibility = ["//tensorflow/stream_executor:__subpackages__"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "gpu_activation_header",
+    hdrs = ["gpu_activation.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "gpu_activation",
+    srcs = if_gpu_is_configured(["gpu_activation.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_activation.h"]),
+    deps = if_gpu_is_configured([
+        ":gpu_activation_header",
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "gpu_diagnostics_header",
+    hdrs = if_gpu_is_configured(["gpu_diagnostics.h"]),
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_driver_header",
+    hdrs = if_gpu_is_configured(["gpu_driver.h"]),
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+cc_library(
+    name = "gpu_event_header",
+    hdrs = if_gpu_is_configured(["gpu_event.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_stream_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_event",
+    srcs = if_gpu_is_configured(["gpu_event.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_event.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_executor_header",
+    hdrs = if_gpu_is_configured(["gpu_executor.h"]),
+    deps = [
+        ":gpu_kernel_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "gpu_helpers_header",
+    hdrs = if_gpu_is_configured(["gpu_helpers.h"]),
+    deps = [":gpu_types_header"],
+)
+
+cc_library(
+    name = "gpu_kernel_header",
+    hdrs = if_gpu_is_configured(["gpu_kernel.h"]),
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_rng_header",
+    hdrs = if_gpu_is_configured(["gpu_rng.h"]),
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream_header",
+    hdrs = if_gpu_is_configured(["gpu_stream.h"]),
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream",
+    srcs = if_gpu_is_configured(["gpu_stream.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_stream.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer_header",
+    hdrs = if_gpu_is_configured(["gpu_timer.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer",
+    srcs = if_gpu_is_configured(["gpu_timer.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_timer.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_types_header",
+    hdrs = if_gpu_is_configured(["gpu_types.h"]),
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]) + if_rocm_is_configured([
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/gpu/gpu_activation.cc
similarity index 62%
rename from tensorflow/stream_executor/cuda/cuda_activation.cc
rename to tensorflow/stream_executor/gpu/gpu_activation.cc
index 02371c3c3ab403e9b3303fbbafdef18c30196f4f..6f74eef2dbc106c14f04736418f3e42adb68f0b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/gpu/gpu_activation.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
+GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    CUDAExecutor *cuda_exec):
-      driver_scoped_activate_context_(
-          new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
+    GpuExecutor* gpu_exec)
+    : driver_scoped_activate_context_(
+          new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    StreamExecutor *stream_exec)
-    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
+    StreamExecutor* stream_exec)
+    : ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}
 
 ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
-  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
+  delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
 }
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    ScopedActivateExecutorContext &&other)
+    ScopedActivateExecutorContext&& other)
     : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
   other.driver_scoped_activate_context_ = nullptr;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_activation.h b/tensorflow/stream_executor/gpu/gpu_activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..3409304d7796bfac92295b2eecc10e2f9487c018
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_activation.h
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file contains APIs that assume a StreamExecutor is backed by CUDA.
+// It reaches into the CUDA implementation to activate an underlying CUDA
+// context.
+//
+// Having this file separate from gpu/gpu_executor.h means that dependent
+// code does not also have to depend on cuda.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+class StreamExecutor;
+
+namespace gpu {
+
+class GpuExecutor;
+class ScopedActivateContext;
+
+// Activates a CUDA context within an enclosing scope.
+class ScopedActivateExecutorContext {
+ public:
+  // Form that takes a CUDA executor implementation.
+  explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);
+
+  // Form that takes a pImpl executor and extracts a CUDA implementation --
+  // fatal failure if it is not CUDA inside.
+  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
+
+  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
+
+  ~ScopedActivateExecutorContext();
+
+ private:
+  // The cuda.h-using datatype that we wrap.
+  ScopedActivateContext* driver_scoped_activate_context_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_diagnostics.h b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..71642109b57fd9b4e0a0a3dbc4efee7991bb6f03
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+
+#include <tuple>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = std::tuple<int, int, int>;
+
+// FIXME: These functions are in stream_executor::cuda namespaces for now
+// Will move to stream_executor::gpu namespace in the near future
+//
+//// Converts a parsed driver version to string form.
+// string DriverVersionToString(DriverVersion version);
+//
+//// Converts a parsed driver version or status value to natural string form.
+// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+//
+//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+class Diagnostician {
+ public:
+  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
+  // not initializing).
+  //
+  // Note: if we're running on a machine that has no GPUs, we don't want to
+  // produce very much log spew beyond saying, "looks like there's no CUDA
+  // kernel
+  // module running".
+  //
+  // Note: we use non-Google-File:: API here because we may be called before
+  // InitGoogle has completed.
+  static void LogDiagnosticInformation();
+
+  // Given the driver version file contents, finds the kernel module version and
+  // returns it as a string.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
+      const string& driver_version_file_contents);
+
+  // Extracts the kernel driver version from the current host.
+  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
+
+  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+  // driver-interfacing DSO version number. Returns it as a string.
+  static port::StatusOr<DriverVersion> FindDsoVersion();
+
+  // Logs information about the kernel driver version and userspace driver
+  // library version.
+  static void LogDriverVersionInformation();
+
+ private:
+  // Given the DSO version number and the driver version file contents, extracts
+  // the driver version and compares, warning the user in the case of
+  // incompatibility.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static void WarnOnDsoKernelMismatch(
+      port::StatusOr<DriverVersion> dso_version,
+      port::StatusOr<DriverVersion> kernel_version);
+
+  // Logs information about the dev nodes present on this machine: their
+  // existence, permissions, accessibility from this uid/gid.
+  static void LogDevNodeDiagnosticInformation();
+
+  static string GetDevNodePath(int dev_node_ordinal);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5ef48db4704015c51fb1d0e203e541b6b79afc5
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -0,0 +1,525 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// CUDA userspace driver library wrapper functionality.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+
+#include <stddef.h>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/device_options.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Identifies the memory space where an allocation resides. See
+// GpuDriver::GetPointerMemorySpace().
+enum class MemorySpace { kHost, kDevice };
+
+// Returns a casual string, such as "host" for the provided memory space.
+string MemorySpaceString(MemorySpace memory_space);
+
+class GpuContext;
+
+// GpuDriver contains wrappers for calls to the userspace library driver. It's
+// useful to isolate these calls and put basic wrappers around them to separate
+// userspace library driver behaviors from the rest of the program.
+//
+// At the moment it's simply used as a namespace.
+//
+// The calls log any specific errors internally and return whether the operation
+// was successful to the caller.
+//
+// The order of parameters is generally kept symmetric with the underlying CUDA
+// driver API.
+//
+// Links on functions are to specific documentation under
+// http://docs.nvidia.com/cuda/cuda-driver-api/
+//
+// Thread safety: these functions should not be used from signal handlers.
+class GpuDriver {
+ public:
+  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
+  // the case of failure. Safe to call multiple times; will be fast on all calls
+  // after the first.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
+  static port::Status Init();
+
+  // Returns the device associated with the given context.
+  // device is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
+  static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
+
+  // Creates a new CUDA stream associated with the given context via
+  // cuStreamCreate.
+  // stream is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
+  static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // Destroys a CUDA stream associated with the given context.
+  // stream is owned by the caller, must not be null, and *stream is set to null
+  // if the stream is successfully destroyed.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
+  static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // CUDA events can explicitly disable event TSC retrieval for some presumed
+  // performance improvement if timing is unnecessary.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  enum class EventFlags { kDefault, kDisableTiming };
+
+  // Creates a new event associated with the given context.
+  // result is an outparam owned by the caller and must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
+                                  EventFlags flags);
+
+  // Destroys *event and turns it into a nullptr. event may not be null, but
+  // *event may be, via cuEventDestroy
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
+  static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
+
+  // Allocates a GPU memory space of size bytes associated with the given
+  // context via cuMemAlloc.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
+  static void* DeviceAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a GPU memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  static void DeviceDeallocate(GpuContext* context, void* location);
+
+  // Allocates a unified memory space of size bytes associated with the given
+  // context via cuMemAllocManaged.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+  // (supported on CUDA only)
+  static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a unified memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  // (supported on CUDA only)
+  static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
+
+  // Allocates page-locked and CUDA-registered memory on the host via
+  // cuMemAllocHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
+  static void* HostAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
+  static void HostDeallocate(GpuContext* context, void* location);
+
+  // Registers a memory region at location of size bytes via cuMemHostRegister.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
+  static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
+
+  // Unregisters a memory region that was previously registered at location via
+  // cuMemHostUnregister.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
+  //
+  // TODO(leary) verify an error will be returned if the location wasn't
+  // previously registered.
+  static bool HostUnregister(GpuContext* context, void* location);
+
+  // Given a device ordinal, returns a device handle into the device outparam,
+  // which must not be null.
+  //
+  // N.B. these device handles do not have a corresponding destroy function in
+  // the CUDA driver API.
+  static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
+
+  // Given a device handle, returns the name reported by the driver for the
+  // device.
+  static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
+
+  // Given a device to create a context for, returns a context handle into the
+  // context outparam, which must not be null.
+  //
+  // N.B. CUDA contexts are weird. They are implicitly associated with the
+  // calling thread. Current documentation on contexts and their influence on
+  // userspace processes is given here:
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
+  static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
+                                    const DeviceOptions& device_options,
+                                    GpuContext** context);
+
+  // Destroys the provided context via cuCtxDestroy.
+  // Don't do this while clients could still be using the context, per the docs
+  // bad things will happen.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
+  static void DestroyContext(GpuContext* context);
+
+  // Queries the runtime for the specified attribute of the specified function.
+  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
+  // in terms of integer-sized values, so there's no potential for overrun (as
+  // of CUDA 5.5).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
+  static bool FuncGetAttribute(GpuFunctionAttribute attribute,
+                               GpuFunctionHandle function,
+                               int* attribute_value);
+
+  // Sets the preferred cache configuration for the specified function.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
+  static bool FuncSetCacheConfig(GpuFunctionHandle function,
+                                 GpuFuncCachePreference cache_config);
+
+  // Gets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
+  static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
+      GpuContext* context);
+
+  // Sets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
+  static port::Status ContextSetSharedMemConfig(
+      GpuContext* context, GpuSharedMemConfig shared_mem_config);
+
+  // Launches a CUDA kernel via cuLaunchKernel.
+  // TODO(leary) describe the structure of kernel_params and extra in a readable
+  // way.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
+  static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
+                           unsigned int grid_dim_x, unsigned int grid_dim_y,
+                           unsigned int grid_dim_z, unsigned int block_dim_x,
+                           unsigned int block_dim_y, unsigned int block_dim_z,
+                           unsigned int shared_mem_bytes,
+                           GpuStreamHandle stream, void** kernel_params,
+                           void** extra);
+
+  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
+  // handle in "module". Any error logs that are produced are logged internally.
+  // (supported on CUDA only)
+  static bool LoadPtx(GpuContext* context, const char* ptx_contents,
+                      GpuModuleHandle* module);
+
+  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
+  // the resulting handle in "module".
+  // (supported on CUDA only)
+  static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
+                                GpuModuleHandle* module);
+
+  // Loads HSACO with the ROCM runtime and stores the resulting handle in
+  // "module". Any error logs that are produced are logged internally.
+  // (supported on ROCm only)
+  static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
+                        GpuModuleHandle* module);
+
+  // Retrieves a named kernel from a loaded module, and places the resulting
+  // handle into function (outparam) on success. Neither kernel_name nor
+  // function may be null. No ownership is taken of kernel_name.
+  static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
+                                const char* kernel_name,
+                                GpuFunctionHandle* function);
+
+  // Retrieves a named global/constant symbol from a loaded module, and returns
+  // a device pointer and size of the symbol on success. symbol_name may not be
+  // null. At least one of dptr or bytes should not be null. No ownership is
+  // taken of symbol_name.
+  static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
+                              const char* symbol_name, GpuDevicePtr* dptr,
+                              size_t* bytes);
+
+  // Unloads module from the current context via cuModuleUnload.
+  // TODO(leary) the documentation doesn't say what kind of disasters happen
+  // if you try to unload a module while its GpuFunctionHandles are in use.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
+  static void UnloadModule(GpuContext* context, GpuModuleHandle module);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
+  static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
+                                     uint8 value, size_t size);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
+  static bool SynchronousMemsetUint32(GpuContext* context,
+                                      GpuDevicePtr location, uint32 value,
+                                      size_t uint32_count);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD8Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
+  static bool AsynchronousMemsetUint8(GpuContext* context,
+                                      GpuDevicePtr location, uint8 value,
+                                      size_t uint32_count,
+                                      GpuStreamHandle stream);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD32Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
+  static bool AsynchronousMemsetUint32(GpuContext* context,
+                                       GpuDevicePtr location, uint32 value,
+                                       size_t uint32_count,
+                                       GpuStreamHandle stream);
+
+  // -- Synchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
+
+  static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+  static port::Status SynchronousMemcpyH2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           const void* host_src, uint64 size);
+  static port::Status SynchronousMemcpyD2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+
+  // -- Asynchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
+
+  static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    const void* host_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+
+  // The CUDA stream callback type signature.
+  // The data passed to AddStreamCallback is subsequently passed to this
+  // callback when it fires.
+  //
+  // Some notable things:
+  // * Callbacks must not make any CUDA API calls.
+  // * Callbacks from independent streams execute in an undefined order and may
+  //   be serialized.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
+  typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
+                                 void* data);
+
+  // Enqueues a callback operation into stream.
+  // See StreamCallback above and the NVIDIA documentation for additional
+  // details.
+  static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
+                                StreamCallback callback, void* data);
+
+  // Causes stream to wait for event to trigger before proceeding via
+  // cuStreamWaitEvent.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
+  static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
+                                GpuEventHandle event);
+
+  // Blocks the calling thread until the operations enqueued onto stream have
+  // been completed, via cuStreamSynchronize.
+  //
+  // TODO(leary) if a pathological thread enqueues operations onto the stream
+  // while another thread blocks like this, can you wind up waiting an unbounded
+  // amount of time?
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
+  static port::Status SynchronizeStream(GpuContext* context,
+                                        GpuStreamHandle stream);
+
+  // Blocks the calling thread until the operations associated with the context
+  // have been completed, via cuCtxSynchronize.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
+  static bool SynchronizeContext(GpuContext* context);
+
+  // Returns true if all stream tasks have completed at time of the call. Note
+  // the potential for races around this call (if another thread adds work to
+  // the stream immediately after this returns).
+  static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
+
+  // Returns whether code in the from context can access memory in the to
+  // context via cuDeviceCanAccessPeer.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
+  static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Returns the elapsed milliseconds between start and stop via
+  // cuEventElapsedTime.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
+  static bool GetEventElapsedTime(GpuContext* context,
+                                  float* elapsed_milliseconds,
+                                  GpuEventHandle start, GpuEventHandle stop);
+
+  // Records that an event occurred when execution reaches the current point in
+  // thestream via cuEventRecord.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
+  static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
+                                  GpuStreamHandle stream);
+
+  // Polls (without blocking) to determine the status of an event - pending or
+  // complete (or an error status).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
+  static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
+                                              GpuEventHandle event);
+
+  // -- Pointer-specific calls.
+
+  // Returns the context in which pointer was allocated or registered.
+  static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
+
+  // Returns the device associated with the context from GetPointerContext().
+  static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
+
+  // Returns the memory space addressed by pointer.
+  static port::StatusOr<MemorySpace> GetPointerMemorySpace(
+      GpuDevicePtr pointer);
+
+  // Returns the base address and size of the device pointer dptr.
+  static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
+                                             GpuDevicePtr* base, size_t* size);
+
+  // -- Device-specific calls.
+
+  // Returns the compute capability for the device; i.e (3, 5).
+  // This is currently done via the deprecated device API.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
+  // (supported on CUDA only)
+  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
+                                           GpuDeviceHandle device);
+
+  // Returns Gpu ISA version for the device; i.e 803, 900.
+  // (supported on ROCm only)
+  static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
+
+  // Returns the number of multiprocessors on the device (note that the device
+  // may be multi-GPU-per-board).
+  static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
+
+  // Returns the limit on number of threads that can be resident in a single
+  // multiprocessor.
+  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
+      GpuDeviceHandle device);
+
+  // Returns the limit on number of threads which may be resident for a single
+  // block (cooperative thread array).
+  static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available on a single GPU core (i.e.
+  // SM on NVIDIA devices).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
+      GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available for a single block
+  // (cooperative thread array).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
+      GpuDeviceHandle device);
+
+  // Returns the maximum supported number of registers per block.
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  //
+  // This call is deprecated in the NVIDIA driver API; its replacement is
+  // GetDeviceAttribute
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
+                                  int device_ordinal);
+
+  // Gets a specific integer-valued property about the given device.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
+                                                GpuDeviceHandle device);
+
+  // Returns whether ECC is enabled for the given GpuDeviceHandle via
+  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
+                                  int64* total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(GpuDeviceHandle device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA device via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int* driver_version);
+
+  // -- Other calls
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/GpuFunctionHandle when launched with the specified
+  // parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
+class ScopedActivateContext {
+ public:
+  // Activates the context via cuCtxSetCurrent, if it is not the currently
+  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
+  // mechanism is said by NVIDIA to be relatively slow and deprecated.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
+  explicit ScopedActivateContext(GpuContext* context);
+
+  // Checks that the context has remained activated for the duration of the
+  // scope.
+  ~ScopedActivateContext();
+
+ private:
+  GpuContext* to_restore_ = nullptr;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_event.cc b/tensorflow/stream_executor/gpu/gpu_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a523958550d10c13624b729076a3fd271e68243a
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+GpuEvent::GpuEvent(GpuExecutor* parent)
+    : parent_(parent), gpu_event_(nullptr) {}
+
+GpuEvent::~GpuEvent() {}
+
+port::Status GpuEvent::Init() {
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
+                                GpuDriver::EventFlags::kDisableTiming);
+}
+
+port::Status GpuEvent::Destroy() {
+  return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
+}
+
+port::Status GpuEvent::Record(GpuStream* stream) {
+  return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
+                                stream->gpu_stream());
+}
+
+GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_event.h b/tensorflow/stream_executor/gpu/gpu_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..61f39d42fe7344b3b092b8fbcc5615da99564300
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
+// interface.
+class GpuEvent : public internal::EventInterface {
+ public:
+  explicit GpuEvent(GpuExecutor* parent);
+
+  ~GpuEvent() override;
+
+  // Populates the CUDA-platform-specific elements of this object.
+  port::Status Init();
+
+  // Deallocates any platform-specific elements of this object. This is broken
+  // out (not part of the destructor) to allow for error reporting.
+  port::Status Destroy();
+
+  // Inserts the event at the current position into the specified stream.
+  port::Status Record(GpuStream* stream);
+
+  // Polls the CUDA platform for the event's current status.
+  Event::Status PollForStatus();
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event();
+
+ private:
+  // The Executor used to which this object and GpuEventHandle are bound.
+  GpuExecutor* parent_;
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event_;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_executor.h b/tensorflow/stream_executor/gpu/gpu_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f969a98d2f42b5be0f6d29e8e19c006540e3b8b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_executor.h
@@ -0,0 +1,347 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// CUDA-platform implementation of the platform-agnostic
+// StreamExecutorInferface.
+class GpuExecutor : public internal::StreamExecutorInterface {
+ public:
+  // sub_platform indicates the subplatform used in this executor; it must
+  // be a CUDA type.
+  explicit GpuExecutor(const PluginConfig& plugin_config)
+      : device_(0),
+        context_(nullptr),
+        device_ordinal_(0),
+        cc_major_(0),
+        cc_minor_(0),
+        version_(0),
+        plugin_config_(plugin_config) {}
+
+  // See the corresponding StreamExecutor methods for method comments on the
+  // following overrides.
+
+  ~GpuExecutor() override;
+
+  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
+
+  bool GetKernel(const MultiKernelLoaderSpec& spec,
+                 KernelBase* kernel) override;
+  // (supported on CUDA only)
+  void UnloadKernel(const KernelBase* kernel) override;
+  bool LoadModule(const MultiModuleLoaderSpec& spec,
+                  ModuleHandle* module_handle) override;
+  bool UnloadModule(ModuleHandle module_handle) override;
+
+  bool Launch(Stream* stream, const ThreadDim& thread_dims,
+              const BlockDim& block_dims, const KernelBase& k,
+              const KernelArgsArrayBase& args) override;
+
+  // (supported on CUDA only)
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  // (supported on CUDA only)
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  void* Allocate(uint64 size) override;
+
+  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                          uint64 size_bytes) override;
+
+  void Deallocate(DeviceMemoryBase* mem) override;
+
+  void* UnifiedMemoryAllocate(uint64 size) override {
+    return GpuDriver::UnifiedMemoryAllocate(context_, size);
+  }
+
+  void UnifiedMemoryDeallocate(void* location) override {
+    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
+  }
+
+  // CUDA allocation/registration functions are necessary because the driver
+  // internally sets up buffers for DMA operations (and page locks them).
+  // There's no external interface for us to otherwise control these DMA
+  // settings.
+  void* HostMemoryAllocate(uint64 size) override {
+    return GpuDriver::HostAllocate(context_, size);
+  }
+
+  void HostMemoryDeallocate(void* location) override {
+    return GpuDriver::HostDeallocate(context_, location);
+  }
+
+  bool HostMemoryRegister(void* location, uint64 size) override;
+
+  bool HostMemoryUnregister(void* location) override;
+
+  bool SynchronizeAllActivity() override;
+
+  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
+
+  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
+                         uint64 size) override;
+
+  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                 const void* host_src, uint64 size) override;
+
+  port::Status SynchronousMemcpy(void* host_dst,
+                                 const DeviceMemoryBase& gpu_src,
+                                 uint64 size) override;
+
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
+                                               const DeviceMemoryBase& gpu_src,
+                                               uint64 size) override;
+
+  bool MemZero(Stream* stream, DeviceMemoryBase* location,
+               uint64 size) override;
+  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
+              uint64 size) override;
+  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
+                uint64 size) override;
+
+  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
+              uint64 size) override;
+
+  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
+              uint64 size) override;
+
+  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
+                            const DeviceMemoryBase& gpu_src,
+                            uint64 size) override;
+
+  bool HostCallback(Stream* stream,
+                    std::function<port::Status()> callback) override;
+
+  bool AllocateStream(Stream* stream) override;
+
+  void DeallocateStream(Stream* stream) override;
+
+  bool CreateStreamDependency(Stream* dependent, Stream* other) override;
+
+  bool AllocateTimer(Timer* timer) override;
+
+  void DeallocateTimer(Timer* timer) override;
+
+  bool StartTimer(Stream* stream, Timer* timer) override;
+
+  bool StopTimer(Stream* stream, Timer* timer) override;
+
+  port::Status AllocateEvent(Event* event) override;
+
+  port::Status DeallocateEvent(Event* event) override;
+
+  port::Status RecordEvent(Stream* stream, Event* event) override;
+
+  port::Status WaitForEvent(Stream* stream, Event* event) override;
+
+  Event::Status PollForEventStatus(Event* event) override;
+
+  port::Status BlockHostUntilDone(Stream* stream) override;
+
+  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
+
+  bool DeviceMemoryUsage(int64* free, int64* total) const override;
+
+  // Search for the symbol and returns a device pointer and size.
+  // Returns false if symbol does not exist.
+  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
+                 void** mem, size_t* bytes) override;
+
+  DeviceDescription* PopulateDeviceDescription() const override;
+
+  // Populates the block_dim_limit by querying the device driver API. If an
+  // error occurs at any point while asking the driver for block dim limits, it
+  // will be only partially populated as a result, and an error will be logged.
+  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
+
+  bool SupportsBlas() const override;
+
+  blas::BlasSupport* CreateBlas() override;
+
+  bool SupportsFft() const override;
+
+  fft::FftSupport* CreateFft() override;
+
+  bool SupportsRng() const override;
+
+  rng::RngSupport* CreateRng() override;
+
+  bool SupportsDnn() const override;
+
+  dnn::DnnSupport* CreateDnn() override;
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override;
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override;
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
+
+  void* GpuContextHack() override;
+
+  GpuContext* gpu_context();
+
+ private:
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for compute-capability-specific suffixed versions; i.e.
+  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
+  // we're on a compute capability 3.0 machine.
+  // (supported on CUDA only)
+  bool FindOnDiskForComputeCapability(absl::string_view filename,
+                                      absl::string_view canonical_suffix,
+                                      string* found_filename) const;
+
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for AMDGPU ISA-specific suffixed versions.
+  // (supported on ROCm only)
+
+  bool FindOnDiskForISAVersion(absl::string_view filename,
+                               absl::string_view canonical_suffix,
+                               string* found_filename) const;
+
+  // Host callback landing routine invoked by CUDA.
+  // data: User-provided callback provided to HostCallback() above, captured
+  //       as a std::function<void()>. Allocated/initialized inside
+  //       HostCallback() and owned and deleted by this call.
+  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
+                                   void* data);
+
+  // Collects metadata for the specified kernel.
+  bool GetKernelMetadata(GpuKernel* cuda_kernel,
+                         KernelMetadata* kernel_metadata);
+
+  // Prints to VLOG(2) information about the kernel's occupancy and how it might
+  // be improved.
+  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims);
+
+  // (supported on CUDA only)
+  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
+  // (supported on CUDA only)
+  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // (supported on ROCm only)
+  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  bool UnloadGpuBinary(const void* gpu_binary)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Guards the on-disk-module mapping.
+  mutex disk_modules_mu_;
+
+  // Mapping from filename to GPUModuleHandle, if it was already retrieved.
+  // Multiple GPUFunctionHandle are usually obtained from a single
+  // GPUModuleHandle so we attempt to hit in this mapping first, before
+  // retrieving it.
+  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
+
+  // Guards the in-memory-module mapping.
+  mutex in_memory_modules_mu_;
+
+  std::map<const char*, GpuModuleHandle> in_memory_modules_
+      GUARDED_BY(in_memory_modules_mu_);
+
+  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
+  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
+      GUARDED_BY(in_memory_modules_mu_);
+  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
+  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
+      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
+
+  // Guards the launched kernel set.
+  mutex launched_kernels_mu_;
+
+  // Keeps track of the set of launched kernels. Currently used to suppress the
+  // occupancy check on subsequent launches.
+  std::set<GpuFunctionHandle> launched_kernels_
+      GUARDED_BY(launched_kernels_mu_);
+
+  // Handle for the CUDA device being operated on. Immutable
+  // post-initialization.
+  GpuDeviceHandle device_;
+
+  // Handle for session with the library/driver. Immutable post-initialization.
+  GpuContext* context_;
+
+  // The device ordinal value that this executor was initialized with; recorded
+  // for use in getting device metadata. Immutable post-initialization.
+  int device_ordinal_;
+
+  // The major verion of the compute capability for device_.
+  int cc_major_;
+
+  // The minor verion of the compute capability for device_.
+  int cc_minor_;
+
+  // GPU ISA version for device_.
+  int version_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_helpers.h b/tensorflow/stream_executor/gpu/gpu_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..117a71718f269d8ffd724d55ae269fea95dac366
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_helpers.h
@@ -0,0 +1,107 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA
+// device memory.
+template <typename T>
+const T* GpuMemory(const DeviceMemory<T>& mem) {
+  return static_cast<const T*>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
+// pointer in CUDA device memory.
+template <typename T>
+T* GpuMemoryMutable(DeviceMemory<T>* mem) {
+  return static_cast<T*>(mem->opaque());
+}
+
+static_assert(
+    sizeof(std::complex<float>) == sizeof(GpuComplexType),
+    "std::complex<float> and GpuComplexType should have the same size");
+static_assert(offsetof(GpuComplexType, x) == 0,
+              "The real part of GpuComplexType should appear first.");
+static_assert(
+    sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
+    "std::complex<double> and GpuDoubleComplexType should have the same "
+    "size");
+static_assert(offsetof(GpuDoubleComplexType, x) == 0,
+              "The real part of GpuDoubleComplexType should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct GpuComplexT {
+  typedef T type;
+};
+
+template <>
+struct GpuComplexT<std::complex<float>> {
+  typedef GpuComplexType type;
+};
+
+template <>
+struct GpuComplexT<std::complex<double>> {
+  typedef GpuDoubleComplexType type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
+// types.
+
+template <typename T>
+inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
+  return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
+}
+
+template <typename T>
+inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
+  return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
+}
+
+// Converts values of std::complex<float/double> to values of
+// GpuComplexType/GpuDoubleComplexType.
+inline GpuComplexType GpuComplexValue(std::complex<float> val) {
+  return {val.real(), val.imag()};
+}
+
+inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
+  return {val.real(), val.imag()};
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_kernel.h b/tensorflow/stream_executor/gpu/gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b89b20097323c02fc9cf7492d54657789956ca7
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_kernel.h
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Wraps a GpuFunctionHandle to implement the platform-independent
+// KernelInterface.
+class GpuKernel : public internal::KernelInterface {
+ public:
+  GpuKernel()
+      : gpu_function_(nullptr),
+        arity_(0),
+        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
+
+  // Note that the function is unloaded when the module is unloaded, and the
+  // module that the function is contained in is owned by the GpuExecutor.
+  ~GpuKernel() override {}
+
+  // As arity cannot be reflected upon using the CUDA API, the arity is
+  // explicitly set during the GpuExecutor::GetKernel initialization process.
+  void set_arity(unsigned arity) { arity_ = arity; }
+  unsigned Arity() const override { return arity_; }
+
+  // Returns the GpuFunctionHandle value for passing to the CUDA API.
+  GpuFunctionHandle AsGpuFunctionHandle() const {
+    DCHECK(gpu_function_ != nullptr);
+    return const_cast<GpuFunctionHandle>(gpu_function_);
+  }
+
+  // Returns the slot that the GpuFunctionHandle is stored within for this
+  // object, for the CUDA API which wants to load into a GpuFunctionHandle*.
+  GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }
+
+  // CUDA supports setting the preferred cache configuration of a
+  // GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
+  // via the below functions; users can set a preference, and that is applied
+  // when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
+  // would be to load the kernel & set the preference when the user calls the
+  // setter below; either approach is valid. Sets the current kernel cache
+  // configuration preference.
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  GpuFuncCachePreference GetGpuCacheConfig() const;
+
+ private:
+  GpuFunctionHandle gpu_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;  // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
+  return static_cast<const GpuKernel*>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
+  return static_cast<GpuKernel*>(kernel->implementation());
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_rng.h b/tensorflow/stream_executor/gpu/gpu_rng.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4bf1e1963044a9a54fb92b6a324d3fadd5e6c0b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_rng.h
@@ -0,0 +1,125 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rng.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Opaque and unique identifier for the GPU RNG plugin.
+extern const PluginId kGpuRandPlugin;
+
+class GpuExecutor;
+
+// GPU-platform implementation of the random number generation support
+// interface.
+//
+// Thread-safe post-initialization.
+class GpuRng : public rng::RngSupport {
+ public:
+  explicit GpuRng(GpuExecutor* parent);
+
+  // Retrieves a gpu rng library generator handle. This is necessary for
+  // enqueuing random number generation work onto the device.
+  // TODO(leary) provide a way for users to select the RNG algorithm.
+  bool Init();
+
+  // Releases a gpu rng library generator handle, if one was acquired.
+  ~GpuRng() override;
+
+  // See rng::RngSupport for details on the following overrides.
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<float>>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<double>>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                              DeviceMemory<float>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                              DeviceMemory<double>* v) override;
+
+  bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;
+
+ private:
+  // Actually performs the work of generating random numbers - the public
+  // methods are thin wrappers to this interface.
+  template <typename T>
+  bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
+  template <typename ElemT, typename FuncT>
+  bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
+                                      DeviceMemory<ElemT>* v, FuncT func);
+
+  // Sets the stream for the internal gpu rng generator.
+  //
+  // This is a stateful operation, as the handle can only have one stream set at
+  // a given time, so it is usually performed right before enqueuing work to do
+  // with random number generation.
+  bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // mutex that guards the gpu rng library handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this GpuRng.
+  // Immutable post-initialization.
+  GpuExecutor* parent_;
+
+  // gpu rng library handle on the device.
+  GpuRngHandle rng_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
+};
+
+template <typename T>
+string TypeString();
+
+template <>
+string TypeString<float>() {
+  return "float";
+}
+
+template <>
+string TypeString<double>() {
+  return "double";
+}
+
+template <>
+string TypeString<std::complex<float>>() {
+  return "std::complex<float>";
+}
+
+template <>
+string TypeString<std::complex<double>>() {
+  return "std::complex<double>";
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_stream.cc
rename to tensorflow/stream_executor/gpu/gpu_stream.cc
index b5aa7694f7e1d8d47f3252d3ba679292155119b5..f43500370fc6a7a3e919d2c7af0a92e98100284b 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/gpu/gpu_stream.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDAStream::Init() {
-  if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
+bool GpuStream::Init() {
+  if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
     return false;
   }
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
-                                 CUDADriver::EventFlags::kDisableTiming)
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
+                                GpuDriver::EventFlags::kDisableTiming)
       .ok();
 }
 
-void CUDAStream::Destroy() {
+void GpuStream::Destroy() {
   if (completed_event_ != nullptr) {
     port::Status status =
-        CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
+        GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
     if (!status.ok()) {
       LOG(ERROR) << status.error_message();
     }
   }
 
-  CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
+  GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
 }
 
-bool CUDAStream::IsIdle() const {
-  return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
+bool GpuStream::IsIdle() const {
+  return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
 }
 
-CUDAStream *AsCUDAStream(Stream *stream) {
+GpuStream* AsGpuStream(Stream* stream) {
   DCHECK(stream != nullptr);
-  return static_cast<CUDAStream *>(stream->implementation());
+  return static_cast<GpuStream*>(stream->implementation());
 }
 
-CUstream AsCUDAStreamValue(Stream *stream) {
+GpuStreamHandle AsGpuStreamValue(Stream* stream) {
   DCHECK(stream != nullptr);
-  return AsCUDAStream(stream)->cuda_stream();
+  return AsGpuStream(stream)->gpu_stream();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..c38f6c132a571bb42b31c9649440fd0ff2aaa777
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_stream.h
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
+// StreamExecutor Stream interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+
+// Wraps a GpuStreamHandle in order to satisfy the platform-independent
+// StreamInterface.
+//
+// Thread-safe post-initialization.
+class GpuStream : public internal::StreamInterface {
+ public:
+  explicit GpuStream(GpuExecutor* parent)
+      : parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}
+
+  // Note: teardown is handled by a parent's call to DeallocateStream.
+  ~GpuStream() override {}
+
+  void* GpuStreamHack() override { return gpu_stream_; }
+  void** GpuStreamMemberHack() override {
+    return reinterpret_cast<void**>(&gpu_stream_);
+  }
+
+  // Explicitly initialize the CUDA resources associated with this stream, used
+  // by StreamExecutor::AllocateStream().
+  bool Init();
+
+  // Explicitly destroy the CUDA resources associated with this stream, used by
+  // StreamExecutor::DeallocateStream().
+  void Destroy();
+
+  // Returns true if no work is pending or executing on the stream.
+  bool IsIdle() const;
+
+  // Retrieves an event which indicates that all work enqueued into the stream
+  // has completed. Ownership of the event is not transferred to the caller, the
+  // event is owned by this stream.
+  GpuEventHandle* completed_event() { return &completed_event_; }
+
+  // Returns the GpuStreamHandle value for passing to the CUDA API.
+  //
+  // Precond: this GpuStream has been allocated (otherwise passing a nullptr
+  // into the NVIDIA library causes difficult-to-understand faults).
+  GpuStreamHandle gpu_stream() const {
+    DCHECK(gpu_stream_ != nullptr);
+    return const_cast<GpuStreamHandle>(gpu_stream_);
+  }
+
+  // TODO(timshen): Migrate away and remove this function.
+  GpuStreamHandle cuda_stream() const { return gpu_stream(); }
+
+  GpuExecutor* parent() const { return parent_; }
+
+ private:
+  GpuExecutor* parent_;         // Executor that spawned this stream.
+  GpuStreamHandle gpu_stream_;  // Wrapped CUDA stream handle.
+
+  // Event that indicates this stream has completed.
+  GpuEventHandle completed_event_ = nullptr;
+};
+
+// Helper functions to simplify extremely common flows.
+// Converts a Stream to the underlying GpuStream implementation.
+GpuStream* AsGpuStream(Stream* stream);
+
+// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
+GpuStreamHandle AsGpuStreamValue(Stream* stream);
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/gpu/gpu_timer.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_timer.cc
rename to tensorflow/stream_executor/gpu/gpu_timer.cc
index 991a12a23d632bd9fb4c97a340e244f6ffb4c7d3..cc4b50d9baa0af70410baad582d210e90bdb7b03 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/gpu/gpu_timer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDATimer::Init() {
+bool GpuTimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::CreateEvent(
-      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::CreateEvent(context, &start_event_,
+                                               GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return false;
   }
 
-  status = CUDADriver::CreateEvent(context, &stop_event_,
-                                   CUDADriver::EventFlags::kDefault);
+  status = GpuDriver::CreateEvent(context, &stop_event_,
+                                  GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
-    status = CUDADriver::DestroyEvent(context, &start_event_);
+    status = GpuDriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
@@ -48,47 +48,46 @@ bool CUDATimer::Init() {
   return true;
 }
 
-void CUDATimer::Destroy() {
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+void GpuTimer::Destroy() {
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 
-  status = CUDADriver::DestroyEvent(context, &stop_event_);
+  status = GpuDriver::DestroyEvent(context, &stop_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 }
 
-float CUDATimer::GetElapsedMilliseconds() const {
+float GpuTimer::GetElapsedMilliseconds() const {
   CHECK(start_event_ != nullptr && stop_event_ != nullptr);
   // TODO(leary) provide a way to query timer resolution?
   // CUDA docs say a resolution of about 0.5us
   float elapsed_milliseconds = NAN;
-  (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
-                                        &elapsed_milliseconds, start_event_,
-                                        stop_event_);
+  (void)GpuDriver::GetEventElapsedTime(
+      parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), start_event_, stream->cuda_stream());
+bool GpuTimer::Start(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), start_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+bool GpuTimer::Stop(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), stop_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_timer.h b/tensorflow/stream_executor/gpu/gpu_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..886f0c2d57729270b9a87635ddffd1a4be4acfdb
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_timer.h
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
+// StreamExecutor Timer interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+class GpuStream;
+
+// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
+// TimerInferface -- both a start and a stop event are present which may be
+// recorded in a stream.
+class GpuTimer : public internal::TimerInterface {
+ public:
+  explicit GpuTimer(GpuExecutor* parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown needs to be explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  // TODO(csigg): Change to RAII.
+  ~GpuTimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(GpuStream* stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(GpuStream* stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  GpuExecutor* parent_;
+  GpuEventHandle start_event_;  // Event recorded to indicate the "start"
+                                // timestamp executing in a stream.
+  GpuEventHandle stop_event_;   // Event recorded to indicate the "stop"
+                                // timestamp executing in a stream.
+};
+
+struct GpuTimerDeleter {
+  void operator()(GpuTimer* t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_types.h b/tensorflow/stream_executor/gpu/gpu_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..64a6e5e5efc6ed5070c9220e27a0b2296a2ae3cb
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_types.h
@@ -0,0 +1,86 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// GPU (ROCm / CUDA) specific type handle resolution
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+
+#if TENSORFLOW_USE_ROCM
+
+#define __HIP_DISABLE_CPP_FUNCTIONS__
+
+#include "rocm/include/hip/hip_complex.h"
+#include "rocm/include/hip/hip_runtime.h"
+#include "rocm/include/hiprand/hiprand.h"
+
+#else  // CUDA
+
+#include "cuda/include/cuComplex.h"
+#include "cuda/include/cuda.h"
+
+// cannot include curand.h here
+//   because it triggers the #error in cuda/cuda_gpu_executor.cc
+//     (because curand.h includes cuda_runtime.h)
+// so explicitly adding the lone typedef we need from that file
+typedef struct curandGenerator_st* curandGenerator_t;
+
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+#if TENSORFLOW_USE_ROCM
+
+using GpuStreamHandle = hipStream_t;
+using GpuEventHandle = hipEvent_t;
+using GpuFunctionHandle = hipFunction_t;
+using GpuFunctionAttribute = hipDeviceAttribute_t;  // not a typo!
+using GpuDeviceHandle = hipDevice_t;
+using GpuDevicePtr = hipDeviceptr_t;
+using GpuDeviceAttribute = hipDeviceAttribute_t;
+using GpuDeviceProperty = hipDeviceProp_t;
+using GpuModuleHandle = hipModule_t;
+using GpuStatus = hipError_t;
+using GpuFuncCachePreference = hipFuncCache_t;
+using GpuSharedMemConfig = hipSharedMemConfig;
+using GpuComplexType = hipComplex;
+using GpuDoubleComplexType = hipDoubleComplex;
+using GpuRngHandle = hiprandGenerator_t;
+
+#else  // CUDA
+
+using GpuStreamHandle = CUstream;
+using GpuEventHandle = CUevent;
+using GpuFunctionHandle = CUfunction;
+using GpuFunctionAttribute = CUfunction_attribute;
+using GpuDeviceHandle = CUdevice;
+using GpuDevicePtr = CUdeviceptr;
+using GpuDeviceAttribute = CUdevice_attribute;
+using GpuDeviceProperty = CUdevprop;
+using GpuModuleHandle = CUmodule;
+using GpuStatus = CUresult;
+using GpuFuncCachePreference = CUfunc_cache;
+using GpuSharedMemConfig = CUsharedconfig;
+using GpuComplexType = cuComplex;
+using GpuDoubleComplexType = cuDoubleComplex;
+using GpuRngHandle = curandGenerator_t;
+
+#endif
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
deleted file mode 100644
index 68021d2b3157ceeaabd0d0a2065bc946913f64c4..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/logging.proto
+++ /dev/null
@@ -1,29 +0,0 @@
-syntax = "proto3";
-
-package stream_executor;
-
-import "tensorflow/stream_executor/dnn.proto";
-
-message CudnnVersion {
-  int32 major = 1;
-  int32 minor = 2;
-  int32 patch = 3;
-};
-
-message ComputeCapability {
-  int32 major = 1;
-  int32 minor = 2;
-}
-
-message CudaInfo {
-  CudnnVersion cudnn_version = 1;
-  ComputeCapability compute_capability = 2;
-}
-
-message ConvLogEntry {
-  CudaInfo cuda_info = 1;
-  dnn.ConvolutionProto convolution = 2;
-
-  // Profiled time in ms. 0.0 if the convolution is not profiled.
-  float profile_time_ms = 3;
-}
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index c0205abbee305edc23e24d79c53f9ed3b84049b5..9c99581438653a55223a5ebee6173d2a5fefb3ab 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
       return "CUDA";
+    case PlatformKind::kROCm:
+      return "ROCm";
     case PlatformKind::kOpenCL:
       return "OpenCL";
     case PlatformKind::kHost:
@@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
 bool PlatformIsRunnable(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
     case PlatformKind::kHost:
       return true;
@@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
 bool PlatformIsRunnableOnDevice(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
       return true;
     default:
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 5cb7047b6f39483f237b5bb249906d9ce8a06b9e..2c2cd77ad21aaeb700a7cffe598112237204b418 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -40,6 +40,7 @@ class StreamExecutor;
 enum class PlatformKind {
   kInvalid,
   kCuda,
+  kROCm,
   kOpenCL,
   kHost,
   kMock,
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index 668eeee3f31ff257092674de98c7d20c39c46a73..6ed7480ff0c5bf0659618318b6da6135d4f4bae6 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -12,285 +12,175 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 
-// TODO(jhen): Replace hardcoded, platform specific path strings in GetXXXPath()
-// with a function in e.g. cuda.h.
-
-#include <limits.h>
 #include <stdlib.h>
-#include <initializer_list>
-#include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "cuda/cuda_config.h"
 #include "tensorflow/core/platform/load_library.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/path.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-#include "tensorflow/stream_executor/platform/default/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
-#if !defined(PLATFORM_GOOGLE)
-#include "absl/strings/string_view.h"
-#include "cuda/cuda_config.h"
-#endif
-
 namespace stream_executor {
 namespace internal {
 
+namespace {
 string GetCudaVersion() { return TF_CUDA_VERSION; }
 string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 
-/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cublas", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetDsoHandle(const string& name, const string& version) {
+  auto filename = port::Env::Default()->FormatLibraryFileName(name, version);
+  void* dso_handle;
+  port::Status status =
+      port::Env::Default()->LoadLibrary(filename.c_str(), &dso_handle);
+  if (status.ok()) {
+    LOG(INFO) << "Successfully opened dynamic library " << filename;
+    return dso_handle;
+  }
+
+  auto message = absl::StrCat("Could not dlopen library '", filename,
+                              "'; dlerror: ", status.error_message());
+#if !defined(PLATFORM_WINDOWS)
+  if (const char* ld_library_path = getenv("LD_LIBRARY_PATH")) {
+    message += absl::StrCat("; LD_LIRARY_PATH: ", ld_library_path);
+  }
+#endif
+  LOG(INFO) << message;
+  return port::Status(port::error::FAILED_PRECONDITION, message);
 }
+}  // namespace
 
-/* static */ port::Status DsoLoader::GetCudnnDsoHandle(void** dso_handle) {
-  // libcudnn is versioned differently than the other libraries and may have a
-  // different version number than other CUDA libraries.  See b/22397368 for
-  // some details about the complications surrounding this.
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cudnn", GetCudnnVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+namespace DsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+#if defined(PLATFORM_WINDOWS)
+  return GetDsoHandle("nvcuda", "");
+#elif defined(__APPLE__)
+  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
+  // libcuda.1.dylib.
+  auto handle_or = GetDsoHandle("cuda", "");
+  if (handle_or.ok()) {
+    return handle_or;
+  }
+#endif
+  return GetDsoHandle("cuda", "1");
 }
 
-/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cufft", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  return GetDsoHandle("cudart", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "curand", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
+port::StatusOr<void*> GetCublasDsoHandle() {
+  return GetDsoHandle("cublas", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
-#if defined(PLATFORM_WINDOWS)
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#else
-  port::Status status = GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#if defined(__APPLE__)
-  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
-  // libcuda.1.dylib.
-  return status.ok()
-             ? status
-             : GetDsoHandle(
-                   FindDsoPath(
-                       port::Env::Default()->FormatLibraryFileName("cuda", ""),
-                       GetCudaDriverLibraryPath()),
-                   dso_handle);
-#else
-  return status;
-#endif
-#endif
+port::StatusOr<void*> GetCufftDsoHandle() {
+  return GetDsoHandle("cufft", GetCudaVersion());
 }
 
-/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+port::StatusOr<void*> GetCurandDsoHandle() {
+  return GetDsoHandle("curand", GetCudaVersion());
+}
+
+port::StatusOr<void*> GetCuptiDsoHandle() {
 #if defined(ANDROID_TEGRA)
   // On Android devices the CUDA version number is not added to the library
   // name.
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cupti", ""),
-                  GetCudaCuptiLibraryPath()),
-      dso_handle);
+  return GetDsoHandle("cupti", "");
 #else
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cupti", GetCudaVersion()),
-                                  GetCudaCuptiLibraryPath()),
-                      dso_handle);
+  return GetDsoHandle("cupti", GetCudaVersion());
 #endif
 }
 
-static mutex& GetRpathMutex() {
-  static mutex* mu = new mutex;
-  return *mu;
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  return GetDsoHandle("cudnn", GetCudnnVersion());
 }
 
-/* static */ void DsoLoader::RegisterRpath(absl::string_view path) {
-  mutex_lock lock{GetRpathMutex()};
-  GetRpaths()->emplace_back(path);
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  return GetDsoHandle("rocblas", "");
 }
 
-/* static */ port::Status DsoLoader::GetDsoHandle(absl::string_view path,
-                                                  void** dso_handle,
-                                                  LoadKind load_kind) {
-  if (load_kind != LoadKind::kLocal) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "Only LoadKind::kLocal is currently supported");
-  }
-  string path_string(path);
-  port::Status s =
-      port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
-  if (!s.ok()) {
-#if !defined(PLATFORM_WINDOWS)
-    char* ld_library_path = getenv("LD_LIBRARY_PATH");
-#endif
-    LOG(INFO) << "Couldn't open CUDA library " << path
-#if !defined(PLATFORM_WINDOWS)
-              << ". LD_LIBRARY_PATH: "
-              << (ld_library_path != nullptr ? ld_library_path : "")
-#endif
-        ;
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        absl::StrCat("could not dlopen DSO: ", path,
-                                     "; dlerror: ", s.error_message()));
-  }
-  LOG(INFO) << "successfully opened CUDA library " << path << " locally";
-  return port::Status::OK();
-}
-
-/* static */ string DsoLoader::GetBinaryDirectory(bool strip_executable_name) {
-  string exe_path = port::Env::Default()->GetExecutablePath();
-  return strip_executable_name ? string(port::Dirname(exe_path)) : exe_path;
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  return GetDsoHandle("MIOpen", "");
 }
 
-// Creates a heap-allocated vector for initial rpaths.
-// Ownership is transferred to the caller.
-static std::vector<string>* CreatePrimordialRpaths() {
-  auto rpaths = new std::vector<string>;
-#if defined(__APPLE__)
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib");
-#else
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib64");
-#endif
-  return rpaths;
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  return GetDsoHandle("rocfft", "");
 }
 
-/* static */ std::vector<string>* DsoLoader::GetRpaths() {
-  static std::vector<string>* rpaths = CreatePrimordialRpaths();
-  return rpaths;
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  return GetDsoHandle("rocrand", "");
 }
 
-/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
-#if defined(PLATFORM_WINDOWS)
-  return false;
-#else
-  char buf[PATH_MAX];
-  char* result = realpath(candidate->c_str(), buf);
-  if (result == nullptr) {
-    return false;
-  }
-  VLOG(3) << "realpath resolved candidate path \"" << *candidate << "\" to \""
-          << result << "\"";
-  *candidate = result;
-  return true;
-#endif
-}
+port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
 
-/* static */ string DsoLoader::FindDsoPath(absl::string_view library_name,
-                                           absl::string_view runfiles_relpath) {
-  // Keep a record of the paths we attempted so we can dump out meaningful
-  // diagnostics if no path is found.
-  std::vector<string> attempted;
-
-  using StringPieces = std::vector<absl::string_view>;
-  string candidate;
-
-  // Otherwise, try binary-plus-rpath locations.
-  string binary_directory =
-      GetBinaryDirectory(true /* = strip_executable_name */);
-  mutex_lock lock{GetRpathMutex()};
-  for (const string& rpath : *GetRpaths()) {
-    candidate =
-        port::Join(StringPieces{binary_directory, rpath, library_name}, "/");
-    if (TrySymbolicDereference(&candidate)) {
-      return candidate;
-    }
-  }
-  attempted.push_back(candidate);
+}  // namespace DsoLoader
 
-  return string(library_name);
+namespace CachedDsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaDriverDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaLibraryDirPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/lib";
-#else
-  return "external/local_config_cuda/cuda/lib64";
-#endif
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaRuntimeDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/driver/lib";
-#elif defined(PLATFORM_WINDOWS)
-  return "";
-#else
-  return "external/local_config_cuda/cuda/driver/lib64";
-#endif
+port::StatusOr<void*> GetCublasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCublasDsoHandle());
+  return *result;
 }
 
-/* static */ string DsoLoader::GetCudaCuptiLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib";
-#else
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib64";
-#endif
+port::StatusOr<void*> GetCurandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCurandDsoHandle());
+  return *result;
 }
 
-// -- CachedDsoLoader
+port::StatusOr<void*> GetCufftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCufftDsoHandle());
+  return *result;
+}
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCublasDsoHandle);
-  return result;
+port::StatusOr<void*> GetCuptiDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCuptiDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCurandDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCurandDsoHandle);
-  return result;
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudnnDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCudnnDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCudnnDsoHandle);
-  return result;
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCufftDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCufftDsoHandle);
-  return result;
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcudaDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcudaDsoHandle);
-  return result;
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcuptiDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcuptiDsoHandle);
-  return result;
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
+  return *result;
 }
 
-/* static */ port::StatusOr<void*> CachedDsoLoader::FetchHandleResult(
-    std::function<port::Status(void**)> load_dso) {
-  void* handle;
-  auto status = load_dso(&handle);
-  if (!status.ok()) {
-    return status;
-  }
-  return handle;
+port::StatusOr<void*> GetHipDsoHandle() {
+  static auto result = new auto(DsoLoader::GetHipDsoHandle());
+  return *result;
 }
 
+}  // namespace CachedDsoLoader
 }  // namespace internal
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
index 806f65b24cdc209dd14a727de6a724bcd1705075..89f23324dcdcfc0ca3d9d8c1382b566fcd1fd79e 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -31,85 +31,42 @@ limitations under the License.
 namespace stream_executor {
 namespace internal {
 
-// Permits StreamExecutor code to dynamically load a pre-determined set of
-// relevant DSOs via dlopen.
-//
-// Thread-safe.
-class DsoLoader {
- public:
-  // The following methods either load the DSO of interest and return a dlopen
-  // handle or error status in the canonical namespace.
-
-  static port::Status GetCublasDsoHandle(void** dso_handle);
-  static port::Status GetCudnnDsoHandle(void** dso_handle);
-  static port::Status GetCufftDsoHandle(void** dso_handle);
-  static port::Status GetCurandDsoHandle(void** dso_handle);
-  static port::Status GetLibcudaDsoHandle(void** dso_handle);
-  static port::Status GetLibcuptiDsoHandle(void** dso_handle);
-
-  // Registers a new binary-relative path to use as a dlopen search path.
-  static void RegisterRpath(absl::string_view path);
-
- private:
-  // Registered rpaths (singleton vector) and a mutex that guards it.
-  static std::vector<string>* GetRpaths();
-
-  // Descriptive boolean wrapper to indicate whether symbols are made available
-  // to resolve in later-loaded libraries.
-  enum class LoadKind { kLocal, kGlobal };
-
-  // Loads a DSO from the given "path" (which can technically be any dlopen-able
-  // name). If the load kind is global, the symbols in the loaded DSO are
-  // visible to subsequent DSO loading operations.
-  static port::Status GetDsoHandle(absl::string_view path, void** dso_handle,
-                                   LoadKind load_kind = LoadKind::kLocal);
-
-  // Returns the binary directory (or binary path) associated with the currently
-  // executing program. If strip_executable_name is true, the executable file is
-  // stripped off of the path.
-  static string GetBinaryDirectory(bool strip_executable_name);
-
-  // Invokes realpath on the original path; updates candidate and returns true
-  // if it succeeds (i.e. a file exists at the path); otherwise, returns false.
-  static bool TrySymbolicDereference(string* candidate);
-
-  // Attempts to find a path to the DSO of interest, otherwise returns the
-  // bare library name:
-  // Arguments:
-  //   library_name: the filename in tree; e.g. libOpenCL.so.1.0.0
-  //   runfiles_relpath: where to look for the library relative to the runfiles
-  //      root; e.g. third_party/gpus/cuda/lib64
-  static string FindDsoPath(absl::string_view library_name,
-                            absl::string_view runfiles_relpath);
-
-  // Return platform dependent paths for DSOs
-  static string GetCudaLibraryDirPath();
-  static string GetCudaDriverLibraryPath();
-  static string GetCudaCuptiLibraryPath();
-
-  SE_DISALLOW_COPY_AND_ASSIGN(DsoLoader);
-};
+namespace DsoLoader {
+// The following methods either load the DSO of interest and return a dlopen
+// handle or error status.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace DsoLoader
 
 // Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs
 // more than once.
-class CachedDsoLoader {
- public:
-  // Cached versions of the corresponding DsoLoader methods above.
-  static port::StatusOr<void*> GetCublasDsoHandle();
-  static port::StatusOr<void*> GetCudnnDsoHandle();
-  static port::StatusOr<void*> GetCufftDsoHandle();
-  static port::StatusOr<void*> GetCurandDsoHandle();
-  static port::StatusOr<void*> GetLibcudaDsoHandle();
-  static port::StatusOr<void*> GetLibcuptiDsoHandle();
-
- private:
-  // Fetches a DSO handle via "load_dso" and returns the StatusOr form of the
-  // result.
-  static port::StatusOr<void*> FetchHandleResult(
-      std::function<port::Status(void**)> load_dso);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CachedDsoLoader);
-};
+namespace CachedDsoLoader {
+// Cached versions of the corresponding DsoLoader methods above.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace CachedDsoLoader
 
 }  // namespace internal
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..902d8f98ee06f66f469bdd4434c4a191885df314
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -0,0 +1,291 @@
+# Description:
+#   ROCm-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "rocm_diagnostics",
+    srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_diagnostics.h"]),
+    deps = if_rocm_is_configured([
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "rocm_driver",
+    srcs = if_rocm_is_configured(["rocm_driver.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]),
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
+
+cc_library(
+    name = "rocm_activation",
+    srcs = [],
+    hdrs = if_rocm_is_configured(["rocm_activation.h"]),
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "rocm_event",
+    srcs = if_rocm_is_configured(["rocm_event.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "rocm_gpu_executor",
+    srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        ":rocm_driver",
+        ":rocm_event",
+        ":rocm_kernel",
+        ":rocm_platform_id",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_kernel",
+    srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
+    hdrs = [],
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_platform",
+    srcs = if_rocm_is_configured(["rocm_platform.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_platform.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor",  # buildcleaner: keep
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "rocm_platform_id",
+    srcs = ["rocm_platform_id.cc"],
+    hdrs = ["rocm_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+cc_library(
+    name = "rocblas_plugin",
+    srcs = if_rocm_is_configured(["rocm_blas.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_blas.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_timer_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@com_google_absl//absl/strings",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocblas",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocfft_plugin",
+    srcs = if_rocm_is_configured(["rocm_fft.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_fft.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocfft",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "miopen_plugin",
+    srcs = if_rocm_is_configured(["rocm_dnn.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_dnn.h"]),
+    copts = [
+        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+        # setting of template depth 256
+        "-ftemplate-depth-512",
+    ],
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        ":rocm_driver",
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//third_party/eigen3",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:temporary_device_memory",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_timer_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@com_google_absl//absl/strings",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:miopen",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocrand_plugin",
+    srcs = if_rocm_is_configured(["rocm_rng.cc"]),
+    hdrs = if_rocm_is_configured([]),
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static([
+        "@local_config_rocm//rocm:hiprand",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":miopen_plugin",
+        ":rocfft_plugin",
+        ":rocblas_plugin",
+        ":rocrand_plugin",
+        ":rocm_driver",
+        ":rocm_platform",
+    ]),
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/rocm/rocm_activation.h b/tensorflow/stream_executor/rocm/rocm_activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..03a0beaece45f246fc49563801c5ebe1ec5d04aa
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_activation.h
@@ -0,0 +1,39 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file contains APIs that assume a StreamExecutor is backed by ROCM.
+// It reaches into the ROCM implementation to activate an underlying ROCM
+// context.
+//
+// Having this file separate from rocm/rocm_gpu_executor.h means that dependent
+// code does not also have to depend on rocm.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+
+namespace stream_executor {
+
+class StreamExecutor;
+
+namespace rocm {
+
+using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.cc b/tensorflow/stream_executor/rocm/rocm_blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82b966644c573c542c3c59320b0cfe7149ee3dbb
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.cc
@@ -0,0 +1,2374 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/rocblas.h"
+
+#include "tensorflow/stream_executor/rocm/rocm_blas.h"
+
+#define EIGEN_USE_GPU
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include <assert.h>
+#include <complex>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocBlasPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                       \
+  struct WrapperShim__##__name {                                   \
+    static const char *kName;                                      \
+    template <typename... Args>                                    \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                    \
+    }                                                              \
+  } __name;                                                        \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocblasDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocblas DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#endif
+
+#define ROCBLAS_BLAS_ROUTINE_EACH(__macro)                                     \
+  __macro(rocblas_snrm2) __macro(rocblas_dnrm2) /*  __macro(rocblas_scnrm2)    \
+                                                  __macro(rocblas_dznrm2) */   \
+      __macro(rocblas_sdot)                                                    \
+          __macro(rocblas_ddot) /*  __macro(rocblas_cdotu)                     \
+                                  __macro(rocblas_cdotc)                       \
+                                  __macro(rocblas_zdotu)                       \
+                                  __macro(rocblas_zdotc)                    */ \
+      __macro(rocblas_sscal)                                                   \
+          __macro(rocblas_dscal) /*  __macro(rocblas_cscal)                    \
+                                   __macro(rocblas_csscal)                     \
+                                   __macro(rocblas_zscal)                      \
+                                   __macro(rocblas_zdscal) */                  \
+      __macro(rocblas_saxpy)                                                   \
+          __macro(rocblas_daxpy) /*  __macro(rocblas_caxpy)                    \
+                                   __macro(rocblas_zaxpy) */                   \
+      __macro(rocblas_scopy)                                                   \
+          __macro(rocblas_dcopy) /*  __macro(rocblas_ccopy)                    \
+                                   __macro(rocblas_zcopy) */                   \
+      __macro(rocblas_sswap)                                                   \
+          __macro(rocblas_dswap) /*  __macro(rocblas_cswap)                    \
+                                   __macro(rocblas_zswap) */                   \
+      __macro(rocblas_isamax)                                                  \
+          __macro(rocblas_idamax) /*  __macro(rocblas_icamax)                  \
+                                    __macro(rocblas_izamax) */                 \
+      __macro(rocblas_isamin)                                                  \
+          __macro(rocblas_idamin) /*  __macro(rocblas_icamin)                  \
+                                    __macro(rocblas_izamin) */                 \
+      __macro(rocblas_sasum)                                                   \
+          __macro(rocblas_dasum) /*  __macro(rocblas_scasum)                   \
+                                   __macro(rocblas_dzasum)                     \
+                                   __macro(rocblas_srot)                       \
+                                   __macro(rocblas_drot)                       \
+                                   __macro(rocblas_crot)                       \
+                                   __macro(rocblas_csrot)                      \
+                                   __macro(rocblas_zrot)                       \
+                                   __macro(rocblas_zdrot)                      \
+                                   __macro(rocblas_srotg)                      \
+                                   __macro(rocblas_drotg)                      \
+                                   __macro(rocblas_Crotg)                      \
+                                   __macro(rocblas_crotg)                      \
+                                   __macro(rocblas_zrotm)                      \
+                                   __macro(rocblas_drotm)                      \
+                                   __macro(rocblas_srotmg)                     \
+                                   __macro(rocblas_drotmg) */                  \
+      __macro(rocblas_sgemv)                                                   \
+          __macro(rocblas_dgemv) /*  __macro(rocblas_cgemv)                    \
+                                   __macro(rocblas_zgemv)                      \
+                                   __macro(rocblas_sgbmv)                      \
+                                   __macro(rocblas_dgbmv)                      \
+                                   __macro(rocblas_cgbmv)                      \
+                                   __macro(rocblas_zgbmv)                      \
+                                   __macro(rocblas_strmv)                      \
+                                   __macro(rocblas_dtrmv)                      \
+                                   __macro(rocblas_ctrmv)                      \
+                                   __macro(rocblas_ztrmv)                      \
+                                   __macro(rocblas_stbmv)                      \
+                                   __macro(rocblas_dtbmv)                      \
+                                   __macro(rocblas_ctbmv)                      \
+                                   __macro(rocblas_ztbmv)                      \
+                                   __macro(rocblas_stpmv)                      \
+                                   __macro(rocblas_dtpmv)                      \
+                                   __macro(rocblas_ctpmv)                      \
+                                   __macro(rocblas_ztpmv)                      \
+                                   __macro(rocblas_strsv)                      \
+                                   __macro(rocblas_dtrsv)                      \
+                                   __macro(rocblas_ctrsv)                      \
+                                   __macro(rocblas_ztrsv)                      \
+                                   __macro(rocblas_stpsv)                      \
+                                   __macro(rocblas_dtpsv)                      \
+                                   __macro(rocblas_ctpsv)                      \
+                                   __macro(rocblas_ztpsv)                      \
+                                   __macro(rocblas_stbsv)                      \
+                                   __macro(rocblas_dtbsv)                      \
+                                   __macro(rocblas_ctbsv)                      \
+                                   __macro(rocblas_ztbsv)                      \
+                                   __macro(rocblas_ssymv)                      \
+                                   __macro(rocblas_dsymv)                      \
+                                   __macro(rocblas_csymv)                      \
+                                   __macro(rocblas_zsymv)                      \
+                                   __macro(rocblas_chemv)                      \
+                                   __macro(rocblas_zhemv)                      \
+                                   __macro(rocblas_ssbmv)                      \
+                                   __macro(rocblas_dsbmv)                      \
+                                   __macro(rocblas_chbmv)                      \
+                                   __macro(rocblas_zhbmv)                      \
+                                   __macro(rocblas_sspmv)                      \
+                                   __macro(rocblas_dspmv)                      \
+                                   __macro(rocblas_chpmv)                      \
+                                   __macro(rocblas_zhpmv) */                   \
+      __macro(rocblas_sger)                                                    \
+          __macro(rocblas_dger) /*  __macro(rocblas_cgeru)                     \
+                                  __macro(rocblas_cgerc)                       \
+                                  __macro(rocblas_zgeru)                       \
+                                  __macro(rocblas_zgerc)                    */ \
+      __macro(rocblas_ssyr)                                                    \
+          __macro(rocblas_dsyr) /*  __macro(rocblas_csyr)                      \
+                                  __macro(rocblas_zsyr)                        \
+                                  __macro(rocblas_cher)                        \
+                                  __macro(rocblas_zher)                        \
+                                  __macro(rocblas_sspr)                        \
+                                  __macro(rocblas_dspr)                        \
+                                  __macro(rocblas_chpr)                        \
+                                  __macro(rocblas_zhpr)                        \
+                                  __macro(rocblas_ssyr2)                       \
+                                  __macro(rocblas_dsyr2)                       \
+                                  __macro(rocblas_csyr2)                       \
+                                  __macro(rocblas_zsyr2)                       \
+                                  __macro(rocblas_cher2)                       \
+                                  __macro(rocblas_zher2)                       \
+                                  __macro(rocblas_sspr2)                       \
+                                  __macro(rocblas_dspr2)                       \
+                                  __macro(rocblas_chpr2)                       \
+                                  __macro(rocblas_zhpr2)                    */ \
+      __macro(rocblas_sgemm) __macro(rocblas_dgemm)                            \
+          __macro(rocblas_hgemm) /*  __macro(rocblas_cgemm)                    \
+                                   __macro(rocblas_zgemm)                      \
+                                   __macro(rocblas_ssyrk)                      \
+                                   __macro(rocblas_dsyrk)                      \
+                                   __macro(rocblas_csyrk)                      \
+                                   __macro(rocblas_zsyrk)                      \
+                                   __macro(rocblas_cherk)                      \
+                                   __macro(rocblas_zherk)                      \
+                                   __macro(rocblas_ssyr2k)                     \
+                                   __macro(rocblas_dsyr2k)                     \
+                                   __macro(rocblas_csyr2k)                     \
+                                   __macro(rocblas_zsyr2k)                     \
+                                   __macro(rocblas_cher2k)                     \
+                                   __macro(rocblas_zher2k)                     \
+                                   __macro(rocblas_ssyrkx)                     \
+                                   __macro(rocblas_dsyrkx)                     \
+                                   __macro(rocblas_csyrkx)                     \
+                                   __macro(rocblas_zsyrkx)                     \
+                                   __macro(rocblas_cherkx)                     \
+                                   __macro(rocblas_zherkx)                     \
+                                   __macro(rocblas_ssymm)                      \
+                                   __macro(rocblas_dsymm)                      \
+                                   __macro(rocblas_csymm)                      \
+                                   __macro(rocblas_zsymm)                      \
+                                   __macro(rocblas_chemm)                      \
+                                   __macro(rocblas_zhemm) */                   \
+      __macro(rocblas_strsm)                                                   \
+          __macro(rocblas_dtrsm) /*  __macro(rocblas_ctrsm)                    \
+                                   __macro(rocblas_ztrsm)                      \
+                                   __macro(rocblas_strmm)                      \
+                                   __macro(rocblas_dtrmm)                      \
+                                   __macro(rocblas_ctrmm)                      \
+                                   __macro(rocblas_ztrmm) */                   \
+      __macro(rocblas_sgeam)                                                   \
+          __macro(rocblas_dgeam) /*  __macro(rocblas_cgeam)                    \
+                                   __macro(rocblas_zgeam)                      \
+                                   __macro(rocblas_sdgmm)                      \
+                                   __macro(rocblas_ddgmm)                      \
+                                   __macro(rocblas_cdgmm)                      \
+                                   __macro(rocblas_zdgmm) */
+
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_create_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_destroy_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_stream)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_get_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_hgemm_strided_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_cgemm_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_zgemm_batched)
+ROCBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_ROCBLAS_V2_WRAP)
+
+}  // namespace wrap
+
+static string ToString(rocblas_status status) {
+  switch (status) {
+    case rocblas_status_success:
+      return "rocblas_status_success";
+    case rocblas_status_invalid_handle:
+      return "rocblas_status_invalid_handle";
+    case rocblas_status_not_implemented:
+      return "rocblas_status_not_implemented";
+    case rocblas_status_invalid_pointer:
+      return "rocblas_status_invalid_pointer";
+    case rocblas_status_invalid_size:
+      return "rocblas_status_invalid_size";
+    case rocblas_status_memory_error:
+      return "rocblas_status_memory_error";
+    case rocblas_status_internal_error:
+      return "rocblas_status_internal_error";
+    default:
+      return absl::StrCat("<invalid rocBLAS status: ", status, ">");
+  }
+}
+
+bool ROCMBlas::Init() {
+  rocblas_status ret = wrap::rocblas_create_handle(parent_, &blas_);
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to create rocBLAS handle: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+ROCMBlas::ROCMBlas(gpu::GpuExecutor *parent)
+    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
+
+ROCMBlas::~ROCMBlas() {
+  if (blas_ != nullptr) {
+    wrap::rocblas_destroy_handle(parent_, blas_);
+  }
+}
+
+bool ROCMBlas::SetStream(Stream *stream) {
+  CHECK(stream != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
+  CHECK(blas_ != nullptr);
+  rocblas_status ret =
+      wrap::rocblas_set_stream(parent_, blas_, AsGpuStreamValue(stream));
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to set stream for rocBLAS calls: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+namespace {
+
+// Helper functions transforming blas arguments into rocBLAS arguments.
+
+rocblas_operation ROCMBlasTranspose(blas::Transpose trans) {
+  switch (trans) {
+    case blas::Transpose::kNoTranspose:
+      return rocblas_operation_none;
+    case blas::Transpose::kTranspose:
+      return rocblas_operation_transpose;
+    case blas::Transpose::kConjugateTranspose:
+      return rocblas_operation_conjugate_transpose;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Transpose.";
+  }
+}
+
+rocblas_fill ROCMBlasUpperLower(blas::UpperLower uplo) {
+  switch (uplo) {
+    case blas::UpperLower::kUpper:
+      return rocblas_fill_upper;
+    case blas::UpperLower::kLower:
+      return rocblas_fill_lower;
+    default:
+      LOG(FATAL) << "Invalid value of blas::UpperLower.";
+  }
+}
+
+rocblas_diagonal ROCMBlasDiagonal(blas::Diagonal diag) {
+  switch (diag) {
+    case blas::Diagonal::kUnit:
+      return rocblas_diagonal_unit;
+    case blas::Diagonal::kNonUnit:
+      return rocblas_diagonal_non_unit;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Diagonal.";
+  }
+}
+
+rocblas_side ROCMBlasSide(blas::Side side) {
+  switch (side) {
+    case blas::Side::kLeft:
+      return rocblas_side_left;
+    case blas::Side::kRight:
+      return rocblas_side_right;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Side.";
+  }
+}
+
+}  // namespace
+
+template <typename FuncT, typename... Args>
+bool ROCMBlas::DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                                  bool pointer_mode_host, bool err_on_failure,
+                                  Args... args) {
+  mutex_lock lock{mu_};
+
+  CHECK(blas_ != nullptr);
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  rocblas_status ret = rocblas_func(parent_, blas_, args...);
+  if (err_on_failure && ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to run ROCBLAS routine " << rocblas_func.kName << ": "
+               << ToString(ret);
+  }
+  return ret == rocblas_status_success;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_sasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_saxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_daxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_scopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dcopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_sdot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_ddot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_snrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dnrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<float> *x, int incx,
+                         DeviceMemory<float> *y, int incy, float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<double> *x, int incx,
+                         DeviceMemory<double> *y, int incy, double c,
+                         double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<float>> *x, int incx,
+                         DeviceMemory<std::complex<float>> *y, int incy,
+                         float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<double>> *x, int incx,
+                         DeviceMemory<std::complex<double>> *y, int incy,
+                         double c, double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
+                          DeviceMemory<float> *b, DeviceMemory<float> *c,
+                          DeviceMemory<float> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
+                          DeviceMemory<double> *b, DeviceMemory<double> *c,
+                          DeviceMemory<double> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
+                          DeviceMemory<std::complex<float>> *b,
+                          DeviceMemory<float> *c,
+                          DeviceMemory<std::complex<float>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
+                          DeviceMemory<std::complex<double>> *b,
+                          DeviceMemory<double> *c,
+                          DeviceMemory<std::complex<double>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy,
+                          const DeviceMemory<float> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy,
+                          const DeviceMemory<double> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
+                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
+                           const DeviceMemory<float> &y1,
+                           DeviceMemory<float> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
+                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
+                           const DeviceMemory<double> &y1,
+                           DeviceMemory<double> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<float> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_sscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<double> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_dscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_sswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<float>> *x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<double>> *x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_isamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_idamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_isamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_idamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_sgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_sger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_dger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<float>\" dataype";
+
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &ap,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &ap,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_ssyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_dsyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<Eigen::half> &a,
+                          int lda, const DeviceMemory<Eigen::half> &b, int ldb,
+                          float beta, DeviceMemory<Eigen::half> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+  return DoBlasInternal(
+      wrap::rocblas_hgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+      reinterpret_cast<const rocblas_half *>(&alpha_half),
+      reinterpret_cast<const rocblas_half *>(GpuMemory(a)), lda,
+      reinterpret_cast<const rocblas_half *>(GpuMemory(b)), ldb,
+      reinterpret_cast<const rocblas_half *>(&beta_half),
+      reinterpret_cast<rocblas_half *>(GpuMemoryMutable(c)), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  return DoBlasInternal(
+      wrap::rocblas_sgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+    int incx, float beta, DeviceMemory<float> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+    int incx, double beta, DeviceMemory<double> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
+    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
+    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+    DeviceMemory<Eigen::half> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    int ldc, blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    const DeviceMemory<double> &b, int ldb, double beta,
+    DeviceMemory<double> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+template <typename T>
+bool ROCMBlas::DoBlasGemvWithProfilingImpl(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
+    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
+    const T &beta, DeviceMemory<T> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename T, typename ParamType>
+bool ROCMBlas::DoBlasGemmWithProfilingImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename InT, typename OutT, typename CompT>
+bool ROCMBlas::DoBlasGemmWithAlgorithmImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
+    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+bool ROCMBlas::GetBlasGemmAlgorithms(
+    std::vector<blas::AlgorithmType> *out_algorithms) {
+  // ROCM TODO: properly implement the interface
+  return true;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int32> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"int8\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"half\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+template <typename T>
+struct EigenHalfToRocBlasHalf {
+  using type = T;
+};
+
+template <>
+struct EigenHalfToRocBlasHalf<Eigen::half> {
+  using type = rocblas_half;
+};
+
+template <typename T, typename FuncT>
+port::Status ROCMBlas::DoBlasGemmBatchedInternal(
+    FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+    blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
+    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
+    T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  // MAPPED_T will be same as T for all types except Eigen::Half
+  // for T = Eigen::half, MAPPED_T = rocblas_half
+  using MAPPED_T = typename EigenHalfToRocBlasHalf<T>::type;
+
+  // Alocate local vectors to hold device pointers to matrices
+  std::vector<MAPPED_T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
+  for (int i = 0; i < batch_count; ++i) {
+    // static_cast does work when converting Eigen::half* to rocblas_half*,
+    // hence the use od reinterpret_cast
+    a_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(a_ptrs_to_wrappers[i]->opaque()));
+    b_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(b_ptrs_to_wrappers[i]->opaque()));
+    c_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(c_ptrs_to_wrappers[i]->opaque()));
+  }
+
+  //  batch_count <= 1 is base case, no definable matrix stride, set it same as
+  //  ld*
+  long long bsa = lda;
+  long long bsb = ldb;
+  long long bsc = ldc;
+  bool bsa_is_constant = true;
+  bool bsb_is_constant = true;
+  bool bsc_is_constant = true;
+
+  if (batch_count > 1) {
+    // Remember first stride; if any other stride is different that this one,
+    // KABLAM
+    bsa = a_raw_ptrs[1] - a_raw_ptrs[0];
+    bsb = b_raw_ptrs[1] - b_raw_ptrs[0];
+    bsc = c_raw_ptrs[1] - c_raw_ptrs[0];
+
+    //  Loop to verify that batched strides are constant
+    //  All the test cases from batch_matmul_op_test.py seem to satisfy this
+    //  requirement of a constant stride.  If this can be proven globally, then
+    //  this loop check can be safely removed
+    for (int i = 1; i < batch_count - 1; ++i) {
+      long long iterative_bsa = a_raw_ptrs[i + 1] - a_raw_ptrs[i];
+      if (iterative_bsa != bsa) {
+        bsa_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsb = b_raw_ptrs[i + 1] - b_raw_ptrs[i];
+      if (iterative_bsb != bsb) {
+        bsb_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsc = c_raw_ptrs[i + 1] - c_raw_ptrs[i];
+      if (iterative_bsc != bsc) {
+        bsc_is_constant = false;
+        break;
+      }
+    }
+  }
+
+  assert(!(ldc < m || bsc < ldc * n));
+
+  if (ROCMBlasTranspose(transa) == rocblas_operation_none)
+    assert(!(lda < m || bsa < lda * k));
+  else
+    assert(!(lda < k || bsa < lda * m));
+
+  if (ROCMBlasTranspose(transb) == rocblas_operation_none)
+    assert(!(ldb < k || bsb < ldb * n));
+  else
+    assert(!(ldb < n || bsc < ldc * k));
+
+  MAPPED_T *alpha_ptr = reinterpret_cast<MAPPED_T *>(&alpha);
+  MAPPED_T *beta_ptr = reinterpret_cast<MAPPED_T *>(&beta);
+
+  if (bsa_is_constant && bsb_is_constant && bsc_is_constant) {
+    bool ok = DoBlasInternal(
+        rocblas_func, stream, true /* = pointer_mode_host */,
+        ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+        GpuComplex(alpha_ptr), a_raw_ptrs[0], lda, bsa, b_raw_ptrs[0], ldb, bsb,
+        GpuComplex(beta_ptr), c_raw_ptrs[0], ldc, bsc, batch_count);
+
+    if (ok) {
+      return port::Status::OK();
+    }
+  }
+
+  return port::Status(port::error::INTERNAL,
+                      "failed BLAS call, see log for details");
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_hgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha_half, a, lda, b, ldb, beta_half, c, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_sgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha,
+    const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
+    double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_dgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
+    int ldb, std::complex<float> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
+    int ldb, std::complex<double> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          float beta, DeviceMemory<std::complex<float>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          double beta, DeviceMemory<std::complex<double>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           float beta, DeviceMemory<std::complex<float>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           double beta, DeviceMemory<std::complex<double>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          float beta, DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          double beta, DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           float alpha, const DeviceMemory<float> &a, int lda,
+                           const DeviceMemory<float> &b, int ldb, float beta,
+                           DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           double alpha, const DeviceMemory<double> &a, int lda,
+                           const DeviceMemory<double> &b, int ldb, double beta,
+                           DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           std::complex<float> beta,
+                           DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           std::complex<double> beta,
+                           DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_strsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<float *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_dtrsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<double *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
+    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"Eigen::half\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
+    float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
+    double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+}  // namespace gpu
+
+void initialize_rocblas() {
+  auto rocBlasAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+
+  if (!rocBlasAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()
+            ->RegisterFactory<PluginRegistry::BlasFactory>(
+                rocm::kROCmPlatformId, gpu::kRocBlasPlugin, "rocBLAS",
+                [](internal::StreamExecutorInterface *parent)
+                    -> blas::BlasSupport * {
+                  gpu::GpuExecutor *rocm_executor =
+                      dynamic_cast<gpu::GpuExecutor *>(parent);
+                  if (rocm_executor == nullptr) {
+                    LOG(ERROR)
+                        << "Attempting to initialize an instance of the "
+                           "rocBLAS "
+                        << "support library with a non-ROCM StreamExecutor";
+                    return nullptr;
+                  }
+
+                  gpu::ROCMBlas *blas = new gpu::ROCMBlas(rocm_executor);
+                  if (!blas->Init()) {
+                    // Note: Init() will log a more specific error.
+                    delete blas;
+                    return nullptr;
+                  }
+                  return blas;
+                });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocBLAS factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocblas,
+                            { stream_executor::initialize_rocblas(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.h b/tensorflow/stream_executor/rocm/rocm_blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..75c68481cee5f6123c80e9751c06392f1835a5ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.h
@@ -0,0 +1,159 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for BLAS functionality -- this wraps the rocBLAS
+// library capabilities, and is only included into ROCM implementation code --
+// it will not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+// Opaque and unique identifier for the rocBLAS plugin.
+extern const PluginId kRocBlasPlugin;
+
+class GpuExecutor;
+
+// BLAS plugin for ROCM platform via rocBLAS library.
+//
+// This satisfies the platform-agnostic BlasSupport interface.
+//
+// Note that the rocBLAS handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the rocBLAS handle when a
+// ROCM context is active.
+//
+// Thread-safe post-initialization.
+class ROCMBlas : public blas::BlasSupport {
+ public:
+  explicit ROCMBlas(GpuExecutor *parent);
+
+  // Allocates a rocBLAS handle.
+  bool Init();
+
+  // Releases the rocBLAS handle, if present.
+  ~ROCMBlas() override;
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES
+
+ private:
+  // Tells rocBLAS to enqueue the BLAS operation onto a particular Stream.
+  //
+  // rocBLAS is stateful, and only be associated with one stream (in order to
+  // enqueue dispatch) at a given time. As a result, this generally must be
+  // invoked before calling into rocBLAS.
+  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // A helper function that calls the real rocBLAS function together with error
+  // handling.
+  //
+  // rocblas_func:       rocBLAS function pointer.
+  // rocblas_name:       rocBLAS function name.
+  // stream:             Stream to enqueue the BLAS operation onto.
+  // pointer_mode_host:  Indicate if the pointer to a scalar value is from host
+  //                     (true) or device (false).
+  // err_on_failure:     Whether to print an error if the rocBLAS function
+  // fails. args:               Arguments of rocBLAS function.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                          bool pointer_mode_host, bool err_on_failure,
+                          Args... args);
+
+  // Convenience functions that call DoBlasInternalImpl with different values
+  // for err_on_failure.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternal(FuncT rocblas_func, Stream *stream,
+                      bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/true, args...);
+  }
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalFailureOK(FuncT rocblas_func, Stream *stream,
+                               bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/false, args...);
+  }
+
+  // A helper function to implement DoBlasGemmBatched interfaces for generic
+  // types.
+  template <typename T, typename FuncT>
+  port::Status DoBlasGemmBatchedInternal(
+      FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
+      int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Helper function for implementing DoBlasGemmWithAlgorithm.
+  //
+  // We take alpha and beta by const reference because T might be Eigen::half,
+  // and we want to avoid pulling in a dependency on Eigen.  When we pass the
+  // references to rocBLAS, we essentially reinterpret_cast to __half, which is
+  // safe because Eigen::half inherits from __half.
+  template <typename InT, typename OutT, typename CompT>
+  bool DoBlasGemmWithAlgorithmImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
+      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemmWithProfiling.
+  template <typename T, typename ParamType>
+  bool DoBlasGemmWithProfilingImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+      int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+      DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemvWithProfiling.
+  template <typename T>
+  bool DoBlasGemvWithProfilingImpl(Stream *stream, blas::Transpose trans,
+                                   uint64 m, uint64 n, const T &alpha,
+                                   const DeviceMemory<T> &a, int lda,
+                                   const DeviceMemory<T> &x, int incx,
+                                   const T &beta, DeviceMemory<T> *y, int incy,
+                                   blas::ProfileResult *output_profile_result);
+
+  // mutex that guards the rocBLAS handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this ROCMBlas.
+  // Immutable post-initialization.
+  GpuExecutor *parent_;
+
+  // rocBLAS library handle on the device.
+  rocblas_handle blas_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMBlas);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
new file mode 100644
index 0000000000000000000000000000000000000000..812974a9debb88e7db924680089acc5dc2ccc5a2
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
@@ -0,0 +1,240 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <dirent.h>
+
+#include <limits.h>
+#include <link.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
+
+namespace stream_executor {
+namespace rocm {
+
+string DriverVersionToString(DriverVersion version) {
+  return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
+                         std::get<2>(version));
+}
+
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
+  if (!version.ok()) {
+    return version.status().ToString();
+  }
+
+  return DriverVersionToString(version.ValueOrDie());
+}
+
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
+  std::vector<string> pieces = port::Split(value, '.');
+  if (pieces.size() != 2 && pieces.size() != 3) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
+                                        "for driver version; got \"%s\"",
+                                        value.c_str())};
+  }
+
+  int major;
+  int minor;
+  int patch = 0;
+  if (!port::safe_strto32(pieces[0], &major)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse major version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[0].c_str(), value.c_str())};
+  }
+  if (!port::safe_strto32(pieces[1], &minor)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse minor version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[1].c_str(), value.c_str())};
+  }
+  if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse patch version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[2].c_str(), value.c_str())};
+  }
+
+  DriverVersion result{major, minor, patch};
+  VLOG(2) << "version string \"" << value << "\" made value "
+          << DriverVersionToString(result);
+  return result;
+}
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
+// -- class Diagnostician
+
+string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
+  return absl::StrCat("/dev/kfd", dev_node_ordinal);
+}
+
+void Diagnostician::LogDiagnosticInformation() {
+  LOG(INFO) << "retrieving ROCM diagnostic information for host: "
+            << port::Hostname();
+
+  LogDriverVersionInformation();
+}
+
+/* static */ void Diagnostician::LogDriverVersionInformation() {
+  LOG(INFO) << "hostname: " << port::Hostname();
+  if (VLOG_IS_ON(1)) {
+    const char* value = getenv("LD_LIBRARY_PATH");
+    string library_path = value == nullptr ? "" : value;
+    VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
+
+    std::vector<string> pieces = port::Split(library_path, ':');
+    for (const auto& piece : pieces) {
+      if (piece.empty()) {
+        continue;
+      }
+      DIR* dir = opendir(piece.c_str());
+      if (dir == nullptr) {
+        VLOG(1) << "could not open \"" << piece << "\"";
+        continue;
+      }
+      while (dirent* entity = readdir(dir)) {
+        VLOG(1) << piece << " :: " << entity->d_name;
+      }
+      closedir(dir);
+    }
+  }
+  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
+  LOG(INFO) << "librocm reported version is: "
+            << rocm::DriverVersionStatusToString(dso_version);
+
+  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
+  LOG(INFO) << "kernel reported version is: "
+            << rocm::DriverVersionStatusToString(kernel_version);
+
+  if (kernel_version.ok() && dso_version.ok()) {
+    WarnOnDsoKernelMismatch(dso_version, kernel_version);
+  }
+}
+
+// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+// driver-interfacing DSO version number. Returns it as a string.
+port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
+  port::StatusOr<DriverVersion> result{port::Status{
+      port::error::NOT_FOUND,
+      "was unable to find librocm.so DSO loaded into this program"}};
+
+  // Callback used when iterating through DSOs. Looks for the driver-interfacing
+  // DSO and yields its version number into the callback data, when found.
+  auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
+                         void* data) -> int {
+    if (strstr(info->dlpi_name, "librocm.so.1")) {
+      VLOG(1) << "found DLL info with name: " << info->dlpi_name;
+      char resolved_path[PATH_MAX] = {0};
+      if (realpath(info->dlpi_name, resolved_path) == nullptr) {
+        return 0;
+      }
+      VLOG(1) << "found DLL info with resolved path: " << resolved_path;
+      const char* slash = rindex(resolved_path, '/');
+      if (slash == nullptr) {
+        return 0;
+      }
+      const char* so_suffix = ".so.";
+      const char* dot = strstr(slash, so_suffix);
+      if (dot == nullptr) {
+        return 0;
+      }
+      string dso_version = dot + strlen(so_suffix);
+      // TODO(b/22689637): Eliminate the explicit namespace if possible.
+      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
+      *result = rocm::StringToDriverVersion(stripped_dso_version);
+      return 1;
+    }
+    return 0;
+  };
+
+  dl_iterate_phdr(iterate_phdr, &result);
+
+  return result;
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
+    const string& driver_version_file_contents) {
+  static const char* kDriverFilePrelude = "Kernel Module  ";
+  size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
+  if (offset == string::npos) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not find kernel module information in "
+                     "driver version file contents: \"",
+                     driver_version_file_contents, "\"")};
+  }
+
+  string version_and_rest = driver_version_file_contents.substr(
+      offset + strlen(kDriverFilePrelude), string::npos);
+  size_t space_index = version_and_rest.find(" ");
+  auto kernel_version = version_and_rest.substr(0, space_index);
+  // TODO(b/22689637): Eliminate the explicit namespace if possible.
+  auto stripped_kernel_version =
+      port::StripSuffixString(kernel_version, ".ld64");
+  return rocm::StringToDriverVersion(stripped_kernel_version);
+}
+
+void Diagnostician::WarnOnDsoKernelMismatch(
+    port::StatusOr<DriverVersion> dso_version,
+    port::StatusOr<DriverVersion> kernel_version) {
+  if (kernel_version.ok() && dso_version.ok() &&
+      dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
+    LOG(INFO) << "kernel version seems to match DSO: "
+              << rocm::DriverVersionToString(kernel_version.ValueOrDie());
+  } else {
+    LOG(ERROR) << "kernel version "
+               << rocm::DriverVersionStatusToString(kernel_version)
+               << " does not match DSO version "
+               << rocm::DriverVersionStatusToString(dso_version)
+               << " -- cannot find working devices in this configuration";
+  }
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
+  auto status = port::Status{port::error::UNIMPLEMENTED,
+                             "kernel reported driver version not implemented"};
+  return status;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.h b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..233c6bdade68e19e02a30c92e92d5961d9ca260b
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+
+namespace stream_executor {
+namespace rocm {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = gpu::DriverVersion;
+
+// Converts a parsed driver version to string form.
+string DriverVersionToString(DriverVersion version);
+
+// Converts a parsed driver version or status value to natural string form.
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+
+// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+using Diagnostician = gpu::Diagnostician;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.cc b/tensorflow/stream_executor/rocm/rocm_dnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1a2e453152f103af1ff7009e5eba0fd5960a995
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_dnn.cc
@@ -0,0 +1,4522 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_dnn.h"
+
+#include <functional>
+#include <memory>
+
+#include "absl/strings/str_cat.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "rocm/include/miopen/miopen.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+
+namespace {
+
+// Converts (via narrowing) a type T value to a type U, and checks that the
+// value has no value change due to the conversion.
+template <typename WideT, typename NarrowT>
+NarrowT CheckedNarrowing(const WideT& wide) {
+  NarrowT narrow = wide;
+  CHECK_EQ(narrow, wide)
+      << "checked narrowing failed; values not equal post-conversion";
+  return narrow;
+}
+
+}  // namespace
+
+namespace stream_executor {
+
+using dnn::BatchDescriptor;
+using dnn::ConvolutionDescriptor;
+using dnn::FilterDescriptor;
+using dnn::NormalizeDescriptor;
+using dnn::PoolingDescriptor;
+
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kMIOpenPlugin);
+
+string ToString(miopenStatus_t status) {
+  switch (status) {
+    case miopenStatusSuccess:
+      return "miopenStatusSuccess";
+    case miopenStatusNotInitialized:
+      return "miopenStatusNotInitialized";
+    case miopenStatusAllocFailed:
+      return "miopenStatusAllocFailed";
+    case miopenStatusBadParm:
+      return "miopenStatusBadParm";
+    case miopenStatusInternalError:
+      return "miopenStatusInternalError";
+    case miopenStatusInvalidValue:
+      return "miopenStatusInvalidValue";
+    case miopenStatusNotImplemented:
+      return "miopenStatusNotImplemented";
+    case miopenStatusUnknownError:
+      return "miopenStatusUnknownError";
+    default:
+      return absl::StrCat("<unknown miopen status: ", static_cast<int>(status),
+                          ">");
+  }
+}
+
+// RAII wrapper for all calls to MIOpen with a MIOpen handle argument.
+//
+// See MIOpenAccess::GetHandle() for details.
+class MIOpenHandle {
+ public:
+  // Takes ownership of the executor context and the lock to access MIOpen
+  // using handle.
+  MIOpenHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
+               miopenHandle_t handle)
+      : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
+
+  // Returns MIOpen handle. To be passed directly to MIOpen APIs, don't keep
+  // a copy.
+  miopenHandle_t handle() const { return handle_; }
+
+ private:
+  gpu::ScopedActivateExecutorContext context_;
+  mutex_lock lock_;
+  miopenHandle_t handle_;  // Not owned.
+};
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+
+#define STREAM_EXECUTOR_MIOPEN_WRAP(__name)      \
+  struct WrapperShim__##__name {                 \
+    template <typename... Args>                  \
+    miopenStatus_t operator()(Args... args) {    \
+      miopenStatus_t retval = ::__name(args...); \
+      return retval;                             \
+    }                                            \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_MIOPEN_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetMiopenDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in miopen DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    miopenStatus_t operator()(Args... args) {                             \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+// clang-format off
+#define MIOPEN_DNN_ROUTINE_EACH(__macro)                   \
+  __macro(miopenBatchNormalizationBackward)                \
+  __macro(miopenBatchNormalizationForwardInference)        \
+  __macro(miopenBatchNormalizationForwardTraining)         \
+  __macro(miopenGetConvolutionForwardOutputDim)            \
+  __macro(miopenFindConvolutionForwardAlgorithm)           \
+  __macro(miopenCreateTensorDescriptor)                    \
+  __macro(miopenDestroyTensorDescriptor)                   \
+  __macro(miopenSet2dPoolingDescriptor)                    \
+  __macro(miopenSetLRNDescriptor)                          \
+  __macro(miopenLRNGetWorkSpaceSize)                       \
+  __macro(miopenCreateConvolutionDescriptor)               \
+  __macro(miopenCreatePoolingDescriptor)                   \
+  __macro(miopenDestroyPoolingDescriptor)                  \
+  __macro(miopenCreateLRNDescriptor)                       \
+  __macro(miopenDestroyLRNDescriptor)                      \
+  __macro(miopenDestroyConvolutionDescriptor)              \
+  __macro(miopenCreateWithStream)                          \
+  __macro(miopenDestroy)                                   \
+  __macro(miopenSetStream)                                 \
+  __macro(miopenSetAllocator)                              \
+  __macro(miopenActivationForward)                         \
+  __macro(miopenConvolutionForward)                        \
+  __macro(miopenConvolutionBackwardBias)                   \
+  __macro(miopenConvolutionForwardGetWorkSpaceSize)        \
+  __macro(miopenInitConvolutionDescriptor)                 \
+  __macro(miopenGetConvolutionDescriptor)                  \
+  __macro(miopenSetConvolutionGroupCount)                  \
+  __macro(miopenSet4dTensorDescriptor)                     \
+  __macro(miopenGetTensorDescriptor)                       \
+  __macro(miopenSetTensorDescriptor)                       \
+  __macro(miopenGetTensorDescriptorSize)                   \
+  __macro(miopenPoolingForward)                            \
+  __macro(miopenPoolingGetWorkSpaceSize)                   \
+  __macro(miopenPoolingBackward)                           \
+  __macro(miopenLRNForward)                                \
+  __macro(miopenLRNBackward)                               \
+  __macro(miopenOpTensor)                                  \
+  __macro(miopenConvolutionBackwardData)                   \
+  __macro(miopenConvolutionBackwardWeights)                \
+  __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize)\
+  __macro(miopenFindConvolutionBackwardDataAlgorithm)      \
+  __macro(miopenFindConvolutionBackwardWeightsAlgorithm)   \
+  __macro(miopenConvolutionBackwardDataGetWorkSpaceSize)   \
+  __macro(miopenCreateRNNDescriptor)                       \
+  __macro(miopenSetRNNDescriptor)                          \
+  __macro(miopenDestroyRNNDescriptor)                      \
+  __macro(miopenGetRNNParamsSize)                          \
+  __macro(miopenGetRNNLayerParam)                          \
+  __macro(miopenGetRNNLayerBias)                           \
+  __macro(miopenGetRNNWorkspaceSize)                       \
+  __macro(miopenGetRNNTrainingReserveSize)                 \
+  __macro(miopenRNNForwardInference)                       \
+  __macro(miopenRNNForwardTraining)                        \
+  __macro(miopenRNNBackwardData)                           \
+  __macro(miopenRNNBackwardWeights)                        \
+  __macro(miopenGetRNNLayerParamOffset)                    \
+  __macro(miopenGetRNNLayerParamSize)                      \
+  __macro(miopenGetRNNLayerBiasOffset)                     \
+  __macro(miopenGetRNNLayerBiasSize)                       \
+  __macro(miopenGetRNNParamsDescriptor)                    \
+  __macro(miopenCreateActivationDescriptor)                \
+  __macro(miopenSetActivationDescriptor)                   \
+  __macro(miopenGetActivationDescriptor)                   \
+  __macro(miopenDestroyActivationDescriptor)               \
+  __macro(miopenCreateFusionPlan)                          \
+  __macro(miopenCreateOpConvForward)                       \
+  __macro(miopenCreateOpBiasForward)                       \
+  __macro(miopenCreateOpActivationForward)                 \
+  __macro(miopenCreateOpActivationBackward)                \
+  __macro(miopenCreateOpBatchNormInference)                \
+  __macro(miopenCreateOpBatchNormForward)                  \
+  __macro(miopenCreateOpBatchNormBackward)                 \
+  __macro(miopenCompileFusionPlan)                         \
+  __macro(miopenFusionPlanGetOp)                           \
+  __macro(miopenCreateOperatorArgs)                        \
+  __macro(miopenSetOpArgsConvForward)                      \
+  __macro(miopenSetOpArgsBiasForward)                      \
+  __macro(miopenSetOpArgsActivForward)                     \
+  __macro(miopenSetOpArgsActivBackward)                    \
+  __macro(miopenSetOpArgsBatchNormInference)               \
+  __macro(miopenSetOpArgsBatchNormForward)                 \
+  __macro(miopenSetOpArgsBatchNormBackward)                \
+  __macro(miopenExecuteFusionPlan)                         \
+  __macro(miopenDestroyOperatorArgs)                       \
+  __macro(miopenDestroyFusionPlan)
+
+// clang-format on
+
+MIOPEN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_MIOPEN_WRAP)
+
+#undef MIOPEN_DNN_ROUTINE_EACH
+
+}  // namespace wrap
+
+namespace {
+
+// These routines should ideally be provided as an MIOpen API.
+// They are called for *every* _ROCMmFusedOp*::Compute call, and they need to be
+// efficient! Instead of calculating the hash value by quering the MIOpen Get*
+// APIs for the descriptor components, it would be a lot more efficient if,
+// MIOpen calculated the hash value when creating the descriptor, stored it on
+// the descriptor datastructure, and provided an API routine to query it.
+
+const int kMaxMIOpenTensorSize = 5;
+
+uint64 GetHashValue(miopenTensorDescriptor_t tensor_desc) {
+  miopenDataType_t datatype = miopenFloat;
+  int dims[kMaxMIOpenTensorSize] = {0};
+  int strides[kMaxMIOpenTensorSize] = {0};
+  wrap::miopenGetTensorDescriptor(tensor_desc, &datatype, dims, strides);
+
+  uint64 hash_value = tensorflow::hash<int>()(datatype);
+  for (int dim : dims)
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(dim));
+  for (int stride : strides)
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(stride));
+
+  return hash_value;
+}
+
+uint64 GetHashValue(miopenConvolutionDescriptor_t conv_desc) {
+  miopenConvolutionMode_t c_mode = miopenConvolution;
+  int pad_h = 0, pad_w = 0, u = 0, v = 0, dilation_h = 0, dilation_w = 0;
+  wrap::miopenGetConvolutionDescriptor(conv_desc, &c_mode, &pad_h, &pad_w, &u,
+                                       &v, &dilation_h, &dilation_w);
+
+  uint64 hash_value = tensorflow::hash<int>()(c_mode);
+  hash_value =
+      tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(pad_h));
+  hash_value =
+      tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(pad_w));
+  hash_value =
+      tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(u));
+  hash_value =
+      tensorflow::Hash64Combine(hash_value, tensorflow::hash<int>()(v));
+  hash_value = tensorflow::Hash64Combine(hash_value,
+                                         tensorflow::hash<int>()(dilation_h));
+  hash_value = tensorflow::Hash64Combine(hash_value,
+                                         tensorflow::hash<int>()(dilation_w));
+
+  return hash_value;
+}
+
+// Class to implement a cache of compiled fusion plans.
+class CachedFusionPlans {
+ public:
+  // Check if we already have a fusion_plan corresponding to the given hash
+  // value.
+  // If we do, then
+  //   return true (+ the cached fusion plan via given pointer)
+  // Else
+  //   create a new fusion plan descriptor,
+  //   associate it with the given hash value in the cache
+  //   return false (+ newly created fusion plan via given pointer)
+  static bool FindOrCreate(uint64 hash,
+                           miopenFusionPlanDescriptor_t* fusion_plan,
+                           miopenFusionDirection_t fusion_direction,
+                           miopenTensorDescriptor_t input_descriptor) {
+    mutex_lock lock{cached_plans_mutex};
+
+    bool found_cached_plan = false;
+
+    auto it = cached_plans.find(hash);
+    if (it != cached_plans.end()) {
+      *fusion_plan = it->second;
+      found_cached_plan = true;
+    } else {
+      auto status = wrap::miopenCreateFusionPlan(fusion_plan, fusion_direction,
+                                                 input_descriptor);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateFusionPlan failed: "
+                   << ToString(status);
+      } else {
+        cached_plans[hash] = *fusion_plan;
+      }
+    }
+
+    return found_cached_plan;
+  }
+
+  // Need to figure out the right place to call this routine.
+  static void Clear() {
+    mutex_lock lock{cached_plans_mutex};
+
+    for (auto it : cached_plans) {
+      auto status = wrap::miopenDestroyFusionPlan(it.second);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenDestroyFusionPlan failed: "
+                   << ToString(status);
+      }
+    }
+
+    cached_plans.clear();
+
+    unsupported_plans.clear();
+  }
+
+  // Is the Fusion plan corresponding to this hash unsupported.
+  static bool IsUnsupportedFusionPlan(uint64 hash) {
+    mutex_lock lock{cached_plans_mutex};
+    return unsupported_plans.count(hash) > 0;
+  }
+
+  // Mark the given hash value as corresponding to an unsupported fusion plan.
+  static void MarkFusionPlanUnsupported(uint64 hash) {
+    mutex_lock lock{cached_plans_mutex};
+    unsupported_plans.insert(hash);
+  }
+
+ private:
+  // Mutex to guard access to all data within this class.
+  static mutex cached_plans_mutex;
+
+  // Map of hash-value to MIOpen Fusion plan descriptors.
+  // Need to be able share this across more than one stream and hence static.
+  static std::map<uint64, miopenFusionPlanDescriptor_t> cached_plans;
+
+  // Set of hash-values that correspond to MIOpen Fusion plans that will fail
+  // compile and hence are not supported.
+  static std::set<uint64> unsupported_plans;
+};
+
+mutex CachedFusionPlans::cached_plans_mutex;
+std::map<uint64, miopenFusionPlanDescriptor_t> CachedFusionPlans::cached_plans;
+std::set<uint64> CachedFusionPlans::unsupported_plans;
+
+miopenHandle_t ToHandle(void* opaque_handle) {
+  return static_cast<miopenHandle_t>(opaque_handle);
+}
+
+miopenConvFwdAlgorithm_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
+  miopenConvFwdAlgorithm_t algo = miopenConvFwdAlgorithm_t(algorithm.algo_id());
+  switch (algo) {
+    case miopenConvolutionFwdAlgoGEMM:
+    case miopenConvolutionFwdAlgoDirect:
+    case miopenConvolutionFwdAlgoFFT:
+    case miopenConvolutionFwdAlgoWinograd:
+      return algo;
+    default:
+      LOG(FATAL) << "Unsupported MIOpen convolution forward algorithm: "
+                 << algorithm.algo_id();
+  }
+}
+
+miopenConvBwdDataAlgorithm_t ToConvBackwardDataAlgo(
+    dnn::AlgorithmDesc algorithm) {
+  miopenConvBwdDataAlgorithm_t algo =
+      miopenConvBwdDataAlgorithm_t(algorithm.algo_id());
+  switch (algo) {
+    case miopenConvolutionBwdDataAlgoGEMM:
+    case miopenConvolutionBwdDataAlgoDirect:
+    case miopenConvolutionBwdDataAlgoFFT:
+    case miopenConvolutionBwdDataAlgoWinograd:
+      return algo;
+    default:
+      LOG(FATAL)
+          << "Unsupported MIOpen convolution backward algorithm for data: "
+          << algorithm.algo_id();
+  }
+}
+
+miopenConvBwdWeightsAlgorithm_t ToConvBackwardFilterAlgo(
+    dnn::AlgorithmDesc algorithm) {
+  miopenConvBwdWeightsAlgorithm_t algo =
+      miopenConvBwdWeightsAlgorithm_t(algorithm.algo_id());
+  switch (algo) {
+    case miopenConvolutionBwdWeightsAlgoGEMM:
+    case miopenConvolutionBwdWeightsAlgoDirect:
+      return algo;
+    default:
+      LOG(FATAL)
+          << "Unsupported MIOpen convolution backward algorithm for filter: "
+          << algorithm.algo_id();
+  }
+}
+
+}  // namespace
+
+// Wraps a MIOpen handle and provides access to it through miopenHandle_t
+// instances, which also locks a mutex, acquires the ROCm context, and sets
+// the stream that MIOpen should use to enqueue any work.
+//
+// Note: MIOpenSupport::miopen_ should be the only instantiation of this class.
+class MIOpenAccess {
+ public:
+  // Takes ownership of the handle.
+  explicit MIOpenAccess(miopenHandle_t handle) : handle_(handle) {}
+
+  ~MIOpenAccess() {
+    mutex_lock lock(mutex_);
+    wrap::miopenDestroy(handle_);
+  }
+
+  // Creates a MIOpenHandle instance for stream.
+  //
+  // MIOpen API calls using the same handle instance need to be serialized
+  // across threads. This is guaranteed by MIOpenHandle instances locking the
+  // mutex owned by this class.
+  //
+  // Most MIOpen APIs taking a handle perform work on a HIP stream. The
+  // MIOpenHandle instance acquires the executor's ROCm context and sets MIOpen
+  // to use the provided stream.
+  //
+  // The stream argument may be null, which translates to the null stream.
+  // The null stream synchronizes with all other streams and it is
+  // therefore a bad idea (performance wise) to call any MIOpen APIs that
+  // enqueue work in the stream.
+  MIOpenHandle GetHandle(GpuExecutor* executor, Stream* stream) {
+    mutex_lock lock(mutex_);
+    gpu::ScopedActivateExecutorContext context(executor);
+    hipStream_t hip_stream = stream ? AsGpuStreamValue(stream) : nullptr;
+    auto status = wrap::miopenSetStream(handle_, hip_stream);
+    CHECK_EQ(status, miopenStatusSuccess) << "Failed to set MIOpen stream.";
+    return MIOpenHandle(std::move(context), std::move(lock), handle_);
+  }
+
+ private:
+  // Guards the enqueueing of MIOpen operations via the handle_ below.
+  mutex mutex_;
+
+  // MIOpen library handle.
+  miopenHandle_t handle_ GUARDED_BY(mutex_);  // Owned.
+};
+
+MIOpenSupport::MIOpenSupport(GpuExecutor* parent) : parent_(parent) {}
+
+port::Status MIOpenSupport::Init() {
+  ScopedActivateExecutorContext context(parent_);
+  miopenHandle_t miopen_handle = nullptr;
+  auto status = wrap::miopenCreateWithStream(
+      reinterpret_cast<miopenHandle_t*>(&miopen_handle), (hipStream_t)(0));
+  if (status == miopenStatusSuccess) {
+    miopen_.reset(new MIOpenAccess(miopen_handle));
+    return port::Status::OK();
+  }
+
+  CHECK_EQ(miopen_handle, nullptr);
+  LOG(ERROR) << "could not create miopen handle: " << ToString(status);
+  if (status == miopenStatusNotInitialized) {
+    auto result = rocm::Diagnostician::FindKernelDriverVersion();
+    if (!result.ok()) {
+      LOG(ERROR) << "error retrieving driver version: "
+                 << rocm::DriverVersionStatusToString(result);
+    } else {
+      const auto& version = result.ValueOrDie();
+      LOG(INFO) << "possibly insufficient driver version: "
+                << rocm::DriverVersionToString(version);
+    }
+  }
+
+  return port::Status{port::error::INTERNAL,
+                      absl::StrCat("miopen library could not create a handle: ",
+                                   ToString(status))};
+}
+
+port::StatusOr<perftools::gputools::dnn::VersionInfo>
+MIOpenSupport::GetVersion() {
+  // ROCM TODO: retrieve MIOpen version with its API
+  return perftools::gputools::dnn::VersionInfo(1, 3, 0);
+}
+
+// Turns a BatchDescriptor structure into a miopen tensor handle within a scope.
+class ScopedTensorDescriptor {
+ public:
+  ScopedTensorDescriptor(const BatchDescriptor& batch_descriptor,
+                         miopenDataType_t elem_type)
+      : handle_(nullptr) {
+    auto status = wrap::miopenCreateTensorDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not create miopen tensor descriptor: "
+                 << ToString(status);
+    }
+
+    switch (batch_descriptor.layout()) {
+      case dnn::DataLayout::kBatchYXDepth:
+      case dnn::DataLayout::kBatchDepthYX: {
+        const int nd = batch_descriptor.ndims() + 2;
+        if (nd != 4) {
+          LOG(FATAL) << "miopen only supports 4D tensors, dim=" << nd
+                     << " not allowed";
+        }
+
+        // MIOpen requires the strides and dims to be ordered as BDYX.
+        std::vector<int64> strides64 =
+            batch_descriptor.full_strides(dnn::DataLayout::kBatchDepthYX);
+        std::vector<int64> dims64 =
+            batch_descriptor.full_dims(dnn::DataLayout::kBatchDepthYX);
+
+        // MIOpen requires arrays of ints.
+        std::vector<int> strides(nd);
+        std::vector<int> dims(nd);
+        std::transform(strides64.cbegin(), strides64.cend(), strides.begin(),
+                       &CheckedNarrowing<int64, int>);
+        std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
+                       &CheckedNarrowing<int64, int>);
+        status = wrap::miopenSet4dTensorDescriptor(handle_, elem_type, dims[0],
+                                                   dims[1], dims[2], dims[3]);
+
+        if (status != miopenStatusSuccess) {
+          LOG(FATAL) << "could not convert BatchDescriptor "
+                     << batch_descriptor.ToString()
+                     << " to miopen tensor descriptor: " << ToString(status);
+        }
+      } break;
+      default:
+        LOG(FATAL) << "Unsupported tensor format "
+                   << DataLayoutString(batch_descriptor.layout());
+        break;
+    }
+  }
+
+  ~ScopedTensorDescriptor() {
+    auto status = wrap::miopenDestroyTensorDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "could not destroy miopen tensor descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  miopenTensorDescriptor_t handle() const { return handle_; }
+
+ private:
+  miopenTensorDescriptor_t handle_;  // Owned.
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
+};
+
+// Turns a FilterDescriptor structure into a miopen filter handle within a
+// scope.
+class ScopedFilterDescriptor {
+ public:
+  ScopedFilterDescriptor(const FilterDescriptor& filter_descriptor,
+                         const BatchDescriptor& batch_descriptor,
+                         miopenDataType_t elem_type)
+      : handle_(nullptr) {
+    auto status = wrap::miopenCreateTensorDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not create miopen filter descriptor: "
+                 << ToString(status);
+    }
+
+    const int nd = batch_descriptor.ndims() + 2;
+
+    if (nd != 4) {
+      LOG(FATAL) << "miopen only supports 4D filters, dim=" << nd
+                 << "not allowed" << ToString(status);
+    }
+
+    std::vector<int> dims(2 + filter_descriptor.ndims());
+    dims[0] = filter_descriptor.output_feature_map_count();
+    dims[1] = filter_descriptor.input_feature_map_count();
+    const auto& spatial_dims = filter_descriptor.input_filter_dims();
+    std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);
+
+    status = wrap::miopenSet4dTensorDescriptor(handle_, elem_type, dims[0],
+                                               dims[1], dims[2], dims[3]);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not set miopen filter descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  ~ScopedFilterDescriptor() {
+    auto status = wrap::miopenDestroyTensorDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "could not destroy miopen filter descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  miopenTensorDescriptor_t handle() const { return handle_; }
+
+ private:
+  // miopen filter descriptor this object creates. Owned.
+  miopenTensorDescriptor_t handle_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
+};
+
+// Turns a ConvolutionDescriptor structure into a miopen convolution handle
+// within a scope.
+class ScopedConvolutionDescriptor {
+ public:
+  ScopedConvolutionDescriptor(
+      const ConvolutionDescriptor& convolution_descriptor,
+      miopenDataType_t data_type)
+      : handle_(nullptr) {
+    auto status = wrap::miopenCreateConvolutionDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not create miopen convolution descriptor: "
+                 << ToString(status);
+    }
+    const auto& strides64 = convolution_descriptor.strides();
+    const auto& padding64 = convolution_descriptor.padding();
+    if (convolution_descriptor.pad_alignment() ==
+        dnn::PadAlignment::kTensorFlowPadding) {
+      LOG(ERROR) << "TensorFlow padding alignment is not supported.";
+    }
+
+    // MIOpen requires arrays of ints.
+    std::vector<int> strides(convolution_descriptor.ndims());
+    std::vector<int> padding(convolution_descriptor.ndims());
+    std::transform(strides64.cbegin(), strides64.cend(), strides.begin(),
+                   &CheckedNarrowing<int64, int>);
+    std::transform(padding64.cbegin(), padding64.cend(), padding.begin(),
+                   &CheckedNarrowing<int64, int>);
+    std::vector<int> upscale(convolution_descriptor.ndims(), 1);
+
+    status = wrap::miopenInitConvolutionDescriptor(
+        handle_, miopenConvolution, padding[0], padding[1], strides[0],
+        strides[1], upscale[0], upscale[1]);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not set miopen convolution descriptor: "
+                 << ToString(status);
+    }
+
+    VLOG(2) << "Requesting grouped convolution: "
+            << convolution_descriptor.group_count();
+    status = wrap::miopenSetConvolutionGroupCount(
+        handle_, convolution_descriptor.group_count());
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not set miopen convolution group count: "
+                 << ToString(status);
+    }
+  }
+  ~ScopedConvolutionDescriptor() {
+    auto status = wrap::miopenDestroyConvolutionDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "could not destroy miopen convolution descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  miopenConvolutionDescriptor_t handle() const { return handle_; }
+
+ private:
+  miopenConvolutionDescriptor_t handle_;  // Owned.
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
+};
+
+// Turns a PoolingDescriptor structure into a miopen pooling descriptor handle
+// within a scope.
+class ScopedPoolingDescriptor {
+ public:
+  ScopedPoolingDescriptor(const PoolingDescriptor& pooling_descriptor)
+      : handle_(nullptr) {
+    auto status = wrap::miopenCreatePoolingDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not create miopen pooling descriptor: "
+                 << ToString(status);
+    }
+
+    absl::Span<const int64> strides64 = pooling_descriptor.strides();
+    absl::Span<const int64> padding64 = pooling_descriptor.padding();
+    absl::Span<const int64> shape64 = pooling_descriptor.window();
+
+    const int nd = pooling_descriptor.ndims();
+    std::vector<int> shape(nd);
+    std::vector<int> padding(nd);
+    std::vector<int> strides(nd);
+    std::transform(strides64.cbegin(), strides64.cend(), strides.begin(),
+                   &CheckedNarrowing<int64, int>);
+    std::transform(padding64.cbegin(), padding64.cend(), padding.begin(),
+                   &CheckedNarrowing<int64, int>);
+    std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
+                   &CheckedNarrowing<int64, int>);
+
+    if (nd != 2) {
+      LOG(FATAL) << "miopen requires pooling dimensions be 2"
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSet2dPoolingDescriptor(
+        handle_,
+        (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
+             ? miopenPoolingMax
+             : miopenPoolingAverage),
+        shape[0], shape[1], padding[0], padding[1], strides[0], strides[1]);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not set miopen pooling descriptor: "
+                 << ToString(status);
+    }
+  }
+  ~ScopedPoolingDescriptor() {
+    auto status = wrap::miopenDestroyPoolingDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "could not destroy miopen pooling descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  miopenPoolingDescriptor_t handle() const { return handle_; }
+
+ private:
+  miopenPoolingDescriptor_t handle_;  // Owned.
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
+};
+
+// Turns a NormalizeDescriptor structure into a miopen LRN descriptor handle.
+class ScopedNormalizeDescriptor {
+ public:
+  ScopedNormalizeDescriptor(const NormalizeDescriptor& normalize_descriptor)
+      : handle_(nullptr) {
+    auto status = wrap::miopenCreateLRNDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not create miopen LRN descriptor: "
+                 << ToString(status);
+    }
+
+    // The range specifies that the indices in the closed range
+    // [i - range, i + range] should be included in the normalization for index
+    // i. The lrnN value is the total number of elements in the range, so
+    // lrnN = 2*range + 1.
+    unsigned lrn_N = 2 * normalize_descriptor.range() + 1;
+
+    // Note that SE defines the normalization operation as
+    //
+    //  U_i = V_i / ((bias +  alpha      * (sum_j V_j^2)) ^ beta)
+    //
+    // but MIOpen defines it as
+    //
+    //  U_i = V_i / ((bias + (alpha / n) * (sum_j V_j^2)) ^ beta)
+    //
+    // i.e. there is a factor of n difference between the meaning of the alphas
+    // in the two contexts. The MIOpen alpha is n times the SE alpha.
+    double lrn_alpha = lrn_N * normalize_descriptor.alpha();
+
+    double lrn_beta = normalize_descriptor.beta();
+    double lrn_k = normalize_descriptor.bias();
+    status = wrap::miopenSetLRNDescriptor(handle_, miopenLRNCrossChannel, lrn_N,
+                                          lrn_alpha, lrn_beta, lrn_k);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "could not set miopen LRN descriptor: " << ToString(status);
+    }
+  }
+
+  ~ScopedNormalizeDescriptor() {
+    auto status = wrap::miopenDestroyLRNDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "could not destroy miopen LRN descriptor: "
+                 << ToString(status);
+    }
+  }
+
+  miopenLRNDescriptor_t handle() const { return handle_; }
+
+ private:
+  miopenLRNDescriptor_t handle_;  // Owned.
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
+};
+
+// Turns a activation mode into a miopen activation mode descriptor with a scope
+// around it
+class ScopedActivationDescriptor {
+ public:
+  ScopedActivationDescriptor(dnn::ActivationMode activation_mode)
+      : handle_(nullptr),
+        miopen_activation_mode_(miopenActivationPASTHRU),
+        alpha_(0.0),
+        beta_(0.0),
+        gamma_(0.0) {
+    auto status = wrap::miopenCreateActivationDescriptor(&handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenCreateActivationDescriptor failed: "
+                 << ToString(status);
+    } else {
+      switch (activation_mode) {
+        case dnn::ActivationMode::kNone:
+          miopen_activation_mode_ = miopenActivationPASTHRU;
+          break;
+
+        case dnn::ActivationMode::kSigmoid:
+          miopen_activation_mode_ = miopenActivationLOGISTIC;
+          break;
+
+        case dnn::ActivationMode::kRelu:
+          miopen_activation_mode_ = miopenActivationRELU;
+          break;
+
+        case dnn::ActivationMode::kRelu6:
+          miopen_activation_mode_ = miopenActivationRELU;
+          alpha_ = 6.0;
+          break;
+
+        case dnn::ActivationMode::kTanh:
+          miopen_activation_mode_ = miopenActivationTANH;
+          break;
+
+        default:
+          LOG(FATAL) << "Activation mode ("
+                     << dnn::ActivationModeString(activation_mode)
+                     << ") not yet implemented";
+          break;
+      }
+
+      status = wrap::miopenSetActivationDescriptor(
+          handle_, miopen_activation_mode_, alpha_, beta_, gamma_);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenSetActivationDescriptor failed: "
+                   << ToString(status);
+      }
+    }
+  }
+
+  ~ScopedActivationDescriptor() {
+    auto status = wrap::miopenDestroyActivationDescriptor(handle_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenDestroyActivationDescriptor failed: "
+                 << ToString(status);
+    }
+  }
+
+  miopenActivationDescriptor_t handle() const { return handle_; }
+
+  uint64 GetHashValue() {
+    uint64 hash_value = tensorflow::hash<int>()(miopen_activation_mode_);
+    hash_value = tensorflow::Hash64Combine(hash_value,
+                                           tensorflow::hash<double>()(alpha_));
+    hash_value = tensorflow::Hash64Combine(hash_value,
+                                           tensorflow::hash<double>()(beta_));
+    hash_value = tensorflow::Hash64Combine(hash_value,
+                                           tensorflow::hash<double>()(gamma_));
+
+    return hash_value;
+  }
+
+ private:
+  miopenActivationDescriptor_t handle_;  // Owned.
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
+
+ public:
+  // caching these values here to avoid calling miopenGetActivationDescriptor
+  // to do the same. miopenGetActivationDescriptor gets called twice during each
+  // call to execute a fusion plan (that involves the activation op)...once call
+  // during calculating hashvalue for the fusion op, and another before calling
+  // SetOpArgs for the activation op
+  miopenActivationMode_t miopen_activation_mode_;
+  double alpha_;
+  double beta_;
+  double gamma_;
+};
+
+// base class for all fusion plan implementations to derive from
+class ScopedFusionPlanBase {
+ public:
+  ScopedFusionPlanBase(miopenHandle_t miopen_handle,
+                       const miopenFusionDirection_t fuse_direction,
+                       const miopenTensorDescriptor_t input_descriptor)
+      : miopen_handle_(miopen_handle),
+        fusion_plan_(nullptr),
+        fusion_args_(nullptr),
+        fusion_plan_compiled_(false) {
+    auto status = wrap::miopenCreateOperatorArgs(&fusion_args_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenCreateOperatorArgs failed: "
+                 << ToString(status);
+    }
+  }
+
+  virtual ~ScopedFusionPlanBase() {
+    auto status = wrap::miopenDestroyOperatorArgs(fusion_args_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenDestroyoperatorArgs failed: "
+                 << ToString(status);
+    }
+  }
+
+  miopenStatus_t Execute(miopenTensorDescriptor_t input_descriptor,
+                         const void* input_data,
+                         miopenTensorDescriptor_t output_descriptor,
+                         void* output_data) {
+    auto status = wrap::miopenExecuteFusionPlan(
+        miopen_handle_, fusion_plan_, input_descriptor, input_data,
+        output_descriptor, output_data, fusion_args_);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenExecuteFusionPlan failed: "
+                 << ToString(status);
+    }
+
+    return status;
+  }
+
+  bool CompilationSucceeded() { return fusion_plan_compiled_; }
+
+ protected:
+  miopenStatus_t SetConvolutionArgs(const int op_idx, const float* alpha,
+                                    const float* beta, const void* data) {
+    miopenFusionOpDescriptor_t conv_op;
+    auto status = wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &conv_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSetOpArgsConvForward(fusion_args_, conv_op, alpha,
+                                              beta, data);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsConvForward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetBiasArgs(const int op_idx, const float* alpha,
+                             const float* beta, const void* data) {
+    miopenFusionOpDescriptor_t bias_op;
+    auto status = wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &bias_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSetOpArgsBiasForward(fusion_args_, bias_op, alpha,
+                                              beta, data);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsBiasForward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetBatchNormInferenceArgs(const int op_idx, const float* alpha,
+                                           const float* beta, const void* scale,
+                                           const void* offset, const void* mean,
+                                           const void* variance,
+                                           double epsilon) {
+    miopenFusionOpDescriptor_t batchnorm_op;
+    auto status =
+        wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &batchnorm_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSetOpArgsBatchNormInference(fusion_args_, batchnorm_op,
+                                                     alpha, beta, scale, offset,
+                                                     mean, variance, epsilon);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsBatchNormInference failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetBatchNormForwardArgs(const int op_idx, const float* alpha,
+                                         const float* beta, const void* scale,
+                                         const void* offset, void* running_mean,
+                                         void* running_variance,
+                                         void* saved_mean,
+                                         void* saved_inv_variance,
+                                         double epsilon) {
+    miopenFusionOpDescriptor_t batchnorm_op;
+    auto status =
+        wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &batchnorm_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    double exp_avg_factor = 1.0;
+
+    status = wrap::miopenSetOpArgsBatchNormForward(
+        fusion_args_, batchnorm_op, alpha, beta, scale, offset, saved_mean,
+        saved_inv_variance, running_mean, running_variance, exp_avg_factor,
+        epsilon);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsBatchNormForward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetBatchNormBackwardArgs(const int op_idx, const float* alpha,
+                                          const float* beta, const void* x,
+                                          const void* scale, const void* offset,
+                                          void* scale_grad, void* offset_grad,
+                                          const void* saved_mean,
+                                          const void* saved_inv_variance) {
+    miopenFusionOpDescriptor_t batchnorm_op;
+    auto status =
+        wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &batchnorm_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSetOpArgsBatchNormBackward(
+        fusion_args_, batchnorm_op, alpha, beta, x, scale, offset, scale_grad,
+        offset_grad, saved_mean, saved_inv_variance);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsBatchNormBackward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetActivationForwardArgs(const int op_idx, const float* alpha,
+                                          const float* beta, double activ_alpha,
+                                          double activ_beta,
+                                          double activ_gamma) {
+    miopenFusionOpDescriptor_t actv_op;
+    auto status = wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &actv_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status =
+        wrap::miopenSetOpArgsActivForward(fusion_args_, actv_op, alpha, beta,
+                                          activ_alpha, activ_beta, activ_gamma);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsActivForward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenStatus_t SetActivationBackwardArgs(const int op_idx, const float* alpha,
+                                           const float* beta, const void* y,
+                                           double activ_alpha,
+                                           double activ_beta,
+                                           double activ_gamma) {
+    miopenFusionOpDescriptor_t actv_op;
+    auto status = wrap::miopenFusionPlanGetOp(fusion_plan_, op_idx, &actv_op);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenFusionPlanGetOp failed: "
+                 << ToString(status);
+    }
+
+    status = wrap::miopenSetOpArgsActivBackward(fusion_args_, actv_op, alpha,
+                                                beta, y, nullptr, activ_alpha,
+                                                activ_beta, activ_gamma);
+    if (status != miopenStatusSuccess) {
+      LOG(FATAL) << "call to miopenSetOpArgsActivBackward failed: "
+                 << ToString(status);
+    }
+    return status;
+  }
+
+  miopenHandle_t miopen_handle_;
+  miopenFusionPlanDescriptor_t fusion_plan_;
+  miopenOperatorArgs_t fusion_args_;  // Owned.
+  bool fusion_plan_compiled_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBase);
+};
+
+// class to represent the Convolution+Bias+Activation fusion plan
+class ScopedFusionPlanConvolutionBiasActivation : public ScopedFusionPlanBase {
+ public:
+  ScopedFusionPlanConvolutionBiasActivation(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t filter_descriptor,
+      miopenConvolutionDescriptor_t conv_descriptor,
+      miopenTensorDescriptor_t bias_descriptor,
+      ScopedActivationDescriptor& activation_descriptor)
+      : ScopedFusionPlanBase(miopen_handle, miopenVerticalFusion,
+                             input_descriptor) {
+    uint64 hash = GetFusionOpHashValue(miopen_handle, input_descriptor,
+                                       filter_descriptor, conv_descriptor,
+                                       bias_descriptor, activation_descriptor);
+
+    bool is_compiled = CachedFusionPlans::FindOrCreate(
+        hash, &fusion_plan_, miopenVerticalFusion, input_descriptor);
+    if (!is_compiled) {
+      miopenFusionOpDescriptor_t conv_op;
+      auto status = wrap::miopenCreateOpConvForward(
+          fusion_plan_, &conv_op, conv_descriptor, filter_descriptor);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpConvForward failed: "
+                   << ToString(status);
+      }
+
+      miopenFusionOpDescriptor_t bias_op;
+      status = wrap::miopenCreateOpBiasForward(fusion_plan_, &bias_op,
+                                               bias_descriptor);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpBiasForward failed: "
+                   << ToString(status);
+      }
+
+      miopenFusionOpDescriptor_t actv_op;
+      status = wrap::miopenCreateOpActivationForward(
+          fusion_plan_, &actv_op,
+          activation_descriptor.miopen_activation_mode_);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpActivationForward failed: "
+                   << ToString(status);
+      }
+
+      status = wrap::miopenCompileFusionPlan(miopen_handle_, fusion_plan_);
+      if (status != miopenStatusSuccess) {
+        VLOG(2) << "call to miopenCompileFusionPlan (CBA) failed: "
+                << ToString(status);
+
+        CachedFusionPlans::MarkFusionPlanUnsupported(hash);
+      } else {
+        VLOG(2) << "Fusion Plan compile succedded (CBA) ";
+        fusion_plan_compiled_ = true;
+      }
+    } else {
+      // fusion plan was already compiled...check whether it failed to compile
+      fusion_plan_compiled_ = !CachedFusionPlans::IsUnsupportedFusionPlan(hash);
+    }
+  }
+
+  miopenStatus_t SetConvolutionArgs(const void* filter_data) {
+    float alpha = 1.0;
+    float beta = 0.0;
+    return ScopedFusionPlanBase::SetConvolutionArgs(k_conv_op_idx, &alpha,
+                                                    &beta, filter_data);
+  }
+
+  miopenStatus_t SetBiasArgs(const void* bias_data) {
+    float alpha = 1.0;
+    float beta = 0.0;
+    return ScopedFusionPlanBase::SetBiasArgs(k_bias_op_idx, &alpha, &beta,
+                                             bias_data);
+  }
+
+  miopenStatus_t SetActivationForwardArgs(
+      ScopedActivationDescriptor& activation_descriptor) {
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    return ScopedFusionPlanBase::SetActivationForwardArgs(
+        k_actv_op_idx, &alpha, &beta, activation_descriptor.alpha_,
+        activation_descriptor.beta_, activation_descriptor.gamma_);
+  }
+
+  uint64 GetFusionOpHashValue(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t filter_descriptor,
+      miopenConvolutionDescriptor_t conv_descriptor,
+      miopenTensorDescriptor_t bias_descriptor,
+      ScopedActivationDescriptor& activation_descriptor) {
+    uint64 hash_value = tensorflow::Hash64("ConvolutionBiasActivation");
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, tensorflow::hash<miopenHandle_t>()(miopen_handle));
+
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(input_descriptor));
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(filter_descriptor));
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(conv_descriptor));
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(bias_descriptor));
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, activation_descriptor.GetHashValue());
+    return hash_value;
+  }
+
+ private:
+  const int k_conv_op_idx = 0;
+  const int k_bias_op_idx = 1;
+  const int k_actv_op_idx = 2;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanConvolutionBiasActivation);
+};
+
+// class to represent the BatchNorm+Activation (inference) fusion plan
+class ScopedFusionPlanBatchNormActivationInference
+    : public ScopedFusionPlanBase {
+ public:
+  ScopedFusionPlanBatchNormActivationInference(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor)
+      : ScopedFusionPlanBase(miopen_handle, miopenVerticalFusion,
+                             input_descriptor) {
+    uint64 hash = GetFusionOpHashValue(miopen_handle, input_descriptor,
+                                       scale_offset_mean_variance_descriptor,
+                                       activation_descriptor);
+
+    bool is_compiled = CachedFusionPlans::FindOrCreate(
+        hash, &fusion_plan_, miopenVerticalFusion, input_descriptor);
+
+    if (!is_compiled) {
+      miopenFusionOpDescriptor_t batchnorm_op;
+      auto status = wrap::miopenCreateOpBatchNormInference(
+          fusion_plan_, &batchnorm_op, miopenBNSpatial,
+          scale_offset_mean_variance_descriptor);
+
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpBatchNormInference failed: "
+                   << ToString(status);
+      }
+
+      miopenFusionOpDescriptor_t actv_op;
+      status = wrap::miopenCreateOpActivationForward(
+          fusion_plan_, &actv_op,
+          activation_descriptor.miopen_activation_mode_);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpActivationForward failed: "
+                   << ToString(status);
+      }
+
+      status = wrap::miopenCompileFusionPlan(miopen_handle_, fusion_plan_);
+      if (status != miopenStatusSuccess) {
+        VLOG(2) << "call to miopenCompileFusionPlan (BnA inference) failed: "
+                << ToString(status);
+
+        CachedFusionPlans::MarkFusionPlanUnsupported(hash);
+      } else {
+        VLOG(2) << "Fusion Plan compile succedded (BnA inference) ";
+        fusion_plan_compiled_ = true;
+      }
+    } else {
+      // fusion plan was already compiled...check whether it failed to compile
+      fusion_plan_compiled_ = !CachedFusionPlans::IsUnsupportedFusionPlan(hash);
+    }
+  }
+
+  miopenStatus_t SetBatchNormInferenceArgs(const void* scale,
+                                           const void* offset, const void* mean,
+                                           const void* variance,
+                                           double epsilon) {
+    float alpha = 1.0;
+    float beta = 0.0;
+    return ScopedFusionPlanBase::SetBatchNormInferenceArgs(
+        k_batchnorm_op_idx, &alpha, &beta, scale, offset, mean, variance,
+        epsilon);
+  }
+
+  miopenStatus_t SetActivationForwardArgs(
+      ScopedActivationDescriptor& activation_descriptor) {
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    return ScopedFusionPlanBase::SetActivationForwardArgs(
+        k_actv_op_idx, &alpha, &beta, activation_descriptor.alpha_,
+        activation_descriptor.beta_, activation_descriptor.gamma_);
+  }
+
+  uint64 GetFusionOpHashValue(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor) {
+    uint64 hash_value = tensorflow::Hash64("BatchNormActivationInference");
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, tensorflow::hash<miopenHandle_t>()(miopen_handle));
+
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(input_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, GetHashValue(scale_offset_mean_variance_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, activation_descriptor.GetHashValue());
+    return hash_value;
+  }
+
+ private:
+  const int k_batchnorm_op_idx = 0;
+  const int k_actv_op_idx = 1;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationInference);
+};
+
+// class to represent the BatchNorm+Activation (training-forward) fusion plan
+class ScopedFusionPlanBatchNormActivationForward : public ScopedFusionPlanBase {
+ public:
+  ScopedFusionPlanBatchNormActivationForward(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor)
+      : ScopedFusionPlanBase(miopen_handle, miopenVerticalFusion,
+                             input_descriptor) {
+    uint64 hash = GetFusionOpHashValue(miopen_handle, input_descriptor,
+                                       scale_offset_mean_variance_descriptor,
+                                       activation_descriptor);
+
+    bool is_compiled = CachedFusionPlans::FindOrCreate(
+        hash, &fusion_plan_, miopenVerticalFusion, input_descriptor);
+
+    if (!is_compiled) {
+      miopenFusionOpDescriptor_t batchnorm_op;
+      auto status = wrap::miopenCreateOpBatchNormForward(
+          fusion_plan_, &batchnorm_op, miopenBNSpatial,
+          true /* runningMeanVariance */);
+
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpBatchNormForward failed: "
+                   << ToString(status);
+      }
+
+      miopenFusionOpDescriptor_t actv_op;
+      status = wrap::miopenCreateOpActivationForward(
+          fusion_plan_, &actv_op,
+          activation_descriptor.miopen_activation_mode_);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpActivationForward failed: "
+                   << ToString(status);
+      }
+
+      status = wrap::miopenCompileFusionPlan(miopen_handle_, fusion_plan_);
+      if (status != miopenStatusSuccess) {
+        VLOG(2) << "call to miopenCompileFusionPlan (BnA forward) failed: "
+                << ToString(status);
+
+        CachedFusionPlans::MarkFusionPlanUnsupported(hash);
+      } else {
+        VLOG(2) << "Fusion Plan compile succedded (BnA forward) ";
+        fusion_plan_compiled_ = true;
+      }
+    } else {
+      // fusion plan was already compiled...check whether it failed to compile
+      fusion_plan_compiled_ = !CachedFusionPlans::IsUnsupportedFusionPlan(hash);
+    }
+  }
+
+  miopenStatus_t SetBatchNormForwardArgs(const void* scale, const void* offset,
+                                         void* batch_mean, void* batch_var,
+                                         void* saved_mean, void* saved_var,
+                                         double epsilon) {
+    float alpha = 1.0;
+    float beta = 0.0;
+    return ScopedFusionPlanBase::SetBatchNormForwardArgs(
+        k_batchnorm_op_idx, &alpha, &beta, scale, offset, batch_mean, batch_var,
+        saved_mean, saved_var, epsilon);
+  }
+
+  miopenStatus_t SetActivationForwardArgs(
+      ScopedActivationDescriptor& activation_descriptor) {
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    return ScopedFusionPlanBase::SetActivationForwardArgs(
+        k_actv_op_idx, &alpha, &beta, activation_descriptor.alpha_,
+        activation_descriptor.beta_, activation_descriptor.gamma_);
+  }
+
+  uint64 GetFusionOpHashValue(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor) {
+    uint64 hash_value = tensorflow::Hash64("BatchNormActivationForward");
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, tensorflow::hash<miopenHandle_t>()(miopen_handle));
+
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(input_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, GetHashValue(scale_offset_mean_variance_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, activation_descriptor.GetHashValue());
+    return hash_value;
+  }
+
+ private:
+  const int k_batchnorm_op_idx = 0;
+  const int k_actv_op_idx = 1;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationForward);
+};
+
+// class to represent the BatchNorm+Activation (training-backward) fusion plan
+class ScopedFusionPlanBatchNormActivationBackward
+    : public ScopedFusionPlanBase {
+ public:
+  ScopedFusionPlanBatchNormActivationBackward(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor)
+      : ScopedFusionPlanBase(miopen_handle, miopenVerticalFusion,
+                             input_descriptor) {
+    uint64 hash = GetFusionOpHashValue(miopen_handle, input_descriptor,
+                                       scale_offset_mean_variance_descriptor,
+                                       activation_descriptor);
+
+    bool is_compiled = CachedFusionPlans::FindOrCreate(
+        hash, &fusion_plan_, miopenVerticalFusion, input_descriptor);
+
+    if (!is_compiled) {
+      miopenFusionOpDescriptor_t batchnorm_op;
+      auto status = wrap::miopenCreateOpBatchNormBackward(
+          fusion_plan_, &batchnorm_op, miopenBNSpatial);
+
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpBatchNormBackward failed: "
+                   << ToString(status);
+      }
+
+      miopenFusionOpDescriptor_t actv_op;
+      status = wrap::miopenCreateOpActivationBackward(
+          fusion_plan_, &actv_op,
+          activation_descriptor.miopen_activation_mode_);
+      if (status != miopenStatusSuccess) {
+        LOG(FATAL) << "call to miopenCreateOpActivationBackward failed: "
+                   << ToString(status);
+      }
+
+      status = wrap::miopenCompileFusionPlan(miopen_handle_, fusion_plan_);
+      if (status != miopenStatusSuccess) {
+        VLOG(2) << "call to miopenCompileFusionPlan (BnA backward) failed: "
+                << ToString(status);
+
+        CachedFusionPlans::MarkFusionPlanUnsupported(hash);
+      } else {
+        VLOG(2) << "Fusion Plan compile succedded (BnA backward) ";
+        fusion_plan_compiled_ = true;
+      }
+    } else {
+      // fusion plan was already compiled...check whether it failed to compile
+      fusion_plan_compiled_ = !CachedFusionPlans::IsUnsupportedFusionPlan(hash);
+    }
+  }
+
+  miopenStatus_t SetBatchNormBackwardArgs(const void* x, const void* scale,
+                                          const void* offset,
+                                          const void* saved_mean,
+                                          const void* saved_var,
+                                          void* scale_grad, void* offset_grad) {
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    return ScopedFusionPlanBase::SetBatchNormBackwardArgs(
+        k_batchnorm_op_idx, &alpha, &beta, x, scale, offset, scale_grad,
+        offset_grad, saved_mean, saved_var);
+  }
+
+  miopenStatus_t SetActivationBackwardArgs(
+      ScopedActivationDescriptor& activation_descriptor, const void* y) {
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    return ScopedFusionPlanBase::SetActivationBackwardArgs(
+        k_actv_op_idx, &alpha, &beta, y, activation_descriptor.alpha_,
+        activation_descriptor.beta_, activation_descriptor.gamma_);
+  }
+
+  uint64 GetFusionOpHashValue(
+      miopenHandle_t miopen_handle, miopenTensorDescriptor_t input_descriptor,
+      miopenTensorDescriptor_t scale_offset_mean_variance_descriptor,
+      ScopedActivationDescriptor& activation_descriptor) {
+    uint64 hash_value = tensorflow::Hash64("BatchNormActivationBackward");
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, tensorflow::hash<miopenHandle_t>()(miopen_handle));
+
+    hash_value =
+        tensorflow::Hash64Combine(hash_value, GetHashValue(input_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, GetHashValue(scale_offset_mean_variance_descriptor));
+
+    hash_value = tensorflow::Hash64Combine(
+        hash_value, activation_descriptor.GetHashValue());
+    return hash_value;
+  }
+
+ private:
+  const int k_batchnorm_op_idx = 0;
+  const int k_actv_op_idx = 1;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFusionPlanBatchNormActivationBackward);
+};
+
+namespace {
+miopenDataType_t ToMIOpenDataType(
+    dnn::DataType data_type,
+    dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+      return miopenFloat;
+    case dnn::DataType::kHalf:
+      return miopenHalf;
+    case dnn::DataType::kDouble:
+    default:
+      LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+  }
+}
+
+miopenDataType_t ToMIOpenDataType(dnn::DataType data_type,
+                                  dnn::FilterLayout filter_layout) {
+  return ToMIOpenDataType(data_type);
+}
+
+miopenRNNInputMode_t ToMIOpenRnnInputMode(dnn::RnnInputMode input_mode) {
+  switch (input_mode) {
+    case dnn::RnnInputMode::kRnnLinearSkip:
+      return miopenRNNlinear;
+    case dnn::RnnInputMode::kRnnSkipInput:
+      return miopenRNNskip;
+    default:
+      LOG(FATAL) << "Invalid RNN input mode: " << static_cast<int>(input_mode);
+  }
+}
+
+miopenRNNDirectionMode_t ToMIOpenRnnDirectionMode(
+    dnn::RnnDirectionMode direction_mode) {
+  switch (direction_mode) {
+    case dnn::RnnDirectionMode::kRnnUnidirectional:
+      return miopenRNNunidirection;
+    case dnn::RnnDirectionMode::kRnnBidirectional:
+      return miopenRNNbidirection;
+    default:
+      LOG(FATAL) << "Invalid RNN direction mode: "
+                 << static_cast<int>(direction_mode);
+  }
+}
+
+miopenRNNMode_t ToMIOpenRnnMode(dnn::RnnMode rnn_mode) {
+  switch (rnn_mode) {
+    case dnn::RnnMode::kRnnRelu:
+      return miopenRNNRELU;
+    case dnn::RnnMode::kRnnTanh:
+      return miopenRNNTANH;
+    case dnn::RnnMode::kRnnLstm:
+      return miopenLSTM;
+    case dnn::RnnMode::kRnnGru:
+      return miopenGRU;
+    default:
+      LOG(FATAL) << "Invalid RNN Mode: " << static_cast<int>(rnn_mode);
+  }
+}
+
+int MIOpenDataTypeToByteSize(miopenDataType_t data_type) {
+  switch (data_type) {
+    case miopenFloat:
+      return sizeof(float);
+    case miopenHalf:
+      return sizeof(Eigen::half);
+    default:
+      LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+  }
+}
+
+template <typename Base>
+class MixinBase : public Base {};
+template <>
+class MixinBase<void> {};
+
+dnn::DataType GetConvAccumulatorType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+    case dnn::DataType::kDouble:
+      return data_type;
+    case dnn::DataType::kHalf:
+      // FIXME: Check if MIOpen can switch dynamically change accumulator type
+      return dnn::DataType::kFloat;
+    case dnn::DataType::kInt8:
+    case dnn::DataType::kInt32:
+      return dnn::DataType::kInt32;
+    default:
+      LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+  }
+}
+
+}  // namespace
+
+#define RETURN_IF_MIOPEN_ERROR(STATUS, ...)                              \
+  if (!SE_PREDICT_TRUE((STATUS) == miopenStatusSuccess)) {               \
+    string error_msg = absl::StrCat(ToString(STATUS), " ", __VA_ARGS__); \
+    SetFailure(port::Status(port::error::UNKNOWN, error_msg));           \
+    LOG(ERROR) << error_msg;                                             \
+    return;                                                              \
+  }
+
+template <typename Base>
+class MIOpenDescriptorCommon : public MixinBase<Base> {
+ public:
+  bool ok() const { return status_.ok(); }
+  port::Status Status() const { return status_; }
+
+ protected:
+  void SetFailure(const port::Status& status) { status_.Update(status); }
+  port::Status status_;
+};
+
+class MIOpenRnnParamsDescriptor : public MIOpenDescriptorCommon<void> {
+ public:
+  typedef dnn::RnnDescriptor::ParamsRegion ParamsRegion;
+  typedef dnn::RnnDescriptor::ParamsRegions ParamsRegions;
+  MIOpenRnnParamsDescriptor(miopenHandle_t miopen_handle,
+                            const MIOpenRnnDescriptor& rnn_desc);
+  ~MIOpenRnnParamsDescriptor() {
+    auto status = wrap::miopenDestroyTensorDescriptor(handle_);
+    RETURN_IF_MIOPEN_ERROR(status, "Failed to destroy RNN tensor descriptor");
+  }
+  miopenTensorDescriptor_t handle() const {
+    if (!ok()) return nullptr;
+    return handle_;
+  }
+  int64 params_size_in_bytes() const { return params_size_in_bytes_; }
+  ParamsRegions params_weights() const {
+    if (!ok()) return ParamsRegions();
+    return weights_;
+  }
+  ParamsRegions params_biases() const {
+    if (!ok()) return ParamsRegions();
+    return biases_;
+  }
+
+ private:
+  int GetRegionCountPerLayer() const;
+  miopenTensorDescriptor_t handle_;
+  const MIOpenRnnDescriptor* rnn_desc_;
+  int64 params_size_in_bytes_;
+  ParamsRegions weights_;
+  ParamsRegions biases_;
+  port::Status status_;
+  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnParamsDescriptor);
+};
+
+class MIOpenRnnDescriptor : public MIOpenDescriptorCommon<dnn::RnnDescriptor> {
+ public:
+  MIOpenRnnDescriptor(miopenHandle_t miopen_handle, int num_layers,
+                      int hidden_size, int input_size,
+                      miopenRNNInputMode_t input_mode,
+                      miopenRNNDirectionMode_t direction_mode,
+                      miopenRNNMode_t rnn_mode, miopenDataType_t data_type,
+                      float dropout, uint64 seed,
+                      ScratchAllocator* state_allocator)
+      : rnn_desc_(nullptr),
+        num_layers_(num_layers),
+        hidden_size_(hidden_size),
+        input_size_(input_size),
+        input_mode_(input_mode),
+        direction_mode_(direction_mode),
+        rnn_mode_(rnn_mode),
+        data_type_(data_type) {
+    // Create the RNN handle
+    auto status = wrap::miopenCreateRNNDescriptor(&rnn_desc_);
+    RETURN_IF_MIOPEN_ERROR(status, "Unable to create RNN descriptor");
+    status = wrap::miopenSetRNNDescriptor(
+        rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
+        num_layers /*numLayers*/, input_mode /*inputMode*/,
+        direction_mode /*direction*/, rnn_mode /*mode*/,
+        miopenRNNwithBias /*biasMode*/, miopenRNNdefault /*algo*/,
+        data_type /*dataType*/);
+    RETURN_IF_MIOPEN_ERROR(status, "Unable to update RNN descriptor");
+    // Create the params handle.
+    miopen_params_desc_.reset(
+        new MIOpenRnnParamsDescriptor(miopen_handle, *this));
+    if (!miopen_params_desc_->ok()) {
+      SetFailure(miopen_params_desc_->Status());
+      return;
+    }
+  }
+  ~MIOpenRnnDescriptor() override {
+    if (rnn_desc_) {
+      auto status = wrap::miopenDestroyRNNDescriptor(rnn_desc_);
+      RETURN_IF_MIOPEN_ERROR(status, "Unable to destroy RNN descriptor");
+    }
+  }
+  miopenRNNDescriptor_t handle() const {
+    if (!ok()) return nullptr;
+    return rnn_desc_;
+  }
+  int num_layers() const { return num_layers_; }
+  int hidden_size() const { return hidden_size_; }
+  int input_size() const { return input_size_; }
+  miopenRNNInputMode_t input_mode() const { return input_mode_; }
+  miopenRNNDirectionMode_t direction_mode() const { return direction_mode_; }
+  miopenRNNMode_t rnn_mode() const { return rnn_mode_; }
+  miopenDataType_t data_type() const { return data_type_; }
+  int64 ParamsSizeInBytes() const override {
+    return miopen_params_desc_->params_size_in_bytes();
+  }
+  miopenTensorDescriptor_t params_handle() const {
+    if (!miopen_params_desc_) return nullptr;
+    return miopen_params_desc_->handle();
+  }
+  ParamsRegions ParamsWeightRegions() const override {
+    if (!ok()) return ParamsRegions();
+    return miopen_params_desc_->params_weights();
+  }
+  ParamsRegions ParamsBiasRegions() const override {
+    if (!ok()) return ParamsRegions();
+    return miopen_params_desc_->params_biases();
+  }
+
+ private:
+  miopenRNNDescriptor_t rnn_desc_;
+  int num_layers_;
+  int hidden_size_;
+  int input_size_;
+  miopenRNNInputMode_t input_mode_;
+  miopenRNNDirectionMode_t direction_mode_;
+  miopenRNNMode_t rnn_mode_;
+  miopenDataType_t data_type_;
+  port::Status status_;
+  // no dropout in MIOpen.
+  // std::unique_ptr<miopenDropoutDescriptor> miopen_dropout_desc_;
+  std::unique_ptr<MIOpenRnnParamsDescriptor> miopen_params_desc_;
+  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnDescriptor);
+};
+
+// Get ID of the internal parameter tensor.
+//
+int MIOpenRnnParamsDescriptor::GetRegionCountPerLayer() const {
+  auto rnn_mode = rnn_desc_->rnn_mode();
+  switch (rnn_mode) {
+    case miopenRNNRELU:
+    case miopenRNNTANH:
+      return 2;
+    case miopenLSTM:
+      return 8;
+    case miopenGRU:
+      return 6;
+    default:
+      LOG(FATAL) << "Invalid RNN Mode: " << static_cast<int>(rnn_mode);
+  }
+}
+
+class MIOpenRnnSequenceTensorDescriptor
+    : public MIOpenDescriptorCommon<dnn::RnnSequenceTensorDescriptor> {
+ public:
+  MIOpenRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+                                    int data_size, miopenDataType_t data_type)
+      : seq_length_(seq_length),
+        batch_size_(batch_size),
+        data_size_(data_size),
+        data_type_(data_type) {
+    miopenTensorDescriptor_t handle = nullptr;
+    if (seq_length <= 0) {
+      string error_msg =
+          absl::StrCat("sequence length must be positive: ", seq_length);
+      LOG(ERROR) << error_msg;
+      SetFailure(port::Status(port::error::UNKNOWN, error_msg));
+      return;
+    }
+    auto status = wrap::miopenCreateTensorDescriptor(&handle);
+    RETURN_IF_MIOPEN_ERROR(status, "Failed to create tensor descriptor");
+    std::array<int, 2> dims = {{batch_size, data_size}};
+    status = wrap::miopenSetTensorDescriptor(
+        handle /*tensorDesc*/, data_type /*dataType*/, 2 /*nbDims*/,
+        dims.data() /*dimA*/, nullptr /*strideA*/);
+    RETURN_IF_MIOPEN_ERROR(status, "Failed to update tensor descriptor");
+    // Replicate handle across the number of steps.
+    handles_.assign(seq_length, handle);
+  }
+
+  ~MIOpenRnnSequenceTensorDescriptor() override {
+    // Only the first one needs to be destroyed. All others are the same.
+    auto status = wrap::miopenDestroyTensorDescriptor(handles_[0]);
+    RETURN_IF_MIOPEN_ERROR(status,
+                           "Failed to destroy sequence tensor descriptor");
+  }
+
+  const miopenTensorDescriptor_t* handles() const {
+    if (!ok()) return nullptr;
+    CHECK(!handles_.empty()) << "handles cannot be empty";
+    return handles_.data();
+  }
+
+  int seq_length() const { return seq_length_; }
+  int batch_size() const { return batch_size_; }
+  int data_size() const { return data_size_; }
+
+ private:
+  int seq_length_;
+  int batch_size_;
+  int data_size_;
+  miopenDataType_t data_type_;
+  std::vector<miopenTensorDescriptor_t> handles_;
+  port::Status status_;
+  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnSequenceTensorDescriptor);
+};
+
+class MIOpenRnnStateTensorDescriptor
+    : public MIOpenDescriptorCommon<dnn::RnnStateTensorDescriptor> {
+ public:
+  MIOpenRnnStateTensorDescriptor(int num_layers, int batch_size, int data_size,
+                                 miopenDataType_t data_type)
+      : handle_(nullptr),
+        num_layers_(num_layers),
+        batch_size_(batch_size),
+        data_size_(data_size),
+        data_type_(data_type) {
+    auto status = wrap::miopenCreateTensorDescriptor(&handle_);
+    RETURN_IF_MIOPEN_ERROR(status, "Failed to create tensor descriptor");
+    std::array<int, 3> dims = {{num_layers, batch_size, data_size}};
+    status = wrap::miopenSetTensorDescriptor(
+        handle_ /*tensorDesc*/, data_type /*dataType*/, 3 /*nbDims*/,
+        dims.data() /*dimA*/, nullptr /*strideA*/);
+    RETURN_IF_MIOPEN_ERROR(status, "Failed to update tensor descriptor");
+  }
+
+  ~MIOpenRnnStateTensorDescriptor() override {
+    if (!handle_) {
+      auto status = wrap::miopenDestroyTensorDescriptor(handle_);
+      RETURN_IF_MIOPEN_ERROR(status, "Unable to destroy RNN state tensor");
+    }
+  }
+
+  miopenTensorDescriptor_t handle() const {
+    if (!ok()) return nullptr;
+    return handle_;
+  }
+  int num_layers() const { return num_layers_; }
+  int batch_size() const { return batch_size_; }
+  int data_size() const { return data_size_; }
+
+ private:
+  miopenTensorDescriptor_t handle_;
+  int num_layers_;
+  int batch_size_;
+  int data_size_;
+  port::Status status_;
+  miopenDataType_t data_type_;
+  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenRnnStateTensorDescriptor);
+};
+
+namespace {
+
+struct RnnModelDims {
+  int num_layers = 0;
+  int batch_size = 0;
+  int seq_length = 0;
+  int hidden_size = 0;
+  int input_size = 0;
+  int dir_count = 0;
+};
+
+template <class T>
+bool ExtractAndCheckRnnForward(
+    const MIOpenRnnDescriptor& rnn_desc,
+    const MIOpenRnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<T>& input_data,
+    const MIOpenRnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<T>& input_h_data,
+    const MIOpenRnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<T>& input_c_data, const DeviceMemory<T>& params,
+    const MIOpenRnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<T>& output_data,
+    const MIOpenRnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<T>& output_h_data,
+    const MIOpenRnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<T>& output_c_data, RnnModelDims* model_dims) {
+  // extract model parameters
+  model_dims->num_layers = rnn_desc.num_layers();
+  model_dims->batch_size = input_desc.batch_size();
+  model_dims->seq_length = input_desc.seq_length();
+  model_dims->hidden_size = rnn_desc.hidden_size();
+  model_dims->input_size = input_desc.data_size();
+  model_dims->dir_count =
+      (rnn_desc.direction_mode() == miopenRNNbidirection) ? 2 : 1;
+
+  // check parameters
+  if (!(input_h_desc.num_layers() ==
+            model_dims->num_layers * model_dims->dir_count &&
+        input_h_desc.batch_size() == model_dims->batch_size &&
+        input_h_desc.data_size() == model_dims->hidden_size)) {
+    LOG(ERROR) << "Invalid input_h shape";
+    return false;
+  }
+  if (!(input_h_desc.num_layers() == input_c_desc.num_layers() &&
+        input_h_desc.batch_size() == input_c_desc.batch_size() &&
+        input_h_desc.data_size() == input_c_desc.data_size())) {
+    LOG(ERROR) << "Invalid input_c shape";
+    return false;
+  }
+  if (!(output_desc.seq_length() == model_dims->seq_length &&
+        output_desc.batch_size() == model_dims->batch_size &&
+        output_desc.data_size() ==
+            model_dims->hidden_size * model_dims->dir_count)) {
+    LOG(ERROR) << "Invalid output shape";
+    return false;
+  }
+  if (!(input_h_desc.num_layers() == output_h_desc.num_layers() &&
+        input_h_desc.batch_size() == output_h_desc.batch_size() &&
+        input_h_desc.data_size() == output_h_desc.data_size())) {
+    LOG(ERROR) << "Invalid output_h shape";
+    return false;
+  }
+  if (!(input_h_desc.num_layers() == output_c_desc.num_layers() &&
+        input_h_desc.batch_size() == output_c_desc.batch_size() &&
+        input_h_desc.data_size() == output_c_desc.data_size())) {
+    LOG(ERROR) << "Invalid output_h shape";
+    return false;
+  }
+
+  return true;
+}
+
+bool CheckRNNParameterSize(
+    miopenHandle_t miopen_handle, const MIOpenRnnDescriptor& rnn_desc,
+    const MIOpenRnnSequenceTensorDescriptor& input_desc) {
+  size_t params_size_in_bytes = 0;
+  auto status = wrap::miopenGetRNNParamsSize(
+      miopen_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+      input_desc.handles()[0] /*xDesc*/, &params_size_in_bytes /*sizeInBytes*/,
+      rnn_desc.data_type() /*dataType*/);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "Unable to check RNN param size: " << ToString(status);
+    return false;
+  }
+  return static_cast<int64>(params_size_in_bytes) ==
+         rnn_desc.ParamsSizeInBytes();
+}
+
+bool CreateRnnWorkspace(Stream* stream, miopenHandle_t miopen_handle,
+                        const MIOpenRnnDescriptor& rnn_desc,
+                        const MIOpenRnnSequenceTensorDescriptor& input_desc,
+                        ScratchAllocator* workspace_allocator,
+                        DeviceMemory<uint8>* workspace) {
+  // Query the workspace size.
+  size_t workspace_size_in_bytes = 0;
+  auto status = wrap::miopenGetRNNWorkspaceSize(
+      miopen_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+      input_desc.seq_length() /*seqLength*/, input_desc.handles() /*xDesc*/,
+      &workspace_size_in_bytes /*sizeInBytes*/);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "Unable to query workspace size: " << ToString(status);
+    return false;
+  }
+  // Allocate the workspace.
+  if (workspace_size_in_bytes > 0) {
+    auto allocated =
+        workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
+    if (!allocated.ok() || (*workspace = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate RNN workspace";
+
+      return false;
+    }
+    stream->ThenMemZero(workspace, workspace_size_in_bytes);
+  } else {
+    *workspace = DeviceMemory<uint8>();
+  }
+  return true;
+}
+
+}  // namespace
+
+template <class T>
+bool MIOpenSupport::DoRnnForwardImpl(
+    Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
+    const MIOpenRnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<T>& input_data,
+    const MIOpenRnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<T>& input_h_data,
+    const MIOpenRnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<T>& input_c_data, const DeviceMemory<T>& params,
+    const MIOpenRnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<T>* output_data,
+    const MIOpenRnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<T>* output_h_data,
+    const MIOpenRnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<T>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator) {
+  // extract model parameters
+  RnnModelDims model_dims;
+  bool res = ExtractAndCheckRnnForward(
+      rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+      input_c_desc, input_c_data, params, output_desc, *output_data,
+      output_h_desc, *output_h_data, output_c_desc, *output_c_data,
+      &model_dims);
+  if (!res) {
+    LOG(ERROR) << "Invalid parameters for RNN Model";
+    return false;
+  }
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // check params size
+
+  if (!CheckRNNParameterSize(miopen.handle(), rnn_desc, input_desc)) {
+    LOG(ERROR) << "Invalid parameters";
+    return false;
+  }
+
+  // create the workspace
+  DeviceMemory<uint8> workspace;
+  if (!CreateRnnWorkspace(stream, miopen.handle(), rnn_desc, input_desc,
+                          workspace_allocator, &workspace)) {
+    LOG(ERROR) << "Unable to create rnn workspace";
+
+    return false;
+  }
+
+  // query the reserve space size
+  // allocate the reserve space
+  DeviceMemory<uint8> reserve_space;
+  if (is_training) {
+    size_t reserve_space_size_in_bytes = 0;
+    auto status = wrap::miopenGetRNNTrainingReserveSize(
+        miopen.handle() /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+        model_dims.seq_length /*seqLength*/, input_desc.handles() /*xDesc*/,
+        &reserve_space_size_in_bytes /*sizeInBytes*/);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "Unable to query reserve space size: " << ToString(status);
+      return false;
+    }
+
+    if (reserve_space_size_in_bytes > 0) {
+      auto allocated = reserve_space_allocator->AllocateBytes(
+          stream, reserve_space_size_in_bytes);
+      if (!allocated.ok() ||
+          (reserve_space = allocated.ValueOrDie()) == nullptr) {
+        LOG(ERROR) << "Fail to allocate RNN reserve space";
+        return false;
+      }
+      stream->ThenMemZero(&reserve_space, reserve_space_size_in_bytes);
+    }
+  }
+
+  // make the forward call
+  if (!is_training) {
+    auto status = wrap::miopenRNNForwardInference(
+        miopen.handle() /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+        model_dims.seq_length /*seqLength*/, input_desc.handles() /*xDesc*/,
+        input_data.opaque() /*x*/, input_h_desc.handle() /*hxDesc*/,
+        input_h_data.opaque() /*hx*/, input_c_desc.handle() /*cxDesc*/,
+        input_c_data.opaque() /*cx*/, rnn_desc.params_handle() /*wDesc*/,
+        params.opaque() /*w*/, output_desc.handles() /*yDesc*/,
+        output_data->opaque() /*y*/, output_h_desc.handle() /*hyDesc*/,
+        output_h_data->opaque() /*hy*/, output_c_desc.handle() /*cyDesc*/,
+        output_c_data->opaque() /*cy*/, workspace.opaque() /*workspace*/,
+        workspace.size() /*workSpaceSizeInBytes*/);
+
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "Failed to call miopenRNNForwardInference: "
+                 << ToString(status);
+      return false;
+    }
+  } else {
+    auto status = wrap::miopenRNNForwardTraining(
+        miopen.handle() /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+        model_dims.seq_length /*seqLength*/, input_desc.handles() /*xDesc*/,
+        input_data.opaque() /*x*/, input_h_desc.handle() /*hxDesc*/,
+        input_h_data.opaque() /*hx*/, input_c_desc.handle() /*cxDesc*/,
+        input_c_data.opaque() /*cx*/, rnn_desc.params_handle() /*wDesc*/,
+        params.opaque() /*w*/, output_desc.handles() /*yDesc*/,
+        output_data->opaque() /*y*/, output_h_desc.handle() /*hyDesc*/,
+        output_h_data->opaque() /*hy*/, output_c_desc.handle() /*cyDesc*/,
+        output_c_data->opaque() /*cy*/, workspace.opaque() /*workspace*/,
+        workspace.size() /*workSpaceSizeInBytes*/,
+        reserve_space.opaque() /*reserveSpace*/,
+        reserve_space.size() /*reserveSpaceSizeInBytes*/);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "Failed to call miopenRNNForwardTraining"
+                 << ToString(status);
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class T>
+bool MIOpenSupport::DoRnnBackwardImpl(
+    Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
+    const MIOpenRnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<T>& input_data,
+    const MIOpenRnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<T>& input_h_data,
+    const MIOpenRnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<T>& input_c_data, const DeviceMemory<T>& params,
+    const MIOpenRnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<T>& output_data,
+    const MIOpenRnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<T>& output_h_data,
+    const MIOpenRnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<T>& output_c_data,
+    const DeviceMemory<T>& output_backprop_data,
+    const DeviceMemory<T>& output_h_backprop_data,
+    const DeviceMemory<T>& output_c_backprop_data,
+    DeviceMemory<T>* input_backprop_data,
+    DeviceMemory<T>* input_h_backprop_data,
+    DeviceMemory<T>* input_c_backprop_data,
+    DeviceMemory<T>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator) {
+  // extract model parameters
+  RnnModelDims model_dims;
+  bool res = ExtractAndCheckRnnForward(
+      rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+      input_c_desc, input_c_data, params, output_desc, output_data,
+      output_h_desc, output_h_data, output_c_desc, output_c_data, &model_dims);
+  if (!res) {
+    LOG(ERROR) << "Invalid parameters for RNN Model";
+    return false;
+  }
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // check params size
+
+  if (!CheckRNNParameterSize(miopen.handle(), rnn_desc, input_desc)) {
+    LOG(ERROR) << "Invalid parameters";
+    return false;
+  }
+
+  // create the workspace
+  DeviceMemory<uint8> workspace;
+  if (!CreateRnnWorkspace(stream, miopen.handle(), rnn_desc, input_desc,
+                          workspace_allocator, &workspace)) {
+    LOG(ERROR) << "Unable to create rnn workspace";
+    return false;
+  }
+
+  // workaround for missing initialization support in MIOpen.
+  // TODO: remove this when MIOpen is ready.
+  auto size_data = input_desc.seq_length() * input_desc.batch_size() *
+                   input_desc.data_size();
+  if ((size_data > 0) && (input_backprop_data->opaque() != nullptr))
+    stream->ThenMemZero(input_backprop_data, size_data * sizeof(float));
+
+  size_data = input_h_desc.num_layers() * input_h_desc.batch_size() *
+              input_h_desc.data_size();
+  if ((size_data > 0) && (input_h_backprop_data->opaque() != nullptr))
+    stream->ThenMemZero(input_h_backprop_data, size_data * sizeof(float));
+
+  size_data = input_c_desc.num_layers() * input_c_desc.batch_size() *
+              input_c_desc.data_size();
+  if ((size_data > 0) && (input_c_backprop_data->opaque() != nullptr))
+    stream->ThenMemZero(input_c_backprop_data, size_data * sizeof(float));
+
+  // make the backward data call
+  auto status = wrap::miopenRNNBackwardData(
+      miopen.handle() /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+      model_dims.seq_length /*seqLength*/, output_desc.handles() /*yDesc*/,
+      output_data.opaque() /*y*/, output_desc.handles() /*dyDesc*/,
+      output_backprop_data.opaque() /*dy*/, output_h_desc.handle() /*dhyDesc*/,
+      output_h_backprop_data.opaque() /*dhy*/,
+      output_c_desc.handle() /*dcyDesc*/,
+      output_c_backprop_data.opaque() /*dcy*/,
+      rnn_desc.params_handle() /*wDesc*/, params.opaque() /*w*/,
+      input_h_desc.handle() /*hxDesc*/, input_h_data.opaque() /*hx*/,
+      input_c_desc.handle() /*cxDesc*/, input_c_data.opaque() /*cx*/,
+      input_desc.handles() /*dxDesc*/, input_backprop_data->opaque() /*dx*/,
+      input_h_desc.handle() /*dhxDesc*/,
+      input_h_backprop_data->opaque() /*dhx*/,
+      input_c_desc.handle() /*dcxDesc*/,
+      input_c_backprop_data->opaque() /*dcx*/, workspace.opaque() /*workspace*/,
+      workspace.size() /*workSpaceSizeInBytes*/,
+      reserve_space_data->opaque() /*reserveSpace*/,
+      reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "Failed to call miopenRNNBackwardData: " << ToString(status);
+    return false;
+  }
+
+  if (params_backprop_data != nullptr) {
+    // Clear the dw to zeros.
+    stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
+    // make the backward weight call
+    status = wrap::miopenRNNBackwardWeights(
+        miopen.handle() /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+        model_dims.seq_length /*seqLength*/, input_desc.handles() /*xDesc*/,
+        input_data.opaque() /*x*/, input_h_desc.handle() /*hxDesc*/,
+        input_h_data.opaque() /*hx*/, output_desc.handles() /*yDesc*/,
+        output_data.opaque() /*y*/, rnn_desc.params_handle() /*dwDesc*/,
+        params_backprop_data->opaque() /*dw*/, workspace.opaque() /*workspace*/,
+        workspace.size() /*workSpaceSizeInBytes*/,
+        reserve_space_data->opaque() /*reserveSpace*/,
+        reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+    if (status != miopenStatusSuccess) {
+      LOG(ERROR) << "Failed to call miopenRNNBackwardWeights: "
+                 << ToString(status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+MIOpenRnnParamsDescriptor::MIOpenRnnParamsDescriptor(
+    miopenHandle_t miopen_handle, const MIOpenRnnDescriptor& rnn_desc)
+    : handle_(nullptr), rnn_desc_(&rnn_desc), params_size_in_bytes_(0) {
+  miopenTensorDescriptor_t input_desc = nullptr;
+  {
+    // Query the params size.
+    auto status = wrap::miopenCreateTensorDescriptor(&input_desc);
+    RETURN_IF_MIOPEN_ERROR(status, "MIOpen fails to create tensor descriptor");
+    std::array<int, 2> dims = {{1, rnn_desc.input_size()}};
+    status = wrap::miopenSetTensorDescriptor(
+        input_desc /*tensorDesc*/, rnn_desc.data_type() /*dataType*/,
+        2 /*nbDims*/, dims.data() /*dimA*/, nullptr /*strideA*/);
+    RETURN_IF_MIOPEN_ERROR(status, "MIOpen fails to set tensor descriptor");
+
+    size_t params_size = 0;
+    status = wrap::miopenGetRNNParamsSize(
+        miopen_handle /*handle*/, rnn_desc.handle() /*rnnDesc*/,
+        input_desc /*xDesc*/, &params_size /*sizeInBytes*/,
+        rnn_desc.data_type() /*dataType*/);
+    RETURN_IF_MIOPEN_ERROR(status, "MIOpen fails to get RNN parameter size");
+    params_size_in_bytes_ = static_cast<int64>(params_size);
+  }
+
+  {
+    // Create the params descriptor.
+    auto status = wrap::miopenCreateTensorDescriptor(&handle_);
+    RETURN_IF_MIOPEN_ERROR(status,
+                           "MIOpen fails to create RNN params descriptor");
+    status = wrap::miopenGetRNNParamsDescriptor(miopen_handle,
+                                                rnn_desc.handle(), input_desc,
+                                                handle_, rnn_desc.data_type());
+    RETURN_IF_MIOPEN_ERROR(status,
+                           "MIOpen fails to update RNN filter descriptor");
+  }
+  {
+    // Release the dummy input tensor descriptor.
+    auto status = wrap::miopenDestroyTensorDescriptor(input_desc);
+    RETURN_IF_MIOPEN_ERROR(status, "MIOpen fails to destroy tensor descriptor");
+  }
+}
+
+port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
+MIOpenSupport::createRnnDescriptor(
+    int num_layers, int hidden_size, int input_size, int batch_size,
+    dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+    ScratchAllocator* state_allocator) {
+  // ROCM TODO: batch_size is ignored for now
+
+  auto miopen = miopen_->GetHandle(parent_, nullptr);
+  std::unique_ptr<MIOpenRnnDescriptor> rnn_desc(new MIOpenRnnDescriptor(
+      miopen.handle(), num_layers, hidden_size, input_size,
+      ToMIOpenRnnInputMode(input_mode),
+      ToMIOpenRnnDirectionMode(direction_mode), ToMIOpenRnnMode(rnn_mode),
+      ToMIOpenDataType(data_type), dropout, seed, state_allocator));
+  if (!rnn_desc->ok()) {
+    return rnn_desc->Status();
+  }
+  return port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>(
+      std::move(rnn_desc));
+}
+
+port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+MIOpenSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+                                                 int data_size,
+                                                 dnn::DataType data_type) {
+  std::unique_ptr<MIOpenRnnSequenceTensorDescriptor> seq_desc(
+      new MIOpenRnnSequenceTensorDescriptor(seq_length, batch_size, data_size,
+                                            ToMIOpenDataType(data_type)));
+  if (!seq_desc->ok()) {
+    return seq_desc->Status();
+  }
+  return port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>(
+      std::move(seq_desc));
+}
+
+port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
+MIOpenSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
+                                              int data_size,
+                                              dnn::DataType data_type) {
+  std::unique_ptr<MIOpenRnnStateTensorDescriptor> state_desc(
+      new MIOpenRnnStateTensorDescriptor(num_layer, batch_size, data_size,
+                                         ToMIOpenDataType(data_type)));
+  if (!state_desc->ok()) {
+    return state_desc->Status();
+  }
+  return port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>(
+      std::move(state_desc));
+}
+
+bool MIOpenSupport::DoRnnForward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<Eigen::half>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<Eigen::half>& input_c_data,
+    const DeviceMemory<Eigen::half>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<Eigen::half>* output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<Eigen::half>* output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<Eigen::half>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  // ROCM TODO: output_profile_result is ignore for now
+
+  const MIOpenRnnDescriptor& miopen_rnn_desc =
+      static_cast<const MIOpenRnnDescriptor&>(rnn_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_input_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(input_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_c_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_output_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(output_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnForwardImpl<Eigen::half>(
+      stream, miopen_rnn_desc, miopen_input_desc, input_data,
+      miopen_input_h_desc, input_h_data, miopen_input_c_desc, input_c_data,
+      params, miopen_output_desc, output_data, miopen_output_h_desc,
+      output_h_data, miopen_output_c_desc, output_c_data, is_training,
+      reserve_space_allocator, workspace_allocator);
+}
+
+bool MIOpenSupport::DoRnnForward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<float>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<float>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<float>& input_c_data, const DeviceMemory<float>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<float>* output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<float>* output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<float>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  // ROCM TODO: output_profile_result is ignore for now
+
+  const MIOpenRnnDescriptor& miopen_rnn_desc =
+      static_cast<const MIOpenRnnDescriptor&>(rnn_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_input_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(input_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_c_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_output_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(output_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnForwardImpl<float>(
+      stream, miopen_rnn_desc, miopen_input_desc, input_data,
+      miopen_input_h_desc, input_h_data, miopen_input_c_desc, input_c_data,
+      params, miopen_output_desc, output_data, miopen_output_h_desc,
+      output_h_data, miopen_output_c_desc, output_c_data, is_training,
+      reserve_space_allocator, workspace_allocator);
+}
+
+bool MIOpenSupport::DoRnnForward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<double>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<double>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<double>& input_c_data,
+    const DeviceMemory<double>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<double>* output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<double>* output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<double>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "miopen does not support double type RNN fwd yet";
+  return false;
+}
+
+bool MIOpenSupport::DoRnnBackward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<Eigen::half>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<Eigen::half>& input_c_data,
+    const DeviceMemory<Eigen::half>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<Eigen::half>& output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<Eigen::half>& output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<Eigen::half>& output_c_data,
+    const DeviceMemory<Eigen::half>& output_backprop_data,
+    const DeviceMemory<Eigen::half>& output_h_backprop_data,
+    const DeviceMemory<Eigen::half>& output_c_backprop_data,
+    DeviceMemory<Eigen::half>* input_backprop_data,
+    DeviceMemory<Eigen::half>* input_h_backprop_data,
+    DeviceMemory<Eigen::half>* input_c_backprop_data,
+    DeviceMemory<Eigen::half>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  // ROCM TODO: output_profile_result is ignore for now
+
+  const MIOpenRnnDescriptor& miopen_rnn_desc =
+      static_cast<const MIOpenRnnDescriptor&>(rnn_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_input_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(input_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_c_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_output_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(output_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnBackwardImpl<Eigen::half>(
+      stream, miopen_rnn_desc, miopen_input_desc, input_data,
+      miopen_input_h_desc, input_h_data, miopen_input_c_desc, input_c_data,
+      params, miopen_output_desc, output_data, miopen_output_h_desc,
+      output_h_data, miopen_output_c_desc, output_c_data, output_backprop_data,
+      output_h_backprop_data, output_c_backprop_data, input_backprop_data,
+      input_h_backprop_data, input_c_backprop_data, params_backprop_data,
+      reserve_space_data, workspace_allocator);
+}
+
+bool MIOpenSupport::DoRnnBackward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<float>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<float>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<float>& input_c_data, const DeviceMemory<float>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<float>& output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<float>& output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<float>& output_c_data,
+    const DeviceMemory<float>& output_backprop_data,
+    const DeviceMemory<float>& output_h_backprop_data,
+    const DeviceMemory<float>& output_c_backprop_data,
+    DeviceMemory<float>* input_backprop_data,
+    DeviceMemory<float>* input_h_backprop_data,
+    DeviceMemory<float>* input_c_backprop_data,
+    DeviceMemory<float>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  // ROCM TODO: output_profile_result is ignore for now
+
+  const MIOpenRnnDescriptor& miopen_rnn_desc =
+      static_cast<const MIOpenRnnDescriptor&>(rnn_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_input_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(input_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_input_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(input_c_desc);
+  const MIOpenRnnSequenceTensorDescriptor& miopen_output_desc =
+      static_cast<const MIOpenRnnSequenceTensorDescriptor&>(output_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_h_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_h_desc);
+  const MIOpenRnnStateTensorDescriptor& miopen_output_c_desc =
+      static_cast<const MIOpenRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnBackwardImpl<float>(
+      stream, miopen_rnn_desc, miopen_input_desc, input_data,
+      miopen_input_h_desc, input_h_data, miopen_input_c_desc, input_c_data,
+      params, miopen_output_desc, output_data, miopen_output_h_desc,
+      output_h_data, miopen_output_c_desc, output_c_data, output_backprop_data,
+      output_h_backprop_data, output_c_backprop_data, input_backprop_data,
+      input_h_backprop_data, input_c_backprop_data, params_backprop_data,
+      reserve_space_data, workspace_allocator);
+}
+
+bool MIOpenSupport::DoRnnBackward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<double>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<double>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<double>& input_c_data,
+    const DeviceMemory<double>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<double>& output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<double>& output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<double>& output_c_data,
+    const DeviceMemory<double>& output_backprop_data,
+    const DeviceMemory<double>& output_h_backprop_data,
+    const DeviceMemory<double>& output_c_backprop_data,
+    DeviceMemory<double>* input_backprop_data,
+    DeviceMemory<double>* input_h_backprop_data,
+    DeviceMemory<double>* input_c_backprop_data,
+    DeviceMemory<double>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "miopen does not support half type RNN bwd yet";
+  return false;
+}
+
+// This is the context required to use the TF scratch allocator:
+struct MIOpenAllocatorContext {
+  MIOpenAllocatorContext(ScratchAllocator* scratch_allocator, Stream* stream)
+      : scratch_allocator_(scratch_allocator), stream_(stream) {}
+
+  ScratchAllocator* scratch_allocator_;
+  Stream* stream_;
+};
+
+void* MIOpenAllocatorCallback(void* ctx, size_t size_in_bytes) {
+  auto* mac = static_cast<MIOpenAllocatorContext*>(ctx);
+  auto allocated =
+      mac->scratch_allocator_->AllocateBytes(mac->stream_, size_in_bytes);
+
+  DeviceMemory<uint8> scratch;
+  if (allocated.ok()) {
+    scratch = allocated.ValueOrDie();
+    return scratch.opaque();
+  } else {
+    return nullptr;
+  }
+}
+
+void MIOpenDeallocatorCallback(void* ctx, void* mem) {
+  // Don't need dealloactor since the TensorFlow heap will automatically reclaim
+  // the memory
+}
+
+port::Status MIOpenSupport::DoPrepareForConvolution(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::AlgorithmConfig& algorithm_config,
+    ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory) {
+  ScopedTensorDescriptor input_nd{
+      input_descriptor,
+      ToMIOpenDataType(element_type, input_descriptor.layout())};
+  ScopedFilterDescriptor filter{
+      filter_descriptor, input_descriptor,
+      ToMIOpenDataType(element_type, filter_descriptor.layout())};
+  ScopedTensorDescriptor output_nd{
+      output_descriptor,
+      ToMIOpenDataType(element_type, output_descriptor.layout())};
+  ScopedConvolutionDescriptor conv{
+      convolution_descriptor,
+      ToMIOpenDataType(GetConvAccumulatorType(element_type))};
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
+  size_t scratch_memory_size;
+
+  if (!algo_desc.has_value()) {
+    // With the default algorithm, use MIOpen's heuristics.
+    assert(scratch_allocator);
+
+    DeviceMemory<uint8> scratch_memory_temp;
+    MIOpenAllocatorContext mac(scratch_allocator, stream);
+    wrap::miopenSetAllocator(miopen.handle(), MIOpenAllocatorCallback,
+                             MIOpenDeallocatorCallback, &mac);
+    size_t size_in_bytes;
+    miopenStatus_t status = miopenStatusSuccess;
+
+    switch (kind) {
+      case dnn::ConvolutionKind::FORWARD: {
+        status = wrap::miopenConvolutionForwardGetWorkSpaceSize(
+            miopen.handle(), /*filterDesc=*/filter.handle(),
+            /*srcDesc=*/input_nd.handle(), /*convDesc=*/conv.handle(),
+            /*destDesc=*/output_nd.handle(), /*sizeInBytes=*/&size_in_bytes);
+        break;
+      }
+      case dnn::ConvolutionKind::BACKWARD_DATA: {
+        status = wrap::miopenConvolutionBackwardDataGetWorkSpaceSize(
+            miopen.handle(), /*diffDesc=*/output_nd.handle(),
+            /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+            /*gradDesc=*/input_nd.handle(), /*sizeInBytes=*/&size_in_bytes);
+        break;
+      }
+      case dnn::ConvolutionKind::BACKWARD_FILTER: {
+        status = wrap::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
+            miopen.handle(), /*diffDesc=*/output_nd.handle(),
+            /*srcDesc=*/input_nd.handle(), /*convDesc=*/conv.handle(),
+            /*gradDesc=*/filter.handle(), /*sizeInBytes=*/&size_in_bytes);
+        break;
+      }
+      default:
+        return port::InternalError(absl::StrCat("Unexpected convolution kind ",
+                                                static_cast<int>(kind)));
+    }
+
+    if (status == miopenStatusSuccess && size_in_bytes != 0) {
+      auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
+      if (allocated.ok()) {
+        scratch_memory_temp = allocated.ValueOrDie();
+      }
+    }
+
+    miopenConvAlgoPerf_t preference;
+    int returnedAlgoCount;
+
+    switch (kind) {
+      case dnn::ConvolutionKind::FORWARD: {
+        auto status = wrap::miopenFindConvolutionForwardAlgorithm(
+            miopen.handle(), input_nd.handle(), input_data.opaque(),
+            filter.handle(), filter_data.opaque(), conv.handle(),
+            output_nd.handle(), output_data.opaque(),
+            /*requestAlgoCount=*/1, &returnedAlgoCount,
+            /*preference=*/&preference,
+            /*workspace*/ scratch_memory_temp.opaque(),
+            /*WorkSpaceSize*/ scratch_memory_temp.size(),
+            /*exhaustiveSearch*/ false);
+        CHECK_EQ(status, miopenStatusSuccess) << "Unable to find a suitable "
+                                                 "algorithm for doing forward "
+                                                 "convolution";
+        *algorithm_desc = dnn::AlgorithmDesc(preference.fwd_algo, false);
+        break;
+      }
+      case dnn::ConvolutionKind::BACKWARD_DATA: {
+        auto status = wrap::miopenFindConvolutionBackwardDataAlgorithm(
+            miopen.handle(),
+            /*diffDesc=*/output_nd.handle(), output_data.opaque(),
+            /*filterDesc=*/filter.handle(), filter_data.opaque(),
+            /*convDesc=*/conv.handle(),
+            /*gradDesc=*/input_nd.handle(), input_data.opaque(),
+            /*requestCount=*/1, /*returnedAlgoCount=*/&returnedAlgoCount,
+            /*preference=*/&preference,
+            /*WorkSpace=*/scratch_memory_temp.opaque(),
+            /*WorkSpaceSize=*/scratch_memory_temp.size(),
+            /*exhaustiveSearch=*/false);
+        CHECK_EQ(status, miopenStatusSuccess) << "Unable to find a suitable "
+                                                 "algorithm for doing backward "
+                                                 "data convolution";
+        *algorithm_desc = dnn::AlgorithmDesc(preference.bwd_data_algo, false);
+        break;
+      }
+      case dnn::ConvolutionKind::BACKWARD_FILTER: {
+        auto status = wrap::miopenFindConvolutionBackwardWeightsAlgorithm(
+            miopen.handle(),
+            /*diffDesc=*/output_nd.handle(), output_data.opaque(),
+            /*srcDesc=*/input_nd.handle(), input_data.opaque(),
+            /*convDesc=*/conv.handle(),
+            /*gradDesc=*/filter.handle(), filter_data.opaque(),
+            /*requestAlgoCount=*/1, /*returnedAlgoCount=*/&returnedAlgoCount,
+            /*preference=*/&preference,
+            /*WorkSpace=*/scratch_memory_temp.opaque(),
+            /*WorkSpaceSize=*/scratch_memory_temp.size(),
+            /*exhaustiveSearch=*/false);
+        CHECK_EQ(status, miopenStatusSuccess) << "Unable to find a suitable "
+                                                 "algorithm for doing backward "
+                                                 "filter convolution";
+        *algorithm_desc =
+            dnn::AlgorithmDesc(preference.bwd_weights_algo, false);
+        break;
+      }
+      default:
+        return port::InternalError(absl::StrCat("Unexpected convolution kind ",
+                                                static_cast<int>(kind)));
+    }
+
+    // Restore default allocator, note mac is stack temp
+    wrap::miopenSetAllocator(miopen.handle(), nullptr, nullptr, nullptr);
+
+    scratch_memory_size = preference.memory;
+  } else {
+    // An algorithm has been specified.
+    *algorithm_desc = *algo_desc;
+    // commenting this line out for the upstream repo, since
+    // AlgorithmConfig::scratch_size_ has been removed in the upstream repo but
+    // is still used in the ROCM develop-upstream repo
+    //
+    // scratch_memory_size = *(algorithm_config.scratch_size());
+    //
+  }
+
+  // allocate scratch memory
+  if (scratch_memory_size != 0) {
+    if (scratch_allocator == nullptr) {
+      return port::InternalError(
+          absl::StrCat("An allocator must be specified when scratch memory is "
+                       "needed"));
+    }
+    auto allocated =
+        scratch_allocator->AllocateBytes(stream, scratch_memory_size);
+    if (!allocated.ok()) {
+      return port::InternalError(absl::StrCat(
+          "Failed to allocate scratch memory of size: ", scratch_memory_size));
+    }
+    if (allocated.ok()) {
+      *scratch_memory = allocated.ValueOrDie();
+    }
+  }
+
+  return port::Status::OK();
+}
+
+// NOTE(keveman): Temporary data layout transformation until MIOpen supports
+// kBatchYXDepth for backward pass. This function allocates temporary memory,
+// lays out the source data into the temporary but in the kBatchDepthXY
+// layout, and returns the temporary memory. The caller is responsible for
+// deallocating the temporary. Since the allocation is done using Stream's
+// AllocateTemporaryMemory, a later BlockHostUntilDone could be used for
+// deallocation.
+//
+// transform_scratch is populated with a legitimate temporary allocation iff
+// the original output data needs to be transformed.
+static DeviceMemoryBase MaybeTransformLayout(
+    Stream* stream, miopenHandle_t handle_,
+    int miopen_type,  // Actually miopenDataType_t.
+    BatchDescriptor* output_descriptor, DeviceMemoryBase backward_output_data,
+    std::unique_ptr<TemporaryDeviceMemory<uint8>>* transform_scratch) {
+  if (output_descriptor->layout() == dnn::DataLayout::kBatchDepthYX) {
+    return backward_output_data;
+  }
+  CHECK(output_descriptor->layout() == dnn::DataLayout::kBatchYXDepth);
+  *transform_scratch =
+      stream->AllocateTemporaryArray<uint8>(backward_output_data.size())
+          .ConsumeValueOrDie();
+  BatchDescriptor transformed_output_descriptor;
+  transformed_output_descriptor.CloneFrom(*output_descriptor);
+  transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX);
+  ScopedTensorDescriptor orig_out_back_nd{
+      *output_descriptor, static_cast<miopenDataType_t>(miopen_type)};
+  ScopedTensorDescriptor transformed_out_back_nd{
+      transformed_output_descriptor,
+      static_cast<miopenDataType_t>(miopen_type)};
+
+  float alpha1 = 1.0f;
+  float alpha2 = 0.0f;
+  float beta = 0.0f;
+  auto status = wrap::miopenOpTensor(
+      handle_, miopenTensorOpAdd, &alpha1, orig_out_back_nd.handle(),
+      backward_output_data.opaque(), &alpha2, orig_out_back_nd.handle(),
+      backward_output_data.opaque(), &beta, transformed_out_back_nd.handle(),
+      (*transform_scratch)->mutable_device_memory()->opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(FATAL) << "Failed to transform the data layout.";
+  }
+  output_descriptor->set_layout(dnn::DataLayout::kBatchDepthYX);
+  return (*transform_scratch)->device_memory();
+}
+
+port::Status MIOpenSupport::DoConvolve(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
+    dnn::ProfileResult* output_profile_result) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+  ScopedTensorDescriptor input_nd{input_descriptor,
+                                  ToMIOpenDataType(element_type)};
+  ScopedTensorDescriptor output_nd{output_descriptor,
+                                   ToMIOpenDataType(element_type)};
+  ScopedFilterDescriptor filter{filter_descriptor, input_descriptor,
+                                ToMIOpenDataType(element_type)};
+  ScopedConvolutionDescriptor conv{convolution_descriptor,
+                                   ToMIOpenDataType(element_type)};
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  const bool is_profiling = output_profile_result != nullptr;
+
+  std::unique_ptr<GpuTimer> timer;
+  if (is_profiling) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init()) {
+      return port::Status(port::error::INTERNAL, "Failed to init timer");
+    }
+    // The start and stop of the timer should be as close to the MIOpen call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsGpuStream(stream))) {
+      timer->Destroy();
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
+    }
+  }
+
+  miopenStatus_t status = miopenStatusSuccess;
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      status = wrap::miopenConvolutionForward(
+          miopen.handle(),
+          /*alpha=*/&alpha, /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
+          /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+          /*algo=*/
+          static_cast<miopenConvFwdAlgorithm_t>(algorithm_desc.algo_id()),
+          /*beta=*/&beta, /*destDesc=*/output_nd.handle(),
+          /*destData=*/output_data.opaque(),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size());
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      // TBD: remove once MIOpen supports kBatchYXDepth for backward pass.
+      BatchDescriptor output_back_descriptor;
+      output_back_descriptor.CloneFrom(output_descriptor);
+      std::unique_ptr<TemporaryDeviceMemory<uint8>> transform_scratch;
+      output_data = MaybeTransformLayout(
+          stream, miopen.handle(), ToMIOpenDataType(element_type),
+          &output_back_descriptor, output_data, &transform_scratch);
+
+      status = wrap::miopenConvolutionBackwardData(
+          miopen.handle(),
+          /*alpha=*/&alpha,
+          /*diffDesc=*/output_nd.handle(),
+          /*diffData=*/output_data.opaque(),
+          /*filterDesc=*/filter.handle(),
+          /*filterData=*/filter_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/
+          static_cast<miopenConvBwdDataAlgorithm_t>(algorithm_desc.algo_id()),
+          /*beta=*/&beta,
+          /*gradDesc=*/input_nd.handle(),
+          /*gradData=*/input_data.opaque(),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size());
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      // TBD: remove once MIOpen supports kBatchYXDepth for backward pass.
+      BatchDescriptor output_back_descriptor;
+      output_back_descriptor.CloneFrom(output_descriptor);
+      std::unique_ptr<TemporaryDeviceMemory<uint8>> transform_scratch;
+      output_data = MaybeTransformLayout(
+          stream, miopen.handle(), ToMIOpenDataType(element_type),
+          &output_back_descriptor, output_data, &transform_scratch);
+
+      status = wrap::miopenConvolutionBackwardWeights(
+          miopen.handle(),
+          /*alpha=*/&alpha,
+          /*diffDesc=*/output_nd.handle(),
+          /*diffData=*/output_data.opaque(),
+          /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/
+          static_cast<miopenConvBwdWeightsAlgorithm_t>(
+              algorithm_desc.algo_id()),
+          /*beta=*/&beta,
+          /*gradDesc=*/filter.handle(),
+          /*gradData=*/filter_data.opaque(),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size());
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
+
+  if (is_profiling) {
+    if (!timer->Stop(AsGpuStream(stream))) {
+      timer->Destroy();
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
+    }
+    if (status == miopenStatusSuccess) {
+      dnn::AlgorithmDesc algotype(algorithm_desc.algo_id(), false);
+      output_profile_result->set_algorithm(algotype);
+      output_profile_result->set_elapsed_time_in_ms(
+          timer->GetElapsedMilliseconds());
+      output_profile_result->set_scratch_size(scratch_memory.size());
+    }
+    timer->Destroy();
+  }
+
+  if (status != miopenStatusSuccess) {
+    return port::InternalError(absl::StrCat(
+        "Failed to euqueue convolution on stream: ", ToString(status)));
+  }
+
+  return port::Status::OK();
+}
+
+bool MIOpenSupport::GetConvolveAlgorithms(
+    // ROCM TODO: refactor cc_major / cc_minor
+    bool with_winograd_nonfused, int cc_major, int cc_minor,
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  out_algorithms->assign({
+      // clang-format off
+      dnn::AlgorithmDesc(miopenConvolutionFwdAlgoGEMM, false),
+      dnn::AlgorithmDesc(miopenConvolutionFwdAlgoDirect, false),
+      dnn::AlgorithmDesc(miopenConvolutionFwdAlgoFFT, false),
+      dnn::AlgorithmDesc(miopenConvolutionFwdAlgoWinograd, false),
+      // clang-format on
+  });
+  return true;
+}
+
+bool MIOpenSupport::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  // ROCM TODO: implement this with proper MIOpen API
+  return true;
+}
+
+bool MIOpenSupport::GetConvolveBackwardDataAlgorithms(
+    // ROCM TODO: refactor cc_major / cc_minor
+    bool with_winograd_nonfused, int cc_major, int cc_minor,
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  out_algorithms->assign({
+      // clang-format off
+      dnn::AlgorithmDesc(miopenConvolutionBwdDataAlgoGEMM, false),
+      dnn::AlgorithmDesc(miopenConvolutionBwdDataAlgoDirect, false),
+      dnn::AlgorithmDesc(miopenConvolutionBwdDataAlgoFFT, false),
+      dnn::AlgorithmDesc(miopenConvolutionBwdDataAlgoWinograd, false),
+      // clang-format on
+  });
+  return true;
+}
+
+bool MIOpenSupport::GetConvolveBackwardFilterAlgorithms(
+    // ROCM TODO: refactor cc_major / cc_minor
+    bool with_winograd_nonfused, int cc_major, int cc_minor,
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  out_algorithms->assign({
+      // clang-format off
+      dnn::AlgorithmDesc(miopenConvolutionBwdWeightsAlgoGEMM, false),
+      dnn::AlgorithmDesc(miopenConvolutionBwdWeightsAlgoDirect, false),
+      // clang-format on
+  });
+  return true;
+}
+
+bool MIOpenSupport::DoBatchNormalizationForward(
+    Stream* stream, const DeviceMemory<Eigen::half>& x,
+    const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+    const DeviceMemory<float>& estimated_mean,
+    const DeviceMemory<float>& estimated_variance,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
+    DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+    DeviceMemory<float>* saved_inv_var, bool is_training,
+    std::function<const DeviceMemory<float>&()> var_to_inv_var,
+    std::function<void()> inv_var_to_var) {
+  return DoBatchNormalizationForwardImpl<Eigen::half, float>(
+      stream, dnn::DataType::kHalf, dnn::DataType::kFloat, x, scale, offset,
+      estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
+      batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
+      std::move(var_to_inv_var), std::move(inv_var_to_var));
+}
+
+bool MIOpenSupport::DoBatchNormalizationForward(
+    Stream* stream, const DeviceMemory<float>& x,
+    const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+    const DeviceMemory<float>& estimated_mean,
+    const DeviceMemory<float>& estimated_variance,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
+    DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+    DeviceMemory<float>* saved_inv_var, bool is_training,
+    std::function<const DeviceMemory<float>&()> var_to_inv_var,
+    std::function<void()> inv_var_to_var) {
+  return DoBatchNormalizationForwardImpl<float, float>(
+      stream, dnn::DataType::kFloat, dnn::DataType::kFloat, x, scale, offset,
+      estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
+      batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
+      std::move(var_to_inv_var), std::move(inv_var_to_var));
+}
+
+template <class T, class U>
+bool MIOpenSupport::DoBatchNormalizationForwardImpl(
+    Stream* stream, dnn::DataType input_data_type,
+    dnn::DataType scale_data_type, const DeviceMemory<T>& x,
+    const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
+    const DeviceMemory<U>& estimated_mean,
+    const DeviceMemory<U>& estimated_variance,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<T>* y, DeviceMemory<U>* batch_mean, DeviceMemory<U>* batch_var,
+    DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var,
+    bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
+    std::function<void()> inv_var_to_var) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor x_descriptor{x_desc,
+                                      ToMIOpenDataType(input_data_type)};
+  ScopedTensorDescriptor scale_offset_descriptor{
+      scale_offset_desc, ToMIOpenDataType(scale_data_type)};
+  miopenBatchNormMode_t mode = miopenBNSpatial;
+  float one = 1.0;
+  float zero = 0.0;
+
+  auto status = miopenStatusInvalidValue;
+  if (is_training) {
+    stream->ThenMemZero(batch_mean, batch_mean->size());
+    stream->ThenMemZero(batch_var, batch_var->size());
+    status = wrap::miopenBatchNormalizationForwardTraining(
+        miopen.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
+        x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
+        const_cast<void*>(scale.opaque()), const_cast<void*>(offset.opaque()),
+        1.0, batch_mean->opaque(), batch_var->opaque(), epsilon,
+        saved_mean->opaque(), saved_inv_var->opaque());
+  } else {
+    const void* maybe_inv_var = estimated_variance.opaque();
+    status = wrap::miopenBatchNormalizationForwardInference(
+        miopen.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
+        x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
+        const_cast<void*>(scale.opaque()), const_cast<void*>(offset.opaque()),
+        const_cast<void*>(estimated_mean.opaque()),
+        const_cast<void*>(maybe_inv_var), epsilon);
+  }
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue forward batch normalization on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoBatchNormalizationBackward(
+    Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
+    const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop,
+    DeviceMemory<float>* offset_backprop) {
+  return DoBatchNormalizationBackwardImpl<Eigen::half, float>(
+      stream, miopenHalf, miopenFloat, y_backprop, x, scale, mean, inv_var,
+      x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      offset_backprop);
+}
+
+bool MIOpenSupport::DoBatchNormalizationBackward(
+    Stream* stream, const DeviceMemory<float>& y_backprop,
+    const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+    const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
+    DeviceMemory<float>* offset_backprop) {
+  return DoBatchNormalizationBackwardImpl<float, float>(
+      stream, miopenFloat, miopenFloat, y_backprop, x, scale, mean, variance,
+      x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      offset_backprop);
+}
+
+template <class T, class U>
+bool MIOpenSupport::DoBatchNormalizationBackwardImpl(
+    Stream* stream, int miopen_input_type, int miopen_scale_type,
+    const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
+    const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
+    const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+    const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+    DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
+    DeviceMemory<U>* offset_backprop) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+  ScopedTensorDescriptor x_descriptor{
+      x_desc, static_cast<miopenDataType_t>(miopen_input_type)};
+  ScopedTensorDescriptor scale_offset_descriptor{
+      scale_offset_desc, static_cast<miopenDataType_t>(miopen_scale_type)};
+  miopenBatchNormMode_t mode = miopenBNSpatial;
+  float one = 1.0;
+  float zero = 0.0;
+
+  auto status = wrap::miopenBatchNormalizationBackward(
+      miopen.handle(), mode, &one, &zero, &one, &zero, x_descriptor.handle(),
+      x.opaque(), x_descriptor.handle(), y_backprop.opaque(),
+      x_descriptor.handle(), x_backprop->opaque(),
+      scale_offset_descriptor.handle(), scale.opaque(),
+      scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
+      mean.opaque(), variance.opaque());
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue backward batch normalization on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoFusedConvolve(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<double>& conv_input_data, double conv_input_scale,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<double>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<double>& side_input_data, double side_input_scale,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "fused convolve not implemented yet";
+  return false;
+}
+
+bool MIOpenSupport::DoFusedConvolve(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<float>& conv_input_data, float conv_input_scale,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<float>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<float>& side_input_data, float side_input_scale,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "fused convolve not implemented yet";
+  return false;
+}
+
+bool MIOpenSupport::DoFusedConvolve(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<Eigen::half>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<Eigen::half>& biases,
+    dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "fused convolve not implemented yet";
+  return false;
+}
+
+bool MIOpenSupport::DoFusedConvolve(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<int8>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<int8>& side_input_data, float side_input_scale,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  LOG(ERROR) << "fused convolve not implemented yet";
+  return false;
+}
+
+bool MIOpenSupport::DoTransformTensor(Stream* stream,
+                                      const dnn::BatchDescriptor& input_desc,
+                                      dnn::DataType input_type,
+                                      const DeviceMemoryBase& input_data,
+                                      const dnn::BatchDescriptor& output_desc,
+                                      dnn::DataType output_type, float scale,
+                                      DeviceMemoryBase* output_data) {
+  // ROCM TODO implement this operation
+  LOG(ERROR) << "transform tensor not implemented yet";
+  return false;
+}
+
+template <class T>
+bool MIOpenSupport::DoConvolveBackwardBiasImpl(
+    Stream* stream, int miopen_type,  // Actually miopenDataType_t.
+    const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<T>& input_data,
+    const dnn::BatchDescriptor& bias_descriptor,
+    DeviceMemory<T>* backward_bias_data) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor input_nd{input_descriptor,
+                                  static_cast<miopenDataType_t>(miopen_type)};
+  ScopedTensorDescriptor bias_nd{bias_descriptor,
+                                 static_cast<miopenDataType_t>(miopen_type)};
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  auto status = wrap::miopenConvolutionBackwardBias(
+      miopen.handle(), &alpha, input_nd.handle(), input_data.opaque(), &beta,
+      bias_nd.handle(), backward_bias_data->opaque());
+  if (status != miopenStatusSuccess) {
+    LOG(FATAL) << "failed to enqueue backward convolution on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<double>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<double>* backward_bias_data) {
+  LOG(ERROR) << "miopen does not support double bwd bias yet";
+  return false;
+}
+
+bool MIOpenSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<float>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<float>* backward_bias_data) {
+  return DoConvolveBackwardBiasImpl(stream, miopenFloat, input_descriptor,
+                                    input_data, bias_descriptor,
+                                    backward_bias_data);
+}
+
+bool MIOpenSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<Eigen::half>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<Eigen::half>* backward_bias_data) {
+  return DoConvolveBackwardBiasImpl(stream, miopenHalf, input_descriptor,
+                                    input_data, bias_descriptor,
+                                    backward_bias_data);
+}
+
+bool MIOpenSupport::DoMatMul(Stream* stream,
+                             const DeviceMemory<float>& input_data,
+                             const DeviceMemory<float>& weights,
+                             const dnn::BatchDescriptor& input_dimensions,
+                             const dnn::BatchDescriptor& output_dimensions,
+                             DeviceMemory<float>* output_data) {
+  if (input_dimensions.count() != output_dimensions.count()) {
+    LOG(ERROR) << "MatMul input and output dimensions are not compatible.";
+    return false;
+  }
+
+  // We do not permute the input or output, instead we just
+  // reinterpret the layout. We are working with row-major matrices
+  // and the rows of the input and output correspond to batch, so
+  // batch has to be outermost in both the input and output.
+  //
+  // By adding transposes to the BLAS gemm call we could perhaps make
+  // the kYXDepthBatch layout work as well, but there has been no need
+  // for that so far.
+  if (input_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
+      input_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) {
+    LOG(ERROR) << "Unsupported MatMul input layout.";
+    return false;
+  }
+  if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
+      output_dimensions.layout() != dnn::DataLayout::kBatchDepthYX) {
+    LOG(ERROR) << "Unsupported MatMul output layout.";
+    return false;
+  }
+
+  if (output_dimensions.width() == 1 && output_dimensions.height() == 1) {
+    // This is a fast path that also supports the kBatchYXDepth layout.
+
+    // The matrices here are in row-major format while BLAS expects
+    // column-major, i.e. our matrices are transposed as far as BLAS
+    // is concerned. So we need to compute output^T =
+    // input^T*weights^T. There is no parameter for transposing the
+    // output in BLAS gemm, but instead we can transpose both sides of
+    // the equality to see that this is equivalent to
+    // output=weights*input. So we only need to swap the order of
+    // weights and input in the matrix product to correct for the
+    // row-major versus column-major difference.
+    const float alpha = 1.0f;  // Take the matrix product without scaling it.
+    const float beta = 0.0f;   // Ignore the original values in output_data.
+    const int64 m = output_dimensions.NodesAcrossFeatureMaps();
+    const int64 n = input_dimensions.count();
+    const int64 k = input_dimensions.NodesAcrossFeatureMaps();
+    stream->ThenBlasGemm(blas::Transpose::kNoTranspose,
+                         blas::Transpose::kNoTranspose, m, n, k, alpha, weights,
+                         m, input_data, k, beta, output_data, m);
+  } else {
+    // This is a slower and more complex path that supports output
+    // width() * height() > 1, though it only supports the
+    // kBatchYXDepth layout. Does support kBatchDepthYX if output
+    // feature_map_count() == 1, as then there is no difference
+    // between the two layouts.
+    //
+    // The operation here is the same as above, except that we have to
+    // do the matrix multiplication for each (y,x) output coordinate
+    // separately. We then interpret weights as containing K = width()
+    // * height() different matrices, which we all multiply onto the
+    // matrix from input_data, yielding K matrix products. We then
+    // combine these together into one matrix by concatenating all the
+    // first rows of these matrices, then all the seconds rows and so
+    // on. We can do this with a batched matrix multiplication, where
+    // the result is written to a different submatrix of the output
+    // for each matrix multiplication.
+    //
+    // The reason that we only support the kBatchYXDepth output layout
+    // is that we have to do something in the depth for each (y,x)
+    // coordinate. The kBatchYXDepth layout has the depth information
+    // for each point (y,x) in contiguous memory while the
+    // kBatchDepthYX layout does not.
+    //
+    // TODO(broune): Consider a special case for when output depth ==
+    // 1, as then possibly this could all be done as one matrix
+    // multiplication instead of a batched one, which should be
+    // faster. Another possibility would be to add a weights layout
+    // parameter and then support kBatchDepthYX for a different
+    // weights layout.
+    if (output_dimensions.layout() != dnn::DataLayout::kBatchYXDepth &&
+        !(output_dimensions.layout() == dnn::DataLayout::kBatchDepthYX &&
+          output_dimensions.feature_map_count() == 1)) {
+      LOG(ERROR) << "Unsupported MatMul output layout.";
+      return false;
+    }
+
+    const float alpha = 1.0f;  // Take the matrix product without scaling it.
+    const float beta = 0.0f;   // Ignore the original values in output_data.
+    const uint64 m = output_dimensions.feature_map_count();
+    const uint64 n = input_dimensions.count();
+    const uint64 k = input_dimensions.NodesAcrossFeatureMaps();
+    const int lda = m;
+    const int ldb = k;
+    const int ldc = output_dimensions.NodesAcrossFeatureMaps();
+    const int batch_count = output_dimensions.NodesPerFeatureMap();
+
+    std::vector<DeviceMemory<float>> a(batch_count);
+    std::vector<DeviceMemory<float>> b(batch_count);
+    std::vector<DeviceMemory<float>> c(batch_count);
+    for (int i = 0; i < batch_count; ++i) {
+      const int weights_offset = i * input_dimensions.NodesAcrossFeatureMaps() *
+                                 output_dimensions.feature_map_count();
+      a[i] = DeviceMemory<float>::MakeFromByteSize(
+          const_cast<float*>(reinterpret_cast<const float*>(weights.opaque())) +
+              weights_offset,
+          weights.ElementCount() - weights_offset);
+
+      b[i] = input_data;
+
+      const int output_offset = i * output_dimensions.feature_map_count();
+      c[i] = DeviceMemory<float>::MakeFromByteSize(
+          const_cast<float*>(
+              reinterpret_cast<const float*>(output_data->opaque())) +
+              output_offset,
+          output_data->ElementCount() - output_offset);
+    }
+    const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) {
+      std::vector<DeviceMemory<float>*> ptrs;
+      ptrs.reserve(v.size());
+      for (auto& mem : v) {
+        ptrs.push_back(&mem);
+      }
+      return ptrs;
+    };
+
+    stream->ThenBlasGemmBatched(blas::Transpose::kNoTranspose,
+                                blas::Transpose::kNoTranspose, m, n, k, alpha,
+                                toPtrs(a), lda, toPtrs(b), ldb, beta, toPtrs(c),
+                                ldc, batch_count);
+  }
+
+  return stream->ok();
+}
+
+bool MIOpenSupport::DoBiasAdd(Stream* stream,
+                              const DeviceMemory<float>& input_data,
+                              const DeviceMemory<float>& biases,
+                              const dnn::BatchDescriptor& dimensions,
+                              DeviceMemory<float>* output_data) {
+  ScopedTensorDescriptor input_descriptor{dimensions, miopenFloat};
+
+  BatchDescriptor bias_dimensions;
+  bias_dimensions.set_count(1)
+      .set_feature_map_count(dimensions.feature_map_count())
+      .set_height(1)
+      .set_width(1)
+      .set_layout(dnn::DataLayout::kBatchYXDepth);
+  ScopedTensorDescriptor bias_descriptor{bias_dimensions, miopenFloat};
+
+  if (input_data.opaque() != output_data->opaque()) {
+    stream->ThenMemcpy(output_data, input_data,
+                       dimensions.ElementCount() * sizeof(float));
+    if (!stream->ok()) {
+      LOG(ERROR)
+          << "stream " << stream
+          << " could not enqueue a tensor copy as part of bias addition.";
+      return false;
+    }
+  }
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  const float alpha1 = 1.0f;
+  const float alpha2 = 0.0f;
+  const float beta = 1.0f;
+
+  auto status = wrap::miopenOpTensor(
+      miopen.handle(), miopenTensorOpAdd, &alpha1, bias_descriptor.handle(),
+      biases.opaque(), &alpha2, bias_descriptor.handle(), biases.opaque(),
+      &beta, input_descriptor.handle(), output_data->opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
+    return false;
+  }
+
+  return true;
+}
+
+bool MIOpenSupport::DoActivate(Stream* stream,
+                               dnn::ActivationMode activation_mode,
+                               const dnn::BatchDescriptor& dimensions,
+                               const DeviceMemory<float>& input_data,
+                               DeviceMemory<float>* output_data,
+                               uint64 options) {
+  LOG(ERROR) << "miopen does not support activation yet";
+  return false;
+}
+
+bool MIOpenSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<double>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<double>* output_data, ScratchAllocator* workspace_allocator) {
+  LOG(ERROR) << "miopen does not support pooling for dobule type yet";
+  return false;
+}
+
+bool MIOpenSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<float>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<float>* output_data, ScratchAllocator* workspace_allocator) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{input_dimensions, miopenFloat};
+  ScopedTensorDescriptor dest_desc{output_dimensions, miopenFloat};
+  ScopedPoolingDescriptor pooling_desc{pooling_dimensions};
+
+  auto status = wrap::miopenPoolingForward(
+      miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque(),
+      false, nullptr, 0);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<Eigen::half>* output_data,
+    ScratchAllocator* workspace_allocator) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{input_dimensions, miopenHalf};
+  ScopedTensorDescriptor dest_desc{output_dimensions, miopenHalf};
+  ScopedPoolingDescriptor pooling_desc{pooling_dimensions};
+
+  auto status = wrap::miopenPoolingForward(
+      miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque(),
+      false, nullptr, 0);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoPoolBackward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<double>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    const DeviceMemory<double>& output_data,
+    const DeviceMemory<double>& input_diff_data,
+    DeviceMemory<double>* output_diff_data,
+    ScratchAllocator* workspace_allocator) {
+  LOG(ERROR) << "miopen does not support backward pooling on double type yet";
+  return false;
+}
+
+bool MIOpenSupport::DoPoolBackward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<float>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    const DeviceMemory<float>& output_data,
+    const DeviceMemory<float>& input_diff_data,
+    DeviceMemory<float>* output_diff_data,
+    ScratchAllocator* workspace_allocator) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{input_dimensions, miopenFloat};
+  ScopedTensorDescriptor dest_desc{output_dimensions, miopenFloat};
+  ScopedPoolingDescriptor pooling_desc{pooling_dimensions};
+
+  DeviceMemory<uint8> workspace;
+  size_t workspace_size_in_bytes = 0;
+  auto status = wrap::miopenPoolingGetWorkSpaceSize(dest_desc.handle(),
+                                                    &workspace_size_in_bytes);
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR)
+        << "failed to obtain workspace size for backward pooling on stream: "
+        << ToString(status);
+    return false;
+  }
+
+  // Allocate the workspace.
+  if (workspace_size_in_bytes > 0) {
+    assert(workspace_allocator);
+    auto allocated =
+        workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
+    if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate backward pooling workspace";
+      return false;
+    }
+  }
+
+  DeviceMemory<uint8> dest2;  // duplicated dest from forward:
+  int dest2_size = 0;
+
+  // miopen requires the strides and dims to be ordered as BDYX.
+  std::vector<int64> dims64 =
+      output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX);
+
+  // miopen does not use strides and must have 4D tensor.
+  std::vector<int> dims(4);
+
+  std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
+                 &CheckedNarrowing<int64, int>);
+
+  dest2_size = dims[0] * dims[1] * dims[2] * dims[3] * sizeof(float);
+
+  if (dest2_size > 0) {
+    assert(workspace_allocator);
+    auto allocated = workspace_allocator->AllocateBytes(stream, dest2_size);
+    if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate backward pooling workspace";
+      return false;
+    }
+  } else {
+    LOG(ERROR) << "Failed to calcuate tensor size to chain forward and "
+                  "backward pooling";
+  }
+
+  status = wrap::miopenPoolingForward(
+      miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), dest2.opaque(), true,
+      workspace.opaque(), workspace_size_in_bytes);
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR)
+        << "failed to enqueue forward pooling (before backward) on stream: "
+        << ToString(status);
+    return false;
+  }
+
+  status = wrap::miopenPoolingBackward(
+      miopen.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+      dest2.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+      output_diff_data->opaque(), workspace.opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoPoolBackward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    const DeviceMemory<Eigen::half>& output_data,
+    const DeviceMemory<Eigen::half>& input_diff_data,
+    DeviceMemory<Eigen::half>* output_diff_data,
+    ScratchAllocator* workspace_allocator) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{input_dimensions, miopenHalf};
+  ScopedTensorDescriptor dest_desc{output_dimensions, miopenHalf};
+  ScopedPoolingDescriptor pooling_desc{pooling_dimensions};
+
+  DeviceMemory<uint8> workspace;
+  size_t workspace_size_in_bytes = 0;
+  auto status = wrap::miopenPoolingGetWorkSpaceSize(dest_desc.handle(),
+                                                    &workspace_size_in_bytes);
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR)
+        << "failed to obtain workspace size for backward pooling on stream: "
+        << ToString(status);
+    return false;
+  }
+
+  // Allocate the workspace.
+  if (workspace_size_in_bytes > 0) {
+    assert(workspace_allocator);
+    auto allocated =
+        workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
+    if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate backward pooling workspace";
+      return false;
+    }
+  }
+
+  DeviceMemory<uint8> dest2;  // duplicated dest from forward:
+  int dest2_size = 0;
+
+  // miopen requires the strides and dims to be ordered as BDYX.
+  std::vector<int64> dims64 =
+      output_dimensions.full_dims(dnn::DataLayout::kBatchDepthYX);
+
+  // miopen does not use strides and must have 4D tensor.
+  std::vector<int> dims(4);
+
+  std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
+                 &CheckedNarrowing<int64, int>);
+
+  dest2_size = dims[0] * dims[1] * dims[2] * dims[3] * sizeof(float);
+
+  if (dest2_size > 0) {
+    assert(workspace_allocator);
+    auto allocated = workspace_allocator->AllocateBytes(stream, dest2_size);
+    if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate backward pooling workspace";
+      return false;
+    }
+  } else {
+    LOG(ERROR) << "Failed to calcuate tensor size to chain forward and "
+                  "backward pooling";
+  }
+
+  status = wrap::miopenPoolingForward(
+      miopen.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+      input_data.opaque(), &beta, dest_desc.handle(), dest2.opaque(), true,
+      workspace.opaque(), workspace_size_in_bytes);
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR)
+        << "failed to enqueue forward pooling (before backward) on stream: "
+        << ToString(status);
+    return false;
+  }
+
+  status = wrap::miopenPoolingBackward(
+      miopen.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+      dest2.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+      output_diff_data->opaque(), workspace.opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoNormalizeWithDimensions(
+    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
+    const dnn::BatchDescriptor& dimensions,
+    const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
+  // Check for unsupported modes.
+  if (normalize_descriptor.wrap_around()) {
+    LOG(ERROR) << "MIOpen LRN does not support wrap-around mode";
+    return false;
+  }
+  if (normalize_descriptor.segment_size()) {
+    LOG(ERROR) << "MIOpen LRN does not support segmentation";
+    return false;
+  }
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  // Launch the normalization.
+  ScopedTensorDescriptor dims{dimensions, miopenFloat};
+  ScopedNormalizeDescriptor normalize{normalize_descriptor};
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0f;
+  // Beta is the scaling factor for output.
+  float beta = 0.0f;
+
+  auto status = wrap::miopenLRNForward(
+      miopen.handle(), normalize.handle(), &alpha, dims.handle(),
+      input_data.opaque(), &beta, dims.handle(), output_data->opaque(), false,
+      nullptr);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to run miopenLRNForward";
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoNormalizeBackwardWithDimensions(
+    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
+    const dnn::BatchDescriptor& dimensions, const DeviceMemory<float>& raw_data,
+    const DeviceMemory<float>& normalized_data,
+    const DeviceMemory<float>& normalized_variable_gradient,
+    DeviceMemory<float>* raw_variable_gradient,
+    ScratchAllocator* workspace_allocator) {
+  // Check for unsupported modes.
+  if (normalize_descriptor.wrap_around()) {
+    LOG(ERROR) << "MIOpen LRN does not support wrap-around mode";
+    return false;
+  }
+  if (normalize_descriptor.segment_size()) {
+    LOG(ERROR) << "MIOpen LRN does not support segmentation";
+    return false;
+  }
+
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor dims{dimensions, miopenFloat};
+  ScopedNormalizeDescriptor normalize{normalize_descriptor};
+
+  float alpha = 1.0f;
+  float beta = 0.0f;
+
+  DeviceMemory<uint8> workspace;
+  size_t workspace_size_in_bytes = 0;
+  auto status =
+      wrap::miopenLRNGetWorkSpaceSize(dims.handle(), &workspace_size_in_bytes);
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to obtain workspace size for miopenLRNBackward";
+    return false;
+  }
+
+  // Allocate the workspace.
+  if (workspace_size_in_bytes > 0) {
+    assert(workspace_allocator);
+    auto allocated =
+        workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
+    if (!allocated.ok() || (workspace = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "Failed to allocate backward pooling workspace";
+      return false;
+    }
+  }
+
+  DeviceMemory<uint8> dest2;  // duplicated dest from forward:
+  int dest2_size = 0;
+
+  // miopen requires the strides and dims to be ordered as BDYX.
+  std::vector<int64> dims64 =
+      dimensions.full_dims(dnn::DataLayout::kBatchDepthYX);
+
+  // miopen does not use strides and must have 4D tensor.
+  std::vector<int> dimsint(4);
+
+  std::transform(dims64.cbegin(), dims64.cend(), dimsint.begin(),
+                 &CheckedNarrowing<int64, int>);
+
+  dest2_size =
+      dimsint[0] * dimsint[1] * dimsint[2] * dimsint[3] * sizeof(float);
+
+  if (dest2_size > 0) {
+    assert(workspace_allocator);
+    auto allocated = workspace_allocator->AllocateBytes(stream, dest2_size);
+    if (!allocated.ok() || (dest2 = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR)
+          << "Failed to allocate tensor to chain forward and backward LRN";
+      return false;
+    }
+  } else {
+    LOG(ERROR)
+        << "Failed to calcuate tensor size to chain forward and backward LRN";
+  }
+
+  status = wrap::miopenLRNForward(miopen.handle(), normalize.handle(), &alpha,
+                                  dims.handle(), raw_data.opaque(), &beta,
+                                  dims.handle(), dest2.opaque(), true,
+                                  workspace.opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to run miopenLRNForward";
+    return false;
+  }
+
+  status = wrap::miopenLRNBackward(
+      miopen.handle(), normalize.handle(), &alpha, dims.handle(),
+      normalized_data.opaque(), dims.handle(),
+      normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
+      &beta, dims.handle(), raw_variable_gradient->opaque(),
+      workspace.opaque());
+
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "failed to run miopenLRNBackward";
+    return false;
+  }
+  return true;
+}
+
+bool MIOpenSupport::DoDepthConcatenate(
+    Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+    port::ArraySlice<const DeviceMemory<float>*> input_data,
+    DeviceMemory<float>* output_data) {
+  CHECK_EQ(input_dimensions.size(), input_data.size());
+
+  for (const auto& dimensions : input_dimensions) {
+    if (dimensions.layout() != dnn::DataLayout::kBatchDepthYX) {
+      LOG(ERROR) << "MIOpenSupport::DoDepthConcatenate currently only "
+                    "supports the kBatchDepthYX layout.";
+      return false;
+    }
+  }
+
+  if (input_dimensions.empty()) {
+    return true;  // Nothing to do.
+  }
+
+  dnn::BatchDescriptor output_dimensions =
+      dnn::BatchDescriptor::DepthConcatenateOutputDescriptor(input_dimensions);
+
+  const int64 area = output_dimensions.width() * output_dimensions.height();
+  const auto index = [area](int64 batch, int64 depth, int64 yx,
+                            int64 max_depth) {
+    return (batch * max_depth + depth) * area + yx;
+  };
+
+  std::vector<float> output_host(output_dimensions.ElementCount());
+  std::vector<float> tmp;
+  int64 depth_sum = 0;
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    const auto& dimensions = input_dimensions[i];
+    tmp.resize(dimensions.ElementCount());
+    stream->ThenMemcpyD2H<float>(*input_data[i], absl::MakeSpan(tmp));
+    port::Status block_status = stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      LOG(ERROR) << "BlockHostUntilDone failed: " << block_status;
+      return false;
+    }
+
+    for (int64 batch = 0; batch < output_dimensions.count(); ++batch) {
+      for (int64 yx = 0; yx < area; ++yx) {
+        for (int64 depth = 0; depth < dimensions.feature_map_count(); ++depth) {
+          LOG(INFO) << output_dimensions.ElementCount() << ' ' << batch << ' '
+                    << yx << ' ' << depth;
+          output_host[index(batch, depth + depth_sum, yx,
+                            output_dimensions.feature_map_count())] =
+              tmp[index(batch, depth, yx, dimensions.feature_map_count())];
+        }
+      }
+    }
+    depth_sum += dimensions.feature_map_count();
+  }
+  stream->ThenMemcpyH2D<float>(output_host, output_data);
+  return true;
+}
+
+bool MIOpenSupport::DoElementwiseOperate(
+    Stream* stream, dnn::ElementwiseOperation operation,
+    port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+    port::ArraySlice<const DeviceMemory<float>*> input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<float>* output_data) {
+  LOG(FATAL) << "not yet implemented";  // TODO(leary)
+  return false;
+}
+
+bool MIOpenSupport::DoXYPad(Stream* stream,
+                            const dnn::BatchDescriptor& dimensions,
+                            const DeviceMemory<float>& input_data,
+                            int64 left_pad, int64 right_pad, int64 top_pad,
+                            int64 bottom_pad,
+                            DeviceMemory<float>* output_data) {
+  LOG(FATAL) << "not yet implemented";  // TODO(leary)
+  return false;
+}
+
+bool MIOpenSupport::DoXYSlice(Stream* stream,
+                              const dnn::BatchDescriptor& dimensions,
+                              const DeviceMemory<float>& input_data,
+                              int64 left_trim, int64 right_trim, int64 top_trim,
+                              int64 bottom_trim,
+                              DeviceMemory<float>* output_data) {
+  LOG(FATAL) << "not yet implemented";  // TODO(leary)
+  return false;
+}
+
+bool MIOpenSupport::DoMemcpyD2HQuantized(
+    Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
+    dnn::QuantizedActivationMode mode, void* host_dst, int64 size) {
+  LOG(ERROR) << "quantized memcpy not supported by MIOpen";
+  return false;
+}
+
+bool MIOpenSupport::DoMemcpyH2DQuantized(
+    Stream* stream, const void* host_src, int64 size,
+    dnn::QuantizedActivationMode mode,
+    DeviceMemory<float>* gpu_unquantized_dst) {
+  LOG(ERROR) << "quantized memcpy not supported by MIOpen";
+  return false;
+}
+
+bool MIOpenSupport::DeriveOutputBatchDescriptor(
+    const BatchDescriptor& batch_descriptor,
+    const FilterDescriptor& filter_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    dnn::BatchDescriptor* output_batch_descriptor) {
+  ScopedTensorDescriptor input_nd{batch_descriptor, miopenFloat};
+  ScopedFilterDescriptor filter{filter_descriptor, batch_descriptor,
+                                miopenFloat};
+  ScopedConvolutionDescriptor conv{convolution_descriptor, miopenFloat};
+
+  int dn = batch_descriptor.ndims() + 2;
+  std::vector<int> dims(dn);  // in BDYX
+  auto status = wrap::miopenGetConvolutionForwardOutputDim(
+      conv.handle(), input_nd.handle(), filter.handle(), &dims[0], &dims[1],
+      &dims[2], &dims[3]);
+  if (status != miopenStatusSuccess) {
+    LOG(ERROR) << "could not get output tensor for convolution: "
+               << ToString(status);
+    return false;
+  }
+
+  output_batch_descriptor->set_count(dims[0])
+      .set_feature_map_count(dims[1])
+      .set_layout(batch_descriptor.layout());
+
+  for (int i = 0; i < batch_descriptor.ndims(); i++) {
+    output_batch_descriptor->set_spatial_dim(static_cast<dnn::DimIndex>(i),
+                                             dims.rbegin()[i]);
+  }
+
+  return true;
+}
+
+template <typename T>
+bool MIOpenSupport::DoFusedConvolutionBiasActivationImpl(
+    Stream* stream,
+    int miopen_type,  // Actually miopenDataType_t.
+    const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<T>& conv_input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<T>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
+    dnn::ProfileResult* output_profile_result) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor conv_input_nd{
+      conv_input_descriptor, static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedTensorDescriptor bias_nd{bias_descriptor,
+                                 static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedTensorDescriptor output_nd{output_descriptor,
+                                   static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedConvolutionDescriptor conv{convolution_descriptor,
+                                   static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedFilterDescriptor filter{filter_descriptor, conv_input_descriptor,
+                                static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedActivationDescriptor activation_desc{activation_mode};
+
+  ScopedFusionPlanConvolutionBiasActivation fusion_plan{
+      miopen.handle(), conv_input_nd.handle(), filter.handle(),
+      conv.handle(),   bias_nd.handle(),       activation_desc};
+
+  bool retval = false;
+
+  if (fusion_plan.CompilationSucceeded()) {
+    const bool is_profiling = output_profile_result != nullptr;
+
+    std::unique_ptr<GpuTimer> timer;
+    if (is_profiling) {
+      timer.reset(new GpuTimer(parent_));
+      timer->Init();
+      timer->Start(AsGpuStream(stream));
+    }
+
+    miopenStatus_t status = miopenStatusSuccess;
+
+    if (status == miopenStatusSuccess) {
+      fusion_plan.SetConvolutionArgs(filter_data.opaque());
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.SetBiasArgs(bias_data.opaque());
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.SetActivationForwardArgs(activation_desc);
+    }
+
+    if (status == miopenStatusSuccess) {
+      status =
+          fusion_plan.Execute(conv_input_nd.handle(), conv_input_data.opaque(),
+                              output_nd.handle(), output_data->opaque());
+    }
+
+    if (is_profiling) {
+      timer->Stop(AsGpuStream(stream));
+      if (status == miopenStatusSuccess) {
+        output_profile_result->set_elapsed_time_in_ms(
+            timer->GetElapsedMilliseconds());
+      }
+      timer->Destroy();
+    }
+
+    if (status != miopenStatusSuccess) {
+      // Silently return when we are profiling.
+      if (!is_profiling) {
+        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
+                   << ToString(status);
+      }
+    }
+
+    retval = true;
+  }
+
+  return retval;
+}
+
+bool MIOpenSupport::DoFusedConvolutionBiasActivation(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<float>& conv_input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const DeviceMemory<float>& filter_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
+    const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemory<float>* output_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedConvolutionBiasActivationImpl<float>(
+      stream, miopenFloat, conv_input_descriptor, conv_input_data,
+      filter_descriptor, filter_data, convolution_descriptor, bias_descriptor,
+      bias_data, activation_mode, output_descriptor, output_data,
+      output_profile_result);
+}
+
+template <typename T, typename U>
+bool MIOpenSupport::DoFusedBatchNormActivationInferenceImpl(
+    Stream* stream,
+    int miopen_type,  // Actually miopenDataType_t.
+    const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+    const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
+    double epsilon, dnn::ActivationMode activation_mode,
+    DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor x_nd{x_descriptor,
+                              static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedTensorDescriptor scale_offset_mean_variance_nd{
+      scale_offset_mean_variance_descriptor,
+      static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedActivationDescriptor activation_desc{activation_mode};
+
+  ScopedFusionPlanBatchNormActivationInference fusion_plan{
+      miopen.handle(), x_nd.handle(), scale_offset_mean_variance_nd.handle(),
+      activation_desc};
+
+  bool retval = false;
+
+  if (fusion_plan.CompilationSucceeded()) {
+    const bool is_profiling = output_profile_result != nullptr;
+
+    std::unique_ptr<GpuTimer> timer;
+    if (is_profiling) {
+      timer.reset(new GpuTimer(parent_));
+      timer->Init();
+      timer->Start(AsGpuStream(stream));
+    }
+
+    miopenStatus_t status = miopenStatusSuccess;
+
+    if (status == miopenStatusSuccess) {
+      fusion_plan.SetBatchNormInferenceArgs(
+          scale_data.opaque(), offset_data.opaque(), mean_data.opaque(),
+          variance_data.opaque(), epsilon);
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.SetActivationForwardArgs(activation_desc);
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.Execute(x_nd.handle(), x_data.opaque(),
+                                   x_nd.handle(), y_data->opaque());
+    }
+
+    if (is_profiling) {
+      timer->Stop(AsGpuStream(stream));
+      if (status == miopenStatusSuccess) {
+        output_profile_result->set_elapsed_time_in_ms(
+            timer->GetElapsedMilliseconds());
+      }
+      timer->Destroy();
+    }
+
+    if (status != miopenStatusSuccess) {
+      // Silently return when we are profiling.
+      if (!is_profiling) {
+        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
+                   << ToString(status);
+      }
+    }
+
+    retval = true;
+  }
+
+  return retval;
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationInference(
+    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+    const DeviceMemory<float>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data,
+    const DeviceMemory<float>& mean_data,
+    const DeviceMemory<float>& variance_data, double epsilon,
+    dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationInferenceImpl<float, float>(
+      stream, miopenFloat, x_descriptor, x_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data, mean_data,
+      variance_data, epsilon, activation_mode, y_data, output_profile_result);
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationInference(
+    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+    const DeviceMemory<Eigen::half>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data,
+    const DeviceMemory<float>& mean_data,
+    const DeviceMemory<float>& variance_data, double epsilon,
+    dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationInferenceImpl<Eigen::half, float>(
+      stream, miopenHalf, x_descriptor, x_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data, mean_data,
+      variance_data, epsilon, activation_mode, y_data, output_profile_result);
+}
+
+template <typename T, typename U>
+bool MIOpenSupport::DoFusedBatchNormActivationForwardImpl(
+    Stream* stream,
+    int miopen_type,  // Actually miopenDataType_t.
+    const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+    double epsilon, dnn::ActivationMode activation_mode,
+    DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
+    DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
+    DeviceMemory<U>* saved_var_data,
+    dnn::ProfileResult* output_profile_result) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor x_nd{x_descriptor,
+                              static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedTensorDescriptor scale_offset_mean_variance_nd{
+      scale_offset_mean_variance_descriptor,
+      static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedActivationDescriptor activation_desc{activation_mode};
+
+  ScopedFusionPlanBatchNormActivationForward fusion_plan{
+      miopen.handle(), x_nd.handle(), scale_offset_mean_variance_nd.handle(),
+      activation_desc};
+
+  bool retval = false;
+
+  if (fusion_plan.CompilationSucceeded()) {
+    const bool is_profiling = output_profile_result != nullptr;
+
+    std::unique_ptr<GpuTimer> timer;
+    if (is_profiling) {
+      timer.reset(new GpuTimer(parent_));
+      timer->Init();
+      timer->Start(AsGpuStream(stream));
+    }
+
+    miopenStatus_t status = miopenStatusSuccess;
+
+    if (status == miopenStatusSuccess) {
+      fusion_plan.SetBatchNormForwardArgs(
+          scale_data.opaque(), offset_data.opaque(), batch_mean_data->opaque(),
+          batch_var_data->opaque(), saved_mean_data->opaque(),
+          saved_var_data->opaque(), epsilon);
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.SetActivationForwardArgs(activation_desc);
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.Execute(x_nd.handle(), x_data.opaque(),
+                                   x_nd.handle(), y_data->opaque());
+    }
+
+    if (is_profiling) {
+      timer->Stop(AsGpuStream(stream));
+      if (status == miopenStatusSuccess) {
+        output_profile_result->set_elapsed_time_in_ms(
+            timer->GetElapsedMilliseconds());
+      }
+      timer->Destroy();
+    }
+
+    if (status != miopenStatusSuccess) {
+      // Silently return when we are profiling.
+      if (!is_profiling) {
+        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
+                   << ToString(status);
+      }
+    }
+
+    retval = true;
+  }
+
+  return retval;
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationForward(
+    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+    const DeviceMemory<float>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data, double epsilon,
+    dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+    DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+    DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationForwardImpl<float, float>(
+      stream, miopenFloat, x_descriptor, x_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data, epsilon,
+      activation_mode, y_data, batch_mean_data, batch_var_data, saved_mean_data,
+      saved_var_data, output_profile_result);
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationForward(
+    Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+    const DeviceMemory<Eigen::half>& x_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data, double epsilon,
+    dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+    DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+    DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationForwardImpl<Eigen::half, float>(
+      stream, miopenHalf, x_descriptor, x_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data, epsilon,
+      activation_mode, y_data, batch_mean_data, batch_var_data, saved_mean_data,
+      saved_var_data, output_profile_result);
+}
+
+template <typename T, typename U>
+bool MIOpenSupport::DoFusedBatchNormActivationBackwardImpl(
+    Stream* stream,
+    int miopen_type,  // Actually miopenDataType_t.
+    const dnn::BatchDescriptor& y_act_backprop_descriptor,
+    const DeviceMemory<T>& y_act_backprop_data,
+    const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
+    const DeviceMemory<T>& x_bn_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+    const DeviceMemory<U>& saved_mean_data,
+    const DeviceMemory<U>& saved_var_data, DeviceMemory<T>* x_bn_backprop_data,
+    DeviceMemory<U>* scale_backprop_data, DeviceMemory<U>* offset_backprop_data,
+    dnn::ProfileResult* output_profile_result) {
+  auto miopen = miopen_->GetHandle(parent_, stream);
+
+  ScopedTensorDescriptor y_act_backprop_nd{
+      y_act_backprop_descriptor, static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedTensorDescriptor scale_offset_mean_variance_nd{
+      scale_offset_mean_variance_descriptor,
+      static_cast<miopenDataType_t>(miopen_type)};
+
+  ScopedActivationDescriptor activation_desc{activation_mode};
+
+  ScopedFusionPlanBatchNormActivationBackward fusion_plan{
+      miopen.handle(), y_act_backprop_nd.handle(),
+      scale_offset_mean_variance_nd.handle(), activation_desc};
+
+  bool retval = false;
+
+  if (fusion_plan.CompilationSucceeded()) {
+    const bool is_profiling = output_profile_result != nullptr;
+
+    std::unique_ptr<GpuTimer> timer;
+    if (is_profiling) {
+      timer.reset(new GpuTimer(parent_));
+      timer->Init();
+      timer->Start(AsGpuStream(stream));
+    }
+
+    miopenStatus_t status = miopenStatusSuccess;
+
+    if (status == miopenStatusSuccess) {
+      fusion_plan.SetBatchNormBackwardArgs(
+          x_bn_data.opaque(), scale_data.opaque(), offset_data.opaque(),
+          saved_mean_data.opaque(), saved_var_data.opaque(),
+          scale_backprop_data->opaque(), offset_backprop_data->opaque());
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.SetActivationBackwardArgs(activation_desc,
+                                                     y_act_data.opaque());
+    }
+
+    if (status == miopenStatusSuccess) {
+      status = fusion_plan.Execute(
+          y_act_backprop_nd.handle(), y_act_backprop_data.opaque(),
+          y_act_backprop_nd.handle(), x_bn_backprop_data->opaque());
+    }
+
+    if (is_profiling) {
+      timer->Stop(AsGpuStream(stream));
+      if (status == miopenStatusSuccess) {
+        output_profile_result->set_elapsed_time_in_ms(
+            timer->GetElapsedMilliseconds());
+      }
+      timer->Destroy();
+    }
+
+    if (status != miopenStatusSuccess) {
+      // Silently return when we are profiling.
+      if (!is_profiling) {
+        LOG(FATAL) << "failed to enqueue fused-convolution on stream: "
+                   << ToString(status);
+      }
+    }
+
+    retval = true;
+  }
+
+  return retval;
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationBackward(
+    Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+    const DeviceMemory<float>& y_act_backprop_data,
+    const DeviceMemory<float>& y_act_data, dnn::ActivationMode activation_mode,
+    const DeviceMemory<float>& x_bn_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data,
+    const DeviceMemory<float>& saved_mean_data,
+    const DeviceMemory<float>& saved_var_data,
+    DeviceMemory<float>* x_bn_backprop_data,
+    DeviceMemory<float>* scale_backprop_data,
+    DeviceMemory<float>* offset_backprop_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationBackwardImpl<float, float>(
+      stream, miopenFloat, y_act_backprop_descriptor, y_act_backprop_data,
+      y_act_data, activation_mode, x_bn_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data,
+      saved_mean_data, saved_var_data, x_bn_backprop_data, scale_backprop_data,
+      offset_backprop_data, output_profile_result);
+}
+
+bool MIOpenSupport::DoFusedBatchNormActivationBackward(
+    Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+    const DeviceMemory<Eigen::half>& y_act_backprop_data,
+    const DeviceMemory<Eigen::half>& y_act_data,
+    dnn::ActivationMode activation_mode,
+    const DeviceMemory<Eigen::half>& x_bn_data,
+    const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+    const DeviceMemory<float>& scale_data,
+    const DeviceMemory<float>& offset_data,
+    const DeviceMemory<float>& saved_mean_data,
+    const DeviceMemory<float>& saved_var_data,
+    DeviceMemory<Eigen::half>* x_bn_backprop_data,
+    DeviceMemory<float>* scale_backprop_data,
+    DeviceMemory<float>* offset_backprop_data,
+    dnn::ProfileResult* output_profile_result) {
+  return DoFusedBatchNormActivationBackwardImpl<Eigen::half, float>(
+      stream, miopenHalf, y_act_backprop_descriptor, y_act_backprop_data,
+      y_act_data, activation_mode, x_bn_data,
+      scale_offset_mean_variance_descriptor, scale_data, offset_data,
+      saved_mean_data, saved_var_data, x_bn_backprop_data, scale_backprop_data,
+      offset_backprop_data, output_profile_result);
+}
+
+}  // namespace gpu
+
+void initialize_miopen() {
+  auto miopenAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kDnn, gpu::kMIOpenPlugin);
+
+  if (!miopenAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
+            rocm::kROCmPlatformId, gpu::kMIOpenPlugin, "MIOpen",
+            [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
+              gpu::GpuExecutor* rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor*>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the MIOpen "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              gpu::MIOpenSupport* dnn = new gpu::MIOpenSupport(rocm_executor);
+              if (!dnn->Init().ok()) {
+                // Note: Init() will log a more specific error.
+                delete dnn;
+                return nullptr;
+              }
+              return dnn;
+            });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register MIOpen factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kDnn, gpu::kMIOpenPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_miopen,
+                            { stream_executor::initialize_miopen(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_dnn.h b/tensorflow/stream_executor/rocm/rocm_dnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..643f9b3a73c7683922fb2d2467cf7fc95d1b471c
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_dnn.h
@@ -0,0 +1,776 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The ROCM-specific DNN library support, implementing the general DnnSupport
+// interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
+
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/temporary_device_memory.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+class MIOpenRnnDescriptor;
+class MIOpenRnnSequenceTensorDescriptor;
+class MIOpenRnnStateTensorDescriptor;
+// Opaque and unique identifier for the MIOpen plugin.
+extern const PluginId kMIOpenPlugin;
+
+// miopen-library based DNN support. For details on overridden interface
+// functions, see dnn.h.
+class MIOpenSupport : public dnn::DnnSupport {
+ public:
+  explicit MIOpenSupport(GpuExecutor* parent);
+
+  port::Status Init() override;
+  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
+
+  port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
+      int num_layers, int hidden_size, int input_size, int batch_size,
+      dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) override;
+
+  port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+  createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+                                    int data_size,
+                                    dnn::DataType data_type) override;
+
+  port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
+  createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
+                                 dnn::DataType data_type) override;
+
+  bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                    const dnn::RnnSequenceTensorDescriptor& input_desc,
+                    const DeviceMemory<Eigen::half>& input_data,
+                    const dnn::RnnStateTensorDescriptor& input_h_desc,
+                    const DeviceMemory<Eigen::half>& input_h_data,
+                    const dnn::RnnStateTensorDescriptor& input_c_desc,
+                    const DeviceMemory<Eigen::half>& input_c_data,
+                    const DeviceMemory<Eigen::half>& params,
+                    const dnn::RnnSequenceTensorDescriptor& output_desc,
+                    DeviceMemory<Eigen::half>* output_data,
+                    const dnn::RnnStateTensorDescriptor& output_h_desc,
+                    DeviceMemory<Eigen::half>* output_h_data,
+                    const dnn::RnnStateTensorDescriptor& output_c_desc,
+                    DeviceMemory<Eigen::half>* output_c_data, bool is_training,
+                    ScratchAllocator* reserve_space_allocator,
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
+
+  bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                    const dnn::RnnSequenceTensorDescriptor& input_desc,
+                    const DeviceMemory<float>& input_data,
+                    const dnn::RnnStateTensorDescriptor& input_h_desc,
+                    const DeviceMemory<float>& input_h_data,
+                    const dnn::RnnStateTensorDescriptor& input_c_desc,
+                    const DeviceMemory<float>& input_c_data,
+                    const DeviceMemory<float>& params,
+                    const dnn::RnnSequenceTensorDescriptor& output_desc,
+                    DeviceMemory<float>* output_data,
+                    const dnn::RnnStateTensorDescriptor& output_h_desc,
+                    DeviceMemory<float>* output_h_data,
+                    const dnn::RnnStateTensorDescriptor& output_c_desc,
+                    DeviceMemory<float>* output_c_data, bool is_training,
+                    ScratchAllocator* reserve_space_allocator,
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
+
+  bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                    const dnn::RnnSequenceTensorDescriptor& input_desc,
+                    const DeviceMemory<double>& input_data,
+                    const dnn::RnnStateTensorDescriptor& input_h_desc,
+                    const DeviceMemory<double>& input_h_data,
+                    const dnn::RnnStateTensorDescriptor& input_c_desc,
+                    const DeviceMemory<double>& input_c_data,
+                    const DeviceMemory<double>& params,
+                    const dnn::RnnSequenceTensorDescriptor& output_desc,
+                    DeviceMemory<double>* output_data,
+                    const dnn::RnnStateTensorDescriptor& output_h_desc,
+                    DeviceMemory<double>* output_h_data,
+                    const dnn::RnnStateTensorDescriptor& output_c_desc,
+                    DeviceMemory<double>* output_c_data, bool is_training,
+                    ScratchAllocator* reserve_space_allocator,
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
+
+  bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                     const dnn::RnnSequenceTensorDescriptor& input_desc,
+                     const DeviceMemory<Eigen::half>& input_data,
+                     const dnn::RnnStateTensorDescriptor& input_h_desc,
+                     const DeviceMemory<Eigen::half>& input_h_data,
+                     const dnn::RnnStateTensorDescriptor& input_c_desc,
+                     const DeviceMemory<Eigen::half>& input_c_data,
+                     const DeviceMemory<Eigen::half>& params,
+                     const dnn::RnnSequenceTensorDescriptor& output_desc,
+                     const DeviceMemory<Eigen::half>& output_data,
+                     const dnn::RnnStateTensorDescriptor& output_h_desc,
+                     const DeviceMemory<Eigen::half>& output_h_data,
+                     const dnn::RnnStateTensorDescriptor& output_c_desc,
+                     const DeviceMemory<Eigen::half>& output_c_data,
+                     const DeviceMemory<Eigen::half>& output_backprop_data,
+                     const DeviceMemory<Eigen::half>& output_h_backprop_data,
+                     const DeviceMemory<Eigen::half>& output_c_backprop_data,
+                     DeviceMemory<Eigen::half>* input_backprop_data,
+                     DeviceMemory<Eigen::half>* input_h_backprop_data,
+                     DeviceMemory<Eigen::half>* input_c_backprop_data,
+                     DeviceMemory<Eigen::half>* params_backprop_data,
+                     DeviceMemory<uint8>* reserve_space_data,
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
+
+  bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                     const dnn::RnnSequenceTensorDescriptor& input_desc,
+                     const DeviceMemory<float>& input_data,
+                     const dnn::RnnStateTensorDescriptor& input_h_desc,
+                     const DeviceMemory<float>& input_h_data,
+                     const dnn::RnnStateTensorDescriptor& input_c_desc,
+                     const DeviceMemory<float>& input_c_data,
+                     const DeviceMemory<float>& params,
+                     const dnn::RnnSequenceTensorDescriptor& output_desc,
+                     const DeviceMemory<float>& output_data,
+                     const dnn::RnnStateTensorDescriptor& output_h_desc,
+                     const DeviceMemory<float>& output_h_data,
+                     const dnn::RnnStateTensorDescriptor& output_c_desc,
+                     const DeviceMemory<float>& output_c_data,
+                     const DeviceMemory<float>& output_backprop_data,
+                     const DeviceMemory<float>& output_h_backprop_data,
+                     const DeviceMemory<float>& output_c_backprop_data,
+                     DeviceMemory<float>* input_backprop_data,
+                     DeviceMemory<float>* input_h_backprop_data,
+                     DeviceMemory<float>* input_c_backprop_data,
+                     DeviceMemory<float>* params_backprop_data,
+                     DeviceMemory<uint8>* reserve_space_data,
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
+
+  bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                     const dnn::RnnSequenceTensorDescriptor& input_desc,
+                     const DeviceMemory<double>& input_data,
+                     const dnn::RnnStateTensorDescriptor& input_h_desc,
+                     const DeviceMemory<double>& input_h_data,
+                     const dnn::RnnStateTensorDescriptor& input_c_desc,
+                     const DeviceMemory<double>& input_c_data,
+                     const DeviceMemory<double>& params,
+                     const dnn::RnnSequenceTensorDescriptor& output_desc,
+                     const DeviceMemory<double>& output_data,
+                     const dnn::RnnStateTensorDescriptor& output_h_desc,
+                     const DeviceMemory<double>& output_h_data,
+                     const dnn::RnnStateTensorDescriptor& output_c_desc,
+                     const DeviceMemory<double>& output_c_data,
+                     const DeviceMemory<double>& output_backprop_data,
+                     const DeviceMemory<double>& output_h_backprop_data,
+                     const DeviceMemory<double>& output_c_backprop_data,
+                     DeviceMemory<double>* input_backprop_data,
+                     DeviceMemory<double>* input_h_backprop_data,
+                     DeviceMemory<double>* input_c_backprop_data,
+                     DeviceMemory<double>* params_backprop_data,
+                     DeviceMemory<uint8>* reserve_space_data,
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
+
+  bool GetConvolveAlgorithms(
+      bool with_winograd_nonfused, int cc_major, int cc_minor,
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
+  bool GetRnnAlgorithms(
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
+  bool GetConvolveBackwardDataAlgorithms(
+      bool with_winograd_nonfused, int cc_major, int cc_minor,
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
+  bool GetConvolveBackwardFilterAlgorithms(
+      bool with_winograd_nonfused, int cc_major, int cc_minor,
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
+  bool DoBatchNormalizationForward(
+      Stream* stream, const DeviceMemory<float>& x,
+      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+      const DeviceMemory<float>& estimated_mean,
+      const DeviceMemory<float>& estimated_variance,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
+      DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+      DeviceMemory<float>* saved_inv_var, bool is_training,
+      std::function<const DeviceMemory<float>&()> var_to_inv_var,
+      std::function<void()> inv_var_to_var) override;
+
+  bool DoBatchNormalizationForward(
+      Stream* stream, const DeviceMemory<Eigen::half>& x,
+      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
+      const DeviceMemory<float>& estimated_mean,
+      const DeviceMemory<float>& estimated_variance,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
+      DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
+      DeviceMemory<float>* saved_inv_var, bool is_training,
+      std::function<const DeviceMemory<float>&()> var_to_inv_var,
+      std::function<void()> inv_var_to_var) override;
+
+  bool DoBatchNormalizationBackward(
+      Stream* stream, const DeviceMemory<float>& y_backprop,
+      const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
+      DeviceMemory<float>* offset_backprop) override;
+
+  bool DoBatchNormalizationBackward(
+      Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
+      const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<Eigen::half>* x_backprop,
+      DeviceMemory<float>* scale_backprop,
+      DeviceMemory<float>* offset_backprop) override;
+
+  port::Status DoConvolve(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedConvolve(
+      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<double>& conv_input_data, double conv_input_scale,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<double>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const DeviceMemory<double>& side_input_data, double side_input_scale,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedConvolve(
+      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<float>& conv_input_data, float conv_input_scale,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const DeviceMemory<float>& side_input_data, float side_input_scale,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedConvolve(Stream* stream,
+                       const dnn::BatchDescriptor& conv_input_descriptor,
+                       const DeviceMemory<Eigen::half>& conv_input_data,
+                       float conv_input_scale,
+                       const dnn::FilterDescriptor& filter_descriptor,
+                       const DeviceMemory<Eigen::half>& filter_data,
+                       const dnn::ConvolutionDescriptor& convolution_descriptor,
+                       const DeviceMemory<Eigen::half>& side_input_data,
+                       float side_input_scale,
+                       const dnn::BatchDescriptor& bias_descriptor,
+                       const DeviceMemory<Eigen::half>& biases,
+                       dnn::ActivationMode activation_mode,
+                       const dnn::BatchDescriptor& output_descriptor,
+                       DeviceMemory<Eigen::half>* output_data,
+                       ScratchAllocator* scratch_allocator,
+                       const dnn::AlgorithmConfig& algorithm_config,
+                       dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedConvolve(
+      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<int8>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const DeviceMemory<int8>& side_input_data, float side_input_scale,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoConvolveQuantized(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<int8>& filter_coefficients,
+      const DeviceMemory<float>& coefficient_scales,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen";
+    return false;
+  }
+
+  bool DoConvolveQuantized(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<int16>& filter_coefficients,
+      const DeviceMemory<float>& coefficient_scales,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen";
+    return false;
+  }
+
+  bool DoSeparableConvolve(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<float>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier,
+      const DeviceMemory<float>& first_weights,
+      const DeviceMemory<float>& second_weights,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "separable convolution not supported by MIOpen";
+    return false;
+  }
+
+  bool DoConvolveBackwardBias(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<double>* backward_bias_data) override;
+
+  bool DoConvolveBackwardBias(Stream* stream,
+                              const dnn::BatchDescriptor& input_descriptor,
+                              const DeviceMemory<float>& input_data,
+                              const dnn::BatchDescriptor& bias_descriptor,
+                              DeviceMemory<float>* backward_bias_data) override;
+
+  bool DoConvolveBackwardBias(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<Eigen::half>* backward_bias_data) override;
+
+  bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
+                const DeviceMemory<float>& weights,
+                const dnn::BatchDescriptor& input_dimensions,
+                const dnn::BatchDescriptor& output_dimensions,
+                DeviceMemory<float>* output_data) override;
+
+  bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data,
+                         const DeviceMemory<int8>& quantized_weights,
+                         const DeviceMemory<float>& weight_scales,
+                         const dnn::BatchDescriptor& input_dimensions,
+                         const dnn::BatchDescriptor& output_dimensions,
+                         DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen";
+    return false;
+  }
+
+  bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data,
+                         const DeviceMemory<int16>& quantized_weights,
+                         const DeviceMemory<float>& weight_scales,
+                         const dnn::BatchDescriptor& input_dimensions,
+                         const dnn::BatchDescriptor& output_dimensions,
+                         DeviceMemory<float>* output_data) override {
+    LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen";
+    return false;
+  }
+
+  bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
+                 const DeviceMemory<float>& biases,
+                 const dnn::BatchDescriptor& dimensions,
+                 DeviceMemory<float>* output_data) override;
+
+  bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode,
+                  const dnn::BatchDescriptor& dimensions,
+                  const DeviceMemory<float>& input_data,
+                  DeviceMemory<float>* output_data, uint64 options) override;
+
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<double>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<double>* output_data,
+                     ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<float>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<float>* output_data,
+                     ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<Eigen::half>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<Eigen::half>* output_data,
+                     ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoPoolBackward(Stream* stream,
+                      const dnn::PoolingDescriptor& pooling_dimensions,
+                      const dnn::BatchDescriptor& input_dimensions,
+                      const DeviceMemory<double>& input_data,
+                      const dnn::BatchDescriptor& output_dimensions,
+                      const DeviceMemory<double>& output_data,
+                      const DeviceMemory<double>& input_diff_data,
+                      DeviceMemory<double>* output_diff_data,
+                      ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoPoolBackward(Stream* stream,
+                      const dnn::PoolingDescriptor& pooling_dimensions,
+                      const dnn::BatchDescriptor& input_dimensions,
+                      const DeviceMemory<float>& input_data,
+                      const dnn::BatchDescriptor& output_dimensions,
+                      const DeviceMemory<float>& output_data,
+                      const DeviceMemory<float>& input_diff_data,
+                      DeviceMemory<float>* output_diff_data,
+                      ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoPoolBackward(Stream* stream,
+                      const dnn::PoolingDescriptor& pooling_dimensions,
+                      const dnn::BatchDescriptor& input_dimensions,
+                      const DeviceMemory<Eigen::half>& input_data,
+                      const dnn::BatchDescriptor& output_dimensions,
+                      const DeviceMemory<Eigen::half>& output_data,
+                      const DeviceMemory<Eigen::half>& input_diff_data,
+                      DeviceMemory<Eigen::half>* output_diff_data,
+                      ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoNormalizeWithDimensions(
+      Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
+      const dnn::BatchDescriptor& dimensions,
+      const DeviceMemory<float>& input_data,
+      DeviceMemory<float>* output_data) override;
+
+  bool DoNormalizeBackwardWithDimensions(
+      Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
+      const dnn::BatchDescriptor& dimensions,
+      const DeviceMemory<float>& raw_data,
+      const DeviceMemory<float>& normalized_data,
+      const DeviceMemory<float>& normalized_variable_gradient,
+      DeviceMemory<float>* raw_variable_gradient,
+      ScratchAllocator* workspace_allocator = nullptr) override;
+
+  bool DoDepthConcatenate(
+      Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float>*> input_data,
+      DeviceMemory<float>* output_data) override;
+
+  bool DoElementwiseOperate(
+      Stream* stream, dnn::ElementwiseOperation operation,
+      port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
+      port::ArraySlice<const DeviceMemory<float>*> input_data,
+      const dnn::BatchDescriptor& output_dimensions,
+      DeviceMemory<float>* output_data) override;
+
+  bool DoXYPad(Stream* stream, const dnn::BatchDescriptor& dimensions,
+               const DeviceMemory<float>& input_data, int64 left_pad,
+               int64 right_pad, int64 top_pad, int64 bottom_pad,
+               DeviceMemory<float>* output_data) override;
+
+  bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor& dimensions,
+                 const DeviceMemory<float>& input_data, int64 left_trim,
+                 int64 right_trim, int64 top_trim, int64 bottom_trim,
+                 DeviceMemory<float>* output_data) override;
+
+  bool DoMemcpyD2HQuantized(Stream* stream,
+                            const DeviceMemory<float>& device_unquantized_src,
+                            dnn::QuantizedActivationMode mode, void* host_dst,
+                            int64 size) override;
+
+  bool DoMemcpyH2DQuantized(
+      Stream* stream, const void* host_src, int64 size,
+      dnn::QuantizedActivationMode mode,
+      DeviceMemory<float>* device_unquantized_dst) override;
+
+  // Derives an output batch descriptor from an input batch and convolution
+  // descriptors.
+  bool DeriveOutputBatchDescriptor(
+      const dnn::BatchDescriptor& batch_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      dnn::BatchDescriptor* output_batch_descriptor);
+
+  bool DoTransformTensor(Stream* stream, const dnn::BatchDescriptor& input_desc,
+                         dnn::DataType input_type,
+                         const DeviceMemoryBase& input_data,
+                         const dnn::BatchDescriptor& output_desc,
+                         dnn::DataType output_type, float scale,
+                         DeviceMemoryBase* output_data) override;
+
+  bool DoFusedConvolutionBiasActivation(
+      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<float>& conv_input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<float>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<float>* output_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationInference(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<float>& x_data,
+      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& mean_data,
+      const DeviceMemory<float>& variance_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationInference(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<Eigen::half>& x_data,
+      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& mean_data,
+      const DeviceMemory<float>& variance_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationForward(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<float>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
+      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationForward(
+      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
+      const DeviceMemory<Eigen::half>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data, double epsilon,
+      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
+      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
+      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationBackward(
+      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+      const DeviceMemory<float>& y_act_backprop_data,
+      const DeviceMemory<float>& y_act_data,
+      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& saved_mean_data,
+      const DeviceMemory<float>& saved_var_data,
+      DeviceMemory<float>* x_bn_backprop_data,
+      DeviceMemory<float>* scale_backprop_data,
+      DeviceMemory<float>* offset_backprop_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  bool DoFusedBatchNormActivationBackward(
+      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
+      const DeviceMemory<Eigen::half>& y_act_backprop_data,
+      const DeviceMemory<Eigen::half>& y_act_data,
+      dnn::ActivationMode activation_mode,
+      const DeviceMemory<Eigen::half>& x_bn_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<float>& scale_data,
+      const DeviceMemory<float>& offset_data,
+      const DeviceMemory<float>& saved_mean_data,
+      const DeviceMemory<float>& saved_var_data,
+      DeviceMemory<Eigen::half>* x_bn_backprop_data,
+      DeviceMemory<float>* scale_backprop_data,
+      DeviceMemory<float>* offset_backprop_data,
+      dnn::ProfileResult* output_profile_result) override;
+
+  GpuExecutor* GetParentExecutor() { return parent_; }
+
+ private:
+  GpuExecutor* parent_;  // Parent executor object. Not owned.
+
+  // Provide access to the MIOpen handle.
+  std::unique_ptr<class MIOpenAccess> miopen_;
+
+  template <class T, class U>
+  bool DoBatchNormalizationForwardImpl(
+      Stream* stream, dnn::DataType input_data_type,
+      dnn::DataType scale_data_type, const DeviceMemory<T>& x,
+      const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
+      const DeviceMemory<U>& estimated_mean,
+      const DeviceMemory<U>& estimated_variance,
+      const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<T>* y, DeviceMemory<U>* batch_mean,
+      DeviceMemory<U>* batch_var, DeviceMemory<U>* saved_mean,
+      DeviceMemory<U>* saved_inv_var, bool is_training,
+      std::function<const DeviceMemory<U>&()> var_to_inv_var,
+      std::function<void()> inv_var_to_var);
+
+  template <class T, class U>
+  bool DoBatchNormalizationBackwardImpl(
+      Stream* stream, int miopen_input_type, int miopen_scale_type,
+      const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
+      const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
+      const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
+      DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
+      DeviceMemory<U>* offset_backprop);
+
+  template <class T>
+  bool DoConvolveBackwardBiasImpl(
+      Stream* stream,
+      int miopen_type,  // Actually miopenDataType_t.
+      const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<T>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<T>* backward_bias_data);
+
+  template <class T>
+  bool DoRnnForwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
+                        const MIOpenRnnSequenceTensorDescriptor& input_desc,
+                        const DeviceMemory<T>& input_data,
+                        const MIOpenRnnStateTensorDescriptor& input_h_desc,
+                        const DeviceMemory<T>& input_h_data,
+                        const MIOpenRnnStateTensorDescriptor& input_c_desc,
+                        const DeviceMemory<T>& input_c_data,
+                        const DeviceMemory<T>& params,
+                        const MIOpenRnnSequenceTensorDescriptor& output_desc,
+                        DeviceMemory<T>* output_data,
+                        const MIOpenRnnStateTensorDescriptor& output_h_desc,
+                        DeviceMemory<T>* output_h_data,
+                        const MIOpenRnnStateTensorDescriptor& output_c_desc,
+                        DeviceMemory<T>* output_c_data, bool is_training,
+                        ScratchAllocator* reserve_space_allocator,
+                        ScratchAllocator* workspace_allocator);
+  template <class T>
+  bool DoRnnBackwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
+                         const MIOpenRnnSequenceTensorDescriptor& input_desc,
+                         const DeviceMemory<T>& input_data,
+                         const MIOpenRnnStateTensorDescriptor& input_h_desc,
+                         const DeviceMemory<T>& input_h_data,
+                         const MIOpenRnnStateTensorDescriptor& input_c_desc,
+                         const DeviceMemory<T>& input_c_data,
+                         const DeviceMemory<T>& params,
+                         const MIOpenRnnSequenceTensorDescriptor& output_desc,
+                         const DeviceMemory<T>& output_data,
+                         const MIOpenRnnStateTensorDescriptor& output_h_desc,
+                         const DeviceMemory<T>& output_h_data,
+                         const MIOpenRnnStateTensorDescriptor& output_c_desc,
+                         const DeviceMemory<T>& output_c_data,
+                         const DeviceMemory<T>& output_backprop_data,
+                         const DeviceMemory<T>& output_h_backprop_data,
+                         const DeviceMemory<T>& output_c_backprop_data,
+                         DeviceMemory<T>* input_backprop_data,
+                         DeviceMemory<T>* input_h_backprop_data,
+                         DeviceMemory<T>* input_c_backprop_data,
+                         DeviceMemory<T>* params_backprop_data,
+                         DeviceMemory<uint8>* reserve_space_data,
+                         ScratchAllocator* workspace_allocator);
+
+  template <typename T>
+  bool DoFusedConvolutionBiasActivationImpl(
+      Stream* stream,
+      int miopen_type,  // Actually miopenDataType_t.
+      const dnn::BatchDescriptor& conv_input_descriptor,
+      const DeviceMemory<T>& conv_input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<T>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& bias_descriptor,
+      const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<T>* output_data, dnn::ProfileResult* output_profile_result);
+
+  template <typename T, typename U>
+  bool DoFusedBatchNormActivationInferenceImpl(
+      Stream* stream,
+      int miopen_type,  // Actually miopenDataType_t.
+      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+      const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
+      double epsilon, dnn::ActivationMode activation_mode,
+      DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result);
+
+  template <typename T, typename U>
+  bool DoFusedBatchNormActivationForwardImpl(
+      Stream* stream,
+      int miopen_type,  // Actually miopenDataType_t.
+      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+      double epsilon, dnn::ActivationMode activation_mode,
+      DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
+      DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
+      DeviceMemory<U>* saved_var_data,
+      dnn::ProfileResult* output_profile_result);
+
+  template <typename T, typename U>
+  bool DoFusedBatchNormActivationBackwardImpl(
+      Stream* stream,
+      int miopen_type,  // Actually miopenDataType_t.
+      const dnn::BatchDescriptor& y_act_backprop_descriptor,
+      const DeviceMemory<T>& y_act_backprop_data,
+      const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
+      const DeviceMemory<T>& x_bn_data,
+      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
+      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
+      const DeviceMemory<U>& saved_mean_data,
+      const DeviceMemory<U>& saved_var_data,
+      DeviceMemory<T>* x_bn_backprop_data, DeviceMemory<U>* scale_backprop_data,
+      DeviceMemory<U>* offset_backprop_data,
+      dnn::ProfileResult* output_profile_result);
+
+  port::Status DoPrepareForConvolution(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenSupport);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c44e9f814f8470e8c5aefc987c4b1b275197c77
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -0,0 +1,1374 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <map>
+#include <set>
+#include <utility>
+
+#include "absl/base/casts.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/static_threadlocal.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
+
+bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
+bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
+bool FLAGS_gpuexec_rocm_device_0_only = false;
+
+// Debugging: on each push and pop of a rocm context, verify the current device
+// matches the expected one.
+constexpr bool kVerifyGpuContext = false;
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuContext wraps the device_ordinal.
+// Only reason we need this wrapper class is to make the GpuDriver* API
+class GpuContext {
+ public:
+  GpuContext(const int v) : device_ordinal_(v) {}
+
+  int device_ordinal() const { return device_ordinal_; }
+
+  // Disallow copying and moving.
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
+
+ private:
+  const int device_ordinal_;
+};
+
+namespace {
+
+// Formats hipError_t to output prettified values into a log stream.
+// Error summaries taken from:
+//
+// TODO(leary) switch to cuGetErrorName when updated rocm.h is available.
+string ToString(hipError_t result) {
+#define OSTREAM_ROCM_ERROR(__name) \
+  case hipError##__name:           \
+    return "HIP_ERROR_" #__name;
+
+  switch (result) {
+    OSTREAM_ROCM_ERROR(InvalidValue)
+    OSTREAM_ROCM_ERROR(OutOfMemory)
+    OSTREAM_ROCM_ERROR(NotInitialized)
+    OSTREAM_ROCM_ERROR(Deinitialized)
+    OSTREAM_ROCM_ERROR(NoDevice)
+    OSTREAM_ROCM_ERROR(InvalidDevice)
+    OSTREAM_ROCM_ERROR(InvalidImage)
+    OSTREAM_ROCM_ERROR(InvalidContext)
+    OSTREAM_ROCM_ERROR(InvalidHandle)
+    OSTREAM_ROCM_ERROR(NotFound)
+    OSTREAM_ROCM_ERROR(NotReady)
+    OSTREAM_ROCM_ERROR(NoBinaryForGpu)
+
+    // Encountered an uncorrectable ECC error during execution.
+    OSTREAM_ROCM_ERROR(ECCNotCorrectable)
+
+    // Load/store on an invalid address. Must reboot all context.
+    case 700:
+      return "ROCM_ERROR_ILLEGAL_ADDRESS";
+    // Passed too many / wrong arguments, too many threads for register count.
+    case 701:
+      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+      OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
+      OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
+      OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
+    default:
+      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
+  }
+}
+
+// ROCM driver routines may require a large amount of stack (particularly
+// hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
+// stack-limited threads (such as those spawned by a default-argument
+// thread::ThreadPool on some platforms), we run certain routines in this pool
+// and wait for completion.
+static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
+static port::ThreadPool* InitializeDriverExecutor() {
+  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
+                              "rocm_driver", 1);
+}
+
+port::ThreadPool* GetDriverExecutor() {
+  mutex_lock lock(driver_executor_threadpool_mu);
+  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
+  return thread_pool;
+}
+
+}  // namespace
+
+string MemorySpaceString(MemorySpace memory_space) {
+  switch (memory_space) {
+    case MemorySpace::kHost:
+      return "host";
+    case MemorySpace::kDevice:
+      return "device";
+    default:
+      LOG(FATAL) << "impossible memory space";
+  }
+}
+
+// Returns the current device set in HIP. This is done by calling the
+// HIP driver (e.g., this value is not our cached view of the current device).
+static int CurrentDeviceOrDie() {
+  int current = -1;
+  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
+  if (result != hipSuccess) {
+    LOG(FATAL) << "failed to query current device: " << ToString(result);
+  }
+  return current;
+}
+
+namespace {
+
+// Call hipDeviceSynchronize and crash if it doesn't succeed.
+void SynchronizeOrDie() {
+  auto res = tensorflow::wrap::hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(FATAL) << "Synchronize found " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+  }
+}
+
+struct ThreadLocalData {
+  int current_device_ordinal;
+  int depth;
+};
+
+SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
+
+}  // namespace
+
+ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+  if (tls->depth == 0) {
+    tls->current_device_ordinal = CurrentDeviceOrDie();
+  }
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth++;
+
+  to_restore_ = context;
+
+  if (context->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to " << context->device_ordinal();
+
+  // Set the device and update thread local.
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
+  tls->current_device_ordinal = context->device_ordinal();
+}
+
+ScopedActivateContext::~ScopedActivateContext() {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth--;
+  DCHECK_GE(tls->depth, 0);
+
+  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to "
+          << to_restore_->device_ordinal();
+
+  // Set context and update thread local.
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
+  tls->current_device_ordinal = to_restore_->device_ordinal();
+}
+
+namespace {
+
+// Returns a stringified device number associated with pointer, primarily for
+// logging purposes. Returns "?" if the device could not be successfully
+// queried.
+string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerDevice(pointer);
+  if (value.ok()) {
+    return absl::StrCat(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified memory space associated with pointer, primarily for
+// logging purposes. Returns "?" if the memory space could not be successfully
+// queried.
+string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
+  if (value.ok()) {
+    return MemorySpaceString(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified representation of whether or not peer access is
+// permitted between the "from" and "to" pointers' associated contexts,
+// primarily for logging purposes. Returns "error" if an error is encountered
+// in the process of querying.
+string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
+  hipPointerAttribute_t from_pointerAttributes;
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve source pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  hipPointerAttribute_t to_pointerAttributes;
+  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve destination pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  GpuContext fromCtx(from_pointerAttributes.device);
+  GpuContext toCtx(to_pointerAttributes.device);
+
+  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
+}
+
+// Actually performs the work of ROCM initialization. Wrapped up in one-time
+// execution guard.
+static port::Status InternalInit() {
+  hipError_t res = hipErrorNoDevice;
+  if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
+    LOG(ERROR) << "injecting ROCM init error; initialization will fail";
+  } else {
+    res = tensorflow::wrap::hipInit(0 /* = flags */);
+  }
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  LOG(ERROR) << "failed call to hipInit: " << ToString(res);
+  Diagnostician::LogDiagnosticInformation();
+  return port::Status{port::error::ABORTED,
+                      absl::StrCat("failed call to hipInit: ", ToString(res))};
+}
+
+}  // namespace
+
+/* static */ port::Status GpuDriver::Init() {
+  // Cached return value from calling InternalInit(), as hipInit need only be
+  // called once, but GpuDriver::Init may be called many times.
+  static port::Status init_retval;
+  static bool set = false;
+  static mutex* init_mu = new mutex;
+
+  mutex_lock lock(*init_mu);
+  if (!set) {
+    init_retval = InternalInit();
+    set = true;
+  }
+
+  return init_retval;
+}
+
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               hipDevice_t* device) {
+  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
+}
+
+/* static */ bool GpuDriver::GetDeviceName(hipDevice_t device,
+                                           string* device_name) {
+  static const size_t kCharLimit = 64;
+  absl::InlinedVector<char, 4> chars(kCharLimit);
+  hipError_t res =
+      tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get device name for " << device << ": "
+               << ToString(res);
+    return false;
+  }
+  chars[kCharLimit - 1] = '\0';
+  *device_name = chars.begin();
+  return true;
+}
+
+bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
+                                 int* flags) {
+  static_assert(DeviceOptions::kMask == 0xf,
+                "needs update for new device options");
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
+    GpuContext** context) {
+  *context = new GpuContext(device_ordinal);
+  return port::Status::OK();
+}
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
+  if (context == nullptr) {
+    return;
+  }
+  delete context;
+}
+
+/* static */ bool GpuDriver::FuncGetAttribute(hipDeviceAttribute_t attribute,
+                                              hipFunction_t func,
+                                              int* attribute_value) {
+  // TODO(ROCm) properly implement this feature in HIP
+  hipError_t res = hipSuccess;
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
+               << ", attribute: " << attribute;
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
+                                                hipFuncCache_t cache_config) {
+  hipError_t res =
+      tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
+               << ", config: " << cache_config << ", result: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<hipSharedMemConfig>
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
+  hipSharedMemConfig shared_mem_config;
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to get ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get shared memory config: ", ToString(result))};
+  }
+  return shared_mem_config;
+}
+
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, hipSharedMemConfig shared_mem_config) {
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", config: " << shared_mem_config
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to set shared memory config: ", ToString(result))};
+  }
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
+    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
+    unsigned int block_dim_y, unsigned int block_dim_z,
+    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
+    void** extra) {
+  ScopedActivateContext activation{context};
+  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
+          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
+          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
+          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
+  hipError_t res = tensorflow::wrap::hipModuleLaunchKernel(
+      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
+      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to launch ROCM kernel: " << function
+               << "; result: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully launched kernel";
+  return true;
+}
+
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     hipModule_t* module) {
+  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
+  return false;
+}
+
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               hipModule_t* module) {
+  return port::Status{port::error::INTERNAL,
+                      "Feature not supported on ROCm platform (LoadCubin)"};
+}
+
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       hipModule_t* module) {
+  port::Notification notification;
+  bool ret = true;
+  GetDriverExecutor()->Schedule(
+      [context, hsaco_contents, module, &ret, &notification]() {
+        ScopedActivateContext activation{context};
+        void* hsaco_data = const_cast<char*>(hsaco_contents);
+
+        hipError_t res =
+            tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
+
+        if (res != hipSuccess) {
+          LOG(ERROR) << "failed to load HSACO: " << ToString(res);
+          ret = false;
+          notification.Notify();
+        }
+
+        CHECK(module != nullptr);
+        notification.Notify();
+      });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    hipDeviceptr_t location,
+                                                    uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemsetD8(location, value, size);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+  hipError_t res = tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      hipDeviceptr_t location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+  hipError_t res =
+      tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               StreamCallback callback,
+                                               void* data) {
+  hipError_t res = tensorflow::wrap::hipStreamAddCallback(
+      stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "unable to add host callback: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               hipModule_t module,
+                                               const char* kernel_name,
+                                               hipFunction_t* function) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && kernel_name != nullptr);
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get kernel \"" << kernel_name
+               << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             hipModule_t module,
+                                             const char* symbol_name,
+                                             hipDeviceptr_t* dptr,
+                                             size_t* bytes) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && symbol_name != nullptr &&
+        (dptr != nullptr || bytes != nullptr));
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  if (res != hipSuccess) {
+    // symbol may not be found in the current module, but it may reside in
+    // another module.
+    VLOG(2) << "failed to get symbol \"" << symbol_name
+            << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          hipModule_t module) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to unload module " << module
+               << "; leaking: " << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          GpuStreamHandle* stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags(
+      stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not allocate ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+    return false;
+  }
+
+  VLOG(2) << "successfully created stream " << *stream << " for device "
+          << context->device_ordinal() << " on thread";
+  return true;
+}
+
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           GpuStreamHandle* stream) {
+  if (*stream == nullptr) {
+    return;
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to destroy ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+  } else {
+    VLOG(2) << "successfully destroyed stream " << *stream << " for device "
+            << context->device_ordinal();
+    *stream = nullptr;
+  }
+}
+
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
+  ScopedActivateContext activated{context};
+  hipDeviceptr_t result = 0;
+  hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to allocate "
+               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
+               << " bytes) from device: " << ToString(res);
+    return nullptr;
+  }
+  void* ptr = reinterpret_cast<void*>(result);
+  VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
+          << " of " << bytes << " bytes";
+  return ptr;
+}
+
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
+  ScopedActivateContext activation{context};
+  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
+  hipError_t res = tensorflow::wrap::hipFree(pointer);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to free device memory at " << location
+               << "; result: " << ToString(res);
+  } else {
+    VLOG(2) << "deallocated " << location << " for device "
+            << context->device_ordinal();
+  }
+}
+
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
+  ScopedActivateContext activated{context};
+
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
+  return nullptr;
+}
+
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
+}
+
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation{context};
+  void* host_mem = nullptr;
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res =
+      tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes on host: " << ToString(res);
+  }
+  return host_mem;
+}
+
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipHostFree(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
+  ScopedActivateContext activation{context};
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
+                                                     hipHostRegisterPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error registering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error unregistering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  GpuEventHandle* event) {
+  if (*event == nullptr) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        "input event cannot be null"};
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
+  *event = nullptr;
+
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INTERNAL,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+  }
+}
+
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 GpuEventHandle event,
+                                                 GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INVALID_ARGUMENT,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+  }
+}
+
+/* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
+    GpuContext* context, GpuEventHandle event) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventQuery(event);
+  if (res != hipSuccess && res != hipErrorNotReady) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
+  }
+
+  return res;
+}
+
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 GpuEventHandle start,
+                                                 GpuEventHandle stop) {
+  ScopedActivateContext activated{context};
+  // The stop event must have completed in order for hipEventElapsedTime to
+  // work.
+  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
+    return false;
+  }
+  res =
+      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get elapsed time between events: "
+               << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               GpuEventHandle event) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream);
+  if (res != hipSuccess) {
+    port::Status status = port::InternalError(
+        absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on device "
+          << context->device_ordinal();
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
+  if (res == hipSuccess) {
+    return true;
+  }
+
+  if (res != hipErrorNotReady) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
+    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(
+        absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
+                        "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
+                        ToString(res).c_str(), host_dst,
+                        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoD(
+      gpu_dst, const_cast<void*>(host_src), size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size));
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
+      gpu_dst, const_cast<void*>(host_src), size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
+        "host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  if (result != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to device: %s"
+        "; Gpu dst: %p on %s %s"
+        "; Gpu src: %p on %s %s"
+        "; can access? %s; size: %llu=0x%llx",
+        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
+        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
+        ROCMPointerToDeviceString(gpu_dst).c_str(),
+        absl::bit_cast<void*>(gpu_src),
+        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
+        ROCMPointerToDeviceString(gpu_src).c_str(),
+        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
+
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 GpuEventHandle* event,
+                                                 EventFlags flags) {
+  int hipflags;
+  switch (flags) {
+    case EventFlags::kDefault:
+      hipflags = hipEventDefault;
+      break;
+    case EventFlags::kDisableTiming:
+      hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
+      break;
+    default:
+      LOG(FATAL) << "impossible event flags: " << int(hipflags);
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  } else if (res == hipErrorMemoryAllocation) {
+    return port::Status{port::error::RESOURCE_EXHAUSTED,
+                        "could not create ROCM event: out of device memory"};
+  } else {
+    return port::Status{
+        port::error::FAILED_PRECONDITION,
+        absl::StrCat("could not create ROCM event: ", ToString(res))};
+  }
+}
+
+/* static */ int GpuDriver::GetDeviceCount() {
+  int device_count = 0;
+  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
+    return 0;
+  }
+
+  if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
+    device_count = 1;
+  }
+  return device_count;
+}
+
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          hipDevice_t device) {
+  return port::Status(
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get compute capability for device: %d "
+                      "(unsupported API on AMD Gpus)",
+                      device));
+}
+
+/* static */ port::Status GpuDriver::GetPointerAddressRange(
+    hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
+  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
+  if (result == hipSuccess) {
+    return port::Status::OK();
+  } else if (result == hipErrorNotFound) {
+    // We differentiate between "this pointer is unknown" (return here) and
+    // "there was an internal error while performing this operation" (return
+    // below).
+    return port::Status{port::error::NOT_FOUND,
+                        absl::StrFormat("not a device pointer %p; %s",
+                                        reinterpret_cast<void*>(dptr),
+                                        ToString(result).c_str())};
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get pointer into for device pointer %p; %s",
+                      reinterpret_cast<void*>(dptr), ToString(result).c_str())};
+}
+
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
+    hipDeviceptr_t pointer) {
+  unsigned int value;
+  hipError_t result = hipSuccess;
+  if (result == hipSuccess) {
+    switch (value) {
+      case hipMemoryTypeDevice:
+        return MemorySpace::kDevice;
+      case hipMemoryTypeHost:
+        return MemorySpace::kHost;
+      default:
+        return port::Status{
+            port::error::INTERNAL,
+            absl::StrCat("unknown memory space provided by ROCM API: ", value)};
+    }
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed to query device pointer for memory space: ",
+                   ToString(result))};
+}
+
+/* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
+    hipDeviceptr_t pointer) {
+  hipPointerAttribute_t pointerAttributes;
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  hipDevice_t device;
+  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  return device;
+}
+
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      hipDevice_t device) {
+  hipDeviceProp_t props;
+  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
+  if (result == hipSuccess) {
+    *version = props.gcnArch;
+    return port::Status::OK();
+  }
+  *version = 0;
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
+                      device)};
+}
+
+// Helper function that turns the integer output of hipDeviceGetAttribute to
+// type T and wraps it in a StatusOr.
+template <typename T>
+static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
+                                            hipDeviceAttribute_t attribute) {
+  int value = -1;
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not retrieve ROCM device attribute (", attribute,
+                     "): ", ToString(result))};
+  }
+  T converted = value;
+  return converted;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxSharedMemoryPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxThreadsPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxRegistersPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
+}
+
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           hipDevice_t device) {
+  int value;
+  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimX, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
+    return false;
+  }
+  *x = value;
+
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimY, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
+    return false;
+  }
+  *y = value;
+
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimZ, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
+    return false;
+  }
+  *z = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query driver version: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceProperties(
+    hipDeviceProp_t* device_properties, int device_ordinal) {
+  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
+                                                            device_ordinal);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device properties: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
+    hipDeviceAttribute_t attribute, hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, attribute);
+}
+
+/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
+  int value = -1;
+  hipError_t res = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
+  ScopedActivateContext activation{context};
+  size_t free = 0;
+  size_t total = 0;
+  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
+    return false;
+  }
+
+  *free_out = free;
+  *total_out = total;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
+                                                  uint64* result) {
+  size_t value = -1;
+  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
+  string pci_bus_id;
+  static const int kBufferSize = 64;
+  absl::InlinedVector<char, 4> chars(kBufferSize);
+  chars[kBufferSize - 1] = '\0';
+  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
+      chars.begin(), kBufferSize - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
+    return pci_bus_id;
+  }
+  pci_bus_id = chars.begin();
+  return pci_bus_id;
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return true;  // A device can always access its own memory.
+  }
+
+  int can_access_peer = -1;
+  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
+      &can_access_peer, from->device_ordinal(), to->device_ordinal());
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+    return false;
+  }
+
+  return can_access_peer;
+}
+
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return port::Status::OK();  // A device can always access its own memory.
+  }
+
+  ScopedActivateContext activated{from};
+  hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
+      to->device_ordinal(), 0 /* = flags */);
+  if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to enable peer access from %d to %d: %s",
+                        from->device_ordinal(), to->device_ordinal(),
+                        ToString(result).c_str())};
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, hipFunction_t kernel, int threads_per_block,
+    size_t dynamic_shared_memory_bytes) {
+  ScopedActivateContext activation{context};
+
+  int max_blocks = 0;
+  hipError_t result = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
+                        kernel, ToString(result).c_str())};
+  }
+
+  return max_blocks;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba803edaafb1809bcac54327c73345ec95238c42
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
@@ -0,0 +1,138 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#define __HIP_DISABLE_CPP_FUNCTIONS__
+
+#include "rocm/include/hip/hip_runtime.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                             \
+  template <typename... Args>                                               \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {    \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName)>::type;     \
+    static FuncPtrT loaded = []() -> FuncPtrT {                             \
+      static const char *kName = TO_STR(hipSymbolName);                     \
+      void *f;                                                              \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()     \
+              .ValueOrDie(),                                                \
+          kName, &f);                                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in HIP DSO; dlerror: " << s.error_message();       \
+      return reinterpret_cast<FuncPtrT>(f);                                 \
+    }();                                                                    \
+    return loaded(args...);                                                 \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+  __macro(hipFuncSetCacheConfig)                    \
+  __macro(hipGetDevice)                             \
+  __macro(hipGetDeviceCount)                        \
+  __macro(hipGetDeviceProperties)                   \
+  __macro(hipHostFree)                              \
+  __macro(hipHostMalloc)                            \
+  __macro(hipHostRegister)                          \
+  __macro(hipHostUnregister)                        \
+  __macro(hipInit)                                  \
+  __macro(hipMalloc)                                \
+  __macro(hipMemGetAddressRange)                    \
+  __macro(hipMemGetInfo)                            \
+  __macro(hipMemcpyDtoD)                            \
+  __macro(hipMemcpyDtoDAsync)                       \
+  __macro(hipMemcpyDtoH)                            \
+  __macro(hipMemcpyDtoHAsync)                       \
+  __macro(hipMemcpyHtoD)                            \
+  __macro(hipMemcpyHtoDAsync)                       \
+  __macro(hipMemset)                                \
+  __macro(hipMemsetD32)                             \
+  __macro(hipMemsetD8)                              \
+  __macro(hipMemsetAsync)                           \
+  __macro(hipMemsetD32Async)                        \
+  __macro(hipModuleGetFunction)                     \
+  __macro(hipModuleGetGlobal)                       \
+  __macro(hipModuleLaunchKernel)                    \
+  __macro(hipModuleLoadData)                        \
+  __macro(hipModuleUnload)                          \
+  __macro(hipPointerGetAttributes)                  \
+  __macro(hipSetDevice)                             \
+  __macro(hipStreamAddCallback)                     \
+  __macro(hipStreamCreateWithFlags)                 \
+  __macro(hipStreamDestroy)                         \
+  __macro(hipStreamQuery)                           \
+  __macro(hipStreamSynchronize)                     \
+  __macro(hipStreamWaitEvent)                       \
+// clang-format on
+
+HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP)
+#undef HIP_ROUTINE_EACH
+#undef STREAM_EXECUTOR_HIP_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_event.cc b/tensorflow/stream_executor/rocm/rocm_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0ffd74c177bf5149f98cc045a51559b9acf1d94
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_event.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+Event::Status GpuEvent::PollForStatus() {
+  port::StatusOr<hipError_t> status =
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Error polling for event status: "
+               << status.status().error_message();
+    return Event::Status::kError;
+  }
+
+  switch (status.ValueOrDie()) {
+    case hipSuccess:
+      return Event::Status::kComplete;
+    case hipErrorNotReady:
+      return Event::Status::kPending;
+    default:
+      LOG(INFO) << "Error condition returned for event status: "
+                << status.ValueOrDie();
+      return Event::Status::kError;
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.cc b/tensorflow/stream_executor/rocm/rocm_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2af973309c01ec67de0d7022e5a8cefd18c5063a
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.cc
@@ -0,0 +1,618 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_fft.h"
+
+#include <complex>
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocFftPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                      \
+  struct WrapperShim__##__name {                                 \
+    template <typename... Args>                                  \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};            \
+      return ::__name(args...);                                  \
+    }                                                            \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocfftDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocfft DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) {          \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+#define ROCFFT_ROUTINE_EACH(__macro)                                           \
+  __macro(hipfftDestroy) __macro(hipfftSetStream) __macro(hipfftPlan1d)        \
+      __macro(hipfftPlan2d) __macro(hipfftPlan3d) __macro(hipfftPlanMany)      \
+          __macro(hipfftCreate) __macro(hipfftSetAutoAllocation)               \
+              __macro(hipfftSetWorkArea) __macro(hipfftGetSize1d)              \
+                  __macro(hipfftMakePlan1d) __macro(hipfftGetSize2d)           \
+                      __macro(hipfftMakePlan2d) __macro(hipfftGetSize3d)       \
+                          __macro(hipfftMakePlan3d) __macro(hipfftGetSizeMany) \
+                              __macro(hipfftMakePlanMany)                      \
+                                  __macro(hipfftExecD2Z)                       \
+                                      __macro(hipfftExecZ2D)                   \
+                                          __macro(hipfftExecC2C)               \
+                                              __macro(hipfftExecC2R)           \
+                                                  __macro(hipfftExecZ2Z)       \
+                                                      __macro(hipfftExecR2C)
+
+ROCFFT_ROUTINE_EACH(STREAM_EXECUTOR_ROCFFT_WRAP)
+
+}  // namespace wrap
+
+namespace {
+
+// A helper function transforming gpu_fft arguments into rocFFT arguments.
+hipfftType ROCMFftType(fft::Type type) {
+  switch (type) {
+    case fft::Type::kC2CForward:
+    case fft::Type::kC2CInverse:
+      return HIPFFT_C2C;
+    case fft::Type::kC2R:
+      return HIPFFT_C2R;
+    case fft::Type::kR2C:
+      return HIPFFT_R2C;
+    case fft::Type::kZ2ZForward:
+    case fft::Type::kZ2ZInverse:
+      return HIPFFT_Z2Z;
+    case fft::Type::kZ2D:
+      return HIPFFT_Z2D;
+    case fft::Type::kD2Z:
+      return HIPFFT_D2Z;
+    default:
+      LOG(FATAL) << "Invalid value of fft::Type.";
+  }
+}
+
+// Associates the given stream with the given rocFFT plan.
+bool SetStream(GpuExecutor *parent, hipfftHandle plan, Stream *stream) {
+  auto ret = wrap::hipfftSetStream(parent, plan, AsGpuStreamValue(stream));
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine hipfftSetStream: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+port::Status ROCMFftPlan::Initialize(
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    uint64 *input_embed, uint64 input_stride, uint64 input_distance,
+    uint64 *output_embed, uint64 output_stride, uint64 output_distance,
+    fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
+  if (IsInitialized()) {
+    LOG(FATAL) << "Try to repeatedly initialize.";
+  }
+  is_initialized_ = true;
+  int elem_count_[3], input_embed_[3], output_embed_[3];
+  for (int i = 0; i < rank; ++i) {
+    elem_count_[i] = elem_count[i];
+    if (input_embed) {
+      input_embed_[i] = input_embed[i];
+    }
+    if (output_embed) {
+      output_embed_[i] = output_embed[i];
+    }
+  }
+  parent_ = parent;
+  fft_type_ = type;
+  if (batch_count == 1 && input_embed == nullptr && output_embed == nullptr) {
+    hipfftResult_t ret;
+    if (scratch_allocator == nullptr) {
+      switch (rank) {
+        case 1:
+          // hipfftPlan1d
+          ret = wrap::hipfftPlan1d(parent, &plan_, elem_count_[0],
+                                   ROCMFftType(type), 1 /* = batch */);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 1d plan."};
+          }
+          return port::Status::OK();
+        case 2:
+          // hipfftPlan2d
+          ret = wrap::hipfftPlan2d(parent, &plan_, elem_count_[0],
+                                   elem_count_[1], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 2d plan."};
+          }
+          return port::Status::OK();
+        case 3:
+          // hipfftPlan3d
+          ret =
+              wrap::hipfftPlan3d(parent, &plan_, elem_count_[0], elem_count_[1],
+                                 elem_count_[2], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 3d plan."};
+          }
+          return port::Status::OK();
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+    } else {
+      ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set auto allocation for rocFFT plan."};
+      }
+      size_t size_in_bytes;
+      switch (rank) {
+        case 1:
+          ret = wrap::hipfftMakePlan1d(parent, plan_, elem_count_[0],
+                                       ROCMFftType(type), /*batch=*/1,
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 1d plan."};
+          }
+          break;
+        case 2:
+          ret = wrap::hipfftMakePlan2d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], ROCMFftType(type),
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 2d plan."};
+          }
+          break;
+        case 3:
+          ret = wrap::hipfftMakePlan3d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], elem_count_[2],
+                                       ROCMFftType(type), &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 3d plan."};
+          }
+          break;
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+      // TODO(yangzihao): refactor this code and the one with the same function
+      // in the batch mode.
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT plan."};
+      }
+      return port::Status::OK();
+    }
+  } else {
+    // For either multiple batches or rank higher than 3, use hipfftPlanMany().
+    if (scratch_allocator == nullptr) {
+      auto ret = wrap::hipfftPlanMany(
+          parent, &plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+    } else {
+      auto ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT batched plan:"
+                   << ret;
+        return port::Status{
+            port::error::INTERNAL,
+            "Failed to set auto allocation for rocFFT bacthed plan."};
+      }
+      size_t size_in_bytes;
+      ret = wrap::hipfftMakePlanMany(
+          parent, plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count, &size_in_bytes);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to make rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to make rocFFT bacthed plan."};
+      }
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT bacthed plan."};
+      }
+    }
+  }
+  return port::Status::OK();
+}
+
+port::Status ROCMFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
+                                     int rank, uint64 *elem_count,
+                                     fft::Type type,
+                                     ScratchAllocator *scratch_allocator) {
+  return Initialize(parent_, stream, rank, elem_count,
+                    /*input_embed=*/nullptr, /*input_stride=*/0,
+                    /*input_distance=*/0,
+                    /*output_embed=*/nullptr, /*output_stride=*/0,
+                    /*output_distance=*/0, type, 1, scratch_allocator);
+}
+
+ROCMFftPlan::~ROCMFftPlan() { wrap::hipfftDestroy(parent_, plan_); }
+
+int ROCMFftPlan::GetFftDirection() const {
+  if (!IsInitialized()) {
+    LOG(FATAL) << "Try to get fft direction before initialization.";
+  } else {
+    switch (fft_type_) {
+      case fft::Type::kC2CForward:
+      case fft::Type::kZ2ZForward:
+      case fft::Type::kR2C:
+      case fft::Type::kD2Z:
+        return HIPFFT_FORWARD;
+      case fft::Type::kC2CInverse:
+      case fft::Type::kZ2ZInverse:
+      case fft::Type::kC2R:
+      case fft::Type::kZ2D:
+        return HIPFFT_BACKWARD;
+      default:
+        LOG(FATAL) << "Invalid value of fft::Type.";
+    }
+  }
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlan(Stream *stream, uint64 num_x,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  // TODO(yangzihao): In the future, send error msg back to TensorFlow
+  // so it can fail gracefully,
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 1d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, fft::Type type, bool in_place_fft,
+    ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 1, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 1d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 2d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 2, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 2d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, uint64 num_z,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 3, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 3d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 3, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 3d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlan(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan: "
+               << status.error_message();
+  }
+
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlanWithScratchAllocator(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan with customized "
+                  "allocator: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+void ROCMFft::UpdatePlanWithScratchAllocator(
+    Stream *stream, fft::Plan *plan, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "update plan with scratch allocator not implemented";
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfftExec,
+                            const DeviceMemory<InputT> &input,
+                            DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)));
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                         FuncT hipfftExec,
+                                         const DeviceMemory<InputT> &input,
+                                         DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)),
+                        rocm_fft_plan->GetFftDirection());
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+#define STREAM_EXECUTOR_ROCM_DEFINE_FFT(__type, __fft_type1, __fft_type2,    \
+                                        __fft_type3)                         \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftWithDirectionInternal(                                       \
+        stream, plan, wrap::hipfftExec##__fft_type1, input, output);         \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<__type> &input,                     \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type2, input, \
+                         output);                                            \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<__type> *output) {                        \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type3, input, \
+                         output);                                            \
+  }
+
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef STREAM_EXECUTOR_ROCM_DEFINE_FFT
+
+}  // namespace gpu
+
+void initialize_rocfft() {
+  auto rocFftAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+
+  if (!rocFftAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+            rocm::kROCmPlatformId, gpu::kRocFftPlugin, "rocFFT",
+            [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+              gpu::GpuExecutor *rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor *>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the rocFFT "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              return new gpu::ROCMFft(rocm_executor);
+            });
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocFFT factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocfft,
+                            { stream_executor::initialize_rocfft(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.h b/tensorflow/stream_executor/rocm/rocm_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..7086d8a4b129a5807fdbde6d9ace6ee437edc3ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.h
@@ -0,0 +1,132 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for FFT functionality -- this wraps the rocFFT library
+// capabilities, and is only included into ROCM implementation code -- it will
+// not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+
+#include "rocm/include/rocfft/hipfft.h"
+#include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+class GpuExecutor;
+
+// Opaque and unique indentifier for the rocFFT plugin.
+extern const PluginId kRocFftPlugin;
+
+// ROCMFftPlan uses deferred initialization. Only a single call of
+// Initialize() is allowed to properly create hipfft plan and set member
+// variable is_initialized_ to true. Newly added interface that uses member
+// variables should first check is_initialized_ to make sure that the values of
+// member variables are valid.
+class ROCMFftPlan : public fft::Plan {
+ public:
+  ROCMFftPlan()
+      : parent_(nullptr),
+        plan_(),
+        fft_type_(fft::Type::kInvalid),
+        scratch_(nullptr),
+        is_initialized_(false) {}
+  ~ROCMFftPlan() override;
+
+  // Get FFT direction in hipFFT based on FFT type.
+  int GetFftDirection() const;
+  hipfftHandle GetPlan() const {
+    if (IsInitialized()) {
+      return plan_;
+    } else {
+      LOG(FATAL) << "Try to get hipfftHandle value before initialization.";
+    }
+  }
+
+  // Initialize function for batched plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, uint64 *input_embed,
+                          uint64 input_stride, uint64 input_distance,
+                          uint64 *output_embed, uint64 output_stride,
+                          uint64 output_distance, fft::Type type,
+                          int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Initialize function for 1d,2d, and 3d plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, fft::Type type,
+                          ScratchAllocator *scratch_allocator);
+
+ protected:
+  bool IsInitialized() const { return is_initialized_; }
+
+ private:
+  GpuExecutor *parent_;
+  hipfftHandle plan_;
+  fft::Type fft_type_;
+  DeviceMemory<uint8> scratch_;
+  bool is_initialized_;
+};
+
+// FFT support for ROCM platform via rocFFT library.
+//
+// This satisfies the platform-agnostic FftSupport interface.
+//
+// Note that the hipFFT handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the hipFFT handle when a
+// ROCM context is active.
+//
+// Thread-safe. The ROCM context associated with all operations is the ROCM
+// context of parent_, so all context is explicit.
+class ROCMFft : public fft::FftSupport {
+ public:
+  explicit ROCMFft(GpuExecutor *parent) : parent_(parent) {}
+  ~ROCMFft() override {}
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
+
+ private:
+  GpuExecutor *parent_;
+
+  // Two helper functions that execute dynload::hipfftExec?2?.
+
+  // This is for complex to complex FFT, when the direction is required.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                  FuncT hipfft_exec,
+                                  const DeviceMemory<InputT> &input,
+                                  DeviceMemory<OutputT> *output);
+
+  // This is for complex to real or real to complex FFT, when the direction
+  // is implied.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfft_exec,
+                     const DeviceMemory<InputT> &input,
+                     DeviceMemory<OutputT> *output);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMFft);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f6af695dfdf7ee8623478fbc2250c2fb7d39e64
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -0,0 +1,977 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+
+#include "absl/base/casts.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/mathutil.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/path.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/timer.h"
+
+#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
+#error \
+    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
+#endif
+
+#ifdef __ROCM_RUNTIME_H__
+#error \
+    "ROCM runtime being included into ROCM GPU executor; should be driver only."
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+static GpuEvent* AsGpuEvent(Event* event) {
+  DCHECK(event != nullptr);
+  return static_cast<GpuEvent*>(event->implementation());
+}
+
+// Given a platform-independent timer datatype, returns the internal ROCM
+// platform implementation pointer.
+static GpuTimer* AsGpuTimer(Timer* timer) {
+  DCHECK(timer != nullptr);
+  return static_cast<GpuTimer*>(timer->implementation());
+}
+
+// Given const GPU memory, returns a librocm device pointer datatype, suitable
+// for passing directly to librocm APIs.
+//
+// N.B. we must lose constness in order to pass a suitable type to the existing
+// librocm APIs, so the caller should take care to only pass the result of const
+// GPU memory conversions to librocm functions which will honor constness.
+static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
+  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
+}
+
+// See description on const version above.
+static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
+  return AsROCmDevicePtr(*gpu_mem);
+}
+
+static GpuContext* GetGpuContext(Stream* stream) {
+  return static_cast<GpuExecutor*>(stream->parent()->implementation())
+      ->gpu_context();
+}
+
+GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
+  CHECK(rocm_exec != nullptr);
+  return rocm_exec->gpu_context();
+}
+
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
+}
+
+GpuExecutor::~GpuExecutor() {
+  for (auto& it : disk_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  for (auto& it : in_memory_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  if (context_ != nullptr) {
+    GpuDriver::DestroyContext(context_);
+  }
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
+}
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
+  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(gpu_binary);
+}
+
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
+  auto module_it = gpu_binary_to_module_.find(gpu_binary);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "No loaded  HSACO module for " << gpu_binary;
+    return false;
+  }
+  auto& module = module_it->second.first;
+  auto& refcount = module_it->second.second;
+  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading  HSACO module " << module;
+    GpuDriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  return true;
+}
+
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
+}
+
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
+  device_ordinal_ = device_ordinal;
+
+  auto status = GpuDriver::Init();
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  return GpuDriver::GetGpuISAVersion(&version_, device_);
+}
+
+bool GpuExecutor::FindOnDiskForComputeCapability(
+    absl::string_view filename, absl::string_view canonical_suffix,
+    string* found_filename) const {
+  LOG(FATAL) << "Feature not supported on ROCM platform "
+                "(FindOnDiskForComputeCapability)";
+  return false;
+}
+
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  if (version_ == 0) {
+    return false;
+  }
+
+  string cc_specific =
+      absl::StrCat(filename, ".cc", version_, canonical_suffix);
+  if (port::FileExists(cc_specific).ok()) {
+    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
+            << cc_specific;
+    *found_filename = cc_specific;
+    return true;
+  }
+
+  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
+          << cc_specific;
+  if (port::FileExists(string(filename)).ok()) {
+    *found_filename = string(filename);
+    return true;
+  }
+
+  return false;
+}
+
+// Returns the path to the running executable.
+// N.B. Derived from //knowledge/smalltalk/background_kb.cc
+// Arg: strip_exe: if true, remove the name of the executable itself from the
+//                 returned string. Example: calling this from /usr/bin/foo
+//                 would return /usr/bin.
+static string GetBinaryDir(bool strip_exe) {
+  char exe_path[PATH_MAX] = {0};
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  // Make sure it's null-terminated:
+  exe_path[sizeof(exe_path) - 1] = 0;
+
+  if (strip_exe) {
+    // The exe is the last component of the path, so remove one component.
+    string ret = exe_path;
+    std::vector<string> components = port::Split(exe_path, '/');
+    components.pop_back();
+    return port::Join(components, "/");
+  }
+  return exe_path;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
+  hipModule_t module = nullptr;
+  const string* kernelname;
+
+  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
+  bool has_cubin = spec.has_cuda_cubin_on_disk();
+  if (has_cubin) {
+    on_disk_spec = &spec.cuda_cubin_on_disk();
+  }
+
+  if (on_disk_spec != nullptr) {
+    LOG(WARNING) << "loading ROCM kernel from disk is not supported";
+    return false;
+  } else if (spec.has_cuda_cubin_in_memory()) {
+    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+
+    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
+    mutex_lock lock{in_memory_modules_mu_};
+    module = in_memory_modules_[hsaco];
+
+    if (module == nullptr) {
+      if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
+        LOG(ERROR) << "failed to load HSACO\n";
+        return false;
+      }
+      in_memory_modules_[hsaco] = module;
+    }
+  } else {
+    LOG(WARNING) << "no method of loading ROCM kernel provided";
+    return false;
+  }
+
+  VLOG(2) << "getting function " << *kernelname << " from module " << module;
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    rocm_kernel->gpu_function_ptr())) {
+    return false;
+  }
+
+  // We have to trust the kernel loader spec arity because there doesn't appear
+  // to be a way to reflect on the number of expected arguments w/the ROCM API.
+  rocm_kernel->set_arity(spec.arity());
+
+  KernelMetadata kernel_metadata;
+  if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
+    LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
+  }
+  kernel->set_metadata(kernel_metadata);
+  kernel->set_name(*kernelname);
+  return true;
+}
+
+bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
+                                    KernelMetadata* kernel_metadata) {
+  int value = 0;
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_registers_per_thread(value);
+
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_shared_memory_bytes(value);
+
+  return true;
+}
+
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
+  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
+  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
+  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
+
+  // Only perform/print the occupancy check once.  Even just checking to see
+  // whether we've done an occupancy check on this kernel before isn't free
+  // (because we have to synchronize), so we only do this at -v 2+.
+  if (VLOG_IS_ON(2)) {
+    mutex_lock lock(launched_kernels_mu_);
+    if (!launched_kernels_.count(hipfunc)) {
+      VlogOccupancyInfo(kernel, thread_dims, block_dims);
+      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
+      // expose a kernel/module deallocation method.
+      launched_kernels_.insert(hipfunc);
+    }
+  }
+
+  if (rocm_kernel->GetPreferredCacheConfig() !=
+      KernelCacheConfig::kNoPreference) {
+    GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
+  }
+
+  // prepare kernargs
+  // KernelArgsArrayBase keeps the pointer of arguments
+  // deference them here
+  std::vector<void*> kernargs;
+  KernelArgIterator iter = args.arg_iterator();
+  while (iter.has_next()) {
+    KernelArg arg = iter.next();
+    VLOG(2) << "*(arg.address): "
+            << reinterpret_cast<void*>(
+                   *static_cast<const uint64_t*>(arg.address));
+    kernargs.push_back(
+        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
+  }
+
+  size_t size = sizeof(void*) * kernargs.size();
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+
+  if (!GpuDriver::LaunchKernel(
+          GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
+          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
+          args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
+    LOG(ERROR) << "failed to launch ROCM kernel with args: "
+               << args.number_of_arguments()
+               << "; thread dim: " << thread_dims.ToString()
+               << "; block dim: " << block_dims.ToString();
+    return false;
+  }
+
+  return true;
+}
+
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
+  return 0;
+}
+
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
+  return 0;
+}
+
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the  HSACO binary  as
+  // ModuleHandle::id().
+  hipModule_t hip_module = nullptr;
+  // TODO(ROCm): Need  generic term instead of cubin/cuda/ptx
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromHsaco(
+            reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
+            &hip_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void*>(
+        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
+    return true;
+  } else {
+    LOG(ERROR) << "No HSACO binary found \n";
+    return false;
+  }
+}
+
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
+
+  if (*module == nullptr) {
+    if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
+      LOG(ERROR) << "failed to load : HSACO \n";
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
+  return true;
+}
+
+// This is a non-essential operation; if there's a failure, proceed without
+// logging an error. It's nearly certain that in case of failures, we'd never
+// get here in the first place; these are very low-impact routines.
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
+  // TODO(ROCm) implement this feature in HIP
+}
+
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
+}
+
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
+  // offset and size are in bytes, so char* works as the pointer type.
+  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
+}
+
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
+  // ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
+  if (!mem->is_sub_buffer()) {
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
+  }
+}
+
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
+  if (location == nullptr || size == 0) {
+    LOG(WARNING) << "attempting to register null or zero-sized memory: "
+                 << location << "; size " << size;
+  }
+  VLOG(2) << "registering " << location << " size " << size;
+  return GpuDriver::HostRegister(context_, location, size);
+}
+
+bool GpuExecutor::HostMemoryUnregister(void* location) {
+  VLOG(2) << "unregistering " << location;
+  return GpuDriver::HostUnregister(context_, location);
+}
+
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
+}
+
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), 0x0, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           0x0, size);
+}
+
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    // hipMemset reinterprets "value" as a uint8.
+    uint8 byte_value = static_cast<uint8>(value);
+    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
+                     (byte_value << 8) | byte_value;
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), pattern, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           value, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         host_src, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return Memset32(stream, location, 0x0, size);
+  } else {
+    return Memset(stream, location, 0x0, size);
+  }
+}
+
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+        size % 4 == 0);
+  return GpuDriver::AsynchronousMemsetUint32(
+      context_, AsROCmDevicePtr(location), pattern, size / 4,
+      AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
+  auto callback_ptr = new std::function<void()>([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
+}
+
+/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
+                                                    hipError_t status,
+                                                    void* data) {
+  std::function<void()>* callback =
+      reinterpret_cast<std::function<void()>*>(data);
+  (*callback)();
+  delete callback;
+}
+
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
+}
+
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
+}
+
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
+    return port::Status::OK();
+  } else {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("error recording waiting for ROCM event on stream %p",
+                        stream)};
+  }
+}
+
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
+}
+
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
+}
+
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* rocm_stream = AsGpuStream(stream);
+  if (!rocm_stream->IsIdle()) {
+    LOG(ERROR) << "Deallocating stream with pending work";
+  }
+  rocm_stream->Destroy();
+}
+
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
+}
+
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
+}
+
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
+  if (!ok) {
+    LOG(ERROR) << "failed to record completion event; "
+                  "therefore, failed to create inter-stream dependency";
+    return false;
+  }
+
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
+}
+
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
+}
+
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
+}
+
+blas::BlasSupport* GpuExecutor::CreateBlas() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::BlasFactory> status =
+      registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
+                                                        plugin_config_.blas());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve BLAS factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::DnnFactory> status =
+      registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.dnn());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve DNN factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+fft::FftSupport* GpuExecutor::CreateFft() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::FftFactory> status =
+      registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.fft());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve FFT factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+rng::RngSupport* GpuExecutor::CreateRng() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::RngFactory> status =
+      registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.rng());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve RNG factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+// TODO(rspringer): Remove in b/18544742.
+bool GpuExecutor::SupportsDnn() const { return true; }
+
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
+}
+
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
+}
+
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
+  port::StatusOr<hipSharedMemConfig> rocm_config =
+      GpuDriver::ContextGetSharedMemConfig(context_);
+  if (!rocm_config.ok()) {
+    // Don't log; the failed call will log necessary output.
+    return SharedMemoryConfig::kDefault;
+  }
+
+  switch (rocm_config.ValueOrDie()) {
+    case hipSharedMemBankSizeDefault:
+      return SharedMemoryConfig::kDefault;
+    case hipSharedMemBankSizeFourByte:
+      return SharedMemoryConfig::kFourByte;
+    case hipSharedMemBankSizeEightByte:
+      return SharedMemoryConfig::kEightByte;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration returned: "
+                 << rocm_config.ValueOrDie();
+  }
+}
+
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
+    SharedMemoryConfig config) {
+  hipSharedMemConfig rocm_config;
+  switch (config) {
+    case SharedMemoryConfig::kDefault:
+      rocm_config = hipSharedMemBankSizeDefault;
+      break;
+    case SharedMemoryConfig::kFourByte:
+      rocm_config = hipSharedMemBankSizeFourByte;
+      break;
+    case SharedMemoryConfig::kEightByte:
+      rocm_config = hipSharedMemBankSizeEightByte;
+      break;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration specified: "
+                 << static_cast<int>(config);
+  }
+  return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
+}
+
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
+}
+
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{disk_modules_mu_};
+    for (auto& it : disk_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    for (auto& it : in_memory_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    if (static_cast<bool>(module_handle)) {
+      auto it = gpu_binary_to_module_.find(module_handle.id());
+      CHECK(it != gpu_binary_to_module_.end());
+      if (GpuDriver::GetModuleSymbol(
+              context_, it->second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+
+    for (auto& it : gpu_binary_to_module_) {
+      if (GpuDriver::GetModuleSymbol(
+              context_, it.second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+  }
+
+  LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
+  return false;
+}
+
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
+  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
+  // we use BlockDims to express the dimensions of blocks within a grid
+  // (as opposed to ThreadDim which expresses the dimensions of threads
+  // within a block).
+  int x, y, z;
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
+    return false;
+  }
+
+  block_dim_limit->x = x;
+  block_dim_limit->y = y;
+  block_dim_limit->z = z;
+  return true;
+}
+
+bool GpuExecutor::SupportsBlas() const { return true; }
+
+bool GpuExecutor::SupportsFft() const { return true; }
+
+bool GpuExecutor::SupportsRng() const { return true; }
+
+std::unique_ptr<internal::EventInterface>
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
+}
+
+std::unique_ptr<internal::KernelInterface>
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
+}
+
+std::unique_ptr<internal::StreamInterface>
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
+}
+
+std::unique_ptr<internal::TimerInterface>
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
+}
+
+void* GpuExecutor::GpuContextHack() { return context_; }
+
+GpuContext* GpuExecutor::gpu_context() { return context_; }
+
+// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
+// of SysFS. Returns -1 if it cannot.
+//
+// For anything more complicated/prod-focused than this, you'll likely want to
+// turn to gsys' topology modeling.
+static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
+  // TODO(ROCm) implement this feature in HIP
+  return 1;
+}
+
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
+  internal::DeviceDescriptionBuilder builder;
+
+  {
+    int driver_version = 0;
+    (void)GpuDriver::GetDriverVersion(&driver_version);
+    string augmented_driver_version = absl::StrFormat(
+        "%d (%s)", driver_version,
+        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
+    builder.set_driver_version(augmented_driver_version);
+  }
+
+  {
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
+
+    // Lower the hex characters to match sysfs.
+    pci_bus_id = port::Lowercase(pci_bus_id);
+    builder.set_pci_bus_id(pci_bus_id);
+
+    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    builder.set_numa_node(numa_node);
+  }
+
+  hipDeviceProp_t prop;
+  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
+    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+
+    ThreadDim thread_dim_limit;
+    thread_dim_limit.x = prop.maxThreadsDim[0];
+    thread_dim_limit.y = prop.maxThreadsDim[1];
+    thread_dim_limit.z = prop.maxThreadsDim[2];
+    builder.set_thread_dim_limit(thread_dim_limit);
+
+    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
+    builder.set_clock_rate_ghz(clock_rate_ghz);
+  }
+
+  {
+    bool ecc_enabled = false;
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
+    builder.set_ecc_enabled(ecc_enabled);
+  }
+
+  {
+    uint64 device_memory_size = -1;
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    builder.set_device_memory_size(device_memory_size);
+  }
+
+  {
+    BlockDim block_dim_limit;
+    FillBlockDimLimit(&block_dim_limit);
+    builder.set_block_dim_limit(block_dim_limit);
+  }
+
+  {
+    string device_name;
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
+    builder.set_name(device_name);
+  }
+
+  builder.set_platform_version(
+      absl::StrCat("AMDGPU ISA version: gfx", version_));
+
+  // TODO(leary) should be a way to query this from the driver, but this is
+  // unlikely to change for us any time soon.
+  builder.set_device_address_bits(64);
+
+  builder.set_device_vendor("Advanced Micro Devices, Inc");
+  builder.set_rocm_amdgpu_isa_version(version_);
+  builder.set_shared_memory_per_core(
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+  builder.set_shared_memory_per_block(
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+  builder.set_core_count(
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
+  builder.set_threads_per_core_limit(
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+  builder.set_registers_per_block_limit(
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+  builder.set_threads_per_warp(
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
+  builder.set_registers_per_core_limit(64 * 1024);
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+}  // namespace gpu
+
+void initialize_rocm_gpu_executor() {
+  *internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
+  };
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
+  stream_executor::initialize_rocm_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/rocm/rocm_kernel.cc b/tensorflow/stream_executor/rocm/rocm_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..162b2bdc71574e7dc30f5a3ed2d5a15a45d97206
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return hipFuncCachePreferNone;
+    case KernelCacheConfig::kPreferShared:
+      return hipFuncCachePreferShared;
+    case KernelCacheConfig::kPreferL1:
+      return hipFuncCachePreferL1;
+    case KernelCacheConfig::kPreferEqual:
+      return hipFuncCachePreferEqual;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.cc b/tensorflow/stream_executor/rocm/rocm_platform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce091658da4db4087f2b1078ad46b67afce5695e
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.cc
@@ -0,0 +1,180 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform.h"
+
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace gpu {
+
+ROCmPlatform::ROCmPlatform()
+    : name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}
+
+ROCmPlatform::~ROCmPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InpectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void ROCmPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node  based on the
+      // first executor we see.
+      min_numa_node_ = exec->GetDeviceDescription().numa_node();
+      limit_numa_node_ = min_numa_node_ + 1;
+    } else {
+      min_numa_node_ =
+          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
+      limit_numa_node_ = std::max(limit_numa_node_,
+                                  exec->GetDeviceDescription().numa_node() + 1);
+    }
+  }
+  initialized = true;
+}
+
+int ROCmPlatform::BusCount() {
+  InspectNumaNodes();
+  return limit_numa_node_ - min_numa_node_;
+}
+
+int ROCmPlatform::DeviceToBus(int device_ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+  return exec->GetDeviceDescription().numa_node() - min_numa_node_;
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
+    int bus_ordinal) {
+  InspectNumaNodes();
+  CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    if (DeviceToBus(i) == bus_ordinal) {
+      StreamExecutorConfig config;
+      config.ordinal = i;
+      return GetExecutor(config).ValueOrDie();
+    }
+  }
+
+  return port::Status{
+      port::error::NOT_FOUND,
+      absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
+}
+
+Platform::Id ROCmPlatform::id() const { return rocm::kROCmPlatformId; }
+
+int ROCmPlatform::VisibleDeviceCount() const {
+  // Throw away the result - it logs internally, and this [containing] function
+  // isn't in the path of user control. It's safe to call this > 1x.
+
+  if (!gpu::GpuDriver::Init().ok()) {
+    return -1;
+  }
+
+  return GpuDriver::GetDeviceCount();
+}
+
+const string& ROCmPlatform::Name() const { return name_; }
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = PluginConfig();
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
+    int device_ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  config.plugin_config = plugin_config;
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  return executor_cache_.GetOrCreate(
+      config, [&]() { return GetUncachedExecutor(config); });
+}
+
+port::StatusOr<std::unique_ptr<StreamExecutor>>
+ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
+  auto init_status = executor->Init(config.ordinal, config.device_options);
+  if (!init_status.ok()) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat(
+            "failed initializing StreamExecutor for ROCM device ordinal %d: %s",
+            config.ordinal, init_status.ToString().c_str())};
+  }
+
+  return std::move(executor);
+}
+
+void ROCmPlatform::RegisterTraceListener(
+    std::unique_ptr<TraceListener> listener) {
+  LOG(FATAL) << "not yet implemented: register ROCM trace listener";
+}
+
+void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
+  LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
+}
+
+}  // namespace gpu
+
+static void InitializeROCmPlatform() {
+  // Disabling leak checking, MultiPlatformManager does not destroy its
+  // registered platforms.
+  auto status = MultiPlatformManager::PlatformWithName("ROCM");
+  if (!status.ok()) {
+    std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
+    SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_platform,
+                            stream_executor::InitializeROCmPlatform());
+
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.h b/tensorflow/stream_executor/rocm/rocm_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..d498e5fdb1e9ef1f31b2fea13625aba995d9acad
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.h
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Opaque and unique identifier for the ROCM platform plugin.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+extern const Platform::Id kROCmPlatformId;
+
+// ROCm-specific platform plugin, registered as a singleton value via module
+// initializer.
+class ROCmPlatform : public Platform {
+ public:
+  ROCmPlatform();
+  ~ROCmPlatform() override;
+
+  // ROCmPlatform-specific functionality
+  // Returns the number of distinct buses / NUMA nodes on the machine.
+  int BusCount();
+
+  // Returns the bus/NUMA node for the specified device ordinal.
+  int DeviceToBus(int device_ordinal);
+
+  // Returns the lowest-ordinal-number StreamExecutor on the specified bus.
+  port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);
+
+  // Platform interface implementation:
+  // Returns the same value as kROCmPlatform above.
+  Platform::Id id() const override;
+
+  // Returns -1 as a sentinel on internal failure (and logs the error).
+  int VisibleDeviceCount() const override;
+
+  const string& Name() const override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& config) override;
+
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
+
+  void UnregisterTraceListener(TraceListener* listener) override;
+
+ private:
+  // Determines the number of NUMA nodes and the assignment of executor to each.
+  void InspectNumaNodes();
+
+  // This platform's name.
+  string name_;
+
+  // mutex that guards internal state.
+  mutable mutex mu_;
+
+  // Cache of created executors.
+  ExecutorCache executor_cache_;
+
+  // The smallest NUMA node value for any device managed by this machine
+  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
+  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense./
+  int min_numa_node_;
+
+  // Larger than the NUMA node value for any device managed by this machine
+  // manager.
+  int limit_numa_node_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.cc b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb07858a96babaed11c991c59ff9644e0933ac6b
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace rocm {
+
+PLATFORM_DEFINE_ID(kROCmPlatformId);
+
+}  // namespace rocm
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.h b/tensorflow/stream_executor/rocm/rocm_platform_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..a17d4f97bbcb91e883f89d107da40aebcb6fba95
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+
+#include "tensorflow/stream_executor/platform.h"
+
+namespace stream_executor {
+namespace rocm {
+
+// Opaque and unique identifier for the ROCm platform.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+// This is broken out here to avoid a circular dependency between ROCmPlatform
+// and ROCmExecutor.
+extern const Platform::Id kROCmPlatformId;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_rng.cc b/tensorflow/stream_executor/rocm/rocm_rng.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99bfc49d10fb0dd71aaf4ead6ece0f9336920545
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_rng.cc
@@ -0,0 +1,325 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/hiprand/hiprand.h"
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+// Formats hiprandStatus_t to output prettified values into a log stream.
+std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
+#define OSTREAM_HIPRAND_STATUS(__name) \
+  case HIPRAND_STATUS_##__name:        \
+    in << "HIPRAND_STATUS_" #__name;   \
+    return in;
+
+  switch (status) {
+    OSTREAM_HIPRAND_STATUS(SUCCESS)
+    OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
+    OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
+    OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
+    OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
+    OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
+    OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
+    OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
+    default:
+      in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
+      return in;
+  }
+}
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                        \
+  struct WrapperShim__##__name {                                    \
+    template <typename... Args>                                     \
+    hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};               \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocrandDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocrand DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hiprandStatus operator()(GpuExecutor* parent, Args... args) {         \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandCreateGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandDestroyGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetStream);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniform);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniformDouble);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetGeneratorOffset);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormal);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormalDouble);
+
+}  // namespace wrap
+
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
+
+GpuRng::~GpuRng() {
+  if (rng_ != nullptr) {
+    wrap::hiprandDestroyGenerator(parent_, rng_);
+  }
+}
+
+bool GpuRng::Init() {
+  mutex_lock lock{mu_};
+  CHECK(rng_ == nullptr);
+
+  hiprandStatus_t ret =
+      wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to create random number generator: " << ret;
+    return false;
+  }
+
+  CHECK(rng_ != nullptr);
+  return true;
+}
+
+bool GpuRng::SetStream(Stream* stream) {
+  hiprandStatus_t ret =
+      wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set stream for random generation: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+// Returns true if std::complex stores its contents as two consecutive
+// elements. Tests int, float and double, as the last two are independent
+// specializations.
+constexpr bool ComplexIsConsecutiveFloats() {
+  return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
+         sizeof(std::complex<double>) == 16;
+}
+
+template <typename T>
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
+  mutex_lock lock{mu_};
+  static_assert(ComplexIsConsecutiveFloats(),
+                "std::complex values are not stored as consecutive values");
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // std::complex<T> is currently implemented as two consecutive T variables.
+  uint64 element_count = v->ElementCount();
+  if (std::is_same<T, std::complex<float>>::value ||
+      std::is_same<T, std::complex<double>>::value) {
+    element_count *= 2;
+  }
+
+  hiprandStatus_t ret;
+  if (std::is_same<T, float>::value ||
+      std::is_same<T, std::complex<float>>::value) {
+    ret = wrap::hiprandGenerateUniform(
+        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
+        element_count);
+  } else {
+    ret = wrap::hiprandGenerateUniformDouble(
+        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
+        element_count);
+  }
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
+               << " " << TypeString<T>() << "s at " << v->opaque() << ": "
+               << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+template <typename ElemT, typename FuncT>
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
+  mutex_lock lock{mu_};
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  uint64 element_count = v->ElementCount();
+  hiprandStatus_t ret =
+      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
+
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
+               << " floats at " << v->opaque() << ": " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormal);
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormalDouble);
+}
+
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
+  mutex_lock lock{mu_};
+  CHECK(rng_ != nullptr);
+
+  if (!CheckSeed(seed, seed_bytes)) {
+    return false;
+  }
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
+  // (which itself requires 16 for API consistency with host RNG fallbacks).
+  hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
+      parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set rng seed: " << ret;
+    return false;
+  }
+
+  ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to reset rng position: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace gpu
+
+void initialize_rocrand() {
+  auto rocRandAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
+
+  if (!rocRandAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+            rocm::kROCmPlatformId, gpu::kGpuRandPlugin, "rocRAND",
+            [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+              gpu::GpuExecutor* rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor*>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the hipRAND "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              gpu::GpuRng* rng = new gpu::GpuRng(rocm_executor);
+              if (!rng->Init()) {
+                // Note: Init() will log a more specific error.
+                delete rng;
+                return nullptr;
+              }
+              return rng;
+            });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocRAND factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocrand,
+                            { stream_executor::initialize_rocrand(); });
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index e7485ca426bc8108cc7a376906c6624c7cae5600..2577d3825fb448ac332d109990b3d556c4535835 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -281,6 +281,12 @@ Stream::~Stream() {
   }
 }
 
+port::Status Stream::RefreshStatus() {
+  port::Status status = parent_->GetStatus(this);
+  CheckStatus(status);
+  return status;
+}
+
 Stream &Stream::Init() {
   VLOG_CALL();
 
@@ -431,172 +437,6 @@ Stream &Stream::ThenBatchNormalizationBackward(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<Eigen::half> &biases,
-    dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<float> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<float> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<Eigen::half> &input_data,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::FORWARD, this, input_descriptor, input_data,
-          filter_descriptor, filter_data, output_descriptor, *output,
-          convolution_descriptor, dnn::AlgorithmConfig(), scratch_allocator,
-          &algorithm_desc, &scratch_memory));
-      CheckError(dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, algorithm_desc,
-          &scratch_memory, nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::FORWARD, this, input_descriptor, input_data,
-          filter_descriptor, filter_data, output_descriptor, *output,
-          convolution_descriptor, dnn::AlgorithmConfig(), scratch_allocator,
-          &algorithm_desc, &scratch_memory));
-      CheckError(dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, algorithm_desc,
-          &scratch_memory, nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenFusedConvolveWithAlgorithm(
     const dnn::BatchDescriptor &conv_input_descriptor,
     const DeviceMemory<double> &conv_input_data, double conv_input_scale,
@@ -876,24 +716,6 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolve(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output) {
-  return ThenFusedConvolveWithScratch(
-      conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output,
-      /*scratch_allocator=*/nullptr);
-}
-
 Stream &Stream::ThenConvolve(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<float> &input_data,
@@ -902,10 +724,11 @@ Stream &Stream::ThenConvolve(
     const dnn::ConvolutionDescriptor &convolution_descriptor,
     const dnn::BatchDescriptor &output_descriptor,
     DeviceMemory<float> *output) {
-  return ThenConvolveWithScratch(input_descriptor, input_data,
-                                 filter_descriptor, filter_data,
-                                 convolution_descriptor, output_descriptor,
-                                 output, /*scratch_allocator=*/nullptr);
+  return ThenConvolveWithAlgorithm(
+      input_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output,
+      /*scratch_allocator=*/nullptr, dnn::AlgorithmConfig(),
+      /*output_profile_result=*/nullptr);
 }
 
 Stream &Stream::ThenConvolveQuantized(
@@ -995,42 +818,6 @@ Stream &Stream::ThenSeparableConvolve(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardDataWithScratch(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<float> *backward_input_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(input_descriptor),
-            PARAM(backward_input_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
-          *backward_input_data, filter_descriptor, filter_data,
-          output_descriptor, backward_output_data, convolution_descriptor,
-          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
-          &scratch_memory));
-      CheckError(dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, algorithm_desc, &scratch_memory,
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<double> &filter_data,
@@ -1166,92 +953,6 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardDataWithScratch(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<Eigen::half> *backward_input_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(input_descriptor),
-            PARAM(backward_input_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
-          *backward_input_data, filter_descriptor, filter_data,
-          output_descriptor, backward_output_data, convolution_descriptor,
-          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
-          &scratch_memory));
-      CheckError(dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, algorithm_desc, &scratch_memory,
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveBackwardData(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<float> *backward_input_data) {
-  return ThenConvolveBackwardDataWithScratch(
-      filter_descriptor, filter_data, output_descriptor, backward_output_data,
-      convolution_descriptor, input_descriptor, backward_input_data,
-      /*scratch_allocator=*/nullptr);
-}
-
-Stream &Stream::ThenConvolveBackwardFilterWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<float> *backward_filter_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(filter_descriptor),
-            PARAM(backward_filter_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
-          input_data, filter_descriptor, *backward_filter_data,
-          output_descriptor, backward_output_data, convolution_descriptor,
-          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
-          &scratch_memory));
-      CheckError(dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, algorithm_desc, &scratch_memory,
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<double> &input_data,
@@ -1342,42 +1043,6 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardFilterWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<Eigen::half> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<Eigen::half> *backward_filter_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(filter_descriptor),
-            PARAM(backward_filter_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      DeviceMemory<uint8> scratch_memory;
-      dnn::AlgorithmDesc algorithm_desc;
-      CheckStatus(dnn->PrepareForConvolution(
-          dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
-          input_data, filter_descriptor, *backward_filter_data,
-          output_descriptor, backward_output_data, convolution_descriptor,
-          dnn::AlgorithmConfig(), scratch_allocator, &algorithm_desc,
-          &scratch_memory));
-      CheckError(dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, algorithm_desc, &scratch_memory,
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<Eigen::half> &input_data,
@@ -1423,20 +1088,6 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardFilter(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<float> *backward_filter_data) {
-  return ThenConvolveBackwardFilterWithScratch(
-      input_descriptor, input_data, output_descriptor, backward_output_data,
-      convolution_descriptor, filter_descriptor, backward_filter_data,
-      /*scratch_allocator=*/nullptr);
-}
-
 template <typename T>
 Stream &Stream::ThenConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor &input_descriptor,
@@ -1742,22 +1393,6 @@ Stream &Stream::ThenPoolBackward(
   return *this;
 }
 
-Stream &Stream::ThenNormalize(
-    const dnn::NormalizeDescriptor &normalize_descriptor,
-    const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) {
-  VLOG_CALL(PARAM(normalize_descriptor), PARAM(input_data), PARAM(output_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data,
-                                  output_data));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenNormalizeWithDimensions(
     const dnn::NormalizeDescriptor &normalize_descriptor,
     const dnn::BatchDescriptor &dimensions,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index f698d50e2a8179cdc1a376c279a490d8c377cd8c..3e67d55922d1d4fa231b8f30caa95c2b782efbd8 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -109,6 +109,17 @@ class Stream {
   // stream.
   bool ok() const { return !InErrorState(); }
 
+  // Retrieves execution status back into the stream from the underlying
+  // implementation without blocking the stream.
+  //
+  // Normally, Stream::BlockHostUntilDone is used to get execution status.
+  // However, some devices use out-of-band mechnanisms to ensure their streams
+  // have finished on-device work, without needing to block the streams. (These
+  // devices should also override AllowsSyncOnCompletion to return false.) For
+  // these devices, this method can be used after work is finished to retrieve
+  // execution status.
+  port::Status RefreshStatus() LOCKS_EXCLUDED(mu_);
+
   // Initialize the stream. This must be performed before entraining any other
   // operations.
   Stream &Init() LOCKS_EXCLUDED(mu_);
@@ -262,19 +273,6 @@ class Stream {
       DeviceMemory<float> *scale_backprop,
       DeviceMemory<float> *offset_backprop);
 
-  // TODO(leary) add double-precision version of this interface.
-  Stream &ThenFusedConvolve(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<int8> *output);
-
   Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
                        const DeviceMemory<float> &input_data,
                        const dnn::FilterDescriptor &filter_descriptor,
@@ -303,61 +301,6 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output_data);
 
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<Eigen::half> &biases,
-      dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<float> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<float> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<Eigen::half> &input_data,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<double> &input_data,
@@ -458,35 +401,6 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output);
 
-  Stream &ThenConvolveBackwardData(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<float> *backward_input_data);
-
-  Stream &ThenConvolveBackwardDataWithScratch(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<float> *backward_input_data,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveBackwardDataWithScratch(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<Eigen::half> *backward_input_data,
-      ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveBackwardDataWithAlgorithm(
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<double> &filter_data,
@@ -523,35 +437,6 @@ class Stream {
       const dnn::AlgorithmConfig &algorithm_config,
       dnn::ProfileResult *output_profile_result);
 
-  Stream &ThenConvolveBackwardFilter(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<float> *backward_filter_data);
-
-  Stream &ThenConvolveBackwardFilterWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<float> *backward_filter_data,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveBackwardFilterWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<Eigen::half> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<Eigen::half> *backward_filter_data,
-      ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveBackwardFilterWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<double> &input_data,
@@ -684,12 +569,6 @@ class Stream {
                            DeviceMemory<Eigen::half> *output_diff_data,
                            ScratchAllocator *workspace_allocator = nullptr);
 
-  Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
-                        const DeviceMemory<float> &input_data,
-                        DeviceMemory<float> *output_data);
-
-  // Similar to ThenNormalize, but normalizes across feature maps and allows for
-  // specifying the dimensions of the tensor.
   Stream &ThenNormalizeWithDimensions(
       const dnn::NormalizeDescriptor &normalize_descriptor,
       const dnn::BatchDescriptor &dimensions,
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 341c6edccd3c1bfd314127c5356f03a15a85e1d3..46afedef3316bcd6b23c6f7b081af10db43d58f6 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
   return &instance;
 }
 
+// -- ROCm
+
+StreamExecutorFactory* MakeROCMExecutorImplementation() {
+  static StreamExecutorFactory instance;
+  return &instance;
+}
+
 // -- OpenCL
 
 StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 0c2c33cfca227b2d67fcdc633dd94274a65b92bb..36eabda459cdec634de8542961d24942e943d4d1 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -27,6 +27,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/allocator_stats.h"
 #include "tensorflow/stream_executor/device_description.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/device_options.h"
@@ -253,6 +255,10 @@ class StreamExecutorInterface {
   virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
   virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
   virtual port::Status BlockHostUntilDone(Stream *stream) = 0;
+  virtual port::Status GetStatus(Stream *stream) {
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "GetStatus is not supported on this executor.");
+  }
   virtual int PlatformDeviceCount() = 0;
   virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
@@ -363,6 +369,11 @@ class StreamExecutorInterface {
   // as a platform.
   virtual void *GpuContextHack() { return nullptr; }
 
+  // Return allocator statistics.
+  virtual absl::optional<AllocatorStats> GetAllocatorStats() {
+    return absl::nullopt;
+  }
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
 };
@@ -374,9 +385,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
 using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
 using KernelFactory = std::function<KernelInterface*()>;
 
-StreamExecutorFactory* MakeCUDAExecutorImplementation();
+StreamExecutorFactory *MakeCUDAExecutorImplementation();
+
+StreamExecutorFactory *MakeROCMExecutorImplementation();
 
-StreamExecutorFactory* MakeOpenCLExecutorImplementation();
+StreamExecutorFactory *MakeOpenCLExecutorImplementation();
 
 extern StreamExecutorFactory MakeHostExecutorImplementation;
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 439c73ec8f61388cd3d02283bd1724cdf69b04e4..2870c3883e2527888ad2aa93c01f2032aa49ba48 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
     case PlatformKind::kCuda:
       factory = *internal::MakeCUDAExecutorImplementation();
       break;
+    case PlatformKind::kROCm:
+      factory = *internal::MakeROCMExecutorImplementation();
+      break;
     case PlatformKind::kOpenCL:
       factory = *internal::MakeOpenCLExecutorImplementation();
       break;
@@ -188,6 +191,8 @@ StreamExecutor::StreamExecutor(
       memory_limit_bytes_(GetMemoryLimitBytes()) {
   if (port::Lowercase(platform_->Name()) == "cuda") {
     platform_kind_ = PlatformKind::kCuda;
+  } else if (port::Lowercase(platform_->Name()) == "rocm") {
+    platform_kind_ = PlatformKind::kROCm;
   } else if (port::Lowercase(platform_->Name()) == "opencl") {
     platform_kind_ = PlatformKind::kOpenCL;
   } else if (port::Lowercase(platform_->Name()) == "host") {
@@ -406,14 +411,16 @@ StreamExecutor::createRnnSequenceTensorDescriptor(int max_seq_length,
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
 StreamExecutor::createRnnSequenceTensorDescriptor(
     int max_seq_length, int batch_size, int data_size,
-    const absl::Span<const int> &seq_lengths, dnn::DataType data_type) {
+    const absl::Span<const int> &seq_lengths, bool time_major,
+    dnn::DataType data_type) {
   dnn::DnnSupport *dnn_support = AsDnn();
   if (!dnn_support) {
     return port::Status(port::error::UNKNOWN,
                         "Fail to find the dnn implementation.");
   }
   return dnn_support->createRnnSequenceTensorDescriptor(
-      max_seq_length, batch_size, data_size, seq_lengths, data_type);
+      max_seq_length, batch_size, data_size, seq_lengths, time_major,
+      data_type);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
@@ -487,6 +494,10 @@ port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
   return result;
 }
 
+port::Status StreamExecutor::GetStatus(Stream *stream) {
+  return implementation_->GetStatus(stream);
+}
+
 void *StreamExecutor::Allocate(uint64 size) {
   if (memory_limit_bytes_ > 0 &&
       mem_alloc_bytes_ + size > memory_limit_bytes_) {
@@ -862,6 +873,10 @@ bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
   return true;
 }
 
+absl::optional<AllocatorStats> StreamExecutor::GetAllocatorStats() {
+  return implementation_->GetAllocatorStats();
+}
+
 template <typename TraceCallT, typename... ArgsT>
 void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
   if (tracing_enabled_) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ad2bc3c733b12886a96f469b80016c1d0865691e..7ded071467fc7e49e30e89cb2abfe8104584b6a7 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/macros.h"
+#include "absl/types/optional.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
@@ -420,7 +421,7 @@ class StreamExecutor {
   createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size,
                                     const absl::Span<const int> &seq_lengths,
-                                    dnn::DataType data_type);
+                                    bool time_major, dnn::DataType data_type);
 
   // Create an RNN state descriptor that specifies the input or hidden state.
   // The caller retains the ownership of the returned descriptor.
@@ -485,6 +486,9 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
+  // Return allocator statistics.
+  absl::optional<AllocatorStats> GetAllocatorStats();
+
  private:
   template <typename BeginCallT, typename CompleteCallT,
             typename ReturnT, typename... BeginArgsT>
@@ -524,6 +528,9 @@ class StreamExecutor {
   // operations enqueued on the stream before this program point.
   port::Status BlockHostUntilDone(Stream *stream);
 
+  // Without blocking the device, retrieve the current stream status.
+  port::Status GetStatus(Stream *stream);
+
   // Synchronously allocates size bytes on the underlying platform and returns
   // an opaque void* representing that allocation. In the case of failure,
   // nullptr is returned.
@@ -853,7 +860,7 @@ DeviceMemory<T> StreamExecutor::AllocateSubBuffer(DeviceMemory<T> *parent,
   }
   CreateAllocRecord(opaque, sizeof(T) * element_count);
   return DeviceMemory<T>(DeviceMemoryBase(opaque, sizeof(T) * element_count,
-                                    true /* = is_sub_buffer */));
+                                          true /* = is_sub_buffer */));
 }
 
 template <typename... Params, typename... Args>
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 37d52cdb394b519a64775f3ee866fa08a931bee6..8c5b02eba5e6b2d2a536717c9a55f988f50a1bd0 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -10,6 +10,7 @@ load(
     "tf_additional_xla_deps_py",
     "tf_cuda_tests_tags",
     "tf_exec_compatible_with",
+    "tf_gpu_tests_tags",
     "tf_sycl_tests_tags",
 )
 load(
@@ -45,6 +46,7 @@ load(
     "//third_party/ngraph:build_defs.bzl",
     "if_ngraph",
 )
+
 def register_extension_info(**kwargs):
     pass
 
@@ -97,6 +99,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
         for p in core_proto_sources_relative
     ])
 
+# Wrapper for portable protos which currently just creates an empty rule.
+def tf_portable_proto_library(name, proto_deps, **kwargs):
+    _ignore = [kwargs]
+    native.cc_library(name = name, deps = proto_deps)
+
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
@@ -146,6 +153,18 @@ def if_android(a):
         "//conditions:default": [],
     })
 
+def if_emscripten(a):
+    return select({
+        clean_dep("//tensorflow:emscripten"): a,
+        "//conditions:default": [],
+    })
+
+def if_macos(a):
+    return select({
+        clean_dep("//tensorflow:macos"): a,
+        "//conditions:default": [],
+    })
+
 def if_ios(a):
     return select({
         clean_dep("//tensorflow:ios"): a,
@@ -192,6 +211,12 @@ def if_windows(a, otherwise = []):
         "//conditions:default": otherwise,
     })
 
+def if_windows_cuda(a, otherwise = []):
+    return select({
+        clean_dep("//tensorflow:with_cuda_support_windows_override"): a,
+        "//conditions:default": otherwise,
+    })
+
 def if_not_windows_cuda(a):
     return select({
         clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
@@ -204,12 +229,6 @@ def if_linux_x86_64(a):
         "//conditions:default": [],
     })
 
-def if_darwin(a):
-    return select({
-        clean_dep("//tensorflow:darwin"): a,
-        "//conditions:default": [],
-    })
-
 def if_override_eigen_strong_inline(a):
     return select({
         clean_dep("//tensorflow:override_eigen_strong_inline"): a,
@@ -283,7 +302,7 @@ def tf_copts(android_optimization_level_override = "-O2", is_external = False):
         }) +
         select({
             clean_dep("//tensorflow:android"): android_copts,
-            clean_dep("//tensorflow:darwin"): [],
+            clean_dep("//tensorflow:macos"): [],
             clean_dep("//tensorflow:windows"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
             clean_dep("//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"],
@@ -306,9 +325,19 @@ def tf_opts_nortti_if_android():
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
+def tf_opts_nortti_if_emscripten():
+    return if_emscripten([
+        "-fno-rtti",
+        "-DGOOGLE_PROTOBUF_NO_RTTI",
+        "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
+    ])
+
 def tf_features_nomodules_if_android():
     return if_android(["-use_header_modules"])
 
+def tf_features_nomodules_if_emscripten():
+    return if_emscripten(["-use_header_modules"])
+
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
 def tf_gen_op_libs(op_lib_names, deps = None, is_external = True):
@@ -345,7 +374,7 @@ def _rpath_linkopts(name):
     # directory in the tensorflow/ tree.
     levels_to_root = native.package_name().count("/") + name.count("/")
     return select({
-        clean_dep("//tensorflow:darwin"): [
+        clean_dep("//tensorflow:macos"): [
             "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
         ],
         clean_dep("//tensorflow:windows"): [],
@@ -364,16 +393,13 @@ def tf_binary_additional_srcs():
         ],
     )
 
-def _linux_kernel_dso_name(kernel_build_target):
-    """Given a build target, construct the dso name for linux."""
-    parts = kernel_build_target.split(":")
-    return "%s:libtfkernel_%s.so" % (parts[0], parts[1])
-
 # Helper functions to add kernel dependencies to tf binaries when using dynamic
 # kernel linking.
-def tf_binary_dynamic_kernel_dsos(kernels):
+def tf_binary_dynamic_kernel_dsos():
     return if_dynamic_kernels(
-        extra_deps = [_linux_kernel_dso_name(k) for k in kernels],
+        extra_deps = [
+            "//tensorflow/core/kernels:libtfkernel_all_kernels.so",
+        ],
         otherwise = [],
     )
 
@@ -385,6 +411,17 @@ def tf_binary_dynamic_kernel_deps(kernels):
         otherwise = kernels,
     )
 
+# Shared libraries have different name pattern on different platforms,
+# but cc_binary cannot output correct artifact name yet,
+# so we generate multiple cc_binary targets with all name patterns when necessary.
+# TODO(pcloudy): Remove this workaround when https://github.com/bazelbuild/bazel/issues/4570
+# is done and cc_shared_library is available.
+SHARED_LIBRARY_NAME_PATTERNS = [
+    "lib%s.so",  # On Linux, shared libraries are usually named as libfoo.so
+    "lib%s.dylib",  # On macos, shared libraries are usually named as libfoo.dylib
+    "%s.dll",  # On Windows, shared libraries are usually named as foo.dll
+]
+
 def tf_cc_shared_object(
         name,
         srcs = [],
@@ -393,24 +430,42 @@ def tf_cc_shared_object(
         linkopts = [],
         framework_so = tf_binary_additional_srcs(),
         kernels = [],
+        per_os_targets = False,  # Generate targets with SHARED_LIBRARY_NAME_PATTERNS
+        visibility = None,
         **kwargs):
-    native.cc_binary(
-        name = name,
-        srcs = srcs + framework_so,
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels),
-        linkshared = 1,
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
-        linkopts = linkopts + _rpath_linkopts(name) + select({
-            clean_dep("//tensorflow:darwin"): [
-                "-Wl,-install_name,@rpath/" + name.split("/")[-1],
-            ],
-            clean_dep("//tensorflow:windows"): [],
-            "//conditions:default": [
-                "-Wl,-soname," + name.split("/")[-1],
-            ],
-        }),
-        **kwargs
-    )
+    if per_os_targets:
+        names = [pattern % name for pattern in SHARED_LIBRARY_NAME_PATTERNS]
+    else:
+        names = [name]
+    for name_os in names:
+        native.cc_binary(
+            name = name_os,
+            srcs = srcs + framework_so,
+            deps = deps,
+            linkshared = 1,
+            data = data,
+            linkopts = linkopts + _rpath_linkopts(name_os) + select({
+                clean_dep("//tensorflow:macos"): [
+                    "-Wl,-install_name,@rpath/" + name_os.split("/")[-1],
+                ],
+                clean_dep("//tensorflow:windows"): [],
+                "//conditions:default": [
+                    "-Wl,-soname," + name_os.split("/")[-1],
+                ],
+            }),
+            visibility = visibility,
+            **kwargs
+        )
+    if name not in names:
+        native.filegroup(
+            name = name,
+            srcs = select({
+                "//tensorflow:windows": [":%s.dll" % name],
+                "//tensorflow:macos": [":lib%s.dylib" % name],
+                "//conditions:default": [":lib%s.so" % name],
+            }),
+            visibility = visibility,
+        )
 
 register_extension_info(
     extension_name = "tf_cc_shared_object",
@@ -429,20 +484,43 @@ def tf_cc_binary(
         linkopts = [],
         copts = tf_copts(),
         kernels = [],
+        per_os_targets = False,  # Generate targets with SHARED_LIBRARY_NAME_PATTERNS
+        visibility = None,
         **kwargs):
-    native.cc_binary(
-        name = name,
-        copts = copts,
-        srcs = srcs + tf_binary_additional_srcs(),
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
-            [
-                clean_dep("//third_party/mkl:intel_binary_blob"),
-            ],
-        ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
-        linkopts = linkopts + _rpath_linkopts(name),
-        **kwargs
-    )
+    if kernels:
+        added_data_deps = tf_binary_dynamic_kernel_dsos()
+    else:
+        added_data_deps = []
+
+    if per_os_targets:
+        names = [pattern % name for pattern in SHARED_LIBRARY_NAME_PATTERNS]
+    else:
+        names = [name]
+    for name_os in names:
+        native.cc_binary(
+            name = name_os,
+            copts = copts,
+            srcs = srcs + tf_binary_additional_srcs(),
+            deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
+                [
+                    clean_dep("//third_party/mkl:intel_binary_blob"),
+                ],
+            ),
+            data = depset(data + added_data_deps),
+            linkopts = linkopts + _rpath_linkopts(name_os),
+            visibility = visibility,
+            **kwargs
+        )
+    if name not in names:
+        native.filegroup(
+            name = name,
+            srcs = select({
+                "//tensorflow:windows": [":%s.dll" % name],
+                "//tensorflow:macos": [":lib%s.dylib" % name],
+                "//conditions:default": [":lib%s.so" % name],
+            }),
+            visibility = visibility,
+        )
 
 register_extension_info(
     extension_name = "tf_cc_binary",
@@ -591,6 +669,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -607,6 +686,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -772,7 +852,7 @@ def tf_cc_test(
                 "-pie",
             ],
             clean_dep("//tensorflow:windows"): [],
-            clean_dep("//tensorflow:darwin"): [
+            clean_dep("//tensorflow:macos"): [
                 "-lm",
             ],
             "//conditions:default": [
@@ -785,7 +865,7 @@ def tf_cc_test(
                 clean_dep("//third_party/mkl:intel_binary_blob"),
             ],
         ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         exec_compatible_with = tf_exec_compatible_with(kwargs),
         # Nested select() statements seem not to be supported when passed to
         # linkstatic, and we already have a cuda select() passed in to this
@@ -794,7 +874,7 @@ def tf_cc_test(
             # cc_tests with ".so"s in srcs incorrectly link on Darwin unless
             # linkstatic=1 (https://github.com/bazelbuild/bazel/issues/3450).
             # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-            clean_dep("//tensorflow:darwin"): 1,
+            clean_dep("//tensorflow:macos"): 1,
             "//conditions:default": 0,
         }),
         nocopts = nocopts,
@@ -835,7 +915,7 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_cc_test(
+def tf_gpu_cc_test(
         name,
         srcs = [],
         deps = [],
@@ -871,24 +951,35 @@ def tf_cuda_cc_test(
         linkopts = linkopts,
         linkstatic = select({
             # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-            clean_dep("//tensorflow:darwin"): 1,
+            clean_dep("//tensorflow:macos"): 1,
             "@local_config_cuda//cuda:using_nvcc": 1,
             "@local_config_cuda//cuda:using_clang": 1,
             "//conditions:default": 0,
         }),
         suffix = "_gpu",
-        tags = tags + tf_cuda_tests_tags(),
-        deps = deps + if_cuda([
+        tags = tags + tf_gpu_tests_tags(),
+        deps = deps + if_cuda_is_configured([
+            clean_dep("//tensorflow/core:gpu_runtime"),
+        ]) + if_rocm_is_configured([
             clean_dep("//tensorflow/core:gpu_runtime"),
         ]),
     )
 
+register_extension_info(
+    extension_name = "tf_gpu_cc_test",
+    label_regex_for_dep = "{extension_name}",
+)
+
+# terminology changes: saving tf_cuda_* definition for compatibility
+def tf_cuda_cc_test(*args, **kwargs):
+    tf_gpu_cc_test(*args, **kwargs)
+
 register_extension_info(
     extension_name = "tf_cuda_cc_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_only_cc_test(
+def tf_gpu_only_cc_test(
         name,
         srcs = [],
         deps = [],
@@ -899,7 +990,7 @@ def tf_cuda_only_cc_test(
         args = [],
         kernels = [],
         linkopts = []):
-    tags = tags + tf_cuda_tests_tags()
+    tags = tags + tf_gpu_tests_tags()
     native.cc_test(
         name = "%s%s" % (name, "_gpu"),
         srcs = srcs + tf_binary_additional_srcs(),
@@ -907,7 +998,7 @@ def tf_cuda_only_cc_test(
         args = args,
         copts = _cuda_copts() + rocm_copts() + tf_copts(),
         features = if_cuda(["-use_header_modules"]),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda_is_configured([
             clean_dep("//tensorflow/core:cuda"),
             clean_dep("//tensorflow/core:gpu_lib"),
@@ -919,13 +1010,22 @@ def tf_cuda_only_cc_test(
             # cc_tests with ".so"s in srcs incorrectly link on Darwin
             # unless linkstatic=1.
             # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-            clean_dep("//tensorflow:darwin"): 1,
+            clean_dep("//tensorflow:macos"): 1,
             "//conditions:default": 0,
         }),
         tags = tags,
         exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
     )
 
+register_extension_info(
+    extension_name = "tf_gpu_only_cc_test",
+    label_regex_for_dep = "{extension_name}_gpu",
+)
+
+# terminology changes: saving tf_cuda_* definition for compatibility
+def tf_cuda_only_cc_test(*args, **kwargs):
+    tf_gpu_only_cc_test(*args, **kwargs)
+
 register_extension_info(
     extension_name = "tf_cuda_only_cc_test",
     label_regex_for_dep = "{extension_name}_gpu",
@@ -986,7 +1086,7 @@ def tf_cc_test_mkl(
                 ],
             }) + _rpath_linkopts(src_to_test_name(src)),
             deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
-            data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            data = data + tf_binary_dynamic_kernel_dsos(),
             exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
             linkstatic = linkstatic,
             tags = tags,
@@ -1007,7 +1107,7 @@ def tf_cc_tests_gpu(
         args = None):
     tf_cc_tests(srcs, deps, linkstatic, size = size, args = args, kernels = kernels, tags = tags)
 
-def tf_cuda_cc_tests(
+def tf_gpu_cc_tests(
         srcs,
         deps,
         name = "",
@@ -1018,7 +1118,7 @@ def tf_cuda_cc_tests(
         kernels = [],
         linkopts = []):
     for src in srcs:
-        tf_cuda_cc_test(
+        tf_gpu_cc_test(
             name = src_to_test_name(src),
             size = size,
             srcs = [src],
@@ -1030,6 +1130,10 @@ def tf_cuda_cc_tests(
             deps = deps,
         )
 
+# terminology changes: saving tf_cuda_* definition for compatibility
+def tf_cuda_cc_tests(*args, **kwargs):
+    tf_gpu_cc_tests(*args, **kwargs)
+
 def tf_java_test(
         name,
         srcs = [],
@@ -1040,7 +1144,7 @@ def tf_java_test(
     native.java_test(
         name = name,
         srcs = srcs,
-        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels),
         *args,
         **kwargs
     )
@@ -1106,7 +1210,7 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
+def tf_gpu_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
     """Generate a cc_library with a conditional set of CUDA dependencies.
 
     When the library is built with --config=cuda:
@@ -1132,15 +1236,25 @@ def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs)
     kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
     native.cc_library(
         deps = deps + if_cuda_is_configured_compat(cuda_deps + [
-            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/stream_executor/cuda:cudart_stub"),
             "@local_config_cuda//cuda:cuda_headers",
         ]) + if_rocm_is_configured(cuda_deps + [
             # rocm_header placeholder
+            # rocm_header placeholder
         ]),
         copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
         **kwargs
     )
 
+register_extension_info(
+    extension_name = "tf_gpu_library",
+    label_regex_for_dep = "{extension_name}",
+)
+
+# terminology changes: saving tf_cuda_* definition for compatibility
+def tf_cuda_library(*args, **kwargs):
+    tf_gpu_library(*args, **kwargs)
+
 register_extension_info(
     extension_name = "tf_cuda_library",
     label_regex_for_dep = "{extension_name}",
@@ -1159,7 +1273,7 @@ def tf_kernel_library(
         **kwargs):
     """A rule to build a TensorFlow OpKernel.
 
-    May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
+    May either specify srcs/hdrs or prefix.  Similar to tf_gpu_library,
     but with alwayslink=1 by default.  If prefix is specified:
       * prefix*.cc (except *.cu.cc) is added to srcs
       * prefix*.h (except *.cu.h) is added to hdrs
@@ -1230,7 +1344,7 @@ def tf_kernel_library(
         "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
         "req_dep=@local_config_cuda//cuda:cuda_headers",
     ]
-    tf_cuda_library(
+    tf_gpu_library(
         name = name,
         srcs = srcs,
         hdrs = hdrs,
@@ -1331,7 +1445,7 @@ def _py_wrap_cc_impl(ctx):
     args += ["-I" + i for i in swig_include_dirs.to_list()]
     args += [src.path]
     outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
-    ctx.action(
+    ctx.actions.run(
         executable = ctx.executable._swig,
         arguments = args,
         inputs = inputs.to_list(),
@@ -1440,7 +1554,7 @@ def cc_header_only_library(name, deps = [], includes = [], extra_deps = [], **kw
 
 def tf_custom_op_library_additional_deps():
     return [
-      "@protobuf_archive//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
         clean_dep("//third_party/eigen3"),
         clean_dep("//tensorflow/core:framework_headers_lib"),
     ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
@@ -1450,8 +1564,8 @@ def tf_custom_op_library_additional_deps():
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
     return [
-      "@protobuf_archive//:protobuf",
-      "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+        "@nsync//:nsync_cpp",
         # for //third_party/eigen3
         clean_dep("//third_party/eigen3"),
         # for //tensorflow/core:framework_headers_lib
@@ -1565,7 +1679,7 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
                 "-lm",
             ],
             clean_dep("//tensorflow:windows"): [],
-            clean_dep("//tensorflow:darwin"): [],
+            clean_dep("//tensorflow:macos"): [],
         }),
         **kwargs
     )
@@ -1611,7 +1725,7 @@ def _append_init_to_versionscript_impl(ctx):
             template = ctx.file.template_file,
             output = ctx.outputs.versionscript,
             substitutions = {
-                "global:": "global:\n     init_%s;\n     PyInit_*;" % (mod_name),
+                "global:": "global:\n     init_%s;\n     _init_%s;\n     PyInit_*;\n     _PyInit_*;" % (mod_name, mod_name),
             },
             is_executable = False,
         )
@@ -1620,7 +1734,7 @@ def _append_init_to_versionscript_impl(ctx):
             template = ctx.file.template_file,
             output = ctx.outputs.versionscript,
             substitutions = {
-                "*tensorflow*": "*tensorflow*\ninit_%s\nPyInit_*\n" % (mod_name),
+                "*tensorflow*": "*tensorflow*\ninit_%s\n_init_%s\nPyInit_*\n_PyInit_*\n" % (mod_name, mod_name),
             },
             is_executable = False,
         )
@@ -1629,8 +1743,7 @@ _append_init_to_versionscript = rule(
     attrs = {
         "module_name": attr.string(mandatory = True),
         "template_file": attr.label(
-            allow_files = True,
-            single_file = True,
+            allow_single_file = True,
             mandatory = True,
         ),
         "is_version_script": attr.bool(
@@ -1649,7 +1762,9 @@ def tf_py_wrap_cc(
         swig_includes = [],
         deps = [],
         copts = [],
+        version_script = None,
         **kwargs):
+    """Builds a Python extension module."""
     module_name = name.split("/")[-1]
 
     # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
@@ -1668,6 +1783,11 @@ def tf_py_wrap_cc(
         toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
         deps = deps + extra_deps,
     )
+    if not version_script:
+        version_script = select({
+            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
+            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
+        })
     vscriptname = name + "_versionscript"
     _append_init_to_versionscript(
         name = vscriptname,
@@ -1676,10 +1796,7 @@ def tf_py_wrap_cc(
             "//conditions:default": True,
         }),
         module_name = module_name,
-        template_file = select({
-            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
-            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
-        }),
+        template_file = version_script,
     )
     extra_linkopts = select({
         "@local_config_cuda//cuda:darwin": [
@@ -1756,7 +1873,7 @@ def py_test(deps = [], data = [], kernels = [], **kwargs):
         data = data + select({
             "//conditions:default": [],
             clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
-        }) + tf_binary_dynamic_kernel_dsos(kernels),
+        }) + tf_binary_dynamic_kernel_dsos(),
         exec_compatible_with = tf_exec_compatible_with(kwargs),
         **kwargs
     )
@@ -1841,7 +1958,7 @@ register_extension_info(
     label_regex_map = {"additional_deps": "deps:{extension_name}"},
 )
 
-def cuda_py_test(
+def gpu_py_test(
         name,
         srcs,
         size = "medium",
@@ -1866,7 +1983,7 @@ def cuda_py_test(
         test_tags = tags
         if config == "gpu":
             test_name += "_gpu"
-            test_tags = test_tags + tf_cuda_tests_tags()
+            test_tags = test_tags + tf_gpu_tests_tags()
         tf_py_test(
             name = test_name,
             size = size,
@@ -1884,6 +2001,15 @@ def cuda_py_test(
             xla_enable_strict_auto_jit = False,
         )
 
+register_extension_info(
+    extension_name = "gpu_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
+
+# terminology changes: saving cuda_* definition for compatibility
+def cuda_py_test(*args, **kwargs):
+    gpu_py_test(*args, **kwargs)
+
 register_extension_info(
     extension_name = "cuda_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
@@ -1957,7 +2083,7 @@ def py_tests(
             xla_enable_strict_auto_jit = xla_enable_strict_auto_jit,
         )
 
-def cuda_py_tests(
+def gpu_py_tests(
         name,
         srcs,
         size = "medium",
@@ -1973,7 +2099,7 @@ def cuda_py_tests(
     # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
     # XLA tests once enough compute resources are available.
     _ignored = [xla_enable_strict_auto_jit]
-    test_tags = tags + tf_cuda_tests_tags()
+    test_tags = tags + tf_gpu_tests_tags()
     py_tests(
         name = name,
         size = size,
@@ -1989,6 +2115,10 @@ def cuda_py_tests(
         xla_enable_strict_auto_jit = False,
     )
 
+# terminology changes: saving cuda_* definition for compatibility
+def cuda_py_tests(*args, **kwargs):
+    gpu_py_tests(*args, **kwargs)
+
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
 #
@@ -2053,7 +2183,16 @@ def tf_py_build_info_genrule():
         name = "py_build_info_gen",
         outs = ["platform/build_info.py"],
         cmd =
-            "$(location //tensorflow/tools/build_info:gen_build_info) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu") + if_windows(" --key_value msvcp_dll_name=msvcp140.dll", ""),
+            "$(location //tensorflow/tools/build_info:gen_build_info) --raw_generate \"$@\" --build_config " +
+            if_cuda("cuda", "cpu") +
+            " --key_value " +
+            if_cuda(" cuda_version_number=$${TF_CUDA_VERSION} cudnn_version_number=$${TF_CUDNN_VERSION} ", "") +
+            if_windows(" msvcp_dll_name=msvcp140.dll ", "") +
+            if_windows_cuda(" ".join([
+                "nvcuda_dll_name=nvcuda.dll",
+                "cudart_dll_name=cudart64_$${TF_CUDA_VERSION/\\./}.dll",
+                "cudnn_dll_name=cudnn64_$${TF_CUDNN_VERSION}.dll",
+            ]), ""),
         local = 1,
         tools = [clean_dep("//tensorflow/tools/build_info:gen_build_info")],
     )
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index 9f6114f503467fc12fcfb5dae07e75d2113e410d..04632330c56c69a359d2e8fad424a1fb5afff74b 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -4,5 +4,4 @@
 *TF_*
 *TFE_*
 *nsync_*
-*pywrap_xla*
 *stream_executor*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 39d258c3b7edd1f5f7d0805c080e832aa1d6109a..563d178de7396fbae6127d9dcfbfa8cf00c65038 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -5,7 +5,6 @@ tensorflow {
     *TF_*;
     *TFE_*;
     *nsync_*;
-    *pywrap_xla*;
     *stream_executor*;
   local:
     *;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
index f79029d3fe0b88a454b11456b3785c3ae28a253c..cc2d5c87d667fb5c4af6b6fc435ae626334fe2d1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.AggregationMethod"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.gradients_impl.AggregationMethod\'>"
+  is_instance: "<class \'tensorflow.python.ops.gradients_util.AggregationMethod\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "ADD_N"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index 078f1028fd4e98c11481c6bb9e08303dd3c54c19..2e8ece122c572f3430afd073179c4d61f532303a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -32,6 +32,12 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
+    field {
+      name: "collective_nccl"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
     reserved_range {
       start: 2
       end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index d2ee0c4db668d0a1aa6190573f56f210a06e2653..9c7de2c5719350d6c30ac27c08712f326b014e3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -155,6 +155,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "collective_nccl"
+        number: 7
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index a2cc07483a4e10918891f555ca9459fb7503bb32..6c528dd16200e870ca860312defcb43155247979 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -84,6 +84,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_STRING
       }
+      field {
+        name: "timestamped_allocator"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "pending_cap"
+        number: 6
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
       nested_type {
         name: "VirtualDevices"
         field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt
index cdaeb55e30865e082054085f47d6a071ebf3affd..9193168c2072388a5a660abf55acbdc6f889d58e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-graph.pbtxt
@@ -68,7 +68,7 @@ tf_class {
   }
   member_method {
     name: "create_op"
-    argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
+    argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
   }
   member_method {
     name: "device"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..148e6b83b31e9e23c34f1b501c45749063a6c3b7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
@@ -39,4 +40,8 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'values\', \'indices\', \'dense_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
index d71c2358c93e9597726665fdf8f92e648b2ea772..b453f7e9903bf66d19b06974da016c8c2971372d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
@@ -44,6 +44,13 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.AssetFileDef"
     }
+    field {
+      name: "object_graph_def"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.SavedObjectGraph"
+    }
     nested_type {
       name: "MetaInfoDef"
       field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
similarity index 59%
rename from tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
index c364b0217a7ed10282dc8fc28797f3be1b92f867..8c3438e4d8e377de8ae0c063d460b5adeea11258 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.-module.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
@@ -1,8 +1,8 @@
-path: "tensorflow.experimental.Module"
+path: "tensorflow.Module"
 tf_class {
   is_instance: "<class \'tensorflow.python.module.module.Module\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
@@ -12,18 +12,6 @@ tf_class {
     name: "name_scope"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "owned_submodules"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_variables"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "submodules"
     mtype: "<type \'property\'>"
@@ -41,7 +29,7 @@ tf_class {
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "no_name_scope"
+    name: "with_name_scope"
     argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
index c0ed95653552f904acea1cc82bca00773ecb792c..feb831fb6b7bda76f4140272f3b193c6f66114b6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.RaggedTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dtype"
@@ -38,6 +39,10 @@ tf_class {
     name: "bounding_shape"
     argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "from_nested_row_lengths"
     argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e2f583d21a30fd1e97fc20dadd58d3a62a5141
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.RunMetadata.FunctionGraphs"
+tf_proto {
+  descriptor {
+    name: "FunctionGraphs"
+    field {
+      name: "partition_graphs"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "pre_optimization_graph"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "post_optimization_graph"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
index 1287940326c0196e76fff2cf6363622226092504..777b889745fefd69e628ea26d3ca0a0fefc743b4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
@@ -23,5 +23,36 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.GraphDef"
     }
+    field {
+      name: "function_graphs"
+      number: 4
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.RunMetadata.FunctionGraphs"
+    }
+    nested_type {
+      name: "FunctionGraphs"
+      field {
+        name: "partition_graphs"
+        number: 1
+        label: LABEL_REPEATED
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "pre_optimization_graph"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "post_optimization_graph"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
index d11e927bd55cea52d0dbdfd4b28b2c1bc24fdaa5..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
index 493dcba8922d7f6c51a61d337f48e09d168e6bac..d824ad573e4854844e6a3fa3b544ab5f51ddde6c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
@@ -16,7 +16,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
     name: "from_spec"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index 341ace07663032a836da9c4c6b5f9fccccfb7add..9a4363829c20d10aeaed28abfa2146355ba12f46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.VariableV1\'>"
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
index ce29615f72eee78525b8a1efbb4531215e6b72fe..6c5724078357125255acd413902c4a5e57cb719e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.audio"
 tf_module {
+  member_method {
+    name: "decode_wav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
   member_method {
     name: "encode_wav"
     argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
index a71da113b4ffcaa9ff71e18df4a9263b141b42e6..1f04d028efdc895e493c9e60e1c9025fc26de4f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
@@ -5,12 +5,16 @@ tf_class {
     name: "ALL"
     mtype: "<enum \'Feature\'>"
   }
+  member {
+    name: "ASSERT_STATEMENTS"
+    mtype: "<enum \'Feature\'>"
+  }
   member {
     name: "AUTO_CONTROL_DEPS"
     mtype: "<enum \'Feature\'>"
   }
   member {
-    name: "DECORATORS"
+    name: "BUILTIN_FUNCTIONS"
     mtype: "<enum \'Feature\'>"
   }
   member {
@@ -21,6 +25,10 @@ tf_class {
     name: "LISTS"
     mtype: "<enum \'Feature\'>"
   }
+  member {
+    name: "LOGICAL_EXPRESSIONS"
+    mtype: "<enum \'Feature\'>"
+  }
   member {
     name: "NAME_SCOPES"
     mtype: "<enum \'Feature\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
deleted file mode 100644
index c4d5b77c0738feb1fa6ea69672ee3fafa51de5be..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.autograph.experimental.Verbosity"
-tf_class {
-  is_instance: "<enum \'Verbosity\'>"
-  member {
-    name: "BRIEF"
-    mtype: "<enum \'Verbosity\'>"
-  }
-  member {
-    name: "VERBOSE"
-    mtype: "<enum \'Verbosity\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
index 5747dac7ab201443d1f237415cd280aee672a8ff..cd8f0716d48f4e84b5a21238d8a661722c1d33ae 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
@@ -4,8 +4,4 @@ tf_module {
     name: "Feature"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "Verbosity"
-    mtype: "<class \'enum.EnumMeta\'>"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
index 12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57..8880ed4f0cb31641dec7336ba64ef55ec227c813 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
@@ -4,12 +4,20 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "to_code"
-    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\'], "
   }
   member_method {
     name: "to_graph"
-    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3fd133e92a41fb133bd138750a574ef40ea57b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.experimental"
+tf_module {
+  member_method {
+    name: "get_device_policy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_synchronous_execution"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_device_policy"
+    argspec: "args=[\'device_policy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_synchronous_execution"
+    argspec: "args=[\'enable\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.gpu.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.gpu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6768a9e06e93aeb251228d2f1ee47b18dc19945
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.gpu.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.gpu"
+tf_module {
+  member_method {
+    name: "get_per_process_memory_fraction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_per_process_memory_growth"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_per_process_memory_fraction"
+    argspec: "args=[\'fraction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_per_process_memory_growth"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
index d7e4529594df24666844bbce8cff729c5fa8fa67..41e61ac683c0fb3f68b7dd092af8bb71e2a2a3bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
@@ -1,7 +1,35 @@
 path: "tensorflow.config"
 tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "gpu"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "threading"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "experimental_connect_to_host"
     argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
   }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_soft_device_placement"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_soft_device_placement"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.threading.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.threading.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b155733a4d556daa954bc61b3647ac9f103ba3f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.threading.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.threading"
+tf_module {
+  member_method {
+    name: "inter_op_parallelism_threads"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "intra_op_parallelism_threads"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_inter_op_parallelism_threads"
+    argspec: "args=[\'num_threads\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_intra_op_parallelism_threads"
+    argspec: "args=[\'num_threads\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index f7d388d33d050eac2c9f14682bc7068c745a46bc..1c55f81bed19929276cc17d31d333ff53712bc8f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -39,6 +39,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -79,6 +83,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index d73168b070e374a749a00f74b24b77a715d2f37e..5488449044a021ea58c92a028b109f701f6f1b79 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
index 682a2b91b6187783eef74a4cd3672ae2ae2d47fb..87af1123d77a22a362abced5605beddf12395723 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "initializer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
index 72fc2c3a9ee5b985723ce2dba9643ba796362dc7..70e3b6792ccc171a633d75df5047309e19cf78bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
   is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "experimental_autotune"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "experimental_deterministic"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b..0927dd01a3f4629f6caf39f097085547638f0cf1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index a10add1b7e38f9875e699903b3e3c103d73e647e..bab1e399210ea20ffcaaa417a4f709d74c5e8a00 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 71b597c19c512879b8f18b34843b160efecc6bec..68cf02333f05aa7f068686773ff3ce5d2f0d3ca5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
index 3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8..b2fd09e2ff4668981b1d4da2085349913589376f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -7,6 +7,14 @@ tf_class {
     name: "apply_default_optimizations"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "autotune"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "autotune_cpu_budget"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filter_fusion"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 20646e87b5fbe23d89ad31ca632a64bf958339f6..6d3f88eded4a055b7047886c3245f28866ec59ee 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 86c5ff5b0bd7b42d61a92a44c8888852a48677be..bc4943e73788c8c59669e8f2fe1145e3d1c1fb01 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index abc98a74b64ab274ed8b2fc43876b7102f1c7201..9aa5955a16fa69c1273027b75fdaba5e0a52020f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -88,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "bytes_produced_stats"
+    argspec: "args=[\'tag\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "cardinality"
     argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -108,10 +112,6 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
-  member_method {
-    name: "filter_for_shard"
-    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
@@ -152,6 +152,10 @@ tf_module {
     name: "map_and_batch"
     argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "map_and_batch_with_legacy_function"
+    argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "parallel_interleave"
     argspec: "args=[\'map_func\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\'], varargs=None, keywords=None, defaults=[\'1\', \'False\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index aa474680592a1a3996ca3db970b814ba167cd801..272963382a009c837427176859994f5c603a05a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "make_initializable_iterator"
-    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'dataset\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "make_one_shot_iterator"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index 8a7f1e9363b8211d83d39d31da11507cb4c805eb..07684dd92e8f718161b15784eac50374af558662 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -6,7 +6,7 @@ tf_module {
   }
   member_method {
     name: "assert_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_equal"
@@ -88,6 +88,10 @@ tf_module {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "get_log_device_placement"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_finite"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -112,4 +116,8 @@ tf_module {
     name: "is_strictly_increasing"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "set_log_device_placement"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
index 4cb78b08f8d966890fd9173d7ac1459a905d5921..fbac8c087f1a51c8d66b8fe3be1a5a3e90f13eef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -15,10 +15,6 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
@@ -28,8 +24,8 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "experimental_make_numpy_iterator"
@@ -39,6 +35,10 @@ tf_class {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..184ff96e6325c87f02444d4359e5c2987534b6ce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
index 11c1479b5bfa4a02ee825509a8a725486b917333..b6be122b7accd3aa2a61d96e8c30473329b464fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -14,10 +14,6 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +23,8 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "experimental_make_numpy_iterator"
@@ -38,6 +34,10 @@ tf_class {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b35b61b4c08868feaf501e1f09b37d02da09cd0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.distribute.cluster_resolver.ClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c2cc522f1cac65611ffc3f09ce1513d186da27a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.GCEClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver.GCEClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'project\', \'zone\', \'instance_group\', \'port\', \'task_type\', \'task_id\', \'rpc_layer\', \'credentials\', \'service\'], varargs=None, keywords=None, defaults=[\'worker\', \'0\', \'grpc\', \'default\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3220d68e05458da3cda4e36c63bc5dc79cde93af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.distribute.cluster_resolver.KubernetesClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver.KubernetesClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'job_to_label_mapping\', \'tf_server_port\', \'rpc_layer\', \'override_client\'], varargs=None, keywords=None, defaults=[\'None\', \'8470\', \'grpc\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d39ddc7e408c8fbc3cbb7db26379357b93f8b459
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.SimpleClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.SimpleClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cluster_spec\', \'master\', \'task_type\', \'task_id\', \'environment\', \'num_accelerators\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b487626520addbd072983f4218b5d0785e6049ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.distribute.cluster_resolver.SlurmClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver.SlurmClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_task_info"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f9a430c0f84c9caba29dee514f1f3a3391d8588
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.TFConfigClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver.TFConfigClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\', \'environment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbc76c24813bad6288b630a792ad3996a7940f46
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu\', \'zone\', \'project\', \'job_name\', \'coordinator_name\', \'coordinator_address\', \'credentials\', \'service\', \'discovery_url\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'worker\', \'None\', \'None\', \'default\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_job_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_master"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ecd77ad192d575c3cc8331a5dc2d0d89816182ac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.UnionResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.UnionClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5906ffa850a360889e26fe0230618ad60cf01231
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.cluster_resolver"
+tf_module {
+  member {
+    name: "ClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GCEClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KubernetesClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SimpleClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SlurmClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFConfigClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnionResolver"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-collective-communication.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-collective-communication.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7eca1c80d8b751feb6f9f16b743944da44e258b9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-collective-communication.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.distribute.experimental.CollectiveCommunication"
+tf_class {
+  is_instance: "<enum \'CollectiveCommunication\'>"
+  member {
+    name: "AUTO"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+  member {
+    name: "NCCL"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+  member {
+    name: "RING"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1726e74534a3922a9948ec0e166da4d6cdbef6ce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'communication\'], varargs=None, keywords=None, defaults=[\'CollectiveCommunication.AUTO\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6ee288f28b56b2a5a4aa2a21b04d80a5115609
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..927f1a8f5051633d1ab8a8b9ba6ca4509ebc2ad1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
@@ -0,0 +1,74 @@
+path: "tensorflow.distribute.experimental.TPUStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.tpu_strategy.TPUStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "steps_per_run"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu_cluster_resolver\', \'steps_per_run\', \'device_assignment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf8cedb50cbccf3b47d09567abcde7e29d458ace
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "CollectiveCommunication"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
index 31dc6e071613bfe3d2ea24c65835f09bab90c400..430e81166067cb9599b03fba43e066e099d928fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,10 +20,22 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
@@ -32,6 +52,14 @@ tf_module {
     name: "StrategyExtended"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "cluster_resolver"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "get_loss_reduction"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index e138ce936ec73c05f8f790fb63c381e56ae2f654..216854587d6476c37e12063eda53acf61c6383cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -23,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\', \'train_in_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fdbecb8856e24c86ae3165d546d8ddd019ba88f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt
@@ -0,0 +1,76 @@
+path: "tensorflow.estimator.BoostedTreesEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'head\', \'model_dir\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_feature_importances"
+    argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "experimental_predict_with_explanations"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index eae0a292a962680a53d8c683ee2d2b97e24937a6..25d021ad8deb9c6a51f4510c2ec215b94ee66cef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -23,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\', \'train_in_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
index bf7c1abcd89b29c29f3487cab58cfdf28103119c..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-k-means.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-k-means.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd425e46dbd2dcf89f6fadcc94bf0419de17933a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-k-means.pbtxt
@@ -0,0 +1,115 @@
+path: "tensorflow.estimator.experimental.KMeans"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.kmeans.KMeansClustering\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "ALL_DISTANCES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLUSTER_CENTERS_VAR_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLUSTER_INDEX"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "COSINE_DISTANCE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "KMEANS_PLUS_PLUS_INIT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "RANDOM_INIT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SCORE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SQUARED_EUCLIDEAN_DISTANCE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_clusters\', \'model_dir\', \'initial_clusters\', \'distance_metric\', \'seed\', \'use_mini_batch\', \'mini_batch_steps_per_iteration\', \'kmeans_plus_plus_num_retries\', \'relative_tolerance\', \'config\', \'feature_columns\'], varargs=None, keywords=None, defaults=[\'None\', \'random\', \'squared_euclidean\', \'None\', \'True\', \'1\', \'2\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_centers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "predict_cluster_index"
+    argspec: "args=[\'self\', \'input_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "score"
+    argspec: "args=[\'self\', \'input_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "transform"
+    argspec: "args=[\'self\', \'input_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index b1bd5a2661d44d9b36b965ba160874e6142628ea..9eb3ccf983c49f4a3ab2369cfe6a87b2922e422d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "InMemoryEvaluatorHook"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "KMeans"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearSDCA"
     mtype: "<type \'type\'>"
@@ -16,10 +20,6 @@ tf_module {
     name: "call_logit_fn"
     argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "dnn_logit_fn_builder"
-    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "linear_logit_fn_builder"
     argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index 6f57505afe84f3982a8beb402783f35b3e699241..43a5e978f58e464a7450e2f730dcd72aa6ae334f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "BoostedTreesClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BoostedTreesEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
index a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c..0c3f04e468c4c817cd474deb42149aee3021aa43 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.experimental"
 tf_module {
-  member {
-    name: "Module"
-    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
-  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..79ed45cfba325e0749c823765a181f413c0e2617 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
@@ -48,6 +48,26 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "shared_embedding_columns"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt
index dbc360b13ee7dc8228f5fb4fe0cd6fc21504d0d0..198fda49c83a3993f09c3c33cb3622e73441ca84 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.-resize-method.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.image.ResizeMethod"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.image_ops_impl.ResizeMethod\'>"
+  is_instance: "<class \'tensorflow.python.ops.image_ops_impl.ResizeMethodV1\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "AREA"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 15d0e099bab3052553671d52d396239b27383a8d..131ec939838cb39e6cfcb96f41691ce77270689d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -32,13 +32,17 @@ tf_module {
     name: "central_crop"
     argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
   member_method {
     name: "convert_image_dtype"
     argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\', \'box_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'bilinear\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "crop_to_bounding_box"
@@ -86,7 +90,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\', \'sizes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "extract_jpeg_shape"
@@ -174,7 +178,7 @@ tf_module {
   }
   member_method {
     name: "resize"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_area"
@@ -182,11 +186,11 @@ tf_module {
   }
   member_method {
     name: "resize_bicubic"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\', \'half_pixel_centers\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "resize_bilinear"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\', \'half_pixel_centers\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "resize_image_with_crop_or_pad"
@@ -194,15 +198,15 @@ tf_module {
   }
   member_method {
     name: "resize_image_with_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
   }
   member_method {
     name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_nearest_neighbor"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\', \'half_pixel_centers\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "rgb_to_grayscale"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 283cc6a735695b0b2d16af28f7688a7a077f19be..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 95e405aebaf61e3ccae268b474a006a3bca51343..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 8cd0c6ea5f027fa1f30b60a742450b651242d406..45c224734b8347e33afad9c299263a4555866da1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -190,7 +198,7 @@ tf_module {
   }
   member_method {
     name: "get_session"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'op_input_list\'], varargs=None, keywords=None, defaults=[\'()\'], "
   }
   member_method {
     name: "get_uid"
@@ -248,6 +256,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -256,6 +268,14 @@ tf_module {
     name: "less_equal"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "local_conv1d"
+    argspec: "args=[\'inputs\', \'kernel\', \'kernel_size\', \'strides\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "local_conv2d"
+    argspec: "args=[\'inputs\', \'kernel\', \'kernel_size\', \'strides\', \'output_shape\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "log"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 2e0f77eda85780cec26b103ba11276ccdfd90189..4504633d4a1d1272f2bdec8b01f6de6e1381396c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.callbacks.TensorBoard"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.callbacks.TensorBoard\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks_v1.TensorBoard\'>"
   is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\', \'profile_batch\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\', \'2\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58bede556dfd4d8988d92e99e402d9b3b3bf5adb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecayRestarts"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecayRestarts\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2549a2ac627421ecc80df2d6235c1a22ab5e3ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f083120b52ce483f46cc92390b53180bc3bd65ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.LinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ea3c6beb1c0f8fffaa442956c0cc134f70a5e84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.NoisyLinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.NoisyLinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d..c3127642e2583700c2451f54b80487a4cb943a55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -2,8 +2,9 @@ path: "tensorflow.keras.experimental.PeepholeLSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -141,6 +142,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -173,6 +178,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -181,6 +190,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
index 5cd6851278dce8ef45c90112176be94b9c45dc91..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
@@ -1,15 +1,35 @@
 path: "tensorflow.keras.experimental"
 tf_module {
+  member {
+    name: "CosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineDecayRestarts"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NoisyLinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
   member_method {
-    name: "export"
-    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "load_from_saved_model"
-    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..526d65fbf6c91d5d02ffc90dd0333cab07a50b84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.layers.AbstractRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.AbstractRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 8a0b8eb46f006497472c1e9ce539e91db19bd260..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index abb3c236948a7f46d64cad92ae922324446f9a99..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index b27db4e7f23499fd27430059f1cb556f341547b3..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 50998ac9d63c9492523720d7dcc8041fd9efcab5..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index be17aeafb5ae383cba58b854808f6c9bc0e9696d..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7f21b444bc8832189b11cd8ff206e034bc89170c..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 2ac86f152fad454fc0b09e2cb8814f23ad997c20..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index f6b1dd2f7e4244218b7c64868b773142c79695d6..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 3da1f43a92a3fb5a146bcf8fd16f26783487f129..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a7be5ac81814b28c93407cd5d1ca7c3f60822f0b..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index c5c29bead383da6b9c0c7436fb089e27413e72f3..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index 3af3c2a501d6b46821d2ef1b4e6a06e2820d4764..f2b80301df820478a11964a89e140b6682c21fc9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,9 +1,9 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationBase\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 880d18e1aae53512b2f587b5c8914babcd68566f..43af4aa1ec1ad93fde018c6244bef3a4c1bd6549 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -69,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -115,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index 1eb0cf1a188b88d55b82297da715624c9e5a58f2..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index d9394e60f532465c1852b2cac46ca4cbd9125583..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -196,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index a0f6dc8097adfb896a8f3aa3b642c2997e257cf3..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 037b92f861b14720a1a638884752a4d3e1dbd9f9..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 6a0d027d47d999f5770e59299fc1206249bf9b43..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 66b5bd75fc16c37aecaa65ef12fb2311925c252a..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index e73133ff0731821407084cc1cd6160b2e9bf3d9c..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 7af6b2b3c398473398a9d2e227a42ec96451b301..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index baff492dfbd3c9ab6f2c269cb89632768e6b6c92..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index 63d30a61851cdae8daa8a5dc70fa733fd6b2ff11..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 7a29cbbec35b885792828828354ac8f9a29579b6..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 87c75c02243cd646502e12e2947555ad7c6913e9..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index f69104ddfef17c8b5df36f4bc3e9b0ea3a986295..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index aa05471933cc97a872480e0ac45213b49a882189..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index d61f1ddc1d506ae2db992aaacbdc634964d53292..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index e2d05f8298ef4779f3c2678b24ea9f938a3889cd..e618a111b6f60cd73fa64ab47d4d4422580a73ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index f650f48423b4496fe21abd215ac494018921df00..cf8a67b54983f17773e59de2e0661fd9896d2420 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f7f3033ecf8e226b961dabfe59e751639e5b98
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.layers.DenseFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 06e8b6b314183b884e635f3b78e5bd5368e0962f..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 9fdf6f66d1160a49da302ffa8eeeade3009de048..65d018fa55d9b2d798abf72f2d918fac5dab7623 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -120,7 +120,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index cbe102065071a00596fb4b8f764b410737c638a6..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index 0efba09b272c8ffb2220ccfaad830c7fff98568c..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index b34c499eb2e603aa8e2a6c9c84ec752a41efd0de..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index 51dd853127f549c8ff370391f11cf7b8021af469..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index dcd18a9cedd53565fdf38d9787335e0afea9ad3d..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f029907ee86943fb8c04eada819e9cbfd6d01009..9a660083ee0d875ed7bf2dccfd7e4cd0d5ae2a91 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 278ae06cba9bd60b16716fdd0a38e87df2ee303e..9f25b3c6574e08b96e4947210c3c7d174e7293f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -179,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 15cbcfe8edffa92ef0514248e9dbc523dc6a49bd..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 865b898c4cc54253d85442f2db2f3f624ecfb817..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 3e17aca17cc4e636ea3f6235f04cd4b7f468ae28..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index b160687a2a610714f5dd6c0cc7c7c92408d386df..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 70e8d51a5a782d5f473e1350d16d942998e58fbb..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 809dc8554b38af9486035f1f3b03aa58392812de..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 3fbce8cb714355c0898dcbdc6797394410e90253..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 70e4103ea1abd5bda90811d127230105ae7bb941..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 000bf54c4523307d791db76d73e1cbe71cb46e4d..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 3803d2b0a8765f4832df34fc4876256d4dd2ea86..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 28668224e01e04bbc4c14259d84a11ec72c826a8..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index b83ed67723afd5544ec19c599437a57909d780c5..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index e689d69140e36a94c731f4c3b4578919d31343f5..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index bb6eddae7168b576bdaca91b6f7951fe7b65ee1f..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c..66aad25f9af529f40c22ad5bfe94009f869fe396 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 8eb6dd9f4a1799c9a6d90fff42490e29417cc24d..a8f60e83b9bc92c2e0759d120c84b5a61b7431aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -179,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 376bec0814880e3fa0091a41cd9a4ba0dcc4ab60..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -105,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bde888735916a018647f681968241a583e0271ef..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 16945f2c12a7be4eba8a67a9a58587d756888d12..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index f05741ffceb6a855f56731086619dcc621c8d71e..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 7885db4ed291afb8ea627cebe1dbae45723d4b2f..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 9380d26cf4c7b1c93a4c0ce2681e792381c42deb..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 0c96f86ed36d7cf99c396f863de6d9ef8f90adc5..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 0c6b230eb79aac1e719949b3f8331423b621d47b..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index eb7ca52fba97174a1c6869ae003beec8ffc328df..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e724e9088f82f7ff7152cc4393af4a8f582136c6..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index 3122fbec1c7cad161d71fdf9970995adcedfdad7..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 0527cda1f026e1ff9075e827c2902c45fd22db9d..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 814e5a5d545f0d4b1276ef1639eddb72004b4d1b..15c24f8da73c8a00b35d67241ebd131035f8c347 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index aa1731afb82698cb44375407fc717bf32ef634d6..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 9d7dd85fe0eef3733a86b9e918396e882f5812d2..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index e9bba298bb028851e6e9b9a17ff40a671d9132f2..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 3c783eb5129028b3eb5160c75dde2859541cfd32..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index b8e0882541c51209cca112c54197bbce305bd1b2..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 310f369ed6c7a9931af56016ae09db5d4bca15d5..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index df19d781c21e403b51d451d772cfba66a7383be7..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index bf909509bd4b25507291839fff1ee0eaccee630f..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 88e9300de912f3b12712bbc311ac156803ef35c4..33a0c1976b07281141aeaf14ac78f5f3f856ef1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 9d81c6d4bc3139952e9f41113d05547b215cf571..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -167,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 712eb0c6ec3b706a9e396a532f58916140a2c606..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index dfc4ca27052f919ea3866a489e525ae1202795f1..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 5e4f727f71d8be2496bee1abdb87c7050f1ca02e..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9d893cb30a066c4732ccab9d1520f5047a4d3a01..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a2ed954e4c0ce1d474b8c71b41ce1d585d42d665..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 8a0818e78ac766a624bdcce85591fe13e1d4ceac..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index b5591b48265d3d09459bf9bb114a4a3149984eb9..4c817692194c100204412d10ca11181af8f0f7b1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index 210e4fd4e6f0b2e8ba75d22e83134e2267fbece5..709aac579db73a365d3a318fbc828557a261019a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -65,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -111,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index da2213a84fe2f6bc683630e5f8760acdf3239b19..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index e2c303d506e0f8f99d8ca89f29979a5999382378..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 396e774c8a4a10c4996c56c208fc4f4d432e3135..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index 8b6418d514e61536f314da88f1586cea4f29cfc5..ee06ae5059d1760cb4dea447e0705fecfde8b827 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -64,10 +64,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -110,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index e8fda4c71ada65aecef59eb8012120488b0f17c7..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 50c52d270b684bcea5105e4c9813cc62103403f9..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b5598eed07c9f04feb0d90820381abc12dbb456
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..fe9085bb99f58062cab5c2658a6956bc822649b3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.layers"
 tf_module {
+  member {
+    name: "AbstractRNNCell"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Activation"
     mtype: "<type \'type\'>"
@@ -124,6 +128,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
@@ -396,6 +404,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -412,6 +424,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -428,6 +444,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 2f7da93f6f412ca559aec2f6acde2b80a5c93c86..1242eec68f1414f1c8e67bb95602687f4a58412f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.BinaryCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index b3a7cd80973259bd5cdfe382c656a9478f8933d8..cf3c2de840450de8e9467269ec446172583e8ffd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
index 4ba9e57bed4100437c8b71d8b506cc2c928a9ac9..fa374afb28bc4d7fe226456743c285b4f539ced1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalHinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
similarity index 73%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
index 4952a76291c00bfdd73eed5412e7421887d1bab2..aa14c44fa3628236033e952b69f3a160c49a36fc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.losses.CosineProximity"
+path: "tensorflow.keras.losses.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
index 7b3c62d3bef0b9d200577f34cbe303fc7a094acc..a4c25eefcbbc75afb3765b11e325f6bd830ccba8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.losses.Hinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa8ffa95726f72b620c3908b48fe20dfae1dc17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d950c789eb44fcad792a9d11856ce11143715807
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbbd5317f89f801e8a4f4cc80e700e2b478ebf40
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 712bb2ecd3526c354cbcf640e689526b2e415a13..9da6b59ec83bb5b74336a122a791a0d5ea3eb079 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsoluteError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7fe362da89b47a925cd4708909e1c882a9a23aca..7c3ae9b49a415c1586df01984bd73af38ee97558 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
index a5718533500d9508c558d25d13fc6b61518a73a0..2126ac68d2a4cd8f1b68466e073ec573d13f2cda 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 200006db355ca4dc8eb2f509bcb9da7543145548..6ef9610546a0ec662313534f424d49879187f302 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c90c3140e2b68b9796873b0de73668f1508476
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c13f9f967db7014548de1283c5d59bbac403299a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
index f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d..fabe4c7814462b91a12062bac5c2119cfd45bccf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.SquaredHinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index ce942de6d2dd1a316e6804a5c187ae6b4d6b8db7..e24947ad19a8407cfc17c18630c8e81abdfe806c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -13,13 +13,29 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
     name: "Hinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -36,6 +52,14 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SquaredHinge"
     mtype: "<type \'type\'>"
@@ -80,6 +104,10 @@ tf_module {
     name: "cosine_proximity"
     argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -146,7 +174,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e00a3a355269a0ccc5d69b3fcea106c4908e115
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index 6756beca1dec885bf23b0c365496d84e5cb3eeb1..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index bec0b20aa51d7c098c333e162c562225d92f38f4..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 71dc294dc39f05ffdc416be7b92337e2dfa69690..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f1794aba61aae085a7580806e524eea8b2a791
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 43024e738a500fbbc36077b9e598dbe2445898e2..19442b5028dda68548c19c74e0828abf4fd54534 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
similarity index 94%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
index 1e39385f81957bc0bb9ff2fb6660a16fd3e4c9b4..678c7b0681fe4281893fba70b4652233a91e2a0c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.metrics.CosineProximity"
+path: "tensorflow.keras.metrics.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -89,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index 5432f7f4006b165fefb9aa028bf7d36d8cbc38f1..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index 75541bf285d8989f867aabc7c7025e56cce1d05d..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
index f8d47f3771798d40860f29a22bf81319385cfb66..dedc64f1375b66b90f655f280c1a56ba165cfa17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af8366b60876cb31f840c5f5007e67980be8dc3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e072e21cc94492ed27186f44b92863cd791d62
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index bf7fc7cfc506e0adc07342531a0f590533468139..75173ad17a9c1fa02451287adad10870a60d653b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index 59bb767d3501c59b7d2bf052570d5b1f161d2df7..7be81b63bbe01b8534bd64d163e735d735ff88f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21e44ed988494119662e5e1a5101edbe4d7a35fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17fc34566e8ab6c5cc73781b40cb0f7396067
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index 91f4712312820a6840229f2e6cd763c2a3ce7900..363f532ba410f1ebae5f105769a0e69c2e2d0166 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index 205e15d439fe8f9493f41fb23ce54248173e9295..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index eec26ffce71425d78fbc4fa29540b58c8b12ee64..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce746ab350bfa0534bf7f9ac7d6e8255c7749894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..570b77408cbaa2b7a0089f9de8a528e604799abe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index 9aeaa5627a9805579d6a6c4e09336a4d7994d1c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index 748cec08668c461fcf80df6a50fd5192f99073b9..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2bdbd54e22756b823716c149cf0f24661acc812
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 571c2bf9d33c14b4a5699fd9dfa3e85ad97f99f2..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c135b8f680061a1e79fedd9d705d0fb54344823b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 85f80b062efe3d2d91104b211c8d9d75127c8c0e..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 71c047e9736e1bd86e47b2f43774d2ad0d884821..90bbb087fafcdcde5dee048c45adbc45e3be2e55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30ef19e02cfc99d117e6a396beeaf6422a105013
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e59476a2410f859dff7171162a2cab123d5e853d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index 4bc9383f6ffc90972416fa031d5515a149e70425..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index 2eae4df0ae344656bb637bc27e806876304a86f1..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index cae251cdfe1b0eae57a8c44030a08d3cfb373c5a..3f7fe4a2f4033faf1e4f79705fe78475866e80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,16 +12,24 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalHinge"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
@@ -32,6 +44,14 @@ tf_module {
     name: "Hinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
@@ -44,6 +64,14 @@ tf_module {
     name: "MeanAbsolutePercentageError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanSquaredError"
     mtype: "<type \'type\'>"
@@ -52,6 +80,18 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -60,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -68,6 +112,14 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
@@ -76,6 +128,14 @@ tf_module {
     name: "SquaredHinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -194,7 +254,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b405198a29c4c7673688f2bf0f410a4a3e7a526
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt
@@ -0,0 +1,75 @@
+path: "tensorflow.keras.mixed_precision.experimental.LossScaleOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.mixed_precision.experimental.loss_scale_optimizer.LossScaleOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "learning_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'opt\', \'loss_scale\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2af65554f75e606755738d5126f7ee2b749d32e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.mixed_precision.experimental.Policy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.mixed_precision.experimental.policy.Policy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "default_variable_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_cast_variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83ef24cdf7a5c1bf91da0a08cb735433d7dd1e3c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.mixed_precision.experimental"
+tf_module {
+  member {
+    name: "LossScaleOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Policy"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "global_policy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_policy"
+    argspec: "args=[\'policy\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8648afb5f7d2aba11e6cb3a20b537f12d96dd10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.mixed_precision"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index eb1ab1d9dd61b36ed8662e25700f12f82aadb502..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index c69cf281742360d9ed4d1f7cbd35219cf04b1149..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
index 0a56293e804f583a949ecb413da0ba613e0bc876..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
index 14d0894e5622021c4961228d431d01516b752055..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
index fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
index ece63ec168dac58f58286dbd9fd8a8151d0dc2dc..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
index f952f88b6d203488ea0ec4f1794d7de79a25853a..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
index 27bae902b0cb7f1f4e09737a83fadd95a83cc163..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index e523443a0099b57942c73cafcd8a919503e8db38..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
index d2721f8e92088c216ab748cae45e415553b9d4c1..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.SGD"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
index 7257b02087e237eaa47ed6a042559aa1332fc87b..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
@@ -32,6 +36,10 @@ tf_module {
     name: "SGD"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ae478cb2c663b8a856bd29146558b808499079
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2fe61f4d2cb8f76fe1c8d6261b5f383b79281f0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b33bd7526bd3f67f54450f97adf3d1d4d717051
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.keras.optimizers.schedules.LearningRateSchedule"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f1496492abfabb04bd47834d434ab8df05af705
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..728436c36111de60c3752e09049ffb5678e4b2d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024e472a734935e668b9d6ee6e9c115cc90bdcd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.keras.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
index ed9967856200d62fd152dfec85c8ec36403bcbc0..3db6920519a989bb6832c81ecbb07aec30166115 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
@@ -56,6 +56,10 @@ tf_module {
     name: "metrics"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "mixed_precision"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "models"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index 6d826a8f8e47e53bffd5f759c4af02c5f9d5b15c..f8acd9b54e227d7855f0da90fe5c2baf9eda1d29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 9505c90aac52c3329861bf01ce3c40c50e557b10..35aab02ed444793173dce15d4d48558c2f42bdf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index 5b1b8f78dc5d98c059756122df24340bcbc2790b..0b44b231c3b6c3b692eb967cf03ccc4e734e1638 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index ef4c57b6942f17bdd1524d3eb773fbfabd5a82a6..f55c3e4c426b2dd855f8557a8caabb4e73a13bcf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationBase\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index b5ee2e7302d034c4b9d9ced7a1159c87297b1a06..2e11690019a243d1d4efed0ba0eebadfbe24cb47 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 57f6d7c7c0114d3ab5cd9cf4066979a6837d8e9d..0be1478ea9350d4cd0a8bda1906990dbb0450f62 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index 88c616bd17987acc2e766a26c4b14c62d6d4a3b1..7a6c6f2f2cd6551a9b1a2d512f0dd9aecd10e0bd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index b70a907907e5cb7e7a509e1712675bc9c9bd4cfc..4ba326546c67bfb50ef8000acc9e976f0f07c17c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 33e8765ce6edfe3a14b7d6ff88be9a2ec2f07b32..753a7965d793d374051de95c338ff4bb852fc168 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 1ac13b57912cd815c1b8de9b461d6cae2364ed9d..52624add063600c8376220f30a885f45e5d0108d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 77faa3c2b9da7eb1c7f9cb086948997f6b2af02c..f412f2bade26f6074a1b68d5e9982e35cc7fc5db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 0b2631491b0d727b262df4ac05eef7bc64eedba3..e0e6f2849ae5bcb261c4b552ddb4a0f56fd99aec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 0a3414d20cfd554d1c5eb82d613fbc938f8f3600..903809b243d92af6933582bdb2700ee29ca84517 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index ffc5cf1c8b76e84d30f6a7af22773b45feb1a02b..badd5d7b973bea8e24df48caa07fa6be41370029 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index ff2cf2ba90732bcb042c7f5ebc8a50483d37c8e7..4076962adfda5f609557295cbfe45669fe8180ec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 09c8a31a7be1162866e4457fda84a921de283377..ee591be46faa3806bc814b18a8374141c1907dfb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -117,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index 549e13a7ac6b595dfa665096aa15d7bb20df65e7..c837bf4d4f736944bde804c83a63c46aa103dc88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index 169ecdece5ddc92908ed027c7f470d08cbd5a5e0..72b2c4465226924509c29f4914dba3c9e0521098 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index c7a50969b54e5efc4d338caa79dea76d86bffe8a..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 3900c752c8527f68af2496f99083d80fc9d18106..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index 7b876099af6a28d9fca2e5c55aeae5e4610f82a6..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 5bddba8e798618f5b1d0cdc61ddff9725a495fe0..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 62ba8bb59e8af14447fe570ba28c5d0eba7f6af8..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index 0803feeabd12acb7988459fe6da2748e19b70a5f..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 6def32864b9cc660b94d628ccd53dc48a566ea81..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index dbf1ac82d33b81c63e5c356ac736f63262797ff0..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
index 6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337..a7eb144d83aaeb2997d44b703b46de9a01c3a478 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index 85d902b977ceddd405abb1154a086d7bd29e7848..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 638d82a599248e547bcae86ebd6d8d8dc3f6aa4b..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index ab1b04bd3cb1b215b848019b6c578ce091f8f828..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 961969aac58b78e4edd53b47f2932f71f2d21fd5..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index e76738a9648123414159fdc9666a99b0577aa46e..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index b35cd69da474a9665652f04f12b34a8d9f33fa8a..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 5e49b75c3131b989c765ab03659fb225cc23e26e..53564e33596c1789493f7eaa00ca74d491b041db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -200,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
index 1fe179f6c1b64ebc2f7535719bc1598577ee7f03..68cb07ea6fab85824400cce8408ebcb1dc030f8c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
index 66e692a5a379203cb491980802b7003072bfe76c..3ac478f7626556574983aed4e5d284cb758406c8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "AGGREGATE_STACK"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "CHILDREN_INPUTS_MAPPINGS"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_AGGREGATE_ATTR"
     mtype: "<type \'str\'>"
@@ -22,6 +26,10 @@ tf_class {
     name: "FUNCTION_INPUT_INDEX_ATTR"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "FUNCTION_LEVEL_ATTR"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_NAME_ATTR"
     mtype: "<type \'str\'>"
@@ -48,7 +56,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add_input"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-target-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-target-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..966fb69cbed38f1fe8102cc09a2e3a438eb79c28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-target-spec.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.TargetSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TargetSpec\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'supported_ops\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
similarity index 86%
rename from tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
index 7781337c826e01cdc820a65a288bf9ce7e251fd0..c84513d088516ee8cc8c2c92e344f42bfc4379f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -1,12 +1,11 @@
-path: "tensorflow.rnn.DropoutWrapper"
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapperV2\'>"
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -104,13 +103,9 @@ tf_class {
     name: "weights"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "wrapped_cell"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'input_keep_prob\', \'output_keep_prob\', \'state_keep_prob\', \'variational_recurrent\', \'input_size\', \'dtype\', \'seed\', \'dropout_state_filter_visitor\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'1.0\', \'False\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -130,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -142,7 +137,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
similarity index 86%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
index 3205c6a4dcaaa00591cd957021a463b77835343e..269944ee9df44f38e89bbca32949b7a12490f8f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -1,10 +1,11 @@
-path: "tensorflow.nn.rnn_cell.DeviceWrapper"
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -104,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'device\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -124,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -132,11 +133,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..354a7086d6046d5f2452799a9e86cb07400c1679
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "convert_op_hints_to_stubs"
+    argspec: "args=[\'session\', \'graph_def\', \'write_callback\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'<function <lambda> instance>\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..97722ad559eb9feb0265caf341540b0380f8f4cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -12,10 +12,22 @@ tf_module {
     name: "OpsSet"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TargetSpec"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TocoConverter"
     mtype: "<type \'type\'>"
@@ -24,6 +36,10 @@ tf_module {
     name: "constants"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "toco_convert"
     argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-key-value-tensor-initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-key-value-tensor-initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..982246da441a65d5eb2819f8f7f23c4804d88a38
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-key-value-tensor-initializer.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup.KeyValueTensorInitializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.KeyValueTensorInitializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TableInitializerBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'key_dtype\', \'value_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\', \'table\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e89846ba0b689956adb32cd6b0b9563c09241159
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-hash-table.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.lookup.StaticHashTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticHashTableV1\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticHashTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.InitializableLookupTableBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initializer\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "export"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d839fa1c8905b585dd3f1c0ce305022c2bfc8085
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-static-vocabulary-table.pbtxt
@@ -0,0 +1,41 @@
+path: "tensorflow.lookup.StaticVocabularyTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticVocabularyTableV1\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticVocabularyTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initializer\', \'num_oov_buckets\', \'lookup_key_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.-text-file-initializer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-text-file-initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff9a0ce6e7de6b8de01c897815bd8fd07fefc2c3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.-text-file-initializer.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup.TextFileInitializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TextFileInitializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TableInitializerBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filename\', \'key_dtype\', \'key_index\', \'value_dtype\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\', \'table\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2308185afdd6a60a98e1542619e480526f44b2ba
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
@@ -0,0 +1,56 @@
+path: "tensorflow.lookup.experimental.DenseHashTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.DenseHashTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'key_dtype\', \'value_dtype\', \'default_value\', \'empty_key\', \'deleted_key\', \'initial_num_buckets\', \'name\', \'checkpoint\'], varargs=None, keywords=None, defaults=[\'None\', \'MutableDenseHashTable\', \'True\'], "
+  }
+  member_method {
+    name: "erase"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "export"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "insert"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "insert_or_assign"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..855a758fd612505b03213d095e122342fd87e1d6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lookup.experimental"
+tf_module {
+  member {
+    name: "DenseHashTable"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lookup.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lookup.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7119d4c281ee0ce3c0a391f2f9a198b257aa537
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lookup.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup"
+tf_module {
+  member {
+    name: "KeyValueTensorInitializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StaticHashTable"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StaticVocabularyTable"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TextFileInitializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index 2e298d8cb641e7a9333b4cd7a84ed4dd9eb213a1..6fea38d03acaf1c6c3ec60109b6e16f0a2d3f11f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -102,7 +102,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "cumprod"
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,6 +264,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -300,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.nest"
+tf_module {
+  member_method {
+    name: "assert_same_structure"
+    argspec: "args=[\'nest1\', \'nest2\', \'check_types\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
+  }
+  member_method {
+    name: "flatten"
+    argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_structure"
+    argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "pack_sequence_as"
+    argspec: "args=[\'structure\', \'flat_sequence\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 40e20f8c919e64362e5697bd00ded70d0c2292a0..c36d997d2e97619babd8eb4d161db41326907359 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -22,15 +22,27 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "avg_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "batch_norm_with_global_normalization"
-    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\', \'input\', \'mean\', \'variance\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "batch_normalization"
@@ -54,11 +66,15 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\', \'input\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
@@ -66,15 +82,15 @@ tf_module {
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'SAME\', \'NHWC\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d_backprop_filter"
@@ -86,11 +102,15 @@ tf_module {
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'SAME\', \'NDHWC\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
-    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "crelu"
@@ -110,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\', \'logits\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'False\', \'True\', \'None\'], "
   }
   member_method {
     name: "ctc_loss_v2"
@@ -126,7 +146,7 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_backprop_filter"
@@ -150,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "dilation2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "dropout"
@@ -234,19 +254,31 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "max_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "max_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "max_pool_with_argmax"
-    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'Targmax\', \'name\', \'output_dtype\', \'include_batch_in_index\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "moments"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "nce_loss"
@@ -258,7 +290,7 @@ tf_module {
   }
   member_method {
     name: "pool"
-    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "quantized_avg_pool"
@@ -306,7 +338,7 @@ tf_module {
   }
   member_method {
     name: "separable_conv2d"
-    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sigmoid"
@@ -322,7 +354,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits_v2"
@@ -338,7 +370,7 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "space_to_depth"
@@ -362,7 +394,7 @@ tf_module {
   }
   member_method {
     name: "sufficient_statistics"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "tanh"
@@ -382,7 +414,7 @@ tf_module {
   }
   member_method {
     name: "weighted_moments"
-    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "with_space_to_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 4251206cda782be1a3a4c7f78fc0df705df88596..f2211375ed40d0d0393b14f952bd52c680aa8897 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 20af24633a45759c5b6e7b7758d09d87a024cacd..ca6e923ebcf3a6c3fede7a47fd6fb87f02ba046f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 3205c6a4dcaaa00591cd957021a463b77835343e..c80f71272ab4ff27e063f2452dab39551e4d49e4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -1,10 +1,12 @@
 path: "tensorflow.nn.rnn_cell.DeviceWrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -104,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'device\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
@@ -124,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 14cf5ce4569f18b326af1ba953a8edec2fee5706..78f1b943dd123b5da360cd248e9b7e9e997af47d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -1,10 +1,12 @@
 path: "tensorflow.nn.rnn_cell.DropoutWrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'input_keep_prob\', \'output_keep_prob\', \'state_keep_prob\', \'variational_recurrent\', \'input_size\', \'dtype\', \'seed\', \'dropout_state_filter_visitor\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'1.0\', \'False\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
@@ -128,7 +130,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index e43547b15428634f0f84ff0e01abdf4585e9d5db..ce25e44b17b4950de7c74cc7edcead63e5e1f42a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 99381cd7e167223cea0fe4eaebbcff736be66054..1859a9e388b8471becb7c5716b5fdf6c8ab87856 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 1fbde9df17cb83bffd46c82f11c99d2926859f77..baf6278bac9fbeb88199ea572ea23957bbf5148b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -124,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 8ba92fcc8dc89958b8395aa986c358a03fedd66d..da4d5c47aaebf01bd4c269e6ac2db4d66b91e2a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -123,7 +123,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 9de73076b1197ce7bee8a00dfd7bfcd1b48a35bc..5aa93262361229f10ec2ddba86902c7f1722aaae 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -1,10 +1,12 @@
 path: "tensorflow.nn.rnn_cell.ResidualWrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -104,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'residual_fn\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
@@ -124,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 016ae23890866ce3394806b0114ca0c9a4d70869..462ae1128828767f6df672578b7c235e5bf21212 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "ConfigProto"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "CriticalSection"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -136,6 +140,10 @@ tf_module {
     name: "MetaGraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Module"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "NameAttrList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -456,6 +464,10 @@ tf_module {
     name: "logging"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "lookup"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'module\'>"
@@ -476,6 +488,10 @@ tf_module {
     name: "name_scope"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "nest"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "newaxis"
     mtype: "<type \'NoneType\'>"
@@ -548,6 +564,10 @@ tf_module {
     name: "random_uniform_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "raw_ops"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resource"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -596,6 +616,10 @@ tf_module {
     name: "test"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "tpu"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "train"
     mtype: "<type \'module\'>"
@@ -858,7 +882,7 @@ tf_module {
   }
   member_method {
     name: "batch_to_space"
-    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "batch_to_space_nd"
@@ -936,6 +960,10 @@ tf_module {
     name: "colocate_with"
     argspec: "args=[\'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -970,7 +998,7 @@ tf_module {
   }
   member_method {
     name: "convert_to_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\', \'dtype_hint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convert_to_tensor_or_indexed_slices"
@@ -990,7 +1018,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "count_up_to"
@@ -1084,10 +1112,6 @@ tf_module {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "disable_v2_batch_normalization"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "disable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1132,10 +1156,6 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "enable_v2_batch_normalization"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "enable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1182,7 +1202,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\', \'sizes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "extract_volume_patches"
@@ -1260,6 +1280,10 @@ tf_module {
     name: "foldr"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "function"
+    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\'], "
+  }
   member_method {
     name: "gather"
     argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\', \'batch_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'0\'], "
@@ -1304,6 +1328,10 @@ tf_module {
     name: "get_session_tensor"
     argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "get_static_value"
+    argspec: "args=[\'tensor\', \'partial\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "get_variable"
     argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
@@ -1440,6 +1468,10 @@ tf_module {
     name: "is_strictly_increasing"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_variable_initialized"
     argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None"
@@ -1786,7 +1818,7 @@ tf_module {
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
@@ -1836,6 +1868,10 @@ tf_module {
     name: "reshape"
     argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "resource_variables_enabled"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reverse"
     argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2014,7 +2050,7 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "space_to_batch_nd"
@@ -2030,7 +2066,7 @@ tf_module {
   }
   member_method {
     name: "sparse_concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\', \'expand_nonconcat_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_fill_empty_rows"
@@ -2174,7 +2210,7 @@ tf_module {
   }
   member_method {
     name: "string_split"
-    argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
+    argspec: "args=[\'source\', \'sep\', \'skip_empty\', \'delimiter\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "string_strip"
@@ -2182,7 +2218,7 @@ tf_module {
   }
   member_method {
     name: "string_to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "string_to_hash_bucket_fast"
@@ -2194,7 +2230,7 @@ tf_module {
   }
   member_method {
     name: "string_to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
     name: "substr"
@@ -2224,6 +2260,18 @@ tf_module {
     name: "tensor_scatter_add"
     argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tensor_scatter_nd_add"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_nd_sub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_nd_update"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tensor_scatter_sub"
     argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2362,7 +2410,7 @@ tf_module {
   }
   member_method {
     name: "verify_tensor_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "where"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96203756a28afe6899ec0d4e3631199c4cc5745e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.-generator.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.random.experimental.Generator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.stateful_random_ops.Generator\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "algorithm"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'copy_from\', \'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "normal"
+    argspec: "args=[\'self\', \'shape\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "uniform"
+    argspec: "args=[\'self\', \'shape\', \'minval\', \'maxval\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "uniform_full_int"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'uint64\'>\", \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ccedc63420eb425041fcd5e4183675e781cd3a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.experimental.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.random.experimental"
+tf_module {
+  member {
+    name: "Generator"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "create_rng_state"
+    argspec: "args=[\'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_global_generator"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_global_generator"
+    argspec: "args=[\'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_generator"
+    argspec: "args=[\'generator\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
index 1eefb1c70ce4d825402155a5e068c736defff02f..ac8412115f91db60641fd07c18d16b2f6c12644f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.random"
 tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "all_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..489771285c7aa5a407c4914785f8577bb30944c3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
@@ -0,0 +1,4351 @@
+path: "tensorflow.raw_ops"
+tf_module {
+  member_method {
+    name: "Abort"
+    argspec: "args=[\'error_msg\', \'exit_without_error\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Abs"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulateNV2"
+    argspec: "args=[\'inputs\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorNumAccumulated"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorSetGlobalStep"
+    argspec: "args=[\'handle\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Acos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Acosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Add"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AddManySparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "AddN"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AddSparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "AddV2"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustContrast"
+    argspec: "args=[\'images\', \'contrast_factor\', \'min_value\', \'max_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustContrastv2"
+    argspec: "args=[\'images\', \'contrast_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustHue"
+    argspec: "args=[\'images\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustSaturation"
+    argspec: "args=[\'images\', \'scale\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "All"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AllCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "AllToAll"
+    argspec: "args=[\'input\', \'group_assignment\', \'concat_dimension\', \'split_dimension\', \'split_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Angle"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "AnonymousIterator"
+    argspec: "args=[\'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Any"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApproximateEqual"
+    argspec: "args=[\'x\', \'y\', \'tolerance\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-05\', \'None\'], "
+  }
+  member_method {
+    name: "ArgMax"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ArgMin"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "AsString"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Asin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Asinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "Assign"
+    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "AssignAdd"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AssignAddVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AssignSub"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AssignSubVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AssignVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atan2"
+    argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AudioSpectrogram"
+    argspec: "args=[\'input\', \'window_size\', \'stride\', \'magnitude_squared\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AudioSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "AudioSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool3DGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPoolGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "Barrier"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BarrierClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BarrierIncompleteSize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierInsertMany"
+    argspec: "args=[\'handle\', \'keys\', \'values\', \'component_index\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierReadySize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierTakeMany"
+    argspec: "args=[\'handle\', \'num_elements\', \'component_types\', \'allow_small_batch\', \'wait_for_incomplete\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "BatchCholesky"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchCholeskyGrad"
+    argspec: "args=[\'l\', \'grad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatMul"
+    argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDiag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalizationGrad"
+    argspec: "args=[\'t\', \'m\', \'v\', \'gamma\', \'backprop\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchSelfAdjointEig"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchSelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "BatchSvd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchToSpace"
+    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchToSpaceND"
+    argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BesselI0e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BesselI1e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Betainc"
+    argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BiasAdd"
+    argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "BiasAddGrad"
+    argspec: "args=[\'out_backprop\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "BiasAddV1"
+    argspec: "args=[\'value\', \'bias\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bincount"
+    argspec: "args=[\'arr\', \'size\', \'weights\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bitcast"
+    argspec: "args=[\'input\', \'type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseAnd"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseOr"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseXor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesBucketize"
+    argspec: "args=[\'float_values\', \'bucket_boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCalculateBestGainsPerFeature"
+    argspec: "args=[\'node_id_range\', \'stats_summary_list\', \'l1\', \'l2\', \'tree_complexity\', \'min_node_weight\', \'max_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCenterBias"
+    argspec: "args=[\'tree_ensemble_handle\', \'mean_gradients\', \'mean_hessians\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCreateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCreateQuantileStreamResource"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'epsilon\', \'num_streams\', \'max_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'1099511627776\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesDeserializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesEnsembleResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesExampleDebugOutputs"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesGetEnsembleStates"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesMakeQuantileSummaries"
+    argspec: "args=[\'float_values\', \'example_weights\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesMakeStatsSummary"
+    argspec: "args=[\'node_ids\', \'gradients\', \'hessians\', \'bucketized_features_list\', \'max_splits\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceAddSummaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'summaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceDeserialize"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'bucket_boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceFlush"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_buckets\', \'generate_quantiles\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesSerializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesTrainingPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'cached_tree_ids\', \'cached_node_ids\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesUpdateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'feature_ids\', \'node_ids\', \'gains\', \'thresholds\', \'left_node_contribs\', \'right_node_contribs\', \'max_depth\', \'learning_rate\', \'pruning_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastArgs"
+    argspec: "args=[\'s0\', \'s1\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastGradientArgs"
+    argspec: "args=[\'s0\', \'s1\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastTo"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bucketize"
+    argspec: "args=[\'input\', \'boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CTCBeamSearchDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "CTCGreedyDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "CTCLoss"
+    argspec: "args=[\'inputs\', \'labels_indices\', \'labels_values\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "CacheDataset"
+    argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Cast"
+    argspec: "args=[\'x\', \'DstT\', \'Truncate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Ceil"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CheckNumerics"
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Cholesky"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CholeskyGrad"
+    argspec: "args=[\'l\', \'grad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ChooseFastestBranchDataset"
+    argspec: "args=[\'input_dataset\', \'ratio_numerator\', \'ratio_denominator\', \'other_arguments\', \'num_elements_per_branch\', \'branches\', \'other_arguments_lengths\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ClipByValue"
+    argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CloseSummaryWriter"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveBcastRecv"
+    argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveBcastSend"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectivePermute"
+    argspec: "args=[\'input\', \'source_target_pairs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveReduce"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "CombinedNonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "CompareAndBitpack"
+    argspec: "args=[\'input\', \'threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Complex"
+    argspec: "args=[\'real\', \'imag\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ComplexAbs"
+    argspec: "args=[\'x\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ComputeAccidentalHits"
+    argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Concat"
+    argspec: "args=[\'concat_dim\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatOffset"
+    argspec: "args=[\'concat_dim\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatV2"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatenateDataset"
+    argspec: "args=[\'input_dataset\', \'another_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'MEAN\', \'None\'], "
+  }
+  member_method {
+    name: "ConfigureDistributedTPU"
+    argspec: "args=[\'embedding_config\', \'tpu_embedding_config\', \'is_global_init\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Conj"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConjugateTranspose"
+    argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Const"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConsumeMutexLock"
+    argspec: "args=[\'mutex_lock\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ControlTrigger"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Conv2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv2DBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropFilterV2"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropInputV2"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Cos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Cosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CountUpTo"
+    argspec: "args=[\'ref\', \'limit\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CreateSummaryDbWriter"
+    argspec: "args=[\'writer\', \'db_uri\', \'experiment_name\', \'run_name\', \'user_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CreateSummaryFileWriter"
+    argspec: "args=[\'writer\', \'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CropAndResize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CropAndResizeGradBoxes"
+    argspec: "args=[\'grads\', \'image\', \'boxes\', \'box_ind\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'None\'], "
+  }
+  member_method {
+    name: "CropAndResizeGradImage"
+    argspec: "args=[\'grads\', \'boxes\', \'box_ind\', \'image_size\', \'T\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'None\'], "
+  }
+  member_method {
+    name: "Cross"
+    argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CrossReplicaSum"
+    argspec: "args=[\'input\', \'group_assignment\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CudnnRNN"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackprop"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackpropV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackpropV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNCanonicalToParams"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNParamsSize"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNParamsToCanonical"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "Cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "DataFormatDimMap"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], "
+  }
+  member_method {
+    name: "DataFormatVecPermute"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], "
+  }
+  member_method {
+    name: "DatasetToGraph"
+    argspec: "args=[\'input_dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DatasetToSingleElement"
+    argspec: "args=[\'dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DebugGradientIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DebugGradientRefIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeAndCropJpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeBase64"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeBmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeCSV"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'name\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeCompressed"
+    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeGif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeJSONExample"
+    argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeJpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodePng"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "DecodeRaw"
+    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeWav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "DeepCopy"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeleteSessionTensor"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DenseToDenseSetOperation"
+    argspec: "args=[\'set1\', \'set2\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DenseToSparseSetOperation"
+    argspec: "args=[\'set1\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DepthToSpace"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNative"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Dequantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\'], "
+  }
+  member_method {
+    name: "DeserializeIterator"
+    argspec: "args=[\'resource_handle\', \'serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeserializeManySparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeserializeSparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DestroyResourceOp"
+    argspec: "args=[\'resource\', \'ignore_lookup_error\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DestroyTemporaryVariable"
+    argspec: "args=[\'ref\', \'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Diag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Digamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Div"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DivNoNan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DrawBoundingBoxes"
+    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DynamicPartition"
+    argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DynamicStitch"
+    argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EagerPyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EditDistance"
+    argspec: "args=[\'hypothesis_indices\', \'hypothesis_values\', \'hypothesis_shape\', \'truth_indices\', \'truth_values\', \'truth_shape\', \'normalize\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Elu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EluGrad"
+    argspec: "args=[\'gradients\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Empty"
+    argspec: "args=[\'shape\', \'dtype\', \'init\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "EmptyTensorList"
+    argspec: "args=[\'element_shape\', \'max_num_elements\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EncodeBase64"
+    argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "EncodeJpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "EncodePng"
+    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "EncodeWav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingIntegerBatch"
+    argspec: "args=[\'batch\', \'mode_override\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingSparseBatch"
+    argspec: "args=[\'sample_indices\', \'embedding_indices\', \'aggregation_weights\', \'mode_override\', \'device_ordinal\', \'combiners\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingSparseTensorBatch"
+    argspec: "args=[\'sample_indices\', \'embedding_indices\', \'aggregation_weights\', \'mode_override\', \'table_ids\', \'device_ordinal\', \'combiners\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "EnsureShape"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Enter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "Equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Erf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Erfc"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Exit"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Exp"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExpandDims"
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalAssertNextDataset"
+    argspec: "args=[\'input_dataset\', \'transformations\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalAutoShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalBytesProducedStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalCSVDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalChooseFastestDataset"
+    argspec: "args=[\'input_datasets\', \'num_experiments\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDatasetCardinality"
+    argspec: "args=[\'input_dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDatasetToTFRecord"
+    argspec: "args=[\'input_dataset\', \'filename\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDenseToSparseBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'row_shape\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDirectedInterleaveDataset"
+    argspec: "args=[\'selector_input_dataset\', \'data_input_datasets\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalGroupByReducerDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'init_func_other_arguments\', \'reduce_func_other_arguments\', \'finalize_func_other_arguments\', \'key_func\', \'init_func\', \'reduce_func\', \'finalize_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalGroupByWindowDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'reduce_func_other_arguments\', \'window_size_func_other_arguments\', \'key_func\', \'reduce_func\', \'window_size_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIdentityIndexedDataset"
+    argspec: "args=[\'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIgnoreErrorsDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetGet"
+    argspec: "args=[\'materialized\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetMaterialize"
+    argspec: "args=[\'dataset\', \'materialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIteratorGetDevice"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalLMDBDataset"
+    argspec: "args=[\'filenames\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalLatencyStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMatchingFilesDataset"
+    argspec: "args=[\'patterns\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMaterializedIndexDatasetHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMaxIntraOpParallelismDataset"
+    argspec: "args=[\'input_dataset\', \'max_intra_op_parallelism\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalNonSerializableDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalNumaMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalParallelInterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalParseExampleDataset"
+    argspec: "args=[\'input_dataset\', \'num_parallel_calls\', \'dense_defaults\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'output_types\', \'output_shapes\', \'sloppy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalPrivateThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'num_threads\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalRandomDataset"
+    argspec: "args=[\'seed\', \'seed2\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalRebatchDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalScanDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSetStatsAggregatorDataset"
+    argspec: "args=[\'input_dataset\', \'stats_aggregator\', \'tag\', \'counter_prefix\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSleepDataset"
+    argspec: "args=[\'input_dataset\', \'sleep_microseconds\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSlidingWindowDataset"
+    argspec: "args=[\'input_dataset\', \'window_size\', \'window_shift\', \'window_stride\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSqlDataset"
+    argspec: "args=[\'driver_name\', \'data_source_name\', \'query\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorSummary"
+    argspec: "args=[\'iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalTakeWhileDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'thread_pool\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalThreadPoolHandle"
+    argspec: "args=[\'num_threads\', \'display_name\', \'max_intra_op_parallelism\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalUnbatchDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalUniqueDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Expm1"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExtractGlimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'uniform\', \'None\'], "
+  }
+  member_method {
+    name: "ExtractImagePatches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExtractJpegShape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ExtractVolumePatches"
+    argspec: "args=[\'input\', \'ksizes\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Fact"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FakeParam"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgs"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVars"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannel"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQueue"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Fill"
+    argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FilterByLastComponentDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FilterDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordDataset"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordDatasetV2"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordReader"
+    argspec: "args=[\'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordReaderV2"
+    argspec: "args=[\'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FixedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1\', \'0\', \'1\', \'0\', \'[]\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FlatMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Floor"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FloorDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FloorMod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FlushSummaryWriter"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "For"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'input\', \'body\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FractionalAvgPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalAvgPoolGrad"
+    argspec: "args=[\'orig_input_tensor_shape\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalMaxPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalMaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNorm"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormGrad"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormGradV2"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormV2"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedPadConv2D"
+    argspec: "args=[\'input\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FusedResizeAndPadConv2D"
+    argspec: "args=[\'input\', \'size\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\', \'resize_align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Gather"
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "GatherNd"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GatherV2"
+    argspec: "args=[\'params\', \'indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GenerateVocabRemapping"
+    argspec: "args=[\'new_vocab_file\', \'old_vocab_file\', \'new_vocab_offset\', \'num_new_vocab\', \'old_vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "GeneratorDataset"
+    argspec: "args=[\'init_func_other_args\', \'next_func_other_args\', \'finalize_func_other_args\', \'init_func\', \'next_func\', \'finalize_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionHandle"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionHandleV2"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionTensor"
+    argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Greater"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GreaterEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GuaranteeConst"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "HSVToRGB"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "HashTable"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "HashTableV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "HistogramFixedWidth"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "HistogramSummary"
+    argspec: "args=[\'tag\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT2D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT3D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Identity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IdentityN"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IdentityReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "IdentityReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "If"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Igamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IgammaGradA"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Igammac"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Imag"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ImageSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'max_images\', \'bad_color\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'dtype: DT_UINT8\\ntensor_shape {\\n  dim {\\n    size: 4\\n  }\\n}\\nint_val: 255\\nint_val: 0\\nint_val: 0\\nint_val: 255\\n\', \'None\'], "
+  }
+  member_method {
+    name: "ImmutableConst"
+    argspec: "args=[\'dtype\', \'shape\', \'memory_region_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ImportEvent"
+    argspec: "args=[\'writer\', \'event\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InTopK"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InTopKV2"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedDequeue"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedDequeueTuple"
+    argspec: "args=[\'dtypes\', \'shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueue"
+    argspec: "args=[\'input\', \'shape\', \'layout\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueuePrelinearizedBuffer"
+    argspec: "args=[\'input\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueueTuple"
+    argspec: "args=[\'inputs\', \'shapes\', \'layouts\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTable"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InitializeTableFromTextFile"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTableFromTextFileV2"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTableV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceAdd"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceSub"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceUpdate"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Inv"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InvGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Invert"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InvertPermutation"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsBoostedTreesEnsembleInitialized"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsBoostedTreesQuantileStreamResourceInitialized"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsFinite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsInf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsNan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsVariableInitialized"
+    argspec: "args=[\'ref\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Iterator"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "IteratorFromStringHandleV2"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNext"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNextAsOptional"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNextSync"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorToStringHandle"
+    argspec: "args=[\'resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorV2"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "L2Loss"
+    argspec: "args=[\'t\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LMDBReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LRN"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'5\', \'1\', \'1\', \'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "LRNGrad"
+    argspec: "args=[\'input_grads\', \'input_image\', \'output_image\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'5\', \'1\', \'1\', \'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "LeakyRelu"
+    argspec: "args=[\'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
+  }
+  member_method {
+    name: "LeakyReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
+  }
+  member_method {
+    name: "LearnedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "LeftShift"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Less"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LessEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Lgamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LinSpace"
+    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ListDiff"
+    argspec: "args=[\'x\', \'y\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "LoadAndRemapMatrix"
+    argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingADAMParameters"
+    argspec: "args=[\'parameters\', \'momenta\', \'velocities\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'momenta\', \'velocities\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdadeltaParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'updates\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'updates\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdagradParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'mg\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingFTRLParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'linears\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'linears\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'weights\', \'benefits\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMomentumParameters"
+    argspec: "args=[\'parameters\', \'momenta\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'momenta\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingProximalAdagradParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingRMSPropParameters"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+    argspec: "args=[\'parameters\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Log"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Log1p"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogMatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogSoftmax"
+    argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogUniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "LogicalAnd"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogicalNot"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogicalOr"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableExport"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableExportV2"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableFind"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableFindV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableImport"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableImportV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableInsert"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableInsertV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableRemoveV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableSize"
+    argspec: "args=[\'table_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableSizeV2"
+    argspec: "args=[\'table_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LoopCond"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LowerBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "MakeIterator"
+    argspec: "args=[\'dataset\', \'iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MapClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MapDefun"
+    argspec: "args=[\'arguments\', \'captured_inputs\', \'output_types\', \'output_shapes\', \'f\', \'max_intra_op_parallelism\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "MapIncompleteSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapPeek"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapUnstageNoKey"
+    argspec: "args=[\'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatchingFiles"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDiag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixExponential"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixLogarithm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixSquareRoot"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Max"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3DGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3DGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolV2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolWithArgmax"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Maximum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Mean"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Merge"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MergeSummary"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MergeV2Checkpoints"
+    argspec: "args=[\'checkpoint_prefixes\', \'destination_prefix\', \'delete_old_dirs\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Mfcc"
+    argspec: "args=[\'spectrogram\', \'sample_rate\', \'upper_frequency_limit\', \'lower_frequency_limit\', \'filterbank_channel_count\', \'dct_coefficient_count\', \'name\'], varargs=None, keywords=None, defaults=[\'4000\', \'20\', \'40\', \'13\', \'None\'], "
+  }
+  member_method {
+    name: "Min"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Minimum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MirrorPad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MirrorPadGrad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Mod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ModelDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'cpu_budget\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "Mul"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIterator"
+    argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorGetNextFromShard"
+    argspec: "args=[\'multi_device_iterator\', \'shard_num\', \'incarnation_id\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorInit"
+    argspec: "args=[\'dataset\', \'multi_device_iterator\', \'max_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorToStringHandle"
+    argspec: "args=[\'multi_device_iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'seed2\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "MutableDenseHashTable"
+    argspec: "args=[\'empty_key\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'131072\', \'0.8\', \'None\'], "
+  }
+  member_method {
+    name: "MutableDenseHashTableV2"
+    argspec: "args=[\'empty_key\', \'deleted_key\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'131072\', \'0.8\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTable"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableOfTensors"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableOfTensorsV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MutexLock"
+    argspec: "args=[\'mutex\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MutexV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "NcclAllReduce"
+    argspec: "args=[\'input\', \'reduction\', \'num_devices\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NcclBroadcast"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NcclReduce"
+    argspec: "args=[\'input\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Neg"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NextAfter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NextIteration"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NoOp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonDeterministicInts"
+    argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV2"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV3"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV4"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionWithOverlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NotEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NthElement"
+    argspec: "args=[\'input\', \'n\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "OneHot"
+    argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OneShotIterator"
+    argspec: "args=[\'dataset_factory\', \'output_types\', \'output_shapes\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OnesLike"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptimizeDataset"
+    argspec: "args=[\'input_dataset\', \'optimizations\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalFromValue"
+    argspec: "args=[\'components\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalGetValue"
+    argspec: "args=[\'optional\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalHasValue"
+    argspec: "args=[\'optional\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalNone"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OrderedMapClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapIncompleteSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapPeek"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapUnstageNoKey"
+    argspec: "args=[\'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedDequeue"
+    argspec: "args=[\'dtype\', \'shape\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedDequeueTuple"
+    argspec: "args=[\'dtypes\', \'shapes\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedEnqueue"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OutfeedEnqueueTuple"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Pack"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "Pad"
+    argspec: "args=[\'input\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PadV2"
+    argspec: "args=[\'input\', \'paddings\', \'constant_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddedBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddedBatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddingFIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "PaddingFIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ParallelConcat"
+    argspec: "args=[\'values\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParallelDynamicStitch"
+    argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParallelInterleaveDatasetV2"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'sloppy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ParallelMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'sloppy\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ParameterizedTruncatedNormal"
+    argspec: "args=[\'shape\', \'means\', \'stdevs\', \'minvals\', \'maxvals\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "ParseExample"
+    argspec: "args=[\'serialized\', \'names\', \'sparse_keys\', \'dense_keys\', \'dense_defaults\', \'sparse_types\', \'dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParseSequenceExample"
+    argspec: "args=[\'serialized\', \'debug_name\', \'context_dense_defaults\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'Ncontext_sparse\', \'Ncontext_dense\', \'Nfeature_list_sparse\', \'Nfeature_list_dense\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'[]\', \'[]\', \'[]\', \'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "ParseSingleExample"
+    argspec: "args=[\'serialized\', \'dense_defaults\', \'num_sparse\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParseSingleSequenceExample"
+    argspec: "args=[\'serialized\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'context_dense_defaults\', \'debug_name\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'[]\', \'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "ParseTensor"
+    argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "PlaceholderV2"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PlaceholderWithDefault"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Polygamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PopulationCount"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Pow"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PrefetchDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Prelinearize"
+    argspec: "args=[\'input\', \'shape\', \'layout\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "PrelinearizeTuple"
+    argspec: "args=[\'inputs\', \'shapes\', \'layouts\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "PreventGradient"
+    argspec: "args=[\'input\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "Print"
+    argspec: "args=[\'input\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'-1\', \'3\', \'None\'], "
+  }
+  member_method {
+    name: "PrintV2"
+    argspec: "args=[\'input\', \'output_stream\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'None\'], "
+  }
+  member_method {
+    name: "PriorityQueue"
+    argspec: "args=[\'shapes\', \'component_types\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "PriorityQueueV2"
+    argspec: "args=[\'shapes\', \'component_types\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Prod"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "PyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PyFuncStateless"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Qr"
+    argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantize"
+    argspec: "args=[\'input\', \'signed_input\', \'num_bits\', \'range_given\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV2"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV3"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'num_bits\', \'signed_input\', \'range_given\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeDownAndShrinkRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizeV2"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedAdd"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedAvgPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedBatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'t_min\', \'t_max\', \'m\', \'m_min\', \'m_max\', \'v\', \'v_min\', \'v_max\', \'beta\', \'beta_min\', \'beta_max\', \'gamma\', \'gamma_min\', \'gamma_max\', \'out_type\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedBiasAdd"
+    argspec: "args=[\'input\', \'bias\', \'min_input\', \'max_input\', \'min_bias\', \'max_bias\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedConcat"
+    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2D"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBias"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedInstanceNorm"
+    argspec: "args=[\'x\', \'x_min\', \'x_max\', \'output_range_given\', \'given_y_min\', \'given_y_max\', \'variance_epsilon\', \'min_separation\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'0\', \'1e-05\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedMatMul"
+    argspec: "args=[\'a\', \'b\', \'min_a\', \'max_a\', \'min_b\', \'max_b\', \'Toutput\', \'transpose_a\', \'transpose_b\', \'Tactivation\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'False\', \'False\', \"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedMaxPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedMul"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedRelu"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedRelu6"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedReluX"
+    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedReshape"
+    argspec: "args=[\'tensor\', \'shape\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'min\', \'max\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueCloseV2"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeue"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueMany"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueManyV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueUpTo"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueUpToV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueV2"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueue"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueMany"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueManyV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueIsClosed"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueIsClosedV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueSize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueSizeV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT2D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT3D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RGBToHSV"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedGather"
+    argspec: "args=[\'params_nested_splits\', \'params_dense_values\', \'indices\', \'OUTPUT_RAGGED_RANK\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedRange"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedTensorToSparse"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RandomCrop"
+    argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomGamma"
+    argspec: "args=[\'shape\', \'alpha\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomGammaGrad"
+    argspec: "args=[\'alpha\', \'sample\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RandomPoisson"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomPoissonV2"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffle"
+    argspec: "args=[\'value\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffleQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffleQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RandomStandardNormal"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomUniform"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomUniformInt"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Range"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RangeDataset"
+    argspec: "args=[\'start\', \'stop\', \'step\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rank"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReadFile"
+    argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReadVariableOp"
+    argspec: "args=[\'resource\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumRecordsProduced"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumRecordsProducedV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompleted"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompletedV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRead"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadUpTo"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadUpToV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReset"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderResetV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRestoreState"
+    argspec: "args=[\'reader_handle\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRestoreStateV2"
+    argspec: "args=[\'reader_handle\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderSerializeState"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderSerializeStateV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Real"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RealDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Reciprocal"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReciprocalGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RecordInput"
+    argspec: "args=[\'file_pattern\', \'file_random_seed\', \'file_shuffle_shift_ratio\', \'file_buffer_size\', \'file_parallelism\', \'batch_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'301\', \'0\', \'10000\', \'16\', \'32\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RecvTPUEmbeddingActivations"
+    argspec: "args=[\'num_outputs\', \'config\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReduceDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ReduceJoin"
+    argspec: "args=[\'inputs\', \'reduction_indices\', \'keep_dims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RefEnter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "RefExit"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefMerge"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefNextIteration"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefSelect"
+    argspec: "args=[\'index\', \'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefSwitch"
+    argspec: "args=[\'data\', \'pred\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Relu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Relu6"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Relu6Grad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RemoteCall"
+    argspec: "args=[\'target\', \'args\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RepeatDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizationRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Requantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Reshape"
+    argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResizeArea"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBicubicGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBilinearGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeNearestNeighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeNearestNeighborGrad"
+    argspec: "args=[\'grads\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdamWithAmsgrad"
+    argspec: "args=[\'var\', \'m\', \'v\', \'vhat\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceCountUpTo"
+    argspec: "args=[\'resource\', \'limit\', \'T\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceGather"
+    argspec: "args=[\'resource\', \'indices\', \'dtype\', \'batch_dims\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterAdd"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterDiv"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMax"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMin"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMul"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterSub"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterUpdate"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceStridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Restore"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'dt\', \'preferred_shard\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "RestoreSlice"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'shape_and_slice\', \'dt\', \'preferred_shard\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "RestoreV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'dtypes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingADAMParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdadeltaParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdagradParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingFTRLParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMomentumParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingRMSPropParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Reverse"
+    argspec: "args=[\'tensor\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReverseSequence"
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_dim\', \'batch_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "ReverseV2"
+    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RightShift"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rint"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Round"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rsqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RsqrtGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SampleDistortedBoundingBox"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0.1\', \'[0.75, 1.33]\', \'[0.05, 1]\', \'100\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SampleDistortedBoundingBoxV2"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'min_object_covered\', \'seed\', \'seed2\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'[0.75, 1.33]\', \'[0.05, 1]\', \'100\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Save"
+    argspec: "args=[\'filename\', \'tensor_names\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SaveSlices"
+    argspec: "args=[\'filename\', \'tensor_names\', \'shapes_and_slices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SaveV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'tensors\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScalarSummary"
+    argspec: "args=[\'tags\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScaleAndTranslate"
+    argspec: "args=[\'images\', \'size\', \'scale\', \'translation\', \'kernel_type\', \'antialias\', \'name\'], varargs=None, keywords=None, defaults=[\'lanczos3\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScaleAndTranslateGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'scale\', \'translation\', \'kernel_type\', \'antialias\', \'name\'], varargs=None, keywords=None, defaults=[\'lanczos3\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterDiv"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMax"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMin"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMul"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNd"
+    argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNdNonAliasingAdd"
+    argspec: "args=[\'input\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaFprint"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SdcaOptimizer"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaOptimizerV2"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptive\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaShrinkL1"
+    argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMean"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Select"
+    argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SelfAdjointEig"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Selu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SeluGrad"
+    argspec: "args=[\'gradients\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SendTPUEmbeddingGradients"
+    argspec: "args=[\'inputs\', \'learning_rates\', \'config\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SerializeIterator"
+    argspec: "args=[\'resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SerializeManySparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SerializeSparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SerializeTensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SetSize"
+    argspec: "args=[\'set_indices\', \'set_values\', \'set_shape\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Shape"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ShapeN"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShardedFilename"
+    argspec: "args=[\'basename\', \'shard\', \'num_shards\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShardedFilespec"
+    argspec: "args=[\'basename\', \'num_shards\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShuffleAndRepeatDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShuffleDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'output_types\', \'output_shapes\', \'reshuffle_each_iteration\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ShutdownDistributedTPU"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SigmoidGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sign"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Size"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SkipDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Slice"
+    argspec: "args=[\'input\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Snapshot"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softmax"
+    argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softplus"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftplusGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softsign"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftsignGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToBatch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToBatchND"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToDepth"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "SparseAccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient_indices\', \'gradient_values\', \'gradient_shape\', \'has_known_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'thresh\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAddGrad"
+    argspec: "args=[\'backprop_val_grad\', \'a_indices\', \'b_indices\', \'sum_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseConcat"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'concat_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'MEAN\', \'None\'], "
+  }
+  member_method {
+    name: "SparseCross"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'dense_inputs\', \'hashed_output\', \'num_buckets\', \'hash_key\', \'out_type\', \'internal_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseAdd"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseDiv"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseMul"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseFillEmptyRows"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseFillEmptyRowsGrad"
+    argspec: "args=[\'reverse_index_map\', \'grad_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseMatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceMax"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceMaxSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceSum"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceSumSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReorder"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseReshape"
+    argspec: "args=[\'input_indices\', \'input_shape\', \'new_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMeanGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMeanWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtN"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtNGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtNWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSumWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSlice"
+    argspec: "args=[\'indices\', \'values\', \'shape\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSliceGrad"
+    argspec: "args=[\'backprop_val_grad\', \'input_indices\', \'input_start\', \'output_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSoftmax"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSparseMaximum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSparseMinimum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSplit"
+    argspec: "args=[\'split_dim\', \'indices\', \'values\', \'shape\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseTensorDenseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseTensorDenseMatMul"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseTensorSliceDataset"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseToDense"
+    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SparseToSparseSetOperation"
+    argspec: "args=[\'set1_indices\', \'set1_values\', \'set1_shape\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Split"
+    argspec: "args=[\'axis\', \'value\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SplitV"
+    argspec: "args=[\'value\', \'size_splits\', \'axis\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SqrtGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Square"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SquaredDifference"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Squeeze"
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Stack"
+    argspec: "args=[\'elem_type\', \'stack_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "StackClose"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackCloseV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPop"
+    argspec: "args=[\'handle\', \'elem_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPopV2"
+    argspec: "args=[\'handle\', \'elem_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPush"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "StackPushV2"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "StackV2"
+    argspec: "args=[\'max_size\', \'elem_type\', \'stack_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "Stage"
+    argspec: "args=[\'values\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StageClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StagePeek"
+    argspec: "args=[\'index\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StageSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StatefulPartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StatefulStandardNormal"
+    argspec: "args=[\'resource\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulStandardNormalV2"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulUniformFullInt"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'uint64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulUniformInt"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'minval\', \'maxval\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessIf"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessMultinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomUniformInt"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessTruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessWhile"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StaticRegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StaticRegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "StopGradient"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StridedSlice"
+    argspec: "args=[\'input\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StridedSliceGrad"
+    argspec: "args=[\'shape\', \'begin\', \'end\', \'strides\', \'dy\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StringFormat"
+    argspec: "args=[\'inputs\', \'template\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'%s\', \'%s\', \'3\', \'None\'], "
+  }
+  member_method {
+    name: "StringJoin"
+    argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "StringLength"
+    argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
+  }
+  member_method {
+    name: "StringSplit"
+    argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "StringSplitV2"
+    argspec: "args=[\'input\', \'sep\', \'maxsplit\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "StringStrip"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucket"
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucketFast"
+    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucketStrong"
+    argspec: "args=[\'input\', \'num_buckets\', \'key\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToNumber"
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Sub"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Substr"
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
+  }
+  member_method {
+    name: "Sum"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SummaryWriter"
+    argspec: "args=[\'shared_name\', \'container\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Svd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Switch"
+    argspec: "args=[\'data\', \'pred\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SymbolicGradient"
+    argspec: "args=[\'input\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TFRecordDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TFRecordReader"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TFRecordReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TPUCompilationResult"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUEmbeddingActivations"
+    argspec: "args=[\'embedding_variable\', \'sliced_activations\', \'table_id\', \'lookup_id\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUOrdinalSelector"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUPartitionedCall"
+    argspec: "args=[\'args\', \'device_ordinal\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUReplicate"
+    argspec: "args=[\'inputs\', \'broadcast_inputs\', \'variables\', \'guaranteed_constants\', \'computation\', \'num_replicas\', \'output_types\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], "
+  }
+  member_method {
+    name: "TPUReplicateMetadata"
+    argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], "
+  }
+  member_method {
+    name: "TPUReplicatedInput"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUReplicatedOutput"
+    argspec: "args=[\'input\', \'num_replicas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TakeDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TakeManySparseFromTensorsMap"
+    argspec: "args=[\'sparse_handles\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Tan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TanhGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TemporaryVariable"
+    argspec: "args=[\'shape\', \'dtype\', \'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArray"
+    argspec: "args=[\'size\', \'dtype\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayClose"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayCloseV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayCloseV3"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcat"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcatV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcatV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGather"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGatherV2"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGatherV3"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGrad"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradWithShape"
+    argspec: "args=[\'handle\', \'flow_in\', \'shape_to_prepend\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayPack"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayRead"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayReadV2"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayReadV3"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatter"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatterV2"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatterV3"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySize"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySizeV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySizeV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplit"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplitV2"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplitV3"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayUnpack"
+    argspec: "args=[\'handle\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayV2"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'True\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayV3"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'identical_element_shapes\', \'tensor_array_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'True\', \'False\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWrite"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWriteV2"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWriteV3"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorDataset"
+    argspec: "args=[\'components\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListConcat"
+    argspec: "args=[\'input_handle\', \'element_dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorListConcatLists"
+    argspec: "args=[\'input_a\', \'input_b\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListConcatV2"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'leading_dims\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListElementShape"
+    argspec: "args=[\'input_handle\', \'shape_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListFromTensor"
+    argspec: "args=[\'tensor\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListGather"
+    argspec: "args=[\'input_handle\', \'indices\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListGetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListLength"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPopBack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPushBack"
+    argspec: "args=[\'input_handle\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPushBackBatch"
+    argspec: "args=[\'input_handles\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListReserve"
+    argspec: "args=[\'element_shape\', \'num_elements\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListResize"
+    argspec: "args=[\'input_handle\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatter"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatterV2"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListSetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'item\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListSplit"
+    argspec: "args=[\'tensor\', \'element_shape\', \'lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListStack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'num_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "TensorScatterAdd"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorScatterSub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorScatterUpdate"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorSliceDataset"
+    argspec: "args=[\'components\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorSummary"
+    argspec: "args=[\'tensor\', \'description\', \'labels\', \'display_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'[]\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'serialized_summary_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TextLineDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TextLineReader"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TextLineReaderV2"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ThreadUnsafeUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Tile"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TileGrad"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Timestamp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TopK"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "TopKV2"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Transpose"
+    argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncateDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncateMod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncatedNormal"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeDecode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeDecodeWithOffsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeEncode"
+    argspec: "args=[\'input_values\', \'input_splits\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeScript"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnicodeTranscode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Unique"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueWithCounts"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueWithCountsV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Unpack"
+    argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "UnravelIndex"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Unstage"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "UnwrapDatasetVariant"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UpperBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "VarHandleOp"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "VarIsInitializedOp"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Variable"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "VariableShape"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "VariableV2"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Where"
+    argspec: "args=[\'condition\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "While"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'output_shapes\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "WholeFileReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "WholeFileReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "WindowDataset"
+    argspec: "args=[\'input_dataset\', \'size\', \'shift\', \'stride\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WorkerHeartbeat"
+    argspec: "args=[\'request\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WrapDatasetVariant"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteAudioSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "WriteFile"
+    argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteGraphSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteHistogramSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteImageSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'bad_color\', \'max_images\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "WriteScalarSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'tag\', \'summary_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Xdivy"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Xlogy"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ZerosLike"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Zeta"
+    argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ZipDataset"
+    argspec: "args=[\'input_datasets\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 3929003fa1ff0902b55adcdca1274b1c1b1de2e8..5216f4e2ed063c72999654de022db37bbedb6a63 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -164,6 +164,10 @@ tf_module {
     name: "load"
     argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
   }
+  member_method {
+    name: "load_v2"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "main_op_with_restore"
     argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +184,10 @@ tf_module {
     name: "regression_signature_def"
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index 33e342bc75486be0bccffc1e36a94e147f934432..d3543e2e19def45db6b6d627b54475af2ae28c28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\', \'expand_nonconcat_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "cross"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index ad26ded10b4dc652574ce4b544cbadd98e57a013..ada8be91454b190875f6f078328c8f5279bd4784 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
@@ -14,7 +18,7 @@ tf_module {
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "regex_full_match"
@@ -38,7 +42,7 @@ tf_module {
   }
   member_method {
     name: "to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "to_hash_bucket_fast"
@@ -50,7 +54,7 @@ tf_module {
   }
   member_method {
     name: "to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
     name: "unicode_decode"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..3879645d60249b18664b77125917d2066a063662 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "image"
     argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'graph\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "merge"
     argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
index 6fc489c86043d074ac832d0ec9dbefd2cbbb4f19..48f53a85454f1f7103728965217eba85ccde10c9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
@@ -5,6 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.-device-assignment.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.-device-assignment.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f2d8c19444cd4a383997ed0277a7c0140c9be48
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.-device-assignment.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.tpu.experimental.DeviceAssignment"
+tf_class {
+  is_instance: "<class \'tensorflow.python.tpu.device_assignment.DeviceAssignment\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "core_assignment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_cores_per_replica"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "topology"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'topology\', \'core_assignment\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'topology\', \'computation_shape\', \'computation_stride\', \'num_replicas\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\'], "
+  }
+  member_method {
+    name: "coordinates"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "host_device"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\', \'job\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "lookup_replicas"
+    argspec: "args=[\'self\', \'task_id\', \'logical_core\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tpu_device"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\', \'job\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "tpu_ordinal"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\'], varargs=None, keywords=None, defaults=[\'0\', \'0\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..642942a349115d29b55d799ef3ca0e0a26c035da
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.tpu.experimental"
+tf_module {
+  member {
+    name: "DeviceAssignment"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "initialize_tpu_system"
+    argspec: "args=[\'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7fabcf229880c9077a75fddb9769a0d891f065f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.tpu"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
index 65a2b605d532c4a14d3d444a44a723c543af5026..6ed8f934f307b88c26993176b1838d202d187b17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index 179272d8a8a298ac374c19641068aca739bb9626..c57b3d8ed5a78b0417fd0e927e447cb3c0d9dd96 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
index 15c2ef46c127543cb94690aade3c79b6e75981c2..897df3ed231d5e91c417e78c48e062a591308cf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
index 9c902e582f35ca44a6825727637fa3d76011e33c..cb8b5d366c4c5e853e99ca79737886b62f9503ce 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
index 42dcdac9e77a8efac875e4985f6a8f744e838ddb..4440a035afe06a2a3ee56462231d2e241e4ebac7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.CheckpointV1\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
index f41d9f12d9fe65b128d216551870ec8c95834a6c..1d1aceb0138d264501758a26eba75791d5b9f735 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
index 7399750385f960133aa5cf071c57dc9fc716a18d..b998e848c2ba6a585d3820549d1d873bf04538cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
index 9bbaa14a6fd54dfcad37560142bebc7b3118601c..2de61d67f717786152515d414bed6ccd574aa58f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
index 448e17a44891781b1d6b0fe8e627cb91d098f1e1..8baa56902581d8ac405f95992daa29ae4a9fd1e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index eb1782e9cad73708de24f6565237830a29cfaf8b..626b75335461fc13a0bdc73b220d7e562a5a6c46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index eb9a86183e10775379efb84c693f7aa7ba573f2d..9c0dca030d2fa4c75315fb60df4cc2019271b41e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index 2cf4c2e7ea4879c48c1b3a43302f7fa4e9f689cb..61ae458c01750493d87bc53f3be5c660ab912f5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
index 2cda458f468b2d748b43954b14b670df7145243f..a91ba5b6722d9929fc857091a5b2c6ae5cf188f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "from_proto"
@@ -36,7 +36,7 @@ tf_class {
   }
   member_method {
     name: "save"
-    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\'], "
+    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\', \'False\'], "
   }
   member_method {
     name: "set_last_checkpoints"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
index ecce08220d6bd9815fecd26a95f8ac6f745d9e33..b812d6f1ef4ae8f4173c2b23a010935dcc6cabcf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-python-state.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-python-state.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59bc0179aef2ee5bd73b6541ef2f17447784745b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-python-state.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.train.experimental.PythonState"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.tracking.python_state.PythonState\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'self\', \'string_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2761b489b965ad4ca6e22458d7efad724891c22f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.train.experimental"
+tf_module {
+  member {
+    name: "PythonState"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index bdb3ea2197c78dd17357f2753f05638c3c054bd6..551fda2eacd147a604a876c7283388a4c4e8e90c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -240,6 +240,10 @@ tf_module {
     name: "WorkerSessionCreator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "queue_runner"
     mtype: "<type \'module\'>"
@@ -298,7 +302,7 @@ tf_module {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "generate_checkpoint_state_proto"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
index f79029d3fe0b88a454b11456b3785c3ae28a253c..cc2d5c87d667fb5c4af6b6fc435ae626334fe2d1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.AggregationMethod"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.gradients_impl.AggregationMethod\'>"
+  is_instance: "<class \'tensorflow.python.ops.gradients_util.AggregationMethod\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "ADD_N"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt
index cdaeb55e30865e082054085f47d6a071ebf3affd..9193168c2072388a5a660abf55acbdc6f889d58e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-graph.pbtxt
@@ -68,7 +68,7 @@ tf_class {
   }
   member_method {
     name: "create_op"
-    argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
+    argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
   }
   member_method {
     name: "device"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..148e6b83b31e9e23c34f1b501c45749063a6c3b7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
@@ -39,4 +40,8 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'values\', \'indices\', \'dense_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
similarity index 59%
rename from tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
index c364b0217a7ed10282dc8fc28797f3be1b92f867..8c3438e4d8e377de8ae0c063d460b5adeea11258 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.-module.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
@@ -1,8 +1,8 @@
-path: "tensorflow.experimental.Module"
+path: "tensorflow.Module"
 tf_class {
   is_instance: "<class \'tensorflow.python.module.module.Module\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
@@ -12,18 +12,6 @@ tf_class {
     name: "name_scope"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "owned_submodules"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "owned_variables"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "submodules"
     mtype: "<type \'property\'>"
@@ -41,7 +29,7 @@ tf_class {
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "no_name_scope"
+    name: "with_name_scope"
     argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
index c0ed95653552f904acea1cc82bca00773ecb792c..feb831fb6b7bda76f4140272f3b193c6f66114b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.RaggedTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dtype"
@@ -38,6 +39,10 @@ tf_class {
     name: "bounding_shape"
     argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "from_nested_row_lengths"
     argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
index bee19520b7736967533c6d30a1862e3c48d03fc2..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV2\'>"
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
index 493dcba8922d7f6c51a61d337f48e09d168e6bac..d824ad573e4854844e6a3fa3b544ab5f51ddde6c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
@@ -16,7 +16,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
     name: "from_spec"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index a80726d3bbc400b1ce8e640819ad370c3589be6c..03fd32fdebf80745cb264afd81d08bf7054aebaf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
index ce29615f72eee78525b8a1efbb4531215e6b72fe..6c5724078357125255acd413902c4a5e57cb719e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.audio"
 tf_module {
+  member_method {
+    name: "decode_wav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
   member_method {
     name: "encode_wav"
     argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
index a71da113b4ffcaa9ff71e18df4a9263b141b42e6..1f04d028efdc895e493c9e60e1c9025fc26de4f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
@@ -5,12 +5,16 @@ tf_class {
     name: "ALL"
     mtype: "<enum \'Feature\'>"
   }
+  member {
+    name: "ASSERT_STATEMENTS"
+    mtype: "<enum \'Feature\'>"
+  }
   member {
     name: "AUTO_CONTROL_DEPS"
     mtype: "<enum \'Feature\'>"
   }
   member {
-    name: "DECORATORS"
+    name: "BUILTIN_FUNCTIONS"
     mtype: "<enum \'Feature\'>"
   }
   member {
@@ -21,6 +25,10 @@ tf_class {
     name: "LISTS"
     mtype: "<enum \'Feature\'>"
   }
+  member {
+    name: "LOGICAL_EXPRESSIONS"
+    mtype: "<enum \'Feature\'>"
+  }
   member {
     name: "NAME_SCOPES"
     mtype: "<enum \'Feature\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
deleted file mode 100644
index c4d5b77c0738feb1fa6ea69672ee3fafa51de5be..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.autograph.experimental.Verbosity"
-tf_class {
-  is_instance: "<enum \'Verbosity\'>"
-  member {
-    name: "BRIEF"
-    mtype: "<enum \'Verbosity\'>"
-  }
-  member {
-    name: "VERBOSE"
-    mtype: "<enum \'Verbosity\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
index 5747dac7ab201443d1f237415cd280aee672a8ff..cd8f0716d48f4e84b5a21238d8a661722c1d33ae 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
@@ -4,8 +4,4 @@ tf_module {
     name: "Feature"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "Verbosity"
-    mtype: "<class \'enum.EnumMeta\'>"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
index 12e23bc0c8fd0831471abcf56bcd8f07d3e6fe57..8880ed4f0cb31641dec7336ba64ef55ec227c813 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
@@ -4,12 +4,20 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "to_code"
-    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\'], "
   }
   member_method {
     name: "to_graph"
-    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3fd133e92a41fb133bd138750a574ef40ea57b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.experimental"
+tf_module {
+  member_method {
+    name: "get_device_policy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_synchronous_execution"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_device_policy"
+    argspec: "args=[\'device_policy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_synchronous_execution"
+    argspec: "args=[\'enable\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.gpu.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.gpu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6768a9e06e93aeb251228d2f1ee47b18dc19945
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.gpu.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.gpu"
+tf_module {
+  member_method {
+    name: "get_per_process_memory_fraction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_per_process_memory_growth"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_per_process_memory_fraction"
+    argspec: "args=[\'fraction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_per_process_memory_growth"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
index d7e4529594df24666844bbce8cff729c5fa8fa67..41e61ac683c0fb3f68b7dd092af8bb71e2a2a3bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
@@ -1,7 +1,35 @@
 path: "tensorflow.config"
 tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "gpu"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "threading"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "experimental_connect_to_host"
     argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
   }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_soft_device_placement"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_soft_device_placement"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.threading.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.threading.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b155733a4d556daa954bc61b3647ac9f103ba3f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.threading.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.config.threading"
+tf_module {
+  member_method {
+    name: "inter_op_parallelism_threads"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "intra_op_parallelism_threads"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_inter_op_parallelism_threads"
+    argspec: "args=[\'num_threads\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_intra_op_parallelism_threads"
+    argspec: "args=[\'num_threads\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 951b2df05aca88cef88e256a30dc76f70f18a355..21859d7662e57a09fd7458fe4f63e83b45caa39e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -2,18 +2,6 @@ path: "tensorflow.data.Dataset"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'variant_tensor\'], varargs=None, keywords=None, defaults=None"
@@ -90,6 +78,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index f1573512438b3f40db7653bf94fd4ad282a40acd..512446e93ffef54330eb7819a785fc457a02573b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -4,18 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -92,6 +80,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
index 72fc2c3a9ee5b985723ce2dba9643ba796362dc7..70e3b6792ccc171a633d75df5047309e19cf78bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
   is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "experimental_autotune"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "experimental_deterministic"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 690da98b1ac2097c4241ba3218caa3b476dbf397..0ba3e488dd6c20694fe530dfbb361d00fb8323ee 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -3,18 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'filenames\', \'compression_type\', \'buffer_size\', \'num_parallel_reads\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -91,6 +79,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index fe0bc1a4db5d4a5e78ec7479e414545b522ec2df..72e12d0e7f962ab611ea7502d0e8f779b7428080 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -4,18 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -92,6 +80,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 261129b132189ef504678058f11651dd22bdce8c..5f5e60fae9ad81d4797a66e6ff5c66a077eebea0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -4,18 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'filenames\', \'record_defaults\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \',\', \'True\', \'\', \'None\'], "
@@ -92,6 +80,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
index 3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8..b2fd09e2ff4668981b1d4da2085349913589376f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -7,6 +7,14 @@ tf_class {
     name: "apply_default_optimizations"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "autotune"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "autotune_cpu_budget"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filter_fusion"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 0b34bbc94269280d6cca77bca789fb74f76629be..88ce93511b326a22b6ef7f1f60d946beeccc30bd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -4,18 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -92,6 +80,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 0e61890eee42a8b5b0df7bda0f99d189c4911eb9..381eba742b6309f949a518dafc93ee5920d7afce 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -4,18 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'driver_name\', \'data_source_name\', \'query\', \'output_types\'], varargs=None, keywords=None, defaults=None"
@@ -92,6 +80,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index abc98a74b64ab274ed8b2fc43876b7102f1c7201..695756890eb40b1515add45dc371cb22a55af9bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -88,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "bytes_produced_stats"
+    argspec: "args=[\'tag\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "cardinality"
     argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -108,10 +112,6 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
-  member_method {
-    name: "filter_for_shard"
-    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
index 314aedda909cda8b1d8a209333b85a7792c19bd5..f47cd12d2a8bc65b4565de759f548aee3e203471 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
@@ -88,8 +88,16 @@ tf_module {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "get_log_device_placement"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_numeric_tensor"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "set_log_device_placement"
+    argspec: "args=[\'enabled\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
index 4cb78b08f8d966890fd9173d7ac1459a905d5921..fbac8c087f1a51c8d66b8fe3be1a5a3e90f13eef 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -15,10 +15,6 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
@@ -28,8 +24,8 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "experimental_make_numpy_iterator"
@@ -39,6 +35,10 @@ tf_class {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..184ff96e6325c87f02444d4359e5c2987534b6ce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 11c1479b5bfa4a02ee825509a8a725486b917333..b6be122b7accd3aa2a61d96e8c30473329b464fc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -14,10 +14,6 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +23,8 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "experimental_make_numpy_iterator"
@@ -38,6 +34,10 @@ tf_class {
     name: "experimental_run"
     argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b35b61b4c08868feaf501e1f09b37d02da09cd0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.distribute.cluster_resolver.ClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c2cc522f1cac65611ffc3f09ce1513d186da27a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.GCEClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver.GCEClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'project\', \'zone\', \'instance_group\', \'port\', \'task_type\', \'task_id\', \'rpc_layer\', \'credentials\', \'service\'], varargs=None, keywords=None, defaults=[\'worker\', \'0\', \'grpc\', \'default\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3220d68e05458da3cda4e36c63bc5dc79cde93af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.distribute.cluster_resolver.KubernetesClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver.KubernetesClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'job_to_label_mapping\', \'tf_server_port\', \'rpc_layer\', \'override_client\'], varargs=None, keywords=None, defaults=[\'None\', \'8470\', \'grpc\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d39ddc7e408c8fbc3cbb7db26379357b93f8b459
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.SimpleClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.SimpleClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cluster_spec\', \'master\', \'task_type\', \'task_id\', \'environment\', \'num_accelerators\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b487626520addbd072983f4218b5d0785e6049ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.distribute.cluster_resolver.SlurmClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver.SlurmClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_task_info"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f9a430c0f84c9caba29dee514f1f3a3391d8588
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.TFConfigClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver.TFConfigClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\', \'environment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbc76c24813bad6288b630a792ad3996a7940f46
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu\', \'zone\', \'project\', \'job_name\', \'coordinator_name\', \'coordinator_address\', \'credentials\', \'service\', \'discovery_url\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'worker\', \'None\', \'None\', \'default\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_job_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_master"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ecd77ad192d575c3cc8331a5dc2d0d89816182ac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.UnionResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.UnionClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5906ffa850a360889e26fe0230618ad60cf01231
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.cluster_resolver"
+tf_module {
+  member {
+    name: "ClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GCEClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KubernetesClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SimpleClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SlurmClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFConfigClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnionResolver"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-collective-communication.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-collective-communication.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7eca1c80d8b751feb6f9f16b743944da44e258b9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-collective-communication.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.distribute.experimental.CollectiveCommunication"
+tf_class {
+  is_instance: "<enum \'CollectiveCommunication\'>"
+  member {
+    name: "AUTO"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+  member {
+    name: "NCCL"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+  member {
+    name: "RING"
+    mtype: "<enum \'CollectiveCommunication\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1726e74534a3922a9948ec0e166da4d6cdbef6ce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'communication\'], varargs=None, keywords=None, defaults=[\'CollectiveCommunication.AUTO\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6ee288f28b56b2a5a4aa2a21b04d80a5115609
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..927f1a8f5051633d1ab8a8b9ba6ca4509ebc2ad1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
@@ -0,0 +1,74 @@
+path: "tensorflow.distribute.experimental.TPUStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.tpu_strategy.TPUStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "steps_per_run"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu_cluster_resolver\', \'steps_per_run\', \'device_assignment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_local_results"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "experimental_run_v2"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf8cedb50cbccf3b47d09567abcde7e29d458ace
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "CollectiveCommunication"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
index 31dc6e071613bfe3d2ea24c65835f09bab90c400..7339bee6cd85ba9d474e55b3952468113d28a27c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,10 +20,22 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
@@ -32,9 +52,13 @@ tf_module {
     name: "StrategyExtended"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "get_loss_reduction"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  member {
+    name: "cluster_resolver"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
   member_method {
     name: "get_replica_context"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 01b870a81639807489ec2a09dcc185137aae1665..956e4d93e57069b6936413a3a432d45a22e4ed1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -112,10 +112,6 @@ tf_module {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "as_string"
-    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
-  }
   member_method {
     name: "cast"
     argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
index efe9e74697096b4a7bac912f10c1092470daadec..13d77d9f5b8d23ae0a6d5dd4b56811da05f7a447 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
index a7300bf06bb5bbb01c02b9050f8779910b11919e..7e33a21059619e6f4a8b61ebfd69198b393b039c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index e138ce936ec73c05f8f790fb63c381e56ae2f654..216854587d6476c37e12063eda53acf61c6383cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -23,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\', \'train_in_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fdbecb8856e24c86ae3165d546d8ddd019ba88f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt
@@ -0,0 +1,76 @@
+path: "tensorflow.estimator.BoostedTreesEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'head\', \'model_dir\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_feature_importances"
+    argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "experimental_predict_with_explanations"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index eae0a292a962680a53d8c683ee2d2b97e24937a6..25d021ad8deb9c6a51f4510c2ec215b94ee66cef 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -23,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\', \'train_in_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
index a540085aba48c1d7c877b41831475cb2dacf8ec9..a4e9b564233bc44d37a57f5ea2c3816e14933d67 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
index d1b29d670a0cbd3628569ea1c401a329f336c960..cfb49257c48bfbff0333db91cfc033b9e239d5b4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index f6c3910a9fe5c76bafe03a636a4e91014055ce81..89029a2b79d953dee18ec3f81485e29b52e39927 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index b78527279ca32decc71185a98f9f8270b4cd41a2..d81054233a8cd64e5bd52736ac3583187bc41266 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'input_layer_partitioner\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum\'], "
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 9133f0d3b280dc8d2d5a263e25731594e0be2ef0..a158cd037eb6fc68d669f96a0bf62730f434d34c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
index a58d733302da9e69fe0d46d7d327e1b7868e198e..cb6c88203a5e07e4a2a88281b31c9fe22138137f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
index 47de660a386c3362cf880ba9eed189f2bea047cd..5c0e8f5a894e87b0ed2f3dbcaa24977d43372c18 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
index 66a127606a5be7c356a48ff7eb0751dd7db0eb02..e562bf1c57664164af27b01decea85ab3bf2c3c3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
index 5c094fe1318565443fb0864750fdf532d465cc04..5df93619a7dc7c918f364cf708b1fd17881e6b99 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
index f3dfe7296f77fa295e7f02718a2a8bb4cb3ea199..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.mode_keys.ModeKeysV2\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index b1bd5a2661d44d9b36b965ba160874e6142628ea..de0470be44e52101ccacd0c2052685660b965096 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -18,7 +18,7 @@ tf_module {
   }
   member_method {
     name: "dnn_logit_fn_builder"
-    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "linear_logit_fn_builder"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index add8ef5e65dce5d0fffa82805e465c46eeb3f3ab..a7b72d68569bf231dc68a3ff406ad81c6fdeee89 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "BoostedTreesClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BoostedTreesEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
index a7ee6d3e07d4387c4fc0bd9a5f6d9614473ca73c..0c3f04e468c4c817cd474deb42149aee3021aa43 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.experimental"
 tf_module {
-  member {
-    name: "Module"
-    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
-  }
   member_method {
     name: "function_executor_type"
     argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index 3aadd7dc341ae97fdbfa83cd3fc96fc75249a4c2..4e4fd78b598c91b98a121e3751f1e61d67f14419 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -41,7 +41,27 @@ tf_module {
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
-    name: "shared_embedding_columns"
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "shared_embeddings"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt
index dbc360b13ee7dc8228f5fb4fe0cd6fc21504d0d0..d29736117294960c0715d343a7a06434b10cb33c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.-resize-method.pbtxt
@@ -4,19 +4,35 @@ tf_class {
   is_instance: "<type \'object\'>"
   member {
     name: "AREA"
-    mtype: "<type \'int\'>"
+    mtype: "<type \'str\'>"
   }
   member {
     name: "BICUBIC"
-    mtype: "<type \'int\'>"
+    mtype: "<type \'str\'>"
   }
   member {
     name: "BILINEAR"
-    mtype: "<type \'int\'>"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GAUSSIAN"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LANCZOS3"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LANCZOS5"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "MITCHELLCUBIC"
+    mtype: "<type \'str\'>"
   }
   member {
     name: "NEAREST_NEIGHBOR"
-    mtype: "<type \'int\'>"
+    mtype: "<type \'str\'>"
   }
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 3c6ed1cfb8340b6e8f2599360e3c321c562e37ff..741b5b772cffa50fb8baf2ea940ce415f892204e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "central_crop"
     argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
   member_method {
     name: "convert_image_dtype"
     argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -82,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "extract_glimpse"
-    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'None\'], "
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'uniform\', \'None\'], "
   }
   member_method {
     name: "extract_image_patches"
@@ -174,15 +178,15 @@ tf_module {
   }
   member_method {
     name: "resize"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'images\', \'size\', \'method\', \'preserve_aspect_ratio\', \'antialias\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_image_with_crop_or_pad"
     argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "resize_image_with_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    name: "resize_with_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\', \'antialias\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'False\'], "
   }
   member_method {
     name: "rgb_to_grayscale"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-constant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..502fda18fd827836e7d0366cb3c800ac9c7e4408
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-constant.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.Constant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06beddc818b425e27155fcf3fd74fc34f2254409
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.initializers.GlorotNormal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f37448f63465cb4f421647141722769b7a0e47dd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-glorot-uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.initializers.GlorotUniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc6f16b04fa539461666d84c2fa8e64c1b1ad7d4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-identity.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.Identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03a69732c6c4cf3eb1ab044669a3f9a87fd4956e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-initializer.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.initializers.Initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-ones.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ff715c649320f06e75a709b9d1b07d7ef54f83
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-ones.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.initializers.Ones"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-orthogonal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..303752f934f30ae5af33a2eea4f56dbc4e88d245
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-orthogonal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.Orthogonal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1b1dfb4d230cb2944008ddeeb6d746955cd1e7f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-normal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.RandomNormal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eac13a132462ed6afe692bbb7f3a6d08e45530db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-random-uniform.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.RandomUniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-truncated-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-truncated-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b796faef0cf0e1adf9bc2eb0e7c91cfd60721ec0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-truncated-normal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.TruncatedNormal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.TruncatedNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-variance-scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-variance-scaling.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea46406d3a6d229fc90946f058fb80f07ea4b422
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-variance-scaling.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.VarianceScaling"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-zeros.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2596186705fad6c669b7fe227c9502ae73fef191
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.-zeros.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.initializers.Zeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f26775f4c84125734eb1ca90f65614ae2940d4c0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.constant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0af16f3ae89f597d8fa1d86c6d3faa84b12d6489
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.initializers.glorot_normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8272f2c64fc3c1cf31e89f7e65895c36ec11ef8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.initializers.glorot_uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c11593fe3125a00d46ca05cae08c0d6b33e9694d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..436465c10a66c376cfda6ae0db407e8f4ad001af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.initializers.ones"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6603f66736ee1d57fcc9b09412586774a7ac4987
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.initializers.orthogonal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b10b8e6ce9ba229e7ec550c14a8bf37d614327e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
@@ -0,0 +1,107 @@
+path: "tensorflow.initializers"
+tf_module {
+  member {
+    name: "Constant"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlorotNormal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlorotUniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Identity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Ones"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Orthogonal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomNormal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomUniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruncatedNormal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "VarianceScaling"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Zeros"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "constant"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_uniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "identity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ones"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "orthogonal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "zeros"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "he_normal"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "he_uniform"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lecun_normal"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lecun_uniform"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'initializer\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27774af69a2adba86708eb0ea0d34ede7bb2a04f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.initializers.zeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 283cc6a735695b0b2d16af28f7688a7a077f19be..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 95e405aebaf61e3ccae268b474a006a3bca51343..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index d200d3d26d7c1b7d54eda596a8056a66e29be0b6..81844f6f9186f6a7621b468970ccf76901b5763c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -244,6 +252,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -252,6 +264,14 @@ tf_module {
     name: "less_equal"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "local_conv1d"
+    argspec: "args=[\'inputs\', \'kernel\', \'kernel_size\', \'strides\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "local_conv2d"
+    argspec: "args=[\'inputs\', \'kernel\', \'kernel_size\', \'strides\', \'output_shape\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "log"
     argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 2e0f77eda85780cec26b103ba11276ccdfd90189..6a00e0a25d432067b08bfdb2cb77f73c590ac949 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\', \'profile_batch\'], varargs=None, keywords=kwargs, defaults=[\'logs\', \'0\', \'True\', \'False\', \'epoch\', \'2\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58bede556dfd4d8988d92e99e402d9b3b3bf5adb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecayRestarts"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecayRestarts\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2549a2ac627421ecc80df2d6235c1a22ab5e3ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f083120b52ce483f46cc92390b53180bc3bd65ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.LinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ea3c6beb1c0f8fffaa442956c0cc134f70a5e84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.NoisyLinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.NoisyLinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1bfd51cdcc3e783fcd24a80e189d3d73bf3a928d..c3127642e2583700c2451f54b80487a4cb943a55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -2,8 +2,9 @@ path: "tensorflow.keras.experimental.PeepholeLSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -141,6 +142,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -173,6 +178,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -181,6 +190,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
index 5cd6851278dce8ef45c90112176be94b9c45dc91..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
@@ -1,15 +1,35 @@
 path: "tensorflow.keras.experimental"
 tf_module {
+  member {
+    name: "CosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineDecayRestarts"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NoisyLinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
   member_method {
-    name: "export"
-    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "load_from_saved_model"
-    argspec: "args=[\'saved_model_path\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b03cbb8eb804ad80ce5c2d6e43fe07d4ac9db9cd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.constant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02f8c252bdafc6ae5e0db1162ba2185c04981b63
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.glorot_normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d18a3b6e7e063b5b3e172228a93b934a41736bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.glorot_uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb6ddf5f0c6256c7f1160996a08565330aa6c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc2dd171dfceba916fed1a02bbfb19f26497adca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.keras.initializers.ones"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..855065c1634abe2c794fec705a5dbc004fd3e597
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.orthogonal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
index 7412cd130588a6a95538607b17b93be26492111a..15a56fbb973ccc729d7b377ef6a20b426687690c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
@@ -48,6 +48,34 @@ tf_module {
     name: "Zeros"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "constant"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "glorot_uniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "identity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ones"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "orthogonal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "zeros"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9b3359d7a9b7deda1c80ffce17b061d87f31235
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.keras.initializers.zeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..526d65fbf6c91d5d02ffc90dd0333cab07a50b84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.layers.AbstractRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.AbstractRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 8a0b8eb46f006497472c1e9ce539e91db19bd260..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index abb3c236948a7f46d64cad92ae922324446f9a99..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index b27db4e7f23499fd27430059f1cb556f341547b3..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 50998ac9d63c9492523720d7dcc8041fd9efcab5..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index be17aeafb5ae383cba58b854808f6c9bc0e9696d..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7f21b444bc8832189b11cd8ff206e034bc89170c..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 2ac86f152fad454fc0b09e2cb8814f23ad997c20..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index f6b1dd2f7e4244218b7c64868b773142c79695d6..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 3da1f43a92a3fb5a146bcf8fd16f26783487f129..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a7be5ac81814b28c93407cd5d1ca7c3f60822f0b..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index c5c29bead383da6b9c0c7436fb089e27413e72f3..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index b13f963a6fcaa8b4c2da541654564d620c710c20..5613c23641a83be43bf758506b0d4c62a23d3488 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization_v2.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationBase\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 880d18e1aae53512b2f587b5c8914babcd68566f..43af4aa1ec1ad93fde018c6244bef3a4c1bd6549 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -69,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -115,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index 1eb0cf1a188b88d55b82297da715624c9e5a58f2..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index d9394e60f532465c1852b2cac46ca4cbd9125583..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -196,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index a0f6dc8097adfb896a8f3aa3b642c2997e257cf3..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 037b92f861b14720a1a638884752a4d3e1dbd9f9..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 6a0d027d47d999f5770e59299fc1206249bf9b43..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 66b5bd75fc16c37aecaa65ef12fb2311925c252a..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index e73133ff0731821407084cc1cd6160b2e9bf3d9c..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 7af6b2b3c398473398a9d2e227a42ec96451b301..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index baff492dfbd3c9ab6f2c269cb89632768e6b6c92..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index 63d30a61851cdae8daa8a5dc70fa733fd6b2ff11..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 7a29cbbec35b885792828828354ac8f9a29579b6..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 87c75c02243cd646502e12e2947555ad7c6913e9..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index f69104ddfef17c8b5df36f4bc3e9b0ea3a986295..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index aa05471933cc97a872480e0ac45213b49a882189..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index d61f1ddc1d506ae2db992aaacbdc634964d53292..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index 28c1926bd7926f0f0e8331534dc8f7611c1740a2..d6f7f3033ecf8e226b961dabfe59e751639e5b98 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.DenseFeatures"
 tf_class {
   is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 06e8b6b314183b884e635f3b78e5bd5368e0962f..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 9fdf6f66d1160a49da302ffa8eeeade3009de048..65d018fa55d9b2d798abf72f2d918fac5dab7623 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -120,7 +120,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index cbe102065071a00596fb4b8f764b410737c638a6..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index 0efba09b272c8ffb2220ccfaad830c7fff98568c..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index b34c499eb2e603aa8e2a6c9c84ec752a41efd0de..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index 51dd853127f549c8ff370391f11cf7b8021af469..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index dcd18a9cedd53565fdf38d9787335e0afea9ad3d..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f029907ee86943fb8c04eada819e9cbfd6d01009..9a660083ee0d875ed7bf2dccfd7e4cd0d5ae2a91 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index ac2d8c9aa3b74e5754c2a8014b4c093a610c5198..c9b250b9bf4356015218ef08d31cc1aa1bdbce1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,10 +1,11 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedGRU\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent_v2.GRU\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -180,7 +181,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -214,6 +215,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -246,6 +251,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -254,6 +263,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 15cbcfe8edffa92ef0514248e9dbc523dc6a49bd..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 865b898c4cc54253d85442f2db2f3f624ecfb817..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 3e17aca17cc4e636ea3f6235f04cd4b7f468ae28..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index b160687a2a610714f5dd6c0cc7c7c92408d386df..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 70e8d51a5a782d5f473e1350d16d942998e58fbb..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 809dc8554b38af9486035f1f3b03aa58392812de..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 3fbce8cb714355c0898dcbdc6797394410e90253..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 70e4103ea1abd5bda90811d127230105ae7bb941..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 000bf54c4523307d791db76d73e1cbe71cb46e4d..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 8ffbf07f9bc32cc9a3a83e2f57f5fe4d78fffd3c..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 3803d2b0a8765f4832df34fc4876256d4dd2ea86..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 28668224e01e04bbc4c14259d84a11ec72c826a8..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index b83ed67723afd5544ec19c599437a57909d780c5..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index e689d69140e36a94c731f4c3b4578919d31343f5..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index bb6eddae7168b576bdaca91b6f7951fe7b65ee1f..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 5fb3f9dd3aa9a3f3761c2bfadbcc19f46fe2303c..66aad25f9af529f40c22ad5bfe94009f869fe396 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 89dfc2a256da20cc65b7d18601dce240f5580a21..4f07fad45932a99ce2587ba69d280b5beed1f22c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,10 +1,11 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedLSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent_v2.LSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -180,7 +181,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -214,6 +215,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -246,6 +251,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -254,6 +263,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 376bec0814880e3fa0091a41cd9a4ba0dcc4ab60..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index c5f91a6338cf5c0b8f017f6a6a87d0016ffbe999..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -105,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bde888735916a018647f681968241a583e0271ef..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
index c4726cf82430c800267f24032f2e02fc65e9499c..9bafe013efed1d59e1a6c43600cbe35593b04f97 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 16945f2c12a7be4eba8a67a9a58587d756888d12..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index f05741ffceb6a855f56731086619dcc621c8d71e..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 7885db4ed291afb8ea627cebe1dbae45723d4b2f..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 9380d26cf4c7b1c93a4c0ce2681e792381c42deb..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 8eb8218df3f3532c6c108e9f43f6ff5a708c7fbc..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 0c96f86ed36d7cf99c396f863de6d9ef8f90adc5..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 0c6b230eb79aac1e719949b3f8331423b621d47b..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index eb7ca52fba97174a1c6869ae003beec8ffc328df..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e724e9088f82f7ff7152cc4393af4a8f582136c6..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index dafbd09ee28ef6a5ec933cddf3c246cf2f4f4480..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index 3122fbec1c7cad161d71fdf9970995adcedfdad7..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 0527cda1f026e1ff9075e827c2902c45fd22db9d..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 814e5a5d545f0d4b1276ef1639eddb72004b4d1b..15c24f8da73c8a00b35d67241ebd131035f8c347 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index aa1731afb82698cb44375407fc717bf32ef634d6..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 9d7dd85fe0eef3733a86b9e918396e882f5812d2..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -110,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index e9bba298bb028851e6e9b9a17ff40a671d9132f2..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 3c783eb5129028b3eb5160c75dde2859541cfd32..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index b8e0882541c51209cca112c54197bbce305bd1b2..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 310f369ed6c7a9931af56016ae09db5d4bca15d5..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index df19d781c21e403b51d451d772cfba66a7383be7..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index bf909509bd4b25507291839fff1ee0eaccee630f..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5d66bc6fb6334d99242009f59ccbe0e7aa2d4e89..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 88e9300de912f3b12712bbc311ac156803ef35c4..33a0c1976b07281141aeaf14ac78f5f3f856ef1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,6 +141,10 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
@@ -172,6 +177,10 @@ tf_class {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_recurrent_dropout_mask_for_cell"
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'count\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
   member_method {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -180,6 +189,14 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_recurrent_dropout_mask"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 9d81c6d4bc3139952e9f41113d05547b215cf571..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -167,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 712eb0c6ec3b706a9e396a532f58916140a2c606..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index dfc4ca27052f919ea3866a489e525ae1202795f1..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 5e4f727f71d8be2496bee1abdb87c7050f1ca02e..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9d893cb30a066c4732ccab9d1520f5047a4d3a01..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a2ed954e4c0ce1d474b8c71b41ce1d585d42d665..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 8a0818e78ac766a624bdcce85591fe13e1d4ceac..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index b5591b48265d3d09459bf9bb114a4a3149984eb9..4c817692194c100204412d10ca11181af8f0f7b1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index 210e4fd4e6f0b2e8ba75d22e83134e2267fbece5..709aac579db73a365d3a318fbc828557a261019a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -65,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -111,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index da2213a84fe2f6bc683630e5f8760acdf3239b19..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index e2c303d506e0f8f99d8ca89f29979a5999382378..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 396e774c8a4a10c4996c56c208fc4f4d432e3135..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index 8b6418d514e61536f314da88f1586cea4f29cfc5..ee06ae5059d1760cb4dea447e0705fecfde8b827 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -64,10 +64,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "trainable"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -110,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index e8fda4c71ada65aecef59eb8012120488b0f17c7..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 50c52d270b684bcea5105e4c9813cc62103403f9..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 84c6b78a2b4405fe0e2a1fbb3bba0635db06a21d..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b5598eed07c9f04feb0d90820381abc12dbb456
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index e84c9a2a8f178f0acf8305a77f6ea06c406b9888..d5b7af5a757b5df7b5223c21c3c9ba23b2533f90 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.layers"
 tf_module {
+  member {
+    name: "AbstractRNNCell"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Activation"
     mtype: "<type \'type\'>"
@@ -396,6 +400,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -412,6 +420,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -428,6 +440,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 2f7da93f6f412ca559aec2f6acde2b80a5c93c86..1242eec68f1414f1c8e67bb95602687f4a58412f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.BinaryCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index b3a7cd80973259bd5cdfe382c656a9478f8933d8..cf3c2de840450de8e9467269ec446172583e8ffd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
index 4ba9e57bed4100437c8b71d8b506cc2c928a9ac9..fa374afb28bc4d7fe226456743c285b4f539ced1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalHinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa14c44fa3628236033e952b69f3a160c49a36fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
index 7b3c62d3bef0b9d200577f34cbe303fc7a094acc..a4c25eefcbbc75afb3765b11e325f6bd830ccba8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.losses.Hinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa8ffa95726f72b620c3908b48fe20dfae1dc17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d950c789eb44fcad792a9d11856ce11143715807
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbbd5317f89f801e8a4f4cc80e700e2b478ebf40
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 712bb2ecd3526c354cbcf640e689526b2e415a13..9da6b59ec83bb5b74336a122a791a0d5ea3eb079 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsoluteError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7fe362da89b47a925cd4708909e1c882a9a23aca..7c3ae9b49a415c1586df01984bd73af38ee97558 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
index a5718533500d9508c558d25d13fc6b61518a73a0..2126ac68d2a4cd8f1b68466e073ec573d13f2cda 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 200006db355ca4dc8eb2f509bcb9da7543145548..6ef9610546a0ec662313534f424d49879187f302 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c90c3140e2b68b9796873b0de73668f1508476
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
index f20ed26e2ea2819554159a9bcecb4141601e4a19..e93be80f1f702eacda20e4eefbec12dad724edaa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c13f9f967db7014548de1283c5d59bbac403299a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
index f8badb2b6ec2ba8dd16136c32f5d27811a0d4d9d..fabe4c7814462b91a12062bac5c2119cfd45bccf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.SquaredHinge"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index e8809450fac7d759a1e1d0e066b86ae7f4820072..a8a4134df5e83d38532e26fa709db09611b2d03e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -13,13 +13,29 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
     name: "Hinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -36,10 +52,18 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SquaredHinge"
     mtype: "<type \'type\'>"
@@ -77,11 +101,7 @@ tf_module {
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "cosine_proximity"
+    name: "cosine_similarity"
     argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
@@ -150,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e00a3a355269a0ccc5d69b3fcea106c4908e115
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index 6756beca1dec885bf23b0c365496d84e5cb3eeb1..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index bec0b20aa51d7c098c333e162c562225d92f38f4..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 71dc294dc39f05ffdc416be7b92337e2dfa69690..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f1794aba61aae085a7580806e524eea8b2a791
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
index 43024e738a500fbbc36077b9e598dbe2445898e2..19442b5028dda68548c19c74e0828abf4fd54534 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678c7b0681fe4281893fba70b4652233a91e2a0c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index 5432f7f4006b165fefb9aa028bf7d36d8cbc38f1..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index 75541bf285d8989f867aabc7c7025e56cce1d05d..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
index f8d47f3771798d40860f29a22bf81319385cfb66..dedc64f1375b66b90f655f280c1a56ba165cfa17 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af8366b60876cb31f840c5f5007e67980be8dc3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e072e21cc94492ed27186f44b92863cd791d62
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
index bf7fc7cfc506e0adc07342531a0f590533468139..75173ad17a9c1fa02451287adad10870a60d653b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
index 59bb767d3501c59b7d2bf052570d5b1f161d2df7..7be81b63bbe01b8534bd64d163e735d735ff88f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21e44ed988494119662e5e1a5101edbe4d7a35fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17fc34566e8ab6c5cc73781b40cb0f7396067
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
index 91f4712312820a6840229f2e6cd763c2a3ce7900..363f532ba410f1ebae5f105769a0e69c2e2d0166 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
index 205e15d439fe8f9493f41fb23ce54248173e9295..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index eec26ffce71425d78fbc4fa29540b58c8b12ee64..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce746ab350bfa0534bf7f9ac7d6e8255c7749894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..570b77408cbaa2b7a0089f9de8a528e604799abe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index 9aeaa5627a9805579d6a6c4e09336a4d7994d1c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index 748cec08668c461fcf80df6a50fd5192f99073b9..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2bdbd54e22756b823716c149cf0f24661acc812
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index 97aeb680be1c5c412a16e2a73e3f1bc2ca8ce6c0..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 571c2bf9d33c14b4a5699fd9dfa3e85ad97f99f2..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c135b8f680061a1e79fedd9d705d0fb54344823b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 85f80b062efe3d2d91104b211c8d9d75127c8c0e..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
index 71c047e9736e1bd86e47b2f43774d2ad0d884821..90bbb087fafcdcde5dee048c45adbc45e3be2e55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30ef19e02cfc99d117e6a396beeaf6422a105013
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e59476a2410f859dff7171162a2cab123d5e853d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index 4bc9383f6ffc90972416fa031d5515a149e70425..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index 2eae4df0ae344656bb637bc27e806876304a86f1..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index cae251cdfe1b0eae57a8c44030a08d3cfb373c5a..71e89765cb8a660e4843362f912d7d011ca4ec14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,16 +12,24 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalHinge"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CosineProximity"
+    name: "CosineSimilarity"
     mtype: "<type \'type\'>"
   }
   member {
@@ -32,6 +44,14 @@ tf_module {
     name: "Hinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
@@ -44,6 +64,14 @@ tf_module {
     name: "MeanAbsolutePercentageError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanSquaredError"
     mtype: "<type \'type\'>"
@@ -52,6 +80,18 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -60,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -68,6 +112,14 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
@@ -76,6 +128,14 @@ tf_module {
     name: "SquaredHinge"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -120,14 +180,6 @@ tf_module {
     name: "categorical_crossentropy"
     argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
-  member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -194,7 +246,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b405198a29c4c7673688f2bf0f410a4a3e7a526
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt
@@ -0,0 +1,75 @@
+path: "tensorflow.keras.mixed_precision.experimental.LossScaleOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.mixed_precision.experimental.loss_scale_optimizer.LossScaleOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "learning_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'opt\', \'loss_scale\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2af65554f75e606755738d5126f7ee2b749d32e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-policy.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.mixed_precision.experimental.Policy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.mixed_precision.experimental.policy.Policy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "default_variable_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_cast_variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83ef24cdf7a5c1bf91da0a08cb735433d7dd1e3c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.mixed_precision.experimental"
+tf_module {
+  member {
+    name: "LossScaleOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Policy"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "global_policy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_policy"
+    argspec: "args=[\'policy\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8648afb5f7d2aba11e6cb3a20b537f12d96dd10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.mixed_precision"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index eb1ab1d9dd61b36ed8662e25700f12f82aadb502..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index c69cf281742360d9ed4d1f7cbd35219cf04b1149..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
index 0a56293e804f583a949ecb413da0ba613e0bc876..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
index 14d0894e5622021c4961228d431d01516b752055..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
index fdb1ea838c04f296c3d0ee7596d73052b6f4b6d9..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index ece63ec168dac58f58286dbd9fd8a8151d0dc2dc..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
index f952f88b6d203488ea0ec4f1794d7de79a25853a..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
index 27bae902b0cb7f1f4e09737a83fadd95a83cc163..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index e523443a0099b57942c73cafcd8a919503e8db38..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index d2721f8e92088c216ab748cae45e415553b9d4c1..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.optimizers.SGD"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
   is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "iterations"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
index 7257b02087e237eaa47ed6a042559aa1332fc87b..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
@@ -32,6 +36,10 @@ tf_module {
     name: "SGD"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ae478cb2c663b8a856bd29146558b808499079
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2fe61f4d2cb8f76fe1c8d6261b5f383b79281f0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b33bd7526bd3f67f54450f97adf3d1d4d717051
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.keras.optimizers.schedules.LearningRateSchedule"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f1496492abfabb04bd47834d434ab8df05af705
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..728436c36111de60c3752e09049ffb5678e4b2d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024e472a734935e668b9d6ee6e9c115cc90bdcd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.keras.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
index ed9967856200d62fd152dfec85c8ec36403bcbc0..3db6920519a989bb6832c81ecbb07aec30166115 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
@@ -56,6 +56,10 @@ tf_module {
     name: "metrics"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "mixed_precision"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "models"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index c7a50969b54e5efc4d338caa79dea76d86bffe8a..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 3900c752c8527f68af2496f99083d80fc9d18106..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index 7b876099af6a28d9fca2e5c55aeae5e4610f82a6..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 5bddba8e798618f5b1d0cdc61ddff9725a495fe0..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 62ba8bb59e8af14447fe570ba28c5d0eba7f6af8..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index 0803feeabd12acb7988459fe6da2748e19b70a5f..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 6def32864b9cc660b94d628ccd53dc48a566ea81..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index dbf1ac82d33b81c63e5c356ac736f63262797ff0..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
index 6a3fe4dd66397bbff4b0b9ca6d195adeb64e3337..a7eb144d83aaeb2997d44b703b46de9a01c3a478 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index 85d902b977ceddd405abb1154a086d7bd29e7848..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 638d82a599248e547bcae86ebd6d8d8dc3f6aa4b..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index ab1b04bd3cb1b215b848019b6c578ce091f8f828..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 961969aac58b78e4edd53b47f2932f71f2d21fd5..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index e76738a9648123414159fdc9666a99b0577aa46e..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index b35cd69da474a9665652f04f12b34a8d9f33fa8a..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index f9119cdd5f728f3b35d83248daff17547a497aa2..e46cb44ba56c1df7b4c004b35c21f80326f963a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -200,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
index 1fe179f6c1b64ebc2f7535719bc1598577ee7f03..68cb07ea6fab85824400cce8408ebcb1dc030f8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
index 66e692a5a379203cb491980802b7003072bfe76c..3ac478f7626556574983aed4e5d284cb758406c8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "AGGREGATE_STACK"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "CHILDREN_INPUTS_MAPPINGS"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_AGGREGATE_ATTR"
     mtype: "<type \'str\'>"
@@ -22,6 +26,10 @@ tf_class {
     name: "FUNCTION_INPUT_INDEX_ATTR"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "FUNCTION_LEVEL_ATTR"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_NAME_ATTR"
     mtype: "<type \'str\'>"
@@ -48,7 +56,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add_input"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
index c955b1a04a4b8af701a57ba2468145590c1a4a16..eca1508564ddbb98fb540be8bc2a5f7bbb3d4d78 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
@@ -1,33 +1,17 @@
 path: "tensorflow.lite.TFLiteConverter"
 tf_class {
-  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverter\'>"
+  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverterV2\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'graph_def\', \'input_tensors\', \'output_tensors\', \'input_arrays_with_shape\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'func\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "convert"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "from_frozen_graph"
-    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_keras_model_file"
-    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_saved_model"
-    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_session"
-    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_arrays"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "from_concrete_function"
+    argspec: "args=[\'cls\', \'func\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-target-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-target-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..966fb69cbed38f1fe8102cc09a2e3a438eb79c28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-target-spec.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.TargetSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TargetSpec\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'supported_ops\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
deleted file mode 100644
index 3ef90b8bc4646a2adfcbeca2258ff5aa7cbf8894..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.lite.TocoConverter"
-tf_class {
-  is_instance: "<class \'tensorflow.lite.python.lite.TocoConverter\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "from_frozen_graph"
-    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_keras_model_file"
-    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_saved_model"
-    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_session"
-    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c84513d088516ee8cc8c2c92e344f42bfc4379f1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
similarity index 86%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
index 9de73076b1197ce7bee8a00dfd7bfcd1b48a35bc..269944ee9df44f38e89bbca32949b7a12490f8f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -1,10 +1,11 @@
-path: "tensorflow.nn.rnn_cell.ResidualWrapper"
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -104,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'residual_fn\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -124,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -132,11 +133,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..354a7086d6046d5f2452799a9e86cb07400c1679
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "convert_op_hints_to_stubs"
+    argspec: "args=[\'session\', \'graph_def\', \'write_callback\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'<function <lambda> instance>\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..4e80f432e73d20f9061448431d7759e09743241d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -12,20 +12,28 @@ tf_module {
     name: "OpsSet"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TocoConverter"
+    name: "TargetSpec"
     mtype: "<type \'type\'>"
   }
   member {
     name: "constants"
     mtype: "<type \'module\'>"
   }
-  member_method {
-    name: "toco_convert"
-    argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-key-value-tensor-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-key-value-tensor-initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..982246da441a65d5eb2819f8f7f23c4804d88a38
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-key-value-tensor-initializer.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup.KeyValueTensorInitializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.KeyValueTensorInitializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TableInitializerBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'key_dtype\', \'value_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\', \'table\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbc59f42de2bc5013f6e027713ebee90ef01a34d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-hash-table.pbtxt
@@ -0,0 +1,45 @@
+path: "tensorflow.lookup.StaticHashTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticHashTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.InitializableLookupTableBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initializer\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "export"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..065698348bdcf3fb038dd8222bcd4e0f67abecee
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-static-vocabulary-table.pbtxt
@@ -0,0 +1,36 @@
+path: "tensorflow.lookup.StaticVocabularyTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.StaticVocabularyTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initializer\', \'num_oov_buckets\', \'lookup_key_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.-text-file-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-text-file-initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff9a0ce6e7de6b8de01c897815bd8fd07fefc2c3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.-text-file-initializer.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup.TextFileInitializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TextFileInitializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.TableInitializerBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filename\', \'key_dtype\', \'key_index\', \'value_dtype\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\', \'table\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2308185afdd6a60a98e1542619e480526f44b2ba
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.-dense-hash-table.pbtxt
@@ -0,0 +1,56 @@
+path: "tensorflow.lookup.experimental.DenseHashTable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.DenseHashTable\'>"
+  is_instance: "<class \'tensorflow.python.ops.lookup_ops.LookupInterface\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.TrackableResource\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "key_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "resource_handle"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'key_dtype\', \'value_dtype\', \'default_value\', \'empty_key\', \'deleted_key\', \'initial_num_buckets\', \'name\', \'checkpoint\'], varargs=None, keywords=None, defaults=[\'None\', \'MutableDenseHashTable\', \'True\'], "
+  }
+  member_method {
+    name: "erase"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "export"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "insert"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "insert_or_assign"
+    argspec: "args=[\'self\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lookup"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'self\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..855a758fd612505b03213d095e122342fd87e1d6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lookup.experimental"
+tf_module {
+  member {
+    name: "DenseHashTable"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lookup.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lookup.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7119d4c281ee0ce3c0a391f2f9a198b257aa537
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lookup.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.lookup"
+tf_module {
+  member {
+    name: "KeyValueTensorInitializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StaticHashTable"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StaticVocabularyTable"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TextFileInitializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d180a9fbad492089f37bc98de50904c8bfa4d38
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3937dfa153a08843c12f1098c05e49e1b1a01c79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a064dd649535336ddfeda4f24b2594771bbbd5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
similarity index 74%
rename from tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
index 4952a76291c00bfdd73eed5412e7421887d1bab2..7829f0f327be676b1fe81775bf3a8a368e88c7db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.losses.CosineProximity"
+path: "tensorflow.losses.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.losses.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..155154c312a8b9bf034d5600bd2d859036934ce1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5052c19a049c0defa2f5b64f3d16626dc95374cc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b739c057b238b6a3dfa14e325deac3c4f4b46fc1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..557cc210450c2553668f914c84218bb762202668
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bcc6f8854368f919cf9b4254caa6fc2e071dc6d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a33db29d421ccaab3bb829e3924cc3307039ad1b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c79a5e8b8a9fc3779f93fa8e8fd29f28df9ec52
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b99e19413ed7a87509b1ff2296046e718c5dcc82
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e01827391a796614bd0b69f36f0ae268ffb31a7b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6603cb407265b59ad17c2e428306e6b025aa9a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff..e4ae87ea29365b13b0c49e3ea2329550b0e420ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e3ce6fc728d594a3bc0c7a0ffe83f078e81fac8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5e3757143871b04a63c258d49f8ef30f52304fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index 36007d3ca6f63ccf06ef613aad584c8d1c63d627..e681f29b99c687c830d3ab0db683511bdf8f0c35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -1,7 +1,179 @@
 path: "tensorflow.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "logcosh"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'loss\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
+  }
+  member_method {
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index a0b8e9e4013d4f69cc933f6f495bdcbbe478641e..c2c5bb50b59815a154c52407c82648cc28d4b31b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,6 +264,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -300,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aa59370a3050d67e35324831d688f23ab444303
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4cfade42ece20e113bfd41744f05a451bbba34c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84198f3cf3d841627aa88d690673b5e8fb1838a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d51c6a798bd20f25b523dc142bc6cb4734b5b6a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67e14faf3f950ed4d52c45111fa8c4a7023f7019
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33cd4c574931e8de692bc69a7dc85d98ca432fcb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c90fab3fdbd901235bd7b5b10259fe2a67e071
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
similarity index 94%
rename from tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
index 1e39385f81957bc0bb9ff2fb6660a16fd3e4c9b4..326df8fcc2aa4beeabe11c4566d5d77b6ed13981 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-proximity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
@@ -1,11 +1,12 @@
-path: "tensorflow.keras.metrics.CosineProximity"
+path: "tensorflow.metrics.CosineSimilarity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.metrics.CosineProximity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -89,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_proximity\', \'None\', \'-1\'], "
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9e32ad53c790a6753a1764c959575ff3eee7631
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45a2c48acba5be6c53ba5666cf9e308f374f6372
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2132fda36f44150e7154323c5d5f1e317173777
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbd0db90fa517982b85f4fe071c16c7cef4f2f70
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b66eda8523d026935d7edc380d4c9a00e41e4a6b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90a6b0664165e37a61bccd7468dab19c7105d3d2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8710a45cccd402e6e298044a53c957bd6797342f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb48837860114f5073206abe447151fe130f5d4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..207f262851adfcd6d751d3f33c3d528f44e8daae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ce4c959abc303ff77bd7b50738bc8dd30750cd7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ad4089c2cb7d1eeeb796903e75731b675abdd5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31dc0cc9d7a88b317faa0ba5df1d67a70e0955d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..794665197bfa453fd73405d1ca8b1986c1758257
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1826983476253122caf753c821a3ce331801856
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202739712fcf5471911158b8944203c36b80546b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ebe442e8626a18729a1930ccad7a531c12553be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74489f6446e533f89bc705a7657b0016abb4ab35
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f408836b66fc3c0ccff64a625a58879b44b7498f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6233b63a7161e52f4f40b306dd114e2aff5a2a7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7dfbf3f62e32b0a1ca6d8d698d0f563484927d53
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cb0007b1bf7ce959c25413aa6097f5026b0267c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7e4344e43907e876020cf6fe58c0a997180a76a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9dbf70f8f66debf98967b29690e087a473e57c6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76ecc8c41c68ddf84c185d12c446f7e4f83529c9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1642e1e62f2cbef1ffe2f3a01962e94c16030ca2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d47520fe113d2858d58bc1325a0fbbf74cb087b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8b55f8a2eec45220435ad1fa0298e4684d00ad1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e7274d7e6171d3766555d8c6910de8599e568b1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae997c6bd4ec0c98631c439a01b085f7cfae8e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
@@ -0,0 +1,263 @@
+path: "tensorflow.metrics"
+tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.5\'], "
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'metric\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
+  }
+  member_method {
+    name: "sparse_top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt
index 80418970132377a5d578e4f11fa4091a19202cf3..e4447792ca0cff3e953a63d6e304b1af4b428e06 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.name_scope.pbtxt
@@ -1,5 +1,6 @@
 path: "tensorflow.name_scope"
 tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.name_scope_v2\'>"
   is_instance: "<class \'tensorflow.python.framework.ops.name_scope\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -8,6 +9,6 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.nest"
+tf_module {
+  member_method {
+    name: "assert_same_structure"
+    argspec: "args=[\'nest1\', \'nest2\', \'check_types\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
+  }
+  member_method {
+    name: "flatten"
+    argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_structure"
+    argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "pack_sequence_as"
+    argspec: "args=[\'structure\', \'flat_sequence\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
similarity index 83%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
index 8ba92fcc8dc89958b8395aa986c358a03fedd66d..079952a71e8b946886ddb575bea33888bcce0199 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt
@@ -1,9 +1,11 @@
-path: "tensorflow.nn.rnn_cell.RNNCell"
+path: "tensorflow.nn.RNNCellDeviceWrapper"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.AbstractRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -17,10 +19,6 @@ tf_class {
     name: "dynamic"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -73,10 +71,6 @@ tf_class {
     name: "output_size"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_size"
     mtype: "<type \'property\'>"
@@ -103,7 +97,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
@@ -123,7 +117,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -131,11 +125,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2effcf402faa5a6b50234bc9bc1910f6c49e822
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt
@@ -0,0 +1,206 @@
+path: "tensorflow.nn.RNNCellDropoutWrapper"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.AbstractRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "wrapped_cell"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..91e4073769c1df3bf5f0ab4023575358f42078ba
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt
@@ -0,0 +1,202 @@
+path: "tensorflow.nn.RNNCellResidualWrapper"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapperBase\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.AbstractRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index c75c75f2ef7ca50cce15fe1dffb4d0de3f6815de..7477f143c8918510f0e0a487a4efd0f63d9ce07e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -1,8 +1,16 @@
 path: "tensorflow.nn"
 tf_module {
   member {
-    name: "rnn_cell"
-    mtype: "<type \'module\'>"
+    name: "RNNCellDeviceWrapper"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RNNCellDropoutWrapper"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RNNCellResidualWrapper"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "swish"
@@ -22,7 +30,15 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
@@ -50,35 +66,31 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d"
-    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv2d"
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
     argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    name: "conv3d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
@@ -134,11 +146,11 @@ tf_module {
   }
   member_method {
     name: "embedding_lookup"
-    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\'], "
+    argspec: "args=[\'params\', \'ids\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "embedding_lookup_sparse"
-    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'combiner\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'combiner\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "erosion2d"
@@ -194,7 +206,15 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "max_pool3d"
@@ -202,7 +222,7 @@ tf_module {
   }
   member_method {
     name: "max_pool_with_argmax"
-    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'output_dtype\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \"<dtype: \'int64\'>\", \'False\', \'None\'], "
   }
   member_method {
     name: "moments"
@@ -250,7 +270,7 @@ tf_module {
   }
   member_method {
     name: "sigmoid_cross_entropy_with_logits"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "softmax"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
deleted file mode 100644
index 1de8a55dccac10ee9af08eb1efc0cb6d22f7163b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-state-tuple.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.nn.rnn_cell.LSTMStateTuple"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMStateTuple\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMStateTuple\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "c"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "h"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
deleted file mode 100644
index e2496dff63efeba26140286ab4ac306676aa8b9f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.nn.rnn_cell"
-tf_module {
-  member {
-    name: "DeviceWrapper"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LSTMStateTuple"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "RNNCell"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ResidualWrapper"
-    mtype: "<type \'type\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b476fafa9a6e26c29d91e28ac2ee66b6e74f637
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adadelta"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be2fedfe81f5e01bf86cdcfaccf19dbd1f367543
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adagrad"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..919c433648ff5950d4ab0c0f2ff2295d33d1085c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67fce4f5c63c0dcd364a124929c9232de1887ae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adamax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee4c9ad25fc678652e33fd5423f228dd86b89816
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06363234ea68f192105295a1eec2b8487c4eb121
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Nadam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..041922bdfd121b5f161f2d1dea443fc32c592743
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.optimizers.Optimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5deef618248d608bf571a9548ea72f6837984a09
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.RMSprop"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..381f72767b807bb495aeaa98c012e7fd97608c22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.SGD"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e03b42ac7f8b8400c6c09061e6d9d09f0ac3d9e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.optimizers"
+tf_module {
+  member {
+    name: "Adadelta"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adagrad"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adamax"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Nadam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Optimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RMSprop"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SGD"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f174d6cfc3ce20d1bf6d36e1cfff4d63d799a3d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..851d21c5cc063c97a49e8dbe2611b2f903d216a4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36db36e4f432e8c0b87e306d3a35d3a0e5bdde0b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.optimizers.schedules.LearningRateSchedule"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6613bedef5f638e4b7f3211827031d04b35cc5ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbd5bcef8f75e78c3bcaa87d82d45b2094bfed80
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1e61b76628a5ef1638e5134a4687402d0e47c8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 92f4704b493caba790862174c17092c51f0eb54f..b6efded31c39ce25cd9f96bffcf4b885e8cb5034 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CriticalSection"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -20,6 +24,10 @@ tf_module {
     name: "IndexedSlices"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Module"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Operation"
     mtype: "<type \'type\'>"
@@ -168,6 +176,10 @@ tf_module {
     name: "image"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "initializers"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "int16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -200,6 +212,10 @@ tf_module {
     name: "lite"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "lookup"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'module\'>"
@@ -208,10 +224,18 @@ tf_module {
     name: "math"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "nest"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "newaxis"
     mtype: "<type \'NoneType\'>"
@@ -224,6 +248,10 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "optimizers"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "qint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -269,12 +297,12 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "resource"
-    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+    name: "raw_ops"
+    mtype: "<type \'module\'>"
   }
   member {
-    name: "rnn"
-    mtype: "<type \'module\'>"
+    name: "resource"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "saved_model"
@@ -312,6 +340,10 @@ tf_module {
     name: "test"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "tpu"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "train"
     mtype: "<type \'module\'>"
@@ -468,6 +500,10 @@ tf_module {
     name: "clip_by_value"
     argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -512,10 +548,6 @@ tf_module {
     name: "device"
     argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "div_no_nan"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -604,6 +636,10 @@ tf_module {
     name: "get_logger"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_static_value"
+    argspec: "args=[\'tensor\', \'partial\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "gradients"
     argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
@@ -652,6 +688,10 @@ tf_module {
     name: "init_scope"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -744,6 +784,10 @@ tf_module {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "numpy_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "one_hot"
     argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
@@ -860,22 +904,6 @@ tf_module {
     name: "scan"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'False\', \'None\'], "
   }
-  member_method {
-    name: "scatter_div"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_max"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_min"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_mul"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "scatter_nd"
     argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -960,10 +988,6 @@ tf_module {
     name: "strided_slice"
     argspec: "args=[\'input_\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'var\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'0\', \'0\', \'0\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "string_split"
-    argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
-  }
   member_method {
     name: "subtract"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -977,15 +1001,15 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tensor_scatter_add"
+    name: "tensor_scatter_nd_add"
     argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tensor_scatter_sub"
+    name: "tensor_scatter_nd_sub"
     argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tensor_scatter_update"
+    name: "tensor_scatter_nd_update"
     argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96203756a28afe6899ec0d4e3631199c4cc5745e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.-generator.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.random.experimental.Generator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.stateful_random_ops.Generator\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "algorithm"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'copy_from\', \'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "normal"
+    argspec: "args=[\'self\', \'shape\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "uniform"
+    argspec: "args=[\'self\', \'shape\', \'minval\', \'maxval\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "uniform_full_int"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'uint64\'>\", \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ccedc63420eb425041fcd5e4183675e781cd3a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.experimental.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.random.experimental"
+tf_module {
+  member {
+    name: "Generator"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "create_rng_state"
+    argspec: "args=[\'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_global_generator"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_global_generator"
+    argspec: "args=[\'seed\', \'algorithm\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_generator"
+    argspec: "args=[\'generator\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
index d49c23e59cf036f05758f5c50208febf4b7381d5..bbb3e4b63fb146a696ca491cb85565e769f619fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.random"
 tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "all_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..489771285c7aa5a407c4914785f8577bb30944c3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
@@ -0,0 +1,4351 @@
+path: "tensorflow.raw_ops"
+tf_module {
+  member_method {
+    name: "Abort"
+    argspec: "args=[\'error_msg\', \'exit_without_error\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Abs"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulateNV2"
+    argspec: "args=[\'inputs\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorNumAccumulated"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorSetGlobalStep"
+    argspec: "args=[\'handle\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Acos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Acosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Add"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AddManySparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "AddN"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AddSparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "AddV2"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustContrast"
+    argspec: "args=[\'images\', \'contrast_factor\', \'min_value\', \'max_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustContrastv2"
+    argspec: "args=[\'images\', \'contrast_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustHue"
+    argspec: "args=[\'images\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AdjustSaturation"
+    argspec: "args=[\'images\', \'scale\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "All"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AllCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "AllToAll"
+    argspec: "args=[\'input\', \'group_assignment\', \'concat_dimension\', \'split_dimension\', \'split_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Angle"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "AnonymousIterator"
+    argspec: "args=[\'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Any"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ApproximateEqual"
+    argspec: "args=[\'x\', \'y\', \'tolerance\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-05\', \'None\'], "
+  }
+  member_method {
+    name: "ArgMax"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ArgMin"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "AsString"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Asin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Asinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "Assign"
+    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "AssignAdd"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AssignAddVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AssignSub"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AssignSubVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AssignVariableOp"
+    argspec: "args=[\'resource\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atan2"
+    argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Atanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "AudioSpectrogram"
+    argspec: "args=[\'input\', \'window_size\', \'stride\', \'magnitude_squared\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "AudioSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "AudioSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPool3DGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "AvgPoolGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "Barrier"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BarrierClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BarrierIncompleteSize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierInsertMany"
+    argspec: "args=[\'handle\', \'keys\', \'values\', \'component_index\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierReadySize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BarrierTakeMany"
+    argspec: "args=[\'handle\', \'num_elements\', \'component_types\', \'allow_small_batch\', \'wait_for_incomplete\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "BatchCholesky"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchCholeskyGrad"
+    argspec: "args=[\'l\', \'grad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchIFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatMul"
+    argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDiag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixDiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "BatchMatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalizationGrad"
+    argspec: "args=[\'t\', \'m\', \'v\', \'gamma\', \'backprop\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchSelfAdjointEig"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchSelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "BatchSvd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "BatchToSpace"
+    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BatchToSpaceND"
+    argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BesselI0e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BesselI1e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Betainc"
+    argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BiasAdd"
+    argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "BiasAddGrad"
+    argspec: "args=[\'out_backprop\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "BiasAddV1"
+    argspec: "args=[\'value\', \'bias\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bincount"
+    argspec: "args=[\'arr\', \'size\', \'weights\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bitcast"
+    argspec: "args=[\'input\', \'type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseAnd"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseOr"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BitwiseXor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesBucketize"
+    argspec: "args=[\'float_values\', \'bucket_boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCalculateBestGainsPerFeature"
+    argspec: "args=[\'node_id_range\', \'stats_summary_list\', \'l1\', \'l2\', \'tree_complexity\', \'min_node_weight\', \'max_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCenterBias"
+    argspec: "args=[\'tree_ensemble_handle\', \'mean_gradients\', \'mean_hessians\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCreateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesCreateQuantileStreamResource"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'epsilon\', \'num_streams\', \'max_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'1099511627776\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesDeserializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesEnsembleResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesExampleDebugOutputs"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesGetEnsembleStates"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesMakeQuantileSummaries"
+    argspec: "args=[\'float_values\', \'example_weights\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesMakeStatsSummary"
+    argspec: "args=[\'node_ids\', \'gradients\', \'hessians\', \'bucketized_features_list\', \'max_splits\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceAddSummaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'summaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceDeserialize"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'bucket_boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceFlush"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_buckets\', \'generate_quantiles\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesSerializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesTrainingPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'cached_tree_ids\', \'cached_node_ids\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BoostedTreesUpdateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'feature_ids\', \'node_ids\', \'gains\', \'thresholds\', \'left_node_contribs\', \'right_node_contribs\', \'max_depth\', \'learning_rate\', \'pruning_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastArgs"
+    argspec: "args=[\'s0\', \'s1\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastGradientArgs"
+    argspec: "args=[\'s0\', \'s1\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "BroadcastTo"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Bucketize"
+    argspec: "args=[\'input\', \'boundaries\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CTCBeamSearchDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "CTCGreedyDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "CTCLoss"
+    argspec: "args=[\'inputs\', \'labels_indices\', \'labels_values\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "CacheDataset"
+    argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Cast"
+    argspec: "args=[\'x\', \'DstT\', \'Truncate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Ceil"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CheckNumerics"
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Cholesky"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CholeskyGrad"
+    argspec: "args=[\'l\', \'grad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ChooseFastestBranchDataset"
+    argspec: "args=[\'input_dataset\', \'ratio_numerator\', \'ratio_denominator\', \'other_arguments\', \'num_elements_per_branch\', \'branches\', \'other_arguments_lengths\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ClipByValue"
+    argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CloseSummaryWriter"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveBcastRecv"
+    argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveBcastSend"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectivePermute"
+    argspec: "args=[\'input\', \'source_target_pairs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CollectiveReduce"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "CombinedNonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "CompareAndBitpack"
+    argspec: "args=[\'input\', \'threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Complex"
+    argspec: "args=[\'real\', \'imag\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'complex64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ComplexAbs"
+    argspec: "args=[\'x\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ComputeAccidentalHits"
+    argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Concat"
+    argspec: "args=[\'concat_dim\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatOffset"
+    argspec: "args=[\'concat_dim\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatV2"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConcatenateDataset"
+    argspec: "args=[\'input_dataset\', \'another_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'MEAN\', \'None\'], "
+  }
+  member_method {
+    name: "ConfigureDistributedTPU"
+    argspec: "args=[\'embedding_config\', \'tpu_embedding_config\', \'is_global_init\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Conj"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConjugateTranspose"
+    argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Const"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ConsumeMutexLock"
+    argspec: "args=[\'mutex_lock\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ControlTrigger"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Conv2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv2DBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'explicit_paddings\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'[]\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropFilterV2"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Conv3DBackpropInputV2"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Cos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Cosh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CountUpTo"
+    argspec: "args=[\'ref\', \'limit\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CreateSummaryDbWriter"
+    argspec: "args=[\'writer\', \'db_uri\', \'experiment_name\', \'run_name\', \'user_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CreateSummaryFileWriter"
+    argspec: "args=[\'writer\', \'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CropAndResize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CropAndResizeGradBoxes"
+    argspec: "args=[\'grads\', \'image\', \'boxes\', \'box_ind\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'None\'], "
+  }
+  member_method {
+    name: "CropAndResizeGradImage"
+    argspec: "args=[\'grads\', \'boxes\', \'box_ind\', \'image_size\', \'T\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'None\'], "
+  }
+  member_method {
+    name: "Cross"
+    argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CrossReplicaSum"
+    argspec: "args=[\'input\', \'group_assignment\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "CudnnRNN"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackprop"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackpropV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNBackpropV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNCanonicalToParams"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNParamsSize"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNParamsToCanonical"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "CudnnRNNV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "Cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "DataFormatDimMap"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], "
+  }
+  member_method {
+    name: "DataFormatVecPermute"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], "
+  }
+  member_method {
+    name: "DatasetToGraph"
+    argspec: "args=[\'input_dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DatasetToSingleElement"
+    argspec: "args=[\'dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DebugGradientIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DebugGradientRefIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeAndCropJpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeBase64"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeBmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeCSV"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'name\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeCompressed"
+    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeGif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeJSONExample"
+    argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DecodeJpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "DecodePng"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "DecodeRaw"
+    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DecodeWav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "DeepCopy"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeleteSessionTensor"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DenseToDenseSetOperation"
+    argspec: "args=[\'set1\', \'set2\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DenseToSparseSetOperation"
+    argspec: "args=[\'set1\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DepthToSpace"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNative"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "Dequantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\'], "
+  }
+  member_method {
+    name: "DeserializeIterator"
+    argspec: "args=[\'resource_handle\', \'serialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeserializeManySparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DeserializeSparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DestroyResourceOp"
+    argspec: "args=[\'resource\', \'ignore_lookup_error\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "DestroyTemporaryVariable"
+    argspec: "args=[\'ref\', \'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Diag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Digamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Dilation2DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Div"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DivNoNan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DrawBoundingBoxes"
+    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DynamicPartition"
+    argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "DynamicStitch"
+    argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EagerPyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EditDistance"
+    argspec: "args=[\'hypothesis_indices\', \'hypothesis_values\', \'hypothesis_shape\', \'truth_indices\', \'truth_values\', \'truth_shape\', \'normalize\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Elu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EluGrad"
+    argspec: "args=[\'gradients\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Empty"
+    argspec: "args=[\'shape\', \'dtype\', \'init\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "EmptyTensorList"
+    argspec: "args=[\'element_shape\', \'max_num_elements\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EncodeBase64"
+    argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "EncodeJpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "EncodePng"
+    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "EncodeWav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingIntegerBatch"
+    argspec: "args=[\'batch\', \'mode_override\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingSparseBatch"
+    argspec: "args=[\'sample_indices\', \'embedding_indices\', \'aggregation_weights\', \'mode_override\', \'device_ordinal\', \'combiners\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "EnqueueTPUEmbeddingSparseTensorBatch"
+    argspec: "args=[\'sample_indices\', \'embedding_indices\', \'aggregation_weights\', \'mode_override\', \'table_ids\', \'device_ordinal\', \'combiners\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "EnsureShape"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Enter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "Equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Erf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Erfc"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Exit"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Exp"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExpandDims"
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalAssertNextDataset"
+    argspec: "args=[\'input_dataset\', \'transformations\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalAutoShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalBytesProducedStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalCSVDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalChooseFastestDataset"
+    argspec: "args=[\'input_datasets\', \'num_experiments\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDatasetCardinality"
+    argspec: "args=[\'input_dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDatasetToTFRecord"
+    argspec: "args=[\'input_dataset\', \'filename\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDenseToSparseBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'row_shape\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalDirectedInterleaveDataset"
+    argspec: "args=[\'selector_input_dataset\', \'data_input_datasets\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalGroupByReducerDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'init_func_other_arguments\', \'reduce_func_other_arguments\', \'finalize_func_other_arguments\', \'key_func\', \'init_func\', \'reduce_func\', \'finalize_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalGroupByWindowDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'reduce_func_other_arguments\', \'window_size_func_other_arguments\', \'key_func\', \'reduce_func\', \'window_size_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIdentityIndexedDataset"
+    argspec: "args=[\'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIgnoreErrorsDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetGet"
+    argspec: "args=[\'materialized\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetMaterialize"
+    argspec: "args=[\'dataset\', \'materialized\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalIteratorGetDevice"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalLMDBDataset"
+    argspec: "args=[\'filenames\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalLatencyStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMatchingFilesDataset"
+    argspec: "args=[\'patterns\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMaterializedIndexDatasetHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalMaxIntraOpParallelismDataset"
+    argspec: "args=[\'input_dataset\', \'max_intra_op_parallelism\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalNonSerializableDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalNumaMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalParallelInterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalParseExampleDataset"
+    argspec: "args=[\'input_dataset\', \'num_parallel_calls\', \'dense_defaults\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'output_types\', \'output_shapes\', \'sloppy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalPrivateThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'num_threads\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalRandomDataset"
+    argspec: "args=[\'seed\', \'seed2\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalRebatchDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalScanDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSetStatsAggregatorDataset"
+    argspec: "args=[\'input_dataset\', \'stats_aggregator\', \'tag\', \'counter_prefix\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSleepDataset"
+    argspec: "args=[\'input_dataset\', \'sleep_microseconds\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSlidingWindowDataset"
+    argspec: "args=[\'input_dataset\', \'window_size\', \'window_shift\', \'window_stride\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalSqlDataset"
+    argspec: "args=[\'driver_name\', \'data_source_name\', \'query\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorSummary"
+    argspec: "args=[\'iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalTakeWhileDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'thread_pool\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalThreadPoolHandle"
+    argspec: "args=[\'num_threads\', \'display_name\', \'max_intra_op_parallelism\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ExperimentalUnbatchDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExperimentalUniqueDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Expm1"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExtractGlimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'uniform\', \'None\'], "
+  }
+  member_method {
+    name: "ExtractImagePatches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ExtractJpegShape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ExtractVolumePatches"
+    argspec: "args=[\'input\', \'ksizes\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Fact"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FakeParam"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgs"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVars"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannel"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "FakeQueue"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Fill"
+    argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FilterByLastComponentDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FilterDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordDataset"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordDatasetV2"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordReader"
+    argspec: "args=[\'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FixedLengthRecordReaderV2"
+    argspec: "args=[\'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "FixedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1\', \'0\', \'1\', \'0\', \'[]\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FlatMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Floor"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FloorDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FloorMod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FlushSummaryWriter"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "For"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'input\', \'body\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FractionalAvgPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalAvgPoolGrad"
+    argspec: "args=[\'orig_input_tensor_shape\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalMaxPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "FractionalMaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNorm"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormGrad"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormGradV2"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedBatchNormV2"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "FusedPadConv2D"
+    argspec: "args=[\'input\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "FusedResizeAndPadConv2D"
+    argspec: "args=[\'input\', \'size\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\', \'resize_align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Gather"
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "GatherNd"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GatherV2"
+    argspec: "args=[\'params\', \'indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GenerateVocabRemapping"
+    argspec: "args=[\'new_vocab_file\', \'old_vocab_file\', \'new_vocab_offset\', \'num_new_vocab\', \'old_vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "GeneratorDataset"
+    argspec: "args=[\'init_func_other_args\', \'next_func_other_args\', \'finalize_func_other_args\', \'init_func\', \'next_func\', \'finalize_func\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionHandle"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionHandleV2"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GetSessionTensor"
+    argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Greater"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GreaterEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "GuaranteeConst"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "HSVToRGB"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "HashTable"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "HashTableV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "HistogramFixedWidth"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "HistogramSummary"
+    argspec: "args=[\'tag\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT2D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IFFT3D"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT2D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IRFFT3D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Identity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IdentityN"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IdentityReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "IdentityReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "If"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Igamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IgammaGradA"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Igammac"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Imag"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ImageSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'max_images\', \'bad_color\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'dtype: DT_UINT8\\ntensor_shape {\\n  dim {\\n    size: 4\\n  }\\n}\\nint_val: 255\\nint_val: 0\\nint_val: 0\\nint_val: 255\\n\', \'None\'], "
+  }
+  member_method {
+    name: "ImmutableConst"
+    argspec: "args=[\'dtype\', \'shape\', \'memory_region_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ImportEvent"
+    argspec: "args=[\'writer\', \'event\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InTopK"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InTopKV2"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedDequeue"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedDequeueTuple"
+    argspec: "args=[\'dtypes\', \'shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueue"
+    argspec: "args=[\'input\', \'shape\', \'layout\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueuePrelinearizedBuffer"
+    argspec: "args=[\'input\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InfeedEnqueueTuple"
+    argspec: "args=[\'inputs\', \'shapes\', \'layouts\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTable"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InitializeTableFromTextFile"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTableFromTextFileV2"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\\t\', \'None\'], "
+  }
+  member_method {
+    name: "InitializeTableV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceAdd"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceSub"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InplaceUpdate"
+    argspec: "args=[\'x\', \'i\', \'v\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'f\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Inv"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InvGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Invert"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "InvertPermutation"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsBoostedTreesEnsembleInitialized"
+    argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsBoostedTreesQuantileStreamResourceInitialized"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsFinite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsInf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsNan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IsVariableInitialized"
+    argspec: "args=[\'ref\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Iterator"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "IteratorFromStringHandleV2"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNext"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNextAsOptional"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorGetNextSync"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorToStringHandle"
+    argspec: "args=[\'resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "IteratorV2"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "L2Loss"
+    argspec: "args=[\'t\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LMDBReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LRN"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'5\', \'1\', \'1\', \'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "LRNGrad"
+    argspec: "args=[\'input_grads\', \'input_image\', \'output_image\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'5\', \'1\', \'1\', \'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "LeakyRelu"
+    argspec: "args=[\'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
+  }
+  member_method {
+    name: "LeakyReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.2\', \'None\'], "
+  }
+  member_method {
+    name: "LearnedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "LeftShift"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Less"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LessEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Lgamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LinSpace"
+    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ListDiff"
+    argspec: "args=[\'x\', \'y\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "LoadAndRemapMatrix"
+    argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingADAMParameters"
+    argspec: "args=[\'parameters\', \'momenta\', \'velocities\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'momenta\', \'velocities\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdadeltaParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'updates\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'updates\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdagradParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'mg\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingFTRLParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'linears\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'linears\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'weights\', \'benefits\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMomentumParameters"
+    argspec: "args=[\'parameters\', \'momenta\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'momenta\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingProximalAdagradParameters"
+    argspec: "args=[\'parameters\', \'accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'accumulators\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingRMSPropParameters"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+    argspec: "args=[\'parameters\', \'ms\', \'mom\', \'gradient_accumulators\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+    argspec: "args=[\'parameters\', \'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Log"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Log1p"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogMatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogSoftmax"
+    argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogUniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "LogicalAnd"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogicalNot"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LogicalOr"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableExport"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableExportV2"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableFind"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableFindV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableImport"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableImportV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableInsert"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableInsertV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableRemoveV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableSize"
+    argspec: "args=[\'table_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LookupTableSizeV2"
+    argspec: "args=[\'table_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LoopCond"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "LowerBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "MakeIterator"
+    argspec: "args=[\'dataset\', \'iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MapClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MapDefun"
+    argspec: "args=[\'arguments\', \'captured_inputs\', \'output_types\', \'output_shapes\', \'f\', \'max_intra_op_parallelism\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "MapIncompleteSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapPeek"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MapUnstageNoKey"
+    argspec: "args=[\'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "MatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatchingFiles"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDeterminant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDiag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixDiagPart"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixExponential"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixLogarithm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "MatrixSquareRoot"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Max"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3DGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPool3DGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolV2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "MaxPoolWithArgmax"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'include_batch_in_index\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Maximum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Mean"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Merge"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MergeSummary"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MergeV2Checkpoints"
+    argspec: "args=[\'checkpoint_prefixes\', \'destination_prefix\', \'delete_old_dirs\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Mfcc"
+    argspec: "args=[\'spectrogram\', \'sample_rate\', \'upper_frequency_limit\', \'lower_frequency_limit\', \'filterbank_channel_count\', \'dct_coefficient_count\', \'name\'], varargs=None, keywords=None, defaults=[\'4000\', \'20\', \'40\', \'13\', \'None\'], "
+  }
+  member_method {
+    name: "Min"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "Minimum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MirrorPad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MirrorPadGrad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Mod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ModelDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\', \'cpu_budget\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "Mul"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIterator"
+    argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorGetNextFromShard"
+    argspec: "args=[\'multi_device_iterator\', \'shard_num\', \'incarnation_id\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorInit"
+    argspec: "args=[\'dataset\', \'multi_device_iterator\', \'max_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MultiDeviceIteratorToStringHandle"
+    argspec: "args=[\'multi_device_iterator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'seed2\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "MutableDenseHashTable"
+    argspec: "args=[\'empty_key\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'131072\', \'0.8\', \'None\'], "
+  }
+  member_method {
+    name: "MutableDenseHashTableV2"
+    argspec: "args=[\'empty_key\', \'deleted_key\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'131072\', \'0.8\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTable"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableOfTensors"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableOfTensorsV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "MutableHashTableV2"
+    argspec: "args=[\'key_dtype\', \'value_dtype\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "MutexLock"
+    argspec: "args=[\'mutex\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "MutexV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "NcclAllReduce"
+    argspec: "args=[\'input\', \'reduction\', \'num_devices\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NcclBroadcast"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NcclReduce"
+    argspec: "args=[\'input\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Neg"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NextAfter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NextIteration"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NoOp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonDeterministicInts"
+    argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV2"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV3"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionV4"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "NonMaxSuppressionWithOverlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NotEqual"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "NthElement"
+    argspec: "args=[\'input\', \'n\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "OneHot"
+    argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OneShotIterator"
+    argspec: "args=[\'dataset_factory\', \'output_types\', \'output_shapes\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OnesLike"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptimizeDataset"
+    argspec: "args=[\'input_dataset\', \'optimizations\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalFromValue"
+    argspec: "args=[\'components\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalGetValue"
+    argspec: "args=[\'optional\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalHasValue"
+    argspec: "args=[\'optional\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OptionalNone"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OrderedMapClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapIncompleteSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapPeek"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OrderedMapUnstageNoKey"
+    argspec: "args=[\'indices\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedDequeue"
+    argspec: "args=[\'dtype\', \'shape\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedDequeueTuple"
+    argspec: "args=[\'dtypes\', \'shapes\', \'device_ordinal\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "OutfeedEnqueue"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "OutfeedEnqueueTuple"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Pack"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "Pad"
+    argspec: "args=[\'input\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PadV2"
+    argspec: "args=[\'input\', \'paddings\', \'constant_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddedBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddedBatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PaddingFIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "PaddingFIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ParallelConcat"
+    argspec: "args=[\'values\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParallelDynamicStitch"
+    argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParallelInterleaveDatasetV2"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'sloppy\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ParallelMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'sloppy\', \'preserve_cardinality\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ParameterizedTruncatedNormal"
+    argspec: "args=[\'shape\', \'means\', \'stdevs\', \'minvals\', \'maxvals\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "ParseExample"
+    argspec: "args=[\'serialized\', \'names\', \'sparse_keys\', \'dense_keys\', \'dense_defaults\', \'sparse_types\', \'dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParseSequenceExample"
+    argspec: "args=[\'serialized\', \'debug_name\', \'context_dense_defaults\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'Ncontext_sparse\', \'Ncontext_dense\', \'Nfeature_list_sparse\', \'Nfeature_list_dense\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'[]\', \'[]\', \'[]\', \'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "ParseSingleExample"
+    argspec: "args=[\'serialized\', \'dense_defaults\', \'num_sparse\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ParseSingleSequenceExample"
+    argspec: "args=[\'serialized\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'context_dense_defaults\', \'debug_name\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'[]\', \'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "ParseTensor"
+    argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "PlaceholderV2"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PlaceholderWithDefault"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Polygamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PopulationCount"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Pow"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PrefetchDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Prelinearize"
+    argspec: "args=[\'input\', \'shape\', \'layout\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "PrelinearizeTuple"
+    argspec: "args=[\'inputs\', \'shapes\', \'layouts\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "PreventGradient"
+    argspec: "args=[\'input\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "Print"
+    argspec: "args=[\'input\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'-1\', \'3\', \'None\'], "
+  }
+  member_method {
+    name: "PrintV2"
+    argspec: "args=[\'input\', \'output_stream\', \'name\'], varargs=None, keywords=None, defaults=[\'stderr\', \'None\'], "
+  }
+  member_method {
+    name: "PriorityQueue"
+    argspec: "args=[\'shapes\', \'component_types\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "PriorityQueueV2"
+    argspec: "args=[\'shapes\', \'component_types\', \'capacity\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Prod"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "PyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "PyFuncStateless"
+    argspec: "args=[\'input\', \'token\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Qr"
+    argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantize"
+    argspec: "args=[\'input\', \'signed_input\', \'num_bits\', \'range_given\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV2"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV3"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'num_bits\', \'signed_input\', \'range_given\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizeDownAndShrinkRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizeV2"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedAdd"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedAvgPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedBatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'t_min\', \'t_max\', \'m\', \'m_min\', \'m_max\', \'v\', \'v_min\', \'v_max\', \'beta\', \'beta_min\', \'beta_max\', \'gamma\', \'gamma_min\', \'gamma_max\', \'out_type\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedBiasAdd"
+    argspec: "args=[\'input\', \'bias\', \'min_input\', \'max_input\', \'min_bias\', \'max_bias\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedConcat"
+    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2D"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBias"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'padding_list\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'[1, 1, 1, 1]\', \'[]\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedInstanceNorm"
+    argspec: "args=[\'x\', \'x_min\', \'x_max\', \'output_range_given\', \'given_y_min\', \'given_y_max\', \'variance_epsilon\', \'min_separation\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'0\', \'1e-05\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "QuantizedMatMul"
+    argspec: "args=[\'a\', \'b\', \'min_a\', \'max_a\', \'min_b\', \'max_b\', \'Toutput\', \'transpose_a\', \'transpose_b\', \'Tactivation\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'False\', \'False\', \"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedMaxPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedMul"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedRelu"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedRelu6"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedReluX"
+    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "QuantizedReshape"
+    argspec: "args=[\'tensor\', \'shape\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QuantizedResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'min\', \'max\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueCloseV2"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeue"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueMany"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueManyV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueUpTo"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueUpToV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueDequeueV2"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueue"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueMany"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueManyV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueEnqueueV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "QueueIsClosed"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueIsClosedV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueSize"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "QueueSizeV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT2D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RFFT3D"
+    argspec: "args=[\'input\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RGBToHSV"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedGather"
+    argspec: "args=[\'params_nested_splits\', \'params_dense_values\', \'indices\', \'OUTPUT_RAGGED_RANK\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedRange"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RaggedTensorToSparse"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RandomCrop"
+    argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomGamma"
+    argspec: "args=[\'shape\', \'alpha\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomGammaGrad"
+    argspec: "args=[\'alpha\', \'sample\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RandomPoisson"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomPoissonV2"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffle"
+    argspec: "args=[\'value\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffleQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RandomShuffleQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'-1\', \'0\', \'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RandomStandardNormal"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomUniform"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "RandomUniformInt"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Range"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RangeDataset"
+    argspec: "args=[\'start\', \'stop\', \'step\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rank"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReadFile"
+    argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReadVariableOp"
+    argspec: "args=[\'resource\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumRecordsProduced"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumRecordsProducedV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompleted"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompletedV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRead"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadUpTo"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadUpToV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReadV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderReset"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderResetV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRestoreState"
+    argspec: "args=[\'reader_handle\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderRestoreStateV2"
+    argspec: "args=[\'reader_handle\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderSerializeState"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReaderSerializeStateV2"
+    argspec: "args=[\'reader_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Real"
+    argspec: "args=[\'input\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "RealDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Reciprocal"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReciprocalGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RecordInput"
+    argspec: "args=[\'file_pattern\', \'file_random_seed\', \'file_shuffle_shift_ratio\', \'file_buffer_size\', \'file_parallelism\', \'batch_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'301\', \'0\', \'10000\', \'16\', \'32\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RecvTPUEmbeddingActivations"
+    argspec: "args=[\'num_outputs\', \'config\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReduceDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ReduceJoin"
+    argspec: "args=[\'inputs\', \'reduction_indices\', \'keep_dims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RefEnter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "RefExit"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefIdentity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefMerge"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefNextIteration"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefSelect"
+    argspec: "args=[\'index\', \'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RefSwitch"
+    argspec: "args=[\'data\', \'pred\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Relu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Relu6"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Relu6Grad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RemoteCall"
+    argspec: "args=[\'target\', \'args\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RepeatDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizationRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Requantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Reshape"
+    argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResizeArea"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBicubicGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeBilinearGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeNearestNeighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResizeNearestNeighborGrad"
+    argspec: "args=[\'grads\', \'size\', \'align_corners\', \'half_pixel_centers\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAdamWithAmsgrad"
+    argspec: "args=[\'var\', \'m\', \'v\', \'vhat\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceCountUpTo"
+    argspec: "args=[\'resource\', \'limit\', \'T\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceGather"
+    argspec: "args=[\'resource\', \'indices\', \'dtype\', \'batch_dims\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterAdd"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterDiv"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMax"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMin"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterMul"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterSub"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceScatterUpdate"
+    argspec: "args=[\'resource\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceSparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ResourceStridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Restore"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'dt\', \'preferred_shard\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "RestoreSlice"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'shape_and_slice\', \'dt\', \'preferred_shard\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "RestoreV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'dtypes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingADAMParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdadeltaParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdagradParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingFTRLParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMomentumParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingRMSPropParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+    argspec: "args=[\'num_shards\', \'shard_id\', \'table_id\', \'table_name\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Reverse"
+    argspec: "args=[\'tensor\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ReverseSequence"
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_dim\', \'batch_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "ReverseV2"
+    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RightShift"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rint"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Round"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Rsqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "RsqrtGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SampleDistortedBoundingBox"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0.1\', \'[0.75, 1.33]\', \'[0.05, 1]\', \'100\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SampleDistortedBoundingBoxV2"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'min_object_covered\', \'seed\', \'seed2\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'[0.75, 1.33]\', \'[0.05, 1]\', \'100\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Save"
+    argspec: "args=[\'filename\', \'tensor_names\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SaveSlices"
+    argspec: "args=[\'filename\', \'tensor_names\', \'shapes_and_slices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SaveV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'tensors\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScalarSummary"
+    argspec: "args=[\'tags\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScaleAndTranslate"
+    argspec: "args=[\'images\', \'size\', \'scale\', \'translation\', \'kernel_type\', \'antialias\', \'name\'], varargs=None, keywords=None, defaults=[\'lanczos3\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScaleAndTranslateGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'scale\', \'translation\', \'kernel_type\', \'antialias\', \'name\'], varargs=None, keywords=None, defaults=[\'lanczos3\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterDiv"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMax"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMin"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterMul"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNd"
+    argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNdNonAliasingAdd"
+    argspec: "args=[\'input\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "ScatterUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaFprint"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SdcaOptimizer"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaOptimizerV2"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptive\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SdcaShrinkL1"
+    argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMean"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Select"
+    argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SelfAdjointEig"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Selu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SeluGrad"
+    argspec: "args=[\'gradients\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SendTPUEmbeddingGradients"
+    argspec: "args=[\'inputs\', \'learning_rates\', \'config\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SerializeIterator"
+    argspec: "args=[\'resource_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SerializeManySparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SerializeSparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SerializeTensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SetSize"
+    argspec: "args=[\'set_indices\', \'set_values\', \'set_shape\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Shape"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ShapeN"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShardedFilename"
+    argspec: "args=[\'basename\', \'shard\', \'num_shards\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShardedFilespec"
+    argspec: "args=[\'basename\', \'num_shards\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShuffleAndRepeatDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShuffleDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'output_types\', \'output_shapes\', \'reshuffle_each_iteration\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "ShutdownDistributedTPU"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SigmoidGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sign"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sinh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Size"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "SkipDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Slice"
+    argspec: "args=[\'input\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Snapshot"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softmax"
+    argspec: "args=[\'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softplus"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftplusGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Softsign"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SoftsignGrad"
+    argspec: "args=[\'gradients\', \'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToBatch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToBatchND"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SpaceToDepth"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "SparseAccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient_indices\', \'gradient_values\', \'gradient_shape\', \'has_known_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'thresh\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseAddGrad"
+    argspec: "args=[\'backprop_val_grad\', \'a_indices\', \'b_indices\', \'sum_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseConcat"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'concat_dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'MEAN\', \'None\'], "
+  }
+  member_method {
+    name: "SparseCross"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'dense_inputs\', \'hashed_output\', \'num_buckets\', \'hash_key\', \'out_type\', \'internal_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseAdd"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseDiv"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseDenseCwiseMul"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseFillEmptyRows"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseFillEmptyRowsGrad"
+    argspec: "args=[\'reverse_index_map\', \'grad_values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseMatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceMax"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceMaxSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceSum"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReduceSumSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseReorder"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseReshape"
+    argspec: "args=[\'input_indices\', \'input_shape\', \'new_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMeanGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentMeanWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtN"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtNGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSqrtNWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSegmentSumWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSlice"
+    argspec: "args=[\'indices\', \'values\', \'shape\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSliceGrad"
+    argspec: "args=[\'backprop_val_grad\', \'input_indices\', \'input_start\', \'output_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSoftmax"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSparseMaximum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSparseMinimum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseSplit"
+    argspec: "args=[\'split_dim\', \'indices\', \'values\', \'shape\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseTensorDenseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseTensorDenseMatMul"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "SparseTensorSliceDataset"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SparseToDense"
+    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "SparseToSparseSetOperation"
+    argspec: "args=[\'set1_indices\', \'set1_values\', \'set1_shape\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Split"
+    argspec: "args=[\'axis\', \'value\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SplitV"
+    argspec: "args=[\'value\', \'size_splits\', \'axis\', \'num_split\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Sqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SqrtGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Square"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SquaredDifference"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Squeeze"
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'None\'], "
+  }
+  member_method {
+    name: "Stack"
+    argspec: "args=[\'elem_type\', \'stack_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "StackClose"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackCloseV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPop"
+    argspec: "args=[\'handle\', \'elem_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPopV2"
+    argspec: "args=[\'handle\', \'elem_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StackPush"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "StackPushV2"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "StackV2"
+    argspec: "args=[\'max_size\', \'elem_type\', \'stack_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "Stage"
+    argspec: "args=[\'values\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StageClear"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StagePeek"
+    argspec: "args=[\'index\', \'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StageSize"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StatefulPartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "StatefulStandardNormal"
+    argspec: "args=[\'resource\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulStandardNormalV2"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulUniformFullInt"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'uint64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatefulUniformInt"
+    argspec: "args=[\'resource\', \'algorithm\', \'shape\', \'minval\', \'maxval\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessIf"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessMultinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessRandomUniformInt"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StatelessTruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "StatelessWhile"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StaticRegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StaticRegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "StopGradient"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StridedSlice"
+    argspec: "args=[\'input\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StridedSliceGrad"
+    argspec: "args=[\'shape\', \'begin\', \'end\', \'strides\', \'dy\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "StringFormat"
+    argspec: "args=[\'inputs\', \'template\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'%s\', \'%s\', \'3\', \'None\'], "
+  }
+  member_method {
+    name: "StringJoin"
+    argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "StringLength"
+    argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
+  }
+  member_method {
+    name: "StringSplit"
+    argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "StringSplitV2"
+    argspec: "args=[\'input\', \'sep\', \'maxsplit\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "StringStrip"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucket"
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucketFast"
+    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToHashBucketStrong"
+    argspec: "args=[\'input\', \'num_buckets\', \'key\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StringToNumber"
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Sub"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Substr"
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
+  }
+  member_method {
+    name: "Sum"
+    argspec: "args=[\'input\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "SummaryWriter"
+    argspec: "args=[\'shared_name\', \'container\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Svd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "Switch"
+    argspec: "args=[\'data\', \'pred\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "SymbolicGradient"
+    argspec: "args=[\'input\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TFRecordDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TFRecordReader"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TFRecordReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TPUCompilationResult"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUEmbeddingActivations"
+    argspec: "args=[\'embedding_variable\', \'sliced_activations\', \'table_id\', \'lookup_id\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUOrdinalSelector"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUPartitionedCall"
+    argspec: "args=[\'args\', \'device_ordinal\', \'Tout\', \'f\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUReplicate"
+    argspec: "args=[\'inputs\', \'broadcast_inputs\', \'variables\', \'guaranteed_constants\', \'computation\', \'num_replicas\', \'output_types\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], "
+  }
+  member_method {
+    name: "TPUReplicateMetadata"
+    argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], "
+  }
+  member_method {
+    name: "TPUReplicatedInput"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TPUReplicatedOutput"
+    argspec: "args=[\'input\', \'num_replicas\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TakeDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TakeManySparseFromTensorsMap"
+    argspec: "args=[\'sparse_handles\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Tan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TanhGrad"
+    argspec: "args=[\'y\', \'dy\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TemporaryVariable"
+    argspec: "args=[\'shape\', \'dtype\', \'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArray"
+    argspec: "args=[\'size\', \'dtype\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayClose"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayCloseV2"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayCloseV3"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcat"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcatV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayConcatV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGather"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGatherV2"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGatherV3"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGrad"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayGradWithShape"
+    argspec: "args=[\'handle\', \'flow_in\', \'shape_to_prepend\', \'source\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayPack"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayRead"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayReadV2"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayReadV3"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatter"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatterV2"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayScatterV3"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySize"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySizeV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySizeV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplit"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplitV2"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArraySplitV3"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayUnpack"
+    argspec: "args=[\'handle\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayV2"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'True\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayV3"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'identical_element_shapes\', \'tensor_array_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'True\', \'False\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWrite"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWriteV2"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorArrayWriteV3"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorDataset"
+    argspec: "args=[\'components\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListConcat"
+    argspec: "args=[\'input_handle\', \'element_dtype\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "TensorListConcatLists"
+    argspec: "args=[\'input_a\', \'input_b\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListConcatV2"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'leading_dims\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListElementShape"
+    argspec: "args=[\'input_handle\', \'shape_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListFromTensor"
+    argspec: "args=[\'tensor\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListGather"
+    argspec: "args=[\'input_handle\', \'indices\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListGetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListLength"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPopBack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPushBack"
+    argspec: "args=[\'input_handle\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListPushBackBatch"
+    argspec: "args=[\'input_handles\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListReserve"
+    argspec: "args=[\'element_shape\', \'num_elements\', \'element_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListResize"
+    argspec: "args=[\'input_handle\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatter"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListScatterV2"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListSetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'item\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListSplit"
+    argspec: "args=[\'tensor\', \'element_shape\', \'lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorListStack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'num_elements\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "TensorScatterAdd"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorScatterSub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorScatterUpdate"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorSliceDataset"
+    argspec: "args=[\'components\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TensorSummary"
+    argspec: "args=[\'tensor\', \'description\', \'labels\', \'display_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'[]\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TensorSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'serialized_summary_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TextLineDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TextLineReader"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "TextLineReaderV2"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "ThreadUnsafeUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Tile"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TileGrad"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Timestamp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TopK"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "TopKV2"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "Transpose"
+    argspec: "args=[\'x\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncateDiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncateMod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "TruncatedNormal"
+    argspec: "args=[\'shape\', \'dtype\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeDecode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeDecodeWithOffsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeEncode"
+    argspec: "args=[\'input_values\', \'input_splits\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "UnicodeScript"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnicodeTranscode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "UniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "Unique"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueWithCounts"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "UniqueWithCountsV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "Unpack"
+    argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "UnravelIndex"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UnsortedSegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Unstage"
+    argspec: "args=[\'dtypes\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "UnwrapDatasetVariant"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "UpperBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "VarHandleOp"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "VarIsInitializedOp"
+    argspec: "args=[\'resource\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Variable"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "VariableShape"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "VariableV2"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "Where"
+    argspec: "args=[\'condition\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "While"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'output_shapes\', \'parallel_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'10\', \'None\'], "
+  }
+  member_method {
+    name: "WholeFileReader"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "WholeFileReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "WindowDataset"
+    argspec: "args=[\'input_dataset\', \'size\', \'shift\', \'stride\', \'drop_remainder\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WorkerHeartbeat"
+    argspec: "args=[\'request\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WrapDatasetVariant"
+    argspec: "args=[\'input_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteAudioSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'sample_rate\', \'max_outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "WriteFile"
+    argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteGraphSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteHistogramSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteImageSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'bad_color\', \'max_images\', \'name\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "WriteScalarSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WriteSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'tag\', \'summary_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Xdivy"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Xlogy"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ZerosLike"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "Zeta"
+    argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ZipDataset"
+    argspec: "args=[\'input_datasets\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt
deleted file mode 100644
index 42b13533dd8c94335294e7fe3d9223d791877f50..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt
+++ /dev/null
@@ -1,7 +0,0 @@
-path: "tensorflow.rnn"
-tf_module {
-  member {
-    name: "DropoutWrapper"
-    mtype: "<type \'type\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index 63bebb20bcae08c645d9aaaecab2ea2de4cc49aa..17275d4d8d7f4cebdca43ed0f2fed6a9841a7ea2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -77,7 +77,7 @@ tf_module {
     mtype: "<type \'str\'>"
   }
   member {
-    name: "TRANING"
+    name: "TRAINING"
     mtype: "<type \'str\'>"
   }
   member {
@@ -88,29 +88,13 @@ tf_module {
     name: "VARIABLES_FILENAME"
     mtype: "<type \'str\'>"
   }
-  member_method {
-    name: "build_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "classification_signature_def"
-    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "contains_saved_model"
     argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "regression_signature_def"
-    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+    name: "load"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 962cf9a7239343e3b570d3a6d20edeeeb871b120..e2da65eee41905c7b7c67eade11e1775a2408ca0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
index 6715c14e168d6a30ce8aa35470525521069de40a..8f45ecbf776a8e0e0839692d4020635d0f676482 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'resource\', \'init_op_fn\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "as_default"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..016d7537440267e0dd5819b5279b0e50143221b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.summary.experimental"
+tf_module {
+  member_method {
+    name: "get_step"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_step"
+    argspec: "args=[\'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index c59f1b8474302b5529895b8aa9216a2e197d958f..a4d728c69764cfb2f9e5b72365a2a0a390a39ba6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -4,24 +4,68 @@ tf_module {
     name: "SummaryWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "audio"
+    argspec: "args=[\'name\', \'data\', \'sample_rate\', \'step\', \'max_outputs\', \'encoding\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'3\', \'None\', \'None\'], "
+  }
   member_method {
     name: "create_file_writer"
     argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "create_noop_writer"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flush"
     argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "histogram"
+    argspec: "args=[\'name\', \'data\', \'step\', \'buckets\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "image"
+    argspec: "args=[\'name\', \'data\', \'step\', \'max_outputs\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'3\', \'None\'], "
+  }
   member_method {
     name: "import_event"
     argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "record_if"
+    argspec: "args=[\'condition\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scalar"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "summary_scope"
     argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'summary\', \'None\'], "
   }
+  member_method {
+    name: "text"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "trace_export"
+    argspec: "args=[\'name\', \'step\', \'profiler_outdir\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "trace_off"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "trace_on"
+    argspec: "args=[\'graph\', \'profiler\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
+  }
   member_method {
     name: "write"
-    argspec: "args=[\'tag\', \'tensor\', \'step\', \'metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'tag\', \'tensor\', \'step\', \'metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
index 6fc489c86043d074ac832d0ec9dbefd2cbbb4f19..48f53a85454f1f7103728965217eba85ccde10c9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
@@ -5,6 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index 980e96ac254aebf229ae52d98f607ed87d334e7a..ac9dd8f7189799cbf9b061677cd88058cb9d799e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -10,12 +10,16 @@ tf_module {
   }
   member_method {
     name: "assert_equal_graph_def"
-    argspec: "args=[\'actual\', \'expected\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'expected\', \'actual\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "benchmark_config"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_gradient"
+    argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+  }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.-device-assignment.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.-device-assignment.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f2d8c19444cd4a383997ed0277a7c0140c9be48
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.-device-assignment.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.tpu.experimental.DeviceAssignment"
+tf_class {
+  is_instance: "<class \'tensorflow.python.tpu.device_assignment.DeviceAssignment\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "core_assignment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_cores_per_replica"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "topology"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'topology\', \'core_assignment\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'topology\', \'computation_shape\', \'computation_stride\', \'num_replicas\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\'], "
+  }
+  member_method {
+    name: "coordinates"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "host_device"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\', \'job\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "lookup_replicas"
+    argspec: "args=[\'self\', \'task_id\', \'logical_core\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tpu_device"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\', \'job\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "tpu_ordinal"
+    argspec: "args=[\'self\', \'replica\', \'logical_core\'], varargs=None, keywords=None, defaults=[\'0\', \'0\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..642942a349115d29b55d799ef3ca0e0a26c035da
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.tpu.experimental"
+tf_module {
+  member {
+    name: "DeviceAssignment"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "initialize_tpu_system"
+    argspec: "args=[\'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7fabcf229880c9077a75fddb9769a0d891f065f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.tpu"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 42dcdac9e77a8efac875e4985f6a8f744e838ddb..deb93d7adca4d564ec31df926536ffb9176be5be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.AutoCheckpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
@@ -18,10 +18,10 @@ tf_class {
   }
   member_method {
     name: "save"
-    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'file_prefix\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "write"
-    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'file_prefix\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
deleted file mode 100644
index eb9a86183e10775379efb84c693f7aa7ba573f2d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.ProximalGradientDescentOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.Checkpointable\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.0\', \'False\', \'ProximalGradientDescent\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-python-state.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-python-state.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59bc0179aef2ee5bd73b6541ef2f17447784745b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-python-state.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.train.experimental.PythonState"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.tracking.python_state.PythonState\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'self\', \'string_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2761b489b965ad4ca6e22458d7efad724891c22f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.train.experimental"
+tf_module {
+  member {
+    name: "PythonState"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index a3ace15ca2cfe15cfd8f3ab98d9fabb603f0131e..59c7c17c5266fbc76b3d5849514dc66acce17580 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -60,10 +60,6 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "ProximalGradientDescentOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -72,34 +68,18 @@ tf_module {
     name: "ServerDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member_method {
-    name: "cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
-  }
-  member_method {
-    name: "cosine_decay_restarts"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
-  }
-  member_method {
-    name: "exponential_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
   member_method {
     name: "get_checkpoint_state"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "inverse_time_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "latest_checkpoint"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "linear_cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
-  }
   member_method {
     name: "list_variables"
     argspec: "args=[\'ckpt_dir_or_file\'], varargs=None, keywords=None, defaults=None"
@@ -112,22 +92,6 @@ tf_module {
     name: "load_variable"
     argspec: "args=[\'ckpt_dir_or_file\', \'name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "natural_exp_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "noisy_linear_cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
-  }
-  member_method {
-    name: "piecewise_constant_decay"
-    argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "polynomial_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
-  }
   member_method {
     name: "sdca_fprint"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 3cbea41dcab4f8453ef2598fbfd4f8bee65c9b65..b81ce692bbdb28f7c039c32af7f803423d7e86c1 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -15,11 +15,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 py_test(
     name = "api_compatibility_test",
-    srcs = [
-        "api_compatibility_test.py",
-        "//tensorflow:tf_python_api_gen_v2",
-    ],
-    args = ["--only_test_core_api=true"],
+    srcs = ["api_compatibility_test.py"],
     data = [
         "//tensorflow/tools/api/golden:api_golden_v1",
         "//tensorflow/tools/api/golden:api_golden_v2",
@@ -27,6 +23,7 @@ py_test(
         "//tensorflow/tools/api/tests:README.txt",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_rocm"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -35,6 +32,7 @@ py_test(
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 723fceef413d86675e885debd37e73e5facd7f7c..0c1f9de1a36e778c446f7f54be9b20bd7ae2abb7 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -32,8 +32,8 @@ import os
 import re
 import sys
 
+import six
 import tensorflow as tf
-from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
@@ -59,7 +59,7 @@ _UPDATE_GOLDENS_HELP = """
 # DEFINE_boolean, only_test_core_api, default False:
 _ONLY_TEST_CORE_API_HELP = """
     Some TF APIs are being moved outside of the tensorflow/ directory. There is
-    no garuntee which versions of these APIs will be present when running this
+    no guarantee which versions of these APIs will be present when running this
     test. Therefore, do not error out on API changes in non-core TF code
     if this flag is set.
 """
@@ -78,6 +78,13 @@ _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 _NON_CORE_PACKAGES = ['estimator']
 
 
+# TODO(annarev): remove this once we test with newer version of
+# estimator that actually has compat v1 version.
+if not hasattr(tf.compat.v1, 'estimator'):
+  tf.compat.v1.estimator = tf.estimator
+  tf.compat.v2.estimator = tf.estimator
+
+
 def _KeyToFilePath(key, api_version):
   """From a given key, construct a filepath.
 
@@ -135,6 +142,29 @@ def _FilterNonCoreGoldenFiles(golden_file_list):
   return filtered_file_list
 
 
+def _FilterGoldenProtoDict(golden_proto_dict, omit_golden_symbols_map):
+  """Filter out golden proto dict symbols that should be omitted."""
+  if not omit_golden_symbols_map:
+    return golden_proto_dict
+  filtered_proto_dict = dict(golden_proto_dict)
+  for key, symbol_list in six.iteritems(omit_golden_symbols_map):
+    api_object = api_objects_pb2.TFAPIObject()
+    api_object.CopyFrom(filtered_proto_dict[key])
+    filtered_proto_dict[key] = api_object
+    module_or_class = None
+    if api_object.HasField('tf_module'):
+      module_or_class = api_object.tf_module
+    elif api_object.HasField('tf_class'):
+      module_or_class = api_object.tf_class
+    if module_or_class is not None:
+      for members in (module_or_class.member, module_or_class.member_method):
+        filtered_members = [m for m in members if m.name not in symbol_list]
+        # Two steps because protobuf repeated fields disallow slice assignment.
+        del members[:]
+        members.extend(filtered_members)
+  return filtered_proto_dict
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -256,7 +286,7 @@ class ApiCompatibilityTest(test.TestCase):
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
       visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
-    traverse.traverse(tf_v2.compat.v1, visitor)
+    traverse.traverse(tf.compat.v1, visitor)
 
   def testNoSubclassOfMessageV2(self):
     if not hasattr(tf.compat, 'v2'):
@@ -265,13 +295,15 @@ class ApiCompatibilityTest(test.TestCase):
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
       visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
-    traverse.traverse(tf_v2, visitor)
+    visitor.private_map['tf.compat'] = ['v1', 'v2']
+    traverse.traverse(tf.compat.v2, visitor)
 
   def _checkBackwardsCompatibility(self,
                                    root,
                                    golden_file_pattern,
                                    api_version,
-                                   additional_private_map=None):
+                                   additional_private_map=None,
+                                   omit_golden_symbols_map=None):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
@@ -304,6 +336,8 @@ class ApiCompatibilityTest(test.TestCase):
         _FileNameToKey(filename): _ReadFileToProto(filename)
         for filename in golden_file_list
     }
+    golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
+                                               omit_golden_symbols_map)
 
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
@@ -316,7 +350,7 @@ class ApiCompatibilityTest(test.TestCase):
 
   @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
-    api_version = 1
+    api_version = 2 if '_api.v2' in tf.__name__ else 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
@@ -339,7 +373,7 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
-    self._checkBackwardsCompatibility(tf_v2.compat.v1, golden_file_pattern,
+    self._checkBackwardsCompatibility(tf.compat.v1, golden_file_pattern,
                                       api_version)
 
   def testAPIBackwardsCompatibilityV2(self):
@@ -347,11 +381,17 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
+    omit_golden_symbols_map = {}
+    if FLAGS.only_test_core_api:
+      # In TF 2.0 these summary symbols are imported from TensorBoard.
+      omit_golden_symbols_map['tensorflow.summary'] = [
+          'audio', 'histogram', 'image', 'scalar', 'text']
     self._checkBackwardsCompatibility(
-        tf_v2,
+        tf.compat.v2,
         golden_file_pattern,
         api_version,
-        additional_private_map={'tf.compat': ['v1']})
+        additional_private_map={'tf.compat': ['v1', 'v2']},
+        omit_golden_symbols_map=omit_golden_symbols_map)
 
 
 if __name__ == '__main__':
@@ -363,7 +403,7 @@ if __name__ == '__main__':
   parser.add_argument(
       '--only_test_core_api',
       type=bool,
-      default=False,
+      default=True,  # only_test_core_api default value
       help=_ONLY_TEST_CORE_API_HELP)
   parser.add_argument(
       '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index de93b12b97081feea5be96edf3b6e6dfbe5599b4..e5187ab8727b2af1853972417c2fb1b890b59a4a 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -254,6 +254,7 @@ Status InitializeSession(int num_threads, const string& graph,
   tensorflow::ConfigProto& config = options.config;
   if (num_threads > 0) {
     config.set_intra_op_parallelism_threads(num_threads);
+    config.set_inter_op_parallelism_threads(num_threads);
   }
   LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.custom_op_gpu b/tensorflow/tools/ci_build/Dockerfile.custom_op_gpu
new file mode 100644
index 0000000000000000000000000000000000000000..c36b92224f4f3e7cd211c6bd0bc30668c71f51a1
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.custom_op_gpu
@@ -0,0 +1,21 @@
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu14.04
+
+LABEL maintainer="Yifei Feng <yifeif@google.com>"
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier.sh
+RUN /install/install_auditwheel.sh
+RUN /install/install_golang.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
+
+ENV TF_NEED_CUDA 1
diff --git a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
index eb9d0d4dd01c8b39fd108c88d690a2c08efa3760..ad82c88b4a6fa88d5375e66d44e31c2f4e17cea4 100644
--- a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
@@ -5,7 +5,7 @@ LABEL maintainer="Jan Prach <jendap@google.com>"
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 RUN /install/install_bootstrap_deb_packages.sh
-RUN echo "deb http://http.debian.net/debian jessie-backports main" | \
+RUN echo "deb http://www.debian.net/debian jessie-backports main" | \
     tee -a /etc/apt/sources.list
 # Workaround bug in Jessie backport repository deb packages
 # http://serverfault.com/questions/830636/cannot-install-openjdk-8-jre-headless-on-debian-jessie
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
index d08d31d91304d45c317fdb4b6dec5b05494f7e9b..c6099c9e45115bfb84be6d3721fbf62088614801 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -3,7 +3,7 @@
 #       --tag "gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04" .
 # $ docker push gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04
 
-FROM ubuntu:14.04
+FROM gcr.io/clang-docker-builder/clang-ubuntu14_04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
@@ -59,13 +59,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# Install a newer version of libstdc++, as new clang versions do not work
-# with the stock ubuntu 14.04 libstdc++.
+# Install a newer version of g++:
+# - we need a new libstdc++, because new clang versions do not work with a stock
+#   ubuntu 14.04 libstdc++.
+# - we want to compile with g++-7 to get ahead of LLVM dropping support for
+#   gcc 4.8.
 RUN apt-get update && \
     apt-get install -y software-properties-common && \
     add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
     apt-get update && \
-    apt-get install -y libstdc++-7-dev && \
+    apt-get install -y --no-install-recommends g++-7 && \
     rm -rf /var/lib/apt/lists/*
 
 # Copy and run the install scripts.
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index aadaa8bac11cc80d1af11905d88116c8df677c2f..f190199643a14d42bb16113568d02ef2e6c85127 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -44,9 +44,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
   libboost-filesystem-dev \
   rpm \
   libnuma-dev \
+  pciutils \
   virtualenv \
   python-pip \
   python3-pip \
+  libxml2 \
+  libxml2-dev \
   wget && \
   apt-get clean && \
   rm -rf /var/lib/apt/lists/*
@@ -54,15 +57,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
 # Install rocm pkgs
 RUN apt-get update --allow-insecure-repositories && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    rocm-dev rocm-libs rocm-utils \
+    rocm-dev rocm-libs rocm-utils rocm-cmake \
     rocfft miopen-hip miopengemm rocblas hipblas rocrand \
     rocm-profiler cxlactivitylogger && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN cd ~ && git clone https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP.git
-RUN cd ~/HIP && mkdir -p build && cd build && cmake .. && make package -j && dpkg -i *.deb
-
 ENV HCC_HOME=$ROCM_PATH/hcc
 ENV HIP_PATH=$ROCM_PATH/hip
 ENV OPENCL_ROOT=$ROCM_PATH/opencl
@@ -71,12 +71,7 @@ ENV PATH="$ROCM_PATH/bin:${PATH}"
 ENV PATH="$OPENCL_ROOT/bin:${PATH}"
 
 # Add target file to help determine which device(s) to build for
-RUN echo -e "gfx803\ngfx900" >> /opt/rocm/bin/target.lst
-
-# Setup environment variables, and add those environment variables at the end of ~/.bashrc 
-ARG HCC_HOME=/opt/rocm/hcc
-ARG HIP_PATH=/opt/rocm/hip
-ARG PATH=$HCC_HOME/bin:$HIP_PATH/bin:$PATH
+RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst'
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/builds/pip_new.sh b/tensorflow/tools/ci_build/builds/pip_new.sh
new file mode 100755
index 0000000000000000000000000000000000000000..04b59884a84c51a650672c461ca5d6891879c9cd
--- /dev/null
+++ b/tensorflow/tools/ci_build/builds/pip_new.sh
@@ -0,0 +1,654 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Build the Python PIP installation package for TensorFlow and install
+# the package.
+#
+# Usage:
+#   pip_new.sh
+#
+# Required step(s):
+#   Run configure.py prior to running this script.
+#
+# Required environment variable(s):
+#   CONTAINER_TYPE:      (CPU | GPU)
+#   OS_TYPE:             (UBUNTU | MACOS)
+#   TF_PYTHON_VERSION:   (python2 | python2.7 | python3.5 | python3.7)
+#   TF_BUILD_FLAGS:      Bazel build flags.
+#                          e.g. TF_BUILD_FLAGS="--config=opt"
+#   TF_TEST_FLAGS:       Bazel test flags.
+#                          e.g. TF_TEST_FLAGS="--verbose_failures=true \
+#                               --build_tests_only --test_output=errors"
+#   TF_TEST_FILTER_TAGS: Filtering tags for bazel tests. More specifically,
+#                        input tags for `--test_filter_tags` flag.
+#                          e.g. TF_TEST_FILTER_TAGS="no_pip,-nomac,no_oss"
+#   TF_TEST_TARGETS:     Bazel test targets.
+#                          e.g. TF_TEST_TARGETS="//tensorflow/... \
+#                               -//tensorflow/contrib/... \
+#                               -//tensorflow/python/..."
+#   IS_NIGHTLY:          Nightly run flag.
+#                          e.g. IS_NIGHTLY=1  # nightly runs
+#                          e.g. IS_NIGHTLY=0  # non-nightly runs
+#
+# Optional environment variables. If provided, overwrites any default values.
+#   TF_PIP_TESTS:        PIP tests to run. If NOT specified, skips all tests.
+#                          e.g. TF_PIP_TESTS="test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_oss_serial"
+#   TF_PROJECT_NAME:     Name of the project. This string will be pass onto
+#                        the wheel file name. For nightly builds, it will be
+#                        overwritten to 'tf_nightly'. For gpu builds, '_gpu'
+#                        will be appended.
+#                          e.g. TF_PROJECT_NAME="tensorflow"
+#                          e.g. TF_PROJECT_NAME="tf_nightly_gpu"
+#   TF_PIP_TEST_ROOT:    Root directory for building and testing pip pkgs.
+#                          e.g. TF_PIP_TEST_ROOT="pip_test"
+#
+# To-be-deprecated variable(s).
+#   GIT_TAG_OVERRIDE:    Values for `--git_tag_override`. This flag gets passed
+#                        in as `--action_env` for bazel build and tests.
+#   TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES:
+#                        Additonal pip packages to be installed.
+#                        Caveat: pip version needs to be checked prior.
+#
+# ==============================================================================
+
+# set bash options
+set -e
+set -x
+
+###########################################################################
+# General helper function(s)
+###########################################################################
+
+# Strip leading and trailing whitespaces
+str_strip () {
+  echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
+}
+
+# Convert string to all lower case
+lowercase() {
+  if [[ -z "${1}" ]]; then
+    die "Nothing to convert to lowercase. No argument given."
+  fi
+  echo "${1}" | tr '[:upper:]' '[:lower:]'
+}
+
+check_global_vars() {
+  # Check container type
+  if ! [[ ${CONTAINER_TYPE} == "cpu" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "rocm" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+    die "Error: Provided CONTAINER_TYPE \"${CONTAINER_TYPE}\" "\
+        "is not supported."
+  fi
+  # Check OS type
+  if ! [[ ${OS_TYPE} == "ubuntu" ]] && \
+     ! [[ ${OS_TYPE} == "macos" ]]; then
+    die "Error: Provided OS_TYPE \"${OS_TYPE}\" is not supported."
+  fi
+  # Check build flags
+  if [[ -z ${TF_BUILD_FLAGS} ]]; then
+    die "Error: TF_BUILD_FLAGS is not specified."
+  fi
+  # Check test flags
+  if [[ -z ${TF_TEST_FLAGS} ]]; then
+    die "Error: TF_TEST_FLAGS is not specified."
+  fi
+  # Check test filter tags
+  if [[ -z ${TF_TEST_FILTER_TAGS} ]]; then
+    die "Error: TF_TEST_FILTER_TAGS is not specified."
+  fi
+  # Check test targets
+  if [[ -z ${TF_TEST_TARGETS} ]]; then
+    die "Error: TF_TEST_TARGETS is not specified."
+  fi
+  # Check nightly status
+  if [[ -z ${IS_NIGHTLY} ]]; then
+    die "Error: IS_NIGHTLY is not specified."
+  fi
+}
+
+add_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    if ! [[ $TF_TEST_FILTER_TAGS == *"${FILTER}"* ]]; then
+      TF_TEST_FILTER_TAGS="${FILTER},${TF_TEST_FILTER_TAGS}"
+    fi
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+remove_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    TF_TEST_FILTER_TAGS="$(echo ${TF_TEST_FILTER_TAGS} | sed -e 's/^'${FILTER}',//g' -e 's/,'${FILTER}'//g')"
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+# Clean up bazel build & test flags with proper configuration.
+update_bazel_flags() {
+  # Add git tag override flag if necessary.
+  GIT_TAG_STR=" --action_env=GIT_TAG_OVERRIDE"
+  if [[ -z "${GIT_TAG_OVERRIDE}" ]] && \
+    ! [[ ${TF_BUILD_FLAGS} = *${GIT_TAG_STR}* ]]; then
+    TF_BUILD_FLAGS+="${GIT_TAG_STR}"
+  fi
+  # Clean up whitespaces
+  TF_BUILD_FLAGS=$(str_strip "${TF_BUILD_FLAGS}")
+  TF_TEST_FLAGS=$(str_strip "${TF_TEST_FLAGS}")
+  # Cleaned bazel flags
+  echo "Bazel build flags (cleaned):\n" "${TF_BUILD_FLAGS}"
+  echo "Bazel test flags (cleaned):\n" "${TF_TEST_FLAGS}"
+}
+
+update_test_filter_tags() {
+  # Add test filter tags
+  # This script is for validating built PIP packages. Add pip tags.
+  add_test_filter_tag -no_pip -nopip
+  # MacOS filter tags
+  if [[ ${OS_TYPE} == "macos" ]]; then
+    remove_test_filter_tag nomac no_mac
+    add_test_filter_tag -nomac -no_mac
+  fi
+  echo "Final test filter tags: ${TF_TEST_FILTER_TAGS}"
+}
+
+# Check currently running python and pip version
+check_python_pip_version() {
+  # Check if only the major version of python is provided by the user.
+  MAJOR_VER_ONLY=0
+  if [[ ${#PYTHON_VER} -lt 9 ]]; then
+    # User only provided major version (e.g. 'python2' instead of 'python2.7')
+    MAJOR_VER_ONLY=1
+  fi
+
+  # Retrieve only the version number of the user requested python.
+  PYTHON_VER_REQUESTED=${PYTHON_VER:6:3}
+  echo "PYTHON_VER_REQUESTED: ${PYTHON_VER_REQUESTED}"
+
+  # Retrieve only the version numbers of the python & pip in use currently.
+  PYTHON_VER_IN_USE=$(python --version 2>&1)
+  PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:7:3}
+  PIP_VER_IN_USE=$(${PIP_BIN_PATH} --version)
+  PIP_VER_IN_USE=${PIP_VER_IN_USE:${#PIP_VER_IN_USE}-4:3}
+
+  # If only major versions are applied, drop minor versions.
+  if [[ $MAJOR_VER_ONLY == 1 ]]; then
+    PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:0:1}
+    PIP_VER_IN_USE=${PIP_VER_IN_USE:0:1}
+  fi
+
+  # Check if all versions match.
+  echo -e "User requested python version: '${PYTHON_VER_REQUESTED}'\n" \
+    "Detected python version in use: '${PYTHON_VER_IN_USE}'\n"\
+    "Detected pip version in use: '${PIP_VER_IN_USE}'"
+  if ! [[ $PYTHON_VER_REQUESTED == $PYTHON_VER_IN_USE ]]; then
+    die "Error: Mismatch in python versions detected."
+  else:
+    echo "Python and PIP versions in use match the requested."
+  fi
+}
+
+###########################################################################
+# Setup: directories, local/global variables
+###########################################################################
+
+# Script directory and source necessary files.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/builds_common.sh"
+
+# Required global variables
+# Checks on values for these vars are done in "Build TF PIP Package" section.
+CONTAINER_TYPE=$(lowercase "${CONTAINER_TYPE}")
+OS_TYPE=$(lowercase "${OS_TYPE}")
+PYTHON_VER=$(lowercase "${TF_PYTHON_VERSION}")
+
+# Python bin path
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "Error: PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+
+# Set optional environment variables; set to default in case not user defined.
+DEFAULT_PIP_TESTS="" # Do not run any tests by default
+DEFAULT_PROJECT_NAME="tensorflow"
+DEFAULT_PIP_TEST_ROOT="pip_test"
+# Take in optional global variables
+PIP_TESTS=${TF_PIP_TESTS:-$DEFAULT_PIP_TESTS}
+PROJECT_NAME=${TF_PROJECT_NAME:-$DEFAULT_PROJECT_NAME}
+PIP_TEST_ROOT=${TF_PIP_TEST_ROOT:-$DEFAULT_PIP_TEST_ROOT}
+
+# Local variables
+PIP_WHL_DIR="${KOKORO_ARTIFACTS_DIR}/tensorflow/${PIP_TEST_ROOT}/whl"
+mkdir -p "${PIP_WHL_DIR}"
+PIP_WHL_DIR=$(realpath "${PIP_WHL_DIR}") # Get absolute path
+WHL_PATH=""
+# Determine the major.minor versions of python being used (e.g., 2.7).
+# Useful for determining the directory of the local pip installation.
+PY_MAJOR_MINOR_VER=$(${PYTHON_BIN_PATH} -c "print(__import__('sys').version)" 2>&1 | awk '{ print $1 }' | head -n 1 | cut -c1-3)
+
+if [[ -z "${PY_MAJOR_MINOR_VER}" ]]; then
+  die "ERROR: Unable to determine the major.minor version of Python."
+fi
+echo "Python binary path to be used in PIP install: ${PYTHON_BIN_PATH} "\
+"(Major.Minor version: ${PY_MAJOR_MINOR_VER})"
+PYTHON_BIN_PATH_INIT=${PYTHON_BIN_PATH}
+PIP_BIN_PATH="$(which pip${PY_MAJOR_MINOR_VER})"
+
+# PIP packages
+INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
+
+###########################################################################
+# Build TF PIP Package
+###########################################################################
+
+# First remove any already existing binaries for a clean start and test.
+if [[ -d ${PIP_TEST_ROOT} ]]; then
+  echo "Test root directory ${PIP_TEST_ROOT} already exists. Deleting it."
+  sudo rm -rf ${PIP_TEST_ROOT}
+fi
+
+# Check that global variables are properly set.
+check_global_vars
+
+# Check if in a virtualenv and exit if yes.
+IN_VENV=$(python -c 'import sys; print("1" if hasattr(sys, "real_prefix") else "0")')
+if [[ "$IN_VENV" == "1" ]]; then
+  echo "It appears that we are already in a virtualenv. Deactivating..."
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+fi
+
+# Configure python. Obtain the path to python binary.
+source tools/python_bin_path.sh
+# Assume PYTHON_BIN_PATH is exported by the script above.
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+
+# Bazel build the file.
+PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
+# Clean bazel cache.
+bazel clean
+# Clean up and update bazel flags
+update_bazel_flags
+# Build. This outputs the file `build_pip_package`.
+bazel build ${TF_BUILD_FLAGS} ${PIP_BUILD_TARGET} || \
+  die "Error: Bazel build failed for target: '${PIP_BUILD_TARGET}'"
+
+###########################################################################
+# Test function(s)
+###########################################################################
+
+test_pip_virtualenv() {
+  # Get args
+  VENV_DIR_NAME=$1
+  shift
+  TEST_TYPE_FLAG=$1
+
+  # Check test type args
+  if ! [[ ${TEST_TYPE_FLAG} == "--oss_serial" ]] && \
+     ! [[ ${TEST_TYPE_FLAG} == "--clean" ]] && \
+     ! [[ ${TEST_TYPE_FLAG} == "" ]]; then
+     die "Error: Wrong test type given. TEST_TYPE_FLAG=${TEST_TYPE_FLAG}"
+  fi
+
+  # Create virtualenv directory for test
+  VENV_DIR="${PIP_TEST_ROOT}/${VENV_DIR_NAME}"
+
+  # Activate virtualenv
+  create_activate_virtualenv ${TEST_TYPE_FLAG} ${VENV_DIR}
+  # Install TF with pip
+  TIME_START=$SECONDS
+  install_tensorflow_pip "${WHL_PATH}"
+  TIME_ELAPSED=$(($SECONDS - $TIME_START))
+  echo "Time elapsed installing tensorflow = ${TIME_ELAPSED} seconds"
+
+  # cd to a temporary directory to avoid picking up Python files in the source
+  # tree.
+  TMP_DIR=$(mktemp -d)
+  pushd "${TMP_DIR}"
+
+  # Run a quick check on tensorflow installation.
+  RET_VAL=$(python -c "import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2).shape)")
+
+  # Return to original directory. Remove temp dirs.
+  popd
+  sudo rm -rf "${TMP_DIR}"
+
+  # Check result to see if tensorflow is properly installed.
+  if ! [[ ${RET_VAL} == *'(4,)'* ]]; then
+    echo "PIP test on virtualenv (non-clean) FAILED"
+    return 1
+  fi
+
+  # Install extra pip packages, if specified.
+  for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do
+    echo "Installing extra pip package required by test-on-install: ${PACKAGE}"
+
+    ${PIP_BIN_PATH} install ${PACKAGE}
+    if [[ $? != 0 ]]; then
+      echo "${PIP_BIN_PATH} install ${PACKAGE} FAILED."
+      deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+      return 1
+    fi
+  done
+
+  # Run bazel test.
+  run_test_with_bazel ${TEST_TYPE_FLAG}
+  RESULT=$?
+
+  # Deactivate from virtualenv.
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+  sudo rm -rf "${VENV_DIR}"
+
+  return $RESULT
+}
+
+###########################################################################
+# Test helper function(s)
+###########################################################################
+
+create_activate_virtualenv() {
+  VIRTUALENV_FLAGS="--system-site-packages"
+  if [[ "${1}" == "--clean" ]]; then
+    VIRTUALENV_FLAGS=""
+    shift
+  elif [[ "{1}" == "--oss_serial" ]]; then
+    shift
+  fi
+
+  VIRTUALENV_DIR="${1}"
+  if [[ -d "${VIRTUALENV_DIR}" ]]; then
+    if sudo rm -rf "${VIRTUALENV_DIR}"
+    then
+      echo "Removed existing virtualenv directory: ${VIRTUALENV_DIR}"
+    else
+      die "Failed to remove existing virtualenv directory: ${VIRTUALENV_DIR}"
+    fi
+  fi
+
+  if mkdir -p "${VIRTUALENV_DIR}"
+  then
+    echo "Created virtualenv directory: ${VIRTUALENV_DIR}"
+  else
+    die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
+  fi
+
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH_INIT} -m virtualenv -p ${PYTHON_BIN_PATH_INIT} ${VIRTUALENV_FLAGS} ${VIRTUALENV_DIR} || \
+    die "FAILED: Unable to create virtualenv"
+
+  source "${VIRTUALENV_DIR}/bin/activate" || \
+    die "FAILED: Unable to activate virtualenv in ${VIRTUALENV_DIR}"
+
+  # Update .tf_configure.bazelrc with venv python path for bazel test.
+  PYTHON_BIN_PATH="$(which python)"
+  yes "" | ./configure
+}
+
+install_tensorflow_pip() {
+  if [[ -z "${1}" ]]; then
+    die "Please provide a proper wheel file path."
+  fi
+
+  # Set path to pip.
+  PIP_BIN_PATH="$(which pip${PY_MAJOR_MINOR_VER})"
+
+  # Print python and pip bin paths
+  echo "PYTHON_BIN_PATH to be used to install the .whl: ${PYTHON_BIN_PATH}"
+  echo "PIP_BIN_PATH to be used to install the .whl: ${PIP_BIN_PATH}"
+
+  # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
+  echo "Upgrade pip in virtualenv"
+
+  # NOTE: pip install --upgrade pip leads to a documented TLS issue for
+  # some versions in python
+  curl https://bootstrap.pypa.io/get-pip.py | ${PYTHON_BIN_PATH} || \
+    die "Error: pip install (get-pip.py) FAILED"
+
+  # Check that requested python version matches configured one.
+  check_python_pip_version
+
+  # Force upgrade of setuptools. This must happen before the pip install of the
+  # WHL_PATH, which pulls in absl-py, which uses install_requires notation
+  # introduced in setuptools >=20.5. The default version of setuptools is 5.5.1,
+  # which is too old for absl-py.
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+
+  # Force tensorflow reinstallation. Otherwise it may not get installed from
+  # last build if it had the same version number as previous build.
+  PIP_FLAGS="--upgrade --force-reinstall"
+  ${PIP_BIN_PATH} install -v ${PIP_FLAGS} ${WHL_PATH} || \
+    die "pip install (forcing to reinstall tensorflow) FAILED"
+  echo "Successfully installed pip package ${WHL_PATH}"
+
+  # Force downgrade of setuptools. This must happen after the pip install of the
+  # WHL_PATH, which ends up upgrading to the latest version of setuptools.
+  # Versions of setuptools >= 39.1.0 will cause tests to fail like this:
+  #   ImportError: cannot import name py31compat
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+}
+
+run_test_with_bazel() {
+  IS_OSS_SERIAL=0
+  if [[ "${1}" == "--oss_serial" ]]; then
+    IS_OSS_SERIAL=1
+  fi
+  TF_GPU_COUNT=${TF_GPU_COUNT:-4}
+
+  # PIP tests should have a "different" path. Different than the one we place
+  # virtualenv, because we are deleting and recreating it here.
+  PIP_TEST_PREFIX=bazel_pip
+  TEST_ROOT=$(pwd)/${PIP_TEST_PREFIX}
+  sudo rm -rf $TEST_ROOT
+  mkdir -p $TEST_ROOT
+  ln -s $(pwd)/tensorflow $TEST_ROOT/tensorflow
+
+  if [[ "${IS_OSS_SERIAL}" == "1" ]]; then
+    remove_test_filter_tag -no_oss
+    add_test_filter_tag oss_serial
+  else
+    add_test_filter_tag -oss_serial
+  fi
+
+  # Clean the bazel cache
+  bazel clean
+  # Clean up flags before running bazel commands
+  update_bazel_flags
+  # Clean up and update test filter tags
+  update_test_filter_tags
+
+  # Figure out how many concurrent tests we can run and do run the tests.
+  BAZEL_PARALLEL_TEST_FLAGS=""
+  if [[ $CONTAINER_TYPE == "gpu" ]]; then
+    # Number of test threads is the number of GPU cards available.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \
+        --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute"
+    fi
+  else
+    # Number of test threads is the number of physical CPUs.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)"
+    fi
+  fi
+
+  if [[ ${IS_OSS_SERIAL} == 1 ]]; then
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+  fi
+
+  TEST_TARGETS_SYMLINK=""
+  for TARGET in ${TF_TEST_TARGETS[@]}; do
+    TARGET_NEW=$(echo ${TARGET} | sed -e "s/\/\//\/\/${PIP_TEST_PREFIX}\//g")
+    TEST_TARGETS_SYMLINK+="${TARGET_NEW} "
+  done
+  echo "Test targets (symlink): ${TEST_TARGETS_SYMLINK}"
+
+  # Run the test.
+  bazel test --build_tests_only ${TF_TEST_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} --test_tag_filters=${TF_TEST_FILTER_TAGS} -k -- ${TEST_TARGETS_SYMLINK}
+
+  unlink ${TEST_ROOT}/tensorflow
+}
+
+run_all_tests() {
+  if [[ -z "${PIP_TESTS}" ]]; then
+    echo "No test was specified to run. Skipping all tests."
+    return 0
+  fi
+  FAIL_COUNTER=0
+  PASS_COUNTER=0
+  for TEST in ${PIP_TESTS[@]}; do
+
+    # Run tests.
+    case "${TEST}" in
+    "test_pip_virtualenv_clean")
+      test_pip_virtualenv venv_clean --clean
+      ;;
+    "test_pip_virtualenv_non_clean")
+      test_pip_virtualenv venv
+      ;;
+    "test_pip_virtualenv_oss_serial")
+      test_pip_virtualenv venv_oss --oss_serial
+      ;;
+    *)
+      die "No matching test ${TEST} was found. Stopping test."
+      ;;
+    esac
+
+    # Check and update the results.
+    RETVAL=$?
+
+    # Update results counter
+    if [ ${RETVAL} -eq 0 ]; then
+      echo "Test (${TEST}) PASSED. (PASS COUNTER: ${PASS_COUNTER})"
+      PASS_COUNTER=$(($PASS_COUNTER+1))
+    else
+      echo "Test (${TEST}) FAILED. (FAIL COUNTER: ${FAIL_COUNTER})"
+      FAIL_COUNTER=$(($FAIL_COUNTER+1))
+    fi
+  done
+  printf "${PASS_COUNTER} PASSED | ${FAIL_COUNTER} FAILED"
+  if [[ "${FAIL_COUNTER}" == "0" ]]; then
+    printf "PIP tests ${COLOR_GREEN}PASSED${COLOR_NC}\n"
+    return 0
+  else:
+    printf "PIP tests ${COLOR_RED}FAILED${COLOR_NC}\n"
+    return 1
+  fi
+}
+
+###########################################################################
+# Build TF PIP Wheel file
+###########################################################################
+
+# Update the build flags for building whl.
+# Flags: GPU, OS, tf_nightly, project name
+GPU_FLAG=""
+NIGHTLY_FLAG=""
+
+# TF Nightly flag
+if [[ "$IS_NIGHTLY" == 1 ]]; then
+  # If 'nightly' is not specified in the project name already, then add.
+  if ! [[ $PROJECT_NAME == *"nightly"* ]]; then
+    echo "WARNING: IS_NIGHTLY=${IS_NIGHTLY} but requested project name \
+    (PROJECT_NAME=${PROJECT_NAME}) does not include 'nightly' string. \
+    Renaming it to 'tf_nightly'."
+    PROJECT_NAME="tf_nightly"
+  fi
+  NIGHTLY_FLAG="--nightly_flag"
+fi
+
+# CPU / GPU flag
+if [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+  GPU_FLAG="--gpu"
+  if ! [[ $PROJECT_NAME == *"gpu"* ]]; then
+    echo "WARNING: GPU is specified but requested project name (PROJECT_NAME=${PROJECT_NAME}) \
+    does not include 'gpu'. Appending '_gpu' to the project name."
+    PROJECT_NAME="${PROJECT_NAME}_gpu"
+  fi
+fi
+
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package ${PIP_WHL_DIR} ${GPU_FLAG} ${NIGHTLY_FLAG} "--project_name" ${PROJECT_NAME} || die "build_pip_package FAILED"
+
+PY_DOTLESS_MAJOR_MINOR_VER=$(echo $PY_MAJOR_MINOR_VER | tr -d '.')
+if [[ $PY_DOTLESS_MAJOR_MINOR_VER == "2" ]]; then
+  PY_DOTLESS_MAJOR_MINOR_VER="27"
+fi
+
+# Set wheel path and verify that there is only one .whl file in the path.
+WHL_PATH=$(ls "${PIP_WHL_DIR}"/"${PROJECT_NAME}"-*"${PY_DOTLESS_MAJOR_MINOR_VER}"*"${PY_DOTLESS_MAJOR_MINOR_VER}"*.whl)
+if [[ $(echo "${WHL_PATH}" | wc -w) -ne 1 ]]; then
+  echo "ERROR: Failed to find exactly one built TensorFlow .whl file in "\
+  "directory: ${PIP_WHL_DIR}"
+fi
+
+WHL_DIR=$(dirname "${WHL_PATH}")
+WHL_BASE_NAME=$(basename "${WHL_PATH}")
+AUDITED_WHL_NAME="${WHL_DIR}"/$(echo "${WHL_BASE_NAME//linux/manylinux1}")
+
+# Print the size of the wheel file.
+echo "Size of the PIP wheel file built: $(ls -l ${WHL_PATH} | awk '{print $5}')"
+
+# Run tests (if any is specified).
+run_all_tests
+
+for WHL_PATH in $(ls ${PIP_WHL_DIR}/${PROJECT_NAME}*.whl); do
+  if [[ "${TF_NEED_CUDA}" -eq "1" ]]; then
+    # Copy and rename for gpu manylinux as we do not want auditwheel to package in libcudart.so
+    WHL_PATH=${AUDITED_WHL_NAME}
+    cp "${WHL_DIR}"/"${WHL_BASE_NAME}" "${WHL_PATH}"
+    echo "Copied manylinux1 wheel file at ${WHL_PATH}"
+  else
+    if [[ ${OS_TYPE} == "ubuntu" ]]; then
+      # Avoid Python3.6 abnormality by installing auditwheel here.
+      pip3 show auditwheel
+      set +e
+      pip3 install auditwheel==1.5.0
+      sudo pip3 install auditwheel==1.5.0
+      set -e
+      auditwheel --version
+
+      # Repair the wheels for cpu manylinux1
+      echo "auditwheel repairing ${WHL_PATH}"
+      auditwheel repair -w "${WHL_DIR}" "${WHL_PATH}"
+
+      if [[ -f ${AUDITED_WHL_NAME} ]]; then
+        WHL_PATH=${AUDITED_WHL_NAME}
+        echo "Repaired manylinux1 wheel file at: ${WHL_PATH}"
+      else
+        die "WARNING: Cannot find repaired wheel."
+      fi
+    fi
+  fi
+done
+
+echo "EOF: Successfully ran pip_new.sh"
diff --git a/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc
index 65b50bd3ae9be960283f6cdfbe7dca296e5c489b..721da2a0bdb96dcbce83ed0b2bc5f0d5094e2b8a 100644
--- a/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc
+++ b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
 __global__ void AddOneKernel(const int* in, const int N, int* out) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
@@ -25,7 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
 }
 
 void AddOneKernelLauncher(const int* in, const int N, int* out) {
-  AddOneKernel<<<32, 256>>>(in, N, out);
+  TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
+                                             in, N, out));
 }
 
 #endif
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 2c348a0e3390af05cffff5d9a73d0bd57caa92b4..afb282715bc9197eb2250cb823af2a0f36a0d233 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -540,9 +540,12 @@ do_check_file_name_test() {
   python file_name_test.py
 }
 
+do_libtensorflow_framework_not_depend_on_cuda_check() {
+  bazel build --action_env=TF_NEED_CUDA=1 --define framework_shared_object=true --config=cuda --nobuild_tests_only tensorflow/core/platform/default/build_config:libtensorflow_cuda_check_deps
+}
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test" "do_libtensorflow_framework_not_depend_on_cuda_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases" "Check gpu libtensorflow_framework.so does not depend on cuda shared libraries.")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS=""
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 40a744374564d3ad3e663de8453d4085202c4e0c..856d64eb82f48525d4851b1f2167e44c849eaa27 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -32,7 +32,7 @@ import shutil
 import tempfile
 import zipfile
 
-TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
+TF_NIGHTLY_REGEX = (r"(.+)(tf_nightly.*)-(\d\.[\d]{1,2}"
                     r"\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
@@ -43,7 +43,7 @@ def check_existence(filename):
     raise RuntimeError("%s not found." % filename)
 
 
-def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
+def copy_binary(directory, origin_tag, new_tag, version, package):
   """Rename and copy binaries for different python versions.
 
   Arguments:
@@ -51,14 +51,10 @@ def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
     origin_tag: str of the old python version tag
     new_tag: str of the new tag
     version: the version of the package
-    gpu: bool if its a gpu build or not
+    package: str, name of the package
 
   """
   print("Rename and copy binaries with %s to %s." % (origin_tag, new_tag))
-  if gpu:
-    package = "tf_nightly_gpu"
-  else:
-    package = "tf_nightly"
   origin_binary = BINARY_STRING_TEMPLATE % (package, version, origin_tag)
   new_binary = BINARY_STRING_TEMPLATE % (package, version, new_tag)
   zip_ref = zipfile.ZipFile(os.path.join(directory, origin_binary), "r")
@@ -120,7 +116,7 @@ def main():
   check_existence(args.filename)
   regex_groups = re.search(TF_NIGHTLY_REGEX, args.filename)
   directory = regex_groups.group(1)
-  gpu = regex_groups.group(2)
+  package = regex_groups.group(2)
   version = regex_groups.group(3)
   origin_tag = regex_groups.group(4)
   old_py_ver = re.search(r"(cp\d\d)", origin_tag).group(1)
@@ -129,7 +125,7 @@ def main():
   new_tag = origin_tag.replace(old_py_ver, "cp" + args.new_py_ver)
 
   # Copy the binary with the info we have
-  copy_binary(directory, origin_tag, new_tag, version, gpu)
+  copy_binary(directory, origin_tag, new_tag, version, package)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 989f2a92eb6e5940b0557452080c3b0f3cf706ae..bd810016d2a05071e73cf3a8c72600b315e99679 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -68,12 +68,6 @@ apt-get install -y --no-install-recommends \
     zip \
     zlib1g-dev
 
-apt-get update && \
-  apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-  apt-get update && \
-  apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-  apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
 # populate the database
 updatedb
 
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 5ae840cfa0730401cd87acdf8205a34fc5e56a3d..131950dc0d3e2499ffe77ea424db83c68e713130 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -18,10 +18,10 @@ set -e
 
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 and 16.04; not needed for 18.04 and Debian 8,9?
-# Run easy_install before easy_install3, so that the default pip points to pip2,
+# Run easy_install after easy_install3, so that the default pip points to pip2,
 # to match the default python version of 2.7.
-easy_install3 -U pip==9.0.3
-easy_install -U pip==9.0.3
+easy_install3 -U pip==18.1
+easy_install -U pip==18.1
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
@@ -40,8 +40,8 @@ pip2 install virtualenv
 pip3 install virtualenv
 
 # Install six.
-pip2 install --upgrade six==1.10.0
-pip3 install --upgrade six==1.10.0
+pip2 install --upgrade six==1.12.0
+pip3 install --upgrade six==1.12.0
 
 # Install absl-py.
 pip2 install --upgrade absl-py
@@ -97,9 +97,9 @@ pip3 install py-cpuinfo
 pip2 install pylint==1.6.4
 pip3 install pylint==1.6.4
 
-# pep8 tests require the following:
-pip2 install pep8
-pip3 install pep8
+# pycodestyle tests require the following:
+pip2 install pycodestyle
+pip3 install pycodestyle
 
 # tf.mock require the following for python2:
 pip2 install mock
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
index 1d0b838c1b5c101e202fcb3cb64c3ce4a9bd34d6..1944183c0e8124c2ed4e572ac4a63f1f82f5c380 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -19,9 +19,10 @@ set -e
 set -x
 
 N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+N_GPUS=$(lspci|grep 'VGA'|grep 'AMD/ATI'|wc -l)
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
 echo ""
 
 # Run configure.
@@ -29,11 +30,14 @@ export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
+export TF_GPU_COUNT=${N_GPUS}
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
-    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=1 --config=opt \
+bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test -k \
+    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 600,900,2400,7200 \
+    --build_tests_only --test_output=errors --local_test_jobs=${TF_GPU_COUNT} --config=opt \
+    --test_sharding_strategy=disabled \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 4373d464b6a9f8cf6d498652d7afeed507a666ba..117627c458c3326735deb667b484c642b25a2ba9 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -84,19 +84,26 @@ class Version(object):
       identifier_string: extension string eg. (-rc0)
       version_type: version parameter ((REGULAR|NIGHTLY)_VERSION)
     """
-    self.string = "%s.%s.%s%s" % (major,
-                                  minor,
-                                  patch,
-                                  identifier_string)
     self.major = major
     self.minor = minor
     self.patch = patch
     self.identifier_string = identifier_string
     self.version_type = version_type
+    self._update_string()
+
+  def _update_string(self):
+    self.string = "%s.%s.%s%s" % (self.major,
+                                  self.minor,
+                                  self.patch,
+                                  self.identifier_string)
 
   def __str__(self):
     return self.string
 
+  def set_identifier_string(self, identifier_string):
+    self.identifier_string = identifier_string
+    self._update_string()
+
   @property
   def pep_440_str(self):
     if self.version_type == REGULAR_VERSION:
@@ -283,15 +290,14 @@ def main():
   """
 
   parser = argparse.ArgumentParser(description="Cherry picking automation.")
-  group = parser.add_mutually_exclusive_group(required=True)
 
   # Arg information
-  group.add_argument("--version",
-                     help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
-                     default="")
-  group.add_argument("--nightly",
-                     help="disable the service provisioning step",
-                     action="store_true")
+  parser.add_argument("--version",
+                      help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
+                      default="")
+  parser.add_argument("--nightly",
+                      help="disable the service provisioning step",
+                      action="store_true")
 
   args = parser.parse_args()
 
@@ -299,13 +305,17 @@ def main():
   old_version = get_current_semver_version()
 
   if args.nightly:
-    # Dev minor version is one ahead of official.
-    nightly_minor_ver = int(old_version.minor) + 1
-    new_version = Version(old_version.major,
-                          str(nightly_minor_ver),
-                          old_version.patch,
-                          "-dev" + time.strftime("%Y%m%d"),
-                          NIGHTLY_VERSION)
+    if args.version:
+      new_version = Version.parse_from_string(args.version, NIGHTLY_VERSION)
+      new_version.set_identifier_string("-dev" + time.strftime("%Y%m%d"))
+    else:
+      # Dev minor version is one ahead of official.
+      nightly_minor_ver = int(old_version.minor) + 1
+      new_version = Version(old_version.major,
+                            str(nightly_minor_ver),
+                            old_version.patch,
+                            "-dev" + time.strftime("%Y%m%d"),
+                            NIGHTLY_VERSION)
   else:
     new_version = Version.parse_from_string(args.version, REGULAR_VERSION)
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 2eeb8dfa1aa9f8ed3bb8faf3e96472e0665bf264..a938928baab9e010a5f2d7b8c209146fdc424932 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -60,6 +60,7 @@ RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 PROJECT_NAME=""
 EXTRA_BUILD_FLAGS=""
+EXTRA_TEST_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -89,6 +90,13 @@ while [[ $# -gt 0 ]]; do
       fi
       PROJECT_NAME="$1"
       ;;
+    --extra_test_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_TEST_FLAGS="$1"
+      ;;
     *)
   esac
   shift
@@ -105,7 +113,7 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
-    python tensorflow/tools/ci_build/update_version.py --version=2.0.0-preview
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
   else
     python tensorflow/tools/ci_build/update_version.py --nightly
   fi
@@ -125,6 +133,10 @@ fi
 
 run_configure_for_cpu_build
 
+bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS}  \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
+  tensorflow/lite:framework tensorflow/lite/examples/minimal:minimal || exit $?
+
 bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
   tensorflow/tools/pip_package:build_pip_package \
   --incompatible_remove_native_http_archive=false || exit $?
@@ -152,6 +164,7 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 bazel test --announce_rc --config=opt -k --test_output=errors \
+  ${EXTRA_TEST_FLAGS} \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
   --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu --build_tests_only \
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 4cca4f9acf2cd1504b24e94521584c9f85a19873..1f49e0cc55c0a134dfdd3d92f19a98be5170f292 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -60,6 +60,7 @@ RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 PROJECT_NAME=""
 EXTRA_BUILD_FLAGS=""
+EXTRA_TEST_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -89,6 +90,13 @@ while [[ $# -gt 0 ]]; do
       fi
       PROJECT_NAME="$1"
       ;;
+    --extra_test_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_TEST_FLAGS="$1"
+      ;;
     *)
   esac
   shift
@@ -105,7 +113,7 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
-    python tensorflow/tools/ci_build/update_version.py --version=2.0.0-preview
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
   else
     python tensorflow/tools/ci_build/update_version.py --nightly
   fi
@@ -157,6 +165,7 @@ TF_GPU_COUNT=${TF_GPU_COUNT:-4}
 # GPU tests are very flaky when running concurrently, so set local_test_jobs=1
 bazel test --announce_rc --config=opt -k --test_output=errors \
   --test_env=TF_GPU_COUNT \
+  ${EXTRA_TEST_FLAGS} \
   --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,-no_oss \
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index fdbd1120b20ea4461a4ec5f84c666d8b62309905..d03f0ee4a9ce5a176b359eaa9ba3189f047ccf62 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -31,20 +31,16 @@ if [ ! -e "WORKSPACE" ]; then
   exit 1
 fi
 
-export TF_BAZEL_TARGETS="//tensorflow:libtensorflow.so"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clicenses_generate"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
-
 run_configure_for_cpu_build
 
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel --output_user_root=${TMPDIR} build -c opt --copt=/arch:AVX \
-  tensorflow:libtensorflow.so \
+bazel --output_user_root=${TMPDIR} build -c opt --copt=/arch:AVX --announce_rc \
+  tensorflow:tensorflow.dll \
+  tensorflow:tensorflow_dll_import_lib \
   tensorflow/tools/lib_package:clicenses_generate \
-  tensorflow/java:libtensorflow_jni.so \
+  tensorflow/java:tensorflow_jni.dll \
   tensorflow/tools/lib_package:jnilicenses_generate
 
 DIR=lib_package
@@ -52,7 +48,7 @@ rm -rf ${DIR}
 mkdir -p ${DIR}
 
 # Zip up the .dll and the LICENSE for the JNI library.
-cp bazel-bin/tensorflow/java/libtensorflow_jni.so ${DIR}/tensorflow_jni.dll
+cp bazel-bin/tensorflow/java/tensorflow_jni.dll ${DIR}/tensorflow_jni.dll
 zip -j ${DIR}/libtensorflow_jni-cpu-windows-$(uname -m).zip \
   ${DIR}/tensorflow_jni.dll \
   bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/jni/LICENSE
@@ -62,13 +58,15 @@ rm -f ${DIR}/tensorflow_jni.dll
 mkdir -p ${DIR}/include/tensorflow/c
 mkdir -p ${DIR}/include/tensorflow/c/eager
 mkdir -p ${DIR}/lib
-cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/lib/tensorflow.dll
+cp bazel-bin/tensorflow/tensorflow.dll ${DIR}/lib/tensorflow.dll
+cp bazel-bin/tensorflow/tensorflow.lib ${DIR}/lib/tensorflow.lib
 cp tensorflow/c/c_api.h ${DIR}/include/tensorflow/c
 cp tensorflow/c/eager/c_api.h ${DIR}/include/tensorflow/c/eager
 cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DIR}/include/tensorflow/c
 cd ${DIR}
 zip libtensorflow-cpu-windows-$(uname -m).zip \
   lib/tensorflow.dll \
+  lib/tensorflow.lib \
   include/tensorflow/c/eager/c_api.h \
   include/tensorflow/c/c_api.h \
   include/tensorflow/c/LICENSE
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 9c05db974b4e30c2997a9c0d11f792ae52587eb5..fbfc69ed0967403b8df6d88d4ef92ad9ba369311 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,20 +31,16 @@ if [ ! -e "WORKSPACE" ]; then
   exit 1
 fi
 
-export TF_BAZEL_TARGETS="//tensorflow:libtensorflow.so"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clicenses_generate"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
-export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
-
 run_configure_for_gpu_build
 
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
 bazel --output_user_root=${TMPDIR} build -c opt --copt=/arch:AVX --announce_rc \
-  tensorflow:libtensorflow.so \
+  tensorflow:tensorflow.dll \
+  tensorflow:tensorflow_dll_import_lib \
   tensorflow/tools/lib_package:clicenses_generate \
-  tensorflow/java:libtensorflow_jni.so \
+  tensorflow/java:tensorflow_jni.dll \
   tensorflow/tools/lib_package:jnilicenses_generate
 
 DIR=lib_package
@@ -52,7 +48,7 @@ rm -rf ${DIR}
 mkdir -p ${DIR}
 
 # Zip up the .dll and the LICENSE for the JNI library.
-cp bazel-bin/tensorflow/java/libtensorflow_jni.so ${DIR}/tensorflow_jni.dll
+cp bazel-bin/tensorflow/java/tensorflow_jni.dll ${DIR}/tensorflow_jni.dll
 zip -j ${DIR}/libtensorflow_jni-gpu-windows-$(uname -m).zip \
   ${DIR}/tensorflow_jni.dll \
   bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/jni/LICENSE
@@ -60,13 +56,18 @@ rm -f ${DIR}/tensorflow_jni.dll
 
 # Zip up the .dll, LICENSE and include files for the C library.
 mkdir -p ${DIR}/include/tensorflow/c
+mkdir -p ${DIR}/include/tensorflow/c/eager
 mkdir -p ${DIR}/lib
-cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/lib/tensorflow.dll
+cp bazel-bin/tensorflow/tensorflow.dll ${DIR}/lib/tensorflow.dll
+cp bazel-bin/tensorflow/tensorflow.lib ${DIR}/lib/tensorflow.lib
 cp tensorflow/c/c_api.h ${DIR}/include/tensorflow/c
+cp tensorflow/c/eager/c_api.h ${DIR}/include/tensorflow/c/eager
 cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DIR}/include/tensorflow/c
 cd ${DIR}
-zip -j libtensorflow-gpu-windows-$(uname -m).zip \
+zip libtensorflow-gpu-windows-$(uname -m).zip \
   lib/tensorflow.dll \
+  lib/tensorflow.lib \
+  include/tensorflow/c/eager/c_api.h \
   include/tensorflow/c/c_api.h \
   include/tensorflow/c/LICENSE
 rm -rf lib include
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 31dbc02963d60a4943f0683252c86ea0ba1610c0..83f696b514e203e97df0e9bf51934ada2a5d71f2 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -8,6 +8,12 @@ licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow:internal"])
 
+py_library(
+    name = "ipynb",
+    srcs = ["ipynb.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "ast_edits",
     srcs = ["ast_edits.py"],
@@ -34,6 +40,13 @@ py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
     srcs_version = "PY2AND3",
+    deps = [":tf_upgrade_lib"],
+)
+
+py_library(
+    name = "tf_upgrade_lib",
+    srcs = ["tf_upgrade.py"],
+    srcs_version = "PY2AND3",
     deps = [":ast_edits"],
 )
 
@@ -42,7 +55,7 @@ py_test(
     srcs = ["tf_upgrade_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_upgrade",
+        ":tf_upgrade_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "@six_archive//:six",
@@ -80,6 +93,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":ast_edits",
+        ":ipynb",
         ":tf_upgrade_v2_lib",
     ],
 )
@@ -89,7 +103,7 @@ py_test(
     srcs = ["tf_upgrade_v2_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_upgrade_v2",
+        ":tf_upgrade_v2_lib",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index 5e2de35338a9460205272112195ff5560d6e503c..ca6dd5a0a07a0d4d98673d9294fc47ff51561d6c 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -1,9 +1,9 @@
 # TensorFlow Python API Upgrade Utility
 
-This tool allows you to upgrade your existing TensorFlow Python scripts.
-Specifically: \
-`tf_upgrade_v2.py`: upgrades code from TensorFlow 1.12 to TensorFlow 2.0 preview. \
-`tf_upgrade.py`: upgrades code to TensorFlow 1.0 from TensorFlow 0.11.
+This tool allows you to upgrade your existing TensorFlow Python scripts,
+specifically:
+* `tf_upgrade_v2.py`: Upgrade code from TensorFlow 1.x to TensorFlow 2.0 preview.
+* `tf_upgrade.py`: Upgrade code to TensorFlow 1.0 from TensorFlow 0.11.
 
 ## Running the script from pip package
 
@@ -58,17 +58,13 @@ arguments that mismap arguments.
 - This script wouldn't actually reorder arguments. Instead, the script will add
 keyword arguments to functions that had their arguments reordered.
 
-- This script is not able to upgrade all functions. One notable example is
-`tf.nn.conv2d` that no longer takes `use_cudnn_on_gpu` argument.
-If the script detects this, it will report this to stdout
-(and in the report), and you can fix it manually. For example if you have
-`tf.nn.conv2d(inputs, filters, strides, padding, use_cudnn_on_gpu=True)`
-you will need to manually change it to
-`tf.nn.conv2d(input, filters, strides, padding)`.
+- The script assumes that `tensorflow` is imported using `import tensorflow as tf`.
 
-- There are some syntaxes that are not handleable with this script as this
+- Note for upgrading to 2.0: Check out [tf2up.ml](http://tf2up.ml) for a convenient tool to upgrade Jupiter
+  notebooks and Python files in a github repository.
+
+- Note for upgrading to 1.0: There are some syntaxes that are not handleable with this script as this
 script was designed to use only standard python packages.
-There is an alternative available for TensorFlow 0.* to 1.0 upgrade script.
 If the script fails with "A necessary keyword argument failed to be inserted." or
 "Failed to find keyword lexicographically. Fix manually.", you can try
 [@machrisaa's fork of this script](https://github.com/machrisaa/tf0to1).
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index 940be4c23cf9eddbb59a150c4035574992061dfa..25c0a2d5c12bb2cfffabd342f938a27adc005d4f 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -34,6 +34,73 @@ FIND_OPEN = re.compile(r"^\s*(\[).*$")
 FIND_STRING_CHARS = re.compile(r"['\"]")
 
 
+INFO = "INFO"
+WARNING = "WARNING"
+ERROR = "ERROR"
+
+
+def full_name_node(name, ctx=ast.Load()):
+  """Make an Attribute or Name node for name.
+
+  Translate a qualified name into nested Attribute nodes (and a Name node).
+
+  Args:
+    name: The name to translate to a node.
+    ctx: What context this name is used in. Defaults to Load()
+
+  Returns:
+    A Name or Attribute node.
+  """
+  names = name.split(".")
+  names.reverse()
+  node = ast.Name(id=names.pop(), ctx=ast.Load())
+  while names:
+    node = ast.Attribute(value=node, attr=names.pop(), ctx=ast.Load())
+
+  # Change outermost ctx to the one given to us (inner ones should be Load).
+  node.ctx = ctx
+  return node
+
+
+def get_arg_value(node, arg_name, arg_pos=None):
+  """Get the value of an argument from a ast.Call node.
+
+  This function goes through the positional and keyword arguments to check
+  whether a given argument was used, and if so, returns its value (the node
+  representing its value).
+
+  This cannot introspect *args or **args, but it safely handles *args in
+  Python3.5+.
+
+  Args:
+    node: The ast.Call node to extract arg values from.
+    arg_name: The name of the argument to extract.
+    arg_pos: The position of the argument (in case it's passed as a positional
+      argument).
+
+  Returns:
+    A tuple (arg_present, arg_value) containing a boolean indicating whether
+    the argument is present, and its value in case it is.
+  """
+  # Check keyword args
+  if arg_name is not None:
+    for kw in node.keywords:
+      if kw.arg == arg_name:
+        return (True, kw.value)
+
+  # Check positional args
+  if arg_pos is not None:
+    idx = 0
+    for arg in node.args:
+      if sys.version_info[:2] >= (3, 5) and isinstance(arg, ast.Starred):
+        continue  # Can't parse Starred
+      if idx == arg_pos:
+        return (True, arg)
+      idx += 1
+
+  return (False, None)
+
+
 class APIChangeSpec(object):
   """This class defines the transformations that need to happen.
 
@@ -49,6 +116,8 @@ class APIChangeSpec(object):
   * `function_warnings`: maps full names of functions to warnings that will be
     printed out if the function is used. (e.g. tf.nn.convolution())
   * `function_transformers`: maps function names to custom handlers
+  * `module_deprecations`: maps module names to warnings that will be printed
+    if the module is still used after all other transformations have run
 
   For an example, see `TFAPIChangeSpec`.
   """
@@ -63,8 +132,7 @@ class _PastaEditVisitor(ast.NodeVisitor):
 
   def __init__(self, api_change_spec):
     self._api_change_spec = api_change_spec
-    self._log = []   # Holds 3-tuples: line, col, msg.
-    self._errors = []  # Same structure as _log.
+    self._log = []   # Holds 4-tuples: severity, line, col, msg.
     self._stack = []  # Allow easy access to parents.
 
   # Overridden to maintain a stack of nodes to allow for parent access
@@ -75,55 +143,42 @@ class _PastaEditVisitor(ast.NodeVisitor):
 
   @property
   def errors(self):
-    return self._errors
+    return [log for log in self._log if log[0] == ERROR]
 
   @property
-  def log(self):
-    return self._log
+  def warnings(self):
+    return [log for log in self._log if log[0] == WARNING]
 
-  def _format_log(self, log):
-    text = ""
-    for log_entry in log:
-      text += "Line %d:%d: %s\n" % log_entry
-    return text
+  @property
+  def warnings_and_errors(self):
+    return [log for log in self._log if log[0] in (WARNING, ERROR)]
 
-  def log_text(self):
-    return self._format_log(self.log)
+  @property
+  def info(self):
+    return [log for log in self._log if log[0] == INFO]
 
-  def add_log(self, lineno, col, msg):
-    self._log.append((lineno, col, msg))
-    print("Line %d:%d: %s" % (lineno, col, msg))
+  @property
+  def log(self):
+    return self._log
 
-  def add_error(self, lineno, col, msg):
-    # All errors are also added to the regular log.
-    self.add_log(lineno, col, msg)
-    self._errors.append((lineno, col, msg))
+  def add_log(self, severity, lineno, col, msg):
+    self._log.append((severity, lineno, col, msg))
+    print("%s line %d:%d: %s" % (severity, lineno, col, msg))
 
   def add_logs(self, logs):
     """Record a log and print it.
 
-    The log should be a tuple (lineno, col_offset, msg), which will be printed
-    and then recorded. It is part of the log available in the self.log property.
+    The log should be a tuple `(severity, lineno, col_offset, msg)`, which will
+    be printed and recorded. It is part of the log available in the `self.log`
+    property.
 
     Args:
-      logs: The log to add. Must be a tuple (lineno, col_offset, msg).
+      logs: The logs to add. Must be a list of tuples
+        `(severity, lineno, col_offset, msg)`.
     """
     self._log.extend(logs)
     for log in logs:
-      print("Line %d:%d: %s" % log)
-
-  def add_errors(self, errors):
-    """Record an error and print it.
-
-    The error must be a tuple (lineno, col_offset, msg), which will be printed
-    and then recorded as both a log and an error. It is therefore part of the
-    log available in the self.log as well as the self.errors property.
-
-    Args:
-      errors: The log to add. Must be a tuple (lineno, col_offset, msg).
-    """
-    self.add_logs(errors)
-    self._errors.extend(errors)
+      print("%s line %d:%d: %s" % log)
 
   def _get_applicable_entries(self, transformer_field, full_name, name):
     """Get all list entries indexed by name that apply to full_name or name."""
@@ -158,7 +213,7 @@ class _PastaEditVisitor(ast.NodeVisitor):
   def _get_full_name(self, node):
     """Traverse an Attribute node to generate a full name, e.g., "tf.foo.bar".
 
-    This is the inverse of _full_name_node.
+    This is the inverse of `full_name_node`.
 
     Args:
       node: A Node of type Attribute.
@@ -177,37 +232,28 @@ class _PastaEditVisitor(ast.NodeVisitor):
     items.append(curr.id)
     return ".".join(reversed(items))
 
-  def _full_name_node(self, name, ctx=ast.Load()):
-    """Make an Attribute or Name node for name.
-
-    Translate a qualified name into nested Attribute nodes (and a Name node).
-
-    Args:
-      name: The name to translate to a node.
-      ctx: What context this name is used in. Defaults to Load()
-
-    Returns:
-      A Name or Attribute node.
-    """
-    names = name.split(".")
-    names.reverse()
-    node = ast.Name(id=names.pop(), ctx=ast.Load())
-    while names:
-      node = ast.Attribute(value=node, attr=names.pop(), ctx=ast.Load())
-
-    # Change outermost ctx to the one given to us (inner ones should be Load).
-    node.ctx = ctx
-    return node
-
   def _maybe_add_warning(self, node, full_name):
     """Adds an error to be printed about full_name at node."""
     function_warnings = self._api_change_spec.function_warnings
     if full_name in function_warnings:
-      warning_message = function_warnings[full_name]
-      warning_message = warning_message.replace("<function name>", full_name)
-      self.add_error(node.lineno, node.col_offset,
-                     "%s requires manual check: %s." % (full_name,
-                                                        warning_message))
+      level, message = function_warnings[full_name]
+      message = message.replace("<function name>", full_name)
+      self.add_log(level, node.lineno, node.col_offset,
+                   "%s requires manual check. %s" % (full_name, message))
+      return True
+    else:
+      return False
+
+  def _maybe_add_module_deprecation_warning(self, node, full_name, whole_name):
+    """Adds a warning if full_name is a deprecated module."""
+    warnings = self._api_change_spec.module_deprecations
+    if full_name in warnings:
+      level, message = warnings[full_name]
+      message = message.replace("<function name>", whole_name)
+      self.add_log(level, node.lineno, node.col_offset,
+                   "Using member %s in deprecated module %s. %s" % (whole_name,
+                                                                    full_name,
+                                                                    message))
       return True
     else:
       return False
@@ -240,14 +286,14 @@ class _PastaEditVisitor(ast.NodeVisitor):
     arg_warnings = self._get_applicable_dict("function_arg_warnings",
                                              full_name, name)
 
-    used_args = [kw.arg for kw in node.keywords]
-    for (kwarg, arg), warning in arg_warnings.items():
-      if kwarg in used_args or len(node.args) > arg:
+    for (kwarg, arg), (level, warning) in sorted(arg_warnings.items()):
+      present, _ = get_arg_value(node, kwarg, arg)
+      if present:
         warned = True
         warning_message = warning.replace("<function name>", full_name or name)
-        self.add_error(node.lineno, node.col_offset,
-                       "%s called with %s argument requires manual check: %s." %
-                       (full_name or name, kwarg, warning_message))
+        self.add_log(level, node.lineno, node.col_offset,
+                     "%s called with %s argument requires manual check: %s" %
+                     (full_name or name, kwarg, warning_message))
 
     return warned
 
@@ -255,9 +301,9 @@ class _PastaEditVisitor(ast.NodeVisitor):
     """Replace node (Attribute or Name) with a node representing full_name."""
     new_name = self._api_change_spec.symbol_renames.get(full_name, None)
     if new_name:
-      self.add_log(node.lineno, node.col_offset,
+      self.add_log(INFO, node.lineno, node.col_offset,
                    "Renamed %r to %r" % (full_name, new_name))
-      new_node = self._full_name_node(new_name, node.ctx)
+      new_node = full_name_node(new_name, node.ctx)
       ast.copy_location(new_node, node)
       pasta.ast_utils.replace_child(parent, node, new_node)
       return True
@@ -276,7 +322,7 @@ class _PastaEditVisitor(ast.NodeVisitor):
           new_node = ast.Call(node, [], [])
         pasta.ast_utils.replace_child(parent, node, new_node)
         ast.copy_location(new_node, node)
-        self.add_log(node.lineno, node.col_offset,
+        self.add_log(INFO, node.lineno, node.col_offset,
                      "Changed %r to a function call" % full_name)
         return True
     return False
@@ -288,15 +334,17 @@ class _PastaEditVisitor(ast.NodeVisitor):
     if full_name in function_reorders:
       reordered = function_reorders[full_name]
       new_keywords = []
-      for idx, arg in enumerate(node.args):
+      idx = 0
+      for arg in node.args:
         if sys.version_info[:2] >= (3, 5) and isinstance(arg, ast.Starred):
           continue  # Can't move Starred to keywords
         keyword_arg = reordered[idx]
         keyword = ast.keyword(arg=keyword_arg, value=arg)
         new_keywords.append(keyword)
+        idx += 1
 
       if new_keywords:
-        self.add_log(node.lineno, node.col_offset,
+        self.add_log(INFO, node.lineno, node.col_offset,
                      "Added keywords to args of function %r" % full_name)
         node.args = []
         node.keywords = new_keywords + (node.keywords or [])
@@ -320,14 +368,14 @@ class _PastaEditVisitor(ast.NodeVisitor):
         if renamed_keywords[argkey] is None:
           lineno = getattr(keyword, "lineno", node.lineno)
           col_offset = getattr(keyword, "col_offset", node.col_offset)
-          self.add_log(lineno, col_offset,
+          self.add_log(INFO, lineno, col_offset,
                        "Removed argument %s for function %s" % (
                            argkey, full_name or name))
         else:
           keyword.arg = renamed_keywords[argkey]
           lineno = getattr(keyword, "lineno", node.lineno)
           col_offset = getattr(keyword, "col_offset", node.col_offset)
-          self.add_log(lineno, col_offset,
+          self.add_log(INFO, lineno, col_offset,
                        "Renamed keyword argument for %s from %s to %s" % (
                            full_name, argkey, renamed_keywords[argkey]))
           new_keywords.append(keyword)
@@ -380,15 +428,12 @@ class _PastaEditVisitor(ast.NodeVisitor):
 
     for transformer in transformers:
       logs = []
-      errors = []
-      new_node = transformer(parent, node, full_name, name, logs, errors)
+      new_node = transformer(parent, node, full_name, name, logs)
       self.add_logs(logs)
-      self.add_errors(errors)
-      if new_node:
-        if new_node is not node:
-          pasta.ast_utils.replace_child(parent, node, new_node)
-          node = new_node
-          self._stack[-1] = node
+      if new_node and new_node is not node:
+        pasta.ast_utils.replace_child(parent, node, new_node)
+        node = new_node
+        self._stack[-1] = node
 
     self.generic_visit(node)
 
@@ -411,6 +456,14 @@ class _PastaEditVisitor(ast.NodeVisitor):
       if self._maybe_change_to_function_call(parent, node, full_name):
         return
 
+      # The isinstance check is enough -- a bare Attribute is never root.
+      i = 2
+      while isinstance(self._stack[-i], ast.Attribute):
+        i += 1
+      whole_name = pasta.dump(self._stack[-(i-1)])
+
+      self._maybe_add_module_deprecation_warning(node, full_name, whole_name)
+
     self.generic_visit(node)
 
 
@@ -444,29 +497,35 @@ class ASTCodeUpgrader(object):
     shutil.move(temp_file.name, out_filename)
     return ret
 
-  def _format_errors(self, errors, in_filename):
-    return ["%s:%d:%d: %s" % ((in_filename,) + error) for error in errors]
+  def format_log(self, log, in_filename):
+    log_string = "%d:%d: %s: %s" % (log[1], log[2], log[0], log[3])
+    if in_filename:
+      return in_filename + ":" + log_string
+    else:
+      return log_string
 
   def update_string_pasta(self, text, in_filename):
     """Updates a file using pasta."""
     try:
       t = pasta.parse(text)
     except (SyntaxError, ValueError, TypeError):
-      log = "Failed to parse.\n\n" + traceback.format_exc()
+      log = ["ERROR: Failed to parse.\n" + traceback.format_exc()]
       return 0, "", log, []
 
     visitor = _PastaEditVisitor(self._api_change_spec)
     visitor.visit(t)
 
-    errors = self._format_errors(visitor.errors, in_filename)
-    return 1, pasta.dump(t), visitor.log_text(), errors
+    logs = [self.format_log(log, None) for log in visitor.log]
+    errors = [self.format_log(error, in_filename)
+              for error in visitor.warnings_and_errors]
+    return 1, pasta.dump(t), logs, errors
 
   def _format_log(self, log, in_filename, out_filename):
     text = "-" * 80 + "\n"
     text += "Processing file %r\n outputting to %r\n" % (in_filename,
                                                          out_filename)
     text += "-" * 80 + "\n\n"
-    text += log
+    text += "\n".join(log) + "\n"
     text += "-" * 80 + "\n\n"
     return text
 
@@ -496,7 +555,7 @@ class ASTCodeUpgrader(object):
             process_errors)
 
   def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files, in_place):
+                   copy_other_files):
     """Processes upgrades on an entire tree of python files in place.
 
     Note that only Python files. If you have custom code in other languages,
@@ -506,19 +565,14 @@ class ASTCodeUpgrader(object):
       root_directory: Directory to walk and process.
       output_root_directory: Directory to use as base.
       copy_other_files: Copy files that are not touched by this converter.
-      in_place: Allow the conversion of an entire directory in place.
 
     Returns:
-      A tuple of files processed, the report string ofr all files, and errors
+      A tuple of files processed, the report string for all files, and a dict
+        mapping filenames to errors encountered in that file.
     """
 
     if output_root_directory == root_directory:
-      if in_place:
-        return self.process_tree_inplace(root_directory)
-      else:
-        print("In order to copy a directory in place the `--inplace` input "
-              "arg must be set to `True`.")
-        sys.exit(1)
+      return self.process_tree_inplace(root_directory)
 
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
@@ -556,7 +610,7 @@ class ASTCodeUpgrader(object):
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
-    tree_errors = []
+    tree_errors = {}
     report = ""
     report += ("=" * 80) + "\n"
     report += "Input tree: %r\n" % root_directory
@@ -568,7 +622,7 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       file_count += 1
       _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
+      tree_errors[input_path] = l_errors
       report += l_report
     for input_path, output_path in files_to_copy:
       output_directory = os.path.dirname(output_path)
@@ -586,7 +640,7 @@ class ASTCodeUpgrader(object):
       files_to_process += py_files
 
     file_count = 0
-    tree_errors = []
+    tree_errors = {}
     report = ""
     report += ("=" * 80) + "\n"
     report += "Input tree: %r\n" % root_directory
@@ -595,7 +649,7 @@ class ASTCodeUpgrader(object):
     for path in files_to_process:
       file_count += 1
       _, l_report, l_errors = self.process_file(path, path)
-      tree_errors += l_errors
+      tree_errors[path] = l_errors
       report += l_report
 
     return file_count, report, tree_errors
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
index ff8b8d69821dcb41ddcef6d4d6d500342cb6b2b8..a9307f9f83bb8f1ccfb965ca4570c5fba4c1e047 100644
--- a/tensorflow/tools/compatibility/ast_edits_test.py
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -39,7 +39,10 @@ following new APIs:
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import ast
 import six
+
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
 from tensorflow.tools.compatibility import ast_edits
@@ -55,6 +58,15 @@ class NoUpdateSpec(ast_edits.APIChangeSpec):
     self.symbol_renames = {}
     self.function_warnings = {}
     self.change_to_function = {}
+    self.module_deprecations = {}
+
+
+class ModuleDeprecationSpec(NoUpdateSpec):
+  """A specification which deprecates 'a.b'."""
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.module_deprecations.update({"a.b": (ast_edits.ERROR, "a.b is evil.")})
 
 
 class RenameKeywordSpec(NoUpdateSpec):
@@ -169,6 +181,15 @@ class TestAstEdits(test_util.TensorFlowTestCase):
                                      "test_out.py", out_file))
     return (count, report, errors), out_file.getvalue()
 
+  def testModuleDeprecation(self):
+    text = "a.b.c(a.b.x)"
+    (_, _, errors), new_text = self._upgrade(ModuleDeprecationSpec(), text)
+    self.assertEqual(text, new_text)
+    self.assertIn("Using member a.b.c", errors[0])
+    self.assertIn("1:0", errors[0])
+    self.assertIn("Using member a.b.c", errors[0])
+    self.assertIn("1:6", errors[1])
+
   def testNoTransformIfNothingIsSupplied(self):
     text = "f(a, b, kw1=c, kw2=d)\n"
     _, new_text = self._upgrade(NoUpdateSpec(), text)
@@ -414,7 +435,7 @@ class TestAstEdits(test_util.TensorFlowTestCase):
 
       def __init__(self):
         NoUpdateSpec.__init__(self)
-        self.function_warnings = {"*.foo": "not good"}
+        self.function_warnings = {"*.foo": (ast_edits.WARNING, "not good")}
 
     texts = ["object.foo()", "get_object().foo()",
              "get_object().foo()", "object.foo().bar()"]
@@ -429,6 +450,13 @@ class TestAstEdits(test_util.TensorFlowTestCase):
       (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
       self.assertNotIn("not good", report)
 
+  def testFullNameNode(self):
+    t = ast_edits.full_name_node("a.b.c")
+    self.assertEquals(
+        ast.dump(t),
+        "Attribute(value=Attribute(value=Name(id='a', ctx=Load()), attr='b', "
+        "ctx=Load()), attr='c', ctx=Load())"
+    )
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/ipynb.py b/tensorflow/tools/compatibility/ipynb.py
new file mode 100644
index 0000000000000000000000000000000000000000..567665c7c0db89f7f41768b5556e5f65c3c966c8
--- /dev/null
+++ b/tensorflow/tools/compatibility/ipynb.py
@@ -0,0 +1,124 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""A module to support operation on ipynb files"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import json
+import shutil
+import tempfile
+
+CodeLine = collections.namedtuple("CodeLine", ["cell_number", "code"])
+
+
+def process_file(in_filename, out_filename, upgrader):
+  """The function where we inject the support for ipynb upgrade."""
+  print("Extracting code lines from original notebook")
+  raw_code, notebook = _get_code(in_filename)
+  raw_lines = [cl.code for cl in raw_code]
+
+  # The function follows the original flow from `upgrader.process_fil`
+  with tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
+
+    processed_file, new_file_content, log, process_errors = (
+        upgrader.update_string_pasta("\n".join(raw_lines), in_filename))
+
+    if temp_file and processed_file:
+      new_notebook = _update_notebook(notebook, raw_code,
+                                      new_file_content.split("\n"))
+      json.dump(new_notebook, temp_file)
+    else:
+      raise SyntaxError(
+          "Was not able to process the file: \n%s\n" % "".join(log))
+
+    files_processed = processed_file
+    report_text = upgrader._format_log(log, in_filename, out_filename)
+    errors = process_errors
+
+  shutil.move(temp_file.name, out_filename)
+
+  return files_processed, report_text, errors
+
+
+def _get_code(input_file):
+  """Load the ipynb file and return a list of CodeLines."""
+
+  raw_code = []
+
+  with open(input_file) as in_file:
+    notebook = json.load(in_file)
+
+  cell_index = 0
+  for cell in notebook["cells"]:
+    if cell["cell_type"] == "code":
+      cell_lines = cell["source"]
+
+      for line_idx, code_line in enumerate(cell_lines):
+
+        # Sometimes, jupyter has more than python code
+        # Idea is to comment these lines, for upgrade time
+        if code_line.startswith("%") or code_line.startswith("!") \
+            or code_line.startswith("?"):
+          # Found a special character, need to "encode"
+          code_line = "###!!!" + code_line
+
+        # Sometimes, people leave \n at the end of cell
+        # in order to migrate only related things, and make the diff
+        # the smallest -> here is another hack
+        if (line_idx == len(cell_lines) - 1) and code_line.endswith("\n"):
+          code_line = code_line.replace("\n", "###===")
+
+        # sometimes a line would start with `\n` and content after
+        # that's the hack for this
+        raw_code.append(
+            CodeLine(cell_index,
+                     code_line.rstrip().replace("\n", "###===")))
+
+      cell_index += 1
+
+  return raw_code, notebook
+
+
+def _update_notebook(original_notebook, original_raw_lines, updated_code_lines):
+  """Update notebook, once migration is done."""
+
+  new_notebook = copy.deepcopy(original_notebook)
+
+  # validate that the number of lines is the same
+  assert len(original_raw_lines) == len(updated_code_lines), \
+    ("The lengths of input and converted files are not the same: "
+     "{} vs {}".format(len(original_raw_lines), len(updated_code_lines)))
+
+  code_cell_idx = 0
+  for cell in new_notebook["cells"]:
+    if cell["cell_type"] != "code":
+      continue
+
+    applicable_lines = [
+        idx for idx, code_line in enumerate(original_raw_lines)
+        if code_line.cell_number == code_cell_idx
+    ]
+
+    new_code = [updated_code_lines[idx] for idx in applicable_lines]
+
+    cell["source"] = "\n".join(new_code).replace("###!!!", "").replace(
+        "###===", "\n")
+    code_cell_idx += 1
+
+  return new_notebook
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index fb2303e9dd2f6b842124ed21201276f228150f06..aa57f53005bc1bf709bc6b85f247653bdc6f76f6 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -25,721 +25,1498 @@ from __future__ import division
 from __future__ import print_function
 
 renames = {
-    'tf.AUTO_REUSE': 'tf.compat.v1.AUTO_REUSE',
-    'tf.AttrValue': 'tf.compat.v1.AttrValue',
-    'tf.COMPILER_VERSION': 'tf.version.COMPILER_VERSION',
-    'tf.CXX11_ABI_FLAG': 'tf.sysconfig.CXX11_ABI_FLAG',
-    'tf.ConditionalAccumulator': 'tf.compat.v1.ConditionalAccumulator',
-    'tf.ConditionalAccumulatorBase': 'tf.compat.v1.ConditionalAccumulatorBase',
-    'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
-    'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
-    'tf.Dimension': 'tf.compat.v1.Dimension',
-    'tf.Event': 'tf.compat.v1.Event',
-    'tf.FIFOQueue': 'tf.queue.FIFOQueue',
-    'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
-    'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
-    'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
-    'tf.GIT_VERSION': 'tf.version.GIT_VERSION',
-    'tf.GPUOptions': 'tf.compat.v1.GPUOptions',
-    'tf.GRAPH_DEF_VERSION': 'tf.version.GRAPH_DEF_VERSION',
-    'tf.GRAPH_DEF_VERSION_MIN_CONSUMER': 'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
-    'tf.GRAPH_DEF_VERSION_MIN_PRODUCER': 'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
-    'tf.GraphDef': 'tf.compat.v1.GraphDef',
-    'tf.GraphKeys': 'tf.compat.v1.GraphKeys',
-    'tf.GraphOptions': 'tf.compat.v1.GraphOptions',
-    'tf.HistogramProto': 'tf.compat.v1.HistogramProto',
-    'tf.IdentityReader': 'tf.compat.v1.IdentityReader',
-    'tf.InteractiveSession': 'tf.compat.v1.InteractiveSession',
-    'tf.LMDBReader': 'tf.compat.v1.LMDBReader',
-    'tf.LogMessage': 'tf.compat.v1.LogMessage',
-    'tf.MONOLITHIC_BUILD': 'tf.sysconfig.MONOLITHIC_BUILD',
-    'tf.MetaGraphDef': 'tf.compat.v1.MetaGraphDef',
-    'tf.NameAttrList': 'tf.compat.v1.NameAttrList',
-    'tf.NoGradient': 'tf.no_gradient',
-    'tf.NodeDef': 'tf.compat.v1.NodeDef',
-    'tf.NotDifferentiable': 'tf.no_gradient',
-    'tf.OpError': 'tf.errors.OpError',
-    'tf.OptimizerOptions': 'tf.compat.v1.OptimizerOptions',
-    'tf.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
-    'tf.Print': 'tf.compat.v1.Print',
-    'tf.PriorityQueue': 'tf.queue.PriorityQueue',
-    'tf.QUANTIZED_DTYPES': 'tf.dtypes.QUANTIZED_DTYPES',
-    'tf.QueueBase': 'tf.queue.QueueBase',
-    'tf.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
-    'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
-    'tf.RunMetadata': 'tf.compat.v1.RunMetadata',
-    'tf.RunOptions': 'tf.compat.v1.RunOptions',
-    'tf.Session': 'tf.compat.v1.Session',
-    'tf.SessionLog': 'tf.compat.v1.SessionLog',
-    'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
-    'tf.SparseFeature': 'tf.io.SparseFeature',
-    'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
-    'tf.Summary': 'tf.compat.v1.Summary',
-    'tf.SummaryMetadata': 'tf.compat.v1.SummaryMetadata',
-    'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
-    'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
-    'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
-    'tf.VERSION': 'tf.version.VERSION',
-    'tf.VarLenFeature': 'tf.io.VarLenFeature',
-    'tf.VariableScope': 'tf.compat.v1.VariableScope',
-    'tf.WholeFileReader': 'tf.compat.v1.WholeFileReader',
-    'tf.accumulate_n': 'tf.math.accumulate_n',
-    'tf.add_check_numerics_ops': 'tf.compat.v1.add_check_numerics_ops',
-    'tf.add_to_collection': 'tf.compat.v1.add_to_collection',
-    'tf.add_to_collections': 'tf.compat.v1.add_to_collections',
-    'tf.all_variables': 'tf.compat.v1.all_variables',
-    'tf.angle': 'tf.math.angle',
-    'tf.app.run': 'tf.compat.v1.app.run',
-    'tf.assert_greater_equal': 'tf.compat.v1.assert_greater_equal',
-    'tf.assert_integer': 'tf.compat.v1.assert_integer',
-    'tf.assert_less_equal': 'tf.compat.v1.assert_less_equal',
-    'tf.assert_near': 'tf.compat.v1.assert_near',
-    'tf.assert_negative': 'tf.compat.v1.assert_negative',
-    'tf.assert_non_negative': 'tf.compat.v1.assert_non_negative',
-    'tf.assert_non_positive': 'tf.compat.v1.assert_non_positive',
-    'tf.assert_none_equal': 'tf.compat.v1.assert_none_equal',
-    'tf.assert_positive': 'tf.compat.v1.assert_positive',
-    'tf.assert_proper_iterable': 'tf.debugging.assert_proper_iterable',
-    'tf.assert_rank_at_least': 'tf.compat.v1.assert_rank_at_least',
-    'tf.assert_rank_in': 'tf.compat.v1.assert_rank_in',
-    'tf.assert_same_float_dtype': 'tf.debugging.assert_same_float_dtype',
-    'tf.assert_scalar': 'tf.compat.v1.assert_scalar',
-    'tf.assert_type': 'tf.compat.v1.assert_type',
-    'tf.assert_variables_initialized': 'tf.compat.v1.assert_variables_initialized',
-    'tf.assign': 'tf.compat.v1.assign',
-    'tf.assign_add': 'tf.compat.v1.assign_add',
-    'tf.assign_sub': 'tf.compat.v1.assign_sub',
-    'tf.batch_scatter_update': 'tf.compat.v1.batch_scatter_update',
-    'tf.betainc': 'tf.math.betainc',
-    'tf.ceil': 'tf.math.ceil',
-    'tf.check_numerics': 'tf.debugging.check_numerics',
-    'tf.cholesky': 'tf.linalg.cholesky',
-    'tf.cholesky_solve': 'tf.linalg.cholesky_solve',
-    'tf.clip_by_average_norm': 'tf.compat.v1.clip_by_average_norm',
-    'tf.colocate_with': 'tf.compat.v1.colocate_with',
-    'tf.conj': 'tf.math.conj',
-    'tf.container': 'tf.compat.v1.container',
-    'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
-    'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
-    'tf.count_up_to': 'tf.compat.v1.count_up_to',
-    'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
-    'tf.cross': 'tf.linalg.cross',
-    'tf.cumprod': 'tf.math.cumprod',
-    'tf.data.make_initializable_iterator': 'tf.compat.v1.data.make_initializable_iterator',
-    'tf.data.make_one_shot_iterator': 'tf.compat.v1.data.make_one_shot_iterator',
-    'tf.debugging.is_finite': 'tf.math.is_finite',
-    'tf.debugging.is_inf': 'tf.math.is_inf',
-    'tf.debugging.is_nan': 'tf.math.is_nan',
-    'tf.debugging.is_non_decreasing': 'tf.math.is_non_decreasing',
-    'tf.debugging.is_strictly_increasing': 'tf.math.is_strictly_increasing',
-    'tf.decode_base64': 'tf.io.decode_base64',
-    'tf.decode_compressed': 'tf.io.decode_compressed',
-    'tf.decode_json_example': 'tf.io.decode_json_example',
-    'tf.decode_raw': 'tf.io.decode_raw',
-    'tf.delete_session_tensor': 'tf.compat.v1.delete_session_tensor',
-    'tf.depth_to_space': 'tf.compat.v1.depth_to_space',
-    'tf.dequantize': 'tf.quantization.dequantize',
-    'tf.deserialize_many_sparse': 'tf.io.deserialize_many_sparse',
-    'tf.diag': 'tf.linalg.tensor_diag',
-    'tf.diag_part': 'tf.linalg.tensor_diag_part',
-    'tf.digamma': 'tf.math.digamma',
-    'tf.dimension_at_index': 'tf.compat.dimension_at_index',
-    'tf.dimension_value': 'tf.compat.dimension_value',
-    'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
-    'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
-    'tf.disable_v2_batch_normalization': 'tf.compat.v1.disable_v2_batch_normalization',
-    'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
-    'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
-    'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
-    'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
-    'tf.distributions.Categorical': 'tf.compat.v1.distributions.Categorical',
-    'tf.distributions.Dirichlet': 'tf.compat.v1.distributions.Dirichlet',
-    'tf.distributions.DirichletMultinomial': 'tf.compat.v1.distributions.DirichletMultinomial',
-    'tf.distributions.Distribution': 'tf.compat.v1.distributions.Distribution',
-    'tf.distributions.Exponential': 'tf.compat.v1.distributions.Exponential',
-    'tf.distributions.FULLY_REPARAMETERIZED': 'tf.compat.v1.distributions.FULLY_REPARAMETERIZED',
-    'tf.distributions.Gamma': 'tf.compat.v1.distributions.Gamma',
-    'tf.distributions.Laplace': 'tf.compat.v1.distributions.Laplace',
-    'tf.distributions.Multinomial': 'tf.compat.v1.distributions.Multinomial',
-    'tf.distributions.NOT_REPARAMETERIZED': 'tf.compat.v1.distributions.NOT_REPARAMETERIZED',
-    'tf.distributions.Normal': 'tf.compat.v1.distributions.Normal',
-    'tf.distributions.RegisterKL': 'tf.compat.v1.distributions.RegisterKL',
-    'tf.distributions.ReparameterizationType': 'tf.compat.v1.distributions.ReparameterizationType',
-    'tf.distributions.StudentT': 'tf.compat.v1.distributions.StudentT',
-    'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
-    'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
-    'tf.div': 'tf.compat.v1.div',
-    'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
-    'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
-    'tf.enable_v2_batch_normalization': 'tf.compat.v1.enable_v2_batch_normalization',
-    'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
-    'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
-    'tf.encode_base64': 'tf.io.encode_base64',
-    'tf.erf': 'tf.math.erf',
-    'tf.erfc': 'tf.math.erfc',
-    'tf.expm1': 'tf.math.expm1',
-    'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
-    'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
-    'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
-    'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
-    'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
-    'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
-    'tf.feature_column.input_layer': 'tf.compat.v1.feature_column.input_layer',
-    'tf.feature_column.linear_model': 'tf.compat.v1.feature_column.linear_model',
-    'tf.fft': 'tf.signal.fft',
-    'tf.fft2d': 'tf.signal.fft2d',
-    'tf.fft3d': 'tf.signal.fft3d',
-    'tf.fixed_size_partitioner': 'tf.compat.v1.fixed_size_partitioner',
-    'tf.floordiv': 'tf.math.floordiv',
-    'tf.get_collection': 'tf.compat.v1.get_collection',
-    'tf.get_collection_ref': 'tf.compat.v1.get_collection_ref',
-    'tf.get_default_graph': 'tf.compat.v1.get_default_graph',
-    'tf.get_default_session': 'tf.compat.v1.get_default_session',
-    'tf.get_local_variable': 'tf.compat.v1.get_local_variable',
-    'tf.get_seed': 'tf.compat.v1.get_seed',
-    'tf.get_session_handle': 'tf.compat.v1.get_session_handle',
-    'tf.get_session_tensor': 'tf.compat.v1.get_session_tensor',
-    'tf.get_variable': 'tf.compat.v1.get_variable',
-    'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
-    'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
-    'tf.gfile.GFile': 'tf.io.gfile.GFile',
-    'tf.gfile.Open': 'tf.io.gfile.GFile',
-    'tf.global_norm': 'tf.linalg.global_norm',
-    'tf.global_variables': 'tf.compat.v1.global_variables',
-    'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
-    'tf.glorot_normal_initializer': 'tf.compat.v1.glorot_normal_initializer',
-    'tf.glorot_uniform_initializer': 'tf.compat.v1.glorot_uniform_initializer',
-    'tf.graph_util.convert_variables_to_constants': 'tf.compat.v1.graph_util.convert_variables_to_constants',
-    'tf.graph_util.extract_sub_graph': 'tf.compat.v1.graph_util.extract_sub_graph',
-    'tf.graph_util.must_run_on_cpu': 'tf.compat.v1.graph_util.must_run_on_cpu',
-    'tf.graph_util.remove_training_nodes': 'tf.compat.v1.graph_util.remove_training_nodes',
-    'tf.graph_util.tensor_shape_from_node_def_name': 'tf.compat.v1.graph_util.tensor_shape_from_node_def_name',
-    'tf.ifft': 'tf.signal.ifft',
-    'tf.ifft2d': 'tf.signal.ifft2d',
-    'tf.ifft3d': 'tf.signal.ifft3d',
-    'tf.igamma': 'tf.math.igamma',
-    'tf.igammac': 'tf.math.igammac',
-    'tf.imag': 'tf.math.imag',
-    'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
-    'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
-    'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
-    'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
-    'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
-    'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
-    'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
-    'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
-    'tf.initialize_variables': 'tf.compat.v1.initialize_variables',
-    'tf.initializers.constant': 'tf.compat.v1.initializers.constant',
-    'tf.initializers.global_variables': 'tf.compat.v1.initializers.global_variables',
-    'tf.initializers.glorot_normal': 'tf.compat.v1.initializers.glorot_normal',
-    'tf.initializers.glorot_uniform': 'tf.compat.v1.initializers.glorot_uniform',
-    'tf.initializers.he_normal': 'tf.compat.v1.initializers.he_normal',
-    'tf.initializers.he_uniform': 'tf.compat.v1.initializers.he_uniform',
-    'tf.initializers.identity': 'tf.compat.v1.initializers.identity',
-    'tf.initializers.lecun_normal': 'tf.compat.v1.initializers.lecun_normal',
-    'tf.initializers.lecun_uniform': 'tf.compat.v1.initializers.lecun_uniform',
-    'tf.initializers.local_variables': 'tf.compat.v1.initializers.local_variables',
-    'tf.initializers.ones': 'tf.compat.v1.initializers.ones',
-    'tf.initializers.orthogonal': 'tf.compat.v1.initializers.orthogonal',
-    'tf.initializers.random_normal': 'tf.compat.v1.initializers.random_normal',
-    'tf.initializers.random_uniform': 'tf.compat.v1.initializers.random_uniform',
-    'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
-    'tf.initializers.truncated_normal': 'tf.compat.v1.initializers.truncated_normal',
-    'tf.initializers.uniform_unit_scaling': 'tf.compat.v1.initializers.uniform_unit_scaling',
-    'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
-    'tf.initializers.variance_scaling': 'tf.compat.v1.initializers.variance_scaling',
-    'tf.initializers.zeros': 'tf.compat.v1.initializers.zeros',
-    'tf.invert_permutation': 'tf.math.invert_permutation',
-    'tf.io.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
-    'tf.io.PriorityQueue': 'tf.queue.PriorityQueue',
-    'tf.io.QueueBase': 'tf.queue.QueueBase',
-    'tf.io.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
-    'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
-    'tf.is_finite': 'tf.math.is_finite',
-    'tf.is_inf': 'tf.math.is_inf',
-    'tf.is_nan': 'tf.math.is_nan',
-    'tf.is_non_decreasing': 'tf.math.is_non_decreasing',
-    'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
-    'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
-    'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
-    'tf.keras.initializers.Identity': 'tf.compat.v1.keras.initializers.Identity',
-    'tf.keras.initializers.Orthogonal': 'tf.compat.v1.keras.initializers.Orthogonal',
-    'tf.keras.initializers.TruncatedNormal': 'tf.compat.v1.keras.initializers.TruncatedNormal',
-    'tf.keras.initializers.VarianceScaling': 'tf.compat.v1.keras.initializers.VarianceScaling',
-    'tf.keras.initializers.constant': 'tf.compat.v1.keras.initializers.constant',
-    'tf.keras.initializers.glorot_normal': 'tf.compat.v1.keras.initializers.glorot_normal',
-    'tf.keras.initializers.glorot_uniform': 'tf.compat.v1.keras.initializers.glorot_uniform',
-    'tf.keras.initializers.he_normal': 'tf.compat.v1.keras.initializers.he_normal',
-    'tf.keras.initializers.he_uniform': 'tf.compat.v1.keras.initializers.he_uniform',
-    'tf.keras.initializers.identity': 'tf.compat.v1.keras.initializers.identity',
-    'tf.keras.initializers.lecun_normal': 'tf.compat.v1.keras.initializers.lecun_normal',
-    'tf.keras.initializers.lecun_uniform': 'tf.compat.v1.keras.initializers.lecun_uniform',
-    'tf.keras.initializers.normal': 'tf.compat.v1.keras.initializers.normal',
-    'tf.keras.initializers.ones': 'tf.compat.v1.keras.initializers.ones',
-    'tf.keras.initializers.orthogonal': 'tf.compat.v1.keras.initializers.orthogonal',
-    'tf.keras.initializers.random_normal': 'tf.compat.v1.keras.initializers.random_normal',
-    'tf.keras.initializers.random_uniform': 'tf.compat.v1.keras.initializers.random_uniform',
-    'tf.keras.initializers.truncated_normal': 'tf.compat.v1.keras.initializers.truncated_normal',
-    'tf.keras.initializers.uniform': 'tf.compat.v1.keras.initializers.uniform',
-    'tf.keras.initializers.zeros': 'tf.compat.v1.keras.initializers.zeros',
-    'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
-    'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
-    'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
-    'tf.layers.BatchNormalization': 'tf.compat.v1.layers.BatchNormalization',
-    'tf.layers.Conv1D': 'tf.compat.v1.layers.Conv1D',
-    'tf.layers.Conv2D': 'tf.compat.v1.layers.Conv2D',
-    'tf.layers.Conv2DTranspose': 'tf.compat.v1.layers.Conv2DTranspose',
-    'tf.layers.Conv3D': 'tf.compat.v1.layers.Conv3D',
-    'tf.layers.Conv3DTranspose': 'tf.compat.v1.layers.Conv3DTranspose',
-    'tf.layers.Dense': 'tf.compat.v1.layers.Dense',
-    'tf.layers.Dropout': 'tf.compat.v1.layers.Dropout',
-    'tf.layers.Flatten': 'tf.compat.v1.layers.Flatten',
-    'tf.layers.InputSpec': 'tf.keras.layers.InputSpec',
-    'tf.layers.Layer': 'tf.compat.v1.layers.Layer',
-    'tf.layers.MaxPooling1D': 'tf.compat.v1.layers.MaxPooling1D',
-    'tf.layers.MaxPooling2D': 'tf.compat.v1.layers.MaxPooling2D',
-    'tf.layers.MaxPooling3D': 'tf.compat.v1.layers.MaxPooling3D',
-    'tf.layers.SeparableConv1D': 'tf.compat.v1.layers.SeparableConv1D',
-    'tf.layers.SeparableConv2D': 'tf.compat.v1.layers.SeparableConv2D',
-    'tf.layers.average_pooling1d': 'tf.compat.v1.layers.average_pooling1d',
-    'tf.layers.average_pooling2d': 'tf.compat.v1.layers.average_pooling2d',
-    'tf.layers.average_pooling3d': 'tf.compat.v1.layers.average_pooling3d',
-    'tf.layers.batch_normalization': 'tf.compat.v1.layers.batch_normalization',
-    'tf.layers.conv1d': 'tf.compat.v1.layers.conv1d',
-    'tf.layers.conv2d': 'tf.compat.v1.layers.conv2d',
-    'tf.layers.conv2d_transpose': 'tf.compat.v1.layers.conv2d_transpose',
-    'tf.layers.conv3d': 'tf.compat.v1.layers.conv3d',
-    'tf.layers.conv3d_transpose': 'tf.compat.v1.layers.conv3d_transpose',
-    'tf.layers.dense': 'tf.compat.v1.layers.dense',
-    'tf.layers.dropout': 'tf.compat.v1.layers.dropout',
-    'tf.layers.experimental.keras_style_scope': 'tf.compat.v1.layers.experimental.keras_style_scope',
-    'tf.layers.experimental.set_keras_style': 'tf.compat.v1.layers.experimental.set_keras_style',
-    'tf.layers.flatten': 'tf.compat.v1.layers.flatten',
-    'tf.layers.max_pooling1d': 'tf.compat.v1.layers.max_pooling1d',
-    'tf.layers.max_pooling2d': 'tf.compat.v1.layers.max_pooling2d',
-    'tf.layers.max_pooling3d': 'tf.compat.v1.layers.max_pooling3d',
-    'tf.layers.separable_conv1d': 'tf.compat.v1.layers.separable_conv1d',
-    'tf.layers.separable_conv2d': 'tf.compat.v1.layers.separable_conv2d',
-    'tf.lbeta': 'tf.math.lbeta',
-    'tf.lgamma': 'tf.math.lgamma',
-    'tf.lin_space': 'tf.linspace',
-    'tf.local_variables': 'tf.compat.v1.local_variables',
-    'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
-    'tf.log': 'tf.math.log',
-    'tf.log1p': 'tf.math.log1p',
-    'tf.log_sigmoid': 'tf.math.log_sigmoid',
-    'tf.logging.DEBUG': 'tf.compat.v1.logging.DEBUG',
-    'tf.logging.ERROR': 'tf.compat.v1.logging.ERROR',
-    'tf.logging.FATAL': 'tf.compat.v1.logging.FATAL',
-    'tf.logging.INFO': 'tf.compat.v1.logging.INFO',
-    'tf.logging.TaskLevelStatusMessage': 'tf.compat.v1.logging.TaskLevelStatusMessage',
-    'tf.logging.WARN': 'tf.compat.v1.logging.WARN',
-    'tf.logging.debug': 'tf.compat.v1.logging.debug',
-    'tf.logging.error': 'tf.compat.v1.logging.error',
-    'tf.logging.fatal': 'tf.compat.v1.logging.fatal',
-    'tf.logging.flush': 'tf.compat.v1.logging.flush',
-    'tf.logging.get_verbosity': 'tf.compat.v1.logging.get_verbosity',
-    'tf.logging.info': 'tf.compat.v1.logging.info',
-    'tf.logging.log': 'tf.compat.v1.logging.log',
-    'tf.logging.log_every_n': 'tf.compat.v1.logging.log_every_n',
-    'tf.logging.log_first_n': 'tf.compat.v1.logging.log_first_n',
-    'tf.logging.log_if': 'tf.compat.v1.logging.log_if',
-    'tf.logging.set_verbosity': 'tf.compat.v1.logging.set_verbosity',
-    'tf.logging.vlog': 'tf.compat.v1.logging.vlog',
-    'tf.logging.warn': 'tf.compat.v1.logging.warn',
-    'tf.logging.warning': 'tf.compat.v1.logging.warning',
-    'tf.logical_xor': 'tf.math.logical_xor',
-    'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
-    'tf.losses.add_loss': 'tf.compat.v1.losses.add_loss',
-    'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
-    'tf.losses.cosine_distance': 'tf.compat.v1.losses.cosine_distance',
-    'tf.losses.get_losses': 'tf.compat.v1.losses.get_losses',
-    'tf.losses.get_regularization_loss': 'tf.compat.v1.losses.get_regularization_loss',
-    'tf.losses.get_regularization_losses': 'tf.compat.v1.losses.get_regularization_losses',
-    'tf.losses.get_total_loss': 'tf.compat.v1.losses.get_total_loss',
-    'tf.losses.hinge_loss': 'tf.compat.v1.losses.hinge_loss',
-    'tf.losses.huber_loss': 'tf.compat.v1.losses.huber_loss',
-    'tf.losses.log_loss': 'tf.compat.v1.losses.log_loss',
-    'tf.losses.mean_pairwise_squared_error': 'tf.compat.v1.losses.mean_pairwise_squared_error',
-    'tf.losses.mean_squared_error': 'tf.compat.v1.losses.mean_squared_error',
-    'tf.losses.sigmoid_cross_entropy': 'tf.compat.v1.losses.sigmoid_cross_entropy',
-    'tf.losses.softmax_cross_entropy': 'tf.compat.v1.losses.softmax_cross_entropy',
-    'tf.losses.sparse_softmax_cross_entropy': 'tf.compat.v1.losses.sparse_softmax_cross_entropy',
-    'tf.make_template': 'tf.compat.v1.make_template',
-    'tf.make_tensor_proto': 'tf.compat.v1.make_tensor_proto',
-    'tf.manip.gather_nd': 'tf.gather_nd',
-    'tf.manip.reshape': 'tf.reshape',
-    'tf.manip.reverse': 'tf.reverse',
-    'tf.manip.roll': 'tf.roll',
-    'tf.manip.scatter_nd': 'tf.scatter_nd',
-    'tf.manip.space_to_batch_nd': 'tf.space_to_batch_nd',
-    'tf.manip.tile': 'tf.tile',
-    'tf.matching_files': 'tf.io.matching_files',
-    'tf.matrix_band_part': 'tf.linalg.band_part',
-    'tf.matrix_determinant': 'tf.linalg.det',
-    'tf.matrix_diag': 'tf.linalg.diag',
-    'tf.matrix_diag_part': 'tf.linalg.diag_part',
-    'tf.matrix_inverse': 'tf.linalg.inv',
-    'tf.matrix_set_diag': 'tf.linalg.set_diag',
-    'tf.matrix_solve': 'tf.linalg.solve',
-    'tf.matrix_solve_ls': 'tf.linalg.lstsq',
-    'tf.matrix_transpose': 'tf.linalg.transpose',
-    'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
-    'tf.metrics.accuracy': 'tf.compat.v1.metrics.accuracy',
-    'tf.metrics.auc': 'tf.compat.v1.metrics.auc',
-    'tf.metrics.average_precision_at_k': 'tf.compat.v1.metrics.average_precision_at_k',
-    'tf.metrics.false_negatives': 'tf.compat.v1.metrics.false_negatives',
-    'tf.metrics.false_negatives_at_thresholds': 'tf.compat.v1.metrics.false_negatives_at_thresholds',
-    'tf.metrics.false_positives': 'tf.compat.v1.metrics.false_positives',
-    'tf.metrics.false_positives_at_thresholds': 'tf.compat.v1.metrics.false_positives_at_thresholds',
-    'tf.metrics.mean': 'tf.compat.v1.metrics.mean',
-    'tf.metrics.mean_absolute_error': 'tf.compat.v1.metrics.mean_absolute_error',
-    'tf.metrics.mean_cosine_distance': 'tf.compat.v1.metrics.mean_cosine_distance',
-    'tf.metrics.mean_iou': 'tf.compat.v1.metrics.mean_iou',
-    'tf.metrics.mean_per_class_accuracy': 'tf.compat.v1.metrics.mean_per_class_accuracy',
-    'tf.metrics.mean_relative_error': 'tf.compat.v1.metrics.mean_relative_error',
-    'tf.metrics.mean_squared_error': 'tf.compat.v1.metrics.mean_squared_error',
-    'tf.metrics.mean_tensor': 'tf.compat.v1.metrics.mean_tensor',
-    'tf.metrics.percentage_below': 'tf.compat.v1.metrics.percentage_below',
-    'tf.metrics.precision': 'tf.compat.v1.metrics.precision',
-    'tf.metrics.precision_at_k': 'tf.compat.v1.metrics.precision_at_k',
-    'tf.metrics.precision_at_thresholds': 'tf.compat.v1.metrics.precision_at_thresholds',
-    'tf.metrics.precision_at_top_k': 'tf.compat.v1.metrics.precision_at_top_k',
-    'tf.metrics.recall': 'tf.compat.v1.metrics.recall',
-    'tf.metrics.recall_at_k': 'tf.compat.v1.metrics.recall_at_k',
-    'tf.metrics.recall_at_thresholds': 'tf.compat.v1.metrics.recall_at_thresholds',
-    'tf.metrics.recall_at_top_k': 'tf.compat.v1.metrics.recall_at_top_k',
-    'tf.metrics.root_mean_squared_error': 'tf.compat.v1.metrics.root_mean_squared_error',
-    'tf.metrics.sensitivity_at_specificity': 'tf.compat.v1.metrics.sensitivity_at_specificity',
-    'tf.metrics.sparse_average_precision_at_k': 'tf.compat.v1.metrics.sparse_average_precision_at_k',
-    'tf.metrics.sparse_precision_at_k': 'tf.compat.v1.metrics.sparse_precision_at_k',
-    'tf.metrics.specificity_at_sensitivity': 'tf.compat.v1.metrics.specificity_at_sensitivity',
-    'tf.metrics.true_negatives': 'tf.compat.v1.metrics.true_negatives',
-    'tf.metrics.true_negatives_at_thresholds': 'tf.compat.v1.metrics.true_negatives_at_thresholds',
-    'tf.metrics.true_positives': 'tf.compat.v1.metrics.true_positives',
-    'tf.metrics.true_positives_at_thresholds': 'tf.compat.v1.metrics.true_positives_at_thresholds',
-    'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
-    'tf.model_variables': 'tf.compat.v1.model_variables',
-    'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
-    'tf.nn.bidirectional_dynamic_rnn': 'tf.compat.v1.nn.bidirectional_dynamic_rnn',
-    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
-    'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
-    'tf.nn.ctc_loss_v2': 'tf.nn.ctc_loss',
-    'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
-    'tf.nn.depthwise_conv2d_native_backprop_filter': 'tf.nn.depthwise_conv2d_backprop_filter',
-    'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
-    'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
-    'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
-    'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
-    'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
-    'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
-    'tf.nn.quantized_relu_x': 'tf.compat.v1.nn.quantized_relu_x',
-    'tf.nn.raw_rnn': 'tf.compat.v1.nn.raw_rnn',
-    'tf.nn.relu_layer': 'tf.compat.v1.nn.relu_layer',
-    'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
-    'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
-    'tf.nn.rnn_cell.DropoutWrapper': 'tf.compat.v1.nn.rnn_cell.DropoutWrapper',
-    'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
-    'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
-    'tf.nn.rnn_cell.MultiRNNCell': 'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
-    'tf.nn.static_bidirectional_rnn': 'tf.compat.v1.nn.static_bidirectional_rnn',
-    'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
-    'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
-    'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
-    'tf.op_scope': 'tf.compat.v1.op_scope',
-    'tf.orthogonal_initializer': 'tf.compat.v1.orthogonal_initializer',
-    'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
-    'tf.parse_tensor': 'tf.io.parse_tensor',
-    'tf.placeholder': 'tf.compat.v1.placeholder',
-    'tf.placeholder_with_default': 'tf.compat.v1.placeholder_with_default',
-    'tf.polygamma': 'tf.math.polygamma',
-    'tf.profiler.AdviceProto': 'tf.compat.v1.profiler.AdviceProto',
-    'tf.profiler.GraphNodeProto': 'tf.compat.v1.profiler.GraphNodeProto',
-    'tf.profiler.MultiGraphNodeProto': 'tf.compat.v1.profiler.MultiGraphNodeProto',
-    'tf.profiler.OpLogProto': 'tf.compat.v1.profiler.OpLogProto',
-    'tf.profiler.ProfileOptionBuilder': 'tf.compat.v1.profiler.ProfileOptionBuilder',
-    'tf.profiler.Profiler': 'tf.compat.v1.profiler.Profiler',
-    'tf.profiler.advise': 'tf.compat.v1.profiler.advise',
-    'tf.profiler.profile': 'tf.compat.v1.profiler.profile',
-    'tf.profiler.write_op_log': 'tf.compat.v1.profiler.write_op_log',
-    'tf.py_func': 'tf.compat.v1.py_func',
-    'tf.python_io.TFRecordCompressionType': 'tf.io.TFRecordCompressionType',
-    'tf.python_io.TFRecordOptions': 'tf.io.TFRecordOptions',
-    'tf.python_io.TFRecordWriter': 'tf.io.TFRecordWriter',
-    'tf.python_io.tf_record_iterator': 'tf.compat.v1.python_io.tf_record_iterator',
-    'tf.qr': 'tf.linalg.qr',
-    'tf.quantize': 'tf.quantization.quantize',
-    'tf.quantized_concat': 'tf.quantization.quantized_concat',
-    'tf.ragged.RaggedTensorValue': 'tf.compat.v1.ragged.RaggedTensorValue',
-    'tf.ragged.constant_value': 'tf.compat.v1.ragged.constant_value',
-    'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
-    'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
-    'tf.random_crop': 'tf.image.random_crop',
-    'tf.random_gamma': 'tf.random.gamma',
-    'tf.random_normal': 'tf.random.normal',
-    'tf.random_shuffle': 'tf.random.shuffle',
-    'tf.random_uniform': 'tf.random.uniform',
-    'tf.read_file': 'tf.io.read_file',
-    'tf.real': 'tf.math.real',
-    'tf.reciprocal': 'tf.math.reciprocal',
-    'tf.regex_replace': 'tf.strings.regex_replace',
-    'tf.report_uninitialized_variables': 'tf.compat.v1.report_uninitialized_variables',
-    'tf.reset_default_graph': 'tf.compat.v1.reset_default_graph',
-    'tf.resource_loader.get_data_files_path': 'tf.compat.v1.resource_loader.get_data_files_path',
-    'tf.resource_loader.get_path_to_datafile': 'tf.compat.v1.resource_loader.get_path_to_datafile',
-    'tf.resource_loader.get_root_dir_with_all_resources': 'tf.compat.v1.resource_loader.get_root_dir_with_all_resources',
-    'tf.resource_loader.load_resource': 'tf.compat.v1.resource_loader.load_resource',
-    'tf.resource_loader.readahead_file_path': 'tf.compat.v1.resource_loader.readahead_file_path',
-    'tf.reverse_v2': 'tf.reverse',
-    'tf.rint': 'tf.math.rint',
-    'tf.rsqrt': 'tf.math.rsqrt',
-    'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
-    'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
-    'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
-    'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
-    'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
-    'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
-    'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
-    'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
-    'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
-    'tf.saved_model.constants.MAIN_OP_KEY': 'tf.compat.v1.saved_model.constants.MAIN_OP_KEY',
-    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PB': 'tf.saved_model.SAVED_MODEL_FILENAME_PB',
-    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PBTXT': 'tf.saved_model.SAVED_MODEL_FILENAME_PBTXT',
-    'tf.saved_model.constants.SAVED_MODEL_SCHEMA_VERSION': 'tf.saved_model.SAVED_MODEL_SCHEMA_VERSION',
-    'tf.saved_model.constants.VARIABLES_DIRECTORY': 'tf.saved_model.VARIABLES_DIRECTORY',
-    'tf.saved_model.constants.VARIABLES_FILENAME': 'tf.saved_model.VARIABLES_FILENAME',
-    'tf.saved_model.experimental.save': 'tf.saved_model.save',
-    'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
-    'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
-    'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
-    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
-    'tf.saved_model.main_op.main_op': 'tf.compat.v1.saved_model.main_op.main_op',
-    'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
-    'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
-    'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
-    'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
-    'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
-    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
-    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES': 'tf.saved_model.CLASSIFY_OUTPUT_SCORES',
-    'tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY': 'tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY',
-    'tf.saved_model.signature_constants.PREDICT_INPUTS': 'tf.saved_model.PREDICT_INPUTS',
-    'tf.saved_model.signature_constants.PREDICT_METHOD_NAME': 'tf.saved_model.PREDICT_METHOD_NAME',
-    'tf.saved_model.signature_constants.PREDICT_OUTPUTS': 'tf.saved_model.PREDICT_OUTPUTS',
-    'tf.saved_model.signature_constants.REGRESS_INPUTS': 'tf.saved_model.REGRESS_INPUTS',
-    'tf.saved_model.signature_constants.REGRESS_METHOD_NAME': 'tf.saved_model.REGRESS_METHOD_NAME',
-    'tf.saved_model.signature_constants.REGRESS_OUTPUTS': 'tf.saved_model.REGRESS_OUTPUTS',
-    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.saved_model.build_signature_def',
-    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.saved_model.classification_signature_def',
-    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.saved_model.is_valid_signature',
-    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.saved_model.predict_signature_def',
-    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.saved_model.regression_signature_def',
-    'tf.saved_model.simple_save': 'tf.compat.v1.saved_model.simple_save',
-    'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
-    'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
-    'tf.saved_model.tag_constants.TPU': 'tf.saved_model.TPU',
-    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRANING',
-    'tf.saved_model.utils.build_tensor_info': 'tf.compat.v1.saved_model.utils.build_tensor_info',
-    'tf.saved_model.utils.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
-    'tf.scatter_add': 'tf.compat.v1.scatter_add',
-    'tf.scatter_nd_add': 'tf.compat.v1.scatter_nd_add',
-    'tf.scatter_nd_sub': 'tf.compat.v1.scatter_nd_sub',
-    'tf.scatter_nd_update': 'tf.compat.v1.scatter_nd_update',
-    'tf.scatter_sub': 'tf.compat.v1.scatter_sub',
-    'tf.scatter_update': 'tf.compat.v1.scatter_update',
-    'tf.segment_max': 'tf.math.segment_max',
-    'tf.segment_mean': 'tf.math.segment_mean',
-    'tf.segment_min': 'tf.math.segment_min',
-    'tf.segment_prod': 'tf.math.segment_prod',
-    'tf.segment_sum': 'tf.math.segment_sum',
-    'tf.self_adjoint_eig': 'tf.linalg.eigh',
-    'tf.self_adjoint_eigvals': 'tf.linalg.eigvalsh',
-    'tf.serialize_many_sparse': 'tf.compat.v1.serialize_many_sparse',
-    'tf.serialize_sparse': 'tf.compat.v1.serialize_sparse',
-    'tf.serialize_tensor': 'tf.io.serialize_tensor',
-    'tf.set_random_seed': 'tf.compat.v1.set_random_seed',
-    'tf.setdiff1d': 'tf.compat.v1.setdiff1d',
-    'tf.sets.set_difference': 'tf.sets.difference',
-    'tf.sets.set_intersection': 'tf.sets.intersection',
-    'tf.sets.set_size': 'tf.sets.size',
-    'tf.sets.set_union': 'tf.sets.union',
-    'tf.space_to_depth': 'tf.compat.v1.space_to_depth',
-    'tf.sparse.matmul': 'tf.sparse.sparse_dense_matmul',
-    'tf.sparse.merge': 'tf.compat.v1.sparse.merge',
-    'tf.sparse.placeholder': 'tf.compat.v1.sparse.placeholder',
-    'tf.sparse.reduce_max_sparse': 'tf.compat.v1.sparse.reduce_max_sparse',
-    'tf.sparse.reduce_sum_sparse': 'tf.compat.v1.sparse.reduce_sum_sparse',
-    'tf.sparse_fill_empty_rows': 'tf.sparse.fill_empty_rows',
-    'tf.sparse_mask': 'tf.sparse.mask',
-    'tf.sparse_maximum': 'tf.sparse.maximum',
-    'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
-    'tf.sparse_minimum': 'tf.sparse.minimum',
-    'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
-    'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
-    'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
-    'tf.sparse_reorder': 'tf.sparse.reorder',
-    'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
-    'tf.sparse_reshape': 'tf.sparse.reshape',
-    'tf.sparse_retain': 'tf.sparse.retain',
-    'tf.sparse_segment_mean': 'tf.compat.v1.sparse_segment_mean',
-    'tf.sparse_segment_sqrt_n': 'tf.compat.v1.sparse_segment_sqrt_n',
-    'tf.sparse_segment_sum': 'tf.compat.v1.sparse_segment_sum',
-    'tf.sparse_slice': 'tf.sparse.slice',
-    'tf.sparse_softmax': 'tf.sparse.softmax',
-    'tf.sparse_tensor_dense_matmul': 'tf.sparse.sparse_dense_matmul',
-    'tf.sparse_tensor_to_dense': 'tf.sparse.to_dense',
-    'tf.sparse_to_dense': 'tf.compat.v1.sparse_to_dense',
-    'tf.sparse_to_indicator': 'tf.sparse.to_indicator',
-    'tf.sparse_transpose': 'tf.sparse.transpose',
-    'tf.spectral.dct': 'tf.signal.dct',
-    'tf.spectral.fft': 'tf.signal.fft',
-    'tf.spectral.fft2d': 'tf.signal.fft2d',
-    'tf.spectral.fft3d': 'tf.signal.fft3d',
-    'tf.spectral.idct': 'tf.signal.idct',
-    'tf.spectral.ifft': 'tf.signal.ifft',
-    'tf.spectral.ifft2d': 'tf.signal.ifft2d',
-    'tf.spectral.ifft3d': 'tf.signal.ifft3d',
-    'tf.spectral.irfft': 'tf.signal.irfft',
-    'tf.spectral.irfft2d': 'tf.signal.irfft2d',
-    'tf.spectral.irfft3d': 'tf.signal.irfft3d',
-    'tf.spectral.rfft': 'tf.signal.rfft',
-    'tf.spectral.rfft2d': 'tf.signal.rfft2d',
-    'tf.spectral.rfft3d': 'tf.signal.rfft3d',
-    'tf.squared_difference': 'tf.math.squared_difference',
-    'tf.string_join': 'tf.strings.join',
-    'tf.string_strip': 'tf.strings.strip',
-    'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
-    'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
-    'tf.summary.Event': 'tf.compat.v1.summary.Event',
-    'tf.summary.FileWriter': 'tf.compat.v1.summary.FileWriter',
-    'tf.summary.FileWriterCache': 'tf.compat.v1.summary.FileWriterCache',
-    'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
-    'tf.summary.audio': 'tf.compat.v1.summary.audio',
-    'tf.summary.Summary': 'tf.compat.v1.summary.Summary',
-    'tf.summary.SummaryDescription': 'tf.compat.v1.summary.SummaryDescription',
-    'tf.summary.TaggedRunMetadata': 'tf.compat.v1.summary.TaggedRunMetadata',
-    'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
-    'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
-    'tf.summary.image': 'tf.compat.v1.summary.image',
-    'tf.summary.merge': 'tf.compat.v1.summary.merge',
-    'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
-    'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
-    'tf.summary.tensor_summary': 'tf.compat.v1.summary.tensor_summary',
-    'tf.summary.text': 'tf.compat.v1.summary.text',
-    'tf.svd': 'tf.linalg.svd',
-    'tf.tables_initializer': 'tf.compat.v1.tables_initializer',
-    'tf.test.StubOutForTesting': 'tf.compat.v1.test.StubOutForTesting',
-    'tf.test.compute_gradient': 'tf.compat.v1.test.compute_gradient',
-    'tf.test.compute_gradient_error': 'tf.compat.v1.test.compute_gradient_error',
-    'tf.test.get_temp_dir': 'tf.compat.v1.test.get_temp_dir',
-    'tf.test.mock': 'tf.compat.v1.test.mock',
-    'tf.test.test_src_dir_path': 'tf.compat.v1.test.test_src_dir_path',
-    'tf.to_bfloat16': 'tf.compat.v1.to_bfloat16',
-    'tf.to_complex128': 'tf.compat.v1.to_complex128',
-    'tf.to_complex64': 'tf.compat.v1.to_complex64',
-    'tf.to_double': 'tf.compat.v1.to_double',
-    'tf.to_float': 'tf.compat.v1.to_float',
-    'tf.to_int32': 'tf.compat.v1.to_int32',
-    'tf.to_int64': 'tf.compat.v1.to_int64',
-    'tf.trace': 'tf.linalg.trace',
-    'tf.train.AdadeltaOptimizer': 'tf.compat.v1.train.AdadeltaOptimizer',
-    'tf.train.AdagradDAOptimizer': 'tf.compat.v1.train.AdagradDAOptimizer',
-    'tf.train.AdagradOptimizer': 'tf.compat.v1.train.AdagradOptimizer',
-    'tf.train.AdamOptimizer': 'tf.compat.v1.train.AdamOptimizer',
-    'tf.train.CheckpointSaverHook': 'tf.estimator.CheckpointSaverHook',
-    'tf.train.CheckpointSaverListener': 'tf.estimator.CheckpointSaverListener',
-    'tf.train.ChiefSessionCreator': 'tf.compat.v1.train.ChiefSessionCreator',
-    'tf.train.FeedFnHook': 'tf.estimator.FeedFnHook',
-    'tf.train.FinalOpsHook': 'tf.estimator.FinalOpsHook',
-    'tf.train.FtrlOptimizer': 'tf.compat.v1.train.FtrlOptimizer',
-    'tf.train.GlobalStepWaiterHook': 'tf.estimator.GlobalStepWaiterHook',
-    'tf.train.GradientDescentOptimizer': 'tf.compat.v1.train.GradientDescentOptimizer',
-    'tf.train.LoggingTensorHook': 'tf.estimator.LoggingTensorHook',
-    'tf.train.LooperThread': 'tf.compat.v1.train.LooperThread',
-    'tf.train.MomentumOptimizer': 'tf.compat.v1.train.MomentumOptimizer',
-    'tf.train.MonitoredSession': 'tf.compat.v1.train.MonitoredSession',
-    'tf.train.MonitoredTrainingSession': 'tf.compat.v1.train.MonitoredTrainingSession',
-    'tf.train.NanLossDuringTrainingError': 'tf.estimator.NanLossDuringTrainingError',
-    'tf.train.NanTensorHook': 'tf.estimator.NanTensorHook',
-    'tf.train.NewCheckpointReader': 'tf.compat.v1.train.NewCheckpointReader',
-    'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
-    'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
-    'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
-    'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
-    'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
-    'tf.train.Saver': 'tf.compat.v1.train.Saver',
-    'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
-    'tf.train.Scaffold': 'tf.compat.v1.train.Scaffold',
-    'tf.train.SecondOrStepTimer': 'tf.estimator.SecondOrStepTimer',
-    'tf.train.Server': 'tf.distribute.Server',
-    'tf.train.SessionCreator': 'tf.compat.v1.train.SessionCreator',
-    'tf.train.SessionManager': 'tf.compat.v1.train.SessionManager',
-    'tf.train.SessionRunArgs': 'tf.estimator.SessionRunArgs',
-    'tf.train.SessionRunContext': 'tf.estimator.SessionRunContext',
-    'tf.train.SessionRunHook': 'tf.estimator.SessionRunHook',
-    'tf.train.SessionRunValues': 'tf.estimator.SessionRunValues',
-    'tf.train.SingularMonitoredSession': 'tf.compat.v1.train.SingularMonitoredSession',
-    'tf.train.StepCounterHook': 'tf.estimator.StepCounterHook',
-    'tf.train.StopAtStepHook': 'tf.estimator.StopAtStepHook',
-    'tf.train.SummarySaverHook': 'tf.estimator.SummarySaverHook',
-    'tf.train.Supervisor': 'tf.compat.v1.train.Supervisor',
-    'tf.train.SyncReplicasOptimizer': 'tf.compat.v1.train.SyncReplicasOptimizer',
-    'tf.train.VocabInfo': 'tf.estimator.VocabInfo',
-    'tf.train.WorkerSessionCreator': 'tf.compat.v1.train.WorkerSessionCreator',
-    'tf.train.add_queue_runner': 'tf.compat.v1.train.add_queue_runner',
-    'tf.train.assert_global_step': 'tf.compat.v1.train.assert_global_step',
-    'tf.train.basic_train_loop': 'tf.compat.v1.train.basic_train_loop',
-    'tf.train.batch': 'tf.compat.v1.train.batch',
-    'tf.train.batch_join': 'tf.compat.v1.train.batch_join',
-    'tf.train.checkpoint_exists': 'tf.compat.v1.train.checkpoint_exists',
-    'tf.train.create_global_step': 'tf.compat.v1.train.create_global_step',
-    'tf.train.do_quantize_training_on_graphdef': 'tf.compat.v1.train.do_quantize_training_on_graphdef',
-    'tf.train.export_meta_graph': 'tf.compat.v1.train.export_meta_graph',
-    'tf.train.generate_checkpoint_state_proto': 'tf.compat.v1.train.generate_checkpoint_state_proto',
-    'tf.train.get_checkpoint_mtimes': 'tf.compat.v1.train.get_checkpoint_mtimes',
-    'tf.train.get_global_step': 'tf.compat.v1.train.get_global_step',
-    'tf.train.get_or_create_global_step': 'tf.compat.v1.train.get_or_create_global_step',
-    'tf.train.global_step': 'tf.compat.v1.train.global_step',
-    'tf.train.import_meta_graph': 'tf.compat.v1.train.import_meta_graph',
-    'tf.train.init_from_checkpoint': 'tf.compat.v1.train.init_from_checkpoint',
-    'tf.train.input_producer': 'tf.compat.v1.train.input_producer',
-    'tf.train.limit_epochs': 'tf.compat.v1.train.limit_epochs',
-    'tf.train.match_filenames_once': 'tf.io.match_filenames_once',
-    'tf.train.maybe_batch': 'tf.compat.v1.train.maybe_batch',
-    'tf.train.maybe_batch_join': 'tf.compat.v1.train.maybe_batch_join',
-    'tf.train.maybe_shuffle_batch': 'tf.compat.v1.train.maybe_shuffle_batch',
-    'tf.train.maybe_shuffle_batch_join': 'tf.compat.v1.train.maybe_shuffle_batch_join',
-    'tf.train.piecewise_constant': 'tf.compat.v1.train.piecewise_constant',
-    'tf.train.queue_runner.QueueRunner': 'tf.compat.v1.train.queue_runner.QueueRunner',
-    'tf.train.queue_runner.add_queue_runner': 'tf.compat.v1.train.queue_runner.add_queue_runner',
-    'tf.train.queue_runner.start_queue_runners': 'tf.compat.v1.train.queue_runner.start_queue_runners',
-    'tf.train.range_input_producer': 'tf.compat.v1.train.range_input_producer',
-    'tf.train.remove_checkpoint': 'tf.compat.v1.train.remove_checkpoint',
-    'tf.train.replica_device_setter': 'tf.compat.v1.train.replica_device_setter',
-    'tf.train.shuffle_batch': 'tf.compat.v1.train.shuffle_batch',
-    'tf.train.shuffle_batch_join': 'tf.compat.v1.train.shuffle_batch_join',
-    'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
-    'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
-    'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
-    'tf.train.summary_iterator': 'tf.compat.v1.train.summary_iterator',
-    'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
-    'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
-    'tf.train.write_graph': 'tf.io.write_graph',
-    'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
-    'tf.truncated_normal': 'tf.random.truncated_normal',
-    'tf.uniform_unit_scaling_initializer': 'tf.compat.v1.uniform_unit_scaling_initializer',
-    'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
-    'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
-    'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
-    'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
-    'tf.unsorted_segment_sqrt_n': 'tf.math.unsorted_segment_sqrt_n',
-    'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
-    'tf.variable_axis_size_partitioner': 'tf.compat.v1.variable_axis_size_partitioner',
-    'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
-    'tf.variable_scope': 'tf.compat.v1.variable_scope',
-    'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
-    'tf.variance_scaling_initializer': 'tf.compat.v1.variance_scaling_initializer',
-    'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
-    'tf.wrap_function': 'tf.compat.v1.wrap_function',
-    'tf.write_file': 'tf.io.write_file',
-    'tf.zeta': 'tf.math.zeta'
+    'tf.AUTO_REUSE':
+        'tf.compat.v1.AUTO_REUSE',
+    'tf.AttrValue':
+        'tf.compat.v1.AttrValue',
+    'tf.COMPILER_VERSION':
+        'tf.version.COMPILER_VERSION',
+    'tf.CXX11_ABI_FLAG':
+        'tf.sysconfig.CXX11_ABI_FLAG',
+    'tf.ConditionalAccumulator':
+        'tf.compat.v1.ConditionalAccumulator',
+    'tf.ConditionalAccumulatorBase':
+        'tf.compat.v1.ConditionalAccumulatorBase',
+    'tf.ConfigProto':
+        'tf.compat.v1.ConfigProto',
+    'tf.DeviceSpec':
+        'tf.compat.v1.DeviceSpec',
+    'tf.Dimension':
+        'tf.compat.v1.Dimension',
+    'tf.Event':
+        'tf.compat.v1.Event',
+    'tf.FIFOQueue':
+        'tf.queue.FIFOQueue',
+    'tf.FixedLenFeature':
+        'tf.io.FixedLenFeature',
+    'tf.FixedLenSequenceFeature':
+        'tf.io.FixedLenSequenceFeature',
+    'tf.FixedLengthRecordReader':
+        'tf.compat.v1.FixedLengthRecordReader',
+    'tf.GIT_VERSION':
+        'tf.version.GIT_VERSION',
+    'tf.GPUOptions':
+        'tf.compat.v1.GPUOptions',
+    'tf.GRAPH_DEF_VERSION':
+        'tf.version.GRAPH_DEF_VERSION',
+    'tf.GRAPH_DEF_VERSION_MIN_CONSUMER':
+        'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
+    'tf.GRAPH_DEF_VERSION_MIN_PRODUCER':
+        'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
+    'tf.GraphDef':
+        'tf.compat.v1.GraphDef',
+    'tf.GraphKeys':
+        'tf.compat.v1.GraphKeys',
+    'tf.GraphOptions':
+        'tf.compat.v1.GraphOptions',
+    'tf.HistogramProto':
+        'tf.compat.v1.HistogramProto',
+    'tf.IdentityReader':
+        'tf.compat.v1.IdentityReader',
+    'tf.InteractiveSession':
+        'tf.compat.v1.InteractiveSession',
+    'tf.LMDBReader':
+        'tf.compat.v1.LMDBReader',
+    'tf.LogMessage':
+        'tf.compat.v1.LogMessage',
+    'tf.MONOLITHIC_BUILD':
+        'tf.sysconfig.MONOLITHIC_BUILD',
+    'tf.MetaGraphDef':
+        'tf.compat.v1.MetaGraphDef',
+    'tf.NameAttrList':
+        'tf.compat.v1.NameAttrList',
+    'tf.NoGradient':
+        'tf.no_gradient',
+    'tf.NodeDef':
+        'tf.compat.v1.NodeDef',
+    'tf.NotDifferentiable':
+        'tf.no_gradient',
+    'tf.OpError':
+        'tf.errors.OpError',
+    'tf.OptimizerOptions':
+        'tf.compat.v1.OptimizerOptions',
+    'tf.PaddingFIFOQueue':
+        'tf.queue.PaddingFIFOQueue',
+    'tf.Print':
+        'tf.compat.v1.Print',
+    'tf.PriorityQueue':
+        'tf.queue.PriorityQueue',
+    'tf.QUANTIZED_DTYPES':
+        'tf.dtypes.QUANTIZED_DTYPES',
+    'tf.QueueBase':
+        'tf.queue.QueueBase',
+    'tf.RandomShuffleQueue':
+        'tf.queue.RandomShuffleQueue',
+    'tf.ReaderBase':
+        'tf.compat.v1.ReaderBase',
+    'tf.RunMetadata':
+        'tf.compat.v1.RunMetadata',
+    'tf.RunOptions':
+        'tf.compat.v1.RunOptions',
+    'tf.Session':
+        'tf.compat.v1.Session',
+    'tf.SessionLog':
+        'tf.compat.v1.SessionLog',
+    'tf.SparseConditionalAccumulator':
+        'tf.sparse.SparseConditionalAccumulator',
+    'tf.SparseFeature':
+        'tf.io.SparseFeature',
+    'tf.SparseTensorValue':
+        'tf.compat.v1.SparseTensorValue',
+    'tf.Summary':
+        'tf.compat.v1.Summary',
+    'tf.SummaryMetadata':
+        'tf.compat.v1.SummaryMetadata',
+    'tf.TFRecordReader':
+        'tf.compat.v1.TFRecordReader',
+    'tf.TensorInfo':
+        'tf.compat.v1.TensorInfo',
+    'tf.TextLineReader':
+        'tf.compat.v1.TextLineReader',
+    'tf.VERSION':
+        'tf.version.VERSION',
+    'tf.VarLenFeature':
+        'tf.io.VarLenFeature',
+    'tf.VariableScope':
+        'tf.compat.v1.VariableScope',
+    'tf.WholeFileReader':
+        'tf.compat.v1.WholeFileReader',
+    'tf.accumulate_n':
+        'tf.math.accumulate_n',
+    'tf.add_check_numerics_ops':
+        'tf.compat.v1.add_check_numerics_ops',
+    'tf.add_to_collection':
+        'tf.compat.v1.add_to_collection',
+    'tf.add_to_collections':
+        'tf.compat.v1.add_to_collections',
+    'tf.all_variables':
+        'tf.compat.v1.all_variables',
+    'tf.angle':
+        'tf.math.angle',
+    'tf.app.run':
+        'tf.compat.v1.app.run',
+    'tf.assert_greater_equal':
+        'tf.compat.v1.assert_greater_equal',
+    'tf.assert_integer':
+        'tf.compat.v1.assert_integer',
+    'tf.assert_less_equal':
+        'tf.compat.v1.assert_less_equal',
+    'tf.assert_near':
+        'tf.compat.v1.assert_near',
+    'tf.assert_negative':
+        'tf.compat.v1.assert_negative',
+    'tf.assert_non_negative':
+        'tf.compat.v1.assert_non_negative',
+    'tf.assert_non_positive':
+        'tf.compat.v1.assert_non_positive',
+    'tf.assert_none_equal':
+        'tf.compat.v1.assert_none_equal',
+    'tf.assert_positive':
+        'tf.compat.v1.assert_positive',
+    'tf.assert_proper_iterable':
+        'tf.debugging.assert_proper_iterable',
+    'tf.assert_rank_at_least':
+        'tf.compat.v1.assert_rank_at_least',
+    'tf.assert_rank_in':
+        'tf.compat.v1.assert_rank_in',
+    'tf.assert_same_float_dtype':
+        'tf.debugging.assert_same_float_dtype',
+    'tf.assert_scalar':
+        'tf.compat.v1.assert_scalar',
+    'tf.assert_type':
+        'tf.compat.v1.assert_type',
+    'tf.assert_variables_initialized':
+        'tf.compat.v1.assert_variables_initialized',
+    'tf.assign':
+        'tf.compat.v1.assign',
+    'tf.assign_add':
+        'tf.compat.v1.assign_add',
+    'tf.assign_sub':
+        'tf.compat.v1.assign_sub',
+    'tf.batch_scatter_update':
+        'tf.compat.v1.batch_scatter_update',
+    'tf.betainc':
+        'tf.math.betainc',
+    'tf.ceil':
+        'tf.math.ceil',
+    'tf.check_numerics':
+        'tf.debugging.check_numerics',
+    'tf.cholesky':
+        'tf.linalg.cholesky',
+    'tf.cholesky_solve':
+        'tf.linalg.cholesky_solve',
+    'tf.clip_by_average_norm':
+        'tf.compat.v1.clip_by_average_norm',
+    'tf.colocate_with':
+        'tf.compat.v1.colocate_with',
+    'tf.conj':
+        'tf.math.conj',
+    'tf.container':
+        'tf.compat.v1.container',
+    'tf.convert_to_tensor_or_indexed_slices':
+        'tf.compat.v1.convert_to_tensor_or_indexed_slices',
+    'tf.convert_to_tensor_or_sparse_tensor':
+        'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
+    'tf.count_up_to':
+        'tf.compat.v1.count_up_to',
+    'tf.create_partitioned_variables':
+        'tf.compat.v1.create_partitioned_variables',
+    'tf.cross':
+        'tf.linalg.cross',
+    'tf.cumprod':
+        'tf.math.cumprod',
+    'tf.data.make_initializable_iterator':
+        'tf.compat.v1.data.make_initializable_iterator',
+    'tf.data.make_one_shot_iterator':
+        'tf.compat.v1.data.make_one_shot_iterator',
+    'tf.debugging.is_finite':
+        'tf.math.is_finite',
+    'tf.debugging.is_inf':
+        'tf.math.is_inf',
+    'tf.debugging.is_nan':
+        'tf.math.is_nan',
+    'tf.debugging.is_non_decreasing':
+        'tf.math.is_non_decreasing',
+    'tf.debugging.is_strictly_increasing':
+        'tf.math.is_strictly_increasing',
+    'tf.decode_base64':
+        'tf.io.decode_base64',
+    'tf.decode_compressed':
+        'tf.io.decode_compressed',
+    'tf.decode_json_example':
+        'tf.io.decode_json_example',
+    'tf.decode_raw':
+        'tf.io.decode_raw',
+    'tf.delete_session_tensor':
+        'tf.compat.v1.delete_session_tensor',
+    'tf.depth_to_space':
+        'tf.compat.v1.depth_to_space',
+    'tf.dequantize':
+        'tf.quantization.dequantize',
+    'tf.deserialize_many_sparse':
+        'tf.io.deserialize_many_sparse',
+    'tf.diag':
+        'tf.linalg.tensor_diag',
+    'tf.diag_part':
+        'tf.linalg.tensor_diag_part',
+    'tf.digamma':
+        'tf.math.digamma',
+    'tf.dimension_at_index':
+        'tf.compat.dimension_at_index',
+    'tf.dimension_value':
+        'tf.compat.dimension_value',
+    'tf.disable_eager_execution':
+        'tf.compat.v1.disable_eager_execution',
+    'tf.disable_resource_variables':
+        'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_batch_normalization':
+        'tf.compat.v1.disable_v2_batch_normalization',
+    'tf.disable_v2_behavior':
+        'tf.compat.v1.disable_v2_behavior',
+    'tf.disable_v2_tensorshape':
+        'tf.compat.v1.disable_v2_tensorshape',
+    'tf.distribute.get_loss_reduction':
+        'tf.compat.v1.distribute.get_loss_reduction',
+    'tf.distributions.Bernoulli':
+        'tf.compat.v1.distributions.Bernoulli',
+    'tf.distributions.Beta':
+        'tf.compat.v1.distributions.Beta',
+    'tf.distributions.Categorical':
+        'tf.compat.v1.distributions.Categorical',
+    'tf.distributions.Dirichlet':
+        'tf.compat.v1.distributions.Dirichlet',
+    'tf.distributions.DirichletMultinomial':
+        'tf.compat.v1.distributions.DirichletMultinomial',
+    'tf.distributions.Distribution':
+        'tf.compat.v1.distributions.Distribution',
+    'tf.distributions.Exponential':
+        'tf.compat.v1.distributions.Exponential',
+    'tf.distributions.FULLY_REPARAMETERIZED':
+        'tf.compat.v1.distributions.FULLY_REPARAMETERIZED',
+    'tf.distributions.Gamma':
+        'tf.compat.v1.distributions.Gamma',
+    'tf.distributions.Laplace':
+        'tf.compat.v1.distributions.Laplace',
+    'tf.distributions.Multinomial':
+        'tf.compat.v1.distributions.Multinomial',
+    'tf.distributions.NOT_REPARAMETERIZED':
+        'tf.compat.v1.distributions.NOT_REPARAMETERIZED',
+    'tf.distributions.Normal':
+        'tf.compat.v1.distributions.Normal',
+    'tf.distributions.RegisterKL':
+        'tf.compat.v1.distributions.RegisterKL',
+    'tf.distributions.ReparameterizationType':
+        'tf.compat.v1.distributions.ReparameterizationType',
+    'tf.distributions.StudentT':
+        'tf.compat.v1.distributions.StudentT',
+    'tf.distributions.Uniform':
+        'tf.compat.v1.distributions.Uniform',
+    'tf.distributions.kl_divergence':
+        'tf.compat.v1.distributions.kl_divergence',
+    'tf.div':
+        'tf.compat.v1.div',
+    'tf.div_no_nan':
+        'tf.math.divide_no_nan',
+    'tf.dtypes.as_string':
+        'tf.strings.as_string',
+    'tf.enable_eager_execution':
+        'tf.compat.v1.enable_eager_execution',
+    'tf.enable_resource_variables':
+        'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_batch_normalization':
+        'tf.compat.v1.enable_v2_batch_normalization',
+    'tf.enable_v2_behavior':
+        'tf.compat.v1.enable_v2_behavior',
+    'tf.enable_v2_tensorshape':
+        'tf.compat.v1.enable_v2_tensorshape',
+    'tf.encode_base64':
+        'tf.io.encode_base64',
+    'tf.erf':
+        'tf.math.erf',
+    'tf.erfc':
+        'tf.math.erfc',
+    'tf.estimator.experimental.KMeans':
+        'tf.compat.v1.estimator.experimental.KMeans',
+    'tf.estimator.inputs.numpy_input_fn':
+        'tf.compat.v1.estimator.inputs.numpy_input_fn',
+    'tf.estimator.inputs.pandas_input_fn':
+        'tf.compat.v1.estimator.inputs.pandas_input_fn',
+    'tf.expm1':
+        'tf.math.expm1',
+    'tf.fake_quant_with_min_max_args':
+        'tf.quantization.fake_quant_with_min_max_args',
+    'tf.fake_quant_with_min_max_args_gradient':
+        'tf.quantization.fake_quant_with_min_max_args_gradient',
+    'tf.fake_quant_with_min_max_vars':
+        'tf.quantization.fake_quant_with_min_max_vars',
+    'tf.fake_quant_with_min_max_vars_gradient':
+        'tf.quantization.fake_quant_with_min_max_vars_gradient',
+    'tf.fake_quant_with_min_max_vars_per_channel':
+        'tf.quantization.fake_quant_with_min_max_vars_per_channel',
+    'tf.fake_quant_with_min_max_vars_per_channel_gradient':
+        'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
+    'tf.feature_column.input_layer':
+        'tf.compat.v1.feature_column.input_layer',
+    'tf.feature_column.linear_model':
+        'tf.compat.v1.feature_column.linear_model',
+    'tf.feature_column.shared_embedding_columns':
+        'tf.compat.v1.feature_column.shared_embedding_columns',
+    'tf.fft':
+        'tf.signal.fft',
+    'tf.fft2d':
+        'tf.signal.fft2d',
+    'tf.fft3d':
+        'tf.signal.fft3d',
+    'tf.fixed_size_partitioner':
+        'tf.compat.v1.fixed_size_partitioner',
+    'tf.floordiv':
+        'tf.math.floordiv',
+    'tf.get_collection':
+        'tf.compat.v1.get_collection',
+    'tf.get_collection_ref':
+        'tf.compat.v1.get_collection_ref',
+    'tf.get_default_graph':
+        'tf.compat.v1.get_default_graph',
+    'tf.get_default_session':
+        'tf.compat.v1.get_default_session',
+    'tf.get_local_variable':
+        'tf.compat.v1.get_local_variable',
+    'tf.get_seed':
+        'tf.compat.v1.get_seed',
+    'tf.get_session_handle':
+        'tf.compat.v1.get_session_handle',
+    'tf.get_session_tensor':
+        'tf.compat.v1.get_session_tensor',
+    'tf.get_variable':
+        'tf.compat.v1.get_variable',
+    'tf.get_variable_scope':
+        'tf.compat.v1.get_variable_scope',
+    'tf.gfile.FastGFile':
+        'tf.compat.v1.gfile.FastGFile',
+    'tf.global_norm':
+        'tf.linalg.global_norm',
+    'tf.global_variables':
+        'tf.compat.v1.global_variables',
+    'tf.global_variables_initializer':
+        'tf.compat.v1.global_variables_initializer',
+    'tf.glorot_normal_initializer':
+        'tf.compat.v1.glorot_normal_initializer',
+    'tf.glorot_uniform_initializer':
+        'tf.compat.v1.glorot_uniform_initializer',
+    'tf.graph_util.convert_variables_to_constants':
+        'tf.compat.v1.graph_util.convert_variables_to_constants',
+    'tf.graph_util.extract_sub_graph':
+        'tf.compat.v1.graph_util.extract_sub_graph',
+    'tf.graph_util.must_run_on_cpu':
+        'tf.compat.v1.graph_util.must_run_on_cpu',
+    'tf.graph_util.remove_training_nodes':
+        'tf.compat.v1.graph_util.remove_training_nodes',
+    'tf.graph_util.tensor_shape_from_node_def_name':
+        'tf.compat.v1.graph_util.tensor_shape_from_node_def_name',
+    'tf.ifft':
+        'tf.signal.ifft',
+    'tf.ifft2d':
+        'tf.signal.ifft2d',
+    'tf.ifft3d':
+        'tf.signal.ifft3d',
+    'tf.igamma':
+        'tf.math.igamma',
+    'tf.igammac':
+        'tf.math.igammac',
+    'tf.imag':
+        'tf.math.imag',
+    'tf.image.resize_area':
+        'tf.compat.v1.image.resize_area',
+    'tf.image.resize_bicubic':
+        'tf.compat.v1.image.resize_bicubic',
+    'tf.image.resize_bilinear':
+        'tf.compat.v1.image.resize_bilinear',
+    'tf.image.resize_image_with_pad':
+        'tf.compat.v1.image.resize_image_with_pad',
+    'tf.image.resize_nearest_neighbor':
+        'tf.compat.v1.image.resize_nearest_neighbor',
+    'tf.image.transpose_image':
+        'tf.compat.v1.image.transpose_image',
+    'tf.initialize_all_tables':
+        'tf.compat.v1.initialize_all_tables',
+    'tf.initialize_all_variables':
+        'tf.compat.v1.initialize_all_variables',
+    'tf.initialize_local_variables':
+        'tf.compat.v1.initialize_local_variables',
+    'tf.initialize_variables':
+        'tf.compat.v1.initialize_variables',
+    'tf.initializers.constant':
+        'tf.compat.v1.initializers.constant',
+    'tf.initializers.global_variables':
+        'tf.compat.v1.initializers.global_variables',
+    'tf.initializers.glorot_normal':
+        'tf.compat.v1.initializers.glorot_normal',
+    'tf.initializers.glorot_uniform':
+        'tf.compat.v1.initializers.glorot_uniform',
+    'tf.initializers.he_normal':
+        'tf.compat.v1.initializers.he_normal',
+    'tf.initializers.he_uniform':
+        'tf.compat.v1.initializers.he_uniform',
+    'tf.initializers.identity':
+        'tf.compat.v1.initializers.identity',
+    'tf.initializers.lecun_normal':
+        'tf.compat.v1.initializers.lecun_normal',
+    'tf.initializers.lecun_uniform':
+        'tf.compat.v1.initializers.lecun_uniform',
+    'tf.initializers.local_variables':
+        'tf.compat.v1.initializers.local_variables',
+    'tf.initializers.ones':
+        'tf.compat.v1.initializers.ones',
+    'tf.initializers.orthogonal':
+        'tf.compat.v1.initializers.orthogonal',
+    'tf.initializers.random_normal':
+        'tf.compat.v1.initializers.random_normal',
+    'tf.initializers.random_uniform':
+        'tf.compat.v1.initializers.random_uniform',
+    'tf.initializers.tables_initializer':
+        'tf.compat.v1.initializers.tables_initializer',
+    'tf.initializers.truncated_normal':
+        'tf.compat.v1.initializers.truncated_normal',
+    'tf.initializers.uniform_unit_scaling':
+        'tf.compat.v1.initializers.uniform_unit_scaling',
+    'tf.initializers.variables':
+        'tf.compat.v1.initializers.variables',
+    'tf.initializers.variance_scaling':
+        'tf.compat.v1.initializers.variance_scaling',
+    'tf.initializers.zeros':
+        'tf.compat.v1.initializers.zeros',
+    'tf.invert_permutation':
+        'tf.math.invert_permutation',
+    'tf.io.PaddingFIFOQueue':
+        'tf.queue.PaddingFIFOQueue',
+    'tf.io.PriorityQueue':
+        'tf.queue.PriorityQueue',
+    'tf.io.QueueBase':
+        'tf.queue.QueueBase',
+    'tf.io.RandomShuffleQueue':
+        'tf.queue.RandomShuffleQueue',
+    'tf.io.tf_record_iterator':
+        'tf.compat.v1.io.tf_record_iterator',
+    'tf.is_finite':
+        'tf.math.is_finite',
+    'tf.is_inf':
+        'tf.math.is_inf',
+    'tf.is_nan':
+        'tf.math.is_nan',
+    'tf.is_non_decreasing':
+        'tf.math.is_non_decreasing',
+    'tf.is_numeric_tensor':
+        'tf.debugging.is_numeric_tensor',
+    'tf.is_strictly_increasing':
+        'tf.math.is_strictly_increasing',
+    'tf.is_variable_initialized':
+        'tf.compat.v1.is_variable_initialized',
+    'tf.keras.backend.get_session':
+        'tf.compat.v1.keras.backend.get_session',
+    'tf.keras.initializers.normal':
+        'tf.compat.v1.keras.initializers.normal',
+    'tf.keras.initializers.random_normal':
+        'tf.compat.v1.keras.initializers.random_normal',
+    'tf.keras.initializers.random_uniform':
+        'tf.compat.v1.keras.initializers.random_uniform',
+    'tf.keras.initializers.truncated_normal':
+        'tf.compat.v1.keras.initializers.truncated_normal',
+    'tf.keras.initializers.uniform':
+        'tf.compat.v1.keras.initializers.uniform',
+    'tf.keras.layers.CuDNNGRU':
+        'tf.compat.v1.keras.layers.CuDNNGRU',
+    'tf.keras.layers.CuDNNLSTM':
+        'tf.compat.v1.keras.layers.CuDNNLSTM',
+    'tf.keras.losses.cosine':
+        'tf.keras.losses.cosine_similarity',
+    'tf.keras.losses.cosine_proximity':
+        'tf.keras.losses.cosine_similarity',
+    'tf.keras.metrics.cosine':
+        'tf.keras.losses.cosine_similarity',
+    'tf.keras.metrics.cosine_proximity':
+        'tf.keras.losses.cosine_similarity',
+    'tf.layers.AveragePooling1D':
+        'tf.compat.v1.layers.AveragePooling1D',
+    'tf.layers.AveragePooling2D':
+        'tf.compat.v1.layers.AveragePooling2D',
+    'tf.layers.AveragePooling3D':
+        'tf.compat.v1.layers.AveragePooling3D',
+    'tf.layers.BatchNormalization':
+        'tf.compat.v1.layers.BatchNormalization',
+    'tf.layers.Conv1D':
+        'tf.compat.v1.layers.Conv1D',
+    'tf.layers.Conv2D':
+        'tf.compat.v1.layers.Conv2D',
+    'tf.layers.Conv2DTranspose':
+        'tf.compat.v1.layers.Conv2DTranspose',
+    'tf.layers.Conv3D':
+        'tf.compat.v1.layers.Conv3D',
+    'tf.layers.Conv3DTranspose':
+        'tf.compat.v1.layers.Conv3DTranspose',
+    'tf.layers.Dense':
+        'tf.compat.v1.layers.Dense',
+    'tf.layers.Dropout':
+        'tf.compat.v1.layers.Dropout',
+    'tf.layers.Flatten':
+        'tf.compat.v1.layers.Flatten',
+    'tf.layers.InputSpec':
+        'tf.keras.layers.InputSpec',
+    'tf.layers.Layer':
+        'tf.compat.v1.layers.Layer',
+    'tf.layers.MaxPooling1D':
+        'tf.compat.v1.layers.MaxPooling1D',
+    'tf.layers.MaxPooling2D':
+        'tf.compat.v1.layers.MaxPooling2D',
+    'tf.layers.MaxPooling3D':
+        'tf.compat.v1.layers.MaxPooling3D',
+    'tf.layers.SeparableConv1D':
+        'tf.compat.v1.layers.SeparableConv1D',
+    'tf.layers.SeparableConv2D':
+        'tf.compat.v1.layers.SeparableConv2D',
+    'tf.layers.average_pooling1d':
+        'tf.compat.v1.layers.average_pooling1d',
+    'tf.layers.average_pooling2d':
+        'tf.compat.v1.layers.average_pooling2d',
+    'tf.layers.average_pooling3d':
+        'tf.compat.v1.layers.average_pooling3d',
+    'tf.layers.batch_normalization':
+        'tf.compat.v1.layers.batch_normalization',
+    'tf.layers.conv1d':
+        'tf.compat.v1.layers.conv1d',
+    'tf.layers.conv2d':
+        'tf.compat.v1.layers.conv2d',
+    'tf.layers.conv2d_transpose':
+        'tf.compat.v1.layers.conv2d_transpose',
+    'tf.layers.conv3d':
+        'tf.compat.v1.layers.conv3d',
+    'tf.layers.conv3d_transpose':
+        'tf.compat.v1.layers.conv3d_transpose',
+    'tf.layers.dense':
+        'tf.compat.v1.layers.dense',
+    'tf.layers.dropout':
+        'tf.compat.v1.layers.dropout',
+    'tf.layers.experimental.keras_style_scope':
+        'tf.compat.v1.layers.experimental.keras_style_scope',
+    'tf.layers.experimental.set_keras_style':
+        'tf.compat.v1.layers.experimental.set_keras_style',
+    'tf.layers.flatten':
+        'tf.compat.v1.layers.flatten',
+    'tf.layers.max_pooling1d':
+        'tf.compat.v1.layers.max_pooling1d',
+    'tf.layers.max_pooling2d':
+        'tf.compat.v1.layers.max_pooling2d',
+    'tf.layers.max_pooling3d':
+        'tf.compat.v1.layers.max_pooling3d',
+    'tf.layers.separable_conv1d':
+        'tf.compat.v1.layers.separable_conv1d',
+    'tf.layers.separable_conv2d':
+        'tf.compat.v1.layers.separable_conv2d',
+    'tf.lbeta':
+        'tf.math.lbeta',
+    'tf.lgamma':
+        'tf.math.lgamma',
+    'tf.lin_space':
+        'tf.linspace',
+    'tf.lite.TocoConverter':
+        'tf.compat.v1.lite.TocoConverter',
+    'tf.lite.toco_convert':
+        'tf.compat.v1.lite.toco_convert',
+    'tf.local_variables':
+        'tf.compat.v1.local_variables',
+    'tf.local_variables_initializer':
+        'tf.compat.v1.local_variables_initializer',
+    'tf.log':
+        'tf.math.log',
+    'tf.log1p':
+        'tf.math.log1p',
+    'tf.log_sigmoid':
+        'tf.math.log_sigmoid',
+    'tf.logging.DEBUG':
+        'tf.compat.v1.logging.DEBUG',
+    'tf.logging.ERROR':
+        'tf.compat.v1.logging.ERROR',
+    'tf.logging.FATAL':
+        'tf.compat.v1.logging.FATAL',
+    'tf.logging.INFO':
+        'tf.compat.v1.logging.INFO',
+    'tf.logging.TaskLevelStatusMessage':
+        'tf.compat.v1.logging.TaskLevelStatusMessage',
+    'tf.logging.WARN':
+        'tf.compat.v1.logging.WARN',
+    'tf.logging.debug':
+        'tf.compat.v1.logging.debug',
+    'tf.logging.error':
+        'tf.compat.v1.logging.error',
+    'tf.logging.fatal':
+        'tf.compat.v1.logging.fatal',
+    'tf.logging.flush':
+        'tf.compat.v1.logging.flush',
+    'tf.logging.get_verbosity':
+        'tf.compat.v1.logging.get_verbosity',
+    'tf.logging.info':
+        'tf.compat.v1.logging.info',
+    'tf.logging.log':
+        'tf.compat.v1.logging.log',
+    'tf.logging.log_every_n':
+        'tf.compat.v1.logging.log_every_n',
+    'tf.logging.log_first_n':
+        'tf.compat.v1.logging.log_first_n',
+    'tf.logging.log_if':
+        'tf.compat.v1.logging.log_if',
+    'tf.logging.set_verbosity':
+        'tf.compat.v1.logging.set_verbosity',
+    'tf.logging.vlog':
+        'tf.compat.v1.logging.vlog',
+    'tf.logging.warn':
+        'tf.compat.v1.logging.warn',
+    'tf.logging.warning':
+        'tf.compat.v1.logging.warning',
+    'tf.logical_xor':
+        'tf.math.logical_xor',
+    'tf.losses.Reduction':
+        'tf.compat.v1.losses.Reduction',
+    'tf.losses.absolute_difference':
+        'tf.compat.v1.losses.absolute_difference',
+    'tf.losses.add_loss':
+        'tf.compat.v1.losses.add_loss',
+    'tf.losses.compute_weighted_loss':
+        'tf.compat.v1.losses.compute_weighted_loss',
+    'tf.losses.cosine_distance':
+        'tf.compat.v1.losses.cosine_distance',
+    'tf.losses.get_losses':
+        'tf.compat.v1.losses.get_losses',
+    'tf.losses.get_regularization_loss':
+        'tf.compat.v1.losses.get_regularization_loss',
+    'tf.losses.get_regularization_losses':
+        'tf.compat.v1.losses.get_regularization_losses',
+    'tf.losses.get_total_loss':
+        'tf.compat.v1.losses.get_total_loss',
+    'tf.losses.hinge_loss':
+        'tf.compat.v1.losses.hinge_loss',
+    'tf.losses.huber_loss':
+        'tf.compat.v1.losses.huber_loss',
+    'tf.losses.log_loss':
+        'tf.compat.v1.losses.log_loss',
+    'tf.losses.mean_pairwise_squared_error':
+        'tf.compat.v1.losses.mean_pairwise_squared_error',
+    'tf.losses.mean_squared_error':
+        'tf.compat.v1.losses.mean_squared_error',
+    'tf.losses.sigmoid_cross_entropy':
+        'tf.compat.v1.losses.sigmoid_cross_entropy',
+    'tf.losses.softmax_cross_entropy':
+        'tf.compat.v1.losses.softmax_cross_entropy',
+    'tf.losses.sparse_softmax_cross_entropy':
+        'tf.compat.v1.losses.sparse_softmax_cross_entropy',
+    'tf.make_template':
+        'tf.compat.v1.make_template',
+    'tf.make_tensor_proto':
+        'tf.compat.v1.make_tensor_proto',
+    'tf.manip.gather_nd':
+        'tf.gather_nd',
+    'tf.manip.reshape':
+        'tf.reshape',
+    'tf.manip.reverse':
+        'tf.reverse',
+    'tf.manip.roll':
+        'tf.roll',
+    'tf.manip.scatter_nd':
+        'tf.scatter_nd',
+    'tf.manip.space_to_batch_nd':
+        'tf.space_to_batch_nd',
+    'tf.manip.tile':
+        'tf.tile',
+    'tf.matching_files':
+        'tf.io.matching_files',
+    'tf.matrix_band_part':
+        'tf.linalg.band_part',
+    'tf.matrix_determinant':
+        'tf.linalg.det',
+    'tf.matrix_diag':
+        'tf.linalg.diag',
+    'tf.matrix_diag_part':
+        'tf.linalg.diag_part',
+    'tf.matrix_inverse':
+        'tf.linalg.inv',
+    'tf.matrix_set_diag':
+        'tf.linalg.set_diag',
+    'tf.matrix_solve':
+        'tf.linalg.solve',
+    'tf.matrix_solve_ls':
+        'tf.linalg.lstsq',
+    'tf.matrix_transpose':
+        'tf.linalg.transpose',
+    'tf.matrix_triangular_solve':
+        'tf.linalg.triangular_solve',
+    'tf.metrics.accuracy':
+        'tf.compat.v1.metrics.accuracy',
+    'tf.metrics.auc':
+        'tf.compat.v1.metrics.auc',
+    'tf.metrics.average_precision_at_k':
+        'tf.compat.v1.metrics.average_precision_at_k',
+    'tf.metrics.false_negatives':
+        'tf.compat.v1.metrics.false_negatives',
+    'tf.metrics.false_negatives_at_thresholds':
+        'tf.compat.v1.metrics.false_negatives_at_thresholds',
+    'tf.metrics.false_positives':
+        'tf.compat.v1.metrics.false_positives',
+    'tf.metrics.false_positives_at_thresholds':
+        'tf.compat.v1.metrics.false_positives_at_thresholds',
+    'tf.metrics.mean':
+        'tf.compat.v1.metrics.mean',
+    'tf.metrics.mean_absolute_error':
+        'tf.compat.v1.metrics.mean_absolute_error',
+    'tf.metrics.mean_cosine_distance':
+        'tf.compat.v1.metrics.mean_cosine_distance',
+    'tf.metrics.mean_iou':
+        'tf.compat.v1.metrics.mean_iou',
+    'tf.metrics.mean_per_class_accuracy':
+        'tf.compat.v1.metrics.mean_per_class_accuracy',
+    'tf.metrics.mean_relative_error':
+        'tf.compat.v1.metrics.mean_relative_error',
+    'tf.metrics.mean_squared_error':
+        'tf.compat.v1.metrics.mean_squared_error',
+    'tf.metrics.mean_tensor':
+        'tf.compat.v1.metrics.mean_tensor',
+    'tf.metrics.percentage_below':
+        'tf.compat.v1.metrics.percentage_below',
+    'tf.metrics.precision':
+        'tf.compat.v1.metrics.precision',
+    'tf.metrics.precision_at_k':
+        'tf.compat.v1.metrics.precision_at_k',
+    'tf.metrics.precision_at_thresholds':
+        'tf.compat.v1.metrics.precision_at_thresholds',
+    'tf.metrics.precision_at_top_k':
+        'tf.compat.v1.metrics.precision_at_top_k',
+    'tf.metrics.recall':
+        'tf.compat.v1.metrics.recall',
+    'tf.metrics.recall_at_k':
+        'tf.compat.v1.metrics.recall_at_k',
+    'tf.metrics.recall_at_thresholds':
+        'tf.compat.v1.metrics.recall_at_thresholds',
+    'tf.metrics.recall_at_top_k':
+        'tf.compat.v1.metrics.recall_at_top_k',
+    'tf.metrics.root_mean_squared_error':
+        'tf.compat.v1.metrics.root_mean_squared_error',
+    'tf.metrics.sensitivity_at_specificity':
+        'tf.compat.v1.metrics.sensitivity_at_specificity',
+    'tf.metrics.sparse_average_precision_at_k':
+        'tf.compat.v1.metrics.sparse_average_precision_at_k',
+    'tf.metrics.sparse_precision_at_k':
+        'tf.compat.v1.metrics.sparse_precision_at_k',
+    'tf.metrics.specificity_at_sensitivity':
+        'tf.compat.v1.metrics.specificity_at_sensitivity',
+    'tf.metrics.true_negatives':
+        'tf.compat.v1.metrics.true_negatives',
+    'tf.metrics.true_negatives_at_thresholds':
+        'tf.compat.v1.metrics.true_negatives_at_thresholds',
+    'tf.metrics.true_positives':
+        'tf.compat.v1.metrics.true_positives',
+    'tf.metrics.true_positives_at_thresholds':
+        'tf.compat.v1.metrics.true_positives_at_thresholds',
+    'tf.min_max_variable_partitioner':
+        'tf.compat.v1.min_max_variable_partitioner',
+    'tf.model_variables':
+        'tf.compat.v1.model_variables',
+    'tf.moving_average_variables':
+        'tf.compat.v1.moving_average_variables',
+    'tf.nn.avg_pool_v2':
+        'tf.nn.avg_pool',
+    'tf.nn.bidirectional_dynamic_rnn':
+        'tf.compat.v1.nn.bidirectional_dynamic_rnn',
+    'tf.nn.conv2d_backprop_filter':
+        'tf.compat.v1.nn.conv2d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter':
+        'tf.compat.v1.nn.conv3d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter_v2':
+        'tf.compat.v1.nn.conv3d_backprop_filter_v2',
+    'tf.nn.ctc_beam_search_decoder_v2':
+        'tf.nn.ctc_beam_search_decoder',
+    'tf.nn.ctc_loss_v2':
+        'tf.nn.ctc_loss',
+    'tf.nn.depthwise_conv2d_native':
+        'tf.compat.v1.nn.depthwise_conv2d_native',
+    'tf.nn.depthwise_conv2d_native_backprop_filter':
+        'tf.nn.depthwise_conv2d_backprop_filter',
+    'tf.nn.depthwise_conv2d_native_backprop_input':
+        'tf.nn.depthwise_conv2d_backprop_input',
+    'tf.nn.dynamic_rnn':
+        'tf.compat.v1.nn.dynamic_rnn',
+    'tf.nn.log_uniform_candidate_sampler':
+        'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.max_pool_v2':
+        'tf.nn.max_pool',
+    'tf.nn.quantized_avg_pool':
+        'tf.compat.v1.nn.quantized_avg_pool',
+    'tf.nn.quantized_conv2d':
+        'tf.compat.v1.nn.quantized_conv2d',
+    'tf.nn.quantized_max_pool':
+        'tf.compat.v1.nn.quantized_max_pool',
+    'tf.nn.quantized_relu_x':
+        'tf.compat.v1.nn.quantized_relu_x',
+    'tf.nn.raw_rnn':
+        'tf.compat.v1.nn.raw_rnn',
+    'tf.nn.relu_layer':
+        'tf.compat.v1.nn.relu_layer',
+    'tf.nn.rnn_cell.BasicLSTMCell':
+        'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
+    'tf.nn.rnn_cell.BasicRNNCell':
+        'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
+    'tf.nn.rnn_cell.DeviceWrapper':
+        'tf.compat.v1.nn.rnn_cell.DeviceWrapper',
+    'tf.nn.rnn_cell.DropoutWrapper':
+        'tf.compat.v1.nn.rnn_cell.DropoutWrapper',
+    'tf.nn.rnn_cell.GRUCell':
+        'tf.compat.v1.nn.rnn_cell.GRUCell',
+    'tf.nn.rnn_cell.LSTMCell':
+        'tf.compat.v1.nn.rnn_cell.LSTMCell',
+    'tf.nn.rnn_cell.LSTMStateTuple':
+        'tf.compat.v1.nn.rnn_cell.LSTMStateTuple',
+    'tf.nn.rnn_cell.MultiRNNCell':
+        'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
+    'tf.nn.rnn_cell.RNNCell':
+        'tf.compat.v1.nn.rnn_cell.RNNCell',
+    'tf.nn.rnn_cell.ResidualWrapper':
+        'tf.compat.v1.nn.rnn_cell.ResidualWrapper',
+    'tf.nn.static_bidirectional_rnn':
+        'tf.compat.v1.nn.static_bidirectional_rnn',
+    'tf.nn.static_rnn':
+        'tf.compat.v1.nn.static_rnn',
+    'tf.nn.uniform_candidate_sampler':
+        'tf.random.uniform_candidate_sampler',
+    'tf.nn.xw_plus_b':
+        'tf.compat.v1.nn.xw_plus_b',
+    'tf.op_scope':
+        'tf.compat.v1.op_scope',
+    'tf.orthogonal_initializer':
+        'tf.compat.v1.orthogonal_initializer',
+    'tf.parse_single_sequence_example':
+        'tf.io.parse_single_sequence_example',
+    'tf.parse_tensor':
+        'tf.io.parse_tensor',
+    'tf.placeholder':
+        'tf.compat.v1.placeholder',
+    'tf.placeholder_with_default':
+        'tf.compat.v1.placeholder_with_default',
+    'tf.polygamma':
+        'tf.math.polygamma',
+    'tf.profiler.AdviceProto':
+        'tf.compat.v1.profiler.AdviceProto',
+    'tf.profiler.GraphNodeProto':
+        'tf.compat.v1.profiler.GraphNodeProto',
+    'tf.profiler.MultiGraphNodeProto':
+        'tf.compat.v1.profiler.MultiGraphNodeProto',
+    'tf.profiler.OpLogProto':
+        'tf.compat.v1.profiler.OpLogProto',
+    'tf.profiler.ProfileOptionBuilder':
+        'tf.compat.v1.profiler.ProfileOptionBuilder',
+    'tf.profiler.Profiler':
+        'tf.compat.v1.profiler.Profiler',
+    'tf.profiler.advise':
+        'tf.compat.v1.profiler.advise',
+    'tf.profiler.profile':
+        'tf.compat.v1.profiler.profile',
+    'tf.profiler.write_op_log':
+        'tf.compat.v1.profiler.write_op_log',
+    'tf.py_func':
+        'tf.compat.v1.py_func',
+    'tf.python_io.TFRecordCompressionType':
+        'tf.io.TFRecordCompressionType',
+    'tf.python_io.TFRecordOptions':
+        'tf.io.TFRecordOptions',
+    'tf.python_io.TFRecordWriter':
+        'tf.io.TFRecordWriter',
+    'tf.python_io.tf_record_iterator':
+        'tf.compat.v1.python_io.tf_record_iterator',
+    'tf.qr':
+        'tf.linalg.qr',
+    'tf.quantize':
+        'tf.quantization.quantize',
+    'tf.quantized_concat':
+        'tf.quantization.quantized_concat',
+    'tf.ragged.RaggedTensorValue':
+        'tf.compat.v1.ragged.RaggedTensorValue',
+    'tf.ragged.constant_value':
+        'tf.compat.v1.ragged.constant_value',
+    'tf.random.get_seed':
+        'tf.compat.v1.random.get_seed',
+    'tf.random.set_random_seed':
+        'tf.compat.v1.random.set_random_seed',
+    'tf.random_crop':
+        'tf.image.random_crop',
+    'tf.random_gamma':
+        'tf.random.gamma',
+    'tf.random_normal':
+        'tf.random.normal',
+    'tf.random_shuffle':
+        'tf.random.shuffle',
+    'tf.random_uniform':
+        'tf.random.uniform',
+    'tf.read_file':
+        'tf.io.read_file',
+    'tf.real':
+        'tf.math.real',
+    'tf.reciprocal':
+        'tf.math.reciprocal',
+    'tf.regex_replace':
+        'tf.strings.regex_replace',
+    'tf.report_uninitialized_variables':
+        'tf.compat.v1.report_uninitialized_variables',
+    'tf.reset_default_graph':
+        'tf.compat.v1.reset_default_graph',
+    'tf.resource_loader.get_data_files_path':
+        'tf.compat.v1.resource_loader.get_data_files_path',
+    'tf.resource_loader.get_path_to_datafile':
+        'tf.compat.v1.resource_loader.get_path_to_datafile',
+    'tf.resource_loader.get_root_dir_with_all_resources':
+        'tf.compat.v1.resource_loader.get_root_dir_with_all_resources',
+    'tf.resource_loader.load_resource':
+        'tf.compat.v1.resource_loader.load_resource',
+    'tf.resource_loader.readahead_file_path':
+        'tf.compat.v1.resource_loader.readahead_file_path',
+    'tf.resource_variables_enabled':
+        'tf.compat.v1.resource_variables_enabled',
+    'tf.reverse_v2':
+        'tf.reverse',
+    'tf.rint':
+        'tf.math.rint',
+    'tf.rsqrt':
+        'tf.math.rsqrt',
+    'tf.saved_model.Builder':
+        'tf.compat.v1.saved_model.Builder',
+    'tf.saved_model.LEGACY_INIT_OP_KEY':
+        'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.MAIN_OP_KEY':
+        'tf.compat.v1.saved_model.MAIN_OP_KEY',
+    'tf.saved_model.build_signature_def':
+        'tf.compat.v1.saved_model.build_signature_def',
+    'tf.saved_model.build_tensor_info':
+        'tf.compat.v1.saved_model.build_tensor_info',
+    'tf.saved_model.builder.SavedModelBuilder':
+        'tf.compat.v1.saved_model.builder.SavedModelBuilder',
+    'tf.saved_model.classification_signature_def':
+        'tf.compat.v1.saved_model.classification_signature_def',
+    'tf.saved_model.constants.ASSETS_DIRECTORY':
+        'tf.saved_model.ASSETS_DIRECTORY',
+    'tf.saved_model.constants.ASSETS_KEY':
+        'tf.saved_model.ASSETS_KEY',
+    'tf.saved_model.constants.LEGACY_INIT_OP_KEY':
+        'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.constants.MAIN_OP_KEY':
+        'tf.compat.v1.saved_model.constants.MAIN_OP_KEY',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PB':
+        'tf.saved_model.SAVED_MODEL_FILENAME_PB',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PBTXT':
+        'tf.saved_model.SAVED_MODEL_FILENAME_PBTXT',
+    'tf.saved_model.constants.SAVED_MODEL_SCHEMA_VERSION':
+        'tf.saved_model.SAVED_MODEL_SCHEMA_VERSION',
+    'tf.saved_model.constants.VARIABLES_DIRECTORY':
+        'tf.saved_model.VARIABLES_DIRECTORY',
+    'tf.saved_model.constants.VARIABLES_FILENAME':
+        'tf.saved_model.VARIABLES_FILENAME',
+    'tf.saved_model.experimental.save':
+        'tf.saved_model.save',
+    'tf.saved_model.get_tensor_from_tensor_info':
+        'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
+    'tf.saved_model.is_valid_signature':
+        'tf.compat.v1.saved_model.is_valid_signature',
+    'tf.saved_model.loader.load':
+        'tf.compat.v1.saved_model.loader.load',
+    'tf.saved_model.loader.maybe_saved_model_directory':
+        'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
+    'tf.saved_model.main_op.main_op':
+        'tf.compat.v1.saved_model.main_op.main_op',
+    'tf.saved_model.main_op.main_op_with_restore':
+        'tf.compat.v1.saved_model.main_op.main_op_with_restore',
+    'tf.saved_model.main_op_with_restore':
+        'tf.compat.v1.saved_model.main_op_with_restore',
+    'tf.saved_model.maybe_saved_model_directory':
+        'tf.compat.v1.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.predict_signature_def':
+        'tf.compat.v1.saved_model.predict_signature_def',
+    'tf.saved_model.regression_signature_def':
+        'tf.compat.v1.saved_model.regression_signature_def',
+    'tf.saved_model.signature_constants.CLASSIFY_INPUTS':
+        'tf.saved_model.CLASSIFY_INPUTS',
+    'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME':
+        'tf.saved_model.CLASSIFY_METHOD_NAME',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES':
+        'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES':
+        'tf.saved_model.CLASSIFY_OUTPUT_SCORES',
+    'tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY':
+        'tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY',
+    'tf.saved_model.signature_constants.PREDICT_INPUTS':
+        'tf.saved_model.PREDICT_INPUTS',
+    'tf.saved_model.signature_constants.PREDICT_METHOD_NAME':
+        'tf.saved_model.PREDICT_METHOD_NAME',
+    'tf.saved_model.signature_constants.PREDICT_OUTPUTS':
+        'tf.saved_model.PREDICT_OUTPUTS',
+    'tf.saved_model.signature_constants.REGRESS_INPUTS':
+        'tf.saved_model.REGRESS_INPUTS',
+    'tf.saved_model.signature_constants.REGRESS_METHOD_NAME':
+        'tf.saved_model.REGRESS_METHOD_NAME',
+    'tf.saved_model.signature_constants.REGRESS_OUTPUTS':
+        'tf.saved_model.REGRESS_OUTPUTS',
+    'tf.saved_model.signature_def_utils.build_signature_def':
+        'tf.compat.v1.saved_model.signature_def_utils.build_signature_def',
+    'tf.saved_model.signature_def_utils.classification_signature_def':
+        'tf.compat.v1.saved_model.signature_def_utils.classification_signature_def',
+    'tf.saved_model.signature_def_utils.is_valid_signature':
+        'tf.compat.v1.saved_model.signature_def_utils.is_valid_signature',
+    'tf.saved_model.signature_def_utils.predict_signature_def':
+        'tf.compat.v1.saved_model.signature_def_utils.predict_signature_def',
+    'tf.saved_model.signature_def_utils.regression_signature_def':
+        'tf.compat.v1.saved_model.signature_def_utils.regression_signature_def',
+    'tf.saved_model.simple_save':
+        'tf.compat.v1.saved_model.simple_save',
+    'tf.saved_model.tag_constants.GPU':
+        'tf.saved_model.GPU',
+    'tf.saved_model.tag_constants.SERVING':
+        'tf.saved_model.SERVING',
+    'tf.saved_model.tag_constants.TPU':
+        'tf.saved_model.TPU',
+    'tf.saved_model.tag_constants.TRAINING':
+        'tf.saved_model.TRAINING',
+    'tf.saved_model.utils.build_tensor_info':
+        'tf.compat.v1.saved_model.utils.build_tensor_info',
+    'tf.saved_model.utils.get_tensor_from_tensor_info':
+        'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
+    'tf.scatter_add':
+        'tf.compat.v1.scatter_add',
+    'tf.scatter_div':
+        'tf.compat.v1.scatter_div',
+    'tf.scatter_max':
+        'tf.compat.v1.scatter_max',
+    'tf.scatter_min':
+        'tf.compat.v1.scatter_min',
+    'tf.scatter_mul':
+        'tf.compat.v1.scatter_mul',
+    'tf.scatter_nd_add':
+        'tf.compat.v1.scatter_nd_add',
+    'tf.scatter_nd_sub':
+        'tf.compat.v1.scatter_nd_sub',
+    'tf.scatter_nd_update':
+        'tf.compat.v1.scatter_nd_update',
+    'tf.scatter_sub':
+        'tf.compat.v1.scatter_sub',
+    'tf.scatter_update':
+        'tf.compat.v1.scatter_update',
+    'tf.segment_max':
+        'tf.math.segment_max',
+    'tf.segment_mean':
+        'tf.math.segment_mean',
+    'tf.segment_min':
+        'tf.math.segment_min',
+    'tf.segment_prod':
+        'tf.math.segment_prod',
+    'tf.segment_sum':
+        'tf.math.segment_sum',
+    'tf.self_adjoint_eig':
+        'tf.linalg.eigh',
+    'tf.self_adjoint_eigvals':
+        'tf.linalg.eigvalsh',
+    'tf.serialize_many_sparse':
+        'tf.compat.v1.serialize_many_sparse',
+    'tf.serialize_sparse':
+        'tf.compat.v1.serialize_sparse',
+    'tf.serialize_tensor':
+        'tf.io.serialize_tensor',
+    'tf.set_random_seed':
+        'tf.compat.v1.set_random_seed',
+    'tf.setdiff1d':
+        'tf.compat.v1.setdiff1d',
+    'tf.sets.set_difference':
+        'tf.sets.difference',
+    'tf.sets.set_intersection':
+        'tf.sets.intersection',
+    'tf.sets.set_size':
+        'tf.sets.size',
+    'tf.sets.set_union':
+        'tf.sets.union',
+    'tf.space_to_depth':
+        'tf.compat.v1.space_to_depth',
+    'tf.sparse.matmul':
+        'tf.sparse.sparse_dense_matmul',
+    'tf.sparse.merge':
+        'tf.compat.v1.sparse.merge',
+    'tf.sparse.placeholder':
+        'tf.compat.v1.sparse.placeholder',
+    'tf.sparse.reduce_max_sparse':
+        'tf.compat.v1.sparse.reduce_max_sparse',
+    'tf.sparse.reduce_sum_sparse':
+        'tf.compat.v1.sparse.reduce_sum_sparse',
+    'tf.sparse_fill_empty_rows':
+        'tf.sparse.fill_empty_rows',
+    'tf.sparse_mask':
+        'tf.sparse.mask',
+    'tf.sparse_maximum':
+        'tf.sparse.maximum',
+    'tf.sparse_merge':
+        'tf.compat.v1.sparse_merge',
+    'tf.sparse_minimum':
+        'tf.sparse.minimum',
+    'tf.sparse_placeholder':
+        'tf.compat.v1.sparse_placeholder',
+    'tf.sparse_reduce_max_sparse':
+        'tf.compat.v1.sparse_reduce_max_sparse',
+    'tf.sparse_reduce_sum_sparse':
+        'tf.compat.v1.sparse_reduce_sum_sparse',
+    'tf.sparse_reorder':
+        'tf.sparse.reorder',
+    'tf.sparse_reset_shape':
+        'tf.sparse.reset_shape',
+    'tf.sparse_reshape':
+        'tf.sparse.reshape',
+    'tf.sparse_retain':
+        'tf.sparse.retain',
+    'tf.sparse_segment_mean':
+        'tf.compat.v1.sparse_segment_mean',
+    'tf.sparse_segment_sqrt_n':
+        'tf.compat.v1.sparse_segment_sqrt_n',
+    'tf.sparse_segment_sum':
+        'tf.compat.v1.sparse_segment_sum',
+    'tf.sparse_slice':
+        'tf.sparse.slice',
+    'tf.sparse_softmax':
+        'tf.sparse.softmax',
+    'tf.sparse_tensor_dense_matmul':
+        'tf.sparse.sparse_dense_matmul',
+    'tf.sparse_tensor_to_dense':
+        'tf.sparse.to_dense',
+    'tf.sparse_to_dense':
+        'tf.compat.v1.sparse_to_dense',
+    'tf.sparse_to_indicator':
+        'tf.sparse.to_indicator',
+    'tf.sparse_transpose':
+        'tf.sparse.transpose',
+    'tf.spectral.dct':
+        'tf.signal.dct',
+    'tf.spectral.fft':
+        'tf.signal.fft',
+    'tf.spectral.fft2d':
+        'tf.signal.fft2d',
+    'tf.spectral.fft3d':
+        'tf.signal.fft3d',
+    'tf.spectral.idct':
+        'tf.signal.idct',
+    'tf.spectral.ifft':
+        'tf.signal.ifft',
+    'tf.spectral.ifft2d':
+        'tf.signal.ifft2d',
+    'tf.spectral.ifft3d':
+        'tf.signal.ifft3d',
+    'tf.spectral.irfft':
+        'tf.signal.irfft',
+    'tf.spectral.irfft2d':
+        'tf.signal.irfft2d',
+    'tf.spectral.irfft3d':
+        'tf.signal.irfft3d',
+    'tf.spectral.rfft':
+        'tf.signal.rfft',
+    'tf.spectral.rfft2d':
+        'tf.signal.rfft2d',
+    'tf.spectral.rfft3d':
+        'tf.signal.rfft3d',
+    'tf.squared_difference':
+        'tf.math.squared_difference',
+    'tf.string_join':
+        'tf.strings.join',
+    'tf.string_strip':
+        'tf.strings.strip',
+    'tf.string_to_hash_bucket_fast':
+        'tf.strings.to_hash_bucket_fast',
+    'tf.string_to_hash_bucket_strong':
+        'tf.strings.to_hash_bucket_strong',
+    'tf.summary.Event':
+        'tf.compat.v1.summary.Event',
+    'tf.summary.FileWriter':
+        'tf.compat.v1.summary.FileWriter',
+    'tf.summary.FileWriterCache':
+        'tf.compat.v1.summary.FileWriterCache',
+    'tf.summary.SessionLog':
+        'tf.compat.v1.summary.SessionLog',
+    'tf.summary.Summary':
+        'tf.compat.v1.summary.Summary',
+    'tf.summary.SummaryDescription':
+        'tf.compat.v1.summary.SummaryDescription',
+    'tf.summary.TaggedRunMetadata':
+        'tf.compat.v1.summary.TaggedRunMetadata',
+    'tf.summary.audio':
+        'tf.compat.v1.summary.audio',
+    'tf.summary.get_summary_description':
+        'tf.compat.v1.summary.get_summary_description',
+    'tf.summary.histogram':
+        'tf.compat.v1.summary.histogram',
+    'tf.summary.image':
+        'tf.compat.v1.summary.image',
+    'tf.summary.initialize':
+        'tf.compat.v1.summary.initialize',
+    'tf.summary.merge':
+        'tf.compat.v1.summary.merge',
+    'tf.summary.merge_all':
+        'tf.compat.v1.summary.merge_all',
+    'tf.summary.scalar':
+        'tf.compat.v1.summary.scalar',
+    'tf.summary.tensor_summary':
+        'tf.compat.v1.summary.tensor_summary',
+    'tf.summary.text':
+        'tf.compat.v1.summary.text',
+    'tf.svd':
+        'tf.linalg.svd',
+    'tf.tables_initializer':
+        'tf.compat.v1.tables_initializer',
+    'tf.tensor_scatter_add':
+        'tf.tensor_scatter_nd_add',
+    'tf.tensor_scatter_sub':
+        'tf.tensor_scatter_nd_sub',
+    'tf.tensor_scatter_update':
+        'tf.tensor_scatter_nd_update',
+    'tf.test.StubOutForTesting':
+        'tf.compat.v1.test.StubOutForTesting',
+    'tf.test.compute_gradient_error':
+        'tf.compat.v1.test.compute_gradient_error',
+    'tf.test.get_temp_dir':
+        'tf.compat.v1.test.get_temp_dir',
+    'tf.test.mock':
+        'tf.compat.v1.test.mock',
+    'tf.test.test_src_dir_path':
+        'tf.compat.v1.test.test_src_dir_path',
+    'tf.to_bfloat16':
+        'tf.compat.v1.to_bfloat16',
+    'tf.to_complex128':
+        'tf.compat.v1.to_complex128',
+    'tf.to_complex64':
+        'tf.compat.v1.to_complex64',
+    'tf.to_double':
+        'tf.compat.v1.to_double',
+    'tf.to_float':
+        'tf.compat.v1.to_float',
+    'tf.to_int32':
+        'tf.compat.v1.to_int32',
+    'tf.to_int64':
+        'tf.compat.v1.to_int64',
+    'tf.trace':
+        'tf.linalg.trace',
+    'tf.train.AdadeltaOptimizer':
+        'tf.compat.v1.train.AdadeltaOptimizer',
+    'tf.train.AdagradDAOptimizer':
+        'tf.compat.v1.train.AdagradDAOptimizer',
+    'tf.train.AdagradOptimizer':
+        'tf.compat.v1.train.AdagradOptimizer',
+    'tf.train.AdamOptimizer':
+        'tf.compat.v1.train.AdamOptimizer',
+    'tf.train.CheckpointSaverHook':
+        'tf.estimator.CheckpointSaverHook',
+    'tf.train.CheckpointSaverListener':
+        'tf.estimator.CheckpointSaverListener',
+    'tf.train.ChiefSessionCreator':
+        'tf.compat.v1.train.ChiefSessionCreator',
+    'tf.train.FeedFnHook':
+        'tf.estimator.FeedFnHook',
+    'tf.train.FinalOpsHook':
+        'tf.estimator.FinalOpsHook',
+    'tf.train.FtrlOptimizer':
+        'tf.compat.v1.train.FtrlOptimizer',
+    'tf.train.GlobalStepWaiterHook':
+        'tf.estimator.GlobalStepWaiterHook',
+    'tf.train.GradientDescentOptimizer':
+        'tf.compat.v1.train.GradientDescentOptimizer',
+    'tf.train.LoggingTensorHook':
+        'tf.estimator.LoggingTensorHook',
+    'tf.train.LooperThread':
+        'tf.compat.v1.train.LooperThread',
+    'tf.train.MomentumOptimizer':
+        'tf.compat.v1.train.MomentumOptimizer',
+    'tf.train.MonitoredSession':
+        'tf.compat.v1.train.MonitoredSession',
+    'tf.train.MonitoredTrainingSession':
+        'tf.compat.v1.train.MonitoredTrainingSession',
+    'tf.train.NanLossDuringTrainingError':
+        'tf.estimator.NanLossDuringTrainingError',
+    'tf.train.NanTensorHook':
+        'tf.estimator.NanTensorHook',
+    'tf.train.NewCheckpointReader':
+        'tf.compat.v1.train.NewCheckpointReader',
+    'tf.train.Optimizer':
+        'tf.compat.v1.train.Optimizer',
+    'tf.train.ProfilerHook':
+        'tf.estimator.ProfilerHook',
+    'tf.train.ProximalAdagradOptimizer':
+        'tf.compat.v1.train.ProximalAdagradOptimizer',
+    'tf.train.ProximalGradientDescentOptimizer':
+        'tf.compat.v1.train.ProximalGradientDescentOptimizer',
+    'tf.train.QueueRunner':
+        'tf.compat.v1.train.QueueRunner',
+    'tf.train.RMSPropOptimizer':
+        'tf.compat.v1.train.RMSPropOptimizer',
+    'tf.train.Saver':
+        'tf.compat.v1.train.Saver',
+    'tf.train.SaverDef':
+        'tf.compat.v1.train.SaverDef',
+    'tf.train.Scaffold':
+        'tf.compat.v1.train.Scaffold',
+    'tf.train.SecondOrStepTimer':
+        'tf.estimator.SecondOrStepTimer',
+    'tf.train.Server':
+        'tf.distribute.Server',
+    'tf.train.SessionCreator':
+        'tf.compat.v1.train.SessionCreator',
+    'tf.train.SessionManager':
+        'tf.compat.v1.train.SessionManager',
+    'tf.train.SessionRunArgs':
+        'tf.estimator.SessionRunArgs',
+    'tf.train.SessionRunContext':
+        'tf.estimator.SessionRunContext',
+    'tf.train.SessionRunHook':
+        'tf.estimator.SessionRunHook',
+    'tf.train.SessionRunValues':
+        'tf.estimator.SessionRunValues',
+    'tf.train.SingularMonitoredSession':
+        'tf.compat.v1.train.SingularMonitoredSession',
+    'tf.train.StepCounterHook':
+        'tf.estimator.StepCounterHook',
+    'tf.train.StopAtStepHook':
+        'tf.estimator.StopAtStepHook',
+    'tf.train.SummarySaverHook':
+        'tf.estimator.SummarySaverHook',
+    'tf.train.Supervisor':
+        'tf.compat.v1.train.Supervisor',
+    'tf.train.SyncReplicasOptimizer':
+        'tf.compat.v1.train.SyncReplicasOptimizer',
+    'tf.train.VocabInfo':
+        'tf.estimator.VocabInfo',
+    'tf.train.WorkerSessionCreator':
+        'tf.compat.v1.train.WorkerSessionCreator',
+    'tf.train.add_queue_runner':
+        'tf.compat.v1.train.add_queue_runner',
+    'tf.train.assert_global_step':
+        'tf.compat.v1.train.assert_global_step',
+    'tf.train.basic_train_loop':
+        'tf.compat.v1.train.basic_train_loop',
+    'tf.train.batch':
+        'tf.compat.v1.train.batch',
+    'tf.train.batch_join':
+        'tf.compat.v1.train.batch_join',
+    'tf.train.checkpoint_exists':
+        'tf.compat.v1.train.checkpoint_exists',
+    'tf.train.cosine_decay':
+        'tf.compat.v1.train.cosine_decay',
+    'tf.train.cosine_decay_restarts':
+        'tf.compat.v1.train.cosine_decay_restarts',
+    'tf.train.create_global_step':
+        'tf.compat.v1.train.create_global_step',
+    'tf.train.do_quantize_training_on_graphdef':
+        'tf.compat.v1.train.do_quantize_training_on_graphdef',
+    'tf.train.exponential_decay':
+        'tf.compat.v1.train.exponential_decay',
+    'tf.train.export_meta_graph':
+        'tf.compat.v1.train.export_meta_graph',
+    'tf.train.generate_checkpoint_state_proto':
+        'tf.compat.v1.train.generate_checkpoint_state_proto',
+    'tf.train.get_checkpoint_mtimes':
+        'tf.compat.v1.train.get_checkpoint_mtimes',
+    'tf.train.get_global_step':
+        'tf.compat.v1.train.get_global_step',
+    'tf.train.get_or_create_global_step':
+        'tf.compat.v1.train.get_or_create_global_step',
+    'tf.train.global_step':
+        'tf.compat.v1.train.global_step',
+    'tf.train.import_meta_graph':
+        'tf.compat.v1.train.import_meta_graph',
+    'tf.train.init_from_checkpoint':
+        'tf.compat.v1.train.init_from_checkpoint',
+    'tf.train.input_producer':
+        'tf.compat.v1.train.input_producer',
+    'tf.train.inverse_time_decay':
+        'tf.compat.v1.train.inverse_time_decay',
+    'tf.train.limit_epochs':
+        'tf.compat.v1.train.limit_epochs',
+    'tf.train.linear_cosine_decay':
+        'tf.compat.v1.train.linear_cosine_decay',
+    'tf.train.match_filenames_once':
+        'tf.io.match_filenames_once',
+    'tf.train.maybe_batch':
+        'tf.compat.v1.train.maybe_batch',
+    'tf.train.maybe_batch_join':
+        'tf.compat.v1.train.maybe_batch_join',
+    'tf.train.maybe_shuffle_batch':
+        'tf.compat.v1.train.maybe_shuffle_batch',
+    'tf.train.maybe_shuffle_batch_join':
+        'tf.compat.v1.train.maybe_shuffle_batch_join',
+    'tf.train.natural_exp_decay':
+        'tf.compat.v1.train.natural_exp_decay',
+    'tf.train.noisy_linear_cosine_decay':
+        'tf.compat.v1.train.noisy_linear_cosine_decay',
+    'tf.train.piecewise_constant':
+        'tf.compat.v1.train.piecewise_constant',
+    'tf.train.piecewise_constant_decay':
+        'tf.compat.v1.train.piecewise_constant_decay',
+    'tf.train.polynomial_decay':
+        'tf.compat.v1.train.polynomial_decay',
+    'tf.train.queue_runner.QueueRunner':
+        'tf.compat.v1.train.queue_runner.QueueRunner',
+    'tf.train.queue_runner.add_queue_runner':
+        'tf.compat.v1.train.queue_runner.add_queue_runner',
+    'tf.train.queue_runner.start_queue_runners':
+        'tf.compat.v1.train.queue_runner.start_queue_runners',
+    'tf.train.range_input_producer':
+        'tf.compat.v1.train.range_input_producer',
+    'tf.train.remove_checkpoint':
+        'tf.compat.v1.train.remove_checkpoint',
+    'tf.train.replica_device_setter':
+        'tf.compat.v1.train.replica_device_setter',
+    'tf.train.shuffle_batch':
+        'tf.compat.v1.train.shuffle_batch',
+    'tf.train.shuffle_batch_join':
+        'tf.compat.v1.train.shuffle_batch_join',
+    'tf.train.slice_input_producer':
+        'tf.compat.v1.train.slice_input_producer',
+    'tf.train.start_queue_runners':
+        'tf.compat.v1.train.start_queue_runners',
+    'tf.train.string_input_producer':
+        'tf.compat.v1.train.string_input_producer',
+    'tf.train.summary_iterator':
+        'tf.compat.v1.train.summary_iterator',
+    'tf.train.update_checkpoint_state':
+        'tf.compat.v1.train.update_checkpoint_state',
+    'tf.train.warm_start':
+        'tf.compat.v1.train.warm_start',
+    'tf.train.write_graph':
+        'tf.io.write_graph',
+    'tf.trainable_variables':
+        'tf.compat.v1.trainable_variables',
+    'tf.truncated_normal':
+        'tf.random.truncated_normal',
+    'tf.uniform_unit_scaling_initializer':
+        'tf.compat.v1.uniform_unit_scaling_initializer',
+    'tf.unsorted_segment_max':
+        'tf.math.unsorted_segment_max',
+    'tf.unsorted_segment_mean':
+        'tf.math.unsorted_segment_mean',
+    'tf.unsorted_segment_min':
+        'tf.math.unsorted_segment_min',
+    'tf.unsorted_segment_prod':
+        'tf.math.unsorted_segment_prod',
+    'tf.unsorted_segment_sqrt_n':
+        'tf.math.unsorted_segment_sqrt_n',
+    'tf.unsorted_segment_sum':
+        'tf.math.unsorted_segment_sum',
+    'tf.variable_axis_size_partitioner':
+        'tf.compat.v1.variable_axis_size_partitioner',
+    'tf.variable_op_scope':
+        'tf.compat.v1.variable_op_scope',
+    'tf.variable_scope':
+        'tf.compat.v1.variable_scope',
+    'tf.variables_initializer':
+        'tf.compat.v1.variables_initializer',
+    'tf.variance_scaling_initializer':
+        'tf.compat.v1.variance_scaling_initializer',
+    'tf.verify_tensor_all_finite':
+        'tf.compat.v1.verify_tensor_all_finite',
+    'tf.wrap_function':
+        'tf.compat.v1.wrap_function',
+    'tf.write_file':
+        'tf.io.write_file',
+    'tf.zeta':
+        'tf.math.zeta'
 }
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
index f9b0e3f9d8e6107701b01768b9674680d0e4b64a..b94d51bb48a52e481e66bfd09c0cbaae6b7d402f 100644
--- a/tensorflow/tools/compatibility/reorders_v2.py
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -28,15 +28,26 @@ from __future__ import print_function
 reorders = {
     'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
     'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
-    'tf.batch_gather': ['params', 'indices', 'name'],
-    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
+    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name', 'block_shape'],
     'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
     'tf.cond': ['pred', 'true_fn', 'false_fn', 'strict', 'name', 'fn1', 'fn2'],
     'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
-    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
+    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype', 'dtype_hint'],
     'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
     'tf.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.estimator.BaselineClassifier': ['model_dir', 'n_classes', 'weight_column', 'label_vocabulary', 'optimizer', 'config', 'loss_reduction'],
+    'tf.estimator.BaselineRegressor': ['model_dir', 'label_dimension', 'weight_column', 'optimizer', 'config', 'loss_reduction'],
+    'tf.estimator.DNNClassifier': ['hidden_units', 'feature_columns', 'model_dir', 'n_classes', 'weight_column', 'label_vocabulary', 'optimizer', 'activation_fn', 'dropout', 'input_layer_partitioner', 'config', 'warm_start_from', 'loss_reduction', 'batch_norm'],
+    'tf.estimator.DNNLinearCombinedClassifier': ['model_dir', 'linear_feature_columns', 'linear_optimizer', 'dnn_feature_columns', 'dnn_optimizer', 'dnn_hidden_units', 'dnn_activation_fn', 'dnn_dropout', 'n_classes', 'weight_column', 'label_vocabulary', 'input_layer_partitioner', 'config', 'warm_start_from', 'loss_reduction', 'batch_norm', 'linear_sparse_combiner'],
+    'tf.estimator.DNNLinearCombinedRegressor': ['model_dir', 'linear_feature_columns', 'linear_optimizer', 'dnn_feature_columns', 'dnn_optimizer', 'dnn_hidden_units', 'dnn_activation_fn', 'dnn_dropout', 'label_dimension', 'weight_column', 'input_layer_partitioner', 'config', 'warm_start_from', 'loss_reduction', 'batch_norm', 'linear_sparse_combiner'],
+    'tf.estimator.DNNRegressor': ['hidden_units', 'feature_columns', 'model_dir', 'label_dimension', 'weight_column', 'optimizer', 'activation_fn', 'dropout', 'input_layer_partitioner', 'config', 'warm_start_from', 'loss_reduction', 'batch_norm'],
+    'tf.estimator.LinearClassifier': ['feature_columns', 'model_dir', 'n_classes', 'weight_column', 'label_vocabulary', 'optimizer', 'config', 'partitioner', 'warm_start_from', 'loss_reduction', 'sparse_combiner'],
+    'tf.estimator.LinearRegressor': ['feature_columns', 'model_dir', 'label_dimension', 'weight_column', 'optimizer', 'config', 'partitioner', 'warm_start_from', 'loss_reduction', 'sparse_combiner'],
     'tf.feature_column.categorical_column_with_vocabulary_file': ['key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'default_value', 'dtype'],
+    'tf.gradients': ['ys', 'xs', 'grad_ys', 'name', 'colocate_gradients_with_ops', 'gate_gradients', 'aggregation_method', 'stop_gradients', 'unconnected_gradients'],
+    'tf.hessians': ['ys', 'xs', 'name', 'colocate_gradients_with_ops', 'gate_gradients', 'aggregation_method'],
+    'tf.image.sample_distorted_bounding_box': ['image_size', 'bounding_boxes', 'seed', 'seed2', 'min_object_covered', 'aspect_ratio_range', 'area_range', 'max_attempts', 'use_image_if_no_bounding_boxes', 'name'],
+    'tf.initializers.uniform_unit_scaling': ['factor', 'seed', 'dtype'],
     'tf.io.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
     'tf.io.parse_example': ['serialized', 'features', 'name', 'example_names'],
     'tf.io.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
@@ -56,20 +67,29 @@ reorders = {
     'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
-    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format'],
+    'tf.nn.avg_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.avg_pool2d': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.conv1d': ['value', 'filters', 'stride', 'padding', 'use_cudnn_on_gpu', 'data_format', 'name', 'input', 'dilations'],
+    'tf.nn.conv2d': ['input', 'filter', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
+    'tf.nn.conv2d_backprop_input': ['input_sizes', 'filter', 'out_backprop', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
+    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format', 'filters', 'dilations'],
     'tf.nn.crelu': ['features', 'name', 'axis'],
+    'tf.nn.ctc_beam_search_decoder': ['inputs', 'sequence_length', 'beam_width', 'top_paths', 'merge_repeated'],
     'tf.nn.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
-    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format', 'dilations'],
     'tf.nn.embedding_lookup': ['params', 'ids', 'partition_strategy', 'name', 'validate_indices', 'max_norm'],
     'tf.nn.embedding_lookup_sparse': ['params', 'sp_ids', 'sp_weights', 'partition_strategy', 'name', 'combiner', 'max_norm'],
+    'tf.nn.fractional_avg_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
+    'tf.nn.fractional_max_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
     'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
-    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
-    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
-    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
-    'tf.nn.softmax_cross_entropy_with_logits': ['_sentinel', 'labels', 'logits', 'dim', 'name'],
-    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.nn.max_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims', 'keepdims'],
+    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format', 'dilations'],
+    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format', 'dilations'],
+    'tf.nn.softmax_cross_entropy_with_logits': ['_sentinel', 'labels', 'logits', 'dim', 'name', 'axis'],
+    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name', 'block_shape'],
     'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
-    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
+    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims', 'keepdims'],
     'tf.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
     'tf.pad': ['tensor', 'paddings', 'mode', 'name', 'constant_values'],
     'tf.parse_example': ['serialized', 'features', 'name', 'example_names'],
@@ -80,7 +100,7 @@ reorders = {
     'tf.random_poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
     'tf.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
-    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices', 'keepdims'],
     'tf.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
@@ -92,28 +112,31 @@ reorders = {
     'tf.serialize_sparse': ['sp_input', 'name', 'out_type'],
     'tf.shape': ['input', 'name', 'out_type'],
     'tf.size': ['input', 'name', 'out_type'],
-    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name', 'block_shape'],
     'tf.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
     'tf.sparse.add': ['a', 'b', 'threshold', 'thresh'],
-    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim', 'expand_nonconcat_dims'],
     'tf.sparse.reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
     'tf.sparse.segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
     'tf.sparse_add': ['a', 'b', 'threshold', 'thresh'],
-    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim', 'expand_nonconcat_dims'],
     'tf.sparse_matmul': ['a', 'b', 'transpose_a', 'transpose_b', 'a_is_sparse', 'b_is_sparse', 'name'],
     'tf.sparse_reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
     'tf.sparse_segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse_segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse_segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse_split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.string_split': ['source', 'delimiter', 'skip_empty'],
     'tf.strings.length': ['input', 'name', 'unit'],
-    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices', 'keepdims'],
     'tf.strings.substr': ['input', 'pos', 'len', 'name', 'unit'],
     'tf.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.test.assert_equal_graph_def': ['actual', 'expected', 'checkpoint_v2'],
     'tf.transpose': ['a', 'perm', 'name', 'conjugate'],
     'tf.tuple': ['tensors', 'name', 'control_inputs'],
+    'tf.uniform_unit_scaling_initializer': ['factor', 'seed', 'dtype'],
     'tf.while_loop': ['cond', 'body', 'loop_vars', 'shape_invariants', 'parallel_iterations', 'back_prop', 'swap_memory', 'name', 'maximum_iterations', 'return_same_structure']
 }
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
index 917236da4b4b75a1a1ca65e11d49d722cc178571..70a00e1220b30297c375f51c369f3f825c8aa2fc 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
@@ -126,21 +126,12 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       self.assertAllEqual(tf.squeeze(tf.expand_dims(a, 1), [1]).eval(),
                           a)
       self.assertAllEqual(
-          tf.expand_dims(
-              tf.squeeze(
-                  [[1, 2, 3]], squeeze_dims=[0]), dim=0).eval(),
-          a)
+          tf.expand_dims(tf.squeeze([[1, 2, 3]], axis=[0]), dim=0).eval(), a)
       self.assertAllEqual(
-          tf.squeeze(
-              tf.expand_dims(
-                  [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
-          a)
+          tf.squeeze(tf.expand_dims([[1, 2, 3]], dim=1), axis=[1]).eval(), a)
 
       self.assertAllEqual(
-          tf.squeeze(
-              tf.expand_dims(
-                  [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
-          a)
+          tf.squeeze(tf.expand_dims([[1, 2, 3]], dim=1), axis=[1]).eval(), a)
 
   @test_util.run_v1_only("b/120545219")
   def testArithmeticRenames(self):
@@ -203,7 +194,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       # make some variables
       _ = [tf.Variable([1, 2, 3], dtype=tf.float32),
            tf.Variable([1, 2, 3], dtype=tf.int32)]
-      s.run(tf.initialize_all_variables())
+      s.run(tf.global_variables_initializer())
       _ = [v.name for v in tf.all_variables()]
       _ = [v.name for v in tf.local_variables()]
 
@@ -211,7 +202,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   def testSummaries(self):
     with self.cached_session() as s:
       var = tf.Variable([1, 2, 3], dtype=tf.float32)
-      s.run(tf.initialize_all_variables())
+      s.run(tf.global_variables_initializer())
       x, y = np.meshgrid(np.linspace(-10, 10, 256), np.linspace(-10, 10, 256))
       image = np.sin(x**2 + y**2) / np.sqrt(x**2 + y**2) * .5 + .5
       image = image[None, :, :, None]
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
index 2663762aa70253f54037393c0cb3cd791a040d56..3f86596f9bcfbc410242fc58da508f68b3625f1c 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -28,13 +28,10 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   def setUp(self):
     tf.enable_eager_execution()
 
-  @test_util.run_v1_only("b/120545219")
   def testRenames(self):
-    with self.cached_session():
-      self.assertAllClose(1.04719755, tf.acos(0.5))
-      self.assertAllClose(0.5, tf.rsqrt(4.0))
+    self.assertAllClose(1.04719755, tf.acos(0.5))
+    self.assertAllClose(0.5, tf.rsqrt(4.0))
 
-  @test_util.run_v1_only("b/120545219")
   def testSerializeSparseTensor(self):
     sp_input = tf.SparseTensor(
         indices=tf.constant([[1]], dtype=tf.int64),
@@ -46,7 +43,6 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       self.assertEqual((3,), serialized_sp.shape)
       self.assertTrue(serialized_sp[0].numpy())  # check non-empty
 
-  @test_util.run_v1_only("b/120545219")
   def testSerializeManySparse(self):
     sp_input = tf.SparseTensor(
         indices=tf.constant([[0, 1]], dtype=tf.int64),
@@ -58,7 +54,6 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           sp_input, 'serialize_name', tf.string)
       self.assertEqual((1, 3), serialized_sp.shape)
 
-  @test_util.run_v1_only("b/120545219")
   def testArgMaxMin(self):
     self.assertAllClose(
         [1],
@@ -70,7 +65,6 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         [0],
         tf.argmin([[1, 3, 2]], name='abc', dimension=1))
 
-  @test_util.run_v1_only("b/120545219")
   def testSoftmaxCrossEntropyWithLogits(self):
     out = tf.nn.softmax_cross_entropy_with_logits(
         logits=[0.1, 0.8], labels=[0, 1])
@@ -79,6 +73,30 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         logits=[0.1, 0.8], labels=[0, 1])
     self.assertAllClose(out, 0.40318608)
 
+  def testLinearClassifier(self):
+    feature_column = tf.feature_column.numeric_column(
+        'feature', shape=(1,))
+
+    classifier = tf.estimator.LinearClassifier(
+        n_classes=2, feature_columns=[feature_column])
+
+    data = {'feature': [1, 20, 3]}
+    target = [0, 1, 0]
+    classifier.train(
+        input_fn=lambda: (data, target),
+        steps=100)
+    scores = classifier.evaluate(
+        input_fn=lambda: (data, target),
+        steps=100)
+    self.assertGreater(scores['accuracy'], 0.99)
+
+  def testUniformUnitScalingInitializer(self):
+    init = tf.initializers.uniform_unit_scaling(0.5, seed=1)
+    self.assertArrayNear(
+        [-0.45200047, 0.72815341],
+        init((2,)).numpy(),
+        err=1e-6)
+
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 241b08510f6b1c7b62ab3563752b042bd1366f99..5dd548c8214992e95774c477f52ac6fc22b1fb4c 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -177,12 +177,15 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
 
     # Warnings that should be printed if corresponding functions are used.
     self.function_warnings = {
-        "tf.reverse":
-            "ERROR: tf.reverse has had its argument semantics changed "
+        "tf.reverse": (
+            ast_edits.ERROR,
+            "tf.reverse has had its argument semantics changed "
             "significantly. The converter cannot detect this reliably, so "
-            "you need to inspect this usage manually.\n",
+            "you need to inspect this usage manually.\n"),
     }
 
+    self.module_deprecations = {}
+
 
 if __name__ == "__main__":
   parser = argparse.ArgumentParser(
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 3f9032bb330dc0b72820d39fcc24b84b5f598637..4835005e27e36fa804fe256202a5de7ae0f8cea7 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -19,14 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
+import functools
+import sys
 
 import pasta
-import six
 
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
 from tensorflow.tools.compatibility import reorders_v2
 
+# These pylint warnings are a mistake.
+# pylint: disable=g-explicit-bool-comparison,g-bool-id-comparison
+
 
 class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   """List of maps that describe what changed in the API."""
@@ -38,6 +42,18 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # Only keyword args are handled, so make sure to also put any function in
     # function_reorders to ensure that all args are made into keywords first.
     self.function_keyword_renames = {
+        "tf.string_split": {
+            "delimiter": "sep",
+        },
+        "tf.test.assert_equal_graph_def": {
+            "checkpoint_v2": None,
+        },
+        "tf.nn.embedding_lookup": {
+            "validate_indices": None,
+        },
+        "tf.image.sample_distorted_bounding_box": {
+            "seed2": None,
+        },
         "tf.gradients": {
             "colocate_gradients_with_ops": None,
         },
@@ -79,6 +95,12 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.image.extract_image_patches": {
             "ksizes": "sizes",
         },
+        "tf.image.resize": {
+            "align_corners": None,
+        },
+        "tf.image.resize_images": {
+            "align_corners": None,
+        },
         "tf.extract_image_patches": {
             "ksizes": "sizes",
         },
@@ -209,6 +231,15 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.max_pool_with_argmax": {
             "Targmax": "output_dtype",
         },
+        "tf.nn.max_pool": {
+            "value": "input"
+        },
+        "tf.nn.avg_pool": {
+            "value": "input"
+        },
+        "tf.nn.avg_pool2d": {
+            "value": "input"
+        },
         "tf.multinomial": {
             "output_dtype": "dtype",
         },
@@ -380,6 +411,38 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.weighted_moments": {
             "keep_dims": "keepdims"
         },
+        "tf.nn.conv1d": {
+            "value": "input",
+            "use_cudnn_on_gpu": None,
+        },
+        "tf.nn.conv2d": {
+            "filter": "filters",
+            "use_cudnn_on_gpu": None,
+        },
+        "tf.nn.conv2d_backprop_input": {
+            "use_cudnn_on_gpu": None,
+            "input_sizes": "output_shape",
+            "out_backprop": "input",
+            "filter": "filters",
+        },
+        "tf.contrib.summary.audio": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.histogram": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.image": {
+            "tensor": "data",
+            "bad_color": None,
+            "max_images": "max_outputs",
+            "family": None,
+        },
+        "tf.contrib.summary.scalar": {
+            "tensor": "data",
+            "family": None,
+        },
     }
 
     # pylint: disable=line-too-long
@@ -391,7 +454,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.batch_to_space_nd":
             "tf.batch_to_space",
         "tf.batch_gather":
-            "tf.gather",
+            "tf.compat.v1.batch_gather",
         "tf.space_to_batch_nd":
             "tf.space_to_batch",
         "tf.nn.space_to_batch":
@@ -408,6 +471,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.exists",
         "tf.gfile.Glob":
             "tf.io.gfile.glob",
+        "tf.gfile.GFile":
+            "tf.io.gfile.GFile",
         "tf.gfile.IsDirectory":
             "tf.io.gfile.isdir",
         "tf.gfile.ListDirectory":
@@ -416,6 +481,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.makedirs",
         "tf.gfile.MkDir":
             "tf.io.gfile.mkdir",
+        "tf.gfile.Open":
+            "tf.io.gfile.GFile",
         "tf.gfile.Remove":
             "tf.io.gfile.remove",
         "tf.gfile.Rename":
@@ -508,14 +575,64 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.data.experimental.unbatch",
         "tf.contrib.data.unique":
             "tf.data.experimental.unique",
+        "tf.contrib.estimator.make_early_stopping_hook":
+            "tf.estimator.experimental.make_early_stopping_hook",
+        "tf.contrib.estimator.stop_if_higher_hook":
+            "tf.estimator.experimental.stop_if_higher_hook",
+        "tf.contrib.estimator.stop_if_lower_hook":
+            "tf.estimator.experimental.stop_if_lower_hook",
+        "tf.contrib.estimator.stop_if_no_decrease_hook":
+            "tf.estimator.experimental.stop_if_no_decrease_hook",
+        "tf.contrib.estimator.stop_if_no_increase_hook":
+            "tf.estimator.experimental.stop_if_no_increase_hook",
+        "tf.contrib.framework.CriticalSection":
+            "tf.CriticalSection",
+        "tf.contrib.framework.is_tensor":
+            "tf.is_tensor",
+        "tf.contrib.framework.nest.assert_same_structure":
+            "tf.nest.assert_same_structure",
+        "tf.contrib.framework.nest.flatten":
+            "tf.nest.flatten",
+        "tf.contrib.framework.nest.is_sequence":
+            "tf.nest.is_nested",
+        "tf.contrib.framework.nest.map_structure":
+            "tf.nest.map_structure",
+        "tf.contrib.framework.nest.pack_sequence_as":
+            "tf.nest.pack_sequence_as",
+        "tf.contrib.util.constant_value":
+            "tf.get_static_value",
+        "tf.contrib.saved_model.load_keras_model":
+            "tf.keras.experimental.load_from_saved_model",
+        "tf.contrib.saved_model.save_keras_model":
+            "tf.keras.experimental.export_saved_model",
         "tf.contrib.rnn.RNNCell":
-            "tf.nn.rnn_cell.RNNCell",
+            "tf.compat.v1.nn.rnn_cell.RNNCell",
         "tf.contrib.rnn.LSTMStateTuple":
             "tf.nn.rnn_cell.LSTMStateTuple",
+        "tf.contrib.rnn.BasicLSTMCell":
+            "tf.compat.v1.nn.rnn_cell.BasicLSTMCell",
+        "tf.contrib.rnn.BasicRNNCell":
+            "tf.compat.v1.nn.rnn_cell.BasicRNNCell",
+        "tf.contrib.rnn.GRUCell":
+            "tf.compat.v1.nn.rnn_cell.GRUCell",
+        "tf.contrib.rnn.LSTMCell":
+            "tf.compat.v1.nn.rnn_cell.LSTMCell",
+        "tf.contrib.rnn.MultiRNNCell":
+            "tf.compat.v1.nn.rnn_cell.MultiRNNCell",
         "tf.contrib.framework.sort":
             "tf.sort",
         "tf.contrib.framework.argsort":
             "tf.argsort",
+        "tf.contrib.summary.audio":
+            "tf.compat.v2.summary.audio",
+        "tf.contrib.summary.histogram":
+            "tf.compat.v2.summary.histogram",
+        "tf.contrib.summary.image":
+            "tf.compat.v2.summary.image",
+        "tf.contrib.summary.initialize":
+            "tf.compat.v1.summary.initialize",
+        "tf.contrib.summary.scalar":
+            "tf.compat.v2.summary.scalar",
         "tf.count_nonzero":
             "tf.math.count_nonzero",
         "tf.manip.batch_to_space_nd":
@@ -538,6 +655,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.random.stateless_categorical",
         "tf.substr":
             "tf.strings.substr",
+        "tf.string_split":
+            "tf.strings.split",
         "tf.string_to_hash_bucket":
             "tf.strings.to_hash_bucket",
         "tf.string_to_number":
@@ -570,6 +689,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.compat.v1.nn.fused_batch_norm",
         "tf.nn.softmax_cross_entropy_with_logits_v2":
             "tf.nn.softmax_cross_entropy_with_logits",
+        "tf.nn.sigmoid_cross_entropy_with_logits":
+            "tf.nn.sigmoid_cross_entropy_with_logits",
         "tf.losses.Reduction.MEAN":
             "tf.compat.v1.losses.Reduction.MEAN",
         "tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS":
@@ -594,18 +715,12 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         # changed significantly.
         "tf.nn.ctc_loss":
             "tf.compat.v1.nn.ctc_loss",
-        "tf.zeros_initializer":
-            "tf.compat.v1.initializers.zeros",
-        "tf.ones_initializer":
-            "tf.compat.v1.initializers.ones",
-        "tf.constant_initializer":
-            "tf.compat.v1.initializers.constant",
-        "tf.random_uniform_initializer":
-            "tf.compat.v1.initializers.random_uniform",
-        "tf.random_normal_initializer":
-            "tf.compat.v1.initializers.random_normal",
-        "tf.truncated_normal_initializer":
-            "tf.compat.v1.initializers.truncated_normal",
+        # tf.saved_model.load in 1.x has no equivalent in 2.x, but there is a
+        # symbol with the same name.
+        "tf.saved_model.load":
+            "tf.compat.v1.saved_model.load",
+        "tf.saved_model.load_v2":
+            "tf.compat.v2.saved_model.load",
         "tf.image.resize_images":
             "tf.image.resize",
         "tf.random_poisson":
@@ -652,12 +767,126 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.compat.v1.debugging.assert_rank_in",
         "tf.assert_rank":
             "tf.compat.v1.assert_rank",
+        "tf.nn.max_pool":
+            "tf.nn.max_pool2d",
+        "tf.nn.avg_pool":
+            "tf.nn.avg_pool2d",
+        "tf.keras.initializers.zeros":
+            "tf.compat.v1.keras.initializers.zeros",
+        "tf.keras.initializers.Zeros":
+            "tf.compat.v1.keras.initializers.Zeros",
+        "tf.keras.initializers.ones":
+            "tf.compat.v1.keras.initializers.ones",
+        "tf.keras.initializers.Ones":
+            "tf.compat.v1.keras.initializers.Ones",
+        "tf.keras.initializers.constant":
+            "tf.compat.v1.keras.initializers.constant",
+        "tf.keras.initializers.Constant":
+            "tf.compat.v1.keras.initializers.Constant",
+        "tf.keras.initializers.VarianceScaling":
+            "tf.compat.v1.keras.initializers.VarianceScaling",
+        "tf.keras.initializers.Orthogonal":
+            "tf.compat.v1.keras.initializers.Orthogonal",
+        "tf.keras.initializers.orthogonal":
+            "tf.compat.v1.keras.initializers.orthogonal",
+        "tf.keras.initializers.Identity":
+            "tf.compat.v1.keras.initializers.Identity",
+        "tf.keras.initializers.identity":
+            "tf.compat.v1.keras.initializers.identity",
+        "tf.keras.initializers.glorot_uniform":
+            "tf.compat.v1.keras.initializers.glorot_uniform",
+        "tf.keras.initializers.glorot_normal":
+            "tf.compat.v1.keras.initializers.glorot_normal",
+        "tf.keras.initializers.lecun_normal":
+            "tf.compat.v1.keras.initializers.lecun_normal",
+        "tf.keras.initializers.lecun_uniform":
+            "tf.compat.v1.keras.initializers.lecun_uniform",
+        "tf.keras.initializers.he_normal":
+            "tf.compat.v1.keras.initializers.he_normal",
+        "tf.keras.initializers.he_uniform":
+            "tf.compat.v1.keras.initializers.he_uniform",
+        "tf.keras.initializers.TruncatedNormal":
+            "tf.compat.v1.keras.initializers.TruncatedNormal",
+        "tf.keras.initializers.truncated_normal":
+            "tf.compat.v1.keras.initializers.truncated_normal",
+        "tf.keras.initializers.RandomUniform":
+            "tf.compat.v1.keras.initializers.RandomUniform",
+        "tf.keras.initializers.uniform":
+            "tf.compat.v1.keras.initializers.uniform",
+        "tf.keras.initializers.random_uniform":
+            "tf.compat.v1.keras.initializers.random_uniform",
+        "tf.keras.initializers.RandomNormal":
+            "tf.compat.v1.keras.initializers.RandomNormal",
+        "tf.keras.initializers.normal":
+            "tf.compat.v1.keras.initializers.normal",
+        "tf.keras.initializers.random_normal":
+            "tf.compat.v1.keras.initializers.random_normal",
+        "tf.zeros_initializer":
+            "tf.compat.v1.zeros_initializer",
+        "tf.initializers.zeros":
+            "tf.compat.v1.initializers.zeros",
+        "tf.ones_initializer":
+            "tf.compat.v1.ones_initializer",
+        "tf.initializers.ones":
+            "tf.compat.v1.initializers.ones",
+        "tf.constant_initializer":
+            "tf.compat.v1.constant_initializer",
+        "tf.initializers.constant":
+            "tf.compat.v1.initializers.constant",
+        "tf.random_uniform_initializer":
+            "tf.compat.v1.random_uniform_initializer",
+        "tf.initializers.random_uniform":
+            "tf.compat.v1.initializers.random_uniform",
+        "tf.random_normal_initializer":
+            "tf.compat.v1.random_normal_initializer",
+        "tf.initializers.random_normal":
+            "tf.compat.v1.initializers.random_normal",
+        "tf.truncated_normal_initializer":
+            "tf.compat.v1.truncated_normal_initializer",
+        "tf.initializers.truncated_normal":
+            "tf.compat.v1.initializers.truncated_normal",
+        "tf.variance_scaling_initializer":
+            "tf.compat.v1.variance_scaling_initializer",
+        "tf.initializers.variance_scaling":
+            "tf.compat.v1.initializers.variance_scaling",
+        "tf.orthogonal_initializer":
+            "tf.compat.v1.orthogonal_initializer",
+        "tf.initializers.orthogonal":
+            "tf.compat.v1.initializers.orthogonal",
+        "tf.glorot_uniform_initializer":
+            "tf.compat.v1.glorot_uniform_initializer",
+        "tf.initializers.glorot_uniform":
+            "tf.compat.v1.initializers.glorot_uniform",
+        "tf.glorot_normal_initializer":
+            "tf.compat.v1.glorot_normal_initializer",
+        "tf.initializers.glorot_normal":
+            "tf.compat.v1.initializers.glorot_normal",
+        "tf.initializers.identity":
+            "tf.compat.v1.initializers.identity",
+        "tf.initializers.lecun_normal":
+            "tf.compat.v1.initializers.lecun_normal",
+        "tf.initializers.lecun_uniform":
+            "tf.compat.v1.initializers.lecun_uniform",
+        "tf.initializers.he_normal":
+            "tf.compat.v1.initializers.he_normal",
+        "tf.initializers.he_uniform":
+            "tf.compat.v1.initializers.he_uniform",
+        "tf.data.experimental.map_and_batch_with_legacy_function":
+            "tf.compat.v1.data.experimental.map_and_batch_with_legacy_function",
+        "tf.nn.conv2d_backprop_input":
+            "tf.nn.conv2d_transpose",
+        "tf.test.compute_gradient":
+            "tf.compat.v1.test.compute_gradient",
     }
     # pylint: enable=line-too-long
 
     # Mapping from function to the new name of the function
     self.symbol_renames = renames_v2.renames
     self.symbol_renames.update(self.manual_symbol_renames)
+    self.symbol_renames = {
+        name: new_name
+        for name, new_name in self.symbol_renames.items()
+    }
 
     # Variables that should be changed to functions.
     self.change_to_function = {}
@@ -674,12 +903,15 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.io.serialize_many_sparse",
         "tf.argmax",
         "tf.argmin",
-        "tf.batch_gather",
         "tf.batch_to_space",
         "tf.cond",
         "tf.nn.space_to_batch",
         "tf.boolean_mask",
         "tf.convert_to_tensor",
+        "tf.nn.conv1d",
+        "tf.nn.conv2d",
+        "tf.nn.conv2d_backprop_input",
+        "tf.nn.ctc_beam_search_decoder",
         "tf.nn.moments",
         "tf.nn.convolution",
         "tf.nn.crelu",
@@ -694,6 +926,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.feature_column.categorical_column_with_vocabulary_file",
         "tf.shape",
         "tf.size",
+        "tf.string_split",
         "tf.random.poisson",
         "tf.sparse.add",
         "tf.sparse_add",
@@ -742,6 +975,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.embedding_lookup_sparse",
         "tf.nn.in_top_k",
         "tf.nn.space_to_depth",
+        "tf.test.assert_equal_graph_def",
         "tf.linalg.norm",
         "tf.norm",
         "tf.reverse_sequence",
@@ -750,114 +984,116 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         # keyword arguments. Add keyword arguments in rare case when they
         # are not specified.
         "tf.nn.softmax_cross_entropy_with_logits",
+        "tf.nn.fractional_avg_pool",
+        "tf.nn.fractional_max_pool",
+        "tf.image.sample_distorted_bounding_box",
+        "tf.gradients",
+        "tf.hessians",
+        "tf.nn.max_pool",
+        "tf.nn.avg_pool",
+        "tf.estimator.LinearClassifier",
+        "tf.estimator.LinearRegressor",
+        "tf.estimator.DNNLinearCombinedClassifier",
+        "tf.estimator.DNNLinearCombinedRegressor",
+        "tf.estimator.DNNRegressor",
+        "tf.estimator.DNNClassifier",
+        "tf.estimator.BaselineClassifier",
+        "tf.estimator.BaselineRegressor",
+        "tf.initializers.uniform_unit_scaling",
+        "tf.uniform_unit_scaling_initializer",
     }
 
+    # Manual mapping of function names to be reordered to their list of argument
+    # names, in order. Only use this if argument names cannot be autodetected,
+    # e.g. if the functions are in contrib.
+    self.manual_function_reorders = {
+        "tf.contrib.summary.audio": [
+            "name", "tensor", "sample_rate", "max_outputs", "family", "step"],
+        "tf.contrib.summary.histogram": [
+            "name", "tensor", "family", "step"],
+        "tf.contrib.summary.image": [
+            "name", "tensor", "bad_color", "max_images", "family", "step"],
+        "tf.contrib.summary.scalar": [
+            "name", "tensor", "family", "step"],
+    }
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = reorders_v2.reorders
+    self.function_reorders = dict(reorders_v2.reorders)
+    self.function_reorders.update(self.manual_function_reorders)
 
-    # Specially handled functions (pasta version)
-    # Each transformer is a callable which will be called with the arguments
-    #   transformer(parent, node, full_name, name, logs, errors)
-    # Where logs and errors are lists to which (line, col, msg) tuples can be
-    # appended, full_name is the FQN of the function called (or None if that is
-    # unknown), name is the name of the function called (or None is that is
-    # unknown). node is an ast.Call node representing this function call, and
-    # parent is its parent in the AST.
-    # The function may modify node (but not parent), and must return
-    # - none, if nothing was modified
-    # - node, if node was modified in place (make sure to use
-    #   pasta.ast_utils.replace_child to swap out children, otherwise formatting
-    #   may get messy)
-    # - a replacement for node, if the whole call node was replaced. The caller
-    #   will take care of changing parent.
-    self.function_transformers = {
-        "tf.nn.dropout": self._dropout_transformer,
-        "tf.batch_gather": self._batch_gather_transformer,
-        "tf.to_bfloat16": self._cast_transformer,
-        "tf.to_complex128": self._cast_transformer,
-        "tf.to_complex64": self._cast_transformer,
-        "tf.to_double": self._cast_transformer,
-        "tf.to_float": self._cast_transformer,
-        "tf.to_int32": self._cast_transformer,
-        "tf.to_int64": self._cast_transformer,
-        "tf.nn.softmax_cross_entropy_with_logits":
-            self._softmax_cross_entropy_with_logits_transformer,
-        "tf.image.resize_area": self._image_resize_transformer,
-        "tf.image.resize_bicubic": self._image_resize_transformer,
-        "tf.image.resize_bilinear": self._image_resize_transformer,
-        "tf.image.resize_nearest_neighbor": self._image_resize_transformer,
+    contrib_warning = (
+        ast_edits.ERROR,
+        "<function name> cannot be converted automatically. tf.contrib will not"
+        " be distributed with TensorFlow 2.0, please consider an alternative in"
+        " non-contrib TensorFlow, a community-maintained repository, or fork "
+        "the required code."
+    )
 
-    }
+    flags_warning = (
+        ast_edits.ERROR,
+        "tf.flags has been removed, please use the argparse or absl"
+        " modules if you need command line parsing.")
 
     decay_function_comment = (
-        "WARNING: <function name> has been changed to return a callable instead"
-        " of a tensor when graph building, but its functionality remains "
-        "unchanged during eager execution (returns a callable like "
-        "before). The converter cannot detect and fix this reliably, so "
-        "this usage has been converted to compat.v1 (even though it may already"
-        " be correct).\n"
-    )
-
-    # TODO(b/118888586): add default value change to update script.
-    default_loss_reduction_changed = (
-        "WARNING: default value of loss_reduction has been changed to "
-        "SUM_OVER_BATCH_SIZE.\n"
+        ast_edits.INFO,
+        "To use learning rate decay schedules with TensorFlow 2.0, switch to "
+        "the schedules in `tf.keras.optimizers.schedules`.\n"
     )
 
     assert_return_type_comment = (
-        "WARNING: assert_* functions have been changed to return None, the "
+        ast_edits.INFO,
+        "<function name> has been changed to return None, the "
         "data argument has been removed, and arguments have been reordered."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
     assert_rank_comment = (
-        "WARNING: assert_rank_* functions have been changed to return None, and"
+        ast_edits.INFO,
+        "<function name> has been changed to return None, and"
         " the data and summarize arguments have been removed."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
-    tf_01s_like_no_optimize_comment = (
-        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
-        "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
-        "`input')."
-        "\nThe calls have been converted to compat.v1 for safety (even though "
-        " they may already have been correct)."
-    )
-
-    deprecate_partition_strategy_comment = (
-        "WARNING: `partition_strategy` has been removed from `%s` "
-        " The 'div' strategy is used by default.")
-
     initializers_no_dtype_comment = (
-        "WARNING: tf.initializers and tf.keras.initializers no longer have the "
+        ast_edits.INFO,
+        "Initializers no longer have the "
         "dtype argument in the constructor or partition_info argument in the "
-        "call method in TF 2.0 and after. The only API symbols are now "
-        "tf.keras.initializers.* or tf.initializers.*."
-        "\nThe calls have been converted to compat.v1 for safety (even though "
-        "they may already have been correct).")
-
-    uniform_unit_scaling_initializer_comment = (
-        "WARNING: uniform_unit_scaling_initializer has been removed. Please use"
-        " tf.initializers.variance_scaling instead with distribution=uniform "
-        "to get equivalent behaviour.")
+        "__call__ method.\nThe calls have been converted to compat.v1 for"
+        "safety (even though they may already have been correct).")
 
     metrics_comment = (
-        "WARNING: tf.metrics have been converted to object oriented versions in"
+        ast_edits.INFO,
+        "tf.metrics have been replaced with object oriented versions in"
         " TF 2.0 and after. The metric function calls have been converted to "
         "compat.v1 for backward compatibility. Please update these calls to "
         "the TF 2.0 versions.")
 
     losses_comment = (
-        "WARNING: tf.losses have been converted to object oriented versions in"
+        ast_edits.INFO,
+        "tf.losses have been replaced with object oriented versions in"
         " TF 2.0 and after. The loss function calls have been converted to "
         "compat.v1 for backward compatibility. Please update these calls to "
         "the TF 2.0 versions.")
 
+    # This could be done with a _rename_if_arg_not_found_transformer
+    deprecate_partition_strategy_comment = (
+        ast_edits.WARNING,
+        "`partition_strategy` has been removed from <function name>. "
+        " The 'div' strategy will be used by default.")
+
+    # make change instead
+    uniform_unit_scaling_initializer_comment = (
+        ast_edits.ERROR,
+        "uniform_unit_scaling_initializer has been removed. Please use"
+        " tf.initializers.variance_scaling instead with distribution=uniform "
+        "to get equivalent behaviour.")
+
+    # Make change instead (issue warning about strip_...)
     export_saved_model_renamed = (
+        ast_edits.ERROR,
         "(Manual edit required) Please rename the method export_savedmodel() "
         "to export_saved_model(). Two things to note:\n\t(1) The argument "
         "strip_default_attributes has been removed. The function will always "
@@ -866,24 +1102,13 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "only effects core estimator. If you are using "
         "tf.contrib.learn.Estimator, please switch to using core estimator.")
 
-    make_initializable_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_initializable_iterator()` method has been "
-        "removed. If you are using the Estimator API, you can return a dataset "
-        "directly from your input functions without creating an iterator. "
-        "As a last resort, please replace calls to that method on `dataset` "
-        "with a call to "
-        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
-
-    make_one_shot_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
-        "removed. If you are using eager execution, you can iterate over "
-        "`dataset` using a Python `for` loop. If you are using the Estimator "
-        "API, you can return a dataset directly from your input functions "
-        "without creating an iterator. As a last resort, please replace calls "
-        "to that method on `dataset` with a call to "
-        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+    # TODO(b/124529441): if possible eliminate need for manual checking.
+    contrib_summary_comment = (
+        ast_edits.WARNING,
+        "(Manual check required) tf.contrib.summary.* functions have been "
+        "migrated best-effort to tf.compat.v2.summary.* equivalents where "
+        "possible, but the resulting code may not always work. Please check "
+        "manually; you can report migration failures on b/124529441.")
 
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
@@ -892,18 +1117,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     self.function_warnings = {
         "*.export_savedmodel":
             export_saved_model_renamed,
-        "*.make_initializable_iterator":
-            make_initializable_iterator_deprecation,
-        "*.make_one_shot_iterator":
-            make_one_shot_iterator_deprecation,
-        "tf.assert_greater":
-            assert_return_type_comment,
         "tf.assert_equal":
             assert_return_type_comment,
         "tf.assert_none_equal":
             assert_return_type_comment,
-        "tf.assert_less":
-            assert_return_type_comment,
         "tf.assert_negative":
             assert_return_type_comment,
         "tf.assert_positive":
@@ -934,6 +1151,14 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_rank_comment,
         "tf.assert_rank_in":
             assert_rank_comment,
+        "tf.contrib.summary.audio":
+            contrib_summary_comment,
+        "tf.contrib.summary.histogram":
+            contrib_summary_comment,
+        "tf.contrib.summary.image":
+            contrib_summary_comment,
+        "tf.contrib.summary.scalar":
+            contrib_summary_comment,
         "tf.debugging.assert_equal":
             assert_return_type_comment,
         "tf.debugging.assert_greater":
@@ -968,13 +1193,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_rank_comment,
         "tf.debugging.assert_rank_in":
             assert_rank_comment,
-        "tf.device":
-            "tf.device no longer takes function as an argument. "
-            "'devide_name_or_function' argument has been renamed to "
-            "'device_name'.",
-        "tf.flags":
-            "tf.flags has been removed, please use the argparse or absl"
-            " module if you need command line parsing.",
         "tf.train.exponential_decay":
             decay_function_comment,
         "tf.train.piecewise_constant_decay":
@@ -993,71 +1211,16 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             decay_function_comment,
         "tf.train.noisy_linear_cosine_decay":
             decay_function_comment,
-        "tf.estimator.LinearClassifier":
-            default_loss_reduction_changed,
-        "tf.estimator.LinearRegressor":
-            default_loss_reduction_changed,
-        "tf.estimator.DNNLinearCombinedClassifier":
-            default_loss_reduction_changed,
-        "tf.estimator.DNNLinearCombinedRegressor":
-            default_loss_reduction_changed,
-        "tf.estimator.DNNRegressor":
-            default_loss_reduction_changed,
-        "tf.estimator.DNNClassifier":
-            default_loss_reduction_changed,
-        "tf.estimator.BaselineClassifier":
-            default_loss_reduction_changed,
-        "tf.estimator.BaselineRegressor":
-            default_loss_reduction_changed,
-        "tf.nn.conv1d":
-            "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
-            " was renamed to \"input\"",
-        "tf.nn.conv2d":
-            "WARNING: use_cudnn_on_gpu argument has been removed and "
-            "\"filter\" was renamed to \"filters\"",
-        "tf.nn.conv2d_backprop_filter":
-            "WARNING: use_cudnn_on_gpu argument has been removed",
-        "tf.nn.conv2d_backprop_input":
-            "WARNING: use_cudnn_on_gpu argument has been removed and "
-            "\"filter\" was renamed to \"filters\"",
-        "tf.nn.erosion2d":
-            "WARNING: <function name> now requires a data_format argument",
+        "tf.nn.embedding_lookup":
+            deprecate_partition_strategy_comment,
+        "tf.nn.embedding_lookup_sparse":
+            deprecate_partition_strategy_comment,
         "tf.nn.nce_loss":
-            deprecate_partition_strategy_comment % "tf.nn.nce_loss",
+            deprecate_partition_strategy_comment,
         "tf.nn.safe_embedding_lookup_sparse":
-            deprecate_partition_strategy_comment %
-            "tf.nn.safe_embedding_lookup_sparse",
+            deprecate_partition_strategy_comment,
         "tf.nn.sampled_softmax_loss":
-            deprecate_partition_strategy_comment % "tf.nn.sampled_softmax_loss",
-        "tf.zeros_like":
-            tf_01s_like_no_optimize_comment,
-        "tf.ones_like":
-            tf_01s_like_no_optimize_comment,
-        "tf.nn.embedding_lookup":
-            "WARNING: validate_indices argument has been removed.",
-        "tf.while_loop":
-            "tf.while_loop no longer takes 'return_same_structure' argument. "
-            "'return_same_structure' now defaults to True. Also, 'name'"
-            "argument is now the last argument.",
-        "tf.image.sample_distorted_bounding_box":
-            "tf.image.sample_distorted_bounding_box no longer takes 'seed2' "
-            "argument.",
-        "tf.nn.ctc_beam_search_decoder":
-            "tf.nn.ctc_beam_search_decoder no longer takes 'merge_repeated' "
-            "argument. 'merge_repeated' now defaults to False.",
-        "tf.nn.fractional_avg_pool":
-            "tf.nn.fractional_avg_pool no longer takes 'seed2' and "
-            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
-            "'seed' is zero, the execution is random and deterministic "
-            "otherwise",
-        "tf.nn.fractional_max_pool":
-            "tf.nn.fractional_max_pool no longer takes 'seed2' and "
-            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
-            "'seed' is zero, the execution is random and deterministic "
-            "otherwise",
-        "tf.test.assert_equal_graph_def":
-            "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
-            "argument. 'checkpoint_v2' now defaults to True.",
+            deprecate_partition_strategy_comment,
         "tf.keras.initializers.Zeros":
             initializers_no_dtype_comment,
         "tf.keras.initializers.zeros":
@@ -1126,10 +1289,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             initializers_no_dtype_comment,
         "tf.initializers.glorot_normal":
             initializers_no_dtype_comment,
-        "tf.initializers.uniform_unit_scaling":
-            uniform_unit_scaling_initializer_comment,
-        "tf.uniform_unit_scaling_initializer":
-            uniform_unit_scaling_initializer_comment,
         "tf.losses.absolute_difference":
             losses_comment,
         "tf.losses.add_loss":
@@ -1228,210 +1387,853 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             metrics_comment,
         "tf.metrics.true_positives_at_thresholds":
             metrics_comment,
+        "tf.get_variable":
+            (ast_edits.WARNING,
+             "<function name> returns ResourceVariables by default in 2.0, "
+             "which have well-defined semantics and are stricter about shapes. "
+             "You can disable this behavior by passing use_resource=False, or "
+             "by calling tf.compat.v1.disable_resource_variables().")
     }
 
     # Warnings that are emitted only if a specific arg is found.
     self.function_arg_warnings = {
+        "tf.nn.conv1d": {
+            ("use_cudnn_on_gpu", 4): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d": {
+            ("use_cudnn_on_gpu", 4): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d_backprop_filter": {
+            ("use_cudnn_on_gpu", 5): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d_backprop_input": {
+            ("use_cudnn_on_gpu", 5): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
         "tf.gradients": {
-            ("colocate_gradients_with_ops", 4):
+            ("colocate_gradients_with_ops", 4): (
+                ast_edits.INFO,
                 "tf.gradients no longer takes "
                 "'colocate_gradients_with_ops' argument, it behaves as if it "
-                "was set to True.",
+                "was set to True."),
         },
         "*.minimize": {
-            ("colocate_gradients_with_ops", 5):
+            ("colocate_gradients_with_ops", 5): (
+                ast_edits.INFO,
                 "Optimizer.minimize no longer takes "
                 "'colocate_gradients_with_ops' argument, it behaves as if it "
-                "was set to True.",
+                "was set to True."),
         },
         "*.compute_gradients": {
-            ("colocate_gradients_with_ops", 4):
+            ("colocate_gradients_with_ops", 4): (
+                ast_edits.INFO,
                 "Optimizer.compute_gradients no "
                 "longer takes 'colocate_gradients_with_ops' argument, it "
-                "behaves as if it was set to True.",
+                "behaves as if it was set to True."),
         },
         "tf.cond": {
-            ("strict", 3):
+            ("strict", 3): (
+                ast_edits.WARNING,
                 "tf.cond no longer takes 'strict' argument, it behaves as "
-                "if was set to True."
+                "if was set to True.")
+        },
+        "tf.contrib.summary.audio": {
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.histogram": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.image": {
+            ("bad_color", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.image no longer takes the 'bad_color' "
+                "argument; caller must now preprocess if needed. This call "
+                "site specifies a bad_color argument so it cannot be converted "
+                "safely."),
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.scalar": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.image.resize": {
+            ("align_corners",
+             3): (ast_edits.WARNING,
+                  "align_corners is not supported by tf.image.resize, the new "
+                  "default transformation is close to what v1 provided. If you "
+                  "require exactly the same transformation as before, use "
+                  "compat.v1.image.resize."),
+        },
+        "tf.image.resize_bilinear": {
+            ("align_corners",
+             2): (ast_edits.WARNING,
+                  "align_corners is not supported by tf.image.resize, the new "
+                  "default transformation is close to what v1 provided. If you "
+                  "require exactly the same transformation as before, use "
+                  "compat.v1.image.resize_bilinear."),
+        },
+        "tf.image.resize_area": {
+            ("align_corners",
+             2): (ast_edits.WARNING,
+                  "align_corners is not supported by tf.image.resize, the new "
+                  "default transformation is close to what v1 provided. If you "
+                  "require exactly the same transformation as before, use "
+                  "compat.v1.image.resize_area."),
+        },
+        "tf.image.resize_bicubic": {
+            ("align_corners",
+             2): (ast_edits.WARNING,
+                  "align_corners is not supported by tf.image.resize, the new "
+                  "default transformation is close to what v1 provided. If you "
+                  "require exactly the same transformation as before, use "
+                  "compat.v1.image.resize_bicubic."),
+        },
+        "tf.image.resize_nearest_neighbor": {
+            ("align_corners",
+             2): (ast_edits.WARNING,
+                  "align_corners is not supported by tf.image.resize, the new "
+                  "default transformation is close to what v1 provided. If you "
+                  "require exactly the same transformation as before, use "
+                  "compat.v1.image.resize_nearest_neighbor."),
         },
     }
 
-    self.symbol_renames = {
-        name: new_name
-        for name, new_name in self.symbol_renames.items()
+    # Specially handled functions
+    # Each transformer is a callable which will be called with the arguments
+    #   transformer(parent, node, full_name, name, logs)
+    # Where logs is a list to which (level, line, col, msg) tuples can be
+    # appended, full_name is the FQN of the function called (or None if that is
+    # unknown), name is the name of the function called (or None is that is
+    # unknown). node is an ast.Call node representing this function call, and
+    # parent is its parent in the AST.
+    # The function may modify node (but not parent), and must return
+    # - none, if nothing was modified
+    # - node, if node was modified in place (make sure to use
+    #   pasta.ast_utils.replace_child to swap out children, otherwise formatting
+    #   may get messy)
+    # - a replacement for node, if the whole call node was replaced. The caller
+    #   will take care of changing parent.
+    self.function_transformers = {
+        "*.make_initializable_iterator": _iterator_transformer,
+        "*.make_one_shot_iterator": _iterator_transformer,
+        "tf.nn.dropout": _dropout_transformer,
+        "tf.to_bfloat16": _cast_transformer,
+        "tf.to_complex128": _cast_transformer,
+        "tf.to_complex64": _cast_transformer,
+        "tf.to_double": _cast_transformer,
+        "tf.to_float": _cast_transformer,
+        "tf.to_int32": _cast_transformer,
+        "tf.to_int64": _cast_transformer,
+        "tf.nn.softmax_cross_entropy_with_logits":
+            _softmax_cross_entropy_with_logits_transformer,
+        "tf.image.extract_glimpse": _extract_glimpse_transformer,
+        "tf.image.resize_area": _image_resize_transformer,
+        "tf.image.resize_bicubic": _image_resize_transformer,
+        "tf.image.resize_bilinear": _image_resize_transformer,
+        "tf.image.resize_nearest_neighbor": _image_resize_transformer,
+        "tf.nn.fractional_avg_pool": _pool_seed_transformer,
+        "tf.nn.fractional_max_pool": _pool_seed_transformer,
+        "tf.name_scope": _name_scope_transformer,
+        "tf.estimator.DNNEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNEstimator no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNClassifier no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNRegressor no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearEstimator no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearClassifier no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearRegressor no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedEstimator no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedClassifier no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedRegressor no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.string_split": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="skip_empty",
+            arg_ok_predicate=_is_ast_false, remove_if_ok=True,
+            message="tf.string_split's replacement no longer takes the "
+            "skip_empty argument. Since the argument was present, the call was "
+            "converted to compat.v1."),
+        "tf.device": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="device_name",
+            arg_ok_predicate=_is_ast_str, remove_if_ok=False,
+            message="tf.device no longer takes functions as an argument. "
+            "We could not determine that the argument value is a string, so "
+            "the call was converted to compat.v1."),
+        "tf.zeros_like": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="optimize",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.zeros_like no longer takes an optimize argument, and "
+            "behaves as if optimize=True. This call site specifies something "
+            "other than optimize=True, so it was converted to compat.v1."),
+        "tf.ones_like": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="optimize",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.ones_like no longer takes an optimize argument, and "
+            "behaves as if optimize=True. This call site specifies something "
+            "other than optimize=True, so it was converted to compat.v1."),
+        "tf.while_loop": functools.partial(
+            _rename_if_arg_found_transformer,
+            arg_name="return_same_structure",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.while_loop no longer takes 'return_same_structure' "
+            "argument and behaves as if return_same_structure=True. This call "
+            "site specifies something other than return_same_structure=True, "
+            "so it was converted to compat.v1."),
+        "tf.nn.ctc_beam_search_decoder": functools.partial(
+            _rename_if_arg_found_transformer,
+            arg_name="merge_repeated",
+            arg_ok_predicate=_is_ast_false, remove_if_ok=True,
+            message="tf.nn.ctc_beam_search_decoder no longer takes the "
+            "'merge_repeated' argument and behaves as if merge_repeated=False. "
+            "This call site specifies something other than "
+            "merge_repeated=False, so it was converted to compat.v1."),
+        "tf.nn.erosion2d": functools.partial(
+            _add_argument_transformer,
+            arg_name="data_format",
+            arg_value_ast=ast.Str("NHWC")),
+        "tf.contrib.summary.audio": _add_summary_step_transformer,
+        "tf.contrib.summary.histogram": _add_summary_step_transformer,
+        "tf.contrib.summary.image": _add_summary_step_transformer,
+        "tf.contrib.summary.scalar": _add_summary_step_transformer,
+        "tf.estimator.BaselineClassifier": _add_loss_reduction_transformer,
+        "tf.estimator.BaselineRegressor": _add_loss_reduction_transformer,
+        "tf.initializers.uniform_unit_scaling":
+            _add_uniform_scaling_initializer_transformer,
+        "tf.uniform_unit_scaling_initializer":
+            _add_uniform_scaling_initializer_transformer,
+    }
+
+    self.module_deprecations = {
+        "tf.contrib": contrib_warning,
+        "tf.flags": flags_warning,
     }
 
-  @staticmethod
-  def _dropout_transformer(parent, node, full_name, name, logs, errors):
-    def _replace_keep_prob_node(parent, old_value):
-      """Replaces old_value with 1-(old_value)."""
-      one = ast.Num(n=1)
-      one.lineno = 0
-      one.col_offset = 0
-      new_value = ast.BinOp(left=one, op=ast.Sub(),
-                            right=old_value)
-      # This copies the prefix and suffix on old_value to new_value.
-      pasta.ast_utils.replace_child(parent, old_value, new_value)
-      ast.copy_location(new_value, old_value)
-      # Put parentheses around keep_prob.value (and remove the old prefix/
-      # suffix, they should only be around new_value).
-      pasta.base.formatting.set(old_value, "prefix", "(")
-      pasta.base.formatting.set(old_value, "suffix", ")")
-
-    # Check if we have a keep_prob keyword arg
-    for keep_prob in node.keywords:
-      if keep_prob.arg == "keep_prob":
-        logs.append((node.lineno, node.col_offset,
-                     "Changing keep_prob arg of tf.nn.dropout to rate, and "
-                     "recomputing value. Please check this transformation.\n"))
-        keep_prob.arg = "rate"
-        _replace_keep_prob_node(keep_prob, keep_prob.value)
-        return node
-
-    # Maybe it was a positional arg
-    if len(node.args) < 2:
-      errors.append((node.lineno, node.col_offset,
-                     "ERROR: tf.nn.dropout called without arguments, so "
-                     "automatic fix was disabled. tf.nn.dropout has changed "
-                     "the semantics of the second argument."))
+
+def _is_ast_str(node):
+  """Determine whether this node represents a string."""
+  allowed_types = [ast.Str]
+  if hasattr(ast, "Bytes"):
+    allowed_types += [ast.Bytes]
+  if hasattr(ast, "JoinedStr"):
+    allowed_types += [ast.JoinedStr]
+  if hasattr(ast, "FormattedValue"):
+    allowed_types += [ast.FormattedValue]
+  return isinstance(node, allowed_types)
+
+
+def _is_ast_true(node):
+  if hasattr(ast, "NameConstant"):
+    return isinstance(node, ast.NameConstant) and node.value is True
+  else:
+    return isinstance(node, ast.Name) and node.id == "True"
+
+
+def _is_ast_false(node):
+  if hasattr(ast, "NameConstant"):
+    return isinstance(node, ast.NameConstant) and node.value is False
+  else:
+    return isinstance(node, ast.Name) and node.id == "False"
+
+
+# Lots of unused arguments below, since these are called in a standard manner.
+# pylint: disable=unused-argument
+
+
+def _rename_if_arg_found_transformer(parent, node, full_name, name, logs,
+                                     arg_name=None,
+                                     arg_ok_predicate=None,
+                                     remove_if_ok=False,
+                                     message=None):
+  """Replaces the given call with tf.compat.v1 if the given arg is found.
+
+  This requires the function to be called with all named args, so for using
+  this transformer, the function should also be added to renames.
+
+  If the arg is not found, the call site is left alone.
+
+  If the arg is found, and if arg_ok_predicate is given, it is called with
+  the ast Expression representing the argument value found. If it returns
+  True, the function is left alone.
+
+  If the arg is found, arg_ok_predicate is not None and returns ok, and
+  remove_if_ok is True, the argument is removed from the call.
+
+  Otherwise, `compat.v1` is inserted between tf and the function name.
+
+  Args:
+    parent: Parent of node.
+    node: ast.Call node to maybe modify.
+    full_name: full name of function to modify
+    name: name of function to modify
+    logs: list of logs to append to
+    arg_name: name of the argument to look for
+    arg_ok_predicate: predicate callable with the ast of the argument value,
+      returns whether the argument value is allowed.
+    remove_if_ok: remove the argument if present and ok as determined by
+      arg_ok_predicate.
+    message: message to print if a non-ok arg is found (and hence, the function
+      is renamed to its compat.v1 version).
+
+  Returns:
+    node, if it was modified, else None.
+  """
+  # Check whether arg is there.
+  arg_present, arg_value = ast_edits.get_arg_value(node, arg_name)
+  if not arg_present:
+    return
+
+  # Check whether arg is problematic (and if not, maybe remove it).
+  if arg_ok_predicate and arg_ok_predicate(arg_value):
+    if remove_if_ok:
+      for i, kw in enumerate(node.keywords):
+        if kw.arg == arg_name:
+          node.keywords.pop(i)
+          logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                       "Removed argument %s for function %s" % (
+                           arg_name, full_name or name)))
+          break
+      return node
     else:
-      _replace_keep_prob_node(node, node.args[1])
-      logs.append((node.lineno, node.col_offset,
-                   "Changing keep_prob arg of tf.nn.dropout to rate, and "
-                   "recomputing value.\n"))
-      errors.append((node.lineno, node.col_offset,
-                     "WARNING: tf.nn.dropout has changed the semantics of the "
-                     "second argument. Please check the applied transformation."
-                    ))
+      return
+
+  # All conditions met, insert v1 and log what we did.
+  # We must have a full name, so the func is an attribute.
+  new_name = full_name.replace("tf.", "tf.compat.v1.", 1)
+  node.func = ast_edits.full_name_node(new_name)
+  logs.append((
+      ast_edits.INFO, node.lineno, node.col_offset,
+      "Renaming %s to %s because argument %s is present. %s" %
+      (full_name, new_name, arg_name, message if message is not None else "")
+  ))
+  return node
+
+
+def _add_argument_transformer(parent, node, full_name, name, logs,
+                              arg_name, arg_value_ast):
+  """Adds an argument (as a final kwarg arg_name=arg_value_ast)."""
+  node.keywords.append(ast.keyword(arg=arg_name, value=arg_value_ast))
+  logs.append((
+      ast_edits.INFO, node.lineno, node.col_offset,
+      "Adding argument '%s' to call to %s." % (pasta.dump(node.keywords[-1],
+                                                          full_name or name))
+  ))
+  return node
+
+
+def _iterator_transformer(parent, node, full_name, name, logs):
+  """Transform iterator methods to compat function calls."""
+  # First, check that node.func.value is not already something we like
+  # (tf.compat.v1.data), or something which is handled in the rename
+  # (tf.data). This transformer only handles the method call to function call
+  # conversion.
+  if full_name and (full_name.startswith("tf.compat.v1.data") or
+                    full_name.startswith("tf.data")):
+    return
+
+  # This should never happen, since we're only called for Attribute nodes.
+  if not isinstance(node.func, ast.Attribute):
+    return
+
+  # Transform from x.f(y) to tf.compat.v1.data.f(x, y)
+  # Fortunately, node.func.value should already have valid position info
+  node.args = [node.func.value] + node.args
+  node.func.value = ast_edits.full_name_node("tf.compat.v1.data")
+
+  logs.append((ast_edits.WARNING, node.lineno, node.col_offset,
+               "Changing dataset.%s() to tf.compat.v1.data.%s(dataset). "
+               "Please check this transformation.\n" % (name, name)))
+
+  return node
+
+
+def _dropout_transformer(parent, node, full_name, name, logs):
+  """Replace keep_prob with 1-rate."""
+  def _replace_keep_prob_node(parent, old_value):
+    """Replaces old_value with 1-(old_value)."""
+    one = ast.Num(n=1)
+    one.lineno = 0
+    one.col_offset = 0
+    new_value = ast.BinOp(left=one, op=ast.Sub(),
+                          right=old_value)
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    # Put parentheses around keep_prob.value (and remove the old prefix/
+    # suffix, they should only be around new_value).
+    pasta.base.formatting.set(old_value, "prefix", "(")
+    pasta.base.formatting.set(old_value, "suffix", ")")
+
+  # Check if we have a keep_prob keyword arg
+  for keep_prob in node.keywords:
+    if keep_prob.arg == "keep_prob":
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "Changing keep_prob arg of tf.nn.dropout to rate\n"))
+      keep_prob.arg = "rate"
+      _replace_keep_prob_node(keep_prob, keep_prob.value)
       return node
 
-  @staticmethod
-  def _cast_transformer(parent, node, full_name, name, logs, errors):
-    """Transforms to_int and to_float to cast(..., dtype=...)."""
-
-    # Find out the dtype to cast to from the function name
-    dtype_str = name[3:]
-    # Special cases where the full dtype is not given
-    if dtype_str == "float":
-      dtype_str = "float32"
-    elif dtype_str == "double":
-      dtype_str = "float64"
-    new_arg = ast.keyword(arg="dtype",
-                          value=ast.Attribute(value=ast.Name(id="tf",
-                                                             ctx=ast.Load()),
-                                              attr=dtype_str, ctx=ast.Load()))
-    # Ensures a valid transformation when a positional name arg is given
-    if len(node.args) == 2:
-      name_arg = ast.keyword(arg="name",
-                             value=node.args[-1])
-      node.args = node.args[:-1]
-      node.keywords.append(name_arg)
-
-    # Python3 ast requires the args for the Attribute, but codegen will mess up
-    # the arg order if we just set them to 0.
-    new_arg.value.lineno = node.lineno
-    new_arg.value.col_offset = node.col_offset+100
-
-    node.keywords.append(new_arg)
-    if isinstance(node.func, ast.Attribute):
-      node.func.attr = "cast"
-    else:
-      assert isinstance(node.func, ast.Name)
-      node.func.id = "cast"
+  # Maybe it was a positional arg
+  if len(node.args) < 2:
+    logs.append((ast_edits.ERROR, node.lineno, node.col_offset,
+                 "tf.nn.dropout called without arguments, so "
+                 "automatic fix was disabled. tf.nn.dropout has changed "
+                 "the semantics of the second argument."))
+  else:
+    _replace_keep_prob_node(node, node.args[1])
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changing keep_prob arg of tf.nn.dropout to rate, and "
+                 "recomputing value.\n"))
 
-    logs.append((node.lineno, node.col_offset,
-                 "Changed %s call to tf.cast(..., dtype=tf.%s)." % (full_name,
-                                                                    dtype_str)))
     return node
 
-  @staticmethod
-  def _softmax_cross_entropy_with_logits_transformer(
-      parent, node, full_name, name, logs, errors):
-    def _wrap_label(parent, old_value):
-      """Wrap labels with tf.stop_gradient."""
-      if six.PY3:
-        new_value = ast.Call(
-            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
-            [old_value], [])
-      else:
-        new_value = ast.Call(
-            ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
-            [old_value], [], None, None)
-
-      # This copies the prefix and suffix on old_value to new_value.
-      pasta.ast_utils.replace_child(parent, old_value, new_value)
-      ast.copy_location(new_value, old_value)
-
-    # Check if we have a labels keyword arg
-    for karg in node.keywords:
-      if karg.arg == "labels":
-        logs.append((node.lineno, node.col_offset,
+
+def _cast_transformer(parent, node, full_name, name, logs):
+  """Transforms to_int and to_float to cast(..., dtype=...)."""
+
+  # Find out the dtype to cast to from the function name
+  dtype_str = name[3:]
+  # Special cases where the full dtype is not given
+  if dtype_str == "float":
+    dtype_str = "float32"
+  elif dtype_str == "double":
+    dtype_str = "float64"
+  new_arg = ast.keyword(arg="dtype",
+                        value=ast.Attribute(value=ast.Name(id="tf",
+                                                           ctx=ast.Load()),
+                                            attr=dtype_str, ctx=ast.Load()))
+  # Ensures a valid transformation when a positional name arg is given
+  if len(node.args) == 2:
+    name_arg = ast.keyword(arg="name",
+                           value=node.args[-1])
+    node.args = node.args[:-1]
+    node.keywords.append(name_arg)
+
+  # Python3 ast requires the args for the Attribute, but codegen will mess up
+  # the arg order if we just set them to 0.
+  new_arg.value.lineno = node.lineno
+  new_arg.value.col_offset = node.col_offset+100
+
+  node.keywords.append(new_arg)
+  if isinstance(node.func, ast.Attribute):
+    node.func.attr = "cast"
+  else:
+    assert isinstance(node.func, ast.Name)
+    node.func.id = "cast"
+
+  logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+               "Changed %s call to tf.cast(..., dtype=tf.%s)." % (full_name,
+                                                                  dtype_str)))
+  return node
+
+
+def _softmax_cross_entropy_with_logits_transformer(
+    parent, node, full_name, name, logs):
+  """Wrap labels argument with stop_gradients."""
+  def _wrap_label(parent, old_value):
+    """Wrap labels with tf.stop_gradient."""
+    already_stop_grad = (isinstance(old_value, ast.Call) and
+                         isinstance(old_value.func, ast.Attribute) and
+                         old_value.func.attr == "stop_gradient" and
+                         isinstance(old_value.func.value, ast.Name) and
+                         old_value.func.value.id == "tf")
+    if already_stop_grad:
+      return False
+    try:
+      new_value = ast.Call(
+          ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+          [old_value], [])
+    except TypeError:
+      new_value = ast.Call(
+          ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+          [old_value], [], None, None)
+
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    return True
+
+  # Check if we have a labels keyword arg
+  for karg in node.keywords:
+    if karg.arg == "labels":
+      if _wrap_label(karg, karg.value):
+        logs.append((ast_edits.INFO, node.lineno, node.col_offset,
                      "Changing labels arg of "
                      "tf.nn.softmax_cross_entropy_with_logits to "
                      "tf.stop_gradient(labels). Please check this "
                      "transformation.\n"))
-        _wrap_label(karg, karg.value)
-        return node
+      return node
+  return node
+
+
+def _image_resize_transformer(parent, node, full_name, name, logs):
+  """Transforms image.resize_* to image.resize(..., method=*, ...)."""
+  resize_method = name[7:].upper()
+  new_arg = ast.keyword(arg="method",
+                        value=ast.Attribute(
+                            value=ast.Attribute(
+                                value=ast.Attribute(
+                                    value=ast.Name(id="tf", ctx=ast.Load()),
+                                    attr="image", ctx=ast.Load()),
+                                attr="ResizeMethod", ctx=ast.Load()),
+                            attr=resize_method, ctx=ast.Load()))
+
+  # Ensures a valid transformation when a positional name arg is given
+  if len(node.args) == 4:
+    pos_arg = ast.keyword(arg="preserve_aspect_ratio",
+                          value=node.args[-1])
+    node.args = node.args[:-1]
+    node.keywords.append(pos_arg)
+  if len(node.args) == 3:
+    pos_arg = ast.keyword(arg="align_corners",
+                          value=node.args[-1])
+    node.args = node.args[:-1]
+
+  new_keywords = []
+  for kw in node.keywords:
+    if kw.arg != "align_corners":
+      new_keywords.append(kw)
+  node.keywords = new_keywords
+
+  # Python3 ast requires the args for the Attribute, but codegen will mess up
+  # the arg order if we just set them to 0.
+  new_arg.value.lineno = node.lineno
+  new_arg.value.col_offset = node.col_offset+100
+
+  node.keywords.append(new_arg)
+  if isinstance(node.func, ast.Attribute):
+    node.func.attr = "resize"
+  else:
+    assert isinstance(node.func, ast.Name)
+    node.func.id = "resize"
+
+  logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+               "Changed %s call to tf.image.resize(..., "
+               "method=tf.image.ResizeMethod.%s)." % (full_name,
+                                                      resize_method)))
+  return node
+
+
+def _pool_seed_transformer(parent, node, full_name, name, logs):
+  """Removes seed2 and deterministic, and adds non-zero seed if needed."""
+  # This requires that this function uses all kwargs (add to renames!).
+  seed_arg = None
+  deterministic = False
+  modified = False
+  new_keywords = []
+
+  for kw in node.keywords:
+    if sys.version_info[:2] >= (3, 5) and isinstance(kw, ast.Starred):
+      pass
+    elif kw.arg == "seed":
+      seed_arg = kw
+    elif kw.arg == "seed2" or kw.arg == "deterministic":
+      lineno = getattr(kw, "lineno", node.lineno)
+      col_offset = getattr(kw, "col_offset", node.col_offset)
+      logs.append((ast_edits.INFO, lineno, col_offset,
+                   "Removed argument %s for function %s" % (
+                       kw.arg, full_name or name)))
+      if kw.arg == "deterministic":
+        if not _is_ast_false(kw.value):
+          deterministic = True
+      modified = True
+      continue
+    new_keywords.append(kw)
+
+  if deterministic:
+    if seed_arg is None:
+      new_keywords.append(ast.keyword(arg="seed", value=ast.Num(42)))
+      logs.add((
+          ast_edits.INFO, node.lineno, node.col_offset,
+          "Adding seed=42 to call to %s since determinism was requested" % (
+              full_name or name)
+      ))
+    else:
+      logs.add((
+          ast_edits.WARNING, node.lineno, node.col_offset,
+          "The deterministic argument is deprecated for %s, pass a "
+          "non-zero seed for determinism. The deterministic argument is "
+          "present, possibly not False, and the seed is already set. The "
+          "converter cannot determine whether it is nonzero, please check."
+      ))
+
+  if modified:
+    node.keywords = new_keywords
     return node
+  else:
+    return
+
+
+def _extract_glimpse_transformer(parent, node, full_name, name, logs):
 
-  @staticmethod
-  def _batch_gather_transformer(parent, node, full_name, name, logs, errors):
-    # Check if the call already has a batch_dims argument
-    if any([kw.arg == "batch_dims" for kw in node.keywords]):
-      logs.append((node.lineno, node.col_offset, "tf.batch_gather already has "
-                   "batch_dims argument. Neat."))
-      return None
-
-    minus_one = ast.Num(n=-1)
-    minus_one.lineno = 0
-    minus_one.col_offset = 0
-    new_arg = ast.keyword("batch_dims", minus_one)
-    node.keywords.append(new_arg)
-    logs.append((node.lineno, node.col_offset,
-                 "Added keyword argument batch_dims=-1 to tf.batch_gather."))
+  def _replace_uniform_noise_node(parent, old_value):
+    """Replaces old_value with 'uniform' or 'guassian'."""
+    uniform = ast.Str(s="uniform")
+    gaussian = ast.Str(s="gaussian")
+    new_value = ast.IfExp(body=uniform, test=old_value, orelse=gaussian)
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    # Put parentheses around noise.value.test (and remove the old prefix/
+    # suffix, they should only be around new_value.test), so that:
+    # "uniform" if (a if b else c) else "gaussian" is valid.
+    pasta.base.formatting.set(new_value.test, "prefix", "(")
+    pasta.base.formatting.set(new_value.test, "suffix", ")")
+
+  # Check if we have a uniform_noise keyword arg
+  for uniform_noise in node.keywords:
+    if uniform_noise.arg == "uniform_noise":
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "Changing uniform_noise arg of tf.image.extract_glimpse "
+                   "to noise, and recomputing value. Please check this "
+                   "transformation.\n"))
+      uniform_noise.arg = "noise"
+      value = "uniform" if uniform_noise.value else "gaussian"
+      _replace_uniform_noise_node(uniform_noise, uniform_noise.value)
+      return node
+
+  # Since `noise`/`uniform_noise` is optional arg, nothing needs to be
+  # done if len(node.args) < 5.
+  if len(node.args) >= 5:
+    _replace_uniform_noise_node(node, node.args[5])
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changing uniform_noise arg of tf.image.extract_glimpse to "
+                 "noise, and recomputing value.\n"))
     return node
 
-  @staticmethod
-  def _image_resize_transformer(parent, node, full_name, name, logs, errors):
-    """Transforms image.resize_* to image.resize(..., method=*, ...)."""
-
-    resize_method = name[7:].upper()
-    new_arg = ast.keyword(arg="method",
-                          value=ast.Attribute(
-                              value=ast.Attribute(
-                                  value=ast.Attribute(
-                                      value=ast.Name(id="tf", ctx=ast.Load()),
-                                      attr="image", ctx=ast.Load()),
-                                  attr="ResizeMethod", ctx=ast.Load()),
-                              attr=resize_method, ctx=ast.Load()))
-
-    # Ensures a valid transformation when a positional name arg is given
-    if len(node.args) == 4:
-      pos_arg = ast.keyword(arg="preserve_aspect_ratio",
-                            value=node.args[-1])
-      node.args = node.args[:-1]
-      node.keywords.append(pos_arg)
-    if len(node.args) == 3:
-      pos_arg = ast.keyword(arg="align_corners",
-                            value=node.args[-1])
-      node.args = node.args[:-1]
-      node.keywords.append(pos_arg)
-
-    # Python3 ast requires the args for the Attribute, but codegen will mess up
-    # the arg order if we just set them to 0.
-    new_arg.value.lineno = node.lineno
-    new_arg.value.col_offset = node.col_offset+100
-
-    node.keywords.append(new_arg)
-    if isinstance(node.func, ast.Attribute):
-      node.func.attr = "resize"
-    else:
-      assert isinstance(node.func, ast.Name)
-      node.func.id = "resize"
 
-    logs.append((node.lineno, node.col_offset,
-                 "Changed %s call to tf.image.resize(..., "
-                 "method=tf.image.ResizeMethod.%s)." % (full_name,
-                                                        resize_method)))
+def _add_summary_step_transformer(parent, node, full_name, name, logs):
+  """Adds a step argument to the summary API call if not specified.
+
+  The inserted argument value is tf.compat.v1.train.get_or_create_global_step().
+  """
+  for keyword_arg in node.keywords:
+    if keyword_arg.arg == "step":
+      return node
+  default_value = "tf.compat.v1.train.get_or_create_global_step()"
+  # Parse with pasta instead of ast to avoid emitting a spurious trailing \n.
+  ast_value = pasta.parse(default_value)
+  node.keywords.append(ast.keyword(arg="step", value=ast_value))
+  logs.append((
+      ast_edits.WARNING, node.lineno, node.col_offset,
+      "Summary API writing function %s now requires a 'step' argument; "
+      "inserting default of %s." % (full_name or name, default_value)))
+  return node
+
+
+def _add_loss_reduction_transformer(parent, node, full_name, name, logs):
+  """Adds a loss_reduction argument if not specified.
+
+  Default value for tf.estimator.*Classifier and tf.estimator.*Regressor
+  loss_reduction argument changed to SUM_OVER_BATCH_SIZE. So, we update
+  existing calls to use the old default value `tf.losses.Reduction.SUM`.
+
+  Note: to apply this transformation, symbol must be added
+  to reordered_function_names above.
+  """
+  for keyword_arg in node.keywords:
+    if keyword_arg.arg == "loss_reduction":
+      return node
+  # TODO(annarev): this should be updated to tf.keras.losses.Reduction.SUM
+  # once b/125525822 is fixed.
+  default_value = "tf.compat.v1.losses.Reduction.SUM"
+  # Parse with pasta instead of ast to avoid emitting a spurious trailing \n.
+  ast_value = pasta.parse(default_value)
+  node.keywords.append(ast.keyword(arg="loss_reduction", value=ast_value))
+  logs.append((
+      ast_edits.INFO, node.lineno, node.col_offset,
+      "%s: Default value of loss_reduction has been changed to "
+      "SUM_OVER_BATCH_SIZE; inserting old default value %s.\n"
+      % (full_name or name, default_value)))
+  return node
+
+
+def _rename_if_arg_found_and_add_loss_reduction_transformer(
+    parent,
+    node,
+    full_name,
+    name,
+    logs,
+    arg_name=None,
+    arg_ok_predicate=None,
+    remove_if_ok=False,
+    message=None):
+  """Combination of _rename_if_arg_found and _add_loss_reduction transformers.
+
+  Args:
+    parent: Parent of node.
+    node: ast.Call node to maybe modify.
+    full_name: full name of function to modify
+    name: name of function to modify
+    logs: list of logs to append to
+    arg_name: name of the argument to look for
+    arg_ok_predicate: predicate callable with the ast of the argument value,
+      returns whether the argument value is allowed.
+    remove_if_ok: remove the argument if present and ok as determined by
+      arg_ok_predicate.
+    message: message to print if a non-ok arg is found (and hence, the function
+      is renamed to its compat.v1 version).
+
+  Returns:
+    node, if it was modified, else None.
+  """
+
+  add_loss_node = _add_loss_reduction_transformer(parent, node, full_name, name,
+                                                  logs)
+  rename_node = _rename_if_arg_found_transformer(
+      parent, add_loss_node, full_name, name, logs, arg_name, arg_ok_predicate,
+      remove_if_ok, message)
+
+  return rename_node
+
+
+def _add_uniform_scaling_initializer_transformer(
+    parent, node, full_name, name, logs):
+  """Updates references to uniform_unit_scaling_initializer.
+
+  Transforms:
+  tf.uniform_unit_scaling_initializer(factor, seed, dtype) to
+  tf.compat.v1.keras.initializers.VarianceScaling(
+      scale=factor, distribution="uniform", seed=seed)
+
+  Note: to apply this transformation, symbol must be added
+  to reordered_function_names above.
+  """
+  for keyword_arg in node.keywords:
+    if keyword_arg.arg == "factor":
+      keyword_arg.arg = "scale"
+
+  distribution_value = "\"uniform\""
+  # Parse with pasta instead of ast to avoid emitting a spurious trailing \n.
+  ast_value = pasta.parse(distribution_value)
+  node.keywords.append(ast.keyword(arg="distribution", value=ast_value))
+
+  lineno = node.func.value.lineno
+  col_offset = node.func.value.col_offset
+  node.func.value = ast_edits.full_name_node("tf.compat.v1.keras.initializers")
+  node.func.value.lineno = lineno
+  node.func.value.col_offset = col_offset
+  node.func.attr = "VarianceScaling"
+  return node
+
+
+def _name_scope_transformer(parent, node, full_name, name, logs):
+  """Fix name scope invocation to use 'default_name' and omit 'values' args."""
+
+  name_found, name = ast_edits.get_arg_value(node, "name", 0)
+  default_found, default_name = ast_edits.get_arg_value(node, "default_name", 1)
+
+  # If an actual name was given...
+  if name_found and pasta.dump(name) != "None":
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "`name` passed to `name_scope`. Because you may be re-entering"
+                 " an existing scope, it is not safe to convert automatically, "
+                 " the v2 name_scope does not support re-entering scopes by"
+                 " name.\n"))
+    # Rename to compat.v1
+    new_name = "tf.compat.v1.name_scope"
+    logs.append((ast_edits.INFO, node.func.lineno, node.func.col_offset,
+                 "Renamed %r to %r" % (full_name, new_name)))
+    new_name_node = ast_edits.full_name_node(new_name, node.func.ctx)
+    ast.copy_location(new_name_node, node.func)
+    pasta.ast_utils.replace_child(node, node.func, new_name_node)
     return node
+
+  if default_found:
+    # New name scope doesn't have name, but it has a default name. We use
+    # name=default_name, and values can be dropped (it's only for
+    # error reporting and useless outside of graph mode).
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Using default_name as name in call to name_scope.\n"))
+    # Remove all args other than name
+    node.args = []
+    node.keywords = [ast.keyword(arg="name", value=default_name)]
+    return node
+
+  logs.append((ast_edits.ERROR, node.lineno, node.col_offset,
+               "name_scope call with neither name nor default_name cannot be "
+               "converted properly."))
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
index 870bc6f2163f91eb4fd1e3c71a99bed022bf472f..36e30f559e34871b263a7ffae2fe85866586c707 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Upgrader for Python scripts from 1.* TensorFlow to 2.0 TensorFlow."""
+"""Upgrader for Python scripts from 1.x TensorFlow to 2.0 TensorFlow."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,15 +22,33 @@ import argparse
 
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade_v2
+from tensorflow.tools.compatibility import ipynb
+
+
+def process_file(in_filename, out_filename, upgrader):
+  """Process a file of type `.py` or `.ipynb`."""
+
+  if in_filename.endswith(".py"):
+    files_processed, report_text, errors = \
+      upgrader.process_file(in_filename, out_filename)
+  elif in_filename.endswith(".ipynb"):
+    files_processed, report_text, errors = \
+      ipynb.process_file(in_filename, out_filename, upgrader)
+  else:
+    raise NotImplementedError(
+        "Currently converter only supports python or ipynb")
+
+  return files_processed, report_text, errors
 
 
 def main():
   parser = argparse.ArgumentParser(
       formatter_class=argparse.RawDescriptionHelpFormatter,
-      description="""Convert a TensorFlow Python file to 2.0
+      description="""Convert a TensorFlow Python file from 1.x to 2.0
 
 Simple usage:
   tf_upgrade_v2.py --infile foo.py --outfile bar.py
+  tf_upgrade_v2.py --infile foo.ipynb --outfile bar.ipynb
   tf_upgrade_v2.py --intree ~/code/old --outtree ~/code/new
 """)
   parser.add_argument(
@@ -62,11 +80,10 @@ Simple usage:
   parser.add_argument(
       "--inplace",
       dest="in_place",
-      help=("If converting a whole tree of files, whether to "
+      help=("If converting a set of files, whether to "
             "allow the conversion to be performed on the "
-            "files in the input tree."),
-      type=bool,
-      default=False)
+            "input files."),
+      action="store_true")
   parser.add_argument(
       "--reportfile",
       dest="report_filename",
@@ -81,32 +98,56 @@ Simple usage:
   report_filename = args.report_filename
   files_processed = 0
   if args.input_file:
-    if not args.output_file:
+    if not args.in_place and not args.output_file:
       raise ValueError(
           "--outfile=<output file> argument is required when converting a "
           "single file.")
-    files_processed, report_text, errors = upgrade.process_file(
-        args.input_file, args.output_file)
+    if args.in_place and args.output_file:
+      raise ValueError(
+          "--outfile argument is invalid when when converting in place")
+    output_file = args.input_file if args.in_place else args.output_file
+    files_processed, report_text, errors = process_file(
+        args.input_file, output_file, upgrade)
+    errors = {args.input_file: errors}
     files_processed = 1
   elif args.input_tree:
-    if not args.output_tree:
+    if not args.in_place and not args.output_tree:
       raise ValueError(
           "--outtree=<output directory> argument is required when converting a "
           "file tree.")
+    if args.in_place and args.output_tree:
+      raise ValueError(
+          "--outtree argument is invalid when when converting in place")
+    output_tree = args.input_tree if args.in_place else args.output_tree
     files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files, args.in_place)
+        args.input_tree, output_tree, args.copy_other_files)
   else:
     parser.print_help()
   if report_text:
-    open(report_filename, "w").write(report_text)
-    print("TensorFlow 2.0 Upgrade Script")
-    print("-----------------------------")
-    print("Converted %d files\n" % files_processed)
-    print("Detected %d errors that require attention" % len(errors))
-    print("-" * 80)
-    print("\n".join(errors))
-    print("\nMake sure to read the detailed log %r\n" % report_filename)
+    num_errors = 0
+    report = []
+    for f in errors:
+      if errors[f]:
+        num_errors += len(errors[f])
+        report.append("-" * 80 + "\n")
+        report.append("File: %s\n" % f)
+        report.append("-" * 80 + "\n")
+        report.append("\n".join(errors[f]) + "\n")
+
+    report = ("TensorFlow 2.0 Upgrade Script\n"
+              "-----------------------------\n"
+              "Converted %d files\n" % files_processed +
+              "Detected %d issues that require attention" % num_errors + "\n" +
+              "-" * 80 + "\n") + "".join(report)
+    with open(report_filename, "w") as report_file:
+      report_file.write(report)
+      report_file.write("=" * 80 + "\n")
+      report_file.write("Detailed log follows:\n\n")
+      report_file.write("=" * 80 + "\n")
+      report_file.write(report_text)
 
+    print(report)
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
 
 if __name__ == "__main__":
   main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 23446e30b12d2ca8d9611f6b0b8adfb697bec02c..52497ca6dac08e24d9f164019151e680e05487c1 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -86,18 +86,31 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   @classmethod
   def setUpClass(cls):
     cls.v2_symbols = {}
-    if not hasattr(tf.compat, "v2"):
-      return
+    cls.v1_symbols = {}
+    if hasattr(tf.compat, "v2"):
 
-    def symbol_collector(unused_path, unused_parent, children):
-      for child in children:
-        _, attr = tf_decorator.unwrap(child[1])
-        api_names_v2 = tf_export.get_v2_names(attr)
-        for name in api_names_v2:
-          cls.v2_symbols["tf." + name] = attr
+      def symbol_collector(unused_path, unused_parent, children):
+        for child in children:
+          _, attr = tf_decorator.unwrap(child[1])
+          api_names_v2 = tf_export.get_v2_names(attr)
+          for name in api_names_v2:
+            cls.v2_symbols["tf." + name] = attr
+
+      visitor = public_api.PublicAPIVisitor(symbol_collector)
+      visitor.private_map["tf.compat"] = ["v1"]
+      traverse.traverse(tf.compat.v2, visitor)
 
-    visitor = public_api.PublicAPIVisitor(symbol_collector)
-    traverse.traverse(tf.compat.v2, visitor)
+    if hasattr(tf.compat, "v1"):
+
+      def symbol_collector_v1(unused_path, unused_parent, children):
+        for child in children:
+          _, attr = tf_decorator.unwrap(child[1])
+          api_names_v1 = tf_export.get_v1_names(attr)
+          for name in api_names_v1:
+            cls.v1_symbols["tf." + name] = attr
+
+      visitor = public_api.PublicAPIVisitor(symbol_collector_v1)
+      traverse.traverse(tf.compat.v1, visitor)
 
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
@@ -145,7 +158,11 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           _, _, _, text = self._upgrade("tf." + name)
           if (text and
               not text.startswith("tf.compat.v1") and
-              text not in self.v2_symbols):
+              not text.startswith("tf.compat.v2") and
+              text not in self.v2_symbols and
+              # Builds currently install old version of estimator that doesn't
+              # have some 2.0 symbols.
+              not text.startswith("tf.estimator")):
             self.assertFalse(
                 True, "Symbol %s generated from %s not in v2 API" % (
                     text, name))
@@ -159,11 +176,6 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     collect = True
     v1_symbols = set([])
 
-    # Symbols which may be generated by the conversion script which do not exist
-    # in TF 1.x. This should be a very short list of symbols which are
-    # experimental in 1.x but stable for 2.x.
-    whitelisted_v2_only_symbols = set(["tf.saved_model.save"])
-
     # Converts all symbols in the v1 namespace to the v2 namespace, raising
     # an error if the target of the conversion is not in the v1 namespace.
     def conversion_visitor(unused_path, unused_parent, children):
@@ -177,9 +189,9 @@ class TestUpgrade(test_util.TensorFlowTestCase):
             _, _, _, text = self._upgrade("tf." + name)
             if (text and
                 not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.compat.v2") and
                 not text.startswith("tf.estimator") and
-                text not in v1_symbols and
-                text not in whitelisted_v2_only_symbols):
+                text not in v1_symbols):
               self.assertFalse(
                   True, "Symbol %s generated from %s not in v1 API" % (
                       text, name))
@@ -278,6 +290,10 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
                   (new_function_name, text_input, text))
             continue
+          if new_function_name.startswith("tf.compat.v2"):
+            self.assertIn(new_function_name.replace("tf.compat.v2.", "tf."),
+                          self.v2_symbols)
+            continue
           # 3. Verify V2 function and arguments.
           args_v2 = get_args(self.v2_symbols[new_function_name])
           args_v2.extend(v2_arg_exceptions)
@@ -287,17 +303,49 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                 "Invalid argument '%s' in 2.0 when converting\n%s\nto\n%s.\n"
                 "Supported arguments: %s" % (
                     new_arg, text_input, text, str(args_v2)))
+          # 4. Verify that the argument exists in v1 as well.
+          if new_function_name in set(["tf.nn.ctc_loss",
+                                       "tf.saved_model.save"]):
+            continue
+          args_v1 = get_args(self.v1_symbols[new_function_name])
+          args_v1.extend(v2_arg_exceptions)
+          for new_arg in new_args:
+            self.assertIn(
+                new_arg, args_v1,
+                "Invalid argument '%s' in 1.0 when converting\n%s\nto\n%s.\n"
+                "Supported arguments: %s" % (
+                    new_arg, text_input, text, str(args_v1)))
 
     visitor = public_api.PublicAPIVisitor(conversion_visitor)
     visitor.do_not_descend_map["tf"].append("contrib")
     visitor.private_map["tf.compat"] = ["v1", "v2"]
     traverse.traverse(tf.compat.v1, visitor)
 
+  def testPositionsMatchArgGiven(self):
+    full_dict = tf_upgrade_v2.TFAPIChangeSpec().function_arg_warnings
+    method_names = full_dict.keys()
+    for method_name in method_names:
+      args = full_dict[method_name].keys()
+      # special case for optimizer methods
+      if method_name.startswith("*."):
+        method = method_name.replace("*", "tf.train.Optimizer")
+      else:
+        method = method_name
+      method = get_symbol_for_name(tf, method)
+      arg_spec = tf_inspect.getfullargspec(method)
+      for (arg, pos) in args:
+        # to deal with the self argument on methods on objects
+        if method_name.startswith("*."):
+          pos += 1
+        self.assertEqual(arg_spec[0][pos], arg)
+
   def testReorderFileNeedsUpdate(self):
     reordered_function_names = (
         tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
     function_reorders = (
         tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+    manual_function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().manual_function_reorders)
 
     added_names_message = """Some function names in
 self.reordered_function_names are not in reorders_v2.py.
@@ -317,6 +365,8 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     # function_reorders should contain reordered_function_names
     # and their TensorFlow V1 aliases.
     for name in function_reorders:
+      if name in manual_function_reorders:
+        continue
       # get other names for this function
       attr = get_symbol_for_name(tf.compat.v1, name)
       _, attr = tf_decorator.unwrap(attr)
@@ -358,19 +408,87 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
                   "tf.train.inverse_time_decay", "tf.train.cosine_decay",
                   "tf.train.cosine_decay_restarts",
                   "tf.train.linear_cosine_decay",
-                  "tf.train.noisy_linear_cosine_decay"]:
+                  "tf.train.noisy_linear_cosine_decay",
+                  "tf.train.piecewise_constant_decay",
+                 ]:
 
       text = "%s(a, b)\n" % decay
-      _, report, errors, _ = self._upgrade(text)
-      self.assertIn("%s requires manual check" % decay, errors[0])
-      self.assertIn("%s has been changed" % decay, report)
-
-  def testPiecewiseDecay(self):
-    text = "tf.train.piecewise_constant_decay(a, b)\n"
-    _, report, errors, _ = self._upgrade(text)
-    self.assertIn("tf.train.piecewise_constant_decay requires manual check",
-                  errors[0])
-    self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
+      _, report, unused_errors, _ = self._upgrade(text)
+      self.assertIn("switch to the schedules in "
+                    "`tf.keras.optimizers.schedules`", report)
+
+  def verify_compat_v1_rename_correctness(self, values, ns_prefix=""):
+    if ns_prefix:
+      ns_prefix += "."
+    for v in values:
+      text = "tf." + ns_prefix + v + "(a, b)"
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1." + ns_prefix + v + "(a, b)", new_text)
+
+  def testIntializers(self):
+    initializers = [
+        "zeros",
+        "ones",
+        "constant",
+        "random_uniform",
+        "random_normal",
+        "truncated_normal",
+        "variance_scaling",
+        "orthogonal",
+        "glorot_uniform",
+        "glorot_normal",
+        "identity",
+        "lecun_normal",
+        "lecun_uniform",
+        "he_normal",
+        "he_uniform",
+    ]
+    self.verify_compat_v1_rename_correctness(
+        initializers, ns_prefix="initializers")
+
+    initializers = [
+        "zeros_initializer",
+        "ones_initializer",
+        "constant_initializer",
+        "random_uniform_initializer",
+        "random_normal_initializer",
+        "truncated_normal_initializer",
+        "variance_scaling_initializer",
+        "orthogonal_initializer",
+        "glorot_uniform_initializer",
+        "glorot_normal_initializer",
+    ]
+    self.verify_compat_v1_rename_correctness(initializers)
+
+    initializers = [
+        "zeros",
+        "ones",
+        "Ones",
+        "Zeros",
+        "constant",
+        "Constant",
+        "VarianceScaling",
+        "Orthogonal",
+        "orthogonal",
+        "Identity",
+        "identity",
+        "glorot_uniform",
+        "glorot_normal",
+        "lecun_normal",
+        "lecun_uniform",
+        "he_normal",
+        "he_uniform",
+        "TruncatedNormal",
+        "truncated_normal",
+        "RandomUniform",
+        "uniform",
+        "random_uniform",
+        "RandomNormal",
+        "normal",
+        "random_normal",
+    ]
+    self.verify_compat_v1_rename_correctness(
+        initializers, ns_prefix="keras.initializers")
 
   def testMetrics(self):
     metrics = [
@@ -409,16 +527,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         "true_positives_at_thresholds",
     ]
     for m in metrics:
-      ns = "tf.metrics." + m
-      text = ns + "(a, b)"
-      _, report, errors, new_text = self._upgrade(text)
+      text = "tf.metrics." + m + "(a, b)"
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual("tf.compat.v1.metrics." + m + "(a, b)", new_text)
-      self.assertIn("test.py:1:0: %s requires manual check" % ns, errors[0])
       self.assertIn(
-          "WARNING: tf.metrics have been converted to object oriented"
-          " versions in TF 2.0 and after. The metric function calls have been "
-          "converted to compat.v1 for backward compatibility. Please update "
-          "these calls to the TF 2.0 versions.", report)
+          "tf.metrics have been replaced with object oriented versions", report)
 
   def testLosses(self):
     losses = [
@@ -440,16 +553,11 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         "sparse_softmax_cross_entropy",
     ]
     for l in losses:
-      ns = "tf.losses." + l
-      text = ns + "(a, b)"
-      _, report, errors, new_text = self._upgrade(text)
+      text = "tf.losses." + l + "(a, b)"
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual("tf.compat.v1.losses." + l + "(a, b)", new_text)
-      self.assertIn("test.py:1:0: %s requires manual check" % ns, errors[0])
       self.assertIn(
-          "WARNING: tf.losses have been converted to object oriented"
-          " versions in TF 2.0 and after. The loss function calls have been "
-          "converted to compat.v1 for backward compatibility. Please update "
-          "these calls to the TF 2.0 versions.", report)
+          "tf.losses have been replaced with object oriented versions", report)
 
   def testEstimatorLossReductionChange(self):
     classes = [
@@ -459,11 +567,98 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     ]
     for c in classes:
       ns = "tf.estimator." + c
-      text = ns + "(a, b)"
+      text = ns + "()"
+      expected_text = ns + "(loss_reduction=tf.compat.v1.losses.Reduction.SUM)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+      text = ns + "(loss_reduction=TEST)"
+      expected_text = ns + "(loss_reduction=TEST)"
       _, report, errors, new_text = self._upgrade(text)
       self.assertEqual(text, new_text)
-      self.assertIn("%s requires manual check" % ns, errors[0])
-      self.assertIn("loss_reduction has been changed", report)
+    text = "tf.estimator.BaselineClassifier(m, c, w, v, o, c, lr)"
+    expected_text = (
+        "tf.estimator.BaselineClassifier(" +
+        "model_dir=m, n_classes=c, weight_column=w, label_vocabulary=v, "
+        "optimizer=o, config=c, loss_reduction=lr)")
+    _, report, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.estimator.BaselineClassifier(model_dir=model_dir)"
+    expected_text = ("tf.estimator.BaselineClassifier(" +
+                     "model_dir=model_dir, "
+                     "loss_reduction=tf.compat.v1.losses.Reduction.SUM)")
+    _, report, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def testBaseEstimatorPartitioner(self):
+    classes = ["LinearEstimator", "DNNLinearCombinedEstimator", "DNNEstimator"]
+    for c in classes:
+      ns = "tf.estimator." + c
+      suffix = "(input_layer_partitioner=TEST)"
+      text = ns + suffix
+      expected_text = "tf.compat.v1.estimator." + c + suffix
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(new_text, expected_text)
+
+  def testCannedEstimatorPartitioner(self):
+    classes = [
+        "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
+        "DNNLinearCombinedRegressor", "DNNRegressor", "DNNClassifier"
+    ]
+
+    for c in classes:
+      ns = "tf.estimator." + c
+      suffix = "(input_layer_partitioner=TEST)"
+      text = ns + suffix
+      suffix = ("(input_layer_partitioner=TEST, "
+                "loss_reduction=tf.compat.v1.losses.Reduction.SUM)")
+      expected_text = "tf.compat.v1.estimator." + c + suffix
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(new_text, expected_text)
+
+  def testExtractGlimpse(self):
+    text = ("tf.image.extract_glimpse(x, size, off, False, "
+            "False, False, name=\"foo\")\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.image.extract_glimpse(x, size, off, False, "
+        "False, 'uniform' if (False) else 'gaussian', name=\"foo\")\n",
+    )
+
+    text = ("tf.image.extract_glimpse(x, size, off, centered=False, "
+            "normalized=False, uniform_noise=True if uniform_noise else "
+            "False, name=\"foo\")\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.image.extract_glimpse(x, size, off, centered=False, "
+        "normalized=False, noise='uniform' if (True if uniform_noise else "
+        "False) else 'gaussian', name=\"foo\")\n",
+    )
+
+    text = ("tf.image.extract_glimpse(x,\n"
+            "                         size,\n"
+            "                         off,\n"
+            "                         centered=True,\n"
+            "                         normalized=True, # Stuff before\n"
+            "                         uniform_noise=False,\n"
+            "                         name=\"foo\")# Stuff after\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text, "tf.image.extract_glimpse(x,\n"
+        "                         size,\n"
+        "                         off,\n"
+        "                         centered=True,\n"
+        "                         normalized=True, # Stuff before\n"
+        "                         noise='uniform' if (False) else 'gaussian',\n"
+        "                         name=\"foo\")# Stuff after\n")
+
+    text = "tf.image.extract_glimpse(x)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)
+    self.assertEqual(errors, [])
 
   def testDropout(self):
     text = "tf.nn.dropout(x, keep_prob, name=\"foo\")\n"
@@ -581,16 +776,21 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(errors, [])
 
   def testColocateGradientsWithOps(self):
-    text = "tf.gradients(a, foo=False)\n"
+    text = "tf.gradients(yx=a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
     self.assertEqual(errors, [])
 
-    text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
+    text = "tf.gradients(yx=a, colocate_gradients_with_ops=False)\n"
+    _, report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual("tf.gradients(yx=a)\n", new_text)
+    self.assertIn("tf.gradients no longer takes", report)
+
+    text = "tf.gradients(y, x, grad_ys, name, colocate, gate)\n"
+    expected = ("tf.gradients(ys=y, xs=x, grad_ys=grad_ys, name=name, "
+                "gate_gradients=gate)\n")
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual("tf.gradients(a)\n", new_text)
-    self.assertIn("tf.gradients", errors[0])
-    self.assertIn("requires manual check", errors[0])
+    self.assertEqual(expected, new_text)
 
   def testColocateGradientsWithOpsMinimize(self):
     text = "optimizer.minimize(a, foo=False)\n"
@@ -599,10 +799,9 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(errors, [])
 
     text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
-    _, unused_report, errors, new_text = self._upgrade(text)
+    _, report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual("optimizer.minimize(a)\n", new_text)
-    self.assertIn("requires manual check", errors[0])
-    self.assertIn("minimize", errors[0])
+    self.assertIn("Optimizer.minimize no longer takes", report)
 
   def testColocateGradientsWithOpsComputeGradients(self):
     text = "optimizer.compute_gradients(a, foo=False)\n"
@@ -611,10 +810,9 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(errors, [])
 
     text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
-    _, unused_report, errors, new_text = self._upgrade(text)
+    _, report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual("optimizer.compute_gradients(a)\n", new_text)
-    self.assertIn("requires manual check", errors[0])
-    self.assertIn("compute_gradients", errors[0])
+    self.assertIn("Optimizer.compute_gradients no longer takes", report)
 
   def testExportSavedModelRename(self):
     text = "self.est.export_savedmodel(path)"
@@ -694,6 +892,16 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testKerasSavedModel(self):
+    text = (
+        "tf.contrib.saved_model.save_keras_model(model, './saved_models')\n"
+        "tf.contrib.saved_model.load_keras_model(saved_model_path)\n")
+    expected_text = (
+        "tf.keras.experimental.export_saved_model(model, './saved_models')\n"
+        "tf.keras.experimental.load_from_saved_model(saved_model_path)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testStatelessMultinomial(self):
     text = (
         "tf.random.stateless_multinomial(logits, num_samples, seed, "
@@ -732,6 +940,36 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(expected_text, new_text)
 
+  def testSoftMaxCrossEntropyWithLogitsDoesntNest(self):
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(labels), logits=logits, dim=2)")
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(foo(bar)))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo().zz())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo().zz()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
   def testSparseMatmul(self):
     text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
     expected_text = ("tf.linalg.matmul(a=a, b=b, transpose_a=c, transpose_b=d, "
@@ -770,6 +1008,45 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testConv2D(self):
+    text = (
+        "tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu, "
+        "data_format)")
+    expected_text = (
+        "tf.nn.conv2d(input=input, filters=filter, strides=strides, "
+        "padding=padding, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.nn.conv2d(input, filter=filter, strides=strides, padding=padding, "
+        "use_cudnn_on_gpu=use_cudnn_on_gpu)")
+    expected_text = ("tf.nn.conv2d(input=input, filters=filter, "
+                     "strides=strides, padding=padding)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testConv2DBackpropFilter(self):
+    text = (
+        "tf.nn.conv2d_backprop_filter(input, filter_sizes, out_backprop, "
+        "strides, padding, use_cudnn_on_gpu, data_format)")
+    expected_text = (
+        "tf.compat.v1.nn.conv2d_backprop_filter(input, filter_sizes, "
+        "out_backprop, strides, padding, use_cudnn_on_gpu, data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testConv2DBackpropInput(self):
+    text = (
+        "tf.nn.conv2d_backprop_input(input_sizes, filter, out_backprop, "
+        "strides, padding, use_cudnn_on_gpu, data_format)")
+    expected_text = (
+        "tf.nn.conv2d_transpose(output_shape=input_sizes, filters=filter, "
+        "input=out_backprop, strides=strides, padding=padding, "
+        "data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testSpacetoBatch(self):
     text = "tf.space_to_batch_nd(input, shape, paddings, name)"
     expected_text = "tf.space_to_batch(input, shape, paddings, name)"
@@ -803,7 +1080,7 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
             "validate_indices, max_norm)")
     expected_text = ("tf.nn.embedding_lookup(params=params, ids=ids, "
                      "partition_strategy=partition_strategy, name=name, "
-                     "validate_indices=validate_indices, max_norm=max_norm)")
+                     "max_norm=max_norm)")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
@@ -868,18 +1145,55 @@ tf.print('abc')
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
-  def testBatchGather(self):
-    text = "tf.batch_gather(foo, bar)"
-    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
-    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
-    _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertIn(new_text, [expected_text1, expected_text2])
-
-    text = "tf.batch_gather(params=foo, indices=bar)"
-    expected_text1 = "tf.gather(params=foo, indices=bar, batch_dims=-1)"
-    expected_text2 = "tf.gather(batch_dims=-1, params=foo, indices=bar)"
-    _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertIn(new_text, [expected_text1, expected_text2])
+  def testIterators(self):
+    for (text, expected) in [
+        ("(expr + yielding(data)).make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator((expr + yielding(data)))"),
+        ("dataset.make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("dataset.make_one_shot_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("dataset.make_one_shot_iterator(x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("dataset.make_initializable_iterator()",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("ds.make_initializable_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("dataset.make_initializable_iterator(x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)")]:
+      _, unused_report, unused_errors, actual = self._upgrade(text)
+      self.assertEqual(actual, expected)
+
+  def testMapAndBatch(self):
+    suffix = ".data.experimental.map_and_batch_with_legacy_function(args)"
+    text = "tf" + suffix
+    expected = "tf.compat.v1" + suffix
+    _, unused_report, unused_errors, actual = self._upgrade(text)
+    self.assertEqual(actual, expected)
 
   def testCast(self):
     for (name, dtype) in [("int32", "int32"),
@@ -918,9 +1232,10 @@ tf.print('abc')
   def testImageResizeExtraPositionalArgs(self):
     for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
       text = "tf.image.resize_%s(i, s, a, p)" % method
-      expected_text = ["tf.image.resize(i, s, ", "align_corners=a, ",
-                       "preserve_aspect_ratio=p, ",
-                       "method=tf.image.ResizeMethod.%s)" % method.upper()]
+      expected_text = [
+          "tf.image.resize(i, s, ", "preserve_aspect_ratio=p, ",
+          "method=tf.image.ResizeMethod.%s)" % method.upper()
+      ]
       _, unused_report, unused_errors, new_text = self._upgrade(text)
       for s in expected_text:
         self.assertIn(s, new_text)
@@ -958,29 +1273,252 @@ def _log_prob(self, x):
                  "assert_scalar"]:
       text = "tf.%s(a)" % name
       expected_text = "tf.compat.v1.%s(a)" % name
-      _, unused_report, errors, new_text = self._upgrade(text)
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual(expected_text, new_text)
-      self.assertIn("assert_* functions", errors[0])
+      self.assertIn("%s has been" % name, report)
 
       text = "tf.debugging.%s(a)" % name
       expected_text = "tf.compat.v1.debugging.%s(a)" % name
-      _, unused_report, errors, new_text = self._upgrade(text)
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual(expected_text, new_text)
-      self.assertIn("assert_* functions", errors[0])
+      self.assertIn("%s has been" % name, report)
 
   def testAssertRankStatements(self):
     for name in ["assert_rank", "assert_rank_at_least", "assert_rank_in"]:
       text = "tf.%s(a)" % name
       expected_text = "tf.compat.v1.%s(a)" % name
-      _, unused_report, errors, new_text = self._upgrade(text)
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual(expected_text, new_text)
-      self.assertIn("assert_rank_* functions", errors[0])
+      self.assertIn("%s has been" % name, report)
 
       text = "tf.debugging.%s(a)" % name
       expected_text = "tf.compat.v1.debugging.%s(a)" % name
-      _, unused_report, errors, new_text = self._upgrade(text)
+      _, report, unused_errors, new_text = self._upgrade(text)
       self.assertEqual(expected_text, new_text)
-      self.assertIn("assert_rank_* functions", errors[0])
+      self.assertIn("%s has been" % name, report)
+
+  def test_assert_equal_graph_def(self):
+    text = "tf.test.assert_equal_graph_def(a, b, checkpoint_v2=x)"
+    expected = "tf.test.assert_equal_graph_def(actual=a, expected=b)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_is_tensor_upgrade(self):
+    text = "tf.contrib.framework.is_tensor(x)"
+    expected = "tf.is_tensor(x)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_CriticalSection_upgrade(self):
+    text = "tf.contrib.framework.CriticalSection(shared_name='blah')"
+    expected = "tf.CriticalSection(shared_name='blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_sample_distorted_bounding_box(self):
+    # pylint: disable=line-too-long
+    text = "tf.image.sample_distorted_bounding_box(a, b, c, d, e, f, g, h, i, j)"
+    expected = "tf.image.sample_distorted_bounding_box(image_size=a, bounding_boxes=b, seed=c, min_object_covered=e, aspect_ratio_range=f, area_range=g, max_attempts=h, use_image_if_no_bounding_boxes=i, name=j)"
+    # pylint: enable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_contrib_initialize(self):
+    text = "tf.contrib.summary.initialize"
+    expected = "tf.compat.v1.summary.initialize"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_contrib_framework_argsort(self):
+    text = "tf.contrib.framework.argsort"
+    expected = "tf.argsort"
+    # pylint: enable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_contrib_rnn_cell(self):
+    text = "tf.contrib.rnn.RNNCell"
+    expected = "tf.compat.v1.nn.rnn_cell.RNNCell"
+    # pylint: enable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_flags_bare(self):
+    _, _, errors, _ = self._upgrade("tf.flags")
+    self.assertIn("tf.flags has been removed", errors[0])
+
+  def test_flags_flags(self):
+    _, _, errors, _ = self._upgrade("tf.flags.FLAGS")
+    self.assertIn("tf.flags has been removed", errors[0])
+
+  def test_max_pool_2d(self):
+    text = "tf.nn.max_pool(value=4)"
+    expected_text = "tf.nn.max_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_contrib_estimator_early_stopping(self):
+    api_symbols = [
+        "make_early_stopping_hook", "stop_if_higher_hook", "stop_if_lower_hook",
+        "stop_if_no_decrease_hook", "stop_if_no_increase_hook"
+    ]
+    for symbol in api_symbols:
+      text = "tf.contrib.estimator." + symbol
+      expected_text = "tf.estimator.experimental." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_rnn(self):
+    api_symbols = ["BasicLSTMCell", "BasicRNNCell", "GRUCell", "LSTMCell",
+                   "MultiRNNCell"]
+    for symbol in api_symbols:
+      text = "tf.contrib.rnn." + symbol
+      expected_text = "tf.compat.v1.nn.rnn_cell." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_summary_audio(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram(self):
+    text = "tf.contrib.summary.histogram('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image(self):
+    text = "tf.contrib.summary.image('foo', myval, red, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'bad_color' argument", errors[0])
+    self.assertIn("'family' argument", errors[1])
+    self.assertIn("Manual check required", errors[2])
+
+  def test_contrib_summary_scalar(self):
+    text = "tf.contrib.summary.scalar('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_audio_nostep(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram_nostep(self):
+    text = "tf.contrib.summary.histogram('foo', myval)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image_nostep(self):
+    text = "tf.contrib.summary.image('foo', myval)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_scalar_nostep(self):
+    text = "tf.contrib.summary.scalar('foo', myval)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_avg_pool_2d(self):
+    text = "tf.nn.avg_pool(value=4)"
+    expected_text = "tf.nn.avg_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_saved_model_load(self):
+    text = "tf.saved_model.load(sess, ['foo_graph'])"
+    expected = "tf.compat.v1.saved_model.load(sess, ['foo_graph'])"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_saved_model_load_v2(self):
+    text = "tf.saved_model.load_v2('/tmp/blah')"
+    expected = "tf.compat.v2.saved_model.load('/tmp/blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_uniform_unit_scaling_initializer(self):
+    text = "tf.uniform_unit_scaling_initializer(0.5)"
+    expected_text = ("tf.compat.v1.keras.initializers.VarianceScaling("
+                     "scale=0.5, distribution=\"uniform\")")
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.initializers.uniform_unit_scaling(0.5)"
+    expected_text = ("tf.compat.v1.keras.initializers.VarianceScaling("
+                     "scale=0.5, distribution=\"uniform\")")
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_name_scope(self):
+    text = "tf.name_scope(None, default_name, [some, values])"
+    expected_text = "tf.name_scope(name=default_name)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.name_scope(default_name=default_name, values=stuff)"
+    expected_text = "tf.name_scope(name=default_name)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.name_scope(name=n, default_name=d, values=s)"
+    expected_text = "tf.compat.v1.name_scope(name=n, default_name=d, values=s)"
+    _, report, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+    self.assertIn("`name` passed to `name_scope`", report)
+
+    text = "tf.name_scope(name=None, values=stuff)"
+    _, _, errors, _ = self._upgrade(text)
+    self.assertIn("name_scope call with neither name nor default_name",
+                  errors[0])
+
+  def test_string_split(self):
+    text = "tf.string_split('test', delimiter=' ')"
+    expected_text = "tf.strings.split(source='test', sep=' ')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.string_split('test', ' ', True)"
+    expected_text = "tf.compat.v1.string_split(source='test', sep=' ', skip_empty=True)"  # pylint: disable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = "tf.string_split('test', ' ', skip_empty=False)"
+    expected_text = "tf.strings.split(source='test', sep=' ')"  # pylint: disable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
 
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
diff --git a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
index 0eb942d39617c7fe17bc62ff19c98047900d33cf..c4fd8aab90c46cca2b220bfb2d991dbc7c9df2cc 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
@@ -86,7 +86,16 @@ def collect_function_arg_names(function_names):
       matches_function_names = any(
           name in function_names for name in api_names_v1)
       if matches_function_names:
-        arg_list = tf_inspect.getargspec(attr)[0]
+        if tf_inspect.isclass(attr):
+          # Get constructor arguments if attr is a class
+          arg_list = tf_inspect.getargspec(
+              getattr(attr, '__init__'))[0]
+          arg_list = arg_list[1:]  # skip 'self' argument
+        else:
+          # Get function arguments.
+          # getargspec returns a tuple of (args, varargs, keywords, defaults)
+          # we just look at args.
+          arg_list = tf_inspect.getargspec(attr)[0]
         for name in api_names_v1:
           function_to_args[name] = arg_list
 
diff --git a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
index 4bfcc2570cce9c8dac369b7c9cf882356c428df5..fda6b86fc2c203d14567330aff44a95abf15be92 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
+++ b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
@@ -89,7 +89,7 @@ def get_args():
                       help="paths to input def file",
                       required=True)
   parser.add_argument("--output", help="output deffile", required=True)
-  parser.add_argument("--target", help="name of the target", required=True)
+  parser.add_argument("--target", help="name of the target")
   args = parser.parse_args()
   return args
 
@@ -119,7 +119,8 @@ def main():
     taken = set()
 
     # Header for the def file.
-    def_fp.write("LIBRARY " + args.target + "\n")
+    if args.target:
+      def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
     def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
 
diff --git a/tensorflow/tools/dist_test/Dockerfile b/tensorflow/tools/dist_test/Dockerfile
deleted file mode 100644
index 2a7605bbc960f1caccd6163fb5867639c48fa70c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/Dockerfile
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Docker image for testing distributed (GRPC) TensorFlow on Google Container
-# Engine (GKE).
-#
-# See ./remote_test.sh for usage example.
-
-FROM ubuntu:16.04
-
-LABEL maintainer="Shanqing Cai <cais@google.com>"
-
-RUN apt-get update
-RUN apt-get install -y \
-    curl \
-    python \
-    python-pip \
-    && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install Google Cloud SDK
-RUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash
-RUN chmod +x install_google_cloud_sdk.bash
-RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
-
-# Install kubectl
-RUN /var/gcloud/google-cloud-sdk/bin/gcloud components install kubectl
-
-# Install TensorFlow pip whl
-# TODO(cais): Should we build it locally instead?
-COPY tensorflow-*.whl /
-RUN pip install /tensorflow-*.whl
-RUN rm -f /tensorflow-*.whl
-
-# Copy test files
-COPY scripts /var/tf-dist-test/scripts
-COPY python /var/tf-dist-test/python
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
deleted file mode 100644
index 1e29977788176477492a03c4683cc489ec9fae44..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/README.md
+++ /dev/null
@@ -1,159 +0,0 @@
-# Testing Distributed Runtime in TensorFlow
-
-This folder contains tools and test suites for GRPC-based and Allreduce-based
-distributed runtimes in TensorFlow.
-
-There are three general modes of testing:
-
-**1) Launch a docker container and run parameters servers and workers as
- separate processes therein.**
-
-For example:
-
-    ./local_test.sh
-
-By default, local_test.sh runs the MNIST-with-replicas model as a test.
-However, you can use the --model_name flag to run the tf-learn/wide&deep
-cesnsu model:
-
-    ./local_test.sh --model_name CENSUS_WIDENDEEP
-
-You can test specify version of TensorFlow:
-
-```shell
-./local_test.sh ${whl_file_url}
-```
-
-For example, you can find these TensorFlow python package URLs from [here](https://www.tensorflow.org/install/pip) for Ubuntu.
-
-**2) Launch a remote k8s cluster on Google Kubernetes Engine (GKE) and run the
-test suite on it**
-
-For example:
-
-    export TF_DIST_GCLOUD_PROJECT="tensorflow-testing"
-    export TF_DIST_GCLOUD_COMPUTE_ZONE="us-central1-f"
-    export TF_DIST_CONTAINER_CLUSTER="test-cluster-1"
-    export TF_DIST_GCLOUD_KEY_FILE="/var/gcloud-secrets/my-gcloud-key.json"
-    ./remote_test.sh
-
-Here you specify the Google Compute Engine (GCE) project, compute zone and
-container cluster with the first three environment variables, in that order.
-The environment variable "TF_DIST_GCLOUD_KEY_FILE_DIR" is a directory in which
-the JSON service account key file named "tensorflow-testing.json" is located.
-You can use the flag "--setup-cluster-only" to perform only the cluster setup
-step and skip the testing step:
-
-    ./remote_test.sh --setup_cluster_only
-
-**3) Run the test suite on an existing k8s TensorFlow cluster**
-
-For example:
-
-    export TF_DIST_GRPC_SERVER_URL="grpc://11.22.33.44:2222"
-    ./remote_test.sh
-
-The IP address above is a dummy example. Such a cluster may have been set up
-using the command described at the end of the previous section.
-
-
-**Asynchronous and synchronous parameter updates**
-
-There are two modes for the coordination of the parameters from multiple
-workers: asynchronous and synchronous.
-
-In the asynchronous mode, the parameter updates (gradients) from the workers
-are applied to the parameters without any explicit coordination. This is the
-default mode in the tests.
-
-In the synchronous mode, a certain number of parameter updates are aggregated
-from the model replicas before the update is applied to the model parameters.
-To use this mode, do:
-
-    # For remote testing
-    ./remote_test.sh --sync_replicas
-
-    # For local testing
-    ./local_test.sh --sync_replicas
-
-
-**Specifying the number of workers**
-
-You can specify the number of workers by using the --num-workers option flag,
-e.g.,
-
-    # For remote testing
-    ./remote_test.sh --num_workers 4
-
-    # For local testing
-    ./local_test.sh --num_workers 4
-
-
-**Building the GRPC server Docker image**
-
-To build the Docker image for a test server of TensorFlow distributed runtime,
-run:
-
-    ./build_server.sh <docker_image_name>
-
-**Using the GRPC server Docker image**
-To launch a container as a TensorFlow GRPC server, do as the following example:
-
-    docker run tensorflow/tf_grpc_server --cluster_spec="worker|localhost:2222;foo:2222,ps|bar:2222;qux:2222" --job_name=worker --task_id=0
-
-**Generating configuration file for TensorFlow k8s clusters**
-
-The script at "scripts/k8s_tensorflow.py" can be used to generate yaml
-configuration files for a TensorFlow k8s cluster consisting of a number of
-workers and parameter servers. For example:
-
-    scripts/k8s_tensorflow.py \
-        --num_workers 2 \
-        --num_parameter_servers 2 \
-        --grpc_port 2222 \
-        --request_load_balancer true \
-        --docker_image "tensorflow/tf_grpc_server" \
-        > tf-k8s-with-lb.yaml
-
-The yaml configuration file generated in the previous step can be used to a
-create a k8s cluster running the specified numbers of worker and parameter
-servers. For example:
-
-    kubectl create -f tf-k8s-with-lb.yaml
-
-See [Kubernetes kubectl documentation](http://kubernetes.io/docs/user-guide/kubectl-overview/)
-for more details.
-
-**Create allreduce-based Tensorflow k8s deployment**
-
-The allreduce-based Tensorflow, Horovod, is an open source distributed deep
-learning framework for TensorFlow, detailed information can be found in
-https://arxiv.org/pdf/1802.05799.pdf.
-
-The script "scripts_allreduce/k8s_deploy_tensorflow.sh" can be used to create or
-delete an allreduce-based Tensorflow k8s deployment with specified number of
-containers.
-
-Create a deployment containing a number of containers and enable passwordless
-ssh between the containers (optional: enable host network mode with --hostnet
-and --port <container_ssh_port>):
-
-    scripts_allreduce/k8s_deploy_tensorflow.sh \
-        --num_containers <num_of_containers> \
-        --image <docker_image> \
-        --deployment <deployment_name> \
-        --config_map <config_map>
-
-Delete a deployment and config_map in k8s cluster:
-
-    scripts_allreduce/k8s_deploy_tensorflow.sh \
-        --deployment <deployment_name> \
-        --config_map <config_map> \
-        --delete
-
-Upload file or directory to all the containers of a deployment:
-
-    scripts_allreduce/k8s_deploy_tensorflow.sh \
-        --cp --src <path_to_local_directory> \
-        --dest <path_to_directory_on_containers> \
-        --deployment <deployment_name>
diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh
deleted file mode 100755
index 345217d733acec62c599dd6dfeffd4839e5a79bc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/build_server.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Builds the test server for distributed (GRPC) TensorFlow
-#
-# Usage: build_server.sh <docker_image_name> <whl_file_location> [--test]
-#
-# Arguments:
-#   docker_image_name: Name of the docker image to build.
-#     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
-#
-#   whl_file_location: URL from which the TensorFlow whl file will be downloaded.
-#     E.g.: https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
-#     E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
-#
-# The optional flag --test lets the script to use the Dockerfile for the
-# testing GRPC server. Without the flag, the script will build the non-test
-# GRPC server.
-#
-# Note that the Dockerfile is located in ./server/ but the docker build should
-# use the current directory as the context.
-
-
-# Helper functions
-die() {
-  echo $@
-  exit 1
-}
-
-# Check arguments
-if [[ $# -lt 2 ]]; then
-  die "Usage: $0 <docker_image_name> <whl_location> [--test]"
-fi
-
-DOCKER_IMG_NAME=$1
-WHL_FILE_LOCATION=$2
-shift 2
-
-# Current script directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-BUILD_DIR=$(mktemp -d)
-echo ""
-echo "Using whl file URL: ${WHL_FILE_LOCATION}"
-echo "Building in temporary directory: ${BUILD_DIR}"
-
-cp -r ${DIR}/* "${BUILD_DIR}"/ || \
-    die "Failed to copy files to ${BUILD_DIR}"
-
-DOCKER_FILE="${BUILD_DIR}/server/Dockerfile"
-if [[ $1 == "--test" ]]; then
-  DOCKER_FILE="${BUILD_DIR}/server/Dockerfile.test"
-fi
-echo "Using Docker file: ${DOCKER_FILE}"
-
-if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
-    # Download whl file into the build context directory.
-    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
-        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
-else
-    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
-fi
-
-# Download whl file into the build context directory.
-
-if [[ ! -f "${DOCKER_FILE}" ]]; then
-  die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
-fi
-echo "Dockerfile: ${DOCKER_FILE}"
-
-# Call docker build
-docker build --no-cache -t "${DOCKER_IMG_NAME}" \
-   -f "${DOCKER_FILE}" "${BUILD_DIR}" || \
-   die "Failed to build docker image: ${DOCKER_IMG_NAME}"
-
-# Clean up docker build context directory.
-rm -rf "${BUILD_DIR}"
diff --git a/tensorflow/tools/dist_test/local/Dockerfile b/tensorflow/tools/dist_test/local/Dockerfile
deleted file mode 100644
index 383c3c2f4ca426b7e73ec074a452bdb3125c2efb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/local/Dockerfile
+++ /dev/null
@@ -1,20 +0,0 @@
-FROM jpetazzo/dind
-
-LABEL maintainer="Shanqing Cai <cais@google.com>"
-
-RUN apt-get update
-
-RUN apt-get install -y --no-install-recommends \
-    build-essential \
-    git \
-    software-properties-common
-
-# Install the latest golang
-RUN wget https://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz
-RUN tar -C /usr/local -xzf go1.4.2.linux-amd64.tar.gz
-RUN rm -f go1.4.2.linux-amd64.tar.gz
-RUN echo 'PATH=/usr/local/go/bin:${PATH}' >> /root/.bashrc
-
-ADD start_local_k8s_cluster.sh /var/k8s/start_local_k8s_cluster.sh
-ADD ../scripts /var/k8s/dist_test/scripts
-ADD ../python /var/k8s/dist_test/python
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
deleted file mode 100755
index b0114721bd2435dd2d4b8ee667250d3b824f1207..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/local_test.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Tests distributed TensorFlow on a locally running TF GRPC cluster.
-#
-# This script performs the following steps:
-# 1) Build the docker image capable of running distributed TensorFlow in docker.
-# 2) Run a container from the aforementioned image and start docker service
-#    in it
-# 3) Call a script to launch a distributed TensorFlow GRPC cluster inside the container
-#    and run the distributed test suite.
-#
-# Usage: local_test.sh <whl_file_location>
-#                      [--leave_container_running]
-#                      [--model_name <MODEL_NAME>]
-#                      [--num_workers <NUM_WORKERS>]
-#                      [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
-#                      [--sync_replicas]
-#
-# E.g., local_test.sh <whl_file_location> --model_name CENSUS_WIDENDEEP
-#       local_test.sh <whl_file_location> --num_workers 3 --num_parameter_servers 3
-#
-# Arguments:
-# whl_file_location: URL from which the TensorFlow whl file will be acquired.
-#   E.g.: https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
-#   E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
-#
-# --leave_container_running:  Do not stop the docker-in-docker container after
-#                             the termination of the tests, e.g., for debugging
-#
-# --num_workers <NUM_WORKERS>:
-#   Specifies the number of worker pods to start
-#
-# --num_parameter_server <NUM_PARAMETER_SERVERS>:
-#   Specifies the number of parameter servers to start
-#
-# --sync_replicas
-#   Use the synchronized-replica mode. The parameter updates from the replicas
-#   (workers) will be aggregated before applied, which avoids stale parameter
-#   updates.
-#
-#
-# In addition, this script obeys the following environment variables:
-# TF_DIST_DOCKER_NO_CACHE:      do not use cache when building docker images
-
-die() {
-  echo $@
-  exit 1
-}
-
-# Configurations
-DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster"
-
-# Parse input arguments
-LEAVE_CONTAINER_RUNNING=0
-MODEL_NAME=""
-MODEL_NAME_FLAG=""
-NUM_WORKERS=2
-NUM_PARAMETER_SERVERS=2
-SYNC_REPLICAS_FLAG=""
-
-WHL_FILE_LOCATION=${1}
-if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
-fi
-
-while true; do
-  if [[ $1 == "--leave_container_running" ]]; then
-    LEAVE_CONTAINER_RUNNING=1
-  elif [[ $1 == "--model_name" ]]; then
-    MODEL_NAME="$2"
-    MODEL_NAME_FLAG="--model_name ${MODEL_NAME}"
-  elif [[ $1 == "--num_workers" ]]; then
-    NUM_WORKERS=$2
-  elif [[ $1 == "--num_parameter_servers" ]]; then
-    NUM_PARAMETER_SERVERS=$2
-  elif [[ $1 == "--sync_replicas" ]]; then
-    SYNC_REPLICAS_FLAG="--sync_replicas"
-  elif [[ $1 == "--WHL_FILE_LOCATION" ]]; then
-    WHL_FILE_LOCATION=$2
-  fi
-
-  shift
-  if [[ -z $1 ]]; then
-    break
-  fi
-done
-
-echo "LEAVE_CONTAINER_RUNNING: ${LEAVE_CONTAINER_RUNNING}"
-echo "MODEL_NAME: \"${MODEL_NAME}\""
-echo "NUM_WORKERS: ${NUM_WORKERS}"
-echo "NUM_PARAMETER_SERVERS: ${NUM_PARAMETER_SERVERS}"
-echo "SYNC_REPLICAS_FLAG: \"${SYNC_REPLICAS_FLAG}\""
-
-# Current script directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Get utility functions
-source ${DIR}/scripts/utils.sh
-
-# Build docker image for local distributed TensorFlow cluster.
-NO_CACHE_FLAG=""
-if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
-   [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then
-  NO_CACHE_FLAG="--no-cache"
-fi
-
-# Create docker build context directory.
-BUILD_DIR=$(mktemp -d)
-echo ""
-echo "Using whl file location: ${WHL_FILE_LOCATION}"
-echo "Building in temporary directory: ${BUILD_DIR}"
-
-cp -r ${DIR}/* "${BUILD_DIR}"/ || \
-  die "Failed to copy files to ${BUILD_DIR}"
-
-# Download whl file into the build context directory.
-if [[ -z "${WHL_FILE_LOCATION}" ]]; then
-  pip2 download --no-deps tf-nightly
-  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
-elif [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
-    # Download whl file into the build context directory.
-    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
-        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
-else
-    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
-fi
-
-# Build docker image for test.
-docker build ${NO_CACHE_FLAG} -t ${DOCKER_IMG_NAME} \
-   -f "${BUILD_DIR}/Dockerfile.local" "${BUILD_DIR}" || \
-   die "Failed to build docker image: ${DOCKER_IMG_NAME}"
-
-# Clean up docker build context directory.
-rm -rf "${BUILD_DIR}"
-
-# Run docker image for test.
-docker run ${DOCKER_IMG_NAME} \
-    /var/tf_dist_test/scripts/dist_mnist_test.sh \
-    --ps_hosts $(seq -f "localhost:%g" -s "," \
-                 2000 $((2000 + NUM_PARAMETER_SERVERS - 1))) \
-    --worker_hosts $(seq -f "localhost:%g" -s "," \
-                     3000 $((3000 + NUM_WORKERS - 1))) \
-    --num_gpus 0 ${SYNC_REPLICAS_FLAG}
diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py
deleted file mode 100644
index 8feb5386e9881596c20fba9e537a0439c8187ac4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/python/census_widendeep.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Distributed training and evaluation of a wide and deep model."""
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import json
-import os
-import sys
-
-from six.moves import urllib
-import tensorflow as tf
-
-from tensorflow.contrib.learn.python.learn import learn_runner
-from tensorflow.contrib.learn.python.learn.estimators import run_config
-
-
-# Constants: Data download URLs
-TRAIN_DATA_URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data"
-TEST_DATA_URL = "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test"
-
-
-# Define features for the model
-def census_model_config():
-  """Configuration for the census Wide & Deep model.
-
-  Returns:
-    columns: Column names to retrieve from the data source
-    label_column: Name of the label column
-    wide_columns: List of wide columns
-    deep_columns: List of deep columns
-    categorical_column_names: Names of the categorical columns
-    continuous_column_names: Names of the continuous columns
-  """
-  # 1. Categorical base columns.
-  gender = tf.contrib.layers.sparse_column_with_keys(
-      column_name="gender", keys=["female", "male"])
-  race = tf.contrib.layers.sparse_column_with_keys(
-      column_name="race",
-      keys=["Amer-Indian-Eskimo",
-            "Asian-Pac-Islander",
-            "Black",
-            "Other",
-            "White"])
-  education = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "education", hash_bucket_size=1000)
-  marital_status = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "marital_status", hash_bucket_size=100)
-  relationship = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "relationship", hash_bucket_size=100)
-  workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "workclass", hash_bucket_size=100)
-  occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "occupation", hash_bucket_size=1000)
-  native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
-      "native_country", hash_bucket_size=1000)
-
-  # 2. Continuous base columns.
-  age = tf.contrib.layers.real_valued_column("age")
-  age_buckets = tf.contrib.layers.bucketized_column(
-      age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
-  education_num = tf.contrib.layers.real_valued_column("education_num")
-  capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
-  capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
-  hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
-
-  wide_columns = [
-      gender, native_country, education, occupation, workclass,
-      marital_status, relationship, age_buckets,
-      tf.contrib.layers.crossed_column([education, occupation],
-                                       hash_bucket_size=int(1e4)),
-      tf.contrib.layers.crossed_column([native_country, occupation],
-                                       hash_bucket_size=int(1e4)),
-      tf.contrib.layers.crossed_column([age_buckets, race, occupation],
-                                       hash_bucket_size=int(1e6))]
-
-  deep_columns = [
-      tf.contrib.layers.embedding_column(workclass, dimension=8),
-      tf.contrib.layers.embedding_column(education, dimension=8),
-      tf.contrib.layers.embedding_column(marital_status, dimension=8),
-      tf.contrib.layers.embedding_column(gender, dimension=8),
-      tf.contrib.layers.embedding_column(relationship, dimension=8),
-      tf.contrib.layers.embedding_column(race, dimension=8),
-      tf.contrib.layers.embedding_column(native_country, dimension=8),
-      tf.contrib.layers.embedding_column(occupation, dimension=8),
-      age, education_num, capital_gain, capital_loss, hours_per_week]
-
-  # Define the column names for the data sets.
-  columns = ["age", "workclass", "fnlwgt", "education", "education_num",
-             "marital_status", "occupation", "relationship", "race", "gender",
-             "capital_gain", "capital_loss", "hours_per_week",
-             "native_country", "income_bracket"]
-  label_column = "label"
-  categorical_columns = ["workclass", "education", "marital_status",
-                         "occupation", "relationship", "race", "gender",
-                         "native_country"]
-  continuous_columns = ["age", "education_num", "capital_gain",
-                        "capital_loss", "hours_per_week"]
-
-  return (columns, label_column, wide_columns, deep_columns,
-          categorical_columns, continuous_columns)
-
-
-class CensusDataSource(object):
-  """Source of census data."""
-
-  def __init__(self, data_dir, train_data_url, test_data_url,
-               columns, label_column,
-               categorical_columns, continuous_columns):
-    """Constructor of CensusDataSource.
-
-    Args:
-      data_dir: Directory to save/load the data files
-      train_data_url: URL from which the training data can be downloaded
-      test_data_url: URL from which the test data can be downloaded
-      columns: Columns to retrieve from the data files (A list of strings)
-      label_column: Name of the label column
-      categorical_columns: Names of the categorical columns (A list of strings)
-      continuous_columns: Names of the continuous columns (A list of strings)
-    """
-
-    # Retrieve data from disk (if available) or download from the web.
-    train_file_path = os.path.join(data_dir, "adult.data")
-    if os.path.isfile(train_file_path):
-      print("Loading training data from file: %s" % train_file_path)
-      train_file = open(train_file_path)
-    else:
-      urllib.urlretrieve(train_data_url, train_file_path)
-
-    test_file_path = os.path.join(data_dir, "adult.test")
-    if os.path.isfile(test_file_path):
-      print("Loading test data from file: %s" % test_file_path)
-      test_file = open(test_file_path)
-    else:
-      test_file = open(test_file_path)
-      urllib.urlretrieve(test_data_url, test_file_path)
-
-    # Read the training and testing data sets into Pandas DataFrame.
-    import pandas  # pylint: disable=g-import-not-at-top
-    self._df_train = pandas.read_csv(train_file, names=columns,
-                                     skipinitialspace=True)
-    self._df_test = pandas.read_csv(test_file, names=columns,
-                                    skipinitialspace=True, skiprows=1)
-
-    # Remove the NaN values in the last rows of the tables
-    self._df_train = self._df_train[:-1]
-    self._df_test = self._df_test[:-1]
-
-    # Apply the threshold to get the labels.
-    income_thresh = lambda x: ">50K" in x
-    self._df_train[label_column] = (
-        self._df_train["income_bracket"].apply(income_thresh)).astype(int)
-    self._df_test[label_column] = (
-        self._df_test["income_bracket"].apply(income_thresh)).astype(int)
-
-    self.label_column = label_column
-    self.categorical_columns = categorical_columns
-    self.continuous_columns = continuous_columns
-
-  def input_train_fn(self):
-    return self._input_fn(self._df_train)
-
-  def input_test_fn(self):
-    return self._input_fn(self._df_test)
-
-  # TODO(cais): Turn into minibatch feeder
-  def _input_fn(self, df):
-    """Input data function.
-
-    Creates a dictionary mapping from each continuous feature column name
-    (k) to the values of that column stored in a constant Tensor.
-
-    Args:
-      df: data feed
-
-    Returns:
-      feature columns and labels
-    """
-    continuous_cols = {k: tf.constant(df[k].values)
-                       for k in self.continuous_columns}
-    # Creates a dictionary mapping from each categorical feature column name (k)
-    # to the values of that column stored in a tf.SparseTensor.
-    categorical_cols = {
-        k: tf.SparseTensor(
-            indices=[[i, 0] for i in range(df[k].size)],
-            values=df[k].values,
-            dense_shape=[df[k].size, 1])
-        for k in self.categorical_columns}
-    # Merges the two dictionaries into one.
-    feature_cols = dict(continuous_cols.items() + categorical_cols.items())
-    # Converts the label column into a constant Tensor.
-    label = tf.constant(df[self.label_column].values)
-    # Returns the feature columns and the label.
-    return feature_cols, label
-
-
-def _create_experiment_fn(output_dir):  # pylint: disable=unused-argument
-  """Experiment creation function."""
-  (columns, label_column, wide_columns, deep_columns, categorical_columns,
-   continuous_columns) = census_model_config()
-
-  census_data_source = CensusDataSource(FLAGS.data_dir,
-                                        TRAIN_DATA_URL, TEST_DATA_URL,
-                                        columns, label_column,
-                                        categorical_columns,
-                                        continuous_columns)
-
-  os.environ["TF_CONFIG"] = json.dumps({
-      "cluster": {
-          tf.contrib.learn.TaskType.PS: ["fake_ps"] *
-                                        FLAGS.num_parameter_servers
-      },
-      "task": {
-          "index": FLAGS.worker_index
-      }
-  })
-  config = run_config.RunConfig(master=FLAGS.master_grpc_url)
-
-  estimator = tf.contrib.learn.DNNLinearCombinedClassifier(
-      model_dir=FLAGS.model_dir,
-      linear_feature_columns=wide_columns,
-      dnn_feature_columns=deep_columns,
-      dnn_hidden_units=[5],
-      config=config)
-
-  return tf.contrib.learn.Experiment(
-      estimator=estimator,
-      train_input_fn=census_data_source.input_train_fn,
-      eval_input_fn=census_data_source.input_test_fn,
-      train_steps=FLAGS.train_steps,
-      eval_steps=FLAGS.eval_steps
-  )
-
-
-def main(unused_argv):
-  print("Worker index: %d" % FLAGS.worker_index)
-  learn_runner.run(experiment_fn=_create_experiment_fn,
-                   output_dir=FLAGS.output_dir,
-                   schedule=FLAGS.schedule)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/census-data",
-      help="Directory for storing the census data")
-  parser.add_argument(
-      "--model_dir",
-      type=str,
-      default="/tmp/census_wide_and_deep_model",
-      help="Directory for storing the model"
-  )
-  parser.add_argument(
-      "--output_dir",
-      type=str,
-      default="",
-      help="Base output directory."
-  )
-  parser.add_argument(
-      "--schedule",
-      type=str,
-      default="local_run",
-      help="Schedule to run for this experiment."
-  )
-  parser.add_argument(
-      "--master_grpc_url",
-      type=str,
-      default="",
-      help="URL to master GRPC tensorflow server, e.g.,grpc://127.0.0.1:2222"
-  )
-  parser.add_argument(
-      "--num_parameter_servers",
-      type=int,
-      default=0,
-      help="Number of parameter servers"
-  )
-  parser.add_argument(
-      "--worker_index",
-      type=int,
-      default=0,
-      help="Worker index (>=0)"
-  )
-  parser.add_argument(
-      "--train_steps",
-      type=int,
-      default=1000,
-      help="Number of training steps"
-  )
-  parser.add_argument(
-      "--eval_steps",
-      type=int,
-      default=1,
-      help="Number of evaluation steps"
-  )
-  global FLAGS  # pylint:disable=global-at-module-level
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
deleted file mode 100644
index d6e7f317dd0b52203e354676425dbbbcd53e1973..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distributed MNIST training and validation, with model replicas.
-
-A simple softmax model with one hidden layer is defined. The parameters
-(weights and biases) are located on one parameter server (ps), while the ops
-are executed on two worker nodes by default. The TF sessions also run on the
-worker node.
-Multiple invocations of this script can be done in parallel, with different
-values for --task_index. There should be exactly one invocation with
---task_index, which will create a master session that carries out variable
-initialization. The other, non-master, sessions will wait for the master
-session to finish the initialization before proceeding to the training stage.
-
-The coordination between the multiple worker invocations occurs due to
-the definition of the parameters on the same ps devices. The parameter updates
-from one worker is visible to all other workers. As such, the workers can
-perform forward computation and gradient calculation in parallel, which
-should lead to increased training speed for the simple model.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import sys
-import tempfile
-import time
-
-import tensorflow as tf
-from tensorflow.examples.tutorials.mnist import input_data
-
-flags = tf.app.flags
-flags.DEFINE_string("data_dir", "/tmp/mnist-data",
-                    "Directory for storing mnist data")
-flags.DEFINE_boolean("download_only", False,
-                     "Only perform downloading of data; Do not proceed to "
-                     "session preparation, model definition or training")
-flags.DEFINE_integer("task_index", None,
-                     "Worker task index, should be >= 0. task_index=0 is "
-                     "the master worker task the performs the variable "
-                     "initialization ")
-flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine."
-                     "If you don't use GPU, please set it to '0'")
-flags.DEFINE_integer("replicas_to_aggregate", None,
-                     "Number of replicas to aggregate before parameter update "
-                     "is applied (For sync_replicas mode only; default: "
-                     "num_workers)")
-flags.DEFINE_integer("hidden_units", 100,
-                     "Number of units in the hidden layer of the NN")
-flags.DEFINE_integer("train_steps", 200,
-                     "Number of (global) training steps to perform")
-flags.DEFINE_integer("batch_size", 100, "Training batch size")
-flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
-flags.DEFINE_boolean(
-    "sync_replicas", False,
-    "Use the sync_replicas (synchronized replicas) mode, "
-    "wherein the parameter updates from workers are aggregated "
-    "before applied to avoid stale gradients")
-flags.DEFINE_boolean(
-    "existing_servers", False, "Whether servers already exists. If True, "
-    "will use the worker hosts via their GRPC URLs (one client process "
-    "per worker host). Otherwise, will create an in-process TensorFlow "
-    "server.")
-flags.DEFINE_string("ps_hosts", "localhost:2222",
-                    "Comma-separated list of hostname:port pairs")
-flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
-                    "Comma-separated list of hostname:port pairs")
-flags.DEFINE_string("job_name", None, "job name: worker or ps")
-
-FLAGS = flags.FLAGS
-
-IMAGE_PIXELS = 28
-
-
-def main(unused_argv):
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
-  if FLAGS.download_only:
-    sys.exit(0)
-
-  if FLAGS.job_name is None or FLAGS.job_name == "":
-    raise ValueError("Must specify an explicit `job_name`")
-  if FLAGS.task_index is None or FLAGS.task_index == "":
-    raise ValueError("Must specify an explicit `task_index`")
-
-  print("job name = %s" % FLAGS.job_name)
-  print("task index = %d" % FLAGS.task_index)
-
-  #Construct the cluster and start the server
-  ps_spec = FLAGS.ps_hosts.split(",")
-  worker_spec = FLAGS.worker_hosts.split(",")
-
-  # Get the number of workers.
-  num_workers = len(worker_spec)
-
-  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
-
-  if not FLAGS.existing_servers:
-    # Not using existing servers. Create an in-process server.
-    server = tf.train.Server(
-        cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
-    if FLAGS.job_name == "ps":
-      server.join()
-
-  is_chief = (FLAGS.task_index == 0)
-  if FLAGS.num_gpus > 0:
-    # Avoid gpu allocation conflict: now allocate task_num -> #gpu
-    # for each worker in the corresponding machine
-    gpu = (FLAGS.task_index % FLAGS.num_gpus)
-    worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
-  elif FLAGS.num_gpus == 0:
-    # Just allocate the CPU to worker server
-    cpu = 0
-    worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
-  # The device setter will automatically place Variables ops on separate
-  # parameter servers (ps). The non-Variable ops will be placed on the workers.
-  # The ps use CPU and workers use corresponding GPU
-  with tf.device(
-      tf.train.replica_device_setter(
-          worker_device=worker_device,
-          ps_device="/job:ps/cpu:0",
-          cluster=cluster)):
-    global_step = tf.Variable(0, name="global_step", trainable=False)
-
-    # Variables of the hidden layer
-    hid_w = tf.Variable(
-        tf.truncated_normal(
-            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
-            stddev=1.0 / IMAGE_PIXELS),
-        name="hid_w")
-    hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
-
-    # Variables of the softmax layer
-    sm_w = tf.Variable(
-        tf.truncated_normal(
-            [FLAGS.hidden_units, 10],
-            stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
-        name="sm_w")
-    sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
-
-    # Ops: located on the worker specified with FLAGS.task_index
-    x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
-    y_ = tf.placeholder(tf.float32, [None, 10])
-
-    hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
-    hid = tf.nn.relu(hid_lin)
-
-    y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
-    cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
-
-    opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
-
-    if FLAGS.sync_replicas:
-      if FLAGS.replicas_to_aggregate is None:
-        replicas_to_aggregate = num_workers
-      else:
-        replicas_to_aggregate = FLAGS.replicas_to_aggregate
-
-      opt = tf.train.SyncReplicasOptimizer(
-          opt,
-          replicas_to_aggregate=replicas_to_aggregate,
-          total_num_replicas=num_workers,
-          name="mnist_sync_replicas")
-
-    train_step = opt.minimize(cross_entropy, global_step=global_step)
-
-    if FLAGS.sync_replicas:
-      local_init_op = opt.local_step_init_op
-      if is_chief:
-        local_init_op = opt.chief_init_op
-
-      ready_for_local_init_op = opt.ready_for_local_init_op
-
-      # Initial token and chief queue runners required by the sync_replicas mode
-      chief_queue_runner = opt.get_chief_queue_runner()
-      sync_init_op = opt.get_init_tokens_op()
-
-    init_op = tf.global_variables_initializer()
-    train_dir = tempfile.mkdtemp()
-
-    if FLAGS.sync_replicas:
-      sv = tf.train.Supervisor(
-          is_chief=is_chief,
-          logdir=train_dir,
-          init_op=init_op,
-          local_init_op=local_init_op,
-          ready_for_local_init_op=ready_for_local_init_op,
-          recovery_wait_secs=1,
-          global_step=global_step)
-    else:
-      sv = tf.train.Supervisor(
-          is_chief=is_chief,
-          logdir=train_dir,
-          init_op=init_op,
-          recovery_wait_secs=1,
-          global_step=global_step)
-
-    sess_config = tf.ConfigProto(
-        allow_soft_placement=True,
-        log_device_placement=False,
-        device_filters=["/job:ps",
-                        "/job:worker/task:%d" % FLAGS.task_index])
-
-    # The chief worker (task_index==0) session will prepare the session,
-    # while the remaining workers will wait for the preparation to complete.
-    if is_chief:
-      print("Worker %d: Initializing session..." % FLAGS.task_index)
-    else:
-      print("Worker %d: Waiting for session to be initialized..." %
-            FLAGS.task_index)
-
-    if FLAGS.existing_servers:
-      server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
-      print("Using existing server at: %s" % server_grpc_url)
-
-      sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
-    else:
-      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
-
-    print("Worker %d: Session initialization complete." % FLAGS.task_index)
-
-    if FLAGS.sync_replicas and is_chief:
-      # Chief worker will start the chief queue runner and call the init op.
-      sess.run(sync_init_op)
-      sv.start_queue_runners(sess, [chief_queue_runner])
-
-    # Perform training
-    time_begin = time.time()
-    print("Training begins @ %f" % time_begin)
-
-    local_step = 0
-    while True:
-      # Training feed
-      batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
-      train_feed = {x: batch_xs, y_: batch_ys}
-
-      _, step = sess.run([train_step, global_step], feed_dict=train_feed)
-      local_step += 1
-
-      now = time.time()
-      print("%f: Worker %d: training step %d done (global step: %d)" %
-            (now, FLAGS.task_index, local_step, step))
-
-      if step >= FLAGS.train_steps:
-        break
-
-    time_end = time.time()
-    print("Training ends @ %f" % time_end)
-    training_time = time_end - time_begin
-    print("Training elapsed time: %f s" % training_time)
-
-    # Validation feed
-    val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
-    val_xent = sess.run(cross_entropy, feed_dict=val_feed)
-    print("After %d training step(s), validation cross entropy = %g" %
-          (FLAGS.train_steps, val_xent))
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tensorflow/tools/dist_test/remote_test.sh b/tensorflow/tools/dist_test/remote_test.sh
deleted file mode 100755
index e188c88c8fa725daa619e244072fdb58765ea0a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/remote_test.sh
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# This is the entry-point script to testing TensorFlow's distributed runtime.
-# It builds a docker image with the necessary gcloud and Kubernetes (k8s) tools
-# installed, and then execute k8s cluster preparation and distributed TensorFlow
-# runs from within a container based on the image.
-#
-# Usage:
-#   remote_test.sh <whl_url>
-#                  [--setup_cluster_only]
-#                  [--num_workers <NUM_WORKERS>]
-#                  [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
-#                  [--sync_replicas]
-#
-# Arguments:
-# <whl_url>
-#   Specify custom TensorFlow whl file URL to install in the test Docker image.
-#
-# --setup_cluster_only:
-#       Setup the TensorFlow k8s cluster only, and do not perform testing of
-#       the distributed runtime.
-#
-# --num_workers <NUM_WORKERS>:
-#   Specifies the number of worker pods to start
-#
-# --num_parameter_server <NUM_PARAMETER_SERVERS>:
-#   Specifies the number of parameter servers to start
-#
-# --sync_replicas
-#   Use the synchronized-replica mode. The parameter updates from the replicas
-#   (workers) will be aggregated before applied, which avoids stale parameter
-#   updates.
-#
-#
-#
-# If any of the following environment variable has non-empty values, it will
-# be mapped into the docker container to override the default values (see
-# dist_test.sh)
-#   TF_DIST_GRPC_SERVER_URL:      URL to an existing TensorFlow GRPC server.
-#                                 If set to any non-empty and valid value (e.g.,
-#                                 grpc://1.2.3.4:2222), it will cause the test
-#                                 to bypass the k8s cluster setup and
-#                                 teardown process, and just use the this URL
-#                                 as the master session.
-#   TF_DIST_GCLOUD_PROJECT:       gcloud project in which the GKE cluster
-#                                 will be created (takes effect only if
-#                                 TF_DIST_GRPC_SERVER_URL is empty, same below)
-#   TF_DIST_GCLOUD_COMPUTE_ZONE:  gcloud compute zone.
-#   TF_DIST_CONTAINER_CLUSTER:    name of the GKE cluster
-#   TF_DIST_GCLOUD_KEY_FILE:      path to the gloud service JSON key file
-#   TF_DIST_GRPC_PORT:            port on which to create the TensorFlow GRPC
-#                                 servers
-#   TF_DIST_DOCKER_NO_CACHE:      do not use cache when building docker images
-
-die() {
-  echo $@
-  exit 1
-}
-
-DOCKER_IMG_NAME="tensorflow/tf-dist-test-client"
-
-# Get current script directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Prepare environment variables for the docker container
-DOCKER_ENV_FLAGS=""
-if [[ ! -z "$TF_DIST_GRPC_SERVER_URL" ]]; then
-  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
-"-e TF_DIST_GRPC_SERVER_URL=${TF_DIST_GRPC_SERVER_URL}"
-fi
-if [[ ! -z "$TF_DIST_GCLOUD_PROJECT" ]]; then
-  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
-"-e TF_DIST_GCLOUD_PROJECT=${TF_DIST_GCLOUD_PROJECT}"
-fi
-if [[ ! -z "$TF_DIST_GCLOUD_COMPUTE_ZONE" ]]; then
-  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
-"-e TF_DIST_GCLOUD_COMPUTE_ZONE=${TF_DIST_GCLOUD_COMPUTE_ZONE}"
-fi
-if [[ ! -z "$TF_DIST_CONTAINER_CLUSTER" ]]; then
-  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
-"-e TF_DIST_CONTAINER_CLUSTER=${TF_DIST_CONTAINER_CLUSTER}"
-fi
-if [[ ! -z "$TF_DIST_GRPC_PORT" ]]; then
-  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
-"-e TF_DIST_GRPC_PORT=${TF_DIST_GRPC_PORT}"
-fi
-
-NO_CACHE_FLAG=""
-if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
-   [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then
-  NO_CACHE_FLAG="--no-cache"
-fi
-
-# Parse command-line arguments.
-WHL_URL=${1}
-if [[ -z "${WHL_URL}" ]]; then
-  echo "WARNING: No wheel url passed. Will use latest tf-nightly cpu p2 wheel."
-fi
-
-# Create docker build context directory.
-BUILD_DIR=$(mktemp -d)
-echo ""
-echo "Using custom whl file URL: ${WHL_URL}"
-echo "Building in temporary directory: ${BUILD_DIR}"
-
-cp -r ${DIR}/* ${BUILD_DIR}/ || \
-  die "Failed to copy files to ${BUILD_DIR}"
-
-# Download whl file into the build context directory.
-if [[ -z "${WHL_URL}" ]]; then
-  pip2 download --no-deps tf-nightly
-  cp tf-nightly-*.whl "${BUILD_DIR}"/tensorflow-none-any.whl
-else
-  wget -P "${BUILD_DIR}" ${WHL_URL} || \
-    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
-fi
-
-# Build docker image for test.
-docker build ${NO_CACHE_FLAG} \
-    -t ${DOCKER_IMG_NAME} -f "${BUILD_DIR}/Dockerfile" "${BUILD_DIR}" || \
-    die "Failed to build docker image: ${DOCKER_IMG_NAME}"
-
-# Clean up docker build context directory.
-rm -rf "${BUILD_DIR}"
-
-# Run docker image for test.
-KEY_FILE=${TF_DIST_GCLOUD_KEY_FILE:-"${HOME}/gcloud-secrets/tensorflow-testing.json"}
-
-docker run --rm -v ${KEY_FILE}:/var/gcloud/secrets/tensorflow-testing.json \
-  ${DOCKER_ENV_FLAGS} \
-  ${DOCKER_IMG_NAME} \
-  /var/tf-dist-test/scripts/dist_test.sh $@
diff --git a/tensorflow/tools/dist_test/scripts/BUILD b/tensorflow/tools/dist_test/scripts/BUILD
deleted file mode 100644
index 6df5c6b660b8cdbc5044a1ad133c0f5c64ff745f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/BUILD
+++ /dev/null
@@ -1,21 +0,0 @@
-# Tools for running distributed benchmarks.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["k8s_tensorflow.py"])
-
-py_library(
-    name = "k8s_tensorflow_lib",
-    srcs = ["k8s_tensorflow_lib.py"],
-    srcs_version = "PY2AND3",
-)
-
-py_test(
-    name = "k8s_tensorflow_test",
-    srcs = ["k8s_tensorflow_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":k8s_tensorflow_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
diff --git a/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh b/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh
deleted file mode 100755
index 1da6a540f10e2c0d6c9b10bd523d50fe968d8865..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/create_tf_cluster.sh
+++ /dev/null
@@ -1,283 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Create a Kubernetes (k8s) cluster of TensorFlow workers
-#
-# Usage:
-#   create_tf_cluster.sh <num_workers> <num_parameter_servers>
-#
-# In addition, this script obeys values in the following environment variables:
-#   TF_DIST_LOCAL_CLUSTER:        create TensorFlow cluster on local machine
-#   TF_DIST_SERVER_DOCKER_IMAGE:  overrides the default docker image to launch
-#                                 TensorFlow (GRPC) servers with
-#   TF_DIST_GCLOUD_PROJECT:       gcloud project in which the GKE cluster
-#                                 will be created (valid only if aforementioned
-#                                 TF_DIST_GRPC_SERVER_URL is empty).
-#   TF_DIST_GCLOUD_COMPUTE_ZONE:  gcloud compute zone.
-#   TF_DIST_CONTAINER_CLUSTER:    name of the GKE cluster
-#   TF_DIST_GCLOUD_KEY_FILE:      if non-empty, will override GCLOUD_KEY_FILE
-#   TF_DIST_GRPC_PORT:            overrides the default port (2222)
-#                                 to run the GRPC servers on
-
-# Configurations
-# gcloud operation timeout (steps)
-GCLOUD_OP_MAX_STEPS=360
-
-GRPC_PORT=${TF_DIST_GRPC_PORT:-2222}
-
-DEFAULT_GCLOUD_BIN=/var/gcloud/google-cloud-sdk/bin/gcloud
-GCLOUD_KEY_FILE=${TF_DIST_GCLOUD_KEY_FILE:-\
-"/var/gcloud/secrets/tensorflow-testing.json"}
-GCLOUD_PROJECT=${TF_DIST_GCLOUD_PROJECT:-"tensorflow-testing"}
-
-GCLOUD_COMPUTE_ZONE=${TF_DIST_GCLOUD_COMPUTE_ZONE:-"us-central1-f"}
-CONTAINER_CLUSTER=${TF_DIST_CONTAINER_CLUSTER:-"test-cluster"}
-
-SERVER_DOCKER_IMAGE=${TF_DIST_SERVER_DOCKER_IMAGE:-\
-"tensorflow/tf_grpc_test_server"}
-
-# Get current script directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Get utility functions
-source "${DIR}/utils.sh"
-
-# Check input arguments
-if [[ $# != 2 ]]; then
-  die "Usage: $0 <num_workers> <num_parameter_servers>"
-fi
-
-NUM_WORKERS=$1
-NUM_PARAMETER_SERVERS=$2
-
-# Verify port string
-if [[ -z $(echo "${GRPC_PORT}" | grep -E "^[0-9]{1,5}") ]]; then
-  die "Invalid GRPC port: \"${GRPC_PORT}\""
-fi
-echo "GRPC port to be used when creating the k8s TensorFlow cluster: "\
-"${GRPC_PORT}"
-
-if [[ -z "${TF_DIST_LOCAL_CLUSTER}" ]] ||
-   [[ "${TF_DIST_LOCAL_CLUSTER}" == "0" ]]; then
-  IS_LOCAL_CLUSTER="0"
-else
-  IS_LOCAL_CLUSTER="1"
-fi
-
-if [[ ${IS_LOCAL_CLUSTER} == "0" ]]; then
-  # Locate gcloud binary path
-  GCLOUD_BIN=$(which gcloud)
-  if [[ -z "${GCLOUD_BIN}" ]]; then
-    GCLOUD_BIN="${DEFAULT_GCLOUD_BIN}"
-  fi
-
-  if [[ ! -f "${GCLOUD_BIN}" ]]; then
-    die "gcloud binary cannot be found at: ${GCLOUD_BIN}"
-  fi
-  echo "Path to gcloud binary: ${GCLOUD_BIN}"
-
-  # Path to gcloud service key file
-  if [[ ! -f "${GCLOUD_KEY_FILE}" ]]; then
-    die "gcloud service account key file cannot be found at: ${GCLOUD_KEY_FILE}"
-  fi
-  echo "Path to gcloud key file: ${GCLOUD_KEY_FILE}"
-
-  echo "GCLOUD_PROJECT: ${GCLOUD_PROJECT}"
-  echo "GCLOUD_COMPUTER_ZONE: ${GCLOUD_COMPUTE_ZONE}"
-  echo "CONTAINER_CLUSTER: ${CONTAINER_CLUSTER}"
-
-  # Activate gcloud service account
-  "${GCLOUD_BIN}" auth activate-service-account --key-file "${GCLOUD_KEY_FILE}"
-
-  # See: https://github.com/kubernetes/kubernetes/issues/30617
-  "${GCLOUD_BIN}" config set container/use_client_certificate True
-
-  # Set gcloud project
-  "${GCLOUD_BIN}" config set project "${GCLOUD_PROJECT}"
-
-  # Set compute zone
-  "${GCLOUD_BIN}" config set compute/zone "${GCLOUD_COMPUTE_ZONE}"
-
-  # Set container cluster
-  "${GCLOUD_BIN}" config set container/cluster "${CONTAINER_CLUSTER}"
-
-  # Get container cluster credentials
-  "${GCLOUD_BIN}" container clusters get-credentials "${CONTAINER_CLUSTER}"
-  if [[ $? != "0" ]]; then
-    die "FAILED to get credentials for container cluster: ${CONTAINER_CLUSTER}"
-  fi
-
-  # If there is any existing tf k8s cluster, delete it first
-  "${DIR}/delete_tf_cluster.sh" "${GCLOUD_OP_MAX_STEPS}"
-fi
-
-# Path to kubectl binary
-KUBECTL_BIN=$(dirname "${GCLOUD_BIN}")/kubectl
-if [[ ! -f "${KUBECTL_BIN}" ]]; then
-  die "kubectl binary cannot be found at: ${KUBECTL_BIN}"
-fi
-echo "Path to kubectl binary: ${KUBECTL_BIN}"
-
-# Create yaml file for k8s TensorFlow cluster creation
-# Path to the (Python) script for generating k8s yaml file
-K8S_GEN_TF_YAML="${DIR}/k8s_tensorflow.py"
-if [[ ! -f ${K8S_GEN_TF_YAML} ]]; then
-  die "FAILED to find yaml-generating script at: ${K8S_GEN_TF_YAML}"
-fi
-
-K8S_YAML="/tmp/k8s_tf_lb.yaml"
-rm -f "${K8S_YAML}"
-
-echo ""
-echo "Generating k8s cluster yaml config file with the following settings"
-echo "  Server docker image: ${SERVER_DOCKER_IMAGE}"
-echo "  Number of workers: ${NUM_WORKERS}"
-echo "  Number of parameter servers: ${NUM_PARAMETER_SERVERS}"
-echo "  GRPC port: ${GRPC_PORT}"
-echo ""
-
-${K8S_GEN_TF_YAML} \
-    --docker_image "${SERVER_DOCKER_IMAGE}" \
-    --num_workers "${NUM_WORKERS}" \
-    --num_parameter_servers "${NUM_PARAMETER_SERVERS}" \
-    --grpc_port "${GRPC_PORT}" \
-    --request_load_balancer=True \
-    > "${K8S_YAML}" || \
-    die "Generation of the yaml configuration file for k8s cluster FAILED"
-
-if [[ ! -f "${K8S_YAML}" ]]; then
-    die "FAILED to generate yaml file for TensorFlow k8s container cluster"
-else
-    echo "Generated yaml configuration file for k8s TensorFlow cluster: "\
-"${K8S_YAML}"
-    cat "${K8S_YAML}"
-fi
-
-# Create tf k8s container cluster
-"${KUBECTL_BIN}" create -f "${K8S_YAML}"
-
-# Wait for external IP of worker services to become available
-get_tf_external_ip() {
-  # Usage: gen_tf_worker_external_ip <JOB_NAME> <TASK_INDEX>
-  # E.g.,  gen_tf_worker_external_ip ps 2
-  echo $("${KUBECTL_BIN}" get svc | grep "^tf-${1}${2}" | \
-         awk '{print $3}' | grep -E "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+")
-}
-
-if [[ ${IS_LOCAL_CLUSTER} == "0" ]]; then
-  echo "Waiting for external IP of tf-worker0 service to emerge..."
-  echo ""
-
-  COUNTER=0
-  while true; do
-    sleep 1
-    ((COUNTER++))
-    if [[ "${COUNTER}" -gt "${GCLOUD_OP_MAX_STEPS}" ]]; then
-      die "Reached maximum polling steps while waiting for external IP "\
-"of tf-worker0 service to emerge"
-    fi
-
-    WORKER_EXTERN_IPS=""
-    WORKER_INDEX=0
-    N_AVAILABLE_WORKER_EXTERNAL_IPS=0
-    while true; do
-      SVC_EXTERN_IP=$(get_tf_external_ip worker ${WORKER_INDEX})
-
-      if [[ ! -z "${SVC_EXTERN_IP}" ]]; then
-        WORKER_EXTERN_IPS="${WORKER_EXTERN_IPS} ${SVC_EXTERN_IP}"
-
-        ((N_AVAILABLE_WORKER_EXTERNAL_IPS++))
-      fi
-
-      ((WORKER_INDEX++))
-      if [[ ${WORKER_INDEX} == ${NUM_WORKERS} ]]; then
-        break;
-      fi
-    done
-
-    PS_EXTERN_IPS=""
-    PS_INDEX=0
-    N_AVAILABLE_PS_EXTERNAL_IPS=0
-    while true; do
-      SVC_EXTERN_IP=$(get_tf_external_ip ps ${PS_INDEX})
-
-      if [[ ! -z "${SVC_EXTERN_IP}" ]]; then
-        PS_EXTERN_IPS="${PS_EXTERN_IPS} ${SVC_EXTERN_IP}"
-
-        ((N_AVAILABLE_PS_EXTERNAL_IPS++))
-      fi
-
-      ((PS_INDEX++))
-      if [[ ${PS_INDEX} == ${NUM_PARAMETER_SERVERS} ]]; then
-        break;
-      fi
-    done
-
-    if [[ ${N_AVAILABLE_WORKER_EXTERNAL_IPS} == ${NUM_WORKERS} ]] && \
-       [[ ${N_AVAILABLE_PS_EXTERNAL_IPS} == ${NUM_PARAMETER_SERVERS} ]]; then
-      break;
-    fi
-  done
-
-  GRPC_SERVER_URLS=""
-  for IP in ${WORKER_EXTERN_IPS}; do
-    GRPC_SERVER_URLS="${GRPC_SERVER_URLS} grpc://${IP}:${GRPC_PORT}"
-  done
-
-  GRPC_PS_URLS=""
-  for IP in ${PS_EXTERN_IPS}; do
-    GRPC_PS_URLS="${GRPC_PS_URLS} grpc://${IP}:${GRPC_PORT}"
-  done
-
-  echo "GRPC URLs of tf-worker instances: ${GRPC_SERVER_URLS}"
-  echo "GRPC URLs of tf-ps instances: ${GRPC_PS_URLS}"
-
-else
-  echo "Waiting for tf pods to be all running..."
-  echo ""
-
-  COUNTER=0
-  while true; do
-    sleep 1
-    ((COUNTER++))
-    if [[ "${COUNTER}" -gt "${GCLOUD_OP_MAX_STEPS}" ]]; then
-      die "Reached maximum polling steps while waiting for all tf pods to "\
-"be running in local k8s TensorFlow cluster"
-    fi
-
-    PODS_STAT=$(are_all_pods_running "${KUBECTL_BIN}")
-
-    if [[ ${PODS_STAT} == "2" ]]; then
-      # Error has occurred
-      die "Error(s) occurred while tring to launch tf k8s cluster. "\
-"One possible cause is that the Docker image used to launch the cluster is "\
-"invalid: \"${SERVER_DOCKER_IMAGE}\""
-    fi
-
-    if [[ ${PODS_STAT} == "1" ]]; then
-      break
-    fi
-  done
-
-  # Determine the tf-worker0 docker container id
-  WORKER0_ID=$(docker ps | grep "k8s_tf-worker0" | awk '{print $1}')
-  echo "WORKER0 Docker container ID: ${WORKER0_ID}"
-
-fi
-
-
-echo "Cluster setup complete."
-echo ""
diff --git a/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh b/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh
deleted file mode 100755
index ce0bf63f9924c76f71f5aec86f2f3d816507af49..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/delete_tf_cluster.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# This script checks for any existing TensorFlow worker services, replication
-# controllers and pods in the Kubernetes (k8s) container cluster and delete
-# them if there are any.
-#
-# Usage: delete_tf_cluster [max_steps]
-#
-# max_steps: Maximum number polling steps for kubectl operations
-
-# Helper functions
-die() {
-  echo $@
-  exit 1
-}
-
-# Path to kubectl binary
-DEFAULT_KUBECTL_BIN=/var/gcloud/google-cloud-sdk/bin/kubectl
-KUBECTL_BIN=$(which kubectl)
-if [[ -z "${KUBECTL_BIN}" ]]; then
-  KUBECTL_BIN="${DEFAULT_KUBECTL_BIN}"
-fi
-if [[ ! -f "${KUBECTL_BIN}" ]]; then
-  die "kubectl binary cannot be found at: \"${KUBECTL_BIN}\""
-else
-  echo "Path to kubectl binary: ${KUBECTL_BIN}"
-fi
-
-MAX_STEPS=${1:-240}
-
-
-# Helper functions for kubectl workflow
-get_tf_svc_count() {
-  echo $("${KUBECTL_BIN}" get svc | grep "tf-" | wc -l)
-}
-
-get_tf_rc_count() {
-  echo $("${KUBECTL_BIN}" get rc | grep "tf-" | wc -l)
-}
-
-get_tf_pods_count() {
-  echo $("${KUBECTL_BIN}" get pods | grep "tf-" | wc -l)
-}
-
-
-# Delete all running services, replication-controllers and pods, in that order
-ITEMS_TO_DELETE="svc rc pods"
-for ITEM in ${ITEMS_TO_DELETE}; do
-  K8S_ITEM_COUNT=$(get_tf_${ITEM}_count)
-  if [[ ${K8S_ITEM_COUNT} != "0" ]]; then
-    echo "There are currently ${K8S_ITEM_COUNT} tf ${ITEM}(s) running. "
-    echo "Attempting to delete those..."
-
-    "${KUBECTL_BIN}" delete --all ${ITEM}
-
-    # Wait until all are deleted
-    # TODO(cais): Add time out
-    COUNTER=0
-    while true; do
-      sleep 1
-
-      ((COUNTER++))
-      if [[ "${COUNTER}" -gt "${MAX_STEPS}" ]]; then
-        die "Reached maximum polling steps while trying to delete all tf ${ITEM}"
-      fi
-
-      if [[ $(get_tf_${ITEM}_count) == "0" ]]; then
-        break
-      fi
-    done
-  fi
-
-done
diff --git a/tensorflow/tools/dist_test/scripts/dist_census_widendeep_test.sh b/tensorflow/tools/dist_test/scripts/dist_census_widendeep_test.sh
deleted file mode 100755
index 0baf4b0375981f9faad3558e355ed35baf21845b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/dist_census_widendeep_test.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# This script invokes dist_mnist.py multiple times concurrently to test the
-# TensorFlow's distributed runtime over a Kubernetes (k8s) cluster with the
-# grpc pods and service set up.
-#
-# Usage:
-#    dist_census_widendeep_test.sh <worker_grpc_urls>
-#        --num-workers <NUM_WORKERS>
-#        --num-parameter-servers <NUM_PARAMETER_SERVERS>
-#
-# worker_grp_url is the list of IP addresses or the GRPC URLs of the worker of
-# the worker sessions, separated with spaces,
-# e.g., "grpc://1.2.3.4:2222 grpc://5.6.7.8:2222"
-#
-# --num-workers <NUM_WORKERS>:
-#   Specifies the number of worker pods to use
-#
-# --num-parameter-server <NUM_PARAMETER_SERVERS>:
-#   Specifies the number of parameter servers to use
-
-# Configurations
-TIMEOUT=120  # Timeout for MNIST replica sessions
-
-# Helper functions
-die() {
-  echo $@
-  exit 1
-}
-
-# Parse command-line arguments
-WORKER_GRPC_URLS=$1
-shift
-
-# Process additional input arguments
-N_WORKERS=2  # Default value
-N_PS=2  # Default value
-SYNC_REPLICAS=0
-
-while true; do
-  if [[ "$1" == "--num-workers" ]]; then
-    N_WORKERS=$2
-  elif [[ "$1" == "--num-parameter-servers" ]]; then
-    N_PS=$2
-  elif [[ "$1" == "--sync-replicas" ]]; then
-    SYNC_REPLICAS="1"
-    die "ERROR: --sync-replicas (synchronized-replicas) mode is not fully "\
-"supported by this test yet."
-    # TODO(cais): Remove error message once sync-replicas is fully supported
-  fi
-  shift
-
-  if [[ -z "$1" ]]; then
-    break
-  fi
-done
-
-echo "N_WORKERS = ${N_WORKERS}"
-echo "N_PS = ${N_PS}"
-
-# Dierctory to store the trained model and evaluation results.
-# The root (e.g., /shared) must be a directory shared among the workers.
-# See volumeMounts fields in k8s_tensorflow.py
-MODEL_DIR="/shared/census_widendeep_model"
-
-rm -rf ${MODEL_DIR} || \
-    die "Failed to remove existing model directory: ${MODEL_DIR}"
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PY_PATH="${SCRIPT_DIR}/../python/census_widendeep.py"
-if [[ ! -f "${PY_PATH}" ]]; then
-  echo "ERROR: Python file does not exist: ${PY_PATH}"
-  exit 1
-fi
-
-STAGGERED_START_DELAY_SEC=0
-WKR_LOG_PREFIX="/tmp/worker_"
-
-IDX=0
-LOG_FILES=""
-for WORKER_GRPC_URL in ${WORKER_GRPC_URLS}; do
-  if [[ ${IDX} != "0" ]]; then
-    sleep ${STAGGERED_START_DELAY_SEC}
-  fi
-
-  LOG_FILE="${WKR_LOG_PREFIX}${IDX}.log"
-  LOG_FILES="${LOG_FILES} ${LOG_FILE}"
-  python ${PY_PATH} \
-      --master_grpc_url="${WORKER_GRPC_URL}" \
-      --num_parameter_servers="${N_PS}" \
-      --worker_index="${IDX}" \
-      --model_dir="${MODEL_DIR}" \
-      --output_dir="/shared/output" \
-      --train_steps=1000 \
-      --eval_steps=2 2>&1 | tee "${LOG_FILE}" &
-
-  echo "Worker ${IDX}: "
-  echo "  GRPC URL: ${WORKER_GRPC_URL}"
-  echo "  log file: ${LOG_FILE}"
-
-  ((IDX++))
-done
-
-# Wait for all concurrent jobs to finish
-wait
-
-# Print logs from the workers
-ORD=1
-for LOG_FILE in ${LOG_FILES}; do
-  echo "==================================================="
-  echo "===        Log file from worker ${ORD} / ${N_WORKERS}          ==="
-  cat "${LOG_FILE}"
-  echo "==================================================="
-  echo ""
-
-  ((ORD++))
-done
-
-echo "Test for distributed training of Census Wide & Deep model PASSED"
\ No newline at end of file
diff --git a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
deleted file mode 100755
index e703e78531bf7d34285b5faef874ddff94495950..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# This script invokes dist_mnist.py multiple times concurrently to test the
-# TensorFlow's distributed runtime over a Kubernetes (k8s) cluster with the
-# grpc pods and service set up.
-#
-# Usage:
-#    dist_mnist_test.sh [--existing_servers (True|False)]
-#                       [--ps_hosts <PS_HOSTS>]
-#                       [--worker_hosts <WORKER_HOSTS>]
-#                       [--num_gpus <NUM_GPUS>]
-#                       [--sync_replicas]
-#
-# --existing_servers
-#   Use TensorFlow GRPC servers that are already created and running.
-#
-# --sync_replicas
-#   Use the synchronized-replica mode. The parameter updates from the replicas
-#   (workers) will be aggregated before applied, which avoids stale parameter
-#   updates.
-#
-# ps_hosts/worker_hosts is the list of IP addresses or the GRPC URLs of the ps/worker of
-# the worker sessions, separated with ","
-# e.g., "localhost:2222,localhost:2223"
-#
-# --num_gpus <NUM_GPUS>:
-#   Specifies the number of gpus to use
-#
-# NOTES:
-# If you have the error "$'\r': command not found"
-# Please run the command below to remove trailing '\r' character that causes the error:
-#   sed -i 's/\r$//' dist_mnist_test.sh
-
-
-# Configurations
-TIMEOUT=120  # Timeout for MNIST replica sessions
-
-# Helper functions
-die() {
-  echo $@
-  exit 1
-}
-
-if [[ $# == "0" ]]; then
-  die "Usage: $0 [--ps_hosts <PS_HOSTS>] [--worker_hosts <WORKER_HOSTS>] "\
-"[--num_gpus <NUM_GPUS>] [--sync_replicas]"
-fi
-
-# Process additional input arguments
-SYNC_REPLICAS=0
-N_GPUS=0
-EXISTING_SERVERS=False
-
-while true; do
-  if [[ "$1" == "--ps_hosts" ]]; then
-    PS_HOSTS=$2
-    shift 2
-  elif [[ "$1" == "--worker_hosts" ]]; then
-    WORKER_HOSTS=$2
-    shift 2
-  elif [[ "$1" == "--existing_servers" ]]; then
-    EXISTING_SERVERS=$2
-    shift 2
-    if [[ "${EXISTING_SERVERS}" != "True" ]] && \
-       [[ "${EXISTING_SERVERS}" != "False" ]]; then
-      die "Invalid value for --existing_servers: should be (True|False)"
-    fi
-  elif [[ "$1" == "--num_gpus" ]]; then
-    N_GPUS=$2
-    shift 2
-  elif [[ "$1" == "--sync_replicas" ]]; then
-    SYNC_REPLICAS="1"
-    shift 1
-  fi
-
-  if [[ -z "$1" ]]; then
-    break
-  fi
-done
-
-if [[ ${SYNC_REPLICAS} == "1" ]] && [[ EXISTING_SERVERS == "1" ]]; then
-  die "ERROR: --sync_replicas (synchronized-replicas) mode is not fully "\
-"supported under the --existing_servers mode yet."
-  # TODO(cais): Remove error message once sync_replicas is fully supported.
-fi
-
-SYNC_REPLICAS_FLAG=""
-if [[ ${SYNC_REPLICAS} == "1" ]]; then
-  SYNC_REPLICAS_FLAG="True"
-else
-  SYNC_REPLICAS_FLAG="False"
-fi
-
-echo "EXISTING_SERVERS = ${EXISTING_SERVERS}"
-echo "PS_HOSTS = ${PS_HOSTS}"
-echo "WORKER_HOSTS = ${WORKER_HOSTS}"
-echo "NUM_GPUS = ${N_GPUS}"
-echo "SYNC_REPLICAS = ${SYNC_REPLICAS}"
-echo "SYNC_REPLICAS_FLAG = ${SYNC_REPLICAS_FLAG}"
-
-
-# Current working directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PY_DIR=$(dirname "${DIR}")/python
-
-MNIST_REPLICA="${PY_DIR}/mnist_replica.py"
-
-WKR_LOG_PREFIX="/tmp/worker"
-PS_LOG_PREFIX="/tmp/ps"
-
-# First, download the data from a single process, to avoid race-condition
-# during data downloading
-
-# Pre-download data files.
-timeout ${TIMEOUT} python "${MNIST_REPLICA}" \
-    --ps_hosts="${PS_HOSTS}" \
-    --worker_hosts="${WORKER_HOSTS}" \
-    --job_name="worker" \
-    --task_index=0 \
-    --num_gpus=${N_GPUS} \
-    --sync_replicas=${SYNC_REPLICAS_FLAG} \
-    --download_only || \
-    die "Download-only step of MNIST replica FAILED"
-
-
-# Get N_PS by PS_HOSTS
-N_PS=$(echo ${PS_HOSTS} | awk -F "," '{printf NF}')
-# Replace the delimiter with " "
-PS_ARRAY=($(echo ${PS_HOSTS} | awk -F "," '{for(i=1;i<=NF;i++){printf $i" "}}'))
-# Run a number of ps in parallel. In general, we only set 1 ps.
-echo "${N_PS} ps process(es) running in parallel..."
-
-if [[ ${EXISTING_SERVERS} == "False" ]]; then
-  echo "Hello"
-  # Create parameter servers.
-  IDX=0
-  PS=($PS_HOSTS)
-  while true; do
-    python "${MNIST_REPLICA}" \
-        --existing_servers="${EXISTING_SERVERS}" \
-        --ps_hosts="${PS_HOSTS}" \
-        --worker_hosts="${WORKER_HOSTS}" \
-        --job_name="ps" \
-        --task_index=${IDX} \
-        --num_gpus=${N_GPUS} \
-        --sync_replicas=${SYNC_REPLICAS_FLAG} 2>&1 | tee "${PS_LOG_PREFIX}${IDX}.log" &
-    echo "PS ${IDX}: "
-    echo "  PS HOST: ${PS_ARRAY[IDX]}"
-    echo "  log file: ${PS_LOG_PREFIX}${IDX}.log"
-
-    ((IDX++))
-    if [[ "${IDX}" == "${N_PS}" ]]; then
-      break
-    fi
-  done
-fi
-
-
-# Get N_WORKERS by WORKER_HOSTS
-N_WORKERS=$(echo ${WORKER_HOSTS} | awk -F "," '{printf NF}')
-# Replace the delimiter with " "
-WORKER_ARRAY=($(echo ${WORKER_HOSTS} | awk -F "," '{for(i=1;i<=NF;i++){printf $i" "}}'))
-# Run a number of workers in parallel
-echo "${N_WORKERS} worker process(es) running in parallel..."
-
-INDICES=""
-IDX=0
-while true; do
-  timeout ${TIMEOUT} python "${MNIST_REPLICA}" \
-      --existing_servers="${EXISTING_SERVERS}" \
-      --ps_hosts="${PS_HOSTS}" \
-      --worker_hosts="${WORKER_HOSTS}" \
-      --job_name="worker" \
-      --task_index=${IDX} \
-      --num_gpus=${N_GPUS} \
-      --train_steps=500 \
-      --sync_replicas=${SYNC_REPLICAS_FLAG} 2>&1 | tee "${WKR_LOG_PREFIX}${IDX}.log" &
-  echo "Worker ${IDX}: "
-  echo "  WORKER HOST: ${WORKER_ARRAY[IDX]}"
-  echo "  log file: ${WKR_LOG_PREFIX}${IDX}.log"
-
-  INDICES="${INDICES} ${IDX}"
-
-  ((IDX++))
-  if [[ "${IDX}" == "${N_WORKERS}" ]]; then
-    break
-  fi
-
-done
-
-
-# Poll until all final validation cross entropy values become available or
-# operation times out
-COUNTER=0
-while true; do
-  ((COUNTER++))
-  if [[ "${COUNTER}" -gt "${TIMEOUT}" ]]; then
-    die "Reached maximum polling steps while polling for final validation "\
-"cross entropies from all workers"
-  fi
-
-  N_AVAIL=0
-  VAL_XENT=""
-  for N in ${INDICES}; do
-    if [[ ! -z $(grep "Training ends " "${WKR_LOG_PREFIX}${N}.log") ]]; then
-      ((N_AVAIL++))
-    fi
-  done
-
-  if [[ "${N_AVAIL}" == "${N_WORKERS}" ]]; then
-    # Print out the content of the log files
-    for M in ${INDICES}; do
-      ORD=$(expr ${M} + 1)
-      echo "==================================================="
-      echo "===        Log file from worker ${ORD} / ${N_WORKERS}          ==="
-      cat "${WKR_LOG_PREFIX}${M}.log"
-      echo "==================================================="
-      echo ""
-    done
-
-    break
-  else
-    sleep 1
-  fi
-done
-
-# Function for getting final validation cross entropy from worker log files
-get_final_val_xent() {
-  echo $(cat $1 | grep "^After.*validation cross entropy = " | \
-      awk '{print $NF}')
-}
-
-VAL_XENT=$(get_final_val_xent "${WKR_LOG_PREFIX}0.log")
-
-# Sanity check on the validation entropies
-# TODO(cais): In addition to this basic sanity check, we could run the training
-# with 1 and 2 workers, each for a few times and use scipy.stats to do a t-test
-# to verify that the 2-worker training gives significantly lower final cross
-# entropy
-echo "Final validation cross entropy from worker0: ${VAL_XENT}"
-if [[ $(python -c "print(${VAL_XENT}>0)") != "True" ]]; then
-  die "Sanity checks on the final validation cross entropy values FAILED"
-fi
diff --git a/tensorflow/tools/dist_test/scripts/dist_test.sh b/tensorflow/tools/dist_test/scripts/dist_test.sh
deleted file mode 100755
index 5c107fb030df8c7a7b1f6f75f35b06d898c062ce..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/dist_test.sh
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Performs tests of TensorFlow's distributed runtime over a Kubernetes (k8s)
-# container cluster.
-#
-# This script tears down any existing TensorFlow cluster, consisting of
-# services, replication controllers and pods, before creating a new cluster.
-# The cluster containers a number of parameter server services and a number of
-# worker services. The parameter servers will hold parameters of the ML model,
-# e.g., weights and biases of the NN layers, while the workers will hold the
-# TensorFlow ops.
-#
-# Usage:
-#   dist_test.sh [--setup_cluster_only]
-#                [--model_name (MNIST | CENSUS_WIDENDEEP)]
-#                [--num_workers <NUM_WORKERS>]
-#                [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
-#                [--sync_replicas]
-#
-# --setup_cluster_only:
-#   Lets the script only set up the k8s container network
-#
-# --model_name
-#   Name of the model to test. Default is MNIST.
-#
-# --num-workers <NUM_WORKERS>:
-#   Specifies the number of worker pods to start
-#
-# --num_parameter_servers <NUM_PARAMETER_SERVERS>:
-#   Specifies the number of parameter servers to start
-#
-# --sync_replicas
-#   Use the synchronized-replica mode. The parameter updates from the replicas
-#   (workers) will be aggregated before applied, which avoids stale parameter
-#   updates.
-#
-#
-# This script obeys values in the following environment variables:
-#   TF_DIST_GRPC_SERVER_URLS:     If it is set to a list of valid server urls,
-#                                 separated with spaces or commas
-#                                 (e.g., "grpc://1.2.3.4:2222 grpc//5.6.7.8:2222"),
-#                                 the script will bypass the cluster setup and
-#                                 teardown processes and just use this URL.
-
-
-# Helper functions
-die() {
-  echo $@
-  exit 1
-}
-
-# Parse input arguments: number of workers
-# Default values:
-MODEL_NAME="MNIST"  # Model name, default is "MNIST"
-NUM_WORKERS=2  # Number of worker container
-NUM_PARAMETER_SERVERS=2  # Number of parameter servers
-SYNC_REPLICAS=0
-SETUP_CLUSTER_ONLY=0
-
-while true; do
-  if [[ "$1" == "--model_name" ]]; then
-    MODEL_NAME=$2
-  elif [[ "$1" == "--num_workers" ]]; then
-    NUM_WORKERS=$2
-  elif [[ "$1" == "--num_parameter_servers" ]]; then
-    NUM_PARAMETER_SERVERS=$2
-  elif [[ "$1" == "--sync_replicas" ]]; then
-    SYNC_REPLICAS=1
-  elif [[ "$1" == "--setup_cluster_only" ]]; then
-    SETUP_CLUSTER_ONLY=1
-  fi
-  shift
-
-  if [[ -z "$1" ]]; then
-    break
-  fi
-done
-
-echo "MODEL_NAME = \"MODEL_NAME\""
-echo "NUM_WORKERS = ${NUM_WORKERS}"
-echo "NUM_PARAMETER_SERVERS = ${NUM_PARAMETER_SERVERS}"
-echo "SETUP_CLUSTER_ONLY = ${SETUP_CLUSTER_ONLY}"
-
-# gcloud operation timeout (steps)
-GCLOUD_OP_MAX_STEPS=240
-
-if [[ ! -z ${TF_DIST_GRPC_SERVER_URLS} ]]; then
-  GRPC_SERVER_URLS=${TF_DIST_GRPC_SERVER_URLS}
-  GRPC_SERVER_URLS=$(echo ${GRPC_SERVER_URLS} | sed -e 's/,/ /g')
-fi
-
-# Report gcloud / GKE parameters
-echo "GRPC_SERVER_URLS: ${GRPC_SERVER_URLS}"
-echo "SYNC_REPLICAS: ${SYNC_REPLICAS}"
-
-# Get current script directory
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Locate path to kubectl binary
-TEARDOWN_WHEN_DONE=1
-if [[ ! -z "${GRPC_SERVER_URLS}" ]]; then
-  TEARDOWN_WHEN_DONE=0
-  # Verify the validity of the GRPC URL
-  for GRPC_SERVER_URL in ${GRPC_SERVER_URLS}; do
-    if [[ -z $(echo "${GRPC_SERVER_URL}" | \
-      grep -E "^grpc://.+:[0-9]+") ]]; then
-      die "Invalid GRPC_SERVER_URL: \"${GRPC_SERVER_URL}\""
-    fi
-  done
-
-  echo "The preset GRPC_SERVER_URLS appears to be valid: ${GRPC_SERVER_URLS}"
-  echo "Will bypass the TensorFlow k8s cluster setup and teardown process"
-  echo ""
-
-else
-  TMP=$(mktemp)
-  "${DIR}/create_tf_cluster.sh" ${NUM_WORKERS} ${NUM_PARAMETER_SERVERS} 2>&1 | \
-      tee "${TMP}" || \
-      die "Creation of TensorFlow k8s cluster FAILED"
-
-  GRPC_SERVER_URLS=$(cat ${TMP} | grep "GRPC URLs of tf-worker instances: .*" | \
-      sed -e 's/GRPC URLs of tf-worker instances://g')
-
-  GRPC_PS_URLS=$(cat ${TMP} | grep "GRPC URLs of tf-ps instances: .*" | \
-      sed -e 's/GRPC URLs of tf-ps instances://g')
-
-  if [[ $(echo ${GRPC_SERVER_URLS} | wc -w) != ${NUM_WORKERS} ]]; then
-    die "FAILED to determine GRPC server URLs of all workers"
-  fi
-  if [[ $(echo ${GRPC_PS_URLS} | wc -w) != ${NUM_PARAMETER_SERVERS} ]]; then
-    die "FAILED to determine GRPC server URLs of all parameter servers"
-  fi
-
-  WORKER_HOSTS=$(echo "${GRPC_SERVER_URLS}" | sed -e 's/^[[:space:]]*//' | \
-                 sed -e 's/grpc:\/\///g' | sed -e 's/ /,/g')
-  PS_HOSTS=$(echo "${GRPC_PS_URLS}" | sed -e 's/^[[:space:]]*//' | \
-             sed -e 's/grpc:\/\///g' | sed -e 's/ /,/g')
-
-  echo "WORKER_HOSTS = ${WORKER_HOSTS}"
-  echo "PS_HOSTS = ${PS_HOSTS}"
-
-  rm -f ${TMP}
-
-  if [[ ${SETUP_CLUSTER_ONLY} == "1" ]]; then
-    echo "Skipping testing of distributed runtime due to "\
-"option flag --setup_cluster_only"
-    exit 0
-  fi
-fi
-
-
-# Test routine for model "MNIST"
-test_MNIST() {
-  # Invoke script to perform distributed MNIST training
-  MNIST_DIST_TEST_BIN="${DIR}/dist_mnist_test.sh"
-  if [[ ! -f "${MNIST_DIST_TEST_BIN}" ]]; then
-    echo "FAILED to find distributed mnist client test script at "\
-  "${MNIST_DIST_TEST_BIN}"
-    return 1
-  fi
-
-  echo "Performing distributed MNIST training through worker grpc sessions @ "\
-  "${GRPC_SERVER_URLS}..."
-
-  echo "and ps grpc sessions @ ${GRPC_PS_URLS}"
-
-  SYNC_REPLICAS_FLAG=""
-  if [[ ${SYNC_REPLICAS} == "1" ]]; then
-    SYNC_REPLICAS_FLAG="--sync_replicas"
-  fi
-
-  "${MNIST_DIST_TEST_BIN}" \
-      --existing_servers True \
-      --ps_hosts "${PS_HOSTS}" \
-      --worker_hosts "${WORKER_HOSTS}" \
-      --num_gpus 0 \
-      ${SYNC_REPLICAS_FLAG}
-
-  if [[ $? == "0" ]]; then
-    echo "MNIST-replica test PASSED"
-  else
-    echo "MNIST-replica test FAILED"
-    return 1
-  fi
-  echo ""
-}
-
-# Test routine for model "CENSUS_WIDENDEEP"
-test_CENSUS_WIDENDEEP() {
-  # Invoke script to perform distributed census_widendeep training
-  CENSUS_WIDENDEEP_DIST_TEST_BIN="${DIR}/dist_census_widendeep_test.sh"
-  if [[ ! -f "${CENSUS_WIDENDEEP_DIST_TEST_BIN}" ]]; then
-    echo "FAILED to find distributed widen&deep client test script at "\
-  "${CENSUS_WIDENDEEP_DIST_TEST_BIN}"
-    return 1
-  fi
-
-  echo "Performing distributed wide&deep (census) training through grpc "\
-  "sessions @ ${GRPC_SERVER_URLS}..."
-
-  "${CENSUS_WIDENDEEP_DIST_TEST_BIN}" "${GRPC_SERVER_URLS}" \
-      --num-workers "${NUM_WORKERS}" \
-      --num-parameter-servers "${NUM_PARAMETER_SERVERS}"
-
-  if [[ $? == "0" ]]; then
-    echo "Census Wide & Deep test PASSED"
-    echo ""
-  else
-    echo "Census Wide & Deep test FAILED"
-    echo ""
-    return 1
-  fi
-}
-
-# Validate model name
-if [[ $(type -t "test_${MODEL_NAME}") != "function" ]]; then
-  die "ERROR: Unsupported model: \"${MODEL_NAME}\""
-fi
-
-# Invoke test routine according to model name
-"test_${MODEL_NAME}" && \
-    FAILED=0 || \
-    FAILED=1
-
-# Tear down current k8s TensorFlow cluster
-if [[ "${TEARDOWN_WHEN_DONE}" == "1" ]]; then
-  echo "Tearing down k8s TensorFlow cluster..."
-  "${DIR}/delete_tf_cluster.sh" "${GCLOUD_OP_MAX_STEPS}" && \
-      echo "Cluster tear-down SUCCEEDED" || \
-      die "Cluster tear-down FAILED"
-fi
-
-if [[ "${FAILED}" == 1 ]]; then
-  die "Test of distributed training of model ${MODEL_NAME} FAILED"
-else
-  echo "SUCCESS: Test of distributed TensorFlow runtime PASSED"
-  echo ""
-fi
diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py
deleted file mode 100755
index b325f030e3639bc7baeb51bfd31be4e741de28ab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/k8s_tensorflow.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/python
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Generates YAML configuration files for distributed TensorFlow workers.
-
-The workers will be run in a Kubernetes (k8s) container cluster.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import k8s_tensorflow_lib
-
-# Note: It is intentional that we do not import tensorflow in this script. The
-# machine that launches a TensorFlow k8s cluster does not have to have the
-# Python package of TensorFlow installed on it.
-
-
-DEFAULT_DOCKER_IMAGE = 'tensorflow/tf_grpc_test_server'
-DEFAULT_PORT = 2222
-
-
-def main():
-  """Do arg parsing."""
-  parser = argparse.ArgumentParser()
-  parser.register(
-      'type', 'bool', lambda v: v.lower() in ('true', 't', 'y', 'yes'))
-  parser.add_argument('--num_workers',
-                      type=int,
-                      default=2,
-                      help='How many worker pods to run')
-  parser.add_argument('--num_parameter_servers',
-                      type=int,
-                      default=1,
-                      help='How many paramater server pods to run')
-  parser.add_argument('--grpc_port',
-                      type=int,
-                      default=DEFAULT_PORT,
-                      help='GRPC server port (Default: %d)' % DEFAULT_PORT)
-  parser.add_argument('--request_load_balancer',
-                      type='bool',
-                      default=False,
-                      help='To request worker0 to be exposed on a public IP '
-                      'address via an external load balancer, enabling you to '
-                      'run client processes from outside the cluster')
-  parser.add_argument('--docker_image',
-                      type=str,
-                      default=DEFAULT_DOCKER_IMAGE,
-                      help='Override default docker image for the TensorFlow '
-                      'GRPC server')
-  parser.add_argument('--name_prefix',
-                      type=str,
-                      default='tf',
-                      help='Prefix for job names. Jobs will be named as '
-                      '<name_prefix>_worker|ps<task_id>')
-  parser.add_argument('--use_shared_volume',
-                      type='bool',
-                      default=True,
-                      help='Whether to mount /shared directory from host to '
-                      'the pod')
-  args = parser.parse_args()
-
-  if args.num_workers <= 0:
-    sys.stderr.write('--num_workers must be greater than 0; received %d\n'
-                     % args.num_workers)
-    sys.exit(1)
-  if args.num_parameter_servers <= 0:
-    sys.stderr.write(
-        '--num_parameter_servers must be greater than 0; received %d\n'
-        % args.num_parameter_servers)
-    sys.exit(1)
-
-  # Generate contents of yaml config
-  yaml_config = k8s_tensorflow_lib.GenerateConfig(
-      args.num_workers,
-      args.num_parameter_servers,
-      args.grpc_port,
-      args.request_load_balancer,
-      args.docker_image,
-      args.name_prefix,
-      env_vars=None,
-      use_shared_volume=args.use_shared_volume)
-  print(yaml_config)  # pylint: disable=superfluous-parens
-
-
-if __name__ == '__main__':
-  main()
diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py
deleted file mode 100644
index 8adbe387ba33557c56682eda73b84f9da80eaaff..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_lib.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Generates YAML configuration files for distributed TensorFlow workers.
-
-The workers will be run in a Kubernetes (k8s) container cluster.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Note: It is intentional that we do not import tensorflow in this script. The
-# machine that launches a TensorFlow k8s cluster does not have to have the
-# Python package of TensorFlow installed on it.
-
-# TODO(cais): Consider adding resource requests/limits to the pods.
-
-# Worker pods will mount host volume /shared, as a convenient way to create
-# shared storage among workers during local tests.
-WORKER_RC = (
-    """apiVersion: v1
-kind: ReplicationController
-metadata:
-  name: {name_prefix}-worker{worker_id}
-spec:
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        tf-worker: "{worker_id}"
-        name-prefix: "{name_prefix}"
-        job: "worker"
-    spec:
-      containers:
-      - name: tf-worker{worker_id}
-        image: {docker_image}
-        args: [{args}]
-        ports:
-        - containerPort: {port}
-        env: [{env_vars}]
-        volumeMounts: [{volume_mounts}]
-      volumes: [{volumes}]
-""")
-WORKER_SVC = (
-    """apiVersion: v1
-kind: Service
-metadata:
-  name: {name_prefix}-worker{worker_id}
-  labels:
-    tf-worker: "{worker_id}"
-spec:
-  ports:
-  - port: {port}
-    targetPort: {port}
-  selector:
-    tf-worker: "{worker_id}"
-""")
-WORKER_LB_SVC = (
-    """apiVersion: v1
-kind: Service
-metadata:
-  name: {name_prefix}-worker{worker_id}
-  labels:
-    tf-worker: "{worker_id}"
-spec:
-  type: LoadBalancer
-  ports:
-  - port: {port}
-  selector:
-    tf-worker: "{worker_id}"
-""")
-PARAM_SERVER_RC = (
-    """apiVersion: v1
-kind: ReplicationController
-metadata:
-  name: {name_prefix}-ps{param_server_id}
-spec:
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        tf-ps: "{param_server_id}"
-        name-prefix: "{name_prefix}"
-        job: "ps"
-    spec:
-      containers:
-      - name: tf-ps{param_server_id}
-        image: {docker_image}
-        args: [{args}]
-        ports:
-        - containerPort: {port}
-        env: [{env_vars}]
-        volumeMounts: [{volume_mounts}]
-      volumes: [{volumes}]
-""")
-PARAM_SERVER_SVC = (
-    """apiVersion: v1
-kind: Service
-metadata:
-  name: {name_prefix}-ps{param_server_id}
-  labels:
-    tf-ps: "{param_server_id}"
-spec:
-  ports:
-  - port: {port}
-  selector:
-    tf-ps: "{param_server_id}"
-""")
-PARAM_LB_SVC = ("""apiVersion: v1
-kind: Service
-metadata:
-  name: {name_prefix}-ps{param_server_id}
-  labels:
-    tf-ps: "{param_server_id}"
-spec:
-  type: LoadBalancer
-  ports:
-  - port: {port}
-  selector:
-    tf-ps: "{param_server_id}"
-""")
-VOLUME_MOUNTS = '{name: shared, mountPath: /shared}'
-VOLUMES = '{name: shared, hostPath: {path: /shared}}'
-_ENV_VAR_TEMPLATE = '{name: "%s", value: "%s"}'
-_ARG_TEMPLATE = '"--%s=%s"'
-
-
-def GenerateConfig(num_workers,
-                   num_param_servers,
-                   port,
-                   request_load_balancer,
-                   docker_image,
-                   name_prefix,
-                   env_vars=None,
-                   use_shared_volume=True,
-                   use_cluster_spec=True):
-  """Generate configuration strings.
-
-  Args:
-    num_workers: number of worker jobs.
-    num_param_servers: number of ps server jobs.
-    port: GRPC server port.
-    request_load_balancer: request worker0 to be exposed on a public IP
-      address via an external load balancer.
-    docker_image: docker image to use.
-    name_prefix: name to prepend to pod job names.
-    env_vars: dictionary of environment variables to set.
-    use_shared_volume: whether to add hostPath to /shared directory
-      to the kubernetes config.
-    use_cluster_spec: if true, pass --cluster_spec to worker and ps jobs.
-      If false, pass --worker_hosts and --ps_hosts to worker and ps jobs.
-
-  Returns:
-    Kubernetes yaml config.
-  """
-  if env_vars is None:
-    env_vars = {}
-  env_str = ', '.join([_ENV_VAR_TEMPLATE % (name, value)
-                       for name, value in env_vars.items()])
-  config = ''
-  common_args = GetCommonArgs(
-      num_workers, num_param_servers, port, name_prefix, use_cluster_spec)
-  for worker in range(num_workers):
-    worker_args = {
-        'job_name': 'worker',
-        'task_id': worker
-    }
-    worker_args.update(common_args)
-    arg_str = ', '.join([_ARG_TEMPLATE % (name, value)
-                         for name, value in worker_args.items()])
-    config += WORKER_RC.format(
-        port=port,
-        worker_id=worker,
-        docker_image=docker_image,
-        name_prefix=name_prefix,
-        volume_mounts=VOLUME_MOUNTS if use_shared_volume else '',
-        volumes=VOLUMES if use_shared_volume else '',
-        args=arg_str,
-        env_vars=env_str)
-    config += '---\n'
-    if request_load_balancer:
-      config += WORKER_LB_SVC.format(port=port,
-                                     worker_id=worker,
-                                     name_prefix=name_prefix)
-    else:
-      config += WORKER_SVC.format(port=port,
-                                  worker_id=worker,
-                                  name_prefix=name_prefix)
-    config += '---\n'
-
-  for param_server in range(num_param_servers):
-    ps_args = {
-        'job_name': 'ps',
-        'task_id': param_server
-    }
-    ps_args.update(common_args)
-    arg_str = ', '.join([_ARG_TEMPLATE % (name, value)
-                         for name, value in ps_args.items()])
-    config += PARAM_SERVER_RC.format(
-        port=port,
-        param_server_id=param_server,
-        docker_image=docker_image,
-        name_prefix=name_prefix,
-        volume_mounts=VOLUME_MOUNTS if use_shared_volume else '',
-        volumes=VOLUMES if use_shared_volume else '',
-        args=arg_str,
-        env_vars=env_str)
-    config += '---\n'
-    if request_load_balancer:
-      config += PARAM_LB_SVC.format(
-          port=port, param_server_id=param_server, name_prefix=name_prefix)
-    else:
-      config += PARAM_SERVER_SVC.format(
-          port=port, param_server_id=param_server, name_prefix=name_prefix)
-    config += '---\n'
-
-  return config
-
-
-def WorkerClusterSpecString(num_workers,
-                            num_param_servers,
-                            port,
-                            name_prefix):
-  """Generates worker cluster spec."""
-  return ClusterSpecString(num_workers, num_param_servers, port, name_prefix)
-
-
-def ParamServerClusterSpecString(num_workers,
-                                 num_param_servers,
-                                 port,
-                                 name_prefix):
-  """Generates parameter server spec."""
-  return ClusterSpecString(num_workers, num_param_servers, port,
-                           name_prefix)
-
-
-def ClusterSpecString(num_workers,
-                      num_param_servers,
-                      port,
-                      name_prefix):
-  """Generates general cluster spec."""
-  spec = 'worker|'
-  for worker in range(num_workers):
-    spec += '%s-worker%d:%d' % (name_prefix, worker, port)
-    if worker != num_workers-1:
-      spec += ';'
-
-  spec += ',ps|'
-  for param_server in range(num_param_servers):
-    spec += '%s-ps%d:%d' % (name_prefix, param_server, port)
-    if param_server != num_param_servers-1:
-      spec += ';'
-
-  return spec
-
-
-def GetCommonArgs(num_workers,
-                  num_param_servers,
-                  port,
-                  name_prefix,
-                  use_cluster_spec):
-  """Get arguments common to both worker and ps jobs.
-
-  Args:
-    num_workers: number of workers.
-    num_param_servers: number of ps servers.
-    port: worker and ps port number.
-    name_prefix: prefix to prepend to job names.
-    use_cluster_spec: if true, pass --cluster_spec argument.
-      If false, parse --worker_hosts and --ps_hosts arguments.
-
-  Returns:
-    A dictionary of argument names mapping to argument values.
-  """
-  common_args = {}
-  if use_cluster_spec:
-    common_args['cluster_spec'] = WorkerClusterSpecString(
-        num_workers,
-        num_param_servers,
-        port,
-        name_prefix)
-  else:
-    common_args['worker_hosts'] = WorkerHosts(num_workers, port, name_prefix)
-    common_args['ps_hosts'] = PsHosts(num_param_servers, port, name_prefix)
-  return common_args
-
-
-def WorkerHosts(num_workers, port, name_prefix):
-  worker_hosts = ['%s-worker%d:%d' % (name_prefix, i, port)
-                  for i in range(num_workers)]
-  return ','.join(worker_hosts)
-
-
-def PsHosts(num_ps, port, name_prefix):
-  ps_hosts = ['%s-ps%d:%d' % (name_prefix, i, port)
-              for i in range(num_ps)]
-  return ','.join(ps_hosts)
diff --git a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py b/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py
deleted file mode 100644
index 7d9b3f83f51f13f23bde6a74373e06ba41c8d096..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/k8s_tensorflow_test.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tensorflow.tools.dist_test.scripts.k8s_tensorflow_lib."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.platform import googletest
-from tensorflow.tools.dist_test.scripts import k8s_tensorflow_lib
-
-
-class K8sTensorflowTest(googletest.TestCase):
-
-  def testGenerateConfig_LoadBalancer(self):
-    # Use loadbalancer
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=True,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False)
-    self.assertTrue('LoadBalancer' in config)
-
-    # Don't use loadbalancer
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=False,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False)
-    self.assertFalse('LoadBalancer' in config)
-
-  def testGenerateConfig_SharedVolume(self):
-    # Use shared directory
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=False,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=True)
-    self.assertTrue('/shared' in config)
-
-    # Don't use shared directory
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=False,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False)
-    self.assertFalse('/shared' in config)
-
-  def testEnvVar(self):
-    # Use loadbalancer
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=True,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False,
-        env_vars={'test1': 'test1_value', 'test2': 'test2_value'})
-    self.assertTrue('{name: "test1", value: "test1_value"}' in config)
-    self.assertTrue('{name: "test2", value: "test2_value"}' in config)
-
-  def testClusterSpec(self):
-    # Use cluster_spec
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=True,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False,
-        use_cluster_spec=True)
-    self.assertFalse('worker_hosts' in config)
-    self.assertFalse('ps_hosts' in config)
-    self.assertTrue(
-        '"--cluster_spec=worker|abc-worker0:5000,ps|abc-ps0:5000"' in config)
-
-    # Don't use cluster_spec
-    config = k8s_tensorflow_lib.GenerateConfig(
-        num_workers=1,
-        num_param_servers=1,
-        port=5000,
-        request_load_balancer=True,
-        docker_image='test_image',
-        name_prefix='abc',
-        use_shared_volume=False,
-        use_cluster_spec=False)
-    self.assertFalse('cluster_spec' in config)
-    self.assertTrue('"--worker_hosts=abc-worker0:5000"' in config)
-    self.assertTrue('"--ps_hosts=abc-ps0:5000"' in config)
-
-  def testWorkerHosts(self):
-    self.assertEquals(
-        'test_prefix-worker0:1234',
-        k8s_tensorflow_lib.WorkerHosts(1, 1234, 'test_prefix'))
-    self.assertEquals(
-        'test_prefix-worker0:1234,test_prefix-worker1:1234',
-        k8s_tensorflow_lib.WorkerHosts(2, 1234, 'test_prefix'))
-
-  def testPsHosts(self):
-    self.assertEquals(
-        'test_prefix-ps0:1234,test_prefix-ps1:1234',
-        k8s_tensorflow_lib.PsHosts(2, 1234, 'test_prefix'))
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/tools/dist_test/scripts/utils.sh b/tensorflow/tools/dist_test/scripts/utils.sh
deleted file mode 100644
index 2fe57f520ed8c9f8debf0051e63ac3717661c103..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts/utils.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Utility functions for dist_test scripts
-
-
-# Print info and exit with code 1
-die() {
-  echo $@
-  exit 1
-}
-
-
-# Determine if all k8s pods in a namespace are all in the "Running" state
-are_all_pods_running() {
-  # Usage: are_all_pods_running <KUBECTL_BIN> [namespace]
-  KUBECTL_BIN=$1
-
-  if [[ -z "$2" ]]; then
-    NS_FLAG=""
-  else
-    NS_FLAG="--namespace=$2"
-  fi
-
-  sleep 1  # Wait for the status to settle
-  NPODS=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | wc -l)
-  NRUNNING=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | \
-      grep "Running" | wc -l)
-  NERR=$("${KUBECTL_BIN}" "${NS_FLAG}" get pods | tail -n +2 | \
-      grep "Err" | wc -l)
-
-  if [[ ${NERR} != "0" ]]; then
-    # "2" signifies that error has occurred
-    echo "2"
-  elif [[ ${NPODS} == ${NRUNNING} ]]; then
-    # "1" signifies that all pods are in Running state
-    echo "1"
-  else
-    # "0" signifies that some pods have not entered Running state, but
-    # no error has occurred
-    echo "0"
-  fi
-}
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh b/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh
deleted file mode 100755
index 2f83c36fad1b0e5cffb90e73e230cc23c21338f5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-function usage {
-  script_name=$0
-  echo "Usage:"
-  echo "  $script_name [--image docker_image] [--num_containers num_of_containers]"
-  echo "               [--deployment deployment_name] [--config_map config_map]"
-  echo "               [--cp] [--src local_src_dir] [--dest container_dest_dir]"
-  echo "               [--port container_ssh_port] [--hostnet] [--shared_volume]"
-  echo "               [--delete] [--help]"
-  echo ""
-  echo "  Parameters:"
-  echo "    image:          docker image used to create container."
-  echo "    num_containers: number of containers that will be launched."
-  echo "    deployment:     deployment name. (default: k8s-ml-deployment)"
-  echo "    config_map:     config map name. (default: k8s-config-map)"
-  echo "    cp:             upload file to all containers. (src and dest must"
-  echo "                    be provided along with cp option)"
-  echo "    src:            path to local source file. (used for cp option)"
-  echo "    dest:           path to destination in container. (used for cp option)"
-  echo "    port:           ssh port in container. Set ssh port (other than 22)"
-  echo "                    when host network mode is enabled"
-  echo "    hostnet:        enable host network mode. (default: disable)"
-  echo "    shared_volume:  mount shared volume. (default: disable)"
-  echo "    delete:         delete deployment and configmap."
-  echo "                    (default: k8s-ml-deployment and k8s-config-map)"
-  echo "    help:           print usage."
-}
-
-# Create temporary directory
-TMP_DIR=$(mktemp -d)
-
-# Temporary k8s yaml file
-YAML_TMP_FILE="${TMP_DIR}/k8s_ml.yaml"
-
-# Temporary hostfile
-HOST_FILE="${TMP_DIR}/hostfile"
-
-# Docker image and number of containers
-DOCKER_IMAGE=""
-NUM_CONTAINERS=0
-
-# Default ssh port
-SSH_PORT=22
-
-# Default config map
-CONFIG_MAP="k8s-config-map"
-
-# Default Deployment
-DEPLOYMENT="k8s-ml-deployment"
-
-# Used for uploading file to all docker containers
-CP=0
-SRC=""
-DEST=""
-
-# Python script to generate yaml file for k8s TensorFlow cluster
-CUR_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-K8S_GEN_ALLREDUCE_TF_YAML="${CUR_SCRIPT_DIR}/k8s_generate_yaml.py"
-
-# Create or delete tensorflow cluster
-# DELETE=0: Create cluster
-# DELETE=1: Delete cluster
-DELETE=0
-
-# Used to enable host network mode to achieve best performance
-# USE_HOSTNET=0: Flannel network mode
-# USE_HOSTNET=1: Host network mode
-USE_HOSTNET=0
-
-# Used to mount shared volume
-USE_SHARED_VOLUME=0
-
-if [[ $# -lt 1 ]]; then
-  echo "Error: illegal number of parameters"
-  usage
-  exit 1
-fi
-
-while [[ $# -ge 1 ]]; do
-  key="$1"
-  case $key in
-    --image)
-      DOCKER_IMAGE="$2"
-      shift
-      ;;
-    --num_containers)
-      NUM_CONTAINERS="$2"
-      shift
-      ;;
-    --config_map)
-      CONFIG_MAP="$2"
-      shift
-      ;;
-    --deployment)
-      DEPLOYMENT="$2"
-      shift
-      ;;
-    --cp)
-      CP=1
-      ;;
-    --src)
-      SRC="$2"
-      shift
-      ;;
-    --dest)
-      DEST="$2"
-      shift
-      ;;
-    --port)
-      SSH_PORT="$2"
-      shift
-      ;;
-    --hostnet)
-      USE_HOSTNET=1
-      ;;
-    --shared_volume)
-      USE_SHARED_VOLUME=1
-      ;;
-    --delete)
-      DELETE=1
-      ;;
-    --help)
-      usage
-      exit 0
-      ;;
-    *)
-      echo "Unknown option: $key"
-      usage
-      exit 1
-      ;;
-  esac
-  shift
-done
-
-function generate_yaml_file {
-  if [[ ! -f ${K8S_GEN_ALLREDUCE_TF_YAML} ]]; then
-    echo "Error: can not find yaml-generating script ${K8S_GEN_ALLREDUCE_TF_YAML}"
-    exit 1
-  fi
-
-  echo ""
-  echo "Generating k8s cluster yaml config file with the following settings"
-  echo "  Docker image: ${DOCKER_IMAGE}"
-  echo "  Number of containers: ${NUM_CONTAINERS}"
-  echo "  Config map: ${CONFIG_MAP}"
-  echo "  Deployment: ${DEPLOYMENT}"
-
-  if [[ $USE_HOSTNET -eq 1 ]]; then
-    echo "  Host network mode: True"
-    echo "  Container ssh port: ${SSH_PORT}"
-  fi
-
-  python ${K8S_GEN_ALLREDUCE_TF_YAML} \
-    --docker_image ${DOCKER_IMAGE} \
-    --num_containers ${NUM_CONTAINERS} \
-    --config_map ${CONFIG_MAP} \
-    --deployment ${DEPLOYMENT} \
-    --ssh_port ${SSH_PORT} \
-    --use_hostnet ${USE_HOSTNET} \
-    --use_shared_volume ${USE_SHARED_VOLUME} \
-    > ${YAML_TMP_FILE}
-}
-
-# Note: this function remove the yaml file to make sure that the key automatically
-# generated inside the container is not reused in other deployment
-function remove_yaml_file {
-  rm -rf ${YAML_TMP_FILE}
-}
-
-function upload_file_to_all_containers {
-  ${KUBECTL_BIN} get pods | grep ${DEPLOYMENT} \
-    | awk '{print $1}' | \
-    while read line;
-    do
-      echo "Uploading $1 to $line:$2"
-      ${KUBECTL_BIN} cp $1 $line:$2
-    done
-}
-
-function generate_container_hostfile {
-  # This line assumes that --output=wide prints the IP addresses
-  # in the 6th column
-  ${KUBECTL_BIN} get pods --output=wide | grep ${DEPLOYMENT} \
-      | awk '{print $6}' > ${HOST_FILE}
-
-  echo ""
-  echo "Containers hostfile locates at ${HOST_FILE}"
-}
-
-function launch_container {
-  generate_yaml_file
-  echo ""
-  echo "Launching k8s cluster..."
-  ${KUBECTL_BIN} create -f ${YAML_TMP_FILE}
-  generate_container_hostfile
-  remove_yaml_file
-}
-
-function delete_deployment_configmap {
-  ${KUBECTL_BIN} delete deployment ${DEPLOYMENT}
-  ${KUBECTL_BIN} delete configmap ${CONFIG_MAP}
-}
-
-# Check kubectl binary
-KUBECTL_BIN=kubectl
-if [[ ! -x "$(command -v ${KUBECTL_BIN})" ]]; then
-  echo 'Error: cannot find kubectl binary'
-  exit 1
-fi
-
-if [[ $DELETE -eq 1 ]]; then
-  echo "Deleting deployment ${DEPLOYMENT} and config map ${CONFIG_MAP}..."
-  delete_deployment_configmap
-elif [[ $CP -eq 1 || -n "$SRC" || -n "$DEST" ]] ; then
-  if [[ "$CP" -eq 1 && -n "$SRC" && -n "$DEST" ]]; then
-    upload_file_to_all_containers $SRC $DEST
-  else
-    echo "Error: all cp, src and dest are required to upload file to container"
-    exit 1
-  fi
-else
-  if [[ -z "$DOCKER_IMAGE" ]]; then
-    echo "Error: docker image is missing"
-    exit 1
-  fi
-
-  if [[ "$NUM_CONTAINERS" -le 0 ]]; then
-    echo "Error: illegal number of containers"
-    exit 1
-  fi
-
-  if [[ $USE_HOSTNET -eq 1 && $SSH_PORT -eq 22 ]]; then
-    echo "Error: please set container ssh port with --port (other than 22)" \
-        "when host network mode is enabled"
-    exit 1
-  fi
-
-  launch_container
-fi
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py
deleted file mode 100644
index cd3d49af9b3b7ae62447f56e98f34638b25705c1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/python
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Generates YAML configuration file for allreduce-based distributed TensorFlow.
-
-The workers will be run in a Kubernetes (k8s) container cluster.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import k8s_generate_yaml_lib
-
-# Note: It is intentional that we do not import tensorflow in this script. The
-# machine that launches a TensorFlow k8s cluster does not have to have the
-# Python package of TensorFlow installed on it.
-
-DEFAULT_DOCKER_IMAGE = 'tensorflow/tensorflow:latest-devel'
-DEFAULT_PORT = 22
-
-DEFAULT_CONFIG_MAP = 'k8s-config-map'
-DEFAULT_DEPLOYMENT = 'k8s-ml-deployment'
-
-
-def main():
-  """Do arg parsing."""
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--docker_image',
-      type=str,
-      default=DEFAULT_DOCKER_IMAGE,
-      help='Override default docker image for the TensorFlow')
-  parser.add_argument(
-      '--num_containers',
-      type=int,
-      default=0,
-      help='How many docker containers to launch')
-  parser.add_argument(
-      '--config_map',
-      type=str,
-      default=DEFAULT_CONFIG_MAP,
-      help='Override default config map')
-  parser.add_argument(
-      '--deployment',
-      type=str,
-      default=DEFAULT_DEPLOYMENT,
-      help='Override default deployment')
-  parser.add_argument(
-      '--ssh_port',
-      type=int,
-      default=DEFAULT_PORT,
-      help='Override default ssh port (Default: %d)' % DEFAULT_PORT)
-  parser.add_argument(
-      '--use_hostnet',
-      type=int,
-      default=0,
-      help='Used to enable host network mode (Default: 0)')
-  parser.add_argument(
-      '--use_shared_volume',
-      type=int,
-      default=0,
-      help='Used to mount shared volume (Default: 0)')
-  args = parser.parse_args()
-
-  if args.num_containers <= 0:
-    sys.stderr.write('--num_containers must be greater than 0; received %d\n' %
-                     args.num_containers)
-    sys.exit(1)
-
-  # Generate contents of yaml config
-  yaml_config = k8s_generate_yaml_lib.GenerateConfig(
-      args.docker_image, args.num_containers, args.config_map, args.deployment,
-      args.ssh_port, args.use_hostnet, args.use_shared_volume)
-  print(yaml_config)  # pylint: disable=superfluous-parens
-
-
-if __name__ == '__main__':
-  main()
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
deleted file mode 100644
index 038a712d538fbaeb8d0d176287704993cff07799..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/python
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Generates YAML configuration file for allreduce-based distributed TensorFlow.
-
-The workers will be run in a Kubernetes (k8s) container cluster.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from Crypto.PublicKey import RSA
-
-# Note: It is intentional that we do not import tensorflow in this script. The
-# machine that launches a TensorFlow k8s cluster does not have to have the
-# Python package of TensorFlow installed on it.
-
-CONFIG_MAP = ("""apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: {config_map}
-data:
-  privatekey: |+
-    {private_key}
-
-  publickey: |+
-    {public_key}
-
-  start: |+
-    mkdir /root/.ssh
-    mkdir /var/run/sshd
-    cp /tmp/configs/* /root/.ssh
-    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
-    chmod 600 -R /root/.ssh
-    {change_ssh_port}
-    /usr/bin/ssh-keygen -A
-    /usr/sbin/sshd -De
-
-  sshconfig: |+
-    Host *
-      Port {port}
-      StrictHostKeyChecking no
-
-""")
-
-DEPLOYMENT = ("""apiVersion: apps/v1beta1
-kind: Deployment
-metadata:
-  name: {deployment}
-  labels:
-    app: k8s-ml
-spec:
-  replicas: {num_containers}
-  selector:
-    matchLabels:
-      app: k8s-ml
-  template:
-    metadata:
-      labels:
-        app: k8s-ml
-    spec: {hostnet}
-      securityContext:
-        runAsUser: 0
-      containers:
-      - name: ml
-        image: {docker_image}
-        command:
-        - /bin/bash
-        - -x
-        - /tmp/scripts/start.sh
-        ports:
-        - containerPort: {port}
-        env: [{env_vars}]
-        securityContext:
-          privileged: true
-        volumeMounts: {volume_mounts}
-        - name: dshm
-          mountPath: /dev/shm
-        - name: sshkeys
-          mountPath: /tmp/configs
-        - name: scripts
-          mountPath: /tmp/scripts
-      volumes: {volumes}
-      - name: dshm
-        emptyDir:
-          medium: Memory
-      - name: sshkeys
-        configMap:
-          name: {config_map}
-          items:
-          - key: publickey
-            path: id_rsa.pub
-          - key: privatekey
-            path: id_rsa
-          - key: sshconfig
-            path: config
-      - name: scripts
-        configMap:
-          name: {config_map}
-          items:
-          - key: start
-            path: start.sh
-""")
-_ENV_VAR_TEMPLATE = '{name: "%s", value: "%s"}'
-
-
-def GenerateConfig(docker_image,
-                   num_containers,
-                   config_map,
-                   deployment,
-                   port,
-                   use_hostnet,
-                   use_shared_volume,
-                   env_vars=None):
-  """Generate configuration strings.
-
-  Args:
-    docker_image: docker image to use.
-    num_containers: number of containers.
-    config_map: config map.
-    deployment: deployment.
-    port: ssh port.
-    use_hostnet: Used to enable host network mode.
-    use_shared_volume: Used to mount shared volume.
-    env_vars: dictionary of environment variables to set.
-
-  Returns:
-    Kubernetes yaml config.
-  """
-
-  if env_vars is None:
-    env_vars = {}
-  env_str = ', '.join(
-      [_ENV_VAR_TEMPLATE % (name, value) for name, value in env_vars.items()])
-
-  private_key, public_key = generate_RSA(2048)
-
-  CHANGE_SSH_PORT = get_change_ssh_port(use_hostnet, port)
-
-  config = CONFIG_MAP.format(
-      port=port,
-      config_map=config_map,
-      private_key=private_key,
-      public_key=public_key,
-      change_ssh_port=CHANGE_SSH_PORT,
-      env_vars=env_str)
-  config += '---\n\n'
-
-  HOST_NET = get_hostnet(use_hostnet)
-  VOLUME_MOUNTS = get_volume_mounts(use_shared_volume)
-  VOLUMES = get_volumes(use_shared_volume)
-
-  config += DEPLOYMENT.format(
-      deployment=deployment,
-      num_containers=num_containers,
-      docker_image=docker_image,
-      port=port,
-      config_map=config_map,
-      hostnet=HOST_NET,
-      volume_mounts=VOLUME_MOUNTS,
-      volumes=VOLUMES,
-      env_vars=env_str)
-
-  return config
-
-
-def generate_RSA(bits=2048, exponent=65537):
-  key = RSA.generate(bits, e=exponent)
-  pubkey = key.publickey()
-
-  private_key = key.exportKey('PEM')
-  public_key = pubkey.exportKey('OpenSSH')
-
-  # Format private_key in yaml file
-  space_before = ' ' * 4
-  private_key_split = private_key.split('\n')
-  private_key = ''.join(('' if index == 0 else space_before) + line.strip() \
-        + ('\n' if index != len(private_key_split) - 1 else '') \
-        for index, line in enumerate(private_key_split))
-
-  return private_key, public_key
-
-
-def get_change_ssh_port(use_hostnet, port):
-  if use_hostnet == 1:
-    return r"sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
-
-  return ''
-
-
-def get_hostnet(use_hostnet):
-  if use_hostnet == 1:
-    return """
-      hostNetwork: true
-      hostIPC: true"""
-
-  return ''
-
-
-def get_volume_mounts(use_shared_volume):
-  if use_shared_volume == 1:
-    return """
-        - name: shared
-          mountPath: /shared"""
-
-  return ''
-
-
-def get_volumes(use_shared_volume):
-  if use_shared_volume == 1:
-    return """
-       - name: shared
-         hostPath:
-           path: /shared"""
-
-  return ''
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
deleted file mode 100644
index 3aa53a5615db27fd5d3c32bbbbee68ccc7dc4f2f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/server/BUILD
+++ /dev/null
@@ -1,40 +0,0 @@
-# Description:
-# TensorFlow GRPC distributed runtime server and tests
-
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "py_binary")
-
-py_binary(
-    name = "grpc_tensorflow_server",
-    srcs = [
-        "grpc_tensorflow_server.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_test(
-    name = "parse_cluster_spec_test",
-    size = "small",
-    srcs = [
-        "parse_cluster_spec_test.py",
-    ],
-    main = "parse_cluster_spec_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":grpc_tensorflow_server",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-    ],
-)
diff --git a/tensorflow/tools/dist_test/server/Dockerfile b/tensorflow/tools/dist_test/server/Dockerfile
deleted file mode 100644
index 1359428f1140b6fd6ecf3b14fc5b968b49d4576a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/server/Dockerfile
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Test server for TensorFlow GRPC server
-#
-# To build the image, use ../build_server.sh
-
-FROM ubuntu:16.04
-
-LABEL maintainer="Shanqing Cai <cais@google.com>"
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y \
-        curl \
-        python-numpy \
-        python-pip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-# Install TensorFlow wheel
-COPY tensorflow-*.whl /
-RUN pip install /tensorflow-*.whl && \
-    rm -f /tensorflow-*.whl
-
-# Copy files, including the GRPC server binary at
-# server/grpc_tensorflow_server.py
-ADD . /var/tf-k8s
-
-# Container entry point
-ENTRYPOINT ["/var/tf-k8s/server/grpc_tensorflow_server.py"]
diff --git a/tensorflow/tools/dist_test/server/Dockerfile.test b/tensorflow/tools/dist_test/server/Dockerfile.test
deleted file mode 100644
index ce7e783a1a846db175d9da9ff66572452c3573cd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/server/Dockerfile.test
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Test server for TensorFlow GRPC server
-#
-# To build the image, use ../build_server.sh --test
-
-FROM ubuntu:16.04
-
-LABEL maintainer="Shanqing Cai <cais@google.com>"
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y \
-        curl \
-        dnsutils \
-        python \
-        python-dev \
-        python-numpy \
-        python-pip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
-
-# Install python panda for the census wide&deep test
-RUN pip install --upgrade pandas==0.18.1
-
-# Install TensorFlow wheel
-COPY tensorflow-*.whl /
-RUN pip install /tensorflow-*.whl && \
-    rm -f /tensorflow-*.whl
-
-# Copy files, including the GRPC server binary at
-# server/grpc_tensorflow_server.py
-ADD . /var/tf-k8s
-
-# Download MNIST data for tests
-RUN mkdir -p /tmp/mnist-data
-RUN curl -o /tmp/mnist-data/train-labels-idx1-ubyte.gz \
-    https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz
-RUN curl -o /tmp/mnist-data/train-images-idx3-ubyte.gz \
-    https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz
-RUN curl -o /tmp/mnist-data/t10k-labels-idx1-ubyte.gz \
-    https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz
-RUN curl -o /tmp/mnist-data/t10k-images-idx3-ubyte.gz \
-    https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz
-
-# Download Census data for Wide & Deep test
-RUN mkdir -p /tmp/census-data
-RUN curl -o /tmp/census-data/adult.data \
-    http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data
-RUN curl -o /tmp/census-data/adult.test \
-    http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test
-
-# Container entry point
-ENTRYPOINT ["/var/tf-k8s/server/grpc_tensorflow_server_wrapper.sh"]
diff --git a/tensorflow/tools/dist_test/server/parse_cluster_spec_test.py b/tensorflow/tools/dist_test/server/parse_cluster_spec_test.py
deleted file mode 100644
index 28b786ce2c0af60dc0dd67af4f760527bae8d2ac..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dist_test/server/parse_cluster_spec_test.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for cluster-spec string parser in GRPC TensorFlow server."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.protobuf import tensorflow_server_pb2
-from tensorflow.python.platform import test
-from tensorflow.tools.dist_test.server import grpc_tensorflow_server
-
-
-class ParseClusterSpecStringTest(test.TestCase):
-
-  def setUp(self):
-    self._cluster = tensorflow_server_pb2.ServerDef(protocol="grpc").cluster
-
-  def test_parse_multi_jobs_sunnyday(self):
-    cluster_spec = ("worker|worker0:2220;worker1:2221;worker2:2222,"
-                    "ps|ps0:3220;ps1:3221")
-
-    grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-
-    self.assertEqual(2, len(self._cluster.job))
-
-    self.assertEqual("worker", self._cluster.job[0].name)
-    self.assertEqual(3, len(self._cluster.job[0].tasks))
-    self.assertEqual("worker0:2220", self._cluster.job[0].tasks[0])
-    self.assertEqual("worker1:2221", self._cluster.job[0].tasks[1])
-    self.assertEqual("worker2:2222", self._cluster.job[0].tasks[2])
-
-    self.assertEqual("ps", self._cluster.job[1].name)
-    self.assertEqual(2, len(self._cluster.job[1].tasks))
-    self.assertEqual("ps0:3220", self._cluster.job[1].tasks[0])
-    self.assertEqual("ps1:3221", self._cluster.job[1].tasks[1])
-
-  def test_empty_cluster_spec_string(self):
-    cluster_spec = ""
-
-    with self.assertRaisesRegexp(ValueError, "Empty cluster_spec string"):
-      grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-
-  def test_parse_misused_comma_for_semicolon(self):
-    cluster_spec = "worker|worker0:2220,worker1:2221"
-
-    with self.assertRaisesRegexp(ValueError,
-                                 "Not exactly one instance of \\'\\|\\'"):
-      grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-
-  def test_parse_misused_semicolon_for_comma(self):
-    cluster_spec = "worker|worker0:2220;ps|ps0:3220"
-
-    with self.assertRaisesRegexp(ValueError,
-                                 "Not exactly one instance of \\'\\|\\'"):
-      grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-
-  def test_parse_empty_job_name(self):
-    cluster_spec = "worker|worker0:2220,|ps0:3220"
-
-    with self.assertRaisesRegexp(ValueError, "Empty job_name in cluster_spec"):
-      grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-      print(self._cluster)
-
-  def test_parse_empty_task(self):
-    cluster_spec = "worker|worker0:2220,ps|"
-
-    with self.assertRaisesRegexp(ValueError, "Empty task string at position 0"):
-      grpc_tensorflow_server.parse_cluster_spec(cluster_spec, self._cluster)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 9ea29c0e201e9cb1630e7bb682d1d7694665decd..c26fa018f9d2c56b0e0cc8627ee6f0f26e9c4142 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -78,7 +78,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.12 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.13 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index e085ee7170c83729cb103811d5e2ba45e3d8cb96..f7450187aed7354490414e1f491e5417a4b41eab 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -93,7 +93,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.12 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.13 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index 4eefd31d0097913e9ff5cb9d0415c0427dcf1de7..32aa00bdffb53d8491ce531dee29d0b14ffffab9 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -3,13 +3,18 @@ FROM ubuntu:18.04
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.12
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
 ARG PIP="pip"
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         git \
@@ -17,35 +22,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libfreetype6-dev \
         libhdf5-serial-dev \
         libpng-dev \
-        libzmq3-dev \
         libssl-dev \
+        libzmq3-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         pkg-config \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless
-
-#install Python 3
-RUN if [ ${PYTHON} = "python3.6" ]; then \
-      curl https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tar.xz -o /opt/python.tar.xz && \
-      cd /opt && tar xvf python.tar.xz && \
-      cd /opt/*/ && ./configure && \
-      make && make install; \
-    else \
-      apt-get install -y --no-install-recommends \
-        python-dev \
-        ${PYTHON3_DEV}; \
-    fi
-
-RUN    apt-get clean && \
+        && \
+    apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -57,17 +47,12 @@ RUN ${PIP} --no-cache-dir install \
         matplotlib \
         mock \
         numpy \
+        pandas \
         scipy \
         sklearn \
-        pandas \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-      ln -s -f /usr/bin/python3 /usr/bin/python; \
-  elif [ "${PYTHON}" = "python3.6" ]; then \
-      ln -s -f /usr/local/bin/python3.6 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index 3810daefa570210cfba3f044ccb95816d4393e09..21140918aa9ddf752ee8b24fd9a21f19ccace506 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -3,42 +3,43 @@ FROM ubuntu:18.04
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.11
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
 ARG PIP="pip"
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
+
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         git \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
+        libnuma-dev \
         libpng-dev \
         libzmq3-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        openssh-client \
+        openssh-server \
         pkg-config \
-        python-dev \
-        ${PYTHON3_DEV} \
         rsync \
         software-properties-common \
         unzip \
+        wget \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        libnuma-dev \
-        openssh-client \
-        openssh-server \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -56,9 +57,6 @@ RUN ${PIP} --no-cache-dir install \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index dad27697fa142ac80d7237510b8b7d7ebda2b621..3f7729ba59d88d6eefb939a397558f7c0bbcd79b 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -6,13 +6,18 @@ LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 ARG TF_WHL_URL
 
 # Optional parameters
-ARG TF_BUILD_VERSION=r1.9
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON_DEV="python-dev"
 ARG PIP="pip"
 
 # Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         libfreetype6-dev \
@@ -20,8 +25,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng-dev \
         libzmq3-dev \
         pkg-config \
-        ${PYTHON} \
-        ${PYTHON_DEV} \
         rsync \
         software-properties-common \
         unzip \
@@ -29,9 +32,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -48,13 +48,11 @@ RUN ${PIP} --no-cache-dir install \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
+
 COPY ${TF_WHL_URL} /
 RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
     rm -rf /${TF_WHL_URL}
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 19dc45c62cbc79bf931d89f275b5a7816e9924c8..b0afd637279f7016060559dd678d75cab2451300 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -6,36 +6,36 @@ LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 ARG TF_WHL_URL
 
 # Optional parameters
-ARG TF_BUILD_VERSION=r1.11
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON_DEV="python-dev"
 ARG PIP="pip"
 
 # Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
+        libnuma-dev \
         libpng-dev \
         libzmq3-dev \
+        openssh-client \
+        openssh-server \
         pkg-config \
-        python \
-        ${PYTHON_DEV} \
         rsync \
         software-properties-common \
         unzip \
         wget \
-        libnuma-dev \
-        openssh-client \
-        openssh-server \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -50,15 +50,13 @@ RUN ${PIP} --no-cache-dir install \
         scipy \
         sklearn \
         && \
-    python -m ipykernel.kernelspec
+    ${PYTHON} -m ipykernel.kernelspec
+
 
 COPY ${TF_WHL_URL} /
 RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
     rm -rf /${TF_WHL_URL}
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 09537b7314491819d06d3bfda2f2446c5af93067..83b72cb5bb8d9686efe37f11357fc610902ebcb9 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -34,6 +34,7 @@ import errno
 import itertools
 import multiprocessing
 import os
+import platform
 import re
 import shutil
 import sys
@@ -552,6 +553,13 @@ def main(argv):
       if not FLAGS.build_images:
         continue
 
+      # Only build images for host architecture
+      proc_arch = platform.processor()
+      is_x86 = proc_arch.startswith('x86')
+      if (is_x86 and any([arch in tag for arch in ['ppc64le']]) or
+          not is_x86 and proc_arch not in tag):
+        continue
+
       # Generate a temporary Dockerfile to use to build, since docker-py
       # needs a filepath relative to the build context (i.e. the current
       # directory)
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index d8fabadec280cc136bd6cc9a30e79390a9a167bd..c806fa4eacd38d6676333153e45631ed5f96ff42 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -47,13 +47,18 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 857b5e20471a82bd162e55b146854d0a5c165db8..a82577b53be4045171b685bbe4076ffdad4d3824 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -47,8 +47,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index c1f6dafbe0b023e5f4885cfd14ac34c96fcd9843..dc5b5d49b90e57e8a6acec1e8cd883d62a40dfba 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -105,6 +106,8 @@ COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index b4dfc8b09975c49f16686353cf7ec2fe1b02585b..da813970a83e289ec7f9237c9b050fd37d7c1e55 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index 6d76c06332bef15e5bbf33492a37971d9e5498f6..d0d544f8edfdc16a574d0ba3e33aabf3b5f6401c 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -21,23 +21,33 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -48,14 +58,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends \
+            libnvinfer5=5.0.2-1+cuda${CUDA} \
+            libnvinfer-dev=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
@@ -63,12 +76,13 @@ ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_NEED_TENSORRT 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -125,6 +139,8 @@ COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index 160abc876395cf048aa850301de701c950cba149..e36851fa254a77536c3a029151d72810476dd17f 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -21,23 +21,33 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -48,14 +58,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends \
+            libnvinfer5=5.0.2-1+cuda${CUDA} \
+            libnvinfer-dev=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
@@ -63,12 +76,13 @@ ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_NEED_TENSORRT 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
index 46252c541384c91f63cec54af299a945f28a8ccb..85a32fae1b10038956699db276e0c41973a77996 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -21,32 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -75,13 +84,18 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
index 80e427f824a186b64031b5325042ba374c9b0021..c661341eaeeb5ec331055943397fa5245aea76fb 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -21,32 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -75,8 +84,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..63bf205a7ae0392a3ab30e142d9e6b100608fa86
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..083d61bf9a2adc69ac821841f628096c91af3524
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c8384f7af2020e142ec0a2c28418252e6028b083
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..08f880e7b530cf6dd0e38dd0ff1e46eecbda17b1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6935dd19861bf3de20f2875ccde10c63d0fd4df8
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,160 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends \
+            libnvinfer5=5.0.2-1+cuda${CUDA} \
+            libnvinfer-dev=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..42eca62ccb4e05ae22f33df96b36a26adb73c68f
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
@@ -0,0 +1,141 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends \
+            libnvinfer5=5.0.2-1+cuda${CUDA} \
+            libnvinfer-dev=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e82ca282208373f57923fda2619b51a43a0d52c
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,131 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e9572756e53d6c400948b2f82e792826c19c7543
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
@@ -0,0 +1,112 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
index c4ec6095c0cae43b9d5756cd4391ca3ddd329fbe..c056d915d655965583f9f256297a538fbd51ba8c 100644
--- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -1,4 +1,6 @@
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e79574a34de7e15bccc68136269962d375459a0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
@@ -0,0 +1,28 @@
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 76758bd147ef9d52b3db072bd0091190e132667c..2ae840687df4fa2419f92b73adc11dca5b3a9f7b 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -3,5 +3,8 @@
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..0397ab5fa8569dc0274f9550cf4ecae65489c248
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
@@ -0,0 +1,33 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index 0652ac4151d907b660557c991b082541de587a42..a1fd901b343bd80bde2061e29ee7f3abbf7e762d 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -7,7 +7,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -20,9 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
 # Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
index 2b4494ac5955f828b519bb2a61db72f91dace6ef..e02eb053f052fb7347e9fc51a0ed0c3dc07d8a0c 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -1,20 +1,30 @@
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-dev-10-0 \
-        cuda-cudart-dev-10-0 \
-        cuda-cufft-dev-10-0 \
-        cuda-curand-dev-10-0 \
-        cuda-cusolver-dev-10-0 \
-        cuda-cusparse-dev-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
-        libcudnn7-dev=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -25,14 +35,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         wget \
         git \
         && \
-    find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0 \
-        && rm -rf /var/lib/apt/lists/*
+        && apt-get install -y --no-install-recommends \
+            libnvinfer5=5.0.2-1+cuda${CUDA} \
+            libnvinfer-dev=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
@@ -40,9 +53,10 @@ ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_NEED_TENSORRT 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=10.0
-ENV TF_CUDNN_VERSION=7
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
index a6393a3280c6eb7cf2d356b02734865be8eb5a04..041ee87839938b80489f750530c47c8519ab6171 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -1,29 +1,38 @@
-FROM nvidia/cuda:10.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 # Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-10-0 \
-        cuda-cublas-10-0 \
-        cuda-cufft-10-0 \
-        cuda-curand-10-0 \
-        cuda-cusolver-10-0 \
-        cuda-cusparse-10-0 \
-        libcudnn7=7.4.1.5-1+cuda10.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         software-properties-common \
         unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
         && apt-get update \
-        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda10.0 \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
         && apt-get clean \
-        && rm -rf /var/lib/apt/lists/*
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 19d96e7a3df4468ff82f2029a1945a02b1e58932..6fddfe000c60dadd05ff172d4cb036e648377deb 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -55,6 +55,8 @@ releases:
         tag_specs:
             - "{ubuntu}{jupyter}"
             - "{ubuntu-devel}{jupyter}"
+            - "{ubuntu-ppc64le}{jupyter}"
+            - "{ubuntu-devel-ppc64le}{jupyter}"
 
 slice_sets:
 
@@ -122,6 +124,70 @@ slice_sets:
           args:
               - CHECKOUT_TF_SRC=1
 
+    ubuntu-ppc64le:
+        - add_to_name: "-ppc64le"
+          dockerfile_exclusive_name: "cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+        - add_to_name: "-gpu-ppc64le"
+          dockerfile_exclusive_name: "gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
+
+    ubuntu-devel-ppc64le:
+        - add_to_name: "devel-ppc64le"
+          dockerfile_exclusive_name: "devel-cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - UBUNTU_VERSION=18.04
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu-ppc64le"
+          dockerfile_exclusive_name: "devel-gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - LIB_DIR_PREFIX=powerpc64le
+              - CHECKOUT_TF_SRC=1
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+
     nightly:
         - add_to_name: "nightly"
           partials:
diff --git a/tensorflow/tools/dockerfiles/tools.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
index e8929295a5ee397acbe46ebf96894174ca01fca2..a96b2578cba7579c605d25ee6068d2cde278e1f4 100644
--- a/tensorflow/tools/dockerfiles/tools.Dockerfile
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -17,7 +17,7 @@
 #
 # You can use this image to quickly develop changes to the Dockerfile assembler
 # or set of TF Docker partials. See README.md for usage instructions.
-FROM debian:stretch
+FROM ubuntu:16.04
 LABEL maintainer="Austin Anderson <angerson@google.com>"
 
 RUN apt-get update && apt-get install -y python3 python3-pip bash curl
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index b072853a4ec298ce5c15afc1307a966ecefb743f..cc106b5955ba07f4f166638ba51699060788e6ae 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -81,7 +81,7 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
-py_binary(
+py_library(
     name = "generate_lib",
     srcs = ["generate_lib.py"],
     srcs_version = "PY2AND3",
@@ -155,7 +155,7 @@ py_test(
         "optonly",
     ],
     deps = [
-        ":generate2",
+        ":generate2_lib",
     ],
 )
 
@@ -163,7 +163,17 @@ py_binary(
     name = "generate2",
     srcs = ["generate2.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = [":generate2_lib"],
+)
+
+py_library(
+    name = "generate2_lib",
+    srcs = ["generate2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:util",
+    ],
 )
 
 py_library(
diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py
index a66f3e449377fef3d4c7bf4e0b8810cd6111eb85..6157eb1b7fc68f7a88284468d4323780aae004a0 100644
--- a/tensorflow/tools/docs/doc_generator_visitor.py
+++ b/tensorflow/tools/docs/doc_generator_visitor.py
@@ -194,10 +194,11 @@ class DocGeneratorVisitor(object):
       contrib_score = 1
 
     while parts:
-      parts.pop()
       container = self._index['.'.join(parts)]
       if tf_inspect.ismodule(container):
         break
+      parts.pop()
+
     module_length = len(parts)
     if len(parts) == 2:
       # `tf.submodule.thing` is better than `tf.thing`
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index fba909d26defffad2d7dbaffa4463695685ae50c..2e1c08afcd26d9126df6d31b92d3db6d80762a8f 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -31,16 +31,32 @@ from os import path
 
 from absl import app
 from absl import flags
-
 import tensorflow as tf
 
+from tensorflow_docs.api_generator import doc_controls
+from tensorflow_docs.api_generator import doc_generator_visitor
 from tensorflow_docs.api_generator import generate_lib
+from tensorflow_docs.api_generator import parser
+
+import tensorboard
+import tensorflow_estimator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# Use tensorflow's `tf_inspect`, which is aware of `tf_decorator`.
+parser.tf_inspect = tf_inspect
+
+# `tf` has an `__all__` that doesn't list important things like `keras`.
+# The doc generator recognizes `__all__` as the list of public symbols.
+# So patch `tf.__all__` to list everything.
+tf.__all__ = [item_name for item_name, value in tf_inspect.getmembers(tf)]
+
 
 FLAGS = flags.FLAGS
 
 flags.DEFINE_string(
     "code_url_prefix",
-    "/code/stable/tensorflow/",
+    "/code/stable/tensorflow",
     "A url to prepend to code paths when creating links to defining code")
 
 flags.DEFINE_string(
@@ -50,6 +66,28 @@ flags.DEFINE_string(
 flags.DEFINE_bool("search_hints", True,
                   "Include meta-data search hints at the top of each file.")
 
+flags.DEFINE_string("site_path", "",
+                    "The prefix ({site-path}/api_docs/python/...) used in the "
+                    "`_toc.yaml` and `_redirects.yaml` files")
+
+
+# The doc generator isn't aware of tf_export.
+# So prefix the score tuples with -1 when this is the canonical name, +1
+# otherwise. The generator chooses the name with the lowest score.
+class TfExportAwareDocGeneratorVisitor(
+    doc_generator_visitor.DocGeneratorVisitor):
+  """A `tf_export` aware doc_visitor."""
+
+  def _score_name(self, name):
+    canonical = tf_export.get_canonical_name_for_symbol(self._index[name])
+
+    canonical_score = 1
+    if canonical is not None and name == "tf." + canonical:
+      canonical_score = -1
+
+    scores = super(TfExportAwareDocGeneratorVisitor, self)._score_name(name)
+    return (canonical_score,) + scores
+
 
 def build_docs(output_dir, code_url_prefix, search_hints=True):
   """Build api docs for tensorflow v2.
@@ -59,14 +97,34 @@ def build_docs(output_dir, code_url_prefix, search_hints=True):
     code_url_prefix: prefix for "Defined in" links.
     search_hints: Bool. Include meta-data search hints at the top of each file.
   """
+  try:
+    doc_controls.do_not_generate_docs(tf.tools)
+  except AttributeError:
+    pass
+
   base_dir = path.dirname(tf.__file__)
+  base_dirs = (
+      base_dir,
+      path.normpath(path.join(base_dir, "../../tensorflow")),
+      path.dirname(tensorboard.__file__),
+      path.dirname(tensorflow_estimator.__file__),
+  )
+
+  code_url_prefixes = (
+      code_url_prefix,
+      # External packages source repositories
+      "https://github.com/tensorflow/tensorboard/tree/master/tensorboard"
+      "https://github.com/tensorflow/estimator/tree/master/tensorflow_estimator"
+  )
+
   doc_generator = generate_lib.DocGenerator(
       root_title="TensorFlow 2.0 Preview",
       py_modules=[("tf", tf)],
-      base_dir=base_dir,
+      base_dir=base_dirs,
       search_hints=search_hints,
-      code_url_prefix=code_url_prefix,
-      site_path="api_docs/")
+      code_url_prefix=code_url_prefixes,
+      site_path=FLAGS.site_path,
+      visitor_cls=TfExportAwareDocGeneratorVisitor)
 
   doc_generator.build(output_dir)
 
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 83b4bf812881f423195f65cc98dc8f3189af3931..d87f9585f2066c96a270a92450e327d637b50a1a 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -502,7 +502,10 @@ def _gen_pairs(items):
   assert len(items) % 2 == 0
   items = iter(items)
   while True:
-    yield next(items), next(items)
+    try:
+      yield next(items), next(items)
+    except StopIteration:
+      return
 
 
 class _FunctionDetail(
@@ -1681,7 +1684,7 @@ def _get_defined_in(py_object, parser_config):
     path = path[:-1]
 
   # Never include links outside this code base.
-  if path.startswith('..'):
+  if path.startswith('..') or re.search(r'\b_api\b', path):
     return None
 
   if re.match(r'.*/gen_[^/]*\.py$', path):
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index c2449da9239df74eac5c6b1cd91df666e170a108..4d52c1fccf957e201ac64c1964a9822aad255815 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -189,7 +189,7 @@ def write_version_info(filename, git_version):
     git_version: the result of a git describe.
   """
   if b"\"" in git_version or b"\\" in git_version:
-    git_version = "git_version_is_invalid"  # do not cause build to fail!
+    git_version = b"git_version_is_invalid"  # do not cause build to fail!
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
@@ -216,7 +216,7 @@ const int tf_monolithic_build() {
   return 0;
 #endif
 }
-""" % git_version
+""" % git_version.decode("utf-8")
   open(filename, "w").write(contents)
 
 
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index f229099e493d720d3658a06efd7aec9720de27d8..2145b3b0d5bfb788cea05c348f4fb881f7d12fb7 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -136,6 +136,28 @@ cc_library(
     ] + if_not_windows([
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
+        "//tensorflow/core:sparse_ops_op_lib",
+        "//tensorflow/core:parsing_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:io_ops_op_lib",
+        "//tensorflow/core:logging_ops_op_lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
+        "//tensorflow/core:user_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
     ]) + if_not_v2([
         "//tensorflow/contrib/rnn:gru_ops_op_lib",
         "//tensorflow/contrib/rnn:lstm_ops_op_lib",
@@ -175,6 +197,7 @@ tf_cc_test(
         ":transforms_lib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index c5c0f2da896378405b8b0da72935d5d677cfe741..a90916cd1b935d38b6db3dbbbf0f656861e6858d 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -152,10 +152,10 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 
 The batch norm folding is included twice because there are two different flavors
 of batch normalization used in TensorFlow. The older version was implemented
-with a single BatchNormWithGlobalNormalization op, but it was deprecated in
-favor of a more recent approach using individual ops to implement the same
-computation. The two transforms are in there so that both styles are recognized
-and optimized.
+with a single op like BatchNormWithGlobalNormalization or FusedBatchNorm, and
+BatchNormWithGlobalNormalization was deprecated in favor of a more recent
+approach using individual ops to implement the same computation. The two
+transforms are in there so that both styles are recognized and optimized.
 
 ### Fixing Missing Kernel Errors on Mobile
 
@@ -405,13 +405,14 @@ to continue on past transient errors, since this is just an optimization phase.
 Args: None \
 Prerequisites: None
 
-In the early days of TensorFlow, batch normalization was implemented using a
-single monolithic `BatchNormWithGlobalNormalization` op. In modern versions,
-adding batch normalization from Python will give you a series of smaller math
-ops instead, to achieve the same effect without special-purpose code. If you
-have a graph that uses the older-style, this transform will recognize and
-optimize those ops for inference, in the same way that the
-[fold_batch_norms](#fold_batch_norms) transform does for the new approach.
+In the early days of TensorFlow, batch normalization was implemented using
+single monolithic ops like `BatchNormWithGlobalNormalization` or
+`FusedBatchNorm`. In modern versions, adding batch normalization from Python
+will give you a series of smaller math ops instead, to achieve the same effect
+without special-purpose code. If you have a graph that uses the older-style,
+this transform will recognize and optimize those ops for inference, in the same
+way that the [fold_batch_norms](#fold_batch_norms) transform does for the new
+approach.
 
 ### freeze_requantization_ranges
 
@@ -805,7 +806,7 @@ Status RenameOp(const GraphDef& input_graph_def,
       !context.params.count("new_op_name") ||
       (context.params.at("new_op_name").size() != 1)) {
     return errors::InvalidArgument(
-        "remove_nodes expects exactly one 'old_op_name' and 'new_op_name' "
+        "rename_op expects exactly one 'old_op_name' and 'new_op_name' "
         "argument, e.g. rename_op(old_op_name=Mul, new_op_name=Multiply)");
   }
 
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 16a0f7d58df66be06224d58de623ee7e2dc41880..f59a7abbea93d7b9c838938689009d4d90c68095 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -37,7 +37,7 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
       input_graph_def,  // clang-format off
       {"Mul",                // mul_node
         {
-          {"Conv2D|MatMul",  // conv_node
+          {"Conv2D|MatMul|DepthwiseConv2dNative",  // conv_node
             {
               {"*"},         // input_node
               {"Const"},     // weights_node
@@ -72,8 +72,15 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
 
         // Make sure all the inputs really are vectors, with as many entries as
         // there are columns in the weights.
-        const int weights_cols_index = conv_node.op() == "Conv2D" ? 3 : 1;
-        const int64 weights_cols = weights.shape().dim_size(weights_cols_index);
+        int64 weights_cols;
+        if (conv_node.op() == "Conv2D") {
+          weights_cols = weights.shape().dim_size(3);
+        } else if (conv_node.op() == "DepthwiseConv2dNative") {
+          weights_cols =
+              weights.shape().dim_size(2) * weights.shape().dim_size(3);
+        } else {
+          weights_cols = weights.shape().dim_size(1);
+        }
         if ((mul_values.shape().dims() != 1) ||
             (mul_values.shape().dim_size(0) != weights_cols)) {
           return errors::InvalidArgument(
@@ -82,14 +89,13 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
         }
 
         // Multiply the original weights by the scale vector.
-        auto weights_matrix = weights.flat_inner_dims<float>();
+        auto weights_vector = weights.flat<float>();
         Tensor scaled_weights(DT_FLOAT, weights.shape());
-        auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-        for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-          for (int64 col = 0; col < weights_cols; ++col) {
-            scaled_weights_matrix(row, col) =
-                weights_matrix(row, col) * mul_values.flat<float>()(col);
-          }
+        auto scaled_weights_vector = scaled_weights.flat<float>();
+        for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+          scaled_weights_vector(row) =
+              weights_vector(row) *
+              mul_values.flat<float>()(row % weights_cols);
         }
 
         // Construct the new nodes.
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
index a5d541feb6f4dbfd5a0f61b171fd05160a6d67c8..885fbd59b7797c35639d0a33dbb895d8589b6b4d 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
@@ -87,6 +87,57 @@ class FoldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldBatchNormsDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mul_values_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mul_values_data, {2.0f, 3.0f, 4.0f, 5.0f});
+    Output mul_values_op = Const(root.WithOpName("mul_values"),
+                                 Input::Initializer(mul_values_data));
+
+    Output mul_op = Mul(root.WithOpName("output"), conv_op, mul_values_op);
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(
+        FoldBatchNorms(original_graph_def, {{}, {"output"}}, &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("Mul", node.op());
+    }
+  }
+
   void TestFoldBatchNormsConv2DShared() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -202,6 +253,9 @@ TEST_F(FoldBatchNormsTest, TestFoldBatchNormsConv2D) {
 TEST_F(FoldBatchNormsTest, TestFoldBatchNormsMatMul) {
   TestFoldBatchNormsMatMul();
 }
+TEST_F(FoldBatchNormsTest, TestFoldBatchNormsDepthwiseConv2dNative) {
+  TestFoldBatchNormsDepthwiseConv2dNative();
+}
 
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index dcc36b1a8557cf30ac030302fcb7545da55c7886..d16d829a16e5672e28aa9d35906a2e5684568468 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -298,13 +298,19 @@ Status FoldConstants(const GraphDef& input_graph_def,
   cf_opts.shape_map = &shape_map;
 
   // Exclude specified nodes from constant folding.
+  std::set<string> excluded_ops, excluded_nodes;
   if (context.params.count("exclude_op") > 0) {
-    const auto& excluded_nodes = context.params.at("exclude_op");
-    const std::set<string> excluded_nodes_set(excluded_nodes.begin(),
-                                              excluded_nodes.end());
-    cf_opts.consider = [excluded_nodes_set](const Node* n) {
-      return excluded_nodes_set.find(n->op_def().name()) ==
-             excluded_nodes_set.end();
+    const auto& ops = context.params.at("exclude_op");
+    excluded_ops = std::set<string>(ops.begin(), ops.end());
+  }
+  if (context.params.count("exclude_node") > 0) {
+    const auto& nodes = context.params.at("exclude_node");
+    excluded_nodes = std::set<string>(nodes.begin(), nodes.end());
+  }
+  if (!excluded_ops.empty() || !excluded_nodes.empty()) {
+    cf_opts.consider = [excluded_ops, excluded_nodes](const Node* n) {
+      return excluded_ops.find(n->op_def().name()) == excluded_ops.end() &&
+             excluded_nodes.find(n->name()) == excluded_nodes.end();
     };
   }
 
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index fd546f812c0dafc5d2e71c94710c3c3f5b75250e..532b4600973cbc2ef2826be1bf551984a1f1f8d6 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -109,24 +109,29 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
                                     const string& conv_output_name,
                                     std::vector<NodeDef>* new_nodes) {
   const NodeDef& conv_node = conv_node_match.node;
-  CHECK_EQ("Conv2D", conv_node.op());
+  // CHECK_EQ("Conv2D", conv_node.op());
   const NodeDef& input_node = conv_node_match.inputs[0].node;
   const NodeDef& weights_node = conv_node_match.inputs[1].node;
   CHECK_EQ("Const", weights_node.op());
 
   Tensor weights = GetNodeTensorAttr(weights_node, "value");
-  const int64 weights_cols = weights.shape().dim_size(3);
+  int64 weights_cols;
+  if (conv_node.op() == "Conv2D") {
+    weights_cols = weights.shape().dim_size(3);
+  } else if (conv_node.op() == "DepthwiseConv2dNative") {
+    weights_cols = weights.shape().dim_size(2) * weights.shape().dim_size(3);
+  } else {
+    weights_cols = weights.shape().dim_size(1);
+  }
   CHECK_EQ(weights_cols, scale_values.size());
 
   // Multiply the original weights by the scale vector.
-  auto weights_matrix = weights.flat_inner_dims<float>();
+  auto weights_vector = weights.flat<float>();
   Tensor scaled_weights(DT_FLOAT, weights.shape());
-  auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-  for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-    for (int64 col = 0; col < weights_cols; ++col) {
-      scaled_weights_matrix(row, col) =
-          weights_matrix(row, col) * scale_values[col];
-    }
+  auto scaled_weights_vector = scaled_weights.flat<float>();
+  for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+    scaled_weights_vector(row) =
+        weights_vector(row) * scale_values[row % weights_cols];
   }
   // Figure out the remaining bias to add on.
   Tensor bias_offset(DT_FLOAT, {weights_cols});
@@ -158,7 +163,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  if (conv_node.attr().count("data_format") > 0) {
+  if (!conv_node.attr().count("data_format")) {
     CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
@@ -185,7 +190,7 @@ Status FuseBatchNormWithConv(const NodeMatch& match,
 }
 
 Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
-                             std::vector<NodeDef>* new_nodes) {
+                                     std::vector<NodeDef>* new_nodes) {
   // Calculate the scale and offset values to apply.
   std::vector<float> scale_values;
   std::vector<float> offset_values;
@@ -200,9 +205,8 @@ Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
   const NodeDef& conv_node = conv_node_match.node;
 
   string biasadd_name = conv_node.name() + "/biasadd";
-  TF_RETURN_IF_ERROR(
-      FuseScaleOffsetToConvWeights(scale_values, offset_values, conv_node_match,
-                                   biasadd_name , new_nodes));
+  TF_RETURN_IF_ERROR(FuseScaleOffsetToConvWeights(
+      scale_values, offset_values, conv_node_match, biasadd_name, new_nodes));
 
   NodeDef new_batch_to_space_node = batch_to_space_node;
   // reuse batch_norm node name
@@ -292,7 +296,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         current_graph_def,  // clang-format off
       {"BatchNormWithGlobalNormalization|FusedBatchNorm",    // batch_norm_node
         {
-          {"Conv2D",                          // conv_node
+          {"Conv2D|DepthwiseConv2dNative",                          // conv_node
             {
               {"*"},                          // input_node
               {"Const"},                      // weights_node
@@ -325,7 +329,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
          {
              {"BatchToSpaceND",                  // batch_to_space_node
               {
-                  {"Conv2D",                     // conv_node
+                  {"Conv2D|DepthwiseConv2dNative",                     // conv_node
                    {
                        {"*"},                    // input_node
                        {"Const"},                // weights_node
@@ -363,13 +367,13 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         {
           {"ConcatV2|Concat",                     // concat two conv2d.
             {
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
                 }
               },
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 6c7174926d06460556ce673a5fe738901134543d..c5fa9b16b0c91e6c069462f0663908737bb6f835 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -121,6 +121,84 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldOldBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("BatchNormWithGlobalNormalization");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("variance_epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("scale_after_normalization", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+    original_graph_def.mutable_versions()->set_producer(8);
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("BatchNormWithGlobalNormalization", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNorms() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -198,6 +276,83 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldFusedBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("FusedBatchNorm");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("is_training", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 2e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("FusedBatchNorm", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNormsWithConcat(const bool split) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -321,16 +476,17 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor block_shape_data(DT_INT32, TensorShape({2}));
   test::FillValues<int32>(&block_shape_data, {1, 2});
-  Output block_shape_op =
-      Const(root.WithOpName("block_shape_op"), Input::Initializer(block_shape_data));
+  Output block_shape_op = Const(root.WithOpName("block_shape_op"),
+                                Input::Initializer(block_shape_data));
 
   Tensor crops_data(DT_INT32, TensorShape({2, 2}));
   test::FillValues<int32>(&crops_data, {0, 0, 0, 1});
   Output crops_op =
       Const(root.WithOpName("crops_op"), Input::Initializer(crops_data));
 
-  Output batch_to_space_op = BatchToSpaceND(root.WithOpName("batch_to_space_op"),
-                                            conv_op, block_shape_op, crops_data);
+  Output batch_to_space_op =
+      BatchToSpaceND(root.WithOpName("batch_to_space_op"), conv_op,
+                     block_shape_op, crops_data);
 
   Tensor mean_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&mean_data, {10.0f, 20.0f});
@@ -339,8 +495,8 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor variance_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&variance_data, {0.25f, 0.5f});
-  Output variance_op = Const(root.WithOpName("variance_op"),
-                             Input::Initializer(variance_data));
+  Output variance_op =
+      Const(root.WithOpName("variance_op"), Input::Initializer(variance_data));
 
   Tensor beta_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&beta_data, {0.1f, 0.6f});
@@ -410,5 +566,14 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithBatchToSpace) {
   TestFoldFusedBatchNormsWithBatchToSpace();
 }
 
+TEST_F(FoldOldBatchNormsTest, TestFoldOldBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldOldBatchNormsAfterDepthwiseConv2dNative();
+}
+
+TEST_F(FoldOldBatchNormsTest,
+       TestFoldFusedBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldFusedBatchNormsAfterDepthwiseConv2dNative();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc
index 0a76c2b2052a2c26ee66691b361fff2be70bbf30..19bc1dc73576c697dc3b17eacd62966ad9ee3ca2 100644
--- a/tensorflow/tools/graph_transforms/remove_attribute.cc
+++ b/tensorflow/tools/graph_transforms/remove_attribute.cc
@@ -41,7 +41,7 @@ Status RemoveAttribute(const GraphDef& input_graph_def,
   if (context.params.count("op_name")) {
     if (context.params.at("op_name").size() != 1) {
       return errors::InvalidArgument(
-          "remove_nodes expects a single op_name argument, but found ",
+          "remove_attribute expects a single op_name argument, but found ",
           context.params.at("op_name").size());
     }
     op_name = context.params.at("op_name")[0];
diff --git a/tensorflow/tools/graph_transforms/rename_attribute.cc b/tensorflow/tools/graph_transforms/rename_attribute.cc
index 62897d43a8ca774418c7b45c1f886cd8cd7fd850..f92e6a3cf9e5a9bc702a58595abd11162747e77a 100644
--- a/tensorflow/tools/graph_transforms/rename_attribute.cc
+++ b/tensorflow/tools/graph_transforms/rename_attribute.cc
@@ -34,9 +34,9 @@ Status RenameAttribute(const GraphDef& input_graph_def,
       !context.params.count("new_attribute_name") ||
       (context.params.at("new_attribute_name").size() != 1)) {
     return errors::InvalidArgument(
-        "remove_nodes expects exactly one 'old_attribute_name' and one "
+        "rename_attribute expects exactly one 'old_attribute_name' and one "
         "'new_attribute_name' argument, e.g. "
-        "remove_attribute(old_attribute_name=foo, new_attribute_name=bar)");
+        "rename_attribute(old_attribute_name=foo, new_attribute_name=bar)");
   }
 
   string op_name;
diff --git a/tensorflow/tools/graph_transforms/rename_op.cc b/tensorflow/tools/graph_transforms/rename_op.cc
index 9deee8bbffbbda41c1e59480c5e642d4c6ce1de9..7a35619f792c5d2d7e6ee9beaade8a47b228221c 100644
--- a/tensorflow/tools/graph_transforms/rename_op.cc
+++ b/tensorflow/tools/graph_transforms/rename_op.cc
@@ -35,7 +35,7 @@ Status RenameOp(const GraphDef& input_graph_def,
       !context.params.count("new_op_name") ||
       (context.params.at("new_op_name").size() != 1)) {
     return errors::InvalidArgument(
-        "remove_nodes expects exactly one 'old_op_name' and 'new_op_name' "
+        "rename_op expects exactly one 'old_op_name' and 'new_op_name' "
         "argument, e.g. rename_op(old_op_name=Mul, new_op_name=Multiply)");
   }
 
diff --git a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
index d466f21c17ddfec9c0b0181f844b1b608f95246a..5ca45ac90e9a13f176c49764031431e12007ddb7 100644
--- a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
+++ b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
@@ -79,7 +79,7 @@ Status ShapeForPlaceholder(const TransformFuncContext& context,
   *result = {};
 
   // Check to see if we have been given a default for all placeholders.
-  if (context.params.count("type")) {
+  if (context.params.count("shape")) {
     if (context.params.at("shape").size() != 1) {
       return errors::InvalidArgument(
           "You must pass no more than one default 'shape' to "
@@ -90,10 +90,10 @@ Status ShapeForPlaceholder(const TransformFuncContext& context,
   }
 
   // See if there's a particular type specified for this placeholder.
-  if (context.params.count("name") || context.params.count("type_for_name")) {
+  if (context.params.count("name") || context.params.count("shape_for_name")) {
     if (!context.params.count("name") ||
-        !context.params.count("type_for_name") ||
-        (context.params.at("type_for_name").size() !=
+        !context.params.count("shape_for_name") ||
+        (context.params.at("shape_for_name").size() !=
          context.params.at("name").size())) {
       return errors::InvalidArgument(
           "You must pass a 'shape_for_name' arg for every 'name', e.g. "
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 86bd5107924ec4627b955264b179a06231ef8532..65259c2639054363e548fe35c80aad264819fe32 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -7,6 +7,7 @@ load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
 
 genrule(
@@ -38,6 +39,7 @@ pkg_tar(
     extension = "tar.gz",
     files = [
         "include/tensorflow/jni/LICENSE",
+        "//:LICENSE",
         "//tensorflow/java:libtensorflow_jni",
     ],
     # Mark as "manual" till
@@ -86,7 +88,13 @@ pkg_tar(
 
 pkg_tar(
     name = "clib",
-    files = ["//tensorflow:libtensorflow.so"],
+    files = select({
+        "//tensorflow:windows": [
+            "//tensorflow:tensorflow.dll",
+            "//tensorflow:tensorflow_dll_import_lib",
+        ],
+        "//conditions:default": ["//tensorflow:libtensorflow.so"],
+    }),
     package_dir = "lib",
     # Mark as "manual" till
     # https://github.com/bazelbuild/bazel/issues/2352
@@ -99,7 +107,10 @@ pkg_tar(
 
 pkg_tar(
     name = "clicenses",
-    files = [":include/tensorflow/c/LICENSE"],
+    files = [
+        ":include/tensorflow/c/LICENSE",
+        "//:LICENSE",
+    ],
     package_dir = "include/tensorflow/c",
     # Mark as "manual" till
     # https://github.com/bazelbuild/bazel/issues/2352
@@ -126,9 +137,9 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:COPYING",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
-        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
@@ -137,7 +148,9 @@ genrule(
         "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
-    ] + select({
+    ] + if_not_windows([
+        "@llvm//:LICENSE.TXT",
+    ]) + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
@@ -195,9 +208,9 @@ genrule(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:COPYING",
         "@icu//:icu4j/main/shared/licenses/LICENSE",
         "@jpeg//:LICENSE.md",
-        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
@@ -208,7 +221,9 @@ genrule(
         "@zlib_archive//:zlib.h",
         "@grpc//:LICENSE",
         "@grpc//third_party/address_sorting:LICENSE",
-    ] + select({
+    ] + if_not_windows([
+        "@llvm//:LICENSE.TXT",
+    ]) + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c51b45a49c4010229bc8a7c20958b57c23139e6a..c7b4314bb3e89528bac5a8efb667f8d2d822f24a 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -59,29 +59,27 @@ COMMON_PIP_DEPS = [
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
-    "//tensorflow/lite/python:interpreter_test_data",
+    "//tensorflow/lite/python/testdata:interpreter_test_data",
     "//tensorflow/lite/python:tflite_convert",
     "//tensorflow/lite/toco/python:toco_from_protos",
-    # "//tensorflow/python/autograph/converters:converters",
-    # "//tensorflow/python/autograph/core:core",
     "//tensorflow/python/autograph/core:test_lib",
-    # "//tensorflow/python/autograph/impl:impl",
-    # "//tensorflow/python/autograph/lang:lang",
-    # "//tensorflow/python/autograph/operators:operators",
-    # "//tensorflow/python/autograph/pyct:pyct",
-    # "//tensorflow/python/autograph/pyct/testing:testing",
-    # "//tensorflow/python/autograph/pyct/static_analysis:static_analysis",
+    "//tensorflow/python/autograph/pyct/testing:test_modules",
     "//tensorflow/python/autograph/pyct/common_transformers:common_transformers",
     "//tensorflow/python:cond_v2",
     "//tensorflow/python:distributed_framework_test_lib",
     "//tensorflow/python:meta_graph_testdata",
     "//tensorflow/python:spectral_ops_test_util",
     "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/data/benchmarks:benchmark_base",
     "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
+    "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base",
     "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
+    "//tensorflow/python/data/kernel_tests:filter_test_base",
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/keras/mixed_precision/experimental:test_util",
+    "//tensorflow/python/kernel_tests/random:util",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
     "//tensorflow/python/ops/ragged:ragged_test_util",
@@ -90,7 +88,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/tools/api/generator:create_python_api",
     "//tensorflow/python:test_ops",
     "//tensorflow/python:while_v2",
-    "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
 ]
 
 COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
@@ -99,6 +96,7 @@ COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
     "//tensorflow/contrib/compiler:xla",
     "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
+    "//tensorflow/contrib/distribute/python:distribute_test_lib_pip",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
     "//tensorflow/contrib/gan:gan",
@@ -146,7 +144,11 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl:LICENSE",
+        "@absl_py//absl/logging:LICENSE",
         "@absl_py//absl/flags:LICENSE",
+        "@absl_py//absl/testing:LICENSE",
+        "@absl_py//absl/third_party/unittest3_backport:LICENSE",
         "@arm_neon_2_x86_sse//:LICENSE",
         "@astor_archive//:LICENSE",
         "@boringssl//:LICENSE",
@@ -155,6 +157,7 @@ filegroup(
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
+        "@enum34_archive//:LICENSE",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
         "@flatbuffers//:LICENSE.txt",
@@ -162,6 +165,7 @@ filegroup(
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
+        "@hwloc//:COPYING",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
         "@keras_applications_archive//:LICENSE",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 952c71c61580fba72dbf1a4b2e1bd836816b1420..3bcc4fc81bdbefcc3c1e5481d5a1c18ee8f15768 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -34,6 +34,7 @@ PIP_PACKAGE_QUERY_EXPRESSION = (
 # pip smoke test.
 BUILD_BLACKLIST = [
     "tensorflow/lite/examples/android",
+    "tensorflow/lite/experimental/objc",
     "tensorflow/lite/experimental/swift",
 ]
 
@@ -88,8 +89,8 @@ DEPENDENCY_BLACKLIST = [
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
     # lite
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell",
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell.py",
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test.py",  # pylint:disable=line-too-long
     "//tensorflow/lite/python:interpreter",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6e182a5e5c2a9df516ed2ee104f717f6e19c0eca..648216262342fe52a0fc30462400328db2a99dc2 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,20 +45,20 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.12.0'
+_VERSION = '1.13.1'
 
 REQUIRED_PACKAGES = [
-    'absl-py >= 0.1.6',
+    'absl-py >= 0.7.0',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'google_pasta >= 0.1.1',
+    'google_pasta >= 0.1.2',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
-    'numpy >= 1.13.3',
+    'numpy >= 1.14.5, < 2.0',
     'six >= 1.10.0',
     'protobuf >= 3.6.1',
-    'tensorboard >= 1.12.0, < 1.13.0',
-    'tensorflow_estimator >= 1.10.0',
+    'tensorboard >= 1.13.0, < 1.14.0',
+    'tensorflow_estimator >= 1.13.0rc0, < 1.14.0rc0',
     'termcolor >= 1.1.0',
 ]
 
@@ -87,7 +87,7 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.14.0a0, < 1.15.0a0'
     elif 'tensorflow_estimator' in pkg and '2.0' in project_name:
       REQUIRED_PACKAGES[i] = 'tensorflow-estimator-2.0-preview'
     elif 'tensorflow_estimator' in pkg:
@@ -284,6 +284,7 @@ setup(
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index b4b70e0a78e1c86d01aa1f56438e5f7798f7be56..60c0f42330320580dd1917fb45027c9158388c54 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -50,7 +50,7 @@ cc_library(
     copts = if_ios(["-DGOOGLE_LOGGING"]),
     linkopts = select({
         "//tensorflow:windows": [],
-        "//tensorflow:darwin": [
+        "//tensorflow:macos": [
             "-lm",
             "-lpthread",
         ],
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 4b2026b9472b651f8e0571155dab8952d20aa8b2..ef12226ec001cc2ddcb09980fcf38a0aeb794742 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -57,6 +57,14 @@ py_binary(
     srcs = ["run_and_gather_logs.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":run_and_gather_logs_main_lib"],
+)
+
+py_library(
+    name = "run_and_gather_logs_main_lib",
+    srcs = ["run_and_gather_logs.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":run_and_gather_logs_lib",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index 3486871080c78dc7a1cc201ea2a4d45ebc342758..97861110346b62659ac97da95727250abaf3b928 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -4,60 +4,66 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # Create a benchmark test target of a TensorFlow C++ test (tf_cc_*_test)
 def tf_cc_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix="",
-    benchmark_type="cpp_microbenchmark"):
-  if not name:
-    fail("Must provide a name")
-  if not target:
-    fail("Must provide a target")
-  if (not ":" in target
-      or not target.startswith("//")
-      or target.endswith(":all")
-      or target.endswith(".")):
-    fail(" ".join(("Target must be a single well-defined test, e.g.,",
-                   "//path/to:test. Received: %s" % target)))
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = "",
+        benchmark_type = "cpp_microbenchmark"):
+    if not name:
+        fail("Must provide a name")
+    if not target:
+        fail("Must provide a target")
+    if (not ":" in target or
+        not target.startswith("//") or
+        target.endswith(":all") or
+        target.endswith(".")):
+        fail(" ".join((
+            "Target must be a single well-defined test, e.g.,",
+            "//path/to:test. Received: %s" % target,
+        )))
 
-  all_tags = (
-    depset(tags) + depset(
-      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
+    all_tags = (
+        depset(tags) + depset(
+            ["benchmark-test", "local", "manual", "regression-test"],
+        )
+    ).to_list()
 
-  tf_py_test(
-      name = name,
-      tags = all_tags,
-      size = "large",
-      srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
-      args = [
-          "--name=//%s:%s" % (native.package_name(), name),
-          "--test_name=" + target,
-          "--test_args=--benchmarks=%s" % benchmarks,
-          "--benchmark_type=%s" % benchmark_type,
-      ],
-      data = [
-        target,
-      ],
-      main = "run_and_gather_logs.py",
-      additional_deps = [
-          "//tensorflow/tools/test:run_and_gather_logs"
-      ])
+    tf_py_test(
+        name = name,
+        tags = all_tags,
+        size = "large",
+        srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
+        args = [
+            "--name=//%s:%s" % (native.package_name(), name),
+            "--test_name=" + target,
+            "--test_args=--benchmarks=%s" % benchmarks,
+            "--benchmark_type=%s" % benchmark_type,
+        ],
+        data = [
+            target,
+        ],
+        main = "run_and_gather_logs.py",
+        additional_deps = [
+            "//tensorflow/tools/test:run_and_gather_logs",
+        ],
+    )
 
 # Create a benchmark test target of a TensorFlow python test (*py_tests)
 def tf_py_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix=""):
-  # For now generating a py benchmark is the same as generating a C++
-  # benchmark target. In the future this may change, so we have
-  # two macros just in case
-  tf_cc_logged_benchmark(
-    name=name,
-    target=target,
-    benchmarks=benchmarks,
-    tags=tags,
-    test_log_output_prefix=test_log_output_prefix,
-    benchmark_type="python_benchmark")
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = ""):
+    # For now generating a py benchmark is the same as generating a C++
+    # benchmark target. In the future this may change, so we have
+    # two macros just in case
+    tf_cc_logged_benchmark(
+        name = name,
+        target = target,
+        benchmarks = benchmarks,
+        tags = tags,
+        test_log_output_prefix = test_log_output_prefix,
+        benchmark_type = "python_benchmark",
+    )
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index ad73b19ff16a8165421b5056ab0625918edcdd4d..822150118f9af53ffe37a135019219c53c3a6036 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -7,7 +7,6 @@ load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
-
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
 load("//third_party/toolchains/remote:configure.bzl", "remote_execution_configure")
@@ -83,31 +82,31 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     mkl_repository(
         name = "mkl_linux",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "f00dc3b142a5be399bdeebd7e7ea369545a35d4fb84c86f98b6b048d72685295",
-        strip_prefix = "mklml_lnx_2019.0.1.20180928",
+        sha256 = "f84f92b047edad0467d68a925410b782e54eac9e7af61f4cc33d3d38b29bee5d",
+        strip_prefix = "mklml_lnx_2019.0.3.20190125",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_lnx_2019.0.3.20190125.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_lnx_2019.0.3.20190125.tgz",
         ],
     )
     mkl_repository(
         name = "mkl_windows",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "efef90b7b9613fab10f44c8ac4ff28db613a112c64ed94826d7e44df09c44b0b",
-        strip_prefix = "mklml_win_2019.0.1.20180928",
+        sha256 = "8f968cdb175242f887efa9a6dbced76e65a584fbb35e5f5b05883a3584a2382a",
+        strip_prefix = "mklml_win_2019.0.3.20190125",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_win_2019.0.3.20190125.zip",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_win_2019.0.3.20190125.zip",
         ],
     )
     mkl_repository(
         name = "mkl_darwin",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "83f02938a0c095274db7b8b7b694157abafa3837c5cbaef740440d466c86a477",
-        strip_prefix = "mklml_mac_2019.0.1.20180928",
+        sha256 = "60d6500f0e1a98f011324180fbf7a51a177f45494b4089e02867684d413c4293",
+        strip_prefix = "mklml_mac_2019.0.3.20190125",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_mac_2019.0.3.20190125.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_mac_2019.0.3.20190125.tgz",
         ],
     )
 
@@ -115,36 +114,42 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         print("path_prefix was specified to tf_workspace but is no longer used " +
               "and will be removed in the future.")
 
+    # Important: If you are upgrading MKL-DNN, then update the version numbers
+    # in third_party/mkl_dnn/mkldnn.BUILD. In addition, the new version of
+    # MKL-DNN might require upgrading MKL ML libraries also. If they need to be
+    # upgraded then update the version numbers on all three versions above
+    # (Linux, Mac, Windows).
     tf_http_archive(
         name = "mkl_dnn",
         build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
-        sha256 = "b100f57af4a2b59a3a37a1ba38f77b644d2107d758a1a7f4e51310063cd21e73",
-        strip_prefix = "mkl-dnn-733fc908874c71a5285043931a1cf80aa923165c",
+        sha256 = "4d0522fc609b4194738dbbe14c8ee1546a2736b03886a07f498250cde53f38fb",
+        strip_prefix = "mkl-dnn-bdd1c7be2cbc0b451d3541ab140742db67f17684",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
-            "https://github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/bdd1c7be2cbc0b451d3541ab140742db67f17684.tar.gz",
+            "https://github.com/intel/mkl-dnn/archive/bdd1c7be2cbc0b451d3541ab140742db67f17684.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "583e5801372a0bb12eb561858532e3bb9a3528f15f65cfc87b2c0f4c1ab1a0ca",
-        strip_prefix = "abseil-cpp-111ca7060a6ff50115ca85b59f6b5d8c8c5e9105",
+        sha256 = "4ca486dfff63a9b7c3ece54895f7b92c5c2444df6a61942bb66d888dd310f358",
+        strip_prefix = "abseil-cpp-7c7754fb3ed9ffb57d35fe8658f3ba4d73a31e72",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/7c7754fb3ed9ffb57d35fe8658f3ba4d73a31e72.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/7c7754fb3ed9ffb57d35fe8658f3ba4d73a31e72.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "df54d805573871d03aaef6d27d45439c18ff6e8db215a3b922b2daefd6914147",
-        strip_prefix = "eigen-eigen-ad3bcd81cc49",
+        patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
+        sha256 = "13a8885ab17cadb6c7e55538081f1f31d90e58d6415858d43ea72199bc0f5e22",
+        strip_prefix = "eigen-eigen-9632304bf806",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/ad3bcd81cc49.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/ad3bcd81cc49.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/9632304bf806.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/9632304bf806.tar.gz",
         ],
     )
 
@@ -185,15 +190,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "44eee8bd47cbd5ff192e895b45f9f913e2e117f10fdb9af0fd3b1a87a7b53bc3",
-        strip_prefix = "google-cloud-cpp-0.4.0",
+        sha256 = "06bc735a117ec7ea92ea580e7f2ffa4b1cd7539e0e04f847bf500588d7f0fe90",
+        strip_prefix = "google-cloud-cpp-0.7.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.4.0.tar.gz",
+            "https://mirror.bazel.build/github.com/googleapis/google-cloud-cpp/archive/v0.7.0.tar.gz",
+            "https://github.com/googleapis/google-cloud-cpp/archive/v0.7.0.tar.gz",
         ],
     )
 
@@ -337,7 +342,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
             "https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz",
         ],
         sha256 = "8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1",
-        build_file = clean_dep("//third_party/systemlibs:enum34.BUILD"),
+        build_file = clean_dep("//third_party:enum34.BUILD"),
+        strip_prefix = "enum34-1.1.6/enum",
     )
 
     tf_http_archive(
@@ -476,12 +482,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
-        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
+        sha256 = "e1e3a9edbfbe4230bee174d4aa45a15c1ec2b203cedb02d20df3e6345d8fa63e",
+        strip_prefix = "grpc-62688b6a05cc85b47fb77dd408611734253e47e2",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
-            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
+            "https://github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
         ],
     )
 
@@ -512,11 +518,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "986122dee6053db98d4cf8a60efedd6d94e20c557cc97299a0c7d33b9efae12d",
-        strip_prefix = "llvm-329f768b5fc380a4bfa327396f108a8d8f33e77b",
+        sha256 = "fc5a898c5062140bbd2ebec46f44ed4e0637c2b1510800079057355af8997a6c",
+        strip_prefix = "llvm-e83676d5104ca4d90e404066ea4196a2488451f8",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/329f768b5fc380a4bfa327396f108a8d8f33e77b.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/329f768b5fc380a4bfa327396f108a8d8f33e77b.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/e83676d5104ca4d90e404066ea4196a2488451f8.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/e83676d5104ca4d90e404066ea4196a2488451f8.tar.gz",
         ],
     )
 
@@ -716,16 +722,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "bazel_toolchains",
-        sha256 = "07dfbe80638eb1fe681f7c07e61b34b579c6710c691e49ee90ccdc6e9e75ebbb",
-        strip_prefix = "bazel-toolchains-9a111bd82161c1fbe8ed17a593ca1023fd941c70",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-            "https://github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),